Diffstat (limited to 'contrib/llvm/lib')
-rw-r--r--  contrib/llvm/lib/Analysis/AliasAnalysis.cpp | 18
-rw-r--r--  contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/AliasSetTracker.cpp | 38
-rw-r--r--  contrib/llvm/lib/Analysis/Analysis.cpp | 3
-rw-r--r--  contrib/llvm/lib/Analysis/AssumptionCache.cpp | 46
-rw-r--r--  contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp | 121
-rw-r--r--  contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp | 65
-rw-r--r--  contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp | 275
-rw-r--r--  contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp | 19
-rw-r--r--  contrib/llvm/lib/Analysis/CFLGraph.h | 12
-rw-r--r--  contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp | 32
-rw-r--r--  contrib/llvm/lib/Analysis/CGSCCPassManager.cpp | 268
-rw-r--r--  contrib/llvm/lib/Analysis/CallGraph.cpp | 46
-rw-r--r--  contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp | 33
-rw-r--r--  contrib/llvm/lib/Analysis/CallPrinter.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/CaptureTracking.cpp | 6
-rw-r--r--  contrib/llvm/lib/Analysis/CodeMetrics.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/ConstantFolding.cpp | 330
-rw-r--r--  contrib/llvm/lib/Analysis/CostModel.cpp | 18
-rw-r--r--  contrib/llvm/lib/Analysis/DemandedBits.cpp | 57
-rw-r--r--  contrib/llvm/lib/Analysis/DependenceAnalysis.cpp | 45
-rw-r--r--  contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/DomPrinter.cpp | 16
-rw-r--r--  contrib/llvm/lib/Analysis/DominanceFrontier.cpp | 13
-rw-r--r--  contrib/llvm/lib/Analysis/EHPersonalities.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/GlobalsModRef.cpp | 16
-rw-r--r--  contrib/llvm/lib/Analysis/IVUsers.cpp | 78
-rw-r--r--  contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/InlineCost.cpp | 467
-rw-r--r--  contrib/llvm/lib/Analysis/InstCount.cpp | 11
-rw-r--r--  contrib/llvm/lib/Analysis/InstructionSimplify.cpp | 1994
-rw-r--r--  contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp | 25
-rw-r--r--  contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp | 8
-rw-r--r--  contrib/llvm/lib/Analysis/LazyCallGraph.cpp | 679
-rw-r--r--  contrib/llvm/lib/Analysis/LazyValueInfo.cpp | 411
-rw-r--r--  contrib/llvm/lib/Analysis/Lint.cpp | 20
-rw-r--r--  contrib/llvm/lib/Analysis/Loads.cpp | 46
-rw-r--r--  contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp | 129
-rw-r--r--  contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp | 23
-rw-r--r--  contrib/llvm/lib/Analysis/LoopInfo.cpp | 32
-rw-r--r--  contrib/llvm/lib/Analysis/LoopPass.cpp | 25
-rw-r--r--  contrib/llvm/lib/Analysis/MemDepPrinter.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/MemDerefPrinter.cpp | 4
-rw-r--r--  contrib/llvm/lib/Analysis/MemoryBuiltins.cpp | 198
-rw-r--r--  contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 13
-rw-r--r--  contrib/llvm/lib/Analysis/MemoryLocation.cpp | 4
-rw-r--r--  contrib/llvm/lib/Analysis/MemorySSA.cpp (renamed from contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp) | 837
-rw-r--r--  contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp | 488
-rw-r--r--  contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 201
-rw-r--r--  contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/OptimizationDiagnosticInfo.cpp | 199
-rw-r--r--  contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/PHITransAddr.cpp | 4
-rw-r--r--  contrib/llvm/lib/Analysis/PostDominators.cpp | 9
-rw-r--r--  contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp | 117
-rw-r--r--  contrib/llvm/lib/Analysis/RegionInfo.cpp | 45
-rw-r--r--  contrib/llvm/lib/Analysis/RegionPass.cpp | 18
-rw-r--r--  contrib/llvm/lib/Analysis/RegionPrinter.cpp | 4
-rw-r--r--  contrib/llvm/lib/Analysis/ScalarEvolution.cpp | 2340
-rw-r--r--  contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp | 80
-rw-r--r--  contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp | 310
-rw-r--r--  contrib/llvm/lib/Analysis/SparsePropagation.cpp | 2
-rw-r--r--  contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp | 1284
-rw-r--r--  contrib/llvm/lib/Analysis/TargetTransformInfo.cpp | 134
-rw-r--r--  contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 22
-rw-r--r--  contrib/llvm/lib/Analysis/TypeMetadataUtils.cpp | 11
-rw-r--r--  contrib/llvm/lib/Analysis/ValueTracking.cpp | 1982
-rw-r--r--  contrib/llvm/lib/Analysis/VectorUtils.cpp | 94
-rw-r--r--  contrib/llvm/lib/AsmParser/LLLexer.cpp | 7
-rw-r--r--  contrib/llvm/lib/AsmParser/LLParser.cpp | 343
-rw-r--r--  contrib/llvm/lib/AsmParser/LLParser.h | 13
-rw-r--r--  contrib/llvm/lib/AsmParser/LLToken.h | 7
-rw-r--r--  contrib/llvm/lib/BinaryFormat/Dwarf.cpp | 577
-rw-r--r--  contrib/llvm/lib/BinaryFormat/Magic.cpp | 217
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 1299
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 240
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/MetadataLoader.h | 3
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/ValueList.cpp | 6
-rw-r--r--  contrib/llvm/lib/Bitcode/Reader/ValueList.h | 2
-rw-r--r--  contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 1058
-rw-r--r--  contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp | 33
-rw-r--r--  contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h | 37
-rw-r--r--  contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp | 17
-rw-r--r--  contrib/llvm/lib/CodeGen/Analysis.cpp | 26
-rw-r--r--  contrib/llvm/lib/CodeGen/AntiDepBreaker.h | 19
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 367
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 25
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 254
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h | 67
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp | 152
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp | 136
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h | 55
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def | 55
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp | 7
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 34
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 288
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 47
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 415
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 53
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 342
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h | 144
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 238
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 65
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp | 19
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp | 23
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 20
-rw-r--r--  contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp | 65
-rw-r--r--  contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchCoalescing.cpp | 758
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchFolding.cpp | 245
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchFolding.h | 44
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchRelaxation.cpp | 16
-rw-r--r--  contrib/llvm/lib/CodeGen/BuiltinGCs.cpp | 11
-rw-r--r--  contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/CallingConvLower.cpp | 3
-rw-r--r--  contrib/llvm/lib/CodeGen/CodeGen.cpp | 21
-rw-r--r--  contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp | 2346
-rw-r--r--  contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp | 13
-rw-r--r--  contrib/llvm/lib/CodeGen/DFAPacketizer.cpp | 89
-rw-r--r--  contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp | 8
-rw-r--r--  contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp | 5
-rw-r--r--  contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp | 34
-rw-r--r--  contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp | 8
-rw-r--r--  contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp | 472
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp | 9
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandReductions.cpp | 167
-rw-r--r--  contrib/llvm/lib/CodeGen/FEntryInserter.cpp | 55
-rw-r--r--  contrib/llvm/lib/CodeGen/FaultMaps.cpp | 13
-rw-r--r--  contrib/llvm/lib/CodeGen/FuncletLayout.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/GCMetadata.cpp | 16
-rw-r--r--  contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp | 7
-rw-r--r--  contrib/llvm/lib/CodeGen/GCStrategy.cpp | 7
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 41
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp | 1
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 899
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp | 107
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp | 52
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp | 191
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 549
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 67
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp | 123
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 357
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp | 117
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp | 11
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 165
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp | 97
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalMerge.cpp | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/IfConversion.cpp | 135
-rw-r--r--  contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp | 229
-rw-r--r--  contrib/llvm/lib/CodeGen/InlineSpiller.cpp | 59
-rw-r--r--  contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp | 21
-rw-r--r--  contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp | 142
-rw-r--r--  contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp | 97
-rw-r--r--  contrib/llvm/lib/CodeGen/LexicalScopes.cpp | 46
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugValues.cpp | 172
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugVariables.h | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveInterval.cpp | 33
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp | 209
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp | 37
-rw-r--r--  contrib/llvm/lib/CodeGen/LivePhysRegs.cpp | 90
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp | 109
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeCalc.h | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp | 13
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp | 231
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp | 25
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRegUnits.cpp | 132
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveVariables.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/LowLevelType.cpp | 55
-rw-r--r--  contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp | 24
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp | 10
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MILexer.h | 8
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp | 306
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIParser.h | 21
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 191
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRPrinter.cpp | 231
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRPrinter.h | 33
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp | 7
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp | 70
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp | 91
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp | 1167
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCSE.cpp | 14
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCombiner.cpp | 56
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp | 18
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineDominators.cpp | 32
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp | 244
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunction.cpp | 236
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineInstr.cpp | 236
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineLICM.cpp | 29
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp | 16
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp | 85
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp | 108
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineOutliner.cpp | 1251
-rw-r--r--  contrib/llvm/lib/CodeGen/MachinePipeliner.cpp | 31
-rw-r--r--  contrib/llvm/lib/CodeGen/MachinePostDominators.cpp | 7
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp | 64
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp | 74
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineScheduler.cpp | 436
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineSink.cpp | 10
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp | 72
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineVerifier.cpp | 94
-rw-r--r--  contrib/llvm/lib/CodeGen/MacroFusion.cpp | 153
-rw-r--r--  contrib/llvm/lib/CodeGen/OptimizePHIs.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/PHIElimination.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/PatchableFunction.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp | 172
-rw-r--r--  contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp | 4
-rw-r--r--  contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp | 172
-rw-r--r--  contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp | 5
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocBase.cpp | 14
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocBasic.cpp | 65
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocFast.cpp | 32
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp | 203
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp | 110
-rw-r--r--  contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp | 24
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp | 48
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp | 336
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterPressure.cpp | 55
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterScavenging.cpp | 505
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp | 19
-rw-r--r--  contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp | 17
-rw-r--r--  contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp | 17
-rw-r--r--  contrib/llvm/lib/CodeGen/SafeStack.cpp | 188
-rw-r--r--  contrib/llvm/lib/CodeGen/SafeStackColoring.cpp | 5
-rw-r--r--  contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 656
-rw-r--r--  contrib/llvm/lib/CodeGen/ScheduleDAG.cpp | 501
-rw-r--r--  contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 518
-rw-r--r--  contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp | 9
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4421
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 151
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 88
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 6
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 263
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 110
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 314
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 30
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 43
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 43
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 26
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 444
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp | 82
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 91
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 467
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 10
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1651
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp | 115
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 1997
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 139
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 38
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 813
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp | 2
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp | 31
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 738
-rw-r--r--  contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp | 8
-rw-r--r--  contrib/llvm/lib/CodeGen/ShrinkWrap.cpp | 17
-rw-r--r--  contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp | 45
-rw-r--r--  contrib/llvm/lib/CodeGen/SlotIndexes.cpp | 44
-rw-r--r--  contrib/llvm/lib/CodeGen/SpillPlacement.cpp | 10
-rw-r--r--  contrib/llvm/lib/CodeGen/SplitKit.cpp | 143
-rw-r--r--  contrib/llvm/lib/CodeGen/SplitKit.h | 11
-rw-r--r--  contrib/llvm/lib/CodeGen/StackColoring.cpp | 335
-rw-r--r--  contrib/llvm/lib/CodeGen/StackMaps.cpp | 45
-rw-r--r--  contrib/llvm/lib/CodeGen/StackProtector.cpp | 77
-rw-r--r--  contrib/llvm/lib/CodeGen/StackSlotColoring.cpp | 8
-rw-r--r--  contrib/llvm/lib/CodeGen/TailDuplication.cpp | 17
-rw-r--r--  contrib/llvm/lib/CodeGen/TailDuplicator.cpp | 45
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp | 16
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp | 22
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp | 169
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 378
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp | 12
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetPassConfig.cpp | 194
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp | 53
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetSchedule.cpp | 110
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp | 58
-rw-r--r--  contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 28
-rw-r--r--  contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp | 30
-rw-r--r--  contrib/llvm/lib/CodeGen/VirtRegMap.cpp | 95
-rw-r--r--  contrib/llvm/lib/CodeGen/WinEHPrepare.cpp | 18
-rw-r--r--  contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp | 79
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp | 41
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/CVTypeDumper.cpp | 73
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 218
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp | 2
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp | 59
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp | 116
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp | 53
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp | 97
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp | 44
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp | 126
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp | 161
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp | 90
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp | 16
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp | 97
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp | 95
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp | 36
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp | 34
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/EnumTables.cpp | 66
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/Formatters.cpp | 48
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp | 252
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstream.cpp | 45
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp | 108
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp | 22
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp | 59
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp | 149
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp | 7
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp | 61
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp | 289
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 65
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeIndex.cpp (renamed from contrib/llvm/lib/DebugInfo/CodeView/TypeDatabase.cpp) | 74
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp | 484
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeName.cpp | 243
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeRecord.cpp | 213
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp | 26
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeSerializer.cpp | 246
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 330
-rw-r--r--  contrib/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp | 67
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp | 92
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 69
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp | 10
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 774
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp | 24
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp | 7
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 15
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp | 66
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp | 26
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 564
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp | 35
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp | 12
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 19
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp | 284
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp | 695
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp | 17
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 27
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp | 249
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp | 12
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 499
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp | 18
-rw-r--r--  contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h | 25
-rw-r--r--  contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp | 27
-rw-r--r--  contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp | 148
-rw-r--r--  contrib/llvm/lib/DebugInfo/MSF/StreamReader.cpp | 156
-rw-r--r--  contrib/llvm/lib/DebugInfo/MSF/StreamWriter.cpp | 98
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp | 2
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp | 2
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp | 2
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp | 43
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp | 19
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp | 2
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp | 90
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp | 196
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp | 280
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/DbiStream.cpp) | 220
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp) | 234
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/EnumTables.cpp) | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/GSI.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/GSI.cpp) | 20
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/GSI.h (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/GSI.h) | 20
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/GlobalsStream.cpp) | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/Hash.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/Hash.cpp) | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp | 308
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp | 139
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp | 74
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp | 123
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp | 152
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/NativeBuiltinSymbol.cpp | 48
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp | 50
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp | 51
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp | 84
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp | 694
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp | 218
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/PDBFile.cpp) | 144
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp | 221
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp | 139
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp | 138
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/PublicsStream.cpp) | 47
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/PublicsStreamBuilder.cpp | 89
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/RawError.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/RawError.cpp) | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/SymbolStream.cpp) | 13
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp | 89
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp (renamed from contrib/llvm/lib/DebugInfo/PDB/Raw/TpiStream.cpp) | 96
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 177
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDB.cpp | 22
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp | 5
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp | 24
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp | 36
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp | 15
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp | 23
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp | 10
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp | 11
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp | 19
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp | 10
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp | 12
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp | 4
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp | 2
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp | 6
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStream.cpp | 77
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp | 65
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/ModInfo.cpp | 81
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/ModStream.cpp | 85
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/NameHashTable.cpp | 104
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/NameMap.cpp | 163
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp | 108
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp | 148
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/RawSession.cpp | 136
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/TpiHashing.cpp | 110
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/Raw/TpiStreamBuilder.cpp | 145
-rw-r--r--  contrib/llvm/lib/DebugInfo/PDB/UDTLayout.cpp | 303
-rw-r--r--  contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp | 14
-rw-r--r--  contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp | 35
-rw-r--r--  contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h | 20
-rw-r--r--  contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 11
-rw-r--r--  contrib/llvm/lib/Demangle/ItaniumDemangle.cpp | 140
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp | 18
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp | 2
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp | 2
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c | 2
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp | 10
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp | 4
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp | 19
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp | 2
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp | 2
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp | 63
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h | 174
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp | 36
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp | 5
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 143
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/Orc/RPCUtils.cpp | 55
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp | 12
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 77
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp | 7
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp | 16
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h | 2
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 322
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 26
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 19
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp | 3
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h | 5
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h | 5
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h | 7
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp | 2
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h | 1
-rw-r--r--  contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp | 2
-rw-r--r--  contrib/llvm/lib/IR/AsmWriter.cpp | 381
-rw-r--r--  contrib/llvm/lib/IR/AttributeImpl.h | 178
-rw-r--r--  contrib/llvm/lib/IR/AttributeSetNode.h | 106
-rw-r--r--  contrib/llvm/lib/IR/Attributes.cpp | 1028
-rw-r--r--  contrib/llvm/lib/IR/AutoUpgrade.cpp | 954
-rw-r--r--  contrib/llvm/lib/IR/BasicBlock.cpp | 91
-rw-r--r--  contrib/llvm/lib/IR/Comdat.cpp | 6
-rw-r--r--  contrib/llvm/lib/IR/ConstantFold.cpp | 96
-rw-r--r--  contrib/llvm/lib/IR/ConstantRange.cpp | 313
-rw-r--r--  contrib/llvm/lib/IR/Constants.cpp | 230
-rw-r--r--  contrib/llvm/lib/IR/ConstantsContext.h | 87
-rw-r--r--  contrib/llvm/lib/IR/Core.cpp | 122
-rw-r--r--  contrib/llvm/lib/IR/DIBuilder.cpp | 155
-rw-r--r--  contrib/llvm/lib/IR/DataLayout.cpp | 158
-rw-r--r--  contrib/llvm/lib/IR/DebugInfo.cpp | 121
-rw-r--r--  contrib/llvm/lib/IR/DebugInfoMetadata.cpp | 158
-rw-r--r--  contrib/llvm/lib/IR/DebugLoc.cpp | 37
-rw-r--r--  contrib/llvm/lib/IR/DiagnosticInfo.cpp | 209
-rw-r--r--  contrib/llvm/lib/IR/DiagnosticPrinter.cpp | 4
-rw-r--r--  contrib/llvm/lib/IR/Dominators.cpp | 75
-rw-r--r--  contrib/llvm/lib/IR/Function.cpp | 410
-rw-r--r--  contrib/llvm/lib/IR/GCOV.cpp | 64
-rw-r--r--  contrib/llvm/lib/IR/Globals.cpp | 71
-rw-r--r--  contrib/llvm/lib/IR/IRBuilder.cpp | 148
-rw-r--r--  contrib/llvm/lib/IR/IRPrintingPasses.cpp | 6
-rw-r--r--  contrib/llvm/lib/IR/InlineAsm.cpp | 43
-rw-r--r--  contrib/llvm/lib/IR/Instruction.cpp | 188
-rw-r--r--  contrib/llvm/lib/IR/Instructions.cpp | 527
-rw-r--r--  contrib/llvm/lib/IR/IntrinsicInst.cpp | 54
-rw-r--r--  contrib/llvm/lib/IR/LLVMContext.cpp | 38
-rw-r--r--  contrib/llvm/lib/IR/LLVMContextImpl.cpp | 68
-rw-r--r--  contrib/llvm/lib/IR/LLVMContextImpl.h | 246
-rw-r--r--  contrib/llvm/lib/IR/LegacyPassManager.cpp | 66
-rw-r--r--  contrib/llvm/lib/IR/MDBuilder.cpp | 13
-rw-r--r--  contrib/llvm/lib/IR/Mangler.cpp | 32
-rw-r--r--  contrib/llvm/lib/IR/Metadata.cpp | 87
-rw-r--r--  contrib/llvm/lib/IR/Module.cpp | 103
-rw-r--r--  contrib/llvm/lib/IR/ModuleSummaryIndex.cpp | 73
-rw-r--r--  contrib/llvm/lib/IR/Operator.cpp | 14
-rw-r--r--  contrib/llvm/lib/IR/OptBisect.cpp | 35
-rw-r--r--  contrib/llvm/lib/IR/Pass.cpp | 2
-rw-r--r--  contrib/llvm/lib/IR/PassManager.cpp | 2
-rw-r--r--  contrib/llvm/lib/IR/PassRegistry.cpp | 2
-rw-r--r--  contrib/llvm/lib/IR/SafepointIRVerifier.cpp | 437
-rw-r--r--  contrib/llvm/lib/IR/Statepoint.cpp | 19
-rw-r--r--  contrib/llvm/lib/IR/Type.cpp | 77
-rw-r--r--  contrib/llvm/lib/IR/TypeFinder.cpp | 11
-rw-r--r--  contrib/llvm/lib/IR/User.cpp | 10
-rw-r--r--  contrib/llvm/lib/IR/Value.cpp | 151
-rw-r--r--  contrib/llvm/lib/IR/ValueSymbolTable.cpp | 11
-rw-r--r--  contrib/llvm/lib/IR/ValueTypes.cpp | 2
-rw-r--r--  contrib/llvm/lib/IR/Verifier.cpp | 767
-rw-r--r--  contrib/llvm/lib/LTO/Caching.cpp | 82
-rw-r--r--  contrib/llvm/lib/LTO/LTO.cpp | 655
-rw-r--r--  contrib/llvm/lib/LTO/LTOBackend.cpp | 142
-rw-r--r--  contrib/llvm/lib/LTO/LTOCodeGenerator.cpp | 48
-rw-r--r--  contrib/llvm/lib/LTO/LTOModule.cpp | 23
-rw-r--r--  contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 176
-rw-r--r--  contrib/llvm/lib/LTO/UpdateCompilerUsed.cpp | 2
-rw-r--r--  contrib/llvm/lib/Linker/IRMover.cpp | 99
-rw-r--r--  contrib/llvm/lib/Linker/LinkModules.cpp | 105
-rw-r--r--  contrib/llvm/lib/MC/ConstantPools.cpp | 7
-rw-r--r--  contrib/llvm/lib/MC/ELFObjectWriter.cpp | 146
-rw-r--r--  contrib/llvm/lib/MC/MCAsmBackend.cpp | 11
-rw-r--r--  contrib/llvm/lib/MC/MCAsmInfo.cpp | 62
-rw-r--r--  contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp | 17
-rw-r--r--  contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp | 7
-rw-r--r--  contrib/llvm/lib/MC/MCAsmInfoELF.cpp | 8
-rw-r--r--  contrib/llvm/lib/MC/MCAsmInfoWasm.cpp | 27
-rw-r--r--  contrib/llvm/lib/MC/MCAsmStreamer.cpp | 45
-rw-r--r--  contrib/llvm/lib/MC/MCAssembler.cpp | 98
-rw-r--r--  contrib/llvm/lib/MC/MCCodeEmitter.cpp | 8
-rw-r--r--  contrib/llvm/lib/MC/MCCodeView.cpp | 12
-rw-r--r--  contrib/llvm/lib/MC/MCContext.cpp | 192
-rw-r--r--  contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp | 2
-rw-r--r--  contrib/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp | 7
-rw-r--r--  contrib/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp | 9
-rw-r--r--  contrib/llvm/lib/MC/MCDisassembler/MCSymbolizer.cpp | 5
-rw-r--r--  contrib/llvm/lib/MC/MCDwarf.cpp | 75
-rw-r--r--  contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp | 3
-rw-r--r--  contrib/llvm/lib/MC/MCELFStreamer.cpp | 90
-rw-r--r--  contrib/llvm/lib/MC/MCExpr.cpp | 39
-rw-r--r--  contrib/llvm/lib/MC/MCFragment.cpp | 58
-rw-r--r--  contrib/llvm/lib/MC/MCInst.cpp | 5
-rw-r--r--  contrib/llvm/lib/MC/MCInstPrinter.cpp | 11
-rw-r--r--  contrib/llvm/lib/MC/MCInstrAnalysis.cpp | 7
-rw-r--r--  contrib/llvm/lib/MC/MCLabel.cpp | 6
-rw-r--r--  contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp | 10
-rw-r--r--  contrib/llvm/lib/MC/MCMachOStreamer.cpp | 46
-rw-r--r--  contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp | 4
-rw-r--r--  contrib/llvm/lib/MC/MCNullStreamer.cpp | 6
-rw-r--r--  contrib/llvm/lib/MC/MCObjectFileInfo.cpp | 121
-rw-r--r--  contrib/llvm/lib/MC/MCObjectStreamer.cpp | 34
-rw-r--r--  contrib/llvm/lib/MC/MCObjectWriter.cpp | 8
-rw-r--r--  contrib/llvm/lib/MC/MCParser/AsmLexer.cpp | 8
-rw-r--r--  contrib/llvm/lib/MC/MCParser/AsmParser.cpp | 216
-rw-r--r--  contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp | 34
-rw-r--r--  contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp | 66
-rw-r--r--  contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp | 237
-rw-r--r--  contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp | 12
-rw-r--r--  contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp | 21
-rw-r--r--  contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp | 10
-rw-r--r--  contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp | 11
-rw-r--r--  contrib/llvm/lib/MC/MCRegisterInfo.cpp | 9
-rw-r--r--  contrib/llvm/lib/MC/MCSection.cpp | 22
-rw-r--r--  contrib/llvm/lib/MC/MCSectionCOFF.cpp | 13
-rw-r--r--  contrib/llvm/lib/MC/MCSectionELF.cpp | 47
-rw-r--r--  contrib/llvm/lib/MC/MCSectionMachO.cpp | 84
-rw-r--r--  contrib/llvm/lib/MC/MCSectionWasm.cpp | 97
-rw-r--r--  contrib/llvm/lib/MC/MCStreamer.cpp | 51
-rw-r--r--  contrib/llvm/lib/MC/MCSubtargetInfo.cpp | 7
-rw-r--r--  contrib/llvm/lib/MC/MCSymbol.cpp | 12
-rw-r--r--  contrib/llvm/lib/MC/MCSymbolELF.cpp | 8
-rw-r--r--  contrib/llvm/lib/MC/MCTargetOptions.cpp | 17
-rw-r--r--  contrib/llvm/lib/MC/MCValue.cpp | 2
-rw-r--r--  contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp | 21
-rw-r--r--  contrib/llvm/lib/MC/MCWasmStreamer.cpp | 228
-rw-r--r--  contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp (renamed from contrib/llvm/lib/MC/WinCOFFStreamer.cpp) | 39
-rw-r--r--  contrib/llvm/lib/MC/MCWinEH.cpp | 4
-rw-r--r--  contrib/llvm/lib/MC/MachObjectWriter.cpp | 21
-rw-r--r--  contrib/llvm/lib/MC/StringTableBuilder.cpp | 20
-rw-r--r--  contrib/llvm/lib/MC/SubtargetFeature.cpp | 115
-rw-r--r--  contrib/llvm/lib/MC/WasmObjectWriter.cpp | 1295
-rw-r--r--  contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp | 811
-rw-r--r--  contrib/llvm/lib/Object/Archive.cpp | 49
-rw-r--r--  contrib/llvm/lib/Object/ArchiveWriter.cpp | 93
-rw-r--r--  contrib/llvm/lib/Object/Binary.cpp | 85
-rw-r--r--  contrib/llvm/lib/Object/COFFImportFile.cpp | 612
-rw-r--r--  contrib/llvm/lib/Object/COFFModuleDefinition.cpp | 337
-rw-r--r--  contrib/llvm/lib/Object/COFFObjectFile.cpp | 133
-rw-r--r--  contrib/llvm/lib/Object/Decompressor.cpp | 12
-rw-r--r--  contrib/llvm/lib/Object/ELF.cpp | 110
-rw-r--r--  contrib/llvm/lib/Object/ELFObjectFile.cpp | 281
-rw-r--r--  contrib/llvm/lib/Object/IRObjectFile.cpp | 33
-rw-r--r--  contrib/llvm/lib/Object/IRSymtab.cpp | 348
-rw-r--r--  contrib/llvm/lib/Object/MachOObjectFile.cpp | 1108
-rw-r--r--  contrib/llvm/lib/Object/ModuleSummaryIndexObjectFile.cpp | 115
-rw-r--r--  contrib/llvm/lib/Object/ModuleSymbolTable.cpp | 127
-rw-r--r--  contrib/llvm/lib/Object/Object.cpp | 2
-rw-r--r--  contrib/llvm/lib/Object/ObjectFile.cpp | 71
-rw-r--r--  contrib/llvm/lib/Object/RecordStreamer.cpp | 16
-rw-r--r--  contrib/llvm/lib/Object/RecordStreamer.h | 46
-rw-r--r--  contrib/llvm/lib/Object/SymbolicFile.cpp | 78
-rw-r--r--  contrib/llvm/lib/Object/WasmObjectFile.cpp | 834
-rw-r--r--  contrib/llvm/lib/Object/WindowsResource.cpp | 720
-rw-r--r--  contrib/llvm/lib/ObjectYAML/COFFYAML.cpp | 73
-rw-r--r--  contrib/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp | 958
-rw-r--r--  contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp | 590
-rw-r--r--  contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp | 806
-rw-r--r--  contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp | 333
-rw-r--r--  contrib/llvm/lib/ObjectYAML/DWARFVisitor.cpp | 178
-rw-r--r--  contrib/llvm/lib/ObjectYAML/DWARFVisitor.h | 97
-rw-r--r--  contrib/llvm/lib/ObjectYAML/DWARFYAML.cpp | 19
-rw-r--r--  contrib/llvm/lib/ObjectYAML/ELFYAML.cpp | 871
-rw-r--r--  contrib/llvm/lib/ObjectYAML/MachOYAML.cpp | 79
-rw-r--r--  contrib/llvm/lib/ObjectYAML/ObjectYAML.cpp | 12
-rw-r--r--  contrib/llvm/lib/ObjectYAML/WasmYAML.cpp | 411
-rw-r--r--  contrib/llvm/lib/ObjectYAML/YAML.cpp | 5
-rw-r--r--  contrib/llvm/lib/Option/Arg.cpp | 14
-rw-r--r--  contrib/llvm/lib/Option/ArgList.cpp | 249
-rw-r--r--  contrib/llvm/lib/Option/OptTable.cpp | 113
-rw-r--r--  contrib/llvm/lib/Option/Option.cpp | 13
-rw-r--r--  contrib/llvm/lib/Passes/PassBuilder.cpp | 770
-rw-r--r--  contrib/llvm/lib/Passes/PassRegistry.def | 9
-rw-r--r--  contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp | 113
-rw-r--r--  contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp | 86
-rw-r--r--  contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp | 35
-rw-r--r--  contrib/llvm/lib/ProfileData/InstrProf.cpp | 250
-rw-r--r--  contrib/llvm/lib/ProfileData/InstrProfReader.cpp | 83
-rw-r--r--  contrib/llvm/lib/ProfileData/InstrProfWriter.cpp | 102
-rw-r--r--  contrib/llvm/lib/ProfileData/SampleProf.cpp | 34
-rw-r--r--  contrib/llvm/lib/ProfileData/SampleProfReader.cpp | 31
-rw-r--r--  contrib/llvm/lib/ProfileData/SampleProfWriter.cpp | 112
-rw-r--r--  contrib/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp | 216
-rw-r--r--  contrib/llvm/lib/Support/APFloat.cpp | 712
-rw-r--r--  contrib/llvm/lib/Support/APInt.cpp | 2114
-rw-r--r--  contrib/llvm/lib/Support/ARMAttributeParser.cpp | 708
-rw-r--r--  contrib/llvm/lib/Support/ARMBuildAttrs.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/Atomic.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/BinaryStreamError.cpp | 56
-rw-r--r--  contrib/llvm/lib/Support/BinaryStreamReader.cpp | 149
-rw-r--r--  contrib/llvm/lib/Support/BinaryStreamRef.cpp | 137
-rw-r--r--  contrib/llvm/lib/Support/BinaryStreamWriter.cpp | 90
-rw-r--r--  contrib/llvm/lib/Support/BranchProbability.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/CachePruning.cpp | 132
-rw-r--r--  contrib/llvm/lib/Support/Chrono.cpp | 7
-rw-r--r--  contrib/llvm/lib/Support/CommandLine.cpp | 84
-rw-r--r--  contrib/llvm/lib/Support/Compression.cpp | 83
-rw-r--r--  contrib/llvm/lib/Support/ConvertUTF.cpp | 31
-rw-r--r--  contrib/llvm/lib/Support/ConvertUTFWrapper.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/CrashRecoveryContext.cpp | 130
-rw-r--r--  contrib/llvm/lib/Support/DataExtractor.cpp | 17
-rw-r--r--  contrib/llvm/lib/Support/DebugCounter.cpp | 114
-rw-r--r--  contrib/llvm/lib/Support/Dwarf.cpp | 385
-rw-r--r--  contrib/llvm/lib/Support/DynamicLibrary.cpp | 255
-rw-r--r--  contrib/llvm/lib/Support/Errno.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/Error.cpp | 1
-rw-r--r--  contrib/llvm/lib/Support/ErrorHandling.cpp | 85
-rw-r--r--  contrib/llvm/lib/Support/FileOutputBuffer.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/FoldingSet.cpp | 42
-rw-r--r--  contrib/llvm/lib/Support/FormattedStream.cpp | 3
-rw-r--r--  contrib/llvm/lib/Support/GraphWriter.cpp | 20
-rw-r--r--  contrib/llvm/lib/Support/Host.cpp | 885
-rw-r--r--  contrib/llvm/lib/Support/LockFileManager.cpp | 12
-rw-r--r--  contrib/llvm/lib/Support/LowLevelType.cpp | 56
-rw-r--r--  contrib/llvm/lib/Support/MD5.cpp | 27
-rw-r--r--  contrib/llvm/lib/Support/ManagedStatic.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/MemoryBuffer.cpp | 47
-rw-r--r--  contrib/llvm/lib/Support/Mutex.cpp | 7
-rw-r--r--  contrib/llvm/lib/Support/Parallel.cpp | 138
-rw-r--r--  contrib/llvm/lib/Support/Path.cpp | 650
-rw-r--r--  contrib/llvm/lib/Support/PrettyStackTrace.cpp | 3
-rw-r--r--  contrib/llvm/lib/Support/Process.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/RWMutex.cpp | 21
-rw-r--r--  contrib/llvm/lib/Support/Regex.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/SHA1.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/ScopedPrinter.cpp | 6
-rw-r--r--  contrib/llvm/lib/Support/SearchForAddressOfSpecialSymbol.cpp | 58
-rw-r--r--  contrib/llvm/lib/Support/Signals.cpp | 19
-rw-r--r--  contrib/llvm/lib/Support/SourceMgr.cpp | 29
-rw-r--r--  contrib/llvm/lib/Support/SpecialCaseList.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/Statistic.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/StringExtras.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/StringRef.cpp | 13
-rw-r--r--  contrib/llvm/lib/Support/TargetParser.cpp | 50
-rw-r--r--  contrib/llvm/lib/Support/ThreadLocal.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/ThreadPool.cpp | 19
-rw-r--r--  contrib/llvm/lib/Support/Threading.cpp | 123
-rw-r--r--  contrib/llvm/lib/Support/Timer.cpp | 28
-rw-r--r--  contrib/llvm/lib/Support/TrigramIndex.cpp | 2
-rw-r--r--  contrib/llvm/lib/Support/Triple.cpp | 63
-rw-r--r--  contrib/llvm/lib/Support/Twine.cpp | 4
-rw-r--r--  contrib/llvm/lib/Support/Unix/DynamicLibrary.inc | 135
-rw-r--r--  contrib/llvm/lib/Support/Unix/Host.inc | 29
-rw-r--r--  contrib/llvm/lib/Support/Unix/Memory.inc | 4
-rw-r--r--  contrib/llvm/lib/Support/Unix/Path.inc | 299
-rw-r--r--  contrib/llvm/lib/Support/Unix/Process.inc | 16
-rw-r--r--  contrib/llvm/lib/Support/Unix/Program.inc | 26
-rw-r--r--  contrib/llvm/lib/Support/Unix/Signals.inc | 20
-rw-r--r--  contrib/llvm/lib/Support/Unix/Threading.inc | 215
-rw-r--r--  contrib/llvm/lib/Support/Windows/DynamicLibrary.inc | 220
-rw-r--r--  contrib/llvm/lib/Support/Windows/Host.inc | 14
-rw-r--r--  contrib/llvm/lib/Support/Windows/Mutex.inc | 11
-rw-r--r--  contrib/llvm/lib/Support/Windows/Path.inc | 306
-rw-r--r--  contrib/llvm/lib/Support/Windows/Process.inc | 1
-rw-r--r--  contrib/llvm/lib/Support/Windows/Program.inc | 1
-rw-r--r--  contrib/llvm/lib/Support/Windows/RWMutex.inc | 13
-rw-r--r--  contrib/llvm/lib/Support/Windows/Signals.inc | 2
-rw-r--r--  contrib/llvm/lib/Support/Windows/ThreadLocal.inc | 11
-rw-r--r--  contrib/llvm/lib/Support/Windows/Threading.inc | 109
-rw-r--r--  contrib/llvm/lib/Support/Windows/WindowsSupport.h | 4
-rw-r--r--  contrib/llvm/lib/Support/YAMLParser.cpp | 64
-rw-r--r--  contrib/llvm/lib/Support/YAMLTraits.cpp | 65
-rw-r--r--  contrib/llvm/lib/Support/raw_ostream.cpp | 43
-rw-r--r--  contrib/llvm/lib/TableGen/Record.cpp | 137
-rw-r--r--  contrib/llvm/lib/TableGen/SetTheory.cpp | 20
-rw-r--r--  contrib/llvm/lib/TableGen/StringMatcher.cpp | 2
-rw-r--r--  contrib/llvm/lib/TableGen/TGParser.cpp | 12
-rw-r--r--  contrib/llvm/lib/TableGen/TGParser.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64.h | 14
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64.td | 125
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 49
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 484
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 14
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp | 184
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h | 27
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td | 7
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp | 339
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 8
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp | 59
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 53
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 183
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 790
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp | 136
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 161
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 419
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 132
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1185
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h | 57
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td | 59
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td | 69
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 595
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h | 99
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td | 171
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp | 365
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h | 49
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp | 143
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h | 7
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 112
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp | 15
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp | 166
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h | 24
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h | 7
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp | 359
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 296
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h | 84
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td | 20
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 22
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td | 6
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td | 92
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td | 1288
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td | 68
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td | 343
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td | 352
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td | 1745
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td | 852
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 14
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 134
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h | 66
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td | 134
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 144
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp | 10
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 266
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 38
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp | 25
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 584
-rw-r--r--  contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 288
-rw-r--r--  contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h | 10
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 186
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 203
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 29
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h | 38
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 20
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 3
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 1
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 29
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 32
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 104
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp | 37
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h | 43
-rw-r--r--  contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 78
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPU.h | 118
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPU.td | 217
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 147
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 102
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp | 22
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 271
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp | 19
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 648
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 158
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 134
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 11
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 80
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 148
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 36
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 62
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 403
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 709
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 113
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 14
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 98
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 425
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 67
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 154
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp | 13
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td | 17
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 88
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 30
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 170
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 48
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2881
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 16
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp | 64
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h | 19
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 389
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp | 353
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 231
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 65
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td | 16
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 44
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h | 193
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 322
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 368
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 299
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 17
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 8
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h | 1
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 338
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 37
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 225
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp | 27
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 105
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1626
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td | 496
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/DSInstructions.td | 178
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 200
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 14
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td | 119
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td | 159
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 107
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 530
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h | 118
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 268
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 492
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h | 207
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 333
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 70
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 274
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 20
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 65
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp | 432
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h | 99
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 14
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 12
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp | 408
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h | 26
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 100
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 40
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 110
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td | 101
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/Processors.td | 74
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 54
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 51
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 8
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp | 36
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 157
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp | 62
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600Instructions.td | 10
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 6
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp | 8
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp | 12
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 169
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIDefines.h | 93
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 245
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp | 72
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 316
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 345
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h | 13
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2633
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h | 66
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp | 66
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 1882
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp | 116
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td | 35
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 971
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h | 151
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td | 601
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIInstructions.td | 338
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td | 158
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 382
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 67
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 26
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 205
-rw-r--r--  contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 109
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp255
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h44
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp846
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp698
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h133
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td51
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SISchedule.td5
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp88
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp156
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SMInstructions.td11
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td124
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp13
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h6
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp524
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h228
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h2
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td134
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td186
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td143
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td102
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td49
-rw-r--r--contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td176
-rw-r--r--contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp24
-rw-r--r--contrib/llvm/lib/Target/ARM/ARM.h19
-rw-r--r--contrib/llvm/lib/Target/ARM/ARM.td525
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp189
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp971
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h87
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp82
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h17
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h21
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp429
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallLowering.h13
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMCallingConv.td6
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp10
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp181
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp10
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h16
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp370
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFastISel.cpp240
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFeatures.h4
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp532
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp426
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp1823
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMISelLowering.h51
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrFormats.td24
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp13
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.h4
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrInfo.td613
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrNEON.td587
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrThumb.td157
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td810
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstrVFP.td307
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp686
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h39
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp321
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h40
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp154
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp36
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp9
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h69
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp57
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMMacroFusion.h24
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp4
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp284
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h16
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td14
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td4
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSchedule.td74
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleA57.td1471
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td323
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleA9.td52
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleM3.td21
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleR52.td207
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td62
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp39
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp114
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMSubtarget.h133
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp281
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetMachine.h73
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp51
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h12
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp38
-rw-r--r--contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h48
-rw-r--r--contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp494
-rw-r--r--contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp139
-rw-r--r--contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp9
-rw-r--r--contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h2
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp194
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h24
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h2
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h6
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp52
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp190
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h84
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp65
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp32
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp11
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp4
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp188
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp21
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h21
-rw-r--r--contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp24
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp168
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp53
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h4
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp21
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp95
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h4
-rw-r--r--contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp90
-rw-r--r--contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp48
-rw-r--r--contrib/llvm/lib/Target/AVR/AVR.h2
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp13
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRDevices.td23
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp147
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp29
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp76
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRISelLowering.h2
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp80
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.h4
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrInfo.td86
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp2
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp18
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp37
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td7
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp2
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRSubtarget.h3
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp14
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetMachine.h4
-rw-r--r--contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp2
-rw-r--r--contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp7
-rw-r--r--contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp4
-rw-r--r--contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp47
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h13
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp6
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp3
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp5
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h5
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp2
-rw-r--r--contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp2
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp9
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp412
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp75
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFISelLowering.h4
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFInstrInfo.td29
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp10
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFMCInstLower.h1
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp32
-rw-r--r--contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp10
-rw-r--r--contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp26
-rw-r--r--contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp14
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp2
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h2
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp2
-rw-r--r--contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp466
-rw-r--r--contrib/llvm/lib/Target/Hexagon/BitTracker.cpp30
-rw-r--r--contrib/llvm/lib/Target/Hexagon/BitTracker.h13
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp1339
-rw-r--r--contrib/llvm/lib/Target/Hexagon/Hexagon.td40
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp244
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp415
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp26
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp77
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp13
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td35
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp30
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp3
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp30
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepArch.h10
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td19
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepDecoders.h64
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td1143
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td2504
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.h52
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.td47
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td3241
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td46012
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td654
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepOperands.td132
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h132
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp211
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp73
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp5
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp333
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h17
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp13
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp21
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp61
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp5
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp26
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIICHVX.td18
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIICScalar.td32
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp568
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp466
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h44
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td652
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td1019
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td290
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td77
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td216
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp761
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h47
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td4799
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td215
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td3301
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td497
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td2068
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td69
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td31
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td728
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp2341
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp9
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp65
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h11
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td204
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp65
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonOperands.td319
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp152
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td584
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp54
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td529
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp2
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp60
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h4
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td115
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td57
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td201
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td196
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td242
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonScheduleV62.td37
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp11
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp11
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp331
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h39
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td134
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp79
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h1
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp58
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h9
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp17
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h8
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp267
-rw-r--r--contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h8
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp276
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h143
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp10
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h5
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp42
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h8
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp3
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp461
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h139
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp332
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h11
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp23
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp96
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp68
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h15
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp54
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h6
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp408
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h107
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp143
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h43
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp220
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h20
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp397
-rw-r--r--contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h52
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp74
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFCopy.h11
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp14
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp407
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFGraph.h162
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp319
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFLiveness.h32
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFRegisters.cpp372
-rw-r--r--contrib/llvm/lib/Target/Hexagon/RDFRegisters.h219
-rw-r--r--contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp5
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp31
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h5
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td12
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp2
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp6
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h2
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp4
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h4
-rw-r--r--contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp2
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp12
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp2
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp8
-rw-r--r--contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp2
-rw-r--r--contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430.td14
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp4
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td8
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp7
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp8
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp355
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h6
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h6
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td16
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp2
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h10
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp6
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp27
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h11
-rw-r--r--contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp12
-rw-r--r--contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp1322
-rw-r--r--contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp87
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp12
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h41
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp82
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h10
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp70
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp10
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h15
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h3
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp6
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp61
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h27
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp10
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h6
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp16
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp8
-rw-r--r--contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp59
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td40
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td33
-rw-r--r--contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp392
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips.h13
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips.td10
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp22
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp36
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td2
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td12
-rw-r--r--contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td276
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp150
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h16
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCCState.cpp76
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCCState.h47
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsCallingConv.td10
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp119
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td5
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp68
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsFastISel.cpp144
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp2
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp27
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp511
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsISelLowering.h81
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrFPU.td50
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp41
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.h3
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsInstrInfo.td266
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp54
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td74
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td78
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMTInstrInfo.td98
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp32
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsMachineFunction.h34
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp18
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp15
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOptionRecord.h17
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsOs16.cpp8
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td66
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp63
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h9
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp342
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h3
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp264
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp41
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSchedule.td14
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td14
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td2
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp17
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsSubtarget.h32
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp69
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetMachine.h23
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp2
-rw-r--r--contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h6
-rw-r--r--contrib/llvm/lib/Target/Mips/Relocation.txt125
-rw-r--r--contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp12
-rw-r--r--contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTX.h5
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp50
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp2
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp2449
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h4
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp9320
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h34
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp4
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp583
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp9
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td5972
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td626
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp242
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp7
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp7
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h10
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp4
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp52
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td4
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSection.h2
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp9
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h2
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp21
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h6
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h7
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp2
-rw-r--r--contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp62
-rw-r--r--contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp25
-rw-r--r--contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h34
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2.h25
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2.td29
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td117
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td50
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2RegisterInfo.td60
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.cpp46
-rw-r--r--contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.h30
-rw-r--r--contrib/llvm/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp24
-rw-r--r--contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp17
-rw-r--r--contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp5
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp24
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h34
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp4
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp35
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp36
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h9
-rw-r--r--contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp12
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPC.h9
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp108
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp42
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp4
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp203
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp458
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp7
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp96
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp770
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp1159
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h141
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td60
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td56
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp137
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h7
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td287
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td339
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp6
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp4
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp22
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp5
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h73
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp132
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h1
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td4
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp24
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h13
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp25
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp120
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h31
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h14
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp40
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h14
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp6
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp2
-rw-r--r--contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp65
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp14
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp2
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp5
-rw-r--r--contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h2
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td3
-rw-r--r--contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp10
-rw-r--r--contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp56
-rw-r--r--contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp2
-rw-r--r--contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp4
-rwxr-xr-xcontrib/llvm/lib/Target/Sparc/LeonPasses.cpp17
-rwxr-xr-xcontrib/llvm/lib/Target/Sparc/LeonPasses.h11
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp37
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp7
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h8
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp37
-rw-r--r--contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp1
-rw-r--r--contrib/llvm/lib/Target/Sparc/Sparc.h2
-rw-r--r--contrib/llvm/lib/Target/Sparc/Sparc.td6
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp11
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp69
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcISelLowering.h4
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td12
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp2
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td6
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp1
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcSubtarget.h2
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp22
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h4
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp8
-rw-r--r--contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp58
-rw-r--r--contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp33
-rw-r--r--contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp7
-rw-r--r--contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h5
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp17
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp39
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp15
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp7
-rw-r--r--contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h1
-rw-r--r--contrib/llvm/lib/Target/SystemZ/README.txt7
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZ.td3
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp65
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td104
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp17
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp19
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp438
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h33
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrDFP.td231
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td120
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td834
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrHFP.td240
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp215
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h31
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td402
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrSystem.td521
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td406
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp4
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp83
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp4
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h28
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZOperands.td2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZOperators.td45
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td7
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td3
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td24
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td7
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td593
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td1611
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td524
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td533
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp50
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp14
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h69
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp4
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp35
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h17
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp587
-rw-r--r--contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h40
-rw-r--r--contrib/llvm/lib/Target/Target.cpp4
-rw-r--r--contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp23
-rw-r--r--contrib/llvm/lib/Target/TargetMachine.cpp11
-rw-r--r--contrib/llvm/lib/Target/TargetMachineC.cpp4
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp87
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp14
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp119
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h31
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp36
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h9
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp41
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp22
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h51
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp188
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h53
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp100
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/README.txt21
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssembly.h1
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp110
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h77
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp277
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp239
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp27
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp66
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp52
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp7
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp77
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp3
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp12
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td16
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td27
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td4
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp6
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td21
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td18
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp4
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp37
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp128
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h6
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp44
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp3
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp8
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp1323
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h37
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp4
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp36
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp10
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h8
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h2
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp26
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h9
-rw-r--r--contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt28
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp67
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h12
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp354
-rw-r--r--contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h40
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp161
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp45
-rw-r--r--contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h81
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp23
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h7
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp12
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp8
-rw-r--r--contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h4
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp12
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h7
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp29
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp10
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp40
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp27
-rw-r--r--contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp42
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp56
-rw-r--r--contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h8
-rw-r--r--contrib/llvm/lib/Target/X86/X86.h10
-rw-r--r--contrib/llvm/lib/Target/X86/X86.td112
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/X86/X86AsmPrinter.h5
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp75
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallLowering.cpp196
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallLowering.h11
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallingConv.cpp416
-rw-r--r--contrib/llvm/lib/Target/X86/X86CallingConv.td58
-rw-r--r--contrib/llvm/lib/Target/X86/X86CmovConversion.cpp611
-rwxr-xr-xcontrib/llvm/lib/Target/X86/X86EvexToVex.cpp40
-rw-r--r--contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp26
-rw-r--r--contrib/llvm/lib/Target/X86/X86FastISel.cpp190
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp55
-rw-r--r--contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp279
-rw-r--r--contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp38
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.cpp205
-rw-r--r--contrib/llvm/lib/Target/X86/X86FrameLowering.h5
-rw-r--r--contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def104
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp206
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.cpp6572
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.h117
-rw-r--r--contrib/llvm/lib/Target/X86/X86Instr3DNow.td22
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrAVX512.td2384
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrArithmetic.td34
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrBuilder.h2
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td6
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrCompiler.td44
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrControl.td31
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA.td29
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp5
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h11
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFPStack.td16
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFormats.td20
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td174
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.cpp1877
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.h57
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrInfo.td201
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrMMX.td44
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrMPX.td10
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSSE.td1366
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td82
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrSystem.td25
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrTSX.td15
-rwxr-xr-xcontrib/llvm/lib/Target/X86/X86InstrTablesInfo.h1162
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrVMX.td20
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstrXOP.td203
-rw-r--r--contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp1066
-rw-r--r--contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp128
-rw-r--r--contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h171
-rw-r--r--contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp351
-rw-r--r--contrib/llvm/lib/Target/X86/X86LegalizerInfo.h49
-rw-r--r--contrib/llvm/lib/Target/X86/X86MCInstLower.cpp385
-rw-r--r--contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp8
-rw-r--r--contrib/llvm/lib/Target/X86/X86MacroFusion.cpp202
-rw-r--r--contrib/llvm/lib/Target/X86/X86MacroFusion.h25
-rw-r--r--contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp54
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp245
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h82
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterBanks.td17
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp92
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.h5
-rw-r--r--contrib/llvm/lib/Target/X86/X86RegisterInfo.td36
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedHaswell.td33
-rw-r--r--contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td25
-rw-r--r--contrib/llvm/lib/Target/X86/X86Schedule.td7
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td125
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleSLM.td27
-rw-r--r--contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td223
-rw-r--r--contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp114
-rw-r--r--contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp78
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.cpp106
-rw-r--r--contrib/llvm/lib/Target/X86/X86Subtarget.h93
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.cpp129
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetMachine.h23
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp10
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetObjectFile.h5
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp519
-rw-r--r--contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h26
-rw-r--r--contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp118
-rw-r--r--contrib/llvm/lib/Target/X86/X86WinEHState.cpp6
-rw-r--r--contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp6
-rw-r--r--contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h6
-rw-r--r--contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp16
-rw-r--r--contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h8
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp2
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp15
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp70
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreISelLowering.h4
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td11
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp10
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp24
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h44
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp10
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp24
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h9
-rw-r--r--contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp2
-rw-r--r--contrib/llvm/lib/Testing/Support/Error.cpp22
-rw-r--r--contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp183
-rw-r--r--contrib/llvm/lib/ToolDrivers/llvm-dlltool/Options.td26
-rw-r--r--contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp (renamed from contrib/llvm/lib/LibDriver/LibDriver.cpp)26
-rw-r--r--contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td (renamed from contrib/llvm/lib/LibDriver/Options.td)0
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp5
-rw-r--r--contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp7
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 195
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h | 49
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 150
-rw-r--r--  contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp | 18
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 1308
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp | 26
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 18
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp | 155
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp | 14
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 180
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp | 304
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp | 152
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp | 57
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp | 13
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/IPO.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp | 13
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/Inliner.cpp | 276
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 520
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp | 323
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp | 893
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 236
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/PruneEH.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp | 333
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp | 26
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 379
-rw-r--r--  contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 766
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 481
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 2097
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 1463
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 415
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 1039
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 228
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 336
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 275
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp | 20
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp | 351
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp | 776
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 935
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 97
-rw-r--r--  contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 677
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 420
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 117
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp | 73
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp | 172
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 481
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp | 1
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 121
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 417
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp | 419
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 446
-rw-r--r--  contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 63
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h | 16
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h | 2
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h | 13
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp | 3
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 94
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp | 42
-rw-r--r--  contrib/llvm/lib/Transforms/ObjCARC/PtrState.h | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ADCE.cpp | 43
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp | 16
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/BDCE.cpp | 48
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp | 341
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp | 68
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DCE.cpp | 11
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 62
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 160
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp | 21
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVN.cpp | 519
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp | 94
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp | 883
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp | 28
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 58
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp | 162
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp | 969
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp | 654
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LICM.cpp | 170
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp | 280
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 261
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 46
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 483
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp | 6
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 165
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp | 108
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 7
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp | 330
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp | 91
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 3
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp | 40
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 706
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 196
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp | 336
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp | 3
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp | 176
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 147
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp | 181
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp | 22
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp | 3454
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp | 141
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 216
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SCCP.cpp | 199
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SROA.cpp | 127
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalar.cpp | 17
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp | 21
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 12
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 808
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 76
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/Sink.cpp | 14
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 4
-rw-r--r--  contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp | 66
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp | 24
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 13
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 779
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp | 533
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp | 141
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CloneModule.cpp | 22
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp | 10
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp | 389
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp | 19
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp | 3
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Evaluator.cpp | 9
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp | 34
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp | 16
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp | 23
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp | 208
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp | 15
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LCSSA.cpp | 109
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp | 256
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Local.cpp | 404
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp | 127
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp | 187
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp | 162
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp | 299
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp | 319
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp | 510
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp | 18
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp | 7
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp | 19
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp | 67
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp | 32
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp | 793
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 100
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp | 25
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 289
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 115
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp | 31
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 545
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp | 2
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/Utils.cpp | 5
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp | 495
-rw-r--r--  contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp | 27
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp | 3269
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 45
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3886
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 1107
-rw-r--r--  contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp | 5
-rw-r--r--  contrib/llvm/lib/XRay/CMakeLists.txt | 13
-rw-r--r--  contrib/llvm/lib/XRay/InstrumentationMap.cpp | 198
-rw-r--r--  contrib/llvm/lib/XRay/Trace.cpp | 403
1895 files changed, 256062 insertions, 108481 deletions
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
index 84da76b..4c29aea 100644
--- a/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -332,8 +332,8 @@ FunctionModRefBehavior AAResults::getModRefBehavior(const Function *F) {
ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
const MemoryLocation &Loc) {
- // Be conservative in the face of volatile/atomic.
- if (!L->isUnordered())
+ // Be conservative in the face of atomic.
+ if (isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered))
return MRI_ModRef;
// If the load address doesn't alias the given address, it doesn't read
@@ -347,8 +347,8 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
const MemoryLocation &Loc) {
- // Be conservative in the face of volatile/atomic.
- if (!S->isUnordered())
+ // Be conservative in the face of atomic.
+ if (isStrongerThan(S->getOrdering(), AtomicOrdering::Unordered))
return MRI_ModRef;
if (Loc.Ptr) {
@@ -367,6 +367,14 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
return MRI_Mod;
}
+ModRefInfo AAResults::getModRefInfo(const FenceInst *S, const MemoryLocation &Loc) {
+ // If we know that the location is a constant memory location, the fence
+ // cannot modify this location.
+ if (Loc.Ptr && pointsToConstantMemory(Loc))
+ return MRI_Ref;
+ return MRI_ModRef;
+}
+
ModRefInfo AAResults::getModRefInfo(const VAArgInst *V,
const MemoryLocation &Loc) {
@@ -689,7 +697,7 @@ AAResults llvm::createLegacyPMAAResults(Pass &P, Function &F,
bool llvm::isNoAliasCall(const Value *V) {
if (auto CS = ImmutableCallSite(V))
- return CS.paramHasAttr(0, Attribute::NoAlias);
+ return CS.hasRetAttr(Attribute::NoAlias);
return false;
}
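The load/store hunks above relax the old !isUnordered() gate, which also rejected volatile accesses, to a pure ordering comparison. A minimal sketch of the new predicate, using the isStrongerThan helper from llvm/Support/AtomicOrdering.h (the wrapper function name here is illustrative, not part of the patch):

    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/AtomicOrdering.h"
    using namespace llvm;

    // Only orderings strictly stronger than Unordered (Monotonic, Acquire,
    // and so on) force the conservative MRI_ModRef answer; a plain volatile
    // load or store now falls through to the usual alias check.
    static bool needsConservativeModRef(const LoadInst *L) {
      return isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered);
    }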
diff --git a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
index 4d6a6c9..435c782 100644
--- a/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/contrib/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -14,9 +14,9 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
diff --git a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
index 701b0e1..4dfa254 100644
--- a/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
+++ b/contrib/llvm/lib/Analysis/AliasSetTracker.cpp
@@ -17,8 +17,8 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -199,9 +199,10 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
// Check the unknown instructions...
if (!UnknownInsts.empty()) {
for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i)
- if (AA.getModRefInfo(UnknownInsts[i],
- MemoryLocation(Ptr, Size, AAInfo)) != MRI_NoModRef)
- return true;
+ if (auto *Inst = getUnknownInst(i))
+ if (AA.getModRefInfo(Inst, MemoryLocation(Ptr, Size, AAInfo)) !=
+ MRI_NoModRef)
+ return true;
}
return false;
@@ -217,10 +218,12 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst,
return false;
for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
- ImmutableCallSite C1(getUnknownInst(i)), C2(Inst);
- if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef ||
- AA.getModRefInfo(C2, C1) != MRI_NoModRef)
- return true;
+ if (auto *UnknownInst = getUnknownInst(i)) {
+ ImmutableCallSite C1(UnknownInst), C2(Inst);
+ if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef ||
+ AA.getModRefInfo(C2, C1) != MRI_NoModRef)
+ return true;
+ }
}
for (iterator I = begin(), E = end(); I != E; ++I)
@@ -471,7 +474,8 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
// If there are any call sites in the alias set, add them to this AST.
for (unsigned i = 0, e = AS.UnknownInsts.size(); i != e; ++i)
- add(AS.UnknownInsts[i]);
+ if (auto *Inst = AS.getUnknownInst(i))
+ add(Inst);
// Loop over all of the pointers in this alias set.
for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
@@ -489,19 +493,6 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
// dangling pointers to deleted instructions.
//
void AliasSetTracker::deleteValue(Value *PtrVal) {
- // If this is a call instruction, remove the callsite from the appropriate
- // AliasSet (if present).
- if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) {
- if (Inst->mayReadOrWriteMemory()) {
- // Scan all the alias sets to see if this call site is contained.
- for (iterator I = begin(), E = end(); I != E;) {
- iterator Cur = I++;
- if (!Cur->Forward)
- Cur->removeUnknownInst(*this, Inst);
- }
- }
- }
-
// First, look up the PointerRec for this pointer.
PointerMapType::iterator I = PointerMap.find_as(PtrVal);
if (I == PointerMap.end()) return; // Noop
@@ -633,7 +624,8 @@ void AliasSet::print(raw_ostream &OS) const {
OS << "\n " << UnknownInsts.size() << " Unknown instructions: ";
for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
if (i) OS << ", ";
- UnknownInsts[i]->printAsOperand(OS);
+ if (auto *I = getUnknownInst(i))
+ I->printAsOperand(OS);
}
}
OS << "\n";
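Every hunk above wraps UnknownInsts accesses in a null check, which suggests the container now holds weak value handles that are cleared when the underlying instruction dies; that would also explain why deleteValue no longer has to scan the alias sets for call sites. A sketch of the accessor such a change implies (the free-function form and names are assumptions, not the patched API):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/ValueHandle.h"
    using namespace llvm;

    // A WeakVH is nulled out when its value is deleted, so every reader
    // of the vector has to tolerate a null entry.
    static Instruction *getUnknownInst(ArrayRef<WeakVH> UnknownInsts,
                                       unsigned i) {
      Value *V = UnknownInsts[i];
      return V ? cast<Instruction>(V) : nullptr;
    }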
diff --git a/contrib/llvm/lib/Analysis/Analysis.cpp b/contrib/llvm/lib/Analysis/Analysis.cpp
index 0e7cf40..0e0b5c9 100644
--- a/contrib/llvm/lib/Analysis/Analysis.cpp
+++ b/contrib/llvm/lib/Analysis/Analysis.cpp
@@ -57,6 +57,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeLazyBranchProbabilityInfoPassPass(Registry);
initializeLazyBlockFrequencyInfoPassPass(Registry);
initializeLazyValueInfoWrapperPassPass(Registry);
+ initializeLazyValueInfoPrinterPass(Registry);
initializeLintPass(Registry);
initializeLoopInfoWrapperPassPass(Registry);
initializeMemDepPrinterPass(Registry);
@@ -78,6 +79,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeTypeBasedAAWrapperPassPass(Registry);
initializeScopedNoAliasAAWrapperPassPass(Registry);
initializeLCSSAVerificationPassPass(Registry);
+ initializeMemorySSAWrapperPassPass(Registry);
+ initializeMemorySSAPrinterLegacyPassPass(Registry);
}
void LLVMInitializeAnalysis(LLVMPassRegistryRef R) {
diff --git a/contrib/llvm/lib/Analysis/AssumptionCache.cpp b/contrib/llvm/lib/Analysis/AssumptionCache.cpp
index 5851594..3ff2789 100644
--- a/contrib/llvm/lib/Analysis/AssumptionCache.cpp
+++ b/contrib/llvm/lib/Analysis/AssumptionCache.cpp
@@ -24,15 +24,21 @@
using namespace llvm;
using namespace llvm::PatternMatch;
-SmallVector<WeakVH, 1> &AssumptionCache::getOrInsertAffectedValues(Value *V) {
+static cl::opt<bool>
+ VerifyAssumptionCache("verify-assumption-cache", cl::Hidden,
+ cl::desc("Enable verification of assumption cache"),
+ cl::init(false));
+
+SmallVector<WeakTrackingVH, 1> &
+AssumptionCache::getOrInsertAffectedValues(Value *V) {
// Try using find_as first to avoid creating extra value handles just for the
// purpose of doing the lookup.
auto AVI = AffectedValues.find_as(V);
if (AVI != AffectedValues.end())
return AVI->second;
- auto AVIP = AffectedValues.insert({
- AffectedValueCallbackVH(V, this), SmallVector<WeakVH, 1>()});
+ auto AVIP = AffectedValues.insert(
+ {AffectedValueCallbackVH(V, this), SmallVector<WeakTrackingVH, 1>()});
return AVIP.first->second;
}
@@ -47,9 +53,11 @@ void AssumptionCache::updateAffectedValues(CallInst *CI) {
} else if (auto *I = dyn_cast<Instruction>(V)) {
Affected.push_back(I);
- if (I->getOpcode() == Instruction::BitCast ||
- I->getOpcode() == Instruction::PtrToInt) {
- auto *Op = I->getOperand(0);
+ // Peek through unary operators to find the source of the condition.
+ Value *Op;
+ if (match(I, m_BitCast(m_Value(Op))) ||
+ match(I, m_PtrToInt(m_Value(Op))) ||
+ match(I, m_Not(m_Value(Op)))) {
if (isa<Instruction>(Op) || isa<Argument>(Op))
Affected.push_back(Op);
}
@@ -76,18 +84,11 @@ void AssumptionCache::updateAffectedValues(CallInst *CI) {
Value *B;
ConstantInt *C;
// (A & B) or (A | B) or (A ^ B).
- if (match(V,
- m_CombineOr(m_And(m_Value(A), m_Value(B)),
- m_CombineOr(m_Or(m_Value(A), m_Value(B)),
- m_Xor(m_Value(A), m_Value(B)))))) {
+ if (match(V, m_BitwiseLogic(m_Value(A), m_Value(B)))) {
AddAffected(A);
AddAffected(B);
// (A << C) or (A >>_s C) or (A >>_u C) where C is some constant.
- } else if (match(V,
- m_CombineOr(m_Shl(m_Value(A), m_ConstantInt(C)),
- m_CombineOr(m_LShr(m_Value(A), m_ConstantInt(C)),
- m_AShr(m_Value(A),
- m_ConstantInt(C)))))) {
+ } else if (match(V, m_Shift(m_Value(A), m_ConstantInt(C)))) {
AddAffected(A);
}
};
@@ -229,7 +230,13 @@ AssumptionCache &AssumptionCacheTracker::getAssumptionCache(Function &F) {
}
void AssumptionCacheTracker::verifyAnalysis() const {
-#ifndef NDEBUG
+ // FIXME: In the long term the verifier should not be controllable with a
+ // flag. We should either fix all passes to correctly update the assumption
+ // cache and enable the verifier unconditionally or somehow arrange for the
+ // assumption list to be updated automatically by passes.
+ if (!VerifyAssumptionCache)
+ return;
+
SmallPtrSet<const CallInst *, 4> AssumptionSet;
for (const auto &I : AssumptionCaches) {
for (auto &VH : I.second->assumptions())
@@ -238,11 +245,10 @@ void AssumptionCacheTracker::verifyAnalysis() const {
for (const BasicBlock &B : cast<Function>(*I.first))
for (const Instruction &II : B)
- if (match(&II, m_Intrinsic<Intrinsic::assume>()))
- assert(AssumptionSet.count(cast<CallInst>(&II)) &&
- "Assumption in scanned function not in cache");
+ if (match(&II, m_Intrinsic<Intrinsic::assume>()) &&
+ !AssumptionSet.count(cast<CallInst>(&II)))
+ report_fatal_error("Assumption in scanned function not in cache");
}
-#endif
}
AssumptionCacheTracker::AssumptionCacheTracker() : ImmutablePass(ID) {
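The matcher rewrite above is behavior-preserving: m_BitwiseLogic and m_Shift from llvm/IR/PatternMatch.h cover exactly the six opcodes the removed m_CombineOr chains spelled out. A compact sketch of the equivalence (the helper name is illustrative):

    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // m_BitwiseLogic matches and/or/xor; m_Shift matches shl/lshr/ashr.
    static bool matchesCondPiece(Value *V, Value *&A, Value *&B,
                                 ConstantInt *&C) {
      return match(V, m_BitwiseLogic(m_Value(A), m_Value(B))) ||
             match(V, m_Shift(m_Value(A), m_ConstantInt(C)));
    }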
diff --git a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index c8d0579..e682a64 100644
--- a/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -17,13 +17,13 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -36,6 +36,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include <algorithm>
#define DEBUG_TYPE "basicaa"
@@ -127,7 +128,9 @@ static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
const TargetLibraryInfo &TLI,
bool RoundToAlign = false) {
uint64_t Size;
- if (getObjectSize(V, Size, DL, &TLI, RoundToAlign))
+ ObjectSizeOpts Opts;
+ Opts.RoundToAlign = RoundToAlign;
+ if (getObjectSize(V, Size, DL, &TLI, Opts))
return Size;
return MemoryLocation::UnknownSize;
}
@@ -635,7 +638,7 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
/// Returns true if this is a writeonly (i.e Mod only) parameter.
static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx,
const TargetLibraryInfo &TLI) {
- if (CS.paramHasAttr(ArgIdx + 1, Attribute::WriteOnly))
+ if (CS.paramHasAttr(ArgIdx, Attribute::WriteOnly))
return true;
// We can bound the aliasing properties of memset_pattern16 just as we can
@@ -644,9 +647,9 @@ static bool isWriteOnlyParam(ImmutableCallSite CS, unsigned ArgIdx,
// whenever possible.
// FIXME Consider handling this in InferFunctionAttr.cpp together with other
// attributes.
- LibFunc::Func F;
+ LibFunc F;
if (CS.getCalledFunction() && TLI.getLibFunc(*CS.getCalledFunction(), F) &&
- F == LibFunc::memset_pattern16 && TLI.has(F))
+ F == LibFunc_memset_pattern16 && TLI.has(F))
if (ArgIdx == 0)
return true;
@@ -664,10 +667,10 @@ ModRefInfo BasicAAResult::getArgModRefInfo(ImmutableCallSite CS,
if (isWriteOnlyParam(CS, ArgIdx, TLI))
return MRI_Mod;
- if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadOnly))
+ if (CS.paramHasAttr(ArgIdx, Attribute::ReadOnly))
return MRI_Ref;
- if (CS.paramHasAttr(ArgIdx + 1, Attribute::ReadNone))
+ if (CS.paramHasAttr(ArgIdx, Attribute::ReadNone))
return MRI_NoModRef;
return AAResultBase::getArgModRefInfo(CS, ArgIdx);
@@ -680,8 +683,11 @@ static bool isIntrinsicCall(ImmutableCallSite CS, Intrinsic::ID IID) {
#ifndef NDEBUG
static const Function *getParent(const Value *V) {
- if (const Instruction *inst = dyn_cast<Instruction>(V))
+ if (const Instruction *inst = dyn_cast<Instruction>(V)) {
+ if (!inst->getParent())
+ return nullptr;
return inst->getParent()->getParent();
+ }
if (const Argument *arg = dyn_cast<Argument>(V))
return arg->getParent();
@@ -749,7 +755,11 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
// as an argument, and itself doesn't capture it.
if (!isa<Constant>(Object) && CS.getInstruction() != Object &&
isNonEscapingLocalObject(Object)) {
- bool PassedAsArg = false;
+
+ // Optimistically assume that the call doesn't touch Object and check this
+ // assumption in the following loop.
+ ModRefInfo Result = MRI_NoModRef;
+
unsigned OperandNo = 0;
for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
CI != CE; ++CI, ++OperandNo) {
@@ -761,20 +771,38 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
OperandNo < CS.getNumArgOperands() && !CS.isByValArgument(OperandNo)))
continue;
+ // The call doesn't access memory through this operand, so we don't care
+ // if it aliases with Object.
+ if (CS.doesNotAccessMemory(OperandNo))
+ continue;
+
// If this is a no-capture pointer argument, see if we can tell that it
- // is impossible to alias the pointer we're checking. If not, we have to
- // assume that the call could touch the pointer, even though it doesn't
- // escape.
+ // is impossible to alias the pointer we're checking.
AliasResult AR =
getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object));
- if (AR) {
- PassedAsArg = true;
- break;
+
+ // Operand doesn't alias 'Object'; continue looking for other aliases.
+ if (AR == NoAlias)
+ continue;
+ // Operand aliases 'Object', but the call doesn't modify it. Strengthen the
+ // initial assumption and keep looking in case there are more aliases.
+ if (CS.onlyReadsMemory(OperandNo)) {
+ Result = static_cast<ModRefInfo>(Result | MRI_Ref);
+ continue;
}
+ // Operand aliases 'Object', but the call only writes into it.
+ if (CS.doesNotReadMemory(OperandNo)) {
+ Result = static_cast<ModRefInfo>(Result | MRI_Mod);
+ continue;
+ }
+ // This operand aliases 'Object', and the call both reads and writes it.
+ Result = MRI_ModRef;
+ break;
}
- if (!PassedAsArg)
- return MRI_NoModRef;
+ // Early return if we improved the mod/ref information.
+ if (Result != MRI_ModRef)
+ return Result;
}
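The loop above is a join over the ModRefInfo bit lattice: it starts from the optimistic MRI_NoModRef and ORs in MRI_Ref or MRI_Mod per aliasing operand. A worked sketch of why one readonly plus one writeonly aliasing operand defeats the early return (the standalone function is illustrative, not part of the patch):

    #include "llvm/Analysis/AliasAnalysis.h"
    #include <cassert>
    using namespace llvm;

    static void modRefJoinExample() {
      ModRefInfo Result = MRI_NoModRef;
      Result = static_cast<ModRefInfo>(Result | MRI_Ref); // onlyReadsMemory operand
      Result = static_cast<ModRefInfo>(Result | MRI_Mod); // doesNotReadMemory operand
      assert(Result == MRI_ModRef && "Ref | Mod saturates to ModRef");
    }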
// If the CallSite is to malloc or calloc, we can assume that it doesn't
@@ -784,7 +812,7 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
// well. Or alternatively, replace all of this with inaccessiblememonly once
// that's implemented fully.
auto *Inst = CS.getInstruction();
- if (isMallocLikeFn(Inst, &TLI) || isCallocLikeFn(Inst, &TLI)) {
+ if (isMallocOrCallocLikeFn(Inst, &TLI)) {
// Be conservative if the accessed pointer may alias the allocation -
// fallback to the generic handling below.
if (getBestAAResults().alias(MemoryLocation(Inst), Loc) == NoAlias)
@@ -900,10 +928,9 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
uint64_t V2Size,
const DataLayout &DL) {
- assert(GEP1->getPointerOperand()->stripPointerCasts() ==
- GEP2->getPointerOperand()->stripPointerCasts() &&
- GEP1->getPointerOperand()->getType() ==
- GEP2->getPointerOperand()->getType() &&
+ assert(GEP1->getPointerOperand()->stripPointerCastsAndBarriers() ==
+ GEP2->getPointerOperand()->stripPointerCastsAndBarriers() &&
+ GEP1->getPointerOperandType() == GEP2->getPointerOperandType() &&
"Expected GEPs with the same pointer operand");
// Try to determine whether GEP1 and GEP2 index through arrays, into structs,
@@ -979,15 +1006,32 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
// Because they cannot partially overlap and because fields in an array
// cannot overlap, if we can prove the final indices are different between
// GEP1 and GEP2, we can conclude GEP1 and GEP2 don't alias.
-
+
// If the last indices are constants, we've already checked they don't
// equal each other so we can exit early.
if (C1 && C2)
return NoAlias;
- if (isKnownNonEqual(GEP1->getOperand(GEP1->getNumOperands() - 1),
- GEP2->getOperand(GEP2->getNumOperands() - 1),
- DL))
- return NoAlias;
+ {
+ Value *GEP1LastIdx = GEP1->getOperand(GEP1->getNumOperands() - 1);
+ Value *GEP2LastIdx = GEP2->getOperand(GEP2->getNumOperands() - 1);
+ if (isa<PHINode>(GEP1LastIdx) || isa<PHINode>(GEP2LastIdx)) {
+ // If one of the indices is a PHI node, be safe and only use
+ // computeKnownBits so we don't make any assumptions about the
+ // relationships between the two indices. This is important if we're
+ // asking about values from different loop iterations. See PR32314.
+ // TODO: We may be able to change the check so we only do this when
+ // we definitely looked through a PHINode.
+ if (GEP1LastIdx != GEP2LastIdx &&
+ GEP1LastIdx->getType() == GEP2LastIdx->getType()) {
+ KnownBits Known1 = computeKnownBits(GEP1LastIdx, DL);
+ KnownBits Known2 = computeKnownBits(GEP2LastIdx, DL);
+ if (Known1.Zero.intersects(Known2.One) ||
+ Known1.One.intersects(Known2.Zero))
+ return NoAlias;
+ }
+ } else if (isKnownNonEqual(GEP1LastIdx, GEP2LastIdx, DL))
+ return NoAlias;
+ }
return MayAlias;
} else if (!LastIndexedStruct || !C1 || !C2) {
return MayAlias;
@@ -1161,10 +1205,9 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// If we know the two GEPs are based off of the exact same pointer (and not
// just the same underlying object), see if that tells us anything about
// the resulting pointers.
- if (GEP1->getPointerOperand()->stripPointerCasts() ==
- GEP2->getPointerOperand()->stripPointerCasts() &&
- GEP1->getPointerOperand()->getType() ==
- GEP2->getPointerOperand()->getType()) {
+ if (GEP1->getPointerOperand()->stripPointerCastsAndBarriers() ==
+ GEP2->getPointerOperand()->stripPointerCastsAndBarriers() &&
+ GEP1->getPointerOperandType() == GEP2->getPointerOperandType()) {
AliasResult R = aliasSameBasePointerGEPs(GEP1, V1Size, GEP2, V2Size, DL);
// If we couldn't find anything interesting, don't abandon just yet.
if (R != MayAlias)
@@ -1261,9 +1304,9 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// give up if we can't determine conditions that hold for every cycle:
const Value *V = DecompGEP1.VarIndices[i].V;
- bool SignKnownZero, SignKnownOne;
- ComputeSignBit(const_cast<Value *>(V), SignKnownZero, SignKnownOne, DL,
- 0, &AC, nullptr, DT);
+ KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, DT);
+ bool SignKnownZero = Known.isNonNegative();
+ bool SignKnownOne = Known.isNegative();
// Zero-extension widens the variable, and so forces the sign
// bit to zero.
@@ -1305,11 +1348,7 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
// Statically, we can see that the base objects are the same, but the
// pointers have dynamic offsets which we can't resolve. And none of our
// little tricks above worked.
- //
- // TODO: Returning PartialAlias instead of MayAlias is a mild hack; the
- // practical effect of this is protecting TBAA in the case of dynamic
- // indices into arrays of unions or malloc'd memory.
- return PartialAlias;
+ return MayAlias;
}
static AliasResult MergeAliasResults(AliasResult A, AliasResult B) {
@@ -1478,8 +1517,8 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, uint64_t V1Size,
return NoAlias;
// Strip off any casts if they exist.
- V1 = V1->stripPointerCasts();
- V2 = V2->stripPointerCasts();
+ V1 = V1->stripPointerCastsAndBarriers();
+ V2 = V2->stripPointerCastsAndBarriers();
// If V1 or V2 is undef, the result is NoAlias because we can always pick a
// value for undef that aliases nothing in the program.
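The PHI-node hunk above swaps isKnownNonEqual for a bit-level disjointness test on KnownBits. The test itself is tiny; a sketch, assuming two results of equal bit width (the helper name is illustrative):

    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    // If one index is known to hold a 0 in some bit position where the
    // other is known to hold a 1, the indices can never be equal, so the
    // two GEPs cannot alias.
    static bool provablyUnequal(const KnownBits &K1, const KnownBits &K2) {
      return K1.Zero.intersects(K2.One) || K1.One.intersects(K2.Zero);
    }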
diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp
index 4cdbe4d..07a2a92 100644
--- a/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfo.cpp
@@ -26,7 +26,6 @@ using namespace llvm;
#define DEBUG_TYPE "block-freq"
-#ifndef NDEBUG
static cl::opt<GVDAGType> ViewBlockFreqPropagationDAG(
"view-block-freq-propagation-dags", cl::Hidden,
cl::desc("Pop up a window to show a dag displaying how block "
@@ -55,8 +54,29 @@ cl::opt<unsigned>
"is no less than the max frequency of the "
"function multiplied by this percent."));
+// Command line option to turn on CFG dot dump after profile annotation.
+cl::opt<bool>
+ PGOViewCounts("pgo-view-counts", cl::init(false), cl::Hidden,
+ cl::desc("A boolean option to show CFG dag with "
+ "block profile counts and branch probabilities "
+ "right after PGO profile annotation step. The "
+ "profile counts are computed using branch "
+ "probabilities from the runtime profile data and "
+ "block frequency propagation algorithm. To view "
+ "the raw counts from the profile, use option "
+ "-pgo-view-raw-counts instead. To limit graph "
+ "display to only one function, use filtering option "
+ "-view-bfi-func-name."));
+
namespace llvm {
+static GVDAGType getGVDT() {
+
+ if (PGOViewCounts)
+ return GVDT_Count;
+ return ViewBlockFreqPropagationDAG;
+}
+
template <>
struct GraphTraits<BlockFrequencyInfo *> {
typedef const BasicBlock *NodeRef;
@@ -89,8 +109,7 @@ struct DOTGraphTraits<BlockFrequencyInfo *> : public BFIDOTGTraitsBase {
std::string getNodeLabel(const BasicBlock *Node,
const BlockFrequencyInfo *Graph) {
- return BFIDOTGTraitsBase::getNodeLabel(Node, Graph,
- ViewBlockFreqPropagationDAG);
+ return BFIDOTGTraitsBase::getNodeLabel(Node, Graph, getGVDT());
}
std::string getNodeAttributes(const BasicBlock *Node,
@@ -107,7 +126,6 @@ struct DOTGraphTraits<BlockFrequencyInfo *> : public BFIDOTGTraitsBase {
};
} // end namespace llvm
-#endif
BlockFrequencyInfo::BlockFrequencyInfo() {}
@@ -132,19 +150,26 @@ BlockFrequencyInfo &BlockFrequencyInfo::operator=(BlockFrequencyInfo &&RHS) {
// template instantiated which is not available in the header.
BlockFrequencyInfo::~BlockFrequencyInfo() {}
+bool BlockFrequencyInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<BlockFrequencyAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
void BlockFrequencyInfo::calculate(const Function &F,
const BranchProbabilityInfo &BPI,
const LoopInfo &LI) {
if (!BFI)
BFI.reset(new ImplType);
BFI->calculate(F, BPI, LI);
-#ifndef NDEBUG
if (ViewBlockFreqPropagationDAG != GVDT_None &&
(ViewBlockFreqFuncName.empty() ||
F.getName().equals(ViewBlockFreqFuncName))) {
view();
}
-#endif
}
BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
@@ -171,16 +196,32 @@ void BlockFrequencyInfo::setBlockFreq(const BasicBlock *BB, uint64_t Freq) {
BFI->setBlockFreq(BB, Freq);
}
+void BlockFrequencyInfo::setBlockFreqAndScale(
+ const BasicBlock *ReferenceBB, uint64_t Freq,
+ SmallPtrSetImpl<BasicBlock *> &BlocksToScale) {
+ assert(BFI && "Expected analysis to be available");
+ // Use 128 bits APInt to avoid overflow.
+ APInt NewFreq(128, Freq);
+ APInt OldFreq(128, BFI->getBlockFreq(ReferenceBB).getFrequency());
+ APInt BBFreq(128, 0);
+ for (auto *BB : BlocksToScale) {
+ BBFreq = BFI->getBlockFreq(BB).getFrequency();
+ // Multiply first by NewFreq and then divide by OldFreq
+ // to minimize loss of precision.
+ BBFreq *= NewFreq;
+ // udiv is an expensive operation in the general case. If this ends up being
+ // a hot spot, one of the options proposed in
+ // https://reviews.llvm.org/D28535#650071 could be used to avoid this.
+ BBFreq = BBFreq.udiv(OldFreq);
+ BFI->setBlockFreq(BB, BBFreq.getLimitedValue());
+ }
+ BFI->setBlockFreq(ReferenceBB, Freq);
+}
+
/// Pop up a ghostview window with the current block frequency propagation
/// rendered using dot.
void BlockFrequencyInfo::view() const {
-// This code is only for debugging.
-#ifndef NDEBUG
ViewGraph(const_cast<BlockFrequencyInfo *>(this), "BlockFrequencyDAGs");
-#else
- errs() << "BlockFrequencyInfo::view is only available in debug builds on "
- "systems with Graphviz or gv!\n";
-#endif // NDEBUG
}
const Function *BlockFrequencyInfo::getFunction() const {
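The arithmetic in setBlockFreqAndScale above is the usual widen-multiply-then-divide pattern. A condensed sketch of one scaling step (the standalone function is illustrative):

    #include "llvm/ADT/APInt.h"
    using namespace llvm;

    // Scale BBFreq by NewFreq/OldFreq in 128-bit arithmetic: multiply
    // first and divide last, so precision is lost only in the final udiv.
    static uint64_t scaleOne(uint64_t BBFreq, uint64_t NewFreq,
                             uint64_t OldFreq) {
      APInt F(128, BBFreq);
      F *= APInt(128, NewFreq);
      F = F.udiv(APInt(128, OldFreq));
      return F.getLimitedValue();
    }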
diff --git a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 9850e02..e5d8c33 100644
--- a/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/contrib/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -28,7 +28,9 @@ ScaledNumber<uint64_t> BlockMass::toScaled() const {
return ScaledNumber<uint64_t>(getMass() + 1, -64);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void BlockMass::dump() const { print(dbgs()); }
+#endif
static char getHexDigit(int N) {
assert(N < 16);
diff --git a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index 3eabb78..a329e5a 100644
--- a/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/contrib/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -14,6 +14,7 @@
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -30,6 +31,7 @@ using namespace llvm;
INITIALIZE_PASS_BEGIN(BranchProbabilityInfoWrapperPass, "branch-prob",
"Branch Probability Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(BranchProbabilityInfoWrapperPass, "branch-prob",
"Branch Probability Analysis", false, true)
@@ -58,19 +60,12 @@ char BranchProbabilityInfoWrapperPass::ID = 0;
static const uint32_t LBH_TAKEN_WEIGHT = 124;
static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
-/// \brief Unreachable-terminating branch taken weight.
+/// \brief Unreachable-terminating branch taken probability.
///
-/// This is the weight for a branch being taken to a block that terminates
+/// This is the probability for a branch being taken to a block that terminates
/// (eventually) in unreachable. These are predicted as unlikely as possible.
-static const uint32_t UR_TAKEN_WEIGHT = 1;
-
-/// \brief Unreachable-terminating branch not-taken weight.
-///
-/// This is the weight for a branch not being taken toward a block that
-/// terminates (eventually) in unreachable. Such a branch is essentially never
-/// taken. Set the weight to an absurdly high value so that nested loops don't
-/// easily subsume it.
-static const uint32_t UR_NONTAKEN_WEIGHT = 1024*1024 - 1;
+/// All reachable probability will equally share the remaining part.
+static const BranchProbability UR_TAKEN_PROB = BranchProbability::getRaw(1);
/// \brief Weight for a branch taken going into a cold block.
///
@@ -108,11 +103,9 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1;
/// instruction. This is essentially never taken.
static const uint32_t IH_NONTAKEN_WEIGHT = 1;
-/// \brief Calculate edge weights for successors lead to unreachable.
-///
-/// Predict that a successor which leads necessarily to an
-/// unreachable-terminated block as extremely unlikely.
-bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
+/// \brief Add \p BB to PostDominatedByUnreachable set if applicable.
+void
+BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
const TerminatorInst *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 0) {
if (isa<UnreachableInst>(TI) ||
@@ -122,39 +115,85 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
// never execute.
BB->getTerminatingDeoptimizeCall())
PostDominatedByUnreachable.insert(BB);
- return false;
+ return;
+ }
+
+ // If the terminator is an InvokeInst, check only the normal destination block
+ // as the unwind edge of InvokeInst is also very unlikely taken.
+ if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ if (PostDominatedByUnreachable.count(II->getNormalDest()))
+ PostDominatedByUnreachable.insert(BB);
+ return;
+ }
+
+ for (auto *I : successors(BB))
+ // If any successor is not post-dominated, then BB is not either.
+ if (!PostDominatedByUnreachable.count(I))
+ return;
+
+ PostDominatedByUnreachable.insert(BB);
+}
+
+/// \brief Add \p BB to PostDominatedByColdCall set if applicable.
+void
+BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
+ assert(!PostDominatedByColdCall.count(BB));
+ const TerminatorInst *TI = BB->getTerminator();
+ if (TI->getNumSuccessors() == 0)
+ return;
+
+ // If all successors are post-dominated, then BB is too.
+ if (llvm::all_of(successors(BB), [&](const BasicBlock *SuccBB) {
+ return PostDominatedByColdCall.count(SuccBB);
+ })) {
+ PostDominatedByColdCall.insert(BB);
+ return;
}
+ // If the terminator is an InvokeInst, check only the normal destination
+ // block as the unwind edge of InvokeInst is also very unlikely taken.
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ if (PostDominatedByColdCall.count(II->getNormalDest())) {
+ PostDominatedByColdCall.insert(BB);
+ return;
+ }
+
+ // Otherwise, if the block itself contains a cold function, add it to the
+ // set of blocks post-dominated by a cold call.
+ for (auto &I : *BB)
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (CI->hasFnAttr(Attribute::Cold)) {
+ PostDominatedByColdCall.insert(BB);
+ return;
+ }
+}
+
+/// \brief Calculate edge weights for successors that lead to unreachable.
+///
+/// Predict that a successor which necessarily leads to an
+/// unreachable-terminated block is extremely unlikely to be taken.
+bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
+ const TerminatorInst *TI = BB->getTerminator();
+ assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
+
+ // Return false here so that edge weights for InvokeInst could be decided
+ // in calcInvokeHeuristics().
+ if (isa<InvokeInst>(TI))
+ return false;
+
SmallVector<unsigned, 4> UnreachableEdges;
SmallVector<unsigned, 4> ReachableEdges;
- for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
if (PostDominatedByUnreachable.count(*I))
UnreachableEdges.push_back(I.getSuccessorIndex());
else
ReachableEdges.push_back(I.getSuccessorIndex());
- }
- // If all successors are in the set of blocks post-dominated by unreachable,
- // this block is too.
- if (UnreachableEdges.size() == TI->getNumSuccessors())
- PostDominatedByUnreachable.insert(BB);
-
- // Skip probabilities if this block has a single successor or if all were
- // reachable.
- if (TI->getNumSuccessors() == 1 || UnreachableEdges.empty())
+ // Skip probabilities if all were reachable.
+ if (UnreachableEdges.empty())
return false;
- // If the terminator is an InvokeInst, check only the normal destination block
- // as the unwind edge of InvokeInst is also very unlikely taken.
- if (auto *II = dyn_cast<InvokeInst>(TI))
- if (PostDominatedByUnreachable.count(II->getNormalDest())) {
- PostDominatedByUnreachable.insert(BB);
- // Return false here so that edge weights for InvokeInst could be decided
- // in calcInvokeHeuristics().
- return false;
- }
-
if (ReachableEdges.empty()) {
BranchProbability Prob(1, UnreachableEdges.size());
for (unsigned SuccIdx : UnreachableEdges)
@@ -162,12 +201,10 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
return true;
}
- auto UnreachableProb = BranchProbability::getBranchProbability(
- UR_TAKEN_WEIGHT, (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) *
- uint64_t(UnreachableEdges.size()));
- auto ReachableProb = BranchProbability::getBranchProbability(
- UR_NONTAKEN_WEIGHT,
- (UR_TAKEN_WEIGHT + UR_NONTAKEN_WEIGHT) * uint64_t(ReachableEdges.size()));
+ auto UnreachableProb = UR_TAKEN_PROB;
+ auto ReachableProb =
+ (BranchProbability::getOne() - UR_TAKEN_PROB * UnreachableEdges.size()) /
+ ReachableEdges.size();
for (unsigned SuccIdx : UnreachableEdges)
setEdgeProbability(BB, SuccIdx, UnreachableProb);
@@ -178,11 +215,12 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
}
// Propagate existing explicit probabilities from either profile data or
-// 'expect' intrinsic processing.
+// 'expect' intrinsic processing. The metadata is examined against the
+// unreachable heuristic: the probability of an edge leading to an unreachable
+// block is set to the minimum of the metadata value and the heuristic.
bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
const TerminatorInst *TI = BB->getTerminator();
- if (TI->getNumSuccessors() == 1)
- return false;
+ assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
return false;
@@ -203,6 +241,8 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
// be scaled to fit in 32 bits.
uint64_t WeightSum = 0;
SmallVector<uint32_t, 2> Weights;
+ SmallVector<unsigned, 2> UnreachableIdxs;
+ SmallVector<unsigned, 2> ReachableIdxs;
Weights.reserve(TI->getNumSuccessors());
for (unsigned i = 1, e = WeightsNode->getNumOperands(); i != e; ++i) {
ConstantInt *Weight =
@@ -213,6 +253,10 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
"Too many bits for uint32_t");
Weights.push_back(Weight->getZExtValue());
WeightSum += Weights.back();
+ if (PostDominatedByUnreachable.count(TI->getSuccessor(i - 1)))
+ UnreachableIdxs.push_back(i - 1);
+ else
+ ReachableIdxs.push_back(i - 1);
}
assert(Weights.size() == TI->getNumSuccessors() && "Checked above");
@@ -221,22 +265,49 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
uint64_t ScalingFactor =
(WeightSum > UINT32_MAX) ? WeightSum / UINT32_MAX + 1 : 1;
- WeightSum = 0;
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
- Weights[i] /= ScalingFactor;
- WeightSum += Weights[i];
+ if (ScalingFactor > 1) {
+ WeightSum = 0;
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ Weights[i] /= ScalingFactor;
+ WeightSum += Weights[i];
+ }
}
+ assert(WeightSum <= UINT32_MAX &&
+ "Expected weights to scale down to 32 bits");
- if (WeightSum == 0) {
+ if (WeightSum == 0 || ReachableIdxs.size() == 0) {
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- setEdgeProbability(BB, i, {1, e});
- } else {
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- setEdgeProbability(BB, i, {Weights[i], static_cast<uint32_t>(WeightSum)});
+ Weights[i] = 1;
+ WeightSum = TI->getNumSuccessors();
}
- assert(WeightSum <= UINT32_MAX &&
- "Expected weights to scale down to 32 bits");
+ // Set the probability.
+ SmallVector<BranchProbability, 2> BP;
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ BP.push_back({ Weights[i], static_cast<uint32_t>(WeightSum) });
+
+ // Examine the metadata against the unreachable heuristic.
+ // If the unreachable heuristic is stronger, use it for this edge.
+ if (UnreachableIdxs.size() > 0 && ReachableIdxs.size() > 0) {
+ auto ToDistribute = BranchProbability::getZero();
+ auto UnreachableProb = UR_TAKEN_PROB;
+ for (auto i : UnreachableIdxs)
+ if (UnreachableProb < BP[i]) {
+ ToDistribute += BP[i] - UnreachableProb;
+ BP[i] = UnreachableProb;
+ }
+
+ // If we modified the probability of some edges then we must distribute
+ // the difference between reachable blocks.
+ if (ToDistribute > BranchProbability::getZero()) {
+ BranchProbability PerEdge = ToDistribute / ReachableIdxs.size();
+ for (auto i : ReachableIdxs)
+ BP[i] += PerEdge;
+ }
+ }
+
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ setEdgeProbability(BB, i, BP[i]);
return true;
}
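A concrete instance of the clamping above, assuming BranchProbability's fixed-point denominator of 2^31: with !prof weights {1, 100} where the first successor post-dominates unreachable, the metadata probabilities are {1/101, 100/101}. Since 1/101 exceeds UR_TAKEN_PROB (raw value 1, i.e. 1/2^31), that edge is clamped to 1/2^31 and the freed difference, 1/101 - 1/2^31, is added to the lone reachable edge, leaving it with 1 - 1/2^31.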
@@ -251,7 +322,11 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
/// Return false, otherwise.
bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
const TerminatorInst *TI = BB->getTerminator();
- if (TI->getNumSuccessors() == 0)
+ assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
+
+ // Return false here so that edge weights for InvokeInst could be decided
+ // in calcInvokeHeuristics().
+ if (isa<InvokeInst>(TI))
return false;
// Determine which successors are post-dominated by a cold block.
@@ -263,34 +338,8 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
else
NormalEdges.push_back(I.getSuccessorIndex());
- // If all successors are in the set of blocks post-dominated by cold calls,
- // this block is in the set post-dominated by cold calls.
- if (ColdEdges.size() == TI->getNumSuccessors())
- PostDominatedByColdCall.insert(BB);
- else {
- // Otherwise, if the block itself contains a cold function, add it to the
- // set of blocks postdominated by a cold call.
- assert(!PostDominatedByColdCall.count(BB));
- for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- if (const CallInst *CI = dyn_cast<CallInst>(I))
- if (CI->hasFnAttr(Attribute::Cold)) {
- PostDominatedByColdCall.insert(BB);
- break;
- }
- }
-
- if (auto *II = dyn_cast<InvokeInst>(TI)) {
- // If the terminator is an InvokeInst, consider only the normal destination
- // block.
- if (PostDominatedByColdCall.count(II->getNormalDest()))
- PostDominatedByColdCall.insert(BB);
- // Return false here so that edge weights for InvokeInst could be decided
- // in calcInvokeHeuristics().
- return false;
- }
-
- // Skip probabilities if this block has a single successor.
- if (TI->getNumSuccessors() == 1 || ColdEdges.empty())
+ // Skip probabilities if no cold edges.
+ if (ColdEdges.empty())
return false;
if (NormalEdges.empty()) {
@@ -410,7 +459,8 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
return true;
}
-bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB) {
+bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB,
+ const TargetLibraryInfo *TLI) {
const BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
@@ -433,8 +483,37 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB) {
if (AndRHS->getUniqueInteger().isPowerOf2())
return false;
+ // Check if the LHS is the return value of a library function
+ LibFunc Func = NumLibFuncs;
+ if (TLI)
+ if (CallInst *Call = dyn_cast<CallInst>(CI->getOperand(0)))
+ if (Function *CalledFn = Call->getCalledFunction())
+ TLI->getLibFunc(*CalledFn, Func);
+
bool isProb;
- if (CV->isZero()) {
+ if (Func == LibFunc_strcasecmp ||
+ Func == LibFunc_strcmp ||
+ Func == LibFunc_strncasecmp ||
+ Func == LibFunc_strncmp ||
+ Func == LibFunc_memcmp) {
+ // strcmp and similar functions return zero, negative, or positive if the
+ // first string is equal to, less than, or greater than the second. We
+ // consider it likely that the strings are not equal, so a comparison with
+ // zero is probably false; a comparison with any other number is also
+ // probably false, since what exactly is returned for nonzero values is
+ // not specified. For any comparison other than equality we can say
+ // nothing.
+ switch (CI->getPredicate()) {
+ case CmpInst::ICMP_EQ:
+ isProb = false;
+ break;
+ case CmpInst::ICMP_NE:
+ isProb = true;
+ break;
+ default:
+ return false;
+ }
+ } else if (CV->isZero()) {
switch (CI->getPredicate()) {
case CmpInst::ICMP_EQ:
// X == 0 -> Unlikely
@@ -459,7 +538,7 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB) {
// InstCombine canonicalizes X <= 0 into X < 1.
// X <= 0 -> Unlikely
isProb = false;
- } else if (CV->isAllOnesValue()) {
+ } else if (CV->isMinusOne()) {
switch (CI->getPredicate()) {
case CmpInst::ICMP_EQ:
// X == -1 -> Unlikely
@@ -660,7 +739,8 @@ void BranchProbabilityInfo::eraseBlock(const BasicBlock *BB) {
}
}
-void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI) {
+void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
+ const TargetLibraryInfo *TLI) {
DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
<< " ----\n\n");
LastF = &F; // Store the last function we ran on for printing.
@@ -671,17 +751,22 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI) {
// the successors of a block iteratively.
for (auto BB : post_order(&F.getEntryBlock())) {
DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n");
- if (calcUnreachableHeuristics(BB))
+ updatePostDominatedByUnreachable(BB);
+ updatePostDominatedByColdCall(BB);
+ // If there are not at least two successors, there is no probability to set.
+ if (BB->getTerminator()->getNumSuccessors() < 2)
continue;
if (calcMetadataWeights(BB))
continue;
+ if (calcUnreachableHeuristics(BB))
+ continue;
if (calcColdCallHeuristics(BB))
continue;
if (calcLoopBranchHeuristics(BB, LI))
continue;
if (calcPointerHeuristics(BB))
continue;
- if (calcZeroHeuristics(BB))
+ if (calcZeroHeuristics(BB, TLI))
continue;
if (calcFloatingPointHeuristics(BB))
continue;
@@ -695,12 +780,14 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI) {
void BranchProbabilityInfoWrapperPass::getAnalysisUsage(
AnalysisUsage &AU) const {
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
}
bool BranchProbabilityInfoWrapperPass::runOnFunction(Function &F) {
const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- BPI.calculate(F, LI);
+ const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ BPI.calculate(F, LI, &TLI);
return false;
}
@@ -715,7 +802,7 @@ AnalysisKey BranchProbabilityAnalysis::Key;
BranchProbabilityInfo
BranchProbabilityAnalysis::run(Function &F, FunctionAnalysisManager &AM) {
BranchProbabilityInfo BPI;
- BPI.calculate(F, AM.getResult<LoopAnalysis>(F));
+ BPI.calculate(F, AM.getResult<LoopAnalysis>(F), &AM.getResult<TargetLibraryAnalysis>(F));
return BPI;
}
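Under the reworked heuristic the reachable edges simply share whatever the unreachable edges do not take. A minimal sketch of the split, using the same BranchProbability operations as the hunks above (the standalone function is illustrative):

    #include "llvm/Support/BranchProbability.h"
    using namespace llvm;

    // For U unreachable-bound and R reachable successors: each unreachable
    // edge gets the minimal raw probability, and the reachable edges split
    // the remainder evenly.
    static void splitProbs(unsigned U, unsigned R,
                           BranchProbability &UnreachableProb,
                           BranchProbability &ReachableProb) {
      UnreachableProb = BranchProbability::getRaw(1);
      ReachableProb =
          (BranchProbability::getOne() - UnreachableProb * U) / R;
    }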
diff --git a/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
index e48ff23..0de7ad9 100644
--- a/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -68,17 +68,6 @@ CFLAndersAAResult::CFLAndersAAResult(CFLAndersAAResult &&RHS)
: AAResultBase(std::move(RHS)), TLI(RHS.TLI) {}
CFLAndersAAResult::~CFLAndersAAResult() {}
-static const Function *parentFunctionOfValue(const Value *Val) {
- if (auto *Inst = dyn_cast<Instruction>(Val)) {
- auto *Bb = Inst->getParent();
- return Bb->getParent();
- }
-
- if (auto *Arg = dyn_cast<Argument>(Val))
- return Arg->getParent();
- return nullptr;
-}
-
namespace {
enum class MatchState : uint8_t {
@@ -307,7 +296,7 @@ class CFLAndersAAResult::FunctionInfo {
public:
FunctionInfo(const Function &, const SmallVectorImpl<Value *> &,
- const ReachabilitySet &, AliasAttrMap);
+ const ReachabilitySet &, const AliasAttrMap &);
bool mayAlias(const Value *, uint64_t, const Value *, uint64_t) const;
const AliasSummary &getAliasSummary() const { return Summary; }
@@ -470,7 +459,7 @@ static void populateExternalAttributes(
CFLAndersAAResult::FunctionInfo::FunctionInfo(
const Function &Fn, const SmallVectorImpl<Value *> &RetVals,
- const ReachabilitySet &ReachSet, AliasAttrMap AMap) {
+ const ReachabilitySet &ReachSet, const AliasAttrMap &AMap) {
populateAttrMap(AttrMap, AMap);
populateExternalAttributes(Summary.RetParamAttributes, Fn, RetVals, AMap);
populateAliasMap(AliasMap, ReachSet);
@@ -789,10 +778,10 @@ void CFLAndersAAResult::scan(const Function &Fn) {
// resize and invalidating the reference returned by operator[]
auto FunInfo = buildInfoFrom(Fn);
Cache[&Fn] = std::move(FunInfo);
- Handles.push_front(FunctionHandle(const_cast<Function *>(&Fn), this));
+ Handles.emplace_front(const_cast<Function *>(&Fn), this);
}
-void CFLAndersAAResult::evict(const Function &Fn) { Cache.erase(&Fn); }
+void CFLAndersAAResult::evict(const Function *Fn) { Cache.erase(Fn); }
const Optional<CFLAndersAAResult::FunctionInfo> &
CFLAndersAAResult::ensureCached(const Function &Fn) {
diff --git a/contrib/llvm/lib/Analysis/CFLGraph.h b/contrib/llvm/lib/Analysis/CFLGraph.h
index e526e0e..95874b8 100644
--- a/contrib/llvm/lib/Analysis/CFLGraph.h
+++ b/contrib/llvm/lib/Analysis/CFLGraph.h
@@ -16,7 +16,6 @@
#define LLVM_ANALYSIS_CFLGRAPH_H
#include "AliasAnalysisSummary.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
@@ -210,6 +209,11 @@ template <typename CFLAA> class CFLGraphBuilder {
void addDerefEdge(Value *From, Value *To, bool IsRead) {
assert(From != nullptr && To != nullptr);
+ // FIXME: This is subtly broken, due to how we model some instructions
+ // (e.g. extractvalue, extractelement) as loads. Since those take
+ // non-pointer operands, we'll entirely skip adding edges for those.
+ //
+ // addAssignEdge seems to have a similar issue with insertvalue, etc.
if (!From->getType()->isPointerTy() || !To->getType()->isPointerTy())
return;
addNode(From);
@@ -400,8 +404,7 @@ template <typename CFLAA> class CFLGraphBuilder {
// TODO: address other common library functions such as realloc(),
// strdup(),
// etc.
- if (isMallocLikeFn(Inst, &TLI) || isCallocLikeFn(Inst, &TLI) ||
- isFreeCall(Inst, &TLI))
+ if (isMallocOrCallocLikeFn(Inst, &TLI) || isFreeCall(Inst, &TLI))
return;
// TODO: Add support for noalias args/all the other fun function
@@ -430,7 +433,7 @@ template <typename CFLAA> class CFLGraphBuilder {
if (Inst->getType()->isPointerTy()) {
auto *Fn = CS.getCalledFunction();
- if (Fn == nullptr || !Fn->doesNotAlias(0))
+ if (Fn == nullptr || !Fn->returnDoesNotAlias())
// No need to call addNode() since we've added Inst at the
// beginning of this function and we know it is not a global.
Graph.addAttr(InstantiatedValue{Inst, 0}, getAttrUnknown());
@@ -541,6 +544,7 @@ template <typename CFLAA> class CFLGraphBuilder {
case Instruction::ExtractValue: {
auto *Ptr = CE->getOperand(0);
addLoadEdge(Ptr, CE);
+ break;
}
case Instruction::ShuffleVector: {
auto *From1 = CE->getOperand(0);
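The lone added break above is the substantive fix in this hunk: without it, an ExtractValue constant expression fell through into the ShuffleVector case and picked up spurious assign edges. A self-contained illustration of the fall-through pattern (toy names, not the CFLGraph API):

    #include <cstdio>

    enum CEKind { ExtractValue, ShuffleVector };

    static void visitConstantExpr(CEKind K) {
      switch (K) {
      case ExtractValue:
        std::printf("addLoadEdge(Ptr, CE)\n");
        break; // the added break: stop here instead of falling through
      case ShuffleVector:
        std::printf("addAssignEdge(From1, CE); addAssignEdge(From2, CE)\n");
        break;
      }
    }

    int main() { visitConstantExpr(ExtractValue); } // prints only the load edge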
diff --git a/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
index dde24ef..adbdd82 100644
--- a/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
@@ -80,9 +80,6 @@ public:
const AliasSummary &getAliasSummary() const { return Summary; }
};
-/// Try to go from a Value* to a Function*. Never returns nullptr.
-static Optional<Function *> parentFunctionOfValue(Value *);
-
const StratifiedIndex StratifiedLink::SetSentinel =
std::numeric_limits<StratifiedIndex>::max();
@@ -91,19 +88,6 @@ const StratifiedIndex StratifiedLink::SetSentinel =
//===----------------------------------------------------------------------===//
/// Determines whether it would be pointless to add the given Value to our sets.
-static bool canSkipAddingToSets(Value *Val);
-
-static Optional<Function *> parentFunctionOfValue(Value *Val) {
- if (auto *Inst = dyn_cast<Instruction>(Val)) {
- auto *Bb = Inst->getParent();
- return Bb->getParent();
- }
-
- if (auto *Arg = dyn_cast<Argument>(Val))
- return Arg->getParent();
- return None;
-}
-
static bool canSkipAddingToSets(Value *Val) {
// Constants can share instances, which may falsely unify multiple
// sets, e.g. in
@@ -248,7 +232,7 @@ void CFLSteensAAResult::scan(Function *Fn) {
auto FunInfo = buildSetsFrom(Fn);
Cache[Fn] = std::move(FunInfo);
- Handles.push_front(FunctionHandle(Fn, this));
+ Handles.emplace_front(Fn, this);
}
void CFLSteensAAResult::evict(Function *Fn) { Cache.erase(Fn); }
@@ -284,9 +268,9 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA,
return NoAlias;
Function *Fn = nullptr;
- auto MaybeFnA = parentFunctionOfValue(ValA);
- auto MaybeFnB = parentFunctionOfValue(ValB);
- if (!MaybeFnA.hasValue() && !MaybeFnB.hasValue()) {
+ Function *MaybeFnA = const_cast<Function *>(parentFunctionOfValue(ValA));
+ Function *MaybeFnB = const_cast<Function *>(parentFunctionOfValue(ValB));
+ if (!MaybeFnA && !MaybeFnB) {
// The only times this is known to happen are when globals + InlineAsm are
// involved
DEBUG(dbgs()
@@ -294,12 +278,12 @@ AliasResult CFLSteensAAResult::query(const MemoryLocation &LocA,
return MayAlias;
}
- if (MaybeFnA.hasValue()) {
- Fn = *MaybeFnA;
- assert((!MaybeFnB.hasValue() || *MaybeFnB == *MaybeFnA) &&
+ if (MaybeFnA) {
+ Fn = MaybeFnA;
+ assert((!MaybeFnB || MaybeFnB == MaybeFnA) &&
"Interprocedural queries not supported");
} else {
- Fn = *MaybeFnB;
+ Fn = MaybeFnB;
}
assert(Fn != nullptr);
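The hunks above drop the local Optional<Function *> helper and call a pointer-returning parentFunctionOfValue instead, with nullptr now standing in for "no parent function". A hedged sketch of what that replacement presumably looks like (the real declaration lives outside this file):

    #include "llvm/IR/Argument.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    // Sketch only: nullptr replaces the old Optional's None for values such
    // as globals and inline asm that have no enclosing function.
    static const Function *parentFunctionOfValueSketch(const Value *Val) {
      if (const auto *Inst = dyn_cast<Instruction>(Val))
        return Inst->getParent()->getParent(); // BasicBlock -> Function
      if (const auto *Arg = dyn_cast<Argument>(Val))
        return Arg->getParent();
      return nullptr;
    }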
diff --git a/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp b/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp
index 054bdc4..74b5d79 100644
--- a/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp
+++ b/contrib/llvm/lib/Analysis/CGSCCPassManager.cpp
@@ -117,6 +117,7 @@ bool CGSCCAnalysisManagerModuleProxy::Result::invalidate(
PA.allAnalysesInSetPreserved<AllAnalysesOn<LazyCallGraph::SCC>>();
// Ok, we have a graph, so we can propagate the invalidation down into it.
+ G->buildRefSCCs();
for (auto &RC : G->postorder_ref_sccs())
for (auto &C : RC) {
Optional<PreservedAnalyses> InnerPA;
@@ -195,18 +196,117 @@ FunctionAnalysisManagerCGSCCProxy::run(LazyCallGraph::SCC &C,
bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
LazyCallGraph::SCC &C, const PreservedAnalyses &PA,
CGSCCAnalysisManager::Invalidator &Inv) {
-  for (LazyCallGraph::Node &N : C)
-    FAM->invalidate(N.getFunction(), PA);
-  // This proxy doesn't need to handle invalidation itself. Instead, the
-  // module-level CGSCC proxy handles it above by ensuring that if the
-  // module-level FAM proxy becomes invalid the entire SCC layer, which
-  // includes this proxy, is cleared.
+  // If literally everything is preserved, we're done.
+  if (PA.areAllPreserved())
+    return false; // This is still a valid proxy.
+
+  // If this proxy isn't marked as preserved, then even if the result remains
+  // valid, the key itself may no longer be valid, so we clear everything.
+  //
+  // Note that in order to preserve this proxy, a module pass must ensure that
+  // the FAM has been completely updated to handle the deletion of functions.
+  // Specifically, any FAM-cached results for those functions need to have been
+  // forcibly cleared. When preserved, this proxy will only invalidate results
+  // cached on functions *still in the module* at the end of the module pass.
+  auto PAC = PA.getChecker<FunctionAnalysisManagerCGSCCProxy>();
+  if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<LazyCallGraph::SCC>>()) {
+    for (LazyCallGraph::Node &N : C)
+      FAM->clear(N.getFunction());
+    return true;
+  }
+
+ // Directly check if the relevant set is preserved.
+ bool AreFunctionAnalysesPreserved =
+ PA.allAnalysesInSetPreserved<AllAnalysesOn<Function>>();
+
+ // Now walk all the functions to see if any inner analysis invalidation is
+ // necessary.
+ for (LazyCallGraph::Node &N : C) {
+ Function &F = N.getFunction();
+ Optional<PreservedAnalyses> FunctionPA;
+
+ // Check to see whether the preserved set needs to be pruned based on
+ // SCC-level analysis invalidation that triggers deferred invalidation
+ // registered with the outer analysis manager proxy for this function.
+ if (auto *OuterProxy =
+ FAM->getCachedResult<CGSCCAnalysisManagerFunctionProxy>(F))
+ for (const auto &OuterInvalidationPair :
+ OuterProxy->getOuterInvalidations()) {
+ AnalysisKey *OuterAnalysisID = OuterInvalidationPair.first;
+ const auto &InnerAnalysisIDs = OuterInvalidationPair.second;
+ if (Inv.invalidate(OuterAnalysisID, C, PA)) {
+ if (!FunctionPA)
+ FunctionPA = PA;
+ for (AnalysisKey *InnerAnalysisID : InnerAnalysisIDs)
+ FunctionPA->abandon(InnerAnalysisID);
+ }
+ }
+
+ // Check if we needed a custom PA set, and if so we'll need to run the
+ // inner invalidation.
+ if (FunctionPA) {
+ FAM->invalidate(F, *FunctionPA);
+ continue;
+ }
+
+ // Otherwise we only need to do invalidation if the original PA set didn't
+ // preserve all function analyses.
+ if (!AreFunctionAnalysesPreserved)
+ FAM->invalidate(F, PA);
+ }
+
+ // Return false to indicate that this result is still a valid proxy.
return false;
}
} // End llvm namespace
+/// When a new SCC is created for the graph and there might be function
+/// analysis results cached for the functions now in that SCC two forms of
+/// updates are required.
+///
+/// First, a proxy from the SCC to the FunctionAnalysisManager needs to be
+/// created so that any subsequent invalidation events to the SCC are
+/// propagated to the function analysis results cached for functions within it.
+///
+/// Second, if any of the functions within the SCC have analysis results with
+/// outer analysis dependencies, then those dependencies would point to the
+/// *wrong* SCC's analysis result. We forcibly invalidate the necessary
+/// function analyses so that they don't retain stale handles.
+static void updateNewSCCFunctionAnalyses(LazyCallGraph::SCC &C,
+ LazyCallGraph &G,
+ CGSCCAnalysisManager &AM) {
+ // Get the relevant function analysis manager.
+ auto &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, G).getManager();
+
+ // Now walk the functions in this SCC and invalidate any function analysis
+ // results that might have outer dependencies on an SCC analysis.
+ for (LazyCallGraph::Node &N : C) {
+ Function &F = N.getFunction();
+
+ auto *OuterProxy =
+ FAM.getCachedResult<CGSCCAnalysisManagerFunctionProxy>(F);
+ if (!OuterProxy)
+ // No outer analyses were queried, nothing to do.
+ continue;
+
+ // Forcibly abandon all the inner analyses with dependencies, but
+ // invalidate nothing else.
+ auto PA = PreservedAnalyses::all();
+ for (const auto &OuterInvalidationPair :
+ OuterProxy->getOuterInvalidations()) {
+ const auto &InnerAnalysisIDs = OuterInvalidationPair.second;
+ for (AnalysisKey *InnerAnalysisID : InnerAnalysisIDs)
+ PA.abandon(InnerAnalysisID);
+ }
+
+ // Now invalidate anything we found.
+ FAM.invalidate(F, PA);
+ }
+}
+
namespace {
/// Helper function to update both the \c CGSCCAnalysisManager \p AM and the \c
/// CGSCCPassManager's \c CGSCCUpdateResult \p UR based on a range of newly
@@ -235,7 +335,6 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
dbgs() << "Enqueuing the existing SCC in the worklist:" << *C << "\n";
SCC *OldC = C;
- (void)OldC;
// Update the current SCC. Note that if we have new SCCs, this must actually
// change the SCC.
@@ -244,6 +343,26 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
C = &*NewSCCRange.begin();
assert(G.lookupSCC(N) == C && "Failed to update current SCC!");
+ // If we had a cached FAM proxy originally, we will want to create more of
+ // them for each SCC that was split off.
+ bool NeedFAMProxy =
+ AM.getCachedResult<FunctionAnalysisManagerCGSCCProxy>(*OldC) != nullptr;
+
+ // We need to propagate an invalidation call to all but the newly current SCC
+ // because the outer pass manager won't do that for us after splitting them.
+ // FIXME: We should accept a PreservedAnalysis from the CG updater so that if
+  // there are preserved analyses we can avoid invalidating them here for
+ // split-off SCCs.
+ // We know however that this will preserve any FAM proxy so go ahead and mark
+ // that.
+ PreservedAnalyses PA;
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ AM.invalidate(*OldC, PA);
+
+ // Ensure the now-current SCC's function analyses are updated.
+ if (NeedFAMProxy)
+ updateNewSCCFunctionAnalyses(*C, G, AM);
+
for (SCC &NewC :
reverse(make_range(std::next(NewSCCRange.begin()), NewSCCRange.end()))) {
assert(C != &NewC && "No need to re-visit the current SCC!");
@@ -251,6 +370,14 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
UR.CWorklist.insert(&NewC);
if (DebugLogging)
dbgs() << "Enqueuing a newly formed SCC:" << NewC << "\n";
+
+ // Ensure new SCCs' function analyses are updated.
+ if (NeedFAMProxy)
+ updateNewSCCFunctionAnalyses(NewC, G, AM);
+
+ // Also propagate a normal invalidation to the new SCC as only the current
+ // will get one from the pass manager infrastructure.
+ AM.invalidate(NewC, PA);
}
return C;
}
@@ -273,9 +400,9 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// demoted edges.
SmallVector<Constant *, 16> Worklist;
SmallPtrSet<Constant *, 16> Visited;
- SmallPtrSet<Function *, 16> RetainedEdges;
- SmallSetVector<Function *, 4> PromotedRefTargets;
- SmallSetVector<Function *, 4> DemotedCallTargets;
+ SmallPtrSet<Node *, 16> RetainedEdges;
+ SmallSetVector<Node *, 4> PromotedRefTargets;
+ SmallSetVector<Node *, 4> DemotedCallTargets;
// First walk the function and handle all called functions. We do this first
// because if there is a single call edge, whether there are ref edges is
@@ -284,7 +411,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
if (auto CS = CallSite(&I))
if (Function *Callee = CS.getCalledFunction())
if (Visited.insert(Callee).second && !Callee->isDeclaration()) {
- const Edge *E = N.lookup(*Callee);
+ Node &CalleeN = *G.lookup(*Callee);
+ Edge *E = N->lookup(CalleeN);
// FIXME: We should really handle adding new calls. While it will
// make downstream usage more complex, there is no fundamental
// limitation and it will allow passes within the CGSCC to be a bit
@@ -293,9 +421,9 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
assert(E && "No function transformations should introduce *new* "
"call edges! Any new calls should be modeled as "
"promoted existing ref edges!");
- RetainedEdges.insert(Callee);
+ RetainedEdges.insert(&CalleeN);
if (!E->isCall())
- PromotedRefTargets.insert(Callee);
+ PromotedRefTargets.insert(&CalleeN);
}
// Now walk all references.
@@ -305,25 +433,31 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
if (Visited.insert(C).second)
Worklist.push_back(C);
- LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) {
- const Edge *E = N.lookup(Referee);
+ auto VisitRef = [&](Function &Referee) {
+ Node &RefereeN = *G.lookup(Referee);
+ Edge *E = N->lookup(RefereeN);
// FIXME: Similarly to new calls, we also currently preclude
// introducing new references. See above for details.
assert(E && "No function transformations should introduce *new* ref "
"edges! Any new ref edges would require IPO which "
"function passes aren't allowed to do!");
- RetainedEdges.insert(&Referee);
+ RetainedEdges.insert(&RefereeN);
if (E->isCall())
- DemotedCallTargets.insert(&Referee);
- });
+ DemotedCallTargets.insert(&RefereeN);
+ };
+ LazyCallGraph::visitReferences(Worklist, Visited, VisitRef);
+
+ // Include synthetic reference edges to known, defined lib functions.
+ for (auto *F : G.getLibFunctions())
+ VisitRef(*F);
// First remove all of the edges that are no longer present in this function.
// We have to build a list of dead targets first and then remove them as the
// data structures will all be invalidated by removing them.
SmallVector<PointerIntPair<Node *, 1, Edge::Kind>, 4> DeadTargets;
- for (Edge &E : N)
- if (!RetainedEdges.count(&E.getFunction()))
- DeadTargets.push_back({E.getNode(), E.getKind()});
+ for (Edge &E : *N)
+ if (!RetainedEdges.count(&E.getNode()))
+ DeadTargets.push_back({&E.getNode(), E.getKind()});
for (auto DeadTarget : DeadTargets) {
Node &TargetN = *DeadTarget.getPointer();
bool IsCall = DeadTarget.getInt() == Edge::Call;
@@ -346,14 +480,6 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// For separate SCCs this is trivial.
RC->switchTrivialInternalEdgeToRef(N, TargetN);
} else {
- // Otherwise we may end up re-structuring the call graph. First,
- // invalidate any SCC analyses. We have to do this before we split
- // functions into new SCCs and lose track of where their analyses are
- // cached.
- // FIXME: We should accept a more precise preserved set here. For
- // example, it might be possible to preserve some function analyses
- // even as the SCC structure is changed.
- AM.invalidate(*C, PreservedAnalyses::none());
// Now update the call graph.
C = incorporateNewSCCRange(RC->switchInternalEdgeToRef(N, TargetN), G,
N, C, AM, UR, DebugLogging);
@@ -397,9 +523,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// Next demote all the call edges that are now ref edges. This helps make
// the SCCs small which should minimize the work below as we don't want to
// form cycles that this would break.
- for (Function *RefTarget : DemotedCallTargets) {
- Node &TargetN = *G.lookup(*RefTarget);
- SCC &TargetC = *G.lookupSCC(TargetN);
+ for (Node *RefTarget : DemotedCallTargets) {
+ SCC &TargetC = *G.lookupSCC(*RefTarget);
RefSCC &TargetRC = TargetC.getOuterRefSCC();
// The easy case is when the target RefSCC is not this RefSCC. This is
@@ -407,10 +532,10 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
if (&TargetRC != RC) {
assert(RC->isAncestorOf(TargetRC) &&
"Cannot potentially form RefSCC cycles here!");
- RC->switchOutgoingEdgeToRef(N, TargetN);
+ RC->switchOutgoingEdgeToRef(N, *RefTarget);
if (DebugLogging)
dbgs() << "Switch outgoing call edge to a ref edge from '" << N
- << "' to '" << TargetN << "'\n";
+ << "' to '" << *RefTarget << "'\n";
continue;
}
@@ -418,26 +543,18 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// some SCCs.
if (C != &TargetC) {
// For separate SCCs this is trivial.
- RC->switchTrivialInternalEdgeToRef(N, TargetN);
+ RC->switchTrivialInternalEdgeToRef(N, *RefTarget);
continue;
}
- // Otherwise we may end up re-structuring the call graph. First, invalidate
- // any SCC analyses. We have to do this before we split functions into new
- // SCCs and lose track of where their analyses are cached.
- // FIXME: We should accept a more precise preserved set here. For example,
- // it might be possible to preserve some function analyses even as the SCC
- // structure is changed.
- AM.invalidate(*C, PreservedAnalyses::none());
// Now update the call graph.
- C = incorporateNewSCCRange(RC->switchInternalEdgeToRef(N, TargetN), G,
- N, C, AM, UR, DebugLogging);
+ C = incorporateNewSCCRange(RC->switchInternalEdgeToRef(N, *RefTarget), G, N,
+ C, AM, UR, DebugLogging);
}
// Now promote ref edges into call edges.
- for (Function *CallTarget : PromotedRefTargets) {
- Node &TargetN = *G.lookup(*CallTarget);
- SCC &TargetC = *G.lookupSCC(TargetN);
+ for (Node *CallTarget : PromotedRefTargets) {
+ SCC &TargetC = *G.lookupSCC(*CallTarget);
RefSCC &TargetRC = TargetC.getOuterRefSCC();
// The easy case is when the target RefSCC is not this RefSCC. This is
@@ -445,38 +562,61 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
if (&TargetRC != RC) {
assert(RC->isAncestorOf(TargetRC) &&
"Cannot potentially form RefSCC cycles here!");
- RC->switchOutgoingEdgeToCall(N, TargetN);
+ RC->switchOutgoingEdgeToCall(N, *CallTarget);
if (DebugLogging)
dbgs() << "Switch outgoing ref edge to a call edge from '" << N
- << "' to '" << TargetN << "'\n";
+ << "' to '" << *CallTarget << "'\n";
continue;
}
if (DebugLogging)
dbgs() << "Switch an internal ref edge to a call edge from '" << N
- << "' to '" << TargetN << "'\n";
+ << "' to '" << *CallTarget << "'\n";
// Otherwise we are switching an internal ref edge to a call edge. This
// may merge away some SCCs, and we add those to the UpdateResult. We also
// need to make sure to update the worklist in the event SCCs have moved
- // before the current one in the post-order sequence.
+  // before the current one in the post-order sequence.
+ bool HasFunctionAnalysisProxy = false;
auto InitialSCCIndex = RC->find(*C) - RC->begin();
- auto InvalidatedSCCs = RC->switchInternalEdgeToCall(N, TargetN);
- if (!InvalidatedSCCs.empty()) {
+ bool FormedCycle = RC->switchInternalEdgeToCall(
+ N, *CallTarget, [&](ArrayRef<SCC *> MergedSCCs) {
+ for (SCC *MergedC : MergedSCCs) {
+ assert(MergedC != &TargetC && "Cannot merge away the target SCC!");
+
+ HasFunctionAnalysisProxy |=
+ AM.getCachedResult<FunctionAnalysisManagerCGSCCProxy>(
+ *MergedC) != nullptr;
+
+ // Mark that this SCC will no longer be valid.
+ UR.InvalidatedSCCs.insert(MergedC);
+
+ // FIXME: We should really do a 'clear' here to forcibly release
+ // memory, but we don't have a good way of doing that and
+ // preserving the function analyses.
+ auto PA = PreservedAnalyses::allInSet<AllAnalysesOn<Function>>();
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ AM.invalidate(*MergedC, PA);
+ }
+ });
+
+ // If we formed a cycle by creating this call, we need to update more data
+ // structures.
+ if (FormedCycle) {
C = &TargetC;
assert(G.lookupSCC(N) == C && "Failed to update current SCC!");
-      // Any analyses cached for this SCC are no longer precise as the shape
-      // has changed by introducing this cycle.
-      AM.invalidate(*C, PreservedAnalyses::none());
-
-      for (SCC *InvalidatedC : InvalidatedSCCs) {
-        assert(InvalidatedC != C && "Cannot invalidate the current SCC!");
-        UR.InvalidatedSCCs.insert(InvalidatedC);
-      // Also clear any cached analyses for the SCCs that are dead. This
-      // isn't really necessary for correctness but can release memory.
-      AM.clear(*InvalidatedC);
-      }
+      // If one of the invalidated SCCs had a cached proxy to a function
+      // analysis manager, we need to create a proxy in the new current SCC as
+      // the invalidated SCCs had their functions moved.
+      if (HasFunctionAnalysisProxy)
+        AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, G);
+      // Any analyses cached for this SCC are no longer precise as the shape
+      // has changed by introducing this cycle. However, we have taken care to
+      // update the proxies so it remains valid.
+      auto PA = PreservedAnalyses::allInSet<AllAnalysesOn<Function>>();
+      PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+      AM.invalidate(*C, PA);
}
auto NewSCCIndex = RC->find(*C) - RC->begin();
if (InitialSCCIndex < NewSCCIndex) {
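A recurring pattern in this file's changes is invalidating SCC-level results while keeping cached function analyses (and the proxy that keys them) alive. A hedged sketch of that pattern in isolation:

    #include "llvm/Analysis/CGSCCPassManager.h"
    #include "llvm/IR/PassManager.h"

    using namespace llvm;

    // Sketch: everything in the function-analysis set survives, plus the
    // proxy itself; all SCC-level analyses are dropped when this is applied.
    static PreservedAnalyses preserveFunctionAnalysesSketch() {
      auto PA = PreservedAnalyses::allInSet<AllAnalysesOn<Function>>();
      PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
      return PA;
    }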
diff --git a/contrib/llvm/lib/Analysis/CallGraph.cpp b/contrib/llvm/lib/Analysis/CallGraph.cpp
index 458b7bf..ff5242f 100644
--- a/contrib/llvm/lib/Analysis/CallGraph.cpp
+++ b/contrib/llvm/lib/Analysis/CallGraph.cpp
@@ -21,23 +21,18 @@ using namespace llvm;
//
CallGraph::CallGraph(Module &M)
- : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)),
+ : M(M), ExternalCallingNode(getOrInsertFunction(nullptr)),
CallsExternalNode(llvm::make_unique<CallGraphNode>(nullptr)) {
// Add every function to the call graph.
for (Function &F : M)
addToCallGraph(&F);
-
- // If we didn't find a main function, use the external call graph node
- if (!Root)
- Root = ExternalCallingNode;
}
CallGraph::CallGraph(CallGraph &&Arg)
- : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root),
+ : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)),
ExternalCallingNode(Arg.ExternalCallingNode),
CallsExternalNode(std::move(Arg.CallsExternalNode)) {
Arg.FunctionMap.clear();
- Arg.Root = nullptr;
Arg.ExternalCallingNode = nullptr;
}
@@ -57,21 +52,9 @@ CallGraph::~CallGraph() {
void CallGraph::addToCallGraph(Function *F) {
CallGraphNode *Node = getOrInsertFunction(F);
- // If this function has external linkage, anything could call it.
- if (!F->hasLocalLinkage()) {
- ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
- // Found the entry point?
- if (F->getName() == "main") {
- if (Root) // Found multiple external mains? Don't pick one.
- Root = ExternalCallingNode;
- else
- Root = Node; // Found a main, keep track of it!
- }
- }
-
- // If this function has its address taken, anything could call it.
- if (F->hasAddressTaken())
+ // If this function has external linkage or has its address taken, anything
+ // could call it.
+ if (!F->hasLocalLinkage() || F->hasAddressTaken())
ExternalCallingNode->addCalledFunction(CallSite(), Node);
// If this function is not defined in this translation unit, it could call
@@ -96,13 +79,6 @@ void CallGraph::addToCallGraph(Function *F) {
}
void CallGraph::print(raw_ostream &OS) const {
- OS << "CallGraph Root is: ";
- if (Function *F = Root->getFunction())
- OS << F->getName() << "\n";
- else {
- OS << "<<null function: 0x" << Root << ">>\n";
- }
-
// Print in a deterministic order by sorting CallGraphNodes by name. We do
// this here to avoid slowing down the non-printing fast path.
@@ -125,8 +101,9 @@ void CallGraph::print(raw_ostream &OS) const {
CN->print(OS);
}
-LLVM_DUMP_METHOD
-void CallGraph::dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void CallGraph::dump() const { print(dbgs()); }
+#endif
// removeFunctionFromModule - Unlink the function from this module, returning
// it. Because this removes the function from the module, the call graph node
@@ -194,8 +171,9 @@ void CallGraphNode::print(raw_ostream &OS) const {
OS << '\n';
}
-LLVM_DUMP_METHOD
-void CallGraphNode::dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void CallGraphNode::dump() const { print(dbgs()); }
+#endif
/// removeCallEdgeFor - This method removes the edge in the node for the
/// specified call site. Note that this method takes linear time, so it
@@ -307,8 +285,10 @@ void CallGraphWrapperPass::print(raw_ostream &OS, const Module *) const {
G->print(OS);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void CallGraphWrapperPass::dump() const { print(dbgs(), nullptr); }
+#endif
namespace {
struct CallGraphPrinterLegacyPass : public ModulePass {
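The dump() definitions above are now compiled only in builds where they can actually be called. A hedged sketch of the guard pattern with a hypothetical analysis type:

    #include "llvm/Support/Compiler.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    // "ToyAnalysis" is hypothetical; the pattern keeps dump() out of release
    // builds unless LLVM_ENABLE_DUMP is explicitly defined.
    struct ToyAnalysis {
      void print(llvm::raw_ostream &OS) const { OS << "toy state\n"; }
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      LLVM_DUMP_METHOD void dump() const { print(llvm::dbgs()); }
    #endif
    };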
diff --git a/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp
index 9cef781..facda24 100644
--- a/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp
+++ b/contrib/llvm/lib/Analysis/CallGraphSCCPass.cpp
@@ -204,7 +204,7 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
// Get the set of call sites currently in the function.
for (CallGraphNode::iterator I = CGN->begin(), E = CGN->end(); I != E; ) {
// If this call site is null, then the function pass deleted the call
- // entirely and the WeakVH nulled it out.
+ // entirely and the WeakTrackingVH nulled it out.
if (!I->first ||
// If we've already seen this call site, then the FunctionPass RAUW'd
// one call with another, which resulted in two "uses" in the edge
@@ -347,7 +347,8 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
DevirtualizedCall = true;
// After scanning this function, if we still have entries in callsites, then
- // they are dangling pointers. WeakVH should save us for this, so abort if
+  // they are dangling pointers. WeakTrackingVH should save us from this, so
+ // abort if
// this happens.
assert(CallSites.empty() && "Dangling pointers found in call sites map");
@@ -476,10 +477,8 @@ bool CGPassManager::runOnModule(Module &M) {
if (DevirtualizedCall)
DEBUG(dbgs() << " CGSCCPASSMGR: Stopped iteration after " << Iteration
<< " times, due to -max-cg-scc-iterations\n");
-
- if (Iteration > MaxSCCIterations)
- MaxSCCIterations = Iteration;
-
+
+ MaxSCCIterations.updateMax(Iteration);
}
Changed |= doFinalization(CG);
return Changed;
@@ -609,16 +608,28 @@ namespace {
}
bool runOnSCC(CallGraphSCC &SCC) override {
- Out << Banner;
+ bool BannerPrinted = false;
+ auto PrintBannerOnce = [&] () {
+ if (BannerPrinted)
+ return;
+ Out << Banner;
+ BannerPrinted = true;
+ };
for (CallGraphNode *CGN : SCC) {
- if (CGN->getFunction()) {
- if (isFunctionInPrintList(CGN->getFunction()->getName()))
- CGN->getFunction()->print(Out);
- } else
+ if (Function *F = CGN->getFunction()) {
+ if (!F->isDeclaration() && isFunctionInPrintList(F->getName())) {
+ PrintBannerOnce();
+ F->print(Out);
+ }
+ } else if (llvm::isFunctionInPrintList("*")) {
+ PrintBannerOnce();
Out << "\nPrinting <null> Function\n";
+ }
}
return false;
}
+
+ StringRef getPassName() const override { return "Print CallGraph IR"; }
};
} // end anonymous namespace.
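The runOnSCC change above defers the banner so nothing prints when every function in the SCC is filtered out. A standalone sketch of the print-once idiom with plain iostreams:

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Printable = {"foo", "bar"};
      bool BannerPrinted = false;
      auto PrintBannerOnce = [&] {
        if (BannerPrinted)
          return;
        std::cout << "*** IR Dump ***\n";
        BannerPrinted = true;
      };
      for (const auto &Name : Printable) {
        PrintBannerOnce(); // emitted once, and only if something prints
        std::cout << Name << '\n';
      }
    }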
diff --git a/contrib/llvm/lib/Analysis/CallPrinter.cpp b/contrib/llvm/lib/Analysis/CallPrinter.cpp
index af942e9..e7017e7 100644
--- a/contrib/llvm/lib/Analysis/CallPrinter.cpp
+++ b/contrib/llvm/lib/Analysis/CallPrinter.cpp
@@ -14,8 +14,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallPrinter.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/DOTGraphTraitsPass.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Analysis/CaptureTracking.cpp b/contrib/llvm/lib/Analysis/CaptureTracking.cpp
index 9862c3c..3b0026b 100644
--- a/contrib/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/contrib/llvm/lib/Analysis/CaptureTracking.cpp
@@ -16,11 +16,11 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -94,8 +94,8 @@ namespace {
// guarantee that 'I' never reaches 'BeforeHere' through a back-edge or
// by its successors, i.e, prune if:
//
- // (1) BB is an entry block or have no sucessors.
- // (2) There's no path coming back through BB sucessors.
+  // (1) BB is an entry block or has no successors.
+ // (2) There's no path coming back through BB successors.
if (BB == &BB->getParent()->getEntryBlock() ||
!BB->getTerminator()->getNumSuccessors())
return true;
diff --git a/contrib/llvm/lib/Analysis/CodeMetrics.cpp b/contrib/llvm/lib/Analysis/CodeMetrics.cpp
index bdffdd8..e4d9292 100644
--- a/contrib/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/contrib/llvm/lib/Analysis/CodeMetrics.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
diff --git a/contrib/llvm/lib/Analysis/ConstantFolding.cpp b/contrib/llvm/lib/Analysis/ConstantFolding.cpp
index 7386727..0f5ec3f 100644
--- a/contrib/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/contrib/llvm/lib/Analysis/ConstantFolding.cpp
@@ -22,8 +22,8 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/config.h"
@@ -42,6 +42,7 @@
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cerrno>
@@ -686,25 +687,21 @@ Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1,
// bits.
if (Opc == Instruction::And) {
- unsigned BitWidth = DL.getTypeSizeInBits(Op0->getType()->getScalarType());
- APInt KnownZero0(BitWidth, 0), KnownOne0(BitWidth, 0);
- APInt KnownZero1(BitWidth, 0), KnownOne1(BitWidth, 0);
- computeKnownBits(Op0, KnownZero0, KnownOne0, DL);
- computeKnownBits(Op1, KnownZero1, KnownOne1, DL);
- if ((KnownOne1 | KnownZero0).isAllOnesValue()) {
+ KnownBits Known0 = computeKnownBits(Op0, DL);
+ KnownBits Known1 = computeKnownBits(Op1, DL);
+ if ((Known1.One | Known0.Zero).isAllOnesValue()) {
// All the bits of Op0 that the 'and' could be masking are already zero.
return Op0;
}
- if ((KnownOne0 | KnownZero1).isAllOnesValue()) {
+ if ((Known0.One | Known1.Zero).isAllOnesValue()) {
// All the bits of Op1 that the 'and' could be masking are already zero.
return Op1;
}
- APInt KnownZero = KnownZero0 | KnownZero1;
- APInt KnownOne = KnownOne0 & KnownOne1;
- if ((KnownZero | KnownOne).isAllOnesValue()) {
- return ConstantInt::get(Op0->getType(), KnownOne);
- }
+ Known0.Zero |= Known1.Zero;
+ Known0.One &= Known1.One;
+ if (Known0.isConstant())
+ return ConstantInt::get(Op0->getType(), Known0.getConstant());
}
// If the constant expr is something like &A[123] - &A[4].f, fold this into a
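The KnownBits rewrite above keeps the same folding logic: an 'and' is a no-op on one operand when every bit the mask could clear is already known zero. A hedged helper expressing just that test:

    #include "llvm/Support/KnownBits.h"

    using namespace llvm;

    // Sketch: a bit survives the mask where Op1 is known one, and is
    // irrelevant where Op0 is known zero; if that covers the whole width,
    // Op0 passes through the 'and' unchanged.
    static bool andIsNoOpOnLHS(const KnownBits &Known0,
                               const KnownBits &Known1) {
      return (Known1.One | Known0.Zero).isAllOnesValue();
    }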
@@ -1018,9 +1015,11 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode,
case Instruction::ICmp:
case Instruction::FCmp: llvm_unreachable("Invalid for compares");
case Instruction::Call:
- if (auto *F = dyn_cast<Function>(Ops.back()))
- if (canConstantFoldCallTo(F))
- return ConstantFoldCall(F, Ops.slice(0, Ops.size() - 1), TLI);
+ if (auto *F = dyn_cast<Function>(Ops.back())) {
+ ImmutableCallSite CS(cast<CallInst>(InstOrCE));
+ if (canConstantFoldCallTo(CS, F))
+ return ConstantFoldCall(CS, F, Ops.slice(0, Ops.size() - 1), TLI);
+ }
return nullptr;
case Instruction::Select:
return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
@@ -1058,8 +1057,8 @@ ConstantFoldConstantImpl(const Constant *C, const DataLayout &DL,
if (It == FoldedOps.end()) {
if (auto *FoldedC =
ConstantFoldConstantImpl(NewC, DL, TLI, FoldedOps)) {
- NewC = FoldedC;
FoldedOps.insert({NewC, FoldedC});
+ NewC = FoldedC;
} else {
FoldedOps.insert({NewC, NewC});
}
@@ -1173,7 +1172,9 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
// fold: icmp (inttoptr x), null -> icmp x, 0
+ // fold: icmp null, (inttoptr x) -> icmp 0, x
// fold: icmp (ptrtoint x), 0 -> icmp x, null
+ // fold: icmp 0, (ptrtoint x) -> icmp null, x
// fold: icmp (inttoptr x), (inttoptr y) -> icmp trunc/zext x, trunc/zext y
// fold: icmp (ptrtoint x), (ptrtoint y) -> icmp x, y
//
@@ -1243,6 +1244,11 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
return ConstantFoldBinaryOpOperands(OpC, LHS, RHS, DL);
}
+ } else if (isa<ConstantExpr>(Ops1)) {
+ // If RHS is a constant expression, but the left side isn't, swap the
+ // operands and try again.
+ Predicate = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)Predicate);
+ return ConstantFoldCompareInstOperands(Predicate, Ops1, Ops0, DL, TLI);
}
return ConstantExpr::getCompare(Predicate, Ops0, Ops1);
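The new else-if above canonicalizes comparisons where only the right operand is a constant expression by swapping operands, which requires flipping the predicate. A minimal sketch of the flip:

    #include "llvm/IR/InstrTypes.h"

    using namespace llvm;

    // Sketch: icmp ult 0, (ptrtoint x) re-queries as icmp ugt (ptrtoint x), 0.
    static CmpInst::Predicate swappedForRequery(CmpInst::Predicate P) {
      return CmpInst::getSwappedPredicate(P); // ULT <-> UGT, SLE <-> SGE, ...
    }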
@@ -1352,7 +1358,9 @@ llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
// Constant Folding for Calls
//
-bool llvm::canConstantFoldCallTo(const Function *F) {
+bool llvm::canConstantFoldCallTo(ImmutableCallSite CS, const Function *F) {
+ if (CS.isNoBuiltin())
+ return false;
switch (F->getIntrinsicID()) {
case Intrinsic::fabs:
case Intrinsic::minnum:
@@ -1401,7 +1409,7 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
return true;
default:
return false;
- case 0: break;
+ case Intrinsic::not_intrinsic: break;
}
if (!F->hasName())
@@ -1438,6 +1446,36 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
Name == "sinf" || Name == "sinhf" || Name == "sqrtf";
case 't':
return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf";
+ case '_':
+
+ // Check for various function names that get used for the math functions
+ // when the header files are preprocessed with the macro
+ // __FINITE_MATH_ONLY__ enabled.
+ // The '12' here is the length of the shortest name that can match.
+ // We need to check the size before looking at Name[1] and Name[2]
+ // so we may as well check a limit that will eliminate mismatches.
+ if (Name.size() < 12 || Name[1] != '_')
+ return false;
+ switch (Name[2]) {
+ default:
+ return false;
+ case 'a':
+ return Name == "__acos_finite" || Name == "__acosf_finite" ||
+ Name == "__asin_finite" || Name == "__asinf_finite" ||
+ Name == "__atan2_finite" || Name == "__atan2f_finite";
+ case 'c':
+ return Name == "__cosh_finite" || Name == "__coshf_finite";
+ case 'e':
+ return Name == "__exp_finite" || Name == "__expf_finite" ||
+ Name == "__exp2_finite" || Name == "__exp2f_finite";
+ case 'l':
+ return Name == "__log_finite" || Name == "__logf_finite" ||
+ Name == "__log10_finite" || Name == "__log10f_finite";
+ case 'p':
+ return Name == "__pow_finite" || Name == "__powf_finite";
+ case 's':
+ return Name == "__sinh_finite" || Name == "__sinhf_finite";
+ }
}
}
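For context on the new '_' case: glibc's math headers, when preprocessed with __FINITE_MATH_ONLY__ (as -ffast-math arranges), may redirect libm calls to the __*_finite entry points, so the folder has to recognize those spellings too. A hedged illustration:

    // Hedged example: with glibc and -ffast-math, the call below may be
    // emitted as __exp_finite(X) rather than exp(X); the exact behavior
    // depends on the libc version and compile flags.
    #include <math.h>

    double callExp(double X) { return exp(X); }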
@@ -1518,9 +1556,9 @@ Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
bool isExact = false;
APFloat::roundingMode mode = roundTowardZero? APFloat::rmTowardZero
: APFloat::rmNearestTiesToEven;
- APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
- /*isSigned=*/true, mode,
- &isExact);
+ APFloat::opStatus status =
+ Val.convertToInteger(makeMutableArrayRef(UIntVal), ResultWidth,
+ /*isSigned=*/true, mode, &isExact);
if (status != APFloat::opOK &&
(!roundTowardZero || status != APFloat::opInexact))
return nullptr;
@@ -1550,6 +1588,9 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
// cosine(arg) is between -1 and 1. cosine(invalid arg) is NaN
if (IntrinsicID == Intrinsic::cos)
return Constant::getNullValue(Ty);
+ if (IntrinsicID == Intrinsic::bswap ||
+ IntrinsicID == Intrinsic::bitreverse)
+ return Operands[0];
}
if (auto *Op = dyn_cast<ConstantFP>(Operands[0])) {
if (IntrinsicID == Intrinsic::convert_to_fp16) {
@@ -1630,94 +1671,108 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
return ConstantFoldFP(sin, V, Ty);
case Intrinsic::cos:
return ConstantFoldFP(cos, V, Ty);
+ case Intrinsic::sqrt:
+ return ConstantFoldFP(sqrt, V, Ty);
}
if (!TLI)
return nullptr;
- switch (Name[0]) {
+ char NameKeyChar = Name[0];
+ if (Name[0] == '_' && Name.size() > 2 && Name[1] == '_')
+ NameKeyChar = Name[2];
+
+ switch (NameKeyChar) {
case 'a':
- if ((Name == "acos" && TLI->has(LibFunc::acos)) ||
- (Name == "acosf" && TLI->has(LibFunc::acosf)))
+ if ((Name == "acos" && TLI->has(LibFunc_acos)) ||
+ (Name == "acosf" && TLI->has(LibFunc_acosf)) ||
+ (Name == "__acos_finite" && TLI->has(LibFunc_acos_finite)) ||
+ (Name == "__acosf_finite" && TLI->has(LibFunc_acosf_finite)))
return ConstantFoldFP(acos, V, Ty);
- else if ((Name == "asin" && TLI->has(LibFunc::asin)) ||
- (Name == "asinf" && TLI->has(LibFunc::asinf)))
+ else if ((Name == "asin" && TLI->has(LibFunc_asin)) ||
+ (Name == "asinf" && TLI->has(LibFunc_asinf)) ||
+ (Name == "__asin_finite" && TLI->has(LibFunc_asin_finite)) ||
+ (Name == "__asinf_finite" && TLI->has(LibFunc_asinf_finite)))
return ConstantFoldFP(asin, V, Ty);
- else if ((Name == "atan" && TLI->has(LibFunc::atan)) ||
- (Name == "atanf" && TLI->has(LibFunc::atanf)))
+ else if ((Name == "atan" && TLI->has(LibFunc_atan)) ||
+ (Name == "atanf" && TLI->has(LibFunc_atanf)))
return ConstantFoldFP(atan, V, Ty);
break;
case 'c':
- if ((Name == "ceil" && TLI->has(LibFunc::ceil)) ||
- (Name == "ceilf" && TLI->has(LibFunc::ceilf)))
+ if ((Name == "ceil" && TLI->has(LibFunc_ceil)) ||
+ (Name == "ceilf" && TLI->has(LibFunc_ceilf)))
return ConstantFoldFP(ceil, V, Ty);
- else if ((Name == "cos" && TLI->has(LibFunc::cos)) ||
- (Name == "cosf" && TLI->has(LibFunc::cosf)))
+ else if ((Name == "cos" && TLI->has(LibFunc_cos)) ||
+ (Name == "cosf" && TLI->has(LibFunc_cosf)))
return ConstantFoldFP(cos, V, Ty);
- else if ((Name == "cosh" && TLI->has(LibFunc::cosh)) ||
- (Name == "coshf" && TLI->has(LibFunc::coshf)))
+ else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) ||
+ (Name == "coshf" && TLI->has(LibFunc_coshf)) ||
+ (Name == "__cosh_finite" && TLI->has(LibFunc_cosh_finite)) ||
+ (Name == "__coshf_finite" && TLI->has(LibFunc_coshf_finite)))
return ConstantFoldFP(cosh, V, Ty);
break;
case 'e':
- if ((Name == "exp" && TLI->has(LibFunc::exp)) ||
- (Name == "expf" && TLI->has(LibFunc::expf)))
+ if ((Name == "exp" && TLI->has(LibFunc_exp)) ||
+ (Name == "expf" && TLI->has(LibFunc_expf)) ||
+ (Name == "__exp_finite" && TLI->has(LibFunc_exp_finite)) ||
+ (Name == "__expf_finite" && TLI->has(LibFunc_expf_finite)))
return ConstantFoldFP(exp, V, Ty);
- if ((Name == "exp2" && TLI->has(LibFunc::exp2)) ||
- (Name == "exp2f" && TLI->has(LibFunc::exp2f)))
+ if ((Name == "exp2" && TLI->has(LibFunc_exp2)) ||
+ (Name == "exp2f" && TLI->has(LibFunc_exp2f)) ||
+ (Name == "__exp2_finite" && TLI->has(LibFunc_exp2_finite)) ||
+ (Name == "__exp2f_finite" && TLI->has(LibFunc_exp2f_finite)))
// Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
// C99 library.
return ConstantFoldBinaryFP(pow, 2.0, V, Ty);
break;
case 'f':
- if ((Name == "fabs" && TLI->has(LibFunc::fabs)) ||
- (Name == "fabsf" && TLI->has(LibFunc::fabsf)))
+ if ((Name == "fabs" && TLI->has(LibFunc_fabs)) ||
+ (Name == "fabsf" && TLI->has(LibFunc_fabsf)))
return ConstantFoldFP(fabs, V, Ty);
- else if ((Name == "floor" && TLI->has(LibFunc::floor)) ||
- (Name == "floorf" && TLI->has(LibFunc::floorf)))
+ else if ((Name == "floor" && TLI->has(LibFunc_floor)) ||
+ (Name == "floorf" && TLI->has(LibFunc_floorf)))
return ConstantFoldFP(floor, V, Ty);
break;
case 'l':
- if ((Name == "log" && V > 0 && TLI->has(LibFunc::log)) ||
- (Name == "logf" && V > 0 && TLI->has(LibFunc::logf)))
+ if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) ||
+ (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)) ||
+ (Name == "__log_finite" && V > 0 &&
+ TLI->has(LibFunc_log_finite)) ||
+ (Name == "__logf_finite" && V > 0 &&
+ TLI->has(LibFunc_logf_finite)))
return ConstantFoldFP(log, V, Ty);
- else if ((Name == "log10" && V > 0 && TLI->has(LibFunc::log10)) ||
- (Name == "log10f" && V > 0 && TLI->has(LibFunc::log10f)))
+ else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) ||
+ (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)) ||
+ (Name == "__log10_finite" && V > 0 &&
+ TLI->has(LibFunc_log10_finite)) ||
+ (Name == "__log10f_finite" && V > 0 &&
+ TLI->has(LibFunc_log10f_finite)))
return ConstantFoldFP(log10, V, Ty);
- else if (IntrinsicID == Intrinsic::sqrt &&
- (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())) {
- if (V >= -0.0)
- return ConstantFoldFP(sqrt, V, Ty);
- else {
- // Unlike the sqrt definitions in C/C++, POSIX, and IEEE-754 - which
- // all guarantee or favor returning NaN - the square root of a
- // negative number is not defined for the LLVM sqrt intrinsic.
- // This is because the intrinsic should only be emitted in place of
- // libm's sqrt function when using "no-nans-fp-math".
- return UndefValue::get(Ty);
- }
- }
break;
case 'r':
- if ((Name == "round" && TLI->has(LibFunc::round)) ||
- (Name == "roundf" && TLI->has(LibFunc::roundf)))
+ if ((Name == "round" && TLI->has(LibFunc_round)) ||
+ (Name == "roundf" && TLI->has(LibFunc_roundf)))
return ConstantFoldFP(round, V, Ty);
+ break;
case 's':
- if ((Name == "sin" && TLI->has(LibFunc::sin)) ||
- (Name == "sinf" && TLI->has(LibFunc::sinf)))
+ if ((Name == "sin" && TLI->has(LibFunc_sin)) ||
+ (Name == "sinf" && TLI->has(LibFunc_sinf)))
return ConstantFoldFP(sin, V, Ty);
- else if ((Name == "sinh" && TLI->has(LibFunc::sinh)) ||
- (Name == "sinhf" && TLI->has(LibFunc::sinhf)))
+ else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) ||
+ (Name == "sinhf" && TLI->has(LibFunc_sinhf)) ||
+ (Name == "__sinh_finite" && TLI->has(LibFunc_sinh_finite)) ||
+ (Name == "__sinhf_finite" && TLI->has(LibFunc_sinhf_finite)))
return ConstantFoldFP(sinh, V, Ty);
- else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc::sqrt)) ||
- (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc::sqrtf)))
+ else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) ||
+ (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf)))
return ConstantFoldFP(sqrt, V, Ty);
break;
case 't':
- if ((Name == "tan" && TLI->has(LibFunc::tan)) ||
- (Name == "tanf" && TLI->has(LibFunc::tanf)))
+ if ((Name == "tan" && TLI->has(LibFunc_tan)) ||
+ (Name == "tanf" && TLI->has(LibFunc_tanf)))
return ConstantFoldFP(tan, V, Ty);
- else if ((Name == "tanh" && TLI->has(LibFunc::tanh)) ||
- (Name == "tanhf" && TLI->has(LibFunc::tanhf)))
+ else if ((Name == "tanh" && TLI->has(LibFunc_tanh)) ||
+ (Name == "tanhf" && TLI->has(LibFunc_tanhf)))
return ConstantFoldFP(tanh, V, Ty);
break;
default:
@@ -1767,6 +1822,7 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
/*roundTowardZero=*/false, Ty);
+ break;
case Intrinsic::x86_sse_cvttss2si:
case Intrinsic::x86_sse_cvttss2si64:
case Intrinsic::x86_sse2_cvttsd2si:
@@ -1775,15 +1831,10 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
return ConstantFoldSSEConvertToInt(FPOp->getValueAPF(),
/*roundTowardZero=*/true, Ty);
+ break;
}
}
- if (isa<UndefValue>(Operands[0])) {
- if (IntrinsicID == Intrinsic::bswap)
- return Operands[0];
- return nullptr;
- }
-
return nullptr;
}
@@ -1822,14 +1873,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
if (!TLI)
return nullptr;
- if ((Name == "pow" && TLI->has(LibFunc::pow)) ||
- (Name == "powf" && TLI->has(LibFunc::powf)))
+ if ((Name == "pow" && TLI->has(LibFunc_pow)) ||
+ (Name == "powf" && TLI->has(LibFunc_powf)) ||
+ (Name == "__pow_finite" && TLI->has(LibFunc_pow_finite)) ||
+ (Name == "__powf_finite" && TLI->has(LibFunc_powf_finite)))
return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty);
- if ((Name == "fmod" && TLI->has(LibFunc::fmod)) ||
- (Name == "fmodf" && TLI->has(LibFunc::fmodf)))
+ if ((Name == "fmod" && TLI->has(LibFunc_fmod)) ||
+ (Name == "fmodf" && TLI->has(LibFunc_fmodf)))
return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty);
- if ((Name == "atan2" && TLI->has(LibFunc::atan2)) ||
- (Name == "atan2f" && TLI->has(LibFunc::atan2f)))
+ if ((Name == "atan2" && TLI->has(LibFunc_atan2)) ||
+ (Name == "atan2f" && TLI->has(LibFunc_atan2f)) ||
+ (Name == "__atan2_finite" && TLI->has(LibFunc_atan2_finite)) ||
+ (Name == "__atan2f_finite" && TLI->has(LibFunc_atan2f_finite)))
return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty);
} else if (auto *Op2C = dyn_cast<ConstantInt>(Operands[1])) {
if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy())
@@ -1980,6 +2035,14 @@ Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
// Gather a column of constants.
for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) {
+ // These intrinsics use a scalar type for their second argument.
+ if (J == 1 &&
+ (IntrinsicID == Intrinsic::cttz || IntrinsicID == Intrinsic::ctlz ||
+ IntrinsicID == Intrinsic::powi)) {
+ Lane[J] = Operands[J];
+ continue;
+ }
+
Constant *Agg = Operands[J]->getAggregateElement(I);
if (!Agg)
return nullptr;
@@ -2000,8 +2063,11 @@ Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
} // end anonymous namespace
Constant *
-llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
+llvm::ConstantFoldCall(ImmutableCallSite CS, Function *F,
+ ArrayRef<Constant *> Operands,
const TargetLibraryInfo *TLI) {
+ if (CS.isNoBuiltin())
+ return nullptr;
if (!F->hasName())
return nullptr;
StringRef Name = F->getName();
@@ -2018,11 +2084,13 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
// FIXME: Refactor this code; this duplicates logic in LibCallsShrinkWrap
// (and to some extent ConstantFoldScalarCall).
+ if (CS.isNoBuiltin())
+ return false;
Function *F = CS.getCalledFunction();
if (!F)
return false;
- LibFunc::Func Func;
+ LibFunc Func;
if (!TLI || !TLI->getLibFunc(*F, Func))
return false;
@@ -2030,20 +2098,20 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
if (ConstantFP *OpC = dyn_cast<ConstantFP>(CS.getArgOperand(0))) {
const APFloat &Op = OpC->getValueAPF();
switch (Func) {
- case LibFunc::logl:
- case LibFunc::log:
- case LibFunc::logf:
- case LibFunc::log2l:
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log10l:
- case LibFunc::log10:
- case LibFunc::log10f:
+ case LibFunc_logl:
+ case LibFunc_log:
+ case LibFunc_logf:
+ case LibFunc_log2l:
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log10l:
+ case LibFunc_log10:
+ case LibFunc_log10f:
return Op.isNaN() || (!Op.isZero() && !Op.isNegative());
- case LibFunc::expl:
- case LibFunc::exp:
- case LibFunc::expf:
+ case LibFunc_expl:
+ case LibFunc_exp:
+ case LibFunc_expf:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
return Op.compare(APFloat(-745.0)) != APFloat::cmpLessThan &&
@@ -2053,9 +2121,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
Op.compare(APFloat(88.0f)) != APFloat::cmpGreaterThan;
break;
- case LibFunc::exp2l:
- case LibFunc::exp2:
- case LibFunc::exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
return Op.compare(APFloat(-1074.0)) != APFloat::cmpLessThan &&
@@ -2065,17 +2133,17 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
Op.compare(APFloat(127.0f)) != APFloat::cmpGreaterThan;
break;
- case LibFunc::sinl:
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::cosl:
- case LibFunc::cos:
- case LibFunc::cosf:
+ case LibFunc_sinl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_cosl:
+ case LibFunc_cos:
+ case LibFunc_cosf:
return !Op.isInfinity();
- case LibFunc::tanl:
- case LibFunc::tan:
- case LibFunc::tanf: {
+ case LibFunc_tanl:
+ case LibFunc_tan:
+ case LibFunc_tanf: {
// FIXME: Stop using the host math library.
// FIXME: The computation isn't done in the right precision.
Type *Ty = OpC->getType();
@@ -2086,23 +2154,23 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
break;
}
- case LibFunc::asinl:
- case LibFunc::asin:
- case LibFunc::asinf:
- case LibFunc::acosl:
- case LibFunc::acos:
- case LibFunc::acosf:
+ case LibFunc_asinl:
+ case LibFunc_asin:
+ case LibFunc_asinf:
+ case LibFunc_acosl:
+ case LibFunc_acos:
+ case LibFunc_acosf:
return Op.compare(APFloat(Op.getSemantics(), "-1")) !=
APFloat::cmpLessThan &&
Op.compare(APFloat(Op.getSemantics(), "1")) !=
APFloat::cmpGreaterThan;
- case LibFunc::sinh:
- case LibFunc::cosh:
- case LibFunc::sinhf:
- case LibFunc::coshf:
- case LibFunc::sinhl:
- case LibFunc::coshl:
+ case LibFunc_sinh:
+ case LibFunc_cosh:
+ case LibFunc_sinhf:
+ case LibFunc_coshf:
+ case LibFunc_sinhl:
+ case LibFunc_coshl:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
return Op.compare(APFloat(-710.0)) != APFloat::cmpLessThan &&
@@ -2112,9 +2180,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
Op.compare(APFloat(89.0f)) != APFloat::cmpGreaterThan;
break;
- case LibFunc::sqrtl:
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
+ case LibFunc_sqrtl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
return Op.isNaN() || Op.isZero() || !Op.isNegative();
// FIXME: Add more functions: sqrt_finite, atanh, expm1, log1p,
@@ -2133,9 +2201,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
const APFloat &Op1 = Op1C->getValueAPF();
switch (Func) {
- case LibFunc::powl:
- case LibFunc::pow:
- case LibFunc::powf: {
+ case LibFunc_powl:
+ case LibFunc_pow:
+ case LibFunc_powf: {
// FIXME: Stop using the host math library.
// FIXME: The computation isn't done in the right precision.
Type *Ty = Op0C->getType();
@@ -2149,9 +2217,9 @@ bool llvm::isMathLibCallNoop(CallSite CS, const TargetLibraryInfo *TLI) {
break;
}
- case LibFunc::fmodl:
- case LibFunc::fmod:
- case LibFunc::fmodf:
+ case LibFunc_fmodl:
+ case LibFunc_fmod:
+ case LibFunc_fmodf:
return Op0.isNaN() || Op1.isNaN() ||
(!Op0.isInfinity() && !Op1.isZero());
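The isMathLibCallNoop boundary checks above follow one idea: a libm call is removable only when the constant argument provably does not set errno or trap. A small standalone sketch of the exp() window on doubles (the bounds in the code are deliberately conservative):

    #include <cmath>
    #include <cstdio>

    int main() {
      // Inside roughly [-745, 709], exp() on doubles stays finite; outside
      // it overflows or underflows, which is why out-of-range constants
      // block the "call is a no-op" conclusion.
      std::printf("%g\n", std::exp(700.0)); // finite
      std::printf("%g\n", std::exp(800.0)); // inf; sets errno on most libcs
    }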
diff --git a/contrib/llvm/lib/Analysis/CostModel.cpp b/contrib/llvm/lib/Analysis/CostModel.cpp
index 6b773979..32bfea5 100644
--- a/contrib/llvm/lib/Analysis/CostModel.cpp
+++ b/contrib/llvm/lib/Analysis/CostModel.cpp
@@ -447,25 +447,25 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
case Instruction::Select: {
const SelectInst *SI = cast<SelectInst>(I);
Type *CondTy = SI->getCondition()->getType();
- return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
+ return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
- return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
+ return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I);
}
case Instruction::Store: {
const StoreInst *SI = cast<StoreInst>(I);
Type *ValTy = SI->getValueOperand()->getType();
return TTI->getMemoryOpCost(I->getOpcode(), ValTy,
- SI->getAlignment(),
- SI->getPointerAddressSpace());
+ SI->getAlignment(),
+ SI->getPointerAddressSpace(), I);
}
case Instruction::Load: {
const LoadInst *LI = cast<LoadInst>(I);
return TTI->getMemoryOpCost(I->getOpcode(), I->getType(),
- LI->getAlignment(),
- LI->getPointerAddressSpace());
+ LI->getAlignment(),
+ LI->getPointerAddressSpace(), I);
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -481,7 +481,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
case Instruction::BitCast:
case Instruction::AddrSpaceCast: {
Type *SrcTy = I->getOperand(0)->getType();
- return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
+ return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, I);
}
case Instruction::ExtractElement: {
const ExtractElementInst * EEI = cast<ExtractElementInst>(I);
@@ -542,9 +542,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
}
case Instruction::Call:
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- SmallVector<Value *, 4> Args;
- for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
- Args.push_back(II->getArgOperand(J));
+ SmallVector<Value *, 4> Args(II->arg_operands());
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
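The CostModel cleanup above also replaces a hand-written push_back loop with SmallVector's range construction. A standalone sketch of the idiom:

    #include "llvm/ADT/SmallVector.h"
    #include <vector>

    int main() {
      std::vector<int> Src = {1, 2, 3};
      // Any iterator pair (or range) initializes a SmallVector directly.
      llvm::SmallVector<int, 4> Dst(Src.begin(), Src.end());
      return Dst.size() == 3 ? 0 : 1;
    }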
diff --git a/contrib/llvm/lib/Analysis/DemandedBits.cpp b/contrib/llvm/lib/Analysis/DemandedBits.cpp
index 688c1db..9c53f91 100644
--- a/contrib/llvm/lib/Analysis/DemandedBits.cpp
+++ b/contrib/llvm/lib/Analysis/DemandedBits.cpp
@@ -37,6 +37,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -72,8 +73,7 @@ static bool isAlwaysLive(Instruction *I) {
void DemandedBits::determineLiveOperandBits(
const Instruction *UserI, const Instruction *I, unsigned OperandNo,
- const APInt &AOut, APInt &AB, APInt &KnownZero, APInt &KnownOne,
- APInt &KnownZero2, APInt &KnownOne2) {
+ const APInt &AOut, APInt &AB, KnownBits &Known, KnownBits &Known2) {
unsigned BitWidth = AB.getBitWidth();
// We're called once per operand, but for some instructions, we need to
@@ -85,16 +85,12 @@ void DemandedBits::determineLiveOperandBits(
auto ComputeKnownBits =
[&](unsigned BitWidth, const Value *V1, const Value *V2) {
const DataLayout &DL = I->getModule()->getDataLayout();
- KnownZero = APInt(BitWidth, 0);
- KnownOne = APInt(BitWidth, 0);
- computeKnownBits(const_cast<Value *>(V1), KnownZero, KnownOne, DL, 0,
- &AC, UserI, &DT);
+ Known = KnownBits(BitWidth);
+ computeKnownBits(V1, Known, DL, 0, &AC, UserI, &DT);
if (V2) {
- KnownZero2 = APInt(BitWidth, 0);
- KnownOne2 = APInt(BitWidth, 0);
- computeKnownBits(const_cast<Value *>(V2), KnownZero2, KnownOne2, DL,
- 0, &AC, UserI, &DT);
+ Known2 = KnownBits(BitWidth);
+ computeKnownBits(V2, Known2, DL, 0, &AC, UserI, &DT);
}
};
@@ -110,6 +106,11 @@ void DemandedBits::determineLiveOperandBits(
// the output.
AB = AOut.byteSwap();
break;
+ case Intrinsic::bitreverse:
+ // The alive bits of the input are the reversed alive bits of
+ // the output.
+ AB = AOut.reverseBits();
+ break;
case Intrinsic::ctlz:
if (OperandNo == 0) {
// We need some output bits, so we need all bits of the
@@ -117,7 +118,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getHighBitsSet(BitWidth,
- std::min(BitWidth, KnownOne.countLeadingZeros()+1));
+ std::min(BitWidth, Known.countMaxLeadingZeros()+1));
}
break;
case Intrinsic::cttz:
@@ -127,7 +128,7 @@ void DemandedBits::determineLiveOperandBits(
// known to be one.
ComputeKnownBits(BitWidth, I, nullptr);
AB = APInt::getLowBitsSet(BitWidth,
- std::min(BitWidth, KnownOne.countTrailingZeros()+1));
+ std::min(BitWidth, Known.countMaxTrailingZeros()+1));
}
break;
}
@@ -142,9 +143,8 @@ void DemandedBits::determineLiveOperandBits(
break;
case Instruction::Shl:
if (OperandNo == 0)
- if (ConstantInt *CI =
- dyn_cast<ConstantInt>(UserI->getOperand(1))) {
- uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+ if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+ uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
AB = AOut.lshr(ShiftAmt);
// If the shift is nuw/nsw, then the high bits are not dead
@@ -158,9 +158,8 @@ void DemandedBits::determineLiveOperandBits(
break;
case Instruction::LShr:
if (OperandNo == 0)
- if (ConstantInt *CI =
- dyn_cast<ConstantInt>(UserI->getOperand(1))) {
- uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+ if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+ uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
AB = AOut.shl(ShiftAmt);
// If the shift is exact, then the low bits are not dead
@@ -171,16 +170,15 @@ void DemandedBits::determineLiveOperandBits(
break;
case Instruction::AShr:
if (OperandNo == 0)
- if (ConstantInt *CI =
- dyn_cast<ConstantInt>(UserI->getOperand(1))) {
- uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+ if (auto *ShiftAmtC = dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+ uint64_t ShiftAmt = ShiftAmtC->getLimitedValue(BitWidth - 1);
AB = AOut.shl(ShiftAmt);
// Because the high input bit is replicated into the
// high-order bits of the result, if we need any of those
// bits, then we must keep the highest input bit.
if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt))
.getBoolValue())
- AB.setBit(BitWidth-1);
+ AB.setSignBit();
// If the shift is exact, then the low bits are not dead
// (they must be zero).
@@ -197,11 +195,11 @@ void DemandedBits::determineLiveOperandBits(
// dead).
if (OperandNo == 0) {
ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
- AB &= ~KnownZero2;
+ AB &= ~Known2.Zero;
} else {
if (!isa<Instruction>(UserI->getOperand(0)))
ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
- AB &= ~(KnownZero & ~KnownZero2);
+ AB &= ~(Known.Zero & ~Known2.Zero);
}
break;
case Instruction::Or:
@@ -213,11 +211,11 @@ void DemandedBits::determineLiveOperandBits(
// dead).
if (OperandNo == 0) {
ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
- AB &= ~KnownOne2;
+ AB &= ~Known2.One;
} else {
if (!isa<Instruction>(UserI->getOperand(0)))
ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
- AB &= ~(KnownOne & ~KnownOne2);
+ AB &= ~(Known.One & ~Known2.One);
}
break;
case Instruction::Xor:
@@ -238,7 +236,7 @@ void DemandedBits::determineLiveOperandBits(
if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(),
AOut.getBitWidth() - BitWidth))
.getBoolValue())
- AB.setBit(BitWidth-1);
+ AB.setSignBit();
break;
case Instruction::Select:
if (OperandNo != 0)
@@ -315,7 +313,7 @@ void DemandedBits::performAnalysis() {
if (!UserI->getType()->isIntegerTy())
Visited.insert(UserI);
- APInt KnownZero, KnownOne, KnownZero2, KnownOne2;
+ KnownBits Known, Known2;
// Compute the set of alive bits for each operand. These are anded into the
// existing set, if any, and if that changes the set of alive bits, the
// operand is added to the work-list.
@@ -332,8 +330,7 @@ void DemandedBits::performAnalysis() {
// Bits of each operand that are used to compute alive bits of the
// output are alive, all others are dead.
determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB,
- KnownZero, KnownOne,
- KnownZero2, KnownOne2);
+ Known, Known2);
}
// If we've added to the set of alive bits (or the operand has not
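The ctlz/cttz cases above rely on one observation: the leading-zero count is fixed once you reach the highest bit that can possibly be one, so only the bits at or above it are demanded. A hedged helper spelling out the ctlz side:

    #include "llvm/ADT/APInt.h"
    #include "llvm/Support/KnownBits.h"
    #include <algorithm>

    using namespace llvm;

    // Sketch: countMaxLeadingZeros() bounds where the first one bit can be,
    // and the +1 keeps that first possibly-set bit itself alive.
    static APInt demandedForCtlz(const KnownBits &Known) {
      unsigned BitWidth = Known.getBitWidth();
      return APInt::getHighBitsSet(
          BitWidth, std::min(BitWidth, Known.countMaxLeadingZeros() + 1));
    }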
diff --git a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
index a332a07..34eccc0 100644
--- a/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -385,9 +385,9 @@ void DependenceInfo::Constraint::setAny(ScalarEvolution *NewSE) {
Kind = Any;
}
-
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// For debugging purposes. Dumps the constraint out to OS.
-void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
+LLVM_DUMP_METHOD void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
if (isEmpty())
OS << " Empty\n";
else if (isAny())
@@ -403,6 +403,7 @@ void DependenceInfo::Constraint::dump(raw_ostream &OS) const {
else
llvm_unreachable("unknown constraint type in Constraint::dump");
}
+#endif
// Updates X with the intersection
@@ -2983,7 +2984,7 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst,
SmallVectorImpl<Constraint> &Constraints,
bool &Consistent) {
bool Result = false;
- for (int LI = Loops.find_first(); LI >= 0; LI = Loops.find_next(LI)) {
+ for (unsigned LI : Loops.set_bits()) {
DEBUG(dbgs() << "\t Constraint[" << LI << "] is");
DEBUG(Constraints[LI].dump(dbgs()));
if (Constraints[LI].isDistance())
@@ -3265,7 +3266,7 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
// For debugging purposes, dump a small bit vector to dbgs().
static void dumpSmallBitVector(SmallBitVector &BV) {
dbgs() << "{";
- for (int VI = BV.find_first(); VI >= 0; VI = BV.find_next(VI)) {
+ for (unsigned VI : BV.set_bits()) {
dbgs() << VI;
if (BV.find_next(VI) >= 0)
dbgs() << ' ';
@@ -3341,7 +3342,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
UsefulGEP = isLoopInvariant(SrcPtrSCEV, LI->getLoopFor(Src->getParent())) &&
isLoopInvariant(DstPtrSCEV, LI->getLoopFor(Dst->getParent())) &&
- (SrcGEP->getNumOperands() == DstGEP->getNumOperands());
+ (SrcGEP->getNumOperands() == DstGEP->getNumOperands()) &&
+ isKnownPredicate(CmpInst::ICMP_EQ, SrcPtrSCEV, DstPtrSCEV);
}
unsigned Pairs = UsefulGEP ? SrcGEP->idx_end() - SrcGEP->idx_begin() : 1;
SmallVector<Subscript, 4> Pair(Pairs);
@@ -3370,7 +3372,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
if (Delinearize && CommonLevels > 1) {
if (tryDelinearize(Src, Dst, Pair)) {
- DEBUG(dbgs() << " delinerized GEP\n");
+ DEBUG(dbgs() << " delinearized GEP\n");
Pairs = Pair.size();
}
}
@@ -3505,7 +3507,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
NewConstraint.setAny(SE);
// test separable subscripts
- for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+ for (unsigned SI : Separable.set_bits()) {
DEBUG(dbgs() << "testing subscript " << SI);
switch (Pair[SI].Classification) {
case Subscript::ZIV:
@@ -3544,14 +3546,14 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
for (unsigned II = 0; II <= MaxLevels; ++II)
Constraints[II].setAny(SE);
- for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+ for (unsigned SI : Coupled.set_bits()) {
DEBUG(dbgs() << "testing subscript group " << SI << " { ");
SmallBitVector Group(Pair[SI].Group);
SmallBitVector Sivs(Pairs);
SmallBitVector Mivs(Pairs);
SmallBitVector ConstrainedLevels(MaxLevels + 1);
SmallVector<Subscript *, 4> PairsInGroup;
- for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+ for (unsigned SJ : Group.set_bits()) {
DEBUG(dbgs() << SJ << " ");
if (Pair[SJ].Classification == Subscript::SIV)
Sivs.set(SJ);
@@ -3563,7 +3565,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
DEBUG(dbgs() << "}\n");
while (Sivs.any()) {
bool Changed = false;
- for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+ for (unsigned SJ : Sivs.set_bits()) {
DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n");
// SJ is an SIV subscript that's part of the current coupled group
unsigned Level;
@@ -3587,7 +3589,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
DEBUG(dbgs() << " propagating\n");
DEBUG(dbgs() << "\tMivs = ");
DEBUG(dumpSmallBitVector(Mivs));
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
// SJ is an MIV subscript that's part of the current coupled group
DEBUG(dbgs() << "\tSJ = " << SJ << "\n");
if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops,
@@ -3621,7 +3623,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
}
// test & propagate remaining RDIVs
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
if (Pair[SJ].Classification == Subscript::RDIV) {
DEBUG(dbgs() << "RDIV test\n");
if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
@@ -3634,7 +3636,7 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
// test remaining MIVs
// This code is temporary.
// Better to somehow test all remaining subscripts simultaneously.
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
if (Pair[SJ].Classification == Subscript::MIV) {
DEBUG(dbgs() << "MIV test\n");
if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result))
@@ -3646,9 +3648,8 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst,
// update Result.DV from constraint vector
DEBUG(dbgs() << " updating\n");
- for (int SJ = ConstrainedLevels.find_first(); SJ >= 0;
- SJ = ConstrainedLevels.find_next(SJ)) {
- if (SJ > (int)CommonLevels)
+ for (unsigned SJ : ConstrainedLevels.set_bits()) {
+ if (SJ > CommonLevels)
break;
updateDirection(Result.DV[SJ - 1], Constraints[SJ]);
if (Result.DV[SJ - 1].Direction == Dependence::DVEntry::NONE)
@@ -3796,7 +3797,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
if (Delinearize && CommonLevels > 1) {
if (tryDelinearize(Src, Dst, Pair)) {
- DEBUG(dbgs() << " delinerized GEP\n");
+ DEBUG(dbgs() << " delinearized GEP\n");
Pairs = Pair.size();
}
}
@@ -3858,7 +3859,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
NewConstraint.setAny(SE);
// test separable subscripts
- for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+ for (unsigned SI : Separable.set_bits()) {
switch (Pair[SI].Classification) {
case Subscript::SIV: {
unsigned Level;
@@ -3885,12 +3886,12 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
for (unsigned II = 0; II <= MaxLevels; ++II)
Constraints[II].setAny(SE);
- for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+ for (unsigned SI : Coupled.set_bits()) {
SmallBitVector Group(Pair[SI].Group);
SmallBitVector Sivs(Pairs);
SmallBitVector Mivs(Pairs);
SmallBitVector ConstrainedLevels(MaxLevels + 1);
- for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+ for (unsigned SJ : Group.set_bits()) {
if (Pair[SJ].Classification == Subscript::SIV)
Sivs.set(SJ);
else
@@ -3898,7 +3899,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
}
while (Sivs.any()) {
bool Changed = false;
- for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+ for (unsigned SJ : Sivs.set_bits()) {
// SJ is an SIV subscript that's part of the current coupled group
unsigned Level;
const SCEV *SplitIter = nullptr;
@@ -3913,7 +3914,7 @@ const SCEV *DependenceInfo::getSplitIteration(const Dependence &Dep,
}
if (Changed) {
// propagate, possibly creating new SIVs and ZIVs
- for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ for (unsigned SJ : Mivs.set_bits()) {
// SJ is an MIV subscript that's part of the current coupled group
if (propagate(Pair[SJ].Src, Pair[SJ].Dst,
Pair[SJ].Loops, Constraints, Result.Consistent)) {
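Most of the mechanical churn in this file is the same iteration rewrite, so it is worth seeing once in isolation. A short sketch of the two equivalent loops over llvm::SmallBitVector, using the set_bits() range that the diff itself relies on:

#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Both loops visit bits 1 and 5; set_bits() removes the signed/unsigned
// mismatch around find_first()/find_next() returning -1 at the end, which is
// why the loop indices above change from int to unsigned.
int main() {
  SmallBitVector BV(8);
  BV.set(1);
  BV.set(5);

  for (int I = BV.find_first(); I >= 0; I = BV.find_next(I)) // old style
    outs() << I << ' ';
  outs() << '\n';

  for (unsigned I : BV.set_bits()) // new style
    outs() << I << ' ';
  outs() << '\n';
}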
diff --git a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp
index 1b36569..2d39a0b 100644
--- a/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/DivergenceAnalysis.cpp
@@ -241,7 +241,7 @@ void DivergencePropagator::exploreDataDependency(Value *V) {
// Follow def-use chains of V.
for (User *U : V->users()) {
Instruction *UserInst = cast<Instruction>(U);
- if (DV.insert(UserInst).second)
+ if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second)
Worklist.push_back(UserInst);
}
}
diff --git a/contrib/llvm/lib/Analysis/DomPrinter.cpp b/contrib/llvm/lib/Analysis/DomPrinter.cpp
index 7acfb41..8abc0e7 100644
--- a/contrib/llvm/lib/Analysis/DomPrinter.cpp
+++ b/contrib/llvm/lib/Analysis/DomPrinter.cpp
@@ -80,6 +80,22 @@ struct DOTGraphTraits<PostDominatorTree*>
};
}
+void DominatorTree::viewGraph(const Twine &Name, const Twine &Title) {
+#ifndef NDEBUG
+ ViewGraph(this, Name, false, Title);
+#else
+ errs() << "DomTree dump not available, build with DEBUG\n";
+#endif // NDEBUG
+}
+
+void DominatorTree::viewGraph() {
+#ifndef NDEBUG
+ this->viewGraph("domtree", "Dominator Tree for function");
+#else
+ errs() << "DomTree dump not available, build with DEBUG\n";
+#endif // NDEBUG
+}
+
namespace {
struct DominatorTreeWrapperPassAnalysisGraphTraits {
static DominatorTree *getGraph(DominatorTreeWrapperPass *DTWP) {
diff --git a/contrib/llvm/lib/Analysis/DominanceFrontier.cpp b/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
index 15856c3..c08c6cf 100644
--- a/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
+++ b/contrib/llvm/lib/Analysis/DominanceFrontier.cpp
@@ -14,7 +14,8 @@
using namespace llvm;
namespace llvm {
-template class DominanceFrontierBase<BasicBlock>;
+template class DominanceFrontierBase<BasicBlock, false>;
+template class DominanceFrontierBase<BasicBlock, true>;
template class ForwardDominanceFrontierBase<BasicBlock>;
}
@@ -56,6 +57,16 @@ LLVM_DUMP_METHOD void DominanceFrontierWrapperPass::dump() const {
}
#endif
+/// Handle invalidation explicitly.
+bool DominanceFrontier::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<DominanceFrontierAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
AnalysisKey DominanceFrontierAnalysis::Key;
DominanceFrontier DominanceFrontierAnalysis::run(Function &F,
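The invalidate() hook added above only returns false when a transform marks the right preserved sets. A hypothetical producer-side sketch, assuming the standard new-pass-manager APIs, of a pass whose result keeps DominanceFrontier alive because it leaves the CFG untouched:

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Hypothetical pass for illustration: it rewrites instructions in place and
// never adds or removes blocks or edges, so it reports CFGAnalyses as
// preserved. The invalidate() above then returns false and the cached
// DominanceFrontier survives.
struct NoCFGChangePass : PassInfoMixin<NoCFGChangePass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
    // ... in-place instruction rewrites would go here ...
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
};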
diff --git a/contrib/llvm/lib/Analysis/EHPersonalities.cpp b/contrib/llvm/lib/Analysis/EHPersonalities.cpp
index ebf0a37..b12ae98 100644
--- a/contrib/llvm/lib/Analysis/EHPersonalities.cpp
+++ b/contrib/llvm/lib/Analysis/EHPersonalities.cpp
@@ -27,8 +27,10 @@ EHPersonality llvm::classifyEHPersonality(const Value *Pers) {
return StringSwitch<EHPersonality>(F->getName())
.Case("__gnat_eh_personality", EHPersonality::GNU_Ada)
.Case("__gxx_personality_v0", EHPersonality::GNU_CXX)
+ .Case("__gxx_personality_seh0",EHPersonality::GNU_CXX)
.Case("__gxx_personality_sj0", EHPersonality::GNU_CXX_SjLj)
.Case("__gcc_personality_v0", EHPersonality::GNU_C)
+ .Case("__gcc_personality_seh0",EHPersonality::GNU_C)
.Case("__gcc_personality_sj0", EHPersonality::GNU_C_SjLj)
.Case("__objc_personality_v0", EHPersonality::GNU_ObjC)
.Case("_except_handler3", EHPersonality::MSVC_X86SEH)
diff --git a/contrib/llvm/lib/Analysis/GlobalsModRef.cpp b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp
index 33f00cb..4ef0233 100644
--- a/contrib/llvm/lib/Analysis/GlobalsModRef.cpp
+++ b/contrib/llvm/lib/Analysis/GlobalsModRef.cpp
@@ -475,7 +475,9 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
const std::vector<CallGraphNode *> &SCC = *I;
assert(!SCC.empty() && "SCC with no functions?");
- if (!SCC[0]->getFunction() || !SCC[0]->getFunction()->isDefinitionExact()) {
+ Function *F = SCC[0]->getFunction();
+
+ if (!F || !F->isDefinitionExact()) {
// Calls externally or not exact - can't say anything useful. Remove any
// existing function records (may have been created when scanning
// globals).
@@ -484,19 +486,18 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
continue;
}
- FunctionInfo &FI = FunctionInfos[SCC[0]->getFunction()];
+ FunctionInfo &FI = FunctionInfos[F];
bool KnowNothing = false;
// Collect the mod/ref properties due to called functions. We only compute
// one mod-ref set.
for (unsigned i = 0, e = SCC.size(); i != e && !KnowNothing; ++i) {
- Function *F = SCC[i]->getFunction();
if (!F) {
KnowNothing = true;
break;
}
- if (F->isDeclaration()) {
+ if (F->isDeclaration() || F->hasFnAttribute(Attribute::OptimizeNone)) {
// Try to get mod/ref behaviour from function attributes.
if (F->doesNotAccessMemory()) {
// Can't do better than that!
@@ -545,6 +546,13 @@ void GlobalsAAResult::AnalyzeCallGraph(CallGraph &CG, Module &M) {
for (auto *Node : SCC) {
if (FI.getModRefInfo() == MRI_ModRef)
break; // The mod/ref lattice saturates here.
+
+ // Don't prove any properties based on the implementation of an optnone
+ // function. Function attributes were already used as a best approximation
+ // above.
+ if (Node->getFunction()->hasFnAttribute(Attribute::OptimizeNone))
+ continue;
+
for (Instruction &I : instructions(Node->getFunction())) {
if (FI.getModRefInfo() == MRI_ModRef)
break; // The mod/ref lattice saturates here.
diff --git a/contrib/llvm/lib/Analysis/IVUsers.cpp b/contrib/llvm/lib/Analysis/IVUsers.cpp
index a661b01..c30feb9 100644
--- a/contrib/llvm/lib/Analysis/IVUsers.cpp
+++ b/contrib/llvm/lib/Analysis/IVUsers.cpp
@@ -76,9 +76,8 @@ static bool isInteresting(const SCEV *S, const Instruction *I, const Loop *L,
// An add is interesting if exactly one of its operands is interesting.
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
bool AnyInterestingYet = false;
- for (SCEVAddExpr::op_iterator OI = Add->op_begin(), OE = Add->op_end();
- OI != OE; ++OI)
- if (isInteresting(*OI, I, L, SE, LI)) {
+ for (const auto *Op : Add->operands())
+ if (isInteresting(Op, I, L, SE, LI)) {
if (AnyInterestingYet)
return false;
AnyInterestingYet = true;
@@ -118,6 +117,50 @@ static bool isSimplifiedLoopNest(BasicBlock *BB, const DominatorTree *DT,
return true;
}
+/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
+/// and now we need to decide whether the user should use the pre-inc or post-inc
+/// value. If this user should use the post-inc version of the IV, return true.
+///
+/// Choosing wrong here can break dominance properties (if we choose to use the
+/// post-inc value when we cannot) or it can end up adding extra live-ranges to
+/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
+/// should use the post-inc value).
+static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
+ const Loop *L, DominatorTree *DT) {
+ // If the user is in the loop, use the preinc value.
+ if (L->contains(User))
+ return false;
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ if (!LatchBlock)
+ return false;
+
+ // Ok, the user is outside of the loop. If it is dominated by the latch
+ // block, use the post-inc value.
+ if (DT->dominates(LatchBlock, User->getParent()))
+ return true;
+
+ // There is one case we have to be careful of: PHI nodes. These little guys
+ // can live in blocks that are not dominated by the latch block, but (since
+ // their uses occur in the predecessor block, not the block the PHI lives in)
+ // should still use the post-inc value. Check for this case now.
+ PHINode *PN = dyn_cast<PHINode>(User);
+ if (!PN || !Operand)
+ return false; // not a phi, not dominated by latch block.
+
+ // Look at all of the uses of Operand by the PHI node. If any use corresponds
+ // to a block that is not dominated by the latch block, give up and use the
+ // preincremented value.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == Operand &&
+ !DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
+ return false;
+
+ // Okay, all uses of Operand by PN are in predecessor blocks that really are
+ // dominated by the latch block. Use the post-incremented value.
+ return true;
+}
+
/// AddUsersImpl - Inspect the specified instruction. If it is a
/// reducible SCEV, recursively add its users to the IVUsesByStride set and
/// return true. Otherwise, return false.
@@ -208,10 +251,16 @@ bool IVUsers::AddUsersImpl(Instruction *I,
// The regular return value here is discarded; instead of recording
// it, we just recompute it when we need it.
const SCEV *OriginalISE = ISE;
- ISE = TransformForPostIncUse(NormalizeAutodetect,
- ISE, User, I,
- NewUse.PostIncLoops,
- *SE, *DT);
+
+ auto NormalizePred = [&](const SCEVAddRecExpr *AR) {
+ auto *L = AR->getLoop();
+ bool Result = IVUseShouldUsePostIncValue(User, I, L, DT);
+ if (Result)
+ NewUse.PostIncLoops.insert(L);
+ return Result;
+ };
+
+ ISE = normalizeForPostIncUseIf(ISE, NormalizePred, *SE);
// PostIncNormalization effectively simplifies the expression under
// pre-increment assumptions. Those assumptions (no wrapping) might not
@@ -219,8 +268,7 @@ bool IVUsers::AddUsersImpl(Instruction *I,
// transformation is invertible.
if (OriginalISE != ISE) {
const SCEV *DenormalizedISE =
- TransformForPostIncUse(Denormalize, ISE, User, I,
- NewUse.PostIncLoops, *SE, *DT);
+ denormalizeForPostIncUse(ISE, NewUse.PostIncLoops, *SE);
// If we normalized the expression, but denormalization doesn't give the
// original one, discard this user.
@@ -338,11 +386,8 @@ const SCEV *IVUsers::getReplacementExpr(const IVStrideUse &IU) const {
/// getExpr - Return the expression for the use.
const SCEV *IVUsers::getExpr(const IVStrideUse &IU) const {
- return
- TransformForPostIncUse(Normalize, getReplacementExpr(IU),
- IU.getUser(), IU.getOperandValToReplace(),
- const_cast<PostIncLoopSet &>(IU.getPostIncLoops()),
- *SE, *DT);
+ return normalizeForPostIncUse(getReplacementExpr(IU), IU.getPostIncLoops(),
+ *SE);
}
static const SCEVAddRecExpr *findAddRecForLoop(const SCEV *S, const Loop *L) {
@@ -353,9 +398,8 @@ static const SCEVAddRecExpr *findAddRecForLoop(const SCEV *S, const Loop *L) {
}
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
- I != E; ++I)
- if (const SCEVAddRecExpr *AR = findAddRecForLoop(*I, L))
+ for (const auto *Op : Add->operands())
+ if (const SCEVAddRecExpr *AR = findAddRecForLoop(Op, L))
return AR;
return nullptr;
}
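The pre-inc/post-inc distinction that IVUseShouldUsePostIncValue encodes is easiest to see on a concrete loop. A standalone illustration in plain C++, not SCEV:

#include <cassert>

// A user inside the loop body sees the pre-incremented induction variable;
// a user after the latch (dominated by it) can only see the incremented
// value, which is why such users should be normalized to the post-inc form.
int main() {
  int i = 0;
  int sum = 0;
  for (; i != 4; ++i)
    sum += i; // in-loop user: observes 0, 1, 2, 3
  assert(sum == 6);
  assert(i == 4); // user past the latch: observes the post-inc value
}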
diff --git a/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index 3da33ac..ed233d2 100644
--- a/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -43,7 +43,7 @@ static cl::opt<unsigned>
// The percent threshold for the direct-call target (this call site vs the
// total call count) for it to be considered a promotion target.
static cl::opt<unsigned>
- ICPPercentThreshold("icp-percent-threshold", cl::init(33), cl::Hidden,
+ ICPPercentThreshold("icp-percent-threshold", cl::init(30), cl::Hidden,
cl::ZeroOrMore,
cl::desc("The percentage threshold for the promotion"));
diff --git a/contrib/llvm/lib/Analysis/InlineCost.cpp b/contrib/llvm/lib/Analysis/InlineCost.cpp
index 4109049..3569366 100644
--- a/contrib/llvm/lib/Analysis/InlineCost.cpp
+++ b/contrib/llvm/lib/Analysis/InlineCost.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -48,11 +49,16 @@ static cl::opt<int> HintThreshold(
"inlinehint-threshold", cl::Hidden, cl::init(325),
cl::desc("Threshold for inlining functions with inline hint"));
+static cl::opt<int>
+ ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
+ cl::init(45),
+ cl::desc("Threshold for inlining cold callsites"));
+
// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
- "inlinecold-threshold", cl::Hidden, cl::init(225),
+ "inlinecold-threshold", cl::Hidden, cl::init(45),
cl::desc("Threshold for inlining functions with cold attribute"));
static cl::opt<int>
@@ -60,6 +66,12 @@ static cl::opt<int>
cl::ZeroOrMore,
cl::desc("Threshold for hot callsites "));
+static cl::opt<int> ColdCallSiteRelFreq(
+ "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+ cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
+ "entry frequency, for a callsite to be cold in the absence of "
+ "profile information."));
+
namespace {
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -72,12 +84,18 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Getter for the cache of @llvm.assume intrinsics.
std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
+ /// Getter for BlockFrequencyInfo
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
+
/// Profile summary information.
ProfileSummaryInfo *PSI;
/// The called function.
Function &F;
+ // Cache the DataLayout since we use it a lot.
+ const DataLayout &DL;
+
/// The candidate callsite being analyzed. Please do not use this to do
/// analysis in the caller function; we want the inline cost query to be
/// easily cacheable. Instead, use the cover function paramHasAttr.
@@ -133,9 +151,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
void disableSROA(Value *V);
void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
int InstructionCost);
- bool isGEPOffsetConstant(GetElementPtrInst &GEP);
+ bool isGEPFree(GetElementPtrInst &GEP);
bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
bool simplifyCallSite(Function *F, CallSite CS);
+ template <typename Callable>
+ bool simplifyInstruction(Instruction &I, Callable Evaluate);
ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
/// Return true if the given argument to the function being considered for
@@ -158,6 +178,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
/// Return true if size growth is allowed when inlining the callee at CS.
bool allowSizeGrowth(CallSite CS);
+ /// Return true if \p CS is a cold callsite.
+ bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
+
// Custom analysis routines.
bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
@@ -202,9 +225,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
public:
CallAnalyzer(const TargetTransformInfo &TTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
const InlineParams &Params)
- : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee),
+ : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
+ PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()),
CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
ExposesReturnsTwice(false), HasDynamicAlloca(false),
@@ -286,23 +311,11 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
SROACostSavings += InstructionCost;
}
-/// \brief Check whether a GEP's indices are all constant.
-///
-/// Respects any simplified values known during the analysis of this callsite.
-bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
- for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
- if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
- return false;
-
- return true;
-}
-
/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
- const DataLayout &DL = F.getParent()->getDataLayout();
unsigned IntPtrWidth = DL.getPointerSizeInBits();
assert(IntPtrWidth == Offset.getBitWidth());
@@ -331,13 +344,27 @@ bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
return true;
}
+/// \brief Use TTI to check whether a GEP is free.
+///
+/// Respects any simplified values known during the analysis of this callsite.
+bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
+ SmallVector<Value *, 4> Indices;
+ for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+ if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
+ Indices.push_back(SimpleOp);
+ else
+ Indices.push_back(*I);
+ return TargetTransformInfo::TCC_Free ==
+ TTI.getGEPCost(GEP.getSourceElementType(), GEP.getPointerOperand(),
+ Indices);
+}
+
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// Check whether inlining will turn a dynamic alloca into a static
// alloca and handle that case.
if (I.isArrayAllocation()) {
Constant *Size = SimplifiedValues.lookup(I.getArraySize());
if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
- const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
AllocatedSize = SaturatingMultiplyAdd(
AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
@@ -347,7 +374,6 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// Accumulate the allocated size.
if (I.isStaticAlloca()) {
- const DataLayout &DL = F.getParent()->getDataLayout();
Type *Ty = I.getAllocatedType();
AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
}
@@ -396,7 +422,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
// Non-constant GEPs aren't folded, and disable SROA.
if (SROACandidate)
disableSROA(CostIt);
- return false;
+ return isGEPFree(I);
}
// Add the result as a new mapping to Base + Offset.
@@ -411,7 +437,15 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
}
}
- if (isGEPOffsetConstant(I)) {
+ // Lambda to check whether a GEP's indices are all constant.
+ auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
+ for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+ if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
+ return false;
+ return true;
+ };
+
+ if (IsGEPOffsetConstant(I)) {
if (SROACandidate)
SROAArgValues[&I] = SROAArg;
@@ -422,19 +456,36 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
// Variable GEPs will require math and will disable SROA.
if (SROACandidate)
disableSROA(CostIt);
- return false;
+ return isGEPFree(I);
+}
+
+/// Simplify \p I if its operands are constants and update SimplifiedValues.
+/// \p Evaluate is a callable specific to the instruction type that evaluates
+/// the instruction when all of its operands are constants.
+template <typename Callable>
+bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
+ SmallVector<Constant *, 2> COps;
+ for (Value *Op : I.operands()) {
+ Constant *COp = dyn_cast<Constant>(Op);
+ if (!COp)
+ COp = SimplifiedValues.lookup(Op);
+ if (!COp)
+ return false;
+ COps.push_back(COp);
+ }
+ auto *C = Evaluate(COps);
+ if (!C)
+ return false;
+ SimplifiedValues[&I] = C;
+ return true;
}
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
// Propagate constants through bitcasts.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getBitCast(COps[0], I.getType());
+ }))
+ return true;
// Track base/offsets through casts
std::pair<Value *, APInt> BaseAndOffset =
@@ -455,19 +506,14 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
// Propagate constants through ptrtoint.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getPtrToInt(COps[0], I.getType());
+ }))
+ return true;
// Track base/offset pairs when converted to a plain integer provided the
// integer is large enough to represent the pointer.
unsigned IntegerSize = I.getType()->getScalarSizeInBits();
- const DataLayout &DL = F.getParent()->getDataLayout();
if (IntegerSize >= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset =
ConstantOffsetPtrs.lookup(I.getOperand(0));
@@ -492,20 +538,15 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
// Propagate constants through ptrtoint.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getIntToPtr(COps[0], I.getType());
+ }))
+ return true;
// Track base/offset pairs when round-tripped through a pointer without
// modifications provided the integer is not too large.
Value *Op = I.getOperand(0);
unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
- const DataLayout &DL = F.getParent()->getDataLayout();
if (IntegerSize <= DL.getPointerSizeInBits()) {
std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
if (BaseAndOffset.first)
@@ -523,14 +564,10 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
bool CallAnalyzer::visitCastInst(CastInst &I) {
// Propagate constants through ptrtoint.
- Constant *COp = dyn_cast<Constant>(I.getOperand(0));
- if (!COp)
- COp = SimplifiedValues.lookup(I.getOperand(0));
- if (COp)
- if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
- SimplifiedValues[&I] = C;
- return true;
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
+ }))
+ return true;
// Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
disableSROA(I.getOperand(0));
@@ -540,16 +577,10 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
Value *Operand = I.getOperand(0);
- Constant *COp = dyn_cast<Constant>(Operand);
- if (!COp)
- COp = SimplifiedValues.lookup(Operand);
- if (COp) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantFoldInstOperands(&I, COps[0], DL);
+ }))
+ return true;
// Disable any SROA on the argument to arbitrary unary operators.
disableSROA(Operand);
@@ -558,8 +589,7 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
}
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
- unsigned ArgNo = A->getArgNo();
- return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
+ return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
@@ -610,6 +640,26 @@ bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
return true;
}
+bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
+  // If a global profile summary is available, the callsite's coldness is
+  // determined from it.
+ if (PSI->hasProfileSummary())
+ return PSI->isColdCallSite(CS, CallerBFI);
+ if (!CallerBFI)
+ return false;
+
+  // In the absence of a global profile summary, determine whether the callsite
+  // is cold relative to the caller's entry frequency. We could cache the scaled
+  // entry frequency, but the added complexity is not worth it unless this
+  // scaling shows up high in the profiles.
+ const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
+ auto CallSiteBB = CS.getInstruction()->getParent();
+ auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
+ auto CallerEntryFreq =
+ CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+ return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
+
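Worked numbers for the test above, with assumed frequencies and ignoring BranchProbability's fixed-point rounding: at the default cold-callsite-rel-freq of 2, a callsite is cold when its block frequency falls below 2% of the caller's entry frequency.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t CallerEntryFreq = 1000; // assumed entry block frequency
  const uint64_t RelFreqPercent = 2;     // ColdCallSiteRelFreq default
  const uint64_t Cutoff = CallerEntryFreq * RelFreqPercent / 100; // = 20
  assert(15 < Cutoff);    // a callsite with frequency 15 is treated as cold
  assert(!(50 < Cutoff)); // one with frequency 50 is not
}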
void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
// If no size growth is allowed for this inlining, set Threshold to 0.
if (!allowSizeGrowth(CS)) {
@@ -642,17 +692,34 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
if (Callee.hasFnAttribute(Attribute::InlineHint))
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
if (PSI) {
- uint64_t TotalWeight;
- if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
- PSI->isHotCount(TotalWeight)) {
- Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
- } else if (PSI->isFunctionEntryHot(&Callee)) {
- // If callsite hotness can not be determined, we may still know
- // that the callee is hot and treat it as a weaker hint for threshold
- // increase.
- Threshold = MaxIfValid(Threshold, Params.HintThreshold);
- } else if (PSI->isFunctionEntryCold(&Callee)) {
- Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+ BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
+ // FIXME: After switching to the new passmanager, simplify the logic below
+ // by checking only the callsite hotness/coldness. The check for CallerBFI
+ // exists only because we do not have BFI available with the old PM.
+ //
+    // Use the callee's hotness information only if we have no way of determining
+    // the callsite's hotness. Callsite hotness can be determined if a sample
+    // profile is used (which adds hotness metadata to calls) or if the caller's
+    // BlockFrequencyInfo is available.
+ if (CallerBFI || PSI->hasSampleProfile()) {
+ if (PSI->isHotCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Hot callsite.\n");
+ Threshold = Params.HotCallSiteThreshold.getValue();
+ } else if (isColdCallSite(CS, CallerBFI)) {
+ DEBUG(dbgs() << "Cold callsite.\n");
+ Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
+ }
+ } else {
+ if (PSI->isFunctionEntryHot(&Callee)) {
+ DEBUG(dbgs() << "Hot callee.\n");
+ // If callsite hotness can not be determined, we may still know
+ // that the callee is hot and treat it as a weaker hint for threshold
+ // increase.
+ Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+ } else if (PSI->isFunctionEntryCold(&Callee)) {
+ DEBUG(dbgs() << "Cold callee.\n");
+ Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+ }
}
}
}
@@ -665,20 +732,10 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
// First try to handle simplified comparisons.
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
- if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- if (Constant *C =
- ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
- SimplifiedValues[&I] = C;
- return true;
- }
- }
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
+ }))
+ return true;
if (I.getOpcode() == Instruction::FCmp)
return false;
@@ -756,24 +813,18 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (!isa<Constant>(LHS))
- if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
- LHS = SimpleLHS;
- if (!isa<Constant>(RHS))
- if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
- RHS = SimpleRHS;
- Value *SimpleV = nullptr;
- if (auto FI = dyn_cast<FPMathOperator>(&I))
- SimpleV =
- SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
- else
- SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+ auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
+ Value *SimpleV = nullptr;
+ if (auto FI = dyn_cast<FPMathOperator>(&I))
+ SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
+ FI->getFastMathFlags(), DL);
+ else
+ SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
+ return dyn_cast_or_null<Constant>(SimpleV);
+ };
- if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
- SimplifiedValues[&I] = C;
+ if (simplifyInstruction(I, Evaluate))
return true;
- }
// Disable any SROA on arguments to arbitrary, unsimplified binary operators.
disableSROA(LHS);
@@ -814,13 +865,10 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
// Constant folding for extract value is trivial.
- Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
- if (!C)
- C = SimplifiedValues.lookup(I.getAggregateOperand());
- if (C) {
- SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getExtractValue(COps[0], I.getIndices());
+ }))
return true;
- }
// SROA can look through these but give them a cost.
return false;
@@ -828,17 +876,12 @@ bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
// Constant folding for insert value is trivial.
- Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
- if (!AggC)
- AggC = SimplifiedValues.lookup(I.getAggregateOperand());
- Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
- if (!InsertedC)
- InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
- if (AggC && InsertedC) {
- SimplifiedValues[&I] =
- ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
+ /*InsertedValueOperand*/ COps[1],
+ I.getIndices());
+ }))
return true;
- }
// SROA can look through these but give them a cost.
return false;
@@ -855,7 +898,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
// because we have to continually rebuild the argument list even when no
// simplifications can be performed. Until that is fixed with remapping
// inside of instsimplify, directly constant fold calls here.
- if (!canConstantFoldCallTo(F))
+ if (!canConstantFoldCallTo(CS, F))
return false;
// Try to re-map the arguments to constants.
@@ -871,7 +914,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
ConstantArgs.push_back(C);
}
- if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
+ if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
SimplifiedValues[CS.getInstruction()] = C;
return true;
}
@@ -959,7 +1002,8 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
// out. Pretend to inline the function, with a custom threshold.
auto IndirectCallParams = Params;
IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
- CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams);
+ CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
+ IndirectCallParams);
if (CA.analyzeCall(CS)) {
// We were able to inline the indirect call! Subtract the cost from the
// threshold to get the bonus we want to apply, but don't go below zero.
@@ -995,22 +1039,74 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
if (isa<ConstantInt>(V))
return true;
- // Otherwise, we need to accumulate a cost proportional to the number of
- // distinct successor blocks. This fan-out in the CFG cannot be represented
- // for free even if we can represent the core switch as a jumptable that
- // takes a single instruction.
+ // Assume the most general case where the switch is lowered into
+ // either a jump table, bit test, or a balanced binary tree consisting of
+ // case clusters without merging adjacent clusters with the same
+ // destination. We do not consider the switches that are lowered with a mix
+ // of jump table/bit test/binary search tree. The cost of the switch is
+ // proportional to the size of the tree or the size of jump table range.
//
// NB: We convert large switches which are just used to initialize large phi
// nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
// inlining those. It will prevent inlining in cases where the optimization
// does not (yet) fire.
- SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
- SuccessorBlocks.insert(SI.getDefaultDest());
- for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
- SuccessorBlocks.insert(I.getCaseSuccessor());
- // Add cost corresponding to the number of distinct destinations. The first
- // we model as free because of fallthrough.
- Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
+
+  // Upper bound on the cost that this function is allowed to accumulate.
+ int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
+
+ // Exit early for a large switch, assuming one case needs at least one
+ // instruction.
+ // FIXME: This is not true for a bit test, but ignore such case for now to
+ // save compile-time.
+ int64_t CostLowerBound =
+ std::min((int64_t)CostUpperBound,
+ (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
+
+ if (CostLowerBound > Threshold) {
+ Cost = CostLowerBound;
+ return false;
+ }
+
+ unsigned JumpTableSize = 0;
+ unsigned NumCaseCluster =
+ TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
+
+ // If suitable for a jump table, consider the cost for the table size and
+ // branch to destination.
+ if (JumpTableSize) {
+ int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
+ 4 * InlineConstants::InstrCost;
+
+ Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
+ return false;
+ }
+
+  // Considering forming a binary search, we should find the number of nodes
+  // in the tree, which is the same as the number of comparisons when lowered.
+  // For a given number of clusters, n, we can define a recursive function,
+  // f(n), to find the number of nodes. The recursion is:
+  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
+  //   f(n) = n,                       when n <= 3.
+  // This leads to a binary tree where each leaf is either f(2) or f(3) when
+  // n > 3. So the number of comparisons from leaves is n, while the number
+  // from non-leaf nodes is:
+  //   2^(log2(n) - 1) - 1
+  //     = 2^log2(n) * 2^-1 - 1
+  //     = n / 2 - 1.
+  // Combining comparisons from leaf and non-leaf nodes gives the simple
+  // closed form:
+  //   n + n / 2 - 1 = n * 3 / 2 - 1
+ if (NumCaseCluster <= 3) {
+ // Suppose a comparison includes one compare and one conditional branch.
+ Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
+ return false;
+ }
+
+ int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
+ int64_t SwitchCost =
+ ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
+
+ Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
return false;
}
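A condensed sketch of the cost model above, pulling the three outcomes (jump table, tiny cluster count, binary tree) into one function. InstrCost mirrors InlineConstants::InstrCost, which is 5 at the time of this import; the table constant 4 and the 3n/2 - 1 comparison count come straight from the code.

#include <algorithm>
#include <climits>
#include <cstdint>

static int64_t switchCost(unsigned NumCaseCluster, unsigned JumpTableSize,
                          int64_t Cost) {
  const int64_t InstrCost = 5; // InlineConstants::InstrCost
  const int64_t UpperBound = INT_MAX - InstrCost - 1;
  if (JumpTableSize) // lowered as a jump table: table size plus 4 for branch
    return std::min(UpperBound,
                    (int64_t)JumpTableSize * InstrCost + 4 * InstrCost + Cost);
  if (NumCaseCluster <= 3) // n clusters: n compares and n conditional branches
    return Cost + NumCaseCluster * 2 * InstrCost;
  int64_t Compares = 3 * (int64_t)NumCaseCluster / 2 - 1; // binary tree
  return std::min(UpperBound, Compares * 2 * InstrCost + Cost);
}

int main() {
  // 8 clusters, no jump table: (3*8/2 - 1) = 11 compares, 110 cost units.
  return switchCost(8, 0, 0) == 110 ? 0 : 1;
}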
@@ -1098,19 +1194,10 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
// is expensive or the function has the "use-soft-float" attribute, this may
// eventually become a library call. Treat the cost as such.
if (I->getType()->isFloatingPointTy()) {
- bool hasSoftFloatAttr = false;
-
// If the function has the "use-soft-float" attribute, mark it as
// expensive.
- if (F.hasFnAttribute("use-soft-float")) {
- Attribute Attr = F.getFnAttribute("use-soft-float");
- StringRef Val = Attr.getValueAsString();
- if (Val == "true")
- hasSoftFloatAttr = true;
- }
-
if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
- hasSoftFloatAttr)
+ (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
Cost += InlineConstants::CallPenalty;
}
@@ -1155,7 +1242,6 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
if (!V->getType()->isPointerTy())
return nullptr;
- const DataLayout &DL = F.getParent()->getDataLayout();
unsigned IntPtrWidth = DL.getPointerSizeInBits();
APInt Offset = APInt::getNullValue(IntPtrWidth);
@@ -1212,7 +1298,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
FiftyPercentVectorBonus = 3 * Threshold / 2;
TenPercentVectorBonus = 3 * Threshold / 4;
- const DataLayout &DL = F.getParent()->getDataLayout();
// Track whether the post-inlining function would have more than one basic
// block. A single basic block is often intended for inlining. Balloon the
@@ -1225,36 +1310,10 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// the rest of the function body.
Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
- // Give out bonuses per argument, as the instructions setting them up will
- // be gone after inlining.
- for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
- if (CS.isByValArgument(I)) {
- // We approximate the number of loads and stores needed by dividing the
- // size of the byval type by the target's pointer size.
- PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
- unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
- unsigned PointerSize = DL.getPointerSizeInBits();
- // Ceiling division.
- unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+ // Give out bonuses for the callsite, as the instructions setting them up
+ // will be gone after inlining.
+ Cost -= getCallsiteCost(CS, DL);
- // If it generates more than 8 stores it is likely to be expanded as an
- // inline memcpy so we take that as an upper bound. Otherwise we assume
- // one load and one store per word copied.
- // FIXME: The maxStoresPerMemcpy setting from the target should be used
- // here instead of a magic number of 8, but it's not available via
- // DataLayout.
- NumStores = std::min(NumStores, 8U);
-
- Cost -= 2 * NumStores * InlineConstants::InstrCost;
- } else {
- // For non-byval arguments subtract off one instruction per call
- // argument.
- Cost -= InlineConstants::InstrCost;
- }
- }
- // The call instruction also disappears after inlining.
- Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty;
-
// If there is only one call of the function, and it has internal linkage,
// the cost of inlining it drops dramatically.
bool OnlyOneCallAndLocalLinkage =
@@ -1371,7 +1430,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
Value *Cond = SI->getCondition();
if (ConstantInt *SimpleCond =
dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
- BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
+ BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
continue;
}
}
@@ -1430,13 +1489,6 @@ LLVM_DUMP_METHOD void CallAnalyzer::dump() {
}
#endif
-/// \brief Test that two functions either have or have not the given attribute
-/// at the same time.
-template <typename AttrKind>
-static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
- return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
-}
-
/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
@@ -1446,18 +1498,52 @@ static bool functionsHaveCompatibleAttributes(Function *Caller,
AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
+int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
+ int Cost = 0;
+ for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
+ if (CS.isByValArgument(I)) {
+ // We approximate the number of loads and stores needed by dividing the
+ // size of the byval type by the target's pointer size.
+ PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
+ unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
+ unsigned PointerSize = DL.getPointerSizeInBits();
+ // Ceiling division.
+ unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
+
+ // If it generates more than 8 stores it is likely to be expanded as an
+ // inline memcpy so we take that as an upper bound. Otherwise we assume
+ // one load and one store per word copied.
+ // FIXME: The maxStoresPerMemcpy setting from the target should be used
+ // here instead of a magic number of 8, but it's not available via
+ // DataLayout.
+ NumStores = std::min(NumStores, 8U);
+
+ Cost += 2 * NumStores * InlineConstants::InstrCost;
+ } else {
+      // For non-byval arguments add one instruction per call argument, since
+      // the instructions setting them up will be gone after inlining.
+ Cost += InlineConstants::InstrCost;
+ }
+ }
+ // The call instruction also disappears after inlining.
+ Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
+ return Cost;
+}
+
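Worked numbers for the byval branch of getCallsiteCost, with assumed sizes: a 1600-bit byval struct on a target with 64-bit pointers needs ceil(1600/64) = 25 word copies, capped at 8 as the inline-memcpy upper bound.

#include <algorithm>
#include <cassert>

int main() {
  const unsigned InstrCost = 5; // InlineConstants::InstrCost at this import
  unsigned TypeSize = 1600, PointerSize = 64;                      // assumed
  unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; // = 25
  NumStores = std::min(NumStores, 8U);                             // memcpy cap
  assert(2 * NumStores * InstrCost == 80); // one load + one store per word
}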
InlineCost llvm::getInlineCost(
CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {
return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
- GetAssumptionCache, PSI);
+ GetAssumptionCache, GetBFI, PSI);
}
InlineCost llvm::getInlineCost(
CallSite CS, Function *Callee, const InlineParams &Params,
TargetTransformInfo &CalleeTTI,
std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
ProfileSummaryInfo *PSI) {
// Cannot inline indirect calls.
@@ -1492,7 +1578,8 @@ InlineCost llvm::getInlineCost(
DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
<< "...\n");
- CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params);
+ CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
+ Params);
bool ShouldInline = CA.analyzeCall(CS);
DEBUG(CA.dump());
@@ -1565,7 +1652,9 @@ InlineParams llvm::getInlineParams(int Threshold) {
// Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
Params.HotCallSiteThreshold = HotCallSiteThreshold;
- // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
+ // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
+ Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
+
// Set the OptMinSizeThreshold and OptSizeThreshold params only if the
// -inlinehint-threshold commandline option is not explicitly given. If that
// option is present, then its value applies even for callees with size and
diff --git a/contrib/llvm/lib/Analysis/InstCount.cpp b/contrib/llvm/lib/Analysis/InstCount.cpp
index de2b9c0..95ab6ee 100644
--- a/contrib/llvm/lib/Analysis/InstCount.cpp
+++ b/contrib/llvm/lib/Analysis/InstCount.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/Passes.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Pass.h"
@@ -26,14 +26,12 @@ using namespace llvm;
STATISTIC(TotalInsts , "Number of instructions (of all types)");
STATISTIC(TotalBlocks, "Number of basic blocks");
STATISTIC(TotalFuncs , "Number of non-external functions");
-STATISTIC(TotalMemInst, "Number of memory instructions");
#define HANDLE_INST(N, OPCODE, CLASS) \
STATISTIC(Num ## OPCODE ## Inst, "Number of " #OPCODE " insts");
#include "llvm/IR/Instruction.def"
-
namespace {
class InstCount : public FunctionPass, public InstVisitor<InstCount> {
friend class InstVisitor<InstCount>;
@@ -76,13 +74,6 @@ FunctionPass *llvm::createInstCountPass() { return new InstCount(); }
// function.
//
bool InstCount::runOnFunction(Function &F) {
- unsigned StartMemInsts =
- NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
- NumInvokeInst + NumAllocaInst;
visit(F);
- unsigned EndMemInsts =
- NumGetElementPtrInst + NumLoadInst + NumStoreInst + NumCallInst +
- NumInvokeInst + NumAllocaInst;
- TotalMemInst += EndMemInsts-StartMemInsts;
return false;
}
diff --git a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
index 796e6e4..b4f3b87 100644
--- a/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/contrib/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -21,9 +21,12 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/ConstantRange.h"
@@ -34,6 +37,7 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/KnownBits.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -45,49 +49,30 @@ enum { RecursionLimit = 3 };
STATISTIC(NumExpand, "Number of expansions");
STATISTIC(NumReassoc, "Number of reassociations");
-namespace {
-struct Query {
- const DataLayout &DL;
- const TargetLibraryInfo *TLI;
- const DominatorTree *DT;
- AssumptionCache *AC;
- const Instruction *CxtI;
-
- Query(const DataLayout &DL, const TargetLibraryInfo *tli,
- const DominatorTree *dt, AssumptionCache *ac = nullptr,
- const Instruction *cxti = nullptr)
- : DL(DL), TLI(tli), DT(dt), AC(ac), CxtI(cxti) {}
-};
-} // end anonymous namespace
-
-static Value *SimplifyAndInst(Value *, Value *, const Query &, unsigned);
-static Value *SimplifyBinOp(unsigned, Value *, Value *, const Query &,
+static Value *SimplifyAndInst(Value *, Value *, const SimplifyQuery &, unsigned);
+static Value *SimplifyBinOp(unsigned, Value *, Value *, const SimplifyQuery &,
unsigned);
static Value *SimplifyFPBinOp(unsigned, Value *, Value *, const FastMathFlags &,
- const Query &, unsigned);
-static Value *SimplifyCmpInst(unsigned, Value *, Value *, const Query &,
+ const SimplifyQuery &, unsigned);
+static Value *SimplifyCmpInst(unsigned, Value *, Value *, const SimplifyQuery &,
unsigned);
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse);
-static Value *SimplifyOrInst(Value *, Value *, const Query &, unsigned);
-static Value *SimplifyXorInst(Value *, Value *, const Query &, unsigned);
+ const SimplifyQuery &Q, unsigned MaxRecurse);
+static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned);
+static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyCastInst(unsigned, Value *, Type *,
- const Query &, unsigned);
+ const SimplifyQuery &, unsigned);
-/// For a boolean type, or a vector of boolean type, return false, or
-/// a vector with every element false, as appropriate for the type.
+/// For a boolean type or a vector of boolean type, return false or a vector
+/// with every element false.
static Constant *getFalse(Type *Ty) {
- assert(Ty->getScalarType()->isIntegerTy(1) &&
- "Expected i1 type or a vector of i1!");
- return Constant::getNullValue(Ty);
+ return ConstantInt::getFalse(Ty);
}
-/// For a boolean type, or a vector of boolean type, return true, or
-/// a vector with every element true, as appropriate for the type.
+/// For a boolean type or a vector of boolean type, return true or a vector
+/// with every element true.
static Constant *getTrue(Type *Ty) {
- assert(Ty->getScalarType()->isIntegerTy(1) &&
- "Expected i1 type or a vector of i1!");
- return Constant::getAllOnesValue(Ty);
+ return ConstantInt::getTrue(Ty);
}
/// isSameCompare - Is V equivalent to the comparison "LHS Pred RHS"?
@@ -118,13 +103,8 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
return false;
// If we have a DominatorTree then do a precise test.
- if (DT) {
- if (!DT->isReachableFromEntry(P->getParent()))
- return true;
- if (!DT->isReachableFromEntry(I->getParent()))
- return false;
+ if (DT)
return DT->dominates(I, P);
- }
// Otherwise, if the instruction is in the entry block and is not an invoke,
// then it obviously dominates all phi nodes.
@@ -140,10 +120,9 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
/// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS.
/// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)".
/// Returns the simplified value, or null if no simplification was performed.
-static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- unsigned OpcToExpand, const Query &Q,
- unsigned MaxRecurse) {
- Instruction::BinaryOps OpcodeToExpand = (Instruction::BinaryOps)OpcToExpand;
+static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
+ Instruction::BinaryOps OpcodeToExpand,
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
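
// Editor's illustrative sketch (plain standalone C++, not part of the LLVM
// source): ExpandBinOp relies on distributive identities of the form
// "A op (B op' C)" == "(A op B) op' (A op C)". This brute-forces one such
// identity, 'or' distributing over 'and', across all 8-bit values. The
// function name is invented for illustration.
#include <cassert>
#include <cstdint>

static void checkOrDistributesOverAnd() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b)
      for (unsigned c = 0; c < 256; ++c)
        assert((uint8_t)(a | (b & c)) == (uint8_t)((a | b) & (a | c)));
}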
@@ -199,9 +178,10 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
/// Generic simplifications for associative binary operations.
/// Returns the simpler value, or null if none was found.
-static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
- Instruction::BinaryOps Opcode = (Instruction::BinaryOps)Opc;
+static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode,
+ Value *LHS, Value *RHS,
+ const SimplifyQuery &Q,
+ unsigned MaxRecurse) {
assert(Instruction::isAssociative(Opcode) && "Not an associative operation!");
// Recursion is always used, so bail out at once if we already hit the limit.
@@ -298,8 +278,9 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
/// try to simplify the binop by seeing whether evaluating it on both branches
/// of the select results in the same value. Returns the common value if so,
/// otherwise returns null.
-static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS,
+ Value *RHS, const SimplifyQuery &Q,
+ unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
@@ -370,7 +351,7 @@ static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS,
/// comparison by seeing whether both branches of the select result in the same
/// value. Returns the common value if so, otherwise returns null.
static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS,
- Value *RHS, const Query &Q,
+ Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
@@ -451,8 +432,9 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS,
/// try to simplify the binop by seeing whether evaluating it on the incoming
/// phi values yields the same result for every value. If so returns the common
/// value, otherwise returns null.
-static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS,
+ Value *RHS, const SimplifyQuery &Q,
+ unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
@@ -494,7 +476,7 @@ static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS,
/// yields the same result every time. If so returns the common result,
/// otherwise returns null.
static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
// Recursion is always used, so bail out at once if we already hit the limit.
if (!MaxRecurse--)
return nullptr;
@@ -527,17 +509,26 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
return CommonValue;
}
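
// Editor's illustrative sketch (plain standalone C++, not LLVM code) of the
// idea behind the Thread*OverSelect/Thread*OverPHI helpers above: when an
// operation yields the same value for every selected (or incoming) operand,
// the select or phi can be bypassed entirely. Here both arms of the "select"
// give 0 under "& 7", so the whole expression folds to 0.
#include <cassert>
#include <cstdint>

static void checkThreadOverSelect(bool Cond) {
  uint8_t Sel = Cond ? 8 : 24; // either arm of the "select"
  assert((Sel & 7) == 0);      // 8 & 7 == 0 and 24 & 7 == 0 on both arms
}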
+static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode,
+ Value *&Op0, Value *&Op1,
+ const SimplifyQuery &Q) {
+ if (auto *CLHS = dyn_cast<Constant>(Op0)) {
+ if (auto *CRHS = dyn_cast<Constant>(Op1))
+ return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
+
+ // Canonicalize the constant to the RHS if this is a commutative operation.
+ if (Instruction::isCommutative(Opcode))
+ std::swap(Op0, Op1);
+ }
+ return nullptr;
+}
+
/// Given operands for an Add, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Add, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
+ return C;
// X + undef -> undef
if (match(Op1, m_Undef()))
@@ -556,12 +547,20 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
return Y;
// X + ~X -> -1 since ~X = -X-1
+ Type *Ty = Op0->getType();
if (match(Op0, m_Not(m_Specific(Op1))) ||
match(Op1, m_Not(m_Specific(Op0))))
- return Constant::getAllOnesValue(Op0->getType());
+ return Constant::getAllOnesValue(Ty);
+
+ // add nsw/nuw (xor Y, signmask), signmask --> Y
+ // The no-wrapping add guarantees that the top bit will be set by the add.
+ // Therefore, the xor must be clearing the already set sign bit of Y.
+ if ((isNSW || isNUW) && match(Op1, m_SignMask()) &&
+ match(Op0, m_Xor(m_Value(Y), m_SignMask())))
+ return Y;
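
// Editor's illustrative sketch (plain standalone C++, not LLVM code): in
// wrap-around arithmetic, adding the sign mask equals xor'ing it, so
// (Y ^ signmask) + signmask always gives back Y modulo 2^n. The nsw/nuw
// flags required by the fold above address IR wrapping/poison semantics,
// which have no C++ analogue here.
#include <cassert>
#include <cstdint>

static void checkSignMaskAddXor() {
  const uint8_t SignMask = 0x80;
  for (unsigned y = 0; y < 256; ++y)
    assert((uint8_t)(((uint8_t)y ^ SignMask) + SignMask) == (uint8_t)y);
}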
/// i1 add -> xor.
- if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+ if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
return V;
@@ -583,11 +582,8 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
}
Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Query) {
+ return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query, RecursionLimit);
}
/// \brief Compute the base pointer and cumulative constant offsets for V.
@@ -602,7 +598,7 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
/// folding.
static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
bool AllowNonInbounds = false) {
- assert(V->getType()->getScalarType()->isPointerTy());
+ assert(V->getType()->isPtrOrPtrVectorTy());
Type *IntPtrTy = DL.getIntPtrType(V->getType())->getScalarType();
APInt Offset = APInt::getNullValue(IntPtrTy->getIntegerBitWidth());
@@ -631,8 +627,7 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
}
break;
}
- assert(V->getType()->getScalarType()->isPointerTy() &&
- "Unexpected operand type!");
+ assert(V->getType()->isPtrOrPtrVectorTy() && "Unexpected operand type!");
} while (Visited.insert(V).second);
Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset);
@@ -664,10 +659,9 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS,
/// Given operands for a Sub, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0))
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Sub, CLHS, CRHS, Q.DL);
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q))
+ return C;
// X - undef -> undef
// undef - X -> undef
@@ -688,11 +682,8 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
if (isNUW)
return Op0;
- unsigned BitWidth = Op1->getType()->getScalarSizeInBits();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(Op1, KnownZero, KnownOne, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
- if (KnownZero == ~APInt::getSignBit(BitWidth)) {
+ KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (Known.Zero.isMaxSignedValue()) {
// Op1 is either 0 or the minimum signed value. If the sub is NSW, then
// Op1 must be 0 because negating the minimum signed value is undefined.
if (isNSW)
@@ -779,7 +770,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
return ConstantExpr::getIntegerCast(Result, Op0->getType(), true);
// i1 sub -> xor.
- if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+ if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
return V;
@@ -796,24 +787,16 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
}
Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit);
}
/// Given operands for an FAdd, see if we can fold the result. If not, this
/// returns null.
static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::FAdd, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q))
+ return C;
// fadd X, -0 ==> X
if (match(Op1, m_NegZero()))
@@ -845,11 +828,9 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
/// Given operands for an FSub, see if we can fold the result. If not, this
/// returns null.
static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::FSub, CLHS, CRHS, Q.DL);
- }
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q))
+ return C;
// fsub X, 0 ==> X
if (match(Op1, m_Zero()))
@@ -878,40 +859,28 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
}
/// Given the operands for an FMul, see if we can fold the result
-static Value *SimplifyFMulInst(Value *Op0, Value *Op1,
- FastMathFlags FMF,
- const Query &Q,
- unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::FMul, CLHS, CRHS, Q.DL);
+static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q))
+ return C;
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
-
- // fmul X, 1.0 ==> X
- if (match(Op1, m_FPOne()))
- return Op0;
+ // fmul X, 1.0 ==> X
+ if (match(Op1, m_FPOne()))
+ return Op0;
- // fmul nnan nsz X, 0 ==> 0
- if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
- return Op1;
+ // fmul nnan nsz X, 0 ==> 0
+ if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
+ return Op1;
- return nullptr;
+ return nullptr;
}
/// Given operands for a Mul, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Mul, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q))
+ return C;
// X * undef -> 0
if (match(Op1, m_Undef()))
@@ -932,7 +901,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
return X;
// i1 mul -> and.
- if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+ if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1))
return V;
@@ -964,77 +933,87 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
}
Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyFAddInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyFAddInst(Op0, Op1, FMF, Q, RecursionLimit);
}
+
Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyFSubInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyFSubInst(Op0, Op1, FMF, Q, RecursionLimit);
}
Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyFMulInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyFMulInst(Op0, Op1, FMF, Q, RecursionLimit);
}
-Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyMulInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifyMulInst(Op0, Op1, Q, RecursionLimit);
}
-/// Given operands for an SDiv or UDiv, see if we can fold the result.
-/// If not, this returns null.
-static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0))
- if (Constant *C1 = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
-
- bool isSigned = Opcode == Instruction::SDiv;
+/// Check for common or similar folds of integer division or integer remainder.
+static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
+ Type *Ty = Op0->getType();
// X / undef -> undef
+ // X % undef -> undef
if (match(Op1, m_Undef()))
return Op1;
- // X / 0 -> undef, we don't need to preserve faults!
+ // X / 0 -> undef
+ // X % 0 -> undef
+ // We don't need to preserve faults!
if (match(Op1, m_Zero()))
- return UndefValue::get(Op1->getType());
+ return UndefValue::get(Ty);
+
+ // If any element of a constant divisor vector is zero, the whole op is undef.
+ auto *Op1C = dyn_cast<Constant>(Op1);
+ if (Op1C && Ty->isVectorTy()) {
+ unsigned NumElts = Ty->getVectorNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = Op1C->getAggregateElement(i);
+ if (Elt && Elt->isNullValue())
+ return UndefValue::get(Ty);
+ }
+ }
// undef / X -> 0
+ // undef % X -> 0
if (match(Op0, m_Undef()))
- return Constant::getNullValue(Op0->getType());
+ return Constant::getNullValue(Ty);
- // 0 / X -> 0, we don't need to preserve faults!
+ // 0 / X -> 0
+ // 0 % X -> 0
if (match(Op0, m_Zero()))
return Op0;
+ // X / X -> 1
+ // X % X -> 0
+ if (Op0 == Op1)
+ return IsDiv ? ConstantInt::get(Ty, 1) : Constant::getNullValue(Ty);
+
// X / 1 -> X
- if (match(Op1, m_One()))
- return Op0;
+ // X % 1 -> 0
+ // If this is a boolean op (single-bit element type), we can't have
+ // division-by-zero or remainder-by-zero, so assume the divisor is 1.
+ if (match(Op1, m_One()) || Ty->isIntOrIntVectorTy(1))
+ return IsDiv ? Op0 : Constant::getNullValue(Ty);
- if (Op0->getType()->isIntegerTy(1))
- // It can't be division by zero, hence it must be division by one.
- return Op0;
+ return nullptr;
+}
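
// Editor's illustrative sketch (plain standalone C++, not LLVM code) of the
// plain arithmetic identities behind simplifyDivRem; the undef-producing
// cases (X/0, X/undef, zero elements in a vector divisor) have no C++
// analogue and are omitted.
#include <cassert>

static void checkDivRemIdentities() {
  for (unsigned x = 1; x < 256; ++x) { // x != 0 for the X/X rows
    assert(x / x == 1 && x % x == 0);  // X / X -> 1,  X % X -> 0
    assert(x / 1 == x && x % 1 == 0);  // X / 1 -> X,  X % 1 -> 0
  }
}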
- // X / X -> 1
- if (Op0 == Op1)
- return ConstantInt::get(Op0->getType(), 1);
+/// Given operands for an SDiv or UDiv, see if we can fold the result.
+/// If not, this returns null.
+static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+ return C;
+
+ if (Value *V = simplifyDivRem(Op0, Op1, true))
+ return V;
+
+ bool isSigned = Opcode == Instruction::SDiv;
// (X * Y) / Y -> X if the multiplication does not overflow.
Value *X = nullptr, *Y = nullptr;
@@ -1061,7 +1040,7 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
if (!isSigned && match(Op0, m_UDiv(m_Value(X), m_ConstantInt(C1))) &&
match(Op1, m_ConstantInt(C2))) {
bool Overflow;
- C1->getValue().umul_ov(C2->getValue(), Overflow);
+ (void)C1->getValue().umul_ov(C2->getValue(), Overflow);
if (Overflow)
return Constant::getNullValue(Op0->getType());
}
@@ -1083,7 +1062,7 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
/// Given operands for an SDiv, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Value *V = SimplifyDiv(Instruction::SDiv, Op0, Op1, Q, MaxRecurse))
return V;
@@ -1091,17 +1070,13 @@ static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q,
return nullptr;
}
-Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifySDivInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifySDivInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for a UDiv, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse))
return V;
@@ -1119,16 +1094,15 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q,
return nullptr;
}
-Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyUDivInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifyUDivInst(Op0, Op1, Q, RecursionLimit);
}
static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const Query &Q, unsigned) {
+ const SimplifyQuery &Q, unsigned) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q))
+ return C;
+
// undef / X -> undef (the undef could be a snan).
if (match(Op0, m_Undef()))
return Op0;
@@ -1166,49 +1140,19 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
}
Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyFDivInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyFDivInst(Op0, Op1, FMF, Q, RecursionLimit);
}
/// Given operands for an SRem or URem, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0))
- if (Constant *C1 = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
-
- // X % undef -> undef
- if (match(Op1, m_Undef()))
- return Op1;
-
- // undef % X -> 0
- if (match(Op0, m_Undef()))
- return Constant::getNullValue(Op0->getType());
-
- // 0 % X -> 0, we don't need to preserve faults!
- if (match(Op0, m_Zero()))
- return Op0;
-
- // X % 0 -> undef, we don't need to preserve faults!
- if (match(Op1, m_Zero()))
- return UndefValue::get(Op0->getType());
-
- // X % 1 -> 0
- if (match(Op1, m_One()))
- return Constant::getNullValue(Op0->getType());
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+ return C;
- if (Op0->getType()->isIntegerTy(1))
- // It can't be remainder by zero, hence it must be remainder by one.
- return Constant::getNullValue(Op0->getType());
-
- // X % X -> 0
- if (Op0 == Op1)
- return Constant::getNullValue(Op0->getType());
+ if (Value *V = simplifyDivRem(Op0, Op1, false))
+ return V;
// (X % Y) % Y -> X % Y
if ((Opcode == Instruction::SRem &&
@@ -1234,7 +1178,7 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
/// Given operands for an SRem, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Value *V = SimplifyRem(Instruction::SRem, Op0, Op1, Q, MaxRecurse))
return V;
@@ -1242,17 +1186,13 @@ static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q,
return nullptr;
}
-Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifySRemInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifySRemInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for a URem, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse))
return V;
@@ -1270,16 +1210,15 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q,
return nullptr;
}
-Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyURemInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifyURemInst(Op0, Op1, Q, RecursionLimit);
}
static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const Query &, unsigned) {
+ const SimplifyQuery &Q, unsigned) {
+ if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q))
+ return C;
+
// undef % X -> undef (the undef could be a snan).
if (match(Op0, m_Undef()))
return Op0;
@@ -1298,12 +1237,8 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
}
Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyFRemInst(Op0, Op1, FMF, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyFRemInst(Op0, Op1, FMF, Q, RecursionLimit);
}
/// Returns true if a shift by \c Amount always yields undef.
@@ -1335,11 +1270,10 @@ static bool isUndefShift(Value *Amount) {
/// Given operands for an Shl, LShr or AShr, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
- const Query &Q, unsigned MaxRecurse) {
- if (Constant *C0 = dyn_cast<Constant>(Op0))
- if (Constant *C1 = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
+static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
+                            Value *Op1, const SimplifyQuery &Q,
+                            unsigned MaxRecurse) {
+ if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+ return C;
// 0 shift by X -> 0
if (match(Op0, m_Zero()))
@@ -1367,18 +1301,14 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
// If any bits in the shift amount make that value greater than or equal to
// the number of bits in the type, the shift is undefined.
- unsigned BitWidth = Op1->getType()->getScalarSizeInBits();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(Op1, KnownZero, KnownOne, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
- if (KnownOne.getLimitedValue() >= BitWidth)
+ KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (Known.One.getLimitedValue() >= Known.getBitWidth())
return UndefValue::get(Op0->getType());
// If all valid bits in the shift amount are known zero, the first operand is
// unchanged.
- unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth);
- APInt ShiftAmountMask = APInt::getLowBitsSet(BitWidth, NumValidShiftBits);
- if ((KnownZero & ShiftAmountMask) == ShiftAmountMask)
+ unsigned NumValidShiftBits = Log2_32_Ceil(Known.getBitWidth());
+ if (Known.countMinTrailingZeros() >= NumValidShiftBits)
return Op0;
return nullptr;
@@ -1386,8 +1316,8 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
/// \brief Given operands for an Shl, LShr or AShr, see if we can
/// fold the result. If not, this returns null.
-static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1,
- bool isExact, const Query &Q,
+static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0,
+ Value *Op1, bool isExact, const SimplifyQuery &Q,
unsigned MaxRecurse) {
if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse))
return V;
@@ -1403,12 +1333,8 @@ static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1,
// The low bit cannot be shifted out of an exact shift if it is set.
if (isExact) {
- unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
- APInt Op0KnownZero(BitWidth, 0);
- APInt Op0KnownOne(BitWidth, 0);
- computeKnownBits(Op0, Op0KnownZero, Op0KnownOne, Q.DL, /*Depth=*/0, Q.AC,
- Q.CxtI, Q.DT);
- if (Op0KnownOne[0])
+    KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC,
+                                          Q.CxtI, Q.DT);
+ if (Op0Known.One[0])
return Op0;
}
@@ -1418,7 +1344,7 @@ static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1,
/// Given operands for an Shl, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Value *V = SimplifyShift(Instruction::Shl, Op0, Op1, Q, MaxRecurse))
return V;
@@ -1435,17 +1361,14 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
}
Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Q, RecursionLimit);
}
/// Given operands for an LShr, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Value *V = SimplifyRightShift(Instruction::LShr, Op0, Op1, isExact, Q,
MaxRecurse))
return V;
@@ -1459,18 +1382,14 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
}
Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyLShrInst(Op0, Op1, isExact, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyLShrInst(Op0, Op1, isExact, Q, RecursionLimit);
}
/// Given operands for an AShr, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
if (Value *V = SimplifyRightShift(Instruction::AShr, Op0, Op1, isExact, Q,
MaxRecurse))
return V;
@@ -1493,14 +1412,12 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
}
Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyAShrInst(Op0, Op1, isExact, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit);
}
+/// Commuted variants are assumed to be handled by calling this function again
+/// with the parameters swapped.
static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
ICmpInst *UnsignedICmp, bool IsAnd) {
Value *X, *Y;
@@ -1569,29 +1486,75 @@ static Value *simplifyAndOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
/// Commuted variants are assumed to be handled by calling this function again
/// with the parameters swapped.
-static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
- if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true))
- return X;
+static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
+ ICmpInst::Predicate Pred0, Pred1;
+  Value *A, *B;
+ if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) ||
+ !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
+ return nullptr;
- if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1))
- return X;
+ // We have (icmp Pred0, A, B) | (icmp Pred1, A, B).
+ // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
+ // can eliminate Op0 from this 'or'.
+ if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
+ return Op1;
+
+ // Check for any combination of predicates that cover the entire range of
+ // possibilities.
+ if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
+ (Pred0 == ICmpInst::ICMP_NE && ICmpInst::isTrueWhenEqual(Pred1)) ||
+ (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGE) ||
+ (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGE))
+ return getTrue(Op0->getType());
+
+ return nullptr;
+}
+
+/// Test if a pair of compares with a shared operand and 2 constants has an
+/// empty set intersection, full set union, or if one compare is a superset of
+/// the other.
+static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ bool IsAnd) {
+  // Look for this pattern: {and/or} (icmp X, C0), (icmp X, C1).
+ if (Cmp0->getOperand(0) != Cmp1->getOperand(0))
+ return nullptr;
- // Look for this pattern: (icmp V, C0) & (icmp V, C1)).
- Type *ITy = Op0->getType();
- ICmpInst::Predicate Pred0, Pred1;
const APInt *C0, *C1;
- Value *V;
- if (match(Op0, m_ICmp(Pred0, m_Value(V), m_APInt(C0))) &&
- match(Op1, m_ICmp(Pred1, m_Specific(V), m_APInt(C1)))) {
- // Make a constant range that's the intersection of the two icmp ranges.
- // If the intersection is empty, we know that the result is false.
- auto Range0 = ConstantRange::makeAllowedICmpRegion(Pred0, *C0);
- auto Range1 = ConstantRange::makeAllowedICmpRegion(Pred1, *C1);
- if (Range0.intersectWith(Range1).isEmptySet())
- return getFalse(ITy);
- }
+ if (!match(Cmp0->getOperand(1), m_APInt(C0)) ||
+ !match(Cmp1->getOperand(1), m_APInt(C1)))
+ return nullptr;
+
+ auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0);
+ auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1);
+
+ // For and-of-compares, check if the intersection is empty:
+ // (icmp X, C0) && (icmp X, C1) --> empty set --> false
+ if (IsAnd && Range0.intersectWith(Range1).isEmptySet())
+ return getFalse(Cmp0->getType());
+
+ // For or-of-compares, check if the union is full:
+ // (icmp X, C0) || (icmp X, C1) --> full set --> true
+ if (!IsAnd && Range0.unionWith(Range1).isFullSet())
+ return getTrue(Cmp0->getType());
+
+ // Is one range a superset of the other?
+ // If this is and-of-compares, take the smaller set:
+ // (icmp sgt X, 4) && (icmp sgt X, 42) --> icmp sgt X, 42
+ // If this is or-of-compares, take the larger set:
+ // (icmp sgt X, 4) || (icmp sgt X, 42) --> icmp sgt X, 4
+ if (Range0.contains(Range1))
+ return IsAnd ? Cmp1 : Cmp0;
+ if (Range1.contains(Range0))
+ return IsAnd ? Cmp0 : Cmp1;
+ return nullptr;
+}
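
// Editor's illustrative sketch (plain standalone C++, not LLVM code) of the
// superset logic above: with a shared operand and two constants, the 'and'
// of the compares collapses to the smaller range and the 'or' to the larger
// one, mirroring the (icmp sgt X, 4) / (icmp sgt X, 42) examples.
#include <cassert>

static void checkRangeSupersets() {
  for (int x = -1000; x <= 1000; ++x) {
    assert(((x > 4) && (x > 42)) == (x > 42)); // and-of-cmps -> smaller set
    assert(((x > 4) || (x > 42)) == (x > 4));  // or-of-cmps  -> larger set
  }
}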
+
+static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) {
// (icmp (add V, C0), C1) & (icmp V, C0)
+ ICmpInst::Predicate Pred0, Pred1;
+ const APInt *C0, *C1;
+ Value *V;
if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1))))
return nullptr;
@@ -1602,6 +1565,7 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
if (AddInst->getOperand(1) != Op1->getOperand(1))
return nullptr;
+ Type *ITy = Op0->getType();
bool isNSW = AddInst->hasNoSignedWrap();
bool isNUW = AddInst->hasNoUnsignedWrap();
@@ -1632,17 +1596,132 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
return nullptr;
}
+static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
+ if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true))
+ return X;
+ if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true))
+ return X;
+
+ if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1))
+ return X;
+ if (Value *X = simplifyAndOfICmpsWithSameOperands(Op1, Op0))
+ return X;
+
+ if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true))
+ return X;
+
+ if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1))
+ return X;
+ if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0))
+ return X;
+
+ return nullptr;
+}
+
+static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) {
+ // (icmp (add V, C0), C1) | (icmp V, C0)
+ ICmpInst::Predicate Pred0, Pred1;
+ const APInt *C0, *C1;
+ Value *V;
+ if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1))))
+ return nullptr;
+
+ if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value())))
+ return nullptr;
+
+ auto *AddInst = cast<BinaryOperator>(Op0->getOperand(0));
+ if (AddInst->getOperand(1) != Op1->getOperand(1))
+ return nullptr;
+
+ Type *ITy = Op0->getType();
+ bool isNSW = AddInst->hasNoSignedWrap();
+ bool isNUW = AddInst->hasNoUnsignedWrap();
+
+ const APInt Delta = *C1 - *C0;
+ if (C0->isStrictlyPositive()) {
+ if (Delta == 2) {
+ if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_SLE)
+ return getTrue(ITy);
+ if (Pred0 == ICmpInst::ICMP_SGE && Pred1 == ICmpInst::ICMP_SLE && isNSW)
+ return getTrue(ITy);
+ }
+ if (Delta == 1) {
+ if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_SLE)
+ return getTrue(ITy);
+ if (Pred0 == ICmpInst::ICMP_SGT && Pred1 == ICmpInst::ICMP_SLE && isNSW)
+ return getTrue(ITy);
+ }
+ }
+ if (C0->getBoolValue() && isNUW) {
+ if (Delta == 2)
+ if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_ULE)
+ return getTrue(ITy);
+ if (Delta == 1)
+ if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_ULE)
+ return getTrue(ITy);
+ }
+
+ return nullptr;
+}
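
// Editor's illustrative sketch (plain standalone C++, not LLVM code) of one
// row of the table above: with C0 = 5 (strictly positive) and Delta = 1
// (C1 = 6), the disjunction (add V, C0) >u C1 | V <=s C0 covers every 8-bit
// value, so the 'or' folds to true. Brute-forced over all int8_t values.
#include <cassert>
#include <cstdint>

static void checkOrOfICmpsWithAdd() {
  for (int v = -128; v <= 127; ++v) {
    uint8_t Add = (uint8_t)(v + 5); // add V, 5 with unsigned wrap
    assert((Add > 6) || (v <= 5));  // ugt 6  |  sle 5  ->  always true
  }
}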
+
+static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
+ if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false))
+ return X;
+ if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false))
+ return X;
+
+ if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1))
+ return X;
+ if (Value *X = simplifyOrOfICmpsWithSameOperands(Op1, Op0))
+ return X;
+
+ if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false))
+ return X;
+
+ if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1))
+ return X;
+ if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0))
+ return X;
+
+ return nullptr;
+}
+
+static Value *simplifyAndOrOfICmps(Value *Op0, Value *Op1, bool IsAnd) {
+ // Look through casts of the 'and' operands to find compares.
+ auto *Cast0 = dyn_cast<CastInst>(Op0);
+ auto *Cast1 = dyn_cast<CastInst>(Op1);
+ if (Cast0 && Cast1 && Cast0->getOpcode() == Cast1->getOpcode() &&
+ Cast0->getSrcTy() == Cast1->getSrcTy()) {
+ Op0 = Cast0->getOperand(0);
+ Op1 = Cast1->getOperand(0);
+ }
+
+ auto *Cmp0 = dyn_cast<ICmpInst>(Op0);
+ auto *Cmp1 = dyn_cast<ICmpInst>(Op1);
+ if (!Cmp0 || !Cmp1)
+ return nullptr;
+
+ Value *V =
+ IsAnd ? simplifyAndOfICmps(Cmp0, Cmp1) : simplifyOrOfICmps(Cmp0, Cmp1);
+ if (!V)
+ return nullptr;
+ if (!Cast0)
+ return V;
+
+ // If we looked through casts, we can only handle a constant simplification
+ // because we are not allowed to create a cast instruction here.
+ if (auto *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getCast(Cast0->getOpcode(), C, Cast0->getType());
+
+ return nullptr;
+}
+
/// Given operands for an And, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::And, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q))
+ return C;
// X & undef -> 0
if (match(Op1, m_Undef()))
@@ -1666,16 +1745,31 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
return Constant::getNullValue(Op0->getType());
// (A | ?) & A = A
- Value *A = nullptr, *B = nullptr;
- if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- (A == Op1 || B == Op1))
+ if (match(Op0, m_c_Or(m_Specific(Op1), m_Value())))
return Op1;
// A & (A | ?) = A
- if (match(Op1, m_Or(m_Value(A), m_Value(B))) &&
- (A == Op0 || B == Op0))
+ if (match(Op1, m_c_Or(m_Specific(Op0), m_Value())))
return Op0;
+ // A mask that only clears known zeros of a shifted value is a no-op.
+ Value *X;
+ const APInt *Mask;
+ const APInt *ShAmt;
+ if (match(Op1, m_APInt(Mask))) {
+ // If all bits in the inverted and shifted mask are clear:
+ // and (shl X, ShAmt), Mask --> shl X, ShAmt
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShAmt))) &&
+ (~(*Mask)).lshr(*ShAmt).isNullValue())
+ return Op0;
+
+ // If all bits in the inverted and shifted mask are clear:
+ // and (lshr X, ShAmt), Mask --> lshr X, ShAmt
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
+ (~(*Mask)).shl(*ShAmt).isNullValue())
+ return Op0;
+ }
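
// Editor's illustrative sketch (plain standalone C++, not LLVM code) of the
// mask test above: with ShAmt = 4 and Mask = 0xF0, the inverted-and-shifted
// mask is zero, so the 'and' cannot clear any bit the shl left set and the
// mask is a no-op.
#include <cassert>
#include <cstdint>

static void checkShiftMaskNoop() {
  const uint8_t Mask = 0xF0, ShAmt = 4;
  assert(((uint8_t)~Mask >> ShAmt) == 0); // precondition of the fold
  for (unsigned x = 0; x < 256; ++x)
    assert((uint8_t)((x << ShAmt) & Mask) == (uint8_t)(x << ShAmt));
}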
+
// A & (-A) = A if A is a power of two or zero.
if (match(Op0, m_Neg(m_Specific(Op1))) ||
match(Op1, m_Neg(m_Specific(Op0)))) {
@@ -1687,32 +1781,8 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
return Op1;
}
- if (auto *ICILHS = dyn_cast<ICmpInst>(Op0)) {
- if (auto *ICIRHS = dyn_cast<ICmpInst>(Op1)) {
- if (Value *V = SimplifyAndOfICmps(ICILHS, ICIRHS))
- return V;
- if (Value *V = SimplifyAndOfICmps(ICIRHS, ICILHS))
- return V;
- }
- }
-
- // The compares may be hidden behind casts. Look through those and try the
- // same folds as above.
- auto *Cast0 = dyn_cast<CastInst>(Op0);
- auto *Cast1 = dyn_cast<CastInst>(Op1);
- if (Cast0 && Cast1 && Cast0->getOpcode() == Cast1->getOpcode() &&
- Cast0->getSrcTy() == Cast1->getSrcTy()) {
- auto *Cmp0 = dyn_cast<ICmpInst>(Cast0->getOperand(0));
- auto *Cmp1 = dyn_cast<ICmpInst>(Cast1->getOperand(0));
- if (Cmp0 && Cmp1) {
- Instruction::CastOps CastOpc = Cast0->getOpcode();
- Type *ResultType = Cast0->getType();
- if (auto *V = dyn_cast_or_null<Constant>(SimplifyAndOfICmps(Cmp0, Cmp1)))
- return ConstantExpr::getCast(CastOpc, V, ResultType);
- if (auto *V = dyn_cast_or_null<Constant>(SimplifyAndOfICmps(Cmp1, Cmp0)))
- return ConstantExpr::getCast(CastOpc, V, ResultType);
- }
- }
+ if (Value *V = simplifyAndOrOfICmps(Op0, Op1, true))
+ return V;
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::And, Op0, Op1, Q,
@@ -1746,105 +1816,16 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
return nullptr;
}
-Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyAndInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
-}
-
-/// Commuted variants are assumed to be handled by calling this function again
-/// with the parameters swapped.
-static Value *simplifyOrOfICmpsWithSameOperands(ICmpInst *Op0, ICmpInst *Op1) {
- ICmpInst::Predicate Pred0, Pred1;
- Value *A ,*B;
- if (!match(Op0, m_ICmp(Pred0, m_Value(A), m_Value(B))) ||
- !match(Op1, m_ICmp(Pred1, m_Specific(A), m_Specific(B))))
- return nullptr;
-
- // We have (icmp Pred0, A, B) | (icmp Pred1, A, B).
- // If Op1 is always implied true by Op0, then Op0 is a subset of Op1, and we
- // can eliminate Op0 from this 'or'.
- if (ICmpInst::isImpliedTrueByMatchingCmp(Pred0, Pred1))
- return Op1;
-
- // Check for any combination of predicates that cover the entire range of
- // possibilities.
- if ((Pred0 == ICmpInst::getInversePredicate(Pred1)) ||
- (Pred0 == ICmpInst::ICMP_NE && ICmpInst::isTrueWhenEqual(Pred1)) ||
- (Pred0 == ICmpInst::ICMP_SLE && Pred1 == ICmpInst::ICMP_SGE) ||
- (Pred0 == ICmpInst::ICMP_ULE && Pred1 == ICmpInst::ICMP_UGE))
- return getTrue(Op0->getType());
-
- return nullptr;
-}
-
-/// Commuted variants are assumed to be handled by calling this function again
-/// with the parameters swapped.
-static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
- if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false))
- return X;
-
- if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1))
- return X;
-
- // (icmp (add V, C0), C1) | (icmp V, C0)
- ICmpInst::Predicate Pred0, Pred1;
- const APInt *C0, *C1;
- Value *V;
- if (!match(Op0, m_ICmp(Pred0, m_Add(m_Value(V), m_APInt(C0)), m_APInt(C1))))
- return nullptr;
-
- if (!match(Op1, m_ICmp(Pred1, m_Specific(V), m_Value())))
- return nullptr;
-
- auto *AddInst = cast<BinaryOperator>(Op0->getOperand(0));
- if (AddInst->getOperand(1) != Op1->getOperand(1))
- return nullptr;
-
- Type *ITy = Op0->getType();
- bool isNSW = AddInst->hasNoSignedWrap();
- bool isNUW = AddInst->hasNoUnsignedWrap();
-
- const APInt Delta = *C1 - *C0;
- if (C0->isStrictlyPositive()) {
- if (Delta == 2) {
- if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_SLE)
- return getTrue(ITy);
- if (Pred0 == ICmpInst::ICMP_SGE && Pred1 == ICmpInst::ICMP_SLE && isNSW)
- return getTrue(ITy);
- }
- if (Delta == 1) {
- if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_SLE)
- return getTrue(ITy);
- if (Pred0 == ICmpInst::ICMP_SGT && Pred1 == ICmpInst::ICMP_SLE && isNSW)
- return getTrue(ITy);
- }
- }
- if (C0->getBoolValue() && isNUW) {
- if (Delta == 2)
- if (Pred0 == ICmpInst::ICMP_UGE && Pred1 == ICmpInst::ICMP_ULE)
- return getTrue(ITy);
- if (Delta == 1)
- if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_ULE)
- return getTrue(ITy);
- }
-
- return nullptr;
+Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifyAndInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for an Or, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Or, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q))
+ return C;
// X | undef -> -1
if (match(Op1, m_Undef()))
@@ -1868,34 +1849,61 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
return Constant::getAllOnesValue(Op0->getType());
// (A & ?) | A = A
- Value *A = nullptr, *B = nullptr;
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- (A == Op1 || B == Op1))
+ if (match(Op0, m_c_And(m_Specific(Op1), m_Value())))
return Op1;
// A | (A & ?) = A
- if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
- (A == Op0 || B == Op0))
+ if (match(Op1, m_c_And(m_Specific(Op0), m_Value())))
return Op0;
// ~(A & ?) | A = -1
- if (match(Op0, m_Not(m_And(m_Value(A), m_Value(B)))) &&
- (A == Op1 || B == Op1))
+ if (match(Op0, m_Not(m_c_And(m_Specific(Op1), m_Value()))))
return Constant::getAllOnesValue(Op1->getType());
// A | ~(A & ?) = -1
- if (match(Op1, m_Not(m_And(m_Value(A), m_Value(B)))) &&
- (A == Op0 || B == Op0))
+  if (match(Op1, m_Not(m_c_And(m_Specific(Op0), m_Value()))))
return Constant::getAllOnesValue(Op0->getType());
- if (auto *ICILHS = dyn_cast<ICmpInst>(Op0)) {
- if (auto *ICIRHS = dyn_cast<ICmpInst>(Op1)) {
- if (Value *V = SimplifyOrOfICmps(ICILHS, ICIRHS))
- return V;
- if (Value *V = SimplifyOrOfICmps(ICIRHS, ICILHS))
- return V;
- }
- }
+ Value *A, *B;
+ // (A & ~B) | (A ^ B) -> (A ^ B)
+ // (~B & A) | (A ^ B) -> (A ^ B)
+ // (A & ~B) | (B ^ A) -> (B ^ A)
+ // (~B & A) | (B ^ A) -> (B ^ A)
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+ (match(Op0, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op0, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op1;
+
+ // Commute the 'or' operands.
+ // (A ^ B) | (A & ~B) -> (A ^ B)
+ // (A ^ B) | (~B & A) -> (A ^ B)
+ // (B ^ A) | (A & ~B) -> (B ^ A)
+ // (B ^ A) | (~B & A) -> (B ^ A)
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ (match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op0;
+
+ // (A & B) | (~A ^ B) -> (~A ^ B)
+ // (B & A) | (~A ^ B) -> (~A ^ B)
+ // (A & B) | (B ^ ~A) -> (B ^ ~A)
+ // (B & A) | (B ^ ~A) -> (B ^ ~A)
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op1;
+
+ // (~A ^ B) | (A & B) -> (~A ^ B)
+ // (~A ^ B) | (B & A) -> (~A ^ B)
+ // (B ^ ~A) | (A & B) -> (B ^ ~A)
+ // (B ^ ~A) | (B & A) -> (B ^ ~A)
+ if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
+ (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) ||
+ match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B)))))
+ return Op0;
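
// Editor's illustrative sketch (plain standalone C++, not LLVM code)
// brute-forcing the absorption folds above: every bit set in A & ~B (A=1,
// B=0) is already set in A ^ B, and every bit set in A & B is already set in
// ~A ^ B (the "A equals B" bits), so the 'or' adds nothing in either case.
#include <cassert>
#include <cstdint>

static void checkXorAbsorbsAnd() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      assert((uint8_t)((a & ~b) | (a ^ b)) == (uint8_t)(a ^ b));
      assert((uint8_t)((a & b) | (~a ^ b)) == (uint8_t)(~a ^ b));
    }
}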
+
+ if (Value *V = simplifyAndOrOfICmps(Op0, Op1, false))
+ return V;
// Try some generic simplifications for associative operations.
if (Value *V = SimplifyAssociativeBinOp(Instruction::Or, Op0, Op1, Q,
@@ -1914,37 +1922,27 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
MaxRecurse))
return V;
- // (A & C)|(B & D)
- Value *C = nullptr, *D = nullptr;
- if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
- match(Op1, m_And(m_Value(B), m_Value(D)))) {
- ConstantInt *C1 = dyn_cast<ConstantInt>(C);
- ConstantInt *C2 = dyn_cast<ConstantInt>(D);
- if (C1 && C2 && (C1->getValue() == ~C2->getValue())) {
+ // (A & C1)|(B & C2)
+ const APInt *C1, *C2;
+ if (match(Op0, m_And(m_Value(A), m_APInt(C1))) &&
+ match(Op1, m_And(m_Value(B), m_APInt(C2)))) {
+ if (*C1 == ~*C2) {
// (A & C1)|(B & C2)
// If we have: ((V + N) & C1) | (V & C2)
// .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
// replace with V+N.
- Value *V1, *V2;
- if ((C2->getValue() & (C2->getValue() + 1)) == 0 && // C2 == 0+1+
- match(A, m_Add(m_Value(V1), m_Value(V2)))) {
+ Value *N;
+ if (C2->isMask() && // C2 == 0+1+
+ match(A, m_c_Add(m_Specific(B), m_Value(N)))) {
// Add commutes, try both ways.
- if (V1 == B &&
- MaskedValueIsZero(V2, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
- return A;
- if (V2 == B &&
- MaskedValueIsZero(V1, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (MaskedValueIsZero(N, *C2, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return A;
}
// Or commutes, try both ways.
- if ((C1->getValue() & (C1->getValue() + 1)) == 0 &&
- match(B, m_Add(m_Value(V1), m_Value(V2)))) {
+ if (C1->isMask() &&
+ match(B, m_c_Add(m_Specific(A), m_Value(N)))) {
// Add commutes, try both ways.
- if (V1 == A &&
- MaskedValueIsZero(V2, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
- return B;
- if (V2 == A &&
- MaskedValueIsZero(V1, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return B;
}
}
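
// Editor's illustrative sketch (plain standalone C++, not LLVM code) of the
// fold above with C1 = 0xF0 and C2 = ~C1 = 0x0F (a low-bit mask): when N's
// low bits are zero, adding N cannot disturb B's low nibble, so the two
// masked halves reassemble exactly B + N.
#include <cassert>
#include <cstdint>

static void checkMaskedAddRecombine() {
  const uint8_t C1 = 0xF0, C2 = 0x0F;
  for (unsigned b = 0; b < 256; ++b)
    for (unsigned n = 0; n < 256; n += 16) { // keeps N & C2 == 0
      uint8_t A = (uint8_t)(b + n);          // A = B + N, wrapping
      assert((uint8_t)((A & C1) | (b & C2)) == A);
    }
}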
@@ -1959,25 +1957,16 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
return nullptr;
}
-Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyOrInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifyOrInst(Op0, Op1, Q, RecursionLimit);
}
/// Given operands for a Xor, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
+static Value *SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
- if (Constant *CRHS = dyn_cast<Constant>(Op1))
- return ConstantFoldBinaryOpOperands(Instruction::Xor, CLHS, CRHS, Q.DL);
-
- // Canonicalize the constant to the RHS.
- std::swap(Op0, Op1);
- }
+ if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q))
+ return C;
// A ^ undef -> undef
if (match(Op1, m_Undef()))
@@ -2013,14 +2002,11 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
return nullptr;
}
-Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyXorInst(Op0, Op1, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q) {
+ return ::SimplifyXorInst(Op0, Op1, Q, RecursionLimit);
}
+
static Type *GetCompareTy(Value *Op) {
return CmpInst::makeCmpResultType(Op->getType());
}
@@ -2254,34 +2240,55 @@ computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI,
/// Fold an icmp when its operands have i1 scalar type.
static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS,
- Value *RHS, const Query &Q) {
+ Value *RHS, const SimplifyQuery &Q) {
Type *ITy = GetCompareTy(LHS); // The return type.
Type *OpTy = LHS->getType(); // The operand type.
- if (!OpTy->getScalarType()->isIntegerTy(1))
+ if (!OpTy->isIntOrIntVectorTy(1))
return nullptr;
- switch (Pred) {
- default:
- break;
- case ICmpInst::ICMP_EQ:
- // X == 1 -> X
- if (match(RHS, m_One()))
- return LHS;
- break;
- case ICmpInst::ICMP_NE:
- // X != 0 -> X
- if (match(RHS, m_Zero()))
+ // A boolean compared to true/false can be simplified in 14 out of the 20
+ // (10 predicates * 2 constants) possible combinations. Cases not handled here
+ // require a 'not' of the LHS, so those must be transformed in InstCombine.
+ if (match(RHS, m_Zero())) {
+ switch (Pred) {
+ case CmpInst::ICMP_NE: // X != 0 -> X
+ case CmpInst::ICMP_UGT: // X >u 0 -> X
+ case CmpInst::ICMP_SLT: // X <s 0 -> X
return LHS;
- break;
- case ICmpInst::ICMP_UGT:
- // X >u 0 -> X
- if (match(RHS, m_Zero()))
+
+ case CmpInst::ICMP_ULT: // X <u 0 -> false
+ case CmpInst::ICMP_SGT: // X >s 0 -> false
+ return getFalse(ITy);
+
+ case CmpInst::ICMP_UGE: // X >=u 0 -> true
+ case CmpInst::ICMP_SLE: // X <=s 0 -> true
+ return getTrue(ITy);
+
+ default: break;
+ }
+ } else if (match(RHS, m_One())) {
+ switch (Pred) {
+ case CmpInst::ICMP_EQ: // X == 1 -> X
+ case CmpInst::ICMP_UGE: // X >=u 1 -> X
+ case CmpInst::ICMP_SLE: // X <=s -1 -> X
return LHS;
+
+ case CmpInst::ICMP_UGT: // X >u 1 -> false
+ case CmpInst::ICMP_SLT: // X <s -1 -> false
+ return getFalse(ITy);
+
+ case CmpInst::ICMP_ULE: // X <=u 1 -> true
+ case CmpInst::ICMP_SGE: // X >=s -1 -> true
+ return getTrue(ITy);
+
+ default: break;
+ }
+ }
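
// Editor's illustrative sketch (plain standalone C++, not LLVM code)
// spot-checking the i1 table above, modeling the boolean as 0/1 for unsigned
// predicates and 0/-1 for signed ones. The helper name is invented.
#include <cassert>

static void checkBoolICmpTable(bool X) {
  unsigned U = X ? 1u : 0u; // i1 zero-extended for unsigned predicates
  int S = X ? -1 : 0;       // i1 sign-extended for signed predicates
  assert((U != 0) == X);    // X != 0   -> X
  assert((U > 0) == X);     // X >u 0   -> X
  assert((S < 0) == X);     // X <s 0   -> X
  assert((U >= 1) == X);    // X >=u 1  -> X
  assert((S <= -1) == X);   // X <=s -1 -> X
  // The remaining handled rows fold to constants: X >=u 0 and X <=s 0 are
  // always true; X <u 0 and X >s 0 are always false.
}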
+
+ switch (Pred) {
+ default:
break;
case ICmpInst::ICMP_UGE:
- // X >=u 1 -> X
- if (match(RHS, m_One()))
- return LHS;
if (isImpliedCondition(RHS, LHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
@@ -2296,16 +2303,6 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS,
if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SLT:
- // X <s 0 -> X
- if (match(RHS, m_Zero()))
- return LHS;
- break;
- case ICmpInst::ICMP_SLE:
- // X <=s -1 -> X
- if (match(RHS, m_One()))
- return LHS;
- break;
case ICmpInst::ICMP_ULE:
if (isImpliedCondition(LHS, RHS, Q.DL).getValueOr(false))
return getTrue(ITy);
@@ -2317,12 +2314,11 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS,
/// Try hard to fold icmp with zero RHS because this is a common case.
static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
- Value *RHS, const Query &Q) {
+ Value *RHS, const SimplifyQuery &Q) {
if (!match(RHS, m_Zero()))
return nullptr;
Type *ITy = GetCompareTy(LHS); // The return type.
- bool LHSKnownNonNegative, LHSKnownNegative;
switch (Pred) {
default:
llvm_unreachable("Unknown ICmp predicate!");
@@ -2340,43 +2336,202 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS,
if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SLT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ case ICmpInst::ICMP_SLT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SLE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getTrue(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getFalse(ITy);
break;
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGE: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative)
+ if (LHSKnown.isNonNegative())
return getTrue(ITy);
break;
- case ICmpInst::ICMP_SGT:
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNegative)
+ }
+ case ICmpInst::ICMP_SGT: {
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNegative())
return getFalse(ITy);
- if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
+ if (LHSKnown.isNonNegative() &&
+ isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT))
return getTrue(ITy);
break;
}
+ }
return nullptr;
}
+/// Many binary operators with a constant operand have an easy-to-compute
+/// range of outputs. This can be used to fold a comparison to always true or
+/// always false.
+static void setLimitsForBinOp(BinaryOperator &BO, APInt &Lower, APInt &Upper) {
+ unsigned Width = Lower.getBitWidth();
+ const APInt *C;
+ switch (BO.getOpcode()) {
+ case Instruction::Add:
+ if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) {
+ // FIXME: If we have both nuw and nsw, we should reduce the range further.
+ if (BO.hasNoUnsignedWrap()) {
+ // 'add nuw x, C' produces [C, UINT_MAX].
+ Lower = *C;
+ } else if (BO.hasNoSignedWrap()) {
+ if (C->isNegative()) {
+ // 'add nsw x, -C' produces [SINT_MIN, SINT_MAX - C].
+ Lower = APInt::getSignedMinValue(Width);
+ Upper = APInt::getSignedMaxValue(Width) + *C + 1;
+ } else {
+ // 'add nsw x, +C' produces [SINT_MIN + C, SINT_MAX].
+ Lower = APInt::getSignedMinValue(Width) + *C;
+ Upper = APInt::getSignedMaxValue(Width) + 1;
+ }
+ }
+ }
+ break;
+
+ case Instruction::And:
+ if (match(BO.getOperand(1), m_APInt(C)))
+ // 'and x, C' produces [0, C].
+ Upper = *C + 1;
+ break;
+
+ case Instruction::Or:
+ if (match(BO.getOperand(1), m_APInt(C)))
+ // 'or x, C' produces [C, UINT_MAX].
+ Lower = *C;
+ break;
+
+ case Instruction::AShr:
+ if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) {
+ // 'ashr x, C' produces [INT_MIN >> C, INT_MAX >> C].
+ Lower = APInt::getSignedMinValue(Width).ashr(*C);
+ Upper = APInt::getSignedMaxValue(Width).ashr(*C) + 1;
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ unsigned ShiftAmount = Width - 1;
+ if (!C->isNullValue() && BO.isExact())
+ ShiftAmount = C->countTrailingZeros();
+ if (C->isNegative()) {
+ // 'ashr C, x' produces [C, C >> (Width-1)]
+ Lower = *C;
+ Upper = C->ashr(ShiftAmount) + 1;
+ } else {
+ // 'ashr C, x' produces [C >> (Width-1), C]
+ Lower = C->ashr(ShiftAmount);
+ Upper = *C + 1;
+ }
+ }
+ break;
+
+ case Instruction::LShr:
+ if (match(BO.getOperand(1), m_APInt(C)) && C->ult(Width)) {
+ // 'lshr x, C' produces [0, UINT_MAX >> C].
+ Upper = APInt::getAllOnesValue(Width).lshr(*C) + 1;
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ // 'lshr C, x' produces [C >> (Width-1), C].
+ unsigned ShiftAmount = Width - 1;
+ if (!C->isNullValue() && BO.isExact())
+ ShiftAmount = C->countTrailingZeros();
+ Lower = C->lshr(ShiftAmount);
+ Upper = *C + 1;
+ }
+ break;
+
+ case Instruction::Shl:
+ if (match(BO.getOperand(0), m_APInt(C))) {
+ if (BO.hasNoUnsignedWrap()) {
+ // 'shl nuw C, x' produces [C, C << CLZ(C)]
+ Lower = *C;
+ Upper = Lower.shl(Lower.countLeadingZeros()) + 1;
+ } else if (BO.hasNoSignedWrap()) { // TODO: What if both nuw+nsw?
+ if (C->isNegative()) {
+ // 'shl nsw C, x' produces [C << CLO(C)-1, C]
+ unsigned ShiftAmount = C->countLeadingOnes() - 1;
+ Lower = C->shl(ShiftAmount);
+ Upper = *C + 1;
+ } else {
+ // 'shl nsw C, x' produces [C, C << CLZ(C)-1]
+ unsigned ShiftAmount = C->countLeadingZeros() - 1;
+ Lower = *C;
+ Upper = C->shl(ShiftAmount) + 1;
+ }
+ }
+ }
+ break;
+
+ case Instruction::SDiv:
+ if (match(BO.getOperand(1), m_APInt(C))) {
+ APInt IntMin = APInt::getSignedMinValue(Width);
+ APInt IntMax = APInt::getSignedMaxValue(Width);
+ if (C->isAllOnesValue()) {
+ // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX].
+ Lower = IntMin + 1;
+ Upper = IntMax + 1;
+ } else if (C->countLeadingZeros() < Width - 1) {
+ // 'sdiv x, C' produces [INT_MIN / C, INT_MAX / C]
+ // where C != -1 and C != 0 and C != 1
+ Lower = IntMin.sdiv(*C);
+ Upper = IntMax.sdiv(*C);
+ if (Lower.sgt(Upper))
+ std::swap(Lower, Upper);
+ Upper = Upper + 1;
+ assert(Upper != Lower && "Upper part of range has wrapped!");
+ }
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ if (C->isMinSignedValue()) {
+ // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2].
+ Lower = *C;
+ Upper = Lower.lshr(1) + 1;
+ } else {
+ // 'sdiv C, x' produces [-|C|, |C|].
+ Upper = C->abs() + 1;
+ Lower = (-Upper) + 1;
+ }
+ }
+ break;
+
+ case Instruction::UDiv:
+ if (match(BO.getOperand(1), m_APInt(C)) && !C->isNullValue()) {
+ // 'udiv x, C' produces [0, UINT_MAX / C].
+ Upper = APInt::getMaxValue(Width).udiv(*C) + 1;
+ } else if (match(BO.getOperand(0), m_APInt(C))) {
+ // 'udiv C, x' produces [0, C].
+ Upper = *C + 1;
+ }
+ break;
+
+ case Instruction::SRem:
+ if (match(BO.getOperand(1), m_APInt(C))) {
+ // 'srem x, C' produces (-|C|, |C|).
+ Upper = C->abs();
+ Lower = (-Upper) + 1;
+ }
+ break;
+
+ case Instruction::URem:
+ if (match(BO.getOperand(1), m_APInt(C)))
+ // 'urem x, C' produces [0, C).
+ Upper = *C;
+ break;
+
+ default:
+ break;
+ }
+}
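// Illustrative sketch (editorial example using the APInt and ConstantRange
// APIs, mirroring the caller below): the half-open range [Lower, Upper)
// computed above feeds a tautology check. For 'urem i8 %x, 8' the range is
// [0, 8), so 'icmp ult (urem i8 %x, 8), 9' must be true:
//
//   APInt Lower(8, 0), Upper(8, 8);        // range of 'urem i8 %x, 8'
//   ConstantRange LHS_CR(Lower, Upper);    // the half-open range [0, 8)
//   ConstantRange RHS_CR = ConstantRange::makeExactICmpRegion(
//       ICmpInst::ICMP_ULT, APInt(8, 9));  // values X with 'X ult 9'
//   assert(RHS_CR.contains(LHS_CR));       // the compare folds to 'true'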
+
static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
Value *RHS) {
const APInt *C;
@@ -2390,114 +2545,12 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
if (RHS_CR.isFullSet())
return ConstantInt::getTrue(GetCompareTy(RHS));
- // Many binary operators with constant RHS have easy to compute constant
- // range. Use them to check whether the comparison is a tautology.
+ // Find the range of possible values for binary operators.
unsigned Width = C->getBitWidth();
APInt Lower = APInt(Width, 0);
APInt Upper = APInt(Width, 0);
- const APInt *C2;
- if (match(LHS, m_URem(m_Value(), m_APInt(C2)))) {
- // 'urem x, C2' produces [0, C2).
- Upper = *C2;
- } else if (match(LHS, m_SRem(m_Value(), m_APInt(C2)))) {
- // 'srem x, C2' produces (-|C2|, |C2|).
- Upper = C2->abs();
- Lower = (-Upper) + 1;
- } else if (match(LHS, m_UDiv(m_APInt(C2), m_Value()))) {
- // 'udiv C2, x' produces [0, C2].
- Upper = *C2 + 1;
- } else if (match(LHS, m_UDiv(m_Value(), m_APInt(C2)))) {
- // 'udiv x, C2' produces [0, UINT_MAX / C2].
- APInt NegOne = APInt::getAllOnesValue(Width);
- if (*C2 != 0)
- Upper = NegOne.udiv(*C2) + 1;
- } else if (match(LHS, m_SDiv(m_APInt(C2), m_Value()))) {
- if (C2->isMinSignedValue()) {
- // 'sdiv INT_MIN, x' produces [INT_MIN, INT_MIN / -2].
- Lower = *C2;
- Upper = Lower.lshr(1) + 1;
- } else {
- // 'sdiv C2, x' produces [-|C2|, |C2|].
- Upper = C2->abs() + 1;
- Lower = (-Upper) + 1;
- }
- } else if (match(LHS, m_SDiv(m_Value(), m_APInt(C2)))) {
- APInt IntMin = APInt::getSignedMinValue(Width);
- APInt IntMax = APInt::getSignedMaxValue(Width);
- if (C2->isAllOnesValue()) {
- // 'sdiv x, -1' produces [INT_MIN + 1, INT_MAX]
- // where C2 != -1 and C2 != 0 and C2 != 1
- Lower = IntMin + 1;
- Upper = IntMax + 1;
- } else if (C2->countLeadingZeros() < Width - 1) {
- // 'sdiv x, C2' produces [INT_MIN / C2, INT_MAX / C2]
- // where C2 != -1 and C2 != 0 and C2 != 1
- Lower = IntMin.sdiv(*C2);
- Upper = IntMax.sdiv(*C2);
- if (Lower.sgt(Upper))
- std::swap(Lower, Upper);
- Upper = Upper + 1;
- assert(Upper != Lower && "Upper part of range has wrapped!");
- }
- } else if (match(LHS, m_NUWShl(m_APInt(C2), m_Value()))) {
- // 'shl nuw C2, x' produces [C2, C2 << CLZ(C2)]
- Lower = *C2;
- Upper = Lower.shl(Lower.countLeadingZeros()) + 1;
- } else if (match(LHS, m_NSWShl(m_APInt(C2), m_Value()))) {
- if (C2->isNegative()) {
- // 'shl nsw C2, x' produces [C2 << CLO(C2)-1, C2]
- unsigned ShiftAmount = C2->countLeadingOnes() - 1;
- Lower = C2->shl(ShiftAmount);
- Upper = *C2 + 1;
- } else {
- // 'shl nsw C2, x' produces [C2, C2 << CLZ(C2)-1]
- unsigned ShiftAmount = C2->countLeadingZeros() - 1;
- Lower = *C2;
- Upper = C2->shl(ShiftAmount) + 1;
- }
- } else if (match(LHS, m_LShr(m_Value(), m_APInt(C2)))) {
- // 'lshr x, C2' produces [0, UINT_MAX >> C2].
- APInt NegOne = APInt::getAllOnesValue(Width);
- if (C2->ult(Width))
- Upper = NegOne.lshr(*C2) + 1;
- } else if (match(LHS, m_LShr(m_APInt(C2), m_Value()))) {
- // 'lshr C2, x' produces [C2 >> (Width-1), C2].
- unsigned ShiftAmount = Width - 1;
- if (*C2 != 0 && cast<BinaryOperator>(LHS)->isExact())
- ShiftAmount = C2->countTrailingZeros();
- Lower = C2->lshr(ShiftAmount);
- Upper = *C2 + 1;
- } else if (match(LHS, m_AShr(m_Value(), m_APInt(C2)))) {
- // 'ashr x, C2' produces [INT_MIN >> C2, INT_MAX >> C2].
- APInt IntMin = APInt::getSignedMinValue(Width);
- APInt IntMax = APInt::getSignedMaxValue(Width);
- if (C2->ult(Width)) {
- Lower = IntMin.ashr(*C2);
- Upper = IntMax.ashr(*C2) + 1;
- }
- } else if (match(LHS, m_AShr(m_APInt(C2), m_Value()))) {
- unsigned ShiftAmount = Width - 1;
- if (*C2 != 0 && cast<BinaryOperator>(LHS)->isExact())
- ShiftAmount = C2->countTrailingZeros();
- if (C2->isNegative()) {
- // 'ashr C2, x' produces [C2, C2 >> (Width-1)]
- Lower = *C2;
- Upper = C2->ashr(ShiftAmount) + 1;
- } else {
- // 'ashr C2, x' produces [C2 >> (Width-1), C2]
- Lower = C2->ashr(ShiftAmount);
- Upper = *C2 + 1;
- }
- } else if (match(LHS, m_Or(m_Value(), m_APInt(C2)))) {
- // 'or x, C2' produces [C2, UINT_MAX].
- Lower = *C2;
- } else if (match(LHS, m_And(m_Value(), m_APInt(C2)))) {
- // 'and x, C2' produces [0, C2].
- Upper = *C2 + 1;
- } else if (match(LHS, m_NUWAdd(m_Value(), m_APInt(C2)))) {
- // 'add nuw x, C2' produces [C2, UINT_MAX].
- Lower = *C2;
- }
+ if (auto *BO = dyn_cast<BinaryOperator>(LHS))
+ setLimitsForBinOp(*BO, Lower, Upper);
ConstantRange LHS_CR =
Lower != Upper ? ConstantRange(Lower, Upper) : ConstantRange(Width, true);
@@ -2516,8 +2569,11 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
}
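// End-to-end example of the range check above: 'and i8 %x, 15' lies in
// [0, 16), so 'icmp ugt (and i8 %x, 15), 20' folds to false and
// 'icmp ult (and i8 %x, 15), 16' folds to true.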
+/// TODO: A large part of this logic is duplicated in InstCombine's
+/// foldICmpBinOp(). We should be able to share that and avoid the code
+/// duplication.
static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
- Value *RHS, const Query &Q,
+ Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
Type *ITy = GetCompareTy(LHS); // The return type.
@@ -2597,15 +2653,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (RHSKnownNonNegative && YKnownNegative)
+ KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (RHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy);
- if (RHSKnownNegative || YKnownNonNegative)
+ if (RHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SLT ? getFalse(ITy) : getTrue(ITy);
}
}
@@ -2617,31 +2669,25 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0,
- Q.AC, Q.CxtI, Q.DT);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (LHSKnownNonNegative && YKnownNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (LHSKnown.isNonNegative() && YKnown.isNegative())
return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy);
- if (LHSKnownNegative || YKnownNonNegative)
+ if (LHSKnown.isNegative() || YKnown.isNonNegative())
return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy);
}
}
}
// icmp pred (and X, Y), X
- if (LBO && match(LBO, m_CombineOr(m_And(m_Value(), m_Specific(RHS)),
- m_And(m_Specific(RHS), m_Value())))) {
+ if (LBO && match(LBO, m_c_And(m_Value(), m_Specific(RHS)))) {
if (Pred == ICmpInst::ICMP_UGT)
return getFalse(ITy);
if (Pred == ICmpInst::ICMP_ULE)
return getTrue(ITy);
}
// icmp pred X, (and X, Y)
- if (RBO && match(RBO, m_CombineOr(m_And(m_Value(), m_Specific(LHS)),
- m_And(m_Specific(LHS), m_Value())))) {
+ if (RBO && match(RBO, m_c_And(m_Value(), m_Specific(LHS)))) {
if (Pred == ICmpInst::ICMP_UGE)
return getTrue(ITy);
if (Pred == ICmpInst::ICMP_ULT)
@@ -2672,28 +2718,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred (urem X, Y), Y
if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getFalse(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
@@ -2703,28 +2748,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// icmp pred X, (urem Y, X)
if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
- bool KnownNonNegative, KnownNegative;
switch (Pred) {
default:
break;
case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SGE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
return getTrue(ITy);
case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC,
- Q.CxtI, Q.DT);
- if (!KnownNonNegative)
+ case ICmpInst::ICMP_SLE: {
+ KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
+ if (!Known.isNonNegative())
break;
LLVM_FALLTHROUGH;
+ }
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
@@ -2773,14 +2817,14 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
// - CI2 is one
// - CI isn't zero
if (LBO->hasNoSignedWrap() || LBO->hasNoUnsignedWrap() ||
- *CI2Val == 1 || !CI->isZero()) {
+ CI2Val->isOneValue() || !CI->isZero()) {
if (Pred == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(RHS->getContext());
if (Pred == ICmpInst::ICMP_NE)
return ConstantInt::getTrue(RHS->getContext());
}
}
- if (CIVal->isSignBit() && *CI2Val == 1) {
+ if (CIVal->isSignMask() && CI2Val->isOneValue()) {
if (Pred == ICmpInst::ICMP_UGT)
return ConstantInt::getFalse(RHS->getContext());
if (Pred == ICmpInst::ICMP_ULE)
@@ -2796,10 +2840,19 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
break;
case Instruction::UDiv:
case Instruction::LShr:
- if (ICmpInst::isSigned(Pred))
+ if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact())
break;
- LLVM_FALLTHROUGH;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
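// Both operands here share the same exact divisor/shift amount, and exact
// unsigned division preserves unsigned order; e.g.
// 'icmp ult (udiv exact i32 %x, %z), (udiv exact i32 %y, %z)' simplifies
// exactly when 'icmp ult %x, %y' does.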
case Instruction::SDiv:
+ if (!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact())
+ break;
+ if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0),
+ RBO->getOperand(0), Q, MaxRecurse - 1))
+ return V;
+ break;
case Instruction::AShr:
if (!LBO->isExact() || !RBO->isExact())
break;
@@ -2827,7 +2880,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS,
/// Simplify integer comparisons where at least one operand of the compare
/// matches an integer min/max idiom.
static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
- Value *RHS, const Query &Q,
+ Value *RHS, const SimplifyQuery &Q,
unsigned MaxRecurse) {
Type *ITy = GetCompareTy(LHS); // The return type.
Value *A, *B;
@@ -3031,7 +3084,7 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
/// Given operands for an ICmpInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate;
assert(CmpInst::isIntPredicate(Pred) && "Not an integer compare!");
@@ -3064,8 +3117,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
// If both operands have range metadata, use the metadata
// to simplify the comparison.
if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) {
- auto RHS_Instr = dyn_cast<Instruction>(RHS);
- auto LHS_Instr = dyn_cast<Instruction>(LHS);
+ auto RHS_Instr = cast<Instruction>(RHS);
+ auto LHS_Instr = cast<Instruction>(LHS);
if (RHS_Instr->getMetadata(LLVMContext::MD_range) &&
LHS_Instr->getMetadata(LLVMContext::MD_range)) {
@@ -3245,11 +3298,9 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
// icmp eq|ne X, Y -> false|true if X != Y
- if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ if (ICmpInst::isEquality(Pred) &&
isKnownNonEqual(LHS, RHS, Q.DL, Q.AC, Q.CxtI, Q.DT)) {
- LLVMContext &Ctx = LHS->getType()->getContext();
- return Pred == ICmpInst::ICMP_NE ?
- ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
+ return Pred == ICmpInst::ICMP_NE ? getTrue(ITy) : getFalse(ITy);
}
if (Value *V = simplifyICmpWithBinOp(Pred, LHS, RHS, Q, MaxRecurse))
@@ -3297,22 +3348,6 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
}
- // If a bit is known to be zero for A and known to be one for B,
- // then A and B cannot be equal.
- if (ICmpInst::isEquality(Pred)) {
- const APInt *RHSVal;
- if (match(RHS, m_APInt(RHSVal))) {
- unsigned BitWidth = RHSVal->getBitWidth();
- APInt LHSKnownZero(BitWidth, 0);
- APInt LHSKnownOne(BitWidth, 0);
- computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, Q.DL, /*Depth=*/0, Q.AC,
- Q.CxtI, Q.DT);
- if (((LHSKnownZero & *RHSVal) != 0) || ((LHSKnownOne & ~(*RHSVal)) != 0))
- return Pred == ICmpInst::ICMP_EQ ? ConstantInt::getFalse(ITy)
- : ConstantInt::getTrue(ITy);
- }
- }
-
// If the comparison is with the result of a select instruction, check whether
// comparing with either branch of the select always yields the same value.
if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
@@ -3329,18 +3364,14 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyICmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyICmpInst(Predicate, LHS, RHS, Q, RecursionLimit);
}
/// Given operands for an FCmpInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- FastMathFlags FMF, const Query &Q,
+ FastMathFlags FMF, const SimplifyQuery &Q,
unsigned MaxRecurse) {
CmpInst::Predicate Pred = (CmpInst::Predicate)Predicate;
assert(CmpInst::isFPPredicate(Pred) && "Not an FP compare!");
@@ -3462,22 +3493,22 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- FastMathFlags FMF, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF,
- Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
+ FastMathFlags FMF, const SimplifyQuery &Q) {
+ return ::SimplifyFCmpInst(Predicate, LHS, RHS, FMF, Q, RecursionLimit);
}
/// See if V simplifies when its operand Op is replaced with RepOp.
static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
- const Query &Q,
+ const SimplifyQuery &Q,
unsigned MaxRecurse) {
// Trivial replacement.
if (V == Op)
return RepOp;
+ // We cannot replace a constant, and shouldn't even try.
+ if (isa<Constant>(Op))
+ return nullptr;
+
auto *I = dyn_cast<Instruction>(V);
if (!I)
return nullptr;
@@ -3620,7 +3651,7 @@ static Value *simplifySelectWithFakeICmpEq(Value *CmpLHS, Value *TrueVal,
/// Try to simplify a select instruction when its condition operand is an
/// integer comparison.
static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
- Value *FalseVal, const Query &Q,
+ Value *FalseVal, const SimplifyQuery &Q,
unsigned MaxRecurse) {
ICmpInst::Predicate Pred;
Value *CmpLHS, *CmpRHS;
@@ -3699,7 +3730,7 @@ static Value *simplifySelectWithICmpCond(Value *CondVal, Value *TrueVal,
/// Given operands for a SelectInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
- Value *FalseVal, const Query &Q,
+ Value *FalseVal, const SimplifyQuery &Q,
unsigned MaxRecurse) {
// select true, X, Y -> X
// select false, X, Y -> Y
@@ -3715,9 +3746,9 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
return TrueVal;
if (isa<UndefValue>(CondVal)) { // select undef, X, Y -> X or Y
- if (isa<Constant>(TrueVal))
- return TrueVal;
- return FalseVal;
+ if (isa<Constant>(FalseVal))
+ return FalseVal;
+ return TrueVal;
}
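// An undef condition may resolve either way, so either arm is a valid
// result; preferring a constant arm (e.g. folding
// 'select i1 undef, i32 %x, i32 4' to 4) gives later folds more to work with.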
if (isa<UndefValue>(TrueVal)) // select C, undef, X -> X
return FalseVal;
@@ -3732,18 +3763,14 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
}
Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifySelectInst(Cond, TrueVal, FalseVal,
- Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Q, RecursionLimit);
}
/// Given operands for an GetElementPtrInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
- const Query &Q, unsigned) {
+ const SimplifyQuery &Q, unsigned) {
// The type of the GEP pointer operand.
unsigned AS =
cast<PointerType>(Ops[0]->getType()->getScalarType())->getAddressSpace();
@@ -3757,6 +3784,8 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
Type *GEPTy = PointerType::get(LastType, AS);
if (VectorType *VT = dyn_cast<VectorType>(Ops[0]->getType()))
GEPTy = VectorType::get(GEPTy, VT->getNumElements());
+ else if (VectorType *VT = dyn_cast<VectorType>(Ops[1]->getType()))
+ GEPTy = VectorType::get(GEPTy, VT->getNumElements());
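// A scalar base pointer with a vector index also yields a vector of
// pointers, e.g. 'getelementptr i32, i32* %p, <2 x i64> %i' has type
// <2 x i32*>, hence the second check above.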
if (isa<UndefValue>(Ops[0]))
return UndefValue::get(GEPTy);
@@ -3842,27 +3871,25 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
}
// Check to see if this is constant foldable.
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- if (!isa<Constant>(Ops[i]))
- return nullptr;
+ if (!all_of(Ops, [](Value *V) { return isa<Constant>(V); }))
+ return nullptr;
- return ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ops[0]),
- Ops.slice(1));
+ auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ops[0]),
+ Ops.slice(1));
+ if (auto *CEFolded = ConstantFoldConstant(CE, Q.DL))
+ return CEFolded;
+ return CE;
}
Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyGEPInst(SrcTy, Ops,
- Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyGEPInst(SrcTy, Ops, Q, RecursionLimit);
}
/// Given operands for an InsertValueInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyInsertValueInst(Value *Agg, Value *Val,
- ArrayRef<unsigned> Idxs, const Query &Q,
+ ArrayRef<unsigned> Idxs, const SimplifyQuery &Q,
unsigned) {
if (Constant *CAgg = dyn_cast<Constant>(Agg))
if (Constant *CVal = dyn_cast<Constant>(Val))
@@ -3888,18 +3915,16 @@ static Value *SimplifyInsertValueInst(Value *Agg, Value *Val,
return nullptr;
}
-Value *llvm::SimplifyInsertValueInst(
- Value *Agg, Value *Val, ArrayRef<unsigned> Idxs, const DataLayout &DL,
- const TargetLibraryInfo *TLI, const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyInsertValueInst(Agg, Val, Idxs, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val,
+ ArrayRef<unsigned> Idxs,
+ const SimplifyQuery &Q) {
+ return ::SimplifyInsertValueInst(Agg, Val, Idxs, Q, RecursionLimit);
}
/// Given operands for an ExtractValueInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
- const Query &, unsigned) {
+ const SimplifyQuery &, unsigned) {
if (auto *CAgg = dyn_cast<Constant>(Agg))
return ConstantFoldExtractValueInstruction(CAgg, Idxs);
@@ -3922,18 +3947,13 @@ static Value *SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
}
Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT,
- AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyExtractValueInst(Agg, Idxs, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyExtractValueInst(Agg, Idxs, Q, RecursionLimit);
}
/// Given operands for an ExtractElementInst, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &,
+static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &,
unsigned) {
if (auto *CVec = dyn_cast<Constant>(Vec)) {
if (auto *CIdx = dyn_cast<Constant>(Idx))
@@ -3956,15 +3976,13 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const Query &,
return nullptr;
}
-Value *llvm::SimplifyExtractElementInst(
- Value *Vec, Value *Idx, const DataLayout &DL, const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC, const Instruction *CxtI) {
- return ::SimplifyExtractElementInst(Vec, Idx, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyExtractElementInst(Value *Vec, Value *Idx,
+ const SimplifyQuery &Q) {
+ return ::SimplifyExtractElementInst(Vec, Idx, Q, RecursionLimit);
}
/// See if we can fold the given phi. If not, returns null.
-static Value *SimplifyPHINode(PHINode *PN, const Query &Q) {
+static Value *SimplifyPHINode(PHINode *PN, const SimplifyQuery &Q) {
// If all of the PHI's incoming values are the same then replace the PHI node
// with the common value.
Value *CommonValue = nullptr;
@@ -3997,7 +4015,7 @@ static Value *SimplifyPHINode(PHINode *PN, const Query &Q) {
}
static Value *SimplifyCastInst(unsigned CastOpc, Value *Op,
- Type *Ty, const Query &Q, unsigned MaxRecurse) {
+ Type *Ty, const SimplifyQuery &Q, unsigned MaxRecurse) {
if (auto *C = dyn_cast<Constant>(Op))
return ConstantFoldCastOperand(CastOpc, C, Ty, Q.DL);
@@ -4031,12 +4049,141 @@ static Value *SimplifyCastInst(unsigned CastOpc, Value *Op,
}
Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyCastInst(CastOpc, Op, Ty, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyCastInst(CastOpc, Op, Ty, Q, RecursionLimit);
+}
+
+/// For the given destination element of a shuffle, peek through shuffles to
+/// match a root vector source operand that contains that element in the same
+/// vector lane (i.e., the same mask index), so we can eliminate the shuffle(s).
+static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
+ int MaskVal, Value *RootVec,
+ unsigned MaxRecurse) {
+ if (!MaxRecurse--)
+ return nullptr;
+
+ // Bail out if any mask value is undefined. That kind of shuffle may be
+ // simplified further based on demanded bits or other folds.
+ if (MaskVal == -1)
+ return nullptr;
+
+ // The mask value chooses which source operand we need to look at next.
+ int InVecNumElts = Op0->getType()->getVectorNumElements();
+ int RootElt = MaskVal;
+ Value *SourceOp = Op0;
+ if (MaskVal >= InVecNumElts) {
+ RootElt = MaskVal - InVecNumElts;
+ SourceOp = Op1;
+ }
+
+ // If the source operand is a shuffle itself, look through it to find the
+ // matching root vector.
+ if (auto *SourceShuf = dyn_cast<ShuffleVectorInst>(SourceOp)) {
+ return foldIdentityShuffles(
+ DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1),
+ SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse);
+ }
+
+ // TODO: Look through bitcasts? What if the bitcast changes the vector element
+ // size?
+
+ // The source operand is not a shuffle. Initialize the root vector value for
+ // this shuffle if that has not been done yet.
+ if (!RootVec)
+ RootVec = SourceOp;
+
+ // Give up as soon as a source operand does not match the existing root value.
+ if (RootVec != SourceOp)
+ return nullptr;
+
+ // The element must be coming from the same lane in the source vector
+ // (although it may have crossed lanes in intermediate shuffles).
+ if (RootElt != DestElt)
+ return nullptr;
+
+ return RootVec;
+}
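// Illustrative IR example: a lane-reversing shuffle followed by a second
// reversal maps every destination element back to the same lane of %v, so
// the chain simplifies to %v itself:
//
//   %rev  = shufflevector <4 x i32> %v, <4 x i32> undef,
//                         <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %back = shufflevector <4 x i32> %rev, <4 x i32> undef,
//                         <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//
// For each destination element i, the walk peeks through %rev and finds
// RootElt == i in %v, establishing %v as the common root vector.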
+
+static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
+ Type *RetTy, const SimplifyQuery &Q,
+ unsigned MaxRecurse) {
+ if (isa<UndefValue>(Mask))
+ return UndefValue::get(RetTy);
+
+ Type *InVecTy = Op0->getType();
+ unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
+ unsigned InVecNumElts = InVecTy->getVectorNumElements();
+
+ SmallVector<int, 32> Indices;
+ ShuffleVectorInst::getShuffleMask(Mask, Indices);
+ assert(MaskNumElts == Indices.size() &&
+ "Size of Indices not same as number of mask elements?");
+
+ // Canonicalization: If the mask does not select any elements from an input
+ // vector, replace that input vector with undef.
+ bool MaskSelects0 = false, MaskSelects1 = false;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ if (Indices[i] == -1)
+ continue;
+ if ((unsigned)Indices[i] < InVecNumElts)
+ MaskSelects0 = true;
+ else
+ MaskSelects1 = true;
+ }
+ if (!MaskSelects0)
+ Op0 = UndefValue::get(InVecTy);
+ if (!MaskSelects1)
+ Op1 = UndefValue::get(InVecTy);
+
+ auto *Op0Const = dyn_cast<Constant>(Op0);
+ auto *Op1Const = dyn_cast<Constant>(Op1);
+
+ // If all operands are constant, constant fold the shuffle.
+ if (Op0Const && Op1Const)
+ return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask);
+
+ // Canonicalization: if only one input vector is constant, make it the
+ // second one.
+ if (Op0Const && !Op1Const) {
+ std::swap(Op0, Op1);
+ ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts);
+ }
+
+ // A shuffle of a splat is always the splat itself. This fold is legal only
+ // if the shuffle's value type is the same as the input vectors' type.
+ if (auto *OpShuf = dyn_cast<ShuffleVectorInst>(Op0))
+ if (isa<UndefValue>(Op1) && RetTy == InVecTy &&
+ OpShuf->getMask()->getSplatValue())
+ return Op0;
+
+ // Don't fold a shuffle with undef mask elements. It may be folded better
+ // by demanded bits or other analyses.
+ // TODO: Should we allow this?
+ if (find(Indices, -1) != Indices.end())
+ return nullptr;
+
+ // Check if every element of this shuffle can be mapped back to the
+ // corresponding element of a single root vector. If so, we don't need this
+ // shuffle. This handles simple identity shuffles as well as chains of
+ // shuffles that may widen/narrow and/or move elements across lanes and back.
+ Value *RootVec = nullptr;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ // Note that recursion is limited for each vector element, so if any element
+ // exceeds the limit, this will fail to simplify.
+ RootVec =
+ foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse);
+
+ // We can't replace a widening/narrowing shuffle with one of its operands.
+ if (!RootVec || RootVec->getType() != RetTy)
+ return nullptr;
+ }
+ return RootVec;
+}
+
+/// Given operands for a ShuffleVectorInst, fold the result or return null.
+Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
+ Type *RetTy, const SimplifyQuery &Q) {
+ return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit);
}
//=== Helper functions for higher up the class hierarchy.
@@ -4044,64 +4191,46 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
/// Given operands for a BinaryOperator, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
switch (Opcode) {
case Instruction::Add:
- return SimplifyAddInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
- Q, MaxRecurse);
+ return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::FAdd:
return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-
case Instruction::Sub:
- return SimplifySubInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
- Q, MaxRecurse);
+ return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::FSub:
return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-
- case Instruction::Mul: return SimplifyMulInst (LHS, RHS, Q, MaxRecurse);
+ case Instruction::Mul:
+ return SimplifyMulInst(LHS, RHS, Q, MaxRecurse);
case Instruction::FMul:
- return SimplifyFMulInst (LHS, RHS, FastMathFlags(), Q, MaxRecurse);
- case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
- case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
+ return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+ case Instruction::SDiv:
+ return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::UDiv:
+ return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
case Instruction::FDiv:
- return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
- case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
- case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
+ return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+ case Instruction::SRem:
+ return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::URem:
+ return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
case Instruction::FRem:
- return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+ return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
case Instruction::Shl:
- return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
- Q, MaxRecurse);
+ return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse);
case Instruction::LShr:
- return SimplifyLShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
+ return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse);
case Instruction::AShr:
- return SimplifyAShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
- case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
- case Instruction::Or: return SimplifyOrInst (LHS, RHS, Q, MaxRecurse);
- case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
+ return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse);
+ case Instruction::And:
+ return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::Or:
+ return SimplifyOrInst(LHS, RHS, Q, MaxRecurse);
+ case Instruction::Xor:
+ return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
default:
- if (Constant *CLHS = dyn_cast<Constant>(LHS))
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
-
- // If the operation is associative, try some generic simplifications.
- if (Instruction::isAssociative(Opcode))
- if (Value *V = SimplifyAssociativeBinOp(Opcode, LHS, RHS, Q, MaxRecurse))
- return V;
-
- // If the operation is with the result of a select instruction check whether
- // operating on either branch of the select always yields the same value.
- if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
- if (Value *V = ThreadBinOpOverSelect(Opcode, LHS, RHS, Q, MaxRecurse))
- return V;
-
- // If the operation is with the result of a phi instruction, check whether
- // operating on all incoming values of the phi always yields the same value.
- if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
- if (Value *V = ThreadBinOpOverPHI(Opcode, LHS, RHS, Q, MaxRecurse))
- return V;
-
- return nullptr;
+ llvm_unreachable("Unexpected opcode");
}
}
@@ -4110,7 +4239,7 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
/// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the
/// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp.
static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- const FastMathFlags &FMF, const Query &Q,
+ const FastMathFlags &FMF, const SimplifyQuery &Q,
unsigned MaxRecurse) {
switch (Opcode) {
case Instruction::FAdd:
@@ -4127,36 +4256,26 @@ static Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyBinOp(Opcode, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyBinOp(Opcode, LHS, RHS, Q, RecursionLimit);
}
Value *llvm::SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- const FastMathFlags &FMF, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ FastMathFlags FMF, const SimplifyQuery &Q) {
+ return ::SimplifyFPBinOp(Opcode, LHS, RHS, FMF, Q, RecursionLimit);
}
/// Given operands for a CmpInst, see if we can fold the result.
static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
if (CmpInst::isIntPredicate((CmpInst::Predicate)Predicate))
return SimplifyICmpInst(Predicate, LHS, RHS, Q, MaxRecurse);
return SimplifyFCmpInst(Predicate, LHS, RHS, FastMathFlags(), Q, MaxRecurse);
}
Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyCmpInst(Predicate, LHS, RHS, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+ const SimplifyQuery &Q) {
+ return ::SimplifyCmpInst(Predicate, LHS, RHS, Q, RecursionLimit);
}
static bool IsIdempotent(Intrinsic::ID ID) {
@@ -4249,7 +4368,7 @@ static bool maskIsAllZeroOrUndef(Value *Mask) {
template <typename IterTy>
static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
- const Query &Q, unsigned MaxRecurse) {
+ const SimplifyQuery &Q, unsigned MaxRecurse) {
Intrinsic::ID IID = F->getIntrinsicID();
unsigned NumOperands = std::distance(ArgBegin, ArgEnd);
@@ -4267,6 +4386,7 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
case Intrinsic::fabs: {
if (SignBitMustBeZero(*ArgBegin, Q.TLI))
return *ArgBegin;
+ return nullptr;
}
default:
return nullptr;
@@ -4296,19 +4416,21 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow: {
// X + undef -> undef
- if (isa<UndefValue>(RHS))
+ if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
return UndefValue::get(ReturnType);
return nullptr;
}
case Intrinsic::umul_with_overflow:
case Intrinsic::smul_with_overflow: {
+ // 0 * X -> { 0, false }
// X * 0 -> { 0, false }
- if (match(RHS, m_Zero()))
+ if (match(LHS, m_Zero()) || match(RHS, m_Zero()))
return Constant::getNullValue(ReturnType);
+ // undef * X -> { 0, false }
// X * undef -> { 0, false }
- if (match(RHS, m_Undef()))
+ if (match(LHS, m_Undef()) || match(RHS, m_Undef()))
return Constant::getNullValue(ReturnType);
return nullptr;
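// Multiplication commutes, so a zero or undef on either side suffices; e.g.
// 'call { i8, i1 } @llvm.umul.with.overflow.i8(i8 0, i8 %x)' folds to the
// zero aggregate '{ i8 0, i1 false }'.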
@@ -4341,8 +4463,9 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd,
}
template <typename IterTy>
-static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd,
- const Query &Q, unsigned MaxRecurse) {
+static Value *SimplifyCall(ImmutableCallSite CS, Value *V, IterTy ArgBegin,
+ IterTy ArgEnd, const SimplifyQuery &Q,
+ unsigned MaxRecurse) {
Type *Ty = V->getType();
if (PointerType *PTy = dyn_cast<PointerType>(Ty))
Ty = PTy->getElementType();
@@ -4361,7 +4484,7 @@ static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd,
if (Value *Ret = SimplifyIntrinsic(F, ArgBegin, ArgEnd, Q, MaxRecurse))
return Ret;
- if (!canConstantFoldCallTo(F))
+ if (!canConstantFoldCallTo(CS, F))
return nullptr;
SmallVector<Constant *, 4> ConstantArgs;
@@ -4373,181 +4496,170 @@ static Value *SimplifyCall(Value *V, IterTy ArgBegin, IterTy ArgEnd,
ConstantArgs.push_back(C);
}
- return ConstantFoldCall(F, ConstantArgs, Q.TLI);
+ return ConstantFoldCall(CS, F, ConstantArgs, Q.TLI);
}
-Value *llvm::SimplifyCall(Value *V, User::op_iterator ArgBegin,
- User::op_iterator ArgEnd, const DataLayout &DL,
- const TargetLibraryInfo *TLI, const DominatorTree *DT,
- AssumptionCache *AC, const Instruction *CxtI) {
- return ::SimplifyCall(V, ArgBegin, ArgEnd, Query(DL, TLI, DT, AC, CxtI),
- RecursionLimit);
+Value *llvm::SimplifyCall(ImmutableCallSite CS, Value *V,
+ User::op_iterator ArgBegin, User::op_iterator ArgEnd,
+ const SimplifyQuery &Q) {
+ return ::SimplifyCall(CS, V, ArgBegin, ArgEnd, Q, RecursionLimit);
}
-Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args,
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC,
- const Instruction *CxtI) {
- return ::SimplifyCall(V, Args.begin(), Args.end(),
- Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
+Value *llvm::SimplifyCall(ImmutableCallSite CS, Value *V,
+ ArrayRef<Value *> Args, const SimplifyQuery &Q) {
+ return ::SimplifyCall(CS, V, Args.begin(), Args.end(), Q, RecursionLimit);
}
/// See if we can compute a simplified version of this instruction.
/// If not, this returns null.
-Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- const DominatorTree *DT, AssumptionCache *AC) {
+
+Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
+ OptimizationRemarkEmitter *ORE) {
+ const SimplifyQuery Q = SQ.CxtI ? SQ : SQ.getWithInstruction(I);
Value *Result;
switch (I->getOpcode()) {
default:
- Result = ConstantFoldInstruction(I, DL, TLI);
+ Result = ConstantFoldInstruction(I, Q.DL, Q.TLI);
break;
case Instruction::FAdd:
Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1),
- I->getFastMathFlags(), DL, TLI, DT, AC, I);
+ I->getFastMathFlags(), Q);
break;
case Instruction::Add:
Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1),
cast<BinaryOperator>(I)->hasNoSignedWrap(),
- cast<BinaryOperator>(I)->hasNoUnsignedWrap(), DL,
- TLI, DT, AC, I);
+ cast<BinaryOperator>(I)->hasNoUnsignedWrap(), Q);
break;
case Instruction::FSub:
Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1),
- I->getFastMathFlags(), DL, TLI, DT, AC, I);
+ I->getFastMathFlags(), Q);
break;
case Instruction::Sub:
Result = SimplifySubInst(I->getOperand(0), I->getOperand(1),
cast<BinaryOperator>(I)->hasNoSignedWrap(),
- cast<BinaryOperator>(I)->hasNoUnsignedWrap(), DL,
- TLI, DT, AC, I);
+ cast<BinaryOperator>(I)->hasNoUnsignedWrap(), Q);
break;
case Instruction::FMul:
Result = SimplifyFMulInst(I->getOperand(0), I->getOperand(1),
- I->getFastMathFlags(), DL, TLI, DT, AC, I);
+ I->getFastMathFlags(), Q);
break;
case Instruction::Mul:
- Result =
- SimplifyMulInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
+ Result = SimplifyMulInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::SDiv:
- Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
- AC, I);
+ Result = SimplifySDivInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::UDiv:
- Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
- AC, I);
+ Result = SimplifyUDivInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::FDiv:
Result = SimplifyFDivInst(I->getOperand(0), I->getOperand(1),
- I->getFastMathFlags(), DL, TLI, DT, AC, I);
+ I->getFastMathFlags(), Q);
break;
case Instruction::SRem:
- Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
- AC, I);
+ Result = SimplifySRemInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::URem:
- Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT,
- AC, I);
+ Result = SimplifyURemInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::FRem:
Result = SimplifyFRemInst(I->getOperand(0), I->getOperand(1),
- I->getFastMathFlags(), DL, TLI, DT, AC, I);
+ I->getFastMathFlags(), Q);
break;
case Instruction::Shl:
Result = SimplifyShlInst(I->getOperand(0), I->getOperand(1),
cast<BinaryOperator>(I)->hasNoSignedWrap(),
- cast<BinaryOperator>(I)->hasNoUnsignedWrap(), DL,
- TLI, DT, AC, I);
+ cast<BinaryOperator>(I)->hasNoUnsignedWrap(), Q);
break;
case Instruction::LShr:
Result = SimplifyLShrInst(I->getOperand(0), I->getOperand(1),
- cast<BinaryOperator>(I)->isExact(), DL, TLI, DT,
- AC, I);
+ cast<BinaryOperator>(I)->isExact(), Q);
break;
case Instruction::AShr:
Result = SimplifyAShrInst(I->getOperand(0), I->getOperand(1),
- cast<BinaryOperator>(I)->isExact(), DL, TLI, DT,
- AC, I);
+ cast<BinaryOperator>(I)->isExact(), Q);
break;
case Instruction::And:
- Result =
- SimplifyAndInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
+ Result = SimplifyAndInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::Or:
- Result =
- SimplifyOrInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
+ Result = SimplifyOrInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::Xor:
- Result =
- SimplifyXorInst(I->getOperand(0), I->getOperand(1), DL, TLI, DT, AC, I);
+ Result = SimplifyXorInst(I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::ICmp:
- Result =
- SimplifyICmpInst(cast<ICmpInst>(I)->getPredicate(), I->getOperand(0),
- I->getOperand(1), DL, TLI, DT, AC, I);
+ Result = SimplifyICmpInst(cast<ICmpInst>(I)->getPredicate(),
+ I->getOperand(0), I->getOperand(1), Q);
break;
case Instruction::FCmp:
- Result = SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(),
- I->getOperand(0), I->getOperand(1),
- I->getFastMathFlags(), DL, TLI, DT, AC, I);
+ Result =
+ SimplifyFCmpInst(cast<FCmpInst>(I)->getPredicate(), I->getOperand(0),
+ I->getOperand(1), I->getFastMathFlags(), Q);
break;
case Instruction::Select:
Result = SimplifySelectInst(I->getOperand(0), I->getOperand(1),
- I->getOperand(2), DL, TLI, DT, AC, I);
+ I->getOperand(2), Q);
break;
case Instruction::GetElementPtr: {
- SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
+ SmallVector<Value *, 8> Ops(I->op_begin(), I->op_end());
Result = SimplifyGEPInst(cast<GetElementPtrInst>(I)->getSourceElementType(),
- Ops, DL, TLI, DT, AC, I);
+ Ops, Q);
break;
}
case Instruction::InsertValue: {
InsertValueInst *IV = cast<InsertValueInst>(I);
Result = SimplifyInsertValueInst(IV->getAggregateOperand(),
IV->getInsertedValueOperand(),
- IV->getIndices(), DL, TLI, DT, AC, I);
+ IV->getIndices(), Q);
break;
}
case Instruction::ExtractValue: {
auto *EVI = cast<ExtractValueInst>(I);
Result = SimplifyExtractValueInst(EVI->getAggregateOperand(),
- EVI->getIndices(), DL, TLI, DT, AC, I);
+ EVI->getIndices(), Q);
break;
}
case Instruction::ExtractElement: {
auto *EEI = cast<ExtractElementInst>(I);
- Result = SimplifyExtractElementInst(
- EEI->getVectorOperand(), EEI->getIndexOperand(), DL, TLI, DT, AC, I);
+ Result = SimplifyExtractElementInst(EEI->getVectorOperand(),
+ EEI->getIndexOperand(), Q);
+ break;
+ }
+ case Instruction::ShuffleVector: {
+ auto *SVI = cast<ShuffleVectorInst>(I);
+ Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
+ SVI->getMask(), SVI->getType(), Q);
break;
}
case Instruction::PHI:
- Result = SimplifyPHINode(cast<PHINode>(I), Query(DL, TLI, DT, AC, I));
+ Result = SimplifyPHINode(cast<PHINode>(I), Q);
break;
case Instruction::Call: {
CallSite CS(cast<CallInst>(I));
- Result = SimplifyCall(CS.getCalledValue(), CS.arg_begin(), CS.arg_end(), DL,
- TLI, DT, AC, I);
+ Result = SimplifyCall(CS, CS.getCalledValue(), CS.arg_begin(), CS.arg_end(),
+ Q);
break;
}
#define HANDLE_CAST_INST(num, opc, clas) case Instruction::opc:
#include "llvm/IR/Instruction.def"
#undef HANDLE_CAST_INST
- Result = SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(),
- DL, TLI, DT, AC, I);
+ Result =
+ SimplifyCastInst(I->getOpcode(), I->getOperand(0), I->getType(), Q);
+ break;
+ case Instruction::Alloca:
+ // Alloca has no simplifications, and it cannot be constant folded.
+ Result = nullptr;
break;
}
// In general, it is possible for computeKnownBits to determine all bits in a
// value even when the operands are not all constants.
if (!Result && I->getType()->isIntOrIntVectorTy()) {
- unsigned BitWidth = I->getType()->getScalarSizeInBits();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT);
- if ((KnownZero | KnownOne).isAllOnesValue())
- Result = ConstantInt::get(I->getType(), KnownOne);
+ KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE);
+ if (Known.isConstant())
+ Result = ConstantInt::get(I->getType(), Known.getConstant());
}
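// For example, in 'and i8 (or i8 %x, 7), 7' every result bit is known: the
// 'or' forces the low three bits to one and the 'and' clears the rest, so
// computeKnownBits reports the constant 7 even though %x is unknown.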
/// If called on unreachable code, the above logic may report that the
@@ -4599,7 +4711,7 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
I = Worklist[Idx];
// See if this instruction simplifies.
- SimpleV = SimplifyInstruction(I, DL, TLI, DT, AC);
+ SimpleV = SimplifyInstruction(I, {DL, TLI, DT, AC});
if (!SimpleV)
continue;
@@ -4638,3 +4750,31 @@ bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
assert(SimpleV && "Must provide a simplified value.");
return replaceAndRecursivelySimplifyImpl(I, SimpleV, TLI, DT, AC);
}
+
+namespace llvm {
+const SimplifyQuery getBestSimplifyQuery(Pass &P, Function &F) {
+ auto *DTWP = P.getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *TLIWP = P.getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIWP ? &TLIWP->getTLI() : nullptr;
+ auto *ACWP = P.getAnalysisIfAvailable<AssumptionCacheTracker>();
+ auto *AC = ACWP ? &ACWP->getAssumptionCache(F) : nullptr;
+ return {F.getParent()->getDataLayout(), TLI, DT, AC};
+}
+
+const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &AR,
+ const DataLayout &DL) {
+ return {DL, &AR.TLI, &AR.DT, &AR.AC};
+}
+
+template <class T, class... TArgs>
+const SimplifyQuery getBestSimplifyQuery(AnalysisManager<T, TArgs...> &AM,
+ Function &F) {
+ auto *DT = AM.template getCachedResult<DominatorTreeAnalysis>(F);
+ auto *TLI = AM.template getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AC = AM.template getCachedResult<AssumptionAnalysis>(F);
+ return {F.getParent()->getDataLayout(), TLI, DT, AC};
+}
+template const SimplifyQuery getBestSimplifyQuery(AnalysisManager<Function> &,
+ Function &);
+}
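// Illustrative usage sketch (editorial example; 'MyLegacyPass' is a
// hypothetical client): a legacy pass can fetch whichever analyses happen to
// be available and feed the resulting query to InstSimplify:
//
//   bool MyLegacyPass::runOnFunction(Function &F) {
//     const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
//     bool Changed = false;
//     for (Instruction &I : instructions(F))  // llvm/IR/InstIterator.h
//       if (Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I))) {
//         I.replaceAllUsesWith(V);
//         Changed = true;
//       }
//     return Changed;
//   }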
diff --git a/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp b/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp
index d1374acd..3992657 100644
--- a/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp
+++ b/contrib/llvm/lib/Analysis/IteratedDominanceFrontier.cpp
@@ -17,17 +17,9 @@
#include <queue>
namespace llvm {
-template <class NodeTy>
-void IDFCalculator<NodeTy>::calculate(
+template <class NodeTy, bool IsPostDom>
+void IDFCalculator<NodeTy, IsPostDom>::calculate(
SmallVectorImpl<BasicBlock *> &PHIBlocks) {
- // If we haven't computed dominator tree levels, do so now.
- if (DomLevels.empty()) {
- for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
- DFI != DFE; ++DFI) {
- DomLevels[*DFI] = DFI.getPathLength() - 1;
- }
- }
-
// Use a priority queue keyed on dominator tree level so that inserted nodes
// are handled from the bottom of the dominator tree upwards.
typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair;
@@ -37,7 +29,7 @@ void IDFCalculator<NodeTy>::calculate(
for (BasicBlock *BB : *DefBlocks) {
if (DomTreeNode *Node = DT.getNode(BB))
- PQ.push(std::make_pair(Node, DomLevels.lookup(Node)));
+ PQ.push({Node, Node->getLevel()});
}
SmallVector<DomTreeNode *, 32> Worklist;
@@ -64,10 +56,7 @@ void IDFCalculator<NodeTy>::calculate(
BasicBlock *BB = Node->getBlock();
// Succ is the successor in the direction we are calculating IDF, so it is
// the successor for IDF and the predecessor for Reverse IDF.
- for (auto SuccIter = GraphTraits<NodeTy>::child_begin(BB),
- End = GraphTraits<NodeTy>::child_end(BB);
- SuccIter != End; ++SuccIter) {
- BasicBlock *Succ = *SuccIter;
+ for (auto *Succ : children<NodeTy>(BB)) {
DomTreeNode *SuccNode = DT.getNode(Succ);
// Quickly skip all CFG edges that are also dominator tree edges instead
@@ -75,7 +64,7 @@ void IDFCalculator<NodeTy>::calculate(
if (SuccNode->getIDom() == Node)
continue;
- unsigned SuccLevel = DomLevels.lookup(SuccNode);
+ const unsigned SuccLevel = SuccNode->getLevel();
if (SuccLevel > RootLevel)
continue;
@@ -99,6 +88,6 @@ void IDFCalculator<NodeTy>::calculate(
}
}
-template class IDFCalculator<BasicBlock *>;
-template class IDFCalculator<Inverse<BasicBlock *>>;
+template class IDFCalculator<BasicBlock *, false>;
+template class IDFCalculator<Inverse<BasicBlock *>, true>;
}
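// Illustrative usage sketch (editorial example; 'DefBlocks' is an assumed
// SmallPtrSet of defining blocks): with levels now read from
// DomTreeNode::getLevel(), a client only needs the dominator tree itself:
//
//   DominatorTree DT(F);
//   IDFCalculator<BasicBlock *, false> IDF(DT);  // forward IDF
//   IDF.setDefiningBlocks(DefBlocks);
//   SmallVector<BasicBlock *, 32> PHIBlocks;
//   IDF.calculate(PHIBlocks);                    // blocks needing PHI nodes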
diff --git a/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp b/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp
index 596b6fc..a8178ec 100644
--- a/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LazyBlockFrequencyInfo.cpp
@@ -9,7 +9,7 @@
//
// This is an alternative analysis pass to BlockFrequencyInfoWrapperPass. The
// difference is that with this pass the block frequencies are not computed when
-// the analysis pass is executed but rather when the BFI results is explicitly
+// the analysis pass is executed but rather when the BFI result is explicitly
// requested by the analysis client.
//
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp b/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp
index b51c6be..e2884d0 100644
--- a/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LazyBranchProbabilityInfo.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/LazyBranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
using namespace llvm;
@@ -24,6 +25,7 @@ using namespace llvm;
INITIALIZE_PASS_BEGIN(LazyBranchProbabilityInfoPass, DEBUG_TYPE,
"Lazy Branch Probability Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(LazyBranchProbabilityInfoPass, DEBUG_TYPE,
"Lazy Branch Probability Analysis", true, true)
@@ -41,6 +43,7 @@ void LazyBranchProbabilityInfoPass::print(raw_ostream &OS,
void LazyBranchProbabilityInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
}
@@ -48,16 +51,19 @@ void LazyBranchProbabilityInfoPass::releaseMemory() { LBPI.reset(); }
bool LazyBranchProbabilityInfoPass::runOnFunction(Function &F) {
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LBPI = llvm::make_unique<LazyBranchProbabilityInfo>(&F, &LI);
+ TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ LBPI = llvm::make_unique<LazyBranchProbabilityInfo>(&F, &LI, &TLI);
return false;
}
void LazyBranchProbabilityInfoPass::getLazyBPIAnalysisUsage(AnalysisUsage &AU) {
AU.addRequired<LazyBranchProbabilityInfoPass>();
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
void llvm::initializeLazyBPIPassPass(PassRegistry &Registry) {
INITIALIZE_PASS_DEPENDENCY(LazyBranchProbabilityInfoPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
+ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass);
}
diff --git a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp
index f7cf8c6..d287f81 100644
--- a/contrib/llvm/lib/Analysis/LazyCallGraph.cpp
+++ b/contrib/llvm/lib/Analysis/LazyCallGraph.cpp
@@ -8,36 +8,59 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "lcg"
+void LazyCallGraph::EdgeSequence::insertEdgeInternal(Node &TargetN,
+ Edge::Kind EK) {
+ EdgeIndexMap.insert({&TargetN, Edges.size()});
+ Edges.emplace_back(TargetN, EK);
+}
+
+void LazyCallGraph::EdgeSequence::setEdgeKind(Node &TargetN, Edge::Kind EK) {
+ Edges[EdgeIndexMap.find(&TargetN)->second].setKind(EK);
+}
+
+bool LazyCallGraph::EdgeSequence::removeEdgeInternal(Node &TargetN) {
+ auto IndexMapI = EdgeIndexMap.find(&TargetN);
+ if (IndexMapI == EdgeIndexMap.end())
+ return false;
+
+ Edges[IndexMapI->second] = Edge();
+ EdgeIndexMap.erase(IndexMapI);
+ return true;
+}
+
static void addEdge(SmallVectorImpl<LazyCallGraph::Edge> &Edges,
- DenseMap<Function *, int> &EdgeIndexMap, Function &F,
- LazyCallGraph::Edge::Kind EK) {
- if (!EdgeIndexMap.insert({&F, Edges.size()}).second)
+ DenseMap<LazyCallGraph::Node *, int> &EdgeIndexMap,
+ LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK) {
+ if (!EdgeIndexMap.insert({&N, Edges.size()}).second)
return;
- DEBUG(dbgs() << " Added callable function: " << F.getName() << "\n");
- Edges.emplace_back(LazyCallGraph::Edge(F, EK));
+ DEBUG(dbgs() << " Added callable function: " << N.getName() << "\n");
+ Edges.emplace_back(LazyCallGraph::Edge(N, EK));
}
-LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
- : G(&G), F(F), DFSNumber(0), LowLink(0) {
- DEBUG(dbgs() << " Adding functions called by '" << F.getName()
+LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() {
+ assert(!Edges && "Must not have already populated the edges for this node!");
+
+ DEBUG(dbgs() << " Adding functions called by '" << getName()
<< "' to the graph.\n");
+ Edges = EdgeSequence();
+
SmallVector<Constant *, 16> Worklist;
SmallPtrSet<Function *, 4> Callees;
SmallPtrSet<Constant *, 16> Visited;
@@ -58,14 +81,15 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
// alias. Then a test of the address of the weak function against the new
// strong definition's address would be an effective way to determine the
// safety of optimizing a direct call edge.
- for (BasicBlock &BB : F)
+ for (BasicBlock &BB : *F)
for (Instruction &I : BB) {
if (auto CS = CallSite(&I))
if (Function *Callee = CS.getCalledFunction())
if (!Callee->isDeclaration())
if (Callees.insert(Callee).second) {
Visited.insert(Callee);
- addEdge(Edges, EdgeIndexMap, *Callee, LazyCallGraph::Edge::Call);
+ addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(*Callee),
+ LazyCallGraph::Edge::Call);
}
for (Value *Op : I.operand_values())
@@ -78,50 +102,59 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
// function containing) operands to all of the instructions in the function.
// Process them (recursively) collecting every function found.
visitReferences(Worklist, Visited, [&](Function &F) {
- addEdge(Edges, EdgeIndexMap, F, LazyCallGraph::Edge::Ref);
+ addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(F),
+ LazyCallGraph::Edge::Ref);
});
-}
-void LazyCallGraph::Node::insertEdgeInternal(Function &Target, Edge::Kind EK) {
- if (Node *N = G->lookup(Target))
- return insertEdgeInternal(*N, EK);
+ // Add implicit reference edges to any defined libcall functions (if we
+ // haven't found an explicit edge).
+ for (auto *F : G->LibFunctions)
+ if (!Visited.count(F))
+ addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(*F),
+ LazyCallGraph::Edge::Ref);
- EdgeIndexMap.insert({&Target, Edges.size()});
- Edges.emplace_back(Target, EK);
+ return *Edges;
}
-void LazyCallGraph::Node::insertEdgeInternal(Node &TargetN, Edge::Kind EK) {
- EdgeIndexMap.insert({&TargetN.getFunction(), Edges.size()});
- Edges.emplace_back(TargetN, EK);
+void LazyCallGraph::Node::replaceFunction(Function &NewF) {
+ assert(F != &NewF && "Must not replace a function with itself!");
+ F = &NewF;
}
-void LazyCallGraph::Node::setEdgeKind(Function &TargetF, Edge::Kind EK) {
- Edges[EdgeIndexMap.find(&TargetF)->second].setKind(EK);
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LazyCallGraph::Node::dump() const {
+ dbgs() << *this << '\n';
}
+#endif
-void LazyCallGraph::Node::removeEdgeInternal(Function &Target) {
- auto IndexMapI = EdgeIndexMap.find(&Target);
- assert(IndexMapI != EdgeIndexMap.end() &&
- "Target not in the edge set for this caller?");
+static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) {
+ LibFunc LF;
- Edges[IndexMapI->second] = Edge();
- EdgeIndexMap.erase(IndexMapI);
+ // Either this is a normal library function or a "vectorizable" function.
+ return TLI.getLibFunc(F, LF) || TLI.isFunctionVectorizable(F.getName());
}
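// A small assumed-usage sketch of the TLI queries above: checking whether a
// module-level definition would be treated as a library function and thus get
// synthesized reference edges. 'shadowsLibFunction' is hypothetical; the TLI
// calls mirror isKnownLibFunction.
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static bool shadowsLibFunction(Function &F, TargetLibraryInfo &TLI) {
  LibFunc LF;
  // getLibFunc matches by name and prototype; vectorizable functions are
  // recognized separately, exactly as in isKnownLibFunction above.
  return !F.isDeclaration() &&
         (TLI.getLibFunc(F, LF) || TLI.isFunctionVectorizable(F.getName()));
}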
-void LazyCallGraph::Node::dump() const {
- dbgs() << *this << '\n';
-}
-
-LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
+LazyCallGraph::LazyCallGraph(Module &M, TargetLibraryInfo &TLI) {
DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier()
<< "\n");
- for (Function &F : M)
- if (!F.isDeclaration() && !F.hasLocalLinkage())
- if (EntryIndexMap.insert({&F, EntryEdges.size()}).second) {
- DEBUG(dbgs() << " Adding '" << F.getName()
- << "' to entry set of the graph.\n");
- EntryEdges.emplace_back(F, Edge::Ref);
- }
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ // If this function is known to LLVM as a library function then we want to
+ // synthesize reference edges to it to model the fact that LLVM can turn
+ // arbitrary code into a library function call.
+ if (isKnownLibFunction(F, TLI))
+ LibFunctions.insert(&F);
+
+ if (F.hasLocalLinkage())
+ continue;
+
+ // Functions defined with external linkage may have edges to them from
+ // other modules, so they act as entry points into the graph.
+ DEBUG(dbgs() << " Adding '" << F.getName()
+ << "' to entry set of the graph.\n");
+ addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref);
+ }
// Now add entry nodes for functions reachable via initializers to globals.
SmallVector<Constant *, 16> Worklist;
@@ -134,21 +167,16 @@ LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
DEBUG(dbgs() << " Adding functions referenced by global initializers to the "
"entry set.\n");
visitReferences(Worklist, Visited, [&](Function &F) {
- addEdge(EntryEdges, EntryIndexMap, F, LazyCallGraph::Edge::Ref);
+ addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F),
+ LazyCallGraph::Edge::Ref);
});
-
- for (const Edge &E : EntryEdges)
- RefSCCEntryNodes.push_back(&E.getFunction());
}
LazyCallGraph::LazyCallGraph(LazyCallGraph &&G)
: BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)),
- EntryEdges(std::move(G.EntryEdges)),
- EntryIndexMap(std::move(G.EntryIndexMap)), SCCBPA(std::move(G.SCCBPA)),
+ EntryEdges(std::move(G.EntryEdges)), SCCBPA(std::move(G.SCCBPA)),
SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)),
- DFSStack(std::move(G.DFSStack)),
- RefSCCEntryNodes(std::move(G.RefSCCEntryNodes)),
- NextDFSNumber(G.NextDFSNumber) {
+ LibFunctions(std::move(G.LibFunctions)) {
updateGraphPtrs();
}
@@ -156,20 +184,19 @@ LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) {
BPA = std::move(G.BPA);
NodeMap = std::move(G.NodeMap);
EntryEdges = std::move(G.EntryEdges);
- EntryIndexMap = std::move(G.EntryIndexMap);
SCCBPA = std::move(G.SCCBPA);
SCCMap = std::move(G.SCCMap);
LeafRefSCCs = std::move(G.LeafRefSCCs);
- DFSStack = std::move(G.DFSStack);
- RefSCCEntryNodes = std::move(G.RefSCCEntryNodes);
- NextDFSNumber = G.NextDFSNumber;
+ LibFunctions = std::move(G.LibFunctions);
updateGraphPtrs();
return *this;
}
-void LazyCallGraph::SCC::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LazyCallGraph::SCC::dump() const {
dbgs() << *this << '\n';
}
+#endif
#ifndef NDEBUG
void LazyCallGraph::SCC::verify() {
@@ -184,8 +211,8 @@ void LazyCallGraph::SCC::verify() {
"Must set DFS numbers to -1 when adding a node to an SCC!");
assert(N->LowLink == -1 &&
"Must set low link to -1 when adding a node to an SCC!");
- for (Edge &E : *N)
- assert(E.getNode() && "Can't have an edge to a raw function!");
+ for (Edge &E : **N)
+ assert(E.getNode() && "Can't have an unpopulated node!");
}
}
#endif
@@ -195,10 +222,9 @@ bool LazyCallGraph::SCC::isParentOf(const SCC &C) const {
return false;
for (Node &N : *this)
- for (Edge &E : N.calls())
- if (Node *CalleeN = E.getNode())
- if (OuterRefSCC->G->lookupSCC(*CalleeN) == &C)
- return true;
+ for (Edge &E : N->calls())
+ if (OuterRefSCC->G->lookupSCC(E.getNode()) == &C)
+ return true;
// No edges found.
return false;
@@ -218,11 +244,8 @@ bool LazyCallGraph::SCC::isAncestorOf(const SCC &TargetC) const {
do {
const SCC &C = *Worklist.pop_back_val();
for (Node &N : C)
- for (Edge &E : N.calls()) {
- Node *CalleeN = E.getNode();
- if (!CalleeN)
- continue;
- SCC *CalleeC = G.lookupSCC(*CalleeN);
+ for (Edge &E : N->calls()) {
+ SCC *CalleeC = G.lookupSCC(E.getNode());
if (!CalleeC)
continue;
@@ -243,9 +266,11 @@ bool LazyCallGraph::SCC::isAncestorOf(const SCC &TargetC) const {
LazyCallGraph::RefSCC::RefSCC(LazyCallGraph &G) : G(&G) {}
-void LazyCallGraph::RefSCC::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LazyCallGraph::RefSCC::dump() const {
dbgs() << *this << '\n';
}
+#endif
#ifndef NDEBUG
void LazyCallGraph::RefSCC::verify() {
@@ -279,10 +304,10 @@ void LazyCallGraph::RefSCC::verify() {
for (int i = 0, Size = SCCs.size(); i < Size; ++i) {
SCC &SourceSCC = *SCCs[i];
for (Node &N : SourceSCC)
- for (Edge &E : N) {
+ for (Edge &E : *N) {
if (!E.isCall())
continue;
- SCC &TargetSCC = *G->lookupSCC(*E.getNode());
+ SCC &TargetSCC = *G->lookupSCC(E.getNode());
if (&TargetSCC.getOuterRefSCC() == this) {
assert(SCCIndices.find(&TargetSCC)->second <= i &&
"Edge between SCCs violates post-order relationship.");
@@ -299,8 +324,8 @@ void LazyCallGraph::RefSCC::verify() {
auto HasConnectingEdge = [&] {
for (SCC &C : *ParentRC)
for (Node &N : C)
- for (Edge &E : N)
- if (G->lookupRefSCC(*E.getNode()) == this)
+ for (Edge &E : *N)
+ if (G->lookupRefSCC(E.getNode()) == this)
return true;
return false;
};
@@ -459,9 +484,11 @@ updatePostorderSequenceForEdgeInsertion(
return make_range(SCCs.begin() + SourceIdx, SCCs.begin() + TargetIdx);
}
-SmallVector<LazyCallGraph::SCC *, 1>
-LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
- assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+bool
+LazyCallGraph::RefSCC::switchInternalEdgeToCall(
+ Node &SourceN, Node &TargetN,
+ function_ref<void(ArrayRef<SCC *> MergeSCCs)> MergeCB) {
+ assert(!(*SourceN)[TargetN].isCall() && "Must start with a ref edge!");
SmallVector<SCC *, 1> DeletedSCCs;
#ifndef NDEBUG
@@ -477,8 +504,8 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
// If the two nodes are already part of the same SCC, we're also done as
// we've just added more connectivity.
if (&SourceSCC == &TargetSCC) {
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
- return DeletedSCCs;
+ SourceN->setEdgeKind(TargetN, Edge::Call);
+ return false; // No new cycle.
}
// At this point we leverage the postorder list of SCCs to detect when the
@@ -490,8 +517,8 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
int SourceIdx = SCCIndices[&SourceSCC];
int TargetIdx = SCCIndices[&TargetSCC];
if (TargetIdx < SourceIdx) {
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
- return DeletedSCCs;
+ SourceN->setEdgeKind(TargetN, Edge::Call);
+ return false; // No new cycle.
}
// Compute the SCCs which (transitively) reach the source.
@@ -504,11 +531,9 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
ConnectedSet.insert(&SourceSCC);
auto IsConnected = [&](SCC &C) {
for (Node &N : C)
- for (Edge &E : N.calls()) {
- assert(E.getNode() && "Must have formed a node within an SCC!");
- if (ConnectedSet.count(G->lookupSCC(*E.getNode())))
+ for (Edge &E : N->calls())
+ if (ConnectedSet.count(G->lookupSCC(E.getNode())))
return true;
- }
return false;
};
@@ -535,11 +560,10 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
do {
SCC &C = *Worklist.pop_back_val();
for (Node &N : C)
- for (Edge &E : N) {
- assert(E.getNode() && "Must have formed a node within an SCC!");
+ for (Edge &E : *N) {
if (!E.isCall())
continue;
- SCC &EdgeC = *G->lookupSCC(*E.getNode());
+ SCC &EdgeC = *G->lookupSCC(E.getNode());
if (&EdgeC.getOuterRefSCC() != this)
// Not in this RefSCC...
continue;
@@ -561,12 +585,16 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
SourceSCC, TargetSCC, SCCs, SCCIndices, ComputeSourceConnectedSet,
ComputeTargetConnectedSet);
+ // Run the user's callback on the merged SCCs before we actually merge them.
+ if (MergeCB)
+ MergeCB(makeArrayRef(MergeRange.begin(), MergeRange.end()));
+
// If the merge range is empty, then adding the edge didn't actually form any
// new cycles. We're done.
if (MergeRange.begin() == MergeRange.end()) {
// Now that the SCC structure is finalized, flip the kind to call.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
- return DeletedSCCs;
+ SourceN->setEdgeKind(TargetN, Edge::Call);
+ return false; // No new cycle.
}
#ifndef NDEBUG
@@ -600,15 +628,15 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
SCCIndices[C] -= IndexOffset;
// Now that the SCC structure is finalized, flip the kind to call.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+ SourceN->setEdgeKind(TargetN, Edge::Call);
- // And we're done!
- return DeletedSCCs;
+ // And we're done, but we did form a new cycle.
+ return true;
}
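// Hedged caller-side sketch of the new callback-based API above; the
// surrounding objects ('RC', the nodes, and InvalidateSCC) are assumed to
// exist and are not part of the patch.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LazyCallGraph.h"

static void exampleSwitchToCall(
    llvm::LazyCallGraph::RefSCC &RC, llvm::LazyCallGraph::Node &SourceN,
    llvm::LazyCallGraph::Node &TargetN,
    llvm::function_ref<void(llvm::LazyCallGraph::SCC &)> InvalidateSCC) {
  bool FormedCycle = RC.switchInternalEdgeToCall(
      SourceN, TargetN,
      [&](llvm::ArrayRef<llvm::LazyCallGraph::SCC *> MergedSCCs) {
        // The callback fires before the merge mutates the graph, so the SCC
        // pointers are still valid while the caller's bookkeeping runs.
        for (llvm::LazyCallGraph::SCC *C : MergedSCCs)
          InvalidateSCC(*C);
      });
  (void)FormedCycle; // True iff the new call edge closed a call cycle.
}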
void LazyCallGraph::RefSCC::switchTrivialInternalEdgeToRef(Node &SourceN,
Node &TargetN) {
- assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+ assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
#ifndef NDEBUG
// In a debug build, verify the RefSCC is valid to start with and when this
@@ -625,12 +653,12 @@ void LazyCallGraph::RefSCC::switchTrivialInternalEdgeToRef(Node &SourceN,
"Source and Target must be in separate SCCs for this to be trivial!");
// Set the edge kind.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+ SourceN->setEdgeKind(TargetN, Edge::Ref);
}
iterator_range<LazyCallGraph::RefSCC::iterator>
LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
- assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+ assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
#ifndef NDEBUG
// In a debug build, verify the RefSCC is valid to start with and when this
@@ -650,7 +678,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
"full CG update.");
// Set the edge kind.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+ SourceN->setEdgeKind(TargetN, Edge::Ref);
// Otherwise we are removing a call edge from a single SCC. This may break
// the cycle. In order to compute the new set of SCCs, we need to do a small
@@ -665,7 +693,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
// etc.
SCC &OldSCC = TargetSCC;
- SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+ SmallVector<std::pair<Node *, EdgeSequence::call_iterator>, 16> DFSStack;
SmallVector<Node *, 16> PendingSCCStack;
SmallVector<SCC *, 4> NewSCCs;
@@ -706,14 +734,14 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
RootN->DFSNumber = RootN->LowLink = 1;
int NextDFSNumber = 2;
- DFSStack.push_back({RootN, RootN->call_begin()});
+ DFSStack.push_back({RootN, (*RootN)->call_begin()});
do {
Node *N;
- call_edge_iterator I;
+ EdgeSequence::call_iterator I;
std::tie(N, I) = DFSStack.pop_back_val();
- auto E = N->call_end();
+ auto E = (*N)->call_end();
while (I != E) {
- Node &ChildN = *I->getNode();
+ Node &ChildN = I->getNode();
if (ChildN.DFSNumber == 0) {
// We haven't yet visited this child, so descend, pushing the current
// node onto the stack.
@@ -723,8 +751,8 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
"Found a node with 0 DFS number but already in an SCC!");
ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
N = &ChildN;
- I = N->call_begin();
- E = N->call_end();
+ I = (*N)->call_begin();
+ E = (*N)->call_end();
continue;
}
@@ -817,17 +845,19 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
void LazyCallGraph::RefSCC::switchOutgoingEdgeToCall(Node &SourceN,
Node &TargetN) {
- assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+ assert(!(*SourceN)[TargetN].isCall() && "Must start with a ref edge!");
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
assert(G->lookupRefSCC(TargetN) != this &&
"Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// Edges between RefSCCs are the same regardless of call or ref, so we can
// just flip the edge here.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+ SourceN->setEdgeKind(TargetN, Edge::Call);
#ifndef NDEBUG
// Check that the RefSCC is still valid.
@@ -837,17 +867,19 @@ void LazyCallGraph::RefSCC::switchOutgoingEdgeToCall(Node &SourceN,
void LazyCallGraph::RefSCC::switchOutgoingEdgeToRef(Node &SourceN,
Node &TargetN) {
- assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+ assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
assert(G->lookupRefSCC(TargetN) != this &&
"Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// Edges between RefSCCs are the same regardless of call or ref, so we can
// just flip the edge here.
- SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+ SourceN->setEdgeKind(TargetN, Edge::Ref);
#ifndef NDEBUG
// Check that the RefSCC is still valid.
@@ -860,7 +892,7 @@ void LazyCallGraph::RefSCC::insertInternalRefEdge(Node &SourceN,
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
- SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+ SourceN->insertEdgeInternal(TargetN, Edge::Ref);
#ifndef NDEBUG
// Check that the RefSCC is still valid.
@@ -871,14 +903,16 @@ void LazyCallGraph::RefSCC::insertInternalRefEdge(Node &SourceN,
void LazyCallGraph::RefSCC::insertOutgoingEdge(Node &SourceN, Node &TargetN,
Edge::Kind EK) {
// First insert it into the caller.
- SourceN.insertEdgeInternal(TargetN, EK);
+ SourceN->insertEdgeInternal(TargetN, EK);
assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
RefSCC &TargetC = *G->lookupRefSCC(TargetN);
assert(&TargetC != this && "Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(TargetC.isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// The only change required is to add this SCC to the parent set of the
// callee.
@@ -895,8 +929,10 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
RefSCC &SourceC = *G->lookupRefSCC(SourceN);
assert(&SourceC != this && "Source must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
assert(SourceC.isDescendantOf(*this) &&
"Source must be a descendant of the Target.");
+#endif
SmallVector<RefSCC *, 1> DeletedRefSCCs;
@@ -951,9 +987,8 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
RefSCC &RC = *Worklist.pop_back_val();
for (SCC &C : RC)
for (Node &N : C)
- for (Edge &E : N) {
- assert(E.getNode() && "Must have formed a node!");
- RefSCC &EdgeRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &EdgeRC = *G->lookupRefSCC(E.getNode());
if (G->getRefSCCIndex(EdgeRC) <= SourceIdx)
// Not in the postorder sequence between source and target.
continue;
@@ -1003,10 +1038,8 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
SCCIndices[&InnerC] = SCCIndex++;
for (Node &N : InnerC) {
G->SCCMap[&N] = &InnerC;
- for (Edge &E : N) {
- assert(E.getNode() &&
- "Cannot have a null node within a visited SCC!");
- RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &ChildRC = *G->lookupRefSCC(E.getNode());
if (MergeSet.count(&ChildRC))
continue;
ChildRC.Parents.erase(RC);
@@ -1042,7 +1075,7 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
// At this point we have a merged RefSCC with a post-order SCCs list, just
// connect the nodes to form the new edge.
- SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+ SourceN->insertEdgeInternal(TargetN, Edge::Ref);
// We return the list of SCCs which were merged so that callers can
// invalidate any data they have associated with those SCCs. Note that these
@@ -1069,15 +1102,16 @@ void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
#endif
// First remove it from the node.
- SourceN.removeEdgeInternal(TargetN.getFunction());
+ bool Removed = SourceN->removeEdgeInternal(TargetN);
+ (void)Removed;
+ assert(Removed && "Target not in the edge set for this caller?");
bool HasOtherEdgeToChildRC = false;
bool HasOtherChildRC = false;
for (SCC *InnerC : SCCs) {
for (Node &N : *InnerC) {
- for (Edge &E : N) {
- assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
- RefSCC &OtherChildRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &OtherChildRC = *G->lookupRefSCC(E.getNode());
if (&OtherChildRC == &TargetRC) {
HasOtherEdgeToChildRC = true;
break;
@@ -1116,7 +1150,7 @@ void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
SmallVector<LazyCallGraph::RefSCC *, 1>
LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
- assert(!SourceN[TargetN].isCall() &&
+ assert(!(*SourceN)[TargetN].isCall() &&
"Cannot remove a call edge, it must first be made a ref edge");
#ifndef NDEBUG
@@ -1127,7 +1161,9 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
#endif
// First remove the actual edge.
- SourceN.removeEdgeInternal(TargetN.getFunction());
+ bool Removed = SourceN->removeEdgeInternal(TargetN);
+ (void)Removed;
+ assert(Removed && "Target not in the edge set for this caller?");
// We return a list of the resulting *new* RefSCCs in post-order.
SmallVector<RefSCC *, 1> Result;
@@ -1186,7 +1222,7 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
PostOrderMapping[&N] = Number;
};
- SmallVector<std::pair<Node *, edge_iterator>, 4> DFSStack;
+ SmallVector<std::pair<Node *, EdgeSequence::iterator>, 4> DFSStack;
SmallVector<Node *, 4> PendingRefSCCStack;
do {
assert(DFSStack.empty() &&
@@ -1205,18 +1241,18 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
RootN->DFSNumber = RootN->LowLink = 1;
int NextDFSNumber = 2;
- DFSStack.push_back({RootN, RootN->begin()});
+ DFSStack.push_back({RootN, (*RootN)->begin()});
do {
Node *N;
- edge_iterator I;
+ EdgeSequence::iterator I;
std::tie(N, I) = DFSStack.pop_back_val();
- auto E = N->end();
+ auto E = (*N)->end();
assert(N->DFSNumber != 0 && "We should always assign a DFS number "
"before processing a node.");
while (I != E) {
- Node &ChildN = I->getNode(*G);
+ Node &ChildN = I->getNode();
if (ChildN.DFSNumber == 0) {
// Mark that we should start at this child when next this node is the
// top of the stack. We don't start at the next child to ensure this
@@ -1226,8 +1262,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
// Continue, resetting to the child node.
ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
N = &ChildN;
- I = ChildN.begin();
- E = ChildN.end();
+ I = ChildN->begin();
+ E = ChildN->end();
continue;
}
if (ChildN.DFSNumber == -1) {
@@ -1382,9 +1418,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
#endif
for (SCC *C : SCCs)
for (Node &N : *C) {
- for (Edge &E : N) {
- assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
- RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &ChildRC = *G->lookupRefSCC(E.getNode());
if (&ChildRC == this)
continue;
ChildRC.Parents.insert(this);
@@ -1408,9 +1443,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
for (RefSCC *ParentRC : OldParents)
for (SCC &ParentC : *ParentRC)
for (Node &ParentN : ParentC)
- for (Edge &E : ParentN) {
- assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
- RefSCC &RC = *G->lookupRefSCC(*E.getNode());
+ for (Edge &E : *ParentN) {
+ RefSCC &RC = *G->lookupRefSCC(E.getNode());
if (&RC != ParentRC)
RC.Parents.insert(ParentRC);
}
@@ -1448,8 +1482,10 @@ void LazyCallGraph::RefSCC::handleTrivialEdgeInsertion(Node &SourceN,
return;
}
+#ifdef EXPENSIVE_CHECKS
assert(TargetRC.isDescendantOf(*this) &&
"Target must be a descendant of the Source.");
+#endif
// The only change required is to add this RefSCC to the parent set of the
// target. This is a set and so idempotent if the edge already existed.
TargetRC.Parents.insert(this);
@@ -1461,25 +1497,29 @@ void LazyCallGraph::RefSCC::insertTrivialCallEdge(Node &SourceN,
// Check that the RefSCC is still valid when we finish.
auto ExitVerifier = make_scope_exit([this] { verify(); });
- // Check that we aren't breaking some invariants of the SCC graph.
+#ifdef EXPENSIVE_CHECKS
+ // Check that we aren't breaking some invariants of the SCC graph. Note that
+ // this is quadratic in the number of edges in the call graph!
SCC &SourceC = *G->lookupSCC(SourceN);
SCC &TargetC = *G->lookupSCC(TargetN);
if (&SourceC != &TargetC)
assert(SourceC.isAncestorOf(TargetC) &&
"Call edge is not trivial in the SCC graph!");
-#endif
+#endif // EXPENSIVE_CHECKS
+#endif // NDEBUG
+
// First insert it into the source or find the existing edge.
- auto InsertResult = SourceN.EdgeIndexMap.insert(
- {&TargetN.getFunction(), SourceN.Edges.size()});
+ auto InsertResult =
+ SourceN->EdgeIndexMap.insert({&TargetN, SourceN->Edges.size()});
if (!InsertResult.second) {
// Already an edge, just update it.
- Edge &E = SourceN.Edges[InsertResult.first->second];
+ Edge &E = SourceN->Edges[InsertResult.first->second];
if (E.isCall())
return; // Nothing to do!
E.setKind(Edge::Call);
} else {
// Create the new edge.
- SourceN.Edges.emplace_back(TargetN, Edge::Call);
+ SourceN->Edges.emplace_back(TargetN, Edge::Call);
}
// Now that we have the edge, handle the graph fallout.
@@ -1491,39 +1531,75 @@ void LazyCallGraph::RefSCC::insertTrivialRefEdge(Node &SourceN, Node &TargetN) {
// Check that the RefSCC is still valid when we finish.
auto ExitVerifier = make_scope_exit([this] { verify(); });
+#ifdef EXPENSIVE_CHECKS
// Check that we aren't breaking some invariants of the RefSCC graph.
RefSCC &SourceRC = *G->lookupRefSCC(SourceN);
RefSCC &TargetRC = *G->lookupRefSCC(TargetN);
if (&SourceRC != &TargetRC)
assert(SourceRC.isAncestorOf(TargetRC) &&
"Ref edge is not trivial in the RefSCC graph!");
-#endif
+#endif // EXPENSIVE_CHECKS
+#endif // NDEBUG
+
// First insert it into the source or find the existing edge.
- auto InsertResult = SourceN.EdgeIndexMap.insert(
- {&TargetN.getFunction(), SourceN.Edges.size()});
+ auto InsertResult =
+ SourceN->EdgeIndexMap.insert({&TargetN, SourceN->Edges.size()});
if (!InsertResult.second)
// Already an edge, we're done.
return;
// Create the new edge.
- SourceN.Edges.emplace_back(TargetN, Edge::Ref);
+ SourceN->Edges.emplace_back(TargetN, Edge::Ref);
// Now that we have the edge, handle the graph fallout.
handleTrivialEdgeInsertion(SourceN, TargetN);
}
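// The insert-then-inspect pattern used by both trivial-edge routines above is
// the usual DenseMap "insert or update" idiom; a generic illustrative sketch:
#include "llvm/ADT/DenseMap.h"

static int &getOrInit(llvm::DenseMap<int, int> &M, int Key, int Init) {
  auto InsertResult = M.insert({Key, Init});
  // InsertResult.second is false when Key already existed; either way the
  // iterator points at the (possibly pre-existing) entry.
  return InsertResult.first->second;
}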
-void LazyCallGraph::insertEdge(Node &SourceN, Function &Target, Edge::Kind EK) {
- assert(SCCMap.empty() && DFSStack.empty() &&
+void LazyCallGraph::RefSCC::replaceNodeFunction(Node &N, Function &NewF) {
+ Function &OldF = N.getFunction();
+
+#ifndef NDEBUG
+ // Check that the RefSCC is still valid when we finish.
+ auto ExitVerifier = make_scope_exit([this] { verify(); });
+
+ assert(G->lookupRefSCC(N) == this &&
+ "Cannot replace the function of a node outside this RefSCC.");
+
+ assert(G->NodeMap.find(&NewF) == G->NodeMap.end() &&
+ "Must not have already walked the new function!'");
+
+ // It is important that this replacement not introduce graph changes so we
+ // insist that the caller has already removed every use of the original
+ // function and that all uses of the new function correspond to existing
+ // edges in the graph. The common and expected way to use this is when
+ // replacing the function itself in the IR without changing the call graph
+ // shape and just updating the analysis based on that.
+ assert(&OldF != &NewF && "Cannot replace a function with itself!");
+ assert(OldF.use_empty() &&
+ "Must have moved all uses from the old function to the new!");
+#endif
+
+ N.replaceFunction(NewF);
+
+ // Update various call graph maps.
+ G->NodeMap.erase(&OldF);
+ G->NodeMap[&NewF] = &N;
+}
+
+void LazyCallGraph::insertEdge(Node &SourceN, Node &TargetN, Edge::Kind EK) {
+ assert(SCCMap.empty() &&
"This method cannot be called after SCCs have been formed!");
- return SourceN.insertEdgeInternal(Target, EK);
+ return SourceN->insertEdgeInternal(TargetN, EK);
}
-void LazyCallGraph::removeEdge(Node &SourceN, Function &Target) {
- assert(SCCMap.empty() && DFSStack.empty() &&
+void LazyCallGraph::removeEdge(Node &SourceN, Node &TargetN) {
+ assert(SCCMap.empty() &&
"This method cannot be called after SCCs have been formed!");
- return SourceN.removeEdgeInternal(Target);
+ bool Removed = SourceN->removeEdgeInternal(TargetN);
+ (void)Removed;
+ assert(Removed && "Target not in the edge set for this caller?");
}
void LazyCallGraph::removeDeadFunction(Function &F) {
@@ -1532,18 +1608,10 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
assert(F.use_empty() &&
"This routine should only be called on trivially dead functions!");
- auto EII = EntryIndexMap.find(&F);
- if (EII != EntryIndexMap.end()) {
- EntryEdges[EII->second] = Edge();
- EntryIndexMap.erase(EII);
- }
-
- // It's safe to just remove un-visited functions from the RefSCC entry list.
- // FIXME: This is a linear operation which could become hot and benefit from
- // an index map.
- auto RENI = find(RefSCCEntryNodes, &F);
- if (RENI != RefSCCEntryNodes.end())
- RefSCCEntryNodes.erase(RENI);
+ // We shouldn't remove library functions as they are never really dead while
+ // the call graph is in use -- every function definition refers to them.
+ assert(!isLibFunction(F) &&
+ "Must not remove lib functions from the call graph!");
auto NI = NodeMap.find(&F);
if (NI == NodeMap.end())
@@ -1553,22 +1621,16 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
Node &N = *NI->second;
NodeMap.erase(NI);
- if (SCCMap.empty() && DFSStack.empty()) {
- // No SCC walk has begun, so removing this is fine and there is nothing
+ // Remove this from the entry edges if present.
+ EntryEdges.removeEdgeInternal(N);
+
+ if (SCCMap.empty()) {
+ // No SCCs have been formed, so removing this is fine and there is nothing
// else necessary at this point but clearing out the node.
N.clear();
return;
}
- // Check that we aren't going to break the DFS walk.
- assert(all_of(DFSStack,
- [&N](const std::pair<Node *, edge_iterator> &Element) {
- return Element.first != &N;
- }) &&
- "Tried to remove a function currently in the DFS stack!");
- assert(find(PendingRefSCCStack, &N) == PendingRefSCCStack.end() &&
- "Tried to remove a function currently pending to add to a RefSCC!");
-
// Cannot remove a function which has yet to be visited in the DFS walk, so
// if we have a node at all then we must have an SCC and RefSCC.
auto CI = SCCMap.find(&N);
@@ -1583,13 +1645,19 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
// Validate these properties first.
assert(C.size() == 1 && "Dead functions must be in a singular SCC");
assert(RC.size() == 1 && "Dead functions must be in a singular RefSCC");
- assert(RC.Parents.empty() && "Cannot have parents of a dead RefSCC!");
+
+ // Clean up any remaining reference edges. Note that we walk an unordered set
+ // here, but we are only removing edges, so the order doesn't matter.
+ for (RefSCC &ParentRC : RC.parents())
+ for (SCC &ParentC : ParentRC)
+ for (Node &ParentN : ParentC)
+ if (ParentN)
+ ParentN->removeEdgeInternal(N);
// Now remove this RefSCC from any parents sets and the leaf list.
- for (Edge &E : N)
- if (Node *TargetN = E.getNode())
- if (RefSCC *TargetRC = lookupRefSCC(*TargetN))
- TargetRC->Parents.erase(&RC);
+ for (Edge &E : *N)
+ if (RefSCC *TargetRC = lookupRefSCC(E.getNode()))
+ TargetRC->Parents.erase(&RC);
// FIXME: This is a linear operation which could become hot and benefit from
// an index map.
auto LRI = find(LeafRefSCCs, &RC);
@@ -1622,15 +1690,14 @@ void LazyCallGraph::updateGraphPtrs() {
{
SmallVector<Node *, 16> Worklist;
for (Edge &E : EntryEdges)
- if (Node *EntryN = E.getNode())
- Worklist.push_back(EntryN);
+ Worklist.push_back(&E.getNode());
while (!Worklist.empty()) {
- Node *N = Worklist.pop_back_val();
- N->G = this;
- for (Edge &E : N->Edges)
- if (Node *TargetN = E.getNode())
- Worklist.push_back(TargetN);
+ Node &N = *Worklist.pop_back_val();
+ N.G = this;
+ if (N)
+ for (Edge &E : *N)
+ Worklist.push_back(&E.getNode());
}
}
@@ -1647,34 +1714,18 @@ void LazyCallGraph::updateGraphPtrs() {
}
}
-/// Build the internal SCCs for a RefSCC from a sequence of nodes.
-///
-/// Appends the SCCs to the provided vector and updates the map with their
-/// indices. Both the vector and map must be empty when passed into this
-/// routine.
-void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
- assert(RC.SCCs.empty() && "Already built SCCs!");
- assert(RC.SCCIndices.empty() && "Already mapped SCC indices!");
-
- for (Node *N : Nodes) {
- assert(N->LowLink >= (*Nodes.begin())->LowLink &&
- "We cannot have a low link in an SCC lower than its root on the "
- "stack!");
-
- // This node will go into the next RefSCC, clear out its DFS and low link
- // as we scan.
- N->DFSNumber = N->LowLink = 0;
- }
+template <typename RootsT, typename GetBeginT, typename GetEndT,
+ typename GetNodeT, typename FormSCCCallbackT>
+void LazyCallGraph::buildGenericSCCs(RootsT &&Roots, GetBeginT &&GetBegin,
+ GetEndT &&GetEnd, GetNodeT &&GetNode,
+ FormSCCCallbackT &&FormSCC) {
+ typedef decltype(GetBegin(std::declval<Node &>())) EdgeItT;
- // Each RefSCC contains a DAG of the call SCCs. To build these, we do
- // a direct walk of the call edges using Tarjan's algorithm. We reuse the
- // internal storage as we won't need it for the outer graph's DFS any longer.
-
- SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+ SmallVector<std::pair<Node *, EdgeItT>, 16> DFSStack;
SmallVector<Node *, 16> PendingSCCStack;
// Scan down the stack and DFS across the call edges.
- for (Node *RootN : Nodes) {
+ for (Node *RootN : Roots) {
assert(DFSStack.empty() &&
"Cannot begin a new root with a non-empty DFS stack!");
assert(PendingSCCStack.empty() &&
@@ -1690,25 +1741,23 @@ void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
RootN->DFSNumber = RootN->LowLink = 1;
int NextDFSNumber = 2;
- DFSStack.push_back({RootN, RootN->call_begin()});
+ DFSStack.push_back({RootN, GetBegin(*RootN)});
do {
Node *N;
- call_edge_iterator I;
+ EdgeItT I;
std::tie(N, I) = DFSStack.pop_back_val();
- auto E = N->call_end();
+ auto E = GetEnd(*N);
while (I != E) {
- Node &ChildN = *I->getNode();
+ Node &ChildN = GetNode(I);
if (ChildN.DFSNumber == 0) {
// We haven't yet visited this child, so descend, pushing the current
// node onto the stack.
DFSStack.push_back({N, I});
- assert(!lookupSCC(ChildN) &&
- "Found a node with 0 DFS number but already in an SCC!");
ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
N = &ChildN;
- I = N->call_begin();
- E = N->call_end();
+ I = GetBegin(*N);
+ E = GetEnd(*N);
continue;
}
@@ -1750,20 +1799,93 @@ void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
}));
// Form a new SCC out of these nodes and then clear them off our pending
// stack.
- RC.SCCs.push_back(createSCC(RC, SCCNodes));
- for (Node &N : *RC.SCCs.back()) {
- N.DFSNumber = N.LowLink = -1;
- SCCMap[&N] = RC.SCCs.back();
- }
+ FormSCC(SCCNodes);
PendingSCCStack.erase(SCCNodes.end().base(), PendingSCCStack.end());
} while (!DFSStack.empty());
}
+}
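// For orientation, a classic recursive Tarjan SCC over a plain adjacency
// list; buildGenericSCCs above is the iterative, parameterized version of the
// same algorithm. This standalone sketch is illustrative, not the patch's
// code.
#include <algorithm>
#include <functional>
#include <vector>

static void tarjanSCCs(const std::vector<std::vector<int>> &Adj,
                       std::vector<std::vector<int>> &SCCs) {
  int N = (int)Adj.size(), Next = 1;
  std::vector<int> DFS(N, 0), Low(N, 0), Stack;
  std::vector<bool> OnStack(N, false);

  std::function<void(int)> Visit = [&](int V) {
    DFS[V] = Low[V] = Next++;
    Stack.push_back(V);
    OnStack[V] = true;
    for (int W : Adj[V]) {
      if (DFS[W] == 0) {
        Visit(W); // Tree edge: propagate the child's low-link.
        Low[V] = std::min(Low[V], Low[W]);
      } else if (OnStack[W]) {
        Low[V] = std::min(Low[V], DFS[W]); // Back or cross edge into stack.
      }
    }
    // V is the root of an SCC when its low-link equals its own DFS number.
    if (Low[V] == DFS[V]) {
      SCCs.emplace_back();
      int W;
      do {
        W = Stack.back();
        Stack.pop_back();
        OnStack[W] = false;
        SCCs.back().push_back(W);
      } while (W != V);
    }
  };
  for (int V = 0; V < N; ++V)
    if (DFS[V] == 0)
      Visit(V);
}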
+
+/// Build the internal SCCs for a RefSCC from a sequence of nodes.
+///
+/// Appends the SCCs to the provided vector and updates the map with their
+/// indices. Both the vector and map must be empty when passed into this
+/// routine.
+void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
+ assert(RC.SCCs.empty() && "Already built SCCs!");
+ assert(RC.SCCIndices.empty() && "Already mapped SCC indices!");
+
+ for (Node *N : Nodes) {
+ assert(N->LowLink >= (*Nodes.begin())->LowLink &&
+ "We cannot have a low link in an SCC lower than its root on the "
+ "stack!");
+
+ // This node will go into the next RefSCC, clear out its DFS and low link
+ // as we scan.
+ N->DFSNumber = N->LowLink = 0;
+ }
+
+ // Each RefSCC contains a DAG of the call SCCs. To build these, we do
+ // a direct walk of the call edges using Tarjan's algorithm. We reuse the
+ // internal storage as we won't need it for the outer graph's DFS any longer.
+ buildGenericSCCs(
+ Nodes, [](Node &N) { return N->call_begin(); },
+ [](Node &N) { return N->call_end(); },
+ [](EdgeSequence::call_iterator I) -> Node & { return I->getNode(); },
+ [this, &RC](node_stack_range Nodes) {
+ RC.SCCs.push_back(createSCC(RC, Nodes));
+ for (Node &N : *RC.SCCs.back()) {
+ N.DFSNumber = N.LowLink = -1;
+ SCCMap[&N] = RC.SCCs.back();
+ }
+ });
// Wire up the SCC indices.
for (int i = 0, Size = RC.SCCs.size(); i < Size; ++i)
RC.SCCIndices[RC.SCCs[i]] = i;
}
+void LazyCallGraph::buildRefSCCs() {
+ if (EntryEdges.empty() || !PostOrderRefSCCs.empty())
+ // RefSCCs are either non-existent or already built!
+ return;
+
+ assert(RefSCCIndices.empty() && "Already mapped RefSCC indices!");
+
+ SmallVector<Node *, 16> Roots;
+ for (Edge &E : *this)
+ Roots.push_back(&E.getNode());
+
+ // The roots will be popped off a stack, so use reverse to get a less
+ // surprising order. This doesn't change the semantics anywhere.
+ std::reverse(Roots.begin(), Roots.end());
+
+ buildGenericSCCs(
+ Roots,
+ [](Node &N) {
+ // We need to populate each node as we begin to walk its edges.
+ N.populate();
+ return N->begin();
+ },
+ [](Node &N) { return N->end(); },
+ [](EdgeSequence::iterator I) -> Node & { return I->getNode(); },
+ [this](node_stack_range Nodes) {
+ RefSCC *NewRC = createRefSCC(*this);
+ buildSCCs(*NewRC, Nodes);
+ connectRefSCC(*NewRC);
+
+ // Push the new node into the postorder list and remember its position
+ // in the index map.
+ bool Inserted =
+ RefSCCIndices.insert({NewRC, PostOrderRefSCCs.size()}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot already have this RefSCC in the index map!");
+ PostOrderRefSCCs.push_back(NewRC);
+#ifndef NDEBUG
+ NewRC->verify();
+#endif
+ });
+}
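// Hedged usage sketch: with the eager DFS gone, clients explicitly populate
// the RefSCC postorder before walking it. 'visitBottomUp' is hypothetical; G
// is assumed to be a LazyCallGraph already constructed for a module.
#include "llvm/Analysis/LazyCallGraph.h"

static void visitBottomUp(llvm::LazyCallGraph &G) {
  G.buildRefSCCs();
  for (llvm::LazyCallGraph::RefSCC &RC : G.postorder_ref_sccs())
    for (llvm::LazyCallGraph::SCC &C : RC)
      for (llvm::LazyCallGraph::Node &N : C)
        (void)N.getFunction(); // Callees are visited before their callers.
}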
+
// FIXME: We should move callers of this to embed the parent linking and leaf
// tracking into their DFS in order to remove a full walk of all edges.
void LazyCallGraph::connectRefSCC(RefSCC &RC) {
@@ -1773,10 +1895,8 @@ void LazyCallGraph::connectRefSCC(RefSCC &RC) {
bool IsLeaf = true;
for (SCC &C : RC)
for (Node &N : C)
- for (Edge &E : N) {
- assert(E.getNode() &&
- "Cannot have a missing node in a visited part of the graph!");
- RefSCC &ChildRC = *lookupRefSCC(*E.getNode());
+ for (Edge &E : *N) {
+ RefSCC &ChildRC = *lookupRefSCC(E.getNode());
if (&ChildRC == &RC)
continue;
ChildRC.Parents.insert(&RC);
@@ -1788,113 +1908,13 @@ void LazyCallGraph::connectRefSCC(RefSCC &RC) {
LeafRefSCCs.push_back(&RC);
}
-bool LazyCallGraph::buildNextRefSCCInPostOrder() {
- if (DFSStack.empty()) {
- Node *N;
- do {
- // If we've handled all candidate entry nodes to the SCC forest, we're
- // done.
- if (RefSCCEntryNodes.empty())
- return false;
-
- N = &get(*RefSCCEntryNodes.pop_back_val());
- } while (N->DFSNumber != 0);
-
- // Found a new root, begin the DFS here.
- N->LowLink = N->DFSNumber = 1;
- NextDFSNumber = 2;
- DFSStack.push_back({N, N->begin()});
- }
-
- for (;;) {
- Node *N;
- edge_iterator I;
- std::tie(N, I) = DFSStack.pop_back_val();
-
- assert(N->DFSNumber > 0 && "We should always assign a DFS number "
- "before placing a node onto the stack.");
-
- auto E = N->end();
- while (I != E) {
- Node &ChildN = I->getNode(*this);
- if (ChildN.DFSNumber == 0) {
- // We haven't yet visited this child, so descend, pushing the current
- // node onto the stack.
- DFSStack.push_back({N, N->begin()});
-
- assert(!SCCMap.count(&ChildN) &&
- "Found a node with 0 DFS number but already in an SCC!");
- ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
- N = &ChildN;
- I = N->begin();
- E = N->end();
- continue;
- }
-
- // If the child has already been added to some child component, it
- // couldn't impact the low-link of this parent because it isn't
- // connected, and thus its low-link isn't relevant so skip it.
- if (ChildN.DFSNumber == -1) {
- ++I;
- continue;
- }
-
- // Track the lowest linked child as the lowest link for this node.
- assert(ChildN.LowLink > 0 && "Must have a positive low-link number!");
- if (ChildN.LowLink < N->LowLink)
- N->LowLink = ChildN.LowLink;
-
- // Move to the next edge.
- ++I;
- }
-
- // We've finished processing N and its descendents, put it on our pending
- // SCC stack to eventually get merged into an SCC of nodes.
- PendingRefSCCStack.push_back(N);
-
- // If this node is linked to some lower entry, continue walking up the
- // stack.
- if (N->LowLink != N->DFSNumber) {
- assert(!DFSStack.empty() &&
- "We never found a viable root for an SCC to pop off!");
- continue;
- }
-
- // Otherwise, form a new RefSCC from the top of the pending node stack.
- int RootDFSNumber = N->DFSNumber;
- // Find the range of the node stack by walking down until we pass the
- // root DFS number.
- auto RefSCCNodes = node_stack_range(
- PendingRefSCCStack.rbegin(),
- find_if(reverse(PendingRefSCCStack), [RootDFSNumber](const Node *N) {
- return N->DFSNumber < RootDFSNumber;
- }));
- // Form a new RefSCC out of these nodes and then clear them off our pending
- // stack.
- RefSCC *NewRC = createRefSCC(*this);
- buildSCCs(*NewRC, RefSCCNodes);
- connectRefSCC(*NewRC);
- PendingRefSCCStack.erase(RefSCCNodes.end().base(),
- PendingRefSCCStack.end());
-
- // Push the new node into the postorder list and return true indicating we
- // successfully grew the postorder sequence by one.
- bool Inserted =
- RefSCCIndices.insert({NewRC, PostOrderRefSCCs.size()}).second;
- (void)Inserted;
- assert(Inserted && "Cannot already have this RefSCC in the index map!");
- PostOrderRefSCCs.push_back(NewRC);
- return true;
- }
-}
-
AnalysisKey LazyCallGraphAnalysis::Key;
LazyCallGraphPrinterPass::LazyCallGraphPrinterPass(raw_ostream &OS) : OS(OS) {}
static void printNode(raw_ostream &OS, LazyCallGraph::Node &N) {
OS << " Edges in function: " << N.getFunction().getName() << "\n";
- for (const LazyCallGraph::Edge &E : N)
+ for (LazyCallGraph::Edge &E : N.populate())
OS << " " << (E.isCall() ? "call" : "ref ") << " -> "
<< E.getFunction().getName() << "\n";
@@ -1929,6 +1949,7 @@ PreservedAnalyses LazyCallGraphPrinterPass::run(Module &M,
for (Function &F : M)
printNode(OS, G.get(F));
+ G.buildRefSCCs();
for (LazyCallGraph::RefSCC &C : G.postorder_ref_sccs())
printRefSCC(OS, C);
@@ -1941,7 +1962,7 @@ LazyCallGraphDOTPrinterPass::LazyCallGraphDOTPrinterPass(raw_ostream &OS)
static void printNodeDOT(raw_ostream &OS, LazyCallGraph::Node &N) {
std::string Name = "\"" + DOT::EscapeString(N.getFunction().getName()) + "\"";
- for (const LazyCallGraph::Edge &E : N) {
+ for (LazyCallGraph::Edge &E : N.populate()) {
OS << " " << Name << " -> \""
<< DOT::EscapeString(E.getFunction().getName()) << "\"";
if (!E.isCall()) // It is a ref edge.
diff --git a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
index d442310..102081e 100644
--- a/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -31,6 +32,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <stack>
@@ -39,6 +41,10 @@ using namespace PatternMatch;
#define DEBUG_TYPE "lazy-value-info"
+// This is the number of worklist items we will process to try to discover an
+// answer for a given value.
+static const unsigned MaxProcessedPerValue = 500;
+
char LazyValueInfoWrapperPass::ID = 0;
INITIALIZE_PASS_BEGIN(LazyValueInfoWrapperPass, "lazy-value-info",
"Lazy Value Information Analysis", false, true)
@@ -136,7 +142,7 @@ public:
return Val;
}
- ConstantRange getConstantRange() const {
+ const ConstantRange &getConstantRange() const {
assert(isConstantRange() &&
"Cannot get the constant-range of a non-constant-range!");
return Range;
@@ -244,7 +250,7 @@ public:
if (NewR.isFullSet())
markOverdefined();
else
- markConstantRange(NewR);
+ markConstantRange(std::move(NewR));
}
};
@@ -296,7 +302,7 @@ static bool hasSingleValue(const LVILatticeVal &Val) {
/// contradictory. If this happens, we return some valid lattice value so as
/// not to confuse the rest of LVI. Ideally, we'd always return Undefined, but
/// we do not make this guarantee. TODO: This would be a useful enhancement.
-static LVILatticeVal intersect(LVILatticeVal A, LVILatticeVal B) {
+static LVILatticeVal intersect(const LVILatticeVal &A, const LVILatticeVal &B) {
// Undefined is the strongest state. It means the value is known to be along
// an unreachable path.
if (A.isUndefined())
@@ -366,22 +372,22 @@ namespace {
struct ValueCacheEntryTy {
ValueCacheEntryTy(Value *V, LazyValueInfoCache *P) : Handle(V, P) {}
LVIValueHandle Handle;
- SmallDenseMap<AssertingVH<BasicBlock>, LVILatticeVal, 4> BlockVals;
+ SmallDenseMap<PoisoningVH<BasicBlock>, LVILatticeVal, 4> BlockVals;
};
- /// This is all of the cached information for all values,
- /// mapped from Value* to key information.
- DenseMap<Value *, std::unique_ptr<ValueCacheEntryTy>> ValueCache;
-
/// This tracks, on a per-block basis, the set of values that are
/// over-defined at the end of that block.
- typedef DenseMap<AssertingVH<BasicBlock>, SmallPtrSet<Value *, 4>>
+ typedef DenseMap<PoisoningVH<BasicBlock>, SmallPtrSet<Value *, 4>>
OverDefinedCacheTy;
- OverDefinedCacheTy OverDefinedCache;
-
/// Keep track of all blocks that we have ever seen, so we
/// don't spend time removing unused blocks from our caches.
- DenseSet<AssertingVH<BasicBlock> > SeenBlocks;
+ DenseSet<PoisoningVH<BasicBlock> > SeenBlocks;
+
+ /// This is all of the cached information for all values,
+ /// mapped from Value* to key information.
+ DenseMap<Value *, std::unique_ptr<ValueCacheEntryTy>> ValueCache;
+ OverDefinedCacheTy OverDefinedCache;
+
public:
void insertResult(Value *Val, BasicBlock *BB, const LVILatticeVal &Result) {
@@ -459,15 +465,15 @@ namespace {
}
void LazyValueInfoCache::eraseValue(Value *V) {
- SmallVector<AssertingVH<BasicBlock>, 4> ToErase;
- for (auto &I : OverDefinedCache) {
- SmallPtrSetImpl<Value *> &ValueSet = I.second;
+ for (auto I = OverDefinedCache.begin(), E = OverDefinedCache.end(); I != E;) {
+ // Copy and increment the iterator immediately so we can erase behind
+ // ourselves.
+ auto Iter = I++;
+ SmallPtrSetImpl<Value *> &ValueSet = Iter->second;
ValueSet.erase(V);
if (ValueSet.empty())
- ToErase.push_back(I.first);
+ OverDefinedCache.erase(Iter);
}
- for (auto &BB : ToErase)
- OverDefinedCache.erase(BB);
ValueCache.erase(V);
}
@@ -480,7 +486,7 @@ void LVIValueHandle::deleted() {
void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
// Shortcut if we have never seen this block.
- DenseSet<AssertingVH<BasicBlock> >::iterator I = SeenBlocks.find(BB);
+ DenseSet<PoisoningVH<BasicBlock> >::iterator I = SeenBlocks.find(BB);
if (I == SeenBlocks.end())
return;
SeenBlocks.erase(I);
@@ -551,6 +557,30 @@ void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc,
}
}
+
+namespace {
+/// An assembly annotator class to print LazyValueCache information in
+/// comments.
+class LazyValueInfoImpl;
+class LazyValueInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+ LazyValueInfoImpl *LVIImpl;
+ // While analyzing which blocks we can solve values for, we need the dominator
+ // information. Since this is an optional parameter in LVI, we require this
+ // DomTreeAnalysis pass in the printer pass, and pass the dominator
+ // tree to the LazyValueInfoAnnotatedWriter.
+ DominatorTree &DT;
+
+public:
+ LazyValueInfoAnnotatedWriter(LazyValueInfoImpl *L, DominatorTree &DTree)
+ : LVIImpl(L), DT(DTree) {}
+
+ virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS);
+
+ virtual void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS);
+};
+}
namespace {
// The actual implementation of the lazy analysis and update. Note that the
// inheritance from LazyValueInfoCache is intended to be temporary while
@@ -563,7 +593,7 @@ namespace {
/// This stack holds the state of the value solver during a query.
/// It basically emulates the callstack of the naive
/// recursive value lookup process.
- std::stack<std::pair<BasicBlock*, Value*> > BlockValueStack;
+ SmallVector<std::pair<BasicBlock*, Value*>, 8> BlockValueStack;
/// Keeps track of which block-value pairs are in BlockValueStack.
DenseSet<std::pair<BasicBlock*, Value*> > BlockValueSet;
@@ -576,7 +606,7 @@ namespace {
DEBUG(dbgs() << "PUSH: " << *BV.second << " in " << BV.first->getName()
<< "\n");
- BlockValueStack.push(BV);
+ BlockValueStack.push_back(BV);
return true;
}
@@ -598,13 +628,13 @@ namespace {
bool solveBlockValuePHINode(LVILatticeVal &BBLV, PHINode *PN, BasicBlock *BB);
bool solveBlockValueSelect(LVILatticeVal &BBLV, SelectInst *S,
BasicBlock *BB);
- bool solveBlockValueBinaryOp(LVILatticeVal &BBLV, Instruction *BBI,
+ bool solveBlockValueBinaryOp(LVILatticeVal &BBLV, BinaryOperator *BBI,
BasicBlock *BB);
- bool solveBlockValueCast(LVILatticeVal &BBLV, Instruction *BBI,
+ bool solveBlockValueCast(LVILatticeVal &BBLV, CastInst *CI,
BasicBlock *BB);
void intersectAssumeOrGuardBlockValueConstantRange(Value *Val,
LVILatticeVal &BBLV,
- Instruction *BBI);
+ Instruction *BBI);
void solve();
@@ -629,6 +659,12 @@ namespace {
TheCache.clear();
}
+ /// Printing the LazyValueInfo Analysis.
+ void printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) {
+ LazyValueInfoAnnotatedWriter Writer(this, DTree);
+ F.print(OS, &Writer);
+ }
+
/// This is part of the update interface to inform the cache
/// that a block has been deleted.
void eraseBlock(BasicBlock *BB) {
@@ -645,25 +681,52 @@ namespace {
};
} // end anonymous namespace
+
void LazyValueInfoImpl::solve() {
+ SmallVector<std::pair<BasicBlock *, Value *>, 8> StartingStack(
+ BlockValueStack.begin(), BlockValueStack.end());
+
+ unsigned processedCount = 0;
while (!BlockValueStack.empty()) {
- std::pair<BasicBlock*, Value*> &e = BlockValueStack.top();
+ processedCount++;
+ // Abort if we have to process too many values to get a result for this one.
+ // Because the overdefined cache is currently per-block (to avoid
+ // naming-related issues, i.e. it wants to try to give different
+ // results for the same name in different blocks), overdefined results don't
+ // get cached globally, which in turn means we will often try to rediscover
+ // the same overdefined result again and again. Once something like
+ // PredicateInfo is used in LVI or CVP, we should be able to make the
+ // overdefined cache global, and remove this throttle.
+ if (processedCount > MaxProcessedPerValue) {
+ DEBUG(dbgs() << "Giving up on stack because we are getting too deep\n");
+ // Fill in the original values
+ while (!StartingStack.empty()) {
+ std::pair<BasicBlock *, Value *> &e = StartingStack.back();
+ TheCache.insertResult(e.second, e.first,
+ LVILatticeVal::getOverdefined());
+ StartingStack.pop_back();
+ }
+ BlockValueSet.clear();
+ BlockValueStack.clear();
+ return;
+ }
+ std::pair<BasicBlock *, Value *> e = BlockValueStack.back();
assert(BlockValueSet.count(e) && "Stack value should be in BlockValueSet!");
if (solveBlockValue(e.second, e.first)) {
// The work item was completely processed.
- assert(BlockValueStack.top() == e && "Nothing should have been pushed!");
+ assert(BlockValueStack.back() == e && "Nothing should have been pushed!");
assert(TheCache.hasCachedValueInfo(e.second, e.first) &&
"Result should be in cache!");
DEBUG(dbgs() << "POP " << *e.second << " in " << e.first->getName()
<< " = " << TheCache.getCachedValueInfo(e.second, e.first) << "\n");
- BlockValueStack.pop();
+ BlockValueStack.pop_back();
BlockValueSet.erase(e);
} else {
// More work needs to be done before revisiting.
- assert(BlockValueStack.top() != e && "Stack should have been pushed!");
+ assert(BlockValueStack.back() != e && "Stack should have been pushed!");
}
}
}
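// A generic sketch of the throttling pattern above, with hypothetical names:
// drain a worklist under a budget, and when the budget runs out, record a
// conservative answer for the original queries so they are never re-asked.
#include <vector>

template <typename Item, typename StepFn, typename GiveUpFn>
static void solveWithBudget(std::vector<Item> Stack, unsigned Budget,
                            StepFn Step, GiveUpFn GiveUp) {
  std::vector<Item> Original(Stack); // The queries we must answer somehow.
  unsigned Processed = 0;
  while (!Stack.empty()) {
    if (++Processed > Budget) {
      for (Item &I : Original)
        GiveUp(I); // E.g. cache "overdefined", as solve() does above.
      return;
    }
    Item Top = Stack.back();
    if (Step(Top, Stack))
      Stack.pop_back(); // Fully solved; Step pushed nothing new.
    // Otherwise Step pushed prerequisites; Top is revisited after them.
  }
}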
@@ -743,7 +806,7 @@ bool LazyValueInfoImpl::solveBlockValueImpl(LVILatticeVal &Res,
// that for all other pointer typed values, we terminate the search at the
// definition. We could easily extend this to look through geps, bitcasts,
// and the like to prove non-nullness, but it's not clear that's worth it
- // compile time wise. The context-insensative value walk done inside
+ // compile time wise. The context-insensitive value walk done inside
// isKnownNonNull gets most of the profitable cases at much less expense.
// This does mean that we have a sensitivity to where the defining
// instruction is placed, even if it could legally be hoisted much higher.
@@ -754,12 +817,12 @@ bool LazyValueInfoImpl::solveBlockValueImpl(LVILatticeVal &Res,
return true;
}
if (BBI->getType()->isIntegerTy()) {
- if (isa<CastInst>(BBI))
- return solveBlockValueCast(Res, BBI, BB);
-
+ if (auto *CI = dyn_cast<CastInst>(BBI))
+ return solveBlockValueCast(Res, CI, BB);
+
BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI);
if (BO && isa<ConstantInt>(BO->getOperand(1)))
- return solveBlockValueBinaryOp(Res, BBI, BB);
+ return solveBlockValueBinaryOp(Res, BO, BB);
}
DEBUG(dbgs() << " compute BB '" << BB->getName()
@@ -825,7 +888,7 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(LVILatticeVal &BBLV,
// value is overdefined.
if (BB == &BB->getParent()->getEntryBlock()) {
assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
- // Bofore giving up, see if we can prove the pointer non-null local to
+ // Before giving up, see if we can prove the pointer non-null local to
// this particular block.
if (Val->getType()->isPointerTy() &&
(isKnownNonNull(Val) || isObjectDereferencedInBlock(Val, BB))) {
@@ -839,13 +902,19 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(LVILatticeVal &BBLV,
}
// Loop over all of our predecessors, merging what we know from them into
- // result.
- bool EdgesMissing = false;
+ // result. If we encounter an unexplored predecessor, we eagerly explore it
+ // in a depth first manner. In practice, this has the effect of discovering
+ // paths we can't analyze eagerly without spending compile times analyzing
+ // other paths. This heuristic benefits from the fact that predecessors are
+ // frequently arranged such that dominating ones come first and we quickly
+ // find a path to function entry. TODO: We should consider explicitly
+ // canonicalizing to make this true rather than relying on this happy
+ // accident.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
LVILatticeVal EdgeResult;
- EdgesMissing |= !getEdgeValue(Val, *PI, BB, EdgeResult);
- if (EdgesMissing)
- continue;
+ if (!getEdgeValue(Val, *PI, BB, EdgeResult))
+ // Explore that input, then return here
+ return false;
Result.mergeIn(EdgeResult, DL);
@@ -866,8 +935,6 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(LVILatticeVal &BBLV,
return true;
}
}
- if (EdgesMissing)
- return false;
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined());
@@ -880,8 +947,8 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
LVILatticeVal Result; // Start Undefined.
// Loop over all of our predecessors, merging what we know from them into
- // result.
- bool EdgesMissing = false;
+ // result. See the comment about the chosen traversal order in
+ // solveBlockValueNonLocal; the same reasoning applies here.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PhiBB = PN->getIncomingBlock(i);
Value *PhiVal = PN->getIncomingValue(i);
@@ -889,9 +956,9 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
// Note that we can provide PN as the context value to getEdgeValue, even
// though the results will be cached, because PN is the value being used as
// the cache key in the caller.
- EdgesMissing |= !getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN);
- if (EdgesMissing)
- continue;
+ if (!getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN))
+ // Explore that input, then return here
+ return false;
Result.mergeIn(EdgeResult, DL);
@@ -905,8 +972,6 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
return true;
}
}
- if (EdgesMissing)
- return false;
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined() && "Possible PHI in entry block?");
@@ -982,8 +1047,8 @@ bool LazyValueInfoImpl::solveBlockValueSelect(LVILatticeVal &BBLV,
}
if (TrueVal.isConstantRange() && FalseVal.isConstantRange()) {
- ConstantRange TrueCR = TrueVal.getConstantRange();
- ConstantRange FalseCR = FalseVal.getConstantRange();
+ const ConstantRange &TrueCR = TrueVal.getConstantRange();
+ const ConstantRange &FalseCR = FalseVal.getConstantRange();
Value *LHS = nullptr;
Value *RHS = nullptr;
SelectPatternResult SPR = matchSelectPattern(SI, LHS, RHS);
@@ -1071,9 +1136,9 @@ bool LazyValueInfoImpl::solveBlockValueSelect(LVILatticeVal &BBLV,
}
bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV,
- Instruction *BBI,
- BasicBlock *BB) {
- if (!BBI->getOperand(0)->getType()->isSized()) {
+ CastInst *CI,
+ BasicBlock *BB) {
+ if (!CI->getOperand(0)->getType()->isSized()) {
// Without knowing how wide the input is, we can't analyze it in any useful
// way.
BBLV = LVILatticeVal::getOverdefined();
@@ -1083,7 +1148,7 @@ bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV,
// Filter out casts we don't know how to reason about before attempting to
// recurse on our operand. This can cut a long search short if we know we're
// not going to be able to get any useful information anyways.
- switch (BBI->getOpcode()) {
+ switch (CI->getOpcode()) {
case Instruction::Trunc:
case Instruction::SExt:
case Instruction::ZExt:
@@ -1100,44 +1165,43 @@ bool LazyValueInfoImpl::solveBlockValueCast(LVILatticeVal &BBLV,
// Figure out the range of the LHS. If that fails, we still apply the
// transfer rule on the full set since we may be able to locally infer
// interesting facts.
- if (!hasBlockValue(BBI->getOperand(0), BB))
- if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+ if (!hasBlockValue(CI->getOperand(0), BB))
+ if (pushBlockValue(std::make_pair(BB, CI->getOperand(0))))
// More work to do before applying this transfer rule.
return false;
const unsigned OperandBitWidth =
- DL.getTypeSizeInBits(BBI->getOperand(0)->getType());
+ DL.getTypeSizeInBits(CI->getOperand(0)->getType());
ConstantRange LHSRange = ConstantRange(OperandBitWidth);
- if (hasBlockValue(BBI->getOperand(0), BB)) {
- LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
- intersectAssumeOrGuardBlockValueConstantRange(BBI->getOperand(0), LHSVal,
- BBI);
+ if (hasBlockValue(CI->getOperand(0), BB)) {
+ LVILatticeVal LHSVal = getBlockValue(CI->getOperand(0), BB);
+ intersectAssumeOrGuardBlockValueConstantRange(CI->getOperand(0), LHSVal,
+ CI);
if (LHSVal.isConstantRange())
LHSRange = LHSVal.getConstantRange();
}
- const unsigned ResultBitWidth =
- cast<IntegerType>(BBI->getType())->getBitWidth();
+ const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth();
// NOTE: We're currently limited by the set of operations that ConstantRange
// can evaluate symbolically. Enhancing that set will allow us to analyze
// more definitions.
- auto CastOp = (Instruction::CastOps) BBI->getOpcode();
- BBLV = LVILatticeVal::getRange(LHSRange.castOp(CastOp, ResultBitWidth));
+ BBLV = LVILatticeVal::getRange(LHSRange.castOp(CI->getOpcode(),
+ ResultBitWidth));
return true;
}
bool LazyValueInfoImpl::solveBlockValueBinaryOp(LVILatticeVal &BBLV,
- Instruction *BBI,
+ BinaryOperator *BO,
BasicBlock *BB) {
- assert(BBI->getOperand(0)->getType()->isSized() &&
+ assert(BO->getOperand(0)->getType()->isSized() &&
"all operands to binary operators are sized");
// Filter out operators we don't know how to reason about before attempting to
// recurse on our operand(s). This can cut a long search short if we know
- // we're not going to be able to get any useful information anways.
- switch (BBI->getOpcode()) {
+ // we're not going to be able to get any useful information anyways.
+ switch (BO->getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
@@ -1159,29 +1223,29 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOp(LVILatticeVal &BBLV,
// Figure out the range of the LHS. If that fails, use a conservative range,
// but apply the transfer rule anyways. This lets us pick up facts from
// expressions like "and i32 (call i32 @foo()), 32"
- if (!hasBlockValue(BBI->getOperand(0), BB))
- if (pushBlockValue(std::make_pair(BB, BBI->getOperand(0))))
+ if (!hasBlockValue(BO->getOperand(0), BB))
+ if (pushBlockValue(std::make_pair(BB, BO->getOperand(0))))
// More work to do before applying this transfer rule.
return false;
const unsigned OperandBitWidth =
- DL.getTypeSizeInBits(BBI->getOperand(0)->getType());
+ DL.getTypeSizeInBits(BO->getOperand(0)->getType());
ConstantRange LHSRange = ConstantRange(OperandBitWidth);
- if (hasBlockValue(BBI->getOperand(0), BB)) {
- LVILatticeVal LHSVal = getBlockValue(BBI->getOperand(0), BB);
- intersectAssumeOrGuardBlockValueConstantRange(BBI->getOperand(0), LHSVal,
- BBI);
+ if (hasBlockValue(BO->getOperand(0), BB)) {
+ LVILatticeVal LHSVal = getBlockValue(BO->getOperand(0), BB);
+ intersectAssumeOrGuardBlockValueConstantRange(BO->getOperand(0), LHSVal,
+ BO);
if (LHSVal.isConstantRange())
LHSRange = LHSVal.getConstantRange();
}
- ConstantInt *RHS = cast<ConstantInt>(BBI->getOperand(1));
+ ConstantInt *RHS = cast<ConstantInt>(BO->getOperand(1));
ConstantRange RHSRange = ConstantRange(RHS->getValue());
// NOTE: We're currently limited by the set of operations that ConstantRange
// can evaluate symbolically. Enhancing that set will allow us to analyze
// more definitions.
- auto BinOp = (Instruction::BinaryOps) BBI->getOpcode();
+ Instruction::BinaryOps BinOp = BO->getOpcode();
BBLV = LVILatticeVal::getRange(LHSRange.binaryOp(BinOp, RHSRange));
return true;
}
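
Both functions now funnel into ConstantRange's symbolic evaluators, which the tightened signatures (CastInst and BinaryOperator instead of a raw Instruction) allow without the old opcode casts. A minimal sketch of those transfer rules, with made-up ranges and widths:

    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    ConstantRange demoTransfer() {
      // Suppose the operand was proven to lie in [0, 100) as an i32.
      ConstantRange LHS(APInt(32, 0), APInt(32, 100));
      // "trunc i32 %x to i8" goes through the trunc transfer rule...
      ConstantRange Trunced = LHS.castOp(Instruction::Trunc, /*BitWidth=*/8);
      // ...and "add i8 %t, 1" through the add transfer rule, giving [1, 101)
      // in i8, still a usefully small range.
      return Trunced.binaryOp(Instruction::Add, ConstantRange(APInt(8, 1)));
    }
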
@@ -1260,12 +1324,12 @@ getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest,
return getValueFromICmpCondition(Val, ICI, isTrueDest);
// Handle conditions in the form of (cond1 && cond2), we know that on the
- // true dest path both of the conditions hold.
- if (!isTrueDest)
- return LVILatticeVal::getOverdefined();
-
+ // true dest path both of the conditions hold. Similarly for conditions of
+ // the form (cond1 || cond2), we know that on the false dest path neither
+ // condition holds.
BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond);
- if (!BO || BO->getOpcode() != BinaryOperator::And)
+ if (!BO || (isTrueDest && BO->getOpcode() != BinaryOperator::And) ||
+ (!isTrueDest && BO->getOpcode() != BinaryOperator::Or))
return LVILatticeVal::getOverdefined();
auto RHS = getValueFromCondition(Val, BO->getOperand(0), isTrueDest, Visited);
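
As a worked instance of the comment above: if the branch condition is `and (icmp ugt %x, 5), (icmp ult %x, 10)`, then on the true edge both conjuncts hold and %x is known to lie in [6, 10); dually, for `or (icmp ult %x, 5), (icmp ugt %x, 10)` the false edge implies neither disjunct holds, i.e. %x lies in [5, 11).
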
@@ -1333,14 +1397,14 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
unsigned BitWidth = Val->getType()->getIntegerBitWidth();
ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/);
- for (SwitchInst::CaseIt i : SI->cases()) {
- ConstantRange EdgeVal(i.getCaseValue()->getValue());
+ for (auto Case : SI->cases()) {
+ ConstantRange EdgeVal(Case.getCaseValue()->getValue());
if (DefaultCase) {
// It is possible that the default destination is the destination of
// some cases. There is no need to perform difference for those cases.
- if (i.getCaseSuccessor() != BBTo)
+ if (Case.getCaseSuccessor() != BBTo)
EdgesVals = EdgesVals.difference(EdgeVal);
- } else if (i.getCaseSuccessor() == BBTo)
+ } else if (Case.getCaseSuccessor() == BBTo)
EdgesVals = EdgesVals.unionWith(EdgeVal);
}
Result = LVILatticeVal::getRange(std::move(EdgesVals));
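
Concretely: for `switch i32 %v` with `case 0` and `case 1` both targeting %bb, the edge to %bb gets the union {0} U {1} = [0, 2); the edge to the default destination instead starts from the full set and has each case value whose successor is not the default subtracted from it.
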
@@ -1352,8 +1416,8 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
/// \brief Compute the value of Val on the edge BBFrom -> BBTo or the value at
/// the basic block if the edge does not constrain Val.
bool LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
- BasicBlock *BBTo, LVILatticeVal &Result,
- Instruction *CxtI) {
+ BasicBlock *BBTo, LVILatticeVal &Result,
+ Instruction *CxtI) {
// If already a constant, there is nothing to compute.
if (Constant *VC = dyn_cast<Constant>(Val)) {
Result = LVILatticeVal::get(VC);
@@ -1503,6 +1567,18 @@ void LazyValueInfo::releaseMemory() {
}
}
+bool LazyValueInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &Inv) {
+ // We need to invalidate if we have either failed to preserve this analysis's
+ // result directly or if any of its dependencies have been invalidated.
+ auto PAC = PA.getChecker<LazyValueAnalysis>();
+ if (!(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) ||
+ (DT && Inv.invalidate<DominatorTreeAnalysis>(F, PA)))
+ return true;
+
+ return false;
+}
+
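For clients of the new pass manager, this hook means a pass keeps a cached LVI result alive only if it preserves both the analysis and the dominator tree. A hypothetical sketch (pass name and body are illustrative, not from this patch):

    #include "llvm/Analysis/LazyValueInfo.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    struct LVIFriendlyPass : PassInfoMixin<LVIFriendlyPass> {
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
        // ... a transform that changes neither the CFG nor value facts ...
        PreservedAnalyses PA;
        PA.preserve<LazyValueAnalysis>();
        PA.preserve<DominatorTreeAnalysis>(); // invalidate() also checks this
        return PA;
      }
    };
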
void LazyValueInfoWrapperPass::releaseMemory() { Info.releaseMemory(); }
LazyValueInfo LazyValueAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
@@ -1510,7 +1586,7 @@ LazyValueInfo LazyValueAnalysis::run(Function &F, FunctionAnalysisManager &FAM)
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- return LazyValueInfo(&AC, &TLI, DT);
+ return LazyValueInfo(&AC, &F.getParent()->getDataLayout(), &TLI, DT);
}
/// Returns true if we can statically tell that this value will never be a
@@ -1540,7 +1616,7 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
if (Result.isConstant())
return Result.getConstant();
if (Result.isConstantRange()) {
- ConstantRange CR = Result.getConstantRange();
+ const ConstantRange &CR = Result.getConstantRange();
if (const APInt *SingleVal = CR.getSingleElement())
return ConstantInt::get(V->getContext(), *SingleVal);
}
@@ -1577,71 +1653,90 @@ Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB,
if (Result.isConstant())
return Result.getConstant();
if (Result.isConstantRange()) {
- ConstantRange CR = Result.getConstantRange();
+ const ConstantRange &CR = Result.getConstantRange();
if (const APInt *SingleVal = CR.getSingleElement())
return ConstantInt::get(V->getContext(), *SingleVal);
}
return nullptr;
}
+ConstantRange LazyValueInfo::getConstantRangeOnEdge(Value *V,
+ BasicBlock *FromBB,
+ BasicBlock *ToBB,
+ Instruction *CxtI) {
+ unsigned Width = V->getType()->getIntegerBitWidth();
+ const DataLayout &DL = FromBB->getModule()->getDataLayout();
+ LVILatticeVal Result =
+ getImpl(PImpl, AC, &DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
+
+ if (Result.isUndefined())
+ return ConstantRange(Width, /*isFullSet=*/false);
+ if (Result.isConstantRange())
+ return Result.getConstantRange();
+ // We represent ConstantInt constants as constant ranges, but other kinds
+ // of integer constants (i.e. ConstantExpr) will be tagged as constants.
+ assert(!(Result.isConstant() && isa<ConstantInt>(Result.getConstant())) &&
+ "ConstantInt value must be represented as constantrange");
+ return ConstantRange(Width, /*isFullSet=*/true);
+}
+
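A minimal usage sketch for the new edge query; the helper name is made up, and note the API expects an integer-typed V:

    #include "llvm/Analysis/LazyValueInfo.h"
    #include "llvm/IR/ConstantRange.h"
    using namespace llvm;

    static bool edgeProvesNonZero(LazyValueInfo &LVI, Value *V,
                                  BasicBlock *From, BasicBlock *To) {
      ConstantRange CR =
          LVI.getConstantRangeOnEdge(V, From, To, /*CxtI=*/nullptr);
      return !CR.contains(APInt(CR.getBitWidth(), 0));
    }
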
static LazyValueInfo::Tristate getPredicateResult(unsigned Pred, Constant *C,
- LVILatticeVal &Result,
+ const LVILatticeVal &Val,
const DataLayout &DL,
TargetLibraryInfo *TLI) {
// If we know the value is a constant, evaluate the conditional.
Constant *Res = nullptr;
- if (Result.isConstant()) {
- Res = ConstantFoldCompareInstOperands(Pred, Result.getConstant(), C, DL,
- TLI);
+ if (Val.isConstant()) {
+ Res = ConstantFoldCompareInstOperands(Pred, Val.getConstant(), C, DL, TLI);
if (ConstantInt *ResCI = dyn_cast<ConstantInt>(Res))
return ResCI->isZero() ? LazyValueInfo::False : LazyValueInfo::True;
return LazyValueInfo::Unknown;
}
- if (Result.isConstantRange()) {
+ if (Val.isConstantRange()) {
ConstantInt *CI = dyn_cast<ConstantInt>(C);
if (!CI) return LazyValueInfo::Unknown;
- ConstantRange CR = Result.getConstantRange();
+ const ConstantRange &CR = Val.getConstantRange();
if (Pred == ICmpInst::ICMP_EQ) {
if (!CR.contains(CI->getValue()))
return LazyValueInfo::False;
- if (CR.isSingleElement() && CR.contains(CI->getValue()))
+ if (CR.isSingleElement())
return LazyValueInfo::True;
} else if (Pred == ICmpInst::ICMP_NE) {
if (!CR.contains(CI->getValue()))
return LazyValueInfo::True;
- if (CR.isSingleElement() && CR.contains(CI->getValue()))
+ if (CR.isSingleElement())
+ return LazyValueInfo::False;
+ } else {
+ // Handle more complex predicates.
+ ConstantRange TrueValues = ConstantRange::makeExactICmpRegion(
+ (ICmpInst::Predicate)Pred, CI->getValue());
+ if (TrueValues.contains(CR))
+ return LazyValueInfo::True;
+ if (TrueValues.inverse().contains(CR))
return LazyValueInfo::False;
}
-
- // Handle more complex predicates.
- ConstantRange TrueValues = ConstantRange::makeExactICmpRegion(
- (ICmpInst::Predicate)Pred, CI->getValue());
- if (TrueValues.contains(CR))
- return LazyValueInfo::True;
- if (TrueValues.inverse().contains(CR))
- return LazyValueInfo::False;
return LazyValueInfo::Unknown;
}
- if (Result.isNotConstant()) {
+ if (Val.isNotConstant()) {
// If this is an equality comparison, we can try to fold it knowing that
// "V != C1".
if (Pred == ICmpInst::ICMP_EQ) {
// !C1 == C -> false iff C1 == C.
Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE,
- Result.getNotConstant(), C, DL,
+ Val.getNotConstant(), C, DL,
TLI);
if (Res->isNullValue())
return LazyValueInfo::False;
} else if (Pred == ICmpInst::ICMP_NE) {
// !C1 != C -> true iff C1 == C.
Res = ConstantFoldCompareInstOperands(ICmpInst::ICMP_NE,
- Result.getNotConstant(), C, DL,
+ Val.getNotConstant(), C, DL,
TLI);
if (Res->isNullValue())
return LazyValueInfo::True;
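
The restructured else branch is the general engine here; the EQ/NE fast paths above it are just special cases. The same idea in isolation, as a compact sketch:

    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    // Returns +1 (pred always true), -1 (always false), or 0 (unknown).
    static int foldICmpOverRange(CmpInst::Predicate Pred,
                                 const ConstantRange &Known,
                                 const APInt &RHS) {
      ConstantRange TrueValues =
          ConstantRange::makeExactICmpRegion(Pred, RHS);
      if (TrueValues.contains(Known))
        return 1;        // every value Known can take satisfies Pred
      if (TrueValues.inverse().contains(Known))
        return -1;       // no value Known can take satisfies Pred
      return 0;
    }
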
@@ -1780,3 +1875,97 @@ void LazyValueInfo::eraseBlock(BasicBlock *BB) {
getImpl(PImpl, AC, &DL, DT).eraseBlock(BB);
}
}
+
+
+void LazyValueInfo::printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) {
+ if (PImpl) {
+ getImpl(PImpl, AC, DL, DT).printLVI(F, DTree, OS);
+ }
+}
+
+// Print the LVI for the function arguments at the start of each basic block.
+void LazyValueInfoAnnotatedWriter::emitBasicBlockStartAnnot(
+ const BasicBlock *BB, formatted_raw_ostream &OS) {
+ // Find if there are lattice values defined for the arguments of the function.
+ auto *F = BB->getParent();
+ for (auto &Arg : F->args()) {
+ LVILatticeVal Result = LVIImpl->getValueInBlock(
+ const_cast<Argument *>(&Arg), const_cast<BasicBlock *>(BB));
+ if (Result.isUndefined())
+ continue;
+ OS << "; LatticeVal for: '" << Arg << "' is: " << Result << "\n";
+ }
+}
+
+// This function prints the LVI analysis for the instruction I at the beginning
+// of various basic blocks. It relies on calculated values that are stored in
+// the LazyValueInfoCache; in the absence of cached values, it recalculates the
+// LazyValueInfo for `I` and prints that info.
+void LazyValueInfoAnnotatedWriter::emitInstructionAnnot(
+ const Instruction *I, formatted_raw_ostream &OS) {
+
+ auto *ParentBB = I->getParent();
+ SmallPtrSet<const BasicBlock*, 16> BlocksContainingLVI;
+ // We can generate (solve) LVI values only for blocks that are dominated by
+ // I's parent. However, to avoid generating LVI for all such blocks, which
+ // would mostly contain redundant/uninteresting information, we only print
+ // LVI for blocks that may actually use it (the immediate successor blocks,
+ // and blocks that contain uses of `I`).
+ auto printResult = [&](const BasicBlock *BB) {
+ if (!BlocksContainingLVI.insert(BB).second)
+ return;
+ LVILatticeVal Result = LVIImpl->getValueInBlock(
+ const_cast<Instruction *>(I), const_cast<BasicBlock *>(BB));
+ OS << "; LatticeVal for: '" << *I << "' in BB: '";
+ BB->printAsOperand(OS, false);
+ OS << "' is: " << Result << "\n";
+ };
+
+ printResult(ParentBB);
+ // Print the LVI analysis results for the immediate successor blocks that
+ // are dominated by `ParentBB`.
+ for (auto *BBSucc : successors(ParentBB))
+ if (DT.dominates(ParentBB, BBSucc))
+ printResult(BBSucc);
+
+ // Print LVI in blocks where `I` is used.
+ for (auto *U : I->users())
+ if (auto *UseI = dyn_cast<Instruction>(U))
+ if (!isa<PHINode>(UseI) || DT.dominates(ParentBB, UseI->getParent()))
+ printResult(UseI->getParent());
+
+}
+
+namespace {
+// Printer class for LazyValueInfo results.
+class LazyValueInfoPrinter : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ LazyValueInfoPrinter() : FunctionPass(ID) {
+ initializeLazyValueInfoPrinterPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+
+ // Get the mandatory dominator tree analysis and pass this in to the
+ // LVIPrinter. We cannot rely on the LVI's DT, since it's optional.
+ bool runOnFunction(Function &F) override {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ auto &LVI = getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto &DTree = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LVI.printLVI(F, DTree, dbgs());
+ return false;
+ }
+};
+}
+
+char LazyValueInfoPrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(LazyValueInfoPrinter, "print-lazy-value-info",
+ "Lazy Value Info Printer Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(LazyValueInfoPrinter, "print-lazy-value-info",
+ "Lazy Value Info Printer Pass", false, false)
diff --git a/contrib/llvm/lib/Analysis/Lint.cpp b/contrib/llvm/lib/Analysis/Lint.cpp
index 2ca46b1..ada600a 100644
--- a/contrib/llvm/lib/Analysis/Lint.cpp
+++ b/contrib/llvm/lib/Analysis/Lint.cpp
@@ -58,18 +58,19 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -404,7 +405,7 @@ void Lint::visitMemoryReference(Instruction &I,
Assert(!isa<UndefValue>(UnderlyingObject),
"Undefined behavior: Undef pointer dereference", &I);
Assert(!isa<ConstantInt>(UnderlyingObject) ||
- !cast<ConstantInt>(UnderlyingObject)->isAllOnesValue(),
+ !cast<ConstantInt>(UnderlyingObject)->isMinusOne(),
"Unusual: All-ones pointer dereference", &I);
Assert(!isa<ConstantInt>(UnderlyingObject) ||
!cast<ConstantInt>(UnderlyingObject)->isOne(),
@@ -533,11 +534,8 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT,
VectorType *VecTy = dyn_cast<VectorType>(V->getType());
if (!VecTy) {
- unsigned BitWidth = V->getType()->getIntegerBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC,
- dyn_cast<Instruction>(V), DT);
- return KnownZero.isAllOnesValue();
+ KnownBits Known = computeKnownBits(V, DL, 0, AC, dyn_cast<Instruction>(V), DT);
+ return Known.isZero();
}
// Per-component check doesn't work with zeroinitializer
@@ -550,15 +548,13 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT,
// For a vector, KnownZero will only be true if all values are zero, so check
// this per component
- unsigned BitWidth = VecTy->getElementType()->getIntegerBitWidth();
for (unsigned I = 0, N = VecTy->getNumElements(); I != N; ++I) {
Constant *Elem = C->getAggregateElement(I);
if (isa<UndefValue>(Elem))
return true;
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(Elem, KnownZero, KnownOne, DL);
- if (KnownZero.isAllOnesValue())
+ KnownBits Known = computeKnownBits(Elem, DL);
+ if (Known.isZero())
return true;
}
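
The KnownBits migration in both hunks collapses the old KnownZero/KnownOne pair into one struct. A minimal sketch of the new calling convention (the helper is illustrative):

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/Support/KnownBits.h"
    using namespace llvm;

    static bool provablyZero(const Value *V, const DataLayout &DL) {
      KnownBits Known = computeKnownBits(V, DL); // Depth/AC/CxtI/DT defaulted
      return Known.isZero();                     // was KnownZero.isAllOnesValue()
    }
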
@@ -699,7 +695,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk,
// As a last resort, try SimplifyInstruction or constant folding.
if (Instruction *Inst = dyn_cast<Instruction>(V)) {
- if (Value *W = SimplifyInstruction(Inst, *DL, TLI, DT, AC))
+ if (Value *W = SimplifyInstruction(Inst, {*DL, TLI, DT, AC}))
return findValueImpl(W, OffsetOk, Visited);
} else if (auto *C = dyn_cast<Constant>(V)) {
if (Value *W = ConstantFoldConstant(C, *DL, TLI))
diff --git a/contrib/llvm/lib/Analysis/Loads.cpp b/contrib/llvm/lib/Analysis/Loads.cpp
index e46541e..591b0fc 100644
--- a/contrib/llvm/lib/Analysis/Loads.cpp
+++ b/contrib/llvm/lib/Analysis/Loads.cpp
@@ -117,6 +117,16 @@ static bool isDereferenceableAndAlignedPointer(
}
bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align,
+ const APInt &Size,
+ const DataLayout &DL,
+ const Instruction *CtxI,
+ const DominatorTree *DT) {
+ SmallPtrSet<const Value *, 32> Visited;
+ return ::isDereferenceableAndAlignedPointer(V, Align, Size, DL, CtxI, DT,
+ Visited);
+}
+
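A usage sketch for the new APInt-sized overload, e.g. to ask about an access width that does not match the pointee type; the helper and the width choice are illustrative (an Align of 1 asks for mere dereferenceability):

    #include "llvm/Analysis/Loads.h"
    #include "llvm/IR/DataLayout.h"
    using namespace llvm;

    static bool canSpeculateNBytes(const Value *Ptr, uint64_t N,
                                   const DataLayout &DL,
                                   const Instruction *CtxI,
                                   const DominatorTree *DT) {
      APInt Size(DL.getPointerTypeSizeInBits(Ptr->getType()), N);
      return isDereferenceableAndAlignedPointer(Ptr, /*Align=*/1, Size, DL,
                                                CtxI, DT);
    }
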
+bool llvm::isDereferenceableAndAlignedPointer(const Value *V, unsigned Align,
const DataLayout &DL,
const Instruction *CtxI,
const DominatorTree *DT) {
@@ -312,21 +322,26 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
BasicBlock *ScanBB,
BasicBlock::iterator &ScanFrom,
unsigned MaxInstsToScan,
- AliasAnalysis *AA, bool *IsLoadCSE) {
- if (MaxInstsToScan == 0)
- MaxInstsToScan = ~0U;
-
- Value *Ptr = Load->getPointerOperand();
- Type *AccessTy = Load->getType();
-
- // We can never remove a volatile load
- if (Load->isVolatile())
- return nullptr;
-
- // Anything stronger than unordered is currently unimplemented.
+ AliasAnalysis *AA, bool *IsLoad,
+ unsigned *NumScanedInst) {
+ // Don't CSE a load that is volatile or has ordering stronger than unordered.
if (!Load->isUnordered())
return nullptr;
+ return FindAvailablePtrLoadStore(
+ Load->getPointerOperand(), Load->getType(), Load->isAtomic(), ScanBB,
+ ScanFrom, MaxInstsToScan, AA, IsLoad, NumScanedInst);
+}
+
+Value *llvm::FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy,
+ bool AtLeastAtomic, BasicBlock *ScanBB,
+ BasicBlock::iterator &ScanFrom,
+ unsigned MaxInstsToScan,
+ AliasAnalysis *AA, bool *IsLoadCSE,
+ unsigned *NumScanedInst) {
+ if (MaxInstsToScan == 0)
+ MaxInstsToScan = ~0U;
+
const DataLayout &DL = ScanBB->getModule()->getDataLayout();
// Try to get the store size for the type.
@@ -344,6 +359,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
// Restore ScanFrom to expected value in case next test succeeds
ScanFrom++;
+ if (NumScanedInst)
+ ++(*NumScanedInst);
+
// Don't scan huge blocks.
if (MaxInstsToScan-- == 0)
return nullptr;
@@ -359,7 +377,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
// We can value forward from an atomic to a non-atomic, but not the
// other way around.
- if (LI->isAtomic() < Load->isAtomic())
+ if (LI->isAtomic() < AtLeastAtomic)
return nullptr;
if (IsLoadCSE)
@@ -378,7 +396,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
// We can value forward from an atomic to a non-atomic, but not the
// other way around.
- if (SI->isAtomic() < Load->isAtomic())
+ if (SI->isAtomic() < AtLeastAtomic)
return nullptr;
if (IsLoadCSE)
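
This refactoring exists so callers without a LoadInst in hand (e.g. scanning a predecessor block for a pointer that is about to be loaded) can reuse the scan. A hypothetical caller of the new entry point:

    #include "llvm/Analysis/Loads.h"
    #include "llvm/IR/BasicBlock.h"
    using namespace llvm;

    static Value *availableAtBlockEnd(Value *Ptr, Type *AccessTy,
                                      BasicBlock *BB, AliasAnalysis *AA) {
      BasicBlock::iterator ScanFrom = BB->end();
      bool IsLoadCSE;
      unsigned NumScanned = 0;
      return FindAvailablePtrLoadStore(Ptr, AccessTy, /*AtLeastAtomic=*/false,
                                       BB, ScanFrom, /*MaxInstsToScan=*/6, AA,
                                       &IsLoadCSE, &NumScanned);
    }
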
diff --git a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index bf80072..4ba1258 100644
--- a/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -135,21 +135,6 @@ bool VectorizerParams::isInterleaveForced() {
return ::VectorizationInterleave.getNumOccurrences() > 0;
}
-void LoopAccessReport::emitAnalysis(const LoopAccessReport &Message,
- const Loop *TheLoop, const char *PassName,
- OptimizationRemarkEmitter &ORE) {
- DebugLoc DL = TheLoop->getStartLoc();
- const Value *V = TheLoop->getHeader();
- if (const Instruction *I = Message.getInstr()) {
- // If there is no debug location attached to the instruction, revert back to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- V = I->getParent();
- }
- ORE.emitOptimizationRemarkAnalysis(PassName, DL, V, Message.str());
-}
-
Value *llvm::stripIntegerCast(Value *V) {
if (auto *CI = dyn_cast<CastInst>(V))
if (CI->getOperand(0)->getType()->isIntegerTy())
@@ -172,11 +157,6 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
// Strip casts.
StrideVal = stripIntegerCast(StrideVal);
- // Replace symbolic stride by one.
- Value *One = ConstantInt::get(StrideVal->getType(), 1);
- ValueToValueMap RewriteMap;
- RewriteMap[StrideVal] = One;
-
ScalarEvolution *SE = PSE.getSE();
const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
const auto *CT =
@@ -518,7 +498,7 @@ class AccessAnalysis {
public:
/// \brief Read or write access location.
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
- typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+ typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI,
MemoryDepChecker::DepCandidates &DA,
@@ -570,7 +550,7 @@ public:
DepChecker.clearDependences();
}
- MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
+ MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
private:
typedef SetVector<MemAccessInfo> PtrAccessSet;
@@ -584,8 +564,8 @@ private:
const DataLayout &DL;
- /// Set of accesses that need a further dependence check.
- MemAccessInfoSet CheckDeps;
+ /// List of accesses that need a further dependence check.
+ MemAccessInfoList CheckDeps;
/// Set of pointers that are read only.
SmallPtrSet<Value*, 16> ReadOnlyPtr;
@@ -842,7 +822,7 @@ void AccessAnalysis::processMemAccesses() {
// there is no other write to the ptr - this is an optimization to
// catch "a[i] = a[i] + " without having to do a dependence check).
if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
- CheckDeps.insert(Access);
+ CheckDeps.push_back(Access);
IsRTCheckAnalysisNeeded = true;
}
@@ -1205,6 +1185,73 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
return false;
}
+/// Given a non-constant (unknown) dependence-distance \p Dist between two
+/// memory accesses that have the same stride, whose absolute value is given
+/// in \p Stride, and the same type size \p TypeByteSize, in a loop whose
+/// backedge-taken count is \p BackedgeTakenCount, check if it is
+/// possible to prove statically that the dependence distance is larger
+/// than the range that the accesses will travel through the execution of
+/// the loop. If so, return true; false otherwise. This is useful for
+/// example in loops such as the following (PR31098):
+/// for (i = 0; i < D; ++i) {
+/// = out[i];
+/// out[i+D] =
+/// }
+static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
+ const SCEV &BackedgeTakenCount,
+ const SCEV &Dist, uint64_t Stride,
+ uint64_t TypeByteSize) {
+
+ // If we can prove that
+ // (**) |Dist| > BackedgeTakenCount * Step
+ // where Step is the absolute stride of the memory accesses in bytes,
+ // then there is no dependence.
+ //
+ // Rationale:
+ // We basically want to check if the absolute distance (|Dist/Step|)
+ // is >= the loop iteration count (or > BackedgeTakenCount).
+ // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
+ // Section 4.2.1); Note, that for vectorization it is sufficient to prove
+ // that the dependence distance is >= VF; This is checked elsewhere.
+ // But in some cases we can prune unknown dependence distances early, and
+ // even before selecting the VF, and without a runtime test, by comparing
+ // the distance against the loop iteration count. Since the vectorized code
+ // will be executed only if LoopCount >= VF, proving distance >= LoopCount
+ // also guarantees that distance >= VF.
+ //
+ const uint64_t ByteStride = Stride * TypeByteSize;
+ const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
+ const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
+
+ const SCEV *CastedDist = &Dist;
+ const SCEV *CastedProduct = Product;
+ uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType());
+ uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType());
+
+ // The dependence distance can be positive/negative, so we sign extend Dist;
+ // The multiplication of the absolute stride in bytes and the
+ // BackedgeTakenCount is non-negative, so we zero extend Product.
+ if (DistTypeSize > ProductTypeSize)
+ CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
+ else
+ CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
+
+ // Is Dist - (BackedgeTakenCount * Step) > 0 ?
+ // (If so, then we have proven (**) because |Dist| >= Dist)
+ const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
+ if (SE.isKnownPositive(Minus))
+ return true;
+
+ // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
+ // (If so, then we have proven (**) because |Dist| >= -1*Dist)
+ const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
+ Minus = SE.getMinusSCEV(NegDist, CastedProduct);
+ if (SE.isKnownPositive(Minus))
+ return true;
+
+ return false;
+}
+
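To make (**) concrete for the PR31098 loop above: with D = 100 and 4-byte elements, Dist is 400 bytes, ByteStride is 4, and BackedgeTakenCount is 99, so Dist - BackedgeTakenCount * Step = 400 - 396 = 4 > 0; the accesses are proven independent for this trip count with no runtime check.
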
/// \brief Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance and \p TypeByteSize is type size in
/// bytes.
@@ -1292,21 +1339,26 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
return Dependence::Unknown;
}
+ Type *ATy = APtr->getType()->getPointerElementType();
+ Type *BTy = BPtr->getType()->getPointerElementType();
+ auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
+ uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
+ uint64_t Stride = std::abs(StrideAPtr);
const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
if (!C) {
+ if (TypeByteSize == DL.getTypeAllocSize(BTy) &&
+ isSafeDependenceDistance(DL, *(PSE.getSE()),
+ *(PSE.getBackedgeTakenCount()), *Dist, Stride,
+ TypeByteSize))
+ return Dependence::NoDep;
+
DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
ShouldRetryWithRuntimeCheck = true;
return Dependence::Unknown;
}
- Type *ATy = APtr->getType()->getPointerElementType();
- Type *BTy = BPtr->getType()->getPointerElementType();
- auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
- uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
-
const APInt &Val = C->getAPInt();
int64_t Distance = Val.getSExtValue();
- uint64_t Stride = std::abs(StrideAPtr);
// Attempt to prove strided accesses independent.
if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
@@ -1427,12 +1479,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
}
bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
- MemAccessInfoSet &CheckDeps,
+ MemAccessInfoList &CheckDeps,
const ValueToValueMap &Strides) {
MaxSafeDepDistBytes = -1;
- while (!CheckDeps.empty()) {
- MemAccessInfo CurAccess = *CheckDeps.begin();
+ SmallPtrSet<MemAccessInfo, 8> Visited;
+ for (MemAccessInfo CurAccess : CheckDeps) {
+ if (Visited.count(CurAccess))
+ continue;
// Get the relevant memory access set.
EquivalenceClasses<MemAccessInfo>::iterator I =
@@ -1446,7 +1500,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
// Check every access pair.
while (AI != AE) {
- CheckDeps.erase(*AI);
+ Visited.insert(*AI);
EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
while (OI != AE) {
// Check every accessing instruction pair in program order.
@@ -1885,7 +1939,10 @@ expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
Value *NewPtr = (Inst && TheLoop->contains(Inst))
? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
: Ptr;
- return {NewPtr, NewPtr};
+ // We must return a half-open range, which means incrementing Sc.
+ const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
+ Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
+ return {NewPtr, NewPtrPlusOne};
} else {
Value *Start = nullptr, *End = nullptr;
DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
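
The +1 matters because the emitted memchecks treat [Start, End) as half-open and, roughly, report a conflict only when each range's Start lies below the other's End; if End equaled Start for a single-pointer group, a genuinely conflicting access at exactly that address could be declared disjoint.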
diff --git a/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp b/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp
index 5be3ee3..e4a0f90 100644
--- a/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp
+++ b/contrib/llvm/lib/Analysis/LoopAnalysisManager.cpp
@@ -31,24 +31,10 @@ bool LoopAnalysisManagerFunctionProxy::Result::invalidate(
FunctionAnalysisManager::Invalidator &Inv) {
// First compute the sequence of IR units covered by this proxy. We will want
// to visit this in postorder, but because this is a tree structure we can do
- // this by building a preorder sequence and walking it in reverse.
- SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
- // Note that we want to walk the roots in reverse order because we will end
- // up reversing the preorder sequence. However, it happens that the loop nest
- // roots are in reverse order within the LoopInfo object. So we just walk
- // forward here.
- // FIXME: If we change the order of LoopInfo we will want to add a reverse
- // here.
- for (Loop *RootL : *LI) {
- assert(PreOrderWorklist.empty() &&
- "Must start with an empty preorder walk worklist.");
- PreOrderWorklist.push_back(RootL);
- do {
- Loop *L = PreOrderWorklist.pop_back_val();
- PreOrderWorklist.append(L->begin(), L->end());
- PreOrderLoops.push_back(L);
- } while (!PreOrderWorklist.empty());
- }
+ // this by building a preorder sequence and walking it backwards. We also
+ // want siblings in forward program order to match the LoopPassManager so we
+ // get the preorder with siblings reversed.
+ SmallVector<Loop *, 4> PreOrderLoops = LI->getLoopsInReverseSiblingPreorder();
// If this proxy or the loop info is going to be invalidated, we also need
// to clear all the keys coming from that analysis. We also completely blow
@@ -145,7 +131,6 @@ LoopAnalysisManagerFunctionProxy::run(Function &F,
PreservedAnalyses llvm::getLoopPassPreservedAnalyses() {
PreservedAnalyses PA;
- PA.preserve<AssumptionAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
PA.preserve<LoopAnalysisManagerFunctionProxy>();
diff --git a/contrib/llvm/lib/Analysis/LoopInfo.cpp b/contrib/llvm/lib/Analysis/LoopInfo.cpp
index f449ce9..697b586 100644
--- a/contrib/llvm/lib/Analysis/LoopInfo.cpp
+++ b/contrib/llvm/lib/Analysis/LoopInfo.cpp
@@ -40,9 +40,9 @@ template class llvm::LoopInfoBase<BasicBlock, Loop>;
// Always verify loopinfo if expensive checking is enabled.
#ifdef EXPENSIVE_CHECKS
-static bool VerifyLoopInfo = true;
+bool llvm::VerifyLoopInfo = true;
#else
-static bool VerifyLoopInfo = false;
+bool llvm::VerifyLoopInfo = false;
#endif
static cl::opt<bool,true>
VerifyLoopInfoX("verify-loop-info", cl::location(VerifyLoopInfo),
@@ -131,13 +131,13 @@ PHINode *Loop::getCanonicalInductionVariable() const {
PHINode *PN = cast<PHINode>(I);
if (ConstantInt *CI =
dyn_cast<ConstantInt>(PN->getIncomingValueForBlock(Incoming)))
- if (CI->isNullValue())
+ if (CI->isZero())
if (Instruction *Inc =
dyn_cast<Instruction>(PN->getIncomingValueForBlock(Backedge)))
if (Inc->getOpcode() == Instruction::Add &&
Inc->getOperand(0) == PN)
if (ConstantInt *CI = dyn_cast<ConstantInt>(Inc->getOperand(1)))
- if (CI->equalsInt(1))
+ if (CI->isOne())
return PN;
}
return nullptr;
@@ -211,9 +211,11 @@ bool Loop::isSafeToClone() const {
MDNode *Loop::getLoopID() const {
MDNode *LoopID = nullptr;
- if (isLoopSimplifyForm()) {
- LoopID = getLoopLatch()->getTerminator()->getMetadata(LLVMContext::MD_loop);
+ if (BasicBlock *Latch = getLoopLatch()) {
+ LoopID = Latch->getTerminator()->getMetadata(LLVMContext::MD_loop);
} else {
+ assert(!getLoopLatch() &&
+ "The loop should have no single latch at this point");
// Go through each predecessor of the loop header and check the
// terminator for the metadata.
BasicBlock *H = getHeader();
@@ -248,11 +250,12 @@ void Loop::setLoopID(MDNode *LoopID) const {
assert(LoopID->getNumOperands() > 0 && "Loop ID needs at least one operand");
assert(LoopID->getOperand(0) == LoopID && "Loop ID should refer to itself");
- if (isLoopSimplifyForm()) {
- getLoopLatch()->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
+ if (BasicBlock *Latch = getLoopLatch()) {
+ Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
return;
}
+ assert(!getLoopLatch() && "The loop should have no single latch at this point");
BasicBlock *H = getHeader();
for (BasicBlock *BB : this->blocks()) {
TerminatorInst *TI = BB->getTerminator();
@@ -457,7 +460,7 @@ protected:
void UnloopUpdater::updateBlockParents() {
if (Unloop.getNumBlocks()) {
// Perform a post order CFG traversal of all blocks within this loop,
- // propagating the nearest loop from sucessors to predecessors.
+ // propagating the nearest loop from successors to predecessors.
LoopBlocksTraversal Traversal(DFS, LI);
for (BasicBlock *POI : Traversal) {
@@ -606,10 +609,19 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
return NearLoop;
}
-LoopInfo::LoopInfo(const DominatorTreeBase<BasicBlock> &DomTree) {
+LoopInfo::LoopInfo(const DomTreeBase<BasicBlock> &DomTree) {
analyze(DomTree);
}
+bool LoopInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<LoopAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
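The new CFGAnalyses escape hatch means instruction-only transforms no longer pay for a LoopInfo recompute. A hypothetical pass body showing the idiom:

    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    struct InstOnlyPass : PassInfoMixin<InstOnlyPass> {
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
        // ... rewrite instructions in place; no blocks or edges change ...
        PreservedAnalyses PA;
        PA.preserveSet<CFGAnalyses>(); // satisfies LoopInfo::invalidate above
        return PA;
      }
    };
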
void LoopInfo::markAsRemoved(Loop *Unloop) {
assert(!Unloop->isInvalid() && "Loop has already been removed");
Unloop->invalidate();
diff --git a/contrib/llvm/lib/Analysis/LoopPass.cpp b/contrib/llvm/lib/Analysis/LoopPass.cpp
index 3f4a079..e988f64 100644
--- a/contrib/llvm/lib/Analysis/LoopPass.cpp
+++ b/contrib/llvm/lib/Analysis/LoopPass.cpp
@@ -54,6 +54,8 @@ public:
}
return false;
}
+
+ StringRef getPassName() const override { return "Print Loop IR"; }
};
char PrintLoopPassWrapper::ID = 0;
@@ -71,30 +73,23 @@ LPPassManager::LPPassManager()
CurrentLoop = nullptr;
}
-// Inset loop into loop nest (LoopInfo) and loop queue (LQ).
-Loop &LPPassManager::addLoop(Loop *ParentLoop) {
- // Create a new loop. LI will take ownership.
- Loop *L = new Loop();
-
- // Insert into the loop nest and the loop queue.
- if (!ParentLoop) {
+// Insert loop into loop nest (LoopInfo) and loop queue (LQ).
+void LPPassManager::addLoop(Loop &L) {
+ if (!L.getParentLoop()) {
// This is the top level loop.
- LI->addTopLevelLoop(L);
- LQ.push_front(L);
- return *L;
+ LQ.push_front(&L);
+ return;
}
- ParentLoop->addChildLoop(L);
// Insert L into the loop queue after the parent loop.
for (auto I = LQ.begin(), E = LQ.end(); I != E; ++I) {
- if (*I == L->getParentLoop()) {
+ if (*I == L.getParentLoop()) {
// deque does not support insert after.
++I;
- LQ.insert(I, 1, L);
- break;
+ LQ.insert(I, 1, &L);
+ return;
}
}
- return *L;
}
/// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for
diff --git a/contrib/llvm/lib/Analysis/MemDepPrinter.cpp b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp
index e7a85ae..5c0cbb2 100644
--- a/contrib/llvm/lib/Analysis/MemDepPrinter.cpp
+++ b/contrib/llvm/lib/Analysis/MemDepPrinter.cpp
@@ -10,9 +10,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/Passes.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/Passes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/LLVMContext.h"
diff --git a/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp b/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp
index fa0cc5a..4231a78 100644
--- a/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp
+++ b/contrib/llvm/lib/Analysis/MemDerefPrinter.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/Passes.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/Passes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstIterator.h"
diff --git a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
index 2d82740..7327c07 100644
--- a/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/contrib/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -37,6 +37,7 @@ enum AllocType : uint8_t {
CallocLike = 1<<2, // allocates + bzero
ReallocLike = 1<<3, // reallocates
StrDupLike = 1<<4,
+ MallocOrCallocLike = MallocLike | CallocLike,
AllocLike = MallocLike | CallocLike | StrDupLike,
AnyAlloc = AllocLike | ReallocLike
};
@@ -50,35 +51,35 @@ struct AllocFnsTy {
// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
// know which functions are nounwind, noalias, nocapture parameters, etc.
-static const std::pair<LibFunc::Func, AllocFnsTy> AllocationFnData[] = {
- {LibFunc::malloc, {MallocLike, 1, 0, -1}},
- {LibFunc::valloc, {MallocLike, 1, 0, -1}},
- {LibFunc::Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
- {LibFunc::ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
- {LibFunc::Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long)
- {LibFunc::ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow)
- {LibFunc::Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
- {LibFunc::ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
- {LibFunc::Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long)
- {LibFunc::ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow)
- {LibFunc::msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
- {LibFunc::msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
- {LibFunc::msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long)
- {LibFunc::msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow)
- {LibFunc::msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
- {LibFunc::msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
- {LibFunc::msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long)
- {LibFunc::msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow)
- {LibFunc::calloc, {CallocLike, 2, 0, 1}},
- {LibFunc::realloc, {ReallocLike, 2, 1, -1}},
- {LibFunc::reallocf, {ReallocLike, 2, 1, -1}},
- {LibFunc::strdup, {StrDupLike, 1, -1, -1}},
- {LibFunc::strndup, {StrDupLike, 2, 1, -1}}
+static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
+ {LibFunc_malloc, {MallocLike, 1, 0, -1}},
+ {LibFunc_valloc, {MallocLike, 1, 0, -1}},
+ {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
+ {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
+ {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long)
+ {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow)
+ {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
+ {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long)
+ {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow)
+ {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
+ {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
+ {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long)
+ {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow)
+ {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
+ {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long)
+ {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow)
+ {LibFunc_calloc, {CallocLike, 2, 0, 1}},
+ {LibFunc_realloc, {ReallocLike, 2, 1, -1}},
+ {LibFunc_reallocf, {ReallocLike, 2, 1, -1}},
+ {LibFunc_strdup, {StrDupLike, 1, -1, -1}},
+ {LibFunc_strndup, {StrDupLike, 2, 1, -1}}
// TODO: Handle "int posix_memalign(void **, size_t, size_t)"
};
-static Function *getCalledFunction(const Value *V, bool LookThroughBitCast,
- bool &IsNoBuiltin) {
+static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast,
+ bool &IsNoBuiltin) {
// Don't care about intrinsics in this case.
if (isa<IntrinsicInst>(V))
return nullptr;
@@ -86,13 +87,13 @@ static Function *getCalledFunction(const Value *V, bool LookThroughBitCast,
if (LookThroughBitCast)
V = V->stripPointerCasts();
- CallSite CS(const_cast<Value*>(V));
+ ImmutableCallSite CS(V);
if (!CS.getInstruction())
return nullptr;
IsNoBuiltin = CS.isNoBuiltin();
- Function *Callee = CS.getCalledFunction();
+ const Function *Callee = CS.getCalledFunction();
if (!Callee || !Callee->isDeclaration())
return nullptr;
return Callee;
@@ -106,12 +107,12 @@ getAllocationDataForFunction(const Function *Callee, AllocType AllocTy,
const TargetLibraryInfo *TLI) {
// Make sure that the function is available.
StringRef FnName = Callee->getName();
- LibFunc::Func TLIFn;
+ LibFunc TLIFn;
if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
return None;
const auto *Iter = find_if(
- AllocationFnData, [TLIFn](const std::pair<LibFunc::Func, AllocFnsTy> &P) {
+ AllocationFnData, [TLIFn](const std::pair<LibFunc, AllocFnsTy> &P) {
return P.first == TLIFn;
});
@@ -183,7 +184,7 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V,
static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
- return CS && CS.paramHasAttr(AttributeSet::ReturnIndex, Attribute::NoAlias);
+ return CS && CS.hasRetAttr(Attribute::NoAlias);
}
@@ -220,6 +221,14 @@ bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
}
/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory similar to malloc or calloc.
+bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, MallocOrCallocLike, TLI,
+ LookThroughBitCast).hasValue();
+}
+
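A minimal usage sketch of the new classifier, e.g. for a client that wants "a fresh block whose initial contents are known or irrelevant"; the helper is illustrative:

    #include "llvm/Analysis/MemoryBuiltins.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    using namespace llvm;

    static bool allocatesFreshBlock(const Value *V,
                                    const TargetLibraryInfo *TLI) {
      // Matches malloc/valloc/operator new (MallocLike) and calloc
      // (CallocLike), but not realloc or strdup.
      return isMallocOrCallocLikeFn(V, TLI);
    }
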
+/// \brief Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
@@ -333,33 +342,33 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
return nullptr;
StringRef FnName = Callee->getName();
- LibFunc::Func TLIFn;
+ LibFunc TLIFn;
if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
return nullptr;
unsigned ExpectedNumParams;
- if (TLIFn == LibFunc::free ||
- TLIFn == LibFunc::ZdlPv || // operator delete(void*)
- TLIFn == LibFunc::ZdaPv || // operator delete[](void*)
- TLIFn == LibFunc::msvc_delete_ptr32 || // operator delete(void*)
- TLIFn == LibFunc::msvc_delete_ptr64 || // operator delete(void*)
- TLIFn == LibFunc::msvc_delete_array_ptr32 || // operator delete[](void*)
- TLIFn == LibFunc::msvc_delete_array_ptr64) // operator delete[](void*)
+ if (TLIFn == LibFunc_free ||
+ TLIFn == LibFunc_ZdlPv || // operator delete(void*)
+ TLIFn == LibFunc_ZdaPv || // operator delete[](void*)
+ TLIFn == LibFunc_msvc_delete_ptr32 || // operator delete(void*)
+ TLIFn == LibFunc_msvc_delete_ptr64 || // operator delete(void*)
+ TLIFn == LibFunc_msvc_delete_array_ptr32 || // operator delete[](void*)
+ TLIFn == LibFunc_msvc_delete_array_ptr64) // operator delete[](void*)
ExpectedNumParams = 1;
- else if (TLIFn == LibFunc::ZdlPvj || // delete(void*, uint)
- TLIFn == LibFunc::ZdlPvm || // delete(void*, ulong)
- TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
- TLIFn == LibFunc::ZdaPvj || // delete[](void*, uint)
- TLIFn == LibFunc::ZdaPvm || // delete[](void*, ulong)
- TLIFn == LibFunc::ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow)
- TLIFn == LibFunc::msvc_delete_ptr32_int || // delete(void*, uint)
- TLIFn == LibFunc::msvc_delete_ptr64_longlong || // delete(void*, ulonglong)
- TLIFn == LibFunc::msvc_delete_ptr32_nothrow || // delete(void*, nothrow)
- TLIFn == LibFunc::msvc_delete_ptr64_nothrow || // delete(void*, nothrow)
- TLIFn == LibFunc::msvc_delete_array_ptr32_int || // delete[](void*, uint)
- TLIFn == LibFunc::msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong)
- TLIFn == LibFunc::msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow)
- TLIFn == LibFunc::msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow)
+ else if (TLIFn == LibFunc_ZdlPvj || // delete(void*, uint)
+ TLIFn == LibFunc_ZdlPvm || // delete(void*, ulong)
+ TLIFn == LibFunc_ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
+ TLIFn == LibFunc_ZdaPvj || // delete[](void*, uint)
+ TLIFn == LibFunc_ZdaPvm || // delete[](void*, ulong)
+ TLIFn == LibFunc_ZdaPvRKSt9nothrow_t || // delete[](void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_ptr32_int || // delete(void*, uint)
+ TLIFn == LibFunc_msvc_delete_ptr64_longlong || // delete(void*, ulonglong)
+ TLIFn == LibFunc_msvc_delete_ptr32_nothrow || // delete(void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_ptr64_nothrow || // delete(void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_array_ptr32_int || // delete[](void*, uint)
+ TLIFn == LibFunc_msvc_delete_array_ptr64_longlong || // delete[](void*, ulonglong)
+ TLIFn == LibFunc_msvc_delete_array_ptr32_nothrow || // delete[](void*, nothrow)
+ TLIFn == LibFunc_msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow)
ExpectedNumParams = 2;
else
return nullptr;
@@ -391,13 +400,11 @@ static APInt getSizeWithOverflow(const SizeOffsetType &Data) {
/// \brief Compute the size of the object pointed by Ptr. Returns true and the
/// object size in Size if successful, and false otherwise.
-/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
-/// byval arguments, and global variables.
+/// If RoundToAlign is true, then Size is rounded up to the alignment of
+/// allocas, byval arguments, and global variables.
bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
- const TargetLibraryInfo *TLI, bool RoundToAlign,
- llvm::ObjSizeMode Mode) {
- ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(),
- RoundToAlign, Mode);
+ const TargetLibraryInfo *TLI, ObjectSizeOpts Opts) {
+ ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), Opts);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (!Visitor.bothKnown(Data))
return false;
@@ -414,19 +421,23 @@ ConstantInt *llvm::lowerObjectSizeCall(IntrinsicInst *ObjectSize,
"ObjectSize must be a call to llvm.objectsize!");
bool MaxVal = cast<ConstantInt>(ObjectSize->getArgOperand(1))->isZero();
- ObjSizeMode Mode;
+ ObjectSizeOpts EvalOptions;
// Unless we have to fold this to something, try to be as accurate as
// possible.
if (MustSucceed)
- Mode = MaxVal ? ObjSizeMode::Max : ObjSizeMode::Min;
+ EvalOptions.EvalMode =
+ MaxVal ? ObjectSizeOpts::Mode::Max : ObjectSizeOpts::Mode::Min;
else
- Mode = ObjSizeMode::Exact;
+ EvalOptions.EvalMode = ObjectSizeOpts::Mode::Exact;
+
+ EvalOptions.NullIsUnknownSize =
+ cast<ConstantInt>(ObjectSize->getArgOperand(2))->isOne();
// FIXME: Does it make sense to just return a failure value if the size won't
// fit in the output and `!MustSucceed`?
uint64_t Size;
auto *ResultType = cast<IntegerType>(ObjectSize->getType());
- if (getObjectSize(ObjectSize->getArgOperand(0), Size, DL, TLI, false, Mode) &&
+ if (getObjectSize(ObjectSize->getArgOperand(0), Size, DL, TLI, EvalOptions) &&
isUIntN(ResultType->getBitWidth(), Size))
return ConstantInt::get(ResultType, Size);
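
The options struct keeps call sites readable as new knobs appear. A usage sketch with an illustrative helper:

    #include "llvm/Analysis/MemoryBuiltins.h"
    using namespace llvm;

    static bool upperBoundObjectSize(const Value *Ptr, uint64_t &Size,
                                     const DataLayout &DL,
                                     const TargetLibraryInfo *TLI) {
      ObjectSizeOpts Opts;
      Opts.EvalMode = ObjectSizeOpts::Mode::Max; // fold selects/phis upward
      Opts.NullIsUnknownSize = true;             // null in AS 0 is unknown
      return getObjectSize(Ptr, Size, DL, TLI, Opts);
    }
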
@@ -443,7 +454,7 @@ STATISTIC(ObjectVisitorLoad,
APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
- if (RoundToAlign && Align)
+ if (Options.RoundToAlign && Align)
return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align));
return Size;
}
@@ -451,9 +462,8 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout &DL,
const TargetLibraryInfo *TLI,
LLVMContext &Context,
- bool RoundToAlign,
- ObjSizeMode Mode)
- : DL(DL), TLI(TLI), RoundToAlign(RoundToAlign), Mode(Mode) {
+ ObjectSizeOpts Options)
+ : DL(DL), TLI(TLI), Options(Options) {
// Pointer size must be rechecked for each object visited since it could have
// a different address space.
}
@@ -495,6 +505,22 @@ SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
return unknown();
}
+/// When we're compiling N-bit code, and the user uses parameters that are
+/// greater than N bits (e.g. uint64_t on a 32-bit build), we can run into
+/// trouble with APInt size issues. This function handles resizing + overflow
+/// checks for us. Check and zext or trunc \p I depending on IntTyBits and
+/// I's value.
+bool ObjectSizeOffsetVisitor::CheckedZextOrTrunc(APInt &I) {
+ // More bits than we can handle. Checking the bit width isn't necessary, but
+ // it's faster than checking active bits, and should give `false` in the
+ // vast majority of cases.
+ if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits)
+ return false;
+ if (I.getBitWidth() != IntTyBits)
+ I = I.zextOrTrunc(IntTyBits);
+ return true;
+}
+
SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
if (!I.getAllocatedType()->isSized())
return unknown();
@@ -505,8 +531,14 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
Value *ArraySize = I.getArraySize();
if (const ConstantInt *C = dyn_cast<ConstantInt>(ArraySize)) {
- Size *= C->getValue().zextOrSelf(IntTyBits);
- return std::make_pair(align(Size, I.getAlignment()), Zero);
+ APInt NumElems = C->getValue();
+ if (!CheckedZextOrTrunc(NumElems))
+ return unknown();
+
+ bool Overflow;
+ Size = Size.umul_ov(NumElems, Overflow);
+ return Overflow ? unknown() : std::make_pair(align(Size, I.getAlignment()),
+ Zero);
}
return unknown();
}
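
The same overflow discipline in a standalone, hedged form (names made up): normalize both APInts to the index width the way CheckedZextOrTrunc does, then multiply with explicit overflow detection.

    #include "llvm/ADT/APInt.h"
    using namespace llvm;

    static bool scaledAllocSize(APInt ElemSize, APInt NumElems,
                                unsigned IntTyBits, APInt &Out) {
      auto Normalize = [IntTyBits](APInt &I) {
        if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits)
          return false;                // value doesn't fit the index width
        if (I.getBitWidth() != IntTyBits)
          I = I.zextOrTrunc(IntTyBits);
        return true;
      };
      if (!Normalize(ElemSize) || !Normalize(NumElems))
        return false;
      bool Overflow;
      Out = ElemSize.umul_ov(NumElems, Overflow); // checked unsigned multiply
      return !Overflow;
    }

    // e.g. scaledAllocSize(APInt(64, 8), APInt(32, 10), 32, R) sets R to 80.
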
@@ -551,21 +583,6 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
if (!Arg)
return unknown();
- // When we're compiling N-bit code, and the user uses parameters that are
- // greater than N bits (e.g. uint64_t on a 32-bit build), we can run into
- // trouble with APInt size issues. This function handles resizing + overflow
- // checks for us.
- auto CheckedZextOrTrunc = [&](APInt &I) {
- // More bits than we can handle. Checking the bit width isn't necessary, but
- // it's faster than checking active bits, and should give `false` in the
- // vast majority of cases.
- if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits)
- return false;
- if (I.getBitWidth() != IntTyBits)
- I = I.zextOrTrunc(IntTyBits);
- return true;
- };
-
APInt Size = Arg->getValue();
if (!CheckedZextOrTrunc(Size))
return unknown();
@@ -596,7 +613,9 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
}
SizeOffsetType
-ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull&) {
+ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull& CPN) {
+ if (Options.NullIsUnknownSize && CPN.getType()->getAddressSpace() == 0)
+ return unknown();
return std::make_pair(Zero, Zero);
}
@@ -663,12 +682,12 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
if (TrueResult == FalseResult) {
return TrueSide;
}
- if (Mode == ObjSizeMode::Min) {
+ if (Options.EvalMode == ObjectSizeOpts::Mode::Min) {
if (TrueResult.slt(FalseResult))
return TrueSide;
return FalseSide;
}
- if (Mode == ObjSizeMode::Max) {
+ if (Options.EvalMode == ObjectSizeOpts::Mode::Max) {
if (TrueResult.sgt(FalseResult))
return TrueSide;
return FalseSide;
@@ -719,7 +738,10 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
- ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, RoundToAlign);
+ ObjectSizeOpts ObjSizeOptions;
+ ObjSizeOptions.RoundToAlign = RoundToAlign;
+
+ ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, ObjSizeOptions);
SizeOffsetType Const = Visitor.compute(V);
if (Visitor.bothKnown(Const))
return std::make_pair(ConstantInt::get(Context, Const.first),
diff --git a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 66a0d14..263cf42 100644
--- a/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -15,17 +15,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -310,11 +310,11 @@ unsigned MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
}
static bool isVolatile(Instruction *Inst) {
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI->isVolatile();
- else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
return SI->isVolatile();
- else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ if (auto *AI = dyn_cast<AtomicCmpXchgInst>(Inst))
return AI->isVolatile();
return false;
}
@@ -691,6 +691,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
// load query, we can safely ignore it (scan past it).
if (isLoad)
continue;
+ LLVM_FALLTHROUGH;
default:
// Otherwise, there is a potential dependence. Return a clobber.
return MemDepResult::getClobber(Inst);
diff --git a/contrib/llvm/lib/Analysis/MemoryLocation.cpp b/contrib/llvm/lib/Analysis/MemoryLocation.cpp
index a0ae72f..9db6c49 100644
--- a/contrib/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/contrib/llvm/lib/Analysis/MemoryLocation.cpp
@@ -142,9 +142,9 @@ MemoryLocation MemoryLocation::getForArgument(ImmutableCallSite CS,
// for memcpy/memset. This is particularly important because the
// LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
// whenever possible.
- LibFunc::Func F;
+ LibFunc F;
if (CS.getCalledFunction() && TLI.getLibFunc(*CS.getCalledFunction(), F) &&
- F == LibFunc::memset_pattern16 && TLI.has(F)) {
+ F == LibFunc_memset_pattern16 && TLI.has(F)) {
assert((ArgIdx == 0 || ArgIdx == 1) &&
"Invalid argument index for memset_pattern16");
if (ArgIdx == 1)
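
The change above is the mechanical rename from the scoped LibFunc::Func enum
to a plain LibFunc enum with LibFunc_-prefixed values. A sketch of the query
pattern after the rename (CS and TLI assumed in scope):

    LibFunc F;
    if (const Function *Callee = CS.getCalledFunction())
      if (TLI.getLibFunc(*Callee, F) && TLI.has(F) &&
          F == LibFunc_memset_pattern16)
        ; // treat the call as a 16-byte pattern store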
diff --git a/contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp b/contrib/llvm/lib/Analysis/MemorySSA.cpp
index 1ce4225..86de474 100644
--- a/contrib/llvm/lib/Transforms/Utils/MemorySSA.cpp
+++ b/contrib/llvm/lib/Analysis/MemorySSA.cpp
@@ -10,7 +10,7 @@
// This file implements the MemorySSA class.
//
//===----------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
@@ -39,15 +39,10 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Transforms/Scalar.h"
#include <algorithm>
#define DEBUG_TYPE "memoryssa"
using namespace llvm;
-STATISTIC(NumClobberCacheLookups, "Number of Memory SSA version cache lookups");
-STATISTIC(NumClobberCacheHits, "Number of Memory SSA version cache hits");
-STATISTIC(NumClobberCacheInserts, "Number of MemorySSA version cache inserts");
-
INITIALIZE_PASS_BEGIN(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false,
true)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
@@ -145,8 +140,8 @@ public:
private:
union {
- ImmutableCallSite CS;
- MemoryLocation Loc;
+ ImmutableCallSite CS;
+ MemoryLocation Loc;
};
};
}
@@ -218,12 +213,16 @@ static bool instructionClobbersQuery(MemoryDef *MD,
AliasAnalysis &AA) {
Instruction *DefInst = MD->getMemoryInst();
assert(DefInst && "Defining instruction not actually an instruction");
+ ImmutableCallSite UseCS(UseInst);
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
// These intrinsics will show up as affecting memory, but they are just
// markers.
switch (II->getIntrinsicID()) {
case Intrinsic::lifetime_start:
+ if (UseCS)
+ return false;
+ return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), UseLoc);
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
@@ -234,7 +233,6 @@ static bool instructionClobbersQuery(MemoryDef *MD,
}
}
- ImmutableCallSite UseCS(UseInst);
if (UseCS) {
ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
return I != MRI_NoModRef;
@@ -269,8 +267,8 @@ static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU,
}
// Return true when MD may alias MU, return false otherwise.
-bool defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
- AliasAnalysis &AA) {
+bool MemorySSAUtil::defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
+ AliasAnalysis &AA) {
return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA);
}
}
@@ -302,7 +300,6 @@ static bool lifetimeEndsAt(MemoryDef *MD, const MemoryLocation &Loc,
Instruction *Inst = MD->getMemoryInst();
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
switch (II->getIntrinsicID()) {
- case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), Loc);
default:
@@ -320,95 +317,8 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysis &AA,
// FIXME: We should handle invariant groups, as well. It's a bit harder,
// because we need to pay close attention to invariant group barriers.
return isa<LoadInst>(I) && (I->getMetadata(LLVMContext::MD_invariant_load) ||
- AA.pointsToConstantMemory(I));
-}
-
-/// Cache for our caching MemorySSA walker.
-class WalkerCache {
- DenseMap<ConstMemoryAccessPair, MemoryAccess *> Accesses;
- DenseMap<const MemoryAccess *, MemoryAccess *> Calls;
-
-public:
- MemoryAccess *lookup(const MemoryAccess *MA, const MemoryLocation &Loc,
- bool IsCall) const {
- ++NumClobberCacheLookups;
- MemoryAccess *R = IsCall ? Calls.lookup(MA) : Accesses.lookup({MA, Loc});
- if (R)
- ++NumClobberCacheHits;
- return R;
- }
-
- bool insert(const MemoryAccess *MA, MemoryAccess *To,
- const MemoryLocation &Loc, bool IsCall) {
- // This is fine for Phis, since there are times where we can't optimize
- // them. Making a def its own clobber is never correct, though.
- assert((MA != To || isa<MemoryPhi>(MA)) &&
- "Something can't clobber itself!");
-
- ++NumClobberCacheInserts;
- bool Inserted;
- if (IsCall)
- Inserted = Calls.insert({MA, To}).second;
- else
- Inserted = Accesses.insert({{MA, Loc}, To}).second;
-
- return Inserted;
- }
-
- bool remove(const MemoryAccess *MA, const MemoryLocation &Loc, bool IsCall) {
- return IsCall ? Calls.erase(MA) : Accesses.erase({MA, Loc});
- }
-
- void clear() {
- Accesses.clear();
- Calls.clear();
- }
-
- bool contains(const MemoryAccess *MA) const {
- for (auto &P : Accesses)
- if (P.first.first == MA || P.second == MA)
- return true;
- for (auto &P : Calls)
- if (P.first == MA || P.second == MA)
- return true;
- return false;
- }
-};
-
-/// Walks the defining uses of MemoryDefs. Stops after we hit something that has
-/// no defining use (e.g. a MemoryPhi or liveOnEntry). Note that, when comparing
-/// against a null def_chain_iterator, this will compare equal only after
-/// walking said Phi/liveOnEntry.
-struct def_chain_iterator
- : public iterator_facade_base<def_chain_iterator, std::forward_iterator_tag,
- MemoryAccess *> {
- def_chain_iterator() : MA(nullptr) {}
- def_chain_iterator(MemoryAccess *MA) : MA(MA) {}
-
- MemoryAccess *operator*() const { return MA; }
-
- def_chain_iterator &operator++() {
- // N.B. liveOnEntry has a null defining access.
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
- MA = MUD->getDefiningAccess();
- else
- MA = nullptr;
- return *this;
- }
-
- bool operator==(const def_chain_iterator &O) const { return MA == O.MA; }
-
-private:
- MemoryAccess *MA;
-};
-
-static iterator_range<def_chain_iterator>
-def_chain(MemoryAccess *MA, MemoryAccess *UpTo = nullptr) {
-#ifdef EXPENSIVE_CHECKS
- assert((!UpTo || find(def_chain(MA), UpTo) != def_chain_iterator()) &&
- "UpTo isn't in the def chain!");
-#endif
- return make_range(def_chain_iterator(MA), def_chain_iterator(UpTo));
+ AA.pointsToConstantMemory(cast<LoadInst>(I)->
+ getPointerOperand()));
}
/// Verifies that `Start` is clobbered by `ClobberAt`, and that nothing
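
The removed def_chain_iterator has presumably moved to a shared header, since
later hunks in this file still call def_chain(). A sketch of the walk it
provides, under that assumption:

    // Walk from MA through defining accesses; the chain ends past a
    // MemoryPhi or liveOnEntry, whose defining access is null.
    MemoryAccess *DefChainEnd = nullptr;
    for (MemoryAccess *Current : def_chain(MA))
      DefChainEnd = Current; // last element: a phi or liveOnEntry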
@@ -512,91 +422,24 @@ class ClobberWalker {
const MemorySSA &MSSA;
AliasAnalysis &AA;
DominatorTree &DT;
- WalkerCache &WC;
UpwardsMemoryQuery *Query;
- bool UseCache;
// Phi optimization bookkeeping
SmallVector<DefPath, 32> Paths;
DenseSet<ConstMemoryAccessPair> VisitedPhis;
- DenseMap<const BasicBlock *, MemoryAccess *> WalkTargetCache;
-
- void setUseCache(bool Use) { UseCache = Use; }
- bool shouldIgnoreCache() const {
- // UseCache will only be false when we're debugging, or when expensive
- // checks are enabled. In either case, we don't care deeply about speed.
- return LLVM_UNLIKELY(!UseCache);
- }
-
- void addCacheEntry(const MemoryAccess *What, MemoryAccess *To,
- const MemoryLocation &Loc) const {
-// EXPENSIVE_CHECKS because most of these queries are redundant.
-#ifdef EXPENSIVE_CHECKS
- assert(MSSA.dominates(To, What));
-#endif
- if (shouldIgnoreCache())
- return;
- WC.insert(What, To, Loc, Query->IsCall);
- }
-
- MemoryAccess *lookupCache(const MemoryAccess *MA, const MemoryLocation &Loc) {
- return shouldIgnoreCache() ? nullptr : WC.lookup(MA, Loc, Query->IsCall);
- }
-
- void cacheDefPath(const DefPath &DN, MemoryAccess *Target) const {
- if (shouldIgnoreCache())
- return;
-
- for (MemoryAccess *MA : def_chain(DN.First, DN.Last))
- addCacheEntry(MA, Target, DN.Loc);
-
- // DefPaths only express the path we walked. So, DN.Last could either be a
- // thing we want to cache, or not.
- if (DN.Last != Target)
- addCacheEntry(DN.Last, Target, DN.Loc);
- }
/// Find the nearest def or phi that `From` can legally be optimized to.
- ///
- /// FIXME: Deduplicate this with MSSA::findDominatingDef. Ideally, MSSA should
- /// keep track of this information for us, and allow us O(1) lookups of this
- /// info.
- MemoryAccess *getWalkTarget(const MemoryPhi *From) {
+ const MemoryAccess *getWalkTarget(const MemoryPhi *From) const {
assert(From->getNumOperands() && "Phi with no operands?");
BasicBlock *BB = From->getBlock();
- auto At = WalkTargetCache.find(BB);
- if (At != WalkTargetCache.end())
- return At->second;
-
- SmallVector<const BasicBlock *, 8> ToCache;
- ToCache.push_back(BB);
-
MemoryAccess *Result = MSSA.getLiveOnEntryDef();
DomTreeNode *Node = DT.getNode(BB);
while ((Node = Node->getIDom())) {
- auto At = WalkTargetCache.find(BB);
- if (At != WalkTargetCache.end()) {
- Result = At->second;
- break;
- }
-
- auto *Accesses = MSSA.getBlockAccesses(Node->getBlock());
- if (Accesses) {
- auto Iter = find_if(reverse(*Accesses), [](const MemoryAccess &MA) {
- return !isa<MemoryUse>(MA);
- });
- if (Iter != Accesses->rend()) {
- Result = const_cast<MemoryAccess *>(&*Iter);
- break;
- }
- }
-
- ToCache.push_back(Node->getBlock());
+ auto *Defs = MSSA.getBlockDefs(Node->getBlock());
+ if (Defs)
+ return &*Defs->rbegin();
}
-
- for (const BasicBlock *BB : ToCache)
- WalkTargetCache.insert({BB, Result});
return Result;
}
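
With the per-block defs list, "last def or phi in a block" becomes an O(1)
lookup, which is why the walk-target cache above could be deleted. A
hypothetical free-standing restatement of the new loop:

    static const MemoryAccess *
    nearestDominatingDefOrPhi(const MemorySSA &MSSA, DominatorTree &DT,
                              BasicBlock *BB) {
      DomTreeNode *Node = DT.getNode(BB);
      while ((Node = Node->getIDom()))
        if (const auto *Defs = MSSA.getBlockDefs(Node->getBlock()))
          return &*Defs->rbegin(); // last def or phi in that block
      return MSSA.getLiveOnEntryDef();
    }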
@@ -606,7 +449,6 @@ class ClobberWalker {
/// both.
MemoryAccess *Result;
bool IsKnownClobber;
- bool FromCache;
};
/// Walk to the next Phi or Clobber in the def chain starting at Desc.Last.
@@ -614,29 +456,25 @@ class ClobberWalker {
/// StopAt.
///
/// This does not test for whether StopAt is a clobber
- UpwardsWalkResult walkToPhiOrClobber(DefPath &Desc,
- MemoryAccess *StopAt = nullptr) {
+ UpwardsWalkResult
+ walkToPhiOrClobber(DefPath &Desc,
+ const MemoryAccess *StopAt = nullptr) const {
assert(!isa<MemoryUse>(Desc.Last) && "Uses don't exist in my world");
for (MemoryAccess *Current : def_chain(Desc.Last)) {
Desc.Last = Current;
if (Current == StopAt)
- return {Current, false, false};
+ return {Current, false};
if (auto *MD = dyn_cast<MemoryDef>(Current))
if (MSSA.isLiveOnEntryDef(MD) ||
instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA))
- return {MD, true, false};
-
- // Cache checks must be done last, because if Current is a clobber, the
- // cache will contain the clobber for Current.
- if (MemoryAccess *MA = lookupCache(Current, Desc.Loc))
- return {MA, true, true};
+ return {MD, true};
}
assert(isa<MemoryPhi>(Desc.Last) &&
"Ended at a non-clobber that's not a phi?");
- return {Desc.Last, false, false};
+ return {Desc.Last, false};
}
void addSearches(MemoryPhi *Phi, SmallVectorImpl<ListIndex> &PausedSearches,
@@ -666,7 +504,7 @@ class ClobberWalker {
/// If this returns None, NewPaused is a vector of searches that terminated
/// at StopWhere. Otherwise, NewPaused is left in an unspecified state.
Optional<TerminatedPath>
- getBlockingAccess(MemoryAccess *StopWhere,
+ getBlockingAccess(const MemoryAccess *StopWhere,
SmallVectorImpl<ListIndex> &PausedSearches,
SmallVectorImpl<ListIndex> &NewPaused,
SmallVectorImpl<TerminatedPath> &Terminated) {
@@ -701,11 +539,11 @@ class ClobberWalker {
UpwardsWalkResult Res = walkToPhiOrClobber(Node, /*StopAt=*/StopWhere);
if (Res.IsKnownClobber) {
- assert(Res.Result != StopWhere || Res.FromCache);
+ assert(Res.Result != StopWhere);
// If this wasn't a cache hit, we hit a clobber when walking. That's a
// failure.
TerminatedPath Term{Res.Result, PathIndex};
- if (!Res.FromCache || !MSSA.dominates(Res.Result, StopWhere))
+ if (!MSSA.dominates(Res.Result, StopWhere))
return Term;
// Otherwise, it's a valid thing to potentially optimize to.
@@ -830,7 +668,7 @@ class ClobberWalker {
assert(!MSSA.isLiveOnEntryDef(Current) &&
"liveOnEntry wasn't treated as a clobber?");
- MemoryAccess *Target = getWalkTarget(Current);
+ const auto *Target = getWalkTarget(Current);
// If a TerminatedPath doesn't dominate Target, then it wasn't a legal
// optimization for the prior phi.
assert(all_of(TerminatedPaths, [&](const TerminatedPath &P) {
@@ -842,8 +680,6 @@ class ClobberWalker {
// For the moment, this is fine, since we do nothing with blocker info.
if (Optional<TerminatedPath> Blocker = getBlockingAccess(
Target, PausedSearches, NewPaused, TerminatedPaths)) {
- // Cache our work on the blocking node, since we know that's correct.
- cacheDefPath(Paths[Blocker->LastNode], Blocker->Clobber);
// Find the node we started at. We can't search based on N->Last, since
// we may have gone around a loop with a different MemoryLocation.
@@ -908,7 +744,7 @@ class ClobberWalker {
// If we couldn't find the dominating phi/liveOnEntry in the above loop,
// do it now.
if (!DefChainEnd)
- for (MemoryAccess *MA : def_chain(Target))
+ for (auto *MA : def_chain(const_cast<MemoryAccess *>(Target)))
DefChainEnd = MA;
// If any of the terminated paths don't dominate the phi we'll try to
@@ -946,35 +782,6 @@ class ClobberWalker {
}
}
- /// Caches everything in an OptznResult.
- void cacheOptResult(const OptznResult &R) {
- if (R.OtherClobbers.empty()) {
- // If we're not going to be caching OtherClobbers, don't bother with
- // marking visited/etc.
- for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode))
- cacheDefPath(N, R.PrimaryClobber.Clobber);
- return;
- }
-
- // PrimaryClobber is our answer. If we can cache anything back, we need to
- // stop caching when we visit PrimaryClobber.
- SmallBitVector Visited(Paths.size());
- for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode)) {
- Visited[defPathIndex(N)] = true;
- cacheDefPath(N, R.PrimaryClobber.Clobber);
- }
-
- for (const TerminatedPath &P : R.OtherClobbers) {
- for (const DefPath &N : const_def_path(P.LastNode)) {
- ListIndex NIndex = defPathIndex(N);
- if (Visited[NIndex])
- break;
- Visited[NIndex] = true;
- cacheDefPath(N, P.Clobber);
- }
- }
- }
-
void verifyOptResult(const OptznResult &R) const {
assert(all_of(R.OtherClobbers, [&](const TerminatedPath &P) {
return MSSA.dominates(P.Clobber, R.PrimaryClobber.Clobber);
@@ -987,17 +794,14 @@ class ClobberWalker {
}
public:
- ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT,
- WalkerCache &WC)
- : MSSA(MSSA), AA(AA), DT(DT), WC(WC), UseCache(true) {}
+ ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT)
+ : MSSA(MSSA), AA(AA), DT(DT) {}
- void reset() { WalkTargetCache.clear(); }
+ void reset() {}
/// Finds the nearest clobber for the given query, optimizing phis if
/// possible.
- MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q,
- bool UseWalkerCache = true) {
- setUseCache(UseWalkerCache);
+ MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q) {
Query = &Q;
MemoryAccess *Current = Start;
@@ -1012,13 +816,11 @@ public:
UpwardsWalkResult WalkResult = walkToPhiOrClobber(FirstDesc);
MemoryAccess *Result;
if (WalkResult.IsKnownClobber) {
- cacheDefPath(FirstDesc, WalkResult.Result);
Result = WalkResult.Result;
} else {
OptznResult OptRes = tryOptimizePhi(cast<MemoryPhi>(FirstDesc.Last),
Current, Q.StartingLoc);
verifyOptResult(OptRes);
- cacheOptResult(OptRes);
resetPhiOptznState();
Result = OptRes.PrimaryClobber.Clobber;
}
@@ -1049,41 +851,10 @@ struct RenamePassData {
} // anonymous namespace
namespace llvm {
-/// \brief A MemorySSAWalker that does AA walks and caching of lookups to
-/// disambiguate accesses.
-///
-/// FIXME: The current implementation of this can take quadratic space in rare
-/// cases. This can be fixed, but it is something to note until it is fixed.
-///
-/// In order to trigger this behavior, you need to store to N distinct locations
-/// (that AA can prove don't alias), perform M stores to other memory
-/// locations that AA can prove don't alias any of the initial N locations, and
-/// then load from all of the N locations. In this case, we insert M cache
-/// entries for each of the N loads.
-///
-/// For example:
-/// define i32 @foo() {
-/// %a = alloca i32, align 4
-/// %b = alloca i32, align 4
-/// store i32 0, i32* %a, align 4
-/// store i32 0, i32* %b, align 4
-///
-/// ; Insert M stores to other memory that doesn't alias %a or %b here
-///
-/// %c = load i32, i32* %a, align 4 ; Caches M entries in
-/// ; CachedUpwardsClobberingAccess for the
-/// ; MemoryLocation %a
-/// %d = load i32, i32* %b, align 4 ; Caches M entries in
-/// ; CachedUpwardsClobberingAccess for the
-/// ; MemoryLocation %b
-///
-/// ; For completeness' sake, loading %a or %b again would not cache *another*
-/// ; M entries.
-/// %r = add i32 %c, %d
-/// ret i32 %r
-/// }
+/// \brief A MemorySSAWalker that does AA walks to disambiguate accesses. It
+/// no longer does caching on its own, but the name has been retained for the
+/// moment.
class MemorySSA::CachingWalker final : public MemorySSAWalker {
- WalkerCache Cache;
ClobberWalker Walker;
bool AutoResetWalker;
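
A usage sketch of the walker after this change, assuming the public MemorySSA
entry points of this revision (MSSA and LoadI are placeholders):

    // Caching now lives in the use/def's "optimized" slot, not the walker.
    MemorySSAWalker *Walker = MSSA.getWalker();
    if (MemoryAccess *MA = MSSA.getMemoryAccess(LoadI)) {
      MemoryAccess *Clobber = Walker->getClobberingMemoryAccess(MA);
      (void)Clobber; // nearest may-alias def, phi, or liveOnEntry
    }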
@@ -1104,10 +875,7 @@ public:
/// answer a clobber query.
void setAutoResetWalker(bool AutoReset) { AutoResetWalker = AutoReset; }
- /// Drop the walker's persistent data structures. At the moment, this means
- /// "drop the walker's cache of BasicBlocks ->
- /// earliest-MemoryAccess-we-can-optimize-to". This is necessary if we're
- /// going to have DT updates, if we remove MemoryAccesses, etc.
+ /// Drop the walker's persistent data structures.
void resetClobberWalker() { Walker.reset(); }
void verify(const MemorySSA *MSSA) override {
@@ -1116,18 +884,37 @@ public:
}
};
+void MemorySSA::renameSuccessorPhis(BasicBlock *BB, MemoryAccess *IncomingVal,
+ bool RenameAllUses) {
+ // Pass through values to our successors
+ for (const BasicBlock *S : successors(BB)) {
+ auto It = PerBlockAccesses.find(S);
+ // Rename the phi nodes in our successor block
+ if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front()))
+ continue;
+ AccessList *Accesses = It->second.get();
+ auto *Phi = cast<MemoryPhi>(&Accesses->front());
+ if (RenameAllUses) {
+ int PhiIndex = Phi->getBasicBlockIndex(BB);
+ assert(PhiIndex != -1 && "Incomplete phi during partial rename");
+ Phi->setIncomingValue(PhiIndex, IncomingVal);
+ } else
+ Phi->addIncoming(IncomingVal, BB);
+ }
+}
+
/// \brief Rename a single basic block into MemorySSA form.
/// Uses the standard SSA renaming algorithm.
/// \returns The new incoming value.
-MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
- MemoryAccess *IncomingVal) {
+MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB, MemoryAccess *IncomingVal,
+ bool RenameAllUses) {
auto It = PerBlockAccesses.find(BB);
// Skip most processing if the list is empty.
if (It != PerBlockAccesses.end()) {
AccessList *Accesses = It->second.get();
for (MemoryAccess &L : *Accesses) {
if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(&L)) {
- if (MUD->getDefiningAccess() == nullptr)
+ if (MUD->getDefiningAccess() == nullptr || RenameAllUses)
MUD->setDefiningAccess(IncomingVal);
if (isa<MemoryDef>(&L))
IncomingVal = &L;
@@ -1136,18 +923,6 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
}
}
}
-
- // Pass through values to our successors
- for (const BasicBlock *S : successors(BB)) {
- auto It = PerBlockAccesses.find(S);
- // Rename the phi nodes in our successor block
- if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front()))
- continue;
- AccessList *Accesses = It->second.get();
- auto *Phi = cast<MemoryPhi>(&Accesses->front());
- Phi->addIncoming(IncomingVal, BB);
- }
-
return IncomingVal;
}
@@ -1156,11 +931,19 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
/// We walk the dominator tree in preorder, renaming accesses, and then filling
/// in phi nodes in our successors.
void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
- SmallPtrSet<BasicBlock *, 16> &Visited) {
+ SmallPtrSetImpl<BasicBlock *> &Visited,
+ bool SkipVisited, bool RenameAllUses) {
SmallVector<RenamePassData, 32> WorkStack;
- IncomingVal = renameBlock(Root->getBlock(), IncomingVal);
+ // Skip everything if we already renamed this block and we are skipping.
+ // Note: You can't sink this into the if, because we need it to occur
+ // regardless of whether we skip blocks or not.
+ bool AlreadyVisited = !Visited.insert(Root->getBlock()).second;
+ if (SkipVisited && AlreadyVisited)
+ return;
+
+ IncomingVal = renameBlock(Root->getBlock(), IncomingVal, RenameAllUses);
+ renameSuccessorPhis(Root->getBlock(), IncomingVal, RenameAllUses);
WorkStack.push_back({Root, Root->begin(), IncomingVal});
- Visited.insert(Root->getBlock());
while (!WorkStack.empty()) {
DomTreeNode *Node = WorkStack.back().DTN;
@@ -1173,20 +956,25 @@ void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
DomTreeNode *Child = *ChildIt;
++WorkStack.back().ChildIt;
BasicBlock *BB = Child->getBlock();
- Visited.insert(BB);
- IncomingVal = renameBlock(BB, IncomingVal);
+ // Note: You can't sink this into the if, because we need it to occur
+ // regardless of whether we skip blocks or not.
+ AlreadyVisited = !Visited.insert(BB).second;
+ if (SkipVisited && AlreadyVisited) {
+ // We already visited this during our renaming, which can happen when
+ // being asked to rename multiple blocks. Figure out the incoming val,
+ // which is the last def.
+ // Incoming value can only change if there is a block def, and in that
+ // case, it's the last block def in the list.
+ if (auto *BlockDefs = getWritableBlockDefs(BB))
+ IncomingVal = &*BlockDefs->rbegin();
+ } else
+ IncomingVal = renameBlock(BB, IncomingVal, RenameAllUses);
+ renameSuccessorPhis(BB, IncomingVal, RenameAllUses);
WorkStack.push_back({Child, Child->begin(), IncomingVal});
}
}
}
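
Stripped of the MemorySSA bookkeeping, the renaming walk above is an
explicit-stack preorder traversal of the dominator tree; a minimal sketch:

    SmallVector<std::pair<DomTreeNode *, DomTreeNode::iterator>, 32> Stack;
    Stack.push_back({Root, Root->begin()});
    while (!Stack.empty()) {
      auto &Top = Stack.back();
      if (Top.second == Top.first->end()) {
        Stack.pop_back(); // all children of this node are done
        continue;
      }
      DomTreeNode *Child = *Top.second++;
      // visit(Child) goes here, before its own children are pushed
      Stack.push_back({Child, Child->begin()});
    }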
-/// \brief Compute dominator levels, used by the phi insertion algorithm above.
-void MemorySSA::computeDomLevels(DenseMap<DomTreeNode *, unsigned> &DomLevels) {
- for (auto DFI = df_begin(DT->getRootNode()), DFE = df_end(DT->getRootNode());
- DFI != DFE; ++DFI)
- DomLevels[*DFI] = DFI.getPathLength() - 1;
-}
-
/// \brief This handles unreachable block accesses by deleting phi nodes in
/// unreachable blocks, and marking all other unreachable MemoryAccess's as
/// being uses of the live on entry definition.
@@ -1247,6 +1035,13 @@ MemorySSA::AccessList *MemorySSA::getOrCreateAccessList(const BasicBlock *BB) {
Res.first->second = make_unique<AccessList>();
return Res.first->second.get();
}
+MemorySSA::DefsList *MemorySSA::getOrCreateDefsList(const BasicBlock *BB) {
+ auto Res = PerBlockDefs.insert(std::make_pair(BB, nullptr));
+
+ if (Res.second)
+ Res.first->second = make_unique<DefsList>();
+ return Res.first->second.get();
+}
/// This class is a batch walker of all MemoryUse's in the program, and points
/// their defining access at the thing that actually clobbers them. Because it
@@ -1315,7 +1110,10 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
// Pop everything that doesn't dominate the current block off the stack,
// increment the PopEpoch to account for this.
- while (!VersionStack.empty()) {
+ while (true) {
+ assert(
+ !VersionStack.empty() &&
+ "Version stack should have liveOnEntry sentinel dominating everything");
BasicBlock *BackBlock = VersionStack.back()->getBlock();
if (DT->dominates(BackBlock, BB))
break;
@@ -1323,6 +1121,7 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
VersionStack.pop_back();
++PopEpoch;
}
+
for (MemoryAccess &MA : *Accesses) {
auto *MU = dyn_cast<MemoryUse>(&MA);
if (!MU) {
@@ -1443,20 +1242,13 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
/// Optimize uses to point to their actual clobbering definitions.
void MemorySSA::OptimizeUses::optimizeUses() {
-
- // We perform a non-recursive top-down dominator tree walk
- struct StackInfo {
- const DomTreeNode *Node;
- DomTreeNode::const_iterator Iter;
- };
-
SmallVector<MemoryAccess *, 16> VersionStack;
- SmallVector<StackInfo, 16> DomTreeWorklist;
DenseMap<MemoryLocOrCall, MemlocStackInfo> LocStackInfo;
VersionStack.push_back(MSSA->getLiveOnEntryDef());
unsigned long StackEpoch = 1;
unsigned long PopEpoch = 1;
+ // We perform a non-recursive top-down dominator tree walk.
for (const auto *DomNode : depth_first(DT->getRootNode()))
optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack,
LocStackInfo);
@@ -1477,14 +1269,8 @@ void MemorySSA::placePHINodes(
});
// Now place MemoryPhi nodes.
- for (auto &BB : IDFBlocks) {
- // Insert phi node
- AccessList *Accesses = getOrCreateAccessList(BB);
- MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
- ValueToMemoryAccess[BB] = Phi;
- // Phi's always are placed at the front of the block.
- Accesses->push_front(Phi);
- }
+ for (auto &BB : IDFBlocks)
+ createMemoryPhi(BB);
}
void MemorySSA::buildMemorySSA() {
@@ -1504,27 +1290,30 @@ void MemorySSA::buildMemorySSA() {
// could just look up the memory access for every possible instruction in the
// stream.
SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
- SmallPtrSet<BasicBlock *, 32> DefUseBlocks;
// Go through each block, figure out where defs occur, and chain together all
// the accesses.
for (BasicBlock &B : F) {
BBNumbers[&B] = NextBBNum++;
bool InsertIntoDef = false;
AccessList *Accesses = nullptr;
+ DefsList *Defs = nullptr;
for (Instruction &I : B) {
MemoryUseOrDef *MUD = createNewAccess(&I);
if (!MUD)
continue;
- InsertIntoDef |= isa<MemoryDef>(MUD);
if (!Accesses)
Accesses = getOrCreateAccessList(&B);
Accesses->push_back(MUD);
+ if (isa<MemoryDef>(MUD)) {
+ InsertIntoDef = true;
+ if (!Defs)
+ Defs = getOrCreateDefsList(&B);
+ Defs->push_back(*MUD);
+ }
}
if (InsertIntoDef)
DefiningBlocks.insert(&B);
- if (Accesses)
- DefUseBlocks.insert(&B);
}
placePHINodes(DefiningBlocks, BBNumbers);
@@ -1558,14 +1347,94 @@ MemorySSA::CachingWalker *MemorySSA::getWalkerImpl() {
return Walker.get();
}
+// This is a helper function used by the creation routines. It places NewAccess
+// into the access and defs lists for a given basic block, at the given
+// insertion point.
+void MemorySSA::insertIntoListsForBlock(MemoryAccess *NewAccess,
+ const BasicBlock *BB,
+ InsertionPlace Point) {
+ auto *Accesses = getOrCreateAccessList(BB);
+ if (Point == Beginning) {
+ // If it's a phi node, it goes first, otherwise, it goes after any phi
+ // nodes.
+ if (isa<MemoryPhi>(NewAccess)) {
+ Accesses->push_front(NewAccess);
+ auto *Defs = getOrCreateDefsList(BB);
+ Defs->push_front(*NewAccess);
+ } else {
+ auto AI = find_if_not(
+ *Accesses, [](const MemoryAccess &MA) { return isa<MemoryPhi>(MA); });
+ Accesses->insert(AI, NewAccess);
+ if (!isa<MemoryUse>(NewAccess)) {
+ auto *Defs = getOrCreateDefsList(BB);
+ auto DI = find_if_not(
+ *Defs, [](const MemoryAccess &MA) { return isa<MemoryPhi>(MA); });
+ Defs->insert(DI, *NewAccess);
+ }
+ }
+ } else {
+ Accesses->push_back(NewAccess);
+ if (!isa<MemoryUse>(NewAccess)) {
+ auto *Defs = getOrCreateDefsList(BB);
+ Defs->push_back(*NewAccess);
+ }
+ }
+ BlockNumberingValid.erase(BB);
+}
+
+void MemorySSA::insertIntoListsBefore(MemoryAccess *What, const BasicBlock *BB,
+ AccessList::iterator InsertPt) {
+ auto *Accesses = getWritableBlockAccesses(BB);
+ bool WasEnd = InsertPt == Accesses->end();
+ Accesses->insert(AccessList::iterator(InsertPt), What);
+ if (!isa<MemoryUse>(What)) {
+ auto *Defs = getOrCreateDefsList(BB);
+ // If we got asked to insert at the end, we have an easy job, just shove it
+ // at the end. If we got asked to insert before an existing def, we also get
+ // an iterator. If we got asked to insert before a use, we have to hunt for
+ // the next def.
+ if (WasEnd) {
+ Defs->push_back(*What);
+ } else if (isa<MemoryDef>(InsertPt)) {
+ Defs->insert(InsertPt->getDefsIterator(), *What);
+ } else {
+ while (InsertPt != Accesses->end() && !isa<MemoryDef>(InsertPt))
+ ++InsertPt;
+ // Either we found a def, or we are inserting at the end
+ if (InsertPt == Accesses->end())
+ Defs->push_back(*What);
+ else
+ Defs->insert(InsertPt->getDefsIterator(), *What);
+ }
+ }
+ BlockNumberingValid.erase(BB);
+}
+
+// Move What before Where in the IR. The end result is that What will belong to
+// the right lists and have the right Block set, but will not otherwise be
+// correct. It will not have the right defining access, and if it is a def,
+// things below it will not properly be updated.
+void MemorySSA::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
+ AccessList::iterator Where) {
+ // Keep it in the lookup tables, remove from the lists
+ removeFromLists(What, false);
+ What->setBlock(BB);
+ insertIntoListsBefore(What, BB, Where);
+}
+
+void MemorySSA::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
+ InsertionPlace Point) {
+ removeFromLists(What, false);
+ What->setBlock(BB);
+ insertIntoListsForBlock(What, BB, Point);
+}
+
MemoryPhi *MemorySSA::createMemoryPhi(BasicBlock *BB) {
assert(!getMemoryAccess(BB) && "MemoryPhi already exists for this BB");
- AccessList *Accesses = getOrCreateAccessList(BB);
MemoryPhi *Phi = new MemoryPhi(BB->getContext(), BB, NextID++);
- ValueToMemoryAccess[BB] = Phi;
// Phi's always are placed at the front of the block.
- Accesses->push_front(Phi);
- BlockNumberingValid.erase(BB);
+ insertIntoListsForBlock(Phi, BB, Beginning);
+ ValueToMemoryAccess[BB] = Phi;
return Phi;
}
@@ -1580,72 +1449,19 @@ MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I,
return NewAccess;
}
-MemoryAccess *MemorySSA::createMemoryAccessInBB(Instruction *I,
- MemoryAccess *Definition,
- const BasicBlock *BB,
- InsertionPlace Point) {
- MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
- auto *Accesses = getOrCreateAccessList(BB);
- if (Point == Beginning) {
- // It goes after any phi nodes
- auto AI = find_if(
- *Accesses, [](const MemoryAccess &MA) { return !isa<MemoryPhi>(MA); });
-
- Accesses->insert(AI, NewAccess);
- } else {
- Accesses->push_back(NewAccess);
+// Return true if the instruction has ordering constraints.
+// Note specifically that this only considers stores and loads
+// because others are still considered ModRef by getModRefInfo.
+static inline bool isOrdered(const Instruction *I) {
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (!SI->isUnordered())
+ return true;
+ } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (!LI->isUnordered())
+ return true;
}
- BlockNumberingValid.erase(BB);
- return NewAccess;
-}
-
-MemoryUseOrDef *MemorySSA::createMemoryAccessBefore(Instruction *I,
- MemoryAccess *Definition,
- MemoryUseOrDef *InsertPt) {
- assert(I->getParent() == InsertPt->getBlock() &&
- "New and old access must be in the same block");
- MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
- auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
- Accesses->insert(AccessList::iterator(InsertPt), NewAccess);
- BlockNumberingValid.erase(InsertPt->getBlock());
- return NewAccess;
-}
-
-MemoryUseOrDef *MemorySSA::createMemoryAccessAfter(Instruction *I,
- MemoryAccess *Definition,
- MemoryAccess *InsertPt) {
- assert(I->getParent() == InsertPt->getBlock() &&
- "New and old access must be in the same block");
- MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
- auto *Accesses = getOrCreateAccessList(InsertPt->getBlock());
- Accesses->insertAfter(AccessList::iterator(InsertPt), NewAccess);
- BlockNumberingValid.erase(InsertPt->getBlock());
- return NewAccess;
-}
-
-void MemorySSA::spliceMemoryAccessAbove(MemoryDef *Where,
- MemoryUseOrDef *What) {
- assert(What != getLiveOnEntryDef() &&
- Where != getLiveOnEntryDef() && "Can't splice (above) LOE.");
- assert(dominates(Where, What) && "Only upwards splices are permitted.");
-
- if (Where == What)
- return;
- if (isa<MemoryDef>(What)) {
- // TODO: possibly use removeMemoryAccess' more efficient RAUW
- What->replaceAllUsesWith(What->getDefiningAccess());
- What->setDefiningAccess(Where->getDefiningAccess());
- Where->setDefiningAccess(What);
- }
- AccessList *Src = getWritableBlockAccesses(What->getBlock());
- AccessList *Dest = getWritableBlockAccesses(Where->getBlock());
- Dest->splice(AccessList::iterator(Where), *Src, What);
-
- BlockNumberingValid.erase(What->getBlock());
- if (What->getBlock() != Where->getBlock())
- BlockNumberingValid.erase(Where->getBlock());
+ return false;
}
-
/// \brief Helper function to create new memory accesses
MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
// The assume intrinsic has a control dependency which we model by claiming
@@ -1658,7 +1474,15 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
// Find out what affect this instruction has on memory.
ModRefInfo ModRef = AA->getModRefInfo(I);
- bool Def = bool(ModRef & MRI_Mod);
+ // The isOrdered check is used to ensure that volatiles end up as defs
+ // (atomics end up as ModRef right now anyway). Until we separate the
+ // ordering chain from the memory chain, this enables people to see at least
+ // some relative ordering to volatiles. Note that getClobberingMemoryAccess
+ // will still give an answer that bypasses other volatile loads. TODO:
+ // Separate memory aliasing and ordering into two different chains so that we
+ // can precisely represent both "what memory will this read/write/is clobbered
+ // by" and "what instructions can I move this past".
+ bool Def = bool(ModRef & MRI_Mod) || isOrdered(I);
bool Use = bool(ModRef & MRI_Ref);
// It's possible for an instruction to not modify memory at all. During
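
In effect the def test becomes "modifies memory, or carries ordering"; a
compact restatement under the same assumptions (the helper name is
hypothetical):

    // Volatile/ordered loads become defs so queries preserve their
    // relative order, even though they do not modify memory.
    static bool actsAsMemorySSADef(AliasAnalysis &AA, const Instruction *I) {
      return bool(AA.getModRefInfo(I) & MRI_Mod) || isOrdered(I);
    }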
@@ -1678,33 +1502,6 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
return MUD;
}
-MemoryAccess *MemorySSA::findDominatingDef(BasicBlock *UseBlock,
- enum InsertionPlace Where) {
- // Handle the initial case
- if (Where == Beginning)
- // The only thing that could define us at the beginning is a phi node
- if (MemoryPhi *Phi = getMemoryAccess(UseBlock))
- return Phi;
-
- DomTreeNode *CurrNode = DT->getNode(UseBlock);
- // Need to be defined by our dominator
- if (Where == Beginning)
- CurrNode = CurrNode->getIDom();
- Where = End;
- while (CurrNode) {
- auto It = PerBlockAccesses.find(CurrNode->getBlock());
- if (It != PerBlockAccesses.end()) {
- auto &Accesses = It->second;
- for (MemoryAccess &RA : reverse(*Accesses)) {
- if (isa<MemoryDef>(RA) || isa<MemoryPhi>(RA))
- return &RA;
- }
- }
- CurrNode = CurrNode->getIDom();
- }
- return LiveOnEntryDef.get();
-}
-
/// \brief Returns true if \p Replacer dominates \p Replacee .
bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
const MemoryAccess *Replacee) const {
@@ -1722,24 +1519,7 @@ bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
return true;
}
-/// \brief If all arguments of a MemoryPHI are defined by the same incoming
-/// argument, return that argument.
-static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
- MemoryAccess *MA = nullptr;
-
- for (auto &Arg : MP->operands()) {
- if (!MA)
- MA = cast<MemoryAccess>(Arg);
- else if (MA != Arg)
- return nullptr;
- }
- return MA;
-}
-
/// \brief Properly remove \p MA from all of MemorySSA's lookup tables.
-///
-/// Because of the way the intrusive list and use lists work, it is important to
-/// do removal in the right order.
void MemorySSA::removeFromLookups(MemoryAccess *MA) {
assert(MA->use_empty() &&
"Trying to remove memory access that still has uses");
@@ -1760,69 +1540,46 @@ void MemorySSA::removeFromLookups(MemoryAccess *MA) {
auto VMA = ValueToMemoryAccess.find(MemoryInst);
if (VMA->second == MA)
ValueToMemoryAccess.erase(VMA);
+}
+/// \brief Properly remove \p MA from all of MemorySSA's lists.
+///
+/// Because of the way the intrusive list and use lists work, it is important to
+/// do removal in the right order.
+/// ShouldDelete defaults to true, and will cause the memory access to also be
+/// deleted, not just removed.
+void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
+ // The access list owns the reference, so we erase it from the non-owning list
+ // first.
+ if (!isa<MemoryUse>(MA)) {
+ auto DefsIt = PerBlockDefs.find(MA->getBlock());
+ std::unique_ptr<DefsList> &Defs = DefsIt->second;
+ Defs->remove(*MA);
+ if (Defs->empty())
+ PerBlockDefs.erase(DefsIt);
+ }
+
+ // The erase call here will delete it. If we don't want it deleted, we call
+ // remove instead.
auto AccessIt = PerBlockAccesses.find(MA->getBlock());
std::unique_ptr<AccessList> &Accesses = AccessIt->second;
- Accesses->erase(MA);
+ if (ShouldDelete)
+ Accesses->erase(MA);
+ else
+ Accesses->remove(MA);
+
if (Accesses->empty())
PerBlockAccesses.erase(AccessIt);
}
-void MemorySSA::removeMemoryAccess(MemoryAccess *MA) {
- assert(!isLiveOnEntryDef(MA) && "Trying to remove the live on entry def");
- // We can only delete phi nodes if they have no uses, or we can replace all
- // uses with a single definition.
- MemoryAccess *NewDefTarget = nullptr;
- if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) {
- // Note that it is sufficient to know that all edges of the phi node have
- // the same argument. If they do, by the definition of dominance frontiers
- // (which we used to place this phi), that argument must dominate this phi,
- // and thus, must dominate the phi's uses, and so we will not hit the assert
- // below.
- NewDefTarget = onlySingleValue(MP);
- assert((NewDefTarget || MP->use_empty()) &&
- "We can't delete this memory phi");
- } else {
- NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess();
- }
-
- // Re-point the uses at our defining access
- if (!MA->use_empty()) {
- // Reset optimized on users of this store, and reset the uses.
- // A few notes:
- // 1. This is a slightly modified version of RAUW to avoid walking the
- // uses twice here.
- // 2. If we wanted to be complete, we would have to reset the optimized
- // flags on users of phi nodes if doing the below makes a phi node have all
- // the same arguments. Instead, we prefer users to removeMemoryAccess those
- // phi nodes, because doing it here would be N^3.
- if (MA->hasValueHandle())
- ValueHandleBase::ValueIsRAUWd(MA, NewDefTarget);
- // Note: We assume MemorySSA is not used in metadata since it's not really
- // part of the IR.
-
- while (!MA->use_empty()) {
- Use &U = *MA->use_begin();
- if (MemoryUse *MU = dyn_cast<MemoryUse>(U.getUser()))
- MU->resetOptimized();
- U.set(NewDefTarget);
- }
- }
-
- // The call below to erase will destroy MA, so we can't change the order we
- // are doing things here
- removeFromLookups(MA);
-}
-
void MemorySSA::print(raw_ostream &OS) const {
MemorySSAAnnotatedWriter Writer(this);
F.print(OS, &Writer);
}
-void MemorySSA::dump() const {
- MemorySSAAnnotatedWriter Writer(this);
- F.print(dbgs(), &Writer);
-}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MemorySSA::dump() const { print(dbgs()); }
+#endif
void MemorySSA::verifyMemorySSA() const {
verifyDefUses(F);
@@ -1838,26 +1595,41 @@ void MemorySSA::verifyOrdering(Function &F) const {
// lists think, as well as the order in the blocks vs the order in the access
// lists.
SmallVector<MemoryAccess *, 32> ActualAccesses;
+ SmallVector<MemoryAccess *, 32> ActualDefs;
for (BasicBlock &B : F) {
const AccessList *AL = getBlockAccesses(&B);
+ const auto *DL = getBlockDefs(&B);
MemoryAccess *Phi = getMemoryAccess(&B);
- if (Phi)
+ if (Phi) {
ActualAccesses.push_back(Phi);
+ ActualDefs.push_back(Phi);
+ }
+
for (Instruction &I : B) {
MemoryAccess *MA = getMemoryAccess(&I);
- assert((!MA || AL) && "We have memory affecting instructions "
- "in this block but they are not in the "
- "access list");
- if (MA)
+ assert((!MA || (AL && (isa<MemoryUse>(MA) || DL))) &&
+ "We have memory affecting instructions "
+ "in this block but they are not in the "
+ "access list or defs list");
+ if (MA) {
ActualAccesses.push_back(MA);
+ if (isa<MemoryDef>(MA))
+ ActualDefs.push_back(MA);
+ }
}
// Either we hit the assert, really have no accesses, or we have both
- // accesses and an access list
- if (!AL)
+ // accesses and an access list.
+ // Same with defs.
+ if (!AL && !DL)
continue;
assert(AL->size() == ActualAccesses.size() &&
"We don't have the same number of accesses in the block as on the "
"access list");
+ assert((DL || ActualDefs.size() == 0) &&
+ "Either we should have a defs list, or we should have no defs");
+ assert((!DL || DL->size() == ActualDefs.size()) &&
+ "We don't have the same number of defs in the block as on the "
+ "def list");
auto ALI = AL->begin();
auto AAI = ActualAccesses.begin();
while (ALI != AL->end() && AAI != ActualAccesses.end()) {
@@ -1866,6 +1638,16 @@ void MemorySSA::verifyOrdering(Function &F) const {
++AAI;
}
ActualAccesses.clear();
+ if (DL) {
+ auto DLI = DL->begin();
+ auto ADI = ActualDefs.begin();
+ while (DLI != DL->end() && ADI != ActualDefs.end()) {
+ assert(&*DLI == *ADI && "Not the same defs in the same order");
+ ++DLI;
+ ++ADI;
+ }
+ }
+ ActualDefs.clear();
}
}
@@ -2016,6 +1798,15 @@ bool MemorySSA::dominates(const MemoryAccess *Dominator,
const static char LiveOnEntryStr[] = "liveOnEntry";
+void MemoryAccess::print(raw_ostream &OS) const {
+ switch (getValueID()) {
+ case MemoryPhiVal: return static_cast<const MemoryPhi *>(this)->print(OS);
+ case MemoryDefVal: return static_cast<const MemoryDef *>(this)->print(OS);
+ case MemoryUseVal: return static_cast<const MemoryUse *>(this)->print(OS);
+ }
+ llvm_unreachable("invalid value id");
+}
+
void MemoryDef::print(raw_ostream &OS) const {
MemoryAccess *UO = getDefiningAccess();
@@ -2053,8 +1844,6 @@ void MemoryPhi::print(raw_ostream &OS) const {
OS << ')';
}
-MemoryAccess::~MemoryAccess() {}
-
void MemoryUse::print(raw_ostream &OS) const {
MemoryAccess *UO = getDefiningAccess();
OS << "MemoryUse(";
@@ -2066,8 +1855,11 @@ void MemoryUse::print(raw_ostream &OS) const {
}
void MemoryAccess::dump() const {
+// Cannot completely remove virtual function even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(dbgs());
dbgs() << "\n";
+#endif
}
char MemorySSAPrinterLegacyPass::ID = 0;
@@ -2079,7 +1871,6 @@ MemorySSAPrinterLegacyPass::MemorySSAPrinterLegacyPass() : FunctionPass(ID) {
void MemorySSAPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
}
bool MemorySSAPrinterLegacyPass::runOnFunction(Function &F) {
@@ -2145,35 +1936,13 @@ MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {}
MemorySSA::CachingWalker::CachingWalker(MemorySSA *M, AliasAnalysis *A,
DominatorTree *D)
- : MemorySSAWalker(M), Walker(*M, *A, *D, Cache), AutoResetWalker(true) {}
+ : MemorySSAWalker(M), Walker(*M, *A, *D), AutoResetWalker(true) {}
MemorySSA::CachingWalker::~CachingWalker() {}
void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
- // TODO: We can do much better cache invalidation with differently stored
- // caches. For now, for MemoryUses, we simply remove them
- // from the cache, and kill the entire call/non-call cache for everything
- // else. The problem is for phis or defs, currently we'd need to follow use
- // chains down and invalidate anything below us in the chain that currently
- // terminates at this access.
-
- // See if this is a MemoryUse, if so, just remove the cached info. MemoryUse
- // is by definition never a barrier, so nothing in the cache could point to
- // this use. In that case, we only need invalidate the info for the use
- // itself.
-
- if (MemoryUse *MU = dyn_cast<MemoryUse>(MA)) {
- UpwardsMemoryQuery Q(MU->getMemoryInst(), MU);
- Cache.remove(MU, Q.StartingLoc, Q.IsCall);
- MU->resetOptimized();
- } else {
- // If it is not a use, the best we can do right now is destroy the cache.
- Cache.clear();
- }
-
-#ifdef EXPENSIVE_CHECKS
- verifyRemoved(MA);
-#endif
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
+ MUD->resetOptimized();
}
/// \brief Walk the use-def chains starting at \p MA and find
@@ -2184,9 +1953,9 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) {
MemoryAccess *New = Walker.findClobber(StartingAccess, Q);
#ifdef EXPENSIVE_CHECKS
- MemoryAccess *NewNoCache =
- Walker.findClobber(StartingAccess, Q, /*UseWalkerCache=*/false);
+ MemoryAccess *NewNoCache = Walker.findClobber(StartingAccess, Q);
assert(NewNoCache == New && "Cache made us hand back a different result?");
+ (void)NewNoCache;
#endif
if (AutoResetWalker)
resetClobberWalker();
@@ -2215,9 +1984,6 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
Q.Inst = I;
Q.IsCall = false;
- if (auto *CacheResult = Cache.lookup(StartingUseOrDef, Loc, Q.IsCall))
- return CacheResult;
-
// Unlike the other function, do not walk to the def of a def, because we are
// handed something we already believe is the clobbering access.
MemoryAccess *DefiningAccess = isa<MemoryUse>(StartingUseOrDef)
@@ -2242,9 +2008,9 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
// If this is an already optimized use or def, return the optimized result.
// Note: Currently, we do not store the optimized def result because we'd need
// a separate field, since we can't use it as the defining access.
- if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
- if (MU->isOptimized())
- return MU->getDefiningAccess();
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+ if (MUD->isOptimized())
+ return MUD->getOptimized();
const Instruction *I = StartingAccess->getMemoryInst();
UpwardsMemoryQuery Q(I, StartingAccess);
@@ -2254,14 +2020,10 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
if (!Q.IsCall && I->isFenceLike())
return StartingAccess;
- if (auto *CacheResult = Cache.lookup(StartingAccess, Q.StartingLoc, Q.IsCall))
- return CacheResult;
-
if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) {
MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef();
- Cache.insert(StartingAccess, LiveOnEntry, Q.StartingLoc, Q.IsCall);
- if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
- MU->setDefiningAccess(LiveOnEntry, true);
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+ MUD->setOptimized(LiveOnEntry);
return LiveOnEntry;
}
@@ -2278,17 +2040,12 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
DEBUG(dbgs() << *DefiningAccess << "\n");
DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
DEBUG(dbgs() << *Result << "\n");
- if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
- MU->setDefiningAccess(Result, true);
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+ MUD->setOptimized(Result);
return Result;
}
-// Verify that MA doesn't exist in any of the caches.
-void MemorySSA::CachingWalker::verifyRemoved(MemoryAccess *MA) {
- assert(!Cache.contains(MA) && "Found removed MemoryAccess in cache.");
-}
-
MemoryAccess *
DoNothingMemorySSAWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
if (auto *Use = dyn_cast<MemoryUseOrDef>(MA))
@@ -2303,3 +2060,15 @@ MemoryAccess *DoNothingMemorySSAWalker::getClobberingMemoryAccess(
return StartingAccess;
}
} // namespace llvm
+
+void MemoryPhi::deleteMe(DerivedUser *Self) {
+ delete static_cast<MemoryPhi *>(Self);
+}
+
+void MemoryDef::deleteMe(DerivedUser *Self) {
+ delete static_cast<MemoryDef *>(Self);
+}
+
+void MemoryUse::deleteMe(DerivedUser *Self) {
+ delete static_cast<MemoryUse *>(Self);
+}
diff --git a/contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp b/contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp
new file mode 100644
index 0000000..1ff8447
--- /dev/null
+++ b/contrib/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -0,0 +1,488 @@
+//===-- MemorySSAUpdater.cpp - Memory SSA Updater--------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the MemorySSAUpdater class.
+//
+//===----------------------------------------------------------------===//
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include <algorithm>
+
+#define DEBUG_TYPE "memoryssa"
+using namespace llvm;
+
+// This is the marker algorithm from "Simple and Efficient Construction of
+// Static Single Assignment Form"
+// The simple, non-marker algorithm places phi nodes at any join
+// Here, we place markers, and only place phi nodes if they end up necessary.
+// They are only necessary if they break a cycle (IE we recursively visit
+// ourselves again), or we discover, while getting the value of the operands,
+// that there are two or more definitions needing to be merged.
+// This still will leave non-minimal form in the case of irreducible control
+// flow, where phi nodes may be in cycles with themselves, but unnecessary.
+MemoryAccess *MemorySSAUpdater::getPreviousDefRecursive(BasicBlock *BB) {
+ // Single predecessor case, just recurse, we can only have one definition.
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ return getPreviousDefFromEnd(Pred);
+ } else if (VisitedBlocks.count(BB)) {
+ // We hit our node again, meaning we had a cycle, we must insert a phi
+ // node to break it so we have an operand. The only case this will
+ // insert useless phis is if we have irreducible control flow.
+ return MSSA->createMemoryPhi(BB);
+ } else if (VisitedBlocks.insert(BB).second) {
+ // Mark us visited so we can detect a cycle
+ SmallVector<MemoryAccess *, 8> PhiOps;
+
+ // Recurse to get the values in our predecessors for placement of a
+ // potential phi node. This will insert phi nodes if we cycle in order to
+ // break the cycle and have an operand.
+ for (auto *Pred : predecessors(BB))
+ PhiOps.push_back(getPreviousDefFromEnd(Pred));
+
+ // Now try to simplify the ops to avoid placing a phi.
+ // This may return null if we never created a phi yet; that's okay.
+ MemoryPhi *Phi = dyn_cast_or_null<MemoryPhi>(MSSA->getMemoryAccess(BB));
+ bool PHIExistsButNeedsUpdate = false;
+ // See if the existing phi operands match what we need.
+ // Unlike normal SSA, we only allow one phi node per block, so we can't just
+ // create a new one.
+ if (Phi && Phi->getNumOperands() != 0)
+ if (!std::equal(Phi->op_begin(), Phi->op_end(), PhiOps.begin())) {
+ PHIExistsButNeedsUpdate = true;
+ }
+
+ // See if we can avoid the phi by simplifying it.
+ auto *Result = tryRemoveTrivialPhi(Phi, PhiOps);
+ // If we couldn't simplify, we may have to create a phi
+ if (Result == Phi) {
+ if (!Phi)
+ Phi = MSSA->createMemoryPhi(BB);
+
+ // These will have been filled in by the recursive read we did above.
+ if (PHIExistsButNeedsUpdate) {
+ std::copy(PhiOps.begin(), PhiOps.end(), Phi->op_begin());
+ std::copy(pred_begin(BB), pred_end(BB), Phi->block_begin());
+ } else {
+ unsigned i = 0;
+ for (auto *Pred : predecessors(BB))
+ Phi->addIncoming(PhiOps[i++], Pred);
+ }
+
+ Result = Phi;
+ }
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(Result))
+ InsertedPHIs.push_back(MP);
+ // Set ourselves up for the next variable by resetting visited state.
+ VisitedBlocks.erase(BB);
+ return Result;
+ }
+ llvm_unreachable("Should have hit one of the three cases above");
+}
+
+// This starts at the memory access, and goes backwards in the block to find the
+// previous definition. If a definition is not found in the block of the access,
+// it continues globally, creating phi nodes to ensure we have a single
+// definition.
+MemoryAccess *MemorySSAUpdater::getPreviousDef(MemoryAccess *MA) {
+ auto *LocalResult = getPreviousDefInBlock(MA);
+
+ return LocalResult ? LocalResult : getPreviousDefRecursive(MA->getBlock());
+}
+
+// This starts at the memory access, and goes backwards in the block to find
+// the previous definition. If the definition is not found in the block of the
+// access, it returns nullptr.
+MemoryAccess *MemorySSAUpdater::getPreviousDefInBlock(MemoryAccess *MA) {
+ auto *Defs = MSSA->getWritableBlockDefs(MA->getBlock());
+
+ // It's possible there are no defs, or we got handed the first def to start.
+ if (Defs) {
+ // If this is a def, we can just use the def iterators.
+ if (!isa<MemoryUse>(MA)) {
+ auto Iter = MA->getReverseDefsIterator();
+ ++Iter;
+ if (Iter != Defs->rend())
+ return &*Iter;
+ } else {
+ // Otherwise, we have to walk the all-access iterator.
+ auto End = MSSA->getWritableBlockAccesses(MA->getBlock())->rend();
+ for (auto &U : make_range(++MA->getReverseIterator(), End))
+ if (!isa<MemoryUse>(U))
+ return cast<MemoryAccess>(&U);
+ // Note that if MA comes before Defs->begin(), we won't hit a def.
+ return nullptr;
+ }
+ }
+ return nullptr;
+}
+
+// This starts at the end of the block.
+MemoryAccess *MemorySSAUpdater::getPreviousDefFromEnd(BasicBlock *BB) {
+ auto *Defs = MSSA->getWritableBlockDefs(BB);
+
+ if (Defs)
+ return &*Defs->rbegin();
+
+ return getPreviousDefRecursive(BB);
+}
+// Recurse over a set of phi uses to eliminate the trivial ones
+MemoryAccess *MemorySSAUpdater::recursePhi(MemoryAccess *Phi) {
+ if (!Phi)
+ return nullptr;
+ TrackingVH<MemoryAccess> Res(Phi);
+ SmallVector<TrackingVH<Value>, 8> Uses;
+ std::copy(Phi->user_begin(), Phi->user_end(), std::back_inserter(Uses));
+ for (auto &U : Uses) {
+ if (MemoryPhi *UsePhi = dyn_cast<MemoryPhi>(&*U)) {
+ auto OperRange = UsePhi->operands();
+ tryRemoveTrivialPhi(UsePhi, OperRange);
+ }
+ }
+ return Res;
+}
+
+// Eliminate trivial phis
+// Phis are trivial if they are defined either by themselves, or all the same
+// argument.
+// IE phi(a, a) or b = phi(a, b) or c = phi(a, a, c)
+// We recursively try to remove them.
+template <class RangeType>
+MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi,
+ RangeType &Operands) {
+ // Detect equal or self arguments
+ MemoryAccess *Same = nullptr;
+ for (auto &Op : Operands) {
+ // If the same or self, good so far
+ if (Op == Phi || Op == Same)
+ continue;
+ // not the same, return the phi since it's not eliminatable by us
+ if (Same)
+ return Phi;
+ Same = cast<MemoryAccess>(Op);
+ }
+ // Never found a non-self reference, the phi is undef
+ if (Same == nullptr)
+ return MSSA->getLiveOnEntryDef();
+ if (Phi) {
+ Phi->replaceAllUsesWith(Same);
+ removeMemoryAccess(Phi);
+ }
+
+ // We should only end up recursing in case we replaced something, in which
+ // case, we may have made other Phis trivial.
+ return recursePhi(Same);
+}
+
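
The trivial-phi rule above can be stated independently of MemorySSA. A
self-contained sketch (hypothetical, with plain integers standing in for
memory accesses):

    #include <optional>
    #include <vector>

    // A phi is trivial if every operand is either the phi itself or one
    // common value; it can then be replaced by that value.
    static std::optional<int>
    trivialPhiValue(int PhiId, const std::vector<int> &Ops) {
      std::optional<int> Same;
      for (int Op : Ops) {
        if (Op == PhiId || (Same && Op == *Same))
          continue;            // self-reference or the common value so far
        if (Same)
          return std::nullopt; // two distinct non-self values: not trivial
        Same = Op;
      }
      return Same; // empty here would mean an undef (dead) phi
    }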
+void MemorySSAUpdater::insertUse(MemoryUse *MU) {
+ InsertedPHIs.clear();
+ MU->setDefiningAccess(getPreviousDef(MU));
+ // Unlike for defs, there is no extra work to do. Because uses do not create
+ // new may-defs, there are only two cases:
+ //
+ // 1. There was a def already below us, and therefore, we should not have
+ // created a phi node because it was already needed for the def.
+ //
+ // 2. There is no def below us, and therefore, there is no extra renaming work
+ // to do.
+}
+
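
A hedged usage sketch of the updater API for the use case (insertUse is from
this file; the setup around it is hypothetical):

    // After cloning a load NewLI and creating its MemoryUse:
    MemorySSAUpdater Updater(&MSSA);
    auto *MU = cast<MemoryUse>(MSSA.getMemoryAccess(NewLI));
    Updater.insertUse(MU); // wires the use to its nearest reaching def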
+// Set every incoming edge {BB, MP->getBlock()} of MemoryPhi MP to NewDef.
+static void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB,
+ MemoryAccess *NewDef) {
+ // Replace any operand whose incoming block is BB with the new defining
+ // access.
+ int i = MP->getBasicBlockIndex(BB);
+ assert(i != -1 && "Should have found the basic block in the phi");
+ // We can't just compare i against getNumOperands since one is signed and the
+ // other not. So use it to index into the block iterator.
+ for (auto BBIter = MP->block_begin() + i; BBIter != MP->block_end();
+ ++BBIter) {
+ if (*BBIter != BB)
+ break;
+ MP->setIncomingValue(i, NewDef);
+ ++i;
+ }
+}
+
+// A brief description of the algorithm:
+// First, we compute what should define the new def, using the SSA
+// construction algorithm.
+// Then, we update the defs below us (and any new phi nodes) in the graph to
+// point to the correct new defs, to ensure we only have one variable, and no
+// disconnected stores.
+void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
+ InsertedPHIs.clear();
+
+ // See if we had a local def, and if not, go hunting.
+ MemoryAccess *DefBefore = getPreviousDefInBlock(MD);
+ bool DefBeforeSameBlock = DefBefore != nullptr;
+ if (!DefBefore)
+ DefBefore = getPreviousDefRecursive(MD->getBlock());
+
+ // There is a def before us, which means we can replace any store/phi uses
+ // of that def with us, since we are in the way of whatever was there
+ // before. Below, we redirect that def's MemoryDef and MemoryPhi users to
+ // point at us.
+ if (DefBeforeSameBlock) {
+ for (auto UI = DefBefore->use_begin(), UE = DefBefore->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ // Leave the uses alone
+ if (isa<MemoryUse>(U.getUser()))
+ continue;
+ U.set(MD);
+ }
+ }
+
+ // That def is now our defining access.
+ // We make the changes in this order; otherwise we would appear in the use
+ // list above and reset our own defining access.
+ MD->setDefiningAccess(DefBefore);
+
+ SmallVector<MemoryAccess *, 8> FixupList(InsertedPHIs.begin(),
+ InsertedPHIs.end());
+ if (!DefBeforeSameBlock) {
+ // If there was a local def before us, we must have the same effect it
+ // did: because every may-def clobbers the same memory, any phis we would
+ // create, it would already have created. If there was no local def before
+ // us, we performed a global update, and have to search all successors and
+ // make sure we update the first def in each of them (following all paths
+ // until we hit the first def along each path). This may also insert phi
+ // nodes.
+ // TODO: There are other cases in which we can skip this work, such as when
+ // we have a single successor and only walked a straight line of single-pred
+ // blocks backwards to find the def. To make that work, we'd have to track
+ // whether getPreviousDefRecursive only ever used the single-predecessor
+ // case. These paths also only exist in between CFG simplifications.
+ FixupList.push_back(MD);
+ }
+
+ while (!FixupList.empty()) {
+ unsigned StartingPHISize = InsertedPHIs.size();
+ fixupDefs(FixupList);
+ FixupList.clear();
+ // Put any new phis on the fixup list, and process them
+ FixupList.append(InsertedPHIs.end() - StartingPHISize, InsertedPHIs.end());
+ }
+ // Now that all fixups are done, rename all uses if we are asked.
+ if (RenameUses) {
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ BasicBlock *StartBlock = MD->getBlock();
+ // We are guaranteed there is a def in the block, because we just got it
+ // handed to us in this function.
+ MemoryAccess *FirstDef = &*MSSA->getWritableBlockDefs(StartBlock)->begin();
+ // Convert to incoming value if it's a memorydef. A phi *is* already an
+ // incoming value.
+ if (auto *MD = dyn_cast<MemoryDef>(FirstDef))
+ FirstDef = MD->getDefiningAccess();
+
+ MSSA->renamePass(MD->getBlock(), FirstDef, Visited);
+ // We just inserted a phi into each of these blocks, so the incoming value
+ // will become the phi anyway; it does not matter what we pass.
+ for (auto *MP : InsertedPHIs)
+ MSSA->renamePass(MP->getBlock(), nullptr, Visited);
+ }
+}
+
+void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<MemoryAccess *> &Vars) {
+ SmallPtrSet<const BasicBlock *, 8> Seen;
+ SmallVector<const BasicBlock *, 16> Worklist;
+ for (auto *NewDef : Vars) {
+ // First, see if there is a local def after the operand.
+ auto *Defs = MSSA->getWritableBlockDefs(NewDef->getBlock());
+ auto DefIter = NewDef->getDefsIterator();
+
+ // If there is a local def after us, we only have to rename that.
+ if (++DefIter != Defs->end()) {
+ cast<MemoryDef>(&*DefIter)->setDefiningAccess(NewDef);
+ continue;
+ }
+
+ // Otherwise, we need to search down through the CFG.
+ // For each of our successors, handle it directly if there is a phi, or
+ // place it on the fixup worklist.
+ for (const auto *S : successors(NewDef->getBlock())) {
+ if (auto *MP = MSSA->getMemoryAccess(S))
+ setMemoryPhiValueForBlock(MP, NewDef->getBlock(), NewDef);
+ else
+ Worklist.push_back(S);
+ }
+
+ while (!Worklist.empty()) {
+ const BasicBlock *FixupBlock = Worklist.back();
+ Worklist.pop_back();
+
+ // Get the first def in the block that isn't a phi node.
+ if (auto *Defs = MSSA->getWritableBlockDefs(FixupBlock)) {
+ auto *FirstDef = &*Defs->begin();
+ // The loops above and below should have taken care of phi nodes.
+ assert(!isa<MemoryPhi>(FirstDef) &&
+ "Should have already handled phi nodes!");
+ // We are now this def's defining access; make sure we actually dominate
+ // it.
+ assert(MSSA->dominates(NewDef, FirstDef) &&
+ "Should have dominated the new access");
+
+ // This may insert new phi nodes, because we are not guaranteed the
+ // block we are processing has a single pred, and depending on where the
+ // store was inserted, it may require phi nodes below it.
+ cast<MemoryDef>(FirstDef)->setDefiningAccess(getPreviousDef(FirstDef));
+ return;
+ }
+ // We didn't find a def, so we must continue.
+ for (const auto *S : successors(FixupBlock)) {
+ // If there is a phi node, handle it.
+ // Otherwise, put the block on the worklist
+ if (auto *MP = MSSA->getMemoryAccess(S))
+ setMemoryPhiValueForBlock(MP, FixupBlock, NewDef);
+ else {
+ // If we cycle, we should have ended up at a phi node that we already
+ // processed. FIXME: Double check this
+ if (!Seen.insert(S).second)
+ continue;
+ Worklist.push_back(S);
+ }
+ }
+ }
+ }
+}
+
+// Move What to the position Where in block BB of the MemorySSA IR, then
+// recompute its defining access and perform any needed fixups.
+template <class WhereType>
+void MemorySSAUpdater::moveTo(MemoryUseOrDef *What, BasicBlock *BB,
+ WhereType Where) {
+ // Replace all our users with our defining access.
+ What->replaceAllUsesWith(What->getDefiningAccess());
+
+ // Let MemorySSA take care of moving it around in the lists.
+ MSSA->moveTo(What, BB, Where);
+
+ // Now reinsert it into the IR and do whatever fixups are needed.
+ if (auto *MD = dyn_cast<MemoryDef>(What))
+ insertDef(MD);
+ else
+ insertUse(cast<MemoryUse>(What));
+}
+
+// Move What before Where in the MemorySSA IR.
+void MemorySSAUpdater::moveBefore(MemoryUseOrDef *What, MemoryUseOrDef *Where) {
+ moveTo(What, Where->getBlock(), Where->getIterator());
+}
+
+// Move What after Where in the MemorySSA IR.
+void MemorySSAUpdater::moveAfter(MemoryUseOrDef *What, MemoryUseOrDef *Where) {
+ moveTo(What, Where->getBlock(), ++Where->getIterator());
+}
+
+void MemorySSAUpdater::moveToPlace(MemoryUseOrDef *What, BasicBlock *BB,
+ MemorySSA::InsertionPlace Where) {
+ return moveTo(What, BB, Where);
+}
+
+/// \brief If all arguments of a MemoryPhi are the same incoming argument,
+/// return that argument; otherwise return nullptr.
+static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
+ MemoryAccess *MA = nullptr;
+
+ for (auto &Arg : MP->operands()) {
+ if (!MA)
+ MA = cast<MemoryAccess>(Arg);
+ else if (MA != Arg)
+ return nullptr;
+ }
+ return MA;
+}
+
+void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA) {
+ assert(!MSSA->isLiveOnEntryDef(MA) &&
+ "Trying to remove the live on entry def");
+ // We can only delete phi nodes if they have no uses, or we can replace all
+ // uses with a single definition.
+ MemoryAccess *NewDefTarget = nullptr;
+ if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) {
+ // Note that it is sufficient to know that all edges of the phi node have
+ // the same argument. If they do, by the definition of dominance frontiers
+ // (which we used to place this phi), that argument must dominate this phi,
+ // and thus, must dominate the phi's uses, and so we will not hit the assert
+ // below.
+ NewDefTarget = onlySingleValue(MP);
+ assert((NewDefTarget || MP->use_empty()) &&
+ "We can't delete this memory phi");
+ } else {
+ NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess();
+ }
+
+ // Re-point the uses at our defining access
+ if (!isa<MemoryUse>(MA) && !MA->use_empty()) {
+ // Reset optimized on users of this store, and reset the uses.
+ // A few notes:
+ // 1. This is a slightly modified version of RAUW to avoid walking the
+ // uses twice here.
+ // 2. If we wanted to be complete, we would have to reset the optimized
+ // flags on users of phi nodes if doing the below makes a phi node have all
+ // the same arguments. Instead, we prefer that users call removeMemoryAccess
+ // on those phi nodes themselves, because doing it here would be N^3.
+ if (MA->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(MA, NewDefTarget);
+ // Note: We assume MemorySSA is not used in metadata since it's not really
+ // part of the IR.
+
+ while (!MA->use_empty()) {
+ Use &U = *MA->use_begin();
+ if (auto *MUD = dyn_cast<MemoryUseOrDef>(U.getUser()))
+ MUD->resetOptimized();
+ U.set(NewDefTarget);
+ }
+ }
+
+ // The call to removeFromLists below will destroy MA, so we can't change
+ // the order in which we do things here.
+ MSSA->removeFromLookups(MA);
+ MSSA->removeFromLists(MA);
+}
+
+MemoryAccess *MemorySSAUpdater::createMemoryAccessInBB(
+ Instruction *I, MemoryAccess *Definition, const BasicBlock *BB,
+ MemorySSA::InsertionPlace Point) {
+ MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(I, Definition);
+ MSSA->insertIntoListsForBlock(NewAccess, BB, Point);
+ return NewAccess;
+}
+
+MemoryUseOrDef *MemorySSAUpdater::createMemoryAccessBefore(
+ Instruction *I, MemoryAccess *Definition, MemoryUseOrDef *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(I, Definition);
+ MSSA->insertIntoListsBefore(NewAccess, InsertPt->getBlock(),
+ InsertPt->getIterator());
+ return NewAccess;
+}
+
+MemoryUseOrDef *MemorySSAUpdater::createMemoryAccessAfter(
+ Instruction *I, MemoryAccess *Definition, MemoryAccess *InsertPt) {
+ assert(I->getParent() == InsertPt->getBlock() &&
+ "New and old access must be in the same block");
+ MemoryUseOrDef *NewAccess = MSSA->createDefinedAccess(I, Definition);
+ MSSA->insertIntoListsBefore(NewAccess, InsertPt->getBlock(),
+ ++InsertPt->getIterator());
+ return NewAccess;
+}
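The heart of the new updater is the trivial-phi scan in tryRemoveTrivialPhi above. As a reading aid only, here is a minimal standalone sketch of that scan in plain C++ (Access, findTrivialPhiValue, and the -1 sentinel are illustrative stand-ins, not LLVM types):

#include <cassert>
#include <vector>

// Toy "access" values; Self marks the phi itself.
using Access = int;

// Returns the unique non-self operand if the phi is trivial
// (phi(a, a), b = phi(a, b), ...), or -1 if it is not trivial.
// Mirrors the Same/self scan in tryRemoveTrivialPhi.
static Access findTrivialPhiValue(Access Self, const std::vector<Access> &Ops) {
  Access Same = -1;
  for (Access Op : Ops) {
    if (Op == Self || Op == Same)
      continue; // a self-reference, or a repeat of the candidate
    if (Same != -1)
      return -1; // two distinct non-self values: not trivial
    Same = Op;
  }
  return Same; // -1 here means every operand was the phi itself (undef)
}

int main() {
  assert(findTrivialPhiValue(0, {1, 1, 0}) == 1); // c = phi(a, a, c) -> a
  assert(findTrivialPhiValue(0, {1, 2}) == -1);   // phi(a, b) is kept
  return 0;
}

recursePhi then reruns this scan on the users of whatever replaced the phi, folding away chains of phis that become trivial in turn.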
diff --git a/contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
index f675830..e12cdf9 100644
--- a/contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
+++ b/contrib/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
@@ -15,8 +15,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/Passes.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Passes.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
diff --git a/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index f5ba637..e9e354e 100644
--- a/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -28,7 +28,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Pass.h"
using namespace llvm;
@@ -37,7 +37,8 @@ using namespace llvm;
// Walk through the operands of a given User via worklist iteration and populate
// the set of GlobalValue references encountered. Invoked either on an
// Instruction or a GlobalVariable (which walks its initializer).
-static void findRefEdges(const User *CurUser, SetVector<ValueInfo> &RefEdges,
+static void findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
+ SetVector<ValueInfo> &RefEdges,
SmallPtrSet<const User *, 8> &Visited) {
SmallVector<const User *, 32> Worklist;
Worklist.push_back(CurUser);
@@ -61,7 +62,7 @@ static void findRefEdges(const User *CurUser, SetVector<ValueInfo> &RefEdges,
// the reference set unless it is a callee. Callees are handled
// specially by WriteFunction and are added to a separate list.
if (!(CS && CS.isCallee(&OI)))
- RefEdges.insert(GV);
+ RefEdges.insert(Index.getOrInsertValueInfo(GV));
continue;
}
Worklist.push_back(Operand);
@@ -84,6 +85,92 @@ static bool isNonRenamableLocal(const GlobalValue &GV) {
return GV.hasSection() && GV.hasLocalLinkage();
}
+/// Determine whether this call has all constant integer arguments (excluding
+/// "this") and summarize it to VCalls or ConstVCalls as appropriate.
+static void addVCallToSet(DevirtCallSite Call, GlobalValue::GUID Guid,
+ SetVector<FunctionSummary::VFuncId> &VCalls,
+ SetVector<FunctionSummary::ConstVCall> &ConstVCalls) {
+ std::vector<uint64_t> Args;
+ // Start from the second argument to skip the "this" pointer.
+ for (auto &Arg : make_range(Call.CS.arg_begin() + 1, Call.CS.arg_end())) {
+ auto *CI = dyn_cast<ConstantInt>(Arg);
+ if (!CI || CI->getBitWidth() > 64) {
+ VCalls.insert({Guid, Call.Offset});
+ return;
+ }
+ Args.push_back(CI->getZExtValue());
+ }
+ ConstVCalls.insert({{Guid, Call.Offset}, std::move(Args)});
+}
+
+/// If this intrinsic call requires that we add information to the function
+/// summary, do so via the non-constant reference arguments.
+static void addIntrinsicToSummary(
+ const CallInst *CI, SetVector<GlobalValue::GUID> &TypeTests,
+ SetVector<FunctionSummary::VFuncId> &TypeTestAssumeVCalls,
+ SetVector<FunctionSummary::VFuncId> &TypeCheckedLoadVCalls,
+ SetVector<FunctionSummary::ConstVCall> &TypeTestAssumeConstVCalls,
+ SetVector<FunctionSummary::ConstVCall> &TypeCheckedLoadConstVCalls) {
+ switch (CI->getCalledFunction()->getIntrinsicID()) {
+ case Intrinsic::type_test: {
+ auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
+ auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+ if (!TypeId)
+ break;
+ GlobalValue::GUID Guid = GlobalValue::getGUID(TypeId->getString());
+
+ // Produce a summary from type.test intrinsics. We only summarize type.test
+ // intrinsics that are used other than by an llvm.assume intrinsic.
+ // Intrinsics that are assumed are relevant only to the devirtualization
+ // pass, not the type test lowering pass.
+ bool HasNonAssumeUses = llvm::any_of(CI->uses(), [](const Use &CIU) {
+ auto *AssumeCI = dyn_cast<CallInst>(CIU.getUser());
+ if (!AssumeCI)
+ return true;
+ Function *F = AssumeCI->getCalledFunction();
+ return !F || F->getIntrinsicID() != Intrinsic::assume;
+ });
+ if (HasNonAssumeUses)
+ TypeTests.insert(Guid);
+
+ SmallVector<DevirtCallSite, 4> DevirtCalls;
+ SmallVector<CallInst *, 4> Assumes;
+ findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
+ for (auto &Call : DevirtCalls)
+ addVCallToSet(Call, Guid, TypeTestAssumeVCalls,
+ TypeTestAssumeConstVCalls);
+
+ break;
+ }
+
+ case Intrinsic::type_checked_load: {
+ auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(2));
+ auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+ if (!TypeId)
+ break;
+ GlobalValue::GUID Guid = GlobalValue::getGUID(TypeId->getString());
+
+ SmallVector<DevirtCallSite, 4> DevirtCalls;
+ SmallVector<Instruction *, 4> LoadedPtrs;
+ SmallVector<Instruction *, 4> Preds;
+ bool HasNonCallUses = false;
+ findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
+ HasNonCallUses, CI);
+ // Any non-call uses of the result of llvm.type.checked.load will
+ // prevent us from optimizing away the llvm.type.test.
+ if (HasNonCallUses)
+ TypeTests.insert(Guid);
+ for (auto &Call : DevirtCalls)
+ addVCallToSet(Call, Guid, TypeCheckedLoadVCalls,
+ TypeCheckedLoadConstVCalls);
+
+ break;
+ }
+ default:
+ break;
+ }
+}
+
static void
computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
const Function &F, BlockFrequencyInfo *BFI,
@@ -99,6 +186,10 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
MapVector<ValueInfo, CalleeInfo> CallGraphEdges;
SetVector<ValueInfo> RefEdges;
SetVector<GlobalValue::GUID> TypeTests;
+ SetVector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
+ TypeCheckedLoadVCalls;
+ SetVector<FunctionSummary::ConstVCall> TypeTestAssumeConstVCalls,
+ TypeCheckedLoadConstVCalls;
ICallPromotionAnalysis ICallAnalysis;
bool HasInlineAsmMaybeReferencingInternal = false;
@@ -108,7 +199,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
if (isa<DbgInfoIntrinsic>(I))
continue;
++NumInsts;
- findRefEdges(&I, RefEdges, Visited);
+ findRefEdges(Index, &I, RefEdges, Visited);
auto CS = ImmutableCallSite(&I);
if (!CS)
continue;
@@ -133,29 +224,15 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// Check if this is a direct call to a known function or a known
// intrinsic, or an indirect call with profile data.
if (CalledFunction) {
- if (CalledFunction->isIntrinsic()) {
- if (CalledFunction->getIntrinsicID() != Intrinsic::type_test)
- continue;
- // Produce a summary from type.test intrinsics. We only summarize
- // type.test intrinsics that are used other than by an llvm.assume
- // intrinsic. Intrinsics that are assumed are relevant only to the
- // devirtualization pass, not the type test lowering pass.
- bool HasNonAssumeUses = llvm::any_of(CI->uses(), [](const Use &CIU) {
- auto *AssumeCI = dyn_cast<CallInst>(CIU.getUser());
- if (!AssumeCI)
- return true;
- Function *F = AssumeCI->getCalledFunction();
- return !F || F->getIntrinsicID() != Intrinsic::assume;
- });
- if (HasNonAssumeUses) {
- auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
- if (auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata()))
- TypeTests.insert(GlobalValue::getGUID(TypeId->getString()));
- }
+ if (CI && CalledFunction->isIntrinsic()) {
+ addIntrinsicToSummary(
+ CI, TypeTests, TypeTestAssumeVCalls, TypeCheckedLoadVCalls,
+ TypeTestAssumeConstVCalls, TypeCheckedLoadConstVCalls);
+ continue;
}
// We should have named any anonymous globals
assert(CalledFunction->hasName());
- auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None;
+ auto ScaledCount = PSI->getProfileCount(&I, BFI);
auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
: CalleeInfo::HotnessType::Unknown;
@@ -163,7 +240,9 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// to record the call edge to the alias in that case. Eventually
// an alias summary will be created to associate the alias and
// aliasee.
- CallGraphEdges[cast<GlobalValue>(CalledValue)].updateHotness(Hotness);
+ CallGraphEdges[Index.getOrInsertValueInfo(
+ cast<GlobalValue>(CalledValue))]
+ .updateHotness(Hotness);
} else {
// Skip inline assembly calls.
if (CI && CI->isInlineAsm())
@@ -178,11 +257,17 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
ICallAnalysis.getPromotionCandidatesForInstruction(
&I, NumVals, TotalCount, NumCandidates);
for (auto &Candidate : CandidateProfileData)
- CallGraphEdges[Candidate.Value].updateHotness(
- getHotness(Candidate.Count, PSI));
+ CallGraphEdges[Index.getOrInsertValueInfo(Candidate.Value)]
+ .updateHotness(getHotness(Candidate.Count, PSI));
}
}
+ // Explicitly add hot edges to enforce importing for designated GUIDs for
+ // sample PGO, to enable the same inlines as the profiled optimized binary.
+ for (auto &I : F.getImportGUIDs())
+ CallGraphEdges[Index.getOrInsertValueInfo(I)].updateHotness(
+ CalleeInfo::HotnessType::Critical);
+
bool NonRenamableLocal = isNonRenamableLocal(F);
bool NotEligibleForImport =
NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
@@ -190,10 +275,13 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// FIXME: refactor this to use the same code that inliner is using.
F.isVarArg();
GlobalValueSummary::GVFlags Flags(F.getLinkage(), NotEligibleForImport,
- /* LiveRoot = */ false);
+ /* Live = */ false);
auto FuncSummary = llvm::make_unique<FunctionSummary>(
Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(),
- TypeTests.takeVector());
+ TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(),
+ TypeCheckedLoadVCalls.takeVector(),
+ TypeTestAssumeConstVCalls.takeVector(),
+ TypeCheckedLoadConstVCalls.takeVector());
if (NonRenamableLocal)
CantBePromoted.insert(F.getGUID());
Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary));
@@ -204,10 +292,10 @@ computeVariableSummary(ModuleSummaryIndex &Index, const GlobalVariable &V,
DenseSet<GlobalValue::GUID> &CantBePromoted) {
SetVector<ValueInfo> RefEdges;
SmallPtrSet<const User *, 8> Visited;
- findRefEdges(&V, RefEdges, Visited);
+ findRefEdges(Index, &V, RefEdges, Visited);
bool NonRenamableLocal = isNonRenamableLocal(V);
GlobalValueSummary::GVFlags Flags(V.getLinkage(), NonRenamableLocal,
- /* LiveRoot = */ false);
+ /* Live = */ false);
auto GVarSummary =
llvm::make_unique<GlobalVarSummary>(Flags, RefEdges.takeVector());
if (NonRenamableLocal)
@@ -220,7 +308,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
DenseSet<GlobalValue::GUID> &CantBePromoted) {
bool NonRenamableLocal = isNonRenamableLocal(A);
GlobalValueSummary::GVFlags Flags(A.getLinkage(), NonRenamableLocal,
- /* LiveRoot = */ false);
+ /* Live = */ false);
auto AS = llvm::make_unique<AliasSummary>(Flags, ArrayRef<ValueInfo>{});
auto *Aliasee = A.getBaseObject();
auto *AliaseeSummary = Index.getGlobalValueSummary(*Aliasee);
@@ -233,18 +321,16 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
// Set LiveRoot flag on entries matching the given value name.
static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) {
- auto SummaryList =
- Index.findGlobalValueSummaryList(GlobalValue::getGUID(Name));
- if (SummaryList == Index.end())
- return;
- for (auto &Summary : SummaryList->second)
- Summary->setLiveRoot();
+ if (ValueInfo VI = Index.getValueInfo(GlobalValue::getGUID(Name)))
+ for (auto &Summary : VI.getSummaryList())
+ Summary->setLive(true);
}
ModuleSummaryIndex llvm::buildModuleSummaryIndex(
const Module &M,
std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback,
ProfileSummaryInfo *PSI) {
+ assert(PSI);
ModuleSummaryIndex Index;
// Identify the local values in the llvm.used and llvm.compiler.used sets,
@@ -326,9 +412,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
// be listed on the llvm.used or llvm.compiler.used global and marked as
// referenced from there.
ModuleSymbolTable::CollectAsmSymbols(
- Triple(M.getTargetTriple()), M.getModuleInlineAsm(),
- [&M, &Index, &CantBePromoted](StringRef Name,
- object::BasicSymbolRef::Flags Flags) {
+ M, [&M, &Index, &CantBePromoted](StringRef Name,
+ object::BasicSymbolRef::Flags Flags) {
// Symbols not marked as Weak or Global are local definitions.
if (Flags & (object::BasicSymbolRef::SF_Weak |
object::BasicSymbolRef::SF_Global))
@@ -338,8 +423,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
return;
assert(GV->isDeclaration() && "Def in module asm already has definition");
GlobalValueSummary::GVFlags GVFlags(GlobalValue::InternalLinkage,
- /* NotEligibleToImport */ true,
- /* LiveRoot */ true);
+ /* NotEligibleToImport = */ true,
+ /* Live = */ true);
CantBePromoted.insert(GlobalValue::getGUID(Name));
// Create the appropriate summary type.
if (isa<Function>(GV)) {
@@ -347,7 +432,11 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
llvm::make_unique<FunctionSummary>(
GVFlags, 0, ArrayRef<ValueInfo>{},
ArrayRef<FunctionSummary::EdgeTy>{},
- ArrayRef<GlobalValue::GUID>{});
+ ArrayRef<GlobalValue::GUID>{},
+ ArrayRef<FunctionSummary::VFuncId>{},
+ ArrayRef<FunctionSummary::VFuncId>{},
+ ArrayRef<FunctionSummary::ConstVCall>{},
+ ArrayRef<FunctionSummary::ConstVCall>{});
Index.addGlobalValueSummary(Name, std::move(Summary));
} else {
std::unique_ptr<GlobalVarSummary> Summary =
@@ -358,13 +447,27 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
});
}
+ bool IsThinLTO = true;
+ if (auto *MD =
+ mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("ThinLTO")))
+ IsThinLTO = MD->getZExtValue();
+
for (auto &GlobalList : Index) {
- assert(GlobalList.second.size() == 1 &&
+ // Ignore entries for references that are undefined in the current module.
+ if (GlobalList.second.SummaryList.empty())
+ continue;
+
+ assert(GlobalList.second.SummaryList.size() == 1 &&
"Expected module's index to have one summary per GUID");
- auto &Summary = GlobalList.second[0];
+ auto &Summary = GlobalList.second.SummaryList[0];
+ if (!IsThinLTO) {
+ Summary->setNotEligibleToImport();
+ continue;
+ }
+
bool AllRefsCanBeExternallyReferenced =
llvm::all_of(Summary->refs(), [&](const ValueInfo &VI) {
- return !CantBePromoted.count(VI.getValue()->getGUID());
+ return !CantBePromoted.count(VI.getGUID());
});
if (!AllRefsCanBeExternallyReferenced) {
Summary->setNotEligibleToImport();
@@ -374,9 +477,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
if (auto *FuncSummary = dyn_cast<FunctionSummary>(Summary.get())) {
bool AllCallsCanBeExternallyReferenced = llvm::all_of(
FuncSummary->calls(), [&](const FunctionSummary::EdgeTy &Edge) {
- auto GUID = Edge.first.isGUID() ? Edge.first.getGUID()
- : Edge.first.getValue()->getGUID();
- return !CantBePromoted.count(GUID);
+ return !CantBePromoted.count(Edge.first.getGUID());
});
if (!AllCallsCanBeExternallyReferenced)
Summary->setNotEligibleToImport();
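addVCallToSet above splits each devirtualizable call on one rule: every argument after "this" must be a constant integer of at most 64 bits. A standalone sketch of that rule (collectConstArgs and Arg are illustrative stand-ins, not the LLVM types):

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Each post-"this" argument is a known 64-bit constant or unknown.
using Arg = std::optional<uint64_t>;

// Mirrors the VCalls vs. ConstVCalls split in addVCallToSet: return the
// collected constants only if every argument after "this" is constant.
static std::optional<std::vector<uint64_t>>
collectConstArgs(const std::vector<Arg> &ArgsAfterThis) {
  std::vector<uint64_t> Out;
  for (const Arg &A : ArgsAfterThis) {
    if (!A)
      return std::nullopt; // any unknown arg -> plain VCall summary
    Out.push_back(*A);
  }
  return Out; // all constant -> ConstVCall summary carrying the values
}

int main() {
  std::cout << (collectConstArgs({4, 2}) ? "ConstVCall" : "VCall") << "\n";
  std::cout << (collectConstArgs({4, std::nullopt}) ? "ConstVCall" : "VCall")
            << "\n";
  return 0;
}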
diff --git a/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp
index 1e75c08..f374dd3 100644
--- a/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp
+++ b/contrib/llvm/lib/Analysis/ObjCARCInstKind.cpp
@@ -20,8 +20,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Analysis/OptimizationDiagnosticInfo.cpp b/contrib/llvm/lib/Analysis/OptimizationDiagnosticInfo.cpp
index fa8b07d..eb259fd 100644
--- a/contrib/llvm/lib/Analysis/OptimizationDiagnosticInfo.cpp
+++ b/contrib/llvm/lib/Analysis/OptimizationDiagnosticInfo.cpp
@@ -23,14 +23,14 @@
using namespace llvm;
-OptimizationRemarkEmitter::OptimizationRemarkEmitter(Function *F)
+OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F)
: F(F), BFI(nullptr) {
- if (!F->getContext().getDiagnosticHotnessRequested())
+ if (!F->getContext().getDiagnosticsHotnessRequested())
return;
// First create a dominator tree.
DominatorTree DT;
- DT.recalculate(*F);
+ DT.recalculate(*const_cast<Function *>(F));
// Generate LoopInfo from it.
LoopInfo LI;
@@ -45,6 +45,18 @@ OptimizationRemarkEmitter::OptimizationRemarkEmitter(Function *F)
BFI = OwnedBFI.get();
}
+bool OptimizationRemarkEmitter::invalidate(
+ Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &Inv) {
+ // This analysis has no state and so can be trivially preserved but it needs
+ // a fresh view of BFI if it was constructed with one.
+ if (BFI && Inv.invalidate<BlockFrequencyAnalysis>(F, PA))
+ return true;
+
+ // Otherwise this analysis result remains valid.
+ return false;
+}
+
Optional<uint64_t> OptimizationRemarkEmitter::computeHotness(const Value *V) {
if (!BFI)
return None;
@@ -55,53 +67,59 @@ Optional<uint64_t> OptimizationRemarkEmitter::computeHotness(const Value *V) {
namespace llvm {
namespace yaml {
-template <> struct MappingTraits<DiagnosticInfoOptimizationBase *> {
- static void mapping(IO &io, DiagnosticInfoOptimizationBase *&OptDiag) {
- assert(io.outputting() && "input not yet implemented");
+void MappingTraits<DiagnosticInfoOptimizationBase *>::mapping(
+ IO &io, DiagnosticInfoOptimizationBase *&OptDiag) {
+ assert(io.outputting() && "input not yet implemented");
+
+ if (io.mapTag("!Passed",
+ (OptDiag->getKind() == DK_OptimizationRemark ||
+ OptDiag->getKind() == DK_MachineOptimizationRemark)))
+ ;
+ else if (io.mapTag(
+ "!Missed",
+ (OptDiag->getKind() == DK_OptimizationRemarkMissed ||
+ OptDiag->getKind() == DK_MachineOptimizationRemarkMissed)))
+ ;
+ else if (io.mapTag(
+ "!Analysis",
+ (OptDiag->getKind() == DK_OptimizationRemarkAnalysis ||
+ OptDiag->getKind() == DK_MachineOptimizationRemarkAnalysis)))
+ ;
+ else if (io.mapTag("!AnalysisFPCommute",
+ OptDiag->getKind() ==
+ DK_OptimizationRemarkAnalysisFPCommute))
+ ;
+ else if (io.mapTag("!AnalysisAliasing",
+ OptDiag->getKind() ==
+ DK_OptimizationRemarkAnalysisAliasing))
+ ;
+ else if (io.mapTag("!Failure", OptDiag->getKind() == DK_OptimizationFailure))
+ ;
+ else
+ llvm_unreachable("Unknown remark type");
- if (io.mapTag("!Passed", OptDiag->getKind() == DK_OptimizationRemark))
- ;
- else if (io.mapTag("!Missed",
- OptDiag->getKind() == DK_OptimizationRemarkMissed))
- ;
- else if (io.mapTag("!Analysis",
- OptDiag->getKind() == DK_OptimizationRemarkAnalysis))
- ;
- else if (io.mapTag("!AnalysisFPCommute",
- OptDiag->getKind() ==
- DK_OptimizationRemarkAnalysisFPCommute))
- ;
- else if (io.mapTag("!AnalysisAliasing",
- OptDiag->getKind() ==
- DK_OptimizationRemarkAnalysisAliasing))
- ;
- else
- llvm_unreachable("todo");
-
- // These are read-only for now.
- DebugLoc DL = OptDiag->getDebugLoc();
- StringRef FN = GlobalValue::getRealLinkageName(
- OptDiag->getFunction().getName());
-
- StringRef PassName(OptDiag->PassName);
- io.mapRequired("Pass", PassName);
- io.mapRequired("Name", OptDiag->RemarkName);
- if (!io.outputting() || DL)
- io.mapOptional("DebugLoc", DL);
- io.mapRequired("Function", FN);
- io.mapOptional("Hotness", OptDiag->Hotness);
- io.mapOptional("Args", OptDiag->Args);
- }
-};
+ // These are read-only for now.
+ DiagnosticLocation DL = OptDiag->getLocation();
+ StringRef FN =
+ GlobalValue::dropLLVMManglingEscape(OptDiag->getFunction().getName());
-template <> struct MappingTraits<DebugLoc> {
- static void mapping(IO &io, DebugLoc &DL) {
+ StringRef PassName(OptDiag->PassName);
+ io.mapRequired("Pass", PassName);
+ io.mapRequired("Name", OptDiag->RemarkName);
+ if (!io.outputting() || DL.isValid())
+ io.mapOptional("DebugLoc", DL);
+ io.mapRequired("Function", FN);
+ io.mapOptional("Hotness", OptDiag->Hotness);
+ io.mapOptional("Args", OptDiag->Args);
+}
+
+template <> struct MappingTraits<DiagnosticLocation> {
+ static void mapping(IO &io, DiagnosticLocation &DL) {
assert(io.outputting() && "input not yet implemented");
- auto *Scope = cast<DIScope>(DL.getScope());
- StringRef File = Scope->getFilename();
+ StringRef File = DL.getFilename();
unsigned Line = DL.getLine();
- unsigned Col = DL.getCol();
+ unsigned Col = DL.getColumn();
io.mapRequired("File", File);
io.mapRequired("Line", Line);
@@ -116,8 +134,8 @@ template <> struct MappingTraits<DiagnosticInfoOptimizationBase::Argument> {
static void mapping(IO &io, DiagnosticInfoOptimizationBase::Argument &A) {
assert(io.outputting() && "input not yet implemented");
io.mapRequired(A.Key.data(), A.Val);
- if (A.DLoc)
- io.mapOptional("DebugLoc", A.DLoc);
+ if (A.Loc.isValid())
+ io.mapOptional("DebugLoc", A.Loc);
}
};
@@ -127,18 +145,27 @@ template <> struct MappingTraits<DiagnosticInfoOptimizationBase::Argument> {
LLVM_YAML_IS_SEQUENCE_VECTOR(DiagnosticInfoOptimizationBase::Argument)
void OptimizationRemarkEmitter::computeHotness(
- DiagnosticInfoOptimizationBase &OptDiag) {
- Value *V = OptDiag.getCodeRegion();
+ DiagnosticInfoIROptimization &OptDiag) {
+ const Value *V = OptDiag.getCodeRegion();
if (V)
OptDiag.setHotness(computeHotness(V));
}
-void OptimizationRemarkEmitter::emit(DiagnosticInfoOptimizationBase &OptDiag) {
+void OptimizationRemarkEmitter::emit(
+ DiagnosticInfoOptimizationBase &OptDiagBase) {
+ auto &OptDiag = cast<DiagnosticInfoIROptimization>(OptDiagBase);
computeHotness(OptDiag);
+ // If a diagnostic has a hotness value, then only emit it if its hotness
+ // meets the threshold.
+ if (OptDiag.getHotness() &&
+ *OptDiag.getHotness() <
+ F->getContext().getDiagnosticsHotnessThreshold()) {
+ return;
+ }
yaml::Output *Out = F->getContext().getDiagnosticsOutputFile();
if (Out) {
- auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiag);
+ auto *P = const_cast<DiagnosticInfoOptimizationBase *>(&OptDiagBase);
*Out << P;
}
// FIXME: now that IsVerbose is part of DI, filtering for this will be moved
@@ -147,72 +174,6 @@ void OptimizationRemarkEmitter::emit(DiagnosticInfoOptimizationBase &OptDiag) {
F->getContext().diagnose(OptDiag);
}
-void OptimizationRemarkEmitter::emitOptimizationRemark(const char *PassName,
- const DebugLoc &DLoc,
- const Value *V,
- const Twine &Msg) {
- LLVMContext &Ctx = F->getContext();
- Ctx.diagnose(OptimizationRemark(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemark(const char *PassName,
- Loop *L,
- const Twine &Msg) {
- emitOptimizationRemark(PassName, L->getStartLoc(), L->getHeader(), Msg);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg, bool IsVerbose) {
- LLVMContext &Ctx = F->getContext();
- if (!IsVerbose || shouldEmitVerbose())
- Ctx.diagnose(
- OptimizationRemarkMissed(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
- const char *PassName, Loop *L, const Twine &Msg, bool IsVerbose) {
- emitOptimizationRemarkMissed(PassName, L->getStartLoc(), L->getHeader(), Msg,
- IsVerbose);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysis(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg, bool IsVerbose) {
- LLVMContext &Ctx = F->getContext();
- if (!IsVerbose || shouldEmitVerbose())
- Ctx.diagnose(
- OptimizationRemarkAnalysis(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysis(
- const char *PassName, Loop *L, const Twine &Msg, bool IsVerbose) {
- emitOptimizationRemarkAnalysis(PassName, L->getStartLoc(), L->getHeader(),
- Msg, IsVerbose);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisFPCommute(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg) {
- LLVMContext &Ctx = F->getContext();
- Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, *F, DLoc, Msg,
- computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisAliasing(
- const char *PassName, const DebugLoc &DLoc, const Value *V,
- const Twine &Msg) {
- LLVMContext &Ctx = F->getContext();
- Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, *F, DLoc, Msg,
- computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisAliasing(
- const char *PassName, Loop *L, const Twine &Msg) {
- emitOptimizationRemarkAnalysisAliasing(PassName, L->getStartLoc(),
- L->getHeader(), Msg);
-}
-
OptimizationRemarkEmitterWrapperPass::OptimizationRemarkEmitterWrapperPass()
: FunctionPass(ID) {
initializeOptimizationRemarkEmitterWrapperPassPass(
@@ -222,7 +183,7 @@ OptimizationRemarkEmitterWrapperPass::OptimizationRemarkEmitterWrapperPass()
bool OptimizationRemarkEmitterWrapperPass::runOnFunction(Function &Fn) {
BlockFrequencyInfo *BFI;
- if (Fn.getContext().getDiagnosticHotnessRequested())
+ if (Fn.getContext().getDiagnosticsHotnessRequested())
BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
else
BFI = nullptr;
@@ -244,7 +205,7 @@ OptimizationRemarkEmitterAnalysis::run(Function &F,
FunctionAnalysisManager &AM) {
BlockFrequencyInfo *BFI;
- if (F.getContext().getDiagnosticHotnessRequested())
+ if (F.getContext().getDiagnosticsHotnessRequested())
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
else
BFI = nullptr;
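The new check in OptimizationRemarkEmitter::emit() drops a remark only when it has a known hotness below the context threshold; remarks with no hotness always go through. A sketch of that predicate (shouldEmitRemark is a hypothetical name):

#include <cstdint>
#include <iostream>
#include <optional>

// Mirrors the new filter in OptimizationRemarkEmitter::emit(): a remark whose
// known hotness falls below the context's threshold is dropped; a remark with
// no hotness information always passes.
static bool shouldEmitRemark(std::optional<uint64_t> Hotness,
                             uint64_t Threshold) {
  return !Hotness || *Hotness >= Threshold;
}

int main() {
  std::cout << shouldEmitRemark(50, 100) << "\n";           // 0: filtered out
  std::cout << shouldEmitRemark(150, 100) << "\n";          // 1: emitted
  std::cout << shouldEmitRemark(std::nullopt, 100) << "\n"; // 1: no hotness
  return 0;
}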
diff --git a/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp
index 0f0016f..a04c0ae 100644
--- a/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp
+++ b/contrib/llvm/lib/Analysis/OrderedBasicBlock.cpp
@@ -55,7 +55,7 @@ bool OrderedBasicBlock::comesBefore(const Instruction *A,
assert(II != IE && "Instruction not found?");
assert((Inst == A || Inst == B) && "Should find A or B");
LastInstFound = II;
- return Inst == A;
+ return Inst != B;
}
/// \brief Find out whether \p A dominates \p B, meaning whether \p A
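The one-line change in comesBefore above is behavioral: the scan stops at the first occurrence of A or B, and returning Inst != B instead of Inst == A makes comesBefore(A, A) false, i.e. a strict ordering. A toy model of the fixed form (plain C++, ints standing in for instructions):

#include <cassert>
#include <vector>

// Ints stand in for instructions of one basic block, in program order.
// Scan for the first occurrence of A or B, as comesBefore does.
// Old result: Inst == A, so comesBefore(A, A) was true.
// New result: Inst != B, so comesBefore(A, A) is false, a strict order.
static bool comesBefore(const std::vector<int> &Block, int A, int B) {
  for (int Inst : Block)
    if (Inst == A || Inst == B)
      return Inst != B;
  assert(false && "Instruction not found?");
  return false;
}

int main() {
  const std::vector<int> Block = {1, 2, 3};
  assert(comesBefore(Block, 1, 3));  // 1 precedes 3
  assert(!comesBefore(Block, 3, 1)); // 3 does not precede 1
  assert(!comesBefore(Block, 2, 2)); // strict: nothing precedes itself
  return 0;
}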
diff --git a/contrib/llvm/lib/Analysis/PHITransAddr.cpp b/contrib/llvm/lib/Analysis/PHITransAddr.cpp
index 84ecd4a..682af4d 100644
--- a/contrib/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/contrib/llvm/lib/Analysis/PHITransAddr.cpp
@@ -227,7 +227,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
// Simplify the GEP to handle 'gep x, 0' -> x etc.
if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(),
- GEPOps, DL, TLI, DT, AC)) {
+ GEPOps, {DL, TLI, DT, AC})) {
for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
RemoveInstInputs(GEPOps[i], InstInputs);
@@ -276,7 +276,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
}
// See if the add simplifies away.
- if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, DL, TLI, DT, AC)) {
+ if (Value *Res = SimplifyAddInst(LHS, RHS, isNSW, isNUW, {DL, TLI, DT, AC})) {
// If we simplified the operands, the LHS is no longer an input, but Res
// is.
RemoveInstInputs(LHS, InstInputs);
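Both call sites above now pass {DL, TLI, DT, AC} as one braced-init query object instead of four loose analysis pointers, matching the InstructionSimplify API change elsewhere in this import. A generic sketch of the pattern (Query and simplifyAdd are hypothetical names, not the LLVM declarations):

#include <iostream>

// Hypothetical stand-ins for the analyses a simplification query threads
// through; the real SimplifyQuery carries DL, TLI, DT, and AC.
struct DataLayoutT {};
struct TargetLibraryInfoT {};

// Bundle the analysis pointers into one struct so call sites can write
// simplifyAdd(L, R, {&DL, &TLI}) instead of passing each pointer separately.
struct Query {
  const DataLayoutT *DL = nullptr;
  const TargetLibraryInfoT *TLI = nullptr;
};

static int simplifyAdd(int L, int R, const Query &Q) {
  (void)Q; // a real simplifier would consult Q.DL, Q.TLI, ...
  return L + R;
}

int main() {
  DataLayoutT DL;
  TargetLibraryInfoT TLI;
  std::cout << simplifyAdd(2, 3, {&DL, &TLI}) << "\n"; // braced-init query
  return 0;
}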
diff --git a/contrib/llvm/lib/Analysis/PostDominators.cpp b/contrib/llvm/lib/Analysis/PostDominators.cpp
index cb9438a..1caf151 100644
--- a/contrib/llvm/lib/Analysis/PostDominators.cpp
+++ b/contrib/llvm/lib/Analysis/PostDominators.cpp
@@ -31,6 +31,15 @@ char PostDominatorTreeWrapperPass::ID = 0;
INITIALIZE_PASS(PostDominatorTreeWrapperPass, "postdomtree",
"Post-Dominator Tree Construction", true, true)
+bool PostDominatorTree::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<PostDominatorTreeAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
bool PostDominatorTreeWrapperPass::runOnFunction(Function &F) {
DT.recalculate(F);
return false;
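The new invalidate() hook keeps the post-dominator tree alive unless none of three preservation facts hold. A boolean sketch of that rule (isInvalidated is a hypothetical name):

#include <iostream>

// Mirrors PostDominatorTree::invalidate: the tree survives if the analysis
// itself, the set of all function analyses, or the CFG-only analysis set was
// marked preserved; otherwise it must be recomputed.
static bool isInvalidated(bool AnalysisPreserved, bool AllFunctionAnalyses,
                          bool CFGAnalysesPreserved) {
  return !(AnalysisPreserved || AllFunctionAnalyses || CFGAnalysesPreserved);
}

int main() {
  std::cout << isInvalidated(false, false, true) << "\n";  // 0: CFG untouched
  std::cout << isInvalidated(false, false, false) << "\n"; // 1: recompute
  return 0;
}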
diff --git a/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index 16d3614..12b86da 100644
--- a/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/contrib/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -12,9 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ProfileSummary.h"
@@ -55,22 +56,43 @@ static uint64_t getMinCountForPercentile(SummaryEntryVector &DS,
// The profile summary metadata may be attached either by the frontend or by
// any backend passes (IR level instrumentation, for example). This method
// checks if the Summary is null and if so checks if the summary metadata is now
-// available in the module and parses it to get the Summary object.
-void ProfileSummaryInfo::computeSummary() {
+// available in the module and parses it to get the Summary object. Returns true
+// if a valid Summary is available.
+bool ProfileSummaryInfo::computeSummary() {
if (Summary)
- return;
+ return true;
auto *SummaryMD = M.getProfileSummary();
if (!SummaryMD)
- return;
+ return false;
Summary.reset(ProfileSummary::getFromMD(SummaryMD));
+ return true;
+}
+
+Optional<uint64_t>
+ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
+ BlockFrequencyInfo *BFI) {
+ if (!Inst)
+ return None;
+ assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+ "We can only get profile count for call/invoke instruction.");
+ if (hasSampleProfile()) {
+ // In sample PGO mode, check if there is profile metadata on the
+ // instruction. If it is present, determine hotness solely based on that,
+ // since the sampled entry count may not be accurate.
+ uint64_t TotalCount;
+ if (Inst->extractProfTotalWeight(TotalCount))
+ return TotalCount;
+ }
+ if (BFI)
+ return BFI->getBlockProfileCount(Inst->getParent());
+ return None;
}
/// Returns true if the function's entry is hot. If it returns false, it
/// either means it is not hot or it is unknown whether it is hot or not (for
/// example, no profile data is available).
bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
- computeSummary();
- if (!F || !Summary)
+ if (!F || !computeSummary())
return false;
auto FunctionCount = F->getEntryCount();
// FIXME: The heuristic used below for determining hotness is based on
@@ -79,17 +101,53 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
return FunctionCount && isHotCount(FunctionCount.getValue());
}
+/// Returns true if the function's entry or total call edge count is hot.
+/// If it returns false, it either means it is not hot or it is unknown
+/// whether it is hot or not (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F) {
+ if (!F || !computeSummary())
+ return false;
+ if (auto FunctionCount = F->getEntryCount())
+ if (isHotCount(FunctionCount.getValue()))
+ return true;
+
+ uint64_t TotalCallCount = 0;
+ for (const auto &BB : *F)
+ for (const auto &I : BB)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (auto CallCount = getProfileCount(&I, nullptr))
+ TotalCallCount += CallCount.getValue();
+ return isHotCount(TotalCallCount);
+}
+
+/// Returns true if the function's entry and total call edge count are cold.
+/// If it returns false, it either means it is not cold or it is unknown
+/// whether it is cold or not (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F) {
+ if (!F || !computeSummary())
+ return false;
+ if (auto FunctionCount = F->getEntryCount())
+ if (!isColdCount(FunctionCount.getValue()))
+ return false;
+
+ uint64_t TotalCallCount = 0;
+ for (const auto &BB : *F)
+ for (const auto &I : BB)
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ if (auto CallCount = getProfileCount(&I, nullptr))
+ TotalCallCount += CallCount.getValue();
+ return isColdCount(TotalCallCount);
+}
+
/// Returns true if the function's entry is cold. If it returns false, it
/// either means it is not cold or it is unknown whether it is cold or not (for
/// example, no profile data is available).
bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
- computeSummary();
if (!F)
return false;
- if (F->hasFnAttribute(Attribute::Cold)) {
+ if (F->hasFnAttribute(Attribute::Cold))
return true;
- }
- if (!Summary)
+ if (!computeSummary())
return false;
auto FunctionCount = F->getEntryCount();
// FIXME: The heuristic used below for determining coldness is based on
@@ -100,9 +158,7 @@ bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
/// Compute the hot and cold thresholds.
void ProfileSummaryInfo::computeThresholds() {
- if (!Summary)
- computeSummary();
- if (!Summary)
+ if (!computeSummary())
return;
auto &DetailedSummary = Summary->getDetailedSummary();
HotCountThreshold =
@@ -125,20 +181,25 @@ bool ProfileSummaryInfo::isColdCount(uint64_t C) {
bool ProfileSummaryInfo::isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI) {
auto Count = BFI->getBlockProfileCount(B);
- if (Count && isHotCount(*Count))
- return true;
- // Use extractProfTotalWeight to get BB count.
- // For Sample PGO, BFI may not provide accurate BB count due to errors
- // magnified during sample count propagation. This serves as a backup plan
- // to ensure all hot BB will not be missed.
- // The query currently has false positives as branch instruction cloning does
- // not update/scale branch weights. Unlike false negatives, this will not cause
- // performance problem.
- uint64_t TotalCount;
- if (B->getTerminator()->extractProfTotalWeight(TotalCount) &&
- isHotCount(TotalCount))
- return true;
- return false;
+ return Count && isHotCount(*Count);
+}
+
+bool ProfileSummaryInfo::isColdBB(const BasicBlock *B,
+ BlockFrequencyInfo *BFI) {
+ auto Count = BFI->getBlockProfileCount(B);
+ return Count && isColdCount(*Count);
+}
+
+bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS,
+ BlockFrequencyInfo *BFI) {
+ auto C = getProfileCount(CS.getInstruction(), BFI);
+ return C && isHotCount(*C);
+}
+
+bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,
+ BlockFrequencyInfo *BFI) {
+ auto C = getProfileCount(CS.getInstruction(), BFI);
+ return C && isColdCount(*C);
}
INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info",
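The new getProfileCount prefers the call's own !prof total weight under sample PGO and only then falls back to BFI's block count, which is also what isHotCallSite/isColdCallSite consume. A standalone sketch of that precedence (CallSiteT and profileCount are illustrative stand-ins):

#include <cstdint>
#include <iostream>
#include <optional>

// Illustrative stand-in for a call site's profile inputs.
struct CallSiteT {
  std::optional<uint64_t> ProfTotalWeight; // the call's own !prof metadata
  std::optional<uint64_t> BlockCount;      // what BFI reports for its block
};

// Mirrors ProfileSummaryInfo::getProfileCount: under sample PGO, trust the
// call's own metadata first (propagated block counts can be inaccurate), and
// only then consult block frequency info, if the caller supplied any.
static std::optional<uint64_t>
profileCount(const CallSiteT &CS, bool HasSampleProfile, bool HaveBFI) {
  if (HasSampleProfile && CS.ProfTotalWeight)
    return CS.ProfTotalWeight;
  if (HaveBFI)
    return CS.BlockCount;
  return std::nullopt;
}

int main() {
  const CallSiteT CS{900, 100};
  std::cout << *profileCount(CS, true, true) << "\n";  // 900: metadata wins
  std::cout << *profileCount(CS, false, true) << "\n"; // 100: BFI fallback
  return 0;
}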
diff --git a/contrib/llvm/lib/Analysis/RegionInfo.cpp b/contrib/llvm/lib/Analysis/RegionInfo.cpp
index 8c084dd..9004873 100644
--- a/contrib/llvm/lib/Analysis/RegionInfo.cpp
+++ b/contrib/llvm/lib/Analysis/RegionInfo.cpp
@@ -10,28 +10,29 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/RegionInfo.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/RegionInfoImpl.h"
-#include "llvm/Analysis/RegionIterator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
#ifndef NDEBUG
#include "llvm/Analysis/RegionPrinter.h"
#endif
+#include "llvm/Analysis/RegionInfoImpl.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "region"
namespace llvm {
+
template class RegionBase<RegionTraits<Function>>;
template class RegionNodeBase<RegionTraits<Function>>;
template class RegionInfoBase<RegionTraits<Function>>;
-}
+
+} // end namespace llvm
STATISTIC(numRegions, "The # of regions");
STATISTIC(numSimpleRegions, "The # of simple regions");
@@ -44,7 +45,6 @@ VerifyRegionInfoX(
cl::location(RegionInfoBase<RegionTraits<Function>>::VerifyRegionInfo),
cl::desc("Verify region info (time consuming)"));
-
static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style",
cl::location(RegionInfo::printStyle),
cl::Hidden,
@@ -56,7 +56,6 @@ static cl::opt<Region::PrintStyle, true> printStyleX("print-region-style",
clEnumValN(Region::PrintRN, "rn",
"print regions in detail with element_iterator")));
-
//===----------------------------------------------------------------------===//
// Region implementation
//
@@ -68,19 +67,23 @@ Region::Region(BasicBlock *Entry, BasicBlock *Exit,
}
-Region::~Region() { }
+Region::~Region() = default;
//===----------------------------------------------------------------------===//
// RegionInfo implementation
//
-RegionInfo::RegionInfo() :
- RegionInfoBase<RegionTraits<Function>>() {
+RegionInfo::RegionInfo() = default;
-}
-
-RegionInfo::~RegionInfo() {
+RegionInfo::~RegionInfo() = default;
+bool RegionInfo::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<RegionInfoAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
}
void RegionInfo::updateStatistics(Region *R) {
@@ -117,9 +120,7 @@ RegionInfoPass::RegionInfoPass() : FunctionPass(ID) {
initializeRegionInfoPassPass(*PassRegistry::getPassRegistry());
}
-RegionInfoPass::~RegionInfoPass() {
-
-}
+RegionInfoPass::~RegionInfoPass() = default;
bool RegionInfoPass::runOnFunction(Function &F) {
releaseMemory();
@@ -172,10 +173,12 @@ INITIALIZE_PASS_END(RegionInfoPass, "regions",
// the link time optimization.
namespace llvm {
+
FunctionPass *createRegionInfoPass() {
return new RegionInfoPass();
}
-}
+
+} // end namespace llvm
//===----------------------------------------------------------------------===//
// RegionInfoAnalysis implementation
diff --git a/contrib/llvm/lib/Analysis/RegionPass.cpp b/contrib/llvm/lib/Analysis/RegionPass.cpp
index 7358aa6..b38e622 100644
--- a/contrib/llvm/lib/Analysis/RegionPass.cpp
+++ b/contrib/llvm/lib/Analysis/RegionPass.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/RegionPass.h"
#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/IR/OptBisect.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
@@ -206,6 +207,8 @@ public:
return false;
}
+
+ StringRef getPassName() const override { return "Print Region IR"; }
};
char PrintRegionPass::ID = 0;
@@ -278,3 +281,18 @@ Pass *RegionPass::createPrinterPass(raw_ostream &O,
const std::string &Banner) const {
return new PrintRegionPass(Banner, O);
}
+
+bool RegionPass::skipRegion(Region &R) const {
+ Function &F = *R.getEntry()->getParent();
+ if (!F.getContext().getOptBisect().shouldRunPass(this, R))
+ return true;
+
+ if (F.hasFnAttribute(Attribute::OptimizeNone)) {
+ // Report this only once per function.
+ if (R.getEntry() == &F.getEntryBlock())
+ DEBUG(dbgs() << "Skipping pass '" << getPassName()
+ << "' on function " << F.getName() << "\n");
+ return true;
+ }
+ return false;
+}
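skipRegion gives region passes the same opt-bisect/optnone opt-out that function and loop passes already have; a pass is expected to call it at the top of runOnRegion. A minimal sketch of the intended call pattern (NoopRegionPass is hypothetical; unlike the other sketches here, it compiles against the LLVM headers rather than standalone):

#include "llvm/Analysis/RegionPass.h"
using namespace llvm;

namespace {
// Hypothetical do-nothing pass; only the skipRegion() call pattern matters.
struct NoopRegionPass : public RegionPass {
  static char ID;
  NoopRegionPass() : RegionPass(ID) {}

  bool runOnRegion(Region *R, RGPassManager &) override {
    // Honor -opt-bisect-limit and optnone functions via the new hook.
    if (skipRegion(*R))
      return false; // made no changes
    // ... a real pass would transform the region here ...
    return false;
  }
};
char NoopRegionPass::ID = 0;
} // end anonymous namespace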
diff --git a/contrib/llvm/lib/Analysis/RegionPrinter.cpp b/contrib/llvm/lib/Analysis/RegionPrinter.cpp
index 30a4e01..5986b8c 100644
--- a/contrib/llvm/lib/Analysis/RegionPrinter.cpp
+++ b/contrib/llvm/lib/Analysis/RegionPrinter.cpp
@@ -9,14 +9,14 @@
// Print out the region tree of a function using dotty/graphviz.
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/RegionPrinter.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DOTGraphTraitsPass.h"
+#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
-#include "llvm/Analysis/RegionPrinter.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
index ed328f1..9539fd7 100644
--- a/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/contrib/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -89,9 +89,10 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
@@ -125,18 +126,47 @@ static cl::opt<bool>
static cl::opt<unsigned> MulOpsInlineThreshold(
"scev-mulops-inline-threshold", cl::Hidden,
cl::desc("Threshold for inlining multiplication operands into a SCEV"),
- cl::init(1000));
+ cl::init(32));
+
+static cl::opt<unsigned> AddOpsInlineThreshold(
+ "scev-addops-inline-threshold", cl::Hidden,
+ cl::desc("Threshold for inlining addition operands into a SCEV"),
+ cl::init(500));
static cl::opt<unsigned> MaxSCEVCompareDepth(
"scalar-evolution-max-scev-compare-depth", cl::Hidden,
cl::desc("Maximum depth of recursive SCEV complexity comparisons"),
cl::init(32));
+static cl::opt<unsigned> MaxSCEVOperationsImplicationDepth(
+ "scalar-evolution-max-scev-operations-implication-depth", cl::Hidden,
+ cl::desc("Maximum depth of recursive SCEV operations implication analysis"),
+ cl::init(2));
+
static cl::opt<unsigned> MaxValueCompareDepth(
"scalar-evolution-max-value-compare-depth", cl::Hidden,
cl::desc("Maximum depth of recursive value complexity comparisons"),
cl::init(2));
+static cl::opt<unsigned>
+ MaxArithDepth("scalar-evolution-max-arith-depth", cl::Hidden,
+ cl::desc("Maximum depth of recursive arithmetics"),
+ cl::init(32));
+
+static cl::opt<unsigned> MaxConstantEvolvingDepth(
+ "scalar-evolution-max-constant-evolving-depth", cl::Hidden,
+ cl::desc("Maximum depth of recursive constant evolving"), cl::init(32));
+
+static cl::opt<unsigned>
+ MaxExtDepth("scalar-evolution-max-ext-depth", cl::Hidden,
+ cl::desc("Maximum depth of recursive SExt/ZExt"),
+ cl::init(8));
+
+static cl::opt<unsigned>
+ MaxAddRecSize("scalar-evolution-max-add-rec-size", cl::Hidden,
+ cl::desc("Max coefficients in AddRec during evolving"),
+ cl::init(16));
+
//===----------------------------------------------------------------------===//
// SCEV class definitions
//===----------------------------------------------------------------------===//
@@ -145,11 +175,12 @@ static cl::opt<unsigned> MaxValueCompareDepth(
// Implementation of the SCEV class.
//
-LLVM_DUMP_METHOD
-void SCEV::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SCEV::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void SCEV::print(raw_ostream &OS) const {
switch (static_cast<SCEVTypes>(getSCEVType())) {
@@ -300,7 +331,7 @@ bool SCEV::isOne() const {
bool SCEV::isAllOnesValue() const {
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(this))
- return SC->getValue()->isAllOnesValue();
+ return SC->getValue()->isMinusOne();
return false;
}
@@ -563,7 +594,7 @@ CompareValueComplexity(SmallSet<std::pair<Value *, Value *>, 8> &EqCache,
static int CompareSCEVComplexity(
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> &EqCacheSCEV,
const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS,
- unsigned Depth = 0) {
+ DominatorTree &DT, unsigned Depth = 0) {
// Fast-path: SCEVs are uniqued so we can do a quick equality check.
if (LHS == RHS)
return 0;
@@ -608,12 +639,19 @@ static int CompareSCEVComplexity(
const SCEVAddRecExpr *LA = cast<SCEVAddRecExpr>(LHS);
const SCEVAddRecExpr *RA = cast<SCEVAddRecExpr>(RHS);
- // Compare addrec loop depths.
+ // There is always a dominance between two recs that are used by one SCEV,
+ // so we can safely sort recs by loop header dominance. We require such
+ // order in getAddExpr.
const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop();
if (LLoop != RLoop) {
- unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth();
- if (LDepth != RDepth)
- return (int)LDepth - (int)RDepth;
+ const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader();
+ assert(LHead != RHead && "Two loops share the same header?");
+ if (DT.dominates(LHead, RHead))
+ return 1;
+ else
+ assert(DT.dominates(RHead, LHead) &&
+ "No dominance between recurrences used by one SCEV?");
+ return -1;
}
// Addrec complexity grows with operand count.
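The new AddRec ordering above relies on the invariant that two recurrences appearing in one SCEV always have dominance-ordered loop headers, so loop-header dominance is a strict, deterministic sort key. A toy comparator with the same shape (Dominates is a stand-in for DT.dominates):

#include <cassert>

// Toy stand-in for DominatorTree::dominates over loop headers; any strict
// total order works for the sketch.
static bool Dominates(int A, int B) { return A < B; }

// Mirrors the AddRec case in CompareSCEVComplexity: for two recurrences used
// in one SCEV, one loop header must dominate the other, so dominance gives a
// deterministic sort key with no arbitrary tie-break.
static int compareRecsByLoopHeader(int LHead, int RHead) {
  if (LHead == RHead)
    return 0; // same loop: fall through to comparing operands (not shown)
  if (Dominates(LHead, RHead))
    return 1;
  assert(Dominates(RHead, LHead) &&
         "No dominance between recurrences used by one SCEV?");
  return -1;
}

int main() {
  assert(compareRecsByLoopHeader(1, 2) == 1);
  assert(compareRecsByLoopHeader(2, 1) == -1);
  assert(compareRecsByLoopHeader(3, 3) == 0);
  return 0;
}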
@@ -624,7 +662,7 @@ static int CompareSCEVComplexity(
// Lexicographically compare.
for (unsigned i = 0; i != LNumOps; ++i) {
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LA->getOperand(i),
- RA->getOperand(i), Depth + 1);
+ RA->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -648,7 +686,7 @@ static int CompareSCEVComplexity(
if (i >= RNumOps)
return 1;
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(i),
- RC->getOperand(i), Depth + 1);
+ RC->getOperand(i), DT, Depth + 1);
if (X != 0)
return X;
}
@@ -662,10 +700,10 @@ static int CompareSCEVComplexity(
// Lexicographically compare udiv expressions.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getLHS(), RC->getLHS(),
- Depth + 1);
+ DT, Depth + 1);
if (X != 0)
return X;
- X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(),
+ X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), DT,
Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
@@ -680,7 +718,7 @@ static int CompareSCEVComplexity(
// Compare cast expressions by operand.
int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(),
- RC->getOperand(), Depth + 1);
+ RC->getOperand(), DT, Depth + 1);
if (X == 0)
EqCacheSCEV.insert({LHS, RHS});
return X;
@@ -703,7 +741,7 @@ static int CompareSCEVComplexity(
/// land in memory.
///
static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
- LoopInfo *LI) {
+ LoopInfo *LI, DominatorTree &DT) {
if (Ops.size() < 2) return; // Noop
SmallSet<std::pair<const SCEV *, const SCEV *>, 8> EqCache;
@@ -711,15 +749,16 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
// This is the common case, which also happens to be trivially simple.
// Special case it.
const SCEV *&LHS = Ops[0], *&RHS = Ops[1];
- if (CompareSCEVComplexity(EqCache, LI, RHS, LHS) < 0)
+ if (CompareSCEVComplexity(EqCache, LI, RHS, LHS, DT) < 0)
std::swap(LHS, RHS);
return;
}
// Do the rough sort by complexity.
std::stable_sort(Ops.begin(), Ops.end(),
- [&EqCache, LI](const SCEV *LHS, const SCEV *RHS) {
- return CompareSCEVComplexity(EqCache, LI, LHS, RHS) < 0;
+ [&EqCache, LI, &DT](const SCEV *LHS, const SCEV *RHS) {
+ return CompareSCEVComplexity(EqCache, LI, LHS, RHS, DT) < 0;
});
// Now that we are sorted by complexity, group elements of the same
@@ -1073,7 +1112,7 @@ static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
APInt Mult(W, i);
unsigned TwoFactors = Mult.countTrailingZeros();
T += TwoFactors;
- Mult = Mult.lshr(TwoFactors);
+ Mult.lshrInPlace(TwoFactors);
OddFactorial *= Mult;
}
@@ -1230,12 +1269,12 @@ static const SCEV *getSignedOverflowLimitForStep(const SCEV *Step,
if (SE->isKnownPositive(Step)) {
*Pred = ICmpInst::ICMP_SLT;
return SE->getConstant(APInt::getSignedMinValue(BitWidth) -
- SE->getSignedRange(Step).getSignedMax());
+ SE->getSignedRangeMax(Step));
}
if (SE->isKnownNegative(Step)) {
*Pred = ICmpInst::ICMP_SGT;
return SE->getConstant(APInt::getSignedMaxValue(BitWidth) -
- SE->getSignedRange(Step).getSignedMin());
+ SE->getSignedRangeMin(Step));
}
return nullptr;
}
@@ -1250,13 +1289,14 @@ static const SCEV *getUnsignedOverflowLimitForStep(const SCEV *Step,
*Pred = ICmpInst::ICMP_ULT;
return SE->getConstant(APInt::getMinValue(BitWidth) -
- SE->getUnsignedRange(Step).getUnsignedMax());
+ SE->getUnsignedRangeMax(Step));
}
namespace {
struct ExtendOpTraitsBase {
- typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *);
+ typedef const SCEV *(ScalarEvolution::*GetExtendExprTy)(const SCEV *, Type *,
+ unsigned);
};
// Used to make code generic over signed and unsigned overflow.
@@ -1314,7 +1354,7 @@ const ExtendOpTraitsBase::GetExtendExprTy ExtendOpTraits<
// "sext/zext(PostIncAR)"
template <typename ExtendOpTy>
static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty,
- ScalarEvolution *SE) {
+ ScalarEvolution *SE, unsigned Depth) {
auto WrapType = ExtendOpTraits<ExtendOpTy>::WrapType;
auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr;
@@ -1361,9 +1401,9 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty,
unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
const SCEV *OperandExtendedStart =
- SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy),
- (SE->*GetExtendExpr)(Step, WideTy));
- if ((SE->*GetExtendExpr)(Start, WideTy) == OperandExtendedStart) {
+ SE->getAddExpr((SE->*GetExtendExpr)(PreStart, WideTy, Depth),
+ (SE->*GetExtendExpr)(Step, WideTy, Depth));
+ if ((SE->*GetExtendExpr)(Start, WideTy, Depth) == OperandExtendedStart) {
if (PreAR && AR->getNoWrapFlags(WrapType)) {
// If we know `AR` == {`PreStart`+`Step`,+,`Step`} is `WrapType` (FlagNSW
// or FlagNUW) and that `PreStart` + `Step` is `WrapType` too, then
@@ -1388,15 +1428,17 @@ static const SCEV *getPreStartForExtend(const SCEVAddRecExpr *AR, Type *Ty,
// Get the normalized zero or sign extended expression for this AddRec's Start.
template <typename ExtendOpTy>
static const SCEV *getExtendAddRecStart(const SCEVAddRecExpr *AR, Type *Ty,
- ScalarEvolution *SE) {
+ ScalarEvolution *SE,
+ unsigned Depth) {
auto GetExtendExpr = ExtendOpTraits<ExtendOpTy>::GetExtendExpr;
- const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE);
+ const SCEV *PreStart = getPreStartForExtend<ExtendOpTy>(AR, Ty, SE, Depth);
if (!PreStart)
- return (SE->*GetExtendExpr)(AR->getStart(), Ty);
+ return (SE->*GetExtendExpr)(AR->getStart(), Ty, Depth);
- return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty),
- (SE->*GetExtendExpr)(PreStart, Ty));
+ return SE->getAddExpr((SE->*GetExtendExpr)(AR->getStepRecurrence(*SE), Ty,
+ Depth),
+ (SE->*GetExtendExpr)(PreStart, Ty, Depth));
}
// Try to prove away overflow by looking at "nearby" add recurrences. A
@@ -1476,8 +1518,8 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start,
return false;
}
-const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
- Type *Ty) {
+const SCEV *
+ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
@@ -1491,7 +1533,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
// zext(zext(x)) --> zext(x)
if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
- return getZeroExtendExpr(SZ->getOperand(), Ty);
+ return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1);
// Before doing any expensive analysis, check to see if we've already
// computed a SCEV for this Op and Ty.
@@ -1501,6 +1543,12 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
ID.AddPointer(Ty);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+ if (Depth > MaxExtDepth) {
+ SCEV *S = new (SCEVAllocator) SCEVZeroExtendExpr(ID.Intern(SCEVAllocator),
+ Op, Ty);
+ UniqueSCEVs.InsertNode(S, IP);
+ return S;
+ }
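// Editor's sketch (not part of this patch; all names are hypothetical):
// the guard above is the usual "cap the recursion, emit the unfolded node"
// pattern, roughly:
//
//   const Expr *simplify(const Expr *E, unsigned Depth) {
//     if (Depth > MaxSimplifyDepth)
//       return E; // too deep: keep the unsimplified form
//     // ... otherwise fold E, recursing into operands with Depth + 1 ...
//   }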
// zext(trunc(x)) --> zext(x) or x or trunc(x)
if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) {
@@ -1535,8 +1583,8 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
// we don't need to do any further analysis.
if (AR->hasNoUnsignedWrap())
return getAddRecExpr(
- getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
- getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1),
+ getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
// Check whether the backedge-taken count is SCEVCouldNotCompute.
// Note that this serves two purposes: It filters out loops that are
@@ -1560,37 +1608,49 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
if (MaxBECount == RecastedMaxBECount) {
Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no unsigned overflow.
- const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step);
- const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul), WideTy);
- const SCEV *WideStart = getZeroExtendExpr(Start, WideTy);
+ const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step,
+ SCEV::FlagAnyWrap, Depth + 1);
+ const SCEV *ZAdd = getZeroExtendExpr(getAddExpr(Start, ZMul,
+ SCEV::FlagAnyWrap,
+ Depth + 1),
+ WideTy, Depth + 1);
+ const SCEV *WideStart = getZeroExtendExpr(Start, WideTy, Depth + 1);
const SCEV *WideMaxBECount =
- getZeroExtendExpr(CastedMaxBECount, WideTy);
+ getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1);
const SCEV *OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
- getZeroExtendExpr(Step, WideTy)));
+ getZeroExtendExpr(Step, WideTy, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
if (ZAdd == OperandExtendedAdd) {
// Cache knowledge of AR NUW, which is propagated to this AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
- getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
- getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
+ Depth + 1),
+ getZeroExtendExpr(Step, Ty, Depth + 1), L,
+ AR->getNoWrapFlags());
}
// Similar to above, only this time treat the step value as signed.
// This covers loops that count down.
OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
- getSignExtendExpr(Step, WideTy)));
+ getSignExtendExpr(Step, WideTy, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
if (ZAdd == OperandExtendedAdd) {
// Cache knowledge of AR NW, which is propagated to this AddRec.
// Negative step causes unsigned wrap, but it still can't self-wrap.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
- getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
- getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
+ Depth + 1),
+ getSignExtendExpr(Step, Ty, Depth + 1), L,
+ AR->getNoWrapFlags());
}
}
}
@@ -1611,7 +1671,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
// is safe.
if (isKnownPositive(Step)) {
const SCEV *N = getConstant(APInt::getMinValue(BitWidth) -
- getUnsignedRange(Step).getUnsignedMax());
+ getUnsignedRangeMax(Step));
if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, AR, N) ||
(isLoopEntryGuardedByCond(L, ICmpInst::ICMP_ULT, Start, N) &&
isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT,
@@ -1621,12 +1681,14 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
- getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
- getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
+ Depth + 1),
+ getZeroExtendExpr(Step, Ty, Depth + 1), L,
+ AR->getNoWrapFlags());
}
} else if (isKnownNegative(Step)) {
const SCEV *N = getConstant(APInt::getMaxValue(BitWidth) -
- getSignedRange(Step).getSignedMin());
+ getSignedRangeMin(Step));
if (isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT, AR, N) ||
(isLoopEntryGuardedByCond(L, ICmpInst::ICMP_UGT, Start, N) &&
isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_UGT,
@@ -1637,8 +1699,10 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
- getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
- getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this,
+ Depth + 1),
+ getSignExtendExpr(Step, Ty, Depth + 1), L,
+ AR->getNoWrapFlags());
}
}
}
@@ -1646,8 +1710,8 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
if (proveNoWrapByVaryingStart<SCEVZeroExtendExpr>(Start, Step, L)) {
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNUW);
return getAddRecExpr(
- getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this),
- getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVZeroExtendExpr>(AR, Ty, this, Depth + 1),
+ getZeroExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
}
}
@@ -1658,8 +1722,8 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
// commute the zero extension with the addition operation.
SmallVector<const SCEV *, 4> Ops;
for (const auto *Op : SA->operands())
- Ops.push_back(getZeroExtendExpr(Op, Ty));
- return getAddExpr(Ops, SCEV::FlagNUW);
+ Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1));
+ return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1);
}
}
@@ -1672,8 +1736,8 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
return S;
}
-const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
- Type *Ty) {
+const SCEV *
+ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
@@ -1687,11 +1751,11 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
// sext(sext(x)) --> sext(x)
if (const SCEVSignExtendExpr *SS = dyn_cast<SCEVSignExtendExpr>(Op))
- return getSignExtendExpr(SS->getOperand(), Ty);
+ return getSignExtendExpr(SS->getOperand(), Ty, Depth + 1);
// sext(zext(x)) --> zext(x)
if (const SCEVZeroExtendExpr *SZ = dyn_cast<SCEVZeroExtendExpr>(Op))
- return getZeroExtendExpr(SZ->getOperand(), Ty);
+ return getZeroExtendExpr(SZ->getOperand(), Ty, Depth + 1);
// Before doing any expensive analysis, check to see if we've already
// computed a SCEV for this Op and Ty.
@@ -1701,6 +1765,13 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
ID.AddPointer(Ty);
void *IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
+ // Limit recursion depth.
+ if (Depth > MaxExtDepth) {
+ SCEV *S = new (SCEVAllocator) SCEVSignExtendExpr(ID.Intern(SCEVAllocator),
+ Op, Ty);
+ UniqueSCEVs.InsertNode(S, IP);
+ return S;
+ }
// sext(trunc(x)) --> sext(x) or x or trunc(x)
if (const SCEVTruncateExpr *ST = dyn_cast<SCEVTruncateExpr>(Op)) {
@@ -1726,8 +1797,9 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
const APInt &C2 = SC2->getAPInt();
if (C1.isStrictlyPositive() && C2.isStrictlyPositive() &&
C2.ugt(C1) && C2.isPowerOf2())
- return getAddExpr(getSignExtendExpr(SC1, Ty),
- getSignExtendExpr(SMul, Ty));
+ return getAddExpr(getSignExtendExpr(SC1, Ty, Depth + 1),
+ getSignExtendExpr(SMul, Ty, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
}
}
}
@@ -1738,8 +1810,8 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
// commute the sign extension with the addition operation.
SmallVector<const SCEV *, 4> Ops;
for (const auto *Op : SA->operands())
- Ops.push_back(getSignExtendExpr(Op, Ty));
- return getAddExpr(Ops, SCEV::FlagNSW);
+ Ops.push_back(getSignExtendExpr(Op, Ty, Depth + 1));
+ return getAddExpr(Ops, SCEV::FlagNSW, Depth + 1);
}
}
// If the input value is a chrec scev, and we can prove that the value
@@ -1762,8 +1834,8 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
// we don't need to do any further analysis.
if (AR->hasNoSignedWrap())
return getAddRecExpr(
- getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
- getSignExtendExpr(Step, Ty), L, SCEV::FlagNSW);
+ getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1),
+ getSignExtendExpr(Step, Ty, Depth + 1), L, SCEV::FlagNSW);
// Check whether the backedge-taken count is SCEVCouldNotCompute.
// Note that this serves two purposes: It filters out loops that are
@@ -1787,29 +1859,39 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
if (MaxBECount == RecastedMaxBECount) {
Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no signed overflow.
- const SCEV *SMul = getMulExpr(CastedMaxBECount, Step);
- const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul), WideTy);
- const SCEV *WideStart = getSignExtendExpr(Start, WideTy);
+ const SCEV *SMul = getMulExpr(CastedMaxBECount, Step,
+ SCEV::FlagAnyWrap, Depth + 1);
+ const SCEV *SAdd = getSignExtendExpr(getAddExpr(Start, SMul,
+ SCEV::FlagAnyWrap,
+ Depth + 1),
+ WideTy, Depth + 1);
+ const SCEV *WideStart = getSignExtendExpr(Start, WideTy, Depth + 1);
const SCEV *WideMaxBECount =
- getZeroExtendExpr(CastedMaxBECount, WideTy);
+ getZeroExtendExpr(CastedMaxBECount, WideTy, Depth + 1);
const SCEV *OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
- getSignExtendExpr(Step, WideTy)));
+ getSignExtendExpr(Step, WideTy, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
if (SAdd == OperandExtendedAdd) {
// Cache knowledge of AR NSW, which is propagated to this AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
// Return the expression with the addrec on the outside.
return getAddRecExpr(
- getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
- getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this,
+ Depth + 1),
+ getSignExtendExpr(Step, Ty, Depth + 1), L,
+ AR->getNoWrapFlags());
}
// Similar to above, only this time treat the step value as unsigned.
// This covers loops that count up with an unsigned step.
OperandExtendedAdd =
getAddExpr(WideStart,
getMulExpr(WideMaxBECount,
- getZeroExtendExpr(Step, WideTy)));
+ getZeroExtendExpr(Step, WideTy, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
if (SAdd == OperandExtendedAdd) {
// If AR wraps around then
//
@@ -1823,8 +1905,10 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
// Return the expression with the addrec on the outside.
return getAddRecExpr(
- getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
- getZeroExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this,
+ Depth + 1),
+ getZeroExtendExpr(Step, Ty, Depth + 1), L,
+ AR->getNoWrapFlags());
}
}
}
@@ -1855,8 +1939,8 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
// Cache knowledge of AR NSW, then propagate NSW to the wide AddRec.
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
return getAddRecExpr(
- getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
- getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1),
+ getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
}
}
@@ -1870,25 +1954,26 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
const APInt &C2 = SC2->getAPInt();
if (C1.isStrictlyPositive() && C2.isStrictlyPositive() && C2.ugt(C1) &&
C2.isPowerOf2()) {
- Start = getSignExtendExpr(Start, Ty);
+ Start = getSignExtendExpr(Start, Ty, Depth + 1);
const SCEV *NewAR = getAddRecExpr(getZero(AR->getType()), Step, L,
AR->getNoWrapFlags());
- return getAddExpr(Start, getSignExtendExpr(NewAR, Ty));
+ return getAddExpr(Start, getSignExtendExpr(NewAR, Ty, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
}
}
if (proveNoWrapByVaryingStart<SCEVSignExtendExpr>(Start, Step, L)) {
const_cast<SCEVAddRecExpr *>(AR)->setNoWrapFlags(SCEV::FlagNSW);
return getAddRecExpr(
- getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this),
- getSignExtendExpr(Step, Ty), L, AR->getNoWrapFlags());
+ getExtendAddRecStart<SCEVSignExtendExpr>(AR, Ty, this, Depth + 1),
+ getSignExtendExpr(Step, Ty, Depth + 1), L, AR->getNoWrapFlags());
}
}
// If the input value is provably positive and we could not simplify
// away the sext build a zext instead.
if (isKnownNonNegative(Op))
- return getZeroExtendExpr(Op, Ty);
+ return getZeroExtendExpr(Op, Ty, Depth + 1);
// The cast wasn't folded; create an explicit cast node.
// Recompute the insert position, as it may have been invalidated.
@@ -2093,9 +2178,66 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
return Flags;
}
+bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L) {
+ if (!isLoopInvariant(S, L))
+ return false;
+ // If a value depends on a SCEVUnknown which is defined after the loop, we
+ // conservatively assume that we cannot calculate it at the loop's entry.
+ struct FindDominatedSCEVUnknown {
+ bool Found = false;
+ const Loop *L;
+ DominatorTree &DT;
+ LoopInfo &LI;
+
+ FindDominatedSCEVUnknown(const Loop *L, DominatorTree &DT, LoopInfo &LI)
+ : L(L), DT(DT), LI(LI) {}
+
+ bool checkSCEVUnknown(const SCEVUnknown *SU) {
+ if (auto *I = dyn_cast<Instruction>(SU->getValue())) {
+ if (DT.dominates(L->getHeader(), I->getParent()))
+ Found = true;
+ else
+ assert(DT.dominates(I->getParent(), L->getHeader()) &&
+ "No dominance relationship between SCEV and loop?");
+ }
+ return false;
+ }
+
+ bool follow(const SCEV *S) {
+ switch (static_cast<SCEVTypes>(S->getSCEVType())) {
+ case scConstant:
+ return false;
+ case scAddRecExpr:
+ case scTruncate:
+ case scZeroExtend:
+ case scSignExtend:
+ case scAddExpr:
+ case scMulExpr:
+ case scUMaxExpr:
+ case scSMaxExpr:
+ case scUDivExpr:
+ return true;
+ case scUnknown:
+ return checkSCEVUnknown(cast<SCEVUnknown>(S));
+ case scCouldNotCompute:
+ llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
+ }
+ return false;
+ }
+
+ bool isDone() { return Found; }
+ };
+
+ FindDominatedSCEVUnknown FSU(L, DT, LI);
+ SCEVTraversal<FindDominatedSCEVUnknown> ST(FSU);
+ ST.visitAll(S);
+ return !FSU.Found;
+}
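// Editor's illustration (not part of this patch): SCEVTraversal drives a
// visitor through two hooks -- follow(S) decides whether to descend into
// S's operands, and isDone() lets the walk stop early, exactly as
// FindDominatedSCEVUnknown uses them above. A minimal visitor that merely
// counts SCEVUnknown leaves would look like (hypothetical helper):
//
//   struct CountUnknowns {
//     unsigned Count = 0;
//     bool follow(const SCEV *S) {
//       if (isa<SCEVUnknown>(S))
//         ++Count;
//       return true;                  // keep descending into operands
//     }
//     bool isDone() { return false; } // never cut the walk short
//   };
//   CountUnknowns CU;
//   SCEVTraversal<CountUnknowns>(CU).visitAll(S);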
+
/// Get a canonical add expression, or something simpler if possible.
const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
- SCEV::NoWrapFlags Flags) {
+ SCEV::NoWrapFlags Flags,
+ unsigned Depth) {
assert(!(Flags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) &&
"only nuw or nsw allowed");
assert(!Ops.empty() && "Cannot get empty add!");
@@ -2108,7 +2250,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags);
@@ -2134,6 +2276,10 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
if (Ops.size() == 1) return Ops[0];
}
+ // Limit the recursion depth.
+ if (Depth > MaxArithDepth)
+ return getOrCreateAddExpr(Ops, Flags);
+
// Okay, check to see if the same value occurs in the operand list more than
// once. If so, merge them together into a multiply expression. Since we
// sorted the list, these values are required to be adjacent.
@@ -2147,7 +2293,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
++Count;
// Merge the values into a multiply.
const SCEV *Scale = getConstant(Ty, Count);
- const SCEV *Mul = getMulExpr(Scale, Ops[i]);
+ const SCEV *Mul = getMulExpr(Scale, Ops[i], SCEV::FlagAnyWrap, Depth + 1);
if (Ops.size() == Count)
return Mul;
Ops[i] = Mul;
@@ -2197,7 +2343,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
}
}
if (Ok)
- LargeOps.push_back(getMulExpr(LargeMulOps));
+ LargeOps.push_back(getMulExpr(LargeMulOps, SCEV::FlagAnyWrap, Depth + 1));
} else {
Ok = false;
break;
@@ -2205,7 +2351,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
}
if (Ok) {
// Evaluate the expression in the larger type.
- const SCEV *Fold = getAddExpr(LargeOps, Flags);
+ const SCEV *Fold = getAddExpr(LargeOps, Flags, Depth + 1);
// If it folds to something simple, use it. Otherwise, don't.
if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
return getTruncateExpr(Fold, DstType);
@@ -2220,6 +2366,9 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
if (Idx < Ops.size()) {
bool DeletedAdd = false;
while (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Ops[Idx])) {
+ if (Ops.size() > AddOpsInlineThreshold ||
+ Add->getNumOperands() > AddOpsInlineThreshold)
+ break;
// If we have an add, expand the add operands onto the end of the operands
// list.
Ops.erase(Ops.begin()+Idx);
@@ -2231,7 +2380,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// and they are not necessarily sorted. Recurse to resort and resimplify
// any operands we just acquired.
if (DeletedAdd)
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Skip over the add expression until we get to a multiply.
@@ -2266,13 +2415,15 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops.push_back(getConstant(AccumulatedConstant));
for (auto &MulOp : MulOpLists)
if (MulOp.first != 0)
- Ops.push_back(getMulExpr(getConstant(MulOp.first),
- getAddExpr(MulOp.second)));
+ Ops.push_back(getMulExpr(
+ getConstant(MulOp.first),
+ getAddExpr(MulOp.second, SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1));
if (Ops.empty())
return getZero(Ty);
if (Ops.size() == 1)
return Ops[0];
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
}
@@ -2295,11 +2446,12 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(),
Mul->op_begin()+MulOp);
MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
- InnerMul = getMulExpr(MulOps);
+ InnerMul = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1);
}
- const SCEV *One = getOne(Ty);
- const SCEV *AddOne = getAddExpr(One, InnerMul);
- const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV);
+ SmallVector<const SCEV *, 2> TwoOps = {getOne(Ty), InnerMul};
+ const SCEV *AddOne = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
+ const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV,
+ SCEV::FlagAnyWrap, Depth + 1);
if (Ops.size() == 2) return OuterMul;
if (AddOp < Idx) {
Ops.erase(Ops.begin()+AddOp);
@@ -2309,7 +2461,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops.erase(Ops.begin()+AddOp-1);
}
Ops.push_back(OuterMul);
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Check this multiply against other multiplies being added together.
@@ -2328,22 +2480,25 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
SmallVector<const SCEV *, 4> MulOps(Mul->op_begin(),
Mul->op_begin()+MulOp);
MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
- InnerMul1 = getMulExpr(MulOps);
+ InnerMul1 = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1);
}
const SCEV *InnerMul2 = OtherMul->getOperand(OMulOp == 0);
if (OtherMul->getNumOperands() != 2) {
SmallVector<const SCEV *, 4> MulOps(OtherMul->op_begin(),
OtherMul->op_begin()+OMulOp);
MulOps.append(OtherMul->op_begin()+OMulOp+1, OtherMul->op_end());
- InnerMul2 = getMulExpr(MulOps);
+ InnerMul2 = getMulExpr(MulOps, SCEV::FlagAnyWrap, Depth + 1);
}
- const SCEV *InnerMulSum = getAddExpr(InnerMul1,InnerMul2);
- const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum);
+ SmallVector<const SCEV *, 2> TwoOps = {InnerMul1, InnerMul2};
+ const SCEV *InnerMulSum =
+ getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
+ const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum,
+ SCEV::FlagAnyWrap, Depth + 1);
if (Ops.size() == 2) return OuterMul;
Ops.erase(Ops.begin()+Idx);
Ops.erase(Ops.begin()+OtherMulIdx-1);
Ops.push_back(OuterMul);
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
}
}
@@ -2363,7 +2518,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
const Loop *AddRecLoop = AddRec->getLoop();
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- if (isLoopInvariant(Ops[i], AddRecLoop)) {
+ if (isAvailableAtLoopEntry(Ops[i], AddRecLoop)) {
LIOps.push_back(Ops[i]);
Ops.erase(Ops.begin()+i);
--i; --e;
@@ -2379,7 +2534,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// This follows from the fact that the no-wrap flags on the outer add
// expression are applicable on the 0th iteration, when the add recurrence
// will be equal to its start value.
- AddRecOps[0] = getAddExpr(LIOps, Flags);
+ AddRecOps[0] = getAddExpr(LIOps, Flags, Depth + 1);
// Build the new addrec. Propagate the NUW and NSW flags if both the
// outer add and the inner addrec are guaranteed to have no overflow.
@@ -2396,7 +2551,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops[i] = NewRec;
break;
}
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// Okay, if there weren't any loop invariants to be folded, check to see if
@@ -2404,31 +2559,40 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// added together. If so, we can fold them.
for (unsigned OtherIdx = Idx+1;
OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
+ ++OtherIdx) {
+ // We expect the AddRecExprs to be sorted in reverse dominance order,
+ // so that the first AddRecExpr found is dominated by all the others.
+ assert(DT.dominates(
+ cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()->getHeader(),
+ AddRec->getLoop()->getHeader()) &&
+ "AddRecExprs are not sorted in reverse dominance order?");
if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
// Other + {A,+,B}<L> + {C,+,D}<L> --> Other + {A+C,+,B+D}<L>
SmallVector<const SCEV *, 4> AddRecOps(AddRec->op_begin(),
AddRec->op_end());
for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
- if (const auto *OtherAddRec = dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]))
- if (OtherAddRec->getLoop() == AddRecLoop) {
- for (unsigned i = 0, e = OtherAddRec->getNumOperands();
- i != e; ++i) {
- if (i >= AddRecOps.size()) {
- AddRecOps.append(OtherAddRec->op_begin()+i,
- OtherAddRec->op_end());
- break;
- }
- AddRecOps[i] = getAddExpr(AddRecOps[i],
- OtherAddRec->getOperand(i));
+ ++OtherIdx) {
+ const auto *OtherAddRec = cast<SCEVAddRecExpr>(Ops[OtherIdx]);
+ if (OtherAddRec->getLoop() == AddRecLoop) {
+ for (unsigned i = 0, e = OtherAddRec->getNumOperands();
+ i != e; ++i) {
+ if (i >= AddRecOps.size()) {
+ AddRecOps.append(OtherAddRec->op_begin()+i,
+ OtherAddRec->op_end());
+ break;
}
- Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+ SmallVector<const SCEV *, 2> TwoOps = {
+ AddRecOps[i], OtherAddRec->getOperand(i)};
+ AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
}
+ Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+ }
+ }
// Step size has changed, so we cannot guarantee no self-wraparound.
Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
- return getAddExpr(Ops);
+ return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
+ }
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
@@ -2436,17 +2600,44 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// Okay, it looks like we really DO need an add expr. Check to see if we
// already have one, otherwise create a new one.
+ return getOrCreateAddExpr(Ops, Flags);
+}
+
+const SCEV *
+ScalarEvolution::getOrCreateAddExpr(SmallVectorImpl<const SCEV *> &Ops,
+ SCEV::NoWrapFlags Flags) {
FoldingSetNodeID ID;
ID.AddInteger(scAddExpr);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
void *IP = nullptr;
SCEVAddExpr *S =
- static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+ static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
if (!S) {
const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
std::uninitialized_copy(Ops.begin(), Ops.end(), O);
- S = new (SCEVAllocator) SCEVAddExpr(ID.Intern(SCEVAllocator),
+ S = new (SCEVAllocator)
+ SCEVAddExpr(ID.Intern(SCEVAllocator), O, Ops.size());
+ UniqueSCEVs.InsertNode(S, IP);
+ }
+ S->setNoWrapFlags(Flags);
+ return S;
+}
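// Editor's note (not part of this patch): getOrCreateAddExpr and its mul
// twin below both follow the standard FoldingSet uniquing idiom -- build a
// FoldingSetNodeID from the node kind plus operand pointers, probe with
// FindNodeOrInsertPos, and only allocate and InsertNode on a miss -- so
// structurally identical SCEVs share one object and compare by pointer.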
+
+const SCEV *
+ScalarEvolution::getOrCreateMulExpr(SmallVectorImpl<const SCEV *> &Ops,
+ SCEV::NoWrapFlags Flags) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(scMulExpr);
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ ID.AddPointer(Ops[i]);
+ void *IP = nullptr;
+ SCEVMulExpr *S =
+ static_cast<SCEVMulExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+ if (!S) {
+ const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+ std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+ S = new (SCEVAllocator) SCEVMulExpr(ID.Intern(SCEVAllocator),
O, Ops.size());
UniqueSCEVs.InsertNode(S, IP);
}
@@ -2506,7 +2697,8 @@ static bool containsConstantSomewhere(const SCEV *StartExpr) {
/// Get a canonical multiply expression, or something simpler if possible.
const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
- SCEV::NoWrapFlags Flags) {
+ SCEV::NoWrapFlags Flags,
+ unsigned Depth) {
assert(Flags == maskFlags(Flags, SCEV::FlagNUW | SCEV::FlagNSW) &&
"only nuw or nsw allowed");
assert(!Ops.empty() && "Cannot get empty mul!");
@@ -2519,10 +2711,14 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags);
+ // Limit the recursion depth.
+ if (Depth > MaxArithDepth)
+ return getOrCreateMulExpr(Ops, Flags);
+
// If there are any constants, fold them together.
unsigned Idx = 0;
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
@@ -2534,8 +2730,11 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
// apply this transformation as well.
if (Add->getNumOperands() == 2)
if (containsConstantSomewhere(Add))
- return getAddExpr(getMulExpr(LHSC, Add->getOperand(0)),
- getMulExpr(LHSC, Add->getOperand(1)));
+ return getAddExpr(getMulExpr(LHSC, Add->getOperand(0),
+ SCEV::FlagAnyWrap, Depth + 1),
+ getMulExpr(LHSC, Add->getOperand(1),
+ SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
++Idx;
while (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(Ops[Idx])) {
@@ -2549,7 +2748,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
}
// If we are left with a constant one being multiplied, strip it off.
- if (cast<SCEVConstant>(Ops[0])->getValue()->equalsInt(1)) {
+ if (cast<SCEVConstant>(Ops[0])->getValue()->isOne()) {
Ops.erase(Ops.begin());
--Idx;
} else if (cast<SCEVConstant>(Ops[0])->getValue()->isZero()) {
@@ -2563,17 +2762,19 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
SmallVector<const SCEV *, 4> NewOps;
bool AnyFolded = false;
for (const SCEV *AddOp : Add->operands()) {
- const SCEV *Mul = getMulExpr(Ops[0], AddOp);
+ const SCEV *Mul = getMulExpr(Ops[0], AddOp, SCEV::FlagAnyWrap,
+ Depth + 1);
if (!isa<SCEVMulExpr>(Mul)) AnyFolded = true;
NewOps.push_back(Mul);
}
if (AnyFolded)
- return getAddExpr(NewOps);
+ return getAddExpr(NewOps, SCEV::FlagAnyWrap, Depth + 1);
} else if (const auto *AddRec = dyn_cast<SCEVAddRecExpr>(Ops[1])) {
// Negation preserves a recurrence's no self-wrap property.
SmallVector<const SCEV *, 4> Operands;
for (const SCEV *AddRecOp : AddRec->operands())
- Operands.push_back(getMulExpr(Ops[0], AddRecOp));
+ Operands.push_back(getMulExpr(Ops[0], AddRecOp, SCEV::FlagAnyWrap,
+ Depth + 1));
return getAddRecExpr(Operands, AddRec->getLoop(),
AddRec->getNoWrapFlags(SCEV::FlagNW));
@@ -2595,18 +2796,18 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
while (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[Idx])) {
if (Ops.size() > MulOpsInlineThreshold)
break;
- // If we have an mul, expand the mul operands onto the end of the operands
- // list.
+ // If we have a mul, expand the mul operands onto the end of the
+ // operands list.
Ops.erase(Ops.begin()+Idx);
Ops.append(Mul->op_begin(), Mul->op_end());
DeletedMul = true;
}
- // If we deleted at least one mul, we added operands to the end of the list,
- // and they are not necessarily sorted. Recurse to resort and resimplify
- // any operands we just acquired.
+ // If we deleted at least one mul, we added operands to the end of the
+ // list, and they are not necessarily sorted. Recurse to resort and
+ // resimplify any operands we just acquired.
if (DeletedMul)
- return getMulExpr(Ops);
+ return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
// If there are any add recurrences in the operands list, see if any other
@@ -2617,13 +2818,13 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
// Scan over all recurrences, trying to fold loop invariants into them.
for (; Idx < Ops.size() && isa<SCEVAddRecExpr>(Ops[Idx]); ++Idx) {
- // Scan all of the other operands to this mul and add them to the vector if
- // they are loop invariant w.r.t. the recurrence.
+ // Scan all of the other operands to this mul and add them to the vector
+ // if they are loop invariant w.r.t. the recurrence.
SmallVector<const SCEV *, 8> LIOps;
const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
const Loop *AddRecLoop = AddRec->getLoop();
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- if (isLoopInvariant(Ops[i], AddRecLoop)) {
+ if (isAvailableAtLoopEntry(Ops[i], AddRecLoop)) {
LIOps.push_back(Ops[i]);
Ops.erase(Ops.begin()+i);
--i; --e;
@@ -2634,9 +2835,10 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
// NLI * LI * {Start,+,Step} --> NLI * {LI*Start,+,LI*Step}
SmallVector<const SCEV *, 4> NewOps;
NewOps.reserve(AddRec->getNumOperands());
- const SCEV *Scale = getMulExpr(LIOps);
+ const SCEV *Scale = getMulExpr(LIOps, SCEV::FlagAnyWrap, Depth + 1);
for (unsigned i = 0, e = AddRec->getNumOperands(); i != e; ++i)
- NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i)));
+ NewOps.push_back(getMulExpr(Scale, AddRec->getOperand(i),
+ SCEV::FlagAnyWrap, Depth + 1));
// Build the new addrec. Propagate the NUW and NSW flags if both the
// outer mul and the inner addrec are guaranteed to have no overflow.
@@ -2655,12 +2857,12 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
Ops[i] = NewRec;
break;
}
- return getMulExpr(Ops);
+ return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
}
- // Okay, if there weren't any loop invariants to be folded, check to see if
- // there are multiple AddRec's with the same loop induction variable being
- // multiplied together. If so, we can fold them.
+ // Okay, if there weren't any loop invariants to be folded, check to see
+ // if there are multiple AddRec's with the same loop induction variable
+ // being multiplied together. If so, we can fold them.
// {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
// = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
@@ -2681,6 +2883,12 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
if (!OtherAddRec || OtherAddRec->getLoop() != AddRecLoop)
continue;
+ // Limit the max number of arguments to avoid creating unreasonably big
+ // SCEVAddRecs with very complex operands.
+ if (AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1 >
+ MaxAddRecSize)
+ continue;
+
bool Overflow = false;
Type *Ty = AddRec->getType();
bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
@@ -2702,7 +2910,9 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
const SCEV *CoeffTerm = getConstant(Ty, Coeff);
const SCEV *Term1 = AddRec->getOperand(y-z);
const SCEV *Term2 = OtherAddRec->getOperand(z);
- Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2));
+ Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1, Term2,
+ SCEV::FlagAnyWrap, Depth + 1),
+ SCEV::FlagAnyWrap, Depth + 1);
}
}
AddRecOps.push_back(Term);
@@ -2720,7 +2930,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
}
}
if (OpsModified)
- return getMulExpr(Ops);
+ return getMulExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
@@ -2728,22 +2938,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
// Okay, it looks like we really DO need an mul expr. Check to see if we
// already have one, otherwise create a new one.
- FoldingSetNodeID ID;
- ID.AddInteger(scMulExpr);
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- ID.AddPointer(Ops[i]);
- void *IP = nullptr;
- SCEVMulExpr *S =
- static_cast<SCEVMulExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
- if (!S) {
- const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
- std::uninitialized_copy(Ops.begin(), Ops.end(), O);
- S = new (SCEVAllocator) SCEVMulExpr(ID.Intern(SCEVAllocator),
- O, Ops.size());
- UniqueSCEVs.InsertNode(S, IP);
- }
- S->setNoWrapFlags(Flags);
- return S;
+ return getOrCreateMulExpr(Ops, Flags);
}
/// Get a canonical unsigned division expression, or something simpler if
@@ -2755,7 +2950,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
"SCEVUDivExpr operand types don't match!");
if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
- if (RHSC->getValue()->equalsInt(1))
+ if (RHSC->getValue()->isOne())
return LHS; // X udiv 1 --> x
// If the denominator is zero, the result of the udiv is undefined. Don't
// try to analyze it, because the resolution chosen here may differ from
@@ -2875,7 +3070,7 @@ static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
else if (ABW < BBW)
A = A.zext(BBW);
- return APIntOps::GreatestCommonDivisor(A, B);
+ return APIntOps::GreatestCommonDivisor(std::move(A), std::move(B));
}
/// Get a canonical unsigned division expression, or something simpler if
@@ -2889,7 +3084,7 @@ const SCEV *ScalarEvolution::getUDivExactExpr(const SCEV *LHS,
// end of this file for inspiration.
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS);
- if (!Mul)
+ if (!Mul || !Mul->hasNoUnsignedWrap())
return getUDivExpr(LHS, RHS);
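// Editor's note (illustration): the new hasNoUnsignedWrap() check matters
// because cancelling a common factor is unsound for a wrapping multiply.
// In i8, X = 86 gives X*6 = 516, which wraps to 4, and 4 /u 3 = 1, while
// the folded form X*2 = 172; the fold is only valid under <nuw>.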
if (const SCEVConstant *RHSCst = dyn_cast<SCEVConstant>(RHS)) {
@@ -3116,7 +3311,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -3217,7 +3412,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
#endif
// Sort by complexity, this groups all similar expression types together.
- GroupByComplexity(Ops, &LI);
+ GroupByComplexity(Ops, &LI, DT);
// If there are any constants, fold them together.
unsigned Idx = 0;
@@ -3385,6 +3580,10 @@ Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
return getDataLayout().getIntPtrType(Ty);
}
+Type *ScalarEvolution::getWiderType(Type *T1, Type *T2) const {
+ return getTypeSizeInBits(T1) >= getTypeSizeInBits(T2) ? T1 : T2;
+}
+
const SCEV *ScalarEvolution::getCouldNotCompute() {
return CouldNotCompute.get();
}
@@ -3542,7 +3741,8 @@ const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
}
const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
- SCEV::NoWrapFlags Flags) {
+ SCEV::NoWrapFlags Flags,
+ unsigned Depth) {
// Fast path: X - X --> 0.
if (LHS == RHS)
return getZero(LHS->getType());
@@ -3551,7 +3751,7 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
// makes it so that we cannot make much use of NUW.
auto AddFlags = SCEV::FlagAnyWrap;
const bool RHSIsNotMinSigned =
- !getSignedRange(RHS).getSignedMin().isMinSignedValue();
+ !getSignedRangeMin(RHS).isMinSignedValue();
if (maskFlags(Flags, SCEV::FlagNSW) == SCEV::FlagNSW) {
// Let M be the minimum representable signed value. Then (-1)*RHS
// signed-wraps if and only if RHS is M. That can happen even for
@@ -3576,7 +3776,7 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
// larger scope than intended.
auto NegFlags = RHSIsNotMinSigned ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
- return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags);
+ return getAddExpr(LHS, getNegativeSCEV(RHS, NegFlags), AddFlags, Depth);
}
const SCEV *
@@ -3770,7 +3970,7 @@ public:
: SCEVRewriteVisitor(SE), L(L), Valid(true) {}
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
- if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant))
+ if (!SE.isLoopInvariant(Expr, L))
Valid = false;
return Expr;
}
@@ -3804,7 +4004,7 @@ public:
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
// Only allow AddRecExprs for this loop.
- if (!(SE.getLoopDisposition(Expr, L) == ScalarEvolution::LoopInvariant))
+ if (!SE.isLoopInvariant(Expr, L))
Valid = false;
return Expr;
}
@@ -3909,9 +4109,9 @@ static Optional<BinaryOp> MatchBinaryOp(Value *V, DominatorTree &DT) {
case Instruction::Xor:
if (auto *RHSC = dyn_cast<ConstantInt>(Op->getOperand(1)))
- // If the RHS of the xor is a signbit, then this is just an add.
- // Instcombine turns add of signbit into xor as a strength reduction step.
- if (RHSC->getValue().isSignBit())
+ // If the RHS of the xor is a signmask, then this is just an add.
+ // Instcombine turns add of signmask into xor as a strength reduction step.
+ if (RHSC->getValue().isSignMask())
return BinaryOp(Instruction::Add, Op->getOperand(0), Op->getOperand(1));
return BinaryOp(Op);
@@ -3984,6 +4184,369 @@ static Optional<BinaryOp> MatchBinaryOp(Value *V, DominatorTree &DT) {
return None;
}
+/// Helper function for createAddRecFromPHIWithCasts. We have a phi
+/// node whose symbolic (unknown) SCEV is \p SymbolicPHI, which is updated via
+/// the loop backedge by a SCEVAddExpr, possibly also with a few casts on the
+/// way. This function checks if \p Op, an operand of this SCEVAddExpr,
+/// follows one of the following patterns:
+/// Op == (SExt ix (Trunc iy (%SymbolicPHI) to ix) to iy)
+/// Op == (ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy)
+/// If the SCEV expression of \p Op conforms to one of the expected patterns,
+/// we return the type of the truncation operation, and indicate whether the
+/// truncated type should be treated as signed/unsigned by setting
+/// \p Signed to true/false, respectively.
+static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
+ bool &Signed, ScalarEvolution &SE) {
+
+ // The case where Op == SymbolicPHI (that is, with no type conversions on
+ // the way) is handled by the regular add recurrence creating logic and
+ // would already have been triggered in createAddRecFromPHI. Reaching it here
+ // means that createAddRecFromPHI had failed for this PHI before (e.g.,
+ // because one of the other operands of the SCEVAddExpr updating this PHI is
+ // not invariant).
+ //
+ // Here we look for the case where Op = (ext(trunc(SymbolicPHI))), and in
+ // this case predicates that allow us to prove that Op == SymbolicPHI will
+ // be added.
+ if (Op == SymbolicPHI)
+ return nullptr;
+
+ unsigned SourceBits = SE.getTypeSizeInBits(SymbolicPHI->getType());
+ unsigned NewBits = SE.getTypeSizeInBits(Op->getType());
+ if (SourceBits != NewBits)
+ return nullptr;
+
+ const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
+ const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
+ if (!SExt && !ZExt)
+ return nullptr;
+ const SCEVTruncateExpr *Trunc =
+ SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
+ : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
+ if (!Trunc)
+ return nullptr;
+ const SCEV *X = Trunc->getOperand();
+ if (X != SymbolicPHI)
+ return nullptr;
+ Signed = SExt != nullptr;
+ return Trunc->getType();
+}
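// Editor's illustration (not part of this patch; names and types are just
// examples): at the IR level, the operand shape matched above corresponds
// to an update chain such as
//
//   %x = phi i64 [ %start, %entry ], [ %bevalue, %loop ]
//   %t = trunc i64 %x to i32
//   %e = sext i32 %t to i64 ; or zext, in which case Signed = false
//   %bevalue = add i64 %e, %step
//
// for which this function returns i32 and sets Signed = true.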
+
+static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
+ if (!PN->getType()->isIntegerTy())
+ return nullptr;
+ const Loop *L = LI.getLoopFor(PN->getParent());
+ if (!L || L->getHeader() != PN->getParent())
+ return nullptr;
+ return L;
+}
+
+// Analyze \p SymbolicPHI, a SCEV expression of a phi node, and check if the
+// computation that updates the phi follows the following pattern:
+// (SExt/ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) + InvariantAccum
+// which corresponds to a phi->trunc->sext/zext->add->phi update chain.
+// If so, try to see if it can be rewritten as an AddRecExpr under some
+// Predicates. If successful, return them as a pair. Also cache the results
+// of the analysis.
+//
+// Example usage scenario:
+// Say the Rewriter is called for the following SCEV:
+// 8 * ((sext i32 (trunc i64 %X to i32) to i64) + %Step)
+// where:
+// %X = phi i64 (%Start, %BEValue)
+// It will visitMul->visitAdd->visitSExt->visitTrunc->visitUnknown(%X),
+// and call this function with %SymbolicPHI = %X.
+//
+// The analysis will find that the value coming around the backedge has
+// the following SCEV:
+// BEValue = ((sext i32 (trunc i64 %X to i32) to i64) + %Step)
+// Upon concluding that this matches the desired pattern, the function
+// will return the pair {NewAddRec, SmallPredsVec} where:
+// NewAddRec = {%Start,+,%Step}
+// SmallPredsVec = {P1, P2, P3} as follows:
+// P1(WrapPred): AR: {trunc(%Start),+,(trunc %Step)}<nsw> Flags: <nssw>
+// P2(EqualPred): %Start == (sext i32 (trunc i64 %Start to i32) to i64)
+// P3(EqualPred): %Step == (sext i32 (trunc i64 %Step to i32) to i64)
+// The returned pair means that SymbolicPHI can be rewritten into NewAddRec
+// under the predicates {P1,P2,P3}.
+// This predicated rewrite will be cached in PredicatedSCEVRewrites:
+// PredicatedSCEVRewrites[{%X,L}] = {NewAddRec, {P1,P2,P3)}
+//
+// TODO's:
+//
+// 1) Extend the Induction descriptor to also support inductions that involve
+// casts: When needed (namely, when we are called in the context of the
+// vectorizer induction analysis), a Set of cast instructions will be
+// populated by this method, and provided back to isInductionPHI. This is
+// needed to allow the vectorizer to properly record them to be ignored by
+// the cost model and to avoid vectorizing them (otherwise these casts,
+// which are redundant under the runtime overflow checks, will be
+// vectorized, which can be costly).
+//
+// 2) Support additional induction/PHISCEV patterns: We also want to support
+// inductions where the sext-trunc / zext-trunc operations (partly) occur
+// after the induction update operation (the induction increment):
+//
+// (Trunc iy (SExt/ZExt ix (%SymbolicPHI + InvariantAccum) to iy) to ix)
+// which corresponds to a phi->add->trunc->sext/zext->phi update chain.
+//
+// (Trunc iy ((SExt/ZExt ix (%SymbolicPhi) to iy) + InvariantAccum) to ix)
+// which corresponds to a phi->trunc->add->sext/zext->phi update chain.
+//
+// 3) Outline common code with createAddRecFromPHI to avoid duplication.
+//
+Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+ScalarEvolution::createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI) {
+ SmallVector<const SCEVPredicate *, 3> Predicates;
+
+ // *** Part1: Analyze if we have a phi-with-cast pattern for which we can
+ // return an AddRec expression under some predicate.
+
+ auto *PN = cast<PHINode>(SymbolicPHI->getValue());
+ const Loop *L = isIntegerLoopHeaderPHI(PN, LI);
+ assert(L && "Expecting an integer loop header phi");
+
+ // The loop may have multiple entry points or multiple exits; we can
+ // analyze this phi as an addrec if it has a unique entry value and a
+ // unique backedge value.
+ Value *BEValueV = nullptr, *StartValueV = nullptr;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+ if (L->contains(PN->getIncomingBlock(i))) {
+ if (!BEValueV) {
+ BEValueV = V;
+ } else if (BEValueV != V) {
+ BEValueV = nullptr;
+ break;
+ }
+ } else if (!StartValueV) {
+ StartValueV = V;
+ } else if (StartValueV != V) {
+ StartValueV = nullptr;
+ break;
+ }
+ }
+ if (!BEValueV || !StartValueV)
+ return None;
+
+ const SCEV *BEValue = getSCEV(BEValueV);
+
+ // If the value coming around the backedge is an add with the symbolic
+ // value we just inserted, possibly with casts that we can ignore under
+ // an appropriate runtime guard, then we found a simple induction variable!
+ const auto *Add = dyn_cast<SCEVAddExpr>(BEValue);
+ if (!Add)
+ return None;
+
+ // If there is a single occurrence of the symbolic value, possibly
+ // casted, replace it with a recurrence.
+ unsigned FoundIndex = Add->getNumOperands();
+ Type *TruncTy = nullptr;
+ bool Signed;
+ for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+ if ((TruncTy =
+ isSimpleCastedPHI(Add->getOperand(i), SymbolicPHI, Signed, *this)))
+ if (FoundIndex == e) {
+ FoundIndex = i;
+ break;
+ }
+
+ if (FoundIndex == Add->getNumOperands())
+ return None;
+
+ // Create an add with everything but the specified operand.
+ SmallVector<const SCEV *, 8> Ops;
+ for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+ if (i != FoundIndex)
+ Ops.push_back(Add->getOperand(i));
+ const SCEV *Accum = getAddExpr(Ops);
+
+ // The runtime checks will not be valid if the step amount is
+ // varying inside the loop.
+ if (!isLoopInvariant(Accum, L))
+ return None;
+
+
+ // *** Part2: Create the predicates
+
+ // Analysis was successful: we have a phi-with-cast pattern for which we
+ // can return an AddRec expression under the following predicates:
+ //
+ // P1: A Wrap predicate that guarantees that Trunc(Start) + i*Trunc(Accum)
+ // fits within the truncated type (does not overflow) for i = 0 to n-1.
+ // P2: An Equal predicate that guarantees that
+ // Start = (Ext ix (Trunc iy (Start) to ix) to iy)
+ // P3: An Equal predicate that guarantees that
+ // Accum = (Ext ix (Trunc iy (Accum) to ix) to iy)
+ //
+ // As we next prove, the above predicates guarantee that:
+ // Start + i*Accum = (Ext ix (Trunc iy ( Start + i*Accum ) to ix) to iy)
+ //
+ //
+ // More formally, we want to prove that:
+ // Expr(i+1) = Start + (i+1) * Accum
+ // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum
+ //
+ // Given that:
+ // 1) Expr(0) = Start
+ // 2) Expr(1) = Start + Accum
+ // = (Ext ix (Trunc iy (Start) to ix) to iy) + Accum :: from P2
+ // 3) Induction hypothesis (step i):
+ // Expr(i) = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum
+ //
+ // Proof:
+ // Expr(i+1) =
+ // = Start + (i+1)*Accum
+ // = (Start + i*Accum) + Accum
+ // = Expr(i) + Accum
+ // = (Ext ix (Trunc iy (Expr(i-1)) to ix) to iy) + Accum + Accum
+ // :: from step i
+ //
+ // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy) + Accum + Accum
+ //
+ // = (Ext ix (Trunc iy (Start + (i-1)*Accum) to ix) to iy)
+ // + (Ext ix (Trunc iy (Accum) to ix) to iy)
+ // + Accum :: from P3
+ //
+ // = (Ext ix (Trunc iy ((Start + (i-1)*Accum) + Accum) to ix) to iy)
+ // + Accum :: from P1: Ext(x)+Ext(y)=>Ext(x+y)
+ //
+ // = (Ext ix (Trunc iy (Start + i*Accum) to ix) to iy) + Accum
+ // = (Ext ix (Trunc iy (Expr(i)) to ix) to iy) + Accum
+ //
+ // By induction, the same applies to all iterations 1<=i<n:
+ //
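// Editor's worked instance of the identity above (illustration only; the
// constants are arbitrary values for which P1-P3 hold, with iy = i32 and
// ix = i8): take Start = 5 and Accum = 3. For every i in [0, 40),
// Expr(i) = 5 + 3*i <= 122 fits in i8, so
// (sext i8 (trunc i32 (Expr(i)) to i8) to i32) == Expr(i);
// e.g. for i = 39: trunc(122) = 122 and sext(122) = 122.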
+
+ // Create a truncated addrec for which we will add a no overflow check (P1).
+ const SCEV *StartVal = getSCEV(StartValueV);
+ const SCEV *PHISCEV =
+ getAddRecExpr(getTruncateExpr(StartVal, TruncTy),
+ getTruncateExpr(Accum, TruncTy), L, SCEV::FlagAnyWrap);
+ const auto *AR = cast<SCEVAddRecExpr>(PHISCEV);
+
+ SCEVWrapPredicate::IncrementWrapFlags AddedFlags =
+ Signed ? SCEVWrapPredicate::IncrementNSSW
+ : SCEVWrapPredicate::IncrementNUSW;
+ const SCEVPredicate *AddRecPred = getWrapPredicate(AR, AddedFlags);
+ Predicates.push_back(AddRecPred);
+
+ // Create the Equal Predicates P2,P3:
+ auto AppendPredicate = [&](const SCEV *Expr) -> void {
+ assert(isLoopInvariant(Expr, L) && "Expr is expected to be invariant");
+ const SCEV *TruncatedExpr = getTruncateExpr(Expr, TruncTy);
+ const SCEV *ExtendedExpr =
+ Signed ? getSignExtendExpr(TruncatedExpr, Expr->getType())
+ : getZeroExtendExpr(TruncatedExpr, Expr->getType());
+ if (Expr != ExtendedExpr &&
+ !isKnownPredicate(ICmpInst::ICMP_EQ, Expr, ExtendedExpr)) {
+ const SCEVPredicate *Pred = getEqualPredicate(Expr, ExtendedExpr);
+ DEBUG(dbgs() << "Added Predicate: " << *Pred);
+ Predicates.push_back(Pred);
+ }
+ };
+
+ AppendPredicate(StartVal);
+ AppendPredicate(Accum);
+
+ // *** Part3: Predicates are ready. Now go ahead and create the new addrec in
+ // which the casts had been folded away. The caller can rewrite SymbolicPHI
+ // into NewAR if it will also add the runtime overflow checks specified in
+ // Predicates.
+ auto *NewAR = getAddRecExpr(StartVal, Accum, L, SCEV::FlagAnyWrap);
+
+ std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>> PredRewrite =
+ std::make_pair(NewAR, Predicates);
+ // Remember the result of the analysis for this SCEV at this location.
+ PredicatedSCEVRewrites[{SymbolicPHI, L}] = PredRewrite;
+ return PredRewrite;
+}
+
+Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+ScalarEvolution::createAddRecFromPHIWithCasts(const SCEVUnknown *SymbolicPHI) {
+
+ auto *PN = cast<PHINode>(SymbolicPHI->getValue());
+ const Loop *L = isIntegerLoopHeaderPHI(PN, LI);
+ if (!L)
+ return None;
+
+ // Check to see if we already analyzed this PHI.
+ auto I = PredicatedSCEVRewrites.find({SymbolicPHI, L});
+ if (I != PredicatedSCEVRewrites.end()) {
+ std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>> Rewrite =
+ I->second;
+ // Analysis was done before and failed to create an AddRec:
+ if (Rewrite.first == SymbolicPHI)
+ return None;
+ // Analysis was done before and succeeded in creating an AddRec under
+ // a predicate:
+ assert(isa<SCEVAddRecExpr>(Rewrite.first) && "Expected an AddRec");
+ assert(!(Rewrite.second).empty() && "Expected to find Predicates");
+ return Rewrite;
+ }
+
+ Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+ Rewrite = createAddRecFromPHIWithCastsImpl(SymbolicPHI);
+
+ // Record in the cache that the analysis failed
+ if (!Rewrite) {
+ SmallVector<const SCEVPredicate *, 3> Predicates;
+ PredicatedSCEVRewrites[{SymbolicPHI, L}] = {SymbolicPHI, Predicates};
+ return None;
+ }
+
+ return Rewrite;
+}
+
+/// A helper function for createAddRecFromPHI to handle simple cases.
+///
+/// This function tries to find an AddRec expression for the simplest (yet most
+/// common) cases: PN = PHI(Start, OP(Self, LoopInvariant)).
/// If it fails, createAddRecFromPHI will use a more general, but slower,
+/// technique for finding the AddRec expression.
+const SCEV *ScalarEvolution::createSimpleAffineAddRec(PHINode *PN,
+ Value *BEValueV,
+ Value *StartValueV) {
+ const Loop *L = LI.getLoopFor(PN->getParent());
+ assert(L && L->getHeader() == PN->getParent());
+ assert(BEValueV && StartValueV);
+
+ auto BO = MatchBinaryOp(BEValueV, DT);
+ if (!BO)
+ return nullptr;
+
+ if (BO->Opcode != Instruction::Add)
+ return nullptr;
+
+ const SCEV *Accum = nullptr;
+ if (BO->LHS == PN && L->isLoopInvariant(BO->RHS))
+ Accum = getSCEV(BO->RHS);
+ else if (BO->RHS == PN && L->isLoopInvariant(BO->LHS))
+ Accum = getSCEV(BO->LHS);
+
+ if (!Accum)
+ return nullptr;
+
+ SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
+ if (BO->IsNUW)
+ Flags = setFlags(Flags, SCEV::FlagNUW);
+ if (BO->IsNSW)
+ Flags = setFlags(Flags, SCEV::FlagNSW);
+
+ const SCEV *StartVal = getSCEV(StartValueV);
+ const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags);
+
+ ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
+
+ // We can add Flags to the post-inc expression only if we
+ // know that it is *undefined behavior* for BEValueV to
+ // overflow.
+ if (auto *BEInst = dyn_cast<Instruction>(BEValueV))
+ if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L))
+ (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags);
+
+ return PHISCEV;
+}
+
const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
const Loop *L = LI.getLoopFor(PN->getParent());
if (!L || L->getHeader() != PN->getParent())
@@ -4009,127 +4572,134 @@ const SCEV *ScalarEvolution::createAddRecFromPHI(PHINode *PN) {
break;
}
}
- if (BEValueV && StartValueV) {
- // While we are analyzing this PHI node, handle its value symbolically.
- const SCEV *SymbolicName = getUnknown(PN);
- assert(ValueExprMap.find_as(PN) == ValueExprMap.end() &&
- "PHI node already processed?");
- ValueExprMap.insert({SCEVCallbackVH(PN, this), SymbolicName});
+ if (!BEValueV || !StartValueV)
+ return nullptr;
- // Using this symbolic name for the PHI, analyze the value coming around
- // the back-edge.
- const SCEV *BEValue = getSCEV(BEValueV);
+ assert(ValueExprMap.find_as(PN) == ValueExprMap.end() &&
+ "PHI node already processed?");
- // NOTE: If BEValue is loop invariant, we know that the PHI node just
- // has a special value for the first iteration of the loop.
+ // First, try to find an AddRec expression without creating a fictitious
+ // symbolic value for PN.
+ if (auto *S = createSimpleAffineAddRec(PN, BEValueV, StartValueV))
+ return S;
+
+ // Handle PHI node value symbolically.
+ const SCEV *SymbolicName = getUnknown(PN);
+ ValueExprMap.insert({SCEVCallbackVH(PN, this), SymbolicName});
+
+ // Using this symbolic name for the PHI, analyze the value coming around
+ // the back-edge.
+ const SCEV *BEValue = getSCEV(BEValueV);
+
+ // NOTE: If BEValue is loop invariant, we know that the PHI node just
+ // has a special value for the first iteration of the loop.
+
+ // If the value coming around the backedge is an add with the symbolic
+ // value we just inserted, then we found a simple induction variable!
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) {
+ // If there is a single occurrence of the symbolic value, replace it
+ // with a recurrence.
+ unsigned FoundIndex = Add->getNumOperands();
+ for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+ if (Add->getOperand(i) == SymbolicName)
+ if (FoundIndex == e) {
+ FoundIndex = i;
+ break;
+ }
- // If the value coming around the backedge is an add with the symbolic
- // value we just inserted, then we found a simple induction variable!
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BEValue)) {
- // If there is a single occurrence of the symbolic value, replace it
- // with a recurrence.
- unsigned FoundIndex = Add->getNumOperands();
+ if (FoundIndex != Add->getNumOperands()) {
+ // Create an add with everything but the specified operand.
+ SmallVector<const SCEV *, 8> Ops;
for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
- if (Add->getOperand(i) == SymbolicName)
- if (FoundIndex == e) {
- FoundIndex = i;
- break;
+ if (i != FoundIndex)
+ Ops.push_back(Add->getOperand(i));
+ const SCEV *Accum = getAddExpr(Ops);
+
+ // This is not a valid addrec if the step amount is varying each
+ // loop iteration, but is not itself an addrec in this loop.
+ if (isLoopInvariant(Accum, L) ||
+ (isa<SCEVAddRecExpr>(Accum) &&
+ cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) {
+ SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
+
+ if (auto BO = MatchBinaryOp(BEValueV, DT)) {
+ if (BO->Opcode == Instruction::Add && BO->LHS == PN) {
+ if (BO->IsNUW)
+ Flags = setFlags(Flags, SCEV::FlagNUW);
+ if (BO->IsNSW)
+ Flags = setFlags(Flags, SCEV::FlagNSW);
}
-
- if (FoundIndex != Add->getNumOperands()) {
- // Create an add with everything but the specified operand.
- SmallVector<const SCEV *, 8> Ops;
- for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
- if (i != FoundIndex)
- Ops.push_back(Add->getOperand(i));
- const SCEV *Accum = getAddExpr(Ops);
-
- // This is not a valid addrec if the step amount is varying each
- // loop iteration, but is not itself an addrec in this loop.
- if (isLoopInvariant(Accum, L) ||
- (isa<SCEVAddRecExpr>(Accum) &&
- cast<SCEVAddRecExpr>(Accum)->getLoop() == L)) {
- SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
-
- if (auto BO = MatchBinaryOp(BEValueV, DT)) {
- if (BO->Opcode == Instruction::Add && BO->LHS == PN) {
- if (BO->IsNUW)
- Flags = setFlags(Flags, SCEV::FlagNUW);
- if (BO->IsNSW)
- Flags = setFlags(Flags, SCEV::FlagNSW);
- }
- } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) {
- // If the increment is an inbounds GEP, then we know the address
- // space cannot be wrapped around. We cannot make any guarantee
- // about signed or unsigned overflow because pointers are
- // unsigned but we may have a negative index from the base
- // pointer. We can guarantee that no unsigned wrap occurs if the
- // indices form a positive value.
- if (GEP->isInBounds() && GEP->getOperand(0) == PN) {
- Flags = setFlags(Flags, SCEV::FlagNW);
-
- const SCEV *Ptr = getSCEV(GEP->getPointerOperand());
- if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
- Flags = setFlags(Flags, SCEV::FlagNUW);
- }
-
- // We cannot transfer nuw and nsw flags from subtraction
- // operations -- sub nuw X, Y is not the same as add nuw X, -Y
- // for instance.
+ } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) {
+ // If the increment is an inbounds GEP, then we know the address
+ // space cannot be wrapped around. We cannot make any guarantee
+ // about signed or unsigned overflow because pointers are
+ // unsigned but we may have a negative index from the base
+ // pointer. We can guarantee that no unsigned wrap occurs if the
+ // indices form a positive value.
+ if (GEP->isInBounds() && GEP->getOperand(0) == PN) {
+ Flags = setFlags(Flags, SCEV::FlagNW);
+
+ const SCEV *Ptr = getSCEV(GEP->getPointerOperand());
+ if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
+ Flags = setFlags(Flags, SCEV::FlagNUW);
}
- const SCEV *StartVal = getSCEV(StartValueV);
- const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags);
+ // We cannot transfer nuw and nsw flags from subtraction
+ // operations -- sub nuw X, Y is not the same as add nuw X, -Y
+ // for instance.
+ }
- // Okay, for the entire analysis of this edge we assumed the PHI
- // to be symbolic. We now need to go back and purge all of the
- // entries for the scalars that use the symbolic expression.
- forgetSymbolicName(PN, SymbolicName);
- ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
+ const SCEV *StartVal = getSCEV(StartValueV);
+ const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags);
- // We can add Flags to the post-inc expression only if we
- // know that it us *undefined behavior* for BEValueV to
- // overflow.
- if (auto *BEInst = dyn_cast<Instruction>(BEValueV))
- if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L))
- (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags);
+ // Okay, for the entire analysis of this edge we assumed the PHI
+ // to be symbolic. We now need to go back and purge all of the
+ // entries for the scalars that use the symbolic expression.
+ forgetSymbolicName(PN, SymbolicName);
+ ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV;
- return PHISCEV;
- }
+ // We can add Flags to the post-inc expression only if we
+ // know that it is *undefined behavior* for BEValueV to
+ // overflow.
+ if (auto *BEInst = dyn_cast<Instruction>(BEValueV))
+ if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L))
+ (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags);
+
+ return PHISCEV;
}
- } else {
- // Otherwise, this could be a loop like this:
- // i = 0; for (j = 1; ..; ++j) { .... i = j; }
- // In this case, j = {1,+,1} and BEValue is j.
- // Because the other in-value of i (0) fits the evolution of BEValue
- // i really is an addrec evolution.
- //
- // We can generalize this saying that i is the shifted value of BEValue
- // by one iteration:
- // PHI(f(0), f({1,+,1})) --> f({0,+,1})
- const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this);
- const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this);
- if (Shifted != getCouldNotCompute() &&
- Start != getCouldNotCompute()) {
- const SCEV *StartVal = getSCEV(StartValueV);
- if (Start == StartVal) {
- // Okay, for the entire analysis of this edge we assumed the PHI
- // to be symbolic. We now need to go back and purge all of the
- // entries for the scalars that use the symbolic expression.
- forgetSymbolicName(PN, SymbolicName);
- ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted;
- return Shifted;
- }
+ }
+ } else {
+ // Otherwise, this could be a loop like this:
+ // i = 0; for (j = 1; ..; ++j) { .... i = j; }
+ // In this case, j = {1,+,1} and BEValue is j.
+ // Because the other in-value of i (0) fits the evolution of BEValue,
+ // i really is an addrec evolution.
+ //
+ // We can generalize this by saying that i is the shifted value of
+ // BEValue by one iteration:
+ // PHI(f(0), f({1,+,1})) --> f({0,+,1})
+ const SCEV *Shifted = SCEVShiftRewriter::rewrite(BEValue, L, *this);
+ const SCEV *Start = SCEVInitRewriter::rewrite(Shifted, L, *this);
+ if (Shifted != getCouldNotCompute() &&
+ Start != getCouldNotCompute()) {
+ const SCEV *StartVal = getSCEV(StartValueV);
+ if (Start == StartVal) {
+ // Okay, for the entire analysis of this edge we assumed the PHI
+ // to be symbolic. We now need to go back and purge all of the
+ // entries for the scalars that use the symbolic expression.
+ forgetSymbolicName(PN, SymbolicName);
+ ValueExprMap[SCEVCallbackVH(PN, this)] = Shifted;
+ return Shifted;
}
}
-
- // Remove the temporary PHI node SCEV that has been inserted while intending
- // to create an AddRecExpr for this PHI node. We can not keep this temporary
- // as it will prevent later (possibly simpler) SCEV expressions to be added
- // to the ValueExprMap.
- eraseValueFromMap(PN);
}
+ // Remove the temporary PHI node SCEV that has been inserted while intending
+ // to create an AddRecExpr for this PHI node. We cannot keep this temporary
+ // entry, as it would prevent later (possibly simpler) SCEV expressions from
+ // being added to the ValueExprMap.
+ eraseValueFromMap(PN);
+
return nullptr;
}
@@ -4289,7 +4859,7 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
// PHI's incoming blocks are in a different loop, in which case doing so
// risks breaking LCSSA form. Instcombine would normally zap these, but
// it doesn't have DominatorTree information, so it may miss cases.
- if (Value *V = SimplifyInstruction(PN, getDataLayout(), &TLI, &DT, &AC))
+ if (Value *V = SimplifyInstruction(PN, {getDataLayout(), &TLI, &DT, &AC}))
if (LI.replacementPreservesLCSSAForm(PN, V))
return getSCEV(V);
@@ -4409,8 +4979,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
return getGEPExpr(GEP, IndexExprs);
}
-uint32_t
-ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
+uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
return C->getAPInt().countTrailingZeros();
@@ -4420,14 +4989,16 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
if (const SCEVZeroExtendExpr *E = dyn_cast<SCEVZeroExtendExpr>(S)) {
uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
- return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
- getTypeSizeInBits(E->getType()) : OpRes;
+ return OpRes == getTypeSizeInBits(E->getOperand()->getType())
+ ? getTypeSizeInBits(E->getType())
+ : OpRes;
}
if (const SCEVSignExtendExpr *E = dyn_cast<SCEVSignExtendExpr>(S)) {
uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
- return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
- getTypeSizeInBits(E->getType()) : OpRes;
+ return OpRes == getTypeSizeInBits(E->getOperand()->getType())
+ ? getTypeSizeInBits(E->getType())
+ : OpRes;
}
if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) {
@@ -4444,8 +5015,8 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
uint32_t BitWidth = getTypeSizeInBits(M->getType());
for (unsigned i = 1, e = M->getNumOperands();
SumOpRes != BitWidth && i != e; ++i)
- SumOpRes = std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)),
- BitWidth);
+ SumOpRes =
+ std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)), BitWidth);
return SumOpRes;
}
@@ -4475,17 +5046,25 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
// For a SCEVUnknown, ask ValueTracking.
- unsigned BitWidth = getTypeSizeInBits(U->getType());
- APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
- computeKnownBits(U->getValue(), Zeros, Ones, getDataLayout(), 0, &AC,
- nullptr, &DT);
- return Zeros.countTrailingOnes();
+ KnownBits Known =
+     computeKnownBits(U->getValue(), getDataLayout(), 0, &AC, nullptr, &DT);
+ return Known.countMinTrailingZeros();
}
// SCEVUDivExpr
return 0;
}
+uint32_t ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
+ auto I = MinTrailingZerosCache.find(S);
+ if (I != MinTrailingZerosCache.end())
+ return I->second;
+
+ uint32_t Result = GetMinTrailingZerosImpl(S);
+ auto InsertPair = MinTrailingZerosCache.insert({S, Result});
+ assert(InsertPair.second && "Should insert a new key");
+ return InsertPair.first->second;
+}
+
/// Helper method to assign a range to V from metadata present in the IR.
static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
if (Instruction *I = dyn_cast<Instruction>(V))
@@ -4498,9 +5077,9 @@ static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
/// Determine the range for a particular SCEV. If SignHint is
/// HINT_RANGE_UNSIGNED (resp. HINT_RANGE_SIGNED) then getRange prefers ranges
/// with a "cleaner" unsigned (resp. signed) representation.
-ConstantRange
-ScalarEvolution::getRange(const SCEV *S,
- ScalarEvolution::RangeSignHint SignHint) {
+const ConstantRange &
+ScalarEvolution::getRangeRef(const SCEV *S,
+ ScalarEvolution::RangeSignHint SignHint) {
DenseMap<const SCEV *, ConstantRange> &Cache =
SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED ? UnsignedRanges
: SignedRanges;
@@ -4531,54 +5110,54 @@ ScalarEvolution::getRange(const SCEV *S,
}
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- ConstantRange X = getRange(Add->getOperand(0), SignHint);
+ ConstantRange X = getRangeRef(Add->getOperand(0), SignHint);
for (unsigned i = 1, e = Add->getNumOperands(); i != e; ++i)
- X = X.add(getRange(Add->getOperand(i), SignHint));
+ X = X.add(getRangeRef(Add->getOperand(i), SignHint));
return setRange(Add, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
- ConstantRange X = getRange(Mul->getOperand(0), SignHint);
+ ConstantRange X = getRangeRef(Mul->getOperand(0), SignHint);
for (unsigned i = 1, e = Mul->getNumOperands(); i != e; ++i)
- X = X.multiply(getRange(Mul->getOperand(i), SignHint));
+ X = X.multiply(getRangeRef(Mul->getOperand(i), SignHint));
return setRange(Mul, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) {
- ConstantRange X = getRange(SMax->getOperand(0), SignHint);
+ ConstantRange X = getRangeRef(SMax->getOperand(0), SignHint);
for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i)
- X = X.smax(getRange(SMax->getOperand(i), SignHint));
+ X = X.smax(getRangeRef(SMax->getOperand(i), SignHint));
return setRange(SMax, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) {
- ConstantRange X = getRange(UMax->getOperand(0), SignHint);
+ ConstantRange X = getRangeRef(UMax->getOperand(0), SignHint);
for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i)
- X = X.umax(getRange(UMax->getOperand(i), SignHint));
+ X = X.umax(getRangeRef(UMax->getOperand(i), SignHint));
return setRange(UMax, SignHint, ConservativeResult.intersectWith(X));
}
if (const SCEVUDivExpr *UDiv = dyn_cast<SCEVUDivExpr>(S)) {
- ConstantRange X = getRange(UDiv->getLHS(), SignHint);
- ConstantRange Y = getRange(UDiv->getRHS(), SignHint);
+ ConstantRange X = getRangeRef(UDiv->getLHS(), SignHint);
+ ConstantRange Y = getRangeRef(UDiv->getRHS(), SignHint);
return setRange(UDiv, SignHint,
ConservativeResult.intersectWith(X.udiv(Y)));
}
if (const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(S)) {
- ConstantRange X = getRange(ZExt->getOperand(), SignHint);
+ ConstantRange X = getRangeRef(ZExt->getOperand(), SignHint);
return setRange(ZExt, SignHint,
ConservativeResult.intersectWith(X.zeroExtend(BitWidth)));
}
if (const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(S)) {
- ConstantRange X = getRange(SExt->getOperand(), SignHint);
+ ConstantRange X = getRangeRef(SExt->getOperand(), SignHint);
return setRange(SExt, SignHint,
ConservativeResult.intersectWith(X.signExtend(BitWidth)));
}
if (const SCEVTruncateExpr *Trunc = dyn_cast<SCEVTruncateExpr>(S)) {
- ConstantRange X = getRange(Trunc->getOperand(), SignHint);
+ ConstantRange X = getRangeRef(Trunc->getOperand(), SignHint);
return setRange(Trunc, SignHint,
ConservativeResult.intersectWith(X.truncate(BitWidth)));
}
@@ -4632,7 +5211,7 @@ ScalarEvolution::getRange(const SCEV *S,
}
}
- return setRange(AddRec, SignHint, ConservativeResult);
+ return setRange(AddRec, SignHint, std::move(ConservativeResult));
}
if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
@@ -4647,11 +5226,11 @@ ScalarEvolution::getRange(const SCEV *S,
const DataLayout &DL = getDataLayout();
if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) {
// For a SCEVUnknown, ask ValueTracking.
- APInt Zeros(BitWidth, 0), Ones(BitWidth, 0);
- computeKnownBits(U->getValue(), Zeros, Ones, DL, 0, &AC, nullptr, &DT);
- if (Ones != ~Zeros + 1)
+ KnownBits Known = computeKnownBits(U->getValue(), DL, 0, &AC, nullptr, &DT);
+ if (Known.One != ~Known.Zero + 1)
ConservativeResult =
- ConservativeResult.intersectWith(ConstantRange(Ones, ~Zeros + 1));
+ ConservativeResult.intersectWith(ConstantRange(Known.One,
+ ~Known.Zero + 1));
} else {
assert(SignHint == ScalarEvolution::HINT_RANGE_SIGNED &&
"generalize as needed!");
@@ -4662,10 +5241,78 @@ ScalarEvolution::getRange(const SCEV *S,
APInt::getSignedMaxValue(BitWidth).ashr(NS - 1) + 1));
}
- return setRange(U, SignHint, ConservativeResult);
+ return setRange(U, SignHint, std::move(ConservativeResult));
}
- return setRange(S, SignHint, ConservativeResult);
+ return setRange(S, SignHint, std::move(ConservativeResult));
+}
+
+// Given a StartRange, Step and MaxBECount for an expression, compute a range
+// of values that the expression can take. Initially the expression has a value
+// from StartRange; it is then changed by Step up to MaxBECount times. The
+// Signed argument tells us whether to treat Step as signed or unsigned.
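+// For example (i8, Signed): StartRange = [10, 20), Step = 3, MaxBECount = 2.
+// The moved upper bound is 19 + 3 * 2 = 25, so the result is [10, 26).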
+static ConstantRange getRangeForAffineARHelper(APInt Step,
+ const ConstantRange &StartRange,
+ const APInt &MaxBECount,
+ unsigned BitWidth, bool Signed) {
+ // If either Step or MaxBECount is 0, then the expression won't change, and we
+ // just need to return the initial range.
+ if (Step == 0 || MaxBECount == 0)
+ return StartRange;
+
+ // If we don't know anything about the initial value (i.e. StartRange is
+ // FullRange), then we don't know anything about the final range either.
+ // Return FullRange.
+ if (StartRange.isFullSet())
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ // If Step is signed and negative, then we use its absolute value, but we also
+ // note that we're moving in the opposite direction.
+ bool Descending = Signed && Step.isNegative();
+
+ if (Signed)
+ // This is correct even for INT_SMIN. Let's look at i8 to illustrate this:
+ // abs(INT_SMIN) = abs(-128) = abs(0x80) = -0x80 = 0x80 = 128.
+ // These equations hold true due to the well-defined wrap-around behavior of
+ // APInt.
+ Step = Step.abs();
+
+ // Check if Offset is more than the full span of BitWidth. If it is, the
+ // expression is guaranteed to overflow.
+ if (APInt::getMaxValue(StartRange.getBitWidth()).udiv(Step).ult(MaxBECount))
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
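+ // E.g. for i8 with Step = 20 and MaxBECount = 15: 255 udiv 20 = 12 < 15,
+ // and indeed 20 * 15 = 300 wraps around i8, so we must return the full set.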
+
+ // Offset is the total amount by which the expression can change. The checks
+ // above guarantee no overflow here.
+ APInt Offset = Step * MaxBECount;
+
+ // The minimum value of the final range matches the minimum value of
+ // StartRange if the expression is increasing, and is decreased by Offset
+ // otherwise. The maximum value of the final range matches the maximum value
+ // of StartRange if the expression is decreasing, and is increased by Offset
+ // otherwise.
+ APInt StartLower = StartRange.getLower();
+ APInt StartUpper = StartRange.getUpper() - 1;
+ APInt MovedBoundary = Descending ? (StartLower - std::move(Offset))
+ : (StartUpper + std::move(Offset));
+
+ // It's possible that the new minimum/maximum value will fall into the initial
+ // range (due to wrap-around). This means that the expression can take any
+ // value in this bitwidth, and we have to return the full range.
+ if (StartRange.contains(MovedBoundary))
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ APInt NewLower =
+ Descending ? std::move(MovedBoundary) : std::move(StartLower);
+ APInt NewUpper =
+ Descending ? std::move(StartUpper) : std::move(MovedBoundary);
+ NewUpper += 1;
+
+ // If we end up with full range, return a proper full range.
+ if (NewLower == NewUpper)
+ return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+ // No overflow detected; return the computed [NewLower, NewUpper) range.
+ return ConstantRange(std::move(NewLower), std::move(NewUpper));
}
ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
@@ -4676,60 +5323,29 @@ ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
getTypeSizeInBits(MaxBECount->getType()) <= BitWidth &&
"Precondition!");
- ConstantRange Result(BitWidth, /* isFullSet = */ true);
-
- // Check for overflow. This must be done with ConstantRange arithmetic
- // because we could be called from within the ScalarEvolution overflow
- // checking code.
-
MaxBECount = getNoopOrZeroExtend(MaxBECount, Start->getType());
- ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount);
- ConstantRange ZExtMaxBECountRange = MaxBECountRange.zextOrTrunc(BitWidth * 2);
+ APInt MaxBECountValue = getUnsignedRangeMax(MaxBECount);
+ // First, consider step signed.
+ ConstantRange StartSRange = getSignedRange(Start);
ConstantRange StepSRange = getSignedRange(Step);
- ConstantRange SExtStepSRange = StepSRange.sextOrTrunc(BitWidth * 2);
-
- ConstantRange StartURange = getUnsignedRange(Start);
- ConstantRange EndURange =
- StartURange.add(MaxBECountRange.multiply(StepSRange));
-
- // Check for unsigned overflow.
- ConstantRange ZExtStartURange = StartURange.zextOrTrunc(BitWidth * 2);
- ConstantRange ZExtEndURange = EndURange.zextOrTrunc(BitWidth * 2);
- if (ZExtStartURange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
- ZExtEndURange) {
- APInt Min = APIntOps::umin(StartURange.getUnsignedMin(),
- EndURange.getUnsignedMin());
- APInt Max = APIntOps::umax(StartURange.getUnsignedMax(),
- EndURange.getUnsignedMax());
- bool IsFullRange = Min.isMinValue() && Max.isMaxValue();
- if (!IsFullRange)
- Result =
- Result.intersectWith(ConstantRange(Min, Max + 1));
- }
- ConstantRange StartSRange = getSignedRange(Start);
- ConstantRange EndSRange =
- StartSRange.add(MaxBECountRange.multiply(StepSRange));
-
- // Check for signed overflow. This must be done with ConstantRange
- // arithmetic because we could be called from within the ScalarEvolution
- // overflow checking code.
- ConstantRange SExtStartSRange = StartSRange.sextOrTrunc(BitWidth * 2);
- ConstantRange SExtEndSRange = EndSRange.sextOrTrunc(BitWidth * 2);
- if (SExtStartSRange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
- SExtEndSRange) {
- APInt Min =
- APIntOps::smin(StartSRange.getSignedMin(), EndSRange.getSignedMin());
- APInt Max =
- APIntOps::smax(StartSRange.getSignedMax(), EndSRange.getSignedMax());
- bool IsFullRange = Min.isMinSignedValue() && Max.isMaxSignedValue();
- if (!IsFullRange)
- Result =
- Result.intersectWith(ConstantRange(Min, Max + 1));
- }
+ // If Step can be both positive and negative, we need to find ranges for the
+ // maximum absolute step values in both directions and union them.
+ ConstantRange SR =
+ getRangeForAffineARHelper(StepSRange.getSignedMin(), StartSRange,
+ MaxBECountValue, BitWidth, /* Signed = */ true);
+ SR = SR.unionWith(getRangeForAffineARHelper(StepSRange.getSignedMax(),
+ StartSRange, MaxBECountValue,
+ BitWidth, /* Signed = */ true));
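+ // E.g. if StepSRange is [-2, 4), we union the ranges computed for the
+ // extreme steps -2 and 3.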
- return Result;
+ // Next, consider step unsigned.
+ ConstantRange UR = getRangeForAffineARHelper(
+ getUnsignedRangeMax(Step), getUnsignedRange(Start),
+ MaxBECountValue, BitWidth, /* Signed = */ false);
+
+ // Finally, intersect signed and unsigned ranges.
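+ // E.g. if SR = [0, 100) and UR = [10, 200), the final result is [10, 100).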
+ return SR.intersectWith(UR);
}
ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start,
@@ -4875,7 +5491,8 @@ bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
return false;
// Only proceed if we can prove that I does not yield poison.
- if (!isKnownNotFullPoison(I)) return false;
+ if (!programUndefinedIfFullPoison(I))
+ return false;
// At this point we know that if I is executed, then it does not wrap
// according to at least one of NSW or NUW. If I is not executed, then we do
@@ -5128,9 +5745,9 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// For an expression like x&255 that merely masks off the high bits,
// use zext(trunc(x)) as the SCEV expression.
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
- if (CI->isNullValue())
+ if (CI->isZero())
return getSCEV(BO->RHS);
- if (CI->isAllOnesValue())
+ if (CI->isMinusOne())
return getSCEV(BO->LHS);
const APInt &A = CI->getValue();
@@ -5141,19 +5758,34 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
unsigned LZ = A.countLeadingZeros();
unsigned TZ = A.countTrailingZeros();
unsigned BitWidth = A.getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(BO->LHS, KnownZero, KnownOne, getDataLayout(),
+ KnownBits Known(BitWidth);
+ computeKnownBits(BO->LHS, Known, getDataLayout(),
0, &AC, nullptr, &DT);
APInt EffectiveMask =
APInt::getLowBitsSet(BitWidth, BitWidth - LZ - TZ).shl(TZ);
- if ((LZ != 0 || TZ != 0) && !((~A & ~KnownZero) & EffectiveMask)) {
- const SCEV *MulCount = getConstant(ConstantInt::get(
- getContext(), APInt::getOneBitSet(BitWidth, TZ)));
+ if ((LZ != 0 || TZ != 0) && !((~A & ~Known.Zero) & EffectiveMask)) {
+ const SCEV *MulCount = getConstant(APInt::getOneBitSet(BitWidth, TZ));
+ const SCEV *LHS = getSCEV(BO->LHS);
+ const SCEV *ShiftedLHS = nullptr;
+ if (auto *LHSMul = dyn_cast<SCEVMulExpr>(LHS)) {
+ if (auto *OpC = dyn_cast<SCEVConstant>(LHSMul->getOperand(0))) {
+ // For an expression like (x * 8) & 8, simplify the multiply.
+ unsigned MulZeros = OpC->getAPInt().countTrailingZeros();
+ unsigned GCD = std::min(MulZeros, TZ);
+ APInt DivAmt = APInt::getOneBitSet(BitWidth, TZ - GCD);
+ SmallVector<const SCEV*, 4> MulOps;
+ MulOps.push_back(getConstant(OpC->getAPInt().lshr(GCD)));
+ MulOps.append(LHSMul->op_begin() + 1, LHSMul->op_end());
+ auto *NewMul = getMulExpr(MulOps, LHSMul->getNoWrapFlags());
+ ShiftedLHS = getUDivExpr(NewMul, getConstant(DivAmt));
+ }
+ }
+ if (!ShiftedLHS)
+ ShiftedLHS = getUDivExpr(LHS, MulCount);
return getMulExpr(
getZeroExtendExpr(
- getTruncateExpr(
- getUDivExactExpr(getSCEV(BO->LHS), MulCount),
+ getTruncateExpr(ShiftedLHS,
IntegerType::get(getContext(), BitWidth - LZ - TZ)),
BO->LHS->getType()),
MulCount);
@@ -5190,7 +5822,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
case Instruction::Xor:
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS)) {
// If the RHS of xor is -1, then this is a not operation.
- if (CI->isAllOnesValue())
+ if (CI->isMinusOne())
return getNotSCEV(getSCEV(BO->LHS));
// Model xor(and(x, C), C) as and(~x, C), if C is a low-bits mask.
@@ -5211,7 +5843,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// If C is a low-bits mask, the zero extend is serving to
// mask off the high bits. Complement the operand and
// re-apply the zext.
- if (APIntOps::isMask(Z0TySize, CI->getValue()))
+ if (CI->getValue().isMask(Z0TySize))
return getZeroExtendExpr(getNotSCEV(Z0), UTy);
// If C is a single bit, it may be in the sign-bit position
@@ -5219,7 +5851,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// using an add, which is equivalent, and re-apply the zext.
APInt Trunc = CI->getValue().trunc(Z0TySize);
if (Trunc.zext(getTypeSizeInBits(UTy)) == CI->getValue() &&
- Trunc.isSignBit())
+ Trunc.isSignMask())
return getZeroExtendExpr(getAddExpr(Z0, getConstant(Trunc)),
UTy);
}
@@ -5255,28 +5887,55 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
break;
case Instruction::AShr:
- // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS))
- if (Operator *L = dyn_cast<Operator>(BO->LHS))
- if (L->getOpcode() == Instruction::Shl &&
- L->getOperand(1) == BO->RHS) {
- uint64_t BitWidth = getTypeSizeInBits(BO->LHS->getType());
-
- // If the shift count is not less than the bitwidth, the result of
- // the shift is undefined. Don't try to analyze it, because the
- // resolution chosen here may differ from the resolution chosen in
- // other parts of the compiler.
- if (CI->getValue().uge(BitWidth))
- break;
+ // AShr X, C, where C is a constant.
+ ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS);
+ if (!CI)
+ break;
+
+ Type *OuterTy = BO->LHS->getType();
+ uint64_t BitWidth = getTypeSizeInBits(OuterTy);
+ // If the shift count is not less than the bitwidth, the result of
+ // the shift is undefined. Don't try to analyze it, because the
+ // resolution chosen here may differ from the resolution chosen in
+ // other parts of the compiler.
+ if (CI->getValue().uge(BitWidth))
+ break;
- uint64_t Amt = BitWidth - CI->getZExtValue();
- if (Amt == BitWidth)
- return getSCEV(L->getOperand(0)); // shift by zero --> noop
+ if (CI->isZero())
+ return getSCEV(BO->LHS); // shift by zero --> noop
+
+ uint64_t AShrAmt = CI->getZExtValue();
+ Type *TruncTy = IntegerType::get(getContext(), BitWidth - AShrAmt);
+
+ Operator *L = dyn_cast<Operator>(BO->LHS);
+ if (L && L->getOpcode() == Instruction::Shl) {
+ // X = Shl A, n
+ // Y = AShr X, m
+ // Both n and m are constant.
+
+ const SCEV *ShlOp0SCEV = getSCEV(L->getOperand(0));
+ if (L->getOperand(1) == BO->RHS)
+ // For a two-shift sext-inreg, i.e. n = m,
+ // use sext(trunc(x)) as the SCEV expression.
+ return getSignExtendExpr(
+ getTruncateExpr(ShlOp0SCEV, TruncTy), OuterTy);
+
+ ConstantInt *ShlAmtCI = dyn_cast<ConstantInt>(L->getOperand(1));
+ if (ShlAmtCI && ShlAmtCI->getValue().ult(BitWidth)) {
+ uint64_t ShlAmt = ShlAmtCI->getZExtValue();
+ if (ShlAmt > AShrAmt) {
+ // When n > m, use sext(mul(trunc(x), 2^(n-m))) as the SCEV
+ // expression. We already checked that ShlAmt < BitWidth, so
+ // the multiplier, 1 << (ShlAmt - AShrAmt), fits into TruncTy as
+ // ShlAmt - AShrAmt < BitWidth - AShrAmt.
+ APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt,
+ ShlAmt - AShrAmt);
return getSignExtendExpr(
- getTruncateExpr(getSCEV(L->getOperand(0)),
- IntegerType::get(getContext(), Amt)),
- BO->LHS->getType());
+ getMulExpr(getTruncateExpr(ShlOp0SCEV, TruncTy),
+ getConstant(Mul)), OuterTy);
}
+ }
+ }
break;
}
}
@@ -5348,7 +6007,7 @@ static unsigned getConstantTripCount(const SCEVConstant *ExitCount) {
return ((unsigned)ExitConst->getZExtValue()) + 1;
}
-unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L) {
if (BasicBlock *ExitingBB = L->getExitingBlock())
return getSmallConstantTripCount(L, ExitingBB);
@@ -5356,7 +6015,7 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
return 0;
}
-unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
+unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L,
BasicBlock *ExitingBlock) {
assert(ExitingBlock && "Must pass a non-null exiting block!");
assert(L->isLoopExiting(ExitingBlock) &&
@@ -5366,13 +6025,13 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
return getConstantTripCount(ExitCount);
}
-unsigned ScalarEvolution::getSmallConstantMaxTripCount(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) {
const auto *MaxExitCount =
dyn_cast<SCEVConstant>(getMaxBackedgeTakenCount(L));
return getConstantTripCount(MaxExitCount);
}
-unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) {
if (BasicBlock *ExitingBB = L->getExitingBlock())
return getSmallConstantTripMultiple(L, ExitingBB);
@@ -5393,7 +6052,7 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
/// As explained in the comments for getSmallConstantTripCount, this assumes
/// that control exits the loop via ExitingBlock.
unsigned
-ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
+ScalarEvolution::getSmallConstantTripMultiple(const Loop *L,
BasicBlock *ExitingBlock) {
assert(ExitingBlock && "Must pass a non-null exiting block!");
assert(L->isLoopExiting(ExitingBlock) &&
@@ -5403,17 +6062,16 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
return 1;
// Get the trip count from the BE count by adding 1.
- const SCEV *TCMul = getAddExpr(ExitCount, getOne(ExitCount->getType()));
- // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt
- // to factor simple cases.
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(TCMul))
- TCMul = Mul->getOperand(0);
-
- const SCEVConstant *MulC = dyn_cast<SCEVConstant>(TCMul);
- if (!MulC)
- return 1;
+ const SCEV *TCExpr = getAddExpr(ExitCount, getOne(ExitCount->getType()));
+
+ const SCEVConstant *TC = dyn_cast<SCEVConstant>(TCExpr);
+ if (!TC)
+ // Attempt to factor more general cases. Returns the greatest power-of-two
+ // divisor. If overflow happens, the trip count expression is still
+ // divisible by the greatest power-of-two divisor returned.
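+ // E.g. if the trip count expression is (4 * %n) with nothing known about
+ // %n, it has two known trailing zero bits, so this returns 1 << 2 = 4.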
+ return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
- ConstantInt *Result = MulC->getValue();
+ ConstantInt *Result = TC->getValue();
// Guard against huge trip counts (this requires checking
// for zero to handle the case where the trip count == -1 and the
@@ -5428,7 +6086,8 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
/// Get the expression for the number of loop iterations for which this loop is
/// guaranteed not to exit via ExitingBlock. Otherwise return
/// SCEVCouldNotCompute.
-const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) {
+const SCEV *ScalarEvolution::getExitCount(const Loop *L,
+ BasicBlock *ExitingBlock) {
return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
}
@@ -5569,6 +6228,16 @@ void ScalarEvolution::forgetLoop(const Loop *L) {
RemoveLoopFromBackedgeMap(BackedgeTakenCounts);
RemoveLoopFromBackedgeMap(PredicatedBackedgeTakenCounts);
+ // Drop information about predicated SCEV rewrites for this loop.
+ for (auto I = PredicatedSCEVRewrites.begin();
+ I != PredicatedSCEVRewrites.end();) {
+ std::pair<const SCEV *, const Loop *> Entry = I->first;
+ if (Entry.second == L)
+ PredicatedSCEVRewrites.erase(I++);
+ else
+ ++I;
+ }
+
// Drop information about expressions based on loop-header PHIs.
SmallVector<Instruction *, 16> Worklist;
PushLoopPHIs(L, Worklist);
@@ -5681,6 +6350,8 @@ ScalarEvolution::BackedgeTakenInfo::getMax(ScalarEvolution *SE) const {
if (any_of(ExitNotTaken, PredicateNotAlwaysTrue) || !getMax())
return SE->getCouldNotCompute();
+ assert((isa<SCEVCouldNotCompute>(getMax()) || isa<SCEVConstant>(getMax())) &&
+ "No point in having a non-constant max backedge taken count!");
return getMax();
}
@@ -5705,6 +6376,45 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S,
return false;
}
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E)
+ : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
+}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ ArrayRef<const SmallPtrSetImpl<const SCEVPredicate *> *> PredSetList)
+ : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) {
+ assert((isa<SCEVCouldNotCompute>(ExactNotTaken) ||
+ !isa<SCEVCouldNotCompute>(MaxNotTaken)) &&
+ "Exact is not allowed to be less precise than Max");
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
+ for (auto *PredSet : PredSetList)
+ for (auto *P : *PredSet)
+ addPredicate(P);
+}
+
+ScalarEvolution::ExitLimit::ExitLimit(
+ const SCEV *E, const SCEV *M, bool MaxOrZero,
+ const SmallPtrSetImpl<const SCEVPredicate *> &PredSet)
+ : ExitLimit(E, M, MaxOrZero, {&PredSet}) {
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
+}
+
+ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M,
+ bool MaxOrZero)
+ : ExitLimit(E, M, MaxOrZero, None) {
+ assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
+ isa<SCEVConstant>(MaxNotTaken)) &&
+ "No point in having a non-constant max backedge taken count!");
+}
+
/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
/// computable exit into a persistent ExitNotTakenInfo array.
ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
@@ -5728,6 +6438,8 @@ ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
return ExitNotTakenInfo(ExitBB, EL.ExactNotTaken, std::move(Predicate));
});
+ assert((isa<SCEVCouldNotCompute>(MaxCount) || isa<SCEVConstant>(MaxCount)) &&
+ "No point in having a non-constant max backedge taken count!");
}
/// Invalidate this result and free the ExitNotTakenInfo array.
@@ -5886,24 +6598,74 @@ ScalarEvolution::computeExitLimit(const Loop *L, BasicBlock *ExitingBlock,
return getCouldNotCompute();
}
-ScalarEvolution::ExitLimit
-ScalarEvolution::computeExitLimitFromCond(const Loop *L,
- Value *ExitCond,
- BasicBlock *TBB,
- BasicBlock *FBB,
- bool ControlsExit,
- bool AllowPredicates) {
+ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCond(
+ const Loop *L, Value *ExitCond, BasicBlock *TBB, BasicBlock *FBB,
+ bool ControlsExit, bool AllowPredicates) {
+ ScalarEvolution::ExitLimitCacheTy Cache(L, TBB, FBB, AllowPredicates);
+ return computeExitLimitFromCondCached(Cache, L, ExitCond, TBB, FBB,
+ ControlsExit, AllowPredicates);
+}
+
+Optional<ScalarEvolution::ExitLimit>
+ScalarEvolution::ExitLimitCache::find(const Loop *L, Value *ExitCond,
+ BasicBlock *TBB, BasicBlock *FBB,
+ bool ControlsExit, bool AllowPredicates) {
+ (void)this->L;
+ (void)this->TBB;
+ (void)this->FBB;
+ (void)this->AllowPredicates;
+
+ assert(this->L == L && this->TBB == TBB && this->FBB == FBB &&
+ this->AllowPredicates == AllowPredicates &&
+ "Variance in assumed invariant key components!");
+ auto Itr = TripCountMap.find({ExitCond, ControlsExit});
+ if (Itr == TripCountMap.end())
+ return None;
+ return Itr->second;
+}
+
+void ScalarEvolution::ExitLimitCache::insert(const Loop *L, Value *ExitCond,
+ BasicBlock *TBB, BasicBlock *FBB,
+ bool ControlsExit,
+ bool AllowPredicates,
+ const ExitLimit &EL) {
+ assert(this->L == L && this->TBB == TBB && this->FBB == FBB &&
+ this->AllowPredicates == AllowPredicates &&
+ "Variance in assumed invariant key components!");
+
+ auto InsertResult = TripCountMap.insert({{ExitCond, ControlsExit}, EL});
+ assert(InsertResult.second && "Expected successful insertion!");
+ (void)InsertResult;
+}
+
+ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondCached(
+ ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB,
+ BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) {
+
+ if (auto MaybeEL =
+ Cache.find(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates))
+ return *MaybeEL;
+
+ ExitLimit EL = computeExitLimitFromCondImpl(Cache, L, ExitCond, TBB, FBB,
+ ControlsExit, AllowPredicates);
+ Cache.insert(L, ExitCond, TBB, FBB, ControlsExit, AllowPredicates, EL);
+ return EL;
+}
+
+ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
+ ExitLimitCacheTy &Cache, const Loop *L, Value *ExitCond, BasicBlock *TBB,
+ BasicBlock *FBB, bool ControlsExit, bool AllowPredicates) {
// Check if the controlling expression for this loop is an And or Or.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
if (BO->getOpcode() == Instruction::And) {
// Recurse on the operands of the and.
bool EitherMayExit = L->contains(TBB);
- ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
- ControlsExit && !EitherMayExit,
- AllowPredicates);
- ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
- ControlsExit && !EitherMayExit,
- AllowPredicates);
+ ExitLimit EL0 = computeExitLimitFromCondCached(
+ Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit,
+ AllowPredicates);
+ ExitLimit EL1 = computeExitLimitFromCondCached(
+ Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit,
+ AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
@@ -5939,7 +6701,7 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L,
// to not.
if (isa<SCEVCouldNotCompute>(MaxBECount) &&
!isa<SCEVCouldNotCompute>(BECount))
- MaxBECount = BECount;
+ MaxBECount = getConstant(getUnsignedRangeMax(BECount));
return ExitLimit(BECount, MaxBECount, false,
{&EL0.Predicates, &EL1.Predicates});
@@ -5947,12 +6709,12 @@ ScalarEvolution::computeExitLimitFromCond(const Loop *L,
if (BO->getOpcode() == Instruction::Or) {
// Recurse on the operands of the or.
bool EitherMayExit = L->contains(FBB);
- ExitLimit EL0 = computeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
- ControlsExit && !EitherMayExit,
- AllowPredicates);
- ExitLimit EL1 = computeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
- ControlsExit && !EitherMayExit,
- AllowPredicates);
+ ExitLimit EL0 = computeExitLimitFromCondCached(
+ Cache, L, BO->getOperand(0), TBB, FBB, ControlsExit && !EitherMayExit,
+ AllowPredicates);
+ ExitLimit EL1 = computeExitLimitFromCondCached(
+ Cache, L, BO->getOperand(1), TBB, FBB, ControlsExit && !EitherMayExit,
+ AllowPredicates);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (EitherMayExit) {
@@ -6337,13 +7099,12 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit(
// {K,ashr,<positive-constant>} stabilizes to signum(K) in at most
// bitwidth(K) iterations.
Value *FirstValue = PN->getIncomingValueForBlock(Predecessor);
- bool KnownZero, KnownOne;
- ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr,
- Predecessor->getTerminator(), &DT);
+ KnownBits Known = computeKnownBits(FirstValue, DL, 0, nullptr,
+ Predecessor->getTerminator(), &DT);
auto *Ty = cast<IntegerType>(RHS->getType());
- if (KnownZero)
+ if (Known.isNonNegative())
StableValue = ConstantInt::get(Ty, 0);
- else if (KnownOne)
+ else if (Known.isNegative())
StableValue = ConstantInt::get(Ty, -1, true);
else
return getCouldNotCompute();
@@ -6383,7 +7144,7 @@ static bool CanConstantFold(const Instruction *I) {
if (const CallInst *CI = dyn_cast<CallInst>(I))
if (const Function *F = CI->getCalledFunction())
- return canConstantFoldCallTo(F);
+ return canConstantFoldCallTo(CI, F);
return false;
}
@@ -6408,7 +7169,10 @@ static bool canConstantEvolve(Instruction *I, const Loop *L) {
/// recursing through each instruction operand until reaching a loop header phi.
static PHINode *
getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
- DenseMap<Instruction *, PHINode *> &PHIMap) {
+ DenseMap<Instruction *, PHINode *> &PHIMap,
+ unsigned Depth) {
+ if (Depth > MaxConstantEvolvingDepth)
+ return nullptr;
// Otherwise, we can evaluate this instruction if all of its operands are
// constant or derived from a PHI node themselves.
@@ -6428,7 +7192,7 @@ getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
if (!P) {
// Recurse and memoize the results, whether a phi is found or not.
// This recursive call invalidates pointers into PHIMap.
- P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap);
+ P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap, Depth + 1);
PHIMap[OpInst] = P;
}
if (!P)
@@ -6455,7 +7219,7 @@ static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
// Record non-constant instructions contained by the loop.
DenseMap<Instruction *, PHINode *> PHIMap;
- return getConstantEvolvingPHIOperands(I, L, PHIMap);
+ return getConstantEvolvingPHIOperands(I, L, PHIMap, 0);
}
/// EvaluateExpression - Given an expression that passes the
@@ -6829,6 +7593,25 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
const SCEV *BackedgeTakenCount = getBackedgeTakenCount(LI);
if (const SCEVConstant *BTCC =
dyn_cast<SCEVConstant>(BackedgeTakenCount)) {
+
+ // This trivial case can show up when the incoming IR has not yet been
+ // fully simplified.
+ if (BTCC->getValue()->isZero()) {
+ Value *InitValue = nullptr;
+ bool MultipleInitValues = false;
+ for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+ if (!LI->contains(PN->getIncomingBlock(i))) {
+ if (!InitValue)
+ InitValue = PN->getIncomingValue(i);
+ else if (InitValue != PN->getIncomingValue(i)) {
+ MultipleInitValues = true;
+ break;
+ }
+ }
+ if (!MultipleInitValues && InitValue)
+ return getSCEV(InitValue);
+ }
+ }
// Okay, we know how many times the containing loop executes. If
// this is a constant evolving PHI node, get the final value at
// the specified iteration number.
@@ -7014,10 +7797,10 @@ const SCEV *ScalarEvolution::getSCEVAtScope(Value *V, const Loop *L) {
/// A and B isn't important.
///
/// If the equation does not have a solution, SCEVCouldNotCompute is returned.
-static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
+static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const SCEV *B,
ScalarEvolution &SE) {
uint32_t BW = A.getBitWidth();
- assert(BW == B.getBitWidth() && "Bit widths must be the same.");
+ assert(BW == SE.getTypeSizeInBits(B->getType()));
assert(A != 0 && "A must be non-zero.");
// 1. D = gcd(A, N)
@@ -7031,7 +7814,7 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
//
// B is divisible by D if and only if the multiplicity of prime factor 2 for B
// is not less than multiplicity of this prime factor for D.
- if (B.countTrailingZeros() < Mult2)
+ if (SE.GetMinTrailingZeros(B) < Mult2)
return SE.getCouldNotCompute();
// 3. Compute I: the multiplicative inverse of (A / D) in arithmetic
@@ -7049,9 +7832,8 @@ static const SCEV *SolveLinEquationWithOverflow(const APInt &A, const APInt &B,
// I * (B / D) mod (N / D)
// To simplify the computation, we factor out the divide by D:
// (I * B mod N) / D
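+ // For example, solving 4 * X == 8 (mod 2^8): D = 4, A/D = 1, so I = 1 and
+ // the result is (1 * 8) / 4 = 2; indeed 4 * 2 == 8 (mod 2^8).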
- APInt Result = (I * B).lshr(Mult2);
-
- return SE.getConstant(Result);
+ const SCEV *D = SE.getConstant(APInt::getOneBitSet(BW, Mult2));
+ return SE.getUDivExactExpr(SE.getMulExpr(B, SE.getConstant(I)), D);
}
/// Find the roots of the quadratic equation for the given quadratic chrec
@@ -7074,50 +7856,50 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
const APInt &M = MC->getAPInt();
const APInt &N = NC->getAPInt();
APInt Two(BitWidth, 2);
- APInt Four(BitWidth, 4);
-
- {
- using namespace APIntOps;
- const APInt& C = L;
- // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // The B coefficient is M-N/2
- APInt B(M);
- B -= sdiv(N,Two);
-
- // The A coefficient is N/2
- APInt A(N.sdiv(Two));
-
- // Compute the B^2-4ac term.
- APInt SqrtTerm(B);
- SqrtTerm *= B;
- SqrtTerm -= Four * (A * C);
-
- if (SqrtTerm.isNegative()) {
- // The loop is provably infinite.
- return None;
- }
- // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
- // integer value or else APInt::sqrt() will assert.
- APInt SqrtVal(SqrtTerm.sqrt());
+ // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
- // Compute the two solutions for the quadratic formula.
- // The divisions must be performed as signed divisions.
- APInt NegB(-B);
- APInt TwoA(A << 1);
- if (TwoA.isMinValue())
- return None;
+ // The A coefficient is N/2
+ APInt A = N.sdiv(Two);
+
+ // The B coefficient is M-N/2
+ APInt B = M;
+ B -= A; // A is the same as N/2.
- LLVMContext &Context = SE.getContext();
+ // The C coefficient is L.
+ const APInt& C = L;
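+ // E.g. the chrec {0,+,3,+,2} gives A = 1, B = 2, C = 0, i.e. X^2 + 2*X,
+ // whose roots are X = 0 and X = -2.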
- ConstantInt *Solution1 =
- ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
- ConstantInt *Solution2 =
- ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
+ // Compute the B^2-4ac term.
+ APInt SqrtTerm = B;
+ SqrtTerm *= B;
+ SqrtTerm -= 4 * (A * C);
- return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
- cast<SCEVConstant>(SE.getConstant(Solution2)));
- } // end APIntOps namespace
+ if (SqrtTerm.isNegative()) {
+ // The loop is provably infinite.
+ return None;
+ }
+
+ // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest
+ // integer value or else APInt::sqrt() will assert.
+ APInt SqrtVal = SqrtTerm.sqrt();
+
+ // Compute the two solutions for the quadratic formula.
+ // The divisions must be performed as signed divisions.
+ APInt NegB = -std::move(B);
+ APInt TwoA = std::move(A);
+ TwoA <<= 1;
+ if (TwoA.isNullValue())
+ return None;
+
+ LLVMContext &Context = SE.getContext();
+
+ ConstantInt *Solution1 =
+ ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA));
+ ConstantInt *Solution2 =
+ ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA));
+
+ return std::make_pair(cast<SCEVConstant>(SE.getConstant(Solution1)),
+ cast<SCEVConstant>(SE.getConstant(Solution2)));
}
ScalarEvolution::ExitLimit
@@ -7197,7 +7979,7 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
// to 0, it must be counting down to equal 0. Consequently, N = Start / -Step.
// We have not yet seen any such cases.
const SCEVConstant *StepC = dyn_cast<SCEVConstant>(Step);
- if (!StepC || StepC->getValue()->equalsInt(0))
+ if (!StepC || StepC->getValue()->isZero())
return getCouldNotCompute();
// For positive steps (counting up until unsigned overflow):
@@ -7211,8 +7993,8 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
// Handle unitary steps, which cannot wraparound.
// 1*N = -Start; -1*N = Start (mod 2^BW), so:
// N = Distance (as unsigned)
- if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue()) {
- APInt MaxBECount = getUnsignedRange(Distance).getUnsignedMax();
+ if (StepC->getValue()->isOne() || StepC->getValue()->isMinusOne()) {
+ APInt MaxBECount = getUnsignedRangeMax(Distance);
// When a loop like "for (int i = 0; i != n; ++i) { /* body */ }" is rotated,
// we end up with a loop whose backedge-taken count is n - 1. Detect this
@@ -7233,62 +8015,6 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
return ExitLimit(Distance, getConstant(MaxBECount), false, Predicates);
}
- // As a special case, handle the instance where Step is a positive power of
- // two. In this case, determining whether Step divides Distance evenly can be
- // done by counting and comparing the number of trailing zeros of Step and
- // Distance.
- if (!CountDown) {
- const APInt &StepV = StepC->getAPInt();
- // StepV.isPowerOf2() returns true if StepV is an positive power of two. It
- // also returns true if StepV is maximally negative (eg, INT_MIN), but that
- // case is not handled as this code is guarded by !CountDown.
- if (StepV.isPowerOf2() &&
- GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) {
- // Here we've constrained the equation to be of the form
- //
- // 2^(N + k) * Distance' = (StepV == 2^N) * X (mod 2^W) ... (0)
- //
- // where we're operating on a W bit wide integer domain and k is
- // non-negative. The smallest unsigned solution for X is the trip count.
- //
- // (0) is equivalent to:
- //
- // 2^(N + k) * Distance' - 2^N * X = L * 2^W
- // <=> 2^N(2^k * Distance' - X) = L * 2^(W - N) * 2^N
- // <=> 2^k * Distance' - X = L * 2^(W - N)
- // <=> 2^k * Distance' = L * 2^(W - N) + X ... (1)
- //
- // The smallest X satisfying (1) is unsigned remainder of dividing the LHS
- // by 2^(W - N).
- //
- // <=> X = 2^k * Distance' URem 2^(W - N) ... (2)
- //
- // E.g. say we're solving
- //
- // 2 * Val = 2 * X (in i8) ... (3)
- //
- // then from (2), we get X = Val URem i8 128 (k = 0 in this case).
- //
- // Note: It is tempting to solve (3) by setting X = Val, but Val is not
- // necessarily the smallest unsigned value of X that satisfies (3).
- // E.g. if Val is i8 -127 then the smallest value of X that satisfies (3)
- // is i8 1, not i8 -127
-
- const auto *ModuloResult = getUDivExactExpr(Distance, Step);
-
- // Since SCEV does not have a URem node, we construct one using a truncate
- // and a zero extend.
-
- unsigned NarrowWidth = StepV.getBitWidth() - StepV.countTrailingZeros();
- auto *NarrowTy = IntegerType::get(getContext(), NarrowWidth);
- auto *WideTy = Distance->getType();
-
- const SCEV *Limit =
- getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy);
- return ExitLimit(Limit, Limit, false, Predicates);
- }
- }
-
// If the condition controls loop exit (the loop exits only if the expression
// is true) and the addition is no-wrap we can use unsigned divide to
// compute the backedge count. In this case, the step may not divide the
@@ -7298,16 +8024,20 @@ ScalarEvolution::howFarToZero(const SCEV *V, const Loop *L, bool ControlsExit,
loopHasNoAbnormalExits(AddRec->getLoop())) {
const SCEV *Exact =
getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
- return ExitLimit(Exact, Exact, false, Predicates);
+ const SCEV *Max =
+ Exact == getCouldNotCompute()
+ ? Exact
+ : getConstant(getUnsignedRangeMax(Exact));
+ return ExitLimit(Exact, Max, false, Predicates);
}
- // Then, try to solve the above equation provided that Start is constant.
- if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) {
- const SCEV *E = SolveLinEquationWithOverflow(
- StepC->getValue()->getValue(), -StartC->getValue()->getValue(), *this);
- return ExitLimit(E, E, false, Predicates);
- }
- return getCouldNotCompute();
+ // Solve the general equation.
+ const SCEV *E = SolveLinEquationWithOverflow(StepC->getAPInt(),
+ getNegativeSCEV(Start), *this);
+ const SCEV *M = E == getCouldNotCompute()
+ ? E
+ : getConstant(getUnsignedRangeMax(E));
+ return ExitLimit(E, M, false, Predicates);
}
ScalarEvolution::ExitLimit
@@ -7319,7 +8049,7 @@ ScalarEvolution::howFarToNonZero(const SCEV *V, const Loop *L) {
// If the value is a constant, check to see if it is known to be non-zero
// already. If so, the backedge will execute zero times.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
- if (!C->getValue()->isNullValue())
+ if (!C->getValue()->isZero())
return getZero(C->getType());
return getCouldNotCompute(); // Otherwise it will loop infinitely.
}
@@ -7503,12 +8233,12 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
// adding or subtracting 1 from one of the operands.
switch (Pred) {
case ICmpInst::ICMP_SLE:
- if (!getSignedRange(RHS).getSignedMax().isMaxSignedValue()) {
+ if (!getSignedRangeMax(RHS).isMaxSignedValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SLT;
Changed = true;
- } else if (!getSignedRange(LHS).getSignedMin().isMinSignedValue()) {
+ } else if (!getSignedRangeMin(LHS).isMinSignedValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SLT;
@@ -7516,12 +8246,12 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
}
break;
case ICmpInst::ICMP_SGE:
- if (!getSignedRange(RHS).getSignedMin().isMinSignedValue()) {
+ if (!getSignedRangeMin(RHS).isMinSignedValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SGT;
Changed = true;
- } else if (!getSignedRange(LHS).getSignedMax().isMaxSignedValue()) {
+ } else if (!getSignedRangeMax(LHS).isMaxSignedValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS,
SCEV::FlagNSW);
Pred = ICmpInst::ICMP_SGT;
@@ -7529,23 +8259,23 @@ bool ScalarEvolution::SimplifyICmpOperands(ICmpInst::Predicate &Pred,
}
break;
case ICmpInst::ICMP_ULE:
- if (!getUnsignedRange(RHS).getUnsignedMax().isMaxValue()) {
+ if (!getUnsignedRangeMax(RHS).isMaxValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), 1, true), RHS,
SCEV::FlagNUW);
Pred = ICmpInst::ICMP_ULT;
Changed = true;
- } else if (!getUnsignedRange(LHS).getUnsignedMin().isMinValue()) {
+ } else if (!getUnsignedRangeMin(LHS).isMinValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), LHS);
Pred = ICmpInst::ICMP_ULT;
Changed = true;
}
break;
case ICmpInst::ICMP_UGE:
- if (!getUnsignedRange(RHS).getUnsignedMin().isMinValue()) {
+ if (!getUnsignedRangeMin(RHS).isMinValue()) {
RHS = getAddExpr(getConstant(RHS->getType(), (uint64_t)-1, true), RHS);
Pred = ICmpInst::ICMP_UGT;
Changed = true;
- } else if (!getUnsignedRange(LHS).getUnsignedMax().isMaxValue()) {
+ } else if (!getUnsignedRangeMax(LHS).isMaxValue()) {
LHS = getAddExpr(getConstant(RHS->getType(), 1, true), LHS,
SCEV::FlagNUW);
Pred = ICmpInst::ICMP_UGT;
@@ -7579,19 +8309,19 @@ trivially_false:
}
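
Every rewrite in SimplifyICmpOperands follows the same pattern: tighten a non-strict predicate to a strict one by adjusting an operand by 1, guarded by a range check proving the adjustment cannot wrap. A hedged sketch of the SLE case (the names and the fixed 32-bit width are assumptions):

    #include <cassert>
    #include <cstdint>

    // SLE -> SLT rewrite: x <=s y  <=>  x <s (y + 1), legal only when y + 1
    // cannot overflow; the assert mirrors the !isMaxSignedValue() guard.
    bool sleAsSlt(int32_t X, int32_t Y) {
      assert(Y != INT32_MAX && "range check must rule out wrapping");
      return X < Y + 1;
    }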
bool ScalarEvolution::isKnownNegative(const SCEV *S) {
- return getSignedRange(S).getSignedMax().isNegative();
+ return getSignedRangeMax(S).isNegative();
}
bool ScalarEvolution::isKnownPositive(const SCEV *S) {
- return getSignedRange(S).getSignedMin().isStrictlyPositive();
+ return getSignedRangeMin(S).isStrictlyPositive();
}
bool ScalarEvolution::isKnownNonNegative(const SCEV *S) {
- return !getSignedRange(S).getSignedMin().isNegative();
+ return !getSignedRangeMin(S).isNegative();
}
bool ScalarEvolution::isKnownNonPositive(const SCEV *S) {
- return !getSignedRange(S).getSignedMax().isStrictlyPositive();
+ return !getSignedRangeMax(S).isStrictlyPositive();
}
bool ScalarEvolution::isKnownNonZero(const SCEV *S) {
@@ -7822,6 +8552,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred,
case ICmpInst::ICMP_SGE:
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLE:
// X s<= (X + C)<nsw> if C >= 0
if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) && C.isNonNegative())
@@ -7835,6 +8566,7 @@ bool ScalarEvolution::isKnownPredicateViaNoOverflow(ICmpInst::Predicate Pred,
case ICmpInst::ICMP_SGT:
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SLT:
// X s< (X + C)<nsw> if C > 0
if (MatchBinaryAddToConst(RHS, LHS, C, SCEV::FlagNSW) &&
@@ -8175,7 +8907,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
// predicate we're interested in folding.
APInt Min = ICmpInst::isSigned(Pred) ?
- getSignedRange(V).getSignedMin() : getUnsignedRange(V).getUnsignedMin();
+ getSignedRangeMin(V) : getUnsignedRangeMin(V);
if (Min == C->getAPInt()) {
// Given (V >= Min && V != Min) we conclude V >= (Min + 1).
@@ -8192,6 +8924,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
if (isImpliedCondOperands(Pred, LHS, RHS, V,
getConstant(SharperMin)))
return true;
+ LLVM_FALLTHROUGH;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_UGT:
@@ -8206,6 +8939,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS,
if (isImpliedCondOperands(Pred, LHS, RHS, V, getConstant(Min)))
return true;
+ LLVM_FALLTHROUGH;
default:
// No change
@@ -8488,19 +9222,161 @@ static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE,
llvm_unreachable("covered switch fell through?!");
}
+bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS,
+ const SCEV *FoundLHS,
+ const SCEV *FoundRHS,
+ unsigned Depth) {
+ assert(getTypeSizeInBits(LHS->getType()) ==
+ getTypeSizeInBits(RHS->getType()) &&
+ "LHS and RHS have different sizes?");
+ assert(getTypeSizeInBits(FoundLHS->getType()) ==
+ getTypeSizeInBits(FoundRHS->getType()) &&
+ "FoundLHS and FoundRHS have different sizes?");
+ // We want to avoid hurting the compile time with analysis of too big trees.
+ if (Depth > MaxSCEVOperationsImplicationDepth)
+ return false;
+ // We only want to work with ICMP_SGT comparison so far.
+ // TODO: Extend to ICMP_UGT?
+ if (Pred == ICmpInst::ICMP_SLT) {
+ Pred = ICmpInst::ICMP_SGT;
+ std::swap(LHS, RHS);
+ std::swap(FoundLHS, FoundRHS);
+ }
+ if (Pred != ICmpInst::ICMP_SGT)
+ return false;
+
+ auto GetOpFromSExt = [&](const SCEV *S) {
+ if (auto *Ext = dyn_cast<SCEVSignExtendExpr>(S))
+ return Ext->getOperand();
+ // TODO: If S is a SCEVConstant then you can cheaply "strip" the sext off
+ // the constant in some cases.
+ return S;
+ };
+
+ // Acquire values from extensions.
+ auto *OrigFoundLHS = FoundLHS;
+ LHS = GetOpFromSExt(LHS);
+ FoundLHS = GetOpFromSExt(FoundLHS);
+
+ // Checks whether the SGT predicate can be proved trivially or using the found context.
+ auto IsSGTViaContext = [&](const SCEV *S1, const SCEV *S2) {
+ return isKnownViaSimpleReasoning(ICmpInst::ICMP_SGT, S1, S2) ||
+ isImpliedViaOperations(ICmpInst::ICMP_SGT, S1, S2, OrigFoundLHS,
+ FoundRHS, Depth + 1);
+ };
+
+ if (auto *LHSAddExpr = dyn_cast<SCEVAddExpr>(LHS)) {
+ // We want to avoid creation of any new non-constant SCEV. Since we are
+ // going to compare the operands to RHS, we should be certain that we don't
+ // need any size extensions for this. So we reject all cases where the
+ // types of LHS and RHS differ in size.
+ // TODO: Maybe try to get RHS from sext to catch more cases?
+ if (getTypeSizeInBits(LHS->getType()) != getTypeSizeInBits(RHS->getType()))
+ return false;
+
+ // Should not overflow.
+ if (!LHSAddExpr->hasNoSignedWrap())
+ return false;
+
+ auto *LL = LHSAddExpr->getOperand(0);
+ auto *LR = LHSAddExpr->getOperand(1);
+ auto *MinusOne = getNegativeSCEV(getOne(RHS->getType()));
+
+ // Checks that S1 >= 0 && S2 > RHS, trivially or using the found context.
+ auto IsSumGreaterThanRHS = [&](const SCEV *S1, const SCEV *S2) {
+ return IsSGTViaContext(S1, MinusOne) && IsSGTViaContext(S2, RHS);
+ };
+ // Try to prove the following rules:
+ // (LHS = LL + LR) && (LL >= 0) && (LR > RHS) => (LHS > RHS).
+ // (LHS = LL + LR) && (LR >= 0) && (LL > RHS) => (LHS > RHS).
+ if (IsSumGreaterThanRHS(LL, LR) || IsSumGreaterThanRHS(LR, LL))
+ return true;
+ } else if (auto *LHSUnknownExpr = dyn_cast<SCEVUnknown>(LHS)) {
+ Value *LL, *LR;
+ // FIXME: Once we have SDiv implemented, we can get rid of this matching.
+ using namespace llvm::PatternMatch;
+ if (match(LHSUnknownExpr->getValue(), m_SDiv(m_Value(LL), m_Value(LR)))) {
+ // Rules for division.
+ // We are going to perform some comparisons with Denominator and its
+ // derivative expressions. In the general case, creating a SCEV for it may
+ // lead to a complex analysis of the entire graph, and in particular it
+ // can request trip count recalculation for the same loop, which would be
+ // cached as SCEVCouldNotCompute to break the infinite recursion. To avoid
+ // this, we only want to create SCEVs that are constants in this section.
+ // So we bail if Denominator is not a constant.
+ if (!isa<ConstantInt>(LR))
+ return false;
+
+ auto *Denominator = cast<SCEVConstant>(getSCEV(LR));
+
+ // We want to make sure that LHS = FoundLHS / Denominator. If so, then a
+ // SCEV for the numerator already exists and matches FoundLHS.
+ auto *Numerator = getExistingSCEV(LL);
+ if (!Numerator || Numerator->getType() != FoundLHS->getType())
+ return false;
+
+ // Make sure that the numerator matches with FoundLHS and the denominator
+ // is positive.
+ if (!HasSameValue(Numerator, FoundLHS) || !isKnownPositive(Denominator))
+ return false;
+
+ auto *DTy = Denominator->getType();
+ auto *FRHSTy = FoundRHS->getType();
+ if (DTy->isPointerTy() != FRHSTy->isPointerTy())
+ // One of the types is a pointer and the other is not. We cannot extend
+ // them properly to a wider type, so we just reject this case.
+ // TODO: Usage of getEffectiveSCEVType for DTy, FRHSTy etc should help
+ // to avoid this check.
+ return false;
+
+ // Given that:
+ // FoundLHS > FoundRHS, LHS = FoundLHS / Denominator, Denominator > 0.
+ auto *WTy = getWiderType(DTy, FRHSTy);
+ auto *DenominatorExt = getNoopOrSignExtend(Denominator, WTy);
+ auto *FoundRHSExt = getNoopOrSignExtend(FoundRHS, WTy);
+
+ // Try to prove the following rule:
+ // (FoundRHS > Denominator - 2) && (RHS <= 0) => (LHS > RHS).
+ // For example, given that FoundLHS > 2, FoundLHS is at least 3. If we
+ // divide it by a Denominator < 4, the result is at least 1.
+ auto *DenomMinusTwo = getMinusSCEV(DenominatorExt, getConstant(WTy, 2));
+ if (isKnownNonPositive(RHS) &&
+ IsSGTViaContext(FoundRHSExt, DenomMinusTwo))
+ return true;
+
+ // Try to prove the following rule:
+ // (FoundRHS > -1 - Denominator) && (RHS < 0) => (LHS > RHS).
+ // For example, given that FoundLHS > -3, FoundLHS is at least -2.
+ // If we divide it by Denominator > 2, then:
+ // 1. If FoundLHS is negative, then the result is 0.
+ // 2. If FoundLHS is non-negative, then the result is non-negative.
+ // Either way, the result is non-negative.
+ auto *MinusOne = getNegativeSCEV(getOne(WTy));
+ auto *NegDenomMinusOne = getMinusSCEV(MinusOne, DenominatorExt);
+ if (isKnownNegative(RHS) &&
+ IsSGTViaContext(FoundRHSExt, NegDenomMinusOne))
+ return true;
+ }
+ }
+
+ return false;
+}
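
A worked instance of the sum rule above, as a sketch with invented values; the no-signed-wrap requirement in the patch corresponds to assuming here that LL + LR does not overflow:

    #include <cassert>

    // (LHS = LL + LR) && (LL >= 0) && (LR > RHS) => (LHS > RHS).
    // With LL = 2, LR = 10, RHS = 7: since 2 >= 0 and 10 > 7, also 12 > 7.
    void sumRuleExample() {
      long LL = 2, LR = 10, RHS = 7; // illustrative, no overflow assumed
      if (LL >= 0 && LR > RHS)
        assert(LL + LR > RHS);
    }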
+
+bool
+ScalarEvolution::isKnownViaSimpleReasoning(ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS) {
+ return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
+ IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
+ IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
+ isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
+}
+
bool
ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
const SCEV *FoundRHS) {
- auto IsKnownPredicateFull =
- [this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) {
- return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
- IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
- IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
- isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
- };
-
switch (Pred) {
default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
case ICmpInst::ICMP_EQ:
@@ -8510,30 +9386,34 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
break;
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_SGE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_SGT:
case ICmpInst::ICMP_SGE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_SLE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_UGE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, RHS, FoundRHS))
return true;
break;
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
- if (IsKnownPredicateFull(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
- IsKnownPredicateFull(ICmpInst::ICMP_ULE, RHS, FoundRHS))
+ if (isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
+ isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, RHS, FoundRHS))
return true;
break;
}
+ // Maybe it can be proved via operations?
+ if (isImpliedViaOperations(Pred, LHS, RHS, FoundLHS, FoundRHS))
+ return true;
+
return false;
}
@@ -8551,7 +9431,7 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred,
if (!Addend)
return false;
- APInt ConstFoundRHS = cast<SCEVConstant>(FoundRHS)->getAPInt();
+ const APInt &ConstFoundRHS = cast<SCEVConstant>(FoundRHS)->getAPInt();
// `FoundLHSRange` is the range we know `FoundLHS` to be in by virtue of the
// antecedent "`FoundLHS` `Pred` `FoundRHS`".
@@ -8563,7 +9443,7 @@ bool ScalarEvolution::isImpliedCondOperandsViaRanges(ICmpInst::Predicate Pred,
// We can also compute the range of values for `LHS` that satisfy the
// consequent, "`LHS` `Pred` `RHS`":
- APInt ConstRHS = cast<SCEVConstant>(RHS)->getAPInt();
+ const APInt &ConstRHS = cast<SCEVConstant>(RHS)->getAPInt();
ConstantRange SatisfyingLHSRange =
ConstantRange::makeSatisfyingICmpRegion(Pred, ConstRHS);
@@ -8582,22 +9462,20 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
const SCEV *One = getOne(Stride->getType());
if (IsSigned) {
- APInt MaxRHS = getSignedRange(RHS).getSignedMax();
+ APInt MaxRHS = getSignedRangeMax(RHS);
APInt MaxValue = APInt::getSignedMaxValue(BitWidth);
- APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
- .getSignedMax();
+ APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One));
// SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
- return (MaxValue - MaxStrideMinusOne).slt(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS);
}
- APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax();
+ APInt MaxRHS = getUnsignedRangeMax(RHS);
APInt MaxValue = APInt::getMaxValue(BitWidth);
- APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
- .getUnsignedMax();
+ APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One));
// UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
- return (MaxValue - MaxStrideMinusOne).ult(MaxRHS);
+ return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS);
}
bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
@@ -8608,22 +9486,20 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
const SCEV *One = getOne(Stride->getType());
if (IsSigned) {
- APInt MinRHS = getSignedRange(RHS).getSignedMin();
+ APInt MinRHS = getSignedRangeMin(RHS);
APInt MinValue = APInt::getSignedMinValue(BitWidth);
- APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
- .getSignedMax();
+ APInt MaxStrideMinusOne = getSignedRangeMax(getMinusSCEV(Stride, One));
// SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
- return (MinValue + MaxStrideMinusOne).sgt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS);
}
- APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin();
+ APInt MinRHS = getUnsignedRangeMin(RHS);
APInt MinValue = APInt::getMinValue(BitWidth);
- APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
- .getUnsignedMax();
+ APInt MaxStrideMinusOne = getUnsignedRangeMax(getMinusSCEV(Stride, One));
// UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
- return (MinValue + MaxStrideMinusOne).ugt(MinRHS);
+ return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
}
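
Both overflow tests reduce to one comparison against the type's extremum. A sketch of the unsigned LT case with assumed 8-bit range bounds (the GT case is symmetric, using minima):

    #include <cstdint>

    // UMaxRHS + UMaxStrideMinusOne > UMaxValue  <=>  the IV may wrap,
    // rewritten wrap-free as MaxValue - MaxStrideMinusOne < MaxRHS.
    bool mayOverflowOnLT() {
      uint8_t MaxRHS = 250, MaxStrideMinusOne = 9; // assumed range maxima
      uint8_t MaxValue = UINT8_MAX;                // 255
      return uint8_t(MaxValue - MaxStrideMinusOne) < MaxRHS; // 246 < 250
    }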
const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
@@ -8759,8 +9635,8 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
} else {
// Calculate the maximum backedge count based on the range of values
// permitted by Start, End, and Stride.
- APInt MinStart = IsSigned ? getSignedRange(Start).getSignedMin()
- : getUnsignedRange(Start).getUnsignedMin();
+ APInt MinStart = IsSigned ? getSignedRangeMin(Start)
+ : getUnsignedRangeMin(Start);
unsigned BitWidth = getTypeSizeInBits(LHS->getType());
@@ -8768,8 +9644,8 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
if (PositiveStride)
StrideForMaxBECount =
- IsSigned ? getSignedRange(Stride).getSignedMin()
- : getUnsignedRange(Stride).getUnsignedMin();
+ IsSigned ? getSignedRangeMin(Stride)
+ : getUnsignedRangeMin(Stride);
else
// Using a stride of 1 is safe when computing max backedge taken count for
// a loop with unknown stride.
@@ -8783,15 +9659,16 @@ ScalarEvolution::howManyLessThans(const SCEV *LHS, const SCEV *RHS,
// the case End = RHS. This is safe because in the other case (End - Start)
// is zero, leading to a zero maximum backedge taken count.
APInt MaxEnd =
- IsSigned ? APIntOps::smin(getSignedRange(RHS).getSignedMax(), Limit)
- : APIntOps::umin(getUnsignedRange(RHS).getUnsignedMax(), Limit);
+ IsSigned ? APIntOps::smin(getSignedRangeMax(RHS), Limit)
+ : APIntOps::umin(getUnsignedRangeMax(RHS), Limit);
MaxBECount = computeBECount(getConstant(MaxEnd - MinStart),
getConstant(StrideForMaxBECount), false);
}
- if (isa<SCEVCouldNotCompute>(MaxBECount))
- MaxBECount = BECount;
+ if (isa<SCEVCouldNotCompute>(MaxBECount) &&
+ !isa<SCEVCouldNotCompute>(BECount))
+ MaxBECount = getConstant(getUnsignedRangeMax(BECount));
return ExitLimit(BECount, MaxBECount, MaxOrZero, Predicates);
}
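
Assuming computeBECount rounds up as (Delta + Step - 1) /u Step, the range-based maximum above works out as in this sketch (all bounds invented):

    #include <cstdint>

    // Max backedge count from ranges: ceil((MaxEnd - MinStart) / MinStride).
    uint32_t maxBackedgesTaken() {
      uint32_t MinStart = 0, MaxEnd = 10, MinStride = 3; // assumed bounds
      uint32_t Delta = MaxEnd - MinStart;
      return (Delta + MinStride - 1) / MinStride; // == 4, a safe upper bound
    }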
@@ -8842,11 +9719,11 @@ ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false);
- APInt MaxStart = IsSigned ? getSignedRange(Start).getSignedMax()
- : getUnsignedRange(Start).getUnsignedMax();
+ APInt MaxStart = IsSigned ? getSignedRangeMax(Start)
+ : getUnsignedRangeMax(Start);
- APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
- : getUnsignedRange(Stride).getUnsignedMin();
+ APInt MinStride = IsSigned ? getSignedRangeMin(Stride)
+ : getUnsignedRangeMin(Stride);
unsigned BitWidth = getTypeSizeInBits(LHS->getType());
APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1)
@@ -8856,8 +9733,8 @@ ScalarEvolution::howManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
// the case End = RHS. This is safe because in the other case (Start - End)
// is zero, leading to a zero maximum backedge taken count.
APInt MinEnd =
- IsSigned ? APIntOps::smax(getSignedRange(RHS).getSignedMin(), Limit)
- : APIntOps::umax(getUnsignedRange(RHS).getUnsignedMin(), Limit);
+ IsSigned ? APIntOps::smax(getSignedRangeMin(RHS), Limit)
+ : APIntOps::umax(getUnsignedRangeMin(RHS), Limit);
const SCEV *MaxBECount = getCouldNotCompute();
@@ -8914,9 +9791,8 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range,
// the upper value of the range must be the first possible exit value.
// If A is negative then the lower of the range is the last possible loop
// value. Also note that we already checked for a full range.
- APInt One(BitWidth,1);
APInt A = cast<SCEVConstant>(getOperand(1))->getAPInt();
- APInt End = A.sge(One) ? (Range.getUpper() - One) : Range.getLower();
+ APInt End = A.sge(1) ? (Range.getUpper() - 1) : Range.getLower();
// The exit value should be (End+A)/A.
APInt ExitVal = (End + A).udiv(A);
@@ -8932,7 +9808,7 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(const ConstantRange &Range,
// Ensure that the previous value is in the range. This is a sanity check.
assert(Range.contains(
EvaluateConstantChrecAtConstant(this,
- ConstantInt::get(SE.getContext(), ExitVal - One), SE)->getValue()) &&
+ ConstantInt::get(SE.getContext(), ExitVal - 1), SE)->getValue()) &&
"Linear scev computation is off in a bad way!");
return SE.getConstant(ExitValue);
} else if (isQuadratic()) {
@@ -9083,8 +9959,11 @@ struct SCEVCollectAddRecMultiplies {
bool HasAddRec = false;
SmallVector<const SCEV *, 0> Operands;
for (auto Op : Mul->operands()) {
- if (isa<SCEVUnknown>(Op)) {
+ const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Op);
+ if (Unknown && !isa<CallInst>(Unknown->getValue())) {
Operands.push_back(Op);
+ } else if (Unknown) {
+ HasAddRec = true;
} else {
bool ContainsAddRec;
SCEVHasAddRec ContiansAddRec(ContainsAddRec);
@@ -9238,7 +10117,7 @@ const SCEV *ScalarEvolution::getElementSize(Instruction *Inst) {
void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
SmallVectorImpl<const SCEV *> &Sizes,
- const SCEV *ElementSize) const {
+ const SCEV *ElementSize) {
if (Terms.size() < 1 || !ElementSize)
return;
@@ -9254,7 +10133,7 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
});
// Remove duplicates.
- std::sort(Terms.begin(), Terms.end());
+ array_pod_sort(Terms.begin(), Terms.end());
Terms.erase(std::unique(Terms.begin(), Terms.end()), Terms.end());
// Put larger terms first.
@@ -9262,13 +10141,11 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
return numberOfTerms(LHS) > numberOfTerms(RHS);
});
- ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
-
// Try to divide all terms by the element size. If term is not divisible by
// element size, proceed with the original term.
for (const SCEV *&Term : Terms) {
const SCEV *Q, *R;
- SCEVDivision::divide(SE, Term, ElementSize, &Q, &R);
+ SCEVDivision::divide(*this, Term, ElementSize, &Q, &R);
if (!Q->isZero())
Term = Q;
}
@@ -9277,7 +10154,7 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
// Remove constant factors.
for (const SCEV *T : Terms)
- if (const SCEV *NewT = removeConstantFactors(SE, T))
+ if (const SCEV *NewT = removeConstantFactors(*this, T))
NewTerms.push_back(NewT);
DEBUG({
@@ -9286,8 +10163,7 @@ void ScalarEvolution::findArrayDimensions(SmallVectorImpl<const SCEV *> &Terms,
dbgs() << *T << "\n";
});
- if (NewTerms.empty() ||
- !findArrayDimensionsRec(SE, NewTerms, Sizes)) {
+ if (NewTerms.empty() || !findArrayDimensionsRec(*this, NewTerms, Sizes)) {
Sizes.clear();
return;
}
@@ -9524,6 +10400,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
ValueExprMap(std::move(Arg.ValueExprMap)),
PendingLoopPredicates(std::move(Arg.PendingLoopPredicates)),
WalkingBEDominatingConds(false), ProvingSplitPredicate(false),
+ MinTrailingZerosCache(std::move(Arg.MinTrailingZerosCache)),
BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)),
PredicatedBackedgeTakenCounts(
std::move(Arg.PredicatedBackedgeTakenCounts)),
@@ -9538,6 +10415,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
UniqueSCEVs(std::move(Arg.UniqueSCEVs)),
UniquePreds(std::move(Arg.UniquePreds)),
SCEVAllocator(std::move(Arg.SCEVAllocator)),
+ PredicatedSCEVRewrites(std::move(Arg.PredicatedSCEVRewrites)),
FirstUnknown(Arg.FirstUnknown) {
Arg.FirstUnknown = nullptr;
}
@@ -9621,6 +10499,13 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
OS << "Unpredictable predicated backedge-taken count. ";
}
OS << "\n";
+
+ if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+ OS << "Loop ";
+ L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ OS << ": ";
+ OS << "Trip multiple is " << SE->getSmallConstantTripMultiple(L) << "\n";
+ }
}
static StringRef loopDispositionToStr(ScalarEvolution::LoopDisposition LD) {
@@ -9929,6 +10814,16 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
SignedRanges.erase(S);
ExprValueMap.erase(S);
HasRecMap.erase(S);
+ MinTrailingZerosCache.erase(S);
+
+ for (auto I = PredicatedSCEVRewrites.begin();
+ I != PredicatedSCEVRewrites.end();) {
+ std::pair<const SCEV *, const Loop *> Entry = I->first;
+ if (Entry.first == S)
+ PredicatedSCEVRewrites.erase(I++);
+ else
+ ++I;
+ }
auto RemoveSCEVFromBackedgeMap =
[S, this](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
@@ -9946,84 +10841,75 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
RemoveSCEVFromBackedgeMap(PredicatedBackedgeTakenCounts);
}
-typedef DenseMap<const Loop *, std::string> VerifyMap;
+void ScalarEvolution::verify() const {
+ ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
+ ScalarEvolution SE2(F, TLI, AC, DT, LI);
-/// replaceSubString - Replaces all occurrences of From in Str with To.
-static void replaceSubString(std::string &Str, StringRef From, StringRef To) {
- size_t Pos = 0;
- while ((Pos = Str.find(From, Pos)) != std::string::npos) {
- Str.replace(Pos, From.size(), To.data(), To.size());
- Pos += To.size();
- }
-}
+ SmallVector<Loop *, 8> LoopStack(LI.begin(), LI.end());
-/// getLoopBackedgeTakenCounts - Helper method for verifyAnalysis.
-static void
-getLoopBackedgeTakenCounts(Loop *L, VerifyMap &Map, ScalarEvolution &SE) {
- std::string &S = Map[L];
- if (S.empty()) {
- raw_string_ostream OS(S);
- SE.getBackedgeTakenCount(L)->print(OS);
+ // Maps SCEV expressions from one ScalarEvolution "universe" to another.
+ struct SCEVMapper : public SCEVRewriteVisitor<SCEVMapper> {
+ const SCEV *visitConstant(const SCEVConstant *Constant) {
+ return SE.getConstant(Constant->getAPInt());
+ }
+ const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+ return SE.getUnknown(Expr->getValue());
+ }
- // false and 0 are semantically equivalent. This can happen in dead loops.
- replaceSubString(OS.str(), "false", "0");
- // Remove wrap flags, their use in SCEV is highly fragile.
- // FIXME: Remove this when SCEV gets smarter about them.
- replaceSubString(OS.str(), "<nw>", "");
- replaceSubString(OS.str(), "<nsw>", "");
- replaceSubString(OS.str(), "<nuw>", "");
- }
+ const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+ return SE.getCouldNotCompute();
+ }
+ SCEVMapper(ScalarEvolution &SE) : SCEVRewriteVisitor<SCEVMapper>(SE) {}
+ };
- for (auto *R : reverse(*L))
- getLoopBackedgeTakenCounts(R, Map, SE); // recurse.
-}
+ SCEVMapper SCM(SE2);
-void ScalarEvolution::verify() const {
- ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
+ while (!LoopStack.empty()) {
+ auto *L = LoopStack.pop_back_val();
+ LoopStack.insert(LoopStack.end(), L->begin(), L->end());
- // Gather stringified backedge taken counts for all loops using SCEV's caches.
- // FIXME: It would be much better to store actual values instead of strings,
- // but SCEV pointers will change if we drop the caches.
- VerifyMap BackedgeDumpsOld, BackedgeDumpsNew;
- for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I)
- getLoopBackedgeTakenCounts(*I, BackedgeDumpsOld, SE);
+ auto *CurBECount = SCM.visit(
+ const_cast<ScalarEvolution *>(this)->getBackedgeTakenCount(L));
+ auto *NewBECount = SE2.getBackedgeTakenCount(L);
- // Gather stringified backedge taken counts for all loops using a fresh
- // ScalarEvolution object.
- ScalarEvolution SE2(F, TLI, AC, DT, LI);
- for (LoopInfo::reverse_iterator I = LI.rbegin(), E = LI.rend(); I != E; ++I)
- getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE2);
-
- // Now compare whether they're the same with and without caches. This allows
- // verifying that no pass changed the cache.
- assert(BackedgeDumpsOld.size() == BackedgeDumpsNew.size() &&
- "New loops suddenly appeared!");
-
- for (VerifyMap::iterator OldI = BackedgeDumpsOld.begin(),
- OldE = BackedgeDumpsOld.end(),
- NewI = BackedgeDumpsNew.begin();
- OldI != OldE; ++OldI, ++NewI) {
- assert(OldI->first == NewI->first && "Loop order changed!");
-
- // Compare the stringified SCEVs. We don't care if undef backedgetaken count
- // changes.
- // FIXME: We currently ignore SCEV changes from/to CouldNotCompute. This
- // means that a pass is buggy or SCEV has to learn a new pattern but is
- // usually not harmful.
- if (OldI->second != NewI->second &&
- OldI->second.find("undef") == std::string::npos &&
- NewI->second.find("undef") == std::string::npos &&
- OldI->second != "***COULDNOTCOMPUTE***" &&
- NewI->second != "***COULDNOTCOMPUTE***") {
- dbgs() << "SCEVValidator: SCEV for loop '"
- << OldI->first->getHeader()->getName()
- << "' changed from '" << OldI->second
- << "' to '" << NewI->second << "'!\n";
+ if (CurBECount == SE2.getCouldNotCompute() ||
+ NewBECount == SE2.getCouldNotCompute()) {
+ // NB! This situation is legal, but is very suspicious -- whatever pass
+ // changed the loop to make a trip count go from could not compute to
+ // computable or vice-versa *should have* invalidated SCEV. However, we
+ // choose not to assert here (for now) since we don't want false
+ // positives.
+ continue;
+ }
+
+ if (containsUndefs(CurBECount) || containsUndefs(NewBECount)) {
+ // SCEV treats "undef" as an unknown but consistent value (i.e. it does
+ // not propagate undef aggressively). This means we can (and do) fail
+ // verification in cases where a transform makes the trip count of a loop
+ // go from "undef" to "undef+1" (say). The transform is fine, since in
+ // both cases the loop iterates "undef" times, but SCEV thinks we
+ // increased the trip count of the loop by 1 incorrectly.
+ continue;
+ }
+
+ if (SE.getTypeSizeInBits(CurBECount->getType()) >
+ SE.getTypeSizeInBits(NewBECount->getType()))
+ NewBECount = SE2.getZeroExtendExpr(NewBECount, CurBECount->getType());
+ else if (SE.getTypeSizeInBits(CurBECount->getType()) <
+ SE.getTypeSizeInBits(NewBECount->getType()))
+ CurBECount = SE2.getZeroExtendExpr(CurBECount, NewBECount->getType());
+
+ auto *ConstantDelta =
+ dyn_cast<SCEVConstant>(SE2.getMinusSCEV(CurBECount, NewBECount));
+
+ if (ConstantDelta && ConstantDelta->getAPInt() != 0) {
+ dbgs() << "Trip Count Changed!\n";
+ dbgs() << "Old: " << *CurBECount << "\n";
+ dbgs() << "New: " << *NewBECount << "\n";
+ dbgs() << "Delta: " << *ConstantDelta << "\n";
std::abort();
}
}
-
- // TODO: Verify more things.
}
bool ScalarEvolution::invalidate(
@@ -10098,10 +10984,11 @@ void ScalarEvolutionWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequiredTransitive<TargetLibraryInfoWrapperPass>();
}
-const SCEVPredicate *
-ScalarEvolution::getEqualPredicate(const SCEVUnknown *LHS,
- const SCEVConstant *RHS) {
+const SCEVPredicate *ScalarEvolution::getEqualPredicate(const SCEV *LHS,
+ const SCEV *RHS) {
FoldingSetNodeID ID;
+ assert(LHS->getType() == RHS->getType() &&
+ "Type mismatch between LHS and RHS");
// Unique this node based on the arguments
ID.AddInteger(SCEVPredicate::P_Equal);
ID.AddPointer(LHS);
@@ -10164,8 +11051,7 @@ public:
if (IPred->getLHS() == Expr)
return IPred->getRHS();
}
-
- return Expr;
+ return convertToAddRecWithPreds(Expr);
}
const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
@@ -10201,17 +11087,41 @@ public:
}
private:
- bool addOverflowAssumption(const SCEVAddRecExpr *AR,
- SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
- auto *A = SE.getWrapPredicate(AR, AddedFlags);
+ bool addOverflowAssumption(const SCEVPredicate *P) {
if (!NewPreds) {
// Check if we've already made this assumption.
- return Pred && Pred->implies(A);
+ return Pred && Pred->implies(P);
}
- NewPreds->insert(A);
+ NewPreds->insert(P);
return true;
}
+ bool addOverflowAssumption(const SCEVAddRecExpr *AR,
+ SCEVWrapPredicate::IncrementWrapFlags AddedFlags) {
+ auto *A = SE.getWrapPredicate(AR, AddedFlags);
+ return addOverflowAssumption(A);
+ }
+
+ // If \p Expr represents a PHINode, we try to see if it can be represented
+ // as an AddRec, possibly under a predicate (PHISCEVPred). If it is possible
+ // to add this predicate as a runtime overflow check, we return the AddRec.
+ // If \p Expr does not meet these conditions (is not a PHI node, or we
+ // couldn't create an AddRec for it, or couldn't add the predicate), we just
+ // return \p Expr.
+ const SCEV *convertToAddRecWithPreds(const SCEVUnknown *Expr) {
+ if (!isa<PHINode>(Expr->getValue()))
+ return Expr;
+ Optional<std::pair<const SCEV *, SmallVector<const SCEVPredicate *, 3>>>
+ PredicatedRewrite = SE.createAddRecFromPHIWithCasts(Expr);
+ if (!PredicatedRewrite)
+ return Expr;
+ for (auto *P : PredicatedRewrite->second) {
+ if (!addOverflowAssumption(P))
+ return Expr;
+ }
+ return PredicatedRewrite->first;
+ }
+
SmallPtrSetImpl<const SCEVPredicate *> *NewPreds;
SCEVUnionPredicate *Pred;
const Loop *L;
@@ -10248,9 +11158,11 @@ SCEVPredicate::SCEVPredicate(const FoldingSetNodeIDRef ID,
: FastID(ID), Kind(Kind) {}
SCEVEqualPredicate::SCEVEqualPredicate(const FoldingSetNodeIDRef ID,
- const SCEVUnknown *LHS,
- const SCEVConstant *RHS)
- : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {}
+ const SCEV *LHS, const SCEV *RHS)
+ : SCEVPredicate(ID, P_Equal), LHS(LHS), RHS(RHS) {
+ assert(LHS->getType() == RHS->getType() && "LHS and RHS types don't match");
+ assert(LHS != RHS && "LHS and RHS are the same SCEV");
+}
bool SCEVEqualPredicate::implies(const SCEVPredicate *N) const {
const auto *Op = dyn_cast<SCEVEqualPredicate>(N);
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
index d15a7db..47bdac0 100644
--- a/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -748,18 +748,56 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
// Emit instructions to mul all the operands. Hoist as much as possible
// out of loops.
Value *Prod = nullptr;
- for (const auto &I : OpsAndLoops) {
- const SCEV *Op = I.second;
+ auto I = OpsAndLoops.begin();
+
+ // Expand the calculation of X pow N in the following manner:
+ // Let N = P1 + P2 + ... + PK, where all P are powers of 2. Then:
+ // X pow N = (X pow P1) * (X pow P2) * ... * (X pow PK).
+ const auto ExpandOpBinPowN = [this, &I, &OpsAndLoops, &Ty]() {
+ auto E = I;
+ // Calculate how many times the same operand from the same loop is
+ // included in this power.
+ uint64_t Exponent = 0;
+ const uint64_t MaxExponent = UINT64_MAX >> 1;
+ // No one sane will ever try to calculate such huge exponents, but if we
+ // need this, we stop on UINT64_MAX / 2 because we need to exit the loop
+ // below when the power of 2 exceeds our Exponent, and we want it to be
+ // 1u << 31 at most to not deal with unsigned overflow.
+ while (E != OpsAndLoops.end() && *I == *E && Exponent != MaxExponent) {
+ ++Exponent;
+ ++E;
+ }
+ assert(Exponent > 0 && "Trying to calculate a zeroth exponent of operand?");
+
+ // Calculate powers with exponents 1, 2, 4, 8, etc., and multiply the
+ // ones that are needed into the result.
+ Value *P = expandCodeFor(I->second, Ty);
+ Value *Result = nullptr;
+ if (Exponent & 1)
+ Result = P;
+ for (uint64_t BinExp = 2; BinExp <= Exponent; BinExp <<= 1) {
+ P = InsertBinop(Instruction::Mul, P, P);
+ if (Exponent & BinExp)
+ Result = Result ? InsertBinop(Instruction::Mul, Result, P) : P;
+ }
+
+ I = E;
+ assert(Result && "Nothing was expanded?");
+ return Result;
+ };
+
+ while (I != OpsAndLoops.end()) {
if (!Prod) {
// This is the first operand. Just expand it.
- Prod = expand(Op);
- } else if (Op->isAllOnesValue()) {
+ Prod = ExpandOpBinPowN();
+ } else if (I->second->isAllOnesValue()) {
// Instead of doing a multiply by negative one, just do a negate.
Prod = InsertNoopCastOfTo(Prod, Ty);
Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod);
+ ++I;
} else {
// A simple mul.
- Value *W = expandCodeFor(Op, Ty);
+ Value *W = ExpandOpBinPowN();
Prod = InsertNoopCastOfTo(Prod, Ty);
// Canonicalize a constant to the RHS.
if (isa<Constant>(Prod)) std::swap(Prod, W);
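
ExpandOpBinPowN above is ordinary binary exponentiation: repeated squaring yields X, X^2, X^4, ..., and the set bits of the exponent select which squares enter the product. A standalone sketch of the same idea:

    #include <cstdint>

    // X pow N as a product over the set bits of N; e.g. 13 = 8 + 4 + 1,
    // so X^13 = X^8 * X^4 * X, far fewer multiplies than 12 naive ones.
    uint64_t binPow(uint64_t X, uint64_t N) {
      uint64_t Result = 1;
      for (uint64_t P = X; N != 0; N >>= 1, P *= P)
        if (N & 1)
          Result *= P; // this power of two is part of the exponent
      return Result;
    }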
@@ -1268,8 +1306,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
if (PostIncLoops.count(L)) {
PostIncLoopSet Loops;
Loops.insert(L);
- Normalized = cast<SCEVAddRecExpr>(TransformForPostIncUse(
- Normalize, S, nullptr, nullptr, Loops, SE, SE.DT));
+ Normalized = cast<SCEVAddRecExpr>(normalizeForPostIncUse(S, Loops, SE));
}
// Strip off any non-loop-dominating component from the addrec start.
@@ -1306,12 +1343,17 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
// Expand the core addrec. If we need post-loop scaling, force it to
// expand to an integer type to avoid the need for additional casting.
Type *ExpandTy = PostLoopScale ? IntTy : STy;
+ // We can't use a pointer type for the addrec if the pointer type is
+ // non-integral.
+ Type *AddRecPHIExpandTy =
+ DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy;
+
// In some cases, we decide to reuse an existing phi node but need to truncate
// it and/or invert the step.
Type *TruncTy = nullptr;
bool InvertStep = false;
- PHINode *PN = getAddRecExprPHILiterally(Normalized, L, ExpandTy, IntTy,
- TruncTy, InvertStep);
+ PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy,
+ IntTy, TruncTy, InvertStep);
// Accommodate post-inc mode, if necessary.
Value *Result;
@@ -1384,8 +1426,15 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
// Re-apply any non-loop-dominating offset.
if (PostLoopOffset) {
if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
- const SCEV *const OffsetArray[1] = { PostLoopOffset };
- Result = expandAddToGEP(OffsetArray, OffsetArray+1, PTy, IntTy, Result);
+ if (Result->getType()->isIntegerTy()) {
+ Value *Base = expandCodeFor(PostLoopOffset, ExpandTy);
+ const SCEV *const OffsetArray[1] = {SE.getUnknown(Result)};
+ Result = expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Base);
+ } else {
+ const SCEV *const OffsetArray[1] = {PostLoopOffset};
+ Result =
+ expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Result);
+ }
} else {
Result = InsertNoopCastOfTo(Result, IntTy);
Result = Builder.CreateAdd(Result,
@@ -1773,9 +1822,10 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
///
/// This does not depend on any SCEVExpander state but should be used in
/// the same context that SCEVExpander is used.
-unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
- SmallVectorImpl<WeakVH> &DeadInsts,
- const TargetTransformInfo *TTI) {
+unsigned
+SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts,
+ const TargetTransformInfo *TTI) {
// Find integer phis in order of increasing width.
SmallVector<PHINode*, 8> Phis;
for (auto &I : *L->getHeader()) {
@@ -1800,7 +1850,7 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
// so narrow phis can reuse them.
for (PHINode *Phi : Phis) {
auto SimplifyPHINode = [&](PHINode *PN) -> Value * {
- if (Value *V = SimplifyInstruction(PN, DL, &SE.TLI, &SE.DT, &SE.AC))
+ if (Value *V = SimplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC}))
return V;
if (!SE.isSCEVable(PN->getType()))
return nullptr;
diff --git a/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
index c1f9503..3740039 100644
--- a/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/contrib/llvm/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -12,243 +12,107 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Dominators.h"
+#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionNormalization.h"
using namespace llvm;
-/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
-/// and now we need to decide whether the user should use the preinc or post-inc
-/// value. If this user should use the post-inc version of the IV, return true.
-///
-/// Choosing wrong here can break dominance properties (if we choose to use the
-/// post-inc value when we cannot) or it can end up adding extra live-ranges to
-/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
-/// should use the post-inc value).
-static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
- const Loop *L, DominatorTree *DT) {
- // If the user is in the loop, use the preinc value.
- if (L->contains(User)) return false;
-
- BasicBlock *LatchBlock = L->getLoopLatch();
- if (!LatchBlock)
- return false;
-
- // Ok, the user is outside of the loop. If it is dominated by the latch
- // block, use the post-inc value.
- if (DT->dominates(LatchBlock, User->getParent()))
- return true;
-
- // There is one case we have to be careful of: PHI nodes. These little guys
- // can live in blocks that are not dominated by the latch block, but (since
- // their uses occur in the predecessor block, not the block the PHI lives in)
- // should still use the post-inc value. Check for this case now.
- PHINode *PN = dyn_cast<PHINode>(User);
- if (!PN || !Operand) return false; // not a phi, not dominated by latch block.
-
- // Look at all of the uses of Operand by the PHI node. If any use corresponds
- // to a block that is not dominated by the latch block, give up and use the
- // preincremented value.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == Operand &&
- !DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
- return false;
-
- // Okay, all uses of Operand by PN are in predecessor blocks that really are
- // dominated by the latch block. Use the post-incremented value.
- return true;
-}
+/// TransformKind - Different types of transformations that
+/// TransformForPostIncUse can do.
+enum TransformKind {
+ /// Normalize - Normalize according to the given loops.
+ Normalize,
+ /// Denormalize - Perform the inverse transform on the expression with the
+ /// given loop set.
+ Denormalize
+};
namespace {
-
-/// Hold the state used during post-inc expression transformation, including a
-/// map of transformed expressions.
-class PostIncTransform {
- TransformKind Kind;
- PostIncLoopSet &Loops;
- ScalarEvolution &SE;
- DominatorTree &DT;
-
- DenseMap<const SCEV*, const SCEV*> Transformed;
-
-public:
- PostIncTransform(TransformKind kind, PostIncLoopSet &loops,
- ScalarEvolution &se, DominatorTree &dt):
- Kind(kind), Loops(loops), SE(se), DT(dt) {}
-
- const SCEV *TransformSubExpr(const SCEV *S, Instruction *User,
- Value *OperandValToReplace);
-
-protected:
- const SCEV *TransformImpl(const SCEV *S, Instruction *User,
- Value *OperandValToReplace);
+struct NormalizeDenormalizeRewriter
+ : public SCEVRewriteVisitor<NormalizeDenormalizeRewriter> {
+ const TransformKind Kind;
+
+ // NB! Pred is a function_ref. Storing it here is okay only because
+ // we're careful about the lifetime of NormalizeDenormalizeRewriter.
+ const NormalizePredTy Pred;
+
+ NormalizeDenormalizeRewriter(TransformKind Kind, NormalizePredTy Pred,
+ ScalarEvolution &SE)
+ : SCEVRewriteVisitor<NormalizeDenormalizeRewriter>(SE), Kind(Kind),
+ Pred(Pred) {}
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr);
};
-
} // namespace
-/// Implement post-inc transformation for all valid expression types.
-const SCEV *PostIncTransform::
-TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
-
- if (const SCEVCastExpr *X = dyn_cast<SCEVCastExpr>(S)) {
- const SCEV *O = X->getOperand();
- const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
- if (O != N)
- switch (S->getSCEVType()) {
- case scZeroExtend: return SE.getZeroExtendExpr(N, S->getType());
- case scSignExtend: return SE.getSignExtendExpr(N, S->getType());
- case scTruncate: return SE.getTruncateExpr(N, S->getType());
- default: llvm_unreachable("Unexpected SCEVCastExpr kind!");
- }
- return S;
+const SCEV *
+NormalizeDenormalizeRewriter::visitAddRecExpr(const SCEVAddRecExpr *AR) {
+ SmallVector<const SCEV *, 8> Operands;
+
+ transform(AR->operands(), std::back_inserter(Operands),
+ [&](const SCEV *Op) { return visit(Op); });
+
+ if (!Pred(AR))
+ return SE.getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagAnyWrap);
+
+ // Normalization and denormalization are fancy names for decrementing and
+ // incrementing a SCEV expression with respect to a set of loops. Since
+ // Pred(AR) has returned true, we know we need to normalize or denormalize AR
+ // with respect to its loop.
+
+ if (Kind == Denormalize) {
+ // Denormalization / "partial increment" is essentially the same as \c
+ // SCEVAddRecExpr::getPostIncExpr. Here we use an explicit loop to make the
+ // symmetry with Normalization clear.
+ for (int i = 0, e = Operands.size() - 1; i < e; i++)
+ Operands[i] = SE.getAddExpr(Operands[i], Operands[i + 1]);
+ } else {
+ assert(Kind == Normalize && "Only two possibilities!");
+
+ // Normalization / "partial decrement" is a bit more subtle. Since
+ // incrementing a SCEV expression (in general) changes the step of the SCEV
+ // expression as well, we cannot use the step of the current expression.
+ // Instead, we have to use the step of the very expression we're trying to
+ // compute!
+ //
+ // We solve the issue by recursively building up the result, starting from
+ // the "least significant" operand in the add recurrence:
+ //
+ // Base case:
+ // Single operand add recurrence. It's its own normalization.
+ //
+ // N-operand case:
+ // {S_{N-1},+,S_{N-2},+,...,+,S_0} = S
+ //
+ // Since the step recurrence of S is {S_{N-2},+,...,+,S_0}, we know its
+ // normalization by induction. We subtract the normalized step
+ // recurrence from S_{N-1} to get the normalization of S.
+
+ for (int i = Operands.size() - 2; i >= 0; i--)
+ Operands[i] = SE.getMinusSCEV(Operands[i], Operands[i + 1]);
}
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- // An addrec. This is the interesting part.
- SmallVector<const SCEV *, 8> Operands;
- const Loop *L = AR->getLoop();
- // The addrec conceptually uses its operands at loop entry.
- Instruction *LUser = &L->getHeader()->front();
- // Transform each operand.
- for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
- I != E; ++I) {
- Operands.push_back(TransformSubExpr(*I, LUser, nullptr));
- }
- // Conservatively use AnyWrap until/unless we need FlagNW.
- const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap);
- switch (Kind) {
- case NormalizeAutodetect:
- // Normalize this SCEV by subtracting the expression for the final step.
- // We only allow affine AddRecs to be normalized, otherwise we would not
- // be able to correctly denormalize.
- // e.g. {1,+,3,+,2} == {-2,+,1,+,2} + {3,+,2}
- // Normalized form: {-2,+,1,+,2}
- // Denormalized form: {1,+,3,+,2}
- //
- // However, denormalization would use a different step expression than
- // normalization (see getPostIncExpr), generating the wrong final
- // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2}
- if (AR->isAffine() &&
- IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
- const SCEV *TransformedStep =
- TransformSubExpr(AR->getStepRecurrence(SE),
- User, OperandValToReplace);
- Result = SE.getMinusSCEV(Result, TransformedStep);
- Loops.insert(L);
- }
-#if 0
- // This assert is conceptually correct, but ScalarEvolution currently
- // sometimes fails to canonicalize two equal SCEVs to exactly the same
- // form. It's possibly a pessimization when this happens, but it isn't a
- // correctness problem, so disable this assert for now.
- assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
- "SCEV normalization is not invertible!");
-#endif
- break;
- case Normalize:
- // We want to normalize step expression, because otherwise we might not be
- // able to denormalize to the original expression.
- //
- // Here is an example what will happen if we don't normalize step:
- // ORIGINAL ISE:
- // {(100 /u {1,+,1}<%bb16>),+,(100 /u {1,+,1}<%bb16>)}<%bb25>
- // NORMALIZED ISE:
- // {((-1 * (100 /u {1,+,1}<%bb16>)) + (100 /u {0,+,1}<%bb16>)),+,
- // (100 /u {0,+,1}<%bb16>)}<%bb25>
- // DENORMALIZED BACK ISE:
- // {((2 * (100 /u {1,+,1}<%bb16>)) + (-1 * (100 /u {2,+,1}<%bb16>))),+,
- // (100 /u {1,+,1}<%bb16>)}<%bb25>
- // Note that the initial value changes after normalization +
- // denormalization, which isn't correct.
- if (Loops.count(L)) {
- const SCEV *TransformedStep =
- TransformSubExpr(AR->getStepRecurrence(SE),
- User, OperandValToReplace);
- Result = SE.getMinusSCEV(Result, TransformedStep);
- }
-#if 0
- // See the comment on the assert above.
- assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
- "SCEV normalization is not invertible!");
-#endif
- break;
- case Denormalize:
- // Here we want to normalize step expressions for the same reasons, as
- // stated above.
- if (Loops.count(L)) {
- const SCEV *TransformedStep =
- TransformSubExpr(AR->getStepRecurrence(SE),
- User, OperandValToReplace);
- Result = SE.getAddExpr(Result, TransformedStep);
- }
- break;
- }
- return Result;
- }
-
- if (const SCEVNAryExpr *X = dyn_cast<SCEVNAryExpr>(S)) {
- SmallVector<const SCEV *, 8> Operands;
- bool Changed = false;
- // Transform each operand.
- for (SCEVNAryExpr::op_iterator I = X->op_begin(), E = X->op_end();
- I != E; ++I) {
- const SCEV *O = *I;
- const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
- Changed |= N != O;
- Operands.push_back(N);
- }
- // If any operand actually changed, return a transformed result.
- if (Changed)
- switch (S->getSCEVType()) {
- case scAddExpr: return SE.getAddExpr(Operands);
- case scMulExpr: return SE.getMulExpr(Operands);
- case scSMaxExpr: return SE.getSMaxExpr(Operands);
- case scUMaxExpr: return SE.getUMaxExpr(Operands);
- default: llvm_unreachable("Unexpected SCEVNAryExpr kind!");
- }
- return S;
- }
-
- if (const SCEVUDivExpr *X = dyn_cast<SCEVUDivExpr>(S)) {
- const SCEV *LO = X->getLHS();
- const SCEV *RO = X->getRHS();
- const SCEV *LN = TransformSubExpr(LO, User, OperandValToReplace);
- const SCEV *RN = TransformSubExpr(RO, User, OperandValToReplace);
- if (LO != LN || RO != RN)
- return SE.getUDivExpr(LN, RN);
- return S;
- }
-
- llvm_unreachable("Unexpected SCEV kind!");
+ return SE.getAddRecExpr(Operands, AR->getLoop(), SCEV::FlagAnyWrap);
}
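
For an affine recurrence the scheme collapses to a single step: normalization subtracts the step from the start and denormalization adds it back, so {5,+,3} normalizes to {2,+,3} and denormalizes back to {5,+,3}. A sketch with a two-element {Start, Step} pair standing in for the operand list:

    #include <array>

    using Affine = std::array<int, 2>; // {Start, Step}

    Affine normalizeAffine(Affine AR) {
      return {AR[0] - AR[1], AR[1]}; // {5,+,3} -> {2,+,3}
    }
    Affine denormalizeAffine(Affine AR) {
      return {AR[0] + AR[1], AR[1]}; // {2,+,3} -> {5,+,3}
    }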
-/// Manage recursive transformation across an expression DAG. Revisiting
-/// expressions would lead to exponential recursion.
-const SCEV *PostIncTransform::
-TransformSubExpr(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
-
- if (isa<SCEVConstant>(S) || isa<SCEVUnknown>(S))
- return S;
-
- const SCEV *Result = Transformed.lookup(S);
- if (Result)
- return Result;
+const SCEV *llvm::normalizeForPostIncUse(const SCEV *S,
+ const PostIncLoopSet &Loops,
+ ScalarEvolution &SE) {
+ auto Pred = [&](const SCEVAddRecExpr *AR) {
+ return Loops.count(AR->getLoop());
+ };
+ return NormalizeDenormalizeRewriter(Normalize, Pred, SE).visit(S);
+}
- Result = TransformImpl(S, User, OperandValToReplace);
- Transformed[S] = Result;
- return Result;
+const SCEV *llvm::normalizeForPostIncUseIf(const SCEV *S, NormalizePredTy Pred,
+ ScalarEvolution &SE) {
+ return NormalizeDenormalizeRewriter(Normalize, Pred, SE).visit(S);
}
-/// Top level driver for transforming an expression DAG into its requested
-/// post-inc form (either "Normalized" or "Denormalized").
-const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
- const SCEV *S,
- Instruction *User,
- Value *OperandValToReplace,
- PostIncLoopSet &Loops,
- ScalarEvolution &SE,
- DominatorTree &DT) {
- PostIncTransform Transform(Kind, Loops, SE, DT);
- return Transform.TransformSubExpr(S, User, OperandValToReplace);
+const SCEV *llvm::denormalizeForPostIncUse(const SCEV *S,
+ const PostIncLoopSet &Loops,
+ ScalarEvolution &SE) {
+ auto Pred = [&](const SCEVAddRecExpr *AR) {
+ return Loops.count(AR->getLoop());
+ };
+ return NormalizeDenormalizeRewriter(Denormalize, Pred, SE).visit(S);
}
diff --git a/contrib/llvm/lib/Analysis/SparsePropagation.cpp b/contrib/llvm/lib/Analysis/SparsePropagation.cpp
index 79dc84e..470f4be 100644
--- a/contrib/llvm/lib/Analysis/SparsePropagation.cpp
+++ b/contrib/llvm/lib/Analysis/SparsePropagation.cpp
@@ -195,7 +195,7 @@ void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI,
Succs.assign(TI.getNumSuccessors(), true);
return;
}
- SwitchInst::CaseIt Case = SI.findCaseValue(cast<ConstantInt>(C));
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(cast<ConstantInt>(C));
Succs[Case.getSuccessorIndex()] = true;
}
diff --git a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 112118a..2be5d5c 100644
--- a/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/contrib/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -13,6 +13,7 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -82,24 +83,24 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
if (T.getArch() == Triple::r600 ||
T.getArch() == Triple::amdgcn) {
- TLI.setUnavailable(LibFunc::ldexp);
- TLI.setUnavailable(LibFunc::ldexpf);
- TLI.setUnavailable(LibFunc::ldexpl);
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
- TLI.setUnavailable(LibFunc::exp10l);
- TLI.setUnavailable(LibFunc::log10);
- TLI.setUnavailable(LibFunc::log10f);
- TLI.setUnavailable(LibFunc::log10l);
+ TLI.setUnavailable(LibFunc_ldexp);
+ TLI.setUnavailable(LibFunc_ldexpf);
+ TLI.setUnavailable(LibFunc_ldexpl);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
+ TLI.setUnavailable(LibFunc_exp10l);
+ TLI.setUnavailable(LibFunc_log10);
+ TLI.setUnavailable(LibFunc_log10f);
+ TLI.setUnavailable(LibFunc_log10l);
}
// There are no library implementations of mempcy and memset for AMD gpus and
// these can be difficult to lower in the backend.
if (T.getArch() == Triple::r600 ||
T.getArch() == Triple::amdgcn) {
- TLI.setUnavailable(LibFunc::memcpy);
- TLI.setUnavailable(LibFunc::memset);
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memcpy);
+ TLI.setUnavailable(LibFunc_memset);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
return;
}
@@ -107,21 +108,21 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// All versions of watchOS support it.
if (T.isMacOSX()) {
if (T.isMacOSXVersionLT(10, 5))
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
} else if (T.isiOS()) {
if (T.isOSVersionLT(3, 0))
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
} else if (!T.isWatchOS()) {
- TLI.setUnavailable(LibFunc::memset_pattern16);
+ TLI.setUnavailable(LibFunc_memset_pattern16);
}
if (!hasSinCosPiStret(T)) {
- TLI.setUnavailable(LibFunc::sinpi);
- TLI.setUnavailable(LibFunc::sinpif);
- TLI.setUnavailable(LibFunc::cospi);
- TLI.setUnavailable(LibFunc::cospif);
- TLI.setUnavailable(LibFunc::sincospi_stret);
- TLI.setUnavailable(LibFunc::sincospif_stret);
+ TLI.setUnavailable(LibFunc_sinpi);
+ TLI.setUnavailable(LibFunc_sinpif);
+ TLI.setUnavailable(LibFunc_cospi);
+ TLI.setUnavailable(LibFunc_cospif);
+ TLI.setUnavailable(LibFunc_sincospi_stret);
+ TLI.setUnavailable(LibFunc_sincospif_stret);
}
if (T.isMacOSX() && T.getArch() == Triple::x86 &&
@@ -131,179 +132,223 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// has a $UNIX2003 suffix. The two implementations are identical except
// for the return value in some edge cases. However, we don't want to
// generate code that depends on the old symbols.
- TLI.setAvailableWithName(LibFunc::fwrite, "fwrite$UNIX2003");
- TLI.setAvailableWithName(LibFunc::fputs, "fputs$UNIX2003");
+ TLI.setAvailableWithName(LibFunc_fwrite, "fwrite$UNIX2003");
+ TLI.setAvailableWithName(LibFunc_fputs, "fputs$UNIX2003");
}
// iprintf and friends are only available on XCore and TCE.
if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) {
- TLI.setUnavailable(LibFunc::iprintf);
- TLI.setUnavailable(LibFunc::siprintf);
- TLI.setUnavailable(LibFunc::fiprintf);
+ TLI.setUnavailable(LibFunc_iprintf);
+ TLI.setUnavailable(LibFunc_siprintf);
+ TLI.setUnavailable(LibFunc_fiprintf);
}
if (T.isOSWindows() && !T.isOSCygMing()) {
// Win32 does not support long double
- TLI.setUnavailable(LibFunc::acosl);
- TLI.setUnavailable(LibFunc::asinl);
- TLI.setUnavailable(LibFunc::atanl);
- TLI.setUnavailable(LibFunc::atan2l);
- TLI.setUnavailable(LibFunc::ceill);
- TLI.setUnavailable(LibFunc::copysignl);
- TLI.setUnavailable(LibFunc::cosl);
- TLI.setUnavailable(LibFunc::coshl);
- TLI.setUnavailable(LibFunc::expl);
- TLI.setUnavailable(LibFunc::fabsf); // Win32 and Win64 both lack fabsf
- TLI.setUnavailable(LibFunc::fabsl);
- TLI.setUnavailable(LibFunc::floorl);
- TLI.setUnavailable(LibFunc::fmaxl);
- TLI.setUnavailable(LibFunc::fminl);
- TLI.setUnavailable(LibFunc::fmodl);
- TLI.setUnavailable(LibFunc::frexpl);
- TLI.setUnavailable(LibFunc::ldexpf);
- TLI.setUnavailable(LibFunc::ldexpl);
- TLI.setUnavailable(LibFunc::logl);
- TLI.setUnavailable(LibFunc::modfl);
- TLI.setUnavailable(LibFunc::powl);
- TLI.setUnavailable(LibFunc::sinl);
- TLI.setUnavailable(LibFunc::sinhl);
- TLI.setUnavailable(LibFunc::sqrtl);
- TLI.setUnavailable(LibFunc::tanl);
- TLI.setUnavailable(LibFunc::tanhl);
+ TLI.setUnavailable(LibFunc_acosl);
+ TLI.setUnavailable(LibFunc_asinl);
+ TLI.setUnavailable(LibFunc_atanl);
+ TLI.setUnavailable(LibFunc_atan2l);
+ TLI.setUnavailable(LibFunc_ceill);
+ TLI.setUnavailable(LibFunc_copysignl);
+ TLI.setUnavailable(LibFunc_cosl);
+ TLI.setUnavailable(LibFunc_coshl);
+ TLI.setUnavailable(LibFunc_expl);
+ TLI.setUnavailable(LibFunc_fabsf); // Win32 and Win64 both lack fabsf
+ TLI.setUnavailable(LibFunc_fabsl);
+ TLI.setUnavailable(LibFunc_floorl);
+ TLI.setUnavailable(LibFunc_fmaxl);
+ TLI.setUnavailable(LibFunc_fminl);
+ TLI.setUnavailable(LibFunc_fmodl);
+ TLI.setUnavailable(LibFunc_frexpl);
+ TLI.setUnavailable(LibFunc_ldexpf);
+ TLI.setUnavailable(LibFunc_ldexpl);
+ TLI.setUnavailable(LibFunc_logl);
+ TLI.setUnavailable(LibFunc_modfl);
+ TLI.setUnavailable(LibFunc_powl);
+ TLI.setUnavailable(LibFunc_sinl);
+ TLI.setUnavailable(LibFunc_sinhl);
+ TLI.setUnavailable(LibFunc_sqrtl);
+ TLI.setUnavailable(LibFunc_tanl);
+ TLI.setUnavailable(LibFunc_tanhl);
// Win32 only has C89 math
- TLI.setUnavailable(LibFunc::acosh);
- TLI.setUnavailable(LibFunc::acoshf);
- TLI.setUnavailable(LibFunc::acoshl);
- TLI.setUnavailable(LibFunc::asinh);
- TLI.setUnavailable(LibFunc::asinhf);
- TLI.setUnavailable(LibFunc::asinhl);
- TLI.setUnavailable(LibFunc::atanh);
- TLI.setUnavailable(LibFunc::atanhf);
- TLI.setUnavailable(LibFunc::atanhl);
- TLI.setUnavailable(LibFunc::cbrt);
- TLI.setUnavailable(LibFunc::cbrtf);
- TLI.setUnavailable(LibFunc::cbrtl);
- TLI.setUnavailable(LibFunc::exp2);
- TLI.setUnavailable(LibFunc::exp2f);
- TLI.setUnavailable(LibFunc::exp2l);
- TLI.setUnavailable(LibFunc::expm1);
- TLI.setUnavailable(LibFunc::expm1f);
- TLI.setUnavailable(LibFunc::expm1l);
- TLI.setUnavailable(LibFunc::log2);
- TLI.setUnavailable(LibFunc::log2f);
- TLI.setUnavailable(LibFunc::log2l);
- TLI.setUnavailable(LibFunc::log1p);
- TLI.setUnavailable(LibFunc::log1pf);
- TLI.setUnavailable(LibFunc::log1pl);
- TLI.setUnavailable(LibFunc::logb);
- TLI.setUnavailable(LibFunc::logbf);
- TLI.setUnavailable(LibFunc::logbl);
- TLI.setUnavailable(LibFunc::nearbyint);
- TLI.setUnavailable(LibFunc::nearbyintf);
- TLI.setUnavailable(LibFunc::nearbyintl);
- TLI.setUnavailable(LibFunc::rint);
- TLI.setUnavailable(LibFunc::rintf);
- TLI.setUnavailable(LibFunc::rintl);
- TLI.setUnavailable(LibFunc::round);
- TLI.setUnavailable(LibFunc::roundf);
- TLI.setUnavailable(LibFunc::roundl);
- TLI.setUnavailable(LibFunc::trunc);
- TLI.setUnavailable(LibFunc::truncf);
- TLI.setUnavailable(LibFunc::truncl);
+ TLI.setUnavailable(LibFunc_acosh);
+ TLI.setUnavailable(LibFunc_acoshf);
+ TLI.setUnavailable(LibFunc_acoshl);
+ TLI.setUnavailable(LibFunc_asinh);
+ TLI.setUnavailable(LibFunc_asinhf);
+ TLI.setUnavailable(LibFunc_asinhl);
+ TLI.setUnavailable(LibFunc_atanh);
+ TLI.setUnavailable(LibFunc_atanhf);
+ TLI.setUnavailable(LibFunc_atanhl);
+ TLI.setUnavailable(LibFunc_cbrt);
+ TLI.setUnavailable(LibFunc_cbrtf);
+ TLI.setUnavailable(LibFunc_cbrtl);
+ TLI.setUnavailable(LibFunc_exp2);
+ TLI.setUnavailable(LibFunc_exp2f);
+ TLI.setUnavailable(LibFunc_exp2l);
+ TLI.setUnavailable(LibFunc_expm1);
+ TLI.setUnavailable(LibFunc_expm1f);
+ TLI.setUnavailable(LibFunc_expm1l);
+ TLI.setUnavailable(LibFunc_log2);
+ TLI.setUnavailable(LibFunc_log2f);
+ TLI.setUnavailable(LibFunc_log2l);
+ TLI.setUnavailable(LibFunc_log1p);
+ TLI.setUnavailable(LibFunc_log1pf);
+ TLI.setUnavailable(LibFunc_log1pl);
+ TLI.setUnavailable(LibFunc_logb);
+ TLI.setUnavailable(LibFunc_logbf);
+ TLI.setUnavailable(LibFunc_logbl);
+ TLI.setUnavailable(LibFunc_nearbyint);
+ TLI.setUnavailable(LibFunc_nearbyintf);
+ TLI.setUnavailable(LibFunc_nearbyintl);
+ TLI.setUnavailable(LibFunc_rint);
+ TLI.setUnavailable(LibFunc_rintf);
+ TLI.setUnavailable(LibFunc_rintl);
+ TLI.setUnavailable(LibFunc_round);
+ TLI.setUnavailable(LibFunc_roundf);
+ TLI.setUnavailable(LibFunc_roundl);
+ TLI.setUnavailable(LibFunc_trunc);
+ TLI.setUnavailable(LibFunc_truncf);
+ TLI.setUnavailable(LibFunc_truncl);
// Win32 provides some C99 math with mangled names
- TLI.setAvailableWithName(LibFunc::copysign, "_copysign");
+ TLI.setAvailableWithName(LibFunc_copysign, "_copysign");
if (T.getArch() == Triple::x86) {
// Win32 on x86 implements single-precision math functions as macros
- TLI.setUnavailable(LibFunc::acosf);
- TLI.setUnavailable(LibFunc::asinf);
- TLI.setUnavailable(LibFunc::atanf);
- TLI.setUnavailable(LibFunc::atan2f);
- TLI.setUnavailable(LibFunc::ceilf);
- TLI.setUnavailable(LibFunc::copysignf);
- TLI.setUnavailable(LibFunc::cosf);
- TLI.setUnavailable(LibFunc::coshf);
- TLI.setUnavailable(LibFunc::expf);
- TLI.setUnavailable(LibFunc::floorf);
- TLI.setUnavailable(LibFunc::fminf);
- TLI.setUnavailable(LibFunc::fmaxf);
- TLI.setUnavailable(LibFunc::fmodf);
- TLI.setUnavailable(LibFunc::logf);
- TLI.setUnavailable(LibFunc::log10f);
- TLI.setUnavailable(LibFunc::modff);
- TLI.setUnavailable(LibFunc::powf);
- TLI.setUnavailable(LibFunc::sinf);
- TLI.setUnavailable(LibFunc::sinhf);
- TLI.setUnavailable(LibFunc::sqrtf);
- TLI.setUnavailable(LibFunc::tanf);
- TLI.setUnavailable(LibFunc::tanhf);
+ TLI.setUnavailable(LibFunc_acosf);
+ TLI.setUnavailable(LibFunc_asinf);
+ TLI.setUnavailable(LibFunc_atanf);
+ TLI.setUnavailable(LibFunc_atan2f);
+ TLI.setUnavailable(LibFunc_ceilf);
+ TLI.setUnavailable(LibFunc_copysignf);
+ TLI.setUnavailable(LibFunc_cosf);
+ TLI.setUnavailable(LibFunc_coshf);
+ TLI.setUnavailable(LibFunc_expf);
+ TLI.setUnavailable(LibFunc_floorf);
+ TLI.setUnavailable(LibFunc_fminf);
+ TLI.setUnavailable(LibFunc_fmaxf);
+ TLI.setUnavailable(LibFunc_fmodf);
+ TLI.setUnavailable(LibFunc_logf);
+ TLI.setUnavailable(LibFunc_log10f);
+ TLI.setUnavailable(LibFunc_modff);
+ TLI.setUnavailable(LibFunc_powf);
+ TLI.setUnavailable(LibFunc_sinf);
+ TLI.setUnavailable(LibFunc_sinhf);
+ TLI.setUnavailable(LibFunc_sqrtf);
+ TLI.setUnavailable(LibFunc_tanf);
+ TLI.setUnavailable(LibFunc_tanhf);
}
+    // These definitions come from the math-finite.h header on Linux
+ TLI.setUnavailable(LibFunc_acos_finite);
+ TLI.setUnavailable(LibFunc_acosf_finite);
+ TLI.setUnavailable(LibFunc_acosl_finite);
+ TLI.setUnavailable(LibFunc_acosh_finite);
+ TLI.setUnavailable(LibFunc_acoshf_finite);
+ TLI.setUnavailable(LibFunc_acoshl_finite);
+ TLI.setUnavailable(LibFunc_asin_finite);
+ TLI.setUnavailable(LibFunc_asinf_finite);
+ TLI.setUnavailable(LibFunc_asinl_finite);
+ TLI.setUnavailable(LibFunc_atan2_finite);
+ TLI.setUnavailable(LibFunc_atan2f_finite);
+ TLI.setUnavailable(LibFunc_atan2l_finite);
+ TLI.setUnavailable(LibFunc_atanh_finite);
+ TLI.setUnavailable(LibFunc_atanhf_finite);
+ TLI.setUnavailable(LibFunc_atanhl_finite);
+ TLI.setUnavailable(LibFunc_cosh_finite);
+ TLI.setUnavailable(LibFunc_coshf_finite);
+ TLI.setUnavailable(LibFunc_coshl_finite);
+ TLI.setUnavailable(LibFunc_exp10_finite);
+ TLI.setUnavailable(LibFunc_exp10f_finite);
+ TLI.setUnavailable(LibFunc_exp10l_finite);
+ TLI.setUnavailable(LibFunc_exp2_finite);
+ TLI.setUnavailable(LibFunc_exp2f_finite);
+ TLI.setUnavailable(LibFunc_exp2l_finite);
+ TLI.setUnavailable(LibFunc_exp_finite);
+ TLI.setUnavailable(LibFunc_expf_finite);
+ TLI.setUnavailable(LibFunc_expl_finite);
+ TLI.setUnavailable(LibFunc_log10_finite);
+ TLI.setUnavailable(LibFunc_log10f_finite);
+ TLI.setUnavailable(LibFunc_log10l_finite);
+ TLI.setUnavailable(LibFunc_log2_finite);
+ TLI.setUnavailable(LibFunc_log2f_finite);
+ TLI.setUnavailable(LibFunc_log2l_finite);
+ TLI.setUnavailable(LibFunc_log_finite);
+ TLI.setUnavailable(LibFunc_logf_finite);
+ TLI.setUnavailable(LibFunc_logl_finite);
+ TLI.setUnavailable(LibFunc_pow_finite);
+ TLI.setUnavailable(LibFunc_powf_finite);
+ TLI.setUnavailable(LibFunc_powl_finite);
+ TLI.setUnavailable(LibFunc_sinh_finite);
+ TLI.setUnavailable(LibFunc_sinhf_finite);
+ TLI.setUnavailable(LibFunc_sinhl_finite);
+
    // Win32 does *not* provide these functions, but they are
// generally available on POSIX-compliant systems:
- TLI.setUnavailable(LibFunc::access);
- TLI.setUnavailable(LibFunc::bcmp);
- TLI.setUnavailable(LibFunc::bcopy);
- TLI.setUnavailable(LibFunc::bzero);
- TLI.setUnavailable(LibFunc::chmod);
- TLI.setUnavailable(LibFunc::chown);
- TLI.setUnavailable(LibFunc::closedir);
- TLI.setUnavailable(LibFunc::ctermid);
- TLI.setUnavailable(LibFunc::fdopen);
- TLI.setUnavailable(LibFunc::ffs);
- TLI.setUnavailable(LibFunc::fileno);
- TLI.setUnavailable(LibFunc::flockfile);
- TLI.setUnavailable(LibFunc::fseeko);
- TLI.setUnavailable(LibFunc::fstat);
- TLI.setUnavailable(LibFunc::fstatvfs);
- TLI.setUnavailable(LibFunc::ftello);
- TLI.setUnavailable(LibFunc::ftrylockfile);
- TLI.setUnavailable(LibFunc::funlockfile);
- TLI.setUnavailable(LibFunc::getc_unlocked);
- TLI.setUnavailable(LibFunc::getitimer);
- TLI.setUnavailable(LibFunc::getlogin_r);
- TLI.setUnavailable(LibFunc::getpwnam);
- TLI.setUnavailable(LibFunc::gettimeofday);
- TLI.setUnavailable(LibFunc::htonl);
- TLI.setUnavailable(LibFunc::htons);
- TLI.setUnavailable(LibFunc::lchown);
- TLI.setUnavailable(LibFunc::lstat);
- TLI.setUnavailable(LibFunc::memccpy);
- TLI.setUnavailable(LibFunc::mkdir);
- TLI.setUnavailable(LibFunc::ntohl);
- TLI.setUnavailable(LibFunc::ntohs);
- TLI.setUnavailable(LibFunc::open);
- TLI.setUnavailable(LibFunc::opendir);
- TLI.setUnavailable(LibFunc::pclose);
- TLI.setUnavailable(LibFunc::popen);
- TLI.setUnavailable(LibFunc::pread);
- TLI.setUnavailable(LibFunc::pwrite);
- TLI.setUnavailable(LibFunc::read);
- TLI.setUnavailable(LibFunc::readlink);
- TLI.setUnavailable(LibFunc::realpath);
- TLI.setUnavailable(LibFunc::rmdir);
- TLI.setUnavailable(LibFunc::setitimer);
- TLI.setUnavailable(LibFunc::stat);
- TLI.setUnavailable(LibFunc::statvfs);
- TLI.setUnavailable(LibFunc::stpcpy);
- TLI.setUnavailable(LibFunc::stpncpy);
- TLI.setUnavailable(LibFunc::strcasecmp);
- TLI.setUnavailable(LibFunc::strncasecmp);
- TLI.setUnavailable(LibFunc::times);
- TLI.setUnavailable(LibFunc::uname);
- TLI.setUnavailable(LibFunc::unlink);
- TLI.setUnavailable(LibFunc::unsetenv);
- TLI.setUnavailable(LibFunc::utime);
- TLI.setUnavailable(LibFunc::utimes);
- TLI.setUnavailable(LibFunc::write);
+ TLI.setUnavailable(LibFunc_access);
+ TLI.setUnavailable(LibFunc_bcmp);
+ TLI.setUnavailable(LibFunc_bcopy);
+ TLI.setUnavailable(LibFunc_bzero);
+ TLI.setUnavailable(LibFunc_chmod);
+ TLI.setUnavailable(LibFunc_chown);
+ TLI.setUnavailable(LibFunc_closedir);
+ TLI.setUnavailable(LibFunc_ctermid);
+ TLI.setUnavailable(LibFunc_fdopen);
+ TLI.setUnavailable(LibFunc_ffs);
+ TLI.setUnavailable(LibFunc_fileno);
+ TLI.setUnavailable(LibFunc_flockfile);
+ TLI.setUnavailable(LibFunc_fseeko);
+ TLI.setUnavailable(LibFunc_fstat);
+ TLI.setUnavailable(LibFunc_fstatvfs);
+ TLI.setUnavailable(LibFunc_ftello);
+ TLI.setUnavailable(LibFunc_ftrylockfile);
+ TLI.setUnavailable(LibFunc_funlockfile);
+ TLI.setUnavailable(LibFunc_getc_unlocked);
+ TLI.setUnavailable(LibFunc_getitimer);
+ TLI.setUnavailable(LibFunc_getlogin_r);
+ TLI.setUnavailable(LibFunc_getpwnam);
+ TLI.setUnavailable(LibFunc_gettimeofday);
+ TLI.setUnavailable(LibFunc_htonl);
+ TLI.setUnavailable(LibFunc_htons);
+ TLI.setUnavailable(LibFunc_lchown);
+ TLI.setUnavailable(LibFunc_lstat);
+ TLI.setUnavailable(LibFunc_memccpy);
+ TLI.setUnavailable(LibFunc_mkdir);
+ TLI.setUnavailable(LibFunc_ntohl);
+ TLI.setUnavailable(LibFunc_ntohs);
+ TLI.setUnavailable(LibFunc_open);
+ TLI.setUnavailable(LibFunc_opendir);
+ TLI.setUnavailable(LibFunc_pclose);
+ TLI.setUnavailable(LibFunc_popen);
+ TLI.setUnavailable(LibFunc_pread);
+ TLI.setUnavailable(LibFunc_pwrite);
+ TLI.setUnavailable(LibFunc_read);
+ TLI.setUnavailable(LibFunc_readlink);
+ TLI.setUnavailable(LibFunc_realpath);
+ TLI.setUnavailable(LibFunc_rmdir);
+ TLI.setUnavailable(LibFunc_setitimer);
+ TLI.setUnavailable(LibFunc_stat);
+ TLI.setUnavailable(LibFunc_statvfs);
+ TLI.setUnavailable(LibFunc_stpcpy);
+ TLI.setUnavailable(LibFunc_stpncpy);
+ TLI.setUnavailable(LibFunc_strcasecmp);
+ TLI.setUnavailable(LibFunc_strncasecmp);
+ TLI.setUnavailable(LibFunc_times);
+ TLI.setUnavailable(LibFunc_uname);
+ TLI.setUnavailable(LibFunc_unlink);
+ TLI.setUnavailable(LibFunc_unsetenv);
+ TLI.setUnavailable(LibFunc_utime);
+ TLI.setUnavailable(LibFunc_utimes);
+ TLI.setUnavailable(LibFunc_write);
    // Win32 does *not* provide these functions, but they are
// specified by C99:
- TLI.setUnavailable(LibFunc::atoll);
- TLI.setUnavailable(LibFunc::frexpf);
- TLI.setUnavailable(LibFunc::llabs);
+ TLI.setUnavailable(LibFunc_atoll);
+ TLI.setUnavailable(LibFunc_frexpf);
+ TLI.setUnavailable(LibFunc_llabs);
}
switch (T.getOS()) {
@@ -311,28 +356,28 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// exp10 and exp10f are not available on OS X until 10.9 and iOS until 7.0
// and their names are __exp10 and __exp10f. exp10l is not available on
// OS X or iOS.
- TLI.setUnavailable(LibFunc::exp10l);
+ TLI.setUnavailable(LibFunc_exp10l);
if (T.isMacOSXVersionLT(10, 9)) {
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
} else {
- TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
- TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+ TLI.setAvailableWithName(LibFunc_exp10, "__exp10");
+ TLI.setAvailableWithName(LibFunc_exp10f, "__exp10f");
}
break;
case Triple::IOS:
case Triple::TvOS:
case Triple::WatchOS:
- TLI.setUnavailable(LibFunc::exp10l);
+ TLI.setUnavailable(LibFunc_exp10l);
if (!T.isWatchOS() && (T.isOSVersionLT(7, 0) ||
(T.isOSVersionLT(9, 0) &&
(T.getArch() == Triple::x86 ||
T.getArch() == Triple::x86_64)))) {
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
} else {
- TLI.setAvailableWithName(LibFunc::exp10, "__exp10");
- TLI.setAvailableWithName(LibFunc::exp10f, "__exp10f");
+ TLI.setAvailableWithName(LibFunc_exp10, "__exp10");
+ TLI.setAvailableWithName(LibFunc_exp10f, "__exp10f");
}
break;
case Triple::Linux:
@@ -344,9 +389,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// Fall through to disable all of them.
LLVM_FALLTHROUGH;
default:
- TLI.setUnavailable(LibFunc::exp10);
- TLI.setUnavailable(LibFunc::exp10f);
- TLI.setUnavailable(LibFunc::exp10l);
+ TLI.setUnavailable(LibFunc_exp10);
+ TLI.setUnavailable(LibFunc_exp10f);
+ TLI.setUnavailable(LibFunc_exp10l);
}
// ffsl is available on at least Darwin, Mac OS X, iOS, FreeBSD, and
@@ -364,7 +409,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
case Triple::Linux:
break;
default:
- TLI.setUnavailable(LibFunc::ffsl);
+ TLI.setUnavailable(LibFunc_ffsl);
}
// ffsll is available on at least FreeBSD and Linux (GLIBC):
@@ -380,7 +425,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
case Triple::Linux:
break;
default:
- TLI.setUnavailable(LibFunc::ffsll);
+ TLI.setUnavailable(LibFunc_ffsll);
}
// The following functions are available on at least FreeBSD:
@@ -388,30 +433,30 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// http://svn.freebsd.org/base/head/lib/libc/string/flsl.c
// http://svn.freebsd.org/base/head/lib/libc/string/flsll.c
if (!T.isOSFreeBSD()) {
- TLI.setUnavailable(LibFunc::fls);
- TLI.setUnavailable(LibFunc::flsl);
- TLI.setUnavailable(LibFunc::flsll);
+ TLI.setUnavailable(LibFunc_fls);
+ TLI.setUnavailable(LibFunc_flsl);
+ TLI.setUnavailable(LibFunc_flsll);
}
// The following functions are available on at least Linux:
if (!T.isOSLinux()) {
- TLI.setUnavailable(LibFunc::dunder_strdup);
- TLI.setUnavailable(LibFunc::dunder_strtok_r);
- TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
- TLI.setUnavailable(LibFunc::dunder_isoc99_sscanf);
- TLI.setUnavailable(LibFunc::under_IO_getc);
- TLI.setUnavailable(LibFunc::under_IO_putc);
- TLI.setUnavailable(LibFunc::memalign);
- TLI.setUnavailable(LibFunc::fopen64);
- TLI.setUnavailable(LibFunc::fseeko64);
- TLI.setUnavailable(LibFunc::fstat64);
- TLI.setUnavailable(LibFunc::fstatvfs64);
- TLI.setUnavailable(LibFunc::ftello64);
- TLI.setUnavailable(LibFunc::lstat64);
- TLI.setUnavailable(LibFunc::open64);
- TLI.setUnavailable(LibFunc::stat64);
- TLI.setUnavailable(LibFunc::statvfs64);
- TLI.setUnavailable(LibFunc::tmpfile64);
+ TLI.setUnavailable(LibFunc_dunder_strdup);
+ TLI.setUnavailable(LibFunc_dunder_strtok_r);
+ TLI.setUnavailable(LibFunc_dunder_isoc99_scanf);
+ TLI.setUnavailable(LibFunc_dunder_isoc99_sscanf);
+ TLI.setUnavailable(LibFunc_under_IO_getc);
+ TLI.setUnavailable(LibFunc_under_IO_putc);
+ TLI.setUnavailable(LibFunc_memalign);
+ TLI.setUnavailable(LibFunc_fopen64);
+ TLI.setUnavailable(LibFunc_fseeko64);
+ TLI.setUnavailable(LibFunc_fstat64);
+ TLI.setUnavailable(LibFunc_fstatvfs64);
+ TLI.setUnavailable(LibFunc_ftello64);
+ TLI.setUnavailable(LibFunc_lstat64);
+ TLI.setUnavailable(LibFunc_open64);
+ TLI.setUnavailable(LibFunc_stat64);
+ TLI.setUnavailable(LibFunc_statvfs64);
+ TLI.setUnavailable(LibFunc_tmpfile64);
}
// As currently implemented in clang, NVPTX code has no standard library to
@@ -427,9 +472,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
// optimizations, so this situation should be fixed.
if (T.isNVPTX()) {
TLI.disableAllFunctions();
- TLI.setAvailable(LibFunc::nvvm_reflect);
+ TLI.setAvailable(LibFunc_nvvm_reflect);
} else {
- TLI.setUnavailable(LibFunc::nvvm_reflect);
+ TLI.setUnavailable(LibFunc_nvvm_reflect);
}
TLI.addVectorizableFunctionsFromVecLib(ClVectorLibrary);
@@ -496,13 +541,13 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
// Check for \01 prefix that is used to mangle __asm declarations and
// strip it if present.
- return GlobalValue::getRealLinkageName(funcName);
+ return GlobalValue::dropLLVMManglingEscape(funcName);
}
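
For illustration, a short sketch of the escape stripping, assuming a name that reached IR via an `__asm__("...")` rename:

    #include "llvm/IR/GlobalValue.h"
    using namespace llvm;

    // "\01bar" is how a declaration like `int foo() __asm__("bar");` is
    // named in IR; dropping the escape recovers the raw symbol so the
    // library-function lookup below can match it.
    StringRef stripExample() {
      return GlobalValue::dropLLVMManglingEscape("\01bar"); // == "bar"
    }
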
bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
- LibFunc::Func &F) const {
+ LibFunc &F) const {
StringRef const *Start = &StandardNames[0];
- StringRef const *End = &StandardNames[LibFunc::NumLibFuncs];
+ StringRef const *End = &StandardNames[NumLibFuncs];
funcName = sanitizeFunctionName(funcName);
if (funcName.empty())
@@ -513,14 +558,14 @@ bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName,
return LHS < RHS;
});
if (I != End && *I == funcName) {
- F = (LibFunc::Func)(I - Start);
+ F = (LibFunc)(I - Start);
return true;
}
return false;
}
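
Callers use this as a plain name-to-enum lookup; a sketch assuming a populated TargetLibraryInfoImpl:

    #include "llvm/Analysis/TargetLibraryInfo.h"
    using namespace llvm;

    bool lookupExample(const TargetLibraryInfoImpl &TLII) {
      LibFunc F;
      bool Known = TLII.getLibFunc("sqrtf", F); // binary search shown above
      return Known && F == LibFunc_sqrtf;
    }
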
bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
- LibFunc::Func F,
+ LibFunc F,
const DataLayout *DL) const {
LLVMContext &Ctx = FTy.getContext();
Type *PCharTy = Type::getInt8PtrTy(Ctx);
@@ -531,504 +576,706 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
unsigned NumParams = FTy.getNumParams();
switch (F) {
- case LibFunc::strlen:
+ case LibFunc_strlen:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
FTy.getReturnType()->isIntegerTy());
- case LibFunc::strchr:
- case LibFunc::strrchr:
+ case LibFunc_strchr:
+ case LibFunc_strrchr:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
FTy.getParamType(1)->isIntegerTy());
- case LibFunc::strtol:
- case LibFunc::strtod:
- case LibFunc::strtof:
- case LibFunc::strtoul:
- case LibFunc::strtoll:
- case LibFunc::strtold:
- case LibFunc::strtoull:
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
return ((NumParams == 2 || NumParams == 3) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strcat:
+ case LibFunc_strcat:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
FTy.getParamType(1) == FTy.getReturnType());
- case LibFunc::strncat:
+ case LibFunc_strncat:
return (NumParams == 3 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0) == FTy.getReturnType() &&
FTy.getParamType(1) == FTy.getReturnType() &&
FTy.getParamType(2)->isIntegerTy());
- case LibFunc::strcpy_chk:
- case LibFunc::stpcpy_chk:
+ case LibFunc_strcpy_chk:
+ case LibFunc_stpcpy_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::strcpy:
- case LibFunc::stpcpy:
+ case LibFunc_strcpy:
+ case LibFunc_stpcpy:
return (NumParams == 2 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getParamType(0) == PCharTy);
- case LibFunc::strncpy_chk:
- case LibFunc::stpncpy_chk:
+ case LibFunc_strncpy_chk:
+ case LibFunc_stpncpy_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::strncpy:
- case LibFunc::stpncpy:
+ case LibFunc_strncpy:
+ case LibFunc_stpncpy:
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getParamType(0) == PCharTy &&
FTy.getParamType(2)->isIntegerTy());
- case LibFunc::strxfrm:
+ case LibFunc_strxfrm:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strcmp:
+ case LibFunc_strcmp:
return (NumParams == 2 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(0) == FTy.getParamType(1));
- case LibFunc::strncmp:
+ case LibFunc_strncmp:
return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getParamType(2)->isIntegerTy());
- case LibFunc::strspn:
- case LibFunc::strcspn:
+ case LibFunc_strspn:
+ case LibFunc_strcspn:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(0) == FTy.getParamType(1) &&
FTy.getReturnType()->isIntegerTy());
- case LibFunc::strcoll:
- case LibFunc::strcasecmp:
- case LibFunc::strncasecmp:
+ case LibFunc_strcoll:
+ case LibFunc_strcasecmp:
+ case LibFunc_strncasecmp:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strstr:
+ case LibFunc_strstr:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::strpbrk:
+ case LibFunc_strpbrk:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0) == FTy.getParamType(1));
- case LibFunc::strtok:
- case LibFunc::strtok_r:
+ case LibFunc_strtok:
+ case LibFunc_strtok_r:
return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::scanf:
- case LibFunc::setbuf:
- case LibFunc::setvbuf:
+ case LibFunc_scanf:
+ case LibFunc_setbuf:
+ case LibFunc_setvbuf:
return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::strdup:
- case LibFunc::strndup:
+ case LibFunc_strdup:
+ case LibFunc_strndup:
return (NumParams >= 1 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy());
- case LibFunc::sscanf:
- case LibFunc::stat:
- case LibFunc::statvfs:
- case LibFunc::sprintf:
+ case LibFunc_sscanf:
+ case LibFunc_stat:
+ case LibFunc_statvfs:
+ case LibFunc_siprintf:
+ case LibFunc_sprintf:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::snprintf:
+ case LibFunc_snprintf:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::setitimer:
+ case LibFunc_setitimer:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::system:
+ case LibFunc_system:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::malloc:
+ case LibFunc_malloc:
return (NumParams == 1 && FTy.getReturnType()->isPointerTy());
- case LibFunc::memcmp:
- return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getParamType(1)->isPointerTy() &&
- FTy.getReturnType()->isIntegerTy(32));
+ case LibFunc_memcmp:
+ return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy());
- case LibFunc::memchr:
- case LibFunc::memrchr:
- return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
+ case LibFunc_memchr:
+ case LibFunc_memrchr:
+ return (NumParams == 3 && FTy.getReturnType()->isPointerTy() &&
+ FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(1)->isIntegerTy(32) &&
- FTy.getParamType(2)->isIntegerTy() &&
- FTy.getReturnType()->isPointerTy());
- case LibFunc::modf:
- case LibFunc::modff:
- case LibFunc::modfl:
+ IsSizeTTy(FTy.getParamType(2)));
+ case LibFunc_modf:
+ case LibFunc_modff:
+ case LibFunc_modfl:
return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::memcpy_chk:
- case LibFunc::memmove_chk:
+ case LibFunc_memcpy_chk:
+ case LibFunc_memmove_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::memcpy:
- case LibFunc::mempcpy:
- case LibFunc::memmove:
+ case LibFunc_memcpy:
+ case LibFunc_mempcpy:
+ case LibFunc_memmove:
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy() &&
IsSizeTTy(FTy.getParamType(2)));
- case LibFunc::memset_chk:
+ case LibFunc_memset_chk:
--NumParams;
if (!IsSizeTTy(FTy.getParamType(NumParams)))
return false;
LLVM_FALLTHROUGH;
- case LibFunc::memset:
+ case LibFunc_memset:
return (NumParams == 3 && FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy() &&
IsSizeTTy(FTy.getParamType(2)));
- case LibFunc::memccpy:
+ case LibFunc_memccpy:
return (NumParams >= 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::memalign:
+ case LibFunc_memalign:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::realloc:
- return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
- FTy.getReturnType()->isPointerTy());
- case LibFunc::read:
+ case LibFunc_realloc:
+ case LibFunc_reallocf:
+ return (NumParams == 2 && FTy.getReturnType() == PCharTy &&
+ FTy.getParamType(0) == FTy.getReturnType() &&
+ IsSizeTTy(FTy.getParamType(1)));
+ case LibFunc_read:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::rewind:
- case LibFunc::rmdir:
- case LibFunc::remove:
- case LibFunc::realpath:
+ case LibFunc_rewind:
+ case LibFunc_rmdir:
+ case LibFunc_remove:
+ case LibFunc_realpath:
return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::rename:
+ case LibFunc_rename:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::readlink:
+ case LibFunc_readlink:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::write:
+ case LibFunc_write:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::bcopy:
- case LibFunc::bcmp:
+ case LibFunc_bcopy:
+ case LibFunc_bcmp:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::bzero:
+ case LibFunc_bzero:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::calloc:
+ case LibFunc_calloc:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
- case LibFunc::atof:
- case LibFunc::atoi:
- case LibFunc::atol:
- case LibFunc::atoll:
- case LibFunc::ferror:
- case LibFunc::getenv:
- case LibFunc::getpwnam:
- case LibFunc::pclose:
- case LibFunc::perror:
- case LibFunc::printf:
- case LibFunc::puts:
- case LibFunc::uname:
- case LibFunc::under_IO_getc:
- case LibFunc::unlink:
- case LibFunc::unsetenv:
+ case LibFunc_atof:
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atoll:
+ case LibFunc_ferror:
+ case LibFunc_getenv:
+ case LibFunc_getpwnam:
+ case LibFunc_iprintf:
+ case LibFunc_pclose:
+ case LibFunc_perror:
+ case LibFunc_printf:
+ case LibFunc_puts:
+ case LibFunc_uname:
+ case LibFunc_under_IO_getc:
+ case LibFunc_unlink:
+ case LibFunc_unsetenv:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::chmod:
- case LibFunc::chown:
- case LibFunc::clearerr:
- case LibFunc::closedir:
- case LibFunc::ctermid:
- case LibFunc::fclose:
- case LibFunc::feof:
- case LibFunc::fflush:
- case LibFunc::fgetc:
- case LibFunc::fileno:
- case LibFunc::flockfile:
- case LibFunc::free:
- case LibFunc::fseek:
- case LibFunc::fseeko64:
- case LibFunc::fseeko:
- case LibFunc::fsetpos:
- case LibFunc::ftell:
- case LibFunc::ftello64:
- case LibFunc::ftello:
- case LibFunc::ftrylockfile:
- case LibFunc::funlockfile:
- case LibFunc::getc:
- case LibFunc::getc_unlocked:
- case LibFunc::getlogin_r:
- case LibFunc::mkdir:
- case LibFunc::mktime:
- case LibFunc::times:
+ case LibFunc_access:
+ case LibFunc_chmod:
+ case LibFunc_chown:
+ case LibFunc_clearerr:
+ case LibFunc_closedir:
+ case LibFunc_ctermid:
+ case LibFunc_fclose:
+ case LibFunc_feof:
+ case LibFunc_fflush:
+ case LibFunc_fgetc:
+ case LibFunc_fileno:
+ case LibFunc_flockfile:
+ case LibFunc_free:
+ case LibFunc_fseek:
+ case LibFunc_fseeko64:
+ case LibFunc_fseeko:
+ case LibFunc_fsetpos:
+ case LibFunc_ftell:
+ case LibFunc_ftello64:
+ case LibFunc_ftello:
+ case LibFunc_ftrylockfile:
+ case LibFunc_funlockfile:
+ case LibFunc_getc:
+ case LibFunc_getc_unlocked:
+ case LibFunc_getlogin_r:
+ case LibFunc_mkdir:
+ case LibFunc_mktime:
+ case LibFunc_times:
return (NumParams != 0 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::access:
- return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::fopen:
+ case LibFunc_fopen:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fdopen:
+ case LibFunc_fdopen:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fputc:
- case LibFunc::fstat:
- case LibFunc::frexp:
- case LibFunc::frexpf:
- case LibFunc::frexpl:
- case LibFunc::fstatvfs:
+ case LibFunc_fputc:
+ case LibFunc_fstat:
+ case LibFunc_frexp:
+ case LibFunc_frexpf:
+ case LibFunc_frexpl:
+ case LibFunc_fstatvfs:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::fgets:
+ case LibFunc_fgets:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::fread:
+ case LibFunc_fread:
return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(3)->isPointerTy());
- case LibFunc::fwrite:
+ case LibFunc_fwrite:
return (NumParams == 4 && FTy.getReturnType()->isIntegerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isIntegerTy() &&
FTy.getParamType(2)->isIntegerTy() &&
FTy.getParamType(3)->isPointerTy());
- case LibFunc::fputs:
+ case LibFunc_fputs:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fscanf:
- case LibFunc::fprintf:
- return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
+ case LibFunc_fscanf:
+ case LibFunc_fiprintf:
+ case LibFunc_fprintf:
+ return (NumParams >= 2 && FTy.getReturnType()->isIntegerTy() &&
+ FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fgetpos:
+ case LibFunc_fgetpos:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::gets:
- case LibFunc::getchar:
- case LibFunc::getitimer:
+ case LibFunc_getchar:
+ return (NumParams == 0 && FTy.getReturnType()->isIntegerTy());
+ case LibFunc_gets:
+ return (NumParams == 1 && FTy.getParamType(0) == PCharTy);
+ case LibFunc_getitimer:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::ungetc:
+ case LibFunc_ungetc:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::utime:
- case LibFunc::utimes:
+ case LibFunc_utime:
+ case LibFunc_utimes:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::putc:
+ case LibFunc_putc:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::pread:
- case LibFunc::pwrite:
+ case LibFunc_pread:
+ case LibFunc_pwrite:
return (NumParams == 4 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::popen:
+ case LibFunc_popen:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::vscanf:
+ case LibFunc_vscanf:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::vsscanf:
+ case LibFunc_vsscanf:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::vfscanf:
+ case LibFunc_vfscanf:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::valloc:
+ case LibFunc_valloc:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::vprintf:
+ case LibFunc_vprintf:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::vfprintf:
- case LibFunc::vsprintf:
+ case LibFunc_vfprintf:
+ case LibFunc_vsprintf:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::vsnprintf:
+ case LibFunc_vsnprintf:
return (NumParams == 4 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::open:
+ case LibFunc_open:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::opendir:
+ case LibFunc_opendir:
return (NumParams == 1 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy());
- case LibFunc::tmpfile:
+ case LibFunc_tmpfile:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::htonl:
- case LibFunc::htons:
- case LibFunc::ntohl:
- case LibFunc::ntohs:
- case LibFunc::lstat:
+ case LibFunc_htonl:
+ case LibFunc_ntohl:
+ return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getReturnType() == FTy.getParamType(0));
+ case LibFunc_htons:
+ case LibFunc_ntohs:
+ return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(16) &&
+ FTy.getReturnType() == FTy.getParamType(0));
+ case LibFunc_lstat:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::lchown:
+ case LibFunc_lchown:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::qsort:
+ case LibFunc_qsort:
return (NumParams == 4 && FTy.getParamType(3)->isPointerTy());
- case LibFunc::dunder_strdup:
- case LibFunc::dunder_strndup:
+ case LibFunc_dunder_strdup:
+ case LibFunc_dunder_strndup:
return (NumParams >= 1 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy());
- case LibFunc::dunder_strtok_r:
+ case LibFunc_dunder_strtok_r:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::under_IO_putc:
+ case LibFunc_under_IO_putc:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::dunder_isoc99_scanf:
+ case LibFunc_dunder_isoc99_scanf:
return (NumParams >= 1 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::stat64:
- case LibFunc::lstat64:
- case LibFunc::statvfs64:
+ case LibFunc_stat64:
+ case LibFunc_lstat64:
+ case LibFunc_statvfs64:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::dunder_isoc99_sscanf:
+ case LibFunc_dunder_isoc99_sscanf:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::fopen64:
+ case LibFunc_fopen64:
return (NumParams == 2 && FTy.getReturnType()->isPointerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::tmpfile64:
+ case LibFunc_tmpfile64:
return (FTy.getReturnType()->isPointerTy());
- case LibFunc::fstat64:
- case LibFunc::fstatvfs64:
+ case LibFunc_fstat64:
+ case LibFunc_fstatvfs64:
return (NumParams == 2 && FTy.getParamType(1)->isPointerTy());
- case LibFunc::open64:
+ case LibFunc_open64:
return (NumParams >= 2 && FTy.getParamType(0)->isPointerTy());
- case LibFunc::gettimeofday:
+ case LibFunc_gettimeofday:
return (NumParams == 2 && FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy());
- case LibFunc::Znwj: // new(unsigned int);
- case LibFunc::Znwm: // new(unsigned long);
- case LibFunc::Znaj: // new[](unsigned int);
- case LibFunc::Znam: // new[](unsigned long);
- case LibFunc::msvc_new_int: // new(unsigned int);
- case LibFunc::msvc_new_longlong: // new(unsigned long long);
- case LibFunc::msvc_new_array_int: // new[](unsigned int);
- case LibFunc::msvc_new_array_longlong: // new[](unsigned long long);
- return (NumParams == 1);
-
- case LibFunc::memset_pattern16:
+ // new(unsigned int);
+ case LibFunc_Znwj:
+ // new(unsigned long);
+ case LibFunc_Znwm:
+ // new[](unsigned int);
+ case LibFunc_Znaj:
+ // new[](unsigned long);
+ case LibFunc_Znam:
+ // new(unsigned int);
+ case LibFunc_msvc_new_int:
+ // new(unsigned long long);
+ case LibFunc_msvc_new_longlong:
+ // new[](unsigned int);
+ case LibFunc_msvc_new_array_int:
+ // new[](unsigned long long);
+ case LibFunc_msvc_new_array_longlong:
+ return (NumParams == 1 && FTy.getReturnType()->isPointerTy());
+
+ // new(unsigned int, nothrow);
+ case LibFunc_ZnwjRKSt9nothrow_t:
+ // new(unsigned long, nothrow);
+ case LibFunc_ZnwmRKSt9nothrow_t:
+ // new[](unsigned int, nothrow);
+ case LibFunc_ZnajRKSt9nothrow_t:
+ // new[](unsigned long, nothrow);
+ case LibFunc_ZnamRKSt9nothrow_t:
+ // new(unsigned int, nothrow);
+ case LibFunc_msvc_new_int_nothrow:
+ // new(unsigned long long, nothrow);
+ case LibFunc_msvc_new_longlong_nothrow:
+ // new[](unsigned int, nothrow);
+ case LibFunc_msvc_new_array_int_nothrow:
+ // new[](unsigned long long, nothrow);
+ case LibFunc_msvc_new_array_longlong_nothrow:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
+
+ // void operator delete[](void*);
+ case LibFunc_ZdaPv:
+ // void operator delete(void*);
+ case LibFunc_ZdlPv:
+ // void operator delete[](void*);
+ case LibFunc_msvc_delete_array_ptr32:
+ // void operator delete[](void*);
+ case LibFunc_msvc_delete_array_ptr64:
+ // void operator delete(void*);
+ case LibFunc_msvc_delete_ptr32:
+ // void operator delete(void*);
+ case LibFunc_msvc_delete_ptr64:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
+
+ // void operator delete[](void*, nothrow);
+ case LibFunc_ZdaPvRKSt9nothrow_t:
+ // void operator delete[](void*, unsigned int);
+ case LibFunc_ZdaPvj:
+ // void operator delete[](void*, unsigned long);
+ case LibFunc_ZdaPvm:
+ // void operator delete(void*, nothrow);
+ case LibFunc_ZdlPvRKSt9nothrow_t:
+ // void operator delete(void*, unsigned int);
+ case LibFunc_ZdlPvj:
+ // void operator delete(void*, unsigned long);
+ case LibFunc_ZdlPvm:
+ // void operator delete[](void*, unsigned int);
+ case LibFunc_msvc_delete_array_ptr32_int:
+ // void operator delete[](void*, nothrow);
+ case LibFunc_msvc_delete_array_ptr32_nothrow:
+ // void operator delete[](void*, unsigned long long);
+ case LibFunc_msvc_delete_array_ptr64_longlong:
+ // void operator delete[](void*, nothrow);
+ case LibFunc_msvc_delete_array_ptr64_nothrow:
+ // void operator delete(void*, unsigned int);
+ case LibFunc_msvc_delete_ptr32_int:
+ // void operator delete(void*, nothrow);
+ case LibFunc_msvc_delete_ptr32_nothrow:
+ // void operator delete(void*, unsigned long long);
+ case LibFunc_msvc_delete_ptr64_longlong:
+ // void operator delete(void*, nothrow);
+ case LibFunc_msvc_delete_ptr64_nothrow:
+ return (NumParams == 2 && FTy.getParamType(0)->isPointerTy());
+
+ case LibFunc_memset_pattern16:
return (!FTy.isVarArg() && NumParams == 3 &&
- isa<PointerType>(FTy.getParamType(0)) &&
- isa<PointerType>(FTy.getParamType(1)) &&
- isa<IntegerType>(FTy.getParamType(2)));
-
- // int __nvvm_reflect(const char *);
- case LibFunc::nvvm_reflect:
- return (NumParams == 1 && isa<PointerType>(FTy.getParamType(0)));
-
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::sinl:
- case LibFunc::cos:
- case LibFunc::cosf:
- case LibFunc::cosl:
- case LibFunc::tan:
- case LibFunc::tanf:
- case LibFunc::tanl:
- case LibFunc::exp:
- case LibFunc::expf:
- case LibFunc::expl:
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
- case LibFunc::log:
- case LibFunc::logf:
- case LibFunc::logl:
- case LibFunc::log10:
- case LibFunc::log10f:
- case LibFunc::log10l:
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log2l:
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1)->isPointerTy() &&
+ FTy.getParamType(2)->isIntegerTy());
+
+ case LibFunc_cxa_guard_abort:
+ case LibFunc_cxa_guard_acquire:
+ case LibFunc_cxa_guard_release:
+ case LibFunc_nvvm_reflect:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy());
+
+ case LibFunc_sincospi_stret:
+ case LibFunc_sincospif_stret:
+ return (NumParams == 1 && FTy.getParamType(0)->isFloatingPointTy());
+
+ case LibFunc_acos:
+ case LibFunc_acos_finite:
+ case LibFunc_acosf:
+ case LibFunc_acosf_finite:
+ case LibFunc_acosh:
+ case LibFunc_acosh_finite:
+ case LibFunc_acoshf:
+ case LibFunc_acoshf_finite:
+ case LibFunc_acoshl:
+ case LibFunc_acoshl_finite:
+ case LibFunc_acosl:
+ case LibFunc_acosl_finite:
+ case LibFunc_asin:
+ case LibFunc_asin_finite:
+ case LibFunc_asinf:
+ case LibFunc_asinf_finite:
+ case LibFunc_asinh:
+ case LibFunc_asinhf:
+ case LibFunc_asinhl:
+ case LibFunc_asinl:
+ case LibFunc_asinl_finite:
+ case LibFunc_atan:
+ case LibFunc_atanf:
+ case LibFunc_atanh:
+ case LibFunc_atanh_finite:
+ case LibFunc_atanhf:
+ case LibFunc_atanhf_finite:
+ case LibFunc_atanhl:
+ case LibFunc_atanhl_finite:
+ case LibFunc_atanl:
+ case LibFunc_cbrt:
+ case LibFunc_cbrtf:
+ case LibFunc_cbrtl:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosh:
+ case LibFunc_cosh_finite:
+ case LibFunc_coshf:
+ case LibFunc_coshf_finite:
+ case LibFunc_coshl:
+ case LibFunc_coshl_finite:
+ case LibFunc_cosl:
+ case LibFunc_exp10:
+ case LibFunc_exp10_finite:
+ case LibFunc_exp10f:
+ case LibFunc_exp10f_finite:
+ case LibFunc_exp10l:
+ case LibFunc_exp10l_finite:
+ case LibFunc_exp2:
+ case LibFunc_exp2_finite:
+ case LibFunc_exp2f:
+ case LibFunc_exp2f_finite:
+ case LibFunc_exp2l:
+ case LibFunc_exp2l_finite:
+ case LibFunc_exp:
+ case LibFunc_exp_finite:
+ case LibFunc_expf:
+ case LibFunc_expf_finite:
+ case LibFunc_expl:
+ case LibFunc_expl_finite:
+ case LibFunc_expm1:
+ case LibFunc_expm1f:
+ case LibFunc_expm1l:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
+ case LibFunc_log10:
+ case LibFunc_log10_finite:
+ case LibFunc_log10f:
+ case LibFunc_log10f_finite:
+ case LibFunc_log10l:
+ case LibFunc_log10l_finite:
+ case LibFunc_log1p:
+ case LibFunc_log1pf:
+ case LibFunc_log1pl:
+ case LibFunc_log2:
+ case LibFunc_log2_finite:
+ case LibFunc_log2f:
+ case LibFunc_log2f_finite:
+ case LibFunc_log2l:
+ case LibFunc_log2l_finite:
+ case LibFunc_log:
+ case LibFunc_log_finite:
+ case LibFunc_logb:
+ case LibFunc_logbf:
+ case LibFunc_logbl:
+ case LibFunc_logf:
+ case LibFunc_logf_finite:
+ case LibFunc_logl:
+ case LibFunc_logl_finite:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinh:
+ case LibFunc_sinh_finite:
+ case LibFunc_sinhf:
+ case LibFunc_sinhf_finite:
+ case LibFunc_sinhl:
+ case LibFunc_sinhl_finite:
+ case LibFunc_sinl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrt_finite:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtf_finite:
+ case LibFunc_sqrtl:
+ case LibFunc_sqrtl_finite:
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanh:
+ case LibFunc_tanhf:
+ case LibFunc_tanhl:
+ case LibFunc_tanl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
return (NumParams == 1 && FTy.getReturnType()->isFloatingPointTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
- case LibFunc::copysign:
- case LibFunc::copysignf:
- case LibFunc::copysignl:
- case LibFunc::pow:
- case LibFunc::powf:
- case LibFunc::powl:
+ case LibFunc_atan2:
+ case LibFunc_atan2_finite:
+ case LibFunc_atan2f:
+ case LibFunc_atan2f_finite:
+ case LibFunc_atan2l:
+ case LibFunc_atan2l_finite:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
+ case LibFunc_fmod:
+ case LibFunc_fmodf:
+ case LibFunc_fmodl:
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ case LibFunc_copysignl:
+ case LibFunc_pow:
+ case LibFunc_pow_finite:
+ case LibFunc_powf:
+ case LibFunc_powf_finite:
+ case LibFunc_powl:
+ case LibFunc_powl_finite:
return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
FTy.getReturnType() == FTy.getParamType(0) &&
FTy.getReturnType() == FTy.getParamType(1));
- case LibFunc::ffs:
- case LibFunc::ffsl:
- case LibFunc::ffsll:
- case LibFunc::fls:
- case LibFunc::flsl:
- case LibFunc::flsll:
+ case LibFunc_ldexp:
+ case LibFunc_ldexpf:
+ case LibFunc_ldexpl:
+ return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() &&
+ FTy.getReturnType() == FTy.getParamType(0) &&
+ FTy.getParamType(1)->isIntegerTy(32));
+
+ case LibFunc_ffs:
+ case LibFunc_ffsl:
+ case LibFunc_ffsll:
+ case LibFunc_fls:
+ case LibFunc_flsl:
+ case LibFunc_flsll:
return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getParamType(0)->isIntegerTy());
- case LibFunc::isdigit:
- case LibFunc::isascii:
- case LibFunc::toascii:
+ case LibFunc_isdigit:
+ case LibFunc_isascii:
+ case LibFunc_toascii:
+ case LibFunc_putchar:
return (NumParams == 1 && FTy.getReturnType()->isIntegerTy(32) &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::abs:
- case LibFunc::labs:
- case LibFunc::llabs:
+ case LibFunc_abs:
+ case LibFunc_labs:
+ case LibFunc_llabs:
return (NumParams == 1 && FTy.getReturnType()->isIntegerTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::cxa_atexit:
+ case LibFunc_cxa_atexit:
return (NumParams == 3 && FTy.getReturnType()->isIntegerTy() &&
FTy.getParamType(0)->isPointerTy() &&
FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy());
- case LibFunc::sinpi:
- case LibFunc::cospi:
+ case LibFunc_sinpi:
+ case LibFunc_cospi:
return (NumParams == 1 && FTy.getReturnType()->isDoubleTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- case LibFunc::sinpif:
- case LibFunc::cospif:
+ case LibFunc_sinpif:
+ case LibFunc_cospif:
return (NumParams == 1 && FTy.getReturnType()->isFloatTy() &&
FTy.getReturnType() == FTy.getParamType(0));
- default:
- // Assume the other functions are correct.
- // FIXME: It'd be really nice to cover them all.
- return true;
+ case LibFunc_strnlen:
+ return (NumParams == 2 && FTy.getReturnType() == FTy.getParamType(1) &&
+ FTy.getParamType(0) == PCharTy &&
+ FTy.getParamType(1) == SizeTTy);
+
+ case LibFunc_posix_memalign:
+ return (NumParams == 3 && FTy.getReturnType()->isIntegerTy(32) &&
+ FTy.getParamType(0)->isPointerTy() &&
+ FTy.getParamType(1) == SizeTTy && FTy.getParamType(2) == SizeTTy);
+
+ case LibFunc_wcslen:
+ return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
+ FTy.getReturnType()->isIntegerTy());
+
+ case LibFunc::NumLibFuncs:
+ break;
}
+
+ llvm_unreachable("Invalid libfunc");
}
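
The Function-based overload below pairs this prototype check with the name lookup; a sketch with a hypothetical module M — a declaration such as `void strlen(int)` is rejected even though the name matches:

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    bool isRealStrlen(const Module &M, const TargetLibraryInfoImpl &TLII) {
      LibFunc F;
      const Function *Fn = M.getFunction("strlen");
      return Fn && TLII.getLibFunc(*Fn, F) && F == LibFunc_strlen;
    }
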
bool TargetLibraryInfoImpl::getLibFunc(const Function &FDecl,
- LibFunc::Func &F) const {
+ LibFunc &F) const {
const DataLayout *DL =
FDecl.getParent() ? &FDecl.getParent()->getDataLayout() : nullptr;
return getLibFunc(FDecl.getName(), F) &&
@@ -1134,6 +1381,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"powf", "__svml_powf8", 8},
{"powf", "__svml_powf16", 16},
+ { "__pow_finite", "__svml_pow2", 2 },
+ { "__pow_finite", "__svml_pow4", 4 },
+ { "__pow_finite", "__svml_pow8", 8 },
+
+ { "__powf_finite", "__svml_powf4", 4 },
+ { "__powf_finite", "__svml_powf8", 8 },
+ { "__powf_finite", "__svml_powf16", 16 },
+
{"llvm.pow.f64", "__svml_pow2", 2},
{"llvm.pow.f64", "__svml_pow4", 4},
{"llvm.pow.f64", "__svml_pow8", 8},
@@ -1150,6 +1405,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"expf", "__svml_expf8", 8},
{"expf", "__svml_expf16", 16},
+ { "__exp_finite", "__svml_exp2", 2 },
+ { "__exp_finite", "__svml_exp4", 4 },
+ { "__exp_finite", "__svml_exp8", 8 },
+
+ { "__expf_finite", "__svml_expf4", 4 },
+ { "__expf_finite", "__svml_expf8", 8 },
+ { "__expf_finite", "__svml_expf16", 16 },
+
{"llvm.exp.f64", "__svml_exp2", 2},
{"llvm.exp.f64", "__svml_exp4", 4},
{"llvm.exp.f64", "__svml_exp8", 8},
@@ -1166,6 +1429,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
{"logf", "__svml_logf8", 8},
{"logf", "__svml_logf16", 16},
+ { "__log_finite", "__svml_log2", 2 },
+ { "__log_finite", "__svml_log4", 4 },
+ { "__log_finite", "__svml_log8", 8 },
+
+ { "__logf_finite", "__svml_logf4", 4 },
+ { "__logf_finite", "__svml_logf8", 8 },
+ { "__logf_finite", "__svml_logf16", 16 },
+
{"llvm.log.f64", "__svml_log2", 2},
{"llvm.log.f64", "__svml_log4", 4},
{"llvm.log.f64", "__svml_log8", 8},
@@ -1248,6 +1519,21 @@ TargetLibraryInfoImpl &TargetLibraryAnalysis::lookupInfoImpl(const Triple &T) {
return *Impl;
}
+unsigned TargetLibraryInfoImpl::getTargetWCharSize(const Triple &T) {
+ // See also clang/lib/Basic/Targets.cpp.
+ if (T.isPS4() || T.isOSWindows() || T.isArch16Bit())
+ return 2;
+ if (T.getArch() == Triple::xcore)
+ return 1;
+ return 4;
+}
+
+unsigned TargetLibraryInfoImpl::getWCharSize(const Module &M) const {
+ if (auto *ShortWChar = cast_or_null<ConstantAsMetadata>(
+ M.getModuleFlag("wchar_size")))
+ return cast<ConstantInt>(ShortWChar->getValue())->getZExtValue();
+ return getTargetWCharSize(Triple(M.getTargetTriple()));
+}
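
The `wchar_size` flag read here is the one frontends attach to the module; a hedged sketch of producing it (the value 4 is only an example, matching most ELF targets):

    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Emits: !llvm.module.flags = !{!0}
    //        !0 = !{i32 1, !"wchar_size", i32 4}
    void tagWCharSize(Module &M) {
      M.addModuleFlag(Module::Error, "wchar_size", 4);
    }
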
TargetLibraryInfoWrapperPass::TargetLibraryInfoWrapperPass()
: ImmutablePass(ID), TLIImpl(), TLI(TLIImpl) {
diff --git a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
index 5c0d1aa..25813c6 100644
--- a/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -16,6 +16,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include <utility>
@@ -23,6 +24,11 @@ using namespace llvm;
#define DEBUG_TYPE "tti"
+static cl::opt<bool> UseWideMemcpyLoopLowering(
+ "use-wide-memcpy-loop-lowering", cl::init(false),
+ cl::desc("Enables the new wide memcpy loop lowering in Transforms/Utils."),
+ cl::Hidden);
+
namespace {
/// \brief No-op implementation of the TTI interface using the utility base
/// classes.
@@ -76,6 +82,11 @@ int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
}
+int TargetTransformInfo::getExtCost(const Instruction *I,
+ const Value *Src) const {
+ return TTIImpl->getExtCost(I, Src);
+}
+
int TargetTransformInfo::getIntrinsicCost(
Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments) const {
int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments);
@@ -83,8 +94,15 @@ int TargetTransformInfo::getIntrinsicCost(
return Cost;
}
-int TargetTransformInfo::getUserCost(const User *U) const {
- int Cost = TTIImpl->getUserCost(U);
+unsigned
+TargetTransformInfo::getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
+ unsigned &JTSize) const {
+ return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize);
+}
+
+int TargetTransformInfo::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands) const {
+ int Cost = TTIImpl->getUserCost(U, Operands);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -97,13 +115,21 @@ bool TargetTransformInfo::isSourceOfDivergence(const Value *V) const {
return TTIImpl->isSourceOfDivergence(V);
}
+bool llvm::TargetTransformInfo::isAlwaysUniform(const Value *V) const {
+ return TTIImpl->isAlwaysUniform(V);
+}
+
+unsigned TargetTransformInfo::getFlatAddressSpace() const {
+ return TTIImpl->getFlatAddressSpace();
+}
+
bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
return TTIImpl->isLoweredToCall(F);
}
void TargetTransformInfo::getUnrollingPreferences(
- Loop *L, UnrollingPreferences &UP) const {
- return TTIImpl->getUnrollingPreferences(L, UP);
+ Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
+ return TTIImpl->getUnrollingPreferences(L, SE, UP);
}
bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
@@ -123,6 +149,10 @@ bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
Scale, AddrSpace);
}
+bool TargetTransformInfo::isLSRCostLess(LSRCost &C1, LSRCost &C2) const {
+ return TTIImpl->isLSRCostLess(C1, C2);
+}
+
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
return TTIImpl->isLegalMaskedStore(DataType);
}
@@ -139,6 +169,10 @@ bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const {
return TTIImpl->isLegalMaskedGather(DataType);
}
+bool TargetTransformInfo::prefersVectorizedAddressing() const {
+ return TTIImpl->prefersVectorizedAddressing();
+}
+
int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
int64_t BaseOffset,
bool HasBaseReg,
@@ -182,10 +216,29 @@ bool TargetTransformInfo::shouldBuildLookupTablesForConstant(Constant *C) const
return TTIImpl->shouldBuildLookupTablesForConstant(C);
}
+unsigned TargetTransformInfo::
+getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const {
+ return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
+}
+
+unsigned TargetTransformInfo::
+getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+ unsigned VF) const {
+ return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
+}
+
+bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
+ return TTIImpl->supportsEfficientVectorElementLoadStore();
+}
+
bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
}
+bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const {
+ return TTIImpl->expandMemCmp(I, MaxLoadSize);
+}
+
bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
return TTIImpl->enableInterleavedAccessVectorization();
}
@@ -254,6 +307,16 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
return TTIImpl->getRegisterBitWidth(Vector);
}
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+ return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
+bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
+ const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
+ return TTIImpl->shouldConsiderAddressTypePromotion(
+ I, AllowPromotionWithoutCommonHeader);
+}
+
unsigned TargetTransformInfo::getCacheLineSize() const {
return TTIImpl->getCacheLineSize();
}
@@ -293,8 +356,10 @@ int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index,
}
int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const {
- int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src);
+ Type *Src, const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
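
Cost queries can now carry the originating instruction so targets may refine the estimate; a sketch assuming a CastInst *CI is in hand:

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    // Passing CI is optional; the assert only requires that, when present,
    // it matches the opcode being costed.
    int castCost(const TargetTransformInfo &TTI, const CastInst *CI) {
      return TTI.getCastInstrCost(CI->getOpcode(), CI->getDestTy(),
                                  CI->getSrcTy(), CI);
    }
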
@@ -314,8 +379,10 @@ int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
}
int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const {
- int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ Type *CondTy, const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -329,8 +396,11 @@ int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
- unsigned AddressSpace) const {
- int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+ unsigned AddressSpace,
+ const Instruction *I) const {
+ assert ((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -363,17 +433,17 @@ int TargetTransformInfo::getInterleavedMemoryOpCost(
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys,
- FastMathFlags FMF) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+ ScalarizationCostPassed);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args,
- FastMathFlags FMF) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -414,11 +484,34 @@ bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
return TTIImpl->getTgtMemIntrinsic(Inst, Info);
}
+unsigned TargetTransformInfo::getAtomicMemIntrinsicMaxElementSize() const {
+ return TTIImpl->getAtomicMemIntrinsicMaxElementSize();
+}
+
Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
IntrinsicInst *Inst, Type *ExpectedType) const {
return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
}
+Type *TargetTransformInfo::getMemcpyLoopLoweringType(LLVMContext &Context,
+ Value *Length,
+ unsigned SrcAlign,
+ unsigned DestAlign) const {
+ return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAlign,
+ DestAlign);
+}
+
+void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
+ SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+ unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
+ TTIImpl->getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
+ SrcAlign, DestAlign);
+}
+
+bool TargetTransformInfo::useWideIRMemcpyLoopLowering() const {
+ return UseWideMemcpyLoopLowering;
+}
+
bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
return TTIImpl->areInlineCompatible(Caller, Callee);
@@ -462,6 +555,15 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
+bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
+ Type *Ty, ReductionFlags Flags) const {
+ return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
+}
+
+bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
+ return TTIImpl->shouldExpandReduction(II);
+}
+
TargetTransformInfo::Concept::~Concept() {}
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index e920c4c..86c528d 100644
--- a/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/contrib/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -23,10 +23,10 @@
//
// The scalar TBAA metadata format is very simple. TBAA MDNodes have up to
// three fields, e.g.:
-// !0 = metadata !{ metadata !"an example type tree" }
-// !1 = metadata !{ metadata !"int", metadata !0 }
-// !2 = metadata !{ metadata !"float", metadata !0 }
-// !3 = metadata !{ metadata !"const float", metadata !2, i64 1 }
+// !0 = !{ !"an example type tree" }
+// !1 = !{ !"int", !0 }
+// !2 = !{ !"float", !0 }
+// !3 = !{ !"const float", !2, i64 1 }
//
// The first field is an identity field. It can be any value, usually
// an MDString, which uniquely identifies the type. The most important
@@ -58,7 +58,7 @@
//
// The struct type node has a name and a list of pairs, one pair for each member
// of the struct. The first element of each pair is a type node (a struct type
-// node or a sclar type node), specifying the type of the member, the second
+// node or a scalar type node), specifying the type of the member, the second
// element of each pair is the offset of the member.
//
// Given an example
@@ -74,13 +74,13 @@
// instruction. The base type is !4 (struct B), the access type is !2 (scalar
// type short) and the offset is 4.
//
-// !0 = metadata !{metadata !"Simple C/C++ TBAA"}
-// !1 = metadata !{metadata !"omnipotent char", metadata !0} // Scalar type node
-// !2 = metadata !{metadata !"short", metadata !1} // Scalar type node
-// !3 = metadata !{metadata !"A", metadata !2, i64 0} // Struct type node
-// !4 = metadata !{metadata !"B", metadata !2, i64 0, metadata !3, i64 4}
+// !0 = !{!"Simple C/C++ TBAA"}
+// !1 = !{!"omnipotent char", !0} // Scalar type node
+// !2 = !{!"short", !1} // Scalar type node
+// !3 = !{!"A", !2, i64 0} // Struct type node
+// !4 = !{!"B", !2, i64 0, !3, i64 4}
// // Struct type node
-// !5 = metadata !{metadata !4, metadata !2, i64 4} // Path tag node
+// !5 = !{!4, !2, i64 4} // Path tag node
//
// The struct type nodes and the scalar type nodes form a type DAG.
// Root (!0)
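To make the scalar rule concrete, here is a minimal sketch of the ancestor
test this format enables (a hypothetical helper, not code from this file):
two scalar accesses may alias only if one type node reaches the other by
walking parent links (operand 1) toward the root.

    static bool reaches(const MDNode *From, const MDNode *To) {
      while (From) {
        if (From == To)
          return true;
        // A scalar node's parent, if any, is its second operand.
        From = From->getNumOperands() > 1
                   ? dyn_cast<MDNode>(From->getOperand(1))
                   : nullptr;
      }
      return false;
    }

    static bool mayAliasScalar(const MDNode *A, const MDNode *B) {
      return reaches(A, B) || reaches(B, A);
    }

With the first example above, mayAliasScalar(!1 /*int*/, !2 /*float*/) is
false: neither node reaches the other; they share only the root !0.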
diff --git a/contrib/llvm/lib/Analysis/TypeMetadataUtils.cpp b/contrib/llvm/lib/Analysis/TypeMetadataUtils.cpp
index f567541..6871e48 100644
--- a/contrib/llvm/lib/Analysis/TypeMetadataUtils.cpp
+++ b/contrib/llvm/lib/Analysis/TypeMetadataUtils.cpp
@@ -39,7 +39,7 @@ findCallsAtConstantOffset(SmallVectorImpl<DevirtCallSite> &DevirtCalls,
// Search for virtual calls that load from VPtr and add them to DevirtCalls.
static void
-findLoadCallsAtConstantOffset(Module *M,
+findLoadCallsAtConstantOffset(const Module *M,
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
Value *VPtr, int64_t Offset) {
for (const Use &U : VPtr->uses()) {
@@ -62,10 +62,10 @@ findLoadCallsAtConstantOffset(Module *M,
void llvm::findDevirtualizableCallsForTypeTest(
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
- SmallVectorImpl<CallInst *> &Assumes, CallInst *CI) {
+ SmallVectorImpl<CallInst *> &Assumes, const CallInst *CI) {
assert(CI->getCalledFunction()->getIntrinsicID() == Intrinsic::type_test);
- Module *M = CI->getParent()->getParent()->getParent();
+ const Module *M = CI->getParent()->getParent()->getParent();
// Find llvm.assume intrinsics for this llvm.type.test call.
for (const Use &CIU : CI->uses()) {
@@ -86,7 +86,8 @@ void llvm::findDevirtualizableCallsForTypeTest(
void llvm::findDevirtualizableCallsForTypeCheckedLoad(
SmallVectorImpl<DevirtCallSite> &DevirtCalls,
SmallVectorImpl<Instruction *> &LoadedPtrs,
- SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses, CallInst *CI) {
+ SmallVectorImpl<Instruction *> &Preds, bool &HasNonCallUses,
+ const CallInst *CI) {
assert(CI->getCalledFunction()->getIntrinsicID() ==
Intrinsic::type_checked_load);
@@ -96,7 +97,7 @@ void llvm::findDevirtualizableCallsForTypeCheckedLoad(
return;
}
- for (Use &U : CI->uses()) {
+ for (const Use &U : CI->uses()) {
auto CIU = U.getUser();
if (auto EVI = dyn_cast<ExtractValueInst>(CIU)) {
if (EVI->getNumIndices() == 1 && EVI->getIndices()[0] == 0) {
diff --git a/contrib/llvm/lib/Analysis/ValueTracking.cpp b/contrib/llvm/lib/Analysis/ValueTracking.cpp
index be62858..cdfe74d 100644
--- a/contrib/llvm/lib/Analysis/ValueTracking.cpp
+++ b/contrib/llvm/lib/Analysis/ValueTracking.cpp
@@ -17,14 +17,16 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
@@ -37,6 +39,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <array>
@@ -57,8 +60,8 @@ static cl::opt<bool>
DontImproveNonNegativePhiBits("dont-improve-non-negative-phi-bits",
cl::Hidden, cl::init(true));
-/// Returns the bitwidth of the given scalar or pointer type (if unknown returns
-/// 0). For vector types, returns the element type's bitwidth.
+/// Returns the bitwidth of the given scalar or pointer type. For vector types,
+/// returns the element type's bitwidth.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
if (unsigned BitWidth = Ty->getScalarSizeInBits())
return BitWidth;
@@ -76,6 +79,9 @@ struct Query {
AssumptionCache *AC;
const Instruction *CxtI;
const DominatorTree *DT;
+ // Unlike the other analyses, this may be a nullptr because not all clients
+ // provide it currently.
+ OptimizationRemarkEmitter *ORE;
/// Set of assumptions that should be excluded from further queries.
/// This is because of the potential for mutual recursion to cause
@@ -83,18 +89,18 @@ struct Query {
/// classic case of this is assume(x = y), which will attempt to determine
/// bits in x from bits in y, which will attempt to determine bits in y from
/// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
- /// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and
- /// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so
- /// on.
+ /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo
+ /// (all of which can call computeKnownBits), and so on.
std::array<const Value *, MaxDepth> Excluded;
unsigned NumExcluded;
Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT)
- : DL(DL), AC(AC), CxtI(CxtI), DT(DT), NumExcluded(0) {}
+ const DominatorTree *DT, OptimizationRemarkEmitter *ORE = nullptr)
+ : DL(DL), AC(AC), CxtI(CxtI), DT(DT), ORE(ORE), NumExcluded(0) {}
Query(const Query &Q, const Value *NewExcl)
- : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), NumExcluded(Q.NumExcluded) {
+ : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), ORE(Q.ORE),
+ NumExcluded(Q.NumExcluded) {
Excluded = Q.Excluded;
Excluded[NumExcluded++] = NewExcl;
assert(NumExcluded <= Excluded.size());
@@ -125,15 +131,28 @@ static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) {
return nullptr;
}
-static void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
+static void computeKnownBits(const Value *V, KnownBits &Known,
unsigned Depth, const Query &Q);
-void llvm::computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
+void llvm::computeKnownBits(const Value *V, KnownBits &Known,
const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT) {
- ::computeKnownBits(V, KnownZero, KnownOne, Depth,
- Query(DL, AC, safeCxtI(V, CxtI), DT));
+ const DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
+ ::computeKnownBits(V, Known, Depth,
+ Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
+}
+
+static KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Query &Q);
+
+KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
+ return ::computeKnownBits(V, Depth,
+ Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
}
bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
@@ -145,22 +164,24 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
assert(LHS->getType()->isIntOrIntVectorTy() &&
"LHS and RHS should be integers");
IntegerType *IT = cast<IntegerType>(LHS->getType()->getScalarType());
- APInt LHSKnownZero(IT->getBitWidth(), 0), LHSKnownOne(IT->getBitWidth(), 0);
- APInt RHSKnownZero(IT->getBitWidth(), 0), RHSKnownOne(IT->getBitWidth(), 0);
- computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, DL, 0, AC, CxtI, DT);
- computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, DL, 0, AC, CxtI, DT);
- return (LHSKnownZero | RHSKnownZero).isAllOnesValue();
+ KnownBits LHSKnown(IT->getBitWidth());
+ KnownBits RHSKnown(IT->getBitWidth());
+ computeKnownBits(LHS, LHSKnown, DL, 0, AC, CxtI, DT);
+ computeKnownBits(RHS, RHSKnown, DL, 0, AC, CxtI, DT);
+ return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue();
}
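A usage sketch (an assumed caller, not part of this change): this predicate
is what justifies rewriting a carry-free add as a bitwise or.

    // Assuming Value *X, *Y and a DataLayout DL are in scope:
    if (haveNoCommonBitsSet(X, Y, DL))
      ; // X + Y == X | Y == X ^ Y, since no bit position can carry.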
-static void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q);
-void llvm::ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- const DataLayout &DL, unsigned Depth,
- AssumptionCache *AC, const Instruction *CxtI,
- const DominatorTree *DT) {
- ::ComputeSignBit(V, KnownZero, KnownOne, Depth,
- Query(DL, AC, safeCxtI(V, CxtI), DT));
+bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
+ for (const User *U : CxtI->users()) {
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (IC->isEquality())
+ if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+ if (C->isNullValue())
+ continue;
+ return false;
+ }
+ return true;
}
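This helper pairs with the expandMemCmp hook added earlier in this commit:
when a memcmp result only feeds ==/!= 0 comparisons, an expansion can test
the loaded chunks for equality and skip computing byte order. A hypothetical
caller sketch (MemCmpCall is an assumed name):

    if (isOnlyUsedInZeroEqualityComparison(MemCmpCall))
      ; // xor/or the wide loads together; no lexicographic subtraction.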
static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
@@ -187,9 +208,8 @@ bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return NonNegative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNonNegative();
}
bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
@@ -207,9 +227,8 @@ bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth,
AssumptionCache *AC, const Instruction *CxtI,
const DominatorTree *DT) {
- bool NonNegative, Negative;
- ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT);
- return Negative;
+ KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
+ return Known.isNegative();
}
static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q);
@@ -246,91 +265,65 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
bool NSW,
- APInt &KnownZero, APInt &KnownOne,
- APInt &KnownZero2, APInt &KnownOne2,
+ KnownBits &KnownOut, KnownBits &Known2,
unsigned Depth, const Query &Q) {
- if (!Add) {
- if (const ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) {
- // We know that the top bits of C-X are clear if X contains less bits
- // than C (i.e. no wrap-around can happen). For example, 20-X is
- // positive if we can prove that X is >= 0 and < 16.
- if (!CLHS->getValue().isNegative()) {
- unsigned BitWidth = KnownZero.getBitWidth();
- unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
- // NLZ can't be BitWidth with no sign bit
- APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
- computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
-
- // If all of the MaskV bits are known to be zero, then we know the
- // output top bits are zero, because we now know that the output is
- // from [0-C].
- if ((KnownZero2 & MaskV) == MaskV) {
- unsigned NLZ2 = CLHS->getValue().countLeadingZeros();
- // Top bits known zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
- }
- }
- }
- }
-
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = KnownOut.getBitWidth();
// If an initial sequence of bits in the result is not needed, the
// corresponding bits in the operands are not needed.
- APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- computeKnownBits(Op0, LHSKnownZero, LHSKnownOne, Depth + 1, Q);
- computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
+ KnownBits LHSKnown(BitWidth);
+ computeKnownBits(Op0, LHSKnown, Depth + 1, Q);
+ computeKnownBits(Op1, Known2, Depth + 1, Q);
// Carry in a 1 for a subtract, rather than a 0.
- APInt CarryIn(BitWidth, 0);
+ uint64_t CarryIn = 0;
if (!Add) {
// Sum = LHS + ~RHS + 1
- std::swap(KnownZero2, KnownOne2);
- CarryIn.setBit(0);
+ std::swap(Known2.Zero, Known2.One);
+ CarryIn = 1;
}
- APInt PossibleSumZero = ~LHSKnownZero + ~KnownZero2 + CarryIn;
- APInt PossibleSumOne = LHSKnownOne + KnownOne2 + CarryIn;
+ APInt PossibleSumZero = ~LHSKnown.Zero + ~Known2.Zero + CarryIn;
+ APInt PossibleSumOne = LHSKnown.One + Known2.One + CarryIn;
// Compute known bits of the carry.
- APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnownZero ^ KnownZero2);
- APInt CarryKnownOne = PossibleSumOne ^ LHSKnownOne ^ KnownOne2;
+ APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnown.Zero ^ Known2.Zero);
+ APInt CarryKnownOne = PossibleSumOne ^ LHSKnown.One ^ Known2.One;
// Compute set of known bits (where all three relevant bits are known).
- APInt LHSKnown = LHSKnownZero | LHSKnownOne;
- APInt RHSKnown = KnownZero2 | KnownOne2;
- APInt CarryKnown = CarryKnownZero | CarryKnownOne;
- APInt Known = LHSKnown & RHSKnown & CarryKnown;
+ APInt LHSKnownUnion = LHSKnown.Zero | LHSKnown.One;
+ APInt RHSKnownUnion = Known2.Zero | Known2.One;
+ APInt CarryKnownUnion = CarryKnownZero | CarryKnownOne;
+ APInt Known = LHSKnownUnion & RHSKnownUnion & CarryKnownUnion;
assert((PossibleSumZero & Known) == (PossibleSumOne & Known) &&
"known bits of sum differ");
// Compute known bits of the result.
- KnownZero = ~PossibleSumOne & Known;
- KnownOne = PossibleSumOne & Known;
+ KnownOut.Zero = ~PossibleSumOne & Known;
+ KnownOut.One = PossibleSumOne & Known;
// Are we still trying to solve for the sign bit?
- if (!Known.isNegative()) {
+ if (!Known.isSignBitSet()) {
if (NSW) {
// Adding two non-negative numbers, or subtracting a negative number from
// a non-negative one, can't wrap into negative.
- if (LHSKnownZero.isNegative() && KnownZero2.isNegative())
- KnownZero |= APInt::getSignBit(BitWidth);
+ if (LHSKnown.isNonNegative() && Known2.isNonNegative())
+ KnownOut.makeNonNegative();
// Adding two negative numbers, or subtracting a non-negative number from
// a negative one, can't wrap into non-negative.
- else if (LHSKnownOne.isNegative() && KnownOne2.isNegative())
- KnownOne |= APInt::getSignBit(BitWidth);
+ else if (LHSKnown.isNegative() && Known2.isNegative())
+ KnownOut.makeNegative();
}
}
}
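A worked 4-bit add under the scheme above (values chosen for illustration):

    LHS known as 0b00?1: LHSKnown.One = 0001, LHSKnown.Zero = 1100
    RHS known as 0b0011: Known2.One   = 0011, Known2.Zero   = 1100
    PossibleSumZero = ~1100 + ~1100 + 0 = 0011 + 0011 = 0110   (max + max)
    PossibleSumOne  =  0001 +  0011 + 0 = 0100                 (min + min)
    Known = LHSKnownUnion & RHSKnownUnion & CarryKnownUnion
          = 1101 & 1111 & 1111 = 1101
    KnownOut.Zero = ~0100 & 1101 = 1001, KnownOut.One = 0100 & 1101 = 0100

So the sum is known to be 0b01?0, which matches the only two feasible sums,
0100 and 0110.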
static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
- APInt &KnownZero, APInt &KnownOne,
- APInt &KnownZero2, APInt &KnownOne2,
+ KnownBits &Known, KnownBits &Known2,
unsigned Depth, const Query &Q) {
- unsigned BitWidth = KnownZero.getBitWidth();
- computeKnownBits(Op1, KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(Op0, KnownZero2, KnownOne2, Depth + 1, Q);
+ unsigned BitWidth = Known.getBitWidth();
+ computeKnownBits(Op1, Known, Depth + 1, Q);
+ computeKnownBits(Op0, Known2, Depth + 1, Q);
bool isKnownNegative = false;
bool isKnownNonNegative = false;
@@ -340,10 +333,10 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
// The product of a number with itself is non-negative.
isKnownNonNegative = true;
} else {
- bool isKnownNonNegativeOp1 = KnownZero.isNegative();
- bool isKnownNonNegativeOp0 = KnownZero2.isNegative();
- bool isKnownNegativeOp1 = KnownOne.isNegative();
- bool isKnownNegativeOp0 = KnownOne2.isNegative();
+ bool isKnownNonNegativeOp1 = Known.isNonNegative();
+ bool isKnownNonNegativeOp0 = Known2.isNonNegative();
+ bool isKnownNegativeOp1 = Known.isNegative();
+ bool isKnownNegativeOp0 = Known2.isNegative();
// The product of two numbers with the same sign is non-negative.
isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) ||
(isKnownNonNegativeOp1 && isKnownNonNegativeOp0);
@@ -361,38 +354,37 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- KnownOne.clearAllBits();
- unsigned TrailZ = KnownZero.countTrailingOnes() +
- KnownZero2.countTrailingOnes();
- unsigned LeadZ = std::max(KnownZero.countLeadingOnes() +
- KnownZero2.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
TrailZ = std::min(TrailZ, BitWidth);
LeadZ = std::min(LeadZ, BitWidth);
- KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
- APInt::getHighBitsSet(BitWidth, LeadZ);
+ Known.resetAll();
+ Known.Zero.setLowBits(TrailZ);
+ Known.Zero.setHighBits(LeadZ);
// Only make use of no-wrap flags if we failed to compute the sign bit
// directly. This matters if the multiplication always overflows, in
// which case we prefer to follow the result of the direct computation,
// though as the program is invoking undefined behaviour we can choose
// whatever we like here.
- if (isKnownNonNegative && !KnownOne.isNegative())
- KnownZero.setBit(BitWidth - 1);
- else if (isKnownNegative && !KnownZero.isNegative())
- KnownOne.setBit(BitWidth - 1);
+ if (isKnownNonNegative && !Known.isNegative())
+ Known.makeNonNegative();
+ else if (isKnownNegative && !Known.isNonNegative())
+ Known.makeNegative();
}
void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
- APInt &KnownZero,
- APInt &KnownOne) {
- unsigned BitWidth = KnownZero.getBitWidth();
+ KnownBits &Known) {
+ unsigned BitWidth = Known.getBitWidth();
unsigned NumRanges = Ranges.getNumOperands() / 2;
assert(NumRanges >= 1);
- KnownZero.setAllBits();
- KnownOne.setAllBits();
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
for (unsigned i = 0; i < NumRanges; ++i) {
ConstantInt *Lower =
@@ -406,8 +398,8 @@ void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
(Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros();
APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits);
- KnownOne &= Range.getUnsignedMax() & Mask;
- KnownZero &= ~Range.getUnsignedMax() & Mask;
+ Known.One &= Range.getUnsignedMax() & Mask;
+ Known.Zero &= ~Range.getUnsignedMax() & Mask;
}
}
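For instance (i8, a single range [4, 8)): getUnsignedMin() = 0b00000100 and
getUnsignedMax() = 0b00000111 differ only in the low two bits, so
CommonPrefixBits = 6 and Mask = 0b11111100; the loop then leaves
Known.One = 0b00000100 and Known.Zero = 0b11111000, exactly the bit pattern
0b000001?? shared by all of 4..7.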
@@ -516,15 +508,14 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv,
return !isEphemeralValueOf(Inv, CxtI);
}
-static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
- APInt &KnownOne, unsigned Depth,
- const Query &Q) {
+static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
+ unsigned Depth, const Query &Q) {
// Use of assumptions is context-sensitive. If we don't have a context, we
// cannot use them!
if (!Q.AC || !Q.CxtI)
return;
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
// Note that the patterns below need to be kept in sync with the code
// in AssumptionCache::updateAffectedValues.
@@ -549,8 +540,13 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
assert(BitWidth == 1 && "assume operand is not i1?");
- KnownZero.clearAllBits();
- KnownOne.setAllBits();
+ Known.setAllOnes();
+ return;
+ }
+ if (match(Arg, m_Not(m_Specific(V))) &&
+ isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
+ assert(BitWidth == 1 && "assume operand is not i1?");
+ Known.setAllZero();
return;
}
@@ -568,282 +564,284 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
// assume(v = a)
if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
- KnownZero |= RHSKnownZero;
- KnownOne |= RHSKnownOne;
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ Known.Zero |= RHSKnown.Zero;
+ Known.One |= RHSKnown.One;
// assume(v & b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
- APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0);
- computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits MaskKnown(BitWidth);
+ computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));
// For those bits in the mask that are known to be one, we can propagate
// known bits from the RHS to V.
- KnownZero |= RHSKnownZero & MaskKnownOne;
- KnownOne |= RHSKnownOne & MaskKnownOne;
+ Known.Zero |= RHSKnown.Zero & MaskKnown.One;
+ Known.One |= RHSKnown.One & MaskKnown.One;
// assume(~(v & b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
- APInt MaskKnownZero(BitWidth, 0), MaskKnownOne(BitWidth, 0);
- computeKnownBits(B, MaskKnownZero, MaskKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits MaskKnown(BitWidth);
+ computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));
// For those bits in the mask that are known to be one, we can propagate
// inverted known bits from the RHS to V.
- KnownZero |= RHSKnownOne & MaskKnownOne;
- KnownOne |= RHSKnownZero & MaskKnownOne;
+ Known.Zero |= RHSKnown.One & MaskKnown.One;
+ Known.One |= RHSKnown.Zero & MaskKnown.One;
// assume(v | b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
- APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits BKnown(BitWidth);
+ computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V.
- KnownZero |= RHSKnownZero & BKnownZero;
- KnownOne |= RHSKnownOne & BKnownZero;
+ Known.Zero |= RHSKnown.Zero & BKnown.Zero;
+ Known.One |= RHSKnown.One & BKnown.Zero;
// assume(~(v | b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
- APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits BKnown(BitWidth);
+ computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V.
- KnownZero |= RHSKnownOne & BKnownZero;
- KnownOne |= RHSKnownZero & BKnownZero;
+ Known.Zero |= RHSKnown.One & BKnown.Zero;
+ Known.One |= RHSKnown.Zero & BKnown.Zero;
// assume(v ^ b = a)
} else if (match(Arg,
m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
- APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits BKnown(BitWidth);
+ computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V. For those bits in B that are known to be one,
// we can propagate inverted known bits from the RHS to V.
- KnownZero |= RHSKnownZero & BKnownZero;
- KnownOne |= RHSKnownOne & BKnownZero;
- KnownZero |= RHSKnownOne & BKnownOne;
- KnownOne |= RHSKnownZero & BKnownOne;
+ Known.Zero |= RHSKnown.Zero & BKnown.Zero;
+ Known.One |= RHSKnown.One & BKnown.Zero;
+ Known.Zero |= RHSKnown.One & BKnown.One;
+ Known.One |= RHSKnown.Zero & BKnown.One;
// assume(~(v ^ b) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
- APInt BKnownZero(BitWidth, 0), BKnownOne(BitWidth, 0);
- computeKnownBits(B, BKnownZero, BKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits BKnown(BitWidth);
+ computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V. For those bits in B that are
// known to be one, we can propagate known bits from the RHS to V.
- KnownZero |= RHSKnownOne & BKnownZero;
- KnownOne |= RHSKnownZero & BKnownZero;
- KnownZero |= RHSKnownZero & BKnownOne;
- KnownOne |= RHSKnownOne & BKnownOne;
+ Known.Zero |= RHSKnown.One & BKnown.Zero;
+ Known.One |= RHSKnown.Zero & BKnown.Zero;
+ Known.Zero |= RHSKnown.Zero & BKnown.One;
+ Known.One |= RHSKnown.One & BKnown.One;
// assume(v << c = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
- KnownZero |= RHSKnownZero.lshr(C->getZExtValue());
- KnownOne |= RHSKnownOne.lshr(C->getZExtValue());
+ RHSKnown.Zero.lshrInPlace(C->getZExtValue());
+ Known.Zero |= RHSKnown.Zero;
+ RHSKnown.One.lshrInPlace(C->getZExtValue());
+ Known.One |= RHSKnown.One;
// assume(~(v << c) = a)
} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
- KnownZero |= RHSKnownOne.lshr(C->getZExtValue());
- KnownOne |= RHSKnownZero.lshr(C->getZExtValue());
+ RHSKnown.One.lshrInPlace(C->getZExtValue());
+ Known.Zero |= RHSKnown.One;
+ RHSKnown.Zero.lshrInPlace(C->getZExtValue());
+ Known.One |= RHSKnown.Zero;
// assume(v >> c = a)
} else if (match(Arg,
- m_c_ICmp(Pred, m_CombineOr(m_LShr(m_V, m_ConstantInt(C)),
- m_AShr(m_V, m_ConstantInt(C))),
+ m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
- KnownZero |= RHSKnownZero << C->getZExtValue();
- KnownOne |= RHSKnownOne << C->getZExtValue();
+ Known.Zero |= RHSKnown.Zero << C->getZExtValue();
+ Known.One |= RHSKnown.One << C->getZExtValue();
// assume(~(v >> c) = a)
- } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_CombineOr(
- m_LShr(m_V, m_ConstantInt(C)),
- m_AShr(m_V, m_ConstantInt(C)))),
+ } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))),
m_Value(A))) &&
Pred == ICmpInst::ICMP_EQ &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
- KnownZero |= RHSKnownOne << C->getZExtValue();
- KnownOne |= RHSKnownZero << C->getZExtValue();
+ Known.Zero |= RHSKnown.One << C->getZExtValue();
+ Known.One |= RHSKnown.Zero << C->getZExtValue();
// assume(v >=_s c) where c is non-negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SGE &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- if (RHSKnownZero.isNegative()) {
+ if (RHSKnown.isNonNegative()) {
// We know that the sign bit is zero.
- KnownZero |= APInt::getSignBit(BitWidth);
+ Known.makeNonNegative();
}
// assume(v >_s c) where c is at least -1.
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SGT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- if (RHSKnownOne.isAllOnesValue() || RHSKnownZero.isNegative()) {
+ if (RHSKnown.isAllOnes() || RHSKnown.isNonNegative()) {
// We know that the sign bit is zero.
- KnownZero |= APInt::getSignBit(BitWidth);
+ Known.makeNonNegative();
}
// assume(v <=_s c) where c is negative
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SLE &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- if (RHSKnownOne.isNegative()) {
+ if (RHSKnown.isNegative()) {
// We know that the sign bit is one.
- KnownOne |= APInt::getSignBit(BitWidth);
+ Known.makeNegative();
}
// assume(v <_s c) where c is non-positive
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_SLT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- if (RHSKnownZero.isAllOnesValue() || RHSKnownOne.isNegative()) {
+ if (RHSKnown.isZero() || RHSKnown.isNegative()) {
// We know that the sign bit is one.
- KnownOne |= APInt::getSignBit(BitWidth);
+ Known.makeNegative();
}
// assume(v <=_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULE &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero.
- KnownZero |=
- APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
- // assume(v <_u c)
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
+ // assume(v <_u c)
} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
Pred == ICmpInst::ICMP_ULT &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
- computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
// Whatever high bits in c are zero are known to be zero (if c is a power
// of 2, then one more).
if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
- KnownZero |=
- APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()+1);
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1);
else
- KnownZero |=
- APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
+ Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
}
}
// If assumptions conflict with each other or previous known bits, then we
- // have a logical fallacy. This should only happen when a program has
- // undefined behavior. We can't assert/crash, so clear out the known bits and
- // hope for the best.
-
- // FIXME: Publish a warning/remark that we have encountered UB or the compiler
- // is broken.
-
- // FIXME: Implement a stronger version of "I give up" by invalidating/clearing
- // the assumption cache. This should indicate that the cache is corrupted so
- // future callers will not waste time repopulating it with faulty assumptions.
-
- if ((KnownZero & KnownOne) != 0) {
- KnownZero.clearAllBits();
- KnownOne.clearAllBits();
+ // have a logical fallacy. It's possible that the assumption is not reachable,
+ // so this isn't a real bug. On the other hand, the program may have undefined
+ // behavior, or we might have a bug in the compiler. We can't assert/crash, so
+ // clear out the known bits, try to warn the user, and hope for the best.
+ if (Known.Zero.intersects(Known.One)) {
+ Known.resetAll();
+
+ if (Q.ORE) {
+ auto *CxtI = const_cast<Instruction *>(Q.CxtI);
+ OptimizationRemarkAnalysis ORA("value-tracking", "BadAssumption", CxtI);
+ Q.ORE->emit(ORA << "Detected conflicting code assumptions. Program may "
+ "have undefined behavior, or compiler may have "
+ "internal error.");
+ }
}
}
// Compute known bits from a shift operator, including those with a
-// non-constant shift amount. KnownZero and KnownOne are the outputs of this
-// function. KnownZero2 and KnownOne2 are pre-allocated temporaries with the
-// same bit width as KnownZero and KnownOne. KZF and KOF are operator-specific
-// functors that, given the known-zero or known-one bits respectively, and a
-// shift amount, compute the implied known-zero or known-one bits of the shift
-// operator's result respectively for that shift amount. The results from calling
-// KZF and KOF are conservatively combined for all permitted shift amounts.
+// non-constant shift amount. Known is the output of this function. Known2 is a
+// pre-allocated temporary with the same bit width as Known. KZF and KOF are
+// operator-specific functors that, given the known-zero or known-one bits
+// respectively, and a shift amount, compute the implied known-zero or known-one
+// bits of the shift operator's result respectively for that shift amount. The
+// results from calling KZF and KOF are conservatively combined for all
+// permitted shift amounts.
static void computeKnownBitsFromShiftOperator(
- const Operator *I, APInt &KnownZero, APInt &KnownOne, APInt &KnownZero2,
- APInt &KnownOne2, unsigned Depth, const Query &Q,
+ const Operator *I, KnownBits &Known, KnownBits &Known2,
+ unsigned Depth, const Query &Q,
function_ref<APInt(const APInt &, unsigned)> KZF,
function_ref<APInt(const APInt &, unsigned)> KOF) {
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
- KnownZero = KZF(KnownZero, ShiftAmt);
- KnownOne = KOF(KnownOne, ShiftAmt);
- // If there is conflict between KnownZero and KnownOne, this must be an
- // overflowing left shift, so the shift result is undefined. Clear KnownZero
- // and KnownOne bits so that other code could propagate this undef.
- if ((KnownZero & KnownOne) != 0) {
- KnownZero.clearAllBits();
- KnownOne.clearAllBits();
- }
+ computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+ Known.Zero = KZF(Known.Zero, ShiftAmt);
+ Known.One = KOF(Known.One, ShiftAmt);
+ // If there is conflict between Known.Zero and Known.One, this must be an
+ // overflowing left shift, so the shift result is undefined. Clear Known
+ // bits so that other code could propagate this undef.
+ if ((Known.Zero & Known.One) != 0)
+ Known.resetAll();
return;
}
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
- // Note: We cannot use KnownZero.getLimitedValue() here, because if
+ // If the shift amount could be greater than or equal to the bit-width of
+ // the LHS, the value could be undef, so we don't know anything about it.
+ if ((~Known.Zero).uge(BitWidth)) {
+ Known.resetAll();
+ return;
+ }
+
+ // Note: We cannot use Known.Zero.getLimitedValue() here, because if
// BitWidth > 64 and any upper bits are known, we'll end up returning the
// limit value (which implies all bits are known).
- uint64_t ShiftAmtKZ = KnownZero.zextOrTrunc(64).getZExtValue();
- uint64_t ShiftAmtKO = KnownOne.zextOrTrunc(64).getZExtValue();
+ uint64_t ShiftAmtKZ = Known.Zero.zextOrTrunc(64).getZExtValue();
+ uint64_t ShiftAmtKO = Known.One.zextOrTrunc(64).getZExtValue();
// It would be more-clearly correct to use the two temporaries for this
// calculation. Reusing the APInts here to prevent unnecessary allocations.
- KnownZero.clearAllBits();
- KnownOne.clearAllBits();
+ Known.resetAll();
// If we know the shifter operand is nonzero, we can sometimes infer more
// known bits. However this is expensive to compute, so be lazy about it and
@@ -851,16 +849,18 @@ static void computeKnownBitsFromShiftOperator(
Optional<bool> ShifterOperandIsNonZero;
// Early exit if we can't constrain any well-defined shift amount.
- if (!(ShiftAmtKZ & (BitWidth - 1)) && !(ShiftAmtKO & (BitWidth - 1))) {
+ if (!(ShiftAmtKZ & (PowerOf2Ceil(BitWidth) - 1)) &&
+ !(ShiftAmtKO & (PowerOf2Ceil(BitWidth) - 1))) {
ShifterOperandIsNonZero =
isKnownNonZero(I->getOperand(1), Depth + 1, Q);
if (!*ShifterOperandIsNonZero)
return;
}
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
- KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) {
// Combine the shifted known input bits only for those shift amounts
// compatible with its known constraints.
@@ -879,8 +879,8 @@ static void computeKnownBitsFromShiftOperator(
continue;
}
- KnownZero &= KZF(KnownZero2, ShiftAmt);
- KnownOne &= KOF(KnownOne2, ShiftAmt);
+ Known.Zero &= KZF(Known2.Zero, ShiftAmt);
+ Known.One &= KOF(Known2.One, ShiftAmt);
}
// If there are no compatible shift amounts, then we've proven that the shift
@@ -888,33 +888,30 @@ static void computeKnownBitsFromShiftOperator(
// return anything we'd like, but we need to make sure the sets of known bits
// stay disjoint (it should be better for some other code to actually
// propagate the undef than to pick a value here using known bits).
- if ((KnownZero & KnownOne) != 0) {
- KnownZero.clearAllBits();
- KnownOne.clearAllBits();
- }
+ if (Known.Zero.intersects(Known.One))
+ Known.resetAll();
}
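A worked example of the functor protocol (i8 lshr, illustrative values):
suppose operand 0 is fully known as 0b00001100 and the shift amount is known
to be 0 or 1 (operand 1 has Known.Zero = 0b11111110). The loop keeps only
ShiftAmt 0 and 1 and intersects the functor results:

    Known.One  = KOF(00001100, 0) & KOF(00001100, 1)
               = 00001100 & 00000110 = 00000100
    Known.Zero = KZF(11110011, 0) & KZF(11110011, 1)
               = 11110011 & 11111001 = 11110001

Bits 1 and 3 stay unknown, which is precisely where the two feasible
results, 12 and 6, disagree.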
-static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
- APInt &KnownOne, unsigned Depth,
- const Query &Q) {
- unsigned BitWidth = KnownZero.getBitWidth();
+static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
+ unsigned Depth, const Query &Q) {
+ unsigned BitWidth = Known.getBitWidth();
- APInt KnownZero2(KnownZero), KnownOne2(KnownOne);
+ KnownBits Known2(Known);
switch (I->getOpcode()) {
default: break;
case Instruction::Load:
if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range))
- computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne);
+ computeKnownBitsFromRangeMetadata(*MD, Known);
break;
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// Output known-1 bits are only known if set in both the LHS & RHS.
- KnownOne &= KnownOne2;
+ Known.One &= Known2.One;
// Output known-0 are known to be clear if zero in either the LHS | RHS.
- KnownZero |= KnownZero2;
+ Known.Zero |= Known2.Zero;
// and(x, add (x, -1)) is a common idiom that always clears the low bit;
// here we handle the more general case of adding any odd number by
@@ -922,118 +919,113 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// TODO: This could be generalized to clearing any bit set in y where the
// following bit is known to be unset in y.
Value *Y = nullptr;
- if (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)),
- m_Value(Y))) ||
- match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
- m_Value(Y)))) {
- APInt KnownZero3(BitWidth, 0), KnownOne3(BitWidth, 0);
- computeKnownBits(Y, KnownZero3, KnownOne3, Depth + 1, Q);
- if (KnownOne3.countTrailingOnes() > 0)
- KnownZero |= APInt::getLowBitsSet(BitWidth, 1);
+ if (!Known.Zero[0] && !Known.One[0] &&
+ (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)),
+ m_Value(Y))) ||
+ match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
+ m_Value(Y))))) {
+ Known2.resetAll();
+ computeKnownBits(Y, Known2, Depth + 1, Q);
+ if (Known2.countMinTrailingOnes() > 0)
+ Known.Zero.setBit(0);
}
break;
}
case Instruction::Or: {
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// Output known-0 bits are only known if clear in both the LHS & RHS.
- KnownZero &= KnownZero2;
+ Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
- KnownOne |= KnownOne2;
+ Known.One |= Known2.One;
break;
}
case Instruction::Xor: {
- computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
- KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
- KnownZero = KnownZeroOut;
+ Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
+ Known.Zero = std::move(KnownZeroOut);
break;
}
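E.g. (3 bits) with operands known as 1?0 and 10?: the (Zero, One) pairs are
(001, 100) and (010, 100), so KnownZeroOut = (001 & 010) | (100 & 100) = 100
and Known.One = (001 & 100) | (100 & 010) = 000; only the top bit is known,
and it is zero because both inputs have it set.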
case Instruction::Mul: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
- computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, KnownZero,
- KnownOne, KnownZero2, KnownOne2, Depth, Q);
+ computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known,
+ Known2, Depth, Q);
break;
}
case Instruction::UDiv: {
// For the purposes of computing leading zeros we can conservatively
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
- unsigned LeadZ = KnownZero2.countLeadingOnes();
-
- KnownOne2.clearAllBits();
- KnownZero2.clearAllBits();
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
- unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
-
- KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ unsigned LeadZ = Known2.countMinLeadingZeros();
+
+ Known2.resetAll();
+ computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
+
+ Known.Zero.setHighBits(LeadZ);
break;
}
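A worked i8 example of this bound: if the numerator is known to be below 64
(LeadZ = 2 after the first call) and the denominator has
countMaxLeadingZeros() = 4 (so it is at least 0b00001000 = 8), then
LeadZ = min(8, 2 + 8 - 4 - 1) = 5, i.e. the quotient fits in three bits.
That is tight: 63 udiv 8 = 7.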
case Instruction::Select: {
- computeKnownBits(I->getOperand(2), KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
-
- const Value *LHS;
- const Value *RHS;
+ const Value *LHS, *RHS;
SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
if (SelectPatternResult::isMinOrMax(SPF)) {
- computeKnownBits(RHS, KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(LHS, KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(RHS, Known, Depth + 1, Q);
+ computeKnownBits(LHS, Known2, Depth + 1, Q);
} else {
- computeKnownBits(I->getOperand(2), KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(2), Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
}
unsigned MaxHighOnes = 0;
unsigned MaxHighZeros = 0;
if (SPF == SPF_SMAX) {
// If both sides are negative, the result is negative.
- if (KnownOne[BitWidth - 1] && KnownOne2[BitWidth - 1])
+ if (Known.isNegative() && Known2.isNegative())
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
- std::max(KnownOne.countLeadingOnes(), KnownOne2.countLeadingOnes());
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
// If either side is non-negative, the result is non-negative.
- else if (KnownZero[BitWidth - 1] || KnownZero2[BitWidth - 1])
+ else if (Known.isNonNegative() || Known2.isNonNegative())
MaxHighZeros = 1;
} else if (SPF == SPF_SMIN) {
// If both sides are non-negative, the result is non-negative.
- if (KnownZero[BitWidth - 1] && KnownZero2[BitWidth - 1])
+ if (Known.isNonNegative() && Known2.isNonNegative())
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
- MaxHighZeros = std::max(KnownZero.countLeadingOnes(),
- KnownZero2.countLeadingOnes());
+ MaxHighZeros = std::max(Known.countMinLeadingZeros(),
+ Known2.countMinLeadingZeros());
// If either side is negative, the result is negative.
- else if (KnownOne[BitWidth - 1] || KnownOne2[BitWidth - 1])
+ else if (Known.isNegative() || Known2.isNegative())
MaxHighOnes = 1;
} else if (SPF == SPF_UMAX) {
// We can derive a lower bound on the result by taking the max of the
// leading one bits.
MaxHighOnes =
- std::max(KnownOne.countLeadingOnes(), KnownOne2.countLeadingOnes());
+ std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
} else if (SPF == SPF_UMIN) {
// We can derive an upper bound on the result by taking the max of the
// leading zero bits.
MaxHighZeros =
- std::max(KnownZero.countLeadingOnes(), KnownZero2.countLeadingOnes());
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
}
// Only known if known in both the LHS and RHS.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
if (MaxHighOnes > 0)
- KnownOne |= APInt::getHighBitsSet(BitWidth, MaxHighOnes);
+ Known.One.setHighBits(MaxHighOnes);
if (MaxHighZeros > 0)
- KnownZero |= APInt::getHighBitsSet(BitWidth, MaxHighZeros);
+ Known.Zero.setHighBits(MaxHighZeros);
break;
}
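For example, in the SPF_SMAX branch with both sides known negative (i8): if
one operand has at least 3 known leading ones (so it is >= -32) and the
other at least 5 (>= -8), the max is >= -8, so MaxHighOnes = 5 correctly
marks the top five result bits as one.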
case Instruction::FPTrunc:
@@ -1057,14 +1049,12 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType());
assert(SrcBitWidth && "SrcBitWidth can't be zero");
- KnownZero = KnownZero.zextOrTrunc(SrcBitWidth);
- KnownOne = KnownOne.zextOrTrunc(SrcBitWidth);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
- KnownZero = KnownZero.zextOrTrunc(BitWidth);
- KnownOne = KnownOne.zextOrTrunc(BitWidth);
+ Known = Known.zextOrTrunc(SrcBitWidth);
+ computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+ Known = Known.zextOrTrunc(BitWidth);
// Any top bits are known to be zero.
if (BitWidth > SrcBitWidth)
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ Known.Zero.setBitsFrom(SrcBitWidth);
break;
}
case Instruction::BitCast: {
@@ -1073,7 +1063,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// TODO: For now, not handling conversions like:
// (bitcast i64 %x to <2 x i32>)
!I->getType()->isVectorTy()) {
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
break;
}
break;
@@ -1082,90 +1072,75 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// Compute the bits in the result that are not present in the input.
unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
- KnownZero = KnownZero.trunc(SrcBitWidth);
- KnownOne = KnownOne.trunc(SrcBitWidth);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
-
+ Known = Known.trunc(SrcBitWidth);
+ computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
- if (KnownZero[SrcBitWidth-1]) // Input sign bit known zero
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
- else if (KnownOne[SrcBitWidth-1]) // Input sign bit known set
- KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ Known = Known.sext(BitWidth);
break;
}
case Instruction::Shl: {
// (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
- auto KZF = [BitWidth, NSW](const APInt &KnownZero, unsigned ShiftAmt) {
- APInt KZResult =
- (KnownZero << ShiftAmt) |
- APInt::getLowBitsSet(BitWidth, ShiftAmt); // Low bits known 0.
+ auto KZF = [NSW](const APInt &KnownZero, unsigned ShiftAmt) {
+ APInt KZResult = KnownZero << ShiftAmt;
+ KZResult.setLowBits(ShiftAmt); // Low bits known 0.
// If this shift has "nsw" keyword, then the result is either a poison
// value or has the same sign bit as the first operand.
- if (NSW && KnownZero.isNegative())
- KZResult.setBit(BitWidth - 1);
+ if (NSW && KnownZero.isSignBitSet())
+ KZResult.setSignBit();
return KZResult;
};
- auto KOF = [BitWidth, NSW](const APInt &KnownOne, unsigned ShiftAmt) {
+ auto KOF = [NSW](const APInt &KnownOne, unsigned ShiftAmt) {
APInt KOResult = KnownOne << ShiftAmt;
- if (NSW && KnownOne.isNegative())
- KOResult.setBit(BitWidth - 1);
+ if (NSW && KnownOne.isSignBitSet())
+ KOResult.setSignBit();
return KOResult;
};
- computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
- KnownZero2, KnownOne2, Depth, Q, KZF,
- KOF);
+ computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
break;
}
case Instruction::LShr: {
// (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
- auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) {
- return APIntOps::lshr(KnownZero, ShiftAmt) |
- // High bits known zero.
- APInt::getHighBitsSet(BitWidth, ShiftAmt);
+ auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
+ APInt KZResult = KnownZero.lshr(ShiftAmt);
+ // High bits known zero.
+ KZResult.setHighBits(ShiftAmt);
+ return KZResult;
};
- auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) {
- return APIntOps::lshr(KnownOne, ShiftAmt);
+ auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
+ return KnownOne.lshr(ShiftAmt);
};
- computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
- KnownZero2, KnownOne2, Depth, Q, KZF,
- KOF);
+ computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
break;
}
case Instruction::AShr: {
// (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
- auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) {
- return APIntOps::ashr(KnownZero, ShiftAmt);
+ auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
+ return KnownZero.ashr(ShiftAmt);
};
- auto KOF = [BitWidth](const APInt &KnownOne, unsigned ShiftAmt) {
- return APIntOps::ashr(KnownOne, ShiftAmt);
+ auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
+ return KnownOne.ashr(ShiftAmt);
};
- computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
- KnownZero2, KnownOne2, Depth, Q, KZF,
- KOF);
+ computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
break;
}
case Instruction::Sub: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW,
- KnownZero, KnownOne, KnownZero2, KnownOne2, Depth,
- Q);
+ Known, Known2, Depth, Q);
break;
}
case Instruction::Add: {
bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW,
- KnownZero, KnownOne, KnownZero2, KnownOne2, Depth,
- Q);
+ Known, Known2, Depth, Q);
break;
}
case Instruction::SRem:
@@ -1173,37 +1148,33 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
APInt RA = Rem->getValue().abs();
if (RA.isPowerOf2()) {
APInt LowBits = RA - 1;
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1,
- Q);
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// The low bits of the first operand are unchanged by the srem.
- KnownZero = KnownZero2 & LowBits;
- KnownOne = KnownOne2 & LowBits;
+ Known.Zero = Known2.Zero & LowBits;
+ Known.One = Known2.One & LowBits;
// If the first operand is non-negative or has all low bits zero, then
// the upper bits are all zero.
- if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
- KnownZero |= ~LowBits;
+ if (Known2.isNonNegative() || LowBits.isSubsetOf(Known2.Zero))
+ Known.Zero |= ~LowBits;
// If the first operand is negative and not all low bits are zero, then
// the upper bits are all one.
- if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
- KnownOne |= ~LowBits;
+ if (Known2.isNegative() && LowBits.intersects(Known2.One))
+ Known.One |= ~LowBits;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
+ break;
}
}
// The sign bit is the LHS's sign bit, except when the result of the
// remainder is zero.
- if (KnownZero.isNonNegative()) {
- APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- Q);
- // If it's known zero, our sign bit is also zero.
- if (LHSKnownZero.isNegative())
- KnownZero.setBit(BitWidth - 1);
- }
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+    // If the LHS's sign bit is known zero, the result's sign bit is zero too.
+ if (Known2.isNonNegative())
+ Known.makeNonNegative();
break;
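
The "low bits unchanged" claim above holds for truncated (C-style) remainders
because q * RA contributes nothing below a power-of-two RA. A small editorial
spot check, not part of this patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // x = q * 4 + (x srem 4), and q * 4 has zero low bits, so x and the
      // remainder agree on the bits below the modulus (LowBits = 3).
      for (int32_t X : {5, -5, -6, -4, 7})
        assert((X & 3) == ((X % 4) & 3));
    }
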
case Instruction::URem: {
@@ -1211,22 +1182,22 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
const APInt &RA = Rem->getValue();
if (RA.isPowerOf2()) {
APInt LowBits = (RA - 1);
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
- KnownZero |= ~LowBits;
- KnownOne &= LowBits;
+ computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+ Known.Zero |= ~LowBits;
+ Known.One &= LowBits;
break;
}
}
// Since the result is less than or equal to either operand, any leading
// zero bits in either operand must also exist in the result.
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
- computeKnownBits(I->getOperand(1), KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
- unsigned Leaders = std::max(KnownZero.countLeadingOnes(),
- KnownZero2.countLeadingOnes());
- KnownOne.clearAllBits();
- KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+ unsigned Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
+ Known.resetAll();
+ Known.Zero.setHighBits(Leaders);
break;
}
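
The leading-zero rule above follows because an unsigned remainder is never
larger than either operand. A quick editorial spot check (not part of this
patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0xFFFEu, Y = 0xFFu;
      uint32_t R = X % Y;                  // 65534 % 255 == 254
      assert(R <= X && R < Y);             // so clz(R) >= max(clz(X), clz(Y))
    }
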
@@ -1237,16 +1208,15 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
Align = Q.DL.getABITypeAlignment(AI->getAllocatedType());
if (Align > 0)
- KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+ Known.Zero.setLowBits(countTrailingZeros(Align));
break;
}
case Instruction::GetElementPtr: {
// Analyze all of the subscripts of this getelementptr instruction
// to determine if we can prove known low zero bits.
- APInt LocalKnownZero(BitWidth, 0), LocalKnownOne(BitWidth, 0);
- computeKnownBits(I->getOperand(0), LocalKnownZero, LocalKnownOne, Depth + 1,
- Q);
- unsigned TrailZ = LocalKnownZero.countTrailingOnes();
+ KnownBits LocalKnown(BitWidth);
+ computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q);
+ unsigned TrailZ = LocalKnown.countMinTrailingZeros();
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
@@ -1276,15 +1246,15 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
}
unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy);
- LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0);
- computeKnownBits(Index, LocalKnownZero, LocalKnownOne, Depth + 1, Q);
+ LocalKnown.Zero = LocalKnown.One = APInt(GEPOpiBits, 0);
+ computeKnownBits(Index, LocalKnown, Depth + 1, Q);
TrailZ = std::min(TrailZ,
unsigned(countTrailingZeros(TypeSize) +
- LocalKnownZero.countTrailingOnes()));
+ LocalKnown.countMinTrailingZeros()));
}
}
- KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ);
+ Known.Zero.setLowBits(TrailZ);
break;
}
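
The GEP case above exploits that each index term Index * TypeSize has at
least ctz(TypeSize) + ctz(Index) trailing zero bits, and a sum keeps the
minimum trailing-zero count of its terms. An editorial sketch with an
invented helper (not an LLVM API):

    #include <algorithm>
    #include <cassert>

    unsigned gepMinTrailZ(unsigned BaseTZ, unsigned TypeSizeTZ,
                          unsigned IndexTZ) {
      return std::min(BaseTZ, TypeSizeTZ + IndexTZ);
    }

    int main() {
      // A 16-byte aligned base indexed by an even index into an 8-byte
      // element type stays 16-byte aligned: min(4, 3 + 1) == 4.
      assert(gepMinTrailZ(4, 3, 1) == 4);
    }
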
case Instruction::PHI: {
@@ -1319,15 +1289,14 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
break;
// Ok, we have a PHI of the form L op= R. Check for low
// zero bits.
- computeKnownBits(R, KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(R, Known2, Depth + 1, Q);
// We need to take the minimum number of known bits
- APInt KnownZero3(KnownZero), KnownOne3(KnownOne);
- computeKnownBits(L, KnownZero3, KnownOne3, Depth + 1, Q);
+ KnownBits Known3(Known);
+ computeKnownBits(L, Known3, Depth + 1, Q);
- KnownZero = APInt::getLowBitsSet(
- BitWidth, std::min(KnownZero2.countTrailingOnes(),
- KnownZero3.countTrailingOnes()));
+ Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(),
+ Known3.countMinTrailingZeros()));
if (DontImproveNonNegativePhiBits)
break;
@@ -1344,25 +1313,25 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// (add non-negative, non-negative) --> non-negative
// (add negative, negative) --> negative
if (Opcode == Instruction::Add) {
- if (KnownZero2.isNegative() && KnownZero3.isNegative())
- KnownZero.setBit(BitWidth - 1);
- else if (KnownOne2.isNegative() && KnownOne3.isNegative())
- KnownOne.setBit(BitWidth - 1);
+ if (Known2.isNonNegative() && Known3.isNonNegative())
+ Known.makeNonNegative();
+ else if (Known2.isNegative() && Known3.isNegative())
+ Known.makeNegative();
}
// (sub nsw non-negative, negative) --> non-negative
// (sub nsw negative, non-negative) --> negative
else if (Opcode == Instruction::Sub && LL == I) {
- if (KnownZero2.isNegative() && KnownOne3.isNegative())
- KnownZero.setBit(BitWidth - 1);
- else if (KnownOne2.isNegative() && KnownZero3.isNegative())
- KnownOne.setBit(BitWidth - 1);
+ if (Known2.isNonNegative() && Known3.isNegative())
+ Known.makeNonNegative();
+ else if (Known2.isNegative() && Known3.isNonNegative())
+ Known.makeNegative();
}
// (mul nsw non-negative, non-negative) --> non-negative
- else if (Opcode == Instruction::Mul && KnownZero2.isNegative() &&
- KnownZero3.isNegative())
- KnownZero.setBit(BitWidth - 1);
+ else if (Opcode == Instruction::Mul && Known2.isNonNegative() &&
+ Known3.isNonNegative())
+ Known.makeNonNegative();
}
break;
@@ -1376,27 +1345,26 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// Otherwise take the unions of the known bit sets of the operands,
// taking conservative care to avoid excessive recursion.
- if (Depth < MaxDepth - 1 && !KnownZero && !KnownOne) {
+ if (Depth < MaxDepth - 1 && !Known.Zero && !Known.One) {
      // Skip if every incoming value refers back to the PHI itself.
if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
break;
- KnownZero = APInt::getAllOnesValue(BitWidth);
- KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero.setAllBits();
+ Known.One.setAllBits();
for (Value *IncValue : P->incoming_values()) {
// Skip direct self references.
if (IncValue == P) continue;
- KnownZero2 = APInt(BitWidth, 0);
- KnownOne2 = APInt(BitWidth, 0);
+ Known2 = KnownBits(BitWidth);
// Recurse, but cap the recursion to one level, because we don't
// want to waste time spinning around in loops.
- computeKnownBits(IncValue, KnownZero2, KnownOne2, MaxDepth - 1, Q);
- KnownZero &= KnownZero2;
- KnownOne &= KnownOne2;
+ computeKnownBits(IncValue, Known2, MaxDepth - 1, Q);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
// If all bits have been ruled out, there's no need to check
// more operands.
- if (!KnownZero && !KnownOne)
+ if (!Known.Zero && !Known.One)
break;
}
}
@@ -1408,45 +1376,60 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// and then intersect with known bits based on other properties of the
// function.
if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range))
- computeKnownBitsFromRangeMetadata(*MD, KnownZero, KnownOne);
+ computeKnownBitsFromRangeMetadata(*MD, Known);
if (const Value *RV = ImmutableCallSite(I).getReturnedArgOperand()) {
- computeKnownBits(RV, KnownZero2, KnownOne2, Depth + 1, Q);
- KnownZero |= KnownZero2;
- KnownOne |= KnownOne2;
+ computeKnownBits(RV, Known2, Depth + 1, Q);
+ Known.Zero |= Known2.Zero;
+ Known.One |= Known2.One;
}
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::bitreverse:
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ Known.Zero |= Known2.Zero.reverseBits();
+ Known.One |= Known2.One.reverseBits();
+ break;
case Intrinsic::bswap:
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
- KnownZero |= KnownZero2.byteSwap();
- KnownOne |= KnownOne2.byteSwap();
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ Known.Zero |= Known2.Zero.byteSwap();
+ Known.One |= Known2.One.byteSwap();
break;
- case Intrinsic::ctlz:
+ case Intrinsic::ctlz: {
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleLZ = Known2.One.countLeadingZeros();
+ // If this call is undefined for 0, the result will be less than 2^n.
+ if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
+ PossibleLZ = std::min(PossibleLZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleLZ)+1;
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
case Intrinsic::cttz: {
- unsigned LowBits = Log2_32(BitWidth)+1;
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.One.countTrailingZeros();
// If this call is undefined for 0, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
- LowBits -= 1;
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+ PossibleTZ = std::min(PossibleTZ, BitWidth - 1);
+ unsigned LowBits = Log2_32(PossibleTZ)+1;
+ Known.Zero.setBitsFrom(LowBits);
break;
}
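
Both the ctlz and cttz cases rely on the same bound: a result known to be at
most N fits in Log2_32(N) + 1 bits, so all higher bits are known zero. An
editorial sketch of that bound (bitsNeeded is an invented stand-in, not the
LLVM Log2_32 API):

    #include <cassert>

    unsigned bitsNeeded(unsigned MaxVal) {  // behaves like Log2_32(MaxVal) + 1
      unsigned Bits = 1;
      while (MaxVal >>= 1)
        ++Bits;
      return Bits;
    }

    int main() {
      assert(bitsNeeded(7) == 3);  // results 0..7 fit in 3 bits
      assert(bitsNeeded(8) == 4);  // a bound of 8 needs a fourth bit
    }
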
case Intrinsic::ctpop: {
- computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// We can bound the space the count needs. Also, bits known to be zero
// can't contribute to the population.
- unsigned BitsPossiblySet = BitWidth - KnownZero2.countPopulation();
- unsigned LeadingZeros =
- APInt(BitWidth, BitsPossiblySet).countLeadingZeros();
- assert(LeadingZeros <= BitWidth);
- KnownZero |= APInt::getHighBitsSet(BitWidth, LeadingZeros);
- KnownOne &= ~KnownZero;
+ unsigned BitsPossiblySet = Known2.countMaxPopulation();
+ unsigned LowBits = Log2_32(BitsPossiblySet)+1;
+ Known.Zero.setBitsFrom(LowBits);
      // TODO: we could bound Known.One using the lower bound on the number
      // of bits that might be set, as given by the popcount of Known2.One.
break;
}
case Intrinsic::x86_sse42_crc32_64_64:
- KnownZero |= APInt::getHighBitsSet(64, 32);
+ Known.Zero.setBitsFrom(32);
break;
}
}
@@ -1456,7 +1439,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
// tracking the specific element. But at least we might find information
// valid for all elements of the vector (for example if vector is sign
// extended, shifted, etc).
- computeKnownBits(I->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
break;
case Instruction::ExtractValue:
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {
@@ -1468,20 +1451,19 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
computeKnownBitsAddSub(true, II->getArgOperand(0),
- II->getArgOperand(1), false, KnownZero,
- KnownOne, KnownZero2, KnownOne2, Depth, Q);
+ II->getArgOperand(1), false, Known, Known2,
+ Depth, Q);
break;
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
computeKnownBitsAddSub(false, II->getArgOperand(0),
- II->getArgOperand(1), false, KnownZero,
- KnownOne, KnownZero2, KnownOne2, Depth, Q);
+ II->getArgOperand(1), false, Known, Known2,
+ Depth, Q);
break;
case Intrinsic::umul_with_overflow:
case Intrinsic::smul_with_overflow:
computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false,
- KnownZero, KnownOne, KnownZero2, KnownOne2, Depth,
- Q);
+ Known, Known2, Depth, Q);
break;
}
}
@@ -1490,7 +1472,15 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
}
/// Determine which bits of V are known to be either zero or one and return
-/// them in the KnownZero/KnownOne bit sets.
+/// them.
+KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
+ KnownBits Known(getBitWidth(V->getType(), Q.DL));
+ computeKnownBits(V, Known, Depth, Q);
+ return Known;
+}
+
+/// Determine which bits of V are known to be either zero or one and return
+/// them in the Known bit set.
///
/// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
/// we cannot optimize based on the assumption that it is zero without changing
@@ -1504,33 +1494,29 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
/// where V is a vector, known zero, and known one values are the
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
-void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
- unsigned Depth, const Query &Q) {
+void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
+ const Query &Q) {
assert(V && "No Value?");
assert(Depth <= MaxDepth && "Limit Search Depth");
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
- assert((V->getType()->isIntOrIntVectorTy() ||
- V->getType()->getScalarType()->isPointerTy()) &&
+ assert((V->getType()->isIntOrIntVectorTy(BitWidth) ||
+ V->getType()->isPtrOrPtrVectorTy()) &&
"Not integer or pointer type!");
- assert((Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth) &&
- (!V->getType()->isIntOrIntVectorTy() ||
- V->getType()->getScalarSizeInBits() == BitWidth) &&
- KnownZero.getBitWidth() == BitWidth &&
- KnownOne.getBitWidth() == BitWidth &&
- "V, KnownOne and KnownZero should have same BitWidth");
+ assert(Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth &&
+ "V and Known should have same BitWidth");
+ (void)BitWidth;
const APInt *C;
if (match(V, m_APInt(C))) {
// We know all of the bits for a scalar constant or a splat vector constant!
- KnownOne = *C;
- KnownZero = ~KnownOne;
+ Known.One = *C;
+ Known.Zero = ~Known.One;
return;
}
// Null and aggregate-zero are all-zeros.
if (isa<ConstantPointerNull>(V) || isa<ConstantAggregateZero>(V)) {
- KnownOne.clearAllBits();
- KnownZero = APInt::getAllOnesValue(BitWidth);
+ Known.setAllZero();
return;
}
// Handle a constant vector by taking the intersection of the known bits of
@@ -1538,12 +1524,12 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) {
// We know that CDS must be a vector of integers. Take the intersection of
// each element.
- KnownZero.setAllBits(); KnownOne.setAllBits();
- APInt Elt(KnownZero.getBitWidth(), 0);
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ APInt Elt(BitWidth, 0);
for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
Elt = CDS->getElementAsInteger(i);
- KnownZero &= ~Elt;
- KnownOne &= Elt;
+ Known.Zero &= ~Elt;
+ Known.One &= Elt;
}
return;
}
@@ -1551,25 +1537,24 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
if (const auto *CV = dyn_cast<ConstantVector>(V)) {
// We know that CV must be a vector of integers. Take the intersection of
// each element.
- KnownZero.setAllBits(); KnownOne.setAllBits();
- APInt Elt(KnownZero.getBitWidth(), 0);
+ Known.Zero.setAllBits(); Known.One.setAllBits();
+ APInt Elt(BitWidth, 0);
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
Constant *Element = CV->getAggregateElement(i);
auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element);
if (!ElementCI) {
- KnownZero.clearAllBits();
- KnownOne.clearAllBits();
+ Known.resetAll();
return;
}
Elt = ElementCI->getValue();
- KnownZero &= ~Elt;
- KnownOne &= Elt;
+ Known.Zero &= ~Elt;
+ Known.One &= Elt;
}
return;
}
// Start out not knowing anything.
- KnownZero.clearAllBits(); KnownOne.clearAllBits();
+ Known.resetAll();
  // We can't infer anything about undefs.
if (isa<UndefValue>(V))
@@ -1588,44 +1573,27 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
// the bits of its aliasee.
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
if (!GA->isInterposable())
- computeKnownBits(GA->getAliasee(), KnownZero, KnownOne, Depth + 1, Q);
+ computeKnownBits(GA->getAliasee(), Known, Depth + 1, Q);
return;
}
if (const Operator *I = dyn_cast<Operator>(V))
- computeKnownBitsFromOperator(I, KnownZero, KnownOne, Depth, Q);
+ computeKnownBitsFromOperator(I, Known, Depth, Q);
- // Aligned pointers have trailing zeros - refine KnownZero set
+ // Aligned pointers have trailing zeros - refine Known.Zero set
if (V->getType()->isPointerTy()) {
unsigned Align = V->getPointerAlignment(Q.DL);
if (Align)
- KnownZero |= APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+ Known.Zero.setLowBits(countTrailingZeros(Align));
}
- // computeKnownBitsFromAssume strictly refines KnownZero and
- // KnownOne. Therefore, we run them after computeKnownBitsFromOperator.
+  // computeKnownBitsFromAssume strictly refines Known.
+  // Therefore, we run it after computeKnownBitsFromOperator.
// Check whether a nearby assume intrinsic can determine some known bits.
- computeKnownBitsFromAssume(V, KnownZero, KnownOne, Depth, Q);
+ computeKnownBitsFromAssume(V, Known, Depth, Q);
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
-}
-
-/// Determine whether the sign bit is known to be zero or one.
-/// Convenience wrapper around computeKnownBits.
-void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth, const Query &Q) {
- unsigned BitWidth = getBitWidth(V->getType(), Q.DL);
- if (!BitWidth) {
- KnownZero = false;
- KnownOne = false;
- return;
- }
- APInt ZeroBits(BitWidth, 0);
- APInt OneBits(BitWidth, 0);
- computeKnownBits(V, ZeroBits, OneBits, Depth, Q);
- KnownOne = OneBits[BitWidth - 1];
- KnownZero = ZeroBits[BitWidth - 1];
+ assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
/// Return true if the given value is known to have exactly one
@@ -1648,9 +1616,9 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
if (match(V, m_Shl(m_One(), m_Value())))
return true;
- // (signbit) >>l X is clearly a power of two if the one is not shifted off the
- // bottom. If it is shifted off the bottom then the result is undefined.
- if (match(V, m_LShr(m_SignBit(), m_Value())))
+ // (signmask) >>l X is clearly a power of two if the one is not shifted off
+ // the bottom. If it is shifted off the bottom then the result is undefined.
+ if (match(V, m_LShr(m_SignMask(), m_Value())))
return true;
// The remaining tests are all recursive, so bail out if we hit the limit.
@@ -1697,18 +1665,18 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
return true;
unsigned BitWidth = V->getType()->getScalarSizeInBits();
- APInt LHSZeroBits(BitWidth, 0), LHSOneBits(BitWidth, 0);
- computeKnownBits(X, LHSZeroBits, LHSOneBits, Depth, Q);
+ KnownBits LHSBits(BitWidth);
+ computeKnownBits(X, LHSBits, Depth, Q);
- APInt RHSZeroBits(BitWidth, 0), RHSOneBits(BitWidth, 0);
- computeKnownBits(Y, RHSZeroBits, RHSOneBits, Depth, Q);
+ KnownBits RHSBits(BitWidth);
+ computeKnownBits(Y, RHSBits, Depth, Q);
// If i8 V is a power of two or zero:
// ZeroBits: 1 1 1 0 1 1 1 1
// ~ZeroBits: 0 0 0 1 0 0 0 0
- if ((~(LHSZeroBits & RHSZeroBits)).isPowerOf2())
+ if ((~(LHSBits.Zero & RHSBits.Zero)).isPowerOf2())
// If OrZero isn't set, we cannot give back a zero result.
// Make sure either the LHS or RHS has a bit set.
- if (OrZero || RHSOneBits.getBoolValue() || LHSOneBits.getBoolValue())
+ if (OrZero || RHSBits.One.getBoolValue() || LHSBits.One.getBoolValue())
return true;
}
}
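
The power-of-two test above works because bits known zero in either operand
are zero in X & Y; if only one bit position remains possible, the AND is a
power of two or zero. An editorial spot check on 8-bit values (not part of
this patch), matching the ZeroBits example in the comment:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t LHSZero = 0xEF, RHSZero = 0xEF;       // only bit 4 may be set
      uint8_t Possible = (uint8_t)~(LHSZero & RHSZero);
      assert(Possible == 0x10);                     // a single candidate bit
      assert((Possible & (Possible - 1)) == 0);     // hence a power of two
    }
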
@@ -1805,10 +1773,12 @@ static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value)
return true;
}
-/// Return true if the given value is known to be non-zero when defined.
-/// For vectors return true if every element is known to be non-zero when
-/// defined. Supports values with integer or pointer type and vectors of
-/// integers.
+/// Return true if the given value is known to be non-zero when defined. For
+/// vectors, return true if every element is known to be non-zero when
+/// defined. For pointers, if the context instruction and dominator tree are
+/// specified, perform context-sensitive analysis and return true if the
+/// pointer couldn't possibly be null at the specified instruction.
+/// Supports values with integer or pointer type and vectors of integers.
bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (auto *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
@@ -1851,7 +1821,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
// Check for pointer simplifications.
if (V->getType()->isPointerTy()) {
- if (isKnownNonNull(V))
+ if (isKnownNonNullAt(V, Q.CxtI, Q.DT))
return true;
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V))
if (isGEPKnownNonNull(GEP, Depth, Q))
@@ -1871,16 +1841,15 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
// shl X, Y != 0 if X is odd. Note that the value of the shift is undefined
// if the lowest bit is shifted off the end.
- if (BitWidth && match(V, m_Shl(m_Value(X), m_Value(Y)))) {
+ if (match(V, m_Shl(m_Value(X), m_Value(Y)))) {
// shl nuw can't remove any non-zero bits.
const OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
if (BO->hasNoUnsignedWrap())
return isKnownNonZero(X, Depth, Q);
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(X, KnownZero, KnownOne, Depth, Q);
- if (KnownOne[0])
+ KnownBits Known(BitWidth);
+ computeKnownBits(X, Known, Depth, Q);
+ if (Known.One[0])
return true;
}
// shr X, Y != 0 if X is negative. Note that the value of the shift is not
@@ -1891,25 +1860,20 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (BO->isExact())
return isKnownNonZero(X, Depth, Q);
- bool XKnownNonNegative, XKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- if (XKnownNegative)
+ KnownBits Known = computeKnownBits(X, Depth, Q);
+ if (Known.isNegative())
return true;
// If the shifter operand is a constant, and all of the bits shifted
// out are known to be zero, and X is known non-zero then at least one
// non-zero bit must remain.
if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) {
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(X, KnownZero, KnownOne, Depth, Q);
-
auto ShiftVal = Shift->getLimitedValue(BitWidth - 1);
// Is there a known one in the portion not shifted out?
- if (KnownOne.countLeadingZeros() < BitWidth - ShiftVal)
+ if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal)
return true;
// Are all the bits to be shifted out known zero?
- if (KnownZero.countTrailingOnes() >= ShiftVal)
+ if (Known.countMinTrailingZeros() >= ShiftVal)
return isKnownNonZero(X, Depth, Q);
}
}
@@ -1919,40 +1883,34 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
}
// X + Y.
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
- bool XKnownNonNegative, XKnownNegative;
- bool YKnownNonNegative, YKnownNegative;
- ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q);
- ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q);
+ KnownBits XKnown = computeKnownBits(X, Depth, Q);
+ KnownBits YKnown = computeKnownBits(Y, Depth, Q);
// If X and Y are both non-negative (as signed values) then their sum is not
// zero unless both X and Y are zero.
- if (XKnownNonNegative && YKnownNonNegative)
+ if (XKnown.isNonNegative() && YKnown.isNonNegative())
if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q))
return true;
// If X and Y are both negative (as signed values) then their sum is not
// zero unless both X and Y equal INT_MIN.
- if (BitWidth && XKnownNegative && YKnownNegative) {
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
+ if (XKnown.isNegative() && YKnown.isNegative()) {
APInt Mask = APInt::getSignedMaxValue(BitWidth);
// The sign bit of X is set. If some other bit is set then X is not equal
// to INT_MIN.
- computeKnownBits(X, KnownZero, KnownOne, Depth, Q);
- if ((KnownOne & Mask) != 0)
+ if (XKnown.One.intersects(Mask))
return true;
// The sign bit of Y is set. If some other bit is set then Y is not equal
// to INT_MIN.
- computeKnownBits(Y, KnownZero, KnownOne, Depth, Q);
- if ((KnownOne & Mask) != 0)
+ if (YKnown.One.intersects(Mask))
return true;
}
// The sum of a non-negative number and a power of two is not zero.
- if (XKnownNonNegative &&
+ if (XKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q))
return true;
- if (YKnownNonNegative &&
+ if (YKnown.isNonNegative() &&
isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q))
return true;
}
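
The both-negative branch above hinges on INT_MIN being the only negative
value whose doubling wraps to zero. A standalone editorial check using
unsigned arithmetic to model the wrapping add:

    #include <cassert>
    #include <cstdint>
    #include <limits>

    int main() {
      uint32_t M = (uint32_t)std::numeric_limits<int32_t>::min();
      assert(M + M == 0);            // INT_MIN + INT_MIN wraps to zero
      assert(M + (M + 1) != 0);      // any other negative pair is non-zero
    }
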
@@ -1992,17 +1950,15 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
}
// Check if all incoming values are non-zero constant.
bool AllNonZeroConstants = all_of(PN->operands(), [](Value *V) {
- return isa<ConstantInt>(V) && !cast<ConstantInt>(V)->isZeroValue();
+ return isa<ConstantInt>(V) && !cast<ConstantInt>(V)->isZero();
});
if (AllNonZeroConstants)
return true;
}
- if (!BitWidth) return false;
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(V, KnownZero, KnownOne, Depth, Q);
- return KnownOne != 0;
+ KnownBits Known(BitWidth);
+ computeKnownBits(V, Known, Depth, Q);
+ return Known.One != 0;
}
/// Return true if V2 == V1 + X, where X is known non-zero.
@@ -2022,7 +1978,7 @@ static bool isAddOfNonZero(const Value *V1, const Value *V2, const Query &Q) {
/// Return true if it is known that V1 != V2.
static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) {
- if (V1->getType()->isVectorTy() || V1 == V2)
+ if (V1 == V2)
return false;
if (V1->getType() != V2->getType())
// We can't look through casts yet.
@@ -2030,19 +1986,14 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) {
if (isAddOfNonZero(V1, V2, Q) || isAddOfNonZero(V2, V1, Q))
return true;
- if (IntegerType *Ty = dyn_cast<IntegerType>(V1->getType())) {
+ if (V1->getType()->isIntOrIntVectorTy()) {
// Are any known bits in V1 contradictory to known bits in V2? If V1
// has a known zero where V2 has a known one, they must not be equal.
- auto BitWidth = Ty->getBitWidth();
- APInt KnownZero1(BitWidth, 0);
- APInt KnownOne1(BitWidth, 0);
- computeKnownBits(V1, KnownZero1, KnownOne1, 0, Q);
- APInt KnownZero2(BitWidth, 0);
- APInt KnownOne2(BitWidth, 0);
- computeKnownBits(V2, KnownZero2, KnownOne2, 0, Q);
-
- auto OppositeBits = (KnownZero1 & KnownOne2) | (KnownZero2 & KnownOne1);
- if (OppositeBits.getBoolValue())
+ KnownBits Known1 = computeKnownBits(V1, 0, Q);
+ KnownBits Known2 = computeKnownBits(V2, 0, Q);
+
+ if (Known1.Zero.intersects(Known2.One) ||
+ Known2.Zero.intersects(Known1.One))
return true;
}
return false;
@@ -2059,9 +2010,9 @@ static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) {
/// for all of the elements in the vector.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth,
const Query &Q) {
- APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
- computeKnownBits(V, KnownZero, KnownOne, Depth, Q);
- return (KnownZero & Mask) == Mask;
+ KnownBits Known(Mask.getBitWidth());
+ computeKnownBits(V, Known, Depth, Q);
+ return Mask.isSubsetOf(Known.Zero);
}
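
The rewritten predicate is equivalent to the old one: Mask.isSubsetOf(Zero)
asks whether every Mask bit is known zero. An editorial sketch of the test
on plain 32-bit masks (not part of this patch):

    #include <cassert>
    #include <cstdint>

    bool maskedValueIsZero(uint32_t Mask, uint32_t KnownZero) {
      // isSubsetOf-style test; identical to (KnownZero & Mask) == Mask.
      return (Mask & ~KnownZero) == 0;
    }

    int main() {
      assert(maskedValueIsZero(0x0F, 0xFF));    // low nibble known zero
      assert(!maskedValueIsZero(0x10, 0x0F));   // bit 4 is not known zero
    }
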
/// For vector constants, loop over the elements and find the constant with the
@@ -2092,13 +2043,29 @@ static unsigned computeNumSignBitsVectorConstant(const Value *V,
return MinSignBits;
}
+static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
+ const Query &Q);
+
+static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
+ const Query &Q) {
+ unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q);
+ assert(Result > 0 && "At least one sign bit needs to be present!");
+ return Result;
+}
+
/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign bit
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
/// other, so we return 3. For vectors, return the number of sign bits for the
/// vector element with the minimum number of known sign bits.
-unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
+static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
+ const Query &Q) {
+
+ // We return the minimum number of sign bits that are guaranteed to be present
+ // in V, so for undef we have to conservatively return 1. We don't have the
+ // same behavior for poison though -- that's a FIXME today.
+
unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
@@ -2174,7 +2141,10 @@ unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
// ashr X, C -> adds C sign bits. Vectors too.
const APInt *ShAmt;
if (match(U->getOperand(1), m_APInt(ShAmt))) {
- Tmp += ShAmt->getZExtValue();
+ unsigned ShAmtLimited = ShAmt->getZExtValue();
+ if (ShAmtLimited >= TyBits)
+ break; // Bad shift.
+ Tmp += ShAmtLimited;
if (Tmp > TyBits) Tmp = TyBits;
}
return Tmp;
@@ -2220,17 +2190,17 @@ unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
// Special case decrementing a value (ADD X, -1):
if (const auto *CRHS = dyn_cast<Constant>(U->getOperand(1)))
if (CRHS->isAllOnesValue()) {
- APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
- computeKnownBits(U->getOperand(0), KnownZero, KnownOne, Depth + 1, Q);
+ KnownBits Known(TyBits);
+ computeKnownBits(U->getOperand(0), Known, Depth + 1, Q);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
- if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnesValue())
return TyBits;
// If we are subtracting one from a positive number, there is no carry
// out of the result.
- if (KnownZero.isNegative())
+ if (Known.isNonNegative())
return Tmp;
}
@@ -2245,16 +2215,16 @@ unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
// Handle NEG.
if (const auto *CLHS = dyn_cast<Constant>(U->getOperand(0)))
if (CLHS->isNullValue()) {
- APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
- computeKnownBits(U->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
+ KnownBits Known(TyBits);
+ computeKnownBits(U->getOperand(1), Known, Depth + 1, Q);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
- if ((KnownZero | APInt(TyBits, 1)).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnesValue())
return TyBits;
// If the input is known to be positive (the sign bit is known clear),
// the output of the NEG has the same number of sign bits as the input.
- if (KnownZero.isNegative())
+ if (Known.isNonNegative())
return Tmp2;
// Otherwise, we treat this like a SUB.
@@ -2306,19 +2276,12 @@ unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits))
return VecSignBits;
- APInt KnownZero(TyBits, 0), KnownOne(TyBits, 0);
- computeKnownBits(V, KnownZero, KnownOne, Depth, Q);
+ KnownBits Known(TyBits);
+ computeKnownBits(V, Known, Depth, Q);
// If we know that the sign bit is either zero or one, determine the number of
// identical bits in the top of the input value.
- if (KnownZero.isNegative())
- return std::max(FirstAnswer, KnownZero.countLeadingOnes());
-
- if (KnownOne.isNegative())
- return std::max(FirstAnswer, KnownOne.countLeadingOnes());
-
- // computeKnownBits gave us no extra information about the top bits.
- return FirstAnswer;
+ return std::max(FirstAnswer, Known.countMinSignBits());
}
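
The final countMinSignBits() call folds the two old branches (counting
leading ones of KnownZero or of KnownOne) into one query. An editorial 8-bit
model of that count (minSignBits8 is invented, not the KnownBits API):

    #include <cassert>
    #include <cstdint>

    unsigned minSignBits8(uint8_t KnownZero, uint8_t KnownOne) {
      bool SignZ = (KnownZero >> 7) & 1, SignO = (KnownOne >> 7) & 1;
      unsigned N = 0;
      for (int Bit = 7; Bit >= 0; --Bit) {
        bool Z = (KnownZero >> Bit) & 1, O = (KnownOne >> Bit) & 1;
        if ((SignZ && Z) || (SignO && O))
          ++N;              // this bit is known equal to the sign bit
        else
          break;
      }
      return N ? N : 1;     // at least the sign bit itself
    }

    int main() {
      assert(minSignBits8(0x00, 0xE0) == 3);  // top bits 111x'xxxx
      assert(minSignBits8(0xC0, 0x00) == 2);  // top bits 00xx'xxxx
    }
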
/// This function computes the integer multiple of Base that equals V.
@@ -2368,6 +2331,7 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
case Instruction::SExt:
if (!LookThroughSExt) return false;
// otherwise fall through to ZExt
+ LLVM_FALLTHROUGH;
case Instruction::ZExt:
return ComputeMultiple(I->getOperand(0), Base, Multiple,
LookThroughSExt, Depth+1);
@@ -2453,7 +2417,7 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
if (!TLI)
return Intrinsic::not_intrinsic;
- LibFunc::Func Func;
+ LibFunc Func;
  // We're going to make assumptions on the semantics of the functions, so
  // check that the target knows that it's available in this environment and
  // it does not have local linkage.
@@ -2468,81 +2432,81 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
switch (Func) {
default:
break;
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::sinl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinl:
return Intrinsic::sin;
- case LibFunc::cos:
- case LibFunc::cosf:
- case LibFunc::cosl:
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosl:
return Intrinsic::cos;
- case LibFunc::exp:
- case LibFunc::expf:
- case LibFunc::expl:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
return Intrinsic::exp;
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
return Intrinsic::exp2;
- case LibFunc::log:
- case LibFunc::logf:
- case LibFunc::logl:
+ case LibFunc_log:
+ case LibFunc_logf:
+ case LibFunc_logl:
return Intrinsic::log;
- case LibFunc::log10:
- case LibFunc::log10f:
- case LibFunc::log10l:
+ case LibFunc_log10:
+ case LibFunc_log10f:
+ case LibFunc_log10l:
return Intrinsic::log10;
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log2l:
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log2l:
return Intrinsic::log2;
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
return Intrinsic::fabs;
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
return Intrinsic::minnum;
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
return Intrinsic::maxnum;
- case LibFunc::copysign:
- case LibFunc::copysignf:
- case LibFunc::copysignl:
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ case LibFunc_copysignl:
return Intrinsic::copysign;
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
return Intrinsic::floor;
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
return Intrinsic::ceil;
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
return Intrinsic::trunc;
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
return Intrinsic::rint;
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
return Intrinsic::nearbyint;
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
return Intrinsic::round;
- case LibFunc::pow:
- case LibFunc::powf:
- case LibFunc::powl:
+ case LibFunc_pow:
+ case LibFunc_powf:
+ case LibFunc_powl:
return Intrinsic::pow;
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
if (ICS->hasNoNaNs())
return Intrinsic::sqrt;
return Intrinsic::not_intrinsic;
@@ -2607,6 +2571,11 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
const TargetLibraryInfo *TLI,
bool SignBitOnly,
unsigned Depth) {
+ // TODO: This function does not do the right thing when SignBitOnly is true
+ // and we're lowering to a hypothetical IEEE 754-compliant-but-evil platform
+ // which flips the sign bits of NaNs. See
+ // https://llvm.org/bugs/show_bug.cgi?id=31702.
+
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
return !CFP->getValueAPF().isNegative() ||
(!SignBitOnly && CFP->getValueAPF().isZero());
@@ -2650,7 +2619,8 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1);
case Instruction::Call:
- Intrinsic::ID IID = getIntrinsicForCallSite(cast<CallInst>(I), TLI);
+ const auto *CI = cast<CallInst>(I);
+ Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
switch (IID) {
default:
break;
@@ -2667,16 +2637,37 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::fabs:
- case Intrinsic::sqrt:
return true;
+
+ case Intrinsic::sqrt:
+ // sqrt(x) is always >= -0 or NaN. Moreover, sqrt(x) == -0 iff x == -0.
+ if (!SignBitOnly)
+ return true;
+ return CI->hasNoNaNs() && (CI->hasNoSignedZeros() ||
+ CannotBeNegativeZero(CI->getOperand(0), TLI));
+
case Intrinsic::powi:
- if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (ConstantInt *Exponent = dyn_cast<ConstantInt>(I->getOperand(1))) {
// powi(x,n) is non-negative if n is even.
- if (CI->getBitWidth() <= 64 && CI->getSExtValue() % 2u == 0)
+ if (Exponent->getBitWidth() <= 64 && Exponent->getSExtValue() % 2u == 0)
return true;
}
+ // TODO: This is not correct. Given that exp is an integer, here are the
+ // ways that pow can return a negative value:
+ //
+ // pow(x, exp) --> negative if exp is odd and x is negative.
+ // pow(-0, exp) --> -inf if exp is negative odd.
+ // pow(-0, exp) --> -0 if exp is positive odd.
+ // pow(-inf, exp) --> -0 if exp is negative odd.
+ // pow(-inf, exp) --> -inf if exp is positive odd.
+ //
+ // Therefore, if !SignBitOnly, we can return true if x >= +0 or x is NaN,
+ // but we must return false if x == -0. Unfortunately we do not currently
+ // have a way of expressing this constraint. See details in
+ // https://llvm.org/bugs/show_bug.cgi?id=31702.
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1);
+
case Intrinsic::fma:
case Intrinsic::fmuladd:
// x*x+y is non-negative if y is non-negative.
@@ -2970,14 +2961,16 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
return Ptr;
}
-bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP) {
+bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP,
+ unsigned CharSize) {
// Make sure the GEP has exactly three arguments.
if (GEP->getNumOperands() != 3)
return false;
- // Make sure the index-ee is a pointer to array of i8.
+  // Make sure the index-ee is a pointer to an array of \p CharSize integers.
ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType());
- if (!AT || !AT->getElementType()->isIntegerTy(8))
+ if (!AT || !AT->getElementType()->isIntegerTy(CharSize))
return false;
// Check to make sure that the first operand of the GEP is an integer and
@@ -2989,11 +2982,9 @@ bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP) {
return true;
}
-/// This function computes the length of a null-terminated C string pointed to
-/// by V. If successful, it returns true and returns the string in Str.
-/// If unsuccessful, it returns false.
-bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
- uint64_t Offset, bool TrimAtNul) {
+bool llvm::getConstantDataArrayInfo(const Value *V,
+ ConstantDataArraySlice &Slice,
+ unsigned ElementSize, uint64_t Offset) {
assert(V);
// Look through bitcast instructions and geps.
@@ -3004,7 +2995,7 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
// The GEP operator should be based on a pointer to string constant, and is
// indexing into the string constant.
- if (!isGEPBasedOnPointerToString(GEP))
+ if (!isGEPBasedOnPointerToString(GEP, ElementSize))
return false;
// If the second index isn't a ConstantInt, then this is a variable index
@@ -3015,8 +3006,8 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
StartIdx = CI->getZExtValue();
else
return false;
- return getConstantStringInfo(GEP->getOperand(0), Str, StartIdx + Offset,
- TrimAtNul);
+ return getConstantDataArrayInfo(GEP->getOperand(0), Slice, ElementSize,
+ StartIdx + Offset);
}
// The GEP instruction, constant or instruction, must reference a global
@@ -3026,30 +3017,72 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
return false;
- // Handle the all-zeros case.
+ const ConstantDataArray *Array;
+ ArrayType *ArrayTy;
if (GV->getInitializer()->isNullValue()) {
- // This is a degenerate case. The initializer is constant zero so the
- // length of the string must be zero.
- Str = "";
- return true;
+ Type *GVTy = GV->getValueType();
+    if ((ArrayTy = dyn_cast<ArrayType>(GVTy))) {
+ // A zeroinitializer for the array; there is no ConstantDataArray.
+ Array = nullptr;
+ } else {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy);
+ uint64_t Length = SizeInBytes / (ElementSize / 8);
+ if (Length <= Offset)
+ return false;
+
+ Slice.Array = nullptr;
+ Slice.Offset = 0;
+ Slice.Length = Length - Offset;
+ return true;
+ }
+ } else {
+ // This must be a ConstantDataArray.
+ Array = dyn_cast<ConstantDataArray>(GV->getInitializer());
+ if (!Array)
+ return false;
+ ArrayTy = Array->getType();
}
+ if (!ArrayTy->getElementType()->isIntegerTy(ElementSize))
+ return false;
- // This must be a ConstantDataArray.
- const auto *Array = dyn_cast<ConstantDataArray>(GV->getInitializer());
- if (!Array || !Array->isString())
+ uint64_t NumElts = ArrayTy->getArrayNumElements();
+ if (Offset > NumElts)
return false;
- // Get the number of elements in the array.
- uint64_t NumElts = Array->getType()->getArrayNumElements();
+ Slice.Array = Array;
+ Slice.Offset = Offset;
+ Slice.Length = NumElts - Offset;
+ return true;
+}
- // Start out with the entire array in the StringRef.
- Str = Array->getAsString();
+/// This function computes the length of a null-terminated C string pointed to
+/// by V. If successful, it returns true and returns the string in Str.
+/// If unsuccessful, it returns false.
+bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
+ uint64_t Offset, bool TrimAtNul) {
+ ConstantDataArraySlice Slice;
+ if (!getConstantDataArrayInfo(V, Slice, 8, Offset))
+ return false;
- if (Offset > NumElts)
+ if (Slice.Array == nullptr) {
+ if (TrimAtNul) {
+ Str = StringRef();
+ return true;
+ }
+ if (Slice.Length == 1) {
+ Str = StringRef("", 1);
+ return true;
+ }
+ // We cannot instantiate a StringRef as we do not have an appropriate string
+ // of 0s at hand.
return false;
+ }
+ // Start out with the entire array in the StringRef.
+ Str = Slice.Array->getAsString();
// Skip over 'offset' bytes.
- Str = Str.substr(Offset);
+ Str = Str.substr(Slice.Offset);
if (TrimAtNul) {
// Trim off the \0 and anything after it. If the array is not nul
@@ -3067,7 +3100,8 @@ bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
/// If we can compute the length of the string pointed to by
/// the specified pointer, return 'len+1'. If we can't, return 0.
static uint64_t GetStringLengthH(const Value *V,
- SmallPtrSetImpl<const PHINode*> &PHIs) {
+ SmallPtrSetImpl<const PHINode*> &PHIs,
+ unsigned CharSize) {
// Look through noop bitcast instructions.
V = V->stripPointerCasts();
@@ -3080,7 +3114,7 @@ static uint64_t GetStringLengthH(const Value *V,
// If it was new, see if all the input strings are the same length.
uint64_t LenSoFar = ~0ULL;
for (Value *IncValue : PN->incoming_values()) {
- uint64_t Len = GetStringLengthH(IncValue, PHIs);
+ uint64_t Len = GetStringLengthH(IncValue, PHIs, CharSize);
if (Len == 0) return 0; // Unknown length -> unknown.
if (Len == ~0ULL) continue;
@@ -3096,9 +3130,9 @@ static uint64_t GetStringLengthH(const Value *V,
// strlen(select(c,x,y)) -> strlen(x) ^ strlen(y)
if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
- uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs);
+ uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs, CharSize);
if (Len1 == 0) return 0;
- uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs);
+ uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs, CharSize);
if (Len2 == 0) return 0;
if (Len1 == ~0ULL) return Len2;
if (Len2 == ~0ULL) return Len1;
@@ -3107,20 +3141,30 @@ static uint64_t GetStringLengthH(const Value *V,
}
// Otherwise, see if we can read the string.
- StringRef StrData;
- if (!getConstantStringInfo(V, StrData))
+ ConstantDataArraySlice Slice;
+ if (!getConstantDataArrayInfo(V, Slice, CharSize))
return 0;
- return StrData.size()+1;
+ if (Slice.Array == nullptr)
+ return 1;
+
+  // Search for the first nul character.
+ unsigned NullIndex = 0;
+ for (unsigned E = Slice.Length; NullIndex < E; ++NullIndex) {
+ if (Slice.Array->getElementAsInteger(Slice.Offset + NullIndex) == 0)
+ break;
+ }
+
+ return NullIndex + 1;
}
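
The scan above generalizes strlen to arbitrary element sizes: the result is
the index of the first nul plus one for the terminator. A minimal editorial
model over bytes (not part of this patch):

    #include <cassert>
    #include <cstdint>

    unsigned lengthPlusNul(const uint8_t *Data, unsigned Len) {
      unsigned I = 0;
      while (I < Len && Data[I] != 0)
        ++I;
      return I + 1;         // count the terminator, as GetStringLengthH does
    }

    int main() {
      const uint8_t S[] = {'h', 'i', 0, 'x'};
      assert(lengthPlusNul(S, 4) == 3);       // "hi" -> strlen("hi") + 1
    }
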
/// If we can compute the length of the string pointed to by
/// the specified pointer, return 'len+1'. If we can't, return 0.
-uint64_t llvm::GetStringLength(const Value *V) {
+uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) {
if (!V->getType()->isPointerTy()) return 0;
SmallPtrSet<const PHINode*, 32> PHIs;
- uint64_t Len = GetStringLengthH(V, PHIs);
+ uint64_t Len = GetStringLengthH(V, PHIs, CharSize);
// If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return
// an empty string as a length.
return Len == ~0ULL ? 1 : Len;
@@ -3167,6 +3211,9 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
if (GA->isInterposable())
return V;
V = GA->getAliasee();
+ } else if (isa<AllocaInst>(V)) {
+ // An alloca can't be further simplified.
+ return V;
} else {
if (auto CS = CallSite(V))
if (Value *RV = CS.getReturnedArgOperand()) {
@@ -3177,7 +3224,7 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
// See if InstructionSimplify knows any relevant tricks.
if (Instruction *I = dyn_cast<Instruction>(V))
// TODO: Acquire a DominatorTree and AssumptionCache and use them.
- if (Value *Simplified = SimplifyInstruction(I, DL, nullptr)) {
+ if (Value *Simplified = SimplifyInstruction(I, {DL, I})) {
V = Simplified;
continue;
}
@@ -3230,6 +3277,69 @@ void llvm::GetUnderlyingObjects(Value *V, SmallVectorImpl<Value *> &Objects,
} while (!Worklist.empty());
}
+/// This is the function that does the work of looking through basic
+/// ptrtoint+arithmetic+inttoptr sequences.
+static const Value *getUnderlyingObjectFromInt(const Value *V) {
+ do {
+ if (const Operator *U = dyn_cast<Operator>(V)) {
+ // If we find a ptrtoint, we can transfer control back to the
+ // regular getUnderlyingObjectFromInt.
+ if (U->getOpcode() == Instruction::PtrToInt)
+ return U->getOperand(0);
+ // If we find an add of a constant, a multiplied value, or a phi, it's
+ // likely that the other operand will lead us to the base
+ // object. We don't have to worry about the case where the
+ // object address is somehow being computed by the multiply,
+ // because our callers only care when the result is an
+ // identifiable object.
+ if (U->getOpcode() != Instruction::Add ||
+ (!isa<ConstantInt>(U->getOperand(1)) &&
+ Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
+ !isa<PHINode>(U->getOperand(1))))
+ return V;
+ V = U->getOperand(0);
+ } else {
+ return V;
+ }
+ assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
+ } while (true);
+}
+
+/// This is a wrapper around GetUnderlyingObjects and adds support for basic
+/// ptrtoint+arithmetic+inttoptr sequences.
+void llvm::getUnderlyingObjectsForCodeGen(const Value *V,
+ SmallVectorImpl<Value *> &Objects,
+ const DataLayout &DL) {
+ SmallPtrSet<const Value *, 16> Visited;
+ SmallVector<const Value *, 4> Working(1, V);
+ do {
+ V = Working.pop_back_val();
+
+ SmallVector<Value *, 4> Objs;
+ GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
+
+ for (Value *V : Objs) {
+ if (!Visited.insert(V).second)
+ continue;
+ if (Operator::getOpcode(V) == Instruction::IntToPtr) {
+ const Value *O =
+ getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
+ if (O->getType()->isPointerTy()) {
+ Working.push_back(O);
+ continue;
+ }
+ }
+ // If GetUnderlyingObjects fails to find an identifiable object,
+ // getUnderlyingObjectsForCodeGen also fails for safety.
+ if (!isIdentifiedObject(V)) {
+ Objects.clear();
+ return;
+ }
+ Objects.push_back(const_cast<Value *>(V));
+ }
+ } while (!Working.empty());
+}
+
/// Return true if the only users of this pointer are lifetime markers.
bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
for (const User *U : V->users()) {
@@ -3298,59 +3408,12 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
LI->getAlignment(), DL, CtxI, DT);
}
case Instruction::Call: {
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- switch (II->getIntrinsicID()) {
- // These synthetic intrinsics have no side-effects and just mark
- // information about their operands.
- // FIXME: There are other no-op synthetic instructions that potentially
- // should be considered at least *safe* to speculate...
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_value:
- return true;
+ auto *CI = cast<const CallInst>(Inst);
+ const Function *Callee = CI->getCalledFunction();
- case Intrinsic::bitreverse:
- case Intrinsic::bswap:
- case Intrinsic::ctlz:
- case Intrinsic::ctpop:
- case Intrinsic::cttz:
- case Intrinsic::objectsize:
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::smul_with_overflow:
- case Intrinsic::ssub_with_overflow:
- case Intrinsic::uadd_with_overflow:
- case Intrinsic::umul_with_overflow:
- case Intrinsic::usub_with_overflow:
- return true;
- // These intrinsics are defined to have the same behavior as libm
- // functions except for setting errno.
- case Intrinsic::sqrt:
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- return true;
- // These intrinsics are defined to have the same behavior as libm
- // functions, and the corresponding libm functions never set errno.
- case Intrinsic::trunc:
- case Intrinsic::copysign:
- case Intrinsic::fabs:
- case Intrinsic::minnum:
- case Intrinsic::maxnum:
- return true;
- // These intrinsics are defined to have the same behavior as libm
- // functions, which never overflow when operating on the IEEE754 types
- // that we support, and never set errno otherwise.
- case Intrinsic::ceil:
- case Intrinsic::floor:
- case Intrinsic::nearbyint:
- case Intrinsic::rint:
- case Intrinsic::round:
- return true;
- // TODO: are convert_{from,to}_fp16 safe?
- // TODO: can we list target-specific intrinsics here?
- default: break;
- }
- }
- return false; // The called function could have undefined behavior or
- // side-effects, even if marked readnone nounwind.
+ // The called function could have undefined behavior or side-effects, even
+ // if marked readnone nounwind.
+ return Callee && Callee->isSpeculatable();
}
case Instruction::VAArg:
case Instruction::Alloca:
@@ -3423,6 +3486,16 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
if (NumUsesExplored >= DomConditionsMaxUses)
break;
NumUsesExplored++;
+
+ // If the value is used as an argument to a call or invoke, then argument
+ // attributes may provide an answer about null-ness.
+ if (auto CS = ImmutableCallSite(U))
+ if (auto *CalledFunc = CS.getCalledFunction())
+ for (const Argument &Arg : CalledFunc->args())
+ if (CS.getArgOperand(Arg.getArgNo()) == V &&
+ Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI))
+ return true;
+
// Consider only compare instructions uniquely controlling a branch
CmpInst::Predicate Pred;
if (!match(const_cast<User *>(U),
@@ -3477,38 +3550,34 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
// we can guarantee that the result does not overflow.
// Ref: "Hacker's Delight" by Henry Warren
unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
- APInt LHSKnownZero(BitWidth, 0);
- APInt LHSKnownOne(BitWidth, 0);
- APInt RHSKnownZero(BitWidth, 0);
- APInt RHSKnownOne(BitWidth, 0);
- computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, DL, /*Depth=*/0, AC, CxtI,
- DT);
- computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, DL, /*Depth=*/0, AC, CxtI,
- DT);
+ KnownBits LHSKnown(BitWidth);
+ KnownBits RHSKnown(BitWidth);
+ computeKnownBits(LHS, LHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
+ computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
// Note that underestimating the number of zero bits gives a more
// conservative answer.
- unsigned ZeroBits = LHSKnownZero.countLeadingOnes() +
- RHSKnownZero.countLeadingOnes();
+ unsigned ZeroBits = LHSKnown.countMinLeadingZeros() +
+ RHSKnown.countMinLeadingZeros();
// First handle the easy case: if we have enough zero bits there's
// definitely no overflow.
if (ZeroBits >= BitWidth)
return OverflowResult::NeverOverflows;
// Get the largest possible values for each operand.
- APInt LHSMax = ~LHSKnownZero;
- APInt RHSMax = ~RHSKnownZero;
+ APInt LHSMax = ~LHSKnown.Zero;
+ APInt RHSMax = ~RHSKnown.Zero;
// We know the multiply operation doesn't overflow if the maximum values for
// each operand will not overflow after we multiply them together.
bool MaxOverflow;
- LHSMax.umul_ov(RHSMax, MaxOverflow);
+ (void)LHSMax.umul_ov(RHSMax, MaxOverflow);
if (!MaxOverflow)
return OverflowResult::NeverOverflows;
// We know it always overflows if multiplying the smallest possible values for
// the operands also results in overflow.
bool MinOverflow;
- LHSKnownOne.umul_ov(RHSKnownOne, MinOverflow);
+ (void)LHSKnown.One.umul_ov(RHSKnown.One, MinOverflow);
if (MinOverflow)
return OverflowResult::AlwaysOverflows;
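
The leading-zero criterion above is the classic width check: if the minimum
leading-zero counts of the operands sum to at least the bit width, even the
maximal values multiply without wrapping. An editorial spot check:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Two operands that each fit in 16 bits have >= 32 leading zeros in
      // total, and their largest product still fits in 32 bits.
      uint64_t Prod = (uint64_t)0xFFFFu * 0xFFFFu;   // 0xFFFE0001
      assert(Prod <= UINT32_MAX);
    }
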
@@ -3521,21 +3590,17 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
- bool LHSKnownNonNegative, LHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- if (LHSKnownNonNegative || LHSKnownNegative) {
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
-
- if (LHSKnownNegative && RHSKnownNegative) {
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+
+ if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
// The sign bit is set in both cases: this MUST overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::AlwaysOverflows;
}
- if (LHSKnownNonNegative && RHSKnownNonNegative) {
+ if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
// The sign bit is clear in both cases: this CANNOT overflow.
// Create a simple add instruction, and insert it into the struct.
return OverflowResult::NeverOverflows;
@@ -3545,6 +3610,51 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
return OverflowResult::MayOverflow;
}
+/// \brief Return true if we can prove that adding the two values described by
+/// the given KnownBits will not overflow.
+/// Otherwise return false.
+static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
+ const KnownBits &RHSKnown) {
+ // Addition of two 2's complement numbers having opposite signs will never
+ // overflow.
+ if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
+ return true;
+
+ // If either of the values is known to be non-negative, adding them can only
+ // overflow if the second is also non-negative, so we can assume that.
+ // Two non-negative numbers will only overflow if there is a carry to the
+ // sign bit, so we can check if even when the values are as big as possible
+ // there is no overflow to the sign bit.
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
+ APInt MaxLHS = ~LHSKnown.Zero;
+ MaxLHS.clearSignBit();
+ APInt MaxRHS = ~RHSKnown.Zero;
+ MaxRHS.clearSignBit();
+ APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
+ return Result.isSignBitClear();
+ }
+
+ // If either of the values is known to be negative, adding them can only
+ // overflow if the second is also negative, so we can assume that.
+ // Two negative numbers will only overflow if there is no carry to the sign
+ // bit, so we can check if even when the values are as small as possible
+ // there is overflow to the sign bit.
+ if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
+ APInt MinLHS = LHSKnown.One;
+ MinLHS.clearSignBit();
+ APInt MinRHS = RHSKnown.One;
+ MinRHS.clearSignBit();
+ APInt Result = std::move(MinLHS) + std::move(MinRHS);
+ return Result.isSignBitSet();
+ }
+
+ // If we reached here it means that we know nothing about the sign bits.
+ // In this case we can't know if there will be an overflow, since by
+ // changing the sign bits any two values can be made to overflow.
+ return false;
+}
+
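To make the ripple argument concrete, suppose both operands are non-negative i8 values with hypothetical known-zero masks 0xF0 (so LHS <= 0x0F) and 0xC0 (so RHS <= 0x3F). Even the largest possible sum, 0x0F + 0x3F = 0x4E, leaves the sign bit clear, which is exactly what the first APInt computation above proves. A plain-integer sketch of that case:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t LHSKnownZero = 0xF0, RHSKnownZero = 0xC0; // assumed facts

      // Maximum possible values with the sign bit masked off, as in the code.
      uint8_t MaxLHS = ~LHSKnownZero & 0x7F; // 0x0F
      uint8_t MaxRHS = ~RHSKnownZero & 0x7F; // 0x3F

      // No carry ripples into the sign bit even in the worst case.
      assert(((MaxLHS + MaxRHS) & 0x80) == 0);
    }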
static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
const Value *RHS,
const AddOperator *Add,
@@ -3556,18 +3666,29 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
return OverflowResult::NeverOverflows;
}
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
- AC, CxtI, DT);
+ // If LHS and RHS each have at least two sign bits, the addition will look
+ // like
+ //
+ // XX..... +
+ // YY.....
+ //
+ // If the carry into the most significant position is 0, X and Y can't both
+ // be 1 and therefore the carry out of the addition is also 0.
+ //
+ // If the carry into the most significant position is 1, X and Y can't both
+ // be 0 and therefore the carry out of the addition is also 1.
+ //
+ // Since the carry into the most significant position is always equal to
+ // the carry out of the addition, there is no signed overflow.
+ if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
+ ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
+ return OverflowResult::NeverOverflows;
+
+ KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+ KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
- if ((LHSKnownNonNegative && RHSKnownNegative) ||
- (LHSKnownNegative && RHSKnownNonNegative)) {
- // The sign bits are opposite: this CANNOT overflow.
+ if (checkRippleForSignedAdd(LHSKnown, RHSKnown))
return OverflowResult::NeverOverflows;
- }
// The remaining code needs Add to be available. Return early if it is not.
if (!Add)
@@ -3578,14 +3699,13 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
// @llvm.assume'ed non-negative rather than proved so from analyzing its
// operands.
bool LHSOrRHSKnownNonNegative =
- (LHSKnownNonNegative || RHSKnownNonNegative);
- bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative);
+ (LHSKnown.isNonNegative() || RHSKnown.isNonNegative());
+ bool LHSOrRHSKnownNegative =
+ (LHSKnown.isNegative() || RHSKnown.isNegative());
if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
- bool AddKnownNonNegative, AddKnownNegative;
- ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL,
- /*Depth=*/0, AC, CxtI, DT);
- if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) ||
- (AddKnownNegative && LHSOrRHSKnownNegative)) {
+ KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT);
+ if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
+ (AddKnown.isNegative() && LHSOrRHSKnownNegative)) {
return OverflowResult::NeverOverflows;
}
}
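The new ComputeNumSignBits fast path covers, for instance, any addition of two sign-extended i8 values in i32: each has at least 25 sign bits, and their sum stays within [-256, 254], far inside the i32 range. An exhaustive plain-C++ check of that instance:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int A = -128; A <= 127; ++A)
        for (int B = -128; B <= 127; ++B) {
          int64_t Wide = static_cast<int64_t>(A) + B;
          assert(Wide >= INT32_MIN && Wide <= INT32_MAX);
        }
    }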
@@ -3700,6 +3820,8 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
return false;
if (isa<ReturnInst>(I))
return false;
+ if (isa<UnreachableInst>(I))
+ return false;
// Calls can throw, or contain an infinite loop, or kill the process.
if (auto CS = ImmutableCallSite(I)) {
@@ -3748,79 +3870,33 @@ bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
bool llvm::propagatesFullPoison(const Instruction *I) {
switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Xor:
- case Instruction::Trunc:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- // These operations all propagate poison unconditionally. Note that poison
- // is not any particular value, so xor or subtraction of poison with
- // itself still yields poison, not zero.
- return true;
-
- case Instruction::AShr:
- case Instruction::SExt:
- // For these operations, one bit of the input is replicated across
- // multiple output bits. A replicated poison bit is still poison.
- return true;
-
- case Instruction::Shl: {
- // Left shift *by* a poison value is poison. The number of
- // positions to shift is unsigned, so no negative values are
- // possible there. Left shift by zero places preserves poison. So
- // it only remains to consider left shift of poison by a positive
- // number of places.
- //
- // A left shift by a positive number of places leaves the lowest order bit
- // non-poisoned. However, if such a shift has a no-wrap flag, then we can
- // make the poison operand violate that flag, yielding a fresh full-poison
- // value.
- auto *OBO = cast<OverflowingBinaryOperator>(I);
- return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap();
- }
-
- case Instruction::Mul: {
- // A multiplication by zero yields a non-poison zero result, so we need to
- // rule out zero as an operand. Conservatively, multiplication by a
- // non-zero constant is not multiplication by zero.
- //
- // Multiplication by a non-zero constant can leave some bits
- // non-poisoned. For example, a multiplication by 2 leaves the lowest
- // order bit unpoisoned. So we need to consider that.
- //
- // Multiplication by 1 preserves poison. If the multiplication has a
- // no-wrap flag, then we can make the poison operand violate that flag
- // when multiplied by any integer other than 0 and 1.
- auto *OBO = cast<OverflowingBinaryOperator>(I);
- if (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) {
- for (Value *V : OBO->operands()) {
- if (auto *CI = dyn_cast<ConstantInt>(V)) {
- // A ConstantInt cannot yield poison, so we can assume that it is
- // the other operand that is poison.
- return !CI->isZero();
- }
- }
- }
- return false;
- }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Xor:
+ case Instruction::Trunc:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::GetElementPtr:
+ // These operations all propagate poison unconditionally. Note that poison
+ // is not any particular value, so xor or subtraction of poison with
+ // itself still yields poison, not zero.
+ return true;
- case Instruction::ICmp:
- // Comparing poison with any value yields poison. This is why, for
- // instance, x s< (x +nsw 1) can be folded to true.
- return true;
+ case Instruction::AShr:
+ case Instruction::SExt:
+ // For these operations, one bit of the input is replicated across
+ // multiple output bits. A replicated poison bit is still poison.
+ return true;
- case Instruction::GetElementPtr:
- // A GEP implicitly represents a sequence of additions, subtractions,
- // truncations, sign extensions and multiplications. The multiplications
- // are by the non-zero sizes of some set of types, so we do not have to be
- // concerned with multiplication by zero. If the GEP is in-bounds, then
- // these operations are implicitly no-signed-wrap so poison is propagated
- // by the arguments above for Add, Sub, Trunc, SExt and Mul.
- return cast<GEPOperator>(I)->isInBounds();
+ case Instruction::ICmp:
+ // Comparing poison with any value yields poison. This is why, for
+ // instance, x s< (x +nsw 1) can be folded to true.
+ return true;
- default:
- return false;
+ default:
+ return false;
}
}
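A sketch of how a client sees the simplified table, building two instructions with IRBuilder and querying propagatesFullPoison (assumes the post-patch headers; the function and basic block here are scaffolding only):

    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include <cassert>

    using namespace llvm;

    void demo() {
      LLVMContext Ctx;
      Module M("poison-demo", Ctx);
      auto *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                    {Type::getInt32Ty(Ctx)}, false);
      auto *F = Function::Create(FTy, Function::ExternalLinkage, "f", &M);
      IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
      Value *Arg = &*F->arg_begin();

      // After this change, mul reports true unconditionally ...
      auto *Mul = cast<Instruction>(B.CreateMul(Arg, Arg));
      // ... while udiv, which is not in the list, still does not.
      auto *Div = cast<Instruction>(B.CreateUDiv(Arg, Arg));

      assert(propagatesFullPoison(Mul) && !propagatesFullPoison(Div));
    }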
@@ -3849,7 +3925,7 @@ const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) {
}
}
-bool llvm::isKnownNotFullPoison(const Instruction *PoisonI) {
+bool llvm::programUndefinedIfFullPoison(const Instruction *PoisonI) {
// We currently only look for uses of poison values within the same basic
// block, as that makes it easier to guarantee that the uses will be
// executed given that PoisonI is executed.
@@ -3923,6 +3999,37 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
Value *CmpLHS, Value *CmpRHS,
Value *TrueVal, Value *FalseVal,
Value *&LHS, Value *&RHS) {
+ // Assume success. If there's no match, callers should not use these anyway.
+ LHS = TrueVal;
+ RHS = FalseVal;
+
+ // Recognize variations of:
+ // CLAMP(v,l,h) ==> ((v) < (l) ? (l) : ((v) > (h) ? (h) : (v)))
+ const APInt *C1;
+ if (CmpRHS == TrueVal && match(CmpRHS, m_APInt(C1))) {
+ const APInt *C2;
+
+ // (X <s C1) ? C1 : SMIN(X, C2) ==> SMAX(SMIN(X, C2), C1)
+ if (match(FalseVal, m_SMin(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->slt(*C2) && Pred == CmpInst::ICMP_SLT)
+ return {SPF_SMAX, SPNB_NA, false};
+
+ // (X >s C1) ? C1 : SMAX(X, C2) ==> SMIN(SMAX(X, C2), C1)
+ if (match(FalseVal, m_SMax(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->sgt(*C2) && Pred == CmpInst::ICMP_SGT)
+ return {SPF_SMIN, SPNB_NA, false};
+
+ // (X <u C1) ? C1 : UMIN(X, C2) ==> UMAX(UMIN(X, C2), C1)
+ if (match(FalseVal, m_UMin(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->ult(*C2) && Pred == CmpInst::ICMP_ULT)
+ return {SPF_UMAX, SPNB_NA, false};
+
+ // (X >u C1) ? C1 : UMAX(X, C2) ==> UMIN(UMAX(X, C2), C1)
+ if (match(FalseVal, m_UMax(m_Specific(CmpLHS), m_APInt(C2))) &&
+ C1->ugt(*C2) && Pred == CmpInst::ICMP_UGT)
+ return {SPF_UMIN, SPNB_NA, false};
+ }
+
if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT)
return {SPF_UNKNOWN, SPNB_NA, false};
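The clamp recognition encodes an ordinary integer identity: when C1 <s C2, selecting C1 for X < C1 and min(X, C2) otherwise equals max(min(X, C2), C1). A standalone check of the first of the four variants:

    #include <algorithm>
    #include <cassert>

    int main() {
      const int C1 = 10, C2 = 100; // requires C1 <s C2, as matched above
      for (int X = -200; X <= 200; ++X) {
        int Select = X < C1 ? C1 : std::min(X, C2); // (X <s C1) ? C1 : SMIN(X, C2)
        int Clamp = std::max(std::min(X, C2), C1);  // SMAX(SMIN(X, C2), C1)
        assert(Select == Clamp);
      }
    }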
@@ -3930,23 +4037,16 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
// (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0)
// (X <s Y) ? 0 : Z ==> (Z <s 0) ? 0 : Z ==> SMAX(Z, 0)
if (match(TrueVal, m_Zero()) &&
- match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS)))) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
- }
// Z = X -nsw Y
// (X >s Y) ? Z : 0 ==> (Z >s 0) ? Z : 0 ==> SMAX(Z, 0)
// (X <s Y) ? Z : 0 ==> (Z <s 0) ? Z : 0 ==> SMIN(Z, 0)
if (match(FalseVal, m_Zero()) &&
- match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS)))) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};
- }
- const APInt *C1;
if (!match(CmpRHS, m_APInt(C1)))
return {SPF_UNKNOWN, SPNB_NA, false};
@@ -3957,41 +4057,29 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
// Is the sign bit set?
// (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
// (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
- if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue()) {
- LHS = TrueVal;
- RHS = FalseVal;
+ if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue())
return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
- }
// Is the sign bit clear?
// (X >s -1) ? MINVAL : X ==> (X <u MINVAL) ? MINVAL : X ==> UMAX
// (X >s -1) ? X : MINVAL ==> (X <u MINVAL) ? X : MINVAL ==> UMIN
if (Pred == CmpInst::ICMP_SGT && C1->isAllOnesValue() &&
- C2->isMinSignedValue()) {
- LHS = TrueVal;
- RHS = FalseVal;
+ C2->isMinSignedValue())
return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
- }
}
// Look through 'not' ops to find disguised signed min/max.
// (X >s C) ? ~X : ~C ==> (~X <s ~C) ? ~X : ~C ==> SMIN(~X, ~C)
// (X <s C) ? ~X : ~C ==> (~X >s ~C) ? ~X : ~C ==> SMAX(~X, ~C)
if (match(TrueVal, m_Not(m_Specific(CmpLHS))) &&
- match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2)
return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
- }
// (X >s C) ? ~C : ~X ==> (~X <s ~C) ? ~C : ~X ==> SMAX(~C, ~X)
// (X <s C) ? ~C : ~X ==> (~X >s ~C) ? ~C : ~X ==> SMIN(~C, ~X)
if (match(FalseVal, m_Not(m_Specific(CmpLHS))) &&
- match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2) {
- LHS = TrueVal;
- RHS = FalseVal;
+ match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2)
return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};
- }
return {SPF_UNKNOWN, SPNB_NA, false};
}
@@ -4118,58 +4206,64 @@ static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
Instruction::CastOps *CastOp) {
- CastInst *CI = dyn_cast<CastInst>(V1);
- Constant *C = dyn_cast<Constant>(V2);
- if (!CI)
+ auto *Cast1 = dyn_cast<CastInst>(V1);
+ if (!Cast1)
return nullptr;
- *CastOp = CI->getOpcode();
-
- if (auto *CI2 = dyn_cast<CastInst>(V2)) {
- // If V1 and V2 are both the same cast from the same type, we can look
- // through V1.
- if (CI2->getOpcode() == CI->getOpcode() &&
- CI2->getSrcTy() == CI->getSrcTy())
- return CI2->getOperand(0);
- return nullptr;
- } else if (!C) {
+
+ *CastOp = Cast1->getOpcode();
+ Type *SrcTy = Cast1->getSrcTy();
+ if (auto *Cast2 = dyn_cast<CastInst>(V2)) {
+ // If V1 and V2 are both the same cast from the same type, look through V1.
+ if (*CastOp == Cast2->getOpcode() && SrcTy == Cast2->getSrcTy())
+ return Cast2->getOperand(0);
return nullptr;
}
- Constant *CastedTo = nullptr;
-
- if (isa<ZExtInst>(CI) && CmpI->isUnsigned())
- CastedTo = ConstantExpr::getTrunc(C, CI->getSrcTy());
-
- if (isa<SExtInst>(CI) && CmpI->isSigned())
- CastedTo = ConstantExpr::getTrunc(C, CI->getSrcTy(), true);
-
- if (isa<TruncInst>(CI))
- CastedTo = ConstantExpr::getIntegerCast(C, CI->getSrcTy(), CmpI->isSigned());
-
- if (isa<FPTruncInst>(CI))
- CastedTo = ConstantExpr::getFPExtend(C, CI->getSrcTy(), true);
-
- if (isa<FPExtInst>(CI))
- CastedTo = ConstantExpr::getFPTrunc(C, CI->getSrcTy(), true);
-
- if (isa<FPToUIInst>(CI))
- CastedTo = ConstantExpr::getUIToFP(C, CI->getSrcTy(), true);
-
- if (isa<FPToSIInst>(CI))
- CastedTo = ConstantExpr::getSIToFP(C, CI->getSrcTy(), true);
-
- if (isa<UIToFPInst>(CI))
- CastedTo = ConstantExpr::getFPToUI(C, CI->getSrcTy(), true);
+ auto *C = dyn_cast<Constant>(V2);
+ if (!C)
+ return nullptr;
- if (isa<SIToFPInst>(CI))
- CastedTo = ConstantExpr::getFPToSI(C, CI->getSrcTy(), true);
+ Constant *CastedTo = nullptr;
+ switch (*CastOp) {
+ case Instruction::ZExt:
+ if (CmpI->isUnsigned())
+ CastedTo = ConstantExpr::getTrunc(C, SrcTy);
+ break;
+ case Instruction::SExt:
+ if (CmpI->isSigned())
+ CastedTo = ConstantExpr::getTrunc(C, SrcTy, true);
+ break;
+ case Instruction::Trunc:
+ CastedTo = ConstantExpr::getIntegerCast(C, SrcTy, CmpI->isSigned());
+ break;
+ case Instruction::FPTrunc:
+ CastedTo = ConstantExpr::getFPExtend(C, SrcTy, true);
+ break;
+ case Instruction::FPExt:
+ CastedTo = ConstantExpr::getFPTrunc(C, SrcTy, true);
+ break;
+ case Instruction::FPToUI:
+ CastedTo = ConstantExpr::getUIToFP(C, SrcTy, true);
+ break;
+ case Instruction::FPToSI:
+ CastedTo = ConstantExpr::getSIToFP(C, SrcTy, true);
+ break;
+ case Instruction::UIToFP:
+ CastedTo = ConstantExpr::getFPToUI(C, SrcTy, true);
+ break;
+ case Instruction::SIToFP:
+ CastedTo = ConstantExpr::getFPToSI(C, SrcTy, true);
+ break;
+ default:
+ break;
+ }
if (!CastedTo)
return nullptr;
- Constant *CastedBack =
- ConstantExpr::getCast(CI->getOpcode(), CastedTo, C->getType(), true);
// Make sure the cast doesn't lose any information.
+ Constant *CastedBack =
+ ConstantExpr::getCast(*CastOp, CastedTo, C->getType(), true);
if (CastedBack != C)
return nullptr;
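The final guard is a round-trip test: a constant may be compared in the cast's source type only if converting it there and back reproduces it exactly. In plain integer terms (illustrative values; the real code uses ConstantExpr folding, and the narrowing below assumes the two's-complement behaviour of LLVM host compilers):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Trunc i64 -> i32 and extend back: 300 survives, 2^32 + 300 does not.
      int64_t Ok = 300;
      int64_t Bad = (int64_t(1) << 32) + 300;
      assert(static_cast<int64_t>(static_cast<int32_t>(Ok)) == Ok);   // keep
      assert(static_cast<int64_t>(static_cast<int32_t>(Bad)) != Bad); // bail out
    }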
@@ -4253,11 +4347,10 @@ static bool isTruePredicate(CmpInst::Predicate Pred,
// If X & C == 0 then (X | C) == X +_{nuw} C
if (match(A, m_Or(m_Value(X), m_APInt(CA))) &&
match(B, m_Or(m_Specific(X), m_APInt(CB)))) {
- unsigned BitWidth = CA->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(X, KnownZero, KnownOne, DL, Depth + 1, AC, CxtI, DT);
+ KnownBits Known(CA->getBitWidth());
+ computeKnownBits(X, Known, DL, Depth + 1, AC, CxtI, DT);
- if ((KnownZero & *CA) == *CA && (KnownZero & *CB) == *CB)
+ if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero))
return true;
}
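The rewritten subset check still rests on the same carry-free trick: when X & CA == 0, OR-ing CA into X adds it without any carries, which is what licenses treating (X | CA) as X +nuw CA. A quick standalone check:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0xAB00FF00u;
      uint32_t CA = 0x000000FFu; // disjoint bits: X & CA == 0
      assert((X & CA) == 0);
      assert((X | CA) == X + CA); // OR is carry-free addition here
    }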
@@ -4361,35 +4454,64 @@ isImpliedCondMatchingImmOperands(CmpInst::Predicate APred, const Value *ALHS,
}
Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
- const DataLayout &DL, bool InvertAPred,
+ const DataLayout &DL, bool LHSIsFalse,
unsigned Depth, AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
+ // Bail out when we hit the limit.
+ if (Depth == MaxDepth)
+ return None;
+
// A mismatch occurs when we compare a scalar cmp to a vector cmp, for example.
if (LHS->getType() != RHS->getType())
return None;
Type *OpTy = LHS->getType();
- assert(OpTy->getScalarType()->isIntegerTy(1));
+ assert(OpTy->isIntOrIntVectorTy(1));
// LHS ==> RHS by definition
- if (!InvertAPred && LHS == RHS)
- return true;
+ if (LHS == RHS)
+ return !LHSIsFalse;
if (OpTy->isVectorTy())
// TODO: extending the code below to handle vectors
return None;
assert(OpTy->isIntegerTy(1) && "implied by above");
- ICmpInst::Predicate APred, BPred;
- Value *ALHS, *ARHS;
Value *BLHS, *BRHS;
+ ICmpInst::Predicate BPred;
+ // We expect the RHS to be an icmp.
+ if (!match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS))))
+ return None;
- if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS))) ||
- !match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS))))
+ Value *ALHS, *ARHS;
+ ICmpInst::Predicate APred;
+ // The LHS can be an 'or', 'and', or 'icmp'.
+ if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS)))) {
+ // The remaining tests are all recursive, so bail out if we hit the limit.
+ if (Depth == MaxDepth)
+ return None;
+ // If the result of an 'or' is false, then we know both legs of the 'or' are
+ // false. Similarly, if the result of an 'and' is true, then we know both
+ // legs of the 'and' are true.
+ if ((LHSIsFalse && match(LHS, m_Or(m_Value(ALHS), m_Value(ARHS)))) ||
+ (!LHSIsFalse && match(LHS, m_And(m_Value(ALHS), m_Value(ARHS))))) {
+ if (Optional<bool> Implication = isImpliedCondition(
+ ALHS, RHS, DL, LHSIsFalse, Depth + 1, AC, CxtI, DT))
+ return Implication;
+ if (Optional<bool> Implication = isImpliedCondition(
+ ARHS, RHS, DL, LHSIsFalse, Depth + 1, AC, CxtI, DT))
+ return Implication;
+ return None;
+ }
return None;
+ }
+ // All of the below logic assumes both LHS and RHS are icmps.
+ assert(isa<ICmpInst>(LHS) && isa<ICmpInst>(RHS) && "Expected icmps.");
- if (InvertAPred)
+ // The rest of the logic assumes the LHS condition is true. If that's not the
+ // case, invert the predicate to make it so.
+ if (LHSIsFalse)
APred = CmpInst::getInversePredicate(APred);
// Can we infer anything when the two compares have matching operands?
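The new recursive legs lean on two Boolean facts: a false 'or' forces both operands false, and a true 'and' forces both operands true. A minimal truth-table check (plain C++ standing in for the i1 values the analysis reasons about):

    #include <cassert>

    int main() {
      for (int A = 0; A <= 1; ++A)
        for (int B = 0; B <= 1; ++B) {
          if (!(A | B))
            assert(!A && !B); // 'or' false => both legs false
          if (A & B)
            assert(A && B); // 'and' true => both legs true
        }
    }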
diff --git a/contrib/llvm/lib/Analysis/VectorUtils.cpp b/contrib/llvm/lib/Analysis/VectorUtils.cpp
index 7e598f4..554d132 100644
--- a/contrib/llvm/lib/Analysis/VectorUtils.cpp
+++ b/contrib/llvm/lib/Analysis/VectorUtils.cpp
@@ -11,18 +11,19 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
-#include "llvm/IR/Constants.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -300,7 +301,7 @@ const llvm::Value *llvm::getSplatValue(const Value *V) {
auto *InsertEltInst =
dyn_cast<InsertElementInst>(ShuffleInst->getOperand(0));
if (!InsertEltInst || !isa<ConstantInt>(InsertEltInst->getOperand(2)) ||
- !cast<ConstantInt>(InsertEltInst->getOperand(2))->isNullValue())
+ !cast<ConstantInt>(InsertEltInst->getOperand(2))->isZero())
return nullptr;
return InsertEltInst->getOperand(1);
@@ -488,3 +489,88 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
return Inst;
}
+
+Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
+ unsigned NumVecs) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < VF; i++)
+ for (unsigned j = 0; j < NumVecs; j++)
+ Mask.push_back(Builder.getInt32(j * VF + i));
+
+ return ConstantVector::get(Mask);
+}
+
+Constant *llvm::createStrideMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned Stride, unsigned VF) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < VF; i++)
+ Mask.push_back(Builder.getInt32(Start + i * Stride));
+
+ return ConstantVector::get(Mask);
+}
+
+Constant *llvm::createSequentialMask(IRBuilder<> &Builder, unsigned Start,
+ unsigned NumInts, unsigned NumUndefs) {
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < NumInts; i++)
+ Mask.push_back(Builder.getInt32(Start + i));
+
+ Constant *Undef = UndefValue::get(Builder.getInt32Ty());
+ for (unsigned i = 0; i < NumUndefs; i++)
+ Mask.push_back(Undef);
+
+ return ConstantVector::get(Mask);
+}
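For concreteness, with a hypothetical VF of 4 these helpers produce: createInterleaveMask(B, 4, 2) = <0,4,1,5,2,6,3,7>, createStrideMask(B, 0, 2, 4) = <0,2,4,6>, and createSequentialMask(B, 0, 4, 2) = <0,1,2,3,undef,undef>. The index arithmetic can be sketched without an IRBuilder at all:

    #include <cassert>
    #include <vector>

    // Integer-only model of createInterleaveMask (illustrative sketch).
    static std::vector<unsigned> interleaveMask(unsigned VF, unsigned NumVecs) {
      std::vector<unsigned> Mask;
      for (unsigned i = 0; i < VF; i++)
        for (unsigned j = 0; j < NumVecs; j++)
          Mask.push_back(j * VF + i);
      return Mask;
    }

    int main() {
      std::vector<unsigned> Expected = {0, 4, 1, 5, 2, 6, 3, 7};
      assert(interleaveMask(4, 2) == Expected);
    }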
+
+/// A helper function for concatenating vectors. This function concatenates two
+/// vectors having the same element type. If the second vector has fewer
+/// elements than the first, it is padded with undefs.
+static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
+ Value *V2) {
+ VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
+ VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
+ assert(VecTy1 && VecTy2 &&
+ VecTy1->getScalarType() == VecTy2->getScalarType() &&
+ "Expect two vectors with the same element type");
+
+ unsigned NumElts1 = VecTy1->getNumElements();
+ unsigned NumElts2 = VecTy2->getNumElements();
+ assert(NumElts1 >= NumElts2 && "Unexpected: the first vector has fewer elements");
+
+ if (NumElts1 > NumElts2) {
+ // Extend with UNDEFs.
+ Constant *ExtMask =
+ createSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2);
+ V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
+ }
+
+ Constant *Mask = createSequentialMask(Builder, 0, NumElts1 + NumElts2, 0);
+ return Builder.CreateShuffleVector(V1, V2, Mask);
+}
+
+Value *llvm::concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs) {
+ unsigned NumVecs = Vecs.size();
+ assert(NumVecs > 1 && "Should be at least two vectors");
+
+ SmallVector<Value *, 8> ResList;
+ ResList.append(Vecs.begin(), Vecs.end());
+ do {
+ SmallVector<Value *, 8> TmpList;
+ for (unsigned i = 0; i < NumVecs - 1; i += 2) {
+ Value *V0 = ResList[i], *V1 = ResList[i + 1];
+ assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
+ "Only the last vector may have a different type");
+
+ TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
+ }
+
+ // Push the last vector if the total number of vectors is odd.
+ if (NumVecs % 2 != 0)
+ TmpList.push_back(ResList[NumVecs - 1]);
+
+ ResList = TmpList;
+ NumVecs = ResList.size();
+ } while (NumVecs > 1);
+
+ return ResList[0];
+}
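concatenateVectors reduces the list pairwise, so the number of shuffle rounds is logarithmic and an odd trailing vector is simply carried into the next round. The round structure in isolation (counting only, no IR):

    #include <cassert>

    int main() {
      // Model the do/while loop above: pairs merge, a leftover carries over.
      unsigned NumVecs = 5, Rounds = 0;
      do {
        NumVecs = NumVecs / 2 + NumVecs % 2;
        ++Rounds;
      } while (NumVecs > 1);
      assert(NumVecs == 1 && Rounds == 3); // 5 -> 3 -> 2 -> 1
    }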
diff --git a/contrib/llvm/lib/AsmParser/LLLexer.cpp b/contrib/llvm/lib/AsmParser/LLLexer.cpp
index 752942f..90e0d6a 100644
--- a/contrib/llvm/lib/AsmParser/LLLexer.cpp
+++ b/contrib/llvm/lib/AsmParser/LLLexer.cpp
@@ -542,12 +542,13 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(release);
KEYWORD(acq_rel);
KEYWORD(seq_cst);
- KEYWORD(singlethread);
+ KEYWORD(syncscope);
KEYWORD(nnan);
KEYWORD(ninf);
KEYWORD(nsz);
KEYWORD(arcp);
+ KEYWORD(contract);
KEYWORD(fast);
KEYWORD(nuw);
KEYWORD(nsw);
@@ -587,7 +588,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(spir_func);
KEYWORD(intel_ocl_bicc);
KEYWORD(x86_64_sysvcc);
- KEYWORD(x86_64_win64cc);
+ KEYWORD(win64cc);
KEYWORD(x86_regcallcc);
KEYWORD(webkit_jscc);
KEYWORD(swiftcc);
@@ -600,6 +601,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(hhvm_ccc);
KEYWORD(cxx_fast_tlscc);
KEYWORD(amdgpu_vs);
+ KEYWORD(amdgpu_hs);
KEYWORD(amdgpu_gs);
KEYWORD(amdgpu_ps);
KEYWORD(amdgpu_cs);
@@ -647,6 +649,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(returned);
KEYWORD(returns_twice);
KEYWORD(signext);
+ KEYWORD(speculatable);
KEYWORD(sret);
KEYWORD(ssp);
KEYWORD(sspreq);
diff --git a/contrib/llvm/lib/AsmParser/LLParser.cpp b/contrib/llvm/lib/AsmParser/LLParser.cpp
index 4cd986e..13679ce 100644
--- a/contrib/llvm/lib/AsmParser/LLParser.cpp
+++ b/contrib/llvm/lib/AsmParser/LLParser.cpp
@@ -15,9 +15,10 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/BasicBlock.h"
@@ -41,7 +42,6 @@
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SaveAndRestore.h"
@@ -130,10 +130,9 @@ bool LLParser::ValidateEndOfModule() {
B.merge(NumberedAttrBuilders[Attr]);
if (Function *Fn = dyn_cast<Function>(V)) {
- AttributeSet AS = Fn->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
- AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
- AS.getFnAttributes());
+ AttributeList AS = Fn->getAttributes();
+ AttrBuilder FnAttrs(AS.getFnAttributes());
+ AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
FnAttrs.merge(B);
@@ -144,33 +143,29 @@ bool LLParser::ValidateEndOfModule() {
FnAttrs.removeAttribute(Attribute::Alignment);
}
- AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
- AttributeSet::get(Context,
- AttributeSet::FunctionIndex,
- FnAttrs));
+ AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
+ AttributeSet::get(Context, FnAttrs));
Fn->setAttributes(AS);
} else if (CallInst *CI = dyn_cast<CallInst>(V)) {
- AttributeSet AS = CI->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
- AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
- AS.getFnAttributes());
+ AttributeList AS = CI->getAttributes();
+ AttrBuilder FnAttrs(AS.getFnAttributes());
+ AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
FnAttrs.merge(B);
- AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
- AttributeSet::get(Context,
- AttributeSet::FunctionIndex,
- FnAttrs));
+ AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
+ AttributeSet::get(Context, FnAttrs));
CI->setAttributes(AS);
} else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) {
- AttributeSet AS = II->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
- AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
- AS.getFnAttributes());
+ AttributeList AS = II->getAttributes();
+ AttrBuilder FnAttrs(AS.getFnAttributes());
+ AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
FnAttrs.merge(B);
- AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
- AttributeSet::get(Context,
- AttributeSet::FunctionIndex,
- FnAttrs));
+ AS = AS.addAttributes(Context, AttributeList::FunctionIndex,
+ AttributeSet::get(Context, FnAttrs));
II->setAttributes(AS);
+ } else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
+ AttrBuilder Attrs(GV->getAttributes());
+ Attrs.merge(B);
+ GV->setAttributes(AttributeSet::get(Context, Attrs));
} else {
llvm_unreachable("invalid object with forward attribute group reference");
}
@@ -841,10 +836,10 @@ bool LLParser::parseIndirectSymbol(
/// ParseGlobal
/// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
/// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass
/// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace
-/// OptionalExternallyInitialized GlobalType Type Const
+/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs
///
/// Everything up to and including OptionalUnnamedAddr has been parsed
/// already.
@@ -959,6 +954,16 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
}
}
+ AttrBuilder Attrs;
+ LocTy BuiltinLoc;
+ std::vector<unsigned> FwdRefAttrGrps;
+ if (ParseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc))
+ return true;
+ if (Attrs.hasAttributes() || !FwdRefAttrGrps.empty()) {
+ GV->setAttributes(AttributeSet::get(Context, Attrs));
+ ForwardRefAttrGroups[GV] = FwdRefAttrGrps;
+ }
+
return false;
}
@@ -1104,6 +1109,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break;
case lltok::kw_returns_twice:
B.addAttribute(Attribute::ReturnsTwice); break;
+ case lltok::kw_speculatable: B.addAttribute(Attribute::Speculatable); break;
case lltok::kw_ssp: B.addAttribute(Attribute::StackProtect); break;
case lltok::kw_sspreq: B.addAttribute(Attribute::StackProtectReq); break;
case lltok::kw_sspstrong:
@@ -1664,7 +1670,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'spir_func'
/// ::= 'spir_kernel'
/// ::= 'x86_64_sysvcc'
-/// ::= 'x86_64_win64cc'
+/// ::= 'win64cc'
/// ::= 'webkit_jscc'
/// ::= 'anyregcc'
/// ::= 'preserve_mostcc'
@@ -1676,8 +1682,7 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
/// ::= 'hhvm_ccc'
/// ::= 'cxx_fast_tlscc'
/// ::= 'amdgpu_vs'
-/// ::= 'amdgpu_tcs'
-/// ::= 'amdgpu_tes'
+/// ::= 'amdgpu_hs'
/// ::= 'amdgpu_gs'
/// ::= 'amdgpu_ps'
/// ::= 'amdgpu_cs'
@@ -1707,7 +1712,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break;
case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
case lltok::kw_x86_64_sysvcc: CC = CallingConv::X86_64_SysV; break;
- case lltok::kw_x86_64_win64cc: CC = CallingConv::X86_64_Win64; break;
+ case lltok::kw_win64cc: CC = CallingConv::Win64; break;
case lltok::kw_webkit_jscc: CC = CallingConv::WebKit_JS; break;
case lltok::kw_anyregcc: CC = CallingConv::AnyReg; break;
case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break;
@@ -1719,6 +1724,7 @@ bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break;
case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break;
case lltok::kw_amdgpu_vs: CC = CallingConv::AMDGPU_VS; break;
+ case lltok::kw_amdgpu_hs: CC = CallingConv::AMDGPU_HS; break;
case lltok::kw_amdgpu_gs: CC = CallingConv::AMDGPU_GS; break;
case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; break;
case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break;
@@ -1855,6 +1861,34 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
return false;
}
+/// ParseOptionalCommaAddrSpace
+/// ::=
+/// ::= ',' addrspace(1)
+///
+/// This returns with AteExtraComma set to true if it ate an excess comma at the
+/// end.
+bool LLParser::ParseOptionalCommaAddrSpace(unsigned &AddrSpace,
+ LocTy &Loc,
+ bool &AteExtraComma) {
+ AteExtraComma = false;
+ while (EatIfPresent(lltok::comma)) {
+ // Metadata at the end is an early exit.
+ if (Lex.getKind() == lltok::MetadataVar) {
+ AteExtraComma = true;
+ return false;
+ }
+
+ Loc = Lex.getLoc();
+ if (Lex.getKind() != lltok::kw_addrspace)
+ return Error(Lex.getLoc(), "expected metadata or 'addrspace'");
+
+ if (ParseOptionalAddrSpace(AddrSpace))
+ return true;
+ }
+
+ return false;
+}
+
bool LLParser::parseAllocSizeArguments(unsigned &BaseSizeArg,
Optional<unsigned> &HowManyArg) {
Lex.Lex();
@@ -1885,20 +1919,42 @@ bool LLParser::parseAllocSizeArguments(unsigned &BaseSizeArg,
}
/// ParseScopeAndOrdering
-/// if isAtomic: ::= 'singlethread'? AtomicOrdering
+/// if isAtomic: ::= SyncScope? AtomicOrdering
/// else: ::=
///
/// This sets Scope and Ordering to the parsed values.
-bool LLParser::ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
+bool LLParser::ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID,
AtomicOrdering &Ordering) {
if (!isAtomic)
return false;
- Scope = CrossThread;
- if (EatIfPresent(lltok::kw_singlethread))
- Scope = SingleThread;
+ return ParseScope(SSID) || ParseOrdering(Ordering);
+}
+
+/// ParseScope
+/// ::= syncscope("singlethread" | "<target scope>")?
+///
+/// This sets synchronization scope ID to the ID of the parsed value.
+bool LLParser::ParseScope(SyncScope::ID &SSID) {
+ SSID = SyncScope::System;
+ if (EatIfPresent(lltok::kw_syncscope)) {
+ auto StartParenAt = Lex.getLoc();
+ if (!EatIfPresent(lltok::lparen))
+ return Error(StartParenAt, "Expected '(' in syncscope");
+
+ std::string SSN;
+ auto SSNAt = Lex.getLoc();
+ if (ParseStringConstant(SSN))
+ return Error(SSNAt, "Expected synchronization scope name");
+
+ auto EndParenAt = Lex.getLoc();
+ if (!EatIfPresent(lltok::rparen))
+ return Error(EndParenAt, "Expected ')' in syncscope");
- return ParseOrdering(Ordering);
+ SSID = Context.getOrInsertSyncScopeID(SSN);
+ }
+
+ return false;
}
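On the C++ side the parsed name round-trips through the context: a front end asks LLVMContext for the scope ID and hands it to the atomic constructors. A sketch against the post-patch API (the builder is assumed to be positioned somewhere valid):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    void emitFences(IRBuilder<> &B, LLVMContext &Ctx) {
      // fence syncscope("singlethread") seq_cst
      SyncScope::ID SingleThread = Ctx.getOrInsertSyncScopeID("singlethread");
      B.CreateFence(AtomicOrdering::SequentiallyConsistent, SingleThread);

      // Plain "fence seq_cst" defaults to the system scope.
      B.CreateFence(AtomicOrdering::SequentiallyConsistent, SyncScope::System);
    }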
/// ParseOrdering
@@ -2098,7 +2154,6 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
if (ParseToken(lltok::lparen, "expected '(' in call"))
return true;
- unsigned AttrIndex = 1;
while (Lex.getKind() != lltok::rparen) {
// If this isn't the first argument, we need a comma.
if (!ArgList.empty() &&
@@ -2132,9 +2187,8 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS))
return true;
}
- ArgList.push_back(ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(),
- AttrIndex++,
- ArgAttrs)));
+ ArgList.push_back(ParamInfo(
+ ArgLoc, V, AttributeSet::get(V->getContext(), ArgAttrs)));
}
if (IsMustTailCall && InVarArgsFunc)
@@ -2239,9 +2293,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
if (!FunctionType::isValidArgumentType(ArgTy))
return Error(TypeLoc, "invalid type for function argument");
- unsigned AttrIndex = 1;
- ArgList.emplace_back(TypeLoc, ArgTy, AttributeSet::get(ArgTy->getContext(),
- AttrIndex++, Attrs),
+ ArgList.emplace_back(TypeLoc, ArgTy,
+ AttributeSet::get(ArgTy->getContext(), Attrs),
std::move(Name));
while (EatIfPresent(lltok::comma)) {
@@ -2268,10 +2321,9 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
if (!ArgTy->isFirstClassType())
return Error(TypeLoc, "invalid type for function argument");
- ArgList.emplace_back(
- TypeLoc, ArgTy,
- AttributeSet::get(ArgTy->getContext(), AttrIndex++, Attrs),
- std::move(Name));
+ ArgList.emplace_back(TypeLoc, ArgTy,
+ AttributeSet::get(ArgTy->getContext(), Attrs),
+ std::move(Name));
}
}
@@ -2295,7 +2347,7 @@ bool LLParser::ParseFunctionType(Type *&Result) {
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
if (!ArgList[i].Name.empty())
return Error(ArgList[i].Loc, "argument name invalid in function type");
- if (ArgList[i].Attrs.hasAttributes(i + 1))
+ if (ArgList[i].Attrs.hasAttributes())
return Error(ArgList[i].Loc,
"argument attributes invalid in function type");
}
@@ -2472,7 +2524,7 @@ LLParser::PerFunctionState::~PerFunctionState() {
continue;
P.second.first->replaceAllUsesWith(
UndefValue::get(P.second.first->getType()));
- delete P.second.first;
+ P.second.first->deleteValue();
}
for (const auto &P : ForwardRefValIDs) {
@@ -2480,7 +2532,7 @@ LLParser::PerFunctionState::~PerFunctionState() {
continue;
P.second.first->replaceAllUsesWith(
UndefValue::get(P.second.first->getType()));
- delete P.second.first;
+ P.second.first->deleteValue();
}
}
@@ -2612,7 +2664,7 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
getTypeString(FI->second.first->getType()) + "'");
Sentinel->replaceAllUsesWith(Inst);
- delete Sentinel;
+ Sentinel->deleteValue();
ForwardRefValIDs.erase(FI);
}
@@ -2629,7 +2681,7 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
getTypeString(FI->second.first->getType()) + "'");
Sentinel->replaceAllUsesWith(Inst);
- delete Sentinel;
+ Sentinel->deleteValue();
ForwardRefVals.erase(FI);
}
@@ -3031,7 +3083,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
} else {
assert(Opc == Instruction::ICmp && "Unexpected opcode for CmpInst!");
if (!Val0->getType()->isIntOrIntVectorTy() &&
- !Val0->getType()->getScalarType()->isPointerTy())
+ !Val0->getType()->isPtrOrPtrVectorTy())
return Error(ID.Loc, "icmp requires pointer or integer operands");
ID.ConstantVal = ConstantExpr::getICmp(Pred, Val0, Val1);
}
@@ -3180,7 +3232,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
if (Opc == Instruction::GetElementPtr) {
if (Elts.size() == 0 ||
- !Elts[0]->getType()->getScalarType()->isPointerTy())
+ !Elts[0]->getType()->isPtrOrPtrVectorTy())
return Error(ID.Loc, "base of getelementptr must be a pointer");
Type *BaseType = Elts[0]->getType();
@@ -3196,7 +3248,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
for (Constant *Val : Indices) {
Type *ValTy = Val->getType();
- if (!ValTy->getScalarType()->isIntegerTy())
+ if (!ValTy->isIntOrIntVectorTy())
return Error(ID.Loc, "getelementptr index must be an integer");
if (ValTy->isVectorTy()) {
unsigned ValNumEl = ValTy->getVectorNumElements();
@@ -3908,7 +3960,8 @@ bool LLParser::ParseDIBasicType(MDNode *&Result, bool IsDistinct) {
/// ParseDIDerivedType:
/// ::= !DIDerivedType(tag: DW_TAG_pointer_type, name: "int", file: !0,
/// line: 7, scope: !1, baseType: !2, size: 32,
-/// align: 32, offset: 0, flags: 0, extraData: !3)
+/// align: 32, offset: 0, flags: 0, extraData: !3,
+/// dwarfAddressSpace: 3)
bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(tag, DwarfTagField, ); \
@@ -3921,14 +3974,20 @@ bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) {
OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \
OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX)); \
OPTIONAL(flags, DIFlagField, ); \
- OPTIONAL(extraData, MDField, );
+ OPTIONAL(extraData, MDField, ); \
+ OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX));
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
+ Optional<unsigned> DWARFAddressSpace;
+ if (dwarfAddressSpace.Val != UINT32_MAX)
+ DWARFAddressSpace = dwarfAddressSpace.Val;
+
Result = GET_OR_DISTINCT(DIDerivedType,
(Context, tag.Val, name.Val, file.Val, line.Val,
scope.Val, baseType.Val, size.Val, align.Val,
- offset.Val, flags.Val, extraData.Val));
+ offset.Val, DWARFAddressSpace, flags.Val,
+ extraData.Val));
return false;
}
@@ -4029,7 +4088,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
OPTIONAL(imports, MDField, ); \
OPTIONAL(macros, MDField, ); \
OPTIONAL(dwoId, MDUnsignedField, ); \
- OPTIONAL(splitDebugInlining, MDBoolField, = true);
+ OPTIONAL(splitDebugInlining, MDBoolField, = true); \
+ OPTIONAL(debugInfoForProfiling, MDBoolField, = false);
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4037,7 +4097,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val,
runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val,
retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val,
- splitDebugInlining.Val);
+ splitDebugInlining.Val, debugInfoForProfiling.Val);
return false;
}
@@ -4048,7 +4108,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
/// virtuality: DW_VIRTUALITY_pure_virtual,
/// virtualIndex: 10, thisAdjustment: 4, flags: 11,
/// isOptimized: false, templateParams: !4, declaration: !5,
-/// variables: !6)
+/// variables: !6, thrownTypes: !7)
bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
auto Loc = Lex.getLoc();
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
@@ -4070,7 +4130,8 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
OPTIONAL(unit, MDField, ); \
OPTIONAL(templateParams, MDField, ); \
OPTIONAL(declaration, MDField, ); \
- OPTIONAL(variables, MDField, );
+ OPTIONAL(variables, MDField, ); \
+ OPTIONAL(thrownTypes, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4080,12 +4141,12 @@ bool LLParser::ParseDISubprogram(MDNode *&Result, bool IsDistinct) {
"missing 'distinct', required for !DISubprogram when 'isDefinition'");
Result = GET_OR_DISTINCT(
- DISubprogram, (Context, scope.Val, name.Val, linkageName.Val, file.Val,
- line.Val, type.Val, isLocal.Val, isDefinition.Val,
- scopeLine.Val, containingType.Val, virtuality.Val,
- virtualIndex.Val, thisAdjustment.Val, flags.Val,
- isOptimized.Val, unit.Val, templateParams.Val,
- declaration.Val, variables.Val));
+ DISubprogram,
+ (Context, scope.Val, name.Val, linkageName.Val, file.Val, line.Val,
+ type.Val, isLocal.Val, isDefinition.Val, scopeLine.Val,
+ containingType.Val, virtuality.Val, virtualIndex.Val, thisAdjustment.Val,
+ flags.Val, isOptimized.Val, unit.Val, templateParams.Val,
+ declaration.Val, variables.Val, thrownTypes.Val));
return false;
}
@@ -4125,15 +4186,13 @@ bool LLParser::ParseDILexicalBlockFile(MDNode *&Result, bool IsDistinct) {
bool LLParser::ParseDINamespace(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(scope, MDField, ); \
- OPTIONAL(file, MDField, ); \
OPTIONAL(name, MDStringField, ); \
- OPTIONAL(line, LineField, ); \
OPTIONAL(exportSymbols, MDBoolField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
Result = GET_OR_DISTINCT(DINamespace,
- (Context, scope.Val, file.Val, name.Val, line.Val, exportSymbols.Val));
+ (Context, scope.Val, name.Val, exportSymbols.Val));
return false;
}
@@ -4352,13 +4411,15 @@ bool LLParser::ParseDIImportedEntity(MDNode *&Result, bool IsDistinct) {
REQUIRED(tag, DwarfTagField, ); \
REQUIRED(scope, MDField, ); \
OPTIONAL(entity, MDField, ); \
+ OPTIONAL(file, MDField, ); \
OPTIONAL(line, LineField, ); \
OPTIONAL(name, MDStringField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(DIImportedEntity, (Context, tag.Val, scope.Val,
- entity.Val, line.Val, name.Val));
+ Result = GET_OR_DISTINCT(
+ DIImportedEntity,
+ (Context, tag.Val, scope.Val, entity.Val, file.Val, line.Val, name.Val));
return false;
}
@@ -4589,6 +4650,9 @@ bool LLParser::parseConstantValue(Type *Ty, Constant *&C) {
C = cast<Constant>(V);
return false;
}
+ case ValID::t_Null:
+ C = Constant::getNullValue(Ty);
+ return false;
default:
return Error(Loc, "expected a constant value");
}
@@ -4735,25 +4799,14 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
std::vector<Type*> ParamTypeList;
SmallVector<AttributeSet, 8> Attrs;
- if (RetAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::ReturnIndex,
- RetAttrs));
-
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
ParamTypeList.push_back(ArgList[i].Ty);
- if (ArgList[i].Attrs.hasAttributes(i + 1)) {
- AttrBuilder B(ArgList[i].Attrs, i + 1);
- Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
- }
+ Attrs.push_back(ArgList[i].Attrs);
}
- if (FuncAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::FunctionIndex,
- FuncAttrs));
-
- AttributeSet PAL = AttributeSet::get(Context, Attrs);
+ AttributeList PAL =
+ AttributeList::get(Context, AttributeSet::get(Context, FuncAttrs),
+ AttributeSet::get(Context, RetAttrs), Attrs);
if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy())
return Error(RetTypeLoc, "functions with 'sret' argument must return void");
@@ -5363,13 +5416,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
return true;
// Set up the Attribute for the function.
- SmallVector<AttributeSet, 8> Attrs;
- if (RetAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::ReturnIndex,
- RetAttrs));
-
- SmallVector<Value*, 8> Args;
+ SmallVector<Value *, 8> Args;
+ SmallVector<AttributeSet, 8> ArgAttrs;
// Loop through FunctionType's arguments and ensure they are specified
// correctly. Also, gather any parameter attributes.
@@ -5387,26 +5435,19 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs.hasAttributes(i + 1)) {
- AttrBuilder B(ArgList[i].Attrs, i + 1);
- Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
- }
+ ArgAttrs.push_back(ArgList[i].Attrs);
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs.hasAttributes()) {
- if (FnAttrs.hasAlignmentAttr())
- return Error(CallLoc, "invoke instructions may not have an alignment");
-
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::FunctionIndex,
- FnAttrs));
- }
+ if (FnAttrs.hasAlignmentAttr())
+ return Error(CallLoc, "invoke instructions may not have an alignment");
// Finish off the Attribute and check them
- AttributeSet PAL = AttributeSet::get(Context, Attrs);
+ AttributeList PAL =
+ AttributeList::get(Context, AttributeSet::get(Context, FnAttrs),
+ AttributeSet::get(Context, RetAttrs), ArgAttrs);
InvokeInst *II =
InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args, BundleList);
@@ -5680,7 +5721,7 @@ bool LLParser::ParseCompare(Instruction *&Inst, PerFunctionState &PFS,
} else {
assert(Opc == Instruction::ICmp && "Unknown opcode for CmpInst!");
if (!LHS->getType()->isIntOrIntVectorTy() &&
- !LHS->getType()->getScalarType()->isPointerTy())
+ !LHS->getType()->isPtrOrPtrVectorTy())
return Error(Loc, "icmp requires integer operands");
Inst = new ICmpInst(CmpInst::Predicate(Pred), LHS, RHS);
}
@@ -5968,10 +6009,6 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
// Set up the Attribute for the function.
SmallVector<AttributeSet, 8> Attrs;
- if (RetAttrs.hasAttributes())
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::ReturnIndex,
- RetAttrs));
SmallVector<Value*, 8> Args;
@@ -5991,26 +6028,19 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs.hasAttributes(i + 1)) {
- AttrBuilder B(ArgList[i].Attrs, i + 1);
- Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
- }
+ Attrs.push_back(ArgList[i].Attrs);
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs.hasAttributes()) {
- if (FnAttrs.hasAlignmentAttr())
- return Error(CallLoc, "call instructions may not have an alignment");
-
- Attrs.push_back(AttributeSet::get(RetType->getContext(),
- AttributeSet::FunctionIndex,
- FnAttrs));
- }
+ if (FnAttrs.hasAlignmentAttr())
+ return Error(CallLoc, "call instructions may not have an alignment");
// Finish off the Attribute and check them
- AttributeSet PAL = AttributeSet::get(Context, Attrs);
+ AttributeList PAL =
+ AttributeList::get(Context, AttributeSet::get(Context, FnAttrs),
+ AttributeSet::get(Context, RetAttrs), Attrs);
CallInst *CI = CallInst::Create(Ty, Callee, Args, BundleList);
CI->setTailCallKind(TCK);
@@ -6032,8 +6062,9 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
/// (',' 'align' i32)?
int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
Value *Size = nullptr;
- LocTy SizeLoc, TyLoc;
+ LocTy SizeLoc, TyLoc, ASLoc;
unsigned Alignment = 0;
+ unsigned AddrSpace = 0;
Type *Ty = nullptr;
bool IsInAlloca = EatIfPresent(lltok::kw_inalloca);
@@ -6047,12 +6078,21 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
bool AteExtraComma = false;
if (EatIfPresent(lltok::comma)) {
if (Lex.getKind() == lltok::kw_align) {
- if (ParseOptionalAlignment(Alignment)) return true;
+ if (ParseOptionalAlignment(Alignment))
+ return true;
+ if (ParseOptionalCommaAddrSpace(AddrSpace, ASLoc, AteExtraComma))
+ return true;
+ } else if (Lex.getKind() == lltok::kw_addrspace) {
+ ASLoc = Lex.getLoc();
+ if (ParseOptionalAddrSpace(AddrSpace))
+ return true;
} else if (Lex.getKind() == lltok::MetadataVar) {
AteExtraComma = true;
} else {
if (ParseTypeAndValue(Size, SizeLoc, PFS) ||
- ParseOptionalCommaAlign(Alignment, AteExtraComma))
+ ParseOptionalCommaAlign(Alignment, AteExtraComma) ||
+ (!AteExtraComma &&
+ ParseOptionalCommaAddrSpace(AddrSpace, ASLoc, AteExtraComma)))
return true;
}
}
@@ -6060,7 +6100,14 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
if (Size && !Size->getType()->isIntegerTy())
return Error(SizeLoc, "element count must have integer type");
- AllocaInst *AI = new AllocaInst(Ty, Size, Alignment);
+ const DataLayout &DL = M->getDataLayout();
+ unsigned AS = DL.getAllocaAddrSpace();
+ if (AS != AddrSpace) {
+ // TODO: In the future it should be possible to specify addrspace per-alloca.
+ return Error(ASLoc, "address space must match datalayout");
+ }
+
+ AllocaInst *AI = new AllocaInst(Ty, AS, Size, Alignment);
AI->setUsedWithInAlloca(IsInAlloca);
AI->setSwiftError(IsSwiftError);
Inst = AI;
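Client code has to thread the datalayout's alloca address space through the new AllocaInst constructor, matching the parser's check above. An illustrative helper (hypothetical name):

    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Creates a detached alloca in the module's alloca address space;
    // the caller is responsible for inserting it into a block.
    AllocaInst *createAllocaFor(Module &M, Type *Ty, Value *Size,
                                unsigned Align) {
      unsigned AS = M.getDataLayout().getAllocaAddrSpace();
      return new AllocaInst(Ty, AS, Size, Align);
    }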
@@ -6077,7 +6124,7 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
bool AteExtraComma = false;
bool isAtomic = false;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
- SynchronizationScope Scope = CrossThread;
+ SyncScope::ID SSID = SyncScope::System;
if (Lex.getKind() == lltok::kw_atomic) {
isAtomic = true;
@@ -6095,7 +6142,7 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
if (ParseType(Ty) ||
ParseToken(lltok::comma, "expected comma after load's type") ||
ParseTypeAndValue(Val, Loc, PFS) ||
- ParseScopeAndOrdering(isAtomic, Scope, Ordering) ||
+ ParseScopeAndOrdering(isAtomic, SSID, Ordering) ||
ParseOptionalCommaAlign(Alignment, AteExtraComma))
return true;
@@ -6111,7 +6158,7 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
return Error(ExplicitTypeLoc,
"explicit pointee type doesn't match operand's pointee type");
- Inst = new LoadInst(Ty, Val, "", isVolatile, Alignment, Ordering, Scope);
+ Inst = new LoadInst(Ty, Val, "", isVolatile, Alignment, Ordering, SSID);
return AteExtraComma ? InstExtraComma : InstNormal;
}
@@ -6126,7 +6173,7 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
bool AteExtraComma = false;
bool isAtomic = false;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
- SynchronizationScope Scope = CrossThread;
+ SyncScope::ID SSID = SyncScope::System;
if (Lex.getKind() == lltok::kw_atomic) {
isAtomic = true;
@@ -6142,7 +6189,7 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
if (ParseTypeAndValue(Val, Loc, PFS) ||
ParseToken(lltok::comma, "expected ',' after store operand") ||
ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
- ParseScopeAndOrdering(isAtomic, Scope, Ordering) ||
+ ParseScopeAndOrdering(isAtomic, SSID, Ordering) ||
ParseOptionalCommaAlign(Alignment, AteExtraComma))
return true;
@@ -6158,7 +6205,7 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
Ordering == AtomicOrdering::AcquireRelease)
return Error(Loc, "atomic store cannot use Acquire ordering");
- Inst = new StoreInst(Val, Ptr, isVolatile, Alignment, Ordering, Scope);
+ Inst = new StoreInst(Val, Ptr, isVolatile, Alignment, Ordering, SSID);
return AteExtraComma ? InstExtraComma : InstNormal;
}
@@ -6170,7 +6217,7 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
bool AteExtraComma = false;
AtomicOrdering SuccessOrdering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
- SynchronizationScope Scope = CrossThread;
+ SyncScope::ID SSID = SyncScope::System;
bool isVolatile = false;
bool isWeak = false;
@@ -6185,7 +6232,7 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
ParseTypeAndValue(Cmp, CmpLoc, PFS) ||
ParseToken(lltok::comma, "expected ',' after cmpxchg cmp operand") ||
ParseTypeAndValue(New, NewLoc, PFS) ||
- ParseScopeAndOrdering(true /*Always atomic*/, Scope, SuccessOrdering) ||
+ ParseScopeAndOrdering(true /*Always atomic*/, SSID, SuccessOrdering) ||
ParseOrdering(FailureOrdering))
return true;
@@ -6208,7 +6255,7 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
if (!New->getType()->isFirstClassType())
return Error(NewLoc, "cmpxchg operand must be a first class value");
AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(
- Ptr, Cmp, New, SuccessOrdering, FailureOrdering, Scope);
+ Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SSID);
CXI->setVolatile(isVolatile);
CXI->setWeak(isWeak);
Inst = CXI;
@@ -6222,7 +6269,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
Value *Ptr, *Val; LocTy PtrLoc, ValLoc;
bool AteExtraComma = false;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
- SynchronizationScope Scope = CrossThread;
+ SyncScope::ID SSID = SyncScope::System;
bool isVolatile = false;
AtomicRMWInst::BinOp Operation;
@@ -6248,7 +6295,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
if (ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
ParseToken(lltok::comma, "expected ',' after atomicrmw address") ||
ParseTypeAndValue(Val, ValLoc, PFS) ||
- ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+ ParseScopeAndOrdering(true /*Always atomic*/, SSID, Ordering))
return true;
if (Ordering == AtomicOrdering::Unordered)
@@ -6265,7 +6312,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
" integer");
AtomicRMWInst *RMWI =
- new AtomicRMWInst(Operation, Ptr, Val, Ordering, Scope);
+ new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID);
RMWI->setVolatile(isVolatile);
Inst = RMWI;
return AteExtraComma ? InstExtraComma : InstNormal;
@@ -6275,8 +6322,8 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
/// ::= 'fence' SyncScope? AtomicOrdering
int LLParser::ParseFence(Instruction *&Inst, PerFunctionState &PFS) {
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
- SynchronizationScope Scope = CrossThread;
- if (ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+ SyncScope::ID SSID = SyncScope::System;
+ if (ParseScopeAndOrdering(true /*Always atomic*/, SSID, Ordering))
return true;
if (Ordering == AtomicOrdering::Unordered)
@@ -6284,7 +6331,7 @@ int LLParser::ParseFence(Instruction *&Inst, PerFunctionState &PFS) {
if (Ordering == AtomicOrdering::Monotonic)
return TokError("fence cannot be monotonic");
- Inst = new FenceInst(Context, Ordering, Scope);
+ Inst = new FenceInst(Context, Ordering, SSID);
return InstNormal;
}
@@ -6326,7 +6373,7 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
break;
}
if (ParseTypeAndValue(Val, EltLoc, PFS)) return true;
- if (!Val->getType()->getScalarType()->isIntegerTy())
+ if (!Val->getType()->isIntOrIntVectorTy())
return Error(EltLoc, "getelementptr index must be an integer");
if (Val->getType()->isVectorTy()) {
diff --git a/contrib/llvm/lib/AsmParser/LLParser.h b/contrib/llvm/lib/AsmParser/LLParser.h
index 16d4e8b..d5b0593 100644
--- a/contrib/llvm/lib/AsmParser/LLParser.h
+++ b/contrib/llvm/lib/AsmParser/LLParser.h
@@ -193,6 +193,10 @@ namespace llvm {
case lltok::kw_ninf: FMF.setNoInfs(); Lex.Lex(); continue;
case lltok::kw_nsz: FMF.setNoSignedZeros(); Lex.Lex(); continue;
case lltok::kw_arcp: FMF.setAllowReciprocal(); Lex.Lex(); continue;
+ case lltok::kw_contract:
+ FMF.setAllowContract(true);
+ Lex.Lex();
+ continue;
default: return FMF;
}
return FMF;
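
The new `contract` keyword surfaces the AllowContract fast-math flag in textual IR alongside nnan/ninf/nsz/arcp. A minimal sketch of querying it from a pass, assuming a FastMathFlags::allowContract accessor matching the setter above:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// True if I is a floating-point op whose flags permit contraction,
// e.g. fusing a multiply and an add into a single fma.
static bool mayContract(const Instruction &I) {
  if (auto *FPOp = dyn_cast<FPMathOperator>(&I))
    return FPOp->getFastMathFlags().allowContract();
  return false;
}
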
@@ -237,11 +241,14 @@ namespace llvm {
bool ParseOptionalCallingConv(unsigned &CC);
bool ParseOptionalAlignment(unsigned &Alignment);
bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes);
- bool ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
+ bool ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID,
AtomicOrdering &Ordering);
+ bool ParseScope(SyncScope::ID &SSID);
bool ParseOrdering(AtomicOrdering &Ordering);
bool ParseOptionalStackAlignment(unsigned &Alignment);
bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
+ bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
+ bool &AteExtraComma);
bool ParseOptionalCommaInAlloca(bool &IsInAlloca);
bool parseAllocSizeArguments(unsigned &ElemSizeArg,
Optional<unsigned> &HowManyArg);
@@ -393,7 +400,7 @@ namespace llvm {
Value *V;
AttributeSet Attrs;
ParamInfo(LocTy loc, Value *v, AttributeSet attrs)
- : Loc(loc), V(v), Attrs(attrs) {}
+ : Loc(loc), V(v), Attrs(attrs) {}
};
bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
PerFunctionState &PFS,
@@ -447,7 +454,7 @@ namespace llvm {
AttributeSet Attrs;
std::string Name;
ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N)
- : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
+ : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
};
bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg);
bool ParseFunctionHeader(Function *&Fn, bool isDefine);
diff --git a/contrib/llvm/lib/AsmParser/LLToken.h b/contrib/llvm/lib/AsmParser/LLToken.h
index 048aeee..0f3707b 100644
--- a/contrib/llvm/lib/AsmParser/LLToken.h
+++ b/contrib/llvm/lib/AsmParser/LLToken.h
@@ -93,11 +93,12 @@ enum Kind {
kw_release,
kw_acq_rel,
kw_seq_cst,
- kw_singlethread,
+ kw_syncscope,
kw_nnan,
kw_ninf,
kw_nsz,
kw_arcp,
+ kw_contract,
kw_fast,
kw_nuw,
kw_nsw,
@@ -140,7 +141,7 @@ enum Kind {
kw_spir_kernel,
kw_spir_func,
kw_x86_64_sysvcc,
- kw_x86_64_win64cc,
+ kw_win64cc,
kw_webkit_jscc,
kw_anyregcc,
kw_swiftcc,
@@ -152,6 +153,7 @@ enum Kind {
kw_hhvm_ccc,
kw_cxx_fast_tlscc,
kw_amdgpu_vs,
+ kw_amdgpu_hs,
kw_amdgpu_gs,
kw_amdgpu_ps,
kw_amdgpu_cs,
@@ -197,6 +199,7 @@ enum Kind {
kw_returned,
kw_returns_twice,
kw_signext,
+ kw_speculatable,
kw_ssp,
kw_sspreq,
kw_sspstrong,
diff --git a/contrib/llvm/lib/BinaryFormat/Dwarf.cpp b/contrib/llvm/lib/BinaryFormat/Dwarf.cpp
new file mode 100644
index 0000000..37c4579
--- /dev/null
+++ b/contrib/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -0,0 +1,577 @@
+//===-- llvm/BinaryFormat/Dwarf.cpp - Dwarf Framework ------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for generic DWARF information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+using namespace dwarf;
+
+StringRef llvm::dwarf::TagString(unsigned Tag) {
+ switch (Tag) {
+ default:
+ return StringRef();
+#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \
+ case DW_TAG_##NAME: \
+ return "DW_TAG_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::getTag(StringRef TagString) {
+ return StringSwitch<unsigned>(TagString)
+#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \
+ .Case("DW_TAG_" #NAME, DW_TAG_##NAME)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Default(DW_TAG_invalid);
+}
+
+unsigned llvm::dwarf::TagVersion(dwarf::Tag Tag) {
+ switch (Tag) {
+ default:
+ return 0;
+#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \
+ case DW_TAG_##NAME: \
+ return VERSION;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::TagVendor(dwarf::Tag Tag) {
+ switch (Tag) {
+ default:
+ return 0;
+#define HANDLE_DW_TAG(ID, NAME, VERSION, VENDOR) \
+ case DW_TAG_##NAME: \
+ return DWARF_VENDOR_##VENDOR;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::ChildrenString(unsigned Children) {
+ switch (Children) {
+ case DW_CHILDREN_no:
+ return "DW_CHILDREN_no";
+ case DW_CHILDREN_yes:
+ return "DW_CHILDREN_yes";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::AttributeString(unsigned Attribute) {
+ switch (Attribute) {
+ default:
+ return StringRef();
+#define HANDLE_DW_AT(ID, NAME, VERSION, VENDOR) \
+ case DW_AT_##NAME: \
+ return "DW_AT_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::AttributeVersion(dwarf::Attribute Attribute) {
+ switch (Attribute) {
+ default:
+ return 0;
+#define HANDLE_DW_AT(ID, NAME, VERSION, VENDOR) \
+ case DW_AT_##NAME: \
+ return VERSION;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::AttributeVendor(dwarf::Attribute Attribute) {
+ switch (Attribute) {
+ default:
+ return 0;
+#define HANDLE_DW_AT(ID, NAME, VERSION, VENDOR) \
+ case DW_AT_##NAME: \
+ return DWARF_VENDOR_##VENDOR;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::FormEncodingString(unsigned Encoding) {
+ switch (Encoding) {
+ default:
+ return StringRef();
+#define HANDLE_DW_FORM(ID, NAME, VERSION, VENDOR) \
+ case DW_FORM_##NAME: \
+ return "DW_FORM_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::FormVersion(dwarf::Form Form) {
+ switch (Form) {
+ default:
+ return 0;
+#define HANDLE_DW_FORM(ID, NAME, VERSION, VENDOR) \
+ case DW_FORM_##NAME: \
+ return VERSION;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::FormVendor(dwarf::Form Form) {
+ switch (Form) {
+ default:
+ return 0;
+#define HANDLE_DW_FORM(ID, NAME, VERSION, VENDOR) \
+ case DW_FORM_##NAME: \
+ return DWARF_VENDOR_##VENDOR;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::OperationEncodingString(unsigned Encoding) {
+ switch (Encoding) {
+ default:
+ return StringRef();
+#define HANDLE_DW_OP(ID, NAME, VERSION, VENDOR) \
+ case DW_OP_##NAME: \
+ return "DW_OP_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ case DW_OP_LLVM_fragment:
+ return "DW_OP_LLVM_fragment";
+ }
+}
+
+unsigned llvm::dwarf::getOperationEncoding(StringRef OperationEncodingString) {
+ return StringSwitch<unsigned>(OperationEncodingString)
+#define HANDLE_DW_OP(ID, NAME, VERSION, VENDOR) \
+ .Case("DW_OP_" #NAME, DW_OP_##NAME)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Case("DW_OP_LLVM_fragment", DW_OP_LLVM_fragment)
+ .Default(0);
+}
+
+unsigned llvm::dwarf::OperationVersion(dwarf::LocationAtom Op) {
+ switch (Op) {
+ default:
+ return 0;
+#define HANDLE_DW_OP(ID, NAME, VERSION, VENDOR) \
+ case DW_OP_##NAME: \
+ return VERSION;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::OperationVendor(dwarf::LocationAtom Op) {
+ switch (Op) {
+ default:
+ return 0;
+#define HANDLE_DW_OP(ID, NAME, VERSION, VENDOR) \
+ case DW_OP_##NAME: \
+ return DWARF_VENDOR_##VENDOR;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::AttributeEncodingString(unsigned Encoding) {
+ switch (Encoding) {
+ default:
+ return StringRef();
+#define HANDLE_DW_ATE(ID, NAME, VERSION, VENDOR) \
+ case DW_ATE_##NAME: \
+ return "DW_ATE_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::getAttributeEncoding(StringRef EncodingString) {
+ return StringSwitch<unsigned>(EncodingString)
+#define HANDLE_DW_ATE(ID, NAME, VERSION, VENDOR) \
+ .Case("DW_ATE_" #NAME, DW_ATE_##NAME)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Default(0);
+}
+
+unsigned llvm::dwarf::AttributeEncodingVersion(dwarf::TypeKind ATE) {
+ switch (ATE) {
+ default:
+ return 0;
+#define HANDLE_DW_ATE(ID, NAME, VERSION, VENDOR) \
+ case DW_ATE_##NAME: \
+ return VERSION;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::AttributeEncodingVendor(dwarf::TypeKind ATE) {
+ switch (ATE) {
+ default:
+ return 0;
+#define HANDLE_DW_ATE(ID, NAME, VERSION, VENDOR) \
+ case DW_ATE_##NAME: \
+ return DWARF_VENDOR_##VENDOR;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::DecimalSignString(unsigned Sign) {
+ switch (Sign) {
+ case DW_DS_unsigned:
+ return "DW_DS_unsigned";
+ case DW_DS_leading_overpunch:
+ return "DW_DS_leading_overpunch";
+ case DW_DS_trailing_overpunch:
+ return "DW_DS_trailing_overpunch";
+ case DW_DS_leading_separate:
+ return "DW_DS_leading_separate";
+ case DW_DS_trailing_separate:
+ return "DW_DS_trailing_separate";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::EndianityString(unsigned Endian) {
+ switch (Endian) {
+ case DW_END_default:
+ return "DW_END_default";
+ case DW_END_big:
+ return "DW_END_big";
+ case DW_END_little:
+ return "DW_END_little";
+ case DW_END_lo_user:
+ return "DW_END_lo_user";
+ case DW_END_hi_user:
+ return "DW_END_hi_user";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::AccessibilityString(unsigned Access) {
+ switch (Access) {
+ // Accessibility codes
+ case DW_ACCESS_public:
+ return "DW_ACCESS_public";
+ case DW_ACCESS_protected:
+ return "DW_ACCESS_protected";
+ case DW_ACCESS_private:
+ return "DW_ACCESS_private";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::VisibilityString(unsigned Visibility) {
+ switch (Visibility) {
+ case DW_VIS_local:
+ return "DW_VIS_local";
+ case DW_VIS_exported:
+ return "DW_VIS_exported";
+ case DW_VIS_qualified:
+ return "DW_VIS_qualified";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::VirtualityString(unsigned Virtuality) {
+ switch (Virtuality) {
+ default:
+ return StringRef();
+#define HANDLE_DW_VIRTUALITY(ID, NAME) \
+ case DW_VIRTUALITY_##NAME: \
+ return "DW_VIRTUALITY_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::getVirtuality(StringRef VirtualityString) {
+ return StringSwitch<unsigned>(VirtualityString)
+#define HANDLE_DW_VIRTUALITY(ID, NAME) \
+ .Case("DW_VIRTUALITY_" #NAME, DW_VIRTUALITY_##NAME)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Default(DW_VIRTUALITY_invalid);
+}
+
+StringRef llvm::dwarf::LanguageString(unsigned Language) {
+ switch (Language) {
+ default:
+ return StringRef();
+#define HANDLE_DW_LANG(ID, NAME, VERSION, VENDOR) \
+ case DW_LANG_##NAME: \
+ return "DW_LANG_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::getLanguage(StringRef LanguageString) {
+ return StringSwitch<unsigned>(LanguageString)
+#define HANDLE_DW_LANG(ID, NAME, VERSION, VENDOR) \
+ .Case("DW_LANG_" #NAME, DW_LANG_##NAME)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Default(0);
+}
+
+unsigned llvm::dwarf::LanguageVersion(dwarf::SourceLanguage Lang) {
+ switch (Lang) {
+ default:
+ return 0;
+#define HANDLE_DW_LANG(ID, NAME, VERSION, VENDOR) \
+ case DW_LANG_##NAME: \
+ return VERSION;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::LanguageVendor(dwarf::SourceLanguage Lang) {
+ switch (Lang) {
+ default:
+ return 0;
+#define HANDLE_DW_LANG(ID, NAME, VERSION, VENDOR) \
+ case DW_LANG_##NAME: \
+ return DWARF_VENDOR_##VENDOR;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::CaseString(unsigned Case) {
+ switch (Case) {
+ case DW_ID_case_sensitive:
+ return "DW_ID_case_sensitive";
+ case DW_ID_up_case:
+ return "DW_ID_up_case";
+ case DW_ID_down_case:
+ return "DW_ID_down_case";
+ case DW_ID_case_insensitive:
+ return "DW_ID_case_insensitive";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::ConventionString(unsigned CC) {
+ switch (CC) {
+ default:
+ return StringRef();
+#define HANDLE_DW_CC(ID, NAME) \
+ case DW_CC_##NAME: \
+ return "DW_CC_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::getCallingConvention(StringRef CCString) {
+ return StringSwitch<unsigned>(CCString)
+#define HANDLE_DW_CC(ID, NAME) .Case("DW_CC_" #NAME, DW_CC_##NAME)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Default(0);
+}
+
+StringRef llvm::dwarf::InlineCodeString(unsigned Code) {
+ switch (Code) {
+ case DW_INL_not_inlined:
+ return "DW_INL_not_inlined";
+ case DW_INL_inlined:
+ return "DW_INL_inlined";
+ case DW_INL_declared_not_inlined:
+ return "DW_INL_declared_not_inlined";
+ case DW_INL_declared_inlined:
+ return "DW_INL_declared_inlined";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::ArrayOrderString(unsigned Order) {
+ switch (Order) {
+ case DW_ORD_row_major:
+ return "DW_ORD_row_major";
+ case DW_ORD_col_major:
+ return "DW_ORD_col_major";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::DiscriminantString(unsigned Discriminant) {
+ switch (Discriminant) {
+ case DW_DSC_label:
+ return "DW_DSC_label";
+ case DW_DSC_range:
+ return "DW_DSC_range";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::LNStandardString(unsigned Standard) {
+ switch (Standard) {
+ default:
+ return StringRef();
+#define HANDLE_DW_LNS(ID, NAME) \
+ case DW_LNS_##NAME: \
+ return "DW_LNS_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::LNExtendedString(unsigned Encoding) {
+ switch (Encoding) {
+ default:
+ return StringRef();
+#define HANDLE_DW_LNE(ID, NAME) \
+ case DW_LNE_##NAME: \
+ return "DW_LNE_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::MacinfoString(unsigned Encoding) {
+ switch (Encoding) {
+ // Macinfo Type Encodings
+ case DW_MACINFO_define:
+ return "DW_MACINFO_define";
+ case DW_MACINFO_undef:
+ return "DW_MACINFO_undef";
+ case DW_MACINFO_start_file:
+ return "DW_MACINFO_start_file";
+ case DW_MACINFO_end_file:
+ return "DW_MACINFO_end_file";
+ case DW_MACINFO_vendor_ext:
+ return "DW_MACINFO_vendor_ext";
+ case DW_MACINFO_invalid:
+ return "DW_MACINFO_invalid";
+ }
+ return StringRef();
+}
+
+unsigned llvm::dwarf::getMacinfo(StringRef MacinfoString) {
+ return StringSwitch<unsigned>(MacinfoString)
+ .Case("DW_MACINFO_define", DW_MACINFO_define)
+ .Case("DW_MACINFO_undef", DW_MACINFO_undef)
+ .Case("DW_MACINFO_start_file", DW_MACINFO_start_file)
+ .Case("DW_MACINFO_end_file", DW_MACINFO_end_file)
+ .Case("DW_MACINFO_vendor_ext", DW_MACINFO_vendor_ext)
+ .Default(DW_MACINFO_invalid);
+}
+
+StringRef llvm::dwarf::CallFrameString(unsigned Encoding) {
+ switch (Encoding) {
+ default:
+ return StringRef();
+#define HANDLE_DW_CFA(ID, NAME) \
+ case DW_CFA_##NAME: \
+ return "DW_CFA_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::ApplePropertyString(unsigned Prop) {
+ switch (Prop) {
+ default:
+ return StringRef();
+#define HANDLE_DW_APPLE_PROPERTY(ID, NAME) \
+ case DW_APPLE_PROPERTY_##NAME: \
+ return "DW_APPLE_PROPERTY_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::UnitTypeString(unsigned UT) {
+ switch (UT) {
+ default:
+ return StringRef();
+#define HANDLE_DW_UT(ID, NAME) \
+ case DW_UT_##NAME: \
+ return "DW_UT_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+StringRef llvm::dwarf::AtomTypeString(unsigned AT) {
+ switch (AT) {
+ case dwarf::DW_ATOM_null:
+ return "DW_ATOM_null";
+ case dwarf::DW_ATOM_die_offset:
+ return "DW_ATOM_die_offset";
+ case DW_ATOM_cu_offset:
+ return "DW_ATOM_cu_offset";
+ case DW_ATOM_die_tag:
+ return "DW_ATOM_die_tag";
+ case DW_ATOM_type_flags:
+ return "DW_ATOM_type_flags";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::GDBIndexEntryKindString(GDBIndexEntryKind Kind) {
+ switch (Kind) {
+ case GIEK_NONE:
+ return "NONE";
+ case GIEK_TYPE:
+ return "TYPE";
+ case GIEK_VARIABLE:
+ return "VARIABLE";
+ case GIEK_FUNCTION:
+ return "FUNCTION";
+ case GIEK_OTHER:
+ return "OTHER";
+ case GIEK_UNUSED5:
+ return "UNUSED5";
+ case GIEK_UNUSED6:
+ return "UNUSED6";
+ case GIEK_UNUSED7:
+ return "UNUSED7";
+ }
+ llvm_unreachable("Unknown GDBIndexEntryKind value");
+}
+
+StringRef
+llvm::dwarf::GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage) {
+ switch (Linkage) {
+ case GIEL_EXTERNAL:
+ return "EXTERNAL";
+ case GIEL_STATIC:
+ return "STATIC";
+ }
+ llvm_unreachable("Unknown GDBIndexEntryLinkage value");
+}
+
+StringRef llvm::dwarf::AttributeValueString(uint16_t Attr, unsigned Val) {
+ switch (Attr) {
+ case DW_AT_accessibility:
+ return AccessibilityString(Val);
+ case DW_AT_virtuality:
+ return VirtualityString(Val);
+ case DW_AT_language:
+ return LanguageString(Val);
+ case DW_AT_encoding:
+ return AttributeEncodingString(Val);
+ case DW_AT_decimal_sign:
+ return DecimalSignString(Val);
+ case DW_AT_endianity:
+ return EndianityString(Val);
+ case DW_AT_visibility:
+ return VisibilityString(Val);
+ case DW_AT_identifier_case:
+ return CaseString(Val);
+ case DW_AT_calling_convention:
+ return ConventionString(Val);
+ case DW_AT_inline:
+ return InlineCodeString(Val);
+ case DW_AT_ordering:
+ return ArrayOrderString(Val);
+ case DW_AT_discr_value:
+ return DiscriminantString(Val);
+ }
+
+ return StringRef();
+}
+
+bool llvm::dwarf::isValidFormForVersion(Form F, unsigned Version,
+ bool ExtensionsOk) {
+ if (FormVendor(F) == DWARF_VENDOR_DWARF) {
+ unsigned FV = FormVersion(F);
+ return FV > 0 && FV <= Version;
+ }
+ return ExtensionsOk;
+}
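
Nearly every function above is generated with the X-macro idiom: Dwarf.def expands one HANDLE_DW_* invocation per constant, and each function redefines the macro to stamp out the case it needs before including the .def file. A self-contained sketch of the same idiom with hypothetical names (the real constants live in Dwarf.def):

#include <cstdio>

// Stand-in for a .def file: one macro invocation per constant.
#define COLOR_LIST(X) X(1, red) X(2, green)

#define HANDLE_COLOR(ID, NAME) COLOR_##NAME = ID,
enum Color { COLOR_LIST(HANDLE_COLOR) };
#undef HANDLE_COLOR

static const char *ColorString(unsigned C) {
  switch (C) {
  default:
    return "";
// Re-expand the same list as switch cases, mirroring TagString above.
#define HANDLE_COLOR(ID, NAME)                                                 \
  case COLOR_##NAME:                                                           \
    return "COLOR_" #NAME;
  COLOR_LIST(HANDLE_COLOR)
#undef HANDLE_COLOR
  }
}

int main() { std::printf("%s\n", ColorString(2)); } // prints COLOR_green
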
diff --git a/contrib/llvm/lib/BinaryFormat/Magic.cpp b/contrib/llvm/lib/BinaryFormat/Magic.cpp
new file mode 100644
index 0000000..b19a07a
--- /dev/null
+++ b/contrib/llvm/lib/BinaryFormat/Magic.cpp
@@ -0,0 +1,217 @@
+//===- llvm/BinaryFormat/Magic.cpp - File magic identification --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/Magic.h"
+
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/FileSystem.h"
+
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
+using namespace llvm;
+using namespace llvm::support::endian;
+using namespace llvm::sys::fs;
+
+template <size_t N>
+static bool startswith(StringRef Magic, const char (&S)[N]) {
+ return Magic.startswith(StringRef(S, N - 1));
+}
+
+/// @brief Identify the file type from the magic-number bytes in \p Magic.
+file_magic llvm::identify_magic(StringRef Magic) {
+ if (Magic.size() < 4)
+ return file_magic::unknown;
+ switch ((unsigned char)Magic[0]) {
+ case 0x00: {
+ // COFF bigobj, CL.exe's LTO object file, or short import library file
+ if (startswith(Magic, "\0\0\xFF\xFF")) {
+ size_t MinSize =
+ offsetof(COFF::BigObjHeader, UUID) + sizeof(COFF::BigObjMagic);
+ if (Magic.size() < MinSize)
+ return file_magic::coff_import_library;
+
+ const char *Start = Magic.data() + offsetof(COFF::BigObjHeader, UUID);
+ if (memcmp(Start, COFF::BigObjMagic, sizeof(COFF::BigObjMagic)) == 0)
+ return file_magic::coff_object;
+ if (memcmp(Start, COFF::ClGlObjMagic, sizeof(COFF::BigObjMagic)) == 0)
+ return file_magic::coff_cl_gl_object;
+ return file_magic::coff_import_library;
+ }
+ // Windows resource file
+ if (Magic.size() >= sizeof(COFF::WinResMagic) &&
+ memcmp(Magic.data(), COFF::WinResMagic, sizeof(COFF::WinResMagic)) == 0)
+ return file_magic::windows_resource;
+ // 0x0000 = COFF unknown machine type
+ if (Magic[1] == 0)
+ return file_magic::coff_object;
+ if (startswith(Magic, "\0asm"))
+ return file_magic::wasm_object;
+ break;
+ }
+  case 0xDE: // 0x0B17C0DE = BC wrapper
+ if (startswith(Magic, "\xDE\xC0\x17\x0B"))
+ return file_magic::bitcode;
+ break;
+ case 'B':
+ if (startswith(Magic, "BC\xC0\xDE"))
+ return file_magic::bitcode;
+ break;
+ case '!':
+ if (startswith(Magic, "!<arch>\n") || startswith(Magic, "!<thin>\n"))
+ return file_magic::archive;
+ break;
+
+ case '\177':
+ if (startswith(Magic, "\177ELF") && Magic.size() >= 18) {
+ bool Data2MSB = Magic[5] == 2;
+ unsigned high = Data2MSB ? 16 : 17;
+ unsigned low = Data2MSB ? 17 : 16;
+ if (Magic[high] == 0) {
+ switch (Magic[low]) {
+ default:
+ return file_magic::elf;
+ case 1:
+ return file_magic::elf_relocatable;
+ case 2:
+ return file_magic::elf_executable;
+ case 3:
+ return file_magic::elf_shared_object;
+ case 4:
+ return file_magic::elf_core;
+ }
+ }
+ // It's still some type of ELF file.
+ return file_magic::elf;
+ }
+ break;
+
+ case 0xCA:
+ if (startswith(Magic, "\xCA\xFE\xBA\xBE") ||
+ startswith(Magic, "\xCA\xFE\xBA\xBF")) {
+ // This is complicated by an overlap with Java class files.
+ // See the Mach-O section in /usr/share/file/magic for details.
+ if (Magic.size() >= 8 && Magic[7] < 43)
+ return file_magic::macho_universal_binary;
+ }
+ break;
+
+ // The two magic numbers for mach-o are:
+ // 0xfeedface - 32-bit mach-o
+ // 0xfeedfacf - 64-bit mach-o
+ case 0xFE:
+ case 0xCE:
+ case 0xCF: {
+ uint16_t type = 0;
+ if (startswith(Magic, "\xFE\xED\xFA\xCE") ||
+ startswith(Magic, "\xFE\xED\xFA\xCF")) {
+ /* Native endian */
+ size_t MinSize;
+ if (Magic[3] == char(0xCE))
+ MinSize = sizeof(MachO::mach_header);
+ else
+ MinSize = sizeof(MachO::mach_header_64);
+ if (Magic.size() >= MinSize)
+      type = Magic[12] << 24 | Magic[13] << 16 | Magic[14] << 8 | Magic[15];
+ } else if (startswith(Magic, "\xCE\xFA\xED\xFE") ||
+ startswith(Magic, "\xCF\xFA\xED\xFE")) {
+ /* Reverse endian */
+ size_t MinSize;
+ if (Magic[0] == char(0xCE))
+ MinSize = sizeof(MachO::mach_header);
+ else
+ MinSize = sizeof(MachO::mach_header_64);
+ if (Magic.size() >= MinSize)
+      type = Magic[15] << 24 | Magic[14] << 16 | Magic[13] << 8 | Magic[12];
+ }
+ switch (type) {
+ default:
+ break;
+ case 1:
+ return file_magic::macho_object;
+ case 2:
+ return file_magic::macho_executable;
+ case 3:
+ return file_magic::macho_fixed_virtual_memory_shared_lib;
+ case 4:
+ return file_magic::macho_core;
+ case 5:
+ return file_magic::macho_preload_executable;
+ case 6:
+ return file_magic::macho_dynamically_linked_shared_lib;
+ case 7:
+ return file_magic::macho_dynamic_linker;
+ case 8:
+ return file_magic::macho_bundle;
+ case 9:
+ return file_magic::macho_dynamically_linked_shared_lib_stub;
+ case 10:
+ return file_magic::macho_dsym_companion;
+ case 11:
+ return file_magic::macho_kext_bundle;
+ }
+ break;
+ }
+ case 0xF0: // PowerPC Windows
+ case 0x83: // Alpha 32-bit
+ case 0x84: // Alpha 64-bit
+  case 0x66: // MIPS R4000 Windows
+ case 0x50: // mc68K
+ case 0x4c: // 80386 Windows
+ case 0xc4: // ARMNT Windows
+ if (Magic[1] == 0x01)
+ return file_magic::coff_object;
+ LLVM_FALLTHROUGH;
+
+ case 0x90: // PA-RISC Windows
+ case 0x68: // mc68K Windows
+ if (Magic[1] == 0x02)
+ return file_magic::coff_object;
+ break;
+
+ case 'M': // Possible MS-DOS stub on Windows PE file
+ if (startswith(Magic, "MZ")) {
+ uint32_t off = read32le(Magic.data() + 0x3c);
+ // PE/COFF file, either EXE or DLL.
+ if (off < Magic.size() &&
+ memcmp(Magic.data() + off, COFF::PEMagic, sizeof(COFF::PEMagic)) == 0)
+ return file_magic::pecoff_executable;
+ }
+ break;
+
+ case 0x64: // x86-64 or ARM64 Windows.
+ if (Magic[1] == char(0x86) || Magic[1] == char(0xaa))
+ return file_magic::coff_object;
+ break;
+
+ default:
+ break;
+ }
+ return file_magic::unknown;
+}
+
+std::error_code llvm::identify_magic(const Twine &Path, file_magic &Result) {
+ int FD;
+ if (std::error_code EC = openFileForRead(Path, FD))
+ return EC;
+
+ char Buffer[32];
+ int Length = read(FD, Buffer, sizeof(Buffer));
+ if (close(FD) != 0 || Length < 0)
+ return std::error_code(errno, std::generic_category());
+
+ Result = identify_magic(StringRef(Buffer, Length));
+ return std::error_code();
+}
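
A short usage sketch for the two entry points defined above; the path is a placeholder:

#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Classify a file on disk via the std::error_code overload above.
static void classifyFile() {
  file_magic Magic;
  if (std::error_code EC = identify_magic("/tmp/input.o", Magic)) {
    errs() << "cannot read file: " << EC.message() << "\n";
    return;
  }
  if (Magic == file_magic::elf_relocatable)
    outs() << "ELF relocatable object\n";
}
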
diff --git a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index a46e49c..2b4970a 100644
--- a/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/contrib/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -28,8 +28,8 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -40,13 +40,13 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"
#include "llvm/IR/GlobalIndirectSymbol.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
@@ -372,12 +372,26 @@ Expected<std::string> readTriple(BitstreamCursor &Stream) {
class BitcodeReaderBase {
protected:
- BitcodeReaderBase(BitstreamCursor Stream) : Stream(std::move(Stream)) {
+ BitcodeReaderBase(BitstreamCursor Stream, StringRef Strtab)
+ : Stream(std::move(Stream)), Strtab(Strtab) {
this->Stream.setBlockInfo(&BlockInfo);
}
BitstreamBlockInfo BlockInfo;
BitstreamCursor Stream;
+ StringRef Strtab;
+
+ /// In version 2 of the bitcode we store names of global values and comdats in
+ /// a string table rather than in the VST.
+ bool UseStrtab = false;
+
+ Expected<unsigned> parseVersionRecord(ArrayRef<uint64_t> Record);
+
+ /// If this module uses a string table, pop the reference to the string table
+ /// and return the referenced string and the rest of the record. Otherwise
+ /// just return the record itself.
+ std::pair<StringRef, ArrayRef<uint64_t>>
+ readNameFromStrtab(ArrayRef<uint64_t> Record);
bool readBlockInfo();
@@ -395,6 +409,27 @@ Error BitcodeReaderBase::error(const Twine &Message) {
return ::error(FullMsg);
}
+Expected<unsigned>
+BitcodeReaderBase::parseVersionRecord(ArrayRef<uint64_t> Record) {
+ if (Record.size() < 1)
+ return error("Invalid record");
+ unsigned ModuleVersion = Record[0];
+ if (ModuleVersion > 2)
+ return error("Invalid value");
+ UseStrtab = ModuleVersion >= 2;
+ return ModuleVersion;
+}
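
For reference, the feature ladder this record gates, reconstructed from how the parsed version is consumed elsewhere in this diff:

// Module bitcode versions accepted after this change:
//   0 - absolute value IDs in function bodies (UseRelativeIDs = false)
//   1 - relative value IDs                    (UseRelativeIDs = true)
//   2 - global value and comdat names move from the VST to the module
//       string table                          (UseStrtab = true)
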
+
+std::pair<StringRef, ArrayRef<uint64_t>>
+BitcodeReaderBase::readNameFromStrtab(ArrayRef<uint64_t> Record) {
+ if (!UseStrtab)
+ return {"", Record};
+  // Out-of-range reference: return an empty record so the caller's size check errors out.
+ if (Record[0] + Record[1] > Strtab.size())
+ return {"", {}};
+ return {StringRef(Strtab.data() + Record[0], Record[1]), Record.slice(2)};
+}
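
In the v2 format every named record is therefore prefixed by an (offset, size) pair into the module string table, which is why each parse*Record helper below begins with readNameFromStrtab. A standalone copy of the decoding with an illustrative record (Strtab and Record contents are made up):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include <utility>

using namespace llvm;

// Like the original, assumes Record carries the two leading fields.
static std::pair<StringRef, ArrayRef<uint64_t>>
decodeV2Name(StringRef Strtab, ArrayRef<uint64_t> Record) {
  if (Record[0] + Record[1] > Strtab.size())
    return {"", {}}; // out-of-range reference
  return {Strtab.substr(Record[0], Record[1]), Record.slice(2)};
}
// decodeV2Name("foobar", {3, 3, 0}) yields ("bar", {0}).
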
+
class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
LLVMContext &Context;
Module *TheModule = nullptr;
@@ -405,6 +440,9 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
bool SeenValueSymbolTable = false;
uint64_t VSTOffset = 0;
+ std::vector<std::string> SectionTable;
+ std::vector<std::string> GCTable;
+
std::vector<Type*> TypeList;
BitcodeReaderValueList ValueList;
Optional<MetadataLoader> MDLoader;
@@ -419,10 +457,10 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
/// The set of attributes by index. Index zero in the file is for null, and
/// is thus not represented here. As such all indices are off by one.
- std::vector<AttributeSet> MAttributes;
+ std::vector<AttributeList> MAttributes;
/// The set of attribute groups.
- std::map<unsigned, AttributeSet> MAttributeGroups;
+ std::map<unsigned, AttributeList> MAttributeGroups;
/// While parsing a function body, this is a list of the basic blocks for the
/// function.
@@ -475,10 +513,11 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
TBAAVerifier TBAAVerifyHelper;
std::vector<std::string> BundleTags;
+ SmallVector<SyncScope::ID, 8> SSIDs;
public:
- BitcodeReader(BitstreamCursor Stream, StringRef ProducerIdentification,
- LLVMContext &Context);
+ BitcodeReader(BitstreamCursor Stream, StringRef Strtab,
+ StringRef ProducerIdentification, LLVMContext &Context);
Error materializeForwardReferencedFunctions();
@@ -520,10 +559,10 @@ private:
return FunctionBBs[ID];
}
- AttributeSet getAttributes(unsigned i) const {
+ AttributeList getAttributes(unsigned i) const {
if (i-1 < MAttributes.size())
return MAttributes[i-1];
- return AttributeSet();
+ return AttributeList();
}
/// Read a value/type pair out of the specified record from slot 'Slot'.
@@ -598,15 +637,26 @@ private:
Error parseAlignmentValue(uint64_t Exponent, unsigned &Alignment);
Error parseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
Error parseModule(uint64_t ResumeBit, bool ShouldLazyLoadMetadata = false);
+
+ Error parseComdatRecord(ArrayRef<uint64_t> Record);
+ Error parseGlobalVarRecord(ArrayRef<uint64_t> Record);
+ Error parseFunctionRecord(ArrayRef<uint64_t> Record);
+ Error parseGlobalIndirectSymbolRecord(unsigned BitCode,
+ ArrayRef<uint64_t> Record);
+
Error parseAttributeBlock();
Error parseAttributeGroupBlock();
Error parseTypeTable();
Error parseTypeTableBody();
Error parseOperandBundleTags();
+ Error parseSyncScopeNames();
Expected<Value *> recordValue(SmallVectorImpl<uint64_t> &Record,
unsigned NameIndex, Triple &TT);
+ void setDeferredFunctionInfo(unsigned FuncBitcodeOffsetDelta, Function *F,
+ ArrayRef<uint64_t> Record);
Error parseValueSymbolTable(uint64_t Offset = 0);
+ Error parseGlobalValueSymbolTable();
Error parseConstants();
Error rememberAndSkipFunctionBodies();
Error rememberAndSkipFunctionBody();
@@ -620,6 +670,8 @@ private:
Error findFunctionInStream(
Function *F,
DenseMap<Function *, uint64_t>::iterator DeferredFunctionInfoIterator);
+
+ SyncScope::ID getDecodedSyncScopeID(unsigned Val);
};
/// Class to manage reading and parsing function summary index bitcode
@@ -639,15 +691,16 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
/// Used to enable on-demand parsing of the VST.
uint64_t VSTOffset = 0;
- // Map to save ValueId to GUID association that was recorded in the
+ // Map to save ValueId to ValueInfo association that was recorded in the
// ValueSymbolTable. It is used after the VST is parsed to convert
// call graph edges read from the function summary from referencing
- // callees by their ValueId to using the GUID instead, which is how
+ // callees by their ValueId to using the ValueInfo instead, which is how
// they are recorded in the summary index being built.
- // We save a second GUID which is the same as the first one, but ignoring the
- // linkage, i.e. for value other than local linkage they are identical.
- DenseMap<unsigned, std::pair<GlobalValue::GUID, GlobalValue::GUID>>
- ValueIdToCallGraphGUIDMap;
+ // We save a GUID which refers to the same global as the ValueInfo, but
+ // ignoring the linkage, i.e. for values other than local linkage they are
+ // identical.
+ DenseMap<unsigned, std::pair<ValueInfo, GlobalValue::GUID>>
+ ValueIdToValueInfoMap;
/// Map populated during module path string table parsing, from the
/// module ID to a string reference owned by the index's module
@@ -658,13 +711,25 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
/// Original source file name recorded in a bitcode record.
std::string SourceFileName;
+ /// The string identifier given to this module by the client, normally the
+ /// path to the bitcode file.
+ StringRef ModulePath;
+
+ /// For per-module summary indexes, the unique numerical identifier given to
+ /// this module by the client.
+ unsigned ModuleId;
+
public:
- ModuleSummaryIndexBitcodeReader(
- BitstreamCursor Stream, ModuleSummaryIndex &TheIndex);
+ ModuleSummaryIndexBitcodeReader(BitstreamCursor Stream, StringRef Strtab,
+ ModuleSummaryIndex &TheIndex,
+ StringRef ModulePath, unsigned ModuleId);
- Error parseModule(StringRef ModulePath);
+ Error parseModule();
private:
+ void setValueGUID(uint64_t ValueID, StringRef ValueName,
+ GlobalValue::LinkageTypes Linkage,
+ StringRef SourceFileName);
Error parseValueSymbolTable(
uint64_t Offset,
DenseMap<unsigned, GlobalValue::LinkageTypes> &ValueIdToLinkageMap);
@@ -672,11 +737,13 @@ private:
std::vector<FunctionSummary::EdgeTy> makeCallList(ArrayRef<uint64_t> Record,
bool IsOldProfileFormat,
bool HasProfile);
- Error parseEntireSummary(StringRef ModulePath);
+ Error parseEntireSummary(unsigned ID);
Error parseModuleStringTable();
- std::pair<GlobalValue::GUID, GlobalValue::GUID>
- getGUIDFromValueId(unsigned ValueId);
+ std::pair<ValueInfo, GlobalValue::GUID>
+ getValueInfoFromValueId(unsigned ValueId);
+
+ ModuleSummaryIndex::ModuleInfo *addThisModule();
};
} // end anonymous namespace
@@ -694,10 +761,10 @@ std::error_code llvm::errorToErrorCodeAndEmitErrors(LLVMContext &Ctx,
return std::error_code();
}
-BitcodeReader::BitcodeReader(BitstreamCursor Stream,
+BitcodeReader::BitcodeReader(BitstreamCursor Stream, StringRef Strtab,
StringRef ProducerIdentification,
LLVMContext &Context)
- : BitcodeReaderBase(std::move(Stream)), Context(Context),
+ : BitcodeReaderBase(std::move(Stream), Strtab), Context(Context),
ValueList(Context) {
this->ProducerIdentification = ProducerIdentification;
}
@@ -802,11 +869,11 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
auto Linkage = GlobalValue::LinkageTypes(RawFlags & 0xF); // 4 bits
RawFlags = RawFlags >> 4;
bool NotEligibleToImport = (RawFlags & 0x1) || Version < 3;
- // The LiveRoot flag wasn't introduced until version 3. For dead stripping
+ // The Live flag wasn't introduced until version 3. For dead stripping
// to work correctly on earlier versions, we must conservatively treat all
// values as live.
- bool LiveRoot = (RawFlags & 0x2) || Version < 3;
- return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, LiveRoot);
+ bool Live = (RawFlags & 0x2) || Version < 3;
+ return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live);
}
static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
@@ -935,14 +1002,6 @@ static AtomicOrdering getDecodedOrdering(unsigned Val) {
}
}
-static SynchronizationScope getDecodedSynchScope(unsigned Val) {
- switch (Val) {
- case bitc::SYNCHSCOPE_SINGLETHREAD: return SingleThread;
- default: // Map unknown scopes to cross-thread.
- case bitc::SYNCHSCOPE_CROSSTHREAD: return CrossThread;
- }
-}
-
static Comdat::SelectionKind getDecodedComdatSelectionKind(unsigned Val) {
switch (Val) {
default: // Map unknown selection kinds to any.
@@ -971,6 +1030,8 @@ static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
FMF.setNoSignedZeros();
if (0 != (Val & FastMathFlags::AllowReciprocal))
FMF.setAllowReciprocal();
+ if (0 != (Val & FastMathFlags::AllowContract))
+ FMF.setAllowContract(true);
return FMF;
}
@@ -1066,6 +1127,7 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
case Attribute::SwiftSelf: return 1ULL << 51;
case Attribute::SwiftError: return 1ULL << 52;
case Attribute::WriteOnly: return 1ULL << 53;
+ case Attribute::Speculatable: return 1ULL << 54;
case Attribute::Dereferenceable:
llvm_unreachable("dereferenceable attribute not supported in raw format");
break;
@@ -1132,7 +1194,7 @@ Error BitcodeReader::parseAttributeBlock() {
SmallVector<uint64_t, 64> Record;
- SmallVector<AttributeSet, 8> Attrs;
+ SmallVector<AttributeList, 8> Attrs;
// Read all the records.
while (true) {
@@ -1162,10 +1224,10 @@ Error BitcodeReader::parseAttributeBlock() {
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
AttrBuilder B;
decodeLLVMAttributesForBitcode(B, Record[i+1]);
- Attrs.push_back(AttributeSet::get(Context, Record[i], B));
+ Attrs.push_back(AttributeList::get(Context, Record[i], B));
}
- MAttributes.push_back(AttributeSet::get(Context, Attrs));
+ MAttributes.push_back(AttributeList::get(Context, Attrs));
Attrs.clear();
break;
}
@@ -1173,7 +1235,7 @@ Error BitcodeReader::parseAttributeBlock() {
for (unsigned i = 0, e = Record.size(); i != e; ++i)
Attrs.push_back(MAttributeGroups[Record[i]]);
- MAttributes.push_back(AttributeSet::get(Context, Attrs));
+ MAttributes.push_back(AttributeList::get(Context, Attrs));
Attrs.clear();
break;
}
@@ -1262,6 +1324,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::ReturnsTwice;
case bitc::ATTR_KIND_S_EXT:
return Attribute::SExt;
+ case bitc::ATTR_KIND_SPECULATABLE:
+ return Attribute::Speculatable;
case bitc::ATTR_KIND_STACK_ALIGNMENT:
return Attribute::StackAlignment;
case bitc::ATTR_KIND_STACK_PROTECT:
@@ -1391,7 +1455,7 @@ Error BitcodeReader::parseAttributeGroupBlock() {
}
}
- MAttributeGroups[GrpID] = AttributeSet::get(Context, Idx, B);
+ MAttributeGroups[GrpID] = AttributeList::get(Context, Idx, B);
break;
}
}
@@ -1677,6 +1741,44 @@ Error BitcodeReader::parseOperandBundleTags() {
}
}
+Error BitcodeReader::parseSyncScopeNames() {
+ if (Stream.EnterSubBlock(bitc::SYNC_SCOPE_NAMES_BLOCK_ID))
+ return error("Invalid record");
+
+ if (!SSIDs.empty())
+ return error("Invalid multiple synchronization scope names blocks");
+
+ SmallVector<uint64_t, 64> Record;
+ while (true) {
+ BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+ switch (Entry.Kind) {
+ case BitstreamEntry::SubBlock: // Handled for us already.
+ case BitstreamEntry::Error:
+ return error("Malformed block");
+ case BitstreamEntry::EndBlock:
+ if (SSIDs.empty())
+ return error("Invalid empty synchronization scope names block");
+ return Error::success();
+ case BitstreamEntry::Record:
+ // The interesting case.
+ break;
+ }
+
+ // Synchronization scope names are implicitly mapped to synchronization
+ // scope IDs by their order.
+
+ if (Stream.readRecord(Entry.ID, Record) != bitc::SYNC_SCOPE_NAME)
+ return error("Invalid record");
+
+ SmallString<16> SSN;
+ if (convertToString(Record, 0, SSN))
+ return error("Invalid record");
+
+ SSIDs.push_back(Context.getOrInsertSyncScopeID(SSN));
+ Record.clear();
+ }
+}
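
The SSIDs table built here backs getDecodedSyncScopeID, declared earlier in this patch but defined outside the hunks shown; given the ordering rule above, it plausibly reduces to an index lookup with the two fixed scopes passed through:

// Hedged sketch only; the real definition is outside the hunks shown here.
SyncScope::ID BitcodeReader::getDecodedSyncScopeID(unsigned Val) {
  if (Val == SyncScope::SingleThread || Val == SyncScope::System)
    return SyncScope::ID(Val); // fixed, always-available scopes
  if (Val >= SSIDs.size())
    return SyncScope::System; // conservatively map unknown scopes
  return SSIDs[Val];
}
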
+
/// Associate a value with its name from the given index in the provided record.
Expected<Value *> BitcodeReader::recordValue(SmallVectorImpl<uint64_t> &Record,
unsigned NameIndex, Triple &TT) {
@@ -1725,6 +1827,54 @@ static uint64_t jumpToValueSymbolTable(uint64_t Offset,
return CurrentBit;
}
+void BitcodeReader::setDeferredFunctionInfo(unsigned FuncBitcodeOffsetDelta,
+ Function *F,
+ ArrayRef<uint64_t> Record) {
+ // Note that we subtract 1 here because the offset is relative to one word
+ // before the start of the identification or module block, which was
+ // historically always the start of the regular bitcode header.
+ uint64_t FuncWordOffset = Record[1] - 1;
+ uint64_t FuncBitOffset = FuncWordOffset * 32;
+ DeferredFunctionInfo[F] = FuncBitOffset + FuncBitcodeOffsetDelta;
+ // Set the LastFunctionBlockBit to point to the last function block.
+ // Later when parsing is resumed after function materialization,
+ // we can simply skip that last function block.
+ if (FuncBitOffset > LastFunctionBlockBit)
+ LastFunctionBlockBit = FuncBitOffset;
+}
+
+/// Read a new-style GlobalValue symbol table.
+Error BitcodeReader::parseGlobalValueSymbolTable() {
+ unsigned FuncBitcodeOffsetDelta =
+ Stream.getAbbrevIDWidth() + bitc::BlockIDWidth;
+
+ if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
+ return error("Invalid record");
+
+ SmallVector<uint64_t, 64> Record;
+ while (true) {
+ BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
+
+ switch (Entry.Kind) {
+ case BitstreamEntry::SubBlock:
+ case BitstreamEntry::Error:
+ return error("Malformed block");
+ case BitstreamEntry::EndBlock:
+ return Error::success();
+ case BitstreamEntry::Record:
+ break;
+ }
+
+ Record.clear();
+ switch (Stream.readRecord(Entry.ID, Record)) {
+ case bitc::VST_CODE_FNENTRY: // [valueid, offset]
+ setDeferredFunctionInfo(FuncBitcodeOffsetDelta,
+ cast<Function>(ValueList[Record[0]]), Record);
+ break;
+ }
+ }
+}
+
/// Parse the value symbol table at either the current parsing location or
/// at the given bit offset if provided.
Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
@@ -1732,8 +1882,18 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
// Pass in the Offset to distinguish between calling for the module-level
// VST (where we want to jump to the VST offset) and the function-level
// VST (where we don't).
- if (Offset > 0)
+ if (Offset > 0) {
CurrentBit = jumpToValueSymbolTable(Offset, Stream);
+ // If this module uses a string table, read this as a module-level VST.
+ if (UseStrtab) {
+ if (Error Err = parseGlobalValueSymbolTable())
+ return Err;
+ Stream.JumpToBit(CurrentBit);
+ return Error::success();
+ }
+ // Otherwise, the VST will be in a similar format to a function-level VST,
+ // and will contain symbol names.
+ }
// Compute the delta between the bitcode indices in the VST (the word offset
// to the word-aligned ENTER_SUBBLOCK for the function block, and that
@@ -1794,29 +1954,10 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
return Err;
Value *V = ValOrErr.get();
- auto *GO = dyn_cast<GlobalObject>(V);
- if (!GO) {
- // If this is an alias, need to get the actual Function object
- // it aliases, in order to set up the DeferredFunctionInfo entry below.
- auto *GA = dyn_cast<GlobalAlias>(V);
- if (GA)
- GO = GA->getBaseObject();
- assert(GO);
- }
-
- // Note that we subtract 1 here because the offset is relative to one word
- // before the start of the identification or module block, which was
- // historically always the start of the regular bitcode header.
- uint64_t FuncWordOffset = Record[1] - 1;
- Function *F = dyn_cast<Function>(GO);
- assert(F);
- uint64_t FuncBitOffset = FuncWordOffset * 32;
- DeferredFunctionInfo[F] = FuncBitOffset + FuncBitcodeOffsetDelta;
- // Set the LastFunctionBlockBit to point to the last function block.
- // Later when parsing is resumed after function materialization,
- // we can simply skip that last function block.
- if (FuncBitOffset > LastFunctionBlockBit)
- LastFunctionBlockBit = FuncBitOffset;
+ // Ignore function offsets emitted for aliases of functions in older
+ // versions of LLVM.
+ if (auto *F = dyn_cast<Function>(V))
+ setDeferredFunctionInfo(FuncBitcodeOffsetDelta, F, Record);
break;
}
case bitc::VST_CODE_BBENTRY: {
@@ -2501,6 +2642,16 @@ Error BitcodeReader::materializeMetadata() {
if (Error Err = MDLoader->parseModuleMetadata())
return Err;
}
+
+ // Upgrade "Linker Options" module flag to "llvm.linker.options" module-level
+ // metadata.
+ if (Metadata *Val = TheModule->getModuleFlag("Linker Options")) {
+ NamedMDNode *LinkerOpts =
+ TheModule->getOrInsertNamedMetadata("llvm.linker.options");
+ for (const MDOperand &MDOptions : cast<MDNode>(Val)->operands())
+ LinkerOpts->addOperand(cast<MDNode>(MDOptions));
+ }
+
DeferredMetadataInfo.clear();
return Error::success();
}
@@ -2539,6 +2690,7 @@ Error BitcodeReader::globalCleanup() {
// Look for intrinsic functions which need to be upgraded at some point
for (Function &F : *TheModule) {
+ MDLoader->upgradeDebugIntrinsics(F);
Function *NewFn;
if (UpgradeIntrinsicFunction(&F, NewFn))
UpgradedIntrinsics[&F] = NewFn;
@@ -2607,6 +2759,272 @@ bool BitcodeReaderBase::readBlockInfo() {
return false;
}
+Error BitcodeReader::parseComdatRecord(ArrayRef<uint64_t> Record) {
+ // v1: [selection_kind, name]
+ // v2: [strtab_offset, strtab_size, selection_kind]
+ StringRef Name;
+ std::tie(Name, Record) = readNameFromStrtab(Record);
+
+ if (Record.size() < 1)
+ return error("Invalid record");
+ Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]);
+ std::string OldFormatName;
+ if (!UseStrtab) {
+ if (Record.size() < 2)
+ return error("Invalid record");
+ unsigned ComdatNameSize = Record[1];
+ OldFormatName.reserve(ComdatNameSize);
+ for (unsigned i = 0; i != ComdatNameSize; ++i)
+ OldFormatName += (char)Record[2 + i];
+ Name = OldFormatName;
+ }
+ Comdat *C = TheModule->getOrInsertComdat(Name);
+ C->setSelectionKind(SK);
+ ComdatList.push_back(C);
+ return Error::success();
+}
+
+Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
+ // v1: [pointer type, isconst, initid, linkage, alignment, section,
+ // visibility, threadlocal, unnamed_addr, externally_initialized,
+ // dllstorageclass, comdat, attributes] (name in VST)
+ // v2: [strtab_offset, strtab_size, v1]
+ StringRef Name;
+ std::tie(Name, Record) = readNameFromStrtab(Record);
+
+ if (Record.size() < 6)
+ return error("Invalid record");
+ Type *Ty = getTypeByID(Record[0]);
+ if (!Ty)
+ return error("Invalid record");
+ bool isConstant = Record[1] & 1;
+ bool explicitType = Record[1] & 2;
+ unsigned AddressSpace;
+ if (explicitType) {
+ AddressSpace = Record[1] >> 2;
+ } else {
+ if (!Ty->isPointerTy())
+ return error("Invalid type for value");
+ AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
+ Ty = cast<PointerType>(Ty)->getElementType();
+ }
+
+ uint64_t RawLinkage = Record[3];
+ GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
+ unsigned Alignment;
+ if (Error Err = parseAlignmentValue(Record[4], Alignment))
+ return Err;
+ std::string Section;
+ if (Record[5]) {
+ if (Record[5] - 1 >= SectionTable.size())
+ return error("Invalid ID");
+ Section = SectionTable[Record[5] - 1];
+ }
+ GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
+ // Local linkage must have default visibility.
+ if (Record.size() > 6 && !GlobalValue::isLocalLinkage(Linkage))
+ // FIXME: Change to an error if non-default in 4.0.
+ Visibility = getDecodedVisibility(Record[6]);
+
+ GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal;
+ if (Record.size() > 7)
+ TLM = getDecodedThreadLocalMode(Record[7]);
+
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
+ if (Record.size() > 8)
+ UnnamedAddr = getDecodedUnnamedAddrType(Record[8]);
+
+ bool ExternallyInitialized = false;
+ if (Record.size() > 9)
+ ExternallyInitialized = Record[9];
+
+ GlobalVariable *NewGV =
+ new GlobalVariable(*TheModule, Ty, isConstant, Linkage, nullptr, Name,
+ nullptr, TLM, AddressSpace, ExternallyInitialized);
+ NewGV->setAlignment(Alignment);
+ if (!Section.empty())
+ NewGV->setSection(Section);
+ NewGV->setVisibility(Visibility);
+ NewGV->setUnnamedAddr(UnnamedAddr);
+
+ if (Record.size() > 10)
+ NewGV->setDLLStorageClass(getDecodedDLLStorageClass(Record[10]));
+ else
+ upgradeDLLImportExportLinkage(NewGV, RawLinkage);
+
+ ValueList.push_back(NewGV);
+
+ // Remember which value to use for the global initializer.
+ if (unsigned InitID = Record[2])
+ GlobalInits.push_back(std::make_pair(NewGV, InitID - 1));
+
+ if (Record.size() > 11) {
+ if (unsigned ComdatID = Record[11]) {
+ if (ComdatID > ComdatList.size())
+ return error("Invalid global variable comdat ID");
+ NewGV->setComdat(ComdatList[ComdatID - 1]);
+ }
+ } else if (hasImplicitComdat(RawLinkage)) {
+ NewGV->setComdat(reinterpret_cast<Comdat *>(1));
+ }
+
+ if (Record.size() > 12) {
+ auto AS = getAttributes(Record[12]).getFnAttributes();
+ NewGV->setAttributes(AS);
+ }
+ return Error::success();
+}
+
+Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
+ // v1: [type, callingconv, isproto, linkage, paramattr, alignment, section,
+ // visibility, gc, unnamed_addr, prologuedata, dllstorageclass, comdat,
+ // prefixdata] (name in VST)
+ // v2: [strtab_offset, strtab_size, v1]
+ StringRef Name;
+ std::tie(Name, Record) = readNameFromStrtab(Record);
+
+ if (Record.size() < 8)
+ return error("Invalid record");
+ Type *Ty = getTypeByID(Record[0]);
+ if (!Ty)
+ return error("Invalid record");
+ if (auto *PTy = dyn_cast<PointerType>(Ty))
+ Ty = PTy->getElementType();
+ auto *FTy = dyn_cast<FunctionType>(Ty);
+ if (!FTy)
+ return error("Invalid type for value");
+ auto CC = static_cast<CallingConv::ID>(Record[1]);
+ if (CC & ~CallingConv::MaxID)
+ return error("Invalid calling convention ID");
+
+ Function *Func =
+ Function::Create(FTy, GlobalValue::ExternalLinkage, Name, TheModule);
+
+ Func->setCallingConv(CC);
+ bool isProto = Record[2];
+ uint64_t RawLinkage = Record[3];
+ Func->setLinkage(getDecodedLinkage(RawLinkage));
+ Func->setAttributes(getAttributes(Record[4]));
+
+ unsigned Alignment;
+ if (Error Err = parseAlignmentValue(Record[5], Alignment))
+ return Err;
+ Func->setAlignment(Alignment);
+ if (Record[6]) {
+ if (Record[6] - 1 >= SectionTable.size())
+ return error("Invalid ID");
+ Func->setSection(SectionTable[Record[6] - 1]);
+ }
+ // Local linkage must have default visibility.
+ if (!Func->hasLocalLinkage())
+ // FIXME: Change to an error if non-default in 4.0.
+ Func->setVisibility(getDecodedVisibility(Record[7]));
+ if (Record.size() > 8 && Record[8]) {
+ if (Record[8] - 1 >= GCTable.size())
+ return error("Invalid ID");
+ Func->setGC(GCTable[Record[8] - 1]);
+ }
+ GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
+ if (Record.size() > 9)
+ UnnamedAddr = getDecodedUnnamedAddrType(Record[9]);
+ Func->setUnnamedAddr(UnnamedAddr);
+ if (Record.size() > 10 && Record[10] != 0)
+ FunctionPrologues.push_back(std::make_pair(Func, Record[10] - 1));
+
+ if (Record.size() > 11)
+ Func->setDLLStorageClass(getDecodedDLLStorageClass(Record[11]));
+ else
+ upgradeDLLImportExportLinkage(Func, RawLinkage);
+
+ if (Record.size() > 12) {
+ if (unsigned ComdatID = Record[12]) {
+ if (ComdatID > ComdatList.size())
+ return error("Invalid function comdat ID");
+ Func->setComdat(ComdatList[ComdatID - 1]);
+ }
+ } else if (hasImplicitComdat(RawLinkage)) {
+ Func->setComdat(reinterpret_cast<Comdat *>(1));
+ }
+
+ if (Record.size() > 13 && Record[13] != 0)
+ FunctionPrefixes.push_back(std::make_pair(Func, Record[13] - 1));
+
+ if (Record.size() > 14 && Record[14] != 0)
+ FunctionPersonalityFns.push_back(std::make_pair(Func, Record[14] - 1));
+
+ ValueList.push_back(Func);
+
+ // If this is a function with a body, remember the prototype we are
+ // creating now, so that we can match up the body with them later.
+ if (!isProto) {
+ Func->setIsMaterializable(true);
+ FunctionsWithBodies.push_back(Func);
+ DeferredFunctionInfo[Func] = 0;
+ }
+ return Error::success();
+}
+
+Error BitcodeReader::parseGlobalIndirectSymbolRecord(
+ unsigned BitCode, ArrayRef<uint64_t> Record) {
+ // v1 ALIAS_OLD: [alias type, aliasee val#, linkage] (name in VST)
+ // v1 ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility,
+ // dllstorageclass] (name in VST)
+ // v1 IFUNC: [alias type, addrspace, aliasee val#, linkage,
+ // visibility, dllstorageclass] (name in VST)
+ // v2: [strtab_offset, strtab_size, v1]
+ StringRef Name;
+ std::tie(Name, Record) = readNameFromStrtab(Record);
+
+ bool NewRecord = BitCode != bitc::MODULE_CODE_ALIAS_OLD;
+ if (Record.size() < (3 + (unsigned)NewRecord))
+ return error("Invalid record");
+ unsigned OpNum = 0;
+ Type *Ty = getTypeByID(Record[OpNum++]);
+ if (!Ty)
+ return error("Invalid record");
+
+ unsigned AddrSpace;
+ if (!NewRecord) {
+ auto *PTy = dyn_cast<PointerType>(Ty);
+ if (!PTy)
+ return error("Invalid type for value");
+ Ty = PTy->getElementType();
+ AddrSpace = PTy->getAddressSpace();
+ } else {
+ AddrSpace = Record[OpNum++];
+ }
+
+ auto Val = Record[OpNum++];
+ auto Linkage = Record[OpNum++];
+ GlobalIndirectSymbol *NewGA;
+ if (BitCode == bitc::MODULE_CODE_ALIAS ||
+ BitCode == bitc::MODULE_CODE_ALIAS_OLD)
+ NewGA = GlobalAlias::create(Ty, AddrSpace, getDecodedLinkage(Linkage), Name,
+ TheModule);
+ else
+ NewGA = GlobalIFunc::create(Ty, AddrSpace, getDecodedLinkage(Linkage), Name,
+ nullptr, TheModule);
+ // Old bitcode files didn't have visibility field.
+ // Local linkage must have default visibility.
+ if (OpNum != Record.size()) {
+ auto VisInd = OpNum++;
+ if (!NewGA->hasLocalLinkage())
+ // FIXME: Change to an error if non-default in 4.0.
+ NewGA->setVisibility(getDecodedVisibility(Record[VisInd]));
+ }
+ if (OpNum != Record.size())
+ NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++]));
+ else
+ upgradeDLLImportExportLinkage(NewGA, Linkage);
+ if (OpNum != Record.size())
+ NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++]));
+ if (OpNum != Record.size())
+ NewGA->setUnnamedAddr(getDecodedUnnamedAddrType(Record[OpNum++]));
+ ValueList.push_back(NewGA);
+ IndirectSymbolInits.push_back(std::make_pair(NewGA, Val));
+ return Error::success();
+}
+
Error BitcodeReader::parseModule(uint64_t ResumeBit,
bool ShouldLazyLoadMetadata) {
if (ResumeBit)
@@ -2615,8 +3033,6 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
- std::vector<std::string> SectionTable;
- std::vector<std::string> GCTable;
// Read all the records for this module.
while (true) {
@@ -2750,6 +3166,10 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
if (Error Err = parseOperandBundleTags())
return Err;
break;
+ case bitc::SYNC_SCOPE_NAMES_BLOCK_ID:
+ if (Error Err = parseSyncScopeNames())
+ return Err;
+ break;
}
continue;
@@ -2762,21 +3182,11 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
auto BitCode = Stream.readRecord(Entry.ID, Record);
switch (BitCode) {
default: break; // Default behavior, ignore unknown content.
- case bitc::MODULE_CODE_VERSION: { // VERSION: [version#]
- if (Record.size() < 1)
- return error("Invalid record");
- // Only version #0 and #1 are supported so far.
- unsigned module_version = Record[0];
- switch (module_version) {
- default:
- return error("Invalid value");
- case 0:
- UseRelativeIDs = false;
- break;
- case 1:
- UseRelativeIDs = true;
- break;
- }
+ case bitc::MODULE_CODE_VERSION: {
+ Expected<unsigned> VersionOrErr = parseVersionRecord(Record);
+ if (!VersionOrErr)
+ return VersionOrErr.takeError();
+ UseRelativeIDs = *VersionOrErr >= 1;
break;
}
case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
@@ -2822,249 +3232,28 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
GCTable.push_back(S);
break;
}
- case bitc::MODULE_CODE_COMDAT: { // COMDAT: [selection_kind, name]
- if (Record.size() < 2)
- return error("Invalid record");
- Comdat::SelectionKind SK = getDecodedComdatSelectionKind(Record[0]);
- unsigned ComdatNameSize = Record[1];
- std::string ComdatName;
- ComdatName.reserve(ComdatNameSize);
- for (unsigned i = 0; i != ComdatNameSize; ++i)
- ComdatName += (char)Record[2 + i];
- Comdat *C = TheModule->getOrInsertComdat(ComdatName);
- C->setSelectionKind(SK);
- ComdatList.push_back(C);
- break;
- }
- // GLOBALVAR: [pointer type, isconst, initid,
- // linkage, alignment, section, visibility, threadlocal,
- // unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
+ case bitc::MODULE_CODE_COMDAT: {
+ if (Error Err = parseComdatRecord(Record))
+ return Err;
+ break;
+ }
case bitc::MODULE_CODE_GLOBALVAR: {
- if (Record.size() < 6)
- return error("Invalid record");
- Type *Ty = getTypeByID(Record[0]);
- if (!Ty)
- return error("Invalid record");
- bool isConstant = Record[1] & 1;
- bool explicitType = Record[1] & 2;
- unsigned AddressSpace;
- if (explicitType) {
- AddressSpace = Record[1] >> 2;
- } else {
- if (!Ty->isPointerTy())
- return error("Invalid type for value");
- AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
- Ty = cast<PointerType>(Ty)->getElementType();
- }
-
- uint64_t RawLinkage = Record[3];
- GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
- unsigned Alignment;
- if (Error Err = parseAlignmentValue(Record[4], Alignment))
+ if (Error Err = parseGlobalVarRecord(Record))
return Err;
- std::string Section;
- if (Record[5]) {
- if (Record[5]-1 >= SectionTable.size())
- return error("Invalid ID");
- Section = SectionTable[Record[5]-1];
- }
- GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
- // Local linkage must have default visibility.
- if (Record.size() > 6 && !GlobalValue::isLocalLinkage(Linkage))
- // FIXME: Change to an error if non-default in 4.0.
- Visibility = getDecodedVisibility(Record[6]);
-
- GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal;
- if (Record.size() > 7)
- TLM = getDecodedThreadLocalMode(Record[7]);
-
- GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
- if (Record.size() > 8)
- UnnamedAddr = getDecodedUnnamedAddrType(Record[8]);
-
- bool ExternallyInitialized = false;
- if (Record.size() > 9)
- ExternallyInitialized = Record[9];
-
- GlobalVariable *NewGV =
- new GlobalVariable(*TheModule, Ty, isConstant, Linkage, nullptr, "", nullptr,
- TLM, AddressSpace, ExternallyInitialized);
- NewGV->setAlignment(Alignment);
- if (!Section.empty())
- NewGV->setSection(Section);
- NewGV->setVisibility(Visibility);
- NewGV->setUnnamedAddr(UnnamedAddr);
-
- if (Record.size() > 10)
- NewGV->setDLLStorageClass(getDecodedDLLStorageClass(Record[10]));
- else
- upgradeDLLImportExportLinkage(NewGV, RawLinkage);
-
- ValueList.push_back(NewGV);
-
- // Remember which value to use for the global initializer.
- if (unsigned InitID = Record[2])
- GlobalInits.push_back(std::make_pair(NewGV, InitID-1));
-
- if (Record.size() > 11) {
- if (unsigned ComdatID = Record[11]) {
- if (ComdatID > ComdatList.size())
- return error("Invalid global variable comdat ID");
- NewGV->setComdat(ComdatList[ComdatID - 1]);
- }
- } else if (hasImplicitComdat(RawLinkage)) {
- NewGV->setComdat(reinterpret_cast<Comdat *>(1));
- }
-
break;
}
- // FUNCTION: [type, callingconv, isproto, linkage, paramattr,
- // alignment, section, visibility, gc, unnamed_addr,
- // prologuedata, dllstorageclass, comdat, prefixdata]
case bitc::MODULE_CODE_FUNCTION: {
- if (Record.size() < 8)
- return error("Invalid record");
- Type *Ty = getTypeByID(Record[0]);
- if (!Ty)
- return error("Invalid record");
- if (auto *PTy = dyn_cast<PointerType>(Ty))
- Ty = PTy->getElementType();
- auto *FTy = dyn_cast<FunctionType>(Ty);
- if (!FTy)
- return error("Invalid type for value");
- auto CC = static_cast<CallingConv::ID>(Record[1]);
- if (CC & ~CallingConv::MaxID)
- return error("Invalid calling convention ID");
-
- Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
- "", TheModule);
-
- Func->setCallingConv(CC);
- bool isProto = Record[2];
- uint64_t RawLinkage = Record[3];
- Func->setLinkage(getDecodedLinkage(RawLinkage));
- Func->setAttributes(getAttributes(Record[4]));
-
- unsigned Alignment;
- if (Error Err = parseAlignmentValue(Record[5], Alignment))
+ if (Error Err = parseFunctionRecord(Record))
return Err;
- Func->setAlignment(Alignment);
- if (Record[6]) {
- if (Record[6]-1 >= SectionTable.size())
- return error("Invalid ID");
- Func->setSection(SectionTable[Record[6]-1]);
- }
- // Local linkage must have default visibility.
- if (!Func->hasLocalLinkage())
- // FIXME: Change to an error if non-default in 4.0.
- Func->setVisibility(getDecodedVisibility(Record[7]));
- if (Record.size() > 8 && Record[8]) {
- if (Record[8]-1 >= GCTable.size())
- return error("Invalid ID");
- Func->setGC(GCTable[Record[8] - 1]);
- }
- GlobalValue::UnnamedAddr UnnamedAddr = GlobalValue::UnnamedAddr::None;
- if (Record.size() > 9)
- UnnamedAddr = getDecodedUnnamedAddrType(Record[9]);
- Func->setUnnamedAddr(UnnamedAddr);
- if (Record.size() > 10 && Record[10] != 0)
- FunctionPrologues.push_back(std::make_pair(Func, Record[10]-1));
-
- if (Record.size() > 11)
- Func->setDLLStorageClass(getDecodedDLLStorageClass(Record[11]));
- else
- upgradeDLLImportExportLinkage(Func, RawLinkage);
-
- if (Record.size() > 12) {
- if (unsigned ComdatID = Record[12]) {
- if (ComdatID > ComdatList.size())
- return error("Invalid function comdat ID");
- Func->setComdat(ComdatList[ComdatID - 1]);
- }
- } else if (hasImplicitComdat(RawLinkage)) {
- Func->setComdat(reinterpret_cast<Comdat *>(1));
- }
-
- if (Record.size() > 13 && Record[13] != 0)
- FunctionPrefixes.push_back(std::make_pair(Func, Record[13]-1));
-
- if (Record.size() > 14 && Record[14] != 0)
- FunctionPersonalityFns.push_back(std::make_pair(Func, Record[14] - 1));
-
- ValueList.push_back(Func);
-
- // If this is a function with a body, remember the prototype we are
- // creating now, so that we can match up the body with them later.
- if (!isProto) {
- Func->setIsMaterializable(true);
- FunctionsWithBodies.push_back(Func);
- DeferredFunctionInfo[Func] = 0;
- }
break;
}
- // ALIAS: [alias type, addrspace, aliasee val#, linkage]
- // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass]
- // IFUNC: [alias type, addrspace, aliasee val#, linkage, visibility, dllstorageclass]
case bitc::MODULE_CODE_IFUNC:
case bitc::MODULE_CODE_ALIAS:
case bitc::MODULE_CODE_ALIAS_OLD: {
- bool NewRecord = BitCode != bitc::MODULE_CODE_ALIAS_OLD;
- if (Record.size() < (3 + (unsigned)NewRecord))
- return error("Invalid record");
- unsigned OpNum = 0;
- Type *Ty = getTypeByID(Record[OpNum++]);
- if (!Ty)
- return error("Invalid record");
-
- unsigned AddrSpace;
- if (!NewRecord) {
- auto *PTy = dyn_cast<PointerType>(Ty);
- if (!PTy)
- return error("Invalid type for value");
- Ty = PTy->getElementType();
- AddrSpace = PTy->getAddressSpace();
- } else {
- AddrSpace = Record[OpNum++];
- }
-
- auto Val = Record[OpNum++];
- auto Linkage = Record[OpNum++];
- GlobalIndirectSymbol *NewGA;
- if (BitCode == bitc::MODULE_CODE_ALIAS ||
- BitCode == bitc::MODULE_CODE_ALIAS_OLD)
- NewGA = GlobalAlias::create(Ty, AddrSpace, getDecodedLinkage(Linkage),
- "", TheModule);
- else
- NewGA = GlobalIFunc::create(Ty, AddrSpace, getDecodedLinkage(Linkage),
- "", nullptr, TheModule);
- // Old bitcode files didn't have visibility field.
- // Local linkage must have default visibility.
- if (OpNum != Record.size()) {
- auto VisInd = OpNum++;
- if (!NewGA->hasLocalLinkage())
- // FIXME: Change to an error if non-default in 4.0.
- NewGA->setVisibility(getDecodedVisibility(Record[VisInd]));
- }
- if (OpNum != Record.size())
- NewGA->setDLLStorageClass(getDecodedDLLStorageClass(Record[OpNum++]));
- else
- upgradeDLLImportExportLinkage(NewGA, Linkage);
- if (OpNum != Record.size())
- NewGA->setThreadLocalMode(getDecodedThreadLocalMode(Record[OpNum++]));
- if (OpNum != Record.size())
- NewGA->setUnnamedAddr(getDecodedUnnamedAddrType(Record[OpNum++]));
- ValueList.push_back(NewGA);
- IndirectSymbolInits.push_back(std::make_pair(NewGA, Val));
- break;
- }
- /// MODULE_CODE_PURGEVALS: [numvals]
- case bitc::MODULE_CODE_PURGEVALS:
- // Trim down the value list to the specified size.
- if (Record.size() < 1 || Record[0] > ValueList.size())
- return error("Invalid record");
- ValueList.shrinkTo(Record[0]);
+ if (Error Err = parseGlobalIndirectSymbolRecord(BitCode, Record))
+ return Err;
break;
+ }
/// MODULE_CODE_VSTOFFSET: [offset]
case bitc::MODULE_CODE_VSTOFFSET:
if (Record.size() < 1)
@@ -3840,7 +4029,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (Record.size() < 4)
return error("Invalid record");
unsigned OpNum = 0;
- AttributeSet PAL = getAttributes(Record[OpNum++]);
+ AttributeList PAL = getAttributes(Record[OpNum++]);
unsigned CCInfo = Record[OpNum++];
BasicBlock *NormalBB = getBasicBlock(Record[OpNum++]);
BasicBlock *UnwindBB = getBasicBlock(Record[OpNum++]);
@@ -4017,7 +4206,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
}
if (!Ty || !Size)
return error("Invalid record");
- AllocaInst *AI = new AllocaInst(Ty, Size, Align);
+
+ // FIXME: Make this an optional field.
+ const DataLayout &DL = TheModule->getDataLayout();
+ unsigned AS = DL.getAllocaAddrSpace();
+
+ AllocaInst *AI = new AllocaInst(Ty, AS, Size, Align);
AI->setUsedWithInAlloca(InAlloca);
AI->setSwiftError(SwiftError);
I = AI;
@@ -4048,7 +4242,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
break;
}
case bitc::FUNC_CODE_INST_LOADATOMIC: {
- // LOADATOMIC: [opty, op, align, vol, ordering, synchscope]
+ // LOADATOMIC: [opty, op, align, vol, ordering, ssid]
unsigned OpNum = 0;
Value *Op;
if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
@@ -4070,12 +4264,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
return error("Invalid record");
if (Ordering != AtomicOrdering::NotAtomic && Record[OpNum] == 0)
return error("Invalid record");
- SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 3]);
+ SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]);
unsigned Align;
if (Error Err = parseAlignmentValue(Record[OpNum], Align))
return Err;
- I = new LoadInst(Op, "", Record[OpNum+1], Align, Ordering, SynchScope);
+ I = new LoadInst(Op, "", Record[OpNum+1], Align, Ordering, SSID);
InstructionList.push_back(I);
break;
@@ -4104,7 +4298,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
}
case bitc::FUNC_CODE_INST_STOREATOMIC:
case bitc::FUNC_CODE_INST_STOREATOMIC_OLD: {
- // STOREATOMIC: [ptrty, ptr, val, align, vol, ordering, synchscope]
+ // STOREATOMIC: [ptrty, ptr, val, align, vol, ordering, ssid]
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -4124,20 +4318,20 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
Ordering == AtomicOrdering::Acquire ||
Ordering == AtomicOrdering::AcquireRelease)
return error("Invalid record");
- SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 3]);
+ SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]);
if (Ordering != AtomicOrdering::NotAtomic && Record[OpNum] == 0)
return error("Invalid record");
unsigned Align;
if (Error Err = parseAlignmentValue(Record[OpNum], Align))
return Err;
- I = new StoreInst(Val, Ptr, Record[OpNum+1], Align, Ordering, SynchScope);
+ I = new StoreInst(Val, Ptr, Record[OpNum+1], Align, Ordering, SSID);
InstructionList.push_back(I);
break;
}
case bitc::FUNC_CODE_INST_CMPXCHG_OLD:
case bitc::FUNC_CODE_INST_CMPXCHG: {
- // CMPXCHG:[ptrty, ptr, cmp, new, vol, successordering, synchscope,
+ // CMPXCHG:[ptrty, ptr, cmp, new, vol, successordering, ssid,
// failureordering?, isweak?]
unsigned OpNum = 0;
Value *Ptr, *Cmp, *New;
@@ -4154,7 +4348,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (SuccessOrdering == AtomicOrdering::NotAtomic ||
SuccessOrdering == AtomicOrdering::Unordered)
return error("Invalid record");
- SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 2]);
+ SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 2]);
if (Error Err = typeCheckLoadStoreInst(Cmp->getType(), Ptr->getType()))
return Err;
@@ -4166,7 +4360,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
FailureOrdering = getDecodedOrdering(Record[OpNum + 3]);
I = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering, FailureOrdering,
- SynchScope);
+ SSID);
cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
if (Record.size() < 8) {
@@ -4183,7 +4377,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
break;
}
case bitc::FUNC_CODE_INST_ATOMICRMW: {
- // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, synchscope]
+ // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, ssid]
unsigned OpNum = 0;
Value *Ptr, *Val;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -4200,13 +4394,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (Ordering == AtomicOrdering::NotAtomic ||
Ordering == AtomicOrdering::Unordered)
return error("Invalid record");
- SynchronizationScope SynchScope = getDecodedSynchScope(Record[OpNum + 3]);
- I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
+ SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]);
+ I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID);
cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
InstructionList.push_back(I);
break;
}
- case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
+ case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, ssid]
if (2 != Record.size())
return error("Invalid record");
AtomicOrdering Ordering = getDecodedOrdering(Record[0]);
@@ -4214,8 +4408,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
Ordering == AtomicOrdering::Unordered ||
Ordering == AtomicOrdering::Monotonic)
return error("Invalid record");
- SynchronizationScope SynchScope = getDecodedSynchScope(Record[1]);
- I = new FenceInst(Context, Ordering, SynchScope);
+ SyncScope::ID SSID = getDecodedSyncScopeID(Record[1]);
+ I = new FenceInst(Context, Ordering, SSID);
InstructionList.push_back(I);
break;
}
@@ -4225,7 +4419,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
return error("Invalid record");
unsigned OpNum = 0;
- AttributeSet PAL = getAttributes(Record[OpNum++]);
+ AttributeList PAL = getAttributes(Record[OpNum++]);
unsigned CCInfo = Record[OpNum++];
FastMathFlags FMF;
@@ -4343,11 +4537,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
// Add instruction to end of current BB. If there is no current BB, reject
// this file.
if (!CurBB) {
- delete I;
+ I->deleteValue();
return error("Invalid instruction with no BB");
}
if (!OperandBundles.empty()) {
- delete I;
+ I->deleteValue();
return error("Operand bundles found with no consumer");
}
CurBB->getInstList().push_back(I);
@@ -4411,6 +4605,14 @@ Error BitcodeReader::findFunctionInStream(
return Error::success();
}
+SyncScope::ID BitcodeReader::getDecodedSyncScopeID(unsigned Val) {
+ if (Val == SyncScope::SingleThread || Val == SyncScope::System)
+ return SyncScope::ID(Val);
+ if (Val >= SSIDs.size())
+ return SyncScope::System; // Map unknown synchronization scopes to system.
+ return SSIDs[Val];
+}
+
//===----------------------------------------------------------------------===//
// GVMaterializer implementation
//===----------------------------------------------------------------------===//
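
The getDecodedSyncScopeID helper added just above passes the two fixed scopes through unchanged and conservatively folds any unknown scope to the system scope. A minimal standalone rendering of that mapping (plain C++ with stand-in enum values, not the LLVM API):

#include <vector>

// Stand-ins for SyncScope::SingleThread (0) and SyncScope::System (1).
enum ScopeID : unsigned { SingleThread = 0, System = 1 };

// Fixed IDs pass through; an index past the file-local table is treated
// as an unknown scope and conservatively widened to system-wide.
unsigned decodeSyncScopeID(unsigned Val, const std::vector<unsigned> &SSIDs) {
  if (Val == SingleThread || Val == System)
    return Val;
  if (Val >= SSIDs.size())
    return System;
  return SSIDs[Val];
}
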
@@ -4540,14 +4742,37 @@ std::vector<StructType *> BitcodeReader::getIdentifiedStructTypes() const {
}
ModuleSummaryIndexBitcodeReader::ModuleSummaryIndexBitcodeReader(
- BitstreamCursor Cursor, ModuleSummaryIndex &TheIndex)
- : BitcodeReaderBase(std::move(Cursor)), TheIndex(TheIndex) {}
-
-std::pair<GlobalValue::GUID, GlobalValue::GUID>
-ModuleSummaryIndexBitcodeReader::getGUIDFromValueId(unsigned ValueId) {
- auto VGI = ValueIdToCallGraphGUIDMap.find(ValueId);
- assert(VGI != ValueIdToCallGraphGUIDMap.end());
- return VGI->second;
+ BitstreamCursor Cursor, StringRef Strtab, ModuleSummaryIndex &TheIndex,
+ StringRef ModulePath, unsigned ModuleId)
+ : BitcodeReaderBase(std::move(Cursor), Strtab), TheIndex(TheIndex),
+ ModulePath(ModulePath), ModuleId(ModuleId) {}
+
+ModuleSummaryIndex::ModuleInfo *
+ModuleSummaryIndexBitcodeReader::addThisModule() {
+ return TheIndex.addModule(ModulePath, ModuleId);
+}
+
+std::pair<ValueInfo, GlobalValue::GUID>
+ModuleSummaryIndexBitcodeReader::getValueInfoFromValueId(unsigned ValueId) {
+ auto VGI = ValueIdToValueInfoMap[ValueId];
+ assert(VGI.first);
+ return VGI;
+}
+
+void ModuleSummaryIndexBitcodeReader::setValueGUID(
+ uint64_t ValueID, StringRef ValueName, GlobalValue::LinkageTypes Linkage,
+ StringRef SourceFileName) {
+ std::string GlobalId =
+ GlobalValue::getGlobalIdentifier(ValueName, Linkage, SourceFileName);
+ auto ValueGUID = GlobalValue::getGUID(GlobalId);
+ auto OriginalNameID = ValueGUID;
+ if (GlobalValue::isLocalLinkage(Linkage))
+ OriginalNameID = GlobalValue::getGUID(ValueName);
+ if (PrintSummaryGUIDs)
+ dbgs() << "GUID " << ValueGUID << "(" << OriginalNameID << ") is "
+ << ValueName << "\n";
+ ValueIdToValueInfoMap[ValueID] =
+ std::make_pair(TheIndex.getOrInsertValueInfo(ValueGUID), OriginalNameID);
}
// Specialized value symbol table parser used when reading module index
@@ -4556,6 +4781,10 @@ ModuleSummaryIndexBitcodeReader::getGUIDFromValueId(unsigned ValueId) {
Error ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
uint64_t Offset,
DenseMap<unsigned, GlobalValue::LinkageTypes> &ValueIdToLinkageMap) {
+ // With a strtab the VST is not required to parse the summary.
+ if (UseStrtab)
+ return Error::success();
+
assert(Offset > 0 && "Expected non-zero VST offset");
uint64_t CurrentBit = jumpToValueSymbolTable(Offset, Stream);
@@ -4597,17 +4826,7 @@ Error ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
assert(VLI != ValueIdToLinkageMap.end() &&
"No linkage found for VST entry?");
auto Linkage = VLI->second;
- std::string GlobalId =
- GlobalValue::getGlobalIdentifier(ValueName, Linkage, SourceFileName);
- auto ValueGUID = GlobalValue::getGUID(GlobalId);
- auto OriginalNameID = ValueGUID;
- if (GlobalValue::isLocalLinkage(Linkage))
- OriginalNameID = GlobalValue::getGUID(ValueName);
- if (PrintSummaryGUIDs)
- dbgs() << "GUID " << ValueGUID << "(" << OriginalNameID << ") is "
- << ValueName << "\n";
- ValueIdToCallGraphGUIDMap[ValueID] =
- std::make_pair(ValueGUID, OriginalNameID);
+ setValueGUID(ValueID, ValueName, Linkage, SourceFileName);
ValueName.clear();
break;
}
@@ -4621,18 +4840,7 @@ Error ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
assert(VLI != ValueIdToLinkageMap.end() &&
"No linkage found for VST entry?");
auto Linkage = VLI->second;
- std::string FunctionGlobalId = GlobalValue::getGlobalIdentifier(
- ValueName, VLI->second, SourceFileName);
- auto FunctionGUID = GlobalValue::getGUID(FunctionGlobalId);
- auto OriginalNameID = FunctionGUID;
- if (GlobalValue::isLocalLinkage(Linkage))
- OriginalNameID = GlobalValue::getGUID(ValueName);
- if (PrintSummaryGUIDs)
- dbgs() << "GUID " << FunctionGUID << "(" << OriginalNameID << ") is "
- << ValueName << "\n";
- ValueIdToCallGraphGUIDMap[ValueID] =
- std::make_pair(FunctionGUID, OriginalNameID);
-
+ setValueGUID(ValueID, ValueName, Linkage, SourceFileName);
ValueName.clear();
break;
}
@@ -4642,7 +4850,8 @@ Error ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
GlobalValue::GUID RefGUID = Record[1];
// The "original name", which is the second value of the pair will be
// overriden later by a FS_COMBINED_ORIGINAL_NAME in the combined index.
- ValueIdToCallGraphGUIDMap[ValueID] = std::make_pair(RefGUID, RefGUID);
+ ValueIdToValueInfoMap[ValueID] =
+ std::make_pair(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID);
break;
}
}
@@ -4652,7 +4861,7 @@ Error ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
// Parse just the blocks needed for building the index out of the module.
// At the end of this routine the module Index is populated with a map
// from global value id to GlobalValueSummary objects.
-Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
+Error ModuleSummaryIndexBitcodeReader::parseModule() {
if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
return error("Invalid record");
@@ -4691,6 +4900,7 @@ Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
return error("Invalid record");
break;
case bitc::GLOBALVAL_SUMMARY_BLOCK_ID:
+ case bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID:
assert(!SeenValueSymbolTable &&
"Already read VST when parsing summary block?");
// We might not have a VST if there were no values in the
@@ -4703,7 +4913,7 @@ Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
SeenValueSymbolTable = true;
}
SeenGlobalValSummary = true;
- if (Error Err = parseEntireSummary(ModulePath))
+ if (Error Err = parseEntireSummary(Entry.ID))
return Err;
break;
case bitc::MODULE_STRTAB_BLOCK_ID:
@@ -4719,6 +4929,11 @@ Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
switch (BitCode) {
default:
break; // Default behavior, ignore unknown content.
+ case bitc::MODULE_CODE_VERSION: {
+ if (Error Err = parseVersionRecord(Record).takeError())
+ return Err;
+ break;
+ }
/// MODULE_CODE_SOURCE_FILENAME: [namechar x N]
case bitc::MODULE_CODE_SOURCE_FILENAME: {
SmallString<128> ValueName;
@@ -4731,12 +4946,7 @@ Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
case bitc::MODULE_CODE_HASH: {
if (Record.size() != 5)
return error("Invalid hash length " + Twine(Record.size()).str());
- if (TheIndex.modulePaths().empty())
- // We always seed the index with the module.
- TheIndex.addModulePath(ModulePath, 0);
- if (TheIndex.modulePaths().size() != 1)
- return error("Don't expect multiple modules defined?");
- auto &Hash = TheIndex.modulePaths().begin()->second.second;
+ auto &Hash = addThisModule()->second.second;
int Pos = 0;
for (auto &Val : Record) {
assert(!(Val >> 32) && "Unexpected high bits set");
@@ -4753,37 +4963,26 @@ Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
// was historically always the start of the regular bitcode header.
VSTOffset = Record[0] - 1;
break;
- // GLOBALVAR: [pointer type, isconst, initid,
- // linkage, alignment, section, visibility, threadlocal,
- // unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
- case bitc::MODULE_CODE_GLOBALVAR: {
- if (Record.size() < 6)
- return error("Invalid record");
- uint64_t RawLinkage = Record[3];
- GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
- ValueIdToLinkageMap[ValueId++] = Linkage;
- break;
- }
- // FUNCTION: [type, callingconv, isproto, linkage, paramattr,
- // alignment, section, visibility, gc, unnamed_addr,
- // prologuedata, dllstorageclass, comdat, prefixdata]
- case bitc::MODULE_CODE_FUNCTION: {
- if (Record.size() < 8)
- return error("Invalid record");
- uint64_t RawLinkage = Record[3];
- GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
- ValueIdToLinkageMap[ValueId++] = Linkage;
- break;
- }
- // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility,
- // dllstorageclass]
+ // v1 GLOBALVAR: [pointer type, isconst, initid, linkage, ...]
+ // v1 FUNCTION: [type, callingconv, isproto, linkage, ...]
+ // v1 ALIAS: [alias type, addrspace, aliasee val#, linkage, ...]
+ // v2: [strtab offset, strtab size, v1]
+ case bitc::MODULE_CODE_GLOBALVAR:
+ case bitc::MODULE_CODE_FUNCTION:
case bitc::MODULE_CODE_ALIAS: {
- if (Record.size() < 6)
+ StringRef Name;
+ ArrayRef<uint64_t> GVRecord;
+ std::tie(Name, GVRecord) = readNameFromStrtab(Record);
+ if (GVRecord.size() <= 3)
return error("Invalid record");
- uint64_t RawLinkage = Record[3];
+ uint64_t RawLinkage = GVRecord[3];
GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
- ValueIdToLinkageMap[ValueId++] = Linkage;
+ if (!UseStrtab) {
+ ValueIdToLinkageMap[ValueId++] = Linkage;
+ break;
+ }
+
+ setValueGUID(ValueId++, Name, Linkage, SourceFileName);
break;
}
}
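
The v2 record shape handled above prepends [strtab offset, strtab size] to the old v1 payload, so a module-level record can carry its global's name without a VST entry. A hedged standalone sketch of that split, using standard-library types in place of readNameFromStrtab's StringRef/ArrayRef machinery:

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Returns {name, v1 payload}. An empty name signals a v1 record with no
// string-table reference, mirroring the !UseStrtab path above.
std::pair<std::string, std::vector<uint64_t>>
splitNameFromStrtab(const std::vector<uint64_t> &Record,
                    const std::string &Strtab, bool UseStrtab) {
  if (!UseStrtab || Record.size() < 2)
    return {std::string(), Record};
  std::string Name = Strtab.substr(Record[0], Record[1]);
  return {Name, std::vector<uint64_t>(Record.begin() + 2, Record.end())};
}
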
@@ -4798,7 +4997,7 @@ ModuleSummaryIndexBitcodeReader::makeRefList(ArrayRef<uint64_t> Record) {
std::vector<ValueInfo> Ret;
Ret.reserve(Record.size());
for (uint64_t RefValueId : Record)
- Ret.push_back(getGUIDFromValueId(RefValueId).first);
+ Ret.push_back(getValueInfoFromValueId(RefValueId).first);
return Ret;
}
@@ -4808,23 +5007,22 @@ std::vector<FunctionSummary::EdgeTy> ModuleSummaryIndexBitcodeReader::makeCallLi
Ret.reserve(Record.size());
for (unsigned I = 0, E = Record.size(); I != E; ++I) {
CalleeInfo::HotnessType Hotness = CalleeInfo::HotnessType::Unknown;
- GlobalValue::GUID CalleeGUID = getGUIDFromValueId(Record[I]).first;
+ ValueInfo Callee = getValueInfoFromValueId(Record[I]).first;
if (IsOldProfileFormat) {
I += 1; // Skip old callsitecount field
if (HasProfile)
I += 1; // Skip old profilecount field
} else if (HasProfile)
Hotness = static_cast<CalleeInfo::HotnessType>(Record[++I]);
- Ret.push_back(FunctionSummary::EdgeTy{CalleeGUID, CalleeInfo{Hotness}});
+ Ret.push_back(FunctionSummary::EdgeTy{Callee, CalleeInfo{Hotness}});
}
return Ret;
}
// Eagerly parse the entire summary block. This populates the GlobalValueSummary
// objects in the index.
-Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
- StringRef ModulePath) {
- if (Stream.EnterSubBlock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID))
+Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
+ if (Stream.EnterSubBlock(ID))
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
@@ -4846,8 +5044,16 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
// Keep around the last seen summary to be used when we see an optional
// "OriginalName" attachement.
GlobalValueSummary *LastSeenSummary = nullptr;
- bool Combined = false;
+ GlobalValue::GUID LastSeenGUID = 0;
+
+ // We can expect to see any number of type ID information records before
+ // each function summary record; these variables store the information
+ // collected so far so that it can be used to create the summary object.
std::vector<GlobalValue::GUID> PendingTypeTests;
+ std::vector<FunctionSummary::VFuncId> PendingTypeTestAssumeVCalls,
+ PendingTypeCheckedLoadVCalls;
+ std::vector<FunctionSummary::ConstVCall> PendingTypeTestAssumeConstVCalls,
+ PendingTypeCheckedLoadConstVCalls;
while (true) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -4857,16 +5063,6 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
- // For a per-module index, remove any entries that still have empty
- // summaries. The VST parsing creates entries eagerly for all symbols,
- // but not all have associated summaries (e.g. it doesn't know how to
- // distinguish between VST_CODE_ENTRY for function declarations vs global
- // variables with initializers that end up with a summary). Remove those
- // entries now so that we don't need to rely on the combined index merger
- // to clean them up (especially since that may not run for the first
- // module's index if we merge into that).
- if (!Combined)
- TheIndex.removeEmptySummaryEntries();
return Error::success();
case BitstreamEntry::Record:
// The interesting case.
@@ -4885,6 +5081,13 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
switch (BitCode) {
default: // Default behavior: ignore.
break;
+ case bitc::FS_VALUE_GUID: { // [valueid, refguid]
+ uint64_t ValueID = Record[0];
+ GlobalValue::GUID RefGUID = Record[1];
+ ValueIdToValueInfoMap[ValueID] =
+ std::make_pair(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID);
+ break;
+ }
// FS_PERMODULE: [valueid, flags, instcount, numrefs, numrefs x valueid,
// n x (valueid)]
// FS_PERMODULE_PROFILE: [valueid, flags, instcount, numrefs,
@@ -4914,12 +5117,19 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
IsOldProfileFormat, HasProfile);
auto FS = llvm::make_unique<FunctionSummary>(
Flags, InstCount, std::move(Refs), std::move(Calls),
- std::move(PendingTypeTests));
+ std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls),
+ std::move(PendingTypeCheckedLoadVCalls),
+ std::move(PendingTypeTestAssumeConstVCalls),
+ std::move(PendingTypeCheckedLoadConstVCalls));
PendingTypeTests.clear();
- auto GUID = getGUIDFromValueId(ValueID);
- FS->setModulePath(TheIndex.addModulePath(ModulePath, 0)->first());
- FS->setOriginalName(GUID.second);
- TheIndex.addGlobalValueSummary(GUID.first, std::move(FS));
+ PendingTypeTestAssumeVCalls.clear();
+ PendingTypeCheckedLoadVCalls.clear();
+ PendingTypeTestAssumeConstVCalls.clear();
+ PendingTypeCheckedLoadConstVCalls.clear();
+ auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID);
+ FS->setModulePath(addThisModule()->first());
+ FS->setOriginalName(VIAndOriginalGUID.second);
+ TheIndex.addGlobalValueSummary(VIAndOriginalGUID.first, std::move(FS));
break;
}
// FS_ALIAS: [valueid, flags, valueid]
@@ -4937,15 +5147,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
// string table section in the per-module index, we create a single
// module path string table entry with an empty (0) ID to take
// ownership.
- AS->setModulePath(TheIndex.addModulePath(ModulePath, 0)->first());
+ AS->setModulePath(addThisModule()->first());
- GlobalValue::GUID AliaseeGUID = getGUIDFromValueId(AliaseeID).first;
- auto *AliaseeSummary = TheIndex.getGlobalValueSummary(AliaseeGUID);
- if (!AliaseeSummary)
+ GlobalValue::GUID AliaseeGUID =
+ getValueInfoFromValueId(AliaseeID).first.getGUID();
+ auto AliaseeInModule =
+ TheIndex.findSummaryInModule(AliaseeGUID, ModulePath);
+ if (!AliaseeInModule)
return error("Alias expects aliasee summary to be parsed");
- AS->setAliasee(AliaseeSummary);
+ AS->setAliasee(AliaseeInModule);
- auto GUID = getGUIDFromValueId(ValueID);
+ auto GUID = getValueInfoFromValueId(ValueID);
AS->setOriginalName(GUID.second);
TheIndex.addGlobalValueSummary(GUID.first, std::move(AS));
break;
@@ -4958,8 +5170,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
std::vector<ValueInfo> Refs =
makeRefList(ArrayRef<uint64_t>(Record).slice(2));
auto FS = llvm::make_unique<GlobalVarSummary>(Flags, std::move(Refs));
- FS->setModulePath(TheIndex.addModulePath(ModulePath, 0)->first());
- auto GUID = getGUIDFromValueId(ValueID);
+ FS->setModulePath(addThisModule()->first());
+ auto GUID = getValueInfoFromValueId(ValueID);
FS->setOriginalName(GUID.second);
TheIndex.addGlobalValueSummary(GUID.first, std::move(FS));
break;
@@ -4986,15 +5198,22 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
std::vector<FunctionSummary::EdgeTy> Edges = makeCallList(
ArrayRef<uint64_t>(Record).slice(CallGraphEdgeStartIndex),
IsOldProfileFormat, HasProfile);
- GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+ ValueInfo VI = getValueInfoFromValueId(ValueID).first;
auto FS = llvm::make_unique<FunctionSummary>(
Flags, InstCount, std::move(Refs), std::move(Edges),
- std::move(PendingTypeTests));
+ std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls),
+ std::move(PendingTypeCheckedLoadVCalls),
+ std::move(PendingTypeTestAssumeConstVCalls),
+ std::move(PendingTypeCheckedLoadConstVCalls));
PendingTypeTests.clear();
+ PendingTypeTestAssumeVCalls.clear();
+ PendingTypeCheckedLoadVCalls.clear();
+ PendingTypeTestAssumeConstVCalls.clear();
+ PendingTypeCheckedLoadConstVCalls.clear();
LastSeenSummary = FS.get();
+ LastSeenGUID = VI.getGUID();
FS->setModulePath(ModuleIdMap[ModuleId]);
- TheIndex.addGlobalValueSummary(GUID, std::move(FS));
- Combined = true;
+ TheIndex.addGlobalValueSummary(VI, std::move(FS));
break;
}
// FS_COMBINED_ALIAS: [valueid, modid, flags, valueid]
@@ -5010,16 +5229,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
LastSeenSummary = AS.get();
AS->setModulePath(ModuleIdMap[ModuleId]);
- auto AliaseeGUID = getGUIDFromValueId(AliaseeValueId).first;
+ auto AliaseeGUID =
+ getValueInfoFromValueId(AliaseeValueId).first.getGUID();
auto AliaseeInModule =
TheIndex.findSummaryInModule(AliaseeGUID, AS->modulePath());
if (!AliaseeInModule)
return error("Alias expects aliasee summary to be parsed");
AS->setAliasee(AliaseeInModule);
- GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
- TheIndex.addGlobalValueSummary(GUID, std::move(AS));
- Combined = true;
+ ValueInfo VI = getValueInfoFromValueId(ValueID).first;
+ LastSeenGUID = VI.getGUID();
+ TheIndex.addGlobalValueSummary(VI, std::move(AS));
break;
}
// FS_COMBINED_GLOBALVAR_INIT_REFS: [valueid, modid, flags, n x valueid]
@@ -5033,9 +5253,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
auto FS = llvm::make_unique<GlobalVarSummary>(Flags, std::move(Refs));
LastSeenSummary = FS.get();
FS->setModulePath(ModuleIdMap[ModuleId]);
- GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
- TheIndex.addGlobalValueSummary(GUID, std::move(FS));
- Combined = true;
+ ValueInfo VI = getValueInfoFromValueId(ValueID).first;
+ LastSeenGUID = VI.getGUID();
+ TheIndex.addGlobalValueSummary(VI, std::move(FS));
break;
}
// FS_COMBINED_ORIGINAL_NAME: [original_name]
@@ -5044,8 +5264,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
if (!LastSeenSummary)
return error("Name attachment that does not follow a combined record");
LastSeenSummary->setOriginalName(OriginalName);
+ TheIndex.addOriginalName(LastSeenGUID, OriginalName);
// Reset the LastSeenSummary
LastSeenSummary = nullptr;
+ LastSeenGUID = 0;
break;
}
case bitc::FS_TYPE_TESTS: {
@@ -5054,6 +5276,42 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
Record.end());
break;
}
+ case bitc::FS_TYPE_TEST_ASSUME_VCALLS: {
+ assert(PendingTypeTestAssumeVCalls.empty());
+ for (unsigned I = 0; I != Record.size(); I += 2)
+ PendingTypeTestAssumeVCalls.push_back({Record[I], Record[I+1]});
+ break;
+ }
+ case bitc::FS_TYPE_CHECKED_LOAD_VCALLS: {
+ assert(PendingTypeCheckedLoadVCalls.empty());
+ for (unsigned I = 0; I != Record.size(); I += 2)
+ PendingTypeCheckedLoadVCalls.push_back({Record[I], Record[I+1]});
+ break;
+ }
+ case bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL: {
+ PendingTypeTestAssumeConstVCalls.push_back(
+ {{Record[0], Record[1]}, {Record.begin() + 2, Record.end()}});
+ break;
+ }
+ case bitc::FS_TYPE_CHECKED_LOAD_CONST_VCALL: {
+ PendingTypeCheckedLoadConstVCalls.push_back(
+ {{Record[0], Record[1]}, {Record.begin() + 2, Record.end()}});
+ break;
+ }
+ case bitc::FS_CFI_FUNCTION_DEFS: {
+ std::set<std::string> &CfiFunctionDefs = TheIndex.cfiFunctionDefs();
+ for (unsigned I = 0; I != Record.size(); I += 2)
+ CfiFunctionDefs.insert(
+ {Strtab.data() + Record[I], static_cast<size_t>(Record[I + 1])});
+ break;
+ }
+ case bitc::FS_CFI_FUNCTION_DECLS: {
+ std::set<std::string> &CfiFunctionDecls = TheIndex.cfiFunctionDecls();
+ for (unsigned I = 0; I != Record.size(); I += 2)
+ CfiFunctionDecls.insert(
+ {Strtab.data() + Record[I], static_cast<size_t>(Record[I + 1])});
+ break;
+ }
}
}
llvm_unreachable("Exit infinite loop");
@@ -5068,7 +5326,7 @@ Error ModuleSummaryIndexBitcodeReader::parseModuleStringTable() {
SmallVector<uint64_t, 64> Record;
SmallString<128> ModulePath;
- ModulePathStringTableTy::iterator LastSeenModulePath;
+ ModuleSummaryIndex::ModuleInfo *LastSeenModule = nullptr;
while (true) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -5095,8 +5353,8 @@ Error ModuleSummaryIndexBitcodeReader::parseModuleStringTable() {
if (convertToString(Record, 1, ModulePath))
return error("Invalid record");
- LastSeenModulePath = TheIndex.addModulePath(ModulePath, ModuleId);
- ModuleIdMap[ModuleId] = LastSeenModulePath->first();
+ LastSeenModule = TheIndex.addModule(ModulePath, ModuleId);
+ ModuleIdMap[ModuleId] = LastSeenModule->first();
ModulePath.clear();
break;
@@ -5105,15 +5363,15 @@ Error ModuleSummaryIndexBitcodeReader::parseModuleStringTable() {
case bitc::MST_CODE_HASH: {
if (Record.size() != 5)
return error("Invalid hash length " + Twine(Record.size()).str());
- if (LastSeenModulePath == TheIndex.modulePaths().end())
+ if (!LastSeenModule)
return error("Invalid hash that does not follow a module path");
int Pos = 0;
for (auto &Val : Record) {
assert(!(Val >> 32) && "Unexpected high bits set");
- LastSeenModulePath->second.second[Pos++] = Val;
+ LastSeenModule->second.second[Pos++] = Val;
}
- // Reset LastSeenModulePath to avoid overriding the hash unexpectedly.
- LastSeenModulePath = TheIndex.modulePaths().end();
+ // Reset LastSeenModule to avoid overriding the hash unexpectedly.
+ LastSeenModule = nullptr;
break;
}
}
@@ -5148,18 +5406,56 @@ const std::error_category &llvm::BitcodeErrorCategory() {
return *ErrorCategory;
}
+static Expected<StringRef> readBlobInRecord(BitstreamCursor &Stream,
+ unsigned Block, unsigned RecordID) {
+ if (Stream.EnterSubBlock(Block))
+ return error("Invalid record");
+
+ StringRef Strtab;
+ while (true) {
+ BitstreamEntry Entry = Stream.advance();
+ switch (Entry.Kind) {
+ case BitstreamEntry::EndBlock:
+ return Strtab;
+
+ case BitstreamEntry::Error:
+ return error("Malformed block");
+
+ case BitstreamEntry::SubBlock:
+ if (Stream.SkipBlock())
+ return error("Malformed block");
+ break;
+
+ case BitstreamEntry::Record:
+ StringRef Blob;
+ SmallVector<uint64_t, 1> Record;
+ if (Stream.readRecord(Entry.ID, Record, &Blob) == RecordID)
+ Strtab = Blob;
+ break;
+ }
+ }
+}
+
//===----------------------------------------------------------------------===//
// External interface
//===----------------------------------------------------------------------===//
Expected<std::vector<BitcodeModule>>
llvm::getBitcodeModuleList(MemoryBufferRef Buffer) {
+ auto FOrErr = getBitcodeFileContents(Buffer);
+ if (!FOrErr)
+ return FOrErr.takeError();
+ return std::move(FOrErr->Mods);
+}
+
+Expected<BitcodeFileContents>
+llvm::getBitcodeFileContents(MemoryBufferRef Buffer) {
Expected<BitstreamCursor> StreamOrErr = initStream(Buffer);
if (!StreamOrErr)
return StreamOrErr.takeError();
BitstreamCursor &Stream = *StreamOrErr;
- std::vector<BitcodeModule> Modules;
+ BitcodeFileContents F;
while (true) {
uint64_t BCBegin = Stream.getCurrentByteNo();
@@ -5167,7 +5463,7 @@ llvm::getBitcodeModuleList(MemoryBufferRef Buffer) {
// of the bitcode stream (e.g. Apple's ar tool). If we are close enough to
// the end that there cannot possibly be another module, stop looking.
if (BCBegin + 8 >= Stream.getBitcodeBytes().size())
- return Modules;
+ return F;
BitstreamEntry Entry = Stream.advance();
switch (Entry.Kind) {
@@ -5193,10 +5489,49 @@ llvm::getBitcodeModuleList(MemoryBufferRef Buffer) {
if (Stream.SkipBlock())
return error("Malformed block");
- Modules.push_back({Stream.getBitcodeBytes().slice(
- BCBegin, Stream.getCurrentByteNo() - BCBegin),
- Buffer.getBufferIdentifier(), IdentificationBit,
- ModuleBit});
+ F.Mods.push_back({Stream.getBitcodeBytes().slice(
+ BCBegin, Stream.getCurrentByteNo() - BCBegin),
+ Buffer.getBufferIdentifier(), IdentificationBit,
+ ModuleBit});
+ continue;
+ }
+
+ if (Entry.ID == bitc::STRTAB_BLOCK_ID) {
+ Expected<StringRef> Strtab =
+ readBlobInRecord(Stream, bitc::STRTAB_BLOCK_ID, bitc::STRTAB_BLOB);
+ if (!Strtab)
+ return Strtab.takeError();
+ // This string table is used by every preceding bitcode module that does
+ // not have its own string table. A bitcode file may have multiple
+ // string tables if it was created by binary concatenation, for example
+ // with "llvm-cat -b".
+ for (auto I = F.Mods.rbegin(), E = F.Mods.rend(); I != E; ++I) {
+ if (!I->Strtab.empty())
+ break;
+ I->Strtab = *Strtab;
+ }
+ // Similarly, the string table is used by every preceding symbol table;
+ // normally there will be just one unless the bitcode file was created
+ // by binary concatenation.
+ if (!F.Symtab.empty() && F.StrtabForSymtab.empty())
+ F.StrtabForSymtab = *Strtab;
+ continue;
+ }
+
+ if (Entry.ID == bitc::SYMTAB_BLOCK_ID) {
+ Expected<StringRef> SymtabOrErr =
+ readBlobInRecord(Stream, bitc::SYMTAB_BLOCK_ID, bitc::SYMTAB_BLOB);
+ if (!SymtabOrErr)
+ return SymtabOrErr.takeError();
+
+ // We can expect the bitcode file to have multiple symbol tables if it
+ // was created by binary concatenation. In that case we silently
+ // ignore any subsequent symbol tables, which is fine because this is a
+ // low level function. The client is expected to notice that the number
+ // of modules in the symbol table does not match the number of modules
+ // in the input file and regenerate the symbol table.
+ if (F.Symtab.empty())
+ F.Symtab = *SymtabOrErr;
continue;
}
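
The string-table assignment a few lines up is the subtle part of this loop: a STRTAB block serves every preceding module that has not yet been given a table, which is what makes binary-concatenated files (e.g. from "llvm-cat -b") resolve correctly. A self-contained sketch of that backward walk (hypothetical ModuleEntry type):

#include <string>
#include <vector>

struct ModuleEntry {
  std::string Strtab; // empty until a later STRTAB block claims this module
};

void assignStrtab(std::vector<ModuleEntry> &Mods, const std::string &Strtab) {
  // Walk backwards; the first module that already owns a table marks the
  // boundary of an earlier concatenated file, so stop there.
  for (auto I = Mods.rbegin(), E = Mods.rend(); I != E; ++I) {
    if (!I->Strtab.empty())
      break;
    I->Strtab = Strtab;
  }
}
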
@@ -5236,8 +5571,8 @@ BitcodeModule::getModuleImpl(LLVMContext &Context, bool MaterializeAll,
}
Stream.JumpToBit(ModuleBit);
- auto *R =
- new BitcodeReader(std::move(Stream), ProducerIdentification, Context);
+ auto *R = new BitcodeReader(std::move(Stream), Strtab, ProducerIdentification,
+ Context);
std::unique_ptr<Module> M =
llvm::make_unique<Module>(ModuleIdentifier, Context);
@@ -5266,22 +5601,37 @@ BitcodeModule::getLazyModule(LLVMContext &Context, bool ShouldLazyLoadMetadata,
return getModuleImpl(Context, false, ShouldLazyLoadMetadata, IsImporting);
}
+// Parse the specified bitcode buffer and merge the index into CombinedIndex.
+// We don't use ModuleIdentifier here because the client may need to control the
+// module path used in the combined summary (e.g. when reading summaries for
+// regular LTO modules).
+Error BitcodeModule::readSummary(ModuleSummaryIndex &CombinedIndex,
+ StringRef ModulePath, uint64_t ModuleId) {
+ BitstreamCursor Stream(Buffer);
+ Stream.JumpToBit(ModuleBit);
+
+ ModuleSummaryIndexBitcodeReader R(std::move(Stream), Strtab, CombinedIndex,
+ ModulePath, ModuleId);
+ return R.parseModule();
+}
+
// Parse the specified bitcode buffer, returning the function info index.
Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() {
BitstreamCursor Stream(Buffer);
Stream.JumpToBit(ModuleBit);
auto Index = llvm::make_unique<ModuleSummaryIndex>();
- ModuleSummaryIndexBitcodeReader R(std::move(Stream), *Index);
+ ModuleSummaryIndexBitcodeReader R(std::move(Stream), Strtab, *Index,
+ ModuleIdentifier, 0);
- if (Error Err = R.parseModule(ModuleIdentifier))
+ if (Error Err = R.parseModule())
return std::move(Err);
return std::move(Index);
}
// Check if the given bitcode buffer contains a global value summary block.
-Expected<bool> BitcodeModule::hasSummary() {
+Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() {
BitstreamCursor Stream(Buffer);
Stream.JumpToBit(ModuleBit);
@@ -5295,11 +5645,14 @@ Expected<bool> BitcodeModule::hasSummary() {
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
- return false;
+ return BitcodeLTOInfo{/*IsThinLTO=*/false, /*HasSummary=*/false};
case BitstreamEntry::SubBlock:
if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID)
- return true;
+ return BitcodeLTOInfo{/*IsThinLTO=*/true, /*HasSummary=*/true};
+
+ if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID)
+ return BitcodeLTOInfo{/*IsThinLTO=*/false, /*HasSummary=*/true};
// Ignore other sub-blocks.
if (Stream.SkipBlock())
@@ -5384,6 +5737,16 @@ Expected<std::string> llvm::getBitcodeProducerString(MemoryBufferRef Buffer) {
return readIdentificationCode(*StreamOrErr);
}
+Error llvm::readModuleSummaryIndex(MemoryBufferRef Buffer,
+ ModuleSummaryIndex &CombinedIndex,
+ uint64_t ModuleId) {
+ Expected<BitcodeModule> BM = getSingleModule(Buffer);
+ if (!BM)
+ return BM.takeError();
+
+ return BM->readSummary(CombinedIndex, BM->getModuleIdentifier(), ModuleId);
+}
+
Expected<std::unique_ptr<ModuleSummaryIndex>>
llvm::getModuleSummaryIndex(MemoryBufferRef Buffer) {
Expected<BitcodeModule> BM = getSingleModule(Buffer);
@@ -5393,10 +5756,22 @@ llvm::getModuleSummaryIndex(MemoryBufferRef Buffer) {
return BM->getSummary();
}
-Expected<bool> llvm::hasGlobalValueSummary(MemoryBufferRef Buffer) {
+Expected<BitcodeLTOInfo> llvm::getBitcodeLTOInfo(MemoryBufferRef Buffer) {
Expected<BitcodeModule> BM = getSingleModule(Buffer);
if (!BM)
return BM.takeError();
- return BM->hasSummary();
+ return BM->getLTOInfo();
+}
+
+Expected<std::unique_ptr<ModuleSummaryIndex>>
+llvm::getModuleSummaryIndexForFile(StringRef Path,
+ bool IgnoreEmptyThinLTOIndexFile) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
+ MemoryBuffer::getFileOrSTDIN(Path);
+ if (!FileOrErr)
+ return errorCodeToError(FileOrErr.getError());
+ if (IgnoreEmptyThinLTOIndexFile && !(*FileOrErr)->getBufferSize())
+ return nullptr;
+ return getModuleSummaryIndex(**FileOrErr);
}
diff --git a/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index b89f5be..10fbcde 100644
--- a/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -53,6 +53,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
@@ -358,6 +359,9 @@ class PlaceholderQueue {
std::deque<DistinctMDOperandPlaceholder> PHs;
public:
+ ~PlaceholderQueue() {
+ assert(empty() && "PlaceholderQueue hasn't been flushed before being destroyed");
+ }
bool empty() { return PHs.empty(); }
DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID);
void flush(BitcodeReaderMetadataList &MetadataList);
@@ -403,6 +407,11 @@ void PlaceholderQueue::flush(BitcodeReaderMetadataList &MetadataList) {
} // anonymous namespace
+static Error error(const Twine &Message) {
+ return make_error<StringError>(
+ Message, make_error_code(BitcodeError::CorruptedBitcode));
+}
+
class MetadataLoader::MetadataLoaderImpl {
BitcodeReaderMetadataList MetadataList;
BitcodeReaderValueList &ValueList;
@@ -449,6 +458,7 @@ class MetadataLoader::MetadataLoaderImpl {
bool StripTBAA = false;
bool HasSeenOldLoopTags = false;
bool NeedUpgradeToDIGlobalVariableExpression = false;
+ bool NeedDeclareExpressionUpgrade = false;
/// True if metadata is being parsed for a module being ThinLTO imported.
bool IsImporting = false;
@@ -457,7 +467,7 @@ class MetadataLoader::MetadataLoaderImpl {
PlaceholderQueue &Placeholders, StringRef Blob,
unsigned &NextMetadataNo);
Error parseMetadataStrings(ArrayRef<uint64_t> Record, StringRef Blob,
- std::function<void(StringRef)> CallBack);
+ function_ref<void(StringRef)> CallBack);
Error parseGlobalObjectAttachment(GlobalObject &GO,
ArrayRef<uint64_t> Record);
Error parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record);
@@ -469,8 +479,8 @@ class MetadataLoader::MetadataLoaderImpl {
for (auto CU_SP : CUSubprograms)
if (auto *SPs = dyn_cast_or_null<MDTuple>(CU_SP.second))
for (auto &Op : SPs->operands())
- if (auto *SP = dyn_cast_or_null<MDNode>(Op))
- SP->replaceOperandWith(7, CU_SP.first);
+ if (auto *SP = dyn_cast_or_null<DISubprogram>(Op))
+ SP->replaceUnit(CU_SP.first);
CUSubprograms.clear();
}
@@ -495,7 +505,7 @@ class MetadataLoader::MetadataLoaderImpl {
// Upgrade variables attached to globals.
for (auto &GV : TheModule.globals()) {
- SmallVector<MDNode *, 1> MDs, NewMDs;
+ SmallVector<MDNode *, 1> MDs;
GV.getMetadata(LLVMContext::MD_dbg, MDs);
GV.eraseMetadata(LLVMContext::MD_dbg);
for (auto *MD : MDs)
@@ -508,6 +518,108 @@ class MetadataLoader::MetadataLoaderImpl {
}
}
+ /// Remove a leading DW_OP_deref from DIExpressions in a dbg.declare that
+ /// describes a function argument.
+ void upgradeDeclareExpressions(Function &F) {
+ if (!NeedDeclareExpressionUpgrade)
+ return;
+
+ for (auto &BB : F)
+ for (auto &I : BB)
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(&I))
+ if (auto *DIExpr = DDI->getExpression())
+ if (DIExpr->startsWithDeref() &&
+ dyn_cast_or_null<Argument>(DDI->getAddress())) {
+ SmallVector<uint64_t, 8> Ops;
+ Ops.append(std::next(DIExpr->elements_begin()),
+ DIExpr->elements_end());
+ auto *E = DIExpression::get(Context, Ops);
+ DDI->setOperand(2, MetadataAsValue::get(Context, E));
+ }
+ }
+
+ /// Upgrade the expression from previous versions.
+ Error upgradeDIExpression(uint64_t FromVersion,
+ MutableArrayRef<uint64_t> &Expr,
+ SmallVectorImpl<uint64_t> &Buffer) {
+ auto N = Expr.size();
+ switch (FromVersion) {
+ default:
+ return error("Invalid record");
+ case 0:
+ if (N >= 3 && Expr[N - 3] == dwarf::DW_OP_bit_piece)
+ Expr[N - 3] = dwarf::DW_OP_LLVM_fragment;
+ LLVM_FALLTHROUGH;
+ case 1:
+ // Move DW_OP_deref to the end.
+ if (N && Expr[0] == dwarf::DW_OP_deref) {
+ auto End = Expr.end();
+ if (Expr.size() >= 3 &&
+ *std::prev(End, 3) == dwarf::DW_OP_LLVM_fragment)
+ End = std::prev(End, 3);
+ std::move(std::next(Expr.begin()), End, Expr.begin());
+ *std::prev(End) = dwarf::DW_OP_deref;
+ }
+ NeedDeclareExpressionUpgrade = true;
+ LLVM_FALLTHROUGH;
+ case 2: {
+ // Change DW_OP_plus to DW_OP_plus_uconst.
+ // Change DW_OP_minus to DW_OP_constu, DW_OP_minus
+ auto SubExpr = ArrayRef<uint64_t>(Expr);
+ while (!SubExpr.empty()) {
+ // Skip past other operators with their operands
+ // for this version of the IR, obtained from
+ // the historic DIExpression::ExprOperand::getSize().
+ size_t HistoricSize;
+ switch (SubExpr.front()) {
+ default:
+ HistoricSize = 1;
+ break;
+ case dwarf::DW_OP_constu:
+ case dwarf::DW_OP_minus:
+ case dwarf::DW_OP_plus:
+ HistoricSize = 2;
+ break;
+ case dwarf::DW_OP_LLVM_fragment:
+ HistoricSize = 3;
+ break;
+ }
+
+ // If the expression is malformed, make sure we don't
+ // copy more elements than we should.
+ HistoricSize = std::min(SubExpr.size(), HistoricSize);
+ ArrayRef<uint64_t> Args = SubExpr.slice(1, HistoricSize-1);
+
+ switch (SubExpr.front()) {
+ case dwarf::DW_OP_plus:
+ Buffer.push_back(dwarf::DW_OP_plus_uconst);
+ Buffer.append(Args.begin(), Args.end());
+ break;
+ case dwarf::DW_OP_minus:
+ Buffer.push_back(dwarf::DW_OP_constu);
+ Buffer.append(Args.begin(), Args.end());
+ Buffer.push_back(dwarf::DW_OP_minus);
+ break;
+ default:
+ Buffer.push_back(*SubExpr.begin());
+ Buffer.append(Args.begin(), Args.end());
+ break;
+ }
+
+ // Continue with remaining elements.
+ SubExpr = SubExpr.slice(HistoricSize);
+ }
+ Expr = MutableArrayRef<uint64_t>(Buffer);
+ LLVM_FALLTHROUGH;
+ }
+ case 3:
+ // Up-to-date!
+ break;
+ }
+
+ return Error::success();
+ }
+
void upgradeDebugInfo() {
upgradeCUSubprograms();
upgradeCUVariables();
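
The case 2 step of upgradeDIExpression above rewrites the old inline-operand arithmetic: {DW_OP_plus, N} becomes {DW_OP_plus_uconst, N}, and {DW_OP_minus, N} becomes {DW_OP_constu, N, DW_OP_minus}. A minimal standalone rendering of just that rewrite (opcode values per the DWARF spec; this is an illustration, not the LLVM helper itself):

#include <cstdint>
#include <vector>

enum : uint64_t {
  DW_OP_constu = 0x10,
  DW_OP_minus = 0x1c,
  DW_OP_plus = 0x22,
  DW_OP_plus_uconst = 0x23,
};

// Upgrade one version-2 operator plus its literal operand to version-3 form.
std::vector<uint64_t> upgradeV2Arith(uint64_t Op, uint64_t Literal) {
  if (Op == DW_OP_plus)
    return {DW_OP_plus_uconst, Literal};         // add a constant
  if (Op == DW_OP_minus)
    return {DW_OP_constu, Literal, DW_OP_minus}; // push constant, subtract
  return {Op, Literal};                          // everything else unchanged
}
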
@@ -520,7 +632,7 @@ public:
bool IsImporting)
: MetadataList(TheModule.getContext()), ValueList(ValueList),
Stream(Stream), Context(TheModule.getContext()), TheModule(TheModule),
- getTypeByID(getTypeByID), IsImporting(IsImporting) {}
+ getTypeByID(std::move(getTypeByID)), IsImporting(IsImporting) {}
Error parseMetadata(bool ModuleLevel);
@@ -562,13 +674,9 @@ public:
unsigned size() const { return MetadataList.size(); }
void shrinkTo(unsigned N) { MetadataList.shrinkTo(N); }
+ void upgradeDebugIntrinsics(Function &F) { upgradeDeclareExpressions(F); }
};
-Error error(const Twine &Message) {
- return make_error<StringError>(
- Message, make_error_code(BitcodeError::CorruptedBitcode));
-}
-
Expected<bool>
MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
IndexCursor = Stream;
@@ -1107,9 +1215,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_DERIVED_TYPE: {
- if (Record.size() != 12)
+ if (Record.size() < 12 || Record.size() > 13)
return error("Invalid record");
+ // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
+ // that there is no DWARF address space associated with DIDerivedType.
+ Optional<unsigned> DWARFAddressSpace;
+ if (Record.size() > 12 && Record[12])
+ DWARFAddressSpace = Record[12] - 1;
+
IsDistinct = Record[0];
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
MetadataList.assignValue(
@@ -1118,7 +1232,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
getMDOrNull(Record[3]), Record[4],
getDITypeRefOrNull(Record[5]),
getDITypeRefOrNull(Record[6]), Record[7], Record[8],
- Record[9], Flags, getDITypeRefOrNull(Record[11]))),
+ Record[9], DWARFAddressSpace, Flags,
+ getDITypeRefOrNull(Record[11]))),
NextMetadataNo);
NextMetadataNo++;
break;
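
The +1 bias described in the comment above keeps 0 free to mean "no DWARF address space". A tiny sketch of the decode, with std::optional standing in for llvm::Optional:

#include <cstdint>
#include <optional>

std::optional<unsigned> decodeDWARFAddressSpace(uint64_t Field) {
  if (Field == 0)
    return std::nullopt;                    // absent: record stored 0
  return static_cast<unsigned>(Field - 1);  // present: record stored space + 1
}
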
@@ -1240,7 +1355,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_COMPILE_UNIT: {
- if (Record.size() < 14 || Record.size() > 17)
+ if (Record.size() < 14 || Record.size() > 18)
return error("Invalid record");
// Ignore Record[0], which indicates whether this compile unit is
@@ -1253,7 +1368,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
getMDOrNull(Record[12]), getMDOrNull(Record[13]),
Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
Record.size() <= 14 ? 0 : Record[14],
- Record.size() <= 16 ? true : Record[16]);
+ Record.size() <= 16 ? true : Record[16],
+ Record.size() <= 17 ? false : Record[17]);
MetadataList.assignValue(CU, NextMetadataNo);
NextMetadataNo++;
@@ -1264,7 +1380,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_SUBPROGRAM: {
- if (Record.size() < 18 || Record.size() > 20)
+ if (Record.size() < 18 || Record.size() > 21)
return error("Invalid record");
IsDistinct =
@@ -1280,29 +1396,31 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
unsigned Offset = Record.size() >= 19 ? 1 : 0;
bool HasFn = Offset && !HasUnit;
bool HasThisAdj = Record.size() >= 20;
+ bool HasThrownTypes = Record.size() >= 21;
DISubprogram *SP = GET_OR_DISTINCT(
- DISubprogram, (Context,
- getDITypeRefOrNull(Record[1]), // scope
- getMDString(Record[2]), // name
- getMDString(Record[3]), // linkageName
- getMDOrNull(Record[4]), // file
- Record[5], // line
- getMDOrNull(Record[6]), // type
- Record[7], // isLocal
- Record[8], // isDefinition
- Record[9], // scopeLine
- getDITypeRefOrNull(Record[10]), // containingType
- Record[11], // virtuality
- Record[12], // virtualIndex
- HasThisAdj ? Record[19] : 0, // thisAdjustment
- static_cast<DINode::DIFlags>(Record[13] // flags
- ),
- Record[14], // isOptimized
- HasUnit ? CUorFn : nullptr, // unit
- getMDOrNull(Record[15 + Offset]), // templateParams
- getMDOrNull(Record[16 + Offset]), // declaration
- getMDOrNull(Record[17 + Offset]) // variables
- ));
+ DISubprogram,
+ (Context,
+ getDITypeRefOrNull(Record[1]), // scope
+ getMDString(Record[2]), // name
+ getMDString(Record[3]), // linkageName
+ getMDOrNull(Record[4]), // file
+ Record[5], // line
+ getMDOrNull(Record[6]), // type
+ Record[7], // isLocal
+ Record[8], // isDefinition
+ Record[9], // scopeLine
+ getDITypeRefOrNull(Record[10]), // containingType
+ Record[11], // virtuality
+ Record[12], // virtualIndex
+ HasThisAdj ? Record[19] : 0, // thisAdjustment
+ static_cast<DINode::DIFlags>(Record[13]), // flags
+ Record[14], // isOptimized
+ HasUnit ? CUorFn : nullptr, // unit
+ getMDOrNull(Record[15 + Offset]), // templateParams
+ getMDOrNull(Record[16 + Offset]), // declaration
+ getMDOrNull(Record[17 + Offset]), // variables
+ HasThrownTypes ? getMDOrNull(Record[20]) : nullptr // thrownTypes
+ ));
MetadataList.assignValue(SP, NextMetadataNo);
NextMetadataNo++;
@@ -1347,16 +1465,20 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_NAMESPACE: {
- if (Record.size() != 5)
+ // Newer versions of DINamespace dropped file and line.
+ MDString *Name;
+ if (Record.size() == 3)
+ Name = getMDString(Record[2]);
+ else if (Record.size() == 5)
+ Name = getMDString(Record[3]);
+ else
return error("Invalid record");
IsDistinct = Record[0] & 1;
bool ExportSymbols = Record[0] & 2;
MetadataList.assignValue(
GET_OR_DISTINCT(DINamespace,
- (Context, getMDOrNull(Record[1]),
- getMDOrNull(Record[2]), getMDString(Record[3]),
- Record[4], ExportSymbols)),
+ (Context, getMDOrNull(Record[1]), Name, ExportSymbols)),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1433,6 +1555,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
} else if (Version == 0) {
// Upgrade old metadata, which stored a global variable reference or a
// ConstantInt here.
+ NeedUpgradeToDIGlobalVariableExpression = true;
Metadata *Expr = getMDOrNull(Record[9]);
uint32_t AlignInBits = 0;
if (Record.size() > 11) {
@@ -1463,8 +1586,6 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
DIGlobalVariableExpression *DGVE = nullptr;
if (Attach || Expr)
DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
- else
- NeedUpgradeToDIGlobalVariableExpression = true;
if (Attach)
Attach->addDebugInfo(DGVE);
@@ -1485,7 +1606,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
bool HasAlignment = Record[0] & 2;
// 2nd field used to be an artificial tag, either DW_TAG_auto_variable or
// DW_TAG_arg_variable, if we have alignment flag encoded it means that
- // this is newer version of record which doesn't have artifical tag.
+ // this is newer version of record which doesn't have artificial tag.
bool HasTag = !HasAlignment && Record.size() > 8;
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[7 + HasTag]);
uint32_t AlignInBits = 0;
@@ -1510,16 +1631,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
return error("Invalid record");
IsDistinct = Record[0] & 1;
- bool HasOpFragment = Record[0] & 2;
+ uint64_t Version = Record[0] >> 1;
auto Elts = MutableArrayRef<uint64_t>(Record).slice(1);
- if (!HasOpFragment)
- if (unsigned N = Elts.size())
- if (N >= 3 && Elts[N - 3] == dwarf::DW_OP_bit_piece)
- Elts[N - 3] = dwarf::DW_OP_LLVM_fragment;
+
+ SmallVector<uint64_t, 6> Buffer;
+ if (Error Err = upgradeDIExpression(Version, Elts, Buffer))
+ return Err;
MetadataList.assignValue(
- GET_OR_DISTINCT(DIExpression, (Context, makeArrayRef(Record).slice(1))),
- NextMetadataNo);
+ GET_OR_DISTINCT(DIExpression, (Context, Elts)), NextMetadataNo);
NextMetadataNo++;
break;
}
@@ -1551,15 +1671,17 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_IMPORTED_ENTITY: {
- if (Record.size() != 6)
+ if (Record.size() != 6 && Record.size() != 7)
return error("Invalid record");
IsDistinct = Record[0];
+ bool HasFile = (Record.size() == 7);
MetadataList.assignValue(
GET_OR_DISTINCT(DIImportedEntity,
(Context, Record[1], getMDOrNull(Record[2]),
- getDITypeRefOrNull(Record[3]), Record[4],
- getMDString(Record[5]))),
+ getDITypeRefOrNull(Record[3]),
+ HasFile ? getMDOrNull(Record[6]) : nullptr,
+ HasFile ? Record[4] : 0, getMDString(Record[5]))),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1611,7 +1733,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings(
ArrayRef<uint64_t> Record, StringRef Blob,
- std::function<void(StringRef)> CallBack) {
+ function_ref<void(StringRef)> CallBack) {
// All the MDStrings in the block are emitted together in a single
// record. The strings are concatenated and stored in a blob along with
// their sizes.
@@ -1808,8 +1930,8 @@ MetadataLoader::MetadataLoader(BitstreamCursor &Stream, Module &TheModule,
BitcodeReaderValueList &ValueList,
bool IsImporting,
std::function<Type *(unsigned)> getTypeByID)
- : Pimpl(llvm::make_unique<MetadataLoaderImpl>(Stream, TheModule, ValueList,
- getTypeByID, IsImporting)) {}
+ : Pimpl(llvm::make_unique<MetadataLoaderImpl>(
+ Stream, TheModule, ValueList, std::move(getTypeByID), IsImporting)) {}
Error MetadataLoader::parseMetadata(bool ModuleLevel) {
return Pimpl->parseMetadata(ModuleLevel);
@@ -1848,3 +1970,7 @@ bool MetadataLoader::isStrippingTBAA() { return Pimpl->isStrippingTBAA(); }
unsigned MetadataLoader::size() const { return Pimpl->size(); }
void MetadataLoader::shrinkTo(unsigned N) { return Pimpl->shrinkTo(N); }
+
+void MetadataLoader::upgradeDebugIntrinsics(Function &F) {
+ return Pimpl->upgradeDebugIntrinsics(F);
+}
diff --git a/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.h b/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.h
index 442dfc9..f23dcc0 100644
--- a/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.h
+++ b/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.h
@@ -79,6 +79,9 @@ public:
unsigned size() const;
void shrinkTo(unsigned N);
+
+ /// Perform bitcode upgrades on llvm.dbg.* calls.
+ void upgradeDebugIntrinsics(Function &F);
};
}
diff --git a/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp b/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp
index 7152a51..f2a3439 100644
--- a/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp
+++ b/contrib/llvm/lib/Bitcode/Reader/ValueList.cpp
@@ -58,7 +58,7 @@ void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) {
if (Idx >= size())
resize(Idx + 1);
- WeakVH &OldV = ValuePtrs[Idx];
+ WeakTrackingVH &OldV = ValuePtrs[Idx];
if (!OldV) {
OldV = V;
return;
@@ -73,7 +73,7 @@ void BitcodeReaderValueList::assignValue(Value *V, unsigned Idx) {
// If there was a forward reference to this value, replace it.
Value *PrevVal = OldV;
OldV->replaceAllUsesWith(V);
- delete PrevVal;
+ PrevVal->deleteValue();
}
}
@@ -194,6 +194,6 @@ void BitcodeReaderValueList::resolveConstantForwardRefs() {
// Update all ValueHandles, they should be the only users at this point.
Placeholder->replaceAllUsesWith(RealVal);
- delete Placeholder;
+ Placeholder->deleteValue();
}
}
diff --git a/contrib/llvm/lib/Bitcode/Reader/ValueList.h b/contrib/llvm/lib/Bitcode/Reader/ValueList.h
index 3119d77..72775a3 100644
--- a/contrib/llvm/lib/Bitcode/Reader/ValueList.h
+++ b/contrib/llvm/lib/Bitcode/Reader/ValueList.h
@@ -20,7 +20,7 @@ namespace llvm {
class Constant;
class BitcodeReaderValueList {
- std::vector<WeakVH> ValuePtrs;
+ std::vector<WeakTrackingVH> ValuePtrs;
/// As we resolve forward-referenced constants, we add information about them
/// to this vector. This allows us to resolve them in bulk instead of
diff --git a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index ebb2022..dcffde1 100644
--- a/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/contrib/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -28,10 +28,13 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/UseListOrder.h"
#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Object/IRSymtab.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/SHA1.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
#include <map>
@@ -76,21 +79,24 @@ protected:
/// The stream created and owned by the client.
BitstreamWriter &Stream;
- /// Saves the offset of the VSTOffset record that must eventually be
- /// backpatched with the offset of the actual VST.
- uint64_t VSTOffsetPlaceholder = 0;
+ StringTableBuilder &StrtabBuilder;
public:
/// Constructs a BitcodeWriterBase object that writes to the provided
/// \p Stream.
- BitcodeWriterBase(BitstreamWriter &Stream) : Stream(Stream) {}
+ BitcodeWriterBase(BitstreamWriter &Stream, StringTableBuilder &StrtabBuilder)
+ : Stream(Stream), StrtabBuilder(StrtabBuilder) {}
protected:
- bool hasVSTOffsetPlaceholder() { return VSTOffsetPlaceholder != 0; }
- void writeValueSymbolTableForwardDecl();
void writeBitcodeHeader();
+ void writeModuleVersion();
};
+void BitcodeWriterBase::writeModuleVersion() {
+ // VERSION: [version#]
+ Stream.EmitRecord(bitc::MODULE_CODE_VERSION, ArrayRef<uint64_t>{2});
+}
+
/// Class to manage the bitcode writing for a module.
class ModuleBitcodeWriter : public BitcodeWriterBase {
/// Pointer to the buffer allocated by caller for bitcode writing.
@@ -108,6 +114,16 @@ class ModuleBitcodeWriter : public BitcodeWriterBase {
/// True if a module hash record should be written.
bool GenerateHash;
+ SHA1 Hasher;
+
+ /// If non-null, when GenerateHash is true, the resulting hash is written
+ /// into ModHash. When GenerateHash is false, the specified value is used
+ /// as the hash instead of computing one from the generated bitcode.
+ /// This can be used to produce the same module hash for a minimized
+ /// bitcode used just for the thin link as for the regular full bitcode
+ /// that will be used in the backend.
+ ModuleHash *ModHash;
+
/// The start bit of the identification block.
uint64_t BitcodeStartBit;
@@ -119,15 +135,22 @@ class ModuleBitcodeWriter : public BitcodeWriterBase {
/// Tracks the last value id recorded in the GUIDToValueMap.
unsigned GlobalValueId;
+ /// Saves the offset of the VSTOffset record that must eventually be
+ /// backpatched with the offset of the actual VST.
+ uint64_t VSTOffsetPlaceholder = 0;
+
public:
/// Constructs a ModuleBitcodeWriter object for the given Module,
/// writing to the provided \p Buffer.
ModuleBitcodeWriter(const Module *M, SmallVectorImpl<char> &Buffer,
+ StringTableBuilder &StrtabBuilder,
BitstreamWriter &Stream, bool ShouldPreserveUseListOrder,
- const ModuleSummaryIndex *Index, bool GenerateHash)
- : BitcodeWriterBase(Stream), Buffer(Buffer), M(*M),
+ const ModuleSummaryIndex *Index, bool GenerateHash,
+ ModuleHash *ModHash = nullptr)
+ : BitcodeWriterBase(Stream, StrtabBuilder), Buffer(Buffer), M(*M),
VE(*M, ShouldPreserveUseListOrder), Index(Index),
- GenerateHash(GenerateHash), BitcodeStartBit(Stream.GetCurrentBitNo()) {
+ GenerateHash(GenerateHash), ModHash(ModHash),
+ BitcodeStartBit(Stream.GetCurrentBitNo()) {
// Assign ValueIds to any callee values in the index that came from
// indirect call profiles and were recorded as a GUID not a Value*
// (which would have been assigned an ID by the ValueEnumerator).
@@ -138,14 +161,14 @@ public:
return;
for (const auto &GUIDSummaryLists : *Index)
// Examine all summaries for this GUID.
- for (auto &Summary : GUIDSummaryLists.second)
+ for (auto &Summary : GUIDSummaryLists.second.SummaryList)
if (auto FS = dyn_cast<FunctionSummary>(Summary.get()))
// For each call in the function summary, see if the call
// is to a GUID (which means it is for an indirect call,
// otherwise we would have a Value for it). If so, synthesize
// a value id.
for (auto &CallEdge : FS->calls())
- if (CallEdge.first.isGUID())
+ if (!CallEdge.first.getValue())
assignValueId(CallEdge.first.getGUID());
}
@@ -155,10 +178,13 @@ public:
private:
uint64_t bitcodeStartBit() { return BitcodeStartBit; }
+ size_t addToStrtab(StringRef Str);
+
void writeAttributeGroupTable();
void writeAttributeTable();
void writeTypeTable();
void writeComdats();
+ void writeValueSymbolTableForwardDecl();
void writeModuleInfo();
void writeValueAsMetadata(const ValueAsMetadata *MD,
SmallVectorImpl<uint64_t> &Record);
@@ -240,6 +266,7 @@ private:
const GlobalObject &GO);
void writeModuleMetadataKinds();
void writeOperandBundleTags();
+ void writeSyncScopeNames();
void writeConstants(unsigned FirstVal, unsigned LastVal, bool isGlobal);
void writeModuleConstants();
bool pushValueAndType(const Value *V, unsigned InstID,
@@ -251,9 +278,9 @@ private:
SmallVectorImpl<uint64_t> &Vals);
void writeInstruction(const Instruction &I, unsigned InstID,
SmallVectorImpl<unsigned> &Vals);
- void writeValueSymbolTable(
- const ValueSymbolTable &VST, bool IsModuleLevel = false,
- DenseMap<const Function *, uint64_t> *FunctionToBitcodeIndex = nullptr);
+ void writeFunctionLevelValueSymbolTable(const ValueSymbolTable &VST);
+ void writeGlobalValueSymbolTable(
+ DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex);
void writeUseList(UseListOrder &&Order);
void writeUseListBlock(const Function *F);
void
@@ -285,11 +312,15 @@ private:
}
// Helper to get the valueId for the type of value recorded in VI.
unsigned getValueId(ValueInfo VI) {
- if (VI.isGUID())
+ if (!VI.getValue())
return getValueId(VI.getGUID());
return VE.getValueID(VI.getValue());
}
std::map<GlobalValue::GUID, unsigned> &valueIds() { return GUIDToValueIdMap; }
+
+ unsigned getEncodedSyncScopeID(SyncScope::ID SSID) {
+ return unsigned(SSID);
+ }
};
/// Class to manage the bitcode writing for a combined index.
@@ -312,186 +343,75 @@ public:
/// Constructs an IndexBitcodeWriter object for the given combined index,
/// writing to the provided \p Buffer. When writing a subset of the index
/// for a distributed backend, provide a \p ModuleToSummariesForIndex map.
- IndexBitcodeWriter(BitstreamWriter &Stream, const ModuleSummaryIndex &Index,
+ IndexBitcodeWriter(BitstreamWriter &Stream, StringTableBuilder &StrtabBuilder,
+ const ModuleSummaryIndex &Index,
const std::map<std::string, GVSummaryMapTy>
*ModuleToSummariesForIndex = nullptr)
- : BitcodeWriterBase(Stream), Index(Index),
+ : BitcodeWriterBase(Stream, StrtabBuilder), Index(Index),
ModuleToSummariesForIndex(ModuleToSummariesForIndex) {
// Assign unique value ids to all summaries to be written, for use
// in writing out the call graph edges. Save the mapping from GUID
// to the new global value id to use when writing those edges, which
// are currently saved in the index in terms of GUID.
- for (const auto &I : *this)
+ forEachSummary([&](GVInfo I) {
GUIDToValueIdMap[I.first] = ++GlobalValueId;
+ });
}
/// Pair of value GUID and associated summary, as passed to the
/// forEachSummary callback below.
typedef std::pair<GlobalValue::GUID, GlobalValueSummary *> GVInfo;
- /// Iterator over the value GUID and summaries to be written to bitcode,
- /// hides the details of whether they are being pulled from the entire
- /// index or just those in a provided ModuleToSummariesForIndex map.
- class iterator
- : public llvm::iterator_facade_base<iterator, std::forward_iterator_tag,
- GVInfo> {
- /// Enables access to parent class.
- const IndexBitcodeWriter &Writer;
-
- // Iterators used when writing only those summaries in a provided
- // ModuleToSummariesForIndex map:
-
- /// Points to the last element in outer ModuleToSummariesForIndex map.
- std::map<std::string, GVSummaryMapTy>::const_iterator ModuleSummariesBack;
- /// Iterator on outer ModuleToSummariesForIndex map.
- std::map<std::string, GVSummaryMapTy>::const_iterator ModuleSummariesIter;
- /// Iterator on an inner global variable summary map.
- GVSummaryMapTy::const_iterator ModuleGVSummariesIter;
-
- // Iterators used when writing all summaries in the index:
-
- /// Points to the last element in the Index outer GlobalValueMap.
- const_gvsummary_iterator IndexSummariesBack;
- /// Iterator on outer GlobalValueMap.
- const_gvsummary_iterator IndexSummariesIter;
- /// Iterator on an inner GlobalValueSummaryList.
- GlobalValueSummaryList::const_iterator IndexGVSummariesIter;
-
- public:
- /// Construct iterator from parent \p Writer and indicate if we are
- /// constructing the end iterator.
- iterator(const IndexBitcodeWriter &Writer, bool IsAtEnd) : Writer(Writer) {
- // Set up the appropriate set of iterators given whether we are writing
- // the full index or just a subset.
- // Can't setup the Back or inner iterators if the corresponding map
- // is empty. This will be handled specially in operator== as well.
- if (Writer.ModuleToSummariesForIndex &&
- !Writer.ModuleToSummariesForIndex->empty()) {
- for (ModuleSummariesBack = Writer.ModuleToSummariesForIndex->begin();
- std::next(ModuleSummariesBack) !=
- Writer.ModuleToSummariesForIndex->end();
- ModuleSummariesBack++)
- ;
- ModuleSummariesIter = !IsAtEnd
- ? Writer.ModuleToSummariesForIndex->begin()
- : ModuleSummariesBack;
- ModuleGVSummariesIter = !IsAtEnd ? ModuleSummariesIter->second.begin()
- : ModuleSummariesBack->second.end();
- } else if (!Writer.ModuleToSummariesForIndex &&
- Writer.Index.begin() != Writer.Index.end()) {
- for (IndexSummariesBack = Writer.Index.begin();
- std::next(IndexSummariesBack) != Writer.Index.end();
- IndexSummariesBack++)
- ;
- IndexSummariesIter =
- !IsAtEnd ? Writer.Index.begin() : IndexSummariesBack;
- IndexGVSummariesIter = !IsAtEnd ? IndexSummariesIter->second.begin()
- : IndexSummariesBack->second.end();
- }
+ /// Calls the callback for each value GUID and summary to be written to
+ /// bitcode. This hides the details of whether they are being pulled from the
+ /// entire index or just those in a provided ModuleToSummariesForIndex map.
+ template<typename Functor>
+ void forEachSummary(Functor Callback) {
+ if (ModuleToSummariesForIndex) {
+ for (auto &M : *ModuleToSummariesForIndex)
+ for (auto &Summary : M.second)
+ Callback(Summary);
+ } else {
+ for (auto &Summaries : Index)
+ for (auto &Summary : Summaries.second.SummaryList)
+ Callback({Summaries.first, Summary.get()});
}
+ }
- /// Increment the appropriate set of iterators.
- iterator &operator++() {
- // First the inner iterator is incremented, then if it is at the end
- // and there are more outer iterations to go, the inner is reset to
- // the start of the next inner list.
- if (Writer.ModuleToSummariesForIndex) {
- ++ModuleGVSummariesIter;
- if (ModuleGVSummariesIter == ModuleSummariesIter->second.end() &&
- ModuleSummariesIter != ModuleSummariesBack) {
- ++ModuleSummariesIter;
- ModuleGVSummariesIter = ModuleSummariesIter->second.begin();
- }
- } else {
- ++IndexGVSummariesIter;
- if (IndexGVSummariesIter == IndexSummariesIter->second.end() &&
- IndexSummariesIter != IndexSummariesBack) {
- ++IndexSummariesIter;
- IndexGVSummariesIter = IndexSummariesIter->second.begin();
+ /// Calls the callback for each entry in the modulePaths StringMap that
+ /// should be written to the module path string table. This hides the details
+ /// of whether they are being pulled from the entire index or just those in a
+ /// provided ModuleToSummariesForIndex map.
+ template <typename Functor> void forEachModule(Functor Callback) {
+ if (ModuleToSummariesForIndex) {
+ for (const auto &M : *ModuleToSummariesForIndex) {
+ const auto &MPI = Index.modulePaths().find(M.first);
+ if (MPI == Index.modulePaths().end()) {
+ // This should only happen if the bitcode file was empty, in which
+ // case we shouldn't be importing (the ModuleToSummariesForIndex
+ // would only include the module we are writing the index for).
+ assert(ModuleToSummariesForIndex->size() == 1);
+ continue;
}
+ Callback(*MPI);
}
- return *this;
- }
-
- /// Access the <GUID,GlobalValueSummary*> pair corresponding to the current
- /// outer and inner iterator positions.
- GVInfo operator*() {
- if (Writer.ModuleToSummariesForIndex)
- return std::make_pair(ModuleGVSummariesIter->first,
- ModuleGVSummariesIter->second);
- return std::make_pair(IndexSummariesIter->first,
- IndexGVSummariesIter->get());
- }
-
- /// Checks if the iterators are equal, with special handling for empty
- /// indexes.
- bool operator==(const iterator &RHS) const {
- if (Writer.ModuleToSummariesForIndex) {
- // First ensure that both are writing the same subset.
- if (Writer.ModuleToSummariesForIndex !=
- RHS.Writer.ModuleToSummariesForIndex)
- return false;
- // Already determined above that maps are the same, so if one is
- // empty, they both are.
- if (Writer.ModuleToSummariesForIndex->empty())
- return true;
- // Ensure the ModuleGVSummariesIter are iterating over the same
- // container before checking them below.
- if (ModuleSummariesIter != RHS.ModuleSummariesIter)
- return false;
- return ModuleGVSummariesIter == RHS.ModuleGVSummariesIter;
- }
- // First ensure RHS also writing the full index, and that both are
- // writing the same full index.
- if (RHS.Writer.ModuleToSummariesForIndex ||
- &Writer.Index != &RHS.Writer.Index)
- return false;
- // Already determined above that maps are the same, so if one is
- // empty, they both are.
- if (Writer.Index.begin() == Writer.Index.end())
- return true;
- // Ensure the IndexGVSummariesIter are iterating over the same
- // container before checking them below.
- if (IndexSummariesIter != RHS.IndexSummariesIter)
- return false;
- return IndexGVSummariesIter == RHS.IndexGVSummariesIter;
+ } else {
+ for (const auto &MPSE : Index.modulePaths())
+ Callback(MPSE);
}
- };
-
- /// Obtain the start iterator over the summaries to be written.
- iterator begin() { return iterator(*this, /*IsAtEnd=*/false); }
- /// Obtain the end iterator over the summaries to be written.
- iterator end() { return iterator(*this, /*IsAtEnd=*/true); }
+ }
/// Main entry point for writing a combined index to bitcode.
void write();
private:
- void writeIndex();
void writeModStrings();
- void writeCombinedValueSymbolTable();
void writeCombinedGlobalValueSummary();
- /// Indicates whether the provided \p ModulePath should be written into
- /// the module string table, e.g. if full index written or if it is in
- /// the provided subset.
- bool doIncludeModule(StringRef ModulePath) {
- return !ModuleToSummariesForIndex ||
- ModuleToSummariesForIndex->count(ModulePath);
- }
-
- bool hasValueId(GlobalValue::GUID ValGUID) {
- const auto &VMI = GUIDToValueIdMap.find(ValGUID);
- return VMI != GUIDToValueIdMap.end();
- }
- unsigned getValueId(GlobalValue::GUID ValGUID) {
- const auto &VMI = GUIDToValueIdMap.find(ValGUID);
- // If this GUID doesn't have an entry, assign one.
- if (VMI == GUIDToValueIdMap.end()) {
- GUIDToValueIdMap[ValGUID] = ++GlobalValueId;
- return GlobalValueId;
- } else {
- return VMI->second;
- }
+ Optional<unsigned> getValueId(GlobalValue::GUID ValGUID) {
+ auto VMI = GUIDToValueIdMap.find(ValGUID);
+ if (VMI == GUIDToValueIdMap.end())
+ return None;
+ return VMI->second;
}
std::map<GlobalValue::GUID, unsigned> &valueIds() { return GUIDToValueIdMap; }
};
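The callback-based traversal above replaces the bespoke forward iterator it deletes: callers pass a lambda and no longer reason about which of the two underlying containers (the full index or the ModuleToSummariesForIndex subset) is in play. A hypothetical use, mirroring what the constructor does:

// Sketch: counting the summaries that would be written, using the same
// traversal the writer itself uses (Writer is an IndexBitcodeWriter).
unsigned NumSummaries = 0;
Writer.forEachSummary([&](IndexBitcodeWriter::GVInfo I) {
  (void)I.first;  // GlobalValue::GUID
  (void)I.second; // GlobalValueSummary *
  ++NumSummaries;
});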
@@ -570,14 +490,6 @@ static unsigned getEncodedOrdering(AtomicOrdering Ordering) {
llvm_unreachable("Invalid ordering");
}
-static unsigned getEncodedSynchScope(SynchronizationScope SynchScope) {
- switch (SynchScope) {
- case SingleThread: return bitc::SYNCHSCOPE_SINGLETHREAD;
- case CrossThread: return bitc::SYNCHSCOPE_CROSSTHREAD;
- }
- llvm_unreachable("Invalid synch scope");
-}
-
static void writeStringRecord(BitstreamWriter &Stream, unsigned Code,
StringRef Str, unsigned AbbrevToUse) {
SmallVector<unsigned, 64> Vals;
@@ -671,6 +583,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_RETURNS_TWICE;
case Attribute::SExt:
return bitc::ATTR_KIND_S_EXT;
+ case Attribute::Speculatable:
+ return bitc::ATTR_KIND_SPECULATABLE;
case Attribute::StackAlignment:
return bitc::ATTR_KIND_STACK_ALIGNMENT;
case Attribute::StackProtect:
@@ -709,63 +623,62 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
}
void ModuleBitcodeWriter::writeAttributeGroupTable() {
- const std::vector<AttributeSet> &AttrGrps = VE.getAttributeGroups();
+ const std::vector<ValueEnumerator::IndexAndAttrSet> &AttrGrps =
+ VE.getAttributeGroups();
if (AttrGrps.empty()) return;
Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3);
SmallVector<uint64_t, 64> Record;
- for (unsigned i = 0, e = AttrGrps.size(); i != e; ++i) {
- AttributeSet AS = AttrGrps[i];
- for (unsigned i = 0, e = AS.getNumSlots(); i != e; ++i) {
- AttributeSet A = AS.getSlotAttributes(i);
-
- Record.push_back(VE.getAttributeGroupID(A));
- Record.push_back(AS.getSlotIndex(i));
-
- for (AttributeSet::iterator I = AS.begin(0), E = AS.end(0);
- I != E; ++I) {
- Attribute Attr = *I;
- if (Attr.isEnumAttribute()) {
- Record.push_back(0);
- Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
- } else if (Attr.isIntAttribute()) {
- Record.push_back(1);
- Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
- Record.push_back(Attr.getValueAsInt());
- } else {
- StringRef Kind = Attr.getKindAsString();
- StringRef Val = Attr.getValueAsString();
-
- Record.push_back(Val.empty() ? 3 : 4);
- Record.append(Kind.begin(), Kind.end());
+ for (ValueEnumerator::IndexAndAttrSet Pair : AttrGrps) {
+ unsigned AttrListIndex = Pair.first;
+ AttributeSet AS = Pair.second;
+ Record.push_back(VE.getAttributeGroupID(Pair));
+ Record.push_back(AttrListIndex);
+
+ for (Attribute Attr : AS) {
+ if (Attr.isEnumAttribute()) {
+ Record.push_back(0);
+ Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
+ } else if (Attr.isIntAttribute()) {
+ Record.push_back(1);
+ Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
+ Record.push_back(Attr.getValueAsInt());
+ } else {
+ StringRef Kind = Attr.getKindAsString();
+ StringRef Val = Attr.getValueAsString();
+
+ Record.push_back(Val.empty() ? 3 : 4);
+ Record.append(Kind.begin(), Kind.end());
+ Record.push_back(0);
+ if (!Val.empty()) {
+ Record.append(Val.begin(), Val.end());
Record.push_back(0);
- if (!Val.empty()) {
- Record.append(Val.begin(), Val.end());
- Record.push_back(0);
- }
}
}
-
- Stream.EmitRecord(bitc::PARAMATTR_GRP_CODE_ENTRY, Record);
- Record.clear();
}
+
+ Stream.EmitRecord(bitc::PARAMATTR_GRP_CODE_ENTRY, Record);
+ Record.clear();
}
Stream.ExitBlock();
}
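For reference, each PARAMATTR_GRP_CODE_ENTRY record now begins with the group ID and the attribute index, followed by one variable-length entry per attribute whose leading tag distinguishes the attribute form. A sketch of the layout implied by the code above (not an authoritative grammar):

// PARAMATTR_GRP_CODE_ENTRY: [grpid, idx, attr...]
//   enum attribute:             [0, kind]
//   integer attribute:          [1, kind, value]
//   string attribute:           [3, kind chars..., 0]
//   string attribute with value:[4, kind chars..., 0, value chars..., 0]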
void ModuleBitcodeWriter::writeAttributeTable() {
- const std::vector<AttributeSet> &Attrs = VE.getAttributes();
+ const std::vector<AttributeList> &Attrs = VE.getAttributeLists();
if (Attrs.empty()) return;
Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
SmallVector<uint64_t, 64> Record;
for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
- const AttributeSet &A = Attrs[i];
- for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i)
- Record.push_back(VE.getAttributeGroupID(A.getSlotAttributes(i)));
+ AttributeList AL = Attrs[i];
+ for (unsigned i = AL.index_begin(), e = AL.index_end(); i != e; ++i) {
+ AttributeSet AS = AL.getAttributes(i);
+ if (AS.hasAttributes())
+ Record.push_back(VE.getAttributeGroupID({i, AS}));
+ }
Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record);
Record.clear();
@@ -972,7 +885,7 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
uint64_t RawFlags = 0;
RawFlags |= Flags.NotEligibleToImport; // bool
- RawFlags |= (Flags.LiveRoot << 1);
+ RawFlags |= (Flags.Live << 1);
// Linkage doesn't need to be remapped at this time for the summary. Any
// future change to the getEncodedLinkage() function will need to be taken
// into account here as well.
@@ -1035,16 +948,19 @@ static unsigned getEncodedUnnamedAddr(const GlobalValue &GV) {
llvm_unreachable("Invalid unnamed_addr");
}
+size_t ModuleBitcodeWriter::addToStrtab(StringRef Str) {
+ if (GenerateHash)
+ Hasher.update(Str);
+ return StrtabBuilder.add(Str);
+}
+
void ModuleBitcodeWriter::writeComdats() {
SmallVector<unsigned, 64> Vals;
for (const Comdat *C : VE.getComdats()) {
- // COMDAT: [selection_kind, name]
+ // COMDAT: [strtab offset, strtab size, selection_kind]
+ Vals.push_back(addToStrtab(C->getName()));
+ Vals.push_back(C->getName().size());
Vals.push_back(getEncodedComdatSelectionKind(*C));
- size_t Size = C->getName().size();
- assert(isUInt<32>(Size));
- Vals.push_back(Size);
- for (char Chr : C->getName())
- Vals.push_back((unsigned char)Chr);
Stream.EmitRecord(bitc::MODULE_CODE_COMDAT, Vals, /*AbbrevToUse=*/0);
Vals.clear();
}
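Name payloads move out of the records themselves: instead of emitting one operand per character, records now carry a (string-table offset, length) pair, and the bytes land once in the shared string table built by StrtabBuilder. A hedged sketch of reading such a pair back (Strtab and Record are assumed to be in scope; this is not the in-tree reader):

// Sketch: recovering a name from a strtab-based record, assuming the
// first two operands are the offset and size written by addToStrtab.
StringRef readStrtabName(StringRef Strtab, ArrayRef<uint64_t> Record) {
  uint64_t Offset = Record[0], Size = Record[1];
  assert(Offset + Size <= Strtab.size() && "name out of bounds");
  return Strtab.substr(Offset, Size);
}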
@@ -1053,7 +969,7 @@ void ModuleBitcodeWriter::writeComdats() {
/// Write a record that will eventually hold the word offset of the
/// module-level VST. For now the offset is 0, which will be backpatched
/// after the real VST is written. Saves the bit offset to backpatch.
-void BitcodeWriterBase::writeValueSymbolTableForwardDecl() {
+void ModuleBitcodeWriter::writeValueSymbolTableForwardDecl() {
// Write a placeholder value in for the offset of the real VST,
// which is written after the function blocks so that it can include
// the offset of each function. The placeholder offset will be
@@ -1079,19 +995,18 @@ void BitcodeWriterBase::writeValueSymbolTableForwardDecl() {
enum StringEncoding { SE_Char6, SE_Fixed7, SE_Fixed8 };
/// Determine the encoding to use for the given string name and length.
-static StringEncoding getStringEncoding(const char *Str, unsigned StrLen) {
+static StringEncoding getStringEncoding(StringRef Str) {
bool isChar6 = true;
- for (const char *C = Str, *E = C + StrLen; C != E; ++C) {
+ for (char C : Str) {
if (isChar6)
- isChar6 = BitCodeAbbrevOp::isChar6(*C);
- if ((unsigned char)*C & 128)
+ isChar6 = BitCodeAbbrevOp::isChar6(C);
+ if ((unsigned char)C & 128)
// A high bit forces 8-bit encoding; don't bother scanning the rest.
return SE_Fixed8;
}
if (isChar6)
return SE_Char6;
- else
- return SE_Fixed7;
+ return SE_Fixed7;
}
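The rewritten getStringEncoding scans once and picks the narrowest width that can hold every byte. Worked examples, assuming the usual char6 alphabet accepted by BitCodeAbbrevOp::isChar6 (letters, digits, '.' and '_'):

// getStringEncoding("main")        -> SE_Char6  (all chars in [a-zA-Z0-9._])
// getStringEncoding("operator+")   -> SE_Fixed7 ('+' is ASCII but not char6)
// getStringEncoding("caf\xc3\xa9") -> SE_Fixed8 (a byte has the high bit set)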
/// Emit top-level description of module, including target triple, inline asm,
@@ -1156,6 +1071,8 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// Add an abbrev for common globals with no visibility or thread localness.
auto Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_GLOBALVAR));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
Log2_32_Ceil(MaxGlobalType+1)));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // AddrSpace << 2
@@ -1179,15 +1096,41 @@ void ModuleBitcodeWriter::writeModuleInfo() {
SimpleGVarAbbrev = Stream.EmitAbbrev(std::move(Abbv));
}
- // Emit the global variable information.
SmallVector<unsigned, 64> Vals;
+ // Emit the module's source file name.
+ {
+ StringEncoding Bits = getStringEncoding(M.getSourceFileName());
+ BitCodeAbbrevOp AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8);
+ if (Bits == SE_Char6)
+ AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Char6);
+ else if (Bits == SE_Fixed7)
+ AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7);
+
+ // MODULE_CODE_SOURCE_FILENAME: [namechar x N]
+ auto Abbv = std::make_shared<BitCodeAbbrev>();
+ Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_SOURCE_FILENAME));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+ Abbv->Add(AbbrevOpToUse);
+ unsigned FilenameAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+ for (const auto P : M.getSourceFileName())
+ Vals.push_back((unsigned char)P);
+
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::MODULE_CODE_SOURCE_FILENAME, Vals, FilenameAbbrev);
+ Vals.clear();
+ }
+
+ // Emit the global variable information.
for (const GlobalVariable &GV : M.globals()) {
unsigned AbbrevToUse = 0;
- // GLOBALVAR: [type, isconst, initid,
+ // GLOBALVAR: [strtab offset, strtab size, type, isconst, initid,
// linkage, alignment, section, visibility, threadlocal,
// unnamed_addr, externally_initialized, dllstorageclass,
- // comdat]
+ // comdat, attributes]
+ Vals.push_back(addToStrtab(GV.getName()));
+ Vals.push_back(GV.getName().size());
Vals.push_back(VE.getTypeID(GV.getValueType()));
Vals.push_back(GV.getType()->getAddressSpace() << 2 | 2 | GV.isConstant());
Vals.push_back(GV.isDeclaration() ? 0 :
@@ -1200,13 +1143,17 @@ void ModuleBitcodeWriter::writeModuleInfo() {
GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
GV.isExternallyInitialized() ||
GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
- GV.hasComdat()) {
+ GV.hasComdat() ||
+ GV.hasAttributes()) {
Vals.push_back(getEncodedVisibility(GV));
Vals.push_back(getEncodedThreadLocalMode(GV));
Vals.push_back(getEncodedUnnamedAddr(GV));
Vals.push_back(GV.isExternallyInitialized());
Vals.push_back(getEncodedDLLStorageClass(GV));
Vals.push_back(GV.hasComdat() ? VE.getComdatID(GV.getComdat()) : 0);
+
+ auto AL = GV.getAttributesAsList(AttributeList::FunctionIndex);
+ Vals.push_back(VE.getAttributeListID(AL));
} else {
AbbrevToUse = SimpleGVarAbbrev;
}
@@ -1217,14 +1164,17 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// Emit the function proto information.
for (const Function &F : M) {
- // FUNCTION: [type, callingconv, isproto, linkage, paramattrs, alignment,
- // section, visibility, gc, unnamed_addr, prologuedata,
- // dllstorageclass, comdat, prefixdata, personalityfn]
+ // FUNCTION: [strtab offset, strtab size, type, callingconv, isproto,
+ // linkage, paramattrs, alignment, section, visibility, gc,
+ // unnamed_addr, prologuedata, dllstorageclass, comdat,
+ // prefixdata, personalityfn]
+ Vals.push_back(addToStrtab(F.getName()));
+ Vals.push_back(F.getName().size());
Vals.push_back(VE.getTypeID(F.getFunctionType()));
Vals.push_back(F.getCallingConv());
Vals.push_back(F.isDeclaration());
Vals.push_back(getEncodedLinkage(F));
- Vals.push_back(VE.getAttributeID(F.getAttributes()));
+ Vals.push_back(VE.getAttributeListID(F.getAttributes()));
Vals.push_back(Log2_32(F.getAlignment())+1);
Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0);
Vals.push_back(getEncodedVisibility(F));
@@ -1246,8 +1196,10 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// Emit the alias information.
for (const GlobalAlias &A : M.aliases()) {
- // ALIAS: [alias type, aliasee val#, linkage, visibility, dllstorageclass,
- // threadlocal, unnamed_addr]
+ // ALIAS: [strtab offset, strtab size, alias type, aliasee val#, linkage,
+ // visibility, dllstorageclass, threadlocal, unnamed_addr]
+ Vals.push_back(addToStrtab(A.getName()));
+ Vals.push_back(A.getName().size());
Vals.push_back(VE.getTypeID(A.getValueType()));
Vals.push_back(A.getType()->getAddressSpace());
Vals.push_back(VE.getValueID(A.getAliasee()));
@@ -1263,7 +1215,10 @@ void ModuleBitcodeWriter::writeModuleInfo() {
// Emit the ifunc information.
for (const GlobalIFunc &I : M.ifuncs()) {
- // IFUNC: [ifunc type, address space, resolver val#, linkage, visibility]
+ // IFUNC: [strtab offset, strtab size, ifunc type, address space, resolver
+ // val#, linkage, visibility]
+ Vals.push_back(addToStrtab(I.getName()));
+ Vals.push_back(I.getName().size());
Vals.push_back(VE.getTypeID(I.getValueType()));
Vals.push_back(I.getType()->getAddressSpace());
Vals.push_back(VE.getValueID(I.getResolver()));
@@ -1273,34 +1228,6 @@ void ModuleBitcodeWriter::writeModuleInfo() {
Vals.clear();
}
- // Emit the module's source file name.
- {
- StringEncoding Bits = getStringEncoding(M.getSourceFileName().data(),
- M.getSourceFileName().size());
- BitCodeAbbrevOp AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8);
- if (Bits == SE_Char6)
- AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Char6);
- else if (Bits == SE_Fixed7)
- AbbrevOpToUse = BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7);
-
- // MODULE_CODE_SOURCE_FILENAME: [namechar x N]
- auto Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::MODULE_CODE_SOURCE_FILENAME));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(AbbrevOpToUse);
- unsigned FilenameAbbrev = Stream.EmitAbbrev(std::move(Abbv));
-
- for (const auto P : M.getSourceFileName())
- Vals.push_back((unsigned char)P);
-
- // Emit the finished record.
- Stream.EmitRecord(bitc::MODULE_CODE_SOURCE_FILENAME, Vals, FilenameAbbrev);
- Vals.clear();
- }
-
- // If we have a VST, write the VSTOFFSET record placeholder.
- if (M.getValueSymbolTable().empty())
- return;
writeValueSymbolTableForwardDecl();
}
@@ -1326,6 +1253,8 @@ static uint64_t getOptimizationFlags(const Value *V) {
Flags |= FastMathFlags::NoSignedZeros;
if (FPMO->hasAllowReciprocal())
Flags |= FastMathFlags::AllowReciprocal;
+ if (FPMO->hasAllowContract())
+ Flags |= FastMathFlags::AllowContract;
}
return Flags;
@@ -1473,6 +1402,13 @@ void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
Record.push_back(N->getFlags());
Record.push_back(VE.getMetadataOrNullID(N->getExtraData()));
+ // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
+ // that there is no DWARF address space associated with DIDerivedType.
+ if (const auto &DWARFAddressSpace = N->getDWARFAddressSpace())
+ Record.push_back(*DWARFAddressSpace + 1);
+ else
+ Record.push_back(0);
+
Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
Record.clear();
}
@@ -1549,6 +1485,7 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
Record.push_back(N->getDWOId());
Record.push_back(VE.getMetadataOrNullID(N->getMacros().get()));
Record.push_back(N->getSplitDebugInlining());
+ Record.push_back(N->getDebugInfoForProfiling());
Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev);
Record.clear();
@@ -1578,6 +1515,7 @@ void ModuleBitcodeWriter::writeDISubprogram(const DISubprogram *N,
Record.push_back(VE.getMetadataOrNullID(N->getDeclaration()));
Record.push_back(VE.getMetadataOrNullID(N->getVariables().get()));
Record.push_back(N->getThisAdjustment());
+ Record.push_back(VE.getMetadataOrNullID(N->getThrownTypes().get()));
Stream.EmitRecord(bitc::METADATA_SUBPROGRAM, Record, Abbrev);
Record.clear();
@@ -1613,9 +1551,7 @@ void ModuleBitcodeWriter::writeDINamespace(const DINamespace *N,
unsigned Abbrev) {
Record.push_back(N->isDistinct() | N->getExportSymbols() << 1);
Record.push_back(VE.getMetadataOrNullID(N->getScope()));
- Record.push_back(VE.getMetadataOrNullID(N->getFile()));
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
- Record.push_back(N->getLine());
Stream.EmitRecord(bitc::METADATA_NAMESPACE, Record, Abbrev);
Record.clear();
@@ -1738,9 +1674,8 @@ void ModuleBitcodeWriter::writeDIExpression(const DIExpression *N,
SmallVectorImpl<uint64_t> &Record,
unsigned Abbrev) {
Record.reserve(N->getElements().size() + 1);
-
- const uint64_t HasOpFragmentFlag = 1 << 1;
- Record.push_back((uint64_t)N->isDistinct() | HasOpFragmentFlag);
+ const uint64_t Version = 3 << 1;
+ Record.push_back((uint64_t)N->isDistinct() | Version);
Record.append(N->elements_begin(), N->elements_end());
Stream.EmitRecord(bitc::METADATA_EXPRESSION, Record, Abbrev);
@@ -1783,6 +1718,7 @@ void ModuleBitcodeWriter::writeDIImportedEntity(
Record.push_back(VE.getMetadataOrNullID(N->getEntity()));
Record.push_back(N->getLine());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRawFile()));
Stream.EmitRecord(bitc::METADATA_IMPORTED_ENTITY, Record, Abbrev);
Record.clear();
@@ -2104,6 +2040,24 @@ void ModuleBitcodeWriter::writeOperandBundleTags() {
Stream.ExitBlock();
}
+void ModuleBitcodeWriter::writeSyncScopeNames() {
+ SmallVector<StringRef, 8> SSNs;
+ M.getContext().getSyncScopeNames(SSNs);
+ if (SSNs.empty())
+ return;
+
+ Stream.EnterSubblock(bitc::SYNC_SCOPE_NAMES_BLOCK_ID, 2);
+
+ SmallVector<uint64_t, 64> Record;
+ for (auto SSN : SSNs) {
+ Record.append(SSN.begin(), SSN.end());
+ Stream.EmitRecord(bitc::SYNC_SCOPE_NAME, Record, 0);
+ Record.clear();
+ }
+
+ Stream.ExitBlock();
+}
+
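Sync scope IDs are plain indices into the context's table of scope names, which is why getEncodedSyncScopeID earlier in this patch is just a cast; the reader rebuilds the table from this block and remaps the IDs. A minimal sketch of that correspondence (assuming an LLVMContext Ctx; "singlethread" is one of the scopes registered by default):

SyncScope::ID SingleThread = Ctx.getOrInsertSyncScopeID("singlethread");
SmallVector<StringRef, 8> SSNs;
Ctx.getSyncScopeNames(SSNs);
// The encoded operand and the in-memory ID share one numbering.
assert(SSNs[SingleThread] == "singlethread");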
static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
if ((int64_t)V >= 0)
Vals.push_back(V << 1);
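emitSignedInt64 uses the usual sign folding: non-negative values are shifted left one bit, negative values are negated, shifted, and tagged in bit 0, so small magnitudes of either sign stay small under VBR. A worked sketch of both directions (illustrative helper names, ignoring the INT64_MIN edge case):

static uint64_t foldSign(int64_t V) {
  return V >= 0 ? uint64_t(V) << 1 : (uint64_t(-V) << 1) | 1;
}
static int64_t unfoldSign(uint64_t E) {
  return (E & 1) ? -int64_t(E >> 1) : int64_t(E >> 1);
}
// foldSign(3) == 6, foldSign(-3) == 7, unfoldSign(7) == -3.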
@@ -2559,7 +2513,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
pushValue(SI.getCondition(), InstID, Vals);
Vals.push_back(VE.getValueID(SI.getDefaultDest()));
- for (SwitchInst::ConstCaseIt Case : SI.cases()) {
+ for (auto Case : SI.cases()) {
Vals.push_back(VE.getValueID(Case.getCaseValue()));
Vals.push_back(VE.getValueID(Case.getCaseSuccessor()));
}
@@ -2584,7 +2538,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Code = bitc::FUNC_CODE_INST_INVOKE;
- Vals.push_back(VE.getAttributeID(II->getAttributes()));
+ Vals.push_back(VE.getAttributeListID(II->getAttributes()));
Vals.push_back(II->getCallingConv() | 1 << 13);
Vals.push_back(VE.getValueID(II->getNormalDest()));
Vals.push_back(VE.getValueID(II->getUnwindDest()));
@@ -2720,7 +2674,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(cast<LoadInst>(I).isVolatile());
if (cast<LoadInst>(I).isAtomic()) {
Vals.push_back(getEncodedOrdering(cast<LoadInst>(I).getOrdering()));
- Vals.push_back(getEncodedSynchScope(cast<LoadInst>(I).getSynchScope()));
+ Vals.push_back(getEncodedSyncScopeID(cast<LoadInst>(I).getSyncScopeID()));
}
break;
case Instruction::Store:
@@ -2734,7 +2688,8 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(cast<StoreInst>(I).isVolatile());
if (cast<StoreInst>(I).isAtomic()) {
Vals.push_back(getEncodedOrdering(cast<StoreInst>(I).getOrdering()));
- Vals.push_back(getEncodedSynchScope(cast<StoreInst>(I).getSynchScope()));
+ Vals.push_back(
+ getEncodedSyncScopeID(cast<StoreInst>(I).getSyncScopeID()));
}
break;
case Instruction::AtomicCmpXchg:
@@ -2746,7 +2701,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(
getEncodedOrdering(cast<AtomicCmpXchgInst>(I).getSuccessOrdering()));
Vals.push_back(
- getEncodedSynchScope(cast<AtomicCmpXchgInst>(I).getSynchScope()));
+ getEncodedSyncScopeID(cast<AtomicCmpXchgInst>(I).getSyncScopeID()));
Vals.push_back(
getEncodedOrdering(cast<AtomicCmpXchgInst>(I).getFailureOrdering()));
Vals.push_back(cast<AtomicCmpXchgInst>(I).isWeak());
@@ -2760,12 +2715,12 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(cast<AtomicRMWInst>(I).isVolatile());
Vals.push_back(getEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
Vals.push_back(
- getEncodedSynchScope(cast<AtomicRMWInst>(I).getSynchScope()));
+ getEncodedSyncScopeID(cast<AtomicRMWInst>(I).getSyncScopeID()));
break;
case Instruction::Fence:
Code = bitc::FUNC_CODE_INST_FENCE;
Vals.push_back(getEncodedOrdering(cast<FenceInst>(I).getOrdering()));
- Vals.push_back(getEncodedSynchScope(cast<FenceInst>(I).getSynchScope()));
+ Vals.push_back(getEncodedSyncScopeID(cast<FenceInst>(I).getSyncScopeID()));
break;
case Instruction::Call: {
const CallInst &CI = cast<CallInst>(I);
@@ -2776,7 +2731,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Code = bitc::FUNC_CODE_INST_CALL;
- Vals.push_back(VE.getAttributeID(CI.getAttributes()));
+ Vals.push_back(VE.getAttributeListID(CI.getAttributes()));
unsigned Flags = getOptimizationFlags(&I);
Vals.push_back(CI.getCallingConv() << bitc::CALL_CCONV |
@@ -2820,77 +2775,59 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.clear();
}
-/// Emit names for globals/functions etc. \p IsModuleLevel is true when
-/// we are writing the module-level VST, where we are including a function
-/// bitcode index and need to backpatch the VST forward declaration record.
-void ModuleBitcodeWriter::writeValueSymbolTable(
- const ValueSymbolTable &VST, bool IsModuleLevel,
- DenseMap<const Function *, uint64_t> *FunctionToBitcodeIndex) {
- if (VST.empty()) {
- // writeValueSymbolTableForwardDecl should have returned early as
- // well. Ensure this handling remains in sync by asserting that
- // the placeholder offset is not set.
- assert(!IsModuleLevel || !hasVSTOffsetPlaceholder());
- return;
- }
+/// Write a GlobalValue VST to the module. The purpose of this data structure is
+/// to allow clients to efficiently find the function body.
+void ModuleBitcodeWriter::writeGlobalValueSymbolTable(
+ DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex) {
+ // Get the offset of the VST we are writing, and backpatch it into
+ // the VST forward declaration record.
+ uint64_t VSTOffset = Stream.GetCurrentBitNo();
+ // The BitcodeStartBit was the stream offset of the identification block.
+ VSTOffset -= bitcodeStartBit();
+ assert((VSTOffset & 31) == 0 && "VST block not 32-bit aligned");
+ // Note that we add 1 here because the offset is relative to one word
+ // before the start of the identification block, which was historically
+ // always the start of the regular bitcode header.
+ Stream.BackpatchWord(VSTOffsetPlaceholder, VSTOffset / 32 + 1);
+
+ Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4);
+
+ auto Abbv = std::make_shared<BitCodeAbbrev>();
+ Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset
+ unsigned FnEntryAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+ for (const Function &F : M) {
+ uint64_t Record[2];
- if (IsModuleLevel && hasVSTOffsetPlaceholder()) {
- // Get the offset of the VST we are writing, and backpatch it into
- // the VST forward declaration record.
- uint64_t VSTOffset = Stream.GetCurrentBitNo();
- // The BitcodeStartBit was the stream offset of the identification block.
- VSTOffset -= bitcodeStartBit();
- assert((VSTOffset & 31) == 0 && "VST block not 32-bit aligned");
+ if (F.isDeclaration())
+ continue;
+
+ Record[0] = VE.getValueID(&F);
+
+ // Save the word offset of the function (from the start of the
+ // actual bitcode written to the stream).
+ uint64_t BitcodeIndex = FunctionToBitcodeIndex[&F] - bitcodeStartBit();
+ assert((BitcodeIndex & 31) == 0 && "function block not 32-bit aligned");
// Note that we add 1 here because the offset is relative to one word
// before the start of the identification block, which was historically
// always the start of the regular bitcode header.
- Stream.BackpatchWord(VSTOffsetPlaceholder, VSTOffset / 32 + 1);
- }
-
- Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4);
+ Record[1] = BitcodeIndex / 32 + 1;
- // For the module-level VST, add abbrev Ids for the VST_CODE_FNENTRY
- // records, which are not used in the per-function VSTs.
- unsigned FnEntry8BitAbbrev;
- unsigned FnEntry7BitAbbrev;
- unsigned FnEntry6BitAbbrev;
- unsigned GUIDEntryAbbrev;
- if (IsModuleLevel && hasVSTOffsetPlaceholder()) {
- // 8-bit fixed-width VST_CODE_FNENTRY function strings.
- auto Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8));
- FnEntry8BitAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+ Stream.EmitRecord(bitc::VST_CODE_FNENTRY, Record, FnEntryAbbrev);
+ }
- // 7-bit fixed width VST_CODE_FNENTRY function strings.
- Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7));
- FnEntry7BitAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+ Stream.ExitBlock();
+}
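The offset arithmetic above is worth spelling out: the placeholder is backpatched with a 32-bit word index measured from one word before the identification block, which was historically the start of the bitcode header. In other words (a sketch with hypothetical variable names, assuming the asserted 32-bit alignment):

// Sketch: converting a bit offset into the backpatched word offset.
uint64_t BitOffset = VSTOffsetBit - BitcodeStartBit; // bits past the ID block
assert((BitOffset & 31) == 0 && "must be 32-bit aligned");
uint64_t WordOffset = BitOffset / 32 + 1; // +1: relative to one word earlier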
- // 6-bit char6 VST_CODE_FNENTRY function strings.
- Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_FNENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // funcoffset
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Char6));
- FnEntry6BitAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+/// Emit names for arguments, instructions and basic blocks in a function.
+void ModuleBitcodeWriter::writeFunctionLevelValueSymbolTable(
+ const ValueSymbolTable &VST) {
+ if (VST.empty())
+ return;
- // FIXME: Change the name of this record as it is now used by
- // the per-module index as well.
- Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid
- GUIDEntryAbbrev = Stream.EmitAbbrev(std::move(Abbv));
- }
+ Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4);
// FIXME: Set up the abbrev, we know how many values there are!
// FIXME: We know if the type names can use 7-bit ascii.
@@ -2898,51 +2835,18 @@ void ModuleBitcodeWriter::writeValueSymbolTable(
for (const ValueName &Name : VST) {
// Figure out the encoding to use for the name.
- StringEncoding Bits =
- getStringEncoding(Name.getKeyData(), Name.getKeyLength());
+ StringEncoding Bits = getStringEncoding(Name.getKey());
unsigned AbbrevToUse = VST_ENTRY_8_ABBREV;
NameVals.push_back(VE.getValueID(Name.getValue()));
- Function *F = dyn_cast<Function>(Name.getValue());
- if (!F) {
- // If value is an alias, need to get the aliased base object to
- // see if it is a function.
- auto *GA = dyn_cast<GlobalAlias>(Name.getValue());
- if (GA && GA->getBaseObject())
- F = dyn_cast<Function>(GA->getBaseObject());
- }
-
// VST_CODE_ENTRY: [valueid, namechar x N]
- // VST_CODE_FNENTRY: [valueid, funcoffset, namechar x N]
// VST_CODE_BBENTRY: [bbid, namechar x N]
unsigned Code;
if (isa<BasicBlock>(Name.getValue())) {
Code = bitc::VST_CODE_BBENTRY;
if (Bits == SE_Char6)
AbbrevToUse = VST_BBENTRY_6_ABBREV;
- } else if (F && !F->isDeclaration()) {
- // Must be the module-level VST, where we pass in the Index and
- // have a VSTOffsetPlaceholder. The function-level VST should not
- // contain any Function symbols.
- assert(FunctionToBitcodeIndex);
- assert(hasVSTOffsetPlaceholder());
-
- // Save the word offset of the function (from the start of the
- // actual bitcode written to the stream).
- uint64_t BitcodeIndex = (*FunctionToBitcodeIndex)[F] - bitcodeStartBit();
- assert((BitcodeIndex & 31) == 0 && "function block not 32-bit aligned");
- // Note that we add 1 here because the offset is relative to one word
- // before the start of the identification block, which was historically
- // always the start of the regular bitcode header.
- NameVals.push_back(BitcodeIndex / 32 + 1);
-
- Code = bitc::VST_CODE_FNENTRY;
- AbbrevToUse = FnEntry8BitAbbrev;
- if (Bits == SE_Char6)
- AbbrevToUse = FnEntry6BitAbbrev;
- else if (Bits == SE_Fixed7)
- AbbrevToUse = FnEntry7BitAbbrev;
} else {
Code = bitc::VST_CODE_ENTRY;
if (Bits == SE_Char6)
@@ -2958,47 +2862,7 @@ void ModuleBitcodeWriter::writeValueSymbolTable(
Stream.EmitRecord(Code, NameVals, AbbrevToUse);
NameVals.clear();
}
- // Emit any GUID valueIDs created for indirect call edges into the
- // module-level VST.
- if (IsModuleLevel && hasVSTOffsetPlaceholder())
- for (const auto &GI : valueIds()) {
- NameVals.push_back(GI.second);
- NameVals.push_back(GI.first);
- Stream.EmitRecord(bitc::VST_CODE_COMBINED_ENTRY, NameVals,
- GUIDEntryAbbrev);
- NameVals.clear();
- }
- Stream.ExitBlock();
-}
-
-/// Emit function names and summary offsets for the combined index
-/// used by ThinLTO.
-void IndexBitcodeWriter::writeCombinedValueSymbolTable() {
- assert(hasVSTOffsetPlaceholder() && "Expected non-zero VSTOffsetPlaceholder");
- // Get the offset of the VST we are writing, and backpatch it into
- // the VST forward declaration record.
- uint64_t VSTOffset = Stream.GetCurrentBitNo();
- assert((VSTOffset & 31) == 0 && "VST block not 32-bit aligned");
- Stream.BackpatchWord(VSTOffsetPlaceholder, VSTOffset / 32);
- Stream.EnterSubblock(bitc::VALUE_SYMTAB_BLOCK_ID, 4);
-
- auto Abbv = std::make_shared<BitCodeAbbrev>();
- Abbv->Add(BitCodeAbbrevOp(bitc::VST_CODE_COMBINED_ENTRY));
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
- Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // refguid
- unsigned EntryAbbrev = Stream.EmitAbbrev(std::move(Abbv));
-
- SmallVector<uint64_t, 64> NameVals;
- for (const auto &GVI : valueIds()) {
- // VST_CODE_COMBINED_ENTRY: [valueid, refguid]
- NameVals.push_back(GVI.second);
- NameVals.push_back(GVI.first);
-
- // Emit the finished record.
- Stream.EmitRecord(bitc::VST_CODE_COMBINED_ENTRY, NameVals, EntryAbbrev);
- NameVals.clear();
- }
Stream.ExitBlock();
}
@@ -3102,7 +2966,7 @@ void ModuleBitcodeWriter::writeFunction(
// Emit names for all the instructions etc.
if (auto *Symtab = F.getValueSymbolTable())
- writeValueSymbolTable(*Symtab);
+ writeFunctionLevelValueSymbolTable(*Symtab);
if (NeedsMetadataAttachment)
writeFunctionMetadataAttachment(F);
@@ -3329,42 +3193,77 @@ void IndexBitcodeWriter::writeModStrings() {
unsigned AbbrevHash = Stream.EmitAbbrev(std::move(Abbv));
SmallVector<unsigned, 64> Vals;
- for (const auto &MPSE : Index.modulePaths()) {
- if (!doIncludeModule(MPSE.getKey()))
- continue;
- StringEncoding Bits =
- getStringEncoding(MPSE.getKey().data(), MPSE.getKey().size());
- unsigned AbbrevToUse = Abbrev8Bit;
- if (Bits == SE_Char6)
- AbbrevToUse = Abbrev6Bit;
- else if (Bits == SE_Fixed7)
- AbbrevToUse = Abbrev7Bit;
+ forEachModule(
+ [&](const StringMapEntry<std::pair<uint64_t, ModuleHash>> &MPSE) {
+ StringRef Key = MPSE.getKey();
+ const auto &Value = MPSE.getValue();
+ StringEncoding Bits = getStringEncoding(Key);
+ unsigned AbbrevToUse = Abbrev8Bit;
+ if (Bits == SE_Char6)
+ AbbrevToUse = Abbrev6Bit;
+ else if (Bits == SE_Fixed7)
+ AbbrevToUse = Abbrev7Bit;
+
+ Vals.push_back(Value.first);
+ Vals.append(Key.begin(), Key.end());
+
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::MST_CODE_ENTRY, Vals, AbbrevToUse);
+
+ // Emit an optional hash for the module now
+ const auto &Hash = Value.second;
+ if (llvm::any_of(Hash, [](uint32_t H) { return H; })) {
+ Vals.assign(Hash.begin(), Hash.end());
+ // Emit the hash record.
+ Stream.EmitRecord(bitc::MST_CODE_HASH, Vals, AbbrevHash);
+ }
- Vals.push_back(MPSE.getValue().first);
+ Vals.clear();
+ });
+ Stream.ExitBlock();
+}
- for (const auto P : MPSE.getKey())
- Vals.push_back((unsigned char)P);
+/// Write the function type metadata related records that need to appear before
+/// a function summary entry (whether per-module or combined).
+static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
+ FunctionSummary *FS) {
+ if (!FS->type_tests().empty())
+ Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
- // Emit the finished record.
- Stream.EmitRecord(bitc::MST_CODE_ENTRY, Vals, AbbrevToUse);
+ SmallVector<uint64_t, 64> Record;
- Vals.clear();
- // Emit an optional hash for the module now
- auto &Hash = MPSE.getValue().second;
- bool AllZero = true; // Detect if the hash is empty, and do not generate it
- for (auto Val : Hash) {
- if (Val)
- AllZero = false;
- Vals.push_back(Val);
+ auto WriteVFuncIdVec = [&](uint64_t Ty,
+ ArrayRef<FunctionSummary::VFuncId> VFs) {
+ if (VFs.empty())
+ return;
+ Record.clear();
+ for (auto &VF : VFs) {
+ Record.push_back(VF.GUID);
+ Record.push_back(VF.Offset);
}
- if (!AllZero) {
- // Emit the hash record.
- Stream.EmitRecord(bitc::MST_CODE_HASH, Vals, AbbrevHash);
+ Stream.EmitRecord(Ty, Record);
+ };
+
+ WriteVFuncIdVec(bitc::FS_TYPE_TEST_ASSUME_VCALLS,
+ FS->type_test_assume_vcalls());
+ WriteVFuncIdVec(bitc::FS_TYPE_CHECKED_LOAD_VCALLS,
+ FS->type_checked_load_vcalls());
+
+ auto WriteConstVCallVec = [&](uint64_t Ty,
+ ArrayRef<FunctionSummary::ConstVCall> VCs) {
+ for (auto &VC : VCs) {
+ Record.clear();
+ Record.push_back(VC.VFunc.GUID);
+ Record.push_back(VC.VFunc.Offset);
+ Record.insert(Record.end(), VC.Args.begin(), VC.Args.end());
+ Stream.EmitRecord(Ty, Record);
}
+ };
- Vals.clear();
- }
- Stream.ExitBlock();
+ WriteConstVCallVec(bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL,
+ FS->type_test_assume_const_vcalls());
+ WriteConstVCallVec(bitc::FS_TYPE_CHECKED_LOAD_CONST_VCALL,
+ FS->type_checked_load_const_vcalls());
}
// Helper to emit a single function summary record.
@@ -3375,8 +3274,7 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
NameVals.push_back(ValueID);
FunctionSummary *FS = cast<FunctionSummary>(Summary);
- if (!FS->type_tests().empty())
- Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+ writeFunctionTypeMetadataRecords(Stream, FS);
NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
NameVals.push_back(FS->instCount());
@@ -3406,15 +3304,14 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
void ModuleBitcodeWriter::writeModuleLevelReferences(
const GlobalVariable &V, SmallVector<uint64_t, 64> &NameVals,
unsigned FSModRefsAbbrev) {
- auto Summaries =
- Index->findGlobalValueSummaryList(GlobalValue::getGUID(V.getName()));
- if (Summaries == Index->end()) {
+ auto VI = Index->getValueInfo(GlobalValue::getGUID(V.getName()));
+ if (!VI || VI.getSummaryList().empty()) {
// Only declarations should not have a summary (a declaration might however
// have a summary if the def was in module level asm).
assert(V.isDeclaration());
return;
}
- auto *Summary = Summaries->second.front().get();
+ auto *Summary = VI.getSummaryList()[0].get();
NameVals.push_back(VE.getValueID(&V));
GlobalVarSummary *VS = cast<GlobalVarSummary>(Summary);
NameVals.push_back(getEncodedGVSummaryFlags(VS->flags()));
@@ -3439,7 +3336,15 @@ static const uint64_t INDEX_VERSION = 3;
/// Emit the per-module summary section alongside the rest of
/// the module's bitcode.
void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() {
- Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 4);
+ // By default we compile with ThinLTO if the module has a summary, but the
+ // client can request full LTO with a module flag.
+ bool IsThinLTO = true;
+ if (auto *MD =
+ mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("ThinLTO")))
+ IsThinLTO = MD->getZExtValue();
+ Stream.EnterSubblock(IsThinLTO ? bitc::GLOBALVAL_SUMMARY_BLOCK_ID
+ : bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID,
+ 4);
Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION});
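The block ID chosen here is driven by an ordinary module flag, so a frontend that wants a summary but full-LTO semantics can opt out of ThinLTO per module. A hypothetical way to request that, using the flag name the code above reads (M is a Module):

// Sketch: mark a module so its summary is written as a full-LTO summary.
M.addModuleFlag(Module::Error, "ThinLTO", 0);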
@@ -3448,6 +3353,11 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() {
return;
}
+ for (const auto &GVI : valueIds()) {
+ Stream.EmitRecord(bitc::FS_VALUE_GUID,
+ ArrayRef<uint64_t>{GVI.second, GVI.first});
+ }
+
// Abbrev for FS_PERMODULE.
auto Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE));
@@ -3498,15 +3408,14 @@ void ModuleBitcodeWriter::writePerModuleGlobalValueSummary() {
if (!F.hasName())
report_fatal_error("Unexpected anonymous function when writing summary");
- auto Summaries =
- Index->findGlobalValueSummaryList(GlobalValue::getGUID(F.getName()));
- if (Summaries == Index->end()) {
+ ValueInfo VI = Index->getValueInfo(GlobalValue::getGUID(F.getName()));
+ if (!VI || VI.getSummaryList().empty()) {
// Only declarations should not have a summary (a declaration might
// however have a summary if the def was in module level asm).
assert(F.isDeclaration());
continue;
}
- auto *Summary = Summaries->second.front().get();
+ auto *Summary = VI.getSummaryList()[0].get();
writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F),
FSCallsAbbrev, FSCallsProfileAbbrev, F);
}
@@ -3540,6 +3449,11 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3);
Stream.EmitRecord(bitc::FS_VERSION, ArrayRef<uint64_t>{INDEX_VERSION});
+ for (const auto &GVI : valueIds()) {
+ Stream.EmitRecord(bitc::FS_VALUE_GUID,
+ ArrayRef<uint64_t>{GVI.second, GVI.first});
+ }
+
// Abbrev for FS_COMBINED.
auto Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED));
@@ -3604,27 +3518,30 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.clear();
};
- for (const auto &I : *this) {
+ forEachSummary([&](GVInfo I) {
GlobalValueSummary *S = I.second;
assert(S);
- assert(hasValueId(I.first));
- unsigned ValueId = getValueId(I.first);
- SummaryToValueIdMap[S] = ValueId;
+ auto ValueId = getValueId(I.first);
+ assert(ValueId);
+ SummaryToValueIdMap[S] = *ValueId;
if (auto *AS = dyn_cast<AliasSummary>(S)) {
// Will process aliases as a post-pass because the reader wants all
// globals to be loaded first.
Aliases.push_back(AS);
- continue;
+ return;
}
if (auto *VS = dyn_cast<GlobalVarSummary>(S)) {
- NameVals.push_back(ValueId);
+ NameVals.push_back(*ValueId);
NameVals.push_back(Index.getModuleId(VS->modulePath()));
NameVals.push_back(getEncodedGVSummaryFlags(VS->flags()));
for (auto &RI : VS->refs()) {
- NameVals.push_back(getValueId(RI.getGUID()));
+ auto RefValueId = getValueId(RI.getGUID());
+ if (!RefValueId)
+ continue;
+ NameVals.push_back(*RefValueId);
}
// Emit the finished record.
@@ -3632,22 +3549,28 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
FSModRefsAbbrev);
NameVals.clear();
MaybeEmitOriginalName(*S);
- continue;
+ return;
}
auto *FS = cast<FunctionSummary>(S);
- if (!FS->type_tests().empty())
- Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+ writeFunctionTypeMetadataRecords(Stream, FS);
- NameVals.push_back(ValueId);
+ NameVals.push_back(*ValueId);
NameVals.push_back(Index.getModuleId(FS->modulePath()));
NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
NameVals.push_back(FS->instCount());
- NameVals.push_back(FS->refs().size());
+ // Placeholder for the reference count; filled in below once known.
+ NameVals.push_back(0);
+ unsigned Count = 0;
for (auto &RI : FS->refs()) {
- NameVals.push_back(getValueId(RI.getGUID()));
+ auto RefValueId = getValueId(RI.getGUID());
+ if (!RefValueId)
+ continue;
+ NameVals.push_back(*RefValueId);
+ Count++;
}
+ NameVals[4] = Count;
bool HasProfileData = false;
for (auto &EI : FS->calls()) {
@@ -3659,9 +3582,20 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
for (auto &EI : FS->calls()) {
// If this GUID doesn't have a value id, it doesn't have a function
// summary and we don't need to record any calls to it.
- if (!hasValueId(EI.first.getGUID()))
- continue;
- NameVals.push_back(getValueId(EI.first.getGUID()));
+ GlobalValue::GUID GUID = EI.first.getGUID();
+ auto CallValueId = getValueId(GUID);
+ if (!CallValueId) {
+ // For SamplePGO, the indirect call targets for local functions will
+ // have their original names annotated in the profile. We try to find
+ // the corresponding PGOFuncName as the GUID.
+ GUID = Index.getGUIDFromOriginalID(GUID);
+ if (GUID == 0)
+ continue;
+ CallValueId = getValueId(GUID);
+ if (!CallValueId)
+ continue;
+ }
+ NameVals.push_back(*CallValueId);
if (HasProfileData)
NameVals.push_back(static_cast<uint8_t>(EI.second.Hotness));
}
@@ -3674,7 +3608,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Stream.EmitRecord(Code, NameVals, FSAbbrev);
NameVals.clear();
MaybeEmitOriginalName(*S);
- }
+ });
for (auto *AS : Aliases) {
auto AliasValueId = SummaryToValueIdMap[AS];
@@ -3692,12 +3626,30 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
MaybeEmitOriginalName(*AS);
}
+ if (!Index.cfiFunctionDefs().empty()) {
+ for (auto &S : Index.cfiFunctionDefs()) {
+ NameVals.push_back(StrtabBuilder.add(S));
+ NameVals.push_back(S.size());
+ }
+ Stream.EmitRecord(bitc::FS_CFI_FUNCTION_DEFS, NameVals);
+ NameVals.clear();
+ }
+
+ if (!Index.cfiFunctionDecls().empty()) {
+ for (auto &S : Index.cfiFunctionDecls()) {
+ NameVals.push_back(StrtabBuilder.add(S));
+ NameVals.push_back(S.size());
+ }
+ Stream.EmitRecord(bitc::FS_CFI_FUNCTION_DECLS, NameVals);
+ NameVals.clear();
+ }
+
Stream.ExitBlock();
}
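
The two new CFI blocks use the same encoding: each function name becomes a (string-table offset, length) pair, and all pairs for a list are flattened into a single record. A standalone sketch of that encoding against a toy string table (the real StrtabBuilder.add likewise returns the string's offset, though it may also deduplicate):

#include <cstdint>
#include <string>
#include <vector>

struct ToyStrtab {
  std::string Data;
  uint64_t add(const std::string &S) {
    uint64_t Off = Data.size();
    Data += S; // Append-only; a real builder may tail-merge strings.
    return Off;
  }
};

// One record: [off0, len0, off1, len1, ...], as in FS_CFI_FUNCTION_DEFS.
std::vector<uint64_t> encodeCfiNames(ToyStrtab &Strtab,
                                     const std::vector<std::string> &Names) {
  std::vector<uint64_t> Record;
  for (const std::string &S : Names) {
    Record.push_back(Strtab.add(S));
    Record.push_back(S.size());
  }
  return Record;
}
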
/// Create the "IDENTIFICATION_BLOCK_ID" containing a single string with the
/// current llvm version, and a record for the epoch number.
-void writeIdentificationBlock(BitstreamWriter &Stream) {
+static void writeIdentificationBlock(BitstreamWriter &Stream) {
Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5);
// Write the "user readable" string identifying the bitcode producer
@@ -3722,17 +3674,23 @@ void writeIdentificationBlock(BitstreamWriter &Stream) {
void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) {
// Emit the module's hash.
// MODULE_CODE_HASH: [5*i32]
- SHA1 Hasher;
- Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
- Buffer.size() - BlockStartPos));
- StringRef Hash = Hasher.result();
- uint32_t Vals[5];
- for (int Pos = 0; Pos < 20; Pos += 4) {
- Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
- }
+ if (GenerateHash) {
+ uint32_t Vals[5];
+ Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
+ Buffer.size() - BlockStartPos));
+ StringRef Hash = Hasher.result();
+ for (int Pos = 0; Pos < 20; Pos += 4) {
+ Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
+ }
- // Emit the finished record.
- Stream.EmitRecord(bitc::MODULE_CODE_HASH, Vals);
+ // Emit the finished record.
+ Stream.EmitRecord(bitc::MODULE_CODE_HASH, Vals);
+
+ if (ModHash)
+ // Save the written hash value.
+ std::copy(std::begin(Vals), std::end(Vals), std::begin(*ModHash));
+ } else if (ModHash)
+ Stream.EmitRecord(bitc::MODULE_CODE_HASH, ArrayRef<uint32_t>(*ModHash));
}
void ModuleBitcodeWriter::write() {
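
Either branch of the rewritten writeModuleHash emits the same record shape: five 32-bit words taken big-endian from the 20-byte SHA-1 digest of the module block. A standalone sketch of the digest-to-record conversion (read32be here is a hand-rolled stand-in for support::endian::read32be):

#include <array>
#include <cstdint>

static uint32_t read32be(const uint8_t *P) {
  return (uint32_t(P[0]) << 24) | (uint32_t(P[1]) << 16) |
         (uint32_t(P[2]) << 8) | uint32_t(P[3]);
}

// MODULE_CODE_HASH: [5*i32], from a 20-byte SHA-1 digest.
std::array<uint32_t, 5> digestToRecord(const std::array<uint8_t, 20> &Digest) {
  std::array<uint32_t, 5> Vals;
  for (int Pos = 0; Pos < 20; Pos += 4)
    Vals[Pos / 4] = read32be(Digest.data() + Pos);
  return Vals;
}

When ModHash is non-null, the same five words are either copied out (GenerateHash case) or replayed verbatim into the record, so a caller can reuse a previously computed hash.
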
@@ -3741,10 +3699,7 @@ void ModuleBitcodeWriter::write() {
Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
size_t BlockStartPos = Buffer.size();
- SmallVector<unsigned, 1> Vals;
- unsigned CurVersion = 1;
- Vals.push_back(CurVersion);
- Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
+ writeModuleVersion();
// Emit blockinfo, which defines the standard abbreviations etc.
writeBlockInfo();
@@ -3778,6 +3733,7 @@ void ModuleBitcodeWriter::write() {
writeUseListBlock(nullptr);
writeOperandBundleTags();
+ writeSyncScopeNames();
// Emit function bodies.
DenseMap<const Function *, uint64_t> FunctionToBitcodeIndex;
@@ -3790,12 +3746,9 @@ void ModuleBitcodeWriter::write() {
if (Index)
writePerModuleGlobalValueSummary();
- writeValueSymbolTable(M.getValueSymbolTable(),
- /* IsModuleLevel */ true, &FunctionToBitcodeIndex);
+ writeGlobalValueSymbolTable(FunctionToBitcodeIndex);
- if (GenerateHash) {
- writeModuleHash(BlockStartPos);
- }
+ writeModuleHash(BlockStartPos);
Stream.ExitBlock();
}
@@ -3881,23 +3834,105 @@ BitcodeWriter::BitcodeWriter(SmallVectorImpl<char> &Buffer)
writeBitcodeHeader(*Stream);
}
-BitcodeWriter::~BitcodeWriter() = default;
+BitcodeWriter::~BitcodeWriter() { assert(WroteStrtab); }
+
+void BitcodeWriter::writeBlob(unsigned Block, unsigned Record, StringRef Blob) {
+ Stream->EnterSubblock(Block, 3);
+
+ auto Abbv = std::make_shared<BitCodeAbbrev>();
+ Abbv->Add(BitCodeAbbrevOp(Record));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob));
+ auto AbbrevNo = Stream->EmitAbbrev(std::move(Abbv));
+
+ Stream->EmitRecordWithBlob(AbbrevNo, ArrayRef<uint64_t>{Record}, Blob);
+
+ Stream->ExitBlock();
+}
+
+void BitcodeWriter::writeSymtab() {
+ assert(!WroteStrtab && !WroteSymtab);
+
+ // If any module has module-level inline asm, we will require a registered asm
+ // parser for the target so that we can create an accurate symbol table for
+ // the module.
+ for (Module *M : Mods) {
+ if (M->getModuleInlineAsm().empty())
+ continue;
+
+ std::string Err;
+ const Triple TT(M->getTargetTriple());
+ const Target *T = TargetRegistry::lookupTarget(TT.str(), Err);
+ if (!T || !T->hasMCAsmParser())
+ return;
+ }
+
+ WroteSymtab = true;
+ SmallVector<char, 0> Symtab;
+ // The irsymtab::build function may be unable to create a symbol table if the
+ // module is malformed (e.g. it contains an invalid alias). Writing a symbol
+ // table is not required for correctness, but we still want to be able to
+ // write malformed modules to bitcode files, so swallow the error.
+ if (Error E = irsymtab::build(Mods, Symtab, StrtabBuilder, Alloc)) {
+ consumeError(std::move(E));
+ return;
+ }
+
+ writeBlob(bitc::SYMTAB_BLOCK_ID, bitc::SYMTAB_BLOB,
+ {Symtab.data(), Symtab.size()});
+}
+
+void BitcodeWriter::writeStrtab() {
+ assert(!WroteStrtab);
+
+ std::vector<char> Strtab;
+ StrtabBuilder.finalizeInOrder();
+ Strtab.resize(StrtabBuilder.getSize());
+ StrtabBuilder.write((uint8_t *)Strtab.data());
+
+ writeBlob(bitc::STRTAB_BLOCK_ID, bitc::STRTAB_BLOB,
+ {Strtab.data(), Strtab.size()});
+
+ WroteStrtab = true;
+}
+
+void BitcodeWriter::copyStrtab(StringRef Strtab) {
+ writeBlob(bitc::STRTAB_BLOCK_ID, bitc::STRTAB_BLOB, Strtab);
+ WroteStrtab = true;
+}
void BitcodeWriter::writeModule(const Module *M,
bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index,
- bool GenerateHash) {
- ModuleBitcodeWriter ModuleWriter(
- M, Buffer, *Stream, ShouldPreserveUseListOrder, Index, GenerateHash);
+ bool GenerateHash, ModuleHash *ModHash) {
+ assert(!WroteStrtab);
+
+ // The Mods vector is used by irsymtab::build, which requires non-const
+ // Modules in case it needs to materialize metadata. But the bitcode writer
+ // requires that the module is materialized, so we can cast to non-const here,
+ // after checking that it is in fact materialized.
+ assert(M->isMaterialized());
+ Mods.push_back(const_cast<Module *>(M));
+
+ ModuleBitcodeWriter ModuleWriter(M, Buffer, StrtabBuilder, *Stream,
+ ShouldPreserveUseListOrder, Index,
+ GenerateHash, ModHash);
ModuleWriter.write();
}
+void BitcodeWriter::writeIndex(
+ const ModuleSummaryIndex *Index,
+ const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
+ IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index,
+ ModuleToSummariesForIndex);
+ IndexWriter.write();
+}
+
/// WriteBitcodeToFile - Write the specified module to the specified output
/// stream.
void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
bool ShouldPreserveUseListOrder,
const ModuleSummaryIndex *Index,
- bool GenerateHash) {
+ bool GenerateHash, ModuleHash *ModHash) {
SmallVector<char, 0> Buffer;
Buffer.reserve(256*1024);
@@ -3908,7 +3943,10 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
Buffer.insert(Buffer.begin(), BWH_HeaderSize, 0);
BitcodeWriter Writer(Buffer);
- Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash);
+ Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash,
+ ModHash);
+ Writer.writeSymtab();
+ Writer.writeStrtab();
if (TT.isOSDarwin() || TT.isOSBinFormatMachO())
emitDarwinBCHeaderAndTrailer(Buffer, TT);
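
With the string table split out of the module block, a BitcodeWriter now has an ordering contract visible in the call sequence above: write the module(s), then optionally the symbol table, then exactly one string table before the writer is destroyed (its destructor asserts WroteStrtab, since the emitted records hold offsets into that table). A minimal sketch of the state machine, under the assumption that only the ordering matters here:

#include <cassert>

class ToyBitcodeWriter {
  bool WroteSymtab = false, WroteStrtab = false;

public:
  ~ToyBitcodeWriter() { assert(WroteStrtab && "strtab must be written last"); }
  void writeModule() { assert(!WroteStrtab); /* emit module block */ }
  void writeSymtab() {
    assert(!WroteStrtab && !WroteSymtab);
    WroteSymtab = true; /* emit SYMTAB_BLOCK blob */
  }
  void writeStrtab() {
    assert(!WroteStrtab);
    WroteStrtab = true; /* emit STRTAB_BLOCK blob; offsets are final */
  }
};
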
@@ -3920,13 +3958,7 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
void IndexBitcodeWriter::write() {
Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
- SmallVector<unsigned, 1> Vals;
- unsigned CurVersion = 1;
- Vals.push_back(CurVersion);
- Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
-
- // If we have a VST, write the VSTOFFSET record placeholder.
- writeValueSymbolTableForwardDecl();
+ writeModuleVersion();
// Write the module paths in the combined index.
writeModStrings();
@@ -3934,10 +3966,6 @@ void IndexBitcodeWriter::write() {
// Write the summary combined index records.
writeCombinedGlobalValueSummary();
- // Need a special VST writer for the combined index (we don't have a
- // real VST and real values when this is invoked).
- writeCombinedValueSymbolTable();
-
Stream.ExitBlock();
}
@@ -3951,11 +3979,9 @@ void llvm::WriteIndexToFile(
SmallVector<char, 0> Buffer;
Buffer.reserve(256 * 1024);
- BitstreamWriter Stream(Buffer);
- writeBitcodeHeader(Stream);
-
- IndexBitcodeWriter IndexWriter(Stream, Index, ModuleToSummariesForIndex);
- IndexWriter.write();
+ BitcodeWriter Writer(Buffer);
+ Writer.writeIndex(&Index, ModuleToSummariesForIndex);
+ Writer.writeStrtab();
Out.write((char *)&Buffer.front(), Buffer.size());
}
diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index 5d5bfab..bb626ba 100644
--- a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -314,10 +314,13 @@ ValueEnumerator::ValueEnumerator(const Module &M,
// Remember what is the cutoff between globalvalue's and other constants.
unsigned FirstConstant = Values.size();
- // Enumerate the global variable initializers.
- for (const GlobalVariable &GV : M.globals())
+ // Enumerate the global variable initializers and attributes.
+ for (const GlobalVariable &GV : M.globals()) {
if (GV.hasInitializer())
EnumerateValue(GV.getInitializer());
+ if (GV.hasAttributes())
+ EnumerateAttributes(GV.getAttributesAsList(AttributeList::FunctionIndex));
+ }
// Enumerate the aliasees.
for (const GlobalAlias &GA : M.aliases())
@@ -432,12 +435,14 @@ unsigned ValueEnumerator::getValueID(const Value *V) const {
return I->second-1;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ValueEnumerator::dump() const {
print(dbgs(), ValueMap, "Default");
dbgs() << '\n';
print(dbgs(), MetadataMap, "MetaData");
dbgs() << '\n';
}
+#endif
void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
const char *Name) const {
@@ -452,7 +457,8 @@ void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map,
OS << "Value: " << V->getName();
else
OS << "Value: [null]\n";
- V->dump();
+ V->print(errs());
+ errs() << '\n';
OS << " Uses(" << std::distance(V->use_begin(),V->use_end()) << "):";
for (const Use &U : V->uses()) {
@@ -549,7 +555,7 @@ void ValueEnumerator::EnumerateFunctionLocalMetadata(
void ValueEnumerator::dropFunctionFromMetadata(
MetadataMapType::value_type &FirstMD) {
SmallVector<const MDNode *, 64> Worklist;
- auto push = [this, &Worklist](MetadataMapType::value_type &MD) {
+ auto push = [&Worklist](MetadataMapType::value_type &MD) {
auto &Entry = MD.second;
// Nothing to do if this metadata isn't tagged.
@@ -884,23 +890,26 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) {
}
}
-void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
+void ValueEnumerator::EnumerateAttributes(AttributeList PAL) {
if (PAL.isEmpty()) return; // null is always 0.
// Do a lookup.
- unsigned &Entry = AttributeMap[PAL];
+ unsigned &Entry = AttributeListMap[PAL];
if (Entry == 0) {
// Never saw this before, add it.
- Attribute.push_back(PAL);
- Entry = Attribute.size();
+ AttributeLists.push_back(PAL);
+ Entry = AttributeLists.size();
}
// Do lookups for all attribute groups.
- for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) {
- AttributeSet AS = PAL.getSlotAttributes(i);
- unsigned &Entry = AttributeGroupMap[AS];
+ for (unsigned i = PAL.index_begin(), e = PAL.index_end(); i != e; ++i) {
+ AttributeSet AS = PAL.getAttributes(i);
+ if (!AS.hasAttributes())
+ continue;
+ IndexAndAttrSet Pair = {i, AS};
+ unsigned &Entry = AttributeGroupMap[Pair];
if (Entry == 0) {
- AttributeGroups.push_back(AS);
+ AttributeGroups.push_back(Pair);
Entry = AttributeGroups.size();
}
}
diff --git a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h
index a8d6cf9..e7ccc8d 100644
--- a/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/contrib/llvm/lib/Bitcode/Writer/ValueEnumerator.h
@@ -36,7 +36,7 @@ class LocalAsMetadata;
class MDNode;
class MDOperand;
class NamedMDNode;
-class AttributeSet;
+class AttributeList;
class ValueSymbolTable;
class MDSymbolTable;
class raw_ostream;
@@ -48,6 +48,10 @@ public:
// For each value, we remember its Value* and occurrence frequency.
typedef std::vector<std::pair<const Value*, unsigned> > ValueList;
+ /// Attribute groups as encoded in bitcode are almost AttributeSets, but they
+ /// include the AttributeList index, so we have to track that in our map.
+ typedef std::pair<unsigned, AttributeSet> IndexAndAttrSet;
+
UseListOrderStack UseListOrders;
private:
@@ -102,13 +106,13 @@ private:
bool ShouldPreserveUseListOrder;
- typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType;
+ typedef DenseMap<IndexAndAttrSet, unsigned> AttributeGroupMapType;
AttributeGroupMapType AttributeGroupMap;
- std::vector<AttributeSet> AttributeGroups;
+ std::vector<IndexAndAttrSet> AttributeGroups;
- typedef DenseMap<AttributeSet, unsigned> AttributeMapType;
- AttributeMapType AttributeMap;
- std::vector<AttributeSet> Attribute;
+ typedef DenseMap<AttributeList, unsigned> AttributeListMapType;
+ AttributeListMapType AttributeListMap;
+ std::vector<AttributeList> AttributeLists;
/// GlobalBasicBlockIDs - This map memoizes the basic block ID's referenced by
/// the "getGlobalBasicBlockID" method.
@@ -166,16 +170,17 @@ public:
unsigned getInstructionID(const Instruction *I) const;
void setInstructionID(const Instruction *I);
- unsigned getAttributeID(AttributeSet PAL) const {
+ unsigned getAttributeListID(AttributeList PAL) const {
if (PAL.isEmpty()) return 0; // Null maps to zero.
- AttributeMapType::const_iterator I = AttributeMap.find(PAL);
- assert(I != AttributeMap.end() && "Attribute not in ValueEnumerator!");
+ AttributeListMapType::const_iterator I = AttributeListMap.find(PAL);
+ assert(I != AttributeListMap.end() && "Attribute not in ValueEnumerator!");
return I->second;
}
- unsigned getAttributeGroupID(AttributeSet PAL) const {
- if (PAL.isEmpty()) return 0; // Null maps to zero.
- AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(PAL);
+ unsigned getAttributeGroupID(IndexAndAttrSet Group) const {
+ if (!Group.second.hasAttributes())
+ return 0; // Null maps to zero.
+ AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(Group);
assert(I != AttributeGroupMap.end() && "Attribute not in ValueEnumerator!");
return I->second;
}
@@ -206,10 +211,8 @@ public:
const std::vector<const BasicBlock*> &getBasicBlocks() const {
return BasicBlocks;
}
- const std::vector<AttributeSet> &getAttributes() const {
- return Attribute;
- }
- const std::vector<AttributeSet> &getAttributeGroups() const {
+ const std::vector<AttributeList> &getAttributeLists() const { return AttributeLists; }
+ const std::vector<IndexAndAttrSet> &getAttributeGroups() const {
return AttributeGroups;
}
@@ -283,7 +286,7 @@ private:
void EnumerateValue(const Value *V);
void EnumerateType(Type *T);
void EnumerateOperandType(const Value *V);
- void EnumerateAttributes(AttributeSet PAL);
+ void EnumerateAttributes(AttributeList PAL);
void EnumerateValueSymbolTable(const ValueSymbolTable &ST);
void EnumerateNamedMetadata(const Module &M);
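
The header changes above re-key the attribute-group map from AttributeSet to an (index, AttributeSet) pair, because the bitcode encoding of a group also records which position (return value, function, or a parameter slot) the set applies to. A standalone sketch of pair-keyed interning with 1-based ids and zero reserved for "empty", using strings where LLVM uses AttributeSet:

#include <map>
#include <string>
#include <utility>
#include <vector>

using IndexAndAttrs = std::pair<unsigned, std::string>;

struct GroupInterner {
  std::map<IndexAndAttrs, unsigned> Ids;
  std::vector<IndexAndAttrs> Groups;

  unsigned intern(unsigned Index, const std::string &Attrs) {
    if (Attrs.empty())
      return 0; // Like getAttributeGroupID: no attributes maps to zero.
    unsigned &Entry = Ids[{Index, Attrs}];
    if (Entry == 0) { // First occurrence of this (index, set) pair.
      Groups.push_back({Index, Attrs});
      Entry = Groups.size(); // Ids are 1-based; 0 stays reserved.
    }
    return Entry;
  }
};
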
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index bb90861..5abf50e 100644
--- a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -128,8 +128,7 @@ AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
}
DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
- DEBUG(for (int r = CriticalPathSet.find_first(); r != -1;
- r = CriticalPathSet.find_next(r))
+ DEBUG(for (unsigned r : CriticalPathSet.set_bits())
dbgs() << " " << TRI->getName(r));
DEBUG(dbgs() << '\n');
}
@@ -163,9 +162,11 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
// callee-saved register that is not saved in the prolog.
const MachineFrameInfo &MFI = MF.getFrameInfo();
BitVector Pristine = MFI.getPristineRegs(MF);
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+ for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+ ++I) {
unsigned Reg = *I;
- if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+ if (!IsReturnBlock && !Pristine.test(Reg))
+ continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
unsigned AliasReg = *AI;
State->UnionGroups(AliasReg, 0);
@@ -569,7 +570,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
DEBUG({
dbgs() << " ::";
- for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
+ for (unsigned r : BV.set_bits())
dbgs() << " " << TRI->getName(r);
dbgs() << "\n";
});
@@ -962,10 +963,8 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
// sure to update that as well.
const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()];
if (!SU) continue;
- for (DbgValueVector::iterator DVI = DbgValues.begin(),
- DVE = DbgValues.end(); DVI != DVE; ++DVI)
- if (DVI->second == Q.second.Operand->getParent())
- UpdateDbgValue(*DVI->first, AntiDepReg, NewReg);
+ UpdateDbgValues(DbgValues, Q.second.Operand->getParent(),
+ AntiDepReg, NewReg);
}
// We just went back in time and modified history; the
diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp
index 79ecc43..c2aecc6 100644
--- a/contrib/llvm/lib/CodeGen/Analysis.cpp
+++ b/contrib/llvm/lib/CodeGen/Analysis.cpp
@@ -24,8 +24,8 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
@@ -516,10 +516,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS;
ADS = true;
- AttrBuilder CallerAttrs(F->getAttributes(),
- AttributeSet::ReturnIndex);
+ AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex);
AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
- AttributeSet::ReturnIndex);
+ AttributeList::ReturnIndex);
// Noalias is completely benign as far as calling convention goes, it
// shouldn't affect whether the call is a tail call.
@@ -613,25 +612,6 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
return true;
}
-bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
- if (!GV->hasLinkOnceODRLinkage())
- return false;
-
- // We assume that anyone who sets global unnamed_addr on a non-constant knows
- // what they're doing.
- if (GV->hasGlobalUnnamedAddr())
- return true;
-
- // If it is a non constant variable, it needs to be uniqued across shared
- // objects.
- if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
- if (!Var->isConstant())
- return false;
- }
-
- return GV->hasAtLeastLocalUnnamedAddr();
-}
-
static void collectFuncletMembers(
DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,
const MachineBasicBlock *MBB) {
diff --git a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
index 04f7f41..d14d931 100644
--- a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
+++ b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
@@ -60,6 +60,25 @@ public:
if (MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == OldReg)
MI.getOperand(0).setReg(NewReg);
}
+
+ /// Update all DBG_VALUE instructions that may be affected by the dependency
+ /// breaker's update of ParentMI to use NewReg.
+ void UpdateDbgValues(const DbgValueVector &DbgValues, MachineInstr *ParentMI,
+ unsigned OldReg, unsigned NewReg) {
+ // The following code is dependent on the order in which the DbgValues are
+ // constructed in ScheduleDAGInstrs::buildSchedGraph.
+ MachineInstr *PrevDbgMI = nullptr;
+ for (const auto &DV : make_range(DbgValues.crbegin(), DbgValues.crend())) {
+ MachineInstr *PrevMI = DV.second;
+ if ((PrevMI == ParentMI) || (PrevMI == PrevDbgMI)) {
+ MachineInstr *DbgMI = DV.first;
+ UpdateDbgValue(*DbgMI, OldReg, NewReg);
+ PrevDbgMI = DbgMI;
+ } else if (PrevDbgMI) {
+ break; // If no match and already found a DBG_VALUE, we're done.
+ }
+ }
+ }
};
}
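
UpdateDbgValues leans on the ordering invariant called out in its comment: buildSchedGraph appends each instruction's DBG_VALUEs contiguously, each paired with its parent (or with a preceding DBG_VALUE in the same run). Scanning in reverse therefore visits a parent's run as one unbroken chain and can stop at the first gap. A standalone sketch over plain pairs, assuming the same invariant:

#include <vector>

struct Instr { int Reg = 0; };
// {dbg value, the instruction it was attached to}
using DbgValueVector = std::vector<std::pair<Instr *, Instr *>>;

void updateDbgValues(const DbgValueVector &DbgValues, Instr *ParentMI,
                     int OldReg, int NewReg) {
  Instr *PrevDbg = nullptr;
  for (auto It = DbgValues.rbegin(); It != DbgValues.rend(); ++It) {
    if (It->second == ParentMI || It->second == PrevDbg) {
      if (It->first->Reg == OldReg) // UpdateDbgValue's rewrite condition.
        It->first->Reg = NewReg;
      PrevDbg = It->first; // Chained DBG_VALUEs extend the run.
    } else if (PrevDbg) {
      break; // Past the contiguous run; nothing later can match.
    }
  }
}
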
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
index 61149d9..8b1376a 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -14,6 +14,7 @@
#include "DwarfException.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -27,7 +28,6 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 24fdbfc..ff427c9 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -12,47 +12,101 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/AsmPrinter.h"
+#include "AsmPrinterHandler.h"
#include "CodeViewDebug.h"
#include "DwarfDebug.h"
#include "DwarfException.h"
#include "WinException.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ObjectUtils.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Value.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -69,6 +123,10 @@ static const char *const CodeViewLineTablesGroupDescription =
STATISTIC(EmittedInsts, "Number of machine instrs printed");
+static cl::opt<bool>
+ PrintSchedule("print-schedule", cl::Hidden, cl::init(false),
+ cl::desc("Print 'sched: [latency:throughput]' in .s output"));
+
char AsmPrinter::ID = 0;
typedef DenseMap<GCStrategy*, std::unique_ptr<GCMetadataPrinter>> gcp_map_type;
@@ -78,7 +136,6 @@ static gcp_map_type &getGCMap(void *&P) {
return *(gcp_map_type*)P;
}
-
/// getGVAlignmentLog2 - Return the alignment to use for the specified global
/// value in log2 form. This rounds up to the preferred alignment if possible
/// and legal.
@@ -107,16 +164,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL,
AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
: MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
- OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)),
- isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) {
- DD = nullptr;
- MMI = nullptr;
- LI = nullptr;
- MF = nullptr;
- CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr;
- CurrentFnBegin = nullptr;
- CurrentFnEnd = nullptr;
- GCMetadataPrinters = nullptr;
+ OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)) {
VerboseAsm = OutStreamer->isVerboseAsm();
}
@@ -171,6 +219,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
AU.addRequired<GCModuleInfo>();
if (isVerbose())
AU.addRequired<MachineLoopInfo>();
@@ -223,7 +272,7 @@ bool AsmPrinter::doInitialization(Module &M) {
// don't, this at least helps the user find where a global came from.
if (MAI->hasSingleParameterDotFile()) {
// .file "foo.c"
- OutStreamer->EmitFileDirective(M.getModuleIdentifier());
+ OutStreamer->EmitFileDirective(M.getSourceFileName());
}
GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
@@ -571,7 +620,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
///
/// \p Value - The value to emit.
/// \p Size - The size of the integer (in bytes) to emit.
-void AsmPrinter::EmitDebugValue(const MCExpr *Value,
+void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
unsigned Size) const {
OutStreamer->EmitValue(Value, Size);
}
@@ -579,12 +628,17 @@ void AsmPrinter::EmitDebugValue(const MCExpr *Value,
/// EmitFunctionHeader - This method emits the header for the current
/// function.
void AsmPrinter::EmitFunctionHeader() {
+ const Function *F = MF->getFunction();
+
+ if (isVerbose())
+ OutStreamer->GetCommentOS()
+ << "-- Begin function "
+ << GlobalValue::dropLLVMManglingEscape(F->getName()) << '\n';
+
// Print out constants referenced by the function
EmitConstantPool();
// Print the 'header' of function.
- const Function *F = MF->getFunction();
-
OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(F, TM));
EmitVisibility(CurrentFnSym, F->getVisibility());
@@ -602,8 +656,23 @@ void AsmPrinter::EmitFunctionHeader() {
}
// Emit the prefix data.
- if (F->hasPrefixData())
- EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+ if (F->hasPrefixData()) {
+ if (MAI->hasSubsectionsViaSymbols()) {
+ // Preserving prefix data on platforms which use subsections-via-symbols
+ // is a bit tricky. Here we introduce a symbol for the prefix data
+ // and use the .alt_entry attribute to mark the function's real entry point
+ // as an alternative entry point to the prefix-data symbol.
+ MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol();
+ OutStreamer->EmitLabel(PrefixSym);
+
+ EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+
+ // Emit an .alt_entry directive for the actual function symbol.
+ OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);
+ } else {
+ EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+ }
+ }
// Emit the CurrentFnSym. This is a virtual function to allow targets to
// do their wild and crazy things as required.
@@ -660,7 +729,8 @@ void AsmPrinter::EmitFunctionEntryLabel() {
}
/// emitComments - Pretty-print comments for instructions.
-static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS,
+ AsmPrinter *AP) {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
@@ -668,6 +738,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
int FI;
const MachineFrameInfo &MFI = MF->getFrameInfo();
+ bool Commented = false;
// We assume a single instruction only has a spill or reload, not
// both.
@@ -675,24 +746,39 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
if (TII->isLoadFromStackSlotPostFE(MI, FI)) {
if (MFI.isSpillSlotObjectIndex(FI)) {
MMO = *MI.memoperands_begin();
- CommentOS << MMO->getSize() << "-byte Reload\n";
+ CommentOS << MMO->getSize() << "-byte Reload";
+ Commented = true;
}
} else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) {
- if (MFI.isSpillSlotObjectIndex(FI))
- CommentOS << MMO->getSize() << "-byte Folded Reload\n";
+ if (MFI.isSpillSlotObjectIndex(FI)) {
+ CommentOS << MMO->getSize() << "-byte Folded Reload";
+ Commented = true;
+ }
} else if (TII->isStoreToStackSlotPostFE(MI, FI)) {
if (MFI.isSpillSlotObjectIndex(FI)) {
MMO = *MI.memoperands_begin();
- CommentOS << MMO->getSize() << "-byte Spill\n";
+ CommentOS << MMO->getSize() << "-byte Spill";
+ Commented = true;
}
} else if (TII->hasStoreToStackSlot(MI, MMO, FI)) {
- if (MFI.isSpillSlotObjectIndex(FI))
- CommentOS << MMO->getSize() << "-byte Folded Spill\n";
+ if (MFI.isSpillSlotObjectIndex(FI)) {
+ CommentOS << MMO->getSize() << "-byte Folded Spill";
+ Commented = true;
+ }
}
// Check for spill-induced copies
- if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
- CommentOS << " Reload Reuse\n";
+ if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse)) {
+ Commented = true;
+ CommentOS << " Reload Reuse";
+ }
+
+ if (Commented && AP->EnablePrintSchedInfo)
+      // If any comment was added above and we also need a sched-info comment,
+      // append it right after the existing comment, with no "\n" between them.
+ CommentOS << " " << MF->getSubtarget().getSchedInfoStr(MI) << "\n";
+ else if (Commented)
+ CommentOS << "\n";
}
/// emitImplicitDef - This method emits the specified machine instruction
@@ -739,46 +825,30 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
const DILocalVariable *V = MI->getDebugVariable();
if (auto *SP = dyn_cast<DISubprogram>(V->getScope())) {
- StringRef Name = SP->getDisplayName();
+ StringRef Name = SP->getName();
if (!Name.empty())
OS << Name << ":";
}
OS << V->getName();
-
- const DIExpression *Expr = MI->getDebugExpression();
- auto Fragment = Expr->getFragmentInfo();
- if (Fragment)
- OS << " [fragment offset=" << Fragment->OffsetInBits
- << " size=" << Fragment->SizeInBits << "]";
OS << " <- ";
// The second operand is only an offset if it's an immediate.
- bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
- int64_t Offset = Deref ? MI->getOperand(1).getImm() : 0;
-
- for (unsigned i = 0; i < Expr->getNumElements(); ++i) {
- uint64_t Op = Expr->getElement(i);
- if (Op == dwarf::DW_OP_LLVM_fragment) {
- // There can't be any operands after this in a valid expression
- break;
- } else if (Deref) {
- // We currently don't support extra Offsets or derefs after the first
- // one. Bail out early instead of emitting an incorrect comment
- OS << " [complex expression]";
- AP.OutStreamer->emitRawComment(OS.str());
- return true;
- } else if (Op == dwarf::DW_OP_deref) {
- Deref = true;
- continue;
- }
-
- uint64_t ExtraOffset = Expr->getElement(i++);
- if (Op == dwarf::DW_OP_plus)
- Offset += ExtraOffset;
- else {
- assert(Op == dwarf::DW_OP_minus);
- Offset -= ExtraOffset;
+ bool MemLoc = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
+ int64_t Offset = MemLoc ? MI->getOperand(1).getImm() : 0;
+ const DIExpression *Expr = MI->getDebugExpression();
+ if (Expr->getNumElements()) {
+ OS << '[';
+ bool NeedSep = false;
+ for (auto Op : Expr->expr_ops()) {
+ if (NeedSep)
+ OS << ", ";
+ else
+ NeedSep = true;
+ OS << dwarf::OperationEncodingString(Op.getOp());
+ for (unsigned I = 0; I < Op.getNumArgs(); ++I)
+ OS << ' ' << Op.getArg(I);
}
+ OS << "] ";
}
// Register or immediate value. Register 0 means undef.
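
The replacement no longer folds DW_OP_plus/DW_OP_deref into a numeric offset; it prints the operator stream verbatim, producing comments like "[DW_OP_plus 8, DW_OP_deref] ". A standalone sketch of just the separator logic, over (opcode name, args) pairs instead of a real DIExpression:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ExprOp {
  std::string Name;           // e.g. "DW_OP_plus"
  std::vector<uint64_t> Args; // trailing operands, if any
};

void printExpr(std::ostream &OS, const std::vector<ExprOp> &Ops) {
  if (Ops.empty())
    return; // Matches the Expr->getNumElements() guard above.
  OS << '[';
  bool NeedSep = false;
  for (const ExprOp &Op : Ops) {
    if (NeedSep)
      OS << ", ";
    else
      NeedSep = true;
    OS << Op.Name;
    for (uint64_t A : Op.Args)
      OS << ' ' << A; // "name arg arg" shape, as in the comment printer.
  }
  OS << "] ";
}

For example, printExpr(std::cout, {{"DW_OP_plus", {8}}, {"DW_OP_deref", {}}}) writes "[DW_OP_plus 8, DW_OP_deref] ".
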
@@ -809,7 +879,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering();
Offset += TFI->getFrameIndexReference(*AP.MF,
MI->getOperand(0).getIndex(), Reg);
- Deref = true;
+ MemLoc = true;
}
if (Reg == 0) {
// Suppress offset, it is not meaningful here.
@@ -818,12 +888,12 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
AP.OutStreamer->emitRawComment(OS.str());
return true;
}
- if (Deref)
+ if (MemLoc)
OS << '[';
OS << PrintReg(Reg, AP.MF->getSubtarget().getRegisterInfo());
}
- if (Deref)
+ if (MemLoc)
OS << '+' << Offset << ']';
// NOTE: Want this comment at start of line, don't emit with AddComment.
@@ -855,6 +925,16 @@ void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
if (needsCFIMoves() == CFI_M_None)
return;
+ // If there is no "real" instruction following this CFI instruction, skip
+ // emitting it; it would be beyond the end of the function's FDE range.
+ auto *MBB = MI.getParent();
+ auto I = std::next(MI.getIterator());
+ while (I != MBB->end() && I->isTransient())
+ ++I;
+ if (I == MBB->instr_end() &&
+ MBB->getReverseIterator() == MBB->getParent()->rbegin())
+ return;
+
const std::vector<MCCFIInstruction> &Instrs = MF->getFrameInstructions();
unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
const MCCFIInstruction &CFI = Instrs[CFIIndex];
@@ -871,6 +951,19 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
MCConstantExpr::create(FrameOffset, OutContext));
}
+static bool needFuncLabelsForEHOrDebugInfo(const MachineFunction &MF,
+ MachineModuleInfo *MMI) {
+ if (!MF.getLandingPads().empty() || MF.hasEHFunclets() || MMI->hasDebugInfo())
+ return true;
+
+ // We might emit an EH table that uses function begin and end labels even if
+ // we don't have any landingpads.
+ if (!MF.getFunction()->hasPersonalityFn())
+ return false;
+ return !isNoOpWithoutInvoke(
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn()));
+}
+
/// EmitFunctionBody - This method emits the body and trailer for a
/// function.
void AsmPrinter::EmitFunctionBody() {
@@ -883,6 +976,7 @@ void AsmPrinter::EmitFunctionBody() {
// Print out code for the function.
bool HasAnyRealCode = false;
+ int NumInstsInFunction = 0;
for (auto &MBB : *MF) {
// Print a label for the basic block.
EmitBasicBlockStart(MBB);
@@ -892,7 +986,7 @@ void AsmPrinter::EmitFunctionBody() {
if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
!MI.isDebugValue()) {
HasAnyRealCode = true;
- ++EmittedInsts;
+ ++NumInstsInFunction;
}
if (ShouldPrintDebugScopes) {
@@ -905,7 +999,7 @@ void AsmPrinter::EmitFunctionBody() {
}
if (isVerbose())
- emitComments(MI, OutStreamer->GetCommentOS());
+ emitComments(MI, OutStreamer->GetCommentOS(), this);
switch (MI.getOpcode()) {
case TargetOpcode::CFI_INSTRUCTION:
@@ -953,18 +1047,34 @@ void AsmPrinter::EmitFunctionBody() {
EmitBasicBlockEnd(MBB);
}
+ EmittedInsts += NumInstsInFunction;
+ MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount",
+ MF->getFunction()->getSubprogram(),
+ &MF->front());
+ R << ore::NV("NumInstructions", NumInstsInFunction)
+ << " instructions in function";
+ ORE->emit(R);
+
// If the function is empty and the object file uses .subsections_via_symbols,
// then we need to emit *something* to the function body to prevent the
// labels from collapsing together. Just emit a noop.
- if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) {
+ // Similarly, don't emit empty functions on Windows either. It can lead to
+ // duplicate entries (two functions with the same RVA) in the Guard CF Table
+ // after linking, causing the kernel not to load the binary:
+ // https://developercommunity.visualstudio.com/content/problem/45366/vc-linker-creates-invalid-dll-with-clang-cl.html
+ // FIXME: Hide this behind some API in e.g. MCAsmInfo or MCTargetStreamer.
+ const Triple &TT = TM.getTargetTriple();
+ if (!HasAnyRealCode && (MAI->hasSubsectionsViaSymbols() ||
+ (TT.isOSWindows() && TT.isOSBinFormatCOFF()))) {
MCInst Noop;
- MF->getSubtarget().getInstrInfo()->getNoopForMachoTarget(Noop);
- OutStreamer->AddComment("avoids zero-length function");
+ MF->getSubtarget().getInstrInfo()->getNoop(Noop);
// Targets can opt-out of emitting the noop here by leaving the opcode
// unspecified.
- if (Noop.getOpcode())
+ if (Noop.getOpcode()) {
+ OutStreamer->AddComment("avoids zero-length function");
OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
+ }
}
const Function *F = MF->getFunction();
@@ -981,8 +1091,8 @@ void AsmPrinter::EmitFunctionBody() {
// Emit target-specific gunk after the function body.
EmitFunctionBodyEnd();
- if (!MF->getLandingPads().empty() || MMI->hasDebugInfo() ||
- MF->hasEHFunclets() || MAI->hasDotTypeDotSizeDirective()) {
+ if (needFuncLabelsForEHOrDebugInfo(*MF, MMI) ||
+ MAI->hasDotTypeDotSizeDirective()) {
// Create a symbol for the end of function.
CurrentFnEnd = createTempSymbol("func_end");
OutStreamer->EmitLabel(CurrentFnEnd);
@@ -1015,6 +1125,9 @@ void AsmPrinter::EmitFunctionBody() {
HI.Handler->endFunction(MF);
}
+ if (isVerbose())
+ OutStreamer->GetCommentOS() << "-- End function\n";
+
OutStreamer->AddBlankLine();
}
@@ -1175,11 +1288,7 @@ bool AsmPrinter::doFinalization(Module &M) {
const TargetLoweringObjectFile &TLOF = getObjFileLowering();
- // Emit module flags.
- SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
- M.getModuleFlagsMetadata(ModuleFlags);
- if (!ModuleFlags.empty())
- TLOF.emitModuleFlags(*OutStreamer, ModuleFlags, TM);
+ TLOF.emitModuleMetadata(*OutStreamer, M, TM);
if (TM.getTargetTriple().isOSBinFormatELF()) {
MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
@@ -1238,7 +1347,7 @@ bool AsmPrinter::doFinalization(Module &M) {
break;
AliasStack.push_back(Cur);
}
- for (const GlobalAlias *AncestorAlias : reverse(AliasStack))
+ for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack))
emitGlobalIndirectSymbol(M, *AncestorAlias);
AliasStack.clear();
}
@@ -1266,7 +1375,7 @@ bool AsmPrinter::doFinalization(Module &M) {
OutContext.getOrCreateSymbol(StringRef("__morestack_addr"));
OutStreamer->EmitLabel(AddrSymbol);
- unsigned PtrSize = M.getDataLayout().getPointerSize(0);
+ unsigned PtrSize = MAI->getCodePointerSize();
OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("__morestack"),
PtrSize);
}
@@ -1304,26 +1413,34 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
CurrentFnBegin = nullptr;
CurExceptionSym = nullptr;
bool NeedsLocalForSize = MAI->needsLocalForSize();
- if (!MF.getLandingPads().empty() || MMI->hasDebugInfo() ||
- MF.hasEHFunclets() || NeedsLocalForSize) {
+ if (needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize) {
CurrentFnBegin = createTempSymbol("func_begin");
if (NeedsLocalForSize)
CurrentFnSymForSize = CurrentFnBegin;
}
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
if (isVerbose())
LI = &getAnalysis<MachineLoopInfo>();
+
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ EnablePrintSchedInfo = PrintSchedule.getNumOccurrences()
+ ? PrintSchedule
+ : STI.supportPrintSchedInfo();
}
namespace {
+
// Keep track of the alignment and constant-pool entries per section.
struct SectionCPs {
MCSection *S;
unsigned Alignment;
SmallVector<unsigned, 4> CPEs;
+
SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {}
};
-}
+
+} // end anonymous namespace
/// EmitConstantPool - Print to the current output stream assembly
/// representations of the constants in the constant pool MCP. This is
@@ -1547,7 +1664,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
OutStreamer->EmitValue(Value, EntrySize);
}
-
/// EmitSpecialLLVMGlobal - Check to see if the specified global is a
/// special global used by LLVM. If so, emit it and return true, otherwise
/// do nothing and return false.
@@ -1598,13 +1714,16 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
}
namespace {
+
struct Structor {
- Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {}
- int Priority;
- llvm::Constant *Func;
- llvm::GlobalValue *ComdatKey;
+ int Priority = 0;
+ Constant *Func = nullptr;
+ GlobalValue *ComdatKey = nullptr;
+
+ Structor() = default;
};
-} // end namespace
+
+} // end anonymous namespace
/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
/// priority.
@@ -1653,8 +1772,11 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
const TargetLoweringObjectFile &Obj = getObjFileLowering();
const MCSymbol *KeySym = nullptr;
if (GlobalValue *GV = S.ComdatKey) {
- if (GV->hasAvailableExternallyLinkage())
- // If the associated variable is available_externally, some other TU
+ if (GV->isDeclarationForLinker())
+ // If the associated variable is not defined in this module
+ // (it might be available_externally, or have been an
+ // available_externally definition that was dropped by the
+ // EliminateAvailableExternally pass), some other TU
// will provide its dynamic initializer.
continue;
@@ -1931,7 +2053,6 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) {
return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
}
-
/// isRepeatedByteSequence - Determine whether the given value is
/// composed of a repeated sequence of identical bytes and return the
/// byte value. If it is not a repeated sequence, return -1.
@@ -1972,7 +2093,6 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) {
static void emitGlobalConstantDataSequential(const DataLayout &DL,
const ConstantDataSequential *CDS,
AsmPrinter &AP) {
-
// See if we can aggregate this into a .fill, if so, emit it as such.
int Value = isRepeatedByteSequence(CDS, DL);
if (Value != -1) {
@@ -2006,7 +2126,6 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
CDS->getNumElements();
if (unsigned Padding = Size - EmittedSize)
AP.OutStreamer->EmitZeros(Padding);
-
}
static void emitGlobalConstantArray(const DataLayout &DL,
@@ -2145,7 +2264,7 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
// chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
ExtraBits = Realigned.getRawData()[0] &
(((uint64_t)-1) >> (64 - ExtraBitsSize));
- Realigned = Realigned.lshr(ExtraBitsSize);
+ Realigned.lshrInPlace(ExtraBitsSize);
} else
ExtraBits = Realigned.getRawData()[BitWidth / 64];
}
@@ -2420,8 +2539,6 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {
return OutContext.getOrCreateSymbol(NameStr);
}
-
-
/// PrintParentLoopComment - Print comments about parent loops of this one.
static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
unsigned FunctionNumber) {
@@ -2486,7 +2603,6 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
}
-
/// EmitBasicBlockStart - This method prints the label for the specified
/// MachineBasicBlock, an alignment (if present) and a comment describing
/// it if appropriate.
@@ -2607,8 +2723,6 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
return true;
}
-
-
GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
if (!S.usesMetadata())
return nullptr;
@@ -2639,7 +2753,7 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
}
/// Pin vtable to this file.
-AsmPrinterHandler::~AsmPrinterHandler() {}
+AsmPrinterHandler::~AsmPrinterHandler() = default;
void AsmPrinterHandler::markFunctionEnd() {}
@@ -2663,37 +2777,61 @@ void AsmPrinter::emitXRayTable() {
auto PrevSection = OutStreamer->getCurrentSectionOnly();
auto Fn = MF->getFunction();
- MCSection *Section = nullptr;
+ MCSection *InstMap = nullptr;
+ MCSection *FnSledIndex = nullptr;
if (MF->getSubtarget().getTargetTriple().isOSBinFormatELF()) {
if (Fn->hasComdat()) {
- Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
Fn->getComdat()->getName());
+ FnSledIndex = OutContext.getELFSection("xray_fn_idx", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+ Fn->getComdat()->getName());
} else {
- Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
ELF::SHF_ALLOC);
+ FnSledIndex = OutContext.getELFSection("xray_fn_idx", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC);
}
} else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) {
- Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+ InstMap = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
SectionKind::getReadOnlyWithRel());
+ FnSledIndex = OutContext.getMachOSection("__DATA", "xray_fn_idx", 0,
+ SectionKind::getReadOnlyWithRel());
} else {
llvm_unreachable("Unsupported target");
}
// Before we switch over, we force a reference to a label inside the
- // xray_instr_map section. Since this function is always called just
- // before the function's end, we assume that this is happening after
- // the last return instruction.
-
- auto WordSizeBytes = TM.getPointerSize();
- MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
+  // xray_fn_idx section. This makes sure that the xray_fn_idx section is kept
+ // live by the linker if the function is not garbage-collected. Since this
+ // function is always called just before the function's end, we assume that
+ // this is happening after the last return instruction.
+ auto WordSizeBytes = MAI->getCodePointerSize();
+ MCSymbol *IdxRef = OutContext.createTempSymbol("xray_fn_idx_synth_", true);
OutStreamer->EmitCodeAlignment(16);
- OutStreamer->EmitSymbolValue(Tmp, WordSizeBytes, false);
- OutStreamer->SwitchSection(Section);
- OutStreamer->EmitLabel(Tmp);
+ OutStreamer->EmitSymbolValue(IdxRef, WordSizeBytes, false);
+
+ // Now we switch to the instrumentation map section. Because this is done
+ // per-function, we are able to create an index entry that will represent the
+ // range of sleds associated with a function.
+ MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true);
+ OutStreamer->SwitchSection(InstMap);
+ OutStreamer->EmitLabel(SledsStart);
for (const auto &Sled : Sleds)
Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym);
-
+ MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true);
+ OutStreamer->EmitLabel(SledsEnd);
+
+ // We then emit a single entry in the index per function. We use the symbols
+ // that bound the instrumentation map as the range for a specific function.
+ // Each entry here will be 2 * word size aligned, as we're writing down two
+ // pointers. This should work for both 32-bit and 64-bit platforms.
+ OutStreamer->SwitchSection(FnSledIndex);
+ OutStreamer->EmitCodeAlignment(2 * WordSizeBytes);
+ OutStreamer->EmitLabel(IdxRef);
+ OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes);
+ OutStreamer->EmitSymbolValue(SledsEnd, WordSizeBytes);
OutStreamer->SwitchSection(PrevSection);
Sleds.clear();
}
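
After this change each function contributes one xray_fn_idx entry: the pair of temp symbols bounding its sleds in xray_instr_map, aligned to twice the pointer size so the runtime can treat the section as an array of ranges. A minimal in-memory model of one entry:

#include <cstdint>

// One xray_fn_idx entry: the [start, end) range of a function's sleds in
// xray_instr_map. Alignment mirrors EmitCodeAlignment(2 * WordSizeBytes).
struct alignas(2 * sizeof(void *)) XRayFnIdxEntry {
  const void *SledsStart; // label emitted before the function's sleds
  const void *SledsEnd;   // label emitted after the last sled
};

static_assert(sizeof(XRayFnIdxEntry) == 2 * sizeof(void *),
              "two pointers, no padding");

The forced reference to IdxRef from the function's own section is what keeps the per-function index entry alive under linker garbage collection.
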
@@ -2702,8 +2840,11 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
SledKind Kind) {
auto Fn = MI.getParent()->getParent()->getFunction();
auto Attr = Fn->getFnAttribute("function-instrument");
+ bool LogArgs = Fn->hasFnAttribute("xray-log-args");
bool AlwaysInstrument =
Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
+ if (Kind == SledKind::FUNCTION_ENTER && LogArgs)
+ Kind = SledKind::LOG_ARGS_ENTER;
Sleds.emplace_back(
XRayFunctionEntry{ Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn });
}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 0185c38..0edf905 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -15,6 +15,7 @@
#include "DwarfDebug.h"
#include "DwarfExpression.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -26,7 +27,6 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 165b8ee..eae79ad 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -48,10 +48,16 @@ static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo);
assert(DiagInfo && "Diagnostic context not passed down?");
+ // Look up a LocInfo for the buffer this diagnostic is coming from.
+ unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc());
+ const MDNode *LocInfo = nullptr;
+ if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size())
+ LocInfo = DiagInfo->LocInfos[BufNum-1];
+
// If the inline asm had metadata associated with it, pull out a location
// cookie corresponding to which line the error occurred on.
unsigned LocCookie = 0;
- if (const MDNode *LocInfo = DiagInfo->LocInfo) {
+ if (LocInfo) {
unsigned ErrorLine = Diag.getLineNo()-1;
if (ErrorLine >= LocInfo->getNumOperands())
ErrorLine = 0;
@@ -108,7 +114,6 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
SourceMgr &SrcMgr = DiagInfo->SrcMgr;
SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
- DiagInfo->LocInfo = LocMDNode;
std::unique_ptr<MemoryBuffer> Buffer;
// The inline asm source manager will outlive Str, so make a copy of the
@@ -118,6 +123,12 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
// Tell SrcMgr about this buffer, it takes ownership of the buffer.
unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+ // Store LocMDNode in DiagInfo, using BufNum as an identifier.
+ if (LocMDNode) {
+ DiagInfo->LocInfos.resize(BufNum);
+ DiagInfo->LocInfos[BufNum-1] = LocMDNode;
+ }
+
std::unique_ptr<MCAsmParser> Parser(
createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
@@ -133,6 +144,9 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
" we don't have an asm parser for this target\n");
Parser->setAssemblerDialect(Dialect);
Parser->setTargetParser(*TAP.get());
+ if (Dialect == InlineAsm::AD_Intel)
+ // We need this flag to be able to parse numbers like "0bH"
+ Parser->setParsingInlineAsm(true);
if (MF) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
@@ -144,11 +158,6 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
/*NoFinalize*/ true);
emitInlineAsmEnd(STI, &TAP->getSTI());
- // LocInfo cannot be used for error generation from the backend.
- // FIXME: associate LocInfo with the SourceBuffer to improve backend
- // messages.
- DiagInfo->LocInfo = nullptr;
-
if (Res && !DiagInfo->DiagHandler)
report_fatal_error("Error parsing inline asm\n");
}
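
Instead of a single LocInfo slot that had to be cleared after each parse, the diagnostic context now keeps a vector indexed by the SourceMgr buffer number (1-based), so srcMgrDiagHandler can recover the right !srcloc metadata for whichever buffer a diagnostic points into. A standalone sketch of the store/lookup pair, with an opaque Node type standing in for MDNode:

#include <vector>

struct Node; // stands in for the !srcloc MDNode

struct DiagCtx {
  std::vector<const Node *> LocInfos; // index = buffer number - 1

  void recordBuffer(unsigned BufNum, const Node *LocMD) {
    if (!LocMD)
      return; // Nothing to remember for this buffer.
    if (LocInfos.size() < BufNum)
      LocInfos.resize(BufNum); // New slots default to nullptr.
    LocInfos[BufNum - 1] = LocMD;
  }

  const Node *lookup(unsigned BufNum) const {
    if (BufNum > 0 && BufNum <= LocInfos.size())
      return LocInfos[BufNum - 1];
    return nullptr; // Unknown buffer: no location info.
  }
};
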
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 8344051..a81d56e 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp --*- C++ -*--===//
+//===- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,37 +12,82 @@
//===----------------------------------------------------------------------===//
#include "CodeViewDebug.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
#include "llvm/DebugInfo/CodeView/Line.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace llvm::codeview;
-using namespace llvm::msf;
CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
- : DebugHandlerBase(AP), OS(*Asm->OutStreamer), Allocator(),
- TypeTable(Allocator), CurFn(nullptr) {
+ : DebugHandlerBase(AP), OS(*Asm->OutStreamer), TypeTable(Allocator) {
// If module doesn't have named metadata anchors or COFF debug section
// is not available, skip any debug info related stuff.
if (!MMI->getModule()->getNamedMetadata("llvm.dbg.cu") ||
@@ -178,7 +223,8 @@ static const DISubprogram *getQualifiedNameComponents(
static std::string getQualifiedName(ArrayRef<StringRef> QualifiedNameComponents,
StringRef TypeName) {
std::string FullyQualifiedName;
- for (StringRef QualifiedNameComponent : reverse(QualifiedNameComponents)) {
+ for (StringRef QualifiedNameComponent :
+ llvm::reverse(QualifiedNameComponents)) {
FullyQualifiedName.append(QualifiedNameComponent);
FullyQualifiedName.append("::");
}
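
The rewritten loop above only qualifies reverse() with its llvm:: namespace; the joining logic is unchanged. A minimal standalone sketch of that logic (hypothetical names) shows why the components are walked in reverse: they are collected innermost scope first, while the printed name reads outermost first.

    // Standalone sketch of the name-joining logic above; names hypothetical.
    #include <iostream>
    #include <string>
    #include <vector>

    static std::string joinQualifiedName(const std::vector<std::string> &Components,
                                         const std::string &TypeName) {
      std::string FullyQualifiedName;
      // Components arrive innermost-first, so walk them in reverse.
      for (auto It = Components.rbegin(); It != Components.rend(); ++It) {
        FullyQualifiedName += *It;
        FullyQualifiedName += "::";
      }
      FullyQualifiedName += TypeName;
      return FullyQualifiedName;
    }

    int main() {
      // Prints "outer::inner::Widget".
      std::cout << joinQualifiedName({"inner", "outer"}, "Widget") << "\n";
    }
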
@@ -238,7 +284,7 @@ TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) {
// The display name includes function template arguments. Drop them to match
// MSVC.
- StringRef DisplayName = SP->getDisplayName().split('<').first;
+ StringRef DisplayName = SP->getName().split('<').first;
const DIScope *Scope = SP->getScope().resolve();
TypeIndex TI;
@@ -319,7 +365,7 @@ static void addLocIfNotPresent(SmallVectorImpl<const DILocation *> &Locs,
void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
const MachineFunction *MF) {
// Skip this instruction if it has the same location as the previous one.
- if (DL == CurFn->LastLoc)
+ if (!DL || DL == PrevInstLoc)
return;
const DIScope *Scope = DL.get()->getScope();
@@ -339,11 +385,11 @@ void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
if (!CurFn->HaveLineInfo)
CurFn->HaveLineInfo = true;
unsigned FileId = 0;
- if (CurFn->LastLoc.get() && CurFn->LastLoc->getFile() == DL->getFile())
+ if (PrevInstLoc.get() && PrevInstLoc->getFile() == DL->getFile())
FileId = CurFn->LastFileId;
else
FileId = CurFn->LastFileId = maybeRecordFile(DL->getFile());
- CurFn->LastLoc = DL;
+ PrevInstLoc = DL;
unsigned FuncId = CurFn->FuncId;
if (const DILocation *SiteLoc = DL->getInlinedAt()) {
@@ -393,7 +439,7 @@ void CodeViewDebug::endModule() {
// subprograms.
switchToDebugSectionForSymbol(nullptr);
- MCSymbol *CompilerInfo = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ MCSymbol *CompilerInfo = beginCVSubsection(DebugSubsectionKind::Symbols);
emitCompilerInformation();
endCVSubsection(CompilerInfo);
@@ -417,7 +463,7 @@ void CodeViewDebug::endModule() {
// Emit UDT records for any types used by global variables.
if (!GlobalUDTs.empty()) {
- MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
emitDebugInfoForUDTs(GlobalUDTs);
endCVSubsection(SymbolsEnd);
}
@@ -469,17 +515,21 @@ void CodeViewDebug::emitTypeInformation() {
CommentPrefix += ' ';
}
- TypeDatabase TypeDB;
- CVTypeDumper CVTD(TypeDB);
- TypeTable.ForEachRecord([&](TypeIndex Index, ArrayRef<uint8_t> Record) {
+ TypeTableCollection Table(TypeTable.records());
+ Optional<TypeIndex> B = Table.getFirst();
+ while (B) {
+ // This will fail if the record data is invalid.
+ CVType Record = Table.getType(*B);
+
if (OS.isVerboseAsm()) {
// Emit a block comment describing the type record for readability.
SmallString<512> CommentBlock;
raw_svector_ostream CommentOS(CommentBlock);
ScopedPrinter SP(CommentOS);
SP.setPrefix(CommentPrefix);
- TypeDumpVisitor TDV(TypeDB, &SP, false);
- Error E = CVTD.dump(Record, TDV);
+ TypeDumpVisitor TDV(Table, &SP, false);
+
+ Error E = codeview::visitTypeRecord(Record, *B, TDV);
if (E) {
logAllUnhandledErrors(std::move(E), errs(), "error: ");
llvm_unreachable("produced malformed type record");
@@ -489,29 +539,10 @@ void CodeViewDebug::emitTypeInformation() {
// newline.
OS.emitRawComment(
CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim());
- } else {
-#ifndef NDEBUG
- // Assert that the type data is valid even if we aren't dumping
- // comments. The MSVC linker doesn't do much type record validation,
- // so the first link of an invalid type record can succeed while
- // subsequent links will fail with LNK1285.
- ByteStream Stream(Record);
- CVTypeArray Types;
- StreamReader Reader(Stream);
- Error E = Reader.readArray(Types, Reader.getLength());
- if (!E) {
- TypeVisitorCallbacks C;
- E = CVTypeVisitor(C).visitTypeStream(Types);
- }
- if (E) {
- logAllUnhandledErrors(std::move(E), errs(), "error: ");
- llvm_unreachable("produced malformed type record");
- }
-#endif
}
- StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());
- OS.EmitBinaryData(S);
- });
+ OS.EmitBinaryData(Record.str_data());
+ B = Table.getNext(*B);
+ }
}
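
The rewrite above replaces the ForEachRecord callback with explicit getFirst/getNext iteration over a TypeTableCollection. A minimal sketch of that traversal shape, assuming the LLVM headers this file already uses and relying only on the calls visible in the hunk:

    // Sketch of the new traversal shape; uses only TypeTableCollection
    // calls shown in the hunk above (getFirst/getType/getNext).
    #include "llvm/ADT/Optional.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
    using namespace llvm;
    using namespace llvm::codeview;

    static void forEachRecord(TypeTableCollection &Table,
                              function_ref<void(TypeIndex, CVType)> Fn) {
      Optional<TypeIndex> I = Table.getFirst();
      while (I) {
        Fn(*I, Table.getType(*I)); // getType fails on invalid record data.
        I = Table.getNext(*I);
      }
    }
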
namespace {
@@ -586,7 +617,7 @@ static CPUType mapArchToCVCPUType(Triple::ArchType Type) {
}
}
-} // anonymous namespace
+} // end anonymous namespace
void CodeViewDebug::emitCompilerInformation() {
MCContext &Context = MMI->getContext();
@@ -645,7 +676,7 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
return;
OS.AddComment("Inlinee lines subsection");
- MCSymbol *InlineEnd = beginCVSubsection(ModuleSubstreamKind::InlineeLines);
+ MCSymbol *InlineEnd = beginCVSubsection(DebugSubsectionKind::InlineeLines);
// We don't provide any extra file info.
// FIXME: Find out if debuggers use this info.
@@ -658,7 +689,7 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
OS.AddBlankLine();
unsigned FileId = maybeRecordFile(SP->getFile());
- OS.AddComment("Inlined function " + SP->getDisplayName() + " starts at " +
+ OS.AddComment("Inlined function " + SP->getName() + " starts at " +
SP->getFilename() + Twine(':') + Twine(SP->getLine()));
OS.AddBlankLine();
// The filechecksum table uses 8 byte entries for now, and file ids start at
@@ -760,17 +791,17 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
// If we have a display name, build the fully qualified name by walking the
// chain of scopes.
- if (!SP->getDisplayName().empty())
+ if (!SP->getName().empty())
FuncName =
- getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName());
+ getFullyQualifiedName(SP->getScope().resolve(), SP->getName());
// If our DISubprogram name is empty, use the mangled name.
if (FuncName.empty())
- FuncName = GlobalValue::getRealLinkageName(GV->getName());
+ FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
// Emit a symbol subsection, required by VS2012+ to find function boundaries.
OS.AddComment("Symbol subsection for " + Twine(FuncName));
- MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
{
MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(),
*ProcRecordEnd = MMI->getContext().createTempSymbol();
@@ -887,13 +918,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
+ // If the variable has an attached offset expression, extract it.
+ // FIXME: Try to handle DW_OP_deref as well.
+ int64_t ExprOffset = 0;
+ if (VI.Expr)
+ if (!VI.Expr->extractIfOffset(ExprOffset))
+ continue;
+
// Get the frame register used and the offset.
unsigned FrameReg = 0;
int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
// Calculate the label ranges.
- LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset);
+ LocalVarDefRange DefRange =
+ createDefRangeMem(CVReg, FrameOffset + ExprOffset);
for (const InsnRange &Range : Scope->getRanges()) {
const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
const MCSymbol *End = getLabelAfterInsn(Range.second);
@@ -948,10 +987,10 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
// Handle fragments.
auto Fragment = DIExpr->getFragmentInfo();
- if (DIExpr && Fragment) {
+ if (Fragment) {
IsSubfield = true;
StructOffset = Fragment->OffsetInBits / 8;
- } else if (DIExpr && DIExpr->getNumElements() > 0) {
+ } else if (DIExpr->getNumElements() > 0) {
continue; // Ignore unrecognized exprs.
}
@@ -1014,14 +1053,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
}
}
-void CodeViewDebug::beginFunction(const MachineFunction *MF) {
- assert(!CurFn && "Can't process two functions at once!");
-
- if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram())
- return;
-
- DebugHandlerBase::beginFunction(MF);
-
+void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
const Function *GV = MF->getFunction();
assert(FnDebugInfo.count(GV) == false);
CurFn = &FnDebugInfo[GV];
@@ -1038,11 +1070,11 @@ void CodeViewDebug::beginFunction(const MachineFunction *MF) {
bool EmptyPrologue = true;
for (const auto &MBB : *MF) {
for (const auto &MI : MBB) {
- if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) &&
+ if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) &&
MI.getDebugLoc()) {
PrologEndLoc = MI.getDebugLoc();
break;
- } else if (!MI.isDebugValue()) {
+ } else if (!MI.isMetaInstruction()) {
EmptyPrologue = false;
}
}
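
The scan above now uses isMetaInstruction() so that any code-less instruction, not just DBG_VALUE, is skipped when locating the end of the prologue. A hypothetical standalone analogue of the same decision logic:

    // Hypothetical standalone analogue of the prologue scan above: the
    // prologue ends at the first instruction that is neither a meta
    // instruction nor frame setup and that carries a source location.
    #include <vector>

    struct Inst {
      bool IsMeta;       // DBG_VALUE and friends: emit no code.
      bool IsFrameSetup; // Emitted as part of the prologue itself.
      bool HasLoc;       // Carries a source location.
    };

    // Returns the index of the prologue-end instruction, or -1 if none;
    // EmptyPrologue becomes false if any real instruction precedes it.
    static int findPrologueEnd(const std::vector<Inst> &Body,
                               bool &EmptyPrologue) {
      EmptyPrologue = true;
      for (size_t I = 0; I != Body.size(); ++I) {
        const Inst &MI = Body[I];
        if (!MI.IsMeta && !MI.IsFrameSetup && MI.HasLoc)
          return static_cast<int>(I);
        if (!MI.IsMeta)
          EmptyPrologue = false;
      }
      return -1;
    }
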
@@ -1144,33 +1176,12 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
DITypeRef ElementTypeRef = Ty->getBaseType();
TypeIndex ElementTypeIndex = getTypeIndex(ElementTypeRef);
// IndexType is size_t, which depends on the bitness of the target.
- TypeIndex IndexType = Asm->MAI->getPointerSize() == 8
+ TypeIndex IndexType = Asm->TM.getPointerSize() == 8
? TypeIndex(SimpleTypeKind::UInt64Quad)
: TypeIndex(SimpleTypeKind::UInt32Long);
uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8;
-
- // We want to assert that the element type multiplied by the array lengths
- // match the size of the overall array. However, if we don't have complete
- // type information for the base type, we can't make this assertion. This
- // happens if limited debug info is enabled in this case:
- // struct VTableOptzn { VTableOptzn(); virtual ~VTableOptzn(); };
- // VTableOptzn array[3];
- // The DICompositeType of VTableOptzn will have size zero, and the array will
- // have size 3 * sizeof(void*), and we should avoid asserting.
- //
- // There is a related bug in the front-end where an array of a structure,
- // which was declared as incomplete structure first, ends up not getting a
- // size assigned to it. (PR28303)
- // Example:
- // struct A(*p)[3];
- // struct A { int f; } a[3];
- bool PartiallyIncomplete = false;
- if (Ty->getSizeInBits() == 0 || ElementSize == 0) {
- PartiallyIncomplete = true;
- }
-
// Add subranges to array type.
DINodeArray Elements = Ty->getElements();
for (int i = Elements.size() - 1; i >= 0; --i) {
@@ -1185,16 +1196,14 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
// Variable Length Array (VLA) has Count equal to '-1'.
// Replace with Count '1', assume it is the minimum VLA length.
// FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU.
- if (Count == -1) {
+ if (Count == -1)
Count = 1;
- PartiallyIncomplete = true;
- }
// Update the element size and element type index for subsequent subranges.
ElementSize *= Count;
// If this is the outermost array, use the size from the array. It will be
- // more accurate if PartiallyIncomplete is true.
+ // more accurate if we had a VLA or an incomplete element type size.
uint64_t ArraySize =
(i == 0 && ElementSize == 0) ? Ty->getSizeInBits() / 8 : ElementSize;
@@ -1203,9 +1212,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
ElementTypeIndex = TypeTable.writeKnownType(AR);
}
- (void)PartiallyIncomplete;
- assert(PartiallyIncomplete || ElementSize == (Ty->getSizeInBits() / 8));
-
return ElementTypeIndex;
}
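
The size bookkeeping that survives the deleted assertion works innermost dimension first. A hypothetical standalone analogue, assuming dimensions are stored outermost-first as in a DINodeArray:

    // Hypothetical analogue of the size bookkeeping above: walk the dims in
    // reverse (innermost-first), treat a VLA count of -1 as 1, and for the
    // outermost level prefer the type's own size if the running product
    // collapsed to zero (VLA or incomplete element type).
    #include <cstdint>
    #include <vector>

    static std::vector<uint64_t> levelSizes(uint64_t ElementSize,
                                            const std::vector<int64_t> &Dims,
                                            uint64_t WholeTypeSize) {
      std::vector<uint64_t> Sizes;
      for (int I = static_cast<int>(Dims.size()) - 1; I >= 0; --I) {
        int64_t Count = Dims[I];
        if (Count == -1) // VLA: assume the minimum length of 1.
          Count = 1;
        ElementSize *= Count;
        Sizes.push_back((I == 0 && ElementSize == 0) ? WholeTypeSize
                                                     : ElementSize);
      }
      return Sizes;
    }
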
@@ -1376,8 +1382,8 @@ TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty) {
assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type);
TypeIndex ClassTI = getTypeIndex(Ty->getClassType());
TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType(), Ty->getClassType());
- PointerKind PK = Asm->MAI->getPointerSize() == 8 ? PointerKind::Near64
- : PointerKind::Near32;
+ PointerKind PK = Asm->TM.getPointerSize() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
bool IsPMF = isa<DISubroutineType>(Ty->getBaseType());
PointerMode PM = IsPMF ? PointerMode::PointerToMemberFunction
: PointerMode::PointerToDataMember;
@@ -1492,7 +1498,8 @@ TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty,
}
TypeIndex CodeViewDebug::lowerTypeVFTableShape(const DIDerivedType *Ty) {
- unsigned VSlotCount = Ty->getSizeInBits() / (8 * Asm->MAI->getPointerSize());
+ unsigned VSlotCount =
+ Ty->getSizeInBits() / (8 * Asm->MAI->getCodePointerSize());
SmallVector<VFTableSlotKind, 4> Slots(VSlotCount, VFTableSlotKind::Near);
VFTableShapeRecord VFTSR(Slots);
@@ -1600,7 +1607,7 @@ TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) {
EnumeratorCount++;
}
}
- FTI = FLRB.end();
+ FTI = FLRB.end(true);
}
std::string FullName = getFullyQualifiedName(Ty);
@@ -1620,11 +1627,11 @@ struct llvm::ClassInfo {
uint64_t BaseOffset;
};
// [MemberInfo]
- typedef std::vector<MemberInfo> MemberList;
+ using MemberList = std::vector<MemberInfo>;
- typedef TinyPtrVector<const DISubprogram *> MethodsList;
+ using MethodsList = TinyPtrVector<const DISubprogram *>;
// MethodName -> MethodsList
- typedef MapVector<MDString *, MethodsList> MethodsMap;
+ using MethodsMap = MapVector<MDString *, MethodsList>;
/// Base classes.
std::vector<const DIDerivedType *> Inheritance;
@@ -1736,10 +1743,12 @@ TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) {
SizeInBytes, FullName, Ty->getIdentifier());
TypeIndex ClassTI = TypeTable.writeKnownType(CR);
- StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(Ty->getFile()));
- TypeIndex SIDI = TypeTable.writeKnownType(SIDR);
- UdtSourceLineRecord USLR(ClassTI, SIDI, Ty->getLine());
- TypeTable.writeKnownType(USLR);
+ if (const auto *File = Ty->getFile()) {
+ StringIdRecord SIDR(TypeIndex(0x0), getFullFilepath(File));
+ TypeIndex SIDI = TypeTable.writeKnownType(SIDR);
+ UdtSourceLineRecord USLR(ClassTI, SIDI, Ty->getLine());
+ TypeTable.writeKnownType(USLR);
+ }
addToUDTs(Ty, ClassTI);
@@ -1887,7 +1896,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
translateMethodOptionFlags(SP), VFTableOffset, Name));
MemberCount++;
}
- assert(Methods.size() > 0 && "Empty methods map entry");
+ assert(!Methods.empty() && "Empty methods map entry");
if (Methods.size() == 1)
FLBR.writeMemberType(Methods[0]);
else {
@@ -1905,7 +1914,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
MemberCount++;
}
- TypeIndex FieldTI = FLBR.end();
+ TypeIndex FieldTI = FLBR.end(true);
return std::make_tuple(FieldTI, Info.VShapeTI, MemberCount,
!Info.NestedClasses.empty());
}
@@ -2115,18 +2124,13 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) {
}
}
-void CodeViewDebug::endFunction(const MachineFunction *MF) {
- if (!Asm || !CurFn) // We haven't created any debug info for this function.
- return;
-
+void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {
const Function *GV = MF->getFunction();
assert(FnDebugInfo.count(GV));
assert(CurFn == &FnDebugInfo[GV]);
collectVariableInfo(GV->getSubprogram());
- DebugHandlerBase::endFunction(MF);
-
// Don't emit anything if we don't have any line tables.
if (!CurFn->HaveLineInfo) {
FnDebugInfo.erase(GV);
@@ -2146,13 +2150,27 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
if (!Asm || !CurFn || MI->isDebugValue() ||
MI->getFlag(MachineInstr::FrameSetup))
return;
+
+ // If the first instruction of a new MBB has no location, find the first
+ // instruction with a location and use that.
DebugLoc DL = MI->getDebugLoc();
- if (DL == PrevInstLoc || !DL)
+ if (!DL && MI->getParent() != PrevInstBB) {
+ for (const auto &NextMI : *MI->getParent()) {
+ DL = NextMI.getDebugLoc();
+ if (DL)
+ break;
+ }
+ }
+ PrevInstBB = MI->getParent();
+
+ // If we still don't have a debug location, don't record a location.
+ if (!DL)
return;
+
maybeRecordLocation(DL, Asm->MF);
}
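
The new fallback above borrows a location from later in the block only when the instruction starts a block the handler has not yet seen. A hypothetical standalone analogue of that lookup:

    // Hypothetical analogue of the fallback above: when an instruction at a
    // new block boundary has no location, scan forward in the block for the
    // first instruction that does.
    #include <vector>

    struct SimpleInst { int Line; }; // Line == 0 means "no location".

    static int locationFor(const std::vector<SimpleInst> &Block, size_t Idx,
                           bool NewBlock) {
      int Line = Block[Idx].Line;
      if (Line == 0 && NewBlock)
        for (const SimpleInst &MI : Block)
          if (MI.Line != 0)
            return MI.Line;
      return Line; // May still be 0: the caller then records nothing.
    }
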
-MCSymbol *CodeViewDebug::beginCVSubsection(ModuleSubstreamKind Kind) {
+MCSymbol *CodeViewDebug::beginCVSubsection(DebugSubsectionKind Kind) {
MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
*EndLabel = MMI->getContext().createTempSymbol();
OS.EmitIntValue(unsigned(Kind), 4);
@@ -2212,7 +2230,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
if (!GV->hasComdat() && !GV->isDeclarationForLinker()) {
if (!EndLabel) {
OS.AddComment("Symbol subsection for globals");
- EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols);
}
// FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
emitDebugInfoForGlobal(GVE->getVariable(), GV, Asm->getSymbol(GV));
@@ -2228,9 +2246,9 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
if (GV->hasComdat()) {
MCSymbol *GVSym = Asm->getSymbol(GV);
OS.AddComment("Symbol subsection for " +
- Twine(GlobalValue::getRealLinkageName(GV->getName())));
+ Twine(GlobalValue::dropLLVMManglingEscape(GV->getName())));
switchToDebugSectionForSymbol(GVSym);
- EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols);
// FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
emitDebugInfoForGlobal(GVE->getVariable(), GV, GVSym);
endCVSubsection(EndLabel);
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index 3dd4315..fd8f604 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h ----*- C++ -*--===//
+//===- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h --------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,29 +14,44 @@
#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
#define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
+#include "DbgValueHistoryCalculator.h"
#include "DebugHandlerBase.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
-#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
+#include <cstdint>
+#include <map>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+#include <vector>
namespace llvm {
-class StringRef;
-class LexicalScope;
struct ClassInfo;
+class StringRef;
+class AsmPrinter;
+class Function;
+class GlobalVariable;
+class MCSectionCOFF;
+class MCStreamer;
+class MCSymbol;
+class MachineFunction;
/// \brief Collects and handles line tables information in a CodeView format.
class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
MCStreamer &OS;
- llvm::BumpPtrAllocator Allocator;
+ BumpPtrAllocator Allocator;
codeview::TypeTableBuilder TypeTable;
/// Represents the most general definition range.
@@ -103,14 +118,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
SmallVector<LocalVariable, 1> Locals;
- DebugLoc LastLoc;
const MCSymbol *Begin = nullptr;
const MCSymbol *End = nullptr;
unsigned FuncId = 0;
unsigned LastFileId = 0;
bool HaveLineInfo = false;
};
- FunctionInfo *CurFn;
+ FunctionInfo *CurFn = nullptr;
/// The set of comdat .debug$S sections that we've seen so far. Each section
/// must start with a magic version number that must only be emitted once.
@@ -176,8 +190,9 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
std::vector<std::pair<std::string, codeview::TypeIndex>> LocalUDTs,
GlobalUDTs;
- typedef std::map<const DIFile *, std::string> FileToFilepathMapTy;
+ using FileToFilepathMapTy = std::map<const DIFile *, std::string>;
FileToFilepathMapTy FileToFilepathMap;
+
StringRef getFullFilepath(const DIFile *S);
unsigned maybeRecordFile(const DIFile *F);
@@ -216,14 +231,14 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
/// Opens a subsection of the given kind in a .debug$S codeview section.
/// Returns an end label for use with endCVSubsection when the subsection is
/// finished.
- MCSymbol *beginCVSubsection(codeview::ModuleSubstreamKind Kind);
+ MCSymbol *beginCVSubsection(codeview::DebugSubsectionKind Kind);
void endCVSubsection(MCSymbol *EndLabel);
void emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt,
const InlineSite &Site);
- typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+ using InlinedVariable = DbgValueHistoryMap::InlinedVariable;
void collectVariableInfo(const DISubprogram *SP);
@@ -299,23 +314,25 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
unsigned getPointerSizeInBytes();
+protected:
+ /// \brief Gather pre-function debug information.
+ void beginFunctionImpl(const MachineFunction *MF) override;
+
+ /// \brief Gather post-function debug information.
+ void endFunctionImpl(const MachineFunction *) override;
+
public:
CodeViewDebug(AsmPrinter *Asm);
- void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {}
+ void setSymbolSize(const MCSymbol *, uint64_t) override {}
/// \brief Emit the COFF section that holds the line table information.
void endModule() override;
- /// \brief Gather pre-function debug information.
- void beginFunction(const MachineFunction *MF) override;
-
- /// \brief Gather post-function debug information.
- void endFunction(const MachineFunction *) override;
-
/// \brief Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
};
-} // End of namespace llvm
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
index 8799189..886e6e2 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -31,6 +31,8 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+#define DEBUG_TYPE "dwarfdebug"
+
//===----------------------------------------------------------------------===//
// DIEAbbrevData Implementation
//===----------------------------------------------------------------------===//
@@ -42,6 +44,8 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
// overloads. Otherwise MSVC 2010 thinks this call is ambiguous.
ID.AddInteger(unsigned(Attribute));
ID.AddInteger(unsigned(Form));
+ if (Form == dwarf::DW_FORM_implicit_const)
+ ID.AddInteger(Value);
}
//===----------------------------------------------------------------------===//
@@ -77,15 +81,22 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const {
dwarf::AttributeString(AttrData.getAttribute()).data());
// Emit form type.
+#ifndef NDEBUG
+ // Could be an assertion, but this way we can see the failing form code
+ // easily, which helps track down where it came from.
+ if (!dwarf::isValidFormForVersion(AttrData.getForm(),
+ AP->getDwarfVersion())) {
+ DEBUG(dbgs() << "Invalid form " << format("0x%x", AttrData.getForm())
+ << " for DWARF version " << AP->getDwarfVersion() << "\n");
+ llvm_unreachable("Invalid form for specified DWARF version");
+ }
+#endif
AP->EmitULEB128(AttrData.getForm(),
dwarf::FormEncodingString(AttrData.getForm()).data());
// Emit value for DW_FORM_implicit_const.
- if (AttrData.getForm() == dwarf::DW_FORM_implicit_const) {
- assert(AP->getDwarfVersion() >= 5 &&
- "DW_FORM_implicit_const is supported starting from DWARFv5");
+ if (AttrData.getForm() == dwarf::DW_FORM_implicit_const)
AP->EmitSLEB128(AttrData.getValue());
- }
}
// Mark end of abbreviation.
@@ -94,7 +105,7 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const {
}
LLVM_DUMP_METHOD
-void DIEAbbrev::print(raw_ostream &O) {
+void DIEAbbrev::print(raw_ostream &O) const {
O << "Abbreviation @"
<< format("0x%lx", (long)(intptr_t)this)
<< " "
@@ -107,13 +118,20 @@ void DIEAbbrev::print(raw_ostream &O) {
O << " "
<< dwarf::AttributeString(Data[i].getAttribute())
<< " "
- << dwarf::FormEncodingString(Data[i].getForm())
- << '\n';
+ << dwarf::FormEncodingString(Data[i].getForm());
+
+ if (Data[i].getForm() == dwarf::DW_FORM_implicit_const)
+ O << " " << Data[i].getValue();
+
+ O << '\n';
}
}
-LLVM_DUMP_METHOD
-void DIEAbbrev::dump() { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIEAbbrev::dump() const {
+ print(dbgs());
+}
+#endif
//===----------------------------------------------------------------------===//
// DIEAbbrevSet Implementation
@@ -249,10 +267,11 @@ void DIE::print(raw_ostream &O, unsigned IndentCount) const {
O << "\n";
}
-LLVM_DUMP_METHOD
-void DIE::dump() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIE::dump() const {
print(dbgs());
}
+#endif
unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP,
DIEAbbrevSet &AbbrevSet,
@@ -340,10 +359,11 @@ void DIEValue::print(raw_ostream &O) const {
}
}
-LLVM_DUMP_METHOD
-void DIEValue::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DIEValue::dump() const {
print(dbgs());
}
+#endif
//===----------------------------------------------------------------------===//
// DIEInteger Implementation
@@ -354,57 +374,42 @@ void DIEValue::dump() const {
void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_implicit_const:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_flag_present:
// Emit something to keep the lines and comments in sync.
// FIXME: Is there a better way to do this?
Asm->OutStreamer->AddBlankLine();
return;
case dwarf::DW_FORM_flag:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref1:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data1:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_addrx1:
case dwarf::DW_FORM_ref2:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data2:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_addrx2:
case dwarf::DW_FORM_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref4:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data4:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_ref_sup4:
+ case dwarf::DW_FORM_strx4:
+ case dwarf::DW_FORM_addrx4:
case dwarf::DW_FORM_ref8:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_sig8:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_data8:
- LLVM_FALLTHROUGH;
+ case dwarf::DW_FORM_ref_sup8:
case dwarf::DW_FORM_GNU_ref_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_strp_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_line_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_sec_offset:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_strp_sup:
- LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref_sup:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_addr:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_addr:
Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form));
return;
case dwarf::DW_FORM_GNU_str_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_addr_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_udata:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_udata:
Asm->EmitULEB128(Integer);
return;
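
The deleted LLVM_FALLTHROUGH annotations were redundant: between adjacent case labels with no intervening statements, fallthrough is implicit and compilers do not warn. Annotations are only needed where a non-empty body falls into the next case (as DW_FORM_ref_addr does in SizeOf below). A minimal standalone example of the grouped-label style the hunk converts to:

    // Minimal example: adjacent labels with nothing between them form one
    // group and need no fallthrough annotation.
    #include <cstdio>

    static const char *classify(int Form) {
      switch (Form) {
      case 1:
      case 2:
      case 3: // One group: implicit, warning-free fallthrough.
        return "fixed-size";
      case 4:
        return "uleb";
      default:
        return "unknown";
      }
    }

    int main() { std::printf("%s\n", classify(2)); }
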
@@ -419,35 +424,41 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
///
unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
switch (Form) {
- case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_flag_present: return 0;
- case dwarf::DW_FORM_flag: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref1: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data1: return sizeof(int8_t);
- case dwarf::DW_FORM_ref2: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data2: return sizeof(int16_t);
- case dwarf::DW_FORM_ref4: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data4: return sizeof(int32_t);
- case dwarf::DW_FORM_ref8: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref_sig8: LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_data8: return sizeof(int64_t);
+ case dwarf::DW_FORM_implicit_const:
+ case dwarf::DW_FORM_flag_present:
+ return 0;
+ case dwarf::DW_FORM_flag:
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_addrx1:
+ return sizeof(int8_t);
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_addrx2:
+ return sizeof(int16_t);
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_ref_sup4:
+ case dwarf::DW_FORM_strx4:
+ case dwarf::DW_FORM_addrx4:
+ return sizeof(int32_t);
+ case dwarf::DW_FORM_ref8:
+ case dwarf::DW_FORM_ref_sig8:
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_ref_sup8:
+ return sizeof(int64_t);
case dwarf::DW_FORM_ref_addr:
if (AP->getDwarfVersion() == 2)
return AP->getPointerSize();
LLVM_FALLTHROUGH;
case dwarf::DW_FORM_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_ref_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_strp_alt:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_line_strp:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_sec_offset:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_strp_sup:
- LLVM_FALLTHROUGH;
- case dwarf::DW_FORM_ref_sup:
switch (AP->OutStreamer->getContext().getDwarfFormat()) {
case dwarf::DWARF32:
return 4;
@@ -456,11 +467,8 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
}
llvm_unreachable("Invalid DWARF format");
case dwarf::DW_FORM_GNU_str_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_GNU_addr_index:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_ref_udata:
- LLVM_FALLTHROUGH;
case dwarf::DW_FORM_udata:
return getULEB128Size(Integer);
case dwarf::DW_FORM_sdata:
@@ -484,7 +492,7 @@ void DIEInteger::print(raw_ostream &O) const {
/// EmitValue - Emit expression value.
///
void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- AP->EmitDebugValue(Expr, SizeOf(AP, Form));
+ AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form));
}
/// SizeOf - Determine size of expression value in bytes.
@@ -519,7 +527,7 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_data4) return 4;
if (Form == dwarf::DW_FORM_sec_offset) return 4;
if (Form == dwarf::DW_FORM_strp) return 4;
- return AP->getPointerSize();
+ return AP->MAI->getCodePointerSize();
}
LLVM_DUMP_METHOD
@@ -541,7 +549,7 @@ unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_data4) return 4;
if (Form == dwarf::DW_FORM_sec_offset) return 4;
if (Form == dwarf::DW_FORM_strp) return 4;
- return AP->getPointerSize();
+ return AP->MAI->getCodePointerSize();
}
LLVM_DUMP_METHOD
@@ -647,20 +655,12 @@ void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
case dwarf::DW_FORM_ref_addr: {
// Get the absolute offset for this DIE within the debug info/types section.
unsigned Addr = Entry->getDebugSectionOffset();
- if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) {
- const DwarfDebug *DD = AP->getDwarfDebug();
- if (DD)
- assert(!DD->useSplitDwarf() &&
- "TODO: dwo files can't have relocations.");
- const DIEUnit *Unit = Entry->getUnit();
- assert(Unit && "CUDie should belong to a CU.");
- MCSection *Section = Unit->getSection();
- if (Section) {
- const MCSymbol *SectionSym = Section->getBeginSymbol();
- AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true);
- return;
- }
+ if (const MCSymbol *SectionSym =
+ Entry->getUnit()->getCrossSectionRelativeBaseAddress()) {
+ AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true);
+ return;
}
+
AP->OutStreamer->EmitIntValue(Addr, SizeOf(AP, Form));
return;
}
@@ -683,7 +683,7 @@ unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
return getULEB128Size(Entry->getOffset());
case dwarf::DW_FORM_ref_addr:
if (AP->getDwarfVersion() == 2)
- return AP->getPointerSize();
+ return AP->MAI->getCodePointerSize();
switch (AP->OutStreamer->getContext().getDwarfFormat()) {
case dwarf::DWARF32:
return 4;
@@ -809,7 +809,7 @@ unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
return 4;
if (Form == dwarf::DW_FORM_sec_offset)
return 4;
- return AP->getPointerSize();
+ return AP->MAI->getCodePointerSize();
}
/// EmitValue - Emit label value.
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
index d8ecc7c..15ade3c 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -11,15 +11,15 @@
//
//===----------------------------------------------------------------------===//
-#include "ByteStreamer.h"
#include "DIEHash.h"
+#include "ByteStreamer.h"
#include "DwarfDebug.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
@@ -116,65 +116,17 @@ void DIEHash::addParentContext(const DIE &Parent) {
// Collect all of the attributes for a particular DIE in single structure.
void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) {
-#define COLLECT_ATTR(NAME) \
- case dwarf::NAME: \
- Attrs.NAME = V; \
- break
for (const auto &V : Die.values()) {
DEBUG(dbgs() << "Attribute: "
<< dwarf::AttributeString(V.getAttribute())
<< " added.\n");
switch (V.getAttribute()) {
- COLLECT_ATTR(DW_AT_name);
- COLLECT_ATTR(DW_AT_accessibility);
- COLLECT_ATTR(DW_AT_address_class);
- COLLECT_ATTR(DW_AT_allocated);
- COLLECT_ATTR(DW_AT_artificial);
- COLLECT_ATTR(DW_AT_associated);
- COLLECT_ATTR(DW_AT_binary_scale);
- COLLECT_ATTR(DW_AT_bit_offset);
- COLLECT_ATTR(DW_AT_bit_size);
- COLLECT_ATTR(DW_AT_bit_stride);
- COLLECT_ATTR(DW_AT_byte_size);
- COLLECT_ATTR(DW_AT_byte_stride);
- COLLECT_ATTR(DW_AT_const_expr);
- COLLECT_ATTR(DW_AT_const_value);
- COLLECT_ATTR(DW_AT_containing_type);
- COLLECT_ATTR(DW_AT_count);
- COLLECT_ATTR(DW_AT_data_bit_offset);
- COLLECT_ATTR(DW_AT_data_location);
- COLLECT_ATTR(DW_AT_data_member_location);
- COLLECT_ATTR(DW_AT_decimal_scale);
- COLLECT_ATTR(DW_AT_decimal_sign);
- COLLECT_ATTR(DW_AT_default_value);
- COLLECT_ATTR(DW_AT_digit_count);
- COLLECT_ATTR(DW_AT_discr);
- COLLECT_ATTR(DW_AT_discr_list);
- COLLECT_ATTR(DW_AT_discr_value);
- COLLECT_ATTR(DW_AT_encoding);
- COLLECT_ATTR(DW_AT_enum_class);
- COLLECT_ATTR(DW_AT_endianity);
- COLLECT_ATTR(DW_AT_explicit);
- COLLECT_ATTR(DW_AT_is_optional);
- COLLECT_ATTR(DW_AT_location);
- COLLECT_ATTR(DW_AT_lower_bound);
- COLLECT_ATTR(DW_AT_mutable);
- COLLECT_ATTR(DW_AT_ordering);
- COLLECT_ATTR(DW_AT_picture_string);
- COLLECT_ATTR(DW_AT_prototyped);
- COLLECT_ATTR(DW_AT_small);
- COLLECT_ATTR(DW_AT_segment);
- COLLECT_ATTR(DW_AT_string_length);
- COLLECT_ATTR(DW_AT_threads_scaled);
- COLLECT_ATTR(DW_AT_upper_bound);
- COLLECT_ATTR(DW_AT_use_location);
- COLLECT_ATTR(DW_AT_use_UTF8);
- COLLECT_ATTR(DW_AT_variable_parameter);
- COLLECT_ATTR(DW_AT_virtuality);
- COLLECT_ATTR(DW_AT_visibility);
- COLLECT_ATTR(DW_AT_vtable_elem_location);
- COLLECT_ATTR(DW_AT_type);
+#define HANDLE_DIE_HASH_ATTR(NAME) \
+ case dwarf::NAME: \
+ Attrs.NAME = V; \
+ break;
+#include "DIEHashAttributes.def"
default:
break;
}
@@ -366,62 +318,12 @@ void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) {
// Go through the attributes from \param Attrs in the order specified in 7.27.4
// and hash them.
void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) {
-#define ADD_ATTR(ATTR) \
+#define HANDLE_DIE_HASH_ATTR(NAME) \
{ \
- if (ATTR) \
- hashAttribute(ATTR, Tag); \
+ if (Attrs.NAME) \
+ hashAttribute(Attrs.NAME, Tag); \
}
-
- ADD_ATTR(Attrs.DW_AT_name);
- ADD_ATTR(Attrs.DW_AT_accessibility);
- ADD_ATTR(Attrs.DW_AT_address_class);
- ADD_ATTR(Attrs.DW_AT_allocated);
- ADD_ATTR(Attrs.DW_AT_artificial);
- ADD_ATTR(Attrs.DW_AT_associated);
- ADD_ATTR(Attrs.DW_AT_binary_scale);
- ADD_ATTR(Attrs.DW_AT_bit_offset);
- ADD_ATTR(Attrs.DW_AT_bit_size);
- ADD_ATTR(Attrs.DW_AT_bit_stride);
- ADD_ATTR(Attrs.DW_AT_byte_size);
- ADD_ATTR(Attrs.DW_AT_byte_stride);
- ADD_ATTR(Attrs.DW_AT_const_expr);
- ADD_ATTR(Attrs.DW_AT_const_value);
- ADD_ATTR(Attrs.DW_AT_containing_type);
- ADD_ATTR(Attrs.DW_AT_count);
- ADD_ATTR(Attrs.DW_AT_data_bit_offset);
- ADD_ATTR(Attrs.DW_AT_data_location);
- ADD_ATTR(Attrs.DW_AT_data_member_location);
- ADD_ATTR(Attrs.DW_AT_decimal_scale);
- ADD_ATTR(Attrs.DW_AT_decimal_sign);
- ADD_ATTR(Attrs.DW_AT_default_value);
- ADD_ATTR(Attrs.DW_AT_digit_count);
- ADD_ATTR(Attrs.DW_AT_discr);
- ADD_ATTR(Attrs.DW_AT_discr_list);
- ADD_ATTR(Attrs.DW_AT_discr_value);
- ADD_ATTR(Attrs.DW_AT_encoding);
- ADD_ATTR(Attrs.DW_AT_enum_class);
- ADD_ATTR(Attrs.DW_AT_endianity);
- ADD_ATTR(Attrs.DW_AT_explicit);
- ADD_ATTR(Attrs.DW_AT_is_optional);
- ADD_ATTR(Attrs.DW_AT_location);
- ADD_ATTR(Attrs.DW_AT_lower_bound);
- ADD_ATTR(Attrs.DW_AT_mutable);
- ADD_ATTR(Attrs.DW_AT_ordering);
- ADD_ATTR(Attrs.DW_AT_picture_string);
- ADD_ATTR(Attrs.DW_AT_prototyped);
- ADD_ATTR(Attrs.DW_AT_small);
- ADD_ATTR(Attrs.DW_AT_segment);
- ADD_ATTR(Attrs.DW_AT_string_length);
- ADD_ATTR(Attrs.DW_AT_threads_scaled);
- ADD_ATTR(Attrs.DW_AT_upper_bound);
- ADD_ATTR(Attrs.DW_AT_use_location);
- ADD_ATTR(Attrs.DW_AT_use_UTF8);
- ADD_ATTR(Attrs.DW_AT_variable_parameter);
- ADD_ATTR(Attrs.DW_AT_virtuality);
- ADD_ATTR(Attrs.DW_AT_visibility);
- ADD_ATTR(Attrs.DW_AT_vtable_elem_location);
- ADD_ATTR(Attrs.DW_AT_type);
-
+#include "DIEHashAttributes.def"
// FIXME: Add the extended attributes.
}
@@ -478,10 +380,12 @@ void DIEHash::computeHash(const DIE &Die) {
/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
/// with the inclusion of the full CU and all top level CU entities.
// TODO: Initialize the type chain at 0 instead of 1 for CU signatures.
-uint64_t DIEHash::computeCUSignature(const DIE &Die) {
+uint64_t DIEHash::computeCUSignature(StringRef DWOName, const DIE &Die) {
Numbering.clear();
Numbering[&Die] = 1;
+ if (!DWOName.empty())
+ Hash.update(DWOName);
// Hash the DIE.
computeHash(Die);
@@ -490,9 +394,9 @@ uint64_t DIEHash::computeCUSignature(const DIE &Die) {
Hash.final(Result);
// ... take the least significant 8 bytes and return those. Our MD5
- // implementation always returns its results in little endian, swap bytes
- // appropriately.
- return support::endian::read64le(Result + 8);
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
+ return Result.high();
}
/// This is based on the type signature computation given in section 7.27 of the
@@ -514,7 +418,7 @@ uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
Hash.final(Result);
// ... take the least significant 8 bytes and return those. Our MD5
- // implementation always returns its results in little endian, swap bytes
- // appropriately.
- return support::endian::read64le(Result + 8);
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
+ return Result.high();
}
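
The updated comments say the least significant 8 bytes of the digest are wanted; concretely, read64le(Result + 8) and MD5Result::high() both assemble bytes 8..15 of the 16-byte digest in little-endian order. A standalone sketch of that byte read:

    // Standalone sketch of what read64le(Result + 8) / MD5Result::high()
    // compute: bytes 8..15 of the digest, assembled little-endian.
    #include <cstdint>
    #include <cstdio>

    static uint64_t read64le(const uint8_t *P) {
      uint64_t V = 0;
      for (int I = 7; I >= 0; --I)
        V = (V << 8) | P[I]; // P[0] ends up as the lowest byte.
      return V;
    }

    int main() {
      uint8_t Digest[16] = {0};
      Digest[8] = 0xEF;  // Lowest byte of the returned word.
      Digest[15] = 0xAB; // Highest byte of the returned word.
      std::printf("0x%016llx\n", (unsigned long long)read64le(Digest + 8));
    }
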
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
index 996cd7e..29337ae 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -28,64 +28,15 @@ class CompileUnit;
class DIEHash {
// Collection of all attributes used in hashing a particular DIE.
struct DIEAttrs {
- DIEValue DW_AT_name;
- DIEValue DW_AT_accessibility;
- DIEValue DW_AT_address_class;
- DIEValue DW_AT_allocated;
- DIEValue DW_AT_artificial;
- DIEValue DW_AT_associated;
- DIEValue DW_AT_binary_scale;
- DIEValue DW_AT_bit_offset;
- DIEValue DW_AT_bit_size;
- DIEValue DW_AT_bit_stride;
- DIEValue DW_AT_byte_size;
- DIEValue DW_AT_byte_stride;
- DIEValue DW_AT_const_expr;
- DIEValue DW_AT_const_value;
- DIEValue DW_AT_containing_type;
- DIEValue DW_AT_count;
- DIEValue DW_AT_data_bit_offset;
- DIEValue DW_AT_data_location;
- DIEValue DW_AT_data_member_location;
- DIEValue DW_AT_decimal_scale;
- DIEValue DW_AT_decimal_sign;
- DIEValue DW_AT_default_value;
- DIEValue DW_AT_digit_count;
- DIEValue DW_AT_discr;
- DIEValue DW_AT_discr_list;
- DIEValue DW_AT_discr_value;
- DIEValue DW_AT_encoding;
- DIEValue DW_AT_enum_class;
- DIEValue DW_AT_endianity;
- DIEValue DW_AT_explicit;
- DIEValue DW_AT_is_optional;
- DIEValue DW_AT_location;
- DIEValue DW_AT_lower_bound;
- DIEValue DW_AT_mutable;
- DIEValue DW_AT_ordering;
- DIEValue DW_AT_picture_string;
- DIEValue DW_AT_prototyped;
- DIEValue DW_AT_small;
- DIEValue DW_AT_segment;
- DIEValue DW_AT_string_length;
- DIEValue DW_AT_threads_scaled;
- DIEValue DW_AT_upper_bound;
- DIEValue DW_AT_use_location;
- DIEValue DW_AT_use_UTF8;
- DIEValue DW_AT_variable_parameter;
- DIEValue DW_AT_virtuality;
- DIEValue DW_AT_visibility;
- DIEValue DW_AT_vtable_elem_location;
- DIEValue DW_AT_type;
-
- // Insert any additional ones here...
+#define HANDLE_DIE_HASH_ATTR(NAME) DIEValue NAME;
+#include "DIEHashAttributes.def"
};
public:
DIEHash(AsmPrinter *A = nullptr) : AP(A) {}
/// \brief Computes the CU signature.
- uint64_t computeCUSignature(const DIE &Die);
+ uint64_t computeCUSignature(StringRef DWOName, const DIE &Die);
/// \brief Computes the type signature.
uint64_t computeTypeSignature(const DIE &Die);
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def
new file mode 100644
index 0000000..28a0239
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHashAttributes.def
@@ -0,0 +1,55 @@
+#ifndef HANDLE_DIE_HASH_ATTR
+#error "Missing macro definition of HANDLE_DIE_HASH_ATTR"
+#endif
+
+HANDLE_DIE_HASH_ATTR(DW_AT_name)
+HANDLE_DIE_HASH_ATTR(DW_AT_accessibility)
+HANDLE_DIE_HASH_ATTR(DW_AT_address_class)
+HANDLE_DIE_HASH_ATTR(DW_AT_allocated)
+HANDLE_DIE_HASH_ATTR(DW_AT_artificial)
+HANDLE_DIE_HASH_ATTR(DW_AT_associated)
+HANDLE_DIE_HASH_ATTR(DW_AT_binary_scale)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_offset)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_size)
+HANDLE_DIE_HASH_ATTR(DW_AT_bit_stride)
+HANDLE_DIE_HASH_ATTR(DW_AT_byte_size)
+HANDLE_DIE_HASH_ATTR(DW_AT_byte_stride)
+HANDLE_DIE_HASH_ATTR(DW_AT_const_expr)
+HANDLE_DIE_HASH_ATTR(DW_AT_const_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_containing_type)
+HANDLE_DIE_HASH_ATTR(DW_AT_count)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_bit_offset)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_data_member_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_decimal_scale)
+HANDLE_DIE_HASH_ATTR(DW_AT_decimal_sign)
+HANDLE_DIE_HASH_ATTR(DW_AT_default_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_digit_count)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr_list)
+HANDLE_DIE_HASH_ATTR(DW_AT_discr_value)
+HANDLE_DIE_HASH_ATTR(DW_AT_encoding)
+HANDLE_DIE_HASH_ATTR(DW_AT_enum_class)
+HANDLE_DIE_HASH_ATTR(DW_AT_endianity)
+HANDLE_DIE_HASH_ATTR(DW_AT_explicit)
+HANDLE_DIE_HASH_ATTR(DW_AT_is_optional)
+HANDLE_DIE_HASH_ATTR(DW_AT_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_lower_bound)
+HANDLE_DIE_HASH_ATTR(DW_AT_mutable)
+HANDLE_DIE_HASH_ATTR(DW_AT_ordering)
+HANDLE_DIE_HASH_ATTR(DW_AT_picture_string)
+HANDLE_DIE_HASH_ATTR(DW_AT_prototyped)
+HANDLE_DIE_HASH_ATTR(DW_AT_small)
+HANDLE_DIE_HASH_ATTR(DW_AT_segment)
+HANDLE_DIE_HASH_ATTR(DW_AT_string_length)
+HANDLE_DIE_HASH_ATTR(DW_AT_threads_scaled)
+HANDLE_DIE_HASH_ATTR(DW_AT_upper_bound)
+HANDLE_DIE_HASH_ATTR(DW_AT_use_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_use_UTF8)
+HANDLE_DIE_HASH_ATTR(DW_AT_variable_parameter)
+HANDLE_DIE_HASH_ATTR(DW_AT_virtuality)
+HANDLE_DIE_HASH_ATTR(DW_AT_visibility)
+HANDLE_DIE_HASH_ATTR(DW_AT_vtable_elem_location)
+HANDLE_DIE_HASH_ATTR(DW_AT_type)
+
+#undef HANDLE_DIE_HASH_ATTR
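
The new .def file is the classic X-macro pattern: every includer defines HANDLE_DIE_HASH_ATTR, includes the file, and gets one expansion per attribute, so the field declarations in DIEHash.h and the collect/hash switches in DIEHash.cpp can never drift out of sync. A self-contained illustration, with a hypothetical two-entry list standing in for the real .def file:

    // Self-contained illustration of the .def / X-macro pattern above,
    // with a hypothetical list in place of DIEHashAttributes.def.
    #include <cstdio>

    #define HYPOTHETICAL_ATTRS(X)                                              \
      X(DW_AT_name)                                                            \
      X(DW_AT_byte_size)

    // Expansion 1: declare one field per attribute.
    struct Attrs {
    #define HANDLE_ATTR(NAME) int NAME = 0;
      HYPOTHETICAL_ATTRS(HANDLE_ATTR)
    #undef HANDLE_ATTR
    };

    // Expansion 2: visit every field, same order, no second list to maintain.
    static void printAll(const Attrs &A) {
    #define HANDLE_ATTR(NAME) std::printf(#NAME " = %d\n", A.NAME);
      HYPOTHETICAL_ATTRS(HANDLE_ATTR)
    #undef HANDLE_ATTR
    }

    int main() {
      Attrs A;
      A.DW_AT_byte_size = 4;
      printAll(A);
    }
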
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
index 22fd7bb..c2ad9db 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -194,6 +194,10 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
// some variables.
for (const MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ // Ignore call instructions that claim to clobber SP. The AArch64
+ // backend does this for aggregate function arguments.
+ if (MI.isCall() && MO.getReg() == SP)
+ continue;
// If this is a virtual register, only clobber it since it doesn't
// have aliases.
if (TRI->isVirtualRegister(MO.getReg()))
@@ -209,8 +213,7 @@ void llvm::calculateDbgValueHistory(const MachineFunction *MF,
} else if (MO.isRegMask()) {
// If this is a register mask operand, clobber all debug values in
// non-CSRs.
- for (int I = ChangingRegs.find_first(); I != -1;
- I = ChangingRegs.find_next(I)) {
+ for (unsigned I : ChangingRegs.set_bits()) {
// Don't consider SP to be clobbered by register masks.
if (unsigned(I) != SP && TRI->isPhysicalRegister(I) &&
MO.clobbersPhysReg(I)) {
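
set_bits() above is a range-based replacement for the find_first/find_next idiom. A short sketch, assuming llvm/ADT/BitVector.h; both loops visit exactly the set bits, in ascending order:

    // Sketch of the old and new iteration styles over a BitVector.
    #include "llvm/ADT/BitVector.h"
    using namespace llvm;

    static unsigned sumSetBits(const BitVector &BV) {
      unsigned OldStyle = 0, NewStyle = 0;
      for (int I = BV.find_first(); I != -1; I = BV.find_next(I))
        OldStyle += I;
      for (unsigned I : BV.set_bits()) // Range-based equivalent.
        NewStyle += I;
      return OldStyle == NewStyle ? NewStyle : ~0u;
    }
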
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 9419098..0971c59 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -115,12 +115,35 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
return getBaseTypeSize(BaseType);
}
+static bool hasDebugInfo(const MachineModuleInfo *MMI,
+ const MachineFunction *MF) {
+ if (!MMI->hasDebugInfo())
+ return false;
+ auto *SP = MF->getFunction()->getSubprogram();
+ if (!SP)
+ return false;
+ assert(SP->getUnit());
+ auto EK = SP->getUnit()->getEmissionKind();
+ if (EK == DICompileUnit::NoDebug)
+ return false;
+ return true;
+}
+
void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
+ PrevInstBB = nullptr;
+
+ if (!Asm || !hasDebugInfo(MMI, MF)) {
+ skippedNonDebugFunction();
+ return;
+ }
+
// Grab the lexical scopes for the function, if we don't have any of those
// then we're not going to be able to do anything.
LScopes.initialize(*MF);
- if (LScopes.empty())
+ if (LScopes.empty()) {
+ beginFunctionImpl(MF);
return;
+ }
// Make sure that each lexical scope will have a begin/end label.
identifyScopeMarkers();
@@ -167,6 +190,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
PrevInstLoc = DebugLoc();
PrevLabel = Asm->getFunctionBegin();
+ beginFunctionImpl(MF);
}
void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
@@ -200,9 +224,9 @@ void DebugHandlerBase::endInstruction() {
return;
assert(CurMI != nullptr);
- // Don't create a new label after DBG_VALUE instructions.
- // They don't generate code.
- if (!CurMI->isDebugValue()) {
+ // Don't create a new label after DBG_VALUE and other instructions that don't
+ // generate code.
+ if (!CurMI->isMetaInstruction()) {
PrevLabel = nullptr;
PrevInstBB = CurMI->getParent();
}
@@ -228,6 +252,8 @@ void DebugHandlerBase::endInstruction() {
}
void DebugHandlerBase::endFunction(const MachineFunction *MF) {
+ if (hasDebugInfo(MMI, MF))
+ endFunctionImpl(MF);
DbgValues.clear();
LabelsBeforeInsn.clear();
LabelsAfterInsn.clear();
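
beginFunction/endFunction now centralize the has-debug-info test in the base class and delegate the real work to the *Impl hooks, i.e. the template-method pattern. A minimal standalone sketch (all names hypothetical except the *Impl convention):

    // Minimal sketch of the hook pattern introduced above: the base class
    // owns the "is there debug info?" check, subclasses implement the hooks.
    #include <cstdio>

    class HandlerBase {
    public:
      void beginFunction(bool HasDebugInfo) {
        if (!HasDebugInfo) {
          skippedNonDebugFunction();
          return;
        }
        beginFunctionImpl(); // Subclass work runs only on the debug path.
      }

    protected:
      virtual void beginFunctionImpl() = 0;
      virtual void skippedNonDebugFunction() {}
      virtual ~HandlerBase() = default;
    };

    class CodeViewLikeHandler : public HandlerBase {
      void beginFunctionImpl() override { std::printf("gathering debug info\n"); }
    };

    int main() {
      CodeViewLikeHandler H;
      H.beginFunction(/*HasDebugInfo=*/true);
    }
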
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
index c00fa18..659a921 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -80,6 +80,10 @@ protected:
LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
}
+ virtual void beginFunctionImpl(const MachineFunction *MF) = 0;
+ virtual void endFunctionImpl(const MachineFunction *MF) = 0;
+ virtual void skippedNonDebugFunction() {}
+
// AsmPrinterHandler overrides.
public:
void beginInstruction(const MachineInstr *MI) override;
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 36fb150..a68e8cc 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -76,7 +76,8 @@ public:
const DIExpression *getExpression() const { return Expression; }
friend bool operator==(const Value &, const Value &);
friend bool operator<(const Value &, const Value &);
- void dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const {
if (isLocation()) {
llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " ";
if (Loc.isIndirect())
@@ -90,6 +91,7 @@ public:
if (Expression)
Expression->dump();
}
+#endif
};
private:
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
index 3656e9d..0c551df 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
@@ -10,9 +10,9 @@
#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H
#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H
+#include "ByteStreamer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
-#include "ByteStreamer.h"
namespace llvm {
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 05ac1cb..b1ef8cf 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -16,12 +16,12 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index e08306b..dd7f793 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -14,6 +14,7 @@
#include "DwarfException.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -28,7 +29,6 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetFrameLowering.h"
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index d904372..676c48f 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1,3 +1,16 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing a dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
#include "DwarfCompileUnit.h"
#include "DwarfExpression.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -129,67 +142,72 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
bool addToAccelTable = false;
DIELoc *Loc = nullptr;
std::unique_ptr<DIEDwarfExpression> DwarfExpr;
- bool AllConstant = std::all_of(
- GlobalExprs.begin(), GlobalExprs.end(),
- [&](const GlobalExpr GE) {
- return GE.Expr && GE.Expr->isConstant();
- });
-
for (const auto &GE : GlobalExprs) {
const GlobalVariable *Global = GE.Var;
const DIExpression *Expr = GE.Expr;
+
// For compatibility with DWARF 3 and earlier,
// DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) becomes
// DW_AT_const_value(X).
if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) {
+ addToAccelTable = true;
addConstantValue(*VariableDIE, /*Unsigned=*/true, Expr->getElement(1));
- // We cannot describe the location of dllimport'd variables: the
- // computation of their address requires loads from the IAT.
- } else if ((Global && !Global->hasDLLImportStorageClass()) || AllConstant) {
- if (!Loc) {
- Loc = new (DIEValueAllocator) DIELoc;
- DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
- }
+ break;
+ }
+
+ // We cannot describe the location of dllimport'd variables: the
+ // computation of their address requires loads from the IAT.
+ if (Global && Global->hasDLLImportStorageClass())
+ continue;
+
+ // Nothing to describe without address or constant.
+ if (!Global && (!Expr || !Expr->isConstant()))
+ continue;
+
+ if (!Loc) {
addToAccelTable = true;
- if (Global) {
- const MCSymbol *Sym = Asm->getSymbol(Global);
- if (Global->isThreadLocal()) {
- if (Asm->TM.Options.EmulatedTLS) {
- // TODO: add debug info for emulated thread local mode.
- } else {
- // FIXME: Make this work with -gsplit-dwarf.
- unsigned PointerSize = Asm->getDataLayout().getPointerSize();
- assert((PointerSize == 4 || PointerSize == 8) &&
- "Add support for other sizes if necessary");
- // Based on GCC's support for TLS:
- if (!DD->useSplitDwarf()) {
-              // 1) Start with a constu of the appropriate pointer size
- addUInt(*Loc, dwarf::DW_FORM_data1,
- PointerSize == 4 ? dwarf::DW_OP_const4u
- : dwarf::DW_OP_const8u);
- // 2) containing the (relocated) offset of the TLS variable
- // within the module's TLS block.
- addExpr(*Loc, dwarf::DW_FORM_udata,
- Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
- } else {
- addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
- addUInt(*Loc, dwarf::DW_FORM_udata,
- DD->getAddressPool().getIndex(Sym, /* TLS */ true));
- }
- // 3) followed by an OP to make the debugger do a TLS lookup.
+ Loc = new (DIEValueAllocator) DIELoc;
+ DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
+ }
+
+ if (Global) {
+ const MCSymbol *Sym = Asm->getSymbol(Global);
+ if (Global->isThreadLocal()) {
+ if (Asm->TM.Options.EmulatedTLS) {
+ // TODO: add debug info for emulated thread local mode.
+ } else {
+ // FIXME: Make this work with -gsplit-dwarf.
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+ assert((PointerSize == 4 || PointerSize == 8) &&
+ "Add support for other sizes if necessary");
+ // Based on GCC's support for TLS:
+ if (!DD->useSplitDwarf()) {
+            // 1) Start with a constu of the appropriate pointer size
addUInt(*Loc, dwarf::DW_FORM_data1,
- DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
- : dwarf::DW_OP_form_tls_address);
+ PointerSize == 4 ? dwarf::DW_OP_const4u
+ : dwarf::DW_OP_const8u);
+ // 2) containing the (relocated) offset of the TLS variable
+ // within the module's TLS block.
+ addExpr(*Loc, dwarf::DW_FORM_udata,
+ Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+ } else {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+ addUInt(*Loc, dwarf::DW_FORM_udata,
+ DD->getAddressPool().getIndex(Sym, /* TLS */ true));
}
- } else {
- DD->addArangeLabel(SymbolCU(this, Sym));
- addOpAddress(*Loc, Sym);
+ // 3) followed by an OP to make the debugger do a TLS lookup.
+ addUInt(*Loc, dwarf::DW_FORM_data1,
+ DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
+ : dwarf::DW_OP_form_tls_address);
}
+ } else {
+ DD->addArangeLabel(SymbolCU(this, Sym));
+ addOpAddress(*Loc, Sym);
}
- if (Expr) {
- DwarfExpr->addFragmentOffset(Expr);
- DwarfExpr->AddExpression(Expr);
- }
+ }
+ if (Expr) {
+ DwarfExpr->addFragmentOffset(Expr);
+ DwarfExpr->addExpression(Expr);
}
}
if (Loc)
@@ -227,17 +245,6 @@ void DwarfCompileUnit::addRange(RangeSpan Range) {
CURanges.back().setEnd(Range.getEnd());
}
-DIE::value_iterator
-DwarfCompileUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
- const MCSymbol *Label, const MCSymbol *Sec) {
- if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
- return addLabel(Die, Attribute,
- DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
- : dwarf::DW_FORM_data4,
- Label);
- return addSectionDelta(Die, Attribute, Label, Sec);
-}
-
void DwarfCompileUnit::initStmtList() {
// Define start line table label for each Compile Unit.
MCSymbol *LineTableStartSym =
@@ -362,15 +369,6 @@ void DwarfCompileUnit::constructScopeDIE(
FinalChildren.push_back(std::move(ScopeDIE));
}
-DIE::value_iterator
-DwarfCompileUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
- const MCSymbol *Hi, const MCSymbol *Lo) {
- return Die.addValue(DIEValueAllocator, Attribute,
- DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
- : dwarf::DW_FORM_data4,
- new (DIEValueAllocator) DIEDelta(Hi, Lo));
-}
-
void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
SmallVector<RangeSpan, 2> Range) {
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
@@ -422,7 +420,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
auto *InlinedSP = getDISubprogram(DS);
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
- DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+ DIE *OriginDIE = getAbstractSPDies()[InlinedSP];
assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
@@ -507,8 +505,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
// If there is an expression, emit raw unsigned bytes.
DwarfExpr.addFragmentOffset(Expr);
- DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm());
- DwarfExpr.AddExpression(Expr);
+ DwarfExpr.addUnsignedConstant(DVInsn->getOperand(0).getImm());
+ DwarfExpr.addExpression(Expr);
addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
} else
addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
@@ -529,12 +527,19 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
for (auto &Fragment : DV.getFrameIndexExprs()) {
unsigned FrameReg = 0;
+ const DIExpression *Expr = Fragment.Expr;
const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg);
- DwarfExpr.addFragmentOffset(Fragment.Expr);
- DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
- FrameReg, Offset);
- DwarfExpr.AddExpression(Fragment.Expr);
+ DwarfExpr.addFragmentOffset(Expr);
+ SmallVector<uint64_t, 8> Ops;
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(Offset);
+ Ops.append(Expr->elements_begin(), Expr->elements_end());
+ DIExpressionCursor Cursor(Ops);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addMachineRegExpression(
+ *Asm->MF->getSubtarget().getRegisterInfo(), Cursor, FrameReg);
+ DwarfExpr.addExpression(std::move(Cursor));
}
addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
@@ -609,7 +614,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
LexicalScope *Scope) {
- DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+ DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()];
if (AbsDef)
return;
@@ -659,8 +664,9 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
else
EntityDie = getDIE(Entity);
assert(EntityDie);
- addSourceLine(*IMDie, Module->getLine(), Module->getScope()->getFilename(),
- Module->getScope()->getDirectory());
+ auto *File = Module->getFile();
+ addSourceLine(*IMDie, Module->getLine(), File ? File->getFilename() : "",
+ File ? File->getDirectory() : "");
addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie);
StringRef Name = Module->getName();
if (!Name.empty())
@@ -671,7 +677,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
DIE *D = getDIE(SP);
- if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+ if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) {
if (D)
// If this subprogram has an abstract definition, reference that
addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
@@ -683,6 +689,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
}
}
+void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) {
+ DbgVariable *AbsVar = getExistingAbstractVariable(
+ InlinedVariable(Var.getVariable(), Var.getInlinedAt()));
+ auto *VariableDie = Var.getDIE();
+ if (AbsVar && AbsVar->getDIE()) {
+ addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+ *AbsVar->getDIE());
+ } else
+ applyVariableAttributes(Var, *VariableDie);
+}
+
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) {
+ const DILocalVariable *Cleansed;
+ return getExistingAbstractVariable(IV, Cleansed);
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(
+ InlinedVariable IV, const DILocalVariable *&Cleansed) {
+ // More than one inlined variable corresponds to one abstract variable.
+ Cleansed = IV.first;
+ auto &AbstractVariables = getAbstractVariables();
+ auto I = AbstractVariables.find(Cleansed);
+ if (I != AbstractVariables.end())
+ return I->second.get();
+ return nullptr;
+}
+
+void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var,
+ LexicalScope *Scope) {
+ assert(Scope && Scope->isAbstractScope());
+ auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
+ DU->addScopeVariable(Scope, AbsDbgVariable.get());
+ getAbstractVariables()[Var] = std::move(AbsDbgVariable);
+}
+
void DwarfCompileUnit::emitHeader(bool UseOffsets) {
// Don't bother labeling the .dwo unit, as its offset isn't used.
if (!Skeleton) {
@@ -690,27 +732,54 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) {
Asm->OutStreamer->EmitLabel(LabelBegin);
}
- DwarfUnit::emitHeader(UseOffsets);
+ dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile
+ : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton
+ : dwarf::DW_UT_compile;
+ DwarfUnit::emitCommonHeader(UseOffsets, UT);
}
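Under DWARF 5 every unit header carries a unit type, and the ternary above distinguishes the three cases: a CU that owns a skeleton is the .dwo-side full unit, a split-DWARF CU without one is the skeleton itself, and otherwise it is a plain compile unit. A sketch of the same decision with assumed parameter names:

    #include "llvm/BinaryFormat/Dwarf.h"

    llvm::dwarf::UnitType unitTypeFor(bool OwnsSkeleton, bool UseSplitDwarf) {
      if (OwnsSkeleton)                    // this CU is the .dwo-side full unit
        return llvm::dwarf::DW_UT_split_compile;
      if (UseSplitDwarf)                   // no skeleton pointer: it *is* one
        return llvm::dwarf::DW_UT_skeleton;
      return llvm::dwarf::DW_UT_compile;   // ordinary, unsplit compile unit
    }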
/// addGlobalName - Add a new global name to the compile unit.
-void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
+void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die,
const DIScope *Context) {
- if (includeMinimalInlineScopes())
+ if (!DD->hasDwarfPubSections(includeMinimalInlineScopes()))
return;
std::string FullName = getParentContextString(Context) + Name.str();
GlobalNames[FullName] = &Die;
}
+void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name,
+ const DIScope *Context) {
+ if (!DD->hasDwarfPubSections(includeMinimalInlineScopes()))
+ return;
+ std::string FullName = getParentContextString(Context) + Name.str();
+ // Insert, allowing the entry to remain as-is if it's already present.
+ // This way the CU-level type DIE is preferred over the "can't describe this
+ // type as a unit offset because it's not really in the CU at all, it's only
+ // in a type unit".
+ GlobalNames.insert(std::make_pair(std::move(FullName), &getUnitDie()));
+}
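The use of insert() rather than operator[] here is load-bearing: llvm::StringMap::insert leaves an existing entry untouched, while addGlobalName's `GlobalNames[FullName] = &Die` always overwrites, so either ordering leaves the CU-level DIE in the map. A small illustration (CULevelDie and UnitDie are hypothetical placeholders):

    #include <cassert>
    #include "llvm/ADT/StringMap.h"
    #include "llvm/CodeGen/DIE.h"

    void illustrate(const llvm::DIE *CULevelDie, const llvm::DIE *UnitDie) {
      llvm::StringMap<const llvm::DIE *> Names;
      Names.insert({"ns::T", UnitDie}); // type-unit fallback: no-op if present
      Names["ns::T"] = CULevelDie;      // addGlobalName path: always overwrites
      assert(Names.lookup("ns::T") == CULevelDie); // CU-level DIE wins
    }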
+
/// Add a new global type to the unit.
void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
const DIScope *Context) {
- if (includeMinimalInlineScopes())
+ if (!DD->hasDwarfPubSections(includeMinimalInlineScopes()))
return;
std::string FullName = getParentContextString(Context) + Ty->getName().str();
GlobalTypes[FullName] = &Die;
}
+void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty,
+ const DIScope *Context) {
+ if (!DD->hasDwarfPubSections(includeMinimalInlineScopes()))
+ return;
+ std::string FullName = getParentContextString(Context) + Ty->getName().str();
+ // Insert, allowing the entry to remain as-is if it's already present.
+ // This way the CU-level type DIE is preferred over the "can't describe this
+ // type as a unit offset because it's not really in the CU at all, it's only
+ // in a type unit".
+ GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie()));
+}
+
/// addVariableAddress - Add DW_AT_location attribute for a
/// DbgVariable based on provided MachineLocation.
void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
@@ -727,22 +796,23 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
const MachineLocation &Location) {
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
- DIEDwarfExpression Expr(*Asm, *this, *Loc);
-
- bool validReg;
- if (Location.isReg())
- validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg());
- else
- validReg =
- Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg(), Location.getOffset());
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ if (Location.isIndirect())
+ DwarfExpr.setMemoryLocationKind();
- if (!validReg)
+ SmallVector<uint64_t, 8> Ops;
+ if (Location.isIndirect() && Location.getOffset()) {
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(Location.getOffset());
+ }
+ DIExpressionCursor Cursor(Ops);
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
return;
+ DwarfExpr.addExpression(std::move(Cursor));
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, Expr.finalize());
+ addBlock(Die, Attribute, DwarfExpr.finalize());
}
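The rewritten addAddress no longer special-cases indirect locations: it prepends a DW_OP_plus_uconst to the expression stream, marks the expression as a memory location, and lets addMachineRegExpression fold the offset into a base-register form (DW_OP_fbreg or DW_OP_breg*). A standalone sketch of the prefix that gets fed into the cursor, with 0x23 being DW_OP_plus_uconst:

    #include <cstdint>
    #include <vector>

    std::vector<uint64_t> indirectPrefix(int64_t Offset) {
      std::vector<uint64_t> Ops;
      if (Offset) {                  // mirrors isIndirect() && getOffset()
        Ops.push_back(0x23);         // DW_OP_plus_uconst
        Ops.push_back(static_cast<uint64_t>(Offset));
      }
      return Ops;                    // consumed via DIExpressionCursor
    }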
/// Start with the address based on the location provided, and generate the
@@ -754,23 +824,25 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
const MachineLocation &Location) {
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
- const DIExpression *Expr = DV.getSingleExpression();
- DIExpressionCursor ExprCursor(Expr);
+ const DIExpression *DIExpr = DV.getSingleExpression();
+ DwarfExpr.addFragmentOffset(DIExpr);
+ if (Location.isIndirect())
+ DwarfExpr.setMemoryLocationKind();
+
+ SmallVector<uint64_t, 8> Ops;
+ if (Location.isIndirect() && Location.getOffset()) {
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(Location.getOffset());
+ }
+ Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ DIExpressionCursor Cursor(Ops);
const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
- auto Reg = Location.getReg();
- DwarfExpr.addFragmentOffset(Expr);
- bool ValidReg =
- Location.getOffset()
- ? DwarfExpr.AddMachineRegIndirect(TRI, Reg, Location.getOffset())
- : DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Reg);
-
- if (!ValidReg)
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
return;
-
- DwarfExpr.AddExpression(std::move(ExprCursor));
+ DwarfExpr.addExpression(std::move(Cursor));
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, Loc);
+ addBlock(Die, Attribute, DwarfExpr.finalize());
}
/// Add a Dwarf loclistptr attribute data and value.
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index a8025f1..e386727 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,8 +15,8 @@
#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
#include "DwarfUnit.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/DebugInfo.h"
-#include "llvm/Support/Dwarf.h"
namespace llvm {
@@ -28,7 +28,7 @@ class DwarfFile;
class MCSymbol;
class LexicalScope;
-class DwarfCompileUnit : public DwarfUnit {
+class DwarfCompileUnit final : public DwarfUnit {
/// A numeric ID unique among all CUs in the module
unsigned UniqueID;
@@ -68,13 +68,26 @@ class DwarfCompileUnit : public DwarfUnit {
// ranges/locs.
const MCSymbol *BaseAddress;
+ DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
+
/// \brief Construct a DIE for the given DbgVariable without initializing the
/// DbgVariable's DIE reference.
DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
bool isDwoUnit() const override;
- bool includeMinimalInlineScopes() const;
+ DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractSPDies;
+ return DU->getAbstractSPDies();
+ }
+
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &
+ getAbstractVariables() {
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return AbstractVariables;
+ return DU->getAbstractVariables();
+ }
public:
DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
@@ -86,6 +99,8 @@ public:
return Skeleton;
}
+ bool includeMinimalInlineScopes() const;
+
void initStmtList();
/// Apply the DW_AT_stmt_list from this compile unit to the specified DIE.
@@ -112,10 +127,6 @@ public:
void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute,
const MCSymbol *Label);
- /// addSectionDelta - Add a label delta attribute data and value.
- DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
- const MCSymbol *Hi, const MCSymbol *Lo);
-
DwarfCompileUnit &getCU() override { return *this; }
unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override;
@@ -136,12 +147,6 @@ public:
void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End);
- /// addSectionLabel - Add a Dwarf section label attribute data and value.
- ///
- DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
- const MCSymbol *Label,
- const MCSymbol *Sec);
-
/// \brief Find DIE for the given subprogram and attach appropriate
/// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
/// variables in this scope then create and insert DIEs for these
@@ -189,6 +194,13 @@ public:
DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
void finishSubprogramDefinition(const DISubprogram *SP);
+ void finishVariableDefinition(const DbgVariable &Var);
+ /// Find abstract variable associated with Var.
+ typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
+ const DILocalVariable *&Cleansed);
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
+ void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
/// Set the skeleton unit associated with this unit.
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
@@ -210,12 +222,19 @@ public:
}
/// Add a new global name to the compile unit.
- void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override;
+ void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) override;
+
+ /// Add a new global name present in a type unit to this compile unit.
+ void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context);
/// Add a new global type to the compile unit.
void addGlobalType(const DIType *Ty, const DIE &Die,
const DIScope *Context) override;
+ /// Add a new global type present in a type unit to this compile unit.
+ void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context);
+
const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }
const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; }
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 91a3d09..f1b4d9f 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/DIE.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -38,8 +39,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
@@ -72,6 +71,10 @@ static cl::opt<bool> GenerateARangeSection("generate-arange-section",
cl::desc("Generate dwarf aranges"),
cl::init(false));
+static cl::opt<bool> SplitDwarfCrossCuReferences(
+ "split-dwarf-cross-cu-references", cl::Hidden,
+ cl::desc("Enable cross-cu references in DWO files"), cl::init(false));
+
namespace {
enum DefaultOnOff { Default, Enable, Disable };
}
@@ -92,14 +95,6 @@ DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
cl::init(Default));
static cl::opt<DefaultOnOff>
-SplitDwarf("split-dwarf", cl::Hidden,
- cl::desc("Output DWARF5 split debug info."),
- cl::values(clEnumVal(Default, "Default for platform"),
- clEnumVal(Enable, "Enabled"),
- clEnumVal(Disable, "Disabled")),
- cl::init(Default));
-
-static cl::opt<DefaultOnOff>
DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden,
cl::desc("Generate DWARF pubnames and pubtypes sections"),
cl::values(clEnumVal(Default, "Default for platform"),
@@ -127,17 +122,17 @@ static const char *const DWARFGroupDescription = "DWARF Emission";
static const char *const DbgTimerName = "writer";
static const char *const DbgTimerDescription = "DWARF Debug Writer";
-void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) {
+void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) {
BS.EmitInt8(
Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
: dwarf::OperationEncodingString(Op));
}
-void DebugLocDwarfExpression::EmitSigned(int64_t Value) {
+void DebugLocDwarfExpression::emitSigned(int64_t Value) {
BS.EmitSLEB128(Value, Twine(Value));
}
-void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) {
+void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) {
BS.EmitULEB128(Value, Twine(Value));
}
@@ -200,6 +195,12 @@ const DIType *DbgVariable::getType() const {
}
ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const {
+ if (FrameIndexExprs.size() == 1)
+ return FrameIndexExprs;
+
+ assert(all_of(FrameIndexExprs,
+ [](const FrameIndexExpr &A) { return A.Expr->isFragment(); }) &&
+ "multiple FI expressions without DW_OP_LLVM_fragment");
std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(),
[](const FrameIndexExpr &A, const FrameIndexExpr &B) -> bool {
return A.Expr->getFragmentInfo()->OffsetInBits <
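getFrameIndexExprs now asserts that multiple frame-index expressions are all fragments and sorts them by bit offset, so pieces are emitted low-to-high. A self-contained sketch of that ordering invariant, with Frag standing in for FrameIndexExpr:

    #include <algorithm>
    #include <vector>

    struct Frag { unsigned OffsetInBits; int FI; };

    void sortByFragmentOffset(std::vector<Frag> &Fragments) {
      std::sort(Fragments.begin(), Fragments.end(),
                [](const Frag &A, const Frag &B) {
                  return A.OffsetInBits < B.OffsetInBits; // low piece first
                });
    }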
@@ -248,17 +249,8 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
HasAppleExtensionAttributes = tuneForLLDB();
- // Handle split DWARF. Off by default for now.
- if (SplitDwarf == Default)
- HasSplitDwarf = false;
- else
- HasSplitDwarf = SplitDwarf == Enable;
-
- // Pubnames/pubtypes on by default for GDB.
- if (DwarfPubSections == Default)
- HasDwarfPubSections = tuneForGDB();
- else
- HasDwarfPubSections = DwarfPubSections == Enable;
+ // Handle split DWARF.
+ HasSplitDwarf = !Asm->TM.Options.MCOptions.SplitDwarfFile.empty();
// SCE defaults to linkage names only for abstract subprograms.
if (DwarfLinkageNames == DefaultLinkageNames)
@@ -368,25 +360,49 @@ template <typename Func> static void forBothCUs(DwarfCompileUnit &CU, Func F) {
F(*SkelCU);
}
-void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
+bool DwarfDebug::shareAcrossDWOCUs() const {
+ return SplitDwarfCrossCuReferences;
+}
+
+void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
+ LexicalScope *Scope) {
assert(Scope && Scope->getScopeNode());
assert(Scope->isAbstractScope());
assert(!Scope->getInlinedAt());
auto *SP = cast<DISubprogram>(Scope->getScopeNode());
- ProcessedSPNodes.insert(SP);
-
// Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
// was inlined from another compile unit.
- auto &CU = *CUMap.lookup(SP->getUnit());
- forBothCUs(CU, [&](DwarfCompileUnit &CU) {
- CU.constructAbstractSubprogramScopeDIE(Scope);
- });
+ if (useSplitDwarf() && !shareAcrossDWOCUs() && !SP->getUnit()->getSplitDebugInlining())
+ // Avoid building the original CU if it won't be used
+ SrcCU.constructAbstractSubprogramScopeDIE(Scope);
+ else {
+ auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
+ if (auto *SkelCU = CU.getSkeleton()) {
+ (shareAcrossDWOCUs() ? CU : SrcCU)
+ .constructAbstractSubprogramScopeDIE(Scope);
+ if (CU.getCUNode()->getSplitDebugInlining())
+ SkelCU->constructAbstractSubprogramScopeDIE(Scope);
+ } else
+ CU.constructAbstractSubprogramScopeDIE(Scope);
+ }
+}
+
+bool DwarfDebug::hasDwarfPubSections(bool includeMinimalInlineScopes) const {
+ // Opting in to GNU Pubnames/types overrides the default to ensure these are
+ // generated for things like Gold's gdb_index generation.
+ if (GenerateGnuPubSections)
+ return true;
+
+ if (DwarfPubSections == Default)
+ return tuneForGDB() && !includeMinimalInlineScopes;
+
+ return DwarfPubSections == Enable;
}
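hasDwarfPubSections centralizes what used to be a constructor-time flag: an explicit GNU pubsections opt-in always wins, otherwise the command-line tri-state decides, defaulting to "tuning for GDB and not a minimal (-gmlt-style) scope". A compact sketch of the same precedence with assumed names:

    // Tri-state mirrors the DefaultOnOff cl::opt: -1 default, 0 off, 1 on.
    bool pubSections(bool GnuPubOptIn, int PubFlag, bool TuneForGDB,
                     bool MinimalInlineScopes) {
      if (GnuPubOptIn)
        return true;                       // e.g. for Gold's gdb_index
      if (PubFlag < 0)
        return TuneForGDB && !MinimalInlineScopes;
      return PubFlag > 0;
    }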
-void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const {
- if (!GenerateGnuPubSections)
+void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const {
+ if (!hasDwarfPubSections(U.includeMinimalInlineScopes()))
return;
U.addFlag(D, dwarf::DW_AT_GNU_pubnames);
@@ -395,7 +411,9 @@ void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const {
// Create new DwarfCompileUnit for the given metadata node with tag
// DW_TAG_compile_unit.
DwarfCompileUnit &
-DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
+DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
+ if (auto *CU = CUMap.lookup(DIUnit))
+ return *CU;
StringRef FN = DIUnit->getFilename();
CompilationDir = DIUnit->getDirectory();
@@ -407,7 +425,7 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
if (useSplitDwarf()) {
NewCU.setSkeleton(constructSkeletonCU(NewCU));
NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name,
- DIUnit->getSplitDebugFilename());
+ Asm->TM.Options.MCOptions.SplitDwarfFile);
}
// LTO with assembly output shares a single line table amongst multiple CUs.
@@ -418,7 +436,14 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
Asm->OutStreamer->getContext().setMCLineTableCompilationDir(
NewCU.getUniqueID(), CompilationDir);
- NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer());
+ StringRef Producer = DIUnit->getProducer();
+ StringRef Flags = DIUnit->getFlags();
+ if (!Flags.empty()) {
+ std::string ProducerWithFlags = Producer.str() + " " + Flags.str();
+ NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags);
+ } else
+ NewCU.addString(Die, dwarf::DW_AT_producer, Producer);
+
NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
DIUnit->getSourceLanguage());
NewCU.addString(Die, dwarf::DW_AT_name, FN);
@@ -521,7 +546,12 @@ void DwarfDebug::beginModule() {
}
for (DICompileUnit *CUNode : M->debug_compile_units()) {
- DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode);
+ if (CUNode->getEnumTypes().empty() && CUNode->getRetainedTypes().empty() &&
+ CUNode->getGlobalVariables().empty() &&
+ CUNode->getImportedEntities().empty() && CUNode->getMacros().empty())
+ continue;
+
+ DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(CUNode);
for (auto *IE : CUNode->getImportedEntities())
CU.addImportedEntity(IE);
@@ -544,7 +574,6 @@ void DwarfDebug::beginModule() {
// The retained types array by design contains pointers to
// MDNodes rather than DIRefs. Unique them here.
if (DIType *RT = dyn_cast<DIType>(Ty))
- if (!RT->isExternalTypeRef())
// There is no point in force-emitting a forward declaration.
CU.getOrCreateTypeDIE(RT);
}
@@ -564,22 +593,17 @@ void DwarfDebug::finishVariableDefinitions() {
// DIE::getUnit isn't simple - it walks parent pointers, etc.
DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie());
assert(Unit);
- DbgVariable *AbsVar = getExistingAbstractVariable(
- InlinedVariable(Var->getVariable(), Var->getInlinedAt()));
- if (AbsVar && AbsVar->getDIE()) {
- Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
- *AbsVar->getDIE());
- } else
- Unit->applyVariableAttributes(*Var, *VariableDie);
+ Unit->finishVariableDefinition(*Var);
}
}
void DwarfDebug::finishSubprogramDefinitions() {
- for (const DISubprogram *SP : ProcessedSPNodes)
- if (SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug)
- forBothCUs(*CUMap.lookup(SP->getUnit()), [&](DwarfCompileUnit &CU) {
- CU.finishSubprogramDefinition(SP);
- });
+ for (const DISubprogram *SP : ProcessedSPNodes) {
+ assert(SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug);
+ forBothCUs(
+ getOrCreateDwarfCompileUnit(SP->getUnit()),
+ [&](DwarfCompileUnit &CU) { CU.finishSubprogramDefinition(SP); });
+ }
}
void DwarfDebug::finalizeModuleInfo() {
@@ -589,6 +613,13 @@ void DwarfDebug::finalizeModuleInfo() {
finishVariableDefinitions();
+ // Include the DWO file name in the hash if there's more than one CU.
+ // This handles ThinLTO's situation where imported CUs may very easily be
+ // duplicates of the same CU partially imported into another ThinLTO unit.
+ StringRef DWOName;
+ if (CUMap.size() > 1)
+ DWOName = Asm->TM.Options.MCOptions.SplitDwarfFile;
+
// Handle anything that needs to be done on a per-unit basis after
// all other generation.
for (const auto &P : CUMap) {
@@ -603,7 +634,8 @@ void DwarfDebug::finalizeModuleInfo() {
auto *SkCU = TheCU.getSkeleton();
if (useSplitDwarf()) {
// Emit a unique identifier for this CU.
- uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie());
+ uint64_t ID =
+ DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie());
TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
dwarf::DW_FORM_data8, ID);
SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
@@ -712,63 +744,40 @@ void DwarfDebug::endModule() {
}
// Emit the pubnames and pubtypes sections if requested.
- if (HasDwarfPubSections) {
+ // The condition is optimistically correct - any CU not using GMLT (&
+ // implicit/default pubnames state) might still have pubnames.
+ if (hasDwarfPubSections(/* gmlt */ false)) {
emitDebugPubNames(GenerateGnuPubSections);
emitDebugPubTypes(GenerateGnuPubSections);
}
// clean up.
- AbstractVariables.clear();
-}
-
-// Find abstract variable, if any, associated with Var.
-DbgVariable *
-DwarfDebug::getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed) {
- // More then one inlined variable corresponds to one abstract variable.
- Cleansed = IV.first;
- auto I = AbstractVariables.find(Cleansed);
- if (I != AbstractVariables.end())
- return I->second.get();
- return nullptr;
+ // FIXME: AbstractVariables.clear();
}
-DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
- const DILocalVariable *Cleansed;
- return getExistingAbstractVariable(IV, Cleansed);
-}
-
-void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
- LexicalScope *Scope) {
- auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
- InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
- AbstractVariables[Var] = std::move(AbsDbgVariable);
-}
-
-void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV,
+void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV,
const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
- createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
+ CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
cast<DILocalScope>(ScopeNode)));
}
-void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(
+void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU,
InlinedVariable IV, const MDNode *ScopeNode) {
const DILocalVariable *Cleansed = nullptr;
- if (getExistingAbstractVariable(IV, Cleansed))
+ if (CU.getExistingAbstractVariable(IV, Cleansed))
return;
if (LexicalScope *Scope =
LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode)))
- createAbstractVariable(Cleansed, Scope);
+ CU.createAbstractVariable(Cleansed, Scope);
}
-
// Collect variable information from side table maintained by MF.
void DwarfDebug::collectVariableInfoFromMFTable(
- DenseSet<InlinedVariable> &Processed) {
+ DwarfCompileUnit &TheCU, DenseSet<InlinedVariable> &Processed) {
for (const auto &VI : Asm->MF->getVariableDbgInfo()) {
if (!VI.Var)
continue;
@@ -783,7 +792,7 @@ void DwarfDebug::collectVariableInfoFromMFTable(
if (!Scope)
continue;
- ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode());
auto RegVar = make_unique<DbgVariable>(Var.first, Var.second);
RegVar->initializeMMI(VI.Expr, VI.Slot);
if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
@@ -954,24 +963,71 @@ DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
}
}
-DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope,
+DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope,
InlinedVariable IV) {
- ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode());
+ ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode());
ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second));
InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get());
return ConcreteVariables.back().get();
}
-// Determine whether this DBG_VALUE is valid at the beginning of the function.
-static bool validAtEntry(const MachineInstr *MInsn) {
- auto MBB = MInsn->getParent();
- // Is it in the entry basic block?
- if (!MBB->pred_empty())
+/// Determine whether a *singular* DBG_VALUE is valid for the entirety of its
+/// enclosing lexical scope. The check ensures there are no other instructions
+/// in the same lexical scope preceding the DBG_VALUE and that its range is
+/// either open or otherwise rolls off the end of the scope.
+static bool validThroughout(LexicalScopes &LScopes,
+ const MachineInstr *DbgValue,
+ const MachineInstr *RangeEnd) {
+ assert(DbgValue->getDebugLoc() && "DBG_VALUE without a debug location");
+ auto MBB = DbgValue->getParent();
+ auto DL = DbgValue->getDebugLoc();
+ auto *LScope = LScopes.findLexicalScope(DL);
+ // Scope doesn't exist; this is a dead DBG_VALUE.
+ if (!LScope)
return false;
- for (MachineBasicBlock::const_reverse_iterator I(MInsn); I != MBB->rend(); ++I)
- if (!(I->isDebugValue() || I->getFlag(MachineInstr::FrameSetup)))
+ auto &LSRange = LScope->getRanges();
+ if (LSRange.size() == 0)
+ return false;
+
+ // Determine if the DBG_VALUE is valid at the beginning of its lexical block.
+ const MachineInstr *LScopeBegin = LSRange.front().first;
+ // Early exit if the lexical scope begins outside of the current block.
+ if (LScopeBegin->getParent() != MBB)
+ return false;
+ MachineBasicBlock::const_reverse_iterator Pred(DbgValue);
+ for (++Pred; Pred != MBB->rend(); ++Pred) {
+ if (Pred->getFlag(MachineInstr::FrameSetup))
+ break;
+ auto PredDL = Pred->getDebugLoc();
+ if (!PredDL || Pred->isMetaInstruction())
+ continue;
+ // Check whether the instruction preceding the DBG_VALUE is in the same
+ // (sub)scope as the DBG_VALUE.
+ if (DL->getScope() == PredDL->getScope())
return false;
- return true;
+ auto *PredScope = LScopes.findLexicalScope(PredDL);
+ if (!PredScope || LScope->dominates(PredScope))
+ return false;
+ }
+
+ // If the range of the DBG_VALUE is open-ended, report success.
+ if (!RangeEnd)
+ return true;
+
+ // Fail if there are instructions belonging to our scope in another block.
+ const MachineInstr *LScopeEnd = LSRange.back().second;
+ if (LScopeEnd->getParent() != MBB)
+ return false;
+
+ // Single, constant DBG_VALUEs in the prologue are promoted to be live
+ // throughout the function. This is a hack, presumably for DWARF v2 and not
+ // necessarily correct. It would be much better to use a dbg.declare instead
+ // if we know the constant is live throughout the scope.
+ if (DbgValue->getOperand(0).isImm() && MBB->pred_empty())
+ return true;
+
+ return false;
}
// Find variables for each lexical scope.
@@ -979,7 +1035,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
const DISubprogram *SP,
DenseSet<InlinedVariable> &Processed) {
// Grab the variable info that was squirreled away in the MMI side-table.
- collectVariableInfoFromMFTable(Processed);
+ collectVariableInfoFromMFTable(TheCU, Processed);
for (const auto &I : DbgValues) {
InlinedVariable IV = I.first;
@@ -1001,16 +1057,14 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
continue;
Processed.insert(IV);
- DbgVariable *RegVar = createConcreteVariable(*Scope, IV);
+ DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV);
const MachineInstr *MInsn = Ranges.front().first;
assert(MInsn->isDebugValue() && "History must begin with debug value");
- // Check if there is a single DBG_VALUE, valid throughout the function.
- // A single constant is also considered valid for the entire function.
+ // Check if there is a single DBG_VALUE, valid throughout the var's scope.
if (Ranges.size() == 1 &&
- (MInsn->getOperand(0).isImm() ||
- (validAtEntry(MInsn) && Ranges.front().second == nullptr))) {
+ validThroughout(LScopes, MInsn, Ranges.front().second)) {
RegVar->initializeDbgValue(MInsn);
continue;
}
@@ -1037,7 +1091,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
for (const DILocalVariable *DV : SP->getVariables()) {
if (Processed.insert(InlinedVariable(DV, nullptr)).second)
if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope()))
- createConcreteVariable(*Scope, InlinedVariable(DV, nullptr));
+ createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr));
}
}
@@ -1046,8 +1100,12 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
DebugHandlerBase::beginInstruction(MI);
assert(CurMI);
+ const auto *SP = MI->getParent()->getParent()->getFunction()->getSubprogram();
+ if (!SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
+ return;
+
// Check if source location changes, but ignore DBG_VALUE and CFI locations.
- if (MI->isDebugValue() || MI->isCFIInstruction())
+ if (MI->isMetaInstruction())
return;
const DebugLoc &DL = MI->getDebugLoc();
// When we emit a line-0 record, we don't update PrevInstLoc; so look at
@@ -1129,7 +1187,7 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
// the beginning of the function body.
for (const auto &MBB : *MF)
for (const auto &MI : MBB)
- if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) &&
+ if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) &&
MI.getDebugLoc())
return MI.getDebugLoc();
return DebugLoc();
@@ -1137,75 +1195,50 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
// Gather pre-function debug information. Assumes being called immediately
// after the function entry point has been emitted.
-void DwarfDebug::beginFunction(const MachineFunction *MF) {
+void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
CurFn = MF;
- // If there's no debug info for the function we're not going to do anything.
- if (!MMI->hasDebugInfo())
+ auto *SP = MF->getFunction()->getSubprogram();
+ assert(LScopes.empty() || SP == LScopes.getCurrentFunctionScope()->getScopeNode());
+ if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
return;
- auto DI = MF->getFunction()->getSubprogram();
- if (!DI)
- return;
-
- // Grab the lexical scopes for the function, if we don't have any of those
- // then we're not going to be able to do anything.
- DebugHandlerBase::beginFunction(MF);
- if (LScopes.empty())
- return;
+ DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
// Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function
// belongs to so that we add to the correct per-cu line table in the
// non-asm case.
- LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
- // FnScope->getScopeNode() and DI->second should represent the same function,
- // though they may not be the same MDNode due to inline functions merged in
- // LTO where the debug info metadata still differs (either due to distinct
- // written differences - two versions of a linkonce_odr function
- // written/copied into two separate files, or some sub-optimal metadata that
- // isn't structurally identical (see: file path/name info from clang, which
- // includes the directory of the cpp file being built, even when the file name
- // is absolute (such as an <> lookup header)))
- auto *SP = cast<DISubprogram>(FnScope->getScopeNode());
- DwarfCompileUnit *TheCU = CUMap.lookup(SP->getUnit());
- if (!TheCU) {
- assert(SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug &&
- "DICompileUnit missing from llvm.dbg.cu?");
- return;
- }
if (Asm->OutStreamer->hasRawTextSupport())
// Use a single line table if we are generating assembly.
Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
else
- Asm->OutStreamer->getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
+ Asm->OutStreamer->getContext().setDwarfCompileUnitID(CU.getUniqueID());
// Record beginning of function.
PrologEndLoc = findPrologueEndLoc(MF);
- if (DILocation *L = PrologEndLoc) {
+ if (PrologEndLoc) {
// We'd like to list the prologue as "not statements" but GDB behaves
// poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
- auto *SP = L->getInlinedAtScope()->getSubprogram();
+ auto *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram();
recordSourceLine(SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT);
}
}
+void DwarfDebug::skippedNonDebugFunction() {
+ // If we don't have a subprogram for this function then there will be a hole
+ // in the range information. Keep note of this by setting the previously used
+ // section to nullptr.
+ PrevCU = nullptr;
+ CurFn = nullptr;
+}
+
// Gather and emit post-function debug information.
-void DwarfDebug::endFunction(const MachineFunction *MF) {
+void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
+ const DISubprogram *SP = MF->getFunction()->getSubprogram();
+
assert(CurFn == MF &&
"endFunction should be called with the same function as beginFunction");
- const DISubprogram *SP = MF->getFunction()->getSubprogram();
- if (!MMI->hasDebugInfo() || !SP ||
- SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) {
- // If we don't have a subprogram for this function then there will be a hole
- // in the range information. Keep note of this by setting the previously
- // used section to nullptr.
- PrevCU = nullptr;
- CurFn = nullptr;
- DebugHandlerBase::endFunction(MF);
- return;
- }
-
// Set DwarfDwarfCompileUnitID in MCContext to default value.
Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
@@ -1220,17 +1253,14 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd()));
// Under -gmlt, skip building the subprogram if there are no inlined
- // subroutines inside it.
- if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
+ // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram
+ // is still needed as we need its source location.
+ if (!TheCU.getCUNode()->getDebugInfoForProfiling() &&
+ TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
LScopes.getAbstractScopesList().empty() && !IsDarwin) {
assert(InfoHolder.getScopeVariables().empty());
- assert(DbgValues.empty());
- // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed
- // by a -gmlt CU. Add a test and remove this assertion.
- assert(AbstractVariables.empty());
PrevLabel = nullptr;
CurFn = nullptr;
- DebugHandlerBase::endFunction(MF);
return;
}
@@ -1244,12 +1274,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
for (const DILocalVariable *DV : SP->getVariables()) {
if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
continue;
- ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr),
+ ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr),
DV->getScope());
assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
&& "ensureAbstractVariableIsCreated inserted abstract scopes");
}
- constructAbstractSubprogramScopeDIE(AScope);
+ constructAbstractSubprogramScopeDIE(TheCU, AScope);
}
ProcessedSPNodes.insert(SP);
@@ -1266,7 +1296,6 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
InfoHolder.getScopeVariables().clear();
PrevLabel = nullptr;
CurFn = nullptr;
- DebugHandlerBase::endFunction(MF);
}
// Register a source line with debug info. Returns the unique label that was
@@ -1361,6 +1390,18 @@ void DwarfDebug::emitAccelTypes() {
/// computeIndexValue - Compute the gdb index value for the DIE and CU.
static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
const DIE *Die) {
+ // Entities that ended up only in a Type Unit reference the CU instead (since
+ // the pub entry has offsets within the CU there's no real offset that can be
+ // provided anyway). As it happens all such entities (namespaces and types,
+ // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out
+ // not to be true it would be necessary to persist this information from the
+ // point at which the entry is added to the index data structure - since by
+ // the time the index is built from that, the original type/namespace DIE in a
+ // type unit has already been destroyed so it can't be queried for properties
+ // like tag, etc.
+ if (Die->getTag() == dwarf::DW_TAG_compile_unit)
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE,
+ dwarf::GIEL_EXTERNAL);
dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
// We could have a specification DIE that has most of our knowledge,
@@ -1418,7 +1459,7 @@ void DwarfDebug::emitDebugPubSection(
const auto &Globals = (TheU->*Accessor)();
- if (Globals.empty())
+ if (!hasDwarfPubSections(TheU->includeMinimalInlineScopes()))
continue;
if (auto *Skeleton = TheU->getSkeleton())
@@ -1498,27 +1539,36 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
ByteStreamer &Streamer,
const DebugLocEntry::Value &Value,
DwarfExpression &DwarfExpr) {
- DIExpressionCursor ExprCursor(Value.getExpression());
- DwarfExpr.addFragmentOffset(Value.getExpression());
+ auto *DIExpr = Value.getExpression();
+ DIExpressionCursor ExprCursor(DIExpr);
+ DwarfExpr.addFragmentOffset(DIExpr);
// Regular entry.
if (Value.isInt()) {
if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
BT->getEncoding() == dwarf::DW_ATE_signed_char))
- DwarfExpr.AddSignedConstant(Value.getInt());
+ DwarfExpr.addSignedConstant(Value.getInt());
else
- DwarfExpr.AddUnsignedConstant(Value.getInt());
+ DwarfExpr.addUnsignedConstant(Value.getInt());
} else if (Value.isLocation()) {
- MachineLocation Loc = Value.getLoc();
+ MachineLocation Location = Value.getLoc();
+ if (Location.isIndirect())
+ DwarfExpr.setMemoryLocationKind();
+ SmallVector<uint64_t, 8> Ops;
+ if (Location.isIndirect() && Location.getOffset()) {
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(Location.getOffset());
+ }
+ Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ DIExpressionCursor Cursor(Ops);
const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
- if (Loc.getOffset())
- DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset());
- else
- DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg());
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+ return;
+ return DwarfExpr.addExpression(std::move(Cursor));
} else if (Value.isConstantFP()) {
APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
- DwarfExpr.AddUnsignedConstant(RawBytes);
+ DwarfExpr.addUnsignedConstant(RawBytes);
}
- DwarfExpr.AddExpression(std::move(ExprCursor));
+ DwarfExpr.addExpression(std::move(ExprCursor));
}
void DebugLocEntry::finalize(const AsmPrinter &AP,
@@ -1558,10 +1608,13 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) {
// Emit locations into the debug loc section.
void DwarfDebug::emitDebugLoc() {
+ if (DebugLocs.getLists().empty())
+ return;
+
// Start the dwarf loc section.
Asm->OutStreamer->SwitchSection(
Asm->getObjFileLowering().getDwarfLocSection());
- unsigned char Size = Asm->getDataLayout().getPointerSize();
+ unsigned char Size = Asm->MAI->getCodePointerSize();
for (const auto &List : DebugLocs.getLists()) {
Asm->OutStreamer->EmitLabel(List.Label);
const DwarfCompileUnit *CU = List.CU;
@@ -1691,7 +1744,7 @@ void DwarfDebug::emitDebugARanges() {
Asm->OutStreamer->SwitchSection(
Asm->getObjFileLowering().getDwarfARangesSection());
- unsigned PtrSize = Asm->getDataLayout().getPointerSize();
+ unsigned PtrSize = Asm->MAI->getCodePointerSize();
// Build a list of CUs used.
std::vector<DwarfCompileUnit *> CUs;
@@ -1769,12 +1822,15 @@ void DwarfDebug::emitDebugARanges() {
/// Emit address ranges into a debug ranges section.
void DwarfDebug::emitDebugRanges() {
+ if (CUMap.empty())
+ return;
+
// Start the dwarf ranges section.
Asm->OutStreamer->SwitchSection(
Asm->getObjFileLowering().getDwarfRangesSection());
// Size for our labels.
- unsigned char Size = Asm->getDataLayout().getPointerSize();
+ unsigned char Size = Asm->MAI->getCodePointerSize();
// Grab the specific ranges for the compile units in the module.
for (const auto &I : CUMap) {
@@ -1848,6 +1904,9 @@ void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) {
/// Emit macros into a debug macinfo section.
void DwarfDebug::emitDebugMacinfo() {
+ if (CUMap.empty())
+ return;
+
// Start the dwarf macinfo section.
Asm->OutStreamer->SwitchSection(
Asm->getObjFileLowering().getDwarfMacinfoSection());
@@ -1869,7 +1928,7 @@ void DwarfDebug::emitDebugMacinfo() {
void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
std::unique_ptr<DwarfCompileUnit> NewU) {
NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name,
- U.getCUNode()->getSplitDebugFilename());
+ Asm->TM.Options.MCOptions.SplitDwarfFile);
if (!CompilationDir.empty())
NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
@@ -1940,11 +1999,11 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
MD5 Hash;
Hash.update(Identifier);
// ... take the least significant 8 bytes and return those. Our MD5
- // implementation always returns its results in little endian, swap bytes
- // appropriately.
+ // implementation always returns its results in little endian, so we actually
+ // need the "high" word.
MD5::MD5Result Result;
Hash.final(Result);
- return support::endian::read64le(Result + 8);
+ return Result.high();
}
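The old and new code compute the same value: llvm::MD5 stores its digest in little-endian order, so read64le(Result + 8) and the new MD5Result::high() both yield bytes 8..15 of the raw digest as a uint64_t. A standalone sketch (assumes a little-endian host, which is what the old read64le made explicit):

    #include <cstdint>
    #include <cstring>

    uint64_t typeSignatureFromDigest(const uint8_t Digest[16]) {
      uint64_t High;
      std::memcpy(&High, Digest + 8, sizeof(High)); // bytes 8..15 of the hash
      return High;
    }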
void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 253e3f0..5dfe06c 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -89,7 +89,7 @@ public:
assert(!MInsn && "Already initialized?");
assert((!E || E->isValid()) && "Expected valid expression");
- assert(~FI && "Expected valid index");
+ assert(FI != INT_MAX && "Expected valid index");
FrameIndexExprs.push_back({FI, E});
}
@@ -134,6 +134,13 @@ public:
assert(!FrameIndexExprs.empty() && "Expected an MMI entry");
assert(!V.FrameIndexExprs.empty() && "Expected an MMI entry");
+ if (FrameIndexExprs.size()) {
+ auto *Expr = FrameIndexExprs.back().Expr;
+ // Get rid of duplicate non-fragment entries. More than one non-fragment
+ // dbg.declare makes no sense, so ignore all but the first.
+ if (!Expr || !Expr->isFragment())
+ return;
+ }
FrameIndexExprs.append(V.FrameIndexExprs.begin(), V.FrameIndexExprs.end());
assert(all_of(FrameIndexExprs,
[](FrameIndexExpr &FIE) {
@@ -210,7 +217,6 @@ class DwarfDebug : public DebugHandlerBase {
DenseMap<const MCSymbol *, uint64_t> SymSize;
/// Collection of abstract variables.
- DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
/// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
@@ -247,9 +253,6 @@ class DwarfDebug : public DebugHandlerBase {
std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
TypeUnitsUnderConstruction;
- /// Whether to emit the pubnames/pubtypes sections.
- bool HasDwarfPubSections;
-
/// Whether to use the GNU TLS opcode (instead of the standard opcode).
bool UseGNUTLSOpcode;
@@ -313,20 +316,16 @@ class DwarfDebug : public DebugHandlerBase {
typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
- /// Find abstract variable associated with Var.
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
- const DILocalVariable *&Cleansed);
- DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
- void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
- void ensureAbstractVariableIsCreated(InlinedVariable Var,
+ void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var,
+ void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var,
const MDNode *Scope);
- DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV);
+ DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU,
+ LexicalScope &Scope, InlinedVariable IV);
/// Construct a DIE for this abstract scope.
- void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+ void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
+                                          LexicalScope *Scope);
void finishVariableDefinitions();
@@ -420,11 +419,11 @@ class DwarfDebug : public DebugHandlerBase {
/// Flags to let the linker know we have emitted new style pubnames. Only
/// emit it here if we don't have a skeleton CU for split dwarf.
- void addGnuPubAttributes(DwarfUnit &U, DIE &D) const;
+ void addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const;
/// Create new DwarfCompileUnit for the given metadata node with tag
/// DW_TAG_compile_unit.
- DwarfCompileUnit &constructDwarfCompileUnit(const DICompileUnit *DIUnit);
+ DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit);
/// Construct imported_module or imported_declaration DIE.
void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
@@ -446,7 +445,17 @@ class DwarfDebug : public DebugHandlerBase {
const DbgValueHistoryMap::InstrRanges &Ranges);
/// Collect variable information from the side table maintained by MF.
- void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P);
+ void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
+ DenseSet<InlinedVariable> &P);
+
+protected:
+ /// Gather pre-function debug information.
+ void beginFunctionImpl(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function debug information.
+ void endFunctionImpl(const MachineFunction *MF) override;
+
+ void skippedNonDebugFunction() override;
public:
//===--------------------------------------------------------------------===//
@@ -463,12 +472,6 @@ public:
/// Emit all Dwarf sections that should come after the content.
void endModule() override;
- /// Gather pre-function debug information.
- void beginFunction(const MachineFunction *MF) override;
-
- /// Gather and emit post-function debug information.
- void endFunction(const MachineFunction *MF) override;
-
/// Process beginning of an instruction.
void beginInstruction(const MachineInstr *MI) override;
@@ -515,6 +518,8 @@ public:
/// split dwarf proposal support.
bool useSplitDwarf() const { return HasSplitDwarf; }
+ bool shareAcrossDWOCUs() const;
+
/// Returns the Dwarf Version.
uint16_t getDwarfVersion() const;
@@ -555,6 +560,8 @@ public:
/// A helper function to check whether the DIE for a given Scope is
/// going to be null.
bool isLexicalScopeDIENull(LexicalScope *Scope);
+
+ bool hasDwarfPubSections(bool includeMinimalInlineScopes) const;
};
} // End of namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 61b2c7e6..fe38ee8 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -14,87 +14,88 @@
#include "DwarfExpression.h"
#include "DwarfDebug.h"
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
-void DwarfExpression::AddReg(int DwarfReg, const char *Comment) {
- assert(DwarfReg >= 0 && "invalid negative dwarf register number");
- if (DwarfReg < 32) {
- EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+void DwarfExpression::addReg(int DwarfReg, const char *Comment) {
+ assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+ assert((LocationKind == Unknown || LocationKind == Register) &&
+ "location description already locked down");
+ LocationKind = Register;
+ if (DwarfReg < 32) {
+ emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
} else {
- EmitOp(dwarf::DW_OP_regx, Comment);
- EmitUnsigned(DwarfReg);
+ emitOp(dwarf::DW_OP_regx, Comment);
+ emitUnsigned(DwarfReg);
}
}
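Besides the lower-casing, addReg now locks the expression into the Register location kind. The opcode split itself is unchanged: DWARF reserves one-byte opcodes DW_OP_reg0..DW_OP_reg31 (0x50..0x6f), and everything above 31 takes DW_OP_regx plus a ULEB128 operand. A runnable sketch:

    #include <cstdio>

    void printRegisterOp(int DwarfReg) {
      if (DwarfReg < 32)
        std::printf("DW_OP_reg%d\n", DwarfReg);   // opcode 0x50 + DwarfReg
      else
        std::printf("DW_OP_regx %d\n", DwarfReg); // opcode 0x90, ULEB128 arg
    }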
-void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) {
+void DwarfExpression::addBReg(int DwarfReg, int Offset) {
assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+ assert(LocationKind != Register && "location description already locked down");
if (DwarfReg < 32) {
- EmitOp(dwarf::DW_OP_breg0 + DwarfReg);
+ emitOp(dwarf::DW_OP_breg0 + DwarfReg);
} else {
- EmitOp(dwarf::DW_OP_bregx);
- EmitUnsigned(DwarfReg);
+ emitOp(dwarf::DW_OP_bregx);
+ emitUnsigned(DwarfReg);
}
- EmitSigned(Offset);
- if (Deref)
- EmitOp(dwarf::DW_OP_deref);
+ emitSigned(Offset);
+}
+
+void DwarfExpression::addFBReg(int Offset) {
+ emitOp(dwarf::DW_OP_fbreg);
+ emitSigned(Offset);
}
-void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
if (!SizeInBits)
return;
const unsigned SizeOfByte = 8;
if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
- EmitOp(dwarf::DW_OP_bit_piece);
- EmitUnsigned(SizeInBits);
- EmitUnsigned(OffsetInBits);
+ emitOp(dwarf::DW_OP_bit_piece);
+ emitUnsigned(SizeInBits);
+ emitUnsigned(OffsetInBits);
} else {
- EmitOp(dwarf::DW_OP_piece);
+ emitOp(dwarf::DW_OP_piece);
unsigned ByteSize = SizeInBits / SizeOfByte;
- EmitUnsigned(ByteSize);
+ emitUnsigned(ByteSize);
}
this->OffsetInBits += SizeInBits;
}
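addOpPiece keeps the two-form encoding: a byte-sized piece starting at offset 0 uses the compact DW_OP_piece with a byte count, while anything bit-granular falls back to DW_OP_bit_piece with size and offset in bits. A sketch of the choice:

    #include <cstdio>

    void printPiece(unsigned SizeInBits, unsigned OffsetInBits) {
      if (OffsetInBits > 0 || SizeInBits % 8)
        std::printf("DW_OP_bit_piece %u %u\n", SizeInBits, OffsetInBits);
      else
        std::printf("DW_OP_piece %u\n", SizeInBits / 8); // whole bytes
    }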
-void DwarfExpression::AddShr(unsigned ShiftBy) {
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(ShiftBy);
- EmitOp(dwarf::DW_OP_shr);
+void DwarfExpression::addShr(unsigned ShiftBy) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(ShiftBy);
+ emitOp(dwarf::DW_OP_shr);
}
-bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI,
- unsigned MachineReg, int Offset) {
- if (isFrameRegister(TRI, MachineReg)) {
- // If variable offset is based in frame register then use fbreg.
- EmitOp(dwarf::DW_OP_fbreg);
- EmitSigned(Offset);
- return true;
- }
-
- int DwarfReg = TRI.getDwarfRegNum(MachineReg, false);
- if (DwarfReg < 0)
- return false;
-
- AddRegIndirect(DwarfReg, Offset);
- return true;
+void DwarfExpression::addAnd(unsigned Mask) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Mask);
+ emitOp(dwarf::DW_OP_and);
}
-bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
+bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
unsigned MachineReg, unsigned MaxSize) {
- if (!TRI.isPhysicalRegister(MachineReg))
+ if (!TRI.isPhysicalRegister(MachineReg)) {
+ if (isFrameRegister(TRI, MachineReg)) {
+ DwarfRegs.push_back({-1, 0, nullptr});
+ return true;
+ }
return false;
+ }
int Reg = TRI.getDwarfRegNum(MachineReg, false);
// If this is a valid register number, emit it.
if (Reg >= 0) {
- AddReg(Reg);
+ DwarfRegs.push_back({Reg, 0, nullptr});
return true;
}
@@ -106,7 +107,7 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg);
unsigned Size = TRI.getSubRegIdxSize(Idx);
unsigned RegOffset = TRI.getSubRegIdxOffset(Idx);
- AddReg(Reg, "super-register");
+ DwarfRegs.push_back({Reg, 0, "super-register"});
// Use a DW_OP_bit_piece to describe the sub-register.
setSubRegisterPiece(Size, RegOffset);
return true;
@@ -116,8 +117,9 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
// Otherwise, attempt to find a covering set of sub-register numbers.
// For example, Q0 on ARM is a composition of D0+D1.
unsigned CurPos = 0;
- // The size of the register in bits, assuming 8 bits per byte.
- unsigned RegSize = TRI.getMinimalPhysRegClass(MachineReg)->getSize() * 8;
+ // The size of the register in bits.
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(MachineReg);
+ unsigned RegSize = TRI.getRegSizeInBits(*RC);
// Keep track of the bits in the register we already emitted, so we
// can avoid emitting redundant aliasing subregs.
SmallBitVector Coverage(RegSize, false);
@@ -136,100 +138,166 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
// If this sub-register has a DWARF number and we haven't covered
// its range, emit a DWARF piece for it.
if (Reg >= 0 && Intersection.any()) {
- AddReg(Reg, "sub-register");
+ // Emit a piece for any gap in the coverage.
+ if (Offset > CurPos)
+ DwarfRegs.push_back({-1, Offset - CurPos, nullptr});
+ DwarfRegs.push_back(
+ {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"});
if (Offset >= MaxSize)
break;
- // Emit a piece for the any gap in the coverage.
- if (Offset > CurPos)
- AddOpPiece(Offset - CurPos);
- AddOpPiece(std::min<unsigned>(Size, MaxSize - Offset));
- CurPos = Offset + Size;
// Mark it as emitted.
Coverage.set(Offset, Offset + Size);
+ CurPos = Offset + Size;
}
}
return CurPos;
}
-void DwarfExpression::AddStackValue() {
+void DwarfExpression::addStackValue() {
if (DwarfVersion >= 4)
- EmitOp(dwarf::DW_OP_stack_value);
+ emitOp(dwarf::DW_OP_stack_value);
}
-void DwarfExpression::AddSignedConstant(int64_t Value) {
- EmitOp(dwarf::DW_OP_consts);
- EmitSigned(Value);
- AddStackValue();
+void DwarfExpression::addSignedConstant(int64_t Value) {
+ assert(LocationKind == Implicit || LocationKind == Unknown);
+ LocationKind = Implicit;
+ emitOp(dwarf::DW_OP_consts);
+ emitSigned(Value);
}
-void DwarfExpression::AddUnsignedConstant(uint64_t Value) {
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(Value);
- AddStackValue();
+void DwarfExpression::addUnsignedConstant(uint64_t Value) {
+ assert(LocationKind == Implicit || LocationKind == Unknown);
+ LocationKind = Implicit;
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Value);
}
-void DwarfExpression::AddUnsignedConstant(const APInt &Value) {
+void DwarfExpression::addUnsignedConstant(const APInt &Value) {
+ assert(LocationKind == Implicit || LocationKind == Unknown);
+ LocationKind = Implicit;
+
unsigned Size = Value.getBitWidth();
const uint64_t *Data = Value.getRawData();
// Chop it up into 64-bit pieces, because that's the maximum that
- // AddUnsignedConstant takes.
+ // addUnsignedConstant takes.
unsigned Offset = 0;
while (Offset < Size) {
- AddUnsignedConstant(*Data++);
+ addUnsignedConstant(*Data++);
if (Offset == 0 && Size <= 64)
break;
- AddOpPiece(std::min(Size-Offset, 64u), Offset);
+ addStackValue();
+ addOpPiece(std::min(Size - Offset, 64u), Offset);
Offset += 64;
}
}
-bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
+bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
DIExpressionCursor &ExprCursor,
unsigned MachineReg,
unsigned FragmentOffsetInBits) {
- if (!ExprCursor)
- return AddMachineReg(TRI, MachineReg);
+ auto Fragment = ExprCursor.getFragmentInfo();
+ if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U)) {
+ LocationKind = Unknown;
+ return false;
+ }
- // Pattern-match combinations for which more efficient representations exist
- // first.
- bool ValidReg = false;
+ bool HasComplexExpression = false;
auto Op = ExprCursor.peek();
- switch (Op->getOp()) {
- default: {
- auto Fragment = ExprCursor.getFragmentInfo();
- ValidReg = AddMachineReg(TRI, MachineReg,
- Fragment ? Fragment->SizeInBits : ~1U);
- break;
+ if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment)
+ HasComplexExpression = true;
+
+ // If the register can only be described by a complex expression (i.e.,
+ // multiple subregisters) it doesn't safely compose with another complex
+ // expression. For example, it is not possible to apply a DW_OP_deref
+ // operation to multiple DW_OP_pieces.
+ if (HasComplexExpression && DwarfRegs.size() > 1) {
+ DwarfRegs.clear();
+ LocationKind = Unknown;
+ return false;
}
- case dwarf::DW_OP_plus:
- case dwarf::DW_OP_minus: {
- // [DW_OP_reg,Offset,DW_OP_plus, DW_OP_deref] --> [DW_OP_breg, Offset].
- // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset].
- auto N = ExprCursor.peekNext();
- if (N && N->getOp() == dwarf::DW_OP_deref) {
- unsigned Offset = Op->getArg(0);
- ValidReg = AddMachineRegIndirect(
- TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset);
- ExprCursor.consume(2);
- } else
- ValidReg = AddMachineReg(TRI, MachineReg);
- break;
+
+ // Handle simple register locations.
+ if (LocationKind != Memory && !HasComplexExpression) {
+ for (auto &Reg : DwarfRegs) {
+ if (Reg.DwarfRegNo >= 0)
+ addReg(Reg.DwarfRegNo, Reg.Comment);
+ addOpPiece(Reg.Size);
+ }
+ DwarfRegs.clear();
+ return true;
}
- case dwarf::DW_OP_deref:
- // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
- ValidReg = AddMachineRegIndirect(TRI, MachineReg);
+
+ // Don't emit locations that cannot be expressed without DW_OP_stack_value.
+ if (DwarfVersion < 4)
+ if (std::any_of(ExprCursor.begin(), ExprCursor.end(),
+ [](DIExpression::ExprOperand Op) -> bool {
+ return Op.getOp() == dwarf::DW_OP_stack_value;
+ })) {
+ DwarfRegs.clear();
+ LocationKind = Unknown;
+ return false;
+ }
+
+ assert(DwarfRegs.size() == 1);
+ auto Reg = DwarfRegs[0];
+ bool FBReg = isFrameRegister(TRI, MachineReg);
+ int SignedOffset = 0;
+ assert(Reg.Size == 0 && "subregister has same size as superregister");
+
+ // Pattern-match combinations for which more efficient representations exist.
+ // [Reg, DW_OP_plus_uconst, Offset] --> [DW_OP_breg, Offset].
+ if (Op && (Op->getOp() == dwarf::DW_OP_plus_uconst)) {
+ SignedOffset = Op->getArg(0);
ExprCursor.take();
- break;
}
- return ValidReg;
+ // [Reg, DW_OP_constu, Offset, DW_OP_plus] --> [DW_OP_breg, Offset]
+ // [Reg, DW_OP_constu, Offset, DW_OP_minus] --> [DW_OP_breg,-Offset]
+ // If Reg is a subregister we need to mask it out before subtracting.
+ if (Op && Op->getOp() == dwarf::DW_OP_constu) {
+ auto N = ExprCursor.peekNext();
+ if (N && (N->getOp() == dwarf::DW_OP_plus ||
+ (N->getOp() == dwarf::DW_OP_minus && !SubRegisterSizeInBits))) {
+ int Offset = Op->getArg(0);
+ SignedOffset = (N->getOp() == dwarf::DW_OP_minus) ? -Offset : Offset;
+ ExprCursor.consume(2);
+ }
+ }
+
+ if (FBReg)
+ addFBReg(SignedOffset);
+ else
+ addBReg(Reg.DwarfRegNo, SignedOffset);
+ DwarfRegs.clear();
+ return true;
+}
+
+/// Assuming a well-formed expression, match "DW_OP_deref* DW_OP_LLVM_fragment?".
+static bool isMemoryLocation(DIExpressionCursor ExprCursor) {
+ while (ExprCursor) {
+ auto Op = ExprCursor.take();
+ switch (Op->getOp()) {
+ case dwarf::DW_OP_deref:
+ case dwarf::DW_OP_LLVM_fragment:
+ break;
+ default:
+ return false;
+ }
+ }
+ return true;
}
-void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
+void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
unsigned FragmentOffsetInBits) {
+ // If we need to mask out a subregister, do it now, unless the next
+ // operation would emit an OpPiece anyway.
+ auto N = ExprCursor.peek();
+ if (SubRegisterSizeInBits && N && (N->getOp() != dwarf::DW_OP_LLVM_fragment))
+ maskSubRegister();
+
while (ExprCursor) {
auto Op = ExprCursor.take();
switch (Op->getOp()) {
@@ -241,49 +309,91 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
// location.
assert(OffsetInBits >= FragmentOffset && "fragment offset not added?");
- // If \a AddMachineReg already emitted DW_OP_piece operations to represent
+ // If addMachineReg already emitted DW_OP_piece operations to represent
// a super-register by splicing together sub-registers, subtract the size
// of the pieces that was already emitted.
SizeInBits -= OffsetInBits - FragmentOffset;
- // If \a AddMachineReg requested a DW_OP_bit_piece to stencil out a
+ // If addMachineReg requested a DW_OP_bit_piece to stencil out a
// sub-register that is smaller than the current fragment's size, use it.
if (SubRegisterSizeInBits)
SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits);
-
- AddOpPiece(SizeInBits, SubRegisterOffsetInBits);
+
+ // Emit a DW_OP_stack_value for implicit location descriptions.
+ if (LocationKind == Implicit)
+ addStackValue();
+
+ // Emit the DW_OP_piece.
+ addOpPiece(SizeInBits, SubRegisterOffsetInBits);
setSubRegisterPiece(0, 0);
- break;
+ // Reset the location description kind.
+ LocationKind = Unknown;
+ return;
}
- case dwarf::DW_OP_plus:
- EmitOp(dwarf::DW_OP_plus_uconst);
- EmitUnsigned(Op->getArg(0));
+ case dwarf::DW_OP_plus_uconst:
+ assert(LocationKind != Register);
+ emitOp(dwarf::DW_OP_plus_uconst);
+ emitUnsigned(Op->getArg(0));
break;
+ case dwarf::DW_OP_plus:
case dwarf::DW_OP_minus:
- // There is no OP_minus_uconst.
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(Op->getArg(0));
- EmitOp(dwarf::DW_OP_minus);
+ emitOp(Op->getOp());
break;
- case dwarf::DW_OP_deref:
- EmitOp(dwarf::DW_OP_deref);
+ case dwarf::DW_OP_deref: {
+ assert(LocationKind != Register);
+ if (LocationKind != Memory && isMemoryLocation(ExprCursor))
+ // Turning this into a memory location description makes the deref
+ // implicit.
+ LocationKind = Memory;
+ else
+ emitOp(dwarf::DW_OP_deref);
break;
+ }
case dwarf::DW_OP_constu:
- EmitOp(dwarf::DW_OP_constu);
- EmitUnsigned(Op->getArg(0));
+ assert(LocationKind != Register);
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(Op->getArg(0));
break;
case dwarf::DW_OP_stack_value:
- AddStackValue();
+ LocationKind = Implicit;
+ break;
+ case dwarf::DW_OP_swap:
+ assert(LocationKind != Register);
+ emitOp(dwarf::DW_OP_swap);
+ break;
+ case dwarf::DW_OP_xderef:
+ assert(LocationKind != Register);
+ emitOp(dwarf::DW_OP_xderef);
break;
default:
llvm_unreachable("unhandled opcode found in expression");
}
}
+
+ if (LocationKind == Implicit)
+ // Turn this into an implicit location description.
+ addStackValue();
+}
+
+/// Add masking operations to stencil out a subregister.
+void DwarfExpression::maskSubRegister() {
+ assert(SubRegisterSizeInBits && "no subregister was registered");
+ if (SubRegisterOffsetInBits > 0)
+ addShr(SubRegisterOffsetInBits);
+ uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL;
+ addAnd(Mask);
}
+
void DwarfExpression::finalize() {
- if (SubRegisterSizeInBits)
- AddOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);
+ assert(DwarfRegs.size() == 0 && "dwarf registers not emitted");
+ // Emit any outstanding DW_OP_piece operations to mask out subregisters.
+ if (SubRegisterSizeInBits == 0)
+ return;
+ // Don't emit a DW_OP_piece for a subregister at offset 0.
+ if (SubRegisterOffsetInBits == 0)
+ return;
+ addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);
}
void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {
@@ -294,6 +404,6 @@ void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {
assert(FragmentOffset >= OffsetInBits &&
"overlapping or duplicate fragments");
if (FragmentOffset > OffsetInBits)
- AddOpPiece(FragmentOffset - OffsetInBits);
+ addOpPiece(FragmentOffset - OffsetInBits);
OffsetInBits = FragmentOffset;
}
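
The rewritten addMachineRegExpression above folds a leading constant offset into a single DW_OP_breg / DW_OP_fbreg. A minimal standalone sketch of that peephole over a toy operand stream; the names here (Expr, foldRegOffset, the Op enum) are invented for illustration and are not the LLVM API:

// Sketch of the breg-folding peephole: [Reg, OP_plus_uconst, Off] or
// [Reg, OP_constu, Off, OP_plus/OP_minus] collapses into one signed offset.
#include <cstdint>
#include <cstdio>
#include <vector>

enum Op : uint64_t { OP_constu, OP_plus, OP_minus, OP_plus_uconst };

struct Expr {
  std::vector<uint64_t> Ops;
  size_t Pos = 0;
  size_t left() const { return Ops.size() - Pos; }
  uint64_t peek(size_t N = 0) const { return Ops[Pos + N]; }
  void consume(size_t N) { Pos += N; }
};

int64_t foldRegOffset(Expr &E) {
  int64_t Offset = 0;
  if (E.left() >= 2 && E.peek() == OP_plus_uconst) {
    Offset = (int64_t)E.peek(1);
    E.consume(2);
  } else if (E.left() >= 3 && E.peek() == OP_constu &&
             (E.peek(2) == OP_plus || E.peek(2) == OP_minus)) {
    int64_t Off = (int64_t)E.peek(1);
    Offset = E.peek(2) == OP_minus ? -Off : Off;
    E.consume(3);
  }
  return Offset;
}

int main() {
  Expr E{{OP_constu, 16, OP_minus}};
  std::printf("DW_OP_breg <reg>, %lld\n", (long long)foldRegOffset(E));
}
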
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
index fd90fa0..728f8ad 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -42,6 +42,9 @@ public:
DIExpressionCursor(ArrayRef<uint64_t> Expr)
: Start(Expr.begin()), End(Expr.end()) {}
+ DIExpressionCursor(const DIExpressionCursor &C)
+ : Start(C.Start), End(C.End) {}
+
/// Consume one operation.
Optional<DIExpression::ExprOperand> take() {
if (Start == End)
@@ -72,6 +75,8 @@ public:
}
/// Determine whether there are any operations left in this expression.
operator bool() const { return Start != End; }
+ DIExpression::expr_op_iterator begin() const { return Start; }
+ DIExpression::expr_op_iterator end() const { return End; }
/// Retrieve the fragment information, if any.
Optional<DIExpression::FragmentInfo> getFragmentInfo() const {
@@ -84,14 +89,27 @@ public:
/// entry.
class DwarfExpression {
protected:
- unsigned DwarfVersion;
+ /// Holds information about all subregisters comprising a register location.
+ struct Register {
+ int DwarfRegNo;
+ unsigned Size;
+ const char *Comment;
+ };
+
+ /// The register location, if any.
+ SmallVector<Register, 2> DwarfRegs;
+
/// Current Fragment Offset in Bits.
uint64_t OffsetInBits = 0;
+ unsigned DwarfVersion;
/// Sometimes we need to add a DW_OP_bit_piece to describe a subregister.
unsigned SubRegisterSizeInBits = 0;
unsigned SubRegisterOffsetInBits = 0;
+ /// The kind of location description being produced.
+ enum { Unknown = 0, Register, Memory, Implicit } LocationKind = Unknown;
+
/// Push a DW_OP_piece / DW_OP_bit_piece for emitting later, if one is needed
/// to represent a subregister.
void setSubRegisterPiece(unsigned SizeInBits, unsigned OffsetInBits) {
@@ -99,35 +117,55 @@ protected:
SubRegisterOffsetInBits = OffsetInBits;
}
-public:
- DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
- virtual ~DwarfExpression() {};
-
- /// This needs to be called last to commit any pending changes.
- void finalize();
+ /// Add masking operations to stencil out a subregister.
+ void maskSubRegister();
/// Output a dwarf operand and an optional assembler comment.
- virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+ virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0;
/// Emit a raw signed value.
- virtual void EmitSigned(int64_t Value) = 0;
+ virtual void emitSigned(int64_t Value) = 0;
/// Emit a raw unsigned value.
- virtual void EmitUnsigned(uint64_t Value) = 0;
+ virtual void emitUnsigned(uint64_t Value) = 0;
/// Return whether the given machine register is the frame register in the
/// current function.
virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0;
- /// Emit a dwarf register operation.
- void AddReg(int DwarfReg, const char *Comment = nullptr);
- /// Emit an (double-)indirect dwarf register operation.
- void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false);
+ /// Emit a DW_OP_reg operation. Note that this is only legal inside a DWARF
+ /// register location description.
+ void addReg(int DwarfReg, const char *Comment = nullptr);
+ /// Emit a DW_OP_breg operation.
+ void addBReg(int DwarfReg, int Offset);
+ /// Emit DW_OP_fbreg <Offset>.
+ void addFBReg(int Offset);
+
+ /// Emit a partial DWARF register operation.
+ ///
+ /// \param MachineReg The register number.
+ /// \param MaxSize If the register must be composed from
+ /// sub-registers this is an upper bound
+ /// for how many bits the emitted DW_OP_piece
+ /// may cover.
+ ///
+ /// If size and offset are zero, an operation for the entire register is
+ /// emitted. Some targets do not provide a DWARF register number for every
+ /// register. If this is the case, this function will attempt to emit a DWARF
+ /// register by emitting a fragment of a super-register or by piecing together
+ /// multiple subregisters that alias the register.
+ ///
+ /// \return false if no DWARF register exists for MachineReg.
+ bool addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg,
+ unsigned MaxSize = ~1U);
+
/// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment.
/// \param OffsetInBits This is an optional offset into the location that
/// is at the top of the DWARF stack.
- void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+ void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
- /// Emit a shift-right dwarf expression.
- void AddShr(unsigned ShiftBy);
+ /// Emit a shift-right dwarf operation.
+ void addShr(unsigned ShiftBy);
+ /// Emit a bitwise and dwarf operation.
+ void addAnd(unsigned Mask);
/// Emit a DW_OP_stack_value, if supported.
///
@@ -140,48 +178,39 @@ public:
/// constant value, so the producers and consumers started to rely on
/// heuristics to disambiguate the value vs. location status of the
/// expression. See PR21176 for more details.
- void AddStackValue();
+ void addStackValue();
- /// Emit an indirect dwarf register operation for the given machine register.
- /// \return false if no DWARF register exists for MachineReg.
- bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg,
- int Offset = 0);
+ ~DwarfExpression() = default;
+public:
+ DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
- /// Emit a partial DWARF register operation.
- ///
- /// \param MachineReg The register number.
- /// \param MaxSize If the register must be composed from
- /// sub-registers this is an upper bound
- /// for how many bits the emitted DW_OP_piece
- /// may cover.
- ///
- /// If size and offset is zero an operation for the entire register is
- /// emitted: Some targets do not provide a DWARF register number for every
- /// register. If this is the case, this function will attempt to emit a DWARF
- /// register by emitting a fragment of a super-register or by piecing together
- /// multiple subregisters that alias the register.
- ///
- /// \return false if no DWARF register exists for MachineReg.
- bool AddMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg,
- unsigned MaxSize = ~1U);
+ /// This needs to be called last to commit any pending changes.
+ void finalize();
/// Emit a signed constant.
- void AddSignedConstant(int64_t Value);
+ void addSignedConstant(int64_t Value);
/// Emit an unsigned constant.
- void AddUnsignedConstant(uint64_t Value);
+ void addUnsignedConstant(uint64_t Value);
/// Emit an unsigned constant.
- void AddUnsignedConstant(const APInt &Value);
+ void addUnsignedConstant(const APInt &Value);
+
+ /// Lock this down to become a memory location description.
+ void setMemoryLocationKind() {
+ assert(LocationKind == Unknown);
+ LocationKind = Memory;
+ }
/// Emit a machine register location. As an optimization this may also consume
/// the prefix of a DwarfExpression if a more efficient representation for
/// combining the register location and the first operation exists.
///
- /// \param FragmentOffsetInBits If this is one fragment out of a fragmented
+ /// \param FragmentOffsetInBits If this is one fragment out of a
+ /// fragmented
/// location, this is the offset of the
/// fragment inside the entire variable.
/// \return false if no DWARF register exists
/// for MachineReg.
- bool AddMachineRegExpression(const TargetRegisterInfo &TRI,
+ bool addMachineRegExpression(const TargetRegisterInfo &TRI,
DIExpressionCursor &Expr, unsigned MachineReg,
unsigned FragmentOffsetInBits = 0);
/// Emit all remaining operations in the DIExpressionCursor.
@@ -189,7 +218,7 @@ public:
/// \param FragmentOffsetInBits If this is one fragment out of multiple
/// locations, this is the offset of the
/// fragment inside the entire variable.
- void AddExpression(DIExpressionCursor &&Expr,
+ void addExpression(DIExpressionCursor &&Expr,
unsigned FragmentOffsetInBits = 0);
/// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to
@@ -198,33 +227,32 @@ public:
};
/// DwarfExpression implementation for .debug_loc entries.
-class DebugLocDwarfExpression : public DwarfExpression {
+class DebugLocDwarfExpression final : public DwarfExpression {
ByteStreamer &BS;
+ void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void emitSigned(int64_t Value) override;
+ void emitUnsigned(uint64_t Value) override;
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) override;
public:
DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS)
: DwarfExpression(DwarfVersion), BS(BS) {}
-
- void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
- void EmitSigned(int64_t Value) override;
- void EmitUnsigned(uint64_t Value) override;
- bool isFrameRegister(const TargetRegisterInfo &TRI,
- unsigned MachineReg) override;
};
/// DwarfExpression implementation for singular DW_AT_location.
-class DIEDwarfExpression : public DwarfExpression {
+class DIEDwarfExpression final : public DwarfExpression {
const AsmPrinter &AP;
DwarfUnit &DU;
DIELoc &DIE;
-public:
- DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);
- void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
- void EmitSigned(int64_t Value) override;
- void EmitUnsigned(uint64_t Value) override;
+ void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void emitSigned(int64_t Value) override;
+ void emitUnsigned(uint64_t Value) override;
bool isFrameRegister(const TargetRegisterInfo &TRI,
unsigned MachineReg) override;
+public:
+ DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);
DIELoc *finalize() {
DwarfExpression::finalize();
return &DIE;
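
DwarfExpression keeps all expression-building logic in the base class and leaves only the raw output primitives (emitOp, emitSigned, emitUnsigned) plus the frame-register query virtual, so DebugLocDwarfExpression and DIEDwarfExpression differ only in where the bytes land. A condensed sketch of that split, assuming a toy byte-buffer sink in place of either subclass:

// Sketch of the emitter split: shared DWARF logic in the base, raw output
// primitives in a final subclass. Not the LLVM classes; names are stand-ins.
#include <cstdint>
#include <cstdio>
#include <vector>

class ExprEmitter {
protected:
  virtual void emitOp(uint8_t Op) = 0;
  virtual void emitUnsigned(uint64_t Value) = 0;
  // Protected and non-virtual, mirroring ~DwarfExpression(): emitters are
  // never destroyed through a base-class pointer.
  ~ExprEmitter() = default;

public:
  // High-level operations live here, built from the primitives.
  void addOpPiece(unsigned SizeInBytes) {
    emitOp(0x93 /* DW_OP_piece */);
    emitUnsigned(SizeInBytes);
  }
};

class BufferEmitter final : public ExprEmitter {
  std::vector<uint8_t> &Buf;
  void emitOp(uint8_t Op) override { Buf.push_back(Op); }
  // Toy encoding; the real emitters produce ULEB128.
  void emitUnsigned(uint64_t Value) override { Buf.push_back(uint8_t(Value)); }

public:
  explicit BufferEmitter(std::vector<uint8_t> &B) : Buf(B) {}
};

int main() {
  std::vector<uint8_t> Bytes;
  BufferEmitter E(Bytes);
  E.addOpPiece(4);
  std::printf("%zu bytes emitted\n", Bytes.size());
}
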
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
index d4d2ed2..54924e9 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -53,6 +53,7 @@ class DwarfFile {
// Collection of abstract subprogram DIEs.
DenseMap<const MDNode *, DIE *> AbstractSPDies;
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
/// Maps MDNodes for type system with the corresponding DIEs. These DIEs can
/// be shared across CUs, that is why we keep the map here instead
@@ -105,6 +106,9 @@ public:
DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
return AbstractSPDies;
}
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> &getAbstractVariables() {
+ return AbstractVariables;
+ }
void insertDIE(const MDNode *TypeMD, DIE *Die) {
DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
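
DwarfFile now owns abstract variables outright, keyed by metadata node and held by unique_ptr, with the map handed out by reference so callers can do find-or-create lookups without juggling lifetimes. A rough equivalent using standard containers; File, DbgVar, and getOrCreate are stand-ins for the LLVM types, not their API:

// Find-or-create over an owning map, as a caller of getAbstractVariables()
// would use it. Keys stand in for MDNode pointers.
#include <memory>
#include <string>
#include <unordered_map>

struct DbgVar {
  std::string Name;
};

class File {
  std::unordered_map<const void *, std::unique_ptr<DbgVar>> AbstractVars;

public:
  auto &getAbstractVariables() { return AbstractVars; }
};

DbgVar &getOrCreate(File &F, const void *Node, const std::string &Name) {
  auto &Vars = F.getAbstractVariables();
  auto It = Vars.find(Node);
  if (It != Vars.end())
    return *It->second; // reuse the existing abstract variable
  auto &Slot = Vars[Node];
  Slot = std::make_unique<DbgVar>(DbgVar{Name});
  return *Slot;
}

int main() {
  File F;
  int Key = 0; // any stable address works as a key here
  DbgVar &V1 = getOrCreate(F, &Key, "x");
  DbgVar &V2 = getOrCreate(F, &Key, "ignored");
  return (&V1 == &V2 && V1.Name == "x") ? 0 : 1;
}
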
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 2a866c0..4f4ebfc 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -18,18 +18,19 @@
#include "DwarfExpression.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -54,15 +55,15 @@ DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,
: DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU),
DIE(DIE) {}
-void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) {
+void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) {
DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
}
-void DIEDwarfExpression::EmitSigned(int64_t Value) {
+void DIEDwarfExpression::emitSigned(int64_t Value) {
DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
}
-void DIEDwarfExpression::EmitUnsigned(uint64_t Value) {
+void DIEDwarfExpression::emitUnsigned(uint64_t Value) {
DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
}
@@ -73,8 +74,8 @@ bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
DwarfUnit::DwarfUnit(dwarf::Tag UnitTag, const DICompileUnit *Node,
AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
- : DIEUnit(A->getDwarfVersion(), A->getPointerSize(), UnitTag), CUNode(Node),
- Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr) {
+ : DIEUnit(A->getDwarfVersion(), A->MAI->getCodePointerSize(), UnitTag),
+ CUNode(Node), Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr) {
}
DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A,
@@ -98,25 +99,35 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
default:
break;
- case dwarf::DW_LANG_C89:
- case dwarf::DW_LANG_C99:
+ // The languages below have valid values in all DWARF versions.
case dwarf::DW_LANG_C:
+ case dwarf::DW_LANG_C89:
case dwarf::DW_LANG_C_plus_plus:
- case dwarf::DW_LANG_ObjC:
- case dwarf::DW_LANG_ObjC_plus_plus:
return 0;
case dwarf::DW_LANG_Fortran77:
case dwarf::DW_LANG_Fortran90:
- case dwarf::DW_LANG_Fortran95:
return 1;
- // The languages below have valid values only if the DWARF version >= 4.
+ // The languages below have valid values only if the DWARF version >= 3.
+ case dwarf::DW_LANG_C99:
+ case dwarf::DW_LANG_ObjC:
+ case dwarf::DW_LANG_ObjC_plus_plus:
+ if (DD->getDwarfVersion() >= 3)
+ return 0;
+ break;
+
+ case dwarf::DW_LANG_Fortran95:
+ if (DD->getDwarfVersion() >= 3)
+ return 1;
+ break;
+
+ // The languages below have valid values only if the DWARF version >= 4.
+ case dwarf::DW_LANG_D:
case dwarf::DW_LANG_Java:
case dwarf::DW_LANG_Python:
case dwarf::DW_LANG_UPC:
- case dwarf::DW_LANG_D:
- if (dwarf::DWARF_VERSION >= 4)
+ if (DD->getDwarfVersion() >= 4)
return 0;
break;
@@ -127,31 +138,33 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
case dwarf::DW_LANG_Modula2:
case dwarf::DW_LANG_Pascal83:
case dwarf::DW_LANG_PLI:
- if (dwarf::DWARF_VERSION >= 4)
+ if (DD->getDwarfVersion() >= 4)
return 1;
break;
- // The languages below have valid values only if the DWARF version >= 5.
- case dwarf::DW_LANG_OpenCL:
- case dwarf::DW_LANG_Go:
- case dwarf::DW_LANG_Haskell:
+ // The languages below are new in DWARF v5.
+ case dwarf::DW_LANG_BLISS:
+ case dwarf::DW_LANG_C11:
case dwarf::DW_LANG_C_plus_plus_03:
case dwarf::DW_LANG_C_plus_plus_11:
+ case dwarf::DW_LANG_C_plus_plus_14:
+ case dwarf::DW_LANG_Dylan:
+ case dwarf::DW_LANG_Go:
+ case dwarf::DW_LANG_Haskell:
case dwarf::DW_LANG_OCaml:
+ case dwarf::DW_LANG_OpenCL:
+ case dwarf::DW_LANG_RenderScript:
case dwarf::DW_LANG_Rust:
- case dwarf::DW_LANG_C11:
case dwarf::DW_LANG_Swift:
- case dwarf::DW_LANG_Dylan:
- case dwarf::DW_LANG_C_plus_plus_14:
- if (dwarf::DWARF_VERSION >= 5)
+ if (DD->getDwarfVersion() >= 5)
return 0;
break;
- case dwarf::DW_LANG_Modula3:
- case dwarf::DW_LANG_Julia:
case dwarf::DW_LANG_Fortran03:
case dwarf::DW_LANG_Fortran08:
- if (dwarf::DWARF_VERSION >= 5)
+ case dwarf::DW_LANG_Julia:
+ case dwarf::DW_LANG_Modula3:
+ if (DD->getDwarfVersion() >= 5)
return 1;
break;
}
@@ -160,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
}
/// Check whether the DIE for this MDNode can be shared across CUs.
-static bool isShareableAcrossCUs(const DINode *D) {
+bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
// When the MDNode can be part of the type system, the DIE can be shared
// across CUs.
// Combining type units and cross-CU DIE sharing is lower value (since
@@ -168,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) {
// level already) but may be implementable for some value in projects
// building multiple independent libraries with LTO and then linking those
// together.
+ if (isDwoUnit() && !DD->shareAcrossDWOCUs())
+ return false;
return (isa<DIType>(D) ||
(isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
!GenerateDwarfTypeUnits;
@@ -285,13 +300,6 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {
dwarf::DW_FORM_ref_sig8, DIEInteger(Signature));
}
-void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
- StringRef Identifier) {
- uint64_t Signature = DD->makeTypeSignature(Identifier);
- Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8,
- DIEInteger(Signature));
-}
-
void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,
DIEEntry Entry) {
const DIEUnit *CU = Die.getUnit();
@@ -369,10 +377,6 @@ void DwarfUnit::addSourceLine(DIE &Die, const DIObjCProperty *Ty) {
addSourceLine(Die, Ty->getLine(), Ty->getFilename(), Ty->getDirectory());
}
-void DwarfUnit::addSourceLine(DIE &Die, const DINamespace *NS) {
- addSourceLine(Die, NS->getLine(), NS->getFilename(), NS->getDirectory());
-}
-
/* Byref variables, in Blocks, are declared by the programmer as "SomeType
VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
gives the variable VarName either the struct, or a pointer to the struct, as
@@ -465,50 +469,48 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
// Decode the original location, and use that as the start of the byref
// variable's location.
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
- SmallVector<uint64_t, 6> DIExpr;
- DIEDwarfExpression Expr(*Asm, *this, *Loc);
-
- bool validReg;
- if (Location.isReg())
- validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg());
- else
- validReg =
- Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
- Location.getReg(), Location.getOffset());
-
- if (!validReg)
- return;
-
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ if (Location.isIndirect())
+ DwarfExpr.setMemoryLocationKind();
+
+ SmallVector<uint64_t, 9> Ops;
+ if (Location.isIndirect() && Location.getOffset()) {
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(Location.getOffset());
+ }
// If we started with a pointer to the __Block_byref... struct, then
// the first thing we need to do is dereference the pointer (DW_OP_deref).
if (isPointer)
- DIExpr.push_back(dwarf::DW_OP_deref);
+ Ops.push_back(dwarf::DW_OP_deref);
// Next add the offset for the '__forwarding' field:
// DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in
// adding the offset if it's 0.
if (forwardingFieldOffset > 0) {
- DIExpr.push_back(dwarf::DW_OP_plus);
- DIExpr.push_back(forwardingFieldOffset);
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(forwardingFieldOffset);
}
// Now dereference the __forwarding field to get to the real __Block_byref
// struct: DW_OP_deref.
- DIExpr.push_back(dwarf::DW_OP_deref);
+ Ops.push_back(dwarf::DW_OP_deref);
// Now that we've got the real __Block_byref... struct, add the offset
// for the variable's field to get to the location of the actual variable:
// DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0.
if (varFieldOffset > 0) {
- DIExpr.push_back(dwarf::DW_OP_plus);
- DIExpr.push_back(varFieldOffset);
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(varFieldOffset);
}
- Expr.AddExpression(makeArrayRef(DIExpr));
- Expr.finalize();
+
+ DIExpressionCursor Cursor(Ops);
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+ if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+ return;
+ DwarfExpr.addExpression(std::move(Cursor));
// Now attach the location information to the DIE.
- addBlock(Die, Attribute, Loc);
+ addBlock(Die, Attribute, DwarfExpr.finalize());
}
/// Return true if type encoding is unsigned.
@@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
addString(Die,
DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
: dwarf::DW_AT_MIPS_linkage_name,
- GlobalValue::getRealLinkageName(LinkageName));
+ GlobalValue::dropLLVMManglingEscape(LinkageName));
}
void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
@@ -658,6 +660,14 @@ void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
}
}
+/// Add thrown types.
+void DwarfUnit::addThrownTypes(DIE &Die, DINodeArray ThrownTypes) {
+ for (const auto *Ty : ThrownTypes) {
+ DIE &TT = createAndAddDIE(dwarf::DW_TAG_thrown_type, Die);
+ addType(TT, cast<DIType>(Ty));
+ }
+}
+
DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
if (!Context || isa<DIFile>(Context))
return &getUnitDie();
@@ -672,7 +682,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
return getDIE(Context);
}
-DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
+DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) {
auto *Context = resolve(Ty->getScope());
DIE *ContextDIE = getOrCreateContextDIE(Context);
@@ -684,8 +694,7 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
constructTypeDIE(TyDIE, cast<DICompositeType>(Ty));
- if (!Ty->isExternalTypeRef())
- updateAcceleratorTables(Context, Ty, TyDIE);
+ updateAcceleratorTables(Context, Ty, TyDIE);
return &TyDIE;
}
@@ -841,6 +850,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
// Add source line info if available and TyDesc is not a forward declaration.
if (!DTy->isForwardDecl())
addSourceLine(Buffer, DTy);
+
+ // If the DWARF address space value is other than None, add it for pointer and
+ // reference types as DW_AT_address_class.
+ if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type ||
+ Tag == dwarf::DW_TAG_reference_type))
+ addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4,
+ DTy->getDWARFAddressSpace().getValue());
}
void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
@@ -892,13 +908,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
}
void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
- if (CTy->isExternalTypeRef()) {
- StringRef Identifier = CTy->getIdentifier();
- assert(!Identifier.empty() && "external type ref without identifier");
- addFlag(Buffer, dwarf::DW_AT_declaration);
- return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier);
- }
-
// Add name if not anonymous or intermediate type.
StringRef Name = CTy->getName();
@@ -1074,7 +1083,6 @@ DIE *DwarfUnit::getOrCreateNameSpace(const DINamespace *NS) {
Name = "(anonymous namespace)";
DD->addAccelNamespace(Name, NDie);
addGlobalName(Name, NDie, NS->getScope());
- addSourceLine(NDie, NS);
if (NS->getExportSymbols())
addFlag(NDie, dwarf::DW_AT_export_symbols);
return &NDie;
@@ -1180,8 +1188,12 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
}
void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
- bool Minimal) {
- if (!Minimal)
+ bool SkipSPAttributes) {
+ // If -fdebug-info-for-profiling is enabled, we need to emit the subprogram
+ // and its source location.
+ bool SkipSPSourceLocation = SkipSPAttributes &&
+ !CUNode->getDebugInfoForProfiling();
+ if (!SkipSPSourceLocation)
if (applySubprogramDefinitionAttributes(SP, SPDie))
return;
@@ -1189,12 +1201,13 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
if (!SP->getName().empty())
addString(SPDie, dwarf::DW_AT_name, SP->getName());
+ if (!SkipSPSourceLocation)
+ addSourceLine(SPDie, SP);
+
// Skip the rest of the attributes under -gmlt to save space.
- if (Minimal)
+ if (SkipSPAttributes)
return;
- addSourceLine(SPDie, SP);
-
// Add the prototype if we have a prototype and we have a C like
// language.
uint16_t Language = getLanguage();
@@ -1241,6 +1254,8 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
constructSubprogramArguments(SPDie, Args);
}
+ addThrownTypes(SPDie, SP->getThrownTypes());
+
if (SP->isArtificial())
addFlag(SPDie, dwarf::DW_AT_artificial);
@@ -1526,18 +1541,27 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
return &StaticMemberDIE;
}
-void DwarfUnit::emitHeader(bool UseOffsets) {
+void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
// Emit size of content not including length itself
Asm->OutStreamer->AddComment("Length of Unit");
Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize());
Asm->OutStreamer->AddComment("DWARF version number");
- Asm->EmitInt16(DD->getDwarfVersion());
- Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
+ unsigned Version = DD->getDwarfVersion();
+ Asm->EmitInt16(Version);
+
+ // DWARF v5 reorders the address size and adds a unit type.
+ if (Version >= 5) {
+ Asm->OutStreamer->AddComment("DWARF Unit Type");
+ Asm->EmitInt8(UT);
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(Asm->MAI->getCodePointerSize());
+ }
// We share one abbreviations table across all units so it's always at the
// start of the section. Use a relocatable offset where needed to ensure
// linking doesn't invalidate that offset.
+ Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
if (UseOffsets)
Asm->EmitInt32(0);
@@ -1545,12 +1569,16 @@ void DwarfUnit::emitHeader(bool UseOffsets) {
Asm->emitDwarfSymbolReference(
TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);
- Asm->OutStreamer->AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+ if (Version <= 4) {
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(Asm->MAI->getCodePointerSize());
+ }
}
void DwarfTypeUnit::emitHeader(bool UseOffsets) {
- DwarfUnit::emitHeader(UseOffsets);
+ DwarfUnit::emitCommonHeader(UseOffsets,
+ DD->useSplitDwarf() ? dwarf::DW_UT_split_type
+ : dwarf::DW_UT_type);
Asm->OutStreamer->AddComment("Type Signature");
Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature));
Asm->OutStreamer->AddComment("Type DIE Offset");
@@ -1559,8 +1587,46 @@ void DwarfTypeUnit::emitHeader(bool UseOffsets) {
sizeof(Ty->getOffset()));
}
+DIE::value_iterator
+DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo) {
+ return Die.addValue(DIEValueAllocator, Attribute,
+ DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+ : dwarf::DW_FORM_data4,
+ new (DIEValueAllocator) DIEDelta(Hi, Lo));
+}
+
+DIE::value_iterator
+DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label, const MCSymbol *Sec) {
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ return addLabel(Die, Attribute,
+ DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+ : dwarf::DW_FORM_data4,
+ Label);
+ return addSectionDelta(Die, Attribute, Label, Sec);
+}
+
bool DwarfTypeUnit::isDwoUnit() const {
// Since there are no skeleton type units, all type units are dwo type units
// when split DWARF is being used.
return DD->useSplitDwarf();
}
+
+void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) {
+ getCU().addGlobalNameForTypeUnit(Name, Context);
+}
+
+void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
+ getCU().addGlobalTypeUnitType(Ty, Context);
+}
+
+const MCSymbol *DwarfUnit::getCrossSectionRelativeBaseAddress() const {
+ if (!Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ return nullptr;
+ if (isDwoUnit())
+ return nullptr;
+ return getSection()->getBeginSymbol();
+}
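
emitCommonHeader handles the DWARF v5 unit-header reshuffle: v5 inserts a unit-type byte and moves the address size ahead of the abbreviation-table offset, while v2-v4 keep the address size at the end. A byte-level sketch of the two layouts; the little-endian writer is a toy, but the field order follows the spec for 32-bit DWARF:

// v4 header: length, version, abbrev offset, address size.
// v5 header: length, version, unit type, address size, abbrev offset.
#include <cstdint>
#include <vector>

static void put16(std::vector<uint8_t> &B, uint16_t V) {
  B.push_back(V & 0xff);
  B.push_back(V >> 8);
}
static void put32(std::vector<uint8_t> &B, uint32_t V) {
  for (int I = 0; I < 4; ++I)
    B.push_back((V >> (8 * I)) & 0xff);
}

void emitUnitHeader(std::vector<uint8_t> &B, uint16_t Version,
                    uint8_t UnitType, uint8_t AddrSize, uint32_t AbbrevOff,
                    uint32_t Length) {
  put32(B, Length);        // unit_length (32-bit DWARF)
  put16(B, Version);       // version
  if (Version >= 5) {
    B.push_back(UnitType); // DW_UT_* tag, new in v5
    B.push_back(AddrSize); // address_size moved before the abbrev offset
  }
  put32(B, AbbrevOff);     // debug_abbrev_offset
  if (Version <= 4)
    B.push_back(AddrSize); // address_size trails the offset in v2-v4
}

int main() {
  std::vector<uint8_t> V4, V5;
  emitUnitHeader(V4, 4, 0, 8, 0, 0);
  emitUnitHeader(V5, 5, 0x01 /* DW_UT_compile */, 8, 0, 0);
  return (V4.size() == 11 && V5.size() == 12) ? 0 : 1;
}
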
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 8654d6f..4cc01b3 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -65,7 +65,7 @@ public:
//===----------------------------------------------------------------------===//
/// This dwarf writer support class manages information associated with a
/// source file.
- class DwarfUnit : public DIEUnit {
+class DwarfUnit : public DIEUnit {
protected:
/// MDNode for the compile unit.
const DICompileUnit *CUNode;
@@ -103,9 +103,10 @@ protected:
bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie);
-public:
- virtual ~DwarfUnit();
+ bool shareAcrossDWOCUs() const;
+ bool isShareableAcrossCUs(const DINode *D) const;
+public:
// Accessors.
AsmPrinter* getAsmPrinter() const { return Asm; }
uint16_t getLanguage() const { return CUNode->getSourceLanguage(); }
@@ -124,12 +125,12 @@ public:
std::string getParentContextString(const DIScope *Context) const;
/// Add a new global name to the compile unit.
- virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) {
- }
+ virtual void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) = 0;
/// Add a new global type to the compile unit.
virtual void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) {}
+ const DIScope *Context) = 0;
/// Returns the DIE map slot for the specified debug variable.
///
@@ -198,9 +199,6 @@ public:
/// Add a type's DW_AT_signature and set the declaration flag.
void addDIETypeSignature(DIE &Die, uint64_t Signature);
- /// Add an attribute containing the type signature for a unique identifier.
- void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
- StringRef Identifier);
/// Add block data.
void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block);
@@ -215,7 +213,6 @@ public:
void addSourceLine(DIE &Die, const DIGlobalVariable *G);
void addSourceLine(DIE &Die, const DISubprogram *SP);
void addSourceLine(DIE &Die, const DIType *Ty);
- void addSourceLine(DIE &Die, const DINamespace *NS);
void addSourceLine(DIE &Die, const DIObjCProperty *Ty);
/// Add constant value entry in variable DIE.
@@ -235,6 +232,9 @@ public:
/// Add template parameters in buffer.
void addTemplateParams(DIE &Buffer, DINodeArray TParams);
+ /// Add thrown types.
+ void addThrownTypes(DIE &Die, DINodeArray ThrownTypes);
+
// FIXME: Should be reformulated in terms of addComplexAddress.
/// Start with the address based on the location provided, and generate the
/// DWARF information necessary to find the actual Block variable (navigating
@@ -256,15 +256,12 @@ public:
DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);
void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
- bool Minimal = false);
+ bool SkipSPAttributes = false);
/// Find existing DIE or create new DIE for the given type.
DIE *getOrCreateTypeDIE(const MDNode *N);
/// Get context owner's DIE.
- DIE *createTypeDIE(const DICompositeType *Ty);
-
- /// Get context owner's DIE.
DIE *getOrCreateContextDIE(const DIScope *Context);
/// Construct DIEs for types that contain vtables.
@@ -282,17 +279,30 @@ public:
virtual unsigned getHeaderSize() const {
return sizeof(int16_t) + // DWARF version number
sizeof(int32_t) + // Offset Into Abbrev. Section
- sizeof(int8_t); // Pointer Size (in bytes)
+ sizeof(int8_t) + // Pointer Size (in bytes)
+ (DD->getDwarfVersion() >= 5 ? sizeof(int8_t)
+ : 0); // DWARF v5 unit type
}
/// Emit the header for this unit, not including the initial length field.
- virtual void emitHeader(bool UseOffsets);
+ virtual void emitHeader(bool UseOffsets) = 0;
virtual DwarfCompileUnit &getCU() = 0;
void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy);
+ /// Add a label delta attribute data and value.
+ DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo);
+
+ /// Add a Dwarf section label attribute data and value.
+ DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label,
+ const MCSymbol *Sec);
+
protected:
+ ~DwarfUnit();
+
/// Create new static data member DIE.
DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT);
@@ -306,6 +316,14 @@ protected:
return Ref.resolve();
}
+ /// If this is a named finished type then include it in the list of types for
+ /// the accelerator tables.
+ void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
+ const DIE &TyDIE);
+
+ /// Emit the common part of the header for this unit.
+ void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);
+
private:
void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);
void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy);
@@ -330,15 +348,11 @@ private:
/// Set D as anonymous type for index which can be reused later.
void setIndexTyDie(DIE *D) { IndexTyDie = D; }
- /// If this is a named finished type then include it in the list of types for
- /// the accelerator tables.
- void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
- const DIE &TyDIE);
-
virtual bool isDwoUnit() const = 0;
+ const MCSymbol *getCrossSectionRelativeBaseAddress() const override;
};
-class DwarfTypeUnit : public DwarfUnit {
+class DwarfTypeUnit final : public DwarfUnit {
uint64_t TypeSignature;
const DIE *Ty;
DwarfCompileUnit &CU;
@@ -354,12 +368,19 @@ public:
void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; }
void setType(const DIE *Ty) { this->Ty = Ty; }
+ /// Create a DIE for the given DICompositeType.
+ DIE *createTypeDIE(const DICompositeType *Ty);
+
/// Emit the header for this unit, not including the initial length field.
void emitHeader(bool UseOffsets) override;
unsigned getHeaderSize() const override {
return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature
sizeof(uint32_t); // Type DIE Offset
}
+ void addGlobalName(StringRef Name, const DIE &Die,
+ const DIScope *Context) override;
+ void addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
DwarfCompileUnit &getCU() override { return CU; }
};
} // end llvm namespace
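
In the header above, addGlobalName and addGlobalType become pure virtual, DwarfTypeUnit satisfies them by forwarding to its skeleton compile unit, and the destructor moves to protected so units are never destroyed through the base pointer. A compressed sketch of that shape with stand-in types:

// Pure-virtual hooks in the base, a final subclass forwarding to its CU.
#include <cstdio>
#include <string>

struct CompileUnit {
  void addGlobalNameForTypeUnit(const std::string &N) {
    std::printf("CU records %s\n", N.c_str());
  }
};

class Unit {
public:
  virtual void addGlobalName(const std::string &Name) = 0;

protected:
  ~Unit() = default; // subclasses control destruction
};

class TypeUnit final : public Unit {
  CompileUnit &CU;

public:
  explicit TypeUnit(CompileUnit &C) : CU(C) {}
  // Type units don't own global tables; delegate to the compile unit.
  void addGlobalName(const std::string &Name) override {
    CU.addGlobalNameForTypeUnit(Name);
  }
};

int main() {
  CompileUnit CU;
  TypeUnit TU(CU);
  TU.addGlobalName("foo");
}
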
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 0a4a7a0..e14d5be 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -309,7 +309,7 @@ computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
// If some instruction between the previous try-range and the end of the
// function may throw, create a call-site entry with no landing pad for the
// region following the try-range.
- if (SawPotentiallyThrowing && !IsSJLJ && LastLabel != nullptr) {
+ if (SawPotentiallyThrowing && !IsSJLJ) {
CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 };
CallSites.push_back(Site);
}
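
With the null check gone, a call-site entry whose begin label is null is understood to start at the function entry; the emitter substitutes the function-begin symbol later. A toy model of the gap row appended for a throwing region with no landing pad (the structs are illustrative, not LLVM's):

// A null BeginLabel stands for "start of function", which is why the
// LastLabel != nullptr guard could be dropped.
#include <cstdio>
#include <vector>

struct Label { const char *Name; };

struct CallSiteEntry {
  const Label *BeginLabel; // nullptr => function entry
  const Label *EndLabel;   // nullptr => function end
  const Label *LPad;       // nullptr => no landing pad
  unsigned Action;         // 0 => cleanup only / no handler
};

int main() {
  std::vector<CallSiteEntry> CallSites;
  // Region after the last try-range may throw but has no handler:
  CallSites.push_back({nullptr, nullptr, nullptr, 0});
  std::printf("%zu call-site rows\n", CallSites.size());
}
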
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 6a023b9..c579555 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===//
+//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,22 +13,20 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCStrategy.h"
#include "llvm/CodeGen/GCs.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -38,13 +36,12 @@ class ErlangGCPrinter : public GCMetadataPrinter {
public:
void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
};
-}
+
+} // end anonymous namespace
static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
X("erlang", "erlang-compatible garbage collector");
-void llvm::linkErlangGCPrinter() {}
-
void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
AsmPrinter &AP) {
MCStreamer &OS = *AP.OutStreamer;
@@ -121,3 +118,5 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
}
}
}
+
+void llvm::linkErlangGCPrinter() {}
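
The printer stays wired up through LLVM's static-registration idiom: a namespace-scope object whose constructor adds a factory to a registry during static initialization, before main() runs. A generic, self-contained sketch of that idiom; the Registry here is a toy, not GCMetadataPrinterRegistry:

// Static registration: the global X's constructor runs at load time and
// publishes a factory under a name, exactly like X("erlang", ...) above.
#include <cstdio>
#include <functional>
#include <map>
#include <string>

struct Printer {
  virtual void run() = 0;
  virtual ~Printer() = default;
};

struct Registry {
  static std::map<std::string, std::function<Printer *()>> &entries() {
    static std::map<std::string, std::function<Printer *()>> E;
    return E;
  }
  template <typename T> struct Add {
    Add(const char *Name, const char *) {
      entries()[Name] = [] { return static_cast<Printer *>(new T()); };
    }
  };
};

struct ErlangPrinter : Printer {
  void run() override { std::printf("emit frametable\n"); }
};

static Registry::Add<ErlangPrinter> X("erlang", "erlang-compatible gc");

int main() {
  Printer *P = Registry::entries()["erlang"]();
  P->run();
  delete P;
}
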
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index 8baee4d..035f1a0 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- OcamlGCPrinter.cpp - Ocaml frametable emitter ---------------------===//
+//===- OcamlGCPrinter.cpp - Ocaml frametable emitter ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,23 +11,27 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/GCs.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCs.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
using namespace llvm;
namespace {
@@ -37,7 +41,8 @@ public:
void beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
};
-}
+
+} // end anonymous namespace
static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter>
Y("ocaml", "ocaml 3.10-compatible collector");
@@ -50,7 +55,7 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
std::string SymName;
SymName += "caml";
size_t Letter = SymName.size();
- SymName.append(MId.begin(), find(MId, '.'));
+ SymName.append(MId.begin(), llvm::find(MId, '.'));
SymName += "__";
SymName += Id;
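
Writing the call as llvm::find pins it to LLVM's range-based helper rather than leaving unqualified lookup plus argument-dependent lookup to choose, presumably to head off an overload ambiguity or a surprising match. A generic, self-contained illustration of how ADL can redirect an unqualified call (toy namespaces, not the overload set actually involved here):

// ADL finds lib::find through the argument type, and the non-template
// outcompetes the lexically visible template; qualifying removes the risk.
#include <cstdio>

namespace lib {
struct Str {};
int find(const Str &, char) { return 1; } // found via ADL on lib::Str
} // namespace lib

namespace app {
template <typename R, typename T> int find(const R &, const T &) { return 2; }

int demo() {
  lib::Str S;
  int A = find(S, '.');      // unqualified: lib::find wins, returns 1
  int B = app::find(S, '.'); // qualified: always the range helper, returns 2
  return A * 10 + B;
}
} // namespace app

int main() { std::printf("%d\n", app::demo()); } // prints 12
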
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index 9d7c96a..5d485f2 100644
--- a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -14,6 +14,8 @@
#include "WinException.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -29,8 +31,6 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCWin64EH.h"
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -68,7 +68,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
const Function *F = MF->getFunction();
- shouldEmitMoves = Asm->needsSEHMoves();
+ shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI();
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
unsigned PerEncoding = TLOF.getPersonalityEncoding();
@@ -94,14 +94,14 @@ void WinException::beginFunction(const MachineFunction *MF) {
// If we're not using CFI, we don't want the CFI or the personality, but we
// might want EH tables if we had EH pads.
- if (!Asm->MAI->usesWindowsCFI() || (!MF->hasWinCFI() && !PerFn)) {
+ if (!Asm->MAI->usesWindowsCFI()) {
if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) {
// If this is 32-bit SEH and we don't have any funclets (really invokes),
// make sure we emit the parent offset label. Some unreferenced filter
// functions may still refer to it.
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName);
}
shouldEmitLSDA = hasEHFunclets;
@@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
// their funclet entry block's number.
const MachineFunction *MF = MBB->getParent();
const Function *F = MF->getFunction();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCContext &Ctx = MF->getContext();
StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
@@ -252,7 +252,7 @@ void WinException::endFunclet() {
!CurrentFuncletEntry->isCleanupFuncletEntry()) {
// If this is a C++ catch funclet (or the parent function),
// emit a reference to the LSDA for the parent function.
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol(
Twine("$cppxdata$", FuncLinkageName));
Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4);
@@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
// Emit a label assignment with the SEH frame offset so we can use it for
// llvm.x86.seh.recoverfp.
StringRef FLinkageName =
- GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName());
MCSymbol *ParentFrameOffset =
Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
const MCExpr *MCOffset =
@@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
auto &OS = *Asm->OutStreamer;
const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
- StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable;
MCSymbol *FuncInfoXData = nullptr;
@@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
MCStreamer &OS = *Asm->OutStreamer;
const Function *F = MF->getFunction();
- StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName());
bool VerboseAsm = OS.isVerboseAsm();
auto AddComment = [&](const Twine &Comment) {
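
dropLLVMManglingEscape, which replaces getRealLinkageName throughout this file, strips the '\1' byte LLVM prepends to names that must bypass the assembler's usual mangling. A one-function sketch of the documented behavior; the implementation shown is illustrative, not the LLVM source:

// Names beginning with the '\1' mangling-escape byte are emitted verbatim,
// so the escape itself is dropped before the name is used.
#include <cassert>
#include <string>

static std::string dropManglingEscape(const std::string &Name) {
  if (!Name.empty() && Name[0] == '\1')
    return Name.substr(1);
  return Name;
}

int main() {
  assert(dropManglingEscape("\1_func@4") == "_func@4");
  assert(dropManglingEscape("plain") == "plain");
  return 0;
}
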
diff --git a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
index bf5cf10..aa9c8e9 100644
--- a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/AtomicExpandUtils.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -35,20 +36,17 @@ using namespace llvm;
namespace {
class AtomicExpand: public FunctionPass {
- const TargetMachine *TM;
const TargetLowering *TLI;
public:
static char ID; // Pass identification, replacement for typeid
- explicit AtomicExpand(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM), TLI(nullptr) {
+ AtomicExpand() : FunctionPass(ID), TLI(nullptr) {
initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -98,12 +96,10 @@ namespace {
char AtomicExpand::ID = 0;
char &llvm::AtomicExpandID = AtomicExpand::ID;
-INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
- false, false)
+INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions",
+ false, false)
-FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
- return new AtomicExpand(TM);
-}
+FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); }
namespace {
// Helper functions to retrieve the size of atomic instructions.
@@ -173,9 +169,14 @@ bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
} // end anonymous namespace
bool AtomicExpand::runOnFunction(Function &F) {
- if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<TargetMachine>();
+ if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
return false;
- TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ TLI = TM.getSubtargetImpl(F)->getTargetLowering();
SmallVector<Instruction *, 1> AtomicInsts;
@@ -224,22 +225,16 @@ bool AtomicExpand::runOnFunction(Function &F) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
- bool IsStore, IsLoad;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = false;
- IsLoad = true;
} else if (SI && isReleaseOrStronger(SI->getOrdering())) {
FenceOrdering = SI->getOrdering();
SI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = true;
- IsLoad = false;
} else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
isAcquireOrStronger(RMWI->getOrdering()))) {
FenceOrdering = RMWI->getOrdering();
RMWI->setOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
} else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
(isReleaseOrStronger(CASI->getSuccessOrdering()) ||
isAcquireOrStronger(CASI->getSuccessOrdering()))) {
@@ -250,11 +245,10 @@ bool AtomicExpand::runOnFunction(Function &F) {
FenceOrdering = CASI->getSuccessOrdering();
CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
CASI->setFailureOrdering(AtomicOrdering::Monotonic);
- IsStore = IsLoad = true;
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering);
}
}
@@ -320,13 +314,12 @@ bool AtomicExpand::runOnFunction(Function &F) {
return MadeChange;
}
-bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
- bool IsStore, bool IsLoad) {
+bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) {
IRBuilder<> Builder(I);
- auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
- auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
// The trailing fence is emitted before the instruction instead of after
// because there is no easy way of setting Builder insertion point after
// an instruction. So we must erase it from the BB, and insert it back
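
Since the fence hooks now receive the instruction itself instead of the old
IsStore/IsLoad flags (see the updated calls above), targets can key the fence
choice off the concrete atomic operation. A hedged sketch of an override under
the new signature (MyTargetLowering and the fence policy are illustrative):

    Instruction *MyTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                    Instruction *Inst,
                                                    AtomicOrdering Ord) const {
      // The instruction lets a target distinguish loads, stores, rmw and
      // cmpxchg without the removed boolean flags.
      if (isa<StoreInst>(Inst) && isReleaseOrStronger(Ord))
        return Builder.CreateFence(AtomicOrdering::Release);
      return nullptr; // no fence needed in the remaining cases
    }
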
@@ -368,7 +361,7 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
auto *NewLI = Builder.CreateLoad(NewAddr);
NewLI->setAlignment(LI->getAlignment());
NewLI->setVolatile(LI->isVolatile());
- NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
+ NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
@@ -451,7 +444,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
NewSI->setAlignment(SI->getAlignment());
NewSI->setVolatile(SI->isVolatile());
- NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
+ NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
SI->eraseFromParent();
return NewSI;
@@ -808,7 +801,7 @@ void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
- CI->getFailureOrdering(), CI->getSynchScope());
+ CI->getFailureOrdering(), CI->getSyncScopeID());
NewCI->setVolatile(CI->isVolatile());
// When we're building a strong cmpxchg, we need a loop, so you
// might think we could use a weak cmpxchg inside. But, using strong
@@ -931,7 +924,7 @@ AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *
auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
CI->getSuccessOrdering(),
CI->getFailureOrdering(),
- CI->getSynchScope());
+ CI->getSyncScopeID());
NewCI->setVolatile(CI->isVolatile());
NewCI->setWeak(CI->isWeak());
DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
@@ -1048,8 +1041,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(StartBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1064,8 +1056,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(ReleasingStoreBB);
if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
- TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitLeadingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(TryStoreBB);
Builder.SetInsertPoint(TryStoreBB);
@@ -1094,8 +1085,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// necessary.
Builder.SetInsertPoint(SuccessBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, SuccessOrder);
Builder.CreateBr(ExitBB);
Builder.SetInsertPoint(NoStoreBB);
@@ -1107,8 +1097,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(FailureBB);
if (ShouldInsertFencesForAtomic)
- TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
- /*IsLoad=*/true);
+ TLI->emitTrailingFence(Builder, CI, FailureOrder);
Builder.CreateBr(ExitBB);
// Finally, we have control-flow based knowledge of whether the cmpxchg
@@ -1532,7 +1521,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
Type *ResultTy;
SmallVector<Value *, 6> Args;
- AttributeSet Attr;
+ AttributeList Attr;
// 'size' argument.
if (!UseSizedLibcall) {
@@ -1593,7 +1582,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// Now, the return type.
if (CASExpected) {
ResultTy = Type::getInt1Ty(Ctx);
- Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
+ Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
} else if (HasResult && UseSizedLibcall)
ResultTy = SizedIntTy;
else
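
The AttributeSet -> AttributeList change here is part of the tree-wide rename;
the attribute indices (ReturnIndex and friends) move with it. A sketch of the
adjusted call-site pattern, assuming Ctx, M and FnType are in scope and using
a hypothetical libcall name:

    AttributeList Attr;
    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
    // Function declarations take the renamed type as well:
    Constant *Libcall = M->getOrInsertFunction("__atomic_helper", FnType, Attr);
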
diff --git a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
index a67e194..be93ff0 100644
--- a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -15,17 +15,15 @@
///
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CommandLine.h"
#include <utility>
using namespace llvm;
-#define DEBUG_TYPE "basictti"
-
// This flag is used by the template base class for BasicTTIImpl, and here to
// provide a definition.
cl::opt<unsigned>
diff --git a/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp b/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp
new file mode 100644
index 0000000..2c41b59
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BranchCoalescing.cpp
@@ -0,0 +1,758 @@
+//===-- CoalesceBranches.cpp - Coalesce blocks with the same condition ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Coalesce basic blocks guarded by the same branch condition into a single
+/// basic block.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "branch-coalescing"
+
+static cl::opt<cl::boolOrDefault>
+ EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden,
+ cl::desc("enable coalescing of duplicate branches"));
+
+STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced");
+STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged");
+STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced");
+
+//===----------------------------------------------------------------------===//
+// BranchCoalescing
+//===----------------------------------------------------------------------===//
+///
+/// Improve scheduling by coalescing branches that depend on the same condition.
+/// This pass looks for blocks that are guarded by the same branch condition
+/// and attempts to merge the blocks together. Such opportunities arise from
+/// the expansion of select statements in the IR.
+///
+/// For example, consider the following LLVM IR:
+///
+/// %test = icmp eq i32 %x 0
+/// %tmp1 = select i1 %test, double %a, double 2.000000e-03
+/// %tmp2 = select i1 %test, double %b, double 5.000000e-03
+///
+/// This IR expands to the following machine code on PowerPC:
+///
+/// BB#0: derived from LLVM BB %entry
+/// Live Ins: %F1 %F3 %X6
+/// <SNIP1>
+/// %vreg0<def> = COPY %F1; F8RC:%vreg0
+/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+/// BCC 76, %vreg5, <BB#2>; CRRC:%vreg5
+/// Successors according to CFG: BB#1(?%) BB#2(?%)
+///
+/// BB#1: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0
+/// Successors according to CFG: BB#2(?%)
+///
+/// BB#2: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0 BB#1
+/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+/// F8RC:%vreg9,%vreg8,%vreg0
+/// <SNIP2>
+/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+/// Successors according to CFG: BB#3(?%) BB#4(?%)
+///
+/// BB#3: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#2
+/// Successors according to CFG: BB#4(?%)
+///
+/// BB#4: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#2 BB#3
+/// %vreg13<def> = PHI %vreg12, <BB#3>, %vreg2, <BB#2>;
+/// F8RC:%vreg13,%vreg12,%vreg2
+/// <SNIP3>
+/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// When this pattern is detected, branch coalescing will try to collapse
+/// it by moving code in BB#2 to BB#0 and/or BB#4 and removing BB#3.
+///
+/// If all conditions are met, the IR should collapse to:
+///
+/// BB#0: derived from LLVM BB %entry
+/// Live Ins: %F1 %F3 %X6
+/// <SNIP1>
+/// %vreg0<def> = COPY %F1; F8RC:%vreg0
+/// %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+/// %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+/// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+/// <SNIP2>
+/// BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+/// Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%)
+/// BB#4(0x55555554 / 0x80000000 = 66.67%)
+///
+/// BB#1: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0
+/// Successors according to CFG: BB#4(0x40000000 / 0x80000000 = 50.00%)
+///
+/// BB#4: derived from LLVM BB %entry
+/// Predecessors according to CFG: BB#0 BB#1
+/// %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+/// F8RC:%vreg9,%vreg8,%vreg0
+/// %vreg13<def> = PHI %vreg12, <BB#1>, %vreg2, <BB#0>;
+/// F8RC:%vreg13,%vreg12,%vreg2
+/// <SNIP3>
+/// BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// Branch coalescing does not split blocks; it moves everything in the same
+/// direction, ensuring it does not break use/definition semantics.
+///
+/// PHI nodes and their corresponding use instructions are moved to the
+/// successor block if there are no uses of them within the successor block's
+/// PHI nodes. PHI node ordering cannot be assumed.
+///
+/// Non-PHI instructions can be moved up to the predecessor basic block, or
+/// down to the successor basic block following any PHI instructions. Whether
+/// an instruction moves up or down depends on whether the register(s) it
+/// defines are used in the current block or in any PHI instructions at the
+/// beginning of the successor block.
+
+namespace {
+
+class BranchCoalescing : public MachineFunctionPass {
+ struct CoalescingCandidateInfo {
+ MachineBasicBlock *BranchBlock; // Block containing the branch
+ MachineBasicBlock *BranchTargetBlock; // Block branched to
+ MachineBasicBlock *FallThroughBlock; // Fall-through if branch not taken
+ SmallVector<MachineOperand, 4> Cond;
+ bool MustMoveDown;
+ bool MustMoveUp;
+
+ CoalescingCandidateInfo();
+ void clear();
+ };
+
+ MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
+ void initialize(MachineFunction &F);
+ bool canCoalesceBranch(CoalescingCandidateInfo &Cand);
+ bool identicalOperands(ArrayRef<MachineOperand> OperandList1,
+ ArrayRef<MachineOperand> OperandList2) const;
+ bool validateCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const;
+
+ static bool isBranchCoalescingEnabled() {
+ return EnableBranchCoalescing == cl::BOU_TRUE;
+ }
+
+public:
+ static char ID;
+
+ BranchCoalescing() : MachineFunctionPass(ID) {
+ initializeBranchCoalescingPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Branch Coalescing"; }
+
+ bool mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion);
+ bool canMoveToBeginning(const MachineInstr &MI,
+ const MachineBasicBlock &MBB) const;
+ bool canMoveToEnd(const MachineInstr &MI,
+ const MachineBasicBlock &MBB) const;
+ bool canMerge(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const;
+ void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB,
+ MachineBasicBlock *TargetRegionMBB);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char BranchCoalescing::ID = 0;
+char &llvm::BranchCoalescingID = BranchCoalescing::ID;
+
+INITIALIZE_PASS_BEGIN(BranchCoalescing, DEBUG_TYPE,
+ "Branch Coalescing", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(BranchCoalescing, DEBUG_TYPE, "Branch Coalescing",
+ false, false)
+
+BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo()
+ : BranchBlock(nullptr), BranchTargetBlock(nullptr),
+ FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {}
+
+void BranchCoalescing::CoalescingCandidateInfo::clear() {
+ BranchBlock = nullptr;
+ BranchTargetBlock = nullptr;
+ FallThroughBlock = nullptr;
+ Cond.clear();
+ MustMoveDown = false;
+ MustMoveUp = false;
+}
+
+void BranchCoalescing::initialize(MachineFunction &MF) {
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ TII = MF.getSubtarget().getInstrInfo();
+ MRI = &MF.getRegInfo();
+}
+
+///
+/// Analyze the branch statement for the given candidate to determine if it
+/// can be coalesced. If the branch can be coalesced, the BranchTargetBlock
+/// and the FallThroughBlock are recorded in the specified Candidate.
+///
+///\param[in,out] Cand The coalescing candidate to analyze
+///\return true if and only if the branch can be coalesced, false otherwise
+///
+bool BranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
+ DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber()
+ << " can be coalesced:");
+ MachineBasicBlock *FalseMBB = nullptr;
+
+ if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB,
+ Cand.Cond)) {
+ DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
+ return false;
+ }
+
+ for (auto &I : Cand.BranchBlock->terminators()) {
+ DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
+ if (!I.isBranch())
+ continue;
+
+ if (I.getNumOperands() != I.getNumExplicitOperands()) {
+ DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I
+ << "\n");
+ return false;
+ }
+ }
+
+ if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) {
+ DEBUG(dbgs() << "EH Pad - skip\n");
+ return false;
+ }
+
+  // For now only consider triangles (i.e., BranchTargetBlock is set,
+ // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock)
+ if (!Cand.BranchTargetBlock || FalseMBB ||
+ !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) {
+ DEBUG(dbgs() << "Does not form a triangle - skip\n");
+ return false;
+ }
+
+ // Ensure there are only two successors
+ if (Cand.BranchBlock->succ_size() != 2) {
+ DEBUG(dbgs() << "Does not have 2 successors - skip\n");
+ return false;
+ }
+
+ // Sanity check - the block must be able to fall through
+ assert(Cand.BranchBlock->canFallThrough() &&
+ "Expecting the block to fall through!");
+
+ // We have already ensured there are exactly two successors to
+ // BranchBlock and that BranchTargetBlock is a successor to BranchBlock.
+  // Ensure the single fall-through block is empty.
+ MachineBasicBlock *Succ =
+ (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock)
+ ? *Cand.BranchBlock->succ_rbegin()
+ : *Cand.BranchBlock->succ_begin();
+
+ assert(Succ && "Expecting a valid fall-through block\n");
+
+ if (!Succ->empty()) {
+ DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
+ return false;
+ }
+
+ if (!Succ->isSuccessor(Cand.BranchTargetBlock)) {
+ DEBUG(dbgs()
+ << "Successor of fall through block is not branch taken block\n");
+ return false;
+ }
+
+ Cand.FallThroughBlock = Succ;
+ DEBUG(dbgs() << "Valid Candidate\n");
+ return true;
+}
+
+///
+/// Determine if the two operand lists are identical
+///
+/// \param[in] OpList1 operand list
+/// \param[in] OpList2 operand list
+/// \return true if and only if the operand lists are identical
+///
+bool BranchCoalescing::identicalOperands(
+ ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const {
+
+ if (OpList1.size() != OpList2.size()) {
+ DEBUG(dbgs() << "Operand list is different size\n");
+ return false;
+ }
+
+ for (unsigned i = 0; i < OpList1.size(); ++i) {
+ const MachineOperand &Op1 = OpList1[i];
+ const MachineOperand &Op2 = OpList2[i];
+
+ DEBUG(dbgs() << "Op1: " << Op1 << "\n"
+ << "Op2: " << Op2 << "\n");
+
+ if (Op1.isIdenticalTo(Op2)) {
+ DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
+ continue;
+ }
+
+ // If the operands are not identical, but are registers, check to see if the
+ // definition of the register produces the same value. If they produce the
+ // same value, consider them to be identical.
+ if (Op1.isReg() && Op2.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(Op1.getReg()) &&
+ TargetRegisterInfo::isVirtualRegister(Op2.getReg())) {
+ MachineInstr *Op1Def = MRI->getVRegDef(Op1.getReg());
+ MachineInstr *Op2Def = MRI->getVRegDef(Op2.getReg());
+ if (TII->produceSameValue(*Op1Def, *Op2Def, MRI)) {
+ DEBUG(dbgs() << "Op1Def: " << *Op1Def << " and " << *Op2Def
+ << " produce the same value!\n");
+ } else {
+ DEBUG(dbgs() << "Operands produce different values\n");
+ return false;
+ }
+ } else {
+ DEBUG(dbgs() << "The operands are not provably identical.\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+///
+/// Moves ALL PHI instructions in SourceMBB to the beginning of TargetMBB
+/// and updates them to refer to the new block. PHI node ordering
+/// cannot be assumed, so it does not matter where the PHI instructions
+/// are moved to in TargetMBB.
+///
+/// \param[in] SourceMBB block to move PHI instructions from
+/// \param[in] TargetMBB block to move PHI instructions to
+///
+void BranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB,
+ MachineBasicBlock *TargetMBB) {
+
+ MachineBasicBlock::iterator MI = SourceMBB->begin();
+ MachineBasicBlock::iterator ME = SourceMBB->getFirstNonPHI();
+
+ if (MI == ME) {
+ DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
+ return;
+ }
+
+ // Update all PHI instructions in SourceMBB and move to top of TargetMBB
+  for (MachineBasicBlock::iterator Iter = MI; Iter != ME; ++Iter) {
+ MachineInstr &PHIInst = *Iter;
+ for (unsigned i = 2, e = PHIInst.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = PHIInst.getOperand(i);
+ if (MO.getMBB() == SourceMBB)
+ MO.setMBB(TargetMBB);
+ }
+ }
+ TargetMBB->splice(TargetMBB->begin(), SourceMBB, MI, ME);
+}
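
The operand walk above relies on the machine-level PHI layout: the def comes
first, followed by (value, predecessor-block) pairs. A worked illustration
using the notation from the file header:

    // %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>
    //   operand 0     : %vreg9 (the def)
    //   operands 1, 2 : %vreg8, BB#1 (first incoming pair)
    //   operands 3, 4 : %vreg0, BB#0 (second incoming pair)
    // Block operands therefore sit at the even indices 2, 4, ..., which is
    // exactly what the "i = 2; i += 2" stride in the loop visits.
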
+
+///
+/// This function checks if MI can be moved to the beginning of TargetMBB,
+/// following any PHI instructions. An instruction can be moved to the
+/// beginning of TargetMBB if there are no uses of it within TargetMBB's
+/// PHI nodes.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to beginning of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToBeginning(
+    const MachineInstr &MI, const MachineBasicBlock &TargetMBB) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Def : MI.defs()) { // Looking at Def
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == &TargetMBB) {
+ DEBUG(dbgs() << " *** used in a PHI -- cannot move ***\n");
+ return false;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the beginning.\n");
+ return true;
+}
+
+///
+/// This function checks if MI can be moved to the end of TargetMBB,
+/// immediately before the first terminator. An instruction can be moved to
+/// the end of TargetMBB if no PHI node within MI's own MBB defines a value
+/// that MI uses.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to end of TargetMBB,
+/// false otherwise.
+///
+bool BranchCoalescing::canMoveToEnd(
+    const MachineInstr &MI, const MachineBasicBlock &TargetMBB) const {
+
+ DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+ << TargetMBB.getNumber() << "\n");
+
+ for (auto &Use : MI.uses()) {
+ if (Use.isReg() && TargetRegisterInfo::isVirtualRegister(Use.getReg())) {
+ MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
+ if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
+ DEBUG(dbgs() << " *** Cannot move this instruction ***\n");
+ return false;
+ } else {
+ DEBUG(dbgs() << " *** def is in another block -- safe to move!\n");
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Safe to move to the end.\n");
+ return true;
+}
+
+///
+/// This method checks to ensure the two coalescing candidates follow the
+/// expected pattern required for coalescing.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion; false otherwise.
+///
+bool BranchCoalescing::validateCandidates(
+ CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+
+ if (TargetRegion.BranchTargetBlock != SourceRegion.BranchBlock)
+ llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion");
+ else if (!MDT->dominates(TargetRegion.BranchBlock, SourceRegion.BranchBlock))
+ llvm_unreachable("Expecting TargetRegion to dominate SourceRegion");
+ else if (!MPDT->dominates(SourceRegion.BranchBlock, TargetRegion.BranchBlock))
+ llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion");
+ else if (!TargetRegion.FallThroughBlock->empty() ||
+ !SourceRegion.FallThroughBlock->empty())
+ llvm_unreachable("Expecting fall-through blocks to be empty");
+
+ return true;
+}
+
+///
+/// This method determines whether the two coalescing candidates can be merged.
+/// In order to be merged, every instruction must be able to either
+/// 1. move to the beginning of the SourceRegion.BranchTargetBlock, or
+/// 2. move to the end of the TargetRegion.BranchBlock.
+/// Merging involves moving the instructions out of the
+/// TargetRegion.BranchTargetBlock (which is also SourceRegion.BranchBlock).
+///
+/// This function first tries to move instructions from the
+/// TargetRegion.BranchTargetBlock down, to the beginning of the
+/// SourceRegion.BranchTargetBlock. This is not possible if any register
+/// defined in TargetRegion.BranchTargetBlock is used in a PHI node in the
+/// SourceRegion.BranchTargetBlock. In that case, check whether the
+/// instruction can be moved up, to the end of the TargetRegion.BranchBlock
+/// (immediately before the branch instruction). If it can move neither up
+/// nor down, the blocks cannot be merged.
+///
+/// Note that there is no analysis for moving instructions past the fall-through
+/// blocks because they are confirmed to be empty. An assertion fails if
+/// they are not.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+/// into a block in TargetRegion, false otherwise.
+///
+bool BranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) const {
+ if (!validateCandidates(SourceRegion, TargetRegion))
+ return false;
+
+ // Walk through PHI nodes first and see if they force the merge into the
+ // SourceRegion.BranchTargetBlock.
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->instr_begin(),
+ E = SourceRegion.BranchBlock->getFirstNonPHI();
+ I != E; ++I) {
+ for (auto &Def : I->defs())
+ for (auto &Use : MRI->use_instructions(Def.getReg())) {
+ if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) {
+ DEBUG(dbgs() << "PHI " << *I << " defines register used in another "
+ "PHI within branch target block -- can't merge\n");
+ NumPHINotMoved++;
+ return false;
+ }
+ if (Use.getParent() == SourceRegion.BranchBlock) {
+ DEBUG(dbgs() << "PHI " << *I
+ << " defines register used in this "
+ "block -- all must move down\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+ }
+
+  // Walk through the remaining instructions to see whether they should be
+  // merged into TargetRegion.BranchBlock (up) or
+  // SourceRegion.BranchTargetBlock (down).
+ for (MachineBasicBlock::iterator
+ I = SourceRegion.BranchBlock->getFirstNonPHI(),
+ E = SourceRegion.BranchBlock->end();
+ I != E; ++I) {
+ if (!canMoveToBeginning(*I, *SourceRegion.BranchTargetBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move down - must move up!\n");
+ SourceRegion.MustMoveUp = true;
+ }
+ if (!canMoveToEnd(*I, *TargetRegion.BranchBlock)) {
+ DEBUG(dbgs() << "Instruction " << *I
+ << " cannot move up - must move down!\n");
+ SourceRegion.MustMoveDown = true;
+ }
+ }
+
+  return !(SourceRegion.MustMoveUp && SourceRegion.MustMoveDown);
+}
+
+/// Merge the instructions from SourceRegion.BranchBlock,
+/// SourceRegion.BranchTargetBlock, and SourceRegion.FallThroughBlock into
+/// TargetRegion.BranchBlock, TargetRegion.BranchTargetBlock and
+/// TargetRegion.FallThroughBlock respectively.
+///
+/// The successors for blocks in TargetRegion will be updated to use the
+/// successors from blocks in SourceRegion. Finally, the blocks in SourceRegion
+/// will be removed from the function.
+///
+/// A region consists of a BranchBlock, a FallThroughBlock, and a
+/// BranchTargetBlock. Branch coalescing works on patterns where the
+/// TargetRegion's BranchTargetBlock is also the SourceRegion's
+/// BranchBlock.
+///
+/// Before mergeCandidates:
+///
+/// +---------------------------+
+/// | TargetRegion.BranchBlock |
+/// +---------------------------+
+/// / |
+/// / +--------------------------------+
+/// | | TargetRegion.FallThroughBlock |
+/// \ +--------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | TargetRegion.BranchTargetBlock |
+/// | SourceRegion.BranchBlock |
+/// +----------------------------------+
+/// / |
+/// / +--------------------------------+
+/// | | SourceRegion.FallThroughBlock |
+/// \ +--------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | SourceRegion.BranchTargetBlock |
+/// +----------------------------------+
+///
+/// After mergeCandidates:
+///
+/// +-----------------------------+
+/// | TargetRegion.BranchBlock |
+/// | SourceRegion.BranchBlock |
+/// +-----------------------------+
+/// / |
+/// / +---------------------------------+
+/// | | TargetRegion.FallThroughBlock |
+/// | | SourceRegion.FallThroughBlock |
+/// \ +---------------------------------+
+/// \ |
+/// +----------------------------------+
+/// | SourceRegion.BranchTargetBlock |
+/// +----------------------------------+
+///
+/// \param[in] SourceRegion The candidate to move blocks from
+/// \param[in] TargetRegion The candidate to move blocks to
+///
+bool BranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+ CoalescingCandidateInfo &TargetRegion) {
+
+ if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) {
+ llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!");
+ return false;
+ }
+
+ if (!validateCandidates(SourceRegion, TargetRegion))
+ return false;
+
+ // Start the merging process by first handling the BranchBlock.
+ // Move any PHIs in SourceRegion.BranchBlock down to the branch-taken block
+ moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+
+ // Move remaining instructions in SourceRegion.BranchBlock into
+ // TargetRegion.BranchBlock
+ MachineBasicBlock::iterator firstInstr =
+ SourceRegion.BranchBlock->getFirstNonPHI();
+ MachineBasicBlock::iterator lastInstr =
+ SourceRegion.BranchBlock->getFirstTerminator();
+
+ MachineBasicBlock *Source = SourceRegion.MustMoveDown
+ ? SourceRegion.BranchTargetBlock
+ : TargetRegion.BranchBlock;
+
+ MachineBasicBlock::iterator Target =
+ SourceRegion.MustMoveDown
+ ? SourceRegion.BranchTargetBlock->getFirstNonPHI()
+ : TargetRegion.BranchBlock->getFirstTerminator();
+
+ Source->splice(Target, SourceRegion.BranchBlock, firstInstr, lastInstr);
+
+  // Once the PHIs and other instructions have been moved, we need to clean
+  // up the control flow.
+
+ // Remove SourceRegion.FallThroughBlock before transferring successors of
+ // SourceRegion.BranchBlock to TargetRegion.BranchBlock.
+ SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock);
+ TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs(
+ SourceRegion.BranchBlock);
+ // Update branch in TargetRegion.BranchBlock to jump to
+ // SourceRegion.BranchTargetBlock
+ // In this case, TargetRegion.BranchTargetBlock == SourceRegion.BranchBlock.
+ TargetRegion.BranchBlock->ReplaceUsesOfBlockWith(
+ SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+ // Remove the branch statement(s) in SourceRegion.BranchBlock
+ MachineBasicBlock::iterator I =
+ SourceRegion.BranchBlock->terminators().begin();
+ while (I != SourceRegion.BranchBlock->terminators().end()) {
+ MachineInstr &CurrInst = *I;
+ ++I;
+ if (CurrInst.isBranch())
+ CurrInst.eraseFromParent();
+ }
+
+ // Fall-through block should be empty since this is part of the condition
+ // to coalesce the branches.
+ assert(TargetRegion.FallThroughBlock->empty() &&
+ "FallThroughBlocks should be empty!");
+
+ // Transfer successor information and move PHIs down to the
+ // branch-taken block.
+ TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs(
+ SourceRegion.FallThroughBlock);
+ TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock);
+
+ // Remove the blocks from the function.
+ assert(SourceRegion.BranchBlock->empty() &&
+ "Expecting branch block to be empty!");
+ SourceRegion.BranchBlock->eraseFromParent();
+
+ assert(SourceRegion.FallThroughBlock->empty() &&
+ "Expecting fall-through block to be empty!\n");
+ SourceRegion.FallThroughBlock->eraseFromParent();
+
+ NumBlocksCoalesced++;
+ return true;
+}
+
+bool BranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
+
+ if (skipFunction(*MF.getFunction()) || MF.empty() ||
+ !isBranchCoalescingEnabled())
+ return false;
+
+ bool didSomething = false;
+
+ DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+ initialize(MF);
+
+ DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+
+ CoalescingCandidateInfo Cand1, Cand2;
+ // Walk over blocks and find candidates to merge
+ // Continue trying to merge with the first candidate found, as long as merging
+  // is successful.
+ for (MachineBasicBlock &MBB : MF) {
+ bool MergedCandidates = false;
+ do {
+ MergedCandidates = false;
+ Cand1.clear();
+ Cand2.clear();
+
+ Cand1.BranchBlock = &MBB;
+
+ // If unable to coalesce the branch, then continue to next block
+ if (!canCoalesceBranch(Cand1))
+ break;
+
+ Cand2.BranchBlock = Cand1.BranchTargetBlock;
+ if (!canCoalesceBranch(Cand2))
+ break;
+
+ // Sanity check
+ // The branch-taken block of the second candidate should post-dominate the
+ // first candidate
+ assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) &&
+ "Branch-taken block should post-dominate first candidate");
+
+ if (!identicalOperands(Cand1.Cond, Cand2.Cond)) {
+ DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and "
+ << Cand2.BranchBlock->getNumber()
+ << " have different branches\n");
+ break;
+ }
+ if (!canMerge(Cand2, Cand1)) {
+ DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand2.BranchBlock->getNumber() << "\n");
+ NumBlocksNotCoalesced++;
+ continue;
+ }
+ DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
+ << " and " << Cand1.BranchTargetBlock->getNumber() << "\n");
+ MergedCandidates = mergeCandidates(Cand2, Cand1);
+ if (MergedCandidates)
+ didSomething = true;
+
+ DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n");
+ } while (MergedCandidates);
+ }
+
+#ifndef NDEBUG
+ // Verify MF is still valid after branch coalescing
+ if (didSomething)
+ MF.verify(nullptr, "Error in code produced by branch coalescing");
+#endif // NDEBUG
+
+ DEBUG(dbgs() << "Finished Branch Coalescing\n");
+ return didSomething;
+}
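
Note the pass is registered but gated off by default (the flag above defaults
to cl::BOU_UNSET, and isBranchCoalescingEnabled() only returns true for an
explicit cl::BOU_TRUE). A hedged sketch of how a target pipeline might
schedule it once enabled (MyTargetPassConfig and the insertion point are
illustrative):

    void MyTargetPassConfig::addMachineSSAOptimization() {
      TargetPassConfig::addMachineSSAOptimization();
      // The pass still checks -enable-branch-coalesce itself, so adding it
      // unconditionally here is safe.
      addPass(&BranchCoalescingID);
    }
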
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
index 6fba161..3c439e6 100644
--- a/contrib/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
@@ -1,4 +1,4 @@
-//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===//
+//===- BranchFolding.cpp - Fold machine code branch instructions ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,37 +18,55 @@
//===----------------------------------------------------------------------===//
#include "BranchFolding.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
using namespace llvm;
-#define DEBUG_TYPE "branchfolding"
+#define DEBUG_TYPE "branch-folder"
STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
STATISTIC(NumBranchOpts, "Number of branches optimized");
STATISTIC(NumTailMerge , "Number of block tails merged");
STATISTIC(NumHoist , "Number of times common instructions are hoisted");
+STATISTIC(NumTailCalls, "Number of tail calls optimized");
static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
cl::init(cl::BOU_UNSET), cl::Hidden);
@@ -67,10 +85,12 @@ TailMergeSize("tail-merge-size",
cl::init(3), cl::Hidden);
namespace {
+
/// BranchFolderPass - Wrap branch folder in a machine function pass.
class BranchFolderPass : public MachineFunctionPass {
public:
static char ID;
+
explicit BranchFolderPass(): MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -82,12 +102,13 @@ namespace {
MachineFunctionPass::getAnalysisUsage(AU);
}
};
-}
+
+} // end anonymous namespace
char BranchFolderPass::ID = 0;
char &llvm::BranchFolderPassID = BranchFolderPass::ID;
-INITIALIZE_PASS(BranchFolderPass, "branch-folder",
+INITIALIZE_PASS(BranchFolderPass, DEBUG_TYPE,
"Control Flow Optimizer", false, false)
bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
@@ -123,8 +144,6 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
}
}
-/// RemoveDeadBlock - Remove the specified dead machine basic block from the
-/// function, updating the CFG.
void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
assert(MBB->pred_empty() && "MBB must be dead!");
DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
@@ -144,9 +163,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
MLI->removeBlock(MBB);
}
-/// OptimizeFunction - Perhaps branch folding, tail merging and other
-/// CFG optimizations on the given function. Block placement changes the layout
-/// and may create new tail merging opportunities.
bool BranchFolder::OptimizeFunction(MachineFunction &MF,
const TargetInstrInfo *tii,
const TargetRegisterInfo *tri,
@@ -156,13 +172,14 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
TriedMerging.clear();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
AfterBlockPlacement = AfterPlacement;
TII = tii;
TRI = tri;
MMI = mmi;
MLI = mli;
+ this->MRI = &MRI;
- MachineRegisterInfo &MRI = MF.getRegInfo();
UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
if (!UpdateLiveIns)
MRI.invalidateLiveness();
@@ -348,23 +365,18 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
return TailLen;
}
-/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
-/// after it, replacing it with an unconditional branch to NewDest.
void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
MachineBasicBlock *NewDest) {
TII->ReplaceTailWithBranchTo(OldInst, NewDest);
if (UpdateLiveIns) {
NewDest->clearLiveIns();
- computeLiveIns(LiveRegs, *TRI, *NewDest);
+ computeLiveIns(LiveRegs, *MRI, *NewDest);
}
++NumTailMerge;
}
-/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
-/// MBB so that the part before the iterator falls into the part starting at the
-/// iterator. This returns the new MBB.
MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
MachineBasicBlock::iterator BBI1,
const BasicBlock *BB) {
@@ -375,7 +387,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
// Create the fall-through block.
MachineFunction::iterator MBBI = CurMBB.getIterator();
- MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(BB);
+ MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(BB);
CurMBB.getParent()->insert(++MBBI, NewMBB);
// Move all the successors of this block to the specified block.
@@ -388,7 +400,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
// NewMBB belongs to the same loop as CurMBB.
- if (MLI)
+ if (MLI)
if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
ML->addBasicBlockToLoop(NewMBB, MLI->getBase());
@@ -396,7 +408,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB));
if (UpdateLiveIns)
- computeLiveIns(LiveRegs, *TRI, *NewMBB);
+ computeLiveIns(LiveRegs, *MRI, *NewMBB);
// Add the new block to the funclet.
const auto &FuncletI = FuncletMembership.find(&CurMBB);
@@ -436,7 +448,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- DebugLoc dl; // FIXME: this is nowhere
+ DebugLoc dl = CurMBB->findBranchDebugLoc();
if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
MachineBasicBlock *NextBB = &*I;
if (TBB == NextBB && !Cond.empty() && !FBB) {
@@ -497,6 +509,15 @@ BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
return MBFI.printBlockFreq(OS, Freq);
}
+void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) {
+ MBFI.view(Name, isSimple);
+}
+
+uint64_t
+BranchFolder::MBFIWrapper::getEntryFreq() const {
+ return MBFI.getEntryFreq();
+}
+
/// CountTerminators - Count the number of terminators in the given
/// block and set I to the position of the first non-terminator, if there
/// is one, or MBB->end() otherwise.
@@ -504,7 +525,7 @@ static unsigned CountTerminators(MachineBasicBlock *MBB,
MachineBasicBlock::iterator &I) {
I = MBB->end();
unsigned NumTerms = 0;
- for (;;) {
+ while (true) {
if (I == MBB->begin()) {
I = MBB->end();
break;
@@ -516,6 +537,17 @@ static unsigned CountTerminators(MachineBasicBlock *MBB,
return NumTerms;
}
+/// A no-successor, non-return block probably ends in unreachable and is cold.
+/// Also consider a block that ends in an indirect branch to be a return block,
+/// since many targets use plain indirect branches to return.
+static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) {
+ if (!MBB->succ_empty())
+ return false;
+ if (MBB->empty())
+ return true;
+ return !(MBB->back().isReturn() || MBB->back().isIndirectBranch());
+}
+
/// ProfitableToMerge - Check if two machine basic blocks have a common tail
/// and decide if it would be profitable to merge those tails. Return the
/// length of the common tail and iterators to the first common instruction
@@ -570,6 +602,15 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
return true;
}
+ // If these are identical non-return blocks with no successors, merge them.
+ // Such blocks are typically cold calls to noreturn functions like abort, and
+ // are unlikely to become a fallthrough target after machine block placement.
+ // Tail merging these blocks is unlikely to create additional unconditional
+ // branches, and will reduce the size of this cold code.
+ if (I1 == MBB1->begin() && I2 == MBB2->begin() &&
+ blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2))
+ return true;
+
// If one of the blocks can be completely merged and happens to be in
// a position where the other could fall through into it, merge any number
// of instructions, because it can be done without a branch.
@@ -579,6 +620,22 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
return true;
+ // If both blocks are identical and end in a branch, merge them unless they
+ // both have a fallthrough predecessor and successor.
+ // We can only do this after block placement because it depends on whether
+ // there are fallthroughs, and we don't know until after layout.
+ if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) {
+ auto BothFallThrough = [](MachineBasicBlock *MBB) {
+ if (MBB->succ_size() != 0 && !MBB->canFallThrough())
+ return false;
+ MachineFunction::iterator I(MBB);
+ MachineFunction *MF = MBB->getParent();
+ return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough();
+ };
+ if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2))
+ return true;
+ }
+
// If both blocks have an unconditional branch temporarily stripped out,
// count that as an additional common instruction for the following
// heuristics. This heuristic is only accurate for single-succ blocks, so to
@@ -604,16 +661,6 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
(I1 == MBB1->begin() || I2 == MBB2->begin());
}
-/// ComputeSameTails - Look through all the blocks in MergePotentials that have
-/// hash CurHash (guaranteed to match the last element). Build the vector
-/// SameTails of all those that have the (same) largest number of instructions
-/// in common of any pair of these blocks. SameTails entries contain an
-/// iterator into MergePotentials (from which the MachineBasicBlock can be
-/// found) and a MachineBasicBlock::iterator into that MBB indicating the
-/// instruction where the matching code sequence begins.
-/// Order of elements in SameTails is the reverse of the order in which
-/// those blocks appear in MergePotentials (where they are not necessarily
-/// consecutive).
unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
unsigned MinCommonTailLength,
MachineBasicBlock *SuccBB,
@@ -650,8 +697,6 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
return maxCommonTailLength;
}
-/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
-/// MergePotentials, restoring branches at ends of blocks as appropriate.
void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB) {
@@ -671,8 +716,6 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
MergePotentials.erase(CurMPIter, MergePotentials.end());
}
-/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
-/// only of the common tail. Create a block that does by splitting one.
bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
MachineBasicBlock *SuccBB,
unsigned maxCommonTailLength,
@@ -723,6 +766,43 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
return true;
}
+void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) {
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size());
+  for (unsigned i = 0; i != SameTails.size(); ++i) {
+ if (i != commonTailIndex)
+ NextCommonInsts[i] = SameTails[i].getTailStartPos();
+ else {
+ assert(SameTails[i].getTailStartPos() == MBB->begin() &&
+ "MBB is not a common tail only block");
+ }
+ }
+
+ for (auto &MI : *MBB) {
+ if (MI.isDebugValue())
+ continue;
+ DebugLoc DL = MI.getDebugLoc();
+    for (unsigned i = 0; i < NextCommonInsts.size(); ++i) {
+ if (i == commonTailIndex)
+ continue;
+
+ auto &Pos = NextCommonInsts[i];
+ assert(Pos != SameTails[i].getBlock()->end() &&
+ "Reached BB end within common tail");
+ while (Pos->isDebugValue()) {
+ ++Pos;
+ assert(Pos != SameTails[i].getBlock()->end() &&
+ "Reached BB end within common tail");
+ }
+ assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!");
+ DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc());
+ NextCommonInsts[i] = ++Pos;
+ }
+ MI.setDebugLoc(DL);
+ }
+}
+
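
The key API in the function above is DILocation::getMergedLocation, called
exactly as in the patch. A sketch of the semantics this code relies on
(behavior summarized, not quoted from the API docs): identical locations
survive the merge unchanged, while differing locations are conservatively
degraded rather than arbitrarily keeping one side, so the merged tail never
reports a line it did not come from.

    // DL and Pos->getDebugLoc() come from instructions already proven
    // identical by MI.isIdenticalTo(*Pos):
    DebugLoc Merged = DILocation::getMergedLocation(DL, Pos->getDebugLoc());
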
static void
mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
MachineBasicBlock &MBBCommon) {
@@ -875,10 +955,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
// Recompute common tail MBB's edge weights and block frequency.
setCommonTailEdgeWeights(*MBB);
- // Remove the original debug location from the common tail.
- for (auto &MI : *MBB)
- if (!MI.isDebugValue())
- MI.setDebugLoc(DebugLoc());
+ // Merge debug locations across identical instructions for common tail.
+ MergeCommonTailDebugLocs(commonTailIndex);
// MBB is common tail. Adjust all other BB's to jump to this one.
// Traversal must be forwards so erases work.
@@ -1043,7 +1121,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
// Remove the unconditional branch at the end, if any.
if (TBB && (Cond.empty() || FBB)) {
- DebugLoc dl; // FIXME: this is nowhere
+ DebugLoc dl = PBB->findBranchDebugLoc();
TII->removeBranch(*PBB);
if (!Cond.empty())
// reinsert conditional branch only, for now
@@ -1193,8 +1271,6 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
return DebugLoc();
}
-/// OptimizeBlock - Analyze and optimize control flow related to the specified
-/// block. This is never called on the entry block.
bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
bool MadeChange = false;
MachineFunction &MF = *MBB->getParent();
@@ -1386,6 +1462,43 @@ ReoptimizeBlock:
}
}
+ if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 &&
+ MF.getFunction()->optForSize()) {
+ // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
+ // direction, thereby defeating careful block placement and regressing
+ // performance. Therefore, only consider this for optsize functions.
+ MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
+ if (TII->isUnconditionalTailCall(TailCall)) {
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ bool PredAnalyzable =
+ !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
+
+ if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB &&
+ PredTBB != PredFBB) {
+ // The predecessor has a conditional branch to this block which consists
+ // of only a tail call. Try to fold the tail call into the conditional
+ // branch.
+ if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
+ // TODO: It would be nice if analyzeBranch() could provide a pointer
+ // to the branch instruction so replaceBranchWithTailCall() doesn't
+ // have to search for it.
+ TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
+ ++NumTailCalls;
+ Pred->removeSuccessor(MBB);
+ MadeChange = true;
+ return MadeChange;
+ }
+ }
+ // If the predecessor is falling through to this block, we could reverse
+ // the branch condition and fold the tail call into that. However, after
+ // that we might have to re-arrange the CFG to fall through to the other
+ // block and there is a high risk of regressing code size rather than
+ // improving it.
+ }
+ }
+
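
In pseudo-assembly the rewrite performed by the block above is roughly the
following (mnemonics are illustrative, not a real target's):

    // Before:                          After:
    //   Pred:  Jcc MBB                   Pred:  Jcc-tail-call bar
    //          ...                              ...
    //   MBB:   tail-call bar            (Pred no longer lists MBB as a
    //                                     successor; MBB becomes dead)
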
// Analyze the branch in the current block.
MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
SmallVector<MachineOperand, 4> CurCond;
@@ -1508,7 +1621,6 @@ ReoptimizeBlock:
// block doesn't fall through into some other block, see if we can find a
// place to move this block where a fall-through will happen.
if (!PrevBB.canFallThrough()) {
-
// Now we know that there was no fall-through into this block, check to
// see if it has a fall-through into its successor.
bool CurFallsThru = MBB->canFallThrough();
@@ -1599,8 +1711,6 @@ ReoptimizeBlock:
// Hoist Common Code
//===----------------------------------------------------------------------===//
-/// HoistCommonCode - Hoist common instruction sequences at the start of basic
-/// blocks to their common predecessor.
bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
bool MadeChange = false;
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
@@ -1734,9 +1844,6 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
return PI;
}
-/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction
-/// sequence at the start of the function, move the instructions before MBB
-/// terminator if it's legal.
bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
@@ -1763,8 +1870,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
return false;
bool HasDups = false;
- SmallVector<unsigned, 4> LocalDefs;
- SmallSet<unsigned, 4> LocalDefsSet;
+ SmallVector<unsigned, 4> LocalDefs, LocalKills;
+ SmallSet<unsigned, 4> ActiveDefsSet, AllDefsSet;
MachineBasicBlock::iterator TIB = TBB->begin();
MachineBasicBlock::iterator FIB = FBB->begin();
MachineBasicBlock::iterator TIE = TBB->end();
@@ -1818,7 +1925,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
IsSafe = false;
break;
}
- } else if (!LocalDefsSet.count(Reg)) {
+ } else if (!ActiveDefsSet.count(Reg)) {
if (Defs.count(Reg)) {
// Use is defined by the instruction at the point of insertion.
IsSafe = false;
@@ -1838,18 +1945,22 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
break;
- // Remove kills from LocalDefsSet, these registers had short live ranges.
+ // Remove kills from ActiveDefsSet, these registers had short live ranges.
for (const MachineOperand &MO : TIB->operands()) {
if (!MO.isReg() || !MO.isUse() || !MO.isKill())
continue;
unsigned Reg = MO.getReg();
- if (!Reg || !LocalDefsSet.count(Reg))
+ if (!Reg)
continue;
+ if (!AllDefsSet.count(Reg)) {
+ LocalKills.push_back(Reg);
+ continue;
+ }
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- LocalDefsSet.erase(*AI);
+ ActiveDefsSet.erase(*AI);
} else {
- LocalDefsSet.erase(Reg);
+ ActiveDefsSet.erase(Reg);
}
}
@@ -1861,7 +1972,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
continue;
LocalDefs.push_back(Reg);
- addRegAndItsAliases(Reg, TRI, LocalDefsSet);
+ addRegAndItsAliases(Reg, TRI, ActiveDefsSet);
+ addRegAndItsAliases(Reg, TRI, AllDefsSet);
}
HasDups = true;
@@ -1876,17 +1988,22 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
FBB->erase(FBB->begin(), FIB);
// Update livein's.
- bool AddedLiveIns = false;
+ bool ChangedLiveIns = false;
for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
unsigned Def = LocalDefs[i];
- if (LocalDefsSet.count(Def)) {
+ if (ActiveDefsSet.count(Def)) {
TBB->addLiveIn(Def);
FBB->addLiveIn(Def);
- AddedLiveIns = true;
+ ChangedLiveIns = true;
}
}
+ for (unsigned K : LocalKills) {
+ TBB->removeLiveIn(K);
+ FBB->removeLiveIn(K);
+ ChangedLiveIns = true;
+ }
- if (AddedLiveIns) {
+ if (ChangedLiveIns) {
TBB->sortUniqueLiveIns();
FBB->sortUniqueLiveIns();
}
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h
index fc48e48..9268113 100644
--- a/contrib/llvm/lib/CodeGen/BranchFolding.h
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.h
@@ -37,6 +37,9 @@ namespace llvm {
// flag. Ignored for optsize.
unsigned MinCommonTailLength = 0);
+ /// Perform branch folding, tail merging and other CFG optimizations on the
+ /// given function. Block placement changes the layout and may create new
+ /// tail merging opportunities.
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,
const TargetRegisterInfo *tri, MachineModuleInfo *mmi,
MachineLoopInfo *mli = nullptr,
@@ -105,6 +108,7 @@ namespace llvm {
bool UpdateLiveIns;
unsigned MinCommonTailLength;
const TargetInstrInfo *TII;
+ const MachineRegisterInfo *MRI;
const TargetRegisterInfo *TRI;
MachineModuleInfo *MMI;
MachineLoopInfo *MLI;
@@ -122,6 +126,8 @@ namespace llvm {
const MachineBasicBlock *MBB) const;
raw_ostream &printBlockFreq(raw_ostream &OS,
const BlockFrequency Freq) const;
+ void view(const Twine &Name, bool isSimple = true);
+ uint64_t getEntryFreq() const;
private:
const MachineBlockFrequencyInfo &MBFI;
@@ -137,26 +143,64 @@ namespace llvm {
MachineBasicBlock* PredBB,
unsigned MinCommonTailLength);
void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB);
+
+ /// Delete the instruction OldInst and everything after it, replacing it
+ /// with an unconditional branch to NewDest.
void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
MachineBasicBlock *NewDest);
+
+ /// Given a machine basic block and an iterator into it, split the MBB so
+ /// that the part before the iterator falls through into the part starting
+ /// at the iterator. This returns the new MBB.
MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
MachineBasicBlock::iterator BBI1,
const BasicBlock *BB);
+
+ /// Look through all the blocks in MergePotentials that have hash CurHash
+ /// (guaranteed to match the last element). Build the vector SameTails of
+ /// all those that have the (same) largest number of instructions in common
+ /// of any pair of these blocks. SameTails entries contain an iterator into
+ /// MergePotentials (from which the MachineBasicBlock can be found) and a
+ /// MachineBasicBlock::iterator into that MBB indicating the instruction
+ /// where the matching code sequence begins. Order of elements in SameTails
+ /// is the reverse of the order in which those blocks appear in
+ /// MergePotentials (where they are not necessarily consecutive).
unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB);
+
+ /// Remove all blocks with hash CurHash from MergePotentials, restoring
+ /// branches at ends of blocks as appropriate.
void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
MachineBasicBlock* PredBB);
+
+ /// None of the blocks to be tail-merged consist only of the common tail.
+ /// Create a block that does by splitting one.
bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
MachineBasicBlock *SuccBB,
unsigned maxCommonTailLength,
unsigned &commonTailIndex);
+ /// Create merged DebugLocs of identical instructions across SameTails and
+ /// assign them to the instructions in the common tail.
+ void MergeCommonTailDebugLocs(unsigned commonTailIndex);
+
bool OptimizeBranches(MachineFunction &MF);
+
+ /// Analyze and optimize control flow related to the specified block. This
+ /// is never called on the entry block.
bool OptimizeBlock(MachineBasicBlock *MBB);
+
+ /// Remove the specified dead machine basic block from the function,
+ /// updating the CFG.
void RemoveDeadBlock(MachineBasicBlock *MBB);
+ /// Hoist common instruction sequences at the start of basic blocks to their
+ /// common predecessor.
bool HoistCommonCode(MachineFunction &MF);
+
+ /// If the successors of MBB have a common instruction sequence at their
+ /// start, move those instructions before the MBB terminator if it's legal.
bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
};
}
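
The ComputeSameTails comment above describes a two-step scan: bucket candidate
blocks by a hash of their trailing instructions, then compare only same-hash
candidates instruction by instruction. A standalone sketch of that idea over
strings; tailHash and commonTailLen are hypothetical helpers, and real
MergePotentials entries carry hashes and block pointers rather than strings.

    #include <algorithm>
    #include <functional>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Hash the last K "instructions" (characters here) of a block.
    static size_t tailHash(const std::string &B, size_t K) {
      return std::hash<std::string>{}(
          B.substr(B.size() - std::min(K, B.size())));
    }

    // Length of the common suffix of two blocks; only run on hash matches.
    static size_t commonTailLen(const std::string &A, const std::string &B) {
      size_t N = 0;
      while (N < A.size() && N < B.size() &&
             A[A.size() - 1 - N] == B[B.size() - 1 - N])
        ++N;
      return N;
    }

    int main() {
      std::vector<std::string> MergePotentials = {"xyzabc", "qqabc", "zzz"};
      std::map<size_t, std::vector<std::string>> Buckets;
      for (const auto &B : MergePotentials)
        Buckets[tailHash(B, 3)].push_back(B); // cheap filter first
      for (const auto &KV : Buckets)
        for (size_t I = 0; I + 1 < KV.second.size(); ++I)
          std::cout << commonTailLen(KV.second[I], KV.second[I + 1])
                    << " instructions in common\n"; // exact check second
    }
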
diff --git a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp
index 8b27570..27ee12c 100644
--- a/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/contrib/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -7,17 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -126,14 +126,16 @@ void BranchRelaxation::verify() {
#endif
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
-void BranchRelaxation::dumpBBs() {
+LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() {
for (auto &MBB : *MF) {
const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
<< format("size=%#x\n", BBI.Size);
}
}
+#endif
/// scanFunction - Do the initial scan of the function, building up
/// information about each block.
@@ -257,7 +259,7 @@ MachineBasicBlock *BranchRelaxation::splitBlockBeforeInstr(MachineInstr &MI,
// Need to fix live-in lists if we track liveness.
if (TRI->trackLivenessAfterRegAlloc(*MF))
- computeLiveIns(LiveRegs, *TRI, *NewBB);
+ computeLiveIns(LiveRegs, MF->getRegInfo(), *NewBB);
++NumSplit;
@@ -343,6 +345,10 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) {
// Do it here since if there's no split, no update is needed.
MBB->replaceSuccessor(FBB, &NewBB);
NewBB.addSuccessor(FBB);
+
+ // Need to fix live-in lists if we track liveness.
+ if (TRI->trackLivenessAfterRegAlloc(*MF))
+ computeLiveIns(LiveRegs, MF->getRegInfo(), NewBB);
}
// We now have an appropriate fall-through block in place (either naturally or
diff --git a/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp
index ff7c99d..abac555 100644
--- a/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp
+++ b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp
@@ -1,4 +1,4 @@
-//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===//
+//===- BuiltinGCs.cpp - Boilerplate for our built in GC types -------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/GCs.h"
#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Casting.h"
using namespace llvm;
@@ -77,6 +79,7 @@ public:
UsesMetadata = false;
CustomRoots = false;
}
+
Optional<bool> isGCManagedPointer(const Type *Ty) const override {
// Method is only valid on pointer typed values.
const PointerType *PT = cast<PointerType>(Ty);
@@ -110,6 +113,7 @@ public:
UsesMetadata = false;
CustomRoots = false;
}
+
Optional<bool> isGCManagedPointer(const Type *Ty) const override {
// Method is only valid on pointer typed values.
const PointerType *PT = cast<PointerType>(Ty);
@@ -117,7 +121,8 @@ public:
return (1 == PT->getAddressSpace());
}
};
-}
+
+} // end anonymous namespace
// Register all the above so that they can be found at runtime. Note that
// these static initializers are important since the registration list is
diff --git a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
index dc2d38a..c2ced19 100644
--- a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -7,13 +7,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
diff --git a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp
index 2e33f14..7cad4d0 100644
--- a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp
+++ b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -30,8 +30,7 @@ using namespace llvm;
CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
: CallingConv(CC), IsVarArg(isVarArg), MF(mf),
- TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C),
- CallOrPrologue(Unknown) {
+ TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) {
// No stack is used.
StackOffset = 0;
MaxStackArgAlign = 1;
diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp
index 4cf9b13..b7fd45a 100644
--- a/contrib/llvm/lib/CodeGen/CodeGen.cpp
+++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/InitializePasses.h"
#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"
using namespace llvm;
@@ -21,6 +21,7 @@ using namespace llvm;
/// initializeCodeGen - Initialize all passes linked into the CodeGen library.
void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeAtomicExpandPass(Registry);
+ initializeBranchCoalescingPass(Registry);
initializeBranchFolderPassPass(Registry);
initializeBranchRelaxationPass(Registry);
initializeCodeGenPreparePass(Registry);
@@ -31,14 +32,18 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeEarlyIfConverterPass(Registry);
initializeExpandISelPseudosPass(Registry);
initializeExpandPostRAPass(Registry);
+ initializeFEntryInserterPass(Registry);
initializeFinalizeMachineBundlesPass(Registry);
initializeFuncletLayoutPass(Registry);
initializeGCMachineCodeAnalysisPass(Registry);
initializeGCModuleInfoPass(Registry);
initializeIfConverterPass(Registry);
+ initializeImplicitNullChecksPass(Registry);
initializeInterleavedAccessPass(Registry);
+ initializeLiveDebugValuesPass(Registry);
initializeLiveDebugVariablesPass(Registry);
initializeLiveIntervalsPass(Registry);
+ initializeLiveRangeShrinkPass(Registry);
initializeLiveStacksPass(Registry);
initializeLiveVariablesPass(Registry);
initializeLocalStackSlotPassPass(Registry);
@@ -47,7 +52,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineBlockPlacementPass(Registry);
initializeMachineBlockPlacementStatsPass(Registry);
initializeMachineCSEPass(Registry);
- initializeImplicitNullChecksPass(Registry);
initializeMachineCombinerPass(Registry);
initializeMachineCopyPropagationPass(Registry);
initializeMachineDominatorTreePass(Registry);
@@ -55,31 +59,35 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoPass(Registry);
initializeMachineModuleInfoPass(Registry);
+ initializeMachineOptimizationRemarkEmitterPassPass(Registry);
+ initializeMachineOutlinerPass(Registry);
initializeMachinePipelinerPass(Registry);
initializeMachinePostDominatorTreePass(Registry);
+ initializeMachineRegionInfoPassPass(Registry);
initializeMachineSchedulerPass(Registry);
initializeMachineSinkingPass(Registry);
initializeMachineVerifierPassPass(Registry);
- initializeXRayInstrumentationPass(Registry);
- initializePatchableFunctionPass(Registry);
initializeOptimizePHIsPass(Registry);
initializePEIPass(Registry);
initializePHIEliminationPass(Registry);
+ initializePatchableFunctionPass(Registry);
initializePeepholeOptimizerPass(Registry);
initializePostMachineSchedulerPass(Registry);
initializePostRAHazardRecognizerPass(Registry);
initializePostRASchedulerPass(Registry);
initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
initializeProcessImplicitDefsPass(Registry);
+ initializeRABasicPass(Registry);
+ initializeRAFastPass(Registry);
initializeRAGreedyPass(Registry);
initializeRegisterCoalescerPass(Registry);
initializeRenameIndependentSubregsPass(Registry);
+ initializeSafeStackLegacyPassPass(Registry);
+ initializeScalarizeMaskedMemIntrinPass(Registry);
initializeShrinkWrapPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackColoringPass(Registry);
initializeStackMapLivenessPass(Registry);
- initializeLiveDebugValuesPass(Registry);
- initializeSafeStackPass(Registry);
initializeStackProtectorPass(Registry);
initializeStackSlotColoringPass(Registry);
initializeTailDuplicatePassPass(Registry);
@@ -91,6 +99,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeVirtRegMapPass(Registry);
initializeVirtRegRewriterPass(Registry);
initializeWinEHPreparePass(Registry);
+ initializeXRayInstrumentationPass(Registry);
}
void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 934b470..dc02a00 100644
--- a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -13,20 +13,23 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -53,8 +56,11 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -77,9 +83,14 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+ "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
static cl::opt<bool> DisableBranchOpts(
"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -93,7 +104,7 @@ static cl::opt<bool> DisableSelectToBranch(
cl::desc("Disable select to branch conversion."));
static cl::opt<bool> AddrSinkUsingGEPs(
- "addr-sink-using-gep", cl::Hidden, cl::init(false),
+ "addr-sink-using-gep", cl::Hidden, cl::init(true),
cl::desc("Address sinking in CGP using GEPs."));
static cl::opt<bool> EnableAndCmpSinking(
@@ -123,7 +134,7 @@ static cl::opt<bool> DisablePreheaderProtect(
cl::desc("Disable protection against removing loop preheaders"));
static cl::opt<bool> ProfileGuidedSectionPrefix(
- "profile-guided-section-prefix", cl::Hidden, cl::init(true),
+ "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
cl::desc("Use profile info to add section prefix for hot/cold functions"));
static cl::opt<unsigned> FreqRatioToSkipMerge(
@@ -135,15 +146,29 @@ static cl::opt<bool> ForceSplitStore(
"force-split-store", cl::Hidden, cl::init(false),
cl::desc("Force store splitting no matter what the target query says."));
+static cl::opt<bool>
+EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
+ cl::desc("Enable merging of redundant sexts when one is dominating"
+ " the other."), cl::init(true));
+
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+ "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+ cl::desc("The number of loads per basic block for inline expansion of "
+ "memcmp that is only being compared against zero."));
+
namespace {
typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
+typedef SmallVector<Instruction *, 16> SExts;
+typedef DenseMap<Value *, SExts> ValueToSExts;
class TypePromotionTransaction;
class CodeGenPrepare : public FunctionPass {
const TargetMachine *TM;
+ const TargetSubtargetInfo *SubtargetInfo;
const TargetLowering *TLI;
+ const TargetRegisterInfo *TRI;
const TargetTransformInfo *TTI;
const TargetLibraryInfo *TLInfo;
const LoopInfo *LI;
@@ -165,6 +190,15 @@ class TypePromotionTransaction;
/// promotion for the current function.
InstrToOrigTy PromotedInsts;
+ /// Keep track of instructions removed during promotion.
+ SetOfInstrs RemovedInsts;
+
+ /// Keep track of sext chains based on their initial value.
+ DenseMap<Value *, Instruction *> SeenChainsForSExt;
+
+ /// Keep track of SExt promoted.
+ ValueToSExts ValToSExtendedUses;
+
/// True if CFG is modified in any way.
bool ModifiedDT;
@@ -176,10 +210,11 @@ class TypePromotionTransaction;
public:
static char ID; // Pass identification, replacement for typeid
- explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) {
- initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
- }
+ CodeGenPrepare()
+ : FunctionPass(ID), TM(nullptr), TLI(nullptr), TTI(nullptr),
+ DL(nullptr) {
+ initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
+ }
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return "CodeGen Prepare"; }
@@ -200,13 +235,13 @@ class TypePromotionTransaction;
void eliminateMostlyEmptyBlock(BasicBlock *BB);
bool isMergingEmptyBlockProfitable(BasicBlock *BB, BasicBlock *DestBB,
bool isPreheader);
- bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT);
- bool optimizeInst(Instruction *I, bool& ModifiedDT);
+ bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+ bool optimizeInst(Instruction *I, bool &ModifiedDT);
bool optimizeMemoryInst(Instruction *I, Value *Addr,
Type *AccessTy, unsigned AS);
bool optimizeInlineAsmInst(CallInst *CS);
- bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
- bool moveExtToFormExtLoad(Instruction *&I);
+ bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+ bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *I);
bool optimizeSelectInst(SelectInst *SI);
@@ -215,26 +250,32 @@ class TypePromotionTransaction;
bool optimizeExtractElementInst(Instruction *Inst);
bool dupRetToEnableTailCallOpts(BasicBlock *BB);
bool placeDbgValues(Function &F);
- bool sinkAndCmp(Function &F);
- bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
- Instruction *&Inst,
- const SmallVectorImpl<Instruction *> &Exts,
- unsigned CreatedInstCost);
+ bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
+ LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
+ bool tryToPromoteExts(TypePromotionTransaction &TPT,
+ const SmallVectorImpl<Instruction *> &Exts,
+ SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+ unsigned CreatedInstsCost = 0);
+ bool mergeSExts(Function &F);
+ bool performAddressTypePromotion(
+ Instruction *&Inst,
+ bool AllowPromotionWithoutCommonHeader,
+ bool HasPromoted, TypePromotionTransaction &TPT,
+ SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
bool splitBranchCondition(Function &F);
bool simplifyOffsetableRelocate(Instruction &I);
+ bool splitIndirectCriticalEdges(Function &F);
};
}
char CodeGenPrepare::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(CodeGenPrepare, "codegenprepare",
- "Optimize for code generation", false, false)
+INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE,
+ "Optimize for code generation", false, false)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_TM_PASS_END(CodeGenPrepare, "codegenprepare",
- "Optimize for code generation", false, false)
+INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE,
+ "Optimize for code generation", false, false)
-FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
- return new CodeGenPrepare(TM);
-}
+FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); }
bool CodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))
@@ -250,8 +291,12 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
BPI.reset();
ModifiedDT = false;
- if (TM)
- TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+ TM = &TPC->getTM<TargetMachine>();
+ SubtargetInfo = TM->getSubtargetImpl(F);
+ TLI = SubtargetInfo->getTargetLowering();
+ TRI = SubtargetInfo->getRegisterInfo();
+ }
TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -260,10 +305,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (ProfileGuidedSectionPrefix) {
ProfileSummaryInfo *PSI =
getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- if (PSI->isFunctionEntryHot(&F))
+ if (PSI->isFunctionHotInCallGraph(&F))
F.setSectionPrefix(".hot");
- else if (PSI->isFunctionEntryCold(&F))
- F.setSectionPrefix(".cold");
+ else if (PSI->isFunctionColdInCallGraph(&F))
+ F.setSectionPrefix(".unlikely");
}
/// This optimization identifies DIV instructions that can be
@@ -290,18 +335,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// find a node corresponding to the value.
EverMadeChange |= placeDbgValues(F);
- // If there is a mask, compare against zero, and branch that can be combined
- // into a single target instruction, push the mask and compare into branch
- // users. Do this before OptimizeBlock -> OptimizeInst ->
- // OptimizeCmpExpression, which perturbs the pattern being searched for.
- if (!DisableBranchOpts) {
- EverMadeChange |= sinkAndCmp(F);
+ if (!DisableBranchOpts)
EverMadeChange |= splitBranchCondition(F);
- }
+
+ // Split some critical edges where one of the sources is an indirect branch,
+ // to help generate sane code for PHIs involving such edges.
+ EverMadeChange |= splitIndirectCriticalEdges(F);
bool MadeChange = true;
while (MadeChange) {
MadeChange = false;
+ SeenChainsForSExt.clear();
+ ValToSExtendedUses.clear();
+ RemovedInsts.clear();
for (Function::iterator I = F.begin(); I != F.end(); ) {
BasicBlock *BB = &*I++;
bool ModifiedDTOnIteration = false;
@@ -311,6 +357,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
if (ModifiedDTOnIteration)
break;
}
+ if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
+ MadeChange |= mergeSExts(F);
+
+ // Really free removed instructions during promotion.
+ for (Instruction *I : RemovedInsts)
+ I->deleteValue();
+
EverMadeChange |= MadeChange;
}
@@ -432,6 +485,160 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
return DestBB;
}
+// Return the unique indirectbr predecessor of a block. This may return null
+// even if such a predecessor exists, if it's not useful for splitting.
+// If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
+// predecessors of BB.
+static BasicBlock *
+findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
+ // If the block doesn't have any PHIs, we don't care about it, since there's
+ // no point in splitting it.
+ PHINode *PN = dyn_cast<PHINode>(BB->begin());
+ if (!PN)
+ return nullptr;
+
+ // Verify we have exactly one IBR predecessor.
+ // Conservatively bail out if one of the other predecessors is not a "regular"
+ // terminator (that is, not a switch or a br).
+ BasicBlock *IBB = nullptr;
+ for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
+ BasicBlock *PredBB = PN->getIncomingBlock(Pred);
+ TerminatorInst *PredTerm = PredBB->getTerminator();
+ switch (PredTerm->getOpcode()) {
+ case Instruction::IndirectBr:
+ if (IBB)
+ return nullptr;
+ IBB = PredBB;
+ break;
+ case Instruction::Br:
+ case Instruction::Switch:
+ OtherPreds.push_back(PredBB);
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ return IBB;
+}
+
+// Split critical edges where the source of the edge is an indirectbr
+// instruction. This isn't always possible, but we can handle some easy cases.
+// This is useful because MI is unable to split such critical edges,
+// which means it will not be able to sink instructions along those edges.
+// This is especially painful for indirect branches with many successors, where
+// we end up having to prepare all outgoing values in the origin block.
+//
+// Our normal algorithm for splitting critical edges requires us to update
+// the outgoing edges of the edge origin block, but for an indirectbr this
+// is hard, since it would require finding and updating the block addresses
+// the indirect branch uses. But if a block only has a single indirectbr
+// predecessor, with the others being regular branches, we can do it in a
+// different way.
+// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr.
+// We can split D into D0 and D1, where D0 contains only the PHIs from D,
+// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and
+// create the following structure:
+// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1
+bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) {
+ // Check whether the function has any indirectbrs, and collect which blocks
+ // they may jump to. Since most functions don't have indirect branches,
+ // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
+ SmallSetVector<BasicBlock *, 16> Targets;
+ for (auto &BB : F) {
+ auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
+ if (!IBI)
+ continue;
+
+ for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
+ Targets.insert(IBI->getSuccessor(Succ));
+ }
+
+ if (Targets.empty())
+ return false;
+
+ bool Changed = false;
+ for (BasicBlock *Target : Targets) {
+ SmallVector<BasicBlock *, 16> OtherPreds;
+ BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
+ // If we did not find an indirectbr, or the indirectbr is the only
+ // incoming edge, this isn't the kind of edge we're looking for.
+ if (!IBRPred || OtherPreds.empty())
+ continue;
+
+ // Don't even think about ehpads/landingpads.
+ Instruction *FirstNonPHI = Target->getFirstNonPHI();
+ if (FirstNonPHI->isEHPad() || Target->isLandingPad())
+ continue;
+
+ BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
+ // It's possible Target was its own successor through an indirectbr.
+ // In this case, the indirectbr now comes from BodyBlock.
+ if (IBRPred == Target)
+ IBRPred = BodyBlock;
+
+ // At this point Target only has PHIs, and BodyBlock has the rest of the
+ // block's body. Create a copy of Target that will be used by the "direct"
+ // preds.
+ ValueToValueMapTy VMap;
+ BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
+
+ for (BasicBlock *Pred : OtherPreds) {
+ // If the target loops back to itself, then the terminator of the split
+ // block needs to be updated.
+ if (Pred == Target)
+ BodyBlock->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
+ else
+ Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
+ }
+
+ // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
+ // they are clones, so the number of PHIs is the same.
+ // (a) Remove the edge coming from IBRPred from the "Direct" PHI
+ // (b) Leave that as the only edge in the "Indirect" PHI.
+ // (c) Merge the two in the body block.
+ BasicBlock::iterator Indirect = Target->begin(),
+ End = Target->getFirstNonPHI()->getIterator();
+ BasicBlock::iterator Direct = DirectSucc->begin();
+ BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
+
+ assert(&*End == Target->getTerminator() &&
+ "Block was expected to only contain PHIs");
+
+ while (Indirect != End) {
+ PHINode *DirPHI = cast<PHINode>(Direct);
+ PHINode *IndPHI = cast<PHINode>(Indirect);
+
+ // Now, clean up - the direct block shouldn't get the indirect value,
+ // and vice versa.
+ DirPHI->removeIncomingValue(IBRPred);
+ Direct++;
+
+ // Advance the pointer here, to avoid invalidation issues when the old
+ // PHI is erased.
+ Indirect++;
+
+ PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
+ NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
+ IBRPred);
+
+ // Create a PHI in the body block, to merge the direct and indirect
+ // predecessors.
+ PHINode *MergePHI =
+ PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
+ MergePHI->addIncoming(NewIndPHI, Target);
+ MergePHI->addIncoming(DirPHI, DirectSucc);
+
+ IndPHI->replaceAllUsesWith(MergePHI);
+ IndPHI->eraseFromParent();
+ }
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
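
The transformation reads naturally as pure edge rewiring; a toy model of the
A/B/I -> D example from the comment above, where block names follow that
comment and the adjacency map only models CFG edges, not LLVM's representation:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Rewire the toy CFG: A -> D, B -> D, I -> D, where only I ends in an
    // indirectbr. D is split into a PHI-only part and a body D1; the PHI
    // part is cloned so the indirectbr keeps its original target (D0B)
    // while the direct preds branch to the clone (D0A).
    int main() {
      std::map<std::string, std::vector<std::string>> Succs = {
          {"A", {"D"}}, {"B", {"D"}}, {"I", {"D"}}};

      for (auto &KV : Succs)
        for (auto &S : KV.second)
          if (S == "D")
            S = (KV.first == "I") ? "D0B" : "D0A";
      Succs["D0A"] = {"D1"}; // cloned PHI block for the direct preds
      Succs["D0B"] = {"D1"}; // original PHI block, kept for the indirectbr

      for (const auto &KV : Succs)
        for (const auto &S : KV.second)
          std::cout << KV.first << " -> " << S << "\n";
    }
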
/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
/// edges in ways that are non-optimal for isel. Start by eliminating these
@@ -1090,6 +1297,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
return false;
}
+/// Duplicate and sink the given 'and' instruction into user blocks where it is
+/// used in a compare to allow isel to generate better code for targets where
+/// this operation can be combined.
+///
+/// Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI,
+ const TargetLowering &TLI,
+ SetOfInstrs &InsertedInsts) {
+ // Double-check that we're not trying to optimize an instruction that was
+ // already optimized by some other part of this pass.
+ assert(!InsertedInsts.count(AndI) &&
+ "Attempting to optimize already optimized and instruction");
+ (void) InsertedInsts;
+
+ // Nothing to do for single use in same basic block.
+ if (AndI->hasOneUse() &&
+ AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+ return false;
+
+ // Try to avoid cases where sinking/duplicating is likely to increase register
+ // pressure.
+ if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+ !isa<ConstantInt>(AndI->getOperand(1)) &&
+ AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+ return false;
+
+ for (auto *U : AndI->users()) {
+ Instruction *User = cast<Instruction>(U);
+
+ // Only sink for and mask feeding icmp with 0.
+ if (!isa<ICmpInst>(User))
+ return false;
+
+ auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+ if (!CmpC || !CmpC->isZero())
+ return false;
+ }
+
+ if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+ return false;
+
+ DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+ DEBUG(AndI->getParent()->dump());
+
+ // Push the 'and' into the same block as the icmp 0. There should only be
+ // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+ // others, so we don't need to keep track of which BBs we insert into.
+ for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+ UI != E; ) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+ // Keep the 'and' in the same place if the use is already in the same block.
+ Instruction *InsertPt =
+ User->getParent() == AndI->getParent() ? AndI : User;
+ Instruction *InsertedAnd =
+ BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+ AndI->getOperand(1), "", InsertPt);
+ // Propagate the debug info.
+ InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+ // Replace a use of the 'and' with a use of the new 'and'.
+ TheUse = InsertedAnd;
+ ++NumAndUses;
+ DEBUG(User->getParent()->dump());
+ }
+
+ // We removed all uses, nuke the and.
+ AndI->eraseFromParent();
+ return true;
+}
+
/// Check if the candidates could be combined with a shift instruction, which
/// includes:
/// 1. Truncate instruction
@@ -1278,519 +1562,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
return MadeChange;
}
-// Translate a masked load intrinsic like
-// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask, <16 x i32> %passthru)
-// to a chain of basic blocks, with loading element one-by-one if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.load, label %else
-//
-//cond.load: ; preds = %0
-// %4 = getelementptr i32* %1, i32 0
-// %5 = load i32* %4
-// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
-// br label %else
-//
-//else: ; preds = %0, %cond.load
-// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
-// %7 = extractelement <16 x i1> %mask, i32 1
-// %8 = icmp eq i1 %7, true
-// br i1 %8, label %cond.load1, label %else2
-//
-//cond.load1: ; preds = %else
-// %9 = getelementptr i32* %1, i32 1
-// %10 = load i32* %9
-// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
-// br label %else2
-//
-//else2: ; preds = %else, %cond.load1
-// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
-// %12 = extractelement <16 x i1> %mask, i32 2
-// %13 = icmp eq i1 %12, true
-// br i1 %13, label %cond.load4, label %else5
-//
-static void scalarizeMaskedLoad(CallInst *CI) {
- Value *Ptr = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
- assert(VecType && "Unexpected return type of masked load intrinsic");
-
- Type *EltTy = CI->getType()->getVectorElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
-
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr from i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_load = icmp eq i1 %mask_1, true
- // br i1 %to_load, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked store intrinsic, like
-// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
-// <16 x i1> %mask)
-// to a chain of basic blocks, that stores element one-by-one if
-// the appropriate mask bit is set
-//
-// %1 = bitcast i8* %addr to i32*
-// %2 = extractelement <16 x i1> %mask, i32 0
-// %3 = icmp eq i1 %2, true
-// br i1 %3, label %cond.store, label %else
-//
-// cond.store: ; preds = %0
-// %4 = extractelement <16 x i32> %val, i32 0
-// %5 = getelementptr i32* %1, i32 0
-// store i32 %4, i32* %5
-// br label %else
-//
-// else: ; preds = %0, %cond.store
-// %6 = extractelement <16 x i1> %mask, i32 1
-// %7 = icmp eq i1 %6, true
-// br i1 %7, label %cond.store1, label %else2
-//
-// cond.store1: ; preds = %else
-// %8 = extractelement <16 x i32> %val, i32 1
-// %9 = getelementptr i32* %1, i32 1
-// store i32 %8, i32* %9
-// br label %else2
-// . . .
-static void scalarizeMaskedStore(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptr = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = dyn_cast<VectorType>(Src->getType());
- assert(VecType && "Unexpected data type in masked store intrinsic");
-
- Type *EltTy = VecType->getElementType();
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- // Short-cut if the mask is all-true.
- bool IsAllOnesMask = isa<Constant>(Mask) &&
- cast<Constant>(Mask)->isAllOnesValue();
-
- if (IsAllOnesMask) {
- Builder.CreateAlignedStore(Src, Ptr, AlignVal);
- CI->eraseFromParent();
- return;
- }
-
- // Adjust alignment for the scalar instruction.
- AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8);
- // Bitcast %addr from i8* to EltTy*
- Type *NewPtrType =
- EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
- Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
-
- if (isa<ConstantVector>(Mask)) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
- // %to_store = icmp eq i1 %mask_1, true
- // br i1 %to_store, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1));
-
- // Create "cond" block
- //
- // %OneElt = extractelement <16 x i32> %Src, i32 Idx
- // %EltAddr = getelementptr i32* %1, i32 0
- // %store i32 %OneElt, i32* %EltAddr
- //
- BasicBlock *CondBlock =
- IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
- Value *Gep =
- Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock =
- CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
-// Translate a masked gather intrinsic like
-// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
-// <16 x i1> %Mask, <16 x i32> %Src)
-// to a chain of basic blocks, with loading element one-by-one if
-// the appropriate mask bit is set
-//
-// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> %Mask, i32 0
-// % ToLoad0 = icmp eq i1 % Mask0, true
-// br i1 % ToLoad0, label %cond.load, label %else
-//
-// cond.load:
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// % Load0 = load i32, i32* % Ptr0, align 4
-// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
-// br label %else
-//
-// else:
-// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
-// % Mask1 = extractelement <16 x i1> %Mask, i32 1
-// % ToLoad1 = icmp eq i1 % Mask1, true
-// br i1 % ToLoad1, label %cond.load1, label %else2
-//
-// cond.load1:
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// % Load1 = load i32, i32* % Ptr1, align 4
-// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
-// br label %else2
-// . . .
-// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
-// ret <16 x i32> %Result
-static void scalarizeMaskedGather(CallInst *CI) {
- Value *Ptrs = CI->getArgOperand(0);
- Value *Alignment = CI->getArgOperand(1);
- Value *Mask = CI->getArgOperand(2);
- Value *Src0 = CI->getArgOperand(3);
-
- VectorType *VecType = dyn_cast<VectorType>(CI->getType());
-
- assert(VecType && "Unexpected return type of masked load intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- BasicBlock *CondBlock = nullptr;
- BasicBlock *PrevIfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
-
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- Value *UndefVal = UndefValue::get(VecType);
-
- // The result vector
- Value *VResult = UndefVal;
- unsigned VectorWidth = VecType->getNumElements();
-
- // Shorten the way if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load,
- Builder.getInt32(Idx),
- "Res" + Twine(Idx));
- }
- Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
- return;
- }
-
- PHINode *Phi = nullptr;
- Value *PrevPhi = UndefVal;
-
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
-
- // Fill the "else" block, created in the previous iteration
- //
- // %Mask1 = extractelement <16 x i1> %Mask, i32 1
- // %ToLoad1 = icmp eq i1 %Mask1, true
- // br i1 %ToLoad1, label %cond.load, label %else
- //
- if (Idx > 0) {
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- PrevPhi = Phi;
- VResult = Phi;
- }
-
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToLoad" + Twine(Idx));
-
- // Create "cond" block
- //
- // %EltAddr = getelementptr i32* %1, i32 0
- // %Elt = load i32* %EltAddr
- // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
- //
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
- Builder.SetInsertPoint(InsertPt);
-
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
- "Load" + Twine(Idx));
- VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
- "Res" + Twine(Idx));
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- PrevIfBlock = IfBlock;
- IfBlock = NewIfBlock;
- }
-
- Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
- Phi->addIncoming(VResult, CondBlock);
- Phi->addIncoming(PrevPhi, PrevIfBlock);
- Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
- CI->replaceAllUsesWith(NewI);
- CI->eraseFromParent();
-}
-
-// Translate a masked scatter intrinsic, like
-// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
-// <16 x i1> %Mask)
-// to a chain of basic blocks, that stores element one-by-one if
-// the appropriate mask bit is set.
-//
-// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
-// % Mask0 = extractelement <16 x i1> % Mask, i32 0
-// % ToStore0 = icmp eq i1 % Mask0, true
-// br i1 %ToStore0, label %cond.store, label %else
-//
-// cond.store:
-// % Elt0 = extractelement <16 x i32> %Src, i32 0
-// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
-// store i32 %Elt0, i32* % Ptr0, align 4
-// br label %else
-//
-// else:
-// % Mask1 = extractelement <16 x i1> % Mask, i32 1
-// % ToStore1 = icmp eq i1 % Mask1, true
-// br i1 % ToStore1, label %cond.store1, label %else2
-//
-// cond.store1:
-// % Elt1 = extractelement <16 x i32> %Src, i32 1
-// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
-// store i32 % Elt1, i32* % Ptr1, align 4
-// br label %else2
-// . . .
-static void scalarizeMaskedScatter(CallInst *CI) {
- Value *Src = CI->getArgOperand(0);
- Value *Ptrs = CI->getArgOperand(1);
- Value *Alignment = CI->getArgOperand(2);
- Value *Mask = CI->getArgOperand(3);
-
- assert(isa<VectorType>(Src->getType()) &&
- "Unexpected data type in masked scatter intrinsic");
- assert(isa<VectorType>(Ptrs->getType()) &&
- isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
- "Vector of pointers is expected in masked scatter intrinsic");
-
- IRBuilder<> Builder(CI->getContext());
- Instruction *InsertPt = CI;
- BasicBlock *IfBlock = CI->getParent();
- Builder.SetInsertPoint(InsertPt);
- Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- unsigned VectorWidth = Src->getType()->getVectorNumElements();
-
- // Shorten the way if the mask is a vector of constants.
- bool IsConstMask = isa<ConstantVector>(Mask);
-
- if (IsConstMask) {
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
- continue;
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
- }
- CI->eraseFromParent();
- return;
- }
- for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
- // Fill the "else" block, created in the previous iteration
- //
- // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
- // % ToStore = icmp eq i1 % Mask1, true
- // br i1 % ToStore, label %cond.store, label %else
- //
- Value *Predicate = Builder.CreateExtractElement(Mask,
- Builder.getInt32(Idx),
- "Mask" + Twine(Idx));
- Value *Cmp =
- Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
- ConstantInt::get(Predicate->getType(), 1),
- "ToStore" + Twine(Idx));
-
- // Create "cond" block
- //
- // % Elt1 = extractelement <16 x i32> %Src, i32 1
- // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
- // %store i32 % Elt1, i32* % Ptr1
- //
- BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- Builder.SetInsertPoint(InsertPt);
-
- Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
- "Elt" + Twine(Idx));
- Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
- "Ptr" + Twine(Idx));
- Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
-
- // Create "else" block, fill it in the next iteration
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- Builder.SetInsertPoint(InsertPt);
- Instruction *OldBr = IfBlock->getTerminator();
- BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
- OldBr->eraseFromParent();
- IfBlock = NewIfBlock;
- }
- CI->eraseFromParent();
-}
-
/// If counting leading or trailing zeros is an expensive operation and a zero
/// input is defined, add a check for zero to avoid calling the intrinsic.
///
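
A minimal plain-C++ sketch of the despeculated form described above, assuming
a GCC/Clang-style __builtin_ctz (an assumption for illustration; the pass
itself rewrites the cttz/ctlz intrinsics in IR):

    #include <cstdint>
    #include <iostream>

    // Guard a count-trailing-zeros operation so a zero input never reaches
    // it; this is the shape despeculateCountZeros arranges. The builtin's
    // result is undefined for a zero input.
    static unsigned cttzSafe(uint32_t X) {
      if (X == 0)                // the inserted zero check
        return 32;               // well-defined result for zero
      return __builtin_ctz(X);   // only evaluated for nonzero X
    }

    int main() {
      std::cout << cttzSafe(0) << " " << cttzSafe(8) << "\n"; // prints: 32 3
    }
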
@@ -1870,7 +1641,657 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
return true;
}
-bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+ struct ResultBlock {
+ BasicBlock *BB;
+ PHINode *PhiSrc1;
+ PHINode *PhiSrc2;
+ ResultBlock();
+ };
+
+ CallInst *CI;
+ ResultBlock ResBlock;
+ unsigned MaxLoadSize;
+ unsigned NumBlocks;
+ unsigned NumBlocksNonOneByte;
+ unsigned NumLoadsPerBlock;
+ std::vector<BasicBlock *> LoadCmpBlocks;
+ BasicBlock *EndBlock;
+ PHINode *PhiRes;
+ bool IsUsedForZeroCmp;
+ const DataLayout &DL;
+ IRBuilder<> Builder;
+
+ unsigned calculateNumBlocks(unsigned Size);
+ void createLoadCmpBlocks();
+ void createResultBlock();
+ void setupResultBlockPHINodes();
+ void setupEndBlockPHINodes();
+ void emitLoadCompareBlock(unsigned Index, unsigned LoadSize,
+ unsigned GEPIndex);
+ Value *getCompareLoadPairs(unsigned Index, unsigned Size,
+ unsigned &NumBytesProcessed);
+ void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size,
+ unsigned &NumBytesProcessed);
+ void emitLoadCompareByteBlock(unsigned Index, unsigned GEPIndex);
+ void emitMemCmpResultBlock();
+ Value *getMemCmpExpansionZeroCase(unsigned Size);
+ Value *getMemCmpEqZeroOneBlock(unsigned Size);
+ Value *getMemCmpOneBlock(unsigned Size);
+ unsigned getLoadSize(unsigned Size);
+ unsigned getNumLoads(unsigned Size);
+
+public:
+ MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize,
+ unsigned NumLoadsPerBlock, const DataLayout &DL);
+ Value *getMemCmpExpansion(uint64_t Size);
+};
+
+MemCmpExpansion::ResultBlock::ResultBlock()
+ : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {}
+
+// Initialize the basic block structure required for expansion of a memcmp call
+// with the given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size,
+ unsigned MaxLoadSize, unsigned LoadsPerBlock,
+ const DataLayout &TheDataLayout)
+ : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock),
+ DL(TheDataLayout), Builder(CI) {
+
+ // A memcmp with zero-comparison with only one block of load and compare does
+ // not need to set up any extra blocks. This case could be handled in the DAG,
+ // but since we have all of the machinery to flexibly expand any memcpy here,
+ // we choose to handle this case too to avoid fragmented lowering.
+ IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+ NumBlocks = calculateNumBlocks(Size);
+ if ((!IsUsedForZeroCmp && NumLoadsPerBlock != 1) || NumBlocks != 1) {
+ BasicBlock *StartBlock = CI->getParent();
+ EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+ setupEndBlockPHINodes();
+ createResultBlock();
+
+ // If return value of memcmp is not used in a zero equality, we need to
+ // calculate which source was larger. The calculation requires the
+ // two loaded source values of each load compare block.
+ // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+ if (!IsUsedForZeroCmp)
+ setupResultBlockPHINodes();
+
+ // Create the number of required load compare basic blocks.
+ createLoadCmpBlocks();
+
+ // Update the terminator added by splitBasicBlock to branch to the first
+ // LoadCmpBlock.
+ StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+ }
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+ for (unsigned i = 0; i < NumBlocks; i++) {
+ BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+ EndBlock->getParent(), EndBlock);
+ LoadCmpBlocks.push_back(BB);
+ }
+}
+
+void MemCmpExpansion::createResultBlock() {
+ ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+ EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp parameters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index,
+ unsigned GEPIndex) {
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+ Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using the GEPIndex.
+ if (GEPIndex != 0) {
+ Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ }
+
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+ Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+ PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]);
+
+ if (Index < (LoadCmpBlocks.size() - 1)) {
+ // Early exit branch to EndBlock if a difference is found; otherwise,
+ // continue to the next LoadCmpBlock.
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+ ConstantInt::get(Diff->getType(), 0));
+ BranchInst *CmpBr =
+ BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp);
+ Builder.Insert(CmpBr);
+ } else {
+ // The last block has an unconditional branch to EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(EndBlock);
+ Builder.Insert(CmpBr);
+ }
+}
+
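
The zext-to-i32 plus subtract above reproduces memcmp's per-byte convention:
bytes compare as unsigned char, and the result is their signed difference. A
standalone check of that equivalence:

    #include <cassert>
    #include <cstring>

    // memcmp reports the difference of the first mismatching bytes compared
    // as unsigned char; zext-to-i32 followed by sub implements exactly that.
    static int byteDiff(unsigned char A, unsigned char B) {
      return static_cast<int>(A) - static_cast<int>(B); // zext + sub
    }

    int main() {
      unsigned char A = 0x01, B = 0xFF;
      assert(byteDiff(A, B) < 0);  // 0x01 < 0xFF as unsigned bytes
      assert((std::memcmp(&A, &B, 1) < 0) == (byteDiff(A, B) < 0));
    }
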
+unsigned MemCmpExpansion::getNumLoads(unsigned Size) {
+ return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize);
+}
+
+unsigned MemCmpExpansion::getLoadSize(unsigned Size) {
+ return MinAlign(PowerOf2Floor(Size), MaxLoadSize);
+}
+
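
Together, getNumLoads and getLoadSize imply a greedy power-of-two schedule:
Size / MaxLoadSize full-width loads, plus one load per set bit of the
remainder. A standalone sketch with portable stand-ins for the LLVM helpers:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Portable stand-in for llvm::PowerOf2Floor; for power-of-two inputs,
    // llvm::MinAlign(A, B) reduces to std::min(A, B), used below.
    static uint64_t powerOf2Floor(uint64_t X) {
      uint64_t P = 1;
      while (P * 2 <= X)
        P *= 2;
      return P;
    }

    int main() {
      uint64_t MaxLoadSize = 8, Remaining = 15, Loads = 0;
      while (Remaining) {
        uint64_t LoadSize = std::min(powerOf2Floor(Remaining), MaxLoadSize);
        std::cout << "load " << LoadSize << " bytes\n"; // 8, 4, 2, 1
        Remaining -= LoadSize;
        ++Loads;
      }
      // Matches getNumLoads(15): 15/8 + popcount(15 % 8) = 1 + 3 = 4.
      std::cout << Loads << " loads total\n";
    }
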
+/// Generate an equality comparison for one or more pairs of loaded values.
+/// This is used in the case where the memcmp() call is compared equal or not
+/// equal to zero.
+Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size,
+ unsigned &NumBytesProcessed) {
+ std::vector<Value *> XorList, OrList;
+ Value *Diff;
+
+ unsigned RemainingBytes = Size - NumBytesProcessed;
+ unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
+ unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
+
+ // For a single-block expansion, start inserting before the memcmp call.
+ if (LoadCmpBlocks.empty())
+ Builder.SetInsertPoint(CI);
+ else
+ Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+
+ Value *Cmp = nullptr;
+ for (unsigned i = 0; i < NumLoads; ++i) {
+ unsigned LoadSize = getLoadSize(RemainingBytes);
+ unsigned GEPIndex = NumBytesProcessed / LoadSize;
+ NumBytesProcessed += LoadSize;
+ RemainingBytes -= LoadSize;
+
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ assert(LoadSize <= MaxLoadSize && "Unexpected load type");
+
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Get the base address using the GEPIndex.
+ if (GEPIndex != 0) {
+ Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ }
+
+ // Get a constant or load a value for each source address.
+ Value *LoadSrc1 = nullptr;
+ if (auto *Source1C = dyn_cast<Constant>(Source1))
+ LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
+ if (!LoadSrc1)
+ LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+
+ Value *LoadSrc2 = nullptr;
+ if (auto *Source2C = dyn_cast<Constant>(Source2))
+ LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
+ if (!LoadSrc2)
+ LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ if (NumLoads != 1) {
+ if (LoadSizeType != MaxLoadType) {
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+ }
+ // If we have multiple loads per block, we need to generate a composite
+ // comparison using xor+or.
+ Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+ Diff = Builder.CreateZExt(Diff, MaxLoadType);
+ XorList.push_back(Diff);
+ } else {
+ // If there's only one load per block, we just compare the loaded values.
+ Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+ }
+ }
+
+ auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+ std::vector<Value *> OutList;
+ for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+ Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+ OutList.push_back(Or);
+ }
+ if (InList.size() % 2 != 0)
+ OutList.push_back(InList.back());
+ return OutList;
+ };
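+ // E.g. four XOR results [a, b, c, d] reduce to [a|b, c|d] and then to
+ // (a|b)|(c|d); the final value is zero iff every load pair compared equal.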
+
+ if (!Cmp) {
+ // Pairwise OR the XOR results.
+ OrList = pairWiseOr(XorList);
+
+ // Pairwise OR the OR results until one result left.
+ while (OrList.size() != 1) {
+ OrList = pairWiseOr(OrList);
+ }
+ Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0));
+ }
+
+ return Cmp;
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(
+ unsigned Index, unsigned Size, unsigned &NumBytesProcessed) {
+ Value *Cmp = getCompareLoadPairs(Index, Size, NumBytesProcessed);
+
+ BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[Index + 1];
+ // Early exit branch to ResultBlock if a difference was found. Otherwise,
+ // continue to the next LoadCmpBlock or to EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+ Builder.Insert(CmpBr);
+
+ // Add a phi edge from the last LoadCmpBlock to EndBlock with a value of 0
+ // since the early exit to ResultBlock was not taken (no difference was found
+ // in any of the bytes).
+ if (Index == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+ }
+}
+
+// This function creates the IR instructions for loading and comparing using
+// the given LoadSize. It loads the number of bytes specified by LoadSize from
+// each source of the memcmp parameters. It then subtracts the loaded values to
+// see if there was a difference. If a difference is found, it branches with an
+// early exit to the ResultBlock to calculate which source was larger.
+// Otherwise, it falls through to either the next LoadCmpBlock or, if this is
+// the last LoadCmpBlock, to the EndBlock. Loading 1 byte is handled as a
+// special case through emitLoadCompareByteBlock, which can simply subtract the
+// loaded values and feed the difference to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, unsigned LoadSize,
+ unsigned GEPIndex) {
+ if (LoadSize == 1) {
+ MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex);
+ return;
+ }
+
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ assert(LoadSize <= MaxLoadSize && "Unexpected load type");
+
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Advance each source address by GEPIndex elements of LoadSizeType.
+ if (GEPIndex != 0) {
+ Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+ ConstantInt::get(LoadSizeType, GEPIndex));
+ }
+
+ // Load LoadSizeType from the base address.
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
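+ // memcmp compares bytes in memory order, so byte-swap the loaded values on
+ // little-endian targets; the unsigned compare in the ResultBlock then
+ // matches memcmp's byte-wise ordering.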
+ if (DL.isLittleEndian()) {
+ Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::bswap, LoadSizeType);
+ LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+ LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+ }
+
+ if (LoadSizeType != MaxLoadType) {
+ LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
+ LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
+ }
+
+ // Add the loaded values to the phi nodes for calculating the memcmp result
+ // only if the result is not used in a zero-equality comparison.
+ if (!IsUsedForZeroCmp) {
+ ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]);
+ ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]);
+ }
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+ BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+ ? EndBlock
+ : LoadCmpBlocks[Index + 1];
+ // Early exit branch to ResultBlock if a difference was found. Otherwise,
+ // continue to the next LoadCmpBlock or to EndBlock.
+ BranchInst *CmpBr = BranchInst::Create(NextBB, ResBlock.BB, Cmp);
+ Builder.Insert(CmpBr);
+
+ // Add a phi edge from the last LoadCmpBlock to EndBlock with a value of 0
+ // since the early exit to ResultBlock was not taken (no difference was found
+ // in any of the bytes).
+ if (Index == LoadCmpBlocks.size() - 1) {
+ Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+ PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+ }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock() {
+ // Special case: if the memcmp result is only used in a zero-equality
+ // comparison, the exact result does not need to be calculated; any nonzero
+ // value (here 1) will do.
+ if (IsUsedForZeroCmp) {
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+ Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+ PhiRes->addIncoming(Res, ResBlock.BB);
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ return;
+ }
+ BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+ Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+ ResBlock.PhiSrc2);
+
+ Value *Res =
+ Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+ ConstantInt::get(Builder.getInt32Ty(), 1));
+
+ BranchInst *NewBr = BranchInst::Create(EndBlock);
+ Builder.Insert(NewBr);
+ PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
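+// For illustration: Size = 15 with MaxLoadSize = 8 gives one block each for
+// the 8-, 4-, 2- and 1-byte loads; the 1-byte block is excluded from
+// NumBlocksNonOneByte. In the zero-equality case, loads are then grouped
+// NumLoadsPerBlock at a time (4 loads, 2 per block -> 2 blocks).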
+unsigned MemCmpExpansion::calculateNumBlocks(unsigned Size) {
+ unsigned NumBlocks = 0;
+ bool HaveOneByteLoad = false;
+ unsigned RemainingSize = Size;
+ unsigned LoadSize = MaxLoadSize;
+ while (RemainingSize) {
+ if (LoadSize == 1)
+ HaveOneByteLoad = true;
+ NumBlocks += RemainingSize / LoadSize;
+ RemainingSize = RemainingSize % LoadSize;
+ LoadSize = LoadSize / 2;
+ }
+ NumBlocksNonOneByte = HaveOneByteLoad ? (NumBlocks - 1) : NumBlocks;
+
+ if (IsUsedForZeroCmp)
+ NumBlocks = NumBlocks / NumLoadsPerBlock +
+ (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0);
+
+ return NumBlocks;
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+ Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+ Builder.SetInsertPoint(ResBlock.BB);
+ ResBlock.PhiSrc1 =
+ Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1");
+ ResBlock.PhiSrc2 =
+ Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+ Builder.SetInsertPoint(&EndBlock->front());
+ PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size) {
+ unsigned NumBytesProcessed = 0;
+ // This loop populates each of the LoadCmpBlocks with the IR sequence to
+ // handle multiple loads per block.
+ for (unsigned i = 0; i < NumBlocks; ++i)
+ emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed);
+
+ emitMemCmpResultBlock();
+ return PhiRes;
+}
+
+/// A memcmp expansion that only tests equality with zero and has a single
+/// block of loads and compares can bypass the compare, branch, and phi IR
+/// that is required in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) {
+ unsigned NumBytesProcessed = 0;
+ Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed);
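+ // The i1 result zero-extends to 0 if all bytes were equal and 1 otherwise;
+ // any nonzero value suffices for a zero-equality user.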
+ return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
+/// A memcmp expansion that has only a single block of loads and compares can
+/// bypass the compare, branch, and phi IR that is required in the general
+/// case.
+Value *MemCmpExpansion::getMemCmpOneBlock(unsigned Size) {
+ assert(NumLoadsPerBlock == 1 && "Only handles one load pair per block");
+
+ Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
+ Value *Source1 = CI->getArgOperand(0);
+ Value *Source2 = CI->getArgOperand(1);
+
+ // Cast source to LoadSizeType*.
+ if (Source1->getType() != LoadSizeType)
+ Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+ if (Source2->getType() != LoadSizeType)
+ Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+ // Load LoadSizeType from the base address.
+ Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+ Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+ if (DL.isLittleEndian() && Size != 1) {
+ Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::bswap, LoadSizeType);
+ LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+ LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+ }
+
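+ // Produce the full -1/0/1 result with two compares and two selects:
+ // (src1 != src2) ? ((src1 < src2) ? -1 : 1) : 0.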
+ // TODO: Instead of comparing ULT, just subtract and return the difference?
+ Value *CmpNE = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+ Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+ Type *I32 = Builder.getInt32Ty();
+ Value *Sel1 = Builder.CreateSelect(CmpULT, ConstantInt::get(I32, -1),
+ ConstantInt::get(I32, 1));
+ return Builder.CreateSelect(CmpNE, Sel1, ConstantInt::get(I32, 0));
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size) {
+ if (IsUsedForZeroCmp)
+ return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) :
+ getMemCmpExpansionZeroCase(Size);
+
+ // TODO: Handle more than one load pair per block in getMemCmpOneBlock().
+ if (NumBlocks == 1 && NumLoadsPerBlock == 1)
+ return getMemCmpOneBlock(Size);
+
+ // This loop calls emitLoadCompareBlock for comparing Size bytes of the two
+ // memcmp sources. It starts with loads of the maximum load size set by the
+ // target, then processes any remaining bytes using load sizes that step down
+ // through the smaller powers of 2.
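+ // E.g. for Size = 15 and MaxLoadSize = 8, the blocks emitted use
+ // (LoadSize, GEPIndex) = (8,0), (4,2), (2,6), (1,14), as in the IR example
+ // shown before expandMemCmp() below.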
+ unsigned LoadSize = MaxLoadSize;
+ unsigned NumBytesToBeProcessed = Size;
+ unsigned Index = 0;
+ while (NumBytesToBeProcessed) {
+ // Calculate how many blocks we can create with the current load size.
+ unsigned NumBlocks = NumBytesToBeProcessed / LoadSize;
+ unsigned GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize;
+ NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize;
+
+ // For each NumBlocks, populate the instruction sequence for loading and
+ // comparing LoadSize bytes.
+ while (NumBlocks--) {
+ emitLoadCompareBlock(Index, LoadSize, GEPIndex);
+ Index++;
+ GEPIndex++;
+ }
+ // Get the next LoadSize to use.
+ LoadSize = LoadSize / 2;
+ }
+
+ emitMemCmpResultBlock();
+ return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for a constant compare size that is less than the max inline
+// size. If an expansion cannot occur, it returns false and the memcmp is left
+// as a library call. Otherwise, the library call is replaced with a new IR
+// instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+/// %0 = bitcast i32* %buffer2 to i8*
+/// %1 = bitcast i32* %buffer1 to i8*
+/// %2 = bitcast i8* %1 to i64*
+/// %3 = bitcast i8* %0 to i64*
+/// %4 = load i64, i64* %2
+/// %5 = load i64, i64* %3
+/// %6 = call i64 @llvm.bswap.i64(i64 %4)
+/// %7 = call i64 @llvm.bswap.i64(i64 %5)
+/// %8 = sub i64 %6, %7
+/// %9 = icmp ne i64 %8, 0
+/// br i1 %9, label %res_block, label %loadbb1
+/// res_block: ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+/// %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+/// %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+/// %10 = icmp ult i64 %phi.src1, %phi.src2
+/// %11 = select i1 %10, i32 -1, i32 1
+/// br label %endblock
+/// loadbb1: ; preds = %loadbb
+/// %12 = bitcast i32* %buffer2 to i8*
+/// %13 = bitcast i32* %buffer1 to i8*
+/// %14 = bitcast i8* %13 to i32*
+/// %15 = bitcast i8* %12 to i32*
+/// %16 = getelementptr i32, i32* %14, i32 2
+/// %17 = getelementptr i32, i32* %15, i32 2
+/// %18 = load i32, i32* %16
+/// %19 = load i32, i32* %17
+/// %20 = call i32 @llvm.bswap.i32(i32 %18)
+/// %21 = call i32 @llvm.bswap.i32(i32 %19)
+/// %22 = zext i32 %20 to i64
+/// %23 = zext i32 %21 to i64
+/// %24 = sub i64 %22, %23
+/// %25 = icmp ne i64 %24, 0
+/// br i1 %25, label %res_block, label %loadbb2
+/// loadbb2: ; preds = %loadbb1
+/// %26 = bitcast i32* %buffer2 to i8*
+/// %27 = bitcast i32* %buffer1 to i8*
+/// %28 = bitcast i8* %27 to i16*
+/// %29 = bitcast i8* %26 to i16*
+/// %30 = getelementptr i16, i16* %28, i16 6
+/// %31 = getelementptr i16, i16* %29, i16 6
+/// %32 = load i16, i16* %30
+/// %33 = load i16, i16* %31
+/// %34 = call i16 @llvm.bswap.i16(i16 %32)
+/// %35 = call i16 @llvm.bswap.i16(i16 %33)
+/// %36 = zext i16 %34 to i64
+/// %37 = zext i16 %35 to i64
+/// %38 = sub i64 %36, %37
+/// %39 = icmp ne i64 %38, 0
+/// br i1 %39, label %res_block, label %loadbb3
+/// loadbb3: ; preds = %loadbb2
+/// %40 = bitcast i32* %buffer2 to i8*
+/// %41 = bitcast i32* %buffer1 to i8*
+/// %42 = getelementptr i8, i8* %41, i8 14
+/// %43 = getelementptr i8, i8* %40, i8 14
+/// %44 = load i8, i8* %42
+/// %45 = load i8, i8* %43
+/// %46 = zext i8 %44 to i32
+/// %47 = zext i8 %45 to i32
+/// %48 = sub i32 %46, %47
+/// br label %endblock
+/// endblock: ; preds = %res_block,
+/// %loadbb3
+/// %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+/// ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+ const TargetLowering *TLI, const DataLayout *DL) {
+ NumMemCmpCalls++;
+
+ // Ask the target (via TTI) whether it wants memcmp expanded, and retrieve
+ // its MaxLoadSize.
+ unsigned MaxLoadSize;
+ if (!TTI->expandMemCmp(CI, MaxLoadSize))
+ return false;
+
+ // Early exit from expansion if -Oz.
+ if (CI->getFunction()->optForMinSize())
+ return false;
+
+ // Early exit from expansion if size is not a constant.
+ ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!SizeCast) {
+ NumMemCmpNotConstant++;
+ return false;
+ }
+
+ // Early exit from expansion if it would take more loads than the target
+ // allows.
+ uint64_t SizeVal = SizeCast->getZExtValue();
+ unsigned NumLoads = 0;
+ unsigned RemainingSize = SizeVal;
+ unsigned LoadSize = MaxLoadSize;
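+ // Count the loads a full expansion would need, using the same greedy
+ // power-of-2 decomposition the expansion itself uses.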
+ while (RemainingSize) {
+ NumLoads += RemainingSize / LoadSize;
+ RemainingSize = RemainingSize % LoadSize;
+ LoadSize = LoadSize / 2;
+ }
+
+ if (NumLoads > TLI->getMaxExpandSizeMemcmp(CI->getFunction()->optForSize())) {
+ NumMemCmpGreaterThanMax++;
+ return false;
+ }
+
+ NumMemCmpInlined++;
+
+ // The MemCmpExpansion helper creates and sets up the basic blocks required
+ // for expanding memcmp with size SizeVal.
+ unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock;
+ MemCmpExpansion MemCmpHelper(CI, SizeVal, MaxLoadSize, NumLoadsPerBlock, *DL);
+
+ Value *Res = MemCmpHelper.getMemCmpExpansion(SizeVal);
+
+ // Replace call with result of expansion and erase call.
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+
+ return true;
+}
+
+bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
BasicBlock *BB = CI->getParent();
// Lower inline assembly if we can.
@@ -1955,10 +2376,11 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
ConstantInt *RetVal =
lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);
// Substituting this can cause recursive simplifications, which can
- // invalidate our iterator. Use a WeakVH to hold onto it in case this
+ // invalidate our iterator. Use a WeakTrackingVH to hold onto it in case
+ // this
// happens.
Value *CurValue = &*CurInstIterator;
- WeakVH IterHandle(CurValue);
+ WeakTrackingVH IterHandle(CurValue);
replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
@@ -1970,39 +2392,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
return true;
}
- case Intrinsic::masked_load: {
- // Scalarize unsupported vector masked load
- if (!TTI->isLegalMaskedLoad(CI->getType())) {
- scalarizeMaskedLoad(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_store: {
- if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedStore(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_gather: {
- if (!TTI->isLegalMaskedGather(CI->getType())) {
- scalarizeMaskedGather(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
- case Intrinsic::masked_scatter: {
- if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
- scalarizeMaskedScatter(CI);
- ModifiedDT = true;
- return true;
- }
- return false;
- }
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
@@ -2028,16 +2417,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
}
if (TLI) {
- // Unknown address space.
- // TODO: Target hook to pick which address space the intrinsic cares
- // about?
- unsigned AddrSpace = ~0u;
SmallVector<Value*, 2> PtrOps;
Type *AccessTy;
- if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace))
- while (!PtrOps.empty())
- if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace))
+ if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
+ while (!PtrOps.empty()) {
+ Value *PtrVal = PtrOps.pop_back_val();
+ unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
return true;
+ }
}
}
@@ -2054,6 +2442,13 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
CI->eraseFromParent();
return true;
}
+
+ LibFunc Func;
+ if (TLInfo->getLibFunc(ImmutableCallSite(CI), Func) &&
+ Func == LibFunc_memcmp && expandMemCmp(CI, TTI, TLI, DL)) {
+ ModifiedDT = true;
+ return true;
+ }
return false;
}
@@ -2168,11 +2563,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
// Conservatively require the attributes of the call to match those of the
// return. Ignore noalias because it doesn't affect the call sequence.
- AttributeSet CalleeAttrs = CS.getAttributes();
- if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
- removeAttribute(Attribute::NoAlias) !=
- AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
- removeAttribute(Attribute::NoAlias))
+ AttributeList CalleeAttrs = CS.getAttributes();
+ if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias) !=
+ AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias))
continue;
// Make sure the call instruction is followed by an unconditional branch to
@@ -2561,25 +2956,30 @@ class TypePromotionTransaction {
OperandsHider Hider;
/// Keep track of the uses replaced, if any.
UsesReplacer *Replacer;
+ /// Keep track of instructions removed.
+ SetOfInstrs &RemovedInsts;
public:
/// \brief Remove all references to \p Inst and optionally replace all its
/// uses with New.
+ /// \p RemovedInsts Keep track of the instructions removed by this Action.
/// \pre If !Inst->use_empty(), then New != nullptr
- InstructionRemover(Instruction *Inst, Value *New = nullptr)
+ InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
+ Value *New = nullptr)
: TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
- Replacer(nullptr) {
+ Replacer(nullptr), RemovedInsts(RemovedInsts) {
if (New)
Replacer = new UsesReplacer(Inst, New);
DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
+ RemovedInsts.insert(Inst);
+ // The instructions removed here will be freed after completing
+ // optimizeBlock() for all blocks as we need to keep track of the
+ // removed instructions during promotion.
Inst->removeFromParent();
}
~InstructionRemover() override { delete Replacer; }
- /// \brief Really remove the instruction.
- void commit() override { delete Inst; }
-
/// \brief Resurrect the instruction and reassign it to the proper uses if
/// new value was provided when build this action.
void undo() override {
@@ -2588,6 +2988,7 @@ class TypePromotionTransaction {
if (Replacer)
Replacer->undo();
Hider.undo();
+ RemovedInsts.erase(Inst);
}
};
@@ -2596,6 +2997,10 @@ public:
/// The restoration point is a pointer to an action instead of an iterator
/// because the iterator may be invalidated but not the pointer.
typedef const TypePromotionAction *ConstRestorationPt;
+
+ TypePromotionTransaction(SetOfInstrs &RemovedInsts)
+ : RemovedInsts(RemovedInsts) {}
+
/// Commit every change made in this transaction.
void commit();
/// Undo all the changes made after the given point.
@@ -2627,6 +3032,7 @@ private:
/// The ordered list of actions made so far.
SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt;
+ SetOfInstrs &RemovedInsts;
};
void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
@@ -2638,7 +3044,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
Value *NewVal) {
Actions.push_back(
- make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal));
+ make_unique<TypePromotionTransaction::InstructionRemover>(Inst,
+ RemovedInsts, NewVal));
}
void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
@@ -2705,8 +3112,8 @@ void TypePromotionTransaction::rollback(
/// This encapsulates the logic for matching the target-legal addressing modes.
class AddressingModeMatcher {
SmallVectorImpl<Instruction*> &AddrModeInsts;
- const TargetMachine &TM;
const TargetLowering &TLI;
+ const TargetRegisterInfo &TRI;
const DataLayout &DL;
/// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
@@ -2731,14 +3138,14 @@ class AddressingModeMatcher {
bool IgnoreProfitability;
AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
- const TargetMachine &TM, Type *AT, unsigned AS,
+ const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI,
+ Type *AT, unsigned AS,
Instruction *MI, ExtAddrMode &AM,
const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts,
TypePromotionTransaction &TPT)
- : AddrModeInsts(AMI), TM(TM),
- TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
- ->getTargetLowering()),
+ : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
PromotedInsts(PromotedInsts), TPT(TPT) {
@@ -2756,13 +3163,15 @@ public:
static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
Instruction *MemoryInst,
SmallVectorImpl<Instruction*> &AddrModeInsts,
- const TargetMachine &TM,
+ const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI,
const SetOfInstrs &InsertedInsts,
InstrToOrigTy &PromotedInsts,
TypePromotionTransaction &TPT) {
ExtAddrMode Result;
- bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS,
+ bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI,
+ AccessTy, AS,
MemoryInst, Result, InsertedInsts,
PromotedInsts, TPT).matchAddr(V, 0);
(void)Success; assert(Success && "Couldn't select *anything*?");
@@ -3583,18 +3992,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
/// Check to see if all uses of OpVal by the specified inline asm call are due
/// to memory operands. If so, return true, otherwise return false.
static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
- const TargetMachine &TM) {
- const Function *F = CI->getParent()->getParent();
- const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
- const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
+ const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI) {
+ const Function *F = CI->getFunction();
TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI,
+ TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
ImmutableCallSite(CI));
+
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
// Compute the constraint code and ConstraintType to use.
- TLI->ComputeConstraintToUse(OpInfo, SDValue());
+ TLI.ComputeConstraintToUse(OpInfo, SDValue());
// If this asm operand is our Value*, and if it isn't an indirect memory
// operand, we can't fold it!
@@ -3607,13 +4016,18 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
return true;
}
+// Max number of memory uses to look at before aborting the search to conserve
+// compile time.
+static constexpr int MaxMemoryUsesToScan = 20;
+
/// Recursively walk all the uses of I until we find a memory use.
/// If we find an obviously non-foldable instruction, return true.
/// Add the ultimately found memory instructions to MemoryUses.
static bool FindAllMemoryUses(
Instruction *I,
SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
- SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
+ SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetLowering &TLI,
+ const TargetRegisterInfo &TRI, int SeenInsts = 0) {
// If we already considered this instruction, we're done.
if (!ConsideredInsts.insert(I).second)
return false;
@@ -3626,8 +4040,12 @@ static bool FindAllMemoryUses(
// Loop over all the uses, recursively processing them.
for (Use &U : I->uses()) {
- Instruction *UserI = cast<Instruction>(U.getUser());
+ // Conservatively return true if we're seeing a large number or a deep chain
+ // of users. This avoids excessive compilation times in pathological cases.
+ if (SeenInsts++ >= MaxMemoryUsesToScan)
+ return true;
+ Instruction *UserI = cast<Instruction>(U.getUser());
if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
continue;
@@ -3635,11 +4053,28 @@ static bool FindAllMemoryUses(
if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
unsigned opNo = U.getOperandNo();
- if (opNo == 0) return true; // Storing addr, not into addr.
+ if (opNo != StoreInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
MemoryUses.push_back(std::make_pair(SI, opNo));
continue;
}
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
+ unsigned opNo = U.getOperandNo();
+ if (opNo != AtomicRMWInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
+ MemoryUses.push_back(std::make_pair(RMW, opNo));
+ continue;
+ }
+
+ if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
+ unsigned opNo = U.getOperandNo();
+ if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
+ return true; // Storing addr, not into addr.
+ MemoryUses.push_back(std::make_pair(CmpX, opNo));
+ continue;
+ }
+
if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
// If this is a cold call, we can sink the addressing calculation into
// the cold path. See optimizeCallInst
@@ -3650,12 +4085,13 @@ static bool FindAllMemoryUses(
if (!IA) return true;
// If this is a memory operand, we're cool, otherwise bail out.
- if (!IsOperandAMemoryOperand(CI, IA, I, TM))
+ if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
return true;
continue;
}
- if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
+ if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI,
+ SeenInsts))
return true;
}
@@ -3743,7 +4179,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
// the use is just a particularly nice way of sinking it.
SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
SmallPtrSet<Instruction*, 16> ConsideredInsts;
- if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
+ if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
return false; // Has a non-memory, non-foldable use!
// Now that we know that all uses of this instruction are part of a chain of
@@ -3775,7 +4211,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
ExtAddrMode Result;
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
- AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS,
+ AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI,
+ AddressAccessTy, AS,
MemoryInst, Result, InsertedInsts,
PromotedInsts, TPT);
Matcher.IgnoreProfitability = true;
@@ -3839,84 +4276,70 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// Use a worklist to iteratively look through PHI nodes, and ensure that
// the addressing mode obtained from the non-PHI roots of the graph
// are equivalent.
- Value *Consensus = nullptr;
- unsigned NumUsesConsensus = 0;
- bool IsNumUsesConsensusValid = false;
+ bool AddrModeFound = false;
+ bool PhiSeen = false;
SmallVector<Instruction*, 16> AddrModeInsts;
ExtAddrMode AddrMode;
- TypePromotionTransaction TPT;
+ TypePromotionTransaction TPT(RemovedInsts);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
while (!worklist.empty()) {
Value *V = worklist.back();
worklist.pop_back();
- // Break use-def graph loops.
- if (!Visited.insert(V).second) {
- Consensus = nullptr;
- break;
- }
+ // We allow traversing cyclic Phi nodes.
+ // In case of success after this loop we ensure that traversing through
+ // Phi nodes ends up with all cases computing an address of the form
+ // BaseGV + Base + Scale * Index + Offset
+ // where Scale and Offset are constants and BaseGV, Base and Index
+ // are exactly the same Values in all cases.
+ // This means that BaseGV, Scale and Offset dominate our memory instruction
+ // and have the same values as they had in the address computation
+ // represented as a Phi, so we can safely sink the address computation to
+ // the memory instruction.
+ if (!Visited.insert(V).second)
+ continue;
// For a PHI node, push all of its incoming values.
if (PHINode *P = dyn_cast<PHINode>(V)) {
for (Value *IncValue : P->incoming_values())
worklist.push_back(IncValue);
+ PhiSeen = true;
continue;
}
// For non-PHIs, determine the addressing mode being computed. Note that
// the result may differ depending on what other uses our candidate
// addressing instructions might have.
- SmallVector<Instruction*, 16> NewAddrModeInsts;
+ AddrModeInsts.clear();
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
- V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
- InsertedInsts, PromotedInsts, TPT);
-
- // This check is broken into two cases with very similar code to avoid using
- // getNumUses() as much as possible. Some values have a lot of uses, so
- // calling getNumUses() unconditionally caused a significant compile-time
- // regression.
- if (!Consensus) {
- Consensus = V;
- AddrMode = NewAddrMode;
- AddrModeInsts = NewAddrModeInsts;
- continue;
- } else if (NewAddrMode == AddrMode) {
- if (!IsNumUsesConsensusValid) {
- NumUsesConsensus = Consensus->getNumUses();
- IsNumUsesConsensusValid = true;
- }
+ V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *TRI,
+ InsertedInsts, PromotedInsts, TPT);
- // Ensure that the obtained addressing mode is equivalent to that obtained
- // for all other roots of the PHI traversal. Also, when choosing one
- // such root as representative, select the one with the most uses in order
- // to keep the cost modeling heuristics in AddressingModeMatcher
- // applicable.
- unsigned NumUses = V->getNumUses();
- if (NumUses > NumUsesConsensus) {
- Consensus = V;
- NumUsesConsensus = NumUses;
- AddrModeInsts = NewAddrModeInsts;
- }
+ if (!AddrModeFound) {
+ AddrModeFound = true;
+ AddrMode = NewAddrMode;
continue;
}
+ if (NewAddrMode == AddrMode)
+ continue;
- Consensus = nullptr;
+ AddrModeFound = false;
break;
}
// If the addressing mode couldn't be determined, or if multiple different
// ones were determined, bail out now.
- if (!Consensus) {
+ if (!AddrModeFound) {
TPT.rollback(LastKnownGood);
return false;
}
TPT.commit();
// If all the instructions matched are already in this BB, don't do anything.
- if (none_of(AddrModeInsts, [&](Value *V) {
+ // If we saw a Phi node then the address is definitely not local.
+ if (!PhiSeen && none_of(AddrModeInsts, [&](Value *V) {
return IsNonLocalValue(V, MemoryInst->getParent());
- })) {
+ })) {
DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n");
return false;
}
@@ -3935,11 +4358,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst << "\n");
if (SunkAddr->getType() != Addr->getType())
- SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
} else if (AddrSinkUsingGEPs ||
(!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
- TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
- ->useAA())) {
+ SubtargetInfo->useAA())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
@@ -3963,6 +4385,20 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
AddrMode.Scale = 0;
}
+ // It is only safe to sign extend the BaseReg if we know that the math
+ // required to create it did not overflow before we extend it. Since
+ // the original IR value was tossed in favor of a constant back when
+ // the AddrMode was created we need to bail out gracefully if widths
+ // do not match instead of extending it.
+ //
+ // (See below for code to add the scale.)
+ if (AddrMode.Scale) {
+ Type *ScaledRegTy = AddrMode.ScaledReg->getType();
+ if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
+ cast<IntegerType>(ScaledRegTy)->getBitWidth())
+ return false;
+ }
+
if (AddrMode.BaseGV) {
if (ResultPtr)
return false;
@@ -3973,14 +4409,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// If the real base value actually came from an inttoptr, then the matcher
// will look through it and provide only the integer value. In that case,
// use it here.
- if (!ResultPtr && AddrMode.BaseReg) {
- ResultPtr =
- Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr");
- AddrMode.BaseReg = nullptr;
- } else if (!ResultPtr && AddrMode.Scale == 1) {
- ResultPtr =
- Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr");
- AddrMode.Scale = 0;
+ if (!DL->isNonIntegralPointerType(Addr->getType())) {
+ if (!ResultPtr && AddrMode.BaseReg) {
+ ResultPtr = Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(),
+ "sunkaddr");
+ AddrMode.BaseReg = nullptr;
+ } else if (!ResultPtr && AddrMode.Scale == 1) {
+ ResultPtr = Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(),
+ "sunkaddr");
+ AddrMode.Scale = 0;
+ }
}
if (!ResultPtr &&
@@ -4011,19 +4449,11 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Value *V = AddrMode.ScaledReg;
if (V->getType() == IntPtrTy) {
// done.
- } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
- cast<IntegerType>(V->getType())->getBitWidth()) {
- V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
} else {
- // It is only safe to sign extend the BaseReg if we know that the math
- // required to create it did not overflow before we extend it. Since
- // the original IR value was tossed in favor of a constant back when
- // the AddrMode was created we need to bail out gracefully if widths
- // do not match instead of extending it.
- Instruction *I = dyn_cast_or_null<Instruction>(ResultIndex);
- if (I && (ResultIndex != AddrMode.BaseReg))
- I->eraseFromParent();
- return false;
+ assert(cast<IntegerType>(IntPtrTy)->getBitWidth() <
+ cast<IntegerType>(V->getType())->getBitWidth() &&
+ "We can't transform if ScaledReg is too narrow");
+ V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
}
if (AddrMode.Scale != 1)
@@ -4042,7 +4472,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// We need to add this separately from the scale above to help with
// SDAG consecutive load/store merging.
if (ResultPtr->getType() != I8PtrTy)
- ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+ ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
}
@@ -4053,14 +4483,27 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
SunkAddr = ResultPtr;
} else {
if (ResultPtr->getType() != I8PtrTy)
- ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+ ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
}
if (SunkAddr->getType() != Addr->getType())
- SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
}
} else {
+ // We'd require a ptrtoint/inttoptr down the line, which we can't do for
+ // non-integral pointers, so in that case bail out now.
+ Type *BaseTy = AddrMode.BaseReg ? AddrMode.BaseReg->getType() : nullptr;
+ Type *ScaleTy = AddrMode.Scale ? AddrMode.ScaledReg->getType() : nullptr;
+ PointerType *BasePtrTy = dyn_cast_or_null<PointerType>(BaseTy);
+ PointerType *ScalePtrTy = dyn_cast_or_null<PointerType>(ScaleTy);
+ if (DL->isNonIntegralPointerType(Addr->getType()) ||
+ (BasePtrTy && DL->isNonIntegralPointerType(BasePtrTy)) ||
+ (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
+ (AddrMode.BaseGV &&
+ DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
+ return false;
+
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst << "\n");
Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
@@ -4140,9 +4583,9 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// using it.
if (Repl->use_empty()) {
// This can cause recursive deletion, which can invalidate our iterator.
- // Use a WeakVH to hold onto it in case this happens.
+ // Use a WeakTrackingVH to hold onto it in case this happens.
Value *CurValue = &*CurInstIterator;
- WeakVH IterHandle(CurValue);
+ WeakTrackingVH IterHandle(CurValue);
BasicBlock *BB = CurInstIterator->getParent();
RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
@@ -4164,7 +4607,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
bool MadeChange = false;
const TargetRegisterInfo *TRI =
- TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo();
+ TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints =
TLI->ParseConstraints(*DL, TRI, CS);
unsigned ArgNo = 0;
@@ -4185,14 +4628,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
return MadeChange;
}
-/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or
+/// \brief Check if all the uses of \p Val are equivalent (or free) zero or
/// sign extensions.
-static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
- assert(!Inst->use_empty() && "Input must have at least one use");
- const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin());
+static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
+ assert(!Val->use_empty() && "Input must have at least one use");
+ const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
bool IsSExt = isa<SExtInst>(FirstUser);
Type *ExtTy = FirstUser->getType();
- for (const User *U : Inst->users()) {
+ for (const User *U : Val->users()) {
const Instruction *UI = cast<Instruction>(U);
if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
return false;
@@ -4202,11 +4645,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
continue;
// If IsSExt is true, we are in this situation:
- // a = Inst
+ // a = Val
// b = sext ty1 a to ty2
// c = sext ty1 a to ty3
// Assuming ty2 is shorter than ty3, this could be turned into:
- // a = Inst
+ // a = Val
// b = sext ty1 a to ty2
// c = sext ty2 b to ty3
// However, the last sext is not free.
@@ -4233,51 +4676,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
return true;
}
-/// \brief Try to form ExtLd by promoting \p Exts until they reach a
-/// load instruction.
-/// If an ext(load) can be formed, it is returned via \p LI for the load
-/// and \p Inst for the extension.
-/// Otherwise LI == nullptr and Inst == nullptr.
-/// When some promotion happened, \p TPT contains the proper state to
-/// revert them.
-///
-/// \return true when promoting was necessary to expose the ext(load)
-/// opportunity, false otherwise.
+/// \brief Try to speculatively promote extensions in \p Exts and continue
+/// promoting through newly promoted operands recursively as far as doing so is
+/// profitable. Extensions that were profitably moved up are saved in
+/// \p ProfitablyMovedExts. When some promotion happened, \p TPT contains the
+/// proper state to revert them.
///
-/// Example:
-/// \code
-/// %ld = load i32* %addr
-/// %add = add nuw i32 %ld, 4
-/// %zext = zext i32 %add to i64
-/// \endcode
-/// =>
-/// \code
-/// %ld = load i32* %addr
-/// %zext = zext i32 %ld to i64
-/// %add = add nuw i64 %zext, 4
-/// \encode
-/// Thanks to the promotion, we can match zext(load i32*) to i64.
-bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
- LoadInst *&LI, Instruction *&Inst,
- const SmallVectorImpl<Instruction *> &Exts,
- unsigned CreatedInstsCost = 0) {
- // Iterate over all the extensions to see if one form an ext(load).
+/// \return true if some promotion happened, false otherwise.
+bool CodeGenPrepare::tryToPromoteExts(
+ TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
+ SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+ unsigned CreatedInstsCost) {
+ bool Promoted = false;
+
+ // Iterate over all the extensions to try to promote them.
for (auto I : Exts) {
- // Check if we directly have ext(load).
- if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
- Inst = I;
- // No promotion happened here.
- return false;
+ // Early check if we directly have ext(load).
+ if (isa<LoadInst>(I->getOperand(0))) {
+ ProfitablyMovedExts.push_back(I);
+ continue;
}
- // Check whether or not we want to do any promotion.
+
+ // Check whether or not we want to do any promotion. The reason we have
+ // this check inside the for loop is to catch the case where an extension
+ // is directly fed by a load, because in that case the extension can be
+ // moved up without any promotion on its operands.
if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
- continue;
+ return false;
+
// Get the action to perform the promotion.
- TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
- I, InsertedInsts, *TLI, PromotedInsts);
+ TypePromotionHelper::Action TPH =
+ TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
// Check if we can promote.
- if (!TPH)
+ if (!TPH) {
+ // Save the current extension as we cannot move up through its operand.
+ ProfitablyMovedExts.push_back(I);
continue;
+ }
+
// Save the current state.
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
@@ -4297,110 +4733,275 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
// one extension but leave one. However, we optimistically keep going,
// because the new extension may be removed too.
long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
- TotalCreatedInstsCost -= ExtCost;
+ // FIXME: It would be possible to propagate a negative value instead of
+ // conservatively ceiling it to 0.
+ TotalCreatedInstsCost =
+ std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
if (!StressExtLdPromotion &&
(TotalCreatedInstsCost > 1 ||
!isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
- // The promotion is not profitable, rollback to the previous state.
+ // This promotion is not profitable; roll back to the previous state, and
+ // save the current extension in ProfitablyMovedExts as the latest
+ // speculative promotion turned out to be unprofitable.
+ TPT.rollback(LastKnownGood);
+ ProfitablyMovedExts.push_back(I);
+ continue;
+ }
+ // Continue promoting NewExts as far as doing so is profitable.
+ SmallVector<Instruction *, 2> NewlyMovedExts;
+ (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
+ bool NewPromoted = false;
+ for (auto ExtInst : NewlyMovedExts) {
+ Instruction *MovedExt = cast<Instruction>(ExtInst);
+ Value *ExtOperand = MovedExt->getOperand(0);
+ // If we have reached a load, we need this extra profitability check
+ // as it could potentially be merged into an ext(load).
+ if (isa<LoadInst>(ExtOperand) &&
+ !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
+ (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
+ continue;
+
+ ProfitablyMovedExts.push_back(MovedExt);
+ NewPromoted = true;
+ }
+
+ // If none of the speculative promotions for NewExts is profitable, roll
+ // back and save the current extension (I) as the last profitable extension.
+ if (!NewPromoted) {
TPT.rollback(LastKnownGood);
+ ProfitablyMovedExts.push_back(I);
continue;
}
// The promotion is profitable.
- // Check if it exposes an ext(load).
- (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
- if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
- // If we have created a new extension, i.e., now we have two
- // extensions. We must make sure one of them is merged with
- // the load, otherwise we may degrade the code quality.
- (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
- // Promotion happened.
- return true;
- // If this does not help to expose an ext(load) then, rollback.
- TPT.rollback(LastKnownGood);
+ Promoted = true;
}
- // None of the extension can form an ext(load).
- LI = nullptr;
- Inst = nullptr;
- return false;
+ return Promoted;
+}
+
+/// Merge redundant sexts when one dominates the other.
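+/// E.g. if %a = sext i32 %x to i64 dominates %b = sext i32 %x to i64, uses of
+/// %b are replaced by %a and %b is removed.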
+bool CodeGenPrepare::mergeSExts(Function &F) {
+ DominatorTree DT(F);
+ bool Changed = false;
+ for (auto &Entry : ValToSExtendedUses) {
+ SExts &Insts = Entry.second;
+ SExts CurPts;
+ for (Instruction *Inst : Insts) {
+ if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
+ Inst->getOperand(0) != Entry.first)
+ continue;
+ bool inserted = false;
+ for (auto &Pt : CurPts) {
+ if (DT.dominates(Inst, Pt)) {
+ Pt->replaceAllUsesWith(Inst);
+ RemovedInsts.insert(Pt);
+ Pt->removeFromParent();
+ Pt = Inst;
+ inserted = true;
+ Changed = true;
+ break;
+ }
+ if (!DT.dominates(Pt, Inst))
+ // Give up if we would need to merge in a common dominator, as
+ // experiments show it is not profitable.
+ continue;
+ Inst->replaceAllUsesWith(Pt);
+ RemovedInsts.insert(Inst);
+ Inst->removeFromParent();
+ inserted = true;
+ Changed = true;
+ break;
+ }
+ if (!inserted)
+ CurPts.push_back(Inst);
+ }
+ }
+ return Changed;
+}
+
+/// Return true if an ext(load) can be formed from an extension in
+/// \p MovedExts.
+bool CodeGenPrepare::canFormExtLd(
+ const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
+ Instruction *&Inst, bool HasPromoted) {
+ for (auto *MovedExtInst : MovedExts) {
+ if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
+ LI = cast<LoadInst>(MovedExtInst->getOperand(0));
+ Inst = MovedExtInst;
+ break;
+ }
+ }
+ if (!LI)
+ return false;
+
+ // If they're already in the same block, there's nothing to do.
+ // Make the cheap checks first if we did not promote.
+ // If we promoted, we need to check if it is indeed profitable.
+ if (!HasPromoted && LI->getParent() == Inst->getParent())
+ return false;
+
+ return TLI->isExtLoad(LI, Inst, *DL);
}
/// Move a zext or sext fed by a load into the same basic block as the load,
/// unless conditions are unfavorable. This allows SelectionDAG to fold the
/// extend into the load.
-/// \p I[in/out] the extension may be modified during the process if some
-/// promotions apply.
///
-bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
- // ExtLoad formation infrastructure requires TLI to be effective.
+/// E.g.,
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \endcode
+/// Note that the promotion in %add to i64 is done in tryToPromoteExts(),
+/// which allows us to match zext(load i32*) to i64.
+///
+/// Also, try to promote the computations used to obtain a sign-extended
+/// value used in memory accesses.
+/// E.g.,
+/// \code
+/// a = add nsw i32 b, 3
+/// d = sext i32 a to i64
+/// e = getelementptr ..., i64 d
+/// \endcode
+/// =>
+/// \code
+/// f = sext i32 b to i64
+/// a = add nsw i64 f, 3
+/// e = getelementptr ..., i64 a
+/// \endcode
+///
+/// \p Inst[in/out] the extension may be modified during the process if some
+/// promotions apply.
+bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
+ // ExtLoad formation and address type promotion infrastructure requires TLI to
+ // be effective.
if (!TLI)
return false;
- // Try to promote a chain of computation if it allows to form
- // an extended load.
- TypePromotionTransaction TPT;
+ bool AllowPromotionWithoutCommonHeader = false;
+ // See if this is an interesting sext operation for address type promotion
+ // before trying to promote it, e.g., one with the right type that is used
+ // in memory accesses.
+ bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
+ *Inst, AllowPromotionWithoutCommonHeader);
+ TypePromotionTransaction TPT(RemovedInsts);
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
- TPT.getRestorationPoint();
+ TPT.getRestorationPoint();
SmallVector<Instruction *, 1> Exts;
- Exts.push_back(I);
+ SmallVector<Instruction *, 2> SpeculativelyMovedExts;
+ Exts.push_back(Inst);
+
+ bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
+
// Look for a load being extended.
LoadInst *LI = nullptr;
- Instruction *OldExt = I;
- bool HasPromoted = extLdPromotion(TPT, LI, I, Exts);
- if (!LI || !I) {
- assert(!HasPromoted && !LI && "If we did not match any load instruction "
- "the code must remain the same");
- I = OldExt;
- return false;
+ Instruction *ExtFedByLoad;
+
+ // Try to promote a chain of computation if doing so allows an extended
+ // load to be formed.
+ if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
+ assert(LI && ExtFedByLoad && "Expect a valid load and extension");
+ TPT.commit();
+ // Move the extend into the same block as the load.
+ ExtFedByLoad->removeFromParent();
+ ExtFedByLoad->insertAfter(LI);
+ // CGP does not check if the zext would be speculatively executed when moved
+ // to the same basic block as the load. Preserving its original location
+ // would pessimize the debugging experience, as well as negatively impact
+ // the quality of sample pgo. We don't want to use "line 0" as that has a
+ // size cost in the line-table section and logically the zext can be seen as
+ // part of the load. Therefore we conservatively reuse the same debug
+ // location for the load and the zext.
+ ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
+ ++NumExtsMoved;
+ Inst = ExtFedByLoad;
+ return true;
}
- // If they're already in the same block, there's nothing to do.
- // Make the cheap checks first if we did not promote.
- // If we promoted, we need to check if it is indeed profitable.
- if (!HasPromoted && LI->getParent() == I->getParent())
- return false;
-
- EVT VT = TLI->getValueType(*DL, I->getType());
- EVT LoadVT = TLI->getValueType(*DL, LI->getType());
+ // Continue promoting SExts if the target considers doing so worthwhile.
+ if (ATPConsiderable &&
+ performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
+ HasPromoted, TPT, SpeculativelyMovedExts))
+ return true;
- // If the load has other users and the truncate is not free, this probably
- // isn't worthwhile.
- if (!LI->hasOneUse() &&
- (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
- !TLI->isTruncateFree(I->getType(), LI->getType())) {
- I = OldExt;
- TPT.rollback(LastKnownGood);
- return false;
- }
+ TPT.rollback(LastKnownGood);
+ return false;
+}
- // Check whether the target supports casts folded into loads.
- unsigned LType;
- if (isa<ZExtInst>(I))
- LType = ISD::ZEXTLOAD;
- else {
- assert(isa<SExtInst>(I) && "Unexpected ext type!");
- LType = ISD::SEXTLOAD;
- }
- if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) {
- I = OldExt;
- TPT.rollback(LastKnownGood);
+// Perform address type promotion if doing so is profitable.
+// If AllowPromotionWithoutCommonHeader == false, we should find other sext
+// instructions that sign-extended the same initial value. However, if
+// AllowPromotionWithoutCommonHeader == true, we consider promoting the
+// extension to be profitable on its own.
+bool CodeGenPrepare::performAddressTypePromotion(
+ Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
+ bool HasPromoted, TypePromotionTransaction &TPT,
+ SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
+ bool Promoted = false;
+ SmallPtrSet<Instruction *, 1> UnhandledExts;
+ bool AllSeenFirst = true;
+ for (auto I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+ SeenChainsForSExt.find(HeadOfChain);
+ // If there is an unhandled SExt which has the same header, try to promote
+ // it as well.
+ if (AlreadySeen != SeenChainsForSExt.end()) {
+ if (AlreadySeen->second != nullptr)
+ UnhandledExts.insert(AlreadySeen->second);
+ AllSeenFirst = false;
+ }
+ }
+
+ if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
+ SpeculativelyMovedExts.size() == 1)) {
+ TPT.commit();
+ if (HasPromoted)
+ Promoted = true;
+ for (auto I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ SeenChainsForSExt[HeadOfChain] = nullptr;
+ ValToSExtendedUses[HeadOfChain].push_back(I);
+ }
+ // Update Inst since promotion happened.
+ Inst = SpeculativelyMovedExts.pop_back_val();
+ } else {
+ // This is the first chain visited from the header; keep the current chain
+ // as unhandled. Defer promoting it until we encounter another SExt chain
+ // derived from the same header.
+ for (auto I : SpeculativelyMovedExts) {
+ Value *HeadOfChain = I->getOperand(0);
+ SeenChainsForSExt[HeadOfChain] = Inst;
+ }
return false;
}
- // Move the extend into the same block as the load, so that SelectionDAG
- // can fold it.
- TPT.commit();
- I->removeFromParent();
- I->insertAfter(LI);
- // CGP does not check if the zext would be speculatively executed when moved
- // to the same basic block as the load. Preserving its original location would
- // pessimize the debugging experience, as well as negatively impact the
- // quality of sample pgo. We don't want to use "line 0" as that has a
- // size cost in the line-table section and logically the zext can be seen as
- // part of the load. Therefore we conservatively reuse the same debug location
- // for the load and the zext.
- I->setDebugLoc(LI->getDebugLoc());
- ++NumExtsMoved;
- return true;
+ if (!AllSeenFirst && !UnhandledExts.empty())
+ for (auto VisitedSExt : UnhandledExts) {
+ if (RemovedInsts.count(VisitedSExt))
+ continue;
+ TypePromotionTransaction TPT(RemovedInsts);
+ SmallVector<Instruction *, 1> Exts;
+ SmallVector<Instruction *, 2> Chains;
+ Exts.push_back(VisitedSExt);
+ bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
+ TPT.commit();
+ if (HasPromoted)
+ Promoted = true;
+ for (auto I : Chains) {
+ Value *HeadOfChain = I->getOperand(0);
+ // Mark this as handled.
+ SeenChainsForSExt[HeadOfChain] = nullptr;
+ ValToSExtendedUses[HeadOfChain].push_back(I);
+ }
+ }
+ return Promoted;
}
bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
@@ -4534,13 +5135,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
!(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
return false;
- // Skip loads we've already transformed or have no reason to transform.
- if (Load->hasOneUse()) {
- User *LoadUser = *Load->user_begin();
- if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
- !dyn_cast<PHINode>(LoadUser))
- return false;
- }
+ // Skip loads we've already transformed.
+ if (Load->hasOneUse() &&
+ InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+ return false;
// Look at all uses of Load, looking through phis, to determine how many bits
// of the loaded value are needed.
@@ -4590,16 +5188,14 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
if (!ShlC)
return false;
uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
- auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
- DemandBits |= ShlDemandBits;
+ DemandBits.setLowBits(BitWidth - ShiftAmt);
break;
}
case llvm::Instruction::Trunc: {
EVT TruncVT = TLI->getValueType(*DL, I->getType());
unsigned TruncBitWidth = TruncVT.getSizeInBits();
- auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
- DemandBits |= TruncBits;
+ DemandBits.setLowBits(TruncBitWidth);
break;
}
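Both rewrites in this hunk are equivalences, not behavior changes: an all-ones value shifted right by ShiftAmt is exactly a mask of the low (BitWidth - ShiftAmt) bits, and an all-ones TruncBitWidth value zero-extended to BitWidth is a mask of the low TruncBitWidth bits; APInt::setLowBits states that directly and avoids the temporary. A standalone check of the first identity with ordinary integers (not APInt):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned BitWidth = 32, ShiftAmt = 8;
      uint32_t Old = UINT32_MAX >> ShiftAmt;                     // 0x00FFFFFF
      uint32_t New = (uint32_t(1) << (BitWidth - ShiftAmt)) - 1; // low 24 bits
      assert(Old == New);
      return 0;
    }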
@@ -4620,7 +5216,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
//
// Also avoid hoisting if we didn't see any ands with the exact DemandBits
// mask, since these are the only ands that will be removed by isel.
- if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+ if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
WidestAndBits != DemandBits)
return false;
@@ -4636,6 +5232,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
IRBuilder<> Builder(Load->getNextNode());
auto *NewAnd = dyn_cast<Instruction>(
Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+ // Mark this instruction as "inserted by CGP", so that other
+ // optimizations don't touch it.
+ InsertedInsts.insert(NewAnd);
// Replace all uses of load with new and (except for the use of load in the
// new and itself).
@@ -4985,7 +5584,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
ExtInst->insertBefore(SI);
SI->setCondition(ExtInst);
- for (SwitchInst::CaseIt Case : SI->cases()) {
+ for (auto Case : SI->cases()) {
APInt NarrowConst = Case.getCaseValue()->getValue();
APInt WideConst = (ExtType == Instruction::ZExt) ?
NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
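A worked example of why the extension kind matters (values hypothetical):

    // Widening the i8 case value 0x80 to RegWidth == 32:
    //   ZExt: 0x80 -> 0x00000080 (128)
    //   SExt: 0x80 -> 0xFFFFFF80 (-128 as a signed value)
    // The case constants must be widened with the same kind chosen for the
    // condition, or the widened comparison would never match.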
@@ -4995,6 +5594,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
return true;
}
+
namespace {
/// \brief Helper class to promote a scalar operation to a vector one.
/// This class is used to move downward extractelement transition.
@@ -5473,7 +6073,7 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
return true;
}
-bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
// Bail out if we inserted the instruction to prevent optimizations from
// stepping on each other's toes.
if (InsertedInsts.count(I))
@@ -5483,7 +6083,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
// It is possible for very late stage optimizations (such as SimplifyCFG)
// to introduce PHI nodes too late to be cleaned up. If we detect such a
// trivial PHI, go ahead and zap it here.
- if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) {
+ if (Value *V = SimplifyInstruction(P, {*DL, TLInfo})) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
++NumPHIsElim;
@@ -5514,7 +6114,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
TargetLowering::TypeExpandInteger) {
return SinkCast(CI);
} else {
- bool MadeChange = moveExtToFormExtLoad(I);
+ bool MadeChange = optimizeExt(I);
return MadeChange | optimizeExtUses(I);
}
}
@@ -5548,8 +6148,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
return false;
}
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ unsigned AS = RMW->getPointerAddressSpace();
+ return optimizeMemoryInst(I, RMW->getPointerOperand(),
+ RMW->getType(), AS);
+ }
+
+ if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
+ unsigned AS = CmpX->getPointerAddressSpace();
+ return optimizeMemoryInst(I, CmpX->getPointerOperand(),
+ CmpX->getCompareOperand()->getType(), AS);
+ }
+
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
+ if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
+ EnableAndCmpSinking && TLI)
+ return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
+
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
@@ -5612,7 +6228,7 @@ static bool makeBitReverse(Instruction &I, const DataLayout &DL,
// In this pass we look for GEP and cast instructions that are used
// across basic blocks and rewrite them to improve basic-block-at-a-time
// selection.
-bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
SunkAddrs.clear();
bool MadeChange = false;
@@ -5679,68 +6295,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
return MadeChange;
}
-// If there is a sequence that branches based on comparing a single bit
-// against zero that can be combined into a single instruction, and the
-// target supports folding these into a single instruction, sink the
-// mask and compare into the branch uses. Do this before OptimizeBlock ->
-// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
-// searched for.
-bool CodeGenPrepare::sinkAndCmp(Function &F) {
- if (!EnableAndCmpSinking)
- return false;
- if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
- return false;
- bool MadeChange = false;
- for (BasicBlock &BB : F) {
- // Does this BB end with the following?
- // %andVal = and %val, #single-bit-set
- // %icmpVal = icmp %andResult, 0
- // br i1 %cmpVal label %dest1, label %dest2"
- BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
- if (!Brcc || !Brcc->isConditional())
- continue;
- ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
- if (!Cmp || Cmp->getParent() != &BB)
- continue;
- ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
- if (!Zero || !Zero->isZero())
- continue;
- Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
- if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
- continue;
- ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
- if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
- continue;
- DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
-
- // Push the "and; icmp" for any users that are conditional branches.
- // Since there can only be one branch use per BB, we don't need to keep
- // track of which BBs we insert into.
- for (Use &TheUse : Cmp->uses()) {
- // Find brcc use.
- BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
- if (!BrccUser || !BrccUser->isConditional())
- continue;
- BasicBlock *UserBB = BrccUser->getParent();
- if (UserBB == &BB) continue;
- DEBUG(dbgs() << "found Brcc use\n");
-
- // Sink the "and; icmp" to use.
- MadeChange = true;
- BinaryOperator *NewAnd =
- BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
- BrccUser);
- CmpInst *NewCmp =
- CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
- "", BrccUser);
- TheUse = NewCmp;
- ++NumAndCmpsMoved;
- DEBUG(BrccUser->getParent()->dump());
- }
- }
- return MadeChange;
-}
-
/// \brief Scale down both weights to fit into uint32_t.
static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
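Only the first line of the body appears in this hunk. A plausible completion (a sketch under that assumption, not necessarily the exact body) divides both weights by one common factor chosen so the larger one fits into uint32_t, preserving their ratio:

    #include <cstdint>

    // Sketch: scale both weights by the same factor so that the larger one
    // fits into uint32_t while the true/false ratio is preserved.
    static void scaleWeightsSketch(uint64_t &NewTrue, uint64_t &NewFalse) {
      uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
      uint32_t Scale = (NewMax / UINT32_MAX) + 1;
      NewTrue /= Scale;
      NewFalse /= Scale;
    }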
@@ -5833,7 +6387,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F) {
}
// Update PHI nodes in both successors. The original BB needs to be
- // replaced in one succesor's PHI nodes, because the branch comes now from
+ // replaced in one successor's PHI nodes, because the branch comes now from
// the newly generated BB (NewBB). In the other successor we need to add one
// incoming edge to the PHI nodes, because both branch instructions target
// now the same successor. Depending on the original branch condition
diff --git a/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp b/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp
index 1e46a7a..7f7350f 100644
--- a/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp
+++ b/contrib/llvm/lib/CodeGen/CountingFunctionInserter.cpp
@@ -41,7 +41,7 @@ namespace {
Type *VoidTy = Type::getVoidTy(F.getContext());
Constant *CountingFn =
F.getParent()->getOrInsertFunction(CountingFunctionName,
- VoidTy, nullptr);
+ VoidTy);
CallInst::Create(CountingFn, "", &*F.begin()->getFirstInsertionPt());
return true;
}
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 5d60c30..a3cf284 100644
--- a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -71,8 +71,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
// callee-saved register that is not saved in the prolog.
const MachineFrameInfo &MFI = MF.getFrameInfo();
BitVector Pristine = MFI.getPristineRegs(MF);
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
- if (!IsReturnBlock && !Pristine.test(*I)) continue;
+ for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+ ++I) {
+ unsigned Reg = *I;
+ if (!IsReturnBlock && !Pristine.test(Reg))
+ continue;
for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
unsigned Reg = *AI;
Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
@@ -645,10 +648,8 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
// as well.
const SUnit *SU = MISUnitMap[Q->second->getParent()];
if (!SU) continue;
- for (DbgValueVector::iterator DVI = DbgValues.begin(),
- DVE = DbgValues.end(); DVI != DVE; ++DVI)
- if (DVI->second == Q->second->getParent())
- UpdateDbgValue(*DVI->first, AntiDepReg, NewReg);
+ UpdateDbgValues(DbgValues, Q->second->getParent(),
+ AntiDepReg, NewReg);
}
// We just went back in time and modified history; the
diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
index 7b1b2d6..853b9af 100644
--- a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
+++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -23,49 +23,59 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "packets"
-
#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <vector>
using namespace llvm;
+#define DEBUG_TYPE "packets"
+
static cl::opt<unsigned> InstrLimit("dfa-instr-limit", cl::Hidden,
cl::init(0), cl::desc("If present, stops packetizing after N instructions"));
+
static unsigned InstrCount = 0;
// --------------------------------------------------------------------
// Definitions shared between DFAPacketizer.cpp and DFAPacketizerEmitter.cpp
-namespace {
- DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) {
- return (Inp << DFA_MAX_RESOURCES) | FuncUnits;
- }
+static DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) {
+ return (Inp << DFA_MAX_RESOURCES) | FuncUnits;
+}
- /// Return the DFAInput for an instruction class input vector.
- /// This function is used in both DFAPacketizer.cpp and in
- /// DFAPacketizerEmitter.cpp.
- DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) {
- DFAInput InsnInput = 0;
- assert((InsnClass.size() <= DFA_MAX_RESTERMS) &&
- "Exceeded maximum number of DFA terms");
- for (auto U : InsnClass)
- InsnInput = addDFAFuncUnits(InsnInput, U);
- return InsnInput;
- }
+/// Return the DFAInput for an instruction class input vector.
+/// This function is used in both DFAPacketizer.cpp and in
+/// DFAPacketizerEmitter.cpp.
+static DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) {
+ DFAInput InsnInput = 0;
+ assert((InsnClass.size() <= DFA_MAX_RESTERMS) &&
+ "Exceeded maximum number of DFA terms");
+ for (auto U : InsnClass)
+ InsnInput = addDFAFuncUnits(InsnInput, U);
+ return InsnInput;
}
+
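The packing is a fixed-width concatenation: each term's functional-unit mask occupies DFA_MAX_RESOURCES bits of the DFAInput, with earlier terms shifted left. A worked standalone illustration with hypothetical widths:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    constexpr unsigned kMaxResources = 4; // stand-in for DFA_MAX_RESOURCES

    uint64_t addFuncUnits(uint64_t Inp, unsigned FuncUnits) {
      return (Inp << kMaxResources) | FuncUnits;
    }

    int main() {
      std::vector<unsigned> InsnClass{0b0011, 0b0100};
      uint64_t Input = 0;
      for (unsigned U : InsnClass)
        Input = addFuncUnits(Input, U);
      assert(Input == 0b00110100); // terms concatenated, 4 bits per term
      return 0;
    }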
// --------------------------------------------------------------------
DFAPacketizer::DFAPacketizer(const InstrItineraryData *I,
const DFAStateInput (*SIT)[2],
const unsigned *SET):
- InstrItins(I), CurrentState(0), DFAStateInputTable(SIT),
- DFAStateEntryTable(SET) {
+ InstrItins(I), DFAStateInputTable(SIT), DFAStateEntryTable(SET) {
// Make sure DFA types are large enough for the number of terms & resources.
static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <=
(8 * sizeof(DFAInput)),
@@ -75,7 +85,6 @@ DFAPacketizer::DFAPacketizer(const InstrItineraryData *I,
"(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput");
}
-
// Read the DFA transition table and update CachedTable.
//
// Format of the transition tables:
@@ -97,7 +106,6 @@ void DFAPacketizer::ReadTable(unsigned int state) {
DFAStateInputTable[i][1];
}
-
// Return the DFAInput for an instruction class.
DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) {
// Note: this logic must match that in DFAPacketizerDefs.h for input vectors.
@@ -112,16 +120,14 @@ DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) {
return InsnInput;
}
-
// Return the DFAInput for an instruction class input vector.
DFAInput DFAPacketizer::getInsnInput(const std::vector<unsigned> &InsnClass) {
return getDFAInsnInput(InsnClass);
}
-
// Check if the resources occupied by a MCInstrDesc are available in the
// current state.
-bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) {
+bool DFAPacketizer::canReserveResources(const MCInstrDesc *MID) {
unsigned InsnClass = MID->getSchedClass();
DFAInput InsnInput = getInsnInput(InsnClass);
UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
@@ -129,10 +135,9 @@ bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) {
return CachedTable.count(StateTrans) != 0;
}
-
// Reserve the resources occupied by a MCInstrDesc and change the current
// state to reflect that change.
-void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
+void DFAPacketizer::reserveResources(const MCInstrDesc *MID) {
unsigned InsnClass = MID->getSchedClass();
DFAInput InsnInput = getInsnInput(InsnClass);
UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
@@ -141,24 +146,22 @@ void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
CurrentState = CachedTable[StateTrans];
}
-
// Check if the resources occupied by a machine instruction are available
// in the current state.
-bool DFAPacketizer::canReserveResources(llvm::MachineInstr &MI) {
- const llvm::MCInstrDesc &MID = MI.getDesc();
+bool DFAPacketizer::canReserveResources(MachineInstr &MI) {
+ const MCInstrDesc &MID = MI.getDesc();
return canReserveResources(&MID);
}
-
// Reserve the resources occupied by a machine instruction and change the
// current state to reflect that change.
-void DFAPacketizer::reserveResources(llvm::MachineInstr &MI) {
- const llvm::MCInstrDesc &MID = MI.getDesc();
+void DFAPacketizer::reserveResources(MachineInstr &MI) {
+ const MCInstrDesc &MID = MI.getDesc();
reserveResources(&MID);
}
-
namespace llvm {
+
// This class extends ScheduleDAGInstrs and overrides the schedule method
// to build the dependence graph.
class DefaultVLIWScheduler : public ScheduleDAGInstrs {
@@ -166,9 +169,11 @@ private:
AliasAnalysis *AA;
/// Ordered list of DAG postprocessing steps.
std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+
public:
DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
AliasAnalysis *AA);
+
// Actual scheduling work.
void schedule() override;
@@ -176,11 +181,12 @@ public:
void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
Mutations.push_back(std::move(Mutation));
}
+
protected:
void postprocessDAG();
};
-}
+} // end namespace llvm
DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
MachineLoopInfo &MLI,
@@ -189,21 +195,18 @@ DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
CanHandleTerminators = true;
}
-
/// Apply each ScheduleDAGMutation step in order.
void DefaultVLIWScheduler::postprocessDAG() {
for (auto &M : Mutations)
M->apply(this);
}
-
void DefaultVLIWScheduler::schedule() {
// Build the scheduling graph.
buildSchedGraph(AA);
postprocessDAG();
}
-
VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf,
MachineLoopInfo &mli, AliasAnalysis *aa)
: MF(mf), TII(mf.getSubtarget().getInstrInfo()), AA(aa) {
@@ -211,15 +214,11 @@ VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf,
VLIWScheduler = new DefaultVLIWScheduler(MF, mli, AA);
}
-
VLIWPacketizerList::~VLIWPacketizerList() {
- if (VLIWScheduler)
- delete VLIWScheduler;
- if (ResourceTracker)
- delete ResourceTracker;
+ delete VLIWScheduler;
+ delete ResourceTracker;
}
-
// End the current packet, bundle packet instructions and reset DFA state.
void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
MachineBasicBlock::iterator MI) {
@@ -239,7 +238,6 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
DEBUG(dbgs() << "End packet\n");
}
-
// Bundle machine instructions into packets.
void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
MachineBasicBlock::iterator BeginItr,
@@ -338,7 +336,6 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
VLIWScheduler->finishBlock();
}
-
// Add a DAG mutation object to the ordered list.
void VLIWPacketizerList::addMutation(
std::unique_ptr<ScheduleDAGMutation> Mutation) {
diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
index 17c229a..91d18e2 100644
--- a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -23,7 +23,7 @@
using namespace llvm;
-#define DEBUG_TYPE "codegen-dce"
+#define DEBUG_TYPE "dead-mi-elimination"
STATISTIC(NumDeletes, "Number of dead instructions deleted");
@@ -54,7 +54,7 @@ namespace {
char DeadMachineInstructionElim::ID = 0;
char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID;
-INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination",
+INITIALIZE_PASS(DeadMachineInstructionElim, DEBUG_TYPE,
"Remove dead machine instructions", false, false)
bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
@@ -110,7 +110,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
// Start out assuming that reserved registers are live out of this block.
LivePhysRegs = MRI->getReservedRegs();
- // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not
+ // Add live-ins from successors to LivePhysRegs. Normally, physregs are not
// live across blocks, but some targets (x86) can have flags live out of a
// block.
for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
diff --git a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
index a7ba694..ab9a059 100644
--- a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
+++ b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
@@ -132,8 +132,7 @@ private:
char DetectDeadLanes::ID = 0;
char &llvm::DetectDeadLanesID = DetectDeadLanes::ID;
-INITIALIZE_PASS(DetectDeadLanes, "detect-dead-lanes", "Detect Dead Lanes",
- false, false)
+INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false)
/// Returns true if \p MI will get lowered to a series of COPY instructions.
/// We call this a COPY-like instruction.
@@ -441,7 +440,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);
if (CrossCopy)
- DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI);
+ DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI);
}
if (!CrossCopy)
diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index 38af19a..2f83326 100644
--- a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -12,12 +12,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -34,8 +35,6 @@ STATISTIC(NumResumesLowered, "Number of resume calls lowered");
namespace {
class DwarfEHPrepare : public FunctionPass {
- const TargetMachine *TM;
-
// RewindFunction - _Unwind_Resume or the target equivalent.
Constant *RewindFunction;
@@ -52,15 +51,9 @@ namespace {
public:
static char ID; // Pass identification, replacement for typeid.
- // INITIALIZE_TM_PASS requires a default constructor, but it isn't used in
- // practice.
DwarfEHPrepare()
- : FunctionPass(ID), TM(nullptr), RewindFunction(nullptr), DT(nullptr),
- TLI(nullptr) {}
-
- DwarfEHPrepare(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), RewindFunction(nullptr), DT(nullptr),
- TLI(nullptr) {}
+ : FunctionPass(ID), RewindFunction(nullptr), DT(nullptr), TLI(nullptr) {
+ }
bool runOnFunction(Function &Fn) override;
@@ -78,18 +71,18 @@ namespace {
} // end anonymous namespace
char DwarfEHPrepare::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare",
- "Prepare DWARF exceptions", false, false)
+INITIALIZE_PASS_BEGIN(DwarfEHPrepare, DEBUG_TYPE,
+ "Prepare DWARF exceptions", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_TM_PASS_END(DwarfEHPrepare, "dwarfehprepare",
- "Prepare DWARF exceptions", false, false)
+INITIALIZE_PASS_END(DwarfEHPrepare, DEBUG_TYPE,
+ "Prepare DWARF exceptions", false, false)
-FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) {
- return new DwarfEHPrepare(TM);
-}
+FunctionPass *llvm::createDwarfEHPass() { return new DwarfEHPrepare(); }
void DwarfEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
}
@@ -254,9 +247,10 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
}
bool DwarfEHPrepare::runOnFunction(Function &Fn) {
- assert(TM && "DWARF EH preparation requires a target machine");
+ const TargetMachine &TM =
+ getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+ TLI = TM.getSubtargetImpl(Fn)->getTargetLowering();
bool Changed = InsertUnwindResumeCalls(Fn);
DT = nullptr;
TLI = nullptr;
diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 7291727..402afe7 100644
--- a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -616,13 +616,13 @@ private:
char EarlyIfConverter::ID = 0;
char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
-INITIALIZE_PASS_BEGIN(EarlyIfConverter,
- "early-ifcvt", "Early If Converter", false, false)
+INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE,
+ "Early If Converter", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
-INITIALIZE_PASS_END(EarlyIfConverter,
- "early-ifcvt", "Early If Converter", false, false)
+INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE,
+ "Early If Converter", false, false)
void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineBranchProbabilityInfo>();
diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
index 32c57e3..e272d25 100644
--- a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
@@ -6,21 +6,9 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file contains the execution dependency fix pass.
-//
-// Some X86 SSE instructions like mov, and, or, xor are available in different
-// variants for different operand types. These variant instructions are
-// equivalent, but on Nehalem and newer cpus there is extra latency
-// transferring data between integer and floating point domains. ARM cores
-// have similar issues when they are configured with both VFP and NEON
-// pipelines.
-//
-// This pass changes the variant instructions to minimize domain crossings.
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,193 +23,18 @@
using namespace llvm;
-#define DEBUG_TYPE "execution-fix"
-
-/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
-/// of execution domains.
-///
-/// An open DomainValue represents a set of instructions that can still switch
-/// execution domain. Multiple registers may refer to the same open
-/// DomainValue - they will eventually be collapsed to the same execution
-/// domain.
-///
-/// A collapsed DomainValue represents a single register that has been forced
-/// into one of more execution domains. There is a separate collapsed
-/// DomainValue for each register, but it may contain multiple execution
-/// domains. A register value is initially created in a single execution
-/// domain, but if we were forced to pay the penalty of a domain crossing, we
-/// keep track of the fact that the register is now available in multiple
-/// domains.
-namespace {
-struct DomainValue {
- // Basic reference counting.
- unsigned Refs;
-
- // Bitmask of available domains. For an open DomainValue, it is the still
- // possible domains for collapsing. For a collapsed DomainValue it is the
- // domains where the register is available for free.
- unsigned AvailableDomains;
-
- // Pointer to the next DomainValue in a chain. When two DomainValues are
- // merged, Victim.Next is set to point to Victor, so old DomainValue
- // references can be updated by following the chain.
- DomainValue *Next;
-
- // Twiddleable instructions using or defining these registers.
- SmallVector<MachineInstr*, 8> Instrs;
-
- // A collapsed DomainValue has no instructions to twiddle - it simply keeps
- // track of the domains where the registers are already available.
- bool isCollapsed() const { return Instrs.empty(); }
-
- // Is domain available?
- bool hasDomain(unsigned domain) const {
- assert(domain <
- static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
- "undefined behavior");
- return AvailableDomains & (1u << domain);
- }
-
- // Mark domain as available.
- void addDomain(unsigned domain) {
- AvailableDomains |= 1u << domain;
- }
-
- // Restrict to a single domain available.
- void setSingleDomain(unsigned domain) {
- AvailableDomains = 1u << domain;
- }
-
- // Return bitmask of domains that are available and in mask.
- unsigned getCommonDomains(unsigned mask) const {
- return AvailableDomains & mask;
- }
-
- // First domain available.
- unsigned getFirstDomain() const {
- return countTrailingZeros(AvailableDomains);
- }
-
- DomainValue() : Refs(0) { clear(); }
-
- // Clear this DomainValue and point to next which has all its data.
- void clear() {
- AvailableDomains = 0;
- Next = nullptr;
- Instrs.clear();
- }
-};
-}
-
-namespace {
-/// Information about a live register.
-struct LiveReg {
- /// Value currently in this register, or NULL when no value is being tracked.
- /// This counts as a DomainValue reference.
- DomainValue *Value;
-
- /// Instruction that defined this register, relative to the beginning of the
- /// current basic block. When a LiveReg is used to represent a live-out
- /// register, this value is relative to the end of the basic block, so it
- /// will be a negative number.
- int Def;
-};
-} // anonymous namespace
-
-namespace {
-class ExeDepsFix : public MachineFunctionPass {
- static char ID;
- SpecificBumpPtrAllocator<DomainValue> Allocator;
- SmallVector<DomainValue*,16> Avail;
-
- const TargetRegisterClass *const RC;
- MachineFunction *MF;
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- RegisterClassInfo RegClassInfo;
- std::vector<SmallVector<int, 1>> AliasMap;
- const unsigned NumRegs;
- LiveReg *LiveRegs;
- typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
- LiveOutMap LiveOuts;
-
- /// List of undefined register reads in this block in forward order.
- std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
-
- /// Storage for register unit liveness.
- LivePhysRegs LiveRegSet;
-
- /// Current instruction number.
- /// The first instruction in each basic block is 0.
- int CurInstr;
-
- /// True when the current block has a predecessor that hasn't been visited
- /// yet.
- bool SeenUnknownBackEdge;
-
-public:
- ExeDepsFix(const TargetRegisterClass *rc)
- : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs);
- }
-
- StringRef getPassName() const override { return "Execution dependency fix"; }
-
-private:
- iterator_range<SmallVectorImpl<int>::const_iterator>
- regIndices(unsigned Reg) const;
-
- // DomainValue allocation.
- DomainValue *alloc(int domain = -1);
- DomainValue *retain(DomainValue *DV) {
- if (DV) ++DV->Refs;
- return DV;
- }
- void release(DomainValue*);
- DomainValue *resolve(DomainValue*&);
-
- // LiveRegs manipulations.
- void setLiveReg(int rx, DomainValue *DV);
- void kill(int rx);
- void force(int rx, unsigned domain);
- void collapse(DomainValue *dv, unsigned domain);
- bool merge(DomainValue *A, DomainValue *B);
-
- void enterBasicBlock(MachineBasicBlock*);
- void leaveBasicBlock(MachineBasicBlock*);
- void visitInstr(MachineInstr*);
- void processDefs(MachineInstr*, bool Kill);
- void visitSoftInstr(MachineInstr*, unsigned mask);
- void visitHardInstr(MachineInstr*, unsigned domain);
- void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref);
- bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
- void processUndefReads(MachineBasicBlock*);
-};
-}
-
-char ExeDepsFix::ID = 0;
+#define DEBUG_TYPE "execution-deps-fix"
/// Translate TRI register number to a list of indices into our smaller tables
/// of interesting registers.
iterator_range<SmallVectorImpl<int>::const_iterator>
-ExeDepsFix::regIndices(unsigned Reg) const {
+ExecutionDepsFix::regIndices(unsigned Reg) const {
assert(Reg < AliasMap.size() && "Invalid register");
const auto &Entry = AliasMap[Reg];
return make_range(Entry.begin(), Entry.end());
}
-DomainValue *ExeDepsFix::alloc(int domain) {
+DomainValue *ExecutionDepsFix::alloc(int domain) {
DomainValue *dv = Avail.empty() ?
new(Allocator.Allocate()) DomainValue :
Avail.pop_back_val();
@@ -234,7 +47,7 @@ DomainValue *ExeDepsFix::alloc(int domain) {
/// Release a reference to DV. When the last reference is released,
/// collapse if needed.
-void ExeDepsFix::release(DomainValue *DV) {
+void ExecutionDepsFix::release(DomainValue *DV) {
while (DV) {
assert(DV->Refs && "Bad DomainValue");
if (--DV->Refs)
@@ -254,7 +67,7 @@ void ExeDepsFix::release(DomainValue *DV) {
/// Follow the chain of dead DomainValues until a live DomainValue is reached.
/// Update the referenced pointer when necessary.
-DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
+DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) {
DomainValue *DV = DVRef;
if (!DV || !DV->Next)
return DV;
@@ -271,7 +84,7 @@ DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
}
/// Set LiveRegs[rx] = dv, updating reference counts.
-void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
+void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) {
assert(unsigned(rx) < NumRegs && "Invalid index");
assert(LiveRegs && "Must enter basic block first.");
@@ -283,7 +96,7 @@ void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
}
// Kill register rx, recycle or collapse any DomainValue.
-void ExeDepsFix::kill(int rx) {
+void ExecutionDepsFix::kill(int rx) {
assert(unsigned(rx) < NumRegs && "Invalid index");
assert(LiveRegs && "Must enter basic block first.");
if (!LiveRegs[rx].Value)
@@ -294,7 +107,7 @@ void ExeDepsFix::kill(int rx) {
}
/// Force register rx into domain.
-void ExeDepsFix::force(int rx, unsigned domain) {
+void ExecutionDepsFix::force(int rx, unsigned domain) {
assert(unsigned(rx) < NumRegs && "Invalid index");
assert(LiveRegs && "Must enter basic block first.");
if (DomainValue *dv = LiveRegs[rx].Value) {
@@ -317,7 +130,7 @@ void ExeDepsFix::force(int rx, unsigned domain) {
/// Collapse open DomainValue into given domain. If there are multiple
/// registers using dv, they each get a unique collapsed DomainValue.
-void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
+void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) {
assert(dv->hasDomain(domain) && "Cannot collapse");
// Collapse all the instructions.
@@ -333,7 +146,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
}
/// All instructions and registers in B are moved to A, and B is released.
-bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
+bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) {
assert(!A->isCollapsed() && "Cannot merge into collapsed");
assert(!B->isCollapsed() && "Cannot merge from collapsed");
if (A == B)
@@ -359,10 +172,7 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
}
/// Set up LiveRegs by merging predecessor live-out values.
-void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
- // Detect back-edges from predecessors we haven't processed yet.
- SeenUnknownBackEdge = false;
-
+void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
// Reset instruction counter in each basic block.
CurInstr = 0;
@@ -397,18 +207,21 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
// Try to coalesce live-out registers from predecessors.
for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
pe = MBB->pred_end(); pi != pe; ++pi) {
- LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
- if (fi == LiveOuts.end()) {
- SeenUnknownBackEdge = true;
+ auto fi = MBBInfos.find(*pi);
+ assert(fi != MBBInfos.end() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ LiveReg *Incoming = fi->second.OutRegs;
+    // Incoming is null if this is a backedge from a BB
+    // we haven't processed yet.
+ if (Incoming == nullptr) {
continue;
}
- assert(fi->second && "Can't have NULL entries");
for (unsigned rx = 0; rx != NumRegs; ++rx) {
// Use the most recent predecessor def for each register.
- LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def);
+ LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def);
- DomainValue *pdv = resolve(fi->second[rx].Value);
+ DomainValue *pdv = resolve(Incoming[rx].Value);
if (!pdv)
continue;
if (!LiveRegs[rx].Value) {
@@ -432,35 +245,34 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
force(rx, pdv->getFirstDomain());
}
}
- DEBUG(dbgs() << "BB#" << MBB->getNumber()
- << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n"));
+ DEBUG(
+ dbgs() << "BB#" << MBB->getNumber()
+ << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));
}
-void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
+void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
assert(LiveRegs && "Must enter basic block first.");
- // Save live registers at end of MBB - used by enterBasicBlock().
- // Also use LiveOuts as a visited set to detect back-edges.
- bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second;
-
- if (First) {
- // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to
- // the end of this block instead of the beginning.
- for (unsigned i = 0, e = NumRegs; i != e; ++i)
- LiveRegs[i].Def -= CurInstr;
- } else {
- // Insertion failed, this must be the second pass.
+ LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs;
+ // Save register clearances at end of MBB - used by enterBasicBlock().
+ MBBInfos[MBB].OutRegs = LiveRegs;
+
+ // While processing the basic block, we kept `Def` relative to the start
+ // of the basic block for convenience. However, future use of this information
+ // only cares about the clearance from the end of the block, so adjust
+ // everything to be relative to the end of the basic block.
+ for (unsigned i = 0, e = NumRegs; i != e; ++i)
+ LiveRegs[i].Def -= CurInstr;
+ if (OldOutRegs) {
+ // This must be the second pass.
// Release all the DomainValues instead of keeping them.
for (unsigned i = 0, e = NumRegs; i != e; ++i)
- release(LiveRegs[i].Value);
- delete[] LiveRegs;
+ release(OldOutRegs[i].Value);
+ delete[] OldOutRegs;
}
LiveRegs = nullptr;
}
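The sign convention makes the clearance arithmetic uniform across block boundaries. A worked example with made-up numbers: if a block has 10 instructions (CurInstr == 10 on leaving) and a register was defined at index 7, its stored Def becomes 7 - 10 = -3; a successor evaluating that register at its own instruction index 5 computes a clearance of CurInstr - Def = 5 - (-3) = 8 instructions since the last def, exactly as if the two blocks were laid out back to back.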
-void ExeDepsFix::visitInstr(MachineInstr *MI) {
- if (MI->isDebugValue())
- return;
-
+bool ExecutionDepsFix::visitInstr(MachineInstr *MI) {
// Update instructions with explicit execution domains.
std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
if (DomP.first) {
@@ -470,16 +282,16 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
visitHardInstr(MI, DomP.first);
}
- // Process defs to track register ages, and kill values clobbered by generic
- // instructions.
- processDefs(MI, !DomP.first);
+ return !DomP.first;
}
/// \brief Helps avoid false dependencies on undef registers by updating the
/// machine instructions' undef operand to use a register that the instruction
/// is truly dependent on, or use a register with clearance higher than Pref.
-void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref) {
+/// Returns true if it was able to find a true dependency, thus not requiring
+/// a dependency breaking instruction regardless of clearance.
+bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI,
+ unsigned OpIdx, unsigned Pref) {
MachineOperand &MO = MI->getOperand(OpIdx);
assert(MO.isUndef() && "Expected undef machine operand");
@@ -487,7 +299,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// Update only undef operands that are mapped to one register.
if (AliasMap[OriginalReg].size() != 1)
- return;
+ return false;
// Get the undef operand's register class
const TargetRegisterClass *OpRC =
@@ -502,7 +314,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// We found a true dependency - replace the undef register with the true
// dependency.
MO.setReg(CurrMO.getReg());
- return;
+ return true;
}
// Go over all registers in the register class and find the register with
@@ -527,12 +339,14 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// Update the operand if we found a register with better clearance.
if (MaxClearanceReg != OriginalReg)
MO.setReg(MaxClearanceReg);
+
+ return false;
}
/// \brief Return true to if it makes sense to break dependence on a partial def
/// or undef use.
-bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref) {
+bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
unsigned reg = MI->getOperand(OpIdx).getReg();
for (int rx : regIndices(reg)) {
unsigned Clearance = CurInstr - LiveRegs[rx].Def;
@@ -542,14 +356,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
DEBUG(dbgs() << ": Break dependency.\n");
continue;
}
- // The current clearance seems OK, but we may be ignoring a def from a
- // back-edge.
- if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
- DEBUG(dbgs() << ": OK .\n");
- return false;
- }
- // A def from an unprocessed back-edge may make us break this dependency.
- DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+ DEBUG(dbgs() << ": OK .\n");
return false;
}
return true;
@@ -559,16 +366,22 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
// If Kill is set, also kill off DomainValues clobbered by the defs.
//
// Also break dependencies on partial defs and undef uses.
-void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
+void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
+ bool Kill) {
assert(!MI->isDebugValue() && "Won't process debug values");
// Break dependence on undef uses. Do this before updating LiveRegs below.
unsigned OpNum;
- unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
- if (Pref) {
- pickBestRegisterForUndef(MI, OpNum, Pref);
- if (shouldBreakDependence(MI, OpNum, Pref))
- UndefReads.push_back(std::make_pair(MI, OpNum));
+ if (breakDependency) {
+ unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
+ if (Pref) {
+ bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref);
+ // We don't need to bother trying to break a dependency if this
+ // instruction has a true dependency on that register through another
+ // operand - we'll have to wait for it to be available regardless.
+ if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref))
+ UndefReads.push_back(std::make_pair(MI, OpNum));
+ }
}
const MCInstrDesc &MCID = MI->getDesc();
for (unsigned i = 0,
@@ -584,11 +397,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
<< '\t' << *MI);
- // Check clearance before partial register updates.
- // Call breakDependence before setting LiveRegs[rx].Def.
- unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
- if (Pref && shouldBreakDependence(MI, i, Pref))
- TII->breakPartialRegDependency(*MI, i, TRI);
+ if (breakDependency) {
+ // Check clearance before partial register updates.
+ // Call breakDependence before setting LiveRegs[rx].Def.
+ unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
+ if (Pref && shouldBreakDependence(MI, i, Pref))
+ TII->breakPartialRegDependency(*MI, i, TRI);
+ }
// How many instructions since rx was last written?
LiveRegs[rx].Def = CurInstr;
@@ -607,7 +422,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
/// only do it on demand. Note that the occurrence of undefined register reads
/// that should be broken is very rare, but when they occur we may have many in
/// a single block.
-void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
+void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) {
if (UndefReads.empty())
return;
@@ -640,7 +455,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
// A hard instruction only works in one domain. All input registers will be
// forced into that domain.
-void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
+void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
// Collapse all uses.
for (unsigned i = mi->getDesc().getNumDefs(),
e = mi->getDesc().getNumOperands(); i != e; ++i) {
@@ -663,7 +478,7 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
}
// A soft instruction can be changed to work in other domains given by mask.
-void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
// Bitmask of available domains for this instruction after taking collapsed
// operands into account.
unsigned available = mask;
@@ -774,7 +589,34 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
}
}
-bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
+void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB,
+ bool PrimaryPass) {
+ enterBasicBlock(MBB);
+ // If this block is not done, it makes little sense to make any decisions
+ // based on clearance information. We need to make a second pass anyway,
+ // and by then we'll have better information, so we can avoid doing the work
+  // to try to break dependencies now.
+ bool breakDependency = isBlockDone(MBB);
+ for (MachineInstr &MI : *MBB) {
+ if (!MI.isDebugValue()) {
+ bool Kill = false;
+ if (PrimaryPass)
+ Kill = visitInstr(&MI);
+ processDefs(&MI, breakDependency, Kill);
+ }
+ }
+ if (breakDependency)
+ processUndefReads(MBB);
+ leaveBasicBlock(MBB);
+}
+
+bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) {
+ return MBBInfos[MBB].PrimaryCompleted &&
+ MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
+ MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
+}
+
+bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) {
if (skipFunction(*mf.getFunction()))
return false;
MF = &mf;
@@ -810,52 +652,104 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
AliasMap[*AI].push_back(i);
}
+  // Initialize the MBBInfos.
+ for (auto &MBB : mf) {
+ MBBInfo InitialInfo;
+ MBBInfos.insert(std::make_pair(&MBB, InitialInfo));
+ }
+
+ /*
+ * We want to visit every instruction in every basic block in order to update
+   * its execution domain or break any false dependencies. However, for the
+ * dependency breaking, we need to know clearances from all predecessors
+ * (including any backedges). One way to do so would be to do two complete
+ * passes over all basic blocks/instructions, the first for recording
+ * clearances, the second to break the dependencies. However, for functions
+   * without backedges, or functions with a lot of straight-line code and only
+ * a small loop, that would be a lot of unnecessary work (since only the
+ * BBs that are part of the loop require two passes). As an example,
+ * consider the following loop.
+ *
+ *
+ * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
+ * ^ |
+ * +----------------------------------+
+ *
+ * The iteration order is as follows:
+ * Naive: PH A B C D A' B' C' D'
+ * Optimized: PH A B C A' B' C' D
+ *
+ * Note that we avoid processing D twice, because we can entirely process
+ * the predecessors before getting to D. We call a block that is ready
+ * for its second round of processing `done` (isBlockDone). Once we finish
+ * processing some block, we update the counters in MBBInfos and re-process
+ * any successors that are now done.
+ */
+
MachineBasicBlock *Entry = &*MF->begin();
ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
- SmallVector<MachineBasicBlock*, 16> Loops;
+ SmallVector<MachineBasicBlock *, 4> Workqueue;
for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
MachineBasicBlock *MBB = *MBBI;
- enterBasicBlock(MBB);
- if (SeenUnknownBackEdge)
- Loops.push_back(MBB);
- for (MachineInstr &MI : *MBB)
- visitInstr(&MI);
- processUndefReads(MBB);
- leaveBasicBlock(MBB);
+    // N.B.: IncomingProcessed and IncomingCompleted were already updated while
+ // processing this block's predecessors.
+ MBBInfos[MBB].PrimaryCompleted = true;
+ MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
+ bool Primary = true;
+ Workqueue.push_back(MBB);
+ while (!Workqueue.empty()) {
+ MachineBasicBlock *ActiveMBB = &*Workqueue.back();
+ Workqueue.pop_back();
+ processBasicBlock(ActiveMBB, Primary);
+ bool Done = isBlockDone(ActiveMBB);
+ for (auto *Succ : ActiveMBB->successors()) {
+ if (!isBlockDone(Succ)) {
+ if (Primary) {
+ MBBInfos[Succ].IncomingProcessed++;
+ }
+ if (Done) {
+ MBBInfos[Succ].IncomingCompleted++;
+ }
+ if (isBlockDone(Succ)) {
+ Workqueue.push_back(Succ);
+ }
+ }
+ }
+ Primary = false;
+ }
}
- // Visit all the loop blocks again in order to merge DomainValues from
- // back-edges.
- for (MachineBasicBlock *MBB : Loops) {
- enterBasicBlock(MBB);
- for (MachineInstr &MI : *MBB)
- if (!MI.isDebugValue())
- processDefs(&MI, false);
- processUndefReads(MBB);
- leaveBasicBlock(MBB);
+ // We need to go through again and finalize any blocks that are not done yet.
+ // This is possible if blocks have dead predecessors, so we didn't visit them
+ // above.
+ for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator
+ MBBI = RPOT.begin(),
+ MBBE = RPOT.end();
+ MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock *MBB = *MBBI;
+ if (!isBlockDone(MBB)) {
+ processBasicBlock(MBB, false);
+ // Don't update successors here. We'll get to them anyway through this
+ // loop.
+ }
}
// Clear the LiveOuts vectors and collapse any remaining DomainValues.
for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
- LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI);
- if (FI == LiveOuts.end() || !FI->second)
+ auto FI = MBBInfos.find(*MBBI);
+ if (FI == MBBInfos.end() || !FI->second.OutRegs)
continue;
for (unsigned i = 0, e = NumRegs; i != e; ++i)
- if (FI->second[i].Value)
- release(FI->second[i].Value);
- delete[] FI->second;
+ if (FI->second.OutRegs[i].Value)
+ release(FI->second.OutRegs[i].Value);
+ delete[] FI->second.OutRegs;
}
- LiveOuts.clear();
+ MBBInfos.clear();
UndefReads.clear();
Avail.clear();
Allocator.DestroyAll();
return false;
}
-
-FunctionPass *
-llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) {
- return new ExeDepsFix(RC);
-}
diff --git a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp
index 0ec79c2..324ea17 100644
--- a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp
+++ b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp
@@ -14,9 +14,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -41,7 +41,7 @@ namespace {
char ExpandISelPseudos::ID = 0;
char &llvm::ExpandISelPseudosID = ExpandISelPseudos::ID;
-INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos",
+INITIALIZE_PASS(ExpandISelPseudos, DEBUG_TYPE,
"Expand ISel Pseudo-instructions", false, false)
bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) {
diff --git a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
index ab2382e..4ce86f2 100644
--- a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -12,11 +12,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -58,7 +58,7 @@ private:
char ExpandPostRA::ID = 0;
char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID;
-INITIALIZE_PASS(ExpandPostRA, "postrapseudos",
+INITIALIZE_PASS(ExpandPostRA, DEBUG_TYPE,
"Post-RA pseudo instruction expansion pass", false, false)
/// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered
@@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
MachineOperand &DstMO = MI->getOperand(0);
MachineOperand &SrcMO = MI->getOperand(1);
- if (SrcMO.getReg() == DstMO.getReg()) {
- DEBUG(dbgs() << "identity copy: " << *MI);
+ bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg());
+ if (IdentityCopy || SrcMO.isUndef()) {
+ DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI);
// No need to insert an identity copy instruction, but replace with a KILL
// if liveness is changed.
if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
diff --git a/contrib/llvm/lib/CodeGen/ExpandReductions.cpp b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp
new file mode 100644
index 0000000..70dca3b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -0,0 +1,167 @@
+//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for reduction intrinsics, allowing targets
+// to enable the experimental intrinsics until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+namespace {
+
+unsigned getOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ return Instruction::FAdd;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ return Instruction::FMul;
+ case Intrinsic::experimental_vector_reduce_add:
+ return Instruction::Add;
+ case Intrinsic::experimental_vector_reduce_mul:
+ return Instruction::Mul;
+ case Intrinsic::experimental_vector_reduce_and:
+ return Instruction::And;
+ case Intrinsic::experimental_vector_reduce_or:
+ return Instruction::Or;
+ case Intrinsic::experimental_vector_reduce_xor:
+ return Instruction::Xor;
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ return Instruction::ICmp;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return Instruction::FCmp;
+ default:
+ llvm_unreachable("Unexpected ID");
+ }
+}
+
+RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_smax:
+ return RecurrenceDescriptor::MRK_SIntMax;
+ case Intrinsic::experimental_vector_reduce_smin:
+ return RecurrenceDescriptor::MRK_SIntMin;
+ case Intrinsic::experimental_vector_reduce_umax:
+ return RecurrenceDescriptor::MRK_UIntMax;
+ case Intrinsic::experimental_vector_reduce_umin:
+ return RecurrenceDescriptor::MRK_UIntMin;
+ case Intrinsic::experimental_vector_reduce_fmax:
+ return RecurrenceDescriptor::MRK_FloatMax;
+ case Intrinsic::experimental_vector_reduce_fmin:
+ return RecurrenceDescriptor::MRK_FloatMin;
+ default:
+ return RecurrenceDescriptor::MRK_Invalid;
+ }
+}
+
+bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
+ bool Changed = false;
+ SmallVector<IntrinsicInst*, 4> Worklist;
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (auto II = dyn_cast<IntrinsicInst>(&*I))
+ Worklist.push_back(II);
+
+ for (auto *II : Worklist) {
+ IRBuilder<> Builder(II);
+ Value *Vec = nullptr;
+ auto ID = II->getIntrinsicID();
+ auto MRK = RecurrenceDescriptor::MRK_Invalid;
+ switch (ID) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+      // FMFs must be attached to the call; otherwise it's an ordered reduction
+ // and it can't be handled by generating this shuffle sequence.
+ // TODO: Implement scalarization of ordered reductions here for targets
+ // without native support.
+ if (!II->getFastMathFlags().unsafeAlgebra())
+ continue;
+ Vec = II->getArgOperand(1);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ Vec = II->getArgOperand(0);
+ MRK = getMRK(ID);
+ break;
+ default:
+ continue;
+ }
+ if (!TTI->shouldExpandReduction(II))
+ continue;
+ auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+ II->replaceAllUsesWith(Rdx);
+ II->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+class ExpandReductions : public FunctionPass {
+public:
+ static char ID;
+ ExpandReductions() : FunctionPass(ID) {
+ initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return expandReductions(F, TTI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+} // end anonymous namespace
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+ "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+ return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!expandReductions(F, &TTI))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/CodeGen/FEntryInserter.cpp b/contrib/llvm/lib/CodeGen/FEntryInserter.cpp
new file mode 100644
index 0000000..0759bf6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/FEntryInserter.cpp
@@ -0,0 +1,55 @@
+//===-- FEntryInserter.cpp - Insert fentry calls --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file edits function bodies to insert fentry calls.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+struct FEntryInserter : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ FEntryInserter() : MachineFunctionPass(ID) {
+ initializeFEntryInserterPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+};
+} // end anonymous namespace
+
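+// When the IR function carries the "fentry-call" attribute, plant a
+// FENTRY_CALL pseudo ahead of the first instruction of the first block,
+// which the target is expected to lower to the actual __fentry__ call.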
+bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) {
+ const std::string FEntryName =
+ MF.getFunction()->getFnAttribute("fentry-call").getValueAsString();
+ if (FEntryName != "true")
+ return false;
+
+ auto &FirstMBB = *MF.begin();
+ auto &FirstMI = *FirstMBB.begin();
+
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
+ TII->get(TargetOpcode::FENTRY_CALL));
+ return true;
+}
+
+char FEntryInserter::ID = 0;
+char &llvm::FEntryInserterID = FEntryInserter::ID;
+INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false,
+ false)
diff --git a/contrib/llvm/lib/CodeGen/FaultMaps.cpp b/contrib/llvm/lib/CodeGen/FaultMaps.cpp
index 2acafaf..2924b01 100644
--- a/contrib/llvm/lib/CodeGen/FaultMaps.cpp
+++ b/contrib/llvm/lib/CodeGen/FaultMaps.cpp
@@ -1,4 +1,4 @@
-//===---------------------------- FaultMaps.cpp ---------------------------===//
+//===- FaultMaps.cpp ------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,13 +8,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/FaultMaps.h"
-
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -102,14 +105,16 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
}
}
-
const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) {
switch (FT) {
default:
llvm_unreachable("unhandled fault type!");
-
case FaultMaps::FaultingLoad:
return "FaultingLoad";
+ case FaultMaps::FaultingLoadStore:
+ return "FaultingLoadStore";
+ case FaultMaps::FaultingStore:
+ return "FaultingStore";
}
}
diff --git a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
index d61afad..9c71b18 100644
--- a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
+++ b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
@@ -11,10 +11,10 @@
// funclets being contiguous.
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
using namespace llvm;
#define DEBUG_TYPE "funclet-layout"
@@ -37,7 +37,7 @@ public:
char FuncletLayout::ID = 0;
char &llvm::FuncletLayoutID = FuncletLayout::ID;
-INITIALIZE_PASS(FuncletLayout, "funclet-layout",
+INITIALIZE_PASS(FuncletLayout, DEBUG_TYPE,
"Contiguously Lay Out Funclets", false, false)
bool FuncletLayout::runOnMachineFunction(MachineFunction &F) {
diff --git a/contrib/llvm/lib/CodeGen/GCMetadata.cpp b/contrib/llvm/lib/CodeGen/GCMetadata.cpp
index be21c73..456fa79 100644
--- a/contrib/llvm/lib/CodeGen/GCMetadata.cpp
+++ b/contrib/llvm/lib/CodeGen/GCMetadata.cpp
@@ -11,22 +11,27 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <string>
+
using namespace llvm;
namespace {
class Printer : public FunctionPass {
static char ID;
+
raw_ostream &OS;
public:
@@ -38,7 +43,8 @@ public:
bool runOnFunction(Function &F) override;
bool doFinalization(Module &M) override;
};
-}
+
+} // end anonymous namespace
INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
"Create Garbage Collector Module Metadata", false, false)
@@ -48,7 +54,7 @@ INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S)
: F(F), S(S), FrameSize(~0LL) {}
-GCFunctionInfo::~GCFunctionInfo() {}
+GCFunctionInfo::~GCFunctionInfo() = default;
// -----------------------------------------------------------------------------
@@ -67,7 +73,7 @@ GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
return *I->second;
GCStrategy *S = getGCStrategy(F.getGC());
- Functions.push_back(make_unique<GCFunctionInfo>(F, *S));
+ Functions.push_back(llvm::make_unique<GCFunctionInfo>(F, *S));
GCFunctionInfo *GFI = Functions.back().get();
FInfoMap[&F] = GFI;
return *GFI;
diff --git a/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp
index d183c7f..bc7beb6 100644
--- a/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- GCMetadataPrinter.cpp - Garbage collection infrastructure ---------===//
+//===- GCMetadataPrinter.cpp - Garbage collection infrastructure ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,10 +12,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GCMetadataPrinter.h"
+
using namespace llvm;
LLVM_INSTANTIATE_REGISTRY(GCMetadataPrinterRegistry)
-GCMetadataPrinter::GCMetadataPrinter() {}
+GCMetadataPrinter::GCMetadataPrinter() = default;
-GCMetadataPrinter::~GCMetadataPrinter() {}
+GCMetadataPrinter::~GCMetadataPrinter() = default;
diff --git a/contrib/llvm/lib/CodeGen/GCStrategy.cpp b/contrib/llvm/lib/CodeGen/GCStrategy.cpp
index 31ab86f..6be4c16 100644
--- a/contrib/llvm/lib/CodeGen/GCStrategy.cpp
+++ b/contrib/llvm/lib/CodeGen/GCStrategy.cpp
@@ -1,4 +1,4 @@
-//===-- GCStrategy.cpp - Garbage Collector Description --------------------===//
+//===- GCStrategy.cpp - Garbage Collector Description ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,7 +18,4 @@ using namespace llvm;
LLVM_INSTANTIATE_REGISTRY(GCRegistry)
-GCStrategy::GCStrategy()
- : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false),
- CustomWriteBarriers(false), CustomRoots(false), InitRoots(true),
- UsesMetadata(false) {}
+GCStrategy::GCStrategy() = default;
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 1321221..be0c5c2 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -24,40 +24,42 @@
using namespace llvm;
bool CallLowering::lowerCall(
- MachineIRBuilder &MIRBuilder, const CallInst &CI, unsigned ResReg,
+ MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, unsigned ResReg,
ArrayRef<unsigned> ArgRegs, std::function<unsigned()> GetCalleeReg) const {
- auto &DL = CI.getParent()->getParent()->getParent()->getDataLayout();
+ auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout();
// First step is to marshall all the function's parameters into the correct
// physregs and memory locations. Gather the sequence of argument types that
// we'll pass to the assigner function.
SmallVector<ArgInfo, 8> OrigArgs;
unsigned i = 0;
- for (auto &Arg : CI.arg_operands()) {
- ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}};
- setArgFlags(OrigArg, i + 1, DL, CI);
+ unsigned NumFixedArgs = CS.getFunctionType()->getNumParams();
+ for (auto &Arg : CS.args()) {
+ ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{},
+ i < NumFixedArgs};
+ setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CS);
OrigArgs.push_back(OrigArg);
++i;
}
MachineOperand Callee = MachineOperand::CreateImm(0);
- if (Function *F = CI.getCalledFunction())
+ if (const Function *F = CS.getCalledFunction())
Callee = MachineOperand::CreateGA(F, 0);
else
Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
- ArgInfo OrigRet{ResReg, CI.getType(), ISD::ArgFlagsTy{}};
+ ArgInfo OrigRet{ResReg, CS.getType(), ISD::ArgFlagsTy{}};
if (!OrigRet.Ty->isVoidTy())
- setArgFlags(OrigRet, AttributeSet::ReturnIndex, DL, CI);
+ setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS);
- return lowerCall(MIRBuilder, Callee, OrigRet, OrigArgs);
+ return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs);
}
template <typename FuncInfoTy>
void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
const DataLayout &DL,
const FuncInfoTy &FuncInfo) const {
- const AttributeSet &Attrs = FuncInfo.getAttributes();
+ const AttributeList &Attrs = FuncInfo.getAttributes();
if (Attrs.hasAttribute(OpIdx, Attribute::ZExt))
Arg.Flags.setZExt();
if (Attrs.hasAttribute(OpIdx, Attribute::SExt))
@@ -81,8 +83,8 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
// For ByVal, alignment should be passed from FE. BE will guess if
// this info is not there but there are cases it cannot get right.
unsigned FrameAlign;
- if (FuncInfo.getParamAlignment(OpIdx))
- FrameAlign = FuncInfo.getParamAlignment(OpIdx);
+ if (FuncInfo.getParamAlignment(OpIdx - 2))
+ FrameAlign = FuncInfo.getParamAlignment(OpIdx - 2);
else
FrameAlign = getTLI()->getByValTypeAlignment(ElementTy, DL);
Arg.Flags.setByValAlign(FrameAlign);
@@ -103,7 +105,6 @@ CallLowering::setArgFlags<CallInst>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
const CallInst &FuncInfo) const;
bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
- CCAssignFn *AssignFn,
ArrayRef<ArgInfo> Args,
ValueHandler &Handler) const {
MachineFunction &MF = MIRBuilder.getMF();
@@ -116,12 +117,20 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
unsigned NumArgs = Args.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT CurVT = MVT::getVT(Args[i].Ty);
- if (AssignFn(i, CurVT, CurVT, CCValAssign::Full, Args[i].Flags, CCInfo))
+ if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo))
return false;
}
- for (unsigned i = 0, e = Args.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
+ for (unsigned i = 0, e = Args.size(), j = 0; i != e; ++i, ++j) {
+ assert(j < ArgLocs.size() && "Skipped too many arg locs");
+
+ CCValAssign &VA = ArgLocs[j];
+ assert(VA.getValNo() == i && "Location doesn't correspond to current arg");
+
+ if (VA.needsCustom()) {
+ j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+ continue;
+ }
if (VA.isRegLoc())
Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA);
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
index fcd2722..29d1209 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -26,6 +26,7 @@ void llvm::initializeGlobalISel(PassRegistry &Registry) {
void llvm::initializeGlobalISel(PassRegistry &Registry) {
initializeIRTranslatorPass(Registry);
initializeLegalizerPass(Registry);
+ initializeLocalizerPass(Registry);
initializeRegBankSelectPass(Registry);
initializeInstructionSelectPass(Registry);
}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 89a042f..ed1bd99 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator --*- C++ -*-==//
+//===- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator ---*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -11,43 +11,93 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
-
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "irtranslator"
using namespace llvm;
char IRTranslator::ID = 0;
+
INITIALIZE_PASS_BEGIN(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
false, false)
-static void reportTranslationError(const Value &V, const Twine &Message) {
- std::string ErrStorage;
- raw_string_ostream Err(ErrStorage);
- Err << Message << ": " << V << '\n';
- report_fatal_error(Err.str());
+static void reportTranslationError(MachineFunction &MF,
+ const TargetPassConfig &TPC,
+ OptimizationRemarkEmitter &ORE,
+ OptimizationRemarkMissed &R) {
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (TPC.isGlobalISelAbortEnabled())
+ report_fatal_error(R.getMsg());
+ else
+ ORE.emit(R);
}
-IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) {
+IRTranslator::IRTranslator() : MachineFunctionPass(ID) {
initializeIRTranslatorPass(*PassRegistry::getPassRegistry());
}
@@ -56,31 +106,33 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-
unsigned IRTranslator::getOrCreateVReg(const Value &Val) {
unsigned &ValReg = ValToVReg[&Val];
- // Check if this is the first time we see Val.
- if (!ValReg) {
- // Fill ValRegsSequence with the sequence of registers
- // we need to concat together to produce the value.
- assert(Val.getType()->isSized() &&
- "Don't know how to create an empty vreg");
- unsigned VReg = MRI->createGenericVirtualRegister(LLT{*Val.getType(), *DL});
- ValReg = VReg;
-
- if (auto CV = dyn_cast<Constant>(&Val)) {
- bool Success = translate(*CV, VReg);
- if (!Success) {
- if (!TPC->isGlobalISelAbortEnabled()) {
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- return VReg;
- }
- reportTranslationError(Val, "unable to translate constant");
- }
+
+ if (ValReg)
+ return ValReg;
+
+ // Fill ValRegsSequence with the sequence of registers
+ // we need to concat together to produce the value.
+ assert(Val.getType()->isSized() &&
+ "Don't know how to create an empty vreg");
+ unsigned VReg =
+ MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL));
+ ValReg = VReg;
+
+ if (auto CV = dyn_cast<Constant>(&Val)) {
+ bool Success = translate(*CV, VReg);
+ if (!Success) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ MF->getFunction()->getSubprogram(),
+ &MF->getFunction()->getEntryBlock());
+ R << "unable to translate constant: " << ore::NV("Type", Val.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return VReg;
}
}
- return ValReg;
+
+ return VReg;
}
int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) {
@@ -112,28 +164,27 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) {
} else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
Alignment = LI->getAlignment();
ValTy = LI->getType();
- } else if (!TPC->isGlobalISelAbortEnabled()) {
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
+ } else {
+ OptimizationRemarkMissed R("gisel-irtranslator", "", &I);
+ R << "unable to translate memop: " << ore::NV("Opcode", &I);
+ reportTranslationError(*MF, *TPC, *ORE, R);
return 1;
- } else
- llvm_unreachable("unhandled memory instruction");
+ }
return Alignment ? Alignment : DL->getABITypeAlignment(ValTy);
}
-MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) {
+MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) {
MachineBasicBlock *&MBB = BBToMBB[&BB];
- if (!MBB) {
- MBB = MF->CreateMachineBasicBlock(&BB);
- MF->push_back(MBB);
-
- if (BB.hasAddressTaken())
- MBB->setHasAddressTaken();
- }
+ assert(MBB && "BasicBlock was not encountered before");
return *MBB;
}
+void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) {
+ assert(NewPred && "new predecessor must be a real MachineBasicBlock");
+ MachinePreds[Edge].push_back(NewPred);
+}
+
bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
MachineIRBuilder &MIRBuilder) {
// FIXME: handle signed/unsigned wrapping flags.
@@ -149,6 +200,18 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
return true;
}
+bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) {
+ // -0.0 - X --> G_FNEG
+ if (isa<Constant>(U.getOperand(0)) &&
+ U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) {
+ MIRBuilder.buildInstr(TargetOpcode::G_FNEG)
+ .addDef(getOrCreateVReg(U))
+ .addUse(getOrCreateVReg(*U.getOperand(1)));
+ return true;
+ }
+ return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder);
+}
+
bool IRTranslator::translateCompare(const User &U,
MachineIRBuilder &MIRBuilder) {
const CmpInst *CI = dyn_cast<CmpInst>(&U);
@@ -158,9 +221,14 @@ bool IRTranslator::translateCompare(const User &U,
CmpInst::Predicate Pred =
CI ? CI->getPredicate() : static_cast<CmpInst::Predicate>(
cast<ConstantExpr>(U).getPredicate());
-
if (CmpInst::isIntPredicate(Pred))
MIRBuilder.buildICmp(Pred, Res, Op0, Op1);
+ else if (Pred == CmpInst::FCMP_FALSE)
+ MIRBuilder.buildCopy(
+ Res, getOrCreateVReg(*Constant::getNullValue(CI->getType())));
+ else if (Pred == CmpInst::FCMP_TRUE)
+ MIRBuilder.buildCopy(
+ Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType())));
else
MIRBuilder.buildFCmp(Pred, Res, Op0, Op1);
@@ -183,18 +251,21 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
// We want a G_BRCOND to the true BB followed by an unconditional branch.
unsigned Tst = getOrCreateVReg(*BrInst.getCondition());
const BasicBlock &TrueTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ++));
- MachineBasicBlock &TrueBB = getOrCreateBB(TrueTgt);
+ MachineBasicBlock &TrueBB = getMBB(TrueTgt);
MIRBuilder.buildBrCond(Tst, TrueBB);
}
const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ));
- MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt);
- MIRBuilder.buildBr(TgtBB);
+ MachineBasicBlock &TgtBB = getMBB(BrTgt);
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+
+ // If the unconditional target is the layout successor, fallthrough.
+ if (!CurBB.isLayoutSuccessor(&TgtBB))
+ MIRBuilder.buildBr(TgtBB);
// Link successors.
- MachineBasicBlock &CurBB = MIRBuilder.getMBB();
for (const BasicBlock *Succ : BrInst.successors())
- CurBB.addSuccessor(&getOrCreateBB(*Succ));
+ CurBB.addSuccessor(&getMBB(*Succ));
return true;
}
@@ -209,30 +280,52 @@ bool IRTranslator::translateSwitch(const User &U,
const SwitchInst &SwInst = cast<SwitchInst>(U);
const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition());
+ const BasicBlock *OrigBB = SwInst.getParent();
- LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL);
+ LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL);
for (auto &CaseIt : SwInst.cases()) {
const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue());
const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1);
MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue);
- MachineBasicBlock &CurBB = MIRBuilder.getMBB();
- MachineBasicBlock &TrueBB = getOrCreateBB(*CaseIt.getCaseSuccessor());
+ MachineBasicBlock &CurMBB = MIRBuilder.getMBB();
+ const BasicBlock *TrueBB = CaseIt.getCaseSuccessor();
+ MachineBasicBlock &TrueMBB = getMBB(*TrueBB);
- MIRBuilder.buildBrCond(Tst, TrueBB);
- CurBB.addSuccessor(&TrueBB);
+ MIRBuilder.buildBrCond(Tst, TrueMBB);
+ CurMBB.addSuccessor(&TrueMBB);
+ addMachineCFGPred({OrigBB, TrueBB}, &CurMBB);
- MachineBasicBlock *FalseBB =
+ MachineBasicBlock *FalseMBB =
MF->CreateMachineBasicBlock(SwInst.getParent());
- MF->push_back(FalseBB);
- MIRBuilder.buildBr(*FalseBB);
- CurBB.addSuccessor(FalseBB);
+ // Insert the comparison blocks one after the other.
+ MF->insert(std::next(CurMBB.getIterator()), FalseMBB);
+ MIRBuilder.buildBr(*FalseMBB);
+ CurMBB.addSuccessor(FalseMBB);
- MIRBuilder.setMBB(*FalseBB);
+ MIRBuilder.setMBB(*FalseMBB);
}
// handle default case
- MachineBasicBlock &DefaultBB = getOrCreateBB(*SwInst.getDefaultDest());
- MIRBuilder.buildBr(DefaultBB);
- MIRBuilder.getMBB().addSuccessor(&DefaultBB);
+ const BasicBlock *DefaultBB = SwInst.getDefaultDest();
+ MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB);
+ MIRBuilder.buildBr(DefaultMBB);
+ MachineBasicBlock &CurMBB = MIRBuilder.getMBB();
+ CurMBB.addSuccessor(&DefaultMBB);
+ addMachineCFGPred({OrigBB, DefaultBB}, &CurMBB);
+
+ return true;
+}
+
+bool IRTranslator::translateIndirectBr(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const IndirectBrInst &BrInst = cast<IndirectBrInst>(U);
+
+ const unsigned Tgt = getOrCreateVReg(*BrInst.getAddress());
+ MIRBuilder.buildBrIndirect(Tgt);
+
+ // Link successors.
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+ for (const BasicBlock *Succ : BrInst.successors())
+ CurBB.addSuccessor(&getMBB(*Succ));
return true;
}
@@ -240,47 +333,38 @@ bool IRTranslator::translateSwitch(const User &U,
bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
const LoadInst &LI = cast<LoadInst>(U);
- if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic())
- return false;
-
- assert(!LI.isAtomic() && "only non-atomic loads are supported at the moment");
auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile
: MachineMemOperand::MONone;
Flags |= MachineMemOperand::MOLoad;
unsigned Res = getOrCreateVReg(LI);
unsigned Addr = getOrCreateVReg(*LI.getPointerOperand());
- LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL};
+
MIRBuilder.buildLoad(
Res, Addr,
*MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()),
Flags, DL->getTypeStoreSize(LI.getType()),
- getMemOpAlignment(LI)));
+ getMemOpAlignment(LI), AAMDNodes(), nullptr,
+ LI.getSyncScopeID(), LI.getOrdering()));
return true;
}
bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
const StoreInst &SI = cast<StoreInst>(U);
-
- if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic())
- return false;
-
- assert(!SI.isAtomic() && "only non-atomic stores supported at the moment");
auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile
: MachineMemOperand::MONone;
Flags |= MachineMemOperand::MOStore;
unsigned Val = getOrCreateVReg(*SI.getValueOperand());
unsigned Addr = getOrCreateVReg(*SI.getPointerOperand());
- LLT VTy{*SI.getValueOperand()->getType(), *DL},
- PTy{*SI.getPointerOperand()->getType(), *DL};
MIRBuilder.buildStore(
Val, Addr,
*MF->getMachineMemOperand(
MachinePointerInfo(SI.getPointerOperand()), Flags,
DL->getTypeStoreSize(SI.getValueOperand()->getType()),
- getMemOpAlignment(SI)));
+ getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSyncScopeID(),
+ SI.getOrdering()));
return true;
}
@@ -290,6 +374,15 @@ bool IRTranslator::translateExtractValue(const User &U,
Type *Int32Ty = Type::getInt32Ty(U.getContext());
SmallVector<Value *, 1> Indices;
+ // If Src is a single element ConstantStruct, translate extractvalue
+ // to that element to avoid inserting a cast instruction.
+ if (auto CS = dyn_cast<ConstantStruct>(Src))
+ if (CS->getNumOperands() == 1) {
+ unsigned Res = getOrCreateVReg(*CS->getOperand(0));
+ ValToVReg[&U] = Res;
+ return true;
+ }
+
// getIndexedOffsetInType is designed for GEPs, so the first index is the
// usual array element rather than looking into the actual aggregate.
Indices.push_back(ConstantInt::get(Int32Ty, 0));
@@ -305,7 +398,7 @@ bool IRTranslator::translateExtractValue(const User &U,
uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices);
unsigned Res = getOrCreateVReg(U);
- MIRBuilder.buildExtract(Res, Offset, getOrCreateVReg(*Src));
+ MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset);
return true;
}
@@ -331,29 +424,36 @@ bool IRTranslator::translateInsertValue(const User &U,
uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices);
unsigned Res = getOrCreateVReg(U);
- const Value &Inserted = *U.getOperand(1);
- MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), getOrCreateVReg(Inserted),
- Offset);
+ unsigned Inserted = getOrCreateVReg(*U.getOperand(1));
+ MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), Inserted, Offset);
return true;
}
bool IRTranslator::translateSelect(const User &U,
MachineIRBuilder &MIRBuilder) {
- MIRBuilder.buildSelect(getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)),
- getOrCreateVReg(*U.getOperand(1)),
- getOrCreateVReg(*U.getOperand(2)));
+ unsigned Res = getOrCreateVReg(U);
+ unsigned Tst = getOrCreateVReg(*U.getOperand(0));
+ unsigned Op0 = getOrCreateVReg(*U.getOperand(1));
+ unsigned Op1 = getOrCreateVReg(*U.getOperand(2));
+ MIRBuilder.buildSelect(Res, Tst, Op0, Op1);
return true;
}
bool IRTranslator::translateBitCast(const User &U,
MachineIRBuilder &MIRBuilder) {
- if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) {
+ // If we're bitcasting to the source type, we can reuse the source vreg.
+ if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
+ getLLTForType(*U.getType(), *DL)) {
+ // Get the source vreg now, to avoid invalidating ValToVReg.
+ unsigned SrcReg = getOrCreateVReg(*U.getOperand(0));
unsigned &Reg = ValToVReg[&U];
+ // If we already assigned a vreg for this bitcast, we can't change that.
+ // Emit a copy to satisfy the users we already emitted.
if (Reg)
- MIRBuilder.buildCopy(Reg, getOrCreateVReg(*U.getOperand(0)));
+ MIRBuilder.buildCopy(Reg, SrcReg);
else
- Reg = getOrCreateVReg(*U.getOperand(0));
+ Reg = SrcReg;
return true;
}
return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
@@ -375,9 +475,10 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Value &Op0 = *U.getOperand(0);
unsigned BaseReg = getOrCreateVReg(Op0);
- LLT PtrTy{*Op0.getType(), *DL};
- unsigned PtrSize = DL->getPointerSizeInBits(PtrTy.getAddressSpace());
- LLT OffsetTy = LLT::scalar(PtrSize);
+ Type *PtrIRTy = Op0.getType();
+ LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+ Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy);
+ LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
int64_t Offset = 0;
for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U);
@@ -399,8 +500,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,
if (Offset != 0) {
unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy);
- unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ unsigned OffsetReg =
+ getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));
MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg);
BaseReg = NewBaseReg;
@@ -408,8 +509,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,
}
// N = N + Idx * ElementSize;
- unsigned ElementSizeReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildConstant(ElementSizeReg, ElementSize);
+ unsigned ElementSizeReg =
+ getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize));
unsigned IdxReg = getOrCreateVReg(*Idx);
if (MRI->getType(IdxReg) != OffsetTy) {
@@ -428,8 +529,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
}
if (Offset != 0) {
- unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));
MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg);
return true;
}
@@ -438,13 +538,12 @@ bool IRTranslator::translateGetElementPtr(const User &U,
return true;
}
-bool IRTranslator::translateMemcpy(const CallInst &CI,
- MachineIRBuilder &MIRBuilder) {
- LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL};
- if (cast<PointerType>(CI.getArgOperand(0)->getType())->getAddressSpace() !=
- 0 ||
- cast<PointerType>(CI.getArgOperand(1)->getType())->getAddressSpace() !=
- 0 ||
+bool IRTranslator::translateMemfunc(const CallInst &CI,
+ MachineIRBuilder &MIRBuilder,
+ unsigned ID) {
+ LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL);
+ Type *DstTy = CI.getArgOperand(0)->getType();
+ if (cast<PointerType>(DstTy)->getAddressSpace() != 0 ||
SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0))
return false;
@@ -454,14 +553,32 @@ bool IRTranslator::translateMemcpy(const CallInst &CI,
Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType());
}
- MachineOperand Callee = MachineOperand::CreateES("memcpy");
+ const char *Callee;
+ switch (ID) {
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy: {
+ Type *SrcTy = CI.getArgOperand(1)->getType();
+ if (cast<PointerType>(SrcTy)->getAddressSpace() != 0)
+ return false;
+ Callee = ID == Intrinsic::memcpy ? "memcpy" : "memmove";
+ break;
+ }
+ case Intrinsic::memset:
+ Callee = "memset";
+ break;
+ default:
+ return false;
+ }
- return CLI->lowerCall(MIRBuilder, Callee,
+ return CLI->lowerCall(MIRBuilder, CI.getCallingConv(),
+ MachineOperand::CreateES(Callee),
CallLowering::ArgInfo(0, CI.getType()), Args);
}
void IRTranslator::getStackGuard(unsigned DstReg,
MachineIRBuilder &MIRBuilder) {
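+ // LOAD_STACK_GUARD defines a register of the target's pointer register
+ // class, so constrain DstReg to that class before building the instruction.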
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF));
auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD);
MIB.addDef(DstReg);
@@ -482,7 +599,7 @@ void IRTranslator::getStackGuard(unsigned DstReg,
bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
MachineIRBuilder &MIRBuilder) {
- LLT Ty{*CI.getOperand(0)->getType(), *DL};
+ LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL);
LLT s1 = LLT::scalar(1);
unsigned Width = Ty.getSizeInBits();
unsigned Res = MRI->createGenericVirtualRegister(Ty);
@@ -494,12 +611,12 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
.addUse(getOrCreateVReg(*CI.getOperand(1)));
if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) {
- unsigned Zero = MRI->createGenericVirtualRegister(s1);
- EntryBuilder.buildConstant(Zero, 0);
+ unsigned Zero = getOrCreateVReg(
+ *Constant::getNullValue(Type::getInt1Ty(CI.getContext())));
MIB.addUse(Zero);
}
- MIRBuilder.buildSequence(getOrCreateVReg(CI), Res, 0, Overflow, Width);
+ MIRBuilder.buildSequence(getOrCreateVReg(CI), {Res, Overflow}, {0, Width});
return true;
}
@@ -508,12 +625,83 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
switch (ID) {
default:
break;
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_value:
- // FIXME: these obviously need to be supported properly.
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // Stack coloring is not enabled in O0 (which we care about now) so we can
+ // drop these. Make sure someone notices when we start compiling at higher
+ // opts though.
+ if (MF->getTarget().getOptLevel() != CodeGenOpt::None)
+ return false;
+ return true;
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI);
+ assert(DI.getVariable() && "Missing variable");
+
+ const Value *Address = DI.getAddress();
+ if (!Address || isa<UndefValue>(Address)) {
+ DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ return true;
+ }
+
+ assert(DI.getVariable()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ auto AI = dyn_cast<AllocaInst>(Address);
+ if (AI && AI->isStaticAlloca()) {
+ // Static allocas are tracked at the MF level, no need for DBG_VALUE
+ // instructions (in fact, they get ignored if they *do* exist).
+ MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(),
+ getOrCreateFrameIndex(*AI), DI.getDebugLoc());
+ } else
+ MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address),
+ DI.getVariable(), DI.getExpression());
+ return true;
+ }
+ case Intrinsic::vaend:
+ // No target I know of cares about va_end. Certainly no in-tree target
+ // does. Simplest intrinsic ever!
+ return true;
+ case Intrinsic::vastart: {
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ Value *Ptr = CI.getArgOperand(0);
+ unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
+
+ MIRBuilder.buildInstr(TargetOpcode::G_VASTART)
+ .addUse(getOrCreateVReg(*Ptr))
+ .addMemOperand(MF->getMachineMemOperand(
+ MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0));
+ return true;
+ }
+ case Intrinsic::dbg_value: {
+ // This form of DBG_VALUE is target-independent.
+ const DbgValueInst &DI = cast<DbgValueInst>(CI);
+ const Value *V = DI.getValue();
+ assert(DI.getVariable()->isValidLocationForIntrinsic(
+ MIRBuilder.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ if (!V) {
+ // Currently the optimizer can produce this; insert an undef to
+ // help debugging. Probably the optimizer should not do this.
+ MIRBuilder.buildIndirectDbgValue(0, DI.getOffset(), DI.getVariable(),
+ DI.getExpression());
+ } else if (const auto *CI = dyn_cast<Constant>(V)) {
+ MIRBuilder.buildConstDbgValue(*CI, DI.getOffset(), DI.getVariable(),
+ DI.getExpression());
+ } else {
+ unsigned Reg = getOrCreateVReg(*V);
+ // FIXME: This does not handle register-indirect values at offset 0. The
+ // direct/indirect thing shouldn't really be handled by something as
+ // implicit as reg+noreg vs reg+imm in the first place, but it seems
+ // pretty baked in right now.
+ if (DI.getOffset() != 0)
+ MIRBuilder.buildIndirectDbgValue(Reg, DI.getOffset(), DI.getVariable(),
+ DI.getExpression());
+ else
+ MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(),
+ DI.getExpression());
+ }
return true;
+ }
case Intrinsic::uadd_with_overflow:
return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder);
case Intrinsic::sadd_with_overflow:
@@ -526,8 +714,43 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
case Intrinsic::smul_with_overflow:
return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
+ case Intrinsic::pow:
+ MIRBuilder.buildInstr(TargetOpcode::G_FPOW)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(1)));
+ return true;
+ case Intrinsic::exp:
+ MIRBuilder.buildInstr(TargetOpcode::G_FEXP)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
+ case Intrinsic::exp2:
+ MIRBuilder.buildInstr(TargetOpcode::G_FEXP2)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
+ case Intrinsic::log:
+ MIRBuilder.buildInstr(TargetOpcode::G_FLOG)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
+ case Intrinsic::log2:
+ MIRBuilder.buildInstr(TargetOpcode::G_FLOG2)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+ return true;
+ case Intrinsic::fma:
+ MIRBuilder.buildInstr(TargetOpcode::G_FMA)
+ .addDef(getOrCreateVReg(CI))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(0)))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(1)))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(2)));
+ return true;
case Intrinsic::memcpy:
- return translateMemcpy(CI, MIRBuilder);
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ return translateMemfunc(CI, MIRBuilder, ID);
case Intrinsic::eh_typeid_for: {
GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));
unsigned Reg = getOrCreateVReg(CI);
@@ -546,7 +769,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
getStackGuard(getOrCreateVReg(CI), MIRBuilder);
return true;
case Intrinsic::stackprotector: {
- LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL};
+ LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy);
getStackGuard(GuardVal, MIRBuilder);
@@ -564,18 +787,41 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return false;
}
+bool IRTranslator::translateInlineAsm(const CallInst &CI,
+ MachineIRBuilder &MIRBuilder) {
+ const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue());
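+ // Only constraint-free inline asm is supported for now; bail out otherwise
+ // so the translator reports the instruction as untranslatable.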
+ if (!IA.getConstraintString().empty())
+ return false;
+
+ unsigned ExtraInfo = 0;
+ if (IA.hasSideEffects())
+ ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+ if (IA.getDialect() == InlineAsm::AD_Intel)
+ ExtraInfo |= InlineAsm::Extra_AsmDialect;
+
+ MIRBuilder.buildInstr(TargetOpcode::INLINEASM)
+ .addExternalSymbol(IA.getAsmString().c_str())
+ .addImm(ExtraInfo);
+
+ return true;
+}
+
bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
const CallInst &CI = cast<CallInst>(U);
auto TII = MF->getTarget().getIntrinsicInfo();
const Function *F = CI.getCalledFunction();
+ if (CI.isInlineAsm())
+ return translateInlineAsm(CI, MIRBuilder);
+
if (!F || !F->isIntrinsic()) {
unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI);
SmallVector<unsigned, 8> Args;
for (auto &Arg: CI.arg_operands())
Args.push_back(getOrCreateVReg(*Arg));
- return CLI->lowerCall(MIRBuilder, CI, Res, Args, [&]() {
+ MF->getFrameInfo().setHasCalls(true);
+ return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() {
return getOrCreateVReg(*CI.getCalledValue());
});
}
@@ -594,11 +840,26 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory());
for (auto &Arg : CI.arg_operands()) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg))
- MIB.addImm(CI->getSExtValue());
- else
- MIB.addUse(getOrCreateVReg(*Arg));
+ // Some intrinsics take metadata parameters. Reject them.
+ if (isa<MetadataAsValue>(Arg))
+ return false;
+ MIB.addUse(getOrCreateVReg(*Arg));
+ }
+
+ // Add a MachineMemOperand if it is a target mem intrinsic.
+ const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
+ TargetLowering::IntrinsicInfo Info;
+ // TODO: Add a GlobalISel version of getTgtMemIntrinsic.
+ if (TLI.getTgtMemIntrinsic(Info, CI, ID)) {
+ MachineMemOperand::Flags Flags =
+ Info.vol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+ Flags |=
+ Info.readMem ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore;
+ uint64_t Size = Info.memVT.getSizeInBits() >> 3;
+ MIB.addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Info.ptrVal),
+ Flags, Size, Info.align));
}
+
return true;
}
@@ -610,7 +871,7 @@ bool IRTranslator::translateInvoke(const User &U,
const BasicBlock *ReturnBB = I.getSuccessor(0);
const BasicBlock *EHPadBB = I.getSuccessor(1);
- const Value *Callee(I.getCalledValue());
+ const Value *Callee = I.getCalledValue();
const Function *Fn = dyn_cast<Function>(Callee);
if (isa<InlineAsm>(Callee))
return false;
@@ -627,30 +888,30 @@ bool IRTranslator::translateInvoke(const User &U,
if (!isa<LandingPadInst>(EHPadBB->front()))
return false;
-
// Emit the actual call, bracketed by EH_LABELs so that the MF knows about
// the region covered by the try.
MCSymbol *BeginSymbol = Context.createTempSymbol();
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I);
- SmallVector<CallLowering::ArgInfo, 8> Args;
+ SmallVector<unsigned, 8> Args;
for (auto &Arg: I.arg_operands())
- Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType());
+ Args.push_back(getOrCreateVReg(*Arg));
- if (!CLI->lowerCall(MIRBuilder, MachineOperand::CreateGA(Fn, 0),
- CallLowering::ArgInfo(Res, I.getType()), Args))
+ if (!CLI->lowerCall(MIRBuilder, &I, Res, Args,
+ [&]() { return getOrCreateVReg(*I.getCalledValue()); }))
return false;
MCSymbol *EndSymbol = Context.createTempSymbol();
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);
// FIXME: track probabilities.
- MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB),
- &ReturnMBB = getOrCreateBB(*ReturnBB);
+ MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB),
+ &ReturnMBB = getMBB(*ReturnBB);
MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
MIRBuilder.getMBB().addSuccessor(&ReturnMBB);
MIRBuilder.getMBB().addSuccessor(&EHPadMBB);
+ MIRBuilder.buildBr(ReturnMBB);
return true;
}
@@ -684,37 +945,161 @@ bool IRTranslator::translateLandingPad(const User &U,
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL)
.addSym(MF->addLandingPad(&MBB));
+ LLT Ty = getLLTForType(*LP.getType(), *DL);
+ unsigned Undef = MRI->createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Undef);
+
+ SmallVector<LLT, 2> Tys;
+ for (Type *Ty : cast<StructType>(LP.getType())->elements())
+ Tys.push_back(getLLTForType(*Ty, *DL));
+ assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
+
// Mark exception register as live in.
- SmallVector<unsigned, 2> Regs;
- SmallVector<uint64_t, 2> Offsets;
- LLT p0 = LLT::pointer(0, DL->getPointerSizeInBits());
- if (unsigned Reg = TLI.getExceptionPointerRegister(PersonalityFn)) {
- unsigned VReg = MRI->createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(VReg, Reg);
- Regs.push_back(VReg);
- Offsets.push_back(0);
+ unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn);
+ if (!ExceptionReg)
+ return false;
+
+ MBB.addLiveIn(ExceptionReg);
+ unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]),
+ Tmp = MRI->createGenericVirtualRegister(Ty);
+ MIRBuilder.buildCopy(VReg, ExceptionReg);
+ MIRBuilder.buildInsert(Tmp, Undef, VReg, 0);
+
+ unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
+ if (!SelectorReg)
+ return false;
+
+ MBB.addLiveIn(SelectorReg);
+
+ // N.b. the exception selector register always has pointer type and may not
+ // match the actual IR-level type in the landingpad so an extra cast is
+ // needed.
+ unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]);
+ MIRBuilder.buildCopy(PtrVReg, SelectorReg);
+
+ VReg = MRI->createGenericVirtualRegister(Tys[1]);
+ MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg);
+ MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg,
+ Tys[0].getSizeInBits());
+ return true;
+}
+
+bool IRTranslator::translateAlloca(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ auto &AI = cast<AllocaInst>(U);
+
+ if (AI.isStaticAlloca()) {
+ unsigned Res = getOrCreateVReg(AI);
+ int FI = getOrCreateFrameIndex(AI);
+ MIRBuilder.buildFrameIndex(Res, FI);
+ return true;
}
- if (unsigned Reg = TLI.getExceptionSelectorRegister(PersonalityFn)) {
- unsigned VReg = MRI->createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(VReg, Reg);
- Regs.push_back(VReg);
- Offsets.push_back(p0.getSizeInBits());
+ // Now we're in the harder dynamic case.
+ Type *Ty = AI.getAllocatedType();
+ unsigned Align =
+ std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment());
+
+ unsigned NumElts = getOrCreateVReg(*AI.getArraySize());
+
+ Type *IntPtrIRTy = DL->getIntPtrType(AI.getType());
+ LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL);
+ if (MRI->getType(NumElts) != IntPtrTy) {
+ unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy);
+ MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts);
+ NumElts = ExtElts;
+ }
+
+ unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
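+ // The element size is negated so that, with a downward-growing stack, the
+ // GEP off the stack pointer below moves it down by NumElts * sizeof(Ty).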
+ unsigned TySize =
+ getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty)));
+ MIRBuilder.buildMul(AllocSize, NumElts, TySize);
+
+ LLT PtrTy = getLLTForType(*AI.getType(), *DL);
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+
+ unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildCopy(SPTmp, SPReg);
+
+ unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize);
+
+ // Handle alignment. We have to realign if the allocation granule was smaller
+ // than stack alignment, or the specific alloca requires more than stack
+ // alignment.
+ unsigned StackAlign =
+ MF->getSubtarget().getFrameLowering()->getStackAlignment();
+ Align = std::max(Align, StackAlign);
+ if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) {
+ // Round the size of the allocation up to the stack alignment size
+ // by adding SA-1 to the size. This doesn't overflow because we're computing
+ // an address inside an alloca.
+ unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align));
+ AllocTmp = AlignedAlloc;
}
- MIRBuilder.buildSequence(getOrCreateVReg(LP), Regs, Offsets);
+ MIRBuilder.buildCopy(SPReg, AllocTmp);
+ MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp);
+
+ MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI);
+ assert(MF->getFrameInfo().hasVarSizedObjects());
return true;
}
-bool IRTranslator::translateStaticAlloca(const AllocaInst &AI,
- MachineIRBuilder &MIRBuilder) {
- if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca())
- return false;
+bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
+ // FIXME: We may need more info about the type. Because of how LLT works,
+ // we're completely discarding the i64/double distinction here (amongst
+ // others). Fortunately the ABIs I know of where that matters don't use va_arg
+ // anyway but that's not guaranteed.
+ MIRBuilder.buildInstr(TargetOpcode::G_VAARG)
+ .addDef(getOrCreateVReg(U))
+ .addUse(getOrCreateVReg(*U.getOperand(0)))
+ .addImm(DL->getABITypeAlignment(U.getType()));
+ return true;
+}
- assert(AI.isStaticAlloca() && "only handle static allocas now");
- unsigned Res = getOrCreateVReg(AI);
- int FI = getOrCreateFrameIndex(AI);
- MIRBuilder.buildFrameIndex(Res, FI);
+bool IRTranslator::translateInsertElement(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // If it is a <1 x Ty> vector, use the scalar as it is
+ // not a legal vector type in LLT.
+ if (U.getType()->getVectorNumElements() == 1) {
+ unsigned Elt = getOrCreateVReg(*U.getOperand(1));
+ ValToVReg[&U] = Elt;
+ return true;
+ }
+ unsigned Res = getOrCreateVReg(U);
+ unsigned Val = getOrCreateVReg(*U.getOperand(0));
+ unsigned Elt = getOrCreateVReg(*U.getOperand(1));
+ unsigned Idx = getOrCreateVReg(*U.getOperand(2));
+ MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx);
+ return true;
+}
+
+bool IRTranslator::translateExtractElement(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ // If it is a <1 x Ty> vector, use the scalar as it is
+ // not a legal vector type in LLT.
+ if (U.getOperand(0)->getType()->getVectorNumElements() == 1) {
+ unsigned Elt = getOrCreateVReg(*U.getOperand(0));
+ ValToVReg[&U] = Elt;
+ return true;
+ }
+ unsigned Res = getOrCreateVReg(U);
+ unsigned Val = getOrCreateVReg(*U.getOperand(0));
+ unsigned Idx = getOrCreateVReg(*U.getOperand(1));
+ MIRBuilder.buildExtractVectorElement(Res, Val, Idx);
+ return true;
+}
+
+bool IRTranslator::translateShuffleVector(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR)
+ .addDef(getOrCreateVReg(U))
+ .addUse(getOrCreateVReg(*U.getOperand(0)))
+ .addUse(getOrCreateVReg(*U.getOperand(1)))
+ .addUse(getOrCreateVReg(*U.getOperand(2)));
return true;
}
@@ -736,11 +1121,21 @@ void IRTranslator::finishPendingPhis() {
// won't create extra control flow here, otherwise we need to find the
// dominating predecessor here (or perhaps force the weirder IRTranslators
// to provide a simple boundary).
+ SmallSet<const BasicBlock *, 4> HandledPreds;
+
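+ // A single IR-level predecessor can map to several machine basic blocks
+ // (e.g. the compare chain built for a switch), so emit one PHI operand
+ // pair per machine CFG edge recorded via addMachineCFGPred, visiting each
+ // IR predecessor only once.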
for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) {
- assert(BBToMBB[PI->getIncomingBlock(i)]->isSuccessor(MIB->getParent()) &&
- "I appear to have misunderstood Machine PHIs");
- MIB.addUse(getOrCreateVReg(*PI->getIncomingValue(i)));
- MIB.addMBB(BBToMBB[PI->getIncomingBlock(i)]);
+ auto IRPred = PI->getIncomingBlock(i);
+ if (HandledPreds.count(IRPred))
+ continue;
+
+ HandledPreds.insert(IRPred);
+ unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i));
+ for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) {
+ assert(Pred->isSuccessor(MIB->getParent()) &&
+ "incorrect CFG at MachineBasicBlock level");
+ MIB.addUse(ValReg);
+ MIB.addMBB(Pred);
+ }
}
}
}
@@ -752,9 +1147,7 @@ bool IRTranslator::translate(const Instruction &Inst) {
case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder);
#include "llvm/IR/Instruction.def"
default:
- if (!TPC->isGlobalISelAbortEnabled())
- return false;
- llvm_unreachable("unknown opcode");
+ return false;
}
}
@@ -764,25 +1157,68 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
else if (auto CF = dyn_cast<ConstantFP>(&C))
EntryBuilder.buildFConstant(Reg, *CF);
else if (isa<UndefValue>(C))
- EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg);
+ EntryBuilder.buildUndef(Reg);
else if (isa<ConstantPointerNull>(C))
EntryBuilder.buildConstant(Reg, 0);
else if (auto GV = dyn_cast<GlobalValue>(&C))
EntryBuilder.buildGlobalValue(Reg, GV);
- else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
+ else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) {
+ if (!CAZ->getType()->isVectorTy())
+ return false;
+ // Return the scalar if it is a <1 x Ty> vector.
+ if (CAZ->getNumElements() == 1)
+ return translate(*CAZ->getElementValue(0u), Reg);
+ std::vector<unsigned> Ops;
+ for (unsigned i = 0; i < CAZ->getNumElements(); ++i) {
+ Constant &Elt = *CAZ->getElementValue(i);
+ Ops.push_back(getOrCreateVReg(Elt));
+ }
+ EntryBuilder.buildMerge(Reg, Ops);
+ } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) {
+ // Return the scalar if it is a <1 x Ty> vector.
+ if (CV->getNumElements() == 1)
+ return translate(*CV->getElementAsConstant(0), Reg);
+ std::vector<unsigned> Ops;
+ for (unsigned i = 0; i < CV->getNumElements(); ++i) {
+ Constant &Elt = *CV->getElementAsConstant(i);
+ Ops.push_back(getOrCreateVReg(Elt));
+ }
+ EntryBuilder.buildMerge(Reg, Ops);
+ } else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
switch(CE->getOpcode()) {
#define HANDLE_INST(NUM, OPCODE, CLASS) \
case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder);
#include "llvm/IR/Instruction.def"
default:
- if (!TPC->isGlobalISelAbortEnabled())
- return false;
- llvm_unreachable("unknown opcode");
+ return false;
+ }
+ } else if (auto CS = dyn_cast<ConstantStruct>(&C)) {
+ // Return the element if it is a single element ConstantStruct.
+ if (CS->getNumOperands() == 1) {
+ unsigned EltReg = getOrCreateVReg(*CS->getOperand(0));
+ EntryBuilder.buildCast(Reg, EltReg);
+ return true;
+ }
+ SmallVector<unsigned, 4> Ops;
+ SmallVector<uint64_t, 4> Indices;
+ uint64_t Offset = 0;
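+ // Offsets fed to buildSequence are in bits, to match the LLT sizes below.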
+ for (unsigned i = 0; i < CS->getNumOperands(); ++i) {
+ unsigned OpReg = getOrCreateVReg(*CS->getOperand(i));
+ Ops.push_back(OpReg);
+ Indices.push_back(Offset);
+ Offset += MRI->getType(OpReg).getSizeInBits();
+ }
+ EntryBuilder.buildSequence(Reg, Ops, Indices);
+ } else if (auto CV = dyn_cast<ConstantVector>(&C)) {
+ if (CV->getNumOperands() == 1)
+ return translate(*CV->getOperand(0), Reg);
+ SmallVector<unsigned, 4> Ops;
+ for (unsigned i = 0; i < CV->getNumOperands(); ++i) {
+ Ops.push_back(getOrCreateVReg(*CV->getOperand(i)));
}
- } else if (!TPC->isGlobalISelAbortEnabled())
+ EntryBuilder.buildMerge(Reg, Ops);
+ } else
return false;
- else
- llvm_unreachable("unhandled constant kind");
return true;
}
@@ -793,7 +1229,12 @@ void IRTranslator::finalizeFunction() {
PendingPHIs.clear();
ValToVReg.clear();
FrameIndices.clear();
- Constants.clear();
+ MachinePreds.clear();
+ // MachineIRBuilder::DebugLoc can outlive the DILocation it holds. Clear it
+ // to avoid accessing freed memory (in runOnMachineFunction) and to avoid
+ // destroying it twice (in ~IRTranslator() and ~LLVMContext())
+ EntryBuilder = MachineIRBuilder();
+ CurBuilder = MachineIRBuilder();
}
bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
@@ -807,85 +1248,97 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
MRI = &MF->getRegInfo();
DL = &F.getParent()->getDataLayout();
TPC = &getAnalysis<TargetPassConfig>();
+ ORE = llvm::make_unique<OptimizationRemarkEmitter>(&F);
assert(PendingPHIs.empty() && "stale PHIs");
- // Setup a separate basic-block for the arguments and constants, falling
- // through to the IR-level Function's entry block.
+ // Release the per-function state when we return, whether we succeeded or not.
+ auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); });
+
+  // Set up a separate basic block for the arguments and constants.
MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock();
MF->push_back(EntryBB);
- EntryBB->addSuccessor(&getOrCreateBB(F.front()));
EntryBuilder.setMBB(*EntryBB);
+ // Create all blocks, in IR order, to preserve the layout.
+ for (const BasicBlock &BB: F) {
+ auto *&MBB = BBToMBB[&BB];
+
+ MBB = MF->CreateMachineBasicBlock(&BB);
+ MF->push_back(MBB);
+
+ if (BB.hasAddressTaken())
+ MBB->setHasAddressTaken();
+ }
+
+  // Make our arguments/constants entry block fall through to the IR entry block.
+ EntryBB->addSuccessor(&getMBB(F.front()));
+
// Lower the actual args into this basic block.
SmallVector<unsigned, 8> VRegArgs;
for (const Argument &Arg: F.args())
VRegArgs.push_back(getOrCreateVReg(Arg));
- bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs);
- if (!Succeeded) {
- if (!TPC->isGlobalISelAbortEnabled()) {
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- finalizeFunction();
- return false;
- }
- report_fatal_error("Unable to lower arguments");
+ if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ MF->getFunction()->getSubprogram(),
+ &MF->getFunction()->getEntryBlock());
+ R << "unable to lower arguments: " << ore::NV("Prototype", F.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
}
// And translate the function!
for (const BasicBlock &BB: F) {
- MachineBasicBlock &MBB = getOrCreateBB(BB);
+ MachineBasicBlock &MBB = getMBB(BB);
// Set the insertion point of all the following translations to
// the end of this basic block.
CurBuilder.setMBB(MBB);
for (const Instruction &Inst: BB) {
- Succeeded &= translate(Inst);
- if (!Succeeded) {
- if (TPC->isGlobalISelAbortEnabled())
- reportTranslationError(Inst, "unable to translate instruction");
- MF->getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- break;
- }
- }
- }
-
- if (Succeeded) {
- finishPendingPhis();
-
- // Now that the MachineFrameInfo has been configured, no further changes to
- // the reserved registers are possible.
- MRI->freezeReservedRegs(*MF);
-
- // Merge the argument lowering and constants block with its single
- // successor, the LLVM-IR entry block. We want the basic block to
- // be maximal.
- assert(EntryBB->succ_size() == 1 &&
- "Custom BB used for lowering should have only one successor");
- // Get the successor of the current entry block.
- MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
- assert(NewEntryBB.pred_size() == 1 &&
- "LLVM-IR entry block has a predecessor!?");
- // Move all the instruction from the current entry block to the
- // new entry block.
- NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
- EntryBB->end());
-
- // Update the live-in information for the new entry block.
- for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
- NewEntryBB.addLiveIn(LiveIn);
- NewEntryBB.sortUniqueLiveIns();
+ if (translate(Inst))
+ continue;
- // Get rid of the now empty basic block.
- EntryBB->removeSuccessor(&NewEntryBB);
- MF->remove(EntryBB);
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << Inst;
- assert(&MF->front() == &NewEntryBB &&
- "New entry wasn't next in the list of basic block!");
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ Inst.getDebugLoc(), &BB);
+ R << "unable to translate instruction: " << ore::NV("Opcode", &Inst)
+ << ": '" << InstStr.str() << "'";
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
}
- finalizeFunction();
+ finishPendingPhis();
+
+ // Merge the argument lowering and constants block with its single
+ // successor, the LLVM-IR entry block. We want the basic block to
+ // be maximal.
+ assert(EntryBB->succ_size() == 1 &&
+ "Custom BB used for lowering should have only one successor");
+ // Get the successor of the current entry block.
+ MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
+ assert(NewEntryBB.pred_size() == 1 &&
+ "LLVM-IR entry block has a predecessor!?");
+  // Move all the instructions from the current entry block to the
+ // new entry block.
+ NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
+ EntryBB->end());
+
+ // Update the live-in information for the new entry block.
+ for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
+ NewEntryBB.addLiveIn(LiveIn);
+ NewEntryBB.sortUniqueLiveIns();
+
+ // Get rid of the now empty basic block.
+ EntryBB->removeSuccessor(&NewEntryBB);
+ MF->remove(EntryBB);
+ MF->DeleteMachineBasicBlock(EntryBB);
+
+ assert(&MF->front() == &NewEntryBB &&
+ "New entry wasn't next in the list of basic block!");
return false;
}
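The control flow above leans on llvm::make_scope_exit (from llvm/ADT/ScopeExit.h) so that finalizeFunction() runs on every exit path, including the new early returns that report a remark and bail out. A minimal sketch of the idiom, with an illustrative function and lambda body:

    #include "llvm/ADT/ScopeExit.h"

    bool translateOne() {
      // The lambda fires when FinalizeOnReturn goes out of scope,
      // whichever return below is taken.
      auto FinalizeOnReturn = llvm::make_scope_exit([] {
        /* release per-function state here */
      });
      if (/* lowering failed */ false)
        return false; // cleanup still runs
      return true;    // cleanup runs here too
    }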
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 1d205cd..a16e14f 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -12,14 +12,19 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#define DEBUG_TYPE "instruction-select"
@@ -44,17 +49,14 @@ void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-static void reportSelectionError(const MachineInstr *MI, const Twine &Message) {
- const MachineFunction &MF = *MI->getParent()->getParent();
- std::string ErrStorage;
- raw_string_ostream Err(ErrStorage);
- Err << Message << ":\nIn function: " << MF.getName() << '\n';
- if (MI)
- Err << *MI << '\n';
- report_fatal_error(Err.str());
-}
-
bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // No matter what happens, whether we successfully select the function or not,
+ // nothing is going to use the vreg types after us. Make sure they disappear.
+ auto ClearVRegTypesOnReturn =
+ make_scope_exit([&]() { MRI.getVRegToType().clear(); });
+
// If the ISel pipeline failed, do not bother running that pass.
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel))
@@ -66,10 +68,10 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();
assert(ISel && "Cannot work without InstructionSelector");
- // FIXME: freezeReservedRegs is now done in IRTranslator, but there are many
- // other MF/MFI fields we need to initialize.
+ // An optimization remark emitter. Used to report failures.
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
- const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // FIXME: There are many other MF/MFI fields we need to initialize.
#ifndef NDEBUG
// Check that our input is fully legal: we require the function to have the
@@ -80,17 +82,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
// that it has the same layering problem, but we only use inline methods so
// end up not needing to link against the GlobalISel library.
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo())
- for (const MachineBasicBlock &MBB : MF)
- for (const MachineInstr &MI : MBB)
- if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI))
- reportSelectionError(&MI, "Instruction is not legal");
+ for (MachineBasicBlock &MBB : MF)
+ for (MachineInstr &MI : MBB)
+ if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "instruction is not legal", MI);
+ return false;
+ }
#endif
// FIXME: We could introduce new blocks and will need to fix the outer loop.
// Until then, keep track of the number of blocks to assert that we don't.
const size_t NumBlocks = MF.size();
- bool Failed = false;
for (MachineBasicBlock *MBB : post_order(&MF)) {
if (MBB->empty())
continue;
@@ -115,14 +119,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Selecting: \n " << MI);
+ // We could have folded this instruction away already, making it dead.
+ // If so, erase it.
+ if (isTriviallyDead(MI, MRI)) {
+ DEBUG(dbgs() << "Is dead; erasing.\n");
+ MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ continue;
+ }
+
if (!ISel->select(MI)) {
- if (TPC.isGlobalISelAbortEnabled())
- // FIXME: It would be nice to dump all inserted instructions. It's
- // not
- // obvious how, esp. considering select() can insert after MI.
- reportSelectionError(&MI, "Cannot select");
- Failed = true;
- break;
+ // FIXME: It would be nice to dump all inserted instructions. It's
+ // not obvious how, esp. considering select() can insert after MI.
+ reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI);
+ return false;
}
// Dump the range of instructions that MI expanded into.
@@ -136,39 +145,47 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
}
}
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
// Now that selection is complete, there are no more generic vregs. Verify
// that the size of the now-constrained vreg is unchanged and that it has a
// register class.
for (auto &VRegToType : MRI.getVRegToType()) {
unsigned VReg = VRegToType.first;
auto *RC = MRI.getRegClassOrNull(VReg);
- auto *MI = MRI.def_instr_begin(VReg) == MRI.def_instr_end()
- ? nullptr
- : &*MRI.def_instr_begin(VReg);
- if (!RC) {
- if (TPC.isGlobalISelAbortEnabled())
- reportSelectionError(MI, "VReg as no regclass after selection");
- Failed = true;
- break;
- }
+ MachineInstr *MI = nullptr;
+ if (!MRI.def_empty(VReg))
+ MI = &*MRI.def_instr_begin(VReg);
+ else if (!MRI.use_empty(VReg))
+ MI = &*MRI.use_instr_begin(VReg);
+
+ if (MI && !RC) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "VReg has no regclass after selection", *MI);
+ return false;
+ } else if (!RC)
+ continue;
if (VRegToType.second.isValid() &&
- VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) {
- if (TPC.isGlobalISelAbortEnabled())
- reportSelectionError(
- MI, "VReg has explicit size different from class size");
- Failed = true;
- break;
+ VRegToType.second.getSizeInBits() > TRI.getRegSizeInBits(*RC)) {
+ reportGISelFailure(MF, TPC, MORE, "gisel-select",
+ "VReg has explicit size different from class size",
+ *MI);
+ return false;
}
}
- MRI.getVRegToType().clear();
-
- if (!TPC.isGlobalISelAbortEnabled() && (Failed || MF.size() != NumBlocks)) {
- MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ if (MF.size() != NumBlocks) {
+ MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure",
+ MF.getFunction()->getSubprogram(),
+ /*MBB=*/nullptr);
+ R << "inserting blocks is not supported yet";
+ reportGISelFailure(MF, TPC, MORE, R);
return false;
}
- assert(MF.size() == NumBlocks && "Inserting blocks is not supported yet");
+
+ auto &TLI = *MF.getSubtarget().getTargetLowering();
+ TLI.finalizeLowering(MF);
// FIXME: Should we accurately track changes?
return true;
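A usage note on the failure paths above: instead of calling report_fatal_error directly, the pass now builds missed-optimization remarks and hands them to reportGISelFailure, which honors the abort policy and otherwise marks the function as FailedISel so SelectionDAG can take over. The shape of the remark construction, sketched (this mirrors, but does not reproduce, the exact helper in GlobalISel/Utils.h):

    MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure",
                                      MI.getDebugLoc(), MI.getParent());
    R << "cannot select";
    MORE.emit(R); // surfaced under the usual -pass-remarks-missed plumbing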
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 5c34da0..bf42722 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -1,4 +1,4 @@
-//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp -----------*- C++ -*-==//
+//===- llvm/CodeGen/GlobalISel/InstructionSelector.cpp --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,17 +11,41 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
#define DEBUG_TYPE "instructionselector"
using namespace llvm;
-InstructionSelector::InstructionSelector() {}
+InstructionSelector::MatcherState::MatcherState(unsigned MaxRenderers)
+ : Renderers(MaxRenderers, nullptr), MIs() {}
+
+InstructionSelector::InstructionSelector() = default;
+
+bool InstructionSelector::constrainOperandRegToRegClass(
+ MachineInstr &I, unsigned OpIdx, const TargetRegisterClass &RC,
+ const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) const {
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ return
+ constrainRegToClass(MRI, TII, RBI, I, I.getOperand(OpIdx).getReg(), RC);
+}
bool InstructionSelector::constrainSelectedInstRegOperands(
MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
@@ -55,6 +79,28 @@ bool InstructionSelector::constrainSelectedInstRegOperands(
// constrainOperandRegClass does that for us.
MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(),
Reg, OpI));
+
+ // Tie uses to defs as indicated in MCInstrDesc if this hasn't already been
+ // done.
+ if (MO.isUse()) {
+ int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO);
+ if (DefIdx != -1 && !I.isRegTiedToUseOperand(DefIdx))
+ I.tieOperands(DefIdx, OpI);
+ }
}
return true;
}
+
+bool InstructionSelector::isOperandImmEqual(
+ const MachineOperand &MO, int64_t Value,
+ const MachineRegisterInfo &MRI) const {
+ if (MO.isReg() && MO.getReg())
+ if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI))
+ return *VRegVal == Value;
+ return false;
+}
+
+bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI) const {
+ return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() &&
+ MI.implicit_operands().begin() == MI.implicit_operands().end();
+}
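The TIED_TO fix-up added to constrainSelectedInstRegOperands matters for two-address instructions: the MCInstrDesc records that a use operand must occupy the same register as a def, and the selected MachineInstr has to carry that tie explicitly. A contrived illustration of the query, with a hypothetical use operand at index 1:

    // Ask the descriptor whether use operand 1 is constrained to a def;
    // -1 means unconstrained, otherwise tie it if not already tied.
    int DefIdx = I.getDesc().getOperandConstraint(/*OpIdx=*/1, MCOI::TIED_TO);
    if (DefIdx != -1 && !I.isRegTiedToUseOperand(DefIdx))
      I.tieOperands(DefIdx, /*UseOpIdx=*/1);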
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
index e863568..b699156 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -15,13 +15,16 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
-#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <iterator>
+
#define DEBUG_TYPE "legalizer"
using namespace llvm;
@@ -47,71 +50,79 @@ void Legalizer::getAnalysisUsage(AnalysisUsage &AU) const {
void Legalizer::init(MachineFunction &MF) {
}
-bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,
- const TargetInstrInfo &TII) {
- bool Changed = false;
- if (MI.getOpcode() != TargetOpcode::G_EXTRACT)
- return Changed;
+bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const TargetInstrInfo &TII,
+ MachineIRBuilder &MIRBuilder) {
+ if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
+ return false;
- unsigned NumDefs = (MI.getNumOperands() - 1) / 2;
+ unsigned NumDefs = MI.getNumOperands() - 1;
unsigned SrcReg = MI.getOperand(NumDefs).getReg();
- MachineInstr &SeqI = *MRI.def_instr_begin(SrcReg);
- if (SeqI.getOpcode() != TargetOpcode::G_SEQUENCE)
- return Changed;
-
- unsigned NumSeqSrcs = (SeqI.getNumOperands() - 1) / 2;
- bool AllDefsReplaced = true;
-
- // Try to match each register extracted with a corresponding insertion formed
- // by the G_SEQUENCE.
- for (unsigned Idx = 0, SeqIdx = 0; Idx < NumDefs; ++Idx) {
- MachineOperand &ExtractMO = MI.getOperand(Idx);
- assert(ExtractMO.isReg() && ExtractMO.isDef() &&
- "unexpected extract operand");
-
- unsigned ExtractReg = ExtractMO.getReg();
- unsigned ExtractPos = MI.getOperand(NumDefs + Idx + 1).getImm();
-
- while (SeqIdx < NumSeqSrcs &&
- SeqI.getOperand(2 * SeqIdx + 2).getImm() < ExtractPos)
- ++SeqIdx;
-
- if (SeqIdx == NumSeqSrcs) {
- AllDefsReplaced = false;
- continue;
- }
+ MachineInstr &MergeI = *MRI.def_instr_begin(SrcReg);
+ if (MergeI.getOpcode() != TargetOpcode::G_MERGE_VALUES)
+ return false;
- unsigned OrigReg = SeqI.getOperand(2 * SeqIdx + 1).getReg();
- if (SeqI.getOperand(2 * SeqIdx + 2).getImm() != ExtractPos ||
- MRI.getType(OrigReg) != MRI.getType(ExtractReg)) {
- AllDefsReplaced = false;
- continue;
- }
+ const unsigned NumMergeRegs = MergeI.getNumOperands() - 1;
+
+ if (NumMergeRegs < NumDefs) {
+ if (NumDefs % NumMergeRegs != 0)
+ return false;
+
+ MIRBuilder.setInstr(MI);
+ // Transform to UNMERGEs, for example
+ // %1 = G_MERGE_VALUES %4, %5
+ // %9, %10, %11, %12 = G_UNMERGE_VALUES %1
+ // to
+ // %9, %10 = G_UNMERGE_VALUES %4
+ // %11, %12 = G_UNMERGE_VALUES %5
- assert(!TargetRegisterInfo::isPhysicalRegister(OrigReg) &&
- "unexpected physical register in G_SEQUENCE");
+ const unsigned NewNumDefs = NumDefs / NumMergeRegs;
+ for (unsigned Idx = 0; Idx < NumMergeRegs; ++Idx) {
+ SmallVector<unsigned, 2> DstRegs;
+ for (unsigned j = 0, DefIdx = Idx * NewNumDefs; j < NewNumDefs;
+ ++j, ++DefIdx)
+ DstRegs.push_back(MI.getOperand(DefIdx).getReg());
- // Finally we can replace the uses.
- for (auto &Use : MRI.use_operands(ExtractReg)) {
- Changed = true;
- Use.setReg(OrigReg);
+ MIRBuilder.buildUnmerge(DstRegs, MergeI.getOperand(Idx + 1).getReg());
}
- }
- if (AllDefsReplaced) {
- // If SeqI was the next instruction in the BB and we removed it, we'd break
- // the outer iteration.
- assert(std::next(MachineBasicBlock::iterator(MI)) != SeqI &&
- "G_SEQUENCE does not dominate G_EXTRACT");
+ } else if (NumMergeRegs > NumDefs) {
+ if (NumMergeRegs % NumDefs != 0)
+ return false;
+
+ MIRBuilder.setInstr(MI);
+ // Transform to MERGEs
+ // %6 = G_MERGE_VALUES %17, %18, %19, %20
+ // %7, %8 = G_UNMERGE_VALUES %6
+ // to
+ // %7 = G_MERGE_VALUES %17, %18
+ // %8 = G_MERGE_VALUES %19, %20
+
+ const unsigned NumRegs = NumMergeRegs / NumDefs;
+ for (unsigned DefIdx = 0; DefIdx < NumDefs; ++DefIdx) {
+ SmallVector<unsigned, 2> Regs;
+ for (unsigned j = 0, Idx = NumRegs * DefIdx + 1; j < NumRegs; ++j, ++Idx)
+ Regs.push_back(MergeI.getOperand(Idx).getReg());
+
+ MIRBuilder.buildMerge(MI.getOperand(DefIdx).getReg(), Regs);
+ }
- MI.eraseFromParent();
+ } else {
+ // FIXME: is a COPY appropriate if the types mismatch? We know both
+ // registers are allocatable by now.
+ if (MRI.getType(MI.getOperand(0).getReg()) !=
+ MRI.getType(MergeI.getOperand(1).getReg()))
+ return false;
- if (MRI.use_empty(SrcReg))
- SeqI.eraseFromParent();
- Changed = true;
+ for (unsigned Idx = 0; Idx < NumDefs; ++Idx)
+ MRI.replaceRegWith(MI.getOperand(Idx).getReg(),
+ MergeI.getOperand(Idx + 1).getReg());
}
- return Changed;
+ MI.eraseFromParent();
+ if (MRI.use_empty(MergeI.getOperand(0).getReg()))
+ MergeI.eraseFromParent();
+ return true;
}
bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
@@ -122,7 +133,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n');
init(MF);
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
- const LegalizerInfo &LegalizerInfo = *MF.getSubtarget().getLegalizerInfo();
+ MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
LegalizerHelper Helper(MF);
// FIXME: an instruction may need more than one pass before it is legal. For
@@ -132,7 +143,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
// convergence for performance reasons.
bool Changed = false;
MachineBasicBlock::iterator NextMI;
- for (auto &MBB : MF)
+ for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); MI = NextMI) {
// Get the next Instruction before we try to legalize, because there's a
// good chance MI will be deleted.
@@ -142,27 +153,52 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
// and are assumed to be legal.
if (!isPreISelGenericOpcode(MI->getOpcode()))
continue;
-
- auto Res = Helper.legalizeInstr(*MI, LegalizerInfo);
-
- // Error out if we couldn't legalize this instruction. We may want to fall
- // back to DAG ISel instead in the future.
- if (Res == LegalizerHelper::UnableToLegalize) {
- if (!TPC.isGlobalISelAbortEnabled()) {
- MF.getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- return false;
+ unsigned NumNewInsns = 0;
+ SmallVector<MachineInstr *, 4> WorkList;
+ Helper.MIRBuilder.recordInsertions([&](MachineInstr *MI) {
+ // Only legalize pre-isel generic instructions.
+        // The legalization process could generate target-specific pseudo
+        // instructions with generic types. Don't record them.
+ if (isPreISelGenericOpcode(MI->getOpcode())) {
+ ++NumNewInsns;
+ WorkList.push_back(MI);
}
- std::string Msg;
- raw_string_ostream OS(Msg);
- OS << "unable to legalize instruction: ";
- MI->print(OS);
- report_fatal_error(OS.str());
- }
-
- Changed |= Res == LegalizerHelper::Legalized;
- }
+ });
+ WorkList.push_back(&*MI);
+
+ LegalizerHelper::LegalizeResult Res;
+ unsigned Idx = 0;
+ do {
+ Res = Helper.legalizeInstrStep(*WorkList[Idx]);
+ // Error out if we couldn't legalize this instruction. We may want to
+ // fall back to DAG ISel instead in the future.
+        if (Res == LegalizerHelper::UnableToLegalize) {
+          Helper.MIRBuilder.stopRecordingInsertions();
+          reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
+                             "unable to legalize instruction",
+                             *WorkList[Idx]);
+          return false;
+        }
+ Changed |= Res == LegalizerHelper::Legalized;
+ ++Idx;
+
+#ifndef NDEBUG
+ if (NumNewInsns) {
+ DEBUG(dbgs() << ".. .. Emitted " << NumNewInsns << " insns\n");
+ for (auto I = WorkList.end() - NumNewInsns, E = WorkList.end();
+ I != E; ++I)
+ DEBUG(dbgs() << ".. .. New MI: "; (*I)->print(dbgs()));
+ NumNewInsns = 0;
+ }
+#endif
+ } while (Idx < WorkList.size());
+ Helper.MIRBuilder.stopRecordingInsertions();
+ }
+ }
MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -171,8 +207,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
// Get the next Instruction before we try to legalize, because there's a
// good chance MI will be deleted.
NextMI = std::next(MI);
-
- Changed |= combineExtracts(*MI, MRI, TII);
+ Changed |= combineMerges(*MI, MRI, TII, Helper.MIRBuilder);
}
}
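The divisibility checks in combineMerges above are what keep the rewrites well-formed: the defs of the G_UNMERGE_VALUES must split evenly across (or group evenly into) the sources of the feeding G_MERGE_VALUES. A worked instance of the first case, spelling out the index arithmetic the loop performs (register numbers follow the comment in the code):

    // %1 = G_MERGE_VALUES %4, %5                -> NumMergeRegs = 2
    // %9, %10, %11, %12 = G_UNMERGE_VALUES %1   -> NumDefs      = 4
    // NewNumDefs = NumDefs / NumMergeRegs = 2, so:
    //   Idx = 0 covers defs 0..1: %9, %10  = G_UNMERGE_VALUES %4
    //   Idx = 1 covers defs 2..3: %11, %12 = G_UNMERGE_VALUES %5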
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index eb25b6c..5258370 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -24,120 +24,174 @@
#include <sstream>
-#define DEBUG_TYPE "legalize-mir"
+#define DEBUG_TYPE "legalizer"
using namespace llvm;
LegalizerHelper::LegalizerHelper(MachineFunction &MF)
- : MRI(MF.getRegInfo()) {
+ : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) {
MIRBuilder.setMF(MF);
}
LegalizerHelper::LegalizeResult
-LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
- const LegalizerInfo &LegalizerInfo) {
- auto Action = LegalizerInfo.getAction(MI, MRI);
+LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
+ DEBUG(dbgs() << "Legalizing: "; MI.print(dbgs()));
+
+ auto Action = LI.getAction(MI, MRI);
switch (std::get<0>(Action)) {
case LegalizerInfo::Legal:
+ DEBUG(dbgs() << ".. Already legal\n");
return AlreadyLegal;
case LegalizerInfo::Libcall:
+ DEBUG(dbgs() << ".. Convert to libcall\n");
return libcall(MI);
case LegalizerInfo::NarrowScalar:
+ DEBUG(dbgs() << ".. Narrow scalar\n");
return narrowScalar(MI, std::get<1>(Action), std::get<2>(Action));
case LegalizerInfo::WidenScalar:
+ DEBUG(dbgs() << ".. Widen scalar\n");
return widenScalar(MI, std::get<1>(Action), std::get<2>(Action));
case LegalizerInfo::Lower:
+ DEBUG(dbgs() << ".. Lower\n");
return lower(MI, std::get<1>(Action), std::get<2>(Action));
case LegalizerInfo::FewerElements:
+ DEBUG(dbgs() << ".. Reduce number of elements\n");
return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action));
+ case LegalizerInfo::Custom:
+ DEBUG(dbgs() << ".. Custom legalization\n");
+ return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized
+ : UnableToLegalize;
default:
+ DEBUG(dbgs() << ".. Unable to legalize\n");
return UnableToLegalize;
}
}
-LegalizerHelper::LegalizeResult
-LegalizerHelper::legalizeInstr(MachineInstr &MI,
- const LegalizerInfo &LegalizerInfo) {
- SmallVector<MachineInstr *, 4> WorkList;
- MIRBuilder.recordInsertions(
- [&](MachineInstr *MI) { WorkList.push_back(MI); });
- WorkList.push_back(&MI);
-
- bool Changed = false;
- LegalizeResult Res;
- unsigned Idx = 0;
- do {
- Res = legalizeInstrStep(*WorkList[Idx], LegalizerInfo);
- if (Res == UnableToLegalize) {
- MIRBuilder.stopRecordingInsertions();
- return UnableToLegalize;
- }
- Changed |= Res == Legalized;
- ++Idx;
- } while (Idx < WorkList.size());
-
- MIRBuilder.stopRecordingInsertions();
-
- return Changed ? Legalized : AlreadyLegal;
-}
-
void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts,
SmallVectorImpl<unsigned> &VRegs) {
- unsigned Size = Ty.getSizeInBits();
- SmallVector<uint64_t, 4> Indexes;
- for (int i = 0; i < NumParts; ++i) {
+ for (int i = 0; i < NumParts; ++i)
VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
- Indexes.push_back(i * Size);
+ MIRBuilder.buildUnmerge(VRegs, Reg);
+}
+
+static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
+ switch (Opcode) {
+ case TargetOpcode::G_SDIV:
+ assert(Size == 32 && "Unsupported size");
+ return RTLIB::SDIV_I32;
+ case TargetOpcode::G_UDIV:
+ assert(Size == 32 && "Unsupported size");
+ return RTLIB::UDIV_I32;
+ case TargetOpcode::G_SREM:
+ assert(Size == 32 && "Unsupported size");
+ return RTLIB::SREM_I32;
+ case TargetOpcode::G_UREM:
+ assert(Size == 32 && "Unsupported size");
+ return RTLIB::UREM_I32;
+ case TargetOpcode::G_FADD:
+ assert((Size == 32 || Size == 64) && "Unsupported size");
+ return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32;
+ case TargetOpcode::G_FREM:
+ return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32;
+ case TargetOpcode::G_FPOW:
+ return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32;
}
- MIRBuilder.buildExtract(VRegs, Indexes, Reg);
+ llvm_unreachable("Unknown libcall function");
+}
+
+LegalizerHelper::LegalizeResult
+llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
+ const CallLowering::ArgInfo &Result,
+ ArrayRef<CallLowering::ArgInfo> Args) {
+ auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ const char *Name = TLI.getLibcallName(Libcall);
+
+ MIRBuilder.getMF().getFrameInfo().setHasCalls(true);
+ if (!CLI.lowerCall(MIRBuilder, TLI.getLibcallCallingConv(Libcall),
+ MachineOperand::CreateES(Name), Result, Args))
+ return LegalizerHelper::UnableToLegalize;
+
+ return LegalizerHelper::Legalized;
+}
+
+static LegalizerHelper::LegalizeResult
+simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
+ Type *OpType) {
+ auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
+ return createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), OpType},
+ {{MI.getOperand(1).getReg(), OpType},
+ {MI.getOperand(2).getReg(), OpType}});
}
LegalizerHelper::LegalizeResult
LegalizerHelper::libcall(MachineInstr &MI) {
- LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- unsigned Size = Ty.getSizeInBits();
+ LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
+ unsigned Size = LLTy.getSizeInBits();
+ auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+
MIRBuilder.setInstr(MI);
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_UREM: {
+ Type *HLTy = Type::getInt32Ty(Ctx);
+ auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
+ if (Status != Legalized)
+ return Status;
+ break;
+ }
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FPOW:
case TargetOpcode::G_FREM: {
- auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();
- Type *Ty = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx);
- auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
- auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
- const char *Name =
- TLI.getLibcallName(Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32);
-
- CLI.lowerCall(
- MIRBuilder, MachineOperand::CreateES(Name),
- {MI.getOperand(0).getReg(), Ty},
- {{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}});
- MI.eraseFromParent();
- return Legalized;
+ Type *HLTy = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx);
+ auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
+ if (Status != Legalized)
+ return Status;
+ break;
}
}
+
+ MI.eraseFromParent();
+ return Legalized;
}
LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
unsigned TypeIdx,
LLT NarrowTy) {
// FIXME: Don't know how to handle secondary types yet.
- if (TypeIdx != 0)
+ if (TypeIdx != 0 && MI.getOpcode() != TargetOpcode::G_EXTRACT)
return UnableToLegalize;
+
+ MIRBuilder.setInstr(MI);
+
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
+ case TargetOpcode::G_IMPLICIT_DEF: {
+ int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
+ NarrowTy.getSizeInBits();
+
+ SmallVector<unsigned, 2> DstRegs;
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.buildUndef(Dst);
+ DstRegs.push_back(Dst);
+ }
+ MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
case TargetOpcode::G_ADD: {
// Expand in terms of carry-setting/consuming G_ADDE instructions.
- unsigned NarrowSize = NarrowTy.getSizeInBits();
int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
NarrowTy.getSizeInBits();
- MIRBuilder.setInstr(MI);
-
SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
- SmallVector<uint64_t, 2> Indexes;
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
@@ -152,11 +206,193 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
Src2Regs[i], CarryIn);
DstRegs.push_back(DstReg);
- Indexes.push_back(i * NarrowSize);
CarryIn = CarryOut;
}
unsigned DstReg = MI.getOperand(0).getReg();
- MIRBuilder.buildSequence(DstReg, DstRegs, Indexes);
+ MIRBuilder.buildMerge(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_EXTRACT: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ int64_t NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(1).getReg()).getSizeInBits() / NarrowSize;
+
+ SmallVector<unsigned, 2> SrcRegs, DstRegs;
+ SmallVector<uint64_t, 2> Indexes;
+ extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+
+ unsigned OpReg = MI.getOperand(0).getReg();
+ int64_t OpStart = MI.getOperand(2).getImm();
+ int64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned SrcStart = i * NarrowSize;
+
+ if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
+ // No part of the extract uses this subregister, ignore it.
+ continue;
+ } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
+ // The entire subregister is extracted, forward the value.
+ DstRegs.push_back(SrcRegs[i]);
+ continue;
+ }
+
+ // OpSegStart is where this destination segment would start in OpReg if it
+ // extended infinitely in both directions.
+ int64_t ExtractOffset, SegSize;
+ if (OpStart < SrcStart) {
+ ExtractOffset = 0;
+ SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
+ } else {
+ ExtractOffset = OpStart - SrcStart;
+ SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
+ }
+
+ unsigned SegReg = SrcRegs[i];
+ if (ExtractOffset != 0 || SegSize != NarrowSize) {
+ // A genuine extract is needed.
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+ MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
+ }
+
+ DstRegs.push_back(SegReg);
+ }
+
+ MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_INSERT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ int64_t NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+
+ SmallVector<unsigned, 2> SrcRegs, DstRegs;
+ SmallVector<uint64_t, 2> Indexes;
+ extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+
+ unsigned OpReg = MI.getOperand(2).getReg();
+ int64_t OpStart = MI.getOperand(3).getImm();
+ int64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstStart = i * NarrowSize;
+
+ if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
+ // No part of the insert affects this subregister, forward the original.
+ DstRegs.push_back(SrcRegs[i]);
+ continue;
+ } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
+ // The entire subregister is defined by this insert, forward the new
+ // value.
+ DstRegs.push_back(OpReg);
+ continue;
+ }
+
+ // OpSegStart is where this destination segment would start in OpReg if it
+ // extended infinitely in both directions.
+ int64_t ExtractOffset, InsertOffset, SegSize;
+ if (OpStart < DstStart) {
+ InsertOffset = 0;
+ ExtractOffset = DstStart - OpStart;
+ SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
+ } else {
+ InsertOffset = OpStart - DstStart;
+ ExtractOffset = 0;
+ SegSize =
+ std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
+ }
+
+ unsigned SegReg = OpReg;
+ if (ExtractOffset != 0 || SegSize != OpSize) {
+ // A genuine extract is needed.
+ SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+ MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
+ }
+
+ unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset);
+ DstRegs.push_back(DstReg);
+ }
+
+ assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered");
+ MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_LOAD: {
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ LLT OffsetTy = LLT::scalar(
+ MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
+
+ SmallVector<unsigned, 2> DstRegs;
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+ unsigned SrcReg = 0;
+ unsigned Adjustment = i * NarrowSize / 8;
+
+ MIRBuilder.materializeGEP(SrcReg, MI.getOperand(1).getReg(), OffsetTy,
+ Adjustment);
+
+ // TODO: This is conservatively correct, but we probably want to split the
+ // memory operands in the future.
+ MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin());
+
+ DstRegs.push_back(DstReg);
+ }
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MIRBuilder.buildMerge(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_STORE: {
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ LLT OffsetTy = LLT::scalar(
+ MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits());
+
+ SmallVector<unsigned, 2> SrcRegs;
+ extractParts(MI.getOperand(0).getReg(), NarrowTy, NumParts, SrcRegs);
+
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstReg = 0;
+ unsigned Adjustment = i * NarrowSize / 8;
+
+ MIRBuilder.materializeGEP(DstReg, MI.getOperand(1).getReg(), OffsetTy,
+ Adjustment);
+
+ // TODO: This is conservatively correct, but we probably want to split the
+ // memory operands in the future.
+ MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin());
+ }
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_CONSTANT: {
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+ int NumParts =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+ const APInt &Cst = MI.getOperand(1).getCImm()->getValue();
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+
+ SmallVector<unsigned, 2> DstRegs;
+ for (int i = 0; i < NumParts; ++i) {
+ unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+ ConstantInt *CI =
+ ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize));
+ MIRBuilder.buildConstant(DstReg, *CI);
+ DstRegs.push_back(DstReg);
+ }
+ unsigned DstReg = MI.getOperand(0).getReg();
+ MIRBuilder.buildMerge(DstReg, DstRegs);
MI.eraseFromParent();
return Legalized;
}
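A worked example of the offset arithmetic in the G_EXTRACT narrowing above, since the two branches are easy to mix up. Take an s64 source narrowed into two s32 parts, extracting an s16 that starts at bit 24 (all numbers illustrative):

    //   part 0 (bits 0..31):  OpStart(24) >= SrcStart(0), so
    //     ExtractOffset = 24 - 0 = 24
    //     SegSize = min(0 + 32 - 24, 16) = 8   -> extract s8 at bit 24
    //   part 1 (bits 32..63): OpStart(24) < SrcStart(32), so
    //     ExtractOffset = 0
    //     SegSize = min(32, 24 + 16 - 32) = 8  -> extract s8 at bit 0
    // The two s8 segments are then recombined with buildMerge into the
    // s16 result.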
@@ -175,7 +411,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_MUL:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
- case TargetOpcode::G_SUB: {
+ case TargetOpcode::G_SUB:
+ case TargetOpcode::G_SHL: {
// Perform operation at larger width (any extension is fine here, high bits
// don't affect the result) and then truncate the result back to the
// original type.
@@ -195,10 +432,16 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
}
case TargetOpcode::G_SDIV:
- case TargetOpcode::G_UDIV: {
- unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV
- ? TargetOpcode::G_SEXT
- : TargetOpcode::G_ZEXT;
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_SREM:
+ case TargetOpcode::G_UREM:
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR: {
+ unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV ||
+ MI.getOpcode() == TargetOpcode::G_SREM ||
+ MI.getOpcode() == TargetOpcode::G_ASHR
+ ? TargetOpcode::G_SEXT
+ : TargetOpcode::G_ZEXT;
unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy);
MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse(
@@ -218,6 +461,85 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SELECT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ // Perform operation at larger width (any extension is fine here, high bits
+ // don't affect the result) and then truncate the result back to the
+ // original type.
+ unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy);
+ unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg());
+ MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg());
+
+ unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildInstr(TargetOpcode::G_SELECT)
+ .addDef(DstExt)
+ .addReg(MI.getOperand(1).getReg())
+ .addUse(Src1Ext)
+ .addUse(Src2Ext);
+
+ MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_FPTOUI: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildInstr(MI.getOpcode())
+ .addDef(DstExt)
+ .addUse(MI.getOperand(1).getReg());
+
+ MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
+
+ if (MI.getOpcode() == TargetOpcode::G_SITOFP) {
+ MIRBuilder.buildSExt(SrcExt, Src);
+ } else {
+ assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op");
+ MIRBuilder.buildZExt(SrcExt, Src);
+ }
+
+ MIRBuilder.buildInstr(MI.getOpcode())
+ .addDef(MI.getOperand(0).getReg())
+ .addUse(SrcExt);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_INSERT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ unsigned Src = MI.getOperand(1).getReg();
+ unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
+ MIRBuilder.buildAnyExt(SrcExt, Src);
+
+ unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+ auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(),
+ MI.getOperand(3).getImm());
+ for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) {
+ MIB.addReg(MI.getOperand(OpNum).getReg());
+ MIB.addImm(MI.getOperand(OpNum + 1).getImm());
+ }
+
+ MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+ MI.eraseFromParent();
+ return Legalized;
+ }
case TargetOpcode::G_LOAD: {
assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==
WideTy.getSizeInBits() &&
@@ -231,12 +553,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
}
case TargetOpcode::G_STORE: {
- assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==
- WideTy.getSizeInBits() &&
- "illegal to increase number of bytes modified by a store");
+ if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) ||
+ WideTy != LLT::scalar(8))
+ return UnableToLegalize;
+
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ auto Content = TLI.getBooleanContents(false, false);
+
+ unsigned ExtOp = TargetOpcode::G_ANYEXT;
+ if (Content == TargetLoweringBase::ZeroOrOneBooleanContent)
+ ExtOp = TargetOpcode::G_ZEXT;
+ else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent)
+ ExtOp = TargetOpcode::G_SEXT;
+ else
+ ExtOp = TargetOpcode::G_ANYEXT;
unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
- MIRBuilder.buildAnyExt(SrcExt, MI.getOperand(0).getReg());
+ MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse(
+ MI.getOperand(0).getReg());
MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(),
**MI.memoperands_begin());
MI.eraseFromParent();
@@ -315,6 +649,83 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SMULO:
+ case TargetOpcode::G_UMULO: {
+ // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
+ // result.
+ unsigned Res = MI.getOperand(0).getReg();
+ unsigned Overflow = MI.getOperand(1).getReg();
+ unsigned LHS = MI.getOperand(2).getReg();
+ unsigned RHS = MI.getOperand(3).getReg();
+
+ MIRBuilder.buildMul(Res, LHS, RHS);
+
+ unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
+ ? TargetOpcode::G_SMULH
+ : TargetOpcode::G_UMULH;
+
+ unsigned HiPart = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInstr(Opcode)
+ .addDef(HiPart)
+ .addUse(LHS)
+ .addUse(RHS);
+
+ unsigned Zero = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildConstant(Zero, 0);
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_FNEG: {
+ // TODO: Handle vector types once we are able to
+ // represent them.
+ if (Ty.isVector())
+ return UnableToLegalize;
+ unsigned Res = MI.getOperand(0).getReg();
+ Type *ZeroTy;
+ LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+ switch (Ty.getSizeInBits()) {
+ case 16:
+ ZeroTy = Type::getHalfTy(Ctx);
+ break;
+ case 32:
+ ZeroTy = Type::getFloatTy(Ctx);
+ break;
+ case 64:
+ ZeroTy = Type::getDoubleTy(Ctx);
+ break;
+ default:
+ llvm_unreachable("unexpected floating-point type");
+ }
+ ConstantFP &ZeroForNegation =
+ *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy));
+ unsigned Zero = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildFConstant(Zero, ZeroForNegation);
+ MIRBuilder.buildInstr(TargetOpcode::G_FSUB)
+ .addDef(Res)
+ .addUse(Zero)
+ .addUse(MI.getOperand(1).getReg());
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_FSUB: {
+ // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
+ // First, check if G_FNEG is marked as Lower. If so, we may
+ // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
+    if (LI.getAction({TargetOpcode::G_FNEG, Ty}).first == LegalizerInfo::Lower)
+ return UnableToLegalize;
+ unsigned Res = MI.getOperand(0).getReg();
+ unsigned LHS = MI.getOperand(1).getReg();
+ unsigned RHS = MI.getOperand(2).getReg();
+ unsigned Neg = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS);
+ MIRBuilder.buildInstr(TargetOpcode::G_FADD)
+ .addDef(Res)
+ .addUse(LHS)
+ .addUse(Neg);
+ MI.eraseFromParent();
+ return Legalized;
+ }
}
}
@@ -335,7 +746,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
MIRBuilder.setInstr(MI);
SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
- SmallVector<uint64_t, 2> Indexes;
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
@@ -343,10 +753,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]);
DstRegs.push_back(DstReg);
- Indexes.push_back(i * NarrowSize);
}
- MIRBuilder.buildSequence(DstReg, DstRegs, Indexes);
+ MIRBuilder.buildMerge(DstReg, DstRegs);
MI.eraseFromParent();
return Legalized;
}
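Tying the pieces of this file together: both narrowScalar(G_ADD) earlier and fewerElementsVector here now split their operands with extractParts (a G_UNMERGE_VALUES underneath) and reassemble with a single buildMerge, instead of tracking explicit indexes for the removed G_SEQUENCE. For the scalar add, the expansion looks schematically like this (register names illustrative):

    //   %a0:_(s32), %a1:_(s32) = G_UNMERGE_VALUES %a(s64)
    //   %b0:_(s32), %b1:_(s32) = G_UNMERGE_VALUES %b(s64)
    //   %s0:_(s32), %c0:_(s1)  = G_ADDE %a0, %b0, %cin  ; low halves first
    //   %s1:_(s32), %c1:_(s1)  = G_ADDE %a1, %b1, %c0   ; carry chained
    //   %sum:_(s64)            = G_MERGE_VALUES %s0, %s1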
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index e496620..76917aa 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -1,4 +1,4 @@
-//===---- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer -------==//
+//===- lib/CodeGen/GlobalISel/LegalizerInfo.cpp - Legalizer ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,16 +18,25 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Type.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOpcodes.h"
+#include <algorithm>
+#include <cassert>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
-LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {
+LegalizerInfo::LegalizerInfo() {
+ DefaultActions[TargetOpcode::G_IMPLICIT_DEF] = NarrowScalar;
+
// FIXME: these two can be legalized to the fundamental load/store Jakob
// proposed. Once loads & stores are supported.
DefaultActions[TargetOpcode::G_ANYEXT] = Legal;
@@ -41,6 +50,9 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {
DefaultActions[TargetOpcode::G_STORE] = NarrowScalar;
DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar;
+ DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar;
+ DefaultActions[TargetOpcode::G_EXTRACT] = NarrowScalar;
+ DefaultActions[TargetOpcode::G_FNEG] = Lower;
}
void LegalizerInfo::computeTables() {
@@ -71,28 +83,34 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const {
// These *have* to be implemented for now, they're the fundamental basis of
// how everything else is transformed.
- // Nothing is going to go well with types that aren't a power of 2 yet, so
- // don't even try because we might make things worse.
- if (!isPowerOf2_64(Aspect.Type.getSizeInBits()))
- return std::make_pair(Unsupported, LLT());
-
// FIXME: the long-term plan calls for expansion in terms of load/store (if
// they're not legal).
- if (Aspect.Opcode == TargetOpcode::G_SEQUENCE ||
- Aspect.Opcode == TargetOpcode::G_EXTRACT)
+ if (Aspect.Opcode == TargetOpcode::G_MERGE_VALUES ||
+ Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES)
return std::make_pair(Legal, Aspect.Type);
+ LLT Ty = Aspect.Type;
LegalizeAction Action = findInActions(Aspect);
+ // LegalizerHelper is not able to handle non-power-of-2 types right now, so do
+ // not try to legalize them unless they are marked as Legal or Custom.
+ // FIXME: This is a temporary hack until the general non-power-of-2
+ // legalization works.
+ if (!isPowerOf2_64(Ty.getSizeInBits()) &&
+ !(Action == Legal || Action == Custom))
+ return std::make_pair(Unsupported, LLT());
+
if (Action != NotFound)
return findLegalAction(Aspect, Action);
unsigned Opcode = Aspect.Opcode;
- LLT Ty = Aspect.Type;
if (!Ty.isVector()) {
auto DefaultAction = DefaultActions.find(Aspect.Opcode);
if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal)
return std::make_pair(Legal, Ty);
+ if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower)
+ return std::make_pair(Lower, Ty);
+
if (DefaultAction == DefaultActions.end() ||
DefaultAction->second != NarrowScalar)
return std::make_pair(Unsupported, LLT());
@@ -152,7 +170,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
return std::get<0>(getAction(MI, MRI)) == Legal;
}
-LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
+Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
LegalizeAction Action) const {
switch(Action) {
default:
@@ -160,23 +178,30 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
case Legal:
case Lower:
case Libcall:
+ case Custom:
return Aspect.Type;
case NarrowScalar: {
- return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
+ return findLegalizableSize(
+ Aspect, [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
}
case WidenScalar: {
- return findLegalType(Aspect, [&](LLT Ty) -> LLT {
+ return findLegalizableSize(Aspect, [&](LLT Ty) -> LLT {
return Ty.getSizeInBits() < 8 ? LLT::scalar(8) : Ty.doubleScalarSize();
});
}
case FewerElements: {
- return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.halfElements(); });
+ return findLegalizableSize(
+ Aspect, [&](LLT Ty) -> LLT { return Ty.halfElements(); });
}
case MoreElements: {
- return findLegalType(Aspect,
- [&](LLT Ty) -> LLT { return Ty.doubleElements(); });
+ return findLegalizableSize(
+ Aspect, [&](LLT Ty) -> LLT { return Ty.doubleElements(); });
}
}
}
+
+bool LegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ return false;
+}
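For reference, the tables getAction consults are populated by each target's LegalizerInfo subclass, and the new Custom action is routed to that subclass's legalizeCustom override (the base-class default above simply fails). A minimal sketch of a hypothetical target constructor:

    // In SomeTargetLegalizerInfo's constructor (target and choices invented):
    setAction({TargetOpcode::G_ADD, LLT::scalar(32)}, Legal);
    setAction({TargetOpcode::G_ADD, LLT::scalar(8)}, WidenScalar);
    setAction({TargetOpcode::G_FNEG, LLT::scalar(32)}, Custom);
    computeTables(); // required before any getAction() query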
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
new file mode 100644
index 0000000..c5d0999
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -0,0 +1,123 @@
+//===- Localizer.cpp ---------------------- Localize some instrs -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the Localizer class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "localizer"
+
+using namespace llvm;
+
+char Localizer::ID = 0;
+INITIALIZE_PASS(Localizer, DEBUG_TYPE,
+ "Move/duplicate certain instructions close to their use", false,
+ false)
+
+Localizer::Localizer() : MachineFunctionPass(ID) {
+ initializeLocalizerPass(*PassRegistry::getPassRegistry());
+}
+
+void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); }
+
+bool Localizer::shouldLocalize(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+  // Constant-like instructions should be close to their users.
+ // We don't want long live-ranges for them.
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FRAME_INDEX:
+ return true;
+ }
+}
+
+bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def,
+ MachineBasicBlock *&InsertMBB) {
+ MachineInstr &MIUse = *MOUse.getParent();
+ InsertMBB = MIUse.getParent();
+ if (MIUse.isPHI())
+ InsertMBB = MIUse.getOperand(MIUse.getOperandNo(&MOUse) + 1).getMBB();
+ return InsertMBB == Def.getParent();
+}
+
+bool Localizer::runOnMachineFunction(MachineFunction &MF) {
+ // If the ISel pipeline failed, do not bother running that pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n');
+
+ init(MF);
+
+ bool Changed = false;
+ // Keep track of the instructions we localized.
+ // We won't need to process them if we see them later in the CFG.
+ SmallPtrSet<MachineInstr *, 16> LocalizedInstrs;
+ DenseMap<std::pair<MachineBasicBlock *, unsigned>, unsigned> MBBWithLocalDef;
+ // TODO: Do bottom up traversal.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI))
+ continue;
+ DEBUG(dbgs() << "Should localize: " << MI);
+ assert(MI.getDesc().getNumDefs() == 1 &&
+ "More than one definition not supported yet");
+ unsigned Reg = MI.getOperand(0).getReg();
+ // Check if all the users of MI are local.
+      // We are going to invalidate the list of use operands, so we
+      // can't use a range iterator.
+ for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end();
+ MOIt != MOItEnd;) {
+ MachineOperand &MOUse = *MOIt++;
+ // Check if the use is already local.
+ MachineBasicBlock *InsertMBB;
+ DEBUG(MachineInstr &MIUse = *MOUse.getParent();
+ dbgs() << "Checking use: " << MIUse
+ << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
+ if (isLocalUse(MOUse, MI, InsertMBB))
+ continue;
+ DEBUG(dbgs() << "Fixing non-local use\n");
+ Changed = true;
+ auto MBBAndReg = std::make_pair(InsertMBB, Reg);
+ auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg);
+ if (NewVRegIt == MBBWithLocalDef.end()) {
+ // Create the localized instruction.
+ MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI);
+ LocalizedInstrs.insert(LocalizedMI);
+ // Don't try to be smart for the insertion point.
+ // There is no guarantee that the first seen use is the first
+ // use in the block.
+ InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI);
+
+ // Set a new register for the definition.
+ unsigned NewReg =
+ MRI->createGenericVirtualRegister(MRI->getType(Reg));
+ MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
+ LocalizedMI->getOperand(0).setReg(NewReg);
+ NewVRegIt =
+ MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
+ DEBUG(dbgs() << "Inserted: " << *LocalizedMI);
+ }
+ DEBUG(dbgs() << "Update use with: " << PrintReg(NewVRegIt->second)
+ << '\n');
+ // Update the user reg.
+ MOUse.setReg(NewVRegIt->second);
+ }
+ }
+ }
+ return Changed;
+}
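The effect of the new pass, schematically: a constant-like definition whose use sits in another block is re-materialized next to that use, shortening the live range (the original definition may then become dead; register numbers are illustrative):

    // Before:                        After:
    //   bb.0:                          bb.0:
    //     %0 = G_CONSTANT i32 42         %0 = G_CONSTANT i32 42 ; may go dead
    //   bb.1:                          bb.1:
    //     %2 = G_ADD %1, %0              %5 = G_CONSTANT i32 42 ; local copy
    //                                    %2 = G_ADD %1, %5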
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index c04f6e4..4636806 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -15,6 +15,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -54,7 +55,7 @@ void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB,
void MachineIRBuilder::recordInsertions(
std::function<void(MachineInstr *)> Inserted) {
- InsertedInstr = Inserted;
+ InsertedInstr = std::move(Inserted);
}
void MachineIRBuilder::stopRecordingInsertions() {
@@ -82,6 +83,70 @@ MachineInstrBuilder MachineIRBuilder::insertInstr(MachineInstrBuilder MIB) {
return MIB;
}
+MachineInstrBuilder MachineIRBuilder::buildDirectDbgValue(
+ unsigned Reg, const MDNode *Variable, const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return buildInstr(TargetOpcode::DBG_VALUE)
+ .addReg(Reg, RegState::Debug)
+ .addReg(0, RegState::Debug)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildIndirectDbgValue(
+ unsigned Reg, unsigned Offset, const MDNode *Variable, const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return buildInstr(TargetOpcode::DBG_VALUE)
+ .addReg(Reg, RegState::Debug)
+ .addImm(Offset)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
+ const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return buildInstr(TargetOpcode::DBG_VALUE)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
+ unsigned Offset,
+ const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ auto MIB = buildInstr(TargetOpcode::DBG_VALUE);
+ if (auto *CI = dyn_cast<ConstantInt>(&C)) {
+ if (CI->getBitWidth() > 64)
+ MIB.addCImm(CI);
+ else
+ MIB.addImm(CI->getZExtValue());
+ } else if (auto *CFP = dyn_cast<ConstantFP>(&C)) {
+ MIB.addFPImm(CFP);
+ } else {
+ // Insert %noreg if we didn't find a usable constant and had to drop it.
+ MIB.addReg(0U);
+ }
+
+ return MIB.addImm(Offset).addMetadata(Variable).addMetadata(Expr);
+}
+
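A hedged sketch of how a caller might choose among these DBG_VALUE builders, assuming Var and Expr come from an already-validated debug intrinsic and the builder's DebugLoc has been set (lookupVReg and lookupFrameIndex are hypothetical helpers):

  if (unsigned VReg = lookupVReg(V))
    MIRBuilder.buildDirectDbgValue(VReg, Var, Expr);
  else if (const auto *C = dyn_cast<Constant>(V))
    MIRBuilder.buildConstDbgValue(*C, /*Offset=*/0, Var, Expr);
  else if (Optional<int> FI = lookupFrameIndex(V))
    MIRBuilder.buildFIDbgValue(*FI, Var, Expr);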
MachineInstrBuilder MachineIRBuilder::buildFrameIndex(unsigned Res, int Idx) {
assert(MRI->getType(Res).isPointer() && "invalid operand type");
return buildInstr(TargetOpcode::G_FRAME_INDEX)
@@ -101,19 +166,24 @@ MachineInstrBuilder MachineIRBuilder::buildGlobalValue(unsigned Res,
.addGlobalAddress(GV);
}
-MachineInstrBuilder MachineIRBuilder::buildAdd(unsigned Res, unsigned Op0,
+MachineInstrBuilder MachineIRBuilder::buildBinaryOp(unsigned Opcode, unsigned Res, unsigned Op0,
unsigned Op1) {
assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
"invalid operand type");
assert(MRI->getType(Res) == MRI->getType(Op0) &&
MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
- return buildInstr(TargetOpcode::G_ADD)
+ return buildInstr(Opcode)
.addDef(Res)
.addUse(Op0)
.addUse(Op1);
}
+MachineInstrBuilder MachineIRBuilder::buildAdd(unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ return buildBinaryOp(TargetOpcode::G_ADD, Res, Op0, Op1);
+}
+
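After this refactoring the per-opcode builders are one-liners over a single checked helper, so the two calls below build the same instruction (sketch; in-tree the helper may be a private detail of the class):

  MIRBuilder.buildAdd(Dst, LHS, RHS);
  MIRBuilder.buildBinaryOp(TargetOpcode::G_ADD, Dst, LHS, RHS);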
MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
unsigned Op1) {
assert(MRI->getType(Res).isPointer() &&
@@ -126,37 +196,67 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
.addUse(Op1);
}
-MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0,
- unsigned Op1) {
- assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
- "invalid operand type");
- assert(MRI->getType(Res) == MRI->getType(Op0) &&
- MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
+Optional<MachineInstrBuilder>
+MachineIRBuilder::materializeGEP(unsigned &Res, unsigned Op0,
+ const LLT &ValueTy, uint64_t Value) {
+ assert(Res == 0 && "Res is a result argument");
+ assert(ValueTy.isScalar() && "invalid offset type");
- return buildInstr(TargetOpcode::G_SUB)
+ if (Value == 0) {
+ Res = Op0;
+ return None;
+ }
+
+ Res = MRI->createGenericVirtualRegister(MRI->getType(Op0));
+ unsigned TmpReg = MRI->createGenericVirtualRegister(ValueTy);
+
+ buildConstant(TmpReg, Value);
+ return buildGEP(Res, Op0, TmpReg);
+}
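materializeGEP returns None when the offset is zero: Res is simply rebound to Op0 and no instructions are emitted. A hedged usage sketch (BaseReg and ByteOffset are invented names):

  unsigned AddrReg = 0;
  if (auto GEP = MIRBuilder.materializeGEP(AddrReg, BaseReg,
                                           LLT::scalar(64), ByteOffset)) {
    // A G_CONSTANT/G_GEP pair was emitted; *GEP is the G_GEP.
  }
  // Either way AddrReg now names the (possibly unchanged) address.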
+
+MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0,
+ uint32_t NumBits) {
+ assert(MRI->getType(Res).isPointer() &&
+ MRI->getType(Res) == MRI->getType(Op0) && "type mismatch");
+
+ return buildInstr(TargetOpcode::G_PTR_MASK)
.addDef(Res)
.addUse(Op0)
- .addUse(Op1);
+ .addImm(NumBits);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ return buildBinaryOp(TargetOpcode::G_SUB, Res, Op0, Op1);
}
MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0,
unsigned Op1) {
- assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
- "invalid operand type");
- assert(MRI->getType(Res) == MRI->getType(Op0) &&
- MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
+ return buildBinaryOp(TargetOpcode::G_MUL, Res, Op0, Op1);
+}
- return buildInstr(TargetOpcode::G_MUL)
- .addDef(Res)
- .addUse(Op0)
- .addUse(Op1);
+MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ return buildBinaryOp(TargetOpcode::G_AND, Res, Op0, Op1);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildOr(unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ return buildBinaryOp(TargetOpcode::G_OR, Res, Op0, Op1);
}
MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);
}
+MachineInstrBuilder MachineIRBuilder::buildBrIndirect(unsigned Tgt) {
+ assert(MRI->getType(Tgt).isPointer() && "invalid branch destination");
+ return buildInstr(TargetOpcode::G_BRINDIRECT).addUse(Tgt);
+}
+
MachineInstrBuilder MachineIRBuilder::buildCopy(unsigned Res, unsigned Op) {
+ assert(MRI->getType(Res) == LLT() || MRI->getType(Op) == LLT() ||
+ MRI->getType(Res) == MRI->getType(Op));
return buildInstr(TargetOpcode::COPY).addDef(Res).addUse(Op);
}
@@ -253,49 +353,78 @@ MachineInstrBuilder MachineIRBuilder::buildZExt(unsigned Res, unsigned Op) {
MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res,
unsigned Op) {
+ assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector());
+ assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar());
+
unsigned Opcode = TargetOpcode::COPY;
if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits())
Opcode = TargetOpcode::G_SEXT;
else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits())
Opcode = TargetOpcode::G_TRUNC;
+ else
+ assert(MRI->getType(Res) == MRI->getType(Op));
return buildInstr(Opcode).addDef(Res).addUse(Op);
}
-MachineInstrBuilder MachineIRBuilder::buildExtract(ArrayRef<unsigned> Results,
- ArrayRef<uint64_t> Indices,
- unsigned Src) {
-#ifndef NDEBUG
- assert(Results.size() == Indices.size() && "inconsistent number of regs");
- assert(!Results.empty() && "invalid trivial extract");
- assert(std::is_sorted(Indices.begin(), Indices.end()) &&
- "extract offsets must be in ascending order");
+MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res,
+ unsigned Op) {
+ assert(MRI->getType(Res).isScalar() || MRI->getType(Res).isVector());
+ assert(MRI->getType(Res).isScalar() == MRI->getType(Op).isScalar());
- assert(MRI->getType(Src).isValid() && "invalid operand type");
- for (auto Res : Results)
- assert(MRI->getType(Res).isValid() && "invalid operand type");
-#endif
+ unsigned Opcode = TargetOpcode::COPY;
+ if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits())
+ Opcode = TargetOpcode::G_ZEXT;
+ else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits())
+ Opcode = TargetOpcode::G_TRUNC;
+ else
+ assert(MRI->getType(Res) == MRI->getType(Op));
- auto MIB = BuildMI(getMF(), DL, getTII().get(TargetOpcode::G_EXTRACT));
- for (auto Res : Results)
- MIB.addDef(Res);
+ return buildInstr(Opcode).addDef(Res).addUse(Op);
+}
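Both *OrTrunc helpers pick the opcode purely from the relative bit widths, e.g. (sketch):

  // s16 -> s32: G_ZEXT (G_SEXT for buildSExtOrTrunc)
  // s64 -> s32: G_TRUNC
  // s32 -> s32: plain COPY (the types must then match exactly)
  MIRBuilder.buildZExtOrTrunc(Dst32, Src16);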
- MIB.addUse(Src);
+MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) {
+ LLT SrcTy = MRI->getType(Src);
+ LLT DstTy = MRI->getType(Dst);
+ if (SrcTy == DstTy)
+ return buildCopy(Dst, Src);
+
+ unsigned Opcode;
+ if (SrcTy.isPointer() && DstTy.isScalar())
+ Opcode = TargetOpcode::G_PTRTOINT;
+ else if (DstTy.isPointer() && SrcTy.isScalar())
+ Opcode = TargetOpcode::G_INTTOPTR;
+ else {
+ assert(!SrcTy.isPointer() && !DstTy.isPointer() && "no G_ADDRCAST yet");
+ Opcode = TargetOpcode::G_BITCAST;
+ }
- for (auto Idx : Indices)
- MIB.addImm(Idx);
+ return buildInstr(Opcode).addDef(Dst).addUse(Src);
+}
- getMBB().insert(getInsertPt(), MIB);
- if (InsertedInstr)
- InsertedInstr(MIB);
+MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src,
+ uint64_t Index) {
+#ifndef NDEBUG
+ assert(MRI->getType(Src).isValid() && "invalid operand type");
+ assert(MRI->getType(Res).isValid() && "invalid operand type");
+ assert(Index + MRI->getType(Res).getSizeInBits() <=
+ MRI->getType(Src).getSizeInBits() &&
+ "extracting off end of register");
+#endif
- return MIB;
+ if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) {
+ assert(Index == 0 && "extraction past the end of a register");
+ return buildCast(Res, Src);
+ }
+
+ return buildInstr(TargetOpcode::G_EXTRACT)
+ .addDef(Res)
+ .addUse(Src)
+ .addImm(Index);
}
-MachineInstrBuilder
-MachineIRBuilder::buildSequence(unsigned Res,
- ArrayRef<unsigned> Ops,
- ArrayRef<uint64_t> Indices) {
+void MachineIRBuilder::buildSequence(unsigned Res, ArrayRef<unsigned> Ops,
+ ArrayRef<uint64_t> Indices) {
#ifndef NDEBUG
assert(Ops.size() == Indices.size() && "incompatible args");
assert(!Ops.empty() && "invalid trivial sequence");
@@ -307,15 +436,97 @@ MachineIRBuilder::buildSequence(unsigned Res,
assert(MRI->getType(Op).isValid() && "invalid operand type");
#endif
- MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_SEQUENCE);
- MIB.addDef(Res);
+ LLT ResTy = MRI->getType(Res);
+ LLT OpTy = MRI->getType(Ops[0]);
+ unsigned OpSize = OpTy.getSizeInBits();
+ bool MaybeMerge = true;
for (unsigned i = 0; i < Ops.size(); ++i) {
- MIB.addUse(Ops[i]);
- MIB.addImm(Indices[i]);
+ if (MRI->getType(Ops[i]) != OpTy || Indices[i] != i * OpSize) {
+ MaybeMerge = false;
+ break;
+ }
+ }
+
+ if (MaybeMerge && Ops.size() * OpSize == ResTy.getSizeInBits()) {
+ buildMerge(Res, Ops);
+ return;
}
+
+ unsigned ResIn = MRI->createGenericVirtualRegister(ResTy);
+ buildUndef(ResIn);
+
+ for (unsigned i = 0; i < Ops.size(); ++i) {
+ unsigned ResOut =
+ i + 1 == Ops.size() ? Res : MRI->createGenericVirtualRegister(ResTy);
+ buildInsert(ResOut, ResIn, Ops[i], Indices[i]);
+ ResIn = ResOut;
+ }
+}
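So buildSequence now emits a single G_MERGE_VALUES when the pieces are same-typed, contiguous, and exactly cover the result, and otherwise falls back to G_IMPLICIT_DEF plus a chain of G_INSERTs. Illustratively (MIR in comments, names invented):

  // Mergeable: two s32 pieces at indices 0 and 32 into an s64:
  //   %res(s64) = G_MERGE_VALUES %lo(s32), %hi(s32)
  // Not mergeable: one s16 piece at index 8 into an s64:
  //   %u(s64)   = G_IMPLICIT_DEF
  //   %res(s64) = G_INSERT %u(s64), %piece(s16), 8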
+
+MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) {
+ return buildInstr(TargetOpcode::G_IMPLICIT_DEF).addDef(Res);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res,
+ ArrayRef<unsigned> Ops) {
+
+#ifndef NDEBUG
+ assert(!Ops.empty() && "invalid trivial merge");
+ LLT Ty = MRI->getType(Ops[0]);
+ for (auto Reg : Ops)
+ assert(MRI->getType(Reg) == Ty && "type mismatch in input list");
+ assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() ==
+ MRI->getType(Res).getSizeInBits() &&
+ "input operands do not cover output register");
+#endif
+
+ if (Ops.size() == 1)
+ return buildCast(Res, Ops[0]);
+
+ MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES);
+ MIB.addDef(Res);
+ for (unsigned i = 0; i < Ops.size(); ++i)
+ MIB.addUse(Ops[i]);
return MIB;
}
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res,
+ unsigned Op) {
+
+#ifndef NDEBUG
+ assert(!Res.empty() && "invalid trivial unmerge");
+ LLT Ty = MRI->getType(Res[0]);
+ for (auto Reg : Res)
+ assert(MRI->getType(Reg) == Ty && "type mismatch in output list");
+ assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() ==
+ MRI->getType(Op).getSizeInBits() &&
+ "output operands do not cover input register");
+#endif
+
+ MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+ for (unsigned i = 0; i < Res.size(); ++i)
+ MIB.addDef(Res[i]);
+ MIB.addUse(Op);
+ return MIB;
+}
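A hedged usage sketch splitting a 64-bit value into its halves (Src64 is an invented name):

  unsigned Lo = MRI.createGenericVirtualRegister(LLT::scalar(32));
  unsigned Hi = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildUnmerge({Lo, Hi}, Src64); // %Lo, %Hi = G_UNMERGE_VALUES %Src64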
+
+MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src,
+ unsigned Op, unsigned Index) {
+ assert(Index + MRI->getType(Op).getSizeInBits() <=
+ MRI->getType(Res).getSizeInBits() &&
+ "insertion past the end of a register");
+
+ if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) {
+ return buildCast(Res, Op);
+ }
+
+ return buildInstr(TargetOpcode::G_INSERT)
+ .addDef(Res)
+ .addUse(Src)
+ .addUse(Op)
+ .addImm(Index);
+}
+
MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
unsigned Res,
bool HasSideEffects) {
@@ -395,9 +606,10 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
if (ResTy.isScalar() || ResTy.isPointer())
assert(MRI->getType(Tst).isScalar() && "type mismatch");
else
- assert(MRI->getType(Tst).isVector() &&
- MRI->getType(Tst).getNumElements() ==
- MRI->getType(Op0).getNumElements() &&
+ assert((MRI->getType(Tst).isScalar() ||
+ (MRI->getType(Tst).isVector() &&
+ MRI->getType(Tst).getNumElements() ==
+ MRI->getType(Op0).getNumElements())) &&
"type mismatch");
#endif
@@ -408,6 +620,47 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
.addUse(Op1);
}
+MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res,
+ unsigned Val,
+ unsigned Elt,
+ unsigned Idx) {
+#ifndef NDEBUG
+ LLT ResTy = MRI->getType(Res);
+ LLT ValTy = MRI->getType(Val);
+ LLT EltTy = MRI->getType(Elt);
+ LLT IdxTy = MRI->getType(Idx);
+ assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type");
+ assert(IdxTy.isScalar() && "invalid operand type");
+ assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch");
+ assert(ResTy.getElementType() == EltTy && "type mismatch");
+#endif
+
+ return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT)
+ .addDef(Res)
+ .addUse(Val)
+ .addUse(Elt)
+ .addUse(Idx);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res,
+ unsigned Val,
+ unsigned Idx) {
+#ifndef NDEBUG
+ LLT ResTy = MRI->getType(Res);
+ LLT ValTy = MRI->getType(Val);
+ LLT IdxTy = MRI->getType(Idx);
+ assert(ValTy.isVector() && "invalid operand type");
+ assert((ResTy.isScalar() || ResTy.isPointer()) && "invalid operand type");
+ assert(IdxTy.isScalar() && "invalid operand type");
+ assert(ValTy.getElementType() == ResTy && "type mismatch");
+#endif
+
+ return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT)
+ .addDef(Res)
+ .addUse(Val)
+ .addUse(Idx);
+}
+
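Note the index is itself a scalar vreg, so a constant index must be materialized first; a sketch:

  unsigned IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
  MIRBuilder.buildConstant(IdxReg, 2);
  MIRBuilder.buildExtractVectorElement(EltReg, VecReg, IdxReg);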
void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src,
bool IsExtend) {
#ifndef NDEBUG
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index cc026ef..677941d 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -1,4 +1,4 @@
-//===- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect -*- C++ -*-==//
+//==- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect --*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -12,17 +12,39 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
#define DEBUG_TYPE "regbankselect"
@@ -36,6 +58,7 @@ static cl::opt<RegBankSelect::Mode> RegBankSelectMode(
"Use the Greedy mode (best local mapping)")));
char RegBankSelect::ID = 0;
+
INITIALIZE_PASS_BEGIN(RegBankSelect, DEBUG_TYPE,
"Assign register bank of generic virtual registers",
false, false);
@@ -47,8 +70,7 @@ INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE,
false)
RegBankSelect::RegBankSelect(Mode RunningMode)
- : MachineFunctionPass(ID), RBI(nullptr), MRI(nullptr), TRI(nullptr),
- MBFI(nullptr), MBPI(nullptr), OptMode(RunningMode) {
+ : MachineFunctionPass(ID), OptMode(RunningMode) {
initializeRegBankSelectPass(*PassRegistry::getPassRegistry());
if (RegBankSelectMode.getNumOccurrences() != 0) {
OptMode = RegBankSelectMode;
@@ -71,6 +93,7 @@ void RegBankSelect::init(MachineFunction &MF) {
MBPI = nullptr;
}
MIRBuilder.setMF(MF);
+ MORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
}
void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -131,9 +154,11 @@ bool RegBankSelect::repairReg(
TargetRegisterInfo::isPhysicalRegister(Dst)) &&
"We are about to create several defs for Dst");
- // Build the instruction used to repair, then clone it at the right places.
- MachineInstr *MI = MIRBuilder.buildCopy(Dst, Src);
- MI->removeFromParent();
+ // Build the instruction used to repair, then clone it at the right
+ // places. We avoid buildCopy because it would check that Src and Dst have
+ // the same type, and the type is only a placeholder when this function is called.
+ MachineInstr *MI =
+ MIRBuilder.buildInstrNoInsert(TargetOpcode::COPY).addDef(Dst).addUse(Src);
DEBUG(dbgs() << "Copy: " << PrintReg(Src) << " to: " << PrintReg(Dst)
<< '\n');
// TODO:
@@ -200,32 +225,30 @@ uint64_t RegBankSelect::getRepairCost(
RBI->copyCost(*DesiredRegBrank, *CurRegBank,
RegisterBankInfo::getSizeInBits(MO.getReg(), *MRI, *TRI));
// TODO: use a dedicated constant for ImpossibleCost.
- if (Cost != UINT_MAX)
+ if (Cost != std::numeric_limits<unsigned>::max())
return Cost;
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Legalization not available yet");
// Return the legalization cost of that repairing.
}
- assert(!TPC->isGlobalISelAbortEnabled() &&
- "Complex repairing not implemented yet");
- return UINT_MAX;
+ return std::numeric_limits<unsigned>::max();
}
-RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping(
+const RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping(
MachineInstr &MI, RegisterBankInfo::InstructionMappings &PossibleMappings,
SmallVectorImpl<RepairingPlacement> &RepairPts) {
assert(!PossibleMappings.empty() &&
"Do not know how to map this instruction");
- RegisterBankInfo::InstructionMapping *BestMapping = nullptr;
+ const RegisterBankInfo::InstructionMapping *BestMapping = nullptr;
MappingCost Cost = MappingCost::ImpossibleCost();
SmallVector<RepairingPlacement, 4> LocalRepairPts;
- for (RegisterBankInfo::InstructionMapping &CurMapping : PossibleMappings) {
- MappingCost CurCost = computeMapping(MI, CurMapping, LocalRepairPts, &Cost);
+ for (const RegisterBankInfo::InstructionMapping *CurMapping :
+ PossibleMappings) {
+ MappingCost CurCost =
+ computeMapping(MI, *CurMapping, LocalRepairPts, &Cost);
if (CurCost < Cost) {
DEBUG(dbgs() << "New best: " << CurCost << '\n');
Cost = CurCost;
- BestMapping = &CurMapping;
+ BestMapping = CurMapping;
RepairPts.clear();
for (RepairingPlacement &RepairPt : LocalRepairPts)
RepairPts.emplace_back(std::move(RepairPt));
@@ -235,7 +258,7 @@ RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping(
// If none of the mapping worked that means they are all impossible.
// Thus, pick the first one and set an impossible repairing point.
// It will trigger the failed isel mode.
- BestMapping = &(*PossibleMappings.begin());
+ BestMapping = *PossibleMappings.begin();
RepairPts.emplace_back(
RepairingPlacement(MI, 0, *TRI, *this, RepairingPlacement::Impossible));
} else
@@ -352,7 +375,7 @@ void RegBankSelect::tryAvoidingSplit(
// the repairing cost because of the PHIs already proceeded
// as already stated.
// Though the code will be correct.
- assert(0 && "Repairing cost may not be accurate");
+ assert(false && "Repairing cost may not be accurate");
} else {
// We need to do non-local repairing. Basically, patch all
// the uses (i.e., phis) that we already proceeded.
@@ -448,6 +471,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping(
// Sums up the repairing cost of MO at each insertion point.
uint64_t RepairCost = getRepairCost(MO, ValMapping);
+
+ // This is an impossible-to-repair cost.
+ if (RepairCost == std::numeric_limits<unsigned>::max())
+ continue;
+
// Bias used for splitting: 5%.
const uint64_t PercentageForBias = 5;
uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100;
@@ -530,9 +558,11 @@ bool RegBankSelect::applyMapping(
llvm_unreachable("Other kind should not happen");
}
}
+
// Second, rewrite the instruction.
DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n');
RBI->applyMapping(OpdMapper);
+
return true;
}
@@ -541,10 +571,10 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) {
// Remember the repairing placement for all the operands.
SmallVector<RepairingPlacement, 4> RepairPts;
- RegisterBankInfo::InstructionMapping BestMapping;
+ const RegisterBankInfo::InstructionMapping *BestMapping;
if (OptMode == RegBankSelect::Mode::Fast) {
- BestMapping = RBI->getInstrMapping(MI);
- MappingCost DefaultCost = computeMapping(MI, BestMapping, RepairPts);
+ BestMapping = &RBI->getInstrMapping(MI);
+ MappingCost DefaultCost = computeMapping(MI, *BestMapping, RepairPts);
(void)DefaultCost;
if (DefaultCost == MappingCost::ImpossibleCost())
return false;
@@ -553,16 +583,16 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) {
RBI->getInstrPossibleMappings(MI);
if (PossibleMappings.empty())
return false;
- BestMapping = std::move(findBestMapping(MI, PossibleMappings, RepairPts));
+ BestMapping = &findBestMapping(MI, PossibleMappings, RepairPts);
}
// Make sure the mapping is valid for MI.
- assert(BestMapping.verify(MI) && "Invalid instruction mapping");
+ assert(BestMapping->verify(MI) && "Invalid instruction mapping");
- DEBUG(dbgs() << "Best Mapping: " << BestMapping << '\n');
+ DEBUG(dbgs() << "Best Mapping: " << *BestMapping << '\n');
// After this call, MI may not be valid anymore.
// Do not use it.
- return applyMapping(MI, BestMapping, RepairPts);
+ return applyMapping(MI, *BestMapping, RepairPts);
}
bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
@@ -585,18 +615,12 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
// LegalizerInfo as it's currently in the separate GlobalISel library.
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) {
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
- if (!TPC->isGlobalISelAbortEnabled()) {
- MF.getProperties().set(
- MachineFunctionProperties::Property::FailedISel);
- return false;
- }
- std::string ErrStorage;
- raw_string_ostream Err(ErrStorage);
- Err << "Instruction is not legal: " << MI << '\n';
- report_fatal_error(Err.str());
+ reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+ "instruction is not legal", MI);
+ return false;
}
}
}
@@ -622,9 +646,8 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
continue;
if (!assignInstr(MI)) {
- if (TPC->isGlobalISelAbortEnabled())
- report_fatal_error("Unable to map instruction");
- MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+ "unable to map instruction", MI);
return false;
}
}
@@ -640,11 +663,8 @@ RegBankSelect::RepairingPlacement::RepairingPlacement(
MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo &TRI, Pass &P,
RepairingPlacement::RepairingKind Kind)
// Default is, we are going to insert code to repair OpIdx.
- : Kind(Kind),
- OpIdx(OpIdx),
- CanMaterialize(Kind != RepairingKind::Impossible),
- HasSplit(false),
- P(P) {
+ : Kind(Kind), OpIdx(OpIdx),
+ CanMaterialize(Kind != RepairingKind::Impossible), P(P) {
const MachineOperand &MO = MI.getOperand(OpIdx);
assert(MO.isReg() && "Trying to repair a non-reg operand");
@@ -849,7 +869,7 @@ bool RegBankSelect::EdgeInsertPoint::canMaterialize() const {
}
RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq)
- : LocalCost(0), NonLocalCost(0), LocalFreq(LocalFreq.getFrequency()) {}
+ : LocalFreq(LocalFreq.getFrequency()) {}
bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) {
// Check if this overflows.
@@ -922,7 +942,6 @@ bool RegBankSelect::MappingCost::operator<(const MappingCost &Cost) const {
OtherLocalAdjust = Cost.LocalCost - LocalCost;
else
ThisLocalAdjust = LocalCost - Cost.LocalCost;
-
} else {
ThisLocalAdjust = LocalCost;
OtherLocalAdjust = Cost.LocalCost;
@@ -968,10 +987,12 @@ bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const {
LocalFreq == Cost.LocalFreq;
}
-void RegBankSelect::MappingCost::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegBankSelect::MappingCost::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void RegBankSelect::MappingCost::print(raw_ostream &OS) const {
if (*this == ImpossibleCost()) {
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
index 49d676f..83b21e6 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
@@ -19,10 +19,11 @@ using namespace llvm;
const unsigned RegisterBank::InvalidID = UINT_MAX;
-RegisterBank::RegisterBank(unsigned ID, const char *Name, unsigned Size,
- const uint32_t *CoveredClasses)
+RegisterBank::RegisterBank(
+ unsigned ID, const char *Name, unsigned Size,
+ const uint32_t *CoveredClasses, unsigned NumRegClasses)
: ID(ID), Name(Name), Size(Size) {
- ContainedRegClasses.resize(200);
+ ContainedRegClasses.resize(NumRegClasses);
ContainedRegClasses.setBitsInMask(CoveredClasses);
}
@@ -47,7 +48,7 @@ bool RegisterBank::verify(const TargetRegisterInfo &TRI) const {
// Verify that the Size of the register bank is big enough to cover
// all the register classes it covers.
- assert((getSize() >= SubRC.getSize() * 8) &&
+ assert(getSize() >= TRI.getRegSizeInBits(SubRC) &&
"Size is not big enough for all the subclasses!");
assert(covers(SubRC) && "Not all subclasses are covered");
}
@@ -75,9 +76,11 @@ bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
return &OtherRB == this;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
print(dbgs(), /* IsForDebug */ true, TRI);
}
+#endif
void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
const TargetRegisterInfo *TRI) const {
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index da5ab0b..a841902 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -45,6 +45,10 @@ STATISTIC(NumOperandsMappingsCreated,
"Number of operands mappings dynamically created");
STATISTIC(NumOperandsMappingsAccessed,
"Number of operands mappings dynamically accessed");
+STATISTIC(NumInstructionMappingsCreated,
+ "Number of instruction mappings dynamically created");
+STATISTIC(NumInstructionMappingsAccessed,
+ "Number of instruction mappings dynamically accessed");
const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX;
const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1;
@@ -63,13 +67,6 @@ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks,
#endif // NDEBUG
}
-RegisterBankInfo::~RegisterBankInfo() {
- for (auto It : MapOfPartialMappings)
- delete It.second;
- for (auto It : MapOfValueMappings)
- delete It.second;
-}
-
bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
#ifndef NDEBUG
for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
@@ -133,19 +130,27 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister(
return &RC;
}
-RegisterBankInfo::InstructionMapping
+/// Check whether or not \p MI should be treated like a copy
+/// for the mappings.
+/// Copy-like instructions are special for the mapping because
+/// they don't carry actual register constraints. Moreover,
+/// they sometimes already have register classes assigned, and we can
+/// just use those instead of failing to provide a generic mapping.
+static bool isCopyLike(const MachineInstr &MI) {
+ return MI.isCopy() || MI.isPHI() ||
+ MI.getOpcode() == TargetOpcode::REG_SEQUENCE;
+}
+
+const RegisterBankInfo::InstructionMapping &
RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
// For copies we want to walk over the operands and try to find one
// that has a register bank since the instruction itself will not get
// us any constraint.
- bool isCopyLike = MI.isCopy() || MI.isPHI();
+ bool IsCopyLike = isCopyLike(MI);
// For copy like instruction, only the mapping of the definition
// is important. The rest is not constrained.
- unsigned NumOperandsForMapping = isCopyLike ? 1 : MI.getNumOperands();
+ unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands();
- RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1,
- /*OperandsMapping*/ nullptr,
- NumOperandsForMapping);
const MachineFunction &MF = *MI.getParent()->getParent();
const TargetSubtargetInfo &STI = MF.getSubtarget();
const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
@@ -175,7 +180,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
// For copy-like instruction, we want to reuse the register bank
// that is already set on Reg, if any, since those instructions do
// not have any constraints.
- const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr;
+ const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr;
if (!CurRegBank) {
// If this is a target specific instruction, we can deduce
// the register bank from the encoding constraints.
@@ -184,15 +189,15 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
// All our attempts failed, give up.
CompleteMapping = false;
- if (!isCopyLike)
+ if (!IsCopyLike)
// MI does not carry enough information to guess the mapping.
- return InstructionMapping();
+ return getInvalidInstructionMapping();
continue;
}
}
const ValueMapping *ValMapping =
&getValueMapping(0, getSizeInBits(Reg, MRI, TRI), *CurRegBank);
- if (isCopyLike) {
+ if (IsCopyLike) {
OperandsMapping[0] = ValMapping;
CompleteMapping = true;
break;
@@ -200,13 +205,15 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
OperandsMapping[OpIdx] = ValMapping;
}
- if (isCopyLike && !CompleteMapping)
+ if (IsCopyLike && !CompleteMapping)
// No way to deduce the type from what we have.
- return InstructionMapping();
+ return getInvalidInstructionMapping();
assert(CompleteMapping && "Setting an uncomplete mapping");
- Mapping.setOperandsMapping(getOperandsMapping(OperandsMapping));
- return Mapping;
+ return getInstructionMapping(
+ DefaultMappingID, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping(OperandsMapping),
+ NumOperandsForMapping);
}
/// Hashing function for PartialMapping.
@@ -234,8 +241,8 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length,
++NumPartialMappingsCreated;
- const PartialMapping *&PartMapping = MapOfPartialMappings[Hash];
- PartMapping = new PartialMapping{StartIdx, Length, RegBank};
+ auto &PartMapping = MapOfPartialMappings[Hash];
+ PartMapping = llvm::make_unique<PartialMapping>(StartIdx, Length, RegBank);
return *PartMapping;
}
@@ -268,8 +275,8 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown,
++NumValueMappingsCreated;
- const ValueMapping *&ValMapping = MapOfValueMappings[Hash];
- ValMapping = new ValueMapping{BreakDown, NumBreakDowns};
+ auto &ValMapping = MapOfValueMappings[Hash];
+ ValMapping = llvm::make_unique<ValueMapping>(BreakDown, NumBreakDowns);
return *ValMapping;
}
@@ -282,9 +289,9 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
// The addresses of the value mapping are unique.
// Therefore, we can use them directly to hash the operand mapping.
hash_code Hash = hash_combine_range(Begin, End);
- const auto &It = MapOfOperandsMappings.find(Hash);
- if (It != MapOfOperandsMappings.end())
- return It->second;
+ auto &Res = MapOfOperandsMappings[Hash];
+ if (Res)
+ return Res.get();
++NumOperandsMappingsCreated;
@@ -293,8 +300,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
// mapping, because we use the pointer of the ValueMapping
// to hash and we expect them to uniquely identify an instance
// of value mapping.
- ValueMapping *&Res = MapOfOperandsMappings[Hash];
- Res = new ValueMapping[std::distance(Begin, End)];
+ Res = llvm::make_unique<ValueMapping[]>(std::distance(Begin, End));
unsigned Idx = 0;
for (Iterator It = Begin; It != End; ++It, ++Idx) {
const ValueMapping *ValMap = *It;
@@ -302,7 +308,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
continue;
Res[Idx] = *ValMap;
}
- return Res;
+ return Res.get();
}
const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping(
@@ -317,9 +323,44 @@ const RegisterBankInfo::ValueMapping *RegisterBankInfo::getOperandsMapping(
return getOperandsMapping(OpdsMapping.begin(), OpdsMapping.end());
}
-RegisterBankInfo::InstructionMapping
+static hash_code
+hashInstructionMapping(unsigned ID, unsigned Cost,
+ const RegisterBankInfo::ValueMapping *OperandsMapping,
+ unsigned NumOperands) {
+ return hash_combine(ID, Cost, OperandsMapping, NumOperands);
+}
+
+const RegisterBankInfo::InstructionMapping &
+RegisterBankInfo::getInstructionMappingImpl(
+ bool IsInvalid, unsigned ID, unsigned Cost,
+ const RegisterBankInfo::ValueMapping *OperandsMapping,
+ unsigned NumOperands) const {
+ assert(((IsInvalid && ID == InvalidMappingID && Cost == 0 &&
+ OperandsMapping == nullptr && NumOperands == 0) ||
+ !IsInvalid) &&
+ "Mismatch argument for invalid input");
+ ++NumInstructionMappingsAccessed;
+
+ hash_code Hash =
+ hashInstructionMapping(ID, Cost, OperandsMapping, NumOperands);
+ const auto &It = MapOfInstructionMappings.find(Hash);
+ if (It != MapOfInstructionMappings.end())
+ return *It->second;
+
+ ++NumInstructionMappingsCreated;
+
+ auto &InstrMapping = MapOfInstructionMappings[Hash];
+ if (IsInvalid)
+ InstrMapping = llvm::make_unique<InstructionMapping>();
+ else
+ InstrMapping = llvm::make_unique<InstructionMapping>(
+ ID, Cost, OperandsMapping, NumOperands);
+ return *InstrMapping;
+}
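The interesting part of this change is the ownership model: every InstructionMapping is now uniqued behind a hash and owned by the RegisterBankInfo, so clients hold stable const references instead of value copies. The same pattern in miniature (self-contained sketch, not the LLVM types):

  #include <cstdint>
  #include <memory>
  #include <unordered_map>

  struct Mapping { unsigned ID, Cost; };

  const Mapping &getUniqued(
      std::unordered_map<uint64_t, std::unique_ptr<Mapping>> &Pool,
      unsigned ID, unsigned Cost) {
    uint64_t Key = (uint64_t(ID) << 32) | Cost; // stand-in for hash_combine
    auto &Slot = Pool[Key];
    if (!Slot)
      Slot.reset(new Mapping{ID, Cost}); // created once; address stays stable
    return *Slot;
  }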
+
+const RegisterBankInfo::InstructionMapping &
RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
- RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
llvm_unreachable("The target must implement this");
@@ -329,14 +370,14 @@ RegisterBankInfo::InstructionMappings
RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const {
InstructionMappings PossibleMappings;
// Put the default mapping first.
- PossibleMappings.push_back(getInstrMapping(MI));
+ PossibleMappings.push_back(&getInstrMapping(MI));
// Then the alternative mapping, if any.
InstructionMappings AltMappings = getInstrAlternativeMappings(MI);
- for (InstructionMapping &AltMapping : AltMappings)
- PossibleMappings.emplace_back(std::move(AltMapping));
+ for (const InstructionMapping *AltMapping : AltMappings)
+ PossibleMappings.push_back(AltMapping);
#ifndef NDEBUG
- for (const InstructionMapping &Mapping : PossibleMappings)
- assert(Mapping.verify(MI) && "Mapping is invalid");
+ for (const InstructionMapping *Mapping : PossibleMappings)
+ assert(Mapping->verify(MI) && "Mapping is invalid");
#endif
return PossibleMappings;
}
@@ -349,6 +390,7 @@ RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
DEBUG(dbgs() << "Applying default-like mapping\n");
for (unsigned OpIdx = 0,
EndIdx = OpdMapper.getInstrMapping().getNumOperands();
@@ -359,6 +401,13 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
DEBUG(dbgs() << " is not a register, nothing to be done\n");
continue;
}
+ if (!MO.getReg()) {
+ DEBUG(dbgs() << " is %%noreg, nothing to be done\n");
+ continue;
+ }
+ assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns !=
+ 0 &&
+ "Invalid mapping");
assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns ==
1 &&
"This mapping is too complex for this function");
@@ -368,9 +417,25 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
continue;
}
- DEBUG(dbgs() << " changed, replace " << MO.getReg());
- MO.setReg(*NewRegs.begin());
- DEBUG(dbgs() << " with " << MO.getReg());
+ unsigned OrigReg = MO.getReg();
+ unsigned NewReg = *NewRegs.begin();
+ DEBUG(dbgs() << " changed, replace " << PrintReg(OrigReg, nullptr));
+ MO.setReg(NewReg);
+ DEBUG(dbgs() << " with " << PrintReg(NewReg, nullptr));
+
+ // The OperandsMapper creates plain scalar, we may have to fix that.
+ // Check if the types match and if not, fix that.
+ LLT OrigTy = MRI.getType(OrigReg);
+ LLT NewTy = MRI.getType(NewReg);
+ if (OrigTy != NewTy) {
+ assert(OrigTy.getSizeInBits() == NewTy.getSizeInBits() &&
+ "Types with difference size cannot be handled by the default "
+ "mapping");
+ DEBUG(dbgs() << "\nChange type of new opd from " << NewTy << " to "
+ << OrigTy);
+ MRI.setType(NewReg, OrigTy);
+ }
+ DEBUG(dbgs() << '\n');
}
}
@@ -394,16 +459,18 @@ unsigned RegisterBankInfo::getSizeInBits(unsigned Reg,
RC = MRI.getRegClass(Reg);
}
assert(RC && "Unable to deduce the register class");
- return RC->getSize() * 8;
+ return TRI.getRegSizeInBits(*RC);
}
//------------------------------------------------------------------------------
// Helper classes implementation.
//------------------------------------------------------------------------------
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::PartialMapping::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
bool RegisterBankInfo::PartialMapping::verify() const {
assert(RegBank && "Register bank not set");
@@ -451,10 +518,12 @@ bool RegisterBankInfo::ValueMapping::verify(unsigned MeaningfulBitWidth) const {
return true;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::ValueMapping::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const {
OS << "#BreakDown: " << NumBreakDowns << " ";
@@ -472,8 +541,7 @@ bool RegisterBankInfo::InstructionMapping::verify(
// Check that all the register operands are properly mapped.
// Check the constructor invariant.
// For PHI, we only care about mapping the definition.
- assert(NumOperands ==
- ((MI.isCopy() || MI.isPHI()) ? 1 : MI.getNumOperands()) &&
+ assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) &&
"NumOperands must match, see constructor");
assert(MI.getParent() && MI.getParent()->getParent() &&
"MI must be connected to a MachineFunction");
@@ -503,10 +571,12 @@ bool RegisterBankInfo::InstructionMapping::verify(
return true;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::InstructionMapping::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const {
OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: ";
@@ -576,6 +646,11 @@ void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) {
for (unsigned &NewVReg : NewVRegsForOpIdx) {
assert(PartMap != ValMapping.end() && "Out-of-bound access");
assert(NewVReg == 0 && "Register has already been created");
+ // The new registers are always bound to a scalar of the right size.
+ // The actual type has to be set when the target does the mapping
+ // of the instruction.
+ // The rationale is that this generic code cannot guess how the
+ // target plans to split the input type.
NewVReg = MRI.createGenericVirtualRegister(LLT::scalar(PartMap->Length));
MRI.setRegBank(NewVReg, *PartMap->RegBank);
++PartMap;
@@ -619,10 +694,12 @@ RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx,
return Res;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RegisterBankInfo::OperandsMapper::dump() const {
print(dbgs(), true);
dbgs() << '\n';
}
+#endif
void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS,
bool ForDebug) const {
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index e500918..5ecaf5c 100644
--- a/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -11,10 +11,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -22,6 +26,23 @@
using namespace llvm;
+unsigned llvm::constrainRegToClass(MachineRegisterInfo &MRI,
+ const TargetInstrInfo &TII,
+ const RegisterBankInfo &RBI,
+ MachineInstr &InsertPt, unsigned Reg,
+ const TargetRegisterClass &RegClass) {
+ if (!RBI.constrainGenericRegister(Reg, RegClass, MRI)) {
+ unsigned NewReg = MRI.createVirtualRegister(&RegClass);
+ BuildMI(*InsertPt.getParent(), InsertPt, InsertPt.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), NewReg)
+ .addReg(Reg);
+ return NewReg;
+ }
+
+ return Reg;
+}
+
+
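A hedged usage sketch: constrain a vreg to a target register class during selection, getting back either the original register or a fresh copy (GPR32RegClass stands in for whatever class the target defines, InsertMI for the instruction to insert the repair copy before):

  unsigned Constrained =
      constrainRegToClass(MRI, TII, RBI, InsertMI, Reg, GPR32RegClass);
  MIB.addUse(Constrained);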
unsigned llvm::constrainOperandRegClass(
const MachineFunction &MF, const TargetRegisterInfo &TRI,
MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
@@ -32,14 +53,76 @@ unsigned llvm::constrainOperandRegClass(
"PhysReg not implemented");
const TargetRegisterClass *RegClass = TII.getRegClass(II, OpIdx, &TRI, MF);
+ return constrainRegToClass(MRI, TII, RBI, InsertPt, Reg, *RegClass);
+}
- if (!RBI.constrainGenericRegister(Reg, *RegClass, MRI)) {
- unsigned NewReg = MRI.createVirtualRegister(RegClass);
- BuildMI(*InsertPt.getParent(), InsertPt, InsertPt.getDebugLoc(),
- TII.get(TargetOpcode::COPY), NewReg)
- .addReg(Reg);
- return NewReg;
+bool llvm::isTriviallyDead(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ // If we can move an instruction, we can remove it. Otherwise, it has
+ // a side-effect of some sort.
+ bool SawStore = false;
+ if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore))
+ return false;
+
+ // Instructions without side-effects are dead iff they only define dead vregs.
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ !MRI.use_nodbg_empty(Reg))
+ return false;
}
+ return true;
+}
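The typical caller shape is a post-rewrite cleanup sweep; the iterator is advanced before erasing so removal is safe (sketch):

  for (auto MII = MBB.begin(), E = MBB.end(); MII != E;) {
    MachineInstr &MI = *MII++;
    if (isTriviallyDead(MI, MRI))
      MI.eraseFromParent();
  }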
- return Reg;
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (TPC.isGlobalISelAbortEnabled())
+ report_fatal_error(R.getMsg());
+ else
+ MORE.emit(R);
+}
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ const char *PassName, StringRef Msg,
+ const MachineInstr &MI) {
+ MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ",
+ MI.getDebugLoc(), MI.getParent());
+ R << Msg << ": " << ore::MNV("Inst", MI);
+ reportGISelFailure(MF, TPC, MORE, R);
+}
+
+Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg,
+ const MachineRegisterInfo &MRI) {
+ MachineInstr *MI = MRI.getVRegDef(VReg);
+ if (MI->getOpcode() != TargetOpcode::G_CONSTANT)
+ return None;
+
+ if (MI->getOperand(1).isImm())
+ return MI->getOperand(1).getImm();
+
+ if (MI->getOperand(1).isCImm() &&
+ MI->getOperand(1).getCImm()->getBitWidth() <= 64)
+ return MI->getOperand(1).getCImm()->getSExtValue();
+
+ return None;
+}
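This gives combiners a guarded constant fold, e.g. (sketch, invented registers):

  // Fold G_ADD %x, 0 into a plain copy during selection.
  if (Optional<int64_t> Cst = getConstantVRegVal(RHS, MRI))
    if (*Cst == 0)
      return MIRBuilder.buildCopy(Dst, LHS);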
+
+const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg,
+ const MachineRegisterInfo &MRI) {
+ MachineInstr *MI = MRI.getVRegDef(VReg);
+ if (TargetOpcode::G_FCONSTANT != MI->getOpcode())
+ return nullptr;
+ return MI->getOperand(1).getFPImm();
}
diff --git a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
index 1ea5349..c6ca49c 100644
--- a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -192,10 +192,7 @@ namespace {
} // end anonymous namespace
char GlobalMerge::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables",
- false, false)
-INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables",
- false, false)
+INITIALIZE_PASS(GlobalMerge, DEBUG_TYPE, "Merge global variables", false, false)
bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Module &M, bool isConst, unsigned AddrSpace) const {
@@ -556,7 +553,12 @@ bool GlobalMerge::doInitialization(Module &M) {
// Grab all non-const globals.
for (auto &GV : M.globals()) {
// Merge is safe for "normal" internal or external globals only
- if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection())
+ if (GV.isDeclaration() || GV.isThreadLocal() ||
+ GV.hasSection() || GV.hasImplicitSection())
+ continue;
+
+ // It's not safe to merge globals that may be preempted
+ if (TM && !TM->shouldAssumeDSOLocal(M, &GV))
continue;
if (!(MergeExternalGlobals && GV.hasExternalLinkage()) &&
diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp
index b9f3d86..ff84053 100644
--- a/contrib/llvm/lib/CodeGen/IfConversion.cpp
+++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "BranchFolding.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
@@ -25,6 +24,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -39,7 +39,7 @@
using namespace llvm;
-#define DEBUG_TYPE "ifcvt"
+#define DEBUG_TYPE "if-converter"
// Hidden options for help debugging.
static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden);
@@ -316,9 +316,9 @@ namespace {
char &llvm::IfConverterID = IfConverter::ID;
-INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false)
+INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
-INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
+INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false)
bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()) || (PredicateFtor && !PredicateFtor(MF)))
@@ -588,19 +588,6 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
return TExit && TExit == FalseBBI.BB;
}
-/// Shrink the provided inclusive range by one instruction.
-/// If the range was one instruction (\p It == \p Begin), It is not modified,
-/// but \p Empty is set to true.
-static inline void shrinkInclusiveRange(
- MachineBasicBlock::iterator &Begin,
- MachineBasicBlock::iterator &It,
- bool &Empty) {
- if (It == Begin)
- Empty = true;
- else
- It--;
-}
-
/// Count duplicated instructions and move the iterators to show where they
/// are.
/// @param TIB True Iterator Begin
@@ -633,10 +620,8 @@ bool IfConverter::CountDuplicatedInstructions(
while (TIB != TIE && FIB != FIE) {
// Skip dbg_value instructions. These do not count.
TIB = skipDebugInstructionsForward(TIB, TIE);
- if(TIB == TIE)
- break;
FIB = skipDebugInstructionsForward(FIB, FIE);
- if(FIB == FIE)
+ if (TIB == TIE || FIB == FIE)
break;
if (!TIB->isIdenticalTo(*FIB))
break;
@@ -656,58 +641,42 @@ bool IfConverter::CountDuplicatedInstructions(
if (TIB == TIE || FIB == FIE)
return true;
// Now, in preparation for counting duplicate instructions at the ends of the
- // blocks, move the end iterators up past any branch instructions.
- --TIE;
- --FIE;
-
- // After this point TIB and TIE define an inclusive range, which means that
- // TIB == TIE is true when there is one more instruction to consider, not at
- // the end. Because we may not be able to go before TIB, we need a flag to
- // indicate a completely empty range.
- bool TEmpty = false, FEmpty = false;
-
- // Upon exit TIE and FIE will both point at the last non-shared instruction.
- // They need to be moved forward to point past the last non-shared
- // instruction if the range they delimit is non-empty.
- auto IncrementEndIteratorsOnExit = make_scope_exit([&]() {
- if (!TEmpty)
- ++TIE;
- if (!FEmpty)
- ++FIE;
- });
+ // blocks, switch to reverse_iterators. Note that getReverse() returns an
+ // iterator that points to the same instruction, unlike std::reverse_iterator.
+ // We have to do our own shifting so that we get the same range.
+ MachineBasicBlock::reverse_iterator RTIE = std::next(TIE.getReverse());
+ MachineBasicBlock::reverse_iterator RFIE = std::next(FIE.getReverse());
+ const MachineBasicBlock::reverse_iterator RTIB = std::next(TIB.getReverse());
+ const MachineBasicBlock::reverse_iterator RFIB = std::next(FIB.getReverse());
if (!TBB.succ_empty() || !FBB.succ_empty()) {
if (SkipUnconditionalBranches) {
- while (!TEmpty && TIE->isUnconditionalBranch())
- shrinkInclusiveRange(TIB, TIE, TEmpty);
- while (!FEmpty && FIE->isUnconditionalBranch())
- shrinkInclusiveRange(FIB, FIE, FEmpty);
+ while (RTIE != RTIB && RTIE->isUnconditionalBranch())
+ ++RTIE;
+ while (RFIE != RFIB && RFIE->isUnconditionalBranch())
+ ++RFIE;
}
}
- // If Dups1 includes all of a block, then don't count duplicate
- // instructions at the end of the blocks.
- if (TEmpty || FEmpty)
- return true;
-
// Count duplicate instructions at the ends of the blocks.
- while (!TEmpty && !FEmpty) {
+ while (RTIE != RTIB && RFIE != RFIB) {
// Skip dbg_value instructions. These do not count.
- TIE = skipDebugInstructionsBackward(TIE, TIB);
- FIE = skipDebugInstructionsBackward(FIE, FIB);
- TEmpty = TIE == TIB && TIE->isDebugValue();
- FEmpty = FIE == FIB && FIE->isDebugValue();
- if (TEmpty || FEmpty)
+ // Note that these are reverse iterators going forward.
+ RTIE = skipDebugInstructionsForward(RTIE, RTIB);
+ RFIE = skipDebugInstructionsForward(RFIE, RFIB);
+ if (RTIE == RTIB || RFIE == RFIB)
break;
- if (!TIE->isIdenticalTo(*FIE))
+ if (!RTIE->isIdenticalTo(*RFIE))
break;
// We have to verify that any branch instructions are the same, and then we
// don't count them toward the # of duplicate instructions.
- if (!TIE->isBranch())
+ if (!RTIE->isBranch())
++Dups2;
- shrinkInclusiveRange(TIB, TIE, TEmpty);
- shrinkInclusiveRange(FIB, FIE, FEmpty);
+ ++RTIE;
+ ++RFIE;
}
+ TIE = std::next(RTIE.getReverse());
+ FIE = std::next(RFIE.getReverse());
return true;
}
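The std::next(...) shifting above compensates for a quirk of MachineBasicBlock iterators: getReverse() yields an iterator to the same instruction, while a conventional std::reverse_iterator dereferences to the element before its base. In comment form:

  // If It points at instruction N:
  //   It.getReverse()            also points at N
  //   std::next(It.getReverse()) points at N-1, which is what
  //   std::reverse_iterator(It)  would dereference to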
@@ -741,25 +710,21 @@ bool IfConverter::RescanInstructions(
static void verifySameBranchInstructions(
MachineBasicBlock *MBB1,
MachineBasicBlock *MBB2) {
- MachineBasicBlock::iterator B1 = MBB1->begin();
- MachineBasicBlock::iterator B2 = MBB2->begin();
- MachineBasicBlock::iterator E1 = std::prev(MBB1->end());
- MachineBasicBlock::iterator E2 = std::prev(MBB2->end());
- bool Empty1 = false, Empty2 = false;
- while (!Empty1 && !Empty2) {
- E1 = skipDebugInstructionsBackward(E1, B1);
- E2 = skipDebugInstructionsBackward(E2, B2);
- Empty1 = E1 == B1 && E1->isDebugValue();
- Empty2 = E2 == B2 && E2->isDebugValue();
-
- if (Empty1 && Empty2)
+ const MachineBasicBlock::reverse_iterator B1 = MBB1->rend();
+ const MachineBasicBlock::reverse_iterator B2 = MBB2->rend();
+ MachineBasicBlock::reverse_iterator E1 = MBB1->rbegin();
+ MachineBasicBlock::reverse_iterator E2 = MBB2->rbegin();
+ while (E1 != B1 && E2 != B2) {
+ E1 = skipDebugInstructionsForward(E1, B1);
+ E2 = skipDebugInstructionsForward(E2, B2);
+ if (E1 == B1 && E2 == B2)
break;
- if (Empty1) {
+ if (E1 == B1) {
assert(!E2->isBranch() && "Branch mis-match, one block is empty.");
break;
}
- if (Empty2) {
+ if (E2 == B2) {
assert(!E1->isBranch() && "Branch mis-match, one block is empty.");
break;
}
@@ -769,8 +734,8 @@ static void verifySameBranchInstructions(
"Branch mis-match, branch instructions don't match.");
else
break;
- shrinkInclusiveRange(B1, E1, Empty1);
- shrinkInclusiveRange(B2, E2, Empty2);
+ ++E1;
+ ++E2;
}
}
#endif
@@ -1353,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) {
return false;
PI = I++;
}
- return true;
+ // Finally see if the last I is indeed a successor to PI.
+ return PI->isSuccessor(&*I);
}
/// Invalidate predecessor BB info so it would be re-analyzed to determine if it
@@ -1508,8 +1474,11 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
DontKill.addLiveIns(NextMBB);
}
+ // Remove the branches from the entry so we can add the contents of the true
+ // block to it.
+ BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+
if (CvtMBB.pred_size() > 1) {
- BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
// Copy instructions in the true block, predicate them, and add them to
// the entry block.
CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
@@ -1518,11 +1487,11 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
// explicitly remove CvtBBI as a successor.
BBI.BB->removeSuccessor(&CvtMBB, true);
} else {
+ // Predicate the instructions in the true block.
RemoveKills(CvtMBB.begin(), CvtMBB.end(), DontKill, *TRI);
PredicateBlock(*CvtBBI, CvtMBB.end(), Cond);
// Merge converted block into entry block.
- BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
MergeBlocks(BBI, *CvtBBI);
}
@@ -1622,8 +1591,11 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB);
}
+ // Remove the branches from the entry so we can add the contents of the true
+ // block to it.
+ BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+
if (CvtMBB.pred_size() > 1) {
- BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
// Copy instructions in the true block, predicate them, and add them to
// the entry block.
CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
@@ -1637,7 +1609,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
PredicateBlock(*CvtBBI, CvtMBB.end(), Cond);
// Now merge the entry of the triangle with the true block.
- BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
MergeBlocks(BBI, *CvtBBI, false);
}
@@ -2183,7 +2154,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
// unknown probabilities into known ones.
// FIXME: This usage is too tricky and in the future we would like to
// eliminate all unknown probabilities in MBB.
- ToBBI.BB->normalizeSuccProbs();
+ if (ToBBI.IsBrAnalyzable)
+ ToBBI.BB->normalizeSuccProbs();
SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.succ_begin(),
FromMBB.succ_end());
@@ -2263,7 +2235,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
// Normalize the probabilities of ToBBI.BB's successors with all adjustment
// we've done above.
- ToBBI.BB->normalizeSuccProbs();
+ if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable)
+ ToBBI.BB->normalizeSuccProbs();
ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());
FromBBI.Predicate.clear();
diff --git a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
index 9588dfb..e308f49 100644
--- a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -22,6 +22,7 @@
// With the help of a runtime that understands the .fault_maps section,
// faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs
// a page fault.
+// Store and LoadStore are also supported.
//
//===----------------------------------------------------------------------===//
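
The newly supported store case is symmetric to the load example in this header. An illustrative sketch (assembly spelling assumed, not taken from the source):

  //  before:                     after:
  //    test %r10, %r10             faulting_store_op [handler: throw_npe],
  //    jz   throw_npe                  movl %esi, (%r10)
  //    movl %esi, (%r10)

The explicit compare-and-branch disappears; a page fault on the store, if any, is routed to throw_npe via the .fault_maps section.
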
@@ -29,21 +30,22 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -151,25 +153,44 @@ class ImplicitNullChecks : public MachineFunctionPass {
const TargetRegisterInfo *TRI = nullptr;
AliasAnalysis *AA = nullptr;
MachineModuleInfo *MMI = nullptr;
+ MachineFrameInfo *MFI = nullptr;
bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
SmallVectorImpl<NullCheck> &NullCheckList);
- MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB,
- MachineBasicBlock *HandlerMBB);
+ MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *HandlerMBB);
void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);
- /// Is \p MI a memory operation that can be used to implicitly null check the
- /// value in \p PointerReg? \p PrevInsts is the set of instruction seen since
+ enum AliasResult {
+ AR_NoAlias,
+ AR_MayAlias,
+ AR_WillAliasEverything
+ };
+ /// Returns AR_NoAlias if the memory operation in \p MI does not alias with
+ /// \p PrevMI, AR_MayAlias if the two may alias, and AR_WillAliasEverything
+ /// if they may alias and every later memory operation must also be assumed
+ /// to alias with \p PrevMI.
+ AliasResult areMemoryOpsAliased(MachineInstr &MI, MachineInstr *PrevMI);
+
+ enum SuitabilityResult {
+ SR_Suitable,
+ SR_Unsuitable,
+ SR_Impossible
+ };
+ /// Return SR_Suitable if \p MI is a memory operation that can be used to
+ /// implicitly null check the value in \p PointerReg, SR_Unsuitable if
+ /// \p MI cannot be used to null check, and SR_Impossible if there is no
+ /// point continuing the search because no later instruction could be used
+ /// either. \p PrevInsts is the set of instructions seen since
/// the explicit null check on \p PointerReg.
- bool isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
- ArrayRef<MachineInstr *> PrevInsts);
+ SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+ ArrayRef<MachineInstr *> PrevInsts);
/// Return true if \p FaultingMI can be hoisted from after the
/// instructions in \p InstsSeenSoFar to before them. Set \p Dependence to a
/// non-null value if we also need to (and legally can) hoist a dependency.
- bool canHoistLoadInst(MachineInstr *FaultingMI, unsigned PointerReg,
- ArrayRef<MachineInstr *> InstsSeenSoFar,
- MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
+ bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg,
+ ArrayRef<MachineInstr *> InstsSeenSoFar,
+ MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
public:
static char ID;
@@ -193,7 +214,7 @@ public:
}
bool ImplicitNullChecks::canHandle(const MachineInstr *MI) {
- if (MI->isCall() || MI->mayStore() || MI->hasUnmodeledSideEffects())
+ if (MI->isCall() || MI->hasUnmodeledSideEffects())
return false;
auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); };
(void)IsRegMask;
@@ -248,7 +269,7 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A,
unsigned RegB = MOB.getReg();
- if (TRI->regsOverlap(RegA, RegB))
+ if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef()))
return false;
}
}
@@ -260,6 +281,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getRegInfo().getTargetRegisterInfo();
MMI = &MF.getMMI();
+ MFI = &MF.getFrameInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
SmallVector<NullCheck, 16> NullCheckList;
@@ -283,36 +305,76 @@ static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
return false;
}
-bool ImplicitNullChecks::isSuitableMemoryOp(
- MachineInstr &MI, unsigned PointerReg, ArrayRef<MachineInstr *> PrevInsts) {
+ImplicitNullChecks::AliasResult
+ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI,
+ MachineInstr *PrevMI) {
+ // If it is not a memory access, skip the check.
+ if (!(PrevMI->mayStore() || PrevMI->mayLoad()))
+ return AR_NoAlias;
+ // A load-load pair may alias, but that is harmless since neither writes.
+ if (!(MI.mayStore() || PrevMI->mayStore()))
+ return AR_NoAlias;
+ // We lost the memoperand info, so conservatively assume they alias. If the
+ // access was a store, there is no point continuing because we will never
+ // be able to prove anything against it later.
+ if (MI.memoperands_empty())
+ return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+ if (PrevMI->memoperands_empty())
+ return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+
+ for (MachineMemOperand *MMO1 : MI.memoperands()) {
+ // MMO1 should have a value because it comes from the operation we would
+ // like to use as the implicit null check.
+ assert(MMO1->getValue() && "MMO1 should have a Value!");
+ for (MachineMemOperand *MMO2 : PrevMI->memoperands()) {
+ if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) {
+ if (PSV->mayAlias(MFI))
+ return AR_MayAlias;
+ continue;
+ }
+ llvm::AliasResult AAResult = AA->alias(
+ MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+ MMO1->getAAInfo()),
+ MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+ MMO2->getAAInfo()));
+ if (AAResult != NoAlias)
+ return AR_MayAlias;
+ }
+ }
+ return AR_NoAlias;
+}
+
+ImplicitNullChecks::SuitabilityResult
+ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+ ArrayRef<MachineInstr *> PrevInsts) {
int64_t Offset;
unsigned BaseReg;
if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) ||
BaseReg != PointerReg)
- return false;
-
- // We want the load to be issued at a sane offset from PointerReg, so that
- // if PointerReg is null then the load reliably page faults.
- if (!(MI.mayLoad() && !MI.isPredicable() && Offset < PageSize))
- return false;
-
- // Finally, we need to make sure that the load instruction actually is
- // loading from PointerReg, and there isn't some re-definition of PointerReg
- // between the compare and the load.
- for (auto *PrevMI : PrevInsts)
- for (auto &PrevMO : PrevMI->operands())
- if (PrevMO.isReg() && PrevMO.getReg() &&
- TRI->regsOverlap(PrevMO.getReg(), PointerReg))
- return false;
-
- return true;
+ return SR_Unsuitable;
+
+ // We want the mem access to be issued at a sane offset from PointerReg,
+ // so that if PointerReg is null then the access reliably page faults.
+ if (!((MI.mayLoad() || MI.mayStore()) && !MI.isPredicable() &&
+ Offset < PageSize))
+ return SR_Unsuitable;
+
+ // Finally, check whether the current memory access aliases with a previous
+ // one.
+ for (auto *PrevMI : PrevInsts) {
+ AliasResult AR = areMemoryOpsAliased(MI, PrevMI);
+ if (AR == AR_WillAliasEverything)
+ return SR_Impossible;
+ if (AR == AR_MayAlias)
+ return SR_Unsuitable;
+ }
+ return SR_Suitable;
}
-bool ImplicitNullChecks::canHoistLoadInst(
- MachineInstr *FaultingMI, unsigned PointerReg,
- ArrayRef<MachineInstr *> InstsSeenSoFar, MachineBasicBlock *NullSucc,
- MachineInstr *&Dependence) {
+bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
+ unsigned PointerReg,
+ ArrayRef<MachineInstr *> InstsSeenSoFar,
+ MachineBasicBlock *NullSucc,
+ MachineInstr *&Dependence) {
auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar);
if (!DepResult.CanReorder)
return false;
@@ -359,7 +421,8 @@ bool ImplicitNullChecks::canHoistLoadInst(
// The Dependency can't be re-defining the base register -- then we won't
// get the memory operation on the address we want. This is already
// checked in \c IsSuitableMemoryOp.
- assert(!TRI->regsOverlap(DependenceMO.getReg(), PointerReg) &&
+ assert(!(DependenceMO.isDef() &&
+ TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) &&
"Should have been checked before!");
}
@@ -481,50 +544,76 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
return false;
MachineInstr *Dependence;
- if (isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar) &&
- canHoistLoadInst(&MI, PointerReg, InstsSeenSoFar, NullSucc,
- Dependence)) {
+ SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar);
+ if (SR == SR_Impossible)
+ return false;
+ if (SR == SR_Suitable &&
+ canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) {
NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
NullSucc, Dependence);
return true;
}
+ // If MI re-defines the PointerReg then we cannot move further.
+ if (any_of(MI.operands(), [&](MachineOperand &MO) {
+ return MO.isReg() && MO.getReg() && MO.isDef() &&
+ TRI->regsOverlap(MO.getReg(), PointerReg);
+ }))
+ return false;
InstsSeenSoFar.push_back(&MI);
}
return false;
}
-/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine
-/// instruction. The FAULTING_LOAD_OP instruction does the same load as LoadMI
-/// (defining the same register), and branches to HandlerMBB if the load
-/// faults. The FAULTING_LOAD_OP instruction is inserted at the end of MBB.
-MachineInstr *
-ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
- MachineBasicBlock *MBB,
- MachineBasicBlock *HandlerMBB) {
+/// Wrap a machine instruction, MI, into a FAULTING machine instruction.
+/// The FAULTING instruction does the same load/store as MI
+/// (defining the same register), and branches to HandlerMBB if the mem access
+/// faults. The FAULTING instruction is inserted at the end of MBB.
+MachineInstr *ImplicitNullChecks::insertFaultingInstr(
+ MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) {
const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for
// all targets.
DebugLoc DL;
- unsigned NumDefs = LoadMI->getDesc().getNumDefs();
+ unsigned NumDefs = MI->getDesc().getNumDefs();
assert(NumDefs <= 1 && "other cases unhandled!");
unsigned DefReg = NoRegister;
if (NumDefs != 0) {
- DefReg = LoadMI->defs().begin()->getReg();
- assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 &&
+ DefReg = MI->defs().begin()->getReg();
+ assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 &&
"expected exactly one def!");
}
- auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg)
- .addMBB(HandlerMBB)
- .addImm(LoadMI->getOpcode());
+ FaultMaps::FaultKind FK;
+ if (MI->mayLoad())
+ FK =
+ MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad;
+ else
+ FK = FaultMaps::FaultingStore;
- for (auto &MO : LoadMI->uses())
- MIB.addOperand(MO);
+ auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg)
+ .addImm(FK)
+ .addMBB(HandlerMBB)
+ .addImm(MI->getOpcode());
+
+ for (auto &MO : MI->uses()) {
+ if (MO.isReg()) {
+ MachineOperand NewMO = MO;
+ if (MO.isUse()) {
+ NewMO.setIsKill(false);
+ } else {
+ assert(MO.isDef() && "Expected def or use");
+ NewMO.setIsDead(false);
+ }
+ MIB.add(NewMO);
+ } else {
+ MIB.add(MO);
+ }
+ }
- MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end());
+ MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
return MIB;
}
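
The pseudo built here has a fixed operand layout: the optional def, the fault kind, the handler block, the wrapped opcode, then the original operands. Roughly, for a wrapped 32-bit load (pseudo-MIR; spelling illustrative only, with FaultingLoad being value 1 in FaultMaps::FaultKind):

  //  %eax = FAULTING_OP 1, <%bb.3>, <opcode of MOV32rm>, %r10, 1, _, 0, _
  //                     |   |        |                   original memory
  //                     |   |        +- wrapped opcode   operands follow
  //                     |   +- handler MBB on fault
  //                     +- FaultMaps::FaultKind (FaultingLoad)
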
@@ -545,18 +634,18 @@ void ImplicitNullChecks::rewriteNullChecks(
NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);
}
- // Insert a faulting load where the conditional branch was originally. We
- // check earlier ensures that this bit of code motion is legal. We do not
- // touch the successors list for any basic block since we haven't changed
- // control flow, we've just made it implicit.
- MachineInstr *FaultingLoad = insertFaultingLoad(
+ // Insert a faulting instruction where the conditional branch was
+ // originally. The check we did earlier ensures that this bit of code
+ // motion is legal. We do not touch the successors list for any basic
+ // block since we haven't changed control flow, we've just made it
+ // implicit.
+ MachineInstr *FaultingInstr = insertFaultingInstr(
NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());
// Now the values defined by MemOperation, if any, are live-in of
// the block of MemOperation.
- // The original load operation may define implicit-defs alongside
- // the loaded value.
+ // The original operation may define implicit-defs alongside
+ // the value.
MachineBasicBlock *MBB = NC.getMemOperation()->getParent();
- for (const MachineOperand &MO : FaultingLoad->operands()) {
+ for (const MachineOperand &MO : FaultingInstr->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
unsigned Reg = MO.getReg();
@@ -588,8 +677,8 @@ void ImplicitNullChecks::rewriteNullChecks(
char ImplicitNullChecks::ID = 0;
char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID;
-INITIALIZE_PASS_BEGIN(ImplicitNullChecks, "implicit-null-checks",
+INITIALIZE_PASS_BEGIN(ImplicitNullChecks, DEBUG_TYPE,
"Implicit null checks", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(ImplicitNullChecks, "implicit-null-checks",
+INITIALIZE_PASS_END(ImplicitNullChecks, DEBUG_TYPE,
"Implicit null checks", false, false)
diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
index 3d81184..eda4f74 100644
--- a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -558,7 +558,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);
// We take the DebugLoc from MI, since OrigMI may be attributed to a
- // different source location.
+ // different source location.
auto *NewMI = LIS.getInstructionFromIndex(DefIdx);
NewMI->setDebugLoc(MI.getDebugLoc());
@@ -643,8 +643,11 @@ void InlineSpiller::reMaterializeAll() {
Edit->eraseVirtReg(Reg);
continue;
}
- assert((LIS.hasInterval(Reg) && !LIS.getInterval(Reg).empty()) &&
- "Reg with empty interval has reference");
+
+ assert(LIS.hasInterval(Reg) &&
+ (!LIS.getInterval(Reg).empty() || !MRI.reg_nodbg_empty(Reg)) &&
+ "Empty and not used live-range?!");
+
RegsToSpill[ResultPos++] = Reg;
}
RegsToSpill.erase(RegsToSpill.begin() + ResultPos, RegsToSpill.end());
@@ -686,7 +689,8 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
return true;
}
-#if !defined(NDEBUG)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
// Dump the range of instructions from B to E with their slot indexes.
static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
MachineBasicBlock::iterator E,
@@ -856,21 +860,46 @@ void InlineSpiller::insertReload(unsigned NewVReg,
++NumReloads;
}
+/// Check if \p Def fully defines a VReg with an undefined value.
+/// If that's the case, that means the value of VReg is actually
+/// not relevant.
+static bool isFullUndefDef(const MachineInstr &Def) {
+ if (!Def.isImplicitDef())
+ return false;
+ assert(Def.getNumOperands() == 1 &&
+ "Implicit def with more than one definition");
+ // We can say that the VReg defined by Def is undef only if it is
+ // fully defined by Def. Otherwise, some of the lanes may not be
+ // undef and the value of the VReg matters.
+ return !Def.getOperand(0).getSubReg();
+}
+
/// insertSpill - Insert a spill of NewVReg after MI.
void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
MachineBasicBlock::iterator MI) {
MachineBasicBlock &MBB = *MI->getParent();
MachineInstrSpan MIS(MI);
- TII.storeRegToStackSlot(MBB, std::next(MI), NewVReg, isKill, StackSlot,
- MRI.getRegClass(NewVReg), &TRI);
+ bool IsRealSpill = true;
+ if (isFullUndefDef(*MI)) {
+ // Don't spill an undef value.
+ // Anything works for undef, in particular keeping the memory
+ // uninitialized is a viable option and it saves code size and
+ // run time.
+ BuildMI(MBB, std::next(MI), MI->getDebugLoc(), TII.get(TargetOpcode::KILL))
+ .addReg(NewVReg, getKillRegState(isKill));
+ IsRealSpill = false;
+ } else
+ TII.storeRegToStackSlot(MBB, std::next(MI), NewVReg, isKill, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI);
LIS.InsertMachineInstrRangeInMaps(std::next(MI), MIS.end());
DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
"spill"));
++NumSpills;
- HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original);
+ if (IsRealSpill)
+ HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original);
}
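
The KILL placeholder above is what makes the undef case cheap. Sketched in rough pseudo-MIR (vreg and frame-index numbers are made up), instead of a real store

  //  %vreg5 = IMPLICIT_DEF
  //  store %vreg5 into <fi#0>   ; wasted work: any value is "correct"

the spiller now emits

  //  %vreg5 = IMPLICIT_DEF
  //  KILL %vreg5                ; no store; uninitialized stack contents
  //                             ; are a valid value for an undef vreg

and, since no store exists, the slot is also kept out of the mergeable-spill bookkeeping.
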
/// spillAroundUses - insert spill code around each use of Reg.
@@ -887,20 +916,10 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
// Debug values are not allowed to affect codegen.
if (MI->isDebugValue()) {
// Modify DBG_VALUE now that the value is in a spill slot.
- bool IsIndirect = MI->isIndirectDebugValue();
- uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
- const MDNode *Var = MI->getDebugVariable();
- const MDNode *Expr = MI->getDebugExpression();
- DebugLoc DL = MI->getDebugLoc();
- DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
MachineBasicBlock *MBB = MI->getParent();
- assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
- "Expected inlined-at fields to agree");
- BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE))
- .addFrameIndex(StackSlot)
- .addImm(Offset)
- .addMetadata(Var)
- .addMetadata(Expr);
+ DEBUG(dbgs() << "Modifying debug info due to spill:\t" << *MI);
+ buildDbgValueForSpill(*MBB, MI, *MI, StackSlot);
+ MBB->erase(MI);
continue;
}
diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index ec35b3f..ee4929c 100644
--- a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -45,6 +45,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/Debug.h"
@@ -68,8 +69,7 @@ class InterleavedAccess : public FunctionPass {
public:
static char ID;
- InterleavedAccess(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
+ InterleavedAccess() : FunctionPass(ID), DT(nullptr), TLI(nullptr) {
initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
}
@@ -84,7 +84,6 @@ public:
private:
DominatorTree *DT;
- const TargetMachine *TM;
const TargetLowering *TLI;
/// The maximum supported interleave factor.
@@ -108,18 +107,16 @@ private:
} // end anonymous namespace.
char InterleavedAccess::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(
- InterleavedAccess, "interleaved-access",
+INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
"Lower interleaved memory accesses to target specific intrinsics", false,
false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_TM_PASS_END(
- InterleavedAccess, "interleaved-access",
+INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
"Lower interleaved memory accesses to target specific intrinsics", false,
false)
-FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
- return new InterleavedAccess(TM);
+FunctionPass *llvm::createInterleavedAccessPass() {
+ return new InterleavedAccess();
}
/// \brief Check if the mask is a DE-interleave mask of the given factor
@@ -426,13 +423,15 @@ bool InterleavedAccess::lowerInterleavedStore(
}
bool InterleavedAccess::runOnFunction(Function &F) {
- if (!TM || !LowerInterleavedAccesses)
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC || !LowerInterleavedAccesses)
return false;
DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ auto &TM = TPC->getTM<TargetMachine>();
+ TLI = TM.getSubtargetImpl(F)->getTargetLowering();
MaxFactor = TLI->getMaxSupportedInterleaveFactor();
// Holds dead instructions that will be erased later.
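
The pass now recovers the TargetMachine from TargetPassConfig instead of carrying a pointer from construction time. A minimal sketch of that pattern for any codegen-dependent IR pass (the class name is a placeholder):

  bool MyCodeGenPass::runOnFunction(Function &F) {
    // Bail out when no codegen pipeline set up a TargetPassConfig,
    // e.g. under a bare 'opt' invocation.
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      return false;
    const TargetMachine &TM = TPC->getTM<TargetMachine>();
    const TargetLowering *TLI = TM.getSubtargetImpl(F)->getTargetLowering();
    // ... use TLI ...
    return false;
  }
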
diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
index afd2406..c6cc909 100644
--- a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) {
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
break;
case Intrinsic::memmove:
M.getOrInsertFunction("memmove",
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
break;
case Intrinsic::memset:
M.getOrInsertFunction("memset",
Type::getInt8PtrTy(Context),
Type::getInt8PtrTy(Context),
Type::getInt32Ty(M.getContext()),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
break;
case Intrinsic::sqrt:
EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl");
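
These hunks drop the trailing nullptr sentinel that the old C-style vararg getOrInsertFunction required; the replacement is a variadic template, so the argument list simply ends at the last parameter type. A hedged sketch of an equivalent standalone call (in this revision the function returns a Constant*):

  LLVMContext &Ctx = M.getContext();
  const DataLayout &DL = M.getDataLayout();
  Constant *MemMove =
      M.getOrInsertFunction("memmove",
                            Type::getInt8PtrTy(Ctx),  // return type
                            Type::getInt8PtrTy(Ctx),  // dest
                            Type::getInt8PtrTy(Ctx),  // src
                            DL.getIntPtrType(Ctx));   // count
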
diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
index 26794e2..f2defb4 100644
--- a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
@@ -31,21 +30,11 @@
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
-// Enable or disable FastISel. Both options are needed, because
-// FastISel is enabled by default with -fast, and we wish to be
-// able to enable or disable fast-isel independently from -O0.
-static cl::opt<cl::boolOrDefault>
-EnableFastISelOption("fast-isel", cl::Hidden,
- cl::desc("Enable the \"fast\" instruction selector"));
-
-static cl::opt<bool>
- EnableGlobalISel("global-isel", cl::Hidden, cl::init(false),
- cl::desc("Enable the \"global\" instruction selector"));
-
void LLVMTargetMachine::initAsmInfo() {
MRI = TheTarget.createMCRegInfo(getTargetTriple().str());
MII = TheTarget.createMCInstrInfo();
@@ -71,8 +60,7 @@ void LLVMTargetMachine::initAsmInfo() {
TmpAsmInfo->setPreserveAsmComments(Options.MCOptions.PreserveAsmComments);
- if (Options.CompressDebugSections)
- TmpAsmInfo->setCompressDebugSections(DebugCompressionType::DCT_ZlibGnu);
+ TmpAsmInfo->setCompressDebugSections(Options.CompressDebugSections);
TmpAsmInfo->setRelaxELFRelocations(Options.RelaxELFRelocations);
@@ -85,7 +73,7 @@ void LLVMTargetMachine::initAsmInfo() {
LLVMTargetMachine::LLVMTargetMachine(const Target &T,
StringRef DataLayoutString,
const Triple &TT, StringRef CPU,
- StringRef FS, TargetOptions Options,
+ StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
: TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) {
@@ -106,109 +94,31 @@ static MCContext *
addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
bool DisableVerify, AnalysisID StartBefore,
AnalysisID StartAfter, AnalysisID StopBefore,
- AnalysisID StopAfter,
- MachineFunctionInitializer *MFInitializer = nullptr) {
-
- // When in emulated TLS mode, add the LowerEmuTLS pass.
- if (TM->Options.EmulatedTLS)
- PM.add(createLowerEmuTLSPass(TM));
-
- PM.add(createPreISelIntrinsicLoweringPass());
-
- // Add internal analysis passes from the target machine.
- PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
-
+ AnalysisID StopAfter) {
// Targets may override createPassConfig to provide a target-specific
// subclass.
TargetPassConfig *PassConfig = TM->createPassConfig(PM);
PassConfig->setStartStopPasses(StartBefore, StartAfter, StopBefore,
StopAfter);
-
// Set PassConfig options provided by TargetMachine.
PassConfig->setDisableVerify(DisableVerify);
-
PM.add(PassConfig);
-
- PassConfig->addIRPasses();
-
- PassConfig->addCodeGenPrepare();
-
- PassConfig->addPassesToHandleExceptions();
-
- PassConfig->addISelPrepare();
-
MachineModuleInfo *MMI = new MachineModuleInfo(TM);
- MMI->setMachineFunctionInitializer(MFInitializer);
PM.add(MMI);
- // Enable FastISel with -fast, but allow that to be overridden.
- TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
- if (EnableFastISelOption == cl::BOU_TRUE ||
- (TM->getOptLevel() == CodeGenOpt::None &&
- TM->getO0WantsFastISel()))
- TM->setFastISel(true);
-
- // Ask the target for an isel.
- if (LLVM_UNLIKELY(EnableGlobalISel)) {
- if (PassConfig->addIRTranslator())
- return nullptr;
-
- PassConfig->addPreLegalizeMachineIR();
-
- if (PassConfig->addLegalizeMachineIR())
- return nullptr;
-
- // Before running the register bank selector, ask the target if it
- // wants to run some passes.
- PassConfig->addPreRegBankSelect();
-
- if (PassConfig->addRegBankSelect())
- return nullptr;
-
- PassConfig->addPreGlobalInstructionSelect();
-
- if (PassConfig->addGlobalInstructionSelect())
- return nullptr;
-
- // Pass to reset the MachineFunction if the ISel failed.
- PM.add(createResetMachineFunctionPass(
- PassConfig->reportDiagnosticWhenGlobalISelFallback()));
-
- // Provide a fallback path when we do not want to abort on
- // not-yet-supported input.
- if (LLVM_UNLIKELY(!PassConfig->isGlobalISelAbortEnabled()) &&
- PassConfig->addInstSelector())
- return nullptr;
-
- } else if (PassConfig->addInstSelector())
+ if (PassConfig->addISelPasses())
return nullptr;
-
PassConfig->addMachinePasses();
-
PassConfig->setInitialized();
return &MMI->getContext();
}
-bool LLVMTargetMachine::addPassesToEmitFile(
- PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType,
- bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter,
- AnalysisID StopBefore, AnalysisID StopAfter,
- MachineFunctionInitializer *MFInitializer) {
- // Add common CodeGen passes.
- MCContext *Context =
- addPassesToGenerateCode(this, PM, DisableVerify, StartBefore, StartAfter,
- StopBefore, StopAfter, MFInitializer);
- if (!Context)
- return true;
-
- if (StopBefore || StopAfter) {
- PM.add(createPrintMIRPass(Out));
- return false;
- }
-
+bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
+ raw_pwrite_stream &Out, CodeGenFileType FileType,
+ MCContext &Context) {
if (Options.MCOptions.MCSaveTempLabels)
- Context->setAllowTemporaryLabels(false);
+ Context.setAllowTemporaryLabels(false);
const MCSubtargetInfo &STI = *getMCSubtargetInfo();
const MCAsmInfo &MAI = *getMCAsmInfo();
@@ -225,14 +135,14 @@ bool LLVMTargetMachine::addPassesToEmitFile(
// Create a code emitter if asked to show the encoding.
MCCodeEmitter *MCE = nullptr;
if (Options.MCOptions.ShowMCEncoding)
- MCE = getTarget().createMCCodeEmitter(MII, MRI, *Context);
+ MCE = getTarget().createMCCodeEmitter(MII, MRI, Context);
MCAsmBackend *MAB =
getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU,
Options.MCOptions);
auto FOut = llvm::make_unique<formatted_raw_ostream>(Out);
MCStreamer *S = getTarget().createAsmStreamer(
- *Context, std::move(FOut), Options.MCOptions.AsmVerbose,
+ Context, std::move(FOut), Options.MCOptions.AsmVerbose,
Options.MCOptions.MCUseDwarfDirectory, InstPrinter, MCE, MAB,
Options.MCOptions.ShowMCInst);
AsmStreamer.reset(S);
@@ -241,7 +151,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(
case CGFT_ObjectFile: {
// Create the code emitter for the target if it exists. If not, .o file
// emission fails.
- MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, *Context);
+ MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, Context);
MCAsmBackend *MAB =
getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU,
Options.MCOptions);
@@ -249,11 +159,11 @@ bool LLVMTargetMachine::addPassesToEmitFile(
return true;
// Don't waste memory on names of temp labels.
- Context->setUseNamesOnTempLabels(false);
+ Context.setUseNamesOnTempLabels(false);
Triple T(getTargetTriple().str());
AsmStreamer.reset(getTarget().createMCObjectStreamer(
- T, *Context, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll,
+ T, Context, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll,
Options.MCOptions.MCIncrementalLinkerCompatible,
/*DWARFMustBeAtTheEnd*/ true));
break;
@@ -261,7 +171,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(
case CGFT_Null:
// The Null output is intended for use for performance analysis and testing,
// not real users.
- AsmStreamer.reset(getTarget().createNullStreamer(*Context));
+ AsmStreamer.reset(getTarget().createNullStreamer(Context));
break;
}
@@ -272,8 +182,28 @@ bool LLVMTargetMachine::addPassesToEmitFile(
return true;
PM.add(Printer);
- PM.add(createFreeMachineFunctionPass());
+ return false;
+}
+bool LLVMTargetMachine::addPassesToEmitFile(
+ PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType,
+ bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter,
+ AnalysisID StopBefore, AnalysisID StopAfter) {
+ // Add common CodeGen passes.
+ MCContext *Context =
+ addPassesToGenerateCode(this, PM, DisableVerify, StartBefore, StartAfter,
+ StopBefore, StopAfter);
+ if (!Context)
+ return true;
+
+ if (StopBefore || StopAfter) {
+ PM.add(createPrintMIRPass(Out));
+ } else {
+ if (addAsmPrinter(PM, Out, FileType, *Context))
+ return true;
+ }
+
+ PM.add(createFreeMachineFunctionPass());
return false;
}
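
A rough sketch of how a client drives this entry point end to end (file name and error handling are illustrative; CGFT_ObjectFile matches this era of the API):

  std::error_code EC;
  raw_fd_ostream OS("out.o", EC, sys::fs::F_None);
  legacy::PassManager PM;
  if (TM->addPassesToEmitFile(PM, OS, TargetMachine::CGFT_ObjectFile,
                              /*DisableVerify=*/true))
    report_fatal_error("target does not support object emission");
  PM.run(M);  // ISel, the machine pipeline, then the AsmPrinter
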
diff --git a/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
new file mode 100644
index 0000000..996d40c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
@@ -0,0 +1,97 @@
+///===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency --===//
+///
+/// The LLVM Compiler Infrastructure
+///
+/// This file is distributed under the University of Illinois Open Source
+/// License. See LICENSE.TXT for details.
+///
+///===---------------------------------------------------------------------===//
+/// \file
+/// This is an alternative analysis pass to MachineBlockFrequencyInfo. The
+/// difference is that with this pass the block frequencies are not computed
+/// when the analysis pass is executed but rather when the BFI result is
+/// explicitly requested by the analysis client.
+///
+///===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lazy-machine-block-freq"
+
+INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Machine Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+ "Lazy Machine Block Frequency Analysis", true, true)
+
+char LazyMachineBlockFrequencyInfoPass::ID = 0;
+
+LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass()
+ : MachineFunctionPass(ID) {
+ initializeLazyMachineBlockFrequencyInfoPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS,
+ const Module *M) const {
+ getBFI().print(OS, M);
+}
+
+void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LazyMachineBlockFrequencyInfoPass::releaseMemory() {
+ OwnedMBFI.reset();
+ OwnedMLI.reset();
+ OwnedMDT.reset();
+}
+
+MachineBlockFrequencyInfo &
+LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const {
+ auto *MBFI = getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (MBFI) {
+ DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n");
+ return *MBFI;
+ }
+
+ auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>();
+ auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n");
+ DEBUG(if (MLI) dbgs() << "LoopInfo is available\n");
+
+ if (!MLI) {
+ DEBUG(dbgs() << "Building LoopInfo on the fly\n");
+ // First create a dominator tree.
+ DEBUG(if (MDT) dbgs() << "DominatorTree is available\n");
+
+ if (!MDT) {
+ DEBUG(dbgs() << "Building DominatorTree on the fly\n");
+ OwnedMDT = make_unique<MachineDominatorTree>();
+ OwnedMDT->getBase().recalculate(*MF);
+ MDT = OwnedMDT.get();
+ }
+
+ // Generate LoopInfo from it.
+ OwnedMLI = make_unique<MachineLoopInfo>();
+ OwnedMLI->getBase().analyze(MDT->getBase());
+ MLI = OwnedMLI.get();
+ }
+
+ OwnedMBFI = make_unique<MachineBlockFrequencyInfo>();
+ OwnedMBFI->calculate(*MF, MBPI, *MLI);
+ return *OwnedMBFI.get();
+}
+
+bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction(
+ MachineFunction &F) {
+ MF = &F;
+ return false;
+}
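
A hypothetical client pass (names assumed) shows the intended usage: declare the dependency unconditionally, but only the paths that actually need profile data pay for the computation:

  void MyMachinePass::getAnalysisUsage(AnalysisUsage &AU) const {
    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool MyMachinePass::runOnMachineFunction(MachineFunction &MF) {
    if (!needsProfileData(MF))  // assumed predicate; most calls return here
      return false;
    MachineBlockFrequencyInfo &MBFI =
        getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
    // ... query MBFI.getBlockFreq(...) ...
    return false;
  }
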
diff --git a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
index 834ed5f..995c58a 100644
--- a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
+++ b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -15,13 +15,22 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <string>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "lexicalscopes"
@@ -38,6 +47,10 @@ void LexicalScopes::reset() {
/// initialize - Scan machine function and construct lexical scope nest.
void LexicalScopes::initialize(const MachineFunction &Fn) {
+ // Don't attempt any lexical scope creation for a NoDebug compile unit.
+ if (Fn.getFunction()->getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return;
reset();
MF = &Fn;
SmallVector<InsnRange, 4> MIRanges;
@@ -54,7 +67,6 @@ void LexicalScopes::initialize(const MachineFunction &Fn) {
void LexicalScopes::extractLexicalScopes(
SmallVectorImpl<InsnRange> &MIRanges,
DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
-
// Scan each instruction and create scopes. First build working set of scopes.
for (const auto &MBB : *MF) {
const MachineInstr *RangeBeginMI = nullptr;
@@ -74,8 +86,9 @@ void LexicalScopes::extractLexicalScopes(
continue;
}
- // Ignore DBG_VALUE. It does not contribute to any instruction in output.
- if (MInsn.isDebugValue())
+ // Ignore DBG_VALUE and similar instructions that do not contribute to any
+ // instruction in the output.
+ if (MInsn.isMetaInstruction())
continue;
if (RangeBeginMI) {
@@ -127,6 +140,10 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) {
LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
const DILocation *IA) {
if (IA) {
+ // Skip scopes inlined from a NoDebug compile unit.
+ if (Scope->getSubprogram()->getUnit()->getEmissionKind() ==
+ DICompileUnit::NoDebug)
+ return getOrCreateLexicalScope(IA);
// Create an abstract scope for inlined function.
getOrCreateAbstractScope(Scope);
// Create an inlined scope for inlined function.
@@ -181,10 +198,9 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,
else
Parent = getOrCreateLexicalScope(InlinedAt);
- I = InlinedLexicalScopeMap.emplace(std::piecewise_construct,
- std::forward_as_tuple(P),
- std::forward_as_tuple(Parent, Scope,
- InlinedAt, false))
+ I = InlinedLexicalScopeMap
+ .emplace(std::piecewise_construct, std::forward_as_tuple(P),
+ std::forward_as_tuple(Parent, Scope, InlinedAt, false))
.first;
return &I->second;
}
@@ -241,7 +257,6 @@ void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
void LexicalScopes::assignInstructionRanges(
SmallVectorImpl<InsnRange> &MIRanges,
DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
-
LexicalScope *PrevLexicalScope = nullptr;
for (const auto &R : MIRanges) {
LexicalScope *S = MI2ScopeMap.lookup(R.first);
@@ -299,9 +314,8 @@ bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) {
return Result;
}
-/// dump - Print data structures.
-void LexicalScope::dump(unsigned Indent) const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LexicalScope::dump(unsigned Indent) const {
raw_ostream &err = dbgs();
err.indent(Indent);
err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n";
@@ -316,5 +330,5 @@ void LexicalScope::dump(unsigned Indent) const {
for (unsigned i = 0, e = Children.size(); i != e; ++i)
if (Children[i] != this)
Children[i]->dump(Indent + 2);
-#endif
}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
index c945376..b5e705f 100644
--- a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -24,13 +24,16 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -40,7 +43,7 @@
using namespace llvm;
-#define DEBUG_TYPE "live-debug-values"
+#define DEBUG_TYPE "livedebugvalues"
STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
@@ -61,6 +64,7 @@ class LiveDebugValues : public MachineFunctionPass {
private:
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
+ const TargetFrameLowering *TFI;
LexicalScopes LS;
/// Keeps track of lexical scopes associated with a user value's source
@@ -127,11 +131,13 @@ private:
if (int RegNo = isDbgValueDescribedByReg(MI)) {
Kind = RegisterKind;
Loc.RegisterLoc.RegNo = RegNo;
- uint64_t Offset =
+ int64_t Offset =
MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0;
// We don't support offsets larger than 4GiB here. They are
// slated to be replaced with DIExpressions anyway.
- if (Offset >= (1ULL << 32))
+ // With indirect debug values used for spill locations, Offset
+ // can be negative.
+ if (Offset == INT64_MIN || std::abs(Offset) >= (1LL << 32))
Kind = InvalidKind;
else
Loc.RegisterLoc.Offset = Offset;
@@ -150,7 +156,9 @@ private:
/// dominates MBB.
bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); }
- void dump() const { MI.dump(); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const { MI.dump(); }
+#endif
bool operator==(const VarLoc &Other) const {
return Var == Other.Var && Loc.Hash == Other.Loc.Hash;
@@ -167,6 +175,11 @@ private:
typedef UniqueVector<VarLoc> VarLocMap;
typedef SparseBitVector<> VarLocSet;
typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB;
+ struct SpillDebugPair {
+ MachineInstr *SpillInst;
+ MachineInstr *DebugInst;
+ };
+ typedef SmallVector<SpillDebugPair, 4> SpillMap;
/// This holds the working set of currently open ranges. For fast
/// access, this is done both as a set of VarLocIDs, and a map of
@@ -216,14 +229,21 @@ private:
}
};
+ bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF,
+ unsigned &Reg);
+ int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg);
+
void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs);
+ void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs, SpillMap &Spills);
void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
const VarLocMap &VarLocIDs);
bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs, VarLocMap &VarLocIDs);
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills,
+ bool transferSpills);
bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
@@ -263,7 +283,7 @@ public:
char LiveDebugValues::ID = 0;
char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
-INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis",
+INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis",
false, false)
/// Default construct and initialize the pass.
@@ -282,6 +302,7 @@ void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const {
// Debug Range Extension Implementation
//===----------------------------------------------------------------------===//
+#ifndef NDEBUG
void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
const VarLocInMBB &V,
const VarLocMap &VarLocIDs,
@@ -300,6 +321,22 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
}
Out << "\n";
}
+#endif
+
+/// Given a spill instruction, extract the register and offset used to
+/// address the spill location in a target independent way.
+int LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI,
+ unsigned &Reg) {
+ assert(MI.hasOneMemOperand() &&
+ "Spill instruction does not have exactly one memory operand?");
+ auto MMOI = MI.memoperands_begin();
+ const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
+ assert(PVal->kind() == PseudoSourceValue::FixedStack &&
+ "Inconsistent memory operand in spill instruction");
+ int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
+ const MachineBasicBlock *MBB = MI.getParent();
+ return TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
+}
/// End all previous ranges related to @MI and start a new range from @MI
/// if it is a DBG_VALUE instr.
@@ -336,8 +373,12 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
SparseBitVector<> KillSet;
for (const MachineOperand &MO : MI.operands()) {
+ // Determine whether the operand is a register def. Assume that call
+ // instructions never clobber SP, because some backends (e.g., AArch64)
+ // never list SP in the regmask.
if (MO.isReg() && MO.isDef() && MO.getReg() &&
- TRI->isPhysicalRegister(MO.getReg())) {
+ TRI->isPhysicalRegister(MO.getReg()) &&
+ !(MI.isCall() && MO.getReg() == SP)) {
// Remove ranges of all aliased registers.
for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
for (unsigned ID : OpenRanges.getVarLocs())
@@ -358,6 +399,91 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
OpenRanges.erase(KillSet, VarLocIDs);
}
+/// Decide if @MI is a spill instruction and return true if it is. We use 2
+/// criteria to make this decision:
+/// - Is this instruction a store to a spill slot?
+/// - Is there a register operand that is both used and killed?
+/// TODO: Store optimization can fold spills into other stores (including
+/// other spills). We do not handle this yet (more than one memory operand).
+bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
+ MachineFunction *MF, unsigned &Reg) {
+ const MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ int FI;
+ const MachineMemOperand *MMO;
+
+ // TODO: Handle multiple stores folded into one.
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ // To identify a spill instruction, use the same criteria as in AsmPrinter.
+ if (!((TII->isStoreToStackSlotPostFE(MI, FI) ||
+ TII->hasStoreToStackSlot(MI, MMO, FI)) &&
+ FrameInfo.isSpillSlotObjectIndex(FI)))
+ return false;
+
+ // In a spill instruction generated by the InlineSpiller the spilled register
+ // has its kill flag set. Return false if we don't find such a register.
+ Reg = 0;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse() && MO.isKill()) {
+ Reg = MO.getReg();
+ break;
+ }
+ }
+ return Reg != 0;
+}
+
+/// A spilled register may indicate that we have to end the current range of
+/// a variable and create a new one for the spill location.
+/// We don't want to insert any instructions in transfer(), so we just create
+/// the DBG_VALUE without inserting it and keep track of it in @Spills.
+/// It will be inserted into the BB when we're done iterating over the
+/// instructions.
+void LiveDebugValues::transferSpillInst(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs,
+ SpillMap &Spills) {
+ unsigned Reg;
+ MachineFunction *MF = MI.getParent()->getParent();
+ if (!isSpillInstruction(MI, MF, Reg))
+ return;
+
+ // Check if the register is the location of a debug value.
+ for (unsigned ID : OpenRanges.getVarLocs()) {
+ if (VarLocIDs[ID].isDescribedByReg() == Reg) {
+ DEBUG(dbgs() << "Spilling Register " << PrintReg(Reg, TRI) << '('
+ << VarLocIDs[ID].Var.getVar()->getName() << ")\n");
+
+ // Create a DBG_VALUE instruction to describe the Var in its spilled
+ // location, but don't insert it yet to avoid invalidating the
+ // iterator in our caller.
+ unsigned SpillBase;
+ int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase);
+ const MachineInstr *DMI = &VarLocIDs[ID].MI;
+ MachineInstr *SpDMI =
+ BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase, 0,
+ DMI->getDebugVariable(), DMI->getDebugExpression());
+ SpDMI->getOperand(1).setImm(SpillOffset);
+ DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
+ SpDMI->print(dbgs(), false, TII));
+
+ // The newly created DBG_VALUE instruction SpDMI must be inserted after
+ // MI. Keep track of the pairing.
+ SpillDebugPair MIP = {&MI, SpDMI};
+ Spills.push_back(MIP);
+
+ // End all previous ranges of Var.
+ OpenRanges.erase(VarLocIDs[ID].Var);
+
+ // Add the VarLoc to OpenRanges.
+ VarLoc VL(*SpDMI, LS);
+ unsigned SpillLocID = VarLocIDs.insert(VL);
+ OpenRanges.insert(SpillLocID, VL.Var);
+ return;
+ }
+ }
+}
+
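
The net effect on the instruction stream, in rough pseudo-MIR (register numbers, slot, and syntax are illustrative only):

  //  before:
  //    DBG_VALUE %edi, _, !"x", !DIExpression()   ; x lives in %edi
  //    MOV32mr %rsp, 1, _, 16, _, %edi<kill>      ; %edi spilled to [rsp+16]
  //
  //  after (new DBG_VALUE recorded in Spills, inserted by the caller):
  //    MOV32mr %rsp, 1, _, 16, _, %edi<kill>
  //    DBG_VALUE %rsp, 16, !"x", !DIExpression()  ; x lives at [rsp+16]
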
/// Terminate all open ranges at the end of the current basic block.
bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
OpenRangesSet &OpenRanges,
@@ -383,10 +509,13 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
/// This routine creates OpenRanges and OutLocs.
bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) {
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
+ SpillMap &Spills, bool transferSpills) {
bool Changed = false;
transferDebugValue(MI, OpenRanges, VarLocIDs);
transferRegisterDef(MI, OpenRanges, VarLocIDs);
+ if (transferSpills)
+ transferSpillInst(MI, OpenRanges, VarLocIDs, Spills);
Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
return Changed;
}
@@ -475,10 +604,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
bool OLChanged = false;
bool MBBJoined = false;
- VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
+ VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
OpenRangesSet OpenRanges; // Ranges that are open until end of bb.
- VarLocInMBB OutLocs; // Ranges that exist beyond bb.
- VarLocInMBB InLocs; // Ranges that are incoming after joining.
+ VarLocInMBB OutLocs; // Ranges that exist beyond bb.
+ VarLocInMBB InLocs; // Ranges that are incoming after joining.
+ SpillMap Spills; // DBG_VALUEs associated with spills.
DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
@@ -490,9 +620,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
Pending;
// Initialize every mbb with OutLocs.
+ // We are not looking at any spill instructions during the initial pass
+ // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE
+ // instructions for spills of registers that are known to be user variables
+ // within the BB in which the spill occurs.
for (auto &MBB : MF)
for (auto &MI : MBB)
- transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+ transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
+ /*transferSpills=*/false);
DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization",
dbgs()));
@@ -524,8 +659,18 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
if (MBBJoined) {
MBBJoined = false;
Changed = true;
+ // Now that we have started to extend ranges across BBs we need to
+ // examine spill instructions to see whether they spill registers that
+ // correspond to user variables.
for (auto &MI : *MBB)
- OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+ OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
+ /*transferSpills=*/true);
+
+ // Add any DBG_VALUE instructions necessitated by spills.
+ for (auto &SP : Spills)
+ MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst),
+ SP.DebugInst);
+ Spills.clear();
DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
"OutLocs after propagating", dbgs()));
@@ -559,6 +704,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
+ TFI = MF.getSubtarget().getFrameLowering();
LS.initialize(MF);
bool Changed = ExtendRanges(MF);
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 0934d8c..0c76478 100644
--- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -45,7 +45,7 @@
using namespace llvm;
-#define DEBUG_TYPE "livedebug"
+#define DEBUG_TYPE "livedebugvars"
static cl::opt<bool>
EnableLDV("live-debug-variables", cl::init(true),
@@ -54,11 +54,11 @@ EnableLDV("live-debug-variables", cl::init(true),
STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted");
char LiveDebugVariables::ID = 0;
-INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars",
+INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE,
"Debug Variable Analysis", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(LiveDebugVariables, "livedebugvars",
+INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE,
"Debug Variable Analysis", false, false)
void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -944,7 +944,7 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,
IsIndirect, Loc.getReg(), offset, Variable, Expression);
else
BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
- .addOperand(Loc)
+ .add(Loc)
.addImm(offset)
.addMetadata(Variable)
.addMetadata(Expression);
@@ -1005,8 +1005,8 @@ bool LiveDebugVariables::doInitialization(Module &M) {
return Pass::doInitialization(M);
}
-#ifndef NDEBUG
-LLVM_DUMP_METHOD void LiveDebugVariables::dump() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveDebugVariables::dump() const {
if (pImpl)
static_cast<LDVImpl*>(pImpl)->print(dbgs());
}
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h
index afe87a5..1d7e3d4 100644
--- a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h
+++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h
@@ -59,7 +59,7 @@ public:
void emitDebugValues(VirtRegMap *VRM);
/// dump - Print data structures to dbgs().
- void dump();
+ void dump() const;
private:
diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
index 623af49..9ef9f23 100644
--- a/contrib/llvm/lib/CodeGen/LiveInterval.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
@@ -863,6 +863,37 @@ void LiveInterval::clearSubRanges() {
SubRanges = nullptr;
}
+void LiveInterval::refineSubRanges(BumpPtrAllocator &Allocator,
+ LaneBitmask LaneMask, std::function<void(LiveInterval::SubRange&)> Apply) {
+
+ LaneBitmask ToApply = LaneMask;
+ for (SubRange &SR : subranges()) {
+ LaneBitmask SRMask = SR.LaneMask;
+ LaneBitmask Matching = SRMask & LaneMask;
+ if (Matching.none())
+ continue;
+
+ SubRange *MatchingRange;
+ if (SRMask == Matching) {
+ // The subrange fits (it does not cover bits outside \p LaneMask).
+ MatchingRange = &SR;
+ } else {
+ // We have to split the subrange into a matching and non-matching part.
+ // Reduce the lane mask of the existing subrange to the non-matching part.
+ SR.LaneMask = SRMask & ~Matching;
+ // Create a new subrange for the matching part.
+ MatchingRange = createSubRangeFrom(Allocator, Matching, SR);
+ }
+ Apply(*MatchingRange);
+ ToApply &= ~Matching;
+ }
+ // Create a new subrange if there are uncovered bits left.
+ if (ToApply.any()) {
+ SubRange *NewRange = createSubRange(Allocator, ToApply);
+ Apply(*NewRange);
+ }
+}
+
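
A hedged sketch of a caller (callback body and names are hypothetical): after the call, every lane in the mask is covered by subranges lying entirely inside it, and Apply has run once on each such subrange, whether reused, split off, or newly created:

  LI.refineSubRanges(Allocator, DefinedLanes,
                     [&](LiveInterval::SubRange &SR) {
    // Record a def at Idx in every subrange covering the refined lanes.
    SR.createDeadDef(Idx, VNInfoAllocator);
  });
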
unsigned LiveInterval::getSize() const {
unsigned Sum = 0;
for (const Segment &S : segments)
@@ -1032,6 +1063,7 @@ void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
// When they exist, Spills.back().start <= LastStart,
// and WriteI[-1].start <= LastStart.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveRangeUpdater::print(raw_ostream &OS) const {
if (!isDirty()) {
if (LR)
@@ -1058,6 +1090,7 @@ void LiveRangeUpdater::print(raw_ostream &OS) const {
LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const {
print(errs());
}
+#endif
// Determine if A and B should be coalesced.
static inline bool coalescable(const LiveRange::Segment &A,
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp
index 70d3483..471dcea 100644
--- a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -1,4 +1,4 @@
-//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===//
+//===- LiveIntervalAnalysis.cpp - Live Interval Analysis ------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,35 +7,52 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the LiveInterval analysis pass which is used
-// by the Linear Scan Register allocator. This pass linearizes the
-// basic blocks of the function in DFS order and computes live intervals for
-// each virtual and physical register.
+/// \file This file implements the LiveInterval analysis pass which is used
+/// by the Linear Scan Register allocator. This pass linearizes the
+/// basic blocks of the function in DFS order and computes live intervals for
+/// each virtual and physical register.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "LiveRangeCalc.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/IR/Value.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
-#include <cmath>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "regalloc"
@@ -59,11 +76,13 @@ static bool EnablePrecomputePhysRegs = false;
#endif // NDEBUG
namespace llvm {
+
cl::opt<bool> UseSegmentSetForPhysRegs(
"use-segment-set-for-physregs", cl::Hidden, cl::init(true),
cl::desc(
"Use segment set for the computation of the live ranges of physregs."));
-}
+
+} // end namespace llvm
void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
@@ -78,8 +97,7 @@ void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-LiveIntervals::LiveIntervals() : MachineFunctionPass(ID),
- DomTree(nullptr), LRCalc(nullptr) {
+LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) {
initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
}
@@ -96,16 +114,14 @@ void LiveIntervals::releaseMemory() {
RegMaskBits.clear();
RegMaskBlocks.clear();
- for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
- delete RegUnitRanges[i];
+ for (LiveRange *LR : RegUnitRanges)
+ delete LR;
RegUnitRanges.clear();
// Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
VNInfoAllocator.Reset();
}
-/// runOnMachineFunction - calculates LiveIntervals
-///
bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
MF = &fn;
MRI = &MF->getRegInfo();
@@ -135,14 +151,13 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
return true;
}
-/// print - Implement the dump method.
void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
OS << "********** INTERVALS **********\n";
// Dump the regunits.
- for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
- if (LiveRange *LR = RegUnitRanges[i])
- OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n';
+ for (unsigned Unit = 0, UnitE = RegUnitRanges.size(); Unit != UnitE; ++Unit)
+ if (LiveRange *LR = RegUnitRanges[Unit])
+ OS << PrintRegUnit(Unit, TRI) << ' ' << *LR << '\n';
// Dump the virtregs.
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
@@ -152,8 +167,8 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
}
OS << "RegMasks:";
- for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
- OS << ' ' << RegMaskSlots[i];
+ for (SlotIndex Idx : RegMaskSlots)
+ OS << ' ' << Idx;
OS << '\n';
printInstrs(OS);
@@ -165,20 +180,17 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LiveIntervals::dumpInstrs() const {
+LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const {
printInstrs(dbgs());
}
#endif
LiveInterval* LiveIntervals::createInterval(unsigned reg) {
- float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ?
- llvm::huge_valf : 0.0F;
+ float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? huge_valf : 0.0F;
return new LiveInterval(reg, Weight);
}
-
-/// computeVirtRegInterval - Compute the live interval of a virtual register,
-/// based on defs and uses.
+/// Compute the live interval of a virtual register, based on defs and uses.
void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
assert(LRCalc && "LRCalc not initialized.");
assert(LI.empty() && "Should only compute empty intervals.");
@@ -200,7 +212,7 @@ void LiveIntervals::computeRegMasks() {
RegMaskBlocks.resize(MF->getNumBlockIDs());
// Find all instructions with regmask operands.
- for (MachineBasicBlock &MBB : *MF) {
+ for (const MachineBasicBlock &MBB : *MF) {
std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()];
RMB.first = RegMaskSlots.size();
@@ -210,7 +222,7 @@ void LiveIntervals::computeRegMasks() {
RegMaskBits.push_back(Mask);
}
- for (MachineInstr &MI : MBB) {
+ for (const MachineInstr &MI : MBB) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isRegMask())
continue;
@@ -245,9 +257,9 @@ void LiveIntervals::computeRegMasks() {
// interference.
//
-/// computeRegUnitInterval - Compute the live range of a register unit, based
-/// on the uses and defs of aliasing registers. The range should be empty,
-/// or contain only dead phi-defs from ABI blocks.
+/// Compute the live range of a register unit, based on the uses and defs of
+/// aliasing registers. The range should be empty, or contain only dead
+/// phi-defs from ABI blocks.
void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
assert(LRCalc && "LRCalc not initialized.");
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
@@ -257,22 +269,30 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
// may share super-registers. That's OK because createDeadDefs() is
// idempotent. It is very rare for a register unit to have multiple roots, so
// uniquing super-registers is probably not worthwhile.
- for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
- for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
- Supers.isValid(); ++Supers) {
- if (!MRI->reg_empty(*Supers))
- LRCalc->createDeadDefs(LR, *Supers);
+ bool IsReserved = true;
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true);
+ Super.isValid(); ++Super) {
+ unsigned Reg = *Super;
+ if (!MRI->reg_empty(Reg))
+ LRCalc->createDeadDefs(LR, Reg);
+ // A register unit is considered reserved if all its roots and all their
+ // super registers are reserved.
+ if (!MRI->isReserved(Reg))
+ IsReserved = false;
}
}
// Now extend LR to reach all uses.
// Ignore uses of reserved registers. We only track defs of those.
- for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
- for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
- Supers.isValid(); ++Supers) {
- unsigned Reg = *Supers;
- if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
- LRCalc->extendToUses(LR, Reg);
+ if (!IsReserved) {
+ for (MCRegUnitRootIterator Root(Unit, TRI); Root.isValid(); ++Root) {
+ for (MCSuperRegIterator Super(*Root, TRI, /*IncludeSelf=*/true);
+ Super.isValid(); ++Super) {
+ unsigned Reg = *Super;
+ if (!MRI->reg_empty(Reg))
+ LRCalc->extendToUses(LR, Reg);
+ }
}
}
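The rewritten loops fold a reservedness check into the dead-def pass so that extendToUses is skipped entirely for reserved units. Stated on its own, the predicate is roughly the following sketch (isUnitReserved is illustrative, not an existing API):

    // A register unit is reserved iff every root register and every
    // super-register of every root is reserved in MRI.
    static bool isUnitReserved(unsigned Unit, const MachineRegisterInfo &MRI,
                               const TargetRegisterInfo &TRI) {
      for (MCRegUnitRootIterator Root(Unit, &TRI); Root.isValid(); ++Root)
        for (MCSuperRegIterator Super(*Root, &TRI, /*IncludeSelf=*/true);
             Super.isValid(); ++Super)
          if (!MRI.isReserved(*Super))
            return false;
      return true;
    }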
@@ -281,11 +301,9 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
LR.flushSegmentSet();
}
-
-/// computeLiveInRegUnits - Precompute the live ranges of any register units
-/// that are live-in to an ABI block somewhere. Register values can appear
-/// without a corresponding def when entering the entry block or a landing pad.
-///
+/// Precompute the live ranges of any register units that are live-in to an ABI
+/// block somewhere. Register values can appear without a corresponding def when
+/// entering the entry block or a landing pad.
void LiveIntervals::computeLiveInRegUnits() {
RegUnitRanges.resize(TRI->getNumRegUnits());
DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
@@ -294,18 +312,15 @@ void LiveIntervals::computeLiveInRegUnits() {
SmallVector<unsigned, 8> NewRanges;
// Check all basic blocks for live-ins.
- for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
- MFI != MFE; ++MFI) {
- const MachineBasicBlock *MBB = &*MFI;
-
+ for (const MachineBasicBlock &MBB : *MF) {
// We only care about ABI blocks: Entry + landing pads.
- if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty())
+ if ((&MBB != &MF->front() && !MBB.isEHPad()) || MBB.livein_empty())
continue;
// Create phi-defs at Begin for all live-in registers.
- SlotIndex Begin = Indexes->getMBBStartIdx(MBB);
- DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber());
- for (const auto &LI : MBB->liveins()) {
+ SlotIndex Begin = Indexes->getMBBStartIdx(&MBB);
+ DEBUG(dbgs() << Begin << "\tBB#" << MBB.getNumber());
+ for (const auto &LI : MBB.liveins()) {
for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) {
unsigned Unit = *Units;
LiveRange *LR = RegUnitRanges[Unit];
@@ -324,16 +339,13 @@ void LiveIntervals::computeLiveInRegUnits() {
DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
// Compute the 'normal' part of the ranges.
- for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) {
- unsigned Unit = NewRanges[i];
+ for (unsigned Unit : NewRanges)
computeRegUnitRange(*RegUnitRanges[Unit], Unit);
- }
}
-
static void createSegmentsForValues(LiveRange &LR,
- iterator_range<LiveInterval::vni_iterator> VNIs) {
- for (auto VNI : VNIs) {
+ iterator_range<LiveInterval::vni_iterator> VNIs) {
+ for (VNInfo *VNI : VNIs) {
if (VNI->isUnused())
continue;
SlotIndex Def = VNI->def;
@@ -341,7 +353,7 @@ static void createSegmentsForValues(LiveRange &LR,
}
}
-typedef SmallVector<std::pair<SlotIndex, VNInfo*>, 16> ShrinkToUsesWorkList;
+using ShrinkToUsesWorkList = SmallVector<std::pair<SlotIndex, VNInfo*>, 16>;
static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
ShrinkToUsesWorkList &WorkList,
@@ -349,7 +361,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
// Keep track of the PHIs that are in use.
SmallPtrSet<VNInfo*, 8> UsedPHIs;
// Blocks that have already been added to WorkList as live-out.
- SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+ SmallPtrSet<const MachineBasicBlock*, 16> LiveOut;
// Extend intervals to reach all uses in WorkList.
while (!WorkList.empty()) {
@@ -368,7 +380,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
!UsedPHIs.insert(VNI).second)
continue;
// The PHI is live, make sure the predecessors are live-out.
- for (auto &Pred : MBB->predecessors()) {
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
if (!LiveOut.insert(Pred).second)
continue;
SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
@@ -384,7 +396,7 @@ static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
// Make sure VNI is live-out from the predecessors.
- for (auto &Pred : MBB->predecessors()) {
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
if (!LiveOut.insert(Pred).second)
continue;
SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
@@ -415,22 +427,20 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
ShrinkToUsesWorkList WorkList;
// Visit all instructions reading li->reg.
- for (MachineRegisterInfo::reg_instr_iterator
- I = MRI->reg_instr_begin(li->reg), E = MRI->reg_instr_end();
- I != E; ) {
- MachineInstr *UseMI = &*(I++);
- if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
+ unsigned Reg = li->reg;
+ for (MachineInstr &UseMI : MRI->reg_instructions(Reg)) {
+ if (UseMI.isDebugValue() || !UseMI.readsVirtualRegister(Reg))
continue;
- SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
+ SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
LiveQueryResult LRQ = li->Query(Idx);
VNInfo *VNI = LRQ.valueIn();
if (!VNI) {
// This shouldn't happen: readsVirtualRegister returns true, but there is
// no live value. It is likely caused by a target getting <undef> flags
// wrong.
- DEBUG(dbgs() << Idx << '\t' << *UseMI
+ DEBUG(dbgs() << Idx << '\t' << UseMI
<< "Warning: Instr claims to read non-existent value in "
- << *li << '\n');
+ << *li << '\n');
continue;
}
// Special case: An early-clobber tied operand reads and writes the
@@ -458,7 +468,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
bool LiveIntervals::computeDeadValues(LiveInterval &LI,
SmallVectorImpl<MachineInstr*> *dead) {
bool MayHaveSplitComponents = false;
- for (auto VNI : LI.valnos) {
+ for (VNInfo *VNI : LI.valnos) {
if (VNI->isUnused())
continue;
SlotIndex Def = VNI->def;
@@ -548,7 +558,7 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
SR.segments.swap(NewLR.segments);
// Remove dead PHI value numbers
- for (auto VNI : SR.valnos) {
+ for (VNInfo *VNI : SR.valnos) {
if (VNI->isUnused())
continue;
const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def);
@@ -571,8 +581,8 @@ void LiveIntervals::extendToIndices(LiveRange &LR,
ArrayRef<SlotIndex> Undefs) {
assert(LRCalc && "LRCalc not initialized.");
LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
- for (unsigned i = 0, e = Indices.size(); i != e; ++i)
- LRCalc->extend(LR, Indices[i], /*PhysReg=*/0, Undefs);
+ for (SlotIndex Idx : Indices)
+ LRCalc->extend(LR, Idx, /*PhysReg=*/0, Undefs);
}
void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
@@ -599,13 +609,11 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
// Find all blocks that are reachable from KillMBB without leaving VNI's live
// range. It is possible that KillMBB itself is reachable, so start a DFS
// from each successor.
- typedef df_iterator_default_set<MachineBasicBlock*,9> VisitedTy;
+ using VisitedTy = df_iterator_default_set<MachineBasicBlock*,9>;
VisitedTy Visited;
- for (MachineBasicBlock::succ_iterator
- SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end();
- SuccI != SuccE; ++SuccI) {
+ for (MachineBasicBlock *Succ : KillMBB->successors()) {
for (df_ext_iterator<MachineBasicBlock*, VisitedTy>
- I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited);
+ I = df_ext_begin(Succ, Visited), E = df_ext_end(Succ, Visited);
I != E;) {
MachineBasicBlock *MBB = *I;
@@ -657,9 +665,9 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
// Find the regunit intervals for the assigned register. They may overlap
// the virtual register live range, cancelling any kills.
RU.clear();
- for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
- ++Units) {
- const LiveRange &RURange = getRegUnit(*Units);
+ for (MCRegUnitIterator Unit(VRM->getPhys(Reg), TRI); Unit.isValid();
+ ++Unit) {
+ const LiveRange &RURange = getRegUnit(*Unit);
if (RURange.empty())
continue;
RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end)));
@@ -802,9 +810,8 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
// Conservatively return true instead of scanning huge predecessor lists.
if (PHIMBB->pred_size() > 100)
return true;
- for (MachineBasicBlock::const_pred_iterator
- PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI)
- if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI)))
+ for (const MachineBasicBlock *Pred : PHIMBB->predecessors())
+ if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(Pred)))
return true;
}
return false;
@@ -831,7 +838,6 @@ LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr &startInst) {
return S;
}
-
//===----------------------------------------------------------------------===//
// Register mask functions
//===----------------------------------------------------------------------===//
@@ -864,7 +870,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
return false;
bool Found = false;
- for (;;) {
+ while (true) {
assert(*SlotI >= LiveI->start);
// Loop over all slots overlapping this segment.
while (*SlotI < LiveI->end) {
@@ -895,7 +901,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
// IntervalUpdate class.
//===----------------------------------------------------------------------===//
-// HMEditor is a toolkit used by handleMove to trim or extend live intervals.
+/// Toolkit used by handleMove to trim or extend live intervals.
class LiveIntervals::HMEditor {
private:
LiveIntervals& LIS;
@@ -1241,10 +1247,12 @@ private:
LiveRange::iterator NewIdxIn = NewIdxOut;
assert(NewIdxIn == LR.find(NewIdx.getBaseIndex()));
const SlotIndex SplitPos = NewIdxDef;
+ OldIdxVNI = OldIdxIn->valno;
// Merge the OldIdxIn and OldIdxOut segments into OldIdxOut.
+ OldIdxOut->valno->def = OldIdxIn->start;
*OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end,
- OldIdxIn->valno);
+ OldIdxOut->valno);
// OldIdxIn and OldIdxVNI are now undef and can be overridden.
// We Slide [NewIdxIn, OldIdxIn) down one position.
// |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -|
@@ -1514,8 +1522,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
}
}
- for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) {
- unsigned Reg = OrigRegs[i];
+ for (unsigned Reg : OrigRegs) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
@@ -1524,16 +1531,16 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
if (!LI.hasAtLeastOneValue())
continue;
- for (LiveInterval::SubRange &S : LI.subranges()) {
+ for (LiveInterval::SubRange &S : LI.subranges())
repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask);
- }
+
repairOldRegInRange(Begin, End, endIdx, LI, Reg);
}
}
void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) {
- for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
- if (LiveRange *LR = getCachedRegUnit(*Units))
+ for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) {
+ if (LiveRange *LR = getCachedRegUnit(*Unit))
if (VNInfo *VNI = LR->getVNInfoAt(Pos))
LR->removeValNo(VNI);
}
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
index fc2f233..b3248e5 100644
--- a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -1,4 +1,4 @@
-//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===//
+//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,16 +16,16 @@
#include "llvm/CodeGen/LiveIntervalUnion.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include <algorithm>
+#include <cassert>
+#include <cstdlib>
using namespace llvm;
#define DEBUG_TYPE "regalloc"
-
// Merge a LiveInterval's segments. Guarantee no overlaps.
void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) {
if (Range.empty())
@@ -64,7 +64,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) {
LiveRange::const_iterator RegEnd = Range.end();
SegmentIter SegPos = Segments.find(RegPos->start);
- for (;;) {
+ while (true) {
assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval");
SegPos.erase();
if (!SegPos.valid())
@@ -126,25 +126,24 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
CheckedFirstInterference = true;
// Quickly skip interference check for empty sets.
- if (VirtReg->empty() || LiveUnion->empty()) {
+ if (LR->empty() || LiveUnion->empty()) {
SeenAllInterferences = true;
return 0;
}
- // In most cases, the union will start before VirtReg.
- VirtRegI = VirtReg->begin();
+ // In most cases, the union will start before LR.
+ LRI = LR->begin();
LiveUnionI.setMap(LiveUnion->getMap());
- LiveUnionI.find(VirtRegI->start);
+ LiveUnionI.find(LRI->start);
}
- LiveInterval::iterator VirtRegEnd = VirtReg->end();
+ LiveRange::const_iterator LREnd = LR->end();
LiveInterval *RecentReg = nullptr;
while (LiveUnionI.valid()) {
- assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg");
+ assert(LRI != LREnd && "Reached end of LR");
// Check for overlapping interference.
- while (VirtRegI->start < LiveUnionI.stop() &&
- VirtRegI->end > LiveUnionI.start()) {
+ while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) {
// This is an overlap, record the interfering register.
LiveInterval *VReg = LiveUnionI.value();
if (VReg != RecentReg && !isSeenInterference(VReg)) {
@@ -161,20 +160,20 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
}
// The iterators are now not overlapping, LiveUnionI has been advanced
- // beyond VirtRegI.
- assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap");
+ // beyond LRI.
+ assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap");
// Advance the iterator that ends first.
- VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start());
- if (VirtRegI == VirtRegEnd)
+ LRI = LR->advanceTo(LRI, LiveUnionI.start());
+ if (LRI == LREnd)
break;
// Detect overlap, handle above.
- if (VirtRegI->start < LiveUnionI.stop())
+ if (LRI->start < LiveUnionI.stop())
continue;
// Still not overlapping. Catch up LiveUnionI.
- LiveUnionI.advanceTo(VirtRegI->start);
+ LiveUnionI.advanceTo(LRI->start);
}
SeenAllInterferences = true;
return InterferingVRegs.size();
diff --git a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
index dcc41c1..cde6ccd 100644
--- a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
+++ b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
@@ -53,7 +53,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) {
continue;
removeReg(Reg);
} else if (O->isRegMask())
- removeRegsInMask(*O, nullptr);
+ removeRegsInMask(*O);
}
// Add uses to the set.
@@ -120,12 +120,11 @@ void LivePhysRegs::print(raw_ostream &OS) const {
OS << "\n";
}
-/// Dumps the currently live registers to the debug output.
-LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
dbgs() << " " << *this;
-#endif
}
+#endif
bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
unsigned Reg) const {
@@ -143,63 +142,84 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
/// Add live-in registers of basic block \p MBB to \p LiveRegs.
void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {
for (const auto &LI : MBB.liveins()) {
- MCSubRegIndexIterator S(LI.PhysReg, TRI);
- if (LI.LaneMask.all() || (LI.LaneMask.any() && !S.isValid())) {
- addReg(LI.PhysReg);
+ unsigned Reg = LI.PhysReg;
+ LaneBitmask Mask = LI.LaneMask;
+ MCSubRegIndexIterator S(Reg, TRI);
+ assert(Mask.any() && "Invalid livein mask");
+ if (Mask.all() || !S.isValid()) {
+ addReg(Reg);
continue;
}
for (; S.isValid(); ++S) {
unsigned SI = S.getSubRegIndex();
- if ((LI.LaneMask & TRI->getSubRegIndexLaneMask(SI)).any())
+ if ((Mask & TRI->getSubRegIndexLaneMask(SI)).any())
addReg(S.getSubReg());
}
}
}
-/// Add pristine registers to the given \p LiveRegs. This function removes
-/// actually saved callee save registers when \p InPrologueEpilogue is false.
-static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF,
- const MachineFrameInfo &MFI,
- const TargetRegisterInfo &TRI) {
- for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+/// Adds all callee saved registers to \p LiveRegs.
+static void addCalleeSavedRegs(LivePhysRegs &LiveRegs,
+ const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
LiveRegs.addReg(*CSR);
+}
+
+/// Adds pristine registers to the given \p LiveRegs. Pristine registers are
+/// callee saved registers that are unused in the function.
+static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid())
+ return;
+ // Add all callee saved regs, then remove the ones that are saved+restored.
+ addCalleeSavedRegs(LiveRegs, MF);
+ // Remove the ones that are saved and restored; what remains is pristine.
for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
LiveRegs.removeReg(Info.getReg());
}
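With the helpers split this way, the usual recipe for querying liveness inside a block reads as below; a sketch assuming a TargetRegisterInfo pointer TRI and a block MBB from the surrounding pass:

    // Registers live on entry to MBB, computed by a backward walk from the
    // live-outs (which now include pristine callee saves via addPristines).
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveOuts(MBB);
    for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend()))
      LiveRegs.stepBackward(MI);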
void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
- // To get the live-outs we simply merge the live-ins of all successors.
- for (const MachineBasicBlock *Succ : MBB.successors())
- addBlockLiveIns(*Succ);
+ if (!MBB.succ_empty()) {
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ addBlockLiveIns(*Succ);
+ } else if (MBB.isReturnBlock()) {
+ // For the return block: Add all callee saved registers that are saved and
+ // restored (somewhere). This does not include callee saved registers that
+ // are unused and hence never saved and restored; those are called pristine.
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid()) {
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ addReg(Info.getReg());
+ }
+ }
}
void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
const MachineFunction &MF = *MBB.getParent();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.isCalleeSavedInfoValid()) {
- if (MBB.isReturnBlock()) {
- // The return block has no successors whose live-ins we could merge
- // below. So instead we add the callee saved registers manually.
- for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
- addReg(*I);
- } else {
- addPristines(*this, MF, MFI, *TRI);
- }
+ if (!MBB.succ_empty()) {
+ addPristines(*this, MF);
+ addLiveOutsNoPristines(MBB);
+ } else if (MBB.isReturnBlock()) {
+ // For the return block: Add all callee saved registers.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid())
+ addCalleeSavedRegs(*this, MF);
}
-
- addLiveOutsNoPristines(MBB);
}
void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {
const MachineFunction &MF = *MBB.getParent();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.isCalleeSavedInfoValid())
- addPristines(*this, MF, MFI, *TRI);
+ addPristines(*this, MF);
addBlockLiveIns(MBB);
}
-void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI,
+void llvm::computeLiveIns(LivePhysRegs &LiveRegs,
+ const MachineRegisterInfo &MRI,
MachineBasicBlock &MBB) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
assert(MBB.livein_empty());
LiveRegs.init(TRI);
LiveRegs.addLiveOutsNoPristines(MBB);
@@ -207,10 +227,12 @@ void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI,
LiveRegs.stepBackward(MI);
for (unsigned Reg : LiveRegs) {
+ if (MRI.isReserved(Reg))
+ continue;
// Skip the register if we are about to add one of its super registers.
bool ContainsSuperReg = false;
for (MCSuperRegIterator SReg(Reg, &TRI); SReg.isValid(); ++SReg) {
- if (LiveRegs.contains(*SReg)) {
+ if (LiveRegs.contains(*SReg) && !MRI.isReserved(*SReg)) {
ContainsSuperReg = true;
break;
}
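Callers of computeLiveIns now pass MachineRegisterInfo instead of TargetRegisterInfo, and reserved registers are filtered out of the resulting live-in list. A hedged usage sketch for a freshly created block (NewMBB is hypothetical); note the function both fills LiveRegs and adds the live-ins to the block:

    // Populate the live-in list of a new, still live-in-empty block.
    LivePhysRegs LiveRegs;
    computeLiveIns(LiveRegs, MF.getRegInfo(), *NewMBB);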
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
index 0128376..8c43c9f 100644
--- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
@@ -20,11 +20,14 @@ using namespace llvm;
#define DEBUG_TYPE "regalloc"
+// Reserve an address that indicates a value that is known to be "undef".
+static VNInfo UndefVNI(0xbad, SlotIndex());
+
void LiveRangeCalc::resetLiveOutMap() {
unsigned NumBlocks = MF->getNumBlockIDs();
Seen.clear();
Seen.resize(NumBlocks);
- EntryInfoMap.clear();
+ EntryInfos.clear();
Map.resize(NumBlocks);
}
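UndefVNI is never used as a real value number; only its address matters, giving the live-out map a third state ("known undef") alongside nullptr ("unknown"). The sentinel idiom in isolation, with illustrative names:

    // A static dummy object yields a unique address that cannot collide with
    // any VNInfo created by the allocator.
    static VNInfo UndefMarker(0xbad, SlotIndex());
    static bool isKnownUndef(const VNInfo *V) { return V == &UndefMarker; }
    static bool isUnknown(const VNInfo *V) { return V == nullptr; }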
@@ -75,34 +78,11 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
LI.createSubRangeFrom(*Alloc, ClassMask, LI);
}
- LaneBitmask Mask = SubMask;
- for (LiveInterval::SubRange &S : LI.subranges()) {
- // A Mask for subregs common to the existing subrange and current def.
- LaneBitmask Common = S.LaneMask & Mask;
- if (Common.none())
- continue;
- LiveInterval::SubRange *CommonRange;
- // A Mask for subregs covered by the subrange but not the current def.
- LaneBitmask RM = S.LaneMask & ~Mask;
- if (RM.any()) {
- // Split the subrange S into two parts: one covered by the current
- // def (CommonRange), and the one not affected by it (updated S).
- S.LaneMask = RM;
- CommonRange = LI.createSubRangeFrom(*Alloc, Common, S);
- } else {
- assert(Common == S.LaneMask);
- CommonRange = &S;
- }
+ LI.refineSubRanges(*Alloc, SubMask,
+ [&MO, this](LiveInterval::SubRange &SR) {
if (MO.isDef())
- createDeadDef(*Indexes, *Alloc, *CommonRange, MO);
- Mask &= ~Common;
- }
- // Create a new SubRange for subregs we did not cover yet.
- if (Mask.any()) {
- LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask);
- if (MO.isDef())
- createDeadDef(*Indexes, *Alloc, *NewRange, MO);
- }
+ createDeadDef(*Indexes, *Alloc, SR, MO);
+ });
}
// Create the def in the main liverange. We do not have to do this if
@@ -289,8 +269,7 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,
if (UndefOnEntry[BN])
return false;
- auto MarkDefined =
- [this,BN,&DefOnEntry,&UndefOnEntry] (MachineBasicBlock &B) -> bool {
+ auto MarkDefined = [BN, &DefOnEntry](MachineBasicBlock &B) -> bool {
for (MachineBasicBlock *S : B.successors())
DefOnEntry[S->getNumber()] = true;
DefOnEntry[BN] = true;
@@ -307,11 +286,19 @@ bool LiveRangeCalc::isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,
// Determine if the exit from the block is reached by some def.
unsigned N = WorkList[i];
MachineBasicBlock &B = *MF->getBlockNumbered(N);
- if (Seen[N] && Map[&B].first != nullptr)
- return MarkDefined(B);
+ if (Seen[N]) {
+ const LiveOutPair &LOB = Map[&B];
+ if (LOB.first != nullptr && LOB.first != &UndefVNI)
+ return MarkDefined(B);
+ }
SlotIndex Begin, End;
std::tie(Begin, End) = Indexes->getMBBRange(&B);
- LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(), End);
+ // Treat End as not belonging to B.
+ // If LR has a segment S that starts at the next block, i.e. [End, ...),
+ // std::upper_bound will return the segment following S. Instead,
+ // S should be treated as the first segment that does not overlap B.
+ LiveRange::iterator UB = std::upper_bound(LR.begin(), LR.end(),
+ End.getPrevSlot());
if (UB != LR.begin()) {
LiveRange::Segment &Seg = *std::prev(UB);
if (Seg.end > Begin) {
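Why the getPrevSlot() adjustment works: segments are half-open [start, end) and sorted by start, and upper_bound compares a SlotIndex against Segment::start. A worked trace of the two lookups:

    // For a segment S = [End, X) that starts exactly where the next block
    // begins: with the old call upper_bound(..., End), std::prev(UB) is S
    // itself, and the "Seg.end > Begin" test falsely reports that S overlaps
    // this block. With upper_bound(..., End.getPrevSlot()), UB lands on S,
    // so std::prev(UB) is the last segment starting strictly before End,
    // the only one that can really overlap the block's range [Begin, End).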
@@ -384,10 +371,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
#endif
FoundUndef |= MBB->pred_empty();
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock *Pred = *PI;
-
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
// Is this a known live-out block?
if (Seen.test(Pred->getNumber())) {
if (VNInfo *VNI = Map[Pred].first) {
@@ -406,7 +390,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
auto EP = LR.extendInBlock(Undefs, Start, End);
VNInfo *VNI = EP.first;
FoundUndef |= EP.second;
- setLiveOutValue(Pred, VNI);
+ setLiveOutValue(Pred, EP.second ? &UndefVNI : VNI);
if (VNI) {
if (TheVNI && TheVNI != VNI)
UniqueVNI = false;
@@ -425,7 +409,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
}
LiveIn.clear();
- FoundUndef |= (TheVNI == nullptr);
+ FoundUndef |= (TheVNI == nullptr || TheVNI == &UndefVNI);
if (Undefs.size() > 0 && FoundUndef)
UniqueVNI = false;
@@ -436,7 +420,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
// If a unique reaching def was found, blit in the live ranges immediately.
if (UniqueVNI) {
- assert(TheVNI != nullptr);
+ assert(TheVNI != nullptr && TheVNI != &UndefVNI);
LiveRangeUpdater Updater(&LR);
for (unsigned BN : WorkList) {
SlotIndex Start, End;
@@ -452,22 +436,26 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
}
// Prepare the defined/undefined bit vectors.
- auto EF = EntryInfoMap.find(&LR);
- if (EF == EntryInfoMap.end()) {
+ EntryInfoMap::iterator Entry;
+ bool DidInsert;
+ std::tie(Entry, DidInsert) = EntryInfos.insert(
+ std::make_pair(&LR, std::make_pair(BitVector(), BitVector())));
+ if (DidInsert) {
+ // Initialize newly inserted entries.
unsigned N = MF->getNumBlockIDs();
- EF = EntryInfoMap.insert({&LR, {BitVector(), BitVector()}}).first;
- EF->second.first.resize(N);
- EF->second.second.resize(N);
+ Entry->second.first.resize(N);
+ Entry->second.second.resize(N);
}
- BitVector &DefOnEntry = EF->second.first;
- BitVector &UndefOnEntry = EF->second.second;
+ BitVector &DefOnEntry = Entry->second.first;
+ BitVector &UndefOnEntry = Entry->second.second;
// Multiple values were found, so transfer the work list to the LiveIn array
// where UpdateSSA will use it as a work list.
LiveIn.reserve(WorkList.size());
for (unsigned BN : WorkList) {
MachineBasicBlock *MBB = MF->getBlockNumbered(BN);
- if (Undefs.size() > 0 && !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry))
+ if (Undefs.size() > 0 &&
+ !isDefOnEntry(LR, Undefs, *MBB, DefOnEntry, UndefOnEntry))
continue;
addLiveInBlock(LR, DomTree->getNode(MBB));
if (MBB == &UseMBB)
@@ -485,9 +473,9 @@ void LiveRangeCalc::updateSSA() {
assert(DomTree && "Missing dominator tree");
 // Iterate until convergence.
- unsigned Changes;
+ bool Changed;
do {
- Changes = 0;
+ Changed = false;
// Propagate live-out values down the dominator tree, inserting phi-defs
// when necessary.
for (LiveInBlock &I : LiveIn) {
@@ -510,15 +498,20 @@ void LiveRangeCalc::updateSSA() {
IDomValue = Map[IDom->getBlock()];
// Cache the DomTree node that defined the value.
- if (IDomValue.first && !IDomValue.second)
+ if (IDomValue.first && IDomValue.first != &UndefVNI &&
+ !IDomValue.second) {
Map[IDom->getBlock()].second = IDomValue.second =
DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def));
+ }
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- LiveOutPair &Value = Map[*PI];
+ for (MachineBasicBlock *Pred : MBB->predecessors()) {
+ LiveOutPair &Value = Map[Pred];
if (!Value.first || Value.first == IDomValue.first)
continue;
+ if (Value.first == &UndefVNI) {
+ needPHI = true;
+ break;
+ }
// Cache the DomTree node that defined the value.
if (!Value.second)
@@ -542,7 +535,7 @@ void LiveRangeCalc::updateSSA() {
// Create a phi-def if required.
if (needPHI) {
- ++Changes;
+ Changed = true;
assert(Alloc && "Need VNInfo allocator to create PHI-defs");
SlotIndex Start, End;
std::tie(Start, End) = Indexes->getMBBRange(MBB);
@@ -561,7 +554,7 @@ void LiveRangeCalc::updateSSA() {
LR.addSegment(LiveInterval::Segment(Start, End, VNI));
LOP = LiveOutPair(VNI, Node);
}
- } else if (IDomValue.first) {
+ } else if (IDomValue.first && IDomValue.first != &UndefVNI) {
// No phi-def here. Remember incoming value.
I.Value = IDomValue.first;
@@ -573,9 +566,9 @@ void LiveRangeCalc::updateSSA() {
// MBB is live-out and doesn't define its own value.
if (LOP.first == IDomValue.first)
continue;
- ++Changes;
+ Changed = true;
LOP = IDomValue;
}
}
- } while (Changes);
+ } while (Changed);
}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
index 1a7598f..d41b782 100644
--- a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
@@ -24,6 +24,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/LiveInterval.h"
@@ -65,7 +66,8 @@ class LiveRangeCalc {
/// registers do not overlap), but the defined/undefined information must
/// be kept separate for each individual range.
/// By convention, EntryInfoMap[&LR] = { Defined, Undefined }.
- std::map<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap;
+ typedef DenseMap<LiveRange*,std::pair<BitVector,BitVector>> EntryInfoMap;
+ EntryInfoMap EntryInfos;
/// Map each basic block where a live range is live out to the live-out value
/// and its defining block.
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 7f1c69c..92cca1a 100644
--- a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -37,6 +37,8 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
LiveInterval &LI = LIS.createEmptyInterval(VReg);
+ if (Parent && !Parent->isSpillable())
+ LI.markNotSpillable();
// Create empty subranges if the OldReg's interval has them. Do not create
// the main range here---it will be constructed later after the subranges
// have been finalized.
@@ -52,6 +54,14 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
if (VRM) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
+ // FIXME: Getting the interval here actually computes it.
+ // In theory, this may not be what we want, but in practice
+ // the createEmptyIntervalFrom API is used when this is not
+ // the case. Generally speaking we just want to annotate the
+ // LiveInterval when it gets created but we cannot do that at
+ // the moment.
+ if (Parent && !Parent->isSpillable())
+ LIS.getInterval(VReg).markNotSpillable();
return VReg;
}
@@ -442,9 +452,6 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)
if (VRM)
VRM->grow();
- if (Parent && !Parent->isSpillable())
- LIS.getInterval(VReg).markNotSpillable();
-
NewRegs.push_back(VReg);
}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp b/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 0000000..552f4b5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,231 @@
+//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass moves instructions close to the definitions of their operands in
+/// order to shrink the live ranges of those defs. The code motion is limited
+/// to within a basic block. The moved instruction should have exactly one
+/// def and more than one use, each of which is the only use of its defining
+/// instruction.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+ "Number of insructions hoisted to shrink live range.");
+
+using namespace llvm;
+
+namespace {
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+ static char ID;
+
+ LiveRangeShrink() : MachineFunctionPass(ID) {
+ initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "Live Range Shrink"; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char LiveRangeShrink::ID = 0;
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+ false)
+namespace {
+typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
+
+/// Returns \p New if it is dominated by \p Old, otherwise returns \p Old.
+/// \p M maintains a map from instruction to its dominating order that satisfies
+/// M[A] > M[B] guarantees that A is dominated by B.
+/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return
+/// \p New.
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
+ const InstOrderMap &M) {
+ auto NewIter = M.find(&New);
+ if (NewIter == M.end())
+ return Old;
+ if (Old == nullptr)
+ return &New;
+ unsigned OrderOld = M.find(Old)->second;
+ unsigned OrderNew = NewIter->second;
+ if (OrderOld != OrderNew)
+ return OrderOld < OrderNew ? &New : Old;
+ // OrderOld == OrderNew, so iterate forward from Old to see whether it
+ // reaches New; if it does, New is dominated by Old.
+ for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+ I = I->getNextNode())
+ if (I == &New)
+ return &New;
+ return Old;
+}
+
+/// Builds the instruction-to-dominating-order-number map \p M by traversing
+/// the block from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
+ M.clear();
+ unsigned i = 0;
+ for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+ M[&I] = i++;
+}
+} // end anonymous namespace
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+ InstOrderMap IOM;
+ // Map from register to instruction order (value of IOM) where the
+ // register is used last. When moving instructions up, we need to
+ // make sure all its defs (including dead def) will not cross its
+ // last use when moving up.
+ DenseMap<unsigned, std::pair<unsigned, MachineInstr *>> UseMap;
+
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+ bool SawStore = false;
+ BuildInstOrderMap(MBB.begin(), IOM);
+ UseMap.clear();
+
+ for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+ MachineInstr &MI = *Next;
+ ++Next;
+ if (MI.isPHI() || MI.isDebugValue())
+ continue;
+ if (MI.mayStore())
+ SawStore = true;
+
+ unsigned CurrentOrder = IOM[&MI];
+ unsigned Barrier = 0;
+ MachineInstr *BarrierMI = nullptr;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDebug())
+ continue;
+ if (MO.isUse())
+ UseMap[MO.getReg()] = std::make_pair(CurrentOrder, &MI);
+ else if (MO.isDead() && UseMap.count(MO.getReg()))
+ // Barrier is the last instruction where MO gets used. MI should not
+ // be moved above Barrier.
+ if (Barrier < UseMap[MO.getReg()].first) {
+ Barrier = UseMap[MO.getReg()].first;
+ BarrierMI = UseMap[MO.getReg()].second;
+ }
+ }
+
+ if (!MI.isSafeToMove(nullptr, SawStore)) {
+ // If MI has side effects, it should become a barrier for code motion.
+ // IOM is rebuilt from the next instruction to prevent later
+ // instructions from being moved before this MI.
+ if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+ BuildInstOrderMap(Next, IOM);
+ SawStore = false;
+ }
+ continue;
+ }
+
+ const MachineOperand *DefMO = nullptr;
+ MachineInstr *Insert = nullptr;
+
+ // Number of live-ranges that will be shortened. We do not count
+ // live-ranges that are defined by a COPY as it could be coalesced later.
+ unsigned NumEligibleUse = 0;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDead() || MO.isDebug())
+ continue;
+ unsigned Reg = MO.getReg();
+ // Do not move the instruction if it def/uses a physical register,
+ // unless it is a constant physical register or a noreg.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!Reg || MRI.isConstantPhysReg(Reg))
+ continue;
+ Insert = nullptr;
+ break;
+ }
+ if (MO.isDef()) {
+ // Do not move if there is more than one def.
+ if (DefMO) {
+ Insert = nullptr;
+ break;
+ }
+ DefMO = &MO;
+ } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg) && DefMO &&
+ MRI.getRegClass(DefMO->getReg()) ==
+ MRI.getRegClass(MO.getReg())) {
+ // The heuristic does not handle different register classes yet
+ // (registers of different sizes, looser/tighter constraints). This
+ // is because it needs a more accurate model to handle register
+ // pressure correctly.
+ MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
+ if (!DefInstr.isCopy())
+ NumEligibleUse++;
+ Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
+ } else {
+ Insert = nullptr;
+ break;
+ }
+ }
+
+ // If Barrier equals IOM[I], traverse forward to check whether BarrierMI
+ // comes after Insert; if it does, we should not hoist.
+ for (MachineInstr *I = Insert; I && IOM[I] == Barrier;
+ I = I->getNextNode())
+ if (I == BarrierMI) {
+ Insert = nullptr;
+ break;
+ }
+ // Move the instruction only when more than one live range will be shrunk.
+ if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
+ MachineBasicBlock::iterator I = std::next(Insert->getIterator());
+ // Skip all the PHI and debug instructions.
+ while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
+ I = std::next(I);
+ if (I == MI.getIterator())
+ continue;
+
+ // Update the dominator order to be the same as the insertion point.
+ // We do this to maintain a non-decreasing order without need to update
+ // all instruction orders after the insertion point.
+ unsigned NewOrder = IOM[&*I];
+ IOM[&MI] = NewOrder;
+ NumInstrsHoistedToShrinkLiveRange++;
+
+ // Find MI's debug value following MI.
+ MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
+ if (MI.getOperand(0).isReg())
+ for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
+ EndIter->getOperand(0).isReg() &&
+ EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
+ ++EndIter, ++Next)
+ IOM[&*EndIter] = NewOrder;
+ MBB.splice(I, &MBB, MI.getIterator(), EndIter);
+ }
+ }
+ }
+ return false;
+}
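The pass is registered under llvm::LiveRangeShrinkID, so a target enables it by scheduling it in its pipeline. A hedged sketch of a hypothetical target hook (MyTargetPassConfig is illustrative; addPass over a pass ID is existing TargetPassConfig API):

    // Run live-range shrinking just before register allocation.
    void MyTargetPassConfig::addPreRegAlloc() {
      addPass(&LiveRangeShrinkID);
    }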
diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
index 7a51386..60033db 100644
--- a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -1,4 +1,4 @@
-//===-- LiveRegMatrix.cpp - Track register interference -------------------===//
+//===- LiveRegMatrix.cpp - Track register interference --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,12 +14,19 @@
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "RegisterCoalescer.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
using namespace llvm;
@@ -36,8 +43,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
"Live Register Matrix", false, false)
-LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID),
- UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {}
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {}
void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
@@ -169,10 +175,10 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
return Result;
}
-LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
+LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,
unsigned RegUnit) {
LiveIntervalUnion::Query &Q = Queries[RegUnit];
- Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+ Q.init(UserTag, LR, Matrix[RegUnit]);
return Q;
}
@@ -190,9 +196,12 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
return IK_RegUnit;
// Check the matrix for virtual register interference.
- for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
- if (query(VirtReg, *Units).checkInterference())
- return IK_VirtReg;
+ bool Interference = foreachUnit(TRI, VirtReg, PhysReg,
+ [&](unsigned Unit, const LiveRange &LR) {
+ return query(LR, Unit).checkInterference();
+ });
+ if (Interference)
+ return IK_VirtReg;
return IK_Free;
}
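foreachUnit here is a file-local helper in LiveRegMatrix.cpp that pairs each register unit of PhysReg with either the main live range or, when subregister liveness is tracked, the subrange matching that unit's lane mask; a callback returning true stops the walk. A sketch of a hypothetical extra member using it the same way (countInterferingUnits does not exist in the tree):

    unsigned LiveRegMatrix::countInterferingUnits(LiveInterval &VirtReg,
                                                  unsigned PhysReg) {
      unsigned N = 0;
      foreachUnit(TRI, VirtReg, PhysReg,
                  [&](unsigned Unit, const LiveRange &LR) {
        if (query(LR, Unit).checkInterference())
          ++N;
        return false; // keep visiting the remaining units
      });
      return N;
    }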
diff --git a/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 0000000..f9ba4ff
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,132 @@
+//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the LiveRegUnits set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+ Units.reset(U);
+ }
+ }
+}
+
+void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) {
+ for (unsigned U = 0, E = TRI->getNumRegUnits(); U != E; ++U) {
+ for (MCRegUnitRootIterator RootReg(U, TRI); RootReg.isValid(); ++RootReg) {
+ if (MachineOperand::clobbersPhysReg(RegMask, *RootReg))
+ Units.set(U);
+ }
+ }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI) {
+ // Remove defined registers and regmask kills from the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ if (!O->isDef())
+ continue;
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ removeReg(Reg);
+ } else if (O->isRegMask())
+ removeRegsNotPreserved(O->getRegMask());
+ }
+
+ // Add uses to the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->readsReg())
+ continue;
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ addReg(Reg);
+ }
+}
+
+void LiveRegUnits::accumulate(const MachineInstr &MI) {
+ // Add defs, uses and regmask clobbers to the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ unsigned Reg = O->getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (!O->isDef() && !O->readsReg())
+ continue;
+ addReg(Reg);
+ } else if (O->isRegMask())
+ addRegsInMask(O->getRegMask());
+ }
+}
+
+/// Add live-in registers of basic block \p MBB to \p LiveUnits.
+static void addBlockLiveIns(LiveRegUnits &LiveUnits,
+ const MachineBasicBlock &MBB) {
+ for (const auto &LI : MBB.liveins())
+ LiveUnits.addRegMasked(LI.PhysReg, LI.LaneMask);
+}
+
+/// Adds all callee saved registers to \p LiveUnits.
+static void addCalleeSavedRegs(LiveRegUnits &LiveUnits,
+ const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
+ LiveUnits.addReg(*CSR);
+}
+
+/// Adds pristine registers to the given \p LiveUnits. Pristine registers are
+/// callee saved registers that are unused in the function.
+static void addPristines(LiveRegUnits &LiveUnits, const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (!MFI.isCalleeSavedInfoValid())
+ return;
+ // Add all callee saved regs, then remove the ones that are saved+restored.
+ addCalleeSavedRegs(LiveUnits, MF);
+ // Remove the ones that are saved and restored; what remains is pristine.
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ LiveUnits.removeReg(Info.getReg());
+}
+
+void LiveRegUnits::addLiveOuts(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ if (!MBB.succ_empty()) {
+ addPristines(*this, MF);
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ addBlockLiveIns(*this, *Succ);
+ } else if (MBB.isReturnBlock()) {
+ // For the return block: Add all callee saved registers.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid())
+ addCalleeSavedRegs(*this, MF);
+ }
+}
+
+void LiveRegUnits::addLiveIns(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ addPristines(*this, MF);
+ addBlockLiveIns(*this, MBB);
+}
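LiveRegUnits mirrors the LivePhysRegs interface but tracks register units, so sub-register aliasing is handled exactly. A short usage sketch, assuming TRI, MBB, and PhysReg come from the surrounding pass:

    // Is PhysReg (including everything aliasing it) free at the end of MBB?
    LiveRegUnits Units(*TRI);
    Units.addLiveOuts(MBB);
    bool IsFree = Units.available(PhysReg);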
diff --git a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp
index dbf1f96..b51f8b0 100644
--- a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp
@@ -25,10 +25,10 @@ using namespace llvm;
#define DEBUG_TYPE "livestacks"
char LiveStacks::ID = 0;
-INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks",
+INITIALIZE_PASS_BEGIN(LiveStacks, DEBUG_TYPE,
"Live Stack Slot Analysis", false, false)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_END(LiveStacks, "livestacks",
+INITIALIZE_PASS_END(LiveStacks, DEBUG_TYPE,
"Live Stack Slot Analysis", false, false)
char &llvm::LiveStacksID = LiveStacks::ID;
diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
index 269b990a31..a9aec92 100644
--- a/contrib/llvm/lib/CodeGen/LiveVariables.cpp
+++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
@@ -64,8 +64,8 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
return nullptr;
}
-LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
dbgs() << " Alive in blocks: ";
for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
E = AliveBlocks.end(); I != E; ++I)
@@ -78,8 +78,8 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
dbgs() << "\n #" << i << ": " << *Kills[i];
dbgs() << "\n";
}
-#endif
}
+#endif
/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) {
@@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
MachineBasicBlock *SuccBB) {
const unsigned NumNew = BB->getNumber();
- SmallSet<unsigned, 16> Defs, Kills;
+ DenseSet<unsigned> Defs, Kills;
MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
for (; BBI != BBE && BBI->isPHI(); ++BBI) {
diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
index e189fb0..b109f19 100644
--- a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -14,7 +14,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
@@ -23,6 +22,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -103,10 +103,10 @@ namespace {
char LocalStackSlotPass::ID = 0;
char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID;
-INITIALIZE_PASS_BEGIN(LocalStackSlotPass, "localstackalloc",
+INITIALIZE_PASS_BEGIN(LocalStackSlotPass, DEBUG_TYPE,
"Local Stack Slot Allocation", false, false)
INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_PASS_END(LocalStackSlotPass, "localstackalloc",
+INITIALIZE_PASS_END(LocalStackSlotPass, DEBUG_TYPE,
"Local Stack Slot Allocation", false, false)
diff --git a/contrib/llvm/lib/CodeGen/LowLevelType.cpp b/contrib/llvm/lib/CodeGen/LowLevelType.cpp
index d74b730..1c682e7 100644
--- a/contrib/llvm/lib/CodeGen/LowLevelType.cpp
+++ b/contrib/llvm/lib/CodeGen/LowLevelType.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/GlobalISel/LowLevelType.cpp --------------------------===//
+//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,54 +18,21 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-LLT::LLT(Type &Ty, const DataLayout &DL) {
+LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
if (auto VTy = dyn_cast<VectorType>(&Ty)) {
- SizeInBits = VTy->getElementType()->getPrimitiveSizeInBits();
- ElementsOrAddrSpace = VTy->getNumElements();
- Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector;
+ auto NumElements = VTy->getNumElements();
+ LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL);
+ if (NumElements == 1)
+ return ScalarTy;
+ return LLT::vector(NumElements, ScalarTy);
} else if (auto PTy = dyn_cast<PointerType>(&Ty)) {
- Kind = Pointer;
- SizeInBits = DL.getTypeSizeInBits(&Ty);
- ElementsOrAddrSpace = PTy->getAddressSpace();
+ return LLT::pointer(PTy->getAddressSpace(), DL.getTypeSizeInBits(&Ty));
} else if (Ty.isSized()) {
// Aggregates are no different from real scalars as far as GlobalISel is
// concerned.
- Kind = Scalar;
- SizeInBits = DL.getTypeSizeInBits(&Ty);
- ElementsOrAddrSpace = 1;
+ auto SizeInBits = DL.getTypeSizeInBits(&Ty);
assert(SizeInBits != 0 && "invalid zero-sized type");
- } else {
- Kind = Invalid;
- SizeInBits = ElementsOrAddrSpace = 0;
+ return LLT::scalar(SizeInBits);
}
-}
-
-LLT::LLT(MVT VT) {
- if (VT.isVector()) {
- SizeInBits = VT.getVectorElementType().getSizeInBits();
- ElementsOrAddrSpace = VT.getVectorNumElements();
- Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector;
- } else if (VT.isValid()) {
- // Aggregates are no different from real scalars as far as GlobalISel is
- // concerned.
- Kind = Scalar;
- SizeInBits = VT.getSizeInBits();
- ElementsOrAddrSpace = 1;
- assert(SizeInBits != 0 && "invalid zero-sized type");
- } else {
- Kind = Invalid;
- SizeInBits = ElementsOrAddrSpace = 0;
- }
-}
-
-void LLT::print(raw_ostream &OS) const {
- if (isVector())
- OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">";
- else if (isPointer())
- OS << "p" << getAddressSpace();
- else if (isValid()) {
- assert(isScalar() && "unexpected type");
- OS << "s" << getScalarSizeInBits();
- } else
- llvm_unreachable("trying to print an invalid type");
+ return LLT();
}
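With the constructors gone, IR types are translated by the free function getLLTForType, keeping LLT a plain value type built through the LLT::scalar/vector/pointer factories. A rough sketch of the mapping (pointer width taken from the DataLayout; wrapper name hypothetical):

    #include "llvm/CodeGen/LowLevelType.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // i32        -> LLT::scalar(32), printed "s32"
    // <4 x i32>  -> LLT::vector(4, s32), printed "<4 x s32>"
    // i8* (AS 0) -> LLT::pointer(0, pointer-size-in-bits), printed "p0"
    // unsized    -> LLT(), the invalid type
    static LLT lowerIRType(Type &Ty, const DataLayout &DL) {
      return getLLTForType(Ty, DL);
    }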
diff --git a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
index 6966c8c..0fc48d4 100644
--- a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
+++ b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
@@ -28,14 +29,12 @@ using namespace llvm;
namespace {
class LowerEmuTLS : public ModulePass {
- const TargetMachine *TM;
public:
static char ID; // Pass identification, replacement for typeid
- explicit LowerEmuTLS() : ModulePass(ID), TM(nullptr) { }
- explicit LowerEmuTLS(const TargetMachine *TM)
- : ModulePass(ID), TM(TM) {
+ LowerEmuTLS() : ModulePass(ID) {
initializeLowerEmuTLSPass(*PassRegistry::getPassRegistry());
}
+
bool runOnModule(Module &M) override;
private:
bool addEmuTlsVar(Module &M, const GlobalVariable *GV);
@@ -54,19 +53,22 @@ private:
char LowerEmuTLS::ID = 0;
-INITIALIZE_PASS(LowerEmuTLS, "loweremutls",
- "Add __emutls_[vt]. variables for emultated TLS model",
- false, false)
+INITIALIZE_PASS(LowerEmuTLS, DEBUG_TYPE,
+ "Add __emutls_[vt]. variables for emultated TLS model", false,
+ false)
-ModulePass *llvm::createLowerEmuTLSPass(const TargetMachine *TM) {
- return new LowerEmuTLS(TM);
-}
+ModulePass *llvm::createLowerEmuTLSPass() { return new LowerEmuTLS(); }
bool LowerEmuTLS::runOnModule(Module &M) {
if (skipModule(M))
return false;
- if (!TM || !TM->Options.EmulatedTLS)
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<TargetMachine>();
+ if (!TM.Options.EmulatedTLS)
return false;
bool Changed = false;
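Instead of threading a TargetMachine pointer through its constructor, the pass now discovers the TM via TargetPassConfig and becomes a no-op when none is registered (e.g. under a plain opt pipeline). A sketch of the idiom for any pass needing target options (pass and worker names hypothetical):

    #include "llvm/CodeGen/TargetPassConfig.h"
    #include "llvm/Target/TargetMachine.h"
    using namespace llvm;

    bool MyModulePass::runOnModule(Module &M) {
      // Present only inside a codegen pipeline; never dereference blindly.
      auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
      if (!TPC)
        return false;
      const TargetMachine &TM = TPC->getTM<TargetMachine>();
      if (!TM.Options.EmulatedTLS)
        return false;
      return lowerAllEmuTlsVars(M); // hypothetical worker
    }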
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 1f1ce6e..58a655a 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -365,6 +365,14 @@ static Cursor maybeLexIRValue(Cursor C, MIToken &Token,
return lexName(C, Token, MIToken::NamedIRValue, Rule.size(), ErrorCallback);
}
+static Cursor maybeLexStringConstant(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '"')
+ return None;
+ return lexName(C, Token, MIToken::StringConstant, /*PrefixLength=*/0,
+ ErrorCallback);
+}
+
static Cursor lexVirtualRegister(Cursor C, MIToken &Token) {
auto Range = C;
C.advance(); // Skip '%'
@@ -630,6 +638,8 @@ StringRef llvm::lexMIToken(StringRef Source, MIToken &Token,
return R.remaining();
if (Cursor R = maybeLexEscapedIRValue(C, Token, ErrorCallback))
return R.remaining();
+ if (Cursor R = maybeLexStringConstant(C, Token, ErrorCallback))
+ return R.remaining();
Token.reset(MIToken::Error, C.remaining());
ErrorCallback(C.location(),
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
index edba749..08b82e5 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -16,8 +16,8 @@
#define LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include <functional>
namespace llvm {
@@ -127,7 +127,8 @@ struct MIToken {
NamedIRValue,
IRValue,
QuotedIRValue, // `<constant value>`
- SubRegisterIndex
+ SubRegisterIndex,
+ StringConstant
};
private:
@@ -168,7 +169,8 @@ public:
bool isMemoryOperandFlag() const {
return Kind == kw_volatile || Kind == kw_non_temporal ||
- Kind == kw_dereferenceable || Kind == kw_invariant;
+ Kind == kw_dereferenceable || Kind == kw_invariant ||
+ Kind == StringConstant;
}
bool is(TokenKind K) const { return Kind == K; }
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index c8bed08..c68d87b 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -11,12 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#include "MIParser.h"
#include "MILexer.h"
+#include "MIParser.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/CodeGen/MIRPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -24,25 +34,57 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <utility>
using namespace llvm;
PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF,
- SourceMgr &SM, const SlotMapping &IRSlots)
- : MF(MF), SM(&SM), IRSlots(IRSlots) {
+ SourceMgr &SM, const SlotMapping &IRSlots,
+ const Name2RegClassMap &Names2RegClasses,
+ const Name2RegBankMap &Names2RegBanks)
+ : MF(MF), SM(&SM), IRSlots(IRSlots), Names2RegClasses(Names2RegClasses),
+ Names2RegBanks(Names2RegBanks) {
}
VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) {
@@ -99,6 +141,8 @@ class MIParser {
StringMap<unsigned> Names2DirectTargetFlags;
/// Maps from direct target flag names to the bitmask target flag values.
StringMap<unsigned> Names2BitmaskTargetFlags;
+ /// Maps from MMO target flag names to MMO target flag values.
+ StringMap<MachineMemOperand::Flags> Names2MMOTargetFlags;
public:
MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
@@ -131,7 +175,8 @@ public:
bool
parseBasicBlockDefinition(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots);
- bool parseBasicBlock(MachineBasicBlock &MBB);
+ bool parseBasicBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock *&AddFallthroughFrom);
bool parseBasicBlockLiveins(MachineBasicBlock &MBB);
bool parseBasicBlockSuccessors(MachineBasicBlock &MBB);
@@ -139,6 +184,7 @@ public:
bool parseVirtualRegister(VRegInfo *&Info);
bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo);
bool parseRegisterFlag(unsigned &Flags);
+ bool parseRegisterClassOrBank(VRegInfo &RegInfo);
bool parseSubRegisterIndex(unsigned &SubReg);
bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx);
bool parseRegisterOperand(MachineOperand &Dest,
@@ -172,6 +218,7 @@ public:
bool parseIntrinsicOperand(MachineOperand &Dest);
bool parsePredicateOperand(MachineOperand &Dest);
bool parseTargetIndexOperand(MachineOperand &Dest);
+ bool parseCustomRegisterMaskOperand(MachineOperand &Dest);
bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);
bool parseMachineOperand(MachineOperand &Dest,
Optional<unsigned> &TiedDefIdx);
@@ -184,6 +231,8 @@ public:
bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);
bool parseMachinePointerInfo(MachinePointerInfo &Dest);
+ bool parseOptionalScope(LLVMContext &Context, SyncScope::ID &SSID);
+ bool parseOptionalAtomicOrdering(AtomicOrdering &Order);
bool parseMachineMemoryOperand(MachineMemOperand *&Dest);
private:
@@ -272,6 +321,18 @@ private:
///
/// Return true if the name isn't a name of a bitmask target flag.
bool getBitmaskTargetFlag(StringRef Name, unsigned &Flag);
+
+ void initNames2MMOTargetFlags();
+
+ /// Try to convert a name of a MachineMemOperand target flag to the
+ /// corresponding target flag.
+ ///
+ /// Return true if the name isn't a name of a target MMO flag.
+ bool getMMOTargetFlag(StringRef Name, MachineMemOperand::Flags &Flag);
+
+ /// parseStringConstant
+ /// ::= StringConstant
+ bool parseStringConstant(std::string &Result);
};
} // end anonymous namespace
@@ -512,7 +573,8 @@ bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) {
return false;
}
-bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) {
+bool MIParser::parseBasicBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock *&AddFallthroughFrom) {
// Skip the definition.
assert(Token.is(MIToken::MachineBasicBlockLabel));
lex();
@@ -532,10 +594,12 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) {
//
// is equivalent to
// liveins: %edi, %esi
+ bool ExplicitSuccessors = false;
while (true) {
if (Token.is(MIToken::kw_successors)) {
if (parseBasicBlockSuccessors(MBB))
return true;
+ ExplicitSuccessors = true;
} else if (Token.is(MIToken::kw_liveins)) {
if (parseBasicBlockLiveins(MBB))
return true;
@@ -551,10 +615,9 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) {
// Parse the instructions.
bool IsInBundle = false;
MachineInstr *PrevMI = nullptr;
- while (true) {
- if (Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof))
- return false;
- else if (consumeIfPresent(MIToken::Newline))
+ while (!Token.is(MIToken::MachineBasicBlockLabel) &&
+ !Token.is(MIToken::Eof)) {
+ if (consumeIfPresent(MIToken::Newline))
continue;
if (consumeIfPresent(MIToken::rbrace)) {
// The first parsing pass should verify that all closing '}' have an
@@ -586,6 +649,22 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) {
assert(Token.isNewlineOrEOF() && "MI is not fully parsed");
lex();
}
+
+ // Construct successor list by searching for basic block machine operands.
+ if (!ExplicitSuccessors) {
+ SmallVector<MachineBasicBlock*,4> Successors;
+ bool IsFallthrough;
+ guessSuccessors(MBB, Successors, IsFallthrough);
+ for (MachineBasicBlock *Succ : Successors)
+ MBB.addSuccessor(Succ);
+
+ if (IsFallthrough) {
+ AddFallthroughFrom = &MBB;
+ } else {
+ MBB.normalizeSuccProbs();
+ }
+ }
+
return false;
}
@@ -599,11 +678,18 @@ bool MIParser::parseBasicBlocks() {
// The first parsing pass should have verified that this token is a MBB label
// in the 'parseBasicBlockDefinitions' method.
assert(Token.is(MIToken::MachineBasicBlockLabel));
+ MachineBasicBlock *AddFallthroughFrom = nullptr;
do {
MachineBasicBlock *MBB = nullptr;
if (parseMBBReference(MBB))
return true;
- if (parseBasicBlock(*MBB))
+ if (AddFallthroughFrom) {
+ if (!AddFallthroughFrom->isSuccessor(MBB))
+ AddFallthroughFrom->addSuccessor(MBB);
+ AddFallthroughFrom->normalizeSuccProbs();
+ AddFallthroughFrom = nullptr;
+ }
+ if (parseBasicBlock(*MBB, AddFallthroughFrom))
return true;
// The method 'parseBasicBlock' should parse the whole block until the next
// block or the end of file.
@@ -878,6 +964,66 @@ bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) {
}
}
+bool MIParser::parseRegisterClassOrBank(VRegInfo &RegInfo) {
+ if (Token.isNot(MIToken::Identifier) && Token.isNot(MIToken::underscore))
+ return error("expected '_', register class, or register bank name");
+ StringRef::iterator Loc = Token.location();
+ StringRef Name = Token.stringValue();
+
+ // Was it a register class?
+ auto RCNameI = PFS.Names2RegClasses.find(Name);
+ if (RCNameI != PFS.Names2RegClasses.end()) {
+ lex();
+ const TargetRegisterClass &RC = *RCNameI->getValue();
+
+ switch (RegInfo.Kind) {
+ case VRegInfo::UNKNOWN:
+ case VRegInfo::NORMAL:
+ RegInfo.Kind = VRegInfo::NORMAL;
+ if (RegInfo.Explicit && RegInfo.D.RC != &RC) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ return error(Loc, Twine("conflicting register classes, previously: ") +
+ Twine(TRI.getRegClassName(RegInfo.D.RC)));
+ }
+ RegInfo.D.RC = &RC;
+ RegInfo.Explicit = true;
+ return false;
+
+ case VRegInfo::GENERIC:
+ case VRegInfo::REGBANK:
+ return error(Loc, "register class specification on generic register");
+ }
+ llvm_unreachable("Unexpected register kind");
+ }
+
+ // Should be a register bank or a generic register.
+ const RegisterBank *RegBank = nullptr;
+ if (Name != "_") {
+ auto RBNameI = PFS.Names2RegBanks.find(Name);
+ if (RBNameI == PFS.Names2RegBanks.end())
+ return error(Loc, "expected '_', register class, or register bank name");
+ RegBank = RBNameI->getValue();
+ }
+
+ lex();
+
+ switch (RegInfo.Kind) {
+ case VRegInfo::UNKNOWN:
+ case VRegInfo::GENERIC:
+ case VRegInfo::REGBANK:
+ RegInfo.Kind = RegBank ? VRegInfo::REGBANK : VRegInfo::GENERIC;
+ if (RegInfo.Explicit && RegInfo.D.RegBank != RegBank)
+ return error(Loc, "conflicting generic register banks");
+ RegInfo.D.RegBank = RegBank;
+ RegInfo.Explicit = true;
+ return false;
+
+ case VRegInfo::NORMAL:
+ return error(Loc, "register bank specification on normal register");
+ }
+ llvm_unreachable("Unexpected register kind");
+}
+
bool MIParser::parseRegisterFlag(unsigned &Flags) {
const unsigned OldFlags = Flags;
switch (Token.kind()) {
@@ -1004,6 +1150,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (!TargetRegisterInfo::isVirtualRegister(Reg))
return error("subregister index expects a virtual register");
}
+ if (Token.is(MIToken::colon)) {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return error("register class specification expects a virtual register");
+ lex();
+ if (parseRegisterClassOrBank(*RegInfo))
+ return true;
+ }
MachineRegisterInfo &MRI = MF.getRegInfo();
if ((Flags & RegState::Define) == 0) {
if (consumeIfPresent(MIToken::lparen)) {
@@ -1598,6 +1751,35 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {
return false;
}
+bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) {
+ assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask");
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+
+ uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ while (true) {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ unsigned Reg;
+ if (parseNamedRegister(Reg))
+ return true;
+ lex();
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+ // TODO: Report an error if the same register is used more than once.
+ if (Token.isNot(MIToken::comma))
+ break;
+ lex();
+ }
+
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateRegMask(Mask);
+ return false;
+}
+
bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {
assert(Token.is(MIToken::kw_liveout));
const auto *TRI = MF.getSubtarget().getRegisterInfo();
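The custom mask prints as e.g. CustomRegMask(%reg0,%reg1) (register names are target-specific) and uses the standard LLVM regmask encoding, in which a set bit marks a preserved register. A small sketch of that bit layout, matching the Mask[Reg / 32] update above:

    #include <cstdint>

    // Bit (Reg % 32) of 32-bit word (Reg / 32) is set iff Reg is preserved.
    inline void preserveReg(uint32_t *Mask, unsigned Reg) {
      Mask[Reg / 32] |= 1u << (Reg % 32);
    }
    inline bool isPreserved(const uint32_t *Mask, unsigned Reg) {
      return (Mask[Reg / 32] >> (Reg % 32)) & 1u;
    }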
@@ -1695,8 +1877,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
Dest = MachineOperand::CreateRegMask(RegMask);
lex();
break;
- }
- LLVM_FALLTHROUGH;
+ } else
+ return parseCustomRegisterMaskOperand(Dest);
default:
// FIXME: Parse the MCSymbol machine operand.
return error("expected a machine operand");
@@ -1867,7 +2049,14 @@ bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) {
case MIToken::kw_invariant:
Flags |= MachineMemOperand::MOInvariant;
break;
- // TODO: parse the target specific memory operand flags.
+ case MIToken::StringConstant: {
+ MachineMemOperand::Flags TF;
+ if (getMMOTargetFlag(Token.stringValue(), TF))
+ return error("use of undefined target MMO flag '" + Token.stringValue() +
+ "'");
+ Flags |= TF;
+ break;
+ }
default:
llvm_unreachable("The current token should be a memory operand flag");
}
@@ -1909,7 +2098,7 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) {
// The token was already consumed, so use return here instead of break.
return false;
}
- case MIToken::kw_call_entry: {
+ case MIToken::kw_call_entry:
lex();
switch (Token.kind()) {
case MIToken::GlobalValue:
@@ -1929,7 +2118,6 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) {
"expected a global value or an external symbol after 'call-entry'");
}
break;
- }
default:
llvm_unreachable("The current token should be pseudo source value");
}
@@ -1969,6 +2157,48 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {
return false;
}
+bool MIParser::parseOptionalScope(LLVMContext &Context,
+ SyncScope::ID &SSID) {
+ SSID = SyncScope::System;
+ if (Token.is(MIToken::Identifier) && Token.stringValue() == "syncscope") {
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return error("expected '(' in syncscope");
+
+ std::string SSN;
+ if (parseStringConstant(SSN))
+ return true;
+
+ SSID = Context.getOrInsertSyncScopeID(SSN);
+ if (expectAndConsume(MIToken::rparen))
+ return error("expected ')' in syncscope");
+ }
+
+ return false;
+}
+
+bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) {
+ Order = AtomicOrdering::NotAtomic;
+ if (Token.isNot(MIToken::Identifier))
+ return false;
+
+ Order = StringSwitch<AtomicOrdering>(Token.stringValue())
+ .Case("unordered", AtomicOrdering::Unordered)
+ .Case("monotonic", AtomicOrdering::Monotonic)
+ .Case("acquire", AtomicOrdering::Acquire)
+ .Case("release", AtomicOrdering::Release)
+ .Case("acq_rel", AtomicOrdering::AcquireRelease)
+ .Case("seq_cst", AtomicOrdering::SequentiallyConsistent)
+ .Default(AtomicOrdering::NotAtomic);
+
+ if (Order != AtomicOrdering::NotAtomic) {
+ lex();
+ return false;
+ }
+
+ return error("expected an atomic scope, ordering or a size integer literal");
+}
+
bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
if (expectAndConsume(MIToken::lparen))
return true;
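These helpers extend the memory-operand grammar with an optional syncscope("...") clause and up to two atomic orderings before the size literal, e.g. :: (load store syncscope("agent") acquire monotonic 4 ...) with a hypothetical scope name. A sketch of the MachineMemOperand such text round-trips to, assuming an existing MachineFunction MF and MachinePointerInfo PtrInfo:

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad | MachineMemOperand::MOStore,
        /*Size=*/4, /*BaseAlignment=*/4, AAMDNodes(), /*Ranges=*/nullptr,
        MF.getFunction()->getContext().getOrInsertSyncScopeID("agent"),
        AtomicOrdering::Acquire,
        /*FailureOrdering=*/AtomicOrdering::Monotonic);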
@@ -1986,6 +2216,19 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
Flags |= MachineMemOperand::MOStore;
lex();
+ // Optional synchronization scope.
+ SyncScope::ID SSID;
+ if (parseOptionalScope(MF.getFunction()->getContext(), SSID))
+ return true;
+
+ // Up to two atomic orderings (cmpxchg provides guarantees on failure).
+ AtomicOrdering Order, FailureOrder;
+ if (parseOptionalAtomicOrdering(Order))
+ return true;
+
+ if (parseOptionalAtomicOrdering(FailureOrder))
+ return true;
+
if (Token.isNot(MIToken::IntegerLiteral))
return error("expected the size integer literal after memory operation");
uint64_t Size;
@@ -2040,8 +2283,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
}
if (expectAndConsume(MIToken::rparen))
return true;
- Dest =
- MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range);
+ Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range,
+ SSID, Order, FailureOrder);
return false;
}
@@ -2254,6 +2497,35 @@ bool MIParser::getBitmaskTargetFlag(StringRef Name, unsigned &Flag) {
return false;
}
+void MIParser::initNames2MMOTargetFlags() {
+ if (!Names2MMOTargetFlags.empty())
+ return;
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Flags = TII->getSerializableMachineMemOperandTargetFlags();
+ for (const auto &I : Flags)
+ Names2MMOTargetFlags.insert(
+ std::make_pair(StringRef(I.second), I.first));
+}
+
+bool MIParser::getMMOTargetFlag(StringRef Name,
+ MachineMemOperand::Flags &Flag) {
+ initNames2MMOTargetFlags();
+ auto FlagInfo = Names2MMOTargetFlags.find(Name);
+ if (FlagInfo == Names2MMOTargetFlags.end())
+ return true;
+ Flag = FlagInfo->second;
+ return false;
+}
+
+bool MIParser::parseStringConstant(std::string &Result) {
+ if (Token.isNot(MIToken::StringConstant))
+ return error("expected string constant");
+ Result = Token.stringValue();
+ lex();
+ return false;
+}
+
bool llvm::parseMachineBasicBlockDefinitions(PerFunctionMIParsingState &PFS,
StringRef Src,
SMDiagnostic &Error) {
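The quoted MMO flag names recognized above are supplied by the target: getSerializableMachineMemOperandTargetFlags returns (flag, name) pairs, and unknown names produce the "undefined target MMO flag" error. A hedged sketch of how a backend could expose one (class and flag names hypothetical; targets map them onto MOTargetFlag1..3):

    using namespace llvm;

    ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
    MyInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
      // The string is what MIR prints and parses, in double quotes.
      static const std::pair<MachineMemOperand::Flags, const char *> Flags[] =
          {{MachineMemOperand::MOTargetFlag1, "mytarget-hint"}};
      return makeArrayRef(Flags);
    }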
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
index 93a4d84..2307881 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
@@ -1,4 +1,4 @@
-//===- MIParser.h - Machine Instructions Parser ---------------------------===//
+//===- MIParser.h - Machine Instructions Parser -----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,21 +15,19 @@
#define LLVM_LIB_CODEGEN_MIRPARSER_MIPARSER_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Support/Allocator.h"
namespace llvm {
-class StringRef;
-class BasicBlock;
class MachineBasicBlock;
class MachineFunction;
-class MachineInstr;
-class MachineRegisterInfo;
class MDNode;
class RegisterBank;
struct SlotMapping;
class SMDiagnostic;
class SourceMgr;
+class StringRef;
class TargetRegisterClass;
struct VRegInfo {
@@ -45,11 +43,16 @@ struct VRegInfo {
unsigned PreferredReg = 0;
};
+using Name2RegClassMap = StringMap<const TargetRegisterClass *>;
+using Name2RegBankMap = StringMap<const RegisterBank *>;
+
struct PerFunctionMIParsingState {
BumpPtrAllocator Allocator;
MachineFunction &MF;
SourceMgr *SM;
const SlotMapping &IRSlots;
+ const Name2RegClassMap &Names2RegClasses;
+ const Name2RegBankMap &Names2RegBanks;
DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
DenseMap<unsigned, VRegInfo*> VRegInfos;
@@ -59,7 +62,9 @@ struct PerFunctionMIParsingState {
DenseMap<unsigned, unsigned> JumpTableSlots;
PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM,
- const SlotMapping &IRSlots);
+ const SlotMapping &IRSlots,
+ const Name2RegClassMap &Names2RegClasses,
+ const Name2RegBankMap &Names2RegBanks);
VRegInfo &getVRegInfo(unsigned VReg);
};
@@ -115,4 +120,4 @@ bool parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src,
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_CODEGEN_MIRPARSER_MIPARSER_H
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 3dff114..78b57f3 100644
--- a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -50,18 +50,24 @@ namespace llvm {
/// file.
class MIRParserImpl {
SourceMgr SM;
+ yaml::Input In;
StringRef Filename;
LLVMContext &Context;
- StringMap<std::unique_ptr<yaml::MachineFunction>> Functions;
SlotMapping IRSlots;
/// Maps from register class names to register classes.
- StringMap<const TargetRegisterClass *> Names2RegClasses;
+ Name2RegClassMap Names2RegClasses;
/// Maps from register bank names to register banks.
- StringMap<const RegisterBank *> Names2RegBanks;
+ Name2RegBankMap Names2RegBanks;
+ /// True when the MIR file doesn't have LLVM IR. Dummy IR functions are
+ /// created and inserted into the given module when this is true.
+ bool NoLLVMIR = false;
+ /// True when a well-formed MIR file does not contain any MIR/machine function
+ /// parts.
+ bool NoMIRDocuments = false;
public:
- MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename,
- LLVMContext &Context);
+ MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents,
+ StringRef Filename, LLVMContext &Context);
void reportDiagnostic(const SMDiagnostic &Diag);
@@ -85,22 +91,22 @@ public:
/// file.
///
/// Return null if an error occurred.
- std::unique_ptr<Module> parse();
+ std::unique_ptr<Module> parseIRModule();
+
+ bool parseMachineFunctions(Module &M, MachineModuleInfo &MMI);
/// Parse the machine function in the current YAML document.
///
- /// \param NoLLVMIR - set to true when the MIR file doesn't have LLVM IR.
- /// A dummy IR function is created and inserted into the given module when
- /// this parameter is true.
///
/// Return true if an error occurred.
- bool parseMachineFunction(yaml::Input &In, Module &M, bool NoLLVMIR);
+ bool parseMachineFunction(Module &M, MachineModuleInfo &MMI);
/// Initialize the machine function to the state that's described in the MIR
/// file.
///
/// Return true if error occurred.
- bool initializeMachineFunction(MachineFunction &MF);
+ bool initializeMachineFunction(const yaml::MachineFunction &YamlMF,
+ MachineFunction &MF);
bool parseRegisterInfo(PerFunctionMIParsingState &PFS,
const yaml::MachineFunction &YamlMF);
@@ -144,9 +150,6 @@ private:
SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error,
SMRange SourceRange);
- /// Create an empty function with the given name.
- void createDummyFunction(StringRef Name, Module &M);
-
void initNames2RegClasses(const MachineFunction &MF);
void initNames2RegBanks(const MachineFunction &MF);
@@ -166,10 +169,19 @@ private:
} // end namespace llvm
+static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) {
+ reinterpret_cast<MIRParserImpl *>(Context)->reportDiagnostic(Diag);
+}
+
MIRParserImpl::MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents,
StringRef Filename, LLVMContext &Context)
- : SM(), Filename(Filename), Context(Context) {
- SM.AddNewSourceBuffer(std::move(Contents), SMLoc());
+ : SM(),
+ In(SM.getMemoryBuffer(
+ SM.AddNewSourceBuffer(std::move(Contents), SMLoc()))->getBuffer(),
+ nullptr, handleYAMLDiag, this),
+ Filename(Filename),
+ Context(Context) {
+ In.setContext(&In);
}
bool MIRParserImpl::error(const Twine &Message) {
@@ -206,24 +218,16 @@ void MIRParserImpl::reportDiagnostic(const SMDiagnostic &Diag) {
Context.diagnose(DiagnosticInfoMIRParser(Kind, Diag));
}
-static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) {
- reinterpret_cast<MIRParserImpl *>(Context)->reportDiagnostic(Diag);
-}
-
-std::unique_ptr<Module> MIRParserImpl::parse() {
- yaml::Input In(SM.getMemoryBuffer(SM.getMainFileID())->getBuffer(),
- /*Ctxt=*/nullptr, handleYAMLDiag, this);
- In.setContext(&In);
-
+std::unique_ptr<Module> MIRParserImpl::parseIRModule() {
if (!In.setCurrentDocument()) {
if (In.error())
return nullptr;
// Create an empty module when the MIR file is empty.
+ NoMIRDocuments = true;
return llvm::make_unique<Module>(Filename, Context);
}
std::unique_ptr<Module> M;
- bool NoLLVMIR = false;
// Parse the block scalar manually so that we can return a unique pointer
// without having to go through YAML traits.
if (const auto *BSN =
@@ -237,49 +241,68 @@ std::unique_ptr<Module> MIRParserImpl::parse() {
}
In.nextDocument();
if (!In.setCurrentDocument())
- return M;
+ NoMIRDocuments = true;
} else {
// Create a new, empty module.
M = llvm::make_unique<Module>(Filename, Context);
NoLLVMIR = true;
}
+ return M;
+}
+
+bool MIRParserImpl::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) {
+ if (NoMIRDocuments)
+ return false;
// Parse the machine functions.
do {
- if (parseMachineFunction(In, *M, NoLLVMIR))
- return nullptr;
+ if (parseMachineFunction(M, MMI))
+ return true;
In.nextDocument();
} while (In.setCurrentDocument());
- return M;
-}
-
-bool MIRParserImpl::parseMachineFunction(yaml::Input &In, Module &M,
- bool NoLLVMIR) {
- auto MF = llvm::make_unique<yaml::MachineFunction>();
- yaml::EmptyContext Ctx;
- yaml::yamlize(In, *MF, false, Ctx);
- if (In.error())
- return true;
- auto FunctionName = MF->Name;
- if (Functions.find(FunctionName) != Functions.end())
- return error(Twine("redefinition of machine function '") + FunctionName +
- "'");
- Functions.insert(std::make_pair(FunctionName, std::move(MF)));
- if (NoLLVMIR)
- createDummyFunction(FunctionName, M);
- else if (!M.getFunction(FunctionName))
- return error(Twine("function '") + FunctionName +
- "' isn't defined in the provided LLVM IR");
return false;
}
-void MIRParserImpl::createDummyFunction(StringRef Name, Module &M) {
+/// Create an empty function with the given name.
+static Function *createDummyFunction(StringRef Name, Module &M) {
auto &Context = M.getContext();
Function *F = cast<Function>(M.getOrInsertFunction(
Name, FunctionType::get(Type::getVoidTy(Context), false)));
BasicBlock *BB = BasicBlock::Create(Context, "entry", F);
new UnreachableInst(Context, BB);
+ return F;
+}
+
+bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) {
+ // Parse the yaml.
+ yaml::MachineFunction YamlMF;
+ yaml::EmptyContext Ctx;
+ yaml::yamlize(In, YamlMF, false, Ctx);
+ if (In.error())
+ return true;
+
+ // Search for the corresponding IR function.
+ StringRef FunctionName = YamlMF.Name;
+ Function *F = M.getFunction(FunctionName);
+ if (!F) {
+ if (NoLLVMIR) {
+ F = createDummyFunction(FunctionName, M);
+ } else {
+ return error(Twine("function '") + FunctionName +
+ "' isn't defined in the provided LLVM IR");
+ }
+ }
+ if (MMI.getMachineFunction(*F) != nullptr)
+ return error(Twine("redefinition of machine function '") + FunctionName +
+ "'");
+
+ // Create the MachineFunction.
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+ if (initializeMachineFunction(YamlMF, MF))
+ return true;
+
+ return false;
}
static bool isSSA(const MachineFunction &MF) {
@@ -319,13 +342,12 @@ void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) {
Properties.set(MachineFunctionProperties::Property::NoVRegs);
}
-bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
- auto It = Functions.find(MF.getName());
- if (It == Functions.end())
- return error(Twine("no machine function information for function '") +
- MF.getName() + "' in the MIR file");
+bool
+MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
+ MachineFunction &MF) {
// TODO: Recreate the machine function.
- const yaml::MachineFunction &YamlMF = *It->getValue();
+ initNames2RegClasses(MF);
+ initNames2RegBanks(MF);
if (YamlMF.Alignment)
MF.setAlignment(YamlMF.Alignment);
MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
@@ -338,7 +360,8 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
if (YamlMF.Selected)
MF.getProperties().set(MachineFunctionProperties::Property::Selected);
- PerFunctionMIParsingState PFS(MF, SM, IRSlots);
+ PerFunctionMIParsingState PFS(MF, SM, IRSlots, Names2RegClasses,
+ Names2RegBanks);
if (parseRegisterInfo(PFS, YamlMF))
return true;
if (!YamlMF.Constants.empty()) {
@@ -362,9 +385,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
}
PFS.SM = &SM;
- if (MF.empty())
- return error(Twine("machine function '") + Twine(MF.getName()) +
- "' requires at least one machine basic block in its body");
// Initialize the frame information after creating all the MBBs so that the
// MBB references in the frame information can be resolved.
if (initializeFrameInfo(PFS, YamlMF))
@@ -462,17 +482,19 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
RegInfo.addLiveIn(Reg, VReg);
}
- // Parse the callee saved register mask.
- BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size());
- if (!YamlMF.CalleeSavedRegisters)
- return false;
- for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
- unsigned Reg = 0;
- if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
- return error(Error, RegSource.SourceRange);
- CalleeSavedRegisterMask[Reg] = true;
+ // Parse the callee saved registers (registers that the
+ // callee must preserve for its caller).
+ if (YamlMF.CalleeSavedRegisters) {
+ SmallVector<MCPhysReg, 16> CalleeSavedRegisters;
+ for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
+ unsigned Reg = 0;
+ if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
+ return error(Error, RegSource.SourceRange);
+ CalleeSavedRegisters.push_back(Reg);
+ }
+ RegInfo.setCalleeSavedRegs(CalleeSavedRegisters);
}
- RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip());
+
return false;
}
@@ -505,14 +527,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
}
// Compute MachineRegisterInfo::UsedPhysRegMask
- if (!YamlMF.CalleeSavedRegisters) {
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isRegMask())
- continue;
- MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
- }
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
}
}
}
@@ -539,7 +559,8 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
MFI.ensureMaxAlignment(YamlMFI.MaxAlignment);
MFI.setAdjustsStack(YamlMFI.AdjustsStack);
MFI.setHasCalls(YamlMFI.HasCalls);
- MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize);
+ if (YamlMFI.MaxCallFrameSize != ~0u)
+ MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize);
MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment);
MFI.setHasVAStart(YamlMFI.HasVAStart);
MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc);
@@ -818,7 +839,6 @@ void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) {
const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
StringRef Name) {
- initNames2RegClasses(MF);
auto RegClassInfo = Names2RegClasses.find(Name);
if (RegClassInfo == Names2RegClasses.end())
return nullptr;
@@ -827,7 +847,6 @@ const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF,
StringRef Name) {
- initNames2RegBanks(MF);
auto RegBankInfo = Names2RegBanks.find(Name);
if (RegBankInfo == Names2RegBanks.end())
return nullptr;
@@ -839,16 +858,18 @@ MIRParser::MIRParser(std::unique_ptr<MIRParserImpl> Impl)
MIRParser::~MIRParser() {}
-std::unique_ptr<Module> MIRParser::parseLLVMModule() { return Impl->parse(); }
+std::unique_ptr<Module> MIRParser::parseIRModule() {
+ return Impl->parseIRModule();
+}
-bool MIRParser::initializeMachineFunction(MachineFunction &MF) {
- return Impl->initializeMachineFunction(MF);
+bool MIRParser::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) {
+ return Impl->parseMachineFunctions(M, MMI);
}
std::unique_ptr<MIRParser> llvm::createMIRParserFromFile(StringRef Filename,
SMDiagnostic &Error,
LLVMContext &Context) {
- auto FileOrErr = MemoryBuffer::getFile(Filename);
+ auto FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = FileOrErr.getError()) {
Error = SMDiagnostic(Filename, SourceMgr::DK_Error,
"Could not open input file: " + EC.message());
diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
index db87092..ddeacf1 100644
--- a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -12,36 +12,72 @@
//
//===----------------------------------------------------------------------===//
-#include "MIRPrinter.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
-#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MIRPrinter.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/YAMLTraits.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
+static cl::opt<bool> SimplifyMIR("simplify-mir",
+ cl::desc("Leave out unnecessary information when printing MIR"));
+
namespace {
/// This structure describes how to print out stack object references.
@@ -104,6 +140,11 @@ class MIPrinter {
ModuleSlotTracker &MST;
const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds;
const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping;
+ /// Synchronization scope names registered with LLVMContext.
+ SmallVector<StringRef, 8> SSNs;
+
+ bool canPredictBranchProbabilities(const MachineBasicBlock &MBB) const;
+ bool canPredictSuccessors(const MachineBasicBlock &MBB) const;
public:
MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST,
@@ -124,7 +165,9 @@ public:
void print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
unsigned I, bool ShouldPrintRegisterTies,
LLT TypeToPrint, bool IsDef = false);
- void print(const MachineMemOperand &Op);
+ void print(const LLVMContext &Context, const TargetInstrInfo &TII,
+ const MachineMemOperand &Op);
+ void printSyncScope(const LLVMContext &Context, SyncScope::ID SSID);
void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI);
};
@@ -139,6 +182,7 @@ template <> struct BlockScalarTraits<Module> {
static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
Mod.print(OS, nullptr);
}
+
static StringRef input(StringRef Str, void *Ctxt, Module &Mod) {
llvm_unreachable("LLVM Module is supposed to be parsed separately");
return "";
@@ -202,9 +246,30 @@ void MIRPrinter::print(const MachineFunction &MF) {
}
StrOS.flush();
yaml::Output Out(OS);
+ if (!SimplifyMIR)
+ Out.setWriteDefaultValues(true);
Out << YamlMF;
}
+static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ assert(RegMask && "Can't print an empty register mask");
+ OS << StringRef("CustomRegMask(");
+
+ bool IsRegInRegMaskFound = false;
+ for (int I = 0, E = TRI->getNumRegs(); I < E; I++) {
+ // Check whether the register is asserted in regmask.
+ if (RegMask[I / 32] & (1u << (I % 32))) {
+ if (IsRegInRegMaskFound)
+ OS << ',';
+ printReg(I, OS, TRI);
+ IsRegInRegMaskFound = true;
+ }
+ }
+
+ OS << ')';
+}
+
void MIRPrinter::convert(yaml::MachineFunction &MF,
const MachineRegisterInfo &RegInfo,
const TargetRegisterInfo *TRI) {
@@ -239,20 +304,18 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
printReg(I->second, LiveIn.VirtualRegister, TRI);
MF.LiveIns.push_back(LiveIn);
}
- // The used physical register mask is printed as an inverted callee saved
- // register mask.
- const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask();
- if (UsedPhysRegMask.none())
- return;
- std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
- for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) {
- if (!UsedPhysRegMask[I]) {
+
+ // Print the callee saved registers.
+ if (RegInfo.isUpdatedCSRsInitialized()) {
+ const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs();
+ std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
+ for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) {
yaml::FlowStringValue Reg;
- printReg(I, Reg, TRI);
+ printReg(*I, Reg, TRI);
CalleeSavedRegisters.push_back(Reg);
}
+ MF.CalleeSavedRegisters = CalleeSavedRegisters;
}
- MF.CalleeSavedRegisters = CalleeSavedRegisters;
}
void MIRPrinter::convert(ModuleSlotTracker &MST,
@@ -267,7 +330,8 @@ void MIRPrinter::convert(ModuleSlotTracker &MST,
YamlMFI.MaxAlignment = MFI.getMaxAlignment();
YamlMFI.AdjustsStack = MFI.adjustsStack();
YamlMFI.HasCalls = MFI.hasCalls();
- YamlMFI.MaxCallFrameSize = MFI.getMaxCallFrameSize();
+ YamlMFI.MaxCallFrameSize = MFI.isMaxCallFrameSizeComputed()
+ ? MFI.getMaxCallFrameSize() : ~0u;
YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment();
YamlMFI.HasVAStart = MFI.hasVAStart();
YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc();
@@ -434,6 +498,62 @@ void MIRPrinter::initRegisterMaskIds(const MachineFunction &MF) {
RegisterMaskIds.insert(std::make_pair(Mask, I++));
}
+void llvm::guessSuccessors(const MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineBasicBlock*> &Result,
+ bool &IsFallthrough) {
+ SmallPtrSet<MachineBasicBlock*,8> Seen;
+
+ for (const MachineInstr &MI : MBB) {
+ if (MI.isPHI())
+ continue;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isMBB())
+ continue;
+ MachineBasicBlock *Succ = MO.getMBB();
+ auto RP = Seen.insert(Succ);
+ if (RP.second)
+ Result.push_back(Succ);
+ }
+ }
+ MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
+ IsFallthrough = I == MBB.end() || !I->isBarrier();
+}
+
+bool
+MIPrinter::canPredictBranchProbabilities(const MachineBasicBlock &MBB) const {
+ if (MBB.succ_size() <= 1)
+ return true;
+ if (!MBB.hasSuccessorProbabilities())
+ return true;
+
+ SmallVector<BranchProbability,8> Normalized(MBB.Probs.begin(),
+ MBB.Probs.end());
+ BranchProbability::normalizeProbabilities(Normalized.begin(),
+ Normalized.end());
+ SmallVector<BranchProbability,8> Equal(Normalized.size());
+ BranchProbability::normalizeProbabilities(Equal.begin(), Equal.end());
+
+ return std::equal(Normalized.begin(), Normalized.end(), Equal.begin());
+}
+
+bool MIPrinter::canPredictSuccessors(const MachineBasicBlock &MBB) const {
+ SmallVector<MachineBasicBlock*,8> GuessedSuccs;
+ bool GuessedFallthrough;
+ guessSuccessors(MBB, GuessedSuccs, GuessedFallthrough);
+ if (GuessedFallthrough) {
+ const MachineFunction &MF = *MBB.getParent();
+ MachineFunction::const_iterator NextI = std::next(MBB.getIterator());
+ if (NextI != MF.end()) {
+ MachineBasicBlock *Next = const_cast<MachineBasicBlock*>(&*NextI);
+ if (!is_contained(GuessedSuccs, Next))
+ GuessedSuccs.push_back(Next);
+ }
+ }
+ if (GuessedSuccs.size() != MBB.succ_size())
+ return false;
+ return std::equal(MBB.succ_begin(), MBB.succ_end(), GuessedSuccs.begin());
+}
+
void MIPrinter::print(const MachineBasicBlock &MBB) {
assert(MBB.getNumber() >= 0 && "Invalid MBB number");
OS << "bb." << MBB.getNumber();
@@ -472,13 +592,15 @@ void MIPrinter::print(const MachineBasicBlock &MBB) {
bool HasLineAttributes = false;
// Print the successors
- if (!MBB.succ_empty()) {
+ bool canPredictProbs = canPredictBranchProbabilities(MBB);
+ if (!MBB.succ_empty() && (!SimplifyMIR || !canPredictProbs ||
+ !canPredictSuccessors(MBB))) {
OS.indent(2) << "successors: ";
for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) {
if (I != MBB.succ_begin())
OS << ", ";
printMBBReference(**I);
- if (MBB.hasSuccessorProbabilities())
+ if (!SimplifyMIR || !canPredictProbs)
OS << '('
<< format("0x%08" PRIx32, MBB.getSuccProbability(I).getNumerator())
<< ')';
@@ -614,11 +736,12 @@ void MIPrinter::print(const MachineInstr &MI) {
if (!MI.memoperands_empty()) {
OS << " :: ";
+ const LLVMContext &Context = MF->getFunction()->getContext();
bool NeedComma = false;
for (const auto *Op : MI.memoperands()) {
if (NeedComma)
OS << ", ";
- print(*Op);
+ print(Context, *TII, *Op);
NeedComma = true;
}
}
@@ -823,7 +946,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
OS << "%const." << Op.getIndex();
printOffset(Op.getOffset());
break;
- case MachineOperand::MO_TargetIndex: {
+ case MachineOperand::MO_TargetIndex:
OS << "target-index(";
if (const auto *Name = getTargetIndexName(
*Op.getParent()->getParent()->getParent(), Op.getIndex()))
@@ -833,15 +956,20 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
OS << ')';
printOffset(Op.getOffset());
break;
- }
case MachineOperand::MO_JumpTableIndex:
OS << "%jump-table." << Op.getIndex();
break;
- case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_ExternalSymbol: {
+ StringRef Name = Op.getSymbolName();
OS << '$';
- printLLVMNameWithoutPrefix(OS, Op.getSymbolName());
+ if (Name.empty()) {
+ OS << "\"\"";
+ } else {
+ printLLVMNameWithoutPrefix(OS, Name);
+ }
printOffset(Op.getOffset());
break;
+ }
case MachineOperand::MO_GlobalAddress:
Op.getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST);
printOffset(Op.getOffset());
@@ -860,7 +988,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
if (RegMaskInfo != RegisterMaskIds.end())
OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower();
else
- llvm_unreachable("Can't print this machine register mask yet.");
+ printCustomRegMask(Op.getRegMask(), OS, TRI);
break;
}
case MachineOperand::MO_RegisterLiveOut: {
@@ -909,9 +1037,20 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
}
}
-void MIPrinter::print(const MachineMemOperand &Op) {
+static const char *getTargetMMOFlagName(const TargetInstrInfo &TII,
+ unsigned TMMOFlag) {
+ auto Flags = TII.getSerializableMachineMemOperandTargetFlags();
+ for (const auto &I : Flags) {
+ if (I.first == TMMOFlag) {
+ return I.second;
+ }
+ }
+ return nullptr;
+}
+
+void MIPrinter::print(const LLVMContext &Context, const TargetInstrInfo &TII,
+ const MachineMemOperand &Op) {
OS << '(';
- // TODO: Print operand's target specific flags.
if (Op.isVolatile())
OS << "volatile ";
if (Op.isNonTemporal())
@@ -920,12 +1059,29 @@ void MIPrinter::print(const MachineMemOperand &Op) {
OS << "dereferenceable ";
if (Op.isInvariant())
OS << "invariant ";
+ if (Op.getFlags() & MachineMemOperand::MOTargetFlag1)
+ OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag1)
+ << "\" ";
+ if (Op.getFlags() & MachineMemOperand::MOTargetFlag2)
+ OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag2)
+ << "\" ";
+ if (Op.getFlags() & MachineMemOperand::MOTargetFlag3)
+ OS << '"' << getTargetMMOFlagName(TII, MachineMemOperand::MOTargetFlag3)
+ << "\" ";
if (Op.isLoad())
OS << "load ";
else {
assert(Op.isStore() && "Non load machine operand must be a store");
OS << "store ";
}
+
+ printSyncScope(Context, Op.getSyncScopeID());
+
+ if (Op.getOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(Op.getOrdering()) << ' ';
+ if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic)
+ OS << toIRString(Op.getFailureOrdering()) << ' ';
+
OS << Op.getSize();
if (const Value *Val = Op.getValue()) {
OS << (Op.isLoad() ? " from " : " into ");
@@ -988,6 +1144,23 @@ void MIPrinter::print(const MachineMemOperand &Op) {
OS << ')';
}
+void MIPrinter::printSyncScope(const LLVMContext &Context, SyncScope::ID SSID) {
+ switch (SSID) {
+ case SyncScope::System: {
+ break;
+ }
+ default: {
+ if (SSNs.empty())
+ Context.getSyncScopeNames(SSNs);
+
+ OS << "syncscope(\"";
+ PrintEscapedString(SSNs[SSID], OS);
+ OS << "\") ";
+ break;
+ }
+ }
+}
+
static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS,
const TargetRegisterInfo *TRI) {
int Reg = TRI->getLLVMRegNum(DwarfReg, true);
diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.h b/contrib/llvm/lib/CodeGen/MIRPrinter.h
deleted file mode 100644
index 16aa903..0000000
--- a/contrib/llvm/lib/CodeGen/MIRPrinter.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===- MIRPrinter.h - MIR serialization format printer --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the functions that print out the LLVM IR and the machine
-// functions using the MIR serialization format.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_MIRPRINTER_H
-#define LLVM_LIB_CODEGEN_MIRPRINTER_H
-
-namespace llvm {
-
-class MachineFunction;
-class Module;
-class raw_ostream;
-
-/// Print LLVM IR using the MIR serialization format to the given output stream.
-void printMIR(raw_ostream &OS, const Module &M);
-
-/// Print a machine function using the MIR serialization format to the given
-/// output stream.
-void printMIR(raw_ostream &OS, const MachineFunction &MF);
-
-} // end namespace llvm
-
-#endif
diff --git a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp
index c690bcf..09354cf 100644
--- a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp
+++ b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp
@@ -12,10 +12,11 @@
//
//===----------------------------------------------------------------------===//
-#include "MIRPrinter.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MIRPrinter.h"
+
#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 3869f97..81597af 100644
--- a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -148,8 +149,11 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
MachineBasicBlock::iterator
MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
iterator E = end();
- while (I != E && (I->isPHI() || I->isPosition()))
+ while (I != E && (I->isPHI() || I->isPosition() ||
+ TII->isBasicBlockPrologue(*I)))
++I;
// FIXME: This needs to change if we wish to bundle labels
// inside the bundle.
@@ -160,8 +164,11 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator
MachineBasicBlock::SkipPHIsLabelsAndDebug(MachineBasicBlock::iterator I) {
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+
iterator E = end();
- while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue()))
+ while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue() ||
+ TII->isBasicBlockPrologue(*I)))
++I;
// FIXME: This needs to change if we wish to bundle labels / dbg_values
// inside the bundle.
@@ -221,11 +228,17 @@ LLVM_DUMP_METHOD void MachineBasicBlock::dump() const {
}
#endif
+bool MachineBasicBlock::isLegalToHoistInto() const {
+ if (isReturnBlock() || hasEHPadSuccessor())
+ return false;
+ return true;
+}
+
StringRef MachineBasicBlock::getName() const {
if (const BasicBlock *LBB = getBasicBlock())
return LBB->getName();
else
- return "(null)";
+ return StringRef("", 0);
}
/// Return a hopefully unique identifier for this block.
@@ -343,6 +356,13 @@ void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) {
LiveIns.erase(I);
}
+MachineBasicBlock::livein_iterator
+MachineBasicBlock::removeLiveIn(MachineBasicBlock::livein_iterator I) {
+ // Get non-const version of iterator.
+ LiveInVector::iterator LI = LiveIns.begin() + (I - LiveIns.begin());
+ return LiveIns.erase(LI);
+}
+
bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const {
livein_iterator I = find_if(
LiveIns, [Reg](const RegisterMaskPair &LI) { return LI.PhysReg == Reg; });
@@ -417,7 +437,7 @@ void MachineBasicBlock::updateTerminator() {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
- DebugLoc DL; // FIXME: this is nowhere
+ DebugLoc DL = findBranchDebugLoc();
bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);
(void) B;
assert(!B && "UpdateTerminators requires analyzable predecessors!");
@@ -485,7 +505,7 @@ void MachineBasicBlock::updateTerminator() {
// FIXME: This does not seem like a reasonable pattern to support, but it
// has been seen in the wild coming out of degenerate ARM test cases.
TII->removeBranch(*this);
-
+
// Finally update the unconditional successor to be reached via a branch if
// it would not be reached by fallthrough.
if (!isLayoutSuccessor(TBB))
@@ -681,16 +701,16 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
return std::next(I) == MachineFunction::const_iterator(MBB);
}
-bool MachineBasicBlock::canFallThrough() {
+MachineBasicBlock *MachineBasicBlock::getFallThrough() {
MachineFunction::iterator Fallthrough = getIterator();
++Fallthrough;
// If FallthroughBlock is off the end of the function, it can't fall through.
if (Fallthrough == getParent()->end())
- return false;
+ return nullptr;
// If FallthroughBlock isn't a successor, no fallthrough is possible.
if (!isSuccessor(&*Fallthrough))
- return false;
+ return nullptr;
// Analyze the branches, if any, at the end of the block.
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
@@ -702,25 +722,31 @@ bool MachineBasicBlock::canFallThrough() {
// is possible. The isPredicated check is needed because this code can be
// called during IfConversion, where an instruction which is normally a
// Barrier is predicated and thus no longer an actual control barrier.
- return empty() || !back().isBarrier() || TII->isPredicated(back());
+ return (empty() || !back().isBarrier() || TII->isPredicated(back()))
+ ? &*Fallthrough
+ : nullptr;
}
// If there is no branch, control always falls through.
- if (!TBB) return true;
+ if (!TBB) return &*Fallthrough;
// If there is some explicit branch to the fallthrough block, it can obviously
// reach, even though the branch should get folded to fall through implicitly.
if (MachineFunction::iterator(TBB) == Fallthrough ||
MachineFunction::iterator(FBB) == Fallthrough)
- return true;
+ return &*Fallthrough;
// If it's an unconditional branch to some block not the fall through, it
// doesn't fall through.
- if (Cond.empty()) return false;
+ if (Cond.empty()) return nullptr;
// Otherwise, if it is conditional and has no explicit false block, it falls
// through.
- return FBB == nullptr;
+ return (FBB == nullptr) ? &*Fallthrough : nullptr;
+}
+
+bool MachineBasicBlock::canFallThrough() {
+ return getFallThrough() != nullptr;
}
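Returning the target block instead of a bool lets callers act on the fallthrough destination directly, while canFallThrough() stays a thin wrapper. A small sketch (MBB is assumed to be a block in a laid-out function):

  if (MachineBasicBlock *FT = MBB.getFallThrough())
    dbgs() << "BB#" << MBB.getNumber() << " falls through to BB#"
           << FT->getNumber() << "\n";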
MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
@@ -1144,6 +1170,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
return {};
}
+/// Find and return the merged DebugLoc of the branch instructions of the block.
+/// Return UnknownLoc if there is none.
+DebugLoc
+MachineBasicBlock::findBranchDebugLoc() {
+ DebugLoc DL;
+ auto TI = getFirstTerminator();
+ while (TI != end() && !TI->isBranch())
+ ++TI;
+
+ if (TI != end()) {
+ DL = TI->getDebugLoc();
+ for (++TI ; TI != end() ; ++TI)
+ if (TI->isBranch())
+ DL = DILocation::getMergedLocation(DL, TI->getDebugLoc());
+ }
+ return DL;
+}
+
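updateTerminator() above now seeds its branch location from findBranchDebugLoc() rather than an empty DebugLoc. The same pattern should work for any pass that rewrites terminators; in this sketch, Target and Cond are assumed to come from an earlier analyzeBranch() call:

  DebugLoc DL = MBB.findBranchDebugLoc(); // UnknownLoc if MBB has no branch
  TII->removeBranch(MBB);
  TII->insertBranch(MBB, Target, /*FBB=*/nullptr, Cond, DL);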
/// Return probability of the edge from this block to MBB.
BranchProbability
MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 7d5124d..4d1ec11 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -26,9 +26,8 @@
using namespace llvm;
-#define DEBUG_TYPE "block-freq"
+#define DEBUG_TYPE "machine-block-freq"
-#ifndef NDEBUG
static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
"view-machine-block-freq-propagation-dags", cl::Hidden,
@@ -43,10 +42,37 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
"integer fractional block frequency representation."),
clEnumValN(GVDT_Count, "count", "display a graph using the real "
"profile count if available.")));
+// Similar to the option above, but used to control BFI display only after the
+// MBP pass.
+cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
+ "view-block-layout-with-bfi", cl::Hidden,
+ cl::desc(
+ "Pop up a window to show a dag displaying MBP layout and associated "
+ "block frequencies of the CFG."),
+ cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction",
+ "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer",
+ "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValN(GVDT_Count, "count",
+ "display a graph using the real "
+ "profile count if available.")));
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
extern cl::opt<std::string> ViewBlockFreqFuncName;
+// Command line option to specify hot frequency threshold.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc=
extern cl::opt<unsigned> ViewHotFreqPercent;
+static GVDAGType getGVDT() {
+ if (ViewBlockLayoutWithBFI != GVDT_None)
+ return ViewBlockLayoutWithBFI;
+
+ return ViewMachineBlockFreqPropagationDAG;
+}
+
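As a usage note: since these are ordinary cl::opt flags, a debug-build invocation along the lines of llc -view-block-layout-with-bfi=count -view-bfi-func-name=foo should display the post-layout graph for foo alone, assuming a Graphviz/gv viewer is available.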
namespace llvm {
template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
@@ -80,12 +106,32 @@ template <>
struct DOTGraphTraits<MachineBlockFrequencyInfo *>
: public MBFIDOTGraphTraitsBase {
explicit DOTGraphTraits(bool isSimple = false)
- : MBFIDOTGraphTraitsBase(isSimple) {}
+ : MBFIDOTGraphTraitsBase(isSimple), CurFunc(nullptr), LayoutOrderMap() {}
+
+ const MachineFunction *CurFunc;
+ DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
std::string getNodeLabel(const MachineBasicBlock *Node,
const MachineBlockFrequencyInfo *Graph) {
- return MBFIDOTGraphTraitsBase::getNodeLabel(
- Node, Graph, ViewMachineBlockFreqPropagationDAG);
+
+ int layout_order = -1;
+ // Attach additional ordering information if 'isSimple' is false.
+ if (!isSimple()) {
+ const MachineFunction *F = Node->getParent();
+ if (!CurFunc || F != CurFunc) {
+ if (CurFunc)
+ LayoutOrderMap.clear();
+
+ CurFunc = F;
+ int O = 0;
+ for (auto MBI = F->begin(); MBI != F->end(); ++MBI, ++O) {
+ LayoutOrderMap[&*MBI] = O;
+ }
+ }
+ layout_order = LayoutOrderMap[Node];
+ }
+ return MBFIDOTGraphTraitsBase::getNodeLabel(Node, Graph, getGVDT(),
+ layout_order);
}
std::string getNodeAttributes(const MachineBasicBlock *Node,
@@ -102,13 +148,12 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
};
} // end namespace llvm
-#endif
-INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
+INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, DEBUG_TYPE,
"Machine Block Frequency Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
+INITIALIZE_PASS_END(MachineBlockFrequencyInfo, DEBUG_TYPE,
"Machine Block Frequency Analysis", true, true)
char MachineBlockFrequencyInfo::ID = 0;
@@ -127,20 +172,24 @@ void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
- MachineBranchProbabilityInfo &MBPI =
- getAnalysis<MachineBranchProbabilityInfo>();
- MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+void MachineBlockFrequencyInfo::calculate(
+ const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI,
+ const MachineLoopInfo &MLI) {
if (!MBFI)
MBFI.reset(new ImplType);
MBFI->calculate(F, MBPI, MLI);
-#ifndef NDEBUG
if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&
(ViewBlockFreqFuncName.empty() ||
F.getName().equals(ViewBlockFreqFuncName))) {
- view();
+ view("MachineBlockFrequencyDAGS." + F.getName());
}
-#endif
+}
+
+bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
+ MachineBranchProbabilityInfo &MBPI =
+ getAnalysis<MachineBranchProbabilityInfo>();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ calculate(F, MBPI, MLI);
return false;
}
@@ -148,15 +197,9 @@ void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); }
/// Pop up a ghostview window with the current block frequency propagation
/// rendered using dot.
-void MachineBlockFrequencyInfo::view() const {
-// This code is only for debugging.
-#ifndef NDEBUG
- ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this),
- "MachineBlockFrequencyDAGs");
-#else
- errs() << "MachineBlockFrequencyInfo::view is only available in debug builds "
- "on systems with Graphviz or gv!\n";
-#endif // NDEBUG
+void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const {
+ // This code is only for debugging.
+ ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple);
}
BlockFrequency
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 40e3840..447ad62 100644
--- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -25,22 +25,23 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
#include "BranchFolding.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -49,6 +50,8 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
+#include <functional>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "block-placement"
@@ -82,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias(
// Definition:
// - Outlining: placement of a basic block outside the chain or hot path.
-static cl::opt<bool> OutlineOptionalBranches(
- "outline-optional-branches",
- cl::desc("Outlining optional branches will place blocks that are optional "
- "branches, i.e. branches with a common post dominator, outside "
- "the hot path or chain"),
- cl::init(false), cl::Hidden);
-
-static cl::opt<unsigned> OutlineOptionalThreshold(
- "outline-optional-threshold",
- cl::desc("Don't outline optional branches that are a single block with an "
- "instruction count below this threshold"),
- cl::init(4), cl::Hidden);
-
static cl::opt<unsigned> LoopToColdBlockRatio(
"loop-to-cold-block-ratio",
cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
@@ -136,20 +126,55 @@ BranchFoldPlacement("branch-fold-placement",
cl::init(true), cl::Hidden);
// Heuristic for tail duplication.
-static cl::opt<unsigned> TailDuplicatePlacementThreshold(
+static cl::opt<unsigned> TailDupPlacementThreshold(
"tail-dup-placement-threshold",
cl::desc("Instruction cutoff for tail duplication during layout. "
"Tail merging during layout is forced to have a threshold "
"that won't conflict."), cl::init(2),
cl::Hidden);
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+ "tail-dup-placement-aggressive-threshold",
+ cl::desc("Instruction cutoff for aggressive tail duplication during "
+ "layout. Used at -O3. Tail merging during layout is forced to "
+ "have a threshold that won't conflict."), cl::init(3),
+ cl::Hidden);
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDupPlacementPenalty(
+ "tail-dup-placement-penalty",
+ cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
+ "Copying can increase fallthrough, but it also increases icache "
+ "pressure. This parameter controls the penalty to account for that. "
+ "Percent as integer."),
+ cl::init(2),
+ cl::Hidden);
+
+// Heuristic for triangle chains.
+static cl::opt<unsigned> TriangleChainCount(
+ "triangle-chain-count",
+    cl::desc("Number of triangle-shaped CFGs that need to be in a row for the "
+ "triangle tail duplication heuristic to kick in. 0 to disable."),
+ cl::init(2),
+ cl::Hidden);
+
extern cl::opt<unsigned> StaticLikelyProb;
extern cl::opt<unsigned> ProfileLikelyProb;
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi=
+extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
namespace {
class BlockChain;
/// \brief Type for our function-wide basic block -> block chain mapping.
-typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType;
}
namespace {
@@ -193,12 +218,15 @@ public:
/// \brief Iterator over blocks within the chain.
typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+ typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator;
/// \brief Beginning of blocks within the chain.
iterator begin() { return Blocks.begin(); }
+ const_iterator begin() const { return Blocks.begin(); }
/// \brief End of blocks within the chain.
iterator end() { return Blocks.end(); }
+ const_iterator end() const { return Blocks.end(); }
bool remove(MachineBasicBlock* BB) {
for(iterator i = begin(); i != end(); ++i) {
@@ -217,25 +245,26 @@ public:
/// updating the block -> chain mapping. It does not free or tear down the
/// old chain, but the old chain's block list is no longer valid.
void merge(MachineBasicBlock *BB, BlockChain *Chain) {
- assert(BB);
- assert(!Blocks.empty());
+ assert(BB && "Can't merge a null block.");
+ assert(!Blocks.empty() && "Can't merge into an empty chain.");
// Fast path in case we don't have a chain already.
if (!Chain) {
- assert(!BlockToChain[BB]);
+ assert(!BlockToChain[BB] &&
+ "Passed chain is null, but BB has entry in BlockToChain.");
Blocks.push_back(BB);
BlockToChain[BB] = this;
return;
}
- assert(BB == *Chain->begin());
+ assert(BB == *Chain->begin() && "Passed BB is not head of Chain.");
assert(Chain->begin() != Chain->end());
// Update the incoming blocks to point to this chain, and add them to the
// chain structure.
for (MachineBasicBlock *ChainBB : *Chain) {
Blocks.push_back(ChainBB);
- assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain");
+ assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain.");
BlockToChain[ChainBB] = this;
}
}
@@ -264,12 +293,28 @@ public:
namespace {
class MachineBlockPlacement : public MachineFunctionPass {
/// \brief A typedef for a block filter set.
- typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet;
+ typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;
+
+  /// Pair struct containing a basic block and its tail-duplication profitability.
+ struct BlockAndTailDupResult {
+ MachineBasicBlock *BB;
+ bool ShouldTailDup;
+ };
+
+  /// Triple struct containing an edge's weight and its source and destination blocks.
+ struct WeightedEdge {
+ BlockFrequency Weight;
+ MachineBasicBlock *Src;
+ MachineBasicBlock *Dest;
+ };
/// \brief work lists of blocks that are ready to be laid out
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
+ /// Edges that have already been computed as optimal.
+ DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
+
/// \brief Machine Function
MachineFunction *F;
@@ -294,7 +339,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
const TargetLoweringBase *TLI;
/// \brief A handle to the post dominator tree.
- MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
/// \brief Duplicator used to duplicate tails during placement.
///
@@ -303,10 +348,6 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// must be done inline.
TailDuplicator TailDup;
- /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
- /// all terminators of the MachineFunction.
- SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
-
/// \brief Allocator and owner of BlockChain structures.
///
/// We build BlockChains lazily while processing the loop structure of
@@ -322,7 +363,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// BlockChain it participates in, if any. We use it to, among other things,
/// allow implicitly defining edges between chains as the existing edges
/// between basic blocks.
- DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
+ DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
#ifndef NDEBUG
/// The set of basic blocks that have terminators that cannot be fully
@@ -334,75 +375,107 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// Decrease the UnscheduledPredecessors count for all blocks in chain, and
/// if the count goes to 0, add them to the appropriate work list.
- void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
- const BlockFilterSet *BlockFilter = nullptr);
+ void markChainSuccessors(
+ const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter = nullptr);
/// Decrease the UnscheduledPredecessors count for a single block, and
/// if the count goes to 0, add them to the appropriate work list.
void markBlockSuccessors(
- BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB,
+ const BlockChain &Chain, const MachineBasicBlock *BB,
+ const MachineBasicBlock *LoopHeaderBB,
const BlockFilterSet *BlockFilter = nullptr);
-
BranchProbability
- collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
- const BlockFilterSet *BlockFilter,
- SmallVector<MachineBasicBlock *, 4> &Successors);
- bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ,
- BlockChain &Chain,
- const BlockFilterSet *BlockFilter,
- BranchProbability SuccProb,
- BranchProbability HotProb);
+ collectViableSuccessors(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors);
+ bool shouldPredBlockBeOutlined(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter,
+ BranchProbability SuccProb, BranchProbability HotProb);
bool repeatedlyTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *&LPred,
- MachineBasicBlock *LoopHeaderBB,
+ const MachineBasicBlock *LoopHeaderBB,
BlockChain &Chain, BlockFilterSet *BlockFilter,
MachineFunction::iterator &PrevUnplacedBlockIt);
- bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
- const BlockChain &Chain,
- BlockFilterSet *BlockFilter,
- MachineFunction::iterator &PrevUnplacedBlockIt,
- bool &DuplicatedToPred);
- bool
- hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
- BlockChain &SuccChain, BranchProbability SuccProb,
- BranchProbability RealSuccProb, BlockChain &Chain,
- const BlockFilterSet *BlockFilter);
- MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
- BlockChain &Chain,
- const BlockFilterSet *BlockFilter);
- MachineBasicBlock *
- selectBestCandidateBlock(BlockChain &Chain,
- SmallVectorImpl<MachineBasicBlock *> &WorkList);
- MachineBasicBlock *
- getFirstUnplacedBlock(const BlockChain &PlacedChain,
- MachineFunction::iterator &PrevUnplacedBlockIt,
- const BlockFilterSet *BlockFilter);
+ bool maybeTailDuplicateBlock(
+ MachineBasicBlock *BB, MachineBasicBlock *LPred,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ bool &DuplicatedToPred);
+ bool hasBetterLayoutPredecessor(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ BlockAndTailDupResult selectBestSuccessor(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ MachineBasicBlock *selectBestCandidateBlock(
+ const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
+ MachineBasicBlock *getFirstUnplacedBlock(
+ const BlockChain &PlacedChain,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ const BlockFilterSet *BlockFilter);
/// \brief Add a basic block to the work list if it is appropriate.
///
/// If the optional parameter BlockFilter is provided, only MBB
/// present in the set will be added to the worklist. If nullptr
/// is provided, no filtering occurs.
- void fillWorkLists(MachineBasicBlock *MBB,
+ void fillWorkLists(const MachineBasicBlock *MBB,
SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
const BlockFilterSet *BlockFilter);
- void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
+ void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
BlockFilterSet *BlockFilter = nullptr);
- MachineBasicBlock *findBestLoopTop(MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
- MachineBasicBlock *findBestLoopExit(MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
- BlockFilterSet collectLoopBlockSet(MachineLoop &L);
- void buildLoopChains(MachineLoop &L);
- void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
- const BlockFilterSet &LoopBlockSet);
- void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
- void collectMustExecuteBBs();
+ MachineBasicBlock *findBestLoopTop(
+ const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopExit(
+ const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+ BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
+ void buildLoopChains(const MachineLoop &L);
+ void rotateLoop(
+ BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
+ const BlockFilterSet &LoopBlockSet);
+ void rotateLoopWithProfile(
+ BlockChain &LoopChain, const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
void buildCFGChains();
void optimizeBranches();
void alignBlocks();
+ /// Returns true if a block should be tail-duplicated to increase fallthrough
+ /// opportunities.
+ bool shouldTailDuplicate(MachineBasicBlock *BB);
+ /// Check the edge frequencies to see if tail duplication will increase
+ /// fallthroughs.
+ bool isProfitableToTailDup(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ BranchProbability AdjustedSumProb,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Check for a trellis layout.
+ bool isTrellis(const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Get the best successor given a trellis layout.
+ BlockAndTailDupResult getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ /// Get the best pair of non-conflicting edges.
+ static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges);
+ /// Returns true if a block can tail duplicate into all unplaced
+ /// predecessors. Filters based on loop.
+ bool canTailDuplicateUnplacedPreds(
+ const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+ /// Find chains of triangles to tail-duplicate where a global analysis works,
+ /// but a local analysis would not find them.
+ void precomputeTriangleChains();
public:
static char ID; // Pass identification, replacement for typeid
@@ -415,7 +488,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<MachineBlockFrequencyInfo>();
- AU.addRequired<MachineDominatorTree>();
+ if (TailDupPlacement)
+ AU.addRequired<MachinePostDominatorTree>();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<TargetPassConfig>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -425,20 +499,20 @@ public:
char MachineBlockPlacement::ID = 0;
char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
-INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement",
+INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE,
"Branch Probability Basic Block Placement", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
+INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE,
"Branch Probability Basic Block Placement", false, false)
#ifndef NDEBUG
/// \brief Helper to print the name of a MBB.
///
/// Only used by debug logging.
-static std::string getBlockName(MachineBasicBlock *BB) {
+static std::string getBlockName(const MachineBasicBlock *BB) {
std::string Result;
raw_string_ostream OS(Result);
OS << "BB#" << BB->getNumber();
@@ -455,7 +529,7 @@ static std::string getBlockName(MachineBasicBlock *BB) {
/// having one fewer active predecessor. It also adds any successors of this
/// chain which reach the zero-predecessor state to the appropriate worklist.
void MachineBlockPlacement::markChainSuccessors(
- BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
+ const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
const BlockFilterSet *BlockFilter) {
// Walk all the blocks in this chain, marking their successors as having
// a predecessor placed.
@@ -471,8 +545,8 @@ void MachineBlockPlacement::markChainSuccessors(
/// and was duplicated into the chain end, we need to redo markBlockSuccessors
/// for just that block.
void MachineBlockPlacement::markBlockSuccessors(
- BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB,
- const BlockFilterSet *BlockFilter) {
+ const BlockChain &Chain, const MachineBasicBlock *MBB,
+ const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
// Add any successors for which this is the only un-placed in-loop
// predecessor to the worklist as a viable candidate for CFG-neutral
// placement. No subsequent placement of this block will violate the CFG
@@ -504,7 +578,8 @@ void MachineBlockPlacement::markBlockSuccessors(
/// the total branch probability of edges from \p BB to those
/// blocks.
BranchProbability MachineBlockPlacement::collectViableSuccessors(
- MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter,
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
SmallVector<MachineBasicBlock *, 4> &Successors) {
// Adjust edge probabilities by excluding edges pointing to blocks that is
// either not in BlockFilter or is already in the current chain. Consider the
@@ -519,8 +594,8 @@ BranchProbability MachineBlockPlacement::collectViableSuccessors(
// Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
// A->C is chosen as a fall-through, D won't be selected as a successor of C
// due to CFG constraint (the probability of C->D is not greater than
- // HotProb to break top-order). If we exclude E that is not in BlockFilter
- // when calculating the probability of C->D, D will be selected and we
+ // HotProb to break topo-order). If we exclude E that is not in BlockFilter
+ // when calculating the probability of C->D, D will be selected and we
// will get A C D B as the layout of this loop.
auto AdjustedSumProb = BranchProbability::getOne();
for (MachineBasicBlock *Succ : BB->successors()) {
@@ -561,46 +636,573 @@ getAdjustedProbability(BranchProbability OrigProb,
return SuccProb;
}
-/// When the option OutlineOptionalBranches is on, this method
-/// checks if the fallthrough candidate block \p Succ (of block
-/// \p BB) also has other unscheduled predecessor blocks which
-/// are also successors of \p BB (forming triangular shape CFG).
-/// If none of such predecessors are small, it returns true.
-/// The caller can choose to select \p Succ as the layout successors
-/// so that \p Succ's predecessors (optional branches) can be
-/// outlined.
-/// FIXME: fold this with more general layout cost analysis.
-bool MachineBlockPlacement::shouldPredBlockBeOutlined(
- MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
- const BlockFilterSet *BlockFilter, BranchProbability SuccProb,
- BranchProbability HotProb) {
- if (!OutlineOptionalBranches)
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool
+hasSameSuccessors(MachineBasicBlock &BB,
+ SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
+ if (BB.succ_size() != Successors.size())
+ return false;
+ // We don't want to count self-loops
+ if (Successors.count(&BB))
return false;
- // If we outline optional branches, look whether Succ is unavoidable, i.e.
- // dominates all terminators of the MachineFunction. If it does, other
- // successors must be optional. Don't do this for cold branches.
- if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) {
- for (MachineBasicBlock *Pred : Succ->predecessors()) {
- // Check whether there is an unplaced optional branch.
- if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
- BlockToChain[Pred] == &Chain)
+ for (MachineBasicBlock *Succ : BB.successors())
+ if (!Successors.count(Succ))
+ return false;
+ return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
+/// \p BB Block to check.
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
+  // Blocks with a single successor don't create additional fallthrough
+ // opportunities. Don't duplicate them. TODO: When conditional exits are
+ // analyzable, allow them to be duplicated.
+ bool IsSimple = TailDup.isSimpleBB(BB);
+
+ if (BB->succ_size() == 1)
+ return false;
+ return TailDup.shouldTailDuplicate(IsSimple, *BB);
+}
+
+/// Compare 2 BlockFrequency's with a small penalty for \p A.
+/// In order to be conservative, we apply an X% penalty to account for
+/// increased icache pressure and static heuristics. For small frequencies
+/// we use only the numerators to improve accuracy. For simplicity, we assume
+/// the penalty is less than 100%.
+/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
+static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
+ uint64_t EntryFreq) {
+ BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+ BlockFrequency Gain = A - B;
+ return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
+}
+
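Concretely: with the default TailDupPlacementPenalty of 2, ThresholdProb is 2/100, so greaterWithBias(A, B, EntryFreq) holds exactly when (A - B) * (100/2) >= EntryFreq, i.e. when A exceeds B by at least 2% of the function's entry frequency.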
+/// Check the edge frequencies to see if tail duplication will increase
+/// fallthroughs. It only makes sense to call this function when
+/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
+/// always locally profitable if we would have picked \p Succ without
+/// considering duplication.
+bool MachineBlockPlacement::isProfitableToTailDup(
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ BranchProbability QProb,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ // We need to do a probability calculation to make sure this is profitable.
+ // First: does succ have a successor that post-dominates? This affects the
+ // calculation. The 2 relevant cases are:
+ // BB BB
+ // | \Qout | \Qout
+ // P| C |P C
+ // = C' = C'
+ // | /Qin | /Qin
+ // | / | /
+ // Succ Succ
+ // / \ | \ V
+ // U/ =V |U \
+ // / \ = D
+ // D E | /
+ // | /
+ // |/
+ // PDom
+ // '=' : Branch taken for that CFG edge
+  // In the second case, placing Succ while duplicating it into C prevents the
+  // fallthrough of Succ into either D or PDom, because they now have C as an
+  // unplaced predecessor.
+
+ // Start by figuring out which case we fall into
+ MachineBasicBlock *PDom = nullptr;
+ SmallVector<MachineBasicBlock *, 4> SuccSuccs;
+ // Only scan the relevant successors
+ auto AdjustedSuccSumProb =
+ collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
+ BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
+ auto BBFreq = MBFI->getBlockFreq(BB);
+ auto SuccFreq = MBFI->getBlockFreq(Succ);
+ BlockFrequency P = BBFreq * PProb;
+ BlockFrequency Qout = BBFreq * QProb;
+ uint64_t EntryFreq = MBFI->getEntryFreq();
+ // If there are no more successors, it is profitable to copy, as it strictly
+ // increases fallthrough.
+ if (SuccSuccs.size() == 0)
+ return greaterWithBias(P, Qout, EntryFreq);
+
+ auto BestSuccSucc = BranchProbability::getZero();
+ // Find the PDom or the best Succ if no PDom exists.
+ for (MachineBasicBlock *SuccSucc : SuccSuccs) {
+ auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
+ if (Prob > BestSuccSucc)
+ BestSuccSucc = Prob;
+ if (PDom == nullptr)
+ if (MPDT->dominates(SuccSucc, Succ)) {
+ PDom = SuccSucc;
+ break;
+ }
+ }
+ // For the comparisons, we need to know Succ's best incoming edge that isn't
+ // from BB.
+ auto SuccBestPred = BlockFrequency(0);
+ for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+ if (SuccPred == Succ || SuccPred == BB
+ || BlockToChain[SuccPred] == &Chain
+ || (BlockFilter && !BlockFilter->count(SuccPred)))
+ continue;
+ auto Freq = MBFI->getBlockFreq(SuccPred)
+ * MBPI->getEdgeProbability(SuccPred, Succ);
+ if (Freq > SuccBestPred)
+ SuccBestPred = Freq;
+ }
+ // Qin is Succ's best unplaced incoming edge that isn't BB
+ BlockFrequency Qin = SuccBestPred;
+ // If it doesn't have a post-dominating successor, here is the calculation:
+ // BB BB
+ // | \Qout | \
+ // P| C | =
+ // = C' | C
+ // | /Qin | |
+ // | / | C' (+Succ)
+ // Succ Succ /|
+ // / \ | \/ |
+ // U/ =V | == |
+ // / \ | / \|
+ // D E D E
+ // '=' : Branch taken for that CFG edge
+ // Cost in the first case is: P + V
+  // For this calculation, we always assume P > Qout. If Qout > P,
+  // the result of this function will be ignored by the caller.
+ // Let F = SuccFreq - Qin
+ // Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
+
+ if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
+ BranchProbability UProb = BestSuccSucc;
+ BranchProbability VProb = AdjustedSuccSumProb - UProb;
+ BlockFrequency F = SuccFreq - Qin;
+ BlockFrequency V = SuccFreq * VProb;
+ BlockFrequency QinU = std::min(Qin, F) * UProb;
+ BlockFrequency BaseCost = P + V;
+ BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
+ return greaterWithBias(BaseCost, DupCost, EntryFreq);
+ }
+ BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
+ BranchProbability VProb = AdjustedSuccSumProb - UProb;
+ BlockFrequency U = SuccFreq * UProb;
+ BlockFrequency V = SuccFreq * VProb;
+ BlockFrequency F = SuccFreq - Qin;
+ // If there is a post-dominating successor, here is the calculation:
+ // BB BB BB BB
+ // | \Qout | \ | \Qout | \
+ // |P C | = |P C | =
+ // = C' |P C = C' |P C
+ // | /Qin | | | /Qin | |
+ // | / | C' (+Succ) | / | C' (+Succ)
+ // Succ Succ /| Succ Succ /|
+ // | \ V | \/ | | \ V | \/ |
+ // |U \ |U /\ =? |U = |U /\ |
+ // = D = = =?| | D | = =|
+ // | / |/ D | / |/ D
+ // | / | / | = | /
+ // |/ | / |/ | =
+ // Dom Dom Dom Dom
+ // '=' : Branch taken for that CFG edge
+ // The cost for taken branches in the first case is P + U
+ // Let F = SuccFreq - Qin
+ // The cost in the second case (assuming independence), given the layout:
+ // BB, Succ, (C+Succ), D, Dom or the layout:
+ // BB, Succ, D, Dom, (C+Succ)
+ // is Qout + max(F, Qin) * U + min(F, Qin)
+ // compare P + U vs Qout + P * U + Qin.
+ //
+ // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
+ //
+ // For the 3rd case, the cost is P + 2 * V
+ // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
+ // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
+ if (UProb > AdjustedSuccSumProb / 2 &&
+ !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
+ Chain, BlockFilter))
+ // Cases 3 & 4
+ return greaterWithBias(
+ (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
+ EntryFreq);
+ // Cases 1 & 2
+ return greaterWithBias((P + U),
+ (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
+ std::max(Qin, F) * UProb),
+ EntryFreq);
+}
+
+/// Check for a trellis layout. \p BB is the upper part of a trellis if its
+/// successors form the lower part of a trellis. A successor set S forms the
+/// lower part of a trellis if all of the predecessors of S are either in S or
+/// have all of S as successors. We ignore trellises where BB doesn't have 2
+/// successors because for fewer than 2, it's trivial, and for 3 or greater they
+/// are very uncommon and complex to compute optimally. Allowing edges within S
+/// is not strictly a trellis, but the same algorithm works, so we allow it.
+bool MachineBlockPlacement::isTrellis(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ // Technically BB could form a trellis with branching factor higher than 2.
+ // But that's extremely uncommon.
+ if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
+ return false;
+
+ SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
+ BB->succ_end());
+ // To avoid reviewing the same predecessors twice.
+ SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
+
+ for (MachineBasicBlock *Succ : ViableSuccs) {
+ int PredCount = 0;
+ for (auto SuccPred : Succ->predecessors()) {
+ // Allow triangle successors, but don't count them.
+ if (Successors.count(SuccPred)) {
+ // Make sure that it is actually a triangle.
+ for (MachineBasicBlock *CheckSucc : SuccPred->successors())
+ if (!Successors.count(CheckSucc))
+ return false;
+ continue;
+ }
+ const BlockChain *PredChain = BlockToChain[SuccPred];
+ if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
+ PredChain == &Chain || PredChain == BlockToChain[Succ])
continue;
- // Check whether the optional branch has exactly one BB.
- if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
+ ++PredCount;
+ // Perform the successor check only once.
+ if (!SeenPreds.insert(SuccPred).second)
continue;
- // Check whether the optional branch is small.
- if (Pred->size() < OutlineOptionalThreshold)
+ if (!hasSameSuccessors(*SuccPred, Successors))
return false;
}
- return true;
- } else
+ // If one of the successors has only BB as a predecessor, it is not a
+ // trellis.
+ if (PredCount < 1)
+ return false;
+ }
+ return true;
+}
+
+/// Pick the highest total weight pair of edges that can both be laid out.
+/// The edges in \p Edges[0] are assumed to have a different destination than
+/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
+/// the individual highest weight edges to the 2 different destinations, or in
+/// case of a conflict, one of them should be replaced with a 2nd best edge.
+std::pair<MachineBlockPlacement::WeightedEdge,
+ MachineBlockPlacement::WeightedEdge>
+MachineBlockPlacement::getBestNonConflictingEdges(
+ const MachineBasicBlock *BB,
+ MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>>
+ Edges) {
+ // Sort the edges, and then for each successor, find the best incoming
+ // predecessor. If the best incoming predecessors aren't the same,
+ // then that is clearly the best layout. If there is a conflict, one of the
+ // successors will have to fallthrough from the second best predecessor. We
+ // compare which combination is better overall.
+
+ // Sort for highest frequency.
+ auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
+
+ std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
+ std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
+ auto BestA = Edges[0].begin();
+ auto BestB = Edges[1].begin();
+ // Arrange for the correct answer to be in BestA and BestB
+ // If the 2 best edges don't conflict, the answer is already there.
+ if (BestA->Src == BestB->Src) {
+ // Compare the total fallthrough of (Best + Second Best) for both pairs
+ auto SecondBestA = std::next(BestA);
+ auto SecondBestB = std::next(BestB);
+ BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
+ BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
+ if (BestAScore < BestBScore)
+ BestA = SecondBestA;
+ else
+ BestB = SecondBestB;
+ }
+ // Arrange for the BB edge to be in BestA if it exists.
+ if (BestB->Src == BB)
+ std::swap(BestA, BestB);
+ return std::make_pair(*BestA, *BestB);
+}
+
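A worked example of the conflict rule, with illustrative weights: let the edges into Succ1 be P->Succ1 = 30 and BB->Succ1 = 20, and the edges into Succ2 be P->Succ2 = 25 and BB->Succ2 = 10. Both best edges leave P, so the code compares 30 + 10 = 40 (keep P->Succ1, demote Succ2 to BB->Succ2) against 25 + 20 = 45 (keep P->Succ2, demote Succ1 to BB->Succ1). The second pairing wins, BestA becomes BB->Succ1, and since the BB edge already sits in BestA no final swap is needed.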
+/// Get the best successor from \p BB based on \p BB being part of a trellis.
+/// We only handle trellises with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the trellis. If those conflict,
+/// we consider which of them should be replaced with the second best.
+/// Upon return the two best edges will be in \p BestEdges. If one of the edges
+/// comes from \p BB, it will be in \p BestEdges[0]
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestTrellisSuccessor(
+ const MachineBasicBlock *BB,
+ const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+ BranchProbability AdjustedSumProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+
+ BlockAndTailDupResult Result = {nullptr, false};
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
+
+ // We assume size 2 because it's common. For general n, we would have to do
+ // the Hungarian algorithm, but it's not worth the complexity because more
+ // than 2 successors is fairly uncommon, and a trellis even more so.
+ if (Successors.size() != 2 || ViableSuccs.size() != 2)
+ return Result;
+
+ // Collect the edge frequencies of all edges that form the trellis.
+ SmallVector<WeightedEdge, 8> Edges[2];
+ int SuccIndex = 0;
+ for (auto Succ : ViableSuccs) {
+ for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+ // Skip any placed predecessors that are not BB
+ if (SuccPred != BB)
+ if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
+ BlockToChain[SuccPred] == &Chain ||
+ BlockToChain[SuccPred] == BlockToChain[Succ])
+ continue;
+ BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
+ MBPI->getEdgeProbability(SuccPred, Succ);
+ Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
+ }
+ ++SuccIndex;
+ }
+
+ // Pick the best combination of 2 edges from all the edges in the trellis.
+ WeightedEdge BestA, BestB;
+ std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
+
+ if (BestA.Src != BB) {
+ // If we have a trellis, and BB doesn't have the best fallthrough edges,
+ // we shouldn't choose any successor. We've already looked and there's a
+ // better fallthrough edge for all the successors.
+ DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+ return Result;
+ }
+
+ // Did we pick the triangle edge? If tail-duplication is profitable, do
+ // that instead. Otherwise merge the triangle edge now while we know it is
+ // optimal.
+ if (BestA.Dest == BestB.Src) {
+ // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+ // would be better.
+ MachineBasicBlock *Succ1 = BestA.Dest;
+ MachineBasicBlock *Succ2 = BestB.Dest;
+ // Check to see if tail-duplication would be profitable.
+ if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
+ canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+ isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+ Chain, BlockFilter)) {
+ DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(Succ2)
+ << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
+ Result.BB = Succ2;
+ Result.ShouldTailDup = true;
+ return Result;
+ }
+ }
+ // We have already computed the optimal edge for the other side of the
+ // trellis.
+ ComputedEdges[BestB.Src] = { BestB.Dest, false };
+
+ auto TrellisSucc = BestA.Dest;
+ DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+ MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+ dbgs() << " Selected: " << getBlockName(TrellisSucc)
+ << ", probability: " << SuccProb << " (Trellis)\n");
+ Result.BB = TrellisSucc;
+ return Result;
+}
+
+/// When the option TailDupPlacement is on, this method checks if the
+/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
+/// into all of its unplaced, unfiltered predecessors that are not BB.
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
+ const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ if (!shouldTailDuplicate(Succ))
return false;
+
+ // For CFG checking.
+ SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+ BB->succ_end());
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ // Make sure all unplaced and unfiltered predecessors can be
+ // tail-duplicated into.
+ // Skip any blocks that are already placed or not in this loop.
+ if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+ || BlockToChain[Pred] == &Chain)
+ continue;
+ if (!TailDup.canTailDuplicate(Succ, Pred)) {
+ if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
+ // This will result in a trellis after tail duplication, so we don't
+ // need to copy Succ into this predecessor. In the presence
+ // of a trellis tail duplication can continue to be profitable.
+ // For example:
+ // A A
+ // |\ |\
+ // | \ | \
+ // | C | C+BB
+ // | / | |
+ // |/ | |
+ // BB => BB |
+ // |\ |\/|
+ // | \ |/\|
+ // | D | D
+ // | / | /
+ // |/ |/
+ // Succ Succ
+ //
+ // After BB was duplicated into C, the layout looks like the one on the
+ // right. BB and C now have the same successors. When considering
+ // whether Succ can be duplicated into all its unplaced predecessors, we
+ // ignore C.
+ // We can do this because C already has a profitable fallthrough, namely
+ // D. TODO(iteratee): ignore sufficiently cold predecessors for
+ // duplication and for this test.
+ //
+ // This allows trellises to be laid out in 2 separate chains
+ // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
+ // because it allows the creation of 2 fallthrough paths with links
+ // between them, and we correctly identify the best layout for these
+ // CFGs. We want to extend trellises that the user created in addition
+ // to trellises created by tail-duplication, so we just look for the
+ // CFG.
+ continue;
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Find chains of triangles where we believe it would be profitable to
+/// tail-duplicate them all, but a local analysis would not find them.
+/// There are 3 ways this can be profitable:
+/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
+/// longer chains)
+/// 2) The chains are statically correlated. Branch probabilities have a very
+/// U-shaped distribution.
+/// [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
+/// If the branches in a chain are likely to be from the same side of the
+/// distribution as their predecessor, but are independent at runtime, this
+/// transformation is profitable. (Because the cost of being wrong is a small
+/// fixed cost, unlike the standard triangle layout where the cost of being
+/// wrong scales with the # of triangles.)
+/// 3) The chains are dynamically correlated. If the probability that a previous
+/// branch was taken positively influences whether the next branch will be
+/// taken, this transformation is profitable as well.
+/// We believe that 2 and 3 are common enough to justify the small margin in 1.
+void MachineBlockPlacement::precomputeTriangleChains() {
+ struct TriangleChain {
+ std::vector<MachineBasicBlock *> Edges;
+ TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
+ : Edges({src, dst}) {}
+
+ void append(MachineBasicBlock *dst) {
+ assert(getKey()->isSuccessor(dst) &&
+ "Attempting to append a block that is not a successor.");
+ Edges.push_back(dst);
+ }
+
+ unsigned count() const { return Edges.size() - 1; }
+
+ MachineBasicBlock *getKey() const {
+ return Edges.back();
+ }
+ };
+
+ if (TriangleChainCount == 0)
+ return;
+
+ DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+ // Map from last block to the chain that contains it. This allows us to extend
+ // chains as we find new triangles.
+ DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
+ for (MachineBasicBlock &BB : *F) {
+ // If BB doesn't have 2 successors, it doesn't start a triangle.
+ if (BB.succ_size() != 2)
+ continue;
+ MachineBasicBlock *PDom = nullptr;
+ for (MachineBasicBlock *Succ : BB.successors()) {
+ if (!MPDT->dominates(Succ, &BB))
+ continue;
+ PDom = Succ;
+ break;
+ }
+ // If BB doesn't have a post-dominating successor, it doesn't form a
+ // triangle.
+ if (PDom == nullptr)
+ continue;
+ // If PDom has a hint that it is low probability, skip this triangle.
+ if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
+ continue;
+ // If PDom isn't eligible for duplication, this isn't the kind of triangle
+ // we're looking for.
+ if (!shouldTailDuplicate(PDom))
+ continue;
+ bool CanTailDuplicate = true;
+    // If PDom can't be tail-duplicated into its non-BB predecessors, then this
+ // isn't the kind of triangle we're looking for.
+ for (MachineBasicBlock* Pred : PDom->predecessors()) {
+ if (Pred == &BB)
+ continue;
+ if (!TailDup.canTailDuplicate(PDom, Pred)) {
+ CanTailDuplicate = false;
+ break;
+ }
+ }
+ // If we can't tail-duplicate PDom to its predecessors, then skip this
+ // triangle.
+ if (!CanTailDuplicate)
+ continue;
+
+ // Now we have an interesting triangle. Insert it if it's not part of an
+ // existing chain.
+    // Note: This cannot be replaced with a call to insert() or emplace() because
+ // the find key is BB, but the insert/emplace key is PDom.
+ auto Found = TriangleChainMap.find(&BB);
+ // If it is, remove the chain from the map, grow it, and put it back in the
+ // map with the end as the new key.
+ if (Found != TriangleChainMap.end()) {
+ TriangleChain Chain = std::move(Found->second);
+ TriangleChainMap.erase(Found);
+ Chain.append(PDom);
+ TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
+ } else {
+ auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
+ assert(InsertResult.second && "Block seen twice.");
+ (void)InsertResult;
+ }
+ }
+
+ // Iterating over a DenseMap is safe here, because the only thing in the body
+ // of the loop is inserting into another DenseMap (ComputedEdges).
+ // ComputedEdges is never iterated, so this doesn't lead to non-determinism.
+ for (auto &ChainPair : TriangleChainMap) {
+ TriangleChain &Chain = ChainPair.second;
+    // Benchmarking has shown that, due to branch correlation, duplicating 2 or
+ // more triangles is profitable, despite the calculations assuming
+ // independence.
+ if (Chain.count() < TriangleChainCount)
+ continue;
+ MachineBasicBlock *dst = Chain.Edges.back();
+ Chain.Edges.pop_back();
+ for (MachineBasicBlock *src : reverse(Chain.Edges)) {
+ DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" <<
+ getBlockName(dst) << " as pre-computed based on triangles.\n");
+
+ auto InsertResult = ComputedEdges.insert({src, {dst, true}});
+ assert(InsertResult.second && "Block seen twice.");
+ (void)InsertResult;
+
+ dst = src;
+ }
+ }
}
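To illustrate with hypothetical blocks: given a triangle A -> {B, C} where C post-dominates A, followed by C -> {D, E} where E post-dominates C, the first triangle enters the map keyed by C and the second extends it to the chain [A, C, E] with count() == 2. With the default TriangleChainCount of 2 that chain qualifies, so the final loop records A->C and C->E in ComputedEdges with ShouldTailDup set, and selectBestSuccessor() will honor the whole chain later.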
// When profile is not present, return the StaticLikelyProb.
// When profile is available, we need to handle the triangle-shape CFG.
static BranchProbability getLayoutSuccessorProbThreshold(
- MachineBasicBlock *BB) {
+ const MachineBasicBlock *BB) {
if (!BB->getParent()->getFunction()->getEntryCount())
return BranchProbability(StaticLikelyProb, 100);
if (BB->succ_size() == 2) {
@@ -609,11 +1211,11 @@ static BranchProbability getLayoutSuccessorProbThreshold(
if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
/* See case 1 below for the cost analysis. For BB->Succ to
* be taken with smaller cost, the following needs to hold:
- * Prob(BB->Succ) > 2* Prob(BB->Pred)
- * So the threshold T
- * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
- * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified
- * branch bias, we have
+ * Prob(BB->Succ) > 2 * Prob(BB->Pred)
+ * So the threshold T in the calculation below
+ * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
+          * So T / (1 - T) = 2, yielding T = 2/3.
+ * Also adding user specified branch bias, we have
* T = (2/3)*(ProfileLikelyProb/50)
*   = (2*ProfileLikelyProb)/150
*/
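For instance, if ProfileLikelyProb keeps its usual default of 51, the threshold works out to (2 * 51) / 150 = 102/150 = 68%, slightly above the unbiased 2/3.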
@@ -625,10 +1227,17 @@ static BranchProbability getLayoutSuccessorProbThreshold(
/// Checks to see if the layout candidate block \p Succ has a better layout
/// predecessor than \c BB. If yes, returns true.
+/// \p SuccProb: The probability adjusted for only remaining blocks.
+///   Only used for logging.
+/// \p RealSuccProb: The un-adjusted probability.
+/// \p Chain: The chain that BB belongs to and Succ is being considered for.
+/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
+///    considered.
bool MachineBlockPlacement::hasBetterLayoutPredecessor(
- MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
- BranchProbability SuccProb, BranchProbability RealSuccProb,
- BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+ const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+ const BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
// There isn't a better layout when there are no unscheduled predecessors.
if (SuccChain.UnscheduledPredecessors == 0)
@@ -689,9 +1298,9 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
// | | | |
// ---BB | | BB
// | | | |
- // | pred-- | Succ--
+ // | Pred-- | Succ--
// | | | |
- // ---succ ---pred--
+ // ---Succ ---Pred--
//
// cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
// = freq(S->Pred) + freq(S->BB)
@@ -734,11 +1343,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
// | Pred----| | S1----
// | | | |
// --(S1 or S2) ---Pred--
+ // |
+ // S2
//
// topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
// + min(freq(Pred->S1), freq(Pred->S2))
// Non-topo-order cost:
- // In the worst case, S2 will not get laid out after Pred.
// non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
// To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
// is 0. Then the non topo layout is better when
@@ -756,13 +1366,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
for (MachineBasicBlock *Pred : Succ->predecessors()) {
if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
(BlockFilter && !BlockFilter->count(Pred)) ||
- BlockToChain[Pred] == &Chain)
+ BlockToChain[Pred] == &Chain ||
+        // This check is redundant except for lookahead. This function is
+ // called for lookahead by isProfitableToTailDup when BB hasn't been
+ // placed yet.
+ (Pred == BB))
continue;
// Do backward checking.
// For all cases above, we need a backward checking to filter out edges that
- // are not 'strongly' biased. With profile data available, the check is
- // mostly redundant for case 2 (when threshold prob is set at 50%) unless S
- // has more than two successors.
+ // are not 'strongly' biased.
// BB Pred
// \ /
// Succ
@@ -798,14 +1410,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
/// breaking CFG structure, but cave and break such structures in the case of
/// very hot successor edges.
///
-/// \returns The best successor block found, or null if none are viable.
-MachineBasicBlock *
-MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
- BlockChain &Chain,
- const BlockFilterSet *BlockFilter) {
+/// \returns The best successor block found, or null if none are viable, along
+/// with a boolean indicating if tail duplication is necessary.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::selectBestSuccessor(
+ const MachineBasicBlock *BB, const BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
const BranchProbability HotProb(StaticLikelyProb, 100);
- MachineBasicBlock *BestSucc = nullptr;
+ BlockAndTailDupResult BestSucc = { nullptr, false };
auto BestProb = BranchProbability::getZero();
SmallVector<MachineBasicBlock *, 4> Successors;
@@ -813,22 +1426,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
collectViableSuccessors(BB, Chain, BlockFilter, Successors);
DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
+
+  // If we already precomputed the best successor for BB, return it if still
+  // applicable.
+  auto FoundEdge = ComputedEdges.find(BB);
+  if (FoundEdge != ComputedEdges.end()) {
+    // Copy the result out before erasing: erase() invalidates the iterator,
+    // so FoundEdge->second must not be read after the call below.
+    BlockAndTailDupResult Precomputed = FoundEdge->second;
+    MachineBasicBlock *Succ = Precomputed.BB;
+    ComputedEdges.erase(FoundEdge);
+    BlockChain *SuccChain = BlockToChain[Succ];
+    if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
+        SuccChain != &Chain && Succ == *SuccChain->begin())
+      return Precomputed;
+ }
+
+  // If BB is part of a trellis, use the trellis to determine the optimal
+  // fallthrough edges.
+ if (isTrellis(BB, Successors, Chain, BlockFilter))
+ return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
+ BlockFilter);
+
+ // For blocks with CFG violations, we may be able to lay them out anyway with
+ // tail-duplication. We keep this vector so we can perform the probability
+ // calculations the minimum number of times.
+ SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
+ DupCandidates;
for (MachineBasicBlock *Succ : Successors) {
auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
BranchProbability SuccProb =
getAdjustedProbability(RealSuccProb, AdjustedSumProb);
- // This heuristic is off by default.
- if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
- HotProb))
- return Succ;
-
BlockChain &SuccChain = *BlockToChain[Succ];
// Skip the edge \c BB->Succ if block \c Succ has a better layout
// predecessor that yields lower global cost.
if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
- Chain, BlockFilter))
+ Chain, BlockFilter)) {
+ // If tail duplication would make Succ profitable, place it.
+ if (TailDupPlacement && shouldTailDuplicate(Succ))
+ DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
continue;
+ }
DEBUG(
dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
@@ -836,17 +1472,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
<< (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
<< "\n");
- if (BestSucc && BestProb >= SuccProb) {
+ if (BestSucc.BB && BestProb >= SuccProb) {
DEBUG(dbgs() << " Not the best candidate, continuing\n");
continue;
}
DEBUG(dbgs() << " Setting it as best candidate\n");
- BestSucc = Succ;
+ BestSucc.BB = Succ;
BestProb = SuccProb;
}
- if (BestSucc)
- DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc) << "\n");
+ // Handle the tail duplication candidates in order of decreasing probability.
+ // Stop at the first one that is profitable. Also stop if they are less
+ // profitable than BestSucc. Position matters because we prefer the first
+ // best match; since we no longer iterate in discovery order, a stable sort
+ // preserves the original position as the tie-breaker.
+ if (DupCandidates.size() != 0) {
+ auto cmp =
+ [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
+ const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
+ return std::get<0>(a) > std::get<0>(b);
+ };
+ std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
+ }
for (auto &Tup : DupCandidates) {
+ BranchProbability DupProb;
+ MachineBasicBlock *Succ;
+ std::tie(DupProb, Succ) = Tup;
+ if (DupProb < BestProb)
+ break;
+ if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
+ && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
+ DEBUG(
+ dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
+ << DupProb
+ << " (Tail Duplicate)\n");
+ BestSucc.BB = Succ;
+ BestSucc.ShouldTailDup = true;
+ break;
+ }
+ }
+
+ if (BestSucc.BB)
+ DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
return BestSucc;
}
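
The candidate handling added above (collect, stable-sort by probability, take
the first profitable entry) can be read in isolation. Below is a minimal,
hypothetical sketch; Block, Prob, and the Profitable callback are stand-ins
rather than LLVM types:

    #include <algorithm>
    #include <functional>
    #include <tuple>
    #include <vector>

    struct Block;                         // stand-in for MachineBasicBlock
    using Prob = double;                  // stand-in for BranchProbability
    using Candidate = std::tuple<Prob, Block *>;

    Block *pickFirstProfitable(std::vector<Candidate> &Cands, Prob BestProb,
                               const std::function<bool(Block *)> &Profitable) {
      // Descending probability; stable so earlier candidates win ties.
      std::stable_sort(Cands.begin(), Cands.end(),
                       [](const Candidate &A, const Candidate &B) {
                         return std::get<0>(A) > std::get<0>(B);
                       });
      for (auto &Tup : Cands) {
        Prob P;
        Block *B;
        std::tie(P, B) = Tup;
        if (P < BestProb)
          break;                          // nothing left can beat BestProb
        if (Profitable(B))
          return B;                       // first profitable one wins
      }
      return nullptr;
    }
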
@@ -862,7 +1529,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
///
/// \returns The best block found, or null if none are viable.
MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
- BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
+ const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
// Once we need to walk the worklist looking for a candidate, cleanup the
// worklist of already placed entries.
// FIXME: If this shows up on profiles, it could be folded (at the cost of
@@ -881,13 +1548,15 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
MachineBasicBlock *BestBlock = nullptr;
BlockFrequency BestFreq;
for (MachineBasicBlock *MBB : WorkList) {
- assert(MBB->isEHPad() == IsEHPad);
+ assert(MBB->isEHPad() == IsEHPad &&
+ "EHPad mismatch between block and work list.");
BlockChain &SuccChain = *BlockToChain[MBB];
if (&SuccChain == &Chain)
continue;
- assert(SuccChain.UnscheduledPredecessors == 0 && "Found CFG-violating block");
+ assert(SuccChain.UnscheduledPredecessors == 0 &&
+ "Found CFG-violating block");
BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
@@ -948,16 +1617,19 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
}
void MachineBlockPlacement::fillWorkLists(
- MachineBasicBlock *MBB,
+ const MachineBasicBlock *MBB,
SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
const BlockFilterSet *BlockFilter = nullptr) {
BlockChain &Chain = *BlockToChain[MBB];
if (!UpdatedPreds.insert(&Chain).second)
return;
- assert(Chain.UnscheduledPredecessors == 0);
+ assert(
+ Chain.UnscheduledPredecessors == 0 &&
+ "Attempting to place block with unscheduled predecessors in worklist.");
for (MachineBasicBlock *ChainBB : Chain) {
- assert(BlockToChain[ChainBB] == &Chain);
+ assert(BlockToChain[ChainBB] == &Chain &&
+ "Block in chain doesn't match BlockToChain map.");
for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
if (BlockFilter && !BlockFilter->count(Pred))
continue;
@@ -970,23 +1642,23 @@ void MachineBlockPlacement::fillWorkLists(
if (Chain.UnscheduledPredecessors != 0)
return;
- MBB = *Chain.begin();
- if (MBB->isEHPad())
- EHPadWorkList.push_back(MBB);
+ MachineBasicBlock *BB = *Chain.begin();
+ if (BB->isEHPad())
+ EHPadWorkList.push_back(BB);
else
- BlockWorkList.push_back(MBB);
+ BlockWorkList.push_back(BB);
}
void MachineBlockPlacement::buildChain(
- MachineBasicBlock *BB, BlockChain &Chain,
+ const MachineBasicBlock *HeadBB, BlockChain &Chain,
BlockFilterSet *BlockFilter) {
- assert(BB && "BB must not be null.\n");
- assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
+ assert(HeadBB && "BB must not be null.\n");
+ assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
- MachineBasicBlock *LoopHeaderBB = BB;
+ const MachineBasicBlock *LoopHeaderBB = HeadBB;
markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
- BB = *std::prev(Chain.end());
+ MachineBasicBlock *BB = *std::prev(Chain.end());
for (;;) {
assert(BB && "null block found at end of chain in loop.");
assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
@@ -995,7 +1667,11 @@ void MachineBlockPlacement::buildChain(
// Look for the best viable successor if there is one to place immediately
// after this block.
- MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+ auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
+ MachineBasicBlock* BestSucc = Result.BB;
+ bool ShouldTailDup = Result.ShouldTailDup;
+ if (TailDupPlacement)
+ ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
// If an immediate successor isn't available, look for the best viable
// block among those we've identified as not violating the loop's CFG at
@@ -1016,7 +1692,7 @@ void MachineBlockPlacement::buildChain(
// Placement may have changed tail duplication opportunities.
// Check for that now.
- if (TailDupPlacement && BestSucc) {
+ if (TailDupPlacement && BestSucc && ShouldTailDup) {
// If the chosen successor was duplicated into all its predecessors,
// don't bother laying it out, just go round the loop again with BB as
// the chain end.
@@ -1052,7 +1728,7 @@ void MachineBlockPlacement::buildChain(
/// unconditional jump (for the backedge) rotating it in front of the loop
/// header is always profitable.
MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
+MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
const BlockFilterSet &LoopBlockSet) {
// Placing the latch block before the header may introduce an extra branch
// that skips this block the first time the loop is executed, which we want
@@ -1116,7 +1792,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
/// block to layout at the top of the loop. Typically this is done to maximize
/// fallthrough opportunities.
MachineBasicBlock *
-MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
+MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
const BlockFilterSet &LoopBlockSet) {
// We don't want to layout the loop linearly in all cases. If the loop header
// is just a normal basic block in the loop, we want to look for what block
@@ -1235,12 +1911,18 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
/// branches. For example, if the loop has fallthrough into its header and out
/// of its bottom already, don't rotate it.
void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
- MachineBasicBlock *ExitingBB,
+ const MachineBasicBlock *ExitingBB,
const BlockFilterSet &LoopBlockSet) {
if (!ExitingBB)
return;
MachineBasicBlock *Top = *LoopChain.begin();
+ MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
+
+ // If ExitingBB is already the last block in the chain, there is nothing to do.
+ if (Bottom == ExitingBB)
+ return;
+
bool ViableTopFallthrough = false;
for (MachineBasicBlock *Pred : Top->predecessors()) {
BlockChain *PredChain = BlockToChain[Pred];
@@ -1255,7 +1937,6 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
// bottom is a viable exiting block. If so, bail out as rotating will
// introduce an unnecessary branch.
if (ViableTopFallthrough) {
- MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
for (MachineBasicBlock *Succ : Bottom->successors()) {
BlockChain *SuccChain = BlockToChain[Succ];
if (!LoopBlockSet.count(Succ) &&
@@ -1268,6 +1949,36 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
if (ExitIt == LoopChain.end())
return;
+ // Rotating a loop exit to the bottom when there is a fallthrough to the top
+ // trades the entry fallthrough for an exit fallthrough.
+ // If there is no bottom->top edge, but the chosen exit block does have
+ // a fallthrough, we break that fallthrough for nothing in return.
+
+ // Consider an example: we have built a chain of basic blocks
+ // B1, B2, ..., Bn, where Bk is the chosen exit block (ExitingBB).
+ // After rotation we get
+ // Bk+1, ..., Bn, B1, ..., Bk
+ // The broken fallthrough into B1 is compensated by the new fallthrough out
+ // of Bk. But if there was a fallthrough Bk -> Bk+1, it is broken now, and
+ // only a fallthrough Bn -> B1 could compensate for it.
+ // This gives a condition for avoiding an extra branch from loop rotation.
+ // All of the following must hold to skip the rotation:
+ // There is a fallthrough to the top (B1);
+ // There was a fallthrough from the chosen exit block (Bk) to the next
+ // one (Bk+1);
+ // There is no fallthrough from the bottom (Bn) to the top (B1).
+ // Note that there is no exit fallthrough from Bn, because we checked that
+ // above.
+ if (ViableTopFallthrough) {
+ assert(std::next(ExitIt) != LoopChain.end() &&
+ "Exit should not be last BB");
+ MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
+ if (ExitingBB->isSuccessor(NextBlockInChain))
+ if (!Bottom->isSuccessor(Top))
+ return;
+ }
+
+ DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
+ << " at bottom\n");
std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
}
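
To make the no-rotation condition in rotateLoop concrete, a small invented
example:

    Chain before rotation:  B1 -> B2 -> B3 -> B4    (ExitingBB = B2)
    Chain after rotation:   B3 -> B4 -> B1 -> B2
    Assume a fallthrough into B1 from outside the loop, a CFG edge B2 -> B3,
    and no edge B4 -> B1. Rotation breaks the fallthrough B2 -> B3, and with
    no B4 -> B1 edge nothing compensates for it, so the early return above
    fires and the chain is left unrotated.
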
@@ -1285,7 +1996,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
/// Therefore, the cost for a given rotation is the sum of costs listed above.
/// We select the best rotation with the smallest cost.
void MachineBlockPlacement::rotateLoopWithProfile(
- BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) {
+ BlockChain &LoopChain, const MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
auto HeaderBB = L.getHeader();
auto HeaderIter = find(LoopChain, HeaderBB);
auto RotationPos = LoopChain.end();
@@ -1422,7 +2134,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
/// When profile data is available, exclude cold blocks from the returned set;
/// otherwise, collect all blocks in the loop.
MachineBlockPlacement::BlockFilterSet
-MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
+MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
BlockFilterSet LoopBlockSet;
// Filter cold blocks off from LoopBlockSet when profile data is available.
@@ -1459,14 +2171,16 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
/// as much as possible. We can then stitch the chains together in a way which
/// both preserves the topological structure and minimizes taken conditional
/// branches.
-void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
+void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
// First recurse through any nested loops, building chains for those inner
// loops.
- for (MachineLoop *InnerLoop : L)
+ for (const MachineLoop *InnerLoop : L)
buildLoopChains(*InnerLoop);
- assert(BlockWorkList.empty());
- assert(EHPadWorkList.empty());
+ assert(BlockWorkList.empty() &&
+ "BlockWorkList not empty when starting to build loop chains.");
+ assert(EHPadWorkList.empty() &&
+ "EHPadWorkList not empty when starting to build loop chains.");
BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
// Check if we have profile data for this function. If yes, we will rotate
@@ -1496,10 +2210,11 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
// walk the blocks, and use a set to prevent visiting a particular chain
// twice.
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
- assert(LoopChain.UnscheduledPredecessors == 0);
+ assert(LoopChain.UnscheduledPredecessors == 0 &&
+ "LoopChain should not have unscheduled predecessors.");
UpdatedPreds.insert(&LoopChain);
- for (MachineBasicBlock *LoopBB : LoopBlockSet)
+ for (const MachineBasicBlock *LoopBB : LoopBlockSet)
fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
buildChain(LoopTop, LoopChain, &LoopBlockSet);
@@ -1533,7 +2248,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
if (!LoopBlockSet.empty()) {
BadLoop = true;
- for (MachineBasicBlock *LoopBB : LoopBlockSet)
+ for (const MachineBasicBlock *LoopBB : LoopBlockSet)
dbgs() << "Loop contains blocks never placed into a chain!\n"
<< " Loop header: " << getBlockName(*L.block_begin()) << "\n"
<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
@@ -1546,31 +2261,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
EHPadWorkList.clear();
}
-/// When OutlineOpitonalBranches is on, this method collects BBs that
-/// dominates all terminator blocks of the function \p F.
-void MachineBlockPlacement::collectMustExecuteBBs() {
- if (OutlineOptionalBranches) {
- // Find the nearest common dominator of all of F's terminators.
- MachineBasicBlock *Terminator = nullptr;
- for (MachineBasicBlock &MBB : *F) {
- if (MBB.succ_size() == 0) {
- if (Terminator == nullptr)
- Terminator = &MBB;
- else
- Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
- }
- }
-
- // MBBs dominating this common dominator are unavoidable.
- UnavoidableBlocks.clear();
- for (MachineBasicBlock &MBB : *F) {
- if (MDT->dominates(&MBB, Terminator)) {
- UnavoidableBlocks.insert(&MBB);
- }
- }
- }
-}
-
void MachineBlockPlacement::buildCFGChains() {
// Ensure that every BB in the function has an associated chain to simplify
// the assumptions of the remaining algorithm.
@@ -1605,16 +2295,15 @@ void MachineBlockPlacement::buildCFGChains() {
}
}
- // Turned on with OutlineOptionalBranches option
- collectMustExecuteBBs();
-
// Build any loop-based chains.
PreferredLoopExit = nullptr;
for (MachineLoop *L : *MLI)
buildLoopChains(*L);
- assert(BlockWorkList.empty());
- assert(EHPadWorkList.empty());
+ assert(BlockWorkList.empty() &&
+ "BlockWorkList should be empty before building final chain.");
+ assert(EHPadWorkList.empty() &&
+ "EHPadWorkList should be empty before building final chain.");
SmallPtrSet<BlockChain *, 4> UpdatedPreds;
for (MachineBasicBlock &MBB : *F)
@@ -1839,7 +2528,7 @@ void MachineBlockPlacement::alignBlocks() {
/// @return true if \p BB was removed.
bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *&LPred,
- MachineBasicBlock *LoopHeaderBB,
+ const MachineBasicBlock *LoopHeaderBB,
BlockChain &Chain, BlockFilterSet *BlockFilter,
MachineFunction::iterator &PrevUnplacedBlockIt) {
bool Removed, DuplicatedToLPred;
@@ -1901,21 +2590,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
/// \return - True if the block was duplicated into all preds and removed.
bool MachineBlockPlacement::maybeTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *LPred,
- const BlockChain &Chain, BlockFilterSet *BlockFilter,
+ BlockChain &Chain, BlockFilterSet *BlockFilter,
MachineFunction::iterator &PrevUnplacedBlockIt,
bool &DuplicatedToLPred) {
-
DuplicatedToLPred = false;
+ if (!shouldTailDuplicate(BB))
+ return false;
+
DEBUG(dbgs() << "Redoing tail duplication for Succ#"
<< BB->getNumber() << "\n");
- bool IsSimple = TailDup.isSimpleBB(BB);
- // Blocks with single successors don't create additional fallthrough
- // opportunities. Don't duplicate them. TODO: When conditional exits are
- // analyzable, allow them to be duplicated.
- if (!IsSimple && BB->succ_size() == 1)
- return false;
- if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
- return false;
+
// This has to be a callback because none of it can be done after
// BB is deleted.
bool Removed = false;
@@ -1967,6 +2651,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+ bool IsSimple = TailDup.isSimpleBB(BB);
TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
&DuplicatedPreds, &RemovalCallbackRef);
@@ -2006,25 +2691,46 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
MLI = &getAnalysis<MachineLoopInfo>();
TII = MF.getSubtarget().getInstrInfo();
TLI = MF.getSubtarget().getTargetLowering();
- MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = nullptr;
// Initialize PreferredLoopExit to nullptr here since it may never be set if
// there are no MachineLoops.
PreferredLoopExit = nullptr;
+ assert(BlockToChain.empty() &&
+ "BlockToChain map should be empty before starting placement.");
+ assert(ComputedEdges.empty() &&
+ "Computed Edge map should be empty before starting placement.");
+
+ unsigned TailDupSize = TailDupPlacementThreshold;
+ // If only the aggressive threshold is explicitly set, use it.
+ if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+ TailDupPlacementThreshold.getNumOccurrences() == 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ // For aggressive optimization, we can adjust some thresholds to be less
+ // conservative.
+ if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+ // At O3 we should be more willing to copy blocks for tail duplication,
+ // even though it increases size pressure.
+ // Do this unless only the regular threshold is explicitly set.
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+ TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+ TailDupSize = TailDupPlacementAggressiveThreshold;
+ }
+
if (TailDupPlacement) {
- unsigned TailDupSize = TailDuplicatePlacementThreshold;
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
if (MF.getFunction()->optForSize())
TailDupSize = 1;
TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
+ precomputeTriangleChains();
}
- assert(BlockToChain.empty());
-
buildCFGChains();
// Changing the layout can create new tail merging opportunities.
- TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
// TailMerge can create jump into if branches that make CFG irreducible for
// HW that requires structured CFG.
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
@@ -2032,7 +2738,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BranchFoldPlacement;
// No tail merging opportunities if the block number is less than four.
if (MF.size() > 3 && EnableTailMerge) {
- unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1;
+ unsigned TailMergeSize = TailDupSize + 1;
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
*MBPI, TailMergeSize);
@@ -2041,8 +2747,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
/*AfterBlockPlacement=*/true)) {
// Redo the layout if tail merging creates/removes/moves blocks.
BlockToChain.clear();
- // Must redo the dominator tree if blocks were changed.
- MDT->runOnMachineFunction(MF);
+ ComputedEdges.clear();
+ // Must redo the post-dominator tree if blocks were changed.
+ if (MPDT)
+ MPDT->runOnMachineFunction(MF);
ChainAllocator.DestroyAll();
buildCFGChains();
}
@@ -2052,6 +2760,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
alignBlocks();
BlockToChain.clear();
+ ComputedEdges.clear();
ChainAllocator.DestroyAll();
if (AlignAllBlock)
@@ -2067,6 +2776,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
MBI->setAlignment(AlignAllNonFallThruBlocks);
}
}
+ if (ViewBlockLayoutWithBFI != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F->getFunction()->getName().equals(ViewBlockFreqFuncName))) {
+ MBFI->view("MBP." + MF.getName(), false);
+ }
+
// We always return true as we have no way to track whether the final order
// differs from the original order.
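
The flag-precedence logic added to runOnMachineFunction above reduces to a
small decision function. A hedged distillation (names invented; the two "Set"
flags mirror cl::opt::getNumOccurrences() != 0 for the regular and aggressive
thresholds):

    // Pick the tail-duplication size threshold from the two flags and the
    // optimization level, matching the precedence in the hunk above.
    unsigned pickTailDupSize(bool RegularSet, bool AggressiveSet, bool AtO3,
                             unsigned Regular, unsigned Aggressive) {
      unsigned Size = Regular;
      // Only the aggressive flag was given explicitly: honor it at any -O.
      if (AggressiveSet && !RegularSet)
        Size = Aggressive;
      // At -O3, take the aggressive value unless the user pinned only the
      // regular threshold.
      if (AtO3 && (!RegularSet || AggressiveSet))
        Size = Aggressive;
      return Size;
    }
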
diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
index 0766f46..582ff13 100644
--- a/contrib/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
@@ -13,7 +13,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/SmallSet.h"
@@ -22,6 +21,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
@@ -108,12 +108,12 @@ namespace {
char MachineCSE::ID = 0;
char &llvm::MachineCSEID = MachineCSE::ID;
-INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse",
- "Machine Common Subexpression Elimination", false, false)
+INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE,
+ "Machine Common Subexpression Elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MachineCSE, "machine-cse",
- "Machine Common Subexpression Elimination", false, false)
+INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE,
+ "Machine Common Subexpression Elimination", false, false)
/// The source register of a COPY machine instruction can be propagated to all
/// its users, and this propagation could increase the probability of finding
@@ -180,8 +180,8 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
I = skipDebugInstructionsForward(I, E);
if (I == E)
- // Reached end of block, register is obviously dead.
- return true;
+ // Reached end of block, we don't know if register is dead or not.
+ return false;
bool SeenDef = false;
for (const MachineOperand &MO : I->operands()) {
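
The MachineCSE fix above flips the end-of-block case from "trivially dead" to
"unknown". A minimal sketch of the corrected, conservative scan, with
simplified stand-in types rather than MachineOperand inspection:

    #include <vector>

    enum class Op { Read, Def, Other };

    // A def of Reg before any read proves the original def dead; a read
    // proves it live; falling off the end of the block proves nothing,
    // because Reg may be live-out into a successor.
    bool isTriviallyDead(const std::vector<Op> &Uses) {
      for (Op O : Uses) {
        if (O == Op::Read)
          return false;      // read first: live
        if (O == Op::Def)
          return true;       // redefined first: dead
      }
      return false;          // end of block: conservatively assume live
    }
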
diff --git a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
index 5beed5f..e6f80db 100644
--- a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -8,11 +8,9 @@
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
-// instructions does not lengthen the critical path or the resource depth.
+// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "machine-combiner"
-
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -32,6 +30,8 @@
using namespace llvm;
+#define DEBUG_TYPE "machine-combiner"
+
STATISTIC(NumInstCombined, "Number of machineinst combined");
namespace {
@@ -86,11 +86,11 @@ private:
char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;
-INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
+INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
"Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
-INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
+INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
false, false)
void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -135,7 +135,9 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
// are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
for (auto *InstrPtr : InsInstrs) { // for each Use
unsigned IDepth = 0;
- DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";);
+ DEBUG(dbgs() << "NEW INSTR ";
+ InstrPtr->print(dbgs(), TII);
+ dbgs() << "\n";);
for (const MachineOperand &MO : InstrPtr->operands()) {
// Check for virtual register operand.
if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
@@ -352,6 +354,19 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
return false;
}
+static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
+ SmallVector<MachineInstr *, 16> InsInstrs,
+ SmallVector<MachineInstr *, 16> DelInstrs,
+ MachineTraceMetrics *Traces) {
+ for (auto *InstrPtr : InsInstrs)
+ MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
+ for (auto *InstrPtr : DelInstrs)
+ InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
+ ++NumInstCombined;
+ Traces->invalidate(MBB);
+ Traces->verifyAnalysis();
+}
+
/// Substitute a slow code sequence with a faster one by
/// evaluating instruction combining pattern.
/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
@@ -406,7 +421,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
if (!MinInstr)
MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
- MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
Traces->verifyAnalysis();
TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
InstrIdxForVirtReg);
@@ -426,23 +440,23 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
// fewer instructions OR
// the new sequence neither lengthens the critical path nor increases
// resource pressure.
- if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
- (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
- DelInstrs, InstrIdxForVirtReg, P) &&
- preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
- for (auto *InstrPtr : InsInstrs)
- MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
- for (auto *InstrPtr : DelInstrs)
- InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
-
- Changed = true;
- ++NumInstCombined;
-
- Traces->invalidate(MBB);
- Traces->verifyAnalysis();
+ if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
// Eagerly stop after the first pattern fires.
+ Changed = true;
break;
} else {
+ // Calculating the trace metrics may be expensive,
+ // so only do this when necessary.
+ MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+ if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg, P) &&
+ preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
+ insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+ // Eagerly stop after the first pattern fires.
+ Changed = true;
+ break;
+ }
// Cleanup instructions of the alternative code sequence. There is no
// use for them.
MachineFunction *MF = MBB->getParent();
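
The MachineCombiner restructuring above defers the trace query until the cheap
instruction-count test has failed, since computing trace metrics may be
expensive. A generic, runnable miniature of that lazy-evaluation pattern (not
the LLVM classes themselves):

    #include <functional>
    #include <iostream>
    #include <utility>

    // The expensive computation runs at most once, and only if the cheap
    // early-exit check did not already decide the question.
    template <typename T> class Lazy {
      std::function<T()> Compute;
      bool Done = false;
      T Value{};
    public:
      explicit Lazy(std::function<T()> F) : Compute(std::move(F)) {}
      const T &get() {
        if (!Done) {
          Value = Compute();
          Done = true;
        }
        return Value;
      }
    };

    int main() {
      Lazy<int> Trace([] { std::cout << "computing trace\n"; return 42; });
      bool CheapTestFired = true;          // doSubstitute() succeeded
      if (CheapTestFired)
        std::cout << "combined without trace metrics\n";
      else
        std::cout << "critical path " << Trace.get() << "\n";
    }
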
diff --git a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 5de6dec..7d5a681 100644
--- a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
@@ -19,6 +18,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -27,7 +27,7 @@
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "codegen-cp"
+#define DEBUG_TYPE "machine-cp"
STATISTIC(NumDeletes, "Number of dead copies deleted");
@@ -79,7 +79,7 @@ namespace {
char MachineCopyPropagation::ID = 0;
char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
-INITIALIZE_PASS(MachineCopyPropagation, "machine-cp",
+INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE,
"Machine Copy Propagation Pass", false, false)
/// Remove any entry in \p Map where the register is a subregister or equal to
@@ -291,17 +291,9 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
if (MO.isDef()) {
Defs.push_back(Reg);
- } else {
+ continue;
+ } else if (MO.readsReg())
ReadRegister(Reg);
- }
- // Treat undef use like defs for copy propagation but not for
- // dead copy. We would need to do a liveness check to be sure the copy
- // is dead for undef uses.
- // The backends are allowed to do whatever they want with undef value
- // and we cannot be sure this register will not be rewritten to break
- // some false dependencies for the hardware for instance.
- if (MO.isUndef())
- Defs.push_back(Reg);
}
// The instruction has a register mask operand which means that it clobbers
diff --git a/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
index acb7c48..b559e4e 100644
--- a/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -12,11 +12,11 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/Passes.h"
-
using namespace llvm;
namespace llvm {
-template class DominanceFrontierBase<MachineBasicBlock>;
+template class DominanceFrontierBase<MachineBasicBlock, false>;
+template class DominanceFrontierBase<MachineBasicBlock, true>;
template class ForwardDominanceFrontierBase<MachineBasicBlock>;
}
diff --git a/contrib/llvm/lib/CodeGen/MachineDominators.cpp b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
index 303a6a9..845e823 100644
--- a/contrib/llvm/lib/CodeGen/MachineDominators.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -31,7 +31,7 @@ static cl::opt<bool, true> VerifyMachineDomInfoX(
namespace llvm {
template class DomTreeNodeBase<MachineBasicBlock>;
-template class DominatorTreeBase<MachineBasicBlock>;
+template class DominatorTreeBase<MachineBasicBlock, false>; // DomTreeBase
}
char MachineDominatorTree::ID = 0;
@@ -49,32 +49,29 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
CriticalEdgesToSplit.clear();
NewBBs.clear();
+ DT.reset(new DomTreeBase<MachineBasicBlock>());
DT->recalculate(F);
-
return false;
}
MachineDominatorTree::MachineDominatorTree()
: MachineFunctionPass(ID) {
initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
- DT = new DominatorTreeBase<MachineBasicBlock>(false);
-}
-
-MachineDominatorTree::~MachineDominatorTree() {
- delete DT;
}
void MachineDominatorTree::releaseMemory() {
- DT->releaseMemory();
+ CriticalEdgesToSplit.clear();
+ DT.reset(nullptr);
}
void MachineDominatorTree::verifyAnalysis() const {
- if (VerifyMachineDomInfo)
+ if (DT && VerifyMachineDomInfo)
verifyDomTree();
}
void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
- DT->print(OS);
+ if (DT)
+ DT->print(OS);
}
void MachineDominatorTree::applySplitCriticalEdges() const {
@@ -143,15 +140,18 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
}
void MachineDominatorTree::verifyDomTree() const {
+ if (!DT)
+ return;
MachineFunction &F = *getRoot()->getParent();
- MachineDominatorTree OtherDT;
- OtherDT.DT->recalculate(F);
- if (compare(OtherDT)) {
+ DomTreeBase<MachineBasicBlock> OtherDT;
+ OtherDT.recalculate(F);
+ if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() ||
+ DT->compare(OtherDT)) {
errs() << "MachineDominatorTree is not up to date!\nComputed:\n";
- print(errs(), nullptr);
+ DT->print(errs());
errs() << "\nActual:\n";
- OtherDT.print(errs(), nullptr);
+ OtherDT.print(errs());
abort();
}
}
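
The MachineDominatorTree edits above move the tree into owned-pointer storage
that is rebuilt per function and released eagerly, which is why print and
verifyAnalysis gain null checks. A hypothetical miniature of the ownership
pattern:

    #include <memory>

    struct Tree {
      void recalculate(const int &F) {}
      void print() const {}
    };

    // The tree is rebuilt for every function and dropped in releaseMemory;
    // every accessor must therefore tolerate a null tree.
    class DomTreePass {
      std::unique_ptr<Tree> DT;            // empty between runs
    public:
      void runOnFunction(const int &F) {
        DT.reset(new Tree());              // fresh tree per function
        DT->recalculate(F);
      }
      void releaseMemory() { DT.reset(); }
      void print() const {
        if (DT)                            // guarded: DT may be absent
          DT->print();
      }
    };
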
diff --git a/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp
new file mode 100644
index 0000000..73d778f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -0,0 +1,244 @@
+//===-- MachineFrameInfo.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Implements MachineFrameInfo that manages the stack frame.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+
+#define DEBUG_TYPE "codegen"
+
+using namespace llvm;
+
+void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
+ if (!StackRealignable)
+ assert(Align <= StackAlignment &&
+ "For targets without stack realignment, Align is out of limit!");
+ if (MaxAlignment < Align) MaxAlignment = Align;
+}
+
+/// Clamp the alignment if requested and emit a warning.
+static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
+ unsigned StackAlign) {
+ if (!ShouldClamp || Align <= StackAlign)
+ return Align;
+ DEBUG(dbgs() << "Warning: requested alignment " << Align
+ << " exceeds the stack alignment " << StackAlign
+ << " when stack realignment is off" << '\n');
+ return StackAlign;
+}
+
+int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
+ bool isSS, const AllocaInst *Alloca) {
+ assert(Size != 0 && "Cannot allocate zero size stack objects!");
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca,
+ !isSS));
+ int Index = (int)Objects.size() - NumFixedObjects - 1;
+ assert(Index >= 0 && "Bad frame index!");
+ ensureMaxAlignment(Alignment);
+ return Index;
+}
+
+int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
+ unsigned Alignment) {
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ CreateStackObject(Size, Alignment, true);
+ int Index = (int)Objects.size() - NumFixedObjects - 1;
+ ensureMaxAlignment(Alignment);
+ return Index;
+}
+
+int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment,
+ const AllocaInst *Alloca) {
+ HasVarSizedObjects = true;
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true));
+ ensureMaxAlignment(Alignment);
+ return (int)Objects.size()-NumFixedObjects-1;
+}
+
+int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
+ bool Immutable, bool isAliased) {
+ assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
+ // The alignment of the frame index can be determined from its offset from
+ // the incoming frame position. If the frame object is at offset 32 and
+ // the stack is guaranteed to be 16-byte aligned, then we know that the
+ // object is 16-byte aligned. Note that unlike the non-fixed case, if the
+ // stack needs realignment, we can't assume that the stack will in fact be
+ // aligned.
+ unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
+ Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
+ /*isSS*/ false,
+ /*Alloca*/ nullptr, isAliased));
+ return -++NumFixedObjects;
+}
+
+int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
+ int64_t SPOffset,
+ bool Immutable) {
+ unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
+ Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
+ /*isSS*/ true,
+ /*Alloca*/ nullptr,
+ /*isAliased*/ false));
+ return -++NumFixedObjects;
+}
+
+BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ BitVector BV(TRI->getNumRegs());
+
+ // Before CSI is calculated, no registers are considered pristine. They can be
+ // freely used and PEI will make sure they are saved.
+ if (!isCalleeSavedInfoValid())
+ return BV;
+
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR;
+ ++CSR)
+ BV.set(*CSR);
+
+ // Saved CSRs are not pristine.
+ for (auto &I : getCalleeSavedInfo())
+ for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S)
+ BV.reset(*S);
+
+ return BV;
+}
+
+unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ unsigned MaxAlign = getMaxAlignment();
+ int Offset = 0;
+
+ // This code is very, very similar to PEI::calculateFrameObjectOffsets().
+ // It really should be refactored to share code. Until then, changes
+ // should keep in mind that there's tight coupling between the two.
+
+ for (int i = getObjectIndexBegin(); i != 0; ++i) {
+ int FixedOff = -getObjectOffset(i);
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) {
+ if (isDeadObjectIndex(i))
+ continue;
+ Offset += getObjectSize(i);
+ unsigned Align = getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = (Offset+Align-1)/Align*Align;
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ if (adjustsStack() && TFI->hasReservedCallFrame(MF))
+ Offset += getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (adjustsStack() || hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0))
+ StackAlign = TFI->getStackAlignment();
+ else
+ StackAlign = TFI->getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ unsigned AlignMask = StackAlign - 1;
+ Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+
+ return (unsigned)Offset;
+}
+
+void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) {
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+ assert(FrameSetupOpcode != ~0u && FrameDestroyOpcode != ~0u &&
+ "Can only compute MaxCallFrameSize if Setup/Destroy opcode are known");
+
+ MaxCallFrameSize = 0;
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == FrameSetupOpcode || Opcode == FrameDestroyOpcode) {
+ unsigned Size = TII.getFrameSize(MI);
+ MaxCallFrameSize = std::max(MaxCallFrameSize, Size);
+ AdjustsStack = true;
+ } else if (MI.isInlineAsm()) {
+ // Some inline asms need a stack frame, as indicated by operand 1.
+ unsigned ExtraInfo = MI.getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ AdjustsStack = true;
+ }
+ }
+ }
+}
+
+void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
+ if (Objects.empty()) return;
+
+ const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
+ int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
+
+ OS << "Frame Objects:\n";
+
+ for (unsigned i = 0, e = Objects.size(); i != e; ++i) {
+ const StackObject &SO = Objects[i];
+ OS << " fi#" << (int)(i-NumFixedObjects) << ": ";
+ if (SO.Size == ~0ULL) {
+ OS << "dead\n";
+ continue;
+ }
+ if (SO.Size == 0)
+ OS << "variable sized";
+ else
+ OS << "size=" << SO.Size;
+ OS << ", align=" << SO.Alignment;
+
+ if (i < NumFixedObjects)
+ OS << ", fixed";
+ if (i < NumFixedObjects || SO.SPOffset != -1) {
+ int64_t Off = SO.SPOffset - ValOffset;
+ OS << ", at location [SP";
+ if (Off > 0)
+ OS << "+" << Off;
+ else if (Off < 0)
+ OS << Off;
+ OS << "]";
+ }
+ OS << "\n";
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineFrameInfo::dump(const MachineFunction &MF) const {
+ print(MF, dbgs());
+}
+#endif
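
Two bits of arithmetic recur in the new file: rounding an offset up to an
alignment boundary (estimateStackSize) and deriving a fixed object's
guaranteed alignment from its SP offset (CreateFixedObject, via MinAlign). A
small self-checking sketch of both, assuming power-of-two alignments:

    #include <cassert>
    #include <cstdint>

    // Round Offset up to the next multiple of Align (a power of two).
    uint64_t alignTo(uint64_t Offset, uint64_t Align) {
      uint64_t Mask = Align - 1;
      return (Offset + Mask) & ~Mask;
    }

    // The largest power of two dividing both A and B: the guaranteed
    // alignment of an object at offset A on a B-aligned stack.
    uint64_t minAlign(uint64_t A, uint64_t B) {
      uint64_t T = A | B;
      return T & (~T + 1);               // lowest set bit of A|B
    }

    int main() {
      assert(alignTo(37, 16) == 48);     // 37 rounded up to 16-byte boundary
      assert(minAlign(32, 16) == 16);    // offset 32 on 16-byte stack
      assert(minAlign(20, 16) == 4);     // offset 20 only guarantees 4 bytes
    }
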
diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
index c1d5ea9..742b095 100644
--- a/contrib/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
@@ -20,7 +20,6 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionInitializer.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -52,8 +51,6 @@ static cl::opt<unsigned>
cl::desc("Force the alignment of all functions."),
cl::init(0), cl::Hidden);
-void MachineFunctionInitializer::anchor() {}
-
static const char *getPropertyName(MachineFunctionProperties::Property Prop) {
typedef MachineFunctionProperties::Property P;
switch(Prop) {
@@ -169,6 +166,7 @@ void MachineFunction::clear() {
InstructionRecycler.clear(Allocator);
OperandRecycler.clear(Allocator);
BasicBlockRecycler.clear(Allocator);
+ VariableDbgInfos.clear();
if (RegInfo) {
RegInfo->~MachineRegisterInfo();
Allocator.Deallocate(RegInfo);
@@ -307,11 +305,11 @@ MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
MachineMemOperand *MachineFunction::getMachineMemOperand(
MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
unsigned base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
- SynchronizationScope SynchScope, AtomicOrdering Ordering,
+ SyncScope::ID SSID, AtomicOrdering Ordering,
AtomicOrdering FailureOrdering) {
return new (Allocator)
MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges,
- SynchScope, Ordering, FailureOrdering);
+ SSID, Ordering, FailureOrdering);
}
MachineMemOperand *
@@ -322,13 +320,27 @@ MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
MachineMemOperand(MachinePointerInfo(MMO->getValue(),
MMO->getOffset()+Offset),
MMO->getFlags(), Size, MMO->getBaseAlignment(),
- AAMDNodes(), nullptr, MMO->getSynchScope(),
+ AAMDNodes(), nullptr, MMO->getSyncScopeID(),
MMO->getOrdering(), MMO->getFailureOrdering());
return new (Allocator)
MachineMemOperand(MachinePointerInfo(MMO->getPseudoValue(),
MMO->getOffset()+Offset),
MMO->getFlags(), Size, MMO->getBaseAlignment(),
- AAMDNodes(), nullptr, MMO->getSynchScope(),
+ AAMDNodes(), nullptr, MMO->getSyncScopeID(),
+ MMO->getOrdering(), MMO->getFailureOrdering());
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+ const AAMDNodes &AAInfo) {
+ MachinePointerInfo MPI = MMO->getValue() ?
+ MachinePointerInfo(MMO->getValue(), MMO->getOffset()) :
+ MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset());
+
+ return new (Allocator)
+ MachineMemOperand(MPI, MMO->getFlags(), MMO->getSize(),
+ MMO->getBaseAlignment(), AAInfo,
+ MMO->getRanges(), MMO->getSyncScopeID(),
MMO->getOrdering(), MMO->getFailureOrdering());
}
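
The new overload above clones an existing memory operand while substituting
only the AA metadata. A hedged usage fragment: NewAAInfo is assumed to come
from whatever analysis the caller reran, and attaching the clone back to the
instruction is elided, since that goes through the MachineFunction's
operand-array allocation machinery:

    // Hypothetical: clone MI's first memory operand with fresh alias info.
    MachineFunction &MF = *MI.getParent()->getParent();
    MachineMemOperand *Old = *MI.memoperands_begin();
    MachineMemOperand *New = MF.getMachineMemOperand(Old, NewAAInfo);
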
@@ -361,7 +373,7 @@ MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin,
(*I)->getFlags() & ~MachineMemOperand::MOStore,
(*I)->getSize(), (*I)->getBaseAlignment(),
(*I)->getAAInfo(), nullptr,
- (*I)->getSynchScope(), (*I)->getOrdering(),
+ (*I)->getSyncScopeID(), (*I)->getOrdering(),
(*I)->getFailureOrdering());
Result[Index] = JustLoad;
}
@@ -395,7 +407,7 @@ MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
(*I)->getFlags() & ~MachineMemOperand::MOLoad,
(*I)->getSize(), (*I)->getBaseAlignment(),
(*I)->getAAInfo(), nullptr,
- (*I)->getSynchScope(), (*I)->getOrdering(),
+ (*I)->getSyncScopeID(), (*I)->getOrdering(),
(*I)->getFailureOrdering());
Result[Index] = JustStore;
}
@@ -756,212 +768,6 @@ void llvm::addLandingPadInfo(const LandingPadInst &I, MachineBasicBlock &MBB) {
/// \}
//===----------------------------------------------------------------------===//
-// MachineFrameInfo implementation
-//===----------------------------------------------------------------------===//
-
-/// Make sure the function is at least Align bytes aligned.
-void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
- if (!StackRealignable)
- assert(Align <= StackAlignment &&
- "For targets without stack realignment, Align is out of limit!");
- if (MaxAlignment < Align) MaxAlignment = Align;
-}
-
-/// Clamp the alignment if requested and emit a warning.
-static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
- unsigned StackAlign) {
- if (!ShouldClamp || Align <= StackAlign)
- return Align;
- DEBUG(dbgs() << "Warning: requested alignment " << Align
- << " exceeds the stack alignment " << StackAlign
- << " when stack realignment is off" << '\n');
- return StackAlign;
-}
-
-/// Create a new statically sized stack object, returning a nonnegative
-/// identifier to represent it.
-int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
- bool isSS, const AllocaInst *Alloca) {
- assert(Size != 0 && "Cannot allocate zero size stack objects!");
- Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
- Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca,
- !isSS));
- int Index = (int)Objects.size() - NumFixedObjects - 1;
- assert(Index >= 0 && "Bad frame index!");
- ensureMaxAlignment(Alignment);
- return Index;
-}
-
-/// Create a new statically sized stack object that represents a spill slot,
-/// returning a nonnegative identifier to represent it.
-int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
- unsigned Alignment) {
- Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
- CreateStackObject(Size, Alignment, true);
- int Index = (int)Objects.size() - NumFixedObjects - 1;
- ensureMaxAlignment(Alignment);
- return Index;
-}
-
-/// Notify the MachineFrameInfo object that a variable sized object has been
-/// created. This must be created whenever a variable sized object is created,
-/// whether or not the index returned is actually used.
-int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment,
- const AllocaInst *Alloca) {
- HasVarSizedObjects = true;
- Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
- Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true));
- ensureMaxAlignment(Alignment);
- return (int)Objects.size()-NumFixedObjects-1;
-}
-
-/// Create a new object at a fixed location on the stack.
-/// All fixed objects should be created before other objects are created for
-/// efficiency. By default, fixed objects are immutable. This returns an
-/// index with a negative value.
-int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
- bool Immutable, bool isAliased) {
- assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
- // The alignment of the frame index can be determined from its offset from
- // the incoming frame position. If the frame object is at offset 32 and
- // the stack is guaranteed to be 16-byte aligned, then we know that the
- // object is 16-byte aligned. Note that unlike the non-fixed case, if the
- // stack needs realignment, we can't assume that the stack will in fact be
- // aligned.
- unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
- Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
- Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
- /*isSS*/ false,
- /*Alloca*/ nullptr, isAliased));
- return -++NumFixedObjects;
-}
-
-/// Create a spill slot at a fixed location on the stack.
-/// Returns an index with a negative value.
-int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
- int64_t SPOffset,
- bool Immutable) {
- unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
- Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
- Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
- /*isSS*/ true,
- /*Alloca*/ nullptr,
- /*isAliased*/ false));
- return -++NumFixedObjects;
-}
-
-BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- BitVector BV(TRI->getNumRegs());
-
- // Before CSI is calculated, no registers are considered pristine. They can be
- // freely used and PEI will make sure they are saved.
- if (!isCalleeSavedInfoValid())
- return BV;
-
- for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
- BV.set(*CSR);
-
- // Saved CSRs are not pristine.
- for (auto &I : getCalleeSavedInfo())
- for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S)
- BV.reset(*S);
-
- return BV;
-}
-
-unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
- const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
- unsigned MaxAlign = getMaxAlignment();
- int Offset = 0;
-
- // This code is very, very similar to PEI::calculateFrameObjectOffsets().
- // It really should be refactored to share code. Until then, changes
- // should keep in mind that there's tight coupling between the two.
-
- for (int i = getObjectIndexBegin(); i != 0; ++i) {
- int FixedOff = -getObjectOffset(i);
- if (FixedOff > Offset) Offset = FixedOff;
- }
- for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) {
- if (isDeadObjectIndex(i))
- continue;
- Offset += getObjectSize(i);
- unsigned Align = getObjectAlignment(i);
- // Adjust to alignment boundary
- Offset = (Offset+Align-1)/Align*Align;
-
- MaxAlign = std::max(Align, MaxAlign);
- }
-
- if (adjustsStack() && TFI->hasReservedCallFrame(MF))
- Offset += getMaxCallFrameSize();
-
- // Round up the size to a multiple of the alignment. If the function has
- // any calls or alloca's, align to the target's StackAlignment value to
- // ensure that the callee's frame or the alloca data is suitably aligned;
- // otherwise, for leaf functions, align to the TransientStackAlignment
- // value.
- unsigned StackAlign;
- if (adjustsStack() || hasVarSizedObjects() ||
- (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0))
- StackAlign = TFI->getStackAlignment();
- else
- StackAlign = TFI->getTransientStackAlignment();
-
- // If the frame pointer is eliminated, all frame offsets will be relative to
- // SP not FP. Align to MaxAlign so this works.
- StackAlign = std::max(StackAlign, MaxAlign);
- unsigned AlignMask = StackAlign - 1;
- Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
-
- return (unsigned)Offset;
-}
-
-void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
- if (Objects.empty()) return;
-
- const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
- int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
-
- OS << "Frame Objects:\n";
-
- for (unsigned i = 0, e = Objects.size(); i != e; ++i) {
- const StackObject &SO = Objects[i];
- OS << " fi#" << (int)(i-NumFixedObjects) << ": ";
- if (SO.Size == ~0ULL) {
- OS << "dead\n";
- continue;
- }
- if (SO.Size == 0)
- OS << "variable sized";
- else
- OS << "size=" << SO.Size;
- OS << ", align=" << SO.Alignment;
-
- if (i < NumFixedObjects)
- OS << ", fixed";
- if (i < NumFixedObjects || SO.SPOffset != -1) {
- int64_t Off = SO.SPOffset - ValOffset;
- OS << ", at location [SP";
- if (Off > 0)
- OS << "+" << Off;
- else if (Off < 0)
- OS << Off;
- OS << "]";
- }
- OS << "\n";
- }
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void MachineFrameInfo::dump(const MachineFunction &MF) const {
- print(MF, dbgs());
-}
-#endif
-
-//===----------------------------------------------------------------------===//
// MachineJumpTableInfo implementation
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
index 2265676..5ffe330 100644
--- a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -42,7 +42,7 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
return false;
MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
- MachineFunction &MF = MMI.getMachineFunction(F);
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
MachineFunctionProperties &MFProps = MF.getProperties();
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0d533c3..55d9def 100644
--- a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
index 2f2e3b3..535757e 100644
--- a/contrib/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1,4 +1,4 @@
-//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===//
+//===- lib/CodeGen/MachineInstr.cpp ---------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,20 +12,34 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@@ -34,10 +48,14 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -45,6 +63,14 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
static cl::opt<bool> PrintWholeRegMask(
@@ -256,14 +282,27 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
case MachineOperand::MO_GlobalAddress:
return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset();
case MachineOperand::MO_ExternalSymbol:
- return !strcmp(getSymbolName(), Other.getSymbolName()) &&
+ return strcmp(getSymbolName(), Other.getSymbolName()) == 0 &&
getOffset() == Other.getOffset();
case MachineOperand::MO_BlockAddress:
return getBlockAddress() == Other.getBlockAddress() &&
getOffset() == Other.getOffset();
case MachineOperand::MO_RegisterMask:
- case MachineOperand::MO_RegisterLiveOut:
- return getRegMask() == Other.getRegMask();
+ case MachineOperand::MO_RegisterLiveOut: {
+ // Shallow compare of the two RegMasks
+ const uint32_t *RegMask = getRegMask();
+ const uint32_t *OtherRegMask = Other.getRegMask();
+ if (RegMask == OtherRegMask)
+ return true;
+
+ // Calculate the size of the RegMask
+ const MachineFunction *MF = getParent()->getParent()->getParent();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+
+ // Deep compare of the two RegMasks
+ return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask);
+ }
case MachineOperand::MO_MCSymbol:
return getMCSymbol() == Other.getMCSymbol();
case MachineOperand::MO_CFIIndex:
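The new deep comparison treats a register mask as a bit vector packed into 32-bit words, one bit per physical register, so a target with NumRegs registers needs ceil(NumRegs / 32) words. A standalone restatement of that arithmetic, not LLVM API, with names local to the sketch:

    #include <algorithm>
    #include <cstdint>

    // Compare two register masks the way isIdenticalTo now does.
    bool regMasksEqual(const uint32_t *A, const uint32_t *B, unsigned NumRegs) {
      if (A == B)                            // shallow compare: same storage
        return true;
      unsigned Words = (NumRegs + 31) / 32;  // e.g. 45 registers -> 2 words
      return std::equal(A, A + Words, B);    // deep compare, word by word
    }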
@@ -403,6 +442,19 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
bool Unused;
APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused);
OS << "half " << APF.convertToFloat();
+ } else if (getFPImm()->getType()->isFP128Ty()) {
+ APFloat APF = getFPImm()->getValueAPF();
+ SmallString<16> Str;
+ getFPImm()->getValueAPF().toString(Str);
+ OS << "quad " << Str;
+ } else if (getFPImm()->getType()->isX86_FP80Ty()) {
+ APFloat APF = getFPImm()->getValueAPF();
+ OS << "x86_fp80 0xK";
+ APInt API = APF.bitcastToAPInt();
+ OS << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
+ /*Upper=*/true);
+ OS << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
+ /*Upper=*/true);
} else {
OS << getFPImm()->getValueAPF().convertToDouble();
}
@@ -491,6 +543,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
auto Pred = static_cast<CmpInst::Predicate>(getPredicate());
OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred")
<< CmpInst::getPredicateName(Pred) << '>';
+ break;
}
}
if (unsigned TF = getTargetFlags())
@@ -514,6 +567,21 @@ unsigned MachinePointerInfo::getAddrSpace() const {
return cast<PointerType>(V.get<const Value*>()->getType())->getAddressSpace();
}
+/// isDereferenceable - Return true if V is always dereferenceable for
+/// Offset + Size bytes.
+bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
+ const DataLayout &DL) const {
+ if (!V.is<const Value*>())
+ return false;
+
+ const Value *BasePtr = V.get<const Value*>();
+ if (BasePtr == nullptr)
+ return false;
+
+ return isDereferenceableAndAlignedPointer(
+ BasePtr, 1, APInt(DL.getPointerSizeInBits(), Offset + Size), DL);
+}
+
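A hedged usage sketch for the new query, not taken from the patch; MI, Ctx, and DL are assumed to be in scope. The query answers false for pseudo source values, which carry no IR base pointer, so only genuine IR-backed accesses can be proven safe this way.

    const MachineMemOperand *MMO = *MI.memoperands_begin();
    if (MMO->getPointerInfo().isDereferenceable(MMO->getSize(), Ctx, DL)) {
      // The whole range [Offset, Offset + Size) is known dereferenceable,
      // so speculating this load cannot introduce a fault.
    }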
/// getConstantPool - Return a MachinePointerInfo record that refers to the
/// constant pool.
MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) {
@@ -544,7 +612,7 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
uint64_t s, unsigned int a,
const AAMDNodes &AAInfo,
const MDNode *Ranges,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
AtomicOrdering Ordering,
AtomicOrdering FailureOrdering)
: PtrInfo(ptrinfo), Size(s), FlagVals(f), BaseAlignLog2(Log2_32(a) + 1),
@@ -555,8 +623,8 @@ MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
assert(getBaseAlignment() == a && "Alignment is not a power of 2!");
assert((isLoad() || isStore()) && "Not a load/store!");
- AtomicInfo.SynchScope = static_cast<unsigned>(SynchScope);
- assert(getSynchScope() == SynchScope && "Value truncated");
+ AtomicInfo.SSID = static_cast<unsigned>(SSID);
+ assert(getSyncScopeID() == SSID && "Value truncated");
AtomicInfo.Ordering = static_cast<unsigned>(Ordering);
assert(getOrdering() == Ordering && "Value truncated");
AtomicInfo.FailureOrdering = static_cast<unsigned>(FailureOrdering);
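This constructor change tracks the IR-level migration from the two-value SynchronizationScope enum to an opaque, target-extensible SyncScope::ID. A hedged call-site sketch, assuming an IR LoadInst LI plus Size and Align values computed elsewhere:

    SyncScope::ID SSID = LI.getSyncScopeID();   // e.g. SyncScope::System
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        MachinePointerInfo(LI.getPointerOperand()), MachineMemOperand::MOLoad,
        Size, Align, AAMDNodes(), /*Ranges=*/nullptr, SSID, LI.getOrdering());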
@@ -682,6 +750,12 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const {
OS << "(dereferenceable)";
if (isInvariant())
OS << "(invariant)";
+ if (getFlags() & MOTargetFlag1)
+ OS << "(flag1)";
+ if (getFlags() & MOTargetFlag2)
+ OS << "(flag2)";
+ if (getFlags() & MOTargetFlag3)
+ OS << "(flag3)";
}
//===----------------------------------------------------------------------===//
@@ -704,9 +778,7 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
/// the MCInstrDesc.
MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
DebugLoc dl, bool NoImp)
- : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0),
- AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr),
- debugLoc(std::move(dl)) {
+ : MCID(&tid), debugLoc(std::move(dl)) {
assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
// Reserve space for the expected number of operands.
@@ -723,9 +795,8 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
/// MachineInstr ctor - Copies MachineInstr arg exactly
///
MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
- : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0),
- Flags(0), AsmPrinterFlags(0), NumMemRefs(MI.NumMemRefs),
- MemRefs(MI.MemRefs), debugLoc(MI.getDebugLoc()) {
+ : MCID(&MI.getDesc()), NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs),
+ debugLoc(MI.getDebugLoc()) {
assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
CapOperands = OperandCapacity::get(MI.getNumOperands());
@@ -1571,6 +1642,65 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
return true;
}
+bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other,
+ bool UseTBAA) {
+ const MachineFunction *MF = getParent()->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ // If neither instruction stores to memory, they can't alias in any
+ // meaningful way, even if they read from the same address.
+ if (!mayStore() && !Other.mayStore())
+ return false;
+
+ // Let the target decide if memory accesses cannot possibly overlap.
+ if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA))
+ return false;
+
+ if (!AA)
+ return true;
+
+ // FIXME: Need to handle multiple memory operands to support all targets.
+ if (!hasOneMemOperand() || !Other.hasOneMemOperand())
+ return true;
+
+ MachineMemOperand *MMOa = *memoperands_begin();
+ MachineMemOperand *MMOb = *Other.memoperands_begin();
+
+ if (!MMOa->getValue() || !MMOb->getValue())
+ return true;
+
+ // The following interface to AA is fashioned after DAGCombiner::isAlias
+ // and operates with MachineMemOperand offset with some important
+ // assumptions:
+ // - LLVM fundamentally assumes flat address spaces.
+ // - MachineOperand offset can *only* result from legalization and
+ // cannot affect queries other than the trivial case of overlap
+ // checking.
+ // - These offsets never wrap and never step outside
+ // of allocated objects.
+ // - There should never be any negative offsets here.
+ //
+ // FIXME: Modify API to hide this math from "user"
+ // FIXME: Even before we go to AA we can reason locally about some
+ // memory objects. It can save compile time, and possibly catch some
+ // corner cases not currently covered.
+
+ assert((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
+ assert((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
+
+ int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
+ int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
+ int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
+
+ AliasResult AAResult =
+ AA->alias(MemoryLocation(MMOa->getValue(), Overlapa,
+ UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
+ MemoryLocation(MMOb->getValue(), Overlapb,
+ UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
+
+ return (AAResult != NoAlias);
+}
+
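A standalone worked example of the overlap arithmetic in mayAlias, using plain C++ rather than LLVM types: both accesses are rebased to the smaller offset, so each query size spans from that common base to the end of its access, keeping the alias query conservative for the original pair.

    #include <algorithm>
    #include <cstdint>

    int main() {
      int64_t OffA = 8,  SizeA = 4;                 // access A: bytes [8, 12)
      int64_t OffB = 12, SizeB = 8;                 // access B: bytes [12, 20)
      int64_t MinOffset = std::min(OffA, OffB);     // 8
      int64_t Overlapa = SizeA + OffA - MinOffset;  // 4:  A's query spans [8, 12)
      int64_t Overlapb = SizeB + OffB - MinOffset;  // 12: B's query spans [8, 20)
      // alias(MemoryLocation(V, Overlapa), MemoryLocation(V, Overlapb)) can
      // only return NoAlias when the rebased ranges provably cannot overlap.
      return 0;
    }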
/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
/// or volatile memory reference, or if the information describing the memory
/// reference is not available. Return false if it is known to have no ordered
@@ -1589,7 +1719,7 @@ bool MachineInstr::hasOrderedMemoryRef() const {
return true;
// Check if any of our memory operands are ordered.
- return any_of(memoperands(), [](const MachineMemOperand *MMO) {
+ return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) {
return !MMO->isUnordered();
});
}
@@ -1692,14 +1822,14 @@ void MachineInstr::copyImplicitOps(MachineFunction &MF,
}
}
-LLVM_DUMP_METHOD void MachineInstr::dump(const TargetInstrInfo *TII) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineInstr::dump() const {
dbgs() << " ";
- print(dbgs(), false /* SkipOpers */, TII);
-#endif
+ print(dbgs());
}
+#endif
-void MachineInstr::print(raw_ostream &OS, bool SkipOpers,
+void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc,
const TargetInstrInfo *TII) const {
const Module *M = nullptr;
if (const MachineBasicBlock *MBB = getParent())
@@ -1707,11 +1837,12 @@ void MachineInstr::print(raw_ostream &OS, bool SkipOpers,
M = MF->getFunction()->getParent();
ModuleSlotTracker MST(M);
- print(OS, MST, SkipOpers, TII);
+ print(OS, MST, SkipOpers, SkipDebugLoc, TII);
}
void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
- bool SkipOpers, const TargetInstrInfo *TII) const {
+ bool SkipOpers, bool SkipDebugLoc,
+ const TargetInstrInfo *TII) const {
// We can be a bit tidier if we know the MachineFunction.
const MachineFunction *MF = nullptr;
const TargetRegisterInfo *TRI = nullptr;
@@ -1762,7 +1893,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
return;
// Print the rest of the operands.
- bool OmittedAnyCallClobbers = false;
bool FirstOp = true;
unsigned AsmDescOp = ~0u;
unsigned AsmOpCount = 0;
@@ -1799,31 +1929,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
VirtRegs.push_back(MO.getReg());
- // Omit call-clobbered registers which aren't used anywhere. This makes
- // call instructions much less noisy on targets where calls clobber lots
- // of registers. Don't rely on MO.isDead() because we may be called before
- // LiveVariables is run, or we may be looking at a non-allocatable reg.
- if (MRI && isCall() &&
- MO.isReg() && MO.isImplicit() && MO.isDef()) {
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
- if (MRI->use_empty(Reg)) {
- bool HasAliasLive = false;
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
- unsigned AliasReg = *AI;
- if (!MRI->use_empty(AliasReg)) {
- HasAliasLive = true;
- break;
- }
- }
- if (!HasAliasLive) {
- OmittedAnyCallClobbers = true;
- continue;
- }
- }
- }
- }
-
if (FirstOp) FirstOp = false; else OS << ",";
OS << " ";
if (i < getDesc().NumOperands) {
@@ -1905,12 +2010,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
MO.print(OS, MST, TRI);
}
- // Briefly indicate whether any call clobbers were omitted.
- if (OmittedAnyCallClobbers) {
- if (!FirstOp) OS << ",";
- OS << " ...";
- }
-
bool HaveSemi = false;
const unsigned PrintableFlags = FrameSetup | FrameDestroy;
if (Flags & PrintableFlags) {
@@ -1987,6 +2086,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
if (isIndirectDebugValue())
OS << " indirect";
+ } else if (SkipDebugLoc) {
+ return;
} else if (debugLoc && MF) {
if (!HaveSemi)
OS << ";";
@@ -2174,8 +2275,8 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
unsigned Reg = MO.getReg();
if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
// If there are no uses, including partial uses, the def is dead.
- if (none_of(UsedRegs,
- [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); }))
+ if (llvm::none_of(UsedRegs,
+ [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); }))
MO.setIsDead();
}
@@ -2263,3 +2364,26 @@ MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
BB.insert(I, MI);
return MachineInstrBuilder(MF, MI);
}
+
+MachineInstr *llvm::buildDbgValueForSpill(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ const MachineInstr &Orig,
+ int FrameIndex) {
+ const MDNode *Var = Orig.getDebugVariable();
+ const auto *Expr = cast_or_null<DIExpression>(Orig.getDebugExpression());
+ bool IsIndirect = Orig.isIndirectDebugValue();
+ uint64_t Offset = IsIndirect ? Orig.getOperand(1).getImm() : 0;
+ DebugLoc DL = Orig.getDebugLoc();
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ // If the DBG_VALUE already was a memory location, add an extra
+ // DW_OP_deref. Otherwise just turning this from a register into a
+ // memory/indirect location is sufficient.
+ if (IsIndirect)
+ Expr = DIExpression::prepend(Expr, DIExpression::WithDeref);
+ return BuildMI(BB, I, DL, Orig.getDesc())
+ .addFrameIndex(FrameIndex)
+ .addImm(Offset)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+}
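A hedged usage sketch for the new helper: when a register carrying DBG_VALUEs is spilled, a caller such as the register allocator can redirect each DBG_VALUE to the stack slot; the helper prepends DW_OP_deref only when the location was already indirect. DbgValuesForReg and FI (the spill slot's frame index) are assumptions local to this sketch.

    for (MachineInstr *DV : DbgValuesForReg) {  // hypothetical worklist
      MachineBasicBlock &MBB = *DV->getParent();
      buildDbgValueForSpill(MBB, DV->getIterator(), *DV, FI);
      DV->eraseFromParent();  // the register location is no longer valid
    }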
diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
index b3d1843..c7113f1 100644
--- a/contrib/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
@@ -16,7 +16,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
@@ -26,6 +25,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
@@ -38,7 +38,7 @@
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "machine-licm"
+#define DEBUG_TYPE "machinelicm"
static cl::opt<bool>
AvoidSpeculation("avoid-speculation",
@@ -237,13 +237,13 @@ namespace {
char MachineLICM::ID = 0;
char &llvm::MachineLICMID = MachineLICM::ID;
-INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm",
- "Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE,
+ "Machine Loop Invariant Code Motion", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MachineLICM, "machinelicm",
- "Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE,
+ "Machine Loop Invariant Code Motion", false, false)
/// Test if the given loop is the outer-most loop that has a unique predecessor.
static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
@@ -330,7 +330,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
/// Return true if instruction stores to the specified frame.
static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
// If we lost memory operands, conservatively assume that the instruction
- // writes to all slots.
+ // writes to all slots.
if (MI->memoperands_empty())
return true;
for (const MachineMemOperand *MemOp : MI->memoperands()) {
@@ -708,7 +708,7 @@ void MachineLICM::SinkIntoLoop() {
for (MachineBasicBlock::instr_iterator I = Preheader->instr_begin();
I != Preheader->instr_end(); ++I) {
// We need to ensure that we can safely move this instruction into the loop.
- // As such, it must not have side-effects, e.g. such as a call has.
+ // As such, it must not have side-effects, e.g. such as a call has.
if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I))
Candidates.push_back(&*I);
}
@@ -837,9 +837,9 @@ MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen,
/// constant pool.
static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) {
assert (MI.mayLoad() && "Expected MI that loads!");
-
+
// If we lost memory operands, conservatively assume that the instruction
- // reads from everything..
+ // reads from everything..
if (MI.memoperands_empty())
return true;
@@ -895,8 +895,11 @@ bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
// If the physreg has no defs anywhere, it's just an ambient register
// and we can freely move its uses. Alternatively, if it's allocatable,
// it could get allocated to something with a def during allocation.
- if (!MRI->isConstantPhysReg(Reg))
- return false;
+ // However, if the physreg is known to always be caller saved/restored
+ // then this use is safe to hoist.
+ if (!MRI->isConstantPhysReg(Reg) &&
+ !(TRI->isCallerPreservedPhysReg(Reg, *I.getParent()->getParent())))
+ return false;
// Otherwise it's safe to move.
continue;
} else if (!MO.isDead()) {
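Restated as a standalone predicate, a sketch rather than new API, with names as in the hunk: a physical-register use now blocks hoisting only when the register is neither constant over the whole function nor guaranteed by the target to be preserved across calls.

    bool UseBlocksHoist =
        !MRI->isConstantPhysReg(Reg) &&
        !TRI->isCallerPreservedPhysReg(Reg, *I.getParent()->getParent());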
@@ -1337,7 +1340,7 @@ bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI);
// Since we are moving the instruction out of its basic block, we do not
- // retain its debug location. Doing so would degrade the debugging
+ // retain its debug location. Doing so would degrade the debugging
// experience and adversely affect the accuracy of profiling information.
MI->setDebugLoc(DebugLoc());
diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
index fdeaf7b..a9aa1d9 100644
--- a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -87,6 +87,22 @@ MachineBasicBlock *MachineLoop::findLoopControlBlock() {
return nullptr;
}
+DebugLoc MachineLoop::getStartLoc() const {
+ // Try the pre-header first.
+ if (MachineBasicBlock *PHeadMBB = getLoopPreheader())
+ if (const BasicBlock *PHeadBB = PHeadMBB->getBasicBlock())
+ if (DebugLoc DL = PHeadBB->getTerminator()->getDebugLoc())
+ return DL;
+
+ // If we have no pre-header or there are no instructions with debug
+ // info in it, try the header.
+ if (MachineBasicBlock *HeadMBB = getHeader())
+ if (const BasicBlock *HeadBB = HeadMBB->getBasicBlock())
+ return HeadBB->getTerminator()->getDebugLoc();
+
+ return DebugLoc();
+}
+
MachineBasicBlock *
MachineLoopInfo::findLoopPreheader(MachineLoop *L,
bool SpeculativePreheader) const {
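A hedged usage sketch, not from the patch: the new MachineLoop::getStartLoc() gives machine passes a best-effort source location for loop-level diagnostics, preferring the preheader's terminator and falling back to the header's. ORE and L are assumed to exist in the surrounding pass.

    MachineOptimizationRemarkMissed R(DEBUG_TYPE, "NotHoisted",
                                      L->getStartLoc(), L->getHeader());
    R << "instruction could not be hoisted out of this loop";
    ORE->emit(R);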
diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 6618857..825290a 100644
--- a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -8,43 +8,51 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionInitializer.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
-#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <utility>
+#include <vector>
+
using namespace llvm;
using namespace llvm::dwarf;
// Handle the Pass registration stuff necessary to use DataLayout's.
-INITIALIZE_TM_PASS(MachineModuleInfo, "machinemoduleinfo",
- "Machine Module Information", false, false)
+INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
+ "Machine Module Information", false, false)
char MachineModuleInfo::ID = 0;
// Out of line virtual method.
-MachineModuleInfoImpl::~MachineModuleInfoImpl() {}
+MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
namespace llvm {
+
class MMIAddrLabelMapCallbackPtr final : CallbackVH {
- MMIAddrLabelMap *Map;
+ MMIAddrLabelMap *Map = nullptr;
+
public:
- MMIAddrLabelMapCallbackPtr() : Map(nullptr) {}
- MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(nullptr) {}
+ MMIAddrLabelMapCallbackPtr() = default;
+ MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V) {}
void setPtr(BasicBlock *BB) {
ValueHandleBase::operator=(BB);
@@ -75,11 +83,12 @@ class MMIAddrLabelMap {
/// This is a per-function list of symbols whose corresponding BasicBlock got
/// deleted. These symbols need to be emitted at some point in the file, so
/// AsmPrinter emits them after the function body.
- DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >
+ DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>
DeletedAddrLabelsNeedingEmission;
-public:
+public:
MMIAddrLabelMap(MCContext &context) : Context(context) {}
+
~MMIAddrLabelMap() {
assert(DeletedAddrLabelsNeedingEmission.empty() &&
"Some labels for deleted blocks never got emitted");
@@ -93,7 +102,8 @@ public:
void UpdateForDeletedBlock(BasicBlock *BB);
void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
};
-}
+
+} // end namespace llvm
ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
assert(BB->hasAddressTaken() &&
@@ -119,7 +129,7 @@ ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
/// If we have any deleted symbols for F, return them.
void MMIAddrLabelMap::
takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) {
- DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >::iterator I =
+ DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>::iterator I =
DeletedAddrLabelsNeedingEmission.find(F);
// If there are no entries for the function, just return.
@@ -130,7 +140,6 @@ takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) {
DeletedAddrLabelsNeedingEmission.erase(I);
}
-
void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
// If the block got deleted, there is no need for the symbol. If the symbol
// was already emitted, we can just forget about it, otherwise we need to
@@ -177,7 +186,6 @@ void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) {
OldEntry.Symbols.end());
}
-
void MMIAddrLabelMapCallbackPtr::deleted() {
Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr()));
}
@@ -186,9 +194,6 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
}
-
-//===----------------------------------------------------------------------===//
-
MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM)
: ImmutablePass(ID), TM(*TM),
Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
@@ -196,11 +201,9 @@ MachineModuleInfo::MachineModuleInfo(const TargetMachine *TM)
initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
}
-MachineModuleInfo::~MachineModuleInfo() {
-}
+MachineModuleInfo::~MachineModuleInfo() = default;
bool MachineModuleInfo::doInitialization(Module &M) {
-
ObjFileMMI = nullptr;
CurCallSite = 0;
DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false;
@@ -211,7 +214,6 @@ bool MachineModuleInfo::doInitialization(Module &M) {
}
bool MachineModuleInfo::doFinalization(Module &M) {
-
Personalities.clear();
delete AddrLabelSymbols;
@@ -256,7 +258,14 @@ void MachineModuleInfo::addPersonality(const Function *Personality) {
/// \}
-MachineFunction &MachineModuleInfo::getMachineFunction(const Function &F) {
+MachineFunction *
+MachineModuleInfo::getMachineFunction(const Function &F) const {
+ auto I = MachineFunctions.find(&F);
+ return I != MachineFunctions.end() ? I->second.get() : nullptr;
+}
+
+MachineFunction &
+MachineModuleInfo::getOrCreateMachineFunction(const Function &F) {
// Shortcut for the common case where a sequence of MachineFunctionPasses
// all query for the same Function.
if (LastRequest == &F)
@@ -270,10 +279,6 @@ MachineFunction &MachineModuleInfo::getMachineFunction(const Function &F) {
MF = new MachineFunction(&F, TM, NextFnNum++, *this);
// Update the set entry.
I.first->second.reset(MF);
-
- if (MFInitializer)
- if (MFInitializer->initializeMachineFunction(*MF))
- report_fatal_error("Unable to initialize machine function");
} else {
MF = I.first->second.get();
}
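A hedged sketch contrasting the two entry points after this split: the lookup-only form answers "has codegen produced this function yet?" without allocating, while the pass-facing form materializes the MachineFunction on demand.

    if (MachineFunction *MF = MMI.getMachineFunction(F)) {
      // Already lowered; inspect the existing MachineFunction.
      (void)MF->getFunctionNumber();
    } else {
      // Not lowered yet; create it only when we actually intend to.
      MachineFunction &NewMF = MMI.getOrCreateMachineFunction(F);
      (void)NewMF;
    }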
@@ -290,10 +295,12 @@ void MachineModuleInfo::deleteMachineFunctionFor(Function &F) {
}
namespace {
+
/// This pass frees the MachineFunction object associated with a Function.
class FreeMachineFunction : public FunctionPass {
public:
static char ID;
+
FreeMachineFunction() : FunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -306,15 +313,19 @@ public:
MMI.deleteMachineFunctionFor(F);
return true;
}
+
+ StringRef getPassName() const override {
+ return "Free MachineFunction";
+ }
};
-char FreeMachineFunction::ID;
+
} // end anonymous namespace
-namespace llvm {
-FunctionPass *createFreeMachineFunctionPass() {
+char FreeMachineFunction::ID;
+
+FunctionPass *llvm::createFreeMachineFunctionPass() {
return new FreeMachineFunction();
}
-} // end namespace llvm
//===- MMI building helpers -----------------------------------------------===//
diff --git a/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
new file mode 100644
index 0000000..73c3428
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -0,0 +1,108 @@
+///===- MachineOptimizationRemarkEmitter.cpp - Opt Diagnostic -*- C++ -*---===//
+///
+/// The LLVM Compiler Infrastructure
+///
+/// This file is distributed under the University of Illinois Open Source
+/// License. See LICENSE.TXT for details.
+///
+///===---------------------------------------------------------------------===//
+/// \file
+/// Optimization diagnostic interfaces for machine passes. It's packaged as an
+/// analysis pass so that by using this service passes become dependent on MBFI
+/// as well. MBFI is used to compute the "hotness" of the diagnostic message.
+///
+///===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/LLVMContext.h"
+
+using namespace llvm;
+
+DiagnosticInfoMIROptimization::MachineArgument::MachineArgument(
+ StringRef MKey, const MachineInstr &MI)
+ : Argument() {
+ Key = MKey;
+
+ raw_string_ostream OS(Val);
+ MI.print(OS, /*SkipOpers=*/false, /*SkipDebugLoc=*/true);
+}
+
+Optional<uint64_t>
+MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) {
+ if (!MBFI)
+ return None;
+
+ return MBFI->getBlockProfileCount(&MBB);
+}
+
+void MachineOptimizationRemarkEmitter::computeHotness(
+ DiagnosticInfoMIROptimization &Remark) {
+ const MachineBasicBlock *MBB = Remark.getBlock();
+ if (MBB)
+ Remark.setHotness(computeHotness(*MBB));
+}
+
+void MachineOptimizationRemarkEmitter::emit(
+ DiagnosticInfoOptimizationBase &OptDiagCommon) {
+ auto &OptDiag = cast<DiagnosticInfoMIROptimization>(OptDiagCommon);
+ computeHotness(OptDiag);
+
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+
+ // If a diagnostic has a hotness value, then only emit it if its hotness
+ // meets the threshold.
+ if (OptDiag.getHotness() &&
+ *OptDiag.getHotness() < Ctx.getDiagnosticsHotnessThreshold()) {
+ return;
+ }
+
+ yaml::Output *Out = Ctx.getDiagnosticsOutputFile();
+ if (Out) {
+ auto *P = &const_cast<DiagnosticInfoOptimizationBase &>(OptDiagCommon);
+ *Out << P;
+ }
+ // FIXME: now that IsVerbose is part of DI, filtering for this will be moved
+ // from here to clang.
+ if (!OptDiag.isVerbose() || shouldEmitVerbose())
+ Ctx.diagnose(OptDiag);
+}
+
+MachineOptimizationRemarkEmitterPass::MachineOptimizationRemarkEmitterPass()
+ : MachineFunctionPass(ID) {
+ initializeMachineOptimizationRemarkEmitterPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ MachineBlockFrequencyInfo *MBFI;
+
+ if (MF.getFunction()->getContext().getDiagnosticsHotnessRequested())
+ MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
+ else
+ MBFI = nullptr;
+
+ ORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
+ return false;
+}
+
+void MachineOptimizationRemarkEmitterPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+char MachineOptimizationRemarkEmitterPass::ID = 0;
+static const char ore_name[] = "Machine Optimization Remark Emitter";
+#define ORE_NAME "machine-opt-remark-emitter"
+
+INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+ false, true)
+INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass)
+INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
+ false, true)
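A hedged sketch of a client of this new analysis, with pass and remark names purely illustrative: the emitter attaches hotness from MBFI automatically when hotness diagnostics were requested, and the threshold check in emit() above then filters cold remarks.

    MachineOptimizationRemarkEmitter &ORE =
        getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
    MachineOptimizationRemark R("my-pass", "Widened", MI.getDebugLoc(),
                                MI.getParent());
    R << "widened a memory access in " << MF.getName();
    ORE.emit(R);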
diff --git a/contrib/llvm/lib/CodeGen/MachineOutliner.cpp b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp
new file mode 100644
index 0000000..fd6b242
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -0,0 +1,1251 @@
+//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Replaces repeated sequences of instructions with function calls.
+///
+/// This works by placing every instruction from every basic block in a
+/// suffix tree, and repeatedly querying that tree for repeated sequences of
+/// instructions. If a sequence of instructions appears often, then it ought
+/// to be beneficial to pull it out into a function.
+///
+/// This was originally presented at the 2016 LLVM Developers' Meeting in the
+/// talk "Reducing Code Size Using Outlining". For a high-level overview of
+/// how this pass works, the talk is available on YouTube at
+///
+/// https://www.youtube.com/watch?v=yorld-WSOeU
+///
+/// The slides for the talk are available at
+///
+/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf
+///
+/// The talk provides an overview of how the outliner finds candidates and
+/// ultimately outlines them. It describes how the main data structure for this
+/// pass, the suffix tree, is queried and purged for candidates. It also gives
+/// a simplified suffix tree construction algorithm for suffix trees based off
+/// of the algorithm actually used here, Ukkonen's algorithm.
+///
+/// For the original RFC for this pass, please see
+///
+/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html
+///
+/// For more information on the suffix tree data structure, please see
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <functional>
+#include <map>
+#include <sstream>
+#include <tuple>
+#include <vector>
+
+#define DEBUG_TYPE "machine-outliner"
+
+using namespace llvm;
+
+STATISTIC(NumOutlined, "Number of candidates outlined");
+STATISTIC(FunctionsCreated, "Number of functions created");
+
+namespace {
+
+/// \brief An individual sequence of instructions to be replaced with a call to
+/// an outlined function.
+struct Candidate {
+
+ /// Set to false if the candidate overlapped with another candidate.
+ bool InCandidateList = true;
+
+ /// The start index of this \p Candidate.
+ size_t StartIdx;
+
+ /// The number of instructions in this \p Candidate.
+ size_t Len;
+
+ /// The index of this \p Candidate's \p OutlinedFunction in the list of
+ /// \p OutlinedFunctions.
+ size_t FunctionIdx;
+
+ /// \brief The number of instructions that would be saved by outlining every
+ /// candidate of this type.
+ ///
+ /// This is a fixed value which is not updated during the candidate pruning
+ /// process. It is only used for deciding which candidate to keep if two
+ /// candidates overlap. The true benefit for a given candidate is stored in
+ /// its OutlinedFunction.
+ unsigned Benefit = 0;
+
+ Candidate(size_t StartIdx, size_t Len, size_t FunctionIdx)
+ : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {}
+
+ Candidate() {}
+
+ /// \brief Used to ensure that \p Candidates are outlined in an order that
+ /// preserves the start and end indices of other \p Candidates.
+ bool operator<(const Candidate &RHS) const { return StartIdx > RHS.StartIdx; }
+};
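A standalone illustration of this ordering, not part of the patch: sorting with the operator< above puts candidates with higher start indices first, so replacing them back to front never shifts the start index of a candidate still waiting to be outlined.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct Cand {
      size_t StartIdx;
      bool operator<(const Cand &RHS) const { return StartIdx > RHS.StartIdx; }
    };

    int main() {
      std::vector<Cand> V = {{3}, {10}, {7}};
      std::sort(V.begin(), V.end());  // StartIdx order is now 10, 7, 3
      return 0;
    }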
+
+/// \brief The information necessary to create an outlined function for some
+/// class of candidate.
+struct OutlinedFunction {
+
+ /// The actual outlined function created.
+ /// This is initialized after we go through and create the actual function.
+ MachineFunction *MF = nullptr;
+
+ /// A number assigned to this function which appears at the end of its name.
+ size_t Name;
+
+ /// The number of candidates for this OutlinedFunction.
+ size_t OccurrenceCount = 0;
+
+ /// \brief The sequence of integers corresponding to the instructions in this
+ /// function.
+ std::vector<unsigned> Sequence;
+
+ /// The number of instructions this function would save.
+ unsigned Benefit = 0;
+
+ /// \brief Set to true if candidates for this outlined function should be
+ /// replaced with tail calls to this OutlinedFunction.
+ bool IsTailCall = false;
+
+ OutlinedFunction(size_t Name, size_t OccurrenceCount,
+ const std::vector<unsigned> &Sequence,
+ unsigned Benefit, bool IsTailCall)
+ : Name(Name), OccurrenceCount(OccurrenceCount), Sequence(Sequence),
+ Benefit(Benefit), IsTailCall(IsTailCall)
+ {}
+};
+
+/// Represents an undefined index in the suffix tree.
+const size_t EmptyIdx = -1;
+
+/// A node in a suffix tree which represents a substring or suffix.
+///
+/// Each node has either no children or at least two children, with the root
+/// being an exception in the empty tree.
+///
+/// Children are represented as a map between unsigned integers and nodes. If
+/// a node N has a child M on unsigned integer k, then the mapping represented
+/// by N is a proper prefix of the mapping represented by M. Note that this,
+/// although similar to a trie, is somewhat different: each node stores a full
+/// substring of the full mapping rather than a single character state.
+///
+/// Each internal node contains a pointer to the internal node representing
+/// the same string, but with the first character chopped off. This is stored
+/// in \p Link. Each leaf node stores the start index of its respective
+/// suffix in \p SuffixIdx.
+struct SuffixTreeNode {
+
+ /// The children of this node.
+ ///
+ /// A child existing on an unsigned integer implies that from the mapping
+ /// represented by the current node, there is a way to reach another
+ /// mapping by tacking that character on the end of the current string.
+ DenseMap<unsigned, SuffixTreeNode *> Children;
+
+ /// A flag set to false if the node has been pruned from the tree.
+ bool IsInTree = true;
+
+ /// The start index of this node's substring in the main string.
+ size_t StartIdx = EmptyIdx;
+
+ /// The end index of this node's substring in the main string.
+ ///
+ /// Every leaf node must have its \p EndIdx incremented at the end of every
+ /// step in the construction algorithm. To avoid having to update O(N)
+ /// nodes individually at the end of every step, the end index is stored
+ /// as a pointer.
+ size_t *EndIdx = nullptr;
+
+ /// For leaves, the start index of the suffix represented by this node.
+ ///
+ /// For all other nodes, this is ignored.
+ size_t SuffixIdx = EmptyIdx;
+
+ /// \brief For internal nodes, a pointer to the internal node representing
+ /// the same sequence with the first character chopped off.
+ ///
+ /// This has two major purposes in the suffix tree. The first is as a
+ /// shortcut in Ukkonen's construction algorithm. One of the things that
+ /// Ukkonen's algorithm does to achieve linear-time construction is
+ /// keep track of which node the next insert should be at. This makes each
+ /// insert O(1), and there are a total of O(N) inserts. The suffix link
+ /// helps with inserting children of internal nodes.
+ ///
+ /// Say we add a child to an internal node with associated mapping S. The
+ /// next insertion must be at the node representing S with its first
+ /// character removed.
+ /// This is given by the way that we iteratively build the tree in Ukkonen's
+ /// algorithm. The main idea is to look at the suffixes of each prefix in the
+ /// string, starting with the longest suffix of the prefix, and ending with
+ /// the shortest. Therefore, if we keep pointers between such nodes, we can
+ /// move to the next insertion point in O(1) time. If we don't, then we'd
+ /// have to query from the root, which takes O(N) time. This would make the
+ /// construction algorithm O(N^2) rather than O(N).
+ ///
+ /// The suffix link is also used during the tree pruning process to let us
+ /// quickly throw out a bunch of potential overlaps. Say we have a sequence
+ /// S we want to outline. Then each of its suffixes contribute to at least
+ /// one overlapping case. Therefore, we can follow the suffix links
+ /// starting at the node associated with S to the root and "delete" those
+ /// nodes, save for the root. For each candidate, this removes
+ /// O(|candidate|) overlaps from the search space. We don't actually
+ /// completely invalidate these nodes though; doing that is far too
+ /// aggressive. Consider the following pathological string:
+ ///
+ /// 1 2 3 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3
+ ///
+ /// If we, for the sake of example, outlined 1 2 3, then we would throw
+ /// out all instances of 2 3. This isn't desirable. To get around this,
+ /// when we visit a link node, we decrement its occurrence count by the
+ /// number of sequences we outlined in the current step. In the pathological
+ /// example, the 2 3 node would have an occurrence count of 8, while the
+ /// 1 2 3 node would have an occurrence count of 2. Thus, the 2 3 node
+ /// would survive to the next round allowing us to outline the extra
+ /// instances of 2 3.
+ SuffixTreeNode *Link = nullptr;
+
+ /// The parent of this node. Every node except for the root has a parent.
+ SuffixTreeNode *Parent = nullptr;
+
+ /// The number of times this node's string appears in the tree.
+ ///
+ /// This is equal to the number of leaf children of the string. It represents
+ /// the number of suffixes that the node's string is a prefix of.
+ size_t OccurrenceCount = 0;
+
+ /// The length of the string formed by concatenating the edge labels from the
+ /// root to this node.
+ size_t ConcatLen = 0;
+
+ /// Returns true if this node is a leaf.
+ bool isLeaf() const { return SuffixIdx != EmptyIdx; }
+
+ /// Returns true if this node is the root of its owning \p SuffixTree.
+ bool isRoot() const { return StartIdx == EmptyIdx; }
+
+ /// Return the number of elements in the substring associated with this node.
+ size_t size() const {
+
+ // Is it the root? If so, it's the empty string so return 0.
+ if (isRoot())
+ return 0;
+
+ assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
+
+ // Size = the number of elements in the string.
+ // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
+ return *EndIdx - StartIdx + 1;
+ }
+
+ SuffixTreeNode(size_t StartIdx, size_t *EndIdx, SuffixTreeNode *Link,
+ SuffixTreeNode *Parent)
+ : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {}
+
+ SuffixTreeNode() {}
+};
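A minimal standalone sketch of the shared end-index trick described above, with names local to the sketch: because every leaf aliases one end variable through a pointer, "extend all leaves by one character" is a single store rather than an O(N) walk.

    #include <cassert>
    #include <cstddef>

    struct Leaf {
      size_t StartIdx;
      size_t *EndIdx;  // aliases the tree-wide LeafEndIdx
    };

    int main() {
      size_t LeafEndIdx = 0;
      Leaf A{0, &LeafEndIdx}, B{3, &LeafEndIdx};
      LeafEndIdx = 7;  // extends A and B simultaneously
      assert(*A.EndIdx == 7 && *B.EndIdx == 7);
      return 0;
    }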
+
+/// A data structure for fast substring queries.
+///
+/// Suffix trees represent the suffixes of their input strings in their leaves.
+/// A suffix tree is a type of compressed trie structure where each node
+/// represents an entire substring rather than a single character. Each leaf
+/// of the tree is a suffix.
+///
+/// A suffix tree can be seen as a type of state machine where each state is a
+/// substring of the full string. The tree is structured so that, for a string
+/// of length N, there are exactly N leaves in the tree. This structure allows
+/// us to quickly find repeated substrings of the input string.
+///
+/// In this implementation, a "string" is a vector of unsigned integers.
+/// These integers may result from hashing some data type. A suffix tree can
+/// contain 1 or many strings, which can then be queried as one large string.
+///
+/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
+/// suffix tree construction. Ukkonen's algorithm is explained in more detail
+/// in the paper by Esko Ukkonen, "On-line construction of suffix trees". The
+/// paper is available at
+///
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+class SuffixTree {
+private:
+ /// Each element is an integer representing an instruction in the module.
+ ArrayRef<unsigned> Str;
+
+ /// Maintains each node in the tree.
+ SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
+
+ /// The root of the suffix tree.
+ ///
+ /// The root represents the empty string. It is maintained by the
+ /// \p NodeAllocator like every other node in the tree.
+ SuffixTreeNode *Root = nullptr;
+
+ /// Stores each leaf node in the tree.
+ ///
+ /// This is used for finding outlining candidates.
+ std::vector<SuffixTreeNode *> LeafVector;
+
+ /// Maintains the end indices of the internal nodes in the tree.
+ ///
+ /// Each internal node is guaranteed to never have its end index change
+ /// during the construction algorithm; however, leaves must be updated at
+ /// every step. Therefore, we need to store leaf end indices by reference
+ /// to avoid updating O(N) leaves at every step of construction. Thus,
+ /// every internal node must be allocated its own end index.
+ BumpPtrAllocator InternalEndIdxAllocator;
+
+ /// The end index of each leaf in the tree.
+ size_t LeafEndIdx = -1;
+
+ /// \brief Helper struct which keeps track of the next insertion point in
+ /// Ukkonen's algorithm.
+ struct ActiveState {
+ /// The next node to insert at.
+ SuffixTreeNode *Node;
+
+ /// The index of the first character in the substring currently being added.
+ size_t Idx = EmptyIdx;
+
+ /// The length of the substring we have to add at the current step.
+ size_t Len = 0;
+ };
+
+ /// \brief The point the next insertion will take place at in the
+ /// construction algorithm.
+ ActiveState Active;
+
+ /// Allocate a leaf node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated leaf node.
+ SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, size_t StartIdx,
+ unsigned Edge) {
+
+ assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+
+ SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx,
+ &LeafEndIdx,
+ nullptr,
+ &Parent);
+ Parent.Children[Edge] = N;
+
+ return N;
+ }
+
+ /// Allocate an internal node and add it to the tree.
+ ///
+ /// \param Parent The parent of this node. Only null when allocating the root.
+ /// \param StartIdx The start index of this node's associated string.
+ /// \param EndIdx The end index of this node's associated string.
+ /// \param Edge The label on the edge leaving \p Parent to this node.
+ ///
+ /// \returns A pointer to the allocated internal node.
+ SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, size_t StartIdx,
+ size_t EndIdx, unsigned Edge) {
+
+ assert(StartIdx <= EndIdx && "String can't start after it ends!");
+ assert(!(!Parent && StartIdx != EmptyIdx) &&
+ "Non-root internal nodes must have parents!");
+
+ size_t *E = new (InternalEndIdxAllocator) size_t(EndIdx);
+ SuffixTreeNode *N = new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx,
+ E,
+ Root,
+ Parent);
+ if (Parent)
+ Parent->Children[Edge] = N;
+
+ return N;
+ }
+
+ /// \brief Set the suffix indices of the leaves to the start indices of their
+ /// respective suffixes. Also stores each leaf in \p LeafVector at its
+ /// respective suffix index.
+ ///
+ /// \param[in] CurrNode The node currently being visited.
+ /// \param CurrIdx The current index of the string being visited.
+ void setSuffixIndices(SuffixTreeNode &CurrNode, size_t CurrIdx) {
+
+ bool IsLeaf = CurrNode.Children.size() == 0 && !CurrNode.isRoot();
+
+ // Store the length of the concatenation of all strings from the root to
+ // this node.
+ if (!CurrNode.isRoot()) {
+ if (CurrNode.ConcatLen == 0)
+ CurrNode.ConcatLen = CurrNode.size();
+
+ if (CurrNode.Parent)
+ CurrNode.ConcatLen += CurrNode.Parent->ConcatLen;
+ }
+
+ // Traverse the tree depth-first.
+ for (auto &ChildPair : CurrNode.Children) {
+ assert(ChildPair.second && "Node had a null child!");
+ setSuffixIndices(*ChildPair.second,
+ CurrIdx + ChildPair.second->size());
+ }
+
+ // Is this node a leaf?
+ if (IsLeaf) {
+ // If yes, give it a suffix index and bump its parent's occurrence count.
+ CurrNode.SuffixIdx = Str.size() - CurrIdx;
+ assert(CurrNode.Parent && "CurrNode had no parent!");
+ CurrNode.Parent->OccurrenceCount++;
+
+ // Store the leaf in the leaf vector for pruning later.
+ LeafVector[CurrNode.SuffixIdx] = &CurrNode;
+ }
+ }
+
+ /// \brief Construct the suffix tree for the prefix of the input ending at
+ /// \p EndIdx.
+ ///
+ /// Used to construct the full suffix tree iteratively. At the end of each
+ /// step, the constructed suffix tree is either a valid suffix tree, or a
+ /// suffix tree with implicit suffixes. At the end of the final step, the
+ /// suffix tree is a valid tree.
+ ///
+ /// \param EndIdx The end index of the current prefix in the main string.
+ /// \param SuffixesToAdd The number of suffixes that must be added
+ /// to complete the suffix tree at the current phase.
+ ///
+ /// \returns The number of suffixes that have not been added at the end of
+ /// this step.
+ unsigned extend(size_t EndIdx, size_t SuffixesToAdd) {
+ SuffixTreeNode *NeedsLink = nullptr;
+
+ while (SuffixesToAdd > 0) {
+
+ // Are we waiting to add anything other than just the last character?
+ if (Active.Len == 0) {
+ // If not, then say the active index is the end index.
+ Active.Idx = EndIdx;
+ }
+
+ assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+ // The first character in the current substring we're looking at.
+ unsigned FirstChar = Str[Active.Idx];
+
+ // Have we inserted anything starting with FirstChar at the current node?
+ if (Active.Node->Children.count(FirstChar) == 0) {
+ // If not, then we can just insert a leaf and move to the next step.
+ insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+ // The active node is an internal node, and we visited it, so it must
+ // need a link if it doesn't have one.
+ if (NeedsLink) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+ } else {
+ // There's a match with FirstChar, so look for the point in the tree to
+ // insert a new node.
+ SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
+
+ size_t SubstringLen = NextNode->size();
+
+ // Is the current suffix we're trying to insert longer than the size of
+ // the child we want to move to?
+ if (Active.Len >= SubstringLen) {
+ // If yes, then consume the characters we've seen and move to the next
+ // node.
+ Active.Idx += SubstringLen;
+ Active.Len -= SubstringLen;
+ Active.Node = NextNode;
+ continue;
+ }
+
+ // Otherwise, the suffix we're trying to insert must be contained in the
+ // next node we want to move to.
+ unsigned LastChar = Str[EndIdx];
+
+ // Is the string we're trying to insert a substring of the next node?
+ if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+ // If yes, then we're done for this step. Remember our insertion point
+ // and move to the next end index. At this point, we have an implicit
+ // suffix tree.
+ if (NeedsLink && !Active.Node->isRoot()) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+
+ Active.Len++;
+ break;
+ }
+
+ // The string we're trying to insert isn't a substring of the next node,
+ // but matches up to a point. Split the node.
+ //
+ // For example, say we ended our search at a node n and we're trying to
+ // insert ABD. Then we'll create a new node s for AB, reduce n to just
+ // representing C, and insert a new leaf node l to represent D. This
+ // allows us to ensure that if n was a leaf, it remains a leaf.
+ //
+ //      | ABC  ---split--->  | AB
+ //      n                    s
+ //                         C / \ D
+ //                          n   l
+
+ // The node s from the diagram
+ SuffixTreeNode *SplitNode =
+ insertInternalNode(Active.Node,
+ NextNode->StartIdx,
+ NextNode->StartIdx + Active.Len - 1,
+ FirstChar);
+
+ // Insert the new node representing the new substring into the tree as
+ // a child of the split node. This is the node l from the diagram.
+ insertLeaf(*SplitNode, EndIdx, LastChar);
+
+ // Make the old node a child of the split node and update its start
+ // index. This is the node n from the diagram.
+ NextNode->StartIdx += Active.Len;
+ NextNode->Parent = SplitNode;
+ SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+
+ // SplitNode is an internal node, update the suffix link.
+ if (NeedsLink)
+ NeedsLink->Link = SplitNode;
+
+ NeedsLink = SplitNode;
+ }
+
+ // We've added something new to the tree, so there's one less suffix to
+ // add.
+ SuffixesToAdd--;
+
+ if (Active.Node->isRoot()) {
+ if (Active.Len > 0) {
+ Active.Len--;
+ Active.Idx = EndIdx - SuffixesToAdd + 1;
+ }
+ } else {
+ // Start the next phase at the next smallest suffix.
+ Active.Node = Active.Node->Link;
+ }
+ }
+
+ return SuffixesToAdd;
+ }
+
+public:
+
+ /// Find all repeated substrings that satisfy \p BenefitFn.
+ ///
+ /// If a substring appears at least twice, then it must be represented by
+ /// an internal node which appears in at least two suffixes. Each suffix is
+ /// represented by a leaf node. To do this, we visit each internal node in
+ /// the tree, using the leaf children of each internal node. If an internal
+ /// node represents a beneficial substring, then we use each of its leaf
+ /// children to find the locations of its substring.
+ ///
+ /// \param[out] CandidateList Filled with candidates representing each
+ /// beneficial substring.
+ /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions, one
+ /// for each type of candidate.
+ /// \param BenefitFn The function to satisfy.
+ ///
+ /// \returns The length of the longest candidate found.
+ size_t findCandidates(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ const std::function<unsigned(SuffixTreeNode &, size_t, unsigned)>
+ &BenefitFn) {
+
+ CandidateList.clear();
+ FunctionList.clear();
+ size_t FnIdx = 0;
+ size_t MaxLen = 0;
+
+ for (SuffixTreeNode* Leaf : LeafVector) {
+ assert(Leaf && "Leaves in LeafVector cannot be null!");
+ if (!Leaf->IsInTree)
+ continue;
+
+ assert(Leaf->Parent && "All leaves must have parents!");
+ SuffixTreeNode &Parent = *(Leaf->Parent);
+
+ // If it doesn't appear enough, or we already outlined from it, skip it.
+ if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
+ continue;
+
+ size_t StringLen = Leaf->ConcatLen - Leaf->size();
+
+ // How many instructions would outlining this string save?
+ unsigned Benefit = BenefitFn(Parent,
+ StringLen, Str[Leaf->SuffixIdx + StringLen - 1]);
+
+ // If it's not beneficial, skip it.
+ if (Benefit < 1)
+ continue;
+
+ if (StringLen > MaxLen)
+ MaxLen = StringLen;
+
+ unsigned OccurrenceCount = 0;
+ for (auto &ChildPair : Parent.Children) {
+ SuffixTreeNode *M = ChildPair.second;
+
+ // Is it a leaf? If so, we have an occurrence of this candidate.
+ if (M && M->IsInTree && M->isLeaf()) {
+ OccurrenceCount++;
+ CandidateList.emplace_back(M->SuffixIdx, StringLen, FnIdx);
+ CandidateList.back().Benefit = Benefit;
+ M->IsInTree = false;
+ }
+ }
+
+ // Save the function for the new candidate sequence.
+ std::vector<unsigned> CandidateSequence;
+ for (unsigned i = Leaf->SuffixIdx; i < Leaf->SuffixIdx + StringLen; i++)
+ CandidateSequence.push_back(Str[i]);
+
+ FunctionList.emplace_back(FnIdx, OccurrenceCount, CandidateSequence,
+ Benefit, false);
+
+ // Move to the next function.
+ FnIdx++;
+ Parent.IsInTree = false;
+ }
+
+ return MaxLen;
+ }
+
+ /// Construct a suffix tree from a sequence of unsigned integers.
+ ///
+ /// \param Str The string to construct the suffix tree for.
+ SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+ Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+ Root->IsInTree = true;
+ Active.Node = Root;
+ LeafVector = std::vector<SuffixTreeNode*>(Str.size());
+
+ // Keep track of the number of suffixes we have to add of the current
+ // prefix.
+ size_t SuffixesToAdd = 0;
+ Active.Node = Root;
+
+ // Construct the suffix tree iteratively on each prefix of the string.
+ // PfxEndIdx is the end index of the current prefix.
+ // End is one past the last element in the string.
+ for (size_t PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+ SuffixesToAdd++;
+ LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+ SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+ }
+
+ // Set the suffix indices of each leaf.
+ assert(Root && "Root node can't be nullptr!");
+ setSuffixIndices(*Root, 0);
+ }
+};
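A hedged usage sketch (the in-tree caller appears later in this pass): the tree consumes any vector of unsigneds, so with the toy mapping below the repeated run 1 2 3 surfaces as an internal node with OccurrenceCount >= 2, which findCandidates can then harvest. Note the tree keeps only an ArrayRef, so Str must outlive it.

    std::vector<unsigned> Str = {1, 2, 3, 9, 1, 2, 3, 8};
    SuffixTree Tree(Str);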
+
+/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings.
+struct InstructionMapper {
+
+ /// \brief The next available integer to assign to a \p MachineInstr that
+ /// cannot be outlined.
+ ///
+ /// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
+ unsigned IllegalInstrNumber = -3;
+
+ /// \brief The next available integer to assign to a \p MachineInstr that can
+ /// be outlined.
+ unsigned LegalInstrNumber = 0;
+
+ /// Correspondence from \p MachineInstrs to unsigned integers.
+ DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
+ InstructionIntegerMap;
+
+ /// Correspondence from unsigned integers to \p MachineInstrs.
+ /// Inverse of \p InstructionIntegerMap.
+ DenseMap<unsigned, MachineInstr *> IntegerInstructionMap;
+
+ /// The vector of unsigned integers that the module is mapped to.
+ std::vector<unsigned> UnsignedVec;
+
+ /// \brief Stores the location of the instruction associated with the integer
+ /// at index i in \p UnsignedVec for each index i.
+ std::vector<MachineBasicBlock::iterator> InstrList;
+
+ /// \brief Maps \p *It to a legal integer.
+ ///
+ /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
+ /// \p IntegerInstructionMap, and \p LegalInstrNumber.
+ ///
+ /// \returns The integer that \p *It was mapped to.
+ unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+
+ // Get the integer for this instruction or give it the current
+ // LegalInstrNumber.
+ InstrList.push_back(It);
+ MachineInstr &MI = *It;
+ bool WasInserted;
+ DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
+ ResultIt;
+ std::tie(ResultIt, WasInserted) =
+ InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber));
+ unsigned MINumber = ResultIt->second;
+
+ // There was an insertion.
+ if (WasInserted) {
+ LegalInstrNumber++;
+ IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
+ }
+
+ UnsignedVec.push_back(MINumber);
+
+ // Make sure we don't overflow or use any integers reserved by the DenseMap.
+ if (LegalInstrNumber >= IllegalInstrNumber)
+ report_fatal_error("Instruction mapping overflow!");
+
+ assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey()
+ && "Tried to assign DenseMap tombstone or empty key to instruction.");
+ assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey()
+ && "Tried to assign DenseMap tombstone or empty key to instruction.");
+
+ return MINumber;
+ }
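+
+  // Dedup sketch (illustrative): the first time an instruction is seen, the
+  // insert() above succeeds and the instruction is numbered with
+  // LegalInstrNumber++; a later identical instruction finds the existing
+  // entry and reuses its number:
+  //
+  //   mapToLegalUnsigned(It1); // new "add"       -> 0
+  //   mapToLegalUnsigned(It2); // identical "add" -> 0 again
+  //   mapToLegalUnsigned(It3); // new "sub"       -> 1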
+
+ /// Maps \p *It to an illegal integer.
+ ///
+ /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber.
+ ///
+ /// \returns The integer that \p *It was mapped to.
+ unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) {
+ unsigned MINumber = IllegalInstrNumber;
+
+ InstrList.push_back(It);
+ UnsignedVec.push_back(IllegalInstrNumber);
+ IllegalInstrNumber--;
+
+ assert(LegalInstrNumber < IllegalInstrNumber &&
+ "Instruction mapping overflow!");
+
+ assert(IllegalInstrNumber !=
+ DenseMapInfo<unsigned>::getEmptyKey() &&
+ "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+ assert(IllegalInstrNumber !=
+ DenseMapInfo<unsigned>::getTombstoneKey() &&
+ "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+ return MINumber;
+ }
+
+ /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds
+ /// and appends it to \p UnsignedVec and \p InstrList.
+ ///
+ /// Two instructions are assigned the same integer if they are identical.
+  /// If an instruction is deemed unsafe to outline, then it will be assigned
+  /// a unique integer. The resulting mapping is placed into a suffix tree and
+ /// queried for candidates.
+ ///
+ /// \param MBB The \p MachineBasicBlock to be translated into integers.
+ /// \param TRI \p TargetRegisterInfo for the module.
+ /// \param TII \p TargetInstrInfo for the module.
+ void convertToUnsignedVec(MachineBasicBlock &MBB,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII) {
+ for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et;
+ It++) {
+
+ // Keep track of where this instruction is in the module.
+ switch(TII.getOutliningType(*It)) {
+ case TargetInstrInfo::MachineOutlinerInstrType::Illegal:
+ mapToIllegalUnsigned(It);
+ break;
+
+ case TargetInstrInfo::MachineOutlinerInstrType::Legal:
+ mapToLegalUnsigned(It);
+ break;
+
+ case TargetInstrInfo::MachineOutlinerInstrType::Invisible:
+ break;
+ }
+ }
+
+    // After mapping every instruction in the block, uniquely terminate this
+    // part of the "string". This makes sure we won't match across basic block
+    // or function boundaries, since the terminator is encoded uniquely and
+    // thus appears in no repeated substring.
+ InstrList.push_back(MBB.end());
+ UnsignedVec.push_back(IllegalInstrNumber);
+ IllegalInstrNumber--;
+ }
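+
+  // Boundary sketch (illustrative): two identical blocks {A, B} map to
+  //
+  //   [0, 1, -3,  0, 1, -4]
+  //
+  // so [0, 1] can repeat within the mapping, but no repeated substring can
+  // span the unique -3/-4 terminators, and thus no candidate crosses a
+  // basic block or function boundary.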
+
+ InstructionMapper() {
+ // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
+ // changed.
+ assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
+ "DenseMapInfo<unsigned>'s empty key isn't -1!");
+ assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 &&
+ "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+ }
+};
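+
+// Numbering sketch (illustrative, AArch64-flavored pseudo-assembly): legal
+// instructions share an integer when identical, while every illegal
+// instruction gets a fresh integer counting down from (unsigned)-3, even if
+// it is identical to an earlier one:
+//
+//   add x0, x0, #1  -> 0
+//   bl  _foo        -> (unsigned)-3
+//   add x0, x0, #1  -> 0
+//   bl  _foo        -> (unsigned)-4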
+
+/// \brief An interprocedural pass which finds repeated sequences of
+/// instructions and replaces them with calls to functions.
+///
+/// Each instruction is mapped to an unsigned integer and placed in a string.
+/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree
+/// is then repeatedly queried for repeated sequences of instructions. Each
+/// non-overlapping repeated sequence is then placed in its own
+/// \p MachineFunction and each instance is then replaced with a call to that
+/// function.
+struct MachineOutliner : public ModulePass {
+
+ static char ID;
+
+ StringRef getPassName() const override { return "Machine Outliner"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfo>();
+ AU.addPreserved<MachineModuleInfo>();
+ AU.setPreservesAll();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ MachineOutliner() : ModulePass(ID) {
+ initializeMachineOutlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// \brief Replace the sequences of instructions represented by the
+ /// \p Candidates in \p CandidateList with calls to \p MachineFunctions
+ /// described in \p FunctionList.
+ ///
+ /// \param M The module we are outlining from.
+ /// \param CandidateList A list of candidates to be outlined.
+ /// \param FunctionList A list of functions to be inserted into the module.
+ /// \param Mapper Contains the instruction mappings for the module.
+ bool outline(Module &M, const ArrayRef<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ InstructionMapper &Mapper);
+
+ /// Creates a function for \p OF and inserts it into the module.
+ MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+ InstructionMapper &Mapper);
+
+ /// Find potential outlining candidates and store them in \p CandidateList.
+ ///
+ /// For each type of potential candidate, also build an \p OutlinedFunction
+ /// struct containing the information to build the function for that
+ /// candidate.
+ ///
+ /// \param[out] CandidateList Filled with outlining candidates for the module.
+ /// \param[out] FunctionList Filled with functions corresponding to each type
+ /// of \p Candidate.
+ /// \param ST The suffix tree for the module.
+ /// \param TII TargetInstrInfo for the module.
+ ///
+ /// \returns The length of the longest candidate found. 0 if there are none.
+ unsigned buildCandidateList(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ SuffixTree &ST,
+ InstructionMapper &Mapper,
+ const TargetInstrInfo &TII);
+
+ /// \brief Remove any overlapping candidates that weren't handled by the
+ /// suffix tree's pruning method.
+ ///
+  /// Pruning from the suffix tree doesn't necessarily remove all overlaps.
+  /// If a short candidate is chosen for outlining, and a longer candidate
+  /// which has that short candidate as a suffix is then chosen, the tree's
+  /// pruning method will not catch the overlap. Thus, we need to prune before
+  /// outlining as well.
+ ///
+ /// \param[in,out] CandidateList A list of outlining candidates.
+ /// \param[in,out] FunctionList A list of functions to be outlined.
+ /// \param MaxCandidateLen The length of the longest candidate.
+ /// \param TII TargetInstrInfo for the module.
+ void pruneOverlaps(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ unsigned MaxCandidateLen,
+ const TargetInstrInfo &TII);
+
+ /// Construct a suffix tree on the instructions in \p M and outline repeated
+ /// strings from that tree.
+ bool runOnModule(Module &M) override;
+};
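+
+// End-to-end sketch (illustrative): for a module whose blocks map to
+//
+//   [5, 6, 7, -3,  5, 6, 7, -4]
+//
+// buildCandidateList() reports the candidate [5, 6, 7] twice,
+// pruneOverlaps() keeps both occurrences since they are disjoint, and
+// outline() replaces each occurrence with a call to a single new outlined
+// function.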
+
+} // Anonymous namespace.
+
+char MachineOutliner::ID = 0;
+
+namespace llvm {
+ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); }
+}
+
+INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE,
+ "Machine Function Outliner", false, false)
+
+void MachineOutliner::pruneOverlaps(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ unsigned MaxCandidateLen,
+ const TargetInstrInfo &TII) {
+ // TODO: Experiment with interval trees or other interval-checking structures
+ // to lower the time complexity of this function.
+ // TODO: Can we do better than the simple greedy choice?
+ // Check for overlaps in the range.
+ // This is O(MaxCandidateLen * CandidateList.size()).
+ for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et;
+ It++) {
+ Candidate &C1 = *It;
+ OutlinedFunction &F1 = FunctionList[C1.FunctionIdx];
+
+ // If we removed this candidate, skip it.
+ if (!C1.InCandidateList)
+ continue;
+
+ // Is it still worth it to outline C1?
+ if (F1.Benefit < 1 || F1.OccurrenceCount < 2) {
+ assert(F1.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F1.OccurrenceCount--;
+ C1.InCandidateList = false;
+ continue;
+ }
+
+ // The minimum start index of any candidate that could overlap with this
+ // one.
+ unsigned FarthestPossibleIdx = 0;
+
+ // Either the index is 0, or it's at most MaxCandidateLen indices away.
+ if (C1.StartIdx > MaxCandidateLen)
+ FarthestPossibleIdx = C1.StartIdx - MaxCandidateLen;
+
+    // Compare against the candidates in the list whose start index is at
+    // least FarthestPossibleIdx, i.e. within MaxCandidateLen indices of
+    // C1's start. There are at most MaxCandidateLen of these.
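+    // For example (illustrative): with MaxCandidateLen = 5 and
+    // C1.StartIdx = 12, FarthestPossibleIdx = 7. A candidate starting at
+    // index 8 can reach index 8 + 5 - 1 = 12 and overlap C1, while one
+    // starting below index 7 cannot reach C1's start even at maximum length.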
+ for (auto Sit = It + 1; Sit != Et; Sit++) {
+ Candidate &C2 = *Sit;
+ OutlinedFunction &F2 = FunctionList[C2.FunctionIdx];
+
+ // Is this candidate too far away to overlap?
+ if (C2.StartIdx < FarthestPossibleIdx)
+ break;
+
+ // Did we already remove this candidate in a previous step?
+ if (!C2.InCandidateList)
+ continue;
+
+ // Is the function beneficial to outline?
+ if (F2.OccurrenceCount < 2 || F2.Benefit < 1) {
+ // If not, remove this candidate and move to the next one.
+ assert(F2.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F2.OccurrenceCount--;
+ C2.InCandidateList = false;
+ continue;
+ }
+
+ size_t C2End = C2.StartIdx + C2.Len - 1;
+
+ // Do C1 and C2 overlap?
+ //
+ // Not overlapping:
+ // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices
+ //
+      // We sorted our candidate list so C2Start <= C1Start. We know that
+      // C2End > C2Start since each candidate has length >= 2. Therefore, all
+      // we have to check is whether C2End < C1Start to see that they don't
+      // overlap.
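+      // For example (illustrative): with C1.StartIdx = 10, C2.StartIdx = 8,
+      // and C2.Len = 2, C2End = 9 < 10 and the candidates are disjoint;
+      // with C2.Len = 3, C2End = 10 and they collide at index 10.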
+ if (C2End < C1.StartIdx)
+ continue;
+
+ // C1 and C2 overlap.
+ // We need to choose the better of the two.
+ //
+ // Approximate this by picking the one which would have saved us the
+ // most instructions before any pruning.
+ if (C1.Benefit >= C2.Benefit) {
+
+ // C1 is better, so remove C2 and update C2's OutlinedFunction to
+ // reflect the removal.
+ assert(F2.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F2.OccurrenceCount--;
+        F2.Benefit = TII.getOutliningBenefit(F2.Sequence.size(),
+                                             F2.OccurrenceCount,
+                                             F2.IsTailCall);
+
+ C2.InCandidateList = false;
+
+ DEBUG (
+ dbgs() << "- Removed C2. \n";
+ dbgs() << "--- Num fns left for C2: " << F2.OccurrenceCount << "\n";
+ dbgs() << "--- C2's benefit: " << F2.Benefit << "\n";
+ );
+
+ } else {
+ // C2 is better, so remove C1 and update C1's OutlinedFunction to
+ // reflect the removal.
+ assert(F1.OccurrenceCount > 0 &&
+ "Can't remove OutlinedFunction with no occurrences!");
+ F1.OccurrenceCount--;
+        F1.Benefit = TII.getOutliningBenefit(F1.Sequence.size(),
+                                             F1.OccurrenceCount,
+                                             F1.IsTailCall);
+ C1.InCandidateList = false;
+
+ DEBUG (
+ dbgs() << "- Removed C1. \n";
+ dbgs() << "--- Num fns left for C1: " << F1.OccurrenceCount << "\n";
+ dbgs() << "--- C1's benefit: " << F1.Benefit << "\n";
+ );
+
+ // C1 is out, so we don't have to compare it against anyone else.
+ break;
+ }
+ }
+ }
+}
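+
+// Pruning sketch (illustrative): if candidates A = [x, y] and B = [w, x, y]
+// overlap and B.Benefit > A.Benefit, then A is dropped: its OutlinedFunction
+// loses one occurrence and its Benefit is recomputed via
+// getOutliningBenefit(). If that Benefit falls below 1, later iterations of
+// the loop above discard the remaining occurrences of A as well.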
+
+unsigned
+MachineOutliner::buildCandidateList(std::vector<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ SuffixTree &ST,
+ InstructionMapper &Mapper,
+ const TargetInstrInfo &TII) {
+
+ std::vector<unsigned> CandidateSequence; // Current outlining candidate.
+ size_t MaxCandidateLen = 0; // Length of the longest candidate.
+
+  // The benefit function for the suffix tree's maximizing query. This allows
+  // us to define more fine-grained kinds of things to outline in the target
+  // without putting target-specific info in the suffix tree.
+ auto BenefitFn = [&TII, &Mapper](const SuffixTreeNode &Curr,
+ size_t StringLen, unsigned EndVal) {
+
+ // The root represents the empty string.
+ if (Curr.isRoot())
+ return 0u;
+
+    // Is this long enough to outline?
+    // TODO: Let the target decide how "long" a string is in terms of the
+    // sizes of the instructions in the string. For example, if a call is
+    // smaller than the sequence it would replace, even a one-instruction
+    // string is worth outlining.
+ if (StringLen < 2)
+ return 0u;
+
+ size_t Occurrences = Curr.OccurrenceCount;
+
+ // Anything we want to outline has to appear at least twice.
+ if (Occurrences < 2)
+ return 0u;
+
+ // Check if the last instruction in the sequence is a return.
+ MachineInstr *LastInstr =
+ Mapper.IntegerInstructionMap[EndVal];
+ assert(LastInstr && "Last instruction in sequence was unmapped!");
+
+ // The only way a terminator could be mapped as legal is if it was safe to
+ // tail call.
+ bool IsTailCall = LastInstr->isTerminator();
+ return TII.getOutliningBenefit(StringLen, Occurrences, IsTailCall);
+ };
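+
+  // For example (illustrative; the actual numbers are target-defined by
+  // getOutliningBenefit): a 3-instruction sequence occurring 4 times costs
+  // 12 instructions inline, versus roughly 4 calls plus a 3-instruction
+  // body and a return when outlined.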
+
+ MaxCandidateLen = ST.findCandidates(CandidateList, FunctionList, BenefitFn);
+
+ for (auto &OF : FunctionList)
+    OF.IsTailCall =
+        Mapper.IntegerInstructionMap[OF.Sequence.back()]->isTerminator();
+
+  // Sort the candidates in descending order. This will simplify the outlining
+ // process when we have to remove the candidates from the mapping by
+ // allowing us to cut them out without keeping track of an offset.
+ std::stable_sort(CandidateList.begin(), CandidateList.end());
+
+ return MaxCandidateLen;
+}
+
+MachineFunction *
+MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+ InstructionMapper &Mapper) {
+
+  // Create the function name. This should be unique. For now, just number
+  // the outlined functions in the order they are created.
+ std::ostringstream NameStream;
+ NameStream << "OUTLINED_FUNCTION" << "_" << OF.Name;
+
+ // Create the function using an IR-level function.
+ LLVMContext &C = M.getContext();
+ Function *F = dyn_cast<Function>(
+ M.getOrInsertFunction(NameStream.str(), Type::getVoidTy(C)));
+ assert(F && "Function was null!");
+
+  // NOTE: If this is linkonce_odr, then we can take advantage of linker
+  // deduping, which gives us better results when we outline from
+  // linkonce_odr functions.
+ F->setLinkage(GlobalValue::PrivateLinkage);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+ IRBuilder<> Builder(EntryBB);
+ Builder.CreateRetVoid();
+
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
+ MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ // Insert the new function into the module.
+ MF.insert(MF.begin(), &MBB);
+
+ TII.insertOutlinerPrologue(MBB, MF, OF.IsTailCall);
+
+ // Copy over the instructions for the function using the integer mappings in
+ // its sequence.
+ for (unsigned Str : OF.Sequence) {
+ MachineInstr *NewMI =
+ MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second);
+ NewMI->dropMemRefs();
+
+ // Don't keep debug information for outlined instructions.
+ // FIXME: This means outlined functions are currently undebuggable.
+ NewMI->setDebugLoc(DebugLoc());
+ MBB.insert(MBB.end(), NewMI);
+ }
+
+ TII.insertOutlinerEpilogue(MBB, MF, OF.IsTailCall);
+
+ return &MF;
+}
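+
+// Resulting IR shell (illustrative): the outlined body lives at the machine
+// level, so the IR function created above is just
+//
+//   define private void @OUTLINED_FUNCTION_0() unnamed_addr {
+//   entry:
+//     ret void
+//   }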
+
+bool MachineOutliner::outline(Module &M,
+ const ArrayRef<Candidate> &CandidateList,
+ std::vector<OutlinedFunction> &FunctionList,
+ InstructionMapper &Mapper) {
+
+ bool OutlinedSomething = false;
+
+ // Replace the candidates with calls to their respective outlined functions.
+ for (const Candidate &C : CandidateList) {
+
+ // Was the candidate removed during pruneOverlaps?
+ if (!C.InCandidateList)
+ continue;
+
+ // If not, then look at its OutlinedFunction.
+ OutlinedFunction &OF = FunctionList[C.FunctionIdx];
+
+ // Was its OutlinedFunction made unbeneficial during pruneOverlaps?
+ if (OF.OccurrenceCount < 2 || OF.Benefit < 1)
+ continue;
+
+ // If not, then outline it.
+ assert(C.StartIdx < Mapper.InstrList.size() && "Candidate out of bounds!");
+ MachineBasicBlock *MBB = (*Mapper.InstrList[C.StartIdx]).getParent();
+ MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.StartIdx];
+ unsigned EndIdx = C.StartIdx + C.Len - 1;
+
+ assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!");
+ MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+ assert(EndIt != MBB->end() && "EndIt out of bounds!");
+
+ EndIt++; // Erase needs one past the end index.
+
+ // Does this candidate have a function yet?
+ if (!OF.MF) {
+ OF.MF = createOutlinedFunction(M, OF, Mapper);
+ FunctionsCreated++;
+ }
+
+ MachineFunction *MF = OF.MF;
+ const TargetSubtargetInfo &STI = MF->getSubtarget();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ // Insert a call to the new function and erase the old sequence.
+ TII.insertOutlinedCall(M, *MBB, StartIt, *MF, OF.IsTailCall);
+ StartIt = Mapper.InstrList[C.StartIdx];
+ MBB->erase(StartIt, EndIt);
+
+ OutlinedSomething = true;
+
+ // Statistics.
+ NumOutlined++;
+ }
+
+ DEBUG (
+ dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";
+ );
+
+ return OutlinedSomething;
+}
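+
+// Replacement sketch (illustrative): for a candidate covering the half-open
+// range [StartIt, EndIt), insertOutlinedCall() places the call ahead of the
+// sequence and the original instructions are then erased, leaving only the
+// call in the basic block.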
+
+bool MachineOutliner::runOnModule(Module &M) {
+
+ // Is there anything in the module at all?
+ if (M.empty())
+ return false;
+
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ const TargetSubtargetInfo &STI = MMI.getOrCreateMachineFunction(*M.begin())
+ .getSubtarget();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+
+ InstructionMapper Mapper;
+
+ // Build instruction mappings for each function in the module.
+ for (Function &F : M) {
+ MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
+
+ // Is the function empty? Safe to outline from?
+ if (F.empty() || !TII->isFunctionSafeToOutlineFrom(MF))
+ continue;
+
+ // If it is, look at each MachineBasicBlock in the function.
+ for (MachineBasicBlock &MBB : MF) {
+
+ // Is there anything in MBB?
+ if (MBB.empty())
+ continue;
+
+ // If yes, map it.
+ Mapper.convertToUnsignedVec(MBB, *TRI, *TII);
+ }
+ }
+
+ // Construct a suffix tree, use it to find candidates, and then outline them.
+ SuffixTree ST(Mapper.UnsignedVec);
+ std::vector<Candidate> CandidateList;
+ std::vector<OutlinedFunction> FunctionList;
+
+ // Find all of the outlining candidates.
+ unsigned MaxCandidateLen =
+ buildCandidateList(CandidateList, FunctionList, ST, Mapper, *TII);
+
+ // Remove candidates that overlap with other candidates.
+ pruneOverlaps(CandidateList, FunctionList, MaxCandidateLen, *TII);
+
+ // Outline each of the candidates and return true if something was outlined.
+ return outline(M, CandidateList, FunctionList, Mapper);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp
index 43a1809..19e9a50 100644
--- a/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/contrib/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -61,7 +61,6 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/SetVector.h"
@@ -69,6 +68,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -552,7 +552,9 @@ public:
os << "\n";
}
- void dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
};
 /// This class represents the scheduled code. The main data structure is a
@@ -593,7 +595,7 @@ private:
/// Virtual register information.
MachineRegisterInfo &MRI;
- DFAPacketizer *Resources;
+ std::unique_ptr<DFAPacketizer> Resources;
public:
SMSchedule(MachineFunction *mf)
@@ -604,13 +606,6 @@ public:
InitiationInterval = 0;
}
- ~SMSchedule() {
- ScheduledInstrs.clear();
- InstrToCycle.clear();
- RegToStageDiff.clear();
- delete Resources;
- }
-
void reset() {
ScheduledInstrs.clear();
InstrToCycle.clear();
@@ -720,13 +715,13 @@ char MachinePipeliner::ID = 0;
int MachinePipeliner::NumTries = 0;
#endif
char &llvm::MachinePipelinerID = MachinePipeliner::ID;
-INITIALIZE_PASS_BEGIN(MachinePipeliner, "pipeliner",
+INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
"Modulo Software Pipelining", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(MachinePipeliner, "pipeliner",
+INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
"Modulo Software Pipelining", false, false)
/// The "main" function for implementing Swing Modulo Scheduling.
@@ -738,7 +733,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
return false;
if (mf.getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+ AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
!EnableSWPOptSize.getPosition())
return false;
@@ -960,7 +955,7 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
if (Phi.getOperand(i + 1).getMBB() != Loop)
InitVal = Phi.getOperand(i).getReg();
- else if (Phi.getOperand(i + 1).getMBB() == Loop)
+ else
LoopVal = Phi.getOperand(i).getReg();
assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
@@ -2514,7 +2509,7 @@ void SwingSchedulerDAG::generateExistingPhis(
MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
bool IsLast) {
- // Compute the stage number for the inital value of the Phi, which
+ // Compute the stage number for the initial value of the Phi, which
// comes from the prolog. The prolog to use depends on to which kernel/
// epilog that we're adding the Phi.
unsigned PrologStage = 0;
@@ -3480,7 +3475,7 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
// increment value to determine if the accesses may be loop carried.
if (OffsetS >= OffsetD)
return OffsetS + AccessSizeS > DeltaS;
- else if (OffsetS < OffsetD)
+ else
return OffsetD + AccessSizeD > DeltaD;
return true;
@@ -3980,5 +3975,7 @@ void SMSchedule::print(raw_ostream &os) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Utility function used for debugging to print the schedule.
-void SMSchedule::dump() const { print(dbgs()); }
+LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
+#endif
diff --git a/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp
index c3f6e92..4883779 100644
--- a/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp
+++ b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp
@@ -16,6 +16,10 @@
using namespace llvm;
+namespace llvm {
+template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTreeBase
+}
+
char MachinePostDominatorTree::ID = 0;
//declare initializeMachinePostDominatorTreePass
@@ -24,8 +28,7 @@ INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
- DT = new DominatorTreeBase<MachineBasicBlock>(true); //true indicate
- // postdominator
+ DT = new PostDomTreeBase<MachineBasicBlock>();
}
FunctionPass *
diff --git a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
index fc32183..1e74104 100644
--- a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
@@ -1,10 +1,21 @@
+//===- lib/Codegen/MachineRegionInfo.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineRegionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/RegionInfoImpl.h"
#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
-#define DEBUG_TYPE "region"
+#define DEBUG_TYPE "machine-region-info"
using namespace llvm;
@@ -12,36 +23,29 @@ STATISTIC(numMachineRegions, "The # of machine regions");
STATISTIC(numMachineSimpleRegions, "The # of simple machine regions");
namespace llvm {
+
template class RegionBase<RegionTraits<MachineFunction>>;
template class RegionNodeBase<RegionTraits<MachineFunction>>;
template class RegionInfoBase<RegionTraits<MachineFunction>>;
-}
+
+} // end namespace llvm
//===----------------------------------------------------------------------===//
// MachineRegion implementation
-//
MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
MachineRegionInfo* RI,
MachineDominatorTree *DT, MachineRegion *Parent) :
- RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {
-
-}
+ RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {}
-MachineRegion::~MachineRegion() { }
+MachineRegion::~MachineRegion() = default;
//===----------------------------------------------------------------------===//
// MachineRegionInfo implementation
-//
-
-MachineRegionInfo::MachineRegionInfo() :
- RegionInfoBase<RegionTraits<MachineFunction>>() {
-}
+MachineRegionInfo::MachineRegionInfo() = default;
-MachineRegionInfo::~MachineRegionInfo() {
-
-}
+MachineRegionInfo::~MachineRegionInfo() = default;
void MachineRegionInfo::updateStatistics(MachineRegion *R) {
++numMachineRegions;
@@ -74,9 +78,7 @@ MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) {
initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry());
}
-MachineRegionInfoPass::~MachineRegionInfoPass() {
-
-}
+MachineRegionInfoPass::~MachineRegionInfoPass() = default;
bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
releaseMemory();
@@ -86,6 +88,9 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
auto DF = &getAnalysis<MachineDominanceFrontier>();
RI.recalculate(F, DT, PDT, DF);
+
+ DEBUG(RI.dump());
+
return false;
}
@@ -103,9 +108,10 @@ void MachineRegionInfoPass::verifyAnalysis() const {
void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequiredTransitive<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<DominanceFrontierWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
@@ -119,22 +125,24 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {
#endif
char MachineRegionInfoPass::ID = 0;
+char &MachineRegionInfoPassID = MachineRegionInfoPass::ID;
-INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions",
- "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE,
+ "Detect single entry single exit regions", true, true)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
-INITIALIZE_PASS_END(MachineRegionInfoPass, "regions",
- "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE,
+ "Detect single entry single exit regions", true, true)
// Create methods available outside of this file, to use them
// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
// the link time optimization.
namespace llvm {
- FunctionPass *createMachineRegionInfoPass() {
- return new MachineRegionInfoPass();
- }
+
+FunctionPass *createMachineRegionInfoPass() {
+ return new MachineRegionInfoPass();
}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 242cb0b..9a92ee2 100644
--- a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===//
+//===- lib/Codegen/MachineRegisterInfo.cpp --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,12 +12,26 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
using namespace llvm;
@@ -28,9 +42,9 @@ static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden,
void MachineRegisterInfo::Delegate::anchor() {}
MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
- : MF(MF), TheDelegate(nullptr),
- TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() &&
- EnableSubRegLiveness) {
+ : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() &&
+ EnableSubRegLiveness),
+ IsUpdatedCSRsInitialized(false) {
unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
@@ -444,8 +458,8 @@ LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const {
return TRC.getLaneMask();
}
-#ifndef NDEBUG
-void MachineRegisterInfo::dumpUses(unsigned Reg) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(unsigned Reg) const {
for (MachineInstr &I : use_instructions(Reg))
I.dump();
}
@@ -543,3 +557,47 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const {
}
return false;
}
+
+void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) {
+
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ assert(Reg && (Reg < TRI->getNumRegs()) &&
+ "Trying to disable an invalid register");
+
+ if (!IsUpdatedCSRsInitialized) {
+ const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+ for (const MCPhysReg *I = CSR; *I; ++I)
+ UpdatedCSRs.push_back(*I);
+
+ // Zero value represents the end of the register list
+ // (no more registers should be pushed).
+ UpdatedCSRs.push_back(0);
+
+ IsUpdatedCSRsInitialized = true;
+ }
+
+  // Remove the register (and its aliases) from the list.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI),
+ UpdatedCSRs.end());
+}
+
+const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const {
+ if (IsUpdatedCSRsInitialized)
+ return UpdatedCSRs.data();
+
+ return getTargetRegisterInfo()->getCalleeSavedRegs(MF);
+}
+
+void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) {
+ if (IsUpdatedCSRsInitialized)
+ UpdatedCSRs.clear();
+
+ for (MCPhysReg Reg : CSRs)
+ UpdatedCSRs.push_back(Reg);
+
+ // Zero value represents the end of the register list
+ // (no more registers should be pushed).
+ UpdatedCSRs.push_back(0);
+ IsUpdatedCSRsInitialized = true;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
index e06bc51..eaba9a5 100644
--- a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -13,29 +13,66 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/ScheduleDFS.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
namespace llvm {
+
cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
cl::desc("Force top-down list scheduling"));
cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
@@ -43,7 +80,8 @@ cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
cl::desc("Print critical path length to stdout"));
-}
+
+} // end namespace llvm
#ifndef NDEBUG
static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
@@ -80,10 +118,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
cl::desc("Enable memop clustering."),
cl::init(true));
-// Experimental heuristics
-static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
- cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
cl::desc("Verify machine instrs before and after machine scheduling"));
@@ -92,14 +126,14 @@ static const unsigned MinSubtreeSize = 8;
// Pin the vtables to this file.
void MachineSchedStrategy::anchor() {}
+
void ScheduleDAGMutation::anchor() {}
//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
-MachineSchedContext::MachineSchedContext():
- MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) {
+MachineSchedContext::MachineSchedContext() {
RegClassInfo = new RegisterClassInfo();
}
@@ -108,6 +142,7 @@ MachineSchedContext::~MachineSchedContext() {
}
namespace {
+
/// Base class for a machine scheduler class that can run at any point.
class MachineSchedulerBase : public MachineSchedContext,
public MachineFunctionPass {
@@ -149,18 +184,20 @@ public:
protected:
ScheduleDAGInstrs *createPostMachineScheduler();
};
-} // namespace
+
+} // end anonymous namespace
char MachineScheduler::ID = 0;
char &llvm::MachineSchedulerID = MachineScheduler::ID;
-INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",
+INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
"Machine Instruction Scheduler", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler",
+INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE,
"Machine Instruction Scheduler", false, false)
MachineScheduler::MachineScheduler()
@@ -211,7 +248,7 @@ static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
/// MachineSchedOpt allows command line selection of the scheduler.
static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
- RegisterPassParser<MachineSchedRegistry> >
+ RegisterPassParser<MachineSchedRegistry>>
MachineSchedOpt("misched",
cl::init(&useDefaultMachineSched), cl::Hidden,
cl::desc("Machine instruction scheduler to use"));
@@ -448,7 +485,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
// instruction stream until we find the nearest boundary.
unsigned NumRegionInstrs = 0;
MachineBasicBlock::iterator I = RegionEnd;
- for (;I != MBB->begin(); --I) {
+ for (; I != MBB->begin(); --I) {
MachineInstr &MI = *std::prev(I);
if (isSchedBoundary(&MI, &*MBB, MF, TII))
break;
@@ -495,7 +532,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
// thumb2 size reduction is currently an exception, so the PostMIScheduler
// needs to do this.
if (FixKillFlags)
- Scheduler.fixupKills(&*MBB);
+ Scheduler.fixupKills(*MBB);
}
Scheduler.finalizeSchedule();
}
@@ -504,13 +541,14 @@ void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
// unimplemented
}
-LLVM_DUMP_METHOD
-void ReadyQueue::dump() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ReadyQueue::dump() const {
dbgs() << "Queue " << Name << ": ";
- for (unsigned i = 0, e = Queue.size(); i < e; ++i)
- dbgs() << Queue[i]->NodeNum << " ";
+ for (const SUnit *SU : Queue)
+ dbgs() << SU->NodeNum << " ";
dbgs() << "\n";
}
+#endif
//===----------------------------------------------------------------------===//
// ScheduleDAGMI - Basic machine instruction scheduling. This is
@@ -519,8 +557,7 @@ void ReadyQueue::dump() {
// ===----------------------------------------------------------------------===/
// Provide a vtable anchor.
-ScheduleDAGMI::~ScheduleDAGMI() {
-}
+ScheduleDAGMI::~ScheduleDAGMI() = default;
bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
@@ -572,10 +609,8 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- releaseSucc(SU, &*I);
- }
+ for (SDep &Succ : SU->Succs)
+ releaseSucc(SU, &Succ);
}
/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
@@ -611,10 +646,8 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- releasePred(SU, &*I);
- }
+ for (SDep &Pred : SU->Preds)
+ releasePred(SU, &Pred);
}
/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
@@ -687,8 +720,8 @@ void ScheduleDAGMI::schedule() {
DEBUG(
if (EntrySU.getInstr() != nullptr)
EntrySU.dumpAll(this);
- for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- SUnits[su].dumpAll(this);
+ for (const SUnit &SU : SUnits)
+ SU.dumpAll(this);
if (ExitSU.getInstr() != nullptr)
ExitSU.dumpAll(this);
);
@@ -749,28 +782,25 @@ void ScheduleDAGMI::schedule() {
/// Apply each ScheduleDAGMutation step in order.
void ScheduleDAGMI::postprocessDAG() {
- for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
- Mutations[i]->apply(this);
- }
+ for (auto &m : Mutations)
+ m->apply(this);
}
void ScheduleDAGMI::
findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
SmallVectorImpl<SUnit*> &BotRoots) {
- for (std::vector<SUnit>::iterator
- I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
- SUnit *SU = &(*I);
- assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");
+ for (SUnit &SU : SUnits) {
+ assert(!SU.isBoundaryNode() && "Boundary node should not be in SUnits");
// Order predecessors so DFSResult follows the critical path.
- SU->biasCriticalPath();
+ SU.biasCriticalPath();
// A SUnit is ready to top schedule if it has no predecessors.
- if (!I->NumPredsLeft)
- TopRoots.push_back(SU);
+ if (!SU.NumPredsLeft)
+ TopRoots.push_back(&SU);
// A SUnit is ready to bottom schedule if it has no successors.
- if (!I->NumSuccsLeft)
- BotRoots.push_back(SU);
+ if (!SU.NumSuccsLeft)
+ BotRoots.push_back(&SU);
}
ExitSU.biasCriticalPath();
}
@@ -785,10 +815,9 @@ void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
//
// Nodes with unreleased weak edges can still be roots.
// Release top roots in forward order.
- for (SmallVectorImpl<SUnit*>::const_iterator
- I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) {
- SchedImpl->releaseTopNode(*I);
- }
+ for (SUnit *SU : TopRoots)
+ SchedImpl->releaseTopNode(SU);
+
// Release bottom roots in reverse order so the higher priority nodes appear
// first. This is more natural and slightly more efficient.
for (SmallVectorImpl<SUnit*>::const_reverse_iterator
@@ -825,7 +854,7 @@ void ScheduleDAGMI::placeDebugValues() {
RegionBegin = FirstDbgValue;
}
- for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
MachineInstr *DbgValue = P.first;
@@ -841,7 +870,7 @@ void ScheduleDAGMI::placeDebugValues() {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void ScheduleDAGMI::dumpSchedule() const {
+LLVM_DUMP_METHOD void ScheduleDAGMI::dumpSchedule() const {
for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
if (SUnit *SU = getSUnit(&(*MI)))
SU->dump(this);
@@ -992,9 +1021,9 @@ void ScheduleDAGMILive::initRegPressure() {
}
}
DEBUG(dbgs() << "Excess PSets: ";
- for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+ for (const PressureChange &RCPS : RegionCriticalPSets)
dbgs() << TRI->getRegPressureSetName(
- RegionCriticalPSets[i].getPSet()) << " ";
+ RCPS.getPSet()) << " ";
dbgs() << "\n");
}
@@ -1003,16 +1032,15 @@ updateScheduledPressure(const SUnit *SU,
const std::vector<unsigned> &NewMaxPressure) {
const PressureDiff &PDiff = getPressureDiff(SU);
unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
- for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
- I != E; ++I) {
- if (!I->isValid())
+ for (const PressureChange &PC : PDiff) {
+ if (!PC.isValid())
break;
- unsigned ID = I->getPSet();
+ unsigned ID = PC.getPSet();
while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
++CritIdx;
if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
- && NewMaxPressure[ID] <= INT16_MAX)
+ && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
}
unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
@@ -1136,6 +1164,12 @@ void ScheduleDAGMILive::schedule() {
dbgs() << " Pressure Diff : ";
getPressureDiff(&SU).dump(*TRI);
}
+ dbgs() << " Single Issue : ";
+ if (SchedModel.mustBeginGroup(SU.getInstr()) &&
+ SchedModel.mustEndGroup(SU.getInstr()))
+ dbgs() << "true;";
+ else
+ dbgs() << "false;";
dbgs() << '\n';
}
if (ExitSU.getInstr() != nullptr)
@@ -1396,6 +1430,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Post-process the DAG to create cluster edges between neighboring
/// loads or between neighboring stores.
class BaseMemOpClusterMutation : public ScheduleDAGMutation {
@@ -1403,6 +1438,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
SUnit *SU;
unsigned BaseReg;
int64_t Offset;
+
MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
: SU(su), BaseReg(reg), Offset(ofs) {}
@@ -1439,31 +1475,31 @@ public:
LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
: BaseMemOpClusterMutation(tii, tri, true) {}
};
-} // anonymous
+
+} // end anonymous namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createLoadClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
- return EnableMemOpCluster ? make_unique<LoadClusterMutation>(TII, TRI)
+ return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI)
: nullptr;
}
std::unique_ptr<ScheduleDAGMutation>
createStoreClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
- return EnableMemOpCluster ? make_unique<StoreClusterMutation>(TII, TRI)
+ return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI)
: nullptr;
}
-} // namespace llvm
+} // end namespace llvm
void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) {
SmallVector<MemOpInfo, 32> MemOpRecords;
- for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) {
- SUnit *SU = MemOps[Idx];
+ for (SUnit *SU : MemOps) {
unsigned BaseReg;
int64_t Offset;
if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI))
@@ -1491,12 +1527,11 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
// dependent on SUa can prevent load combining due to register reuse.
// Predecessor edges do not need to be copied from SUb to SUa since nearby
// loads should have effectively the same inputs.
- for (SUnit::const_succ_iterator
- SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
- if (SI->getSUnit() == SUb)
+ for (const SDep &Succ : SUa->Succs) {
+ if (Succ.getSUnit() == SUb)
continue;
- DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
- DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
+ DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
+ DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
}
++ClusterLength;
} else
@@ -1513,17 +1548,15 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
DenseMap<unsigned, unsigned> StoreChainIDs;
// Map each store chain to a set of dependent MemOps.
SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
- for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
- SUnit *SU = &DAG->SUnits[Idx];
- if ((IsLoad && !SU->getInstr()->mayLoad()) ||
- (!IsLoad && !SU->getInstr()->mayStore()))
+ for (SUnit &SU : DAG->SUnits) {
+ if ((IsLoad && !SU.getInstr()->mayLoad()) ||
+ (!IsLoad && !SU.getInstr()->mayStore()))
continue;
unsigned ChainPredID = DAG->SUnits.size();
- for (SUnit::const_pred_iterator
- PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
- if (PI->isCtrl()) {
- ChainPredID = PI->getSUnit()->NodeNum;
+ for (const SDep &Pred : SU.Preds) {
+ if (Pred.isCtrl()) {
+ ChainPredID = Pred.getSUnit()->NodeNum;
break;
}
}
@@ -1534,82 +1567,12 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
if (Result.second)
StoreChainDependents.resize(NumChains + 1);
- StoreChainDependents[Result.first->second].push_back(SU);
+ StoreChainDependents[Result.first->second].push_back(&SU);
}
// Iterate over the store chains.
- for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
- clusterNeighboringMemOps(StoreChainDependents[Idx], DAG);
-}
-
-//===----------------------------------------------------------------------===//
-// MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class MacroFusion : public ScheduleDAGMutation {
- const TargetInstrInfo &TII;
-public:
- MacroFusion(const TargetInstrInfo &TII)
- : TII(TII) {}
-
- void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-} // anonymous
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
- return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr;
-}
-
-} // namespace llvm
-
-/// \brief Callback from DAG postProcessing to create cluster edges to encourage
-/// fused operations.
-void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
- ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
- // For now, assume targets can only fuse with the branch.
- SUnit &ExitSU = DAG->ExitSU;
- MachineInstr *Branch = ExitSU.getInstr();
- if (!Branch)
- return;
-
- for (SDep &PredDep : ExitSU.Preds) {
- if (PredDep.isWeak())
- continue;
- SUnit &SU = *PredDep.getSUnit();
- MachineInstr &Pred = *SU.getInstr();
- if (!TII.shouldScheduleAdjacent(Pred, *Branch))
- continue;
-
- // Create a single weak edge from SU to ExitSU. The only effect is to cause
- // bottom-up scheduling to heavily prioritize the clustered SU. There is no
- // need to copy predecessor edges from ExitSU to SU, since top-down
- // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
- // of SU, we could create an artificial edge from the deepest root, but it
- // hasn't been needed yet.
- bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
- (void)Success;
- assert(Success && "No DAG nodes should be reachable from ExitSU");
-
- // Adjust latency of data deps between the nodes.
- for (SDep &PredDep : ExitSU.Preds) {
- if (PredDep.getSUnit() == &SU)
- PredDep.setLatency(0);
- }
- for (SDep &SuccDep : SU.Succs) {
- if (SuccDep.getSUnit() == &ExitSU)
- SuccDep.setLatency(0);
- }
-
- DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
- break;
- }
+ for (auto &SCD : StoreChainDependents)
+ clusterNeighboringMemOps(SCD, DAG);
}
//===----------------------------------------------------------------------===//
@@ -1617,6 +1580,7 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Post-process the DAG to create weak edges from all uses of a copy to
/// the one use that defines the copy's source vreg, most likely an induction
/// variable increment.
@@ -1626,6 +1590,7 @@ class CopyConstrain : public ScheduleDAGMutation {
// RegionEndIdx is the slot index of the last non-debug instruction in the
// scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
SlotIndex RegionEndIdx;
+
public:
CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
@@ -1634,17 +1599,18 @@ public:
protected:
void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
};
-} // anonymous
+
+} // end anonymous namespace
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI) {
- return make_unique<CopyConstrain>(TII, TRI);
+ const TargetRegisterInfo *TRI) {
+ return llvm::make_unique<CopyConstrain>(TII, TRI);
}
-} // namespace llvm
+} // end namespace llvm
/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
@@ -1749,16 +1715,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
- for (SUnit::const_succ_iterator
- I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
- I != E; ++I) {
- if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
+ for (const SDep &Succ : LastLocalSU->Succs) {
+ if (Succ.getKind() != SDep::Data || Succ.getReg() != LocalReg)
continue;
- if (I->getSUnit() == GlobalSU)
+ if (Succ.getSUnit() == GlobalSU)
continue;
- if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
+ if (!DAG->canAddEdge(GlobalSU, Succ.getSUnit()))
return;
- LocalUses.push_back(I->getSUnit());
+ LocalUses.push_back(Succ.getSUnit());
}
// Open the top of the GlobalLI hole by constraining any earlier global uses
// to precede the start of LocalLI.
@@ -1766,15 +1730,14 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
MachineInstr *FirstLocalDef =
LIS->getInstructionFromIndex(LocalLI->beginIndex());
SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
- for (SUnit::const_pred_iterator
- I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
- if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
+ for (const SDep &Pred : GlobalSU->Preds) {
+ if (Pred.getKind() != SDep::Anti || Pred.getReg() != GlobalReg)
continue;
- if (I->getSUnit() == FirstLocalSU)
+ if (Pred.getSUnit() == FirstLocalSU)
continue;
- if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
+ if (!DAG->canAddEdge(FirstLocalSU, Pred.getSUnit()))
return;
- GlobalUses.push_back(I->getSUnit());
+ GlobalUses.push_back(Pred.getSUnit());
}
DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
// Add the weak edges.
@@ -1805,12 +1768,11 @@ void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
RegionEndIdx = DAG->getLIS()->getInstructionIndex(
*priorNonDebug(DAG->end(), DAG->begin()));
- for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
- SUnit *SU = &DAG->SUnits[Idx];
- if (!SU->getInstr()->isCopy())
+ for (SUnit &SU : DAG->SUnits) {
+ if (!SU.getInstr()->isCopy())
continue;
- constrainLocalCopy(SU, static_cast<ScheduleDAGMILive*>(DAG));
+ constrainLocalCopy(&SU, static_cast<ScheduleDAGMILive*>(DAG));
}
}
@@ -1836,7 +1798,7 @@ void SchedBoundary::reset() {
CheckPending = false;
CurrCycle = 0;
CurrMOps = 0;
- MinReadyCycle = UINT_MAX;
+ MinReadyCycle = std::numeric_limits<unsigned>::max();
ExpectedLatency = 0;
DependentLatency = 0;
RetiredMOps = 0;
@@ -1861,10 +1823,9 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
if (!SchedModel->hasInstrSchedModel())
return;
RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
- for (std::vector<SUnit>::iterator
- I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
- const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
- RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC)
+ for (SUnit &SU : DAG->SUnits) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(&SU);
+ RemIssueCount += SchedModel->getNumMicroOps(SU.getInstr(), SC)
* SchedModel->getMicroOpFactor();
for (TargetSchedModel::ProcResIter
PI = SchedModel->getWriteProcResBegin(SC),
@@ -1937,12 +1898,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
&& HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
return true;
}
+
unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
<< SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
return true;
}
+
+ if (CurrMOps > 0 &&
+ ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
+ (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
+ DEBUG(dbgs() << " hazard: SU(" << SU->NodeNum << ") must "
+ << (isTop()? "begin" : "end") << " group\n");
+ return true;
+ }
+
if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
for (TargetSchedModel::ProcResIter
@@ -1968,12 +1939,11 @@ unsigned SchedBoundary::
findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
SUnit *LateSU = nullptr;
unsigned RemLatency = 0;
- for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
- I != E; ++I) {
- unsigned L = getUnscheduledLatency(*I);
+ for (SUnit *SU : ReadySUs) {
+ unsigned L = getUnscheduledLatency(SU);
if (L > RemLatency) {
RemLatency = L;
- LateSU = *I;
+ LateSU = SU;
}
}
if (LateSU) {
@@ -2039,7 +2009,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
/// Move the boundary of scheduled code by one cycle.
void SchedBoundary::bumpCycle(unsigned NextCycle) {
if (SchedModel->getMicroOpBufferSize() == 0) {
- assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
+ "MinReadyCycle uninitialized");
if (MinReadyCycle > NextCycle)
NextCycle = MinReadyCycle;
}
@@ -2237,6 +2208,18 @@ void SchedBoundary::bumpNode(SUnit *SU) {
// one cycle. Since we commonly reach the max MOps here, opportunistically
// bump the cycle to avoid uselessly checking everything in the readyQ.
CurrMOps += IncMOps;
+
+ // Bump the cycle count for issue group constraints.
+ // This must be done after NextCycle has been adjusted for all other stalls.
+ // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
+ // CurrCycle to X.
+ if ((isTop() && SchedModel->mustEndGroup(SU->getInstr())) ||
+ (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
+ DEBUG(dbgs() << " Bump cycle to "
+ << (isTop() ? "end" : "begin") << " group\n");
+ bumpCycle(++NextCycle);
+ }
+
while (CurrMOps >= SchedModel->getIssueWidth()) {
DEBUG(dbgs() << " *** Max MOps " << CurrMOps
<< " at cycle " << CurrCycle << '\n');
@@ -2250,7 +2233,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {
void SchedBoundary::releasePending() {
// If the available queue is empty, it is safe to reset MinReadyCycle.
if (Available.empty())
- MinReadyCycle = UINT_MAX;
+ MinReadyCycle = std::numeric_limits<unsigned>::max();
// Check to see if any of the pending instructions are ready to issue. If
// so, add them to the available queue.
@@ -2323,10 +2306,10 @@ SUnit *SchedBoundary::pickOnlyChoice() {
return nullptr;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// This is useful information to dump after bumpNode.
// Note that the Queue contents are more useful before pickNodeFromQueue.
-void SchedBoundary::dumpScheduledState() {
+LLVM_DUMP_METHOD void SchedBoundary::dumpScheduledState() const {
unsigned ResFactor;
unsigned ResCount;
if (ZoneCritResIdx) {
@@ -2665,12 +2648,15 @@ void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
}
}
-void GenericScheduler::dumpPolicy() {
+void GenericScheduler::dumpPolicy() const {
+ // The virtual function cannot be removed entirely, even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << "GenericScheduler RegionPolicy: "
<< " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
<< " OnlyTopDown=" << RegionPolicy.OnlyTopDown
<< " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
<< "\n";
+#endif
}
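
The guard above relies on an idiom worth spelling out: a virtual function cannot be compiled away entirely, since its vtable slot must exist in every build mode, so only the body is conditionally removed. A minimal sketch of the same pattern outside LLVM:

#include <iostream>

struct SchedulerBase {
  virtual ~SchedulerBase() = default;
  // The override must exist in all builds to keep the vtable stable.
  virtual void dumpPolicy() const {}
};

struct MyScheduler : SchedulerBase {
  void dumpPolicy() const override {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    std::cerr << "MyScheduler policy state\n"; // debug-only body
#endif
  }
};
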
/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
@@ -2714,17 +2700,16 @@ void GenericScheduler::registerRoots() {
Rem.CriticalPath = DAG->ExitSU.getDepth();
// Some roots may not feed into ExitSU. Check all of them in case.
- for (std::vector<SUnit*>::const_iterator
- I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
- if ((*I)->getDepth() > Rem.CriticalPath)
- Rem.CriticalPath = (*I)->getDepth();
+ for (const SUnit *SU : Bot.Available) {
+ if (SU->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = SU->getDepth();
}
DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
if (DumpCriticalPathLength) {
errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
}
- if (EnableCyclicPath) {
+ if (EnableCyclicPath && SchedModel->getMicroOpBufferSize() > 0) {
Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
checkAcyclicLatency();
}
@@ -2964,10 +2949,10 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
ReadyQueue &Q = Zone.Available;
- for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
- initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, TempTracker);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
@@ -3106,7 +3091,6 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
}
void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
-
MachineBasicBlock::iterator InsertPos = SU->getInstr();
if (!isTop)
++InsertPos;
@@ -3114,18 +3098,17 @@ void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
// Find already scheduled copies with a single physreg dependence and move
// them just above the scheduled instruction.
- for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end();
- I != E; ++I) {
- if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg()))
+ for (SDep &Dep : Deps) {
+ if (Dep.getKind() != SDep::Data || !TRI->isPhysicalRegister(Dep.getReg()))
continue;
- SUnit *DepSU = I->getSUnit();
+ SUnit *DepSU = Dep.getSUnit();
if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
continue;
MachineInstr *Copy = DepSU->getInstr();
if (!Copy->isCopy())
continue;
DEBUG(dbgs() << " Rescheduling physreg copy ";
- I->getSUnit()->dump(DAG));
+ Dep.getSUnit()->dump(DAG));
DAG->moveInstruction(Copy, InsertPos);
}
}
@@ -3154,7 +3137,8 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
/// Create the standard converging machine scheduler. This will be used as the
/// default scheduler if the target does not set a default.
ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
- ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
// Register DAG post-processors.
//
// FIXME: extend the mutation API to allow earlier mutations to instantiate
@@ -3195,15 +3179,13 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
}
}
-
void PostGenericScheduler::registerRoots() {
Rem.CriticalPath = DAG->ExitSU.getDepth();
// Some roots may not feed into ExitSU. Check all of them in case.
- for (SmallVectorImpl<SUnit*>::const_iterator
- I = BotRoots.begin(), E = BotRoots.end(); I != E; ++I) {
- if ((*I)->getDepth() > Rem.CriticalPath)
- Rem.CriticalPath = (*I)->getDepth();
+ for (const SUnit *SU : BotRoots) {
+ if (SU->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = SU->getDepth();
}
DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
if (DumpCriticalPathLength) {
@@ -3229,6 +3211,12 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return;
+ // Keep clustered nodes together.
+ if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(),
+ Cand.SU == DAG->getNextClusterSucc(),
+ TryCand, Cand, Cluster))
+ return;
+
// Avoid critical resource consumption and balance the schedule.
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
@@ -3250,9 +3238,9 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
ReadyQueue &Q = Top.Available;
- for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ for (SUnit *SU : Q) {
SchedCandidate TryCand(Cand.Policy);
- TryCand.SU = *I;
+ TryCand.SU = SU;
TryCand.AtTop = true;
TryCand.initResourceDelta(DAG, SchedModel);
tryCandidate(Cand, TryCand);
@@ -3302,7 +3290,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
}
ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
- return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C),
+ return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C),
/*RemoveKillFlags=*/true);
}
@@ -3311,14 +3299,14 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
//===----------------------------------------------------------------------===//
namespace {
+
/// \brief Order nodes by the ILP metric.
struct ILPOrder {
- const SchedDFSResult *DFSResult;
- const BitVector *ScheduledTrees;
+ const SchedDFSResult *DFSResult = nullptr;
+ const BitVector *ScheduledTrees = nullptr;
bool MaximizeILP;
- ILPOrder(bool MaxILP)
- : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {}
+ ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
/// \brief Apply a less-than relation on node priority.
///
@@ -3347,12 +3335,13 @@ struct ILPOrder {
/// \brief Schedule based on the ILP metric.
class ILPScheduler : public MachineSchedStrategy {
- ScheduleDAGMILive *DAG;
+ ScheduleDAGMILive *DAG = nullptr;
ILPOrder Cmp;
std::vector<SUnit*> ReadyQ;
+
public:
- ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {}
+ ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
void initialize(ScheduleDAGMI *dag) override {
assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
@@ -3405,14 +3394,16 @@ public:
std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
}
};
-} // namespace
+
+} // end anonymous namespace
static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true));
+ return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true));
}
static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false));
+ return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false));
}
+
static MachineSchedRegistry ILPMaxRegistry(
"ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
static MachineSchedRegistry ILPMinRegistry(
@@ -3424,6 +3415,7 @@ static MachineSchedRegistry ILPMinRegistry(
#ifndef NDEBUG
namespace {
+
/// Apply a less-than relation on the node order, which corresponds to the
/// instruction order prior to scheduling. IsReverse implements greater-than.
template<bool IsReverse>
@@ -3444,11 +3436,12 @@ class InstructionShuffler : public MachineSchedStrategy {
// Using a less-than relation (SUnitOrder<false>) for the TopQ priority
// gives nodes with a higher number higher priority causing the latest
// instructions to be scheduled first.
- PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> >
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
TopQ;
// When scheduling bottom-up, use greater-than as the queue priority.
- PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> >
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
BottomQ;
+
public:
InstructionShuffler(bool alternate, bool topdown)
: IsAlternating(alternate), IsTopDown(topdown) {}
@@ -3492,15 +3485,18 @@ public:
BottomQ.push(SU);
}
};
-} // namespace
+
+} // end anonymous namespace
static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
bool Alternate = !ForceTopDown && !ForceBottomUp;
bool TopDown = !ForceBottomUp;
assert((TopDown || !ForceTopDown) &&
"-misched-topdown incompatible with -misched-bottomup");
- return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown));
+ return new ScheduleDAGMILive(
+ C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown));
}
+
static MachineSchedRegistry ShufflerRegistry(
"shuffle", "Shuffle machine instructions alternating directions",
createInstructionShuffler);
@@ -3518,8 +3514,7 @@ template<> struct GraphTraits<
template<>
struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
-
- DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
return G->MF.getName();
@@ -3576,7 +3571,8 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
return Str;
}
};
-} // namespace llvm
+
+} // end namespace llvm
#endif // NDEBUG
/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp
index 5f87b68..79e3fea 100644
--- a/contrib/llvm/lib/CodeGen/MachineSink.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp
@@ -16,7 +16,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SparseBitVector.h"
@@ -33,6 +32,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -173,14 +173,14 @@ namespace {
char MachineSinking::ID = 0;
char &llvm::MachineSinkingID = MachineSinking::ID;
-INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink",
- "Machine code sinking", false, false)
+INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE,
+ "Machine code sinking", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MachineSinking, "machine-sink",
- "Machine code sinking", false, false)
+INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE,
+ "Machine code sinking", false, false)
bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
MachineBasicBlock *MBB) {
diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index ef7e525..6c5abc6 100644
--- a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -1,4 +1,4 @@
-//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===//
+//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,20 +8,34 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
using namespace llvm;
@@ -30,16 +44,14 @@ using namespace llvm;
char MachineTraceMetrics::ID = 0;
char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
-INITIALIZE_PASS_BEGIN(MachineTraceMetrics,
- "machine-trace-metrics", "Machine Trace Metrics", false, true)
+INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE,
+ "Machine Trace Metrics", false, true)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(MachineTraceMetrics,
- "machine-trace-metrics", "Machine Trace Metrics", false, true)
+INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE,
+ "Machine Trace Metrics", false, true)
-MachineTraceMetrics::MachineTraceMetrics()
- : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr),
- MRI(nullptr), Loops(nullptr) {
+MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {
std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);
}
@@ -137,7 +149,6 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {
return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds);
}
-
//===----------------------------------------------------------------------===//
// Ensemble utility functions
//===----------------------------------------------------------------------===//
@@ -151,7 +162,7 @@ MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
}
// Virtual destructor serves as an anchor.
-MachineTraceMetrics::Ensemble::~Ensemble() {}
+MachineTraceMetrics::Ensemble::~Ensemble() = default;
const MachineLoop*
MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const {
@@ -297,6 +308,7 @@ static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {
// MinInstrCountEnsemble - Pick the trace that executes the least number of
// instructions.
namespace {
+
class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
const char *getName() const override { return "MinInstr"; }
const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override;
@@ -306,7 +318,8 @@ public:
MinInstrCountEnsemble(MachineTraceMetrics *mtm)
: MachineTraceMetrics::Ensemble(mtm) {}
};
-}
+
+} // end anonymous namespace
// Select the preferred predecessor for MBB.
const MachineBasicBlock*
@@ -409,25 +422,30 @@ void MachineTraceMetrics::verifyAnalysis() const {
// revisit blocks.
namespace {
+
struct LoopBounds {
MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;
SmallPtrSet<const MachineBasicBlock*, 8> Visited;
const MachineLoopInfo *Loops;
- bool Downward;
+ bool Downward = false;
+
LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
- const MachineLoopInfo *loops)
- : Blocks(blocks), Loops(loops), Downward(false) {}
+ const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {}
};
-}
+
+} // end anonymous namespace
// Specialize po_iterator_storage in order to prune the post-order traversal so
// it is limited to the current loop and doesn't traverse the loop back edges.
namespace llvm {
+
template<>
class po_iterator_storage<LoopBounds, true> {
LoopBounds &LB;
+
public:
po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+
void finishPostorder(const MachineBasicBlock*) {}
bool insertEdge(Optional<const MachineBasicBlock *> From,
@@ -452,7 +470,8 @@ public:
return LB.Visited.insert(To).second;
}
};
-}
+
+} // end namespace llvm
/// Compute the trace through MBB.
void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
@@ -603,6 +622,7 @@ void MachineTraceMetrics::Ensemble::verify() const {
// A data dependency is represented as a defining MI and operand numbers on the
// defining and using MI.
namespace {
+
struct DataDep {
const MachineInstr *DefMI;
unsigned DefOp;
@@ -622,7 +642,8 @@ struct DataDep {
assert((++DefI).atEnd() && "Register has multiple defs");
}
};
-}
+
+} // end anonymous namespace
// Get the input data dependencies that must be ready before UseMI can issue.
// Return true if UseMI has any physreg operands.
@@ -678,17 +699,19 @@ static void getPHIDeps(const MachineInstr &UseMI,
// direction instructions are scanned, it could be the operand that defined the
// regunit, or the highest operand to read the regunit.
namespace {
+
struct LiveRegUnit {
unsigned RegUnit;
- unsigned Cycle;
- const MachineInstr *MI;
- unsigned Op;
+ unsigned Cycle = 0;
+ const MachineInstr *MI = nullptr;
+ unsigned Op = 0;
unsigned getSparseSetIndex() const { return RegUnit; }
- LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {}
+ LiveRegUnit(unsigned RU) : RegUnit(RU) {}
};
-}
+
+} // end anonymous namespace
// Identify physreg dependencies for UseMI, and update the live regunit
// tracking set when scanning instructions downwards.
@@ -922,7 +945,6 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
return Height;
}
-
typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
// Push the height of DefMI upwards if required to match UseMI.
diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
index a98139f..fcb5448 100644
--- a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -23,7 +23,6 @@
// the verifier errors.
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetOperations.h"
@@ -36,6 +35,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
@@ -87,7 +88,6 @@ namespace {
RegSet regsLive;
RegVector regsDefined, regsDead, regsKilled;
RegMaskVector regMasks;
- RegSet regsLiveInButUnused;
SlotIndex lastIndex;
@@ -188,8 +188,9 @@ namespace {
return Reg < regsReserved.size() && regsReserved.test(Reg);
}
- bool isAllocatable(unsigned Reg) {
- return Reg < TRI->getNumRegs() && MRI->isAllocatable(Reg);
+ bool isAllocatable(unsigned Reg) const {
+ return Reg < TRI->getNumRegs() && TRI->isInAllocatableClass(Reg) &&
+ !regsReserved.test(Reg);
}
// Analysis information if available
@@ -260,8 +261,8 @@ namespace {
static char ID; // Pass ID, replacement for typeid
const std::string Banner;
- MachineVerifierPass(const std::string &banner = nullptr)
- : MachineFunctionPass(ID), Banner(banner) {
+ MachineVerifierPass(std::string banner = std::string())
+ : MachineFunctionPass(ID), Banner(std::move(banner)) {
initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
}
@@ -418,7 +419,6 @@ unsigned MachineVerifier::verify(MachineFunction &MF) {
regsDead.clear();
regsKilled.clear();
regMasks.clear();
- regsLiveInButUnused.clear();
MBBInfoMap.clear();
return foundErrors;
@@ -526,9 +526,11 @@ void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
void MachineVerifier::visitMachineFunctionBefore() {
lastIndex = SlotIndex();
- regsReserved = MRI->getReservedRegs();
+ regsReserved = MRI->reservedRegsFrozen() ? MRI->getReservedRegs()
+ : TRI->getReservedRegs(*MF);
- markReachable(&MF->front());
+ if (!MF->empty())
+ markReachable(&MF->front());
// Build a set of the basic blocks in the function.
FunctionBlocks.clear();
@@ -548,7 +550,8 @@ void MachineVerifier::visitMachineFunctionBefore() {
// Check that the register use lists are sane.
MRI->verifyUseLists();
- verifyStackFrame();
+ if (!MF->empty())
+ verifyStackFrame();
}
// Does iterator point to a and b as the first two elements?
@@ -572,7 +575,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
for (const auto &LI : MBB->liveins()) {
if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() &&
MBB->getIterator() != MBB->getParent()->begin()) {
- report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB);
+ report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB);
}
}
}
@@ -752,11 +755,10 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
regsLive.insert(*SubRegs);
}
}
- regsLiveInButUnused = regsLive;
const MachineFrameInfo &MFI = MF->getFrameInfo();
BitVector PR = MFI.getPristineRegs(*MF);
- for (int I = PR.find_first(); I>0; I = PR.find_next(I)) {
+ for (unsigned I : PR.set_bits()) {
for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
regsLive.insert(*SubRegs);
@@ -911,6 +913,39 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
StringRef ErrorInfo;
if (!TII->verifyInstruction(*MI, ErrorInfo))
report(ErrorInfo.data(), MI);
+
+ // Verify properties of various specific instruction types
+ switch (MI->getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE:
+ // Generic loads and stores must have a single MachineMemOperand
+ // describing that access.
+ if (!MI->hasOneMemOperand())
+ report("Generic instruction accessing memory must have one mem operand",
+ MI);
+ break;
+ case TargetOpcode::STATEPOINT:
+ if (!MI->getOperand(StatepointOpers::IDPos).isImm() ||
+ !MI->getOperand(StatepointOpers::NBytesPos).isImm() ||
+ !MI->getOperand(StatepointOpers::NCallArgsPos).isImm())
+ report("meta operands to STATEPOINT not constant!", MI);
+
+ auto VerifyStackMapConstant = [&](unsigned Offset) {
+ if (!MI->getOperand(Offset).isImm() ||
+ MI->getOperand(Offset).getImm() != StackMaps::ConstantOp ||
+ !MI->getOperand(Offset + 1).isImm())
+ report("stack map constant to STATEPOINT not well formed!", MI);
+ };
+ const unsigned VarStart = StatepointOpers(MI).getVarIdx();
+ VerifyStackMapConstant(VarStart + StatepointOpers::CCOffset);
+ VerifyStackMapConstant(VarStart + StatepointOpers::FlagsOffset);
+ VerifyStackMapConstant(VarStart + StatepointOpers::NumDeoptOperandsOffset);
+
+ // TODO: verify we have properly encoded deopt arguments
+ break;
+ }
}
void
@@ -950,6 +985,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
report("Operand should be tied", MO, MONum);
else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum))
report("Tied def doesn't match MCInstrDesc", MO, MONum);
+ else if (TargetRegisterInfo::isPhysicalRegister(MO->getReg())) {
+ const MachineOperand &MOTied = MI->getOperand(TiedTo);
+ if (!MOTied.isReg())
+ report("Tied counterpart must be a register", &MOTied, TiedTo);
+ else if (TargetRegisterInfo::isPhysicalRegister(MOTied.getReg()) &&
+ MO->getReg() != MOTied.getReg())
+ report("Tied physical registers must match.", &MOTied, TiedTo);
+ }
} else if (MO->isReg() && MO->isTied())
report("Explicit operand should not be tied", MO, MONum);
} else {
@@ -1256,8 +1299,6 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
// Both use and def operands can read a register.
if (MO->readsReg()) {
- regsLiveInButUnused.erase(Reg);
-
if (MO->isKill())
addRegWithSubRegs(regsKilled, Reg);
@@ -1913,9 +1954,11 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
- // All predecessors must have a live-out value if this is not a
- // subregister liverange.
- if (!PVNI && LaneMask.none()) {
+ // All predecessors must have a live-out value. However, for a phi
+ // instruction with subregister intervals, only one of the subregisters
+ // (not necessarily the current one) needs to be defined.
+ if (!PVNI && (LaneMask.none() || !IsPHI)) {
report("Register not marked live out of predecessor", *PI);
report_context(LR, Reg, LaneMask);
report_context(*VNI);
@@ -2020,6 +2063,8 @@ namespace {
void MachineVerifier::verifyStackFrame() {
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+ if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
+ return;
SmallVector<StackStateOfBB, 8> SPState;
SPState.resize(MF->getNumBlockIDs());
@@ -2047,23 +2092,14 @@ void MachineVerifier::verifyStackFrame() {
// Update stack state by checking contents of MBB.
for (const auto &I : *MBB) {
if (I.getOpcode() == FrameSetupOpcode) {
- // The first operand of a FrameOpcode should be i32.
- int Size = I.getOperand(0).getImm();
- assert(Size >= 0 &&
- "Value should be non-negative in FrameSetup and FrameDestroy.\n");
-
if (BBState.ExitIsSetup)
report("FrameSetup is after another FrameSetup", &I);
- BBState.ExitValue -= Size;
+ BBState.ExitValue -= TII->getFrameTotalSize(I);
BBState.ExitIsSetup = true;
}
if (I.getOpcode() == FrameDestroyOpcode) {
- // The first operand of a FrameOpcode should be i32.
- int Size = I.getOperand(0).getImm();
- assert(Size >= 0 &&
- "Value should be non-negative in FrameSetup and FrameDestroy.\n");
-
+ int Size = TII->getFrameTotalSize(I);
if (!BBState.ExitIsSetup)
report("FrameDestroy is not after a FrameSetup", &I);
int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
diff --git a/contrib/llvm/lib/CodeGen/MacroFusion.cpp b/contrib/llvm/lib/CodeGen/MacroFusion.cpp
new file mode 100644
index 0000000..633a853
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MacroFusion.cpp
@@ -0,0 +1,153 @@
+//===- MacroFusion.cpp - Macro Fusion -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the implementation of the DAG scheduling mutation
+/// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define DEBUG_TYPE "machine-scheduler"
+
+STATISTIC(NumFused, "Number of instr pairs fused");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
+ cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+static void fuseInstructionPair(ScheduleDAGMI &DAG, SUnit &FirstSU,
+ SUnit &SecondSU) {
+ // Create a single weak edge between the adjacent instrs. The only effect is
+ // to cause bottom-up scheduling to heavily prioritize the clustered instrs.
+ DAG.addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster));
+
+ // Adjust the latency between the anchor instr and its
+ // predecessors.
+ for (SDep &IDep : SecondSU.Preds)
+ if (IDep.getSUnit() == &FirstSU)
+ IDep.setLatency(0);
+
+ // Adjust the latency between the dependent instr and its
+ // successors.
+ for (SDep &IDep : FirstSU.Succs)
+ if (IDep.getSUnit() == &SecondSU)
+ IDep.setLatency(0);
+
+ DEBUG(dbgs() << DAG.MF.getName() << "(): Macro fuse ";
+ FirstSU.print(dbgs(), &DAG); dbgs() << " - ";
+ SecondSU.print(dbgs(), &DAG); dbgs() << " / ";
+ dbgs() << DAG.TII->getName(FirstSU.getInstr()->getOpcode()) << " - " <<
+ DAG.TII->getName(SecondSU.getInstr()->getOpcode()) << '\n'; );
+
+ if (&SecondSU != &DAG.ExitSU)
+ // Make instructions dependent on FirstSU also dependent on SecondSU to
+ // prevent them from being scheduled between FirstSU and SecondSU.
+ for (const SDep &SI : FirstSU.Succs) {
+ if (SI.getSUnit() == &SecondSU)
+ continue;
+ DEBUG(dbgs() << " Copy Succ ";
+ SI.getSUnit()->print(dbgs(), &DAG); dbgs() << '\n';);
+ DAG.addEdge(SI.getSUnit(), SDep(&SecondSU, SDep::Artificial));
+ }
+
+ ++NumFused;
+}
+
+namespace {
+
+/// \brief Post-process the DAG to create cluster edges between instrs that may
+/// be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+ ShouldSchedulePredTy shouldScheduleAdjacent;
+ bool FuseBlock;
+ bool scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU);
+
+public:
+ MacroFusion(ShouldSchedulePredTy shouldScheduleAdjacent, bool FuseBlock)
+ : shouldScheduleAdjacent(shouldScheduleAdjacent), FuseBlock(FuseBlock) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+} // end anonymous namespace
+
+void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+ if (FuseBlock)
+ // For each of the SUnits in the scheduling block, try to fuse the instr in
+ // it with one in its predecessors.
+ for (SUnit &ISU : DAG->SUnits)
+ scheduleAdjacentImpl(*DAG, ISU);
+
+ if (DAG->ExitSU.getInstr())
+ // Try to fuse the instr in the ExitSU with one in its predecessors.
+ scheduleAdjacentImpl(*DAG, DAG->ExitSU);
+}
+
+/// \brief Implement the fusion of instr pairs in the scheduling DAG,
+/// anchored at the instr in AnchorSU.
+bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGMI &DAG, SUnit &AnchorSU) {
+ const MachineInstr &AnchorMI = *AnchorSU.getInstr();
+ const TargetInstrInfo &TII = *DAG.TII;
+ const TargetSubtargetInfo &ST = DAG.MF.getSubtarget();
+
+ // Check if the anchor instr may be fused.
+ if (!shouldScheduleAdjacent(TII, ST, nullptr, AnchorMI))
+ return false;
+
+ // Explore fusion candidates among the dependencies of the anchor instr.
+ for (SDep &Dep : AnchorSU.Preds) {
+ // Ignore dependencies that don't enforce ordering.
+ if (Dep.getKind() == SDep::Anti || Dep.getKind() == SDep::Output ||
+ Dep.isWeak())
+ continue;
+
+ SUnit &DepSU = *Dep.getSUnit();
+ if (DepSU.isBoundaryNode())
+ continue;
+
+ const MachineInstr *DepMI = DepSU.getInstr();
+ if (!shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI))
+ continue;
+
+ fuseInstructionPair(DAG, DepSU, AnchorSU);
+ return true;
+ }
+
+ return false;
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createMacroFusionDAGMutation(
+ ShouldSchedulePredTy shouldScheduleAdjacent) {
+ if (EnableMacroFusion)
+ return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, true);
+ return nullptr;
+}
+
+std::unique_ptr<ScheduleDAGMutation>
+llvm::createBranchMacroFusionDAGMutation(
+ ShouldSchedulePredTy shouldScheduleAdjacent) {
+ if (EnableMacroFusion)
+ return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, false);
+ return nullptr;
+}
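
A hedged sketch of how a target might plug this mutation into its scheduler: shouldFuseCmpBr below is a hypothetical predicate written against the ShouldSchedulePredTy signature the file uses; only createMacroFusionDAGMutation() is the API introduced above.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MacroFusion.h"

using namespace llvm;

static bool shouldFuseCmpBr(const TargetInstrInfo &TII,
                            const TargetSubtargetInfo &STI,
                            const MachineInstr *FirstMI,
                            const MachineInstr &SecondMI) {
  // Fuse a compare with the conditional branch that consumes its flags.
  // When FirstMI is null, answer only whether SecondMI may ever be the
  // second instruction of a fused pair.
  if (!SecondMI.isConditionalBranch())
    return false;
  return !FirstMI || FirstMI->isCompare();
}

// Inside the target's createMachineScheduler() hook one would then write:
//   DAG->addMutation(createMacroFusionDAGMutation(shouldFuseCmpBr));
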
diff --git a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
index 2a8531f..f7aeb42 100644
--- a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
+++ b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
@@ -12,18 +12,18 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "phi-opt"
+#define DEBUG_TYPE "opt-phis"
STATISTIC(NumPHICycles, "Number of PHI cycles replaced");
STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles");
@@ -59,7 +59,7 @@ namespace {
char OptimizePHIs::ID = 0;
char &llvm::OptimizePHIsID = OptimizePHIs::ID;
-INITIALIZE_PASS(OptimizePHIs, "opt-phis",
+INITIALIZE_PASS(OptimizePHIs, DEBUG_TYPE,
"Optimize machine instruction PHIs", false, false)
bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) {
diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
index c67a25b..9c898fa 100644
--- a/contrib/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
@@ -34,7 +34,7 @@
#include <algorithm>
using namespace llvm;
-#define DEBUG_TYPE "phielim"
+#define DEBUG_TYPE "phi-node-elimination"
static cl::opt<bool>
DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
@@ -112,11 +112,11 @@ STATISTIC(NumReused, "Number of reused lowered phis");
char PHIElimination::ID = 0;
char& llvm::PHIEliminationID = PHIElimination::ID;
-INITIALIZE_PASS_BEGIN(PHIElimination, "phi-node-elimination",
+INITIALIZE_PASS_BEGIN(PHIElimination, DEBUG_TYPE,
"Eliminate PHI nodes for register allocation",
false, false)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination",
+INITIALIZE_PASS_END(PHIElimination, DEBUG_TYPE,
"Eliminate PHI nodes for register allocation", false, false)
void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
diff --git a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
index ad9166f..513e827 100644
--- a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
+++ b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
@@ -12,10 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -75,7 +75,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {
.addImm(FirstActualI->getOpcode());
for (auto &MO : FirstActualI->operands())
- MIB.addOperand(MO);
+ MIB.add(MO);
FirstActualI->eraseFromParent();
MF.ensureAlignment(4);
diff --git a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 6d64345..b13f6b6 100644
--- a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -66,7 +66,6 @@
// C = copy A <-- same-bank copy
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
@@ -77,8 +76,10 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -119,6 +120,14 @@ static cl::opt<unsigned> RewritePHILimit(
"rewrite-phi-limit", cl::Hidden, cl::init(10),
cl::desc("Limit the length of PHI chains to lookup"));
+// Limit the length of recurrence chain when evaluating the benefit of
+// commuting operands.
+static cl::opt<unsigned> MaxRecurrenceChain(
+ "recurrence-chain-limit", cl::Hidden, cl::init(3),
+ cl::desc("Maximum length of recurrence chain when evaluating the benefit "
+ "of commuting operands"));
+
STATISTIC(NumReuse, "Number of extension results reused");
STATISTIC(NumCmps, "Number of compares eliminated");
STATISTIC(NumImmFold, "Number of move immediate folded");
@@ -131,12 +140,14 @@ STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");
namespace {
class ValueTrackerResult;
+ class RecurrenceInstr;
class PeepholeOptimizer : public MachineFunctionPass {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
MachineDominatorTree *DT; // Machine dominator tree
+ MachineLoopInfo *MLI;
public:
static char ID; // Pass identification
@@ -150,6 +161,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
if (Aggressive) {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
@@ -160,6 +173,9 @@ namespace {
typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>
RewriteMapTy;
+ /// \brief Sequence of instructions that form a recurrence cycle.
+ typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
private:
bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@ namespace {
bool optimizeCoalescableCopy(MachineInstr *MI);
bool optimizeUncoalescableCopy(MachineInstr *MI,
SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+ bool optimizeRecurrence(MachineInstr &PHI);
bool findNextSource(unsigned Reg, unsigned SubReg,
RewriteMapTy &RewriteMap);
bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@ namespace {
bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+ /// \brief Finds recurrence cycles, but only ones formed around a def
+ /// operand and a use operand that are tied. If there is a use operand
+ /// commutable with the tied use operand, the recurrence cycle is searched
+ /// along that operand as well.
+ bool findTargetRecurrence(unsigned Reg,
+ const SmallSet<unsigned, 2> &TargetReg,
+ RecurrenceCycle &RC);
/// \brief If copy instruction \p MI is a virtual register copy, track it in
/// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
@@ -222,6 +246,28 @@ namespace {
}
};
+ /// \brief Helper class to hold instructions that are inside recurrence
+ /// cycles. The recurrence cycle is formed around 1) a def operand and its
+ /// tied use operand, or 2) a def operand and a use operand that is commutable
+ /// with another use operand which is tied to the def operand. In the latter
+ /// case, the indices of the tied use operand and the commutable use operand
+ /// are maintained with CommutePair.
+ class RecurrenceInstr {
+ public:
+ typedef std::pair<unsigned, unsigned> IndexPair;
+
+ RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+ RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+ : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+ MachineInstr *getMI() const { return MI; }
+ Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+ private:
+ MachineInstr *MI;
+ Optional<IndexPair> CommutePair;
+ };
+
/// \brief Helper class to hold a reply for ValueTracker queries. Contains the
/// returned sources for a given search and the instructions where the sources
/// were tracked from.
@@ -412,6 +458,7 @@ char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
"Peephole Optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
"Peephole Optimizations", false, false)
@@ -1487,6 +1534,113 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
return false;
}
+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+ if (!MO.isReg())
+ return false;
+ return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+bool PeepholeOptimizer::findTargetRecurrence(
+ unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs,
+ RecurrenceCycle &RC) {
+ // Recurrence found if Reg is in TargetRegs.
+ if (TargetRegs.count(Reg))
+ return true;
+
+ // TODO: Currently, we only allow the last instruction of the recurrence
+ // cycle (the instruction that feeds the PHI instruction) to have more than
+ // one use, to guarantee that commuting operands does not tie registers
+ // with overlapping live ranges. Once we have actual live range info for
+ // each register, this constraint can be relaxed.
+ if (!MRI->hasOneNonDBGUse(Reg))
+ return false;
+
+ // Give up if the recurrence chain length is longer than the limit.
+ if (RC.size() >= MaxRecurrenceChain)
+ return false;
+
+ MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
+ unsigned Idx = MI.findRegisterUseOperandIdx(Reg);
+
+ // Only interested in recurrences whose instructions have only one def, which
+ // is a virtual register.
+ if (MI.getDesc().getNumDefs() != 1)
+ return false;
+
+ MachineOperand &DefOp = MI.getOperand(0);
+ if (!isVirtualRegisterOperand(DefOp))
+ return false;
+
+ // Check if the def operand of MI is tied to any use operand. We are only
+ // interested in the case where all the instructions in the recurrence chain
+ // have their def operand tied to one of their use operands.
+ unsigned TiedUseIdx;
+ if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
+ return false;
+
+ if (Idx == TiedUseIdx) {
+ RC.push_back(RecurrenceInstr(&MI));
+ return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+ } else {
+ // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
+ unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
+ RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
+ return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+ }
+ }
+
+ return false;
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions.
+/// If a phi is in a loop header, a recurrence may be formed around the source
+/// and destination of the phi. In such cases, commuting operands of the
+/// instructions in the recurrence may enable coalescing of the copy
+/// instruction generated from the phi. For example, given the recurrence
+///
+/// LoopHeader:
+/// %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+/// %vreg0<def, tied1> = ADD %vreg2<def, tied0>, %vreg1
+///
+/// the fact that %vreg0 and %vreg2 are in the same tied-operand set makes
+/// coalescing the copy instruction generated from the phi in LoopHeader
+/// (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and %vreg2 have
+/// overlapping live ranges. This introduces an additional move instruction
+/// into the final assembly. However, if we commute %vreg2 and %vreg1 in the
+/// ADD instruction, the redundant move can be avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+ SmallSet<unsigned, 2> TargetRegs;
+ for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+ MachineOperand &MO = PHI.getOperand(Idx);
+ assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+ TargetRegs.insert(MO.getReg());
+ }
+
+ bool Changed = false;
+ RecurrenceCycle RC;
+ if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
+ // Commutes operands of instructions in RC if necessary so that the copy to
+ // be generated from PHI can be coalesced.
+ DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
+ for (auto &RI : RC) {
+ DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+ auto CP = RI.getCommutePair();
+ if (CP) {
+ Changed = true;
+ TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+ (*CP).second);
+ DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+ }
+ }
+ }
+
+ return Changed;
+}
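
To make the payoff of optimizeRecurrence() concrete, here is the latch instruction from the doc comment before and after the commute, as an illustrative sketch (the operand annotations follow the comment's notation, not actual MIR dump output):

// Before: the def %vreg0 is tied to the use %vreg2, which overlaps %vreg1,
// so the loop-header copy %vreg1 = COPY %vreg0 cannot be coalesced away:
//   %vreg0<def, tied1> = ADD %vreg2<tied0>, %vreg1
//
// After commuting the two use operands, %vreg0 is tied to %vreg1 instead,
// and the copy lowered from the phi coalesces:
//   %vreg0<def, tied1> = ADD %vreg1<tied0>, %vreg2
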
+
bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
@@ -1501,6 +1655,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+ MLI = &getAnalysis<MachineLoopInfo>();
bool Changed = false;
@@ -1529,6 +1684,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallSet<unsigned, 4> CopySrcRegs;
DenseMap<unsigned, MachineInstr *> CopySrcMIs;
+ bool IsLoopHeader = MLI->isLoopHeader(&MBB);
+
for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
MII != MIE; ) {
MachineInstr *MI = &*MII;
@@ -1540,9 +1697,16 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (MI->isDebugValue())
continue;
- if (MI->isPosition() || MI->isPHI())
+ if (MI->isPosition())
continue;
+ if (IsLoopHeader && MI->isPHI()) {
+ if (optimizeRecurrence(*MI)) {
+ Changed = true;
+ continue;
+ }
+ }
+
if (!MI->isCopy()) {
for (const auto &Op : MI->operands()) {
// Visit all operands: definitions can be implicit or explicit.
@@ -1667,7 +1831,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
MRI->markUsesInDebugValueAsUndef(FoldedReg);
FoldAsLoadDefCandidates.erase(FoldedReg);
++NumLoadFold;
-
+
// MI is replaced with FoldMI so we can continue trying to fold
Changed = true;
MI = FoldMI;
@@ -1675,7 +1839,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
}
}
}
-
+
// If we run into an instruction we can't fold across, discard
// the load candidates. Note: We might be able to fold *into* this
// instruction, so this needs to be after the folding logic.
diff --git a/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
index 5bc5f75..4a50d89 100644
--- a/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
+++ b/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -23,13 +23,13 @@
/// This pass traverses all the instructions in a program in top-down order.
/// In contrast to the instruction scheduling passes, this pass never resets
/// the hazard recognizer to ensure it can correctly handle noop hazards at
-/// the begining of blocks.
+/// the beginning of blocks.
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
index 6081916..f2249f9 100644
--- a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
+++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -200,7 +200,7 @@ namespace {
char &llvm::PostRASchedulerID = PostRAScheduler::ID;
-INITIALIZE_PASS(PostRAScheduler, "post-RA-sched",
+INITIALIZE_PASS(PostRAScheduler, DEBUG_TYPE,
"Post RA top-down list latency scheduler", false, false)
SchedulePostRATDList::SchedulePostRATDList(
@@ -253,7 +253,7 @@ void SchedulePostRATDList::exitRegion() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dumpSchedule - dump the scheduled Sequence.
-void SchedulePostRATDList::dumpSchedule() const {
+LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
if (SUnit *SU = Sequence[i])
SU->dump(this);
@@ -367,7 +367,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
Scheduler.finishBlock();
// Update register kills
- Scheduler.fixupKills(&MBB);
+ Scheduler.fixupKills(MBB);
}
return true;
diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
index d27ea2f..0118580 100644
--- a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -20,7 +20,7 @@
using namespace llvm;
-#define DEBUG_TYPE "processimplicitdefs"
+#define DEBUG_TYPE "processimpdefs"
namespace {
/// Process IMPLICIT_DEF instructions and make sure there is one implicit_def
@@ -51,9 +51,7 @@ public:
char ProcessImplicitDefs::ID = 0;
char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID;
-INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs",
- "Process Implicit Definitions", false, false)
-INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs",
+INITIALIZE_PASS(ProcessImplicitDefs, DEBUG_TYPE,
"Process Implicit Definitions", false, false)
void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const {
diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 5fca7fa..e9f8d43 100644
--- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -45,7 +45,7 @@
using namespace llvm;
-#define DEBUG_TYPE "pei"
+#define DEBUG_TYPE "prologepilog"
typedef SmallVector<MachineBasicBlock *, 4> MBBVector;
static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS,
@@ -54,25 +54,12 @@ static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS,
const MBBVector &SaveBlocks,
const MBBVector &RestoreBlocks);
-static void doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS);
-
namespace {
class PEI : public MachineFunctionPass {
public:
static char ID;
- explicit PEI(const TargetMachine *TM = nullptr) : MachineFunctionPass(ID) {
+ PEI() : MachineFunctionPass(ID) {
initializePEIPass(*PassRegistry::getPassRegistry());
-
- if (TM && (!TM->usesPhysRegsForPEI())) {
- SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *,
- unsigned &, unsigned &, const MBBVector &,
- const MBBVector &) {};
- ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {};
- } else {
- SpillCalleeSavedRegisters = doSpillCalleeSavedRegs;
- ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs;
- UsesCalleeSaves = true;
- }
}
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -95,7 +82,7 @@ private:
const MBBVector &SaveBlocks,
const MBBVector &RestoreBlocks)>
SpillCalleeSavedRegisters;
- std::function<void(MachineFunction &MF, RegScavenger *RS)>
+ std::function<void(MachineFunction &MF, RegScavenger &RS)>
ScavengeFrameVirtualRegs;
bool UsesCalleeSaves = false;
@@ -140,21 +127,19 @@ WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
cl::desc("Warn for stack size bigger than the given"
" number"));
-INITIALIZE_TM_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion",
- false, false)
+INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false,
+ false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_TM_PASS_END(PEI, "prologepilog",
- "Prologue/Epilogue Insertion & Frame Finalization",
- false, false)
+INITIALIZE_PASS_END(PEI, DEBUG_TYPE,
+ "Prologue/Epilogue Insertion & Frame Finalization", false,
+ false)
-MachineFunctionPass *
-llvm::createPrologEpilogInserterPass(const TargetMachine *TM) {
- return new PEI(TM);
+MachineFunctionPass *llvm::createPrologEpilogInserterPass() {
+ return new PEI();
}
-STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
STATISTIC(NumBytesStackSpace,
"Number of bytes used for stack in all functions");
@@ -174,6 +159,20 @@ typedef SmallSetVector<int, 8> StackObjSet;
/// frame indexes with appropriate references.
///
bool PEI::runOnMachineFunction(MachineFunction &Fn) {
+ if (!SpillCalleeSavedRegisters) {
+ const TargetMachine &TM = Fn.getTarget();
+ if (!TM.usesPhysRegsForPEI()) {
+ SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *,
+ unsigned &, unsigned &, const MBBVector &,
+ const MBBVector &) {};
+ ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger &) {};
+ } else {
+ SpillCalleeSavedRegisters = doSpillCalleeSavedRegs;
+ ScavengeFrameVirtualRegs = scavengeFrameVirtualRegs;
+ UsesCalleeSaves = true;
+ }
+ }
+
const Function* F = Fn.getFunction();
const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
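
The constructor hunk above drops the TargetMachine parameter; the hook selection now happens lazily on the first run, where a MachineFunction (and hence its target) is guaranteed to exist. A minimal sketch of that deferral pattern, with hypothetical names standing in for the real helpers:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"
#include <functional>

using namespace llvm;

// Hypothetical helper; stands in for doSpillCalleeSavedRegs above.
static void doRealWork(MachineFunction &) {}

struct LazyConfiguredPass {
  std::function<void(MachineFunction &)> Hook; // unset until the first run

  bool runOnMachineFunction(MachineFunction &MF) {
    if (!Hook) {
      // Query the target exactly once, now that a function is available.
      if (MF.getTarget().usesPhysRegsForPEI())
        Hook = doRealWork;
      else
        Hook = [](MachineFunction &) {}; // no-op variant
    }
    Hook(MF);
    return true;
  }
};
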
@@ -220,7 +219,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// post-pass, scavenge the virtual registers that frame index elimination
// inserted.
if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
- ScavengeFrameVirtualRegs(Fn, RS);
+ ScavengeFrameVirtualRegs(Fn, *RS);
// Clear any vregs created by virtual scavenging.
Fn.getRegInfo().clearVirtRegs();
@@ -265,11 +264,8 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
std::vector<MachineBasicBlock::iterator> FrameSDOps;
for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
- " instructions should have a single immediate argument!");
- unsigned Size = I->getOperand(0).getImm();
+ if (TII.isFrameInstr(*I)) {
+ unsigned Size = TII.getFrameSize(*I);
if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
AdjustsStack = true;
FrameSDOps.push_back(I);
@@ -280,6 +276,9 @@ void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
AdjustsStack = true;
}
+ assert(!MFI.isMaxCallFrameSizeComputed() ||
+ (MFI.getMaxCallFrameSize() == MaxCallFrameSize &&
+ MFI.adjustsStack() == AdjustsStack));
MFI.setAdjustsStack(AdjustsStack);
MFI.setMaxCallFrameSize(MaxCallFrameSize);
@@ -336,7 +335,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
return;
const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+ const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs();
std::vector<CalleeSavedInfo> CSI;
for (unsigned i = 0; CSRegs[i]; ++i) {
@@ -376,22 +375,22 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
FixedSlot->Reg != Reg)
++FixedSlot;
+ unsigned Size = RegInfo->getSpillSize(*RC);
if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
// Nope, just spill it anywhere convenient.
- unsigned Align = RC->getAlignment();
+ unsigned Align = RegInfo->getSpillAlignment(*RC);
unsigned StackAlign = TFI->getStackAlignment();
// We may not be able to satisfy the desired alignment specification of
// the TargetRegisterClass if the stack alignment is smaller. Use the
// min.
Align = std::min(Align, StackAlign);
- FrameIdx = MFI.CreateStackObject(RC->getSize(), Align, true);
+ FrameIdx = MFI.CreateStackObject(Size, Align, true);
if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
} else {
// Spill it to the stack where we must.
- FrameIdx =
- MFI.CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
+ FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset);
}
CS.setFrameIdx(FrameIdx);
@@ -448,12 +447,13 @@ static void updateLiveness(MachineFunction &MF) {
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
for (MachineBasicBlock *MBB : Visited) {
MCPhysReg Reg = CSI[i].getReg();
// Add the callee-saved register as live-in.
// It's killed at the spill.
- if (!MBB->isLiveIn(Reg))
+ if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg))
MBB->addLiveIn(Reg);
}
}
@@ -764,6 +764,9 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
} else if (MaxCSFrameIndex >= MinCSFrameIndex) {
// Be careful about underflow in comparisons against MinCSFrameIndex.
for (unsigned i = MaxCSFrameIndex; i != MinCSFrameIndex - 1; --i) {
+ if (MFI.isDeadObjectIndex(i))
+ continue;
+
unsigned Align = MFI.getObjectAlignment(i);
// Adjust to alignment boundary
Offset = alignTo(Offset, Align, Skew);
@@ -1049,8 +1052,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
- unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
- unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
if (RS && FrameIndexEliminationScavenging)
RS->enterBasicBlock(*BB);
@@ -1059,11 +1060,9 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
- if (I->getOpcode() == FrameSetupOpcode ||
- I->getOpcode() == FrameDestroyOpcode) {
- InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
+ if (TII.isFrameInstr(*I)) {
+ InsideCallSequence = TII.isFrameSetup(*I);
SPAdj += TII.getSPAdjust(*I);
-
I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
continue;
}
@@ -1151,90 +1150,3 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
RS->forward(MI);
}
}
-
-/// doScavengeFrameVirtualRegs - Replace all frame index virtual registers
-/// with physical registers. Use the register scavenger to find an
-/// appropriate register to use.
-///
-/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
-/// iterate over the vreg use list, which at this point only contains machine
-/// operands for which eliminateFrameIndex need a new scratch reg.
-static void
-doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) {
- // Run through the instructions and find any virtual registers.
- MachineRegisterInfo &MRI = MF.getRegInfo();
- for (MachineBasicBlock &MBB : MF) {
- RS->enterBasicBlock(MBB);
-
- int SPAdj = 0;
-
- // The instruction stream may change in the loop, so check MBB.end()
- // directly.
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
- // We might end up here again with a NULL iterator if we scavenged a
- // register for which we inserted spill code for definition by what was
- // originally the first instruction in MBB.
- if (I == MachineBasicBlock::iterator(nullptr))
- I = MBB.begin();
-
- const MachineInstr &MI = *I;
- MachineBasicBlock::iterator J = std::next(I);
- MachineBasicBlock::iterator P =
- I == MBB.begin() ? MachineBasicBlock::iterator(nullptr)
- : std::prev(I);
-
- // RS should process this instruction before we might scavenge at this
- // location. This is because we might be replacing a virtual register
- // defined by this instruction, and if so, registers killed by this
- // instruction are available, and defined registers are not.
- RS->forward(I);
-
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
-
- // When we first encounter a new virtual register, it
- // must be a definition.
- assert(MO.isDef() && "frame index virtual missing def!");
- // Scavenge a new scratch register
- const TargetRegisterClass *RC = MRI.getRegClass(Reg);
- unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
-
- ++NumScavengedRegs;
-
- // Replace this reference to the virtual register with the
- // scratch register.
- assert(ScratchReg && "Missing scratch register!");
- MRI.replaceRegWith(Reg, ScratchReg);
-
- // Because this instruction was processed by the RS before this
- // register was allocated, make sure that the RS now records the
- // register as being used.
- RS->setRegUsed(ScratchReg);
- }
-
- // If the scavenger needed to use one of its spill slots, the
- // spill code will have been inserted in between I and J. This is a
- // problem because we need the spill code before I: Move I to just
- // prior to J.
- if (I != std::prev(J)) {
- MBB.splice(J, &MBB, I);
-
- // Before we move I, we need to prepare the RS to visit I again.
- // Specifically, RS will assert if it sees uses of registers that
- // it believes are undefined. Because we have already processed
- // register kills in I, when it visits I again, it will believe that
- // those registers are undefined. To avoid this situation, unprocess
- // the instruction I.
- assert(RS->getCurrentPosition() == I &&
- "The register scavenger has an unexpected position");
- I = P;
- RS->unprocess(P);
- } else
- ++I;
- }
- }
-}
diff --git a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp
index 804a4c3..b29e62b 100644
--- a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp
+++ b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp
@@ -29,7 +29,10 @@ PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {}
PseudoSourceValue::~PseudoSourceValue() {}
void PseudoSourceValue::printCustom(raw_ostream &O) const {
- O << PSVNames[Kind];
+ if (Kind < TargetCustom)
+ O << PSVNames[Kind];
+ else
+ O << "TargetCustom" << Kind;
}
bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
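A hedged illustration of the printCustom() change: kinds below TargetCustom still print their table entry, while target-defined kinds no longer index past the end of PSVNames.

    // Illustrative only; actual PSVNames entries depend on the PSVKind enum:
    //   Kind == FixedStack      -> prints PSVNames[FixedStack]
    //   Kind == TargetCustom+2  -> prints "TargetCustom" << Kind instead of
    //                              reading PSVNames out of bounds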
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
index fb49a93..7b4fbac 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -21,13 +21,12 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
@@ -134,18 +133,19 @@ void RegAllocBase::allocatePhysRegs() {
if (AvailablePhysReg)
Matrix->assign(*VirtReg, AvailablePhysReg);
- for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end();
- I != E; ++I) {
- LiveInterval *SplitVirtReg = &LIS->getInterval(*I);
+ for (unsigned Reg : SplitVRegs) {
+ assert(LIS->hasInterval(Reg));
+
+ LiveInterval *SplitVirtReg = &LIS->getInterval(Reg);
assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
+ assert(SplitVirtReg->empty() && "Non-empty but used interval");
DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
aboutToRemoveInterval(*SplitVirtReg);
LIS->removeInterval(SplitVirtReg->reg);
continue;
}
DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
- assert(!SplitVirtReg->empty() && "expecting non-empty interval");
assert(TargetRegisterInfo::isVirtualRegister(SplitVirtReg->reg) &&
"expect split value in virtual register");
enqueue(SplitVirtReg);
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
index a558e37..7743061 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "AllocationOrder.h"
#include "LiveDebugVariables.h"
#include "RegAllocBase.h"
@@ -28,6 +27,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/PassAnalysisSupport.h"
@@ -58,8 +58,9 @@ namespace {
/// whenever a register is unavailable. This is not practical in production but
/// provides a useful baseline both for measuring other allocators and comparing
/// the speed of the basic algorithm against other styles of allocators.
-class RABasic : public MachineFunctionPass, public RegAllocBase
-{
+class RABasic : public MachineFunctionPass,
+ public RegAllocBase,
+ private LiveRangeEdit::Delegate {
// context
MachineFunction *MF;
@@ -72,6 +73,9 @@ class RABasic : public MachineFunctionPass, public RegAllocBase
// selectOrSplit().
BitVector UsableRegs;
+ bool LRE_CanEraseVirtReg(unsigned) override;
+ void LRE_WillShrinkVirtReg(unsigned) override;
+
public:
RABasic();
@@ -121,17 +125,46 @@ char RABasic::ID = 0;
} // end anonymous namespace
+char &llvm::RABasicID = RABasic::ID;
+
+INITIALIZE_PASS_BEGIN(RABasic, "regallocbasic", "Basic Register Allocator",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
+INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(RABasic, "regallocbasic", "Basic Register Allocator", false,
+ false)
+
+bool RABasic::LRE_CanEraseVirtReg(unsigned VirtReg) {
+ if (VRM->hasPhys(VirtReg)) {
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ aboutToRemoveInterval(LI);
+ return true;
+ }
+ // Unassigned virtreg is probably in the priority queue.
+ // RegAllocBase will erase it after dequeueing.
+ return false;
+}
+
+void RABasic::LRE_WillShrinkVirtReg(unsigned VirtReg) {
+ if (!VRM->hasPhys(VirtReg))
+ return;
+
+ // Register is assigned, put it back on the queue for reassignment.
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ enqueue(&LI);
+}
+
RABasic::RABasic(): MachineFunctionPass(ID) {
- initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
- initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
- initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
- initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
- initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
- initializeLiveStacksPass(*PassRegistry::getPassRegistry());
- initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
- initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
- initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
- initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
}
void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -176,8 +209,6 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
Q.collectInterferingVRegs();
- if (Q.seenUnspillableVReg())
- return false;
for (unsigned i = Q.interferingVRegs().size(); i; --i) {
LiveInterval *Intf = Q.interferingVRegs()[i - 1];
if (!Intf->isSpillable() || Intf->weight > VirtReg.weight)
@@ -202,7 +233,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
Matrix->unassign(Spill);
// Spill the extracted interval.
- LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
+ LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats);
spiller().spill(LRE);
}
return true;
@@ -261,7 +292,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
if (!VirtReg.isSpillable())
return ~0u;
- LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
+ LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, this, &DeadRemats);
spiller().spill(LRE);
// The live virtual register requesting allocation was spilled, so tell
diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
index fd759bc..d5538be 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -203,6 +203,8 @@ namespace {
char RAFast::ID = 0;
}
+INITIALIZE_PASS(RAFast, "regallocfast", "Fast Register Allocator", false, false)
+
/// getStackSpaceFor - This allocates space for the specified virtual register
/// to be held on the stack.
int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
@@ -212,8 +214,9 @@ int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
return SS; // Already has space allocated?
// Allocate a new stack object for this spill location...
- int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(),
- RC->getAlignment());
+ unsigned Size = TRI->getSpillSize(*RC);
+ unsigned Align = TRI->getSpillAlignment(*RC);
+ int FrameIdx = MF->getFrameInfo().CreateSpillStackObject(Size, Align);
// Assign the slot.
StackSlotForVirtReg[VirtReg] = FrameIdx;
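This mirrors the PEI change above: spill size and alignment now come from TargetRegisterInfo rather than from the TargetRegisterClass itself. A minimal sketch of the query pattern, assuming only the API used in the patch:

    // Hedged sketch of the new spill-slot query pattern:
    const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
    unsigned Size  = TRI->getSpillSize(*RC);       // bytes the slot must hold
    unsigned Align = TRI->getSpillAlignment(*RC);  // required slot alignment
    int FI = MFI.CreateSpillStackObject(Size, Align);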
@@ -243,8 +246,15 @@ void RAFast::addKillFlag(const LiveReg &LR) {
if (MO.isUse() && !LR.LastUse->isRegTiedToDefOperand(LR.LastOpNum)) {
if (MO.getReg() == LR.PhysReg)
MO.setIsKill();
- else
- LR.LastUse->addRegisterKilled(LR.PhysReg, TRI, true);
+ // else, don't do anything: we are probably redefining a
+ // subreg of this register, and since we don't track which
+ // lanes are actually dead, we cannot insert a kill flag here.
+ // Otherwise we may end up in a situation like this:
+ // ... = (MO) physreg:sub1, physreg <implicit-use, kill>
+ // ... <== Here we would allow later pass to reuse physreg:sub1
+ // which is potentially wrong.
+ // LR:sub0 = ...
+ // ... = LR.sub1 <== This is going to use physreg:sub1
}
}
@@ -304,19 +314,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
LiveDbgValueMap[LRI->VirtReg];
for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) {
MachineInstr *DBG = LRIDbgValues[li];
- const MDNode *Var = DBG->getDebugVariable();
- const MDNode *Expr = DBG->getDebugExpression();
- bool IsIndirect = DBG->isIndirectDebugValue();
- uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0;
- DebugLoc DL = DBG->getDebugLoc();
- assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
- "Expected inlined-at fields to agree");
- MachineInstr *NewDV =
- BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::DBG_VALUE))
- .addFrameIndex(FI)
- .addImm(Offset)
- .addMetadata(Var)
- .addMetadata(Expr);
+ MachineInstr *NewDV = buildDbgValueForSpill(*MBB, MI, *DBG, FI);
assert(NewDV->getParent() == MBB && "dangling parent pointer");
(void)NewDV;
DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
index c47cfb1..020e81e 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1,4 +1,4 @@
-//===-- RegAllocGreedy.cpp - greedy register allocator --------------------===//
+//===- RegAllocGreedy.cpp - greedy register allocator ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -19,34 +19,63 @@
#include "SpillPlacement.h"
#include "Spiller.h"
#include "SplitKit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
-#include "llvm/PassAnalysisSupport.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
#include <queue>
+#include <tuple>
+#include <utility>
using namespace llvm;
@@ -104,13 +133,14 @@ static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
createGreedyRegisterAllocator);
namespace {
+
class RAGreedy : public MachineFunctionPass,
public RegAllocBase,
private LiveRangeEdit::Delegate {
// Convenient shortcuts.
- typedef std::priority_queue<std::pair<unsigned, unsigned> > PQueue;
- typedef SmallPtrSet<LiveInterval *, 4> SmallLISet;
- typedef SmallSet<unsigned, 16> SmallVirtRegSet;
+ using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>;
+ using SmallLISet = SmallPtrSet<LiveInterval *, 4>;
+ using SmallVirtRegSet = SmallSet<unsigned, 16>;
// context
MachineFunction *MF;
@@ -125,6 +155,7 @@ class RAGreedy : public MachineFunctionPass,
MachineBlockFrequencyInfo *MBFI;
MachineDominatorTree *DomTree;
MachineLoopInfo *Loops;
+ MachineOptimizationRemarkEmitter *ORE;
EdgeBundles *Bundles;
SpillPlacement *SpillPlacer;
LiveDebugVariables *DebugVars;
@@ -198,12 +229,12 @@ class RAGreedy : public MachineFunctionPass,
// RegInfo - Keep additional information about each live range.
struct RegInfo {
- LiveRangeStage Stage;
+ LiveRangeStage Stage = RS_New;
// Cascade - Eviction loop prevention. See canEvictInterference().
- unsigned Cascade;
+ unsigned Cascade = 0;
- RegInfo() : Stage(RS_New), Cascade(0) {}
+ RegInfo() = default;
};
IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo;
@@ -229,10 +260,10 @@ class RAGreedy : public MachineFunctionPass,
/// Cost of evicting interference.
struct EvictionCost {
- unsigned BrokenHints; ///< Total number of broken hints.
- float MaxWeight; ///< Maximum spill weight evicted.
+ unsigned BrokenHints = 0; ///< Total number of broken hints.
+ float MaxWeight = 0; ///< Maximum spill weight evicted.
- EvictionCost(): BrokenHints(0), MaxWeight(0) {}
+ EvictionCost() = default;
bool isMax() const { return BrokenHints == ~0u; }
@@ -282,8 +313,7 @@ class RAGreedy : public MachineFunctionPass,
// Set B[i] = C for every live bundle where B[i] was NoCand.
unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
unsigned Count = 0;
- for (int i = LiveBundles.find_first(); i >= 0;
- i = LiveBundles.find_next(i))
+ for (unsigned i : LiveBundles.set_bits())
if (B[i] == NoCand) {
B[i] = C;
Count++;
@@ -411,15 +441,32 @@ private:
/// Its currently assigned register.
/// In case of a physical register Reg == PhysReg.
unsigned PhysReg;
+
HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg)
: Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
};
- typedef SmallVector<HintInfo, 4> HintsInfo;
+ using HintsInfo = SmallVector<HintInfo, 4>;
+
BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned);
void collectHintInfo(unsigned, HintsInfo &);
bool isUnusedCalleeSavedReg(unsigned PhysReg) const;
+
+ /// Compute and report the number of spills and reloads for a loop.
+ void reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads,
+ unsigned &FoldedReloads, unsigned &Spills,
+ unsigned &FoldedSpills);
+
+ /// Report the number of spills and reloads for each loop.
+ void reportNumberOfSplillsReloads() {
+ for (MachineLoop *L : *Loops) {
+ unsigned Reloads, FoldedReloads, Spills, FoldedSpills;
+ reportNumberOfSplillsReloads(L, Reloads, FoldedReloads, Spills,
+ FoldedSpills);
+ }
+ }
};
+
} // end anonymous namespace
char RAGreedy::ID = 0;
@@ -439,6 +486,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
INITIALIZE_PASS_DEPENDENCY(SpillPlacement)
+INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
INITIALIZE_PASS_END(RAGreedy, "greedy",
"Greedy Register Allocator", false, false)
@@ -458,7 +506,6 @@ const char *const RAGreedy::StageName[] = {
// This helps stabilize decisions based on float comparisons.
const float Hysteresis = (2007 / 2048.0f); // 0.97998046875
-
FunctionPass* llvm::createGreedyRegisterAllocator() {
return new RAGreedy();
}
@@ -490,10 +537,10 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveRegMatrix>();
AU.addRequired<EdgeBundles>();
AU.addRequired<SpillPlacement>();
+ AU.addRequired<MachineOptimizationRemarkEmitterPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
-
//===----------------------------------------------------------------------===//
// LiveRangeEdit delegate methods
//===----------------------------------------------------------------------===//
@@ -616,7 +663,6 @@ LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) {
return LI;
}
-
//===----------------------------------------------------------------------===//
// Direct Assignment
//===----------------------------------------------------------------------===//
@@ -664,7 +710,6 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
return CheapReg ? CheapReg : PhysReg;
}
-
//===----------------------------------------------------------------------===//
// Interference eviction
//===----------------------------------------------------------------------===//
@@ -679,7 +724,7 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
MCRegUnitIterator Units(PhysReg, TRI);
for (; Units.isValid(); ++Units) {
// Instantiate a "subquery", not to be confused with the Queries array.
- LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]);
+ LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]);
if (subQ.checkInterference())
break;
}
@@ -830,7 +875,11 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
SmallVector<LiveInterval*, 8> Intfs;
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
- assert(Q.seenAllInterferences() && "Didn't check all interfererences.");
+ // We usually have the interfering VRegs cached, so collectInterferingVRegs()
+ // should be fast. We may need to recalculate when different physregs
+ // overlap the same register unit, because different SubRanges were then
+ // queried against it.
+ Q.collectInterferingVRegs();
ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
Intfs.append(IVR.begin(), IVR.end());
}
@@ -932,7 +981,6 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
return BestPhys;
}
-
//===----------------------------------------------------------------------===//
// Region Splitting
//===----------------------------------------------------------------------===//
@@ -1003,7 +1051,6 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
return SpillPlacer->scanActiveBundles();
}
-
/// addThroughConstraints - Add constraints and links to SpillPlacer from the
/// live-through blocks in Blocks.
void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
@@ -1061,7 +1108,7 @@ void RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
unsigned Visited = 0;
#endif
- for (;;) {
+ while (true) {
ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive();
// Find new through blocks in the periphery of PrefRegBundles.
for (int i = 0, e = NewBundles.size(); i != e; ++i) {
@@ -1139,9 +1186,8 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
}
DEBUG({
- for (int i = Cand.LiveBundles.find_first(); i>=0;
- i = Cand.LiveBundles.find_next(i))
- dbgs() << " EB#" << i;
+ for (int i : Cand.LiveBundles.set_bits())
+ dbgs() << " EB#" << i;
dbgs() << ".\n";
});
return true;
@@ -1176,8 +1222,8 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
for (unsigned i = 0; i != UseBlocks.size(); ++i) {
const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
SpillPlacement::BlockConstraint &BC = SplitConstraints[i];
- bool RegIn = LiveBundles[Bundles->getBundle(BC.Number, 0)];
- bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, 1)];
+ bool RegIn = LiveBundles[Bundles->getBundle(BC.Number, false)];
+ bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, true)];
unsigned Ins = 0;
if (BI.LiveIn)
@@ -1190,8 +1236,8 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) {
unsigned Number = Cand.ActiveBlocks[i];
- bool RegIn = LiveBundles[Bundles->getBundle(Number, 0)];
- bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)];
+ bool RegIn = LiveBundles[Bundles->getBundle(Number, false)];
+ bool RegOut = LiveBundles[Bundles->getBundle(Number, true)];
if (!RegIn && !RegOut)
continue;
if (RegIn && RegOut) {
@@ -1243,7 +1289,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
unsigned IntvIn = 0, IntvOut = 0;
SlotIndex IntfIn, IntfOut;
if (BI.LiveIn) {
- unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)];
if (CandIn != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandIn];
IntvIn = Cand.IntvIdx;
@@ -1252,7 +1298,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
}
}
if (BI.LiveOut) {
- unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)];
if (CandOut != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandOut];
IntvOut = Cand.IntvIdx;
@@ -1292,7 +1338,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
unsigned IntvIn = 0, IntvOut = 0;
SlotIndex IntfIn, IntfOut;
- unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, false)];
if (CandIn != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandIn];
IntvIn = Cand.IntvIdx;
@@ -1300,7 +1346,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
IntfIn = Cand.Intf.first();
}
- unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, true)];
if (CandOut != NoCand) {
GlobalSplitCandidate &Cand = GlobalCand[CandOut];
IntvOut = Cand.IntvIdx;
@@ -1459,8 +1505,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
DEBUG({
dbgs() << ", total = "; MBFI->printBlockFreq(dbgs(), Cost)
<< " with bundles";
- for (int i = Cand.LiveBundles.find_first(); i>=0;
- i = Cand.LiveBundles.find_next(i))
+ for (int i : Cand.LiveBundles.set_bits())
dbgs() << " EB#" << i;
dbgs() << ".\n";
});
@@ -1513,7 +1558,6 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
return 0;
}
-
//===----------------------------------------------------------------------===//
// Per-Block Splitting
//===----------------------------------------------------------------------===//
@@ -1560,7 +1604,6 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
return 0;
}
-
//===----------------------------------------------------------------------===//
// Per-Instruction Splitting
//===----------------------------------------------------------------------===//
@@ -1644,12 +1687,10 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
return 0;
}
-
//===----------------------------------------------------------------------===//
// Local Splitting
//===----------------------------------------------------------------------===//
-
/// calcGapWeights - Compute the maximum spill weight that needs to be evicted
/// in order to use PhysReg between two entries in SA->UseSlots.
///
@@ -1720,7 +1761,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
break;
for (; Gap != NumGaps; ++Gap) {
- GapWeight[Gap] = llvm::huge_valf;
+ GapWeight[Gap] = huge_valf;
if (Uses[Gap+1].getBaseIndex() >= I->end)
break;
}
@@ -1826,7 +1867,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// Remove any gaps with regmask clobbers.
if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i)
- GapWeight[RegMaskGaps[i]] = llvm::huge_valf;
+ GapWeight[RegMaskGaps[i]] = huge_valf;
// Try to find the best sequence of gaps to close.
// The new spill weight must be larger than any gap interference.
@@ -1838,7 +1879,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// It is the spill weight that needs to be evicted.
float MaxGap = GapWeight[0];
- for (;;) {
+ while (true) {
// Live before/after split?
const bool LiveBefore = SplitBefore != 0 || BI.LiveIn;
const bool LiveAfter = SplitAfter != NumGaps || BI.LiveOut;
@@ -1861,7 +1902,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// Legally, without causing looping?
bool Legal = !ProgressRequired || NewGaps < NumGaps;
- if (Legal && MaxGap < llvm::huge_valf) {
+ if (Legal && MaxGap < huge_valf) {
// Estimate the new spill weight. Each instruction reads or writes the
// register. Conservatively assume there are no read-modify-write
// instructions.
@@ -2417,7 +2458,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
do {
Reg = RecoloringCandidates.pop_back_val();
- // We cannot recolor physcal register.
+ // We cannot recolor physical register.
if (TargetRegisterInfo::isPhysicalRegister(Reg))
continue;
@@ -2581,7 +2622,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
}
// If we couldn't allocate a register from spilling, there is probably some
- // invalid inline assembly. The base class wil report it.
+ // invalid inline assembly. The base class will report it.
if (Stage >= RS_Done || !VirtReg.isSpillable())
return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters,
Depth);
@@ -2611,6 +2652,70 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
return 0;
}
+void RAGreedy::reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads,
+ unsigned &FoldedReloads,
+ unsigned &Spills,
+ unsigned &FoldedSpills) {
+ Reloads = 0;
+ FoldedReloads = 0;
+ Spills = 0;
+ FoldedSpills = 0;
+
+ // Sum up the spill and reloads in subloops.
+ for (MachineLoop *SubLoop : *L) {
+ unsigned SubReloads;
+ unsigned SubFoldedReloads;
+ unsigned SubSpills;
+ unsigned SubFoldedSpills;
+
+ reportNumberOfSplillsReloads(SubLoop, SubReloads, SubFoldedReloads,
+ SubSpills, SubFoldedSpills);
+ Reloads += SubReloads;
+ FoldedReloads += SubFoldedReloads;
+ Spills += SubSpills;
+ FoldedSpills += SubFoldedSpills;
+ }
+
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ int FI;
+
+ for (MachineBasicBlock *MBB : L->getBlocks())
+ // Handle blocks that were not included in subloops.
+ if (Loops->getLoopFor(MBB) == L)
+ for (MachineInstr &MI : *MBB) {
+ const MachineMemOperand *MMO;
+
+ if (TII->isLoadFromStackSlot(MI, FI) && MFI.isSpillSlotObjectIndex(FI))
+ ++Reloads;
+ else if (TII->hasLoadFromStackSlot(MI, MMO, FI) &&
+ MFI.isSpillSlotObjectIndex(FI))
+ ++FoldedReloads;
+ else if (TII->isStoreToStackSlot(MI, FI) &&
+ MFI.isSpillSlotObjectIndex(FI))
+ ++Spills;
+ else if (TII->hasStoreToStackSlot(MI, MMO, FI) &&
+ MFI.isSpillSlotObjectIndex(FI))
+ ++FoldedSpills;
+ }
+
+ if (Reloads || FoldedReloads || Spills || FoldedSpills) {
+ using namespace ore;
+
+ MachineOptimizationRemarkMissed R(DEBUG_TYPE, "LoopSpillReload",
+ L->getStartLoc(), L->getHeader());
+ if (Spills)
+ R << NV("NumSpills", Spills) << " spills ";
+ if (FoldedSpills)
+ R << NV("NumFoldedSpills", FoldedSpills) << " folded spills ";
+ if (Reloads)
+ R << NV("NumReloads", Reloads) << " reloads ";
+ if (FoldedReloads)
+ R << NV("NumFoldedReloads", FoldedReloads) << " folded reloads ";
+ ORE->emit(R << "generated in loop");
+ }
+}
+
bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
<< "********** Function: " << mf.getName() << '\n');
@@ -2633,6 +2738,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
Indexes = &getAnalysis<SlotIndexes>();
MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
DomTree = &getAnalysis<MachineDominatorTree>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
Loops = &getAnalysis<MachineLoopInfo>();
Bundles = &getAnalysis<EdgeBundles>();
@@ -2658,6 +2764,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
allocatePhysRegs();
tryHintsRecoloring();
postOptimization();
+ reportNumberOfSplillsReloads();
releaseMemory();
return true;
diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
index 101b30b..9778103 100644
--- a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -1,4 +1,4 @@
-//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===//
+//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -32,30 +32,59 @@
#include "llvm/CodeGen/RegAllocPBQP.h"
#include "RegisterCoalescer.h"
#include "Spiller.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PBQP/Graph.h"
+#include "llvm/CodeGen/PBQP/Math.h"
+#include "llvm/CodeGen/PBQP/Solution.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
#include <limits>
+#include <map>
#include <memory>
#include <queue>
#include <set>
#include <sstream>
+#include <string>
+#include <system_error>
+#include <tuple>
+#include <utility>
#include <vector>
using namespace llvm;
@@ -86,7 +115,6 @@ namespace {
/// Programming problems.
class RegAllocPBQP : public MachineFunctionPass {
public:
-
static char ID;
/// Construct a PBQP register allocator.
@@ -113,14 +141,13 @@ public:
}
private:
-
- typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
- typedef std::vector<const LiveInterval*> Node2LIMap;
- typedef std::vector<unsigned> AllowedSet;
- typedef std::vector<AllowedSet> AllowedSetMap;
- typedef std::pair<unsigned, unsigned> RegPair;
- typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap;
- typedef std::set<unsigned> RegSet;
+ using LI2NodeMap = std::map<const LiveInterval *, unsigned>;
+ using Node2LIMap = std::vector<const LiveInterval *>;
+ using AllowedSet = std::vector<unsigned>;
+ using AllowedSetMap = std::vector<AllowedSet>;
+ using RegPair = std::pair<unsigned, unsigned>;
+ using CoalesceMap = std::map<RegPair, PBQP::PBQPNum>;
+ using RegSet = std::set<unsigned>;
char *customPassID;
@@ -187,13 +214,12 @@ public:
/// @brief Add interference edges between overlapping vregs.
class Interference : public PBQPRAConstraint {
private:
-
- typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
- typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey;
- typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
- typedef DenseSet<IKey> DisjointAllowedRegsCache;
- typedef std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId> IEdgeKey;
- typedef DenseSet<IEdgeKey> IEdgeCache;
+ using AllowedRegVecPtr = const PBQP::RegAlloc::AllowedRegVector *;
+ using IKey = std::pair<AllowedRegVecPtr, AllowedRegVecPtr>;
+ using IMatrixCache = DenseMap<IKey, PBQPRAGraph::MatrixPtr>;
+ using DisjointAllowedRegsCache = DenseSet<IKey>;
+ using IEdgeKey = std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId>;
+ using IEdgeCache = DenseSet<IEdgeKey>;
bool haveDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
PBQPRAGraph::NodeId MId,
@@ -228,8 +254,8 @@ private:
// for the fast interference graph construction algorithm. The last is there
// to save us from looking up node ids via the VRegToNode map in the graph
// metadata.
- typedef std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>
- IntervalInfo;
+ using IntervalInfo =
+ std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>;
static SlotIndex getStartPoint(const IntervalInfo &I) {
return std::get<0>(I)->segments[std::get<1>(I)].start;
@@ -276,7 +302,6 @@ private:
}
public:
-
void apply(PBQPRAGraph &G) override {
// The following is loosely based on the linear scan algorithm introduced in
// "Linear Scan Register Allocation" by Poletto and Sarkar. This version
@@ -297,9 +322,10 @@ public:
// Cache known disjoint allowed registers pairs
DisjointAllowedRegsCache D;
- typedef std::set<IntervalInfo, decltype(&lowestEndPoint)> IntervalSet;
- typedef std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
- decltype(&lowestStartPoint)> IntervalQueue;
+ using IntervalSet = std::set<IntervalInfo, decltype(&lowestEndPoint)>;
+ using IntervalQueue =
+ std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
+ decltype(&lowestStartPoint)>;
IntervalSet Active(lowestEndPoint);
IntervalQueue Inactive(lowestStartPoint);
@@ -363,7 +389,6 @@ public:
}
private:
-
// Create an Interference edge and add it to the graph, unless it is
// a null matrix, meaning the nodes' allowed registers do not have any
// interference. This case occurs frequently between integer and floating
@@ -372,7 +397,6 @@ private:
bool createInterferenceEdge(PBQPRAGraph &G,
PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId,
IMatrixCache &C) {
-
const TargetRegisterInfo &TRI =
*G.getMetadata().MF.getSubtarget().getRegisterInfo();
const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
@@ -409,7 +433,6 @@ private:
}
};
-
class Coalescing : public PBQPRAConstraint {
public:
void apply(PBQPRAGraph &G) override {
@@ -421,7 +444,6 @@ public:
// gives the Ok.
for (const auto &MBB : MF) {
for (const auto &MI : MBB) {
-
// Skip not-coalescable or already coalesced copies.
if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())
continue;
@@ -479,7 +501,6 @@ public:
}
private:
-
void addVirtRegCoalesce(
PBQPRAGraph::RawMatrix &CostMat,
const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1,
@@ -496,14 +517,15 @@ private:
}
}
}
-
};
-} // End anonymous namespace.
+} // end anonymous namespace
// Out-of-line destructor/anchor for PBQPRAConstraint.
-PBQPRAConstraint::~PBQPRAConstraint() {}
+PBQPRAConstraint::~PBQPRAConstraint() = default;
+
void PBQPRAConstraint::anchor() {}
+
void PBQPRAConstraintList::anchor() {}
void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
@@ -554,7 +576,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,
const MachineFunction &MF) {
- const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs();
for (unsigned i = 0; CSR[i] != 0; ++i)
if (TRI.regsOverlap(reg, CSR[i]))
return true;
@@ -639,7 +661,6 @@ void RegAllocPBQP::spillVReg(unsigned VReg,
SmallVectorImpl<unsigned> &NewIntervals,
MachineFunction &MF, LiveIntervals &LIS,
VirtRegMap &VRM, Spiller &VRegSpiller) {
-
VRegsToAlloc.erase(VReg);
LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM,
nullptr, &DeadRemats);
@@ -717,7 +738,15 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
if (PReg == 0) {
const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg);
- PReg = RC.getRawAllocationOrder(MF).front();
+ const ArrayRef<MCPhysReg> RawPRegOrder = RC.getRawAllocationOrder(MF);
+ for (unsigned CandidateReg : RawPRegOrder) {
+ if (!VRM.getRegInfo().isReserved(CandidateReg)) {
+ PReg = CandidateReg;
+ break;
+ }
+ }
+ assert(PReg &&
+ "No un-reserved physical registers in this register class");
}
VRM.assignVirt2Phys(LI.reg, PReg);
@@ -777,7 +806,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
// If there are non-empty intervals allocate them using pbqp.
if (!VRegsToAlloc.empty()) {
-
const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
llvm::make_unique<PBQPRAConstraintList>();
@@ -840,7 +868,8 @@ static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId,
});
}
-void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
for (auto NId : nodeIds()) {
const Vector &Costs = getNodeCosts(NId);
assert(Costs.getLength() != 0 && "Empty vector in graph.");
@@ -861,7 +890,10 @@ void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
}
}
-LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const {
+ dump(dbgs());
+}
+#endif
void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
OS << "graph {\n";
@@ -892,5 +924,3 @@ FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) {
FunctionPass* llvm::createDefaultPBQPRegisterAllocator() {
return createPBQPRegisterAllocator();
}
-
-#undef DEBUG_TYPE
diff --git a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
index ece44c2..855aa37 100644
--- a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -103,9 +103,27 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Clobbered Registers: ");
- for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
- if (MRI->isPhysRegModified(PReg, true))
- RegMask[PReg / 32] &= ~(1u << PReg % 32);
+ const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask();
+ auto SetRegAsDefined = [&RegMask] (unsigned Reg) {
+ RegMask[Reg / 32] &= ~(1u << Reg % 32);
+ };
+ // Scan all the physical registers. When a register is defined in the
+ // current function, mark it and all its aliasing registers as defined in
+ // the regmask.
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ // If a register is in the UsedPhysRegsMask set, mark it as defined. All
+ // its aliases will also be in the set, so we can skip marking the aliases
+ // as defined here.
+ if (UsedPhysRegsMask.test(PReg)) {
+ SetRegAsDefined(PReg);
+ continue;
+ }
+ // If a register is defined by an instruction, mark it as defined together
+ // with all its aliases.
+ if (!MRI->def_empty(PReg)) {
+ for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI)
+ SetRegAsDefined(*AI);
+ }
+ }
if (!TargetFrameLowering::isSafeForNoCSROpt(F)) {
const uint32_t *CallPreservedMask =
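The SetRegAsDefined lambda above relies on LLVM's register-mask convention: one bit per physical register, packed 32 per uint32_t, with a set bit meaning "preserved" and a cleared bit meaning "clobbered". A self-contained sketch of that convention (names illustrative):

    // Hedged sketch of the regmask bit convention:
    static inline void markClobbered(uint32_t *RegMask, unsigned Reg) {
      RegMask[Reg / 32] &= ~(1u << (Reg % 32)); // cleared bit = clobbered
    }
    static inline bool isPreserved(const uint32_t *RegMask, unsigned Reg) {
      return (RegMask[Reg / 32] >> (Reg % 32)) & 1u; // set bit = preserved
    }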
diff --git a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 178fa18..956dec3 100644
--- a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===//
+//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,11 +15,21 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -29,8 +39,7 @@ static cl::opt<unsigned>
StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
cl::desc("Limit all regclasses to N registers"));
-RegisterClassInfo::RegisterClassInfo()
- : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {}
+RegisterClassInfo::RegisterClassInfo() = default;
void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
bool Update = false;
@@ -48,18 +57,20 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
// Does this MF have different CSRs?
assert(TRI && "no register info set");
- const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
- if (Update || CSR != CalleeSaved) {
- // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
+
+ // Get the callee saved registers.
+ const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs();
+ if (Update || CSR != CalleeSavedRegs) {
+ // Build the CalleeSavedAliases map. Every CSR alias maps to the last
+ // overlapping CSR.
- CSRNum.clear();
- CSRNum.resize(TRI->getNumRegs(), 0);
- for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+ CalleeSavedAliases.resize(TRI->getNumRegs(), 0);
+ for (const MCPhysReg *I = CSR; *I; ++I)
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI)
+ CalleeSavedAliases[*AI] = *I;
+
Update = true;
}
- CalleeSaved = CSR;
+ CalleeSavedRegs = CSR;
// Different reserved registers?
const BitVector &RR = MF->getRegInfo().getReservedRegs();
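An illustration of the CalleeSavedAliases change above (the x86-64 example is an assumption, not from the patch): where CSRNum stored a 1-based index of the last overlapping CSR, CalleeSavedAliases stores the CSR itself, so compute() below only needs a zero/non-zero test.

    // Illustrative (x86-64 SysV, where RBX is callee-saved):
    //   CalleeSavedAliases[X86::EBX] == X86::RBX
    //   CalleeSavedAliases[X86::BX]  == X86::RBX
    //   CalleeSavedAliases[X86::RAX] == 0   // RAX overlaps no CSR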
@@ -103,7 +114,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
unsigned Cost = TRI->getCostPerUse(PhysReg);
MinCost = std::min(MinCost, Cost);
- if (CSRNum[PhysReg])
+ if (CalleeSavedAliases[PhysReg])
// PhysReg aliases a CSR, save it for later.
CSRAlias.push_back(PhysReg);
else {
@@ -114,7 +125,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
}
}
RCI.NumRegs = N + CSRAlias.size();
- assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+ assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
// CSR aliases go after the volatile registers, preserve the target's order.
for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) {
@@ -156,9 +167,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
const TargetRegisterClass *RC = nullptr;
unsigned NumRCUnits = 0;
- for (TargetRegisterInfo::regclass_iterator
- RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) {
- const int *PSetID = TRI->getRegClassPressureSets(*RI);
+ for (const TargetRegisterClass *C : TRI->regclasses()) {
+ const int *PSetID = TRI->getRegClassPressureSets(C);
for (; *PSetID != -1; ++PSetID) {
if ((unsigned)*PSetID == Idx)
break;
@@ -168,9 +178,9 @@ unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
// Found a register class that counts against this pressure set.
// For efficiency, only compute the set order for the largest set.
- unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit;
+ unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit;
if (!RC || NUnits > NumRCUnits) {
- RC = *RI;
+ RC = C;
NumRCUnits = NUnits;
}
}
diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 4bb3c22..a67d07b 100644
--- a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
@@ -189,6 +190,9 @@ namespace {
/// This returns true if an interval was modified.
bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
+ /// We found a copy which can be moved to its less frequent predecessor.
+ bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI);
+
/// If the source of a copy is defined by a
/// trivial computation, replace the copy by rematerializing the definition.
bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
@@ -811,42 +815,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
assert(ASubValNo != nullptr);
- LaneBitmask AMask = SA.LaneMask;
- for (LiveInterval::SubRange &SB : IntB.subranges()) {
- LaneBitmask BMask = SB.LaneMask;
- LaneBitmask Common = BMask & AMask;
- if (Common.none())
- continue;
-
- DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask)
- << " into " << PrintLaneMask(Common) << '\n');
- LaneBitmask BRest = BMask & ~AMask;
- LiveInterval::SubRange *CommonRange;
- if (BRest.any()) {
- SB.LaneMask = BRest;
- DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest)
- << '\n');
- // Duplicate SubRange for newly merged common stuff.
- CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
- } else {
- // We van reuse the L SubRange.
- SB.LaneMask = Common;
- CommonRange = &SB;
- }
- LiveRange RangeCopy(SB, Allocator);
-
- VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
- assert(BSubValNo->def == CopyIdx);
- BSubValNo->def = ASubValNo->def;
- addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
- AMask &= ~BMask;
- }
- if (AMask.any()) {
- DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n');
- LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
- VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
- addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
- }
+ IntB.refineSubRanges(Allocator, SA.LaneMask,
+ [&Allocator,&SA,CopyIdx,ASubValNo](LiveInterval::SubRange &SR) {
+ VNInfo *BSubValNo = SR.empty()
+ ? SR.getNextValue(CopyIdx, Allocator)
+ : SR.getVNInfoAt(CopyIdx);
+ assert(BSubValNo != nullptr);
+ addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo);
+ });
}
}
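The lane-splitting logic deleted above now lives behind LiveInterval::refineSubRanges(): it refines IntB's subranges so each lane mask is either contained in or disjoint from SA.LaneMask and invokes the callback on every covered subrange, creating missing ones as needed; the callback only has to define or fetch the value number at CopyIdx and merge in the segments, as the lambda in the hunk does.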
@@ -861,6 +837,191 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
return true;
}
+/// For copy B = A in BB2, if A is defined by A = B in BB0 which is a
+/// predecessor of BB2, and if B is not redefined on the way from A = B
+/// in BB0 to B = A in BB2, B = A in BB2 is partially redundant if the
+/// execution goes through the path from BB0 to BB2. We may move B = A
+/// to the predecessor without such reversed copy.
+/// So we will transform the program from:
+/// BB0:
+/// A = B; BB1:
+/// ... ...
+/// / \ /
+/// BB2:
+/// ...
+/// B = A;
+///
+/// to:
+///
+/// BB0: BB1:
+/// A = B; ...
+/// ... B = A;
+/// / \ /
+/// BB2:
+/// ...
+///
+/// A special case is when BB0 and BB2 are the same BB which is the only
+/// BB in a loop:
+/// BB1:
+/// ...
+/// BB0/BB2: ----
+/// B = A; |
+/// ... |
+/// A = B; |
+/// |-------
+/// |
+/// We may hoist B = A from BB0/BB2 to BB1.
+///
+/// The major preconditions for correctness to remove such partial
+/// redundancy include:
+/// 1. A in B = A in BB2 is defined by a PHI in BB2, and one operand of
+/// the PHI is defined by the reversed copy A = B in BB0.
+/// 2. No B is referenced from the start of BB2 to B = A.
+/// 3. No B is defined from A = B to the end of BB0.
+/// 4. BB1 has only one successor.
+///
+/// 2 and 4 implicitly ensure B is not live at the end of BB1.
+/// 4 guarantees BB2 is hotter than BB1, so we can only move a copy to a
+/// colder place, which not only prevents an endless loop but also makes
+/// sure the movement of the copy is beneficial.
+bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
+ MachineInstr &CopyMI) {
+ assert(!CP.isPhys());
+ if (!CopyMI.isFullCopy())
+ return false;
+
+ MachineBasicBlock &MBB = *CopyMI.getParent();
+ if (MBB.isEHPad())
+ return false;
+
+ if (MBB.pred_size() != 2)
+ return false;
+
+ LiveInterval &IntA =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LiveInterval &IntB =
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+ // A is defined by PHI at the entry of MBB.
+ SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
+ VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx);
+ assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+ if (!AValNo->isPHIDef())
+ return false;
+
+ // No B is referenced before CopyMI in MBB.
+ if (IntB.overlaps(LIS->getMBBStartIdx(&MBB), CopyIdx))
+ return false;
+
+ // MBB has two predecessors: one contains A = B so no copy will be inserted
+ // for it. The other one will have a copy moved from MBB.
+ bool FoundReverseCopy = false;
+ MachineBasicBlock *CopyLeftBB = nullptr;
+ for (MachineBasicBlock *Pred : MBB.predecessors()) {
+ VNInfo *PVal = IntA.getVNInfoBefore(LIS->getMBBEndIdx(Pred));
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(PVal->def);
+ if (!DefMI || !DefMI->isFullCopy()) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ // Check DefMI is a reverse copy and it is in BB Pred.
+ if (DefMI->getOperand(0).getReg() != IntA.reg ||
+ DefMI->getOperand(1).getReg() != IntB.reg ||
+ DefMI->getParent() != Pred) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ // If there is any other def of B after DefMI and before the end of Pred,
+ // we need to keep the copy of B = A at the end of Pred if we remove
+ // B = A from MBB.
+ bool ValB_Changed = false;
+ for (auto VNI : IntB.valnos) {
+ if (VNI->isUnused())
+ continue;
+ if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) {
+ ValB_Changed = true;
+ break;
+ }
+ }
+ if (ValB_Changed) {
+ CopyLeftBB = Pred;
+ continue;
+ }
+ FoundReverseCopy = true;
+ }
+
+ // If no reverse copy is found in predecessors, nothing to do.
+ if (!FoundReverseCopy)
+ return false;
+
+ // If CopyLeftBB is nullptr, every predecessor of MBB contains a reverse
+ // copy, so CopyMI can be removed trivially once IntA/IntB are updated.
+ // If CopyLeftBB is not nullptr, move CopyMI from MBB to CopyLeftBB and
+ // update IntA/IntB.
+ //
+ // If CopyLeftBB is not nullptr, ensure CopyLeftBB has a single succ so
+ // MBB is hotter than CopyLeftBB.
+ if (CopyLeftBB && CopyLeftBB->succ_size() > 1)
+ return false;
+
+ // Now it is OK to move the copy.
+ if (CopyLeftBB) {
+ DEBUG(dbgs() << "\tremovePartialRedundancy: Move the copy to BB#"
+ << CopyLeftBB->getNumber() << '\t' << CopyMI);
+
+ // Insert new copy to CopyLeftBB.
+ auto InsPos = CopyLeftBB->getFirstTerminator();
+ MachineInstr *NewCopyMI = BuildMI(*CopyLeftBB, InsPos, CopyMI.getDebugLoc(),
+ TII->get(TargetOpcode::COPY), IntB.reg)
+ .addReg(IntA.reg);
+ SlotIndex NewCopyIdx =
+ LIS->InsertMachineInstrInMaps(*NewCopyMI).getRegSlot();
+ IntB.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+ for (LiveInterval::SubRange &SR : IntB.subranges())
+ SR.createDeadDef(NewCopyIdx, LIS->getVNInfoAllocator());
+
+ // If the newly created instruction reuses the address of an instruction
+ // that was deleted earlier (the object was recycled by the allocator), it
+ // needs to be removed from the erased list.
+ ErasedInstrs.erase(NewCopyMI);
+ } else {
+ DEBUG(dbgs() << "\tremovePartialRedundancy: Remove the copy from BB#"
+ << MBB.getNumber() << '\t' << CopyMI);
+ }
+
+ // Remove CopyMI.
+ // Note: it is fine to remove the copy before updating the live-ranges.
+ // While updating the live-ranges, we only look at slot indices and
+ // never go back to the instruction.
+ LIS->RemoveMachineInstrFromMaps(CopyMI);
+ // Mark instructions as deleted.
+ ErasedInstrs.insert(&CopyMI);
+ CopyMI.eraseFromParent();
+
+ // Update the liveness.
+ SmallVector<SlotIndex, 8> EndPoints;
+ VNInfo *BValNo = IntB.Query(CopyIdx).valueOutOrDead();
+ LIS->pruneValue(*static_cast<LiveRange *>(&IntB), CopyIdx.getRegSlot(),
+ &EndPoints);
+ BValNo->markUnused();
+ // Extend IntB to the EndPoints of its original live interval.
+ LIS->extendToIndices(IntB, EndPoints);
+
+ // Now, do the same for its subranges.
+ for (LiveInterval::SubRange &SR : IntB.subranges()) {
+ EndPoints.clear();
+ VNInfo *BValNo = SR.Query(CopyIdx).valueOutOrDead();
+ assert(BValNo && "All sublanes should be live");
+ LIS->pruneValue(SR, CopyIdx.getRegSlot(), &EndPoints);
+ BValNo->markUnused();
+ LIS->extendToIndices(SR, EndPoints);
+ }
+
+ // Finally, update the live-range of IntA.
+ shrinkToUses(&IntA);
+ return true;
+}
+
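As a reading aid, here is a minimal standalone C++ sketch of the decision the predecessor loop above implements. Block, Copy, and the register names are invented stand-ins, not LLVM types:

    #include <string>
    #include <vector>

    struct Copy { std::string Dst, Src; };
    struct Block { std::vector<Copy> Copies; unsigned NumSuccs = 1; };

    enum class PRAction { None, RemoveTrivially, HoistInto };

    // Classify the two predecessors of MBB: find the one lacking the reverse
    // copy A = B, then apply the "only move to a colder block" check.
    static PRAction classify(const std::vector<Block *> &Preds,
                             Block *&CopyLeftBB) {
      CopyLeftBB = nullptr;
      bool FoundReverseCopy = false;
      for (Block *P : Preds) {
        bool HasReverse = false;
        for (const Copy &C : P->Copies)
          HasReverse |= (C.Dst == "A" && C.Src == "B");
        if (HasReverse)
          FoundReverseCopy = true;
        else
          CopyLeftBB = P;                 // would receive the hoisted B = A
      }
      if (!FoundReverseCopy)
        return PRAction::None;            // no partial redundancy to remove
      if (!CopyLeftBB)
        return PRAction::RemoveTrivially; // every predecessor has A = B
      if (CopyLeftBB->NumSuccs > 1)
        return PRAction::None;            // CopyLeftBB might be hotter
      return PRAction::HoistInto;
    }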
/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
/// defining a subregister.
static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
@@ -1066,6 +1227,34 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
SR->createDeadDef(DefIndex, Alloc);
}
}
+
+ // Make sure that the subrange for the resultant undef is removed.
+ // For example:
+ // vreg1:sub1<def,read-undef> = LOAD CONSTANT 1
+ // vreg2<def> = COPY vreg1
+ // ==>
+ // vreg2:sub1<def, read-undef> = LOAD CONSTANT 1
+ // ; Correct but need to remove the subrange for vreg2:sub0
+ // ; as it is now undef
+ if (NewIdx != 0 && DstInt.hasSubRanges()) {
+ // The affected subregister segments can be removed.
+ SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
+ LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(NewIdx);
+ bool UpdatedSubRanges = false;
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ if ((SR.LaneMask & DstMask).none()) {
+ DEBUG(dbgs() << "Removing undefined SubRange "
+ << PrintLaneMask(SR.LaneMask) << " : " << SR << "\n");
+ // Remove any segments in this SubRange defined by the value at CurrIdx.
+ if (VNInfo *RmValNo = SR.getVNInfoAt(CurrIdx.getRegSlot())) {
+ SR.removeValNo(RmValNo);
+ UpdatedSubRanges = true;
+ }
+ }
+ }
+ if (UpdatedSubRanges)
+ DstInt.removeEmptySubRanges();
+ }
} else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
// The New instruction may be defining a sub-register of what's actually
// been asked for. If so it must implicitly define the whole thing.
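The (SR.LaneMask & DstMask).none() test in the new block keeps only the subranges that overlap the lanes the rematerialized def actually writes; disjoint subranges are now undef and get dropped. A hedged sketch, with a plain integer standing in for LaneBitmask:

    #include <cstdint>

    using Mask = std::uint32_t; // illustrative stand-in for llvm::LaneBitmask

    // True if the subrange covers none of the lanes the narrow def writes,
    // i.e. the subrange's lanes are undef after rematerialization.
    static bool subRangeBecomesUndef(Mask SubRangeLanes, Mask DefLanes) {
      return (SubRangeLanes & DefLanes) == 0;
    }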
@@ -1290,7 +1479,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
// If SrcReg wasn't read, it may still be the case that DstReg is live-in
// because SrcReg is a sub-register.
- if (DstInt && !Reads && SubIdx)
+ if (DstInt && !Reads && SubIdx && !UseMI->isDebugValue())
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
// Replace SrcReg with DstReg in all UseMI operands.
@@ -1486,6 +1675,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
}
}
+ // Try and see if we can partially eliminate the copy by moving the copy to
+ // its predecessor.
+ if (!CP.isPartial() && !CP.isPhys())
+ if (removePartialRedundancy(CP, *CopyMI))
+ return true;
+
// Otherwise, we are unable to join the intervals.
DEBUG(dbgs() << "\tInterference!\n");
Again = true; // May be possible to coalesce later.
@@ -1583,6 +1778,14 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
return false;
}
}
+
+ // We must also check for overlaps with regmask clobbers.
+ BitVector RegMaskUsable;
+ if (LIS->checkRegMaskInterference(RHS, RegMaskUsable) &&
+ !RegMaskUsable.test(DstReg)) {
+ DEBUG(dbgs() << "\t\tRegMask interference\n");
+ return false;
+ }
}
// Skip any value computations, we are not adding new values to the
@@ -1636,14 +1839,6 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
return false;
}
-
- // We must also check for clobbers caused by regmasks.
- for (const auto &MO : MI->operands()) {
- if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) {
- DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI);
- return false;
- }
- }
}
}
@@ -2506,11 +2701,17 @@ void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
// Look for values being erased.
bool DidPrune = false;
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
- if (Vals[i].Resolution != CR_Erase)
+ // We should trigger in all cases in which eraseInstrs() does something;
+ // the condition below has to match what eraseInstrs() checks.
+ if (Vals[i].Resolution != CR_Erase &&
+ (Vals[i].Resolution != CR_Keep || !Vals[i].ErasableImplicitDef ||
+ !Vals[i].Pruned))
continue;
// Check subranges at the point where the copy will be removed.
SlotIndex Def = LR.getValNumInfo(i)->def;
+ // Print a message so that mismatches with eraseInstrs() can be diagnosed.
+ DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def << '\n');
for (LiveInterval::SubRange &S : LI.subranges()) {
LiveQueryResult Q = S.Query(Def);
@@ -2738,39 +2939,16 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
LaneBitmask LaneMask,
CoalescerPair &CP) {
BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
- for (LiveInterval::SubRange &R : LI.subranges()) {
- LaneBitmask RMask = R.LaneMask;
- // LaneMask of subregisters common to subrange R and ToMerge.
- LaneBitmask Common = RMask & LaneMask;
- // There is nothing to do without common subregs.
- if (Common.none())
- continue;
-
- DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into "
- << PrintLaneMask(Common) << '\n');
- // LaneMask of subregisters contained in the R range but not in ToMerge,
- // they have to split into their own subrange.
- LaneBitmask LRest = RMask & ~LaneMask;
- LiveInterval::SubRange *CommonRange;
- if (LRest.any()) {
- R.LaneMask = LRest;
- DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n');
- // Duplicate SubRange for newly merged common stuff.
- CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
+ LI.refineSubRanges(Allocator, LaneMask,
+ [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) {
+ if (SR.empty()) {
+ SR.assign(ToMerge, Allocator);
} else {
- // Reuse the existing range.
- R.LaneMask = Common;
- CommonRange = &R;
+ // joinSubRegRanges() destroys the merged range, so we need a copy.
+ LiveRange RangeCopy(ToMerge, Allocator);
+ joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP);
}
- LiveRange RangeCopy(ToMerge, Allocator);
- joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
- LaneMask &= ~RMask;
- }
-
- if (LaneMask.any()) {
- DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n');
- LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
- }
+ });
}
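refineSubRanges() now owns the common/rest mask splitting that used to be open-coded here. The sketch below models only the lane-mask bookkeeping (no liveness is copied); the types and signature are invented for illustration:

    #include <cstdint>
    #include <functional>
    #include <vector>

    struct SubRange { std::uint32_t LaneMask; };

    static void refineSubRangesSketch(
        std::vector<SubRange> &SubRanges, std::uint32_t Mask,
        const std::function<void(SubRange &)> &Apply) {
      std::vector<SubRange> NewRanges;
      for (SubRange &SR : SubRanges) {
        std::uint32_t Common = SR.LaneMask & Mask;
        if (!Common)
          continue;                  // nothing to do without common lanes
        Mask &= ~SR.LaneMask;        // these lanes are handled now
        std::uint32_t Rest = SR.LaneMask & ~Common;
        if (Rest) {                  // split off the untouched lanes
          SR.LaneMask = Rest;
          NewRanges.push_back(SubRange{Common});
          Apply(NewRanges.back());
        } else {
          Apply(SR);                 // subrange lies fully inside Mask
        }
      }
      if (Mask) {                    // lanes not covered by any subrange yet
        NewRanges.push_back(SubRange{Mask});
        Apply(NewRanges.back());
      }
      SubRanges.insert(SubRanges.end(), NewRanges.begin(), NewRanges.end());
    }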
bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
@@ -2952,7 +3130,7 @@ copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
continue;
// Skip instruction pointers that have already been erased, for example by
// dead code elimination.
- if (ErasedInstrs.erase(CurrList[i])) {
+ if (ErasedInstrs.count(CurrList[i])) {
CurrList[i] = nullptr;
continue;
}
@@ -3077,7 +3255,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
CurrList(WorkList.begin() + PrevSize, WorkList.end());
if (copyCoalesceWorkList(CurrList))
WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
- (MachineInstr*)nullptr), WorkList.end());
+ nullptr), WorkList.end());
}
void RegisterCoalescer::coalesceLocals() {
diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
index fc84aeb..88e0a3b 100644
--- a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===//
+//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,12 +13,36 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -52,6 +76,7 @@ static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
const TargetRegisterInfo *TRI) {
@@ -97,6 +122,7 @@ void RegPressureTracker::dump() const {
P.dump(TRI);
}
+LLVM_DUMP_METHOD
void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
const char *sep = "";
for (const PressureChange &Change : *this) {
@@ -108,6 +134,7 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
}
dbgs() << '\n';
}
+#endif
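This hunk applies LLVM's usual guard for dump helpers: compile them only in asserts builds or when LLVM_ENABLE_DUMP is defined, so release binaries carry no dead printing code. The general shape, with MyType as a placeholder:

    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD void MyType::dump() const {
      // ... print internal state to dbgs() for debugger use ...
    }
    #endif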
void RegPressureTracker::increaseRegPressure(unsigned RegUnit,
LaneBitmask PreviousMask,
@@ -264,7 +291,6 @@ bool RegPressureTracker::isBottomClosed() const {
MachineBasicBlock::const_iterator());
}
-
SlotIndex RegPressureTracker::getCurrSlot() const {
MachineBasicBlock::const_iterator IdxPos =
skipDebugInstructionsForward(CurrPos, MBB->end());
@@ -328,7 +354,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
unsigned RegUnit) {
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I == RegUnits.end())
@@ -340,7 +366,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
RegisterMaskPair Pair) {
unsigned RegUnit = Pair.RegUnit;
assert(Pair.LaneMask.any());
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I == RegUnits.end()) {
@@ -352,7 +378,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,
unsigned RegUnit) {
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I == RegUnits.end()) {
@@ -366,7 +392,7 @@ static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
RegisterMaskPair Pair) {
unsigned RegUnit = Pair.RegUnit;
assert(Pair.LaneMask.any());
- auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+ auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
return Other.RegUnit == RegUnit;
});
if (I != RegUnits.end()) {
@@ -423,6 +449,8 @@ namespace {
///
/// FIXME: always ignore tied opers
class RegisterOperandsCollector {
+ friend class llvm::RegisterOperands;
+
RegisterOperands &RegOpers;
const TargetRegisterInfo &TRI;
const MachineRegisterInfo &MRI;
@@ -517,11 +545,9 @@ class RegisterOperandsCollector {
addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll()));
}
}
-
- friend class llvm::RegisterOperands;
};
-} // namespace
+} // end anonymous namespace
void RegisterOperands::collect(const MachineInstr &MI,
const TargetRegisterInfo &TRI,
@@ -674,7 +700,7 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,
assert(Pair.LaneMask.any());
unsigned RegUnit = Pair.RegUnit;
- auto I = find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
+ auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
return Other.RegUnit == RegUnit;
});
LaneBitmask PrevMask;
@@ -772,9 +798,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
if (!TrackLaneMasks) {
addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
} else {
- auto I = find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
- return Other.RegUnit == Reg;
- });
+ auto I =
+ llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
+ return Other.RegUnit == Reg;
+ });
bool IsRedef = I != LiveUses->end();
if (IsRedef) {
// ignore re-defs here...
@@ -1154,7 +1181,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
- if (CritInc > 0 && CritInc <= INT16_MAX) {
+ if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
Delta.CriticalMax = PressureChange(PSetID);
Delta.CriticalMax.setUnitInc(CritInc);
}
diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
index fdf741f..fc5105a 100644
--- a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//===- RegisterScavenging.cpp - Machine register scavenging ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,27 +16,39 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/RegisterScavenging.h"
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <string>
+
using namespace llvm;
#define DEBUG_TYPE "reg-scavenging"
+STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
+
void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) {
- for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
- LaneBitmask UnitMask = (*RUI).second;
- if (UnitMask.none() || (LaneMask & UnitMask).any())
- RegUnitsAvailable.reset((*RUI).first);
- }
+ LiveUnits.addRegMasked(Reg, LaneMask);
}
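The scavenger now delegates its bookkeeping to the LiveRegUnits helper instead of iterating register units by hand. A sketch of how this file uses that API (walkBlockBackwards is an invented name, shown under the assumption that the usual CodeGen headers are available):

    #include "llvm/CodeGen/LiveRegUnits.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    using namespace llvm;

    static void walkBlockBackwards(const TargetRegisterInfo &TRI,
                                   MachineBasicBlock &MBB) {
      LiveRegUnits Units(TRI);
      Units.addLiveOuts(MBB);        // seed with the block's live-out set
      for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I)
        Units.stepBackward(*I);      // retire defs, revive uses
      // Units now approximates the live-in set of MBB.
    }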
void RegScavenger::init(MachineBasicBlock &MBB) {
@@ -44,6 +56,7 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
+ LiveUnits.init(*TRI);
assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) &&
"Target changed?");
@@ -51,45 +64,28 @@ void RegScavenger::init(MachineBasicBlock &MBB) {
// Self-initialize.
if (!this->MBB) {
NumRegUnits = TRI->getNumRegUnits();
- RegUnitsAvailable.resize(NumRegUnits);
KillRegUnits.resize(NumRegUnits);
DefRegUnits.resize(NumRegUnits);
TmpRegUnits.resize(NumRegUnits);
}
this->MBB = &MBB;
- for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
- IE = Scavenged.end(); I != IE; ++I) {
- I->Reg = 0;
- I->Restore = nullptr;
+ for (ScavengedInfo &SI : Scavenged) {
+ SI.Reg = 0;
+ SI.Restore = nullptr;
}
- // All register units start out unused.
- RegUnitsAvailable.set();
-
- // Pristine CSRs are not available.
- BitVector PR = MF.getFrameInfo().getPristineRegs(MF);
- for (int I = PR.find_first(); I>0; I = PR.find_next(I))
- setRegUsed(I);
-
Tracking = false;
}
-void RegScavenger::setLiveInsUsed(const MachineBasicBlock &MBB) {
- for (const auto &LI : MBB.liveins())
- setRegUsed(LI.PhysReg, LI.LaneMask);
-}
-
void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) {
init(MBB);
- setLiveInsUsed(MBB);
+ LiveUnits.addLiveIns(MBB);
}
void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) {
init(MBB);
- // Merge live-ins of successors to get live-outs.
- for (const MachineBasicBlock *Succ : MBB.successors())
- setLiveInsUsed(*Succ);
+ LiveUnits.addLiveOuts(MBB);
// Move internal iterator at the last instruction of the block.
if (MBB.begin() != MBB.end()) {
@@ -263,34 +259,13 @@ void RegScavenger::backward() {
assert(Tracking && "Must be tracking to determine kills and defs");
const MachineInstr &MI = *MBBI;
- // Defined or clobbered registers are available now.
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isRegMask()) {
- for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd;
- ++RU) {
- for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) {
- if (MO.clobbersPhysReg(*RURI)) {
- RegUnitsAvailable.set(RU);
- break;
- }
- }
- }
- } else if (MO.isReg() && MO.isDef()) {
- unsigned Reg = MO.getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
- isReserved(Reg))
- continue;
- addRegUnits(RegUnitsAvailable, Reg);
- }
- }
- // Mark read registers as unavailable.
- for (const MachineOperand &MO : MI.uses()) {
- if (MO.isReg() && MO.readsReg()) {
- unsigned Reg = MO.getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
- isReserved(Reg))
- continue;
- removeRegUnits(RegUnitsAvailable, Reg);
+ LiveUnits.stepBackward(MI);
+
+ // Expire scavenge spill frameindex uses.
+ for (ScavengedInfo &I : Scavenged) {
+ if (I.Restore == &MI) {
+ I.Reg = 0;
+ I.Restore = nullptr;
}
}
@@ -302,12 +277,9 @@ void RegScavenger::backward() {
}
bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
- if (includeReserved && isReserved(Reg))
- return true;
- for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
- if (!RegUnitsAvailable.test(*RUI))
- return true;
- return false;
+ if (isReserved(Reg))
+ return includeReserved;
+ return !LiveUnits.available(Reg);
}
unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
@@ -393,6 +365,86 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
return Survivor;
}
+/// Given the live register units \p LiveOut at position \p From, search
+/// backwards for a register in \p AllocationOrder that is not used or
+/// clobbered up to position \p To. If there are multiple candidates,
+/// continue searching and pick the one that stays unused/unclobbered for
+/// the longest time.
+/// Returns the register and the earliest position we know it to be free, or
+/// the position MBB.end() if no register is available.
+static std::pair<MCPhysReg, MachineBasicBlock::iterator>
+findSurvivorBackwards(const MachineRegisterInfo &MRI,
+ MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
+ const LiveRegUnits &LiveOut, ArrayRef<MCPhysReg> AllocationOrder,
+ bool RestoreAfter) {
+ bool FoundTo = false;
+ MCPhysReg Survivor = 0;
+ MachineBasicBlock::iterator Pos;
+ MachineBasicBlock &MBB = *From->getParent();
+ unsigned InstrLimit = 25;
+ unsigned InstrCountDown = InstrLimit;
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ LiveRegUnits Used(TRI);
+
+ for (MachineBasicBlock::iterator I = From;; --I) {
+ const MachineInstr &MI = *I;
+
+ Used.accumulate(MI);
+
+ if (I == To) {
+ // See if one of the registers in AllocationOrder was not used so far.
+ for (MCPhysReg Reg : AllocationOrder) {
+ if (!MRI.isReserved(Reg) && Used.available(Reg) &&
+ LiveOut.available(Reg))
+ return std::make_pair(Reg, MBB.end());
+ }
+ // Otherwise we will continue up to InstrLimit instructions to find
+ // the register which is not defined/used for the longest time.
+ FoundTo = true;
+ Pos = To;
+ // Note: It was fine so far to start our search at From; however, now
+ // that we have to spill and can only place the restore after From,
+ // add the regs used/defined by std::next(From) to the set.
+ if (RestoreAfter)
+ Used.accumulate(*std::next(From));
+ }
+ if (FoundTo) {
+ if (Survivor == 0 || !Used.available(Survivor)) {
+ MCPhysReg AvailableReg = 0;
+ for (MCPhysReg Reg : AllocationOrder) {
+ if (!MRI.isReserved(Reg) && Used.available(Reg)) {
+ AvailableReg = Reg;
+ break;
+ }
+ }
+ if (AvailableReg == 0)
+ break;
+ Survivor = AvailableReg;
+ }
+ if (--InstrCountDown == 0)
+ break;
+
+ // Keep searching when we find a vreg, since the spilled register will
+ // also be useful for that other vreg later.
+ bool FoundVReg = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ FoundVReg = true;
+ break;
+ }
+ }
+ if (FoundVReg) {
+ InstrCountDown = InstrLimit;
+ Pos = I;
+ }
+ if (I == MBB.begin())
+ break;
+ }
+ }
+
+ return std::make_pair(Survivor, Pos);
+}
+
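A toy model (invented; not the real search) of the InstrLimit countdown above: scanning backwards, give up 25 instructions after the last sighting of a virtual register, and remember that sighting as the candidate spill position:

    #include <vector>

    // Returns the index of the best spill position, or the size of the block
    // (an "end" sentinel) if no vreg was seen within the window.
    static int pickSpillPos(const std::vector<bool> &InstrUsesVReg) {
      const unsigned Limit = 25;
      unsigned CountDown = Limit;
      int Pos = (int)InstrUsesVReg.size();
      for (int I = (int)InstrUsesVReg.size() - 1; I >= 0; --I) {
        if (--CountDown == 0)
          break;                 // searched far enough past the last vreg
        if (InstrUsesVReg[I]) {
          CountDown = Limit;     // restart: a spill placed here helps I too
          Pos = I;
        }
      }
      return Pos;
    }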
static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
unsigned i = 0;
while (!MI.getOperand(i).isFI()) {
@@ -402,46 +454,18 @@ static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
return i;
}
-unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
- MachineBasicBlock::iterator I,
- int SPAdj) {
- MachineInstr &MI = *I;
- const MachineFunction &MF = *MI.getParent()->getParent();
- // Consider all allocatable registers in the register class initially
- BitVector Candidates = TRI->getAllocatableSet(MF, RC);
-
- // Exclude all the registers being used by the instruction.
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) &&
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
- Candidates.reset(*AI);
- }
-
- // Try to find a register that's unused if there is one, as then we won't
- // have to spill.
- BitVector Available = getRegsAvailable(RC);
- Available &= Candidates;
- if (Available.any())
- Candidates = Available;
-
- // Find the register whose use is furthest away.
- MachineBasicBlock::iterator UseMI;
- unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
-
- // If we found an unused register there is no reason to spill it.
- if (!isRegUsed(SReg)) {
- DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n");
- return SReg;
- }
-
+RegScavenger::ScavengedInfo &
+RegScavenger::spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj,
+ MachineBasicBlock::iterator Before,
+ MachineBasicBlock::iterator &UseMI) {
// Find an available scavenging slot with size and alignment matching
// the requirements of the class RC.
+ const MachineFunction &MF = *Before->getParent()->getParent();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned NeedSize = RC->getSize();
- unsigned NeedAlign = RC->getAlignment();
+ unsigned NeedSize = TRI->getSpillSize(RC);
+ unsigned NeedAlign = TRI->getSpillAlignment(RC);
- unsigned SI = Scavenged.size(), Diff = UINT_MAX;
+ unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();
int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
for (unsigned I = 0; I < Scavenged.size(); ++I) {
if (Scavenged[I].Reg != 0)
@@ -474,42 +498,303 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
}
// Avoid infinite regress
- Scavenged[SI].Reg = SReg;
+ Scavenged[SI].Reg = Reg;
// If the target knows how to save/restore the register, let it do so;
// otherwise, use the emergency stack spill slot.
- if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
- // Spill the scavenged register before I.
+ if (!TRI->saveScavengerRegister(*MBB, Before, UseMI, &RC, Reg)) {
+ // Spill the scavenged register before \p Before.
int FI = Scavenged[SI].FrameIndex;
if (FI < FIB || FI >= FIE) {
std::string Msg = std::string("Error while trying to spill ") +
- TRI->getName(SReg) + " from class " + TRI->getRegClassName(RC) +
+ TRI->getName(Reg) + " from class " + TRI->getRegClassName(&RC) +
": Cannot scavenge register without an emergency spill slot!";
report_fatal_error(Msg.c_str());
}
- TII->storeRegToStackSlot(*MBB, I, SReg, true, Scavenged[SI].FrameIndex,
- RC, TRI);
- MachineBasicBlock::iterator II = std::prev(I);
+ TII->storeRegToStackSlot(*MBB, Before, Reg, true, Scavenged[SI].FrameIndex,
+ &RC, TRI);
+ MachineBasicBlock::iterator II = std::prev(Before);
unsigned FIOperandNum = getFrameIndexOperandNum(*II);
TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
// Restore the scavenged register before its use (or first terminator).
- TII->loadRegFromStackSlot(*MBB, UseMI, SReg, Scavenged[SI].FrameIndex,
- RC, TRI);
+ TII->loadRegFromStackSlot(*MBB, UseMI, Reg, Scavenged[SI].FrameIndex,
+ &RC, TRI);
II = std::prev(UseMI);
FIOperandNum = getFrameIndexOperandNum(*II);
TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
}
+ return Scavenged[SI];
+}
- Scavenged[SI].Restore = &*std::prev(UseMI);
+unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
+ MachineBasicBlock::iterator I,
+ int SPAdj) {
+ MachineInstr &MI = *I;
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ // Consider all allocatable registers in the register class initially
+ BitVector Candidates = TRI->getAllocatableSet(MF, RC);
+
+ // Exclude all the registers being used by the instruction.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) &&
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
+ Candidates.reset(*AI);
+ }
- // Doing this here leads to infinite regress.
- // Scavenged[SI].Reg = SReg;
+ // Try to find a register that's unused if there is one, as then we won't
+ // have to spill.
+ BitVector Available = getRegsAvailable(RC);
+ Available &= Candidates;
+ if (Available.any())
+ Candidates = Available;
+
+ // Find the register whose use is furthest away.
+ MachineBasicBlock::iterator UseMI;
+ unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
+
+ // If we found an unused register there is no reason to spill it.
+ if (!isRegUsed(SReg)) {
+ DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n");
+ return SReg;
+ }
+
+ ScavengedInfo &Scavenged = spill(SReg, *RC, SPAdj, I, UseMI);
+ Scavenged.Restore = &*std::prev(UseMI);
DEBUG(dbgs() << "Scavenged register (with spill): " << TRI->getName(SReg) <<
"\n");
return SReg;
}
+
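For context, this forward entry point is typically reached from a target's eliminateFrameIndex() when it needs a scratch register; a hypothetical call site (materializeScratch is an invented name):

    #include "llvm/CodeGen/RegisterScavenging.h"
    using namespace llvm;

    static unsigned materializeScratch(RegScavenger &RS,
                                       const TargetRegisterClass &RC,
                                       MachineBasicBlock::iterator II,
                                       int SPAdj) {
      // May insert an emergency spill before II and a reload before the
      // scavenged register's next use if nothing is free.
      return RS.scavengeRegister(&RC, II, SPAdj);
    }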
+unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC,
+ MachineBasicBlock::iterator To,
+ bool RestoreAfter, int SPAdj) {
+ const MachineBasicBlock &MBB = *To->getParent();
+ const MachineFunction &MF = *MBB.getParent();
+
+ // Find the register whose use is furthest away.
+ MachineBasicBlock::iterator UseMI;
+ ArrayRef<MCPhysReg> AllocationOrder = RC.getRawAllocationOrder(MF);
+ std::pair<MCPhysReg, MachineBasicBlock::iterator> P =
+ findSurvivorBackwards(*MRI, MBBI, To, LiveUnits, AllocationOrder,
+ RestoreAfter);
+ MCPhysReg Reg = P.first;
+ MachineBasicBlock::iterator SpillBefore = P.second;
+ assert(Reg != 0 && "No register left to scavenge!");
+ // Found an available register?
+ if (SpillBefore != MBB.end()) {
+ MachineBasicBlock::iterator ReloadAfter =
+ RestoreAfter ? std::next(MBBI) : MBBI;
+ MachineBasicBlock::iterator ReloadBefore = std::next(ReloadAfter);
+ DEBUG(dbgs() << "Reload before: " << *ReloadBefore << '\n');
+ ScavengedInfo &Scavenged = spill(Reg, RC, SPAdj, SpillBefore, ReloadBefore);
+ Scavenged.Restore = &*std::prev(SpillBefore);
+ LiveUnits.removeReg(Reg);
+ DEBUG(dbgs() << "Scavenged register with spill: " << PrintReg(Reg, TRI)
+ << " until " << *SpillBefore);
+ } else {
+ DEBUG(dbgs() << "Scavenged free register: " << PrintReg(Reg, TRI) << '\n');
+ }
+ return Reg;
+}
+
+/// Allocate a register for the virtual register \p VReg. The last use of
+/// \p VReg is around the current position of the register scavenger \p RS.
+/// \p ReserveAfter controls whether the scavenged register needs to be reserved
+/// after the current instruction, otherwise it will only be reserved before the
+/// current instruction.
+static unsigned scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS,
+ unsigned VReg, bool ReserveAfter) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+#ifndef NDEBUG
+ // Verify that all definitions and uses are in the same basic block.
+ const MachineBasicBlock *CommonMBB = nullptr;
+ // Real definition for the reg, re-definitions are not considered.
+ const MachineInstr *RealDef = nullptr;
+ for (MachineOperand &MO : MRI.reg_nodbg_operands(VReg)) {
+ MachineBasicBlock *MBB = MO.getParent()->getParent();
+ if (CommonMBB == nullptr)
+ CommonMBB = MBB;
+ assert(MBB == CommonMBB && "All defs+uses must be in the same basic block");
+ if (MO.isDef()) {
+ const MachineInstr &MI = *MO.getParent();
+ if (!MI.readsRegister(VReg, &TRI)) {
+ assert((!RealDef || RealDef == &MI) &&
+ "Can have at most one definition which is not a redefinition");
+ RealDef = &MI;
+ }
+ }
+ }
+ assert(RealDef != nullptr && "Must have at least 1 Def");
+#endif
+
+ // We should only have one definition of the register. However, to
+ // accommodate the requirements of two-address code we also allow
+ // definitions in subsequent instructions provided they also read the
+ // register. That way we get a single contiguous lifetime.
+ //
+ // Definitions in MRI.def_begin() are unordered, search for the first.
+ MachineRegisterInfo::def_iterator FirstDef =
+ std::find_if(MRI.def_begin(VReg), MRI.def_end(),
+ [VReg, &TRI](const MachineOperand &MO) {
+ return !MO.getParent()->readsRegister(VReg, &TRI);
+ });
+ assert(FirstDef != MRI.def_end() &&
+ "Must have one definition that does not redefine vreg");
+ MachineInstr &DefMI = *FirstDef->getParent();
+
+ // The register scavenger will report a free register inserting an emergency
+ // spill/reload if necessary.
+ int SPAdj = 0;
+ const TargetRegisterClass &RC = *MRI.getRegClass(VReg);
+ unsigned SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(),
+ ReserveAfter, SPAdj);
+ MRI.replaceRegWith(VReg, SReg);
+ ++NumScavengedRegs;
+ return SReg;
+}
+
+/// Allocate (scavenge) vregs inside a single basic block.
+/// Returns true if the target spill callback created new vregs and a 2nd pass
+/// is necessary.
+static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI,
+ RegScavenger &RS,
+ MachineBasicBlock &MBB) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ RS.enterBasicBlockEnd(MBB);
+
+ unsigned InitialNumVirtRegs = MRI.getNumVirtRegs();
+ bool NextInstructionReadsVReg = false;
+ for (MachineBasicBlock::iterator I = MBB.end(); I != MBB.begin(); ) {
+ --I;
+ // Move RegScavenger to the position between *I and *std::next(I).
+ RS.backward(I);
+
+ // Look for unassigned vregs in the uses of *std::next(I).
+ if (NextInstructionReadsVReg) {
+ MachineBasicBlock::iterator N = std::next(I);
+ const MachineInstr &NMI = *N;
+ for (const MachineOperand &MO : NMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ // We only care about virtual registers and ignore virtual registers
+ // created by the target callbacks in the process (those will be handled
+ // in a scavenging round).
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ continue;
+ if (!MO.readsReg())
+ continue;
+
+ unsigned SReg = scavengeVReg(MRI, RS, Reg, true);
+ N->addRegisterKilled(SReg, &TRI, false);
+ RS.setRegUsed(SReg);
+ }
+ }
+
+ // Look for unassigned vregs in the defs of *I.
+ NextInstructionReadsVReg = false;
+ const MachineInstr &MI = *I;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ // Only vregs, no newly created vregs (see above).
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ continue;
+ // We have to look at all operands anyway, so we can precalculate here
+ // whether there is a reading operand. This allows us to skip the use
+ // step in the next iteration if there was none.
+ assert(!MO.isInternalRead() && "Cannot assign inside bundles");
+ assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses");
+ if (MO.readsReg()) {
+ NextInstructionReadsVReg = true;
+ }
+ if (MO.isDef()) {
+ unsigned SReg = scavengeVReg(MRI, RS, Reg, false);
+ I->addRegisterDead(SReg, &TRI, false);
+ }
+ }
+ }
+#ifndef NDEBUG
+ for (const MachineOperand &MO : MBB.front().operands()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ assert(!MO.isInternalRead() && "Cannot assign inside bundles");
+ assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses");
+ assert(!MO.readsReg() && "Vreg use in first instruction not allowed");
+ }
+#endif
+
+ return MRI.getNumVirtRegs() != InitialNumVirtRegs;
+}
+
+void llvm::scavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger &RS) {
+ // FIXME: Iterating over the instruction stream is unnecessary. We can simply
+ // iterate over the vreg use list, which at this point only contains machine
+ // operands for which eliminateFrameIndex needs a new scratch reg.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Shortcut.
+ if (MRI.getNumVirtRegs() == 0) {
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+ return;
+ }
+
+ // Run through the instructions and find any virtual registers.
+ for (MachineBasicBlock &MBB : MF) {
+ if (MBB.empty())
+ continue;
+
+ bool Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
+ if (Again) {
+ DEBUG(dbgs() << "Warning: Required two scavenging passes for block "
+ << MBB.getName() << '\n');
+ Again = scavengeFrameVirtualRegsInBlock(MRI, RS, MBB);
+ // The target required a 2nd run (because it created new vregs while
+ // spilling). Refuse to do another pass to keep compiletime in check.
+ if (Again)
+ report_fatal_error("Incomplete scavenging after 2nd pass");
+ }
+ }
+
+ MRI.clearVirtRegs();
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+}
+
+namespace {
+/// This class runs register scavenging independently of the
+/// PrologEpilogInserter; it is used for testing.
+class ScavengerTest : public MachineFunctionPass {
+public:
+ static char ID;
+ ScavengerTest() : MachineFunctionPass(ID) {}
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetFrameLowering &TFL = *STI.getFrameLowering();
+
+ RegScavenger RS;
+ // Let's hope that calling those outside of PrologEpilogInserter works
+ // well enough to initialize the scavenger with some emergency spill
+ // slots for the target.
+ BitVector SavedRegs;
+ TFL.determineCalleeSaves(MF, SavedRegs, &RS);
+ TFL.processFunctionBeforeFrameFinalized(MF, &RS);
+
+ // Let's scavenge the current function
+ scavengeFrameVirtualRegs(MF, RS);
+ return true;
+ }
+};
+char ScavengerTest::ID;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(ScavengerTest, "scavenger-test",
+ "Scavenge virtual registers inside basic blocks", false, false)
diff --git a/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
index 66f1966..30757f0 100644
--- a/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
@@ -1,4 +1,4 @@
-//===- RegisterUsageInfo.cpp - Register Usage Informartion Storage --------===//
+//===- RegisterUsageInfo.cpp - Register Usage Information Storage ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,11 +12,22 @@
///
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/RegisterUsageInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -27,7 +38,7 @@ static cl::opt<bool> DumpRegUsage(
cl::desc("print register usage details collected for analysis."));
INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info",
- "Register Usage Informartion Stroage", false, true)
+ "Register Usage Information Storage", false, true)
char PhysicalRegisterUsageInfo::ID = 0;
@@ -63,7 +74,7 @@ PhysicalRegisterUsageInfo::getRegUsageInfo(const Function *FP) {
void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const {
const TargetRegisterInfo *TRI;
- typedef std::pair<const Function *, std::vector<uint32_t>> FuncPtrRegMaskPair;
+ using FuncPtrRegMaskPair = std::pair<const Function *, std::vector<uint32_t>>;
SmallVector<const FuncPtrRegMaskPair *, 64> FPRMPairVector;
diff --git a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
index 2f7ee8b..bd5ecbd 100644
--- a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
+++ b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -32,10 +32,10 @@
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
@@ -112,11 +112,11 @@ char RenameIndependentSubregs::ID;
char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID;
-INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, "rename-independent-subregs",
+INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, DEBUG_TYPE,
"Rename Independent Subregisters", false, false)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(RenameIndependentSubregs, "rename-independent-subregs",
+INITIALIZE_PASS_END(RenameIndependentSubregs, DEBUG_TYPE,
"Rename Independent Subregisters", false, false)
bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const {
@@ -212,7 +212,7 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes,
const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
const SmallVectorImpl<LiveInterval*> &Intervals) const {
const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
- unsigned Reg = Intervals[0]->reg;;
+ unsigned Reg = Intervals[0]->reg;
for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
E = MRI->reg_nodbg_end(); I != E; ) {
MachineOperand &MO = *I++;
@@ -243,6 +243,15 @@ void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes,
unsigned VReg = Intervals[ID]->reg;
MO.setReg(VReg);
+
+ if (MO.isTied() && Reg != VReg) {
+ /// Undef use operands are not tracked in the equivalence class, but they
+ /// need to be updated if they are tied.
+ MO.getParent()->substituteRegister(Reg, VReg, 0, TRI);
+
+ // substituteRegister breaks the iterator, so restart.
+ I = MRI->reg_nodbg_begin(Reg);
+ }
}
// TODO: We could attempt to recompute new register classes while visiting
// the operands: Some of the split register may be fine with less constraint
diff --git a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
index 4519641..01b3db4 100644
--- a/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
+++ b/contrib/llvm/lib/CodeGen/ResetMachineFunctionPass.cpp
@@ -14,9 +14,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -30,17 +30,23 @@ namespace {
/// Tells whether or not this pass should emit a fallback
/// diagnostic when it resets a function.
bool EmitFallbackDiag;
+ /// Whether we should abort immediately instead of resetting the function.
+ bool AbortOnFailedISel;
public:
static char ID; // Pass identification, replacement for typeid
- ResetMachineFunction(bool EmitFallbackDiag = false)
- : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag) {}
+ ResetMachineFunction(bool EmitFallbackDiag = false,
+ bool AbortOnFailedISel = false)
+ : MachineFunctionPass(ID), EmitFallbackDiag(EmitFallbackDiag),
+ AbortOnFailedISel(AbortOnFailedISel) {}
StringRef getPassName() const override { return "ResetMachineFunction"; }
bool runOnMachineFunction(MachineFunction &MF) override {
if (MF.getProperties().hasProperty(
MachineFunctionProperties::Property::FailedISel)) {
+ if (AbortOnFailedISel)
+ report_fatal_error("Instruction selection failed");
DEBUG(dbgs() << "Reseting: " << MF.getName() << '\n');
++NumFunctionsReset;
MF.reset();
@@ -62,6 +68,7 @@ INITIALIZE_PASS(ResetMachineFunction, DEBUG_TYPE,
"reset machine function if ISel failed", false, false)
MachineFunctionPass *
-llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false) {
- return new ResetMachineFunction(EmitFallbackDiag);
+llvm::createResetMachineFunctionPass(bool EmitFallbackDiag = false,
+ bool AbortOnFailedISel = false) {
+ return new ResetMachineFunction(EmitFallbackDiag, AbortOnFailedISel);
}
diff --git a/contrib/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm/lib/CodeGen/SafeStack.cpp
index 2b82df2..8584a9b 100644
--- a/contrib/llvm/lib/CodeGen/SafeStack.cpp
+++ b/contrib/llvm/lib/CodeGen/SafeStack.cpp
@@ -19,10 +19,12 @@
#include "SafeStackLayout.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
@@ -50,7 +52,7 @@
using namespace llvm;
using namespace llvm::safestack;
-#define DEBUG_TYPE "safestack"
+#define DEBUG_TYPE "safe-stack"
namespace llvm {
@@ -92,11 +94,11 @@ public:
/// determined statically), and the unsafe stack, which contains all
/// local variables that are accessed in ways that we can't prove to
/// be safe.
-class SafeStack : public FunctionPass {
- const TargetMachine *TM;
- const TargetLoweringBase *TL;
- const DataLayout *DL;
- ScalarEvolution *SE;
+class SafeStack {
+ Function &F;
+ const TargetLoweringBase &TL;
+ const DataLayout &DL;
+ ScalarEvolution &SE;
Type *StackPtrTy;
Type *IntPtrTy;
@@ -171,33 +173,21 @@ class SafeStack : public FunctionPass {
uint64_t AllocaSize);
public:
- static char ID; // Pass identification, replacement for typeid.
- SafeStack(const TargetMachine *TM)
- : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) {
- initializeSafeStackPass(*PassRegistry::getPassRegistry());
- }
- SafeStack() : SafeStack(nullptr) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-
- bool doInitialization(Module &M) override {
- DL = &M.getDataLayout();
-
- StackPtrTy = Type::getInt8PtrTy(M.getContext());
- IntPtrTy = DL->getIntPtrType(M.getContext());
- Int32Ty = Type::getInt32Ty(M.getContext());
- Int8Ty = Type::getInt8Ty(M.getContext());
-
- return false;
- }
-
- bool runOnFunction(Function &F) override;
-}; // class SafeStack
+ SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL,
+ ScalarEvolution &SE)
+ : F(F), TL(TL), DL(DL), SE(SE),
+ StackPtrTy(Type::getInt8PtrTy(F.getContext())),
+ IntPtrTy(DL.getIntPtrType(F.getContext())),
+ Int32Ty(Type::getInt32Ty(F.getContext())),
+ Int8Ty(Type::getInt8Ty(F.getContext())) {}
+
+ // Run the transformation on the associated function.
+ // Returns whether the function was changed.
+ bool run();
+};
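With the pass boilerplate stripped, the transformation is driven by constructing the class directly; the legacy wrapper added near the bottom of this file does essentially the following (sketch):

    static bool runSafeStackOn(Function &F, const TargetLoweringBase &TL,
                               ScalarEvolution &SE) {
      const DataLayout &DL = F.getParent()->getDataLayout();
      return SafeStack(F, TL, DL, SE).run();
    }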
uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
- uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType());
+ uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
if (AI->isArrayAllocation()) {
auto C = dyn_cast<ConstantInt>(AI->getArraySize());
if (!C)
@@ -209,11 +199,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
const Value *AllocaPtr, uint64_t AllocaSize) {
- AllocaOffsetRewriter Rewriter(*SE, AllocaPtr);
- const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
+ AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
+ const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
- uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType());
- ConstantRange AccessStartRange = SE->getUnsignedRange(Expr);
+ uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType());
+ ConstantRange AccessStartRange = SE.getUnsignedRange(Expr);
ConstantRange SizeRange =
ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
ConstantRange AccessRange = AccessStartRange.add(SizeRange);
@@ -226,8 +216,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
<< *AllocaPtr << "\n"
<< " Access " << *Addr << "\n"
<< " SCEV " << *Expr
- << " U: " << SE->getUnsignedRange(Expr)
- << ", S: " << SE->getSignedRange(Expr) << "\n"
+ << " U: " << SE.getUnsignedRange(Expr)
+ << ", S: " << SE.getSignedRange(Expr) << "\n"
<< " Range " << AccessRange << "\n"
<< " AllocaRange " << AllocaRange << "\n"
<< " " << (Safe ? "safe" : "unsafe") << "\n");
@@ -266,7 +256,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
switch (I->getOpcode()) {
case Instruction::Load: {
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr,
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr,
AllocaSize))
return false;
break;
@@ -282,7 +272,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
return false;
}
- if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()),
+ if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()),
AllocaPtr, AllocaSize))
return false;
break;
@@ -343,7 +333,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
}
Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
- Value *StackGuardVar = TL->getIRStackGuard(IRB);
+ Value *StackGuardVar = TL.getIRStackGuard(IRB);
if (!StackGuardVar)
StackGuardVar =
F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy);
@@ -390,7 +380,7 @@ void SafeStack::findInsts(Function &F,
if (!Arg.hasByValAttr())
continue;
uint64_t Size =
- DL->getTypeStoreSize(Arg.getType()->getPointerElementType());
+ DL.getTypeStoreSize(Arg.getType()->getPointerElementType());
if (IsSafeStackAlloca(&Arg, Size))
continue;
@@ -451,7 +441,7 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
IRBuilder<> IRBFail(CheckTerm);
// FIXME: respect -fsanitize-trap / -ftrap-function here?
Constant *StackChkFail = F.getParent()->getOrInsertFunction(
- "__stack_chk_fail", IRB.getVoidTy(), nullptr);
+ "__stack_chk_fail", IRB.getVoidTy());
IRBFail.CreateCall(StackChkFail, {});
}
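The dropped trailing nullptr here tracks an API change: Module::getOrInsertFunction no longer takes a C-style end-of-arguments sentinel; the parameter types are now plain variadic arguments. A minimal sketch of the new form (declareStackChkFail is an invented helper name):

    #include "llvm/IR/Module.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    static Constant *declareStackChkFail(Module &M) {
      // Old form ended with a nullptr sentinel:
      //   M.getOrInsertFunction("__stack_chk_fail", VoidTy, nullptr);
      Type *VoidTy = Type::getVoidTy(M.getContext());
      return M.getOrInsertFunction("__stack_chk_fail", VoidTy);
    }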
@@ -476,19 +466,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
if (StackGuardSlot) {
Type *Ty = StackGuardSlot->getAllocatedType();
unsigned Align =
- std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
+ std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
Align, SSC.getFullLiveRange());
}
for (Argument *Arg : ByValArguments) {
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
// Ensure the object is properly aligned.
- unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+ unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty),
Arg->getParamAlignment());
SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
}
@@ -501,7 +491,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
// Ensure the object is properly aligned.
unsigned Align =
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment());
SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
}
@@ -539,7 +529,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
unsigned Offset = SSL.getObjectOffset(Arg);
Type *Ty = Arg->getType()->getPointerElementType();
- uint64_t Size = DL->getTypeStoreSize(Ty);
+ uint64_t Size = DL.getTypeStoreSize(Ty);
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
@@ -550,7 +540,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
// Replace alloc with the new location.
replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB,
- /*Deref=*/true, -Offset);
+ /*Deref=*/false, -Offset);
Arg->replaceAllUsesWith(NewArg);
IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment());
@@ -565,7 +555,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
if (Size == 0)
Size = 1; // Don't create zero-sized stack objects.
- replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -Offset);
+ replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/false, -Offset);
replaceDbgValueForAlloca(AI, BasePointer, DIB, -Offset);
// Replace uses of the alloca with the new location.
@@ -630,7 +620,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false);
Type *Ty = AI->getAllocatedType();
- uint64_t TySize = DL->getTypeAllocSize(Ty);
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize));
Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy);
@@ -638,7 +628,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
// Align the SP value to satisfy the AllocaInst, type and stack alignments.
unsigned Align = std::max(
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
(unsigned)StackAlignment);
assert(isPowerOf2_32(Align));
@@ -655,7 +645,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
if (AI->hasName() && isa<Instruction>(NewAI))
NewAI->takeName(AI);
- replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true);
+ replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/false);
AI->replaceAllUsesWith(NewAI);
AI->eraseFromParent();
}
@@ -685,25 +675,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
}
}
-bool SafeStack::runOnFunction(Function &F) {
- DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
-
- if (!F.hasFnAttribute(Attribute::SafeStack)) {
- DEBUG(dbgs() << "[SafeStack] safestack is not requested"
- " for this function\n");
- return false;
- }
-
- if (F.isDeclaration()) {
- DEBUG(dbgs() << "[SafeStack] function definition"
- " is not available\n");
- return false;
- }
-
- if (!TM)
- report_fatal_error("Target machine is required");
- TL = TM->getSubtargetImpl(F)->getTargetLowering();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+bool SafeStack::run() {
+ assert(F.hasFnAttribute(Attribute::SafeStack) &&
+ "Can't run SafeStack on a function without the attribute");
+ assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");
++NumFunctions;
@@ -736,7 +711,7 @@ bool SafeStack::runOnFunction(Function &F) {
++NumUnsafeStackRestorePointsFunctions;
IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
- UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB);
+ UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);
// Load the current stack pointer (we'll also use it as a base pointer).
// FIXME: use a dedicated register for it ?
@@ -788,14 +763,67 @@ bool SafeStack::runOnFunction(Function &F) {
return true;
}
+class SafeStackLegacyPass : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ SafeStackLegacyPass() : FunctionPass(ID), TM(nullptr) {
+ initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+ if (!F.hasFnAttribute(Attribute::SafeStack)) {
+ DEBUG(dbgs() << "[SafeStack] safestack is not requested"
+ " for this function\n");
+ return false;
+ }
+
+ if (F.isDeclaration()) {
+ DEBUG(dbgs() << "[SafeStack] function definition"
+ " is not available\n");
+ return false;
+ }
+
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+ if (!TL)
+ report_fatal_error("TargetLowering instance is required");
+
+ auto *DL = &F.getParent()->getDataLayout();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ // Compute DT and LI only for functions that have the attribute.
+ // This is only useful because the legacy pass manager doesn't let us
+ // compute analyses lazily.
+ // In the backend pipeline, nothing preserves DT before SafeStack, so we
+ // would otherwise always compute it wastefully, even if there is no
+ // function with the safestack attribute.
+ DominatorTree DT(F);
+ LoopInfo LI(DT);
+
+ ScalarEvolution SE(F, TLI, ACT, DT, LI);
+
+ return SafeStack(F, *TL, *DL, SE).run();
+ }
+};
+
} // anonymous namespace
-char SafeStack::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
- "Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
- "Safe Stack instrumentation pass", false, false)
+char SafeStackLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, DEBUG_TYPE,
+ "Safe Stack instrumentation pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(SafeStackLegacyPass, DEBUG_TYPE,
+ "Safe Stack instrumentation pass", false, false)
-FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
- return new SafeStack(TM);
-}
+FunctionPass *llvm::createSafeStackPass() { return new SafeStackLegacyPass(); }
diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
index 7fbeadd..21f2fa4 100644
--- a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
+++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
@@ -20,9 +20,10 @@ using namespace llvm::safestack;
#define DEBUG_TYPE "safestackcoloring"
+// Disabled by default due to PR32143.
static cl::opt<bool> ClColoring("safe-stack-coloring",
cl::desc("enable safe stack coloring"),
- cl::Hidden, cl::init(true));
+ cl::Hidden, cl::init(false));
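
Testing note: with the default flipped to off, coloring now has to be requested explicitly, for example via -safe-stack-coloring=true on opt (or -mllvm -safe-stack-coloring=true through clang). That relies only on the standard cl::opt plumbing; the flag name itself is unchanged.
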
const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) {
const auto IT = AllocaNumbering.find(AI);
@@ -236,6 +237,7 @@ void StackColoring::calculateLiveIntervals() {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void StackColoring::dumpAllocas() {
dbgs() << "Allocas:\n";
for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
@@ -262,6 +264,7 @@ LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {
dbgs() << " " << AllocaNo << ": " << Range << "\n";
}
}
+#endif
void StackColoring::run() {
DEBUG(dumpAllocas());
diff --git a/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 0000000..07b43a8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,656 @@
+//=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===//
+//=== intrinsics ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics, when unsupported by the target,
+// with a chain of basic blocks that deal with the elements one by one if the
+// appropriate mask bit is set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrin : public FunctionPass {
+ const TargetTransformInfo *TTI;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
+ initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "Scalarize Masked Memory Intrinsics";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+private:
+ bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+ bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+};
+} // namespace
+
+char ScalarizeMaskedMemIntrin::ID = 0;
+INITIALIZE_PASS(ScalarizeMaskedMemIntrin, DEBUG_TYPE,
+ "Scalarize unsupported masked memory intrinsics", false, false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
+ return new ScalarizeMaskedMemIntrin();
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32> @llvm.masked.load(<16 x i32>* %addr, i32 align,
+//                              <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+// cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+// else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+// else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ Type *EltTy = CI->getType()->getVectorElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult =
+ Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_load = icmp eq i1 %mask_1, true
+ // br i1 %to_load, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
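
The per-element loop above (and its twins in the store, gather, and scatter translations below) is built from one small CFG idiom: split the block before the call, emit the guarded code, split again, then rewrite the leftover unconditional branch into a conditional one. A self-contained sketch of just that idiom, using the same IRBuilder/BasicBlock API as this file; the function name and the callback are invented for illustration:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include <functional>

using namespace llvm;

// Emit code that only runs when Cond is true, immediately before InsertPt.
// Returns the "else" block, which becomes the IfBlock of the next element.
static BasicBlock *emitGuarded(BasicBlock *IfBlock, Instruction *InsertPt,
                               Value *Cond,
                               const std::function<void(IRBuilder<> &)> &Emit) {
  // splitBasicBlock moves [InsertPt, end) into a new block and leaves
  // IfBlock terminated by an unconditional branch to it.
  BasicBlock *CondBlock =
      IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond");
  IRBuilder<> Builder(InsertPt);
  Emit(Builder); // the guarded code lands in CondBlock, before InsertPt
  // Split again: everything from InsertPt onwards becomes the "else" block,
  // so CondBlock now holds exactly the guarded code.
  BasicBlock *NewIfBlock =
      CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
  // Replace IfBlock's unconditional terminator with the real branch.
  Instruction *OldBr = IfBlock->getTerminator();
  BranchInst::Create(CondBlock, NewIfBlock, Cond, OldBr);
  OldBr->eraseFromParent();
  return NewIfBlock;
}
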
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+ assert(VecType && "Unexpected data type in masked store intrinsic");
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask =
+ isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+ // Bitcast %addr from i8* to EltTy*.
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_store = icmp eq i1 %mask_1, true
+ // br i1 %to_store, label %cond.store, label %else
+ //
+ Value *Predicate =
+ Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %Ptrs, i32 4,
+//                                       <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToLoad0 = icmp eq i1 %Mask0, true
+// br i1 %ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// %Load0 = load i32, i32* %Ptr0, align 4
+// %Res0 = insertelement <16 x i32> undef, i32 %Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32> [ %Res0, %cond.load ], [ undef, %0 ]
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToLoad1 = icmp eq i1 %Mask1, true
+// br i1 %ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// %Load1 = load i32, i32* %Ptr1, align 4
+// %Res1 = insertelement <16 x i32> %res.phi.else, i32 %Load1, i32 1
+// br label %else2
+// . . .
+// %Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void scalarizeMaskedGather(CallInst *CI) {
+ Value *Ptrs = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+ unsigned VectorWidth = VecType->getNumElements();
+
+ // Short-cut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(
+ VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 1
+ // %ToLoad1 = icmp eq i1 %Mask1, true
+ // br i1 %ToLoad1, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToLoad" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+ "Res" + Twine(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*> %Ptrs, i32 4,
+//                                  <16 x i1> %Mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set.
+//
+// %Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// %Mask0 = extractelement <16 x i1> %Mask, i32 0
+// %ToStore0 = icmp eq i1 %Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// %Elt0 = extractelement <16 x i32> %Src, i32 0
+// %Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* %Ptr0, align 4
+// br label %else
+//
+// else:
+// %Mask1 = extractelement <16 x i1> %Mask, i32 1
+// %ToStore1 = icmp eq i1 %Mask1, true
+// br i1 %ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// %Elt1 = extractelement <16 x i32> %Src, i32 1
+// %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 %Elt1, i32* %Ptr1, align 4
+// br label %else2
+// . . .
+static void scalarizeMaskedScatter(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptrs = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ assert(isa<VectorType>(Src->getType()) &&
+ "Unexpected data type in masked scatter intrinsic");
+ assert(isa<VectorType>(Ptrs->getType()) &&
+ isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+ "Vector of pointers is expected in masked scatter intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+ // Short-cut if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+ // %ToStore = icmp eq i1 %Mask1, true
+ // br i1 %ToStore, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToStore" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %Elt1 = extractelement <16 x i32> %Src, i32 1
+ // %Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+ // store i32 %Elt1, i32* %Ptr1
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ bool EverMadeChange = false;
+
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ bool MadeChange = true;
+ while (MadeChange) {
+ MadeChange = false;
+ for (Function::iterator I = F.begin(); I != F.end();) {
+ BasicBlock *BB = &*I++;
+ bool ModifiedDTOnIteration = false;
+ MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
+
+ // Restart BB iteration if the dominator tree of the Function was changed
+ if (ModifiedDTOnIteration)
+ break;
+ }
+
+ EverMadeChange |= MadeChange;
+ }
+
+ return EverMadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
+ bool MadeChange = false;
+
+ BasicBlock::iterator CurInstIterator = BB.begin();
+ while (CurInstIterator != BB.end()) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*CurInstIterator++))
+ MadeChange |= optimizeCallInst(CI, ModifiedDT);
+ if (ModifiedDT)
+ return true;
+ }
+
+ return MadeChange;
+}
+
+bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
+ bool &ModifiedDT) {
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::masked_load: {
+ // Scalarize unsupported vector masked load
+ if (!TTI->isLegalMaskedLoad(CI->getType())) {
+ scalarizeMaskedLoad(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_store: {
+ if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedStore(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_gather: {
+ if (!TTI->isLegalMaskedGather(CI->getType())) {
+ scalarizeMaskedGather(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_scatter: {
+ if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedScatter(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ }
+ }
+
+ return false;
+}
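
Whether any of this fires is entirely the target's call, via the four TargetTransformInfo legality hooks queried above. A hedged sketch of what the target side looks like; the class and the particular legality rule are invented for illustration, only the hook names and signatures come from TargetTransformInfo:

#include "llvm/IR/Type.h"

// Hypothetical target policy: masked load/store are only legal for vectors
// of 32-bit elements, so every other masked.load/masked.store in a function
// gets scalarized by the pass above. Real targets (e.g. X86TTIImpl) apply
// subtarget feature checks here instead.
struct MyTargetMaskedPolicy {
  bool isLegalMaskedLoad(llvm::Type *DataTy) const {
    llvm::Type *ScalarTy = DataTy->getScalarType();
    return ScalarTy->isIntegerTy(32) || ScalarTy->isFloatTy();
  }
  bool isLegalMaskedStore(llvm::Type *DataTy) const {
    return isLegalMaskedLoad(DataTy);
  }
  // isLegalMaskedGather / isLegalMaskedScatter would follow the same shape.
};
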
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
index 427d952..5e95f76 100644
--- a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -1,4 +1,4 @@
-//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,22 +7,32 @@
//
//===----------------------------------------------------------------------===//
//
-// This implements the ScheduleDAG class, which is a base class used by
-// scheduling implementation classes.
+/// \file Implements the ScheduleDAG class, which is a base class used by
+/// scheduling implementation classes.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <climits>
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "pre-RA-sched"
@@ -33,58 +43,87 @@ static cl::opt<bool> StressSchedOpt(
cl::desc("Stress test instruction scheduling"));
#endif
-void SchedulingPriorityQueue::anchor() { }
+void SchedulingPriorityQueue::anchor() {}
ScheduleDAG::ScheduleDAG(MachineFunction &mf)
: TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),
TRI(mf.getSubtarget().getRegisterInfo()), MF(mf),
- MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
+ MRI(mf.getRegInfo()) {
#ifndef NDEBUG
StressSched = StressSchedOpt;
#endif
}
-ScheduleDAG::~ScheduleDAG() {}
+ScheduleDAG::~ScheduleDAG() = default;
-/// Clear the DAG state (e.g. between scheduling regions).
void ScheduleDAG::clearDAG() {
SUnits.clear();
EntrySU = SUnit();
ExitSU = SUnit();
}
-/// getInstrDesc helper to handle SDNodes.
const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
if (!Node || !Node->isMachineOpcode()) return nullptr;
return &TII->get(Node->getMachineOpcode());
}
-/// addPred - This adds the specified edge as a pred of the current node if
-/// not already. It also adds the current node as a successor of the
-/// specified node.
+LLVM_DUMP_METHOD
+raw_ostream &SDep::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
+ switch (getKind()) {
+ case Data: OS << "Data"; break;
+ case Anti: OS << "Anti"; break;
+ case Output: OS << "Out "; break;
+ case Order: OS << "Ord "; break;
+ }
+
+ switch (getKind()) {
+ case Data:
+ OS << " Latency=" << getLatency();
+ if (TRI && isAssignedRegDep())
+ OS << " Reg=" << PrintReg(getReg(), TRI);
+ break;
+ case Anti:
+ case Output:
+ OS << " Latency=" << getLatency();
+ break;
+ case Order:
+ OS << " Latency=" << getLatency();
+ switch(Contents.OrdKind) {
+ case Barrier: OS << " Barrier"; break;
+ case MayAliasMem:
+ case MustAliasMem: OS << " Memory"; break;
+ case Artificial: OS << " Artificial"; break;
+ case Weak: OS << " Weak"; break;
+ case Cluster: OS << " Cluster"; break;
+ }
+ break;
+ }
+
+ return OS;
+}
+
bool SUnit::addPred(const SDep &D, bool Required) {
// If this node already has this dependence, don't add a redundant one.
- for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I) {
+ for (SDep &PredDep : Preds) {
// Zero-latency weak edges may be added purely for heuristic ordering. Don't
// add them if another kind of edge already exists.
- if (!Required && I->getSUnit() == D.getSUnit())
+ if (!Required && PredDep.getSUnit() == D.getSUnit())
return false;
- if (I->overlaps(D)) {
- // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
- if (I->getLatency() < D.getLatency()) {
- SUnit *PredSU = I->getSUnit();
+ if (PredDep.overlaps(D)) {
+ // Extend the latency if needed. Equivalent to
+ // removePred(PredDep) + addPred(D).
+ if (PredDep.getLatency() < D.getLatency()) {
+ SUnit *PredSU = PredDep.getSUnit();
// Find the corresponding successor in N.
- SDep ForwardD = *I;
+ SDep ForwardD = PredDep;
ForwardD.setSUnit(this);
- for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(),
- EE = PredSU->Succs.end(); II != EE; ++II) {
- if (*II == ForwardD) {
- II->setLatency(D.getLatency());
+ for (SDep &SuccDep : PredSU->Succs) {
+ if (SuccDep == ForwardD) {
+ SuccDep.setLatency(D.getLatency());
break;
}
}
- I->setLatency(D.getLatency());
+ PredDep.setLatency(D.getLatency());
}
return false;
}
@@ -95,8 +134,10 @@ bool SUnit::addPred(const SDep &D, bool Required) {
SUnit *N = D.getSUnit();
// Update the bookkeeping.
if (D.getKind() == SDep::Data) {
- assert(NumPreds < UINT_MAX && "NumPreds will overflow!");
- assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!");
+ assert(NumPreds < std::numeric_limits<unsigned>::max() &&
+ "NumPreds will overflow!");
+ assert(N->NumSuccs < std::numeric_limits<unsigned>::max() &&
+ "NumSuccs will overflow!");
++NumPreds;
++N->NumSuccs;
}
@@ -105,7 +146,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {
++WeakPredsLeft;
}
else {
- assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!");
+ assert(NumPredsLeft < std::numeric_limits<unsigned>::max() &&
+ "NumPredsLeft will overflow!");
++NumPredsLeft;
}
}
@@ -114,7 +156,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {
++N->WeakSuccsLeft;
}
else {
- assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+ assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() &&
+ "NumSuccsLeft will overflow!");
++N->NumSuccsLeft;
}
}
@@ -127,51 +170,46 @@ bool SUnit::addPred(const SDep &D, bool Required) {
return true;
}
-/// removePred - This removes the specified edge as a pred of the current
-/// node if it exists. It also removes the current node as a successor of
-/// the specified node.
void SUnit::removePred(const SDep &D) {
// Find the matching predecessor.
- for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I)
- if (*I == D) {
- // Find the corresponding successor in N.
- SDep P = D;
- P.setSUnit(this);
- SUnit *N = D.getSUnit();
- SmallVectorImpl<SDep>::iterator Succ = find(N->Succs, P);
- assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
- N->Succs.erase(Succ);
- Preds.erase(I);
- // Update the bookkeeping.
- if (P.getKind() == SDep::Data) {
- assert(NumPreds > 0 && "NumPreds will underflow!");
- assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
- --NumPreds;
- --N->NumSuccs;
- }
- if (!N->isScheduled) {
- if (D.isWeak())
- --WeakPredsLeft;
- else {
- assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
- --NumPredsLeft;
- }
- }
- if (!isScheduled) {
- if (D.isWeak())
- --N->WeakSuccsLeft;
- else {
- assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
- --N->NumSuccsLeft;
- }
- }
- if (P.getLatency() != 0) {
- this->setDepthDirty();
- N->setHeightDirty();
- }
- return;
+ SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D);
+ if (I == Preds.end())
+ return;
+ // Find the corresponding successor in N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P);
+ assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
+ N->Succs.erase(Succ);
+ Preds.erase(I);
+ // Update the bookkeeping.
+ if (P.getKind() == SDep::Data) {
+ assert(NumPreds > 0 && "NumPreds will underflow!");
+ assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
+ --NumPreds;
+ --N->NumSuccs;
+ }
+ if (!N->isScheduled) {
+ if (D.isWeak())
+ --WeakPredsLeft;
+ else {
+ assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
+ --NumPredsLeft;
}
+ }
+ if (!isScheduled) {
+ if (D.isWeak())
+ --N->WeakSuccsLeft;
+ else {
+ assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
+ --N->NumSuccsLeft;
+ }
+ }
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
}
void SUnit::setDepthDirty() {
@@ -181,9 +219,8 @@ void SUnit::setDepthDirty() {
do {
SUnit *SU = WorkList.pop_back_val();
SU->isDepthCurrent = false;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(),
- E = SU->Succs.end(); I != E; ++I) {
- SUnit *SuccSU = I->getSUnit();
+ for (SDep &SuccDep : SU->Succs) {
+ SUnit *SuccSU = SuccDep.getSUnit();
if (SuccSU->isDepthCurrent)
WorkList.push_back(SuccSU);
}
@@ -197,18 +234,14 @@ void SUnit::setHeightDirty() {
do {
SUnit *SU = WorkList.pop_back_val();
SU->isHeightCurrent = false;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(),
- E = SU->Preds.end(); I != E; ++I) {
- SUnit *PredSU = I->getSUnit();
+ for (SDep &PredDep : SU->Preds) {
+ SUnit *PredSU = PredDep.getSUnit();
if (PredSU->isHeightCurrent)
WorkList.push_back(PredSU);
}
} while (!WorkList.empty());
}
-/// setDepthToAtLeast - Update this node's successors to reflect the
-/// fact that this node's depth just increased.
-///
void SUnit::setDepthToAtLeast(unsigned NewDepth) {
if (NewDepth <= getDepth())
return;
@@ -217,9 +250,6 @@ void SUnit::setDepthToAtLeast(unsigned NewDepth) {
isDepthCurrent = true;
}
-/// setHeightToAtLeast - Update this node's predecessors to reflect the
-/// fact that this node's height just increased.
-///
void SUnit::setHeightToAtLeast(unsigned NewHeight) {
if (NewHeight <= getHeight())
return;
@@ -228,8 +258,7 @@ void SUnit::setHeightToAtLeast(unsigned NewHeight) {
isHeightCurrent = true;
}
-/// ComputeDepth - Calculate the maximal path from the node to the exit.
-///
+/// Calculates the maximal path from the node to the exit.
void SUnit::ComputeDepth() {
SmallVector<SUnit*, 8> WorkList;
WorkList.push_back(this);
@@ -238,12 +267,11 @@ void SUnit::ComputeDepth() {
bool Done = true;
unsigned MaxPredDepth = 0;
- for (SUnit::const_pred_iterator I = Cur->Preds.begin(),
- E = Cur->Preds.end(); I != E; ++I) {
- SUnit *PredSU = I->getSUnit();
+ for (const SDep &PredDep : Cur->Preds) {
+ SUnit *PredSU = PredDep.getSUnit();
if (PredSU->isDepthCurrent)
MaxPredDepth = std::max(MaxPredDepth,
- PredSU->Depth + I->getLatency());
+ PredSU->Depth + PredDep.getLatency());
else {
Done = false;
WorkList.push_back(PredSU);
@@ -261,8 +289,7 @@ void SUnit::ComputeDepth() {
} while (!WorkList.empty());
}
-/// ComputeHeight - Calculate the maximal path from the node to the entry.
-///
+/// Calculates the maximal path from the node to the entry.
void SUnit::ComputeHeight() {
SmallVector<SUnit*, 8> WorkList;
WorkList.push_back(this);
@@ -271,12 +298,11 @@ void SUnit::ComputeHeight() {
bool Done = true;
unsigned MaxSuccHeight = 0;
- for (SUnit::const_succ_iterator I = Cur->Succs.begin(),
- E = Cur->Succs.end(); I != E; ++I) {
- SUnit *SuccSU = I->getSUnit();
+ for (const SDep &SuccDep : Cur->Succs) {
+ SUnit *SuccSU = SuccDep.getSUnit();
if (SuccSU->isHeightCurrent)
MaxSuccHeight = std::max(MaxSuccHeight,
- SuccSU->Height + I->getLatency());
+ SuccSU->Height + SuccDep.getLatency());
else {
Done = false;
WorkList.push_back(SuccSU);
@@ -310,24 +336,31 @@ void SUnit::biasCriticalPath() {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const {
- if (this == &DAG->ExitSU)
- OS << "ExitSU";
- else if (this == &DAG->EntrySU)
+LLVM_DUMP_METHOD
+raw_ostream &SUnit::print(raw_ostream &OS,
+ const SUnit *Entry, const SUnit *Exit) const {
+ if (this == Entry)
OS << "EntrySU";
+ else if (this == Exit)
+ OS << "ExitSU";
else
OS << "SU(" << NodeNum << ")";
+ return OS;
+}
+
+LLVM_DUMP_METHOD
+raw_ostream &SUnit::print(raw_ostream &OS, const ScheduleDAG *G) const {
+ return print(OS, &G->EntrySU, &G->ExitSU);
}
-/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or
-/// a group of nodes flagged together.
+LLVM_DUMP_METHOD
void SUnit::dump(const ScheduleDAG *G) const {
print(dbgs(), G);
dbgs() << ": ";
G->dumpNode(this);
}
-void SUnit::dumpAll(const ScheduleDAG *G) const {
+LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const {
dump(G);
dbgs() << " # preds left : " << NumPredsLeft << "\n";
@@ -343,89 +376,62 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
if (Preds.size() != 0) {
dbgs() << " Predecessors:\n";
- for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end();
- I != E; ++I) {
- dbgs() << " ";
- switch (I->getKind()) {
- case SDep::Data: dbgs() << "data "; break;
- case SDep::Anti: dbgs() << "anti "; break;
- case SDep::Output: dbgs() << "out "; break;
- case SDep::Order: dbgs() << "ord "; break;
- }
- I->getSUnit()->print(dbgs(), G);
- if (I->isArtificial())
- dbgs() << " *";
- dbgs() << ": Latency=" << I->getLatency();
- if (I->isAssignedRegDep())
- dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
- dbgs() << "\n";
+ for (const SDep &Dep : Preds) {
+ dbgs() << " ";
+ Dep.getSUnit()->print(dbgs(), G); dbgs() << ": ";
+ Dep.print(dbgs(), G->TRI); dbgs() << '\n';
}
}
if (Succs.size() != 0) {
dbgs() << " Successors:\n";
- for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end();
- I != E; ++I) {
- dbgs() << " ";
- switch (I->getKind()) {
- case SDep::Data: dbgs() << "data "; break;
- case SDep::Anti: dbgs() << "anti "; break;
- case SDep::Output: dbgs() << "out "; break;
- case SDep::Order: dbgs() << "ord "; break;
- }
- I->getSUnit()->print(dbgs(), G);
- if (I->isArtificial())
- dbgs() << " *";
- dbgs() << ": Latency=" << I->getLatency();
- if (I->isAssignedRegDep())
- dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
- dbgs() << "\n";
+ for (const SDep &Dep : Succs) {
+ dbgs() << " ";
+ Dep.getSUnit()->print(dbgs(), G); dbgs() << ": ";
+ Dep.print(dbgs(), G->TRI); dbgs() << '\n';
}
}
}
#endif
#ifndef NDEBUG
-/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
-/// their state is consistent. Return the number of scheduled nodes.
-///
unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
bool AnyNotSched = false;
unsigned DeadNodes = 0;
- for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
- if (!SUnits[i].isScheduled) {
- if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+ for (const SUnit &SUnit : SUnits) {
+ if (!SUnit.isScheduled) {
+ if (SUnit.NumPreds == 0 && SUnit.NumSuccs == 0) {
++DeadNodes;
continue;
}
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+ SUnit.dump(this);
dbgs() << "has not been scheduled!\n";
AnyNotSched = true;
}
- if (SUnits[i].isScheduled &&
- (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) >
- unsigned(INT_MAX)) {
+ if (SUnit.isScheduled &&
+ (isBottomUp ? SUnit.getHeight() : SUnit.getDepth()) >
+ unsigned(std::numeric_limits<int>::max())) {
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+ SUnit.dump(this);
dbgs() << "has an unexpected "
<< (isBottomUp ? "Height" : "Depth") << " value!\n";
AnyNotSched = true;
}
if (isBottomUp) {
- if (SUnits[i].NumSuccsLeft != 0) {
+ if (SUnit.NumSuccsLeft != 0) {
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+ SUnit.dump(this);
dbgs() << "has successors left!\n";
AnyNotSched = true;
}
} else {
- if (SUnits[i].NumPredsLeft != 0) {
+ if (SUnit.NumPredsLeft != 0) {
if (!AnyNotSched)
dbgs() << "*** Scheduling failed! ***\n";
- SUnits[i].dump(this);
+ SUnit.dump(this);
dbgs() << "has predecessors left!\n";
AnyNotSched = true;
}
@@ -436,36 +442,33 @@ unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
}
#endif
-/// InitDAGTopologicalSorting - create the initial topological
-/// ordering from the DAG to be scheduled.
-///
-/// The idea of the algorithm is taken from
-/// "Online algorithms for managing the topological order of
-/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
-/// This is the MNR algorithm, which was first introduced by
-/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
-/// "Maintaining a topological order under edge insertions".
-///
-/// Short description of the algorithm:
-///
-/// Topological ordering, ord, of a DAG maps each node to a topological
-/// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
-///
-/// This means that if there is a path from the node X to the node Z,
-/// then ord(X) < ord(Z).
-///
-/// This property can be used to check for reachability of nodes:
-/// if Z is reachable from X, then an insertion of the edge Z->X would
-/// create a cycle.
-///
-/// The algorithm first computes a topological ordering for the DAG by
-/// initializing the Index2Node and Node2Index arrays and then tries to keep
-/// the ordering up-to-date after edge insertions by reordering the DAG.
-///
-/// On insertion of the edge X->Y, the algorithm first marks by calling DFS
-/// the nodes reachable from Y, and then shifts them using Shift to lie
-/// immediately after X in Index2Node.
void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+ // The idea of the algorithm is taken from
+ // "Online algorithms for managing the topological order of
+ // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
+ // This is the MNR algorithm, which was first introduced by
+ // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+ // "Maintaining a topological order under edge insertions".
+ //
+ // Short description of the algorithm:
+ //
+ // Topological ordering, ord, of a DAG maps each node to a topological
+ // index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+ //
+ // This means that if there is a path from the node X to the node Z,
+ // then ord(X) < ord(Z).
+ //
+ // This property can be used to check for reachability of nodes:
+ // if Z is reachable from X, then an insertion of the edge Z->X would
+ // create a cycle.
+ //
+ // The algorithm first computes a topological ordering for the DAG by
+ // initializing the Index2Node and Node2Index arrays and then tries to keep
+ // the ordering up-to-date after edge insertions by reordering the DAG.
+ //
+ // On insertion of the edge X->Y, the algorithm first marks by calling DFS
+ // the nodes reachable from Y, and then shifts them using Shift to lie
+ // immediately after X in Index2Node.
unsigned DAGSize = SUnits.size();
std::vector<SUnit*> WorkList;
WorkList.reserve(DAGSize);
@@ -476,18 +479,17 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
// Initialize the data structures.
if (ExitSU)
WorkList.push_back(ExitSU);
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- int NodeNum = SU->NodeNum;
- unsigned Degree = SU->Succs.size();
+ for (SUnit &SU : SUnits) {
+ int NodeNum = SU.NodeNum;
+ unsigned Degree = SU.Succs.size();
// Temporarily use the Node2Index array as scratch space for degree counts.
Node2Index[NodeNum] = Degree;
// Is it a node without dependencies?
if (Degree == 0) {
- assert(SU->Succs.empty() && "SUnit should have no successors");
+ assert(SU.Succs.empty() && "SUnit should have no successors");
// Collect leaf nodes.
- WorkList.push_back(SU);
+ WorkList.push_back(&SU);
}
}
@@ -497,9 +499,8 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
WorkList.pop_back();
if (SU->NodeNum < DAGSize)
Allocate(SU->NodeNum, --Id);
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- SUnit *SU = I->getSUnit();
+ for (const SDep &PredDep : SU->Preds) {
+ SUnit *SU = PredDep.getSUnit();
if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum])
// If all dependencies of the node are processed already,
// then the node can be computed now.
@@ -511,19 +512,15 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
#ifndef NDEBUG
// Check correctness of the ordering
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &SUnits[i];
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] &&
+ for (SUnit &SU : SUnits) {
+ for (const SDep &PD : SU.Preds) {
+ assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] &&
"Wrong topological sorting");
}
}
#endif
}
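
The comment block moved into InitDAGTopologicalSorting compresses a fair amount of algorithm; the edge-insertion step (DFS inside the affected index window, then Shift) is easier to see without the SUnit machinery. Below is a compact sketch of the same MNR idea over a plain adjacency list, with invented names, recursion instead of the explicit worklist, and no claim to match the class here detail for detail:

#include <vector>

// Online topological order: the invariant is Ord[X] < Ord[Y] for every
// edge X -> Y, so reachability questions reduce to index comparisons.
struct OnlineTopo {
  std::vector<std::vector<int>> Succs; // Succs[N]: successors of node N
  std::vector<int> Ord;                // node -> topological index
  std::vector<int> Index2Node;         // topological index -> node

  // DFS from N through indices below Upper; returns true if the node at
  // index Upper (the new edge's source) is reachable, i.e. a cycle.
  bool dfs(int N, int Upper, std::vector<bool> &Visited) {
    Visited[N] = true;
    for (int S : Succs[N]) {
      if (Ord[S] == Upper)
        return true;
      if (Ord[S] < Upper && !Visited[S] && dfs(S, Upper, Visited))
        return true;
    }
    return false;
  }

  // Try to add the edge X -> Y; returns false (graph untouched) if the
  // edge would create a cycle.
  bool addEdge(int X, int Y) {
    int Lower = Ord[Y], Upper = Ord[X];
    if (Lower < Upper) { // otherwise the order already proves acyclicity
      std::vector<bool> Visited(Ord.size(), false);
      if (dfs(Y, Upper, Visited))
        return false;
      // Shift: nodes reachable from Y inside the window move to just
      // after X (preserving their relative order); the rest slide down.
      std::vector<int> Moved;
      int NumMoved = 0;
      for (int I = Lower; I <= Upper; ++I) {
        int N = Index2Node[I];
        if (Visited[N]) {
          ++NumMoved;
          Moved.push_back(N);
        } else {
          Ord[N] = I - NumMoved;
          Index2Node[I - NumMoved] = N;
        }
      }
      int I = Upper - NumMoved + 1;
      for (int N : Moved) { // fills the freed slots (Upper-NumMoved, Upper]
        Ord[N] = I;
        Index2Node[I] = N;
        ++I;
      }
    }
    Succs[X].push_back(Y);
    return true;
  }
};
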
-/// AddPred - Updates the topological ordering to accommodate an edge
-/// to be added from SUnit X to SUnit Y.
void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
int UpperBound, LowerBound;
LowerBound = Node2Index[Y->NodeNum];
@@ -540,16 +537,10 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
}
}
-/// RemovePred - Updates the topological ordering to accommodate an
-/// an edge to be removed from the specified node N from the predecessors
-/// of the current node M.
void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
// InitDAGTopologicalSorting();
}
-/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark
-/// all nodes affected by the edge insertion. These nodes will later get new
-/// topological indexes by means of the Shift method.
void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
bool &HasLoop) {
std::vector<const SUnit*> WorkList;
@@ -560,8 +551,9 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
SU = WorkList.back();
WorkList.pop_back();
Visited.set(SU->NodeNum);
- for (int I = SU->Succs.size()-1; I >= 0; --I) {
- unsigned s = SU->Succs[I].getSUnit()->NodeNum;
+ for (const SDep &SuccDep
+ : make_range(SU->Succs.rbegin(), SU->Succs.rend())) {
+ unsigned s = SuccDep.getSUnit()->NodeNum;
// Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
if (s >= Node2Index.size())
continue;
@@ -571,14 +563,93 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
}
// Visit successors if not already and in affected region.
if (!Visited.test(s) && Node2Index[s] < UpperBound) {
- WorkList.push_back(SU->Succs[I].getSUnit());
+ WorkList.push_back(SuccDep.getSUnit());
+ }
+ }
+ } while (!WorkList.empty());
+}
+
+std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU,
+ const SUnit &TargetSU,
+ bool &Success) {
+ std::vector<const SUnit*> WorkList;
+ int LowerBound = Node2Index[StartSU.NodeNum];
+ int UpperBound = Node2Index[TargetSU.NodeNum];
+ bool Found = false;
+ BitVector VisitedBack;
+ std::vector<int> Nodes;
+
+ if (LowerBound > UpperBound) {
+ Success = false;
+ return Nodes;
+ }
+
+ WorkList.reserve(SUnits.size());
+ Visited.reset();
+
+ // Starting from StartSU, visit all successors up
+ // to UpperBound.
+ WorkList.push_back(&StartSU);
+ do {
+ const SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ for (int I = SU->Succs.size()-1; I >= 0; --I) {
+ const SUnit *Succ = SU->Succs[I].getSUnit();
+ unsigned s = Succ->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
+ if (Succ->isBoundaryNode())
+ continue;
+ if (Node2Index[s] == UpperBound) {
+ Found = true;
+ continue;
+ }
+ // Visit successors if not already and in affected region.
+ if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+ Visited.set(s);
+ WorkList.push_back(Succ);
+ }
+ }
+ } while (!WorkList.empty());
+
+ if (!Found) {
+ Success = false;
+ return Nodes;
+ }
+
+ WorkList.clear();
+ VisitedBack.resize(SUnits.size());
+ Found = false;
+
+ // Starting from TargetSU, visit all predecessors up
+ // to LowerBound. SUs that are visited by the two
+ // passes are added to Nodes.
+ WorkList.push_back(&TargetSU);
+ do {
+ const SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ for (int I = SU->Preds.size()-1; I >= 0; --I) {
+ const SUnit *Pred = SU->Preds[I].getSUnit();
+ unsigned s = Pred->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. EntrySU).
+ if (Pred->isBoundaryNode())
+ continue;
+ if (Node2Index[s] == LowerBound) {
+ Found = true;
+ continue;
+ }
+ if (!VisitedBack.test(s) && Visited.test(s)) {
+ VisitedBack.set(s);
+ WorkList.push_back(Pred);
+ Nodes.push_back(s);
}
}
} while (!WorkList.empty());
+
+ assert(Found && "Error in SUnit Graph!");
+ Success = true;
+ return Nodes;
}
-/// Shift - Renumber the nodes so that the topological ordering is
-/// preserved.
void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
int UpperBound) {
std::vector<int> L;
@@ -598,28 +669,23 @@ void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
}
}
- for (unsigned j = 0; j < L.size(); ++j) {
- Allocate(L[j], i - shift);
+ for (unsigned LI : L) {
+ Allocate(LI, i - shift);
i = i + 1;
}
}
-
-/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will
-/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU).
bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
// Is SU reachable from TargetSU via successor edges?
if (IsReachable(SU, TargetSU))
return true;
- for (SUnit::pred_iterator
- I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I)
- if (I->isAssignedRegDep() &&
- IsReachable(SU, I->getSUnit()))
+ for (const SDep &PredDep : TargetSU->Preds)
+ if (PredDep.isAssignedRegDep() &&
+ IsReachable(SU, PredDep.getSUnit()))
return true;
return false;
}
-/// IsReachable - Checks if SU is reachable from TargetSU.
bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
const SUnit *TargetSU) {
// If insertion of the edge SU->TargetSU would create a cycle
@@ -637,7 +703,6 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
return HasLoop;
}
-/// Allocate - assign the topological index to the node n.
void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
Node2Index[n] = index;
Index2Node[index] = n;
@@ -647,4 +712,4 @@ ScheduleDAGTopologicalSort::
ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu)
: SUnits(sunits), ExitSU(exitsu) {}
-ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {}
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default;
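
Replacing the empty destructor bodies with = default is behavior-neutral; what matters is that the definitions stay out of line, the usual key-function technique (the same job SchedulingPriorityQueue::anchor() serves above) that pins each class's vtable to a single object file. The same pattern in miniature, with an invented type:

// Header: declare the virtual destructor but do not define it inline, so
// this class has a key function and its vtable is emitted only once.
struct RecognizerLike {
  virtual ~RecognizerLike(); // defined out of line in one .cpp
  virtual void emitNoop() = 0;
};

// Implementation file: the single out-of-line definition.
RecognizerLike::~RecognizerLike() = default;
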
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 611c5a7..99baa07 100644
--- a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -7,41 +7,63 @@
//
//===----------------------------------------------------------------------===//
//
-// This implements the ScheduleDAGInstrs class, which implements re-scheduling
-// of MachineInstrs.
+/// \file This implements the ScheduleDAGInstrs class, which implements
+/// re-scheduling of MachineInstrs.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Type.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
cl::ZeroOrMore, cl::init(false),
@@ -90,77 +112,17 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
const MachineLoopInfo *mli,
bool RemoveKillFlags)
: ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
- RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
- TrackLaneMasks(false), AAForDep(nullptr), BarrierChain(nullptr),
+ RemoveKillFlags(RemoveKillFlags),
UnknownValue(UndefValue::get(
- Type::getVoidTy(mf.getFunction()->getContext()))),
- FirstDbgValue(nullptr) {
+ Type::getVoidTy(mf.getFunction()->getContext()))) {
DbgValues.clear();
const TargetSubtargetInfo &ST = mf.getSubtarget();
SchedModel.init(ST.getSchedModel(), &ST, TII);
}
-/// getUnderlyingObjectFromInt - This is the function that does the work of
-/// looking through basic ptrtoint+arithmetic+inttoptr sequences.
-static const Value *getUnderlyingObjectFromInt(const Value *V) {
- do {
- if (const Operator *U = dyn_cast<Operator>(V)) {
- // If we find a ptrtoint, we can transfer control back to the
- // regular getUnderlyingObjectFromInt.
- if (U->getOpcode() == Instruction::PtrToInt)
- return U->getOperand(0);
- // If we find an add of a constant, a multiplied value, or a phi, it's
- // likely that the other operand will lead us to the base
- // object. We don't have to worry about the case where the
- // object address is somehow being computed by the multiply,
- // because our callers only care when the result is an
- // identifiable object.
- if (U->getOpcode() != Instruction::Add ||
- (!isa<ConstantInt>(U->getOperand(1)) &&
- Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
- !isa<PHINode>(U->getOperand(1))))
- return V;
- V = U->getOperand(0);
- } else {
- return V;
- }
- assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
- } while (1);
-}
-
-/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects
-/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences.
-static void getUnderlyingObjects(const Value *V,
- SmallVectorImpl<Value *> &Objects,
- const DataLayout &DL) {
- SmallPtrSet<const Value *, 16> Visited;
- SmallVector<const Value *, 4> Working(1, V);
- do {
- V = Working.pop_back_val();
-
- SmallVector<Value *, 4> Objs;
- GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
-
- for (Value *V : Objs) {
- if (!Visited.insert(V).second)
- continue;
- if (Operator::getOpcode(V) == Instruction::IntToPtr) {
- const Value *O =
- getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
- if (O->getType()->isPointerTy()) {
- Working.push_back(O);
- continue;
- }
- }
- Objects.push_back(const_cast<Value *>(V));
- }
- } while (!Working.empty());
-}
-
-/// getUnderlyingObjectsForInstr - If this machine instr has memory reference
-/// information and it can be tracked to a normal reference to a known
-/// object, return the Value for that object.
+/// If this machine instr has memory reference information and it can be tracked
+/// to a normal reference to a known object, return the Value for that object.
static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
const MachineFrameInfo &MFI,
UnderlyingObjectsVector &Objects,
@@ -189,12 +151,10 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
} else if (const Value *V = MMO->getValue()) {
SmallVector<Value *, 4> Objs;
- getUnderlyingObjects(V, Objs, DL);
+ getUnderlyingObjectsForCodeGen(V, Objs, DL);
for (Value *V : Objs) {
- if (!isIdentifiedObject(V))
- return false;
-
+ assert(isIdentifiedObject(V));
Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
}
} else
@@ -216,10 +176,6 @@ void ScheduleDAGInstrs::finishBlock() {
BB = nullptr;
}
-/// Initialize the DAG and common scheduler state for the current scheduling
-/// region. This does not actually create the DAG, only clears it. The
-/// scheduling driver may call BuildSchedGraph multiple times per scheduling
-/// region.
void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
MachineBasicBlock::iterator begin,
MachineBasicBlock::iterator end,
@@ -230,20 +186,10 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
NumRegionInstrs = regioninstrs;
}
-/// Close the current scheduling region. Don't clear any state in case the
-/// driver wants to refer to the previous scheduling region.
void ScheduleDAGInstrs::exitRegion() {
// Nothing to do.
}
-/// addSchedBarrierDeps - Add dependencies from instructions in the current
-/// list of instructions being scheduled to scheduling barrier by adding
-/// the exit SU to the register defs and use list. This is because we want to
-/// make sure instructions which define registers that are either used by
-/// the terminator or are live-out are properly scheduled. This is
-/// especially important when the definition latency of the return value(s)
-/// are too high to be hidden by the branch or when the liveout registers
-/// used by instructions in the fallthrough block.
void ScheduleDAGInstrs::addSchedBarrierDeps() {
MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
ExitSU.setInstr(ExitMI);
@@ -271,7 +217,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
}
}
-/// MO is an operand of SU's instruction that defines a physical register. Add
+/// MO is an operand of SU's instruction that defines a physical register. Adds
/// data dependencies from SU to any uses of the physical register.
void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
@@ -313,9 +259,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
}
}
-/// addPhysRegDeps - Add register dependencies (data, anti, and output) from
-/// this SUnit to following instructions in the same scheduling region that
-/// depend the physical register referenced at OperIdx.
+/// \brief Adds register dependencies (data, anti, and output) from this SUnit
+/// to following instructions in the same scheduling region that depend on the
+/// physical register referenced at OperIdx.
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
MachineInstr *MI = SU->getInstr();
MachineOperand &MO = MI->getOperand(OperIdx);
@@ -406,9 +352,9 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
return TRI->getSubRegIndexLaneMask(SubReg);
}
-/// addVRegDefDeps - Add register output and data dependencies from this SUnit
-/// to instructions that occur later in the same scheduling region if they read
-/// from or write to the virtual register defined at OperIdx.
+/// Adds register output and data dependencies from this SUnit to instructions
+/// that occur later in the same scheduling region if they read from or write to
+/// the virtual register defined at OperIdx.
///
/// TODO: Hoist loop induction variable increments. This has to be
/// reevaluated. Generally, IV scheduling should be done before coalescing.
@@ -515,10 +461,10 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
}
-/// addVRegUseDeps - Add a register data dependency if the instruction that
-/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a
-/// register antidependency from this SUnit to instructions that occur later in
-/// the same scheduling region if they write the virtual register.
+/// \brief Adds a register data dependency if the instruction that defines the
+/// virtual register used at OperIdx is mapped to an SUnit. Add a register
+/// antidependency from this SUnit to instructions that occur later in the same
+/// scheduling region if they write the virtual register.
///
/// TODO: Handle ExitSU "uses" properly.
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
@@ -545,87 +491,25 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
}
}
-/// Return true if MI is an instruction we are unable to reason about
+/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
return MI->isCall() || MI->hasUnmodeledSideEffects() ||
(MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
}
-/// This returns true if the two MIs need a chain edge between them.
-/// This is called on normal stores and loads.
-static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
- const DataLayout &DL, MachineInstr *MIa,
- MachineInstr *MIb) {
- const MachineFunction *MF = MIa->getParent()->getParent();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-
- assert ((MIa->mayStore() || MIb->mayStore()) &&
- "Dependency checked between two loads");
-
- // Let the target decide if memory accesses cannot possibly overlap.
- if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA))
- return false;
-
- // To this point analysis is generic. From here on we do need AA.
- if (!AA)
- return true;
-
- // FIXME: Need to handle multiple memory operands to support all targets.
- if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
- return true;
-
- MachineMemOperand *MMOa = *MIa->memoperands_begin();
- MachineMemOperand *MMOb = *MIb->memoperands_begin();
-
- if (!MMOa->getValue() || !MMOb->getValue())
- return true;
-
- // The following interface to AA is fashioned after DAGCombiner::isAlias
- // and operates with MachineMemOperand offset with some important
- // assumptions:
- // - LLVM fundamentally assumes flat address spaces.
- // - MachineOperand offset can *only* result from legalization and
- // cannot affect queries other than the trivial case of overlap
- // checking.
- // - These offsets never wrap and never step outside
- // of allocated objects.
- // - There should never be any negative offsets here.
- //
- // FIXME: Modify API to hide this math from "user"
- // FIXME: Even before we go to AA we can reason locally about some
- // memory objects. It can save compile time, and possibly catch some
- // corner cases not currently covered.
-
- assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
- assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
-
- int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
- int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
- int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
-
- AliasResult AAResult =
- AA->alias(MemoryLocation(MMOa->getValue(), Overlapa,
- UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
- MemoryLocation(MMOb->getValue(), Overlapb,
- UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
-
- return (AAResult != NoAlias);
-}
-
-/// Check whether two objects need a chain edge and add it if needed.
void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
unsigned Latency) {
- if (MIsNeedChainEdge(AAForDep, &MFI, MF.getDataLayout(), SUa->getInstr(),
- SUb->getInstr())) {
+ if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
SDep Dep(SUa, SDep::MayAliasMem);
Dep.setLatency(Latency);
SUb->addPred(Dep);
}
}
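// A minimal sketch of the overlap query that the removed MIsNeedChainEdge
// performed and that MachineInstr::mayAlias is assumed to subsume; the
// offsets and sizes below are hypothetical:
//   MMOa: Offset = 4, Size = 8;   MMOb: Offset = 0, Size = 4
//   MinOffset = min(4, 0) = 0
//   Overlapa  = 8 + 4 - MinOffset = 12
//   Overlapb  = 4 + 0 - MinOffset = 4
// AA->alias(MemoryLocation(Va, 12), MemoryLocation(Vb, 4)) is then
// consulted; any answer other than NoAlias forces a MayAliasMem chain edge.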
-/// Create an SUnit for each real instruction, numbered in top-down topological
-/// order. The instruction order A < B, implies that no edge exists from B to A.
+/// \brief Creates an SUnit for each real instruction, numbered in top-down
+/// topological order. The instruction order A < B, implies that no edge exists
+/// from B to A.
///
/// Map each real instruction to its SUnit.
///
@@ -640,7 +524,7 @@ void ScheduleDAGInstrs::initSUnits() {
// which is contained within a basic block.
SUnits.reserve(NumRegionInstrs);
- for (MachineInstr &MI : llvm::make_range(RegionBegin, RegionEnd)) {
+ for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
if (MI.isDebugValue())
continue;
@@ -682,23 +566,22 @@ void ScheduleDAGInstrs::initSUnits() {
}
class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
-
/// Current total number of SUs in map.
- unsigned NumNodes;
+ unsigned NumNodes = 0;
/// 1 for loads, 0 for stores. (see comment in SUList)
unsigned TrueMemOrderLatency;
-public:
- Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}
+public:
+ Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}
/// To keep NumNodes up to date, insert() is used instead of
/// this operator w/ push_back().
ValueType &operator[](const SUList &Key) {
llvm_unreachable("Don't use. Use insert() instead."); };
- /// Add SU to the SUList of V. If Map grows huge, reduce its size
- /// by calling reduce().
+ /// Adds SU to the SUList of V. If Map grows huge, reduce its size by calling
+ /// reduce().
void inline insert(SUnit *SU, ValueType V) {
MapVector::operator[](V).push_back(SU);
NumNodes++;
@@ -708,7 +591,7 @@ public:
void inline clearList(ValueType V) {
iterator Itr = find(V);
if (Itr != end()) {
- assert (NumNodes >= Itr->second.size());
+ assert(NumNodes >= Itr->second.size());
NumNodes -= Itr->second.size();
Itr->second.clear();
@@ -723,8 +606,8 @@ public:
unsigned inline size() const { return NumNodes; }
- /// Count the number of SUs in this map after a reduction.
- void reComputeSize(void) {
+ /// Counts the number of SUs in this map after a reduction.
+ void reComputeSize() {
NumNodes = 0;
for (auto &I : *this)
NumNodes += I.second.size();
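// Minimal usage sketch for the map above (SU0, SU1 and V are hypothetical
// placeholders):
//   Value2SUsMap Loads(/*lat=*/1);
//   Loads.insert(SU0, V);   // NumNodes == 1
//   Loads.insert(SU1, V);   // NumNodes == 2
//   Loads.clearList(V);     // NumNodes == 0 again
// insert() must be used instead of operator[] so NumNodes stays in sync.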
@@ -754,7 +637,7 @@ void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
}
void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
- assert (BarrierChain != nullptr);
+ assert(BarrierChain != nullptr);
for (auto &I : map) {
SUList &sus = I.second;
@@ -765,7 +648,7 @@ void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
}
void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
- assert (BarrierChain != nullptr);
+ assert(BarrierChain != nullptr);
// Go through all lists of SUs.
for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
@@ -797,9 +680,6 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
map.reComputeSize();
}
-/// If RegPressure is non-null, compute register pressure as a side effect. The
-/// DAG builder is an efficient place to do it because it already visits
-/// operands.
void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
RegPressureTracker *RPTracker,
PressureDiffs *PDiffs,
@@ -1088,10 +968,6 @@ void ScheduleDAGInstrs::Value2SUsMap::dump() {
}
}
-/// Reduce maps in FIFO order, by N SUs. This is better than turning
-/// every Nth memory SU into BarrierChain in buildSchedGraph(), since
-/// it avoids unnecessary edges between seen SUs above the new
-/// BarrierChain, and those below it.
void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
Value2SUsMap &loads, unsigned N) {
DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
@@ -1113,7 +989,7 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
// The N last elements in NodeNums will be removed, and the SU with
// the lowest NodeNum of them will become the new BarrierChain to
// let the not yet seen SUs have a dependency to the removed SUs.
- assert (N <= NodeNums.size());
+ assert(N <= NodeNums.size());
SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
if (BarrierChain) {
// The aliasing and non-aliasing maps reduce independently of each
@@ -1142,183 +1018,77 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
loads.dump());
}
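// Worked example of the reduction above, with assumed contents:
//   NodeNums = {3, 7, 12, 20, 31}, N = 2
// The two SUs with the highest NodeNums (20 and 31) are dropped from the
// maps, and SUnits[20] -- the lowest NodeNum among the removed ones --
// becomes the new BarrierChain, so SUs seen later still order after the
// removed SUs through that single node.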
-/// \brief Initialize register live-range state for updating kills.
-void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) {
- // Start with no live registers.
- LiveRegs.reset();
-
- // Examine the live-in regs of all successors.
- for (const MachineBasicBlock *Succ : BB->successors()) {
- for (const auto &LI : Succ->liveins()) {
- // Repeat, for reg and all subregs.
- for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true);
- SubRegs.isValid(); ++SubRegs)
- LiveRegs.set(*SubRegs);
- }
- }
-}
-
-/// \brief If we change a kill flag on the bundle instruction implicit register
-/// operands, then we also need to propagate that to any instructions inside
-/// the bundle which had the same kill state.
-static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg,
- bool NewKillState,
- const TargetRegisterInfo *TRI) {
- if (MI->getOpcode() != TargetOpcode::BUNDLE)
- return;
-
- // Walk backwards from the last instruction in the bundle to the first.
- // Once we set a kill flag on an instruction, we bail out, as otherwise we
- // might set it on too many operands. We will clear as many flags as we
- // can though.
- MachineBasicBlock::instr_iterator Begin = MI->getIterator();
- MachineBasicBlock::instr_iterator End = getBundleEnd(Begin);
- while (Begin != End) {
- if (NewKillState) {
- if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false))
- return;
- } else
- (--End)->clearRegisterKills(Reg, TRI);
- }
-}
-
-bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) {
- // Setting kill flag...
- if (!MO.isKill()) {
- MO.setIsKill(true);
- toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
- return false;
- }
-
- // If MO itself is live, clear the kill flag...
- if (LiveRegs.test(MO.getReg())) {
- MO.setIsKill(false);
- toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
- return false;
- }
-
- // If any subreg of MO is live, then create an imp-def for that
- // subreg and keep MO marked as killed.
- MO.setIsKill(false);
- toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
- bool AllDead = true;
- const unsigned SuperReg = MO.getReg();
- MachineInstrBuilder MIB(MF, MI);
- for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) {
- if (LiveRegs.test(*SubRegs)) {
- MIB.addReg(*SubRegs, RegState::ImplicitDefine);
- AllDead = false;
- }
- }
+static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
+ MachineInstr &MI, bool addToLiveRegs) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
- if(AllDead) {
- MO.setIsKill(true);
- toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
+ // Things that are available after the instruction are killed by it.
+ bool IsKill = LiveRegs.available(MRI, Reg);
+ MO.setIsKill(IsKill);
+ if (addToLiveRegs)
+ LiveRegs.addReg(Reg);
}
- return false;
}
-// FIXME: Reuse the LivePhysRegs utility for this.
-void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
- DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n');
-
- LiveRegs.resize(TRI->getNumRegs());
- BitVector killedRegs(TRI->getNumRegs());
+void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
+ DEBUG(dbgs() << "Fixup kills for BB#" << MBB.getNumber() << '\n');
- startBlockForKills(MBB);
+ LiveRegs.init(*TRI);
+ LiveRegs.addLiveOuts(MBB);
// Examine block from end to start...
- unsigned Count = MBB->size();
- for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin();
- I != E; --Count) {
- MachineInstr &MI = *--I;
+ for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
if (MI.isDebugValue())
continue;
// Update liveness. Registers that are defed but not used in this
// instruction are now dead. Mark register and all subregs as they
// are completely defined.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.isRegMask())
- LiveRegs.clearBitsNotInMask(MO.getRegMask());
- if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
- if (Reg == 0) continue;
- if (!MO.isDef()) continue;
- // Ignore two-addr defs.
- if (MI.isRegTiedToUseOperand(i)) continue;
-
- // Repeat for reg and all subregs.
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
- SubRegs.isValid(); ++SubRegs)
- LiveRegs.reset(*SubRegs);
- }
-
- // Examine all used registers and set/clear kill flag. When a
- // register is used multiple times we only set the kill flag on
- // the first use. Don't set kill flags on undef operands.
- killedRegs.reset();
-
- // toggleKillFlag can append new operands (implicit defs), so using
- // a range-based loop is not safe. The new operands will be appended
- // at the end of the operand list and they don't need to be visited,
- // so iterating until the currently last operand is ok.
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI.getOperand(i);
- if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
- unsigned Reg = MO.getReg();
- if ((Reg == 0) || MRI.isReserved(Reg)) continue;
-
- bool kill = false;
- if (!killedRegs.test(Reg)) {
- kill = true;
- // A register is not killed if any subregs are live...
- for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
- if (LiveRegs.test(*SubRegs)) {
- kill = false;
- break;
- }
- }
-
- // If subreg is not live, then register is killed if it became
- // live in this instruction
- if (kill)
- kill = !LiveRegs.test(Reg);
- }
-
- if (MO.isKill() != kill) {
- DEBUG(dbgs() << "Fixing " << MO << " in ");
- toggleKillFlag(&MI, MO);
- DEBUG(MI.dump());
- DEBUG({
- if (MI.getOpcode() == TargetOpcode::BUNDLE) {
- MachineBasicBlock::instr_iterator Begin = MI.getIterator();
- MachineBasicBlock::instr_iterator End = getBundleEnd(Begin);
- while (++Begin != End)
- DEBUG(Begin->dump());
- }
- });
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ const MachineOperand &MO = *O;
+ if (MO.isReg()) {
+ if (!MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ LiveRegs.removeReg(Reg);
+ } else if (MO.isRegMask()) {
+ LiveRegs.removeRegsInMask(MO);
}
-
- killedRegs.set(Reg);
}
- // Mark any used register (that is not using undef) and subregs as
- // now live...
- for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
- unsigned Reg = MO.getReg();
- if ((Reg == 0) || MRI.isReserved(Reg)) continue;
-
- for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
- SubRegs.isValid(); ++SubRegs)
- LiveRegs.set(*SubRegs);
+ // If there is a bundle header fix it up first.
+ if (!MI.isBundled()) {
+ toggleKills(MRI, LiveRegs, MI, true);
+ } else {
+ MachineBasicBlock::instr_iterator First = MI.getIterator();
+ if (MI.isBundle()) {
+ toggleKills(MRI, LiveRegs, MI, false);
+ ++First;
+ }
+ // Some targets make the (questionable) assumption that the instructions
+ // inside the bundle are ordered and consequently only the last use of
+ // a register inside the bundle can kill it.
+ MachineBasicBlock::instr_iterator I = std::next(First);
+ while (I->isBundledWithSucc())
+ ++I;
+ do {
+ if (!I->isDebugValue())
+ toggleKills(MRI, LiveRegs, *I, true);
+ --I;
+ } while (I != First);
}
}
}
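// Shape of the scan above as a simplified sketch (bundle handling and
// debug values elided; not the verbatim algorithm):
//   LiveRegs = live-outs of MBB;
//   for (MachineInstr &MI : reverse(MBB)) {
//     clear MI's defs (and regmask-clobbered regs) from LiveRegs;
//     for each register use U in MI:
//       mark U killed iff LiveRegs.available(MRI, U);
//       LiveRegs.addReg(U);
//   }
// A use is a kill exactly when the register is not already live below MI.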
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+ // Cannot completely remove virtual function even in release mode.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
SU->getInstr()->dump();
#endif
@@ -1347,23 +1117,24 @@ std::string ScheduleDAGInstrs::getDAGName() const {
//===----------------------------------------------------------------------===//
namespace llvm {
-/// \brief Internal state used to compute SchedDFSResult.
+
+/// Internal state used to compute SchedDFSResult.
class SchedDFSImpl {
SchedDFSResult &R;
/// Join DAG nodes into equivalence classes by their subtree.
IntEqClasses SubtreeClasses;
/// List PredSU, SuccSU pairs that represent data edges between subtrees.
- std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs;
+ std::vector<std::pair<const SUnit *, const SUnit *>> ConnectionPairs;
struct RootData {
unsigned NodeID;
- unsigned ParentNodeID; // Parent node (member of the parent subtree).
- unsigned SubInstrCount; // Instr count in this tree only, not children.
+ unsigned ParentNodeID; ///< Parent node (member of the parent subtree).
+ unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
+ ///< children.
RootData(unsigned id): NodeID(id),
- ParentNodeID(SchedDFSResult::InvalidSubtreeID),
- SubInstrCount(0) {}
+ ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}
unsigned getSparseSetIndex() const { return NodeID; }
};
@@ -1375,7 +1146,7 @@ public:
RootSet.setUniverse(R.DFSNodeData.size());
}
- /// Return true if this node been visited by the DFS traversal.
+ /// Returns true if this node has been visited by the DFS traversal.
///
/// During visitPostorderNode the Node's SubtreeID is assigned to the Node
/// ID. Later, SubtreeID is updated but remains valid.
@@ -1384,7 +1155,7 @@ public:
!= SchedDFSResult::InvalidSubtreeID;
}
- /// Initialize this node's instruction count. We don't need to flag the node
+ /// Initializes this node's instruction count. We don't need to flag the node
/// visited until visitPostorder because the DAG cannot have cycles.
void visitPreorder(const SUnit *SU) {
R.DFSNodeData[SU->NodeNum].InstrCount =
@@ -1433,8 +1204,8 @@ public:
RootSet[SU->NodeNum] = RData;
}
- /// Called once for each tree edge after calling visitPostOrderNode on the
- /// predecessor. Increment the parent node's instruction count and
+ /// \brief Called once for each tree edge after calling visitPostOrderNode on
+ /// the predecessor. Increment the parent node's instruction count and
/// preemptively join this subtree to its parent's if it is small enough.
void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
R.DFSNodeData[Succ->NodeNum].InstrCount
@@ -1442,13 +1213,13 @@ public:
joinPredSubtree(PredDep, Succ);
}
- /// Add a connection for cross edges.
+ /// Adds a connection for cross edges.
void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
}
- /// Set each node's subtree ID to the representative ID and record connections
- /// between trees.
+ /// Sets each node's subtree ID to the representative ID and records
+ /// connections between trees.
void finalize() {
SubtreeClasses.compress();
R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
@@ -1484,8 +1255,8 @@ public:
}
protected:
- /// Join the predecessor subtree with the successor that is its DFS
- /// parent. Apply some heuristics before joining.
+ /// Joins the predecessor subtree with the successor that is its DFS parent.
+ /// Applies some heuristics before joining.
bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
bool CheckLimit = true) {
assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
@@ -1531,12 +1302,15 @@ protected:
} while (FromTree != SchedDFSResult::InvalidSubtreeID);
}
};
-} // namespace llvm
+
+} // end namespace llvm
namespace {
-/// \brief Manage the stack used by a reverse depth-first search over the DAG.
+
+/// Manage the stack used by a reverse depth-first search over the DAG.
class SchedDAGReverseDFS {
- std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack;
+ std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;
+
public:
bool isComplete() const { return DFSStack.empty(); }
@@ -1558,7 +1332,8 @@ public:
return getCurr()->Preds.end();
}
};
-} // anonymous
+
+} // end anonymous namespace
static bool hasDataSucc(const SUnit *SU) {
for (const SDep &SuccDep : SU->Succs) {
@@ -1569,7 +1344,7 @@ static bool hasDataSucc(const SUnit *SU) {
return false;
}
-/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first
+/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
/// search from this root.
void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
if (!IsBottomUp)
@@ -1583,7 +1358,7 @@ void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
SchedDAGReverseDFS DFS;
Impl.visitPreorder(&SU);
DFS.follow(&SU);
- for (;;) {
+ while (true) {
// Traverse the leftmost path as far as possible.
while (DFS.getPred() != DFS.getPredEnd()) {
const SDep &PredDep = *DFS.getPred();
@@ -1626,8 +1401,8 @@ void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
}
}
-LLVM_DUMP_METHOD
-void ILPValue::print(raw_ostream &OS) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
OS << InstrCount << " / " << Length << " = ";
if (!Length)
OS << "BADILP";
@@ -1635,8 +1410,7 @@ void ILPValue::print(raw_ostream &OS) const {
OS << format("%g", ((double)InstrCount / Length));
}
-LLVM_DUMP_METHOD
-void ILPValue::dump() const {
+LLVM_DUMP_METHOD void ILPValue::dump() const {
dbgs() << *this << '\n';
}
@@ -1648,4 +1422,6 @@ raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
return OS;
}
-} // namespace llvm
+} // end namespace llvm
+
+#endif
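// Worked example of the ILP value printed above, with assumed counts:
//   InstrCount = 8 instructions over a critical-path Length of 4
//   => prints "8 / 4 = 2" (a Length of 0 prints "BADILP" instead).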
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index ca2881c..bb6a459 100644
--- a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -11,11 +11,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/Constants.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
diff --git a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 83bc1ba..b3d83d5 100644
--- a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -1,4 +1,4 @@
-//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===//
+//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,11 +15,13 @@
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <cassert>
using namespace llvm;
@@ -29,8 +31,7 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
const InstrItineraryData *II, const ScheduleDAG *SchedDAG,
const char *ParentDebugType)
: ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II),
- DAG(SchedDAG), IssueWidth(0), IssueCount(0) {
-
+ DAG(SchedDAG) {
// Determine the maximum depth of any itinerary. This determines the depth of
// the scoreboard. We always make the scoreboard at least 1 cycle deep to
// avoid dealing with the boundary condition.
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2c7bffe..432c86d 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -33,6 +34,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
@@ -53,10 +55,6 @@ STATISTIC(SlicedLoads, "Number of load sliced");
namespace {
static cl::opt<bool>
- CombinerAA("combiner-alias-analysis", cl::Hidden,
- cl::desc("Enable DAG combiner alias-analysis heuristics"));
-
- static cl::opt<bool>
CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
cl::desc("Enable DAG combiner's use of IR alias analysis"));
@@ -117,7 +115,7 @@ namespace {
SmallPtrSet<SDNode *, 32> CombinedNodes;
// AA - Used for DAG load/store alias analysis.
- AliasAnalysis &AA;
+ AliasAnalysis *AA;
/// When an instruction is simplified, add all users of the instruction to
/// the work lists because they might get more simplified now.
@@ -133,6 +131,9 @@ namespace {
/// Add to the worklist making sure its instance is at the back (next to be
/// processed.)
void AddToWorklist(SDNode *N) {
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ "Deleted Node added to Worklist");
+
// Skip handle nodes as they can't usefully be combined and confuse the
// zero-use deletion strategy.
if (N->getOpcode() == ISD::HANDLENODE)
@@ -177,6 +178,7 @@ namespace {
void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
private:
+ unsigned MaximumLegalStoreInBits;
/// Check the specified integer node value to see if it can be simplified or
/// if things it uses can be simplified by bit propagation.
@@ -232,11 +234,18 @@ namespace {
SDValue visitTokenFactor(SDNode *N);
SDValue visitMERGE_VALUES(SDNode *N);
SDValue visitADD(SDNode *N);
+ SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);
SDValue visitSUB(SDNode *N);
SDValue visitADDC(SDNode *N);
+ SDValue visitUADDO(SDNode *N);
+ SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
SDValue visitSUBC(SDNode *N);
+ SDValue visitUSUBO(SDNode *N);
SDValue visitADDE(SDNode *N);
+ SDValue visitADDCARRY(SDNode *N);
+ SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
SDValue visitSUBE(SDNode *N);
+ SDValue visitSUBCARRY(SDNode *N);
SDValue visitMUL(SDNode *N);
SDValue useDivRem(SDNode *N);
SDValue visitSDIV(SDNode *N);
@@ -259,6 +268,7 @@ namespace {
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
SDValue visitRotate(SDNode *N);
+ SDValue visitABS(SDNode *N);
SDValue visitBSWAP(SDNode *N);
SDValue visitBITREVERSE(SDNode *N);
SDValue visitCTLZ(SDNode *N);
@@ -271,9 +281,11 @@ namespace {
SDValue visitSELECT_CC(SDNode *N);
SDValue visitSETCC(SDNode *N);
SDValue visitSETCCE(SDNode *N);
+ SDValue visitSETCCCARRY(SDNode *N);
SDValue visitSIGN_EXTEND(SDNode *N);
SDValue visitZERO_EXTEND(SDNode *N);
SDValue visitANY_EXTEND(SDNode *N);
+ SDValue visitAssertZext(SDNode *N);
SDValue visitSIGN_EXTEND_INREG(SDNode *N);
SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
@@ -336,6 +348,7 @@ namespace {
SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
SDValue foldSelectOfConstants(SDNode *N);
+ SDValue foldBinOpIntoSelect(SDNode *BO);
bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
@@ -344,6 +357,8 @@ namespace {
bool NotExtCompare = false);
SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
SDValue N2, SDValue N3, ISD::CondCode CC);
+ SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+ const SDLoc &DL);
SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, bool foldBooleans = true);
@@ -361,14 +376,14 @@ namespace {
SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
SDValue BuildLogBase2(SDValue Op, const SDLoc &DL);
- SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags);
- SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags);
- SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags);
- SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, bool Recip);
+ SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
+ SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
+ SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
+ SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
- SDNodeFlags *Flags, bool Reciprocal);
+ SDNodeFlags Flags, bool Reciprocal);
SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
- SDNodeFlags *Flags, bool Reciprocal);
+ SDNodeFlags Flags, bool Reciprocal);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
@@ -377,6 +392,7 @@ namespace {
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+ SDValue MatchLoadCombine(SDNode *N);
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue splitMergedValStore(StoreSDNode *ST);
@@ -384,9 +400,11 @@ namespace {
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
- SDValue createBuildVecShuffle(SDLoc DL, SDNode *N, ArrayRef<int> VectorMask,
- SDValue VecIn1, SDValue VecIn2,
- unsigned LeftIdx);
+ SDValue reduceBuildVecToTrunc(SDNode *N);
+ SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
+ ArrayRef<int> VectorMask, SDValue VecIn1,
+ SDValue VecIn2, unsigned LeftIdx);
+ SDValue matchVSelectOpSizesWithSetCC(SDNode *N);
SDValue GetDemandedBits(SDValue V, const APInt &Mask);
@@ -416,15 +434,12 @@ namespace {
/// Holds a pointer to an LSBaseSDNode as well as information on where it
/// is located in a sequence of memory operations connected by a chain.
struct MemOpLink {
- MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
- MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+ MemOpLink(LSBaseSDNode *N, int64_t Offset)
+ : MemNode(N), OffsetFromBase(Offset) {}
// Ptr to the mem node.
LSBaseSDNode *MemNode;
// Offset from the base ptr.
int64_t OffsetFromBase;
- // What is the sequence number of this mem node.
- // Lowest mem operand in the DAG starts at zero.
- unsigned SequenceNum;
};
/// This is a helper function for visitMUL to check the profitability
@@ -435,12 +450,6 @@ namespace {
SDValue &AddNode,
SDValue &ConstNode);
- /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a
- /// constant build_vector of the stored constant values in Stores.
- SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL,
- ArrayRef<MemOpLink> Stores,
- SmallVectorImpl<SDValue> &Chains,
- EVT Ty) const;
/// This is a helper function for visitAND and visitZERO_EXTEND. Returns
/// true if the (and (load x) c) pattern matches an extload. ExtVT returns
@@ -451,34 +460,36 @@ namespace {
EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
bool &NarrowLoad);
+ /// Helper function for MergeConsecutiveStores which merges the
+ /// component store chains.
+ SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumStores);
+
/// This is a helper function for MergeConsecutiveStores. When the source
/// elements of the consecutive stores are all constants or all extracted
/// vector elements, try to merge them into one larger store.
- /// \return number of stores that were merged into a merged store (always
- /// a prefix of \p StoreNode).
- bool MergeStoresOfConstantsOrVecElts(
- SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
- bool IsConstantSrc, bool UseVector);
+ /// \return True if a merged store was created.
+ bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+ EVT MemVT, unsigned NumStores,
+ bool IsConstantSrc, bool UseVector,
+ bool UseTrunc);
/// This is a helper function for MergeConsecutiveStores.
/// Stores that may be merged are placed in StoreNodes.
- /// Loads that may alias with those stores are placed in AliasLoadNodes.
- void getStoreMergeAndAliasCandidates(
- StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
- SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
+ void getStoreMergeCandidates(StoreSDNode *St,
+ SmallVectorImpl<MemOpLink> &StoreNodes);
/// Helper function for MergeConsecutiveStores. Checks if
/// Candidate stores have indirect dependency through their
/// operands. \return True if safe to merge
bool checkMergeStoreCandidatesForDependencies(
- SmallVectorImpl<MemOpLink> &StoreNodes);
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);
/// Merge consecutive store operations into a wide store.
/// This optimization uses wide integers or vectors when possible.
/// \return number of stores that were merged into a merged store (the
/// affected nodes are stored as a prefix in \p StoreNodes).
- bool MergeConsecutiveStores(StoreSDNode *N,
- SmallVectorImpl<MemOpLink> &StoreNodes);
+ bool MergeConsecutiveStores(StoreSDNode *N);
/// \brief Try to transform a truncation where C is a constant:
/// (trunc (and X, C)) -> (and (trunc X), (trunc C))
@@ -489,10 +500,17 @@ namespace {
SDValue distributeTruncateThroughAnd(SDNode *N);
public:
- DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+ DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
: DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
- OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+ OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) {
ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+ MaximumLegalStoreInBits = 0;
+ for (MVT VT : MVT::all_valuetypes())
+ if (EVT(VT).isSimple() && VT != MVT::Other &&
+ TLI.isTypeLegal(EVT(VT)) &&
+ VT.getSizeInBits() >= MaximumLegalStoreInBits)
+ MaximumLegalStoreInBits = VT.getSizeInBits();
}
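// Sketch of what the constructor loop above computes, for a hypothetical
// target whose widest legal type is v4i32:
//   legal types: i32 (32), i64 (64), v4i32 (128) => MaximumLegalStoreInBits = 128
// i.e. the widest store MergeConsecutiveStores may later try to form.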
/// Runs the dag combiner on all nodes in the work list
@@ -607,10 +625,16 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
switch (Op.getOpcode()) {
default: return false;
- case ISD::ConstantFP:
- // Don't invert constant FP values after legalize. The negated constant
- // isn't necessarily legal.
- return LegalOperations ? 0 : 1;
+ case ISD::ConstantFP: {
+ if (!LegalOperations)
+ return 1;
+
+ // Don't invert constant FP values after legalization unless the target says
+ // the negated constant is legal.
+ EVT VT = Op.getValueType();
+ return TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+ TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT);
+ }
case ISD::FADD:
// FIXME: determine better conditions for this xform.
if (!Options->UnsafeFPMath) return 0;
@@ -629,7 +653,8 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
Depth + 1);
case ISD::FSUB:
// We can't turn -(A-B) into B-A when we honor signed zeros.
- if (!Options->UnsafeFPMath && !Op.getNode()->getFlags()->hasNoSignedZeros())
+ if (!Options->NoSignedZerosFPMath &&
+ !Op.getNode()->getFlags().hasNoSignedZeros())
return 0;
// fold (fneg (fsub A, B)) -> (fsub B, A)
@@ -667,7 +692,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
- const SDNodeFlags *Flags = Op.getNode()->getFlags();
+ const SDNodeFlags Flags = Op.getNode()->getFlags();
switch (Op.getOpcode()) {
default: llvm_unreachable("Unknown code");
@@ -950,8 +975,8 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
/// things it uses can be simplified by bit propagation. If so, return true.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
- APInt KnownZero, KnownOne;
- if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+ KnownBits Known;
+ if (!TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO))
return false;
// Revisit the node.
@@ -1006,13 +1031,13 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
switch (Opc) {
default: break;
case ISD::AssertSext:
- return DAG.getNode(ISD::AssertSext, DL, PVT,
- SExtPromoteOperand(Op.getOperand(0), PVT),
- Op.getOperand(1));
+ if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
+ return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
+ break;
case ISD::AssertZext:
- return DAG.getNode(ISD::AssertZext, DL, PVT,
- ZExtPromoteOperand(Op.getOperand(0), PVT),
- Op.getOperand(1));
+ if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
+ return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
+ break;
case ISD::Constant: {
unsigned ExtOpc =
Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
@@ -1079,37 +1104,44 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
+ DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+
bool Replace0 = false;
SDValue N0 = Op.getOperand(0);
SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
- if (!NN0.getNode())
- return SDValue();
bool Replace1 = false;
SDValue N1 = Op.getOperand(1);
- SDValue NN1;
- if (N0 == N1)
- NN1 = NN0;
- else {
- NN1 = PromoteOperand(N1, PVT, Replace1);
- if (!NN1.getNode())
- return SDValue();
- }
+ SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
+ SDLoc DL(Op);
- AddToWorklist(NN0.getNode());
- if (NN1.getNode())
- AddToWorklist(NN1.getNode());
+ SDValue RV =
+ DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
+
+ // We are always replacing N0/N1's use in N and only need
+ // additional replacements if there are additional uses.
+ Replace0 &= !N0->hasOneUse();
+ Replace1 &= (N0 != N1) && !N1->hasOneUse();
+
+ // Combine Op here so it is preserved past replacements.
+ CombineTo(Op.getNode(), RV);
- if (Replace0)
+ // If operands have a use ordering, make sure we deal with the
+ // predecessor first.
+ if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
+ std::swap(N0, N1);
+ std::swap(NN0, NN1);
+ }
+
+ if (Replace0) {
+ AddToWorklist(NN0.getNode());
ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
- if (Replace1)
+ }
+ if (Replace1) {
+ AddToWorklist(NN1.getNode());
ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
-
- DEBUG(dbgs() << "\nPromoting ";
- Op.getNode()->dump(&DAG));
- SDLoc DL(Op);
- return DAG.getNode(ISD::TRUNCATE, DL, VT,
- DAG.getNode(Opc, DL, PVT, NN0, NN1));
+ }
+ return Op;
}
return SDValue();
}
@@ -1137,26 +1169,32 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
assert(PVT != VT && "Don't know what type to promote to!");
+ DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+
bool Replace = false;
SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
if (Opc == ISD::SRA)
- N0 = SExtPromoteOperand(Op.getOperand(0), PVT);
+ N0 = SExtPromoteOperand(N0, PVT);
else if (Opc == ISD::SRL)
- N0 = ZExtPromoteOperand(Op.getOperand(0), PVT);
+ N0 = ZExtPromoteOperand(N0, PVT);
else
N0 = PromoteOperand(N0, PVT, Replace);
+
if (!N0.getNode())
return SDValue();
+ SDLoc DL(Op);
+ SDValue RV =
+ DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
+
AddToWorklist(N0.getNode());
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
- DEBUG(dbgs() << "\nPromoting ";
- Op.getNode()->dump(&DAG));
- SDLoc DL(Op);
- return DAG.getNode(ISD::TRUNCATE, DL, VT,
- DAG.getNode(Opc, DL, PVT, N0, Op.getOperand(1)));
+ // Deal with Op being deleted.
+ if (Op && Op.getOpcode() != ISD::DELETED_NODE)
+ return RV;
}
return SDValue();
}
@@ -1361,8 +1399,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
else {
assert(N->getValueType(0) == RV.getValueType() &&
N->getNumValues() == 1 && "Type mismatch");
- SDValue OpV = RV;
- DAG.ReplaceAllUsesWith(N, &OpV);
+ DAG.ReplaceAllUsesWith(N, &RV);
}
// Push the new node and any users onto the worklist
@@ -1389,9 +1426,13 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::ADD: return visitADD(N);
case ISD::SUB: return visitSUB(N);
case ISD::ADDC: return visitADDC(N);
+ case ISD::UADDO: return visitUADDO(N);
case ISD::SUBC: return visitSUBC(N);
+ case ISD::USUBO: return visitUSUBO(N);
case ISD::ADDE: return visitADDE(N);
+ case ISD::ADDCARRY: return visitADDCARRY(N);
case ISD::SUBE: return visitSUBE(N);
+ case ISD::SUBCARRY: return visitSUBCARRY(N);
case ISD::MUL: return visitMUL(N);
case ISD::SDIV: return visitSDIV(N);
case ISD::UDIV: return visitUDIV(N);
@@ -1415,6 +1456,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SRL: return visitSRL(N);
case ISD::ROTR:
case ISD::ROTL: return visitRotate(N);
+ case ISD::ABS: return visitABS(N);
case ISD::BSWAP: return visitBSWAP(N);
case ISD::BITREVERSE: return visitBITREVERSE(N);
case ISD::CTLZ: return visitCTLZ(N);
@@ -1427,9 +1469,11 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::SELECT_CC: return visitSELECT_CC(N);
case ISD::SETCC: return visitSETCC(N);
case ISD::SETCCE: return visitSETCCE(N);
+ case ISD::SETCCCARRY: return visitSETCCCARRY(N);
case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
+ case ISD::AssertZext: return visitAssertZext(N);
case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -1530,7 +1574,7 @@ SDValue DAGCombiner::combine(SDNode *N) {
// If N is a commutative binary node, try commuting it to enable more
// sdisel CSE.
- if (!RV.getNode() && SelectionDAG::isCommutativeBinOp(N->getOpcode()) &&
+ if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
N->getNumValues() == 1) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -1574,7 +1618,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
}
SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
- SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
+ SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
SmallPtrSet<SDNode*, 16> SeenOps;
bool Changed = false; // If we should replace this token factor.
@@ -1618,26 +1662,108 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
}
}
- SDValue Result;
+ // Remove nodes that are chained to another node in the list. Do so by
+ // walking up chains breadth-first, stopping when we've seen another
+ // operand. In general we must climb to the EntryNode, but we can exit
+ // early if we find all remaining work is associated with just one operand
+ // as no further pruning is possible.
+
+ // List of nodes to search through and original Ops from which they originate.
+ SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
+ SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
+ SmallPtrSet<SDNode *, 16> SeenChains;
+ bool DidPruneOps = false;
+
+ unsigned NumLeftToConsider = 0;
+ for (const SDValue &Op : Ops) {
+ Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
+ OpWorkCount.push_back(1);
+ }
+
+ auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
+ // If this is an Op, we can remove the op from the list. Remark any
+ // search associated with it as from the current OpNumber.
+ if (SeenOps.count(Op) != 0) {
+ Changed = true;
+ DidPruneOps = true;
+ unsigned OrigOpNumber = 0;
+ while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
+ OrigOpNumber++;
+ assert((OrigOpNumber != Ops.size()) &&
+ "expected to find TokenFactor Operand");
+ // Re-mark worklist from OrigOpNumber to OpNumber
+ for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
+ if (Worklist[i].second == OrigOpNumber) {
+ Worklist[i].second = OpNumber;
+ }
+ }
+ OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
+ OpWorkCount[OrigOpNumber] = 0;
+ NumLeftToConsider--;
+ }
+ // Add if it's a new chain
+ if (SeenChains.insert(Op).second) {
+ OpWorkCount[OpNumber]++;
+ Worklist.push_back(std::make_pair(Op, OpNumber));
+ }
+ };
+
+ for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
+ // We need to consider at least 2 Ops to prune.
+ if (NumLeftToConsider <= 1)
+ break;
+ auto CurNode = Worklist[i].first;
+ auto CurOpNumber = Worklist[i].second;
+ assert((OpWorkCount[CurOpNumber] > 0) &&
+ "Node should not appear in worklist");
+ switch (CurNode->getOpcode()) {
+ case ISD::EntryToken:
+ // Hitting EntryToken is the only way for the search to terminate without
+ // hitting another operand's search. Prevent us from marking this operand
+ // considered.
+ NumLeftToConsider++;
+ break;
+ case ISD::TokenFactor:
+ for (const SDValue &Op : CurNode->op_values())
+ AddToWorklist(i, Op.getNode(), CurOpNumber);
+ break;
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
+ break;
+ default:
+ if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
+ AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
+ break;
+ }
+ OpWorkCount[CurOpNumber]--;
+ if (OpWorkCount[CurOpNumber] == 0)
+ NumLeftToConsider--;
+ }
// If we've changed things around then replace token factor.
if (Changed) {
+ SDValue Result;
if (Ops.empty()) {
// The entry token is the only possible outcome.
Result = DAG.getEntryNode();
} else {
- // New and improved token factor.
- Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+ if (DidPruneOps) {
+ SmallVector<SDValue, 8> PrunedOps;
+ // Prune the ops that were reached while walking up another op's chain.
+ for (const SDValue &Op : Ops) {
+ if (SeenChains.count(Op.getNode()) == 0)
+ PrunedOps.push_back(Op);
+ }
+ Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
+ } else {
+ Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+ }
}
-
- // Add users to worklist if AA is enabled, since it may introduce
- // a lot of new chained token factors while removing memory deps.
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
- return CombineTo(N, Result, UseAA /*add to worklist*/);
+ return Result;
}
-
- return Result;
+ return SDValue();
}
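// Worked example of the pruning above (the chain layout is assumed): a
// TokenFactor T has operands {A, B}, and B's chain input eventually
// reaches A. The breadth-first walk from B visits A, finds it in SeenOps,
// and sets DidPruneOps; since A now sits in SeenChains it is omitted from
// PrunedOps, so the rebuilt token factor carries only B.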
/// MERGE_VALUES can always be eliminated.
@@ -1664,6 +1790,60 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
}
+SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
+ auto BinOpcode = BO->getOpcode();
+ assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB ||
+ BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV ||
+ BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM ||
+ BinOpcode == ISD::UREM || BinOpcode == ISD::AND ||
+ BinOpcode == ISD::OR || BinOpcode == ISD::XOR ||
+ BinOpcode == ISD::SHL || BinOpcode == ISD::SRL ||
+ BinOpcode == ISD::SRA || BinOpcode == ISD::FADD ||
+ BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL ||
+ BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) &&
+ "Unexpected binary operator");
+
+ // Bail out if any constants are opaque because we can't constant fold those.
+ SDValue C1 = BO->getOperand(1);
+ if (!isConstantOrConstantVector(C1, true) &&
+ !isConstantFPBuildVectorOrConstantFP(C1))
+ return SDValue();
+
+ // Don't do this unless the old select is going away. We want to eliminate the
+ // binary operator, not replace a binop with a select.
+ // TODO: Handle ISD::SELECT_CC.
+ SDValue Sel = BO->getOperand(0);
+ if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
+ return SDValue();
+
+ SDValue CT = Sel.getOperand(1);
+ if (!isConstantOrConstantVector(CT, true) &&
+ !isConstantFPBuildVectorOrConstantFP(CT))
+ return SDValue();
+
+ SDValue CF = Sel.getOperand(2);
+ if (!isConstantOrConstantVector(CF, true) &&
+ !isConstantFPBuildVectorOrConstantFP(CF))
+ return SDValue();
+
+ // We have a select-of-constants followed by a binary operator with a
+ // constant. Eliminate the binop by pulling the constant math into the select.
+ // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1
+ EVT VT = Sel.getValueType();
+ SDLoc DL(Sel);
+ SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1);
+ assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) ||
+ isConstantFPBuildVectorOrConstantFP(NewCT)) &&
+ "Failed to constant fold a binop with constant operands");
+
+ SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1);
+ assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) ||
+ isConstantFPBuildVectorOrConstantFP(NewCF)) &&
+ "Failed to constant fold a binop with constant operands");
+
+ return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
+}
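// Concrete instance of the fold above (i32 constants chosen for
// illustration):
//   add (select Cond, 3, 5), 10  -->  select Cond, 13, 15
// Both new arms constant-fold, so the binary operator disappears and only
// the (single-use) select remains.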
+
SDValue DAGCombiner::visitADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -1702,16 +1882,36 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
if (isNullConstant(N1))
return N0;
- // fold ((c1-A)+c2) -> (c1+c2)-A
if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
- if (N0.getOpcode() == ISD::SUB)
- if (isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
- return DAG.getNode(ISD::SUB, DL, VT,
- DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
- N0.getOperand(1));
+ // fold ((c1-A)+c2) -> (c1+c2)-A
+ if (N0.getOpcode() == ISD::SUB &&
+ isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
+ // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic.
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
+ N0.getOperand(1));
+ }
+
+ // add (sext i1 X), 1 -> zext (not i1 X)
+ // We don't transform this pattern:
+ // add (zext i1 X), -1 -> sext (not i1 X)
+ // because most (?) targets generate better code for the zext form.
+ if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
+ isOneConstantOrOneSplatConstant(N1)) {
+ SDValue X = N0.getOperand(0);
+ if ((!LegalOperations ||
+ (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
+ TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
+ X.getScalarValueSizeInBits() == 1) {
+ SDValue Not = DAG.getNOT(DL, X, X.getValueType());
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
}
+ }
}
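// Sanity check of the i1 fold above for both values of X:
//   X = true:  sext X = -1, -1 + 1 = 0;  zext (not X) = zext false = 0
//   X = false: sext X =  0,  0 + 1 = 1;  zext (not X) = zext true  = 1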
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// reassociate add
if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))
return RADD;
@@ -1771,9 +1971,60 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
// fold (a+b) -> (a|b) iff a and b share no bits.
if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
- VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1))
+ DAG.haveNoCommonBitsSet(N0, N1))
return DAG.getNode(ISD::OR, DL, VT, N0, N1);
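// Example of the disjoint-bits fold above (assumed i8 values):
//   N0 = 0xC0, N1 = 0x03  =>  N0 + N1 == (N0 | N1) == 0xC3
// haveNoCommonBitsSet guarantees no carry can propagate, so OR is safe.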
+ if (SDValue Combined = visitADDLike(N0, N1, N))
+ return Combined;
+
+ if (SDValue Combined = visitADDLike(N1, N0, N))
+ return Combined;
+
+ return SDValue();
+}
+
+static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
+ bool Masked = false;
+
+ // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
+ while (true) {
+ if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
+ V = V.getOperand(0);
+ continue;
+ }
+
+ if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
+ Masked = true;
+ V = V.getOperand(0);
+ continue;
+ }
+
+ break;
+ }
+
+ // If this is not a carry, return.
+ if (V.getResNo() != 1)
+ return SDValue();
+
+ if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
+ V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
+ return SDValue();
+
+ // If the result is masked, then no matter what kind of bool it is we can
+ // return. If it isn't, then we need to make sure the bool type is either 0 or
+ // 1 and not other values.
+ if (Masked ||
+ TLI.getBooleanContents(V.getValueType()) ==
+ TargetLoweringBase::ZeroOrOneBooleanContent)
+ return V;
+
+ return SDValue();
+}
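// Shape of a carry chain getAsCarry is meant to peel (a hypothetical
// post-legalization pattern):
//   (and (zext (uaddo X, Y):1), 1)
// The AND and ZERO_EXTEND are stripped, the walk lands on result #1 of the
// UADDO, and because the value was masked any boolean representation is
// acceptable, so that carry value is returned.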
+
+SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+ EVT VT = N0.getValueType();
+ SDLoc DL(LocReference);
+
// fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0)))
@@ -1781,12 +2032,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
DAG.getNode(ISD::SHL, DL, VT,
N1.getOperand(0).getOperand(1),
N1.getOperand(1)));
- if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB &&
- isNullConstantOrNullSplatConstant(N0.getOperand(0).getOperand(0)))
- return DAG.getNode(ISD::SUB, DL, VT, N1,
- DAG.getNode(ISD::SHL, DL, VT,
- N0.getOperand(0).getOperand(1),
- N0.getOperand(1)));
if (N1.getOpcode() == ISD::AND) {
SDValue AndOp0 = N1.getOperand(0);
@@ -1797,7 +2042,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
// and similar xforms where the inner op is either ~0 or 0.
if (NumSignBits == DestBits &&
isOneConstantOrOneSplatConstant(N1->getOperand(1)))
- return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0);
+ return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);
}
// add (sext i1), X -> sub X, (zext i1)
@@ -1818,6 +2063,18 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
}
}
+ // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
+ if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)))
+ return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
+ N0, N1.getOperand(0), N1.getOperand(2));
+
+ // (add X, Carry) -> (addcarry X, 0, Carry)
+ if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
+ if (SDValue Carry = getAsCarry(TLI, N1))
+ return DAG.getNode(ISD::ADDCARRY, DL,
+ DAG.getVTList(VT, Carry.getValueType()), N0,
+ DAG.getConstant(0, DL, VT), Carry);
+
return SDValue();
}
@@ -1825,40 +2082,90 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ SDLoc DL(N);
// If the flag result is dead, turn this into an ADD.
if (!N->hasAnyUseOfValue(1))
- return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1),
- DAG.getNode(ISD::CARRY_FALSE,
- SDLoc(N), MVT::Glue));
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
// canonicalize constant to RHS.
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
if (N0C && !N1C)
- return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0);
+ return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
// fold (addc x, 0) -> x + no carry out
if (isNullConstant(N1))
return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
- SDLoc(N), MVT::Glue));
+ DL, MVT::Glue));
+
+ // If it cannot overflow, transform into an add.
+ if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUADDO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into an ADD.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getUNDEF(CarryVT));
+
+ // canonicalize constant to RHS.
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0);
- // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
- APInt LHSZero, LHSOne;
- APInt RHSZero, RHSOne;
- DAG.computeKnownBits(N0, LHSZero, LHSOne);
+ // fold (uaddo x, 0) -> x + no carry out
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
- if (LHSZero.getBoolValue()) {
- DAG.computeKnownBits(N1, RHSZero, RHSOne);
+ // If it cannot overflow, transform into an add.
+ if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+ return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+ DAG.getConstant(0, DL, CarryVT));
+
+ if (SDValue Combined = visitUADDOLike(N0, N1, N))
+ return Combined;
- // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
- // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
- if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
- return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1),
- DAG.getNode(ISD::CARRY_FALSE,
- SDLoc(N), MVT::Glue));
+ if (SDValue Combined = visitUADDOLike(N1, N0, N))
+ return Combined;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
+ auto VT = N0.getValueType();
+
+ // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
+ // If Y + 1 cannot overflow.
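+  // (Carry is at most 1, so Y + Carry cannot wrap either; the fused node then
+  // computes the same sum and the same carry-out as the original pair.)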
+ if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
+ SDValue Y = N1.getOperand(0);
+ SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
+ if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
+ return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y,
+ N1.getOperand(2));
}
+ // (uaddo X, Carry) -> (addcarry X, 0, Carry)
+ if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
+ if (SDValue Carry = getAsCarry(TLI, N1))
+ return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
+ DAG.getConstant(0, SDLoc(N), VT), Carry);
+
return SDValue();
}
@@ -1881,6 +2188,90 @@ SDValue DAGCombiner::visitADDE(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+ SDLoc DL(N);
+
+ // canonicalize constant to RHS
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
+
+ // fold (addcarry x, y, false) -> (uaddo x, y)
+ if (isNullConstant(CarryIn))
+ return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
+
+ // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
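+  // The AND keeps only the low bit of the extended carry, so the result is
+  // exactly 0 or 1.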
+ if (isNullConstant(N0) && isNullConstant(N1)) {
+ EVT VT = N0.getValueType();
+ EVT CarryVT = CarryIn.getValueType();
+ SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
+ AddToWorklist(CarryExt.getNode());
+ return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
+ DAG.getConstant(1, DL, VT)),
+ DAG.getConstant(0, DL, CarryVT));
+ }
+
+ if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
+ return Combined;
+
+ if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
+ return Combined;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
+ SDNode *N) {
+ // Iff the flag result is dead:
+ // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
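+  // The flag must be dead: the fused node also reports a carry out of X + Y
+  // itself, which the original addcarry of the truncated sum did not.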
+ if ((N0.getOpcode() == ISD::ADD ||
+ (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0)) &&
+ isNullConstant(N1) && !N->hasAnyUseOfValue(1))
+ return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
+ N0.getOperand(0), N0.getOperand(1), CarryIn);
+
+ /**
+   * When one of the addcarry arguments is itself a carry, we may be facing
+   * a diamond carry propagation, in which case we try to transform the DAG
+   * to ensure linear carry propagation if that is possible.
+ *
+ * We are trying to get:
+ * (addcarry X, 0, (addcarry A, B, Z):Carry)
+ */
+ if (auto Y = getAsCarry(TLI, N1)) {
+ /**
+ * (uaddo A, B)
+ * / \
+ * Carry Sum
+ * | \
+ * | (addcarry *, 0, Z)
+ * | /
+ * \ Carry
+ * | /
+ * (addcarry X, *, *)
+ */
+ if (Y.getOpcode() == ISD::UADDO &&
+ CarryIn.getResNo() == 1 &&
+ CarryIn.getOpcode() == ISD::ADDCARRY &&
+ isNullConstant(CarryIn.getOperand(1)) &&
+ CarryIn.getOperand(0) == Y.getValue(0)) {
+ auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(),
+ Y.getOperand(0), Y.getOperand(1),
+ CarryIn.getOperand(2));
+ AddToWorklist(NewY.getNode());
+ return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
+ DAG.getConstant(0, SDLoc(N), N0.getValueType()),
+ NewY.getValue(1));
+ }
+ }
+
+ return SDValue();
+}
+
// Since it may not be valid to emit a fold to zero for vector initializers
// check if we can before folding.
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
@@ -1920,6 +2311,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
N1.getNode());
}
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
// fold (sub x, c) -> (add x, -c)
@@ -1944,13 +2338,13 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
}
// 0 - X --> 0 if the sub is NUW.
- if (N->getFlags()->hasNoUnsignedWrap())
+ if (N->getFlags().hasNoUnsignedWrap())
return N0;
- if (DAG.MaskedValueIsZero(N1, ~APInt::getSignBit(BitWidth))) {
+ if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
// N1 is either 0 or the minimum signed value. If the sub is NSW, then
// N1 must be 0 because negating the minimum signed value is undefined.
- if (N->getFlags()->hasNoSignedWrap())
+ if (N->getFlags().hasNoSignedWrap())
return N0;
// 0 - X --> X if X is 0 or the minimum signed value.
@@ -2066,6 +2460,38 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitUSUBO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+  // If the flag result is dead, turn this into a SUB.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+ DAG.getUNDEF(CarryVT));
+
+ // fold (usubo x, x) -> 0 + no borrow
+ if (N0 == N1)
+ return CombineTo(N, DAG.getConstant(0, DL, VT),
+ DAG.getConstant(0, DL, CarryVT));
+
+ // fold (usubo x, 0) -> x + no borrow
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
+
+ // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
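+  // -1 - x never borrows, and since x + ~x == -1 the result is the bitwise
+  // complement of x.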
+ if (isAllOnesConstant(N0))
+ return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
+ DAG.getConstant(0, DL, CarryVT));
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitSUBE(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -2078,6 +2504,18 @@ SDValue DAGCombiner::visitSUBE(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+
+ // fold (subcarry x, y, false) -> (usubo x, y)
+ if (isNullConstant(CarryIn))
+ return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -2122,15 +2560,19 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
// fold (mul x, 0) -> 0
- if (N1IsConst && ConstValue1 == 0)
+ if (N1IsConst && ConstValue1.isNullValue())
return N1;
// We require a splat of the entire scalar bit width for non-contiguous
// bit patterns.
bool IsFullSplat =
ConstValue1.getBitWidth() == VT.getScalarSizeInBits();
// fold (mul x, 1) -> x
- if (N1IsConst && ConstValue1 == 1 && IsFullSplat)
+ if (N1IsConst && ConstValue1.isOneValue() && IsFullSplat)
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (mul x, -1) -> 0-x
if (N1IsConst && ConstValue1.isAllOnesValue()) {
SDLoc DL(N);
@@ -2297,6 +2739,23 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
return combined;
}
+static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (DAG.isUndef(N->getOpcode(), {N0, N1}))
+ return DAG.getUNDEF(VT);
+
+ // undef / X -> 0
+ // undef % X -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitSDIV(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -2319,8 +2778,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
return N0;
// fold (sdiv X, -1) -> 0-X
if (N1C && N1C->isAllOnesValue())
- return DAG.getNode(ISD::SUB, DL, VT,
- DAG.getConstant(0, DL, VT), N0);
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
+
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
// If we know the sign bits of both operands are zero, strength reduce to a
// udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
@@ -2332,9 +2796,8 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
// better results in that case. The target-specific lowering should learn how
// to handle exact sdivs efficiently.
if (N1C && !N1C->isNullValue() && !N1C->isOpaque() &&
- !cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact() &&
- (N1C->getAPIntValue().isPowerOf2() ||
- (-N1C->getAPIntValue()).isPowerOf2())) {
+ !N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() ||
+ (-N1C->getAPIntValue()).isPowerOf2())) {
// Target-specific implementation of sdiv x, pow2.
if (SDValue Res = BuildSDIVPow2(N))
return Res;
@@ -2372,7 +2835,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
// If integer divide is expensive and we satisfy the requirements, emit an
// alternate sequence. Targets may check function attributes for size/speed
// trade-offs.
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildSDIV(N))
return Op;
@@ -2384,13 +2847,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
if (SDValue DivRem = useDivRem(N))
return DivRem;
- // undef / X -> 0
- if (N0.isUndef())
- return DAG.getConstant(0, DL, VT);
- // X / undef -> undef
- if (N1.isUndef())
- return N1;
-
return SDValue();
}
@@ -2414,6 +2870,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
N0C, N1C))
return Folded;
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (udiv x, (1 << c)) -> x >>u c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1)) {
@@ -2444,7 +2906,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
}
// fold (udiv x, c) -> alternate
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
if (SDValue Op = BuildUDIV(N))
return Op;
@@ -2456,13 +2918,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
if (SDValue DivRem = useDivRem(N))
return DivRem;
- // undef / X -> 0
- if (N0.isUndef())
- return DAG.getConstant(0, DL, VT);
- // X / undef -> undef
- if (N1.isUndef())
- return N1;
-
return SDValue();
}
@@ -2482,32 +2937,35 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
return Folded;
+ if (SDValue V = simplifyDivRem(N, DAG))
+ return V;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
if (isSigned) {
// If we know the sign bits of both operands are zero, strength reduce to a
// urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
} else {
- // fold (urem x, pow2) -> (and x, pow2-1)
+ SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
if (DAG.isKnownToBeAPowerOfTwo(N1)) {
- APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits());
- SDValue Add =
- DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+ // fold (urem x, pow2) -> (and x, pow2-1)
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
- // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
if (N1.getOpcode() == ISD::SHL &&
DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
- APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits());
- SDValue Add =
- DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+ // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
}
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
// If X/C can be simplified by the division-by-constant logic, lower
// X%C to the equivalent of X-X/C*C.
@@ -2536,13 +2994,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
if (SDValue DivRem = useDivRem(N))
return DivRem.getValue(1);
- // undef % X -> 0
- if (N0.isUndef())
- return DAG.getConstant(0, DL, VT);
- // X % undef -> undef
- if (N1.isUndef())
- return N1;
-
return SDValue();
}
@@ -2932,95 +3383,139 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
return SDValue();
}
+/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
+SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+ const SDLoc &DL) {
+ SDValue LL, LR, RL, RR, N0CC, N1CC;
+ if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
+ !isSetCCEquivalent(N1, RL, RR, N1CC))
+ return SDValue();
+
+ assert(N0.getValueType() == N1.getValueType() &&
+ "Unexpected operand types for bitwise logic op");
+ assert(LL.getValueType() == LR.getValueType() &&
+ RL.getValueType() == RR.getValueType() &&
+ "Unexpected operand types for setcc");
+
+ // If we're here post-legalization or the logic op type is not i1, the logic
+ // op type must match a setcc result type. Also, all folds require new
+ // operations on the left and right operands, so those types must match.
+ EVT VT = N0.getValueType();
+ EVT OpVT = LL.getValueType();
+ if (LegalOperations || VT != MVT::i1)
+ if (VT != getSetCCResultType(OpVT))
+ return SDValue();
+ if (OpVT != RL.getValueType())
+ return SDValue();
+
+ ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
+ ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
+ bool IsInteger = OpVT.isInteger();
+ if (LR == RR && CC0 == CC1 && IsInteger) {
+ bool IsZero = isNullConstantOrNullSplatConstant(LR);
+ bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);
+
+ // All bits clear?
+ bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
+ // All sign bits clear?
+ bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
+ // Any bits set?
+ bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
+ // Any sign bits set?
+ bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
+
+ // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
+ // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
+ // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
+ // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
+ if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
+ SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
+ AddToWorklist(Or.getNode());
+ return DAG.getSetCC(DL, VT, Or, LR, CC1);
+ }
+
+ // All bits set?
+ bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
+ // All sign bits set?
+ bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
+ // Any bits clear?
+ bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
+ // Any sign bits clear?
+ bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
+
+ // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
+ // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
+ // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
+    // (or  (setgt X, -1), (setgt Y, -1)) --> (setgt (and X, Y), -1)
+ if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
+ AddToWorklist(And.getNode());
+ return DAG.getSetCC(DL, VT, And, LR, CC1);
+ }
+ }
+
+ // TODO: What is the 'or' equivalent of this fold?
+ // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
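+  // With wraparound arithmetic, X == 0 gives X + 1 == 1 and X == -1 gives
+  // X + 1 == 0, so X avoids both values exactly when (X + 1) u>= 2.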
+ if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE &&
+ ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
+ (isAllOnesConstant(LR) && isNullConstant(RR)))) {
+ SDValue One = DAG.getConstant(1, DL, OpVT);
+ SDValue Two = DAG.getConstant(2, DL, OpVT);
+ SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
+ AddToWorklist(Add.getNode());
+ return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
+ }
+
+ // Try more general transforms if the predicates match and the only user of
+ // the compares is the 'and' or 'or'.
+ if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
+ N0.hasOneUse() && N1.hasOneUse()) {
+ // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
+ // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
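+    // (xor A, B) is zero exactly when A == B, so the OR of the xors is zero
+    // iff both equalities hold (and nonzero iff either inequality holds).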
+ if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
+ SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
+ SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
+ SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
+ SDValue Zero = DAG.getConstant(0, DL, OpVT);
+ return DAG.getSetCC(DL, VT, Or, Zero, CC1);
+ }
+ }
+
+ // Canonicalize equivalent operands to LL == RL.
+ if (LL == RR && LR == RL) {
+ CC1 = ISD::getSetCCSwappedOperands(CC1);
+ std::swap(RL, RR);
+ }
+
+ // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+ // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+ if (LL == RL && LR == RR) {
+ ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger)
+ : ISD::getSetCCOrOperation(CC0, CC1, IsInteger);
+ if (NewCC != ISD::SETCC_INVALID &&
+ (!LegalOperations ||
+ (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
+ TLI.isOperationLegal(ISD::SETCC, OpVT))))
+ return DAG.getSetCC(DL, VT, LL, LR, NewCC);
+ }
+
+ return SDValue();
+}
+
/// This contains all DAGCombine rules which reduce two values combined by
/// an And operation to a single value. This makes them reusable in the context
/// of visitSELECT(). Rules involving constants are not included as
/// visitSELECT() already handles those cases.
-SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
- SDNode *LocReference) {
+SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N1.getValueType();
+ SDLoc DL(N);
// fold (and x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
- return DAG.getConstant(0, SDLoc(LocReference), VT);
- // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
- SDValue LL, LR, RL, RR, CC0, CC1;
- if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
- ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
- ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
-
- if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
- LL.getValueType().isInteger()) {
- // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
- if (isNullConstant(LR) && Op1 == ISD::SETEQ) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
- LR.getValueType(), LL, RL);
- AddToWorklist(ORNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
- }
- }
- if (isAllOnesConstant(LR)) {
- // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
- if (Op1 == ISD::SETEQ) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0),
- LR.getValueType(), LL, RL);
- AddToWorklist(ANDNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
- }
- }
- // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1)
- if (Op1 == ISD::SETGT) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
- LR.getValueType(), LL, RL);
- AddToWorklist(ORNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
- }
- }
- }
- }
- // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2)
- if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) &&
- Op0 == Op1 && LL.getValueType().isInteger() &&
- Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
- (isAllOnesConstant(LR) && isNullConstant(RR)))) {
- EVT CCVT = getSetCCResultType(LL.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDLoc DL(N0);
- SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(),
- LL, DAG.getConstant(1, DL,
- LL.getValueType()));
- AddToWorklist(ADDNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode,
- DAG.getConstant(2, DL, LL.getValueType()),
- ISD::SETUGE);
- }
- }
- // canonicalize equivalent to ll == rl
- if (LL == RR && LR == RL) {
- Op1 = ISD::getSetCCSwappedOperands(Op1);
- std::swap(RL, RR);
- }
- if (LL == RL && LR == RR) {
- bool isInteger = LL.getValueType().isInteger();
- ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger);
- if (Result != ISD::SETCC_INVALID &&
- (!LegalOperations ||
- (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
- TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
- EVT CCVT = getSetCCResultType(LL.getValueType());
- if (N0.getValueType() == CCVT ||
- (!LegalOperations && N0.getValueType() == MVT::i1))
- return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
- LL, LR, Result);
- }
- }
- }
+ return DAG.getConstant(0, DL, VT);
+
+ if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
+ return V;
if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
VT.getSizeInBits() <= 64) {
@@ -3037,13 +3532,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
ADDC |= Mask;
if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
- SDLoc DL(N0);
+ SDLoc DL0(N0);
SDValue NewAdd =
- DAG.getNode(ISD::ADD, DL, VT,
+ DAG.getNode(ISD::ADD, DL0, VT,
N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
CombineTo(N0.getNode(), NewAdd);
// Return N so it doesn't get rechecked!
- return SDValue(LocReference, 0);
+ return SDValue(N, 0);
}
}
}
@@ -3068,7 +3563,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
unsigned MaskBits = AndMask.countTrailingOnes();
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
- if (APIntOps::isMask(AndMask) &&
+ if (AndMask.isMask() &&
// Required bits must not span the two halves of the integer and
// must fit in the half size type.
(ShiftBits + MaskBits <= Size / 2) &&
@@ -3108,7 +3603,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
bool &NarrowLoad) {
uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits();
- if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue()))
+ if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits))
return false;
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
@@ -3191,13 +3686,17 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(BitWidth)))
return DAG.getConstant(0, SDLoc(N), VT);
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// reassociate and
if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
return RAND;
// fold (and (or x, C), D) -> D if (C & D) == D
if (N1C && N0.getOpcode() == ISD::OR)
if (ConstantSDNode *ORI = isConstOrConstSplat(N0.getOperand(1)))
- if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue())
+ if (N1C->getAPIntValue().isSubsetOf(ORI->getAPIntValue()))
return N1;
// fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
@@ -3299,6 +3798,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
// preserve semantics once we get rid of the AND.
SDValue NewLoad(Load, 0);
+
+ // Fold the AND away. NewLoad may get replaced immediately.
+ CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
+
if (Load->getExtensionType() == ISD::EXTLOAD) {
NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
Load->getValueType(0), SDLoc(Load),
@@ -3316,10 +3819,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
}
- // Fold the AND away, taking care not to fold to the old load node if we
- // replaced it.
- CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
-
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -3398,9 +3897,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// Note: the SimplifyDemandedBits fold below can make an information-losing
// transform, and then we have no way to find this better fold.
if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
- ConstantSDNode *SubLHS = isConstOrConstSplat(N0.getOperand(0));
- SDValue SubRHS = N0.getOperand(1);
- if (SubLHS && SubLHS->isNullValue()) {
+ if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) {
+ SDValue SubRHS = N0.getOperand(1);
if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
return SubRHS;
@@ -3412,7 +3910,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
// fold (and (sra)) -> (and (srl)) when possible.
- if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0)))
+ if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (zext_inreg (extload x)) -> (zextload x)
@@ -3473,7 +3971,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
EVT VT = N->getValueType(0);
if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
return SDValue();
- if (!TLI.isOperationLegal(ISD::BSWAP, VT))
+ if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
// Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00)
@@ -3585,27 +4083,36 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
return false;
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ SDValue N0 = N.getOperand(0);
+ unsigned Opc0 = N0.getOpcode();
+ if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
+ return false;
+
+ ConstantSDNode *N1C = nullptr;
+ // SHL or SRL: look upstream for AND mask operand
+ if (Opc == ISD::AND)
+ N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ else if (Opc0 == ISD::AND)
+ N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!N1C)
return false;
- unsigned Num;
+ unsigned MaskByteOffset;
switch (N1C->getZExtValue()) {
default:
return false;
- case 0xFF: Num = 0; break;
- case 0xFF00: Num = 1; break;
- case 0xFF0000: Num = 2; break;
- case 0xFF000000: Num = 3; break;
+ case 0xFF: MaskByteOffset = 0; break;
+ case 0xFF00: MaskByteOffset = 1; break;
+ case 0xFF0000: MaskByteOffset = 2; break;
+ case 0xFF000000: MaskByteOffset = 3; break;
}
// Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
- SDValue N0 = N.getOperand(0);
if (Opc == ISD::AND) {
- if (Num == 0 || Num == 2) {
+ if (MaskByteOffset == 0 || MaskByteOffset == 2) {
// (x >> 8) & 0xff
// (x >> 8) & 0xff0000
- if (N0.getOpcode() != ISD::SRL)
+ if (Opc0 != ISD::SRL)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C || C->getZExtValue() != 8)
@@ -3613,7 +4120,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
} else {
// (x << 8) & 0xff00
// (x << 8) & 0xff000000
- if (N0.getOpcode() != ISD::SHL)
+ if (Opc0 != ISD::SHL)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C || C->getZExtValue() != 8)
@@ -3622,7 +4129,7 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
} else if (Opc == ISD::SHL) {
// (x & 0xff) << 8
// (x & 0xff0000) << 8
- if (Num != 0 && Num != 2)
+ if (MaskByteOffset != 0 && MaskByteOffset != 2)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!C || C->getZExtValue() != 8)
@@ -3630,17 +4137,17 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
} else { // Opc == ISD::SRL
// (x & 0xff00) >> 8
// (x & 0xff000000) >> 8
- if (Num != 1 && Num != 3)
+ if (MaskByteOffset != 1 && MaskByteOffset != 3)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
if (!C || C->getZExtValue() != 8)
return false;
}
- if (Parts[Num])
+ if (Parts[MaskByteOffset])
return false;
- Parts[Num] = N0.getOperand(0).getNode();
+ Parts[MaskByteOffset] = N0.getOperand(0).getNode();
return true;
}
@@ -3657,7 +4164,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
EVT VT = N->getValueType(0);
if (VT != MVT::i32)
return SDValue();
- if (!TLI.isOperationLegal(ISD::BSWAP, VT))
+ if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
// Look for either
@@ -3672,18 +4179,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
if (N1.getOpcode() == ISD::OR &&
N00.getNumOperands() == 2 && N01.getNumOperands() == 2) {
// (or (or (and), (and)), (or (and), (and)))
- SDValue N000 = N00.getOperand(0);
- if (!isBSwapHWordElement(N000, Parts))
+ if (!isBSwapHWordElement(N00, Parts))
return SDValue();
- SDValue N001 = N00.getOperand(1);
- if (!isBSwapHWordElement(N001, Parts))
+ if (!isBSwapHWordElement(N01, Parts))
return SDValue();
- SDValue N010 = N01.getOperand(0);
- if (!isBSwapHWordElement(N010, Parts))
+ SDValue N10 = N1.getOperand(0);
+ if (!isBSwapHWordElement(N10, Parts))
return SDValue();
- SDValue N011 = N01.getOperand(1);
- if (!isBSwapHWordElement(N011, Parts))
+ SDValue N11 = N1.getOperand(1);
+ if (!isBSwapHWordElement(N11, Parts))
return SDValue();
} else {
// (or (or (or (and), (and)), (and)), (and))
@@ -3723,65 +4228,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
/// This contains all DAGCombine rules which reduce two values combined by
/// an Or operation to a single value \see visitANDLike().
-SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
EVT VT = N1.getValueType();
+ SDLoc DL(N);
+
// fold (or x, undef) -> -1
- if (!LegalOperations &&
- (N0.isUndef() || N1.isUndef())) {
- EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
- return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()),
- SDLoc(LocReference), VT);
- }
- // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
- SDValue LL, LR, RL, RR, CC0, CC1;
- if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
- ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
- ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
-
- if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) {
- // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0)
- // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
- if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR),
- LR.getValueType(), LL, RL);
- AddToWorklist(ORNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
- }
- }
- // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
- // fold (or (setgt X, -1), (setgt Y -1)) -> (setgt (and X, Y), -1)
- if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
- EVT CCVT = getSetCCResultType(LR.getValueType());
- if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
- SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR),
- LR.getValueType(), LL, RL);
- AddToWorklist(ANDNode.getNode());
- return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
- }
- }
- }
- // canonicalize equivalent to ll == rl
- if (LL == RR && LR == RL) {
- Op1 = ISD::getSetCCSwappedOperands(Op1);
- std::swap(RL, RR);
- }
- if (LL == RL && LR == RR) {
- bool isInteger = LL.getValueType().isInteger();
- ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger);
- if (Result != ISD::SETCC_INVALID &&
- (!LegalOperations ||
- (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
- TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
- EVT CCVT = getSetCCResultType(LL.getValueType());
- if (N0.getValueType() == CCVT ||
- (!LegalOperations && N0.getValueType() == MVT::i1))
- return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
- LL, LR, Result);
- }
- }
- }
+ if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
+ return DAG.getAllOnesConstant(DL, VT);
+
+ if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
+ return V;
// (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
@@ -3802,7 +4258,6 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(0), N1.getOperand(0));
- SDLoc DL(LocReference);
return DAG.getNode(ISD::AND, DL, VT, X,
DAG.getConstant(LHSMask | RHSMask, DL, VT));
}
@@ -3818,7 +4273,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
(N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
N0.getOperand(1), N1.getOperand(1));
- return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X);
+ return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
}
return SDValue();
@@ -3847,14 +4302,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
// fold (or x, -1) -> -1, vector edition
if (ISD::isBuildVectorAllOnes(N0.getNode()))
// do not return N0, because undef node may exist in N0
- return DAG.getConstant(
- APInt::getAllOnesValue(N0.getScalarValueSizeInBits()), SDLoc(N),
- N0.getValueType());
+ return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
if (ISD::isBuildVectorAllOnes(N1.getNode()))
// do not return N1, because undef node may exist in N1
- return DAG.getConstant(
- APInt::getAllOnesValue(N1.getScalarValueSizeInBits()), SDLoc(N),
- N1.getValueType());
+ return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
// fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
// Do this only if the resulting shuffle is legal.
@@ -3867,7 +4318,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
// Ensure both shuffles have a zero input.
- if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) {
+ if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
@@ -3939,6 +4390,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
// fold (or x, -1) -> -1
if (isAllOnesConstant(N1))
return N1;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (or x, c) -> c iff (x & ~c) == 0
if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
return N1;
@@ -3955,20 +4410,22 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
// reassociate or
if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
return ROR;
+
// Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
- // iff (c1 & c2) == 0.
- if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
- isa<ConstantSDNode>(N0.getOperand(1))) {
- ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
- if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) {
- if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
- N1C, C1))
- return DAG.getNode(
- ISD::AND, SDLoc(N), VT,
- DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR);
- return SDValue();
+ // iff (c1 & c2) != 0.
+ if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse()) {
+ if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ if (C1->getAPIntValue().intersects(N1C->getAPIntValue())) {
+ if (SDValue COR =
+ DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, N1C, C1))
+ return DAG.getNode(
+ ISD::AND, SDLoc(N), VT,
+ DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR);
+ return SDValue();
+ }
}
}
+
// Simplify: (or (op x...), (op y...)) -> (op (or x, y))
if (N0.getOpcode() == N1.getOpcode())
if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
@@ -3978,9 +4435,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
return SDValue(Rot, 0);
+ if (SDValue Load = MatchLoadCombine(N))
+ return Load;
+
// Simplify the operands using demanded-bits information.
- if (!VT.isVector() &&
- SimplifyDemandedBits(SDValue(N, 0)))
+ if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
@@ -4134,6 +4593,20 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
return nullptr;
}
+// Returns true if Left + Right == Sum (constant or constant splat vector).
+static bool sumMatchConstant(SDValue Left, SDValue Right, unsigned Sum,
+ SelectionDAG &DAG, const SDLoc &DL) {
+ EVT ShiftVT = Left.getValueType();
+ if (ShiftVT != Right.getValueType()) return false;
+
+ SDValue ShiftSum = DAG.FoldConstantArithmetic(ISD::ADD, DL, ShiftVT,
+ Left.getNode(), Right.getNode());
+ if (!ShiftSum) return false;
+
+ ConstantSDNode *CSum = isConstOrConstSplat(ShiftSum);
+ return CSum && CSum->getZExtValue() == Sum;
+}
+
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr].
@@ -4179,31 +4652,24 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
// fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
- if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) {
- uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue();
- uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue();
- if ((LShVal + RShVal) != EltSizeInBits)
- return nullptr;
-
+ if (sumMatchConstant(LHSShiftAmt, RHSShiftAmt, EltSizeInBits, DAG, DL)) {
SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
- APInt AllBits = APInt::getAllOnesValue(EltSizeInBits);
- SDValue Mask = DAG.getConstant(AllBits, DL, VT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
+ SDValue Mask = AllOnes;
if (LHSMask.getNode()) {
- APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
+ SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
- DAG.getNode(ISD::OR, DL, VT, LHSMask,
- DAG.getConstant(RHSBits, DL, VT)));
+ DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
}
if (RHSMask.getNode()) {
- APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal);
+ SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
- DAG.getNode(ISD::OR, DL, VT, RHSMask,
- DAG.getConstant(LHSBits, DL, VT)));
+ DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
}
Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
@@ -4246,109 +4712,299 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
}
namespace {
-/// Helper struct to parse and store a memory address as base + index + offset.
-/// We ignore sign extensions when it is safe to do so.
-/// The following two expressions are not equivalent. To differentiate we need
-/// to store whether there was a sign extension involved in the index
-/// computation.
-/// (load (i64 add (i64 copyfromreg %c)
-/// (i64 signextend (add (i8 load %index)
-/// (i8 1))))
-/// vs
-///
-/// (load (i64 add (i64 copyfromreg %c)
-/// (i64 signextend (i32 add (i32 signextend (i8 load %index))
-/// (i32 1)))))
-struct BaseIndexOffset {
- SDValue Base;
- SDValue Index;
- int64_t Offset;
- bool IsIndexSignExt;
-
- BaseIndexOffset() : Offset(0), IsIndexSignExt(false) {}
-
- BaseIndexOffset(SDValue Base, SDValue Index, int64_t Offset,
- bool IsIndexSignExt) :
- Base(Base), Index(Index), Offset(Offset), IsIndexSignExt(IsIndexSignExt) {}
-
- bool equalBaseIndex(const BaseIndexOffset &Other) {
- return Other.Base == Base && Other.Index == Index &&
- Other.IsIndexSignExt == IsIndexSignExt;
- }
-
- /// Parses tree in Ptr for base, index, offset addresses.
- static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG,
- int64_t PartialOffset = 0) {
- bool IsIndexSignExt = false;
-
- // Split up a folded GlobalAddress+Offset into its component parts.
- if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ptr))
- if (GA->getOpcode() == ISD::GlobalAddress && GA->getOffset() != 0) {
- return BaseIndexOffset(DAG.getGlobalAddress(GA->getGlobal(),
- SDLoc(GA),
- GA->getValueType(0),
- /*Offset=*/PartialOffset,
- /*isTargetGA=*/false,
- GA->getTargetFlags()),
- SDValue(),
- GA->getOffset(),
- IsIndexSignExt);
- }
-
- // We only can pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD
- // instruction, then it could be just the BASE or everything else we don't
- // know how to handle. Just use Ptr as BASE and give up.
- if (Ptr->getOpcode() != ISD::ADD)
- return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
-
- // We know that we have at least an ADD instruction. Try to pattern match
- // the simple case of BASE + OFFSET.
- if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
- int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
- return match(Ptr->getOperand(0), DAG, Offset + PartialOffset);
- }
-
- // Inside a loop the current BASE pointer is calculated using an ADD and a
- // MUL instruction. In this case Ptr is the actual BASE pointer.
- // (i64 add (i64 %array_ptr)
- // (i64 mul (i64 %induction_var)
- // (i64 %element_size)))
- if (Ptr->getOperand(1)->getOpcode() == ISD::MUL)
- return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
-
- // Look at Base + Index + Offset cases.
- SDValue Base = Ptr->getOperand(0);
- SDValue IndexOffset = Ptr->getOperand(1);
-
- // Skip signextends.
- if (IndexOffset->getOpcode() == ISD::SIGN_EXTEND) {
- IndexOffset = IndexOffset->getOperand(0);
- IsIndexSignExt = true;
- }
-
- // Either the case of Base + Index (no offset) or something else.
- if (IndexOffset->getOpcode() != ISD::ADD)
- return BaseIndexOffset(Base, IndexOffset, PartialOffset, IsIndexSignExt);
-
- // Now we have the case of Base + Index + offset.
- SDValue Index = IndexOffset->getOperand(0);
- SDValue Offset = IndexOffset->getOperand(1);
-
- if (!isa<ConstantSDNode>(Offset))
- return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
-
- // Ignore signextends.
- if (Index->getOpcode() == ISD::SIGN_EXTEND) {
- Index = Index->getOperand(0);
- IsIndexSignExt = true;
- } else IsIndexSignExt = false;
-
- int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue();
- return BaseIndexOffset(Base, Index, Off + PartialOffset, IsIndexSignExt);
+/// Represents the known origin of an individual byte in a load combine
+/// pattern. The value of the byte is either constant zero or comes from
+/// memory.
+struct ByteProvider {
+ // For constant zero providers Load is set to nullptr. For memory providers
+ // Load represents the node which loads the byte from memory.
+ // ByteOffset is the offset of the byte in the value produced by the load.
+ LoadSDNode *Load;
+ unsigned ByteOffset;
+
+ ByteProvider() : Load(nullptr), ByteOffset(0) {}
+
+ static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
+ return ByteProvider(Load, ByteOffset);
}
+ static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
+
+ bool isConstantZero() const { return !Load; }
+ bool isMemory() const { return Load; }
+
+ bool operator==(const ByteProvider &Other) const {
+ return Other.Load == Load && Other.ByteOffset == ByteOffset;
+ }
+
+private:
+ ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
+ : Load(Load), ByteOffset(ByteOffset) {}
};
+
+/// Recursively traverses the expression calculating the origin of the requested
+/// byte of the given value. Returns None if the provider can't be calculated.
+///
+/// For every value except the root of the expression, verifies that the value
+/// has exactly one use; if not, returns None. This way, if the origin of the
+/// byte is returned, it is guaranteed that the values which contribute to the
+/// byte are not used outside of this expression.
+///
+/// Because the parts of the expression are not allowed to have more than one
+/// use this function iterates over trees, not DAGs. So it never visits the same
+/// node more than once.
+const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
+ unsigned Depth,
+ bool Root = false) {
+  // A typical i64-by-i8 pattern requires recursion up to 8 calls deep.
+ if (Depth == 10)
+ return None;
+
+ if (!Root && !Op.hasOneUse())
+ return None;
+
+ assert(Op.getValueType().isScalarInteger() && "can't handle other types");
+ unsigned BitWidth = Op.getValueSizeInBits();
+ if (BitWidth % 8 != 0)
+ return None;
+ unsigned ByteWidth = BitWidth / 8;
+ assert(Index < ByteWidth && "invalid index requested");
+ (void) ByteWidth;
+
+ switch (Op.getOpcode()) {
+ case ISD::OR: {
+ auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
+ if (!LHS)
+ return None;
+ auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
+ if (!RHS)
+ return None;
+
+ if (LHS->isConstantZero())
+ return RHS;
+ if (RHS->isConstantZero())
+ return LHS;
+ return None;
+ }
+ case ISD::SHL: {
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!ShiftOp)
+ return None;
+
+ uint64_t BitShift = ShiftOp->getZExtValue();
+ if (BitShift % 8 != 0)
+ return None;
+ uint64_t ByteShift = BitShift / 8;
+
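+    // Bytes below ByteShift are the shifted-in zeros; the remaining bytes
+    // come from ByteShift positions lower in the unshifted operand.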
+ return Index < ByteShift
+ ? ByteProvider::getConstantZero()
+ : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
+ Depth + 1);
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ SDValue NarrowOp = Op->getOperand(0);
+ unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return None;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ if (Index >= NarrowByteWidth)
+ return Op.getOpcode() == ISD::ZERO_EXTEND
+ ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+ : None;
+ return calculateByteProvider(NarrowOp, Index, Depth + 1);
+ }
+ case ISD::BSWAP:
+ return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
+ Depth + 1);
+ case ISD::LOAD: {
+ auto L = cast<LoadSDNode>(Op.getNode());
+ if (L->isVolatile() || L->isIndexed())
+ return None;
+
+ unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return None;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ if (Index >= NarrowByteWidth)
+ return L->getExtensionType() == ISD::ZEXTLOAD
+ ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+ : None;
+ return ByteProvider::getMemory(L, Index);
+ }
+ }
+
+ return None;
+}
} // namespace
+/// Match a pattern where a wide type scalar value is loaded by several narrow
+/// loads and combined by shifts and ors. Fold it into a single load or a load
+/// and a BSWAP if the target supports it.
+///
+/// Assuming little endian target:
+/// i8 *a = ...
+/// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+/// =>
+/// i32 val = *((i32)a)
+///
+/// i8 *a = ...
+/// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+/// =>
+/// i32 val = BSWAP(*((i32)a))
+///
+/// TODO: This rule matches complex patterns with OR node roots and doesn't
+/// interact well with the worklist mechanism. When a part of the pattern is
+/// updated (e.g. one of the loads) its direct users are put into the worklist,
+/// but the root node of the pattern which triggers the load combine is not
+/// necessarily a direct user of the changed node. For example, once the address
+/// of t28 load is reassociated load combine won't be triggered:
+/// t25: i32 = add t4, Constant:i32<2>
+/// t26: i64 = sign_extend t25
+/// t27: i64 = add t2, t26
+/// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
+/// t29: i32 = zero_extend t28
+/// t32: i32 = shl t29, Constant:i8<8>
+/// t33: i32 = or t23, t32
+/// As a possible fix visitLoad can check if the load can be a part of a load
+/// combine pattern and add corresponding OR roots to the worklist.
+SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
+ assert(N->getOpcode() == ISD::OR &&
+ "Can only match load combining against OR nodes");
+
+ // Handles simple types only
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+ unsigned ByteWidth = VT.getSizeInBits() / 8;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Before legalization we can introduce illegal loads that are too wide; they
+  // will later be split into legal-sized loads. This enables us to combine
+  // i64-by-i8 load patterns into a couple of i32 loads on 32-bit targets.
+ if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
+ return SDValue();
+
+ std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = [](
+ unsigned BW, unsigned i) { return i; };
+ std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = [](
+ unsigned BW, unsigned i) { return BW - i - 1; };
+
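+  // e.g. for a 4-byte value, byte i of the integer sits at memory offset i on
+  // little endian targets and at offset 3 - i on big endian targets.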
+ bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
+ auto MemoryByteOffset = [&] (ByteProvider P) {
+ assert(P.isMemory() && "Must be a memory byte provider");
+ unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
+    assert(LoadBitWidth % 8 == 0 &&
+           "can only analyze providers for individual bytes, not bits");
+ unsigned LoadByteWidth = LoadBitWidth / 8;
+ return IsBigEndianTarget
+ ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
+ : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
+ };
+
+ Optional<BaseIndexOffset> Base;
+ SDValue Chain;
+
+ SmallSet<LoadSDNode *, 8> Loads;
+ Optional<ByteProvider> FirstByteProvider;
+ int64_t FirstOffset = INT64_MAX;
+
+ // Check if all the bytes of the OR we are looking at are loaded from the same
+  // base address. Collect byte offsets from the base address in ByteOffsets.
+ SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
+ for (unsigned i = 0; i < ByteWidth; i++) {
+ auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
+ if (!P || !P->isMemory()) // All the bytes must be loaded from memory
+ return SDValue();
+
+ LoadSDNode *L = P->Load;
+ assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
+ "Must be enforced by calculateByteProvider");
+ assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
+
+ // All loads must share the same chain
+ SDValue LChain = L->getChain();
+ if (!Chain)
+ Chain = LChain;
+ else if (Chain != LChain)
+ return SDValue();
+
+ // Loads must share the same base address
+ BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
+ int64_t ByteOffsetFromBase = 0;
+ if (!Base)
+ Base = Ptr;
+ else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
+ return SDValue();
+
+ // Calculate the offset of the current byte from the base address
+ ByteOffsetFromBase += MemoryByteOffset(*P);
+ ByteOffsets[i] = ByteOffsetFromBase;
+
+ // Remember the first byte load
+ if (ByteOffsetFromBase < FirstOffset) {
+ FirstByteProvider = P;
+ FirstOffset = ByteOffsetFromBase;
+ }
+
+ Loads.insert(L);
+ }
+ assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
+ "memory, so there must be at least one load which produces the value");
+ assert(Base && "Base address of the accessed memory location must be set");
+ assert(FirstOffset != INT64_MAX && "First byte offset must be set");
+
+ // Check if the bytes of the OR we are looking at match with either big or
+ // little endian value load
+ bool BigEndian = true, LittleEndian = true;
+ for (unsigned i = 0; i < ByteWidth; i++) {
+ int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
+ LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
+ BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
+ if (!BigEndian && !LittleEndian)
+ return SDValue();
+ }
+  assert((BigEndian != LittleEndian) && "should be either big or little endian");
+ assert(FirstByteProvider && "must be set");
+
+  // Ensure that the first byte is loaded from offset zero of the first load,
+  // so the combined value can be loaded from the address of the first load.
+ if (MemoryByteOffset(*FirstByteProvider) != 0)
+ return SDValue();
+ LoadSDNode *FirstLoad = FirstByteProvider->Load;
+
+ // The node we are looking at matches with the pattern, check if we can
+ // replace it with a single load and bswap if needed.
+
+ // If the load needs byte swap check if the target supports it
+ bool NeedsBswap = IsBigEndianTarget != BigEndian;
+
+  // Before legalization we can introduce illegal bswaps, which will later be
+  // converted to an explicit bswap sequence. This way we end up with a single
+ // load and byte shuffling instead of several loads and byte shuffling.
+ if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
+ return SDValue();
+
+ // Check that a load of the wide type is both allowed and fast on the target
+ bool Fast = false;
+ bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ VT, FirstLoad->getAddressSpace(),
+ FirstLoad->getAlignment(), &Fast);
+ if (!Allowed || !Fast)
+ return SDValue();
+
+ SDValue NewLoad =
+ DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoad->getAlignment());
+
+ // Transfer chain users from old loads to the new load.
+ for (LoadSDNode *L : Loads)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
+
+ return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad;
+}
+
SDValue DAGCombiner::visitXOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4386,6 +5042,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
// fold (xor x, 0) -> x
if (isNullConstant(N1))
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// reassociate xor
if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
return RXOR;
@@ -4403,9 +5063,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
default:
llvm_unreachable("Unhandled SetCC Equivalent!");
case ISD::SETCC:
- return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC);
+ return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
case ISD::SELECT_CC:
- return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2),
+ return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
N0.getOperand(3), NotCC);
}
}
@@ -4470,6 +5130,17 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
N01C->getAPIntValue(), DL, VT));
}
}
+
+ // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
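+  // Y is the sign of X splatted to all bits, so (add X, Y) ^ Y gives X when
+  // X >= 0 (Y == 0) and ~(X - 1) == -X when X < 0 (Y == -1).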
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+ N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+ if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
+ if (C->getAPIntValue() == (OpSizeInBits - 1))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0));
+ }
+
// fold (xor x, x) -> 0
if (N0 == N1)
return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
@@ -4505,8 +5176,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
return Tmp;
// Simplify the expression using non-local knowledge.
- if (!VT.isVector() &&
- SimplifyDemandedBits(SDValue(N, 0)))
+ if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
@@ -4613,13 +5283,51 @@ SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
}
SDValue DAGCombiner::visitRotate(SDNode *N) {
+ SDLoc dl(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ unsigned Bitsize = VT.getScalarSizeInBits();
+
+ // fold (rot x, 0) -> x
+ if (isNullConstantOrNullSplatConstant(N1))
+ return N0;
+
+ // fold (rot x, c) -> (rot x, c % BitSize)
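+ // e.g. (rotl i8:x, 11) -> (rotl i8:x, 3), since 11 urem 8 == 3.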
+ if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) {
+ if (Cst->getAPIntValue().uge(Bitsize)) {
+ uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize);
+ return DAG.getNode(N->getOpcode(), dl, VT, N0,
+ DAG.getConstant(RotAmt, dl, N1.getValueType()));
+ }
+ }
+
// fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
- if (N->getOperand(1).getOpcode() == ISD::TRUNCATE &&
- N->getOperand(1).getOperand(0).getOpcode() == ISD::AND) {
- if (SDValue NewOp1 =
- distributeTruncateThroughAnd(N->getOperand(1).getNode()))
- return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
- N->getOperand(0), NewOp1);
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
+ }
+
+ unsigned NextOp = N0.getOpcode();
+ // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
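+ // e.g. (rotl (rotl i8:x, 6), 5) -> (rotl i8:x, 3), since (5 + 6) srem 8 == 3,
+ // and (rotl (rotr i8:x, 2), 5) -> (rotl i8:x, 3), since 5 - 2 == 3.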
+ if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
+ SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
+ SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
+ if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
+ EVT ShiftVT = C1->getValueType(0);
+ bool SameSide = (N->getOpcode() == NextOp);
+ unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
+ if (SDValue CombinedShift =
+ DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) {
+ SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
+ SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
+ ISD::SREM, dl, ShiftVT, CombinedShift.getNode(),
+ BitsizeC.getNode());
+ return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
+ CombinedShiftNorm);
+ }
+ }
}
return SDValue();
}
@@ -4662,7 +5370,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
// fold (shl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (shl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -4673,6 +5381,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold (shl undef, x) -> 0
if (N0.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// if (shl x, c) is known to be zero, return 0
if (DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(OpSizeInBits)))
@@ -4763,7 +5475,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
// fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2
if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
- cast<BinaryWithFlagsSDNode>(N0)->Flags.hasExact()) {
+ N0->getFlags().hasExact()) {
if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
uint64_t C1 = N0C1->getZExtValue();
uint64_t C2 = N1C->getZExtValue();
@@ -4788,12 +5500,12 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
SDValue Shift;
if (c2 > c1) {
- Mask = Mask.shl(c2 - c1);
+ Mask <<= c2 - c1;
SDLoc DL(N);
Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
DAG.getConstant(c2 - c1, DL, N1.getValueType()));
} else {
- Mask = Mask.lshr(c1 - c2);
+ Mask.lshrInPlace(c1 - c2);
SDLoc DL(N);
Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
DAG.getConstant(c1 - c2, DL, N1.getValueType()));
@@ -4808,9 +5520,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
isConstantOrConstantVector(N1, /* No Opaques */ true)) {
- unsigned BitSize = VT.getScalarSizeInBits();
SDLoc DL(N);
- SDValue AllBits = DAG.getConstant(APInt::getAllOnesValue(BitSize), DL, VT);
+ SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
}
@@ -4851,6 +5562,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
unsigned OpSizeInBits = VT.getScalarSizeInBits();
// Arithmetic shifting an all-sign-bit value is a no-op.
+ // fold (sra 0, x) -> 0
+ // fold (sra -1, x) -> -1
if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
return N0;
@@ -4865,18 +5578,16 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
- // fold (sra 0, x) -> 0
- if (isNullConstant(N0))
- return N0;
- // fold (sra -1, x) -> -1
- if (isAllOnesConstant(N0))
- return N0;
// fold (sra x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
return DAG.getUNDEF(VT);
// fold (sra x, 0) -> x
if (N1C && N1C->isNullValue())
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
// sext_inreg.
if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
@@ -5016,7 +5727,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (N0C && N1C && !N1C->isOpaque())
return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
// fold (srl 0, x) -> 0
- if (isNullConstant(N0))
+ if (isNullConstantOrNullSplatConstant(N0))
return N0;
// fold (srl x, c >= size(x)) -> undef
if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
@@ -5024,6 +5735,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
// fold (srl x, 0) -> x
if (N1C && N1C->isNullValue())
return N0;
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// if (srl x, c) is known to be zero, return 0
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
APInt::getAllOnesValue(OpSizeInBits)))
@@ -5049,24 +5764,24 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
// fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2)))
if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
- N0.getOperand(0).getOpcode() == ISD::SRL &&
- isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) {
- uint64_t c1 =
- cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue();
- uint64_t c2 = N1C->getZExtValue();
- EVT InnerShiftVT = N0.getOperand(0).getValueType();
- EVT ShiftCountVT = N0.getOperand(0)->getOperand(1).getValueType();
- uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
- // This is only valid if the OpSizeInBits + c1 = size of inner shift.
- if (c1 + OpSizeInBits == InnerShiftSize) {
- SDLoc DL(N0);
- if (c1 + c2 >= InnerShiftSize)
- return DAG.getConstant(0, DL, VT);
- return DAG.getNode(ISD::TRUNCATE, DL, VT,
- DAG.getNode(ISD::SRL, DL, InnerShiftVT,
- N0.getOperand(0)->getOperand(0),
- DAG.getConstant(c1 + c2, DL,
- ShiftCountVT)));
+ N0.getOperand(0).getOpcode() == ISD::SRL) {
+ if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) {
+ uint64_t c1 = N001C->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ EVT InnerShiftVT = N0.getOperand(0).getValueType();
+ EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType();
+ uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
+ // This is only valid if OpSizeInBits + c1 == the size of the inner shift.
+ if (c1 + OpSizeInBits == InnerShiftSize) {
+ SDLoc DL(N0);
+ if (c1 + c2 >= InnerShiftSize)
+ return DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(ISD::SRL, DL, InnerShiftVT,
+ N0.getOperand(0).getOperand(0),
+ DAG.getConstant(c1 + c2, DL,
+ ShiftCountVT)));
+ }
}
}
@@ -5074,9 +5789,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
SDLoc DL(N);
- APInt AllBits = APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
SDValue Mask =
- DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(AllBits, DL, VT), N1);
+ DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
AddToWorklist(Mask.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
}
@@ -5097,7 +5811,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
DAG.getConstant(ShiftAmt, DL0,
getShiftAmountTy(SmallVT)));
AddToWorklist(SmallShift.getNode());
- APInt Mask = APInt::getAllOnesValue(OpSizeInBits).lshr(ShiftAmt);
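+ // e.g. OpSizeInBits == 32, ShiftAmt == 8: Mask == 0x00FFFFFF (low 24 bits).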
+ APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT,
DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
@@ -5115,20 +5829,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
// fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
if (N1C && N0.getOpcode() == ISD::CTLZ &&
N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
- APInt KnownZero, KnownOne;
- DAG.computeKnownBits(N0.getOperand(0), KnownZero, KnownOne);
+ KnownBits Known;
+ DAG.computeKnownBits(N0.getOperand(0), Known);
// If any of the input bits are KnownOne, then the input couldn't be all
// zeros, thus the result of the srl will always be zero.
- if (KnownOne.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
+ if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
    // If all of the bits input to the ctlz node are known to be zero, then
// the result of the ctlz is "32" and the result of the shift is one.
- APInt UnknownBits = ~KnownZero;
+ APInt UnknownBits = ~Known.Zero;
if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
// Otherwise, check to see if there is exactly one bit input to the ctlz.
- if ((UnknownBits & (UnknownBits - 1)) == 0) {
+ if (UnknownBits.isPowerOf2()) {
      // Okay, we know that only the single bit specified by UnknownBits
      // could be set on input to the CTLZ node. If this bit is set, the SRL
      // will return 0; if it is clear, it returns 1. Change the CTLZ/SRL pair
@@ -5202,6 +5916,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitABS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (abs c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0);
+ // fold (abs (abs x)) -> (abs x)
+ if (N0.getOpcode() == ISD::ABS)
+ return N0;
+ // fold (abs x) -> x iff not-negative
+ if (DAG.SignBitIsZero(N0))
+ return N0;
+ return SDValue();
+}
+
SDValue DAGCombiner::visitBSWAP(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -5217,7 +5947,11 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ // fold (bitreverse c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);
// fold (bitreverse (bitreverse x)) -> x
if (N0.getOpcode() == ISD::BITREVERSE)
return N0.getOperand(0);
@@ -5311,7 +6045,6 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
}
}
-// TODO: We should handle other cases of selecting between {-1,0,1} here.
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -5320,6 +6053,67 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
EVT CondVT = Cond.getValueType();
SDLoc DL(N);
+ if (!VT.isInteger())
+ return SDValue();
+
+ auto *C1 = dyn_cast<ConstantSDNode>(N1);
+ auto *C2 = dyn_cast<ConstantSDNode>(N2);
+ if (!C1 || !C2)
+ return SDValue();
+
+ // Only do this before legalization to avoid conflicting with target-specific
+ // transforms in the other direction (creating a select from a zext/sext).
+ // There is also a target-independent combine in DAGCombiner in the other
+ // direction for (select Cond, -1, 0) when the condition is not i1.
+ if (CondVT == MVT::i1 && !LegalOperations) {
+ if (C1->isNullValue() && C2->isOne()) {
+ // select Cond, 0, 1 --> zext (!Cond)
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ if (VT != MVT::i1)
+ NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
+ return NotCond;
+ }
+ if (C1->isNullValue() && C2->isAllOnesValue()) {
+ // select Cond, 0, -1 --> sext (!Cond)
+ SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+ if (VT != MVT::i1)
+ NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
+ return NotCond;
+ }
+ if (C1->isOne() && C2->isNullValue()) {
+ // select Cond, 1, 0 --> zext (Cond)
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+ return Cond;
+ }
+ if (C1->isAllOnesValue() && C2->isNullValue()) {
+ // select Cond, -1, 0 --> sext (Cond)
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+ return Cond;
+ }
+
+ // For any constants that differ by 1, we can transform the select into an
+ // extend and add. Use a target hook because some targets may prefer to
+ // transform in the other direction.
+ if (TLI.convertSelectOfConstantsToMath()) {
+ if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
+ // select Cond, C1, C1-1 --> add (zext Cond), C1-1
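+ // e.g. select Cond, 4, 3 --> add (zext Cond), 3.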
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+ return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+ }
+ if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) {
+ // select Cond, C1, C1+1 --> add (sext Cond), C1+1
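+ // e.g. select Cond, 2, 3 --> add (sext Cond), 3 (sext i1 true is -1).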
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+ return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+ }
+ }
+
+ return SDValue();
+ }
+
// fold (select Cond, 0, 1) -> (xor Cond, 1)
// We can't do this reliably if integer based booleans have different contents
// to floating point based booleans. This is because we can't tell whether we
@@ -5329,15 +6123,14 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
// undiscoverable (or not reasonably discoverable). For example, it could be
// in another basic block or it could require searching a complicated
// expression.
- if (VT.isInteger() &&
- (CondVT == MVT::i1 || (CondVT.isInteger() &&
- TLI.getBooleanContents(false, true) ==
- TargetLowering::ZeroOrOneBooleanContent &&
- TLI.getBooleanContents(false, false) ==
- TargetLowering::ZeroOrOneBooleanContent)) &&
- isNullConstant(N1) && isOneConstant(N2)) {
- SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
- DAG.getConstant(1, DL, CondVT));
+ if (CondVT.isInteger() &&
+ TLI.getBooleanContents(false, true) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ TLI.getBooleanContents(false, false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ C1->isNullValue() && C2->isOne()) {
+ SDValue NotCond =
+ DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
if (VT.bitsEq(CondVT))
return NotCond;
return DAG.getZExtOrTrunc(NotCond, DL, VT);
@@ -5352,19 +6145,22 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
SDValue N2 = N->getOperand(2);
EVT VT = N->getValueType(0);
EVT VT0 = N0.getValueType();
+ SDLoc DL(N);
// fold (select C, X, X) -> X
if (N1 == N2)
return N1;
+
if (const ConstantSDNode *N0C = dyn_cast<const ConstantSDNode>(N0)) {
// fold (select true, X, Y) -> X
// fold (select false, X, Y) -> Y
return !N0C->isNullValue() ? N1 : N2;
}
+
// fold (select X, X, Y) -> (or X, Y)
// fold (select X, 1, Y) -> (or C, Y)
if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
- return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
+ return DAG.getNode(ISD::OR, DL, VT, N0, N2);
if (SDValue V = foldSelectOfConstants(N))
return V;
@@ -5373,22 +6169,22 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
AddToWorklist(NOTNode.getNode());
- return DAG.getNode(ISD::AND, SDLoc(N), VT, NOTNode, N2);
+ return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
}
// fold (select C, X, 1) -> (or (not C), X)
if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
AddToWorklist(NOTNode.getNode());
- return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1);
+ return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
}
// fold (select X, Y, X) -> (and X, Y)
// fold (select X, Y, 0) -> (and X, Y)
if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
- return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1);
+ return DAG.getNode(ISD::AND, DL, VT, N0, N1);
// If we can fold this based on the true/false value, do so.
if (SimplifySelectOps(N, N1, N2))
- return SDValue(N, 0); // Don't revisit N.
+ return SDValue(N, 0); // Don't revisit N.
if (VT0 == MVT::i1) {
// The code in this block deals with the following 2 equivalences:
@@ -5399,27 +6195,27 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
// to the right anyway if we find the inner select exists in the DAG anyway
// and we always transform to the left side if we know that we can further
// optimize the combination of the conditions.
- bool normalizeToSequence
- = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
+ bool normalizeToSequence =
+ TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
// select (and Cond0, Cond1), X, Y
// -> select Cond0, (select Cond1, X, Y), Y
if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
SDValue Cond0 = N0->getOperand(0);
SDValue Cond1 = N0->getOperand(1);
- SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
- N1.getValueType(), Cond1, N1, N2);
+ SDValue InnerSelect =
+ DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
if (normalizeToSequence || !InnerSelect.use_empty())
- return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0,
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
InnerSelect, N2);
}
// select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
SDValue Cond0 = N0->getOperand(0);
SDValue Cond1 = N0->getOperand(1);
- SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
- N1.getValueType(), Cond1, N1, N2);
+ SDValue InnerSelect =
+ DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
if (normalizeToSequence || !InnerSelect.use_empty())
- return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N1,
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
InnerSelect);
}
@@ -5431,15 +6227,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
// Create the actual and node if we can generate good code for it.
if (!normalizeToSequence) {
- SDValue And = DAG.getNode(ISD::AND, SDLoc(N), N0.getValueType(),
- N0, N1_0);
- return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), And,
- N1_1, N2);
+ SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2);
}
// Otherwise see if we can optimize the "and" to a better pattern.
if (SDValue Combined = visitANDLike(N0, N1_0, N))
- return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Combined,
- N1_1, N2);
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
+ N2);
}
}
// select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
@@ -5450,15 +6244,13 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
// Create the actual or node if we can generate good code for it.
if (!normalizeToSequence) {
- SDValue Or = DAG.getNode(ISD::OR, SDLoc(N), N0.getValueType(),
- N0, N2_0);
- return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Or,
- N1, N2_2);
+ SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2);
}
// Otherwise see if we can optimize to a better pattern.
if (SDValue Combined = visitORLike(N0, N2_0, N))
- return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Combined,
- N1, N2_2);
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
+ N2_2);
}
}
}
@@ -5469,8 +6261,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) {
SDValue Cond0 = N0->getOperand(0);
if (C->isOne())
- return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(),
- Cond0, N2, N1);
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N2, N1);
}
}
}
@@ -5487,24 +6278,21 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
// FIXME: Instead of testing for UnsafeFPMath, this should be checking for
// no signed zeros as well as no nans.
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.UnsafeFPMath &&
- VT.isFloatingPoint() && N0.hasOneUse() &&
+ if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() &&
DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
- if (SDValue FMinMax = combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0),
- N0.getOperand(1), N1, N2, CC,
- TLI, DAG))
+ if (SDValue FMinMax = combineMinNumMaxNum(
+ DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG))
return FMinMax;
}
if ((!LegalOperations &&
TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
TLI.isOperationLegal(ISD::SELECT_CC, VT))
- return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1),
- N1, N2, N0.getOperand(2));
- return SimplifySelect(SDLoc(N), N0, N1, N2);
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, N0.getOperand(0),
+ N0.getOperand(1), N1, N2, N0.getOperand(2));
+ return SimplifySelect(DL, N0, N1, N2);
}
return SDValue();
@@ -5847,7 +6635,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
ISD::NON_EXTLOAD, MLD->isExpandingLoad());
Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
- MLD->isExpandingLoad());
+ MLD->isExpandingLoad());
MMO = DAG.getMachineFunction().
getMachineMemOperand(MLD->getPointerInfo(),
@@ -5908,6 +6696,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (isAbs) {
EVT VT = LHS.getValueType();
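+    // If the target supports ABS directly, use it instead of the
+    // shift-based expansion below.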
+ if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
+ return DAG.getNode(ISD::ABS, DL, VT, LHS);
+
SDValue Shift = DAG.getNode(
ISD::SRA, DL, VT, LHS,
DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
@@ -5921,34 +6712,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (SimplifySelectOps(N, N1, N2))
return SDValue(N, 0); // Don't revisit N.
- // If the VSELECT result requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (N0.getOpcode() == ISD::SETCC) {
- EVT VT = N->getValueType(0);
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
- std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
- std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
- std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);
-
- Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
- Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);
-
- // Add the new VSELECT nodes to the work list in case they need to be split
- // again.
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
- }
-
// Fold (vselect (build_vector all_ones), N1, N2) -> N1
if (ISD::isBuildVectorAllOnes(N0.getNode()))
return N1;
@@ -6030,6 +6793,19 @@ SDValue DAGCombiner::visitSETCCE(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDValue Cond = N->getOperand(3);
+
+ // If Carry is false, fold to a regular SETCC.
+ if (isNullConstant(Carry))
+ return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
+
+ return SDValue();
+}
+
/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
/// a build_vector of constants.
/// This function is called by the DAGCombiner when visiting sext/zext/aext
@@ -6258,6 +7034,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+ // Add the new TokenFactor to the worklist so it gets simplified.
+ AddToWorklist(NewChain.getNode());
+
CombineTo(N, NewValue);
// Replace uses of the original load (before extension)
@@ -6270,9 +7049,55 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
+/// If we're narrowing or widening the result of a vector select and the final
+/// size is the same size as a setcc (compare) feeding the select, then try to
+/// apply the cast operation to the select's operands because matching vector
+/// sizes for a select condition and other operands should be more efficient.
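+///
+/// For example: trunc (vselect (setcc v4i32:X), v4i64:A, v4i64:B) -->
+/// vselect (setcc v4i32:X), (trunc A), (trunc B) when the target's setcc
+/// result type for v4i32 matches the truncated v4i32 result size.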
+SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
+ unsigned CastOpcode = Cast->getOpcode();
+ assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
+ CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
+ CastOpcode == ISD::FP_ROUND) &&
+ "Unexpected opcode for vector select narrowing/widening");
+
+ // We only do this transform before legal ops because the pattern may be
+ // obfuscated by target-specific operations after legalization. Do not create
+ // an illegal select op, however, because that may be difficult to lower.
+ EVT VT = Cast->getValueType(0);
+ if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+
+ SDValue VSel = Cast->getOperand(0);
+ if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
+ VSel.getOperand(0).getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ // Does the setcc have the same vector size as the casted select?
+ SDValue SetCC = VSel.getOperand(0);
+ EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
+ if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+
+ // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
+ SDValue A = VSel.getOperand(1);
+ SDValue B = VSel.getOperand(2);
+ SDValue CastA, CastB;
+ SDLoc DL(Cast);
+ if (CastOpcode == ISD::FP_ROUND) {
+ // FP_ROUND (fptrunc) has an extra flag operand to pass along.
+ CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
+ CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
+ } else {
+ CastA = DAG.getNode(CastOpcode, DL, VT, A);
+ CastB = DAG.getNode(CastOpcode, DL, VT, B);
+ }
+ return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
+}
+
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ SDLoc DL(N);
if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
LegalOperations))
@@ -6281,8 +7106,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// fold (sext (sext x)) -> (sext x)
// fold (sext (aext x)) -> (sext x)
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
- return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
- N0.getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (sext (truncate (load x))) -> (sext (smaller load x))
@@ -6314,12 +7138,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
// bits, just sext from i32.
if (NumSignBits > OpBits-MidBits)
- return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
} else {
// Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
// bits, just truncate to i32.
if (NumSignBits > OpBits-MidBits)
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}
// fold (sext (truncate x)) -> (sextinreg x).
@@ -6329,7 +7153,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
else if (OpBits > DestBits)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op,
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
DAG.getValueType(N0.getValueType()));
}
}
@@ -6349,17 +7173,20 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
- LN0->getChain(),
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
- CombineTo(N, ExtLoad);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
- CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
- ISD::SIGN_EXTEND);
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
+ CombineTo(N, ExtLoad);
+ if (NoReplaceTrunc)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ else
+ CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ return SDValue(N, 0);
}
}
@@ -6376,8 +7203,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
EVT MemVT = LN0->getMemoryVT();
if ((!LegalOperations && !LN0->isVolatile()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
- SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
- LN0->getChain(),
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
LN0->getBasePtr(), MemVT,
LN0->getMemOperand());
CombineTo(N, ExtLoad);
@@ -6411,32 +7237,38 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
LN0->getMemOperand());
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.sext(VT.getSizeInBits());
- SDLoc DL(N);
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
SDLoc(N0.getOperand(0)),
N0.getOperand(0).getValueType(), ExtLoad);
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
CombineTo(N, And);
- CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL,
- ISD::SIGN_EXTEND);
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ if (NoReplaceTrunc)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ else
+ CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
}
if (N0.getOpcode() == ISD::SETCC) {
- EVT N0VT = N0.getOperand(0).getValueType();
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ EVT N00VT = N0.getOperand(0).getValueType();
+
// sext(setcc) -> sext_in_reg(vsetcc) for vectors.
// Only do this before legalize for now.
if (VT.isVector() && !LegalOperations &&
- TLI.getBooleanContents(N0VT) ==
+ TLI.getBooleanContents(N00VT) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
// On some architectures (such as SSE/NEON/etc) the SETCC result type is
// of the same size as the compared operands. Only optimize sext(setcc())
// if this is the case.
- EVT SVT = getSetCCResultType(N0VT);
+ EVT SVT = getSetCCResultType(N00VT);
// We know that the # elements of the results is the same as the
// # elements of the compare (and the # elements of the compare result
@@ -6444,19 +7276,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == SVT.getSizeInBits())
- return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
- N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getSetCC(DL, VT, N00, N01, CC);
// If the desired elements are smaller or larger than the source
- // elements we can use a matching integer vector type and then
- // truncate/sign extend
- EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
- if (SVT == MatchingVectorType) {
- SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType,
- N0.getOperand(0), N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT);
+ // elements, we can use a matching integer vector type and then
+ // truncate/sign extend.
+ EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+ if (SVT == MatchingVecType) {
+ SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
+ return DAG.getSExtOrTrunc(VsetCC, DL, VT);
}
}
@@ -6465,36 +7293,30 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// getBooleanContents().
unsigned SetCCWidth = N0.getScalarValueSizeInBits();
- SDLoc DL(N);
// To determine the "true" side of the select, we need to know the high bit
// of the value returned by the setcc if it evaluates to true.
// If the type of the setcc is i1, then the true case of the select is just
// sext(i1 1), that is, -1.
// If the type of the setcc is larger (say, i8) then the value of the high
- // bit depends on getBooleanContents(). So, ask TLI for a real "true" value
+ // bit depends on getBooleanContents(), so ask TLI for a real "true" value
// of the appropriate width.
- SDValue ExtTrueVal =
- (SetCCWidth == 1)
- ? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()),
- DL, VT)
- : TLI.getConstTrueVal(DAG, VT, DL);
-
- if (SDValue SCC = SimplifySelectCC(
- DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal,
- DAG.getConstant(0, DL, VT),
- cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT)
+ : TLI.getConstTrueVal(DAG, VT, DL);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ if (SDValue SCC =
+ SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
return SCC;
if (!VT.isVector()) {
- EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType());
- if (!LegalOperations ||
- TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) {
- SDLoc DL(N);
- ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
- SDValue SetCC =
- DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC);
- return DAG.getSelect(DL, VT, SetCC, ExtTrueVal,
- DAG.getConstant(0, DL, VT));
+ EVT SetCCVT = getSetCCResultType(N00VT);
+ // Don't do this transform for i1 because there's a select transform
+ // that would reverse it.
+ // TODO: We should not do this transform at all without a target hook
+ // because a sext is likely cheaper than a select?
+ if (SetCCVT.getScalarSizeInBits() != 1 &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
+ SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
+ return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
}
}
}
@@ -6502,21 +7324,23 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// fold (sext x) -> (zext x) if the sign bit is known zero.
if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
DAG.SignBitIsZero(N0))
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
+
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
return SDValue();
}
// isTruncateOf - If N is a truncate of some other value, return true, record
-// the value being truncated in Op and which of Op's bits are zero in KnownZero.
-// This function computes KnownZero to avoid a duplicated call to
+// the value being truncated in Op and which of Op's bits are zero/one in Known.
+// This function computes KnownBits to avoid a duplicated call to
// computeKnownBits in the caller.
static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
- APInt &KnownZero) {
- APInt KnownOne;
+ KnownBits &Known) {
if (N->getOpcode() == ISD::TRUNCATE) {
Op = N->getOperand(0);
- DAG.computeKnownBits(Op, KnownZero, KnownOne);
+ DAG.computeKnownBits(Op, Known);
return true;
}
@@ -6535,9 +7359,9 @@ static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
else
return false;
- DAG.computeKnownBits(Op, KnownZero, KnownOne);
+ DAG.computeKnownBits(Op, Known);
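+ // Op behaves like a truncate to i1 only if every bit other than bit 0 is
+ // known to be zero.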
- if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue())
+ if (!(Known.Zero | 1).isAllOnesValue())
return false;
return true;
@@ -6562,8 +7386,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
// This is valid when the truncated bits of x are already zero.
// FIXME: We should extend this to work for vectors too.
SDValue Op;
- APInt KnownZero;
- if (!VT.isVector() && isTruncateOf(DAG, N0, Op, KnownZero)) {
+ KnownBits Known;
+ if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) {
APInt TruncatedBits =
(Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
APInt(Op.getValueSizeInBits(), 0) :
@@ -6571,14 +7395,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueSizeInBits(),
std::min(Op.getValueSizeInBits(),
VT.getSizeInBits()));
- if (TruncatedBits == (KnownZero & TruncatedBits)) {
- if (VT.bitsGT(Op.getValueType()))
- return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op);
- if (VT.bitsLT(Op.getValueType()))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
-
- return Op;
- }
+ if (TruncatedBits.isSubsetOf(Known.Zero))
+ return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
}
// fold (zext (truncate (load x))) -> (zext (smaller load x))
@@ -6625,14 +7443,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
}
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
- SDValue Op = N0.getOperand(0);
- if (SrcVT.bitsLT(VT)) {
- Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- } else if (SrcVT.bitsGT(VT)) {
- Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
- AddToWorklist(Op.getNode());
- }
+ SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
+ AddToWorklist(Op.getNode());
return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
}
}
@@ -6646,11 +7458,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
N0.getValueType()) ||
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
SDLoc DL(N);
@@ -6677,14 +7485,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
- CombineTo(N, ExtLoad);
+
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
- CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
-
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
- ISD::ZERO_EXTEND);
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND);
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
+ CombineTo(N, ExtLoad);
+ if (NoReplaceTrunc)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ else
+ CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -6734,11 +7546,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
SDLoc(N0.getOperand(0)),
N0.getOperand(0).getValueType(), ExtLoad);
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND);
+ bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
CombineTo(N, And);
- CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
- ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL,
- ISD::ZERO_EXTEND);
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ if (NoReplaceTrunc)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ else
+ CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
}
@@ -6837,6 +7652,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
ShAmt);
}
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
return SDValue();
}
@@ -6871,14 +7689,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
}
// fold (aext (truncate x))
- if (N0.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncOp = N0.getOperand(0);
- if (TruncOp.getValueType() == VT)
- return TruncOp; // x iff x size == zext size.
- if (TruncOp.getValueType().bitsGT(VT))
- return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp);
- return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp);
- }
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
// Fold (aext (and (trunc x), cst)) -> (and x, cst)
// if the trunc is not free.
@@ -6889,11 +7701,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
N0.getValueType())) {
SDLoc DL(N);
SDValue X = N0.getOperand(0).getOperand(0);
- if (X.getValueType().bitsLT(VT)) {
- X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
- } else if (X.getValueType().bitsGT(VT)) {
- X = DAG.getNode(ISD::TRUNCATE, DL, VT, X);
- }
+ X = DAG.getAnyExtOrTrunc(X, DL, VT);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
Mask = Mask.zext(VT.getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
@@ -6917,13 +7725,18 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
LN0->getChain(),
LN0->getBasePtr(), N0.getValueType(),
LN0->getMemOperand());
- CombineTo(N, ExtLoad);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
N0.getValueType(), ExtLoad);
- CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
ISD::ANY_EXTEND);
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ // If the load value is used only by N, replace it via CombineTo N.
+ bool NoReplaceTrunc = N0.hasOneUse();
+ CombineTo(N, ExtLoad);
+ if (NoReplaceTrunc)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
+ else
+ CombineTo(LN0, Trunc, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
@@ -6991,9 +7804,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitAssertZext(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT EVT = cast<VTSDNode>(N1)->getVT();
+
+ // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt)
+ if (N0.getOpcode() == ISD::AssertZext &&
+ EVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
+ return N0;
+
+ return SDValue();
+}
+
/// See if the specified operand can be simplified with the knowledge that only
/// the bits specified by Mask are used. If so, return the simpler operand,
/// otherwise return a null SDValue.
+///
+/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
+/// simplify nodes with multiple uses more aggressively.)
SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
switch (V.getOpcode()) {
default: break;
@@ -7029,6 +7858,14 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
SimplifyLHS, V.getOperand(1));
}
+ break;
+ case ISD::AND: {
+ // X & -1 -> X (ignoring bits which aren't demanded).
+ ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
+ if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask)
+ return V.getOperand(0);
+ break;
+ }
}
return SDValue();
}
@@ -7169,7 +8006,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
SDValue NewPtr = DAG.getNode(ISD::ADD, DL,
PtrType, LN0->getBasePtr(),
DAG.getConstant(PtrOff, DL, PtrType),
- &Flags);
+ Flags);
AddToWorklist(NewPtr.getNode());
SDValue Load;
@@ -7244,6 +8081,16 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
}
+ // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
+ if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
+ N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+ N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
+ if (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
+ return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
+ }
+
// fold (sext_in_reg (zext x)) -> (sext x)
// iff we are extending the source sign bit.
if (N0.getOpcode() == ISD::ZERO_EXTEND) {
@@ -7254,7 +8101,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
}
// fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
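+ // Only bit EVTBits - 1 (the sign bit of the narrower type) needs to be
+ // known zero, e.g. bit 7 when sign-extending in-reg from i8.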
- if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
+ if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1)))
return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType());
// fold operands of sext_in_reg based on knowledge that the top bits are not
@@ -7439,18 +8286,20 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
- if (const ConstantSDNode *CAmt = isConstOrConstSplat(N0.getOperand(1))) {
- uint64_t Amt = CAmt->getZExtValue();
- unsigned Size = VT.getScalarSizeInBits();
-
- if (Amt < Size) {
- SDLoc SL(N);
- EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Amt = N0.getOperand(1);
+ KnownBits Known;
+ DAG.computeKnownBits(Amt, Known);
+ unsigned Size = VT.getScalarSizeInBits();
+ if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
+ SDLoc SL(N);
+ EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
- return DAG.getNode(ISD::SHL, SL, VT, Trunc,
- DAG.getConstant(Amt, SL, AmtVT));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+ if (AmtVT != Amt.getValueType()) {
+ Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
+ AddToWorklist(Amt.getNode());
}
+ return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
}
}
@@ -7496,6 +8345,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
VT.getSizeInBits())))
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
}
+
// fold (truncate (load x)) -> (smaller load x)
// fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
@@ -7517,6 +8367,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
}
}
}
+
// fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
// where ... are all 'undef'.
if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
@@ -7582,6 +8433,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
+ // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
+ // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
+ // when the carry output of the adde/addcarry is unused.
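+ // e.g. (i32 (trunc (adde i64:X, i64:Y, Carry))) ->
+ // (i32 (adde (trunc X), (trunc Y), Carry)) when value 1 (the carry result)
+ // of the adde has no uses.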
+ if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
+ N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
+ (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) {
+ SDLoc SL(N);
+ auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+ auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
+ auto VTs = DAG.getVTList(VT, N0->getValueType(1));
+ return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
+ }
+
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
return SDValue();
}
@@ -7645,11 +8512,11 @@ static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
switch (N0.getOpcode()) {
case ISD::AND:
FPOpcode = ISD::FABS;
- SignMask = ~APInt::getSignBit(SourceVT.getSizeInBits());
+ SignMask = ~APInt::getSignMask(SourceVT.getSizeInBits());
break;
case ISD::XOR:
FPOpcode = ISD::FNEG;
- SignMask = APInt::getSignBit(SourceVT.getSizeInBits());
+ SignMask = APInt::getSignMask(SourceVT.getSizeInBits());
break;
// TODO: ISD::OR --> ISD::FNABS?
default:
@@ -7672,6 +8539,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
// If the input is a BUILD_VECTOR with all constant elements, fold this now.
// Only do this before legalize, since afterward the target may be depending
// on the bitconvert.
@@ -7757,7 +8627,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
assert(VT.getSizeInBits() == 128);
SDValue SignBit = DAG.getConstant(
- APInt::getSignBit(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
+ APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
SDValue FlipBit;
if (N0.getOpcode() == ISD::FNEG) {
FlipBit = SignBit;
@@ -7777,7 +8647,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
AddToWorklist(FlipBits.getNode());
return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
}
- APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+ APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
if (N0.getOpcode() == ISD::FNEG)
return DAG.getNode(ISD::XOR, DL, VT,
NewConv, DAG.getConstant(SignBit, DL, VT));
@@ -7825,7 +8695,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
}
if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
- APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2);
+ APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
AddToWorklist(Cst.getNode());
SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
@@ -7846,7 +8716,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
AddToWorklist(FlipBits.getNode());
return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
}
- APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+ APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
X = DAG.getNode(ISD::AND, SDLoc(X), VT,
X, DAG.getConstant(SignBit, SDLoc(X), VT));
AddToWorklist(X.getNode());
@@ -8029,7 +8899,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
APInt ThisVal = OpVal.trunc(DstBitSize);
Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
- OpVal = OpVal.lshr(DstBitSize);
+ OpVal.lshrInPlace(DstBitSize);
}
// For big endian targets, swap the order of the pieces of each element.
@@ -8040,6 +8910,11 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
return DAG.getBuildVector(VT, DL, Ops);
}
+static bool isContractable(SDNode *N) {
+ SDNodeFlags F = N->getFlags();
+ return F.hasAllowContract() || F.hasUnsafeAlgebra();
+}
+
/// Try to perform FMA combining on a given FADD node.
SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N0 = N->getOperand(0);
@@ -8048,24 +8923,27 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- bool AllowFusion =
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
- AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ TLI.isFMAFasterThanFMulAndFAdd(VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
+ bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath || HasFMAD);
+ // If the addition is not contractable, do not combine.
+ if (!AllowFusionGlobally && !isContractable(N))
+ return SDValue();
+
const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
- ;
- if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
@@ -8073,35 +8951,39 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
bool LookThroughFPExt = TLI.isFPExtFree(VT);
+ // Is the node an FMUL that is contractable, either due to global flags or
+ // its own SDNodeFlags?
+ auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
+ if (N.getOpcode() != ISD::FMUL)
+ return false;
+ return AllowFusionGlobally || isContractable(N.getNode());
+ };
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && N0.getOpcode() == ISD::FMUL &&
- N1.getOpcode() == ISD::FMUL) {
+ if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
if (N0.getNode()->use_size() > N1.getNode()->use_size())
std::swap(N0, N1);
}
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
- if (N0.getOpcode() == ISD::FMUL &&
- (Aggressive || N0->hasOneUse())) {
+ if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1), N1);
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
- if (N1.getOpcode() == ISD::FMUL &&
- (Aggressive || N1->hasOneUse())) {
+ if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N1.getOperand(0), N1.getOperand(1), N0);
}
// Look through FP_EXTEND nodes to do more combining.
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N00))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
@@ -8113,7 +8995,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// Note: Commutes FADD operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N10))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N10.getOperand(0)),
@@ -8154,7 +9036,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
N0));
}
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
auto FoldFAddFMAFPExtFMul = [&] (
@@ -8169,7 +9051,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
- if (N020.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N020))
return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
N020.getOperand(0), N020.getOperand(1),
N1);
@@ -8195,7 +9077,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
- if (N002.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N002))
return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
N002.getOperand(0), N002.getOperand(1),
N1);
@@ -8208,7 +9090,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N12 = N1.getOperand(2);
if (N12.getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N12.getOperand(0);
- if (N120.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N120))
return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
N120.getOperand(0), N120.getOperand(1),
N0);
@@ -8224,7 +9106,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDValue N10 = N1.getOperand(0);
if (N10.getOpcode() == PreferredFusedOpcode) {
SDValue N102 = N10.getOperand(2);
- if (N102.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N102))
return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
N102.getOperand(0), N102.getOperand(1),
N0);
@@ -8244,23 +9126,26 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDLoc SL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- bool AllowFusion =
- (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
-
// Floating-point multiply-add with intermediate rounding.
bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
- AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ TLI.isFMAFasterThanFMulAndFAdd(VT) &&
(!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
return SDValue();
+ bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath || HasFMAD);
+ // If the subtraction is not contractable, do not combine.
+ if (!AllowFusionGlobally && !isContractable(N))
+ return SDValue();
+
const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
- if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
@@ -8268,9 +9153,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
bool LookThroughFPExt = TLI.isFPExtFree(VT);
+  // Returns true if the node is an FMUL that is contractable, either due to
+  // global flags or its own SDNodeFlags.
+ auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
+ if (N.getOpcode() != ISD::FMUL)
+ return false;
+ return AllowFusionGlobally || isContractable(N.getNode());
+ };
+
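+  // Standalone illustration (not part of this patch) of why contraction must
+  // be opted into, either globally (fast FP-op fusion, UnsafeFPMath, or a
+  // target FMAD) or per node via the contract flag: an fma rounds once,
+  // while mul+add rounds twice, so the results can differ.
+  //
+  //   #include <cassert>
+  //   #include <cmath>
+  //   int main() {
+  //     double x = 1.0 + 0x1p-27, y = 1.0 - 0x1p-27, z = -1.0;
+  //     volatile double p = x * y; // 1 - 2^-54 rounds (to even) up to 1.0
+  //     assert(p + z == 0.0);                  // two roundings: exactly 0
+  //     assert(std::fma(x, y, z) == -0x1p-54); // one rounding: exact result
+  //   }
+  //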
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
- if (N0.getOpcode() == ISD::FMUL &&
- (Aggressive || N0->hasOneUse())) {
+ if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(ISD::FNEG, SL, VT, N1));
@@ -8278,16 +9170,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
- if (N1.getOpcode() == ISD::FMUL &&
- (Aggressive || N1->hasOneUse()))
+ if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse()))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
N1.getOperand(0)),
N1.getOperand(1), N0);
// fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
- if (N0.getOpcode() == ISD::FNEG &&
- N0.getOperand(0).getOpcode() == ISD::FMUL &&
+ if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
(Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
SDValue N00 = N0.getOperand(0).getOperand(0);
SDValue N01 = N0.getOperand(0).getOperand(1);
@@ -8297,12 +9187,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
}
// Look through FP_EXTEND nodes to do more combining.
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fsub (fpext (fmul x, y)), z)
// -> (fma (fpext x), (fpext y), (fneg z))
if (N0.getOpcode() == ISD::FP_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N00))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
@@ -8316,7 +9206,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// Note: Commutes FSUB operands.
if (N1.getOpcode() == ISD::FP_EXTEND) {
SDValue N10 = N1.getOperand(0);
- if (N10.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N10))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8336,7 +9226,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FNEG) {
SDValue N000 = N00.getOperand(0);
- if (N000.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N000)) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8358,7 +9248,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == ISD::FP_EXTEND) {
SDValue N000 = N00.getOperand(0);
- if (N000.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N000)) {
return DAG.getNode(ISD::FNEG, SL, VT,
DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8378,10 +9268,9 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// -> (fma x, y (fma u, v, (fneg z)))
// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
// are currently only supported on binary nodes.
- if (Options.UnsafeFPMath &&
- N0.getOpcode() == PreferredFusedOpcode &&
- N0.getOperand(2).getOpcode() == ISD::FMUL &&
- N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
+ if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode &&
+ isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
+ N0.getOperand(2)->hasOneUse()) {
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8395,9 +9284,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// -> (fma (fneg y), z, (fma (fneg u), v, x))
// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
// are currently only supported on binary nodes.
- if (Options.UnsafeFPMath &&
- N1.getOpcode() == PreferredFusedOpcode &&
- N1.getOperand(2).getOpcode() == ISD::FMUL) {
+ if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode &&
+ isContractableFMUL(N1.getOperand(2))) {
SDValue N20 = N1.getOperand(2).getOperand(0);
SDValue N21 = N1.getOperand(2).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8410,14 +9298,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
N21, N0));
}
- if (AllowFusion && LookThroughFPExt) {
+ if (LookThroughFPExt) {
// fold (fsub (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
if (N0.getOpcode() == PreferredFusedOpcode) {
SDValue N02 = N0.getOperand(2);
if (N02.getOpcode() == ISD::FP_EXTEND) {
SDValue N020 = N02.getOperand(0);
- if (N020.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N020))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
N0.getOperand(0), N0.getOperand(1),
DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8440,7 +9328,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N00 = N0.getOperand(0);
if (N00.getOpcode() == PreferredFusedOpcode) {
SDValue N002 = N00.getOperand(2);
- if (N002.getOpcode() == ISD::FMUL)
+ if (isContractableFMUL(N002))
return DAG.getNode(PreferredFusedOpcode, SL, VT,
DAG.getNode(ISD::FP_EXTEND, SL, VT,
N00.getOperand(0)),
@@ -8461,7 +9349,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (N1.getOpcode() == PreferredFusedOpcode &&
N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
SDValue N120 = N1.getOperand(2).getOperand(0);
- if (N120.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N120)) {
SDValue N1200 = N120.getOperand(0);
SDValue N1201 = N120.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8488,7 +9376,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
SDValue N100 = N1.getOperand(0).getOperand(0);
SDValue N101 = N1.getOperand(0).getOperand(1);
SDValue N102 = N1.getOperand(0).getOperand(2);
- if (N102.getOpcode() == ISD::FMUL) {
+ if (isContractableFMUL(N102)) {
SDValue N1020 = N102.getOperand(0);
SDValue N1021 = N102.getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8601,6 +9489,14 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
return SDValue();
}
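+// Returns true if N is an FMUL whose operand 1 is a constant (or constant
+// splat) with the exact value -2.0. Only operand 1 is inspected; this relies
+// on the combiner canonicalizing constant FMUL operands to the RHS.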
+static bool isFMulNegTwo(SDValue &N) {
+ if (N.getOpcode() != ISD::FMUL)
+ return false;
+ if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1)))
+ return CFP->isExactlyValue(-2.0);
+ return false;
+}
+
SDValue DAGCombiner::visitFADD(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -8609,7 +9505,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+ const SDNodeFlags Flags = N->getFlags();
// fold vector ops
if (VT.isVector())
@@ -8624,6 +9520,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
if (N0CFP && !N1CFP)
return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (fadd A, (fneg B)) -> (fsub A, B)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
@@ -8636,8 +9535,18 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
return DAG.getNode(ISD::FSUB, DL, VT, N1,
GetNegatedExpression(N0, DAG, LegalOperations), Flags);
+ // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B))
+ // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B))
+ if ((isFMulNegTwo(N0) && N0.hasOneUse()) ||
+ (isFMulNegTwo(N1) && N1.hasOneUse())) {
+ bool N1IsFMul = isFMulNegTwo(N1);
+ SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0);
+ SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags);
+ return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags);
+ }
+
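+  // A standalone check of the identity (not part of this patch): both sides
+  // are exact for all inputs, so no fast-math flags are required here.
+  //
+  //   #include <cassert>
+  //   int main() {
+  //     double a = 3.5, b = 1.25;
+  //     // b + b and b * -2.0 are both exact, so each side rounds once.
+  //     assert(a + b * -2.0 == a - (b + b));
+  //   }
+  //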
// FIXME: Auto-upgrade the target/function-level option.
- if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) {
+ if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
// fold (fadd A, 0) -> A
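    // (nsz is required: if A is -0.0, A + 0.0 evaluates to +0.0, not A.)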
if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
if (N1C->isZero())
@@ -8760,7 +9669,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+ const SDNodeFlags Flags = N->getFlags();
// fold vector ops
if (VT.isVector())
@@ -8771,13 +9680,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
// fold (fsub A, (fneg B)) -> (fadd A, B)
if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
return DAG.getNode(ISD::FADD, DL, VT, N0,
GetNegatedExpression(N1, DAG, LegalOperations), Flags);
// FIXME: Auto-upgrade the target/function-level option.
- if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) {
+ if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros()) {
// (fsub 0, B) -> -B
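      // (nsz is required: if B is +0.0, 0.0 - B is +0.0 while -B is -0.0.)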
if (N0CFP && N0CFP->isZero()) {
if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
@@ -8828,7 +9740,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+ const SDNodeFlags Flags = N->getFlags();
// fold vector ops
if (VT.isVector()) {
@@ -8850,6 +9762,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
if (N1CFP && N1CFP->isExactlyValue(1.0))
return N0;
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
if (Options.UnsafeFPMath) {
// fold (fmul A, 0) -> 0
if (N1CFP && N1CFP->isZero())
@@ -8914,6 +9829,52 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
}
}
+ // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
+ // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
+ if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
+ (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
+ TLI.isOperationLegal(ISD::FABS, VT)) {
+ SDValue Select = N0, X = N1;
+ if (Select.getOpcode() != ISD::SELECT)
+ std::swap(Select, X);
+
+ SDValue Cond = Select.getOperand(0);
+ auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
+ auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));
+
+ if (TrueOpnd && FalseOpnd &&
+ Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
+ isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
+ cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ switch (CC) {
+ default: break;
+ case ISD::SETOLT:
+ case ISD::SETULT:
+ case ISD::SETOLE:
+ case ISD::SETULE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ std::swap(TrueOpnd, FalseOpnd);
+ // Fall through
+ case ISD::SETOGT:
+ case ISD::SETUGT:
+ case ISD::SETOGE:
+ case ISD::SETUGE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
+ TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, DL, VT,
+ DAG.getNode(ISD::FABS, DL, VT, X));
+ if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
+ return DAG.getNode(ISD::FABS, DL, VT, X);
+
+ break;
+ }
+ }
+ }
+
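+  // A standalone spot-check of the identity (not part of this patch). Note
+  // that operator== cannot distinguish -0.0 from +0.0, which is one reason
+  // the fold itself additionally requires the nnan/nsz flags:
+  //
+  //   #include <cassert>
+  //   #include <cmath>
+  //   int main() {
+  //     double xs[] = {-2.5, -0.0, 3.0};
+  //     for (double x : xs)
+  //       assert(x * (x > 0 ? -1.0 : 1.0) == -std::fabs(x));
+  //   }
+  //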
// FMUL -> FMA combines:
if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
AddToWorklist(Fused.getNode());
@@ -8969,7 +9930,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
- &Flags), &Flags);
+ Flags), Flags);
}
// (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
@@ -8979,7 +9940,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
return DAG.getNode(ISD::FMA, DL, VT,
N0.getOperand(0),
DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
- &Flags),
+ Flags),
N2);
}
}
@@ -9005,16 +9966,16 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
if (N1CFP && N0 == N2) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1,
- DAG.getConstantFP(1.0, DL, VT), &Flags),
- &Flags);
+ DAG.getConstantFP(1.0, DL, VT), Flags),
+ Flags);
}
// (fma x, c, (fneg x)) -> (fmul x, (c-1))
if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getNode(ISD::FADD, DL, VT, N1,
- DAG.getConstantFP(-1.0, DL, VT), &Flags),
- &Flags);
+ DAG.getConstantFP(-1.0, DL, VT), Flags),
+ Flags);
}
}
@@ -9030,8 +9991,8 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
- const SDNodeFlags *Flags = N->getFlags();
- if (!UnsafeMath && !Flags->hasAllowReciprocal())
+ const SDNodeFlags Flags = N->getFlags();
+ if (!UnsafeMath && !Flags.hasAllowReciprocal())
return SDValue();
// Skip if current node is a reciprocal.
@@ -9054,7 +10015,7 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
// This division is eligible for optimization only if global unsafe math
// is enabled or if this division allows reciprocal formation.
- if (UnsafeMath || U->getFlags()->hasAllowReciprocal())
+ if (UnsafeMath || U->getFlags().hasAllowReciprocal())
Users.insert(U);
}
}
@@ -9093,7 +10054,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
const TargetOptions &Options = DAG.getTarget().Options;
- SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+ SDNodeFlags Flags = N->getFlags();
// fold vector ops
if (VT.isVector())
@@ -9104,6 +10065,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
+
if (Options.UnsafeFPMath) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (N1CFP) {
@@ -9204,8 +10168,10 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
// fold (frem c1, c2) -> fmod(c1,c2)
if (N0CFP && N1CFP)
- return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1,
- &cast<BinaryWithFlagsSDNode>(N)->Flags);
+ return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags());
+
+ if (SDValue NewSel = foldBinOpIntoSelect(N))
+ return NewSel;
return SDValue();
}
@@ -9222,7 +10188,7 @@ SDValue DAGCombiner::visitFSQRT(SDNode *N) {
// For now, create a Flags object for use with all unsafe math transforms.
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
- return buildSqrtEstimate(N0, &Flags);
+ return buildSqrtEstimate(N0, Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
@@ -9497,6 +10463,9 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
Tmp, N0.getOperand(1));
}
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
return SDValue();
}
@@ -9563,6 +10532,9 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
+ if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
+ return NewVSel;
+
return SDValue();
}
@@ -9624,11 +10596,11 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
if (N0.getValueType().isVector()) {
// For a vector, get a mask such as 0x80... per scalar element
// and splat it.
- SignMask = APInt::getSignBit(N0.getScalarValueSizeInBits());
+ SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
} else {
// For a scalar, just generate 0x80...
- SignMask = APInt::getSignBit(IntVT.getSizeInBits());
+ SignMask = APInt::getSignMask(IntVT.getSizeInBits());
}
SDLoc DL0(N0);
Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
@@ -9648,10 +10620,10 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
if (Level >= AfterLegalizeDAG &&
(TLI.isFPImmLegal(CVal, VT) ||
TLI.isOperationLegal(ISD::ConstantFP, VT)))
- return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
- DAG.getNode(ISD::FNEG, SDLoc(N), VT,
- N0.getOperand(1)),
- &cast<BinaryWithFlagsSDNode>(N0)->Flags);
+ return DAG.getNode(
+ ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
+ DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)),
+ N0->getFlags());
}
}
@@ -9729,11 +10701,11 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
if (N0.getValueType().isVector()) {
// For a vector, get a mask such as 0x7f... per scalar element
// and splat it.
- SignMask = ~APInt::getSignBit(N0.getScalarValueSizeInBits());
+ SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits());
SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
} else {
// For a scalar, just generate 0x7f...
- SignMask = ~APInt::getSignBit(IntVT.getSizeInBits());
+ SignMask = ~APInt::getSignMask(IntVT.getSizeInBits());
}
SDLoc DL(N0);
Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
@@ -10149,7 +11121,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
// x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
//
// where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
- // indexed load/store and the expresion that needs to be re-written.
+ // indexed load/store and the expression that needs to be re-written.
//
// Therefore, we have:
// t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1
@@ -10361,7 +11333,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
dbgs() << "\n");
WorklistRemover DeadNodes(*this);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
-
+ AddUsersToWorklist(Chain.getNode());
if (N->use_empty())
deleteAndRecombine(N);
@@ -10414,7 +11386,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
if (PrevST->getBasePtr() == Ptr &&
PrevST->getValue().getValueType() == N->getValueType(0))
- return CombineTo(N, Chain.getOperand(1), Chain);
+ return CombineTo(N, PrevST->getOperand(1), Chain);
}
}
@@ -10432,14 +11404,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
}
}
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
-#ifndef NDEBUG
- if (CombinerAAOnlyFunc.getNumOccurrences() &&
- CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
- UseAA = false;
-#endif
- if (UseAA && LD->isUnindexed()) {
+ if (LD->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes.
SDValue BetterChain = FindBetterChain(N, Chain);
@@ -10462,12 +11427,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
MVT::Other, Chain, ReplLoad.getValue(1));
- // Make sure the new and old chains are cleaned up.
- AddToWorklist(Token.getNode());
-
- // Replace uses with load result and token factor. Don't add users
- // to work list.
- return CombineTo(N, ReplLoad.getValue(0), Token, false);
+ // Replace uses with load result and token factor
+ return CombineTo(N, ReplLoad.getValue(0), Token);
}
}
@@ -10490,7 +11451,7 @@ namespace {
/// Shift = srl Ty1 Origin, CstTy Amount
/// Inst = trunc Shift to Ty2
///
-/// Then, it will be rewriten into:
+/// Then, it will be rewritten into:
/// Slice = load SliceTy, Base + SliceOffset
/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
///
@@ -10959,7 +11920,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
// Check if this is a trunc(lshr).
if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
isa<ConstantSDNode>(User->getOperand(1))) {
- Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
+ Shift = User->getConstantOperandVal(1);
User = *User->use_begin();
}
@@ -11021,6 +11982,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
ArgChains);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+ AddToWorklist(Chain.getNode());
return true;
}
@@ -11414,44 +12376,36 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
return false;
}
-SDValue DAGCombiner::getMergedConstantVectorStore(
- SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores,
- SmallVectorImpl<SDValue> &Chains, EVT Ty) const {
- SmallVector<SDValue, 8> BuildVector;
+SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumStores) {
+ SmallVector<SDValue, 8> Chains;
+ SmallPtrSet<const SDNode *, 8> Visited;
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ for (unsigned i = 0; i < NumStores; ++i) {
+ Visited.insert(StoreNodes[i].MemNode);
+ }
- for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
- StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode);
- Chains.push_back(St->getChain());
- BuildVector.push_back(St->getValue());
+  // Don't include a candidate store's chain if that chain is itself one of
+  // the candidate stores; its chain will be collected when that store is
+  // visited.
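+  // For example, with candidates {St1, St2} where St2's chain is St1, only
+  // St1's chain feeds the TokenFactor; including St1 itself would make the
+  // merged store a user of a node it is about to replace.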
+ for (unsigned i = 0; i < NumStores; ++i) {
+ if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0)
+ Chains.push_back(StoreNodes[i].MemNode->getChain());
}
- return DAG.getBuildVector(Ty, SL, BuildVector);
+  assert(Chains.size() > 0 && "Candidate stores must have at least one chain");
+ return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
}
bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
- SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
- unsigned NumStores, bool IsConstantSrc, bool UseVector) {
+ SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
+ bool IsConstantSrc, bool UseVector, bool UseTrunc) {
// Make sure we have something to merge.
if (NumStores < 2)
return false;
int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned LatestNodeUsed = 0;
-
- for (unsigned i=0; i < NumStores; ++i) {
- // Find a chain for the new wide-store operand. Notice that some
- // of the store nodes that we found may not be selected for inclusion
- // in the wide store. The chain we use needs to be the chain of the
- // latest store node which is *used* and replaced by the wide store.
- if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
- LatestNodeUsed = i;
- }
-
- SmallVector<SDValue, 8> Chains;
// The latest Node in the DAG.
- LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
SDLoc DL(StoreNodes[0].MemNode);
SDValue StoredVal;
@@ -11467,7 +12421,18 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
if (IsConstantSrc) {
- StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty);
+ SmallVector<SDValue, 8> BuildVector;
+ for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
+ SDValue Val = St->getValue();
+ if (MemVT.getScalarType().isInteger())
+ if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue()))
+ Val = DAG.getConstant(
+ (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
+ SDLoc(CFP), MemVT);
+ BuildVector.push_back(Val);
+ }
+ StoredVal = DAG.getBuildVector(Ty, DL, BuildVector);
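+      // The (uint32_t) cast above appears to assume the only FP constants
+      // reaching an integer MemVT here are 32-bit floats being bitcast to
+      // i32.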
} else {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumStores; ++i) {
@@ -11477,7 +12442,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
if (Val.getValueType() != MemVT)
return false;
Ops.push_back(Val);
- Chains.push_back(St->getChain());
}
// Build the extracted vector elements back into a vector.
@@ -11497,14 +12461,13 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
for (unsigned i = 0; i < NumStores; ++i) {
unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
- Chains.push_back(St->getChain());
SDValue Val = St->getValue();
StoreInt <<= ElementSizeBytes * 8;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
- StoreInt |= C->getAPIntValue().zext(SizeInBits);
+ StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits);
} else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
- StoreInt |= C->getValueAPF().bitcastToAPInt().zext(SizeInBits);
+ StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
} else {
llvm_unreachable("Invalid constant element type");
}
@@ -11515,194 +12478,181 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
}
- assert(!Chains.empty());
-
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
- SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,
- FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(),
- FirstInChain->getAlignment());
-
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
- if (UseAA) {
- // Replace all merged stores with the new store.
- for (unsigned i = 0; i < NumStores; ++i)
- CombineTo(StoreNodes[i].MemNode, NewStore);
- } else {
- // Replace the last store with the new store.
- CombineTo(LatestOp, NewStore);
- // Erase all other stores.
- for (unsigned i = 0; i < NumStores; ++i) {
- if (StoreNodes[i].MemNode == LatestOp)
- continue;
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- // ReplaceAllUsesWith will replace all uses that existed when it was
- // called, but graph optimizations may cause new ones to appear. For
- // example, the case in pr14333 looks like
- //
- // St's chain -> St -> another store -> X
- //
- // And the only difference from St to the other store is the chain.
- // When we change it's chain to be St's chain they become identical,
- // get CSEed and the net result is that X is now a use of St.
- // Since we know that St is redundant, just iterate.
- while (!St->use_empty())
- DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
- deleteAndRecombine(St);
- }
- }
-
- StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end());
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);
+
+  // Make sure we use a truncating store if that is necessary for legality.
+ SDValue NewStore;
+ if (UseVector || !UseTrunc) {
+ NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(),
+ FirstInChain->getAlignment());
+ } else { // Must be realized as a trunc store
+ EVT LegalizedStoredValueTy =
+ TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
+ unsigned LegalizedStoreSize = LegalizedStoredValueTy.getSizeInBits();
+ ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
+ SDValue ExtendedStoreVal =
+ DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
+ LegalizedStoredValueTy);
+ NewStore = DAG.getTruncStore(
+ NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
+ FirstInChain->getAlignment(),
+ FirstInChain->getMemOperand()->getFlags());
+ }
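+  // E.g., merging two i8 constant stores on a target that promotes i16 to
+  // i32 takes this path: the combined constant is materialized as an i32
+  // and stored with a truncating store whose memory type is i16.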
+
+ // Replace all merged stores with the new store.
+ for (unsigned i = 0; i < NumStores; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+
+ AddToWorklist(NewChain.getNode());
return true;
}
-void DAGCombiner::getStoreMergeAndAliasCandidates(
- StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
- SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) {
+void DAGCombiner::getStoreMergeCandidates(
+ StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+ EVT MemVT = St->getMemoryVT();
// We must have a base and an offset.
- if (!BasePtr.Base.getNode())
+ if (!BasePtr.getBase().getNode())
return;
// Do not handle stores to undef base pointers.
- if (BasePtr.Base.isUndef())
+ if (BasePtr.getBase().isUndef())
return;
- // Walk up the chain and look for nodes with offsets from the same
- // base pointer. Stop when reaching an instruction with a different kind
- // or instruction which has a different base pointer.
- EVT MemVT = St->getMemoryVT();
- unsigned Seq = 0;
- StoreSDNode *Index = St;
-
-
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
-
- if (UseAA) {
- // Look at other users of the same chain. Stores on the same chain do not
- // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized
- // to be on the same chain, so don't bother looking at adjacent chains.
-
- SDValue Chain = St->getChain();
- for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) {
- if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
- if (I.getOperandNo() != 0)
- continue;
-
- if (OtherST->isVolatile() || OtherST->isIndexed())
- continue;
-
- if (OtherST->getMemoryVT() != MemVT)
- continue;
-
- BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
-
- if (Ptr.equalBaseIndex(BasePtr))
- StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++));
- }
+ bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
+ isa<ConstantFPSDNode>(St->getValue());
+ bool IsExtractVecSrc =
+ (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
+ bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
+ BaseIndexOffset LBasePtr;
+ // Match on loadbaseptr if relevant.
+ if (IsLoadSrc)
+ LBasePtr = BaseIndexOffset::match(
+ cast<LoadSDNode>(St->getValue())->getBasePtr(), DAG);
+
+ auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
+ int64_t &Offset) -> bool {
+ if (Other->isVolatile() || Other->isIndexed())
+ return false;
+ // We can merge constant floats to equivalent integers
+ if (Other->getMemoryVT() != MemVT)
+ if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
+ isa<ConstantFPSDNode>(Other->getValue())))
+ return false;
+ if (IsLoadSrc) {
+ // The Load's Base Ptr must also match
+ if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Other->getValue())) {
+ auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
+ if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
+ return false;
+ } else
+ return false;
}
-
- return;
- }
-
- while (Index) {
- // If the chain has more than one use, then we can't reorder the mem ops.
- if (Index != St && !SDValue(Index, 0)->hasOneUse())
- break;
-
- // Find the base pointer and offset for this memory node.
- BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
-
- // Check that the base pointer is the same as the original one.
- if (!Ptr.equalBaseIndex(BasePtr))
- break;
-
- // The memory operands must not be volatile.
- if (Index->isVolatile() || Index->isIndexed())
- break;
-
- // No truncation.
- if (Index->isTruncatingStore())
- break;
-
- // The stored memory type must be the same.
- if (Index->getMemoryVT() != MemVT)
- break;
-
- // We do not allow under-aligned stores in order to prevent
- // overriding stores. NOTE: this is a bad hack. Alignment SHOULD
- // be irrelevant here; what MATTERS is that we not move memory
- // operations that potentially overlap past each-other.
- if (Index->getAlignment() < MemVT.getStoreSize())
- break;
-
- // We found a potential memory operand to merge.
- StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
-
- // Find the next memory operand in the chain. If the next operand in the
- // chain is a store then move up and continue the scan with the next
- // memory operand. If the next operand is a load save it and use alias
- // information to check if it interferes with anything.
- SDNode *NextInChain = Index->getChain().getNode();
- while (1) {
- if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
- // We found a store node. Use it for the next iteration.
- Index = STn;
- break;
- } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
- if (Ldn->isVolatile()) {
- Index = nullptr;
- break;
+ if (IsConstantSrc)
+ if (!(isa<ConstantSDNode>(Other->getValue()) ||
+ isa<ConstantFPSDNode>(Other->getValue())))
+ return false;
+ if (IsExtractVecSrc)
+ if (!(Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR))
+ return false;
+ Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
+ return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
+ };
+  // We are looking for a root node that is an ancestor of all mergeable
+  // stores. We search up through a load, to our root, and then down
+  // through all children. For instance, we will find Store{1,2,3} if
+  // St is Store1, Store2, or Store3, provided the root is not itself a
+  // load, which is always the case for nonvolatile ops. TODO: Expand the
+  // search to find all valid candidates through multiple layers of loads.
+ //
+ // Root
+ // |-------|-------|
+ // Load Load Store3
+ // | |
+ // Store1 Store2
+ //
+ // FIXME: We should be able to climb and
+ // descend TokenFactors to find candidates as well.
+
+ SDNode *RootNode = (St->getChain()).getNode();
+
+ if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
+ RootNode = Ldn->getChain().getNode();
+ for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+ if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
+ for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
+ if (I2.getOperandNo() == 0)
+ if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
+ BaseIndexOffset Ptr;
+ int64_t PtrDiff;
+ if (CandidateMatch(OtherST, Ptr, PtrDiff))
+ StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
+ }
+ } else
+ for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+ if (I.getOperandNo() == 0)
+ if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
+ BaseIndexOffset Ptr;
+ int64_t PtrDiff;
+ if (CandidateMatch(OtherST, Ptr, PtrDiff))
+ StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
}
-
- // Save the load node for later. Continue the scan.
- AliasLoadNodes.push_back(Ldn);
- NextInChain = Ldn->getChain().getNode();
- continue;
- } else {
- Index = nullptr;
- break;
- }
- }
- }
}
-// We need to check that merging these stores does not cause a loop
-// in the DAG. Any store candidate may depend on another candidate
+// We need to check that merging these stores does not cause a loop in
+// the DAG. Any store candidate may depend on another candidate
// indirectly through its operand (we already consider dependencies
// through the chain). Check in parallel by searching up from
// non-chain operands of candidates.
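// For example: if St2 stores a value loaded from the address St1 writes,
// then St1 is a (non-chain) predecessor of St2's value operand, and a single
// merged store would be a predecessor of its own stored value:
//
//   St1: store x       -> p[0]
//   v:   load p[0]          ; chained after St1
//   St2: store (v + 1) -> p[1]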
+
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
- SmallVectorImpl<MemOpLink> &StoreNodes) {
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {
+
+  // FIXME: We should be able to truncate a full search of predecessors by
+  // doing a BFS and keeping tabs on the originating stores from which the
+  // worklist nodes come, in a similar way to TokenFactor simplification.
+
SmallPtrSet<const SDNode *, 16> Visited;
SmallVector<const SDNode *, 8> Worklist;
- // search ops of store candidates
- for (unsigned i = 0; i < StoreNodes.size(); ++i) {
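+  // Cap the predecessor walk below; if it would have to visit more nodes
+  // than this, fail the merge conservatively rather than keep searching.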
+  unsigned Max = 8192;
+ // Search Ops of store candidates.
+ for (unsigned i = 0; i < NumStores; ++i) {
SDNode *n = StoreNodes[i].MemNode;
// Potential loops may happen only through non-chain operands
for (unsigned j = 1; j < n->getNumOperands(); ++j)
Worklist.push_back(n->getOperand(j).getNode());
}
- // search through DAG. We can stop early if we find a storenode
- for (unsigned i = 0; i < StoreNodes.size(); ++i) {
- if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist))
+ // Search through DAG. We can stop early if we find a store node.
+ for (unsigned i = 0; i < NumStores; ++i) {
+ if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
+ Max))
+ return false;
+ // Check if we ended early, failing conservatively if so.
+ if (Visited.size() >= Max)
return false;
}
return true;
}
-bool DAGCombiner::MergeConsecutiveStores(
- StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) {
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
if (OptLevel == CodeGenOpt::None)
return false;
EVT MemVT = St->getMemoryVT();
int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+
+ if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
+ return false;
+
bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
Attribute::NoImplicitFloat);
@@ -11731,376 +12681,437 @@ bool DAGCombiner::MergeConsecutiveStores(
if (MemVT.isVector() && IsLoadSrc)
return false;
- // Only look at ends of store sequences.
- SDValue Chain = SDValue(St, 0);
- if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
- return false;
-
- // Save the LoadSDNodes that we find in the chain.
- // We need to make sure that these nodes do not interfere with
- // any of the store nodes.
- SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
-
- getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
+ SmallVector<MemOpLink, 8> StoreNodes;
+ // Find potential store merge candidates by searching through chain sub-DAG
+ getStoreMergeCandidates(St, StoreNodes);
// Check if there is anything to merge.
if (StoreNodes.size() < 2)
return false;
- // only do dependence check in AA case
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
- if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes))
- return false;
-
// Sort the memory operands according to their distance from the
- // base pointer. As a secondary criteria: make sure stores coming
- // later in the code come first in the list. This is important for
- // the non-UseAA case, because we're merging stores into the FINAL
- // store along a chain which potentially contains aliasing stores.
- // Thus, if there are multiple stores to the same address, the last
- // one can be considered for merging but not the others.
+ // base pointer.
std::sort(StoreNodes.begin(), StoreNodes.end(),
[](MemOpLink LHS, MemOpLink RHS) {
- return LHS.OffsetFromBase < RHS.OffsetFromBase ||
- (LHS.OffsetFromBase == RHS.OffsetFromBase &&
- LHS.SequenceNum < RHS.SequenceNum);
- });
-
- // Scan the memory operations on the chain and find the first non-consecutive
- // store memory address.
- unsigned LastConsecutiveStore = 0;
- int64_t StartAddress = StoreNodes[0].OffsetFromBase;
- for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) {
-
+ return LHS.OffsetFromBase < RHS.OffsetFromBase;
+ });
+
+  // Store merging attempts to merge the lowest-addressed stores first. This
+  // generally works out, since any remaining stores are checked again after
+  // the first collection of stores has been merged. However, if a
+  // non-mergeable store is found first, e.g., {p[-2], p[0], p[1], p[2],
+  // p[3]}, we would fail on the first candidate and miss the subsequent
+  // mergeable cases. To prevent this, we prune such stores from the front
+  // of StoreNodes here.
+
+ bool RV = false;
+ while (StoreNodes.size() > 1) {
+ unsigned StartIdx = 0;
+ while ((StartIdx + 1 < StoreNodes.size()) &&
+ StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
+ StoreNodes[StartIdx + 1].OffsetFromBase)
+ ++StartIdx;
+
+ // Bail if we don't have enough candidates to merge.
+ if (StartIdx + 1 >= StoreNodes.size())
+ return RV;
+
+ if (StartIdx)
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
+
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive store memory address.
+ unsigned NumConsecutiveStores = 1;
+ int64_t StartAddress = StoreNodes[0].OffsetFromBase;
// Check that the addresses are consecutive starting from the second
// element in the list of stores.
- if (i > 0) {
+ for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
if (CurrAddress - StartAddress != (ElementSizeBytes * i))
break;
+ NumConsecutiveStores = i + 1;
}
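      // E.g., with 8-byte elements and sorted offsets {0, 8, 16, 32}, the
      // scan stops at offset 32 and leaves NumConsecutiveStores == 3.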
- // Check if this store interferes with any of the loads that we found.
- // If we find a load that alias with this store. Stop the sequence.
- if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) {
- return isAlias(Ldn, StoreNodes[i].MemNode);
- }))
- break;
+ if (NumConsecutiveStores < 2) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumConsecutiveStores);
+ continue;
+ }
- // Mark this node as useful.
- LastConsecutiveStore = i;
- }
+ // Check that we can merge these candidates without causing a cycle
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes,
+ NumConsecutiveStores)) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumConsecutiveStores);
+ continue;
+ }
- // The node with the lowest store address.
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- LLVMContext &Context = *DAG.getContext();
- const DataLayout &DL = DAG.getDataLayout();
-
- // Store the constants into memory as one consecutive store.
- if (IsConstantSrc) {
- unsigned LastLegalType = 0;
- unsigned LastLegalVectorType = 0;
- bool NonZero = false;
- for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- SDValue StoredVal = St->getValue();
-
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
- NonZero |= !C->isNullValue();
- } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) {
- NonZero |= !C->getConstantFPValue()->isNullValue();
- } else {
- // Non-constant.
- break;
- }
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
- // Find a legal type for the constant store.
- unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
- EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- bool IsFast;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFast) && IsFast) {
- LastLegalType = i+1;
- // Or check whether a truncstore is legal.
- } else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValueTy =
- TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
- if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstStoreAS, FirstStoreAlign, &IsFast) &&
+ // Store the constants into memory as one consecutive store.
+ if (IsConstantSrc) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ unsigned LastLegalType = 1;
+ unsigned LastLegalVectorType = 1;
+ bool LastIntegerTrunc = false;
+ bool NonZero = false;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue StoredVal = ST->getValue();
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
+ NonZero |= !C->isNullValue();
+ } else if (ConstantFPSDNode *C =
+ dyn_cast<ConstantFPSDNode>(StoredVal)) {
+ NonZero |= !C->getConstantFPValue()->isNullValue();
+ } else {
+ // Non-constant.
+ break;
+ }
+
+ // Find a legal type for the constant store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ bool IsFast = false;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
IsFast) {
+ LastIntegerTrunc = false;
LastLegalType = i + 1;
+ // Or check whether a truncstore is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValueTy =
+ TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
+ if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstStoreAS, FirstStoreAlign, &IsFast) &&
+ IsFast) {
+ LastIntegerTrunc = true;
+ LastLegalType = i + 1;
+ }
}
- }
- // We only use vectors if the constant is known to be zero or the target
- // allows it and the function is not marked with the noimplicitfloat
- // attribute.
- if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1,
- FirstStoreAS)) &&
- !NoVectors) {
- // Find a legal type for the vector store.
- EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
- if (TLI.isTypeLegal(Ty) &&
- TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
- FirstStoreAlign, &IsFast) && IsFast)
- LastLegalVectorType = i + 1;
+ // We only use vectors if the constant is known to be zero or the target
+ // allows it and the function is not marked with the noimplicitfloat
+ // attribute.
+ if ((!NonZero ||
+ TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
+ !NoVectors) {
+ // Find a legal type for the vector store.
+ unsigned Elts = i + 1;
+ if (MemVT.isVector()) {
+ // When merging vector stores, get the total number of elements.
+ Elts *= MemVT.getVectorNumElements();
+ }
+ EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ if (TLI.isTypeLegal(Ty) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
+ IsFast)
+ LastLegalVectorType = i + 1;
+ }
}
- }
- // Check if we found a legal integer type to store.
- if (LastLegalType == 0 && LastLegalVectorType == 0)
- return false;
-
- bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
- unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
-
- return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
- true, UseVector);
- }
+ // Check if we found a legal integer type that creates a meaningful merge.
+ if (LastLegalType < 2 && LastLegalVectorType < 2) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ continue;
+ }
- // When extracting multiple vector elements, try to store them
- // in one vector store rather than a sequence of scalar stores.
- if (IsExtractVecSrc) {
- unsigned NumStoresToMerge = 0;
- bool IsVec = MemVT.isVector();
- for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- unsigned StoreValOpcode = St->getValue().getOpcode();
- // This restriction could be loosened.
- // Bail out if any stored values are not elements extracted from a vector.
- // It should be possible to handle mixed sources, but load sources need
- // more careful handling (see the block of code below that handles
- // consecutive loads).
- if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
- StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
- return false;
+ bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
+ unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
- // Find a legal type for the vector store.
- unsigned Elts = i + 1;
- if (IsVec) {
- // When merging vector stores, get the total number of elements.
- Elts *= MemVT.getVectorNumElements();
+ bool Merged = MergeStoresOfConstantsOrVecElts(
+ StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
+ if (!Merged) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ continue;
}
- EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
- bool IsFast;
- if (TLI.isTypeLegal(Ty) &&
- TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
- FirstStoreAlign, &IsFast) && IsFast)
- NumStoresToMerge = i + 1;
+ // Remove merged stores for next iteration.
+ RV = true;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ continue;
}
- return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge,
- false, true);
- }
-
- // Below we handle the case of multiple consecutive stores that
- // come from multiple consecutive loads. We merge them into a single
- // wide load and a single wide store.
-
- // Look for load nodes which are used by the stored values.
- SmallVector<MemOpLink, 8> LoadNodes;
+ // When extracting multiple vector elements, try to store them
+ // in one vector store rather than a sequence of scalar stores.
+ if (IsExtractVecSrc) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ unsigned NumStoresToMerge = 1;
+ bool IsVec = MemVT.isVector();
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ unsigned StoreValOpcode = St->getValue().getOpcode();
+ // This restriction could be loosened.
+ // Bail out if any stored values are not elements extracted from a
+ // vector. It should be possible to handle mixed sources, but load
+ // sources need more careful handling (see the block of code below that
+ // handles consecutive loads).
+ if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
+ StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
+ return RV;
- // Find acceptable loads. Loads need to have the same chain (token factor),
- // must not be zext, volatile, indexed, and they must be consecutive.
- BaseIndexOffset LdBasePtr;
- for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
- if (!Ld) break;
+ // Find a legal type for the vector store.
+ unsigned Elts = i + 1;
+ if (IsVec) {
+ // When merging vector stores, get the total number of elements.
+ Elts *= MemVT.getVectorNumElements();
+ }
+ EVT Ty =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ bool IsFast;
+ if (TLI.isTypeLegal(Ty) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+ FirstStoreAlign, &IsFast) &&
+ IsFast)
+ NumStoresToMerge = i + 1;
+ }
- // Loads must only have one use.
- if (!Ld->hasNUsesOfValue(1, 0))
- break;
+ bool Merged = MergeStoresOfConstantsOrVecElts(
+ StoreNodes, MemVT, NumStoresToMerge, false, true, false);
+ if (!Merged) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumStoresToMerge);
+ continue;
+ }
+ // Remove merged stores for next iteration.
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumStoresToMerge);
+ RV = true;
+ continue;
+ }
- // The memory operands must not be volatile.
- if (Ld->isVolatile() || Ld->isIndexed())
- break;
+ // Below we handle the case of multiple consecutive stores that
+ // come from multiple consecutive loads. We merge them into a single
+ // wide load and a single wide store.
- // We do not accept ext loads.
- if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
- break;
+ // Look for load nodes which are used by the stored values.
+ SmallVector<MemOpLink, 8> LoadNodes;
- // The stored memory type must be the same.
- if (Ld->getMemoryVT() != MemVT)
- break;
+ // Find acceptable loads. Loads need to have the same chain (token factor),
+ // must not be zext, volatile, indexed, and they must be consecutive.
+ BaseIndexOffset LdBasePtr;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
+ if (!Ld)
+ break;
- BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
- // If this is not the first ptr that we check.
- if (LdBasePtr.Base.getNode()) {
- // The base ptr must be the same.
- if (!LdPtr.equalBaseIndex(LdBasePtr))
+ // Loads must only have one use.
+ if (!Ld->hasNUsesOfValue(1, 0))
break;
- } else {
- // Check that all other base pointers are the same as this one.
- LdBasePtr = LdPtr;
- }
- // We found a potential memory operand to merge.
- LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0));
- }
+ // The memory operands must not be volatile.
+ if (Ld->isVolatile() || Ld->isIndexed())
+ break;
- if (LoadNodes.size() < 2)
- return false;
+ // We do not accept ext loads.
+ if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
+ break;
- // If we have load/store pair instructions and we only have two values,
- // don't bother.
- unsigned RequiredAlignment;
- if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
- St->getAlignment() >= RequiredAlignment)
- return false;
+ // The stored memory type must be the same.
+ if (Ld->getMemoryVT() != MemVT)
+ break;
- LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
- unsigned FirstLoadAS = FirstLoad->getAddressSpace();
- unsigned FirstLoadAlign = FirstLoad->getAlignment();
-
- // Scan the memory operations on the chain and find the first non-consecutive
- // load memory address. These variables hold the index in the store node
- // array.
- unsigned LastConsecutiveLoad = 0;
- // This variable refers to the size and not index in the array.
- unsigned LastLegalVectorType = 0;
- unsigned LastLegalIntegerType = 0;
- StartAddress = LoadNodes[0].OffsetFromBase;
- SDValue FirstChain = FirstLoad->getChain();
- for (unsigned i = 1; i < LoadNodes.size(); ++i) {
- // All loads must share the same chain.
- if (LoadNodes[i].MemNode->getChain() != FirstChain)
- break;
+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+ // If this is not the first ptr that we check.
+ int64_t LdOffset = 0;
+ if (LdBasePtr.getBase().getNode()) {
+ // The base ptr must be the same.
+ if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
+ break;
+ } else {
+ // Check that all other base pointers are the same as this one.
+ LdBasePtr = LdPtr;
+ }
- int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
- if (CurrAddress - StartAddress != (ElementSizeBytes * i))
- break;
- LastConsecutiveLoad = i;
- // Find a legal type for the vector store.
- EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1);
- bool IsFastSt, IsFastLd;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFastSt) && IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
- FirstLoadAlign, &IsFastLd) && IsFastLd) {
- LastLegalVectorType = i + 1;
- }
-
- // Find a legal type for the integer store.
- unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
- StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
- FirstStoreAlign, &IsFastSt) && IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
- FirstLoadAlign, &IsFastLd) && IsFastLd)
- LastLegalIntegerType = i + 1;
- // Or check whether a truncstore and extload is legal.
- else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValueTy =
- TLI.getTypeToTransformTo(Context, StoreTy);
- if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
- TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
- IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
- FirstLoadAS, FirstLoadAlign, &IsFastLd) &&
- IsFastLd)
- LastLegalIntegerType = i+1;
+ // We found a potential memory operand to merge.
+ LoadNodes.push_back(MemOpLink(Ld, LdOffset));
}
- }
-
- // Only use vector types if the vector type is larger than the integer type.
- // If they are the same, use integers.
- bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
- unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType);
- // We add +1 here because the LastXXX variables refer to location while
- // the NumElem refers to array/index size.
- unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
- NumElem = std::min(LastLegalType, NumElem);
-
- if (NumElem < 2)
- return false;
-
- // Collect the chains from all merged stores.
- SmallVector<SDValue, 8> MergeStoreChains;
- MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
+ if (LoadNodes.size() < 2) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ continue;
+ }
- // The latest Node in the DAG.
- unsigned LatestNodeUsed = 0;
- for (unsigned i=1; i<NumElem; ++i) {
- // Find a chain for the new wide-store operand. Notice that some
- // of the store nodes that we found may not be selected for inclusion
- // in the wide store. The chain we use needs to be the chain of the
- // latest store node which is *used* and replaced by the wide store.
- if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
- LatestNodeUsed = i;
+ // If we have load/store pair instructions and we only have two values,
+ // don't bother merging.
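+ // (Illustrative: targets with paired load/store instructions, in the
+ // spirit of AArch64's ldp/stp, already handle two adjacent accesses
+ // efficiently, so widening them would not help.)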
+ unsigned RequiredAlignment;
+ if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
+ StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
+ continue;
+ }
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+ unsigned FirstLoadAS = FirstLoad->getAddressSpace();
+ unsigned FirstLoadAlign = FirstLoad->getAlignment();
+
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive load memory address. These variables hold the index in
+ // the load node array.
+ unsigned LastConsecutiveLoad = 1;
+ // This variable refers to the size and not index in the array.
+ unsigned LastLegalVectorType = 1;
+ unsigned LastLegalIntegerType = 1;
+ bool isDereferenceable = true;
+ bool DoIntegerTruncate = false;
+ StartAddress = LoadNodes[0].OffsetFromBase;
+ SDValue FirstChain = FirstLoad->getChain();
+ for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+ // All loads must share the same chain.
+ if (LoadNodes[i].MemNode->getChain() != FirstChain)
+ break;
- MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
- }
+ int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ LastConsecutiveLoad = i;
- LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
+ if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
+ isDereferenceable = false;
- // Find if it is better to use vectors or integers to load and store
- // to memory.
- EVT JointMemOpVT;
- if (UseVectorTy) {
- JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
- } else {
- unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
- JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
- }
+ // Find a legal type for the vector store.
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1);
+ bool IsFastSt, IsFastLd;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) &&
+ IsFastLd) {
+ LastLegalVectorType = i + 1;
+ }
- SDLoc LoadDL(LoadNodes[0].MemNode);
- SDLoc StoreDL(StoreNodes[0].MemNode);
+ // Find a legal type for the integer store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) &&
+ IsFastLd) {
+ LastLegalIntegerType = i + 1;
+ DoIntegerTruncate = false;
+ // Or check whether a truncstore and extload is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy);
+ if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy,
+ StoreTy) &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,
+ StoreTy) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) &&
+ IsFastLd) {
+ LastLegalIntegerType = i + 1;
+ DoIntegerTruncate = true;
+ }
+ }
+ }
- // The merged loads are required to have the same incoming chain, so
- // using the first's chain is acceptable.
- SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
- FirstLoad->getBasePtr(),
- FirstLoad->getPointerInfo(), FirstLoadAlign);
+ // Only use vector types if the vector type is larger than the integer type.
+ // If they are the same, use integers.
+ bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
+ unsigned LastLegalType =
+ std::max(LastLegalVectorType, LastLegalIntegerType);
- SDValue NewStoreChain =
- DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
+ // We add +1 here because the LastXXX variables refer to an array index
+ // while NumElem refers to a count of elements.
+ unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
+ NumElem = std::min(LastLegalType, NumElem);
- SDValue NewStore =
- DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), FirstStoreAlign);
+ if (NumElem < 2) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ continue;
+ }
- // Transfer chain users from old loads to the new load.
- for (unsigned i = 0; i < NumElem; ++i) {
- LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
- SDValue(NewLoad.getNode(), 1));
- }
+ // Decide whether it is better to use vectors or integers to load and
+ // store to memory.
+ EVT JointMemOpVT;
+ if (UseVectorTy) {
+ JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
+ } else {
+ unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
+ JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
+ }
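+ // Example (illustrative): merging four consecutive i8 stores produces
+ // either one v4i8 store (UseVectorTy) or one i32 store.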
+
+ SDLoc LoadDL(LoadNodes[0].MemNode);
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ // The merged loads are required to have the same incoming chain, so
+ // using the first's chain is acceptable.
+
+ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
+ AddToWorklist(NewStoreChain.getNode());
+
+ MachineMemOperand::Flags MMOFlags = isDereferenceable ?
+ MachineMemOperand::MODereferenceable:
+ MachineMemOperand::MONone;
+
+ SDValue NewLoad, NewStore;
+ if (UseVectorTy || !DoIntegerTruncate) {
+ NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
+ FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoadAlign,
+ MMOFlags);
+ NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad,
+ FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), FirstStoreAlign);
+ } else { // This must be the truncstore/extload case
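+ // Illustrative sketch: if JointMemOpVT is i48 and i48 promotes to i64,
+ // we emit an i64 extload of the 48 stored bits feeding an i64->i48
+ // truncstore, rather than an illegal plain i48 load/store pair.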
+ EVT ExtendedTy =
+ TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
+ NewLoad =
+ DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(),
+ FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
+ JointMemOpVT, FirstLoadAlign, MMOFlags);
+ NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
+ FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), JointMemOpVT,
+ FirstInChain->getAlignment(),
+ FirstInChain->getMemOperand()->getFlags());
+ }
+
+ // Transfer chain users from old loads to the new load.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+ SDValue(NewLoad.getNode(), 1));
+ }
- if (UseAA) {
// Replace all of the stores with the new store.
for (unsigned i = 0; i < NumElem; ++i)
CombineTo(StoreNodes[i].MemNode, NewStore);
- } else {
- // Replace the last store with the new store.
- CombineTo(LatestOp, NewStore);
- // Erase all other stores.
- for (unsigned i = 0; i < NumElem; ++i) {
- // Remove all Store nodes.
- if (StoreNodes[i].MemNode == LatestOp)
- continue;
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
- deleteAndRecombine(St);
- }
+ RV = true;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ continue;
}
-
- StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end());
- return true;
+ return RV;
}
SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
@@ -12256,19 +13267,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
if (SDValue NewST = TransformFPLoadStorePair(N))
return NewST;
- bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
- : DAG.getSubtarget().useAA();
-#ifndef NDEBUG
- if (CombinerAAOnlyFunc.getNumOccurrences() &&
- CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
- UseAA = false;
-#endif
- if (UseAA && ST->isUnindexed()) {
- // FIXME: We should do this even without AA enabled. AA will just allow
- // FindBetterChain to work in more situations. The problem with this is that
- // any combine that expects memory operations to be on consecutive chains
- // first needs to be updated to look for users of the same chain.
-
+ if (ST->isUnindexed()) {
// Walk up chain skipping non-aliasing memory nodes, on this store and any
// adjacent stores.
if (findBetterNeighborChains(ST)) {
@@ -12279,10 +13278,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
Chain = ST->getChain();
}
- // Try transforming N to an indexed store.
- if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
- return SDValue(N, 0);
-
// FIXME: is there such a thing as a truncating indexed store?
if (ST->isTruncatingStore() && ST->isUnindexed() &&
Value.getValueType().isInteger()) {
@@ -12302,8 +13297,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
if (SimplifyDemandedBits(
Value,
APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
- ST->getMemoryVT().getScalarSizeInBits())))
+ ST->getMemoryVT().getScalarSizeInBits()))) {
+ // Re-visit the store if anything changed and the store hasn't been
+ // merged with another node (in which case N would be deleted).
+ // SimplifyDemandedBits will add Value's node back to the worklist if
+ // necessary, but we also need to re-visit the store node itself.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
return SDValue(N, 0);
+ }
}
// If this is a load followed by a store to the same location, then the store
@@ -12319,14 +13321,28 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
}
- // If this is a store followed by a store with the same value to the same
- // location, then the store is dead/noop.
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
- if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() &&
- ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() &&
- ST1->isUnindexed() && !ST1->isVolatile()) {
- // The store is dead, remove it.
- return Chain;
+ if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
+ !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
+ ST->getMemoryVT() == ST1->getMemoryVT()) {
+ // If this is a store followed by a store with the same value to the same
+ // location, then the store is dead/noop.
+ if (ST1->getValue() == Value) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+
+ // If this store's preceding store is to the same location and has no
+ // other node chained to it, we can effectively drop the preceding
+ // store. Do not remove stores to undef as they may be used as data
+ // sinks.
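+ // Example (illustrative): for chained stores
+ //   store i32 %x, i32* %p   ; ST1
+ //   store i32 %y, i32* %p   ; ST
+ // ST fully overwrites ST1, so when ST is ST1's only chain user, ST1
+ // can be replaced by its own chain value and disappears.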
+ if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
+ !ST1->getBasePtr().isUndef()) {
+ // ST1 is fully overwritten and can be elided. Combine with its chain
+ // value.
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
}
}
@@ -12342,29 +13358,31 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// Only perform this optimization before the types are legal, because we
// don't want to perform this optimization on every DAGCombine invocation.
- if (!LegalTypes) {
+ if ((TLI.mergeStoresAfterLegalization()) ? Level == AfterLegalizeDAG
+ : !LegalTypes) {
for (;;) {
// There can be multiple store sequences on the same chain.
// Keep trying to merge store sequences until we are unable to do so
// or until we merge the last store on the chain.
- SmallVector<MemOpLink, 8> StoreNodes;
- bool Changed = MergeConsecutiveStores(ST, StoreNodes);
+ bool Changed = MergeConsecutiveStores(ST);
if (!Changed) break;
-
- if (any_of(StoreNodes,
- [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) {
- // ST has been merged and no longer exists.
+ // Return N; the merge only uses CombineTo, so no worklist cleanup
+ // is necessary.
+ if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
return SDValue(N, 0);
- }
}
}
+ // Try transforming N to an indexed store.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
// Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
//
// Make sure to do this only after attempting to merge stores in order to
// avoid changing the types of some subset of stores due to visit order,
// preventing their merging.
- if (isa<ConstantFPSDNode>(Value)) {
+ if (isa<ConstantFPSDNode>(ST->getValue())) {
if (SDValue NewSt = replaceStoreOfFPConstant(ST))
return NewSt;
}
@@ -12493,10 +13511,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
EVT VT = InVec.getValueType();
- // If we can't generate a legal BUILD_VECTOR, exit
- if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
- return SDValue();
-
// Check that we know which element is being inserted
if (!isa<ConstantSDNode>(EltNo))
return SDValue();
@@ -12511,8 +13525,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
// do this only if indices are both constants and Idx1 < Idx0.
if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
&& isa<ConstantSDNode>(InVec.getOperand(2))) {
- unsigned OtherElt =
- cast<ConstantSDNode>(InVec.getOperand(2))->getZExtValue();
+ unsigned OtherElt = InVec.getConstantOperandVal(2);
if (Elt < OtherElt) {
// Swap nodes.
SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
@@ -12523,6 +13536,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
}
}
+ // If we can't generate a legal BUILD_VECTOR, exit
+ if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return SDValue();
+
// Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
// be converted to a BUILD_VECTOR). Fill in the Ops vector with the
// vector elements.
@@ -12544,11 +13561,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
// All the operands of BUILD_VECTOR must have the same type;
// we enforce that here.
EVT OpVT = Ops[0].getValueType();
- if (InVal.getValueType() != OpVT)
- InVal = OpVT.bitsGT(InVal.getValueType()) ?
- DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
- DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
- Ops[Elt] = InVal;
+ Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
}
// Return the new vector
@@ -12568,6 +13581,11 @@ SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
return SDValue();
+ ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
+ ISD::NON_EXTLOAD : ISD::EXTLOAD;
+ if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
+ return SDValue();
+
Align = NewAlign;
SDValue NewPtr = OriginalLoad->getBasePtr();
@@ -12639,6 +13657,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
EVT VT = InVec.getValueType();
EVT NVT = N->getValueType(0);
+ if (InVec.isUndef())
+ return DAG.getUNDEF(NVT);
+
if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// Check if the result type doesn't match the inserted element type. A
// SCALAR_TO_VECTOR may truncate the inserted element and the
@@ -13022,7 +14043,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
return DAG.getNode(Opcode, DL, VT, BV);
}
-SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N,
+SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask,
SDValue VecIn1, SDValue VecIn2,
unsigned LeftIdx) {
@@ -13088,6 +14109,11 @@ SDValue DAGCombiner::createBuildVecShuffle(SDLoc DL, SDNode *N,
// when we start sorting the vectors by type.
return SDValue();
}
+ } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() &&
+ InVT1.getSizeInBits() == VT.getSizeInBits()) {
+ SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
+ ConcatOps[0] = VecIn2;
+ VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
} else {
// TODO: Support cases where the length mismatch isn't exactly by a
// factor of 2.
@@ -13293,6 +14319,73 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
return Shuffles[0];
}
+// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
+// operations which can be matched to a truncate.
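+// Example (illustrative): (v4i16 build_vector (extract_elt V:v8i16, 0),
+// (extract_elt V, 2), (extract_elt V, 4), (extract_elt V, 6)) has
+// stride 2 and folds to (v4i16 truncate (v4i32 bitcast V)).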
+SDValue DAGCombiner::reduceBuildVecToTrunc(SDNode *N) {
+ // TODO: Add support for big-endian.
+ if (DAG.getDataLayout().isBigEndian())
+ return SDValue();
+ if (N->getNumOperands() < 2)
+ return SDValue();
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = N->getNumOperands();
+
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ // If the input is something other than an EXTRACT_VECTOR_ELT with a constant
+ // index, bail out.
+ // TODO: Allow undef elements in some cases?
+ if (any_of(N->ops(), [VT](SDValue Op) {
+ return Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op.getOperand(1)) ||
+ Op.getValueType() != VT.getVectorElementType();
+ }))
+ return SDValue();
+
+ // Helper for obtaining an EXTRACT_VECTOR_ELT's constant index
+ auto GetExtractIdx = [](SDValue Extract) {
+ return cast<ConstantSDNode>(Extract.getOperand(1))->getSExtValue();
+ };
+
+ // The first BUILD_VECTOR operand must be an extract from index zero
+ // (assuming no undef and little-endian).
+ if (GetExtractIdx(N->getOperand(0)) != 0)
+ return SDValue();
+
+ // Compute the stride from the first index.
+ int Stride = GetExtractIdx(N->getOperand(1));
+ SDValue ExtractedFromVec = N->getOperand(0).getOperand(0);
+
+ // Proceed only if the stride and the types can be matched to a truncate.
+ if ((Stride == 1 || !isPowerOf2_32(Stride)) ||
+ (ExtractedFromVec.getValueType().getVectorNumElements() !=
+ Stride * NumElems) ||
+ (VT.getScalarSizeInBits() * Stride > 64))
+ return SDValue();
+
+ // Check remaining operands are consistent with the computed stride.
+ for (unsigned i = 1; i != NumElems; ++i) {
+ SDValue Op = N->getOperand(i);
+
+ if ((Op.getOperand(0) != ExtractedFromVec) ||
+ (GetExtractIdx(Op) != Stride * i))
+ return SDValue();
+ }
+
+ // All checks were ok, construct the truncate.
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT NewVT = VT.getVectorVT(
+ Ctx, EVT::getIntegerVT(Ctx, VT.getScalarSizeInBits() * Stride), NumElems);
+ EVT TruncVT =
+ VT.isFloatingPoint() ? VT.changeVectorElementTypeToInteger() : VT;
+
+ SDValue Res = DAG.getBitcast(NewVT, ExtractedFromVec);
+ Res = DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, Res);
+ return DAG.getBitcast(VT, Res);
+}
+
SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
@@ -13300,12 +14393,45 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (ISD::allOperandsUndef(N))
return DAG.getUNDEF(VT);
+ // Check if we can express a BUILD_VECTOR as a subvector extract.
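+ // Example (illustrative): (v4i32 build_vector (extract_elt V:v8i32, 4),
+ // (extract_elt V, 5), (extract_elt V, 6), (extract_elt V, 7)) becomes
+ // (v4i32 extract_subvector V, 4).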
+ if (!LegalTypes && (N->getNumOperands() > 1)) {
+ SDValue Op0 = N->getOperand(0);
+ auto checkElem = [&](SDValue Op) -> uint64_t {
+ if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
+ (Op0.getOperand(0) == Op.getOperand(0)))
+ if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return CNode->getZExtValue();
+ return -1;
+ };
+
+ int Offset = checkElem(Op0);
+ for (unsigned i = 0; i < N->getNumOperands(); ++i) {
+ if (Offset + i != checkElem(N->getOperand(i))) {
+ Offset = -1;
+ break;
+ }
+ }
+
+ if ((Offset == 0) &&
+ (Op0.getOperand(0).getValueType() == N->getValueType(0)))
+ return Op0.getOperand(0);
+ if ((Offset != -1) &&
+ ((Offset % N->getValueType(0).getVectorNumElements()) ==
+ 0)) // IDX must be multiple of output size.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
+ Op0.getOperand(0), Op0.getOperand(1));
+ }
+
if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;
if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))
return V;
+ if (TLI.isDesirableToCombineBuildVectorToTruncate())
+ if (SDValue V = reduceBuildVecToTrunc(N))
+ return V;
+
if (SDValue V = reduceBuildVecToShuffle(N))
return V;
@@ -13419,7 +14545,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
if (!isa<ConstantSDNode>(Op.getOperand(1)))
return SDValue();
- int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ int ExtIdx = Op.getConstantOperandVal(1);
// Ensure that we are extracting a subvector from a vector the same
// size as the result.
@@ -13491,8 +14617,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
return SDValue();
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
- VT.getSizeInBits() / SclTy.getSizeInBits());
+ unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
+ if (VNTNumElms < 2)
+ return SDValue();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
return SDValue();
@@ -13607,19 +14736,153 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return SDValue();
}
+/// If we are extracting a subvector produced by a wide binary operator with
+/// at least one operand that was the result of a vector concatenation, then try
+/// to use the narrow vector operands directly to avoid the concatenation and
+/// extraction.
+static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
+ // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
+ // some of these bailouts with other transforms.
+
+ // The extract index must be a constant, so we can map it to a concat operand.
+ auto *ExtractIndex = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+ if (!ExtractIndex)
+ return SDValue();
+
+ // Only handle the case where we are doubling and then halving. A larger ratio
+ // may require more than two narrow binops to replace the wide binop.
+ EVT VT = Extract->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+ assert((ExtractIndex->getZExtValue() % NumElems) == 0 &&
+ "Extract index is not a multiple of the vector length.");
+ if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2)
+ return SDValue();
+
+ // We are looking for an optionally bitcasted wide vector binary operator
+ // feeding an extract subvector.
+ SDValue BinOp = Extract->getOperand(0);
+ if (BinOp.getOpcode() == ISD::BITCAST)
+ BinOp = BinOp.getOperand(0);
+
+ // TODO: The motivating case for this transform is an x86 AVX1 target. That
+ // target has temptingly almost legal versions of bitwise logic ops in 256-bit
+ // flavors, but no other 256-bit integer support. This could be extended to
+ // handle any binop, but that may require fixing/adding other folds to avoid
+ // codegen regressions.
+ unsigned BOpcode = BinOp.getOpcode();
+ if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
+ return SDValue();
+
+ // The binop must be a vector type, so we can chop it in half.
+ EVT WideBVT = BinOp.getValueType();
+ if (!WideBVT.isVector())
+ return SDValue();
+
+ // Bail out if the target does not support a narrower version of the binop.
+ EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
+ WideBVT.getVectorNumElements() / 2);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
+ return SDValue();
+
+ // Peek through bitcasts of the binary operator operands if needed.
+ SDValue LHS = BinOp.getOperand(0);
+ if (LHS.getOpcode() == ISD::BITCAST)
+ LHS = LHS.getOperand(0);
+
+ SDValue RHS = BinOp.getOperand(1);
+ if (RHS.getOpcode() == ISD::BITCAST)
+ RHS = RHS.getOperand(0);
+
+ // We need at least one concatenation operation of a binop operand to make
+ // this transform worthwhile. The concat must double the input vector sizes.
+ // TODO: Should we also handle INSERT_SUBVECTOR patterns?
+ bool ConcatL =
+ LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2;
+ bool ConcatR =
+ RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2;
+ if (!ConcatL && !ConcatR)
+ return SDValue();
+
+ // If one of the binop operands was not the result of a concat, we must
+ // extract a half-sized operand for our new narrow binop. We can't just reuse
+ // the original extract index operand because we may have bitcasted.
+ unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems;
+ unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
+ EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
+ SDLoc DL(Extract);
+
+ // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
+ // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N)
+ // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN
+ SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum))
+ : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+ BinOp.getOperand(0),
+ DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
+
+ SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum))
+ : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
+ BinOp.getOperand(1),
+ DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
+
+ SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
+ return DAG.getBitcast(VT, NarrowBinOp);
+}
+
+/// If we are extracting a subvector from a wide vector load, convert to a
+/// narrow load to eliminate the extraction:
+/// (extract_subvector (load wide vector)) --> (load narrow vector)
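+/// Example (illustrative): (v2f64 extract_subvector (v4f64 load %p), 2)
+/// becomes (v2f64 load %p+16).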
+static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
+ // TODO: Add support for big-endian. The offset calculation must be adjusted.
+ if (DAG.getDataLayout().isBigEndian())
+ return SDValue();
+
+ // TODO: The one-use check is overly conservative. Check the cost of the
+ // extract instead or remove that condition entirely.
+ auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
+ auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
+ if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
+ !ExtIdx)
+ return SDValue();
+
+ // The narrow load will be offset from the base address of the old load if
+ // we are extracting from something besides index 0 (little-endian).
+ EVT VT = Extract->getValueType(0);
+ SDLoc DL(Extract);
+ SDValue BaseAddr = Ld->getOperand(1);
+ unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
+
+ // TODO: Use "BaseIndexOffset" to make this more effective.
+ SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
+ VT.getStoreSize());
+ SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+ return NewLd;
+}
+
SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
EVT NVT = N->getValueType(0);
SDValue V = N->getOperand(0);
- if (V->getOpcode() == ISD::CONCAT_VECTORS) {
- // Combine:
- // (extract_subvec (concat V1, V2, ...), i)
- // Into:
- // Vi if possible
- // Only operand 0 is checked as 'concat' assumes all inputs of the same
- // type.
- if (V->getOperand(0).getValueType() != NVT)
- return SDValue();
+ // Extract from UNDEF is UNDEF.
+ if (V.isUndef())
+ return DAG.getUNDEF(NVT);
+
+ if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
+ if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
+ return NarrowLoad;
+
+ // Combine:
+ // (extract_subvec (concat V1, V2, ...), i)
+ // Into:
+ // Vi if possible
+ // Only operand 0 is checked as 'concat' assumes all inputs of the same
+ // type.
+ if (V->getOpcode() == ISD::CONCAT_VECTORS &&
+ isa<ConstantSDNode>(N->getOperand(1)) &&
+ V->getOperand(0).getValueType() == NVT) {
unsigned Idx = N->getConstantOperandVal(1);
unsigned NumElems = NVT.getVectorNumElements();
assert((Idx % NumElems) == 0 &&
@@ -13633,19 +14896,16 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
// Handle only simple case where vector being inserted and vector
- // being extracted are of same type, and are half size of larger vectors.
- EVT BigVT = V->getOperand(0).getValueType();
+ // being extracted are of same size.
EVT SmallVT = V->getOperand(1).getValueType();
- if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits())
+ if (!NVT.bitsEq(SmallVT))
return SDValue();
- // Only handle cases where both indexes are constants with the same type.
+ // Only handle cases where both indexes are constants.
ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
- if (InsIdx && ExtIdx &&
- InsIdx->getValueType(0).getSizeInBits() <= 64 &&
- ExtIdx->getValueType(0).getSizeInBits() <= 64) {
+ if (InsIdx && ExtIdx) {
// Combine:
// (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
// Into:
@@ -13661,6 +14921,9 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
}
}
+ if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
+ return NarrowBOp;
+
return SDValue();
}
@@ -13892,6 +15155,167 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
}
+// Match shuffles that can be converted to any_vector_extend_in_reg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
+// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
+static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ EVT VT = SVN->getValueType(0);
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ ArrayRef<int> Mask = SVN->getMask();
+ SDValue N0 = SVN->getOperand(0);
+
+ // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
+ auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
+ // power-of-2 extensions as they are the most likely.
+ for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
+ if (!isAnyExtend(Scale))
+ continue;
+
+ EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
+ if (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
+ return DAG.getBitcast(VT,
+ DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
+ }
+
+ return SDValue();
+}
+
+// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
+// each source element of a large type into the lowest elements of a smaller
+// destination type. This is often generated during legalization.
+// If the source node itself was a '*_extend_vector_inreg' node then we
+// should be able to remove it.
+static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) {
+ EVT VT = SVN->getValueType(0);
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+ // TODO Add support for big-endian when we have a test case.
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ SDValue N0 = SVN->getOperand(0);
+ while (N0.getOpcode() == ISD::BITCAST)
+ N0 = N0.getOperand(0);
+
+ unsigned Opcode = N0.getOpcode();
+ if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
+ Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
+ Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ ArrayRef<int> Mask = SVN->getMask();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
+ unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();
+
+ if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
+ return SDValue();
+ unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;
+
+ // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
+ // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
+ // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
+ auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] < 0)
+ continue;
+ if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
+ continue;
+ return false;
+ }
+ return true;
+ };
+
+ // At the moment we just handle the case where we've truncated back to the
+ // same size as before the extension.
+ // TODO: handle more extension/truncation cases as cases arise.
+ if (EltSizeInBits != ExtSrcSizeInBits)
+ return SDValue();
+
+ // We can remove *extend_vector_inreg only if the truncation happens at
+ // the same scale as the extension.
+ if (isTruncate(ExtScale))
+ return DAG.getBitcast(VT, N00);
+
+ return SDValue();
+}
+
+// Combine shuffles of splat-shuffles of the form:
+// shuffle (shuffle V, undef, splat-mask), undef, M
+// If splat-mask contains undef elements, we need to be careful about
+// introducing undefs in the folded mask which are not the result of composing
+// the masks of the shuffles.
+static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
+ ShuffleVectorSDNode *Splat,
+ SelectionDAG &DAG) {
+ ArrayRef<int> SplatMask = Splat->getMask();
+ assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");
+
+ // Prefer simplifying to the splat-shuffle, if possible. This is legal if
+ // every undef mask element in the splat-shuffle has a corresponding undef
+ // element in the user-shuffle's mask or if the composition of mask elements
+ // would result in undef.
+ // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
+ // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
+ // In this case it is not legal to simplify to the splat-shuffle because we
+ // may be exposing to the users of the shuffle an undef element at index 1
+ // which was not there before the combine.
+ // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
+ // In this case the composition of masks yields SplatMask, so it's ok to
+ // simplify to the splat-shuffle.
+ // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
+ // In this case the composed mask includes all undef elements of SplatMask
+ // and in addition sets element zero to undef. It is safe to simplify to
+ // the splat-shuffle.
+ auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
+ ArrayRef<int> SplatMask) {
+ for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
+ if (UserMask[i] != -1 && SplatMask[i] == -1 &&
+ SplatMask[UserMask[i]] != -1)
+ return false;
+ return true;
+ };
+ if (CanSimplifyToExistingSplat(UserMask, SplatMask))
+ return SDValue(Splat, 0);
+
+ // Create a new shuffle with a mask that is composed of the two shuffles'
+ // masks.
+ SmallVector<int, 32> NewMask;
+ for (int Idx : UserMask)
+ NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
+
+ return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
+ Splat->getOperand(0), Splat->getOperand(1),
+ NewMask);
+}
+
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -13938,6 +15362,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
}
+ // A shuffle of a single vector that is a splat can always be folded.
+ if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
+ if (N1->isUndef() && N0Shuf->isSplat())
+ return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);
+
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
@@ -13996,6 +15425,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
return S;
+ // Match shuffles that can be converted to any_vector_extend_in_reg.
+ if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
+ return V;
+
+ // Combine "truncate_vector_in_reg" style shuffles.
+ if (SDValue V = combineTruncationShuffle(SVN, DAG))
+ return V;
+
if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
Level < AfterLegalizeVectorOps &&
(N1.isUndef() ||
@@ -14253,6 +15690,16 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
+ // If inserting an UNDEF, just return the original vector.
+ if (N1.isUndef())
+ return N0;
+
+ // If this is an insert of an extracted vector into an undef vector, we can
+ // just use the input to the extract.
+ if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
+ return N1.getOperand(0);
+
// Combine INSERT_SUBVECTORs where we are inserting to the same index.
// INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
// --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
@@ -14262,26 +15709,39 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
N1, N2);
- if (N0.getValueType() != N1.getValueType())
+ if (!isa<ConstantSDNode>(N2))
return SDValue();
+ unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+
+ // Canonicalize insert_subvector dag nodes.
+ // Example:
+ // (insert_subvector (insert_subvector A, Idx0), Idx1)
+ // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
+ if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
+ N1.getValueType() == N0.getOperand(1).getValueType() &&
+ isa<ConstantSDNode>(N0.getOperand(2))) {
+ unsigned OtherIdx = N0.getConstantOperandVal(2);
+ if (InsIdx < OtherIdx) {
+ // Swap nodes.
+ SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
+ N0.getOperand(0), N1, N2);
+ AddToWorklist(NewOp.getNode());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
+ VT, NewOp, N0.getOperand(1), N0.getOperand(2));
+ }
+ }
+
// If the input vector is a concatenation, and the insert replaces
- // one of the halves, we can optimize into a single concat_vectors.
- if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 &&
- N2.getOpcode() == ISD::Constant) {
- APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue();
-
- // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) ->
- // (concat_vectors Z, Y)
- if (InsIdx == 0)
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1,
- N0.getOperand(1));
+ // one of the pieces, we can optimize into a single concat_vectors.
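+ // Example (illustrative): inserting X at index 2 * Factor into
+ // (concat_vectors A, B, C, D) yields (concat_vectors A, B, X, D).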
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
+ N0.getOperand(0).getValueType() == N1.getValueType()) {
+ unsigned Factor = N1.getValueType().getVectorNumElements();
+
+ SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
+ Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1;
- // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) ->
- // (concat_vectors X, Z)
- if (InsIdx == VT.getVectorNumElements() / 2)
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0),
- N1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}
return SDValue();
@@ -14366,9 +15826,9 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
// Extract the sub element from the constant bit mask.
if (DAG.getDataLayout().isBigEndian()) {
- Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits);
+ Bits.lshrInPlace((Split - SubIdx - 1) * NumSubBits);
} else {
- Bits = Bits.lshr(SubIdx * NumSubBits);
+ Bits.lshrInPlace(SubIdx * NumSubBits);
}
if (Split > 1)
@@ -15041,7 +16501,7 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
/// =>
/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
/// does not require additional intermediate precision]
-SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
+SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
if (Level >= AfterLegalizeDAG)
return SDValue();
@@ -15096,7 +16556,7 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
/// As a result, we precompute A/2 prior to the iteration loop.
SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
unsigned Iterations,
- SDNodeFlags *Flags, bool Reciprocal) {
+ SDNodeFlags Flags, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
@@ -15140,7 +16600,7 @@ SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
unsigned Iterations,
- SDNodeFlags *Flags, bool Reciprocal) {
+ SDNodeFlags Flags, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
@@ -15185,7 +16645,7 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
/// Op can be zero.
-SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags,
+SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
bool Reciprocal) {
if (Level >= AfterLegalizeDAG)
return SDValue();
@@ -15238,17 +16698,17 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags,
return SDValue();
}
-SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
return buildSqrtEstimateImpl(Op, Flags, true);
}
-SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
return buildSqrtEstimateImpl(Op, Flags, false);
}
/// Return true if base is a frame index, which is known not to alias with
/// anything but itself. Provides base object and offset as results.
-static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
+static bool findBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
const GlobalValue *&GV, const void *&CV) {
// Assume it is a primitive operation.
Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr;
@@ -15257,7 +16717,7 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
if (Base.getOpcode() == ISD::ADD) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
Base = Base.getOperand(0);
- Offset += C->getZExtValue();
+ Offset += C->getSExtValue();
}
}
@@ -15300,54 +16760,82 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
if (Op1->isInvariant() && Op0->writeMem())
return false;
+ unsigned NumBytes0 = Op0->getMemoryVT().getSizeInBits() >> 3;
+ unsigned NumBytes1 = Op1->getMemoryVT().getSizeInBits() >> 3;
+
+ // Check for BaseIndexOffset matching.
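+ // Example (illustrative): two 4-byte accesses at %base+0 and %base+8
+ // give PtrDiff = 8; NumBytes0 (4) <= PtrDiff, so they cannot overlap.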
+ BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
+ BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG);
+ int64_t PtrDiff;
+ if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
+ return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));
+
+ // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
+ // able to calculate their relative offset if at least one arises
+ // from an alloca. However, distinct allocas cannot overlap, so we
+ // can infer there is no alias.
+ if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
+ if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ // If the bases are the same frame index but we couldn't find a
+ // constant offset (the indices are different), be conservative.
+ if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
+ !MFI.isFixedObjectIndex(B->getIndex())))
+ return false;
+ }
+
+ // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis
+ // should be modified to use BaseIndexOffset.
+
// Gather base node and offset information.
- SDValue Base1, Base2;
- int64_t Offset1, Offset2;
- const GlobalValue *GV1, *GV2;
- const void *CV1, *CV2;
- bool isFrameIndex1 = FindBaseOffset(Op0->getBasePtr(),
+ SDValue Base0, Base1;
+ int64_t Offset0, Offset1;
+ const GlobalValue *GV0, *GV1;
+ const void *CV0, *CV1;
+ bool IsFrameIndex0 = findBaseOffset(Op0->getBasePtr(),
+ Base0, Offset0, GV0, CV0);
+ bool IsFrameIndex1 = findBaseOffset(Op1->getBasePtr(),
Base1, Offset1, GV1, CV1);
- bool isFrameIndex2 = FindBaseOffset(Op1->getBasePtr(),
- Base2, Offset2, GV2, CV2);
- // If they have a same base address then check to see if they overlap.
- if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2)))
- return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 ||
- (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1);
+ // If they have the same base address, then check to see if they overlap.
+ if (Base0 == Base1 || (GV0 && (GV0 == GV1)) || (CV0 && (CV0 == CV1)))
+ return !((Offset0 + NumBytes0) <= Offset1 ||
+ (Offset1 + NumBytes1) <= Offset0);
// It is possible for different frame indices to alias each other, mostly
// when tail call optimization reuses return address slots for arguments.
// To catch this case, look up the actual index of frame indices to compute
// the real alias relationship.
- if (isFrameIndex1 && isFrameIndex2) {
+ if (IsFrameIndex0 && IsFrameIndex1) {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ Offset0 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base0)->getIndex());
Offset1 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex());
- Offset2 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base2)->getIndex());
- return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 ||
- (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1);
+ return !((Offset0 + NumBytes0) <= Offset1 ||
+ (Offset1 + NumBytes1) <= Offset0);
}
// Otherwise, if we know what the bases are, and they aren't identical, then
// we know they cannot alias.
- if ((isFrameIndex1 || CV1 || GV1) && (isFrameIndex2 || CV2 || GV2))
+ if ((IsFrameIndex0 || CV0 || GV0) && (IsFrameIndex1 || CV1 || GV1))
return false;
// If we know that SrcValue1 and SrcValue2 have relatively large alignment
// compared to the size and offset of the access, we may be able to prove they
- // do not alias. This check is conservative for now to catch cases created by
+ // do not alias. This check is conservative for now to catch cases created by
// splitting vector types.
- if ((Op0->getOriginalAlignment() == Op1->getOriginalAlignment()) &&
- (Op0->getSrcValueOffset() != Op1->getSrcValueOffset()) &&
- (Op0->getMemoryVT().getSizeInBits() >> 3 ==
- Op1->getMemoryVT().getSizeInBits() >> 3) &&
- (Op0->getOriginalAlignment() > (Op0->getMemoryVT().getSizeInBits() >> 3))) {
- int64_t OffAlign1 = Op0->getSrcValueOffset() % Op0->getOriginalAlignment();
- int64_t OffAlign2 = Op1->getSrcValueOffset() % Op1->getOriginalAlignment();
+ int64_t SrcValOffset0 = Op0->getSrcValueOffset();
+ int64_t SrcValOffset1 = Op1->getSrcValueOffset();
+ unsigned OrigAlignment0 = Op0->getOriginalAlignment();
+ unsigned OrigAlignment1 = Op1->getOriginalAlignment();
+ if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
+ NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) {
+ int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
+ int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;
// There is no overlap between these relatively aligned accesses of similar
- // size, return no alias.
- if ((OffAlign1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign2 ||
- (OffAlign2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign1)
+ // size. Return no alias.
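+ // (Illustrative: two 4-byte accesses with original alignment 8 at
+ // source offsets 0 and 4 give OffAlign0 = 0, OffAlign1 = 4; since
+ // 0 + 4 <= 4, they cannot overlap.)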
+ if ((OffAlign0 + NumBytes0) <= OffAlign1 ||
+ (OffAlign1 + NumBytes1) <= OffAlign0)
return false;
}
@@ -15359,20 +16847,18 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
UseAA = false;
#endif
- if (UseAA &&
+
+ if (UseAA && AA &&
Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
// Use alias analysis information.
- int64_t MinOffset = std::min(Op0->getSrcValueOffset(),
- Op1->getSrcValueOffset());
- int64_t Overlap1 = (Op0->getMemoryVT().getSizeInBits() >> 3) +
- Op0->getSrcValueOffset() - MinOffset;
- int64_t Overlap2 = (Op1->getMemoryVT().getSizeInBits() >> 3) +
- Op1->getSrcValueOffset() - MinOffset;
+ int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
+ int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
+ int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
AliasResult AAResult =
- AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap1,
- UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
- MemoryLocation(Op1->getMemOperand()->getValue(), Overlap2,
- UseTBAA ? Op1->getAAInfo() : AAMDNodes()));
+ AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
+ UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
+ MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
+ UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
if (AAResult == NoAlias)
return false;
}
@@ -15454,6 +16940,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
++Depth;
break;
+ case ISD::CopyFromReg:
+ // Forward past CopyFromReg.
+ Chains.push_back(Chain.getOperand(0));
+ ++Depth;
+ break;
+
default:
// For all other instructions we will just have to take what we can get.
Aliases.push_back(Chain);
@@ -15482,17 +16974,29 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
}
+// This function tries to collect a bunch of potentially interesting
+// nodes to improve the chains of, all at once. This might seem
+// redundant, as this function gets called when visiting every store
+// node, so why not let the work be done on each store as it's visited?
+//
+// I believe this is mainly important because MergeConsecutiveStores
+// is unable to deal with merging stores of different sizes, so unless
+// we improve the chains of all the potential candidates up-front
+// before running MergeConsecutiveStores, it might only see some of
+// the nodes that will eventually be candidates, and then not be able
+// to go from a partially-merged state to the desired final
+// fully-merged state.
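+//
+// Illustrative example: with stores S1 -> S2 -> S3 chained in sequence,
+// improving all three chains up front lets them hang off one common
+// chain, so MergeConsecutiveStores can consider the whole group at once.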
bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
// We must have a base and an offset.
- if (!BasePtr.Base.getNode())
+ if (!BasePtr.getBase().getNode())
return false;
// Do not handle stores to undef base pointers.
- if (BasePtr.Base.isUndef())
+ if (BasePtr.getBase().isUndef())
return false;
SmallVector<StoreSDNode *, 8> ChainedStores;
@@ -15514,13 +17018,11 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
// Check that the base pointer is the same as the original one.
- if (!Ptr.equalBaseIndex(BasePtr))
+ if (!BasePtr.equalBaseIndex(Ptr, DAG))
break;
- // Find the next memory operand in the chain. If the next operand in the
- // chain is a store then move up and continue the scan with the next
- // memory operand. If the next operand is a load save it and use alias
- // information to check if it interferes with anything.
+ // Walk up the chain to find the next store node, ignoring any
+ // intermediate loads. Any other kind of node will halt the loop.
SDNode *NextInChain = Index->getChain().getNode();
while (true) {
if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
@@ -15539,9 +17041,14 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
Index = nullptr;
break;
}
- }
+ } // end while
}
+ // At this point, ChainedStores lists all of the Store nodes
+ // reachable by iterating up through chain nodes matching the above
+ // conditions. For each such store identified, try to find an
+ // earlier chain to attach the store to which won't violate the
+ // required ordering.
bool MadeChangeToSt = false;
SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
@@ -15565,7 +17072,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
}
/// This is the entry point for the file.
-void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
CodeGenOpt::Level OptLevel) {
/// This is the main entry point to this class.
DAGCombiner(*this, AA, OptLevel).Run(Level);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index e2f33bb..b2599b2 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1,4 +1,4 @@
-//===-- FastISel.cpp - Implementation of the FastISel class ---------------===//
+//===- FastISel.cpp - Implementation of the FastISel class ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -39,35 +39,76 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "isel"
@@ -78,21 +119,6 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
"target-specific selector");
STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
-void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS,
- unsigned AttrIdx) {
- IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
- IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
- IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
- IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
- IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
- IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
- IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
- IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
- IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
- IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
- Alignment = CS->getParamAlignment(AttrIdx);
-}
-
/// Set the current block to which generated machine instructions will be
/// appended, and clear the local CSE map.
void FastISel::startNewBlock() {
@@ -231,17 +257,13 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
// Try to emit the constant by using an integer constant with a cast.
const APFloat &Flt = CF->getValueAPF();
EVT IntVT = TLI.getPointerTy(DL);
-
- uint64_t x[2];
uint32_t IntBitWidth = IntVT.getSizeInBits();
+ APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false);
bool isExact;
- (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
- APFloat::rmTowardZero, &isExact);
+ (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact);
if (isExact) {
- APInt IntVal(IntBitWidth, x);
-
unsigned IntegerReg =
- getRegForValue(ConstantInt::get(V->getContext(), IntVal));
+ getRegForValue(ConstantInt::get(V->getContext(), SIntVal));
if (IntegerReg != 0)
Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,
/*Kill=*/false);
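(Editor's aside, not part of the patch: the hunk above replaces the raw uint64_t buffer with an APSInt so the bit width travels with the value. A minimal standalone sketch of the same API; main() and the constants are illustrative only.)

    #include "llvm/ADT/APFloat.h"
    #include "llvm/ADT/APSInt.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      APFloat Flt(42.0);                        // representable exactly as an int
      APSInt IntVal(32, /*isUnsigned=*/false);  // signed 32-bit destination
      bool IsExact;
      // Same overload the patched materializeConstant uses: the APSInt carries
      // its own width, so no separate IntBitWidth/x[2] buffer is needed.
      (void)Flt.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
      outs() << "exact: " << (IsExact ? "yes" : "no")
             << ", value: " << IntVal << "\n";
    }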
@@ -600,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
// have to worry about calling conventions and target-specific lowering code.
// Instead we perform the call lowering right here.
//
- // CALLSEQ_START(0...)
+ // CALLSEQ_START(0, 0...)
// STACKMAP(id, nbytes, ...)
// CALLSEQ_END(0, 0)
//
@@ -646,7 +668,7 @@ bool FastISel::selectStackmap(const CallInst *I) {
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::STACKMAP));
for (auto const &MO : Ops)
- MIB.addOperand(MO);
+ MIB.add(MO);
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
@@ -672,10 +694,8 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
Args.reserve(NumArgs);
// Populate the argument list.
- // Attributes for args start at offset 1, after the return attribute.
ImmutableCallSite CS(CI);
- for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
- ArgI != ArgE; ++ArgI) {
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) {
Value *V = CI->getOperand(ArgI);
assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
@@ -683,7 +703,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
ArgListEntry Entry;
Entry.Val = V;
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, AttrI);
+ Entry.setAttributes(&CS, ArgIdx);
Args.push_back(Entry);
}
@@ -826,7 +846,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
TII.get(TargetOpcode::PATCHPOINT));
for (auto &MO : Ops)
- MIB.addOperand(MO);
+ MIB.add(MO);
MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI);
@@ -841,9 +861,28 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
return true;
}
-/// Returns an AttributeSet representing the attributes applied to the return
+bool FastISel::selectXRayCustomEvent(const CallInst *I) {
+ const auto &Triple = TM.getTargetTriple();
+ if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
+ return true; // don't do anything to this instruction.
+ SmallVector<MachineOperand, 8> Ops;
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(0)),
+ /*IsDef=*/false));
+ Ops.push_back(MachineOperand::CreateReg(getRegForValue(I->getArgOperand(1)),
+ /*IsDef=*/false));
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::PATCHABLE_EVENT_CALL));
+ for (auto &MO : Ops)
+ MIB.add(MO);
+  // Insert the Patchable Event Call instruction, which gets lowered properly.
+ return true;
+}
+
+
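(For context, not part of the patch: selectXRayCustomEvent above fires for the llvm.xray.customevent intrinsic, which Clang emits for the __xray_customevent builtin. A hedged sketch of source that could reach this path, assuming an x86_64 Linux target built with -fxray-instrument; the function and message are made up.)

    #include <cstdio>

    __attribute__((xray_always_instrument))
    void process(int N) {
      char Msg[32];
      int Len = std::snprintf(Msg, sizeof(Msg), "processing %d", N);
      // Lowered through llvm.xray.customevent and, under FastISel, through
      // the PATCHABLE_EVENT_CALL pseudo added above.
      __xray_customevent(Msg, Len);
    }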
+/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
-static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
+static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
SmallVector<Attribute::AttrKind, 2> Attrs;
if (CLI.RetSExt)
Attrs.push_back(Attribute::SExt);
@@ -852,8 +891,8 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
if (CLI.IsInReg)
Attrs.push_back(Attribute::InReg);
- return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
- Attrs);
+ return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+ Attrs);
}
bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName,
@@ -885,9 +924,10 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
ArgListEntry Entry;
Entry.Val = V;
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, ArgI + 1);
+ Entry.setAttributes(&CS, ArgI);
Args.push_back(Entry);
}
+ TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args);
CallLoweringInfo CLI;
CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs);
@@ -1021,7 +1061,7 @@ bool FastISel::lowerCall(const CallInst *CI) {
Entry.Ty = V->getType();
// Skip the first return-type Attribute to get to params.
- Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+ Entry.setAttributes(&CS, i - CS.arg_begin());
Args.push_back(Entry);
}
@@ -1110,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
return true;
}
- unsigned Offset = 0;
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return true;
+
Optional<MachineOperand> Op;
- if (const auto *Arg = dyn_cast<Argument>(Address))
- // Some arguments' frame index is recorded during argument lowering.
- Offset = FuncInfo.getArgumentFrameIndex(Arg);
- if (Offset)
- Op = MachineOperand::CreateFI(Offset);
- if (!Op)
- if (unsigned Reg = lookUpRegForValue(Address))
- Op = MachineOperand::CreateReg(Reg, false);
+ if (unsigned Reg = lookUpRegForValue(Address))
+ Op = MachineOperand::CreateReg(Reg, false);
// If we have a VLA that has a "use" in a metadata node that's then used
// here but it has no other uses, then we have a problem. E.g.,
@@ -1143,13 +1183,15 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
"Expected inlined-at fields to agree");
if (Op->isReg()) {
Op->setIsDebug(true);
+ // A dbg.declare describes the address of a source variable, so lower it
+ // into an indirect DBG_VALUE.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0,
- DI->getVariable(), DI->getExpression());
+ TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true,
+ Op->getReg(), 0, DI->getVariable(), DI->getExpression());
} else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::DBG_VALUE))
- .addOperand(*Op)
+ .add(*Op)
.addImm(0)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
@@ -1229,6 +1271,9 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
return selectPatchpoint(II);
+
+ case Intrinsic::xray_customevent:
+ return selectXRayCustomEvent(II);
}
return fastLowerIntrinsicCall(II);
@@ -1362,7 +1407,7 @@ bool FastISel::selectInstruction(const Instruction *I) {
if (const auto *Call = dyn_cast<CallInst>(I)) {
const Function *F = Call->getCalledFunction();
- LibFunc::Func Func;
+ LibFunc Func;
// As a special case, don't handle calls to builtin library functions that
// may be translated directly to target instructions.
@@ -1665,7 +1710,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
SkipTargetIndependentISel(SkipTargetIndependentISel) {}
-FastISel::~FastISel() {}
+FastISel::~FastISel() = default;
bool FastISel::fastLowerArguments() { return false; }
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 377a523..b736037 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
MF = &mf;
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- MachineModuleInfo &MMI = MF->getMMI();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
unsigned StackAlign = TFI->getStackAlignment();
@@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(&I)))
InitializeRegForValue(&I);
- // Collect llvm.dbg.declare information. This is done now instead of
- // during the initial isel pass through the IR so that it is done
- // in a predictable order.
- if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I)) {
- assert(DI->getVariable() && "Missing variable");
- assert(DI->getDebugLoc() && "Missing location");
- if (MMI.hasDebugInfo()) {
- // Don't handle byval struct arguments or VLAs, for example.
- // Non-byval arguments are handled here (they refer to the stack
- // temporary alloca at this point).
- const Value *Address = DI->getAddress();
- if (Address) {
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
- Address = BCI->getOperand(0);
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- DenseMap<const AllocaInst *, int>::iterator SI =
- StaticAllocaMap.find(AI);
- if (SI != StaticAllocaMap.end()) { // Check for VLAs.
- int FI = SI->second;
- MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(),
- FI, DI->getDebugLoc());
- }
- }
- }
- }
- }
-
// Decide the preferred extend type for a value.
PreferredExtendType[&I] = getPreferredExtendForValue(&I);
}
@@ -400,10 +372,9 @@ FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) {
if (!LOI->IsValid)
return nullptr;
- if (BitWidth > LOI->KnownZero.getBitWidth()) {
+ if (BitWidth > LOI->Known.getBitWidth()) {
LOI->NumSignBits = 1;
- LOI->KnownZero = LOI->KnownZero.zextOrTrunc(BitWidth);
- LOI->KnownOne = LOI->KnownOne.zextOrTrunc(BitWidth);
+ LOI->Known = LOI->Known.zextOrTrunc(BitWidth);
}
return LOI;
@@ -436,17 +407,15 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
Value *V = PN->getIncomingValue(0);
if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
DestLOI.NumSignBits = 1;
- APInt Zero(BitWidth, 0);
- DestLOI.KnownZero = Zero;
- DestLOI.KnownOne = Zero;
+ DestLOI.Known = KnownBits(BitWidth);
return;
}
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
APInt Val = CI->getValue().zextOrTrunc(BitWidth);
DestLOI.NumSignBits = Val.getNumSignBits();
- DestLOI.KnownZero = ~Val;
- DestLOI.KnownOne = Val;
+ DestLOI.Known.Zero = ~Val;
+ DestLOI.Known.One = Val;
} else {
assert(ValueMap.count(V) && "V should have been placed in ValueMap when its"
"CopyToReg node was created.");
@@ -463,25 +432,23 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
DestLOI = *SrcLOI;
}
- assert(DestLOI.KnownZero.getBitWidth() == BitWidth &&
- DestLOI.KnownOne.getBitWidth() == BitWidth &&
+ assert(DestLOI.Known.Zero.getBitWidth() == BitWidth &&
+ DestLOI.Known.One.getBitWidth() == BitWidth &&
"Masks should have the same bit width as the type.");
for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *V = PN->getIncomingValue(i);
if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
DestLOI.NumSignBits = 1;
- APInt Zero(BitWidth, 0);
- DestLOI.KnownZero = Zero;
- DestLOI.KnownOne = Zero;
+ DestLOI.Known = KnownBits(BitWidth);
return;
}
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
APInt Val = CI->getValue().zextOrTrunc(BitWidth);
DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits());
- DestLOI.KnownZero &= ~Val;
- DestLOI.KnownOne &= Val;
+ DestLOI.Known.Zero &= ~Val;
+ DestLOI.Known.One &= Val;
continue;
}
@@ -498,8 +465,8 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
return;
}
DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
- DestLOI.KnownZero &= SrcLOI->KnownZero;
- DestLOI.KnownOne &= SrcLOI->KnownOne;
+ DestLOI.Known.Zero &= SrcLOI->Known.Zero;
+ DestLOI.Known.One &= SrcLOI->Known.One;
}
}
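(Aside: the KnownZero/KnownOne pairs above migrate to the then-new KnownBits wrapper from llvm/Support/KnownBits.h, which keeps the two masks in one object. A small self-contained illustration; the bit pattern is arbitrary.)

    #include "llvm/Support/KnownBits.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      // Model an 8-bit value known to match 0b00000?00: bit 2 varies,
      // every other bit is known to be zero.
      KnownBits Known(8);
      Known.Zero.setBits(3, 8); // bits 3..7 known zero
      Known.Zero.setBits(0, 2); // bits 0..1 known zero
      // Known.One stays empty: no bit is known to be one here.
      outs() << "zero mask: " << Known.Zero << "\n"
             << "one  mask: " << Known.One << "\n";
    }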
@@ -515,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
/// If the argument does not have any assigned frame index then 0 is
/// returned.
int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
- DenseMap<const Argument *, int>::iterator I =
- ByValArgFrameIndexMap.find(A);
+ auto I = ByValArgFrameIndexMap.find(A);
if (I != ByValArgFrameIndexMap.end())
return I->second;
DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
- return 0;
+ return INT_MAX;
}
unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
@@ -557,3 +523,29 @@ void FunctionLoweringInfo::setCurrentSwiftErrorVReg(
const MachineBasicBlock *MBB, const Value *Val, unsigned VReg) {
SwiftErrorVRegDefMap[std::make_pair(MBB, Val)] = VReg;
}
+
+std::pair<unsigned, bool>
+FunctionLoweringInfo::getOrCreateSwiftErrorVRegDefAt(const Instruction *I) {
+ auto Key = PointerIntPair<const Instruction *, 1, bool>(I, true);
+ auto It = SwiftErrorVRegDefUses.find(Key);
+ if (It == SwiftErrorVRegDefUses.end()) {
+ auto &DL = MF->getDataLayout();
+ const TargetRegisterClass *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ unsigned VReg = MF->getRegInfo().createVirtualRegister(RC);
+ SwiftErrorVRegDefUses[Key] = VReg;
+ return std::make_pair(VReg, true);
+ }
+ return std::make_pair(It->second, false);
+}
+
+std::pair<unsigned, bool>
+FunctionLoweringInfo::getOrCreateSwiftErrorVRegUseAt(const Instruction *I, const MachineBasicBlock *MBB, const Value *Val) {
+ auto Key = PointerIntPair<const Instruction *, 1, bool>(I, false);
+ auto It = SwiftErrorVRegDefUses.find(Key);
+ if (It == SwiftErrorVRegDefUses.end()) {
+ unsigned VReg = getOrCreateSwiftErrorVReg(MBB, Val);
+ SwiftErrorVRegDefUses[Key] = VReg;
+ return std::make_pair(VReg, true);
+ }
+ return std::make_pair(It->second, false);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4a9042c..b96c96f 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -161,7 +161,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
if (VRBase) {
DstRC = MRI->getRegClass(VRBase);
} else if (UseRC) {
- assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
+ assert(TRI->isTypeLegalForClass(*UseRC, VT) &&
+ "Incompatible phys register def and uses!");
DstRC = UseRC;
} else {
DstRC = TLI->getRegClassFor(VT);
@@ -235,7 +236,6 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
if (II.OpInfo[i].isOptionalDef()) {
// Optional def must be a physical register.
- unsigned NumResults = CountResults(Node);
VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
assert(TargetRegisterInfo::isPhysicalRegister(VRBase));
MIB.addReg(VRBase, RegState::Define);
@@ -589,7 +589,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
} else
AddOperand(MIB, N0, 0, nullptr, VRBaseMap, /*IsDebug=*/false,
IsClone, IsCloned);
- // Add the subregster being inserted
+ // Add the subregister being inserted
AddOperand(MIB, N1, 0, nullptr, VRBaseMap, /*IsDebug=*/false,
IsClone, IsCloned);
MIB.addImm(SubIdx);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index b002825..7e4bc3c 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -899,6 +899,35 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
}
}
+static TargetLowering::LegalizeAction
+getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) {
+ unsigned EqOpc;
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected FP pseudo-opcode");
+ case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break;
+ case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break;
+ case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break;
+ case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break;
+ case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break;
+ case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break;
+ case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break;
+ case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break;
+ case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break;
+ case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break;
+ case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break;
+ case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
+ }
+
+ auto Action = TLI.getOperationAction(EqOpc, VT);
+
+ // We don't currently handle Custom or Promote for strict FP pseudo-ops.
+ // For now, we just expand for those cases.
+ if (Action != TargetLowering::Legal)
+ Action = TargetLowering::Expand;
+
+ return Action;
+}
+
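(Aside, not part of the patch: the mapping idea above reduces to "a STRICT_* opcode inherits its twin's action, except anything other than Legal degrades to Expand", since no strict-specific Custom/Promote hooks exist yet. A toy restatement with stand-in enums; the names are illustrative, not LLVM's.)

    #include <cassert>

    enum Action { Legal, Custom, Promote, Expand };

    // Stand-in for getStrictFPOpcodeAction's final step: only Legal survives;
    // Custom and Promote fall back to Expand for the strict pseudo-ops.
    Action strictActionFrom(Action NonStrict) {
      return NonStrict == Legal ? Legal : Expand;
    }

    int main() {
      assert(strictActionFrom(Legal) == Legal);
      assert(strictActionFrom(Custom) == Expand);
      assert(strictActionFrom(Promote) == Expand);
    }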
/// Return a legal replacement for the given operation, with all legal operands.
void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
@@ -994,7 +1023,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
break;
case ISD::EXTRACT_ELEMENT:
case ISD::FLT_ROUNDS_:
- case ISD::FPOWI:
case ISD::MERGE_VALUES:
case ISD::EH_RETURN:
case ISD::FRAME_TO_ARGS_OFFSET:
@@ -1043,6 +1071,25 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
return;
}
break;
+ case ISD::STRICT_FSQRT:
+ case ISD::STRICT_FPOW:
+ case ISD::STRICT_FPOWI:
+ case ISD::STRICT_FSIN:
+ case ISD::STRICT_FCOS:
+ case ISD::STRICT_FEXP:
+ case ISD::STRICT_FEXP2:
+ case ISD::STRICT_FLOG:
+ case ISD::STRICT_FLOG10:
+ case ISD::STRICT_FLOG2:
+ case ISD::STRICT_FRINT:
+ case ISD::STRICT_FNEARBYINT:
+ // These pseudo-ops get legalized as if they were their non-strict
+      // equivalents. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
+ // is also legal, but if ISD::FSQRT requires expansion then so does
+ // ISD::STRICT_FSQRT.
+ Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(),
+ Node->getValueType(0));
+ break;
default:
if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
@@ -1192,8 +1239,11 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
// If the index is dependent on the store we will introduce a cycle when
// creating the load (the load uses the index, and by replacing the chain
- // we will make the index dependent on the load).
- if (SDNode::hasPredecessorHelper(ST, Visited, Worklist))
+ // we will make the index dependent on the load). Also, the store might be
+ // dependent on the extractelement and introduce a cycle when creating
+ // the load.
+ if (SDNode::hasPredecessorHelper(ST, Visited, Worklist) ||
+ ST->hasPredecessor(Op.getNode()))
continue;
StackPtr = ST->getBasePtr();
@@ -1340,7 +1390,7 @@ void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
// Convert to an integer of the same size.
if (TLI.isTypeLegal(IVT)) {
State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value);
- State.SignMask = APInt::getSignBit(NumBits);
+ State.SignMask = APInt::getSignMask(NumBits);
State.SignBit = NumBits - 1;
return;
}
@@ -1490,7 +1540,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
@@ -1909,8 +1959,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op;
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1935,9 +1985,14 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
InChain = TCChain;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(SDLoc(Node))
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setTailCall(isTailCall)
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned)
+ .setIsPostTypeLegalization(true);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -1960,8 +2015,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
for (unsigned i = 0; i != NumOps; ++i) {
Entry.Node = Ops[i];
Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1970,9 +2025,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned)
+ .setIsPostTypeLegalization(true);
std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -1994,8 +2053,8 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i);
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2004,9 +2063,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(SDLoc(Node))
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2019,6 +2081,9 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
RTLIB::Libcall Call_F80,
RTLIB::Libcall Call_F128,
RTLIB::Libcall Call_PPCF128) {
+ if (Node->isStrictFPOpcode())
+ Node = DAG.mutateStrictFPToFP(Node);
+
RTLIB::Libcall LC;
switch (Node->getSimpleValueType(0).SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
@@ -2081,8 +2146,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op;
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
@@ -2090,8 +2155,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
Entry.Node = FIPtr;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2099,9 +2164,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2126,19 +2194,6 @@ static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
return TLI.getLibcallName(LC) != nullptr;
}
-/// Return true if sincos libcall is available and can be used to combine sin
-/// and cos.
-static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
- const TargetMachine &TM) {
- if (!isSinCosLibcallAvailable(Node, TLI))
- return false;
- // GNU sin/cos functions set errno while sincos does not. Therefore
- // combining sin and cos is only safe if unsafe-fpmath is enabled.
- if (TM.getTargetTriple().isGNUEnvironment() && !TM.Options.UnsafeFPMath)
- return false;
- return true;
-}
-
/// Only issue sincos libcall if both sin and cos are needed.
static bool useSinCos(SDNode *Node) {
unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
@@ -2185,24 +2240,24 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
// Pass the argument.
Entry.Node = Node->getOperand(0);
Entry.Ty = RetTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
// Pass the return address of sin.
SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
Entry.Node = SinPtr;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
// Also pass the return address of the cos.
SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
Entry.Node = CosPtr;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2210,9 +2265,9 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
SDLoc dl(Node);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC),
- Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args));
+ CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
+ TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
+ std::move(Args));
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2529,12 +2584,12 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0);
APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0);
for (unsigned J = 0; J != Sz; J += 8) {
- MaskHi4 = MaskHi4.Or(APInt(Sz, 0xF0ull << J));
- MaskLo4 = MaskLo4.Or(APInt(Sz, 0x0Full << J));
- MaskHi2 = MaskHi2.Or(APInt(Sz, 0xCCull << J));
- MaskLo2 = MaskLo2.Or(APInt(Sz, 0x33ull << J));
- MaskHi1 = MaskHi1.Or(APInt(Sz, 0xAAull << J));
- MaskLo1 = MaskLo1.Or(APInt(Sz, 0x55ull << J));
+ MaskHi4 = MaskHi4 | (0xF0ull << J);
+ MaskLo4 = MaskLo4 | (0x0Full << J);
+ MaskHi2 = MaskHi2 | (0xCCull << J);
+ MaskLo2 = MaskLo2 | (0x33ull << J);
+ MaskHi1 = MaskHi1 | (0xAAull << J);
+ MaskLo1 = MaskLo1 | (0x55ull << J);
}
// BSWAP if the type is wider than a single byte.
@@ -2573,7 +2628,7 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));
APInt Shift(Sz, 1);
- Shift = Shift.shl(J);
+ Shift <<= J;
Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2);
}
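(Aside: the six masks built above are the classic byte-wise divide-and-conquer bit-reversal constants, splatted across every byte of the wider type. The same trick on a single byte, as a standalone C++ sketch not taken from the patch.)

    #include <cstdint>
    #include <cstdio>

    // Reverse the bits of one byte using the same 0xF0/0x0F, 0xCC/0x33,
    // 0xAA/0x55 mask pairs the loop above replicates across every byte.
    uint8_t reverseByte(uint8_t B) {
      B = ((B & 0xF0) >> 4) | ((B & 0x0F) << 4); // swap nibbles
      B = ((B & 0xCC) >> 2) | ((B & 0x33) << 2); // swap bit pairs
      B = ((B & 0xAA) >> 1) | ((B & 0x55) << 1); // swap adjacent bits
      return B;
    }

    int main() {
      std::printf("%02x\n", reverseByte(0x01)); // prints 80
    }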
@@ -2968,7 +3023,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT NVT = Node->getValueType(0);
APFloat apf(DAG.EVTToAPFloatSemantics(VT),
APInt::getNullValue(VT.getSizeInBits()));
- APInt x = APInt::getSignBit(NVT.getSizeInBits());
+ APInt x = APInt::getSignMask(NVT.getSizeInBits());
(void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
Tmp1 = DAG.getConstantFP(apf, dl, VT);
Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
@@ -3091,7 +3146,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
TLI.getVectorIdxTy(DAG.getDataLayout()))));
}
- Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ Tmp1 = DAG.getBuildVector(VT, dl, Ops);
// We may have changed the BUILD_VECTOR type. Cast it back to the Node type.
Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);
Results.push_back(Tmp1);
@@ -3181,7 +3236,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// Turn fsin / fcos into ISD::FSINCOS node if there are a pair of fsin /
// fcos which share the same operand and both are used.
if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) ||
- canCombineSinCosLibcall(Node, TLI, TM))
+ isSinCosLibcallAvailable(Node, TLI))
&& useSinCos(Node)) {
SDVTList VTs = DAG.getVTList(VT, VT);
Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
@@ -3237,7 +3292,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT VT = Node->getValueType(0);
if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) {
- const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(Node)->Flags;
+ const SDNodeFlags Flags = Node->getFlags();
Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags);
Results.push_back(Tmp1);
@@ -3477,17 +3532,24 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
LC = RTLIB::MUL_I128;
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
- // The high part is obtained by SRA'ing all but one of the bits of low
- // part.
- unsigned LoSize = VT.getSizeInBits();
- SDValue HiLHS =
- DAG.getNode(ISD::SRA, dl, VT, RHS,
- DAG.getConstant(LoSize - 1, dl,
- TLI.getPointerTy(DAG.getDataLayout())));
- SDValue HiRHS =
- DAG.getNode(ISD::SRA, dl, VT, LHS,
- DAG.getConstant(LoSize - 1, dl,
- TLI.getPointerTy(DAG.getDataLayout())));
+ SDValue HiLHS;
+ SDValue HiRHS;
+ if (isSigned) {
+ // The high part is obtained by SRA'ing all but one of the bits of low
+ // part.
+ unsigned LoSize = VT.getSizeInBits();
+ HiLHS =
+ DAG.getNode(ISD::SRA, dl, VT, LHS,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ HiRHS =
+ DAG.getNode(ISD::SRA, dl, VT, RHS,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ } else {
+ HiLHS = DAG.getConstant(0, dl, VT);
+ HiRHS = DAG.getConstant(0, dl, VT);
+ }
// Here we're passing the 2 arguments explicitly as 4 arguments that are
// pre-lowered to the correct types. This all depends upon WideVT not
@@ -3505,16 +3567,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
}
- BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
- DAG.getIntPtrConstant(0, dl));
- TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
- DAG.getIntPtrConstant(1, dl));
- // Ret is a node with an illegal type. Because such things are not
- // generally permitted during this phase of legalization, make sure the
- // node has no more uses. The above EXTRACT_ELEMENT nodes should have been
- // folded.
- assert(Ret->use_empty() &&
- "Unexpected uses of illegally type from expanded lib call.");
+ assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
+ "Ret value is a collection of constituent nodes holding result.");
+ BottomHalf = Ret.getOperand(0);
+ TopHalf = Ret.getOperand(1);
}
if (isSigned) {
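(For orientation, not part of the patch: the libcall computes a double-width product from the pre-lowered half-operands, with the high halves sign-filled for signed inputs (the SRA above) and zero for unsigned. A plain C++ stand-in for the unsigned case, building a 64x64->128 product from 32x32->64 partial products.)

    #include <cstdint>
    #include <cstdio>

    // Minimal stand-in for the widened multiply the libcall performs.
    void mulu64(uint64_t A, uint64_t B, uint64_t &Lo, uint64_t &Hi) {
      const uint64_t M = 0xffffffffull;
      uint64_t LL = (A & M) * (B & M);
      uint64_t LH = (A & M) * (B >> 32);
      uint64_t HL = (A >> 32) * (B & M);
      uint64_t HH = (A >> 32) * (B >> 32);
      uint64_t Mid = (LL >> 32) + (LH & M) + (HL & M); // fits in 64 bits
      Lo = (Mid << 32) | (LL & M);
      Hi = HH + (LH >> 32) + (HL >> 32) + (Mid >> 32);
    }

    int main() {
      uint64_t Lo, Hi;
      mulu64(~0ull, ~0ull, Lo, Hi); // (2^64-1)^2
      std::printf("%016llx%016llx\n", (unsigned long long)Hi,
                  (unsigned long long)Lo); // fffffffffffffffe0000000000000001
    }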
@@ -3790,8 +3846,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
VT.getScalarType(), Ex, Sh));
}
- SDValue Result =
- DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars);
+
+ SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars);
ReplaceNode(SDValue(Node, 0), Result);
break;
}
@@ -3830,10 +3886,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Node->getOperand(0))
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__sync_synchronize",
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
+ .setLibCallee(
+ CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__sync_synchronize",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -3870,10 +3927,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Node->getOperand(0))
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("abort",
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(
+ "abort", TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
Results.push_back(CallResult.second);
@@ -3890,16 +3947,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::FMAX_PPCF128));
break;
case ISD::FSQRT:
+ case ISD::STRICT_FSQRT:
Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
RTLIB::SQRT_F80, RTLIB::SQRT_F128,
RTLIB::SQRT_PPCF128));
break;
case ISD::FSIN:
+ case ISD::STRICT_FSIN:
Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64,
RTLIB::SIN_F80, RTLIB::SIN_F128,
RTLIB::SIN_PPCF128));
break;
case ISD::FCOS:
+ case ISD::STRICT_FCOS:
Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64,
RTLIB::COS_F80, RTLIB::COS_F128,
RTLIB::COS_PPCF128));
@@ -3909,26 +3969,31 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
ExpandSinCosLibCall(Node, Results);
break;
case ISD::FLOG:
+ case ISD::STRICT_FLOG:
Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
RTLIB::LOG_F80, RTLIB::LOG_F128,
RTLIB::LOG_PPCF128));
break;
case ISD::FLOG2:
+ case ISD::STRICT_FLOG2:
Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
RTLIB::LOG2_F80, RTLIB::LOG2_F128,
RTLIB::LOG2_PPCF128));
break;
case ISD::FLOG10:
+ case ISD::STRICT_FLOG10:
Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
RTLIB::LOG10_F80, RTLIB::LOG10_F128,
RTLIB::LOG10_PPCF128));
break;
case ISD::FEXP:
+ case ISD::STRICT_FEXP:
Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
RTLIB::EXP_F80, RTLIB::EXP_F128,
RTLIB::EXP_PPCF128));
break;
case ISD::FEXP2:
+ case ISD::STRICT_FEXP2:
Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
RTLIB::EXP2_F80, RTLIB::EXP2_F128,
RTLIB::EXP2_PPCF128));
@@ -3949,11 +4014,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::CEIL_PPCF128));
break;
case ISD::FRINT:
+ case ISD::STRICT_FRINT:
Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64,
RTLIB::RINT_F80, RTLIB::RINT_F128,
RTLIB::RINT_PPCF128));
break;
case ISD::FNEARBYINT:
+ case ISD::STRICT_FNEARBYINT:
Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32,
RTLIB::NEARBYINT_F64,
RTLIB::NEARBYINT_F80,
@@ -3968,11 +4035,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::ROUND_PPCF128));
break;
case ISD::FPOWI:
+ case ISD::STRICT_FPOWI:
Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
RTLIB::POWI_F80, RTLIB::POWI_F128,
RTLIB::POWI_PPCF128));
break;
case ISD::FPOW:
+ case ISD::STRICT_FPOW:
Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
RTLIB::POW_F80, RTLIB::POW_F128,
RTLIB::POW_PPCF128));
@@ -4170,6 +4239,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
ReplacedNode(Node);
break;
}
+ case ISD::MUL:
case ISD::SDIV:
case ISD::SREM:
case ISD::UDIV:
@@ -4424,8 +4494,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
NewOps.push_back(Elt);
}
- SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps);
-
+ SDValue NewVec = DAG.getBuildVector(MidVT, SL, NewOps);
Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec));
break;
}
@@ -4519,6 +4588,14 @@ void SelectionDAG::Legalize() {
AssignTopologicalOrder();
SmallPtrSet<SDNode *, 16> LegalizedNodes;
+ // Use a delete listener to remove nodes which were deleted during
+  // legalization from LegalizedNodes. This is needed to handle the situation
+ // where a new node is allocated by the object pool to the same address of a
+ // previously deleted node.
+ DAGNodeDeletedListener DeleteListener(
+ *this,
+ [&LegalizedNodes](SDNode *N, SDNode *E) { LegalizedNodes.erase(N); });
+
SelectionDAGLegalize Legalizer(*this, LegalizedNodes);
// Visit all the nodes. We start in topological order, so that we see
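(Aside: the listener added above guards a pointer-keyed cache against the object pool recycling a freed node's address. A generic sketch of that hazard and the hook pattern; the types and names here are illustrative, not LLVM's.)

    #include <cstdio>
    #include <functional>
    #include <unordered_set>

    struct Node { int Id; };

    struct Pool {
      std::function<void(Node *)> OnDelete; // the "delete listener"
      std::unordered_set<Node *> Cache;     // stands in for LegalizedNodes

      void destroy(Node *N) {
        if (OnDelete)
          OnDelete(N); // purge stale cache entries before the address can
        delete N;      // be handed out again
      }
    };

    int main() {
      Pool P;
      P.OnDelete = [&P](Node *N) { P.Cache.erase(N); };
      Node *A = new Node{1};
      P.Cache.insert(A);
      P.destroy(A); // without the hook, a later allocation at A's address
                    // would spuriously look "already legalized"
      std::printf("cache size after delete: %zu\n", P.Cache.size()); // 0
    }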
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 72b56d8..eaf177d 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -72,7 +72,7 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break;
case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break;
case ISD::EXTRACT_VECTOR_ELT:
- R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break;
+ R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break;
case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break;
case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break;
case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break;
@@ -112,15 +112,15 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break;
}
- // If R is null, the sub-method took care of registering the result.
- if (R.getNode()) {
+ if (R.getNode() && R.getNode() != N) {
SetSoftenedFloat(SDValue(N, ResNo), R);
- ReplaceSoftenFloatResult(N, ResNo, R);
+ // Return true only if the node is changed, assuming that the operands
+ // are also converted when necessary.
+ return true;
}
- // Return true only if the node is changed,
- // assuming that the operands are also converted when necessary.
+
// Otherwise, return false to tell caller to scan operands.
- return R.getNode() && R.getNode() != N;
+ return false;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) {
@@ -171,7 +171,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) {
}
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo) {
+ // When LegalInHWReg, keep the extracted value in register.
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
NewOp.getValueType().getVectorElementType(),
@@ -459,7 +462,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) {
Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);
if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat)
- SoftenFloatResult(Op.getNode(), 0);
+ AddToWorklist(Op.getNode());
}
if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) {
@@ -472,8 +475,6 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
}
RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
- if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat)
- Op = GetSoftenedFloat(Op);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first;
}
@@ -752,12 +753,17 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
llvm_unreachable("Do not know how to soften this operator's operand!");
case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break;
+ case ISD::CopyToReg: Res = SoftenFloatOp_COPY_TO_REG(N); break;
case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break;
+ case ISD::FABS: Res = SoftenFloatOp_FABS(N); break;
+ case ISD::FCOPYSIGN: Res = SoftenFloatOp_FCOPYSIGN(N); break;
+ case ISD::FNEG: Res = SoftenFloatOp_FNEG(N); break;
case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break;
case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes
case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break;
+ case ISD::SELECT: Res = SoftenFloatOp_SELECT(N); break;
case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break;
case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break;
case ISD::STORE:
@@ -790,9 +796,9 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) {
if (!isLegalInHWReg(N->getOperand(OpNo).getValueType()))
return false;
- // When the operand type can be kept in registers, SoftenFloatResult
- // will call ReplaceValueWith to replace all references and we can
- // skip softening this operand.
+
+ // When the operand type can be kept in registers there is nothing to do for
+ // the following opcodes.
switch (N->getOperand(OpNo).getOpcode()) {
case ISD::BITCAST:
case ISD::ConstantFP:
@@ -806,18 +812,12 @@ bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::SELECT_CC:
return true;
}
- // For some opcodes, SoftenFloatResult handles all conversion of softening
- // and replacing operands, so that there is no need to soften operands
- // again, although such opcode could be scanned for other illegal operands.
+
switch (N->getOpcode()) {
- case ISD::ConstantFP:
- case ISD::CopyFromReg:
- case ISD::CopyToReg:
- case ISD::FABS:
- case ISD::FCOPYSIGN:
- case ISD::FNEG:
- case ISD::Register:
- case ISD::SELECT:
+ case ISD::ConstantFP: // Leaf node.
+ case ISD::CopyFromReg: // Operand is a register that we know to be left
+ // unchanged by SoftenFloatResult().
+ case ISD::Register: // Leaf node.
return true;
}
return false;
@@ -828,6 +828,21 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
GetSoftenedFloat(N->getOperand(0)));
}
+SDValue DAGTypeLegalizer::SoftenFloatOp_COPY_TO_REG(SDNode *N) {
+ SDValue Op1 = GetSoftenedFloat(N->getOperand(1));
+ SDValue Op2 = GetSoftenedFloat(N->getOperand(2));
+
+ if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2))
+ return SDValue();
+
+ if (N->getNumOperands() == 3)
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), 0);
+
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2,
+ N->getOperand(3)),
+ 0);
+}
+
SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) {
// If we get here, the result must be legal but the source illegal.
EVT SVT = N->getOperand(0).getValueType();
@@ -883,6 +898,34 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
0);
}
+SDValue DAGTypeLegalizer::SoftenFloatOp_FABS(SDNode *N) {
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+
+ if (Op == N->getOperand(0))
+ return SDValue();
+
+ return SDValue(DAG.UpdateNodeOperands(N, Op), 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) {
+ SDValue Op0 = GetSoftenedFloat(N->getOperand(0));
+ SDValue Op1 = GetSoftenedFloat(N->getOperand(1));
+
+ if (Op0 == N->getOperand(0) && Op1 == N->getOperand(1))
+ return SDValue();
+
+ return SDValue(DAG.UpdateNodeOperands(N, Op0, Op1), 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FNEG(SDNode *N) {
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+
+ if (Op == N->getOperand(0))
+ return SDValue();
+
+ return SDValue(DAG.UpdateNodeOperands(N, Op), 0);
+}
+
SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
bool Signed = N->getOpcode() == ISD::FP_TO_SINT;
EVT SVT = N->getOperand(0).getValueType();
@@ -912,6 +955,17 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res);
}
+SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT(SDNode *N) {
+ SDValue Op1 = GetSoftenedFloat(N->getOperand(1));
+ SDValue Op2 = GetSoftenedFloat(N->getOperand(2));
+
+ if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2))
+ return SDValue();
+
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2),
+ 0);
+}
+
SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
@@ -1054,15 +1108,15 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
- assert(NVT.getSizeInBits() == integerPartWidth &&
+ assert(NVT.getSizeInBits() == 64 &&
"Do not know how to expand this float constant!");
APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
SDLoc dl(N);
Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
- APInt(integerPartWidth, C.getRawData()[1])),
+ APInt(64, C.getRawData()[1])),
dl, NVT);
Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
- APInt(integerPartWidth, C.getRawData()[0])),
+ APInt(64, C.getRawData()[0])),
dl, NVT);
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index dc436ce..75fec7b 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -21,6 +21,7 @@
#include "LegalizeTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -134,6 +135,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::SMULO:
case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break;
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: Res = PromoteIntRes_ADDSUBCARRY(N, ResNo); break;
+
case ISD::ATOMIC_LOAD:
Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
@@ -510,9 +514,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
// Simply change the return type of the boolean result.
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
EVT ValueVTs[] = { N->getValueType(0), NVT };
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Ops[3] = { N->getOperand(0), N->getOperand(1) };
+ unsigned NumOps = N->getNumOperands();
+ assert(NumOps <= 3 && "Too many operands");
+ if (NumOps == 3)
+ Ops[2] = N->getOperand(2);
+
SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
- DAG.getVTList(ValueVTs), Ops);
+ DAG.getVTList(ValueVTs), makeArrayRef(Ops, NumOps));
// Modified the sum result - switch anything that used the old sum to use
// the new one.
@@ -606,9 +615,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS,
N->getOperand(2));
- assert(NVT.bitsLE(SVT) && "Integer type overpromoted?");
// Convert to the expected type.
- return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC);
+ return DAG.getSExtOrTrunc(SetCC, dl, NVT);
}
SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
@@ -690,7 +698,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
case TargetLowering::TypePromoteInteger:
Res = GetPromotedInteger(InOp);
break;
- case TargetLowering::TypeSplitVector:
+ case TargetLowering::TypeSplitVector: {
EVT InVT = InOp.getValueType();
assert(InVT.isVector() && "Cannot split scalar types");
unsigned NumElts = InVT.getVectorNumElements();
@@ -709,6 +717,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);
}
+ case TargetLowering::TypeWidenVector: {
+ SDValue WideInOp = GetWidenedVector(InOp);
+
+ // Truncate widened InOp.
+ unsigned NumElem = WideInOp.getValueType().getVectorNumElements();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(),
+ N->getValueType(0).getScalarType(), NumElem);
+ SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp);
+
+    // Zero extend so that the elements are of the same type as those of NVT
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(),
+ NumElem);
+ SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc);
+
+ // Extract the low NVT subvector.
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx);
+ }
+ }
// Truncate to NVT instead of VT
return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
@@ -742,6 +770,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
return Res;
}
+SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+ llvm_unreachable("Not implemented");
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
// Promote the overflow bit trivially.
if (ResNo == 1)
@@ -904,6 +938,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::SRL:
case ISD::ROTL:
case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: Res = PromoteIntOp_ADDSUBCARRY(N, OpNo); break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -1089,6 +1126,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
SDValue Cond = N->getOperand(0);
EVT OpTy = N->getOperand(1).getValueType();
+ if (N->getOpcode() == ISD::VSELECT)
+ if (SDValue Res = WidenVSELECTAndMask(N))
+ return Res;
+
// Promote all the way up to the canonical SetCC type.
EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
Cond = PromoteTargetBoolean(Cond, OpVT);
@@ -1252,6 +1293,30 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
N->getOperand(0).getValueType().getScalarType());
}
+SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 2 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDLoc DL(N);
+
+ auto VT = getSetCCResultType(LHS.getValueType());
+ TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(VT);
+ switch (BoolType) {
+ case TargetLoweringBase::UndefinedBooleanContent:
+ Carry = DAG.getAnyExtOrTrunc(Carry, DL, VT);
+ break;
+ case TargetLoweringBase::ZeroOrOneBooleanContent:
+ Carry = DAG.getZExtOrTrunc(Carry, DL, VT);
+ break;
+ case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ Carry = DAG.getSExtOrTrunc(Carry, DL, VT);
+ break;
+ }
+
+ return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0);
+}
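(Aside: the three-way switch above exists because targets disagree on how an i1 is materialized in a wider register: 0/1 (zero-extend), 0/-1 (sign-extend into an all-ones mask), or unspecified high bits (any-extend). A plain C++ illustration of the first two conventions, not taken from the patch.)

    #include <cstdint>
    #include <cstdio>

    int32_t zextBool(bool B) { return B ? 1 : 0; }  // ZeroOrOne content
    int32_t sextBool(bool B) { return B ? -1 : 0; } // ZeroOrNegativeOne content

    int main() {
      std::printf("zext: %d, sext: %d\n", zextBool(true), sextBool(true));
      // zext: 1, sext: -1 (0xffffffff, i.e. an all-ones mask)
    }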
//===----------------------------------------------------------------------===//
// Integer Result Expansion
@@ -1371,6 +1436,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ADDE:
case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break;
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: ExpandIntRes_ADDSUBCARRY(N, Lo, Hi); break;
+
case ISD::SHL:
case ISD::SRA:
case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break;
@@ -1501,11 +1569,11 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDLoc dl(N);
APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits));
- APInt KnownZero, KnownOne;
- DAG.computeKnownBits(N->getOperand(1), KnownZero, KnownOne);
+ KnownBits Known;
+ DAG.computeKnownBits(N->getOperand(1), Known);
// If we don't know anything about the high bits, exit.
- if (((KnownZero|KnownOne) & HighBitMask) == 0)
+ if (((Known.Zero|Known.One) & HighBitMask) == 0)
return false;
// Get the incoming operand to be shifted.
@@ -1514,7 +1582,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
// If we know that any of the high bits of the shift amount are one, then we
// can do this as a couple of simple shifts.
- if (KnownOne.intersects(HighBitMask)) {
+ if (Known.One.intersects(HighBitMask)) {
// Mask out the high bit, which we know is set.
Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt,
DAG.getConstant(~HighBitMask, dl, ShTy));
@@ -1539,7 +1607,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
// If we know that all of the high bits of the shift amount are zero, then we
// can do this as a couple of simple shifts.
- if ((KnownZero & HighBitMask) == HighBitMask) {
+ if (HighBitMask.isSubsetOf(Known.Zero)) {
// Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined
// shift if x is zero. We can use XOR here because x is known to be smaller
// than 32.
@@ -1714,6 +1782,23 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
SDValue LoOps[2] = { LHSL, RHSL };
SDValue HiOps[3] = { LHSH, RHSH };
+ bool HasOpCarry = TLI.isOperationLegalOrCustom(
+ N->getOpcode() == ISD::ADD ? ISD::ADDCARRY : ISD::SUBCARRY,
+ TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+ if (HasOpCarry) {
+ SDVTList VTList = DAG.getVTList(NVT, getSetCCResultType(NVT));
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, HiOps);
+ } else {
+ Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::SUBCARRY, dl, VTList, HiOps);
+ }
+ return;
+ }
+
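(Aside: the new fast path expands a double-width add as UADDO on the low halves followed by ADDCARRY on the high halves, and USUBO/SUBCARRY for subtraction. The same computation in plain C++, as a sketch.)

    #include <cstdint>
    #include <cstdio>

    // (Hi:Lo) = (AHi:ALo) + (BHi:BLo), carried out half by half.
    void add128(uint64_t ALo, uint64_t AHi, uint64_t BLo, uint64_t BHi,
                uint64_t &Lo, uint64_t &Hi) {
      Lo = ALo + BLo;
      uint64_t Carry = Lo < ALo; // UADDO's overflow bit
      Hi = AHi + BHi + Carry;    // ADDCARRY consumes the bit
    }

    int main() {
      uint64_t Lo, Hi;
      add128(~0ull, 0, 1, 0, Lo, Hi);
      std::printf("hi=%llu lo=%llu\n", (unsigned long long)Hi,
                  (unsigned long long)Lo); // hi=1 lo=0
    }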
// Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support
// them. TODO: Teach operation legalization how to expand unsupported
// ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate
@@ -1742,9 +1827,11 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
ISD::UADDO : ISD::USUBO,
TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+ TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT);
+
if (hasOVF) {
- SDVTList VTList = DAG.getVTList(NVT, NVT);
- TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT);
+ EVT OvfVT = getSetCCResultType(NVT);
+ SDVTList VTList = DAG.getVTList(NVT, OvfVT);
int RevOpc;
if (N->getOpcode() == ISD::ADD) {
RevOpc = ISD::SUB;
@@ -1759,12 +1846,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
switch (BoolType) {
case TargetLoweringBase::UndefinedBooleanContent:
- OVF = DAG.getNode(ISD::AND, dl, NVT, DAG.getConstant(1, dl, NVT), OVF);
+ OVF = DAG.getNode(ISD::AND, dl, OvfVT, DAG.getConstant(1, dl, OvfVT), OVF);
LLVM_FALLTHROUGH;
case TargetLoweringBase::ZeroOrOneBooleanContent:
+ OVF = DAG.getZExtOrTrunc(OVF, dl, NVT);
Hi = DAG.getNode(N->getOpcode(), dl, NVT, Hi, OVF);
break;
case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ OVF = DAG.getSExtOrTrunc(OVF, dl, NVT);
Hi = DAG.getNode(RevOpc, dl, NVT, Hi, OVF);
}
return;
@@ -1775,6 +1864,13 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2));
SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0],
ISD::SETULT);
+
+ if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent) {
+ SDValue Carry = DAG.getZExtOrTrunc(Cmp1, dl, NVT);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry);
+ return;
+ }
+
SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1,
DAG.getConstant(1, dl, NVT),
DAG.getConstant(0, dl, NVT));
@@ -1789,9 +1885,14 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
SDValue Cmp =
DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()),
LoOps[0], LoOps[1], ISD::SETULT);
- SDValue Borrow = DAG.getSelect(dl, NVT, Cmp,
- DAG.getConstant(1, dl, NVT),
- DAG.getConstant(0, dl, NVT));
+
+ SDValue Borrow;
+ if (BoolType == TargetLoweringBase::ZeroOrOneBooleanContent)
+ Borrow = DAG.getZExtOrTrunc(Cmp, dl, NVT);
+ else
+ Borrow = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT),
+ DAG.getConstant(0, dl, NVT));
+
Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow);
}
}
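// Illustration only, not part of the patch (hypothetical helper): the generic
// path above is the schoolbook double-word add, where the carry out of the low
// half is exactly the SETULT computed above, Sum < addend.
#include <cstdint>
static void addDoubleWord(uint64_t LHSL, uint64_t LHSH, uint64_t RHSL,
                          uint64_t RHSH, uint64_t &Lo, uint64_t &Hi) {
  Lo = LHSL + RHSL;           // low halves; wraps when there is a carry
  uint64_t Carry = Lo < LHSL; // wrapped iff the sum is below an addend
  Hi = LHSH + RHSH + Carry;
}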
@@ -1842,6 +1943,71 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
}
+void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDLoc dl(N);
+
+ SDValue Ovf;
+
+ bool HasOpCarry = TLI.isOperationLegalOrCustom(
+ N->getOpcode() == ISD::ADD ? ISD::ADDCARRY : ISD::SUBCARRY,
+ TLI.getTypeToExpandTo(*DAG.getContext(), LHS.getValueType()));
+
+ if (HasOpCarry) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(LHS, LHSL, LHSH);
+ GetExpandedInteger(RHS, RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1));
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ unsigned Opc = N->getOpcode() == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
+ Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(Opc, dl, VTList, HiOps);
+
+ Ovf = Hi.getValue(1);
+ } else {
+ // Expand the result by simply replacing it with the equivalent
+ // non-overflow-checking operation.
+ auto Opc = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB;
+ SDValue Sum = DAG.getNode(Opc, dl, LHS.getValueType(), LHS, RHS);
+ SplitInteger(Sum, Lo, Hi);
+
+ // Calculate the overflow: addition overflows iff a + b < a, and subtraction
+ // overflows iff a - b > a.
+ auto Cond = N->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;
+ Ovf = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS, Cond);
+ }
+
+ // Legalized the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Ovf);
+}
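// Illustration only (hypothetical helpers): the overflow predicates used on
// the fallback path above. With 8-bit values, 200 + 100 wraps to 44 and
// 44 < 200, so the addition overflowed; 5 - 10 wraps to 251 and 251 > 5, so
// the subtraction borrowed.
#include <cstdint>
static bool uaddOverflows(uint64_t A, uint64_t B) { return A + B < A; }
static bool usubOverflows(uint64_t A, uint64_t B) { return A - B > A; }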
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBCARRY(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), N->getValueType(1));
+ SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) };
+ SDValue HiOps[3] = { LHSH, RHSH, SDValue() };
+
+ Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps);
+
+ // Legalized the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
SDValue &Lo, SDValue &Hi) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
@@ -2508,29 +2674,6 @@ void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
}
-void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
- SDValue &Lo, SDValue &Hi) {
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
- SDLoc dl(N);
-
- // Expand the result by simply replacing it with the equivalent
- // non-overflow-checking operation.
- SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ?
- ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
- LHS, RHS);
- SplitInteger(Sum, Lo, Hi);
-
- // Calculate the overflow: addition overflows iff a + b < a, and subtraction
- // overflows iff a - b > a.
- SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS,
- N->getOpcode () == ISD::UADDO ?
- ISD::SETULT : ISD::SETUGT);
-
- // Use the calculated overflow everywhere.
- ReplaceValueWith(SDValue(N, 1), Ofl);
-}
-
void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
SDValue &Lo, SDValue &Hi) {
EVT VT = N->getValueType(0);
@@ -2586,24 +2729,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op;
Entry.Ty = ArgTy;
- Entry.isSExt = true;
- Entry.isZExt = false;
+ Entry.IsSExt = true;
+ Entry.IsZExt = false;
Args.push_back(Entry);
}
// Also pass the address of the overflow check.
Entry.Node = Temp;
Entry.Ty = PtrTy->getPointerTo();
- Entry.isSExt = true;
- Entry.isZExt = false;
+ Entry.IsSExt = true;
+ Entry.IsZExt = false;
Args.push_back(Entry);
SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
- .setSExtResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
+ .setSExtResult();
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2743,6 +2887,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break;
case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break;
case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break;
+ case ISD::SETCCCARRY: Res = ExpandIntOp_SETCCCARRY(N); break;
case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break;
case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break;
@@ -2877,14 +3022,16 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
return;
}
- // Lower with SETCCE if the target supports it.
+ // Lower with SETCCE or SETCCCARRY if the target supports it.
+ EVT HiVT = LHSHi.getValueType();
+ EVT ExpandVT = TLI.getTypeToExpandTo(*DAG.getContext(), HiVT);
+ bool HasSETCCCARRY = TLI.isOperationLegalOrCustom(ISD::SETCCCARRY, ExpandVT);
+
// FIXME: Make all targets support this, then remove the other lowering.
- if (TLI.getOperationAction(
- ISD::SETCCE,
- TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) ==
- TargetLowering::Custom) {
- // SETCCE can detect < and >= directly. For > and <=, flip operands and
- // condition code.
+ if (HasSETCCCARRY ||
+ TLI.getOperationAction(ISD::SETCCE, ExpandVT) == TargetLowering::Custom) {
+ // SETCCE/SETCCCARRY can detect < and >= directly. For > and <=, flip
+ // operands and condition code.
bool FlipOperands = false;
switch (CCCode) {
case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break;
@@ -2898,27 +3045,28 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
std::swap(LHSHi, RHSHi);
}
// Perform a wide subtraction, feeding the carry from the low part into
- // SETCCE. The SETCCE operation is essentially looking at the high part of
- // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or
- // positive iff LHS >= RHS.
- SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue);
- SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo);
- SDValue Res =
- DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()),
- LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode));
+ // SETCCE/SETCCCARRY. The SETCCE/SETCCCARRY operation is essentially
+ // looking at the high part of the result of LHS - RHS. It is negative
+ // iff LHS < RHS. It is zero or positive iff LHS >= RHS.
+ EVT LoVT = LHSLo.getValueType();
+ SDVTList VTList = DAG.getVTList(
+ LoVT, HasSETCCCARRY ? getSetCCResultType(LoVT) : MVT::Glue);
+ SDValue LowCmp = DAG.getNode(HasSETCCCARRY ? ISD::USUBO : ISD::SUBC, dl,
+ VTList, LHSLo, RHSLo);
+ SDValue Res = DAG.getNode(HasSETCCCARRY ? ISD::SETCCCARRY : ISD::SETCCE, dl,
+ getSetCCResultType(HiVT), LHSHi, RHSHi,
+ LowCmp.getValue(1), DAG.getCondCode(CCCode));
NewLHS = Res;
NewRHS = SDValue();
return;
}
- NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()),
- LHSHi, RHSHi, ISD::SETEQ, false,
- DagCombineInfo, dl);
+ NewLHS = TLI.SimplifySetCC(getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ,
+ false, DagCombineInfo, dl);
if (!NewLHS.getNode())
- NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
- LHSHi, RHSHi, ISD::SETEQ);
- NewLHS = DAG.getSelect(dl, LoCmp.getValueType(),
- NewLHS, LoCmp, HiCmp);
+ NewLHS =
+ DAG.getSetCC(dl, getSetCCResultType(HiVT), LHSHi, RHSHi, ISD::SETEQ);
+ NewLHS = DAG.getSelect(dl, LoCmp.getValueType(), NewLHS, LoCmp, HiCmp);
NewRHS = SDValue();
}
@@ -2971,8 +3119,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
}
// Otherwise, update N to have the operands specified.
- return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
- DAG.getCondCode(CCCode)), 0);
+ return SDValue(
+ DAG.UpdateNodeOperands(N, NewLHS, NewRHS, DAG.getCondCode(CCCode)), 0);
}
SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) {
@@ -2993,6 +3141,24 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) {
LowCmp.getValue(1), Cond);
}
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCCCARRY(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDValue Cond = N->getOperand(3);
+ SDLoc dl = SDLoc(N);
+
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedInteger(LHS, LHSLo, LHSHi);
+ GetExpandedInteger(RHS, RHSLo, RHSHi);
+
+ // Expand to a SUBCARRY for the low part and a smaller SETCCCARRY for the high.
+ SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), Carry.getValueType());
+ SDValue LowCmp = DAG.getNode(ISD::SUBCARRY, dl, VTList, LHSLo, RHSLo, Carry);
+ return DAG.getNode(ISD::SETCCCARRY, dl, N->getValueType(0), LHSHi, RHSHi,
+ LowCmp.getValue(1), Cond);
+}
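// Illustration only (hypothetical helper; uses the compiler's __int128 merely
// for clarity): a double-word signed compare via the borrow chain. LHS < RHS
// iff the full-width difference is negative, and that sign is decided by
// HiL - HiR - Borrow, which is the quantity SETCCCARRY models.
#include <cstdint>
static bool sltDoubleWord(uint64_t LoL, int64_t HiL,
                          uint64_t LoR, int64_t HiR) {
  uint64_t Borrow = LoL < LoR; // borrow out of the low-half subtraction
  return (__int128)HiL - HiR - Borrow < 0;
}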
+
SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
// The value being shifted is legal, but the shift amount is too big.
// It follows that either the result of the shift is undefined, or the
@@ -3226,7 +3392,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
Ops.push_back(Op);
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops);
+ return DAG.getBuildVector(NOutVT, dl, Ops);
}
@@ -3269,7 +3435,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
Ops.push_back(Op);
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops);
+ return DAG.getBuildVector(NOutVT, dl, Ops);
}
SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
@@ -3317,7 +3483,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops);
+ return DAG.getBuildVector(NOutVT, dl, Ops);
}
SDValue DAGTypeLegalizer::PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N) {
@@ -3420,5 +3586,5 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), NewOps);
+ return DAG.getBuildVector(N->getValueType(0), dl, NewOps);
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index cf19d75..001eed9 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -80,6 +80,7 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) {
SDValue Res(&Node, i);
+ EVT VT = Res.getValueType();
bool Failed = false;
unsigned Mapped = 0;
@@ -129,13 +130,17 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
dbgs() << "Unprocessed value in a map!";
Failed = true;
}
- } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) {
+ } else if (isTypeLegal(VT) || IgnoreNodeResults(&Node)) {
if (Mapped > 1) {
dbgs() << "Value with legal type was transformed!";
Failed = true;
}
} else {
- if (Mapped == 0) {
+ // If the value can be kept in HW registers, the softening machinery may
+ // leave it unchanged and not put it in any map.
+ if (Mapped == 0 &&
+ !(getTypeAction(VT) == TargetLowering::TypeSoftenFloat &&
+ isLegalInHWReg(VT))) {
dbgs() << "Processed value not in any map!";
Failed = true;
} else if (Mapped & (Mapped - 1)) {
@@ -199,8 +204,7 @@ bool DAGTypeLegalizer::run() {
// non-leaves.
for (SDNode &Node : DAG.allnodes()) {
if (Node.getNumOperands() == 0) {
- Node.setNodeId(ReadyToProcess);
- Worklist.push_back(&Node);
+ AddToWorklist(&Node);
} else {
Node.setNodeId(Unanalyzed);
}
@@ -331,6 +335,7 @@ ScanOperands:
// to the worklist etc.
if (NeedsReanalyzing) {
assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+
N->setNodeId(NewNode);
// Recompute the NodeId and correct processed operands, adding the node to
// the worklist if ready.
@@ -918,9 +923,9 @@ SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
assert(Op.getValueType().isVector() && "Only applies to vectors!");
unsigned EltWidth = Op.getScalarValueSizeInBits();
EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
- unsigned NumElts = Op.getValueType().getVectorNumElements();
+ auto EltCnt = Op.getValueType().getVectorElementCount();
return DAG.getNode(ISD::BITCAST, SDLoc(Op),
- EVT::getVectorVT(*DAG.getContext(), EltNVT, NumElts), Op);
+ EVT::getVectorVT(*DAG.getContext(), EltNVT, EltCnt), Op);
}
SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
@@ -1077,8 +1082,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i);
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1087,9 +1092,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
- .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(SDLoc(Node))
+ .setChain(InChain)
+ .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+ std::move(Args))
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ec55662..c46d1b0 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -191,6 +191,11 @@ private:
void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
SDValue &Lo, SDValue &Hi);
+ void AddToWorklist(SDNode *N) {
+ N->setNodeId(ReadyToProcess);
+ Worklist.push_back(N);
+ }
+
//===--------------------------------------------------------------------===//
// Integer Promotion Support: LegalizeIntegerTypes.cpp
//===--------------------------------------------------------------------===//
@@ -274,6 +279,7 @@ private:
SDValue PromoteIntRes_SRL(SDNode *N);
SDValue PromoteIntRes_TRUNCATE(SDNode *N);
SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_UNDEF(SDNode *N);
SDValue PromoteIntRes_VAARG(SDNode *N);
SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
@@ -306,6 +312,7 @@ private:
SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -345,6 +352,7 @@ private:
void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBCARRY (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -373,6 +381,7 @@ private:
SDValue ExpandIntOp_SELECT_CC(SDNode *N);
SDValue ExpandIntOp_SETCC(SDNode *N);
SDValue ExpandIntOp_SETCCE(SDNode *N);
+ SDValue ExpandIntOp_SETCCCARRY(SDNode *N);
SDValue ExpandIntOp_Shift(SDNode *N);
SDValue ExpandIntOp_SINT_TO_FP(SDNode *N);
SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
@@ -407,23 +416,13 @@ private:
}
void SetSoftenedFloat(SDValue Op, SDValue Result);
- // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary.
- void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) {
- // When the result type can be kept in HW registers, the converted
- // NewRes node could have the same type. We can save the effort in
- // cloning every user of N in SoftenFloatOperand or other legalization functions,
- // by calling ReplaceValueWith here to update all users.
- if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo)))
- ReplaceValueWith(SDValue(N, ResNo), NewRes);
- }
-
// Convert Float Results to Integer for Non-HW-supported Operations.
bool SoftenFloatResult(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo);
- SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_FMINNUM(SDNode *N);
SDValue SoftenFloatRes_FMAXNUM(SDNode *N);
@@ -462,17 +461,23 @@ private:
SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
// Return true if we can skip softening the given operand or SDNode because
- // it was soften before by SoftenFloatResult and references to the operand
- // were replaced by ReplaceValueWith.
+ // either it was softened before by SoftenFloatResult and references to the
+ // operand were replaced by ReplaceValueWith, or its value type is legal in HW
+ // registers and the operand can be left unchanged.
bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo);
// Convert Float Operand to Integer for Non-HW-supported Operations.
bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
SDValue SoftenFloatOp_BITCAST(SDNode *N);
+ SDValue SoftenFloatOp_COPY_TO_REG(SDNode *N);
SDValue SoftenFloatOp_BR_CC(SDNode *N);
+ SDValue SoftenFloatOp_FABS(SDNode *N);
+ SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
+ SDValue SoftenFloatOp_FNEG(SDNode *N);
SDValue SoftenFloatOp_FP_EXTEND(SDNode *N);
SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N);
+ SDValue SoftenFloatOp_SELECT(SDNode *N);
SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
SDValue SoftenFloatOp_SETCC(SDNode *N);
SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
@@ -597,6 +602,7 @@ private:
SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
SDValue ScalarizeVecRes_InregOp(SDNode *N);
+ SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
SDValue ScalarizeVecRes_BITCAST(SDNode *N);
SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
@@ -621,6 +627,7 @@ private:
SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue ScalarizeVecOp_VSELECT(SDNode *N);
+ SDValue ScalarizeVecOp_VSETCC(SDNode *N);
SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
SDValue ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo);
@@ -666,12 +673,14 @@ private:
// Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
bool SplitVectorOperand(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_UnaryOp(SDNode *N);
SDValue SplitVecOp_TruncateHelper(SDNode *N);
SDValue SplitVecOp_BITCAST(SDNode *N);
SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
@@ -713,6 +722,7 @@ private:
SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
SDValue WidenVecRes_SELECT(SDNode* N);
+ SDValue WidenVSELECTAndMask(SDNode *N);
SDValue WidenVecRes_SELECT_CC(SDNode* N);
SDValue WidenVecRes_SETCC(SDNode* N);
SDValue WidenVecRes_UNDEF(SDNode *N);
@@ -782,6 +792,13 @@ private:
/// By default, the vector will be widened with undefined values.
SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
+ /// Return a mask of vector type MaskVT to replace InMask. Also adjust
+ /// MaskVT to ToMaskVT if needed with vector extension or truncation.
+ SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT);
+
+ /// Get the target mask VT, and widen if needed.
+ EVT getSETCCWidenedResultTy(SDValue SetCC);
+
//===--------------------------------------------------------------------===//
// Generic Splitting: LegalizeTypesGeneric.cpp
//===--------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 3682c32..f330615 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -57,7 +57,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Expand the floating point operand only if it was converted to integers.
// Otherwise, it is a legal type like f128 that can be saved in a register.
auto SoftenedOp = GetSoftenedFloat(InOp);
- if (SoftenedOp == InOp)
+ if (isLegalInHWReg(SoftenedOp.getValueType()))
break;
SplitInteger(SoftenedOp, Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
@@ -362,8 +362,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
SmallVector<SDValue, 8> Ops;
IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType());
- SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT,
- makeArrayRef(Ops.data(), NumElts));
+ SDValue Vec =
+ DAG.getBuildVector(NVT, dl, makeArrayRef(Ops.data(), NumElts));
return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
}
@@ -396,10 +396,8 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
NewElts.push_back(Hi);
}
- SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
- EVT::getVectorVT(*DAG.getContext(),
- NewVT, NewElts.size()),
- NewElts);
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NewElts.size());
+ SDValue NewVec = DAG.getBuildVector(NewVecVT, dl, NewElts);
// Convert the new vector to the old vector type.
return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
@@ -458,7 +456,7 @@ SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType());
for (unsigned i = 1; i < NumElts; ++i)
Ops[i] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
@@ -512,8 +510,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
GetSplitOp(Op, Lo, Hi);
}
-void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
+static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // Split the inputs.
+ SDValue Lo, Hi, LL, LH, RL, RH;
+ std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+ std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+
+ return std::make_pair(Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LL, LH, RL, RH, CL, CH;
SDLoc dl(N);
GetSplitOp(N->getOperand(1), LL, LH);
@@ -522,9 +536,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
SDValue Cond = N->getOperand(0);
CL = CH = Cond;
if (Cond.getValueType().isVector()) {
+ if (SDValue Res = WidenVSELECTAndMask(N))
+ std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl);
+ // It seems to improve code to generate two narrow SETCCs as opposed to
+ // splitting a wide result vector.
+ else if (Cond.getOpcode() == ISD::SETCC)
+ std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG);
// Check if there are already split versions of the vector available and
// use those instead of splitting the mask operand again.
- if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
+ else if (getTypeAction(Cond.getValueType()) ==
+ TargetLowering::TypeSplitVector)
GetSplitVector(Cond, CL, CH);
else
std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index d4fa20f..9355dbe 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -105,6 +105,7 @@ class VectorLegalizer {
SDValue ExpandLoad(SDValue Op);
SDValue ExpandStore(SDValue Op);
SDValue ExpandFNEG(SDValue Op);
+ SDValue ExpandFSUB(SDValue Op);
SDValue ExpandBITREVERSE(SDValue Op);
SDValue ExpandCTLZ(SDValue Op);
SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);
@@ -224,6 +225,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
}
return TranslateLegalizeResults(Op, Lowered);
}
+ LLVM_FALLTHROUGH;
case TargetLowering::Expand:
Changed = true;
return LegalizeOp(ExpandLoad(Op));
@@ -621,8 +623,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
}
NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
- Op.getNode()->getValueType(0), Vals);
+ Value = DAG.getBuildVector(Op.getNode()->getValueType(0), dl, Vals);
} else {
SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG);
@@ -692,6 +693,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
return ExpandUINT_TO_FLOAT(Op);
case ISD::FNEG:
return ExpandFNEG(Op);
+ case ISD::FSUB:
+ return ExpandFSUB(Op);
case ISD::SETCC:
return UnrollVSETCC(Op);
case ISD::BITREVERSE:
@@ -720,8 +723,6 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
assert(VT.isVector() && !Mask.getValueType().isVector()
&& Op1.getValueType() == Op2.getValueType() && "Invalid type");
- unsigned NumElem = VT.getVectorNumElements();
-
// If we can't even use the basic vector operations of
// AND,OR,XOR, we will have to scalarize the op.
// Notice that the operation may be 'promoted' which means that it is
@@ -745,8 +746,7 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
DAG.getConstant(0, DL, BitTy));
// Broadcast the mask so that the entire vector is all-one or all zero.
- SmallVector<SDValue, 8> Ops(NumElem, Mask);
- Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops);
+ Mask = DAG.getSplatBuildVector(MaskTy, DL, Mask);
// Bitcast the operands to be the same type as the mask.
// This is needed when we select between FP types because
@@ -1025,6 +1025,18 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
return DAG.UnrollVectorOp(Op.getNode());
}
+SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
+ // For floating-point values, (a-b) is the same as a+(-b). If FNEG is legal,
+ // we can defer this to operation legalization where it will be lowered as
+ // a+(-b).
+ EVT VT = Op.getValueType();
+ if (TLI.isOperationLegalOrCustom(ISD::FNEG, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FADD, VT))
+ return Op; // Defer to LegalizeDAG
+
+ return DAG.UnrollVectorOp(Op.getNode());
+}
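// Illustration only: the deferral is sound because IEEE-754 negation just
// flips the sign bit, so a - b and a + (-b) round identically for all inputs,
// including NaNs and signed zeros.
static float subViaAddNeg(float A, float B) { return A + (-B); }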
+
SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
EVT VT = Op.getValueType();
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
@@ -1102,7 +1114,7 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
(EltVT.getSizeInBits()), dl, EltVT),
DAG.getConstant(0, dl, EltVT));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 6906f67..6aa3270 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -65,6 +65,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ R = ScalarizeVecRes_VecInregOp(N);
+ break;
case ISD::ANY_EXTEND:
case ISD::BITREVERSE:
case ISD::BSWAP:
@@ -97,6 +102,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::ZERO_EXTEND:
+ case ISD::FCANONICALIZE:
R = ScalarizeVecRes_UnaryOp(N);
break;
@@ -257,6 +263,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
LHS, DAG.getValueType(ExtVT));
}
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
+ SDLoc DL(N);
+ SDValue Op = N->getOperand(0);
+
+ EVT OpVT = Op.getValueType();
+ EVT OpEltVT = OpVT.getVectorElementType();
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Op = GetScalarizedVector(Op);
+ } else {
+ Op = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
+ switch (N->getOpcode()) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::ANY_EXTEND, DL, EltVT, Op);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::SIGN_EXTEND, DL, EltVT, Op);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Op);
+ }
+
+ llvm_unreachable("Illegal extend_vector_inreg opcode");
+}
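// Illustration only (assumed shapes): once the result is scalarized, an
// *_EXTEND_VECTOR_INREG reads just input element 0. For example, what would
// have been a <1 x i32> sign extend of a <4 x i8> source reduces to:
#include <cstdint>
static int32_t signExtendInregElt0(const int8_t In[4]) {
  return In[0]; // element 0, sign-extended to the wider element type
}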
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
// If the operand is wider than the vector element type then it is implicitly
// truncated. Make that explicit here.
@@ -268,7 +302,21 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
- SDValue Cond = GetScalarizedVector(N->getOperand(0));
+ SDValue Cond = N->getOperand(0);
+ EVT OpVT = Cond.getValueType();
+ SDLoc DL(N);
+ // The vselect result and true/false operands need scalarizing, but it's
+ // not a given that the Cond does. For instance, in AVX512 v1i1 is legal.
+ // See the similar logic in ScalarizeVecRes_VSETCC.
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Cond = GetScalarizedVector(Cond);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ Cond = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
SDValue LHS = GetScalarizedVector(N->getOperand(1));
TargetLowering::BooleanContent ScalarBool =
TLI.getBooleanContents(false, false);
@@ -436,6 +484,9 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VSELECT:
Res = ScalarizeVecOp_VSELECT(N);
break;
+ case ISD::SETCC:
+ Res = ScalarizeVecOp_VSETCC(N);
+ break;
case ISD::STORE:
Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
break;
@@ -478,7 +529,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
N->getValueType(0).getScalarType(), Elt);
// Revectorize the result so the types line up with what the uses of this
// expression expect.
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op);
+ return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Op);
}
/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
@@ -486,20 +537,21 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Ops(N->getNumOperands());
for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
Ops[i] = GetScalarizedVector(N->getOperand(i));
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops);
+ return DAG.getBuildVector(N->getValueType(0), SDLoc(N), Ops);
}
/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
/// so just return the element, ignoring the index.
SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ EVT VT = N->getValueType(0);
SDValue Res = GetScalarizedVector(N->getOperand(0));
- if (Res.getValueType() != N->getValueType(0))
- Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0),
- Res);
+ if (Res.getValueType() != VT)
+ Res = VT.isFloatingPoint()
+ ? DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Res)
+ : DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Res);
return Res;
}
-
/// If the input condition is a vector that needs to be scalarized, it must be
/// <1 x i1>, so just convert to a normal ISD::SELECT
/// (still with vector output type since that was acceptable if we got here).
@@ -511,6 +563,36 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
N->getOperand(2));
}
+/// If the operand is a vector that needs to be scalarized then the result
+/// must be v1i1, so just convert it to a scalar SETCC and wrap the result
+/// with a SCALAR_TO_VECTOR, since the result type is legal if we got here.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+ assert(N->getValueType(0) == MVT::v1i1 && "Expected v1i1 type");
+
+ EVT VT = N->getValueType(0);
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue RHS = GetScalarizedVector(N->getOperand(1));
+
+ EVT OpVT = N->getOperand(0).getValueType();
+ EVT NVT = VT.getVectorElementType();
+ SDLoc DL(N);
+ // Turn it into a scalar SETCC.
+ SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
+ N->getOperand(2));
+
+ // Vectors may have different boolean contents than scalars. Promote the
+ // value appropriately.
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+
+ Res = DAG.getNode(ExtendCode, DL, NVT, Res);
+
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Res);
+}
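// Illustration only (hypothetical helper): how the scalar i1 compare result
// is promoted to the target's vector boolean contents before being wrapped
// in a SCALAR_TO_VECTOR.
#include <cstdint>
static int8_t promoteBool(bool Cmp, bool ZeroOrNegativeOne) {
  return ZeroOrNegativeOne ? (Cmp ? -1 : 0)  // sign-extend: all-ones for true
                           : (Cmp ? 1 : 0);  // zero-extend: 0 or 1
}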
+
/// If the value to store is a vector that needs to be scalarized, it must be
/// <1 x ty>. Just store the element.
SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
@@ -637,6 +719,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SINT_TO_FP:
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
+ case ISD::FCANONICALIZE:
SplitVecRes_UnaryOp(N, Lo, Hi);
break;
@@ -695,7 +778,7 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
SDLoc dl(N);
- const SDNodeFlags *Flags = N->getFlags();
+ const SDNodeFlags Flags = N->getFlags();
unsigned Opcode = N->getOpcode();
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
@@ -781,10 +864,10 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
unsigned LoNumElts = LoVT.getVectorNumElements();
SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
- Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps);
+ Lo = DAG.getBuildVector(LoVT, dl, LoOps);
SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
- Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps);
+ Hi = DAG.getBuildVector(HiVT, dl, HiOps);
}
void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
@@ -928,7 +1011,12 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
SDLoc dl(N);
SDValue InLo, InHi;
- GetSplitVector(N0, InLo, InHi);
+
+ if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(N0, InLo, InHi);
+ else
+ std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0);
+
EVT InLoVT = InLo.getValueType();
unsigned InNumElements = InLoVT.getVectorNumElements();
@@ -1253,12 +1341,9 @@ void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
if ((NumElements & 1) == 0 &&
SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
LLVMContext &Ctx = *DAG.getContext();
- EVT NewSrcVT = EVT::getVectorVT(
- Ctx, EVT::getIntegerVT(
- Ctx, SrcVT.getScalarSizeInBits() * 2),
- NumElements);
- EVT SplitSrcVT =
- EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2);
+ EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx);
+ EVT SplitSrcVT = SrcVT.getHalfNumVectorElementsVT(Ctx);
+
EVT SplitLoVT, SplitHiVT;
std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
@@ -1372,7 +1457,7 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
}
// Construct the Lo/Hi output using a BUILD_VECTOR.
- Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps);
+ Output = DAG.getBuildVector(NewVT, dl, SVOps);
} else if (InputUsed[0] == -1U) {
// No input vectors were used! The result is undefined.
Output = DAG.getUNDEF(NewVT);
@@ -1466,8 +1551,31 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::FTRUNC:
+ case ISD::FCANONICALIZE:
Res = SplitVecOp_UnaryOp(N);
break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Res = SplitVecOp_ExtVecInRegOp(N);
+ break;
+
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ Res = SplitVecOp_VECREDUCE(N, OpNo);
+ break;
}
}
@@ -1520,6 +1628,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
+SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
+
+ bool NoNaN = N->getFlags().hasNoNaNs();
+ unsigned CombineOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
+ case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
+ case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
+ case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
+ case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
+ case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
+ case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
+ case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
+ case ISD::VECREDUCE_FMAX:
+ CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
+ break;
+ case ISD::VECREDUCE_FMIN:
+ CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
+ break;
+ default:
+ llvm_unreachable("Unexpected reduce ISD node");
+ }
+
+ // Combine the split subvectors with the appropriate binary operation, then
+ // reduce the now half-sized vector.
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
+}
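// Illustration only (integer case; the FP variants choose their combine op
// via the NoNaN flag above): reducing a vector equals combining its two
// halves lane-wise and then reducing the half-sized result.
#include <cstdint>
static uint32_t reduceAddSplit(const uint32_t V[8]) {
  uint32_t Partial[4];
  for (int I = 0; I != 4; ++I)
    Partial[I] = V[I] + V[I + 4]; // binop(Lo, Hi)
  return Partial[0] + Partial[1] + Partial[2] + Partial[3];
}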
+
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
@@ -1615,7 +1765,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
EltVT = MVT::i8;
VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
VecVT.getVectorNumElements());
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, ElementOps);
+ Vec = DAG.getBuildVector(VecVT, dl, ElementOps);
}
// Store the vector to the stack.
@@ -1629,6 +1779,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
MachinePointerInfo(), EltVT);
}
+SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
+ SDValue Lo, Hi;
+
+ // *_EXTEND_VECTOR_INREG nodes only reference the lower half of the input, so
+ // splitting the result has the same effect as splitting the input operand.
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
unsigned OpNo) {
EVT LoVT, HiVT;
@@ -1881,7 +2041,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
}
}
- return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts);
+ return DAG.getBuildVector(N->getValueType(0), DL, Elts);
}
SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
@@ -2165,7 +2325,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
EVT WidenEltVT = WidenVT.getVectorElementType();
EVT VT = WidenVT;
unsigned NumElts = VT.getVectorNumElements();
- const SDNodeFlags *Flags = N->getFlags();
+ const SDNodeFlags Flags = N->getFlags();
while (!TLI.isTypeLegal(VT) && NumElts != 1) {
NumElts = NumElts / 2;
VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
@@ -2313,7 +2473,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
unsigned Opcode = N->getOpcode();
unsigned InVTNumElts = InVT.getVectorNumElements();
- const SDNodeFlags *Flags = N->getFlags();
+ const SDNodeFlags Flags = N->getFlags();
if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
InOp = GetWidenedVector(N->getOperand(0));
InVT = InOp.getValueType();
@@ -2323,6 +2483,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
return DAG.getNode(Opcode, DL, WidenVT, InOp);
return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
}
+ if (WidenVT.getSizeInBits() == InVT.getSizeInBits()) {
+ // If both input and result vector types are of same width, extend
+ // operations should be done with SIGN/ZERO_EXTEND_VECTOR_INREG, which
+ // accepts fewer elements in the result than in the input.
+ if (Opcode == ISD::SIGN_EXTEND)
+ return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+ if (Opcode == ISD::ZERO_EXTEND)
+ return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+ }
}
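// Illustration only (assumed types): with a v8i16 source register and a v4i32
// result of equal bit width, SIGN_EXTEND_VECTOR_INREG extends only the low
// four lanes, the "fewer elements in the result than in the input" shape
// described above.
#include <cstdint>
static void signExtendVectorInreg(const int16_t In[8], int32_t Out[4]) {
  for (int I = 0; I != 4; ++I)
    Out[I] = In[I]; // low lanes only, sign-extended
}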
if (TLI.isTypeLegal(InWidenVT)) {
@@ -2375,7 +2544,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
@@ -2430,7 +2599,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
while (Ops.size() != WidenNumElts)
Ops.push_back(DAG.getUNDEF(WidenSVT));
- return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, DL, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
@@ -2568,7 +2737,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
if (InVT.isVector())
NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
else
- NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
+ NewVec = DAG.getBuildVector(NewInVT, dl, Ops);
return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
}
}
@@ -2593,7 +2762,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps);
+ return DAG.getBuildVector(WidenVT, dl, NewOps);
}
SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
@@ -2663,7 +2832,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
Ops[Idx] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
@@ -2704,7 +2873,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
Ops[i] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
@@ -2814,6 +2983,213 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
WidenVT, N->getOperand(0));
}
+// Return true if this is a node that could have two SETCCs as operands.
+static inline bool isLogicalMaskOp(unsigned Opcode) {
+ switch (Opcode) {
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ return true;
+ }
+ return false;
+}
+
+// This is used just for the assert in convertMask(). Check that this is
+// either a SETCC or a SETCC already handled by convertMask().
+#ifndef NDEBUG
+static inline bool isSETCCorConvertedSETCC(SDValue N) {
+ if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::CONCAT_VECTORS) {
+ for (unsigned i = 1; i < N->getNumOperands(); ++i)
+ if (!N->getOperand(i)->isUndef())
+ return false;
+ N = N.getOperand(0);
+ }
+
+ if (N.getOpcode() == ISD::TRUNCATE)
+ N = N.getOperand(0);
+ else if (N.getOpcode() == ISD::SIGN_EXTEND)
+ N = N.getOperand(0);
+
+ if (isLogicalMaskOp(N.getOpcode()))
+ return isSETCCorConvertedSETCC(N.getOperand(0)) &&
+ isSETCCorConvertedSETCC(N.getOperand(1));
+
+ return (N.getOpcode() == ISD::SETCC ||
+ ISD::isBuildVectorOfConstantSDNodes(N.getNode()));
+}
+#endif
+
+// Return a mask of vector type MaskVT to replace InMask. Also adjust MaskVT
+// to ToMaskVT if needed with vector extension or truncation.
+SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
+ EVT ToMaskVT) {
+ // Currently a SETCC or an AND/OR/XOR with two SETCCs are handled.
+ // FIXME: This code seems to be too restrictive, we might consider
+ // generalizing it or dropping it.
+ assert(isSETCCorConvertedSETCC(InMask) && "Unexpected mask argument.");
+
+ // Make a new Mask node, with a legal result VT.
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
+ Ops.push_back(InMask->getOperand(i));
+ SDValue Mask = DAG.getNode(InMask->getOpcode(), SDLoc(InMask), MaskVT, Ops);
+
+ // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
+ // extend or truncate is needed.
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
+ unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
+ if (MaskScalarBits < ToMaskScalBits) {
+ EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
+ } else if (MaskScalarBits > ToMaskScalBits) {
+ EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+ MaskVT.getVectorNumElements());
+ Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
+ }
+
+ assert(Mask->getValueType(0).getScalarSizeInBits() ==
+ ToMaskVT.getScalarSizeInBits() &&
+ "Mask should have the right element size by now.");
+
+ // Adjust Mask to the right number of elements.
+ unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
+ if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
+ Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
+ ZeroIdx);
+ } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
+ unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
+ EVT SubVT = Mask->getValueType(0);
+ SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
+ SubConcatOps[0] = Mask;
+ for (unsigned i = 1; i < NumSubVecs; ++i)
+ SubConcatOps[i] = DAG.getUNDEF(SubVT);
+ Mask =
+ DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
+ }
+
+ assert((Mask->getValueType(0) == ToMaskVT) &&
+ "A mask of ToMaskVT should have been produced by now.");
+
+ return Mask;
+}
+
+// Get the target mask VT, and widen if needed.
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
+ assert(SetCC->getOpcode() == ISD::SETCC);
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT MaskVT = getSetCCResultType(SetCC->getOperand(0).getValueType());
+ if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+ MaskVT = TLI.getTypeToTransformTo(Ctx, MaskVT);
+ return MaskVT;
+}
+
+// This method tries to handle VSELECT and its mask by legalizing operands
+// (which may require widening) and, if needed, adjusting the mask vector type
+// to match that of the VSELECT. Without it, many cases end up with
+// scalarization of the SETCC, with many unnecessary instructions.
+SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
+ LLVMContext &Ctx = *DAG.getContext();
+ SDValue Cond = N->getOperand(0);
+
+ if (N->getOpcode() != ISD::VSELECT)
+ return SDValue();
+
+ if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
+ return SDValue();
+
+ // If this is a split VSELECT that was already handled, do nothing.
+ if (Cond->getValueType(0).getScalarSizeInBits() != 1)
+ return SDValue();
+
+ EVT VSelVT = N->getValueType(0);
+ // Only handle vector types which are a power of 2.
+ if (!isPowerOf2_64(VSelVT.getSizeInBits()))
+ return SDValue();
+
+ // Don't touch if this will be scalarized.
+ EVT FinalVT = VSelVT;
+ while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+ FinalVT = FinalVT.getHalfNumVectorElementsVT(Ctx);
+
+ if (FinalVT.getVectorNumElements() == 1)
+ return SDValue();
+
+ // If there is support for an i1 vector mask, don't touch.
+ if (Cond.getOpcode() == ISD::SETCC) {
+ EVT SetCCOpVT = Cond->getOperand(0).getValueType();
+ while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
+ SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
+ EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
+ if (SetCCResVT.getScalarSizeInBits() == 1)
+ return SDValue();
+ }
+
+ // Get the VT and operands for VSELECT, and widen if needed.
+ SDValue VSelOp1 = N->getOperand(1);
+ SDValue VSelOp2 = N->getOperand(2);
+ if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
+ VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
+ VSelOp1 = GetWidenedVector(VSelOp1);
+ VSelOp2 = GetWidenedVector(VSelOp2);
+ }
+
+ // The mask of the VSELECT should have integer elements.
+ EVT ToMaskVT = VSelVT;
+ if (!ToMaskVT.getScalarType().isInteger())
+ ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
+
+ SDValue Mask;
+ if (Cond->getOpcode() == ISD::SETCC) {
+ EVT MaskVT = getSETCCWidenedResultTy(Cond);
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else if (isLogicalMaskOp(Cond->getOpcode()) &&
+ Cond->getOperand(0).getOpcode() == ISD::SETCC &&
+ Cond->getOperand(1).getOpcode() == ISD::SETCC) {
+ // Cond is (AND/OR/XOR (SETCC, SETCC))
+ SDValue SETCC0 = Cond->getOperand(0);
+ SDValue SETCC1 = Cond->getOperand(1);
+ EVT VT0 = getSETCCWidenedResultTy(SETCC0);
+ EVT VT1 = getSETCCWidenedResultTy(SETCC1);
+ unsigned ScalarBits0 = VT0.getScalarSizeInBits();
+ unsigned ScalarBits1 = VT1.getScalarSizeInBits();
+ unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
+ EVT MaskVT;
+ // If the two SETCCs have different VTs, either extend/truncate one of
+ // them to the other "towards" ToMaskVT, or truncate one and extend the
+ // other to ToMaskVT.
+ if (ScalarBits0 != ScalarBits1) {
+ EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
+ EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
+ if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
+ MaskVT = WideVT;
+ else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
+ MaskVT = NarrowVT;
+ else
+ MaskVT = ToMaskVT;
+ } else
+ // If the two SETCCs have the same VT, don't change it.
+ MaskVT = VT0;
+
+ // Make new SETCCs and logical nodes.
+ SETCC0 = convertMask(SETCC0, VT0, MaskVT);
+ SETCC1 = convertMask(SETCC1, VT1, MaskVT);
+ Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
+
+ // Convert the logical op for VSELECT if needed.
+ Mask = convertMask(Cond, MaskVT, ToMaskVT);
+ } else
+ return SDValue();
+
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -2821,6 +3197,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
if (CondVT.isVector()) {
+ if (SDValue Res = WidenVSELECTAndMask(N))
+ return Res;
+
EVT CondEltVT = CondVT.getVectorElementType();
EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
CondEltVT, WidenNumElts);
@@ -3093,7 +3472,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
@@ -3144,7 +3523,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ return DAG.getBuildVector(VT, dl, Ops);
}
SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
@@ -3565,10 +3944,9 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
for (; i != WidenNumElts; ++i)
Ops[i] = UndefVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+ return DAG.getBuildVector(WidenVT, dl, Ops);
}
-
void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
StoreSDNode *ST) {
// The strategy assumes that we can efficiently store power-of-two widths.
@@ -3737,5 +4115,5 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
DAG.getUNDEF(EltVT);
for ( ; Idx < WidenNumElts; ++Idx)
Ops[Idx] = FillVal;
- return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+ return DAG.getBuildVector(NVT, dl, Ops);
}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index ded8e68..a21b4c7 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -57,10 +57,8 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
RegPressure.resize(NumRC);
std::fill(RegLimit.begin(), RegLimit.end(), 0);
std::fill(RegPressure.begin(), RegPressure.end(), 0);
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end();
- I != E; ++I)
- RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF);
+ for (const TargetRegisterClass *RC : TRI->regclasses())
+ RegLimit[RC->getID()] = TRI->getRegPressureLimit(RC, *IS->MF);
ParallelLiveRanges = 0;
HorizontalVerticalBalance = 0;
@@ -69,12 +67,11 @@ ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
unsigned
ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
unsigned NumberDeps = 0;
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl())
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
continue;
- SUnit *PredSU = I->getSUnit();
+ SUnit *PredSU = Pred.getSUnit();
const SDNode *ScegN = PredSU->getNode();
if (!ScegN)
@@ -107,12 +104,11 @@ ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU,
unsigned RCId) {
unsigned NumberDeps = 0;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isCtrl())
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
continue;
- SUnit *SuccSU = I->getSUnit();
+ SUnit *SuccSU = Succ.getSUnit();
const SDNode *ScegN = SuccSU->getNode();
if (!ScegN)
continue;
@@ -144,9 +140,8 @@ unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU,
static unsigned numberCtrlDepsInSU(SUnit *SU) {
unsigned NumberDeps = 0;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I)
- if (I->isCtrl())
+ for (const SDep &Succ : SU->Succs)
+ if (Succ.isCtrl())
NumberDeps++;
return NumberDeps;
@@ -154,9 +149,8 @@ static unsigned numberCtrlDepsInSU(SUnit *SU) {
static unsigned numberCtrlPredInSU(SUnit *SU) {
unsigned NumberDeps = 0;
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I)
- if (I->isCtrl())
+ for (SDep &Pred : SU->Preds)
+ if (Pred.isCtrl())
NumberDeps++;
return NumberDeps;
@@ -214,15 +208,14 @@ bool resource_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
/// of SU, return it, otherwise return null.
SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
SUnit *OnlyAvailablePred = nullptr;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- SUnit &Pred = *I->getSUnit();
- if (!Pred.isScheduled) {
+ for (const SDep &Pred : SU->Preds) {
+ SUnit &PredSU = *Pred.getSUnit();
+ if (!PredSU.isScheduled) {
// We found an available, but not scheduled, predecessor. If it's the
// only one we have found, keep track of it... otherwise give up.
- if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+ if (OnlyAvailablePred && OnlyAvailablePred != &PredSU)
return nullptr;
- OnlyAvailablePred = &Pred;
+ OnlyAvailablePred = &PredSU;
}
}
return OnlyAvailablePred;
@@ -232,9 +225,8 @@ void ResourcePriorityQueue::push(SUnit *SU) {
// Look at all of the successors of this node. Count the number of nodes that
// this node is the sole unscheduled node for.
unsigned NumNodesBlocking = 0;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I)
- if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+ for (const SDep &Succ : SU->Succs)
+ if (getSingleUnscheduledPred(Succ.getSUnit()) == SU)
++NumNodesBlocking;
NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
@@ -271,14 +263,13 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) {
// Now see if there are no other dependencies
// to instructions already in the packet.
for (unsigned i = 0, e = Packet.size(); i != e; ++i)
- for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
- E = Packet[i]->Succs.end(); I != E; ++I) {
+ for (const SDep &Succ : Packet[i]->Succs) {
// Since we do not add pseudos to packets, might as well
// ignore order deps.
- if (I->isCtrl())
+ if (Succ.isCtrl())
continue;
- if (I->getSUnit() == SU)
+ if (Succ.getSUnit() == SU)
return false;
}
@@ -364,16 +355,11 @@ int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
return RegBalance;
if (RawPressure) {
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+ for (const TargetRegisterClass *RC : TRI->regclasses())
RegBalance += rawRegPressureDelta(SU, RC->getID());
- }
}
else {
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
if ((RegPressure[RC->getID()] +
rawRegPressureDelta(SU, RC->getID()) > 0) &&
(RegPressure[RC->getID()] +
@@ -506,11 +492,10 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) {
}
}
}
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl() || (I->getSUnit()->NumRegDefsLeft == 0))
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl() || (Pred.getSUnit()->NumRegDefsLeft == 0))
continue;
- --I->getSUnit()->NumRegDefsLeft;
+ --Pred.getSUnit()->NumRegDefsLeft;
}
}
@@ -522,10 +507,9 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) {
// number of live ranges. All others, increase it.
unsigned NumberNonControlDeps = 0;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- adjustPriorityOfUnscheduledPreds(I->getSUnit());
- if (!I->isCtrl())
+ for (const SDep &Succ : SU->Succs) {
+ adjustPriorityOfUnscheduledPreds(Succ.getSUnit());
+ if (!Succ.isCtrl())
NumberNonControlDeps++;
}
@@ -602,8 +586,7 @@ SUnit *ResourcePriorityQueue::pop() {
std::vector<SUnit *>::iterator Best = Queue.begin();
if (!DisableDFASched) {
int BestCost = SUSchedulingCost(*Best);
- for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()),
- E = Queue.end(); I != E; ++I) {
+ for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I) {
if (SUSchedulingCost(*I) > BestCost) {
BestCost = SUSchedulingCost(*I);
@@ -613,8 +596,7 @@ SUnit *ResourcePriorityQueue::pop() {
}
// Use default TD scheduling mechanism.
else {
- for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()),
- E = Queue.end(); I != E; ++I)
+ for (auto I = std::next(Queue.begin()), E = Queue.end(); I != E; ++I)
if (Picker(*Best, *I))
Best = I;
}
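
Nearly every hunk in this file applies the same modernization: explicit begin()/end() iterator loops over SUnit::Preds and SUnit::Succs become range-based for loops over SDep references. A behavior-preserving sketch of the pattern, assuming only that Preds and Succs are iterable containers of SDep (in this tree they are SmallVector<SDep, 4>):

    unsigned NumberDeps = 0;
    for (const SDep &Succ : SU->Succs) { // was: SUnit::const_succ_iterator
      if (Succ.isCtrl())                 // the element accesses are identical,
        continue;                        // just without the I-> indirection
      ++NumberDeps;
    }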
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 62e7733..1379940 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SchedulerRegistry.h"
#include "InstrEmitter.h"
#include "ScheduleDAGSDNodes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InlineAsm.h"
@@ -160,18 +160,17 @@ void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) {
void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
// Bottom up: release predecessors
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- ReleasePred(SU, &*I);
- if (I->isAssignedRegDep()) {
+ for (SDep &Pred : SU->Preds) {
+ ReleasePred(SU, &Pred);
+ if (Pred.isAssignedRegDep()) {
// This is a physical register dependency and it's impossible or
// expensive to copy the register. Make sure nothing that can
// clobber the register is scheduled between the predecessor and
// this node.
- if (!LiveRegDefs[I->getReg()]) {
+ if (!LiveRegDefs[Pred.getReg()]) {
++NumLiveRegs;
- LiveRegDefs[I->getReg()] = I->getSUnit();
- LiveRegCycles[I->getReg()] = CurCycle;
+ LiveRegDefs[Pred.getReg()] = Pred.getSUnit();
+ LiveRegCycles[Pred.getReg()] = CurCycle;
}
}
}
@@ -191,16 +190,15 @@ void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
ReleasePredecessors(SU, CurCycle);
// Release all the implicit physical register defs that are live.
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isAssignedRegDep()) {
- if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) {
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isAssignedRegDep()) {
+ if (LiveRegCycles[Succ.getReg()] == Succ.getSUnit()->getHeight()) {
assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
- assert(LiveRegDefs[I->getReg()] == SU &&
+ assert(LiveRegDefs[Succ.getReg()] == SU &&
"Physical register dependency violated?");
--NumLiveRegs;
- LiveRegDefs[I->getReg()] = nullptr;
- LiveRegCycles[I->getReg()] = 0;
+ LiveRegDefs[Succ.getReg()] = nullptr;
+ LiveRegCycles[Succ.getReg()] = 0;
}
}
}
@@ -282,22 +280,20 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
SmallVector<SDep, 4> LoadPreds;
SmallVector<SDep, 4> NodePreds;
SmallVector<SDep, 4> NodeSuccs;
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isCtrl())
- ChainPred = *I;
- else if (I->getSUnit()->getNode() &&
- I->getSUnit()->getNode()->isOperandOf(LoadNode))
- LoadPreds.push_back(*I);
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ ChainPred = Pred;
+ else if (Pred.getSUnit()->getNode() &&
+ Pred.getSUnit()->getNode()->isOperandOf(LoadNode))
+ LoadPreds.push_back(Pred);
else
- NodePreds.push_back(*I);
+ NodePreds.push_back(Pred);
}
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isCtrl())
- ChainSuccs.push_back(*I);
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+ ChainSuccs.push_back(Succ);
else
- NodeSuccs.push_back(*I);
+ NodeSuccs.push_back(Succ);
}
if (ChainPred.getSUnit()) {
@@ -354,21 +350,19 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
NewSU = Clone(SU);
// New SUnit has the exact same predecessors.
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I)
- if (!I->isArtificial())
- AddPred(NewSU, *I);
+ for (SDep &Pred : SU->Preds)
+ if (!Pred.isArtificial())
+ AddPred(NewSU, Pred);
// Only copy scheduled successors. Cut them from old node's successor
// list and move them over.
SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isArtificial())
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
continue;
- SUnit *SuccSU = I->getSUnit();
+ SUnit *SuccSU = Succ.getSUnit();
if (SuccSU->isScheduled) {
- SDep D = *I;
+ SDep D = Succ;
D.setSUnit(NewSU);
AddPred(SuccSU, D);
D.setSUnit(SU);
@@ -399,16 +393,15 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
// Only copy scheduled successors. Cut them from old node's successor
// list and move them over.
SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- if (I->isArtificial())
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
continue;
- SUnit *SuccSU = I->getSUnit();
+ SUnit *SuccSU = Succ.getSUnit();
if (SuccSU->isScheduled) {
- SDep D = *I;
+ SDep D = Succ;
D.setSUnit(CopyToSU);
AddPred(SuccSU, D);
- DelDeps.push_back(std::make_pair(SuccSU, *I));
+ DelDeps.push_back(std::make_pair(SuccSU, Succ));
}
}
for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
@@ -479,10 +472,9 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
SmallSet<unsigned, 4> RegAdded;
// If this node would clobber any "live" register, then it's not ready.
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isAssignedRegDep()) {
- CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs,
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isAssignedRegDep()) {
+ CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs,
RegAdded, LRegs, TRI);
}
}
@@ -755,9 +747,8 @@ void ScheduleDAGLinearize::Schedule() {
// Glue user must be scheduled together with the glue operand. So other
// users of the glue operand must be treated as its users.
SDNode *ImmGUser = Glue->getGluedUser();
- for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end();
- ui != ue; ++ui)
- if (*ui == ImmGUser)
+ for (const SDNode *U : Glue->uses())
+ if (U == ImmGUser)
--Degree;
GUser->setNodeId(UDegree + Degree);
Glue->setNodeId(1);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 3549ccd..70b1fa7 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -15,13 +15,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SchedulerRegistry.h"
#include "ScheduleDAGSDNodes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InlineAsm.h"
@@ -226,6 +226,7 @@ private:
void UnscheduleNodeBottomUp(SUnit*);
void RestoreHazardCheckerBottomUp();
void BacktrackBottomUp(SUnit*, SUnit*);
+ SUnit *TryUnfoldSU(SUnit *);
SUnit *CopyAndMoveSuccessors(SUnit*);
void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
const TargetRegisterClass*,
@@ -422,11 +423,9 @@ static bool IsChainDependent(SDNode *Outer, SDNode *Inner,
}
// Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
if (N->isMachineOpcode()) {
- if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameDestroyOpcode()) {
+ if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
++NestLevel;
- } else if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameSetupOpcode()) {
+ } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
if (NestLevel == 0)
return false;
--NestLevel;
@@ -480,12 +479,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
}
// Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
if (N->isMachineOpcode()) {
- if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameDestroyOpcode()) {
+ if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
++NestLevel;
MaxNest = std::max(MaxNest, NestLevel);
- } else if (N->getMachineOpcode() ==
- (unsigned)TII->getCallFrameSetupOpcode()) {
+ } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
assert(NestLevel != 0);
--NestLevel;
if (NestLevel == 0)
@@ -524,21 +521,20 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
/// interference on flags.
void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
// Bottom up: release predecessors
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- ReleasePred(SU, &*I);
- if (I->isAssignedRegDep()) {
+ for (SDep &Pred : SU->Preds) {
+ ReleasePred(SU, &Pred);
+ if (Pred.isAssignedRegDep()) {
// This is a physical register dependency and it's impossible or
// expensive to copy the register. Make sure nothing that can
// clobber the register is scheduled between the predecessor and
// this node.
- SUnit *RegDef = LiveRegDefs[I->getReg()]; (void)RegDef;
- assert((!RegDef || RegDef == SU || RegDef == I->getSUnit()) &&
+ SUnit *RegDef = LiveRegDefs[Pred.getReg()]; (void)RegDef;
+ assert((!RegDef || RegDef == SU || RegDef == Pred.getSUnit()) &&
"interference on register dependence");
- LiveRegDefs[I->getReg()] = I->getSUnit();
- if (!LiveRegGens[I->getReg()]) {
+ LiveRegDefs[Pred.getReg()] = Pred.getSUnit();
+ if (!LiveRegGens[Pred.getReg()]) {
++NumLiveRegs;
- LiveRegGens[I->getReg()] = SU;
+ LiveRegGens[Pred.getReg()] = SU;
}
}
}
@@ -550,7 +546,7 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
if (!LiveRegDefs[CallResource])
for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
if (Node->isMachineOpcode() &&
- Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
unsigned NestLevel = 0;
unsigned MaxNest = 0;
SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
@@ -737,15 +733,14 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
ReleasePredecessors(SU);
// Release all the implicit physical register defs that are live.
- for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- // LiveRegDegs[I->getReg()] != SU when SU is a two-address node.
- if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] == SU) {
+ for (SDep &Succ : SU->Succs) {
+    // LiveRegDefs[Succ.getReg()] != SU when SU is a two-address node.
+ if (Succ.isAssignedRegDep() && LiveRegDefs[Succ.getReg()] == SU) {
assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
--NumLiveRegs;
- LiveRegDefs[I->getReg()] = nullptr;
- LiveRegGens[I->getReg()] = nullptr;
- releaseInterferences(I->getReg());
+ LiveRegDefs[Succ.getReg()] = nullptr;
+ LiveRegGens[Succ.getReg()] = nullptr;
+ releaseInterferences(Succ.getReg());
}
}
// Release the special call resource dependence, if this is the beginning
@@ -755,7 +750,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->isMachineOpcode() &&
- SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+ SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
--NumLiveRegs;
LiveRegDefs[CallResource] = nullptr;
@@ -786,7 +781,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
}
/// CapturePred - This does the opposite of ReleasePred. Since SU is being
-/// unscheduled, incrcease the succ left count of its predecessors. Remove
+/// unscheduled, increase the succ left count of its predecessors. Remove
/// them from AvailableQueue if necessary.
void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
SUnit *PredSU = PredEdge->getSUnit();
@@ -806,17 +801,16 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
DEBUG(SU->dump(this));
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- CapturePred(&*I);
- if (I->isAssignedRegDep() && SU == LiveRegGens[I->getReg()]){
+ for (SDep &Pred : SU->Preds) {
+ CapturePred(&Pred);
+ if (Pred.isAssignedRegDep() && SU == LiveRegGens[Pred.getReg()]){
assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
- assert(LiveRegDefs[I->getReg()] == I->getSUnit() &&
+ assert(LiveRegDefs[Pred.getReg()] == Pred.getSUnit() &&
"Physical register dependency violated?");
--NumLiveRegs;
- LiveRegDefs[I->getReg()] = nullptr;
- LiveRegGens[I->getReg()] = nullptr;
- releaseInterferences(I->getReg());
+ LiveRegDefs[Pred.getReg()] = nullptr;
+ LiveRegGens[Pred.getReg()] = nullptr;
+ releaseInterferences(Pred.getReg());
}
}
@@ -826,7 +820,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->isMachineOpcode() &&
- SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+ SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
++NumLiveRegs;
LiveRegDefs[CallResource] = SU;
LiveRegGens[CallResource] = CallSeqEndForStart[SU];
@@ -839,7 +833,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
for (const SDNode *SUNode = SU->getNode(); SUNode;
SUNode = SUNode->getGluedNode()) {
if (SUNode->isMachineOpcode() &&
- SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
--NumLiveRegs;
LiveRegDefs[CallResource] = nullptr;
@@ -899,7 +893,7 @@ void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() {
std::vector<SUnit*>::const_iterator I = (Sequence.end() - LookAhead);
unsigned HazardCycle = (*I)->getHeight();
- for (std::vector<SUnit*>::const_iterator E = Sequence.end(); I != E; ++I) {
+ for (auto E = Sequence.end(); I != E; ++I) {
SUnit *SU = *I;
for (; SU->getHeight() > HazardCycle; ++HazardCycle) {
HazardRec->RecedeCycle();
@@ -941,6 +935,146 @@ static bool isOperandOf(const SUnit *SU, SDNode *N) {
return false;
}
+/// TryUnfoldSU - Attempt to unfold SU's node into a separate load and an
+/// operation node; returns the resulting SUnit, or null if it cannot unfold.
+SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) {
+ SDNode *N = SU->getNode();
+  // Ask the target whether this node can be unfolded into a load plus an op.
+ SmallVector<SDNode *, 2> NewNodes;
+ if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ return nullptr;
+
+  // Unfolding an x86 DEC64m operation results in a store, dec, and load,
+  // which can't be handled here, so quit.
+ if (NewNodes.size() == 3)
+ return nullptr;
+
+ assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+ N = NewNodes[1];
+ SDNode *LoadNode = NewNodes[0];
+ unsigned NumVals = N->getNumValues();
+ unsigned OldNumVals = SU->getNode()->getNumValues();
+
+  // LoadNode may already exist. This can happen when there is another
+  // load from the same location that produces the same type of value
+  // but with different alignment or volatility.
+ bool isNewLoad = true;
+ SUnit *LoadSU;
+ if (LoadNode->getNodeId() != -1) {
+ LoadSU = &SUnits[LoadNode->getNodeId()];
+ // If LoadSU has already been scheduled, we should clone it but
+ // this would negate the benefit to unfolding so just return SU.
+ if (LoadSU->isScheduled)
+ return SU;
+ isNewLoad = false;
+ } else {
+ LoadSU = CreateNewSUnit(LoadNode);
+ LoadNode->setNodeId(LoadSU->NodeNum);
+
+ InitNumRegDefsLeft(LoadSU);
+ computeLatency(LoadSU);
+ }
+
+ DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
+
+  // Now that we are committed to unfolding, replace all of the DAG uses.
+ for (unsigned i = 0; i != NumVals; ++i)
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1),
+ SDValue(LoadNode, 1));
+
+ SUnit *NewSU = CreateNewSUnit(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NewSU->NodeNum);
+
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ NewSU->isCommutable = true;
+
+ InitNumRegDefsLeft(NewSU);
+ computeLatency(NewSU);
+
+ // Record all the edges to and from the old SU, by category.
+ SmallVector<SDep, 4> ChainPreds;
+ SmallVector<SDep, 4> ChainSuccs;
+ SmallVector<SDep, 4> LoadPreds;
+ SmallVector<SDep, 4> NodePreds;
+ SmallVector<SDep, 4> NodeSuccs;
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ ChainPreds.push_back(Pred);
+ else if (isOperandOf(Pred.getSUnit(), LoadNode))
+ LoadPreds.push_back(Pred);
+ else
+ NodePreds.push_back(Pred);
+ }
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+ ChainSuccs.push_back(Succ);
+ else
+ NodeSuccs.push_back(Succ);
+ }
+
+ // Now assign edges to the newly-created nodes.
+ for (const SDep &Pred : ChainPreds) {
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPred(LoadSU, Pred);
+ }
+ for (const SDep &Pred : LoadPreds) {
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPred(LoadSU, Pred);
+ }
+ for (const SDep &Pred : NodePreds) {
+ RemovePred(SU, Pred);
+ AddPred(NewSU, Pred);
+ }
+ for (SDep D : NodeSuccs) {
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ D.setSUnit(NewSU);
+ AddPred(SuccDep, D);
+ // Balance register pressure.
+ if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled &&
+ !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
+ --NewSU->NumRegDefsLeft;
+ }
+ for (SDep D : ChainSuccs) {
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ if (isNewLoad) {
+ D.setSUnit(LoadSU);
+ AddPred(SuccDep, D);
+ }
+ }
+
+ // Add a data dependency to reflect that NewSU reads the value defined
+ // by LoadSU.
+ SDep D(LoadSU, SDep::Data, 0);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
+
+ if (isNewLoad)
+ AvailableQueue->addNode(LoadSU);
+ AvailableQueue->addNode(NewSU);
+
+ ++NumUnfolds;
+
+ if (NewSU->NumSuccsLeft == 0)
+ NewSU->isAvailable = true;
+
+ return NewSU;
+}
+
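
TryUnfoldSU is lifted nearly verbatim out of CopyAndMoveSuccessors (the deletion appears in a later hunk). The entry point it relies on, TargetInstrInfo::unfoldMemoryOperand, splits a load-folded machine node into separate nodes. A sketch of the contract as this code uses it, assuming the LLVM 5.0-era TargetInstrInfo API and the TII, DAG, and N already in scope here:

    SmallVector<SDNode *, 2> NewNodes;
    if (TII->unfoldMemoryOperand(*DAG, N, NewNodes)) {
      // On success NewNodes[0] is the load and NewNodes[1] the compute op.
      // A three-node result (as for x86 DEC64m, which unfolds into a store,
      // dec, and load) is also possible, and TryUnfoldSU bails out on it.
      SDNode *LoadNode = NewNodes[0];
      SDNode *OpNode = NewNodes[1];
      (void)LoadNode; (void)OpNode;
    }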
/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
/// successors to the newly created node.
SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
@@ -966,135 +1100,16 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
return nullptr;
}
+  // If possible, unfold the instruction.
if (TryUnfold) {
- SmallVector<SDNode*, 2> NewNodes;
- if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ SUnit *UnfoldSU = TryUnfoldSU(SU);
+ if (!UnfoldSU)
return nullptr;
-
- // unfolding an x86 DEC64m operation results in store, dec, load which
- // can't be handled here so quit
- if (NewNodes.size() == 3)
- return nullptr;
-
- DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
- assert(NewNodes.size() == 2 && "Expected a load folding node!");
-
- N = NewNodes[1];
- SDNode *LoadNode = NewNodes[0];
- unsigned NumVals = N->getNumValues();
- unsigned OldNumVals = SU->getNode()->getNumValues();
- for (unsigned i = 0; i != NumVals; ++i)
- DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
- DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
- SDValue(LoadNode, 1));
-
- // LoadNode may already exist. This can happen when there is another
- // load from the same location and producing the same type of value
- // but it has different alignment or volatileness.
- bool isNewLoad = true;
- SUnit *LoadSU;
- if (LoadNode->getNodeId() != -1) {
- LoadSU = &SUnits[LoadNode->getNodeId()];
- isNewLoad = false;
- } else {
- LoadSU = CreateNewSUnit(LoadNode);
- LoadNode->setNodeId(LoadSU->NodeNum);
-
- InitNumRegDefsLeft(LoadSU);
- computeLatency(LoadSU);
- }
-
- SUnit *NewSU = CreateNewSUnit(N);
- assert(N->getNodeId() == -1 && "Node already inserted!");
- N->setNodeId(NewSU->NodeNum);
-
- const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
- for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
- if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
- NewSU->isTwoAddress = true;
- break;
- }
- }
- if (MCID.isCommutable())
- NewSU->isCommutable = true;
-
- InitNumRegDefsLeft(NewSU);
- computeLatency(NewSU);
-
- // Record all the edges to and from the old SU, by category.
- SmallVector<SDep, 4> ChainPreds;
- SmallVector<SDep, 4> ChainSuccs;
- SmallVector<SDep, 4> LoadPreds;
- SmallVector<SDep, 4> NodePreds;
- SmallVector<SDep, 4> NodeSuccs;
- for (SDep &Pred : SU->Preds) {
- if (Pred.isCtrl())
- ChainPreds.push_back(Pred);
- else if (isOperandOf(Pred.getSUnit(), LoadNode))
- LoadPreds.push_back(Pred);
- else
- NodePreds.push_back(Pred);
- }
- for (SDep &Succ : SU->Succs) {
- if (Succ.isCtrl())
- ChainSuccs.push_back(Succ);
- else
- NodeSuccs.push_back(Succ);
- }
-
- // Now assign edges to the newly-created nodes.
- for (const SDep &Pred : ChainPreds) {
- RemovePred(SU, Pred);
- if (isNewLoad)
- AddPred(LoadSU, Pred);
- }
- for (const SDep &Pred : LoadPreds) {
- RemovePred(SU, Pred);
- if (isNewLoad)
- AddPred(LoadSU, Pred);
- }
- for (const SDep &Pred : NodePreds) {
- RemovePred(SU, Pred);
- AddPred(NewSU, Pred);
- }
- for (SDep D : NodeSuccs) {
- SUnit *SuccDep = D.getSUnit();
- D.setSUnit(SU);
- RemovePred(SuccDep, D);
- D.setSUnit(NewSU);
- AddPred(SuccDep, D);
- // Balance register pressure.
- if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled
- && !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
- --NewSU->NumRegDefsLeft;
- }
- for (SDep D : ChainSuccs) {
- SUnit *SuccDep = D.getSUnit();
- D.setSUnit(SU);
- RemovePred(SuccDep, D);
- if (isNewLoad) {
- D.setSUnit(LoadSU);
- AddPred(SuccDep, D);
- }
- }
-
- // Add a data dependency to reflect that NewSU reads the value defined
- // by LoadSU.
- SDep D(LoadSU, SDep::Data, 0);
- D.setLatency(LoadSU->Latency);
- AddPred(NewSU, D);
-
- if (isNewLoad)
- AvailableQueue->addNode(LoadSU);
- AvailableQueue->addNode(NewSU);
-
- ++NumUnfolds;
-
- if (NewSU->NumSuccsLeft == 0) {
- NewSU->isAvailable = true;
- return NewSU;
- }
- SU = NewSU;
+ SU = UnfoldSU;
+ N = SU->getNode();
+      // If this can be scheduled, don't bother duplicating; just return.
+ if (SU->NumSuccsLeft == 0)
+ return SU;
}
DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");
@@ -1265,10 +1280,9 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
//
// If SU is the currently live definition of the same register that it uses,
// then we are free to schedule it.
- for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU)
- CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(),
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isAssignedRegDep() && LiveRegDefs[Pred.getReg()] != SU)
+ CheckForLiveRegDef(Pred.getSUnit(), Pred.getReg(), LiveRegDefs.get(),
RegAdded, LRegs, TRI);
}
@@ -1305,7 +1319,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
// If we're in the middle of scheduling a call, don't begin scheduling
// another call. Also, don't allow any physical registers to be live across
// the call.
- if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ if ((Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) ||
+ (Node->getMachineOpcode() == TII->getCallFrameSetupOpcode())) {
// Check the special calling-sequence resource.
unsigned CallResource = TRI->getNumRegs();
if (LiveRegDefs[CallResource]) {
@@ -1323,6 +1338,18 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
RegAdded, LRegs);
const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
+ if (MCID.hasOptionalDef()) {
+ // Most ARM instructions have an OptionalDef for CPSR, to model the S-bit.
+ // This operand can be either a def of CPSR, if the S bit is set; or a use
+ // of %noreg. When the OptionalDef is set to a valid register, we need to
+ // handle it in the same way as an ImplicitDef.
+ for (unsigned i = 0; i < MCID.getNumDefs(); ++i)
+ if (MCID.OpInfo[i].isOptionalDef()) {
+ const SDValue &OptionalDef = Node->getOperand(i - Node->getNumValues());
+ unsigned Reg = cast<RegisterSDNode>(OptionalDef)->getReg();
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
+ }
+ }
if (!MCID.ImplicitDefs)
continue;
for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg)
@@ -1659,9 +1686,8 @@ public:
RegPressure.resize(NumRC);
std::fill(RegLimit.begin(), RegLimit.end(), 0);
std::fill(RegPressure.begin(), RegPressure.end(), 0);
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I)
- RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF);
+ for (const TargetRegisterClass *RC : TRI->regclasses())
+ RegLimit[RC->getID()] = tri->getRegPressureLimit(RC, MF);
}
}
@@ -1735,8 +1761,7 @@ protected:
template<class SF>
static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) {
std::vector<SUnit *>::iterator Best = Q.begin();
- for (std::vector<SUnit *>::iterator I = std::next(Q.begin()),
- E = Q.end(); I != E; ++I)
+ for (auto I = std::next(Q.begin()), E = Q.end(); I != E; ++I)
if (Picker(*Best, *I))
Best = I;
SUnit *V = *Best;
@@ -1788,7 +1813,7 @@ public:
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump(ScheduleDAG *DAG) const override {
+ LLVM_DUMP_METHOD void dump(ScheduleDAG *DAG) const override {
// Emulate pop() without clobbering NodeQueueIds.
std::vector<SUnit*> DumpQueue = Queue;
SF DumpPicker = Picker;
@@ -1836,28 +1861,68 @@ static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
/// Smaller number is the higher priority.
static unsigned
CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
- unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
- if (SethiUllmanNumber != 0)
- return SethiUllmanNumber;
+ if (SUNumbers[SU->NodeNum] != 0)
+ return SUNumbers[SU->NodeNum];
+
+ // Use WorkList to avoid stack overflow on excessively large IRs.
+ struct WorkState {
+ WorkState(const SUnit *SU) : SU(SU) {}
+ const SUnit *SU;
+ unsigned PredsProcessed = 0;
+ };
- unsigned Extra = 0;
- for (const SDep &Pred : SU->Preds) {
- if (Pred.isCtrl()) continue; // ignore chain preds
- SUnit *PredSU = Pred.getSUnit();
- unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
- if (PredSethiUllman > SethiUllmanNumber) {
- SethiUllmanNumber = PredSethiUllman;
- Extra = 0;
- } else if (PredSethiUllman == SethiUllmanNumber)
- ++Extra;
- }
+ SmallVector<WorkState, 16> WorkList;
+ WorkList.push_back(SU);
+ while (!WorkList.empty()) {
+ auto &Temp = WorkList.back();
+ auto *TempSU = Temp.SU;
+ bool AllPredsKnown = true;
+    // Try to find an unevaluated pred and push it onto the processing stack.
+ for (unsigned P = Temp.PredsProcessed; P < TempSU->Preds.size(); ++P) {
+ auto &Pred = TempSU->Preds[P];
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ if (SUNumbers[PredSU->NodeNum] == 0) {
+#ifndef NDEBUG
+          // In debug mode, check that this element is not already on the stack.
+ for (auto It : WorkList)
+ assert(It.SU != PredSU && "Trying to push an element twice?");
+#endif
+ // Next time start processing this one starting from the next pred.
+ Temp.PredsProcessed = P + 1;
+ WorkList.push_back(PredSU);
+ AllPredsKnown = false;
+ break;
+ }
+ }
- SethiUllmanNumber += Extra;
+ if (!AllPredsKnown)
+ continue;
- if (SethiUllmanNumber == 0)
- SethiUllmanNumber = 1;
+ // Once all preds are known, we can calculate the answer for this one.
+ unsigned SethiUllmanNumber = 0;
+ unsigned Extra = 0;
+ for (const SDep &Pred : TempSU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ unsigned PredSethiUllman = SUNumbers[PredSU->NodeNum];
+ assert(PredSethiUllman > 0 && "We should have evaluated this pred!");
+ if (PredSethiUllman > SethiUllmanNumber) {
+ SethiUllmanNumber = PredSethiUllman;
+ Extra = 0;
+ } else if (PredSethiUllman == SethiUllmanNumber)
+ ++Extra;
+ }
+
+ SethiUllmanNumber += Extra;
+ if (SethiUllmanNumber == 0)
+ SethiUllmanNumber = 1;
+ SUNumbers[TempSU->NodeNum] = SethiUllmanNumber;
+ WorkList.pop_back();
+ }
- return SethiUllmanNumber;
+ assert(SUNumbers[SU->NodeNum] > 0 && "SethiUllman should never be zero!");
+ return SUNumbers[SU->NodeNum];
}
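
The rewrite above converts a naturally recursive post-order computation (each node's number depends on its predecessors' numbers) into an explicit stack, so deep DAGs no longer risk overflowing the call stack. The shape of the transformation, as a simplified sketch using std::vector, a hypothetical Node type, and a hypothetical computeFromPreds helper (the real code additionally memoizes results via SUNumbers and skips chain preds):

    struct Frame { Node *N; size_t NextPred = 0; };
    std::vector<Frame> Stack{{Root}};
    while (!Stack.empty()) {
      Frame &F = Stack.back();
      if (F.NextPred < F.N->Preds.size()) {          // descend into the next
        Stack.push_back({F.N->Preds[F.NextPred++]}); // unevaluated pred first
        continue;
      }
      F.N->Number = computeFromPreds(F.N); // all preds done: fold and retire
      Stack.pop_back();
    }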
/// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all
@@ -1924,19 +1989,17 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
// Register Pressure Tracking
//===----------------------------------------------------------------------===//
-void RegReductionPQBase::dumpRegPressure() const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
- E = TRI->regclass_end(); I != E; ++I) {
- const TargetRegisterClass *RC = *I;
+LLVM_DUMP_METHOD void RegReductionPQBase::dumpRegPressure() const {
+ for (const TargetRegisterClass *RC : TRI->regclasses()) {
unsigned Id = RC->getID();
unsigned RP = RegPressure[Id];
if (!RP) continue;
DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
<< RegLimit[Id] << '\n');
}
-#endif
}
+#endif
bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
if (!TLI)
@@ -2092,7 +2155,7 @@ void RegReductionPQBase::scheduledNode(SUnit *SU) {
RegPressure[RCId] -= Cost;
}
}
- dumpRegPressure();
+ DEBUG(dumpRegPressure());
}
void RegReductionPQBase::unscheduledNode(SUnit *SU) {
@@ -2172,7 +2235,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) {
}
}
- dumpRegPressure();
+ DEBUG(dumpRegPressure());
}
//===----------------------------------------------------------------------===//
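
Two related cleanups show up in the pressure-dump hunks above: the dump routine is now annotated LLVM_DUMP_METHOD and compiled only under !NDEBUG or LLVM_ENABLE_DUMP, and its call sites are wrapped in DEBUG(...) so release builds no longer reference a symbol that no longer exists. The resulting idiom, sketched with the LLVM 5.0-era macros:

    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD void dumpRegPressure() const; // absent in release builds
    #endif

    DEBUG(dumpRegPressure()); // compiles to nothing in release builds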
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 3be622f..3c8526e 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -650,6 +650,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
}
void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+  // This virtual function cannot be removed entirely even in release mode.
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
if (!SU->getNode()) {
dbgs() << "PHYS REG COPY\n";
@@ -704,8 +705,8 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
if (!N->getHasDebugValue())
return;
- // Opportunistically insert immediate dbg_value uses, i.e. those with source
- // order number right after the N.
+ // Opportunistically insert immediate dbg_value uses, i.e. those with the same
+ // source order number as N.
MachineBasicBlock *BB = Emitter.getBlock();
MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N);
@@ -713,7 +714,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
if (DVs[i]->isInvalidated())
continue;
unsigned DVOrder = DVs[i]->getOrder();
- if (!Order || DVOrder == ++Order) {
+ if (!Order || DVOrder == Order) {
MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap);
if (DbgMI) {
Orders.push_back(std::make_pair(DVOrder, DbgMI));
@@ -835,8 +836,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
GluedNodes.push_back(N);
while (!GluedNodes.empty()) {
SDNode *N = GluedNodes.back();
- Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned,
- VRBaseMap);
+ Emitter.EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap);
// Remember the source order of the inserted instruction.
if (HasDbg)
ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index eee4a4b..631cb34 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -18,12 +18,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SchedulerRegistry.h"
#include "ScheduleDAGSDNodes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LatencyPriorityQueue.h"
#include "llvm/CodeGen/ResourcePriorityQueue.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Debug.h"
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index e225ba8..16f425d 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1,4 +1,4 @@
-//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===//
+//===- SelectionDAG.cpp - Implement the SelectionDAG data structures ------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,43 +13,66 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "SDNodeDbgValue.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/IR/CallingConv.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
-#include <cmath>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <set>
+#include <string>
#include <utility>
+#include <vector>
using namespace llvm;
@@ -93,7 +116,8 @@ bool ConstantFPSDNode::isValueValidForType(EVT VT,
// ISD Namespace
//===----------------------------------------------------------------------===//
-bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
+bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal,
+ bool AllowShrink) {
auto *BV = dyn_cast<BuildVectorSDNode>(N);
if (!BV)
return false;
@@ -101,9 +125,11 @@ bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
APInt SplatUndef;
unsigned SplatBitSize;
bool HasUndefs;
- EVT EltVT = N->getValueType(0).getVectorElementType();
- return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs) &&
- EltVT.getSizeInBits() >= SplatBitSize;
+ unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
+ unsigned MinSplatBits = AllowShrink ? 0 : EltSize;
+ return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs,
+ MinSplatBits) &&
+ EltSize >= SplatBitSize;
}
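
The new AllowShrink parameter controls whether the splat may be reported at a width narrower than the vector's element type: with AllowShrink the minimum splat size passed to isConstantSplat drops to 0, otherwise it stays at the element size. A usage sketch, assuming the signature introduced above:

    APInt SplatVal;
    // Strict: succeeds only for a splat at the full element width.
    bool Strict = ISD::isConstantSplatVector(N, SplatVal, /*AllowShrink=*/false);
    // Loose: a v4i32 splat of 0x01010101 may also come back as an 8-bit splat.
    bool Loose = ISD::isConstantSplatVector(N, SplatVal, /*AllowShrink=*/true);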
// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
@@ -268,7 +294,6 @@ ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
return ISD::CondCode(Operation);
}
-
/// For an integer comparison, return 1 if the comparison is a signed operation
/// and 2 if the result is an unsigned comparison. Return zero if the operation
/// does not depend on the sign of the input (setne and seteq).
@@ -289,28 +314,28 @@ static int isSignedOp(ISD::CondCode Opcode) {
}
ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
- bool isInteger) {
- if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ bool IsInteger) {
+ if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
// Cannot fold a signed integer setcc with an unsigned integer setcc.
return ISD::SETCC_INVALID;
unsigned Op = Op1 | Op2; // Combine all of the condition bits.
- // If the N and U bits get set then the resultant comparison DOES suddenly
- // care about orderedness, and is true when ordered.
+ // If the N and U bits get set, then the resultant comparison DOES suddenly
+ // care about orderedness, and it is true when ordered.
if (Op > ISD::SETTRUE2)
Op &= ~16; // Clear the U bit if the N bit is set.
// Canonicalize illegal integer setcc's.
- if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
+ if (IsInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
Op = ISD::SETNE;
return ISD::CondCode(Op);
}
ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
- bool isInteger) {
- if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ bool IsInteger) {
+ if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
// Cannot fold a signed setcc with an unsigned setcc.
return ISD::SETCC_INVALID;
@@ -318,7 +343,7 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
// Canonicalize illegal integer setcc's.
- if (isInteger) {
+ if (IsInteger) {
switch (Result) {
default: break;
case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT
@@ -337,7 +362,6 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
//===----------------------------------------------------------------------===//
/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
-///
static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
ID.AddInteger(OpC);
}
@@ -349,7 +373,6 @@ static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
}
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
-///
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDValue> Ops) {
for (auto& Op : Ops) {
@@ -359,7 +382,6 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,
}
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
-///
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDUse> Ops) {
for (auto& Op : Ops) {
@@ -391,10 +413,9 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
break;
}
case ISD::TargetConstantFP:
- case ISD::ConstantFP: {
+ case ISD::ConstantFP:
ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
break;
- }
case ISD::TargetGlobalAddress:
case ISD::GlobalAddress:
case ISD::TargetGlobalTLSAddress:
@@ -572,6 +593,11 @@ void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
// worklist.
while (!DeadNodes.empty()) {
SDNode *N = DeadNodes.pop_back_val();
+    // Skip to the next node if we've already managed to delete this one. This
+    // could happen if replacing a node causes a node previously added to the
+    // worklist to be deleted.
+ if (N->getOpcode() == ISD::DELETED_NODE)
+ continue;
for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
DUL->NodeDeleted(N, nullptr);
@@ -639,12 +665,15 @@ void SelectionDAG::DeallocateNode(SDNode *N) {
// If we have operands, deallocate them.
removeOperands(N);
+ NodeAllocator.Deallocate(AllNodes.remove(N));
+
// Set the opcode to DELETED_NODE to help catch bugs when node
// memory is reallocated.
+ // FIXME: There are places in SDag that have grown a dependency on the opcode
+ // value in the released node.
+ __asan_unpoison_memory_region(&N->NodeType, sizeof(N->NodeType));
N->NodeType = ISD::DELETED_NODE;
- NodeAllocator.Deallocate(AllNodes.remove(N));
-
// If any of the SDDbgValue nodes refer to this SDNode, invalidate
// them and forget about that node.
DbgInfo->erase(N);
@@ -766,7 +795,6 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
/// maps and modified in place. Add it back to the CSE maps, unless an identical
/// node already exists, in which case transfer all its users to the existing
/// node. This transfer can potentially trigger recursive merging.
-///
void
SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
// For node types that aren't CSE'd, just act as if no identical node
@@ -807,8 +835,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
- if (const SDNodeFlags *Flags = N->getFlags())
- Node->intersectFlagsWith(Flags);
+ Node->intersectFlagsWith(N->getFlags());
return Node;
}
@@ -828,12 +855,10 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
- if (const SDNodeFlags *Flags = N->getFlags())
- Node->intersectFlagsWith(Flags);
+ Node->intersectFlagsWith(N->getFlags());
return Node;
}
-
/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
/// were replaced with those specified. If this node is never memoized,
/// return null, otherwise return a pointer to the slot it would take. If a
@@ -848,8 +873,7 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
AddNodeIDCustom(ID, N);
SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
if (Node)
- if (const SDNodeFlags *Flags = N->getFlags())
- Node->intersectFlagsWith(Flags);
+ Node->intersectFlagsWith(N->getFlags());
return Node;
}
@@ -863,19 +887,20 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
// EntryNode could meaningfully have debug info if we can find it...
SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
- : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL),
+ : TM(tm), OptLevel(OL),
EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
- Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
- UpdateListeners(nullptr) {
+ Root(getEntryNode()) {
InsertNode(&EntryNode);
DbgInfo = new SDDbgInfo();
}
-void SelectionDAG::init(MachineFunction &mf) {
- MF = &mf;
+void SelectionDAG::init(MachineFunction &NewMF,
+ OptimizationRemarkEmitter &NewORE) {
+ MF = &NewMF;
+ ORE = &NewORE;
TLI = getSubtarget().getTargetLowering();
TSI = getSubtarget().getSelectionDAGInfo();
- Context = &mf.getFunction()->getContext();
+ Context = &MF->getFunction()->getContext();
}
SelectionDAG::~SelectionDAG() {
@@ -895,29 +920,6 @@ void SelectionDAG::allnodes_clear() {
#endif
}
-SDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, const SDLoc &DL,
- SDVTList VTs, SDValue N1, SDValue N2,
- const SDNodeFlags *Flags) {
- SDValue Ops[] = {N1, N2};
-
- if (isBinOpWithFlags(Opcode)) {
- // If no flags were passed in, use a default flags object.
- SDNodeFlags F;
- if (Flags == nullptr)
- Flags = &F;
-
- auto *FN = newSDNode<BinaryWithFlagsSDNode>(Opcode, DL.getIROrder(),
- DL.getDebugLoc(), VTs, *Flags);
- createOperands(FN, Ops);
-
- return FN;
- }
-
- auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
- createOperands(N, Ops);
- return N;
-}
-
SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
void *&InsertPos) {
SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
@@ -979,6 +981,12 @@ void SelectionDAG::clear() {
DbgInfo->clear();
}
+SDValue SelectionDAG::getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType())
+ ? getNode(ISD::FP_EXTEND, DL, VT, Op)
+ : getNode(ISD::FP_ROUND, DL, VT, Op, getIntPtrConstant(0, DL));
+}
+
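
getFPExtendOrRound is a new floating-point sibling of the getAnyExtOrTrunc family visible just below. A usage sketch, assuming the definition above, with DAG, Op, and DL in scope:

    // f16 -> f32 becomes FP_EXTEND; f64 -> f32 becomes FP_ROUND with the
    // usual 0 operand marking the round as possibly value-changing.
    SDValue Coerced = DAG.getFPExtendOrRound(Op, DL, MVT::f32);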
SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::ANY_EXTEND, DL, VT, Op) :
@@ -1052,7 +1060,6 @@ SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
}
/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
-///
SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
EVT EltVT = VT.getScalarType();
SDValue NegOne =
@@ -1331,7 +1338,6 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
return SDValue(N, 0);
}
-
SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
unsigned Alignment, int Offset,
bool isTarget,
@@ -1465,7 +1471,7 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
// Validate that all indices in Mask are within the range of the elements
// input to the shuffle.
int NElts = Mask.size();
- assert(all_of(Mask, [&](int M) { return M < (NElts * 2); }) &&
+ assert(llvm::all_of(Mask, [&](int M) { return M < (NElts * 2); }) &&
"Index out of range");
// Copy the mask so we can do any needed cleanup.
@@ -1824,7 +1830,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);
int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
- return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
+ return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
}
SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
@@ -1837,7 +1843,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
int FrameIdx = MFI.CreateStackObject(Bytes, Align, false);
- return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
+ return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
}
SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
@@ -1953,7 +1959,7 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
/// use this predicate to simplify operations downstream.
bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
unsigned BitWidth = Op.getScalarValueSizeInBits();
- return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth);
+ return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
}
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
@@ -1961,9 +1967,9 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
/// for bits that V cannot have.
bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
unsigned Depth) const {
- APInt KnownZero, KnownOne;
- computeKnownBits(Op, KnownZero, KnownOne, Depth);
- return (KnownZero & Mask) == Mask;
+ KnownBits Known;
+ computeKnownBits(Op, Known, Depth);
+ return Mask.isSubsetOf(Known.Zero);
}
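
The recurring KnownZero/KnownOne pair is folded into the KnownBits struct throughout the rest of this file, and Mask.isSubsetOf(Known.Zero) computes the same predicate as the old (KnownZero & Mask) == Mask without materializing the intermediate APInt. A sketch against the updated interface, assuming a SelectionDAG reference DAG and the Op, Mask, and Depth of the function above:

    KnownBits Known(BitWidth);  // Known.Zero and Known.One start out all-zero
    DAG.computeKnownBits(Op, Known, Depth);
    bool AllMaskedBitsZero = Mask.isSubsetOf(Known.Zero);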
/// If a SHL/SRA/SRL node has a constant or splat constant shift amount that
@@ -1979,33 +1985,30 @@ static const APInt *getValidShiftAmountConstant(SDValue V) {
}
/// Determine which bits of Op are known to be either zero or one and return
-/// them in the KnownZero/KnownOne bitsets. For vectors, the known bits are
-/// those that are shared by every vector element.
-void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
- APInt &KnownOne, unsigned Depth) const {
+/// them in Known. For vectors, the known bits are those that are shared by
+/// every vector element.
+void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
+ unsigned Depth) const {
EVT VT = Op.getValueType();
APInt DemandedElts = VT.isVector()
? APInt::getAllOnesValue(VT.getVectorNumElements())
: APInt(1, 1);
- computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth);
+ computeKnownBits(Op, Known, DemandedElts, Depth);
}
/// Determine which bits of Op are known to be either zero or one and return
-/// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows
-/// us to only collect the known bits that are shared by the requested vector
-/// elements.
-/// TODO: We only support DemandedElts on a few opcodes so far, the remainder
-/// should be added when they become necessary.
-void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
- APInt &KnownOne, const APInt &DemandedElts,
+/// them in Known. The DemandedElts argument allows us to only collect the known
+/// bits that are shared by the requested vector elements.
+void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
unsigned Depth) const {
unsigned BitWidth = Op.getScalarValueSizeInBits();
- KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ Known = KnownBits(BitWidth); // Don't know anything.
if (Depth == 6)
return; // Limit search depth.
- APInt KnownZero2, KnownOne2;
+ KnownBits Known2;
unsigned NumElts = DemandedElts.getBitWidth();
if (!DemandedElts)
@@ -2015,35 +2018,34 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
switch (Opcode) {
case ISD::Constant:
// We know all of the bits for a constant!
- KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
- KnownZero = ~KnownOne;
+ Known.One = cast<ConstantSDNode>(Op)->getAPIntValue();
+ Known.Zero = ~Known.One;
break;
case ISD::BUILD_VECTOR:
// Collect the known bits that are shared by every demanded vector element.
assert(NumElts == Op.getValueType().getVectorNumElements() &&
"Unexpected vector size");
- KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
if (!DemandedElts[i])
continue;
SDValue SrcOp = Op.getOperand(i);
- computeKnownBits(SrcOp, KnownZero2, KnownOne2, Depth + 1);
+ computeKnownBits(SrcOp, Known2, Depth + 1);
// BUILD_VECTOR can implicitly truncate sources, we must handle this.
if (SrcOp.getValueSizeInBits() != BitWidth) {
assert(SrcOp.getValueSizeInBits() > BitWidth &&
"Expected BUILD_VECTOR implicit truncation");
- KnownOne2 = KnownOne2.trunc(BitWidth);
- KnownZero2 = KnownZero2.trunc(BitWidth);
+ Known2 = Known2.trunc(BitWidth);
}
// Known bits are the values that are shared by every demanded element.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
// If we don't know any bits, early out.
- if (!KnownOne && !KnownZero)
+ if (!Known.One && !Known.Zero)
break;
}
break;
@@ -2051,7 +2053,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// Collect the known bits that are shared by every vector element referenced
// by the shuffle.
APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
- KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero.setAllBits(); Known.One.setAllBits();
const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
for (unsigned i = 0; i != NumElts; ++i) {
@@ -2062,8 +2064,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if (M < 0) {
// For UNDEF elements, we don't know anything about the common state of
// the shuffle result.
- KnownOne.clearAllBits();
- KnownZero.clearAllBits();
+ Known.resetAll();
DemandedLHS.clearAllBits();
DemandedRHS.clearAllBits();
break;
@@ -2077,24 +2078,24 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// Known bits are the values that are shared by every demanded element.
if (!!DemandedLHS) {
SDValue LHS = Op.getOperand(0);
- computeKnownBits(LHS, KnownZero2, KnownOne2, DemandedLHS, Depth + 1);
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ computeKnownBits(LHS, Known2, DemandedLHS, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
}
// If we don't know any bits, early out.
- if (!KnownOne && !KnownZero)
+ if (!Known.One && !Known.Zero)
break;
if (!!DemandedRHS) {
SDValue RHS = Op.getOperand(1);
- computeKnownBits(RHS, KnownZero2, KnownOne2, DemandedRHS, Depth + 1);
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ computeKnownBits(RHS, Known2, DemandedRHS, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
}
break;
}
case ISD::CONCAT_VECTORS: {
// Split DemandedElts and test each of the demanded subvectors.
- KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero.setAllBits(); Known.One.setAllBits();
EVT SubVectorVT = Op.getOperand(0).getValueType();
unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
unsigned NumSubVectors = Op.getNumOperands();
@@ -2103,12 +2104,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
DemandedSub = DemandedSub.trunc(NumSubVectorElts);
if (!!DemandedSub) {
SDValue Sub = Op.getOperand(i);
- computeKnownBits(Sub, KnownZero2, KnownOne2, DemandedSub, Depth + 1);
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ computeKnownBits(Sub, Known2, DemandedSub, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
}
// If we don't know any bits, early out.
- if (!KnownOne && !KnownZero)
+ if (!Known.One && !Known.Zero)
break;
}
break;
@@ -2123,9 +2124,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
- computeKnownBits(Src, KnownZero, KnownOne, DemandedSrc, Depth + 1);
+ computeKnownBits(Src, Known, DemandedSrc, Depth + 1);
} else {
- computeKnownBits(Src, KnownZero, KnownOne, Depth + 1);
+ computeKnownBits(Src, Known, Depth + 1);
}
break;
}
@@ -2139,7 +2140,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// Fast handling of 'identity' bitcasts.
if (BitWidth == SubBitWidth) {
- computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
+ computeKnownBits(N0, Known, DemandedElts, Depth + 1);
break;
}
@@ -2163,10 +2164,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
SubDemandedElts.setBit(i * SubScale);
for (unsigned i = 0; i != SubScale; ++i) {
- computeKnownBits(N0, KnownZero2, KnownOne2, SubDemandedElts.shl(i),
+ computeKnownBits(N0, Known2, SubDemandedElts.shl(i),
Depth + 1);
- KnownOne |= KnownOne2.zext(BitWidth).shl(SubBitWidth * i);
- KnownZero |= KnownZero2.zext(BitWidth).shl(SubBitWidth * i);
+ Known.One |= Known2.One.zext(BitWidth).shl(SubBitWidth * i);
+ Known.Zero |= Known2.Zero.zext(BitWidth).shl(SubBitWidth * i);
}
}
@@ -2183,16 +2184,16 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if (DemandedElts[i])
SubDemandedElts.setBit(i / SubScale);
- computeKnownBits(N0, KnownZero2, KnownOne2, SubDemandedElts, Depth + 1);
+ computeKnownBits(N0, Known2, SubDemandedElts, Depth + 1);
- KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0; i != NumElts; ++i)
if (DemandedElts[i]) {
unsigned Offset = (i % SubScale) * BitWidth;
- KnownOne &= KnownOne2.lshr(Offset).trunc(BitWidth);
- KnownZero &= KnownZero2.lshr(Offset).trunc(BitWidth);
+ Known.One &= Known2.One.lshr(Offset).trunc(BitWidth);
+ Known.Zero &= Known2.Zero.lshr(Offset).trunc(BitWidth);
// If we don't know any bits, early out.
- if (!KnownOne && !KnownZero)
+ if (!Known.One && !Known.Zero)
break;
}
}
@@ -2200,107 +2201,90 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
}
case ISD::AND:
// If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
+ computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// Output known-1 bits are only known if set in both the LHS & RHS.
- KnownOne &= KnownOne2;
+ Known.One &= Known2.One;
// Output known-0 are known to be clear if zero in either the LHS | RHS.
- KnownZero |= KnownZero2;
+ Known.Zero |= Known2.Zero;
break;
case ISD::OR:
- computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
+ computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// Output known-0 bits are only known if clear in both the LHS & RHS.
- KnownZero &= KnownZero2;
+ Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
- KnownOne |= KnownOne2;
+ Known.One |= Known2.One;
break;
case ISD::XOR: {
- computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
+ computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
- KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
- KnownZero = KnownZeroOut;
+ Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
+ Known.Zero = KnownZeroOut;
break;
}
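
A quick standalone check of the two XOR identities used above: a result bit is known zero when the operand bits are known equal, and known one when they are known unequal. The 8-bit masks are made-up examples.

#include <cassert>
#include <cstdint>

int main() {
  uint8_t LZero = 0x0C, LOne = 0x03; // LHS low nibble known to be 0b0011
  uint8_t RZero = 0x0A, ROne = 0x05; // RHS low nibble known to be 0b0101
  uint8_t Zero = (LZero & RZero) | (LOne & ROne);  // bits known equal
  uint8_t One  = (LZero & ROne)  | (LOne & RZero); // bits known unequal
  assert(One == 0x06 && Zero == 0x09); // 0b0011 ^ 0b0101 == 0b0110
  assert((Zero & One) == 0);           // invariant: a bit is never both
  return 0;
}
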
case ISD::MUL: {
- computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
+ computeKnownBits(Op.getOperand(1), Known, DemandedElts, Depth + 1);
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// If low bits are zero in either operand, output low known-0 bits.
// Also compute a conservative estimate for high known-0 bits.
// More trickiness is possible, but this is sufficient for the
// interesting case of alignment computation.
- KnownOne.clearAllBits();
- unsigned TrailZ = KnownZero.countTrailingOnes() +
- KnownZero2.countTrailingOnes();
- unsigned LeadZ = std::max(KnownZero.countLeadingOnes() +
- KnownZero2.countLeadingOnes(),
+ unsigned TrailZ = Known.countMinTrailingZeros() +
+ Known2.countMinTrailingZeros();
+ unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
+ Known2.countMinLeadingZeros(),
BitWidth) - BitWidth;
- TrailZ = std::min(TrailZ, BitWidth);
- LeadZ = std::min(LeadZ, BitWidth);
- KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
- APInt::getHighBitsSet(BitWidth, LeadZ);
+ Known.resetAll();
+ Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
+ Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
break;
}
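
The MUL case keeps only alignment-style facts: trailing zeros of the factors add in the product, and leading zeros give a conservative width bound. A standalone check of the trailing-zero claim, assuming the GCC/Clang __builtin_ctz intrinsic:

#include <cassert>
#include <cstdint>

static unsigned ctz(uint32_t V) { return V ? __builtin_ctz(V) : 32; }

int main() {
  uint32_t X = 24; // 3 trailing zeros (8 * 3)
  uint32_t Y = 20; // 2 trailing zeros (4 * 5)
  // x = a*2^m, y = b*2^n  =>  x*y = a*b*2^(m+n)
  assert(ctz(X * Y) >= ctz(X) + ctz(Y));
  return 0;
}
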
case ISD::UDIV: {
// For the purposes of computing leading zeros we can conservatively
// treat a udiv as a logical right shift by the power of 2 known to
// be less than the denominator.
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
- unsigned LeadZ = KnownZero2.countLeadingOnes();
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ unsigned LeadZ = Known2.countMinLeadingZeros();
- computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
- unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
- if (RHSUnknownLeadingOnes != BitWidth)
- LeadZ = std::min(BitWidth,
- LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+ unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
+ if (RHSMaxLeadingZeros != BitWidth)
+ LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);
- KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+ Known.Zero.setHighBits(LeadZ);
break;
}
case ISD::SELECT:
- computeKnownBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(2), Known, Depth+1);
// If we don't know any bits, early out.
- if (!KnownOne && !KnownZero)
+ if (!Known.One && !Known.Zero)
break;
- computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+ computeKnownBits(Op.getOperand(1), Known2, Depth+1);
// Only known if known in both the LHS and RHS.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
break;
case ISD::SELECT_CC:
- computeKnownBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(3), Known, Depth+1);
// If we don't know any bits, early out.
- if (!KnownOne && !KnownZero)
+ if (!Known.One && !Known.Zero)
break;
- computeKnownBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1);
+ computeKnownBits(Op.getOperand(2), Known2, Depth+1);
// Only known if known in both the LHS and RHS.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
break;
- case ISD::SADDO:
- case ISD::UADDO:
- case ISD::SSUBO:
- case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
if (Op.getResNo() != 1)
@@ -2312,51 +2296,46 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ Known.Zero.setBitsFrom(1);
break;
case ISD::SETCC:
// If we know the result of a setcc has the top bits zero, use this info.
if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrOneBooleanContent &&
BitWidth > 1)
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ Known.Zero.setBitsFrom(1);
break;
case ISD::SHL:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- KnownZero = KnownZero << *ShAmt;
- KnownOne = KnownOne << *ShAmt;
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ Known.Zero <<= *ShAmt;
+ Known.One <<= *ShAmt;
// Low bits are known zero.
- KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt->getZExtValue());
+ Known.Zero.setLowBits(ShAmt->getZExtValue());
}
break;
case ISD::SRL:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- KnownZero = KnownZero.lshr(*ShAmt);
- KnownOne = KnownOne.lshr(*ShAmt);
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ Known.Zero.lshrInPlace(*ShAmt);
+ Known.One.lshrInPlace(*ShAmt);
// High bits are known zero.
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue());
- KnownZero |= HighBits;
+ Known.Zero.setHighBits(ShAmt->getZExtValue());
}
break;
case ISD::SRA:
if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- KnownZero = KnownZero.lshr(*ShAmt);
- KnownOne = KnownOne.lshr(*ShAmt);
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ Known.Zero.lshrInPlace(*ShAmt);
+ Known.One.lshrInPlace(*ShAmt);
// If we know the value of the sign bit, then we know it is copied across
// the high bits by the shift amount.
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue());
- APInt SignBit = APInt::getSignBit(BitWidth);
- SignBit = SignBit.lshr(*ShAmt); // Adjust to where it is now in the mask.
- if (KnownZero.intersects(SignBit)) {
- KnownZero |= HighBits; // New bits are known zero.
- } else if (KnownOne.intersects(SignBit)) {
- KnownOne |= HighBits; // New bits are known one.
+ APInt SignMask = APInt::getSignMask(BitWidth);
+ SignMask.lshrInPlace(*ShAmt); // Adjust to where it is now in the mask.
+ if (Known.Zero.intersects(SignMask)) {
+ Known.Zero.setHighBits(ShAmt->getZExtValue()); // New bits are known zero.
+ } else if (Known.One.intersects(SignMask)) {
+ Known.One.setHighBits(ShAmt->getZExtValue()); // New bits are known one.
}
}
break;
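
The SRA case keys off a possibly shifted sign bit: once its value is known, every vacated high bit takes that value. A tiny 8-bit check (assuming the usual arithmetic right shift for negative signed values, which is implementation-defined before C++20 but universal on two's-complement targets):

#include <cassert>
#include <cstdint>

int main() {
  int8_t Neg = int8_t(0x90);         // sign bit known one
  assert(uint8_t(Neg >> 3) == 0xF2); // top 3 vacated bits become one
  int8_t Pos = 0x50;                 // sign bit known zero
  assert(uint8_t(Pos >> 3) == 0x0A); // top 3 vacated bits become zero
  return 0;
}
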
@@ -2368,42 +2347,56 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// present in the input.
APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);
- APInt InSignBit = APInt::getSignBit(EBits);
+ APInt InSignMask = APInt::getSignMask(EBits);
APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);
// If the sign extended bits are demanded, we know that the sign
// bit is demanded.
- InSignBit = InSignBit.zext(BitWidth);
+ InSignMask = InSignMask.zext(BitWidth);
if (NewBits.getBoolValue())
- InputDemandedBits |= InSignBit;
+ InputDemandedBits |= InSignMask;
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- KnownOne &= InputDemandedBits;
- KnownZero &= InputDemandedBits;
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ Known.One &= InputDemandedBits;
+ Known.Zero &= InputDemandedBits;
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
- if (KnownZero.intersects(InSignBit)) { // Input sign bit known clear
- KnownZero |= NewBits;
- KnownOne &= ~NewBits;
- } else if (KnownOne.intersects(InSignBit)) { // Input sign bit known set
- KnownOne |= NewBits;
- KnownZero &= ~NewBits;
+ if (Known.Zero.intersects(InSignMask)) { // Input sign bit known clear
+ Known.Zero |= NewBits;
+ Known.One &= ~NewBits;
+ } else if (Known.One.intersects(InSignMask)) { // Input sign bit known set
+ Known.One |= NewBits;
+ Known.Zero &= ~NewBits;
} else { // Input sign bit unknown
- KnownZero &= ~NewBits;
- KnownOne &= ~NewBits;
+ Known.Zero &= ~NewBits;
+ Known.One &= ~NewBits;
}
break;
}
case ISD::CTTZ:
- case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF: {
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleTZ = Known2.countMaxTrailingZeros();
+ unsigned LowBits = Log2_32(PossibleTZ) + 1;
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
case ISD::CTLZ:
- case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTLZ_ZERO_UNDEF: {
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ // If we have a known 1, its position is our upper bound.
+ unsigned PossibleLZ = Known2.countMaxLeadingZeros();
+ unsigned LowBits = Log2_32(PossibleLZ) + 1;
+ Known.Zero.setBitsFrom(LowBits);
+ break;
+ }
case ISD::CTPOP: {
- unsigned LowBits = Log2_32(BitWidth)+1;
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
- KnownOne.clearAllBits();
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ // If we know some of the bits are zero, they can't be one.
+ unsigned PossibleOnes = Known2.countMaxPopulation();
+ Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1);
break;
}
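
CTPOP, CTLZ and CTTZ now derive an upper bound from the operand facts and clear everything above Log2(bound)+1 bits. A standalone sanity check for the weakest case, a fully unknown 32-bit operand:

#include <cassert>

int main() {
  unsigned BitWidth = 32, MaxPop = BitWidth; // nothing known: up to 32 ones
  unsigned Log2 = 0;
  for (unsigned V = MaxPop; V > 1; V >>= 1) ++Log2; // Log2_32(32) == 5
  unsigned LowBits = Log2 + 1;                      // popcount fits in 6 bits
  assert(LowBits == 6); // so bits [6, 32) of CTPOP are known zero
  return 0;
}
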
case ISD::LOAD: {
@@ -2412,76 +2405,87 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
EVT VT = LD->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ Known.Zero.setBitsFrom(MemBits);
} else if (const MDNode *Ranges = LD->getRanges()) {
if (LD->getExtensionType() == ISD::NON_EXTLOAD)
- computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne);
+ computeKnownBitsFromRangeMetadata(*Ranges, Known);
}
break;
}
- case ISD::ZERO_EXTEND: {
+ case ISD::ZERO_EXTEND_VECTOR_INREG: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
- APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
- KnownZero = KnownZero.trunc(InBits);
- KnownOne = KnownOne.trunc(InBits);
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
+ Known = Known.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), Known,
+ DemandedElts.zext(InVT.getVectorNumElements()),
Depth + 1);
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
- KnownZero |= NewBits;
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(InBits);
break;
}
+ case ISD::ZERO_EXTEND: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getScalarSizeInBits();
+ Known = Known.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(InBits);
+ break;
+ }
+ // TODO ISD::SIGN_EXTEND_VECTOR_INREG
case ISD::SIGN_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
- KnownZero = KnownZero.trunc(InBits);
- KnownOne = KnownOne.trunc(InBits);
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
+ Known = Known.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
// If the sign bit is known to be zero or one, then sext will extend
// it to the top bits, else it will just zext.
- KnownZero = KnownZero.sext(BitWidth);
- KnownOne = KnownOne.sext(BitWidth);
+ Known = Known.sext(BitWidth);
break;
}
case ISD::ANY_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
- KnownZero = KnownZero.trunc(InBits);
- KnownOne = KnownOne.trunc(InBits);
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
+ Known = Known.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), Known, Depth+1);
+ Known = Known.zext(BitWidth);
break;
}
case ISD::TRUNCATE: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
- KnownZero = KnownZero.zext(InBits);
- KnownOne = KnownOne.zext(InBits);
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- KnownZero = KnownZero.trunc(BitWidth);
- KnownOne = KnownOne.trunc(BitWidth);
+ Known = Known.zext(InBits);
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ Known = Known.trunc(BitWidth);
break;
}
case ISD::AssertZext: {
EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
- KnownZero |= (~InMask);
- KnownOne &= (~KnownZero);
+ computeKnownBits(Op.getOperand(0), Known, Depth+1);
+ Known.Zero |= (~InMask);
+ Known.One &= (~Known.Zero);
break;
}
case ISD::FGETSIGN:
// All bits are zero except the low bit.
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ Known.Zero.setBitsFrom(1);
break;
-
- case ISD::SUB: {
+ case ISD::USUBO:
+ case ISD::SSUBO:
+ if (Op.getResNo() == 1) {
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case ISD::SUB:
+ case ISD::SUBC: {
if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) {
// We know that the top bits of C-X are clear if X contains fewer bits
// than C (i.e. no wrap-around can happen). For example, 20-X is
@@ -2490,22 +2494,47 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
// NLZ can't be BitWidth with no sign bit
APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
- computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
Depth + 1);
// If all of the MaskV bits are known to be zero, then we know the
// output top bits are zero, because we now know that the output is
// from [0-C].
- if ((KnownZero2 & MaskV) == MaskV) {
+ if ((Known2.Zero & MaskV) == MaskV) {
unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
// Top bits known zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
+ Known.Zero.setHighBits(NLZ2);
}
}
}
- LLVM_FALLTHROUGH;
+
+ // If low bits are known to be zero in both operands, then we know they are
+ // going to be 0 in the result. Both addition and complement operations
+ // preserve the low zero bits.
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
+ if (KnownZeroLow == 0)
+ break;
+
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
+ Known.Zero.setLowBits(KnownZeroLow);
+ break;
}
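
A standalone check of the low-bit fact just added for SUB/SUBC: if the low K bits are zero in both operands, they are zero in the difference as well, since borrows only propagate toward the high end. The loop range and the choice K = 3 are arbitrary test parameters.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 1024; A += 8)   // low 3 bits known zero
    for (uint32_t B = 0; B < 1024; B += 8) // low 3 bits known zero
      assert(((A - B) & 7u) == 0);         // holds even when A - B wraps
  return 0;
}
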
+ case ISD::UADDO:
+ case ISD::SADDO:
+ case ISD::ADDCARRY:
+ if (Op.getResNo() == 1) {
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ LLVM_FALLTHROUGH;
case ISD::ADD:
+ case ISD::ADDC:
case ISD::ADDE: {
// Output known-0 bits are known if clear or set in both the low clear bits
// common to both LHS & RHS. For example, 8+(X<<3) is known to have the
@@ -2514,31 +2543,28 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
// known to be clear. For example, if one input has the top 10 bits clear
// and the other has the top 8 bits clear, we know the top 7 bits of the
// output must be clear.
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
- unsigned KnownZeroHigh = KnownZero2.countLeadingOnes();
- unsigned KnownZeroLow = KnownZero2.countTrailingOnes();
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
- computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts,
Depth + 1);
- KnownZeroHigh = std::min(KnownZeroHigh,
- KnownZero2.countLeadingOnes());
- KnownZeroLow = std::min(KnownZeroLow,
- KnownZero2.countTrailingOnes());
-
- if (Opcode == ISD::ADD) {
- KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow);
- if (KnownZeroHigh > 1)
- KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1);
+ KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
+
+ if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) {
+ // With ADDE and ADDCARRY, a carry bit may be added in, so we can only
+ // use this information if we know (at least) that the low two bits are
+ // clear. We then return to the caller that the low bit is unknown but
+ // that other bits are known zero.
+ if (KnownZeroLow >= 2)
+ Known.Zero.setBits(1, KnownZeroLow);
break;
}
- // With ADDE, a carry bit may be added in, so we can only use this
- // information if we know (at least) that the low two bits are clear. We
- // then return to the caller that the low bit is unknown but that other bits
- // are known zero.
- if (KnownZeroLow >= 2) // ADDE
- KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow);
+ Known.Zero.setLowBits(KnownZeroLow);
+ if (KnownZeroHigh > 1)
+ Known.Zero.setHighBits(KnownZeroHigh - 1);
break;
}
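
Why ADDE/ADDCARRY demand two clear low bits before claiming anything: the incoming carry can set bit 0, but it cannot propagate past it when bits [1, K) are clear in both inputs. An exhaustive 8-bit check of that claim (a standalone model, not DAG code):

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; A += 4)   // low 2 bits clear
    for (unsigned B = 0; B < 256; B += 4) // low 2 bits clear
      for (unsigned Carry = 0; Carry <= 1; ++Carry)
        assert((((A + B + Carry) >> 1) & 1u) == 0); // bit 1 stays known zero
  return 0;
}
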
case ISD::SREM:
@@ -2546,23 +2572,22 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
const APInt &RA = Rem->getAPIntValue().abs();
if (RA.isPowerOf2()) {
APInt LowBits = RA - 1;
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// The low bits of the first operand are unchanged by the srem.
- KnownZero = KnownZero2 & LowBits;
- KnownOne = KnownOne2 & LowBits;
+ Known.Zero = Known2.Zero & LowBits;
+ Known.One = Known2.One & LowBits;
// If the first operand is non-negative or has all low bits zero, then
// the upper bits are all zero.
- if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
- KnownZero |= ~LowBits;
+ if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits))
+ Known.Zero |= ~LowBits;
// If the first operand is negative and not all low bits are zero, then
// the upper bits are all one.
- if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
- KnownOne |= ~LowBits;
- assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?");
+ if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0))
+ Known.One |= ~LowBits;
+ assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
}
break;
@@ -2571,41 +2596,37 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
const APInt &RA = Rem->getAPIntValue();
if (RA.isPowerOf2()) {
APInt LowBits = (RA - 1);
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
// The upper bits are all zero, the lower ones are unchanged.
- KnownZero = KnownZero2 | ~LowBits;
- KnownOne = KnownOne2 & LowBits;
+ Known.Zero = Known2.Zero | ~LowBits;
+ Known.One = Known2.One & LowBits;
break;
}
}
// Since the result is less than or equal to either operand, any leading
// zero bits in either operand must also exist in the result.
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
- Depth + 1);
- computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
- uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
- KnownZero2.countLeadingOnes());
- KnownOne.clearAllBits();
- KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+ uint32_t Leaders =
+ std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
+ Known.resetAll();
+ Known.Zero.setHighBits(Leaders);
break;
}
case ISD::EXTRACT_ELEMENT: {
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(0), Known, Depth+1);
const unsigned Index = Op.getConstantOperandVal(1);
const unsigned BitWidth = Op.getValueSizeInBits();
// Remove low part of known bits mask
- KnownZero = KnownZero.getHiBits(KnownZero.getBitWidth() - Index * BitWidth);
- KnownOne = KnownOne.getHiBits(KnownOne.getBitWidth() - Index * BitWidth);
+ Known.Zero = Known.Zero.getHiBits(Known.Zero.getBitWidth() - Index * BitWidth);
+ Known.One = Known.One.getHiBits(Known.One.getBitWidth() - Index * BitWidth);
// Remove high part of known bit mask
- KnownZero = KnownZero.trunc(BitWidth);
- KnownOne = KnownOne.trunc(BitWidth);
+ Known = Known.trunc(BitWidth);
break;
}
case ISD::EXTRACT_VECTOR_ELT: {
@@ -2617,24 +2638,20 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
const unsigned NumSrcElts = VecVT.getVectorNumElements();
// If BitWidth > EltBitWidth the value is anyext'ed. So we do not know
// anything about the extended bits.
- if (BitWidth > EltBitWidth) {
- KnownZero = KnownZero.trunc(EltBitWidth);
- KnownOne = KnownOne.trunc(EltBitWidth);
- }
+ if (BitWidth > EltBitWidth)
+ Known = Known.trunc(EltBitWidth);
ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
// If we know the element index, just demand that vector element.
unsigned Idx = ConstEltNo->getZExtValue();
APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
- computeKnownBits(InVec, KnownZero, KnownOne, DemandedElt, Depth + 1);
+ computeKnownBits(InVec, Known, DemandedElt, Depth + 1);
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
- computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1);
- }
- if (BitWidth > EltBitWidth) {
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
+ computeKnownBits(InVec, Known, Depth + 1);
}
+ if (BitWidth > EltBitWidth)
+ Known = Known.zext(BitWidth);
break;
}
case ISD::INSERT_VECTOR_ELT: {
@@ -2646,60 +2663,110 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
// If we know the element index, split the demand between the
// source vector and the inserted element.
- KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth);
unsigned EltIdx = CEltNo->getZExtValue();
// If we demand the inserted element then add its common known bits.
if (DemandedElts[EltIdx]) {
- computeKnownBits(InVal, KnownZero2, KnownOne2, Depth + 1);
- KnownOne &= KnownOne2.zextOrTrunc(KnownOne.getBitWidth());
- KnownZero &= KnownZero2.zextOrTrunc(KnownZero.getBitWidth());;
+ computeKnownBits(InVal, Known2, Depth + 1);
+ Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
+ Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
}
// If we demand the source vector then add its common known bits, ensuring
// that we don't demand the inserted element.
APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
if (!!VectorElts) {
- computeKnownBits(InVec, KnownZero2, KnownOne2, VectorElts, Depth + 1);
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ computeKnownBits(InVec, Known2, VectorElts, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
}
} else {
// Unknown element index, so ignore DemandedElts and demand them all.
- computeKnownBits(InVec, KnownZero, KnownOne, Depth + 1);
- computeKnownBits(InVal, KnownZero2, KnownOne2, Depth + 1);
- KnownOne &= KnownOne2.zextOrTrunc(KnownOne.getBitWidth());
- KnownZero &= KnownZero2.zextOrTrunc(KnownZero.getBitWidth());;
+ computeKnownBits(InVec, Known, Depth + 1);
+ computeKnownBits(InVal, Known2, Depth + 1);
+ Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
+ Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
}
break;
}
+ case ISD::BITREVERSE: {
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ Known.Zero = Known2.Zero.reverseBits();
+ Known.One = Known2.One.reverseBits();
+ break;
+ }
case ISD::BSWAP: {
- computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+ Known.Zero = Known2.Zero.byteSwap();
+ Known.One = Known2.One.byteSwap();
+ break;
+ }
+ case ISD::ABS: {
+ computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1);
+
+ // If the source's MSB is zero then we know the rest of the bits already.
+ if (Known2.isNonNegative()) {
+ Known.Zero = Known2.Zero;
+ Known.One = Known2.One;
+ break;
+ }
+
+ // We only know that the absolute value's MSB will be zero iff there is
+ // a set bit that isn't the sign bit (otherwise it could be INT_MIN).
+ Known2.One.clearSignBit();
+ if (Known2.One.getBoolValue()) {
+ Known.Zero = APInt::getSignMask(BitWidth);
+ break;
+ }
+ break;
+ }
+ case ISD::UMIN: {
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+
+ // UMIN - we know that the result will have the maximum of the
+ // known zero leading bits of the inputs.
+ unsigned LeadZero = Known.countMinLeadingZeros();
+ LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros());
+
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
+ Known.Zero.setHighBits(LeadZero);
+ break;
+ }
+ case ISD::UMAX: {
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts,
Depth + 1);
- KnownZero = KnownZero2.byteSwap();
- KnownOne = KnownOne2.byteSwap();
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+
+ // UMAX - we know that the result will have the maximum of the
+ // known one leading bits of the inputs.
+ unsigned LeadOne = Known.countMinLeadingOnes();
+ LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes());
+
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
+ Known.One.setHighBits(LeadOne);
break;
}
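
The split UMIN/UMAX cases replace the old conservative intersection with ordering facts: the unsigned minimum inherits the larger of the operands' leading-zero counts (and dually, UMAX inherits leading ones). A small check, assuming the GCC/Clang __builtin_clz intrinsic:

#include <algorithm>
#include <cassert>
#include <cstdint>

static unsigned clz8(uint8_t V) { return V ? __builtin_clz(V) - 24 : 8; }

int main() {
  uint8_t A = 0x1F, B = 0xF0;
  unsigned Bound = std::max(clz8(A), clz8(B)); // 3, contributed by A
  assert(clz8(std::min(A, B)) >= Bound);       // the min can't exceed A
  return 0;
}
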
case ISD::SMIN:
- case ISD::SMAX:
- case ISD::UMIN:
- case ISD::UMAX: {
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
+ case ISD::SMAX: {
+ computeKnownBits(Op.getOperand(0), Known, DemandedElts,
Depth + 1);
// If we don't know any bits, early out.
- if (!KnownOne && !KnownZero)
+ if (!Known.One && !Known.Zero)
break;
- computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
- Depth + 1);
- KnownZero &= KnownZero2;
- KnownOne &= KnownOne2;
+ computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
break;
}
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
if (unsigned Align = InferPtrAlignment(Op)) {
// The low bits are known zero if the pointer is aligned.
- KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align));
+ Known.Zero.setLowBits(Log2_32(Align));
break;
}
break;
@@ -2712,11 +2779,45 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
case ISD::INTRINSIC_W_CHAIN:
case ISD::INTRINSIC_VOID:
// Allow the target to implement this method for its nodes.
- TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth);
+ TLI->computeKnownBitsForTargetNode(Op, Known, DemandedElts, *this, Depth);
break;
}
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
+}
+
+SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
+ SDValue N1) const {
+ // X + 0 never overflows
+ if (isNullConstant(N1))
+ return OFK_Never;
+
+ KnownBits N1Known;
+ computeKnownBits(N1, N1Known);
+ if (N1Known.Zero.getBoolValue()) {
+ KnownBits N0Known;
+ computeKnownBits(N0, N0Known);
+
+ bool overflow;
+ (void)(~N0Known.Zero).uadd_ov(~N1Known.Zero, overflow);
+ if (!overflow)
+ return OFK_Never;
+ }
+
+ // mulhi + 1 never overflows
+ if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 &&
+ (~N1Known.Zero & 0x01) == ~N1Known.Zero)
+ return OFK_Never;
+
+ if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) {
+ KnownBits N0Known;
+ computeKnownBits(N0, N0Known);
+
+ if ((~N0Known.Zero & 0x01) == ~N0Known.Zero)
+ return OFK_Never;
+ }
+
+ return OFK_Sometime;
}
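
The core test in the new computeOverflowKind: if even the largest values compatible with the known-zero masks (~Zero on each side) add without unsigned overflow, the real addition can never overflow. Modeled standalone with 32-bit masks in place of APInt::uadd_ov:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t N0Zero = 0xFFFF0000; // N0: high 16 bits known zero
  uint32_t N1Zero = 0xFFFF0000; // N1: high 16 bits known zero
  uint64_t MaxSum = uint64_t(~N0Zero) + uint64_t(~N1Zero);
  assert(MaxSum <= UINT32_MAX); // worst case fits: report OFK_Never
  return 0;
}
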
bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
@@ -2730,7 +2831,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
// A left-shift of a constant one will have exactly one bit set because
// shifting the bit off the end is undefined.
if (Val.getOpcode() == ISD::SHL) {
- auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0));
+ auto *C = isConstOrConstSplat(Val.getOperand(0));
if (C && C->getAPIntValue() == 1)
return true;
}
@@ -2738,14 +2839,14 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
// Similarly, a logical right-shift of a constant sign-bit will have exactly
// one bit set.
if (Val.getOpcode() == ISD::SRL) {
- auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0));
- if (C && C->getAPIntValue().isSignBit())
+ auto *C = isConstOrConstSplat(Val.getOperand(0));
+ if (C && C->getAPIntValue().isSignMask())
return true;
}
// Are all operands of a build vector constant powers of two?
if (Val.getOpcode() == ISD::BUILD_VECTOR)
- if (llvm::all_of(Val->ops(), [this, BitWidth](SDValue E) {
+ if (llvm::all_of(Val->ops(), [BitWidth](SDValue E) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(E))
return C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2();
return false;
@@ -2756,22 +2857,34 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
// to handle some common cases.
// Fall back to computeKnownBits to catch other known cases.
- APInt KnownZero, KnownOne;
- computeKnownBits(Val, KnownZero, KnownOne);
- return (KnownZero.countPopulation() == BitWidth - 1) &&
- (KnownOne.countPopulation() == 1);
+ KnownBits Known;
+ computeKnownBits(Val, Known);
+ return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1);
}
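
The rewritten fallback in isKnownToBeAPowerOfTwo asks for a minimum and maximum popcount of exactly one, i.e. one bit forced to one and every other bit forced to zero. A standalone model, with 8-bit masks and the GCC/Clang __builtin_popcount intrinsic as assumptions:

#include <cassert>
#include <cstdint>

int main() {
  uint8_t Zero = 0xFB, One = 0x04;                // bit 2 one, the rest zero
  unsigned MinPop = __builtin_popcount(One);      // bits that must be one
  unsigned MaxPop = 8 - __builtin_popcount(Zero); // bits that may be one
  assert(MinPop == 1 && MaxPop == 1);             // exactly a power of two
  return 0;
}
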
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
EVT VT = Op.getValueType();
+ APInt DemandedElts = VT.isVector()
+ ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return ComputeNumSignBits(Op, DemandedElts, Depth);
+}
+
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
assert(VT.isInteger() && "Invalid VT!");
unsigned VTBits = VT.getScalarSizeInBits();
+ unsigned NumElts = DemandedElts.getBitWidth();
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
if (Depth == 6)
return 1; // Limit search depth.
+ if (!DemandedElts)
+ return 1; // No demanded elts, better to assume we don't know anything.
+
switch (Op.getOpcode()) {
default: break;
case ISD::AssertSext:
@@ -2786,7 +2899,61 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
return Val.getNumSignBits();
}
+ case ISD::BUILD_VECTOR:
+ Tmp = VTBits;
+ for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
+ if (!DemandedElts[i])
+ continue;
+
+ SDValue SrcOp = Op.getOperand(i);
+ Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
+
+ // BUILD_VECTOR can implicitly truncate sources; we must handle this.
+ if (SrcOp.getValueSizeInBits() != VTBits) {
+ assert(SrcOp.getValueSizeInBits() > VTBits &&
+ "Expected BUILD_VECTOR implicit truncation");
+ unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
+ Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
+ }
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ return Tmp;
+
+ case ISD::VECTOR_SHUFFLE: {
+ // Collect the minimum number of sign bits that are shared by every vector
+ // element referenced by the shuffle.
+ APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+ assert(NumElts == SVN->getMask().size() && "Unexpected vector size");
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = SVN->getMaskElt(i);
+ if (!DemandedElts[i])
+ continue;
+ // For UNDEF elements, we don't know anything about the common state of
+ // the shuffle result.
+ if (M < 0)
+ return 1;
+ if ((unsigned)M < NumElts)
+ DemandedLHS.setBit((unsigned)M % NumElts);
+ else
+ DemandedRHS.setBit((unsigned)M % NumElts);
+ }
+ Tmp = std::numeric_limits<unsigned>::max();
+ if (!!DemandedLHS)
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
+ if (!!DemandedRHS) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ // If we don't know anything, early out and try computeKnownBits fall-back.
+ if (Tmp == 1)
+ break;
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
+
case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
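
Sign-bit accounting for the SIGN_EXTEND case above: widening adds exactly (dst - src) copies of the sign bit on top of whatever the source already had. A standalone 8-to-32-bit check with a hand-rolled sign-bit counter:

#include <cassert>
#include <cstdint>

// Counts leading bits equal to the sign bit, sign bit included.
static unsigned signBits32(int32_t V) {
  uint32_t U = uint32_t(V);
  unsigned Sign = (U >> 31) & 1;
  unsigned N = 1;
  while (N < 32 && ((U >> (31 - N)) & 1) == Sign)
    ++N;
  return N;
}

int main() {
  int8_t Src = -3;                   // 0b11111101: 6 sign bits in 8 bits
  int32_t Ext = Src;                 // the SIGN_EXTEND
  assert(signBits32(Ext) == 6 + 24); // source sign bits plus the added width
  return 0;
}
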
@@ -2799,7 +2966,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
return std::max(Tmp, Tmp2);
case ISD::SRA:
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
// SRA X, C -> adds C sign bits.
if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) {
APInt ShiftVal = C->getAPIntValue();
@@ -2887,6 +3054,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
}
break;
case ISD::ADD:
+ case ISD::ADDC:
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
@@ -2895,17 +3063,17 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
// Special case decrementing a value (ADD X, -1):
if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
if (CRHS->isAllOnesValue()) {
- APInt KnownZero, KnownOne;
- computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownBits Known;
+ computeKnownBits(Op.getOperand(0), Known, Depth+1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
- if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnesValue())
return VTBits;
// If we are subtracting one from a positive number, there is no carry
// out of the result.
- if (KnownZero.isNegative())
+ if (Known.isNonNegative())
return Tmp;
}
@@ -2920,16 +3088,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
// Handle NEG.
if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
if (CLHS->isNullValue()) {
- APInt KnownZero, KnownOne;
- computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+ KnownBits Known;
+ computeKnownBits(Op.getOperand(1), Known, Depth+1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
- if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue())
+ if ((Known.Zero | 1).isAllOnesValue())
return VTBits;
// If the input is known to be positive (the sign bit is known clear),
// the output of the NEG has the same number of sign bits as the input.
- if (KnownZero.isNegative())
+ if (Known.isNonNegative())
return Tmp2;
// Otherwise, we treat this like a SUB.
@@ -2961,28 +3129,98 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
// result. Otherwise it gives either a negative result or one larger than
// the bitwidth.
return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
}
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue InVec = Op.getOperand(0);
+ SDValue InVal = Op.getOperand(1);
+ SDValue EltNo = Op.getOperand(2);
+ unsigned NumElts = InVec.getValueType().getVectorNumElements();
+
+ ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
+ // If we know the element index, split the demand between the
+ // source vector and the inserted element.
+ unsigned EltIdx = CEltNo->getZExtValue();
+
+ // If we demand the inserted element then get its sign bits.
+ Tmp = std::numeric_limits<unsigned>::max();
+ if (DemandedElts[EltIdx]) {
+ // TODO - handle implicit truncation of inserted elements.
+ if (InVal.getScalarValueSizeInBits() != VTBits)
+ break;
+ Tmp = ComputeNumSignBits(InVal, Depth + 1);
+ }
+
+ // If we demand the source vector then get its sign bits, and determine
+ // the minimum.
+ APInt VectorElts = DemandedElts;
+ VectorElts.clearBit(EltIdx);
+ if (!!VectorElts) {
+ Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ } else {
+ // Unknown element index, so ignore DemandedElts and demand them all.
+ Tmp = ComputeNumSignBits(InVec, Depth + 1);
+ Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+ return Tmp;
+ }
case ISD::EXTRACT_VECTOR_ELT: {
- // At the moment we keep this simple and skip tracking the specific
- // element. This way we get the lowest common denominator for all elements
- // of the vector.
- // TODO: get information for given vector element
+ SDValue InVec = Op.getOperand(0);
+ SDValue EltNo = Op.getOperand(1);
+ EVT VecVT = InVec.getValueType();
const unsigned BitWidth = Op.getValueSizeInBits();
const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+ const unsigned NumSrcElts = VecVT.getVectorNumElements();
+
+ // If BitWidth > EltBitWidth the value is anyext'ed, and we do not know
// anything about sign bits. But if the sizes match we can derive knowledge
// about sign bits from the vector operand.
- if (BitWidth == EltBitWidth)
- return ComputeNumSignBits(Op.getOperand(0), Depth+1);
- break;
+ if (BitWidth != EltBitWidth)
+ break;
+
+ // If we know the element index, just demand that vector element, else for
+ // an unknown element index, ignore DemandedElts and demand them all.
+ APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+ ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+ DemandedSrcElts =
+ APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+ return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
+ }
+ case ISD::EXTRACT_SUBVECTOR: {
+ // If we know the subvector index, demand only that subvector's elements;
+ // otherwise demand them all.
+ SDValue Src = Op.getOperand(0);
+ ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+ // Offset the demanded elts by the subvector index.
+ uint64_t Idx = SubIdx->getZExtValue();
+ APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx);
+ return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+ }
+ return ComputeNumSignBits(Src, Depth + 1);
}
- case ISD::EXTRACT_SUBVECTOR:
- return ComputeNumSignBits(Op.getOperand(0), Depth + 1);
case ISD::CONCAT_VECTORS:
- // Determine the minimum number of sign bits across all input vectors.
- // Early out if the result is already 1.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i)
- Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1));
+ // Determine the minimum number of sign bits across all demanded
+ // elts of the input vectors. Early out if the result is already 1.
+ Tmp = std::numeric_limits<unsigned>::max();
+ EVT SubVectorVT = Op.getOperand(0).getValueType();
+ unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements();
+ unsigned NumSubVectors = Op.getNumOperands();
+ for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) {
+ APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts);
+ DemandedSub = DemandedSub.trunc(NumSubVectorElts);
+ if (!DemandedSub)
+ continue;
+ Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
+ assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
@@ -3008,20 +3246,22 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
Op.getOpcode() == ISD::INTRINSIC_VOID) {
- unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth);
- if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
+ unsigned NumBits =
+ TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
+ if (NumBits > 1)
+ FirstAnswer = std::max(FirstAnswer, NumBits);
}
// Finally, if we can prove that the top bits of the result are 0's or 1's,
// use this information.
- APInt KnownZero, KnownOne;
- computeKnownBits(Op, KnownZero, KnownOne, Depth);
+ KnownBits Known;
+ computeKnownBits(Op, Known, DemandedElts, Depth);
APInt Mask;
- if (KnownZero.isNegative()) { // sign bit is 0
- Mask = KnownZero;
- } else if (KnownOne.isNegative()) { // sign bit is 1;
- Mask = KnownOne;
+ if (Known.isNonNegative()) { // sign bit is 0
+ Mask = Known.Zero;
+ } else if (Known.isNegative()) { // sign bit is 1;
+ Mask = Known.One;
} else {
// Nothing known.
return FirstAnswer;
@@ -3054,6 +3294,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
if (getTarget().Options.NoNaNsFPMath)
return true;
+ if (Op->getFlags().hasNoNaNs())
+ return true;
+
// If the value is a constant, we can obviously see if it is a NaN or not.
if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
return !C->getValueAPF().isNaN();
@@ -3096,16 +3339,15 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
assert(A.getValueType() == B.getValueType() &&
"Values must have the same type");
- APInt AZero, AOne;
- APInt BZero, BOne;
- computeKnownBits(A, AZero, AOne);
- computeKnownBits(B, BZero, BOne);
- return (AZero | BZero).isAllOnesValue();
+ KnownBits AKnown, BKnown;
+ computeKnownBits(A, AKnown);
+ computeKnownBits(B, BKnown);
+ return (AKnown.Zero | BKnown.Zero).isAllOnesValue();
}
static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
- llvm::SelectionDAG &DAG) {
+ SelectionDAG &DAG) {
assert(!Ops.empty() && "Can't concatenate an empty list of vectors!");
assert(llvm::all_of(Ops,
[Ops](SDValue Op) {
@@ -3169,7 +3411,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
- SDValue Operand) {
+ SDValue Operand, const SDNodeFlags Flags) {
// Constant fold unary operations with an integer constant operand. Even
// opaque constants will be folded, because the folding of unary operations
// doesn't create new constants with different values. Nevertheless, the
@@ -3206,6 +3448,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
break;
+ case ISD::ABS:
+ return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::BITREVERSE:
+ return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
case ISD::BSWAP:
return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
@@ -3220,6 +3468,17 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::CTTZ_ZERO_UNDEF:
return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
C->isOpaque());
+ case ISD::FP16_TO_FP: {
+ bool Ignored;
+ APFloat FPV(APFloat::IEEEhalf(),
+ (Val.getBitWidth() == 16) ? Val : Val.trunc(16));
+
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)FPV.convert(EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &Ignored);
+ return getConstantFP(FPV, DL, VT);
+ }
}
}
@@ -3261,17 +3520,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
- integerPart x[2];
bool ignored;
- static_assert(integerPartWidth >= 64, "APFloat parts too small!");
+ APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
// FIXME need to be more flexible about rounding mode.
- APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
- Opcode==ISD::FP_TO_SINT,
- APFloat::rmTowardZero, &ignored);
- if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual
+ APFloat::opStatus s =
+ V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
+ if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
break;
- APInt api(VT.getSizeInBits(), x);
- return getConstant(api, DL, VT);
+ return getConstant(IntVal, DL, VT);
}
case ISD::BITCAST:
if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
@@ -3281,6 +3537,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
break;
+ case ISD::FP_TO_FP16: {
+ bool Ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(APFloat::IEEEhalf(),
+ APFloat::rmNearestTiesToEven, &Ignored);
+ return getConstant(V.bitcastToAPInt(), DL, VT);
+ }
}
}
@@ -3303,6 +3567,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
case ISD::TRUNCATE:
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP:
+ case ISD::ABS:
+ case ISD::BITREVERSE:
case ISD::BSWAP:
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
@@ -3348,7 +3614,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid sext node, dst < src!");
if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// sext(undef) = 0, because the top bits will all be the same.
return getConstant(0, DL, VT);
@@ -3364,8 +3630,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid zext node, dst < src!");
if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
- return getNode(ISD::ZERO_EXTEND, DL, VT,
- Operand.getNode()->getOperand(0));
+ return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
// zext(undef) = 0, because the top bits will be zero.
return getConstant(0, DL, VT);
@@ -3384,13 +3649,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND)
// (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
else if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
// (ext (trunc x)) -> x
if (OpOpcode == ISD::TRUNCATE) {
- SDValue OpOp = Operand.getNode()->getOperand(0);
+ SDValue OpOp = Operand.getOperand(0);
if (OpOp.getValueType() == VT)
return OpOp;
}
@@ -3406,20 +3671,26 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Operand.getValueType().bitsGT(VT) &&
"Invalid truncate node, src < dst!");
if (OpOpcode == ISD::TRUNCATE)
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
OpOpcode == ISD::ANY_EXTEND) {
// If the source is smaller than the dest, we still need an extend.
- if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+ if (Operand.getOperand(0).getValueType().getScalarType()
.bitsLT(VT.getScalarType()))
- return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
- if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
- return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
- return Operand.getNode()->getOperand(0);
+ return getNode(OpOpcode, DL, VT, Operand.getOperand(0));
+ if (Operand.getOperand(0).getValueType().bitsGT(VT))
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0));
+ return Operand.getOperand(0);
}
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
break;
+ case ISD::ABS:
+ assert(VT.isInteger() && VT == Operand.getValueType() &&
+ "Invalid ABS!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
case ISD::BSWAP:
assert(VT.isInteger() && VT == Operand.getValueType() &&
"Invalid BSWAP!");
@@ -3464,15 +3735,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
// FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
- return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
- Operand.getNode()->getOperand(0),
- &cast<BinaryWithFlagsSDNode>(Operand.getNode())->Flags);
+ return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
+ Operand.getOperand(0), Operand.getNode()->getFlags());
if (OpOpcode == ISD::FNEG) // --X -> X
- return Operand.getNode()->getOperand(0);
+ return Operand.getOperand(0);
break;
case ISD::FABS:
if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
- return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+ return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
break;
}
@@ -3483,10 +3753,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ E->intersectFlagsWith(Flags);
return SDValue(E, 0);
+ }
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ N->setFlags(Flags);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
@@ -3569,6 +3842,31 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
GA->getOffset() + uint64_t(Offset));
}
+bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
+ switch (Opcode) {
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM: {
+ // If a divisor is zero/undef or any element of a divisor vector is
+ // zero/undef, the whole op is undef.
+ assert(Ops.size() == 2 && "Div/rem should have 2 operands");
+ SDValue Divisor = Ops[1];
+ if (Divisor.isUndef() || isNullConstant(Divisor))
+ return true;
+
+ return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
+ llvm::any_of(Divisor->op_values(),
+ [](SDValue V) { return V.isUndef() ||
+ isNullConstant(V); });
+ // TODO: Handle signed overflow.
+ }
+ // TODO: Handle oversized shifts.
+ default:
+ return false;
+ }
+}
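
What the new isUndef helper feeds into: FoldConstantArithmetic and FoldConstantVectorArithmetic below now return UNDEF early for any div/rem whose divisor is zero or undef (element-wise for build vectors). A standalone sketch of the scalar rule:

#include <cassert>

// Sketch of the scalar check: a zero or undef divisor poisons the whole op.
static bool divRemIsUndef(long Divisor, bool DivisorIsUndef) {
  return DivisorIsUndef || Divisor == 0;
}

int main() {
  assert(divRemIsUndef(0, false));  // x / 0     -> undef
  assert(divRemIsUndef(7, true));   // x / undef -> undef
  assert(!divRemIsUndef(7, false)); // ordinary constant folding proceeds
  return 0;
}
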
+
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
EVT VT, SDNode *Cst1,
SDNode *Cst2) {
@@ -3578,6 +3876,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
if (Opcode >= ISD::BUILTIN_OP_END)
return SDValue();
+ if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)}))
+ return getUNDEF(VT);
+
// Handle the case of two scalars.
if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
@@ -3591,7 +3892,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
// fold (add Sym, c) -> Sym+c
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst1))
return FoldSymbolOffset(Opcode, VT, GA, Cst2);
- if (isCommutativeBinOp(Opcode))
+ if (TLI->isCommutativeBinOp(Opcode))
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst2))
return FoldSymbolOffset(Opcode, VT, GA, Cst1);
@@ -3638,13 +3939,16 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
const SDLoc &DL, EVT VT,
ArrayRef<SDValue> Ops,
- const SDNodeFlags *Flags) {
+ const SDNodeFlags Flags) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
if (Opcode >= ISD::BUILTIN_OP_END)
return SDValue();
+ if (isUndef(Opcode, Ops))
+ return getUNDEF(VT);
+
// We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
if (!VT.isVector())
return SDValue();
@@ -3665,8 +3969,8 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
// All operands must be vector types with the same number of elements as
// the result type and must be either UNDEF or a build vector of constant
// or UNDEF scalars.
- if (!all_of(Ops, IsConstantBuildVectorOrUndef) ||
- !all_of(Ops, IsScalarOrSameVectorSize))
+ if (!llvm::all_of(Ops, IsConstantBuildVectorOrUndef) ||
+ !llvm::all_of(Ops, IsScalarOrSameVectorSize))
return SDValue();
// If we are comparing vectors, then the result needs to be an i1 boolean
@@ -3676,7 +3980,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
// Find legal integer scalar type for constant promotion and
// ensure that its scalar size is at least as large as source.
EVT LegalSVT = VT.getScalarType();
- if (LegalSVT.isInteger()) {
+ if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
if (LegalSVT.bitsLT(VT.getScalarType()))
return SDValue();
@@ -3727,15 +4031,14 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
- SDValue N1, SDValue N2,
- const SDNodeFlags *Flags) {
+ SDValue N1, SDValue N2, const SDNodeFlags Flags) {
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
// Canonicalize constant to RHS if commutative.
- if (isCommutativeBinOp(Opcode)) {
+ if (TLI->isCommutativeBinOp(Opcode)) {
if (N1C && !N2C) {
std::swap(N1C, N2C);
std::swap(N1, N2);
@@ -3910,35 +4213,31 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(EVT.bitsLE(VT) && "Not extending!");
if (EVT == VT) return N1; // Not actually extending
- auto SignExtendInReg = [&](APInt Val) {
+ auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
unsigned FromBits = EVT.getScalarSizeInBits();
Val <<= Val.getBitWidth() - FromBits;
- Val = Val.ashr(Val.getBitWidth() - FromBits);
- return getConstant(Val, DL, VT.getScalarType());
+ Val.ashrInPlace(Val.getBitWidth() - FromBits);
+ return getConstant(Val, DL, ConstantVT);
};
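// Worked example (editorial): sign_extend_inreg of an i16 constant from i8.
// For Val = APInt(16, 0x00AB) and FromBits = 8:
//   Val <<= 16 - 8;           // 0xAB00, moves the i8 sign bit up to bit 15
//   Val.ashrInPlace(16 - 8);  // 0xFFAB, the arithmetic shift replicates it
// so the folded constant is the i8 value 0xAB sign-extended to i16.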
if (N1C) {
const APInt &Val = N1C->getAPIntValue();
- return SignExtendInReg(Val);
+ return SignExtendInReg(Val, VT);
}
if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
SmallVector<SDValue, 8> Ops;
+ llvm::EVT OpVT = N1.getOperand(0).getValueType();
for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
SDValue Op = N1.getOperand(i);
if (Op.isUndef()) {
- Ops.push_back(getUNDEF(VT.getScalarType()));
+ Ops.push_back(getUNDEF(OpVT));
continue;
}
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- APInt Val = C->getAPIntValue();
- Val = Val.zextOrTrunc(VT.getScalarSizeInBits());
- Ops.push_back(SignExtendInReg(Val));
- continue;
- }
- break;
+ ConstantSDNode *C = cast<ConstantSDNode>(Op);
+ APInt Val = C->getAPIntValue();
+ Ops.push_back(SignExtendInReg(Val, OpVT));
}
- if (Ops.size() == VT.getVectorNumElements())
- return getBuildVector(VT, DL, Ops);
+ return getBuildVector(VT, DL, Ops);
}
break;
}
@@ -4040,6 +4339,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (VT.getSimpleVT() == N1.getSimpleValueType())
return N1;
+ // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
+ if (N1.isUndef())
+ return getUNDEF(VT);
+
+  // EXTRACT_SUBVECTOR of CONCAT_VECTORS can be simplified if the pieces of
+ // the concat have the same type as the extract.
+ if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N1.getNumOperands() > 0 &&
+ VT == N1.getOperand(0).getValueType()) {
+ unsigned Factor = VT.getVectorNumElements();
+ return N1.getOperand(N2C->getZExtValue() / Factor);
+ }
+
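// Editorial sketch of the new fold, assuming v4i32 values A and B and IdxVT
// standing in for the target's vector-index type:
//
//   SDValue Cat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, A, B);
//   SDValue Idx = DAG.getConstant(4, DL, IdxVT);
//   SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, Cat, Idx);
//   // Factor = 4 (elements per concat piece), so Ext folds directly to
//   // operand 4 / 4 = 1, i.e. B, and no extract node is created.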
// EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
// during shuffle legalization.
if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
@@ -4110,7 +4422,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// Canonicalize an UNDEF to the RHS, even over a constant.
if (N1.isUndef()) {
- if (isCommutativeBinOp(Opcode)) {
+ if (TLI->isCommutativeBinOp(Opcode)) {
std::swap(N1, N2);
} else {
switch (Opcode) {
@@ -4186,21 +4498,23 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// Memoize this node if possible.
SDNode *N;
SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {N1, N2};
if (VT != MVT::Glue) {
- SDValue Ops[] = {N1, N2};
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTs, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
- if (Flags)
- E->intersectFlagsWith(Flags);
+ E->intersectFlagsWith(Flags);
return SDValue(E, 0);
}
- N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ N->setFlags(Flags);
+ createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
} else {
- N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
}
InsertNode(N);
@@ -4392,9 +4706,10 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
/// used when a memcpy is turned into a memset when the source is a constant
/// string ptr.
static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
- const TargetLowering &TLI, StringRef Str) {
+ const TargetLowering &TLI,
+ const ConstantDataArraySlice &Slice) {
// Handle vector with all elements zero.
- if (Str.empty()) {
+ if (Slice.Array == nullptr) {
if (VT.isInteger())
return DAG.getConstant(0, dl, VT);
else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
@@ -4413,15 +4728,15 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
assert(!VT.isVector() && "Can't handle vector type here!");
unsigned NumVTBits = VT.getSizeInBits();
unsigned NumVTBytes = NumVTBits / 8;
- unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
+ unsigned NumBytes = std::min(NumVTBytes, unsigned(Slice.Length));
APInt Val(NumVTBits, 0);
if (DAG.getDataLayout().isLittleEndian()) {
for (unsigned i = 0; i != NumBytes; ++i)
- Val |= (uint64_t)(unsigned char)Str[i] << i*8;
+ Val |= (uint64_t)(unsigned char)Slice[i] << i*8;
} else {
for (unsigned i = 0; i != NumBytes; ++i)
- Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
+ Val |= (uint64_t)(unsigned char)Slice[i] << (NumVTBytes-i-1)*8;
}
// If the "cost" of materializing the integer immediate is less than the cost
@@ -4438,9 +4753,8 @@ SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT));
}
-/// isMemSrcFromString - Returns true if memcpy source is a string constant.
-///
-static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
+/// Returns true if memcpy source is constant data.
+static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
uint64_t SrcDelta = 0;
GlobalAddressSDNode *G = nullptr;
if (Src.getOpcode() == ISD::GlobalAddress)
@@ -4454,8 +4768,8 @@ static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
if (!G)
return false;
- return getConstantStringInfo(G->getGlobal(), Str,
- SrcDelta + G->getOffset(), false);
+ return getConstantDataArrayInfo(G->getGlobal(), Slice, 8,
+ SrcDelta + G->getOffset());
}
/// Determines the optimal series of memory ops to replace the memset / memcpy.
@@ -4486,23 +4800,23 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
DAG.getMachineFunction());
if (VT == MVT::Other) {
- if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) ||
- TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) {
- VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS);
- } else {
- switch (DstAlign & 7) {
- case 0: VT = MVT::i64; break;
- case 4: VT = MVT::i32; break;
- case 2: VT = MVT::i16; break;
- default: VT = MVT::i8; break;
- }
- }
-
+ // Use the largest integer type whose alignment constraints are satisfied.
+    // We only need to check DstAlign here, as SrcAlign is always greater
+    // than or equal to DstAlign (or zero).
+ VT = MVT::i64;
+ while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
+ !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
+ VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+ assert(VT.isInteger());
+
+ // Find the largest legal integer type.
MVT LVT = MVT::i64;
while (!TLI.isTypeLegal(LVT))
LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
assert(LVT.isInteger());
+ // If the type we've chosen is larger than the largest legal integer type
+ // then use that instead.
if (VT.bitsGT(LVT))
VT = LVT;
}
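// Worked example (editorial): for DstAlign = 4 on a target without fast
// misaligned accesses, the loop starts at MVT::i64 (needs 8-byte alignment,
// 4 < 8), steps down the SimpleValueType enum to MVT::i32 (4 >= 4) and
// stops, so the copy is lowered as 4-byte chunks. The enum walk relies on
// the integer types i8 < i16 < i32 < i64 being declared consecutively.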
@@ -4587,6 +4901,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
// TODO: In the AlwaysInline case, if the size is big then generate a loop
// rather than maybe a humongous number of loads and stores.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &C = *DAG.getContext();
std::vector<EVT> MemOps;
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
@@ -4598,30 +4914,30 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
unsigned SrcAlign = DAG.InferPtrAlignment(Src);
if (Align > SrcAlign)
SrcAlign = Align;
- StringRef Str;
- bool CopyFromStr = isMemSrcFromString(Src, Str);
- bool isZeroStr = CopyFromStr && Str.empty();
+ ConstantDataArraySlice Slice;
+ bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
+ bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
(DstAlignCanChange ? 0 : Align),
- (isZeroStr ? 0 : SrcAlign),
- false, false, CopyFromStr, true,
+ (isZeroConstant ? 0 : SrcAlign),
+ false, false, CopyFromConstant, true,
DstPtrInfo.getAddrSpace(),
SrcPtrInfo.getAddrSpace(),
DAG, TLI))
return SDValue();
if (DstAlignCanChange) {
- Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
+ Type *Ty = MemOps[0].getTypeForEVT(C);
+ unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
// Don't promote to an alignment that would require dynamic stack
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
while (NewAlign > Align &&
- DAG.getDataLayout().exceedsNaturalStackAlignment(NewAlign))
+ DL.exceedsNaturalStackAlignment(NewAlign))
NewAlign /= 2;
if (NewAlign > Align) {
@@ -4650,18 +4966,29 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
DstOff -= VTSize - Size;
}
- if (CopyFromStr &&
- (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
+ if (CopyFromConstant &&
+ (isZeroConstant || (VT.isInteger() && !VT.isVector()))) {
// It's unlikely a store of a vector immediate can be done in a single
// instruction. It would require a load from a constantpool first.
// We only handle zero vectors here.
// FIXME: Handle other cases where store of vector immediate is done in
// a single instruction.
- Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
+ ConstantDataArraySlice SubSlice;
+ if (SrcOff < Slice.Length) {
+ SubSlice = Slice;
+ SubSlice.move(SrcOff);
+ } else {
+ // This is an out-of-bounds access and hence UB. Pretend we read zero.
+ SubSlice.Array = nullptr;
+ SubSlice.Offset = 0;
+ SubSlice.Length = VTSize;
+ }
+ Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode())
Store = DAG.getStore(Chain, dl, Value,
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
+ DstPtrInfo.getWithOffset(DstOff), Align,
+ MMOFlags);
}
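// Editorial note: ConstantDataArraySlice::move(SrcOff) advances Offset by
// SrcOff and shrinks Length by the same amount, so SubSlice exposes the
// remaining constant bytes starting at the current copy offset, e.g.
//   Slice = {Array, Offset: 0, Length: 16};
//   Slice.move(4);  // -> {Array, Offset: 4, Length: 12}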
if (!Store.getNode()) {
@@ -4670,12 +4997,19 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
// thing to do is generate a LoadExt/StoreTrunc pair. These simplify
// to Load/Store if NVT==VT.
// FIXME does the case above also need this?
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ EVT NVT = TLI.getTypeToTransformTo(C, VT);
assert(NVT.bitsGE(VT));
+
+ bool isDereferenceable =
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
+ if (isDereferenceable)
+ SrcMMOFlags |= MachineMemOperand::MODereferenceable;
+
Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), VT,
- MinAlign(SrcAlign, SrcOff), MMOFlags);
+ MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
OutChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
@@ -4703,6 +5037,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
// Expand memmove to a series of load and store ops if the size operand falls
// below a certain threshold.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ LLVMContext &C = *DAG.getContext();
std::vector<EVT> MemOps;
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
@@ -4725,8 +5061,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
return SDValue();
if (DstAlignCanChange) {
- Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
+ Type *Ty = MemOps[0].getTypeForEVT(C);
+ unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
@@ -4747,9 +5083,15 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Value;
+ bool isDereferenceable =
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
+ if (isDereferenceable)
+ SrcMMOFlags |= MachineMemOperand::MODereferenceable;
+
Value =
DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
- SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, MMOFlags);
+ SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags);
LoadValues.push_back(Value);
LoadChains.push_back(Value.getValue(1));
SrcOff += VTSize;
@@ -4943,11 +5285,11 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
- Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
- TLI->getPointerTy(getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -5004,11 +5346,11 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
- Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
- TLI->getPointerTy(getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -5066,11 +5408,11 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
TargetLowering::CallLoweringInfo CLI(*this);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
- Dst.getValueType().getTypeForEVT(*getContext()),
- getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
- TLI->getPointerTy(getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
.setDiscardResult()
.setTailCall(isTailCall);
@@ -5104,7 +5446,7 @@ SDValue SelectionDAG::getAtomicCmpSwap(
unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain,
SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
unsigned Alignment, AtomicOrdering SuccessOrdering,
- AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) {
+ AtomicOrdering FailureOrdering, SyncScope::ID SSID) {
assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
@@ -5120,7 +5462,7 @@ SDValue SelectionDAG::getAtomicCmpSwap(
MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment,
- AAMDNodes(), nullptr, SynchScope, SuccessOrdering,
+ AAMDNodes(), nullptr, SSID, SuccessOrdering,
FailureOrdering);
return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO);
@@ -5142,7 +5484,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
SDValue Chain, SDValue Ptr, SDValue Val,
const Value *PtrVal, unsigned Alignment,
AtomicOrdering Ordering,
- SynchronizationScope SynchScope) {
+ SyncScope::ID SSID) {
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
@@ -5162,7 +5504,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
MemVT.getStoreSize(), Alignment, AAMDNodes(),
- nullptr, SynchScope, Ordering);
+ nullptr, SSID, Ordering);
return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO);
}
@@ -5246,7 +5588,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
Opcode == ISD::PREFETCH ||
Opcode == ISD::LIFETIME_START ||
Opcode == ISD::LIFETIME_END ||
- (Opcode <= INT_MAX &&
+          (Opcode <= (unsigned)std::numeric_limits<int>::max() &&
(int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
"Opcode is not a memory-accessing opcode!");
@@ -5580,7 +5922,6 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue Mask, SDValue Src0,
EVT MemVT, MachineMemOperand *MMO,
ISD::LoadExtType ExtTy, bool isExpanding) {
-
SDVTList VTs = getVTList(VT, MVT::Other);
SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
FoldingSetNodeID ID;
@@ -5722,11 +6063,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
- ArrayRef<SDValue> Ops, const SDNodeFlags *Flags) {
+ ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
unsigned NumOps = Ops.size();
switch (NumOps) {
case 0: return getNode(Opcode, DL, VT);
- case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
@@ -5734,13 +6075,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
switch (Opcode) {
default: break;
- case ISD::CONCAT_VECTORS: {
+ case ISD::CONCAT_VECTORS:
// Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
return V;
break;
- }
- case ISD::SELECT_CC: {
+ case ISD::SELECT_CC:
assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
assert(Ops[0].getValueType() == Ops[1].getValueType() &&
"LHS and RHS of condition must have same type!");
@@ -5749,14 +6089,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(Ops[2].getValueType() == VT &&
"select_cc node must be of same type as true and false value!");
break;
- }
- case ISD::BR_CC: {
+ case ISD::BR_CC:
assert(NumOps == 5 && "BR_CC takes 5 operands!");
assert(Ops[2].getValueType() == Ops[3].getValueType() &&
"LHS/RHS of comparison should match types!");
break;
}
- }
// Memoize nodes.
SDNode *N;
@@ -6238,6 +6576,62 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
return N;
}
+SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
+ unsigned OrigOpc = Node->getOpcode();
+ unsigned NewOpc;
+ bool IsUnary = false;
+ switch (OrigOpc) {
+ default:
+ llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
+ case ISD::STRICT_FADD: NewOpc = ISD::FADD; break;
+ case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break;
+ case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break;
+ case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break;
+ case ISD::STRICT_FREM: NewOpc = ISD::FREM; break;
+ case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break;
+ case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break;
+ case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break;
+ case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break;
+ case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break;
+ case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break;
+ case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break;
+ case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break;
+ case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break;
+ case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break;
+ case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break;
+ case ISD::STRICT_FNEARBYINT:
+ NewOpc = ISD::FNEARBYINT;
+ IsUnary = true;
+ break;
+ }
+
+ // We're taking this node out of the chain, so we need to re-link things.
+ SDValue InputChain = Node->getOperand(0);
+ SDValue OutputChain = SDValue(Node, 1);
+ ReplaceAllUsesOfValueWith(OutputChain, InputChain);
+
+ SDVTList VTs = getVTList(Node->getOperand(1).getValueType());
+ SDNode *Res = nullptr;
+ if (IsUnary)
+ Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) });
+ else
+ Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1),
+ Node->getOperand(2) });
+
+ // MorphNodeTo can operate in two ways: if an existing node with the
+ // specified operands exists, it can just return it. Otherwise, it
+ // updates the node in place to have the requested operands.
+ if (Res == Node) {
+ // If we updated the node in place, reset the node ID. To the isel,
+ // this should be just like a newly allocated machine node.
+ Res->setNodeId(-1);
+ } else {
+ ReplaceAllUsesWith(Node, Res);
+ RemoveDeadNode(Node);
+ }
+
+ return Res;
+}
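// Editorial sketch of the mutation performed above: a strict node such as
//   t1: f64,ch = STRICT_FADD ch, a, b
// is morphed into the ordinary
//   t1: f64 = FADD a, b
// after every user of the strict node's output chain (result value 1) has
// been rerouted to the incoming chain, taking the operation out of the chain.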
/// getMachineNode - These are used for target selectors to create a new node
/// with specified return type(s), MachineInstr opcode, and operands.
@@ -6384,14 +6778,13 @@ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
/// else return NULL.
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
- const SDNodeFlags *Flags) {
+ const SDNodeFlags Flags) {
if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opcode, VTList, Ops);
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
- if (Flags)
- E->intersectFlagsWith(Flags);
+ E->intersectFlagsWith(Flags);
return E;
}
}
@@ -6452,7 +6845,7 @@ public:
: SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
};
-}
+} // end anonymous namespace
/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
/// This can cause recursive merging of nodes in the DAG.
@@ -6498,7 +6891,6 @@ void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
AddModifiedNodeToCSEMaps(User);
}
-
// If we just RAUW'd the root, take note.
if (FromN == getRoot())
setRoot(To);
@@ -6668,6 +7060,7 @@ void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
}
namespace {
+
/// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
/// to record information about a use.
struct UseMemo {
@@ -6680,7 +7073,8 @@ namespace {
bool operator<(const UseMemo &L, const UseMemo &R) {
return (intptr_t)L.User < (intptr_t)R.User;
}
-}
+
+} // end anonymous namespace
/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
/// uses of other values produced by From.getNode() alone. The same value
@@ -6746,7 +7140,6 @@ void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
/// based on their topological order. It returns the maximum id and a vector
/// of the SDNodes* in assigned order by reference.
unsigned SelectionDAG::AssignTopologicalOrder() {
-
unsigned DAGSize = 0;
// SortedPos tracks the progress of the algorithm. Nodes before it are
@@ -6872,6 +7265,25 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
AddDbgValue(I, ToNode, false);
}
+SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
+ SDValue NewMemOp) {
+ assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
+ // The new memory operation must have the same position as the old load in
+ // terms of memory dependency. Create a TokenFactor for the old load and new
+ // memory operation and update uses of the old load's output chain to use that
+ // TokenFactor.
+ SDValue OldChain = SDValue(OldLoad, 1);
+ SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
+ if (!OldLoad->hasAnyUseOfValue(1))
+ return NewChain;
+
+ SDValue TokenFactor =
+ getNode(ISD::TokenFactor, SDLoc(OldLoad), MVT::Other, OldChain, NewChain);
+ ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
+ UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain);
+ return TokenFactor;
+}
+
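// Editorial note on the final UpdateNodeOperands call above: the RAUW step
// rewrites every use of OldChain, including the TokenFactor's own operand,
// which would leave the TokenFactor feeding itself. Resetting its operands
// to {OldChain, NewChain} afterwards restores the intended shape:
//
//   before: chain users --> (OldLoad, 1)
//   after:  chain users --> TokenFactor((OldLoad, 1), (NewMemOp, 1))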
//===----------------------------------------------------------------------===//
// SDNode Class
//===----------------------------------------------------------------------===//
@@ -6973,6 +7385,7 @@ void SDNode::Profile(FoldingSetNodeID &ID) const {
}
namespace {
+
struct EVTArray {
std::vector<EVT> VTs;
@@ -6982,11 +7395,12 @@ namespace {
VTs.push_back(MVT((MVT::SimpleValueType)i));
}
};
-}
-static ManagedStatic<std::set<EVT, EVT::compareRawBits> > EVTs;
+} // end anonymous namespace
+
+static ManagedStatic<std::set<EVT, EVT::compareRawBits>> EVTs;
static ManagedStatic<EVTArray> SimpleVTArray;
-static ManagedStatic<sys::SmartMutex<true> > VTMutex;
+static ManagedStatic<sys::SmartMutex<true>> VTMutex;
/// getValueTypeList - Return a pointer to the specified value type.
///
@@ -7020,7 +7434,6 @@ bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
return NUses == 0;
}
-
/// hasAnyUseOfValue - Return true if there are any use of the indicated
/// value. This method ignores uses of other values defined by this operation.
bool SDNode::hasAnyUseOfValue(unsigned Value) const {
@@ -7033,9 +7446,7 @@ bool SDNode::hasAnyUseOfValue(unsigned Value) const {
return false;
}
-
/// isOnlyUserOf - Return true if this node is the only use of N.
-///
bool SDNode::isOnlyUserOf(const SDNode *N) const {
bool Seen = false;
for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
@@ -7049,8 +7460,22 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const {
return Seen;
}
+/// Return true if the only users of N are contained in Nodes.
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+ bool Seen = false;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDNode *User = *I;
+ if (llvm::any_of(Nodes,
+ [&User](const SDNode *Node) { return User == Node; }))
+ Seen = true;
+ else
+ return false;
+ }
+
+ return Seen;
+}
+
/// isOperand - Return true if this node is an operand of N.
-///
bool SDValue::isOperandOf(const SDNode *N) const {
for (const SDValue &Op : N->op_values())
if (*this == Op)
@@ -7070,21 +7495,39 @@ bool SDNode::isOperandOf(const SDNode *N) const {
/// side-effecting instructions on any chain path. In practice, this looks
/// through token factors and non-volatile loads. In order to remain efficient,
/// this only looks a couple of nodes in, it does not do an exhaustive search.
+///
+/// Note that we only need to examine chains when we're searching for
+/// side-effects; SelectionDAG requires that all side-effects are represented
+/// by chains, even if another operand would force a specific ordering. This
+/// constraint is necessary to allow transformations like splitting loads.
bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
- unsigned Depth) const {
+ unsigned Depth) const {
if (*this == Dest) return true;
// Don't search too deeply, we just want to be able to see through
// TokenFactor's etc.
if (Depth == 0) return false;
- // If this is a token factor, all inputs to the TF happen in parallel. If any
- // of the operands of the TF does not reach dest, then we cannot do the xform.
+ // If this is a token factor, all inputs to the TF happen in parallel.
if (getOpcode() == ISD::TokenFactor) {
- for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
- if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1))
- return false;
- return true;
+ // First, try a shallow search.
+ if (is_contained((*this)->ops(), Dest)) {
+ // We found the chain we want as an operand of this TokenFactor.
+ // Essentially, we reach the chain without side-effects if we could
+ // serialize the TokenFactor into a simple chain of operations with
+ // Dest as the last operation. This is automatically true if the
+ // chain has one use: there are no other ordering constraints.
+ // If the chain has more than one use, we give up: some other
+ // use of Dest might force a side-effect between Dest and the current
+ // node.
+ if (Dest.hasOneUse())
+ return true;
+ }
+ // Next, try a deep search: check whether every operand of the TokenFactor
+ // reaches Dest.
+ return llvm::all_of((*this)->ops(), [=](SDValue Op) {
+ return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
+ });
}
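// Editorial example: for TF = TokenFactor(Dest, X) where Dest has only this
// use, the shallow search succeeds immediately - TF can be serialized as
// "X; Dest" with Dest last, so nothing can intervene between Dest and the
// current node. If Dest has other uses, the deep search must instead prove
// that every operand (including X) reaches Dest within the remaining depth.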
// Loads don't have side effects, look through them.
@@ -7102,20 +7545,8 @@ bool SDNode::hasPredecessor(const SDNode *N) const {
return hasPredecessorHelper(N, Visited, Worklist);
}
-uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
- assert(Num < NumOperands && "Invalid child # of SDNode!");
- return cast<ConstantSDNode>(OperandList[Num])->getZExtValue();
-}
-
-const SDNodeFlags *SDNode::getFlags() const {
- if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
- return &FlagsNode->Flags;
- return nullptr;
-}
-
-void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) {
- if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
- FlagsNode->Flags.intersectWith(Flags);
+void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
+ this->Flags.intersectWith(Flags);
}
SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
@@ -7204,49 +7635,16 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
SDValue Loc = LD->getOperand(1);
SDValue BaseLoc = Base->getOperand(1);
- if (Loc.getOpcode() == ISD::FrameIndex) {
- if (BaseLoc.getOpcode() != ISD::FrameIndex)
- return false;
- const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
- int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
- int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
- int FS = MFI.getObjectSize(FI);
- int BFS = MFI.getObjectSize(BFI);
- if (FS != BFS || FS != (int)Bytes) return false;
- return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
- }
-
- // Handle X + C.
- if (isBaseWithConstantOffset(Loc)) {
- int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
- if (Loc.getOperand(0) == BaseLoc) {
- // If the base location is a simple address with no offset itself, then
- // the second load's first add operand should be the base address.
- if (LocOffset == Dist * (int)Bytes)
- return true;
- } else if (isBaseWithConstantOffset(BaseLoc)) {
- // The base location itself has an offset, so subtract that value from the
- // second load's offset before comparing to distance * size.
- int64_t BOffset =
- cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
- if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
- if ((LocOffset - BOffset) == Dist * (int)Bytes)
- return true;
- }
- }
- }
- const GlobalValue *GV1 = nullptr;
- const GlobalValue *GV2 = nullptr;
- int64_t Offset1 = 0;
- int64_t Offset2 = 0;
- bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1);
- bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
- if (isGA1 && isGA2 && GV1 == GV2)
- return Offset1 == (Offset2 + Dist*Bytes);
+
+ auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
+ auto LocDecomp = BaseIndexOffset::match(Loc, *this);
+
+ int64_t Offset = 0;
+ if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
+ return (Dist * Bytes == Offset);
return false;
}
-
/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
/// it cannot be inferred.
unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
@@ -7255,10 +7653,9 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
int64_t GVOffset = 0;
if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
- APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
- llvm::computeKnownBits(const_cast<GlobalValue *>(GV), KnownZero, KnownOne,
- getDataLayout());
- unsigned AlignBits = KnownZero.countTrailingOnes();
+ KnownBits Known(PtrWidth);
+ llvm::computeKnownBits(GV, Known, getDataLayout());
+ unsigned AlignBits = Known.countMinTrailingZeros();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
if (Align)
return MinAlign(Align, GVOffset);
@@ -7292,14 +7689,11 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
// Currently all types are split in half.
EVT LoVT, HiVT;
- if (!VT.isVector()) {
+ if (!VT.isVector())
LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
- } else {
- unsigned NumElements = VT.getVectorNumElements();
- assert(!(NumElements & 1) && "Splitting vector, but not in half!");
- LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(),
- NumElements/2);
- }
+ else
+ LoVT = HiVT = VT.getHalfNumVectorElementsVT(*getContext());
+
return std::make_pair(LoVT, HiVT);
}
@@ -7341,59 +7735,58 @@ unsigned GlobalAddressSDNode::getAddressSpace() const {
return getGlobal()->getType()->getAddressSpace();
}
-
Type *ConstantPoolSDNode::getType() const {
if (isMachineConstantPoolEntry())
return Val.MachineCPVal->getType();
return Val.ConstVal->getType();
}
-bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
- APInt &SplatUndef,
+bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef,
unsigned &SplatBitSize,
bool &HasAnyUndefs,
unsigned MinSplatBits,
- bool isBigEndian) const {
+ bool IsBigEndian) const {
EVT VT = getValueType(0);
assert(VT.isVector() && "Expected a vector type");
- unsigned sz = VT.getSizeInBits();
- if (MinSplatBits > sz)
+ unsigned VecWidth = VT.getSizeInBits();
+ if (MinSplatBits > VecWidth)
return false;
- SplatValue = APInt(sz, 0);
- SplatUndef = APInt(sz, 0);
+ // FIXME: The widths are based on this node's type, but build vectors can
+ // truncate their operands.
+ SplatValue = APInt(VecWidth, 0);
+ SplatUndef = APInt(VecWidth, 0);
- // Get the bits. Bits with undefined values (when the corresponding element
+ // Get the bits. Bits with undefined values (when the corresponding element
// of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
- // in SplatValue. If any of the values are not constant, give up and return
+ // in SplatValue. If any of the values are not constant, give up and return
// false.
- unsigned int nOps = getNumOperands();
- assert(nOps > 0 && "isConstantSplat has 0-size build vector");
- unsigned EltBitSize = VT.getScalarSizeInBits();
+ unsigned int NumOps = getNumOperands();
+ assert(NumOps > 0 && "isConstantSplat has 0-size build vector");
+ unsigned EltWidth = VT.getScalarSizeInBits();
- for (unsigned j = 0; j < nOps; ++j) {
- unsigned i = isBigEndian ? nOps-1-j : j;
+ for (unsigned j = 0; j < NumOps; ++j) {
+ unsigned i = IsBigEndian ? NumOps - 1 - j : j;
SDValue OpVal = getOperand(i);
- unsigned BitPos = j * EltBitSize;
+ unsigned BitPos = j * EltWidth;
if (OpVal.isUndef())
- SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
- else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
- SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize).
- zextOrTrunc(sz) << BitPos;
- else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal))
- SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos;
- else
+ SplatUndef.setBits(BitPos, BitPos + EltWidth);
+ else if (auto *CN = dyn_cast<ConstantSDNode>(OpVal))
+ SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
+ else if (auto *CN = dyn_cast<ConstantFPSDNode>(OpVal))
+ SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
+ else
return false;
}
- // The build_vector is all constants or undefs. Find the smallest element
+ // The build_vector is all constants or undefs. Find the smallest element
// size that splats the vector.
-
HasAnyUndefs = (SplatUndef != 0);
- while (sz > 8) {
- unsigned HalfSize = sz / 2;
+  // FIXME: This does not work for vectors with elements smaller than 8 bits.
+ while (VecWidth > 8) {
+ unsigned HalfSize = VecWidth / 2;
APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize);
APInt LowValue = SplatValue.trunc(HalfSize);
APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize);
@@ -7407,10 +7800,10 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
SplatValue = HighValue | LowValue;
SplatUndef = HighUndef & LowUndef;
- sz = HalfSize;
+ VecWidth = HalfSize;
}
- SplatBitSize = sz;
+ SplatBitSize = VecWidth;
return true;
}
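// Worked example (editorial): for a v4i8 build_vector of four 0xAA elements,
// SplatValue starts as the 32-bit pattern 0xAAAAAAAA. Each halving compares
// the high and low halves (0xAAAA == 0xAAAA, then 0xAA == 0xAA), so the loop
// stops at VecWidth = 8 and reports SplatBitSize = 8.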
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
new file mode 100644
index 0000000..0d69441
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -0,0 +1,115 @@
+//===- SelectionDAGAddressAnalysis.cpp - DAG Address Analysis ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+
+bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
+ const SelectionDAG &DAG, int64_t &Off) {
+ // Initial Offset difference.
+ Off = Other.Offset - Offset;
+
+ if ((Other.Index == Index) && (Other.IsIndexSignExt == IsIndexSignExt)) {
+ // Trivial match.
+ if (Other.Base == Base)
+ return true;
+
+ // Match GlobalAddresses
+ if (auto *A = dyn_cast<GlobalAddressSDNode>(Base))
+ if (auto *B = dyn_cast<GlobalAddressSDNode>(Other.Base))
+ if (A->getGlobal() == B->getGlobal()) {
+ Off += B->getOffset() - A->getOffset();
+ return true;
+ }
+
+ const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+
+ // Match non-equal FrameIndexes - If both frame indices are fixed
+ // we know their relative offsets and can compare them. Otherwise
+ // we must be conservative.
+ if (auto *A = dyn_cast<FrameIndexSDNode>(Base))
+ if (auto *B = dyn_cast<FrameIndexSDNode>(Other.Base))
+ if (MFI.isFixedObjectIndex(A->getIndex()) &&
+ MFI.isFixedObjectIndex(B->getIndex())) {
+ Off += MFI.getObjectOffset(B->getIndex()) -
+ MFI.getObjectOffset(A->getIndex());
+ return true;
+ }
+ }
+ return false;
+}
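// Editorial example: two loads from fixed stack slots compare as equal-base
// here even though they use distinct FrameIndex nodes, provided Index and
// IsIndexSignExt match. If A is a fixed object at frame offset 8 and B one at
// offset 16, equalBaseIndex returns true with Off adjusted by 16 - 8 = 8,
// because fixed objects have statically known frame positions.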
+
+/// Parses the address tree rooted at Ptr into base, index and offset
+/// components.
+BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
+ // (((B + I*M) + c)) + c ...
+ SDValue Base = Ptr;
+ SDValue Index = SDValue();
+ int64_t Offset = 0;
+ bool IsIndexSignExt = false;
+
+ // Consume constant adds & ors with appropriate masking.
+ while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
+ // Only consider ORs which act as adds.
+ if (Base->getOpcode() == ISD::OR &&
+ !DAG.MaskedValueIsZero(Base->getOperand(0), C->getAPIntValue()))
+ break;
+ Offset += C->getSExtValue();
+ Base = Base->getOperand(0);
+ continue;
+ }
+ break;
+ }
+
+ if (Base->getOpcode() == ISD::ADD) {
+ // TODO: The following code appears to be needless as it just
+ // bails on some Ptrs early, reducing the cases where we
+ // find equivalence. We should be able to remove this.
+ // Inside a loop the current BASE pointer is calculated using an ADD and a
+ // MUL instruction. In this case Base is the actual BASE pointer.
+ // (i64 add (i64 %array_ptr)
+ // (i64 mul (i64 %induction_var)
+ // (i64 %element_size)))
+ if (Base->getOperand(1)->getOpcode() == ISD::MUL)
+ return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);
+
+ // Look at Base + Index + Offset cases.
+ Index = Base->getOperand(1);
+ SDValue PotentialBase = Base->getOperand(0);
+
+ // Skip signextends.
+ if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+ Index = Index->getOperand(0);
+ IsIndexSignExt = true;
+ }
+
+    // Check for an (Index + Offset) pattern.
+ if (Index->getOpcode() != ISD::ADD ||
+ !isa<ConstantSDNode>(Index->getOperand(1)))
+ return BaseIndexOffset(PotentialBase, Index, Offset, IsIndexSignExt);
+
+ Offset += cast<ConstantSDNode>(Index->getOperand(1))->getSExtValue();
+ Index = Index->getOperand(0);
+ if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+ Index = Index->getOperand(0);
+ IsIndexSignExt = true;
+ } else
+ IsIndexSignExt = false;
+ Base = PotentialBase;
+ }
+ return BaseIndexOffset(Base, Index, Offset, IsIndexSignExt);
+}
+} // end namespace llvm
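// Editorial walk-through of match() on Ptr = (add (add %B, %I), 8): the
// leading loop folds the trailing constant, leaving Offset = 8 and
// Base = (add %B, %I); the ADD case then splits that into Index = %I and
// PotentialBase = %B, and since %I is not itself an (add x, c), the result
// is {Base: %B, Index: %I, Offset: 8, IsIndexSignExt: false}.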
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 996c95b..1273120 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -83,24 +83,6 @@ LimitFPPrecision("limit-float-precision",
"for some float libcalls"),
cl::location(LimitFloatPrecision),
cl::init(0));
-
-static cl::opt<bool>
-EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden,
- cl::desc("Enable fast-math-flags for DAG nodes"));
-
-/// Minimum jump table density for normal functions.
-static cl::opt<unsigned>
-JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
- cl::desc("Minimum density for building a jump table in "
- "a normal function"));
-
-/// Minimum jump table density for -Os or -Oz functions.
-static cl::opt<unsigned>
-OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden,
- cl::desc("Minimum density for building a jump table in "
- "an optsize function"));
-
-
// Limit the width of DAG chains. This is important in general to prevent
// DAG-based analysis from blowing up. For example, alias analysis and
// load clustering may not complete in reasonable time. It is difficult to
@@ -117,9 +99,31 @@ OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden,
// store [4096 x i8] %data, [4096 x i8]* %buffer
static const unsigned MaxParallelChains = 64;
+// Returns true if the Value passed requires ABI mangling, i.e. it is a
+// parameter to a function or a return value from a function that is not an
+// intrinsic.
+static bool isABIRegCopy(const Value *V) {
+ const bool IsRetInst = V && isa<ReturnInst>(V);
+ const bool IsCallInst = V && isa<CallInst>(V);
+ const bool IsInLineAsm =
+ IsCallInst && static_cast<const CallInst *>(V)->isInlineAsm();
+ const bool IsIndirectFunctionCall =
+ IsCallInst && !IsInLineAsm &&
+ !static_cast<const CallInst *>(V)->getCalledFunction();
+ // It is possible that the call instruction is an inline asm statement or an
+ // indirect function call in which case the return value of
+ // getCalledFunction() would be nullptr.
+  const bool IsIntrinsicCall =
+ IsCallInst && !IsInLineAsm && !IsIndirectFunctionCall &&
+ static_cast<const CallInst *>(V)->getCalledFunction()->getIntrinsicID() !=
+ Intrinsic::not_intrinsic;
+
+  return IsRetInst || (IsCallInst && !IsInLineAsm && !IsIntrinsicCall);
+}
+
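// Editorial examples for the predicate above: the value of "ret <4 x i1> %v"
// or of a direct call "%r = call <4 x i1> @f()" is ABI-mangled; the result
// of an intrinsic call (e.g. @llvm.masked.load) or of an inline-asm call is
// not. Indirect calls count as mangled: the callee is unknown, but the
// calling convention still applies to the returned value.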
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
- MVT PartVT, EVT ValueVT, const Value *V);
+ MVT PartVT, EVT ValueVT, const Value *V,
+ bool IsABIRegCopy);
/// getCopyFromParts - Create a value that contains the specified legal parts
/// combined into the value they represent. If the parts combine to a type
@@ -129,10 +133,11 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, const Value *V,
- Optional<ISD::NodeType> AssertOp = None) {
+ Optional<ISD::NodeType> AssertOp = None,
+ bool IsABIRegCopy = false) {
if (ValueVT.isVector())
return getCopyFromPartsVector(DAG, DL, Parts, NumParts,
- PartVT, ValueVT, V);
+ PartVT, ValueVT, V, IsABIRegCopy);
assert(NumParts > 0 && "No parts to assemble!");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -276,7 +281,8 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
/// ValueVT (ISD::AssertSext).
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
const SDValue *Parts, unsigned NumParts,
- MVT PartVT, EVT ValueVT, const Value *V) {
+ MVT PartVT, EVT ValueVT, const Value *V,
+ bool IsABIRegCopy) {
assert(ValueVT.isVector() && "Not a vector value");
assert(NumParts > 0 && "No parts to assemble!");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -287,9 +293,18 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
EVT IntermediateVT;
MVT RegisterVT;
unsigned NumIntermediates;
- unsigned NumRegs =
- TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
- NumIntermediates, RegisterVT);
+ unsigned NumRegs;
+
+ if (IsABIRegCopy) {
+ NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
+ *DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates,
+ RegisterVT);
+ } else {
+ NumRegs =
+ TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ }
+
assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
NumParts = NumRegs; // Silence a compiler warning.
assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
@@ -318,9 +333,14 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
// intermediate operands.
+ EVT BuiltVectorTy =
+ EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(),
+ (IntermediateVT.isVector()
+ ? IntermediateVT.getVectorNumElements() * NumParts
+ : NumIntermediates));
Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
: ISD::BUILD_VECTOR,
- DL, ValueVT, Ops);
+ DL, BuiltVectorTy, Ops);
}
// There is now one part, held in Val. Correct it to match ValueVT.
@@ -359,23 +379,40 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
TLI.isTypeLegal(ValueVT))
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
- // Handle cases such as i8 -> <1 x i1>
if (ValueVT.getVectorNumElements() != 1) {
- diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
- "non-trivial scalar-to-vector conversion");
- return DAG.getUNDEF(ValueVT);
+    // Certain ABIs require that vectors are passed as integers. When the
+    // vector and the part are the same size, this is an obvious bitcast.
+ if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) {
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ } else if (ValueVT.getSizeInBits() < PartEVT.getSizeInBits()) {
+ // Bitcast Val back the original type and extract the corresponding
+ // vector we want.
+ unsigned Elts = PartEVT.getSizeInBits() / ValueVT.getScalarSizeInBits();
+ EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(),
+ ValueVT.getVectorElementType(), Elts);
+ Val = DAG.getBitcast(WiderVecType, Val);
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
+ diagnosePossiblyInvalidConstraint(
+ *DAG.getContext(), V, "non-trivial scalar-to-vector conversion");
+ return DAG.getUNDEF(ValueVT);
}
- if (ValueVT.getVectorNumElements() == 1 &&
- ValueVT.getVectorElementType() != PartEVT)
- Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType());
+ // Handle cases such as i8 -> <1 x i1>
+ EVT ValueSVT = ValueVT.getVectorElementType();
+ if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
+ Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
+ : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
+ return DAG.getBuildVector(ValueVT, DL, Val);
}
static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
SDValue Val, SDValue *Parts, unsigned NumParts,
- MVT PartVT, const Value *V);
+ MVT PartVT, const Value *V, bool IsABIRegCopy);
/// getCopyToParts - Create a series of nodes that contain the specified value
/// split into legal parts. If the parts contain more bits than Val, then, for
@@ -383,12 +420,14 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
SDValue *Parts, unsigned NumParts, MVT PartVT,
const Value *V,
- ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND,
+ bool IsABIRegCopy = false) {
EVT ValueVT = Val.getValueType();
// Handle the vector case separately.
if (ValueVT.isVector())
- return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V);
+ return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V,
+ IsABIRegCopy);
unsigned PartBits = PartVT.getSizeInBits();
unsigned OrigNumParts = NumParts;
@@ -513,7 +552,9 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
/// value split into legal parts.
static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
SDValue Val, SDValue *Parts, unsigned NumParts,
- MVT PartVT, const Value *V) {
+ MVT PartVT, const Value *V,
+ bool IsABIRegCopy) {
+
EVT ValueVT = Val.getValueType();
assert(ValueVT.isVector() && "Not a vector");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -541,7 +582,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
e = PartVT.getVectorNumElements(); i != e; ++i)
Ops.push_back(DAG.getUNDEF(ElementVT));
- Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, Ops);
+ Val = DAG.getBuildVector(PartVT, DL, Ops);
// FIXME: Use CONCAT for 2x -> 4x.
@@ -554,17 +595,23 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// Promoted vector extract
Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
- } else{
- // Vector -> scalar conversion.
- assert(ValueVT.getVectorNumElements() == 1 &&
- "Only trivial vector-to-scalar conversions should get here!");
- Val = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ } else {
+ if (ValueVT.getVectorNumElements() == 1) {
+ Val = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
- Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+ } else {
+ assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() &&
+ "lossy conversion of vector to scalar type");
+ EVT IntermediateType =
+ EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+ Val = DAG.getBitcast(IntermediateType, Val);
+ Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+ }
}
+ assert(Val.getValueType() == PartVT && "Unexpected vector part value type");
Parts[0] = Val;
return;
}
@@ -573,15 +620,31 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
EVT IntermediateVT;
MVT RegisterVT;
unsigned NumIntermediates;
- unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT,
- IntermediateVT,
- NumIntermediates, RegisterVT);
+ unsigned NumRegs;
+ if (IsABIRegCopy) {
+ NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
+ *DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates,
+ RegisterVT);
+ } else {
+ NumRegs =
+ TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ }
unsigned NumElements = ValueVT.getVectorNumElements();
assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
NumParts = NumRegs; // Silence a compiler warning.
assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+  // Convert the vector to the appropriate type if necessary.
+ unsigned DestVectorNoElts =
+ NumIntermediates *
+ (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1);
+ EVT BuiltVectorTy = EVT::getVectorVT(
+ *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
+ if (Val.getValueType() != BuiltVectorTy)
+ Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
+
// Split the vector into intermediate operands.
SmallVector<SDValue, 8> Ops(NumIntermediates);
for (unsigned i = 0; i != NumIntermediates; ++i) {
@@ -614,30 +677,35 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
}
}
-RegsForValue::RegsForValue() {}
+RegsForValue::RegsForValue() { IsABIMangled = false; }
RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
- EVT valuevt)
- : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
+ EVT valuevt, bool IsABIMangledValue)
+ : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs),
+ RegCount(1, regs.size()), IsABIMangled(IsABIMangledValue) {}
RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
- const DataLayout &DL, unsigned Reg, Type *Ty) {
+ const DataLayout &DL, unsigned Reg, Type *Ty,
+ bool IsABIMangledValue) {
ComputeValueVTs(TLI, DL, Ty, ValueVTs);
+ IsABIMangled = IsABIMangledValue;
+
for (EVT ValueVT : ValueVTs) {
- unsigned NumRegs = TLI.getNumRegisters(Context, ValueVT);
- MVT RegisterVT = TLI.getRegisterType(Context, ValueVT);
+ unsigned NumRegs = IsABIMangledValue
+ ? TLI.getNumRegistersForCallingConv(Context, ValueVT)
+ : TLI.getNumRegisters(Context, ValueVT);
+ MVT RegisterVT = IsABIMangledValue
+ ? TLI.getRegisterTypeForCallingConv(Context, ValueVT)
+ : TLI.getRegisterType(Context, ValueVT);
for (unsigned i = 0; i != NumRegs; ++i)
Regs.push_back(Reg + i);
RegVTs.push_back(RegisterVT);
+ RegCount.push_back(NumRegs);
Reg += NumRegs;
}
}
-/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
-/// this value and returns the result as a ValueVT value. This uses
-/// Chain/Flag as the input and updates them for the output Chain/Flag.
-/// If the Flag pointer is NULL, no flag is used.
SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
FunctionLoweringInfo &FuncInfo,
const SDLoc &dl, SDValue &Chain,
@@ -654,8 +722,10 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
// Copy the legal parts from the registers.
EVT ValueVT = ValueVTs[Value];
- unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
- MVT RegisterVT = RegVTs[Value];
+ unsigned NumRegs = RegCount[Value];
+ MVT RegisterVT = IsABIMangled
+ ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
+ : RegVTs[Value];
Parts.resize(NumRegs);
for (unsigned i = 0; i != NumRegs; ++i) {
@@ -683,7 +753,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
unsigned RegSize = RegisterVT.getSizeInBits();
unsigned NumSignBits = LOI->NumSignBits;
- unsigned NumZeroBits = LOI->KnownZero.countLeadingOnes();
+ unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();
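// A minimal sketch of the new API: countMinLeadingZeros() on the KnownBits
// bundle replaces counting leading ones in the old KnownZero mask. For an
// i32 register whose top 24 bits are proven zero:
//
//   KnownBits Known(32);
//   Known.Zero.setHighBits(24);
//   unsigned LZ = Known.countMinLeadingZeros();  // == 24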
if (NumZeroBits == RegSize) {
// The current value is a zero.
@@ -739,10 +809,6 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
}
-/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
-/// specified value into the registers specified by this object. This uses
-/// Chain/Flag as the input and updates them for the output Chain/Flag.
-/// If the Flag pointer is NULL, no flag is used.
void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
const SDLoc &dl, SDValue &Chain, SDValue *Flag,
const Value *V,
@@ -754,9 +820,11 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
unsigned NumRegs = Regs.size();
SmallVector<SDValue, 8> Parts(NumRegs);
for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
- EVT ValueVT = ValueVTs[Value];
- unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
- MVT RegisterVT = RegVTs[Value];
+ unsigned NumParts = RegCount[Value];
+
+ MVT RegisterVT = IsABIMangled
+ ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
+ : RegVTs[Value];
if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
ExtendKind = ISD::ZERO_EXTEND;
@@ -796,9 +864,6 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
}
-/// AddInlineAsmOperands - Add this value to the specified inlineasm node
-/// operand list. This adds the code marker and includes the number of
-/// values added into it.
void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
unsigned MatchingIdx, const SDLoc &dl,
SelectionDAG &DAG,
@@ -840,9 +905,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
}
}
-void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa,
const TargetLibraryInfo *li) {
- AA = &aa;
+ AA = aa;
GFI = gfi;
LibInfo = li;
DL = &DAG.getDataLayout();
@@ -850,12 +915,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
LPadToCallSiteMap.clear();
}
-/// clear - Clear out the current SelectionDAG and the associated
-/// state and prepare this SelectionDAGBuilder object to be used
-/// for a new block. This doesn't clear out information about
-/// additional blocks that are needed to complete switch lowering
-/// or PHI node updating; that information is cleared out as it is
-/// consumed.
void SelectionDAGBuilder::clear() {
NodeMap.clear();
UnusedArgNodeMap.clear();
@@ -867,21 +926,10 @@ void SelectionDAGBuilder::clear() {
StatepointLowering.clear();
}
-/// clearDanglingDebugInfo - Clear the dangling debug information
-/// map. This function is separated from the clear so that debug
-/// information that is dangling in a basic block can be properly
-/// resolved in a different basic block. This allows the
-/// SelectionDAG to resolve dangling debug information attached
-/// to PHI nodes.
void SelectionDAGBuilder::clearDanglingDebugInfo() {
DanglingDebugInfoMap.clear();
}
-/// getRoot - Return the current virtual root of the Selection DAG,
-/// flushing any PendingLoad items. This must be done before emitting
-/// a store or any other node that may need to be ordered after any
-/// prior load instructions.
-///
SDValue SelectionDAGBuilder::getRoot() {
if (PendingLoads.empty())
return DAG.getRoot();
@@ -901,10 +949,6 @@ SDValue SelectionDAGBuilder::getRoot() {
return Root;
}
-/// getControlRoot - Similar to getRoot, but instead of flushing all the
-/// PendingLoad items, flush all the PendingExports items. It is necessary
-/// to do this before emitting a terminator instruction.
-///
SDValue SelectionDAGBuilder::getControlRoot() {
SDValue Root = DAG.getRoot();
@@ -937,7 +981,9 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
HandlePHINodesInSuccessorBlocks(I.getParent());
}
- ++SDNodeOrder;
+ // Increase the SDNodeOrder if dealing with a non-debug instruction.
+ if (!isa<DbgInfoIntrinsic>(I))
+ ++SDNodeOrder;
CurInst = &I;
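// Skipping the increment for debug intrinsics keeps SDNodeOrder identical
// with and without debug info, so the presence of dbg.value instructions
// cannot perturb code generation.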
@@ -1001,10 +1047,12 @@ SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
if (It != FuncInfo.ValueMap.end()) {
unsigned InReg = It->second;
+
RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
- DAG.getDataLayout(), InReg, Ty);
+ DAG.getDataLayout(), InReg, Ty, isABIRegCopy(V));
SDValue Chain = DAG.getEntryNode();
- Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
+ Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
+ V);
resolveDanglingDebugInfo(V, Result);
}
@@ -1122,8 +1170,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
if (isa<ArrayType>(CDS->getType()))
return DAG.getMergeValues(Ops, getCurSDLoc());
- return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(),
- VT, Ops);
+ return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
}
if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
@@ -1175,7 +1222,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
}
// Create a BUILD_VECTOR node.
- return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops);
+ return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
}
// If this is a static alloca, generate it as the frameindex instead of
@@ -1185,14 +1232,15 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end())
return DAG.getFrameIndex(SI->second,
- TLI.getPointerTy(DAG.getDataLayout()));
+ TLI.getFrameIndexTy(DAG.getDataLayout()));
}
// If this is an instruction which fast-isel has deferred, select it now.
if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
+
RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
- Inst->getType());
+ Inst->getType(), isABIRegCopy(V));
SDValue Chain = DAG.getEntryNode();
return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
}
@@ -1384,7 +1432,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
RetPtr.getValueType(), RetPtr,
DAG.getIntPtrConstant(Offsets[i],
getCurSDLoc()),
- &Flags);
+ Flags);
Chains[i] = DAG.getStore(Chain, getCurSDLoc(),
SDValue(RetOp.getNode(), RetOp.getResNo() + i),
// FIXME: better loc info would be nice.
@@ -1403,16 +1451,16 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
const Function *F = I.getParent()->getParent();
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
LLVMContext &Context = F->getContext();
- bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
- Attribute::InReg);
+ bool RetInReg = F->getAttributes().hasAttribute(
+ AttributeList::ReturnIndex, Attribute::InReg);
for (unsigned j = 0; j != NumValues; ++j) {
EVT VT = ValueVTs[j];
@@ -1420,12 +1468,12 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
- unsigned NumParts = TLI.getNumRegisters(Context, VT);
- MVT PartVT = TLI.getRegisterType(Context, VT);
+ unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, VT);
+ MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, VT);
SmallVector<SDValue, 4> Parts(NumParts);
getCopyToParts(DAG, getCurSDLoc(),
SDValue(RetOp.getNode(), RetOp.getResNo() + j),
- &Parts[0], NumParts, PartVT, &I, ExtendKind);
+ &Parts[0], NumParts, PartVT, &I, ExtendKind, true);
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
@@ -1461,9 +1509,10 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
true /*isfixed*/, 1 /*origidx*/,
0 /*partOffs*/));
// Create SDNode for the swifterror virtual register.
- OutVals.push_back(DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVReg(
- FuncInfo.MBB, FuncInfo.SwiftErrorArg),
- EVT(TLI.getPointerTy(DL))));
+ OutVals.push_back(
+ DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVRegUseAt(
+ &I, FuncInfo.MBB, FuncInfo.SwiftErrorArg).first,
+ EVT(TLI.getPointerTy(DL))));
}
bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
@@ -1582,7 +1631,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
BranchProbability TProb,
- BranchProbability FProb) {
+ BranchProbability FProb,
+ bool InvertCond) {
const BasicBlock *BB = CurBB->getBasicBlock();
// If the leaf of the tree is a comparison, merge the condition into
@@ -1596,10 +1646,14 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
ISD::CondCode Condition;
if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
- Condition = getICmpCondCode(IC->getPredicate());
+ ICmpInst::Predicate Pred =
+ InvertCond ? IC->getInversePredicate() : IC->getPredicate();
+ Condition = getICmpCondCode(Pred);
} else {
const FCmpInst *FC = cast<FCmpInst>(Cond);
- Condition = getFCmpCondCode(FC->getPredicate());
+ FCmpInst::Predicate Pred =
+ InvertCond ? FC->getInversePredicate() : FC->getPredicate();
+ Condition = getFCmpCondCode(Pred);
if (TM.Options.NoNaNsFPMath)
Condition = getFCmpCodeWithoutNaN(Condition);
}
@@ -1612,7 +1666,8 @@ SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
}
// Create a CaseBlock record representing this branch.
- CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()),
+ ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ;
+ CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()),
nullptr, TBB, FBB, CurBB, TProb, FProb);
SwitchCases.push_back(CB);
}
@@ -1625,16 +1680,44 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
MachineBasicBlock *SwitchBB,
Instruction::BinaryOps Opc,
BranchProbability TProb,
- BranchProbability FProb) {
- // If this node is not part of the or/and tree, emit it as a branch.
+ BranchProbability FProb,
+ bool InvertCond) {
+  // If the condition is a 'not', skip over it and remember to invert the
+  // opcode and the operands at the next level.
+ if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) {
+ const Value *CondOp = BinaryOperator::getNotArgument(Cond);
+ if (InBlock(CondOp, CurBB->getBasicBlock())) {
+ FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
+ !InvertCond);
+ return;
+ }
+ }
+
const Instruction *BOp = dyn_cast<Instruction>(Cond);
+ // Compute the effective opcode for Cond, taking into account whether it needs
+ // to be inverted, e.g.
+ // and (not (or A, B)), C
+ // gets lowered as
+ // and (and (not A, not B), C)
+ unsigned BOpc = 0;
+ if (BOp) {
+ BOpc = BOp->getOpcode();
+ if (InvertCond) {
+ if (BOpc == Instruction::And)
+ BOpc = Instruction::Or;
+ else if (BOpc == Instruction::Or)
+ BOpc = Instruction::And;
+ }
+ }
+
+ // If this node is not part of the or/and tree, emit it as a branch.
if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) ||
- (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() ||
+ BOpc != Opc || !BOp->hasOneUse() ||
BOp->getParent() != CurBB->getBasicBlock() ||
!InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||
!InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB,
- TProb, FProb);
+ TProb, FProb, InvertCond);
return;
}
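// A worked example of the inversion: with InvertCond = true the compare is
// emitted with its inverse predicate, and the fallback CaseBlock tests SETNE
// instead of SETEQ, so
//   br (not (icmp eq %a, %b)), TBB, FBB
// is emitted exactly like
//   br (icmp ne %a, %b), TBB, FBB
// without materializing the 'not'.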
@@ -1669,14 +1752,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
auto NewFalseProb = TProb / 2 + FProb;
// Emit the LHS condition.
FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc,
- NewTrueProb, NewFalseProb);
+ NewTrueProb, NewFalseProb, InvertCond);
// Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
// Emit the RHS condition into TmpBB.
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
- Probs[0], Probs[1]);
+ Probs[0], Probs[1], InvertCond);
} else {
assert(Opc == Instruction::And && "Unknown merge op!");
// Codegen X & Y as:
@@ -1702,14 +1785,14 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
auto NewFalseProb = FProb / 2;
// Emit the LHS condition.
FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc,
- NewTrueProb, NewFalseProb);
+ NewTrueProb, NewFalseProb, InvertCond);
// Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
// Emit the RHS condition into TmpBB.
FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
- Probs[0], Probs[1]);
+ Probs[0], Probs[1], InvertCond);
}
}
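// A worked example for the X || Y case with TProb = 3/4 and FProb = 1/4:
// the LHS is emitted with true/false probabilities TProb/2 = 3/8 and
// TProb/2 + FProb = 5/8, and the RHS pair {TProb/2, FProb} = {3/8, 2/8}
// normalizes to {3/5, 2/5}, so the edge probabilities out of TmpBB again
// sum to one.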
@@ -1793,7 +1876,8 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
Opcode,
getEdgeProbability(BrMBB, Succ0MBB),
- getEdgeProbability(BrMBB, Succ1MBB));
+ getEdgeProbability(BrMBB, Succ1MBB),
+ /*InvertCond=*/false);
// If the compares in later blocks need to use values not currently
// exported from this block, export them now. This block should always
// be the first entry.
@@ -2027,7 +2111,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
Entry.Node = StackSlot;
Entry.Ty = FnTy->getParamType(0);
if (Fn->hasAttribute(1, Attribute::AttrKind::InReg))
- Entry.isInReg = true;
+ Entry.IsInReg = true;
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
@@ -2581,15 +2665,15 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
Flags.setNoSignedWrap(nsw);
Flags.setNoUnsignedWrap(nuw);
Flags.setVectorReduction(vec_redux);
- if (EnableFMFInDAG) {
- Flags.setAllowReciprocal(FMF.allowReciprocal());
- Flags.setNoInfs(FMF.noInfs());
- Flags.setNoNaNs(FMF.noNaNs());
- Flags.setNoSignedZeros(FMF.noSignedZeros());
- Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
- }
+ Flags.setAllowReciprocal(FMF.allowReciprocal());
+ Flags.setAllowContract(FMF.allowContract());
+ Flags.setNoInfs(FMF.noInfs());
+ Flags.setNoNaNs(FMF.noNaNs());
+ Flags.setNoSignedZeros(FMF.noSignedZeros());
+ Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
+
SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
- Op1, Op2, &Flags);
+ Op1, Op2, Flags);
setValue(&I, BinNodeValue);
}
@@ -2642,7 +2726,7 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
Flags.setNoSignedWrap(nsw);
Flags.setNoUnsignedWrap(nuw);
SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
- &Flags);
+ Flags);
setValue(&I, Res);
}
@@ -2654,7 +2738,7 @@ void SelectionDAGBuilder::visitSDiv(const User &I) {
Flags.setExact(isa<PossiblyExactOperator>(&I) &&
cast<PossiblyExactOperator>(&I)->isExact());
setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1,
- Op2, &Flags));
+ Op2, Flags));
}
void SelectionDAGBuilder::visitICmp(const User &I) {
@@ -2914,7 +2998,7 @@ void SelectionDAGBuilder::visitBitCast(const User &I) {
DestVT, N)); // convert types.
// Check if the original LLVM IR Operand was a ConstantInt, because getValue()
// might fold any kind of constant expression to an integer constant and that
- // is not what we are looking for. Only regcognize a bitcast of a genuine
+ // is not what we are looking for. Only recognize a bitcast of a genuine
// constant integer as an opaque constant.
else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
@@ -3067,14 +3151,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
if (SrcNumElts > MaskNumElts) {
// Analyze the access pattern of the vector to see if we can extract
- // two subvectors and do the shuffle. The analysis is done by calculating
- // the range of elements the mask access on both vectors.
- int MinRange[2] = { static_cast<int>(SrcNumElts),
- static_cast<int>(SrcNumElts)};
- int MaxRange[2] = {-1, -1};
-
- for (unsigned i = 0; i != MaskNumElts; ++i) {
- int Idx = Mask[i];
+ // two subvectors and do the shuffle.
+    int StartIdx[2] = { -1, -1 }; // Start index to extract from each source;
+                                  // -1 if the source is unused.
+ bool CanExtract = true;
+ for (int Idx : Mask) {
unsigned Input = 0;
if (Idx < 0)
continue;
@@ -3083,41 +3163,28 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
Input = 1;
Idx -= SrcNumElts;
}
- if (Idx > MaxRange[Input])
- MaxRange[Input] = Idx;
- if (Idx < MinRange[Input])
- MinRange[Input] = Idx;
- }
-
- // Check if the access is smaller than the vector size and can we find
- // a reasonable extract index.
- int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract, -1 = Can not
- // Extract.
- int StartIdx[2]; // StartIdx to extract from
- for (unsigned Input = 0; Input < 2; ++Input) {
- if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) {
- RangeUse[Input] = 0; // Unused
- StartIdx[Input] = 0;
- continue;
- }
- // Find a good start index that is a multiple of the mask length. Then
- // see if the rest of the elements are in range.
- StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
- if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts &&
- StartIdx[Input] + MaskNumElts <= SrcNumElts)
- RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+      // If all the indices come from the same MaskNumElts-sized portion of
+      // the sources, we can use extract. Also make sure the extract wouldn't
+      // read past the end of the source.
+ int NewStartIdx = alignDown(Idx, MaskNumElts);
+ if (NewStartIdx + MaskNumElts > SrcNumElts ||
+ (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx))
+ CanExtract = false;
+      // Make sure we always update StartIdx, as we use it to track whether
+      // all elements are undef.
+ StartIdx[Input] = NewStartIdx;
}
- if (RangeUse[0] == 0 && RangeUse[1] == 0) {
+ if (StartIdx[0] < 0 && StartIdx[1] < 0) {
setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
return;
}
- if (RangeUse[0] >= 0 && RangeUse[1] >= 0) {
+ if (CanExtract) {
// Extract appropriate subvector and generate a vector shuffle
for (unsigned Input = 0; Input < 2; ++Input) {
SDValue &Src = Input == 0 ? Src1 : Src2;
- if (RangeUse[Input] == 0)
+ if (StartIdx[Input] < 0)
Src = DAG.getUNDEF(VT);
else {
Src = DAG.getNode(
@@ -3128,16 +3195,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
}
// Calculate new mask.
- SmallVector<int, 8> MappedOps;
- for (unsigned i = 0; i != MaskNumElts; ++i) {
- int Idx = Mask[i];
- if (Idx >= 0) {
- if (Idx < (int)SrcNumElts)
- Idx -= StartIdx[0];
- else
- Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
- }
- MappedOps.push_back(Idx);
+ SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end());
+ for (int &Idx : MappedOps) {
+ if (Idx >= (int)SrcNumElts)
+ Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
+ else if (Idx >= 0)
+ Idx -= StartIdx[0];
}
setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps));
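// A worked example of the rewritten analysis: SrcNumElts = 8,
// MaskNumElts = 4, Mask = <4, 5, 6, 7>. Every index falls in the second
// aligned 4-element chunk of Src1, so StartIdx[0] = alignDown(4, 4) = 4 and
// CanExtract remains true; Src2 is never referenced, so StartIdx[1] stays -1
// and Src2 becomes UNDEF. The remap loop then produces the new mask
// <0, 1, 2, 3> over the subvector extracted from Src1 at element 4.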
@@ -3151,8 +3214,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
EVT EltVT = VT.getVectorElementType();
EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
SmallVector<SDValue,8> Ops;
- for (unsigned i = 0; i != MaskNumElts; ++i) {
- int Idx = Mask[i];
+ for (int Idx : Mask) {
SDValue Res;
if (Idx < 0) {
@@ -3168,10 +3230,16 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
Ops.push_back(Res);
}
- setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops));
+ setValue(&I, DAG.getBuildVector(VT, DL, Ops));
}
-void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
+void SelectionDAGBuilder::visitInsertValue(const User &I) {
+ ArrayRef<unsigned> Indices;
+ if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(&I))
+ Indices = IV->getIndices();
+ else
+ Indices = cast<ConstantExpr>(&I)->getIndices();
+
const Value *Op0 = I.getOperand(0);
const Value *Op1 = I.getOperand(1);
Type *AggTy = I.getType();
@@ -3179,7 +3247,7 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
bool IntoUndef = isa<UndefValue>(Op0);
bool FromUndef = isa<UndefValue>(Op1);
- unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
+ unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> AggValueVTs;
@@ -3219,13 +3287,19 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
DAG.getVTList(AggValueVTs), Values));
}
-void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
+void SelectionDAGBuilder::visitExtractValue(const User &I) {
+ ArrayRef<unsigned> Indices;
+ if (const ExtractValueInst *EV = dyn_cast<ExtractValueInst>(&I))
+ Indices = EV->getIndices();
+ else
+ Indices = cast<ConstantExpr>(&I)->getIndices();
+
const Value *Op0 = I.getOperand(0);
Type *AggTy = Op0->getType();
Type *ValTy = I.getType();
bool OutOfUndef = isa<UndefValue>(Op0);
- unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
+ unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 4> ValValueVTs;
@@ -3281,14 +3355,14 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
// N = N + Offset
uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
- // In an inbouds GEP with an offset that is nonnegative even when
+ // In an inbounds GEP with an offset that is nonnegative even when
// interpreted as signed, assume there is no unsigned overflow.
SDNodeFlags Flags;
if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
Flags.setNoUnsignedWrap(true);
N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
- DAG.getConstant(Offset, dl, N.getValueType()), &Flags);
+ DAG.getConstant(Offset, dl, N.getValueType()), Flags);
}
} else {
MVT PtrTy =
@@ -3318,7 +3392,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
Flags.setNoUnsignedWrap(true);
- N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags);
+ N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags);
continue;
}
@@ -3326,7 +3400,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
SDValue IdxN = getValue(Idx);
if (!IdxN.getValueType().isVector() && VectorWidth) {
- MVT VT = MVT::getVectorVT(IdxN.getValueType().getSimpleVT(), VectorWidth);
+ EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth);
IdxN = DAG.getSplatBuildVector(VT, dl, IdxN);
}
@@ -3396,7 +3470,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
Flags.setNoUnsignedWrap(true);
AllocSize = DAG.getNode(ISD::ADD, dl,
AllocSize.getValueType(), AllocSize,
- DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags);
+ DAG.getIntPtrConstant(StackAlign - 1, dl), Flags);
// Mask out the low bits for alignment purposes.
AllocSize = DAG.getNode(ISD::AND, dl,
@@ -3459,7 +3533,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (isVolatile || NumValues > MaxParallelChains)
// Serialize volatile loads with other side effects.
Root = getRoot();
- else if (AA->pointsToConstantMemory(MemoryLocation(
+ else if (AA && AA->pointsToConstantMemory(MemoryLocation(
SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
@@ -3500,7 +3574,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
SDValue A = DAG.getNode(ISD::ADD, dl,
PtrVT, Ptr,
DAG.getConstant(Offsets[i], dl, PtrVT),
- &Flags);
+ Flags);
auto MMOFlags = MachineMemOperand::MONone;
if (isVolatile)
MMOFlags |= MachineMemOperand::MOVolatile;
@@ -3510,6 +3584,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
MMOFlags |= MachineMemOperand::MOInvariant;
if (isDereferenceable)
MMOFlags |= MachineMemOperand::MODereferenceable;
+ MMOFlags |= TLI.getMMOFlags(I);
SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A,
MachinePointerInfo(SV, Offsets[i]), Alignment,
@@ -3533,8 +3608,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
}
void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- assert(TLI.supportSwiftError() &&
+ assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
"call visitStoreToSwiftError when backend supports swifterror");
SmallVector<EVT, 4> ValueVTs;
@@ -3547,15 +3621,15 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
SDValue Src = getValue(SrcV);
// Create a virtual register, then update the virtual register.
- auto &DL = DAG.getDataLayout();
- const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
- unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+ unsigned VReg; bool CreatedVReg;
+ std::tie(VReg, CreatedVReg) = FuncInfo.getOrCreateSwiftErrorVRegDefAt(&I);
// Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
// Chain can be getRoot or getControlRoot.
SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg,
SDValue(Src.getNode(), Src.getResNo()));
DAG.setRoot(CopyNode);
- FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg);
+ if (CreatedVReg)
+ FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg);
}
void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
@@ -3571,8 +3645,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
Type *Ty = I.getType();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- assert(!AA->pointsToConstantMemory(MemoryLocation(
- SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) &&
+ assert((!AA || !AA->pointsToConstantMemory(MemoryLocation(
+ SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) &&
"load_from_swift_error should not be constant memory");
SmallVector<EVT, 4> ValueVTs;
@@ -3585,7 +3659,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
// Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT
SDValue L = DAG.getCopyFromReg(
getRoot(), getCurSDLoc(),
- FuncInfo.getOrCreateSwiftErrorVReg(FuncInfo.MBB, SV), ValueVTs[0]);
+ FuncInfo.getOrCreateSwiftErrorVRegUseAt(&I, FuncInfo.MBB, SV).first,
+ ValueVTs[0]);
setValue(&I, L);
}
@@ -3639,6 +3714,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
MMOFlags |= MachineMemOperand::MOVolatile;
if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr)
MMOFlags |= MachineMemOperand::MONonTemporal;
+ MMOFlags |= TLI.getMMOFlags(I);
// An aggregate load cannot wrap around the address space, so offsets to its
// parts don't wrap either.
@@ -3655,7 +3731,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
ChainI = 0;
}
SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
- DAG.getConstant(Offsets[i], dl, PtrVT), &Flags);
+ DAG.getConstant(Offsets[i], dl, PtrVT), Flags);
SDValue St = DAG.getStore(
Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add,
MachinePointerInfo(PtrV, Offsets[i]), Alignment, MMOFlags, AAInfo);
@@ -3853,7 +3929,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
// Do not serialize masked loads of constant memory with anything.
- bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation(
+ bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation(
PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
@@ -3897,7 +3973,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
bool ConstantMemory = false;
if (UniformBase &&
- AA->pointsToConstantMemory(MemoryLocation(
+ AA && AA->pointsToConstantMemory(MemoryLocation(
BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()),
AAInfo))) {
// Do not serialize (non-volatile) loads of constant memory with anything.
@@ -3929,7 +4005,7 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
SDLoc dl = getCurSDLoc();
AtomicOrdering SuccessOrder = I.getSuccessOrdering();
AtomicOrdering FailureOrder = I.getFailureOrdering();
- SynchronizationScope Scope = I.getSynchScope();
+ SyncScope::ID SSID = I.getSyncScopeID();
SDValue InChain = getRoot();
@@ -3939,7 +4015,7 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain,
getValue(I.getPointerOperand()), getValue(I.getCompareOperand()),
getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()),
- /*Alignment=*/ 0, SuccessOrder, FailureOrder, Scope);
+ /*Alignment=*/ 0, SuccessOrder, FailureOrder, SSID);
SDValue OutChain = L.getValue(2);
@@ -3965,7 +4041,7 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
}
AtomicOrdering Order = I.getOrdering();
- SynchronizationScope Scope = I.getSynchScope();
+ SyncScope::ID SSID = I.getSyncScopeID();
SDValue InChain = getRoot();
@@ -3976,7 +4052,7 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
getValue(I.getPointerOperand()),
getValue(I.getValOperand()),
I.getPointerOperand(),
- /* Alignment=*/ 0, Order, Scope);
+ /* Alignment=*/ 0, Order, SSID);
SDValue OutChain = L.getValue(1);
@@ -3990,16 +4066,16 @@ void SelectionDAGBuilder::visitFence(const FenceInst &I) {
SDValue Ops[3];
Ops[0] = getRoot();
Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl,
- TLI.getPointerTy(DAG.getDataLayout()));
- Ops[2] = DAG.getConstant(I.getSynchScope(), dl,
- TLI.getPointerTy(DAG.getDataLayout()));
+ TLI.getFenceOperandTy(DAG.getDataLayout()));
+ Ops[2] = DAG.getConstant(I.getSyncScopeID(), dl,
+ TLI.getFenceOperandTy(DAG.getDataLayout()));
DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops));
}
void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
SDLoc dl = getCurSDLoc();
AtomicOrdering Order = I.getOrdering();
- SynchronizationScope Scope = I.getSynchScope();
+ SyncScope::ID SSID = I.getSyncScopeID();
SDValue InChain = getRoot();
@@ -4017,7 +4093,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
VT.getStoreSize(),
I.getAlignment() ? I.getAlignment() :
DAG.getEVTAlignment(VT),
- AAMDNodes(), nullptr, Scope, Order);
+ AAMDNodes(), nullptr, SSID, Order);
InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
SDValue L =
@@ -4034,7 +4110,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
SDLoc dl = getCurSDLoc();
AtomicOrdering Order = I.getOrdering();
- SynchronizationScope Scope = I.getSynchScope();
+ SyncScope::ID SSID = I.getSyncScopeID();
SDValue InChain = getRoot();
@@ -4051,7 +4127,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
getValue(I.getPointerOperand()),
getValue(I.getValueOperand()),
I.getPointerOperand(), I.getAlignment(),
- Order, Scope);
+ Order, SSID);
DAG.setRoot(OutChain);
}
@@ -4695,7 +4771,7 @@ static unsigned getUnderlyingArgReg(const SDValue &N) {
/// At the end of instruction selection, they will be inserted into the entry BB.
bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
const Value *V, DILocalVariable *Variable, DIExpression *Expr,
- DILocation *DL, int64_t Offset, bool IsIndirect, const SDValue &N) {
+ DILocation *DL, int64_t Offset, bool IsDbgDeclare, const SDValue &N) {
const Argument *Arg = dyn_cast<Argument>(V);
if (!Arg)
return false;
@@ -4709,9 +4785,11 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
if (!Variable->getScope()->getSubprogram()->describes(MF.getFunction()))
return false;
+ bool IsIndirect = false;
Optional<MachineOperand> Op;
// Some arguments' frame index is recorded during argument lowering.
- if (int FI = FuncInfo.getArgumentFrameIndex(Arg))
+ int FI = FuncInfo.getArgumentFrameIndex(Arg);
+ if (FI != INT_MAX)
Op = MachineOperand::CreateFI(FI);
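// INT_MAX is the new "no frame index" sentinel here; the previous
// `if (int FI = ...)` form silently dropped frame index 0, which is a
// valid index.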
if (!Op && N.getNode()) {
@@ -4722,15 +4800,19 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
if (PR)
Reg = PR;
}
- if (Reg)
+ if (Reg) {
Op = MachineOperand::CreateReg(Reg, false);
+ IsIndirect = IsDbgDeclare;
+ }
}
if (!Op) {
// Check if ValueMap has reg number.
DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
- if (VMI != FuncInfo.ValueMap.end())
+ if (VMI != FuncInfo.ValueMap.end()) {
Op = MachineOperand::CreateReg(VMI->second, false);
+ IsIndirect = IsDbgDeclare;
+ }
}
if (!Op && N.getNode())
@@ -4752,7 +4834,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
else
FuncInfo.ArgDbgValues.push_back(
BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE))
- .addOperand(*Op)
+ .add(*Op)
.addImm(Offset)
.addMetadata(Variable)
.addMetadata(Expr));
@@ -4764,26 +4846,17 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
DILocalVariable *Variable,
DIExpression *Expr, int64_t Offset,
- DebugLoc dl,
+ const DebugLoc &dl,
unsigned DbgSDNodeOrder) {
- SDDbgValue *SDV;
- auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode());
- if (FISDN && Expr->startsWithDeref()) {
+ if (auto *FISDN = dyn_cast<FrameIndexSDNode>(N.getNode())) {
// Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe
// stack slot locations as such instead of as indirectly addressed
// locations.
- ArrayRef<uint64_t> TrailingElements(Expr->elements_begin() + 1,
- Expr->elements_end());
- DIExpression *DerefedDIExpr =
- DIExpression::get(*DAG.getContext(), TrailingElements);
- int FI = FISDN->getIndex();
- SDV = DAG.getFrameIndexDbgValue(Variable, DerefedDIExpr, FI, 0, dl,
- DbgSDNodeOrder);
- } else {
- SDV = DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false,
- Offset, dl, DbgSDNodeOrder);
+ return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(), 0, dl,
+ DbgSDNodeOrder);
}
- return SDV;
+ return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false,
+ Offset, dl, DbgSDNodeOrder);
}
// VisualStudio defines setjmp as _setjmp
@@ -4794,9 +4867,9 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
# define setjmp_undefined_for_msvc
#endif
-/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If
-/// we want to emit this as a call to a named external function, return the name
-/// otherwise lower it and return null.
+/// Lower the call to the specified intrinsic function. If we want to emit this
+/// as a call to a named external function, return the name. Otherwise, lower it
+/// and return null.
const char *
SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -4897,11 +4970,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
updateDAGForMaybeTailCall(MM);
return nullptr;
}
- case Intrinsic::memcpy_element_atomic: {
- SDValue Dst = getValue(I.getArgOperand(0));
- SDValue Src = getValue(I.getArgOperand(1));
- SDValue NumElements = getValue(I.getArgOperand(2));
- SDValue ElementSize = getValue(I.getArgOperand(3));
+ case Intrinsic::memcpy_element_unordered_atomic: {
+ const ElementUnorderedAtomicMemCpyInst &MI =
+ cast<ElementUnorderedAtomicMemCpyInst>(I);
+ SDValue Dst = getValue(MI.getRawDest());
+ SDValue Src = getValue(MI.getRawSource());
+ SDValue Length = getValue(MI.getLength());
// Emit a library call.
TargetLowering::ArgListTy Args;
@@ -4912,31 +4986,101 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
Entry.Node = Src;
Args.push_back(Entry);
-
- Entry.Ty = I.getArgOperand(2)->getType();
- Entry.Node = NumElements;
+
+ Entry.Ty = MI.getLength()->getType();
+ Entry.Node = Length;
+ Args.push_back(Entry);
+
+ uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
+ TLI.getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+ DAG.setRoot(CallResult.second);
+ return nullptr;
+ }
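// The RTLIB query above keys the libcall on the constant element size, so
// e.g. an element size of 4 is expected to resolve to a name like
// __llvm_memcpy_element_unordered_atomic_4; sizes without a matching
// libcall hit the report_fatal_error path.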
+ case Intrinsic::memmove_element_unordered_atomic: {
+ auto &MI = cast<ElementUnorderedAtomicMemMoveInst>(I);
+ SDValue Dst = getValue(MI.getRawDest());
+ SDValue Src = getValue(MI.getRawSource());
+ SDValue Length = getValue(MI.getLength());
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Entry.Node = Dst;
Args.push_back(Entry);
-
- Entry.Ty = Type::getInt32Ty(*DAG.getContext());
- Entry.Node = ElementSize;
+
+ Entry.Node = Src;
Args.push_back(Entry);
- uint64_t ElementSizeConstant =
- cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+ Entry.Ty = MI.getLength()->getType();
+ Entry.Node = Length;
+ Args.push_back(Entry);
+
+ uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
+ RTLIB::Libcall LibraryCall =
+ RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
+ if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported element size");
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
+ TLI.getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+ DAG.setRoot(CallResult.second);
+ return nullptr;
+ }
+ case Intrinsic::memset_element_unordered_atomic: {
+ auto &MI = cast<ElementUnorderedAtomicMemSetInst>(I);
+ SDValue Dst = getValue(MI.getRawDest());
+ SDValue Val = getValue(MI.getValue());
+ SDValue Length = getValue(MI.getLength());
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+ Entry.Node = Dst;
+ Args.push_back(Entry);
+
+ Entry.Ty = Type::getInt8Ty(*DAG.getContext());
+ Entry.Node = Val;
+ Args.push_back(Entry);
+
+ Entry.Ty = MI.getLength()->getType();
+ Entry.Node = Length;
+ Args.push_back(Entry);
+
+ uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
RTLIB::Libcall LibraryCall =
- RTLIB::getMEMCPY_ELEMENT_ATOMIC(ElementSizeConstant);
+ RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
report_fatal_error("Unsupported element size");
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl)
- .setChain(getRoot())
- .setCallee(TLI.getLibcallCallingConv(LibraryCall),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(
- TLI.getLibcallName(LibraryCall),
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args));
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
+ TLI.getLibcallCallingConv(LibraryCall),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
DAG.setRoot(CallResult.second);
@@ -4960,6 +5104,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return nullptr;
}
+ // Byval arguments with frame indices were already handled after argument
+ // lowering and before isel.
+ const auto *Arg =
+ dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
+ if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
+ return nullptr;
+
SDValue &N = NodeMap[Address];
if (!N.getNode() && isa<Argument>(Address))
// Check unused arguments map.
@@ -4978,8 +5129,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
} else if (isa<Argument>(Address)) {
// Address is an argument, so try to emit its dbg value using
// virtual register info from the FuncInfo.ValueMap.
- EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false,
- N);
+ EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N);
return nullptr;
} else {
SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
@@ -4989,22 +5139,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
} else {
// If Address is an argument then try to emit its dbg value using
// virtual register info from the FuncInfo.ValueMap.
- if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false,
+ if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true,
N)) {
- // If variable is pinned by a alloca in dominating bb then
- // use StaticAllocaMap.
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
- if (AI->getParent() != DI.getParent()) {
- DenseMap<const AllocaInst*, int>::iterator SI =
- FuncInfo.StaticAllocaMap.find(AI);
- if (SI != FuncInfo.StaticAllocaMap.end()) {
- SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second,
- 0, dl, SDNodeOrder);
- DAG.AddDbgValue(SDV, nullptr, false);
- return nullptr;
- }
- }
- }
DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
}
}
@@ -5026,45 +5162,33 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl,
SDNodeOrder);
DAG.AddDbgValue(SDV, nullptr, false);
- } else {
- // Do not use getValue() in here; we don't want to generate code at
- // this point if it hasn't been done yet.
- SDValue N = NodeMap[V];
- if (!N.getNode() && isa<Argument>(V))
- // Check unused arguments map.
- N = UnusedArgNodeMap[V];
- if (N.getNode()) {
- if (!EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset,
- false, N)) {
- SDV = getDbgValue(N, Variable, Expression, Offset, dl, SDNodeOrder);
- DAG.AddDbgValue(SDV, N.getNode(), false);
- }
- } else if (!V->use_empty() ) {
- // Do not call getValue(V) yet, as we don't want to generate code.
- // Remember it for later.
- DanglingDebugInfo DDI(&DI, dl, SDNodeOrder);
- DanglingDebugInfoMap[V] = DDI;
- } else {
- // We may expand this to cover more cases. One case where we have no
- // data available is an unreferenced parameter.
- DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
- }
+ return nullptr;
}
- // Build a debug info table entry.
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(V))
- V = BCI->getOperand(0);
- const AllocaInst *AI = dyn_cast<AllocaInst>(V);
- // Don't handle byval struct arguments or VLAs, for example.
- if (!AI) {
- DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n");
- DEBUG(dbgs() << " Last seen at:\n " << *V << "\n");
+ // Do not use getValue() in here; we don't want to generate code at
+ // this point if it hasn't been done yet.
+ SDValue N = NodeMap[V];
+ if (!N.getNode() && isa<Argument>(V)) // Check unused arguments map.
+ N = UnusedArgNodeMap[V];
+ if (N.getNode()) {
+ if (EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, false,
+ N))
+ return nullptr;
+ SDV = getDbgValue(N, Variable, Expression, Offset, dl, SDNodeOrder);
+ DAG.AddDbgValue(SDV, N.getNode(), false);
return nullptr;
}
- DenseMap<const AllocaInst*, int>::iterator SI =
- FuncInfo.StaticAllocaMap.find(AI);
- if (SI == FuncInfo.StaticAllocaMap.end())
- return nullptr; // VLAs.
+
+  if (!V->use_empty()) {
+ // Do not call getValue(V) yet, as we don't want to generate code.
+ // Remember it for later.
+ DanglingDebugInfo DDI(&DI, dl, SDNodeOrder);
+ DanglingDebugInfoMap[V] = DDI;
+ return nullptr;
+ }
+
+ DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n");
+ DEBUG(dbgs() << " Last seen at:\n " << *V << "\n");
return nullptr;
}
@@ -5202,7 +5326,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue ShOps[2];
ShOps[0] = ShAmt;
ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps);
+ ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps);
EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
@@ -5301,6 +5425,25 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2))));
return nullptr;
+ case Intrinsic::experimental_constrained_fadd:
+ case Intrinsic::experimental_constrained_fsub:
+ case Intrinsic::experimental_constrained_fmul:
+ case Intrinsic::experimental_constrained_fdiv:
+ case Intrinsic::experimental_constrained_frem:
+ case Intrinsic::experimental_constrained_sqrt:
+ case Intrinsic::experimental_constrained_pow:
+ case Intrinsic::experimental_constrained_powi:
+ case Intrinsic::experimental_constrained_sin:
+ case Intrinsic::experimental_constrained_cos:
+ case Intrinsic::experimental_constrained_exp:
+ case Intrinsic::experimental_constrained_exp2:
+ case Intrinsic::experimental_constrained_log:
+ case Intrinsic::experimental_constrained_log10:
+ case Intrinsic::experimental_constrained_log2:
+ case Intrinsic::experimental_constrained_rint:
+ case Intrinsic::experimental_constrained_nearbyint:
+ visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
+ return nullptr;
case Intrinsic::fmuladd: {
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
@@ -5537,7 +5680,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::trap: {
StringRef TrapFuncName =
I.getAttributes()
- .getAttribute(AttributeSet::FunctionIndex, "trap-func-name")
+ .getAttribute(AttributeList::FunctionIndex, "trap-func-name")
.getValueAsString();
if (TrapFuncName.empty()) {
ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
@@ -5548,7 +5691,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
TargetLowering::ArgListTy Args;
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee(
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
CallingConv::C, I.getType(),
DAG.getExternalSymbol(TrapFuncName.data(),
TLI.getPointerTy(DAG.getDataLayout())),
@@ -5629,7 +5772,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
SDValue Ops[2];
Ops[0] = getRoot();
Ops[1] =
- DAG.getFrameIndex(FI, TLI.getPointerTy(DAG.getDataLayout()), true);
+ DAG.getFrameIndex(FI, TLI.getFrameIndexTy(DAG.getDataLayout()), true);
unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops);
@@ -5690,7 +5833,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
int FI = FuncInfo.StaticAllocaMap[Slot];
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(MF.getName()), Idx);
+ GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
TII->get(TargetOpcode::LOCAL_ESCAPE))
.addSym(FrameAllocSym)
@@ -5711,7 +5854,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX));
MCSymbol *FrameAllocSym =
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()), IdxVal);
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
// Create an MCSymbol for the label to avoid any target lowering
// that would make this PC relative.
@@ -5742,13 +5885,142 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
setValue(&I, N);
return nullptr;
}
+ case Intrinsic::xray_customevent: {
+    // Here we want to make sure that the intrinsic behaves as if it has a
+    // specific calling convention; for now this is supported only on x86_64.
+ // FIXME: Support other platforms later.
+ const auto &Triple = DAG.getTarget().getTargetTriple();
+ if (Triple.getArch() != Triple::x86_64 || !Triple.isOSLinux())
+ return nullptr;
+ SDLoc DL = getCurSDLoc();
+ SmallVector<SDValue, 8> Ops;
+
+ // We want to say that we always want the arguments in registers.
+ SDValue LogEntryVal = getValue(I.getArgOperand(0));
+ SDValue StrSizeVal = getValue(I.getArgOperand(1));
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Chain = getRoot();
+ Ops.push_back(LogEntryVal);
+ Ops.push_back(StrSizeVal);
+ Ops.push_back(Chain);
+
+    // We need to enforce the calling convention for the callsite, so that
+    // argument ordering is enforced correctly, and so that register
+    // allocation can see that some registers may be assumed clobbered and
+    // must be preserved across calls to the intrinsic.
+ MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL,
+ DL, NodeTys, Ops);
+ SDValue patchableNode = SDValue(MN, 0);
+ DAG.setRoot(patchableNode);
+ setValue(&I, patchableNode);
+ return nullptr;
+ }
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return nullptr;
+
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ visitVectorReduce(I, Intrinsic);
+ return nullptr;
+ }
+
}
}
+void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
+ const ConstrainedFPIntrinsic &FPI) {
+ SDLoc sdl = getCurSDLoc();
+ unsigned Opcode;
+ switch (FPI.getIntrinsicID()) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::experimental_constrained_fadd:
+ Opcode = ISD::STRICT_FADD;
+ break;
+ case Intrinsic::experimental_constrained_fsub:
+ Opcode = ISD::STRICT_FSUB;
+ break;
+ case Intrinsic::experimental_constrained_fmul:
+ Opcode = ISD::STRICT_FMUL;
+ break;
+ case Intrinsic::experimental_constrained_fdiv:
+ Opcode = ISD::STRICT_FDIV;
+ break;
+ case Intrinsic::experimental_constrained_frem:
+ Opcode = ISD::STRICT_FREM;
+ break;
+ case Intrinsic::experimental_constrained_sqrt:
+ Opcode = ISD::STRICT_FSQRT;
+ break;
+ case Intrinsic::experimental_constrained_pow:
+ Opcode = ISD::STRICT_FPOW;
+ break;
+ case Intrinsic::experimental_constrained_powi:
+ Opcode = ISD::STRICT_FPOWI;
+ break;
+ case Intrinsic::experimental_constrained_sin:
+ Opcode = ISD::STRICT_FSIN;
+ break;
+ case Intrinsic::experimental_constrained_cos:
+ Opcode = ISD::STRICT_FCOS;
+ break;
+ case Intrinsic::experimental_constrained_exp:
+ Opcode = ISD::STRICT_FEXP;
+ break;
+ case Intrinsic::experimental_constrained_exp2:
+ Opcode = ISD::STRICT_FEXP2;
+ break;
+ case Intrinsic::experimental_constrained_log:
+ Opcode = ISD::STRICT_FLOG;
+ break;
+ case Intrinsic::experimental_constrained_log10:
+ Opcode = ISD::STRICT_FLOG10;
+ break;
+ case Intrinsic::experimental_constrained_log2:
+ Opcode = ISD::STRICT_FLOG2;
+ break;
+ case Intrinsic::experimental_constrained_rint:
+ Opcode = ISD::STRICT_FRINT;
+ break;
+ case Intrinsic::experimental_constrained_nearbyint:
+ Opcode = ISD::STRICT_FNEARBYINT;
+ break;
+ }
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Chain = getRoot();
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs);
+ ValueVTs.push_back(MVT::Other); // Out chain
+
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+ SDValue Result;
+ if (FPI.isUnaryOp())
+ Result = DAG.getNode(Opcode, sdl, VTs,
+ { Chain, getValue(FPI.getArgOperand(0)) });
+ else
+ Result = DAG.getNode(Opcode, sdl, VTs,
+ { Chain, getValue(FPI.getArgOperand(0)),
+ getValue(FPI.getArgOperand(1)) });
+
+ assert(Result.getNode()->getNumValues() == 2);
+ SDValue OutChain = Result.getValue(1);
+ DAG.setRoot(OutChain);
+ SDValue FPResult = Result.getValue(0);
+ setValue(&FPI, FPResult);
+}
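// A minimal sketch of the node shape built above for a binary constrained
// op (ResultVT, LHS, and RHS are illustrative assumptions): the chain is
// threaded through the FP node so rounding-mode and exception ordering is
// preserved.
//
//   SDVTList VTs = DAG.getVTList(ResultVT, MVT::Other);  // value + chain
//   SDValue R = DAG.getNode(ISD::STRICT_FADD, sdl, VTs, {Chain, LHS, RHS});
//   SDValue FPRes = R.getValue(0);     // the FP result
//   SDValue OutChain = R.getValue(1);  // ordering token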
+
std::pair<SDValue, SDValue>
SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
const BasicBlock *EHPadBB) {
@@ -5827,7 +6099,6 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
Type *RetTy = CS.getType();
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
Args.reserve(CS.arg_size());
const Value *SwiftErrorVal = nullptr;
@@ -5843,6 +6114,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
i != e; ++i) {
+ TargetLowering::ArgListEntry Entry;
const Value *V = *i;
// Skip empty types
@@ -5852,24 +6124,25 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
SDValue ArgNode = getValue(V);
Entry.Node = ArgNode; Entry.Ty = V->getType();
- // Skip the first return-type Attribute to get to params.
- Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+ Entry.setAttributes(&CS, i - CS.arg_begin());
// Use swifterror virtual register as input to the call.
- if (Entry.isSwiftError && TLI.supportSwiftError()) {
+ if (Entry.IsSwiftError && TLI.supportSwiftError()) {
SwiftErrorVal = V;
// We find the virtual register for the actual swifterror argument.
// Instead of using the Value, we use the virtual register instead.
- Entry.Node =
- DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVReg(FuncInfo.MBB, V),
- EVT(TLI.getPointerTy(DL)));
+ Entry.Node = DAG.getRegister(FuncInfo
+ .getOrCreateSwiftErrorVRegUseAt(
+ CS.getInstruction(), FuncInfo.MBB, V)
+ .first,
+ EVT(TLI.getPointerTy(DL)));
}
Args.push_back(Entry);
// If we have an explicit sret argument that is an Instruction, (i.e., it
// might point to function-local memory), we can't meaningfully tail-call.
- if (Entry.isSRet && isa<Instruction>(V))
+ if (Entry.IsSRet && isa<Instruction>(V))
isTailCall = false;
}
@@ -5903,38 +6176,29 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
if (SwiftErrorVal && TLI.supportSwiftError()) {
// Get the last element of InVals.
SDValue Src = CLI.InVals.back();
- const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
- unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+ unsigned VReg; bool CreatedVReg;
+ std::tie(VReg, CreatedVReg) =
+ FuncInfo.getOrCreateSwiftErrorVRegDefAt(CS.getInstruction());
SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
// We update the virtual register for the actual swifterror argument.
- FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg);
+ if (CreatedVReg)
+ FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg);
DAG.setRoot(CopyNode);
}
}
-/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
-/// value is equal or not-equal to zero.
-static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
- for (const User *U : V->users()) {
- if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (IC->isEquality())
- if (const Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
- if (C->isNullValue())
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
- Type *LoadTy,
SelectionDAGBuilder &Builder) {
// Check to see if this load can be trivially constant folded, e.g. if the
// input is from a string literal.
if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
// Cast pointer to the type we really want to load.
+ Type *LoadTy =
+ Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+ if (LoadVT.isVector())
+ LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
PointerType::getUnqual(LoadTy));
@@ -5949,7 +6213,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
bool ConstantMemory = false;
// Do not serialize (non-volatile) loads of constant memory with anything.
- if (Builder.AA->pointsToConstantMemory(PtrVal)) {
+ if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) {
Root = Builder.DAG.getEntryNode();
ConstantMemory = true;
} else {
@@ -5967,8 +6231,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
return LoadVal;
}
-/// processIntegerCallValue - Record the value for an instruction that
-/// produces an integer result, converting the type where necessary.
+/// Record the value for an instruction that produces an integer result,
+/// converting the type where necessary.
void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
SDValue Value,
bool IsSigned) {
@@ -5981,20 +6245,13 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
setValue(&I, Value);
}
-/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
-/// If so, return true and lower it, otherwise return false and it will be
-/// lowered like a normal call.
+/// See if we can lower a memcmp call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
- // Verify that the prototype makes sense. int memcmp(void*,void*,size_t)
- if (I.getNumArgOperands() != 3)
- return false;
-
const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
- if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() ||
- !I.getArgOperand(2)->getType()->isIntegerTy() ||
- !I.getType()->isIntegerTy())
- return false;
-
const Value *Size = I.getArgOperand(2);
const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
if (CSize && CSize->getZExtValue() == 0) {
@@ -6005,11 +6262,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
}
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
- std::pair<SDValue, SDValue> Res =
- TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
- getValue(LHS), getValue(RHS), getValue(Size),
- MachinePointerInfo(LHS),
- MachinePointerInfo(RHS));
+ std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
+ DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
+ getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
if (Res.first.getNode()) {
processIntegerCallValue(I, Res.first, true);
PendingLoads.push_back(Res.second);
@@ -6018,88 +6273,79 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
// memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
// memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
- if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
- bool ActuallyDoIt = true;
- MVT LoadVT;
- Type *LoadTy;
- switch (CSize->getZExtValue()) {
- default:
- LoadVT = MVT::Other;
- LoadTy = nullptr;
- ActuallyDoIt = false;
- break;
- case 2:
- LoadVT = MVT::i16;
- LoadTy = Type::getInt16Ty(CSize->getContext());
- break;
- case 4:
- LoadVT = MVT::i32;
- LoadTy = Type::getInt32Ty(CSize->getContext());
- break;
- case 8:
- LoadVT = MVT::i64;
- LoadTy = Type::getInt64Ty(CSize->getContext());
- break;
- /*
- case 16:
- LoadVT = MVT::v4i32;
- LoadTy = Type::getInt32Ty(CSize->getContext());
- LoadTy = VectorType::get(LoadTy, 4);
- break;
- */
- }
-
- // This turns into unaligned loads. We only do this if the target natively
- // supports the MVT we'll be loading or if it is small enough (<= 4) that
- // we'll only produce a small number of byte loads.
+ if (!CSize || !isOnlyUsedInZeroEqualityComparison(&I))
+ return false;
- // Require that we can find a legal MVT, and only do this if the target
- // supports unaligned loads of that type. Expanding into byte loads would
- // bloat the code.
+ // If the target has a fast compare for the given size, it will return a
+ // preferred load type for that size. Require that the load VT is legal and
+ // that the target supports unaligned loads of that type. Otherwise, return
+ // INVALID.
+ auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (ActuallyDoIt && CSize->getZExtValue() > 4) {
- unsigned DstAS = LHS->getType()->getPointerAddressSpace();
- unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+ MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+ if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
// TODO: Handle 5 byte compare as 4-byte + 1 byte.
// TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
// TODO: Check alignment of src and dest ptrs.
- if (!TLI.isTypeLegal(LoadVT) ||
- !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
- !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
- ActuallyDoIt = false;
+ unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+ unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+ if (!TLI.isTypeLegal(LVT) ||
+ !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+ !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+ LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
}
- if (ActuallyDoIt) {
- SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
- SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
+ return LVT;
+ };
- SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
- ISD::SETNE);
- processIntegerCallValue(I, Res, false);
- return true;
- }
+ // This turns into unaligned loads. We only do this if the target natively
+ // supports the MVT we'll be loading or if it is small enough (<= 4) that
+ // we'll only produce a small number of byte loads.
+ MVT LoadVT;
+ unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+ switch (NumBitsToCompare) {
+ default:
+ return false;
+ case 16:
+ LoadVT = MVT::i16;
+ break;
+ case 32:
+ LoadVT = MVT::i32;
+ break;
+ case 64:
+ case 128:
+ case 256:
+ LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+ break;
}
+ if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return false;
- return false;
+ SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+ SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+ // Bitcast to a wide integer type if the loads are vectors.
+ if (LoadVT.isVector()) {
+ EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+ LoadL = DAG.getBitcast(CmpVT, LoadL);
+ LoadR = DAG.getBitcast(CmpVT, LoadR);
+ }
+
+ SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+ processIntegerCallValue(I, Cmp, false);
+ return true;
}
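// A self-contained, source-level sketch (ours, not the patch's) of the
// rewrite visitMemCmpCall performs when the result only feeds an ==/!= 0
// test. For the 128/256-bit cases the loads are vectors that get bitcast
// to one wide integer before the SETNE.
#include <cstdint>
#include <cstring>

bool differs_libcall(const void *a, const void *b) {
  return std::memcmp(a, b, 4) != 0; // what the IR originally says
}

bool differs_expanded(const void *a, const void *b) {
  std::uint32_t l, r;    // stands in for the two i32 loads, which may be
  std::memcpy(&l, a, 4); // misaligned when the target permits it
  std::memcpy(&r, b, 4);
  return l != r;         // the single SETNE
}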
-/// visitMemChrCall -- See if we can lower a memchr call into an optimized
-/// form. If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a memchr call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
- // Verify that the prototype makes sense. void *memchr(void *, int, size_t)
- if (I.getNumArgOperands() != 3)
- return false;
-
const Value *Src = I.getArgOperand(0);
const Value *Char = I.getArgOperand(1);
const Value *Length = I.getArgOperand(2);
- if (!Src->getType()->isPointerTy() ||
- !Char->getType()->isIntegerTy() ||
- !Length->getType()->isIntegerTy() ||
- !I.getType()->isPointerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6115,15 +6361,12 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
return false;
}
-///
-/// visitMemPCpyCall -- lower a mempcpy call as a memcpy followed by code to
-/// to adjust the dst pointer by the size of the copied memory.
+/// See if we can lower a mempcpy call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
-
- // Verify argument count: void *mempcpy(void *, const void *, size_t)
- if (I.getNumArgOperands() != 3)
- return false;
-
SDValue Dst = getValue(I.getArgOperand(0));
SDValue Src = getValue(I.getArgOperand(1));
SDValue Size = getValue(I.getArgOperand(2));
@@ -6158,19 +6401,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
return true;
}
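// What visitMemPCpyCall emits, expressed as plain C++ (a sketch, not the
// patch's code): a memcpy followed by an ADD that advances the returned
// destination pointer by the copy size.
#include <cstddef>
#include <cstring>

void *mempcpy_equivalent(void *dst, const void *src, std::size_t n) {
  std::memcpy(dst, src, n);
  return static_cast<char *>(dst) + n; // mempcpy returns dst + n
}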
-/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an
-/// optimized form. If so, return true and lower it, otherwise return false
-/// and it will be lowered like a normal call.
+/// See if we can lower a strcpy or stpcpy call into an optimized form. If
+/// so, return true and lower it; otherwise return false and it will be
+/// lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
- // Verify that the prototype makes sense. char *strcpy(char *, char *)
- if (I.getNumArgOperands() != 2)
- return false;
-
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
- if (!Arg0->getType()->isPointerTy() ||
- !Arg1->getType()->isPointerTy() ||
- !I.getType()->isPointerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6187,19 +6424,13 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
return false;
}
-/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form.
-/// If so, return true and lower it, otherwise return false and it will be
-/// lowered like a normal call.
+/// See if we can lower a strcmp call into an optimized form. If so, return
+/// true and lower it; otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
- // Verify that the prototype makes sense. int strcmp(void*,void*)
- if (I.getNumArgOperands() != 2)
- return false;
-
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
- if (!Arg0->getType()->isPointerTy() ||
- !Arg1->getType()->isPointerTy() ||
- !I.getType()->isIntegerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6216,17 +6447,13 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
return false;
}
-/// visitStrLenCall -- See if we can lower a strlen call into an optimized
-/// form. If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a strlen call into an optimized form. If so, return
+/// true and lower it; otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
- // Verify that the prototype makes sense. size_t strlen(char *)
- if (I.getNumArgOperands() != 1)
- return false;
-
const Value *Arg0 = I.getArgOperand(0);
- if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6241,19 +6468,13 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
return false;
}
-/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized
-/// form. If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a strnlen call into an optimized form. If so, return
+/// true and lower it; otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
- // Verify that the prototype makes sense. size_t strnlen(char *, size_t)
- if (I.getNumArgOperands() != 2)
- return false;
-
const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
- if (!Arg0->getType()->isPointerTy() ||
- !Arg1->getType()->isIntegerTy() ||
- !I.getType()->isIntegerTy())
- return false;
const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
std::pair<SDValue, SDValue> Res =
@@ -6269,16 +6490,15 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
return false;
}
-/// visitUnaryFloatCall - If a call instruction is a unary floating-point
-/// operation (as expected), translate it to an SDNode with the specified opcode
-/// and return true.
+/// See if we can lower a unary floating-point operation into an SDNode with
+/// the specified Opcode. If so, return true and lower it; otherwise return
+/// false and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
unsigned Opcode) {
- // Sanity check that it really is a unary floating-point call.
- if (I.getNumArgOperands() != 1 ||
- !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
- I.getType() != I.getArgOperand(0)->getType() ||
- !I.onlyReadsMemory())
+ // We already checked this call's prototype; verify it doesn't modify errno.
+ if (!I.onlyReadsMemory())
return false;
SDValue Tmp = getValue(I.getArgOperand(0));
@@ -6286,17 +6506,15 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
return true;
}
-/// visitBinaryFloatCall - If a call instruction is a binary floating-point
-/// operation (as expected), translate it to an SDNode with the specified opcode
-/// and return true.
+/// See if we can lower a binary floating-point operation into an SDNode with
+/// the specified Opcode. If so, return true and lower it. Otherwise return
+/// false, and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
unsigned Opcode) {
- // Sanity check that it really is a binary floating-point call.
- if (I.getNumArgOperands() != 2 ||
- !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
- I.getType() != I.getArgOperand(0)->getType() ||
- I.getType() != I.getArgOperand(1)->getType() ||
- !I.onlyReadsMemory())
+ // We already checked this call's prototype; verify it doesn't modify errno.
+ if (!I.onlyReadsMemory())
return false;
SDValue Tmp0 = getValue(I.getArgOperand(0));
@@ -6336,20 +6554,18 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Check for well-known libc/libm calls. If the function is internal, it
// can't be a library call. Don't do the check if marked as nobuiltin for
// some reason.
- LibFunc::Func Func;
+ LibFunc Func;
if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
- LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->getLibFunc(*F, Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
switch (Func) {
default: break;
- case LibFunc::copysign:
- case LibFunc::copysignf:
- case LibFunc::copysignl:
- if (I.getNumArgOperands() == 2 && // Basic sanity checks.
- I.getArgOperand(0)->getType()->isFloatingPointTy() &&
- I.getType() == I.getArgOperand(0)->getType() &&
- I.getType() == I.getArgOperand(1)->getType() &&
- I.onlyReadsMemory()) {
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
+ case LibFunc_copysignl:
+ // We already checked this call's prototype; verify it doesn't modify
+ // errno.
+ if (I.onlyReadsMemory()) {
SDValue LHS = getValue(I.getArgOperand(0));
SDValue RHS = getValue(I.getArgOperand(1));
setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
@@ -6357,122 +6573,122 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
return;
}
break;
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
if (visitUnaryFloatCall(I, ISD::FABS))
return;
break;
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
if (visitBinaryFloatCall(I, ISD::FMINNUM))
return;
break;
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
if (visitBinaryFloatCall(I, ISD::FMAXNUM))
return;
break;
- case LibFunc::sin:
- case LibFunc::sinf:
- case LibFunc::sinl:
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinl:
if (visitUnaryFloatCall(I, ISD::FSIN))
return;
break;
- case LibFunc::cos:
- case LibFunc::cosf:
- case LibFunc::cosl:
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosl:
if (visitUnaryFloatCall(I, ISD::FCOS))
return;
break;
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
- case LibFunc::sqrt_finite:
- case LibFunc::sqrtf_finite:
- case LibFunc::sqrtl_finite:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
+ case LibFunc_sqrt_finite:
+ case LibFunc_sqrtf_finite:
+ case LibFunc_sqrtl_finite:
if (visitUnaryFloatCall(I, ISD::FSQRT))
return;
break;
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
if (visitUnaryFloatCall(I, ISD::FFLOOR))
return;
break;
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
return;
break;
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
if (visitUnaryFloatCall(I, ISD::FCEIL))
return;
break;
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
if (visitUnaryFloatCall(I, ISD::FRINT))
return;
break;
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
if (visitUnaryFloatCall(I, ISD::FROUND))
return;
break;
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
if (visitUnaryFloatCall(I, ISD::FTRUNC))
return;
break;
- case LibFunc::log2:
- case LibFunc::log2f:
- case LibFunc::log2l:
+ case LibFunc_log2:
+ case LibFunc_log2f:
+ case LibFunc_log2l:
if (visitUnaryFloatCall(I, ISD::FLOG2))
return;
break;
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
if (visitUnaryFloatCall(I, ISD::FEXP2))
return;
break;
- case LibFunc::memcmp:
+ case LibFunc_memcmp:
if (visitMemCmpCall(I))
return;
break;
- case LibFunc::mempcpy:
+ case LibFunc_mempcpy:
if (visitMemPCpyCall(I))
return;
break;
- case LibFunc::memchr:
+ case LibFunc_memchr:
if (visitMemChrCall(I))
return;
break;
- case LibFunc::strcpy:
+ case LibFunc_strcpy:
if (visitStrCpyCall(I, false))
return;
break;
- case LibFunc::stpcpy:
+ case LibFunc_stpcpy:
if (visitStrCpyCall(I, true))
return;
break;
- case LibFunc::strcmp:
+ case LibFunc_strcmp:
if (visitStrCmpCall(I))
return;
break;
- case LibFunc::strlen:
+ case LibFunc_strlen:
if (visitStrLenCall(I))
return;
break;
- case LibFunc::strnlen:
+ case LibFunc_strnlen:
if (visitStrNLenCall(I))
return;
break;
@@ -6648,7 +6864,7 @@ static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
unsigned Align = DL.getPrefTypeAlignment(Ty);
MachineFunction &MF = DAG.getMachineFunction();
int SSFI = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
- SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy(DL));
+ SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
Chain = DAG.getStore(Chain, Location, OpInfo.CallOperand, StackSlot,
MachinePointerInfo::getFixedStack(MF, SSFI));
OpInfo.CallOperand = StackSlot;
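// Not from the patch: getFrameIndexTy(DL) replaces getPointerTy(DL)
// wherever a frame index is materialized, so a target whose stack lives in
// a non-default address space (and thus uses a different pointer width)
// gets the right type. The same substitution appears in the stackmap
// live-variable and demoted-sret paths below.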
@@ -6671,12 +6887,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<unsigned, 4> Regs;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
// If this is a constraint for a single physreg, or a constraint for a
// register class, find it.
std::pair<unsigned, const TargetRegisterClass *> PhysReg =
- TLI.getRegForInlineAsmConstraint(MF.getSubtarget().getRegisterInfo(),
- OpInfo.ConstraintCode,
+ TLI.getRegForInlineAsmConstraint(&TRI, OpInfo.ConstraintCode,
OpInfo.ConstraintVT);
unsigned NumRegs = 1;
@@ -6684,12 +6900,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
// If this is an FP input in an integer register (or vice versa) insert a bit
// cast of the input value. More generally, handle any case where the input
// value disagrees with the register class we plan to stick this in.
- if (OpInfo.Type == InlineAsm::isInput &&
- PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) {
+ if (OpInfo.Type == InlineAsm::isInput && PhysReg.second &&
+ !TRI.isTypeLegalForClass(*PhysReg.second, OpInfo.ConstraintVT)) {
// Try to convert to the first EVT that the reg class contains. If the
// types are identical size, use a bitcast to convert (e.g. two differing
// vector types).
- MVT RegVT = *PhysReg.second->vt_begin();
+ MVT RegVT = *TRI.legalclasstypes_begin(*PhysReg.second);
if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) {
OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
RegVT, OpInfo.CallOperand);
@@ -6717,12 +6933,12 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
if (unsigned AssignedReg = PhysReg.first) {
const TargetRegisterClass *RC = PhysReg.second;
if (OpInfo.ConstraintVT == MVT::Other)
- ValueVT = *RC->vt_begin();
+ ValueVT = *TRI.legalclasstypes_begin(*RC);
// Get the actual register value type. This is important, because the user
// may have asked for (e.g.) the AX register in i32 type. We need to
// remember that AX is actually i16 to get the right extension.
- RegVT = *RC->vt_begin();
+ RegVT = *TRI.legalclasstypes_begin(*RC);
// This is an explicit reference to a physical register.
Regs.push_back(AssignedReg);
@@ -6748,7 +6964,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
// Otherwise, if this was a reference to an LLVM register class, create vregs
// for this reference.
if (const TargetRegisterClass *RC = PhysReg.second) {
- RegVT = *RC->vt_begin();
+ RegVT = *TRI.legalclasstypes_begin(*RC);
if (OpInfo.ConstraintVT == MVT::Other)
ValueVT = RegVT;
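// Not from the patch: the register-class value-type queries moved from
// TargetRegisterClass to TargetRegisterInfo in this import.
// RC->vt_begin() becomes TRI.legalclasstypes_begin(*RC) and
// RC->hasType(VT) becomes TRI.isTypeLegalForClass(*RC, VT), which is why a
// TRI reference is now threaded through GetRegistersForValue.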
@@ -7085,8 +7301,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
SDLoc dl = getCurSDLoc();
// Use the produced MatchedRegs object to copy the input operand into the
// registers it names.
- MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl,
- Chain, &Flag, CS.getInstruction());
+ MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
+ CS.getInstruction());
MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
true, OpInfo.getMatchedOperand(), dl,
DAG, AsmNodeOperands);
@@ -7361,7 +7577,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
// Populate the argument list.
// Attributes for args start at offset 1, after the return attribute.
- for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs;
ArgI != ArgE; ++ArgI) {
const Value *V = CS->getOperand(ArgI);
@@ -7370,7 +7586,7 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
TargetLowering::ArgListEntry Entry;
Entry.Node = getValue(V);
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, AttrI);
+ Entry.setAttributes(&CS, ArgIdx);
Args.push_back(Entry);
}
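// Not from the patch: this is the same AttributeList migration as in
// LowerCallTo above. The separate AttrI counter, which offset the slot by
// one to skip the return-value entry, is gone now that attributes are
// addressed by zero-based argument position.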
@@ -7411,7 +7627,7 @@ static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
} else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(OpVal)) {
const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
Ops.push_back(Builder.DAG.getTargetFrameIndex(
- FI->getIndex(), TLI.getPointerTy(Builder.DAG.getDataLayout())));
+ FI->getIndex(), TLI.getFrameIndexTy(Builder.DAG.getDataLayout())));
} else
Ops.push_back(OpVal);
}
@@ -7437,11 +7653,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
// have to worry about calling conventions and target specific lowering code.
// Instead we perform the call lowering right here.
//
- // chain, flag = CALLSEQ_START(chain, 0)
+ // chain, flag = CALLSEQ_START(chain, 0, 0)
// chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
// chain, flag = CALLSEQ_END(chain, 0, 0, flag)
//
- Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL);
+ Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
InFlag = Chain.getValue(1);
// Add the <id> and <numBytes> constants.
@@ -7631,9 +7847,79 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
-/// Returns an AttributeSet representing the attributes applied to the return
+void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
+ unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2;
+ if (I.getNumArgOperands() > 1)
+ Op2 = getValue(I.getArgOperand(1));
+ SDLoc dl = getCurSDLoc();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Res;
+ FastMathFlags FMF;
+ if (isa<FPMathOperator>(I))
+ FMF = I.getFastMathFlags();
+ SDNodeFlags SDFlags;
+ SDFlags.setNoNaNs(FMF.noNaNs());
+
+ switch (Intrinsic) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_mul:
+ Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_and:
+ Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_or:
+ Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_xor:
+ Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smax:
+ Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smin:
+ Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umax:
+ Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umin:
+ Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmax: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+ break;
+ }
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled vector reduce intrinsic");
+ }
+ setValue(&I, Res);
+}
+
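// The reduction semantics being lowered above, written out in portable C++
// as a sketch with our own names: integer reductions may reassociate
// freely, while the strict FADD/FMUL forms keep sequential order and
// consume the scalar start value passed as Op1.
#include <numeric>
#include <vector>

int reduce_add(const std::vector<int> &V) {
  return std::accumulate(V.begin(), V.end(), 0); // VECREDUCE_ADD
}

float reduce_fadd_strict(float Start, const std::vector<float> &V) {
  float Acc = Start; // Op1: the explicit accumulator
  for (float X : V)
    Acc += X;        // in order, as VECREDUCE_STRICT_FADD requires
  return Acc;
}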
+/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
-static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
+static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
SmallVector<Attribute::AttrKind, 2> Attrs;
if (CLI.RetSExt)
Attrs.push_back(Attribute::SExt);
@@ -7642,8 +7928,8 @@ static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
if (CLI.IsInReg)
Attrs.push_back(Attribute::InReg);
- return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
- Attrs);
+ return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+ Attrs);
}
/// TargetLowering::LowerCallTo - This is the default LowerCallTo
@@ -7660,6 +7946,22 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
auto &DL = CLI.DAG.getDataLayout();
ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);
+ if (CLI.IsPostTypeLegalization) {
+ // If we are lowering a libcall after legalization, split the return type.
+ SmallVector<EVT, 4> OldRetTys = std::move(RetTys);
+ SmallVector<uint64_t, 4> OldOffsets = std::move(Offsets);
+ for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) {
+ EVT RetVT = OldRetTys[i];
+ uint64_t Offset = OldOffsets[i];
+ MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT);
+ unsigned RegisterVTSize = RegisterVT.getSizeInBits();
+ RetTys.append(NumRegs, RegisterVT);
+ for (unsigned j = 0; j != NumRegs; ++j)
+ Offsets.push_back(Offset + j * RegisterVTSize);
+ }
+ }
+
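// A worked instance of the splitting loop above (ours, not the patch's):
// returning i64 from a libcall on a 32-bit target, where
// getRegisterType(i64) == i32 and getNumRegisters(i64) == 2, rewrites
// RetTys = {i64}, Offsets = {0} into RetTys = {i32, i32},
// Offsets = {0, 32}, the second offset advanced by
// RegisterVT.getSizeInBits() as the code does.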
SmallVector<ISD::OutputArg, 4> Outs;
GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL);
@@ -7679,19 +7981,19 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy);
- DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy(DL));
+ DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL));
ArgListEntry Entry;
Entry.Node = DemoteStackSlot;
Entry.Ty = StackSlotPtrType;
- Entry.isSExt = false;
- Entry.isZExt = false;
- Entry.isInReg = false;
- Entry.isSRet = true;
- Entry.isNest = false;
- Entry.isByVal = false;
- Entry.isReturned = false;
- Entry.isSwiftSelf = false;
- Entry.isSwiftError = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Entry.IsInReg = false;
+ Entry.IsSRet = true;
+ Entry.IsNest = false;
+ Entry.IsByVal = false;
+ Entry.IsReturned = false;
+ Entry.IsSwiftSelf = false;
+ Entry.IsSwiftError = false;
Entry.Alignment = Align;
CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
@@ -7702,8 +8004,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
} else {
for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
EVT VT = RetTys[I];
- MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
- unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+ MVT RegisterVT =
+ getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
for (unsigned i = 0; i != NumRegs; ++i) {
ISD::InputArg MyFlags;
MyFlags.VT = RegisterVT;
@@ -7724,7 +8028,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
ArgListTy &Args = CLI.getArgs();
if (supportSwiftError()) {
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
- if (Args[i].isSwiftError) {
+ if (Args[i].IsSwiftError) {
ISD::InputArg MyFlags;
MyFlags.VT = getPointerTy(DL);
MyFlags.ArgVT = EVT(getPointerTy(DL));
@@ -7740,8 +8044,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
+ // FIXME: Split arguments if CLI.IsPostTypeLegalization
Type *FinalType = Args[i].Ty;
- if (Args[i].isByVal)
+ if (Args[i].IsByVal)
FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
FinalType, CLI.CallConv, CLI.IsVarArg);
@@ -7752,13 +8057,17 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
SDValue Op = SDValue(Args[i].Node.getNode(),
Args[i].Node.getResNo() + Value);
ISD::ArgFlagsTy Flags;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- if (Args[i].isZExt)
+ // Certain targets (such as MIPS) may have a different ABI alignment
+ // for a type depending on the context. Give the target a chance to
+ // specify the alignment it wants.
+ unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL);
+
+ if (Args[i].IsZExt)
Flags.setZExt();
- if (Args[i].isSExt)
+ if (Args[i].IsSExt)
Flags.setSExt();
- if (Args[i].isInReg) {
+ if (Args[i].IsInReg) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (CLI.CallConv == CallingConv::X86_VectorCall &&
@@ -7771,15 +8080,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// Set InReg Flag
Flags.setInReg();
}
- if (Args[i].isSRet)
+ if (Args[i].IsSRet)
Flags.setSRet();
- if (Args[i].isSwiftSelf)
+ if (Args[i].IsSwiftSelf)
Flags.setSwiftSelf();
- if (Args[i].isSwiftError)
+ if (Args[i].IsSwiftError)
Flags.setSwiftError();
- if (Args[i].isByVal)
+ if (Args[i].IsByVal)
Flags.setByVal();
- if (Args[i].isInAlloca) {
+ if (Args[i].IsInAlloca) {
Flags.setInAlloca();
// Set the byval flag for CCAssignFn callbacks that don't know about
// inalloca. This way we can know how many bytes we should've allocated
@@ -7788,7 +8097,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// in the various CC lowering callbacks.
Flags.setByVal();
}
- if (Args[i].isByVal || Args[i].isInAlloca) {
+ if (Args[i].IsByVal || Args[i].IsInAlloca) {
PointerType *Ty = cast<PointerType>(Args[i].Ty);
Type *ElementTy = Ty->getElementType();
Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
@@ -7801,24 +8110,25 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
FrameAlign = getByValTypeAlignment(ElementTy, DL);
Flags.setByValAlign(FrameAlign);
}
- if (Args[i].isNest)
+ if (Args[i].IsNest)
Flags.setNest();
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
Flags.setOrigAlign(OriginalAlignment);
- MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
- unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT);
+ MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
+ unsigned NumParts =
+ getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
SmallVector<SDValue, 4> Parts(NumParts);
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (Args[i].isSExt)
+ if (Args[i].IsSExt)
ExtendKind = ISD::SIGN_EXTEND;
- else if (Args[i].isZExt)
+ else if (Args[i].IsZExt)
ExtendKind = ISD::ZERO_EXTEND;
// Conservatively only handle 'returned' on non-vectors for now
- if (Args[i].isReturned && !Op.getValueType().isVector()) {
+ if (Args[i].IsReturned && !Op.getValueType().isVector()) {
assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues &&
"unexpected use of 'returned'");
// Before passing 'returned' to the target lowering code, ensure that
@@ -7832,13 +8142,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// parameter extension method is not compatible with the return
// extension method
if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
- (ExtendKind != ISD::ANY_EXTEND &&
- CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt))
- Flags.setReturned();
+ (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt &&
+ CLI.RetZExt == Args[i].IsZExt))
+ Flags.setReturned();
}
getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
- CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind);
+ CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind,
+ true);
for (unsigned j = 0; j != NumParts; ++j) {
// if it isn't first piece, alignment must be 1
@@ -7916,7 +8227,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
for (unsigned i = 0; i < NumValues; ++i) {
SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
CLI.DAG.getConstant(Offsets[i], CLI.DL,
- PtrVT), &Flags);
+ PtrVT), Flags);
SDValue L = CLI.DAG.getLoad(
RetTys[i], CLI.DL, CLI.Chain, Add,
MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
@@ -7938,12 +8249,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
unsigned CurReg = 0;
for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
EVT VT = RetTys[I];
- MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
- unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+ MVT RegisterVT =
+ getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs =
+ getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
NumRegs, RegisterVT, VT, nullptr,
- AssertOp));
+ AssertOp, true));
CurReg += NumRegs;
}
@@ -7979,8 +8292,11 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // If this is an InlineAsm, we have to match the registers required, not the
+ // notional registers required by the type.
+
RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
- V->getType());
+ V->getType(), isABIRegCopy(V));
SDValue Chain = DAG.getEntryNode();
ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
@@ -8010,6 +8326,173 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
return true;
}
+typedef DenseMap<const Argument *,
+ std::pair<const AllocaInst *, const StoreInst *>>
+ ArgCopyElisionMapTy;
+
+/// Scan the entry block of the function in FuncInfo for arguments that look
+/// like copies into a local alloca. Record any copied arguments in
+/// ArgCopyElisionCandidates.
+static void
+findArgumentCopyElisionCandidates(const DataLayout &DL,
+ FunctionLoweringInfo *FuncInfo,
+ ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
+ // Record the state of every static alloca used in the entry block. Argument
+ // allocas are all used in the entry block, so we need approximately as many
+ // entries as we have arguments.
+ enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
+ SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
+ unsigned NumArgs = FuncInfo->Fn->arg_size();
+ StaticAllocas.reserve(NumArgs * 2);
+
+ auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
+ if (!V)
+ return nullptr;
+ V = V->stripPointerCasts();
+ const auto *AI = dyn_cast<AllocaInst>(V);
+ if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
+ return nullptr;
+ auto Iter = StaticAllocas.insert({AI, Unknown});
+ return &Iter.first->second;
+ };
+
+ // Look for stores of arguments to static allocas. Look through bitcasts and
+ // GEPs to handle type coercions, as long as the alloca is fully initialized
+ // by the store. Any non-store use of an alloca escapes it and any subsequent
+ // unanalyzed store might write it.
+ // FIXME: Handle structs initialized with multiple stores.
+ for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
+ // Look for stores, and handle non-store uses conservatively.
+ const auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI) {
+ // We will look through cast uses, so ignore them completely.
+ if (I.isCast())
+ continue;
+ // Ignore debug info intrinsics, they don't escape or store to allocas.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ // This is an unknown instruction. Assume it escapes or writes to all
+ // static alloca operands.
+ for (const Use &U : I.operands()) {
+ if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
+ *Info = StaticAllocaInfo::Clobbered;
+ }
+ continue;
+ }
+
+ // If the stored value is a static alloca, mark it as escaped.
+ if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
+ *Info = StaticAllocaInfo::Clobbered;
+
+ // Check if the destination is a static alloca.
+ const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
+ StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
+ if (!Info)
+ continue;
+ const AllocaInst *AI = cast<AllocaInst>(Dst);
+
+ // Skip allocas that have been initialized or clobbered.
+ if (*Info != StaticAllocaInfo::Unknown)
+ continue;
+
+ // Check if the stored value is an argument, and that this store fully
+ // initializes the alloca. Don't elide copies from the same argument twice.
+ const Value *Val = SI->getValueOperand()->stripPointerCasts();
+ const auto *Arg = dyn_cast<Argument>(Val);
+ if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
+ Arg->getType()->isEmptyTy() ||
+ DL.getTypeStoreSize(Arg->getType()) !=
+ DL.getTypeAllocSize(AI->getAllocatedType()) ||
+ ArgCopyElisionCandidates.count(Arg)) {
+ *Info = StaticAllocaInfo::Clobbered;
+ continue;
+ }
+
+ DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n');
+
+ // Mark this alloca and store for argument copy elision.
+ *Info = StaticAllocaInfo::Elidable;
+ ArgCopyElisionCandidates.insert({Arg, {AI, SI}});
+
+ // Stop scanning if we've seen all arguments. This will happen early in -O0
+ // builds, which is useful, because -O0 builds have large entry blocks and
+ // many allocas.
+ if (ArgCopyElisionCandidates.size() == NumArgs)
+ break;
+ }
+}
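// The source-level shape this scan recognizes, with made-up names: an
// argument copied into a local in the entry block, where a single store
// fully initializes the alloca. If the target already passed the argument
// in a fixed stack object, the alloca can reuse that slot and the copy
// disappears.
struct Big { long v[4]; };

long firstField(Big s) { // 's' may arrive in a fixed stack object
  Big local = s;         // entry-block copy: the elision candidate
  return local.v[0];     // later uses read through the alloca
}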
+
+/// Try to elide argument copies from memory into a local alloca. Succeeds if
+/// ArgVal is a load from a suitable fixed stack object.
+static void tryToElideArgumentCopy(
+ FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
+ DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
+ SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
+ ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
+ SDValue ArgVal, bool &ArgHasUses) {
+ // Check if this is a load from a fixed stack object.
+ auto *LNode = dyn_cast<LoadSDNode>(ArgVal);
+ if (!LNode)
+ return;
+ auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
+ if (!FINode)
+ return;
+
+ // Check that the fixed stack object is the right size and alignment.
+ // Look at the alignment that the user wrote on the alloca instead of looking
+ // at the stack object.
+ auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
+ assert(ArgCopyIter != ArgCopyElisionCandidates.end());
+ const AllocaInst *AI = ArgCopyIter->second.first;
+ int FixedIndex = FINode->getIndex();
+ int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
+ int OldIndex = AllocaIndex;
+ MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
+ if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
+ DEBUG(dbgs() << " argument copy elision failed due to bad fixed stack "
+ "object size\n");
+ return;
+ }
+ unsigned RequiredAlignment = AI->getAlignment();
+ if (!RequiredAlignment) {
+ RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
+ AI->getAllocatedType());
+ }
+ if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
+ DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
+ "greater than stack argument alignment ("
+ << RequiredAlignment << " vs "
+ << MFI.getObjectAlignment(FixedIndex) << ")\n");
+ return;
+ }
+
+ // Perform the elision. Delete the old stack object and replace its only use
+ // in the variable info map. Mark the stack object as mutable.
+ DEBUG({
+ dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
+ << " Replacing frame index " << OldIndex << " with " << FixedIndex
+ << '\n';
+ });
+ MFI.RemoveStackObject(OldIndex);
+ MFI.setIsImmutableObjectIndex(FixedIndex, false);
+ AllocaIndex = FixedIndex;
+ ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
+ Chains.push_back(ArgVal.getValue(1));
+
+ // Avoid emitting code for the store implementing the copy.
+ const StoreInst *SI = ArgCopyIter->second.second;
+ ElidedArgCopyInstrs.insert(SI);
+
+ // Check for uses of the argument again so that we can avoid exporting ArgVal
+ // if it isn't used by anything other than the store.
+ for (const Value *U : Arg.users()) {
+ if (U != SI) {
+ ArgHasUses = true;
+ break;
+ }
+ }
+}
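// The two gates above in concrete terms (our numbers): for a 16-byte fixed
// stack object at align 8, an alloca requesting align 16 fails the
// alignment check (8 < 16) and the copy is kept; the same alloca at align 8
// or lower elides. Any size mismatch rejects the candidate outright.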
+
void SelectionDAGISel::LowerArguments(const Function &F) {
SelectionDAG &DAG = SDB->DAG;
SDLoc dl = SDB->getCurSDLoc();
@@ -8032,16 +8515,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
Ins.push_back(RetArg);
}
+ // Look for stores of arguments to static allocas. Mark such arguments with a
+ // flag to ask the target to give us the memory location of that argument if
+ // available.
+ ArgCopyElisionMapTy ArgCopyElisionCandidates;
+ findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);
+
// Set up the incoming argument description vector.
- unsigned Idx = 1;
- for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
- I != E; ++I, ++Idx) {
+ for (const Argument &Arg : F.args()) {
+ unsigned ArgNo = Arg.getArgNo();
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
- bool isArgValueUsed = !I->use_empty();
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
+ bool isArgValueUsed = !Arg.use_empty();
unsigned PartBase = 0;
- Type *FinalType = I->getType();
- if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))
+ Type *FinalType = Arg.getType();
+ if (Arg.hasAttribute(Attribute::ByVal))
FinalType = cast<PointerType>(FinalType)->getElementType();
bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
FinalType, F.getCallingConv(), F.isVarArg());
@@ -8050,17 +8538,22 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
EVT VT = ValueVTs[Value];
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
ISD::ArgFlagsTy Flags;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
+ // Certain targets (such as MIPS) may have a different ABI alignment
+ // for a type depending on the context. Give the target a chance to
+ // specify the alignment it wants.
+ unsigned OriginalAlignment =
+ TLI->getABIAlignmentForCallingConv(ArgTy, DL);
+
+ if (Arg.hasAttribute(Attribute::ZExt))
Flags.setZExt();
- if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
+ if (Arg.hasAttribute(Attribute::SExt))
Flags.setSExt();
- if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) {
+ if (Arg.hasAttribute(Attribute::InReg)) {
// If we are using vectorcall calling convention, a structure that is
// passed InReg - is surely an HVA
if (F.getCallingConv() == CallingConv::X86_VectorCall &&
- isa<StructType>(I->getType())) {
+ isa<StructType>(Arg.getType())) {
// The first value of a structure is marked
if (0 == Value)
Flags.setHvaStart();
@@ -8069,15 +8562,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Set InReg Flag
Flags.setInReg();
}
- if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
+ if (Arg.hasAttribute(Attribute::StructRet))
Flags.setSRet();
- if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))
+ if (Arg.hasAttribute(Attribute::SwiftSelf))
Flags.setSwiftSelf();
- if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftError))
+ if (Arg.hasAttribute(Attribute::SwiftError))
Flags.setSwiftError();
- if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))
+ if (Arg.hasAttribute(Attribute::ByVal))
Flags.setByVal();
- if (F.getAttributes().hasAttribute(Idx, Attribute::InAlloca)) {
+ if (Arg.hasAttribute(Attribute::InAlloca)) {
Flags.setInAlloca();
// Set the byval flag for CCAssignFn callbacks that don't know about
// inalloca. This way we can know how many bytes we should've allocated
@@ -8088,33 +8581,37 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
}
if (F.getCallingConv() == CallingConv::X86_INTR) {
// IA Interrupt passes frame (1st parameter) by value in the stack.
- if (Idx == 1)
+ if (ArgNo == 0)
Flags.setByVal();
}
if (Flags.isByVal() || Flags.isInAlloca()) {
- PointerType *Ty = cast<PointerType>(I->getType());
+ PointerType *Ty = cast<PointerType>(Arg.getType());
Type *ElementTy = Ty->getElementType();
Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
// For ByVal, alignment should be passed from FE. BE will guess if
// this info is not there but there are cases it cannot get right.
unsigned FrameAlign;
- if (F.getParamAlignment(Idx))
- FrameAlign = F.getParamAlignment(Idx);
+ if (Arg.getParamAlignment())
+ FrameAlign = Arg.getParamAlignment();
else
FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
Flags.setByValAlign(FrameAlign);
}
- if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
+ if (Arg.hasAttribute(Attribute::Nest))
Flags.setNest();
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
Flags.setOrigAlign(OriginalAlignment);
+ if (ArgCopyElisionCandidates.count(&Arg))
+ Flags.setCopyElisionCandidate();
- MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
- unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
+ MVT RegisterVT =
+ TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT);
+ unsigned NumRegs =
+ TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT);
for (unsigned i = 0; i != NumRegs; ++i) {
ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
- Idx-1, PartBase+i*RegisterVT.getStoreSize());
+ ArgNo, PartBase+i*RegisterVT.getStoreSize());
if (NumRegs > 1 && i == 0)
MyFlags.Flags.setSplit();
// if it isn't first piece, alignment must be 1
@@ -8155,7 +8652,6 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Set up the argument values.
unsigned i = 0;
- Idx = 1;
if (!FuncInfo->CanLowerReturn) {
// Create a virtual register for the sret pointer, and put in a copy
// from the sret argument into it.
@@ -8177,49 +8673,63 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
DAG.setRoot(NewRoot);
// i indexes lowered arguments. Bump it past the hidden sret argument.
- // Idx indexes LLVM arguments. Don't touch it.
++i;
}
- for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
- ++I, ++Idx) {
+ SmallVector<SDValue, 4> Chains;
+ DenseMap<int, int> ArgCopyElisionFrameIndexMap;
+ for (const Argument &Arg : F.args()) {
SmallVector<SDValue, 4> ArgValues;
SmallVector<EVT, 4> ValueVTs;
- ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ continue;
+
+ bool ArgHasUses = !Arg.use_empty();
+
+ // Elide the copying store if the target loaded this argument from a
+ // suitable fixed stack object.
+ if (Ins[i].Flags.isCopyElisionCandidate()) {
+ tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
+ ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
+ InVals[i], ArgHasUses);
+ }
// If this argument is unused then remember its value. It is used to generate
// debugging information.
bool isSwiftErrorArg =
TLI->supportSwiftError() &&
- F.getAttributes().hasAttribute(Idx, Attribute::SwiftError);
- if (I->use_empty() && NumValues && !isSwiftErrorArg) {
- SDB->setUnusedArgValue(&*I, InVals[i]);
+ Arg.hasAttribute(Attribute::SwiftError);
+ if (!ArgHasUses && !isSwiftErrorArg) {
+ SDB->setUnusedArgValue(&Arg, InVals[i]);
// Also remember any frame index for use in FastISel.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
- FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
}
for (unsigned Val = 0; Val != NumValues; ++Val) {
EVT VT = ValueVTs[Val];
- MVT PartVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
- unsigned NumParts = TLI->getNumRegisters(*CurDAG->getContext(), VT);
+ MVT PartVT =
+ TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT);
+ unsigned NumParts =
+ TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT);
// Even an apparent 'unused' swifterror argument needs to be returned. So
// we do generate a copy for it that can be used on return from the
// function.
- if (!I->use_empty() || isSwiftErrorArg) {
+ if (ArgHasUses || isSwiftErrorArg) {
Optional<ISD::NodeType> AssertOp;
- if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
+ if (Arg.hasAttribute(Attribute::SExt))
AssertOp = ISD::AssertSext;
- else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
+ else if (Arg.hasAttribute(Attribute::ZExt))
AssertOp = ISD::AssertZext;
- ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
- NumParts, PartVT, VT,
- nullptr, AssertOp));
+ ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
+ PartVT, VT, nullptr, AssertOp,
+ true));
}
i += NumParts;
@@ -8232,18 +8742,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Note down frame index.
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
- FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),
SDB->getCurSDLoc());
- SDB->setValue(&*I, Res);
+ SDB->setValue(&Arg, Res);
if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
if (LoadSDNode *LNode =
dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
if (FrameIndexSDNode *FI =
dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
- FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
}
// Update the SwiftErrorVRegDefMap.
@@ -8263,18 +8773,36 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// uses with vregs.
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- FuncInfo->ValueMap[&*I] = Reg;
+ FuncInfo->ValueMap[&Arg] = Reg;
continue;
}
}
- if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) {
- FuncInfo->InitializeRegForValue(&*I);
- SDB->CopyToExportRegsIfNeeded(&*I);
+ if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) {
+ FuncInfo->InitializeRegForValue(&Arg);
+ SDB->CopyToExportRegsIfNeeded(&Arg);
}
}
+ if (!Chains.empty()) {
+ Chains.push_back(NewRoot);
+ NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+ }
+
+ DAG.setRoot(NewRoot);
+
assert(i == InVals.size() && "Argument register count mismatch!");
+ // If any argument copy elisions occurred and we have debug info, update the
+ // stale frame indices used in the dbg.declare variable info table.
+ MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo =
+     MF->getVariableDbgInfo();
+ if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
+ for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
+ auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
+ if (I != ArgCopyElisionFrameIndexMap.end())
+ VI.Slot = I->second;
+ }
+ }
+
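// A minimal sketch of the remapping loop above, assuming frame indices are
// plain ints: any dbg.declare record still naming an elided alloca's old
// index is redirected to the fixed argument slot.
#include <map>

static int remapSlot(const std::map<int, int> &ElisionMap, int Slot) {
  auto It = ElisionMap.find(Slot);
  return It == ElisionMap.end() ? Slot : It->second;
}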
// Finally, if the target has anything special to do, allow it to do so.
EmitFunctionEntryCode();
}
@@ -8402,13 +8930,10 @@ void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
HasTailCall = true;
}
-bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters,
- const SmallVectorImpl<unsigned> &TotalCases,
- unsigned First, unsigned Last,
- unsigned Density) const {
+uint64_t
+SelectionDAGBuilder::getJumpTableRange(const CaseClusterVector &Clusters,
+ unsigned First, unsigned Last) const {
assert(Last >= First);
- assert(TotalCases[Last] >= TotalCases[First]);
-
const APInt &LowCase = Clusters[First].Low->getValue();
const APInt &HighCase = Clusters[Last].High->getValue();
assert(LowCase.getBitWidth() == HighCase.getBitWidth());
@@ -8417,26 +8942,17 @@ bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters,
// comparison to lower. We should discriminate against such consecutive ranges
// in jump tables.
- uint64_t Diff = (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100);
- uint64_t Range = Diff + 1;
+ return (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100) + 1;
+}
+uint64_t SelectionDAGBuilder::getJumpTableNumCases(
+ const SmallVectorImpl<unsigned> &TotalCases, unsigned First,
+ unsigned Last) const {
+ assert(Last >= First);
+ assert(TotalCases[Last] >= TotalCases[First]);
uint64_t NumCases =
TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]);
-
- assert(NumCases < UINT64_MAX / 100);
- assert(Range >= NumCases);
-
- return NumCases * 100 >= Range * Density;
-}
-
-static inline bool areJTsAllowed(const TargetLowering &TLI,
- const SwitchInst *SI) {
- const Function *Fn = SI->getParent()->getParent();
- if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
- return false;
-
- return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
- TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
+ return NumCases;
}
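// The density test that moved behind TargetLowering::isSuitableForJumpTable,
// with worked numbers (density value assumed): clusters covering cases
// {0,1,2,3,8} give NumCases = 5 over Range = 9, so at 40% density the table
// qualifies (5 * 100 >= 9 * 40), while stretching the same five clusters
// over a range of 13 fails (5 * 100 < 13 * 40).
static bool denseEnough(unsigned long long NumCases,
                        unsigned long long Range, unsigned Density) {
  return NumCases * 100 >= Range * Density; // the old isDense predicate
}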
bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters,
@@ -8475,10 +8991,11 @@ bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters,
JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumDests = JTProbs.size();
- if (isSuitableForBitTests(NumDests, NumCmps,
- Clusters[First].Low->getValue(),
- Clusters[Last].High->getValue())) {
+ if (TLI.isSuitableForBitTests(
+ NumDests, NumCmps, Clusters[First].Low->getValue(),
+ Clusters[Last].High->getValue(), DAG.getDataLayout())) {
// Clusters[First..Last] should be lowered as bit tests instead.
return false;
}
@@ -8499,7 +9016,6 @@ bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters,
}
JumpTableMBB->normalizeSuccProbs();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding())
->createJumpTableIndex(Table);
@@ -8528,17 +9044,12 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
#endif
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!areJTsAllowed(TLI, SI))
+ if (!TLI.areJTsAllowed(SI->getParent()->getParent()))
return;
- const bool OptForSize = DefaultMBB->getParent()->getFunction()->optForSize();
-
const int64_t N = Clusters.size();
const unsigned MinJumpTableEntries = TLI.getMinimumJumpTableEntries();
const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2;
- const unsigned MaxJumpTableSize =
- OptForSize || TLI.getMaximumJumpTableSize() == 0
- ? UINT_MAX : TLI.getMaximumJumpTableSize();
if (N < 2 || N < MinJumpTableEntries)
return;
@@ -8553,15 +9064,12 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
TotalCases[i] += TotalCases[i - 1];
}
- const unsigned MinDensity =
- OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;
-
// Cheap case: the whole range may be suitable for jump table.
- unsigned JumpTableSize = (Clusters[N - 1].High->getValue() -
- Clusters[0].Low->getValue())
- .getLimitedValue(UINT_MAX - 1) + 1;
- if (JumpTableSize <= MaxJumpTableSize &&
- isDense(Clusters, TotalCases, 0, N - 1, MinDensity)) {
+ uint64_t Range = getJumpTableRange(Clusters, 0, N - 1);
+ uint64_t NumCases = getJumpTableNumCases(TotalCases, 0, N - 1);
+ assert(NumCases < UINT64_MAX / 100);
+ assert(Range >= NumCases);
+ if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
CaseCluster JTCluster;
if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
Clusters[0] = JTCluster;
@@ -8614,11 +9122,11 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
// Search for a solution that results in fewer partitions.
for (int64_t j = N - 1; j > i; j--) {
// Try building a partition from Clusters[i..j].
- JumpTableSize = (Clusters[j].High->getValue() -
- Clusters[i].Low->getValue())
- .getLimitedValue(UINT_MAX - 1) + 1;
- if (JumpTableSize <= MaxJumpTableSize &&
- isDense(Clusters, TotalCases, i, j, MinDensity)) {
+ uint64_t Range = getJumpTableRange(Clusters, i, j);
+ uint64_t NumCases = getJumpTableNumCases(TotalCases, i, j);
+ assert(NumCases < UINT64_MAX / 100);
+ assert(Range >= NumCases);
+ if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];
int64_t NumEntries = j - i + 1;
@@ -8662,36 +9170,6 @@ void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
Clusters.resize(DstIndex);
}
-bool SelectionDAGBuilder::rangeFitsInWord(const APInt &Low, const APInt &High) {
- // FIXME: Using the pointer type doesn't seem ideal.
- uint64_t BW = DAG.getDataLayout().getPointerSizeInBits();
- uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
- return Range <= BW;
-}
-
-bool SelectionDAGBuilder::isSuitableForBitTests(unsigned NumDests,
- unsigned NumCmps,
- const APInt &Low,
- const APInt &High) {
- // FIXME: I don't think NumCmps is the correct metric: a single case and a
- // range of cases both require only one branch to lower. Just looking at the
- // number of clusters and destinations should be enough to decide whether to
- // build bit tests.
-
- // To lower a range with bit tests, the range must fit the bitwidth of a
- // machine word.
- if (!rangeFitsInWord(Low, High))
- return false;
-
- // Decide whether it's profitable to lower this range with bit tests. Each
- // destination requires a bit test and branch, and there is an overall range
- // check branch. For a small number of clusters, separate comparisons might be
- // cheaper, and for many destinations, splitting the range might be better.
- return (NumDests == 1 && NumCmps >= 3) ||
- (NumDests == 2 && NumCmps >= 5) ||
- (NumDests == 3 && NumCmps >= 6);
-}
-
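
The bit-test heuristic deleted here moves into TargetLowering; a standalone sketch of the same logic, with the word size passed explicitly instead of being read from a DataLayout (names are illustrative, not the TLI API):

    #include <cstdint>
    #include <iostream>

    // Does [Low, High] fit in one machine word? Written as High - Low < WordBits
    // rather than High - Low + 1 <= WordBits to avoid overflow at the extremes.
    static bool rangeFitsInWord(uint64_t Low, uint64_t High, uint64_t WordBits) {
      return High - Low < WordBits;
    }

    static bool suitableForBitTests(unsigned NumDests, unsigned NumCmps,
                                    uint64_t Low, uint64_t High,
                                    uint64_t WordBits) {
      if (!rangeFitsInWord(Low, High, WordBits))
        return false;
      // Each destination costs one bit test and branch, plus one overall range
      // check; these thresholds are where that beats separate comparisons.
      return (NumDests == 1 && NumCmps >= 3) ||
             (NumDests == 2 && NumCmps >= 5) ||
             (NumDests == 3 && NumCmps >= 6);
    }

    int main() {
      std::cout << suitableForBitTests(3, 6, 0, 47, 64) << '\n';  // 1
      std::cout << suitableForBitTests(3, 6, 0, 200, 64) << '\n'; // 0: too wide
    }
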
bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
unsigned First, unsigned Last,
const SwitchInst *SI,
@@ -8713,16 +9191,17 @@ bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
APInt High = Clusters[Last].High->getValue();
assert(Low.slt(High));
- if (!isSuitableForBitTests(NumDests, NumCmps, Low, High))
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ if (!TLI.isSuitableForBitTests(NumDests, NumCmps, Low, High, DL))
return false;
APInt LowBound;
APInt CmpRange;
- const int BitWidth = DAG.getTargetLoweringInfo()
- .getPointerTy(DAG.getDataLayout())
- .getSizeInBits();
- assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!");
+ const int BitWidth = TLI.getPointerTy(DL).getSizeInBits();
+ assert(TLI.rangeFitsInWord(Low, High, DL) &&
+ "Case range must fit in bit mask!");
// Check if the clusters cover a contiguous range such that no value in the
// range will jump to the default statement.
@@ -8812,7 +9291,9 @@ void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters,
// If target does not have legal shift left, do not emit bit tests at all.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- EVT PTy = TLI.getPointerTy(DAG.getDataLayout());
+ const DataLayout &DL = DAG.getDataLayout();
+
+ EVT PTy = TLI.getPointerTy(DL);
if (!TLI.isOperationLegal(ISD::SHL, PTy))
return;
@@ -8843,8 +9324,8 @@ void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters,
// Try building a partition from Clusters[i..j].
// Check the range.
- if (!rangeFitsInWord(Clusters[i].Low->getValue(),
- Clusters[j].High->getValue()))
+ if (!TLI.rangeFitsInWord(Clusters[i].Low->getValue(),
+ Clusters[j].High->getValue(), DL))
continue;
// Check nbr of destinations and cluster types.
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index abde8a8..ac1d6aa 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -38,7 +38,6 @@ class BranchInst;
class CallInst;
class DbgValueInst;
class ExtractElementInst;
-class ExtractValueInst;
class FCmpInst;
class FPExtInst;
class FPToSIInst;
@@ -53,7 +52,6 @@ class IntToPtrInst;
class IndirectBrInst;
class InvokeInst;
class InsertElementInst;
-class InsertValueInst;
class Instruction;
class LoadInst;
class MachineBasicBlock;
@@ -304,10 +302,13 @@ private:
BranchProbability DefaultProb;
};
- /// Check whether a range of clusters is dense enough for a jump table.
- bool isDense(const CaseClusterVector &Clusters,
- const SmallVectorImpl<unsigned> &TotalCases,
- unsigned First, unsigned Last, unsigned MinDensity) const;
+ /// Return the range of values in [First..Last].
+ uint64_t getJumpTableRange(const CaseClusterVector &Clusters, unsigned First,
+ unsigned Last) const;
+
+ /// Return the number of cases in [First..Last].
+ uint64_t getJumpTableNumCases(const SmallVectorImpl<unsigned> &TotalCases,
+ unsigned First, unsigned Last) const;
/// Build a jump table cluster from Clusters[First..Last]. Returns false if it
/// decides it's not a good idea.
@@ -319,14 +320,6 @@ private:
void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI,
MachineBasicBlock *DefaultMBB);
- /// Check whether the range [Low,High] fits in a machine word.
- bool rangeFitsInWord(const APInt &Low, const APInt &High);
-
- /// Check whether these clusters are suitable for lowering with bit tests based
- /// on the number of destinations, comparison metric, and range.
- bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
- const APInt &Low, const APInt &High);
-
/// Build a bit test cluster from Clusters[First..Last]. Returns false if it
/// decides it's not a good idea.
bool buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last,
@@ -609,40 +602,34 @@ public:
SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
CodeGenOpt::Level ol)
: CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()),
- DAG(dag), FuncInfo(funcinfo),
+ DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo),
HasTailCall(false) {
}
- void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+ void init(GCFunctionInfo *gfi, AliasAnalysis *AA,
const TargetLibraryInfo *li);
- /// clear - Clear out the current SelectionDAG and the associated
- /// state and prepare this SelectionDAGBuilder object to be used
- /// for a new block. This doesn't clear out information about
- /// additional blocks that are needed to complete switch lowering
- /// or PHI node updating; that information is cleared out as it is
- /// consumed.
+ /// Clear out the current SelectionDAG and the associated state and prepare
+ /// this SelectionDAGBuilder object to be used for a new block. This doesn't
+ /// clear out information about additional blocks that are needed to complete
+ /// switch lowering or PHI node updating; that information is cleared out as
+ /// it is consumed.
void clear();
- /// clearDanglingDebugInfo - Clear the dangling debug information
- /// map. This function is separated from the clear so that debug
- /// information that is dangling in a basic block can be properly
- /// resolved in a different basic block. This allows the
- /// SelectionDAG to resolve dangling debug information attached
- /// to PHI nodes.
+ /// Clear the dangling debug information map. This function is separated from
+ /// the clear so that debug information that is dangling in a basic block can
+ /// be properly resolved in a different basic block. This allows the
+ /// SelectionDAG to resolve dangling debug information attached to PHI nodes.
void clearDanglingDebugInfo();
- /// getRoot - Return the current virtual root of the Selection DAG,
- /// flushing any PendingLoad items. This must be done before emitting
- /// a store or any other node that may need to be ordered after any
- /// prior load instructions.
- ///
+ /// Return the current virtual root of the Selection DAG, flushing any
+ /// PendingLoad items. This must be done before emitting a store or any other
+ /// node that may need to be ordered after any prior load instructions.
SDValue getRoot();
- /// getControlRoot - Similar to getRoot, but instead of flushing all the
- /// PendingLoad items, flush all the PendingExports items. It is necessary
- /// to do this before emitting a terminator instruction.
- ///
+ /// Similar to getRoot, but instead of flushing all the PendingLoad items,
+ /// flush all the PendingExports items. It is necessary to do this before
+ /// emitting a terminator instruction.
SDValue getControlRoot();
SDLoc getCurSDLoc() const {
@@ -688,12 +675,13 @@ public:
MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
Instruction::BinaryOps Opc, BranchProbability TW,
- BranchProbability FW);
+ BranchProbability FW, bool InvertCond);
void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
MachineBasicBlock *CurBB,
MachineBasicBlock *SwitchBB,
- BranchProbability TW, BranchProbability FW);
+ BranchProbability TW, BranchProbability FW,
+ bool InvertCond);
bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);
bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
void CopyToExportRegsIfNeeded(const Value *V);
@@ -782,6 +770,11 @@ public:
bool VarArgDisallowed,
bool ForceVoidReturnTy);
+ /// Returns the type of FrameIndex and TargetFrameIndex nodes.
+ MVT getFrameIndexTy() {
+ return DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout());
+ }
+
private:
// Terminator instructions.
void visitRet(const ReturnInst &I);
@@ -864,8 +857,8 @@ private:
void visitInsertElement(const User &I);
void visitShuffleVector(const User &I);
- void visitExtractValue(const ExtractValueInst &I);
- void visitInsertValue(const InsertValueInst &I);
+ void visitExtractValue(const User &I);
+ void visitInsertValue(const User &I);
void visitLandingPad(const LandingPadInst &I);
void visitGetElementPtr(const User &I);
@@ -900,6 +893,7 @@ private:
void visitInlineAsm(ImmutableCallSite CS);
const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
+ void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
void visitVAStart(const CallInst &I);
void visitVAArg(const VAArgInst &I);
@@ -913,6 +907,8 @@ private:
void visitGCRelocate(const GCRelocateInst &I);
void visitGCResult(const GCResultInst &I);
+ void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
}
@@ -932,7 +928,7 @@ private:
/// instruction selection, they will be inserted to the entry BB.
bool EmitFuncArgumentDbgValue(const Value *V, DILocalVariable *Variable,
DIExpression *Expr, DILocation *DL,
- int64_t Offset, bool IsIndirect,
+ int64_t Offset, bool IsDbgDeclare,
const SDValue &N);
/// Return the next block after MBB, or nullptr if there is none.
@@ -944,8 +940,8 @@ private:
/// Return the appropriate SDDbgValue based on N.
SDDbgValue *getDbgValue(SDValue N, DILocalVariable *Variable,
- DIExpression *Expr, int64_t Offset, DebugLoc dl,
- unsigned DbgSDNodeOrder);
+ DIExpression *Expr, int64_t Offset,
+ const DebugLoc &dl, unsigned DbgSDNodeOrder);
};
/// RegsForValue - This struct represents the registers (physical or virtual)
@@ -958,62 +954,69 @@ private:
/// type.
///
struct RegsForValue {
- /// ValueVTs - The value types of the values, which may not be legal, and
+ /// The value types of the values, which may not be legal, and
/// may need be promoted or synthesized from one or more registers.
- ///
SmallVector<EVT, 4> ValueVTs;
- /// RegVTs - The value types of the registers. This is the same size as
- /// ValueVTs and it records, for each value, what the type of the assigned
- /// register or registers are. (Individual values are never synthesized
- /// from more than one type of register.)
+ /// The value types of the registers. This is the same size as ValueVTs and it
+ /// records, for each value, what the type of the assigned register or
+ /// registers are. (Individual values are never synthesized from more than one
+ /// type of register.)
///
/// With virtual registers, the contents of RegVTs is redundant with TLI's
/// getRegisterType member function, however when with physical registers
/// it is necessary to have a separate record of the types.
- ///
SmallVector<MVT, 4> RegVTs;
- /// Regs - This list holds the registers assigned to the values.
+ /// This list holds the registers assigned to the values.
/// Each legal or promoted value requires one register, and each
/// expanded value requires multiple registers.
- ///
SmallVector<unsigned, 4> Regs;
+ /// This list holds the number of registers for each value.
+ SmallVector<unsigned, 4> RegCount;
+
+ /// Records if this value needs to be treated in an ABI-dependent manner,
+ /// different from normal type legalization.
+ bool IsABIMangled;
+
RegsForValue();
- RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt);
+ RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt,
+ bool IsABIMangledValue = false);
RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
- const DataLayout &DL, unsigned Reg, Type *Ty);
+ const DataLayout &DL, unsigned Reg, Type *Ty,
+ bool IsABIMangledValue = false);
- /// append - Add the specified values to this one.
+ /// Add the specified values to this one.
void append(const RegsForValue &RHS) {
ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+ RegCount.push_back(RHS.Regs.size());
}
- /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
- /// this value and returns the result as a ValueVTs value. This uses
- /// Chain/Flag as the input and updates them for the output Chain/Flag.
- /// If the Flag pointer is NULL, no flag is used.
+ /// Emit a series of CopyFromReg nodes that copies from this value and returns
+ /// the result as a ValueVTs value. This uses Chain/Flag as the input and
+ /// updates them for the output Chain/Flag. If the Flag pointer is NULL, no
+ /// flag is used.
SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
const SDLoc &dl, SDValue &Chain, SDValue *Flag,
const Value *V = nullptr) const;
- /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified
- /// value into the registers specified by this object. This uses Chain/Flag
- /// as the input and updates them for the output Chain/Flag. If the Flag
- /// pointer is nullptr, no flag is used. If V is not nullptr, then it is used
- /// in printing better diagnostic messages on error.
+ /// Emit a series of CopyToReg nodes that copies the specified value into the
+ /// registers specified by this object. This uses Chain/Flag as the input and
+ /// updates them for the output Chain/Flag. If the Flag pointer is nullptr, no
+ /// flag is used. If V is not nullptr, then it is used in printing better
+ /// diagnostic messages on error.
void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl,
SDValue &Chain, SDValue *Flag, const Value *V = nullptr,
ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
- /// AddInlineAsmOperands - Add this value to the specified inlineasm node
- /// operand list. This adds the code marker, matching input operand index
- /// (if applicable), and includes the number of values added into it.
+ /// Add this value to the specified inlineasm node operand list. This adds the
+ /// code marker, matching input operand index (if applicable), and includes
+ /// the number of values added into it.
void AddInlineAsmOperands(unsigned Kind, bool HasMatching,
unsigned MatchingIdx, const SDLoc &dl,
SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 0faaad8..3dd5897 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SelectionDAG.h"
#include "ScheduleDAGSDNodes.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
@@ -214,6 +214,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FPOWI: return "fpowi";
case ISD::SETCC: return "setcc";
case ISD::SETCCE: return "setcce";
+ case ISD::SETCCCARRY: return "setcccarry";
case ISD::SELECT: return "select";
case ISD::VSELECT: return "vselect";
case ISD::SELECT_CC: return "select_cc";
@@ -227,6 +228,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::CARRY_FALSE: return "carry_false";
case ISD::ADDC: return "addc";
case ISD::ADDE: return "adde";
+ case ISD::ADDCARRY: return "addcarry";
case ISD::SADDO: return "saddo";
case ISD::UADDO: return "uaddo";
case ISD::SSUBO: return "ssubo";
@@ -235,6 +237,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::UMULO: return "umulo";
case ISD::SUBC: return "subc";
case ISD::SUBE: return "sube";
+ case ISD::SUBCARRY: return "subcarry";
case ISD::SHL_PARTS: return "shl_parts";
case ISD::SRA_PARTS: return "sra_parts";
case ISD::SRL_PARTS: return "srl_parts";
@@ -300,6 +303,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset";
// Bit manipulation
+ case ISD::ABS: return "abs";
case ISD::BITREVERSE: return "bitreverse";
case ISD::BSWAP: return "bswap";
case ISD::CTPOP: return "ctpop";
@@ -343,6 +347,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SETFALSE: return "setfalse";
case ISD::SETFALSE2: return "setfalse2";
}
+ case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
+ case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
+ case ISD::VECREDUCE_ADD: return "vecreduce_add";
+ case ISD::VECREDUCE_MUL: return "vecreduce_mul";
+ case ISD::VECREDUCE_AND: return "vecreduce_and";
+ case ISD::VECREDUCE_OR: return "vecreduce_or";
+ case ISD::VECREDUCE_XOR: return "vecreduce_xor";
+ case ISD::VECREDUCE_SMAX: return "vecreduce_smax";
+ case ISD::VECREDUCE_SMIN: return "vecreduce_smin";
+ case ISD::VECREDUCE_UMAX: return "vecreduce_umax";
+ case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
+ case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
+ case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
}
}
@@ -366,11 +383,13 @@ static Printable PrintNodeId(const SDNode &Node) {
});
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); }
-void SDNode::dump(const SelectionDAG *G) const {
+LLVM_DUMP_METHOD void SDNode::dump(const SelectionDAG *G) const {
print(dbgs(), G);
dbgs() << '\n';
}
+#endif
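
The pattern applied throughout this file compiles the dump helpers only into builds with assertions enabled or with LLVM_ENABLE_DUMP defined. A small self-contained illustration of the same guard:

    #include <iostream>

    struct Node {
      int Value = 42;
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      void dump() const { std::cout << "Node(" << Value << ")\n"; }
    #endif
    };

    int main() {
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      Node().dump(); // present in asserts/dump builds, absent in plain release
    #endif
    }
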
void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
@@ -416,7 +435,7 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';
else {
OS << "<APFloat(";
- CSDN->getValueAPF().bitcastToAPInt().dump();
+ CSDN->getValueAPF().bitcastToAPInt().print(OS, false);
OS << ")>";
}
} else if (const GlobalAddressSDNode *GADN =
@@ -566,6 +585,7 @@ static bool shouldPrintInline(const SDNode &Node) {
return Node.getNumOperands() == 0;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
for (const SDValue &Op : N->op_values()) {
if (shouldPrintInline(*Op.getNode()))
@@ -592,6 +612,7 @@ LLVM_DUMP_METHOD void SelectionDAG::dump() const {
if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
dbgs() << "\n\n";
}
+#endif
void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
OS << PrintNodeId(*this) << ": ";
@@ -618,6 +639,7 @@ static bool printOperand(raw_ostream &OS, const SelectionDAG *G,
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet;
static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
const SelectionDAG *G, VisitedSDNodeSet &once) {
@@ -646,15 +668,16 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
DumpNodesr(OS, Op.getNode(), indent+2, G, once);
}
-void SDNode::dumpr() const {
+LLVM_DUMP_METHOD void SDNode::dumpr() const {
VisitedSDNodeSet once;
DumpNodesr(dbgs(), this, 0, nullptr, once);
}
-void SDNode::dumpr(const SelectionDAG *G) const {
+LLVM_DUMP_METHOD void SDNode::dumpr(const SelectionDAG *G) const {
VisitedSDNodeSet once;
DumpNodesr(dbgs(), this, 0, G, once);
}
+#endif
static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
const SelectionDAG *G, unsigned depth,
@@ -688,14 +711,17 @@ void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const {
printrWithDepth(OS, G, 10);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const {
printrWithDepth(dbgs(), G, depth);
}
-void SDNode::dumprFull(const SelectionDAG *G) const {
+LLVM_DUMP_METHOD void SDNode::dumprFull(const SelectionDAG *G) const {
// Don't print impossibly deep things.
dumprWithDepth(G, 10);
}
+#endif
void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
printr(OS, G);
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 64e6c22..bdf57e8 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1,4 +1,4 @@
-//===-- SelectionDAGISel.cpp - Implement the SelectionDAGISel class -------===//
+//===- SelectionDAGISel.cpp - Implement the SelectionDAGISel class --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,43 +11,73 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SelectionDAG.h"
#include "ScheduleDAGSDNodes.h"
#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
-#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/StackProtector.h"
-#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -59,6 +89,14 @@
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -73,104 +111,6 @@ STATISTIC(NumEntryBlocks, "Number of entry blocks encountered");
STATISTIC(NumFastIselFailLowerArguments,
"Number of entry blocks where fast isel failed to lower arguments");
-#ifndef NDEBUG
-static cl::opt<bool>
-EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden,
- cl::desc("Enable extra verbose messages in the \"fast\" "
- "instruction selector"));
-
- // Terminators
-STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret");
-STATISTIC(NumFastIselFailBr,"Fast isel fails on Br");
-STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch");
-STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr");
-STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke");
-STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume");
-STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable");
-
- // Standard binary operators...
-STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add");
-STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd");
-STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub");
-STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub");
-STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul");
-STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul");
-STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv");
-STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv");
-STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv");
-STATISTIC(NumFastIselFailURem,"Fast isel fails on URem");
-STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem");
-STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem");
-
- // Logical operators...
-STATISTIC(NumFastIselFailAnd,"Fast isel fails on And");
-STATISTIC(NumFastIselFailOr,"Fast isel fails on Or");
-STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor");
-
- // Memory instructions...
-STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca");
-STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load");
-STATISTIC(NumFastIselFailStore,"Fast isel fails on Store");
-STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg");
-STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM");
-STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence");
-STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr");
-
- // Convert instructions...
-STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc");
-STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt");
-STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt");
-STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc");
-STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt");
-STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI");
-STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI");
-STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP");
-STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP");
-STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr");
-STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt");
-STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast");
-
- // Other instructions...
-STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp");
-STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp");
-STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI");
-STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select");
-STATISTIC(NumFastIselFailCall,"Fast isel fails on Call");
-STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl");
-STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr");
-STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr");
-STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg");
-STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement");
-STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement");
-STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector");
-STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue");
-STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue");
-STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad");
-
-// Intrinsic instructions...
-STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call");
-STATISTIC(NumFastIselFailSAddWithOverflow,
- "Fast isel fails on sadd.with.overflow");
-STATISTIC(NumFastIselFailUAddWithOverflow,
- "Fast isel fails on uadd.with.overflow");
-STATISTIC(NumFastIselFailSSubWithOverflow,
- "Fast isel fails on ssub.with.overflow");
-STATISTIC(NumFastIselFailUSubWithOverflow,
- "Fast isel fails on usub.with.overflow");
-STATISTIC(NumFastIselFailSMulWithOverflow,
- "Fast isel fails on smul.with.overflow");
-STATISTIC(NumFastIselFailUMulWithOverflow,
- "Fast isel fails on umul.with.overflow");
-STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress");
-STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call");
-STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call");
-STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call");
-#endif
-
-static cl::opt<bool>
-EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
- cl::desc("Enable verbose messages in the \"fast\" "
- "instruction selector"));
static cl::opt<int> EnableFastISelAbort(
"fast-isel-abort", cl::Hidden,
cl::desc("Enable abort calls when \"fast\" instruction selection "
@@ -179,6 +119,11 @@ static cl::opt<int> EnableFastISelAbort(
"abort for argument lowering, and 3 will never fallback "
"to SelectionDAG."));
+static cl::opt<bool> EnableFastISelFallbackReport(
+ "fast-isel-report-on-fallback", cl::Hidden,
+ cl::desc("Emit a diagnostic when \"fast\" instruction selection "
+ "falls back to SelectionDAG."));
+
static cl::opt<bool>
UseMBPI("use-mbpi",
cl::desc("use Machine Branch Probability Info"),
@@ -238,7 +183,7 @@ MachinePassRegistry RegisterScheduler::Registry;
///
//===---------------------------------------------------------------------===//
static cl::opt<RegisterScheduler::FunctionPassCtor, false,
- RegisterPassParser<RegisterScheduler> >
+ RegisterPassParser<RegisterScheduler>>
ISHeuristic("pre-RA-sched",
cl::init(&createDefaultScheduler), cl::Hidden,
cl::desc("Instruction schedulers available (before register"
@@ -249,6 +194,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target",
createDefaultScheduler);
namespace llvm {
+
//===--------------------------------------------------------------------===//
/// \brief This class is used by SelectionDAGISel to temporarily override
/// the optimization level on a per-function basis.
@@ -318,6 +264,7 @@ namespace llvm {
"Unknown sched type!");
return createILPListDAGScheduler(IS, OptLevel);
}
+
} // end namespace llvm
// EmitInstrWithCustomInserter - This method should be implemented by targets
@@ -357,7 +304,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
FuncInfo(new FunctionLoweringInfo()),
CurDAG(new SelectionDAG(tm, OL)),
SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
- GFI(),
+ AA(), GFI(),
OptLevel(OL),
DAGSize(0) {
initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
@@ -375,7 +322,8 @@ SelectionDAGISel::~SelectionDAGISel() {
}
void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
+ if (OptLevel != CodeGenOpt::None)
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<GCModuleInfo>();
AU.addRequired<StackProtector>();
AU.addPreserved<StackProtector>();
@@ -389,11 +337,13 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
/// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value that
/// may trap on it. In this case we have to split the edge so that the path
/// through the predecessor block that doesn't go to the phi block doesn't
-/// execute the possibly trapping instruction.
-///
+/// execute the possibly trapping instruction. If available, we pass domtree
+/// and loop info to be updated when we split critical edges. This is because
+/// SelectionDAGISel preserves these analyses.
/// This is required for correctness, so it must be done at -O0.
///
-static void SplitCriticalSideEffectEdges(Function &Fn) {
+static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT,
+ LoopInfo *LI) {
// Loop for blocks with phi nodes.
for (BasicBlock &BB : Fn) {
PHINode *PN = dyn_cast<PHINode>(BB.begin());
@@ -419,7 +369,7 @@ static void SplitCriticalSideEffectEdges(Function &Fn) {
// Okay, we have to split this edge.
SplitCriticalEdge(
Pred->getTerminator(), GetSuccessorNumber(Pred, &BB),
- CriticalEdgeSplittingOptions().setMergeIdenticalEdges());
+ CriticalEdgeSplittingOptions(DT, LI).setMergeIdenticalEdges());
goto ReprocessBlock;
}
}
@@ -431,8 +381,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
MachineFunctionProperties::Property::Selected))
return false;
// Do some sanity-checking on the command-line options.
- assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) &&
- "-fast-isel-verbose requires -fast-isel");
assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
"-fast-isel-abort > 0 requires -fast-isel");
@@ -454,23 +402,37 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
+ ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
- SplitCriticalSideEffectEdges(const_cast<Function &>(Fn));
+ SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);
- CurDAG->init(*MF);
+ CurDAG->init(*MF, *ORE);
FuncInfo->set(Fn, *MF, CurDAG);
+ // Now get the optional analyses if we want to.
+ // This is based on the possibly changed OptLevel (after optnone is taken
+ // into account). That's unfortunate but OK because it just means we won't
+ // ask for passes that have been required anyway.
+
if (UseMBPI && OptLevel != CodeGenOpt::None)
FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
else
FuncInfo->BPI = nullptr;
- SDB->init(GFI, *AA, LibInfo);
+ if (OptLevel != CodeGenOpt::None)
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ else
+ AA = nullptr;
+
+ SDB->init(GFI, AA, LibInfo);
MF->setHasInlineAsm(false);
@@ -502,6 +464,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TLI->initializeSplitCSR(EntryMBB);
SelectAllBasicBlocks(Fn);
+ if (FastISelFailed && EnableFastISelFallbackReport) {
+ DiagnosticInfoISelFallback DiagFallback(Fn);
+ Fn.getContext().diagnose(DiagFallback);
+ }
// If the first basic block in the function has live ins that need to be
// copied into vregs, emit the copies into the top of the block before
@@ -628,7 +594,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
unsigned To = I->second;
// If To is also scheduled to be replaced, find what its ultimate
// replacement is.
- for (;;) {
+ while (true) {
DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
if (J == E) break;
To = J->second;
@@ -648,13 +614,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
MRI.replaceRegWith(From, To);
}
- if (TLI->hasCopyImplyingStackAdjustment(MF))
- MFI.setHasCopyImplyingStackAdjustment(true);
-
- // Freeze the set of reserved registers now that MachineFrameInfo has been
- // set up. All the information required by getReservedRegs() should be
- // available now.
- MRI.freezeReservedRegs(*MF);
+ TLI->finalizeLowering(*MF);
// Release function-specific state. SDB and CurDAG are already cleared
// at this point.
@@ -666,13 +626,30 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
return true;
}
+static void reportFastISelFailure(MachineFunction &MF,
+ OptimizationRemarkEmitter &ORE,
+ OptimizationRemarkMissed &R,
+ bool ShouldAbort) {
+ // Print the function name explicitly if we don't have a debug location (which
+ // makes the diagnostic less useful) or if we're going to emit a raw error.
+ if (!R.getLocation().isValid() || ShouldAbort)
+ R << (" (in function: " + MF.getName() + ")").str();
+
+ if (ShouldAbort)
+ report_fatal_error(R.getMsg());
+
+ ORE.emit(R);
+}
+
void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
BasicBlock::const_iterator End,
bool &HadTailCall) {
// Lower the instructions. If a call is emitted as a tail call, cease emitting
// nodes for this block.
- for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
- SDB->visit(*I);
+ for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
+ if (!ElidedArgCopyInstrs.count(&*I))
+ SDB->visit(*I);
+ }
// Make sure the root of the DAG is up-to-date.
CurDAG->setRoot(SDB->getControlRoot());
@@ -689,8 +666,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
Worklist.push_back(CurDAG->getRoot().getNode());
- APInt KnownZero;
- APInt KnownOne;
+ KnownBits Known;
do {
SDNode *N = Worklist.pop_back_val();
@@ -719,8 +695,8 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
continue;
unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
- CurDAG->computeKnownBits(Src, KnownZero, KnownOne);
- FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, KnownZero, KnownOne);
+ CurDAG->computeKnownBits(Src, Known);
+ FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, Known);
} while (!Worklist.empty());
}
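
The change above replaces the separate KnownZero/KnownOne masks with a single KnownBits value. A toy stand-in using a fixed 64-bit width (the real struct bundles two APInts) shows the shape of the combined result:

    #include <cstdint>
    #include <iostream>

    struct KnownBits {
      uint64_t Zero = 0; // bits proven to be 0
      uint64_t One = 0;  // bits proven to be 1
      bool isConstant() const { return (Zero | One) == ~uint64_t(0); }
    };

    int main() {
      KnownBits Known;
      Known.One = 0x1;             // low bit known set: the value is odd
      Known.Zero = ~uint64_t(0xF); // bits 4..63 known clear: the value is < 16
      std::cout << std::hex << Known.Zero << ' ' << Known.One << '\n';
      std::cout << Known.isConstant() << '\n'; // 0: bits 1..3 are still unknown
    }
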
@@ -731,6 +707,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
int BlockNumber = -1;
(void)BlockNumber;
bool MatchFilterBB = false; (void)MatchFilterBB;
+
+ // Pre-type legalization allows creation of any node types.
+ CurDAG->NewNodesMustHaveLegalTypes = false;
+
#ifndef NDEBUG
MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
FilterDAGBasicBlockName ==
@@ -756,7 +736,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine1", "DAG Combining 1", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber
@@ -777,6 +757,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
<< " '" << BlockName << "'\n"; CurDAG->dump());
+ // Only allow creation of legal node types.
CurDAG->NewNodesMustHaveLegalTypes = true;
if (Changed) {
@@ -787,12 +768,11 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lt", "DAG Combining after legalize types",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
<< " '" << BlockName << "'\n"; CurDAG->dump());
-
}
{
@@ -802,12 +782,18 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
}
if (Changed) {
+ DEBUG(dbgs() << "Vector-legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
{
NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName,
GroupDescription, TimePassesIsEnabled);
CurDAG->LegalizeTypes();
}
+ DEBUG(dbgs() << "Vector/type-legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
if (ViewDAGCombineLT && MatchFilterBB)
CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
@@ -815,7 +801,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors",
GroupName, GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#"
@@ -841,7 +827,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
{
NamedRegionTimer T("combine2", "DAG Combining 2", GroupName,
GroupDescription, TimePassesIsEnabled);
- CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel);
+ CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel);
}
DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber
@@ -907,10 +893,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
}
namespace {
+
/// ISelUpdater - helper class to handle updates of the instruction selection
/// graph.
class ISelUpdater : public SelectionDAG::DAGUpdateListener {
SelectionDAG::allnodes_iterator &ISelPosition;
+
public:
ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
: SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
@@ -923,6 +911,7 @@ public:
++ISelPosition;
}
};
+
} // end anonymous namespace
void SelectionDAGISel::DoInstructionSelection() {
@@ -960,6 +949,19 @@ void SelectionDAGISel::DoInstructionSelection() {
if (Node->use_empty())
continue;
+ // When we are using non-default rounding modes or FP exception behavior,
+ // FP operations are represented by StrictFP pseudo-operations. They
+ // need to be simplified here so that the target-specific instruction
+ // selectors know how to handle them.
+ //
+ // If the current node is a strict FP pseudo-op, the isStrictFPOp()
+ // function will provide the corresponding normal FP opcode to which the
+ // node should be mutated.
+ //
+ // FIXME: The backends need a way to handle FP constraints.
+ if (Node->isStrictFPOpcode())
+ Node = CurDAG->mutateStrictFPToFP(Node);
+
Select(Node);
}
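
A toy model of the strict-FP mutation above, with a hypothetical opcode enum standing in for the ISD opcodes: a STRICT_* pseudo-opcode is rewritten to the ordinary FP opcode before Select sees the node.

    #include <iostream>

    enum Opcode { FADD, FSUB, STRICT_FADD, STRICT_FSUB };

    static bool isStrictFPOpcode(Opcode Op) {
      return Op == STRICT_FADD || Op == STRICT_FSUB;
    }

    static Opcode mutateStrictFPToFP(Opcode Op) {
      switch (Op) {
      case STRICT_FADD: return FADD;
      case STRICT_FSUB: return FSUB;
      default:          return Op; // non-strict opcodes pass through unchanged
      }
    }

    int main() {
      Opcode Op = STRICT_FADD;
      if (isStrictFPOpcode(Op))
        Op = mutateStrictFPToFP(Op);
      std::cout << (Op == FADD) << '\n'; // 1
    }
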
@@ -1046,116 +1048,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,
!FuncInfo->isExportedInst(I); // Exported instrs must be computed.
}
-#ifndef NDEBUG
-// Collect per Instruction statistics for fast-isel misses. Only those
-// instructions that cause the bail are accounted for. It does not account for
-// instructions higher in the block. Thus, summing the per instructions stats
-// will not add up to what is reported by NumFastIselFailures.
-static void collectFailStats(const Instruction *I) {
- switch (I->getOpcode()) {
- default: assert (0 && "<Invalid operator> ");
-
- // Terminators
- case Instruction::Ret: NumFastIselFailRet++; return;
- case Instruction::Br: NumFastIselFailBr++; return;
- case Instruction::Switch: NumFastIselFailSwitch++; return;
- case Instruction::IndirectBr: NumFastIselFailIndirectBr++; return;
- case Instruction::Invoke: NumFastIselFailInvoke++; return;
- case Instruction::Resume: NumFastIselFailResume++; return;
- case Instruction::Unreachable: NumFastIselFailUnreachable++; return;
-
- // Standard binary operators...
- case Instruction::Add: NumFastIselFailAdd++; return;
- case Instruction::FAdd: NumFastIselFailFAdd++; return;
- case Instruction::Sub: NumFastIselFailSub++; return;
- case Instruction::FSub: NumFastIselFailFSub++; return;
- case Instruction::Mul: NumFastIselFailMul++; return;
- case Instruction::FMul: NumFastIselFailFMul++; return;
- case Instruction::UDiv: NumFastIselFailUDiv++; return;
- case Instruction::SDiv: NumFastIselFailSDiv++; return;
- case Instruction::FDiv: NumFastIselFailFDiv++; return;
- case Instruction::URem: NumFastIselFailURem++; return;
- case Instruction::SRem: NumFastIselFailSRem++; return;
- case Instruction::FRem: NumFastIselFailFRem++; return;
-
- // Logical operators...
- case Instruction::And: NumFastIselFailAnd++; return;
- case Instruction::Or: NumFastIselFailOr++; return;
- case Instruction::Xor: NumFastIselFailXor++; return;
-
- // Memory instructions...
- case Instruction::Alloca: NumFastIselFailAlloca++; return;
- case Instruction::Load: NumFastIselFailLoad++; return;
- case Instruction::Store: NumFastIselFailStore++; return;
- case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return;
- case Instruction::AtomicRMW: NumFastIselFailAtomicRMW++; return;
- case Instruction::Fence: NumFastIselFailFence++; return;
- case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return;
-
- // Convert instructions...
- case Instruction::Trunc: NumFastIselFailTrunc++; return;
- case Instruction::ZExt: NumFastIselFailZExt++; return;
- case Instruction::SExt: NumFastIselFailSExt++; return;
- case Instruction::FPTrunc: NumFastIselFailFPTrunc++; return;
- case Instruction::FPExt: NumFastIselFailFPExt++; return;
- case Instruction::FPToUI: NumFastIselFailFPToUI++; return;
- case Instruction::FPToSI: NumFastIselFailFPToSI++; return;
- case Instruction::UIToFP: NumFastIselFailUIToFP++; return;
- case Instruction::SIToFP: NumFastIselFailSIToFP++; return;
- case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return;
- case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return;
- case Instruction::BitCast: NumFastIselFailBitCast++; return;
-
- // Other instructions...
- case Instruction::ICmp: NumFastIselFailICmp++; return;
- case Instruction::FCmp: NumFastIselFailFCmp++; return;
- case Instruction::PHI: NumFastIselFailPHI++; return;
- case Instruction::Select: NumFastIselFailSelect++; return;
- case Instruction::Call: {
- if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
- switch (Intrinsic->getIntrinsicID()) {
- default:
- NumFastIselFailIntrinsicCall++; return;
- case Intrinsic::sadd_with_overflow:
- NumFastIselFailSAddWithOverflow++; return;
- case Intrinsic::uadd_with_overflow:
- NumFastIselFailUAddWithOverflow++; return;
- case Intrinsic::ssub_with_overflow:
- NumFastIselFailSSubWithOverflow++; return;
- case Intrinsic::usub_with_overflow:
- NumFastIselFailUSubWithOverflow++; return;
- case Intrinsic::smul_with_overflow:
- NumFastIselFailSMulWithOverflow++; return;
- case Intrinsic::umul_with_overflow:
- NumFastIselFailUMulWithOverflow++; return;
- case Intrinsic::frameaddress:
- NumFastIselFailFrameaddress++; return;
- case Intrinsic::sqrt:
- NumFastIselFailSqrt++; return;
- case Intrinsic::experimental_stackmap:
- NumFastIselFailStackMap++; return;
- case Intrinsic::experimental_patchpoint_void: // fall-through
- case Intrinsic::experimental_patchpoint_i64:
- NumFastIselFailPatchPoint++; return;
- }
- }
- NumFastIselFailCall++;
- return;
- }
- case Instruction::Shl: NumFastIselFailShl++; return;
- case Instruction::LShr: NumFastIselFailLShr++; return;
- case Instruction::AShr: NumFastIselFailAShr++; return;
- case Instruction::VAArg: NumFastIselFailVAArg++; return;
- case Instruction::ExtractElement: NumFastIselFailExtractElement++; return;
- case Instruction::InsertElement: NumFastIselFailInsertElement++; return;
- case Instruction::ShuffleVector: NumFastIselFailShuffleVector++; return;
- case Instruction::ExtractValue: NumFastIselFailExtractValue++; return;
- case Instruction::InsertValue: NumFastIselFailInsertValue++; return;
- case Instruction::LandingPad: NumFastIselFailLandingPad++; return;
- }
-}
-#endif // NDEBUG
-
/// Set up SwiftErrorVals by going through the function. If the function has
/// swifterror argument, it will be the first entry.
static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
@@ -1166,6 +1058,7 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
FuncInfo->SwiftErrorVals.clear();
FuncInfo->SwiftErrorVRegDefMap.clear();
FuncInfo->SwiftErrorVRegUpwardsUse.clear();
+ FuncInfo->SwiftErrorVRegDefUses.clear();
FuncInfo->SwiftErrorArg = nullptr;
// Check if function has a swifterror argument.
@@ -1190,9 +1083,9 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
}
static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
+ FastISel *FastIS,
const TargetLowering *TLI,
const TargetInstrInfo *TII,
- const BasicBlock *LLVMBB,
SelectionDAGBuilder *SDB) {
if (!TLI->supportSwiftError())
return;
@@ -1202,21 +1095,71 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
if (FuncInfo->SwiftErrorVals.empty())
return;
- if (pred_begin(LLVMBB) == pred_end(LLVMBB)) {
- auto &DL = FuncInfo->MF->getDataLayout();
- auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
- for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) {
- // We will always generate a copy from the argument. It is always used at
- // least by the 'return' of the swifterror.
- if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal)
+ assert(FuncInfo->MBB == &*FuncInfo->MF->begin() &&
+ "expected to insert into entry block");
+ auto &DL = FuncInfo->MF->getDataLayout();
+ auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) {
+ // We will always generate a copy from the argument. It is always used at
+ // least by the 'return' of the swifterror.
+ if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal)
+ continue;
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ // Assign Undef to Vreg. We construct MI directly to make sure it works
+ // with FastISel.
+ BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(),
+ SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
+ VReg);
+
+ // Keep FastIS informed about the value we just inserted.
+ if (FastIS)
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+
+ FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg);
+ }
+}
+
+/// Collect llvm.dbg.declare information. This is done after argument lowering
+/// in case the declarations refer to arguments.
+static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) {
+ MachineFunction *MF = FuncInfo->MF;
+ const DataLayout &DL = MF->getDataLayout();
+ for (const BasicBlock &BB : *FuncInfo->Fn) {
+ for (const Instruction &I : BB) {
+ const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I);
+ if (!DI)
+ continue;
+
+ assert(DI->getVariable() && "Missing variable");
+ assert(DI->getDebugLoc() && "Missing location");
+ const Value *Address = DI->getAddress();
+ if (!Address)
+ continue;
+
+ // Look through casts and constant offset GEPs. These mostly come from
+ // inalloca.
+ APInt Offset(DL.getPointerSizeInBits(0), 0);
+ Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+ // Check if the variable is a static alloca or a byval or inalloca
+ // argument passed in memory. If it is not, then we will ignore this
+ // intrinsic and handle this during isel like dbg.value.
+ int FI = std::numeric_limits<int>::max();
+ if (const auto *AI = dyn_cast<AllocaInst>(Address)) {
+ auto SI = FuncInfo->StaticAllocaMap.find(AI);
+ if (SI != FuncInfo->StaticAllocaMap.end())
+ FI = SI->second;
+ } else if (const auto *Arg = dyn_cast<Argument>(Address))
+ FI = FuncInfo->getArgumentFrameIndex(Arg);
+
+ if (FI == std::numeric_limits<int>::max())
continue;
- unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
- // Assign Undef to Vreg. We construct MI directly to make sure it works
- // with FastISel.
- BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(),
- SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
- VReg);
- FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg);
+
+ DIExpression *Expr = DI->getExpression();
+ if (Offset.getBoolValue())
+ Expr = DIExpression::prepend(Expr, DIExpression::NoDeref,
+ Offset.getZExtValue());
+ MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc());
}
}
}
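
A simplified model of the address handling in processDbgDeclares above: walk through constant-offset wrappers accumulating the total offset, then check whether the base has a known frame index (plain structs with illustrative names, not LLVM's Value hierarchy):

    #include <iostream>

    struct Address {
      const Address *Base = nullptr; // next wrapper toward the real base
      long long ConstOffset = 0;     // offset contributed by this wrapper
      int FrameIndex = -1;           // >= 0 if the base has a static stack slot
    };

    static bool stripAndAccumulate(const Address &A, int &FI, long long &Offset) {
      Offset = 0;
      const Address *Cur = &A;
      while (Cur->Base) {            // look through cast/GEP-like wrappers
        Offset += Cur->ConstOffset;
        Cur = Cur->Base;
      }
      if (Cur->FrameIndex < 0)
        return false;                // no static slot: treat like dbg.value later
      FI = Cur->FrameIndex;
      return true;
    }

    int main() {
      Address Alloca; Alloca.FrameIndex = 2;   // static alloca at frame index 2
      Address Gep; Gep.Base = &Alloca; Gep.ConstOffset = 8;
      int FI; long long Off;
      if (stripAndAccumulate(Gep, FI, Off))
        std::cout << "FI=" << FI << " offset=" << Off << '\n'; // FI=2 offset=8
    }
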
@@ -1339,7 +1282,82 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {
}
}
+void preassignSwiftErrorRegs(const TargetLowering *TLI,
+ FunctionLoweringInfo *FuncInfo,
+ BasicBlock::const_iterator Begin,
+ BasicBlock::const_iterator End) {
+ if (!TLI->supportSwiftError() || FuncInfo->SwiftErrorVals.empty())
+ return;
+
+ // Iterate over instructions and assign vregs to swifterror defs and uses.
+ for (auto It = Begin; It != End; ++It) {
+ ImmutableCallSite CS(&*It);
+ if (CS) {
+ // A call-site with a swifterror argument is both use and def.
+ const Value *SwiftErrorAddr = nullptr;
+ for (auto &Arg : CS.args()) {
+ if (!Arg->isSwiftError())
+ continue;
+ // Use of swifterror.
+ assert(!SwiftErrorAddr && "Cannot have multiple swifterror arguments");
+ SwiftErrorAddr = &*Arg;
+ assert(SwiftErrorAddr->isSwiftError() &&
+ "Must have a swifterror value argument");
+ unsigned VReg; bool CreatedReg;
+ std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegUseAt(
+ &*It, FuncInfo->MBB, SwiftErrorAddr);
+ assert(CreatedReg);
+ }
+ if (!SwiftErrorAddr)
+ continue;
+
+ // Def of swifterror.
+ unsigned VReg; bool CreatedReg;
+ std::tie(VReg, CreatedReg) =
+ FuncInfo->getOrCreateSwiftErrorVRegDefAt(&*It);
+ assert(CreatedReg);
+ FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorAddr, VReg);
+
+ // A load is a use.
+ } else if (const LoadInst *LI = dyn_cast<const LoadInst>(&*It)) {
+ const Value *V = LI->getOperand(0);
+ if (!V->isSwiftError())
+ continue;
+
+ unsigned VReg; bool CreatedReg;
+ std::tie(VReg, CreatedReg) =
+ FuncInfo->getOrCreateSwiftErrorVRegUseAt(LI, FuncInfo->MBB, V);
+ assert(CreatedReg);
+
+ // A store is a def.
+ } else if (const StoreInst *SI = dyn_cast<const StoreInst>(&*It)) {
+ const Value *SwiftErrorAddr = SI->getOperand(1);
+ if (!SwiftErrorAddr->isSwiftError())
+ continue;
+
+ // Def of swifterror.
+ unsigned VReg; bool CreatedReg;
+ std::tie(VReg, CreatedReg) =
+ FuncInfo->getOrCreateSwiftErrorVRegDefAt(&*It);
+ assert(CreatedReg);
+ FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorAddr, VReg);
+
+ // A return in a swifterror-returning function is a use.
+ } else if (const ReturnInst *R = dyn_cast<const ReturnInst>(&*It)) {
+ const Function *F = R->getParent()->getParent();
+ if (!F->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ continue;
+
+ unsigned VReg; bool CreatedReg;
+ std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegUseAt(
+ R, FuncInfo->MBB, FuncInfo->SwiftErrorArg);
+ assert(CreatedReg);
+ }
+ }
+}
+
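
In short, the loop above implements the following classification of swifterror touches (a descriptive restatement of its branches):

    // call/invoke with swifterror arg : use of the current vreg, then a def
    //                                   of a fresh vreg for the result
    // load from a swifterror address  : use
    // store to a swifterror address   : def
    // return in a swifterror function : use (of the swifterror argument)
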
void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
+ FastISelFailed = false;
// Initialize the Fast-ISel state, if needed.
FastISel *FastIS = nullptr;
if (TM.Options.EnableFastISel)
@@ -1347,12 +1365,55 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
setupSwiftErrorVals(Fn, TLI, FuncInfo);
- // Iterate over all basic blocks in the function.
ReversePostOrderTraversal<const Function*> RPOT(&Fn);
- for (ReversePostOrderTraversal<const Function*>::rpo_iterator
- I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
- const BasicBlock *LLVMBB = *I;
+ // Lower arguments up front. An RPO iteration always visits the entry block
+ // first.
+ assert(*RPOT.begin() == &Fn.getEntryBlock());
+ ++NumEntryBlocks;
+
+ // Set up FuncInfo for ISel. Entry blocks never have PHIs.
+ FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()];
+ FuncInfo->InsertPt = FuncInfo->MBB->begin();
+
+ if (!FastIS) {
+ LowerArguments(Fn);
+ } else {
+ // See if fast isel can lower the arguments.
+ FastIS->startNewBlock();
+ if (!FastIS->lowerArguments()) {
+ FastISelFailed = true;
+ // Fast isel failed to lower these arguments
+ ++NumFastIselFailLowerArguments;
+
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Fn.getSubprogram(),
+ &Fn.getEntryBlock());
+ R << "FastISel didn't lower all arguments: "
+ << ore::NV("Prototype", Fn.getType());
+ reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1);
+
+ // Use SelectionDAG argument lowering
+ LowerArguments(Fn);
+ CurDAG->setRoot(SDB->getControlRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // If we inserted any instructions at the beginning, make a note of
+ // where they are, so we can be sure to emit subsequent instructions
+ // after them.
+ if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+ else
+ FastIS->setLastLocalValue(nullptr);
+ }
+ createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB);
+
+ processDbgDeclares(FuncInfo);
+
+ // Iterate over all basic blocks in the function.
+ for (const BasicBlock *LLVMBB : RPOT) {
if (OptLevel != CodeGenOpt::None) {
bool AllPredsVisited = true;
for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
@@ -1384,8 +1445,9 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];
if (!FuncInfo->MBB)
continue; // Some blocks like catchpads have no code or MBB.
- FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
- createSwiftErrorEntriesInEntryBlock(FuncInfo, TLI, TII, LLVMBB, SDB);
+
+ // Insert new instructions after any phi or argument setup code.
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
// Setup an EH landing-pad block.
FuncInfo->ExceptionPointerVirtReg = 0;
@@ -1396,43 +1458,21 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// Before doing SelectionDAG ISel, see if FastISel has been requested.
if (FastIS) {
- FastIS->startNewBlock();
-
- // Emit code for any incoming arguments. This must happen before
- // beginning FastISel on the entry block.
- if (LLVMBB == &Fn.getEntryBlock()) {
- ++NumEntryBlocks;
-
- // Lower any arguments needed in this block if this is the entry block.
- if (!FastIS->lowerArguments()) {
- // Fast isel failed to lower these arguments
- ++NumFastIselFailLowerArguments;
- if (EnableFastISelAbort > 1)
- report_fatal_error("FastISel didn't lower all arguments");
-
- // Use SelectionDAG argument lowering
- LowerArguments(Fn);
- CurDAG->setRoot(SDB->getControlRoot());
- SDB->clear();
- CodeGenAndEmitDAG();
- }
-
- // If we inserted any instructions at the beginning, make a note of
- // where they are, so we can be sure to emit subsequent instructions
- // after them.
- if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
- FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
- else
- FastIS->setLastLocalValue(nullptr);
- }
+ if (LLVMBB != &Fn.getEntryBlock())
+ FastIS->startNewBlock();
unsigned NumFastIselRemaining = std::distance(Begin, End);
+
+ // Pre-assign swifterror vregs.
+ preassignSwiftErrorRegs(TLI, FuncInfo, Begin, End);
+
// Do FastISel on as many instructions as possible.
for (; BI != Begin; --BI) {
const Instruction *Inst = &*std::prev(BI);
// If we no longer require this instruction, skip it.
- if (isFoldedOrDeadInstruction(Inst, FuncInfo)) {
+ if (isFoldedOrDeadInstruction(Inst, FuncInfo) ||
+ ElidedArgCopyInstrs.count(Inst)) {
--NumFastIselRemaining;
continue;
}
@@ -1465,22 +1505,28 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
continue;
}
-#ifndef NDEBUG
- if (EnableFastISelVerbose2)
- collectFailStats(Inst);
-#endif
+ FastISelFailed = true;
// Then handle certain instructions as single-LLVM-Instruction blocks.
- if (isa<CallInst>(Inst)) {
-
- if (EnableFastISelVerbose || EnableFastISelAbort) {
- dbgs() << "FastISel missed call: ";
- Inst->dump();
+ // We cannot separate out GCrelocates to their own blocks since we need
+ // to keep track of gc-relocates for a particular gc-statepoint. This is
+ // done by SelectionDAGBuilder::LowerAsSTATEPOINT, called before
+ // visitGCRelocate.
+ if (isa<CallInst>(Inst) && !isStatepoint(Inst) && !isGCRelocate(Inst)) {
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Inst->getDebugLoc(), LLVMBB);
+
+ R << "FastISel missed call";
+
+ if (R.isEnabled() || EnableFastISelAbort) {
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << *Inst;
+
+ R << ": " << InstStr.str();
}
- if (EnableFastISelAbort > 2)
- // FastISel selector couldn't handle something and bailed.
- // For the purpose of debugging, just abort.
- report_fatal_error("FastISel didn't select the entire block");
+
+ reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2);
if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() &&
!Inst->use_empty()) {
@@ -1509,35 +1555,35 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
continue;
}
+ OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+ Inst->getDebugLoc(), LLVMBB);
+
bool ShouldAbort = EnableFastISelAbort;
- if (EnableFastISelVerbose || EnableFastISelAbort) {
- if (isa<TerminatorInst>(Inst)) {
- // Use a different message for terminator misses.
- dbgs() << "FastISel missed terminator: ";
- // Don't abort unless for terminator unless the level is really high
- ShouldAbort = (EnableFastISelAbort > 2);
- } else {
- dbgs() << "FastISel miss: ";
- }
- Inst->dump();
+ if (isa<TerminatorInst>(Inst)) {
+ // Use a different message for terminator misses.
+ R << "FastISel missed terminator";
+ // Don't abort for a terminator unless the level is really high.
+ ShouldAbort = (EnableFastISelAbort > 2);
+ } else {
+ R << "FastISel missed";
}
- if (ShouldAbort)
- // FastISel selector couldn't handle something and bailed.
- // For the purpose of debugging, just abort.
- report_fatal_error("FastISel didn't select the entire block");
+
+ if (R.isEnabled() || EnableFastISelAbort) {
+ std::string InstStrStorage;
+ raw_string_ostream InstStr(InstStrStorage);
+ InstStr << *Inst;
+ R << ": " << InstStr.str();
+ }
+
+ reportFastISelFailure(*MF, *ORE, R, ShouldAbort);
NumFastIselFailures += NumFastIselRemaining;
break;
}
FastIS->recomputeInsertPt();
- } else {
- // Lower any arguments needed in this block if this is the entry block.
- if (LLVMBB == &Fn.getEntryBlock()) {
- ++NumEntryBlocks;
- LowerArguments(Fn);
- }
}
+
if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) {
bool FunctionBasedInstrumentation =
TLI->getSSPStackGuardCheck(*Fn.getParent());
@@ -1556,10 +1602,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// block.
bool HadTailCall;
SelectBasicBlock(Begin, BI, HadTailCall);
+
+ // But if FastISel was run, we already selected some of the block.
+ // If we emitted a tail-call, we need to delete any previously emitted
+ // instruction that follows it.
+ if (HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end())
+ FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end());
}
FinishBasicBlock();
FuncInfo->PHINodesToUpdate.clear();
+ ElidedArgCopyInstrs.clear();
}
propagateSwiftErrorVRegs(FuncInfo);
@@ -1975,11 +2028,11 @@ bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
// either already zero or is not demanded. Check for known zero input bits.
APInt NeededMask = DesiredMask & ~ActualMask;
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(LHS, KnownZero, KnownOne);
+ KnownBits Known;
+ CurDAG->computeKnownBits(LHS, Known);
// If all the missing bits in the or are already known to be set, match!
- if ((NeededMask & KnownOne) == NeededMask)
+ if (NeededMask.isSubsetOf(Known.One))
return true;
// TODO: check to see if missing bits are just not demanded.
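
The isSubsetOf form used here, and throughout the KnownBits changes later in this patch, is the old mask-and-compare in disguise. With plain integers standing in for APInt, a quick self-check of the equivalence:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t NeededMask = 0x00F0; // bits the pattern still needs set
      uint32_t KnownOne   = 0xF0F0; // bits already known to be one
      // APInt::isSubsetOf(RHS) is (LHS & ~RHS) == 0:
      assert((NeededMask & ~KnownOne) == 0);
      return 0;
    }
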
@@ -2062,7 +2115,7 @@ static SDNode *findGlueUse(SDNode *N) {
}
/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
-/// This function recursively traverses up the operand chain, ignoring
+/// This function iteratively traverses up the operand chain, ignoring
/// certain nodes.
static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
@@ -2075,30 +2128,36 @@ static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
// The Use's node ID may be -1 (unassigned) if it is a newly allocated node.
// This can happen because we scan down to newly selected nodes in the case
// of glue uses.
- if ((Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1))
- return false;
+ std::vector<SDNode *> WorkList;
+ WorkList.push_back(Use);
- // Don't revisit nodes if we already scanned it and didn't fail, we know we
- // won't fail if we scan it again.
- if (!Visited.insert(Use).second)
- return false;
+ while (!WorkList.empty()) {
+ Use = WorkList.back();
+ WorkList.pop_back();
+ if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1)
+ continue;
- for (const SDValue &Op : Use->op_values()) {
- // Ignore chain uses, they are validated by HandleMergeInputChains.
- if (Op.getValueType() == MVT::Other && IgnoreChains)
+ // Don't revisit a node we already scanned and didn't fail on; we know we
+ // won't fail if we scan it again.
+ if (!Visited.insert(Use).second)
continue;
- SDNode *N = Op.getNode();
- if (N == Def) {
- if (Use == ImmedUse || Use == Root)
- continue; // We are not looking for immediate use.
- assert(N != Root);
- return true;
- }
+ for (const SDValue &Op : Use->op_values()) {
+ // Ignore chain uses, they are validated by HandleMergeInputChains.
+ if (Op.getValueType() == MVT::Other && IgnoreChains)
+ continue;
- // Traverse up the operand chain.
- if (findNonImmUse(N, Def, ImmedUse, Root, Visited, IgnoreChains))
- return true;
+ SDNode *N = Op.getNode();
+ if (N == Def) {
+ if (Use == ImmedUse || Use == Root)
+ continue; // We are not looking for immediate use.
+ assert(N != Root);
+ return true;
+ }
+
+ // Traverse up the operand chain.
+ WorkList.push_back(N);
+ }
}
return false;
}
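
The rewrite above trades recursion for an explicit worklist, which keeps very deep operand chains from overflowing the native stack. The shape of the transformation as a standalone sketch, with a stand-in Node type rather than the SDNode API:

    #include <unordered_set>
    #include <vector>

    struct Node { std::vector<Node*> Operands; };

    // Depth-first reachability over operands, iteratively: push the start
    // node, pop and scan, and prune nodes we have already visited.
    bool reaches(Node *Use, Node *Def) {
      std::vector<Node*> WorkList{Use};
      std::unordered_set<Node*> Visited;
      while (!WorkList.empty()) {
        Node *N = WorkList.back();
        WorkList.pop_back();
        if (!Visited.insert(N).second)
          continue; // A second scan cannot change the answer.
        for (Node *Op : N->Operands) {
          if (Op == Def)
            return true;
          WorkList.push_back(Op);
        }
      }
      return false;
    }
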
@@ -2177,7 +2236,6 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
IgnoreChains = false;
}
-
SmallPtrSet<SDNode*, 16> Visited;
return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
}
@@ -2554,7 +2612,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
SDValue N,
- const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+ const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {
// Accept if it is exactly the same as a previously recorded node.
unsigned RecNo = MatcherTable[MatcherIndex++];
assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
@@ -2564,9 +2622,9 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
/// CheckChildSame - Implements OP_CheckChildXSame.
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
- SDValue N,
- const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes,
- unsigned ChildNo) {
+ SDValue N,
+ const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes,
+ unsigned ChildNo) {
if (ChildNo >= N.getNumOperands())
return false; // Match fails if out of range child #.
return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
@@ -2688,7 +2746,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
unsigned Index, SDValue N,
bool &Result,
const SelectionDAGISel &SDISel,
- SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+ SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {
switch (Table[Index++]) {
default:
Result = false;
@@ -2756,6 +2814,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
}
namespace {
+
struct MatchScope {
/// FailIndex - If this match fails, this is the index to continue with.
unsigned FailIndex;
@@ -2785,6 +2844,7 @@ class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
SDNode **NodeToMatch;
SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes;
SmallVectorImpl<MatchScope> &MatchScopes;
+
public:
MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch,
SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN,
@@ -2816,6 +2876,7 @@ public:
J.setNode(E);
}
};
+
} // end anonymous namespace
void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
@@ -2921,7 +2982,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// with an OPC_SwitchOpcode instruction. Populate the table now, since this
// is the first time we're selecting an instruction.
unsigned Idx = 1;
- while (1) {
+ while (true) {
// Get the size of this case.
unsigned CaseSize = MatcherTable[Idx++];
if (CaseSize & 128)
@@ -2942,7 +3003,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
MatcherIndex = OpcodeOffset[N.getOpcode()];
}
- while (1) {
+ while (true) {
assert(MatcherIndex < TableSize && "Invalid index");
#ifndef NDEBUG
unsigned CurrentOpcodeIndex = MatcherIndex;
@@ -2957,7 +3018,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// immediately fail, don't even bother pushing a scope for them.
unsigned FailIndex;
- while (1) {
+ while (true) {
unsigned NumToSkip = MatcherTable[MatcherIndex++];
if (NumToSkip & 128)
NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
@@ -3118,7 +3179,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
unsigned CurNodeOpcode = N.getOpcode();
unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
unsigned CaseSize;
- while (1) {
+ while (true) {
// Get the size of this case.
CaseSize = MatcherTable[MatcherIndex++];
if (CaseSize & 128)
@@ -3149,7 +3210,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
MVT CurNodeVT = N.getSimpleValueType();
unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
unsigned CaseSize;
- while (1) {
+ while (true) {
// Get the size of this case.
CaseSize = MatcherTable[MatcherIndex++];
if (CaseSize & 128)
@@ -3215,7 +3276,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// a single use.
bool HasMultipleUses = false;
for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
- if (!NodeStack[i].hasOneUse()) {
+ if (!NodeStack[i].getNode()->hasOneUse()) {
HasMultipleUses = true;
break;
}
@@ -3381,6 +3442,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr));
continue;
}
+ case OPC_Coverage: {
+ // This is emitted right before MorphNode/EmitNode, so it should be safe
+ // to assume that this node has been selected.
+ unsigned index = MatcherTable[MatcherIndex++];
+ index |= (MatcherTable[MatcherIndex++] << 8);
+ dbgs() << "COVERED: " << getPatternForIndex(index) << "\n";
+ dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n";
+ continue;
+ }
case OPC_EmitNode: case OPC_MorphNodeTo:
case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2:
@@ -3473,7 +3543,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i),
nullptr));
}
-
} else {
assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE &&
"NodeToMatch was removed partway through selection");
@@ -3610,7 +3679,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
// find a case to check.
DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n");
++NumDAGIselRetries;
- while (1) {
+ while (true) {
if (MatchScopes.empty()) {
CannotYetSelect(NodeToMatch);
return;
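
Several of the matcher loops above read one byte and, when bit 7 is set, fall back to a variable-width decode (GetVBR). A standalone sketch of that encoding, assuming the usual 7-bits-per-byte layout with bit 7 as the continuation flag:

    #include <cstdint>

    // Each byte carries 7 payload bits, least-significant group first;
    // bit 7 set means "another byte follows".
    uint64_t decodeVBR(const unsigned char *Table, unsigned &Idx,
                       unsigned First) {
      uint64_t Val = First & 127;
      unsigned Shift = 7;
      while (First & 128) {
        First = Table[Idx++];
        Val |= uint64_t(First & 127) << Shift;
        Shift += 7;
      }
      return Val;
    }

So a CaseSize encoded as the bytes 0x85 0x02 decodes to (0x85 & 127) | (0x02 << 7) = 5 + 256 = 261.
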
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index 2764688..11561df 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -11,13 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/SelectionDAG.h"
#include "ScheduleDAGSDNodes.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Debug.h"
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index d27e245..5d78bba 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -17,9 +17,9 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/CallingConv.h"
@@ -110,8 +110,8 @@ StatepointLoweringState::allocateStackSlot(EVT ValueType,
Builder.FuncInfo.StatepointStackSlots.size() &&
"Broken invariant");
- StatepointMaxSlotsRequired = std::max<unsigned long>(
- StatepointMaxSlotsRequired, Builder.FuncInfo.StatepointStackSlots.size());
+ StatepointMaxSlotsRequired.updateMax(
+ Builder.FuncInfo.StatepointStackSlots.size());
return SpillSlot;
}
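
The switch to Statistic::updateMax above replaces a read-modify-write that could lose updates when functions are compiled concurrently. The underlying idea, sketched over a plain std::atomic as a stand-in for the Statistic internals:

    #include <atomic>

    // Lock-free running maximum: retry until our value is published or a
    // larger value is already present.
    void updateMax(std::atomic<unsigned> &Stat, unsigned V) {
      unsigned Prev = Stat.load();
      while (V > Prev && !Stat.compare_exchange_weak(Prev, V)) {
        // compare_exchange_weak reloads Prev on failure; loop re-tests.
      }
    }
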
@@ -242,7 +242,8 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue,
// Cache this slot so we find it when going through the normal
// assignment loop.
- SDValue Loc = Builder.DAG.getTargetFrameIndex(*Index, Incoming.getValueType());
+ SDValue Loc =
+ Builder.DAG.getTargetFrameIndex(*Index, Builder.getFrameIndexTy());
Builder.StatepointLowering.setLocation(Incoming, Loc);
}
@@ -343,7 +344,7 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
Builder);
int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
// We use TargetFrameIndex so that isel will not select it into LEA
- Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+ Loc = Builder.DAG.getTargetFrameIndex(Index, Builder.getFrameIndexTy());
// TODO: We can create TokenFactor node instead of
// chaining stores one after another, this may allow
@@ -391,8 +392,10 @@ static void lowerIncomingStatepointValue(SDValue Incoming, bool LiveInOnly,
// This handles allocas as arguments to the statepoint (this is only
// really meaningful for a deopt value. For GC, we'd be trying to
// relocate the address of the alloca itself?)
+ assert(Incoming.getValueType() == Builder.getFrameIndexTy() &&
+ "Incoming value is a frame index!");
Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
- Incoming.getValueType()));
+ Builder.getFrameIndexTy()));
} else if (LiveInOnly) {
// If this value is live in (not live-on-return, or live-through), we can
// treat it the same way patchpoint treats its "live in" values. We'll
@@ -527,8 +530,10 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
SDValue Incoming = Builder.getValue(V);
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
// This handles allocas as arguments to the statepoint
+ assert(Incoming.getValueType() == Builder.getFrameIndexTy() &&
+ "Incoming value is a frame index!");
Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
- Incoming.getValueType()));
+ Builder.getFrameIndexTy()));
}
}
@@ -813,7 +818,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
SI.GCTransitionArgs =
ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
SI.ID = ISP.getID();
- SI.DeoptState = ArrayRef<const Use>(ISP.vm_state_begin(), ISP.vm_state_end());
+ SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end());
SI.StatepointFlags = ISP.getFlags();
SI.NumPatchBytes = ISP.getNumPatchBytes();
SI.EHPadBB = EHPadBB;
@@ -835,7 +840,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
// completely and make statepoint call to return a tuple.
unsigned Reg = FuncInfo.CreateRegs(RetTy);
RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
- DAG.getDataLayout(), Reg, RetTy);
+ DAG.getDataLayout(), Reg, RetTy, true);
SDValue Chain = DAG.getEntryNode();
RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
@@ -949,8 +954,8 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
return;
}
- SDValue SpillSlot = DAG.getTargetFrameIndex(*DerivedPtrLocation,
- SD.getValueType());
+ SDValue SpillSlot =
+ DAG.getTargetFrameIndex(*DerivedPtrLocation, getFrameIndexTy());
// Be conservative: flush all pending loads
// TODO: Probably we can be less restrictive on this,
@@ -958,7 +963,9 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
SDValue Chain = getRoot();
SDValue SpillLoad =
- DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot,
+ DAG.getLoad(DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ Relocate.getType()),
+ getCurSDLoc(), Chain, SpillSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
*DerivedPtrLocation));
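
The recurring change in this file is that TargetFrameIndex nodes now always carry the target's frame-index type instead of the spilled value's type, and the reload uses the relocate's own result type. A hedged sketch of the corrected pattern, assuming a slot index FI, a relocate instruction Reloc, and the SelectionDAGBuilder members used above:

    SDValue Slot = DAG.getTargetFrameIndex(FI, getFrameIndexTy());
    EVT LoadVT = DAG.getTargetLoweringInfo().getValueType(
        DAG.getDataLayout(), Reloc.getType());
    SDValue Reloaded =
        DAG.getLoad(LoadVT, getCurSDLoc(), getRoot(), Slot,
                    MachinePointerInfo::getFixedStack(
                        DAG.getMachineFunction(), FI));
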
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 690f0d2..8652df7 100644
--- a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -27,6 +27,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -55,14 +56,15 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
// Conservatively require the attributes of the call to match those of
// the return. Ignore noalias because it doesn't affect the call sequence.
- AttributeSet CallerAttrs = F->getAttributes();
- if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex)
- .removeAttribute(Attribute::NoAlias).hasAttributes())
+ AttributeList CallerAttrs = F->getAttributes();
+ if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias)
+ .hasAttributes())
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
- if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
- CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) ||
+ CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
return false;
// Check if the only use is a function return node.
@@ -96,19 +98,19 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
/// \brief Set CallLoweringInfo attribute flags based on a call instruction
/// and called function attributes.
-void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
- unsigned AttrIdx) {
- isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
- isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
- isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
- isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
- isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
- isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
- isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
- isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
- isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
- isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
- Alignment = CS->getParamAlignment(AttrIdx);
+void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+ unsigned ArgIdx) {
+ IsSExt = CS->paramHasAttr(ArgIdx, Attribute::SExt);
+ IsZExt = CS->paramHasAttr(ArgIdx, Attribute::ZExt);
+ IsInReg = CS->paramHasAttr(ArgIdx, Attribute::InReg);
+ IsSRet = CS->paramHasAttr(ArgIdx, Attribute::StructRet);
+ IsNest = CS->paramHasAttr(ArgIdx, Attribute::Nest);
+ IsByVal = CS->paramHasAttr(ArgIdx, Attribute::ByVal);
+ IsInAlloca = CS->paramHasAttr(ArgIdx, Attribute::InAlloca);
+ IsReturned = CS->paramHasAttr(ArgIdx, Attribute::Returned);
+ IsSwiftSelf = CS->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
+ IsSwiftError = CS->paramHasAttr(ArgIdx, Attribute::SwiftError);
+ Alignment = CS->getParamAlignment(ArgIdx);
}
/// Generate a libcall taking the given operands as arguments and returning a
@@ -125,8 +127,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
for (SDValue Op : Ops) {
Entry.Node = Op;
Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
- Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+ Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+ Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
Args.push_back(Entry);
}
@@ -138,10 +140,13 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
- .setSExtResult(signExtend).setZExtResult(!signExtend);
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setNoReturn(doesNotReturn)
+ .setDiscardResult(!isReturnValueUsed)
+ .setSExtResult(signExtend)
+ .setZExtResult(!signExtend);
return LowerCallTo(CLI);
}
@@ -334,34 +339,40 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// Optimization Methods
//===----------------------------------------------------------------------===//
-/// Check to see if the specified operand of the specified instruction is a
-/// constant integer. If so, check to see if there are any bits set in the
-/// constant that are not demanded. If so, shrink the constant and return true.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
- const APInt &Demanded) {
- SDLoc dl(Op);
+/// If the specified instruction has a constant integer operand and there are
+/// bits set in that constant that are not demanded, then clear those bits and
+/// return true.
+bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ TargetLoweringOpt &TLO) const {
+ SelectionDAG &DAG = TLO.DAG;
+ SDLoc DL(Op);
+ unsigned Opcode = Op.getOpcode();
+
+ // Do target-specific constant optimization.
+ if (targetShrinkDemandedConstant(Op, Demanded, TLO))
+ return TLO.New.getNode();
// FIXME: ISD::SELECT, ISD::SELECT_CC
- switch (Op.getOpcode()) {
- default: break;
+ switch (Opcode) {
+ default:
+ break;
case ISD::XOR:
case ISD::AND:
case ISD::OR: {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (!C) return false;
+ auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!Op1C)
+ return false;
- if (Op.getOpcode() == ISD::XOR &&
- (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+ // If this is a 'not' op, don't touch it because that's a canonical form.
+ const APInt &C = Op1C->getAPIntValue();
+ if (Opcode == ISD::XOR && Demanded.isSubsetOf(C))
return false;
- // if we can expand it to have all bits set, do it
- if (C->getAPIntValue().intersects(~Demanded)) {
+ if (!C.isSubsetOf(Demanded)) {
EVT VT = Op.getValueType();
- SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0),
- DAG.getConstant(Demanded &
- C->getAPIntValue(),
- dl, VT));
- return CombineTo(Op, New);
+ SDValue NewC = DAG.getConstant(Demanded & C, DL, VT);
+ SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
}
break;
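
A concrete check of the rule implemented above, with 16-bit masks standing in for APInt: for OR (and likewise AND and XOR), clearing constant bits that fall outside the demanded mask cannot change any demanded result bit.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint16_t C        = 0xFF00; // constant operand
      const uint16_t Demanded = 0x0F0F; // bits the user actually reads
      for (uint32_t X = 0; X <= 0xFFFF; ++X) {
        uint16_t Full   = uint16_t(X) | C;
        uint16_t Shrunk = uint16_t(X) | (C & Demanded);
        assert(((Full ^ Shrunk) & Demanded) == 0); // agree on demanded bits
      }
      return 0;
    }
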
@@ -374,15 +385,17 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.
/// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
/// generalized for targets with other types of implicit widening casts.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
- unsigned BitWidth,
- const APInt &Demanded,
- const SDLoc &dl) {
+bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
+ const APInt &Demanded,
+ TargetLoweringOpt &TLO) const {
assert(Op.getNumOperands() == 2 &&
"ShrinkDemandedOp only supports binary operators!");
assert(Op.getNode()->getNumValues() == 1 &&
"ShrinkDemandedOp only supports nodes with one result!");
+ SelectionDAG &DAG = TLO.DAG;
+ SDLoc dl(Op);
+
// Early return, as this function cannot handle vector types.
if (Op.getValueType().isVector())
return false;
@@ -404,31 +417,28 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
TLI.isZExtFree(SmallVT, Op.getValueType())) {
// We found a type with free casts.
- SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(0)),
- DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
- Op.getNode()->getOperand(1)));
+ SDValue X = DAG.getNode(
+ Op.getOpcode(), dl, SmallVT,
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1)));
bool NeedZext = DemandedSize > SmallVTBits;
SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
dl, Op.getValueType(), X);
- return CombineTo(Op, Z);
+ return TLO.CombineTo(Op, Z);
}
}
return false;
}
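
The free-cast narrowing above is sound because the low bits of these binary ops depend only on the low bits of their inputs. For instance, with only the low 8 bits demanded, a 32-bit add may be replaced by an 8-bit add plus a free zero-extension:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0x12345678, Y = 0x0BADF00D;
      uint32_t Wide   = (X + Y) & 0xFF;
      uint32_t Narrow = uint32_t(uint8_t(uint8_t(X) + uint8_t(Y)));
      assert(Wide == Narrow);
      return 0;
    }
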
bool
-TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
- unsigned OpIdx,
- const APInt &Demanded,
- DAGCombinerInfo &DCI) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+TargetLowering::SimplifyDemandedBits(SDNode *User, unsigned OpIdx,
+ const APInt &Demanded,
+ DAGCombinerInfo &DCI,
+ TargetLoweringOpt &TLO) const {
SDValue Op = User->getOperand(OpIdx);
- APInt KnownZero, KnownOne;
+ KnownBits Known;
- if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne,
- *this, 0, true))
+ if (!SimplifyDemandedBits(Op, Demanded, Known, TLO, 0, true))
return false;
@@ -440,9 +450,9 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
// with the value 'x', which will give us:
// Old = i32 and x, 0xffffff
// New = x
- if (Old.hasOneUse()) {
+ if (TLO.Old.hasOneUse()) {
// For the one use case, we just commit the change.
- DCI.CommitTargetLoweringOpt(*this);
+ DCI.CommitTargetLoweringOpt(TLO);
return true;
}
@@ -450,17 +460,17 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
// AssumeSingleUse flag is not propagated to recursive calls of
// SimplifyDemandedBits, so the only node with multiple uses that
// it will attempt to combine will be Op.
- assert(Old == Op);
+ assert(TLO.Old == Op);
SmallVector <SDValue, 4> NewOps;
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
if (i == OpIdx) {
- NewOps.push_back(New);
+ NewOps.push_back(TLO.New);
continue;
}
NewOps.push_back(User->getOperand(i));
}
- DAG.UpdateNodeOperands(User, NewOps);
+ TLO.DAG.UpdateNodeOperands(User, NewOps);
// Op has less users now, so we may be able to perform additional combines
// with it.
DCI.AddToWorklist(Op.getNode());
@@ -470,17 +480,30 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
return true;
}
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask,
+ DAGCombinerInfo &DCI) const {
+
+ SelectionDAG &DAG = DCI.DAG;
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ KnownBits Known;
+
+ bool Simplified = SimplifyDemandedBits(Op, DemandedMask, Known, TLO);
+ if (Simplified)
+ DCI.CommitTargetLoweringOpt(TLO);
+ return Simplified;
+}
+
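
A sketch of how a target combine might call this new convenience overload (a hypothetical use site: Op is an operand of the node N being combined, DCI the DAGCombinerInfo handed to the target hook):

    // Only the low 16 of 32 bits are demanded; the wrapper builds the
    // TargetLoweringOpt, runs the analysis, and commits any rewrite.
    APInt DemandedMask = APInt::getLowBitsSet(32, 16);
    if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
      return SDValue(N, 0);
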
/// Look at Op. At this point, we know that only the DemandedMask bits of the
/// result of Op are ever used downstream. If we can use this information to
/// simplify Op, create a new simplified DAG node and return true, returning the
/// original and new nodes in Old and New. Otherwise, analyze the expression and
-/// return a mask of KnownOne and KnownZero bits for the expression (used to
-/// simplify the caller). The KnownZero/One bits may only be accurate for those
-/// bits in the DemandedMask.
+/// return a mask of Known bits for the expression (used to simplify the
+/// caller). The Known bits may only be accurate for those bits in the
+/// DemandedMask.
bool TargetLowering::SimplifyDemandedBits(SDValue Op,
const APInt &DemandedMask,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
TargetLoweringOpt &TLO,
unsigned Depth,
bool AssumeSingleUse) const {
@@ -492,14 +515,14 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
auto &DL = TLO.DAG.getDataLayout();
// Don't know anything.
- KnownZero = KnownOne = APInt(BitWidth, 0);
+ Known = KnownBits(BitWidth);
// Other users may use these bits.
if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) {
if (Depth != 0) {
- // If not at the root, Just compute the KnownZero/KnownOne bits to
+ // If not at the root, just compute the Known bits to
// simplify things downstream.
- TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth);
+ TLO.DAG.computeKnownBits(Op, Known, Depth);
return false;
}
// If this is the root being simplified, allow it to have multiple uses,
@@ -514,38 +537,36 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
return false;
}
- APInt KnownZero2, KnownOne2, KnownZeroOut, KnownOneOut;
+ KnownBits Known2, KnownOut;
switch (Op.getOpcode()) {
case ISD::Constant:
// We know all of the bits for a constant!
- KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
- KnownZero = ~KnownOne;
+ Known.One = cast<ConstantSDNode>(Op)->getAPIntValue();
+ Known.Zero = ~Known.One;
return false; // Don't fall through, will infinitely loop.
case ISD::BUILD_VECTOR:
// Collect the known bits that are shared by every constant vector element.
- KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+ Known.Zero.setAllBits(); Known.One.setAllBits();
for (SDValue SrcOp : Op->ops()) {
if (!isa<ConstantSDNode>(SrcOp)) {
// We can only handle all constant values - bail out with no known bits.
- KnownZero = KnownOne = APInt(BitWidth, 0);
+ Known = KnownBits(BitWidth);
return false;
}
- KnownOne2 = cast<ConstantSDNode>(SrcOp)->getAPIntValue();
- KnownZero2 = ~KnownOne2;
+ Known2.One = cast<ConstantSDNode>(SrcOp)->getAPIntValue();
+ Known2.Zero = ~Known2.One;
// BUILD_VECTOR can implicitly truncate sources, we must handle this.
- if (KnownOne2.getBitWidth() != BitWidth) {
- assert(KnownOne2.getBitWidth() > BitWidth &&
- KnownZero2.getBitWidth() > BitWidth &&
+ if (Known2.One.getBitWidth() != BitWidth) {
+ assert(Known2.getBitWidth() > BitWidth &&
"Expected BUILD_VECTOR implicit truncation");
- KnownOne2 = KnownOne2.trunc(BitWidth);
- KnownZero2 = KnownZero2.trunc(BitWidth);
+ Known2 = Known2.trunc(BitWidth);
}
// Known bits are the values that are shared by every element.
// TODO: support per-element known bits.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
}
return false; // Don't fall through, will infinitely loop.
case ISD::AND:
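
For the BUILD_VECTOR case above: a bit is known for the whole vector only if every constant element agrees on it. With two 8-bit elements as a worked example:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t E0 = 0xA1, E1 = 0xA7;
      uint8_t KnownOne  = E0 & E1;   // set in every element
      uint8_t KnownZero = ~E0 & ~E1; // clear in every element
      assert(KnownOne == 0xA1 && KnownZero == 0x58);
      return 0;
    }
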
@@ -553,18 +574,18 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// using the bits from the RHS. Below, we use knowledge about the RHS to
// simplify the LHS, here we're using information from the LHS to simplify
// the RHS.
- if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (ConstantSDNode *RHSC = isConstOrConstSplat(Op.getOperand(1))) {
SDValue Op0 = Op.getOperand(0);
- APInt LHSZero, LHSOne;
+ KnownBits LHSKnown;
// Do not increment Depth here; that can cause an infinite loop.
- TLO.DAG.computeKnownBits(Op0, LHSZero, LHSOne, Depth);
+ TLO.DAG.computeKnownBits(Op0, LHSKnown, Depth);
// If the LHS already has zeros where RHSC does, this and is dead.
- if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
+ if ((LHSKnown.Zero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
return TLO.CombineTo(Op, Op0);
// If any of the set bits in the RHS are known zero on the LHS, shrink
// the constant.
- if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
+ if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & NewMask, TLO))
return true;
// Bitwise-not (xor X, -1) is a special case: we don't usually shrink its
@@ -573,183 +594,191 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// the xor. For example, for a 32-bit X:
// and (xor (srl X, 31), -1), 1 --> xor (srl X, 31), 1
if (isBitwiseNot(Op0) && Op0.hasOneUse() &&
- LHSOne == ~RHSC->getAPIntValue()) {
+ LHSKnown.One == ~RHSC->getAPIntValue()) {
SDValue Xor = TLO.DAG.getNode(ISD::XOR, dl, Op.getValueType(),
Op0.getOperand(0), Op.getOperand(1));
return TLO.CombineTo(Op, Xor);
}
}
- if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
- KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- if (SimplifyDemandedBits(Op.getOperand(0), ~KnownZero & NewMask,
- KnownZero2, KnownOne2, TLO, Depth+1))
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), ~Known.Zero & NewMask,
+ Known2, TLO, Depth+1))
return true;
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If all of the demanded bits are known one on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
- if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+ if (NewMask.isSubsetOf(Known2.Zero | Known.One))
return TLO.CombineTo(Op, Op.getOperand(0));
- if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+ if (NewMask.isSubsetOf(Known.Zero | Known2.One))
return TLO.CombineTo(Op, Op.getOperand(1));
// If all of the demanded bits in the inputs are known zeros, return zero.
- if ((NewMask & (KnownZero|KnownZero2)) == NewMask)
+ if (NewMask.isSubsetOf(Known.Zero | Known2.Zero))
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType()));
// If the RHS is a constant, see if we can simplify it.
- if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
+ if (ShrinkDemandedConstant(Op, ~Known2.Zero & NewMask, TLO))
return true;
// If the operation can be done in a smaller type, do so.
- if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
return true;
// Output known-1 bits are only known if set in both the LHS & RHS.
- KnownOne &= KnownOne2;
+ Known.One &= Known2.One;
// Output known-0 are known to be clear if zero in either the LHS | RHS.
- KnownZero |= KnownZero2;
+ Known.Zero |= Known2.Zero;
break;
case ISD::OR:
- if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
- KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- if (SimplifyDemandedBits(Op.getOperand(0), ~KnownOne & NewMask,
- KnownZero2, KnownOne2, TLO, Depth+1))
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), ~Known.One & NewMask,
+ Known2, TLO, Depth+1))
return true;
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
- if ((NewMask & ~KnownOne2 & KnownZero) == (~KnownOne2 & NewMask))
+ if (NewMask.isSubsetOf(Known2.One | Known.Zero))
return TLO.CombineTo(Op, Op.getOperand(0));
- if ((NewMask & ~KnownOne & KnownZero2) == (~KnownOne & NewMask))
- return TLO.CombineTo(Op, Op.getOperand(1));
- // If all of the potentially set bits on one side are known to be set on
- // the other side, just use the 'other' side.
- if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
- return TLO.CombineTo(Op, Op.getOperand(0));
- if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+ if (NewMask.isSubsetOf(Known.One | Known2.Zero))
return TLO.CombineTo(Op, Op.getOperand(1));
// If the RHS is a constant, see if we can simplify it.
- if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ if (ShrinkDemandedConstant(Op, NewMask, TLO))
return true;
// If the operation can be done in a smaller type, do so.
- if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
return true;
// Output known-0 bits are only known if clear in both the LHS & RHS.
- KnownZero &= KnownZero2;
+ Known.Zero &= Known2.Zero;
// Output known-1 are known to be set if set in either the LHS | RHS.
- KnownOne |= KnownOne2;
+ Known.One |= Known2.One;
break;
- case ISD::XOR:
- if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
- KnownOne, TLO, Depth+1))
+ case ISD::XOR: {
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero2,
- KnownOne2, TLO, Depth+1))
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), NewMask, Known2, TLO, Depth+1))
return true;
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
- if ((KnownZero & NewMask) == NewMask)
+ if (NewMask.isSubsetOf(Known.Zero))
return TLO.CombineTo(Op, Op.getOperand(0));
- if ((KnownZero2 & NewMask) == NewMask)
+ if (NewMask.isSubsetOf(Known2.Zero))
return TLO.CombineTo(Op, Op.getOperand(1));
// If the operation can be done in a smaller type, do so.
- if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ if (ShrinkDemandedOp(Op, BitWidth, NewMask, TLO))
return true;
// If all of the unknown bits are known to be zero on one side or the other
// (but not both) turn this into an *inclusive* or.
// e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
- if ((NewMask & ~KnownZero & ~KnownZero2) == 0)
+ if ((NewMask & ~Known.Zero & ~Known2.Zero) == 0)
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(),
Op.getOperand(0),
Op.getOperand(1)));
// Output known-0 bits are known if clear or set in both the LHS & RHS.
- KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ KnownOut.Zero = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
- KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+ KnownOut.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
// If all of the demanded bits on one side are known, and all of the set
// bits on that side are also known to be set on the other side, turn this
// into an AND, as we know the bits will be cleared.
// e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
// NB: it is okay if more bits are known than are requested
- if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side
- if (KnownOne == KnownOne2) { // set bits are the same on both sides
+ if (NewMask.isSubsetOf(Known.Zero|Known.One)) { // all known on one side
+ if (Known.One == Known2.One) { // set bits are the same on both sides
EVT VT = Op.getValueType();
- SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, dl, VT);
+ SDValue ANDC = TLO.DAG.getConstant(~Known.One & NewMask, dl, VT);
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT,
Op.getOperand(0), ANDC));
}
}
- // If the RHS is a constant, see if we can simplify it.
- // for XOR, we prefer to force bits to 1 if they will make a -1.
- // If we can't force bits, try to shrink the constant.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- APInt Expanded = C->getAPIntValue() | (~NewMask);
- // If we can expand it to have all bits set, do it.
- if (Expanded.isAllOnesValue()) {
- if (Expanded != C->getAPIntValue()) {
- EVT VT = Op.getValueType();
- SDValue New = TLO.DAG.getNode(Op.getOpcode(), dl,VT, Op.getOperand(0),
- TLO.DAG.getConstant(Expanded, dl, VT));
- return TLO.CombineTo(Op, New);
- }
- // If it already has all the bits set, nothing to change
- // but don't shrink either!
- } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
- return true;
+ // If the RHS is a constant, see if we can change it. Don't alter a -1
+ // constant because that's a 'not' op, and that is better for combining and
+ // codegen.
+ ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1));
+ if (C && !C->isAllOnesValue()) {
+ if (NewMask.isSubsetOf(C->getAPIntValue())) {
+ // We're flipping all demanded bits. Flip the undemanded bits too.
+ SDValue New = TLO.DAG.getNOT(dl, Op.getOperand(0), Op.getValueType());
+ return TLO.CombineTo(Op, New);
}
+ // If we can't turn this into a 'not', try to shrink the constant.
+ if (ShrinkDemandedConstant(Op, NewMask, TLO))
+ return true;
}
- KnownZero = KnownZeroOut;
- KnownOne = KnownOneOut;
+ Known = std::move(KnownOut);
break;
+ }
case ISD::SELECT:
- if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero,
- KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known, TLO, Depth+1))
return true;
- if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero2,
- KnownOne2, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known2, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
- if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ if (ShrinkDemandedConstant(Op, NewMask, TLO))
return true;
// Only known if known in both the LHS and RHS.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
break;
case ISD::SELECT_CC:
- if (SimplifyDemandedBits(Op.getOperand(3), NewMask, KnownZero,
- KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(3), NewMask, Known, TLO, Depth+1))
return true;
- if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero2,
- KnownOne2, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known2, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
- if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ if (ShrinkDemandedConstant(Op, NewMask, TLO))
return true;
// Only known if known in both the LHS and RHS.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
break;
+ case ISD::SETCC: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ // If (1) we only need the sign-bit, (2) the setcc operands are the same
+ // width as the setcc result, and (3) the result of a setcc conforms to 0 or
+ // -1, we may be able to bypass the setcc.
+ if (NewMask.isSignMask() && Op0.getScalarValueSizeInBits() == BitWidth &&
+ getBooleanContents(Op.getValueType()) ==
+ BooleanContent::ZeroOrNegativeOneBooleanContent) {
+ // If we're testing X < 0, then this compare isn't needed - just use X!
+ // FIXME: We're limiting to integer types here, but this should also work
+ // if we don't care about FP signed-zero. The use of SETLT with FP means
+ // that we don't care about NaNs.
+ if (CC == ISD::SETLT && Op1.getValueType().isInteger() &&
+ (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode())))
+ return TLO.CombineTo(Op, Op0);
+
+ // TODO: Should we check for other forms of sign-bit comparisons?
+ // Examples: X <= -1, X >= 0
+ }
+ if (getBooleanContents(Op0.getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
case ISD::SHL:
if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
unsigned ShAmt = SA->getZExtValue();
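
To see why the SETCC rule above can drop the compare: with ZeroOrNegativeOne booleans, (X < 0) materializes as all-ones exactly when X's sign bit is set, so when only the sign bit is demanded the compare result and X already agree on it.

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      for (int32_t X : {-5, -1, 0, 1, 7}) {
        int32_t SetCC = (X < 0) ? -1 : 0; // all-ones / all-zeros boolean
        assert((uint32_t(SetCC) >> 31) == (uint32_t(X) >> 31));
      }
      return 0;
    }
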
@@ -781,17 +810,16 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
}
}
- if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt),
- KnownZero, KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt), Known, TLO, Depth+1))
return true;
// Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
// are not demanded. This will likely allow the anyext to be folded away.
if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
- SDValue InnerOp = InOp.getNode()->getOperand(0);
+ SDValue InnerOp = InOp.getOperand(0);
EVT InnerVT = InnerOp.getValueType();
unsigned InnerBits = InnerVT.getSizeInBits();
- if (ShAmt < InnerBits && NewMask.lshr(InnerBits) == 0 &&
+ if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits &&
isTypeDesirableForOp(ISD::SHL, InnerVT)) {
EVT ShTy = getShiftAmountTy(InnerVT, DL);
if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits()))
@@ -813,12 +841,12 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
InnerOp.getOpcode() == ISD::SRL &&
InnerOp.hasOneUse() &&
isa<ConstantSDNode>(InnerOp.getOperand(1))) {
- uint64_t InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
+ unsigned InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
->getZExtValue();
if (InnerShAmt < ShAmt &&
InnerShAmt < InnerBits &&
- NewMask.lshr(InnerBits - InnerShAmt + ShAmt) == 0 &&
- NewMask.trunc(ShAmt) == 0) {
+ NewMask.getActiveBits() <= (InnerBits - InnerShAmt + ShAmt) &&
+ NewMask.countTrailingZeros() >= ShAmt) {
SDValue NewSA =
TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
Op.getOperand(1).getValueType());
@@ -831,10 +859,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
}
}
- KnownZero <<= SA->getZExtValue();
- KnownOne <<= SA->getZExtValue();
+ Known.Zero <<= SA->getZExtValue();
+ Known.One <<= SA->getZExtValue();
// low bits known zero.
- KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue());
+ Known.Zero.setLowBits(SA->getZExtValue());
}
break;
case ISD::SRL:
@@ -852,8 +880,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
- if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
- InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+ if (Op->getFlags().hasExact())
+ InDemandedMask.setLowBits(ShAmt);
// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
// single shift. We can do this if the top bits (which are shifted out)
@@ -877,15 +905,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
}
// Compute the new bits that are at the top now.
- if (SimplifyDemandedBits(InOp, InDemandedMask,
- KnownZero, KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(InOp, InDemandedMask, Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- KnownZero = KnownZero.lshr(ShAmt);
- KnownOne = KnownOne.lshr(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
- KnownZero |= HighBits; // High bits known zero.
+ Known.Zero.setHighBits(ShAmt); // High bits known zero.
}
break;
case ISD::SRA:
@@ -893,7 +919,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// always convert this into a logical shr, even if the shift amount is
// variable. The low bit of the shift cannot be an input sign bit unless
// the shift amount is >= the size of the datatype, which is undefined.
- if (NewMask == 1)
+ if (NewMask.isOneValue())
return TLO.CombineTo(Op,
TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1)));
@@ -910,33 +936,30 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
- if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
- InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+ if (Op->getFlags().hasExact())
+ InDemandedMask.setLowBits(ShAmt);
// If any of the demanded bits are produced by the sign extension, we also
// demand the input sign bit.
- APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
- if (HighBits.intersects(NewMask))
- InDemandedMask |= APInt::getSignBit(VT.getScalarSizeInBits());
+ if (NewMask.countLeadingZeros() < ShAmt)
+ InDemandedMask.setSignBit();
- if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask,
- KnownZero, KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask, Known, TLO,
+ Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- KnownZero = KnownZero.lshr(ShAmt);
- KnownOne = KnownOne.lshr(ShAmt);
-
- // Handle the sign bit, adjusted to where it is now in the mask.
- APInt SignBit = APInt::getSignBit(BitWidth).lshr(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
- if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+ if (Known.Zero[BitWidth - ShAmt - 1] ||
+ NewMask.countLeadingZeros() >= ShAmt) {
SDNodeFlags Flags;
- Flags.setExact(cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact());
+ Flags.setExact(Op->getFlags().hasExact());
return TLO.CombineTo(Op,
TLO.DAG.getNode(ISD::SRL, dl, VT, Op.getOperand(0),
- Op.getOperand(1), &Flags));
+ Op.getOperand(1), Flags));
}
int Log2 = NewMask.exactLogBase2();
@@ -949,9 +972,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
Op.getOperand(0), NewSA));
}
- if (KnownOne.intersects(SignBit))
+ if (Known.One[BitWidth - ShAmt - 1])
// New bits are known one.
- KnownOne |= HighBits;
+ Known.One.setHighBits(ShAmt);
}
break;
case ISD::SIGN_EXTEND_INREG: {
@@ -993,7 +1016,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
return TLO.CombineTo(Op, Op.getOperand(0));
APInt InSignBit =
- APInt::getSignBit(ExVT.getScalarSizeInBits()).zext(BitWidth);
+ APInt::getSignMask(ExVT.getScalarSizeInBits()).zext(BitWidth);
APInt InputDemandedBits =
APInt::getLowBitsSet(BitWidth,
ExVT.getScalarSizeInBits()) &
@@ -1004,24 +1027,24 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
InputDemandedBits |= InSignBit;
if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits,
- KnownZero, KnownOne, TLO, Depth+1))
+ Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// If the sign bit of the input is known set or clear, then we know the
// top bits of the result.
// If the input sign bit is known zero, convert this into a zero extension.
- if (KnownZero.intersects(InSignBit))
+ if (Known.Zero.intersects(InSignBit))
return TLO.CombineTo(Op, TLO.DAG.getZeroExtendInReg(
Op.getOperand(0), dl, ExVT.getScalarType()));
- if (KnownOne.intersects(InSignBit)) { // Input sign bit known set
- KnownOne |= NewBits;
- KnownZero &= ~NewBits;
+ if (Known.One.intersects(InSignBit)) { // Input sign bit known set
+ Known.One |= NewBits;
+ Known.Zero &= ~NewBits;
} else { // Input sign bit unknown
- KnownZero &= ~NewBits;
- KnownOne &= ~NewBits;
+ Known.Zero &= ~NewBits;
+ Known.One &= ~NewBits;
}
break;
}
@@ -1032,22 +1055,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
APInt MaskLo = NewMask.getLoBits(HalfBitWidth).trunc(HalfBitWidth);
APInt MaskHi = NewMask.getHiBits(HalfBitWidth).trunc(HalfBitWidth);
- APInt KnownZeroLo, KnownOneLo;
- APInt KnownZeroHi, KnownOneHi;
+ KnownBits KnownLo, KnownHi;
- if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownZeroLo,
- KnownOneLo, TLO, Depth + 1))
+ if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownLo, TLO, Depth + 1))
return true;
- if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownZeroHi,
- KnownOneHi, TLO, Depth + 1))
+ if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownHi, TLO, Depth + 1))
return true;
- KnownZero = KnownZeroLo.zext(BitWidth) |
- KnownZeroHi.zext(BitWidth).shl(HalfBitWidth);
+ Known.Zero = KnownLo.Zero.zext(BitWidth) |
+ KnownHi.Zero.zext(BitWidth).shl(HalfBitWidth);
- KnownOne = KnownOneLo.zext(BitWidth) |
- KnownOneHi.zext(BitWidth).shl(HalfBitWidth);
+ Known.One = KnownLo.One.zext(BitWidth) |
+ KnownHi.One.zext(BitWidth).shl(HalfBitWidth);
break;
}
case ISD::ZERO_EXTEND: {
@@ -1062,20 +1082,18 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
Op.getValueType(),
Op.getOperand(0)));
- if (SimplifyDemandedBits(Op.getOperand(0), InMask,
- KnownZero, KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
- KnownZero |= NewBits;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known = Known.zext(BitWidth);
+ Known.Zero |= NewBits;
break;
}
case ISD::SIGN_EXTEND: {
EVT InVT = Op.getOperand(0).getValueType();
unsigned InBits = InVT.getScalarSizeInBits();
APInt InMask = APInt::getLowBitsSet(BitWidth, InBits);
- APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits);
+ APInt InSignBit = APInt::getOneBitSet(BitWidth, InBits - 1);
APInt NewBits = ~InMask & NewMask;
// If none of the top bits are demanded, convert this into an any_extend.
@@ -1090,37 +1108,34 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
InDemandedBits |= InSignBit;
InDemandedBits = InDemandedBits.trunc(InBits);
- if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, KnownZero,
- KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, Known, TLO,
+ Depth+1))
return true;
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
+ Known = Known.zext(BitWidth);
// If the sign bit is known zero, convert this to a zero extend.
- if (KnownZero.intersects(InSignBit))
+ if (Known.Zero.intersects(InSignBit))
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl,
Op.getValueType(),
Op.getOperand(0)));
// If the sign bit is known one, the top bits match.
- if (KnownOne.intersects(InSignBit)) {
- KnownOne |= NewBits;
- assert((KnownZero & NewBits) == 0);
+ if (Known.One.intersects(InSignBit)) {
+ Known.One |= NewBits;
+ assert((Known.Zero & NewBits) == 0);
} else { // Otherwise, top bits aren't known.
- assert((KnownOne & NewBits) == 0);
- assert((KnownZero & NewBits) == 0);
+ assert((Known.One & NewBits) == 0);
+ assert((Known.Zero & NewBits) == 0);
}
break;
}
case ISD::ANY_EXTEND: {
unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
APInt InMask = NewMask.trunc(OperandBitWidth);
- if (SimplifyDemandedBits(Op.getOperand(0), InMask,
- KnownZero, KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known = Known.zext(BitWidth);
break;
}
case ISD::TRUNCATE: {
@@ -1128,11 +1143,9 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// zero/one bits live out.
unsigned OperandBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
APInt TruncMask = NewMask.zext(OperandBitWidth);
- if (SimplifyDemandedBits(Op.getOperand(0), TruncMask,
- KnownZero, KnownOne, TLO, Depth+1))
+ if (SimplifyDemandedBits(Op.getOperand(0), TruncMask, Known, TLO, Depth+1))
return true;
- KnownZero = KnownZero.trunc(BitWidth);
- KnownOne = KnownOne.trunc(BitWidth);
+ Known = Known.trunc(BitWidth);
// If the input is only used by this truncate, see if we can shrink it based
// on the known demanded bits.
@@ -1158,26 +1171,29 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
getShiftAmountTy(Op.getValueType(), DL));
}
- APInt HighBits = APInt::getHighBitsSet(OperandBitWidth,
- OperandBitWidth - BitWidth);
- HighBits = HighBits.lshr(ShAmt->getZExtValue()).trunc(BitWidth);
-
- if (ShAmt->getZExtValue() < BitWidth && !(HighBits & NewMask)) {
- // None of the shifted in bits are needed. Add a truncate of the
- // shift input, then shift it.
- SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl,
- Op.getValueType(),
- In.getOperand(0));
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
- Op.getValueType(),
- NewTrunc,
- Shift));
+ if (ShAmt->getZExtValue() < BitWidth) {
+ APInt HighBits = APInt::getHighBitsSet(OperandBitWidth,
+ OperandBitWidth - BitWidth);
+ HighBits.lshrInPlace(ShAmt->getZExtValue());
+ HighBits = HighBits.trunc(BitWidth);
+
+ if (!(HighBits & NewMask)) {
+ // None of the shifted in bits are needed. Add a truncate of the
+ // shift input, then shift it.
+ SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl,
+ Op.getValueType(),
+ In.getOperand(0));
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
+ Op.getValueType(),
+ NewTrunc,
+ Shift));
+ }
}
break;
}
}
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
case ISD::AssertZext: {
@@ -1187,11 +1203,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
APInt InMask = APInt::getLowBitsSet(BitWidth,
VT.getSizeInBits());
if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask,
- KnownZero, KnownOne, TLO, Depth+1))
+ Known, TLO, Depth+1))
return true;
- assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- KnownZero |= ~InMask & NewMask;
+ Known.Zero |= ~InMask;
break;
}
case ISD::BITCAST:
@@ -1200,7 +1216,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
if (!TLO.LegalOperations() &&
!Op.getValueType().isVector() &&
!Op.getOperand(0).getValueType().isVector() &&
- NewMask == APInt::getSignBit(Op.getValueSizeInBits()) &&
+ NewMask == APInt::getSignMask(Op.getValueSizeInBits()) &&
Op.getOperand(0).getValueType().isFloatingPoint()) {
bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType());
bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
@@ -1229,22 +1245,19 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
// of the highest bit demanded of them.
APInt LoMask = APInt::getLowBitsSet(BitWidth,
BitWidth - NewMask.countLeadingZeros());
- if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2,
- KnownOne2, TLO, Depth+1) ||
- SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2,
- KnownOne2, TLO, Depth+1) ||
+ if (SimplifyDemandedBits(Op.getOperand(0), LoMask, Known2, TLO, Depth+1) ||
+ SimplifyDemandedBits(Op.getOperand(1), LoMask, Known2, TLO, Depth+1) ||
// See if the operation should be performed at a smaller bit width.
- TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl)) {
- const SDNodeFlags *Flags = Op.getNode()->getFlags();
- if (Flags->hasNoSignedWrap() || Flags->hasNoUnsignedWrap()) {
+ ShrinkDemandedOp(Op, BitWidth, NewMask, TLO)) {
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+ if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
// Disable the nsw and nuw flags. We can no longer guarantee that we
// won't wrap after simplification.
- SDNodeFlags NewFlags = *Flags;
- NewFlags.setNoSignedWrap(false);
- NewFlags.setNoUnsignedWrap(false);
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1),
- &NewFlags);
+ Flags);
return TLO.CombineTo(Op, NewOp);
}
return true;
@@ -1253,13 +1266,13 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
}
default:
// Just use computeKnownBits to compute output bits.
- TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth);
+ TLO.DAG.computeKnownBits(Op, Known, Depth);
break;
}
// If we know the value of all of the demanded bits, return this as a
// constant.
- if ((NewMask & (KnownZero|KnownOne)) == NewMask) {
+ if (NewMask.isSubsetOf(Known.Zero|Known.One)) {
// Avoid folding to a constant if any OpaqueConstant is involved.
const SDNode *N = Op.getNode();
for (SDNodeIterator I = SDNodeIterator::begin(N),
@@ -1270,17 +1283,17 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
return false;
}
return TLO.CombineTo(Op,
- TLO.DAG.getConstant(KnownOne, dl, Op.getValueType()));
+ TLO.DAG.getConstant(Known.One, dl, Op.getValueType()));
}
return false;
}
/// Determine which of the bits specified in Mask are known to be either zero or
-/// one and return them in the KnownZero/KnownOne bitsets.
+/// one and return them in the Known.
void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
@@ -1289,12 +1302,13 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Op.getOpcode() == ISD::INTRINSIC_VOID) &&
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
- KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
+ Known.resetAll();
}
/// This method can be implemented by targets that want to expose additional
/// information about sign bits to the DAG Combiner.
unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &,
const SelectionDAG &,
unsigned Depth) const {
assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
@@ -1306,31 +1320,38 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
return 1;
}
+// FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that
+// would first need to handle truncating build vectors and vectors with
+// elements narrower than 8 bits.
bool TargetLowering::isConstTrueVal(const SDNode *N) const {
if (!N)
return false;
- const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
- if (!CN) {
- const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
- if (!BV)
- return false;
-
- // Only interested in constant splats, we don't care about undef
- // elements in identifying boolean constants and getConstantSplatNode
- // returns NULL if all ops are undef;
- CN = BV->getConstantSplatNode();
+ APInt CVal;
+ if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+ CVal = CN->getAPIntValue();
+ } else if (auto *BV = dyn_cast<BuildVectorSDNode>(N)) {
+ auto *CN = BV->getConstantSplatNode();
if (!CN)
return false;
+
+ // If this is a truncating build vector, truncate the splat value.
+ // Otherwise, we may fail to match the expected values below.
+ unsigned BVEltWidth = BV->getValueType(0).getScalarSizeInBits();
+ CVal = CN->getAPIntValue();
+ if (BVEltWidth < CVal.getBitWidth())
+ CVal = CVal.trunc(BVEltWidth);
+ } else {
+ return false;
}
switch (getBooleanContents(N->getValueType(0))) {
case UndefinedBooleanContent:
- return CN->getAPIntValue()[0];
+ return CVal[0];
case ZeroOrOneBooleanContent:
- return CN->isOne();
+ return CVal.isOneValue();
case ZeroOrNegativeOneBooleanContent:
- return CN->isAllOnesValue();
+ return CVal.isAllOnesValue();
}
llvm_unreachable("Invalid boolean contents");
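
An aside on why the truncation above matters: a BUILD_VECTOR may legally hold splat constants wider than its element type, so the boolean-contents checks must run at element width. A hypothetical check with plain integers standing in for APInt (illustrative only, not LLVM API):

    #include <cstdint>

    // Hypothetical v4i8 build_vector whose operands are i16 constants.
    // ZeroOrNegativeOneBooleanContent wants "all ones", which only holds
    // after truncating the splat value to the element width.
    int main() {
      uint16_t SplatVal = 0x00FF;                      // all-ones as i8 only
      bool WideAllOnes = (SplatVal == 0xFFFFu);        // false: misses the match
      uint8_t EltVal = static_cast<uint8_t>(SplatVal); // cf. CVal.trunc(BVEltWidth)
      bool EltAllOnes = (EltVal == 0xFFu);             // true: correct answer
      return (!WideAllOnes && EltAllOnes) ? 0 : 1;
    }
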
@@ -1472,8 +1493,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
- // Ensure that the constant occurs on the RHS, and fold constant
- // comparisons.
+ // Ensure that the constant occurs on the RHS and fold constant comparisons.
ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
if (isa<ConstantSDNode>(N0.getNode()) &&
(DCI.isBeforeLegalizeOps() ||
@@ -1486,7 +1506,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
// equality comparison, then we're just comparing whether X itself is
// zero.
- if (N0.getOpcode() == ISD::SRL && (C1 == 0 || C1 == 1) &&
+ if (N0.getOpcode() == ISD::SRL && (C1.isNullValue() || C1.isOneValue()) &&
N0.getOperand(0).getOpcode() == ISD::CTLZ &&
N0.getOperand(1).getOpcode() == ISD::Constant) {
const APInt &ShAmt
@@ -1617,14 +1637,13 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0),
TopSetCC.getOperand(1),
InvCond);
-
}
}
}
- // If the LHS is '(and load, const)', the RHS is 0,
- // the test is for equality or unsigned, and all 1 bits of the const are
- // in the same partial word, see if we can shorten the load.
+ // If the LHS is '(and load, const)', the RHS is 0, the test is for
+ // equality or unsigned, and all 1 bits of the const are in the same
+ // partial word, see if we can shorten the load.
if (DCI.isBeforeLegalize() &&
!ISD::isSignedIntSetCC(Cond) &&
N0.getOpcode() == ISD::AND && C1 == 0 &&
@@ -1647,16 +1666,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
for (unsigned width = origWidth / 2; width>=8; width /= 2) {
APInt newMask = APInt::getLowBitsSet(maskWidth, width);
for (unsigned offset=0; offset<origWidth/width; offset++) {
- if ((newMask & Mask) == Mask) {
- if (!DAG.getDataLayout().isLittleEndian())
- bestOffset = (origWidth/width - offset - 1) * (width/8);
- else
+ if (Mask.isSubsetOf(newMask)) {
+ if (DAG.getDataLayout().isLittleEndian())
bestOffset = (uint64_t)offset * (width/8);
+ else
+ bestOffset = (origWidth/width - offset - 1) * (width/8);
bestMask = Mask.lshr(offset * (width/8) * 8);
bestWidth = width;
break;
}
- newMask = newMask << width;
+ newMask <<= width;
}
}
}
@@ -1692,10 +1711,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
switch (Cond) {
case ISD::SETUGT:
case ISD::SETUGE:
- case ISD::SETEQ: return DAG.getConstant(0, dl, VT);
+ case ISD::SETEQ:
+ return DAG.getConstant(0, dl, VT);
case ISD::SETULT:
case ISD::SETULE:
- case ISD::SETNE: return DAG.getConstant(1, dl, VT);
+ case ISD::SETNE:
+ return DAG.getConstant(1, dl, VT);
case ISD::SETGT:
case ISD::SETGE:
// True if the sign bit of C1 is set.
@@ -1764,12 +1785,12 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
ExtSrcTyBits),
dl, ExtDstTy),
Cond);
- } else if ((N1C->isNullValue() || N1C->getAPIntValue() == 1) &&
+ } else if ((N1C->isNullValue() || N1C->isOne()) &&
(Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
// SETCC (SETCC), [0|1], [EQ|NE] -> SETCC
if (N0.getOpcode() == ISD::SETCC &&
isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) {
- bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (N1C->getAPIntValue() != 1);
+ bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (!N1C->isOne());
if (TrueWhenTrue)
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
// Invert the condition.
@@ -1786,7 +1807,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
N0.getOperand(0).getOpcode() == ISD::XOR &&
N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
isa<ConstantSDNode>(N0.getOperand(1)) &&
- cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue() == 1) {
+ cast<ConstantSDNode>(N0.getOperand(1))->isOne()) {
// If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We
// can only do this if the top bits are known zero.
unsigned BitWidth = N0.getValueSizeInBits();
@@ -1795,9 +1816,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
BitWidth-1))) {
// Okay, get the un-inverted input value.
SDValue Val;
- if (N0.getOpcode() == ISD::XOR)
+ if (N0.getOpcode() == ISD::XOR) {
Val = N0.getOperand(0);
- else {
+ } else {
assert(N0.getOpcode() == ISD::AND &&
N0.getOperand(0).getOpcode() == ISD::XOR);
// ((X^1)&1)^1 -> X & 1
@@ -1809,7 +1830,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getSetCC(dl, VT, Val, N1,
Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
}
- } else if (N1C->getAPIntValue() == 1 &&
+ } else if (N1C->isOne() &&
(VT == MVT::i1 ||
getBooleanContents(N0->getValueType(0)) ==
ZeroOrOneBooleanContent)) {
@@ -1827,7 +1848,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
if (Op0.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(Op0.getOperand(1)) &&
- cast<ConstantSDNode>(Op0.getOperand(1))->getAPIntValue() == 1) {
+ cast<ConstantSDNode>(Op0.getOperand(1))->isOne()) {
// If this is (X&1) == / != 1, normalize it to (X&1) != / == 0.
if (Op0.getValueType().bitsGT(VT))
Op0 = DAG.getNode(ISD::AND, dl, VT,
@@ -1862,7 +1883,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// Canonicalize GE/LE comparisons to use GT/LT comparisons.
if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
- if (C1 == MinVal) return DAG.getConstant(1, dl, VT); // X >= MIN --> true
+ // X >= MIN --> true
+ if (C1 == MinVal)
+ return DAG.getConstant(1, dl, VT);
+
// X >= C0 --> X > (C0 - 1)
APInt C = C1 - 1;
ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
@@ -1877,7 +1901,10 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
- if (C1 == MaxVal) return DAG.getConstant(1, dl, VT); // X <= MAX --> true
+ // X <= MAX --> true
+ if (C1 == MaxVal)
+ return DAG.getConstant(1, dl, VT);
+
// X <= C0 --> X < (C0 + 1)
APInt C = C1 + 1;
ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
@@ -2006,7 +2033,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
} else {
ShiftBits = C1.countTrailingZeros();
}
- NewC = NewC.lshr(ShiftBits);
+ NewC.lshrInPlace(ShiftBits);
if (ShiftBits && NewC.getMinSignedBits() <= 64 &&
isLegalICmpImmediate(NewC.getSExtValue())) {
auto &DL = DAG.getDataLayout();
@@ -2050,6 +2077,16 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (Cond == ISD::SETO || Cond == ISD::SETUO)
return DAG.getSetCC(dl, VT, N0, N0, Cond);
+ // setcc (fneg x), C -> setcc swap(pred) x, -C
+ if (N0.getOpcode() == ISD::FNEG) {
+ ISD::CondCode SwapCond = ISD::getSetCCSwappedOperands(Cond);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(SwapCond, N0.getSimpleValueType())) {
+ SDValue NegN1 = DAG.getNode(ISD::FNEG, dl, N0.getValueType(), N1);
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), NegN1, SwapCond);
+ }
+ }
+
// If the condition is not legal, see if we can find an equivalent one
// which is legal.
if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) {
@@ -2129,7 +2166,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
if (N0.getOperand(1) == N1.getOperand(1))
return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
- if (DAG.isCommutativeBinOp(N0.getOpcode())) {
+ if (isCommutativeBinOp(N0.getOpcode())) {
// If X op Y == Y op X, try other combinations.
if (N0.getOperand(0) == N1.getOperand(1))
return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0),
@@ -2193,7 +2230,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getSetCC(dl, VT, N0.getOperand(1),
DAG.getConstant(0, dl, N0.getValueType()), Cond);
if (N0.getOperand(1) == N1) {
- if (DAG.isCommutativeBinOp(N0.getOpcode()))
+ if (isCommutativeBinOp(N0.getOpcode()))
return DAG.getSetCC(dl, VT, N0.getOperand(0),
DAG.getConstant(0, dl, N0.getValueType()),
Cond);
@@ -2220,7 +2257,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
return DAG.getSetCC(dl, VT, N1.getOperand(1),
DAG.getConstant(0, dl, N1.getValueType()), Cond);
if (N1.getOperand(1) == N0) {
- if (DAG.isCommutativeBinOp(N1.getOpcode()))
+ if (isCommutativeBinOp(N1.getOpcode()))
return DAG.getSetCC(dl, VT, N1.getOperand(0),
DAG.getConstant(0, dl, N1.getValueType()), Cond);
if (N1.getNode()->hasOneUse()) {
@@ -2445,7 +2482,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// gcc prints these as sign extended. Sign extend value to 64 bits
// now; without this it would get ZExt'd later in
// ScheduleDAGSDNodes::EmitNode, which is very generic.
- Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(),
+ Ops.push_back(DAG.getTargetConstant(C->getSExtValue(),
SDLoc(C), MVT::i64));
}
return;
@@ -2470,13 +2507,10 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr));
// Figure out which register class contains this reg.
- for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
- E = RI->regclass_end(); RCI != E; ++RCI) {
- const TargetRegisterClass *RC = *RCI;
-
+ for (const TargetRegisterClass *RC : RI->regclasses()) {
// If none of the value types for this register class are valid, we
// can't use it. For example, 64-bit reg classes on 32-bit targets.
- if (!isLegalRC(RC))
+ if (!isLegalRC(*RI, *RC))
continue;
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
@@ -2488,9 +2522,9 @@ TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
// If this register class has the requested value type, return it,
// otherwise keep searching and return the first class found
// if no other is found which explicitly has the requested type.
- if (RC->hasType(VT))
+ if (RI->isTypeLegalForClass(*RC, VT))
return S;
- else if (!R.second)
+ if (!R.second)
R = S;
}
}
@@ -2914,9 +2948,9 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
DAG.getDataLayout()));
SDNodeFlags Flags;
Flags.setExact(true);
- Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, &Flags);
+ Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, Flags);
Created.push_back(Op1.getNode());
- d = d.ashr(ShAmt);
+ d.ashrInPlace(ShAmt);
}
// Calculate the multiplicative inverse, using Newton's method.
@@ -2933,7 +2967,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
@@ -2958,7 +2992,7 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor,
return SDValue();
// If the sdiv has an 'exact' bit we can use a simpler lowering.
- if (cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact())
+ if (N->getFlags().hasExact())
return BuildExactSDIV(*this, N->getOperand(0), Divisor, dl, DAG, *Created);
APInt::ms magics = Divisor.magic();
@@ -3297,7 +3331,7 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT);
SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT);
SDValue Bias = DAG.getConstant(127, dl, IntVT);
- SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()), dl,
+ SDValue SignMask = DAG.getConstant(APInt::getSignMask(VT.getSizeInBits()), dl,
IntVT);
SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, dl, IntVT);
SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT);
@@ -3808,7 +3842,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode());
- CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
+ CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
// TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
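
A note for readers tracking the recurring KnownZero/KnownOne changes in the TargetLowering hunks above: they migrate from two loose APInt out-parameters to a single KnownBits struct. A minimal sketch of that bookkeeping, with uint64_t standing in for APInt (illustrative, not the real llvm/Support/KnownBits.h):

    #include <cstdint>

    // Sketch only: 64-bit masks stand in for llvm::APInt.
    struct KnownBitsSketch {
      uint64_t Zero = 0; // bit set => that result bit is known to be 0
      uint64_t One = 0;  // bit set => that result bit is known to be 1

      // Replaces the old assert((KnownZero & KnownOne) == 0 && ...) pattern.
      bool hasConflict() const { return (Zero & One) != 0; }

      // cf. Known.resetAll() in computeKnownBitsForTargetNode above.
      void resetAll() { Zero = One = 0; }

      // cf. Known.Zero.lshrInPlace(ShAmt) / Known.One.lshrInPlace(ShAmt):
      // shifting both masks together keeps the pair in sync.
      void lshrInPlace(unsigned ShAmt) { Zero >>= ShAmt; One >>= ShAmt; }
    };

    // The "everything demanded is known" fold then reads as a subset test,
    // mirroring NewMask.isSubsetOf(Known.Zero | Known.One):
    inline bool allDemandedKnown(uint64_t NewMask, const KnownBitsSketch &K) {
      return (NewMask & ~(K.Zero | K.One)) == 0;
    }
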
diff --git a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp
index ff7d205..7b60d22 100644
--- a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp
+++ b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp
@@ -16,9 +16,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -27,7 +27,7 @@
using namespace llvm;
-#define DEBUG_TYPE "shadowstackgclowering"
+#define DEBUG_TYPE "shadow-stack-gc-lowering"
namespace {
@@ -66,10 +66,10 @@ private:
};
}
-INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE,
"Shadow Stack GC Lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
-INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+INITIALIZE_PASS_END(ShadowStackGCLowering, DEBUG_TYPE,
"Shadow Stack GC Lowering", false, false)
FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGCLowering(); }
diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
index 4837495..aa75f5e 100644
--- a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -210,13 +210,12 @@ public:
char ShrinkWrap::ID = 0;
char &llvm::ShrinkWrapID = ShrinkWrap::ID;
-INITIALIZE_PASS_BEGIN(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false,
- false)
+INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false)
+INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false)
bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
RegScavenger *RS) const {
@@ -282,8 +281,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
if (!Restore)
Restore = &MBB;
- else
+ else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it
+ // means the block never returns. If that's the
+ // case, we don't want to call
+ // `findNearestCommonDominator`, which will
+ // return `Restore`.
Restore = MPDT->findNearestCommonDominator(Restore, &MBB);
+ else
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
// Make sure we would be able to insert the restore code before the
// terminator.
@@ -293,7 +298,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
continue;
// One of the terminator needs to happen before the restore point.
if (MBB.succ_empty()) {
- Restore = nullptr;
+ Restore = nullptr; // Abort, we can't find a restore point in this case.
break;
}
// Look for a restore point that post-dominates all the successors.
@@ -419,7 +424,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF,
}
bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
- if (MF.empty() || !isShrinkWrapEnabled(MF))
+ if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF))
return false;
DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
index 209bbe5..17a3a84 100644
--- a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -12,11 +12,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -64,6 +64,7 @@ public:
private:
bool setupEntryBlockAndCallSites(Function &F);
+ bool undoSwiftErrorSelect(Function &F);
void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
void lowerIncomingArguments(Function &F);
@@ -73,7 +74,7 @@ private:
} // end anonymous namespace
char SjLjEHPrepare::ID = 0;
-INITIALIZE_PASS(SjLjEHPrepare, "sjljehprepare", "Prepare SjLj exceptions",
+INITIALIZE_PASS(SjLjEHPrepare, DEBUG_TYPE, "Prepare SjLj exceptions",
false, false)
// Public Interface To the SjLjEHPrepare pass.
@@ -92,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
doubleUnderDataTy, // __data
VoidPtrTy, // __personality
VoidPtrTy, // __lsda
- doubleUnderJBufTy, // __jbuf
- nullptr);
+ doubleUnderJBufTy // __jbuf
+ );
return true;
}
@@ -124,8 +125,11 @@ static void MarkBlocksLiveIn(BasicBlock *BB,
if (!LiveBBs.insert(BB).second)
return; // already been here.
- for (BasicBlock *PredBB : predecessors(BB))
- MarkBlocksLiveIn(PredBB, LiveBBs);
+ df_iterator_default_set<BasicBlock*> Visited;
+
+ for (BasicBlock *B : inverse_depth_first_ext(BB, Visited))
+ LiveBBs.insert(B);
+
}
/// substituteLPadValues - Substitute the values returned by the landingpad
@@ -174,8 +178,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
// because the value needs to be added to the global context list.
auto &DL = F.getParent()->getDataLayout();
unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy);
- FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
- &EntryBB->front());
+ FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(),
+ nullptr, Align, "fn_context", &EntryBB->front());
// Fill in the function context structure.
for (LandingPadInst *LPI : LPads) {
@@ -458,14 +462,33 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
return true;
}
+bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) {
+ // We have inserted dummy copies 'select true, arg, undef' in the entry block
+ // for arguments to simplify this pass.
+ // swifterror arguments cannot be used in this way. Undo the select for the
+ // swifterror argument.
+ for (auto &AI : F.args()) {
+ if (AI.isSwiftError()) {
+ assert(AI.hasOneUse() && "Must have converted the argument to a select");
+ auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser());
+ assert(Select && "There must be a single select user");
+ auto *OrigSwiftError = cast<Argument>(Select->getTrueValue());
+ Select->replaceAllUsesWith(OrigSwiftError);
+ Select->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+}
+
bool SjLjEHPrepare::runOnFunction(Function &F) {
Module &M = *F.getParent();
RegisterFn = M.getOrInsertFunction(
"_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy), nullptr);
+ PointerType::getUnqual(FunctionContextTy));
UnregisterFn = M.getOrInsertFunction(
"_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(FunctionContextTy), nullptr);
+ PointerType::getUnqual(FunctionContextTy));
FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
@@ -476,5 +499,7 @@ bool SjLjEHPrepare::runOnFunction(Function &F) {
FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
bool Res = setupEntryBlockAndCallSites(F);
+ if (Res)
+ Res |= undoSwiftErrorSelect(F);
return Res;
}
diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
index dba103e9..3656832 100644
--- a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
#define DEBUG_TYPE "slotindexes"
char SlotIndexes::ID = 0;
-INITIALIZE_PASS(SlotIndexes, "slotindexes",
+INITIALIZE_PASS(SlotIndexes, DEBUG_TYPE,
"Slot index numbering", false, false)
STATISTIC(NumLocalRenum, "Number of local renumberings");
@@ -103,6 +103,48 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
return false;
}
+void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) {
+ assert(!MI.isBundledWithPred() &&
+ "Use removeSingleMachineInstrFromMaps() instread");
+ Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+ if (mi2iItr == mi2iMap.end())
+ return;
+
+ SlotIndex MIIndex = mi2iItr->second;
+ IndexListEntry &MIEntry = *MIIndex.listEntry();
+ assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+ mi2iMap.erase(mi2iItr);
+ // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr);
+}
+
+void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) {
+ Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+ if (mi2iItr == mi2iMap.end())
+ return;
+
+ SlotIndex MIIndex = mi2iItr->second;
+ IndexListEntry &MIEntry = *MIIndex.listEntry();
+ assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+ mi2iMap.erase(mi2iItr);
+
+ // When removing the first instruction of a bundle, update the mapping to
+ // the next instruction.
+ if (MI.isBundledWithSucc()) {
+ // Only the first instruction of a bundle should have an index assigned.
+ assert(!MI.isBundledWithPred() && "Should have first bundle instruction");
+
+ MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator());
+ MachineInstr &NextMI = *Next;
+ MIEntry.setInstr(&NextMI);
+ mi2iMap.insert(std::make_pair(&NextMI, MIIndex));
+ return;
+ } else {
+ // FIXME: Eventually we want to actually delete these indexes.
+ MIEntry.setInstr(nullptr);
+ }
+}
+
void SlotIndexes::renumberIndexes() {
// Renumber updates the index of every element of the index list.
DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n");
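
The essential bookkeeping in removeSingleMachineInstrFromMaps above is the index handoff: only a bundle's first instruction owns a slot index, so deleting that head must pass the index to its successor. A standalone sketch of the same pattern, with a plain map and stand-in types (not LLVM API):

    #include <unordered_map>

    struct Instr {              // stand-in for MachineInstr
      Instr *NextInBundle = nullptr;
    };
    using SlotIdx = unsigned;   // stand-in for SlotIndex

    // Mirrors the map update: erase MI, and if it headed a bundle, remap
    // its index onto the next instruction so later lookups keep working.
    void removeSingle(std::unordered_map<Instr *, SlotIdx> &Mi2Idx, Instr &MI) {
      auto It = Mi2Idx.find(&MI);
      if (It == Mi2Idx.end())
        return;
      SlotIdx Idx = It->second;
      Mi2Idx.erase(It);
      if (MI.NextInBundle)
        Mi2Idx.emplace(MI.NextInBundle, Idx);
    }
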
diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
index f10c98e..0abe1c4 100644
--- a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
+++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -40,14 +40,14 @@
using namespace llvm;
-#define DEBUG_TYPE "spillplacement"
+#define DEBUG_TYPE "spill-code-placement"
char SpillPlacement::ID = 0;
-INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement",
+INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE,
"Spill Code Placement Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(SpillPlacement, "spill-code-placement",
+INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE,
"Spill Code Placement Analysis", true, true)
char &llvm::SpillPlacementID = SpillPlacement::ID;
@@ -310,7 +310,7 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
bool SpillPlacement::scanActiveBundles() {
RecentPositive.clear();
- for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
+ for (unsigned n : ActiveNodes->set_bits()) {
update(n);
// A node that must spill, or a node without any links is not going to
// change its value ever again, so exclude it from iterations.
@@ -365,7 +365,7 @@ SpillPlacement::finish() {
// Write preferences back to ActiveNodes.
bool Perfect = true;
- for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n))
+ for (unsigned n : ActiveNodes->set_bits())
if (!nodes[n].preferReg()) {
ActiveNodes->reset(n);
Perfect = false;
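
The SpillPlacement hunks above only swap iteration styles: the find_first()/find_next() walk and the set_bits() range visit the same bit positions. A small equivalence check over a plain mask (standard library plus the GCC/Clang ctz builtin):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      uint32_t Mask = 0xA6; // arbitrary test pattern
      std::vector<unsigned> Linear, SetBits;
      // In the spirit of find_first()/find_next(n): scan every position.
      for (unsigned n = 0; n < 32; ++n)
        if (Mask & (1u << n))
          Linear.push_back(n);
      // In the spirit of set_bits(): jump from set bit to set bit.
      for (uint32_t m = Mask; m; m &= m - 1) // clear lowest set bit
        SetBits.push_back(static_cast<unsigned>(__builtin_ctz(m)));
      assert(Linear == SetBits);
      return 0;
    }
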
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp
index 1c6a84e..323045f 100644
--- a/contrib/llvm/lib/CodeGen/SplitKit.cpp
+++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -52,10 +53,10 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
std::pair<SlotIndex, SlotIndex> &LIP = LastInsertPoint[Num];
SlotIndex MBBEnd = LIS.getMBBEndIdx(&MBB);
- SmallVector<const MachineBasicBlock *, 1> EHPadSucessors;
+ SmallVector<const MachineBasicBlock *, 1> EHPadSuccessors;
for (const MachineBasicBlock *SMBB : MBB.successors())
if (SMBB->isEHPad())
- EHPadSucessors.push_back(SMBB);
+ EHPadSuccessors.push_back(SMBB);
// Compute insert points on the first call. The pair is independent of the
// current live interval.
@@ -67,7 +68,7 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
LIP.first = LIS.getInstructionIndex(*FirstTerm);
// If there is a landing pad successor, also find the call instruction.
- if (EHPadSucessors.empty())
+ if (EHPadSuccessors.empty())
return LIP.first;
// There may not be a call instruction (?) in which case we ignore LPad.
LIP.second = LIP.first;
@@ -86,7 +87,7 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
if (!LIP.second)
return LIP.first;
- if (none_of(EHPadSucessors, [&](const MachineBasicBlock *EHPad) {
+ if (none_of(EHPadSuccessors, [&](const MachineBasicBlock *EHPad) {
return LIS.isLiveInToMBB(CurLI, EHPad);
}))
return LIP.first;
@@ -487,12 +488,125 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
VFP = ValueForcePair(nullptr, true);
}
+SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
+ const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ bool FirstCopy = !Def.isValid();
+ MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
+ .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy)
+ | getInternalReadRegState(!FirstCopy), SubIdx)
+ .addReg(FromReg, 0, SubIdx);
+
+ BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+ if (FirstCopy) {
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+ } else {
+ CopyMI->bundleWithPred();
+ }
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx);
+ DestLI.refineSubRanges(Allocator, LaneMask,
+ [Def, &Allocator](LiveInterval::SubRange& SR) {
+ SR.createDeadDef(Def, Allocator);
+ });
+ return Def;
+}
+
+SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg,
+ LaneBitmask LaneMask, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
+ const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+ if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
+ // The full vreg is copied.
+ MachineInstr *CopyMI =
+ BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
+ SlotIndexes &Indexes = *LIS.getSlotIndexes();
+ return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+ }
+
+ // Only a subset of lanes needs to be copied. The following is a simple
+ // heuristic to construct a sequence of COPYs. We could add a target
+ // specific callback if this turns out to be suboptimal.
+ LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx));
+
+ // First pass: Try to find a perfectly matching subregister index. If none
+ // exists find the one covering the most lanemask bits.
+ SmallVector<unsigned, 8> PossibleIndexes;
+ unsigned BestIdx = 0;
+ unsigned BestCover = 0;
+ const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
+ assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
+ for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) {
+ // Is this index even compatible with the given class?
+ if (TRI.getSubClassWithSubReg(RC, Idx) != RC)
+ continue;
+ LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LaneMask) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // The index must not cover any lanes outside \p LaneMask.
+ if ((SubRegMask & ~LaneMask).any())
+ continue;
+
+ unsigned PopCount = countPopulation(SubRegMask.getAsInteger());
+ PossibleIndexes.push_back(Idx);
+ if (PopCount > BestCover) {
+ BestCover = PopCount;
+ BestIdx = Idx;
+ }
+ }
+
+ // Abort if we cannot possibly implement the COPY with the given indexes.
+ if (BestIdx == 0)
+ report_fatal_error("Impossible to implement partial COPY");
+
+ SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore,
+ BestIdx, DestLI, Late, SlotIndex());
+
+ // Greedy heuristic: keep iterating, choosing the best covering subreg
+ // index each time.
+ LaneBitmask LanesLeft = LaneMask & ~(TRI.getSubRegIndexLaneMask(BestIdx));
+ while (LanesLeft.any()) {
+ unsigned BestIdx = 0;
+ int BestCover = INT_MIN;
+ for (unsigned Idx : PossibleIndexes) {
+ LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+ // Early exit if we found a perfect match.
+ if (SubRegMask == LanesLeft) {
+ BestIdx = Idx;
+ break;
+ }
+
+ // Try to cover as much of the remaining lanes as possible but
+ // as few of the already covered lanes as possible.
+ int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger())
+ - countPopulation((SubRegMask & ~LanesLeft).getAsInteger());
+ if (Cover > BestCover) {
+ BestCover = Cover;
+ BestIdx = Idx;
+ }
+ }
+
+ if (BestIdx == 0)
+ report_fatal_error("Impossible to implement partial COPY");
+
+ buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
+ DestLI, Late, Def);
+ LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx);
+ }
+
+ return Def;
+}
+
VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
VNInfo *ParentVNI,
SlotIndex UseIdx,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
- MachineInstr *CopyMI = nullptr;
SlotIndex Def;
LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
@@ -505,24 +619,29 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
LiveInterval &OrigLI = LIS.getInterval(Original);
VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+ unsigned Reg = LI->reg;
bool DidRemat = false;
if (OrigVNI) {
LiveRangeEdit::Remat RM(ParentVNI);
RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
- Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
+ Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
++NumRemats;
DidRemat = true;
}
}
if (!DidRemat) {
- // Can't remat, just insert a copy from parent.
- CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
- .addReg(Edit->getReg());
- Def = LIS.getSlotIndexes()
- ->insertMachineInstrInMaps(*CopyMI, Late)
- .getRegSlot();
+ LaneBitmask LaneMask;
+ if (LI->hasSubRanges()) {
+ LaneMask = LaneBitmask::getNone();
+ for (LiveInterval::SubRange &S : LI->subranges())
+ LaneMask |= S.LaneMask;
+ } else {
+ LaneMask = LaneBitmask::getAll();
+ }
+
++NumCopies;
+ Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx);
}
// Define the value in Reg.
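
Stepping back from the SplitKit.cpp hunks: the loop in buildCopy is a greedy set cover over lane masks, repeatedly choosing the subregister index that covers the most still-missing lanes while overlapping the fewest already-covered ones. A self-contained rendering over plain 64-bit masks (stand-in types, no LLVM API; a progress guard is added so the sketch always terminates):

    #include <climits>
    #include <cstdint>
    #include <vector>

    // Candidates[i] holds the lane mask of hypothetical subreg index i + 1.
    // Returns the chosen indexes, or an empty vector if Target is uncoverable,
    // mirroring the report_fatal_error path above.
    std::vector<unsigned> coverLanes(uint64_t Target,
                                     const std::vector<uint64_t> &Candidates) {
      std::vector<unsigned> Chosen;
      uint64_t LanesLeft = Target;
      while (LanesLeft) {
        unsigned BestIdx = 0;
        int BestCover = INT_MIN;
        for (unsigned I = 0; I < Candidates.size(); ++I) {
          uint64_t M = Candidates[I];
          if (M & ~Target)      // must not touch lanes outside the target
            continue;
          if (!(M & LanesLeft)) // must make progress (sketch-only guard)
            continue;
          if (M == LanesLeft) { // perfect match for the remainder
            BestIdx = I + 1;
            break;
          }
          // Prefer masks covering many missing lanes and few covered ones.
          int Cover = __builtin_popcountll(M & LanesLeft) -
                      __builtin_popcountll(M & ~LanesLeft);
          if (Cover > BestCover) {
            BestCover = Cover;
            BestIdx = I + 1;
          }
        }
        if (BestIdx == 0)
          return {}; // no usable index: cannot implement the partial copy
        Chosen.push_back(BestIdx);
        LanesLeft &= ~Candidates[BestIdx - 1];
      }
      return Chosen;
    }
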
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.h b/contrib/llvm/lib/CodeGen/SplitKit.h
index a75738a..9d409e9 100644
--- a/contrib/llvm/lib/CodeGen/SplitKit.h
+++ b/contrib/llvm/lib/CodeGen/SplitKit.h
@@ -405,6 +405,17 @@ private:
/// deleteRematVictims - Delete defs that are dead after rematerializing.
void deleteRematVictims();
+ /// Add a copy instruction copying \p FromReg to \p ToReg before
+ /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it
+ /// necessary to construct a sequence of copies to cover it exactly.
+ SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+ bool Late, unsigned RegIdx);
+
+ SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+ MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
+ unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy);
+
public:
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
/// Newly created intervals will be appended to newIntervals.
diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp
index 89c4b57..e5fc540 100644
--- a/contrib/llvm/lib/CodeGen/StackColoring.cpp
+++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp
@@ -23,7 +23,6 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
@@ -38,6 +37,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
@@ -54,7 +54,7 @@
using namespace llvm;
-#define DEBUG_TYPE "stackcoloring"
+#define DEBUG_TYPE "stack-coloring"
static cl::opt<bool>
DisableColoring("no-stack-coloring",
@@ -87,10 +87,134 @@ STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
STATISTIC(StackSlotMerged, "Number of stack slot merged.");
STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
+//===----------------------------------------------------------------------===//
+// StackColoring Pass
+//===----------------------------------------------------------------------===//
+//
+// Stack Coloring reduces stack usage by merging stack slots when they
+// can't be used together. For example, consider the following C program:
+//
+// void bar(char *, int);
+// void foo(bool var) {
+// A: {
+// char z[4096];
+// bar(z, 0);
+// }
+//
+// char *p;
+// char x[4096];
+// char y[4096];
+// if (var) {
+// p = x;
+// } else {
+// bar(y, 1);
+// p = y + 1024;
+// }
+// B:
+// bar(p, 2);
+// }
+//
+// Naively-compiled, this program would use 12k of stack space. However, the
+// stack slot corresponding to `z` is always destroyed before either of the
+// stack slots for `x` or `y` are used, and then `x` is only used if `var`
+// is true, while `y` is only used if `var` is false. So at no time are 2
+// of the stack slots used together, and therefore we can merge them,
+// compiling the function using only a single 4k alloca:
+//
+// void foo(bool var) { // equivalent
+// char x[4096];
+// char *p;
+// bar(x, 0);
+// if (var) {
+// p = x;
+// } else {
+// bar(x, 1);
+// p = x + 1024;
+// }
+// bar(p, 2);
+// }
+//
+// This is an important optimization if we want stack space to be under
+// control in large functions, both open-coded ones and ones created by
+// inlining.
//
// Implementation Notes:
// ---------------------
//
+// An important part of the above reasoning is that `z` can't be accessed
+// while the latter 2 calls to `bar` are running. This is justified because
+// `z`'s lifetime is over after we exit from block `A:`, so any further
+// accesses to it would be UB. The way we represent this information
+// in LLVM is by having frontends delimit blocks with `lifetime.start`
+// and `lifetime.end` intrinsics.
+//
+// The effect of these intrinsics seems to be as follows (maybe I should
+// specify this in the reference?):
+//
+// L1) at start, each stack-slot is marked as *out-of-scope*, unless no
+// lifetime intrinsic refers to that stack slot, in which case
+// it is marked as *in-scope*.
+// L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and
+// the stack slot is overwritten with `undef`.
+// L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*.
+// L4) on function exit, all stack slots are marked as *out-of-scope*.
+// L5) `lifetime.end` is a no-op when called on a slot that is already
+// *out-of-scope*.
+// L6) memory accesses to *out-of-scope* stack slots are UB.
+// L7) when a stack-slot is marked as *out-of-scope*, all pointers to it
+// are invalidated, unless the slot is "degenerate". This is used to
+// justify not marking slots as in-use until the pointer to them is
+// used, but feels a bit hacky in the presence of things like LICM. See
+// the "Degenerate Slots" section for more details.
+//
+// Now, let's ground stack coloring on these rules. We'll define a slot
+// as *in-use* at a (dynamic) point in execution if it either can be
+// written to at that point, or if it has a live and non-undef content
+// at that point.
+//
+// Obviously, slots that are never *in-use* together can be merged, and
+// in our example `foo`, the slots for `x`, `y` and `z` are never
+// in-use together (of course, sometimes slots that *are* in-use together
+// might still be mergeable, but we don't care about that here).
+//
+// In this implementation, we successively merge pairs of slots that are
+// not *in-use* together. We could be smarter - for example, we could merge
+// a single large slot with 2 small slots, or we could construct the
+// interference graph and run a "smart" graph coloring algorithm, but with
+// that aside, how do we find out whether a pair of slots might be *in-use*
+// together?
+//
+// From our rules, we see that *out-of-scope* slots are never *in-use*,
+// and from (L7) we see that "non-degenerate" slots remain non-*in-use*
+// until their address is taken. Therefore, we can approximate slot activity
+// using dataflow.
+//
+// A subtle point: naively, we might try to figure out which pairs of
+// stack-slots interfere by propagating `S in-use` through the CFG for every
+// stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in
+// which they are both *in-use*.
+//
+// That is sound, but overly conservative in some cases: in our (artificial)
+// example `foo`, either `x` or `y` might be in use at the label `B:`, but
+// as `x` is only in use if we came in from the `var` edge and `y` only
+// if we came from the `!var` edge, they still can't be in use together.
+// See PR32488 for an important real-life case.
+//
+// If we wanted to find all points of interference precisely, we could
+// propagate `S in-use` and `S&T in-use` predicates through the CFG. That
+// would be precise, but requires propagating `O(n^2)` dataflow facts.
+//
+// However, we aren't interested in the *set* of points of interference
+// between 2 stack slots, only *whether* there *is* such a point. So we
+// can rely on a little trick: for `S` and `T` to be in-use together,
+// one of them needs to become in-use while the other is in-use (or
+// they might both become in use simultaneously). We can check this
+// by also keeping track of the points at which a stack slot might *start*
+// being in-use.
+//
+// Exact first use:
+// ----------------
+//
// Consider the following motivating example:
//
// int foo() {
@@ -159,6 +283,9 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
// byte stack (better).
//
+// Degenerate Slots:
+// -----------------
+//
// Relying entirely on first-use of stack slots is problematic,
// however, due to the fact that optimizations can sometimes migrate
// uses of a variable outside of its lifetime start/end region. Here
@@ -238,10 +365,6 @@ STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
// for "b" then it will appear that 'b' has a degenerate lifetime.
//
-//===----------------------------------------------------------------------===//
-// StackColoring Pass
-//===----------------------------------------------------------------------===//
-
namespace {
/// StackColoring - A machine pass for merging disjoint stack allocations,
/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
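
The "little trick" in the pass comment above reduces pairwise interference to per-slot data: two slots can be in-use together iff one of them starts while the other is live. A compact sketch over half-open intervals, with plain ints standing in for SlotIndex (illustrative, not the pass's actual data structures):

    #include <utility>
    #include <vector>

    using Interval = std::pair<int, int>; // half-open [begin, end)

    static bool liveAt(const std::vector<Interval> &Live, int Point) {
      for (const Interval &I : Live)
        if (I.first <= Point && Point < I.second)
          return true;
      return false;
    }

    // Slots S and T interfere iff some start point of one falls inside a
    // live interval of the other; no O(n^2) pairwise dataflow is required.
    // Simultaneous starts are caught because an interval contains its own
    // begin point.
    bool interfere(const std::vector<Interval> &LiveS,
                   const std::vector<int> &StartsS,
                   const std::vector<Interval> &LiveT,
                   const std::vector<int> &StartsT) {
      for (int P : StartsS)
        if (liveAt(LiveT, P))
          return true;
      for (int P : StartsT)
        if (liveAt(LiveS, P))
          return true;
      return false;
    }
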
@@ -272,8 +395,11 @@ class StackColoring : public MachineFunctionPass {
/// Maps basic blocks to a serial number.
SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering;
- /// Maps liveness intervals for each slot.
+ /// Maps slots to their use interval. Outside of this interval, slots
+ /// values are either dead or `undef` and they will not be written to.
SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals;
+ /// Maps slots to the points where they can become in-use.
+ SmallVector<SmallVector<SlotIndex, 4>, 16> LiveStarts;
/// VNInfo is used for the construction of LiveIntervals.
VNInfo::Allocator VNInfoAllocator;
/// SlotIndex analysis object.
@@ -372,12 +498,12 @@ private:
char StackColoring::ID = 0;
char &llvm::StackColoringID = StackColoring::ID;
-INITIALIZE_PASS_BEGIN(StackColoring,
- "stack-coloring", "Merge disjoint stack slots", false, false)
+INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE,
+ "Merge disjoint stack slots", false, false)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(StackProtector)
-INITIALIZE_PASS_END(StackColoring,
- "stack-coloring", "Merge disjoint stack slots", false, false)
+INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE,
+ "Merge disjoint stack slots", false, false)
void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<SlotIndexes>();
@@ -385,14 +511,13 @@ void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-#ifndef NDEBUG
-
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
const BitVector &BV) const {
- DEBUG(dbgs() << tag << " : { ");
+ dbgs() << tag << " : { ";
for (unsigned I = 0, E = BV.size(); I != E; ++I)
- DEBUG(dbgs() << BV.test(I) << " ");
- DEBUG(dbgs() << "}\n");
+ dbgs() << BV.test(I) << " ";
+ dbgs() << "}\n";
}
LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
@@ -408,20 +533,19 @@ LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
LLVM_DUMP_METHOD void StackColoring::dump() const {
for (MachineBasicBlock *MBB : depth_first(MF)) {
- DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
- << MBB->getName() << "]\n");
- DEBUG(dumpBB(MBB));
+ dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
+ << MBB->getName() << "]\n";
+ dumpBB(MBB);
}
}
LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
for (unsigned I = 0, E = Intervals.size(); I != E; ++I) {
- DEBUG(dbgs() << "Interval[" << I << "]:\n");
- DEBUG(Intervals[I]->dump());
+ dbgs() << "Interval[" << I << "]:\n";
+ Intervals[I]->dump();
}
}
-
-#endif // not NDEBUG
+#endif
static inline int getStartOrEndSlot(const MachineInstr &MI)
{
@@ -570,9 +694,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot)
// Step 2: compute begin/end sets for each block
- // NOTE: We use a reverse-post-order iteration to ensure that we obtain a
- // deterministic numbering, and because we'll need a post-order iteration
- // later for solving the liveness dataflow problem.
+ // NOTE: We use a depth-first iteration to ensure that we obtain a
+ // deterministic numbering.
for (MachineBasicBlock *MBB : depth_first(MF)) {
// Assign a serial number to this basic block.
@@ -676,15 +799,22 @@ void StackColoring::calculateLocalLiveness()
void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
SmallVector<SlotIndex, 16> Starts;
- SmallVector<SlotIndex, 16> Finishes;
+ SmallVector<bool, 16> DefinitelyInUse;
// For each block, find which slots are active within this block
// and update the live intervals.
for (const MachineBasicBlock &MBB : *MF) {
Starts.clear();
Starts.resize(NumSlots);
- Finishes.clear();
- Finishes.resize(NumSlots);
+ DefinitelyInUse.clear();
+ DefinitelyInUse.resize(NumSlots);
+
+ // Start the interval of the slots that we previously found to be 'in-use'.
+ BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
+ for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
+ pos = MBBLiveness.LiveIn.find_next(pos)) {
+ Starts[pos] = Indexes->getMBBStartIdx(&MBB);
+ }
// Create the interval for the basic blocks containing lifetime begin/end.
for (const MachineInstr &MI : MBB) {
@@ -696,68 +826,35 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
for (auto Slot : slots) {
if (IsStart) {
- if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
+ // If a slot is already definitely in use, we don't have to emit
+ // a new start marker because one already exists.
+ if (!DefinitelyInUse[Slot]) {
+ LiveStarts[Slot].push_back(ThisIndex);
+ DefinitelyInUse[Slot] = true;
+ }
+ if (!Starts[Slot].isValid())
Starts[Slot] = ThisIndex;
} else {
- if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
- Finishes[Slot] = ThisIndex;
+ if (Starts[Slot].isValid()) {
+ VNInfo *VNI = Intervals[Slot]->getValNumInfo(0);
+ Intervals[Slot]->addSegment(
+ LiveInterval::Segment(Starts[Slot], ThisIndex, VNI));
+ Starts[Slot] = SlotIndex(); // Invalidate the start index
+ DefinitelyInUse[Slot] = false;
+ }
}
}
}
- // Create the interval of the blocks that we previously found to be 'alive'.
- BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
- for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
- pos = MBBLiveness.LiveIn.find_next(pos)) {
- Starts[pos] = Indexes->getMBBStartIdx(&MBB);
- }
- for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1;
- pos = MBBLiveness.LiveOut.find_next(pos)) {
- Finishes[pos] = Indexes->getMBBEndIdx(&MBB);
- }
-
+ // Close any segments still open at the end of the block.
for (unsigned i = 0; i < NumSlots; ++i) {
- //
- // When LifetimeStartOnFirstUse is turned on, data flow analysis
- // is forward (from starts to ends), not bidirectional. A
- // consequence of this is that we can wind up in situations
- // where Starts[i] is invalid but Finishes[i] is valid and vice
- // versa. Example:
- //
- // LIFETIME_START x
- // if (...) {
- // <use of x>
- // throw ...;
- // }
- // LIFETIME_END x
- // return 2;
- //
- //
- // Here the slot for "x" will not be live into the block
- // containing the "return 2" (since lifetimes start with first
- // use, not at the dominating LIFETIME_START marker).
- //
- if (Starts[i].isValid() && !Finishes[i].isValid()) {
- Finishes[i] = Indexes->getMBBEndIdx(&MBB);
- }
if (!Starts[i].isValid())
continue;
- assert(Starts[i] && Finishes[i] && "Invalid interval");
- VNInfo *ValNum = Intervals[i]->getValNumInfo(0);
- SlotIndex S = Starts[i];
- SlotIndex F = Finishes[i];
- if (S < F) {
- // We have a single consecutive region.
- Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum));
- } else {
- // We have two non-consecutive regions. This happens when
- // LIFETIME_START appears after the LIFETIME_END marker.
- SlotIndex NewStart = Indexes->getMBBStartIdx(&MBB);
- SlotIndex NewFin = Indexes->getMBBEndIdx(&MBB);
- Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum));
- Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum));
- }
+ SlotIndex EndIdx = Indexes->getMBBEndIdx(&MBB);
+ VNInfo *VNI = Intervals[i]->getValNumInfo(0);
+ Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI));
}
}
}
@@ -793,6 +890,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
// Keep a list of *allocas* which need to be remapped.
DenseMap<const AllocaInst*, const AllocaInst*> Allocas;
+
+ // Keep a list of allocas that have been affected by the remap.
+ SmallPtrSet<const AllocaInst*, 32> MergedAllocas;
+
for (const std::pair<int, int> &SI : SlotRemap) {
const AllocaInst *From = MFI->getObjectAllocation(SI.first);
const AllocaInst *To = MFI->getObjectAllocation(SI.second);
@@ -812,6 +913,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
Inst = Cast;
}
+ // We keep both slots to maintain AliasAnalysis metadata later.
+ MergedAllocas.insert(From);
+ MergedAllocas.insert(To);
+
// Allow the stack protector to adjust its value map to account for the
// upcoming replacement.
SP->adjustForColoring(From, To);
@@ -843,13 +948,6 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
// Update the MachineMemOperand to use the new alloca.
for (MachineMemOperand *MMO : I.memoperands()) {
- // FIXME: In order to enable the use of TBAA when using AA in CodeGen,
- // we'll also need to update the TBAA nodes in MMOs with values
- // derived from the merged allocas. When doing this, we'll need to use
- // the same variant of GetUnderlyingObjects that is used by the
- // instruction scheduler (that can look through ptrtoint/inttoptr
- // pairs).
-
// We've replaced IR-level uses of the remapped allocas, so we only
// need to replace direct uses here.
const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue());
@@ -901,6 +999,48 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
MO.setIndex(ToSlot);
FixedInstr++;
}
+
+ // We adjust AliasAnalysis information for merged stack slots.
+ MachineSDNode::mmo_iterator NewMemOps =
+ MF->allocateMemRefsArray(I.getNumMemOperands());
+ unsigned MemOpIdx = 0;
+ bool ReplaceMemOps = false;
+ for (MachineMemOperand *MMO : I.memoperands()) {
+ // If this memory location might refer to a slot remapped here,
+ // we remove its AA information.
+ bool MayHaveConflictingAAMD = false;
+ if (MMO->getAAInfo()) {
+ if (const Value *MMOV = MMO->getValue()) {
+ SmallVector<Value *, 4> Objs;
+ getUnderlyingObjectsForCodeGen(MMOV, Objs, MF->getDataLayout());
+
+ if (Objs.empty())
+ MayHaveConflictingAAMD = true;
+ else
+ for (Value *V : Objs) {
+ // If this memory location comes from a known stack slot
+ // that is not remapped, we continue checking.
+ // Otherwise, we need to invalidate AA information.
+ const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V);
+ if (AI && MergedAllocas.count(AI)) {
+ MayHaveConflictingAAMD = true;
+ break;
+ }
+ }
+ }
+ }
+ if (MayHaveConflictingAAMD) {
+ NewMemOps[MemOpIdx++] = MF->getMachineMemOperand(MMO, AAMDNodes());
+ ReplaceMemOps = true;
+ } else
+ NewMemOps[MemOpIdx++] = MMO;
+ }
+
+ // If any memory operand was updated, set this instruction's
+ // memory references.
+ if (ReplaceMemOps)
+ I.setMemRefs(std::make_pair(NewMemOps, I.getNumMemOperands()));
}
// Update the location of C++ catch objects for the MSVC personality routine.
@@ -987,6 +1127,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
BasicBlockNumbering.clear();
Markers.clear();
Intervals.clear();
+ LiveStarts.clear();
VNInfoAllocator.Reset();
unsigned NumSlots = MFI->getObjectIndexEnd();
@@ -998,6 +1139,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
SmallVector<int, 8> SortedSlots;
SortedSlots.reserve(NumSlots);
Intervals.reserve(NumSlots);
+ LiveStarts.resize(NumSlots);
unsigned NumMarkers = collectMarkers(NumSlots);
@@ -1069,6 +1211,9 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
});
+ for (auto &s : LiveStarts)
+ std::sort(s.begin(), s.end());
+
bool Changed = true;
while (Changed) {
Changed = false;
@@ -1084,12 +1229,22 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
int SecondSlot = SortedSlots[J];
LiveInterval *First = &*Intervals[FirstSlot];
LiveInterval *Second = &*Intervals[SecondSlot];
+ auto &FirstS = LiveStarts[FirstSlot];
+ auto &SecondS = LiveStarts[SecondSlot];
assert (!First->empty() && !Second->empty() && "Found an empty range");
- // Merge disjoint slots.
- if (!First->overlaps(*Second)) {
+ // Merge disjoint slots. This is a little tricky; see the
+ // Implementation Notes section for an explanation.
+ if (!First->isLiveAtIndexes(SecondS) &&
+ !Second->isLiveAtIndexes(FirstS)) {
Changed = true;
First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
+
+ int OldSize = FirstS.size();
+ FirstS.append(SecondS.begin(), SecondS.end());
+ auto Mid = FirstS.begin() + OldSize;
+ std::inplace_merge(FirstS.begin(), Mid, FirstS.end());
+
SlotRemap[SecondSlot] = FirstSlot;
SortedSlots[J] = -1;
DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
diff --git a/contrib/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm/lib/CodeGen/StackMaps.cpp
index 9b7dd400..b4fa29d 100644
--- a/contrib/llvm/lib/CodeGen/StackMaps.cpp
+++ b/contrib/llvm/lib/CodeGen/StackMaps.cpp
@@ -1,4 +1,4 @@
-//===---------------------------- StackMaps.cpp ---------------------------===//
+//===- StackMaps.cpp ------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,30 +8,41 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
#include <iterator>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "stackmaps"
static cl::opt<int> StackMapVersion(
- "stackmap-version", cl::init(2),
- cl::desc("Specify the stackmap encoding version (default = 2)"));
+ "stackmap-version", cl::init(3),
+ cl::desc("Specify the stackmap encoding version (default = 3)"));
const char *StackMaps::WSMP = "Stack Maps: ";
@@ -74,7 +85,7 @@ unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const {
}
StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) {
- if (StackMapVersion != 2)
+ if (StackMapVersion != 3)
llvm_unreachable("Unsupported stackmap version!");
}
@@ -150,7 +161,8 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
if (SubRegIdx)
Offset = TRI->getSubRegIdxOffset(SubRegIdx);
- Locs.emplace_back(Location::Register, RC->getSize(), DwarfRegNum, Offset);
+ Locs.emplace_back(Location::Register, TRI->getSpillSize(*RC),
+ DwarfRegNum, Offset);
return ++MOI;
}
@@ -209,8 +221,9 @@ void StackMaps::print(raw_ostream &OS) {
OS << "Constant Index " << Loc.Offset;
break;
}
- OS << "\t[encoding: .byte " << Loc.Type << ", .byte " << Loc.Size
- << ", .short " << Loc.Reg << ", .int " << Loc.Offset << "]\n";
+ OS << "\t[encoding: .byte " << Loc.Type << ", .byte 0"
+ << ", .short " << Loc.Size << ", .short " << Loc.Reg << ", .short 0"
+ << ", .int " << Loc.Offset << "]\n";
Idx++;
}
@@ -234,7 +247,7 @@ void StackMaps::print(raw_ostream &OS) {
StackMaps::LiveOutReg
StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const {
unsigned DwarfRegNum = getDwarfRegNum(Reg, TRI);
- unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+ unsigned Size = TRI->getSpillSize(*TRI->getMinimalPhysRegClass(Reg));
return LiveOutReg(Reg, DwarfRegNum, Size);
}
@@ -276,7 +289,8 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
}
LiveOuts.erase(
- remove_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; }),
+ llvm::remove_if(LiveOuts,
+ [](const LiveOutReg &LO) { return LO.Reg == 0; }),
LiveOuts.end());
return LiveOuts;
@@ -286,7 +300,6 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
MachineInstr::const_mop_iterator MOI,
MachineInstr::const_mop_iterator MOE,
bool recordResult) {
-
MCContext &OutContext = AP.OutStreamer->getContext();
MCSymbol *MILabel = OutContext.createTempSymbol();
AP.OutStreamer->EmitLabel(MILabel);
@@ -378,6 +391,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {
}
#endif
}
+
void StackMaps::recordStatepoint(const MachineInstr &MI) {
assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint");
@@ -508,11 +522,16 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
for (const auto &Loc : CSLocs) {
OS.EmitIntValue(Loc.Type, 1);
- OS.EmitIntValue(Loc.Size, 1);
+ OS.EmitIntValue(0, 1); // Reserved
+ OS.EmitIntValue(Loc.Size, 2);
OS.EmitIntValue(Loc.Reg, 2);
+ OS.EmitIntValue(0, 2); // Reserved
OS.EmitIntValue(Loc.Offset, 4);
}
+ // Emit alignment to 8 bytes.
+ OS.EmitValueToAlignment(8);
+
// Num live-out registers and padding to align to 4 bytes.
OS.EmitIntValue(0, 2);
OS.EmitIntValue(LiveOuts.size(), 2);
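Note: the print() and emitCallsiteEntries() changes above describe the same version-3 location record; the Size field widens from one byte to two, with reserved bytes keeping the fields aligned. A sketch of the resulting layout (field names are illustrative, not part of the documented format):

#include <cstdint>

struct StackMapV3Location {
  uint8_t  Type;      // .byte  Loc.Type
  uint8_t  Reserved0; // .byte  0
  uint16_t Size;      // .short Loc.Size (was one byte in version 2)
  uint16_t Reg;       // .short Loc.Reg
  uint16_t Reserved1; // .short 0
  int32_t  Offset;    // .int   Loc.Offset
};
// 12 bytes on typical ABIs; the location array as a whole is then
// padded to 8-byte alignment, matching EmitValueToAlignment(8) above.
static_assert(sizeof(StackMapV3Location) == 12, "v3 location record");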
diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp
index c2c010a..d8e7840 100644
--- a/contrib/llvm/lib/CodeGen/StackProtector.cpp
+++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp
@@ -1,4 +1,4 @@
-//===-- StackProtector.cpp - Stack Protector Insertion --------------------===//
+//===- StackProtector.cpp - Stack Protector Insertion ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,30 +14,40 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/StackProtector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <cstdlib>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "stack-protector"
@@ -50,12 +60,14 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
cl::init(true), cl::Hidden);
char StackProtector::ID = 0;
-INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors",
- false, true)
-FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
- return new StackProtector(TM);
-}
+INITIALIZE_PASS_BEGIN(StackProtector, DEBUG_TYPE,
+ "Insert stack protectors", false, true)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(StackProtector, DEBUG_TYPE,
+ "Insert stack protectors", false, true)
+
+FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); }
StackProtector::SSPLayoutKind
StackProtector::getSSPLayout(const AllocaInst *AI) const {
@@ -83,12 +95,19 @@ void StackProtector::adjustForColoring(const AllocaInst *From,
}
}
+void StackProtector::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
bool StackProtector::runOnFunction(Function &Fn) {
F = &Fn;
M = F->getParent();
DominatorTreeWrapperPass *DTWP =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ Trip = TM->getTargetTriple();
TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
HasPrologue = false;
HasIRCheck = false;
@@ -222,7 +241,16 @@ bool StackProtector::RequiresStackProtector() {
if (F->hasFnAttribute(Attribute::SafeStack))
return false;
+ // We construct the OptimizationRemarkEmitter on the fly rather than using
+ // the analysis pass, to avoid building DominatorTree and LoopInfo, which
+ // are not available this late in the IR pipeline.
+ OptimizationRemarkEmitter ORE(F);
+
if (F->hasFnAttribute(Attribute::StackProtectReq)) {
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a function attribute or command-line switch");
NeedsProtector = true;
Strong = true; // Use the same heuristic as strong to determine SSPLayout
} else if (F->hasFnAttribute(Attribute::StackProtectStrong))
@@ -236,20 +264,29 @@ bool StackProtector::RequiresStackProtector() {
for (const Instruction &I : BB) {
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
if (AI->isArrayAllocation()) {
+ OptimizationRemark Remark(DEBUG_TYPE, "StackProtectorAllocaOrArray",
+ &I);
+ Remark
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a call to alloca or use of a variable length array";
if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
// A call to alloca with size >= SSPBufferSize requires
// stack protectors.
Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ ORE.emit(Remark);
NeedsProtector = true;
} else if (Strong) {
// Require protectors for all alloca calls in strong mode.
Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+ ORE.emit(Remark);
NeedsProtector = true;
}
} else {
// A call to alloca with a variable size requires protectors.
Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ ORE.emit(Remark);
NeedsProtector = true;
}
continue;
@@ -259,6 +296,11 @@ bool StackProtector::RequiresStackProtector() {
if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
: SSPLK_SmallArray));
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to a stack allocated buffer or struct containing a "
+ "buffer");
NeedsProtector = true;
continue;
}
@@ -266,6 +308,11 @@ bool StackProtector::RequiresStackProtector() {
if (Strong && HasAddressTaken(AI)) {
++NumAddrTaken;
Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I)
+ << "Stack protection applied to function "
+ << ore::NV("Function", F)
+ << " due to the address of a local variable being taken");
NeedsProtector = true;
}
}
@@ -448,13 +495,13 @@ BasicBlock *StackProtector::CreateFailBB() {
Constant *StackChkFail =
M->getOrInsertFunction("__stack_smash_handler",
Type::getVoidTy(Context),
- Type::getInt8PtrTy(Context), nullptr);
+ Type::getInt8PtrTy(Context));
B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
} else {
Constant *StackChkFail =
- M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context),
- nullptr);
+ M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
+
B.CreateCall(StackChkFail, {});
}
B.CreateUnreachable();
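Note: the remark plumbing above builds the OptimizationRemarkEmitter directly instead of requesting it as an analysis. A minimal sketch of that pattern, with placeholder pass and remark names ("my-pass" and "ExampleRemark" are not part of this code):

#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Emit a one-off remark late in the pipeline, when DominatorTree and
// LoopInfo (which the analysis-pass route would build) are unavailable.
static void noteProtected(Function &F) {
  OptimizationRemarkEmitter ORE(&F);
  ORE.emit(OptimizationRemark("my-pass", "ExampleRemark", &F)
           << "stack protection applied to " << ore::NV("Function", &F));
}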
diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
index 234b204..856bca1 100644
--- a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -22,6 +21,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
@@ -32,7 +32,7 @@
#include <vector>
using namespace llvm;
-#define DEBUG_TYPE "stackslotcoloring"
+#define DEBUG_TYPE "stack-slot-coloring"
static cl::opt<bool>
DisableSharing("no-stack-slot-sharing",
@@ -116,12 +116,12 @@ namespace {
char StackSlotColoring::ID = 0;
char &llvm::StackSlotColoringID = StackSlotColoring::ID;
-INITIALIZE_PASS_BEGIN(StackSlotColoring, "stack-slot-coloring",
+INITIALIZE_PASS_BEGIN(StackSlotColoring, DEBUG_TYPE,
"Stack Slot Coloring", false, false)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveStacks)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(StackSlotColoring, "stack-slot-coloring",
+INITIALIZE_PASS_END(StackSlotColoring, DEBUG_TYPE,
"Stack Slot Coloring", false, false)
namespace {
diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
index e2377d8..489a607 100644
--- a/contrib/llvm/lib/CodeGen/TailDuplication.cpp
+++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
@@ -1,4 +1,4 @@
-//===-- TailDuplication.cpp - Duplicate blocks into predecessors' tails ---===//
+//===- TailDuplication.cpp - Duplicate blocks into predecessors' tails ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,22 +12,25 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TailDuplicator.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Pass.h"
+
using namespace llvm;
#define DEBUG_TYPE "tailduplication"
namespace {
+
/// Perform tail duplication. Delegates to TailDuplicator
class TailDuplicatePass : public MachineFunctionPass {
TailDuplicator Duplicator;
public:
static char ID;
+
explicit TailDuplicatePass() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -35,13 +38,13 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override;
};
+} // end anonymous namespace
+
char TailDuplicatePass::ID = 0;
-}
char &llvm::TailDuplicateID = TailDuplicatePass::ID;
-INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", false,
- false)
+INITIALIZE_PASS(TailDuplicatePass, DEBUG_TYPE, "Tail Duplication", false, false)
bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
diff --git a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
index 7709236..dc7265d 100644
--- a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -1,4 +1,4 @@
-//===-- TailDuplicator.cpp - Duplicate blocks into predecessors' tails ---===//
+//===- TailDuplicator.cpp - Duplicate blocks into predecessors' tails -----===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,22 +12,36 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "tailduplication"
@@ -41,15 +55,13 @@ STATISTIC(NumTailDupRemoved,
STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
STATISTIC(NumAddedPHIs, "Number of phis added");
-namespace llvm {
-
// Heuristic for tail duplication.
static cl::opt<unsigned> TailDuplicateSize(
"tail-dup-size",
cl::desc("Maximum instructions to consider tail duplicating"), cl::init(2),
cl::Hidden);
-cl::opt<unsigned> TailDupIndirectBranchSize(
+static cl::opt<unsigned> TailDupIndirectBranchSize(
"tail-dup-indirect-size",
cl::desc("Maximum instructions to consider tail duplicating blocks that "
"end with indirect branches."), cl::init(20),
@@ -138,7 +150,7 @@ bool TailDuplicator::tailDuplicateAndUpdate(
bool IsSimple, MachineBasicBlock *MBB,
MachineBasicBlock *ForcedLayoutPred,
SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds,
- llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
+ function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
// Save the successors list.
SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
MBB->succ_end());
@@ -725,6 +737,7 @@ bool TailDuplicator::duplicateSimpleBB(
if (PredTBB == NextBB && PredFBB == nullptr)
PredTBB = nullptr;
+ auto DL = PredBB->findBranchDebugLoc();
TII->removeBranch(*PredBB);
if (!PredBB->isSuccessor(NewTarget))
@@ -735,7 +748,7 @@ bool TailDuplicator::duplicateSimpleBB(
}
if (PredTBB)
- TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
+ TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL);
TDBBs.push_back(PredBB);
}
@@ -748,7 +761,7 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB,
if (PredBB->succ_size() > 1)
return false;
- MachineBasicBlock *PredTBB, *PredFBB;
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
SmallVector<MachineOperand, 4> PredCond;
if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond))
return false;
@@ -831,7 +844,7 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
appendCopies(PredBB, CopyInfos, Copies);
// Simplify
- MachineBasicBlock *PredTBB, *PredFBB;
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
SmallVector<MachineOperand, 4> PredCond;
TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond);
@@ -970,7 +983,7 @@ void TailDuplicator::appendCopies(MachineBasicBlock *MBB,
/// the CFG.
void TailDuplicator::removeDeadBlock(
MachineBasicBlock *MBB,
- llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
+ function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
assert(MBB->pred_empty() && "MBB must be dead!");
DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
@@ -984,5 +997,3 @@ void TailDuplicator::removeDeadBlock(
// Remove the block.
MBB->eraseFromParent();
}
-
-} // End llvm namespace
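Note: two hunks above initialize PredTBB/PredFBB to nullptr before calling analyzeBranch, since the hook may return without writing its output arguments. A sketch of the safe calling pattern (the helper name is illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

static bool hasAnalyzableBranch(const TargetInstrInfo *TII,
                                MachineBasicBlock &MBB) {
  // Initialize the out-parameters: analyzeBranch may leave them
  // untouched on some paths, and reading them would then be undefined.
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  return !TII->analyzeBranch(MBB, TBB, FBB, Cond); // true on failure
}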
diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index f082add..9dd98b4 100644
--- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -1,4 +1,4 @@
-//===----- TargetFrameLoweringImpl.cpp - Implement target frame interface --==//
+//===- TargetFrameLoweringImpl.cpp - Implement target frame interface ------==//
//
// The LLVM Compiler Infrastructure
//
@@ -14,19 +14,21 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <cstdlib>
+
using namespace llvm;
-TargetFrameLowering::~TargetFrameLowering() {
-}
+TargetFrameLowering::~TargetFrameLowering() = default;
/// The default implementation just looks at attribute "no-frame-pointer-elim".
bool TargetFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
@@ -73,7 +75,7 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
return;
// Get the callee saved register list...
- const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
// Early exit if there are no callee saved registers.
if (!CSRegs || CSRegs[0] == 0)
diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 01f91b9..14c5adc 100644
--- a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -345,12 +345,12 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
unsigned SubIdx, unsigned &Size,
unsigned &Offset,
const MachineFunction &MF) const {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!SubIdx) {
- Size = RC->getSize();
+ Size = TRI->getSpillSize(*RC);
Offset = 0;
return true;
}
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
unsigned BitSize = TRI->getSubRegIdxSize(SubIdx);
// Convert bit size to byte size to be consistent with
// MCRegisterClass::getSize().
@@ -364,10 +364,10 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
Size = BitSize /= 8;
Offset = (unsigned)BitOffset / 8;
- assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
+ assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range");
if (!MF.getDataLayout().isLittleEndian()) {
- Offset = RC->getSize() - (Offset + Size);
+ Offset = TRI->getSpillSize(*RC) - (Offset + Size);
}
return true;
}
@@ -428,8 +428,8 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
return nullptr;
}
-void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
- llvm_unreachable("Not a MachO target");
+void TargetInstrInfo::getNoop(MCInst &NopInst) const {
+ llvm_unreachable("Not implemented");
}
static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
@@ -470,7 +470,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
// No need to fold return, the meta data, and function arguments
for (unsigned i = 0; i < StartIdx; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) {
MachineOperand &MO = MI.getOperand(i);
@@ -490,7 +490,7 @@ static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
MIB.addImm(SpillOffset);
}
else
- MIB.addOperand(MO);
+ MIB.add(MO);
}
return NewMI;
}
@@ -941,12 +941,10 @@ int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const {
unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
- if (MI.getOpcode() != FrameSetupOpcode &&
- MI.getOpcode() != FrameDestroyOpcode)
+ if (!isFrameInstr(MI))
return 0;
- int SPAdj = MI.getOperand(0).getImm();
- SPAdj = TFI->alignSPAdjust(SPAdj);
+ int SPAdj = TFI->alignSPAdjust(getFrameSize(MI));
if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) ||
(StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode))
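Note: the simplified getSPAdjust above leans on the isFrameInstr/getFrameSize helpers. Assuming the usual definitions (a frame instruction is one of the two call-frame pseudos, whose first operand carries the immediate size), they amount to:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

// Sketch only; the real helpers live on TargetInstrInfo itself.
static bool isFrameInstrSketch(const TargetInstrInfo &TII,
                               const MachineInstr &MI) {
  return MI.getOpcode() == TII.getCallFrameSetupOpcode() ||
         MI.getOpcode() == TII.getCallFrameDestroyOpcode();
}

static int64_t getFrameSizeSketch(const MachineInstr &MI) {
  return MI.getOperand(0).getImm(); // immediate operand of the pseudo
}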
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 003311b..3914ee5 100644
--- a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetLowering.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
@@ -21,6 +20,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -33,6 +33,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -53,6 +54,18 @@ static cl::opt<unsigned> MaximumJumpTableSize
("max-jump-table-size", cl::init(0), cl::Hidden,
cl::desc("Set maximum size of jump tables; zero for no limit."));
+/// Minimum jump table density for normal functions.
+static cl::opt<unsigned>
+ JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "a normal function"));
+
+/// Minimum jump table density for -Os or -Oz functions.
+static cl::opt<unsigned> OptsizeJumpTableDensity(
+ "optsize-jump-table-density", cl::init(40), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "an optsize function"));
+
// Although this default value is arbitrary, it is not random. It is assumed
// that a condition that evaluates the same way by a higher percentage than this
// is best represented as control flow. Therefore, the default value N should be
@@ -361,11 +374,36 @@ static void InitLibcallNames(const char **Names, const Triple &TT) {
Names[RTLIB::MEMCPY] = "memcpy";
Names[RTLIB::MEMMOVE] = "memmove";
Names[RTLIB::MEMSET] = "memset";
- Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_1] = "__llvm_memcpy_element_atomic_1";
- Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_2] = "__llvm_memcpy_element_atomic_2";
- Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_4] = "__llvm_memcpy_element_atomic_4";
- Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_8] = "__llvm_memcpy_element_atomic_8";
- Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_16] = "__llvm_memcpy_element_atomic_16";
+ Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_1] =
+ "__llvm_memcpy_element_unordered_atomic_1";
+ Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_2] =
+ "__llvm_memcpy_element_unordered_atomic_2";
+ Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_4] =
+ "__llvm_memcpy_element_unordered_atomic_4";
+ Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_8] =
+ "__llvm_memcpy_element_unordered_atomic_8";
+ Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_16] =
+ "__llvm_memcpy_element_unordered_atomic_16";
+ Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1] =
+ "__llvm_memmove_element_unordered_atomic_1";
+ Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2] =
+ "__llvm_memmove_element_unordered_atomic_2";
+ Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4] =
+ "__llvm_memmove_element_unordered_atomic_4";
+ Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8] =
+ "__llvm_memmove_element_unordered_atomic_8";
+ Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16] =
+ "__llvm_memmove_element_unordered_atomic_16";
+ Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_1] =
+ "__llvm_memset_element_unordered_atomic_1";
+ Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_2] =
+ "__llvm_memset_element_unordered_atomic_2";
+ Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_4] =
+ "__llvm_memset_element_unordered_atomic_4";
+ Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_8] =
+ "__llvm_memset_element_unordered_atomic_8";
+ Names[RTLIB::MEMSET_ELEMENT_UNORDERED_ATOMIC_16] =
+ "__llvm_memset_element_unordered_atomic_16";
Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
@@ -768,22 +806,55 @@ RTLIB::Libcall RTLIB::getSYNC(unsigned Opc, MVT VT) {
return UNKNOWN_LIBCALL;
}
-RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_ATOMIC(uint64_t ElementSize) {
+RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
+ switch (ElementSize) {
+ case 1:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_1;
+ case 2:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_2;
+ case 4:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_4;
+ case 8:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_8;
+ case 16:
+ return MEMCPY_ELEMENT_UNORDERED_ATOMIC_16;
+ default:
+ return UNKNOWN_LIBCALL;
+ }
+}
+
+RTLIB::Libcall RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
switch (ElementSize) {
case 1:
- return MEMCPY_ELEMENT_ATOMIC_1;
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1;
case 2:
- return MEMCPY_ELEMENT_ATOMIC_2;
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2;
case 4:
- return MEMCPY_ELEMENT_ATOMIC_4;
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4;
case 8:
- return MEMCPY_ELEMENT_ATOMIC_8;
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8;
case 16:
- return MEMCPY_ELEMENT_ATOMIC_16;
+ return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16;
default:
return UNKNOWN_LIBCALL;
}
+}
+RTLIB::Libcall RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
+ switch (ElementSize) {
+ case 1:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_1;
+ case 2:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_2;
+ case 4:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_4;
+ case 8:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_8;
+ case 16:
+ return MEMSET_ELEMENT_UNORDERED_ATOMIC_16;
+ default:
+ return UNKNOWN_LIBCALL;
+ }
}
/// InitCmpLibcallCCs - Set default comparison libcall CC.
@@ -829,16 +900,16 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
initActions();
// Perform these initializations only once.
- MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
- MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
- = MaxStoresPerMemmoveOptSize = 4;
+ MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
+ MaxLoadsPerMemcmp = 8;
+ MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
+ MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
UseUnderscoreSetJmp = false;
UseUnderscoreLongJmp = false;
HasMultipleConditionRegisters = false;
HasExtractBitsInsn = false;
JumpIsExpensive = JumpIsExpensiveOverride;
PredictableSelectIsExpensive = false;
- MaskAndBranchFoldingIsLegal = false;
EnableExtLdPromotion = false;
HasFloatingPointExceptions = true;
StackPointerRegisterToSaveRestore = 0;
@@ -851,7 +922,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
MinFunctionAlignment = 0;
PrefFunctionAlignment = 0;
PrefLoopAlignment = 0;
- GatherAllAliasesMaxDepth = 6;
+ GatherAllAliasesMaxDepth = 18;
MinStackArgumentAlignment = 1;
// TODO: the default will be switched to 0 in the next commit, along
// with the Target-specific changes necessary.
@@ -901,6 +972,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SMAX, VT, Expand);
setOperationAction(ISD::UMIN, VT, Expand);
setOperationAction(ISD::UMAX, VT, Expand);
+ setOperationAction(ISD::ABS, VT, Expand);
// Overflow operations default to expand
setOperationAction(ISD::SADDO, VT, Expand);
@@ -910,6 +982,11 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SMULO, VT, Expand);
setOperationAction(ISD::UMULO, VT, Expand);
+ // ADDCARRY operations default to expand
+ setOperationAction(ISD::ADDCARRY, VT, Expand);
+ setOperationAction(ISD::SUBCARRY, VT, Expand);
+ setOperationAction(ISD::SETCCCARRY, VT, Expand);
+
// These default to Expand so they will be expanded to CTLZ/CTTZ by default.
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
@@ -918,6 +995,7 @@ void TargetLoweringBase::initActions() {
// These library functions default to expand.
setOperationAction(ISD::FROUND, VT, Expand);
+ setOperationAction(ISD::FPOWI, VT, Expand);
// These operations default to expand for vector types.
if (VT.isVector()) {
@@ -1184,12 +1262,11 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
/// isLegalRC - Return true if the value types that can be represented by the
/// specified register class are all legal.
-bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const {
- for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
- I != E; ++I) {
+bool TargetLoweringBase::isLegalRC(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass &RC) const {
+ for (auto I = TRI.legalclasstypes_begin(RC); *I != MVT::Other; ++I)
if (isTypeLegal(*I))
return true;
- }
return false;
}
@@ -1227,7 +1304,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
// Copy operands before the frame-index.
for (unsigned i = 0; i < OperIdx; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
// Add frame index operands recognized by stackmaps.cpp
if (MFI.isStatepointSpillSlotObjectIndex(FI)) {
// indirect-mem-ref tag, size, #FI, offset.
@@ -1237,18 +1314,18 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity");
MIB.addImm(StackMaps::IndirectMemRefOp);
MIB.addImm(MFI.getObjectSize(FI));
- MIB.addOperand(MI->getOperand(OperIdx));
+ MIB.add(MI->getOperand(OperIdx));
MIB.addImm(0);
} else {
// direct-mem-ref tag, #FI, offset.
// Used by patchpoint, and direct alloca arguments to statepoints
MIB.addImm(StackMaps::DirectMemRefOp);
- MIB.addOperand(MI->getOperand(OperIdx));
+ MIB.add(MI->getOperand(OperIdx));
MIB.addImm(0);
}
// Copy the operands after the frame index.
for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
// Inherit previous memory operands.
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -1296,12 +1373,12 @@ TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
// Find the first legal register class with the largest spill size.
const TargetRegisterClass *BestRC = RC;
- for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) {
+ for (unsigned i : SuperRegRC.set_bits()) {
const TargetRegisterClass *SuperRC = TRI->getRegClass(i);
// We want the largest possible spill size.
- if (SuperRC->getSize() <= BestRC->getSize())
+ if (TRI->getSpillSize(*SuperRC) <= TRI->getSpillSize(*BestRC))
continue;
- if (!isLegalRC(SuperRC))
+ if (!isLegalRC(*TRI, *SuperRC))
continue;
BestRC = SuperRC;
}
@@ -1437,6 +1514,7 @@ void TargetLoweringBase::computeRegisterProperties(
}
if (IsLegalWiderType)
break;
+ LLVM_FALLTHROUGH;
}
case TypeWidenVector: {
// Try to widen the vector.
@@ -1454,6 +1532,7 @@ void TargetLoweringBase::computeRegisterProperties(
}
if (IsLegalWiderType)
break;
+ LLVM_FALLTHROUGH;
}
case TypeSplitVector:
case TypeScalarizeVector: {
@@ -1589,7 +1668,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT
/// type of the given function. This does not require a DAG or a return value,
/// and is suitable for use before any DAGs for the function are constructed.
/// TODO: Move this out of TargetLowering.cpp.
-void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
+void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
const TargetLowering &TLI, const DataLayout &DL) {
SmallVector<EVT, 4> ValueVTs;
@@ -1601,9 +1680,9 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
EVT VT = ValueVTs[j];
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+ else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
// FIXME: C calling convention requires the return type to be promoted to
@@ -1616,18 +1695,20 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
VT = MinVT;
}
- unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT);
- MVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT);
+ unsigned NumParts =
+ TLI.getNumRegistersForCallingConv(ReturnType->getContext(), VT);
+ MVT PartVT =
+ TLI.getRegisterTypeForCallingConv(ReturnType->getContext(), VT);
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
- if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg))
+ if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg))
Flags.setInReg();
// Propagate extension type if any
- if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
Flags.setSExt();
- else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+ else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
Flags.setZExt();
for (unsigned i = 0; i < NumParts; ++i)
@@ -1818,7 +1899,7 @@ Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Type *StackPtrTy = Type::getInt8PtrTy(M->getContext());
Value *Fn = M->getOrInsertFunction("__safestack_pointer_address",
- StackPtrTy->getPointerTo(0), nullptr);
+ StackPtrTy->getPointerTo(0));
return IRB.CreateCall(Fn);
}
@@ -1902,6 +1983,10 @@ void TargetLoweringBase::setMinimumJumpTableEntries(unsigned Val) {
MinimumJumpTableEntries = Val;
}
+unsigned TargetLoweringBase::getMinimumJumpTableDensity(bool OptForSize) const {
+ return OptForSize ? OptsizeJumpTableDensity : JumpTableDensity;
+}
+
unsigned TargetLoweringBase::getMaximumJumpTableSize() const {
return MaximumJumpTableSize;
}
@@ -1918,11 +2003,7 @@ void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {
/// override the target defaults.
static StringRef getRecipEstimateForFunc(MachineFunction &MF) {
const Function *F = MF.getFunction();
- StringRef RecipAttrName = "reciprocal-estimates";
- if (!F->hasFnAttribute(RecipAttrName))
- return StringRef();
-
- return F->getFnAttribute(RecipAttrName).getValueAsString();
+ return F->getFnAttribute("reciprocal-estimates").getValueAsString();
}
/// Construct a string for the given reciprocal operation of the given type.
@@ -2097,3 +2178,7 @@ int TargetLoweringBase::getDivRefinementSteps(EVT VT,
MachineFunction &MF) const {
return getOpRefinementSteps(false, VT, getRecipEstimateForFunc(MF));
}
+
+void TargetLoweringBase::finalizeLowering(MachineFunction &MF) const {
+ MF.getRegInfo().freezeReservedRegs(MF);
+}
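Note: the three element-atomic lookup functions above share one shape; the element size in bytes selects among the _1/_2/_4/_8/_16 variants, and any other size falls back to UNKNOWN_LIBCALL, which callers must handle. A usage sketch:

#include "llvm/CodeGen/RuntimeLibcalls.h"
#include <cstdint>
using namespace llvm;

static RTLIB::Libcall pickMemcpyLibcall(uint64_t ElementSize) {
  // For ElementSize == 4 this yields MEMCPY_ELEMENT_UNORDERED_ATOMIC_4,
  // i.e. "__llvm_memcpy_element_unordered_atomic_4"; sizes outside
  // {1,2,4,8,16} yield RTLIB::UNKNOWN_LIBCALL.
  return RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElementSize);
}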
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index eb2a28f..6922e33 100644
--- a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===//
+//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,49 +14,109 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <string>
+
using namespace llvm;
using namespace dwarf;
+static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags,
+ StringRef &Section) {
+ SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+ M.getModuleFlagsMetadata(ModuleFlags);
+
+ for (const auto &MFE: ModuleFlags) {
+ // Ignore flags with 'Require' behaviour.
+ if (MFE.Behavior == Module::Require)
+ continue;
+
+ StringRef Key = MFE.Key->getString();
+ if (Key == "Objective-C Image Info Version") {
+ Version = mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue();
+ } else if (Key == "Objective-C Garbage Collection" ||
+ Key == "Objective-C GC Only" ||
+ Key == "Objective-C Is Simulated" ||
+ Key == "Objective-C Class Properties" ||
+ Key == "Objective-C Image Swift Version") {
+ Flags |= mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue();
+ } else if (Key == "Objective-C Image Info Section") {
+ Section = cast<MDString>(MFE.Val)->getString();
+ }
+ }
+}
+
//===----------------------------------------------------------------------===//
// ELF
//===----------------------------------------------------------------------===//
+void TargetLoweringObjectFileELF::emitModuleMetadata(
+ MCStreamer &Streamer, Module &M, const TargetMachine &TM) const {
+ unsigned Version = 0;
+ unsigned Flags = 0;
+ StringRef Section;
+
+ GetObjCImageInfo(M, Version, Flags, Section);
+ if (Section.empty())
+ return;
+
+ auto &C = getContext();
+ auto *S = C.getELFSection(Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Streamer.SwitchSection(S);
+ Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
+ Streamer.EmitIntValue(Version, 4);
+ Streamer.EmitIntValue(Flags, 4);
+ Streamer.AddBlankLine();
+}
+
MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(
const GlobalValue *GV, const TargetMachine &TM,
MachineModuleInfo *MMI) const {
unsigned Encoding = getPersonalityEncoding();
- if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect)
+ if ((Encoding & 0x80) == DW_EH_PE_indirect)
return getContext().getOrCreateSymbol(StringRef("DW.ref.") +
TM.getSymbol(GV)->getName());
- if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr)
+ if ((Encoding & 0x70) == DW_EH_PE_absptr)
return TM.getSymbol(GV);
report_fatal_error("We do not support this DWARF encoding yet!");
}
@@ -86,8 +146,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(
const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
MachineModuleInfo *MMI, MCStreamer &Streamer) const {
-
- if (Encoding & dwarf::DW_EH_PE_indirect) {
+ if (Encoding & DW_EH_PE_indirect) {
MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM);
@@ -102,7 +161,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
return TargetLoweringObjectFile::
getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
- Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+ Encoding & ~DW_EH_PE_indirect, Streamer);
}
return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
@@ -117,8 +176,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
// section(".eh_frame") gcc will produce:
//
// .section .eh_frame,"a",@progbits
-
- if (Name == getInstrProfCoverageSectionName(false))
+
+ if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF,
+ /*AddSegmentInfo=*/false))
return SectionKind::getMetadata();
if (Name.empty() || Name[0] != '.') return K;
@@ -149,7 +209,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
return K;
}
-
static unsigned getELFSectionType(StringRef Name, SectionKind K) {
// Use SHT_NOTE for section whose name starts with ".note" to allow
// emitting ELF notes from C variable declaration.
@@ -211,10 +270,47 @@ static const Comdat *getELFComdat(const GlobalValue *GV) {
return C;
}
+static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
+ const TargetMachine &TM) {
+ MDNode *MD = GO->getMetadata(LLVMContext::MD_associated);
+ if (!MD)
+ return nullptr;
+
+ const MDOperand &Op = MD->getOperand(0);
+ if (!Op.get())
+ return nullptr;
+
+ auto *VM = dyn_cast<ValueAsMetadata>(Op);
+ if (!VM)
+ report_fatal_error("MD_associated operand is not ValueAsMetadata");
+
+ GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue());
+ return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
+}
+
MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
StringRef SectionName = GO->getSection();
+ // Check if '#pragma clang section' name is applicable.
+ // Note that pragma directive overrides -ffunction-section, -fdata-section
+ // and so section name is exactly as user specified and not uniqued.
+ const GlobalVariable *GV = dyn_cast<GlobalVariable>(GO);
+ if (GV && GV->hasImplicitSection()) {
+ auto Attrs = GV->getAttributes();
+ if (Attrs.hasAttribute("bss-section") && Kind.isBSS()) {
+ SectionName = Attrs.getAttribute("bss-section").getValueAsString();
+ } else if (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly()) {
+ SectionName = Attrs.getAttribute("rodata-section").getValueAsString();
+ } else if (Attrs.hasAttribute("data-section") && Kind.isData()) {
+ SectionName = Attrs.getAttribute("data-section").getValueAsString();
+ }
+ }
+ const Function *F = dyn_cast<Function>(GO);
+ if (F && F->hasFnAttribute("implicit-section-name")) {
+ SectionName = F->getFnAttribute("implicit-section-name").getValueAsString();
+ }
+
// Infer section flags from the section name if we can.
Kind = getELFKindForNamedSection(SectionName, Kind);
@@ -224,9 +320,23 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
Group = C->getName();
Flags |= ELF::SHF_GROUP;
}
- return getContext().getELFSection(SectionName,
- getELFSectionType(SectionName, Kind), Flags,
- /*EntrySize=*/0, Group);
+
+ // A section can have at most one associated section. Put each global with
+ // MD_associated in a unique section.
+ unsigned UniqueID = MCContext::GenericSectionID;
+ const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
+ if (AssociatedSymbol) {
+ UniqueID = NextUniqueID++;
+ Flags |= ELF::SHF_LINK_ORDER;
+ }
+
+ MCSectionELF *Section = getContext().getELFSection(
+ SectionName, getELFSectionType(SectionName, Kind), Flags,
+ /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
+ // Make sure that we did not get some other section with incompatible sh_link.
+ // This should not be possible due to UniqueID code above.
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+ return Section;
}
/// Return the section prefix name used by options FunctionsSections and
@@ -248,11 +358,10 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
return ".data.rel.ro";
}
-static MCSectionELF *
-selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
- SectionKind Kind, Mangler &Mang,
- const TargetMachine &TM, bool EmitUniqueSection,
- unsigned Flags, unsigned *NextUniqueID) {
+static MCSectionELF *selectELFSectionForGlobal(
+ MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
+ unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) {
unsigned EntrySize = 0;
if (Kind.isMergeableCString()) {
if (Kind.isMergeable2ByteCString()) {
@@ -319,7 +428,7 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
if (Kind.isExecuteOnly())
UniqueID = 0;
return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags,
- EntrySize, Group, UniqueID);
+ EntrySize, Group, UniqueID, AssociatedSymbol);
}
MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
@@ -337,8 +446,17 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
}
EmitUniqueSection |= GO->hasComdat();
- return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM,
- EmitUniqueSection, Flags, &NextUniqueID);
+ const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
+ if (AssociatedSymbol) {
+ EmitUniqueSection = true;
+ Flags |= ELF::SHF_LINK_ORDER;
+ }
+
+ MCSectionELF *Section = selectELFSectionForGlobal(
+ getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags,
+ &NextUniqueID, AssociatedSymbol);
+ assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+ return Section;
}
MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
@@ -351,8 +469,9 @@ MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
return ReadOnlySection;
return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(),
- getMangler(), TM, EmitUniqueSection, ELF::SHF_ALLOC,
- &NextUniqueID);
+ getMangler(), TM, EmitUniqueSection,
+ ELF::SHF_ALLOC, &NextUniqueID,
+ /* AssociatedSymbol */ nullptr);
}
bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
@@ -500,40 +619,10 @@ void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
}
}
-/// emitModuleFlags - Perform code emission for module flags.
-void TargetLoweringObjectFileMachO::emitModuleFlags(
- MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
- const TargetMachine &TM) const {
- unsigned VersionVal = 0;
- unsigned ImageInfoFlags = 0;
- MDNode *LinkerOptions = nullptr;
- StringRef SectionVal;
-
- for (const auto &MFE : ModuleFlags) {
- // Ignore flags with 'Require' behavior.
- if (MFE.Behavior == Module::Require)
- continue;
-
- StringRef Key = MFE.Key->getString();
- Metadata *Val = MFE.Val;
-
- if (Key == "Objective-C Image Info Version") {
- VersionVal = mdconst::extract<ConstantInt>(Val)->getZExtValue();
- } else if (Key == "Objective-C Garbage Collection" ||
- Key == "Objective-C GC Only" ||
- Key == "Objective-C Is Simulated" ||
- Key == "Objective-C Class Properties" ||
- Key == "Objective-C Image Swift Version") {
- ImageInfoFlags |= mdconst::extract<ConstantInt>(Val)->getZExtValue();
- } else if (Key == "Objective-C Image Info Section") {
- SectionVal = cast<MDString>(Val)->getString();
- } else if (Key == "Linker Options") {
- LinkerOptions = cast<MDNode>(Val);
- }
- }
-
+void TargetLoweringObjectFileMachO::emitModuleMetadata(
+ MCStreamer &Streamer, Module &M, const TargetMachine &TM) const {
// Emit the linker options if present.
- if (LinkerOptions) {
+ if (auto *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
for (const auto &Option : LinkerOptions->operands()) {
SmallVector<std::string, 4> StrOptions;
for (const auto &Piece : cast<MDNode>(Option)->operands())
@@ -542,8 +631,15 @@ void TargetLoweringObjectFileMachO::emitModuleFlags(
}
}
+ unsigned VersionVal = 0;
+ unsigned ImageInfoFlags = 0;
+ StringRef SectionVal;
+
+ GetObjCImageInfo(M, VersionVal, ImageInfoFlags, SectionVal);
+
// The section is mandatory. If we don't have it, then we don't have GC info.
- if (SectionVal.empty()) return;
+ if (SectionVal.empty())
+ return;
StringRef Segment, Section;
unsigned TAA = 0, StubSize = 0;
@@ -723,7 +819,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(
return TargetLoweringObjectFile::
getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
- Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+ Encoding & ~DW_EH_PE_indirect, Streamer);
}
return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
@@ -1055,18 +1151,9 @@ MCSection *TargetLoweringObjectFileCOFF::getSectionForJumpTable(
COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
}
-void TargetLoweringObjectFileCOFF::emitModuleFlags(
- MCStreamer &Streamer, ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
- const TargetMachine &TM) const {
- MDNode *LinkerOptions = nullptr;
-
- for (const auto &MFE : ModuleFlags) {
- StringRef Key = MFE.Key->getString();
- if (Key == "Linker Options")
- LinkerOptions = cast<MDNode>(MFE.Val);
- }
-
- if (LinkerOptions) {
+void TargetLoweringObjectFileCOFF::emitModuleMetadata(
+ MCStreamer &Streamer, Module &M, const TargetMachine &TM) const {
+ if (NamedMDNode *LinkerOptions = M.getNamedMetadata("llvm.linker.options")) {
// Emit the linker options to the linker .drectve section. According to the
// spec, this section is a space-separated string containing flags for the
// linker.
@@ -1081,6 +1168,24 @@ void TargetLoweringObjectFileCOFF::emitModuleFlags(
}
}
}
+
+ unsigned Version = 0;
+ unsigned Flags = 0;
+ StringRef Section;
+
+ GetObjCImageInfo(M, Version, Flags, Section);
+ if (Section.empty())
+ return;
+
+ auto &C = getContext();
+ auto *S = C.getCOFFSection(
+ Section, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ Streamer.SwitchSection(S);
+ Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
+ Streamer.EmitIntValue(Version, 4);
+ Streamer.EmitIntValue(Flags, 4);
+ Streamer.AddBlankLine();
}
void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx,
@@ -1122,33 +1227,110 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal(
raw_ostream &OS, const GlobalValue *GV) const {
- if (!GV->hasDLLExportStorageClass() || GV->isDeclaration())
- return;
+ emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler());
+}
- const Triple &TT = getTargetTriple();
+//===----------------------------------------------------------------------===//
+// Wasm
+//===----------------------------------------------------------------------===//
- if (TT.isKnownWindowsMSVCEnvironment())
- OS << " /EXPORT:";
- else
- OS << " -export:";
-
- if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) {
- std::string Flag;
- raw_string_ostream FlagOS(Flag);
- getMangler().getNameWithPrefix(FlagOS, GV, false);
- FlagOS.flush();
- if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix())
- OS << Flag.substr(1);
- else
- OS << Flag;
- } else {
- getMangler().getNameWithPrefix(OS, GV, false);
+static const Comdat *getWasmComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return nullptr;
+
+ if (C->getSelectionKind() != Comdat::Any)
+ report_fatal_error("Wasm COMDATs only support SelectionKind::Any, '" +
+ C->getName() + "' cannot be lowered.");
+
+ return C;
+}
+
+MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ llvm_unreachable("getExplicitSectionGlobal not yet implemented");
+ return nullptr;
+}
+
+static MCSectionWasm *
+selectWasmSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool EmitUniqueSection,
+ unsigned Flags, unsigned *NextUniqueID) {
+ StringRef Group = "";
+ if (getWasmComdat(GO))
+ llvm_unreachable("comdat not yet supported for wasm");
+
+ bool UniqueSectionNames = TM.getUniqueSectionNames();
+ SmallString<128> Name = getSectionPrefixForGlobal(Kind);
+
+ if (const auto *F = dyn_cast<Function>(GO)) {
+ const auto &OptionalPrefix = F->getSectionPrefix();
+ if (OptionalPrefix)
+ Name += *OptionalPrefix;
}
- if (!GV->getValueType()->isFunctionTy()) {
- if (TT.isKnownWindowsMSVCEnvironment())
- OS << ",DATA";
- else
- OS << ",data";
+ if (EmitUniqueSection && UniqueSectionNames) {
+ Name.push_back('.');
+ TM.getNameWithPrefix(Name, GO, Mang, true);
+ }
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniqueSection && !UniqueSectionNames) {
+ UniqueID = *NextUniqueID;
+ (*NextUniqueID)++;
}
+ return Ctx.getWasmSection(Name, /*Type=*/0, Flags,
+ Group, UniqueID);
+}
+
+MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+
+ if (Kind.isCommon())
+ report_fatal_error("mergable sections not supported yet on wasm");
+
+ // If we have -ffunction-sections or -fdata-sections then we should emit
+ // the global value to a uniqued section specifically for it.
+ bool EmitUniqueSection = false;
+ if (Kind.isText())
+ EmitUniqueSection = TM.getFunctionSections();
+ else
+ EmitUniqueSection = TM.getDataSections();
+ EmitUniqueSection |= GO->hasComdat();
+
+ return selectWasmSectionForGlobal(getContext(), GO, Kind, getMangler(), TM,
+ EmitUniqueSection, /*Flags=*/0,
+ &NextUniqueID);
+}
+
+bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ // We can always create relative relocations, so use another section
+ // that can be marked non-executable.
+ return false;
+}
+
+const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ // We may only use a PLT-relative relocation to refer to unnamed_addr
+ // functions.
+ if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy())
+ return nullptr;
+
+ // Basic sanity checks.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() ||
+ RHS->isThreadLocal())
+ return nullptr;
+
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None,
+ getContext()),
+ MCSymbolRefExpr::create(TM.getSymbol(RHS), getContext()), getContext());
+}
+
+void TargetLoweringObjectFileWasm::InitializeWasm() {
+ // TODO: Initialize StaticCtorSection and StaticDtorSection.
}
diff --git a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp
index b6da8e0..ed845e1 100644
--- a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
@@ -34,14 +34,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
return false;
}
-/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option
-/// is specified on the command line. When this flag is off(default), the
-/// code generator is not allowed to generate mad (multiply add) if the
-/// result is "less precise" than doing those operations individually.
-bool TargetOptions::LessPreciseFPMAD() const {
- return UnsafeFPMath || LessPreciseFPMADOption;
-}
-
/// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
/// that the rounding mode of the FPU can change from its default.
bool TargetOptions::HonorSignDependentRoundingFPMath() const {
diff --git a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
index e7ea2b4..817e58c 100644
--- a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1,4 +1,4 @@
-//===-- TargetPassConfig.cpp - Target independent code generation passes --===//
+//===- TargetPassConfig.cpp - Target independent code generation passes ---===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,28 +13,37 @@
//===---------------------------------------------------------------------===//
#include "llvm/CodeGen/TargetPassConfig.h"
-
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
-#include "llvm/CodeGen/RegisterUsageInfo.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
+#include <cassert>
+#include <string>
using namespace llvm;
@@ -92,6 +101,19 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
cl::desc("Verify generated machine code"),
cl::init(false),
cl::ZeroOrMore);
+static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner",
+ cl::Hidden,
+ cl::desc("Enable machine outliner"));
+// Enable or disable FastISel. Both options are needed, because
+// FastISel is enabled by default with -fast, and we wish to be
+// able to enable or disable fast-isel independently from -O0.
+static cl::opt<cl::boolOrDefault>
+EnableFastISelOption("fast-isel", cl::Hidden,
+ cl::desc("Enable the \"fast\" instruction selector"));
+
+static cl::opt<cl::boolOrDefault>
+ EnableGlobalISel("global-isel", cl::Hidden,
+ cl::desc("Enable the \"global\" instruction selector"));
static cl::opt<std::string>
PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
@@ -211,6 +233,7 @@ char TargetPassConfig::EarlyTailDuplicateID = 0;
char TargetPassConfig::PostRAMachineLICMID = 0;
namespace {
+
struct InsertedPass {
AnalysisID TargetPassID;
IdentifyingPassPtr InsertedPassID;
@@ -231,9 +254,11 @@ struct InsertedPass {
return NP;
}
};
-}
+
+} // end anonymous namespace
namespace llvm {
+
class PassConfigImpl {
public:
// List of passes explicitly substituted by this target. Normally this is
@@ -249,7 +274,8 @@ public:
/// is inserted after each instance of the first one.
SmallVector<InsertedPass, 4> InsertedPasses;
};
-} // namespace llvm
+
+} // end namespace llvm
// Out of line virtual method.
TargetPassConfig::~TargetPassConfig() {
@@ -258,11 +284,8 @@ TargetPassConfig::~TargetPassConfig() {
// Out of line constructor provides default values for pass options and
// registers all common codegen passes.
-TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
- : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false),
- AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false),
- DisableVerify(false), EnableTailMerge(true) {
-
+TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm)
+ : ImmutablePass(ID), PM(&pm), TM(&TM) {
Impl = new PassConfigImpl();
// Register all target independent codegen passes to activate their PassIDs,
@@ -278,7 +301,10 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
substitutePass(&PostRAMachineLICMID, &MachineLICMID);
if (StringRef(PrintMachineInstrs.getValue()).equals(""))
- TM->Options.PrintMachineCode = true;
+ TM.Options.PrintMachineCode = true;
+
+ if (TM.Options.EnableIPRA)
+ setRequiresCodeGenSCCOrder();
}
CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
@@ -303,12 +329,14 @@ void TargetPassConfig::insertPass(AnalysisID TargetPassID,
///
/// Targets may override this to extend TargetPassConfig.
TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new TargetPassConfig(this, PM);
+ return new TargetPassConfig(*this, PM);
}
TargetPassConfig::TargetPassConfig()
- : ImmutablePass(ID), PM(nullptr) {
- llvm_unreachable("TargetPassConfig should not be constructed on-the-fly");
+ : ImmutablePass(ID) {
+ report_fatal_error("Trying to construct TargetPassConfig without a target "
+ "machine. Scheduling a CodeGen pass without a target "
+ "triple set?");
}
// Helper to verify the analysis is really immutable.
@@ -421,7 +449,12 @@ void TargetPassConfig::addPrintPass(const std::string &Banner) {
}
void TargetPassConfig::addVerifyPass(const std::string &Banner) {
- if (VerifyMachineCode)
+ bool Verify = VerifyMachineCode;
+#ifdef EXPENSIVE_CHECKS
+ if (VerifyMachineCode == cl::BOU_UNSET)
+ Verify = TM->isMachineVerifierClean();
+#endif
+ if (Verify)
PM->add(createMachineVerifierPass(Banner));
}
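With this change, leaving -verify-machineinstrs unset defers to the target in EXPENSIVE_CHECKS builds. A target with known machine-verifier failures would opt out with an override along these lines (a hypothetical example, not code from this patch):

bool MyTargetMachine::isMachineVerifierClean() const {
  // Known verifier failures remain; skip the implicit verifier runs
  // that EXPENSIVE_CHECKS builds would otherwise add.
  return false;
}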
@@ -480,6 +513,14 @@ void TargetPassConfig::addIRPasses() {
// Insert calls to mcount-like functions.
addPass(createCountingFunctionInserterPass());
+
+ // Add scalarization of target's unsupported masked memory intrinsics pass.
+ // the unsupported intrinsic will be replaced with a chain of basic blocks,
+ // that stores/loads element one-by-one if the appropriate mask bit is set.
+ addPass(createScalarizeMaskedMemIntrinPass());
+
+ // Expand reduction intrinsics into shuffle sequences if the target wants to.
+ addPass(createExpandReductionsPass());
}
/// Turn exception handling constructs into something the code generators can
@@ -499,14 +540,14 @@ void TargetPassConfig::addPassesToHandleExceptions() {
LLVM_FALLTHROUGH;
case ExceptionHandling::DwarfCFI:
case ExceptionHandling::ARM:
- addPass(createDwarfEHPass(TM));
+ addPass(createDwarfEHPass());
break;
case ExceptionHandling::WinEH:
// We support using both GCC-style and MSVC-style exceptions on Windows, so
// add both preparation passes. Each pass will only actually run if it
// recognizes the personality function.
- addPass(createWinEHPass(TM));
- addPass(createDwarfEHPass(TM));
+ addPass(createWinEHPass());
+ addPass(createDwarfEHPass());
break;
case ExceptionHandling::None:
addPass(createLowerInvokePass());
@@ -521,7 +562,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
/// before exception handling preparation passes.
void TargetPassConfig::addCodeGenPrepare() {
if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
- addPass(createCodeGenPreparePass(TM));
+ addPass(createCodeGenPreparePass());
addPass(createRewriteSymbolsPass());
}
@@ -531,13 +572,13 @@ void TargetPassConfig::addISelPrepare() {
addPreISel();
// Force codegen to run according to the callgraph.
- if (TM->Options.EnableIPRA)
+ if (requiresCodeGenSCCOrder())
addPass(new DummyCGSCCPass);
// Add both the safe stack and the stack protection passes: each of them will
// only protect functions that have corresponding attributes.
- addPass(createSafeStackPass(TM));
- addPass(createStackProtectorPass(TM));
+ addPass(createSafeStackPass());
+ addPass(createStackProtectorPass());
if (PrintISelInput)
addPass(createPrintFunctionPass(
@@ -549,6 +590,74 @@ void TargetPassConfig::addISelPrepare() {
addPass(createVerifierPass());
}
+bool TargetPassConfig::addCoreISelPasses() {
+ // Enable FastISel with -fast, but allow that to be overridden.
+ TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
+ if (EnableFastISelOption == cl::BOU_TRUE ||
+ (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel()))
+ TM->setFastISel(true);
+
+ // Ask the target for an isel.
+ // Enable GlobalISel if the target wants to, but allow that to be overridden.
+ if (EnableGlobalISel == cl::BOU_TRUE ||
+ (EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled())) {
+ if (addIRTranslator())
+ return true;
+
+ addPreLegalizeMachineIR();
+
+ if (addLegalizeMachineIR())
+ return true;
+
+ // Before running the register bank selector, ask the target if it
+ // wants to run some passes.
+ addPreRegBankSelect();
+
+ if (addRegBankSelect())
+ return true;
+
+ addPreGlobalInstructionSelect();
+
+ if (addGlobalInstructionSelect())
+ return true;
+
+ // Pass to reset the MachineFunction if the ISel failed.
+ addPass(createResetMachineFunctionPass(
+ reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));
+
+ // Provide a fallback path when we do not want to abort on
+ // not-yet-supported input.
+ if (!isGlobalISelAbortEnabled() && addInstSelector())
+ return true;
+
+ } else if (addInstSelector())
+ return true;
+
+ return false;
+}
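Targets opt into this GlobalISel pipeline by overriding the isGlobalISelEnabled() hook defined later in this patch (it returns false by default). The class below is a hypothetical illustration, not code from this change:

class MyTargetPassConfig : public TargetPassConfig {
public:
  MyTargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}
  // Select the GlobalISel path whenever -global-isel is left unset.
  bool isGlobalISelEnabled() const override { return true; }
};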
+
+bool TargetPassConfig::addISelPasses() {
+ if (TM->Options.EmulatedTLS)
+ addPass(createLowerEmuTLSPass());
+
+ addPass(createPreISelIntrinsicLoweringPass());
+ addPass(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+ addIRPasses();
+ addCodeGenPrepare();
+ addPassesToHandleExceptions();
+ addISelPrepare();
+
+ return addCoreISelPasses();
+}
+
+/// -regalloc=... command line option.
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<RegisterRegAlloc> >
+RegAlloc("regalloc",
+ cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use"));
+
/// Add the complete set of target-independent postISel code generator passes.
///
/// This can be read as the standard order of major LLVM CodeGen stages. Stages
@@ -607,8 +716,12 @@ void TargetPassConfig::addMachinePasses() {
// including phi elimination and scheduling.
if (getOptimizeRegAlloc())
addOptimizedRegAlloc(createRegAllocPass(true));
- else
+ else {
+ if (RegAlloc != &useDefaultRegisterAllocator &&
+ RegAlloc != &createFastRegisterAllocator)
+ report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc.");
addFastRegAlloc(createRegAllocPass(false));
+ }
// Run post-ra passes.
addPostRegAlloc();
@@ -620,7 +733,7 @@ void TargetPassConfig::addMachinePasses() {
// Prolog/Epilog inserter needs a TargetMachine to instantiate. But only
// do so if it hasn't been disabled, substituted, or overridden.
if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID))
- addPass(createPrologEpilogInserterPass(TM));
+ addPass(createPrologEpilogInserterPass());
/// Add passes that optimize machine instructions after register allocation.
if (getOptLevel() != CodeGenOpt::None)
@@ -668,9 +781,15 @@ void TargetPassConfig::addMachinePasses() {
addPass(&StackMapLivenessID, false);
addPass(&LiveDebugValuesID, false);
+ // Insert before XRay Instrumentation.
+ addPass(&FEntryInserterID, false);
+
addPass(&XRayInstrumentationID, false);
addPass(&PatchableFunctionID, false);
+ if (EnableMachineOutliner)
+ PM->add(createMachineOutlinerPass());
+
AddingMachinePasses = false;
}
@@ -704,6 +823,10 @@ void TargetPassConfig::addMachineSSAOptimization() {
addPass(&MachineLICMID, false);
addPass(&MachineCSEID, false);
+
+ // Coalesce basic blocks with the same branch condition
+ addPass(&BranchCoalescingID);
+
addPass(&MachineSinkingID);
addPass(&PeepholeOptimizerID);
@@ -730,20 +853,13 @@ MachinePassRegistry RegisterRegAlloc::Registry;
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
-LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag);
-static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;
+
static RegisterRegAlloc
defaultRegAlloc("default",
"pick register allocator based on -O option",
useDefaultRegisterAllocator);
-/// -regalloc=... command line option.
-static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
- RegisterPassParser<RegisterRegAlloc> >
-RegAlloc("regalloc",
- cl::init(&useDefaultRegisterAllocator),
- cl::desc("Register allocator to use"));
-
static void initializeDefaultRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
@@ -753,7 +869,6 @@ static void initializeDefaultRegisterAllocatorOnce() {
}
}
-
/// Instantiate the default register allocator pass for this target for either
/// the optimized or unoptimized allocation path. This will be added to the pass
/// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc
@@ -903,6 +1018,11 @@ void TargetPassConfig::addBlockPlacement() {
//===---------------------------------------------------------------------===//
/// GlobalISel Configuration
//===---------------------------------------------------------------------===//
+
+bool TargetPassConfig::isGlobalISelEnabled() const {
+ return false;
+}
+
bool TargetPassConfig::isGlobalISelAbortEnabled() const {
return EnableGlobalISelAbort == 1;
}
diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index cd50c5b..eeb00a7 100644
--- a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===- TargetRegisterInfo.cpp - Target Register Information Implementation ===//
+//==- TargetRegisterInfo.cpp - Target Register Information Implementation --==//
//
// The LLVM Compiler Infrastructure
//
@@ -11,17 +11,27 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Printable.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <utility>
#define DEBUG_TYPE "target-reg-info"
@@ -38,7 +48,7 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
CoveringLanes(SRICoveringLanes) {
}
-TargetRegisterInfo::~TargetRegisterInfo() {}
+TargetRegisterInfo::~TargetRegisterInfo() = default;
void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet, unsigned Reg)
const {
@@ -50,8 +60,7 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
ArrayRef<MCPhysReg> Exceptions) const {
// Check that all super registers of reserved regs are reserved as well.
BitVector Checked(getNumRegs());
- for (int Reg = RegisterSet.find_first(); Reg>=0;
- Reg = RegisterSet.find_next(Reg)) {
+ for (unsigned Reg : RegisterSet.set_bits()) {
if (Checked[Reg])
continue;
for (MCSuperRegIterator SR(Reg, this); SR.isValid(); ++SR) {
@@ -127,7 +136,7 @@ Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-} // End of llvm namespace
+} // end namespace llvm
/// getAllocatableClass - Return the maximal subclass of the given register
/// class that is alloctable, or NULL.
@@ -155,10 +164,9 @@ TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const {
// Pick the most sub register class of the right type that contains
// this physreg.
const TargetRegisterClass* BestRC = nullptr;
- for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){
- const TargetRegisterClass* RC = *I;
- if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
- (!BestRC || BestRC->hasSubClass(RC)))
+ for (const TargetRegisterClass* RC : regclasses()) {
+ if ((VT == MVT::Other || isTypeLegalForClass(*RC, VT)) &&
+ RC->contains(reg) && (!BestRC || BestRC->hasSubClass(RC)))
BestRC = RC;
}
@@ -185,10 +193,9 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
if (SubClass)
getAllocatableSetForRC(MF, SubClass, Allocatable);
} else {
- for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
- E = regclass_end(); I != E; ++I)
- if ((*I)->isAllocatable())
- getAllocatableSetForRC(MF, *I, Allocatable);
+ for (const TargetRegisterClass *C : regclasses())
+ if (C->isAllocatable())
+ getAllocatableSetForRC(MF, C, Allocatable);
}
// Mask out the reserved registers
@@ -209,7 +216,7 @@ const TargetRegisterClass *firstCommonClass(const uint32_t *A,
if (unsigned Common = *A++ & *B++) {
const TargetRegisterClass *RC =
TRI->getRegClass(I + countTrailingZeros(Common));
- if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT))
+ if (SVT == MVT::SimpleValueType::Any || TRI->isTypeLegalForClass(*RC, VT))
return RC;
}
return nullptr;
@@ -267,7 +274,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
const TargetRegisterClass *BestRC = nullptr;
unsigned *BestPreA = &PreA;
unsigned *BestPreB = &PreB;
- if (RCA->getSize() < RCB->getSize()) {
+ if (getRegSizeInBits(*RCA) < getRegSizeInBits(*RCB)) {
std::swap(RCA, RCB);
std::swap(SubA, SubB);
std::swap(BestPreA, BestPreB);
@@ -275,7 +282,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
// Also terminate the search one we have found a register class as small as
// RCA.
- unsigned MinSize = RCA->getSize();
+ unsigned MinSize = getRegSizeInBits(*RCA);
for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) {
unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA);
@@ -283,7 +290,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
// Check if a common super-register class exists for this index pair.
const TargetRegisterClass *RC =
firstCommonClass(IA.getMask(), IB.getMask(), this);
- if (!RC || RC->getSize() < MinSize)
+ if (!RC || getRegSizeInBits(*RC) < MinSize)
continue;
// The indexes must compose identically: PreA+SubA == PreB+SubB.
@@ -292,7 +299,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
continue;
// Is RC a better candidate than BestRC?
- if (BestRC && RC->getSize() >= BestRC->getSize())
+ if (BestRC && getRegSizeInBits(*RC) >= getRegSizeInBits(*BestRC))
continue;
// Yes, RC is the smallest super-register seen so far.
@@ -301,7 +308,7 @@ getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
*BestPreB = IB.getSubReg();
// Bail early if we reached MinSize. We won't find a better candidate.
- if (BestRC->getSize() == MinSize)
+ if (getRegSizeInBits(*BestRC) == MinSize)
return BestRC;
}
}
@@ -415,9 +422,9 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void
-TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
- const TargetRegisterInfo *TRI) {
+LLVM_DUMP_METHOD
+void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
+ const TargetRegisterInfo *TRI) {
dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n";
}
#endif
diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
index 83e52d3..9210ea8 100644
--- a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===//
+//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,11 +13,21 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -37,13 +47,14 @@ bool TargetSchedModel::hasInstrItineraries() const {
static unsigned gcd(unsigned Dividend, unsigned Divisor) {
// Dividend and Divisor will be naturally swapped as needed.
- while(Divisor) {
+ while (Divisor) {
unsigned Rem = Dividend % Divisor;
Dividend = Divisor;
Divisor = Rem;
}
return Dividend;
}
+
static unsigned lcm(unsigned A, unsigned B) {
unsigned LCM = (uint64_t(A) * B) / gcd(A, B);
assert((LCM >= A && LCM >= B) && "LCM overflow");
@@ -73,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm,
}
}
+/// Returns true only if the instruction is specified as single-issue.
+bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->BeginGroup;
+ }
+ return false;
+}
+
+bool TargetSchedModel::mustEndGroup(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->EndGroup;
+ }
+ return false;
+}
+
unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
const MCSchedClassDesc *SC) const {
if (hasInstrItineraries()) {
@@ -100,7 +134,6 @@ static unsigned capLatency(int Cycles) {
/// evaluation of predicates that depend on instruction operands or flags.
const MCSchedClassDesc *TargetSchedModel::
resolveSchedClass(const MachineInstr *MI) const {
-
// Get the definition's scheduling class descriptor from this machine model.
unsigned SchedClass = MI->getDesc().getSchedClass();
const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
@@ -244,7 +277,11 @@ unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const {
if (SCDesc->isValid() && !SCDesc->isVariant())
return computeInstrLatency(*SCDesc);
- llvm_unreachable("No MI sched latency");
+ if (SCDesc->isValid()) {
+ assert(!SCDesc->isVariant() && "No MI sched latency: SCDesc->isVariant()");
+ return computeInstrLatency(*SCDesc);
+ }
+ return 0;
}
unsigned
@@ -298,3 +335,68 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
}
return 0;
}
+
+static Optional<double>
+getRThroughputFromItineraries(unsigned schedClass,
+ const InstrItineraryData *IID) {
+ double Unknown = std::numeric_limits<double>::infinity();
+ double Throughput = Unknown;
+
+ for (const InstrStage *IS = IID->beginStage(schedClass),
+ *E = IID->endStage(schedClass);
+ IS != E; ++IS) {
+ unsigned Cycles = IS->getCycles();
+ if (!Cycles)
+ continue;
+ Throughput =
+ std::min(Throughput, countPopulation(IS->getUnits()) * 1.0 / Cycles);
+ }
+ // We need the reciprocal throughput, so return the inverse value.
+ return 1 / Throughput;
+}
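To make the arithmetic concrete: a hypothetical stage that occupies one of three identical units for one cycle can issue three such instructions per cycle, so the reciprocal throughput is 1/3 of a cycle per instruction. A self-contained sketch of the same computation:

#include <algorithm>
#include <limits>

double rthroughputSketch() {
  double Throughput = std::numeric_limits<double>::infinity();
  unsigned NumUnits = 3, Cycles = 1; // hypothetical itinerary stage
  Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles);
  return 1 / Throughput; // 0.33... cycles per instruction
}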
+
+static Optional<double>
+getRThroughputFromInstrSchedModel(const MCSchedClassDesc *SCDesc,
+ const TargetSubtargetInfo *STI,
+ const MCSchedModel &SchedModel) {
+ double Unknown = std::numeric_limits<double>::infinity();
+ double Throughput = Unknown;
+
+ for (const MCWriteProcResEntry *WPR = STI->getWriteProcResBegin(SCDesc),
+ *WEnd = STI->getWriteProcResEnd(SCDesc);
+ WPR != WEnd; ++WPR) {
+ unsigned Cycles = WPR->Cycles;
+ if (!Cycles)
+ return Optional<double>();
+
+ unsigned NumUnits =
+ SchedModel.getProcResource(WPR->ProcResourceIdx)->NumUnits;
+ Throughput = std::min(Throughput, NumUnits * 1.0 / Cycles);
+ }
+ // We need the reciprocal throughput, so return the inverse value.
+ return 1 / Throughput;
+}
+
+Optional<double>
+TargetSchedModel::computeInstrRThroughput(const MachineInstr *MI) const {
+ if (hasInstrItineraries())
+ return getRThroughputFromItineraries(MI->getDesc().getSchedClass(),
+ getInstrItineraries());
+ if (hasInstrSchedModel())
+ return getRThroughputFromInstrSchedModel(resolveSchedClass(MI), STI,
+ SchedModel);
+ return Optional<double>();
+}
+
+Optional<double>
+TargetSchedModel::computeInstrRThroughput(unsigned Opcode) const {
+ unsigned SchedClass = TII->get(Opcode).getSchedClass();
+ if (hasInstrItineraries())
+ return getRThroughputFromItineraries(SchedClass, getInstrItineraries());
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ if (SCDesc->isValid() && !SCDesc->isVariant())
+ return getRThroughputFromInstrSchedModel(SCDesc, STI, SchedModel);
+ }
+ return Optional<double>();
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
index c74707d..f6d5bc8 100644
--- a/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- TargetSubtargetInfo.cpp - General Target Information ---------------==//
+//===- TargetSubtargetInfo.cpp - General Target Information ----------------==//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,17 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
using namespace llvm;
-//---------------------------------------------------------------------------
-// TargetSubtargetInfo Class
-//
TargetSubtargetInfo::TargetSubtargetInfo(
const Triple &TT, StringRef CPU, StringRef FS,
ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetFeatureKV> PD,
@@ -26,7 +31,7 @@ TargetSubtargetInfo::TargetSubtargetInfo(
: MCSubtargetInfo(TT, CPU, FS, PF, PD, ProcSched, WPR, WL, RA, IS, OC, FP) {
}
-TargetSubtargetInfo::~TargetSubtargetInfo() {}
+TargetSubtargetInfo::~TargetSubtargetInfo() = default;
bool TargetSubtargetInfo::enableAtomicExpand() const {
return true;
@@ -52,3 +57,46 @@ bool TargetSubtargetInfo::enablePostRAScheduler() const {
bool TargetSubtargetInfo::useAA() const {
return false;
}
+
+static std::string createSchedInfoStr(unsigned Latency,
+ Optional<double> RThroughput) {
+ static const char *SchedPrefix = " sched: [";
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ if (Latency > 0 && RThroughput.hasValue())
+ CS << SchedPrefix << Latency << format(":%2.2f", RThroughput.getValue())
+ << "]";
+ else if (Latency > 0)
+ CS << SchedPrefix << Latency << ":?]";
+ else if (RThroughput.hasValue())
+ CS << SchedPrefix << "?:" << RThroughput.getValue() << "]";
+ CS.flush();
+ return Comment;
+}
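For a concrete sense of the output: Latency = 3 with RThroughput = 1.0 yields the comment " sched: [3:1.00]", while a missing throughput yields " sched: [3:?]". A hypothetical call (createSchedInfoStr is file-static, so this is illustrative only):

std::string S = createSchedInfoStr(/*Latency=*/3, llvm::Optional<double>(1.0));
// S == " sched: [3:1.00]"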
+
+/// Returns a string representation of the scheduler comment.
+std::string TargetSubtargetInfo::getSchedInfoStr(const MachineInstr &MI) const {
+ if (MI.isPseudo() || MI.isTerminator())
+ return std::string();
+ // We don't cache the TSchedModel because it depends on TargetInstrInfo,
+ // which could change during compilation.
+ TargetSchedModel TSchedModel;
+ TSchedModel.init(getSchedModel(), this, getInstrInfo());
+ unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+ Optional<double> RThroughput = TSchedModel.computeInstrRThroughput(&MI);
+ return createSchedInfoStr(Latency, RThroughput);
+}
+
+/// Returns a string representation of the scheduler comment.
+std::string TargetSubtargetInfo::getSchedInfoStr(MCInst const &MCI) const {
+ // We don't cache the TSchedModel because it depends on TargetInstrInfo,
+ // which could change during compilation.
+ TargetSchedModel TSchedModel;
+ TSchedModel.init(getSchedModel(), this, getInstrInfo());
+ if (!TSchedModel.hasInstrSchedModel())
+ return std::string();
+ unsigned Latency = TSchedModel.computeInstrLatency(MCI.getOpcode());
+ Optional<double> RThroughput =
+ TSchedModel.computeInstrRThroughput(MCI.getOpcode());
+ return createSchedInfoStr(Latency, RThroughput);
+}
diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 0f1b2ed..83c00e2 100644
--- a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -52,7 +52,7 @@
using namespace llvm;
-#define DEBUG_TYPE "twoaddrinstr"
+#define DEBUG_TYPE "twoaddressinstruction"
STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
STATISTIC(NumCommuted , "Number of instructions commuted to coalesce");
@@ -68,6 +68,13 @@ EnableRescheduling("twoaddr-reschedule",
cl::desc("Coalesce copies by rescheduling (default=true)"),
cl::init(true), cl::Hidden);
+// Limit the number of dataflow edges to traverse when evaluating the benefit
+// of commuting operands.
+static cl::opt<unsigned> MaxDataFlowEdge(
+ "dataflow-edge-limit", cl::Hidden, cl::init(3),
+ cl::desc("Maximum number of dataflow edges to traverse when evaluating "
+ "the benefit of commuting operands"));
+
namespace {
class TwoAddressInstructionPass : public MachineFunctionPass {
MachineFunction *MF;
@@ -155,7 +162,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addUsedIfAvailable<LiveVariables>();
AU.addPreserved<LiveVariables>();
AU.addPreserved<SlotIndexes>();
@@ -171,10 +178,10 @@ public:
} // end anonymous namespace
char TwoAddressInstructionPass::ID = 0;
-INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction",
+INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, DEBUG_TYPE,
"Two-Address instruction pass", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction",
+INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE,
"Two-Address instruction pass", false, false)
char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
@@ -637,10 +644,10 @@ isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
// To more generally minimize register copies, ideally the logic of two addr
// instruction pass should be integrated with register allocation pass where
// interference graph is available.
- if (isRevCopyChain(regC, regA, 3))
+ if (isRevCopyChain(regC, regA, MaxDataFlowEdge))
return true;
- if (isRevCopyChain(regB, regA, 3))
+ if (isRevCopyChain(regB, regA, MaxDataFlowEdge))
return false;
// Since there are no intervening uses for both registers, then commute
@@ -905,7 +912,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
++End;
}
- // Check if the reschedule will not break depedencies.
+ // Check if the reschedule will not break dependencies.
unsigned NumVisited = 0;
MachineBasicBlock::iterator KillPos = KillMI;
++KillPos;
@@ -1627,7 +1634,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
InstrItins = MF->getSubtarget().getInstrItineraryData();
LV = getAnalysisIfAvailable<LiveVariables>();
LIS = getAnalysisIfAvailable<LiveIntervals>();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ if (auto *AAPass = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAPass->getAAResults();
+ else
+ AA = nullptr;
OptLevel = TM.getOptLevel();
bool MadeChange = false;
@@ -1785,7 +1795,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
TII->get(TargetOpcode::COPY))
.addReg(DstReg, RegState::Define, SubIdx)
- .addOperand(UseMO);
+ .add(UseMO);
// The first def needs an <undef> flag because there is no live register
// before it.
diff --git a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp
index c2db56a..407fd9b 100644
--- a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -195,18 +196,31 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
}
if (phi->getNumOperands() == 3) {
- unsigned Input = phi->getOperand(1).getReg();
- unsigned Output = phi->getOperand(0).getReg();
-
- phi++->eraseFromParent();
+ const MachineOperand &Input = phi->getOperand(1);
+ const MachineOperand &Output = phi->getOperand(0);
+ unsigned InputReg = Input.getReg();
+ unsigned OutputReg = Output.getReg();
+ assert(Output.getSubReg() == 0 && "Cannot have output subregister");
ModifiedPHI = true;
- if (Input != Output) {
+ if (InputReg != OutputReg) {
MachineRegisterInfo &MRI = F.getRegInfo();
- MRI.constrainRegClass(Input, MRI.getRegClass(Output));
- MRI.replaceRegWith(Output, Input);
+ unsigned InputSub = Input.getSubReg();
+ if (InputSub == 0 &&
+ MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) {
+ MRI.replaceRegWith(OutputReg, InputReg);
+ } else {
+ // The input register to the PHI has a subregister or it can't be
+ // constrained to the proper register class. Insert a COPY instead
+ // of simply replacing the output with the input.
+ const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo();
+ BuildMI(*BB, BB->getFirstNonPHI(), phi->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), OutputReg)
+ .addReg(InputReg, getRegState(Input), InputSub);
+ }
+ phi++->eraseFromParent();
}
-
continue;
}
diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
index 0d506d6..f8aacdb 100644
--- a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -72,9 +72,21 @@ void VirtRegMap::grow() {
Virt2SplitMap.resize(NumRegs);
}
+void VirtRegMap::assignVirt2Phys(unsigned virtReg, MCPhysReg physReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg) &&
+ TargetRegisterInfo::isPhysicalRegister(physReg));
+ assert(Virt2PhysMap[virtReg] == NO_PHYS_REG &&
+ "attempt to assign physical register to already mapped "
+ "virtual register");
+ assert(!getRegInfo().isReserved(physReg) &&
+ "Attempt to map virtReg to a reserved physReg");
+ Virt2PhysMap[virtReg] = physReg;
+}
+
unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
- int SS = MF->getFrameInfo().CreateSpillStackObject(RC->getSize(),
- RC->getAlignment());
+ unsigned Size = TRI->getSpillSize(*RC);
+ unsigned Align = TRI->getSpillAlignment(*RC);
+ int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Align);
++NumSpillSlots;
return SS;
}
@@ -167,6 +179,8 @@ class VirtRegRewriter : public MachineFunctionPass {
bool readsUndefSubreg(const MachineOperand &MO) const;
void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
void handleIdentityCopy(MachineInstr &MI) const;
+ void expandCopyBundle(MachineInstr &MI) const;
+ bool subRegLiveThrough(const MachineInstr &MI, unsigned SuperPhysReg) const;
public:
static char ID;
@@ -367,11 +381,67 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
}
if (Indexes)
- Indexes->removeMachineInstrFromMaps(MI);
- MI.eraseFromParent();
+ Indexes->removeSingleMachineInstrFromMaps(MI);
+ MI.eraseFromBundle();
DEBUG(dbgs() << " deleted.\n");
}
+/// The live-range splitting logic sometimes produces bundles of copies when
+/// subregisters are involved. Expand these into a sequence of copy
+/// instructions after processing the last one in the bundle. Does not update
+/// LiveIntervals, which we shouldn't need for these instructions anymore.
+void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
+ if (!MI.isCopy())
+ return;
+
+ if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) {
+ // Only do this when the complete bundle is made out of COPYs.
+ MachineBasicBlock &MBB = *MI.getParent();
+ for (MachineBasicBlock::reverse_instr_iterator I =
+ std::next(MI.getReverseIterator()), E = MBB.instr_rend();
+ I != E && I->isBundledWithSucc(); ++I) {
+ if (!I->isCopy())
+ return;
+ }
+
+ for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator();
+ I->isBundledWithPred(); ) {
+ MachineInstr &MI = *I;
+ ++I;
+
+ MI.unbundleFromPred();
+ if (Indexes)
+ Indexes->insertMachineInstrInMaps(MI);
+ }
+ }
+}
+
+/// Check whether (part of) \p SuperPhysReg is live through \p MI.
+/// \pre \p MI defines a subregister of a virtual register that
+/// has been assigned to \p SuperPhysReg.
+bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
+ unsigned SuperPhysReg) const {
+ SlotIndex MIIndex = LIS->getInstructionIndex(MI);
+ SlotIndex BeforeMIUses = MIIndex.getBaseIndex();
+ SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex();
+ for (MCRegUnitIterator Unit(SuperPhysReg, TRI); Unit.isValid(); ++Unit) {
+ const LiveRange &UnitRange = LIS->getRegUnit(*Unit);
+ // If the regunit is live both before and after MI,
+ // we assume it is live through.
+ // Generally speaking, this is not true, because something like
+ // "RU = op RU" would match that description.
+ // However, we know that we are trying to assess whether
+ // a def of a virtual reg, vreg, is live at the same time as RU.
+ // If we are in the "RU = op RU" situation, that means that vreg
+ // is defined at the same time as RU (i.e., "vreg, RU = op RU").
+ // Thus, vreg and RU interfere and vreg cannot be assigned to
+ // SuperPhysReg. Therefore, this situation cannot happen.
+ if (UnitRange.liveAt(AfterMIDefs) && UnitRange.liveAt(BeforeMIUses))
+ return true;
+ }
+ return false;
+}
+
void VirtRegRewriter::rewrite() {
bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
SmallVector<unsigned, 8> SuperDeads;
@@ -409,7 +479,8 @@ void VirtRegRewriter::rewrite() {
// A virtual register kill refers to the whole register, so we may
// have to add <imp-use,kill> operands for the super-register. A
// partial redef always kills and redefines the super-register.
- if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+ if ((MO.readsReg() && (MO.isDef() || MO.isKill())) ||
+ (MO.isDef() && subRegLiveThrough(*MI, PhysReg)))
SuperKills.push_back(PhysReg);
if (MO.isDef()) {
@@ -431,12 +502,14 @@ void VirtRegRewriter::rewrite() {
}
}
- // The <def,undef> flag only makes sense for sub-register defs, and
- // we are substituting a full physreg. An <imp-use,kill> operand
- // from the SuperKills list will represent the partial read of the
- // super-register.
- if (MO.isDef())
+ // The <def,undef> and <def,internal> flags only make sense for
+ // sub-register defs, and we are substituting a full physreg. An
+ // <imp-use,kill> operand from the SuperKills list will represent the
+ // partial read of the super-register.
+ if (MO.isDef()) {
MO.setIsUndef(false);
+ MO.setIsInternalRead(false);
+ }
// PhysReg operands cannot have subregister indexes.
PhysReg = TRI->getSubReg(PhysReg, SubReg);
@@ -461,6 +534,8 @@ void VirtRegRewriter::rewrite() {
DEBUG(dbgs() << "> " << *MI);
+ expandCopyBundle(*MI);
+
// We can remove identity copies right now.
handleIdentityCopy(*MI);
}
diff --git a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
index 568720c..c63a0a9 100644
--- a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
+++ b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
@@ -16,13 +16,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCSymbol.h"
@@ -54,7 +54,7 @@ namespace {
class WinEHPrepare : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid.
- WinEHPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID) {}
+ WinEHPrepare() : FunctionPass(ID) {}
bool runOnFunction(Function &Fn) override;
@@ -86,6 +86,7 @@ private:
// All fields are reset by runOnFunction.
EHPersonality Personality = EHPersonality::Unknown;
+ const DataLayout *DL = nullptr;
DenseMap<BasicBlock *, ColorVector> BlockColors;
MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks;
};
@@ -93,12 +94,10 @@ private:
} // end anonymous namespace
char WinEHPrepare::ID = 0;
-INITIALIZE_TM_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions",
- false, false)
+INITIALIZE_PASS(WinEHPrepare, DEBUG_TYPE, "Prepare Windows exceptions",
+ false, false)
-FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) {
- return new WinEHPrepare(TM);
-}
+FunctionPass *llvm::createWinEHPass() { return new WinEHPrepare(); }
bool WinEHPrepare::runOnFunction(Function &Fn) {
if (!Fn.hasPersonalityFn())
@@ -111,6 +110,7 @@ bool WinEHPrepare::runOnFunction(Function &Fn) {
if (!isFuncletEHPersonality(Personality))
return false;
+ DL = &Fn.getParent()->getDataLayout();
return prepareExplicitEH(Fn);
}
@@ -1070,7 +1070,7 @@ AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) {
if (!isa<TerminatorInst>(EHPad)) {
// If the EHPad isn't a terminator, then we can insert a load in this block
// that will dominate all uses.
- SpillSlot = new AllocaInst(PN->getType(), nullptr,
+ SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr,
Twine(PN->getName(), ".wineh.spillslot"),
&F.getEntryBlock().front());
Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"),
@@ -1157,7 +1157,7 @@ void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
Function &F) {
// Lazily create the spill slot.
if (!SpillSlot)
- SpillSlot = new AllocaInst(V->getType(), nullptr,
+ SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr,
Twine(V->getName(), ".wineh.spillslot"),
&F.getEntryBlock().front());
diff --git a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
index 63bd762..0b4c6e5 100644
--- a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
+++ b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -1,4 +1,4 @@
-//===-- XRayInstrumentation.cpp - Adds XRay instrumentation to functions. -===//
+//===- XRayInstrumentation.cpp - Adds XRay instrumentation to functions. --===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,18 +14,26 @@
//
//===---------------------------------------------------------------------===//
-#include "llvm/CodeGen/Analysis.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
namespace {
+
struct XRayInstrumentation : public MachineFunctionPass {
static char ID;
@@ -33,6 +41,14 @@ struct XRayInstrumentation : public MachineFunctionPass {
initializeXRayInstrumentationPass(*PassRegistry::getPassRegistry());
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
bool runOnMachineFunction(MachineFunction &MF) override;
private:
@@ -43,7 +59,7 @@ private:
// This is the approach to take on CPUs that have a single RET instruction,
// like x86/x86_64.
void replaceRetWithPatchableRet(MachineFunction &MF,
- const TargetInstrInfo *TII);
+ const TargetInstrInfo *TII);
// Prepend the original return instruction with the exit sled code ("patchable
// function exit" pseudo-instruction), preserving the original return
@@ -54,13 +70,13 @@ private:
// have to call the trampoline and return from it to the original return
// instruction of the function being instrumented.
void prependRetWithPatchableExit(MachineFunction &MF,
- const TargetInstrInfo *TII);
+ const TargetInstrInfo *TII);
};
-} // anonymous namespace
-void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF,
- const TargetInstrInfo *TII)
-{
+} // end anonymous namespace
+
+void XRayInstrumentation::replaceRetWithPatchableRet(
+ MachineFunction &MF, const TargetInstrInfo *TII) {
// We look for *all* terminators and returns, then replace those with
// PATCHABLE_RET instructions.
SmallVector<MachineInstr *, 4> Terminators;
@@ -81,7 +97,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF,
auto MIB = BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc))
.addImm(T.getOpcode());
for (auto &MO : T.operands())
- MIB.addOperand(MO);
+ MIB.add(MO);
Terminators.push_back(&T);
}
}
@@ -91,9 +107,8 @@ void XRayInstrumentation::replaceRetWithPatchableRet(MachineFunction &MF,
I->eraseFromParent();
}
-void XRayInstrumentation::prependRetWithPatchableExit(MachineFunction &MF,
- const TargetInstrInfo *TII)
-{
+void XRayInstrumentation::prependRetWithPatchableExit(
+ MachineFunction &MF, const TargetInstrInfo *TII) {
for (auto &MBB : MF) {
for (auto &T : MBB.terminators()) {
unsigned Opc = 0;
@@ -106,7 +121,7 @@ void XRayInstrumentation::prependRetWithPatchableExit(MachineFunction &MF,
if (Opc != 0) {
// Prepend the return instruction with PATCHABLE_FUNCTION_EXIT or
// PATCHABLE_TAIL_CALL.
- BuildMI(MBB, T, T.getDebugLoc(),TII->get(Opc));
+ BuildMI(MBB, T, T.getDebugLoc(), TII->get(Opc));
}
}
}
@@ -125,14 +140,24 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
return false; // XRay threshold attribute not found.
if (Attr.getValueAsString().getAsInteger(10, XRayThreshold))
return false; // Invalid value for threshold.
- if (F.size() < XRayThreshold)
- return false; // Function is too small.
+
+ // Count the number of MachineInstrs in the MachineFunction.
+ int64_t MICount = 0;
+ for (const auto& MBB : MF)
+ MICount += MBB.size();
+
+ // Check if we have a loop.
+ // FIXME: Maybe make this smarter, and see whether the loops are dependent
+ // on inputs or side-effects?
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ if (MLI.empty() && MICount < XRayThreshold)
+ return false; // Function is too small and has no loops.
}
// We look for the first non-empty MachineBasicBlock, so that we can insert
// the function instrumentation in the appropriate place.
- auto MBI =
- find_if(MF, [&](const MachineBasicBlock &MBB) { return !MBB.empty(); });
+ auto MBI = llvm::find_if(
+ MF, [&](const MachineBasicBlock &MBB) { return !MBB.empty(); });
if (MBI == MF.end())
return false; // The function is empty.
@@ -142,12 +167,10 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
if (!MF.getSubtarget().isXRaySupported()) {
FirstMI.emitError("An attempt to perform XRay instrumentation for an"
- " unsupported target.");
+ " unsupported target.");
return false;
}
- // FIXME: Do the loop triviality analysis here or in an earlier pass.
-
// First, insert a PATCHABLE_FUNCTION_ENTER as the first instruction of the
// MachineFunction.
BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
@@ -157,6 +180,11 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
case Triple::ArchType::arm:
case Triple::ArchType::thumb:
case Triple::ArchType::aarch64:
+ case Triple::ArchType::ppc64le:
+ case Triple::ArchType::mips:
+ case Triple::ArchType::mipsel:
+ case Triple::ArchType::mips64:
+ case Triple::ArchType::mips64el:
// For the architectures which don't have a single return instruction
prependRetWithPatchableExit(MF, TII);
break;
@@ -171,5 +199,8 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
char XRayInstrumentation::ID = 0;
char &llvm::XRayInstrumentationID = XRayInstrumentation::ID;
-INITIALIZE_PASS(XRayInstrumentation, "xray-instrumentation", "Insert XRay ops",
- false, false)
+INITIALIZE_PASS_BEGIN(XRayInstrumentation, "xray-instrumentation",
+ "Insert XRay ops", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(XRayInstrumentation, "xray-instrumentation",
+ "Insert XRay ops", false, false)
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/contrib/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
index 75cfd0d..e0c7ef5 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
@@ -11,20 +11,11 @@
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
#include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
using namespace llvm;
using namespace llvm::codeview;
-template <typename T>
-static Error takeObject(ArrayRef<uint8_t> &Data, const T *&Res) {
- if (Data.size() < sizeof(*Res))
- return llvm::make_error<CodeViewError>(cv_error_code::insufficient_buffer);
- Res = reinterpret_cast<const T *>(Data.data());
- Data = Data.drop_front(sizeof(*Res));
- return Error::success();
-}
-
CVSymbolVisitor::CVSymbolVisitor(SymbolVisitorCallbacks &Callbacks)
: Callbacks(Callbacks) {}
@@ -38,10 +29,8 @@ static Error visitKnownRecord(CVSymbol &Record,
return Error::success();
}
-Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) {
- if (auto EC = Callbacks.visitSymbolBegin(Record))
- return EC;
-
+static Error finishVisitation(CVSymbol &Record,
+ SymbolVisitorCallbacks &Callbacks) {
switch (Record.Type) {
default:
if (auto EC = Callbacks.visitUnknownSymbol(Record))
@@ -55,7 +44,7 @@ Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) {
}
#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
SYMBOL_RECORD(EnumVal, EnumVal, AliasName)
-#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
}
if (auto EC = Callbacks.visitSymbolEnd(Record))
@@ -64,6 +53,18 @@ Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) {
return Error::success();
}
+Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record) {
+ if (auto EC = Callbacks.visitSymbolBegin(Record))
+ return EC;
+ return finishVisitation(Record, Callbacks);
+}
+
+Error CVSymbolVisitor::visitSymbolRecord(CVSymbol &Record, uint32_t Offset) {
+ if (auto EC = Callbacks.visitSymbolBegin(Record, Offset))
+ return EC;
+ return finishVisitation(Record, Callbacks);
+}
+
Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) {
for (auto I : Symbols) {
if (auto EC = visitSymbolRecord(I))
@@ -71,3 +72,13 @@ Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols) {
}
return Error::success();
}
+
+Error CVSymbolVisitor::visitSymbolStream(const CVSymbolArray &Symbols,
+ uint32_t InitialOffset) {
+ for (auto I : Symbols) {
+ if (auto EC = visitSymbolRecord(I, InitialOffset))
+ return EC;
+ InitialOffset += I.length();
+ }
+ return Error::success();
+}
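
The new offset-aware overloads let a callback learn where each record starts, with visitSymbolStream advancing the running offset by each record's length. A short sketch of a hypothetical callback built on the visitSymbolBegin(Record, Offset) hook these overloads invoke:

// OffsetLogger is illustrative; the override signature matches the call
// made by visitSymbolRecord(Record, Offset) above.
class OffsetLogger : public llvm::codeview::SymbolVisitorCallbacks {
public:
  llvm::Error visitSymbolBegin(llvm::codeview::CVSymbol &Record,
                               uint32_t Offset) override {
    Offsets.push_back(Offset); // Stream offset of this record's start.
    return llvm::Error::success();
  }
  std::vector<uint32_t> Offsets;
};
// Usage sketch:
//   OffsetLogger Logger;
//   llvm::codeview::CVSymbolVisitor Visitor(Logger);
//   consumeError(Visitor.visitSymbolStream(Symbols, /*InitialOffset=*/0));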
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/contrib/llvm/lib/DebugInfo/CodeView/CVTypeDumper.cpp
deleted file mode 100644
index fcd239c..0000000
--- a/contrib/llvm/lib/DebugInfo/CodeView/CVTypeDumper.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//===-- CVTypeDumper.cpp - CodeView type info dumper ------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) {
- TypeDatabaseVisitor DBV(TypeDB);
- TypeDeserializer Deserializer;
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(DBV);
- Pipeline.addCallbackToPipeline(Dumper);
-
- CVTypeVisitor Visitor(Pipeline);
-
- CVType RecordCopy = Record;
- if (auto EC = Visitor.visitTypeRecord(RecordCopy))
- return EC;
- return Error::success();
-}
-
-Error CVTypeDumper::dump(const CVTypeArray &Types,
- TypeVisitorCallbacks &Dumper) {
- TypeDatabaseVisitor DBV(TypeDB);
- TypeDeserializer Deserializer;
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(DBV);
- Pipeline.addCallbackToPipeline(Dumper);
-
- CVTypeVisitor Visitor(Pipeline);
-
- if (auto EC = Visitor.visitTypeStream(Types))
- return EC;
- return Error::success();
-}
-
-Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) {
- msf::ByteStream Stream(Data);
- CVTypeArray Types;
- msf::StreamReader Reader(Stream);
- if (auto EC = Reader.readArray(Types, Reader.getLength()))
- return EC;
-
- return dump(Types, Dumper);
-}
-
-void CVTypeDumper::printTypeIndex(ScopedPrinter &Printer, StringRef FieldName,
- TypeIndex TI, TypeDatabase &DB) {
- StringRef TypeName;
- if (!TI.isNoneType())
- TypeName = DB.getTypeName(TI);
- if (!TypeName.empty())
- Printer.printHex(FieldName, TypeName, TI.getIndex());
- else
- Printer.printHex(FieldName, TI.getIndex());
-}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/contrib/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 5171e24..79b9fde 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -9,16 +9,18 @@
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
using namespace llvm;
using namespace llvm::codeview;
-CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
- : Callbacks(Callbacks) {}
template <typename T>
static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) {
@@ -39,10 +41,63 @@ static Error visitKnownMember(CVMemberRecord &Record,
return Error::success();
}
-Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
- if (auto EC = Callbacks.visitTypeBegin(Record))
+static Error visitMemberRecord(CVMemberRecord &Record,
+ TypeVisitorCallbacks &Callbacks) {
+ if (auto EC = Callbacks.visitMemberBegin(Record))
+ return EC;
+
+ switch (Record.Kind) {
+ default:
+ if (auto EC = Callbacks.visitUnknownMember(Record))
+ return EC;
+ break;
+#define MEMBER_RECORD(EnumName, EnumVal, Name) \
+ case EnumName: { \
+ if (auto EC = visitKnownMember<Name##Record>(Record, Callbacks)) \
+ return EC; \
+ break; \
+ }
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
+ MEMBER_RECORD(EnumVal, EnumVal, AliasName)
+#define TYPE_RECORD(EnumName, EnumVal, Name)
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+ }
+
+ if (auto EC = Callbacks.visitMemberEnd(Record))
return EC;
+ return Error::success();
+}
+
+namespace {
+
+class CVTypeVisitor {
+public:
+ explicit CVTypeVisitor(TypeVisitorCallbacks &Callbacks);
+
+ Error visitTypeRecord(CVType &Record, TypeIndex Index);
+ Error visitTypeRecord(CVType &Record);
+
+ /// Visits the type records in Types. Sets the error flag on parse failures.
+ Error visitTypeStream(const CVTypeArray &Types);
+ Error visitTypeStream(CVTypeRange Types);
+ Error visitTypeStream(TypeCollection &Types);
+
+ Error visitMemberRecord(CVMemberRecord Record);
+ Error visitFieldListMemberStream(BinaryStreamReader &Stream);
+
+private:
+ Error finishVisitation(CVType &Record);
+
+ /// The interface to the class that gets notified of each visitation.
+ TypeVisitorCallbacks &Callbacks;
+};
+
+CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
+ : Callbacks(Callbacks) {}
+
+Error CVTypeVisitor::finishVisitation(CVType &Record) {
switch (Record.Type) {
default:
if (auto EC = Callbacks.visitUnknownType(Record))
@@ -58,7 +113,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
TYPE_RECORD(EnumVal, EnumVal, AliasName)
#define MEMBER_RECORD(EnumName, EnumVal, Name)
#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
}
if (auto EC = Callbacks.visitTypeEnd(Record))
@@ -67,36 +122,21 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
return Error::success();
}
-static Error visitMemberRecord(CVMemberRecord &Record,
- TypeVisitorCallbacks &Callbacks) {
- if (auto EC = Callbacks.visitMemberBegin(Record))
+Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) {
+ if (auto EC = Callbacks.visitTypeBegin(Record, Index))
return EC;
- switch (Record.Kind) {
- default:
- if (auto EC = Callbacks.visitUnknownMember(Record))
- return EC;
- break;
-#define MEMBER_RECORD(EnumName, EnumVal, Name) \
- case EnumName: { \
- if (auto EC = visitKnownMember<Name##Record>(Record, Callbacks)) \
- return EC; \
- break; \
- }
-#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) \
- MEMBER_RECORD(EnumVal, EnumVal, AliasName)
-#define TYPE_RECORD(EnumName, EnumVal, Name)
-#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
- }
+ return finishVisitation(Record);
+}
- if (auto EC = Callbacks.visitMemberEnd(Record))
+Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+ if (auto EC = Callbacks.visitTypeBegin(Record))
return EC;
- return Error::success();
+ return finishVisitation(Record);
}
-Error CVTypeVisitor::visitMemberRecord(CVMemberRecord &Record) {
+Error CVTypeVisitor::visitMemberRecord(CVMemberRecord Record) {
return ::visitMemberRecord(Record, Callbacks);
}
@@ -109,12 +149,26 @@ Error CVTypeVisitor::visitTypeStream(const CVTypeArray &Types) {
return Error::success();
}
-Error CVTypeVisitor::visitFieldListMemberStream(msf::StreamReader Reader) {
- FieldListDeserializer Deserializer(Reader);
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(Callbacks);
+Error CVTypeVisitor::visitTypeStream(CVTypeRange Types) {
+ for (auto I : Types) {
+ if (auto EC = visitTypeRecord(I))
+ return EC;
+ }
+ return Error::success();
+}
+Error CVTypeVisitor::visitTypeStream(TypeCollection &Types) {
+ Optional<TypeIndex> I = Types.getFirst();
+ while (I) {
+ CVType Type = Types.getType(*I);
+ if (auto EC = visitTypeRecord(Type, *I))
+ return EC;
+ I = Types.getNext(*I);
+ }
+ return Error::success();
+}
+
+Error CVTypeVisitor::visitFieldListMemberStream(BinaryStreamReader &Reader) {
TypeLeafKind Leaf;
while (!Reader.empty()) {
if (auto EC = Reader.readEnum(Leaf))
@@ -122,15 +176,101 @@ Error CVTypeVisitor::visitFieldListMemberStream(msf::StreamReader Reader) {
CVMemberRecord Record;
Record.Kind = Leaf;
- if (auto EC = ::visitMemberRecord(Record, Pipeline))
+ if (auto EC = ::visitMemberRecord(Record, Callbacks))
return EC;
}
return Error::success();
}
-Error CVTypeVisitor::visitFieldListMemberStream(ArrayRef<uint8_t> Data) {
- msf::ByteStream S(Data);
- msf::StreamReader SR(S);
- return visitFieldListMemberStream(SR);
+struct FieldListVisitHelper {
+ FieldListVisitHelper(TypeVisitorCallbacks &Callbacks, ArrayRef<uint8_t> Data,
+ VisitorDataSource Source)
+ : Stream(Data, llvm::support::little), Reader(Stream),
+ Deserializer(Reader),
+ Visitor((Source == VDS_BytesPresent) ? Pipeline : Callbacks) {
+ if (Source == VDS_BytesPresent) {
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(Callbacks);
+ }
+ }
+
+ BinaryByteStream Stream;
+ BinaryStreamReader Reader;
+ FieldListDeserializer Deserializer;
+ TypeVisitorCallbackPipeline Pipeline;
+ CVTypeVisitor Visitor;
+};
+
+struct VisitHelper {
+ VisitHelper(TypeVisitorCallbacks &Callbacks, VisitorDataSource Source)
+ : Visitor((Source == VDS_BytesPresent) ? Pipeline : Callbacks) {
+ if (Source == VDS_BytesPresent) {
+ Pipeline.addCallbackToPipeline(Deserializer);
+ Pipeline.addCallbackToPipeline(Callbacks);
+ }
+ }
+
+ TypeDeserializer Deserializer;
+ TypeVisitorCallbackPipeline Pipeline;
+ CVTypeVisitor Visitor;
+};
+}
+
+Error llvm::codeview::visitTypeRecord(CVType &Record, TypeIndex Index,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source) {
+ VisitHelper V(Callbacks, Source);
+ return V.Visitor.visitTypeRecord(Record, Index);
+}
+
+Error llvm::codeview::visitTypeRecord(CVType &Record,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source) {
+ VisitHelper V(Callbacks, Source);
+ return V.Visitor.visitTypeRecord(Record);
+}
+
+Error llvm::codeview::visitTypeStream(const CVTypeArray &Types,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source) {
+ VisitHelper V(Callbacks, Source);
+ return V.Visitor.visitTypeStream(Types);
+}
+
+Error llvm::codeview::visitTypeStream(CVTypeRange Types,
+ TypeVisitorCallbacks &Callbacks) {
+ VisitHelper V(Callbacks, VDS_BytesPresent);
+ return V.Visitor.visitTypeStream(Types);
+}
+
+Error llvm::codeview::visitTypeStream(TypeCollection &Types,
+ TypeVisitorCallbacks &Callbacks) {
+ // When the internal visitor calls Types.getType(Index) the interface is
+ // required to return a CVType with the bytes filled out. So we can assume
+ // that the bytes will be present when individual records are visited.
+ VisitHelper V(Callbacks, VDS_BytesPresent);
+ return V.Visitor.visitTypeStream(Types);
+}
+
+Error llvm::codeview::visitMemberRecord(CVMemberRecord Record,
+ TypeVisitorCallbacks &Callbacks,
+ VisitorDataSource Source) {
+ FieldListVisitHelper V(Callbacks, Record.Data, Source);
+ return V.Visitor.visitMemberRecord(Record);
+}
+
+Error llvm::codeview::visitMemberRecord(TypeLeafKind Kind,
+ ArrayRef<uint8_t> Record,
+ TypeVisitorCallbacks &Callbacks) {
+ CVMemberRecord R;
+ R.Data = Record;
+ R.Kind = Kind;
+ return visitMemberRecord(R, Callbacks, VDS_BytesPresent);
+}
+
+Error llvm::codeview::visitMemberRecordStream(ArrayRef<uint8_t> FieldList,
+ TypeVisitorCallbacks &Callbacks) {
+ FieldListVisitHelper V(Callbacks, FieldList, VDS_BytesPresent);
+ return V.Visitor.visitFieldListMemberStream(V.Reader);
}
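
With CVTypeVisitor now an implementation detail, callers go through the free functions above and choose via VisitorDataSource whether a deserializer is interposed. A minimal sketch, assuming only the llvm::codeview entry points defined in this file (dumpAll is illustrative):

llvm::Error dumpAll(const llvm::codeview::CVTypeArray &Types,
                    llvm::codeview::TypeVisitorCallbacks &Callbacks) {
  // VDS_BytesPresent makes VisitHelper prepend a TypeDeserializer, so
  // Callbacks receives parsed records rather than raw bytes.
  return llvm::codeview::visitTypeStream(Types, Callbacks,
                                         llvm::codeview::VDS_BytesPresent);
}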
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp b/contrib/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp
index 55c10c0..8de266b 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/CodeViewError.cpp
@@ -31,6 +31,8 @@ public:
"bytes.";
case cv_error_code::corrupt_record:
return "The CodeView record is corrupted.";
+ case cv_error_code::no_records:
+ return "There are no records";
case cv_error_code::operation_unsupported:
return "The requested operation is not supported.";
case cv_error_code::unknown_member_record:
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/contrib/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
index 9bd85cf..4fc1448 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
@@ -10,8 +10,8 @@
#include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -27,6 +27,14 @@ Error CodeViewRecordIO::beginRecord(Optional<uint32_t> MaxLength) {
Error CodeViewRecordIO::endRecord() {
assert(!Limits.empty() && "Not in a record!");
Limits.pop_back();
+ // We would like to assert that we actually read / wrote all the bytes that we
+ // expected to for this record, but unfortunately we can't do this. Some
+ // producers such as MASM over-allocate for certain types of records and
+ // commit the extraneous data, so when reading we can't be sure every byte
+ // will have been read. And when writing we over-allocate temporarily since
+ // we don't know how big the record is until we're finished writing it, so
+ // even though we don't commit the extraneous data, we still can't guarantee
+ // we're at the end of the allocated data.
return Error::success();
}
@@ -49,6 +57,12 @@ uint32_t CodeViewRecordIO::maxFieldLength() const {
return *Min;
}
+Error CodeViewRecordIO::padToAlignment(uint32_t Align) {
+ if (isReading())
+ return Reader->padToAlignment(Align);
+ return Writer->padToAlignment(Align);
+}
+
Error CodeViewRecordIO::skipPadding() {
assert(!isWriting() && "Cannot skip padding while writing!");
@@ -145,27 +159,28 @@ Error CodeViewRecordIO::mapStringZ(StringRef &Value) {
if (isWriting()) {
// Truncate if we attempt to write too much.
StringRef S = Value.take_front(maxFieldLength() - 1);
- if (auto EC = Writer->writeZeroString(S))
+ if (auto EC = Writer->writeCString(S))
return EC;
} else {
- if (auto EC = Reader->readZeroString(Value))
+ if (auto EC = Reader->readCString(Value))
return EC;
}
return Error::success();
}
-Error CodeViewRecordIO::mapGuid(StringRef &Guid) {
+Error CodeViewRecordIO::mapGuid(GUID &Guid) {
constexpr uint32_t GuidSize = 16;
if (maxFieldLength() < GuidSize)
return make_error<CodeViewError>(cv_error_code::insufficient_buffer);
if (isWriting()) {
- assert(Guid.size() == 16 && "Invalid Guid Size!");
- if (auto EC = Writer->writeFixedString(Guid))
+ if (auto EC = Writer->writeBytes(Guid.Guid))
return EC;
} else {
- if (auto EC = Reader->readFixedString(Guid, 16))
+ ArrayRef<uint8_t> GuidBytes;
+ if (auto EC = Reader->readBytes(GuidBytes, GuidSize))
return EC;
+ memcpy(Guid.Guid, GuidBytes.data(), GuidSize);
}
return Error::success();
}
@@ -176,7 +191,7 @@ Error CodeViewRecordIO::mapStringZVectorZ(std::vector<StringRef> &Value) {
if (auto EC = mapStringZ(V))
return EC;
}
- if (auto EC = Writer->writeInteger(uint8_t(0)))
+ if (auto EC = Writer->writeInteger<uint8_t>(0))
return EC;
} else {
StringRef S;
@@ -194,22 +209,22 @@ Error CodeViewRecordIO::mapStringZVectorZ(std::vector<StringRef> &Value) {
Error CodeViewRecordIO::writeEncodedSignedInteger(const int64_t &Value) {
assert(Value < 0 && "Encoded integer is not signed!");
if (Value >= std::numeric_limits<int8_t>::min()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_CHAR)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_CHAR))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<int8_t>(Value)))
+ if (auto EC = Writer->writeInteger<int8_t>(Value))
return EC;
} else if (Value >= std::numeric_limits<int16_t>::min()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_SHORT)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_SHORT))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<int16_t>(Value)))
+ if (auto EC = Writer->writeInteger<int16_t>(Value))
return EC;
} else if (Value >= std::numeric_limits<int32_t>::min()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_LONG)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_LONG))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<int32_t>(Value)))
+ if (auto EC = Writer->writeInteger<int32_t>(Value))
return EC;
} else {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_QUADWORD)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_QUADWORD))
return EC;
if (auto EC = Writer->writeInteger(Value))
return EC;
@@ -219,20 +234,20 @@ Error CodeViewRecordIO::writeEncodedSignedInteger(const int64_t &Value) {
Error CodeViewRecordIO::writeEncodedUnsignedInteger(const uint64_t &Value) {
if (Value < LF_NUMERIC) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(Value)))
+ if (auto EC = Writer->writeInteger<uint16_t>(Value))
return EC;
} else if (Value <= std::numeric_limits<uint16_t>::max()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_USHORT)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_USHORT))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(Value)))
+ if (auto EC = Writer->writeInteger<uint16_t>(Value))
return EC;
} else if (Value <= std::numeric_limits<uint32_t>::max()) {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_ULONG)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_ULONG))
return EC;
- if (auto EC = Writer->writeInteger(static_cast<uint32_t>(Value)))
+ if (auto EC = Writer->writeInteger<uint32_t>(Value))
return EC;
} else {
- if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_UQUADWORD)))
+ if (auto EC = Writer->writeInteger<uint16_t>(LF_UQUADWORD))
return EC;
if (auto EC = Writer->writeInteger(Value))
return EC;
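
The writeEncoded* helpers above pick the smallest CodeView numeric leaf that can hold the value: two bytes for the leaf kind (or the inline value itself when below LF_NUMERIC), then the payload. A sketch of the resulting encoded sizes, for illustration only, not library code:

// Size arithmetic implied by writeEncodedUnsignedInteger above
// (LF_NUMERIC == 0x8000).
static unsigned encodedUnsignedSize(uint64_t Value) {
  if (Value < 0x8000)      // Stored inline as a uint16_t.
    return 2;
  if (Value <= 0xFFFF)     // LF_USHORT + uint16_t payload.
    return 4;
  if (Value <= 0xFFFFFFFF) // LF_ULONG + uint32_t payload.
    return 6;
  return 10;               // LF_UQUADWORD + uint64_t payload.
}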
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
new file mode 100644
index 0000000..ccc20eb
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
@@ -0,0 +1,116 @@
+//===- DebugChecksumsSubsection.cpp ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+struct FileChecksumEntryHeader {
+ using ulittle32_t = support::ulittle32_t;
+
+ ulittle32_t FileNameOffset; // Byte offset of filename in global string table.
+ uint8_t ChecksumSize; // Number of bytes of checksum.
+ uint8_t ChecksumKind; // FileChecksumKind
+ // Checksum bytes follow.
+};
+
+Error VarStreamArrayExtractor<FileChecksumEntry>::
+operator()(BinaryStreamRef Stream, uint32_t &Len, FileChecksumEntry &Item) {
+ BinaryStreamReader Reader(Stream);
+
+ const FileChecksumEntryHeader *Header;
+ if (auto EC = Reader.readObject(Header))
+ return EC;
+
+ Item.FileNameOffset = Header->FileNameOffset;
+ Item.Kind = static_cast<FileChecksumKind>(Header->ChecksumKind);
+ if (auto EC = Reader.readBytes(Item.Checksum, Header->ChecksumSize))
+ return EC;
+
+ Len = alignTo(Header->ChecksumSize + sizeof(FileChecksumEntryHeader), 4);
+ return Error::success();
+}
+
+Error DebugChecksumsSubsectionRef::initialize(BinaryStreamReader Reader) {
+ if (auto EC = Reader.readArray(Checksums, Reader.bytesRemaining()))
+ return EC;
+
+ return Error::success();
+}
+
+Error DebugChecksumsSubsectionRef::initialize(BinaryStreamRef Section) {
+ BinaryStreamReader Reader(Section);
+ return initialize(Reader);
+}
+
+DebugChecksumsSubsection::DebugChecksumsSubsection(
+ DebugStringTableSubsection &Strings)
+ : DebugSubsection(DebugSubsectionKind::FileChecksums), Strings(Strings) {}
+
+void DebugChecksumsSubsection::addChecksum(StringRef FileName,
+ FileChecksumKind Kind,
+ ArrayRef<uint8_t> Bytes) {
+ FileChecksumEntry Entry;
+ if (!Bytes.empty()) {
+ uint8_t *Copy = Storage.Allocate<uint8_t>(Bytes.size());
+ ::memcpy(Copy, Bytes.data(), Bytes.size());
+ Entry.Checksum = makeArrayRef(Copy, Bytes.size());
+ }
+
+ Entry.FileNameOffset = Strings.insert(FileName);
+ Entry.Kind = Kind;
+ Checksums.push_back(Entry);
+
+ // This maps the offset of this string in the string table to the offset
+ // of this checksum entry in the checksum buffer.
+ OffsetMap[Entry.FileNameOffset] = SerializedSize;
+ assert(SerializedSize % 4 == 0);
+
+ uint32_t Len = alignTo(sizeof(FileChecksumEntryHeader) + Bytes.size(), 4);
+ SerializedSize += Len;
+}
+
+uint32_t DebugChecksumsSubsection::calculateSerializedSize() const {
+ return SerializedSize;
+}
+
+Error DebugChecksumsSubsection::commit(BinaryStreamWriter &Writer) const {
+ for (const auto &FC : Checksums) {
+ FileChecksumEntryHeader Header;
+ Header.ChecksumKind = uint8_t(FC.Kind);
+ Header.ChecksumSize = FC.Checksum.size();
+ Header.FileNameOffset = FC.FileNameOffset;
+ if (auto EC = Writer.writeObject(Header))
+ return EC;
+ if (auto EC = Writer.writeArray(makeArrayRef(FC.Checksum)))
+ return EC;
+ if (auto EC = Writer.padToAlignment(4))
+ return EC;
+ }
+ return Error::success();
+}
+
+uint32_t DebugChecksumsSubsection::mapChecksumOffset(StringRef FileName) const {
+ uint32_t Offset = Strings.getStringId(FileName);
+ auto Iter = OffsetMap.find(Offset);
+ assert(Iter != OffsetMap.end());
+ return Iter->second;
+}
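
A short usage sketch for the new subsection, assuming only the API added in this file; the checksum bytes below are placeholder data, not a real digest:

// Illustrative only: register a file's checksum, then resolve the offset
// that line tables will later use to refer to it.
void buildChecksums(llvm::codeview::DebugStringTableSubsection &Strings) {
  llvm::codeview::DebugChecksumsSubsection Checksums(Strings);
  uint8_t FakeMD5[16] = {}; // Placeholder digest.
  Checksums.addChecksum("a.cpp", llvm::codeview::FileChecksumKind::MD5,
                        FakeMD5);
  uint32_t Offset = Checksums.mapChecksumOffset("a.cpp");
  (void)Offset; // DebugLinesSubsection::createBlock performs this lookup.
}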
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp
new file mode 100644
index 0000000..cef2778
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossExSubsection.cpp
@@ -0,0 +1,53 @@
+//===- DebugCrossExSubsection.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error DebugCrossModuleExportsSubsectionRef::initialize(
+ BinaryStreamReader Reader) {
+ if (Reader.bytesRemaining() % sizeof(CrossModuleExport) != 0)
+ return make_error<CodeViewError>(
+ cv_error_code::corrupt_record,
+ "Cross Scope Exports section is an invalid size!");
+
+ uint32_t Size = Reader.bytesRemaining() / sizeof(CrossModuleExport);
+ return Reader.readArray(References, Size);
+}
+
+Error DebugCrossModuleExportsSubsectionRef::initialize(BinaryStreamRef Stream) {
+ BinaryStreamReader Reader(Stream);
+ return initialize(Reader);
+}
+
+void DebugCrossModuleExportsSubsection::addMapping(uint32_t Local,
+ uint32_t Global) {
+ Mappings[Local] = Global;
+}
+
+uint32_t DebugCrossModuleExportsSubsection::calculateSerializedSize() const {
+ return Mappings.size() * sizeof(CrossModuleExport);
+}
+
+Error DebugCrossModuleExportsSubsection::commit(
+ BinaryStreamWriter &Writer) const {
+ for (const auto &M : Mappings) {
+ if (auto EC = Writer.writeInteger(M.first))
+ return EC;
+ if (auto EC = Writer.writeInteger(M.second))
+ return EC;
+ }
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp
new file mode 100644
index 0000000..88c0076
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugCrossImpSubsection.cpp
@@ -0,0 +1,97 @@
+//===- DebugCrossImpSubsection.cpp ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error VarStreamArrayExtractor<CrossModuleImportItem>::
+operator()(BinaryStreamRef Stream, uint32_t &Len,
+ codeview::CrossModuleImportItem &Item) {
+ BinaryStreamReader Reader(Stream);
+ if (Reader.bytesRemaining() < sizeof(CrossModuleImport))
+ return make_error<CodeViewError>(
+ cv_error_code::insufficient_buffer,
+ "Not enough bytes for a Cross Module Import Header!");
+ if (auto EC = Reader.readObject(Item.Header))
+ return EC;
+ if (Reader.bytesRemaining() < Item.Header->Count * sizeof(uint32_t))
+ return make_error<CodeViewError>(
+ cv_error_code::insufficient_buffer,
+ "Not enough to read specified number of Cross Module References!");
+ if (auto EC = Reader.readArray(Item.Imports, Item.Header->Count))
+ return EC;
+ return Error::success();
+}
+
+Error DebugCrossModuleImportsSubsectionRef::initialize(
+ BinaryStreamReader Reader) {
+ return Reader.readArray(References, Reader.bytesRemaining());
+}
+
+Error DebugCrossModuleImportsSubsectionRef::initialize(BinaryStreamRef Stream) {
+ BinaryStreamReader Reader(Stream);
+ return initialize(Reader);
+}
+
+void DebugCrossModuleImportsSubsection::addImport(StringRef Module,
+ uint32_t ImportId) {
+ Strings.insert(Module);
+ std::vector<support::ulittle32_t> Targets = {support::ulittle32_t(ImportId)};
+ auto Result = Mappings.insert(std::make_pair(Module, Targets));
+ if (!Result.second)
+ Result.first->getValue().push_back(Targets[0]);
+}
+
+uint32_t DebugCrossModuleImportsSubsection::calculateSerializedSize() const {
+ uint32_t Size = 0;
+ for (const auto &Item : Mappings) {
+ Size += sizeof(CrossModuleImport);
+ Size += sizeof(support::ulittle32_t) * Item.second.size();
+ }
+ return Size;
+}
+
+Error DebugCrossModuleImportsSubsection::commit(
+ BinaryStreamWriter &Writer) const {
+ using T = decltype(&*Mappings.begin());
+ std::vector<T> Ids;
+ Ids.reserve(Mappings.size());
+
+ for (const auto &M : Mappings)
+ Ids.push_back(&M);
+
+ std::sort(Ids.begin(), Ids.end(), [this](const T &L1, const T &L2) {
+ return Strings.getStringId(L1->getKey()) <
+ Strings.getStringId(L2->getKey());
+ });
+
+ for (const auto &Item : Ids) {
+ CrossModuleImport Imp;
+ Imp.ModuleNameOffset = Strings.getStringId(Item->getKey());
+ Imp.Count = Item->getValue().size();
+ if (auto EC = Writer.writeObject(Imp))
+ return EC;
+ if (auto EC = Writer.writeArray(makeArrayRef(Item->getValue())))
+ return EC;
+ }
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp
new file mode 100644
index 0000000..fd558aa
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp
@@ -0,0 +1,44 @@
+//===- DebugFrameDataSubsection.cpp -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error DebugFrameDataSubsectionRef::initialize(BinaryStreamReader Reader) {
+ if (auto EC = Reader.readObject(RelocPtr))
+ return EC;
+ if (Reader.bytesRemaining() % sizeof(FrameData) != 0)
+ return make_error<CodeViewError>(cv_error_code::corrupt_record,
+ "Invalid frame data record format!");
+
+ uint32_t Count = Reader.bytesRemaining() / sizeof(FrameData);
+ if (auto EC = Reader.readArray(Frames, Count))
+ return EC;
+ return Error::success();
+}
+
+uint32_t DebugFrameDataSubsection::calculateSerializedSize() const {
+ return 4 + sizeof(FrameData) * Frames.size();
+}
+
+Error DebugFrameDataSubsection::commit(BinaryStreamWriter &Writer) const {
+ if (auto EC = Writer.writeInteger<uint32_t>(0))
+ return EC;
+
+ if (auto EC = Writer.writeArray(makeArrayRef(Frames)))
+ return EC;
+ return Error::success();
+}
+
+void DebugFrameDataSubsection::addFrameData(const FrameData &Frame) {
+ Frames.push_back(Frame);
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp
new file mode 100644
index 0000000..077c103
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp
@@ -0,0 +1,126 @@
+//===- DebugInlineeLinesSubsection.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error VarStreamArrayExtractor<InlineeSourceLine>::
+operator()(BinaryStreamRef Stream, uint32_t &Len, InlineeSourceLine &Item) {
+ BinaryStreamReader Reader(Stream);
+
+ if (auto EC = Reader.readObject(Item.Header))
+ return EC;
+
+ if (HasExtraFiles) {
+ uint32_t ExtraFileCount;
+ if (auto EC = Reader.readInteger(ExtraFileCount))
+ return EC;
+ if (auto EC = Reader.readArray(Item.ExtraFiles, ExtraFileCount))
+ return EC;
+ }
+
+ Len = Reader.getOffset();
+ return Error::success();
+}
+
+DebugInlineeLinesSubsectionRef::DebugInlineeLinesSubsectionRef()
+ : DebugSubsectionRef(DebugSubsectionKind::InlineeLines) {}
+
+Error DebugInlineeLinesSubsectionRef::initialize(BinaryStreamReader Reader) {
+ if (auto EC = Reader.readEnum(Signature))
+ return EC;
+
+ Lines.getExtractor().HasExtraFiles = hasExtraFiles();
+ if (auto EC = Reader.readArray(Lines, Reader.bytesRemaining()))
+ return EC;
+
+ assert(Reader.bytesRemaining() == 0);
+ return Error::success();
+}
+
+bool DebugInlineeLinesSubsectionRef::hasExtraFiles() const {
+ return Signature == InlineeLinesSignature::ExtraFiles;
+}
+
+DebugInlineeLinesSubsection::DebugInlineeLinesSubsection(
+ DebugChecksumsSubsection &Checksums, bool HasExtraFiles)
+ : DebugSubsection(DebugSubsectionKind::InlineeLines), Checksums(Checksums),
+ HasExtraFiles(HasExtraFiles) {}
+
+uint32_t DebugInlineeLinesSubsection::calculateSerializedSize() const {
+ // 4 bytes for the signature
+ uint32_t Size = sizeof(InlineeLinesSignature);
+
+ // one header for each entry.
+ Size += Entries.size() * sizeof(InlineeSourceLineHeader);
+ if (HasExtraFiles) {
+ // If extra files are enabled, one count for each entry.
+ Size += Entries.size() * sizeof(uint32_t);
+
+ // And one file id for each file.
+ Size += ExtraFileCount * sizeof(uint32_t);
+ }
+ assert(Size % 4 == 0);
+ return Size;
+}
+
+Error DebugInlineeLinesSubsection::commit(BinaryStreamWriter &Writer) const {
+ InlineeLinesSignature Sig = InlineeLinesSignature::Normal;
+ if (HasExtraFiles)
+ Sig = InlineeLinesSignature::ExtraFiles;
+
+ if (auto EC = Writer.writeEnum(Sig))
+ return EC;
+
+ for (const auto &E : Entries) {
+ if (auto EC = Writer.writeObject(E.Header))
+ return EC;
+
+ if (!HasExtraFiles)
+ continue;
+
+ if (auto EC = Writer.writeInteger<uint32_t>(E.ExtraFiles.size()))
+ return EC;
+ if (auto EC = Writer.writeArray(makeArrayRef(E.ExtraFiles)))
+ return EC;
+ }
+
+ return Error::success();
+}
+
+void DebugInlineeLinesSubsection::addExtraFile(StringRef FileName) {
+ uint32_t Offset = Checksums.mapChecksumOffset(FileName);
+
+ auto &Entry = Entries.back();
+ Entry.ExtraFiles.push_back(ulittle32_t(Offset));
+ ++ExtraFileCount;
+}
+
+void DebugInlineeLinesSubsection::addInlineSite(TypeIndex FuncId,
+ StringRef FileName,
+ uint32_t SourceLine) {
+ uint32_t Offset = Checksums.mapChecksumOffset(FileName);
+
+ Entries.emplace_back();
+ auto &Entry = Entries.back();
+ Entry.Header.FileID = Offset;
+ Entry.Header.SourceLineNum = SourceLine;
+ Entry.Header.Inlinee = FuncId;
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp
new file mode 100644
index 0000000..57ad408
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp
@@ -0,0 +1,161 @@
+//===- DebugLinesSubsection.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Error.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error LineColumnExtractor::operator()(BinaryStreamRef Stream, uint32_t &Len,
+ LineColumnEntry &Item) {
+ const LineBlockFragmentHeader *BlockHeader;
+ BinaryStreamReader Reader(Stream);
+ if (auto EC = Reader.readObject(BlockHeader))
+ return EC;
+ bool HasColumn = Header->Flags & uint16_t(LF_HaveColumns);
+ uint32_t LineInfoSize =
+ BlockHeader->NumLines *
+ (sizeof(LineNumberEntry) + (HasColumn ? sizeof(ColumnNumberEntry) : 0));
+ if (BlockHeader->BlockSize < sizeof(LineBlockFragmentHeader))
+ return make_error<CodeViewError>(cv_error_code::corrupt_record,
+ "Invalid line block record size");
+ uint32_t Size = BlockHeader->BlockSize - sizeof(LineBlockFragmentHeader);
+ if (LineInfoSize > Size)
+ return make_error<CodeViewError>(cv_error_code::corrupt_record,
+ "Invalid line block record size");
+ // The value recorded in BlockHeader->BlockSize includes the size of
+ // LineBlockFragmentHeader.
+ Len = BlockHeader->BlockSize;
+ Item.NameIndex = BlockHeader->NameIndex;
+ if (auto EC = Reader.readArray(Item.LineNumbers, BlockHeader->NumLines))
+ return EC;
+ if (HasColumn) {
+ if (auto EC = Reader.readArray(Item.Columns, BlockHeader->NumLines))
+ return EC;
+ }
+ return Error::success();
+}
+
+DebugLinesSubsectionRef::DebugLinesSubsectionRef()
+ : DebugSubsectionRef(DebugSubsectionKind::Lines) {}
+
+Error DebugLinesSubsectionRef::initialize(BinaryStreamReader Reader) {
+ if (auto EC = Reader.readObject(Header))
+ return EC;
+
+ LinesAndColumns.getExtractor().Header = Header;
+ if (auto EC = Reader.readArray(LinesAndColumns, Reader.bytesRemaining()))
+ return EC;
+
+ return Error::success();
+}
+
+bool DebugLinesSubsectionRef::hasColumnInfo() const {
+ return !!(Header->Flags & LF_HaveColumns);
+}
+
+DebugLinesSubsection::DebugLinesSubsection(DebugChecksumsSubsection &Checksums,
+ DebugStringTableSubsection &Strings)
+ : DebugSubsection(DebugSubsectionKind::Lines), Checksums(Checksums) {}
+
+void DebugLinesSubsection::createBlock(StringRef FileName) {
+ uint32_t Offset = Checksums.mapChecksumOffset(FileName);
+
+ Blocks.emplace_back(Offset);
+}
+
+void DebugLinesSubsection::addLineInfo(uint32_t Offset, const LineInfo &Line) {
+ Block &B = Blocks.back();
+ LineNumberEntry LNE;
+ LNE.Flags = Line.getRawData();
+ LNE.Offset = Offset;
+ B.Lines.push_back(LNE);
+}
+
+void DebugLinesSubsection::addLineAndColumnInfo(uint32_t Offset,
+ const LineInfo &Line,
+ uint32_t ColStart,
+ uint32_t ColEnd) {
+ Block &B = Blocks.back();
+ assert(B.Lines.size() == B.Columns.size());
+
+ addLineInfo(Offset, Line);
+ ColumnNumberEntry CNE;
+ CNE.StartColumn = ColStart;
+ CNE.EndColumn = ColEnd;
+ B.Columns.push_back(CNE);
+}
+
+Error DebugLinesSubsection::commit(BinaryStreamWriter &Writer) const {
+ LineFragmentHeader Header;
+ Header.CodeSize = CodeSize;
+ Header.Flags = hasColumnInfo() ? LF_HaveColumns : 0;
+ Header.RelocOffset = RelocOffset;
+ Header.RelocSegment = RelocSegment;
+
+ if (auto EC = Writer.writeObject(Header))
+ return EC;
+
+ for (const auto &B : Blocks) {
+ LineBlockFragmentHeader BlockHeader;
+ assert(B.Lines.size() == B.Columns.size() || B.Columns.empty());
+
+ BlockHeader.NumLines = B.Lines.size();
+ BlockHeader.BlockSize = sizeof(LineBlockFragmentHeader);
+ BlockHeader.BlockSize += BlockHeader.NumLines * sizeof(LineNumberEntry);
+ if (hasColumnInfo())
+ BlockHeader.BlockSize += BlockHeader.NumLines * sizeof(ColumnNumberEntry);
+ BlockHeader.NameIndex = B.ChecksumBufferOffset;
+ if (auto EC = Writer.writeObject(BlockHeader))
+ return EC;
+
+ if (auto EC = Writer.writeArray(makeArrayRef(B.Lines)))
+ return EC;
+
+ if (hasColumnInfo()) {
+ if (auto EC = Writer.writeArray(makeArrayRef(B.Columns)))
+ return EC;
+ }
+ }
+ return Error::success();
+}
+
+uint32_t DebugLinesSubsection::calculateSerializedSize() const {
+ uint32_t Size = sizeof(LineFragmentHeader);
+ for (const auto &B : Blocks) {
+ Size += sizeof(LineBlockFragmentHeader);
+ Size += B.Lines.size() * sizeof(LineNumberEntry);
+ if (hasColumnInfo())
+ Size += B.Columns.size() * sizeof(ColumnNumberEntry);
+ }
+ return Size;
+}
+
+void DebugLinesSubsection::setRelocationAddress(uint16_t Segment,
+ uint32_t Offset) {
+ RelocOffset = Offset;
+ RelocSegment = Segment;
+}
+
+void DebugLinesSubsection::setCodeSize(uint32_t Size) { CodeSize = Size; }
+
+void DebugLinesSubsection::setFlags(LineFlags Flags) { this->Flags = Flags; }
+
+bool DebugLinesSubsection::hasColumnInfo() const {
+ return Flags & LF_HaveColumns;
+}
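
A usage sketch for the writer side, assuming the DebugLinesSubsection API above; LineInfo(StartLine, EndLine, IsStatement) is taken from LLVM's Line.h and should be treated as an assumption here:

// Illustrative only: one block per contributing source file, one entry
// per (code offset, source line) pair.
void buildLines(llvm::codeview::DebugLinesSubsection &Lines) {
  Lines.setCodeSize(0x40);    // Size of the function's code in bytes.
  Lines.createBlock("a.cpp"); // Looks up the file's checksum offset.
  Lines.addLineInfo(/*Offset=*/0x00,
                    llvm::codeview::LineInfo(1, 1, /*IsStatement=*/true));
  Lines.addLineInfo(/*Offset=*/0x10,
                    llvm::codeview::LineInfo(2, 2, /*IsStatement=*/true));
}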
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
new file mode 100644
index 0000000..d723282
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
@@ -0,0 +1,90 @@
+//===- DebugStringTableSubsection.cpp - CodeView String Table -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+DebugStringTableSubsectionRef::DebugStringTableSubsectionRef()
+ : DebugSubsectionRef(DebugSubsectionKind::StringTable) {}
+
+Error DebugStringTableSubsectionRef::initialize(BinaryStreamRef Contents) {
+ Stream = Contents;
+ return Error::success();
+}
+
+Error DebugStringTableSubsectionRef::initialize(BinaryStreamReader &Reader) {
+ return Reader.readStreamRef(Stream);
+}
+
+Expected<StringRef>
+DebugStringTableSubsectionRef::getString(uint32_t Offset) const {
+ BinaryStreamReader Reader(Stream);
+ Reader.setOffset(Offset);
+ StringRef Result;
+ if (auto EC = Reader.readCString(Result))
+ return std::move(EC);
+ return Result;
+}
+
+DebugStringTableSubsection::DebugStringTableSubsection()
+ : DebugSubsection(DebugSubsectionKind::StringTable) {}
+
+uint32_t DebugStringTableSubsection::insert(StringRef S) {
+ auto P = Strings.insert({S, StringSize});
+
+ // If a given string didn't exist in the string table, we want to increment
+ // the string table size.
+ if (P.second)
+ StringSize += S.size() + 1; // +1 for '\0'
+ return P.first->second;
+}
+
+uint32_t DebugStringTableSubsection::calculateSerializedSize() const {
+ return StringSize;
+}
+
+Error DebugStringTableSubsection::commit(BinaryStreamWriter &Writer) const {
+ uint32_t Begin = Writer.getOffset();
+ uint32_t End = Begin + StringSize;
+
+ // Write a null string at the beginning.
+ if (auto EC = Writer.writeCString(StringRef()))
+ return EC;
+
+ for (auto &Pair : Strings) {
+ StringRef S = Pair.getKey();
+ uint32_t Offset = Begin + Pair.getValue();
+ Writer.setOffset(Offset);
+ if (auto EC = Writer.writeCString(S))
+ return EC;
+ assert(Writer.getOffset() <= End);
+ }
+
+ Writer.setOffset(End);
+ assert((End - Begin) == StringSize);
+ return Error::success();
+}
+
+uint32_t DebugStringTableSubsection::size() const { return Strings.size(); }
+
+uint32_t DebugStringTableSubsection::getStringId(StringRef S) const {
+ auto Iter = Strings.find(S);
+ assert(Iter != Strings.end());
+ return Iter->second;
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp
new file mode 100644
index 0000000..67b428b
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsection.cpp
@@ -0,0 +1,16 @@
+//===- DebugSubsection.cpp -----------------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+
+using namespace llvm::codeview;
+
+DebugSubsectionRef::~DebugSubsectionRef() {}
+
+DebugSubsection::~DebugSubsection() {}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
new file mode 100644
index 0000000..55f343c
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
@@ -0,0 +1,97 @@
+//===- DebugSubsectionRecord.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+DebugSubsectionRecord::DebugSubsectionRecord() = default;
+
+DebugSubsectionRecord::DebugSubsectionRecord(DebugSubsectionKind Kind,
+ BinaryStreamRef Data,
+ CodeViewContainer Container)
+ : Container(Container), Kind(Kind), Data(Data) {}
+
+Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream,
+ DebugSubsectionRecord &Info,
+ CodeViewContainer Container) {
+ const DebugSubsectionHeader *Header;
+ BinaryStreamReader Reader(Stream);
+ if (auto EC = Reader.readObject(Header))
+ return EC;
+
+ DebugSubsectionKind Kind =
+ static_cast<DebugSubsectionKind>(uint32_t(Header->Kind));
+ if (auto EC = Reader.readStreamRef(Info.Data, Header->Length))
+ return EC;
+ Info.Container = Container;
+ Info.Kind = Kind;
+ return Error::success();
+}
+
+uint32_t DebugSubsectionRecord::getRecordLength() const {
+ return sizeof(DebugSubsectionHeader) + Data.getLength();
+}
+
+DebugSubsectionKind DebugSubsectionRecord::kind() const { return Kind; }
+
+BinaryStreamRef DebugSubsectionRecord::getRecordData() const { return Data; }
+
+DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder(
+ std::shared_ptr<DebugSubsection> Subsection, CodeViewContainer Container)
+ : Subsection(std::move(Subsection)), Container(Container) {}
+
+DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder(
+ const DebugSubsectionRecord &Contents, CodeViewContainer Container)
+ : Contents(Contents), Container(Container) {}
+
+uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() {
+ uint32_t DataSize = Subsection ? Subsection->calculateSerializedSize()
+ : Contents.getRecordData().getLength();
+ // The length of the entire subsection is always padded to 4 bytes,
+ // regardless of the container kind.
+ return sizeof(DebugSubsectionHeader) + alignTo(DataSize, 4);
+}
+
+Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer) const {
+ assert(Writer.getOffset() % alignOf(Container) == 0 &&
+ "Debug Subsection not properly aligned");
+
+ DebugSubsectionHeader Header;
+ Header.Kind = uint32_t(Subsection ? Subsection->kind() : Contents.kind());
+ // The value written into the Header's Length field is only padded to the
+ // container's alignment.
+ uint32_t DataSize = Subsection ? Subsection->calculateSerializedSize()
+ : Contents.getRecordData().getLength();
+ Header.Length = alignTo(DataSize, alignOf(Container));
+
+ if (auto EC = Writer.writeObject(Header))
+ return EC;
+ if (Subsection) {
+ if (auto EC = Subsection->commit(Writer))
+ return EC;
+ } else {
+ if (auto EC = Writer.writeStreamRef(Contents.getRecordData()))
+ return EC;
+ }
+ if (auto EC = Writer.padToAlignment(4))
+ return EC;
+
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp
new file mode 100644
index 0000000..9b82433
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp
@@ -0,0 +1,95 @@
+//===- DebugSubsectionVisitor.cpp -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h"
+
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/CodeView/DebugSymbolRVASubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugUnknownSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error llvm::codeview::visitDebugSubsection(
+ const DebugSubsectionRecord &R, DebugSubsectionVisitor &V,
+ const StringsAndChecksumsRef &State) {
+ BinaryStreamReader Reader(R.getRecordData());
+ switch (R.kind()) {
+ case DebugSubsectionKind::Lines: {
+ DebugLinesSubsectionRef Fragment;
+ if (auto EC = Fragment.initialize(Reader))
+ return EC;
+
+ return V.visitLines(Fragment, State);
+ }
+ case DebugSubsectionKind::FileChecksums: {
+ DebugChecksumsSubsectionRef Fragment;
+ if (auto EC = Fragment.initialize(Reader))
+ return EC;
+
+ return V.visitFileChecksums(Fragment, State);
+ }
+ case DebugSubsectionKind::InlineeLines: {
+ DebugInlineeLinesSubsectionRef Fragment;
+ if (auto EC = Fragment.initialize(Reader))
+ return EC;
+ return V.visitInlineeLines(Fragment, State);
+ }
+ case DebugSubsectionKind::CrossScopeExports: {
+ DebugCrossModuleExportsSubsectionRef Section;
+ if (auto EC = Section.initialize(Reader))
+ return EC;
+ return V.visitCrossModuleExports(Section, State);
+ }
+ case DebugSubsectionKind::CrossScopeImports: {
+ DebugCrossModuleImportsSubsectionRef Section;
+ if (auto EC = Section.initialize(Reader))
+ return EC;
+ return V.visitCrossModuleImports(Section, State);
+ }
+ case DebugSubsectionKind::Symbols: {
+ DebugSymbolsSubsectionRef Section;
+ if (auto EC = Section.initialize(Reader))
+ return EC;
+ return V.visitSymbols(Section, State);
+ }
+ case DebugSubsectionKind::StringTable: {
+ DebugStringTableSubsectionRef Section;
+ if (auto EC = Section.initialize(Reader))
+ return EC;
+ return V.visitStringTable(Section, State);
+ }
+ case DebugSubsectionKind::FrameData: {
+ DebugFrameDataSubsectionRef Section;
+ if (auto EC = Section.initialize(Reader))
+ return EC;
+ return V.visitFrameData(Section, State);
+ }
+ case DebugSubsectionKind::CoffSymbolRVA: {
+ DebugSymbolRVASubsectionRef Section;
+ if (auto EC = Section.initialize(Reader))
+ return EC;
+ return V.visitCOFFSymbolRVAs(Section, State);
+ }
+ default: {
+ DebugUnknownSubsectionRef Fragment(R.kind(), R.getRecordData());
+ return V.visitUnknown(Fragment);
+ }
+ }
+}
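
The dispatcher above is typically driven over every record in a module's debug subsection stream. A minimal sketch using only the types named in this file; visitAllSubsections is illustrative:

llvm::Error visitAllSubsections(
    llvm::ArrayRef<llvm::codeview::DebugSubsectionRecord> Records,
    llvm::codeview::DebugSubsectionVisitor &V,
    const llvm::codeview::StringsAndChecksumsRef &State) {
  for (const auto &R : Records)
    if (auto EC = llvm::codeview::visitDebugSubsection(R, V, State))
      return EC;
  return llvm::Error::success();
}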
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp
new file mode 100644
index 0000000..60fbf9d
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolRVASubsection.cpp
@@ -0,0 +1,36 @@
+//===- DebugSymbolRVASubsection.cpp ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSymbolRVASubsection.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+DebugSymbolRVASubsectionRef::DebugSymbolRVASubsectionRef()
+ : DebugSubsectionRef(DebugSubsectionKind::CoffSymbolRVA) {}
+
+Error DebugSymbolRVASubsectionRef::initialize(BinaryStreamReader &Reader) {
+ return Reader.readArray(RVAs, Reader.bytesRemaining() / sizeof(uint32_t));
+}
+
+DebugSymbolRVASubsection::DebugSymbolRVASubsection()
+ : DebugSubsection(DebugSubsectionKind::CoffSymbolRVA) {}
+
+Error DebugSymbolRVASubsection::commit(BinaryStreamWriter &Writer) const {
+ return Writer.writeArray(makeArrayRef(RVAs));
+}
+
+uint32_t DebugSymbolRVASubsection::calculateSerializedSize() const {
+ return RVAs.size() * sizeof(uint32_t);
+}
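
Both directions agree on the layout: the payload is a packed array of little-endian uint32_t RVAs with no header. A quick sanity check of the size math (illustrative values):

    // Writing 3 RVAs: calculateSerializedSize() == 3 * sizeof(uint32_t) == 12.
    // Reading a 12-byte payload: bytesRemaining() / sizeof(uint32_t) == 3 entries,
    // so initialize() recovers exactly the RVAs that commit() wrote.
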
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp
new file mode 100644
index 0000000..dc8ba8c
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp
@@ -0,0 +1,34 @@
+//===- DebugSymbolsSubsection.cpp -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error DebugSymbolsSubsectionRef::initialize(BinaryStreamReader Reader) {
+ return Reader.readArray(Records, Reader.getLength());
+}
+
+uint32_t DebugSymbolsSubsection::calculateSerializedSize() const {
+ return Length;
+}
+
+Error DebugSymbolsSubsection::commit(BinaryStreamWriter &Writer) const {
+ for (const auto &Record : Records) {
+ if (auto EC = Writer.writeBytes(Record.RecordData))
+ return EC;
+ }
+ return Error::success();
+}
+
+void DebugSymbolsSubsection::addSymbol(CVSymbol Symbol) {
+ Records.push_back(Symbol);
+ Length += Symbol.length();
+}
\ No newline at end of file
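
Note the bookkeeping: addSymbol() accumulates each record's full length, so calculateSerializedSize() is the running total and commit() simply replays the raw record bytes. A sketch of the write path, assuming an already-positioned BinaryStreamWriter and a hypothetical range of records:

    DebugSymbolsSubsection Symbols;
    for (CVSymbol S : SomeRecords)      // SomeRecords is illustrative
      Symbols.addSymbol(S);
    uint32_t Size = Symbols.calculateSerializedSize();  // sum of record lengths
    if (auto EC = Symbols.commit(Writer))               // writes Size bytes
      return EC;
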
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/EnumTables.cpp b/contrib/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
index 0e20bcb..4cfb55a 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
@@ -1,4 +1,4 @@
-//===- EnumTables.cpp - Enum to string conversion tables --------*- C++ -*-===//
+//===- EnumTables.cpp - Enum to string conversion tables ------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,6 +8,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/CodeView/EnumTables.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include <type_traits>
using namespace llvm;
using namespace codeview;
@@ -20,13 +22,13 @@ using namespace codeview;
static const EnumEntry<SymbolKind> SymbolTypeNames[] = {
#define CV_SYMBOL(enum, val) {#enum, enum},
-#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
#undef CV_SYMBOL
};
static const EnumEntry<TypeLeafKind> TypeLeafNames[] = {
#define CV_TYPE(name, val) {#name, name},
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
#undef CV_TYPE
};
@@ -82,6 +84,13 @@ static const EnumEntry<uint16_t> RegisterNames[] = {
CV_ENUM_CLASS_ENT(RegisterId, R15),
};
+static const EnumEntry<uint32_t> PublicSymFlagNames[] = {
+ CV_ENUM_CLASS_ENT(PublicSymFlags, Code),
+ CV_ENUM_CLASS_ENT(PublicSymFlags, Function),
+ CV_ENUM_CLASS_ENT(PublicSymFlags, Managed),
+ CV_ENUM_CLASS_ENT(PublicSymFlags, MSIL),
+};
+
static const EnumEntry<uint8_t> ProcSymFlagNames[] = {
CV_ENUM_CLASS_ENT(ProcSymFlags, HasFP),
CV_ENUM_CLASS_ENT(ProcSymFlags, HasIRET),
@@ -245,20 +254,20 @@ static const EnumEntry<uint32_t> FrameProcSymFlagNames[] = {
};
static const EnumEntry<uint32_t> ModuleSubstreamKindNames[] = {
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, None),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, Symbols),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, Lines),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, StringTable),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, FileChecksums),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, FrameData),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, InlineeLines),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, CrossScopeImports),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, CrossScopeExports),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, ILLines),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, FuncMDTokenMap),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, TypeMDTokenMap),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, MergedAssemblyInput),
- CV_ENUM_CLASS_ENT(ModuleSubstreamKind, CoffSymbolRVA),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, None),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, Symbols),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, Lines),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, StringTable),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, FileChecksums),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, FrameData),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, InlineeLines),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, CrossScopeImports),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, CrossScopeExports),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, ILLines),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, FuncMDTokenMap),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, TypeMDTokenMap),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, MergedAssemblyInput),
+ CV_ENUM_CLASS_ENT(DebugSubsectionKind, CoffSymbolRVA),
};
static const EnumEntry<uint16_t> ExportSymFlagNames[] = {
@@ -326,6 +335,7 @@ static const EnumEntry<COFF::SectionCharacteristics>
namespace llvm {
namespace codeview {
+
ArrayRef<EnumEntry<SymbolKind>> getSymbolTypeNames() {
return makeArrayRef(SymbolTypeNames);
}
@@ -338,48 +348,66 @@ ArrayRef<EnumEntry<uint16_t>> getRegisterNames() {
return makeArrayRef(RegisterNames);
}
+ArrayRef<EnumEntry<uint32_t>> getPublicSymFlagNames() {
+ return makeArrayRef(PublicSymFlagNames);
+}
+
ArrayRef<EnumEntry<uint8_t>> getProcSymFlagNames() {
return makeArrayRef(ProcSymFlagNames);
}
+
ArrayRef<EnumEntry<uint16_t>> getLocalFlagNames() {
return makeArrayRef(LocalFlags);
}
+
ArrayRef<EnumEntry<uint8_t>> getFrameCookieKindNames() {
return makeArrayRef(FrameCookieKinds);
}
+
ArrayRef<EnumEntry<SourceLanguage>> getSourceLanguageNames() {
return makeArrayRef(SourceLanguages);
}
+
ArrayRef<EnumEntry<uint32_t>> getCompileSym2FlagNames() {
return makeArrayRef(CompileSym2FlagNames);
}
+
ArrayRef<EnumEntry<uint32_t>> getCompileSym3FlagNames() {
return makeArrayRef(CompileSym3FlagNames);
}
+
ArrayRef<EnumEntry<uint32_t>> getFileChecksumNames() {
return makeArrayRef(FileChecksumNames);
}
+
ArrayRef<EnumEntry<unsigned>> getCPUTypeNames() {
return makeArrayRef(CPUTypeNames);
}
+
ArrayRef<EnumEntry<uint32_t>> getFrameProcSymFlagNames() {
return makeArrayRef(FrameProcSymFlagNames);
}
+
ArrayRef<EnumEntry<uint16_t>> getExportSymFlagNames() {
return makeArrayRef(ExportSymFlagNames);
}
+
ArrayRef<EnumEntry<uint32_t>> getModuleSubstreamKindNames() {
return makeArrayRef(ModuleSubstreamKindNames);
}
+
ArrayRef<EnumEntry<uint8_t>> getThunkOrdinalNames() {
return makeArrayRef(ThunkOrdinalNames);
}
+
ArrayRef<EnumEntry<uint16_t>> getTrampolineNames() {
return makeArrayRef(TrampolineNames);
}
+
ArrayRef<EnumEntry<COFF::SectionCharacteristics>>
getImageSectionCharacteristicNames() {
return makeArrayRef(ImageSectionCharacteristicNames);
}
-}
-}
+
+} // end namespace codeview
+} // end namespace llvm
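
These tables pair with ScopedPrinter (hence the new include): each accessor returns an ArrayRef<EnumEntry<...>> that printEnum/printFlags use to render raw values symbolically, as the symbol dumper does further down. A sketch, assuming a ScopedPrinter W:

    // Scalar enums print as "Name (0xNN)":
    W.printEnum("Kind", unsigned(Kind), getSymbolTypeNames());
    // Flag words decompose into their set bits:
    W.printFlags("Flags", uint32_t(Flags), getPublicSymFlagNames());
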
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/Formatters.cpp b/contrib/llvm/lib/DebugInfo/CodeView/Formatters.cpp
new file mode 100644
index 0000000..b8d89c7
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/Formatters.cpp
@@ -0,0 +1,48 @@
+//===- Formatters.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/Formatters.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/GUID.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::codeview::detail;
+
+GuidAdapter::GuidAdapter(StringRef Guid)
+ : FormatAdapter(makeArrayRef(Guid.bytes_begin(), Guid.bytes_end())) {}
+
+GuidAdapter::GuidAdapter(ArrayRef<uint8_t> Guid)
+ : FormatAdapter(std::move(Guid)) {}
+
+void GuidAdapter::format(raw_ostream &Stream, StringRef Style) {
+ static const char *Lookup = "0123456789ABCDEF";
+
+ assert(Item.size() == 16 && "Expected 16-byte GUID");
+ Stream << "{";
+ for (int i = 0; i < 16;) {
+ uint8_t Byte = Item[i];
+ uint8_t HighNibble = (Byte >> 4) & 0xF;
+ uint8_t LowNibble = Byte & 0xF;
+ Stream << Lookup[HighNibble] << Lookup[LowNibble];
+ ++i;
+ if (i >= 4 && i <= 10 && i % 2 == 0)
+ Stream << "-";
+ }
+ Stream << "}";
+}
+
+raw_ostream &llvm::codeview::operator<<(raw_ostream &OS, const GUID &Guid) {
+ codeview::detail::GuidAdapter A(Guid.Guid);
+ A.format(OS, "");
+ return OS;
+}
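
The dash placement (after byte indices 4, 6, 8, and 10) produces the usual 8-4-4-4-12 grouping. For example, the 16 bytes 00 01 02 ... 0F format as:

    {00010203-0405-0607-0809-0A0B0C0D0E0F}

Note the bytes are emitted in memory order; no per-field byte swapping is performed here.
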
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
new file mode 100644
index 0000000..5aaf3f1
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -0,0 +1,252 @@
+//===- LazyRandomTypeCollection.cpp ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/TypeName.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+static void error(Error &&EC) {
+ assert(!static_cast<bool>(EC));
+ if (EC)
+ consumeError(std::move(EC));
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(uint32_t RecordCountHint)
+ : LazyRandomTypeCollection(CVTypeArray(), RecordCountHint,
+ PartialOffsetArray()) {}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(
+ const CVTypeArray &Types, uint32_t RecordCountHint,
+ PartialOffsetArray PartialOffsets)
+ : NameStorage(Allocator), Types(Types), PartialOffsets(PartialOffsets) {
+ Records.resize(RecordCountHint);
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(ArrayRef<uint8_t> Data,
+ uint32_t RecordCountHint)
+ : LazyRandomTypeCollection(RecordCountHint) {
+ reset(Data, RecordCountHint);
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(StringRef Data,
+ uint32_t RecordCountHint)
+ : LazyRandomTypeCollection(
+ makeArrayRef(Data.bytes_begin(), Data.bytes_end()), RecordCountHint) {
+}
+
+LazyRandomTypeCollection::LazyRandomTypeCollection(const CVTypeArray &Types,
+ uint32_t NumRecords)
+ : LazyRandomTypeCollection(Types, NumRecords, PartialOffsetArray()) {}
+
+void LazyRandomTypeCollection::reset(StringRef Data, uint32_t RecordCountHint) {
+ Count = 0;
+ PartialOffsets = PartialOffsetArray();
+
+ BinaryStreamReader Reader(Data, support::little);
+ error(Reader.readArray(Types, Reader.getLength()));
+
+ // Clear and then resize, to make sure existing data gets destroyed.
+ Records.clear();
+ Records.resize(RecordCountHint);
+}
+
+void LazyRandomTypeCollection::reset(ArrayRef<uint8_t> Data,
+ uint32_t RecordCountHint) {
+ reset(toStringRef(Data), RecordCountHint);
+}
+
+uint32_t LazyRandomTypeCollection::getOffsetOfType(TypeIndex Index) {
+ error(ensureTypeExists(Index));
+ assert(contains(Index));
+
+ return Records[Index.toArrayIndex()].Offset;
+}
+
+CVType LazyRandomTypeCollection::getType(TypeIndex Index) {
+ error(ensureTypeExists(Index));
+ assert(contains(Index));
+
+ return Records[Index.toArrayIndex()].Type;
+}
+
+StringRef LazyRandomTypeCollection::getTypeName(TypeIndex Index) {
+ if (Index.isNoneType() || Index.isSimple())
+ return TypeIndex::simpleTypeName(Index);
+
+ // Try to make sure the type exists. Even if it doesn't though, it may be
+ // because we're dumping a symbol stream with no corresponding type stream
+ // present, in which case we still want to be able to print <unknown UDT>
+ // for the type names.
+ if (auto EC = ensureTypeExists(Index)) {
+ consumeError(std::move(EC));
+ return "<unknown UDT>";
+ }
+
+ uint32_t I = Index.toArrayIndex();
+ ensureCapacityFor(Index);
+ if (Records[I].Name.data() == nullptr) {
+ StringRef Result = NameStorage.save(computeTypeName(*this, Index));
+ Records[I].Name = Result;
+ }
+ return Records[I].Name;
+}
+
+bool LazyRandomTypeCollection::contains(TypeIndex Index) {
+ if (Records.size() <= Index.toArrayIndex())
+ return false;
+ if (!Records[Index.toArrayIndex()].Type.valid())
+ return false;
+ return true;
+}
+
+uint32_t LazyRandomTypeCollection::size() { return Count; }
+
+uint32_t LazyRandomTypeCollection::capacity() { return Records.size(); }
+
+Error LazyRandomTypeCollection::ensureTypeExists(TypeIndex TI) {
+ if (contains(TI))
+ return Error::success();
+
+ return visitRangeForType(TI);
+}
+
+void LazyRandomTypeCollection::ensureCapacityFor(TypeIndex Index) {
+ uint32_t MinSize = Index.toArrayIndex() + 1;
+
+ if (MinSize <= capacity())
+ return;
+
+ uint32_t NewCapacity = MinSize * 3 / 2;
+
+ assert(NewCapacity > capacity());
+ Records.resize(NewCapacity);
+}
+
+Error LazyRandomTypeCollection::visitRangeForType(TypeIndex TI) {
+ if (PartialOffsets.empty())
+ return fullScanForType(TI);
+
+ auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI,
+ [](TypeIndex Value, const TypeIndexOffset &IO) {
+ return Value < IO.Type;
+ });
+
+ assert(Next != PartialOffsets.begin());
+ auto Prev = std::prev(Next);
+
+ TypeIndex TIB = Prev->Type;
+ if (contains(TIB)) {
+ // They've asked us to fetch a type index, but the entry we found in the
+ // partial offsets array has already been visited. Since we visit an entire
+ // block every time, that means this record should have been previously
+ // discovered. Ultimately, this means this is a request for a non-existent
+ // type index.
+ return make_error<CodeViewError>("Invalid type index");
+ }
+
+ TypeIndex TIE;
+ if (Next == PartialOffsets.end()) {
+ TIE = TypeIndex::fromArrayIndex(capacity());
+ } else {
+ TIE = Next->Type;
+ }
+
+ visitRange(TIB, Prev->Offset, TIE);
+ return Error::success();
+}
+
+Optional<TypeIndex> LazyRandomTypeCollection::getFirst() {
+ TypeIndex TI = TypeIndex::fromArrayIndex(0);
+ if (auto EC = ensureTypeExists(TI)) {
+ consumeError(std::move(EC));
+ return None;
+ }
+ return TI;
+}
+
+Optional<TypeIndex> LazyRandomTypeCollection::getNext(TypeIndex Prev) {
+ // We can't be sure how long this type stream is, since the initial count
+ // passed to the constructor is just a hint. So just try to make sure the next
+ // record exists, and if anything goes wrong, we must be at the end.
+ if (auto EC = ensureTypeExists(Prev + 1)) {
+ consumeError(std::move(EC));
+ return None;
+ }
+
+ return Prev + 1;
+}
+
+Error LazyRandomTypeCollection::fullScanForType(TypeIndex TI) {
+ assert(PartialOffsets.empty());
+
+ TypeIndex CurrentTI = TypeIndex::fromArrayIndex(0);
+ auto Begin = Types.begin();
+
+ if (Count > 0) {
+ // If we don't know how many records are in the type stream, the count we
+ // were constructed with is only a hint, so a lookup can trigger a full
+ // scan after which additional records may still be appended; looking up
+ // one of those would then trigger yet another full scan from the start.
+ // To avoid rescanning, we assume that if the database already has some
+ // records, this is what's going on, and that the requested index is
+ // larger than the largest type index we've visited, so we start from
+ // there and scan forward.
+ uint32_t Offset = Records[LargestTypeIndex.toArrayIndex()].Offset;
+ CurrentTI = LargestTypeIndex + 1;
+ Begin = Types.at(Offset);
+ ++Begin;
+ }
+
+ auto End = Types.end();
+ while (Begin != End) {
+ ensureCapacityFor(CurrentTI);
+ LargestTypeIndex = std::max(LargestTypeIndex, CurrentTI);
+ auto Idx = CurrentTI.toArrayIndex();
+ Records[Idx].Type = *Begin;
+ Records[Idx].Offset = Begin.offset();
+ ++Count;
+ ++Begin;
+ ++CurrentTI;
+ }
+ if (CurrentTI <= TI) {
+ return make_error<CodeViewError>("Type Index does not exist!");
+ }
+ return Error::success();
+}
+
+void LazyRandomTypeCollection::visitRange(TypeIndex Begin, uint32_t BeginOffset,
+ TypeIndex End) {
+ auto RI = Types.at(BeginOffset);
+ assert(RI != Types.end());
+
+ ensureCapacityFor(End);
+ while (Begin != End) {
+ LargestTypeIndex = std::max(LargestTypeIndex, Begin);
+ auto Idx = Begin.toArrayIndex();
+ Records[Idx].Type = *RI;
+ Records[Idx].Offset = RI.offset();
+ ++Count;
+ ++Begin;
+ ++RI;
+ }
+}
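
Putting the pieces together, a caller walks the collection lazily with getFirst()/getNext(); records are materialized on demand by ensureTypeExists(), so the cost is proportional to how far the iteration reaches. A sketch (TypeData is a hypothetical ArrayRef<uint8_t> holding the type stream):

    LazyRandomTypeCollection Types(TypeData, /*RecordCountHint=*/1000);
    for (Optional<TypeIndex> TI = Types.getFirst(); TI; TI = Types.getNext(*TI)) {
      CVType Record = Types.getType(*TI);
      StringRef Name = Types.getTypeName(*TI);
      // ... consume Record / Name ...
    }
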
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstream.cpp b/contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstream.cpp
deleted file mode 100644
index 768ebaa..0000000
--- a/contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstream.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-//===- ModuleSubstream.cpp --------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/ModuleSubstream.h"
-
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::msf;
-
-ModuleSubstream::ModuleSubstream() : Kind(ModuleSubstreamKind::None) {}
-
-ModuleSubstream::ModuleSubstream(ModuleSubstreamKind Kind,
- ReadableStreamRef Data)
- : Kind(Kind), Data(Data) {}
-
-Error ModuleSubstream::initialize(ReadableStreamRef Stream,
- ModuleSubstream &Info) {
- const ModuleSubsectionHeader *Header;
- StreamReader Reader(Stream);
- if (auto EC = Reader.readObject(Header))
- return EC;
-
- ModuleSubstreamKind Kind =
- static_cast<ModuleSubstreamKind>(uint32_t(Header->Kind));
- if (auto EC = Reader.readStreamRef(Info.Data, Header->Length))
- return EC;
- Info.Kind = Kind;
- return Error::success();
-}
-
-uint32_t ModuleSubstream::getRecordLength() const {
- return sizeof(ModuleSubsectionHeader) + Data.getLength();
-}
-
-ModuleSubstreamKind ModuleSubstream::getSubstreamKind() const { return Kind; }
-
-ReadableStreamRef ModuleSubstream::getRecordData() const { return Data; }
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp b/contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
deleted file mode 100644
index 5247932..0000000
--- a/contrib/llvm/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//===- ModuleSubstreamVisitor.cpp -------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::msf;
-
-Error IModuleSubstreamVisitor::visitSymbols(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::Symbols, Data);
-}
-Error IModuleSubstreamVisitor::visitLines(ReadableStreamRef Data,
- const LineSubstreamHeader *Header,
- const LineInfoArray &Lines) {
- return visitUnknown(ModuleSubstreamKind::Lines, Data);
-}
-Error IModuleSubstreamVisitor::visitStringTable(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::StringTable, Data);
-}
-Error IModuleSubstreamVisitor::visitFileChecksums(
- ReadableStreamRef Data, const FileChecksumArray &Checksums) {
- return visitUnknown(ModuleSubstreamKind::FileChecksums, Data);
-}
-Error IModuleSubstreamVisitor::visitFrameData(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::FrameData, Data);
-}
-Error IModuleSubstreamVisitor::visitInlineeLines(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::InlineeLines, Data);
-}
-Error IModuleSubstreamVisitor::visitCrossScopeImports(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::CrossScopeExports, Data);
-}
-Error IModuleSubstreamVisitor::visitCrossScopeExports(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::CrossScopeImports, Data);
-}
-Error IModuleSubstreamVisitor::visitILLines(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::ILLines, Data);
-}
-Error IModuleSubstreamVisitor::visitFuncMDTokenMap(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::FuncMDTokenMap, Data);
-}
-Error IModuleSubstreamVisitor::visitTypeMDTokenMap(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::TypeMDTokenMap, Data);
-}
-Error IModuleSubstreamVisitor::visitMergedAssemblyInput(
- ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::MergedAssemblyInput, Data);
-}
-Error IModuleSubstreamVisitor::visitCoffSymbolRVA(ReadableStreamRef Data) {
- return visitUnknown(ModuleSubstreamKind::CoffSymbolRVA, Data);
-}
-
-Error llvm::codeview::visitModuleSubstream(const ModuleSubstream &R,
- IModuleSubstreamVisitor &V) {
- switch (R.getSubstreamKind()) {
- case ModuleSubstreamKind::Symbols:
- return V.visitSymbols(R.getRecordData());
- case ModuleSubstreamKind::Lines: {
- StreamReader Reader(R.getRecordData());
- const LineSubstreamHeader *Header;
- if (auto EC = Reader.readObject(Header))
- return EC;
- VarStreamArrayExtractor<LineColumnEntry> E(Header);
- LineInfoArray LineInfos(E);
- if (auto EC = Reader.readArray(LineInfos, Reader.bytesRemaining()))
- return EC;
- return V.visitLines(R.getRecordData(), Header, LineInfos);
- }
- case ModuleSubstreamKind::StringTable:
- return V.visitStringTable(R.getRecordData());
- case ModuleSubstreamKind::FileChecksums: {
- StreamReader Reader(R.getRecordData());
- FileChecksumArray Checksums;
- if (auto EC = Reader.readArray(Checksums, Reader.bytesRemaining()))
- return EC;
- return V.visitFileChecksums(R.getRecordData(), Checksums);
- }
- case ModuleSubstreamKind::FrameData:
- return V.visitFrameData(R.getRecordData());
- case ModuleSubstreamKind::InlineeLines:
- return V.visitInlineeLines(R.getRecordData());
- case ModuleSubstreamKind::CrossScopeImports:
- return V.visitCrossScopeImports(R.getRecordData());
- case ModuleSubstreamKind::CrossScopeExports:
- return V.visitCrossScopeExports(R.getRecordData());
- case ModuleSubstreamKind::ILLines:
- return V.visitILLines(R.getRecordData());
- case ModuleSubstreamKind::FuncMDTokenMap:
- return V.visitFuncMDTokenMap(R.getRecordData());
- case ModuleSubstreamKind::TypeMDTokenMap:
- return V.visitTypeMDTokenMap(R.getRecordData());
- case ModuleSubstreamKind::MergedAssemblyInput:
- return V.visitMergedAssemblyInput(R.getRecordData());
- case ModuleSubstreamKind::CoffSymbolRVA:
- return V.visitCoffSymbolRVA(R.getRecordData());
- default:
- return V.visitUnknown(R.getSubstreamKind(), R.getRecordData());
- }
-}
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp b/contrib/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp
index 6f29caa..6446670 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/RecordSerialization.cpp
@@ -16,7 +16,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/DebugInfo/CodeView/CodeViewError.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -33,7 +33,7 @@ StringRef llvm::codeview::getBytesAsCString(ArrayRef<uint8_t> LeafData) {
return getBytesAsCharacters(LeafData).split('\0').first;
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, APSInt &Num) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, APSInt &Num) {
// Used to avoid overload ambiguity on the APInt constructor.
bool FalseVal = false;
uint16_t Short;
@@ -103,15 +103,15 @@ Error llvm::codeview::consume(msf::StreamReader &Reader, APSInt &Num) {
Error llvm::codeview::consume(StringRef &Data, APSInt &Num) {
ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
- msf::ByteStream S(Bytes);
- msf::StreamReader SR(S);
+ BinaryByteStream S(Bytes, llvm::support::little);
+ BinaryStreamReader SR(S);
auto EC = consume(SR, Num);
Data = Data.take_back(SR.bytesRemaining());
return EC;
}
/// Decode a numeric leaf value that is known to be a uint64_t.
-Error llvm::codeview::consume_numeric(msf::StreamReader &Reader,
+Error llvm::codeview::consume_numeric(BinaryStreamReader &Reader,
uint64_t &Num) {
APSInt N;
if (auto EC = consume(Reader, N))
@@ -123,27 +123,27 @@ Error llvm::codeview::consume_numeric(msf::StreamReader &Reader,
return Error::success();
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, uint32_t &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, uint32_t &Item) {
return Reader.readInteger(Item);
}
Error llvm::codeview::consume(StringRef &Data, uint32_t &Item) {
ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
- msf::ByteStream S(Bytes);
- msf::StreamReader SR(S);
+ BinaryByteStream S(Bytes, llvm::support::little);
+ BinaryStreamReader SR(S);
auto EC = consume(SR, Item);
Data = Data.take_back(SR.bytesRemaining());
return EC;
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, int32_t &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, int32_t &Item) {
return Reader.readInteger(Item);
}
-Error llvm::codeview::consume(msf::StreamReader &Reader, StringRef &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, StringRef &Item) {
if (Reader.empty())
return make_error<CodeViewError>(cv_error_code::corrupt_record,
"Null terminated string buffer is empty!");
- return Reader.readZeroString(Item);
+ return Reader.readCString(Item);
}
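
For reference, consume() decodes a CodeView numeric leaf: a 16-bit value below 0x8000 stands for itself, while values at or above 0x8000 are leaf kinds (LF_CHAR, LF_SHORT, LF_ULONG, ...) announcing a payload of the corresponding width. A sketch using the StringRef overload above:

    // Two little-endian bytes 0x2A 0x00 encode the immediate value 42.
    StringRef Data("\x2a\x00", 2);
    APSInt Num;
    if (auto EC = consume(Data, Num))
      return EC;        // corrupt or truncated leaf
    // Num == 42, and Data now points past the consumed bytes.
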
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp b/contrib/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp
new file mode 100644
index 0000000..306af1d
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/StringsAndChecksums.cpp
@@ -0,0 +1,59 @@
+//===- StringsAndChecksums.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/Support/Error.h"
+#include <cassert>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+StringsAndChecksumsRef::StringsAndChecksumsRef() = default;
+
+StringsAndChecksumsRef::StringsAndChecksumsRef(
+ const DebugStringTableSubsectionRef &Strings)
+ : Strings(&Strings) {}
+
+StringsAndChecksumsRef::StringsAndChecksumsRef(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums)
+ : Strings(&Strings), Checksums(&Checksums) {}
+
+void StringsAndChecksumsRef::initializeStrings(
+ const DebugSubsectionRecord &SR) {
+ assert(SR.kind() == DebugSubsectionKind::StringTable);
+ assert(!Strings && "Found a string table even though we already have one!");
+
+ OwnedStrings = llvm::make_unique<DebugStringTableSubsectionRef>();
+ consumeError(OwnedStrings->initialize(SR.getRecordData()));
+ Strings = OwnedStrings.get();
+}
+
+void StringsAndChecksumsRef::setChecksums(
+ const DebugChecksumsSubsectionRef &CS) {
+ OwnedChecksums = llvm::make_unique<DebugChecksumsSubsectionRef>();
+ *OwnedChecksums = CS;
+ Checksums = OwnedChecksums.get();
+}
+
+void StringsAndChecksumsRef::initializeChecksums(
+ const DebugSubsectionRecord &FCR) {
+ assert(FCR.kind() == DebugSubsectionKind::FileChecksums);
+ if (Checksums)
+ return;
+
+ OwnedChecksums = llvm::make_unique<DebugChecksumsSubsectionRef>();
+ consumeError(OwnedChecksums->initialize(FCR.getRecordData()));
+ Checksums = OwnedChecksums.get();
+}
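
The point of this helper is ordering: file-checksum entries refer into the string table, and several other subsections refer into the checksums, so both must be located before the rest are visited. A typical driver therefore makes two passes (Subsections being a hypothetical range of DebugSubsectionRecords):

    StringsAndChecksumsRef State;
    // Pass 1: find the cross-referenced subsections first.
    for (const DebugSubsectionRecord &SR : Subsections) {
      if (SR.kind() == DebugSubsectionKind::StringTable)
        State.initializeStrings(SR);
      else if (SR.kind() == DebugSubsectionKind::FileChecksums)
        State.initializeChecksums(SR);
    }
    // Pass 2: visit everything with the shared state available.
    for (const DebugSubsectionRecord &SR : Subsections)
      if (auto EC = visitDebugSubsection(SR, V, State))
        return EC;
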
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp b/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp
index fd54fba..62e73ac 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -11,7 +11,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
#include "llvm/DebugInfo/CodeView/EnumTables.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h"
@@ -32,16 +32,16 @@ namespace {
/// the visitor out of SymbolDumper.h.
class CVSymbolDumperImpl : public SymbolVisitorCallbacks {
public:
- CVSymbolDumperImpl(TypeDatabase &TypeDB, SymbolDumpDelegate *ObjDelegate,
+ CVSymbolDumperImpl(TypeCollection &Types, SymbolDumpDelegate *ObjDelegate,
ScopedPrinter &W, bool PrintRecordBytes)
- : TypeDB(TypeDB), ObjDelegate(ObjDelegate), W(W),
+ : Types(Types), ObjDelegate(ObjDelegate), W(W),
PrintRecordBytes(PrintRecordBytes), InFunctionScope(false) {}
/// CVSymbolVisitor overrides.
#define SYMBOL_RECORD(EnumName, EnumVal, Name) \
Error visitKnownRecord(CVSymbol &CVR, Name &Record) override;
#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/CVSymbolTypes.def"
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
Error visitSymbolBegin(CVSymbol &Record) override;
Error visitSymbolEnd(CVSymbol &Record) override;
@@ -53,7 +53,7 @@ private:
void printLocalVariableAddrGap(ArrayRef<LocalVariableAddrGap> Gaps);
void printTypeIndex(StringRef FieldName, TypeIndex TI);
- TypeDatabase &TypeDB;
+ TypeCollection &Types;
SymbolDumpDelegate *ObjDelegate;
ScopedPrinter &W;
@@ -62,6 +62,18 @@ private:
};
}
+static StringRef getSymbolKindName(SymbolKind Kind) {
+ switch (Kind) {
+#define SYMBOL_RECORD(EnumName, EnumVal, Name) \
+ case EnumName: \
+ return #Name;
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
+ default:
+ break;
+ }
+ return "UnknownSym";
+}
+
void CVSymbolDumperImpl::printLocalVariableAddrRange(
const LocalVariableAddrRange &Range, uint32_t RelocationOffset) {
DictScope S(W, "LocalVariableAddrRange");
@@ -82,22 +94,27 @@ void CVSymbolDumperImpl::printLocalVariableAddrGap(
}
void CVSymbolDumperImpl::printTypeIndex(StringRef FieldName, TypeIndex TI) {
- CVTypeDumper::printTypeIndex(W, FieldName, TI, TypeDB);
+ codeview::printTypeIndex(W, FieldName, TI, Types);
}
Error CVSymbolDumperImpl::visitSymbolBegin(CVSymbol &CVR) {
+ W.startLine() << getSymbolKindName(CVR.Type);
+ W.getOStream() << " {\n";
+ W.indent();
+ W.printEnum("Kind", unsigned(CVR.Type), getSymbolTypeNames());
return Error::success();
}
Error CVSymbolDumperImpl::visitSymbolEnd(CVSymbol &CVR) {
if (PrintRecordBytes && ObjDelegate)
ObjDelegate->printBinaryBlockWithRelocs("SymData", CVR.content());
+
+ W.unindent();
+ W.startLine() << "}\n";
return Error::success();
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, BlockSym &Block) {
- DictScope S(W, "BlockStart");
-
StringRef LinkageName;
W.printHex("PtrParent", Block.Parent);
W.printHex("PtrEnd", Block.End);
@@ -113,7 +130,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, BlockSym &Block) {
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Thunk32Sym &Thunk) {
- DictScope S(W, "Thunk32");
W.printNumber("Parent", Thunk.Parent);
W.printNumber("End", Thunk.End);
W.printNumber("Next", Thunk.Next);
@@ -126,7 +142,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, Thunk32Sym &Thunk) {
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
TrampolineSym &Tramp) {
- DictScope S(W, "Trampoline");
W.printEnum("Type", uint16_t(Tramp.Type), getTrampolineNames());
W.printNumber("Size", Tramp.Size);
W.printNumber("ThunkOff", Tramp.ThunkOffset);
@@ -137,7 +152,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, SectionSym &Section) {
- DictScope S(W, "Section");
W.printNumber("SectionNumber", Section.SectionNumber);
W.printNumber("Alignment", Section.Alignment);
W.printNumber("Rva", Section.Rva);
@@ -152,7 +166,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, SectionSym &Section) {
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
CoffGroupSym &CoffGroup) {
- DictScope S(W, "COFF Group");
W.printNumber("Size", CoffGroup.Size);
W.printFlags("Characteristics", CoffGroup.Characteristics,
getImageSectionCharacteristicNames(),
@@ -165,8 +178,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
BPRelativeSym &BPRel) {
- DictScope S(W, "BPRelativeSym");
-
W.printNumber("Offset", BPRel.Offset);
printTypeIndex("Type", BPRel.Type);
W.printString("VarName", BPRel.Name);
@@ -175,16 +186,12 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
BuildInfoSym &BuildInfo) {
- DictScope S(W, "BuildInfo");
-
- W.printNumber("BuildId", BuildInfo.BuildId);
+ printTypeIndex("BuildId", BuildInfo.BuildId);
return Error::success();
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
CallSiteInfoSym &CallSiteInfo) {
- DictScope S(W, "CallSiteInfo");
-
StringRef LinkageName;
if (ObjDelegate) {
ObjDelegate->printRelocatedField("CodeOffset",
@@ -200,8 +207,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
EnvBlockSym &EnvBlock) {
- DictScope S(W, "EnvBlock");
-
ListScope L(W, "Entries");
for (auto Entry : EnvBlock.Fields) {
W.printString(Entry);
@@ -211,8 +216,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
FileStaticSym &FileStatic) {
- DictScope S(W, "FileStatic");
- W.printNumber("Index", FileStatic.Index);
+ printTypeIndex("Index", FileStatic.Index);
W.printNumber("ModFilenameOffset", FileStatic.ModFilenameOffset);
W.printFlags("Flags", uint16_t(FileStatic.Flags), getLocalFlagNames());
W.printString("Name", FileStatic.Name);
@@ -220,7 +224,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ExportSym &Export) {
- DictScope S(W, "Export");
W.printNumber("Ordinal", Export.Ordinal);
W.printFlags("Flags", uint16_t(Export.Flags), getExportSymFlagNames());
W.printString("Name", Export.Name);
@@ -229,8 +232,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ExportSym &Export) {
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Compile2Sym &Compile2) {
- DictScope S(W, "CompilerFlags2");
-
W.printEnum("Language", Compile2.getLanguage(), getSourceLanguageNames());
W.printFlags("Flags", Compile2.getFlags(), getCompileSym2FlagNames());
W.printEnum("Machine", unsigned(Compile2.Machine), getCPUTypeNames());
@@ -254,8 +255,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Compile3Sym &Compile3) {
- DictScope S(W, "CompilerFlags3");
-
W.printEnum("Language", Compile3.getLanguage(), getSourceLanguageNames());
W.printFlags("Flags", Compile3.getFlags(), getCompileSym3FlagNames());
W.printEnum("Machine", unsigned(Compile3.Machine), getCPUTypeNames());
@@ -281,8 +280,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
ConstantSym &Constant) {
- DictScope S(W, "Constant");
-
printTypeIndex("Type", Constant.Type);
W.printNumber("Value", Constant.Value);
W.printString("Name", Constant.Name);
@@ -290,9 +287,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DataSym &Data) {
- DictScope S(W, "DataSym");
-
- W.printEnum("Kind", uint16_t(CVR.kind()), getSymbolTypeNames());
StringRef LinkageName;
if (ObjDelegate) {
ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(),
@@ -308,15 +302,12 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, DataSym &Data) {
Error CVSymbolDumperImpl::visitKnownRecord(
CVSymbol &CVR,
DefRangeFramePointerRelFullScopeSym &DefRangeFramePointerRelFullScope) {
- DictScope S(W, "DefRangeFramePointerRelFullScope");
W.printNumber("Offset", DefRangeFramePointerRelFullScope.Offset);
return Error::success();
}
Error CVSymbolDumperImpl::visitKnownRecord(
CVSymbol &CVR, DefRangeFramePointerRelSym &DefRangeFramePointerRel) {
- DictScope S(W, "DefRangeFramePointerRel");
-
W.printNumber("Offset", DefRangeFramePointerRel.Offset);
printLocalVariableAddrRange(DefRangeFramePointerRel.Range,
DefRangeFramePointerRel.getRelocationOffset());
@@ -326,8 +317,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(
Error CVSymbolDumperImpl::visitKnownRecord(
CVSymbol &CVR, DefRangeRegisterRelSym &DefRangeRegisterRel) {
- DictScope S(W, "DefRangeRegisterRel");
-
W.printNumber("BaseRegister", DefRangeRegisterRel.Hdr.Register);
W.printBoolean("HasSpilledUDTMember",
DefRangeRegisterRel.hasSpilledUDTMember());
@@ -341,8 +330,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(
Error CVSymbolDumperImpl::visitKnownRecord(
CVSymbol &CVR, DefRangeRegisterSym &DefRangeRegister) {
- DictScope S(W, "DefRangeRegister");
-
W.printNumber("Register", DefRangeRegister.Hdr.Register);
W.printNumber("MayHaveNoName", DefRangeRegister.Hdr.MayHaveNoName);
printLocalVariableAddrRange(DefRangeRegister.Range,
@@ -353,8 +340,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(
Error CVSymbolDumperImpl::visitKnownRecord(
CVSymbol &CVR, DefRangeSubfieldRegisterSym &DefRangeSubfieldRegister) {
- DictScope S(W, "DefRangeSubfieldRegister");
-
W.printNumber("Register", DefRangeSubfieldRegister.Hdr.Register);
W.printNumber("MayHaveNoName", DefRangeSubfieldRegister.Hdr.MayHaveNoName);
W.printNumber("OffsetInParent", DefRangeSubfieldRegister.Hdr.OffsetInParent);
@@ -366,17 +351,15 @@ Error CVSymbolDumperImpl::visitKnownRecord(
Error CVSymbolDumperImpl::visitKnownRecord(
CVSymbol &CVR, DefRangeSubfieldSym &DefRangeSubfield) {
- DictScope S(W, "DefRangeSubfield");
-
if (ObjDelegate) {
- StringRef StringTable = ObjDelegate->getStringTable();
- auto ProgramStringTableOffset = DefRangeSubfield.Program;
- if (ProgramStringTableOffset >= StringTable.size())
+ DebugStringTableSubsectionRef Strings = ObjDelegate->getStringTable();
+ auto ExpectedProgram = Strings.getString(DefRangeSubfield.Program);
+ if (!ExpectedProgram) {
+ consumeError(ExpectedProgram.takeError());
return llvm::make_error<CodeViewError>(
"String table offset outside of bounds of String Table!");
- StringRef Program =
- StringTable.drop_front(ProgramStringTableOffset).split('\0').first;
- W.printString("Program", Program);
+ }
+ W.printString("Program", *ExpectedProgram);
}
W.printNumber("OffsetInParent", DefRangeSubfield.OffsetInParent);
printLocalVariableAddrRange(DefRangeSubfield.Range,
@@ -387,17 +370,15 @@ Error CVSymbolDumperImpl::visitKnownRecord(
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
DefRangeSym &DefRange) {
- DictScope S(W, "DefRange");
-
if (ObjDelegate) {
- StringRef StringTable = ObjDelegate->getStringTable();
- auto ProgramStringTableOffset = DefRange.Program;
- if (ProgramStringTableOffset >= StringTable.size())
+ DebugStringTableSubsectionRef Strings = ObjDelegate->getStringTable();
+ auto ExpectedProgram = Strings.getString(DefRange.Program);
+ if (!ExpectedProgram) {
+ consumeError(ExpectedProgram.takeError());
return llvm::make_error<CodeViewError>(
"String table offset outside of bounds of String Table!");
- StringRef Program =
- StringTable.drop_front(ProgramStringTableOffset).split('\0').first;
- W.printString("Program", Program);
+ }
+ W.printString("Program", *ExpectedProgram);
}
printLocalVariableAddrRange(DefRange.Range, DefRange.getRelocationOffset());
printLocalVariableAddrGap(DefRange.Gaps);
@@ -406,8 +387,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
FrameCookieSym &FrameCookie) {
- DictScope S(W, "FrameCookie");
-
StringRef LinkageName;
if (ObjDelegate) {
ObjDelegate->printRelocatedField("CodeOffset",
@@ -423,8 +402,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
FrameProcSym &FrameProc) {
- DictScope S(W, "FrameProc");
-
W.printHex("TotalFrameBytes", FrameProc.TotalFrameBytes);
W.printHex("PaddingFrameBytes", FrameProc.PaddingFrameBytes);
W.printHex("OffsetToPadding", FrameProc.OffsetToPadding);
@@ -440,8 +417,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(
CVSymbol &CVR, HeapAllocationSiteSym &HeapAllocSite) {
- DictScope S(W, "HeapAllocationSite");
-
StringRef LinkageName;
if (ObjDelegate) {
ObjDelegate->printRelocatedField("CodeOffset",
@@ -458,8 +433,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
InlineSiteSym &InlineSite) {
- DictScope S(W, "InlineSite");
-
W.printHex("PtrParent", InlineSite.Parent);
W.printHex("PtrEnd", InlineSite.End);
printTypeIndex("Inlinee", InlineSite.Inlinee);
@@ -468,8 +441,8 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
for (auto &Annotation : InlineSite.annotations()) {
switch (Annotation.OpCode) {
case BinaryAnnotationsOpCode::Invalid:
- return llvm::make_error<CodeViewError>(
- "Invalid binary annotation opcode!");
+ W.printString("(Annotation Padding)");
+ break;
case BinaryAnnotationsOpCode::CodeOffset:
case BinaryAnnotationsOpCode::ChangeCodeOffset:
case BinaryAnnotationsOpCode::ChangeCodeLength:
@@ -515,16 +488,14 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
RegisterSym &Register) {
- DictScope S(W, "RegisterSym");
- W.printNumber("Type", Register.Index);
+ printTypeIndex("Type", Register.Index);
W.printEnum("Seg", uint16_t(Register.Register), getRegisterNames());
W.printString("Name", Register.Name);
return Error::success();
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, PublicSym32 &Public) {
- DictScope S(W, "PublicSym");
- W.printNumber("Type", Public.Index);
+ W.printFlags("Flags", uint32_t(Public.Flags), getPublicSymFlagNames());
W.printNumber("Seg", Public.Segment);
W.printNumber("Off", Public.Offset);
W.printString("Name", Public.Name);
@@ -532,7 +503,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, PublicSym32 &Public) {
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ProcRefSym &ProcRef) {
- DictScope S(W, "ProcRef");
W.printNumber("SumName", ProcRef.SumName);
W.printNumber("SymOffset", ProcRef.SymOffset);
W.printNumber("Mod", ProcRef.Module);
@@ -541,8 +511,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ProcRefSym &ProcRef) {
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LabelSym &Label) {
- DictScope S(W, "Label");
-
StringRef LinkageName;
if (ObjDelegate) {
ObjDelegate->printRelocatedField("CodeOffset", Label.getRelocationOffset(),
@@ -558,8 +526,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LabelSym &Label) {
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LocalSym &Local) {
- DictScope S(W, "Local");
-
printTypeIndex("Type", Local.Type);
W.printFlags("Flags", uint16_t(Local.Flags), getLocalFlagNames());
W.printString("VarName", Local.Name);
@@ -567,16 +533,12 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, LocalSym &Local) {
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ObjNameSym &ObjName) {
- DictScope S(W, "ObjectName");
-
W.printHex("Signature", ObjName.Signature);
W.printString("ObjectName", ObjName.Name);
return Error::success();
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) {
- DictScope S(W, "ProcStart");
-
if (InFunctionScope)
return llvm::make_error<CodeViewError>(
"Visiting a ProcSym while inside function scope!");
@@ -584,7 +546,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) {
InFunctionScope = true;
StringRef LinkageName;
- W.printEnum("Kind", uint16_t(CVR.kind()), getSymbolTypeNames());
W.printHex("PtrParent", Proc.Parent);
W.printHex("PtrEnd", Proc.End);
W.printHex("PtrNext", Proc.Next);
@@ -607,13 +568,6 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, ProcSym &Proc) {
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
ScopeEndSym &ScopeEnd) {
- if (CVR.kind() == SymbolKind::S_END)
- DictScope S(W, "BlockEnd");
- else if (CVR.kind() == SymbolKind::S_PROC_ID_END)
- DictScope S(W, "ProcEnd");
- else if (CVR.kind() == SymbolKind::S_INLINESITE_END)
- DictScope S(W, "InlineSiteEnd");
-
InFunctionScope = false;
return Error::success();
}
@@ -627,19 +581,15 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, CallerSym &Caller) {
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
RegRelativeSym &RegRel) {
- DictScope S(W, "RegRelativeSym");
-
W.printHex("Offset", RegRel.Offset);
printTypeIndex("Type", RegRel.Type);
- W.printHex("Register", RegRel.Register);
+ W.printEnum("Register", uint16_t(RegRel.Register), getRegisterNames());
W.printString("VarName", RegRel.Name);
return Error::success();
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
ThreadLocalDataSym &Data) {
- DictScope S(W, "ThreadLocalDataSym");
-
StringRef LinkageName;
if (ObjDelegate) {
ObjDelegate->printRelocatedField("DataOffset", Data.getRelocationOffset(),
@@ -653,23 +603,20 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
}
Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR, UDTSym &UDT) {
- DictScope S(W, "UDT");
printTypeIndex("Type", UDT.Type);
W.printString("UDTName", UDT.Name);
return Error::success();
}
Error CVSymbolDumperImpl::visitUnknownSymbol(CVSymbol &CVR) {
- DictScope S(W, "UnknownSym");
- W.printEnum("Kind", uint16_t(CVR.kind()), getSymbolTypeNames());
W.printNumber("Length", CVR.length());
return Error::success();
}
Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) {
SymbolVisitorCallbackPipeline Pipeline;
- SymbolDeserializer Deserializer(ObjDelegate.get());
- CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes);
+ SymbolDeserializer Deserializer(ObjDelegate.get(), Container);
+ CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes);
Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(Dumper);
@@ -679,8 +626,8 @@ Error CVSymbolDumper::dump(CVRecord<SymbolKind> &Record) {
Error CVSymbolDumper::dump(const CVSymbolArray &Symbols) {
SymbolVisitorCallbackPipeline Pipeline;
- SymbolDeserializer Deserializer(ObjDelegate.get());
- CVSymbolDumperImpl Dumper(TypeDB, ObjDelegate.get(), W, PrintRecordBytes);
+ SymbolDeserializer Deserializer(ObjDelegate.get(), Container);
+ CVSymbolDumperImpl Dumper(Types, ObjDelegate.get(), W, PrintRecordBytes);
Pipeline.addCallbackToPipeline(Deserializer);
Pipeline.addCallbackToPipeline(Dumper);
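
The net effect of moving the braces into visitSymbolBegin/visitSymbolEnd (and dropping the per-record DictScope lines) is a uniform rendering for every record: the scope is named after the record class via getSymbolKindName, with the raw kind printed inside. Illustrative output shape:

    ObjNameSym {
      Kind: S_OBJNAME (0x1101)
      Signature: 0x0
      ObjectName: foo.obj
    }
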
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp b/contrib/llvm/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
index bb17314..923837a 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/SymbolRecordMapping.cpp
@@ -40,6 +40,7 @@ Error SymbolRecordMapping::visitSymbolBegin(CVSymbol &Record) {
}
Error SymbolRecordMapping::visitSymbolEnd(CVSymbol &Record) {
+ error(IO.padToAlignment(alignOf(Container)));
error(IO.endRecord());
return Error::success();
}
@@ -306,7 +307,7 @@ Error SymbolRecordMapping::visitKnownRecord(CVSymbol &CVR,
error(IO.mapInteger(FrameCookie.CodeOffset));
error(IO.mapInteger(FrameCookie.Register));
- error(IO.mapInteger(FrameCookie.CookieKind));
+ error(IO.mapEnum(FrameCookie.CookieKind));
error(IO.mapInteger(FrameCookie.Flags));
return Error::success();
@@ -360,7 +361,7 @@ Error SymbolRecordMapping::visitKnownRecord(CVSymbol &CVR,
Error SymbolRecordMapping::visitKnownRecord(CVSymbol &CVR,
PublicSym32 &Public) {
- error(IO.mapInteger(Public.Index));
+ error(IO.mapEnum(Public.Flags));
error(IO.mapInteger(Public.Offset));
error(IO.mapInteger(Public.Segment));
error(IO.mapStringZ(Public.Name));
@@ -438,7 +439,7 @@ Error SymbolRecordMapping::visitKnownRecord(CVSymbol &CVR,
error(IO.mapInteger(RegRel.Offset));
error(IO.mapInteger(RegRel.Type));
- error(IO.mapInteger(RegRel.Register));
+ error(IO.mapEnum(RegRel.Register));
error(IO.mapStringZ(RegRel.Name));
return Error::success();
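
The new padToAlignment() call reflects that record alignment depends on the container: in LLVM's CodeView support, alignOf(CodeViewContainer) is 1 for object files (records are unaligned) and 4 for PDB streams, so, for example, a 23-byte record destined for a PDB is padded to 24 bytes before endRecord().
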
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/contrib/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp
new file mode 100644
index 0000000..9a2e776
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/SymbolSerializer.cpp
@@ -0,0 +1,61 @@
+//===- SymbolSerializer.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator,
+ CodeViewContainer Container)
+ : Storage(Allocator), RecordBuffer(MaxRecordLength),
+ Stream(RecordBuffer, support::little), Writer(Stream),
+ Mapping(Writer, Container) {}
+
+Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) {
+ assert(!CurrentSymbol.hasValue() && "Already in a symbol mapping!");
+
+ Writer.setOffset(0);
+
+ if (auto EC = writeRecordPrefix(Record.kind()))
+ return EC;
+
+ CurrentSymbol = Record.kind();
+ if (auto EC = Mapping.visitSymbolBegin(Record))
+ return EC;
+
+ return Error::success();
+}
+
+Error SymbolSerializer::visitSymbolEnd(CVSymbol &Record) {
+ assert(CurrentSymbol.hasValue() && "Not in a symbol mapping!");
+
+ if (auto EC = Mapping.visitSymbolEnd(Record))
+ return EC;
+
+ uint32_t RecordEnd = Writer.getOffset();
+ uint16_t Length = RecordEnd - 2;
+ Writer.setOffset(0);
+ if (auto EC = Writer.writeInteger(Length))
+ return EC;
+
+ uint8_t *StableStorage = Storage.Allocate<uint8_t>(RecordEnd);
+ ::memcpy(StableStorage, &RecordBuffer[0], RecordEnd);
+ Record.RecordData = ArrayRef<uint8_t>(StableStorage, RecordEnd);
+ CurrentSymbol.reset();
+
+ return Error::success();
+}
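
The length fix-up encodes the CodeView record layout: the leading 2-byte length field counts everything after itself, so a record occupying RecordEnd bytes stores Length = RecordEnd - 2. The finished buffer is then copied into allocator-backed storage so the returned CVSymbol outlives the scratch buffer. Illustrative layout for a 16-byte record:

    // offset 0..1  : Length = 14   (excludes the length field itself)
    // offset 2..3  : record kind   (written by writeRecordPrefix)
    // offset 4..15 : record payload
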
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
deleted file mode 100644
index d9d5639..0000000
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-//===- TypeDatabaseVisitor.cpp -------------------------------- *- C++ --*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-
-#include "llvm/ADT/SmallString.h"
-
-using namespace llvm;
-
-using namespace llvm::codeview;
-
-Error TypeDatabaseVisitor::visitTypeBegin(CVRecord<TypeLeafKind> &Record) {
- assert(!IsInFieldList);
- // Reset Name to the empty string. If the visitor sets it, we know it.
- Name = "";
-
- if (Record.Type == LF_FIELDLIST) {
- // Record that we're in a field list so that members do not get assigned
- // type indices.
- IsInFieldList = true;
- }
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) {
- if (CVR.Type == LF_FIELDLIST) {
- assert(IsInFieldList);
- IsInFieldList = false;
- }
- assert(!IsInFieldList);
-
- // Record every type that is not a field list member, even if Name is empty.
- // CVUDTNames is indexed by type index, and must have one entry for every
- // type. Field list members are not recorded, and are only referenced by
- // their containing field list record.
- TypeDB.recordType(Name, CVR);
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitMemberBegin(CVMemberRecord &Record) {
- assert(IsInFieldList);
- // Reset Name to the empty string. If the visitor sets it, we know it.
- Name = "";
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitMemberEnd(CVMemberRecord &Record) {
- assert(IsInFieldList);
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
- FieldListRecord &FieldList) {
- Name = "<field list>";
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVRecord<TypeLeafKind> &CVR,
- StringIdRecord &String) {
- // Put this in the database so it gets printed with LF_UDT_SRC_LINE.
- Name = String.getString();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
- auto Indices = Args.getIndices();
- uint32_t Size = Indices.size();
- SmallString<256> TypeName("(");
- for (uint32_t I = 0; I < Size; ++I) {
- StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]);
- TypeName.append(ArgTypeName);
- if (I + 1 != Size)
- TypeName.append(", ");
- }
- TypeName.push_back(')');
- Name = TypeDB.saveTypeName(TypeName);
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) {
- Name = Class.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, UnionRecord &Union) {
- Name = Union.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, EnumRecord &Enum) {
- Name = Enum.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArrayRecord &AT) {
- Name = AT.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) {
- Name = VFT.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
- MemberFuncIdRecord &Id) {
- Name = Id.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
- ProcedureRecord &Proc) {
- StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType());
- StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList());
- SmallString<256> TypeName(ReturnTypeName);
- TypeName.push_back(' ');
- TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
- MemberFunctionRecord &MF) {
- StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType());
- StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType());
- StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList());
- SmallString<256> TypeName(ReturnTypeName);
- TypeName.push_back(' ');
- TypeName.append(ClassTypeName);
- TypeName.append("::");
- TypeName.append(ArgListTypeName);
- Name = TypeDB.saveTypeName(TypeName);
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) {
- Name = Func.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
- TypeServer2Record &TS) {
- Name = TS.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
-
- if (Ptr.isPointerToMember()) {
- const MemberPointerInfo &MI = Ptr.getMemberInfo();
-
- StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType());
- StringRef ClassName = TypeDB.getTypeName(MI.getContainingType());
- SmallString<256> TypeName(PointeeName);
- TypeName.push_back(' ');
- TypeName.append(ClassName);
- TypeName.append("::*");
- Name = TypeDB.saveTypeName(TypeName);
- } else {
- SmallString<256> TypeName;
- if (Ptr.isConst())
- TypeName.append("const ");
- if (Ptr.isVolatile())
- TypeName.append("volatile ");
- if (Ptr.isUnaligned())
- TypeName.append("__unaligned ");
-
- TypeName.append(TypeDB.getTypeName(Ptr.getReferentType()));
-
- if (Ptr.getMode() == PointerMode::LValueReference)
- TypeName.append("&");
- else if (Ptr.getMode() == PointerMode::RValueReference)
- TypeName.append("&&");
- else if (Ptr.getMode() == PointerMode::Pointer)
- TypeName.append("*");
-
- if (!TypeName.empty())
- Name = TypeDB.saveTypeName(TypeName);
- }
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
- uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
-
- StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType());
- SmallString<256> TypeName;
- if (Mods & uint16_t(ModifierOptions::Const))
- TypeName.append("const ");
- if (Mods & uint16_t(ModifierOptions::Volatile))
- TypeName.append("volatile ");
- if (Mods & uint16_t(ModifierOptions::Unaligned))
- TypeName.append("__unaligned ");
- TypeName.append(ModifiedName);
- Name = TypeDB.saveTypeName(TypeName);
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
- VFTableShapeRecord &Shape) {
- Name = TypeDB.saveTypeName("<vftable " + utostr(Shape.getEntryCount()) +
- " methods>");
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- NestedTypeRecord &Nested) {
- Name = Nested.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- OneMethodRecord &Method) {
- Name = Method.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- OverloadedMethodRecord &Method) {
- Name = Method.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- DataMemberRecord &Field) {
- Name = Field.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- StaticDataMemberRecord &Field) {
- Name = Field.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- EnumeratorRecord &Enum) {
- Name = Enum.getName();
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- BaseClassRecord &Base) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- VirtualBaseClassRecord &VBase) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- ListContinuationRecord &Cont) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(
- CVType &CVR, UdtModSourceLineRecord &ModSourceLine) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
- UdtSourceLineRecord &SourceLine) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BitFieldRecord &BF) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(
- CVType &CVR, MethodOverloadListRecord &Overloads) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) {
- return Error::success();
-}
-
-Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
- VFPtrRecord &VFP) {
- return Error::success();
-}
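
The visitor deleted above computed display names through the TypeVisitorCallbacks interface; that job moves to TypeName.cpp later in this diff. A minimal sketch (not part of this diff) of the callback pattern these files share, mirroring the visitTypeRecord(Record, Index, Callbacks) call used there; NamePrinter is a hypothetical name, not an LLVM class:

    #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
    #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;
    using namespace llvm::codeview;

    class NamePrinter : public TypeVisitorCallbacks {
    public:
      // The base class no-ops every record; override only what you need.
      Error visitKnownRecord(CVType &CVR, ClassRecord &Class) override {
        outs() << Class.getName() << "\n";
        return Error::success();
      }
    };

    // Usage: NamePrinter P;
    //        consumeError(visitTypeRecord(Record, Index, P));
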
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 033585b..e18a35c 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -1,5 +1,4 @@
-//===-- TypeDumpVisitor.cpp - CodeView type info dumper -----------*- C++
-//-*-===//
+//===-- TypeDumpVisitor.cpp - CodeView type info dumper ----------*- C++-*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,15 +10,13 @@
#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
-#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/ScopedPrinter.h"
using namespace llvm;
@@ -27,7 +24,7 @@ using namespace llvm::codeview;
static const EnumEntry<TypeLeafKind> LeafTypeNames[] = {
#define CV_TYPE(enum, val) {#enum, enum},
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
};
#define ENUM_ENTRY(enum_class, enum) \
@@ -145,6 +142,10 @@ static const EnumEntry<uint8_t> FunctionOptionEnum[] = {
ENUM_ENTRY(FunctionOptions, ConstructorWithVirtualBases),
};
+static const EnumEntry<uint16_t> LabelTypeEnum[] = {
+ ENUM_ENTRY(LabelType, Near), ENUM_ENTRY(LabelType, Far),
+};
+
#undef ENUM_ENTRY
static StringRef getLeafTypeName(TypeLeafKind LT) {
@@ -152,7 +153,7 @@ static StringRef getLeafTypeName(TypeLeafKind LT) {
#define TYPE_RECORD(ename, value, name) \
case ename: \
return #name;
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
default:
break;
}
@@ -160,13 +161,20 @@ static StringRef getLeafTypeName(TypeLeafKind LT) {
}
void TypeDumpVisitor::printTypeIndex(StringRef FieldName, TypeIndex TI) const {
- CVTypeDumper::printTypeIndex(*W, FieldName, TI, TypeDB);
+ codeview::printTypeIndex(*W, FieldName, TI, TpiTypes);
+}
+
+void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const {
+ codeview::printTypeIndex(*W, FieldName, TI, getSourceTypes());
}
Error TypeDumpVisitor::visitTypeBegin(CVType &Record) {
+ return visitTypeBegin(Record, TypeIndex::fromArrayIndex(TpiTypes.size()));
+}
+
+Error TypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) {
W->startLine() << getLeafTypeName(Record.Type);
- W->getOStream() << " (" << HexNumber(TypeDB.getNextTypeIndex().getIndex())
- << ")";
+ W->getOStream() << " (" << HexNumber(Index.getIndex()) << ")";
W->getOStream() << " {\n";
W->indent();
W->printEnum("TypeLeafKind", unsigned(Record.Type),
@@ -203,15 +211,14 @@ Error TypeDumpVisitor::visitMemberEnd(CVMemberRecord &Record) {
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
FieldListRecord &FieldList) {
- CVTypeVisitor Visitor(*this);
- if (auto EC = Visitor.visitFieldListMemberStream(FieldList.Data))
+ if (auto EC = codeview::visitMemberRecordStream(FieldList.Data, *this))
return EC;
return Error::success();
}
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringIdRecord &String) {
- printTypeIndex("Id", String.getId());
+ printItemIndex("Id", String.getId());
W->printString("StringData", String.getString());
return Error::success();
}
@@ -227,6 +234,17 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
return Error::success();
}
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringListRecord &Strs) {
+ auto Indices = Strs.getIndices();
+ uint32_t Size = Indices.size();
+ W->printNumber("NumStrings", Size);
+ ListScope Arguments(*W, "Strings");
+ for (uint32_t I = 0; I < Size; ++I) {
+ printItemIndex("String", Indices[I]);
+ }
+ return Error::success();
+}
+
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) {
uint16_t Props = static_cast<uint16_t>(Class.getOptions());
W->printNumber("MemberCount", Class.getMemberCount());
@@ -329,14 +347,14 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
}
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) {
- printTypeIndex("ParentScope", Func.getParentScope());
+ printItemIndex("ParentScope", Func.getParentScope());
printTypeIndex("FunctionType", Func.getFunctionType());
W->printString("Name", Func.getName());
return Error::success();
}
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) {
- W->printBinary("Signature", TS.getGuid());
+ W->printString("Guid", formatv("{0}", TS.getGuid()).str());
W->printNumber("Age", TS.getAge());
W->printString("Name", TS.getName());
return Error::success();
@@ -390,7 +408,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
UdtSourceLineRecord &Line) {
printTypeIndex("UDT", Line.getUDT());
- printTypeIndex("SourceFile", Line.getSourceFile());
+ printItemIndex("SourceFile", Line.getSourceFile());
W->printNumber("LineNumber", Line.getLineNumber());
return Error::success();
}
@@ -398,7 +416,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
UdtModSourceLineRecord &Line) {
printTypeIndex("UDT", Line.getUDT());
- printTypeIndex("SourceFile", Line.getSourceFile());
+ printItemIndex("SourceFile", Line.getSourceFile());
W->printNumber("LineNumber", Line.getLineNumber());
W->printNumber("Module", Line.getModule());
return Error::success();
@@ -409,7 +427,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &Args) {
ListScope Arguments(*W, "Arguments");
for (auto Arg : Args.getArgs()) {
- printTypeIndex("ArgType", Arg);
+ printItemIndex("ArgType", Arg);
}
return Error::success();
}
@@ -530,3 +548,8 @@ Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR,
printTypeIndex("ContinuationIndex", Cont.getContinuationIndex());
return Error::success();
}
+
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, LabelRecord &LR) {
+ W->printEnum("Mode", uint16_t(LR.Mode), makeArrayRef(LabelTypeEnum));
+ return Error::success();
+}
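
The printTypeIndex/printItemIndex split above reflects the two index spaces a PDB carries: type indices resolve in the TPI stream, id/item indices in the IPI stream. A hedged sketch of that routing; resolveName and both collection parameters are illustrative names, not part of this diff:

    #include "llvm/DebugInfo/CodeView/TypeCollection.h"
    #include "llvm/DebugInfo/CodeView/TypeIndex.h"
    using namespace llvm;
    using namespace llvm::codeview;

    StringRef resolveName(TypeIndex TI, TypeCollection &TpiTypes,
                          TypeCollection &IpiTypes, bool IsItemIndex) {
      if (TI.isNoneType())
        return "<no type>";
      if (TI.isSimple())
        return TypeIndex::simpleTypeName(TI); // see TypeIndex.cpp below
      return IsItemIndex ? IpiTypes.getTypeName(TI)
                         : TpiTypes.getTypeName(TI);
    }
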
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeDatabase.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeIndex.cpp
index c7f7255..24fe5fc 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeDatabase.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeIndex.cpp
@@ -1,4 +1,4 @@
-//===- TypeDatabase.cpp --------------------------------------- *- C++ --*-===//
+//===-- TypeIndex.cpp - CodeView type index ---------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+
+#include "llvm/DebugInfo/CodeView/TypeCollection.h"
+#include "llvm/Support/ScopedPrinter.h"
using namespace llvm;
using namespace llvm::codeview;
@@ -17,7 +20,6 @@ struct SimpleTypeEntry {
StringRef Name;
SimpleTypeKind Kind;
};
-}
/// The names here all end in "*". If the simple type is a pointer type, we
/// return the whole name. Otherwise we lop off the last character in our
@@ -64,51 +66,39 @@ static const SimpleTypeEntry SimpleTypeNames[] = {
{"__bool32*", SimpleTypeKind::Boolean32},
{"__bool64*", SimpleTypeKind::Boolean64},
};
+} // namespace
-/// Gets the type index for the next type record.
-TypeIndex TypeDatabase::getNextTypeIndex() const {
- return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size());
-}
-
-/// Records the name of a type, and reserves its type index.
-void TypeDatabase::recordType(StringRef Name, CVType Data) {
- CVUDTNames.push_back(Name);
- TypeRecords.push_back(Data);
-}
-
-/// Saves the name in a StringSet and creates a stable StringRef.
-StringRef TypeDatabase::saveTypeName(StringRef TypeName) {
- return TypeNameStorage.save(TypeName);
-}
+StringRef TypeIndex::simpleTypeName(TypeIndex TI) {
+ assert(TI.isNoneType() || TI.isSimple());
-StringRef TypeDatabase::getTypeName(TypeIndex Index) const {
- if (Index.isNoneType())
+ if (TI.isNoneType())
return "<no type>";
- if (Index.isSimple()) {
- // This is a simple type.
- for (const auto &SimpleTypeName : SimpleTypeNames) {
- if (SimpleTypeName.Kind == Index.getSimpleKind()) {
- if (Index.getSimpleMode() == SimpleTypeMode::Direct)
- return SimpleTypeName.Name.drop_back(1);
- // Otherwise, this is a pointer type. We gloss over the distinction
- // between near, far, 64, 32, etc, and just give a pointer type.
- return SimpleTypeName.Name;
- }
+ // This is a simple type.
+ for (const auto &SimpleTypeName : SimpleTypeNames) {
+ if (SimpleTypeName.Kind == TI.getSimpleKind()) {
+ if (TI.getSimpleMode() == SimpleTypeMode::Direct)
+ return SimpleTypeName.Name.drop_back(1);
+ // Otherwise, this is a pointer type. We gloss over the distinction
+ // between near, far, 64, 32, etc, and just give a pointer type.
+ return SimpleTypeName.Name;
}
- return "<unknown simple type>";
}
-
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- if (I < CVUDTNames.size())
- return CVUDTNames[I];
-
- return "<unknown UDT>";
+ return "<unknown simple type>";
}
-bool TypeDatabase::containsTypeIndex(TypeIndex Index) const {
- uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
- return I < CVUDTNames.size();
-}
+void llvm::codeview::printTypeIndex(ScopedPrinter &Printer, StringRef FieldName,
+ TypeIndex TI, TypeCollection &Types) {
+ StringRef TypeName;
+ if (!TI.isNoneType()) {
+ if (TI.isSimple())
+ TypeName = TypeIndex::simpleTypeName(TI);
+ else
+ TypeName = Types.getTypeName(TI);
+ }
-uint32_t TypeDatabase::size() const { return CVUDTNames.size(); }
+ if (!TypeName.empty())
+ Printer.printHex(FieldName, TypeName, TI.getIndex());
+ else
+ Printer.printHex(FieldName, TI.getIndex());
+}
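
A worked example of the lookup above, assuming the usual CodeView encoding in which indices below 0x1000 are simple, with the kind in the low byte and the pointer mode in the bits above it:

    #include "llvm/DebugInfo/CodeView/TypeIndex.h"
    #include <cassert>
    using namespace llvm::codeview;

    void simpleNameDemo() {
      TypeIndex Direct(SimpleTypeKind::Int32); // Direct mode
      TypeIndex Ptr(SimpleTypeKind::Int32, SimpleTypeMode::NearPointer64);
      // Direct mode drops the trailing '*' from the table entry; any
      // pointer mode keeps it.
      assert(TypeIndex::simpleTypeName(Direct) == "int");
      assert(TypeIndex::simpleTypeName(Ptr) == "int*");
    }
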
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
new file mode 100644
index 0000000..0d935c4
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp
@@ -0,0 +1,484 @@
+//===- TypeIndexDiscovery.cpp -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+static inline MethodKind getMethodKind(uint16_t Attrs) {
+ Attrs &= uint16_t(MethodOptions::MethodKindMask);
+ Attrs >>= 2;
+ return MethodKind(Attrs);
+}
+
+static inline bool isIntroVirtual(uint16_t Attrs) {
+ MethodKind MK = getMethodKind(Attrs);
+ return MK == MethodKind::IntroducingVirtual ||
+ MK == MethodKind::PureIntroducingVirtual;
+}
+
+static inline PointerMode getPointerMode(uint32_t Attrs) {
+ return static_cast<PointerMode>((Attrs >> PointerRecord::PointerModeShift) &
+ PointerRecord::PointerModeMask);
+}
+
+static inline bool isMemberPointer(uint32_t Attrs) {
+ PointerMode Mode = getPointerMode(Attrs);
+ return Mode == PointerMode::PointerToDataMember ||
+ Mode == PointerMode::PointerToMemberFunction;
+}
+
+static inline uint32_t getEncodedIntegerLength(ArrayRef<uint8_t> Data) {
+ uint16_t N = support::endian::read16le(Data.data());
+ if (N < LF_NUMERIC)
+ return 2;
+
+ assert(N <= LF_UQUADWORD);
+
+ constexpr uint32_t Sizes[] = {
+ 1, // LF_CHAR
+ 2, // LF_SHORT
+ 2, // LF_USHORT
+ 4, // LF_LONG
+ 4, // LF_ULONG
+ 4, // LF_REAL32
+ 8, // LF_REAL64
+ 10, // LF_REAL80
+ 16, // LF_REAL128
+ 8, // LF_QUADWORD
+ 8, // LF_UQUADWORD
+ };
+
+ return Sizes[N - LF_NUMERIC];
+}
+
+static inline uint32_t getCStringLength(ArrayRef<uint8_t> Data) {
+ const char *S = reinterpret_cast<const char *>(Data.data());
+ return strlen(S) + 1;
+}
+
+static void handleMethodOverloadList(ArrayRef<uint8_t> Content,
+ SmallVectorImpl<TiReference> &Refs) {
+ uint32_t Offset = 0;
+
+ while (!Content.empty()) {
+ // Array of:
+ // 0: Attrs
+ // 2: Padding
+ // 4: TypeIndex
+ // if (isIntroVirtual())
+ // 8: VFTableOffset
+
+ // At least 8 bytes are guaranteed. 4 extra bytes follow iff the function
+ // is an introducing virtual.
+ uint32_t Len = 8;
+
+ uint16_t Attrs = support::endian::read16le(Content.data());
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+
+ if (LLVM_UNLIKELY(isIntroVirtual(Attrs)))
+ Len += 4;
+ Offset += Len;
+ Content = Content.drop_front(Len);
+ }
+}
+
+static uint32_t handleBaseClass(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: TypeIndex
+ // 8: Encoded Integer
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+ return 8 + getEncodedIntegerLength(Data.drop_front(8));
+}
+
+static uint32_t handleEnumerator(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: Encoded Integer
+ // <next>: Name
+ uint32_t Size = 4 + getEncodedIntegerLength(Data.drop_front(4));
+ return Size + getCStringLength(Data.drop_front(Size));
+}
+
+static uint32_t handleDataMember(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: TypeIndex
+ // 8: Encoded Integer
+ // <next>: Name
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+ uint32_t Size = 8 + getEncodedIntegerLength(Data.drop_front(8));
+ return Size + getCStringLength(Data.drop_front(Size));
+}
+
+static uint32_t handleOverloadedMethod(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: TypeIndex
+ // 8: Name
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+ return 8 + getCStringLength(Data.drop_front(8));
+}
+
+static uint32_t handleOneMethod(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Attributes
+ // 4: Type
+ // if (isIntroVirtual)
+ // 8: VFTableOffset
+ // <next>: Name
+ uint32_t Size = 8;
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+
+ uint16_t Attrs = support::endian::read16le(Data.drop_front(2).data());
+ if (LLVM_UNLIKELY(isIntroVirtual(Attrs)))
+ Size += 4;
+
+ return Size + getCStringLength(Data.drop_front(Size));
+}
+
+static uint32_t handleNestedType(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: TypeIndex
+ // 8: Name
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+ return 8 + getCStringLength(Data.drop_front(8));
+}
+
+static uint32_t handleStaticDataMember(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: TypeIndex
+ // 8: Name
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+ return 8 + getCStringLength(Data.drop_front(8));
+}
+
+static uint32_t handleVirtualBaseClass(ArrayRef<uint8_t> Data, uint32_t Offset,
+ bool IsIndirect,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Attrs
+ // 4: TypeIndex
+ // 8: TypeIndex
+ // 12: Encoded Integer
+ // <next>: Encoded Integer
+ uint32_t Size = 12;
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 2});
+ Size += getEncodedIntegerLength(Data.drop_front(Size));
+ Size += getEncodedIntegerLength(Data.drop_front(Size));
+ return Size;
+}
+
+static uint32_t handleVFPtr(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: TypeIndex
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+ return 8;
+}
+
+static uint32_t handleListContinuation(ArrayRef<uint8_t> Data, uint32_t Offset,
+ SmallVectorImpl<TiReference> &Refs) {
+ // 0: Kind
+ // 2: Padding
+ // 4: TypeIndex
+ Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1});
+ return 8;
+}
+
+static void handleFieldList(ArrayRef<uint8_t> Content,
+ SmallVectorImpl<TiReference> &Refs) {
+ uint32_t Offset = 0;
+ uint32_t ThisLen = 0;
+ while (!Content.empty()) {
+ TypeLeafKind Kind =
+ static_cast<TypeLeafKind>(support::endian::read16le(Content.data()));
+ switch (Kind) {
+ case LF_BCLASS:
+ ThisLen = handleBaseClass(Content, Offset, Refs);
+ break;
+ case LF_ENUMERATE:
+ ThisLen = handleEnumerator(Content, Offset, Refs);
+ break;
+ case LF_MEMBER:
+ ThisLen = handleDataMember(Content, Offset, Refs);
+ break;
+ case LF_METHOD:
+ ThisLen = handleOverloadedMethod(Content, Offset, Refs);
+ break;
+ case LF_ONEMETHOD:
+ ThisLen = handleOneMethod(Content, Offset, Refs);
+ break;
+ case LF_NESTTYPE:
+ ThisLen = handleNestedType(Content, Offset, Refs);
+ break;
+ case LF_STMEMBER:
+ ThisLen = handleStaticDataMember(Content, Offset, Refs);
+ break;
+ case LF_VBCLASS:
+ case LF_IVBCLASS:
+ ThisLen =
+ handleVirtualBaseClass(Content, Offset, Kind == LF_VBCLASS, Refs);
+ break;
+ case LF_VFUNCTAB:
+ ThisLen = handleVFPtr(Content, Offset, Refs);
+ break;
+ case LF_INDEX:
+ ThisLen = handleListContinuation(Content, Offset, Refs);
+ break;
+ default:
+ return;
+ }
+ Content = Content.drop_front(ThisLen);
+ Offset += ThisLen;
+ if (!Content.empty()) {
+ uint8_t Pad = Content.front();
+ if (Pad >= LF_PAD0) {
+ uint32_t Skip = Pad & 0x0F;
+ Content = Content.drop_front(Skip);
+ Offset += Skip;
+ }
+ }
+ }
+}
+
+static void handlePointer(ArrayRef<uint8_t> Content,
+ SmallVectorImpl<TiReference> &Refs) {
+ Refs.push_back({TiRefKind::TypeRef, 0, 1});
+
+ uint32_t Attrs = support::endian::read32le(Content.drop_front(4).data());
+ if (isMemberPointer(Attrs))
+ Refs.push_back({TiRefKind::TypeRef, 8, 1});
+}
+
+static void discoverTypeIndices(ArrayRef<uint8_t> Content, TypeLeafKind Kind,
+ SmallVectorImpl<TiReference> &Refs) {
+ uint32_t Count;
+ // FIXME: In the future it would be nice if we could avoid hardcoding these
+ // values. One idea is to define some structures representing these types
+ // that would allow the use of offsetof().
+ switch (Kind) {
+ case TypeLeafKind::LF_FUNC_ID:
+ Refs.push_back({TiRefKind::IndexRef, 0, 1});
+ Refs.push_back({TiRefKind::TypeRef, 4, 1});
+ break;
+ case TypeLeafKind::LF_MFUNC_ID:
+ Refs.push_back({TiRefKind::TypeRef, 0, 2});
+ break;
+ case TypeLeafKind::LF_STRING_ID:
+ Refs.push_back({TiRefKind::IndexRef, 0, 1});
+ break;
+ case TypeLeafKind::LF_SUBSTR_LIST:
+ Count = support::endian::read32le(Content.data());
+ if (Count > 0)
+ Refs.push_back({TiRefKind::IndexRef, 4, Count});
+ break;
+ case TypeLeafKind::LF_BUILDINFO:
+ Count = support::endian::read16le(Content.data());
+ if (Count > 0)
+ Refs.push_back({TiRefKind::IndexRef, 2, Count});
+ break;
+ case TypeLeafKind::LF_UDT_SRC_LINE:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1});
+ Refs.push_back({TiRefKind::IndexRef, 4, 1});
+ break;
+ case TypeLeafKind::LF_UDT_MOD_SRC_LINE:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1});
+ break;
+ case TypeLeafKind::LF_MODIFIER:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1});
+ break;
+ case TypeLeafKind::LF_PROCEDURE:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1});
+ Refs.push_back({TiRefKind::TypeRef, 8, 1});
+ break;
+ case TypeLeafKind::LF_MFUNCTION:
+ Refs.push_back({TiRefKind::TypeRef, 0, 3});
+ Refs.push_back({TiRefKind::TypeRef, 16, 1});
+ break;
+ case TypeLeafKind::LF_ARGLIST:
+ Count = support::endian::read32le(Content.data());
+ if (Count > 0)
+ Refs.push_back({TiRefKind::TypeRef, 4, Count});
+ break;
+ case TypeLeafKind::LF_ARRAY:
+ Refs.push_back({TiRefKind::TypeRef, 0, 2});
+ break;
+ case TypeLeafKind::LF_CLASS:
+ case TypeLeafKind::LF_STRUCTURE:
+ case TypeLeafKind::LF_INTERFACE:
+ Refs.push_back({TiRefKind::TypeRef, 4, 3});
+ break;
+ case TypeLeafKind::LF_UNION:
+ Refs.push_back({TiRefKind::TypeRef, 4, 1});
+ break;
+ case TypeLeafKind::LF_ENUM:
+ Refs.push_back({TiRefKind::TypeRef, 4, 2});
+ break;
+ case TypeLeafKind::LF_BITFIELD:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1});
+ break;
+ case TypeLeafKind::LF_VFTABLE:
+ Refs.push_back({TiRefKind::TypeRef, 0, 2});
+ break;
+ case TypeLeafKind::LF_VTSHAPE:
+ break;
+ case TypeLeafKind::LF_METHODLIST:
+ handleMethodOverloadList(Content, Refs);
+ break;
+ case TypeLeafKind::LF_FIELDLIST:
+ handleFieldList(Content, Refs);
+ break;
+ case TypeLeafKind::LF_POINTER:
+ handlePointer(Content, Refs);
+ break;
+ default:
+ break;
+ }
+}
+
+static bool discoverTypeIndices(ArrayRef<uint8_t> Content, SymbolKind Kind,
+ SmallVectorImpl<TiReference> &Refs) {
+ uint32_t Count;
+ // FIXME: In the future it would be nice if we could avoid hardcoding these
+ // values. One idea is to define some structures representing these types
+ // that would allow the use of offsetof().
+ switch (Kind) {
+ case SymbolKind::S_GPROC32:
+ case SymbolKind::S_LPROC32:
+ case SymbolKind::S_GPROC32_ID:
+ case SymbolKind::S_LPROC32_ID:
+ case SymbolKind::S_LPROC32_DPC:
+ case SymbolKind::S_LPROC32_DPC_ID:
+ Refs.push_back({TiRefKind::IndexRef, 24, 1}); // LF_FUNC_ID
+ break;
+ case SymbolKind::S_UDT:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1}); // UDT
+ break;
+ case SymbolKind::S_GDATA32:
+ case SymbolKind::S_LDATA32:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+ break;
+ case SymbolKind::S_BUILDINFO:
+ Refs.push_back({TiRefKind::IndexRef, 0, 1}); // Compile flags
+ break;
+ case SymbolKind::S_LTHREAD32:
+ case SymbolKind::S_GTHREAD32:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+ break;
+ case SymbolKind::S_FILESTATIC:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+ break;
+ case SymbolKind::S_LOCAL:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+ break;
+ case SymbolKind::S_CONSTANT:
+ Refs.push_back({TiRefKind::TypeRef, 0, 1}); // Type
+ break;
+ case SymbolKind::S_REGREL32:
+ Refs.push_back({TiRefKind::TypeRef, 4, 1}); // Type
+ break;
+ case SymbolKind::S_CALLSITEINFO:
+ Refs.push_back({TiRefKind::TypeRef, 8, 1}); // Call signature
+ break;
+ case SymbolKind::S_CALLERS:
+ case SymbolKind::S_CALLEES:
+ // The record is a count followed by an array of type indices.
+ Count = *reinterpret_cast<const support::ulittle32_t *>(Content.data());
+ Refs.push_back({TiRefKind::IndexRef, 4, Count}); // Callees
+ break;
+ case SymbolKind::S_INLINESITE:
+ Refs.push_back({TiRefKind::IndexRef, 8, 1}); // ID of inlinee
+ break;
+ case SymbolKind::S_HEAPALLOCSITE:
+ // FIXME: It's not clear if this is a type or item reference.
+ Refs.push_back({TiRefKind::IndexRef, 8, 1}); // signature
+ break;
+
+ // Defranges don't have types, just registers and code offsets.
+ case SymbolKind::S_DEFRANGE_REGISTER:
+ case SymbolKind::S_DEFRANGE_REGISTER_REL:
+ case SymbolKind::S_DEFRANGE_FRAMEPOINTER_REL:
+ case SymbolKind::S_DEFRANGE_FRAMEPOINTER_REL_FULL_SCOPE:
+ case SymbolKind::S_DEFRANGE_SUBFIELD_REGISTER:
+ case SymbolKind::S_DEFRANGE_SUBFIELD:
+ break;
+
+ // No type references.
+ case SymbolKind::S_LABEL32:
+ case SymbolKind::S_OBJNAME:
+ case SymbolKind::S_COMPILE:
+ case SymbolKind::S_COMPILE2:
+ case SymbolKind::S_COMPILE3:
+ case SymbolKind::S_ENVBLOCK:
+ case SymbolKind::S_BLOCK32:
+ case SymbolKind::S_FRAMEPROC:
+ break;
+ // Scope ending symbols.
+ case SymbolKind::S_END:
+ case SymbolKind::S_INLINESITE_END:
+ case SymbolKind::S_PROC_ID_END:
+ break;
+ default:
+ return false; // Unknown symbol.
+ }
+ return true;
+}
+
+void llvm::codeview::discoverTypeIndices(const CVType &Type,
+ SmallVectorImpl<TiReference> &Refs) {
+ ::discoverTypeIndices(Type.content(), Type.kind(), Refs);
+}
+
+void llvm::codeview::discoverTypeIndices(const CVType &Type,
+ SmallVectorImpl<TypeIndex> &Indices) {
+
+ Indices.clear();
+
+ SmallVector<TiReference, 4> Refs;
+ discoverTypeIndices(Type, Refs);
+ if (Refs.empty())
+ return;
+
+ BinaryStreamReader Reader(Type.content(), support::little);
+ for (const auto &Ref : Refs) {
+ Reader.setOffset(Ref.Offset);
+ FixedStreamArray<TypeIndex> Run;
+ cantFail(Reader.readArray(Run, Ref.Count));
+ Indices.append(Run.begin(), Run.end());
+ }
+}
+
+void llvm::codeview::discoverTypeIndices(ArrayRef<uint8_t> RecordData,
+ SmallVectorImpl<TiReference> &Refs) {
+ const RecordPrefix *P =
+ reinterpret_cast<const RecordPrefix *>(RecordData.data());
+ TypeLeafKind K = static_cast<TypeLeafKind>(uint16_t(P->RecordKind));
+ ::discoverTypeIndices(RecordData.drop_front(sizeof(RecordPrefix)), K, Refs);
+}
+
+bool llvm::codeview::discoverTypeIndices(const CVSymbol &Sym,
+ SmallVectorImpl<TiReference> &Refs) {
+ SymbolKind K = Sym.kind();
+ return ::discoverTypeIndices(Sym.content(), K, Refs);
+}
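
A minimal usage sketch for the discovery API defined above (collectReferences is a hypothetical helper; the discoverTypeIndices overloads are the ones declared in this file):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
    using namespace llvm;
    using namespace llvm::codeview;

    static void collectReferences(const CVType &Type,
                                  SmallVectorImpl<TypeIndex> &Out) {
      SmallVector<TypeIndex, 4> Indices;
      discoverTypeIndices(Type, Indices);
      for (TypeIndex TI : Indices)
        if (!TI.isSimple()) // simple indices never need remapping
          Out.push_back(TI);
    }
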
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeName.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeName.cpp
new file mode 100644
index 0000000..2eb8b81
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeName.cpp
@@ -0,0 +1,243 @@
+//===- TypeName.cpp ---------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeName.h"
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+class TypeNameComputer : public TypeVisitorCallbacks {
+ /// The type collection. Used to calculate names of nested types.
+ TypeCollection &Types;
+ TypeIndex CurrentTypeIndex = TypeIndex::None();
+
+ /// Name of the current type. Only valid before visitTypeEnd.
+ SmallString<256> Name;
+
+public:
+ explicit TypeNameComputer(TypeCollection &Types) : Types(Types) {}
+
+ StringRef name() const { return Name; }
+
+ /// Paired begin/end actions for all types. Receives all record data,
+ /// including the fixed-length record prefix.
+ Error visitTypeBegin(CVType &Record) override;
+ Error visitTypeBegin(CVType &Record, TypeIndex Index) override;
+ Error visitTypeEnd(CVType &Record) override;
+
+#define TYPE_RECORD(EnumName, EnumVal, Name) \
+ Error visitKnownRecord(CVType &CVR, Name##Record &Record) override;
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#define MEMBER_RECORD(EnumName, EnumVal, Name)
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+};
+} // namespace
+
+Error TypeNameComputer::visitTypeBegin(CVType &Record) {
+ llvm_unreachable("Must call visitTypeBegin with a TypeIndex!");
+ return Error::success();
+}
+
+Error TypeNameComputer::visitTypeBegin(CVType &Record, TypeIndex Index) {
+ // Reset Name to the empty string. If the visitor sets it, we know it.
+ Name = "";
+ CurrentTypeIndex = Index;
+ return Error::success();
+}
+
+Error TypeNameComputer::visitTypeEnd(CVType &CVR) { return Error::success(); }
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ FieldListRecord &FieldList) {
+ Name = "<field list>";
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVRecord<TypeLeafKind> &CVR,
+ StringIdRecord &String) {
+ Name = String.getString();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
+ auto Indices = Args.getIndices();
+ uint32_t Size = Indices.size();
+ Name = "(";
+ for (uint32_t I = 0; I < Size; ++I) {
+ assert(Indices[I] < CurrentTypeIndex);
+
+ Name.append(Types.getTypeName(Indices[I]));
+ if (I + 1 != Size)
+ Name.append(", ");
+ }
+ Name.push_back(')');
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ StringListRecord &Strings) {
+ auto Indices = Strings.getIndices();
+ uint32_t Size = Indices.size();
+ Name = "\"";
+ for (uint32_t I = 0; I < Size; ++I) {
+ Name.append(Types.getTypeName(Indices[I]));
+ if (I + 1 != Size)
+ Name.append("\" \"");
+ }
+ Name.push_back('\"');
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, ClassRecord &Class) {
+ Name = Class.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, UnionRecord &Union) {
+ Name = Union.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, EnumRecord &Enum) {
+ Name = Enum.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, ArrayRecord &AT) {
+ Name = AT.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, VFTableRecord &VFT) {
+ Name = VFT.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, MemberFuncIdRecord &Id) {
+ Name = Id.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, ProcedureRecord &Proc) {
+ StringRef Ret = Types.getTypeName(Proc.getReturnType());
+ StringRef Params = Types.getTypeName(Proc.getArgumentList());
+ Name = formatv("{0} {1}", Ret, Params).sstr<256>();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ MemberFunctionRecord &MF) {
+ StringRef Ret = Types.getTypeName(MF.getReturnType());
+ StringRef Class = Types.getTypeName(MF.getClassType());
+ StringRef Params = Types.getTypeName(MF.getArgumentList());
+ Name = formatv("{0} {1}::{2}", Ret, Class, Params).sstr<256>();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) {
+ Name = Func.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) {
+ Name = TS.getName();
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) {
+
+ if (Ptr.isPointerToMember()) {
+ const MemberPointerInfo &MI = Ptr.getMemberInfo();
+
+ StringRef Pointee = Types.getTypeName(Ptr.getReferentType());
+ StringRef Class = Types.getTypeName(MI.getContainingType());
+ Name = formatv("{0} {1}::*", Pointee, Class);
+ } else {
+ if (Ptr.isConst())
+ Name.append("const ");
+ if (Ptr.isVolatile())
+ Name.append("volatile ");
+ if (Ptr.isUnaligned())
+ Name.append("__unaligned ");
+
+ Name.append(Types.getTypeName(Ptr.getReferentType()));
+
+ if (Ptr.getMode() == PointerMode::LValueReference)
+ Name.append("&");
+ else if (Ptr.getMode() == PointerMode::RValueReference)
+ Name.append("&&");
+ else if (Ptr.getMode() == PointerMode::Pointer)
+ Name.append("*");
+ }
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) {
+ uint16_t Mods = static_cast<uint16_t>(Mod.getModifiers());
+
+ if (Mods & uint16_t(ModifierOptions::Const))
+ Name.append("const ");
+ if (Mods & uint16_t(ModifierOptions::Volatile))
+ Name.append("volatile ");
+ if (Mods & uint16_t(ModifierOptions::Unaligned))
+ Name.append("__unaligned ");
+ Name.append(Types.getTypeName(Mod.getModifiedType()));
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ VFTableShapeRecord &Shape) {
+ Name = formatv("<vftable {0} methods>", Shape.getEntryCount());
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(
+ CVType &CVR, UdtModSourceLineRecord &ModSourceLine) {
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ UdtSourceLineRecord &SourceLine) {
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, BitFieldRecord &BF) {
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR,
+ MethodOverloadListRecord &Overloads) {
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) {
+ return Error::success();
+}
+
+Error TypeNameComputer::visitKnownRecord(CVType &CVR, LabelRecord &R) {
+ return Error::success();
+}
+
+std::string llvm::codeview::computeTypeName(TypeCollection &Types,
+ TypeIndex Index) {
+ TypeNameComputer Computer(Types);
+ CVType Record = Types.getType(Index);
+ if (auto EC = visitTypeRecord(Record, Index, Computer)) {
+ consumeError(std::move(EC));
+ return "<unknown UDT>";
+ }
+ return Computer.name();
+}
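
Usage sketch for the new entry point above, assuming Types is any TypeCollection implementation (for instance one backed by a TPI stream) and Index is a valid index into it:

    std::string Name = computeTypeName(Types, Index);
    // Visitation failures are consumed and yield "<unknown UDT>".
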
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecord.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecord.cpp
deleted file mode 100644
index b951c06..0000000
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecord.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//===-- TypeRecord.cpp ------------------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-//===----------------------------------------------------------------------===//
-// Type index remapping
-//===----------------------------------------------------------------------===//
-
-static bool remapIndex(ArrayRef<TypeIndex> IndexMap, TypeIndex &Idx) {
- // Simple types are unchanged.
- if (Idx.isSimple())
- return true;
- unsigned MapPos = Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
- if (MapPos < IndexMap.size()) {
- Idx = IndexMap[MapPos];
- return true;
- }
-
- // This type index is invalid. Remap this to "not translated by cvpack",
- // and return failure.
- Idx = TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct);
- return false;
-}
-
-bool ModifierRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, ModifiedType);
-}
-
-bool ProcedureRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ReturnType);
- Success &= remapIndex(IndexMap, ArgumentList);
- return Success;
-}
-
-bool MemberFunctionRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ReturnType);
- Success &= remapIndex(IndexMap, ClassType);
- Success &= remapIndex(IndexMap, ThisType);
- Success &= remapIndex(IndexMap, ArgumentList);
- return Success;
-}
-
-bool MemberFuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ClassType);
- Success &= remapIndex(IndexMap, FunctionType);
- return Success;
-}
-
-bool ArgListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- for (TypeIndex &Str : StringIndices)
- Success &= remapIndex(IndexMap, Str);
- return Success;
-}
-
-bool MemberPointerInfo::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, ContainingType);
-}
-
-bool PointerRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ReferentType);
- if (isPointerToMember())
- Success &= MemberInfo->remapTypeIndices(IndexMap);
- return Success;
-}
-
-bool NestedTypeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool ArrayRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ElementType);
- Success &= remapIndex(IndexMap, IndexType);
- return Success;
-}
-
-bool TagRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, FieldList);
-}
-
-bool ClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= TagRecord::remapTypeIndices(IndexMap);
- Success &= remapIndex(IndexMap, DerivationList);
- Success &= remapIndex(IndexMap, VTableShape);
- return Success;
-}
-
-bool EnumRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= TagRecord::remapTypeIndices(IndexMap);
- Success &= remapIndex(IndexMap, UnderlyingType);
- return Success;
-}
-
-bool BitFieldRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool VFTableShapeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return true;
-}
-
-bool TypeServer2Record::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return true;
-}
-
-bool StringIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Id);
-}
-
-bool FuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, ParentScope);
- Success &= remapIndex(IndexMap, FunctionType);
- return Success;
-}
-
-bool UdtSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, UDT);
- Success &= remapIndex(IndexMap, SourceFile);
- return Success;
-}
-
-bool UdtModSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, UDT);
- Success &= remapIndex(IndexMap, SourceFile);
- return Success;
-}
-
-bool BuildInfoRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- for (TypeIndex &Arg : ArgIndices)
- Success &= remapIndex(IndexMap, Arg);
- return Success;
-}
-
-bool VFTableRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, CompleteClass);
- Success &= remapIndex(IndexMap, OverriddenVFTable);
- return Success;
-}
-
-bool OneMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, Type);
- return Success;
-}
-
-bool MethodOverloadListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- for (OneMethodRecord &Meth : Methods)
- if ((Success = Meth.remapTypeIndices(IndexMap)))
- return Success;
- return Success;
-}
-
-bool OverloadedMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, MethodList);
-}
-
-bool DataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool StaticDataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool EnumeratorRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return true;
-}
-
-bool VFPtrRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool BaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, Type);
-}
-
-bool VirtualBaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- bool Success = true;
- Success &= remapIndex(IndexMap, BaseType);
- Success &= remapIndex(IndexMap, VBPtrType);
- return Success;
-}
-
-bool ListContinuationRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
- return remapIndex(IndexMap, ContinuationIndex);
-}
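
The per-record remapTypeIndices methods deleted above are superseded by offset-based rewriting driven by TypeIndexDiscovery.cpp (earlier in this diff) and TypeStreamMerger.cpp (below). A hedged sketch of a single rewrite under the deleted code's IndexMap convention, where array position = source index - FirstNonSimpleIndex; remapAt is an illustrative name:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/DebugInfo/CodeView/TypeIndex.h"
    using namespace llvm;
    using namespace llvm::codeview;

    static bool remapAt(MutableArrayRef<uint8_t> Content, uint32_t Offset,
                        ArrayRef<TypeIndex> IndexMap) {
      TypeIndex &TI =
          *reinterpret_cast<TypeIndex *>(Content.data() + Offset);
      if (TI.isSimple()) // simple types are unchanged
        return true;
      unsigned MapPos = TI.getIndex() - TypeIndex::FirstNonSimpleIndex;
      if (MapPos >= IndexMap.size()) {
        TI = TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct);
        return false;
      }
      TI = IndexMap[MapPos];
      return true;
    }
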
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
index f46e08d..114f6fd 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
@@ -67,12 +67,9 @@ static Error mapNameAndUniqueName(CodeViewRecordIO &IO, StringRef &Name,
error(IO.mapStringZ(N));
error(IO.mapStringZ(U));
} else {
- size_t BytesNeeded = Name.size() + 1;
- StringRef N = Name;
- if (BytesNeeded > BytesLeft) {
- size_t BytesToDrop = std::min(N.size(), BytesToDrop);
- N = N.drop_back(BytesToDrop);
- }
+ // Cap the length of the string at however many bytes we have available,
+ // plus one for the required null terminator.
+ auto N = StringRef(Name).take_front(BytesLeft - 1);
error(IO.mapStringZ(N));
}
} else {
@@ -174,6 +171,15 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ArgListRecord &Record) {
error(IO.mapVectorN<uint32_t>(
+ Record.ArgIndices,
+ [](CodeViewRecordIO &IO, TypeIndex &N) { return IO.mapInteger(N); }));
+
+ return Error::success();
+}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
+ StringListRecord &Record) {
+ error(IO.mapVectorN<uint32_t>(
Record.StringIndices,
[](CodeViewRecordIO &IO, TypeIndex &N) { return IO.mapInteger(N); }));
@@ -368,6 +374,14 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
TypeServer2Record &Record) {
+ error(IO.mapGuid(Record.Guid));
+ error(IO.mapInteger(Record.Age));
+ error(IO.mapStringZ(Record.Name));
+ return Error::success();
+}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR, LabelRecord &Record) {
+ error(IO.mapEnum(Record.Mode));
return Error::success();
}
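
A worked example of the capped truncation in the mapNameAndUniqueName hunk above (values illustrative): with BytesLeft = 8, a 12-byte name is cut to 7 bytes so the null terminator written by mapStringZ still fits.

    llvm::StringRef Name = "ABCDEFGHIJKL";      // 12 bytes, BytesLeft == 8
    llvm::StringRef N = Name.take_front(8 - 1); // "ABCDEFG"
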
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeSerializer.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeSerializer.cpp
index f24fcff..003c13b 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeSerializer.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeSerializer.cpp
@@ -1,4 +1,4 @@
-//===- TypeSerialzier.cpp ---------------------------------------*- C++ -*-===//
+//===- TypeSerializer.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,29 +8,136 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
-
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-
-#include <string.h>
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
using namespace llvm;
using namespace llvm::codeview;
-bool TypeSerializer::isInFieldList() const {
- return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
+namespace {
+
+struct HashedType {
+ uint64_t Hash;
+ const uint8_t *Data;
+ unsigned Size; // FIXME: Go to uint16_t?
+ TypeIndex Index;
+};
+
+/// Wrapper around a pointer to a HashedType. Hash and equality operations are
+/// based on data in the pointee.
+struct HashedTypePtr {
+ HashedTypePtr() = default;
+ HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {}
+
+ HashedType *Ptr = nullptr;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<HashedTypePtr> {
+ static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); }
+
+ static inline HashedTypePtr getTombstoneKey() {
+ return HashedTypePtr(reinterpret_cast<HashedType *>(1));
+ }
+
+ static unsigned getHashValue(HashedTypePtr Val) {
+ assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr);
+ return Val.Ptr->Hash;
+ }
+
+ static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) {
+ HashedType *LHS = LHSP.Ptr;
+ HashedType *RHS = RHSP.Ptr;
+ if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr)
+ return LHS == RHS;
+ if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size)
+ return false;
+ return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0;
+ }
+};
+
+} // end namespace llvm
+
+/// Private implementation so that we don't leak our DenseMap instantiations to
+/// users.
+class llvm::codeview::TypeHasher {
+private:
+ /// Storage for type records provided by the caller. Records will outlive the
+ /// hasher object, so they should be allocated here.
+ BumpPtrAllocator &RecordStorage;
+
+ /// Storage for hash keys. These only need to live as long as the hashing
+ /// operation.
+ BumpPtrAllocator KeyStorage;
+
+ /// Hash table. We really want a DenseMap<ArrayRef<uint8_t>, TypeIndex> here,
+ /// but DenseMap is inefficient when the keys are long (like type records)
+ /// because it recomputes the hash value of every key when it grows. This
+ /// value type stores the hash out of line in KeyStorage, so that table
+ /// entries are small and easy to rehash.
+ DenseSet<HashedTypePtr> HashedRecords;
+
+public:
+ TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {}
+
+ void reset() { HashedRecords.clear(); }
+
+ /// Takes the bytes of a type record, inserts them into the hash table, saves
+ /// them, and returns a pointer to an identical stable type record along with
+ /// its type index in the destination stream.
+ TypeIndex getOrCreateRecord(ArrayRef<uint8_t> &Record, TypeIndex TI);
+};
+
+TypeIndex TypeHasher::getOrCreateRecord(ArrayRef<uint8_t> &Record,
+ TypeIndex TI) {
+ assert(Record.size() < UINT32_MAX && "Record too big");
+ assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
+
+ // Compute the hash up front so we can store it in the key.
+ HashedType TempHashedType = {hash_value(Record), Record.data(),
+ unsigned(Record.size()), TI};
+ auto Result = HashedRecords.insert(HashedTypePtr(&TempHashedType));
+ HashedType *&Hashed = Result.first->Ptr;
+
+ if (Result.second) {
+ // This was a new type record. We need stable storage for both the key and
+ // the record. The record should outlive the hashing operation.
+ Hashed = KeyStorage.Allocate<HashedType>();
+ *Hashed = TempHashedType;
+
+ uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
+ memcpy(Stable, Record.data(), Record.size());
+ Hashed->Data = Stable;
+ assert(Hashed->Size == Record.size());
+ }
+
+ // Update the caller's copy of Record to point to a stable copy.
+ Record = ArrayRef<uint8_t>(Hashed->Data, Hashed->Size);
+ return Hashed->Index;
}
-TypeIndex TypeSerializer::calcNextTypeIndex() const {
- if (LastTypeIndex.isNoneType())
- return TypeIndex(TypeIndex::FirstNonSimpleIndex);
- else
- return TypeIndex(LastTypeIndex.getIndex() + 1);
+TypeIndex TypeSerializer::nextTypeIndex() const {
+ return TypeIndex::fromArrayIndex(SeenRecords.size());
}
-TypeIndex TypeSerializer::incrementTypeIndex() {
- TypeIndex Previous = LastTypeIndex;
- LastTypeIndex = calcNextTypeIndex();
- return Previous;
+bool TypeSerializer::isInFieldList() const {
+ return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
}
MutableArrayRef<uint8_t> TypeSerializer::getCurrentSubRecordData() {
@@ -51,21 +158,6 @@ Error TypeSerializer::writeRecordPrefix(TypeLeafKind Kind) {
return Error::success();
}
-TypeIndex
-TypeSerializer::insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record) {
- assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
-
- StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());
-
- TypeIndex NextTypeIndex = calcNextTypeIndex();
- auto Result = HashedRecords.try_emplace(S, NextTypeIndex);
- if (Result.second) {
- LastTypeIndex = NextTypeIndex;
- SeenRecords.push_back(Record);
- }
- return Result.first->getValue();
-}
-
Expected<MutableArrayRef<uint8_t>>
TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
uint32_t Align = Record.size() % 4;
@@ -83,26 +175,79 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
return MutableArrayRef<uint8_t>(Record.data(), Record.size() + N);
}
-TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
- : RecordStorage(Storage), LastTypeIndex(),
- RecordBuffer(MaxRecordLength * 2), Stream(RecordBuffer), Writer(Stream),
+TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage, bool Hash)
+ : RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2),
+ Stream(RecordBuffer, support::little), Writer(Stream),
Mapping(Writer) {
// RecordBuffer needs to be able to hold enough data so that if we are 1
// byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
// we won't overflow.
+ if (Hash)
+ Hasher = llvm::make_unique<TypeHasher>(Storage);
}
-ArrayRef<MutableArrayRef<uint8_t>> TypeSerializer::records() const {
+TypeSerializer::~TypeSerializer() = default;
+
+ArrayRef<ArrayRef<uint8_t>> TypeSerializer::records() const {
return SeenRecords;
}
-TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; }
+void TypeSerializer::reset() {
+ if (Hasher)
+ Hasher->reset();
+ Writer.setOffset(0);
+ CurrentSegment = RecordSegment();
+ FieldListSegments.clear();
+ TypeKind.reset();
+ MemberKind.reset();
+ SeenRecords.clear();
+}
+
+TypeIndex TypeSerializer::insertRecordBytes(ArrayRef<uint8_t> &Record) {
+ assert(!TypeKind.hasValue() && "Already in a type mapping!");
+ assert(Writer.getOffset() == 0 && "Stream has data already!");
+
+ if (Hasher) {
+ TypeIndex ActualTI = Hasher->getOrCreateRecord(Record, nextTypeIndex());
+ if (nextTypeIndex() == ActualTI)
+ SeenRecords.push_back(Record);
+ return ActualTI;
+ }
+
+ TypeIndex NewTI = nextTypeIndex();
+ uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
+ memcpy(Stable, Record.data(), Record.size());
+ Record = ArrayRef<uint8_t>(Stable, Record.size());
+ SeenRecords.push_back(Record);
+ return NewTI;
+}
-TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef<uint8_t> Record) {
+TypeIndex TypeSerializer::insertRecord(const RemappedType &Record) {
assert(!TypeKind.hasValue() && "Already in a type mapping!");
assert(Writer.getOffset() == 0 && "Stream has data already!");
- return insertRecordBytesPrivate(Record);
+ TypeIndex TI;
+ ArrayRef<uint8_t> OriginalData = Record.OriginalRecord.RecordData;
+ if (Record.Mappings.empty()) {
+ // This record did not remap any type indices. Just write it.
+ return insertRecordBytes(OriginalData);
+ }
+
+ // At least one type index was remapped. Before we can hash it we have to
+ // copy the full record bytes, re-write each type index, then hash the copy.
+ // We do this in temporary storage since only the DenseMap can decide whether
+ // this record already exists, and if it does we don't want the memory to
+ // stick around.
+ RemapStorage.resize(OriginalData.size());
+ ::memcpy(&RemapStorage[0], OriginalData.data(), OriginalData.size());
+ uint8_t *ContentBegin = RemapStorage.data() + sizeof(RecordPrefix);
+ for (const auto &M : Record.Mappings) {
+ // The first 4 bytes of every record are the record prefix, but the mapping
+ // offset is relative to the content, which starts after it.
+ *(TypeIndex *)(ContentBegin + M.first) = M.second;
+ }
+ auto RemapRef = makeArrayRef(RemapStorage);
+ return insertRecordBytes(RemapRef);
}
Error TypeSerializer::visitTypeBegin(CVType &Record) {
@@ -136,11 +281,14 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
reinterpret_cast<RecordPrefix *>(ThisRecordData.data());
Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t);
- uint8_t *Copy = RecordStorage.Allocate<uint8_t>(ThisRecordData.size());
- ::memcpy(Copy, ThisRecordData.data(), ThisRecordData.size());
- ThisRecordData = MutableArrayRef<uint8_t>(Copy, ThisRecordData.size());
- Record = CVType(*TypeKind, ThisRecordData);
- TypeIndex InsertedTypeIndex = insertRecordBytesPrivate(ThisRecordData);
+ Record.Type = *TypeKind;
+ Record.RecordData = ThisRecordData;
+
+ // insertRecordBytes assumes we're not in a mapping, so do this first.
+ TypeKind.reset();
+ Writer.setOffset(0);
+
+ TypeIndex InsertedTypeIndex = insertRecordBytes(Record.RecordData);
// Write out each additional segment in reverse order, and update each
// record's continuation index to point to the previous one.
@@ -150,11 +298,9 @@ Expected<TypeIndex> TypeSerializer::visitTypeEndGetIndex(CVType &Record) {
reinterpret_cast<support::ulittle32_t *>(CIBytes.data());
assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder");
*CI = InsertedTypeIndex.getIndex();
- InsertedTypeIndex = insertRecordBytesPrivate(X);
+ InsertedTypeIndex = insertRecordBytes(X);
}
- TypeKind.reset();
- Writer.setOffset(0);
FieldListSegments.clear();
CurrentSegment.SubRecords.clear();
@@ -203,15 +349,15 @@ Error TypeSerializer::visitMemberEnd(CVMemberRecord &Record) {
uint8_t *SegmentBytes = RecordStorage.Allocate<uint8_t>(LengthWithSize);
auto SavedSegment = MutableArrayRef<uint8_t>(SegmentBytes, LengthWithSize);
- msf::MutableByteStream CS(SavedSegment);
- msf::StreamWriter CW(CS);
+ MutableBinaryByteStream CS(SavedSegment, support::little);
+ BinaryStreamWriter CW(CS);
if (auto EC = CW.writeBytes(CopyData))
return EC;
if (auto EC = CW.writeEnum(TypeLeafKind::LF_INDEX))
return EC;
- if (auto EC = CW.writeInteger(uint16_t(0)))
+ if (auto EC = CW.writeInteger<uint16_t>(0))
return EC;
- if (auto EC = CW.writeInteger(uint32_t(0xB0C0B0C0)))
+ if (auto EC = CW.writeInteger<uint32_t>(0xB0C0B0C0))
return EC;
FieldListSegments.push_back(SavedSegment);
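
For readers tracing the new insertRecord path above: the remap step is just "copy the bytes, patch each recorded offset, hash the copy". Below is a minimal, self-contained sketch using plain standard-library types, assuming a little-endian host exactly as the pointer-cast in insertRecord does; applyMappings and PrefixSize are illustrative names, not LLVM API.

    #include <cstdint>
    #include <cstring>
    #include <utility>
    #include <vector>

    // Copy the original record bytes, then overwrite each remapped type
    // index at its recorded offset. Offsets are relative to the record
    // content, which begins after the 4-byte RecordPrefix.
    static std::vector<uint8_t>
    applyMappings(const std::vector<uint8_t> &Original,
                  const std::vector<std::pair<uint32_t, uint32_t>> &Mappings,
                  size_t PrefixSize = 4) {
      std::vector<uint8_t> Copy = Original;
      for (const auto &M : Mappings)
        std::memcpy(Copy.data() + PrefixSize + M.first, &M.second,
                    sizeof(uint32_t));
      return Copy;
    }
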
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index ed6cf57..bff3516 100644
--- a/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -10,13 +10,11 @@
#include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ScopedPrinter.h"
@@ -52,136 +50,280 @@ namespace {
/// - If the type record already exists in the destination stream, discard it
/// and update the type index map to forward the source type index to the
/// existing destination type index.
-class TypeStreamMerger : public TypeVisitorCallbacks {
+///
+/// As an additional complication, type stream merging actually produces two
+/// streams: an item (or IPI) stream and a type stream, as this is what is
+/// actually stored in the final PDB. We choose which records go where by
+/// looking at the record kind.
+class TypeStreamMerger {
public:
- TypeStreamMerger(TypeTableBuilder &DestStream)
- : DestStream(DestStream), FieldListBuilder(DestStream) {
- assert(!hadError());
+ explicit TypeStreamMerger(SmallVectorImpl<TypeIndex> &SourceToDest)
+ : IndexMap(SourceToDest) {
+ SourceToDest.clear();
}
-/// TypeVisitorCallbacks overrides.
-#define TYPE_RECORD(EnumName, EnumVal, Name) \
- Error visitKnownRecord(CVType &CVR, Name##Record &Record) override;
-#define MEMBER_RECORD(EnumName, EnumVal, Name) \
- Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override;
-#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+ static const TypeIndex Untranslated;
- Error visitUnknownType(CVType &Record) override;
+ Error mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes,
+ const CVTypeArray &IdsAndTypes);
+ Error mergeIdRecords(TypeTableBuilder &Dest,
+ ArrayRef<TypeIndex> TypeSourceToDest,
+ const CVTypeArray &Ids);
+ Error mergeTypeRecords(TypeTableBuilder &Dest, const CVTypeArray &Types);
- Error visitTypeBegin(CVType &Record) override;
- Error visitTypeEnd(CVType &Record) override;
- Error visitMemberEnd(CVMemberRecord &Record) override;
+private:
+ Error doit(const CVTypeArray &Types);
- bool mergeStream(const CVTypeArray &Types);
+ Error remapAllTypes(const CVTypeArray &Types);
-private:
- template <typename RecordType>
- Error visitKnownRecordImpl(RecordType &Record) {
- FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap);
- IndexMap.push_back(DestStream.writeKnownType(Record));
- return Error::success();
- }
+ Error remapType(const CVType &Type);
- Error visitKnownRecordImpl(FieldListRecord &Record) {
- CVTypeVisitor Visitor(*this);
+ void addMapping(TypeIndex Idx);
- if (auto EC = Visitor.visitFieldListMemberStream(Record.Data))
- return EC;
- return Error::success();
+ bool remapTypeIndex(TypeIndex &Idx);
+ bool remapItemIndex(TypeIndex &Idx);
+
+ bool remapIndices(RemappedType &Record, ArrayRef<TiReference> Refs);
+
+ bool remapIndex(TypeIndex &Idx, ArrayRef<TypeIndex> Map);
+
+ size_t slotForIndex(TypeIndex Idx) const {
+ assert(!Idx.isSimple() && "simple type indices have no slots");
+ return Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
}
- template <typename RecordType>
- Error visitKnownMemberRecordImpl(RecordType &Record) {
- FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap);
- FieldListBuilder.writeMemberType(Record);
+ Error errorCorruptRecord() const {
+ return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+ }
+
+ Error writeRecord(TypeTableBuilder &Dest, const RemappedType &Record,
+ bool RemapSuccess) {
+ TypeIndex DestIdx = Untranslated;
+ if (RemapSuccess)
+ DestIdx = Dest.writeSerializedRecord(Record);
+ addMapping(DestIdx);
return Error::success();
}
- bool hadError() { return FoundBadTypeIndex; }
+ Optional<Error> LastError;
+
+ bool IsSecondPass = false;
- bool FoundBadTypeIndex = false;
+ unsigned NumBadIndices = 0;
- BumpPtrAllocator Allocator;
+ TypeIndex CurIndex{TypeIndex::FirstNonSimpleIndex};
- TypeTableBuilder &DestStream;
- FieldListRecordBuilder FieldListBuilder;
+ TypeTableBuilder *DestIdStream = nullptr;
+ TypeTableBuilder *DestTypeStream = nullptr;
- bool IsInFieldList{false};
- size_t BeginIndexMapSize = 0;
+ // If we're only mapping id records, this array contains the mapping for
+ // type records.
+ ArrayRef<TypeIndex> TypeLookup;
/// Map from source type index to destination type index. Indexed by source
/// type index minus 0x1000.
- SmallVector<TypeIndex, 0> IndexMap;
+ SmallVectorImpl<TypeIndex> &IndexMap;
};
} // end anonymous namespace
-Error TypeStreamMerger::visitTypeBegin(CVRecord<TypeLeafKind> &Rec) {
- if (Rec.Type == TypeLeafKind::LF_FIELDLIST) {
- assert(!IsInFieldList);
- IsInFieldList = true;
- FieldListBuilder.begin();
- } else
- BeginIndexMapSize = IndexMap.size();
- return Error::success();
-}
+const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated);
-Error TypeStreamMerger::visitTypeEnd(CVRecord<TypeLeafKind> &Rec) {
- if (Rec.Type == TypeLeafKind::LF_FIELDLIST) {
- TypeIndex Index = FieldListBuilder.end();
- IndexMap.push_back(Index);
- IsInFieldList = false;
+static bool isIdRecord(TypeLeafKind K) {
+ switch (K) {
+ case TypeLeafKind::LF_FUNC_ID:
+ case TypeLeafKind::LF_MFUNC_ID:
+ case TypeLeafKind::LF_STRING_ID:
+ case TypeLeafKind::LF_SUBSTR_LIST:
+ case TypeLeafKind::LF_BUILDINFO:
+ case TypeLeafKind::LF_UDT_SRC_LINE:
+ case TypeLeafKind::LF_UDT_MOD_SRC_LINE:
+ return true;
+ default:
+ return false;
}
- return Error::success();
}
-Error TypeStreamMerger::visitMemberEnd(CVMemberRecord &Rec) {
- assert(IndexMap.size() == BeginIndexMapSize + 1);
- return Error::success();
+void TypeStreamMerger::addMapping(TypeIndex Idx) {
+ if (!IsSecondPass) {
+ assert(IndexMap.size() == slotForIndex(CurIndex) &&
+ "visitKnownRecord should add one index map entry");
+ IndexMap.push_back(Idx);
+ } else {
+ assert(slotForIndex(CurIndex) < IndexMap.size());
+ IndexMap[slotForIndex(CurIndex)] = Idx;
+ }
}
-#define TYPE_RECORD(EnumName, EnumVal, Name) \
- Error TypeStreamMerger::visitKnownRecord(CVType &CVR, \
- Name##Record &Record) { \
- return visitKnownRecordImpl(Record); \
+bool TypeStreamMerger::remapIndex(TypeIndex &Idx, ArrayRef<TypeIndex> Map) {
+ // Simple types are unchanged.
+ if (Idx.isSimple())
+ return true;
+
+ // Check if this type index refers to a record we've already translated
+ // successfully. If it refers to a type later in the stream or a record we
+  // had to defer, defer it until a later pass.
+ unsigned MapPos = slotForIndex(Idx);
+ if (MapPos < Map.size() && Map[MapPos] != Untranslated) {
+ Idx = Map[MapPos];
+ return true;
}
-#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#define MEMBER_RECORD(EnumName, EnumVal, Name) \
- Error TypeStreamMerger::visitKnownMember(CVMemberRecord &CVR, \
- Name##Record &Record) { \
- return visitKnownMemberRecordImpl(Record); \
+
+ // If this is the second pass and this index isn't in the map, then it points
+ // outside the current type stream, and this is a corrupt record.
+ if (IsSecondPass && MapPos >= Map.size()) {
+ // FIXME: Print a more useful error. We can give the current record and the
+  // index that we think it's pointing to.
+ LastError = joinErrors(std::move(*LastError), errorCorruptRecord());
}
-#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
-
-Error TypeStreamMerger::visitUnknownType(CVType &Rec) {
- // We failed to translate a type. Translate this index as "not translated".
- IndexMap.push_back(
- TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct));
- return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+
+ ++NumBadIndices;
+
+ // This type index is invalid. Remap this to "not translated by cvpack",
+ // and return failure.
+ Idx = Untranslated;
+ return false;
}
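
A worked instance of the slot arithmetic behind remapIndex and slotForIndex: non-simple indices begin at TypeIndex::FirstNonSimpleIndex (0x1000), so IndexMap is indexed by source index minus 0x1000.

    // Illustrative only; values follow the comment on IndexMap above.
    static_assert(0x1000 - 0x1000 == 0, "first non-simple index -> slot 0");
    static_assert(0x1007 - 0x1000 == 7, "source index 0x1007 -> slot 7");
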
-bool TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
- assert(IndexMap.empty());
- TypeVisitorCallbackPipeline Pipeline;
+bool TypeStreamMerger::remapTypeIndex(TypeIndex &Idx) {
+ // If we're mapping a pure index stream, then IndexMap only contains mappings
+ // from OldIdStream -> NewIdStream, in which case we will need to use the
+ // special mapping from OldTypeStream -> NewTypeStream which was computed
+ // externally. Regardless, we use this special map if and only if we are
+ // doing an id-only mapping.
+ if (DestTypeStream == nullptr)
+ return remapIndex(Idx, TypeLookup);
- TypeDeserializer Deserializer;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(*this);
+ assert(TypeLookup.empty());
+ return remapIndex(Idx, IndexMap);
+}
- CVTypeVisitor Visitor(Pipeline);
+bool TypeStreamMerger::remapItemIndex(TypeIndex &Idx) {
+ assert(DestIdStream);
+ return remapIndex(Idx, IndexMap);
+}
- if (auto EC = Visitor.visitTypeStream(Types)) {
- consumeError(std::move(EC));
- return false;
+Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest,
+ const CVTypeArray &Types) {
+ DestTypeStream = &Dest;
+
+ return doit(Types);
+}
+
+Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest,
+ ArrayRef<TypeIndex> TypeSourceToDest,
+ const CVTypeArray &Ids) {
+ DestIdStream = &Dest;
+ TypeLookup = TypeSourceToDest;
+
+ return doit(Ids);
+}
+
+Error TypeStreamMerger::mergeTypesAndIds(TypeTableBuilder &DestIds,
+ TypeTableBuilder &DestTypes,
+ const CVTypeArray &IdsAndTypes) {
+ DestIdStream = &DestIds;
+ DestTypeStream = &DestTypes;
+ return doit(IdsAndTypes);
+}
+
+Error TypeStreamMerger::doit(const CVTypeArray &Types) {
+ if (auto EC = remapAllTypes(Types))
+ return EC;
+
+ // If we found bad indices but no other errors, try doing another pass and see
+ // if we can resolve the indices that weren't in the map on the first pass.
+ // This may require multiple passes, but we should always make progress. MASM
+ // is the only known CodeView producer that makes type streams that aren't
+ // topologically sorted. The standard library contains MASM-produced objects,
+ // so this is important to handle correctly, but we don't have to be too
+ // efficient. MASM type streams are usually very small.
+ while (!LastError && NumBadIndices > 0) {
+ unsigned BadIndicesRemaining = NumBadIndices;
+ IsSecondPass = true;
+ NumBadIndices = 0;
+ CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);
+
+ if (auto EC = remapAllTypes(Types))
+ return EC;
+
+ assert(NumBadIndices <= BadIndicesRemaining &&
+ "second pass found more bad indices");
+ if (!LastError && NumBadIndices == BadIndicesRemaining) {
+ return llvm::make_error<CodeViewError>(
+ cv_error_code::corrupt_record, "input type graph contains cycles");
+ }
}
- IndexMap.clear();
- return !hadError();
+
+ if (LastError)
+ return std::move(*LastError);
+ return Error::success();
+}
+
+Error TypeStreamMerger::remapAllTypes(const CVTypeArray &Types) {
+ for (const CVType &Type : Types)
+ if (auto EC = remapType(Type))
+ return EC;
+ return Error::success();
+}
+
+Error TypeStreamMerger::remapType(const CVType &Type) {
+ RemappedType R(Type);
+ SmallVector<TiReference, 32> Refs;
+ discoverTypeIndices(Type.RecordData, Refs);
+ bool MappedAllIndices = remapIndices(R, Refs);
+ TypeTableBuilder &Dest =
+ isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream;
+ if (auto EC = writeRecord(Dest, R, MappedAllIndices))
+ return EC;
+
+ ++CurIndex;
+ assert((IsSecondPass || IndexMap.size() == slotForIndex(CurIndex)) &&
+ "visitKnownRecord should add one index map entry");
+ return Error::success();
+}
+
+bool TypeStreamMerger::remapIndices(RemappedType &Record,
+ ArrayRef<TiReference> Refs) {
+ ArrayRef<uint8_t> OriginalData = Record.OriginalRecord.content();
+ bool Success = true;
+ for (auto &Ref : Refs) {
+ uint32_t Offset = Ref.Offset;
+ ArrayRef<uint8_t> Bytes = OriginalData.slice(Ref.Offset, sizeof(TypeIndex));
+ ArrayRef<TypeIndex> TIs(reinterpret_cast<const TypeIndex *>(Bytes.data()),
+ Ref.Count);
+ for (auto TI : TIs) {
+ TypeIndex NewTI = TI;
+ bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef)
+ ? remapItemIndex(NewTI)
+ : remapTypeIndex(NewTI);
+ if (ThisSuccess && NewTI != TI)
+ Record.Mappings.emplace_back(Offset, NewTI);
+ Offset += sizeof(TypeIndex);
+ Success &= ThisSuccess;
+ }
+ }
+ return Success;
+}
+
+Error llvm::codeview::mergeTypeRecords(TypeTableBuilder &Dest,
+ SmallVectorImpl<TypeIndex> &SourceToDest,
+ const CVTypeArray &Types) {
+ TypeStreamMerger M(SourceToDest);
+ return M.mergeTypeRecords(Dest, Types);
+}
+
+Error llvm::codeview::mergeIdRecords(TypeTableBuilder &Dest,
+ ArrayRef<TypeIndex> TypeSourceToDest,
+ SmallVectorImpl<TypeIndex> &SourceToDest,
+ const CVTypeArray &Ids) {
+ TypeStreamMerger M(SourceToDest);
+ return M.mergeIdRecords(Dest, TypeSourceToDest, Ids);
}
-bool llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestStream,
- const CVTypeArray &Types) {
- return TypeStreamMerger(DestStream).mergeStream(Types);
+Error llvm::codeview::mergeTypeAndIdRecords(
+ TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes,
+ SmallVectorImpl<TypeIndex> &SourceToDest, const CVTypeArray &IdsAndTypes) {
+ TypeStreamMerger M(SourceToDest);
+ return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes);
}
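
A hedged usage sketch of the new free-function entry points, assuming the caller already owns the destination builders and a deserialized CVTypeArray; mergeOneObject is an invented wrapper name.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
    #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"

    using namespace llvm;
    using namespace llvm::codeview;

    // Merge one object's combined type/id stream into the destination TPI
    // and IPI builders, then consult the source-to-dest index map.
    Error mergeOneObject(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes,
                         const CVTypeArray &IdsAndTypes) {
      SmallVector<TypeIndex, 128> SourceToDest;
      if (auto EC = mergeTypeAndIdRecords(DestIds, DestTypes, SourceToDest,
                                          IdsAndTypes))
        return EC;
      // SourceToDest[I] now holds the destination index for source index
      // 0x1000 + I, or "not translated" if remapping failed.
      return Error::success();
    }
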
diff --git a/contrib/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/contrib/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp
new file mode 100644
index 0000000..4eca5ae
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp
@@ -0,0 +1,67 @@
+//===- TypeTableCollection.cpp -------------------------------- *- C++ --*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
+
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeName.h"
+#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+TypeTableCollection::TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records)
+ : NameStorage(Allocator), Records(Records) {
+ Names.resize(Records.size());
+}
+
+Optional<TypeIndex> TypeTableCollection::getFirst() {
+ if (empty())
+ return None;
+ return TypeIndex::fromArrayIndex(0);
+}
+
+Optional<TypeIndex> TypeTableCollection::getNext(TypeIndex Prev) {
+ assert(contains(Prev));
+ ++Prev;
+ if (Prev.toArrayIndex() == size())
+ return None;
+ return Prev;
+}
+
+CVType TypeTableCollection::getType(TypeIndex Index) {
+ assert(Index.toArrayIndex() < Records.size());
+ ArrayRef<uint8_t> Bytes = Records[Index.toArrayIndex()];
+ const RecordPrefix *Prefix =
+ reinterpret_cast<const RecordPrefix *>(Bytes.data());
+ TypeLeafKind Kind = static_cast<TypeLeafKind>(uint16_t(Prefix->RecordKind));
+ return CVType(Kind, Bytes);
+}
+
+StringRef TypeTableCollection::getTypeName(TypeIndex Index) {
+ if (Index.isNoneType() || Index.isSimple())
+ return TypeIndex::simpleTypeName(Index);
+
+ uint32_t I = Index.toArrayIndex();
+ if (Names[I].data() == nullptr) {
+ StringRef Result = NameStorage.save(computeTypeName(*this, Index));
+ Names[I] = Result;
+ }
+ return Names[I];
+}
+
+bool TypeTableCollection::contains(TypeIndex Index) {
+  return Index.toArrayIndex() < size();
+}
+
+uint32_t TypeTableCollection::size() { return Records.size(); }
+
+uint32_t TypeTableCollection::capacity() { return Records.size(); }
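
A short usage sketch of the cursor interface this new file provides; dumpAllNames is an invented helper.

    #include "llvm/DebugInfo/CodeView/TypeTableCollection.h"
    #include "llvm/Support/raw_ostream.h"

    // Visit every record via getFirst()/getNext(), which yield
    // Optional<TypeIndex> cursors until the table is exhausted.
    void dumpAllNames(llvm::codeview::TypeTableCollection &Table) {
      for (auto TI = Table.getFirst(); TI; TI = Table.getNext(*TI))
        llvm::outs() << Table.getTypeName(*TI) << '\n';
    }
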
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index 08bc74a..bb475a6 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFAbbreviationDeclaration.cpp ----------------------------------===//
+//===- DWARFAbbreviationDeclaration.cpp -----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,11 +8,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+
using namespace llvm;
using namespace dwarf;
@@ -58,47 +65,52 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
if (A && F) {
Optional<int64_t> V;
bool IsImplicitConst = (F == DW_FORM_implicit_const);
- if (IsImplicitConst)
+ if (IsImplicitConst) {
V = Data.getSLEB128(OffsetPtr);
- else if (auto Size = DWARFFormValue::getFixedByteSize(F))
- V = *Size;
- AttributeSpecs.push_back(AttributeSpec(A, F, V));
- if (IsImplicitConst)
+ AttributeSpecs.push_back(AttributeSpec(A, F, V));
continue;
+ }
    // If this abbreviation still has a fixed byte size, then update the
// FixedAttributeSize as needed.
- if (FixedAttributeSize) {
- if (V)
- FixedAttributeSize->NumBytes += *V;
- else {
- switch (F) {
- case DW_FORM_addr:
- ++FixedAttributeSize->NumAddrs;
- break;
-
- case DW_FORM_ref_addr:
- ++FixedAttributeSize->NumRefAddrs;
- break;
-
- case DW_FORM_strp:
- case DW_FORM_GNU_ref_alt:
- case DW_FORM_GNU_strp_alt:
- case DW_FORM_line_strp:
- case DW_FORM_sec_offset:
- case DW_FORM_strp_sup:
- case DW_FORM_ref_sup:
- ++FixedAttributeSize->NumDwarfOffsets;
- break;
-
- default:
- // Indicate we no longer have a fixed byte size for this
- // abbreviation by clearing the FixedAttributeSize optional value
- // so it doesn't have a value.
- FixedAttributeSize.reset();
- break;
- }
+ switch (F) {
+ case DW_FORM_addr:
+ if (FixedAttributeSize)
+ ++FixedAttributeSize->NumAddrs;
+ break;
+
+ case DW_FORM_ref_addr:
+ if (FixedAttributeSize)
+ ++FixedAttributeSize->NumRefAddrs;
+ break;
+
+ case DW_FORM_strp:
+ case DW_FORM_GNU_ref_alt:
+ case DW_FORM_GNU_strp_alt:
+ case DW_FORM_line_strp:
+ case DW_FORM_sec_offset:
+ case DW_FORM_strp_sup:
+ if (FixedAttributeSize)
+ ++FixedAttributeSize->NumDwarfOffsets;
+ break;
+
+ default:
+ // The form has a byte size that doesn't depend on Params.
+ // If it's a fixed size, keep track of it.
+ if (auto Size =
+ DWARFFormValue::getFixedByteSize(F, DWARFFormParams())) {
+ V = *Size;
+ if (FixedAttributeSize)
+ FixedAttributeSize->NumBytes += *V;
+ break;
}
+      // Indicate we no longer have a fixed byte size for this abbreviation
+      // by clearing the FixedAttributeSize optional value.
+ FixedAttributeSize.reset();
+ break;
}
+ // Record this attribute and its fixed size if it has one.
+ AttributeSpecs.push_back(AttributeSpec(A, F, V));
} else if (A == 0 && F == 0) {
// We successfully reached the end of this abbreviation declaration
// since both attribute and form are zero.
@@ -180,7 +192,8 @@ Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue(
if (auto FixedSize = Spec.getByteSize(U))
Offset += *FixedSize;
else
- DWARFFormValue::skipValue(Spec.Form, DebugInfoData, &Offset, &U);
+ DWARFFormValue::skipValue(Spec.Form, DebugInfoData, &Offset,
+ U.getFormParams());
++AttrIndex;
}
return None;
@@ -205,7 +218,8 @@ Optional<int64_t> DWARFAbbreviationDeclaration::AttributeSpec::getByteSize(
if (ByteSizeOrValue)
return ByteSizeOrValue;
Optional<int64_t> S;
- auto FixedByteSize = DWARFFormValue::getFixedByteSize(Form, &U);
+ auto FixedByteSize =
+ DWARFFormValue::getFixedByteSize(Form, U.getFormParams());
if (FixedByteSize)
S = *FixedByteSize;
return S;
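
The bookkeeping in the reworked extract loop decomposes an abbreviation's size into plain bytes plus per-form counters. Here is a hypothetical mirror of the final computation; the struct is a sketch following the counter names visible above, not the LLVM declaration.

    #include <cstddef>
    #include <cstdint>

    struct FixedSizeSketch {
      uint16_t NumBytes = 0;       // forms with a known fixed byte size
      uint8_t NumAddrs = 0;        // DW_FORM_addr occurrences
      uint8_t NumRefAddrs = 0;     // DW_FORM_ref_addr occurrences
      uint8_t NumDwarfOffsets = 0; // strp/sec_offset/... occurrences
    };

    // Total attribute size of one DIE: fixed bytes plus the counted members
    // scaled by the unit's address and offset sizes.
    static size_t totalByteSize(const FixedSizeSketch &F, uint8_t AddrSize,
                                uint8_t RefAddrSize, uint8_t DwarfOffsetSize) {
      return F.NumBytes + size_t(F.NumAddrs) * AddrSize +
             size_t(F.NumRefAddrs) * RefAddrSize +
             size_t(F.NumDwarfOffsets) * DwarfOffsetSize;
    }
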
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 7111ad3..9ae7c9a 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -1,4 +1,4 @@
-//===--- DWARFAcceleratorTable.cpp ----------------------------------------===//
+//===- DWARFAcceleratorTable.cpp ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,11 +8,20 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
-#include "llvm/Support/Dwarf.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+#include <utility>
-namespace llvm {
+using namespace llvm;
bool DWARFAcceleratorTable::extract() {
uint32_t Offset = 0;
@@ -46,7 +55,53 @@ bool DWARFAcceleratorTable::extract() {
return true;
}
-void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
+uint32_t DWARFAcceleratorTable::getNumBuckets() { return Hdr.NumBuckets; }
+uint32_t DWARFAcceleratorTable::getNumHashes() { return Hdr.NumHashes; }
+uint32_t DWARFAcceleratorTable::getSizeHdr() { return sizeof(Hdr); }
+uint32_t DWARFAcceleratorTable::getHeaderDataLength() {
+ return Hdr.HeaderDataLength;
+}
+
+ArrayRef<std::pair<DWARFAcceleratorTable::HeaderData::AtomType,
+ DWARFAcceleratorTable::HeaderData::Form>>
+DWARFAcceleratorTable::getAtomsDesc() {
+ return HdrData.Atoms;
+}
+
+bool DWARFAcceleratorTable::validateForms() {
+ for (auto Atom : getAtomsDesc()) {
+ DWARFFormValue FormValue(Atom.second);
+ switch (Atom.first) {
+ case dwarf::DW_ATOM_die_offset:
+ if ((!FormValue.isFormClass(DWARFFormValue::FC_Constant) &&
+ !FormValue.isFormClass(DWARFFormValue::FC_Flag)) ||
+ FormValue.getForm() == dwarf::DW_FORM_sdata)
+ return false;
+ default:
+ break;
+ }
+ }
+ return true;
+}
+
+uint32_t DWARFAcceleratorTable::readAtoms(uint32_t &HashDataOffset) {
+ uint32_t DieOffset = dwarf::DW_INVALID_OFFSET;
+
+ for (auto Atom : getAtomsDesc()) {
+ DWARFFormValue FormValue(Atom.second);
+    FormValue.extractValue(AccelSection, &HashDataOffset, nullptr);
+ switch (Atom.first) {
+ case dwarf::DW_ATOM_die_offset:
+ DieOffset = *FormValue.getAsUnsignedConstant();
+ break;
+ default:
+ break;
+ }
+ }
+ return DieOffset;
+}
+
+LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
// Dump the header.
OS << "Magic = " << format("0x%08x", Hdr.Magic) << '\n'
<< "Version = " << format("0x%04x", Hdr.Version) << '\n'
@@ -105,10 +160,7 @@ void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
continue;
}
while (AccelSection.isValidOffsetForDataOfSize(DataOffset, 4)) {
- unsigned StringOffset = AccelSection.getU32(&DataOffset);
- RelocAddrMap::const_iterator Reloc = Relocs.find(DataOffset-4);
- if (Reloc != Relocs.end())
- StringOffset += Reloc->second.second;
+ unsigned StringOffset = AccelSection.getRelocatedValue(4, &DataOffset);
if (!StringOffset)
break;
OS << format(" Name: %08x \"%s\"\n", StringOffset,
@@ -131,4 +183,3 @@ void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
}
}
}
-}
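
A hedged sketch of driving the new accessors from a reader; Accel is assumed to be a successfully extract()ed table, and HashDataOffset to have been positioned at one entry's atoms by the caller's bucket walk.

    #include "llvm/ADT/Optional.h"
    #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"

    // Check the atom encodings once, then pull the DIE offset for a single
    // hash-data entry; readAtoms() advances HashDataOffset past the atoms.
    llvm::Optional<uint32_t> readOneEntry(llvm::DWARFAcceleratorTable &Accel,
                                          uint32_t &HashDataOffset) {
      if (!Accel.validateForms())
        return llvm::None;
      uint32_t DieOffset = Accel.readAtoms(HashDataOffset);
      if (DieOffset == llvm::dwarf::DW_INVALID_OFFSET)
        return llvm::None;
      return DieOffset; // .debug_info offset of the matching DIE
    }
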
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index 948972f..358e9bf 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -15,17 +15,19 @@
using namespace llvm;
-void DWARFCompileUnit::dump(raw_ostream &OS) {
+void DWARFCompileUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
OS << format("0x%08x", getOffset()) << ": Compile Unit:"
<< " length = " << format("0x%08x", getLength())
- << " version = " << format("0x%04x", getVersion())
- << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+ << " version = " << format("0x%04x", getVersion());
+ if (getVersion() >= 5)
+ OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
+ OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
<< " addr_size = " << format("0x%02x", getAddressByteSize())
<< " (next unit at " << format("0x%08x", getNextUnitOffset())
<< ")\n";
if (DWARFDie CUDie = getUnitDIE(false))
- CUDie.dump(OS, -1U);
+ CUDie.dump(OS, -1U, 0, DumpOpts);
else
OS << "<compile unit can't be parsed!>\n\n";
}
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 77f6f65..dd32352 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFContext.cpp --------------------------------------------------===//
+//===- DWARFContext.cpp ---------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,46 +8,191 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
+#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
#include "llvm/Object/Decompressor.h"
#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/RelocVisitor.h"
-#include "llvm/Support/Compression.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cstdint>
+#include <map>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
using namespace llvm;
using namespace dwarf;
using namespace object;
#define DEBUG_TYPE "dwarf"
-typedef DWARFDebugLine::LineTable DWARFLineTable;
-typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
-typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind;
+using DWARFLineTable = DWARFDebugLine::LineTable;
+using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
+using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind;
static void dumpAccelSection(raw_ostream &OS, StringRef Name,
const DWARFSection& Section, StringRef StringSection,
bool LittleEndian) {
- DataExtractor AccelSection(Section.Data, LittleEndian, 0);
+ DWARFDataExtractor AccelSection(Section, LittleEndian, 0);
DataExtractor StrData(StringSection, LittleEndian, 0);
OS << "\n." << Name << " contents:\n";
- DWARFAcceleratorTable Accel(AccelSection, StrData, Section.Relocs);
+ DWARFAcceleratorTable Accel(AccelSection, StrData);
if (!Accel.extract())
return;
Accel.dump(OS);
}
-void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
- bool SummarizeTypes) {
+static void
+dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName,
+ const DWARFSection &StringOffsetsSection,
+ StringRef StringSection, bool LittleEndian) {
+ DWARFDataExtractor StrOffsetExt(StringOffsetsSection, LittleEndian, 0);
+ uint32_t Offset = 0;
+ uint64_t SectionSize = StringOffsetsSection.Data.size();
+
+ while (Offset < SectionSize) {
+ unsigned Version = 0;
+ DwarfFormat Format = DWARF32;
+ unsigned EntrySize = 4;
+    // Perform validation and extract the contribution size from the header.
+ if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 4)) {
+ OS << "error: invalid contribution to string offsets table in section ."
+ << SectionName << ".\n";
+ return;
+ }
+ uint32_t ContributionStart = Offset;
+ uint64_t ContributionSize = StrOffsetExt.getU32(&Offset);
+ // A contribution size of 0xffffffff indicates DWARF64, with the actual size
+ // in the following 8 bytes. Otherwise, the DWARF standard mandates that
+ // the contribution size must be at most 0xfffffff0.
+ if (ContributionSize == 0xffffffff) {
+ if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 8)) {
+ OS << "error: invalid contribution to string offsets table in section ."
+ << SectionName << ".\n";
+ return;
+ }
+ Format = DWARF64;
+ EntrySize = 8;
+ ContributionSize = StrOffsetExt.getU64(&Offset);
+ } else if (ContributionSize > 0xfffffff0) {
+ OS << "error: invalid contribution to string offsets table in section ."
+ << SectionName << ".\n";
+ return;
+ }
+
+ // We must ensure that we don't read a partial record at the end, so we
+ // validate for a multiple of EntrySize. Also, we're expecting a version
+ // number and padding, which adds an additional 4 bytes.
+ uint64_t ValidationSize =
+ 4 + ((ContributionSize + EntrySize - 1) & (-(uint64_t)EntrySize));
+ if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, ValidationSize)) {
+ OS << "error: contribution to string offsets table in section ."
+ << SectionName << " has invalid length.\n";
+ return;
+ }
+
+ Version = StrOffsetExt.getU16(&Offset);
+ Offset += 2;
+ OS << format("0x%8.8x: ", ContributionStart);
+ OS << "Contribution size = " << ContributionSize
+ << ", Version = " << Version << "\n";
+
+ uint32_t ContributionBase = Offset;
+ DataExtractor StrData(StringSection, LittleEndian, 0);
+ while (Offset - ContributionBase < ContributionSize) {
+ OS << format("0x%8.8x: ", Offset);
+ // FIXME: We can only extract strings in DWARF32 format at the moment.
+ uint64_t StringOffset =
+ StrOffsetExt.getRelocatedValue(EntrySize, &Offset);
+ if (Format == DWARF32) {
+ uint32_t StringOffset32 = (uint32_t)StringOffset;
+ OS << format("%8.8x ", StringOffset32);
+ const char *S = StrData.getCStr(&StringOffset32);
+ if (S)
+ OS << format("\"%s\"", S);
+ } else
+ OS << format("%16.16" PRIx64 " ", StringOffset);
+ OS << "\n";
+ }
+ }
+}
+
+// Dump a DWARF string offsets section. This may be a DWARF v5 formatted
+// string offsets section, where each compile or type unit contributes a
+// number of entries (string offsets), with each contribution preceded by
+// a header containing size and version number. Alternatively, it may be a
+// monolithic series of string offsets, as generated by the pre-DWARF v5
+// implementation of split DWARF.
+static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName,
+ const DWARFSection &StringOffsetsSection,
+ StringRef StringSection, bool LittleEndian,
+ unsigned MaxVersion) {
+ if (StringOffsetsSection.Data.empty())
+ return;
+ OS << "\n." << SectionName << " contents:\n";
+ // If we have at least one (compile or type) unit with DWARF v5 or greater,
+ // we assume that the section is formatted like a DWARF v5 string offsets
+ // section.
+ if (MaxVersion >= 5)
+ dumpDWARFv5StringOffsetsSection(OS, SectionName, StringOffsetsSection,
+ StringSection, LittleEndian);
+ else {
+ DataExtractor strOffsetExt(StringOffsetsSection.Data, LittleEndian, 0);
+ uint32_t offset = 0;
+ uint64_t size = StringOffsetsSection.Data.size();
+ // Ensure that size is a multiple of the size of an entry.
+ if (size & ((uint64_t)(sizeof(uint32_t) - 1))) {
+ OS << "error: size of ." << SectionName << " is not a multiple of "
+ << sizeof(uint32_t) << ".\n";
+ size &= -(uint64_t)sizeof(uint32_t);
+ }
+ DataExtractor StrData(StringSection, LittleEndian, 0);
+ while (offset < size) {
+ OS << format("0x%8.8x: ", offset);
+ uint32_t StringOffset = strOffsetExt.getU32(&offset);
+ OS << format("%8.8x ", StringOffset);
+ const char *S = StrData.getCStr(&StringOffset);
+ if (S)
+ OS << format("\"%s\"", S);
+ OS << "\n";
+ }
+ }
+}
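
The contribution header handled above is compact enough to show end to end. Below is a self-contained sketch of only the header rules, over raw section bytes and assuming a little-endian host; Contribution and parseHeader are invented names and bounds checks are omitted.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct Contribution {
      uint64_t Size;      // bytes of string-offset entries that follow
      unsigned EntrySize; // 4 for DWARF32, 8 for DWARF64
      uint16_t Version;
      size_t HeaderBytes; // distance from header start to the first entry
    };

    // A 32-bit length of 0xffffffff escapes to DWARF64 with a 64-bit length
    // following; other lengths above 0xfffffff0 are reserved.
    static bool parseHeader(const uint8_t *P, Contribution &C) {
      uint32_t First;
      std::memcpy(&First, P, 4);
      if (First == 0xffffffffu) {
        std::memcpy(&C.Size, P + 4, 8);
        C.EntrySize = 8;
        C.HeaderBytes = 16; // 4 + 8 length bytes, 2 version, 2 padding
      } else if (First > 0xfffffff0u) {
        return false; // reserved length value: corrupt contribution
      } else {
        C.Size = First;
        C.EntrySize = 4;
        C.HeaderBytes = 8; // 4 length bytes, 2 version, 2 padding
      }
      std::memcpy(&C.Version, P + C.HeaderBytes - 4, 2);
      return true;
    }
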
+
+void DWARFContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
+ DIDumpType DumpType = DumpOpts.DumpType;
+ bool DumpEH = DumpOpts.DumpEH;
+ bool SummarizeTypes = DumpOpts.SummarizeTypes;
+
if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
OS << ".debug_abbrev contents:\n";
getDebugAbbrev()->dump(OS);
@@ -62,14 +207,14 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
if (DumpType == DIDT_All || DumpType == DIDT_Info) {
OS << "\n.debug_info contents:\n";
for (const auto &CU : compile_units())
- CU->dump(OS);
+ CU->dump(OS, DumpOpts);
}
if ((DumpType == DIDT_All || DumpType == DIDT_InfoDwo) &&
getNumDWOCompileUnits()) {
OS << "\n.debug_info.dwo contents:\n";
for (const auto &DWOCU : dwo_compile_units())
- DWOCU->dump(OS);
+ DWOCU->dump(OS, DumpOpts);
}
if ((DumpType == DIDT_All || DumpType == DIDT_Types) && getNumTypeUnits()) {
@@ -128,13 +273,12 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
auto CUDIE = CU->getUnitDIE();
if (!CUDIE)
continue;
- if (auto StmtOffset =
- CUDIE.getAttributeValueAsSectionOffset(DW_AT_stmt_list)) {
- DataExtractor lineData(getLineSection().Data, isLittleEndian(),
- savedAddressByteSize);
+ if (auto StmtOffset = toSectionOffset(CUDIE.find(DW_AT_stmt_list))) {
+ DWARFDataExtractor lineData(getLineSection(), isLittleEndian(),
+ savedAddressByteSize);
DWARFDebugLine::LineTable LineTable;
uint32_t Offset = *StmtOffset;
- LineTable.parse(lineData, &getLineSection().Relocs, &Offset);
+ LineTable.parse(lineData, &Offset);
LineTable.dump(OS);
}
}
@@ -153,8 +297,8 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
if (DumpType == DIDT_All || DumpType == DIDT_LineDwo) {
OS << "\n.debug_line.dwo contents:\n";
unsigned stmtOffset = 0;
- DataExtractor lineData(getLineDWOSection().Data, isLittleEndian(),
- savedAddressByteSize);
+ DWARFDataExtractor lineData(getLineDWOSection(), isLittleEndian(),
+ savedAddressByteSize);
DWARFDebugLine::LineTable LineTable;
while (LineTable.Prologue.parse(lineData, &stmtOffset)) {
LineTable.dump(OS);
@@ -191,8 +335,8 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
// sizes, but for simplicity we just use the address byte size of the last
// compile unit (there is no easy and fast way to associate address range
// list and the compile unit it describes).
- DataExtractor rangesData(getRangeSection(), isLittleEndian(),
- savedAddressByteSize);
+ DWARFDataExtractor rangesData(getRangeSection(), isLittleEndian(),
+ savedAddressByteSize);
offset = 0;
DWARFDebugRangeList rangeList;
while (rangeList.extract(rangesData, &offset))
@@ -217,17 +361,15 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
true /* GnuStyle */)
.dump("debug_gnu_pubtypes", OS);
- if ((DumpType == DIDT_All || DumpType == DIDT_StrOffsetsDwo) &&
- !getStringOffsetDWOSection().empty()) {
- OS << "\n.debug_str_offsets.dwo contents:\n";
- DataExtractor strOffsetExt(getStringOffsetDWOSection(), isLittleEndian(),
- 0);
- offset = 0;
- uint64_t size = getStringOffsetDWOSection().size();
- while (offset < size) {
- OS << format("0x%8.8x: ", offset);
- OS << format("%8.8x\n", strOffsetExt.getU32(&offset));
- }
+ if (DumpType == DIDT_All || DumpType == DIDT_StrOffsets)
+ dumpStringOffsetsSection(OS, "debug_str_offsets", getStringOffsetSection(),
+ getStringSection(), isLittleEndian(),
+ getMaxVersion());
+
+ if (DumpType == DIDT_All || DumpType == DIDT_StrOffsetsDwo) {
+ dumpStringOffsetsSection(OS, "debug_str_offsets.dwo",
+ getStringOffsetDWOSection(), getStringDWOSection(),
+ isLittleEndian(), getMaxVersion());
}
if ((DumpType == DIDT_All || DumpType == DIDT_GdbIndex) &&
@@ -253,6 +395,40 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
getStringSection(), isLittleEndian());
}
+DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) {
+ // FIXME: Improve this for the case where this DWO file is really a DWP file
+ // with an index - use the index for lookup instead of a linear search.
+ for (const auto &DWOCU : dwo_compile_units())
+ if (DWOCU->getDWOId() == Hash)
+ return DWOCU.get();
+ return nullptr;
+}
+
+DWARFDie DWARFContext::getDIEForOffset(uint32_t Offset) {
+ parseCompileUnits();
+ if (auto *CU = CUs.getUnitForOffset(Offset))
+ return CU->getDIEForOffset(Offset);
+ return DWARFDie();
+}
+
+bool DWARFContext::verify(raw_ostream &OS, DIDumpType DumpType) {
+ bool Success = true;
+ DWARFVerifier verifier(OS, *this);
+ if (DumpType == DIDT_All || DumpType == DIDT_Info) {
+ if (!verifier.handleDebugInfo())
+ Success = false;
+ }
+ if (DumpType == DIDT_All || DumpType == DIDT_Line) {
+ if (!verifier.handleDebugLine())
+ Success = false;
+ }
+ if (DumpType == DIDT_All || DumpType == DIDT_AppleNames) {
+ if (!verifier.handleAppleNames())
+ Success = false;
+ }
+ return Success;
+}
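
A short usage sketch of the new verify() entry point; DICtx is an already-constructed context.

    // Run all available verifiers and report the aggregate result.
    bool Ok = DICtx.verify(llvm::errs(), llvm::DIDT_All);
    if (!Ok)
      llvm::errs() << "error: DWARF verification failed\n";
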
+
const DWARFUnitIndex &DWARFContext::getCUIndex() {
if (CUIndex)
return *CUIndex;
@@ -310,11 +486,13 @@ const DWARFDebugLoc *DWARFContext::getDebugLoc() {
if (Loc)
return Loc.get();
- DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0);
- Loc.reset(new DWARFDebugLoc(getLocSection().Relocs));
+ Loc.reset(new DWARFDebugLoc);
// assume all compile units have the same address byte size
- if (getNumCompileUnits())
- Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize());
+ if (getNumCompileUnits()) {
+ DWARFDataExtractor LocData(getLocSection(), isLittleEndian(),
+ getCompileUnitAtIndex(0)->getAddressByteSize());
+ Loc->parse(LocData);
+ }
return Loc.get();
}
@@ -381,13 +559,13 @@ const DWARFDebugMacro *DWARFContext::getDebugMacro() {
const DWARFLineTable *
DWARFContext::getLineTableForUnit(DWARFUnit *U) {
if (!Line)
- Line.reset(new DWARFDebugLine(&getLineSection().Relocs));
+ Line.reset(new DWARFDebugLine);
auto UnitDIE = U->getUnitDIE();
if (!UnitDIE)
return nullptr;
- auto Offset = UnitDIE.getAttributeValueAsSectionOffset(DW_AT_stmt_list);
+ auto Offset = toSectionOffset(UnitDIE.find(DW_AT_stmt_list));
if (!Offset)
return nullptr; // No line table for this compile unit.
@@ -396,9 +574,13 @@ DWARFContext::getLineTableForUnit(DWARFUnit *U) {
if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset))
return lt;
+ // Make sure the offset is good before we try to parse.
+ if (stmtOffset >= U->getLineSection().Data.size())
+ return nullptr;
+
// We have to parse it first.
- DataExtractor lineData(U->getLineSection(), isLittleEndian(),
- U->getAddressByteSize());
+ DWARFDataExtractor lineData(U->getLineSection(), isLittleEndian(),
+ U->getAddressByteSize());
return Line->getOrParseLineTable(lineData, stmtOffset);
}
@@ -409,10 +591,10 @@ void DWARFContext::parseCompileUnits() {
void DWARFContext::parseTypeUnits() {
if (!TUs.empty())
return;
- for (const auto &I : getTypesSections()) {
+ forEachTypesSections([&](const DWARFSection &S) {
TUs.emplace_back();
- TUs.back().parse(*this, I.second);
- }
+ TUs.back().parse(*this, S);
+ });
}
void DWARFContext::parseDWOCompileUnits() {
@@ -422,10 +604,10 @@ void DWARFContext::parseDWOCompileUnits() {
void DWARFContext::parseDWOTypeUnits() {
if (!DWOTUs.empty())
return;
- for (const auto &I : getTypesDWOSections()) {
+ forEachTypesDWOSections([&](const DWARFSection &S) {
DWOTUs.emplace_back();
- DWOTUs.back().parseDWO(*this, I.second);
- }
+ DWOTUs.back().parseDWO(*this, S);
+ });
}
DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
@@ -440,23 +622,32 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
return getCompileUnitForOffset(CUOffset);
}
-static bool getFunctionNameForAddress(DWARFCompileUnit *CU, uint64_t Address,
- FunctionNameKind Kind,
- std::string &FunctionName) {
- if (Kind == FunctionNameKind::None)
- return false;
+static bool getFunctionNameAndStartLineForAddress(DWARFCompileUnit *CU,
+ uint64_t Address,
+ FunctionNameKind Kind,
+ std::string &FunctionName,
+ uint32_t &StartLine) {
  // The address may correspond to an instruction in some inlined function,
// so we have to build the chain of inlined functions and take the
- // name of the topmost function in it.SmallVectorImpl<DWARFDie> &InlinedChain
+ // name of the topmost function in it.
SmallVector<DWARFDie, 4> InlinedChain;
CU->getInlinedChainForAddress(Address, InlinedChain);
- if (InlinedChain.size() == 0)
+ if (InlinedChain.empty())
return false;
- if (const char *Name = InlinedChain[0].getSubroutineName(Kind)) {
+
+ const DWARFDie &DIE = InlinedChain[0];
+ bool FoundResult = false;
+ const char *Name = nullptr;
+ if (Kind != FunctionNameKind::None && (Name = DIE.getSubroutineName(Kind))) {
FunctionName = Name;
- return true;
+ FoundResult = true;
+ }
+ if (auto DeclLineResult = DIE.getDeclLine()) {
+ StartLine = DeclLineResult;
+ FoundResult = true;
}
- return false;
+
+ return FoundResult;
}
DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
@@ -466,7 +657,9 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
if (!CU)
return Result;
- getFunctionNameForAddress(CU, Address, Spec.FNKind, Result.FunctionName);
+ getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind,
+ Result.FunctionName,
+ Result.StartLine);
if (Spec.FLIKind != FileLineInfoKind::None) {
if (const DWARFLineTable *LineTable = getLineTableForUnit(CU))
LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
@@ -484,13 +677,16 @@ DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
return Lines;
std::string FunctionName = "<invalid>";
- getFunctionNameForAddress(CU, Address, Spec.FNKind, FunctionName);
+ uint32_t StartLine = 0;
+ getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind, FunctionName,
+ StartLine);
// If the Specifier says we don't need FileLineInfo, just
// return the top-most function at the starting address.
if (Spec.FLIKind == FileLineInfoKind::None) {
DILineInfo Result;
Result.FunctionName = FunctionName;
+ Result.StartLine = StartLine;
Lines.push_back(std::make_pair(Address, Result));
return Lines;
}
@@ -511,6 +707,7 @@ DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
Result.FunctionName = FunctionName;
Result.Line = Row.Line;
Result.Column = Row.Column;
+ Result.StartLine = StartLine;
Lines.push_back(std::make_pair(Row.Address, Result));
}
@@ -543,13 +740,15 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
return InliningInfo;
}
- uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
+ uint32_t CallFile = 0, CallLine = 0, CallColumn = 0, CallDiscriminator = 0;
for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) {
DWARFDie &FunctionDIE = InlinedChain[i];
DILineInfo Frame;
// Get function name if necessary.
if (const char *Name = FunctionDIE.getSubroutineName(Spec.FNKind))
Frame.FunctionName = Name;
+ if (auto DeclLineResult = FunctionDIE.getDeclLine())
+ Frame.StartLine = DeclLineResult;
if (Spec.FLIKind != FileLineInfoKind::None) {
if (i == 0) {
// For the topmost frame, initialize the line table of this
@@ -567,10 +766,12 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
Spec.FLIKind, Frame.FileName);
Frame.Line = CallLine;
Frame.Column = CallColumn;
+ Frame.Discriminator = CallDiscriminator;
}
// Get call file/line/column of a current DIE.
if (i + 1 < n) {
- FunctionDIE.getCallerFrame(CallFile, CallLine, CallColumn);
+ FunctionDIE.getCallerFrame(CallFile, CallLine, CallColumn,
+ CallDiscriminator);
}
}
InliningInfo.addFrame(Frame);
@@ -578,91 +779,207 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
return InliningInfo;
}
-DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
- const LoadedObjectInfo *L)
- : IsLittleEndian(Obj.isLittleEndian()),
+std::shared_ptr<DWARFContext>
+DWARFContext::getDWOContext(StringRef AbsolutePath) {
+ if (auto S = DWP.lock()) {
+ DWARFContext *Ctxt = S->Context.get();
+ return std::shared_ptr<DWARFContext>(std::move(S), Ctxt);
+ }
+
+ std::weak_ptr<DWOFile> *Entry = &DWOFiles[AbsolutePath];
+
+ if (auto S = Entry->lock()) {
+ DWARFContext *Ctxt = S->Context.get();
+ return std::shared_ptr<DWARFContext>(std::move(S), Ctxt);
+ }
+
+ SmallString<128> DWPName;
+ Expected<OwningBinary<ObjectFile>> Obj = [&] {
+ if (!CheckedForDWP) {
+ (getFileName() + ".dwp").toVector(DWPName);
+ auto Obj = object::ObjectFile::createObjectFile(DWPName);
+ if (Obj) {
+ Entry = &DWP;
+ return Obj;
+ } else {
+ CheckedForDWP = true;
+ // TODO: Should this error be handled (maybe in a high verbosity mode)
+ // before falling back to .dwo files?
+ consumeError(Obj.takeError());
+ }
+ }
+
+ return object::ObjectFile::createObjectFile(AbsolutePath);
+ }();
+
+ if (!Obj) {
+ // TODO: Actually report errors helpfully.
+ consumeError(Obj.takeError());
+ return nullptr;
+ }
+
+ auto S = std::make_shared<DWOFile>();
+ S->File = std::move(Obj.get());
+ S->Context = llvm::make_unique<DWARFContextInMemory>(*S->File.getBinary());
+ *Entry = S;
+ auto *Ctxt = S->Context.get();
+ return std::shared_ptr<DWARFContext>(std::move(S), Ctxt);
+}
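
getDWOContext returns through the shared_ptr aliasing constructor: the control block keeps the whole DWOFile alive while the visible pointee is the contained DWARFContext. A self-contained illustration of the same pattern:

    #include <memory>

    struct Owner {
      int Member = 42;
    };

    // The result shares ownership of the whole Owner but exposes only its
    // member; the Owner is destroyed when the last alias is dropped.
    std::shared_ptr<int> memberView() {
      auto S = std::make_shared<Owner>();
      int *P = &S->Member;                          // grab the member first,
      return std::shared_ptr<int>(std::move(S), P); // then alias into it
    }
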
+
+static Error createError(const Twine &Reason, llvm::Error E) {
+ return make_error<StringError>(Reason + toString(std::move(E)),
+ inconvertibleErrorCode());
+}
+
+/// SymInfo contains information about a symbol: its address and its
+/// section index, which is -1LL for absolute symbols.
+struct SymInfo {
+ uint64_t Address;
+ uint64_t SectionIndex;
+};
+
+/// Returns the address of the symbol the relocation is applied against, plus
+/// the index of the section that contains it, for use in further relocation
+/// computation. The symbol's section load address is folded into the result
+/// when load information is available.
+static Expected<SymInfo> getSymbolInfo(const object::ObjectFile &Obj,
+ const RelocationRef &Reloc,
+ const LoadedObjectInfo *L,
+ std::map<SymbolRef, SymInfo> &Cache) {
+ SymInfo Ret = {0, (uint64_t)-1LL};
+ object::section_iterator RSec = Obj.section_end();
+ object::symbol_iterator Sym = Reloc.getSymbol();
+
+ std::map<SymbolRef, SymInfo>::iterator CacheIt = Cache.end();
+ // First calculate the address of the symbol or section as it appears
+ // in the object file
+ if (Sym != Obj.symbol_end()) {
+ bool New;
+ std::tie(CacheIt, New) = Cache.insert({*Sym, {0, 0}});
+ if (!New)
+ return CacheIt->second;
+
+ Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
+ if (!SymAddrOrErr)
+ return createError("failed to compute symbol address: ",
+ SymAddrOrErr.takeError());
+
+ // Also remember what section this symbol is in for later
+ auto SectOrErr = Sym->getSection();
+ if (!SectOrErr)
+ return createError("failed to get symbol section: ",
+ SectOrErr.takeError());
+
+ RSec = *SectOrErr;
+ Ret.Address = *SymAddrOrErr;
+ } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
+ RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl());
+ Ret.Address = RSec->getAddress();
+ }
+
+ if (RSec != Obj.section_end())
+ Ret.SectionIndex = RSec->getIndex();
+
+ // If we are given load addresses for the sections, we need to adjust:
+ // SymAddr = (Address of Symbol Or Section in File) -
+ // (Address of Section in File) +
+ // (Load Address of Section)
+ // RSec is now either the section being targeted or the section
+ // containing the symbol being targeted. In either case,
+ // we need to perform the same computation.
+ if (L && RSec != Obj.section_end())
+ if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec))
+ Ret.Address += SectionLoadAddress - RSec->getAddress();
+
+ if (CacheIt != Cache.end())
+ CacheIt->second = Ret;
+
+ return Ret;
+}
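
A worked instance of the adjustment formula in the comment above, with made-up addresses:

    // Symbol at file address 0x2010, in a section whose file address is
    // 0x2000 but which the loader placed at 0x7f0000002000:
    uint64_t SymAddrInFile = 0x2010;
    uint64_t SectAddrInFile = 0x2000;
    uint64_t SectLoadAddr = 0x7f0000002000;
    uint64_t Adjusted = SymAddrInFile + (SectLoadAddr - SectAddrInFile);
    // Adjusted == 0x7f0000002010
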
+
+static bool isRelocScattered(const object::ObjectFile &Obj,
+ const RelocationRef &Reloc) {
+ const MachOObjectFile *MachObj = dyn_cast<MachOObjectFile>(&Obj);
+ if (!MachObj)
+ return false;
+ // MachO also has relocations that point to sections and
+ // scattered relocations.
+ auto RelocInfo = MachObj->getRelocation(Reloc.getRawDataRefImpl());
+ return MachObj->isRelocationScattered(RelocInfo);
+}
+
+Error DWARFContextInMemory::maybeDecompress(const SectionRef &Sec,
+ StringRef Name, StringRef &Data) {
+ if (!Decompressor::isCompressed(Sec))
+ return Error::success();
+
+ Expected<Decompressor> Decompressor =
+ Decompressor::create(Name, Data, IsLittleEndian, AddressSize == 8);
+ if (!Decompressor)
+ return Decompressor.takeError();
+
+ SmallString<32> Out;
+ if (auto Err = Decompressor->resizeAndDecompress(Out))
+ return Err;
+
+ UncompressedSections.emplace_back(std::move(Out));
+ Data = UncompressedSections.back();
+
+ return Error::success();
+}
+
+ErrorPolicy DWARFContextInMemory::defaultErrorHandler(Error E) {
+ errs() << "error: " + toString(std::move(E)) << '\n';
+ return ErrorPolicy::Continue;
+}
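
A hedged sketch of supplying a non-default handler via the new HandleError parameter; Obj is an already-loaded ObjectFile, and ErrorPolicy is assumed to live in the llvm namespace as the signatures above suggest.

    // Keep parsing on recoverable errors, but remember the input was not
    // clean. The lambda satisfies function_ref<ErrorPolicy(Error)>.
    bool SawError = false;
    llvm::DWARFContextInMemory DICtx(
        Obj, /*L=*/nullptr, [&](llvm::Error E) {
          SawError = true;
          llvm::errs() << "warning: " << llvm::toString(std::move(E)) << '\n';
          return llvm::ErrorPolicy::Continue;
        });
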
+
+DWARFContextInMemory::DWARFContextInMemory(
+ const object::ObjectFile &Obj, const LoadedObjectInfo *L,
+ function_ref<ErrorPolicy(Error)> HandleError)
+ : FileName(Obj.getFileName()), IsLittleEndian(Obj.isLittleEndian()),
AddressSize(Obj.getBytesInAddress()) {
for (const SectionRef &Section : Obj.sections()) {
- StringRef name;
- Section.getName(name);
+ StringRef Name;
+ Section.getName(Name);
    // Skip BSS and virtual sections; they aren't interesting.
- bool IsBSS = Section.isBSS();
- if (IsBSS)
- continue;
- bool IsVirtual = Section.isVirtual();
- if (IsVirtual)
+ if (Section.isBSS() || Section.isVirtual())
continue;
- StringRef data;
+ StringRef Data;
section_iterator RelocatedSection = Section.getRelocatedSection();
// Try to obtain an already relocated version of this section.
// Else use the unrelocated section from the object file. We'll have to
// apply relocations ourselves later.
- if (!L || !L->getLoadedSectionContents(*RelocatedSection,data))
- Section.getContents(data);
-
- if (Decompressor::isCompressed(Section)) {
- Expected<Decompressor> Decompressor =
- Decompressor::create(name, data, IsLittleEndian, AddressSize == 8);
- if (!Decompressor)
- continue;
- SmallString<32> Out;
- if (auto Err = Decompressor->decompress(Out))
- continue;
- UncompressedSections.emplace_back(std::move(Out));
- data = UncompressedSections.back();
+ if (!L || !L->getLoadedSectionContents(*RelocatedSection, Data))
+ Section.getContents(Data);
+
+ if (auto Err = maybeDecompress(Section, Name, Data)) {
+ ErrorPolicy EP = HandleError(
+ createError("failed to decompress '" + Name + "', ", std::move(Err)));
+ if (EP == ErrorPolicy::Halt)
+ return;
+ continue;
}
    // Compressed section names in GNU style start with ".z"; at this point
    // the section is decompressed and we drop the compression prefix.
- name = name.substr(
- name.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes.
-
- StringRef *SectionData =
- StringSwitch<StringRef *>(name)
- .Case("debug_info", &InfoSection.Data)
- .Case("debug_abbrev", &AbbrevSection)
- .Case("debug_loc", &LocSection.Data)
- .Case("debug_line", &LineSection.Data)
- .Case("debug_aranges", &ARangeSection)
- .Case("debug_frame", &DebugFrameSection)
- .Case("eh_frame", &EHFrameSection)
- .Case("debug_str", &StringSection)
- .Case("debug_ranges", &RangeSection)
- .Case("debug_macinfo", &MacinfoSection)
- .Case("debug_pubnames", &PubNamesSection)
- .Case("debug_pubtypes", &PubTypesSection)
- .Case("debug_gnu_pubnames", &GnuPubNamesSection)
- .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
- .Case("debug_info.dwo", &InfoDWOSection.Data)
- .Case("debug_abbrev.dwo", &AbbrevDWOSection)
- .Case("debug_loc.dwo", &LocDWOSection.Data)
- .Case("debug_line.dwo", &LineDWOSection.Data)
- .Case("debug_str.dwo", &StringDWOSection)
- .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
- .Case("debug_addr", &AddrSection)
- .Case("apple_names", &AppleNamesSection.Data)
- .Case("apple_types", &AppleTypesSection.Data)
- .Case("apple_namespaces", &AppleNamespacesSection.Data)
- .Case("apple_namespac", &AppleNamespacesSection.Data)
- .Case("apple_objc", &AppleObjCSection.Data)
- .Case("debug_cu_index", &CUIndexSection)
- .Case("debug_tu_index", &TUIndexSection)
- .Case("gdb_index", &GdbIndexSection)
- // Any more debug info sections go here.
- .Default(nullptr);
- if (SectionData) {
- *SectionData = data;
- if (name == "debug_ranges") {
+ Name = Name.substr(
+ Name.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes.
+
+ // Map platform specific debug section names to DWARF standard section
+ // names.
+ Name = Obj.mapDebugSectionName(Name);
+
+ if (StringRef *SectionData = mapSectionToMember(Name)) {
+ *SectionData = Data;
+ if (Name == "debug_ranges") {
// FIXME: Use the other dwo range section when we emit it.
- RangeDWOSection = data;
+ RangeDWOSection.Data = Data;
}
- } else if (name == "debug_types") {
+ } else if (Name == "debug_types") {
// Find debug_types data by section rather than name as there are
// multiple, comdat grouped, debug_types sections.
- TypesSections[Section].Data = data;
- } else if (name == "debug_types.dwo") {
- TypesDWOSections[Section].Data = data;
+ TypesSections[Section].Data = Data;
+ } else if (Name == "debug_types.dwo") {
+ TypesDWOSections[Section].Data = Data;
}
if (RelocatedSection == Obj.section_end())
@@ -675,7 +992,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
// If the section we're relocating was relocated already by the JIT,
// then we used the relocated version above, so we do not need to process
// relocations for it now.
- if (L && L->getLoadedSectionContents(*RelocatedSection,RelSecData))
+ if (L && L->getLoadedSectionContents(*RelocatedSection, RelSecData))
continue;
// In Mach-o files, the relocations do not need to be applied if
@@ -687,21 +1004,12 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
continue;
RelSecName = RelSecName.substr(
- RelSecName.find_first_not_of("._")); // Skip . and _ prefixes.
+        RelSecName.find_first_not_of("._z")); // Skip ".", "z" and "_" prefixes.
// TODO: Add support for relocations in other sections as needed.
// Record relocations for the debug_info and debug_line sections.
- RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(RelSecName)
- .Case("debug_info", &InfoSection.Relocs)
- .Case("debug_loc", &LocSection.Relocs)
- .Case("debug_info.dwo", &InfoDWOSection.Relocs)
- .Case("debug_line", &LineSection.Relocs)
- .Case("apple_names", &AppleNamesSection.Relocs)
- .Case("apple_types", &AppleTypesSection.Relocs)
- .Case("apple_namespaces", &AppleNamespacesSection.Relocs)
- .Case("apple_namespac", &AppleNamespacesSection.Relocs)
- .Case("apple_objc", &AppleObjCSection.Relocs)
- .Default(nullptr);
+ DWARFSection *Sec = mapNameToDWARFSection(RelSecName);
+ RelocAddrMap *Map = Sec ? &Sec->Relocs : nullptr;
if (!Map) {
// Find debug_types relocs by section rather than name as there are
// multiple, comdat grouped, debug_types sections.
@@ -713,103 +1021,93 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
continue;
}
- if (Section.relocation_begin() != Section.relocation_end()) {
- uint64_t SectionSize = RelocatedSection->getSize();
- for (const RelocationRef &Reloc : Section.relocations()) {
- uint64_t Address = Reloc.getOffset();
- uint64_t Type = Reloc.getType();
- uint64_t SymAddr = 0;
- uint64_t SectionLoadAddress = 0;
- object::symbol_iterator Sym = Reloc.getSymbol();
- object::section_iterator RSec = Obj.section_end();
-
- // First calculate the address of the symbol or section as it appears
- // in the objct file
- if (Sym != Obj.symbol_end()) {
- Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
- if (!SymAddrOrErr) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- logAllUnhandledErrors(SymAddrOrErr.takeError(), OS, "");
- OS.flush();
- errs() << "error: failed to compute symbol address: "
- << Buf << '\n';
- continue;
- }
- SymAddr = *SymAddrOrErr;
- // Also remember what section this symbol is in for later
- auto SectOrErr = Sym->getSection();
- if (!SectOrErr) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- logAllUnhandledErrors(SectOrErr.takeError(), OS, "");
- OS.flush();
- errs() << "error: failed to get symbol section: "
- << Buf << '\n';
- continue;
- }
- RSec = *SectOrErr;
- } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
- // MachO also has relocations that point to sections and
- // scattered relocations.
- auto RelocInfo = MObj->getRelocation(Reloc.getRawDataRefImpl());
- if (MObj->isRelocationScattered(RelocInfo)) {
- // FIXME: it's not clear how to correctly handle scattered
- // relocations.
- continue;
- } else {
- RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl());
- SymAddr = RSec->getAddress();
- }
- }
-
- // If we are given load addresses for the sections, we need to adjust:
- // SymAddr = (Address of Symbol Or Section in File) -
- // (Address of Section in File) +
- // (Load Address of Section)
- if (L != nullptr && RSec != Obj.section_end()) {
- // RSec is now either the section being targeted or the section
- // containing the symbol being targeted. In either case,
- // we need to perform the same computation.
- StringRef SecName;
- RSec->getName(SecName);
-// llvm::dbgs() << "Name: '" << SecName
-// << "', RSec: " << RSec->getRawDataRefImpl()
-// << ", Section: " << Section.getRawDataRefImpl() << "\n";
- SectionLoadAddress = L->getSectionLoadAddress(*RSec);
- if (SectionLoadAddress != 0)
- SymAddr += SectionLoadAddress - RSec->getAddress();
- }
-
- object::RelocVisitor V(Obj);
- object::RelocToApply R(V.visit(Type, Reloc, SymAddr));
- if (V.error()) {
- SmallString<32> Name;
- Reloc.getTypeName(Name);
- errs() << "error: failed to compute relocation: "
- << Name << "\n";
- continue;
- }
-
- if (Address + R.Width > SectionSize) {
- errs() << "error: " << R.Width << "-byte relocation starting "
- << Address << " bytes into section " << name << " which is "
- << SectionSize << " bytes long.\n";
- continue;
- }
- if (R.Width > 8) {
- errs() << "error: can't handle a relocation of more than 8 bytes at "
- "a time.\n";
- continue;
- }
- DEBUG(dbgs() << "Writing " << format("%p", R.Value)
- << " at " << format("%p", Address)
- << " with width " << format("%d", R.Width)
- << "\n");
- Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value)));
+ if (Section.relocation_begin() == Section.relocation_end())
+ continue;
+
+ // Symbol to [address, section index] cache mapping.
+ std::map<SymbolRef, SymInfo> AddrCache;
+ for (const RelocationRef &Reloc : Section.relocations()) {
+ // FIXME: it's not clear how to correctly handle scattered
+ // relocations.
+ if (isRelocScattered(Obj, Reloc))
+ continue;
+
+ Expected<SymInfo> SymInfoOrErr = getSymbolInfo(Obj, Reloc, L, AddrCache);
+ if (!SymInfoOrErr) {
+ if (HandleError(SymInfoOrErr.takeError()) == ErrorPolicy::Halt)
+ return;
+ continue;
+ }
+
+ object::RelocVisitor V(Obj);
+ uint64_t Val = V.visit(Reloc.getType(), Reloc, SymInfoOrErr->Address);
+ if (V.error()) {
+ SmallString<32> Type;
+ Reloc.getTypeName(Type);
+ ErrorPolicy EP = HandleError(
+ createError("failed to compute relocation: " + Type + ", ",
+ errorCodeToError(object_error::parse_failed)));
+ if (EP == ErrorPolicy::Halt)
+ return;
+ continue;
}
+ RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val};
+ Map->insert({Reloc.getOffset(), Rel});
}
}
}
-void DWARFContextInMemory::anchor() { }
+DWARFContextInMemory::DWARFContextInMemory(
+ const StringMap<std::unique_ptr<MemoryBuffer>> &Sections, uint8_t AddrSize,
+ bool isLittleEndian)
+ : IsLittleEndian(isLittleEndian), AddressSize(AddrSize) {
+ for (const auto &SecIt : Sections) {
+ if (StringRef *SectionData = mapSectionToMember(SecIt.first()))
+ *SectionData = SecIt.second->getBuffer();
+ }
+}
+
+DWARFSection *DWARFContextInMemory::mapNameToDWARFSection(StringRef Name) {
+ return StringSwitch<DWARFSection *>(Name)
+ .Case("debug_info", &InfoSection)
+ .Case("debug_loc", &LocSection)
+ .Case("debug_line", &LineSection)
+ .Case("debug_str_offsets", &StringOffsetSection)
+ .Case("debug_ranges", &RangeSection)
+ .Case("debug_info.dwo", &InfoDWOSection)
+ .Case("debug_loc.dwo", &LocDWOSection)
+ .Case("debug_line.dwo", &LineDWOSection)
+ .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
+ .Case("debug_addr", &AddrSection)
+ .Case("apple_names", &AppleNamesSection)
+ .Case("apple_types", &AppleTypesSection)
+ .Case("apple_namespaces", &AppleNamespacesSection)
+ .Case("apple_namespac", &AppleNamespacesSection)
+ .Case("apple_objc", &AppleObjCSection)
+ .Default(nullptr);
+}
+
+StringRef *DWARFContextInMemory::mapSectionToMember(StringRef Name) {
+ if (DWARFSection *Sec = mapNameToDWARFSection(Name))
+ return &Sec->Data;
+ return StringSwitch<StringRef *>(Name)
+ .Case("debug_abbrev", &AbbrevSection)
+ .Case("debug_aranges", &ARangeSection)
+ .Case("debug_frame", &DebugFrameSection)
+ .Case("eh_frame", &EHFrameSection)
+ .Case("debug_str", &StringSection)
+ .Case("debug_macinfo", &MacinfoSection)
+ .Case("debug_pubnames", &PubNamesSection)
+ .Case("debug_pubtypes", &PubTypesSection)
+ .Case("debug_gnu_pubnames", &GnuPubNamesSection)
+ .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
+ .Case("debug_abbrev.dwo", &AbbrevDWOSection)
+ .Case("debug_str.dwo", &StringDWOSection)
+ .Case("debug_cu_index", &CUIndexSection)
+ .Case("debug_tu_index", &TUIndexSection)
+ .Case("gdb_index", &GdbIndexSection)
+ // Any more debug info sections go here.
+ .Default(nullptr);
+}
+
+void DWARFContextInMemory::anchor() {}
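A minimal standalone sketch of the section-name normalization used above: leading '.', '_' and 'z' (compressed-section) characters are stripped before the name is matched. The helper and names below are illustrative only, not the LLVM API.

#include <cstdio>
#include <string>

// Strip the leading '.', '_' and 'z' characters, as the constructor does.
static std::string normalize(std::string Name) {
  return Name.substr(Name.find_first_not_of("._z"));
}

int main() {
  // ".zdebug_info" (compressed), "__debug_info" (Mach-O style) and
  // ".debug_info" (ELF) all collapse to the same lookup key.
  for (const char *S : {".zdebug_info", "__debug_info", ".debug_info"})
    std::printf("%s -> %s\n", S, normalize(S).c_str());
  return 0;
}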
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
new file mode 100644
index 0000000..001097e
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
@@ -0,0 +1,24 @@
+//===- DWARFDataExtractor.cpp ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+
+using namespace llvm;
+
+uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint32_t *Off,
+ uint64_t *SecNdx) const {
+ if (!RelocMap)
+ return getUnsigned(Off, Size);
+ RelocAddrMap::const_iterator AI = RelocMap->find(*Off);
+ if (AI == RelocMap->end())
+ return getUnsigned(Off, Size);
+ if (SecNdx)
+ *SecNdx = AI->second.SectionIndex;
+ return getUnsigned(Off, Size) + AI->second.Value;
+}
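A rough model of what getRelocatedValue does, assuming a simplified relocation map keyed by section offset; the struct and function names here are stand-ins, not the real RelocAddrMap API.

#include <cstdint>
#include <map>

// Hypothetical stand-in for RelocAddrEntry: the index of the section the
// relocation target lives in, plus the precomputed value to add.
struct Entry { uint64_t SectionIndex; uint64_t Value; };

// Add the addend when a relocation covers Offset; otherwise return the
// raw value unchanged, as getRelocatedValue above does.
uint64_t relocate(const std::map<uint32_t, Entry> &Relocs, uint32_t Offset,
                  uint64_t Raw, uint64_t *SecNdx = nullptr) {
  auto It = Relocs.find(Offset);
  if (It == Relocs.end())
    return Raw;
  if (SecNdx)
    *SecNdx = It->second.SectionIndex;
  return Raw + It->second.Value;
}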
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
index e63e289..76dd2e4 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
+//===- DWARFDebugAbbrev.cpp -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,6 +10,10 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
+
using namespace llvm;
DWARFAbbreviationDeclarationSet::DWARFAbbreviationDeclarationSet() {
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
index 67589cd..ed5d726 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugArangeSet.cpp -------------------------------------------===//
+//===- DWARFDebugArangeSet.cpp --------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,8 +10,11 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
+
using namespace llvm;
void DWARFDebugArangeSet::clear() {
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index 27a02c4..6601393 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAranges.cpp -----------------------------------*- C++ -*-===//
+//===- DWARFDebugAranges.cpp ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,11 +11,13 @@
#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/DataExtractor.h"
#include <algorithm>
#include <cassert>
+#include <cstdint>
#include <set>
+#include <vector>
+
using namespace llvm;
void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
@@ -52,9 +54,8 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) {
if (ParsedCUOffsets.insert(CUOffset).second) {
DWARFAddressRangesVector CURanges;
CU->collectAddressRanges(CURanges);
- for (const auto &R : CURanges) {
- appendRange(CUOffset, R.first, R.second);
- }
+ for (const auto &R : CURanges)
+ appendRange(CUOffset, R.LowPC, R.HighPC);
}
}
@@ -81,7 +82,7 @@ void DWARFDebugAranges::construct() {
std::sort(Endpoints.begin(), Endpoints.end());
uint64_t PrevAddress = -1ULL;
for (const auto &E : Endpoints) {
- if (PrevAddress < E.Address && ValidCUs.size() > 0) {
+ if (PrevAddress < E.Address && !ValidCUs.empty()) {
// If the address range between two endpoints is described by some
// CU, first try to extend the last range in Aranges. If we can't
// do it, start a new range.
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 32b8320..475cf25 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//===- DWARFDebugFrame.h - Parsing of .debug_frame ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,14 +11,14 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataExtractor.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -70,7 +70,7 @@ protected:
/// An entry may contain CFI instructions. An instruction consists of an
/// opcode and an optional sequence of operands.
- typedef std::vector<uint64_t> Operands;
+ using Operands = std::vector<uint64_t>;
struct Instruction {
Instruction(uint8_t Opcode)
: Opcode(Opcode)
@@ -465,8 +465,7 @@ void FrameEntry::dumpInstructions(raw_ostream &OS) const {
}
}
-DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {
-}
+DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {}
DWARFDebugFrame::~DWARFDebugFrame() = default;
@@ -485,17 +484,17 @@ static unsigned getSizeForEncoding(const DataExtractor &Data,
unsigned format = symbolEncoding & 0x0f;
switch (format) {
default: llvm_unreachable("Unknown Encoding");
- case dwarf::DW_EH_PE_absptr:
- case dwarf::DW_EH_PE_signed:
+ case DW_EH_PE_absptr:
+ case DW_EH_PE_signed:
return Data.getAddressSize();
- case dwarf::DW_EH_PE_udata2:
- case dwarf::DW_EH_PE_sdata2:
+ case DW_EH_PE_udata2:
+ case DW_EH_PE_sdata2:
return 2;
- case dwarf::DW_EH_PE_udata4:
- case dwarf::DW_EH_PE_sdata4:
+ case DW_EH_PE_udata4:
+ case DW_EH_PE_sdata4:
return 4;
- case dwarf::DW_EH_PE_udata8:
- case dwarf::DW_EH_PE_sdata8:
+ case DW_EH_PE_udata8:
+ case DW_EH_PE_sdata8:
return 8;
}
}
@@ -514,6 +513,19 @@ static uint64_t readPointer(const DataExtractor &Data, uint32_t &Offset,
}
}
+// This is a workaround for old compilers which do not allow
+// noreturn attribute usage in lambdas. Once support for those
+// compilers is phased out, we can remove this and go back to
+// a ReportError lambda: [StartOffset](const char *ErrorMsg).
+static void LLVM_ATTRIBUTE_NORETURN ReportError(uint32_t StartOffset,
+ const char *ErrorMsg) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << format(ErrorMsg, StartOffset);
+ OS.flush();
+ report_fatal_error(Str);
+}
+
void DWARFDebugFrame::parse(DataExtractor Data) {
uint32_t Offset = 0;
DenseMap<uint32_t, CIE *> CIEs;
@@ -521,14 +533,6 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
while (Data.isValidOffset(Offset)) {
uint32_t StartOffset = Offset;
- auto ReportError = [StartOffset](const char *ErrorMsg) {
- std::string Str;
- raw_string_ostream OS(Str);
- OS << format(ErrorMsg, StartOffset);
- OS.flush();
- report_fatal_error(Str);
- };
-
bool IsDWARF64 = false;
uint64_t Length = Data.getU32(&Offset);
uint64_t Id;
@@ -584,13 +588,15 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
for (unsigned i = 0, e = AugmentationString.size(); i != e; ++i) {
switch (AugmentationString[i]) {
default:
- ReportError("Unknown augmentation character in entry at %lx");
+ ReportError(StartOffset,
+ "Unknown augmentation character in entry at %lx");
case 'L':
LSDAPointerEncoding = Data.getU8(&Offset);
break;
case 'P': {
if (Personality)
- ReportError("Duplicate personality in entry at %lx");
+ ReportError(StartOffset,
+ "Duplicate personality in entry at %lx");
PersonalityEncoding = Data.getU8(&Offset);
Personality = readPointer(Data, Offset, *PersonalityEncoding);
break;
@@ -600,7 +606,8 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
break;
case 'z':
if (i)
- ReportError("'z' must be the first character at %lx");
+ ReportError(StartOffset,
+ "'z' must be the first character at %lx");
// Parse the augmentation length first. We only parse it if
// the string contains a 'z'.
AugmentationLength = Data.getULEB128(&Offset);
@@ -612,7 +619,7 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
if (AugmentationLength.hasValue()) {
if (Offset != EndAugmentationOffset)
- ReportError("Parsing augmentation data at %lx failed");
+ ReportError(StartOffset, "Parsing augmentation data at %lx failed");
AugmentationData = Data.getData().slice(StartAugmentationOffset,
EndAugmentationOffset);
@@ -639,7 +646,8 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
if (IsEH) {
// The address size is encoded in the CIE we reference.
if (!Cie)
- ReportError("Parsing FDE data at %lx failed due to missing CIE");
+ ReportError(StartOffset,
+ "Parsing FDE data at %lx failed due to missing CIE");
InitialLocation = readPointer(Data, Offset,
Cie->getFDEPointerEncoding());
@@ -659,7 +667,7 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
readPointer(Data, Offset, Cie->getLSDAPointerEncoding());
if (Offset != EndAugmentationOffset)
- ReportError("Parsing augmentation data at %lx failed");
+ ReportError(StartOffset, "Parsing augmentation data at %lx failed");
}
} else {
InitialLocation = Data.getAddress(&Offset);
@@ -674,7 +682,7 @@ void DWARFDebugFrame::parse(DataExtractor Data) {
Entries.back()->parseInstructions(Data, &Offset, EndStructureOffset);
if (Offset != EndStructureOffset)
- ReportError("Parsing entry instructions at %lx failed");
+ ReportError(StartOffset, "Parsing entry instructions at %lx failed");
}
}
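The augmentation-string handling above reduces to one check per character; a toy validator under the same rules ('z' must come first, then 'L', 'P' and 'R' each announce extra operands), with the operand reads themselves omitted:

#include <cstdio>
#include <string>

// Accept or reject a CIE augmentation string the way the parser above
// does; the real code also consumes the operands each letter implies.
bool scanAugmentation(const std::string &Aug) {
  for (size_t I = 0; I != Aug.size(); ++I) {
    switch (Aug[I]) {
    case 'z':
      if (I != 0)
        return false;  // 'z' must be the first character
      break;           // an augmentation length (ULEB128) follows
    case 'L':          // LSDA pointer encoding byte follows
    case 'P':          // personality encoding and pointer follow
    case 'R':          // FDE pointer encoding byte follows
      break;
    default:
      return false;    // unknown augmentation character
    }
  }
  return true;
}

int main() {
  std::printf("zR ok: %d, Rz ok: %d\n", scanAugmentation("zR"),
              scanAugmentation("Rz"));
}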
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
index c487e1d..976bc46 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
+//===- DWARFDebugInfoEntry.cpp --------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,29 +7,27 @@
//
//===----------------------------------------------------------------------===//
-#include "SyntaxHighlighting.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
+#include <cstddef>
+#include <cstdint>
+
using namespace llvm;
using namespace dwarf;
-using namespace syntax;
bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U,
uint32_t *OffsetPtr) {
- DataExtractor DebugInfoData = U.getDebugInfoExtractor();
+ DWARFDataExtractor DebugInfoData = U.getDebugInfoExtractor();
const uint32_t UEndOffset = U.getNextUnitOffset();
return extractFast(U, OffsetPtr, DebugInfoData, UEndOffset, 0);
}
+
bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr,
- const DataExtractor &DebugInfoData,
+ const DWARFDataExtractor &DebugInfoData,
uint32_t UEndOffset, uint32_t D) {
Offset = *OffsetPtr;
Depth = D;
@@ -61,7 +59,7 @@ bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr,
// Attribute byte size if fixed, just add the size to the offset.
*OffsetPtr += *FixedSize;
} else if (!DWARFFormValue::skipValue(AttrSpec.Form, DebugInfoData,
- OffsetPtr, &U)) {
+ OffsetPtr, U.getFormParams())) {
// We failed to skip this attribute's value, restore the original offset
// and return the failure status.
*OffsetPtr = Offset;
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index 4940594..7d18056 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLine.cpp ------------------------------------------------===//
+//===- DWARFDebugLine.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,22 +8,47 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <utility>
+
using namespace llvm;
using namespace dwarf;
-typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
+
+using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
+
+namespace {
+
+struct ContentDescriptor {
+ dwarf::LineNumberEntryFormat Type;
+ dwarf::Form Form;
+};
+
+using ContentDescriptors = SmallVector<ContentDescriptor, 4>;
+
+} // end anonymous namespace
DWARFDebugLine::Prologue::Prologue() { clear(); }
void DWARFDebugLine::Prologue::clear() {
- TotalLength = Version = PrologueLength = 0;
+ TotalLength = PrologueLength = 0;
+ SegSelectorSize = 0;
MinInstLength = MaxOpsPerInst = DefaultIsStmt = LineBase = LineRange = 0;
OpcodeBase = 0;
- IsDWARF64 = false;
+ FormParams = DWARFFormParams({0, 0, DWARF32});
StandardOpcodeLengths.clear();
IncludeDirectories.clear();
FileNames.clear();
@@ -32,104 +57,225 @@ void DWARFDebugLine::Prologue::clear() {
void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
OS << "Line table prologue:\n"
<< format(" total_length: 0x%8.8" PRIx64 "\n", TotalLength)
- << format(" version: %u\n", Version)
- << format(" prologue_length: 0x%8.8" PRIx64 "\n", PrologueLength)
+ << format(" version: %u\n", getVersion());
+ if (getVersion() >= 5)
+ OS << format(" address_size: %u\n", getAddressSize())
+ << format(" seg_select_size: %u\n", SegSelectorSize);
+ OS << format(" prologue_length: 0x%8.8" PRIx64 "\n", PrologueLength)
<< format(" min_inst_length: %u\n", MinInstLength)
- << format(Version >= 4 ? "max_ops_per_inst: %u\n" : "", MaxOpsPerInst)
+ << format(getVersion() >= 4 ? "max_ops_per_inst: %u\n" : "", MaxOpsPerInst)
<< format(" default_is_stmt: %u\n", DefaultIsStmt)
<< format(" line_base: %i\n", LineBase)
<< format(" line_range: %u\n", LineRange)
<< format(" opcode_base: %u\n", OpcodeBase);
- for (uint32_t i = 0; i < StandardOpcodeLengths.size(); ++i)
+ for (uint32_t I = 0; I != StandardOpcodeLengths.size(); ++I)
OS << format("standard_opcode_lengths[%s] = %u\n",
- LNStandardString(i + 1).data(), StandardOpcodeLengths[i]);
+ LNStandardString(I + 1).data(), StandardOpcodeLengths[I]);
if (!IncludeDirectories.empty())
- for (uint32_t i = 0; i < IncludeDirectories.size(); ++i)
- OS << format("include_directories[%3u] = '", i + 1)
- << IncludeDirectories[i] << "'\n";
+ for (uint32_t I = 0; I != IncludeDirectories.size(); ++I)
+ OS << format("include_directories[%3u] = '", I + 1)
+ << IncludeDirectories[I] << "'\n";
if (!FileNames.empty()) {
OS << " Dir Mod Time File Len File Name\n"
<< " ---- ---------- ---------- -----------"
"----------------\n";
- for (uint32_t i = 0; i < FileNames.size(); ++i) {
- const FileNameEntry &fileEntry = FileNames[i];
- OS << format("file_names[%3u] %4" PRIu64 " ", i + 1, fileEntry.DirIdx)
- << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64 " ", fileEntry.ModTime,
- fileEntry.Length)
- << fileEntry.Name << '\n';
+ for (uint32_t I = 0; I != FileNames.size(); ++I) {
+ const FileNameEntry &FileEntry = FileNames[I];
+ OS << format("file_names[%3u] %4" PRIu64 " ", I + 1, FileEntry.DirIdx)
+ << format("0x%8.8" PRIx64 " 0x%8.8" PRIx64 " ", FileEntry.ModTime,
+ FileEntry.Length)
+ << FileEntry.Name << '\n';
}
}
}
-bool DWARFDebugLine::Prologue::parse(DataExtractor debug_line_data,
- uint32_t *offset_ptr) {
- const uint64_t prologue_offset = *offset_ptr;
+// Parse v2-v4 directory and file tables.
+static void
+parseV2DirFileTables(const DWARFDataExtractor &DebugLineData,
+ uint32_t *OffsetPtr, uint64_t EndPrologueOffset,
+ std::vector<StringRef> &IncludeDirectories,
+ std::vector<DWARFDebugLine::FileNameEntry> &FileNames) {
+ while (*OffsetPtr < EndPrologueOffset) {
+ StringRef S = DebugLineData.getCStrRef(OffsetPtr);
+ if (S.empty())
+ break;
+ IncludeDirectories.push_back(S);
+ }
+
+ while (*OffsetPtr < EndPrologueOffset) {
+ StringRef Name = DebugLineData.getCStrRef(OffsetPtr);
+ if (Name.empty())
+ break;
+ DWARFDebugLine::FileNameEntry FileEntry;
+ FileEntry.Name = Name;
+ FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr);
+ FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr);
+ FileEntry.Length = DebugLineData.getULEB128(OffsetPtr);
+ FileNames.push_back(FileEntry);
+ }
+}
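For reference, the v2-v4 table layout this helper walks: NUL-terminated directory strings until an empty string, then (name, dir index, mtime, size) file entries with the numbers as ULEB128s. A toy decoder, assuming every ULEB128 fits in one byte:

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Read a NUL-terminated string and advance *Off past the terminator.
static std::string cstr(const uint8_t *D, size_t *Off) {
  std::string S(reinterpret_cast<const char *>(D + *Off));
  *Off += S.size() + 1;
  return S;
}

int main() {
  // One include directory, the empty-string terminator, then one file
  // entry (name, dir index, mtime, size) and the file-list terminator.
  const uint8_t Table[] = {'i','n','c','A',0, 0,
                           'a','.','c',0, 1, 0, 0, 0};
  size_t Off = 0;
  std::vector<std::string> Dirs, Files;
  for (std::string S; !(S = cstr(Table, &Off)).empty();)
    Dirs.push_back(S);
  for (std::string S; !(S = cstr(Table, &Off)).empty();) {
    Files.push_back(S);
    Off += 3; // skip the three one-byte ULEB128 operands
  }
  std::printf("%zu dir(s), %zu file(s)\n", Dirs.size(), Files.size());
}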
- clear();
- TotalLength = debug_line_data.getU32(offset_ptr);
- if (TotalLength == UINT32_MAX) {
- IsDWARF64 = true;
- TotalLength = debug_line_data.getU64(offset_ptr);
- } else if (TotalLength > 0xffffff00) {
+// Parse v5 directory/file entry content descriptions.
+// Returns the descriptors, or an empty vector if we did not find a path or
+// ran off the end of the prologue.
+static ContentDescriptors
+parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint32_t *OffsetPtr,
+ uint64_t EndPrologueOffset) {
+ ContentDescriptors Descriptors;
+ int FormatCount = DebugLineData.getU8(OffsetPtr);
+ bool HasPath = false;
+ for (int I = 0; I != FormatCount; ++I) {
+ if (*OffsetPtr >= EndPrologueOffset)
+ return ContentDescriptors();
+ ContentDescriptor Descriptor;
+ Descriptor.Type =
+ dwarf::LineNumberEntryFormat(DebugLineData.getULEB128(OffsetPtr));
+ Descriptor.Form = dwarf::Form(DebugLineData.getULEB128(OffsetPtr));
+ if (Descriptor.Type == dwarf::DW_LNCT_path)
+ HasPath = true;
+ Descriptors.push_back(Descriptor);
+ }
+ return HasPath ? Descriptors : ContentDescriptors();
+}
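A hand-assembled v5 entry-format description as this function would consume it; the byte values are chosen for illustration and are all one-byte ULEB128s (DW_LNCT_path is 0x1, DW_FORM_string is 0x08):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for ContentDescriptor above.
struct Desc { uint64_t Type, Form; };

int main() {
  // format_count = 1, then one (content type, form) ULEB128 pair:
  // DW_LNCT_path (0x1), DW_FORM_string (0x08).
  const uint8_t Bytes[] = {0x01, 0x01, 0x08};
  size_t Off = 0;
  int Count = Bytes[Off++];
  std::vector<Desc> Descriptors;
  for (int I = 0; I != Count; ++I) {
    Desc D;
    D.Type = Bytes[Off++]; // one-byte ULEB128
    D.Form = Bytes[Off++];
    Descriptors.push_back(D);
  }
  std::printf("%zu descriptor(s), has path: %s\n", Descriptors.size(),
              Descriptors[0].Type == 0x1 ? "yes" : "no");
}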
+
+static bool
+parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
+ uint32_t *OffsetPtr, uint64_t EndPrologueOffset,
+ const DWARFFormParams &FormParams,
+ std::vector<StringRef> &IncludeDirectories,
+ std::vector<DWARFDebugLine::FileNameEntry> &FileNames) {
+ // Get the directory entry description.
+ ContentDescriptors DirDescriptors =
+ parseV5EntryFormat(DebugLineData, OffsetPtr, EndPrologueOffset);
+ if (DirDescriptors.empty())
return false;
+
+ // Get the directory entries, according to the format described above.
+ int DirEntryCount = DebugLineData.getU8(OffsetPtr);
+ for (int I = 0; I != DirEntryCount; ++I) {
+ if (*OffsetPtr >= EndPrologueOffset)
+ return false;
+ for (auto Descriptor : DirDescriptors) {
+ DWARFFormValue Value(Descriptor.Form);
+ switch (Descriptor.Type) {
+ case DW_LNCT_path:
+ if (!Value.extractValue(DebugLineData, OffsetPtr, nullptr))
+ return false;
+ IncludeDirectories.push_back(Value.getAsCString().getValue());
+ break;
+ default:
+ if (!Value.skipValue(DebugLineData, OffsetPtr, FormParams))
+ return false;
+ }
+ }
}
- Version = debug_line_data.getU16(offset_ptr);
- if (Version < 2)
+
+ // Get the file entry description.
+ ContentDescriptors FileDescriptors =
+ parseV5EntryFormat(DebugLineData, OffsetPtr, EndPrologueOffset);
+ if (FileDescriptors.empty())
return false;
- PrologueLength =
- debug_line_data.getUnsigned(offset_ptr, sizeofPrologueLength());
- const uint64_t end_prologue_offset = PrologueLength + *offset_ptr;
- MinInstLength = debug_line_data.getU8(offset_ptr);
- if (Version >= 4)
- MaxOpsPerInst = debug_line_data.getU8(offset_ptr);
- DefaultIsStmt = debug_line_data.getU8(offset_ptr);
- LineBase = debug_line_data.getU8(offset_ptr);
- LineRange = debug_line_data.getU8(offset_ptr);
- OpcodeBase = debug_line_data.getU8(offset_ptr);
+ // Get the file entries, according to the format described above.
+ int FileEntryCount = DebugLineData.getU8(OffsetPtr);
+ for (int I = 0; I != FileEntryCount; ++I) {
+ if (*OffsetPtr >= EndPrologueOffset)
+ return false;
+ DWARFDebugLine::FileNameEntry FileEntry;
+ for (auto Descriptor : FileDescriptors) {
+ DWARFFormValue Value(Descriptor.Form);
+ if (!Value.extractValue(DebugLineData, OffsetPtr, nullptr))
+ return false;
+ switch (Descriptor.Type) {
+ case DW_LNCT_path:
+ FileEntry.Name = Value.getAsCString().getValue();
+ break;
+ case DW_LNCT_directory_index:
+ FileEntry.DirIdx = Value.getAsUnsignedConstant().getValue();
+ break;
+ case DW_LNCT_timestamp:
+ FileEntry.ModTime = Value.getAsUnsignedConstant().getValue();
+ break;
+ case DW_LNCT_size:
+ FileEntry.Length = Value.getAsUnsignedConstant().getValue();
+ break;
+ // FIXME: Add MD5
+ default:
+ break;
+ }
+ }
+ FileNames.push_back(FileEntry);
+ }
+ return true;
+}
- StandardOpcodeLengths.reserve(OpcodeBase - 1);
- for (uint32_t i = 1; i < OpcodeBase; ++i) {
- uint8_t op_len = debug_line_data.getU8(offset_ptr);
- StandardOpcodeLengths.push_back(op_len);
+bool DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
+ uint32_t *OffsetPtr) {
+ const uint64_t PrologueOffset = *OffsetPtr;
+
+ clear();
+ TotalLength = DebugLineData.getU32(OffsetPtr);
+ if (TotalLength == UINT32_MAX) {
+ FormParams.Format = dwarf::DWARF64;
+ TotalLength = DebugLineData.getU64(OffsetPtr);
+ } else if (TotalLength >= 0xffffff00) {
+ return false;
}
+ FormParams.Version = DebugLineData.getU16(OffsetPtr);
+ if (getVersion() < 2)
+ return false;
- while (*offset_ptr < end_prologue_offset) {
- const char *s = debug_line_data.getCStr(offset_ptr);
- if (s && s[0])
- IncludeDirectories.push_back(s);
- else
- break;
+ if (getVersion() >= 5) {
+ FormParams.AddrSize = DebugLineData.getU8(OffsetPtr);
+ assert(getAddressSize() == DebugLineData.getAddressSize() &&
+ "Line table header and data extractor disagree");
+ SegSelectorSize = DebugLineData.getU8(OffsetPtr);
}
- while (*offset_ptr < end_prologue_offset) {
- const char *name = debug_line_data.getCStr(offset_ptr);
- if (name && name[0]) {
- FileNameEntry fileEntry;
- fileEntry.Name = name;
- fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
- fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
- fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
- FileNames.push_back(fileEntry);
- } else {
- break;
- }
+ PrologueLength = DebugLineData.getUnsigned(OffsetPtr, sizeofPrologueLength());
+ const uint64_t EndPrologueOffset = PrologueLength + *OffsetPtr;
+ MinInstLength = DebugLineData.getU8(OffsetPtr);
+ if (getVersion() >= 4)
+ MaxOpsPerInst = DebugLineData.getU8(OffsetPtr);
+ DefaultIsStmt = DebugLineData.getU8(OffsetPtr);
+ LineBase = DebugLineData.getU8(OffsetPtr);
+ LineRange = DebugLineData.getU8(OffsetPtr);
+ OpcodeBase = DebugLineData.getU8(OffsetPtr);
+
+ StandardOpcodeLengths.reserve(OpcodeBase - 1);
+ for (uint32_t I = 1; I < OpcodeBase; ++I) {
+ uint8_t OpLen = DebugLineData.getU8(OffsetPtr);
+ StandardOpcodeLengths.push_back(OpLen);
}
- if (*offset_ptr != end_prologue_offset) {
- fprintf(stderr, "warning: parsing line table prologue at 0x%8.8" PRIx64
- " should have ended at 0x%8.8" PRIx64
- " but it ended at 0x%8.8" PRIx64 "\n",
- prologue_offset, end_prologue_offset, (uint64_t)*offset_ptr);
+ if (getVersion() >= 5) {
+ if (!parseV5DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset,
+ getFormParams(), IncludeDirectories, FileNames)) {
+ fprintf(stderr,
+ "warning: parsing line table prologue at 0x%8.8" PRIx64
+ " found an invalid directory or file table description at"
+ " 0x%8.8" PRIx64 "\n", PrologueOffset, (uint64_t)*OffsetPtr);
+ return false;
+ }
+ } else
+ parseV2DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset,
+ IncludeDirectories, FileNames);
+
+ if (*OffsetPtr != EndPrologueOffset) {
+ fprintf(stderr,
+ "warning: parsing line table prologue at 0x%8.8" PRIx64
+ " should have ended at 0x%8.8" PRIx64
+ " but it ended at 0x%8.8" PRIx64 "\n",
+ PrologueOffset, EndPrologueOffset, (uint64_t)*OffsetPtr);
return false;
}
return true;
}
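The initial-length convention behind the UINT32_MAX and 0xffffff00 checks above, spelled out as a tiny classifier; the cutoffs mirror the code, with 0xffffffff escaping to a 64-bit length and the remaining high values treated as reserved:

#include <cstdint>
#include <cstdio>

// Classify a unit's initial length field. The parser above rejects
// anything >= 0xffffff00 that is not the DWARF64 escape.
void classify(uint32_t First4) {
  if (First4 == 0xffffffffU)
    std::puts("DWARF64: read the real length from the next 8 bytes");
  else if (First4 >= 0xffffff00U)
    std::puts("reserved escape value: reject the unit");
  else
    std::puts("DWARF32: this is the length itself");
}

int main() {
  classify(0x42);        // DWARF32
  classify(0xffffffff);  // DWARF64 escape
  classify(0xffffff7f);  // reserved
}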
-DWARFDebugLine::Row::Row(bool default_is_stmt) { reset(default_is_stmt); }
+DWARFDebugLine::Row::Row(bool DefaultIsStmt) { reset(DefaultIsStmt); }
void DWARFDebugLine::Row::postAppend() {
BasicBlock = false;
@@ -137,20 +283,26 @@ void DWARFDebugLine::Row::postAppend() {
EpilogueBegin = false;
}
-void DWARFDebugLine::Row::reset(bool default_is_stmt) {
+void DWARFDebugLine::Row::reset(bool DefaultIsStmt) {
Address = 0;
Line = 1;
Column = 0;
File = 1;
Isa = 0;
Discriminator = 0;
- IsStmt = default_is_stmt;
+ IsStmt = DefaultIsStmt;
BasicBlock = false;
EndSequence = false;
PrologueEnd = false;
EpilogueBegin = false;
}
+void DWARFDebugLine::Row::dumpTableHeader(raw_ostream &OS) {
+ OS << "Address Line Column File ISA Discriminator Flags\n"
+ << "------------------ ------ ------ ------ --- ------------- "
+ "-------------\n";
+}
+
void DWARFDebugLine::Row::dump(raw_ostream &OS) const {
OS << format("0x%16.16" PRIx64 " %6u %6u", Address, Line, Column)
<< format(" %6u %3u %13u ", File, Isa, Discriminator)
@@ -177,9 +329,7 @@ void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
OS << '\n';
if (!Rows.empty()) {
- OS << "Address Line Column File ISA Discriminator Flags\n"
- << "------------------ ------ ------ ------ --- ------------- "
- "-------------\n";
+ Row::dumpTableHeader(OS);
for (const Row &R : Rows) {
R.dump(OS);
}
@@ -193,7 +343,7 @@ void DWARFDebugLine::LineTable::clear() {
}
DWARFDebugLine::ParsingState::ParsingState(struct LineTable *LT)
- : LineTable(LT), RowNumber(0) {
+ : LineTable(LT) {
resetRowAndSequence();
}
@@ -202,7 +352,7 @@ void DWARFDebugLine::ParsingState::resetRowAndSequence() {
Sequence.reset();
}
-void DWARFDebugLine::ParsingState::appendRowToMatrix(uint32_t offset) {
+void DWARFDebugLine::ParsingState::appendRowToMatrix(uint32_t Offset) {
if (Sequence.Empty) {
// Record the beginning of instruction sequence.
Sequence.Empty = false;
@@ -223,56 +373,55 @@ void DWARFDebugLine::ParsingState::appendRowToMatrix(uint32_t offset) {
}
const DWARFDebugLine::LineTable *
-DWARFDebugLine::getLineTable(uint32_t offset) const {
- LineTableConstIter pos = LineTableMap.find(offset);
- if (pos != LineTableMap.end())
- return &pos->second;
+DWARFDebugLine::getLineTable(uint32_t Offset) const {
+ LineTableConstIter Pos = LineTableMap.find(Offset);
+ if (Pos != LineTableMap.end())
+ return &Pos->second;
return nullptr;
}
const DWARFDebugLine::LineTable *
-DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data,
- uint32_t offset) {
- std::pair<LineTableIter, bool> pos =
- LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable()));
- LineTable *LT = &pos.first->second;
- if (pos.second) {
- if (!LT->parse(debug_line_data, RelocMap, &offset))
+DWARFDebugLine::getOrParseLineTable(const DWARFDataExtractor &DebugLineData,
+ uint32_t Offset) {
+ std::pair<LineTableIter, bool> Pos =
+ LineTableMap.insert(LineTableMapTy::value_type(Offset, LineTable()));
+ LineTable *LT = &Pos.first->second;
+ if (Pos.second) {
+ if (!LT->parse(DebugLineData, &Offset))
return nullptr;
}
return LT;
}
-bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
- const RelocAddrMap *RMap,
- uint32_t *offset_ptr) {
- const uint32_t debug_line_offset = *offset_ptr;
+bool DWARFDebugLine::LineTable::parse(const DWARFDataExtractor &DebugLineData,
+ uint32_t *OffsetPtr) {
+ const uint32_t DebugLineOffset = *OffsetPtr;
clear();
- if (!Prologue.parse(debug_line_data, offset_ptr)) {
+ if (!Prologue.parse(DebugLineData, OffsetPtr)) {
// Restore our offset and return false to indicate failure!
- *offset_ptr = debug_line_offset;
+ *OffsetPtr = DebugLineOffset;
return false;
}
- const uint32_t end_offset =
- debug_line_offset + Prologue.TotalLength + Prologue.sizeofTotalLength();
+ const uint32_t EndOffset =
+ DebugLineOffset + Prologue.TotalLength + Prologue.sizeofTotalLength();
ParsingState State(this);
- while (*offset_ptr < end_offset) {
- uint8_t opcode = debug_line_data.getU8(offset_ptr);
+ while (*OffsetPtr < EndOffset) {
+ uint8_t Opcode = DebugLineData.getU8(OffsetPtr);
- if (opcode == 0) {
+ if (Opcode == 0) {
// Extended Opcodes always start with a zero opcode followed by
// a uleb128 length so you can skip ones you don't know about
- uint32_t ext_offset = *offset_ptr;
- uint64_t len = debug_line_data.getULEB128(offset_ptr);
- uint32_t arg_size = len - (*offset_ptr - ext_offset);
+ uint32_t ExtOffset = *OffsetPtr;
+ uint64_t Len = DebugLineData.getULEB128(OffsetPtr);
+ uint32_t ArgSize = Len - (*OffsetPtr - ExtOffset);
- uint8_t sub_opcode = debug_line_data.getU8(offset_ptr);
- switch (sub_opcode) {
+ uint8_t SubOpcode = DebugLineData.getU8(OffsetPtr);
+ switch (SubOpcode) {
case DW_LNE_end_sequence:
// Set the end_sequence register of the state machine to true and
// append a row to the matrix using the current values of the
@@ -282,7 +431,7 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// address is that of the byte after the last target machine instruction
// of the sequence.
State.Row.EndSequence = true;
- State.appendRowToMatrix(*offset_ptr);
+ State.appendRowToMatrix(*OffsetPtr);
State.resetRowAndSequence();
break;
@@ -293,16 +442,7 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// relocatable address. All of the other statement program opcodes
// that affect the address register add a delta to it. This instruction
// stores a relocatable value into it instead.
- {
- // If this address is in our relocation map, apply the relocation.
- RelocAddrMap::const_iterator AI = RMap->find(*offset_ptr);
- if (AI != RMap->end()) {
- const std::pair<uint8_t, int64_t> &R = AI->second;
- State.Row.Address =
- debug_line_data.getAddress(offset_ptr) + R.second;
- } else
- State.Row.Address = debug_line_data.getAddress(offset_ptr);
- }
+ State.Row.Address = DebugLineData.getRelocatedAddress(OffsetPtr);
break;
case DW_LNE_define_file:
@@ -327,33 +467,33 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
      // the DW_LNE_define_file instruction. These numbers are used in
      // the file register of the state machine.
{
- FileNameEntry fileEntry;
- fileEntry.Name = debug_line_data.getCStr(offset_ptr);
- fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
- fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
- fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
- Prologue.FileNames.push_back(fileEntry);
+ FileNameEntry FileEntry;
+ FileEntry.Name = DebugLineData.getCStr(OffsetPtr);
+ FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr);
+ FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr);
+ FileEntry.Length = DebugLineData.getULEB128(OffsetPtr);
+ Prologue.FileNames.push_back(FileEntry);
}
break;
case DW_LNE_set_discriminator:
- State.Row.Discriminator = debug_line_data.getULEB128(offset_ptr);
+ State.Row.Discriminator = DebugLineData.getULEB128(OffsetPtr);
break;
default:
// Length doesn't include the zero opcode byte or the length itself, but
// it does include the sub_opcode, so we have to adjust for that below
- (*offset_ptr) += arg_size;
+ (*OffsetPtr) += ArgSize;
break;
}
- } else if (opcode < Prologue.OpcodeBase) {
- switch (opcode) {
+ } else if (Opcode < Prologue.OpcodeBase) {
+ switch (Opcode) {
// Standard Opcodes
case DW_LNS_copy:
// Takes no arguments. Append a row to the matrix using the
// current values of the state-machine registers. Then set
// the basic_block register to false.
- State.appendRowToMatrix(*offset_ptr);
+ State.appendRowToMatrix(*OffsetPtr);
break;
case DW_LNS_advance_pc:
@@ -361,25 +501,25 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// min_inst_length field of the prologue, and adds the
// result to the address register of the state machine.
State.Row.Address +=
- debug_line_data.getULEB128(offset_ptr) * Prologue.MinInstLength;
+ DebugLineData.getULEB128(OffsetPtr) * Prologue.MinInstLength;
break;
case DW_LNS_advance_line:
// Takes a single signed LEB128 operand and adds that value to
// the line register of the state machine.
- State.Row.Line += debug_line_data.getSLEB128(offset_ptr);
+ State.Row.Line += DebugLineData.getSLEB128(OffsetPtr);
break;
case DW_LNS_set_file:
// Takes a single unsigned LEB128 operand and stores it in the file
// register of the state machine.
- State.Row.File = debug_line_data.getULEB128(offset_ptr);
+ State.Row.File = DebugLineData.getULEB128(OffsetPtr);
break;
case DW_LNS_set_column:
// Takes a single unsigned LEB128 operand and stores it in the
// column register of the state machine.
- State.Row.Column = debug_line_data.getULEB128(offset_ptr);
+ State.Row.Column = DebugLineData.getULEB128(OffsetPtr);
break;
case DW_LNS_negate_stmt:
@@ -407,10 +547,10 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// than twice that range will it need to use both DW_LNS_advance_pc
// and a special opcode, requiring three or more bytes.
{
- uint8_t adjust_opcode = 255 - Prologue.OpcodeBase;
- uint64_t addr_offset =
- (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
- State.Row.Address += addr_offset;
+ uint8_t AdjustOpcode = 255 - Prologue.OpcodeBase;
+ uint64_t AddrOffset =
+ (AdjustOpcode / Prologue.LineRange) * Prologue.MinInstLength;
+ State.Row.Address += AddrOffset;
}
break;
@@ -424,7 +564,7 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// judge when the computation of a special opcode overflows and
// requires the use of DW_LNS_advance_pc. Such assemblers, however,
// can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
- State.Row.Address += debug_line_data.getU16(offset_ptr);
+ State.Row.Address += DebugLineData.getU16(OffsetPtr);
break;
case DW_LNS_set_prologue_end:
@@ -442,7 +582,7 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
case DW_LNS_set_isa:
// Takes a single unsigned LEB128 operand and stores it in the
// column register of the state machine.
- State.Row.Isa = debug_line_data.getULEB128(offset_ptr);
+ State.Row.Isa = DebugLineData.getULEB128(OffsetPtr);
break;
default:
@@ -450,10 +590,10 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// of such opcodes because they are specified in the prologue
// as a multiple of LEB128 operands for each opcode.
{
- assert(opcode - 1U < Prologue.StandardOpcodeLengths.size());
- uint8_t opcode_length = Prologue.StandardOpcodeLengths[opcode - 1];
- for (uint8_t i = 0; i < opcode_length; ++i)
- debug_line_data.getULEB128(offset_ptr);
+ assert(Opcode - 1U < Prologue.StandardOpcodeLengths.size());
+ uint8_t OpcodeLength = Prologue.StandardOpcodeLengths[Opcode - 1];
+ for (uint8_t I = 0; I < OpcodeLength; ++I)
+ DebugLineData.getULEB128(OffsetPtr);
}
break;
}
@@ -491,14 +631,14 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
//
// line increment = line_base + (adjusted opcode % line_range)
- uint8_t adjust_opcode = opcode - Prologue.OpcodeBase;
- uint64_t addr_offset =
- (adjust_opcode / Prologue.LineRange) * Prologue.MinInstLength;
- int32_t line_offset =
- Prologue.LineBase + (adjust_opcode % Prologue.LineRange);
- State.Row.Line += line_offset;
- State.Row.Address += addr_offset;
- State.appendRowToMatrix(*offset_ptr);
+ uint8_t AdjustOpcode = Opcode - Prologue.OpcodeBase;
+ uint64_t AddrOffset =
+ (AdjustOpcode / Prologue.LineRange) * Prologue.MinInstLength;
+ int32_t LineOffset =
+ Prologue.LineBase + (AdjustOpcode % Prologue.LineRange);
+ State.Row.Line += LineOffset;
+ State.Row.Address += AddrOffset;
+ State.appendRowToMatrix(*OffsetPtr);
// Reset discriminator to 0.
State.Row.Discriminator = 0;
}
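A worked instance of the special-opcode arithmetic above, using header values commonly emitted by Clang (opcode_base 13, line_base -5, line_range 14, min_inst_length 1); the opcode byte itself is arbitrary:

#include <cstdint>
#include <cstdio>

int main() {
  // Typical prologue values.
  const uint8_t OpcodeBase = 13, LineRange = 14, MinInstLength = 1;
  const int8_t LineBase = -5;

  uint8_t Opcode = 0x4b;                        // some special opcode (75)
  uint8_t Adjust = Opcode - OpcodeBase;         // 62
  uint64_t AddrOffset = (Adjust / LineRange) * MinInstLength; // 4 * 1
  int32_t LineOffset = LineBase + (Adjust % LineRange);       // -5 + 6
  std::printf("address += %llu, line += %d\n",
              (unsigned long long)AddrOffset, LineOffset); // 4 and 1
}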
@@ -520,124 +660,122 @@ bool DWARFDebugLine::LineTable::parse(DataExtractor debug_line_data,
// rudimentary sequences for address ranges [0x0, 0xsomething).
}
- return end_offset;
+ return EndOffset;
}
uint32_t
-DWARFDebugLine::LineTable::findRowInSeq(const DWARFDebugLine::Sequence &seq,
- uint64_t address) const {
- if (!seq.containsPC(address))
+DWARFDebugLine::LineTable::findRowInSeq(const DWARFDebugLine::Sequence &Seq,
+ uint64_t Address) const {
+ if (!Seq.containsPC(Address))
return UnknownRowIndex;
// Search for instruction address in the rows describing the sequence.
// Rows are stored in a vector, so we may use arithmetical operations with
// iterators.
- DWARFDebugLine::Row row;
- row.Address = address;
- RowIter first_row = Rows.begin() + seq.FirstRowIndex;
- RowIter last_row = Rows.begin() + seq.LastRowIndex;
- LineTable::RowIter row_pos = std::lower_bound(
- first_row, last_row, row, DWARFDebugLine::Row::orderByAddress);
- if (row_pos == last_row) {
- return seq.LastRowIndex - 1;
+ DWARFDebugLine::Row Row;
+ Row.Address = Address;
+ RowIter FirstRow = Rows.begin() + Seq.FirstRowIndex;
+ RowIter LastRow = Rows.begin() + Seq.LastRowIndex;
+ LineTable::RowIter RowPos = std::lower_bound(
+ FirstRow, LastRow, Row, DWARFDebugLine::Row::orderByAddress);
+ if (RowPos == LastRow) {
+ return Seq.LastRowIndex - 1;
}
- uint32_t index = seq.FirstRowIndex + (row_pos - first_row);
- if (row_pos->Address > address) {
- if (row_pos == first_row)
+ uint32_t Index = Seq.FirstRowIndex + (RowPos - FirstRow);
+ if (RowPos->Address > Address) {
+ if (RowPos == FirstRow)
return UnknownRowIndex;
else
- index--;
+ Index--;
}
- return index;
+ return Index;
}
-uint32_t DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
+uint32_t DWARFDebugLine::LineTable::lookupAddress(uint64_t Address) const {
if (Sequences.empty())
return UnknownRowIndex;
// First, find an instruction sequence containing the given address.
- DWARFDebugLine::Sequence sequence;
- sequence.LowPC = address;
- SequenceIter first_seq = Sequences.begin();
- SequenceIter last_seq = Sequences.end();
- SequenceIter seq_pos = std::lower_bound(
- first_seq, last_seq, sequence, DWARFDebugLine::Sequence::orderByLowPC);
- DWARFDebugLine::Sequence found_seq;
- if (seq_pos == last_seq) {
- found_seq = Sequences.back();
- } else if (seq_pos->LowPC == address) {
- found_seq = *seq_pos;
+ DWARFDebugLine::Sequence Sequence;
+ Sequence.LowPC = Address;
+ SequenceIter FirstSeq = Sequences.begin();
+ SequenceIter LastSeq = Sequences.end();
+ SequenceIter SeqPos = std::lower_bound(
+ FirstSeq, LastSeq, Sequence, DWARFDebugLine::Sequence::orderByLowPC);
+ DWARFDebugLine::Sequence FoundSeq;
+ if (SeqPos == LastSeq) {
+ FoundSeq = Sequences.back();
+ } else if (SeqPos->LowPC == Address) {
+ FoundSeq = *SeqPos;
} else {
- if (seq_pos == first_seq)
+ if (SeqPos == FirstSeq)
return UnknownRowIndex;
- found_seq = *(seq_pos - 1);
+ FoundSeq = *(SeqPos - 1);
}
- return findRowInSeq(found_seq, address);
+ return findRowInSeq(FoundSeq, Address);
}
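The lookup above is a two-level binary search: lower_bound over sequences by LowPC, then lower_bound over that sequence's rows by address. A reduced sketch of the first level over plain vectors; the Seq type is a made-up stand-in:

#include <algorithm>
#include <cstdint>
#include <vector>

struct Seq { uint64_t LowPC, HighPC; size_t FirstRow, LastRow; };

// Find the sequence whose [LowPC, HighPC) contains Addr, or nullptr.
// Sequences are sorted by LowPC, as in the line table.
const Seq *findSeq(const std::vector<Seq> &Seqs, uint64_t Addr) {
  auto It = std::lower_bound(
      Seqs.begin(), Seqs.end(), Addr,
      [](const Seq &S, uint64_t A) { return S.LowPC < A; });
  if (It == Seqs.end() || It->LowPC != Addr) {
    if (It == Seqs.begin())
      return nullptr;  // Addr precedes every sequence
    --It;              // candidate: the previous sequence
  }
  return (Addr >= It->LowPC && Addr < It->HighPC) ? &*It : nullptr;
}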
bool DWARFDebugLine::LineTable::lookupAddressRange(
- uint64_t address, uint64_t size, std::vector<uint32_t> &result) const {
+ uint64_t Address, uint64_t Size, std::vector<uint32_t> &Result) const {
if (Sequences.empty())
return false;
- uint64_t end_addr = address + size;
+ uint64_t EndAddr = Address + Size;
// First, find an instruction sequence containing the given address.
- DWARFDebugLine::Sequence sequence;
- sequence.LowPC = address;
- SequenceIter first_seq = Sequences.begin();
- SequenceIter last_seq = Sequences.end();
- SequenceIter seq_pos = std::lower_bound(
- first_seq, last_seq, sequence, DWARFDebugLine::Sequence::orderByLowPC);
- if (seq_pos == last_seq || seq_pos->LowPC != address) {
- if (seq_pos == first_seq)
+ DWARFDebugLine::Sequence Sequence;
+ Sequence.LowPC = Address;
+ SequenceIter FirstSeq = Sequences.begin();
+ SequenceIter LastSeq = Sequences.end();
+ SequenceIter SeqPos = std::lower_bound(
+ FirstSeq, LastSeq, Sequence, DWARFDebugLine::Sequence::orderByLowPC);
+ if (SeqPos == LastSeq || SeqPos->LowPC != Address) {
+ if (SeqPos == FirstSeq)
return false;
- seq_pos--;
+ SeqPos--;
}
- if (!seq_pos->containsPC(address))
+ if (!SeqPos->containsPC(Address))
return false;
- SequenceIter start_pos = seq_pos;
+ SequenceIter StartPos = SeqPos;
// Add the rows from the first sequence to the vector, starting with the
// index we just calculated
- while (seq_pos != last_seq && seq_pos->LowPC < end_addr) {
- const DWARFDebugLine::Sequence &cur_seq = *seq_pos;
+ while (SeqPos != LastSeq && SeqPos->LowPC < EndAddr) {
+ const DWARFDebugLine::Sequence &CurSeq = *SeqPos;
// For the first sequence, we need to find which row in the sequence is the
// first in our range.
- uint32_t first_row_index = cur_seq.FirstRowIndex;
- if (seq_pos == start_pos)
- first_row_index = findRowInSeq(cur_seq, address);
+ uint32_t FirstRowIndex = CurSeq.FirstRowIndex;
+ if (SeqPos == StartPos)
+ FirstRowIndex = findRowInSeq(CurSeq, Address);
// Figure out the last row in the range.
- uint32_t last_row_index = findRowInSeq(cur_seq, end_addr - 1);
- if (last_row_index == UnknownRowIndex)
- last_row_index = cur_seq.LastRowIndex - 1;
+ uint32_t LastRowIndex = findRowInSeq(CurSeq, EndAddr - 1);
+ if (LastRowIndex == UnknownRowIndex)
+ LastRowIndex = CurSeq.LastRowIndex - 1;
- assert(first_row_index != UnknownRowIndex);
- assert(last_row_index != UnknownRowIndex);
+ assert(FirstRowIndex != UnknownRowIndex);
+ assert(LastRowIndex != UnknownRowIndex);
- for (uint32_t i = first_row_index; i <= last_row_index; ++i) {
- result.push_back(i);
+ for (uint32_t I = FirstRowIndex; I <= LastRowIndex; ++I) {
+ Result.push_back(I);
}
- ++seq_pos;
+ ++SeqPos;
}
return true;
}
-bool
-DWARFDebugLine::LineTable::hasFileAtIndex(uint64_t FileIndex) const {
+bool DWARFDebugLine::LineTable::hasFileAtIndex(uint64_t FileIndex) const {
return FileIndex != 0 && FileIndex <= Prologue.FileNames.size();
}
-bool
-DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
- const char *CompDir,
- FileLineInfoKind Kind,
- std::string &Result) const {
+bool DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
+ const char *CompDir,
+ FileLineInfoKind Kind,
+ std::string &Result) const {
if (Kind == FileLineInfoKind::None || !hasFileAtIndex(FileIndex))
return false;
const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
- const char *FileName = Entry.Name;
+ StringRef FileName = Entry.Name;
if (Kind != FileLineInfoKind::AbsoluteFilePath ||
sys::path::is_absolute(FileName)) {
Result = FileName;
@@ -646,7 +784,7 @@ DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
SmallString<16> FilePath;
uint64_t IncludeDirIndex = Entry.DirIdx;
- const char *IncludeDir = "";
+ StringRef IncludeDir;
// Be defensive about the contents of Entry.
if (IncludeDirIndex > 0 &&
IncludeDirIndex <= Prologue.IncludeDirectories.size())
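A simplified model of the path assembly this function performs: absolute file names win, otherwise the include directory and, when that is relative too, the compilation directory are prepended. The real code uses sys::path and honors FileLineInfoKind; this sketch hardcodes '/' separators:

#include <cstdio>
#include <string>

std::string resolve(const std::string &CompDir, const std::string &Dir,
                    const std::string &Name) {
  if (!Name.empty() && Name[0] == '/')
    return Name;                       // absolute file name wins
  std::string Path = Dir;
  if ((Path.empty() || Path[0] != '/') && !CompDir.empty())
    Path = CompDir + "/" + Path;       // dir is relative: prepend CompDir
  return Path.empty() ? Name : Path + "/" + Name;
}

int main() {
  std::printf("%s\n", resolve("/src", "inc", "a.h").c_str()); // /src/inc/a.h
}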
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index ae5b9d7..c240dd7 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLoc.cpp -------------------------------------------------===//
+//===- DWARFDebugLoc.cpp --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,9 +8,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
using namespace llvm;
@@ -34,27 +40,19 @@ void DWARFDebugLoc::dump(raw_ostream &OS) const {
}
}
-void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) {
+void DWARFDebugLoc::parse(const DWARFDataExtractor &data) {
uint32_t Offset = 0;
- while (data.isValidOffset(Offset+AddressSize-1)) {
+ while (data.isValidOffset(Offset+data.getAddressSize()-1)) {
Locations.resize(Locations.size() + 1);
LocationList &Loc = Locations.back();
Loc.Offset = Offset;
// 2.6.2 Location Lists
// A location list entry consists of:
while (true) {
+ // Beginning and ending address offsets.
Entry E;
- RelocAddrMap::const_iterator AI = RelocMap.find(Offset);
- // 1. A beginning address offset. ...
- E.Begin = data.getUnsigned(&Offset, AddressSize);
- if (AI != RelocMap.end())
- E.Begin += AI->second.second;
-
- AI = RelocMap.find(Offset);
- // 2. An ending address offset. ...
- E.End = data.getUnsigned(&Offset, AddressSize);
- if (AI != RelocMap.end())
- E.End += AI->second.second;
+ E.Begin = data.getRelocatedAddress(&Offset);
+ E.End = data.getRelocatedAddress(&Offset);
// The end of any given location list is marked by an end of list entry,
// which consists of a 0 for the beginning address offset and a 0 for the
@@ -71,7 +69,7 @@ void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) {
}
}
if (data.isValidOffset(Offset))
- llvm::errs() << "error: failed to consume entire .debug_loc section\n";
+ errs() << "error: failed to consume entire .debug_loc section\n";
}
void DWARFDebugLocDWO::parse(DataExtractor data) {
@@ -85,8 +83,8 @@ void DWARFDebugLocDWO::parse(DataExtractor data) {
data.getU8(&Offset))) != dwarf::DW_LLE_end_of_list) {
if (Kind != dwarf::DW_LLE_startx_length) {
- llvm::errs() << "error: dumping support for LLE of kind " << (int)Kind
- << " not implemented\n";
+ errs() << "error: dumping support for LLE of kind " << (int)Kind
+ << " not implemented\n";
return;
}
@@ -123,4 +121,3 @@ void DWARFDebugLocDWO::dump(raw_ostream &OS) const {
}
}
}
-
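The .debug_loc stream the rewritten parse() walks, in miniature: pairs of begin/end address offsets (already relocated here), each list terminated by a (0, 0) pair; the DWARF expression bytes that follow real entries are omitted:

#include <cstdint>
#include <cstdio>
#include <vector>

struct LocEntry { uint64_t Begin, End; };

// Walk one location list made of (begin, end) pairs.
std::vector<LocEntry> parseList(const std::vector<uint64_t> &Words) {
  std::vector<LocEntry> List;
  for (size_t Off = 0; Off + 2 <= Words.size(); Off += 2) {
    uint64_t Begin = Words[Off], End = Words[Off + 1];
    if (Begin == 0 && End == 0)
      break;                       // end-of-list entry
    List.push_back({Begin, End});  // expression bytes omitted here
  }
  return List;
}

int main() {
  std::vector<LocEntry> L = parseList({0x10, 0x20, 0x30, 0x40, 0, 0});
  std::printf("%zu entries\n", L.size()); // 2
}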
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
index 7710a90..1b77be6 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugMacro.cpp -----------------------------------------------===//
+//===- DWARFDebugMacro.cpp ------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,9 +9,9 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "SyntaxHighlighting.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
using namespace dwarf;
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
index 3c1fe93..5a4e39f 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugPubTable.cpp ---------------------------------------------===//
+//===- DWARFDebugPubTable.cpp ---------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,11 +8,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
-using namespace llvm::dwarf;
+using namespace dwarf;
DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian,
bool GnuStyle)
@@ -41,7 +45,7 @@ DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian,
}
void DWARFDebugPubTable::dump(StringRef Name, raw_ostream &OS) const {
- OS << "\n." << Name << " contents: a\n";
+ OS << "\n." << Name << " contents:\n";
for (const Set &S : Sets) {
OS << "length = " << format("0x%08x", S.Length);
OS << " version = " << format("0x%04x", S.Version);
@@ -54,7 +58,7 @@ void DWARFDebugPubTable::dump(StringRef Name, raw_ostream &OS) const {
OS << format("0x%8.8x ", E.SecOffset);
if (GnuStyle) {
StringRef EntryLinkage =
- dwarf::GDBIndexEntryLinkageString(E.Descriptor.Linkage);
+ GDBIndexEntryLinkageString(E.Descriptor.Linkage);
StringRef EntryKind = dwarf::GDBIndexEntryKindString(E.Descriptor.Kind);
OS << format("%-8s", EntryLinkage.data()) << ' '
<< format("%-8s", EntryKind.data()) << ' ';
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index d5df688..0b6ae86 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugRangesList.cpp ------------------------------------------===//
+//===- DWARFDebugRangesList.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,8 +8,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+#include <utility>
using namespace llvm;
@@ -19,7 +23,8 @@ void DWARFDebugRangeList::clear() {
Entries.clear();
}
-bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) {
+bool DWARFDebugRangeList::extract(const DWARFDataExtractor &data,
+ uint32_t *offset_ptr) {
clear();
if (!data.isValidOffset(*offset_ptr))
return false;
@@ -30,8 +35,10 @@ bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) {
while (true) {
RangeListEntry entry;
uint32_t prev_offset = *offset_ptr;
- entry.StartAddress = data.getAddress(offset_ptr);
- entry.EndAddress = data.getAddress(offset_ptr);
+ entry.StartAddress =
+ data.getRelocatedAddress(offset_ptr, &entry.SectionIndex);
+ entry.EndAddress = data.getRelocatedAddress(offset_ptr);
+
// Check that both values were extracted correctly.
if (*offset_ptr != prev_offset + 2 * AddressSize) {
clear();
@@ -61,8 +68,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const {
if (RLE.isBaseAddressSelectionEntry(AddressSize)) {
BaseAddress = RLE.EndAddress;
} else {
- Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress,
- BaseAddress + RLE.EndAddress));
+ Res.push_back({BaseAddress + RLE.StartAddress,
+ BaseAddress + RLE.EndAddress, RLE.SectionIndex});
}
}
return Res;
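How getAbsoluteRanges resolves entries, in miniature: a start address of all ones is a base address selection entry that replaces the base, and every other entry is offset by the current base. The types below are simplified stand-ins (8-byte address case, section indices dropped):

#include <cstdint>
#include <vector>

struct RawEntry { uint64_t Start, End; };  // as stored in .debug_ranges
struct Range { uint64_t LowPC, HighPC; };

// Resolve raw range entries against a compile unit base address.
std::vector<Range> absolute(std::vector<RawEntry> Entries, uint64_t Base) {
  std::vector<Range> Out;
  for (const RawEntry &E : Entries) {
    if (E.Start == UINT64_MAX)  // base address selection entry
      Base = E.End;
    else
      Out.push_back({Base + E.Start, Base + E.End});
  }
  return Out;
}

int main() {
  // Select base 0x1000, then one (0x10, 0x20) entry -> [0x1010, 0x1020).
  auto R = absolute({{UINT64_MAX, 0x1000}, {0x10, 0x20}}, 0);
  return R[0].LowPC == 0x1010 ? 0 : 1;
}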
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 89b83b1..111f0bb 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDie.cpp ------------------------------------------------------===//
+//===- DWARFDie.cpp -------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,23 +9,31 @@
#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "SyntaxHighlighting.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <string>
+#include <utility>
using namespace llvm;
using namespace dwarf;
using namespace syntax;
-namespace {
- static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
+static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
OS << " (";
do {
uint64_t Shift = countTrailingZeros(Val);
@@ -52,14 +60,15 @@ static void dumpRanges(raw_ostream &OS, const DWARFAddressRangesVector& Ranges,
OS << '\n';
OS.indent(Indent);
OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")",
- AddressSize*2, Range.first,
- AddressSize*2, Range.second);
+ AddressSize*2, Range.LowPC,
+ AddressSize*2, Range.HighPC);
}
}
static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
uint32_t *OffsetPtr, dwarf::Attribute Attr,
- dwarf::Form Form, unsigned Indent) {
+ dwarf::Form Form, unsigned Indent,
+ DIDumpOptions DumpOpts) {
if (!Die.isValid())
return;
const char BaseIndent[] = " ";
@@ -70,13 +79,15 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
WithColor(OS, syntax::Attribute) << attrString;
else
WithColor(OS, syntax::Attribute).get() << format("DW_AT_Unknown_%x", Attr);
-
- auto formString = FormEncodingString(Form);
- if (!formString.empty())
- OS << " [" << formString << ']';
- else
- OS << format(" [DW_FORM_Unknown_%x]", Form);
-
+
+ if (!DumpOpts.Brief) {
+ auto formString = FormEncodingString(Form);
+ if (!formString.empty())
+ OS << " [" << formString << ']';
+ else
+ OS << format(" [DW_FORM_Unknown_%x]", Form);
+ }
+
DWARFUnit *U = Die.getDwarfUnit();
DWARFFormValue formValue(Form);
@@ -122,8 +133,6 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
OS << ")\n";
}
-} // end anonymous namespace
-
bool DWARFDie::isSubprogramDIE() const {
return getTag() == DW_TAG_subprogram;
}
@@ -134,7 +143,7 @@ bool DWARFDie::isSubroutineDIE() const {
}
Optional<DWARFFormValue>
-DWARFDie::getAttributeValue(dwarf::Attribute Attr) const {
+DWARFDie::find(dwarf::Attribute Attr) const {
if (!isValid())
return None;
auto AbbrevDecl = getAbbreviationDeclarationPtr();
@@ -143,54 +152,41 @@ DWARFDie::getAttributeValue(dwarf::Attribute Attr) const {
return None;
}
-const char *DWARFDie::getAttributeValueAsString(dwarf::Attribute Attr,
- const char *FailValue) const {
- auto FormValue = getAttributeValue(Attr);
- if (!FormValue)
- return FailValue;
- Optional<const char *> Result = FormValue->getAsCString();
- return Result.hasValue() ? Result.getValue() : FailValue;
-}
-
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsAddress(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsAddress();
- return None;
-}
-
-Optional<int64_t>
-DWARFDie::getAttributeValueAsSignedConstant(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsSignedConstant();
- return None;
-}
-
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsUnsignedConstant(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsUnsignedConstant();
- return None;
-}
-
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsReference(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsReference();
+Optional<DWARFFormValue>
+DWARFDie::find(ArrayRef<dwarf::Attribute> Attrs) const {
+ if (!isValid())
+ return None;
+ auto AbbrevDecl = getAbbreviationDeclarationPtr();
+ if (AbbrevDecl) {
+ for (auto Attr : Attrs) {
+ if (auto Value = AbbrevDecl->getAttributeValue(getOffset(), Attr, *U))
+ return Value;
+ }
+ }
return None;
}
-Optional<uint64_t>
-DWARFDie::getAttributeValueAsSectionOffset(dwarf::Attribute Attr) const {
- if (auto FormValue = getAttributeValue(Attr))
- return FormValue->getAsSectionOffset();
+Optional<DWARFFormValue>
+DWARFDie::findRecursively(ArrayRef<dwarf::Attribute> Attrs) const {
+ if (!isValid())
+ return None;
+ auto Die = *this;
+ if (auto Value = Die.find(Attrs))
+ return Value;
+ if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin))
+ Die = D;
+ if (auto Value = Die.find(Attrs))
+ return Value;
+ if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification))
+ Die = D;
+ if (auto Value = Die.find(Attrs))
+ return Value;
return None;
}
-
DWARFDie
DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const {
- auto SpecRef = getAttributeValueAsReference(Attr);
+ auto SpecRef = toReference(find(Attr));
if (SpecRef) {
auto SpecUnit = U->getUnitSection().getUnitForOffset(*SpecRef);
if (SpecUnit)
@@ -201,14 +197,11 @@ DWARFDie::getAttributeValueAsReferencedDie(dwarf::Attribute Attr) const {
Optional<uint64_t>
DWARFDie::getRangesBaseAttribute() const {
- auto Result = getAttributeValueAsSectionOffset(DW_AT_rnglists_base);
- if (Result)
- return Result;
- return getAttributeValueAsSectionOffset(DW_AT_GNU_ranges_base);
+ return toSectionOffset(find({DW_AT_rnglists_base, DW_AT_GNU_ranges_base}));
}
Optional<uint64_t> DWARFDie::getHighPC(uint64_t LowPC) const {
- if (auto FormValue = getAttributeValue(DW_AT_high_pc)) {
+ if (auto FormValue = find(DW_AT_high_pc)) {
if (auto Address = FormValue->getAsAddress()) {
// High PC is an address.
return Address;
@@ -221,13 +214,16 @@ Optional<uint64_t> DWARFDie::getHighPC(uint64_t LowPC) const {
return None;
}
-bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC) const {
- auto LowPcAddr = getAttributeValueAsAddress(DW_AT_low_pc);
+bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC,
+ uint64_t &SectionIndex) const {
+ auto F = find(DW_AT_low_pc);
+ auto LowPcAddr = toAddress(F);
if (!LowPcAddr)
return false;
if (auto HighPcAddr = getHighPC(*LowPcAddr)) {
LowPC = *LowPcAddr;
HighPC = *HighPcAddr;
+ SectionIndex = F->getSectionIndex();
return true;
}
return false;
@@ -238,12 +234,12 @@ DWARFDie::getAddressRanges() const {
if (isNULL())
return DWARFAddressRangesVector();
// Single range specified by low/high PC.
- uint64_t LowPC, HighPC;
- if (getLowAndHighPC(LowPC, HighPC)) {
- return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC));
- }
+ uint64_t LowPC, HighPC, Index;
+ if (getLowAndHighPC(LowPC, HighPC, Index))
+ return {{LowPC, HighPC, Index}};
+
// Multiple ranges from .debug_ranges section.
- auto RangesOffset = getAttributeValueAsSectionOffset(DW_AT_ranges);
+ auto RangesOffset = toSectionOffset(find(DW_AT_ranges));
if (RangesOffset) {
DWARFDebugRangeList RangeList;
if (U->extractRangeList(*RangesOffset, RangeList))
@@ -267,7 +263,7 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const {
bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const {
for (const auto& R : getAddressRanges()) {
- if (R.first <= Address && Address < R.second)
+ if (R.LowPC <= Address && Address < R.HighPC)
return true;
}
return false;
@@ -284,40 +280,35 @@ const char *
DWARFDie::getName(DINameKind Kind) const {
if (!isValid() || Kind == DINameKind::None)
return nullptr;
- const char *name = nullptr;
// Try to get mangled name only if it was asked for.
if (Kind == DINameKind::LinkageName) {
- if ((name = getAttributeValueAsString(DW_AT_MIPS_linkage_name, nullptr)))
- return name;
- if ((name = getAttributeValueAsString(DW_AT_linkage_name, nullptr)))
- return name;
+ if (auto Name = dwarf::toString(findRecursively({DW_AT_MIPS_linkage_name,
+ DW_AT_linkage_name}), nullptr))
+ return Name;
}
- if ((name = getAttributeValueAsString(DW_AT_name, nullptr)))
- return name;
- // Try to get name from specification DIE.
- DWARFDie SpecDie = getAttributeValueAsReferencedDie(DW_AT_specification);
- if (SpecDie && (name = SpecDie.getName(Kind)))
- return name;
- // Try to get name from abstract origin DIE.
- DWARFDie AbsDie = getAttributeValueAsReferencedDie(DW_AT_abstract_origin);
- if (AbsDie && (name = AbsDie.getName(Kind)))
- return name;
+ if (auto Name = dwarf::toString(findRecursively(DW_AT_name), nullptr))
+ return Name;
return nullptr;
}
+uint64_t DWARFDie::getDeclLine() const {
+ return toUnsigned(findRecursively(DW_AT_decl_line), 0);
+}
+
void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine,
- uint32_t &CallColumn) const {
- CallFile = getAttributeValueAsUnsignedConstant(DW_AT_call_file).getValueOr(0);
- CallLine = getAttributeValueAsUnsignedConstant(DW_AT_call_line).getValueOr(0);
- CallColumn =
- getAttributeValueAsUnsignedConstant(DW_AT_call_column).getValueOr(0);
+ uint32_t &CallColumn,
+ uint32_t &CallDiscriminator) const {
+ CallFile = toUnsigned(find(DW_AT_call_file), 0);
+ CallLine = toUnsigned(find(DW_AT_call_line), 0);
+ CallColumn = toUnsigned(find(DW_AT_call_column), 0);
+ CallDiscriminator = toUnsigned(find(DW_AT_GNU_discriminator), 0);
}
-void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
- unsigned Indent) const {
+void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth, unsigned Indent,
+ DIDumpOptions DumpOpts) const {
if (!isValid())
return;
- DataExtractor debug_info_data = U->getDebugInfoExtractor();
+ DWARFDataExtractor debug_info_data = U->getDebugInfoExtractor();
const uint32_t Offset = getOffset();
uint32_t offset = Offset;
@@ -334,20 +325,28 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
else
WithColor(OS, syntax::Tag).get().indent(Indent)
<< format("DW_TAG_Unknown_%x", getTag());
-
- OS << format(" [%u] %c\n", abbrCode,
- AbbrevDecl->hasChildren() ? '*' : ' ');
-
+
+ if (!DumpOpts.Brief)
+ OS << format(" [%u] %c", abbrCode,
+ AbbrevDecl->hasChildren() ? '*' : ' ');
+ OS << '\n';
+
// Dump all data in the DIE for the attributes.
for (const auto &AttrSpec : AbbrevDecl->attributes()) {
+ if (AttrSpec.Form == DW_FORM_implicit_const) {
+ // We are dumping the .debug_info section; implicit_const attribute
+ // values are not really stored here, but in the .debug_abbrev section,
+ // so we just skip such attributes.
+ continue;
+ }
dumpAttribute(OS, *this, &offset, AttrSpec.Attr, AttrSpec.Form,
- Indent);
+ Indent, DumpOpts);
}
DWARFDie child = getFirstChild();
if (RecurseDepth > 0 && child) {
while (child) {
- child.dump(OS, RecurseDepth-1, Indent+2);
+ child.dump(OS, RecurseDepth-1, Indent+2, DumpOpts);
child = child.getSibling();
}
}
@@ -361,33 +360,6 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
}
}
-
-void DWARFDie::getInlinedChainForAddress(
- const uint64_t Address, SmallVectorImpl<DWARFDie> &InlinedChain) const {
- if (isNULL())
- return;
- DWARFDie DIE(*this);
- while (DIE) {
- // Append current DIE to inlined chain only if it has correct tag
- // (e.g. it is not a lexical block).
- if (DIE.isSubroutineDIE())
- InlinedChain.push_back(DIE);
-
- // Try to get child which also contains provided address.
- DWARFDie Child = DIE.getFirstChild();
- while (Child) {
- if (Child.addressRangeContainsAddress(Address)) {
- // Assume there is only one such child.
- break;
- }
- Child = Child.getSibling();
- }
- DIE = Child;
- }
- // Reverse the obtained chain to make the root of inlined chain last.
- std::reverse(InlinedChain.begin(), InlinedChain.end());
-}
-
DWARFDie DWARFDie::getParent() const {
if (isValid())
return U->getParent(Die);
@@ -399,3 +371,53 @@ DWARFDie DWARFDie::getSibling() const {
return U->getSibling(Die);
return DWARFDie();
}
+
+iterator_range<DWARFDie::attribute_iterator>
+DWARFDie::attributes() const {
+ return make_range(attribute_iterator(*this, false),
+ attribute_iterator(*this, true));
+}
+
+DWARFDie::attribute_iterator::attribute_iterator(DWARFDie D, bool End) :
+ Die(D), AttrValue(0), Index(0) {
+ auto AbbrDecl = Die.getAbbreviationDeclarationPtr();
+ assert(AbbrDecl && "Must have abbreviation declaration");
+ if (End) {
+ // This is the end iterator so we set the index to the attribute count.
+ Index = AbbrDecl->getNumAttributes();
+ } else {
+ // This is the begin iterator so we extract the value for this->Index.
+ AttrValue.Offset = D.getOffset() + AbbrDecl->getCodeByteSize();
+ updateForIndex(*AbbrDecl, 0);
+ }
+}
+
+void DWARFDie::attribute_iterator::updateForIndex(
+ const DWARFAbbreviationDeclaration &AbbrDecl, uint32_t I) {
+ Index = I;
+ // AbbrDecl must be valid before calling this function.
+ auto NumAttrs = AbbrDecl.getNumAttributes();
+ if (Index < NumAttrs) {
+ AttrValue.Attr = AbbrDecl.getAttrByIndex(Index);
+ // Add the byte size of the previous attribute value, if any.
+ AttrValue.Offset += AttrValue.ByteSize;
+ AttrValue.Value.setForm(AbbrDecl.getFormByIndex(Index));
+ uint32_t ParseOffset = AttrValue.Offset;
+ auto U = Die.getDwarfUnit();
+ assert(U && "Die must have valid DWARF unit");
+ bool b = AttrValue.Value.extractValue(U->getDebugInfoExtractor(),
+ &ParseOffset, U);
+ (void)b;
+ assert(b && "extractValue cannot fail on fully parsed DWARF");
+ AttrValue.ByteSize = ParseOffset - AttrValue.Offset;
+ } else {
+ assert(Index == NumAttrs && "Indexes should be [0, NumAttrs) only");
+ AttrValue.clear();
+ }
+}
+
+DWARFDie::attribute_iterator &DWARFDie::attribute_iterator::operator++() {
+ if (auto AbbrDecl = Die.getAbbreviationDeclarationPtr())
+ updateForIndex(*AbbrDecl, Index + 1);
+ return *this;
+}
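
The new attribute_iterator gives DWARFDie a range-based attributes() API. A minimal sketch of iterating it, assuming a valid DWARFDie that has an abbreviation declaration, i.e. not a NULL DIE (printAttributeForms is a hypothetical helper; each element is a DWARFAttribute carrying the attribute code, its form, and the extracted value):

#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hypothetical helper: list each attribute with its form encoding.
static void printAttributeForms(const DWARFDie &Die, raw_ostream &OS) {
  for (const auto &A : Die.attributes()) {
    OS << dwarf::AttributeString(A.Attr) << " ("
       << dwarf::FormEncodingString(A.Value.getForm()) << ")\n";
  }
}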
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index dc9310d..83a7792 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFFormValue.cpp ------------------------------------------------===//
+//===- DWARFFormValue.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,264 +7,240 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "SyntaxHighlighting.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include <cassert>
+#include <cinttypes>
+#include <cstdint>
#include <limits>
+
using namespace llvm;
using namespace dwarf;
using namespace syntax;
static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
- DWARFFormValue::FC_Unknown, // 0x0
- DWARFFormValue::FC_Address, // 0x01 DW_FORM_addr
- DWARFFormValue::FC_Unknown, // 0x02 unused
- DWARFFormValue::FC_Block, // 0x03 DW_FORM_block2
- DWARFFormValue::FC_Block, // 0x04 DW_FORM_block4
- DWARFFormValue::FC_Constant, // 0x05 DW_FORM_data2
- // --- These can be FC_SectionOffset in DWARF3 and below:
- DWARFFormValue::FC_Constant, // 0x06 DW_FORM_data4
- DWARFFormValue::FC_Constant, // 0x07 DW_FORM_data8
- // ---
- DWARFFormValue::FC_String, // 0x08 DW_FORM_string
- DWARFFormValue::FC_Block, // 0x09 DW_FORM_block
- DWARFFormValue::FC_Block, // 0x0a DW_FORM_block1
- DWARFFormValue::FC_Constant, // 0x0b DW_FORM_data1
- DWARFFormValue::FC_Flag, // 0x0c DW_FORM_flag
- DWARFFormValue::FC_Constant, // 0x0d DW_FORM_sdata
- DWARFFormValue::FC_String, // 0x0e DW_FORM_strp
- DWARFFormValue::FC_Constant, // 0x0f DW_FORM_udata
- DWARFFormValue::FC_Reference, // 0x10 DW_FORM_ref_addr
- DWARFFormValue::FC_Reference, // 0x11 DW_FORM_ref1
- DWARFFormValue::FC_Reference, // 0x12 DW_FORM_ref2
- DWARFFormValue::FC_Reference, // 0x13 DW_FORM_ref4
- DWARFFormValue::FC_Reference, // 0x14 DW_FORM_ref8
- DWARFFormValue::FC_Reference, // 0x15 DW_FORM_ref_udata
- DWARFFormValue::FC_Indirect, // 0x16 DW_FORM_indirect
- DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
- DWARFFormValue::FC_Exprloc, // 0x18 DW_FORM_exprloc
- DWARFFormValue::FC_Flag, // 0x19 DW_FORM_flag_present
-};
-
-namespace {
-
-/// A helper class that can be used in DWARFFormValue.cpp functions that need
-/// to know the byte size of DW_FORM values that vary in size depending on the
-/// DWARF version, address byte size, or DWARF32 or DWARF64.
-class FormSizeHelper {
- uint16_t Version;
- uint8_t AddrSize;
- llvm::dwarf::DwarfFormat Format;
-
-public:
- FormSizeHelper(uint16_t V, uint8_t A, llvm::dwarf::DwarfFormat F)
- : Version(V), AddrSize(A), Format(F) {}
- uint8_t getAddressByteSize() const { return AddrSize; }
- uint8_t getRefAddrByteSize() const {
- if (Version == 2)
- return AddrSize;
- return getDwarfOffsetByteSize();
- }
- uint8_t getDwarfOffsetByteSize() const {
- switch (Format) {
- case dwarf::DwarfFormat::DWARF32:
- return 4;
- case dwarf::DwarfFormat::DWARF64:
- return 8;
- }
- llvm_unreachable("Invalid Format value");
- }
+ DWARFFormValue::FC_Unknown, // 0x0
+ DWARFFormValue::FC_Address, // 0x01 DW_FORM_addr
+ DWARFFormValue::FC_Unknown, // 0x02 unused
+ DWARFFormValue::FC_Block, // 0x03 DW_FORM_block2
+ DWARFFormValue::FC_Block, // 0x04 DW_FORM_block4
+ DWARFFormValue::FC_Constant, // 0x05 DW_FORM_data2
+ // --- These can be FC_SectionOffset in DWARF3 and below:
+ DWARFFormValue::FC_Constant, // 0x06 DW_FORM_data4
+ DWARFFormValue::FC_Constant, // 0x07 DW_FORM_data8
+ // ---
+ DWARFFormValue::FC_String, // 0x08 DW_FORM_string
+ DWARFFormValue::FC_Block, // 0x09 DW_FORM_block
+ DWARFFormValue::FC_Block, // 0x0a DW_FORM_block1
+ DWARFFormValue::FC_Constant, // 0x0b DW_FORM_data1
+ DWARFFormValue::FC_Flag, // 0x0c DW_FORM_flag
+ DWARFFormValue::FC_Constant, // 0x0d DW_FORM_sdata
+ DWARFFormValue::FC_String, // 0x0e DW_FORM_strp
+ DWARFFormValue::FC_Constant, // 0x0f DW_FORM_udata
+ DWARFFormValue::FC_Reference, // 0x10 DW_FORM_ref_addr
+ DWARFFormValue::FC_Reference, // 0x11 DW_FORM_ref1
+ DWARFFormValue::FC_Reference, // 0x12 DW_FORM_ref2
+ DWARFFormValue::FC_Reference, // 0x13 DW_FORM_ref4
+ DWARFFormValue::FC_Reference, // 0x14 DW_FORM_ref8
+ DWARFFormValue::FC_Reference, // 0x15 DW_FORM_ref_udata
+ DWARFFormValue::FC_Indirect, // 0x16 DW_FORM_indirect
+ DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
+ DWARFFormValue::FC_Exprloc, // 0x18 DW_FORM_exprloc
+ DWARFFormValue::FC_Flag, // 0x19 DW_FORM_flag_present
};
-} // end anonymous namespace
-
-template <class T>
-static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) {
+Optional<uint8_t>
+DWARFFormValue::getFixedByteSize(dwarf::Form Form,
+ const DWARFFormParams Params) {
switch (Form) {
- case DW_FORM_addr:
- if (U)
- return U->getAddressByteSize();
- return None;
+ case DW_FORM_addr:
+ assert(Params.Version && Params.AddrSize && "Invalid Params for form");
+ return Params.AddrSize;
+
+ case DW_FORM_block: // ULEB128 length L followed by L bytes.
+ case DW_FORM_block1: // 1 byte length L followed by L bytes.
+ case DW_FORM_block2: // 2 byte length L followed by L bytes.
+ case DW_FORM_block4: // 4 byte length L followed by L bytes.
+ case DW_FORM_string: // C-string with null terminator.
+ case DW_FORM_sdata: // SLEB128.
+ case DW_FORM_udata: // ULEB128.
+ case DW_FORM_ref_udata: // ULEB128.
+ case DW_FORM_indirect: // ULEB128.
+ case DW_FORM_exprloc: // ULEB128 length L followed by L bytes.
+ case DW_FORM_strx: // ULEB128.
+ case DW_FORM_addrx: // ULEB128.
+ case DW_FORM_loclistx: // ULEB128.
+ case DW_FORM_rnglistx: // ULEB128.
+ case DW_FORM_GNU_addr_index: // ULEB128.
+ case DW_FORM_GNU_str_index: // ULEB128.
+ return None;
- case DW_FORM_block: // ULEB128 length L followed by L bytes.
- case DW_FORM_block1: // 1 byte length L followed by L bytes.
- case DW_FORM_block2: // 2 byte length L followed by L bytes.
- case DW_FORM_block4: // 4 byte length L followed by L bytes.
- case DW_FORM_string: // C-string with null terminator.
- case DW_FORM_sdata: // SLEB128.
- case DW_FORM_udata: // ULEB128.
- case DW_FORM_ref_udata: // ULEB128.
- case DW_FORM_indirect: // ULEB128.
- case DW_FORM_exprloc: // ULEB128 length L followed by L bytes.
- case DW_FORM_strx: // ULEB128.
- case DW_FORM_addrx: // ULEB128.
- case DW_FORM_loclistx: // ULEB128.
- case DW_FORM_rnglistx: // ULEB128.
- case DW_FORM_GNU_addr_index: // ULEB128.
- case DW_FORM_GNU_str_index: // ULEB128.
- return None;
+ case DW_FORM_ref_addr:
+ assert(Params.Version && Params.AddrSize && "Invalid Params for form");
+ return Params.getRefAddrByteSize();
- case DW_FORM_ref_addr:
- if (U)
- return U->getRefAddrByteSize();
- return None;
+ case DW_FORM_flag:
+ case DW_FORM_data1:
+ case DW_FORM_ref1:
+ case DW_FORM_strx1:
+ case DW_FORM_addrx1:
+ return 1;
- case DW_FORM_flag:
- case DW_FORM_data1:
- case DW_FORM_ref1:
- return 1;
+ case DW_FORM_data2:
+ case DW_FORM_ref2:
+ case DW_FORM_strx2:
+ case DW_FORM_addrx2:
+ return 2;
- case DW_FORM_data2:
- case DW_FORM_ref2:
- return 2;
+ case DW_FORM_strx3:
+ return 3;
- case DW_FORM_data4:
- case DW_FORM_ref4:
- return 4;
+ case DW_FORM_data4:
+ case DW_FORM_ref4:
+ case DW_FORM_ref_sup4:
+ case DW_FORM_strx4:
+ case DW_FORM_addrx4:
+ return 4;
- case DW_FORM_strp:
- case DW_FORM_GNU_ref_alt:
- case DW_FORM_GNU_strp_alt:
- case DW_FORM_line_strp:
- case DW_FORM_sec_offset:
- case DW_FORM_strp_sup:
- case DW_FORM_ref_sup:
- if (U)
- return U->getDwarfOffsetByteSize();
- return None;
+ case DW_FORM_strp:
+ case DW_FORM_GNU_ref_alt:
+ case DW_FORM_GNU_strp_alt:
+ case DW_FORM_line_strp:
+ case DW_FORM_sec_offset:
+ case DW_FORM_strp_sup:
+ assert(Params.Version && Params.AddrSize && "Invalid Params for form");
+ return Params.getDwarfOffsetByteSize();
- case DW_FORM_data8:
- case DW_FORM_ref8:
- case DW_FORM_ref_sig8:
- return 8;
+ case DW_FORM_data8:
+ case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
+ case DW_FORM_ref_sup8:
+ return 8;
- case DW_FORM_flag_present:
- return 0;
+ case DW_FORM_flag_present:
+ return 0;
- case DW_FORM_data16:
- return 16;
+ case DW_FORM_data16:
+ return 16;
- case DW_FORM_implicit_const:
- // The implicit value is stored in the abbreviation as a SLEB128, and
- // there no data in debug info.
- return 0;
+ case DW_FORM_implicit_const:
+ // The implicit value is stored in the abbreviation as a SLEB128, and
+ // there is no data in the debug info.
+ return 0;
- default:
- llvm_unreachable("Handle this form in this switch statement");
+ default:
+ llvm_unreachable("Handle this form in this switch statement");
}
return None;
}
-template <class T>
-static bool skipFormValue(dwarf::Form Form, const DataExtractor &DebugInfoData,
- uint32_t *OffsetPtr, const T *U) {
+bool DWARFFormValue::skipValue(dwarf::Form Form, DataExtractor DebugInfoData,
+ uint32_t *OffsetPtr,
+ const DWARFFormParams Params) {
bool Indirect = false;
do {
switch (Form) {
- // Blocks of inlined data that have a length field and the data bytes
- // inlined in the .debug_info.
- case DW_FORM_exprloc:
- case DW_FORM_block: {
- uint64_t size = DebugInfoData.getULEB128(OffsetPtr);
- *OffsetPtr += size;
- return true;
- }
- case DW_FORM_block1: {
- uint8_t size = DebugInfoData.getU8(OffsetPtr);
- *OffsetPtr += size;
- return true;
- }
- case DW_FORM_block2: {
- uint16_t size = DebugInfoData.getU16(OffsetPtr);
- *OffsetPtr += size;
- return true;
- }
- case DW_FORM_block4: {
- uint32_t size = DebugInfoData.getU32(OffsetPtr);
- *OffsetPtr += size;
+ // Blocks of inlined data that have a length field and the data bytes
+ // inlined in the .debug_info.
+ case DW_FORM_exprloc:
+ case DW_FORM_block: {
+ uint64_t size = DebugInfoData.getULEB128(OffsetPtr);
+ *OffsetPtr += size;
+ return true;
+ }
+ case DW_FORM_block1: {
+ uint8_t size = DebugInfoData.getU8(OffsetPtr);
+ *OffsetPtr += size;
+ return true;
+ }
+ case DW_FORM_block2: {
+ uint16_t size = DebugInfoData.getU16(OffsetPtr);
+ *OffsetPtr += size;
+ return true;
+ }
+ case DW_FORM_block4: {
+ uint32_t size = DebugInfoData.getU32(OffsetPtr);
+ *OffsetPtr += size;
+ return true;
+ }
+
+ // Inlined NULL terminated C-strings.
+ case DW_FORM_string:
+ DebugInfoData.getCStr(OffsetPtr);
+ return true;
+
+ case DW_FORM_addr:
+ case DW_FORM_ref_addr:
+ case DW_FORM_flag_present:
+ case DW_FORM_data1:
+ case DW_FORM_data2:
+ case DW_FORM_data4:
+ case DW_FORM_data8:
+ case DW_FORM_flag:
+ case DW_FORM_ref1:
+ case DW_FORM_ref2:
+ case DW_FORM_ref4:
+ case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
+ case DW_FORM_ref_sup4:
+ case DW_FORM_ref_sup8:
+ case DW_FORM_strx1:
+ case DW_FORM_strx2:
+ case DW_FORM_strx4:
+ case DW_FORM_addrx1:
+ case DW_FORM_addrx2:
+ case DW_FORM_addrx4:
+ case DW_FORM_sec_offset:
+ case DW_FORM_strp:
+ case DW_FORM_strp_sup:
+ case DW_FORM_line_strp:
+ case DW_FORM_GNU_ref_alt:
+ case DW_FORM_GNU_strp_alt:
+ if (Optional<uint8_t> FixedSize =
+ DWARFFormValue::getFixedByteSize(Form, Params)) {
+ *OffsetPtr += *FixedSize;
return true;
}
+ return false;
- // Inlined NULL terminated C-strings.
- case DW_FORM_string:
- DebugInfoData.getCStr(OffsetPtr);
- return true;
+ // signed or unsigned LEB 128 values.
+ case DW_FORM_sdata:
+ DebugInfoData.getSLEB128(OffsetPtr);
+ return true;
- case DW_FORM_addr:
- case DW_FORM_ref_addr:
- case DW_FORM_flag_present:
- case DW_FORM_data1:
- case DW_FORM_data2:
- case DW_FORM_data4:
- case DW_FORM_data8:
- case DW_FORM_flag:
- case DW_FORM_ref1:
- case DW_FORM_ref2:
- case DW_FORM_ref4:
- case DW_FORM_ref8:
- case DW_FORM_ref_sig8:
- case DW_FORM_ref_sup:
- case DW_FORM_sec_offset:
- case DW_FORM_strp:
- case DW_FORM_strp_sup:
- case DW_FORM_line_strp:
- case DW_FORM_GNU_ref_alt:
- case DW_FORM_GNU_strp_alt:
- if (Optional<uint8_t> FixedSize = ::getFixedByteSize(Form, U)) {
- *OffsetPtr += *FixedSize;
- return true;
- }
- return false;
+ case DW_FORM_udata:
+ case DW_FORM_ref_udata:
+ case DW_FORM_strx:
+ case DW_FORM_addrx:
+ case DW_FORM_loclistx:
+ case DW_FORM_rnglistx:
+ case DW_FORM_GNU_addr_index:
+ case DW_FORM_GNU_str_index:
+ DebugInfoData.getULEB128(OffsetPtr);
+ return true;
- // signed or unsigned LEB 128 values.
- case DW_FORM_sdata:
- DebugInfoData.getSLEB128(OffsetPtr);
- return true;
+ case DW_FORM_indirect:
+ Indirect = true;
+ Form = static_cast<dwarf::Form>(DebugInfoData.getULEB128(OffsetPtr));
+ break;
- case DW_FORM_udata:
- case DW_FORM_ref_udata:
- case DW_FORM_strx:
- case DW_FORM_addrx:
- case DW_FORM_loclistx:
- case DW_FORM_rnglistx:
- case DW_FORM_GNU_addr_index:
- case DW_FORM_GNU_str_index:
- DebugInfoData.getULEB128(OffsetPtr);
- return true;
-
- case DW_FORM_indirect:
- Indirect = true;
- Form = static_cast<dwarf::Form>(DebugInfoData.getULEB128(OffsetPtr));
- break;
-
- default:
- return false;
+ default:
+ return false;
}
} while (Indirect);
return true;
}
-Optional<uint8_t> DWARFFormValue::getFixedByteSize(dwarf::Form Form,
- const DWARFUnit *U) {
- return ::getFixedByteSize(Form, U);
-}
-
-Optional<uint8_t>
-DWARFFormValue::getFixedByteSize(dwarf::Form Form, uint16_t Version,
- uint8_t AddrSize,
- llvm::dwarf::DwarfFormat Format) {
- FormSizeHelper FSH(Version, AddrSize, Format);
- return ::getFixedByteSize(Form, &FSH);
-}
-
bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
// First, check DWARF4 form classes.
if (Form < makeArrayRef(DWARF4FormClasses).size() &&
@@ -279,6 +255,11 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
return (FC == FC_Address);
case DW_FORM_GNU_str_index:
case DW_FORM_GNU_strp_alt:
+ case DW_FORM_strx:
+ case DW_FORM_strx1:
+ case DW_FORM_strx2:
+ case DW_FORM_strx3:
+ case DW_FORM_strx4:
return (FC == FC_String);
case DW_FORM_implicit_const:
return (FC == FC_Constant);
@@ -287,178 +268,170 @@ bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
}
// In DWARF3 DW_FORM_data4 and DW_FORM_data8 served also as a section offset.
// Don't check for DWARF version here, as some producers may still do this
- // by mistake.
- return (Form == DW_FORM_data4 || Form == DW_FORM_data8) &&
+ // by mistake. Also accept DW_FORM_strp, since it is a .debug_str section
+ // offset.
+ return (Form == DW_FORM_data4 || Form == DW_FORM_data8 ||
+ Form == DW_FORM_strp) &&
FC == FC_SectionOffset;
}
-bool DWARFFormValue::extractValue(const DataExtractor &data,
- uint32_t *offset_ptr,
- const DWARFUnit *cu) {
- U = cu;
- bool indirect = false;
- bool is_block = false;
+bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
+ uint32_t *OffsetPtr, const DWARFUnit *CU) {
+ U = CU;
+ bool Indirect = false;
+ bool IsBlock = false;
Value.data = nullptr;
// Read the value for the form into value and follow any DW_FORM_indirect
// instances we run into.
do {
- indirect = false;
+ Indirect = false;
switch (Form) {
case DW_FORM_addr:
case DW_FORM_ref_addr: {
if (!U)
return false;
- uint16_t AddrSize =
- (Form == DW_FORM_addr)
- ? U->getAddressByteSize()
- : U->getRefAddrByteSize();
- RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr);
- if (AI != U->getRelocMap()->end()) {
- Value.uval = data.getUnsigned(offset_ptr, AddrSize) + AI->second.second;
- } else
- Value.uval = data.getUnsigned(offset_ptr, AddrSize);
+ uint16_t Size = (Form == DW_FORM_addr) ? U->getAddressByteSize()
+ : U->getRefAddrByteSize();
+ Value.uval = Data.getRelocatedValue(Size, OffsetPtr, &Value.SectionIndex);
break;
}
case DW_FORM_exprloc:
case DW_FORM_block:
- Value.uval = data.getULEB128(offset_ptr);
- is_block = true;
+ Value.uval = Data.getULEB128(OffsetPtr);
+ IsBlock = true;
break;
case DW_FORM_block1:
- Value.uval = data.getU8(offset_ptr);
- is_block = true;
+ Value.uval = Data.getU8(OffsetPtr);
+ IsBlock = true;
break;
case DW_FORM_block2:
- Value.uval = data.getU16(offset_ptr);
- is_block = true;
+ Value.uval = Data.getU16(OffsetPtr);
+ IsBlock = true;
break;
case DW_FORM_block4:
- Value.uval = data.getU32(offset_ptr);
- is_block = true;
+ Value.uval = Data.getU32(OffsetPtr);
+ IsBlock = true;
break;
case DW_FORM_data1:
case DW_FORM_ref1:
case DW_FORM_flag:
- Value.uval = data.getU8(offset_ptr);
+ case DW_FORM_strx1:
+ case DW_FORM_addrx1:
+ Value.uval = Data.getU8(OffsetPtr);
break;
case DW_FORM_data2:
case DW_FORM_ref2:
- Value.uval = data.getU16(offset_ptr);
+ case DW_FORM_strx2:
+ case DW_FORM_addrx2:
+ Value.uval = Data.getU16(OffsetPtr);
+ break;
+ case DW_FORM_strx3:
+ Value.uval = Data.getU24(OffsetPtr);
break;
case DW_FORM_data4:
- case DW_FORM_ref4: {
- Value.uval = data.getU32(offset_ptr);
- if (!U)
- break;
- RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr-4);
- if (AI != U->getRelocMap()->end())
- Value.uval += AI->second.second;
+ case DW_FORM_ref4:
+ case DW_FORM_ref_sup4:
+ case DW_FORM_strx4:
+ case DW_FORM_addrx4:
+ Value.uval = Data.getRelocatedValue(4, OffsetPtr);
break;
- }
case DW_FORM_data8:
case DW_FORM_ref8:
- Value.uval = data.getU64(offset_ptr);
+ case DW_FORM_ref_sup8:
+ Value.uval = Data.getU64(OffsetPtr);
break;
case DW_FORM_sdata:
- Value.sval = data.getSLEB128(offset_ptr);
+ Value.sval = Data.getSLEB128(OffsetPtr);
break;
case DW_FORM_udata:
case DW_FORM_ref_udata:
- Value.uval = data.getULEB128(offset_ptr);
+ Value.uval = Data.getULEB128(OffsetPtr);
break;
case DW_FORM_string:
- Value.cstr = data.getCStr(offset_ptr);
+ Value.cstr = Data.getCStr(OffsetPtr);
break;
case DW_FORM_indirect:
- Form = static_cast<dwarf::Form>(data.getULEB128(offset_ptr));
- indirect = true;
+ Form = static_cast<dwarf::Form>(Data.getULEB128(OffsetPtr));
+ Indirect = true;
break;
case DW_FORM_strp:
case DW_FORM_sec_offset:
case DW_FORM_GNU_ref_alt:
case DW_FORM_GNU_strp_alt:
case DW_FORM_line_strp:
- case DW_FORM_strp_sup:
- case DW_FORM_ref_sup: {
+ case DW_FORM_strp_sup: {
if (!U)
return false;
- RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr);
- uint8_t Size = U->getDwarfOffsetByteSize();
- Value.uval = data.getUnsigned(offset_ptr, Size);
- if (AI != U->getRelocMap()->end())
- Value.uval += AI->second.second;
+ Value.uval =
+ Data.getRelocatedValue(U->getDwarfOffsetByteSize(), OffsetPtr);
break;
}
case DW_FORM_flag_present:
Value.uval = 1;
break;
case DW_FORM_ref_sig8:
- Value.uval = data.getU64(offset_ptr);
+ Value.uval = Data.getU64(OffsetPtr);
break;
case DW_FORM_GNU_addr_index:
case DW_FORM_GNU_str_index:
- Value.uval = data.getULEB128(offset_ptr);
+ case DW_FORM_strx:
+ Value.uval = Data.getULEB128(OffsetPtr);
break;
default:
- return false;
+ // DWARFFormValue::skipValue() will have caught this and caused all
+ // DWARF DIEs to fail to be parsed, so this code is not reachable.
+ llvm_unreachable("unsupported form");
}
- } while (indirect);
+ } while (Indirect);
- if (is_block) {
- StringRef str = data.getData().substr(*offset_ptr, Value.uval);
+ if (IsBlock) {
+ StringRef Str = Data.getData().substr(*OffsetPtr, Value.uval);
Value.data = nullptr;
- if (!str.empty()) {
- Value.data = reinterpret_cast<const uint8_t *>(str.data());
- *offset_ptr += Value.uval;
+ if (!Str.empty()) {
+ Value.data = reinterpret_cast<const uint8_t *>(Str.data());
+ *OffsetPtr += Value.uval;
}
}
return true;
}
-bool DWARFFormValue::skipValue(DataExtractor DebugInfoData,
- uint32_t *offset_ptr, const DWARFUnit *U) const {
- return DWARFFormValue::skipValue(Form, DebugInfoData, offset_ptr, U);
-}
-
-bool DWARFFormValue::skipValue(dwarf::Form form, DataExtractor DebugInfoData,
- uint32_t *offset_ptr, const DWARFUnit *U) {
- return skipFormValue(form, DebugInfoData, offset_ptr, U);
-}
-
-bool DWARFFormValue::skipValue(dwarf::Form form, DataExtractor DebugInfoData,
- uint32_t *offset_ptr, uint16_t Version,
- uint8_t AddrSize,
- llvm::dwarf::DwarfFormat Format) {
- FormSizeHelper FSH(Version, AddrSize, Format);
- return skipFormValue(form, DebugInfoData, offset_ptr, &FSH);
-}
-
-void
-DWARFFormValue::dump(raw_ostream &OS) const {
- uint64_t uvalue = Value.uval;
- bool cu_relative_offset = false;
+void DWARFFormValue::dump(raw_ostream &OS) const {
+ uint64_t UValue = Value.uval;
+ bool CURelativeOffset = false;
switch (Form) {
- case DW_FORM_addr: OS << format("0x%016" PRIx64, uvalue); break;
+ case DW_FORM_addr:
+ OS << format("0x%016" PRIx64, UValue);
+ break;
case DW_FORM_GNU_addr_index: {
- OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue);
+ OS << format(" indexed (%8.8x) address = ", (uint32_t)UValue);
uint64_t Address;
if (U == nullptr)
OS << "<invalid dwarf unit>";
- else if (U->getAddrOffsetSectionItem(uvalue, Address))
+ else if (U->getAddrOffsetSectionItem(UValue, Address))
OS << format("0x%016" PRIx64, Address);
else
OS << "<no .debug_addr section>";
break;
}
- case DW_FORM_flag_present: OS << "true"; break;
+ case DW_FORM_flag_present:
+ OS << "true";
+ break;
case DW_FORM_flag:
- case DW_FORM_data1: OS << format("0x%02x", (uint8_t)uvalue); break;
- case DW_FORM_data2: OS << format("0x%04x", (uint16_t)uvalue); break;
- case DW_FORM_data4: OS << format("0x%08x", (uint32_t)uvalue); break;
+ case DW_FORM_data1:
+ OS << format("0x%02x", (uint8_t)UValue);
+ break;
+ case DW_FORM_data2:
+ OS << format("0x%04x", (uint16_t)UValue);
+ break;
+ case DW_FORM_data4:
+ OS << format("0x%08x", (uint32_t)UValue);
+ break;
case DW_FORM_ref_sig8:
- case DW_FORM_data8: OS << format("0x%016" PRIx64, uvalue); break;
+ case DW_FORM_data8:
+ OS << format("0x%016" PRIx64, UValue);
+ break;
case DW_FORM_string:
OS << '"';
OS.write_escaped(Value.cstr);
@@ -469,83 +442,97 @@ DWARFFormValue::dump(raw_ostream &OS) const {
case DW_FORM_block1:
case DW_FORM_block2:
case DW_FORM_block4:
- if (uvalue > 0) {
+ if (UValue > 0) {
switch (Form) {
case DW_FORM_exprloc:
- case DW_FORM_block: OS << format("<0x%" PRIx64 "> ", uvalue); break;
- case DW_FORM_block1: OS << format("<0x%2.2x> ", (uint8_t)uvalue); break;
- case DW_FORM_block2: OS << format("<0x%4.4x> ", (uint16_t)uvalue); break;
- case DW_FORM_block4: OS << format("<0x%8.8x> ", (uint32_t)uvalue); break;
- default: break;
+ case DW_FORM_block:
+ OS << format("<0x%" PRIx64 "> ", UValue);
+ break;
+ case DW_FORM_block1:
+ OS << format("<0x%2.2x> ", (uint8_t)UValue);
+ break;
+ case DW_FORM_block2:
+ OS << format("<0x%4.4x> ", (uint16_t)UValue);
+ break;
+ case DW_FORM_block4:
+ OS << format("<0x%8.8x> ", (uint32_t)UValue);
+ break;
+ default:
+ break;
}
- const uint8_t* data_ptr = Value.data;
- if (data_ptr) {
- // uvalue contains size of block
- const uint8_t* end_data_ptr = data_ptr + uvalue;
- while (data_ptr < end_data_ptr) {
- OS << format("%2.2x ", *data_ptr);
- ++data_ptr;
+ const uint8_t *DataPtr = Value.data;
+ if (DataPtr) {
+ // UValue contains size of block
+ const uint8_t *EndDataPtr = DataPtr + UValue;
+ while (DataPtr < EndDataPtr) {
+ OS << format("%2.2x ", *DataPtr);
+ ++DataPtr;
}
- }
- else
+ } else
OS << "NULL";
}
break;
- case DW_FORM_sdata: OS << Value.sval; break;
- case DW_FORM_udata: OS << Value.uval; break;
- case DW_FORM_strp: {
- OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
+ case DW_FORM_sdata:
+ OS << Value.sval;
+ break;
+ case DW_FORM_udata:
+ OS << Value.uval;
+ break;
+ case DW_FORM_strp:
+ OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)UValue);
dumpString(OS);
break;
- }
- case DW_FORM_GNU_str_index: {
- OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
+ case DW_FORM_strx:
+ case DW_FORM_strx1:
+ case DW_FORM_strx2:
+ case DW_FORM_strx3:
+ case DW_FORM_strx4:
+ case DW_FORM_GNU_str_index:
+ OS << format(" indexed (%8.8x) string = ", (uint32_t)UValue);
dumpString(OS);
break;
- }
- case DW_FORM_GNU_strp_alt: {
- OS << format("alt indirect string, offset: 0x%" PRIx64 "", uvalue);
+ case DW_FORM_GNU_strp_alt:
+ OS << format("alt indirect string, offset: 0x%" PRIx64 "", UValue);
dumpString(OS);
break;
- }
case DW_FORM_ref_addr:
- OS << format("0x%016" PRIx64, uvalue);
+ OS << format("0x%016" PRIx64, UValue);
break;
case DW_FORM_ref1:
- cu_relative_offset = true;
- OS << format("cu + 0x%2.2x", (uint8_t)uvalue);
+ CURelativeOffset = true;
+ OS << format("cu + 0x%2.2x", (uint8_t)UValue);
break;
case DW_FORM_ref2:
- cu_relative_offset = true;
- OS << format("cu + 0x%4.4x", (uint16_t)uvalue);
+ CURelativeOffset = true;
+ OS << format("cu + 0x%4.4x", (uint16_t)UValue);
break;
case DW_FORM_ref4:
- cu_relative_offset = true;
- OS << format("cu + 0x%4.4x", (uint32_t)uvalue);
+ CURelativeOffset = true;
+ OS << format("cu + 0x%4.4x", (uint32_t)UValue);
break;
case DW_FORM_ref8:
- cu_relative_offset = true;
- OS << format("cu + 0x%8.8" PRIx64, uvalue);
+ CURelativeOffset = true;
+ OS << format("cu + 0x%8.8" PRIx64, UValue);
break;
case DW_FORM_ref_udata:
- cu_relative_offset = true;
- OS << format("cu + 0x%" PRIx64, uvalue);
+ CURelativeOffset = true;
+ OS << format("cu + 0x%" PRIx64, UValue);
break;
case DW_FORM_GNU_ref_alt:
- OS << format("<alt 0x%" PRIx64 ">", uvalue);
+ OS << format("<alt 0x%" PRIx64 ">", UValue);
break;
- // All DW_FORM_indirect attributes should be resolved prior to calling
- // this function
+ // All DW_FORM_indirect attributes should be resolved prior to calling
+ // this function
case DW_FORM_indirect:
OS << "DW_FORM_indirect";
break;
- // Should be formatted to 64-bit for DWARF64.
+ // Should be formatted to 64-bit for DWARF64.
case DW_FORM_sec_offset:
- OS << format("0x%08x", (uint32_t)uvalue);
+ OS << format("0x%08x", (uint32_t)UValue);
break;
default:
@@ -553,10 +540,10 @@ DWARFFormValue::dump(raw_ostream &OS) const {
break;
}
- if (cu_relative_offset) {
+ if (CURelativeOffset) {
OS << " => {";
WithColor(OS, syntax::Address).get()
- << format("0x%8.8" PRIx64, uvalue + (U ? U->getOffset() : 0));
+ << format("0x%8.8" PRIx64, UValue + (U ? U->getOffset() : 0));
OS << "}";
}
}
@@ -580,8 +567,10 @@ Optional<const char *> DWARFFormValue::getAsCString() const {
if (Form == DW_FORM_GNU_strp_alt || U == nullptr)
return None;
uint32_t Offset = Value.uval;
- if (Form == DW_FORM_GNU_str_index) {
- uint32_t StrOffset;
+ if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx ||
+ Form == DW_FORM_strx1 || Form == DW_FORM_strx2 || Form == DW_FORM_strx3 ||
+ Form == DW_FORM_strx4) {
+ uint64_t StrOffset;
if (!U->getStringOffsetSectionItem(Offset, StrOffset))
return None;
Offset = StrOffset;
@@ -633,15 +622,16 @@ Optional<uint64_t> DWARFFormValue::getAsSectionOffset() const {
}
Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
- if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag))
- || Form == DW_FORM_sdata)
+ if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag)) ||
+ Form == DW_FORM_sdata)
return None;
return Value.uval;
}
Optional<int64_t> DWARFFormValue::getAsSignedConstant() const {
if ((!isFormClass(FC_Constant) && !isFormClass(FC_Flag)) ||
- (Form == DW_FORM_udata && uint64_t(std::numeric_limits<int64_t>::max()) < Value.uval))
+ (Form == DW_FORM_udata &&
+ uint64_t(std::numeric_limits<int64_t>::max()) < Value.uval))
return None;
switch (Form) {
case DW_FORM_data4:
@@ -674,4 +664,3 @@ Optional<uint64_t> DWARFFormValue::getAsReferenceUVal() const {
return None;
return Value.uval;
}
-
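
Taken together with the DWARFDie changes above, the typed getAttributeValueAs* getters are replaced by find()/findRecursively() plus the dwarf::to* unwrapping helpers. A minimal caller sketch, assuming a valid DWARFDie (summarizeDie is a hypothetical helper; the second argument of each to* helper is the default returned when the attribute is absent or has the wrong form class):

#include "llvm/DebugInfo/DWARF/DWARFDie.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::dwarf;

// Hypothetical helper: print a DIE's name and declaration line.
static void summarizeDie(const DWARFDie &Die, raw_ostream &OS) {
  const char *Name = toString(Die.find(DW_AT_name), "<unnamed>");
  uint64_t DeclLine = toUnsigned(Die.findRecursively(DW_AT_decl_line), 0);
  OS << Name << " declared at line " << DeclLine << '\n';
}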
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
index ebb9961..ebd6104 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFGdbIndex.cpp -------------------------------------------------===//
+//===- DWARFGdbIndex.cpp --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,9 +8,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <utility>
using namespace llvm;
@@ -33,8 +39,9 @@ void DWARFGdbIndex::dumpAddressArea(raw_ostream &OS) const {
<< '\n';
for (const AddressEntry &Addr : AddressArea)
OS << format(
- " Low address = 0x%llx, High address = 0x%llx, CU index = %d\n",
- Addr.LowAddress, Addr.HighAddress, Addr.CuIndex);
+ " Low/High address = [0x%llx, 0x%llx) (Size: 0x%llx), CU id = %d\n",
+ Addr.LowAddress, Addr.HighAddress, Addr.HighAddress - Addr.LowAddress,
+ Addr.CuIndex);
}
void DWARFGdbIndex::dumpSymbolTable(raw_ostream &OS) const {
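
The address-area dump now prints the half-open interval together with its size. A standalone sketch of the same formatting with hypothetical values:

#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

using namespace llvm;

// Hypothetical entry: HighAddress is exclusive, hence the ')' bracket.
static void dumpEntry(raw_ostream &OS) {
  unsigned long long Low = 0x1000, High = 0x1040;
  uint32_t CuIndex = 2;
  OS << format(
      " Low/High address = [0x%llx, 0x%llx) (Size: 0x%llx), CU id = %d\n",
      Low, High, High - Low, CuIndex);
}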
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index 88fb203..fd1684d 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
+//===- DWARFTypeUnit.cpp --------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,10 +8,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
-#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
using namespace llvm;
@@ -21,27 +24,33 @@ bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
return false;
TypeHash = debug_info.getU64(offset_ptr);
TypeOffset = debug_info.getU32(offset_ptr);
- return TypeOffset < getLength();
+ // TypeOffset is relative to the beginning of the header,
+ // so we have to account for the leading length field.
+ // FIXME: The size of the length field is 12 in DWARF64.
+ unsigned SizeOfLength = 4;
+ return TypeOffset < getLength() + SizeOfLength;
}
void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) {
DWARFDie TD = getDIEForOffset(TypeOffset + getOffset());
- const char *Name = TD.getAttributeValueAsString(llvm::dwarf::DW_AT_name, "");
+ const char *Name = TD.getName(DINameKind::ShortName);
if (SummarizeTypes) {
OS << "name = '" << Name << "'"
- << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+ << " type_signature = " << format("0x%016" PRIx64, TypeHash)
<< " length = " << format("0x%08x", getLength()) << '\n';
return;
}
OS << format("0x%08x", getOffset()) << ": Type Unit:"
<< " length = " << format("0x%08x", getLength())
- << " version = " << format("0x%04x", getVersion())
- << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+ << " version = " << format("0x%04x", getVersion());
+ if (getVersion() >= 5)
+ OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
+ OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
<< " addr_size = " << format("0x%02x", getAddressByteSize())
<< " name = '" << Name << "'"
- << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+ << " type_signature = " << format("0x%016" PRIx64, TypeHash)
<< " type_offset = " << format("0x%04x", TypeOffset)
<< " (next unit at " << format("0x%08x", getNextUnitOffset()) << ")\n";
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
index ee2c569..043bdb8 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFUnit.cpp -----------------------------------------------------===//
+//===- DWARFUnit.cpp ------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,54 +7,51 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
-#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/Casting.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Path.h"
#include <algorithm>
#include <cassert>
+#include <cstddef>
#include <cstdint>
#include <cstdio>
+#include <utility>
#include <vector>
-namespace llvm {
-
+using namespace llvm;
using namespace dwarf;
void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
- parseImpl(C, Section, C.getDebugAbbrev(), C.getRangeSection(),
- C.getStringSection(), StringRef(), C.getAddrSection(),
- C.getLineSection().Data, C.isLittleEndian(), false);
+ parseImpl(C, Section, C.getDebugAbbrev(), &C.getRangeSection(),
+ C.getStringSection(), C.getStringOffsetSection(),
+ &C.getAddrSection(), C.getLineSection(), C.isLittleEndian(), false);
}
void DWARFUnitSectionBase::parseDWO(DWARFContext &C,
const DWARFSection &DWOSection,
DWARFUnitIndex *Index) {
- parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), C.getRangeDWOSection(),
+ parseImpl(C, DWOSection, C.getDebugAbbrevDWO(), &C.getRangeDWOSection(),
C.getStringDWOSection(), C.getStringOffsetDWOSection(),
- C.getAddrSection(), C.getLineDWOSection().Data, C.isLittleEndian(),
+ &C.getAddrSection(), C.getLineDWOSection(), C.isLittleEndian(),
true);
}
DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
- const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
- StringRef SOS, StringRef AOS, StringRef LS, bool LE,
+ const DWARFDebugAbbrev *DA, const DWARFSection *RS,
+ StringRef SS, const DWARFSection &SOS,
+ const DWARFSection *AOS, const DWARFSection &LS, bool LE,
bool IsDWO, const DWARFUnitSectionBase &UnitSection,
const DWARFUnitIndex::Entry *IndexEntry)
: Context(DC), InfoSection(Section), Abbrev(DA), RangeSection(RS),
- LineSection(LS), StringSection(SS), StringOffsetSection([&]() {
- if (IndexEntry)
- if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
- return SOS.slice(C->Offset, C->Offset + C->Length);
- return SOS;
- }()),
+ LineSection(LS), StringSection(SS), StringOffsetSection(SOS),
AddrOffsetSection(AOS), isLittleEndian(LE), isDWO(IsDWO),
UnitSection(UnitSection), IndexEntry(IndexEntry) {
clear();
@@ -64,30 +61,40 @@ DWARFUnit::~DWARFUnit() = default;
bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
uint64_t &Result) const {
- uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize;
- if (AddrOffsetSection.size() < Offset + AddrSize)
+ uint32_t Offset = AddrOffsetSectionBase + Index * getAddressByteSize();
+ if (AddrOffsetSection->Data.size() < Offset + getAddressByteSize())
return false;
- DataExtractor DA(AddrOffsetSection, isLittleEndian, AddrSize);
- Result = DA.getAddress(&Offset);
+ DWARFDataExtractor DA(*AddrOffsetSection, isLittleEndian,
+ getAddressByteSize());
+ Result = DA.getRelocatedAddress(&Offset);
return true;
}
bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
- uint32_t &Result) const {
- // FIXME: string offset section entries are 8-byte for DWARF64.
- const uint32_t ItemSize = 4;
- uint32_t Offset = Index * ItemSize;
- if (StringOffsetSection.size() < Offset + ItemSize)
+ uint64_t &Result) const {
+ unsigned ItemSize = getDwarfOffsetByteSize();
+ uint32_t Offset = StringOffsetSectionBase + Index * ItemSize;
+ if (StringOffsetSection.Data.size() < Offset + ItemSize)
return false;
- DataExtractor DA(StringOffsetSection, isLittleEndian, 0);
- Result = DA.getU32(&Offset);
+ DWARFDataExtractor DA(StringOffsetSection, isLittleEndian, 0);
+ Result = DA.getRelocatedValue(ItemSize, &Offset);
return true;
}
bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
Length = debug_info.getU32(offset_ptr);
- Version = debug_info.getU16(offset_ptr);
- uint64_t AbbrOffset = debug_info.getU32(offset_ptr);
+ // FIXME: Support DWARF64.
+ FormParams.Format = DWARF32;
+ FormParams.Version = debug_info.getU16(offset_ptr);
+ uint64_t AbbrOffset;
+ if (FormParams.Version >= 5) {
+ UnitType = debug_info.getU8(offset_ptr);
+ FormParams.AddrSize = debug_info.getU8(offset_ptr);
+ AbbrOffset = debug_info.getU32(offset_ptr);
+ } else {
+ AbbrOffset = debug_info.getU32(offset_ptr);
+ FormParams.AddrSize = debug_info.getU8(offset_ptr);
+ }
if (IndexEntry) {
if (AbbrOffset)
return false;
@@ -99,15 +106,17 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
return false;
AbbrOffset = AbbrEntry->Offset;
}
- AddrSize = debug_info.getU8(offset_ptr);
bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
- bool VersionOK = DWARFContext::isSupportedVersion(Version);
- bool AddrSizeOK = AddrSize == 4 || AddrSize == 8;
+ bool VersionOK = DWARFContext::isSupportedVersion(getVersion());
+ bool AddrSizeOK = getAddressByteSize() == 4 || getAddressByteSize() == 8;
if (!LengthOK || !VersionOK || !AddrSizeOK)
return false;
+ // Keep track of the highest DWARF version we encounter across all units.
+ Context.setMaxVersionIfGreater(getVersion());
+
Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset);
return Abbrevs != nullptr;
}
@@ -129,10 +138,11 @@ bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
}
bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
- DWARFDebugRangeList &RangeList) const {
+ DWARFDebugRangeList &RangeList) const {
// Require that compile unit is extracted.
assert(!DieArray.empty());
- DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
+ DWARFDataExtractor RangesData(*RangeSection, isLittleEndian,
+ getAddressByteSize());
uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
return RangeList.extract(RangesData, &ActualRangeListOffset);
}
@@ -140,9 +150,8 @@ bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
void DWARFUnit::clear() {
Offset = 0;
Length = 0;
- Version = 0;
Abbrevs = nullptr;
- AddrSize = 0;
+ FormParams = DWARFFormParams({0, 0, DWARF32});
BaseAddr = 0;
RangeSectionBase = 0;
AddrOffsetSectionBase = 0;
@@ -151,11 +160,11 @@ void DWARFUnit::clear() {
}
const char *DWARFUnit::getCompilationDir() {
- return getUnitDIE().getAttributeValueAsString(DW_AT_comp_dir, nullptr);
+ return dwarf::toString(getUnitDIE().find(DW_AT_comp_dir), nullptr);
}
Optional<uint64_t> DWARFUnit::getDWOId() {
- return getUnitDIE().getAttributeValueAsUnsignedConstant(DW_AT_GNU_dwo_id);
+ return toUnsigned(getUnitDIE().find(DW_AT_GNU_dwo_id));
}
void DWARFUnit::extractDIEsToVector(
@@ -169,7 +178,7 @@ void DWARFUnit::extractDIEsToVector(
uint32_t DIEOffset = Offset + getHeaderSize();
uint32_t NextCUOffset = getNextUnitOffset();
DWARFDebugInfoEntry DIE;
- DataExtractor DebugInfoData = getDebugInfoExtractor();
+ DWARFDataExtractor DebugInfoData = getDebugInfoExtractor();
uint32_t Depth = 0;
bool IsCUDie = true;
@@ -225,17 +234,22 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
// If CU DIE was just parsed, copy several attribute values from it.
if (!HasCUDie) {
DWARFDie UnitDie = getUnitDIE();
- auto BaseAddr = UnitDie.getAttributeValueAsAddress(DW_AT_low_pc);
- if (!BaseAddr)
- BaseAddr = UnitDie.getAttributeValueAsAddress(DW_AT_entry_pc);
+ auto BaseAddr = toAddress(UnitDie.find({DW_AT_low_pc, DW_AT_entry_pc}));
if (BaseAddr)
setBaseAddress(*BaseAddr);
- AddrOffsetSectionBase =
- UnitDie.getAttributeValueAsSectionOffset(DW_AT_GNU_addr_base)
- .getValueOr(0);
- RangeSectionBase =
- UnitDie.getAttributeValueAsSectionOffset(DW_AT_rnglists_base)
- .getValueOr(0);
+ AddrOffsetSectionBase = toSectionOffset(UnitDie.find(DW_AT_GNU_addr_base), 0);
+ RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0);
+
+ // In general, we derive the offset of the unit's contribution to the
+ // debug_str_offsets{.dwo} section from the unit DIE's
+ // DW_AT_str_offsets_base attribute. In dwp files we add to it the offset
+ // we get from the index table.
+ StringOffsetSectionBase =
+ toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0);
+ if (IndexEntry)
+ if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS))
+ StringOffsetSectionBase += C->Offset;
+
// Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for
// skeleton CU DIE, so that DWARF users not aware of it are not broken.
}
@@ -243,21 +257,6 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
return DieArray.size();
}
-DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
- : DWOU(nullptr) {
- auto Obj = object::ObjectFile::createObjectFile(DWOPath);
- if (!Obj) {
- // TODO: Actually report errors helpfully.
- consumeError(Obj.takeError());
- return;
- }
- DWOFile = std::move(Obj.get());
- DWOContext.reset(
- cast<DWARFContext>(new DWARFContextInMemory(*DWOFile.getBinary())));
- if (DWOContext->getNumDWOCompileUnits() > 0)
- DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
-}
-
bool DWARFUnit::parseDWO() {
if (isDWO)
return false;
@@ -266,28 +265,31 @@ bool DWARFUnit::parseDWO() {
DWARFDie UnitDie = getUnitDIE();
if (!UnitDie)
return false;
- const char *DWOFileName =
- UnitDie.getAttributeValueAsString(DW_AT_GNU_dwo_name, nullptr);
+ auto DWOFileName = dwarf::toString(UnitDie.find(DW_AT_GNU_dwo_name));
if (!DWOFileName)
return false;
- const char *CompilationDir =
- UnitDie.getAttributeValueAsString(DW_AT_comp_dir, nullptr);
+ auto CompilationDir = dwarf::toString(UnitDie.find(DW_AT_comp_dir));
SmallString<16> AbsolutePath;
- if (sys::path::is_relative(DWOFileName) && CompilationDir != nullptr) {
- sys::path::append(AbsolutePath, CompilationDir);
+ if (sys::path::is_relative(*DWOFileName) && CompilationDir &&
+ *CompilationDir) {
+ sys::path::append(AbsolutePath, *CompilationDir);
}
- sys::path::append(AbsolutePath, DWOFileName);
- DWO = llvm::make_unique<DWOHolder>(AbsolutePath);
- DWARFUnit *DWOCU = DWO->getUnit();
- // Verify that compile unit in .dwo file is valid.
- if (!DWOCU || DWOCU->getDWOId() != getDWOId()) {
- DWO.reset();
+ sys::path::append(AbsolutePath, *DWOFileName);
+ auto DWOId = getDWOId();
+ if (!DWOId)
return false;
- }
+ auto DWOContext = Context.getDWOContext(AbsolutePath);
+ if (!DWOContext)
+ return false;
+
+ DWARFCompileUnit *DWOCU = DWOContext->getDWOCompileUnitForHash(*DWOId);
+ if (!DWOCU)
+ return false;
+ DWO = std::shared_ptr<DWARFCompileUnit>(std::move(DWOContext), DWOCU);
// Share .debug_addr and .debug_ranges section with compile unit in .dwo
- DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
+ DWO->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
auto DWORangesBase = UnitDie.getRangesBaseAttribute();
- DWOCU->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0);
+ DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0);
return true;
}
@@ -330,8 +332,8 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {
// Collect address ranges from DIEs in .dwo if necessary.
bool DWOCreated = parseDWO();
- if (DWO.get())
- DWO->getUnit()->collectAddressRanges(CURanges);
+ if (DWO)
+ DWO->collectAddressRanges(CURanges);
if (DWOCreated)
DWO.reset();
@@ -341,41 +343,67 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) {
clearDIEs(true);
}
-DWARFDie
-DWARFUnit::getSubprogramForAddress(uint64_t Address) {
- extractDIEsIfNeeded(false);
- for (const DWARFDebugInfoEntry &D : DieArray) {
- DWARFDie DIE(this, &D);
- if (DIE.isSubprogramDIE() &&
- DIE.addressRangeContainsAddress(Address)) {
- return DIE;
+void DWARFUnit::updateAddressDieMap(DWARFDie Die) {
+ if (Die.isSubroutineDIE()) {
+ for (const auto &R : Die.getAddressRanges()) {
+ // Ignore 0-sized ranges.
+ if (R.LowPC == R.HighPC)
+ continue;
+ auto B = AddrDieMap.upper_bound(R.LowPC);
+ if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) {
+ // The new range is a sub-range of an existing range; split the
+ // existing range around it.
+ if (R.HighPC < B->second.first)
+ AddrDieMap[R.HighPC] = B->second;
+ if (R.LowPC > B->first)
+ AddrDieMap[B->first].first = R.LowPC;
+ }
+ AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die);
}
}
- return DWARFDie();
+ // Parent DIEs are added to the AddrDieMap before their child DIEs to
+ // simplify the logic that updates AddrDieMap. A child's range is always
+ // equal to or contained within its parent's range, and with this
+ // invariant, adding one range to the map splits an existing range into
+ // at most 3 sub-ranges.
+ for (DWARFDie Child = Die.getFirstChild(); Child; Child = Child.getSibling())
+ updateAddressDieMap(Child);
+}
+
+DWARFDie DWARFUnit::getSubroutineForAddress(uint64_t Address) {
+ extractDIEsIfNeeded(false);
+ if (AddrDieMap.empty())
+ updateAddressDieMap(getUnitDIE());
+ auto R = AddrDieMap.upper_bound(Address);
+ if (R == AddrDieMap.begin())
+ return DWARFDie();
+ // upper_bound's previous item contains Address.
+ --R;
+ if (Address >= R->second.first)
+ return DWARFDie();
+ return R->second.second;
}
void
DWARFUnit::getInlinedChainForAddress(uint64_t Address,
SmallVectorImpl<DWARFDie> &InlinedChain) {
- // First, find a subprogram that contains the given address (the root
- // of inlined chain).
- DWARFDie SubprogramDIE;
+ assert(InlinedChain.empty());
// Try to look for subprogram DIEs in the DWO file.
parseDWO();
- if (DWO)
- SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address);
- else
- SubprogramDIE = getSubprogramForAddress(Address);
-
- // Get inlined chain rooted at this subprogram DIE.
- if (SubprogramDIE)
- SubprogramDIE.getInlinedChainForAddress(Address, InlinedChain);
- else
- InlinedChain.clear();
+ // First, find the subroutine that contains the given address (the leaf
+ // of the inlined chain).
+ DWARFDie SubroutineDIE =
+ (DWO ? DWO.get() : this)->getSubroutineForAddress(Address);
+
+ while (SubroutineDIE) {
+ if (SubroutineDIE.isSubroutineDIE())
+ InlinedChain.push_back(SubroutineDIE);
+ SubroutineDIE = SubroutineDIE.getParent();
+ }
}
-const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context,
- DWARFSectionKind Kind) {
+const DWARFUnitIndex &llvm::getDWARFUnitIndex(DWARFContext &Context,
+ DWARFSectionKind Kind) {
if (Kind == DW_SECT_INFO)
return Context.getCUIndex();
assert(Kind == DW_SECT_TYPES);
@@ -413,11 +441,10 @@ DWARFDie DWARFUnit::getSibling(const DWARFDebugInfoEntry *Die) {
return DWARFDie();
// Find the next DIE whose depth is the same as the Die's depth.
- for (size_t I=getDIEIndex(Die)+1, EndIdx = DieArray.size(); I<EndIdx; ++I) {
+ for (size_t I = getDIEIndex(Die) + 1, EndIdx = DieArray.size(); I < EndIdx;
+ ++I) {
if (DieArray[I].getDepth() == Depth)
return DWARFDie(this, &DieArray[I]);
}
return DWARFDie();
}
-
-} // end namespace llvm
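The AddrDieMap added above is a flat interval map: each entry keys a range's LowPC to a (HighPC, DIE) pair, and getSubroutineForAddress finds the covering range with upper_bound followed by one step back. A minimal standalone sketch of the same lookup technique, assuming half-open [LowPC, HighPC) ranges, with std::string standing in for DWARFDie:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Maps LowPC -> (HighPC, DIE name); ranges are half-open [LowPC, HighPC).
using AddrMap = std::map<uint64_t, std::pair<uint64_t, std::string>>;

// Same technique as DWARFUnit::getSubroutineForAddress: find the first
// entry whose LowPC is greater than Address, step back one entry, then
// check that Address falls before that entry's HighPC.
static const std::string *lookup(const AddrMap &M, uint64_t Address) {
  auto R = M.upper_bound(Address);
  if (R == M.begin())
    return nullptr; // Address precedes every recorded range.
  --R;
  if (Address >= R->second.first)
    return nullptr; // Address falls in a gap after this range.
  return &R->second.second;
}

int main() {
  AddrMap M;
  M[0x1000] = {0x1080, "foo"};
  M[0x2000] = {0x2040, "bar"};
  if (const std::string *Name = lookup(M, 0x1010))
    std::cout << *Name << "\n";     // prints "foo"
  if (!lookup(M, 0x1100))
    std::cout << "no subroutine\n"; // 0x1100 lies in a gap
  return 0;
}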
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
index 96b3169..59b3d0c 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFUnitIndex.cpp ------------------------------------------------===//
+//===- DWARFUnitIndex.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,11 +8,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
-
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
-namespace llvm {
+using namespace llvm;
bool DWARFUnitIndex::Header::parse(DataExtractor IndexData,
uint32_t *OffsetPtr) {
@@ -152,6 +156,7 @@ DWARFUnitIndex::Entry::getOffset(DWARFSectionKind Sec) const {
return &Contributions[i];
return nullptr;
}
+
const DWARFUnitIndex::Entry::SectionContribution *
DWARFUnitIndex::Entry::getOffset() const {
return &Contributions[Index->InfoColumn];
@@ -165,4 +170,3 @@ DWARFUnitIndex::getFromOffset(uint32_t Offset) const {
return &Rows[i];
return nullptr;
}
-}
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
new file mode 100644
index 0000000..4de46be
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -0,0 +1,499 @@
+//===- DWARFVerifier.cpp --------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
+#include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+using namespace dwarf;
+using namespace object;
+
+bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData,
+ uint32_t *Offset, unsigned UnitIndex,
+ uint8_t &UnitType, bool &isUnitDWARF64) {
+ uint32_t AbbrOffset, Length;
+ uint8_t AddrSize = 0;
+ uint16_t Version;
+ bool Success = true;
+
+ bool ValidLength = false;
+ bool ValidVersion = false;
+ bool ValidAddrSize = false;
+ bool ValidType = true;
+ bool ValidAbbrevOffset = true;
+
+ uint32_t OffsetStart = *Offset;
+ Length = DebugInfoData.getU32(Offset);
+ if (Length == UINT32_MAX) {
+ isUnitDWARF64 = true;
+ OS << format(
+ "Unit[%d] is in 64-bit DWARF format; cannot verify from this point.\n",
+ UnitIndex);
+ return false;
+ }
+ Version = DebugInfoData.getU16(Offset);
+
+ if (Version >= 5) {
+ UnitType = DebugInfoData.getU8(Offset);
+ AddrSize = DebugInfoData.getU8(Offset);
+ AbbrOffset = DebugInfoData.getU32(Offset);
+ ValidType = DWARFUnit::isValidUnitType(UnitType);
+ } else {
+ UnitType = 0;
+ AbbrOffset = DebugInfoData.getU32(Offset);
+ AddrSize = DebugInfoData.getU8(Offset);
+ }
+
+ if (!DCtx.getDebugAbbrev()->getAbbreviationDeclarationSet(AbbrOffset))
+ ValidAbbrevOffset = false;
+
+ ValidLength = DebugInfoData.isValidOffset(OffsetStart + Length + 3);
+ ValidVersion = DWARFContext::isSupportedVersion(Version);
+ ValidAddrSize = AddrSize == 4 || AddrSize == 8;
+ if (!ValidLength || !ValidVersion || !ValidAddrSize || !ValidAbbrevOffset ||
+ !ValidType) {
+ Success = false;
+ OS << format("Units[%d] - start offset: 0x%08x \n", UnitIndex, OffsetStart);
+ if (!ValidLength)
+ OS << "\tError: The length for this unit is too "
+ "large for the .debug_info provided.\n";
+ if (!ValidVersion)
+ OS << "\tError: The 16 bit unit header version is not valid.\n";
+ if (!ValidType)
+ OS << "\tError: The unit type encoding is not valid.\n";
+ if (!ValidAbbrevOffset)
+ OS << "\tError: The offset into the .debug_abbrev section is "
+ "not valid.\n";
+ if (!ValidAddrSize)
+ OS << "\tError: The address size is unsupported.\n";
+ }
+ *Offset = OffsetStart + Length + 4;
+ return Success;
+}
+
+bool DWARFVerifier::verifyUnitContents(DWARFUnit Unit) {
+ uint32_t NumUnitErrors = 0;
+ unsigned NumDies = Unit.getNumDIEs();
+ for (unsigned I = 0; I < NumDies; ++I) {
+ auto Die = Unit.getDIEAtIndex(I);
+ if (Die.getTag() == DW_TAG_null)
+ continue;
+ for (auto AttrValue : Die.attributes()) {
+ NumUnitErrors += verifyDebugInfoAttribute(Die, AttrValue);
+ NumUnitErrors += verifyDebugInfoForm(Die, AttrValue);
+ }
+ }
+ return NumUnitErrors == 0;
+}
+
+bool DWARFVerifier::handleDebugInfo() {
+ OS << "Verifying .debug_info Unit Header Chain...\n";
+
+ DWARFDataExtractor DebugInfoData(DCtx.getInfoSection(), DCtx.isLittleEndian(),
+ 0);
+ uint32_t NumDebugInfoErrors = 0;
+ uint32_t OffsetStart = 0, Offset = 0, UnitIdx = 0;
+ uint8_t UnitType = 0;
+ bool isUnitDWARF64 = false;
+ bool isHeaderChainValid = true;
+ bool hasDIE = DebugInfoData.isValidOffset(Offset);
+ while (hasDIE) {
+ OffsetStart = Offset;
+ if (!verifyUnitHeader(DebugInfoData, &Offset, UnitIdx, UnitType,
+ isUnitDWARF64)) {
+ isHeaderChainValid = false;
+ if (isUnitDWARF64)
+ break;
+ } else {
+ std::unique_ptr<DWARFUnit> Unit;
+ switch (UnitType) {
+ case dwarf::DW_UT_type:
+ case dwarf::DW_UT_split_type: {
+ DWARFUnitSection<DWARFTypeUnit> TUSection{};
+ Unit.reset(new DWARFTypeUnit(
+ DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(),
+ &DCtx.getRangeSection(), DCtx.getStringSection(),
+ DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(),
+ DCtx.getLineSection(), DCtx.isLittleEndian(), false, TUSection,
+ nullptr));
+ break;
+ }
+ case dwarf::DW_UT_skeleton:
+ case dwarf::DW_UT_split_compile:
+ case dwarf::DW_UT_compile:
+ case dwarf::DW_UT_partial:
+ // UnitType = 0 means that we are
+ // verifying a compile unit in DWARF v4.
+ case 0: {
+ DWARFUnitSection<DWARFCompileUnit> CUSection{};
+ Unit.reset(new DWARFCompileUnit(
+ DCtx, DCtx.getInfoSection(), DCtx.getDebugAbbrev(),
+ &DCtx.getRangeSection(), DCtx.getStringSection(),
+ DCtx.getStringOffsetSection(), &DCtx.getAppleObjCSection(),
+ DCtx.getLineSection(), DCtx.isLittleEndian(), false, CUSection,
+ nullptr));
+ break;
+ }
+ default: { llvm_unreachable("Invalid UnitType."); }
+ }
+ Unit->extract(DebugInfoData, &OffsetStart);
+ if (!verifyUnitContents(*Unit))
+ ++NumDebugInfoErrors;
+ }
+ hasDIE = DebugInfoData.isValidOffset(Offset);
+ ++UnitIdx;
+ }
+ if (UnitIdx == 0 && !hasDIE) {
+ OS << "Warning: .debug_info is empty.\n";
+ isHeaderChainValid = true;
+ }
+ NumDebugInfoErrors += verifyDebugInfoReferences();
+ return (isHeaderChainValid && NumDebugInfoErrors == 0);
+}
+
+unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
+ DWARFAttribute &AttrValue) {
+ unsigned NumErrors = 0;
+ const auto Attr = AttrValue.Attr;
+ switch (Attr) {
+ case DW_AT_ranges:
+ // Make sure the offset in the DW_AT_ranges attribute is valid.
+ if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) {
+ if (*SectionOffset >= DCtx.getRangeSection().Data.size()) {
+ ++NumErrors;
+ OS << "error: DW_AT_ranges offset is beyond .debug_ranges "
+ "bounds:\n";
+ Die.dump(OS, 0);
+ OS << "\n";
+ }
+ } else {
+ ++NumErrors;
+ OS << "error: DIE has invalid DW_AT_ranges encoding:\n";
+ Die.dump(OS, 0);
+ OS << "\n";
+ }
+ break;
+ case DW_AT_stmt_list:
+ // Make sure the offset in the DW_AT_stmt_list attribute is valid.
+ if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) {
+ if (*SectionOffset >= DCtx.getLineSection().Data.size()) {
+ ++NumErrors;
+ OS << "error: DW_AT_stmt_list offset is beyond .debug_line "
+ "bounds: "
+ << format("0x%08" PRIx64, *SectionOffset) << "\n";
+ Die.dump(OS, 0);
+ OS << "\n";
+ }
+ } else {
+ ++NumErrors;
+ OS << "error: DIE has invalid DW_AT_stmt_list encoding:\n";
+ Die.dump(OS, 0);
+ OS << "\n";
+ }
+ break;
+
+ default:
+ break;
+ }
+ return NumErrors;
+}
+
+unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die,
+ DWARFAttribute &AttrValue) {
+ unsigned NumErrors = 0;
+ const auto Form = AttrValue.Value.getForm();
+ switch (Form) {
+ case DW_FORM_ref1:
+ case DW_FORM_ref2:
+ case DW_FORM_ref4:
+ case DW_FORM_ref8:
+ case DW_FORM_ref_udata: {
+ // Verify all CU relative references are valid CU offsets.
+ Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
+ assert(RefVal);
+ if (RefVal) {
+ auto DieCU = Die.getDwarfUnit();
+ auto CUSize = DieCU->getNextUnitOffset() - DieCU->getOffset();
+ auto CUOffset = AttrValue.Value.getRawUValue();
+ if (CUOffset >= CUSize) {
+ ++NumErrors;
+ OS << "error: " << FormEncodingString(Form) << " CU offset "
+ << format("0x%08" PRIx64, CUOffset)
+ << " is invalid (must be less than CU size of "
+ << format("0x%08" PRIx32, CUSize) << "):\n";
+ Die.dump(OS, 0);
+ OS << "\n";
+ } else {
+ // Valid reference, but we will verify it points to an actual
+ // DIE later.
+ ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset());
+ }
+ }
+ break;
+ }
+ case DW_FORM_ref_addr: {
+ // Verify all absolute DIE references have valid offsets in the
+ // .debug_info section.
+ Optional<uint64_t> RefVal = AttrValue.Value.getAsReference();
+ assert(RefVal);
+ if (RefVal) {
+ if (*RefVal >= DCtx.getInfoSection().Data.size()) {
+ ++NumErrors;
+ OS << "error: DW_FORM_ref_addr offset beyond .debug_info "
+ "bounds:\n";
+ Die.dump(OS, 0);
+ OS << "\n";
+ } else {
+ // Valid reference, but we will verify it points to an actual
+ // DIE later.
+ ReferenceToDIEOffsets[*RefVal].insert(Die.getOffset());
+ }
+ }
+ break;
+ }
+ case DW_FORM_strp: {
+ auto SecOffset = AttrValue.Value.getAsSectionOffset();
+ assert(SecOffset); // DW_FORM_strp is a section offset.
+ if (SecOffset && *SecOffset >= DCtx.getStringSection().size()) {
+ ++NumErrors;
+ OS << "error: DW_FORM_strp offset beyond .debug_str bounds:\n";
+ Die.dump(OS, 0);
+ OS << "\n";
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return NumErrors;
+}
+
+unsigned DWARFVerifier::verifyDebugInfoReferences() {
+ // Take all references and make sure they point to an actual DIE by
+ // looking each offset up; emit an error for any reference whose
+ // offset does not resolve to a DIE.
+ OS << "Verifying .debug_info references...\n";
+ unsigned NumErrors = 0;
+ for (auto Pair : ReferenceToDIEOffsets) {
+ auto Die = DCtx.getDIEForOffset(Pair.first);
+ if (Die)
+ continue;
+ ++NumErrors;
+ OS << "error: invalid DIE reference " << format("0x%08" PRIx64, Pair.first)
+ << ". Offset is in between DIEs:\n";
+ for (auto Offset : Pair.second) {
+ auto ReferencingDie = DCtx.getDIEForOffset(Offset);
+ ReferencingDie.dump(OS, 0);
+ OS << "\n";
+ }
+ OS << "\n";
+ }
+ return NumErrors;
+}
+
+void DWARFVerifier::verifyDebugLineStmtOffsets() {
+ std::map<uint64_t, DWARFDie> StmtListToDie;
+ for (const auto &CU : DCtx.compile_units()) {
+ auto Die = CU->getUnitDIE();
+ // Get the attribute value as a section offset. No need to produce an
+ // error here if the encoding isn't correct because we validate this in
+ // the .debug_info verifier.
+ auto StmtSectionOffset = toSectionOffset(Die.find(DW_AT_stmt_list));
+ if (!StmtSectionOffset)
+ continue;
+ const uint32_t LineTableOffset = *StmtSectionOffset;
+ auto LineTable = DCtx.getLineTableForUnit(CU.get());
+ if (LineTableOffset < DCtx.getLineSection().Data.size()) {
+ if (!LineTable) {
+ ++NumDebugLineErrors;
+ OS << "error: .debug_line[" << format("0x%08" PRIx32, LineTableOffset)
+ << "] was not able to be parsed for CU:\n";
+ Die.dump(OS, 0);
+ OS << '\n';
+ continue;
+ }
+ } else {
+ // Make sure we don't get a valid line table back if the offset is wrong.
+ assert(LineTable == nullptr);
+ // Skip this line table as it isn't valid. No need to create an error
+ // here because we validate this in the .debug_info verifier.
+ continue;
+ }
+ auto Iter = StmtListToDie.find(LineTableOffset);
+ if (Iter != StmtListToDie.end()) {
+ ++NumDebugLineErrors;
+ OS << "error: two compile unit DIEs, "
+ << format("0x%08" PRIx32, Iter->second.getOffset()) << " and "
+ << format("0x%08" PRIx32, Die.getOffset())
+ << ", have the same DW_AT_stmt_list section offset:\n";
+ Iter->second.dump(OS, 0);
+ Die.dump(OS, 0);
+ OS << '\n';
+ // Already verified this line table before, no need to do it again.
+ continue;
+ }
+ StmtListToDie[LineTableOffset] = Die;
+ }
+}
+
+void DWARFVerifier::verifyDebugLineRows() {
+ for (const auto &CU : DCtx.compile_units()) {
+ auto Die = CU->getUnitDIE();
+ auto LineTable = DCtx.getLineTableForUnit(CU.get());
+ // If there is no line table we will have created an error in the
+ // .debug_info verifier or in verifyDebugLineStmtOffsets().
+ if (!LineTable)
+ continue;
+ uint32_t MaxFileIndex = LineTable->Prologue.FileNames.size();
+ uint64_t PrevAddress = 0;
+ uint32_t RowIndex = 0;
+ for (const auto &Row : LineTable->Rows) {
+ if (Row.Address < PrevAddress) {
+ ++NumDebugLineErrors;
+ OS << "error: .debug_line["
+ << format("0x%08" PRIx64,
+ *toSectionOffset(Die.find(DW_AT_stmt_list)))
+ << "] row[" << RowIndex
+ << "] decreases in address from previous row:\n";
+
+ DWARFDebugLine::Row::dumpTableHeader(OS);
+ if (RowIndex > 0)
+ LineTable->Rows[RowIndex - 1].dump(OS);
+ Row.dump(OS);
+ OS << '\n';
+ }
+
+ if (Row.File > MaxFileIndex) {
+ ++NumDebugLineErrors;
+ OS << "error: .debug_line["
+ << format("0x%08" PRIx64,
+ *toSectionOffset(Die.find(DW_AT_stmt_list)))
+ << "][" << RowIndex << "] has invalid file index " << Row.File
+ << " (valid values are [1," << MaxFileIndex << "]):\n";
+ DWARFDebugLine::Row::dumpTableHeader(OS);
+ Row.dump(OS);
+ OS << '\n';
+ }
+ if (Row.EndSequence)
+ PrevAddress = 0;
+ else
+ PrevAddress = Row.Address;
+ ++RowIndex;
+ }
+ }
+}
+
+bool DWARFVerifier::handleDebugLine() {
+ NumDebugLineErrors = 0;
+ OS << "Verifying .debug_line...\n";
+ verifyDebugLineStmtOffsets();
+ verifyDebugLineRows();
+ return NumDebugLineErrors == 0;
+}
+
+bool DWARFVerifier::handleAppleNames() {
+ NumAppleNamesErrors = 0;
+
+ DWARFDataExtractor AppleNamesSection(DCtx.getAppleNamesSection(),
+ DCtx.isLittleEndian(), 0);
+ DataExtractor StrData(DCtx.getStringSection(), DCtx.isLittleEndian(), 0);
+ DWARFAcceleratorTable AppleNames(AppleNamesSection, StrData);
+
+ if (!AppleNames.extract()) {
+ return true;
+ }
+
+ OS << "Verifying .apple_names...\n";
+
+ // Verify that all buckets have a valid hash index or are empty.
+ uint32_t NumBuckets = AppleNames.getNumBuckets();
+ uint32_t NumHashes = AppleNames.getNumHashes();
+
+ uint32_t BucketsOffset =
+ AppleNames.getSizeHdr() + AppleNames.getHeaderDataLength();
+ uint32_t HashesBase = BucketsOffset + NumBuckets * 4;
+ uint32_t OffsetsBase = HashesBase + NumHashes * 4;
+
+ for (uint32_t BucketIdx = 0; BucketIdx < NumBuckets; ++BucketIdx) {
+ uint32_t HashIdx = AppleNamesSection.getU32(&BucketsOffset);
+ if (HashIdx >= NumHashes && HashIdx != UINT32_MAX) {
+ OS << format("error: Bucket[%d] has invalid hash index: %u\n", BucketIdx,
+ HashIdx);
+ ++NumAppleNamesErrors;
+ }
+ }
+
+ uint32_t NumAtoms = AppleNames.getAtomsDesc().size();
+ if (NumAtoms == 0) {
+ OS << "error: no atoms; failed to read HashData\n";
+ ++NumAppleNamesErrors;
+ return false;
+ }
+
+ if (!AppleNames.validateForms()) {
+ OS << "error: unsupported form; failed to read HashData\n";
+ ++NumAppleNamesErrors;
+ return false;
+ }
+
+ for (uint32_t HashIdx = 0; HashIdx < NumHashes; ++HashIdx) {
+ uint32_t HashOffset = HashesBase + 4 * HashIdx;
+ uint32_t DataOffset = OffsetsBase + 4 * HashIdx;
+ uint32_t Hash = AppleNamesSection.getU32(&HashOffset);
+ uint32_t HashDataOffset = AppleNamesSection.getU32(&DataOffset);
+ if (!AppleNamesSection.isValidOffsetForDataOfSize(HashDataOffset,
+ sizeof(uint64_t))) {
+ OS << format("error: Hash[%d] has invalid HashData offset: 0x%08x\n",
+ HashIdx, HashDataOffset);
+ ++NumAppleNamesErrors;
+ }
+
+ uint32_t StrpOffset;
+ uint32_t StringOffset;
+ uint32_t StringCount = 0;
+ uint32_t DieOffset = dwarf::DW_INVALID_OFFSET;
+
+ while ((StrpOffset = AppleNamesSection.getU32(&HashDataOffset)) != 0) {
+ const uint32_t NumHashDataObjects =
+ AppleNamesSection.getU32(&HashDataOffset);
+ for (uint32_t HashDataIdx = 0; HashDataIdx < NumHashDataObjects;
+ ++HashDataIdx) {
+ DieOffset = AppleNames.readAtoms(HashDataOffset);
+ if (!DCtx.getDIEForOffset(DieOffset)) {
+ const uint32_t BucketIdx =
+ NumBuckets ? (Hash % NumBuckets) : UINT32_MAX;
+ StringOffset = StrpOffset;
+ const char *Name = StrData.getCStr(&StringOffset);
+ if (!Name)
+ Name = "<NULL>";
+
+ OS << format(
+ "error: .apple_names Bucket[%d] Hash[%d] = 0x%08x "
+ "Str[%u] = 0x%08x "
+ "DIE[%d] = 0x%08x is not a valid DIE offset for \"%s\".\n",
+ BucketIdx, HashIdx, Hash, StringCount, StrpOffset, HashDataIdx,
+ DieOffset, Name);
+
+ ++NumAppleNamesErrors;
+ }
+ }
+ ++StringCount;
+ }
+ }
+ return NumAppleNamesErrors == 0;
+}
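A possible driver for the new verifier, shown only as a sketch: the DWARFVerifier constructor is declared in DWARFVerifier.h, which is not part of this diff, and is assumed here to take the output stream and the context to verify.

#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"
#include "llvm/Support/raw_ostream.h"

// Runs the three verification passes defined in this file; each pass
// prints its own errors to the stream given to the verifier.
static bool verifyAll(llvm::DWARFContext &DCtx) {
  llvm::DWARFVerifier Verifier(llvm::outs(), DCtx); // assumed ctor
  bool Success = Verifier.handleDebugInfo();        // headers, attrs, refs
  Success = Verifier.handleDebugLine() && Success;  // stmt offsets, rows
  Success = Verifier.handleAppleNames() && Success; // .apple_names table
  return Success;
}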
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
index 4f561d0..d4f44e4 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
+++ b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
@@ -1,4 +1,4 @@
-//===-- SyntaxHighlighting.cpp ----------------------------------*- C++ -*-===//
+//===- SyntaxHighlighting.cpp ---------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,6 +9,8 @@
#include "SyntaxHighlighting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
using namespace dwarf;
using namespace syntax;
@@ -18,16 +20,16 @@ static cl::opt<cl::boolOrDefault>
cl::desc("use colored syntax highlighting (default=autodetect)"),
cl::init(cl::BOU_UNSET));
-WithColor::WithColor(llvm::raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
+WithColor::WithColor(raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
// Detect color from terminal type unless the user passed the --color option.
if (UseColor == cl::BOU_UNSET ? OS.has_colors() : UseColor == cl::BOU_TRUE) {
switch (Type) {
- case Address: OS.changeColor(llvm::raw_ostream::YELLOW); break;
- case String: OS.changeColor(llvm::raw_ostream::GREEN); break;
- case Tag: OS.changeColor(llvm::raw_ostream::BLUE); break;
- case Attribute: OS.changeColor(llvm::raw_ostream::CYAN); break;
- case Enumerator: OS.changeColor(llvm::raw_ostream::MAGENTA); break;
- case Macro: OS.changeColor(llvm::raw_ostream::RED); break;
+ case Address: OS.changeColor(raw_ostream::YELLOW); break;
+ case String: OS.changeColor(raw_ostream::GREEN); break;
+ case Tag: OS.changeColor(raw_ostream::BLUE); break;
+ case Attribute: OS.changeColor(raw_ostream::CYAN); break;
+ case Enumerator: OS.changeColor(raw_ostream::MAGENTA); break;
+ case Macro: OS.changeColor(raw_ostream::RED); break;
}
}
}
diff --git a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h
index 16e6835..277de97 100644
--- a/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h
+++ b/contrib/llvm/lib/DebugInfo/DWARF/SyntaxHighlighting.h
@@ -1,4 +1,4 @@
-//===-- SyntaxHighlighting.h ------------------------------------*- C++ -*-===//
+//===- SyntaxHighlighting.h -------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,9 +10,10 @@
#ifndef LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
#define LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
-#include "llvm/Support/raw_ostream.h"
-
namespace llvm {
+
+class raw_ostream;
+
namespace dwarf {
namespace syntax {
@@ -22,18 +23,20 @@ enum HighlightColor { Address, String, Tag, Attribute, Enumerator, Macro };
/// An RAII object that temporarily switches an output stream to a
/// specific color.
class WithColor {
- llvm::raw_ostream &OS;
+ raw_ostream &OS;
public:
/// To be used like this: WithColor(OS, syntax::String) << "text";
- WithColor(llvm::raw_ostream &OS, enum HighlightColor Type);
+ WithColor(raw_ostream &OS, enum HighlightColor Type);
~WithColor();
- llvm::raw_ostream& get() { return OS; }
- operator llvm::raw_ostream& () { return OS; }
+ raw_ostream& get() { return OS; }
+ operator raw_ostream& () { return OS; }
};
-}
-}
-}
-#endif
+} // end namespace syntax
+} // end namespace dwarf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
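WithColor is an RAII guard: the constructor switches the stream's color and the destructor restores it. A short usage sketch, assuming it is compiled inside lib/DebugInfo/DWARF where this private header is visible:

#include "SyntaxHighlighting.h"
#include "llvm/Support/raw_ostream.h"

// Color applies for the lifetime of the temporary; get() exposes the
// underlying stream so the usual << operators can be chained.
static void demo(llvm::raw_ostream &OS) {
  using namespace llvm::dwarf::syntax;
  WithColor(OS, String).get() << "\"a string literal\"";
  OS << " back to the default color\n";
}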
diff --git a/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp b/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
index 5b1b5d8..0f4f785 100644
--- a/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp
@@ -1,3 +1,4 @@
+//===- MSFBuilder.cpp -----------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,22 +7,30 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/MSF/MSFError.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace llvm::msf;
using namespace llvm::support;
-namespace {
-const uint32_t kSuperBlockBlock = 0;
-const uint32_t kFreePageMap0Block = 1;
-const uint32_t kFreePageMap1Block = 2;
-const uint32_t kNumReservedPages = 3;
+static const uint32_t kSuperBlockBlock = 0;
+static const uint32_t kFreePageMap0Block = 1;
+static const uint32_t kFreePageMap1Block = 2;
+static const uint32_t kNumReservedPages = 3;
-const uint32_t kDefaultFreePageMap = kFreePageMap0Block;
-const uint32_t kDefaultBlockMapAddr = kNumReservedPages;
-}
+static const uint32_t kDefaultFreePageMap = kFreePageMap0Block;
+static const uint32_t kDefaultBlockMapAddr = kNumReservedPages;
MSFBuilder::MSFBuilder(uint32_t BlockSize, uint32_t MinBlockCount, bool CanGrow,
BumpPtrAllocator &Allocator)
@@ -263,7 +272,7 @@ Expected<MSFLayout> MSFBuilder::build() {
// The stream sizes should be re-allocated as a stable pointer and the stream
// map should have each of its entries allocated as a separate stable pointer.
- if (StreamData.size() > 0) {
+ if (!StreamData.empty()) {
ulittle32_t *Sizes = Allocator.Allocate<ulittle32_t>(StreamData.size());
L.StreamSizes = ArrayRef<ulittle32_t>(Sizes, StreamData.size());
L.StreamMap.resize(StreamData.size());
diff --git a/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp b/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp
index fdab788..1facf5e 100644
--- a/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp
+++ b/contrib/llvm/lib/DebugInfo/MSF/MSFCommon.cpp
@@ -1,4 +1,4 @@
-//===- MSFCommon.cpp - Common types and functions for MSF files -*- C++ -*-===//
+//===- MSFCommon.cpp - Common types and functions for MSF files -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,6 +9,10 @@
#include "llvm/DebugInfo/MSF/MSFCommon.h"
#include "llvm/DebugInfo/MSF/MSFError.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <cstring>
using namespace llvm;
using namespace llvm::msf;
diff --git a/contrib/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp b/contrib/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
index e52c88a..e45f4ae 100644
--- a/contrib/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/MSF/MappedBlockStream.cpp
@@ -8,23 +8,33 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-
-#include "llvm/DebugInfo/MSF/IMSFFile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/DebugInfo/MSF/MSFCommon.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
#include "llvm/DebugInfo/MSF/MSFStreamLayout.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace llvm::msf;
namespace {
+
template <typename Base> class MappedBlockStreamImpl : public Base {
public:
template <typename... Args>
MappedBlockStreamImpl(Args &&... Params)
: Base(std::forward<Args>(Params)...) {}
};
-}
+
+} // end anonymous namespace
static void initializeFpmStreamLayout(const MSFLayout &Layout,
MSFStreamLayout &FpmLayout) {
@@ -39,62 +49,62 @@ static void initializeFpmStreamLayout(const MSFLayout &Layout,
FpmLayout.Length = msf::getFullFpmByteSize(Layout);
}
-typedef std::pair<uint32_t, uint32_t> Interval;
+using Interval = std::pair<uint32_t, uint32_t>;
+
static Interval intersect(const Interval &I1, const Interval &I2) {
return std::make_pair(std::max(I1.first, I2.first),
std::min(I1.second, I2.second));
}
-MappedBlockStream::MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks,
+MappedBlockStream::MappedBlockStream(uint32_t BlockSize,
const MSFStreamLayout &Layout,
- const ReadableStream &MsfData)
- : BlockSize(BlockSize), NumBlocks(NumBlocks), StreamLayout(Layout),
- MsfData(MsfData) {}
-
-std::unique_ptr<MappedBlockStream>
-MappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks,
- const MSFStreamLayout &Layout,
- const ReadableStream &MsfData) {
+ BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator)
+ : BlockSize(BlockSize), StreamLayout(Layout), MsfData(MsfData),
+ Allocator(Allocator) {}
+
+std::unique_ptr<MappedBlockStream> MappedBlockStream::createStream(
+ uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>(
- BlockSize, NumBlocks, Layout, MsfData);
+ BlockSize, Layout, MsfData, Allocator);
}
-std::unique_ptr<MappedBlockStream>
-MappedBlockStream::createIndexedStream(const MSFLayout &Layout,
- const ReadableStream &MsfData,
- uint32_t StreamIndex) {
+std::unique_ptr<MappedBlockStream> MappedBlockStream::createIndexedStream(
+ const MSFLayout &Layout, BinaryStreamRef MsfData, uint32_t StreamIndex,
+ BumpPtrAllocator &Allocator) {
assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
MSFStreamLayout SL;
SL.Blocks = Layout.StreamMap[StreamIndex];
SL.Length = Layout.StreamSizes[StreamIndex];
return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>(
- Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
+ Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<MappedBlockStream>
MappedBlockStream::createDirectoryStream(const MSFLayout &Layout,
- const ReadableStream &MsfData) {
+ BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
SL.Blocks = Layout.DirectoryBlocks;
SL.Length = Layout.SB->NumDirectoryBytes;
- return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<MappedBlockStream>
MappedBlockStream::createFpmStream(const MSFLayout &Layout,
- const ReadableStream &MsfData) {
+ BinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
initializeFpmStreamLayout(Layout, SL);
- return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
- ArrayRef<uint8_t> &Buffer) const {
+ ArrayRef<uint8_t> &Buffer) {
// Make sure we aren't trying to read beyond the end of the stream.
- if (Size > StreamLayout.Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- if (Offset > StreamLayout.Length - Size)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, Size))
+ return EC;
if (tryReadContiguously(Offset, Size, Buffer))
return Error::success();
@@ -153,7 +163,7 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
// into it, and return an ArrayRef to that. Do not touch existing pool
// allocations, as existing clients may be holding a pointer which must
// not be invalidated.
- uint8_t *WriteBuffer = static_cast<uint8_t *>(Pool.Allocate(Size, 8));
+ uint8_t *WriteBuffer = static_cast<uint8_t *>(Allocator.Allocate(Size, 8));
if (auto EC = readBytes(Offset, MutableArrayRef<uint8_t>(WriteBuffer, Size)))
return EC;
@@ -168,15 +178,16 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
return Error::success();
}
-Error MappedBlockStream::readLongestContiguousChunk(
- uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset,
+ ArrayRef<uint8_t> &Buffer) {
// Make sure we aren't trying to read beyond the end of the stream.
- if (Offset >= StreamLayout.Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, 1))
+ return EC;
+
uint32_t First = Offset / BlockSize;
uint32_t Last = First;
- while (Last < NumBlocks - 1) {
+ while (Last < getNumBlocks() - 1) {
if (StreamLayout.Blocks[Last] != StreamLayout.Blocks[Last + 1] - 1)
break;
++Last;
@@ -197,10 +208,10 @@ Error MappedBlockStream::readLongestContiguousChunk(
return Error::success();
}
-uint32_t MappedBlockStream::getLength() const { return StreamLayout.Length; }
+uint32_t MappedBlockStream::getLength() { return StreamLayout.Length; }
bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
- ArrayRef<uint8_t> &Buffer) const {
+ ArrayRef<uint8_t> &Buffer) {
if (Size == 0) {
Buffer = ArrayRef<uint8_t>();
return true;
@@ -214,7 +225,7 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
uint32_t OffsetInBlock = Offset % BlockSize;
uint32_t BytesFromFirstBlock = std::min(Size, BlockSize - OffsetInBlock);
uint32_t NumAdditionalBlocks =
- llvm::alignTo(Size - BytesFromFirstBlock, BlockSize) / BlockSize;
+ alignTo(Size - BytesFromFirstBlock, BlockSize) / BlockSize;
uint32_t RequiredContiguousBlocks = NumAdditionalBlocks + 1;
uint32_t E = StreamLayout.Blocks[BlockNum];
@@ -241,15 +252,13 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
}
Error MappedBlockStream::readBytes(uint32_t Offset,
- MutableArrayRef<uint8_t> Buffer) const {
+ MutableArrayRef<uint8_t> Buffer) {
uint32_t BlockNum = Offset / BlockSize;
uint32_t OffsetInBlock = Offset % BlockSize;
// Make sure we aren't trying to read beyond the end of the stream.
- if (Buffer.size() > StreamLayout.Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- if (Offset > StreamLayout.Length - Buffer.size())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, Buffer.size()))
+ return EC;
uint32_t BytesLeft = Buffer.size();
uint32_t BytesWritten = 0;
@@ -275,10 +284,6 @@ Error MappedBlockStream::readBytes(uint32_t Offset,
return Error::success();
}
-uint32_t MappedBlockStream::getNumBytesCopied() const {
- return static_cast<uint32_t>(Pool.getBytesAllocated());
-}
-
void MappedBlockStream::invalidateCache() { CacheMap.shrink_and_clear(); }
void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
@@ -318,69 +323,70 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
}
WritableMappedBlockStream::WritableMappedBlockStream(
- uint32_t BlockSize, uint32_t NumBlocks, const MSFStreamLayout &Layout,
- const WritableStream &MsfData)
- : ReadInterface(BlockSize, NumBlocks, Layout, MsfData),
+ uint32_t BlockSize, const MSFStreamLayout &Layout,
+ WritableBinaryStreamRef MsfData, BumpPtrAllocator &Allocator)
+ : ReadInterface(BlockSize, Layout, MsfData, Allocator),
WriteInterface(MsfData) {}
std::unique_ptr<WritableMappedBlockStream>
-WritableMappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks,
+WritableMappedBlockStream::createStream(uint32_t BlockSize,
const MSFStreamLayout &Layout,
- const WritableStream &MsfData) {
+ WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
return llvm::make_unique<MappedBlockStreamImpl<WritableMappedBlockStream>>(
- BlockSize, NumBlocks, Layout, MsfData);
+ BlockSize, Layout, MsfData, Allocator);
}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout,
- const WritableStream &MsfData,
- uint32_t StreamIndex) {
+ WritableBinaryStreamRef MsfData,
+ uint32_t StreamIndex,
+ BumpPtrAllocator &Allocator) {
assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
MSFStreamLayout SL;
SL.Blocks = Layout.StreamMap[StreamIndex];
SL.Length = Layout.StreamSizes[StreamIndex];
- return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createDirectoryStream(
- const MSFLayout &Layout, const WritableStream &MsfData) {
+ const MSFLayout &Layout, WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
SL.Blocks = Layout.DirectoryBlocks;
SL.Length = Layout.SB->NumDirectoryBytes;
- return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
std::unique_ptr<WritableMappedBlockStream>
WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout,
- const WritableStream &MsfData) {
+ WritableBinaryStreamRef MsfData,
+ BumpPtrAllocator &Allocator) {
MSFStreamLayout SL;
initializeFpmStreamLayout(Layout, SL);
- return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
+ return createStream(Layout.SB->BlockSize, SL, MsfData, Allocator);
}
Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
- ArrayRef<uint8_t> &Buffer) const {
+ ArrayRef<uint8_t> &Buffer) {
return ReadInterface.readBytes(Offset, Size, Buffer);
}
Error WritableMappedBlockStream::readLongestContiguousChunk(
- uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+ uint32_t Offset, ArrayRef<uint8_t> &Buffer) {
return ReadInterface.readLongestContiguousChunk(Offset, Buffer);
}
-uint32_t WritableMappedBlockStream::getLength() const {
+uint32_t WritableMappedBlockStream::getLength() {
return ReadInterface.getLength();
}
Error WritableMappedBlockStream::writeBytes(uint32_t Offset,
- ArrayRef<uint8_t> Buffer) const {
+ ArrayRef<uint8_t> Buffer) {
// Make sure we aren't trying to write beyond the end of the stream.
- if (Buffer.size() > getStreamLength())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
-
- if (Offset > getStreamLayout().Length - Buffer.size())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
+ if (auto EC = checkOffset(Offset, Buffer.size()))
+ return EC;
uint32_t BlockNum = Offset / getBlockSize();
uint32_t OffsetInBlock = Offset % getBlockSize();
@@ -410,6 +416,4 @@ Error WritableMappedBlockStream::writeBytes(uint32_t Offset,
return Error::success();
}
-Error WritableMappedBlockStream::commit() const {
- return WriteInterface.commit();
-}
+Error WritableMappedBlockStream::commit() { return WriteInterface.commit(); }
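The readBytes/writeBytes changes above fold the two explicit bounds comparisons into a shared checkOffset helper declared in the header (not shown in this diff). A sketch of the overflow-safe check the deleted inline code performed, which the helper plausibly reproduces:

#include <cstdint>

// Overflow-safe bounds check, written the way the removed inline checks
// were: never compute Offset + Size, which could wrap in 32 bits.
static bool offsetIsValid(uint32_t Offset, uint32_t Size, uint32_t Length) {
  if (Size > Length)
    return false; // The request is larger than the whole stream.
  if (Offset > Length - Size)
    return false; // The request runs past the end of the stream.
  return true;
}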
diff --git a/contrib/llvm/lib/DebugInfo/MSF/StreamReader.cpp b/contrib/llvm/lib/DebugInfo/MSF/StreamReader.cpp
deleted file mode 100644
index b85fd14..0000000
--- a/contrib/llvm/lib/DebugInfo/MSF/StreamReader.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===- StreamReader.cpp - Reads bytes and objects from a stream -----------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-
-using namespace llvm;
-using namespace llvm::msf;
-
-StreamReader::StreamReader(ReadableStreamRef S) : Stream(S), Offset(0) {}
-
-Error StreamReader::readLongestContiguousChunk(ArrayRef<uint8_t> &Buffer) {
- if (auto EC = Stream.readLongestContiguousChunk(Offset, Buffer))
- return EC;
- Offset += Buffer.size();
- return Error::success();
-}
-
-Error StreamReader::readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size) {
- if (auto EC = Stream.readBytes(Offset, Size, Buffer))
- return EC;
- Offset += Size;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint8_t &Dest) {
- const uint8_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint16_t &Dest) {
- const support::ulittle16_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint32_t &Dest) {
- const support::ulittle32_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(uint64_t &Dest) {
- const support::ulittle64_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int8_t &Dest) {
- const int8_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int16_t &Dest) {
- const support::little16_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int32_t &Dest) {
- const support::little32_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readInteger(int64_t &Dest) {
- const support::little64_t *P;
- if (auto EC = readObject(P))
- return EC;
- Dest = *P;
- return Error::success();
-}
-
-Error StreamReader::readZeroString(StringRef &Dest) {
- uint32_t Length = 0;
- // First compute the length of the string by reading 1 byte at a time.
- uint32_t OriginalOffset = getOffset();
- const char *C;
- do {
- if (auto EC = readObject(C))
- return EC;
- if (*C != '\0')
- ++Length;
- } while (*C != '\0');
- // Now go back and request a reference for that many bytes.
- uint32_t NewOffset = getOffset();
- setOffset(OriginalOffset);
-
- ArrayRef<uint8_t> Data;
- if (auto EC = readBytes(Data, Length))
- return EC;
- Dest = StringRef(reinterpret_cast<const char *>(Data.begin()), Data.size());
-
- // Now set the offset back to where it was after we calculated the length.
- setOffset(NewOffset);
- return Error::success();
-}
-
-Error StreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
- ArrayRef<uint8_t> Bytes;
- if (auto EC = readBytes(Bytes, Length))
- return EC;
- Dest = StringRef(reinterpret_cast<const char *>(Bytes.begin()), Bytes.size());
- return Error::success();
-}
-
-Error StreamReader::readStreamRef(ReadableStreamRef &Ref) {
- return readStreamRef(Ref, bytesRemaining());
-}
-
-Error StreamReader::readStreamRef(ReadableStreamRef &Ref, uint32_t Length) {
- if (bytesRemaining() < Length)
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- Ref = Stream.slice(Offset, Length);
- Offset += Length;
- return Error::success();
-}
-
-Error StreamReader::skip(uint32_t Amount) {
- if (Amount > bytesRemaining())
- return make_error<MSFError>(msf_error_code::insufficient_buffer);
- Offset += Amount;
- return Error::success();
-}
-
-uint8_t StreamReader::peek() const {
- ArrayRef<uint8_t> Buffer;
- auto EC = Stream.readBytes(Offset, 1, Buffer);
- assert(!EC && "Cannot peek an empty buffer!");
- llvm::consumeError(std::move(EC));
- return Buffer[0];
-}
diff --git a/contrib/llvm/lib/DebugInfo/MSF/StreamWriter.cpp b/contrib/llvm/lib/DebugInfo/MSF/StreamWriter.cpp
deleted file mode 100644
index cdae7c5..0000000
--- a/contrib/llvm/lib/DebugInfo/MSF/StreamWriter.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-//===- StreamWrite.cpp - Writes bytes and objects to a stream -------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-
-using namespace llvm;
-using namespace llvm::msf;
-
-StreamWriter::StreamWriter(WritableStreamRef S) : Stream(S), Offset(0) {}
-
-Error StreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
- if (auto EC = Stream.writeBytes(Offset, Buffer))
- return EC;
- Offset += Buffer.size();
- return Error::success();
-}
-
-Error StreamWriter::writeInteger(uint8_t Int) { return writeObject(Int); }
-
-Error StreamWriter::writeInteger(uint16_t Int) {
- return writeObject(support::ulittle16_t(Int));
-}
-
-Error StreamWriter::writeInteger(uint32_t Int) {
- return writeObject(support::ulittle32_t(Int));
-}
-
-Error StreamWriter::writeInteger(uint64_t Int) {
- return writeObject(support::ulittle64_t(Int));
-}
-
-Error StreamWriter::writeInteger(int8_t Int) { return writeObject(Int); }
-
-Error StreamWriter::writeInteger(int16_t Int) {
- return writeObject(support::little16_t(Int));
-}
-
-Error StreamWriter::writeInteger(int32_t Int) {
- return writeObject(support::little32_t(Int));
-}
-
-Error StreamWriter::writeInteger(int64_t Int) {
- return writeObject(support::little64_t(Int));
-}
-
-Error StreamWriter::writeZeroString(StringRef Str) {
- if (auto EC = writeFixedString(Str))
- return EC;
- if (auto EC = writeObject('\0'))
- return EC;
-
- return Error::success();
-}
-
-Error StreamWriter::writeFixedString(StringRef Str) {
- ArrayRef<uint8_t> Bytes(Str.bytes_begin(), Str.bytes_end());
- if (auto EC = Stream.writeBytes(Offset, Bytes))
- return EC;
-
- Offset += Str.size();
- return Error::success();
-}
-
-Error StreamWriter::writeStreamRef(ReadableStreamRef Ref) {
- if (auto EC = writeStreamRef(Ref, Ref.getLength()))
- return EC;
- // Don't increment Offset here, it is done by the overloaded call to
- // writeStreamRef.
- return Error::success();
-}
-
-Error StreamWriter::writeStreamRef(ReadableStreamRef Ref, uint32_t Length) {
- Ref = Ref.slice(0, Length);
-
- StreamReader SrcReader(Ref);
- // This is a bit tricky. If we just call readBytes, we are requiring that it
- // return us the entire stream as a contiguous buffer. For large streams this
- // will allocate a huge amount of space from the pool. Instead, iterate over
- // each contiguous chunk until we've consumed the entire stream.
- while (SrcReader.bytesRemaining() > 0) {
- ArrayRef<uint8_t> Chunk;
- if (auto EC = SrcReader.readLongestContiguousChunk(Chunk))
- return EC;
- if (auto EC = writeBytes(Chunk))
- return EC;
- }
- return Error::success();
-}
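StreamReader and StreamWriter disappear here because equivalent readers now live in llvm/Support; the new DbiModuleDescriptor.cpp later in this diff already parses through BinaryStreamReader. A hedged sketch of the read side under the new API (readObject and readCString appear in this diff; the templated readInteger is assumed from BinaryStreamReader.h):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
#include <cstdint>

using namespace llvm;

// Reads a little-endian u32 followed by a NUL-terminated name, covering
// what the deleted msf::StreamReader::readInteger and readZeroString did.
static Error readHeader(BinaryStreamRef Stream, uint32_t &Length,
                        StringRef &Name) {
  BinaryStreamReader Reader(Stream);
  if (auto EC = Reader.readInteger(Length))
    return EC;
  if (auto EC = Reader.readCString(Name))
    return EC;
  return Error::success();
}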
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
index cae817c..f62c499 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumDebugStreams.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-#include "llvm/DebugInfo/PDB/DIA/DIADataStream.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumDebugStreams.h"
+#include "llvm/DebugInfo/PDB/DIA/DIADataStream.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
using namespace llvm;
using namespace llvm::pdb;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
index 4741d9c..796ce21 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumLineNumbers.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumLineNumbers.h"
#include "llvm/DebugInfo/PDB/DIA/DIALineNumber.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
using namespace llvm;
using namespace llvm::pdb;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
index ccf8c4e..b9311d0 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSourceFiles.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSourceFiles.h"
#include "llvm/DebugInfo/PDB/DIA/DIASourceFile.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
using namespace llvm;
using namespace llvm::pdb;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
index 3c211b5..2666385 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIAEnumSymbols.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
using namespace llvm;
using namespace llvm::pdb;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
index bba5b0f..4c59d2f 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
@@ -10,9 +10,14 @@
#include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
#include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/raw_ostream.h"
@@ -120,16 +125,16 @@ PrivateGetDIAValue(IDiaSymbol *Symbol,
return Result8;
}
-PDB_UniqueId
+codeview::GUID
PrivateGetDIAValue(IDiaSymbol *Symbol,
HRESULT (__stdcall IDiaSymbol::*Method)(GUID *)) {
GUID Result;
if (S_OK != (Symbol->*Method)(&Result))
- return PDB_UniqueId();
+ return codeview::GUID();
- static_assert(sizeof(PDB_UniqueId) == sizeof(GUID),
- "PDB_UniqueId is the wrong size!");
- PDB_UniqueId IdResult;
+ static_assert(sizeof(codeview::GUID) == sizeof(GUID),
+ "GUID is the wrong size!");
+ codeview::GUID IdResult;
::memcpy(&IdResult, &Result, sizeof(GUID));
return IdResult;
}
@@ -178,9 +183,10 @@ void DumpDIAValue(llvm::raw_ostream &OS, int Indent, StringRef Name,
}
namespace llvm {
-raw_ostream &operator<<(raw_ostream &OS, const GUID &Guid) {
- const PDB_UniqueId *Id = reinterpret_cast<const PDB_UniqueId *>(&Guid);
- OS << *Id;
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GUID &G) {
+ StringRef GuidBytes(reinterpret_cast<const char *>(&G), sizeof(G));
+ codeview::detail::GuidAdapter A(GuidBytes);
+ A.format(OS, "");
return OS;
}
}
@@ -366,8 +372,11 @@ DIARawSymbol::findChildren(PDB_SymType Type) const {
enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
CComPtr<IDiaEnumSymbols> DiaEnumerator;
- if (S_OK != Symbol->findChildrenEx(EnumVal, nullptr, nsNone, &DiaEnumerator))
- return nullptr;
+ if (S_OK !=
+ Symbol->findChildrenEx(EnumVal, nullptr, nsNone, &DiaEnumerator)) {
+ if (S_OK != Symbol->findChildren(EnumVal, nullptr, nsNone, &DiaEnumerator))
+ return nullptr;
+ }
return llvm::make_unique<DIAEnumSymbols>(Session, DiaEnumerator);
}
@@ -715,6 +724,18 @@ uint32_t DIARawSymbol::getVirtualTableShapeId() const {
return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_virtualTableShapeId);
}
+std::unique_ptr<PDBSymbolTypeBuiltin>
+DIARawSymbol::getVirtualBaseTableType() const {
+ CComPtr<IDiaSymbol> TableType;
+ if (FAILED(Symbol->get_virtualBaseTableType(&TableType)) || !TableType)
+ return nullptr;
+
+ auto RawVT = llvm::make_unique<DIARawSymbol>(Session, TableType);
+ auto Pointer =
+ llvm::make_unique<PDBSymbolTypePointer>(Session, std::move(RawVT));
+ return unique_dyn_cast<PDBSymbolTypeBuiltin>(Pointer->getPointeeType());
+}
+
PDB_DataKind DIARawSymbol::getDataKind() const {
return PrivateGetDIAValue<DWORD, PDB_DataKind>(Symbol,
&IDiaSymbol::get_dataKind);
@@ -725,7 +746,7 @@ PDB_SymType DIARawSymbol::getSymTag() const {
&IDiaSymbol::get_symTag);
}
-PDB_UniqueId DIARawSymbol::getGuid() const {
+codeview::GUID DIARawSymbol::getGuid() const {
return PrivateGetDIAValue(Symbol, &IDiaSymbol::get_guid);
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
index 6ecf335..ef9390c 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -21,12 +21,22 @@
#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::pdb;
-static Error ErrorFromHResult(HRESULT Result, StringRef Context) {
+template <typename... Ts>
+static Error ErrorFromHResult(HRESULT Result, const char *Str, Ts &&... Args) {
+ SmallString<64> MessageStorage;
+ StringRef Context;
+ if (sizeof...(Args) > 0) {
+ MessageStorage = formatv(Str, std::forward<Ts>(Args)...).str();
+ Context = MessageStorage;
+ } else
+ Context = Str;
+
switch (Result) {
case E_PDB_NOT_FOUND:
return make_error<GenericError>(generic_error_code::invalid_path, Context);
@@ -95,8 +105,9 @@ Error DIASession::createFromPdb(StringRef Path,
const wchar_t *Path16Str = reinterpret_cast<const wchar_t*>(Path16.data());
HRESULT HR;
- if (FAILED(HR = DiaDataSource->loadDataFromPdb(Path16Str)))
- return ErrorFromHResult(HR, "Calling loadDataFromPdb");
+ if (FAILED(HR = DiaDataSource->loadDataFromPdb(Path16Str))) {
+ return ErrorFromHResult(HR, "Calling loadDataFromPdb {0}", Path);
+ }
if (FAILED(HR = DiaDataSource->openSession(&DiaSession)))
return ErrorFromHResult(HR, "Calling openSession");
@@ -140,7 +151,7 @@ void DIASession::setLoadAddress(uint64_t Address) {
Session->put_loadAddress(Address);
}
-std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() const {
+std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() {
CComPtr<IDiaSymbol> GlobalScope;
if (S_OK != Session->get_globalScope(&GlobalScope))
return nullptr;
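The variadic ErrorFromHResult above builds its context message lazily: formatv runs only when extra arguments are present, so plain call sites such as "Calling openSession" pay nothing for formatting. The same message-building pattern as a standalone sketch (a plain std::string return stands in for the DIA-specific Error):

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
#include <utility>

using namespace llvm;

// Formats only when arguments were actually passed; otherwise the
// literal is used as-is, exactly as ErrorFromHResult does above.
template <typename... Ts>
static std::string contextMessage(const char *Str, Ts &&... Args) {
  SmallString<64> MessageStorage;
  StringRef Context;
  if (sizeof...(Args) > 0) {
    MessageStorage = formatv(Str, std::forward<Ts>(Args)...).str();
    Context = MessageStorage;
  } else
    Context = Str;
  return Context.str();
}

int main() {
  outs() << contextMessage("Calling openSession") << "\n";
  outs() << contextMessage("Calling loadDataFromPdb {0}", "a.pdb") << "\n";
  return 0;
}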
diff --git a/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp b/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp
index 789f3b8..4fcecb9 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/GenericError.cpp
@@ -26,6 +26,8 @@ public:
switch (static_cast<generic_error_code>(Condition)) {
case generic_error_code::unspecified:
return "An unknown error has occurred.";
+ case generic_error_code::type_server_not_found:
+ return "Type server PDB was not found.";
case generic_error_code::dia_sdk_not_present:
return "LLVM was not compiled with support for DIA. This usually means "
"that you are are not using MSVC, or your Visual Studio "
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp
new file mode 100644
index 0000000..dabcc34
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptor.cpp
@@ -0,0 +1,90 @@
+//===- DbiModuleDescriptor.cpp - PDB module information -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::pdb;
+using namespace llvm::support;
+
+DbiModuleDescriptor::DbiModuleDescriptor() = default;
+
+DbiModuleDescriptor::DbiModuleDescriptor(const DbiModuleDescriptor &Info) =
+ default;
+
+DbiModuleDescriptor::~DbiModuleDescriptor() = default;
+
+Error DbiModuleDescriptor::initialize(BinaryStreamRef Stream,
+ DbiModuleDescriptor &Info) {
+ BinaryStreamReader Reader(Stream);
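+  // A serialized descriptor is a fixed-size header followed by two
+  // null-terminated strings: the module name and the object file name.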
+ if (auto EC = Reader.readObject(Info.Layout))
+ return EC;
+
+ if (auto EC = Reader.readCString(Info.ModuleName))
+ return EC;
+
+ if (auto EC = Reader.readCString(Info.ObjFileName))
+ return EC;
+ return Error::success();
+}
+
+bool DbiModuleDescriptor::hasECInfo() const {
+ return (Layout->Flags & ModInfoFlags::HasECFlagMask) != 0;
+}
+
+uint16_t DbiModuleDescriptor::getTypeServerIndex() const {
+ return (Layout->Flags & ModInfoFlags::TypeServerIndexMask) >>
+ ModInfoFlags::TypeServerIndexShift;
+}
+
+uint16_t DbiModuleDescriptor::getModuleStreamIndex() const {
+ return Layout->ModDiStream;
+}
+
+uint32_t DbiModuleDescriptor::getSymbolDebugInfoByteSize() const {
+ return Layout->SymBytes;
+}
+
+uint32_t DbiModuleDescriptor::getC11LineInfoByteSize() const {
+ return Layout->C11Bytes;
+}
+
+uint32_t DbiModuleDescriptor::getC13LineInfoByteSize() const {
+ return Layout->C13Bytes;
+}
+
+uint32_t DbiModuleDescriptor::getNumberOfFiles() const {
+ return Layout->NumFiles;
+}
+
+uint32_t DbiModuleDescriptor::getSourceFileNameIndex() const {
+ return Layout->SrcFileNameNI;
+}
+
+uint32_t DbiModuleDescriptor::getPdbFilePathNameIndex() const {
+ return Layout->PdbFilePathNI;
+}
+
+StringRef DbiModuleDescriptor::getModuleName() const { return ModuleName; }
+
+StringRef DbiModuleDescriptor::getObjFileName() const { return ObjFileName; }
+
+uint32_t DbiModuleDescriptor::getRecordLength() const {
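+  // Each name is stored with its null terminator, and the record as a whole
+  // is padded to a 4-byte boundary.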
+ uint32_t M = ModuleName.str().size() + 1;
+ uint32_t O = ObjFileName.str().size() + 1;
+ uint32_t Size = sizeof(ModuleInfoHeader) + M + O;
+ Size = alignTo(Size, 4);
+ return Size;
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
new file mode 100644
index 0000000..897f78c
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
@@ -0,0 +1,196 @@
+//===- DbiModuleDescriptorBuilder.cpp - PDB Mod Info Creation ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryItemStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+namespace llvm {
+template <> struct BinaryItemTraits<CVSymbol> {
+ static size_t length(const CVSymbol &Item) { return Item.RecordData.size(); }
+
+ static ArrayRef<uint8_t> bytes(const CVSymbol &Item) {
+ return Item.RecordData;
+ }
+};
+}
+
+static uint32_t calculateDiSymbolStreamSize(uint32_t SymbolByteSize,
+ uint32_t C13Size) {
+ uint32_t Size = sizeof(uint32_t); // Signature
+ Size += alignTo(SymbolByteSize, 4); // Symbol Data
+ Size += 0; // TODO: Layout.C11Bytes
+ Size += C13Size; // C13 Debug Info Size
+ Size += sizeof(uint32_t); // GlobalRefs substream size (always 0)
+ Size += 0; // GlobalRefs substream bytes
+ return Size;
+}
+
+DbiModuleDescriptorBuilder::DbiModuleDescriptorBuilder(StringRef ModuleName,
+ uint32_t ModIndex,
+ msf::MSFBuilder &Msf)
+ : MSF(Msf), ModuleName(ModuleName) {
+ ::memset(&Layout, 0, sizeof(Layout));
+ Layout.Mod = ModIndex;
+}
+
+DbiModuleDescriptorBuilder::~DbiModuleDescriptorBuilder() {}
+
+uint16_t DbiModuleDescriptorBuilder::getStreamIndex() const {
+ return Layout.ModDiStream;
+}
+
+void DbiModuleDescriptorBuilder::setObjFileName(StringRef Name) {
+ ObjFileName = Name;
+}
+
+void DbiModuleDescriptorBuilder::setPdbFilePathNI(uint32_t NI) {
+ PdbFilePathNI = NI;
+}
+
+void DbiModuleDescriptorBuilder::addSymbol(CVSymbol Symbol) {
+ Symbols.push_back(Symbol);
+  // Symbols written to a PDB file are required to be 4-byte aligned. The same
+ // is not true of object files.
+ assert(Symbol.length() % alignOf(CodeViewContainer::Pdb) == 0 &&
+ "Invalid Symbol alignment!");
+ SymbolByteSize += Symbol.length();
+}
+
+void DbiModuleDescriptorBuilder::addSourceFile(StringRef Path) {
+ SourceFiles.push_back(Path);
+}
+
+uint32_t DbiModuleDescriptorBuilder::calculateC13DebugInfoSize() const {
+ uint32_t Result = 0;
+ for (const auto &Builder : C13Builders) {
+ assert(Builder && "Empty C13 Fragment Builder!");
+ Result += Builder->calculateSerializedLength();
+ }
+ return Result;
+}
+
+uint32_t DbiModuleDescriptorBuilder::calculateSerializedLength() const {
+ uint32_t L = sizeof(Layout);
+ uint32_t M = ModuleName.size() + 1;
+ uint32_t O = ObjFileName.size() + 1;
+ return alignTo(L + M + O, sizeof(uint32_t));
+}
+
+template <typename T> struct Foo {
+ explicit Foo(T &&Answer) : Answer(Answer) {}
+
+ T Answer;
+};
+
+template <typename T> Foo<T> makeFoo(T &&t) { return Foo<T>(std::move(t)); }
+
+void DbiModuleDescriptorBuilder::finalize() {
+ Layout.SC.ModuleIndex = Layout.Mod;
+ Layout.FileNameOffs = 0; // TODO: Fix this
+ Layout.Flags = 0; // TODO: Fix this
+ Layout.C11Bytes = 0;
+ Layout.C13Bytes = calculateC13DebugInfoSize();
+ (void)Layout.Mod; // Set in constructor
+ (void)Layout.ModDiStream; // Set in finalizeMsfLayout
+ Layout.NumFiles = SourceFiles.size();
+ Layout.PdbFilePathNI = PdbFilePathNI;
+ Layout.SrcFileNameNI = 0;
+
+ // This value includes both the signature field as well as the record bytes
+ // from the symbol stream.
+ Layout.SymBytes = SymbolByteSize + sizeof(uint32_t);
+}
+
+Error DbiModuleDescriptorBuilder::finalizeMsfLayout() {
+ this->Layout.ModDiStream = kInvalidStreamIndex;
+ uint32_t C13Size = calculateC13DebugInfoSize();
+ auto ExpectedSN =
+ MSF.addStream(calculateDiSymbolStreamSize(SymbolByteSize, C13Size));
+ if (!ExpectedSN)
+ return ExpectedSN.takeError();
+ Layout.ModDiStream = *ExpectedSN;
+ return Error::success();
+}
+
+Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
+ const msf::MSFLayout &MsfLayout,
+ WritableBinaryStreamRef MsfBuffer) {
+ // We write the Modi record to the `ModiWriter`, but we additionally write its
+ // symbol stream to a brand new stream.
+ if (auto EC = ModiWriter.writeObject(Layout))
+ return EC;
+ if (auto EC = ModiWriter.writeCString(ModuleName))
+ return EC;
+ if (auto EC = ModiWriter.writeCString(ObjFileName))
+ return EC;
+ if (auto EC = ModiWriter.padToAlignment(sizeof(uint32_t)))
+ return EC;
+
+ if (Layout.ModDiStream != kInvalidStreamIndex) {
+ auto NS = WritableMappedBlockStream::createIndexedStream(
+ MsfLayout, MsfBuffer, Layout.ModDiStream, MSF.getAllocator());
+ WritableBinaryStreamRef Ref(*NS);
+ BinaryStreamWriter SymbolWriter(Ref);
+ // Write the symbols.
+ if (auto EC =
+ SymbolWriter.writeInteger<uint32_t>(COFF::DEBUG_SECTION_MAGIC))
+ return EC;
+ BinaryItemStream<CVSymbol> Records(llvm::support::endianness::little);
+ Records.setItems(Symbols);
+ BinaryStreamRef RecordsRef(Records);
+ if (auto EC = SymbolWriter.writeStreamRef(RecordsRef))
+ return EC;
+ if (auto EC = SymbolWriter.padToAlignment(4))
+ return EC;
+ // TODO: Write C11 Line data
+ assert(SymbolWriter.getOffset() % alignOf(CodeViewContainer::Pdb) == 0 &&
+ "Invalid debug section alignment!");
+ for (const auto &Builder : C13Builders) {
+ assert(Builder && "Empty C13 Fragment Builder!");
+ if (auto EC = Builder->commit(SymbolWriter))
+ return EC;
+ }
+
+ // TODO: Figure out what GlobalRefs substream actually is and populate it.
+ if (auto EC = SymbolWriter.writeInteger<uint32_t>(0))
+ return EC;
+ if (SymbolWriter.bytesRemaining() > 0)
+ return make_error<RawError>(raw_error_code::stream_too_long);
+ }
+ return Error::success();
+}
+
+void DbiModuleDescriptorBuilder::addDebugSubsection(
+ std::shared_ptr<DebugSubsection> Subsection) {
+ assert(Subsection);
+ C13Builders.push_back(llvm::make_unique<DebugSubsectionRecordBuilder>(
+ std::move(Subsection), CodeViewContainer::Pdb));
+}
+
+void DbiModuleDescriptorBuilder::addDebugSubsection(
+ const DebugSubsectionRecord &SubsectionContents) {
+ C13Builders.push_back(llvm::make_unique<DebugSubsectionRecordBuilder>(
+ SubsectionContents, CodeViewContainer::Pdb));
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp
new file mode 100644
index 0000000..eea70b2
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp
@@ -0,0 +1,280 @@
+//===- DbiModuleList.cpp - PDB module information list --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+DbiModuleSourceFilesIterator::DbiModuleSourceFilesIterator(
+ const DbiModuleList &Modules, uint32_t Modi, uint16_t Filei)
+ : Modules(&Modules), Modi(Modi), Filei(Filei) {
+ setValue();
+}
+
+bool DbiModuleSourceFilesIterator::
+operator==(const DbiModuleSourceFilesIterator &R) const {
+  // Incompatible iterators are never equal.
+ if (!isCompatible(R))
+ return false;
+
+ // If they're compatible, and they're both ends, then they're equal.
+ if (isEnd() && R.isEnd())
+ return true;
+
+ // If one is an end and the other is not, they're not equal.
+ if (isEnd() != R.isEnd())
+ return false;
+
+ // Now we know:
+ // - They're compatible
+ // - They're not *both* end iterators
+ // - Their endness is the same.
+ // Thus, they're compatible iterators pointing to a valid file on the same
+ // module. All we need to check are the file indices.
+ assert(Modules == R.Modules);
+ assert(Modi == R.Modi);
+ assert(!isEnd());
+ assert(!R.isEnd());
+
+ return (Filei == R.Filei);
+}
+
+bool DbiModuleSourceFilesIterator::
+operator<(const DbiModuleSourceFilesIterator &R) const {
+ assert(isCompatible(R));
+
+ // It's not sufficient to compare the file indices, because default
+ // constructed iterators could be equal to iterators with valid indices. To
+ // account for this, early-out if they're equal.
+ if (*this == R)
+ return false;
+
+ return Filei < R.Filei;
+}
+
+std::ptrdiff_t DbiModuleSourceFilesIterator::
+operator-(const DbiModuleSourceFilesIterator &R) const {
+ assert(isCompatible(R));
+ assert(!(*this < R));
+
+ // If they're both end iterators, the distance is 0.
+ if (isEnd() && R.isEnd())
+ return 0;
+
+ assert(!R.isEnd());
+
+ // At this point, R cannot be end, but *this can, which means that *this
+ // might be a universal end iterator with none of its fields set. So in that
+  // case we have to rely on R as the authority to figure out how many files
+  // there are in order to compute the distance.
+ uint32_t Thisi = Filei;
+ if (isEnd()) {
+ uint32_t RealModi = R.Modi;
+ Thisi = R.Modules->getSourceFileCount(RealModi);
+ }
+
+ assert(Thisi >= R.Filei);
+ return Thisi - R.Filei;
+}
+
+DbiModuleSourceFilesIterator &DbiModuleSourceFilesIterator::
+operator+=(std::ptrdiff_t N) {
+ assert(!isEnd());
+
+ Filei += N;
+ assert(Filei <= Modules->getSourceFileCount(Modi));
+ setValue();
+ return *this;
+}
+
+DbiModuleSourceFilesIterator &DbiModuleSourceFilesIterator::
+operator-=(std::ptrdiff_t N) {
+ // Note that we can subtract from an end iterator, but not a universal end
+ // iterator.
+ assert(!isUniversalEnd());
+
+ assert(N <= Filei);
+
+ Filei -= N;
+ return *this;
+}
+
+void DbiModuleSourceFilesIterator::setValue() {
+ if (isEnd()) {
+ ThisValue = "";
+ return;
+ }
+
+ uint32_t Off = Modules->ModuleInitialFileIndex[Modi] + Filei;
+ auto ExpectedValue = Modules->getFileName(Off);
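+  // If the name cannot be read, swallow the error and degrade this iterator
+  // to an end iterator rather than failing mid-iteration.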
+ if (!ExpectedValue) {
+ consumeError(ExpectedValue.takeError());
+ Filei = Modules->getSourceFileCount(Modi);
+ } else
+ ThisValue = *ExpectedValue;
+}
+
+bool DbiModuleSourceFilesIterator::isEnd() const {
+ if (isUniversalEnd())
+ return true;
+
+ assert(Modules);
+ assert(Modi <= Modules->getModuleCount());
+ assert(Filei <= Modules->getSourceFileCount(Modi));
+
+ if (Modi == Modules->getModuleCount())
+ return true;
+ if (Filei == Modules->getSourceFileCount(Modi))
+ return true;
+ return false;
+}
+
+bool DbiModuleSourceFilesIterator::isUniversalEnd() const { return !Modules; }
+
+bool DbiModuleSourceFilesIterator::isCompatible(
+ const DbiModuleSourceFilesIterator &R) const {
+ // Universal iterators are compatible with any other iterator.
+ if (isUniversalEnd() || R.isUniversalEnd())
+ return true;
+
+ // At this point, neither iterator is a universal end iterator, although one
+ // or both might be non-universal end iterators. Regardless, the module index
+ // is valid, so they are compatible if and only if they refer to the same
+ // module.
+ return Modi == R.Modi;
+}
+
+Error DbiModuleList::initialize(BinaryStreamRef ModInfo,
+ BinaryStreamRef FileInfo) {
+ if (auto EC = initializeModInfo(ModInfo))
+ return EC;
+ if (auto EC = initializeFileInfo(FileInfo))
+ return EC;
+
+ return Error::success();
+}
+
+Error DbiModuleList::initializeModInfo(BinaryStreamRef ModInfo) {
+ ModInfoSubstream = ModInfo;
+
+ if (ModInfo.getLength() == 0)
+ return Error::success();
+
+ BinaryStreamReader Reader(ModInfo);
+
+ if (auto EC = Reader.readArray(Descriptors, ModInfo.getLength()))
+ return EC;
+
+ return Error::success();
+}
+
+Error DbiModuleList::initializeFileInfo(BinaryStreamRef FileInfo) {
+ FileInfoSubstream = FileInfo;
+
+ if (FileInfo.getLength() == 0)
+ return Error::success();
+
+ BinaryStreamReader FISR(FileInfo);
+ if (auto EC = FISR.readObject(FileInfoHeader))
+ return EC;
+
+ // First is an array of `NumModules` module indices. This does not seem to be
+ // used for anything meaningful, so we ignore it.
+ FixedStreamArray<support::ulittle16_t> ModuleIndices;
+ if (auto EC = FISR.readArray(ModuleIndices, FileInfoHeader->NumModules))
+ return EC;
+ if (auto EC = FISR.readArray(ModFileCountArray, FileInfoHeader->NumModules))
+ return EC;
+
+ // Compute the real number of source files. We can't trust the value in
+  // `FileInfoHeader->NumSourceFiles` because it is a uint16, and the sum of all
+  // source file counts might be larger than a uint16. So we compute the real
+ // count by summing up the individual counts.
+ uint32_t NumSourceFiles = 0;
+ for (auto Count : ModFileCountArray)
+ NumSourceFiles += Count;
+
+ // In the reference implementation, this array is where the pointer documented
+ // at the definition of ModuleInfoHeader::FileNameOffs points to. Note that
+  // although the field in ModuleInfoHeader is ignored, this array is not, as it
+ // is the authority on where each filename begins in the names buffer.
+ if (auto EC = FISR.readArray(FileNameOffsets, NumSourceFiles))
+ return EC;
+
+ if (auto EC = FISR.readStreamRef(NamesBuffer))
+ return EC;
+
+ auto DescriptorIter = Descriptors.begin();
+ uint32_t NextFileIndex = 0;
+ ModuleInitialFileIndex.resize(FileInfoHeader->NumModules);
+ ModuleDescriptorOffsets.resize(FileInfoHeader->NumModules);
+ for (size_t I = 0; I < FileInfoHeader->NumModules; ++I) {
+ assert(DescriptorIter != Descriptors.end());
+ ModuleInitialFileIndex[I] = NextFileIndex;
+ ModuleDescriptorOffsets[I] = DescriptorIter.offset();
+
+ NextFileIndex += ModFileCountArray[I];
+ ++DescriptorIter;
+ }
+
+ assert(DescriptorIter == Descriptors.end());
+ assert(NextFileIndex == NumSourceFiles);
+
+ return Error::success();
+}
+
+uint32_t DbiModuleList::getModuleCount() const {
+ return FileInfoHeader->NumModules;
+}
+
+uint32_t DbiModuleList::getSourceFileCount() const {
+ return FileNameOffsets.size();
+}
+
+uint16_t DbiModuleList::getSourceFileCount(uint32_t Modi) const {
+ return ModFileCountArray[Modi];
+}
+
+DbiModuleDescriptor DbiModuleList::getModuleDescriptor(uint32_t Modi) const {
+ assert(Modi < getModuleCount());
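+  // Seek directly to the descriptor using the offset recorded during
+  // initializeFileInfo() instead of walking the variable-length array.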
+ uint32_t Offset = ModuleDescriptorOffsets[Modi];
+ auto Iter = Descriptors.at(Offset);
+ assert(Iter != Descriptors.end());
+ return *Iter;
+}
+
+iterator_range<DbiModuleSourceFilesIterator>
+DbiModuleList::source_files(uint32_t Modi) const {
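+  // The default-constructed iterator is the universal end sentinel, so a
+  // single end value works for every module.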
+ return make_range<DbiModuleSourceFilesIterator>(
+ DbiModuleSourceFilesIterator(*this, Modi, 0),
+ DbiModuleSourceFilesIterator());
+}
+
+Expected<StringRef> DbiModuleList::getFileName(uint32_t Index) const {
+ BinaryStreamReader Names(NamesBuffer);
+ if (Index >= getSourceFileCount())
+ return make_error<RawError>(raw_error_code::index_out_of_bounds);
+
+ uint32_t FileOffset = FileNameOffsets[Index];
+ Names.setOffset(FileOffset);
+ StringRef Name;
+ if (auto EC = Names.readCString(Name))
+ return std::move(EC);
+ return Name;
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/DbiStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
index 4f4a0cf..0eeac7e 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/DbiStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -7,21 +7,20 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
+#include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
#include "llvm/DebugInfo/PDB/PDBTypes.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/ISectionContribVisitor.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
-#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
#include <algorithm>
#include <cstddef>
@@ -35,7 +34,7 @@ using namespace llvm::support;
template <typename ContribType>
static Error loadSectionContribs(FixedStreamArray<ContribType> &Output,
- StreamReader &Reader) {
+ BinaryStreamReader &Reader) {
if (Reader.bytesRemaining() % sizeof(ContribType) != 0)
return make_error<RawError>(
raw_error_code::corrupt_file,
@@ -48,13 +47,12 @@ static Error loadSectionContribs(FixedStreamArray<ContribType> &Output,
}
DbiStream::DbiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream)
- : Pdb(File), Stream(std::move(Stream)), Header(nullptr) {
-}
+ : Pdb(File), Stream(std::move(Stream)), Header(nullptr) {}
DbiStream::~DbiStream() = default;
Error DbiStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
if (Stream->getLength() < sizeof(DbiStreamHeader))
return make_error<RawError>(raw_error_code::corrupt_file,
@@ -74,14 +72,6 @@ Error DbiStream::reload() {
return make_error<RawError>(raw_error_code::feature_unsupported,
"Unsupported DBI version.");
- auto IS = Pdb.getPDBInfoStream();
- if (!IS)
- return IS.takeError();
-
- if (Header->Age != IS->getAge())
- return make_error<RawError>(raw_error_code::corrupt_file,
- "DBI Age does not match PDB Age.");
-
if (Stream->getLength() !=
sizeof(DbiStreamHeader) + Header->ModiSubstreamSize +
Header->SecContrSubstreamSize + Header->SectionMapSize +
@@ -109,26 +99,27 @@ Error DbiStream::reload() {
return make_error<RawError>(raw_error_code::corrupt_file,
"DBI type server substream not aligned.");
- if (auto EC =
- Reader.readStreamRef(ModInfoSubstream, Header->ModiSubstreamSize))
- return EC;
- if (auto EC = initializeModInfoArray())
+ if (auto EC = Reader.readSubstream(ModiSubstream, Header->ModiSubstreamSize))
return EC;
- if (auto EC = Reader.readStreamRef(SecContrSubstream,
+ if (auto EC = Reader.readSubstream(SecContrSubstream,
Header->SecContrSubstreamSize))
return EC;
- if (auto EC = Reader.readStreamRef(SecMapSubstream, Header->SectionMapSize))
+ if (auto EC = Reader.readSubstream(SecMapSubstream, Header->SectionMapSize))
return EC;
- if (auto EC = Reader.readStreamRef(FileInfoSubstream, Header->FileInfoSize))
+ if (auto EC = Reader.readSubstream(FileInfoSubstream, Header->FileInfoSize))
return EC;
if (auto EC =
- Reader.readStreamRef(TypeServerMapSubstream, Header->TypeServerSize))
+ Reader.readSubstream(TypeServerMapSubstream, Header->TypeServerSize))
return EC;
- if (auto EC = Reader.readStreamRef(ECSubstream, Header->ECSubstreamSize))
+ if (auto EC = Reader.readSubstream(ECSubstream, Header->ECSubstreamSize))
return EC;
- if (auto EC = Reader.readArray(DbgStreams, Header->OptionalDbgHdrSize /
- sizeof(ulittle16_t)))
+ if (auto EC = Reader.readArray(
+ DbgStreams, Header->OptionalDbgHdrSize / sizeof(ulittle16_t)))
+ return EC;
+
+ if (auto EC = Modules.initialize(ModiSubstream.StreamData,
+ FileInfoSubstream.StreamData))
return EC;
if (auto EC = initializeSectionContributionData())
@@ -137,8 +128,6 @@ Error DbiStream::reload() {
return EC;
if (auto EC = initializeSectionMapData())
return EC;
- if (auto EC = initializeFileInfo())
- return EC;
if (auto EC = initializeFpoRecords())
return EC;
@@ -146,9 +135,9 @@ Error DbiStream::reload() {
return make_error<RawError>(raw_error_code::corrupt_file,
"Found unexpected bytes in DBI Stream.");
- if (ECSubstream.getLength() > 0) {
- StreamReader ECReader(ECSubstream);
- if (auto EC = ECNames.load(ECReader))
+ if (!ECSubstream.empty()) {
+ BinaryStreamReader ECReader(ECSubstream.StreamData);
+ if (auto EC = ECNames.reload(ECReader))
return EC;
}
@@ -209,35 +198,42 @@ PDB_Machine DbiStream::getMachineType() const {
return static_cast<PDB_Machine>(Machine);
}
-msf::FixedStreamArray<object::coff_section> DbiStream::getSectionHeaders() {
+FixedStreamArray<object::coff_section> DbiStream::getSectionHeaders() {
return SectionHeaders;
}
-msf::FixedStreamArray<object::FpoData> DbiStream::getFpoRecords() {
+FixedStreamArray<object::FpoData> DbiStream::getFpoRecords() {
return FpoRecords;
}
-ArrayRef<ModuleInfoEx> DbiStream::modules() const { return ModuleInfos; }
-msf::FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
+const DbiModuleList &DbiStream::modules() const { return Modules; }
+
+FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
return SectionMap;
}
void DbiStream::visitSectionContributions(
ISectionContribVisitor &Visitor) const {
- if (SectionContribVersion == DbiSecContribVer60) {
+ if (!SectionContribs.empty()) {
+ assert(SectionContribVersion == DbiSecContribVer60);
for (auto &SC : SectionContribs)
Visitor.visit(SC);
- } else if (SectionContribVersion == DbiSecContribV2) {
+ } else if (!SectionContribs2.empty()) {
+ assert(SectionContribVersion == DbiSecContribV2);
for (auto &SC : SectionContribs2)
Visitor.visit(SC);
}
}
+Expected<StringRef> DbiStream::getECName(uint32_t NI) const {
+ return ECNames.getStringForID(NI);
+}
+
Error DbiStream::initializeSectionContributionData() {
- if (SecContrSubstream.getLength() == 0)
+ if (SecContrSubstream.empty())
return Error::success();
- StreamReader SCReader(SecContrSubstream);
+ BinaryStreamReader SCReader(SecContrSubstream.StreamData);
if (auto EC = SCReader.readEnum(SectionContribVersion))
return EC;
@@ -250,35 +246,20 @@ Error DbiStream::initializeSectionContributionData() {
"Unsupported DBI Section Contribution version");
}
-Error DbiStream::initializeModInfoArray() {
- if (ModInfoSubstream.getLength() == 0)
- return Error::success();
-
- // Since each ModInfo in the stream is a variable length, we have to iterate
- // them to know how many there actually are.
- StreamReader Reader(ModInfoSubstream);
-
- VarStreamArray<ModInfo> ModInfoArray;
- if (auto EC = Reader.readArray(ModInfoArray, ModInfoSubstream.getLength()))
- return EC;
- for (auto &Info : ModInfoArray) {
- ModuleInfos.emplace_back(Info);
- }
-
- return Error::success();
-}
-
// Initializes this->SectionHeaders.
Error DbiStream::initializeSectionHeadersData() {
if (DbgStreams.size() == 0)
return Error::success();
uint32_t StreamNum = getDebugStreamIndex(DbgHeaderType::SectionHdr);
+ if (StreamNum == kInvalidStreamIndex)
+ return Error::success();
+
if (StreamNum >= Pdb.getNumStreams())
return make_error<RawError>(raw_error_code::no_stream);
auto SHS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum);
+ Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator());
size_t StreamLen = SHS->getLength();
if (StreamLen % sizeof(object::coff_section))
@@ -286,7 +267,7 @@ Error DbiStream::initializeSectionHeadersData() {
"Corrupted section header stream.");
size_t NumSections = StreamLen / sizeof(object::coff_section);
- msf::StreamReader Reader(*SHS);
+ BinaryStreamReader Reader(*SHS);
if (auto EC = Reader.readArray(SectionHeaders, NumSections))
return make_error<RawError>(raw_error_code::corrupt_file,
"Could not read a bitmap.");
@@ -310,7 +291,7 @@ Error DbiStream::initializeFpoRecords() {
return make_error<RawError>(raw_error_code::no_stream);
auto FS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum);
+ Pdb.getMsfLayout(), Pdb.getMsfBuffer(), StreamNum, Pdb.getAllocator());
size_t StreamLen = FS->getLength();
if (StreamLen % sizeof(object::FpoData))
@@ -318,7 +299,7 @@ Error DbiStream::initializeFpoRecords() {
"Corrupted New FPO stream.");
size_t NumRecords = StreamLen / sizeof(object::FpoData);
- msf::StreamReader Reader(*FS);
+ BinaryStreamReader Reader(*FS);
if (auto EC = Reader.readArray(FpoRecords, NumRecords))
return make_error<RawError>(raw_error_code::corrupt_file,
"Corrupted New FPO stream.");
@@ -326,82 +307,38 @@ Error DbiStream::initializeFpoRecords() {
return Error::success();
}
-Error DbiStream::initializeSectionMapData() {
- if (SecMapSubstream.getLength() == 0)
- return Error::success();
+BinarySubstreamRef DbiStream::getSectionContributionData() const {
+ return SecContrSubstream;
+}
- StreamReader SMReader(SecMapSubstream);
- const SecMapHeader *Header;
- if (auto EC = SMReader.readObject(Header))
- return EC;
- if (auto EC = SMReader.readArray(SectionMap, Header->SecCount))
- return EC;
- return Error::success();
+BinarySubstreamRef DbiStream::getSecMapSubstreamData() const {
+ return SecMapSubstream;
}
-Error DbiStream::initializeFileInfo() {
- if (FileInfoSubstream.getLength() == 0)
- return Error::success();
+BinarySubstreamRef DbiStream::getModiSubstreamData() const {
+ return ModiSubstream;
+}
- const FileInfoSubstreamHeader *FH;
- StreamReader FISR(FileInfoSubstream);
- if (auto EC = FISR.readObject(FH))
- return EC;
+BinarySubstreamRef DbiStream::getFileInfoSubstreamData() const {
+ return FileInfoSubstream;
+}
- // The number of modules in the stream should be the same as reported by
- // the FileInfoSubstreamHeader.
- if (FH->NumModules != ModuleInfos.size())
- return make_error<RawError>(raw_error_code::corrupt_file,
- "FileInfo substream count doesn't match DBI.");
+BinarySubstreamRef DbiStream::getTypeServerMapSubstreamData() const {
+ return TypeServerMapSubstream;
+}
- FixedStreamArray<ulittle16_t> ModIndexArray;
- FixedStreamArray<ulittle16_t> ModFileCountArray;
+BinarySubstreamRef DbiStream::getECSubstreamData() const { return ECSubstream; }
- // First is an array of `NumModules` module indices. This is not used for the
- // same reason that `NumSourceFiles` is not used. It's an array of uint16's,
- // but it's possible there are more than 64k source files, which would imply
- // more than 64k modules (e.g. object files) as well. So we ignore this
- // field.
- if (auto EC = FISR.readArray(ModIndexArray, ModuleInfos.size()))
- return EC;
- if (auto EC = FISR.readArray(ModFileCountArray, ModuleInfos.size()))
- return EC;
+Error DbiStream::initializeSectionMapData() {
+ if (SecMapSubstream.empty())
+ return Error::success();
- // Compute the real number of source files.
- uint32_t NumSourceFiles = 0;
- for (auto Count : ModFileCountArray)
- NumSourceFiles += Count;
-
- // This is the array that in the reference implementation corresponds to
- // `ModInfo::FileLayout::FileNameOffs`, which is commented there as being a
- // pointer. Due to the mentioned problems of pointers causing difficulty
- // when reading from the file on 64-bit systems, we continue to ignore that
- // field in `ModInfo`, and instead build a vector of StringRefs and stores
- // them in `ModuleInfoEx`. The value written to and read from the file is
- // not used anyway, it is only there as a way to store the offsets for the
- // purposes of later accessing the names at runtime.
- if (auto EC = FISR.readArray(FileNameOffsets, NumSourceFiles))
+ BinaryStreamReader SMReader(SecMapSubstream.StreamData);
+ const SecMapHeader *Header;
+ if (auto EC = SMReader.readObject(Header))
return EC;
-
- if (auto EC = FISR.readStreamRef(NamesBuffer))
+ if (auto EC = SMReader.readArray(SectionMap, Header->SecCount))
return EC;
-
- // We go through each ModuleInfo, determine the number N of source files for
- // that module, and then get the next N offsets from the Offsets array, using
- // them to get the corresponding N names from the Names buffer and associating
- // each one with the corresponding module.
- uint32_t NextFileIndex = 0;
- for (size_t I = 0; I < ModuleInfos.size(); ++I) {
- uint32_t NumFiles = ModFileCountArray[I];
- ModuleInfos[I].SourceFiles.resize(NumFiles);
- for (size_t J = 0; J < NumFiles; ++J, ++NextFileIndex) {
- auto ThisName = getFileNameForIndex(NextFileIndex);
- if (!ThisName)
- return ThisName.takeError();
- ModuleInfos[I].SourceFiles[J] = *ThisName;
- }
- }
-
return Error::success();
}
@@ -411,16 +348,3 @@ uint32_t DbiStream::getDebugStreamIndex(DbgHeaderType Type) const {
return kInvalidStreamIndex;
return DbgStreams[T];
}
-
-Expected<StringRef> DbiStream::getFileNameForIndex(uint32_t Index) const {
- StreamReader Names(NamesBuffer);
- if (Index >= FileNameOffsets.size())
- return make_error<RawError>(raw_error_code::index_out_of_bounds);
-
- uint32_t FileOffset = FileNameOffsets[Index];
- Names.setOffset(FileOffset);
- StringRef Name;
- if (auto EC = Names.readZeroString(Name))
- return std::move(EC);
- return Name;
-}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index 1d5b8d6..25076e4 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/DbiStreamBuilder.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -7,31 +7,30 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/DbiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
#include "llvm/Object/COFF.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/Support/BinaryStreamWriter.h"
using namespace llvm;
using namespace llvm::codeview;
using namespace llvm::msf;
using namespace llvm::pdb;
-namespace {
-class ModiSubstreamBuilder {};
-}
-
DbiStreamBuilder::DbiStreamBuilder(msf::MSFBuilder &Msf)
: Msf(Msf), Allocator(Msf.getAllocator()), Age(1), BuildNumber(0),
PdbDllVersion(0), PdbDllRbld(0), Flags(0), MachineType(PDB_Machine::x86),
Header(nullptr), DbgStreams((int)DbgHeaderType::Max) {}
+DbiStreamBuilder::~DbiStreamBuilder() {}
+
void DbiStreamBuilder::setVersionHeader(PdbRaw_DbiVer V) { VerHeader = V; }
void DbiStreamBuilder::setAge(uint32_t A) { Age = A; }
@@ -46,17 +45,21 @@ void DbiStreamBuilder::setFlags(uint16_t F) { Flags = F; }
void DbiStreamBuilder::setMachineType(PDB_Machine M) { MachineType = M; }
-void DbiStreamBuilder::setSectionContribs(ArrayRef<SectionContrib> Arr) {
- SectionContribs = Arr;
-}
-
void DbiStreamBuilder::setSectionMap(ArrayRef<SecMapEntry> SecMap) {
SectionMap = SecMap;
}
+void DbiStreamBuilder::setSymbolRecordStreamIndex(uint32_t Index) {
+ SymRecordStreamIndex = Index;
+}
+
+void DbiStreamBuilder::setPublicsStreamIndex(uint32_t Index) {
+ PublicsStreamIndex = Index;
+}
+
Error DbiStreamBuilder::addDbgStream(pdb::DbgHeaderType Type,
ArrayRef<uint8_t> Data) {
- if (DbgStreams[(int)Type].StreamNumber)
+ if (DbgStreams[(int)Type].StreamNumber != kInvalidStreamIndex)
return make_error<RawError>(raw_error_code::duplicate_entry,
"The specified stream type already exists");
auto ExpectedIndex = Msf.addStream(Data.size());
@@ -68,46 +71,62 @@ Error DbiStreamBuilder::addDbgStream(pdb::DbgHeaderType Type,
return Error::success();
}
+uint32_t DbiStreamBuilder::addECName(StringRef Name) {
+ return ECNamesBuilder.insert(Name);
+}
+
uint32_t DbiStreamBuilder::calculateSerializedLength() const {
// For now we only support serializing the header.
return sizeof(DbiStreamHeader) + calculateFileInfoSubstreamSize() +
calculateModiSubstreamSize() + calculateSectionContribsStreamSize() +
- calculateSectionMapStreamSize() + calculateDbgStreamsSize();
+ calculateSectionMapStreamSize() + calculateDbgStreamsSize() +
+ ECNamesBuilder.calculateSerializedSize();
}
-Error DbiStreamBuilder::addModuleInfo(StringRef ObjFile, StringRef Module) {
- auto Entry = llvm::make_unique<ModuleInfo>();
- ModuleInfo *M = Entry.get();
- Entry->Mod = Module;
- Entry->Obj = ObjFile;
- auto Result = ModuleInfos.insert(std::make_pair(Module, std::move(Entry)));
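+// Returns a builder for the new module so callers can attach symbols and
+// source files to it directly, e.g. via addSymbol() and addSourceFile().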
+Expected<DbiModuleDescriptorBuilder &>
+DbiStreamBuilder::addModuleInfo(StringRef ModuleName) {
+ uint32_t Index = ModiList.size();
+ auto MIB =
+ llvm::make_unique<DbiModuleDescriptorBuilder>(ModuleName, Index, Msf);
+ auto M = MIB.get();
+ auto Result = ModiMap.insert(std::make_pair(ModuleName, std::move(MIB)));
+
if (!Result.second)
return make_error<RawError>(raw_error_code::duplicate_entry,
"The specified module already exists");
- ModuleInfoList.push_back(M);
- return Error::success();
+ ModiList.push_back(M);
+ return *M;
}
Error DbiStreamBuilder::addModuleSourceFile(StringRef Module, StringRef File) {
- auto ModIter = ModuleInfos.find(Module);
- if (ModIter == ModuleInfos.end())
+ auto ModIter = ModiMap.find(Module);
+ if (ModIter == ModiMap.end())
return make_error<RawError>(raw_error_code::no_entry,
"The specified module was not found");
+ return addModuleSourceFile(*ModIter->second, File);
+}
+
+Error DbiStreamBuilder::addModuleSourceFile(DbiModuleDescriptorBuilder &Module,
+ StringRef File) {
uint32_t Index = SourceFileNames.size();
SourceFileNames.insert(std::make_pair(File, Index));
- auto &ModEntry = *ModIter;
- ModEntry.second->SourceFiles.push_back(File);
+ Module.addSourceFile(File);
return Error::success();
}
+Expected<uint32_t> DbiStreamBuilder::getSourceFileNameIndex(StringRef File) {
+ auto NameIter = SourceFileNames.find(File);
+ if (NameIter == SourceFileNames.end())
+ return make_error<RawError>(raw_error_code::no_entry,
+ "The specified source file was not found");
+ return NameIter->getValue();
+}
+
uint32_t DbiStreamBuilder::calculateModiSubstreamSize() const {
uint32_t Size = 0;
- for (const auto &M : ModuleInfoList) {
- Size += sizeof(ModuleInfoHeader);
- Size += M->Mod.size() + 1;
- Size += M->Obj.size() + 1;
- }
- return alignTo(Size, sizeof(uint32_t));
+ for (const auto &M : ModiList)
+ Size += M->calculateSerializedLength();
+ return Size;
}
uint32_t DbiStreamBuilder::calculateSectionContribsStreamSize() const {
@@ -123,16 +142,21 @@ uint32_t DbiStreamBuilder::calculateSectionMapStreamSize() const {
return sizeof(SecMapHeader) + sizeof(SecMapEntry) * SectionMap.size();
}
-uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const {
- uint32_t Size = 0;
- Size += sizeof(ulittle16_t); // NumModules
- Size += sizeof(ulittle16_t); // NumSourceFiles
- Size += ModuleInfoList.size() * sizeof(ulittle16_t); // ModIndices
- Size += ModuleInfoList.size() * sizeof(ulittle16_t); // ModFileCounts
+uint32_t DbiStreamBuilder::calculateNamesOffset() const {
+ uint32_t Offset = 0;
+ Offset += sizeof(ulittle16_t); // NumModules
+ Offset += sizeof(ulittle16_t); // NumSourceFiles
+ Offset += ModiList.size() * sizeof(ulittle16_t); // ModIndices
+ Offset += ModiList.size() * sizeof(ulittle16_t); // ModFileCounts
uint32_t NumFileInfos = 0;
- for (const auto &M : ModuleInfoList)
- NumFileInfos += M->SourceFiles.size();
- Size += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets
+ for (const auto &M : ModiList)
+ NumFileInfos += M->source_files().size();
+ Offset += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets
+ return Offset;
+}
+
+uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const {
+ uint32_t Size = calculateNamesOffset();
Size += calculateNamesBufferSize();
return alignTo(Size, sizeof(uint32_t));
}
@@ -149,43 +173,19 @@ uint32_t DbiStreamBuilder::calculateDbgStreamsSize() const {
return DbgStreams.size() * sizeof(uint16_t);
}
-Error DbiStreamBuilder::generateModiSubstream() {
- uint32_t Size = calculateModiSubstreamSize();
- auto Data = Allocator.Allocate<uint8_t>(Size);
-
- ModInfoBuffer = MutableByteStream(MutableArrayRef<uint8_t>(Data, Size));
-
- StreamWriter ModiWriter(ModInfoBuffer);
- for (const auto &M : ModuleInfoList) {
- ModuleInfoHeader Layout = {};
- Layout.ModDiStream = kInvalidStreamIndex;
- Layout.NumFiles = M->SourceFiles.size();
- if (auto EC = ModiWriter.writeObject(Layout))
- return EC;
- if (auto EC = ModiWriter.writeZeroString(M->Mod))
- return EC;
- if (auto EC = ModiWriter.writeZeroString(M->Obj))
- return EC;
- }
- if (ModiWriter.bytesRemaining() > sizeof(uint32_t))
- return make_error<RawError>(raw_error_code::invalid_format,
- "Unexpected bytes in Modi Stream Data");
- return Error::success();
-}
-
Error DbiStreamBuilder::generateFileInfoSubstream() {
uint32_t Size = calculateFileInfoSubstreamSize();
- uint32_t NameSize = calculateNamesBufferSize();
auto Data = Allocator.Allocate<uint8_t>(Size);
- uint32_t NamesOffset = Size - NameSize;
+ uint32_t NamesOffset = calculateNamesOffset();
- FileInfoBuffer = MutableByteStream(MutableArrayRef<uint8_t>(Data, Size));
+ FileInfoBuffer = MutableBinaryByteStream(MutableArrayRef<uint8_t>(Data, Size),
+ llvm::support::little);
- WritableStreamRef MetadataBuffer =
- WritableStreamRef(FileInfoBuffer).keep_front(NamesOffset);
- StreamWriter MetadataWriter(MetadataBuffer);
+ WritableBinaryStreamRef MetadataBuffer =
+ WritableBinaryStreamRef(FileInfoBuffer).keep_front(NamesOffset);
+ BinaryStreamWriter MetadataWriter(MetadataBuffer);
- uint16_t ModiCount = std::min<uint32_t>(UINT16_MAX, ModuleInfos.size());
+ uint16_t ModiCount = std::min<uint32_t>(UINT16_MAX, ModiList.size());
uint16_t FileCount = std::min<uint32_t>(UINT16_MAX, SourceFileNames.size());
if (auto EC = MetadataWriter.writeInteger(ModiCount)) // NumModules
return EC;
@@ -195,8 +195,8 @@ Error DbiStreamBuilder::generateFileInfoSubstream() {
if (auto EC = MetadataWriter.writeInteger(I)) // Mod Indices
return EC;
}
- for (const auto MI : ModuleInfoList) {
- FileCount = static_cast<uint16_t>(MI->SourceFiles.size());
+ for (const auto &MI : ModiList) {
+ FileCount = static_cast<uint16_t>(MI->source_files().size());
if (auto EC = MetadataWriter.writeInteger(FileCount)) // Mod File Counts
return EC;
}
@@ -205,16 +205,16 @@ Error DbiStreamBuilder::generateFileInfoSubstream() {
// A side effect of this is that this will actually compute the various
// file name offsets, so we can then go back and write the FileNameOffsets
// array to the other substream.
- NamesBuffer = WritableStreamRef(FileInfoBuffer).drop_front(NamesOffset);
- StreamWriter NameBufferWriter(NamesBuffer);
+ NamesBuffer = WritableBinaryStreamRef(FileInfoBuffer).drop_front(NamesOffset);
+ BinaryStreamWriter NameBufferWriter(NamesBuffer);
for (auto &Name : SourceFileNames) {
Name.second = NameBufferWriter.getOffset();
- if (auto EC = NameBufferWriter.writeZeroString(Name.getKey()))
+ if (auto EC = NameBufferWriter.writeCString(Name.getKey()))
return EC;
}
- for (const auto MI : ModuleInfoList) {
- for (StringRef Name : MI->SourceFiles) {
+ for (const auto &MI : ModiList) {
+ for (StringRef Name : MI->source_files()) {
auto Result = SourceFileNames.find(Name);
if (Result == SourceFileNames.end())
return make_error<RawError>(raw_error_code::no_entry,
@@ -224,6 +224,9 @@ Error DbiStreamBuilder::generateFileInfoSubstream() {
}
}
+ if (auto EC = NameBufferWriter.padToAlignment(sizeof(uint32_t)))
+ return EC;
+
if (NameBufferWriter.bytesRemaining() > 0)
return make_error<RawError>(raw_error_code::invalid_format,
"The names buffer contained unexpected data.");
@@ -240,13 +243,14 @@ Error DbiStreamBuilder::finalize() {
if (Header)
return Error::success();
- DbiStreamHeader *H = Allocator.Allocate<DbiStreamHeader>();
+ for (auto &MI : ModiList)
+ MI->finalize();
- if (auto EC = generateModiSubstream())
- return EC;
if (auto EC = generateFileInfoSubstream())
return EC;
+ DbiStreamHeader *H = Allocator.Allocate<DbiStreamHeader>();
+ ::memset(H, 0, sizeof(DbiStreamHeader));
H->VersionHeader = *VerHeader;
H->VersionSignature = -1;
H->Age = Age;
@@ -256,15 +260,15 @@ Error DbiStreamBuilder::finalize() {
H->PdbDllVersion = PdbDllVersion;
H->MachineType = static_cast<uint16_t>(MachineType);
- H->ECSubstreamSize = 0;
+ H->ECSubstreamSize = ECNamesBuilder.calculateSerializedSize();
H->FileInfoSize = FileInfoBuffer.getLength();
- H->ModiSubstreamSize = ModInfoBuffer.getLength();
+ H->ModiSubstreamSize = calculateModiSubstreamSize();
H->OptionalDbgHdrSize = DbgStreams.size() * sizeof(uint16_t);
H->SecContrSubstreamSize = calculateSectionContribsStreamSize();
H->SectionMapSize = calculateSectionMapStreamSize();
H->TypeServerSize = 0;
- H->SymRecordStreamIndex = kInvalidStreamIndex;
- H->PublicSymbolStreamIndex = kInvalidStreamIndex;
+ H->SymRecordStreamIndex = SymRecordStreamIndex;
+ H->PublicSymbolStreamIndex = PublicsStreamIndex;
H->MFCTypeServerIndex = kInvalidStreamIndex;
H->GlobalSymbolStreamIndex = kInvalidStreamIndex;
@@ -273,6 +277,11 @@ Error DbiStreamBuilder::finalize() {
}
Error DbiStreamBuilder::finalizeMsfLayout() {
+ for (auto &MI : ModiList) {
+ if (auto EC = MI->finalizeMsfLayout())
+ return EC;
+ }
+
uint32_t Length = calculateSerializedLength();
if (auto EC = Msf.setStreamSize(StreamDBI, Length))
return EC;
@@ -298,23 +307,17 @@ static uint16_t toSecMapFlags(uint32_t Flags) {
return Ret;
}
-// A utility function to create Section Contributions
-// for a given input sections.
-std::vector<SectionContrib> DbiStreamBuilder::createSectionContribs(
- ArrayRef<object::coff_section> SecHdrs) {
- std::vector<SectionContrib> Ret;
-
- // Create a SectionContrib for each input section.
- for (auto &Sec : SecHdrs) {
- Ret.emplace_back();
- auto &Entry = Ret.back();
- memset(&Entry, 0, sizeof(Entry));
-
- Entry.Off = Sec.PointerToRawData;
- Entry.Size = Sec.SizeOfRawData;
- Entry.Characteristics = Sec.Characteristics;
- }
- return Ret;
+void DbiStreamBuilder::addSectionContrib(DbiModuleDescriptorBuilder *ModuleDbi,
+ const object::coff_section *SecHdr) {
+ SectionContrib SC;
+ memset(&SC, 0, sizeof(SC));
+ SC.ISect = (uint16_t)~0U; // This represents nil.
+ SC.Off = SecHdr->PointerToRawData;
+ SC.Size = SecHdr->SizeOfRawData;
+ SC.Characteristics = SecHdr->Characteristics;
+ // Use the module index in the module dbi stream or nil (-1).
+ SC.Imod = ModuleDbi ? ModuleDbi->getModuleIndex() : (uint16_t)~0U;
+ SectionContribs.emplace_back(SC);
}
// A utility function to create a Section Map for a given list of COFF sections.
@@ -358,24 +361,26 @@ std::vector<SecMapEntry> DbiStreamBuilder::createSectionMap(
}
Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
- const msf::WritableStream &Buffer) {
+ WritableBinaryStreamRef MsfBuffer) {
if (auto EC = finalize())
return EC;
- auto InfoS =
- WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamDBI);
+ auto DbiS = WritableMappedBlockStream::createIndexedStream(
+ Layout, MsfBuffer, StreamDBI, Allocator);
- StreamWriter Writer(*InfoS);
+ BinaryStreamWriter Writer(*DbiS);
if (auto EC = Writer.writeObject(*Header))
return EC;
- if (auto EC = Writer.writeStreamRef(ModInfoBuffer))
- return EC;
+ for (auto &M : ModiList) {
+ if (auto EC = M->commit(Writer, Layout, MsfBuffer))
+ return EC;
+ }
if (!SectionContribs.empty()) {
if (auto EC = Writer.writeEnum(DbiSecContribVer60))
return EC;
- if (auto EC = Writer.writeArray(SectionContribs))
+ if (auto EC = Writer.writeArray(makeArrayRef(SectionContribs)))
return EC;
}
@@ -391,6 +396,9 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
if (auto EC = Writer.writeStreamRef(FileInfoBuffer))
return EC;
+ if (auto EC = ECNamesBuilder.commit(Writer))
+ return EC;
+
for (auto &Stream : DbgStreams)
if (auto EC = Writer.writeInteger(Stream.StreamNumber))
return EC;
@@ -399,8 +407,8 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
if (Stream.StreamNumber == kInvalidStreamIndex)
continue;
auto WritableStream = WritableMappedBlockStream::createIndexedStream(
- Layout, Buffer, Stream.StreamNumber);
- StreamWriter DbgStreamWriter(*WritableStream);
+ Layout, MsfBuffer, Stream.StreamNumber, Allocator);
+ BinaryStreamWriter DbgStreamWriter(*WritableStream);
if (auto EC = DbgStreamWriter.writeArray(Stream.Data))
return EC;
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/EnumTables.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp
index fc9270c..b3837dc 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/EnumTables.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/EnumTables.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/EnumTables.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
using namespace llvm;
using namespace llvm::pdb;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/GSI.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/GSI.cpp
index 6ecbb5c..b219fe2 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/GSI.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/GSI.cpp
@@ -9,10 +9,10 @@
#include "GSI.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
@@ -28,9 +28,9 @@ static Error checkHashHdrVersion(const GSIHashHeader *HashHdr) {
return Error::success();
}
-Error readGSIHashBuckets(
- msf::FixedStreamArray<support::ulittle32_t> &HashBuckets,
- const GSIHashHeader *HashHdr, msf::StreamReader &Reader) {
+Error readGSIHashBuckets(FixedStreamArray<support::ulittle32_t> &HashBuckets,
+ const GSIHashHeader *HashHdr,
+ BinaryStreamReader &Reader) {
if (auto EC = checkHashHdrVersion(HashHdr))
return EC;
@@ -57,7 +57,7 @@ Error readGSIHashBuckets(
}
Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
- msf::StreamReader &Reader) {
+ BinaryStreamReader &Reader) {
if (Reader.readObject(HashHdr))
return make_error<RawError>(raw_error_code::corrupt_file,
"Stream does not contain a GSIHashHeader.");
@@ -70,9 +70,9 @@ Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
return Error::success();
}
-Error readGSIHashRecords(msf::FixedStreamArray<PSHashRecord> &HashRecords,
+Error readGSIHashRecords(FixedStreamArray<PSHashRecord> &HashRecords,
const GSIHashHeader *HashHdr,
- msf::StreamReader &Reader) {
+ BinaryStreamReader &Reader) {
if (auto EC = checkHashHdrVersion(HashHdr))
return EC;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/GSI.h b/contrib/llvm/lib/DebugInfo/PDB/Native/GSI.h
index 82cebd9..9e63bc8 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/GSI.h
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/GSI.h
@@ -25,17 +25,15 @@
#ifndef LLVM_LIB_DEBUGINFO_PDB_RAW_GSI_H
#define LLVM_LIB_DEBUGINFO_PDB_RAW_GSI_H
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
namespace llvm {
-namespace msf {
-class StreamReader;
-}
+class BinaryStreamReader;
namespace pdb {
@@ -56,14 +54,14 @@ struct GSIHashHeader {
support::ulittle32_t NumBuckets;
};
-Error readGSIHashBuckets(
- msf::FixedStreamArray<support::ulittle32_t> &HashBuckets,
- const GSIHashHeader *HashHdr, msf::StreamReader &Reader);
+Error readGSIHashBuckets(FixedStreamArray<support::ulittle32_t> &HashBuckets,
+ const GSIHashHeader *HashHdr,
+ BinaryStreamReader &Reader);
Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
- msf::StreamReader &Reader);
-Error readGSIHashRecords(msf::FixedStreamArray<PSHashRecord> &HashRecords,
+ BinaryStreamReader &Reader);
+Error readGSIHashRecords(FixedStreamArray<PSHashRecord> &HashRecords,
const GSIHashHeader *HashHdr,
- msf::StreamReader &Reader);
+ BinaryStreamReader &Reader);
}
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/GlobalsStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp
index 31afc92..a2ee0f0 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/GlobalsStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/GlobalsStream.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
#include "GSI.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/GlobalsStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Error.h"
#include <algorithm>
@@ -23,7 +23,7 @@ GlobalsStream::GlobalsStream(std::unique_ptr<MappedBlockStream> Stream)
GlobalsStream::~GlobalsStream() = default;
Error GlobalsStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
const GSIHashHeader *HashHdr;
if (auto EC = readGSIHashHeader(HashHdr, Reader))
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/Hash.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/Hash.cpp
index b9f685e..61188ec 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/Hash.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/Hash.cpp
@@ -7,11 +7,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/Hash.h"
-
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/JamCRC.h"
+#include <cstdint>
using namespace llvm;
using namespace llvm::support;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp
new file mode 100644
index 0000000..439217f
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/HashTable.cpp
@@ -0,0 +1,308 @@
+//===- HashTable.cpp - PDB Hash Table -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/HashTable.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+HashTable::HashTable() : HashTable(8) {}
+
+HashTable::HashTable(uint32_t Capacity) { Buckets.resize(Capacity); }
+
+Error HashTable::load(BinaryStreamReader &Stream) {
+ const Header *H;
+ if (auto EC = Stream.readObject(H))
+ return EC;
+ if (H->Capacity == 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid Hash Table Capacity");
+ if (H->Size > maxLoad(H->Capacity))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid Hash Table Size");
+
+ Buckets.resize(H->Capacity);
+
+ if (auto EC = readSparseBitVector(Stream, Present))
+ return EC;
+ if (Present.count() != H->Size)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Present bit vector does not match size!");
+
+ if (auto EC = readSparseBitVector(Stream, Deleted))
+ return EC;
+ if (Present.intersects(Deleted))
+ return make_error<RawError>(raw_error_code::corrupt_file,
+                                "Present bit vector intersects deleted!");
+
+ for (uint32_t P : Present) {
+ if (auto EC = Stream.readInteger(Buckets[P].first))
+ return EC;
+ if (auto EC = Stream.readInteger(Buckets[P].second))
+ return EC;
+ }
+
+ return Error::success();
+}
+
+uint32_t HashTable::calculateSerializedLength() const {
+ uint32_t Size = sizeof(Header);
+
+ int NumBitsP = Present.find_last() + 1;
+ int NumBitsD = Deleted.find_last() + 1;
+
+ // Present bit set number of words, followed by that many actual words.
+ Size += sizeof(uint32_t);
+ Size += alignTo(NumBitsP, sizeof(uint32_t));
+
+ // Deleted bit set number of words, followed by that many actual words.
+ Size += sizeof(uint32_t);
+ Size += alignTo(NumBitsD, sizeof(uint32_t));
+
+ // One (Key, Value) pair for each entry Present.
+ Size += 2 * sizeof(uint32_t) * size();
+
+ return Size;
+}
+
+Error HashTable::commit(BinaryStreamWriter &Writer) const {
+ Header H;
+ H.Size = size();
+ H.Capacity = capacity();
+ if (auto EC = Writer.writeObject(H))
+ return EC;
+
+ if (auto EC = writeSparseBitVector(Writer, Present))
+ return EC;
+
+ if (auto EC = writeSparseBitVector(Writer, Deleted))
+ return EC;
+
+ for (const auto &Entry : *this) {
+ if (auto EC = Writer.writeInteger(Entry.first))
+ return EC;
+ if (auto EC = Writer.writeInteger(Entry.second))
+ return EC;
+ }
+ return Error::success();
+}
+
+void HashTable::clear() {
+ Buckets.resize(8);
+ Present.clear();
+ Deleted.clear();
+}
+
+uint32_t HashTable::capacity() const { return Buckets.size(); }
+
+uint32_t HashTable::size() const { return Present.count(); }
+
+HashTableIterator HashTable::begin() const { return HashTableIterator(*this); }
+
+HashTableIterator HashTable::end() const {
+ return HashTableIterator(*this, 0, true);
+}
+
+HashTableIterator HashTable::find(uint32_t K) {
+ uint32_t H = K % capacity();
+ uint32_t I = H;
+ Optional<uint32_t> FirstUnused;
+ do {
+ if (isPresent(I)) {
+ if (Buckets[I].first == K)
+ return HashTableIterator(*this, I, false);
+ } else {
+ if (!FirstUnused)
+ FirstUnused = I;
+ // Insertion occurs via linear probing from the slot hint, and will be
+ // inserted at the first empty / deleted location. Therefore, if we are
+ // probing and find a location that is neither present nor deleted, then
+ // nothing must have EVER been inserted at this location, and thus it is
+ // not possible for a matching value to occur later.
+ if (!isDeleted(I))
+ break;
+ }
+ I = (I + 1) % capacity();
+ } while (I != H);
+
+ // The only way FirstUnused would not be set is if every single entry in the
+ // table were Present. But this would violate the load factor constraints
+ // that we impose, so it should never happen.
+ assert(FirstUnused);
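+  // Not found: return an end iterator that still records the first unused
+  // slot, which set() uses as the insertion point.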
+ return HashTableIterator(*this, *FirstUnused, true);
+}
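+
+// A worked probe sequence (illustrative): with capacity() == 8, key 13
+// hashes to bucket 5. If bucket 5 holds some other key, the probe continues
+// through buckets 6, 7, 0, 1, ... and stops at the first never-occupied
+// bucket, or after wrapping all the way back around to bucket 5.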
+
+void HashTable::set(uint32_t K, uint32_t V) {
+ auto Entry = find(K);
+ if (Entry != end()) {
+ assert(isPresent(Entry.index()));
+ assert(Buckets[Entry.index()].first == K);
+ // We're updating, no need to do anything special.
+ Buckets[Entry.index()].second = V;
+ return;
+ }
+
+ auto &B = Buckets[Entry.index()];
+ assert(!isPresent(Entry.index()));
+ assert(Entry.isEnd());
+ B.first = K;
+ B.second = V;
+ Present.set(Entry.index());
+ Deleted.reset(Entry.index());
+
+ grow();
+
+ assert(find(K) != end());
+}
+
+void HashTable::remove(uint32_t K) {
+ auto Iter = find(K);
+ // It wasn't here to begin with; just exit.
+ if (Iter == end())
+ return;
+
+ assert(Present.test(Iter.index()));
+ assert(!Deleted.test(Iter.index()));
+ Deleted.set(Iter.index());
+ Present.reset(Iter.index());
+}
+
+uint32_t HashTable::get(uint32_t K) {
+ auto I = find(K);
+ assert(I != end());
+ return (*I).second;
+}
+
+uint32_t HashTable::maxLoad(uint32_t capacity) { return capacity * 2 / 3 + 1; }
+
+void HashTable::grow() {
+ uint32_t S = size();
+ if (S < maxLoad(capacity()))
+ return;
+ assert(capacity() != UINT32_MAX && "Can't grow Hash table!");
+
+ uint32_t NewCapacity =
+ (capacity() <= INT32_MAX) ? capacity() * 2 : UINT32_MAX;
+
+ // Growing requires rebuilding the table and re-hashing every item. Make a
+ // copy with a larger capacity, insert everything into the copy, then swap
+ // it in.
+ HashTable NewMap(NewCapacity);
+ for (auto I : Present) {
+ NewMap.set(Buckets[I].first, Buckets[I].second);
+ }
+
+ Buckets.swap(NewMap.Buckets);
+ std::swap(Present, NewMap.Present);
+ std::swap(Deleted, NewMap.Deleted);
+ assert(capacity() == NewCapacity);
+ assert(size() == S);
+}
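+
+// For example (illustrative): a freshly constructed table has capacity 8 and
+// maxLoad(8) == 6, so inserting a sixth element rehashes everything into a
+// table of capacity 16, whose threshold is maxLoad(16) == 11.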
+
+Error HashTable::readSparseBitVector(BinaryStreamReader &Stream,
+ SparseBitVector<> &V) {
+ uint32_t NumWords;
+ if (auto EC = Stream.readInteger(NumWords))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected hash table number of words"));
+
+ for (uint32_t I = 0; I != NumWords; ++I) {
+ uint32_t Word;
+ if (auto EC = Stream.readInteger(Word))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected hash table word"));
+ for (unsigned Idx = 0; Idx < 32; ++Idx)
+ if (Word & (1U << Idx))
+ V.set((I * 32) + Idx);
+ }
+ return Error::success();
+}
+
+Error HashTable::writeSparseBitVector(BinaryStreamWriter &Writer,
+ SparseBitVector<> &Vec) {
+ int ReqBits = Vec.find_last() + 1;
+ uint32_t NumWords = alignTo(ReqBits, sizeof(uint32_t)) / sizeof(uint32_t);
+ if (auto EC = Writer.writeInteger(NumWords))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not write linear map number of words"));
+
+ uint32_t Idx = 0;
+ for (uint32_t I = 0; I != NumWords; ++I) {
+ uint32_t Word = 0;
+ for (uint32_t WordIdx = 0; WordIdx < 32; ++WordIdx, ++Idx) {
+ if (Vec.test(Idx))
+ Word |= (1 << WordIdx);
+ }
+ if (auto EC = Writer.writeInteger(Word))
+ return joinErrors(std::move(EC), make_error<RawError>(
+ raw_error_code::corrupt_file,
+ "Could not write linear map word"));
+ }
+ return Error::success();
+}
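+
+// A note on sizing (an observation about the code above, not upstream
+// documentation): NumWords is alignTo(ReqBits, sizeof(uint32_t)) /
+// sizeof(uint32_t), i.e. one word per four *bits*, which writes more words
+// than the ceil(ReqBits / 32) strictly required. The format still
+// round-trips, because readSparseBitVector() honors whatever word count was
+// stored, and calculateSerializedLength() uses the same arithmetic.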
+
+HashTableIterator::HashTableIterator(const HashTable &Map, uint32_t Index,
+ bool IsEnd)
+ : Map(&Map), Index(Index), IsEnd(IsEnd) {}
+
+HashTableIterator::HashTableIterator(const HashTable &Map) : Map(&Map) {
+ int I = Map.Present.find_first();
+ if (I == -1) {
+ Index = 0;
+ IsEnd = true;
+ } else {
+ Index = static_cast<uint32_t>(I);
+ IsEnd = false;
+ }
+}
+
+HashTableIterator &HashTableIterator::operator=(const HashTableIterator &R) {
+ Map = R.Map;
+ return *this;
+}
+
+bool HashTableIterator::operator==(const HashTableIterator &R) const {
+ if (IsEnd && R.IsEnd)
+ return true;
+ if (IsEnd != R.IsEnd)
+ return false;
+
+ return (Map == R.Map) && (Index == R.Index);
+}
+
+const std::pair<uint32_t, uint32_t> &HashTableIterator::operator*() const {
+ assert(Map->Present.test(Index));
+ return Map->Buckets[Index];
+}
+
+HashTableIterator &HashTableIterator::operator++() {
+ while (Index < Map->Buckets.size()) {
+ ++Index;
+ if (Map->Present.test(Index))
+ return *this;
+ }
+
+ IsEnd = true;
+ return *this;
+}
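+
+#if 0
+// A minimal round-trip sketch (illustrative only; assumes the
+// BinaryByteStream headers are available, and is not part of the upstream
+// file): serialize a table into a byte buffer, then load it back.
+static Error roundTripExample() {
+ HashTable Table;
+ Table.set(42, 7);
+ std::vector<uint8_t> Data(Table.calculateSerializedLength());
+ MutableBinaryByteStream OutStream(Data, llvm::support::little);
+ BinaryStreamWriter Writer(OutStream);
+ if (auto EC = Table.commit(Writer))
+ return EC;
+ BinaryByteStream InStream(Data, llvm::support::little);
+ BinaryStreamReader Reader(InStream);
+ HashTable Loaded;
+ return Loaded.load(Reader);
+}
+#endif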
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp
new file mode 100644
index 0000000..8298790
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStream.cpp
@@ -0,0 +1,139 @@
+//===- InfoStream.cpp - PDB Info Stream (Stream 1) Access -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+InfoStream::InfoStream(std::unique_ptr<MappedBlockStream> Stream)
+ : Stream(std::move(Stream)) {}
+
+Error InfoStream::reload() {
+ BinaryStreamReader Reader(*Stream);
+
+ const InfoStreamHeader *H;
+ if (auto EC = Reader.readObject(H))
+ return joinErrors(
+ std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "PDB Stream does not contain a header."));
+
+ switch (H->Version) {
+ case PdbImplVC70:
+ case PdbImplVC80:
+ case PdbImplVC110:
+ case PdbImplVC140:
+ break;
+ default:
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unsupported PDB stream version.");
+ }
+
+ Version = H->Version;
+ Signature = H->Signature;
+ Age = H->Age;
+ Guid = H->Guid;
+
+ uint32_t Offset = Reader.getOffset();
+ if (auto EC = NamedStreams.load(Reader))
+ return EC;
+ uint32_t NewOffset = Reader.getOffset();
+ NamedStreamMapByteSize = NewOffset - Offset;
+
+ Reader.setOffset(Offset);
+ if (auto EC = Reader.readSubstream(SubNamedStreams, NamedStreamMapByteSize))
+ return EC;
+
+ bool Stop = false;
+ while (!Stop && !Reader.empty()) {
+ PdbRaw_FeatureSig Sig;
+ if (auto EC = Reader.readEnum(Sig))
+ return EC;
+ // Since this value comes from a file, it's possible we have some strange
+ // value which doesn't correspond to any value. We don't want to warn on
+ // -Wcovered-switch-default in this case, so switch on the integral value
+ // instead of the enumeration value.
+ switch (uint32_t(Sig)) {
+ case uint32_t(PdbRaw_FeatureSig::VC110):
+ // No other feature flags follow the VC110 signature, so stop after it.
+ Stop = true;
+ LLVM_FALLTHROUGH;
+ case uint32_t(PdbRaw_FeatureSig::VC140):
+ Features |= PdbFeatureContainsIdStream;
+ break;
+ case uint32_t(PdbRaw_FeatureSig::NoTypeMerge):
+ Features |= PdbFeatureNoTypeMerging;
+ break;
+ case uint32_t(PdbRaw_FeatureSig::MinimalDebugInfo):
+ Features |= PdbFeatureMinimalDebugInfo;
+ break;
+ default:
+ continue;
+ }
+ FeatureSignatures.push_back(Sig);
+ }
+ return Error::success();
+}
+
+uint32_t InfoStream::getStreamSize() const { return Stream->getLength(); }
+
+uint32_t InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
+ uint32_t Result;
+ if (!NamedStreams.get(Name, Result))
+ return 0;
+ return Result;
+}
+
+iterator_range<StringMapConstIterator<uint32_t>>
+InfoStream::named_streams() const {
+ return NamedStreams.entries();
+}
+
+bool InfoStream::containsIdStream() const {
+ return !!(Features & PdbFeatureContainsIdStream);
+}
+
+PdbRaw_ImplVer InfoStream::getVersion() const {
+ return static_cast<PdbRaw_ImplVer>(Version);
+}
+
+uint32_t InfoStream::getSignature() const { return Signature; }
+
+uint32_t InfoStream::getAge() const { return Age; }
+
+GUID InfoStream::getGuid() const { return Guid; }
+
+uint32_t InfoStream::getNamedStreamMapByteSize() const {
+ return NamedStreamMapByteSize;
+}
+
+PdbRaw_Features InfoStream::getFeatures() const { return Features; }
+
+ArrayRef<PdbRaw_FeatureSig> InfoStream::getFeatureSignatures() const {
+ return FeatureSignatures;
+}
+
+const NamedStreamMap &InfoStream::getNamedStreams() const {
+ return NamedStreams;
+}
+
+BinarySubstreamRef InfoStream::getNamedStreamsBuffer() const {
+ return SubNamedStreams;
+}
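+
+#if 0
+// Illustrative only (not part of the upstream file): list every named stream
+// recorded in the info stream together with its stream index.
+static void dumpNamedStreams(const InfoStream &IS, raw_ostream &OS) {
+ for (const auto &Entry : IS.named_streams())
+ OS << Entry.getKey() << " -> " << Entry.getValue() << "\n";
+}
+#endif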
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
new file mode 100644
index 0000000..6450ae7
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
@@ -0,0 +1,74 @@
+//===- InfoStreamBuilder.cpp - PDB Info Stream Creation ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
+
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+InfoStreamBuilder::InfoStreamBuilder(msf::MSFBuilder &Msf,
+ NamedStreamMap &NamedStreams)
+ : Msf(Msf), Ver(PdbRaw_ImplVer::PdbImplVC70), Sig(-1), Age(0),
+ NamedStreams(NamedStreams) {}
+
+void InfoStreamBuilder::setVersion(PdbRaw_ImplVer V) { Ver = V; }
+
+void InfoStreamBuilder::setSignature(uint32_t S) { Sig = S; }
+
+void InfoStreamBuilder::setAge(uint32_t A) { Age = A; }
+
+void InfoStreamBuilder::setGuid(GUID G) { Guid = G; }
+
+void InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) {
+ Features.push_back(Sig);
+}
+
+Error InfoStreamBuilder::finalizeMsfLayout() {
+ uint32_t Length = sizeof(InfoStreamHeader) + NamedStreams.finalize() +
+ (Features.size() + 1) * sizeof(uint32_t);
+ if (auto EC = Msf.setStreamSize(StreamPDB, Length))
+ return EC;
+ return Error::success();
+}
+
+Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
+ WritableBinaryStreamRef Buffer) const {
+ auto InfoS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, StreamPDB, Msf.getAllocator());
+ BinaryStreamWriter Writer(*InfoS);
+
+ InfoStreamHeader H;
+ H.Age = Age;
+ H.Signature = Sig;
+ H.Version = Ver;
+ H.Guid = Guid;
+ if (auto EC = Writer.writeObject(H))
+ return EC;
+
+ if (auto EC = NamedStreams.commit(Writer))
+ return EC;
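+ // The named stream map is followed by a single zero word; the "+ 1" in
+ // finalizeMsfLayout() reserves space for it.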
+ if (auto EC = Writer.writeInteger(0))
+ return EC;
+ for (auto E : Features) {
+ if (auto EC = Writer.writeEnum(E))
+ return EC;
+ }
+ return Error::success();
+}
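+
+#if 0
+// Illustrative only (not part of the upstream file): a typical configuration
+// sequence before the PDB file builder runs finalizeMsfLayout() and
+// commit(). The values shown are placeholders.
+static void configure(InfoStreamBuilder &Builder) {
+ Builder.setVersion(PdbRaw_ImplVer::PdbImplVC70);
+ Builder.setSignature(0x12345678);
+ Builder.setAge(1);
+ Builder.addFeature(PdbRaw_FeatureSig::VC140);
+}
+#endif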
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
new file mode 100644
index 0000000..2e1f61c
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/ModuleDebugStream.cpp
@@ -0,0 +1,123 @@
+//===- ModuleDebugStream.cpp - PDB Module Info Stream Access --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/ModuleDebugStream.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+ModuleDebugStreamRef::ModuleDebugStreamRef(
+ const DbiModuleDescriptor &Module,
+ std::unique_ptr<MappedBlockStream> Stream)
+ : Mod(Module), Stream(std::move(Stream)) {}
+
+ModuleDebugStreamRef::~ModuleDebugStreamRef() = default;
+
+Error ModuleDebugStreamRef::reload() {
+ BinaryStreamReader Reader(*Stream);
+
+ uint32_t SymbolSize = Mod.getSymbolDebugInfoByteSize();
+ uint32_t C11Size = Mod.getC11LineInfoByteSize();
+ uint32_t C13Size = Mod.getC13LineInfoByteSize();
+
+ if (C11Size > 0 && C13Size > 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Module has both C11 and C13 line info");
+
+ if (auto EC = Reader.readInteger(Signature))
+ return EC;
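+ // The recorded symbol substream size includes the 4-byte signature that
+ // was just read.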
+ if (auto EC = Reader.readSubstream(SymbolsSubstream, SymbolSize - 4))
+ return EC;
+ if (auto EC = Reader.readSubstream(C11LinesSubstream, C11Size))
+ return EC;
+ if (auto EC = Reader.readSubstream(C13LinesSubstream, C13Size))
+ return EC;
+
+ BinaryStreamReader SymbolReader(SymbolsSubstream.StreamData);
+ if (auto EC =
+ SymbolReader.readArray(SymbolArray, SymbolReader.bytesRemaining()))
+ return EC;
+
+ BinaryStreamReader SubsectionsReader(C13LinesSubstream.StreamData);
+ if (auto EC = SubsectionsReader.readArray(Subsections,
+ SubsectionsReader.bytesRemaining()))
+ return EC;
+
+ uint32_t GlobalRefsSize;
+ if (auto EC = Reader.readInteger(GlobalRefsSize))
+ return EC;
+ if (auto EC = Reader.readSubstream(GlobalRefsSubstream, GlobalRefsSize))
+ return EC;
+ if (Reader.bytesRemaining() > 0)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unexpected bytes in module stream.");
+
+ return Error::success();
+}
+
+BinarySubstreamRef ModuleDebugStreamRef::getSymbolsSubstream() const {
+ return SymbolsSubstream;
+}
+
+BinarySubstreamRef ModuleDebugStreamRef::getC11LinesSubstream() const {
+ return C11LinesSubstream;
+}
+
+BinarySubstreamRef ModuleDebugStreamRef::getC13LinesSubstream() const {
+ return C13LinesSubstream;
+}
+
+BinarySubstreamRef ModuleDebugStreamRef::getGlobalRefsSubstream() const {
+ return GlobalRefsSubstream;
+}
+
+iterator_range<codeview::CVSymbolArray::Iterator>
+ModuleDebugStreamRef::symbols(bool *HadError) const {
+ return make_range(SymbolArray.begin(HadError), SymbolArray.end());
+}
+
+iterator_range<ModuleDebugStreamRef::DebugSubsectionIterator>
+ModuleDebugStreamRef::subsections() const {
+ return make_range(Subsections.begin(), Subsections.end());
+}
+
+bool ModuleDebugStreamRef::hasDebugSubsections() const {
+ return !C13LinesSubstream.empty();
+}
+
+Error ModuleDebugStreamRef::commit() { return Error::success(); }
+
+Expected<codeview::DebugChecksumsSubsectionRef>
+ModuleDebugStreamRef::findChecksumsSubsection() const {
+ codeview::DebugChecksumsSubsectionRef Result;
+ for (const auto &SS : subsections()) {
+ if (SS.kind() != DebugSubsectionKind::FileChecksums)
+ continue;
+
+ if (auto EC = Result.initialize(SS.getRecordData()))
+ return std::move(EC);
+ return Result;
+ }
+ return Result;
+}
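+
+#if 0
+// Illustrative only (not part of the upstream file): fetch a module's file
+// checksums, tolerating modules that have no C13 subsections at all.
+static void useChecksums(const ModuleDebugStreamRef &MDS) {
+ auto ExpectedChecksums = MDS.findChecksumsSubsection();
+ if (!ExpectedChecksums) {
+ consumeError(ExpectedChecksums.takeError());
+ return;
+ }
+ // Note: an empty-but-valid result is returned when no FileChecksums
+ // subsection exists, so callers must also check that it was initialized.
+}
+#endif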
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
new file mode 100644
index 0000000..6cdf6dd
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
@@ -0,0 +1,152 @@
+//===- NamedStreamMap.cpp - PDB Named Stream Map --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/DebugInfo/PDB/Native/HashTable.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+// FIXME: This shouldn't be necessary, but if we insert the strings in any
+// other order, cvdump cannot read the generated name map. This suggests that
+// we may be using the wrong hash function. A closer inspection of the cvdump
+ // source code may reveal something, but for now this at least makes things
+ // work, even if only by accident.
+static constexpr const char *OrderedStreamNames[] = {"/LinkInfo", "/names",
+ "/src/headerblock"};
+
+NamedStreamMap::NamedStreamMap() = default;
+
+Error NamedStreamMap::load(BinaryStreamReader &Stream) {
+ Mapping.clear();
+ FinalizedHashTable.clear();
+ FinalizedInfo.reset();
+
+ uint32_t StringBufferSize;
+ if (auto EC = Stream.readInteger(StringBufferSize))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected string buffer size"));
+
+ BinaryStreamRef StringsBuffer;
+ if (auto EC = Stream.readStreamRef(StringsBuffer, StringBufferSize))
+ return EC;
+
+ HashTable OffsetIndexMap;
+ if (auto EC = OffsetIndexMap.load(Stream))
+ return EC;
+
+ uint32_t NameOffset;
+ uint32_t NameIndex;
+ for (const auto &Entry : OffsetIndexMap) {
+ std::tie(NameOffset, NameIndex) = Entry;
+
+ // Compute the offset of the start of the string relative to the stream.
+ BinaryStreamReader NameReader(StringsBuffer);
+ NameReader.setOffset(NameOffset);
+ // Read the null-terminated name out of the stream.
+ StringRef Str;
+ if (auto EC = NameReader.readCString(Str))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Expected name map name"));
+
+ // Add this to a string-map from name to stream number.
+ Mapping.insert({Str, NameIndex});
+ }
+
+ return Error::success();
+}
+
+Error NamedStreamMap::commit(BinaryStreamWriter &Writer) const {
+ assert(FinalizedInfo.hasValue());
+
+ // The first field is the number of bytes of string data.
+ if (auto EC = Writer.writeInteger(FinalizedInfo->StringDataBytes))
+ return EC;
+
+ for (const auto &Name : OrderedStreamNames) {
+ auto Item = Mapping.find(Name);
+ if (Item == Mapping.end())
+ continue;
+ if (auto EC = Writer.writeCString(Item->getKey()))
+ return EC;
+ }
+
+ // And finally the Offset Index map.
+ if (auto EC = FinalizedHashTable.commit(Writer))
+ return EC;
+
+ return Error::success();
+}
+
+uint32_t NamedStreamMap::finalize() {
+ if (FinalizedInfo.hasValue())
+ return FinalizedInfo->SerializedLength;
+
+ // Build the finalized hash table.
+ FinalizedHashTable.clear();
+ FinalizedInfo.emplace();
+
+ for (const auto &Name : OrderedStreamNames) {
+ auto Item = Mapping.find(Name);
+ if (Item == Mapping.end())
+ continue;
+ FinalizedHashTable.set(FinalizedInfo->StringDataBytes, Item->getValue());
+ FinalizedInfo->StringDataBytes += Item->getKeyLength() + 1;
+ }
+
+ // Number of bytes of string data.
+ FinalizedInfo->SerializedLength += sizeof(support::ulittle32_t);
+ // Followed by that many actual bytes of string data.
+ FinalizedInfo->SerializedLength += FinalizedInfo->StringDataBytes;
+ // Followed by the mapping from Offset to Index.
+ FinalizedInfo->SerializedLength +=
+ FinalizedHashTable.calculateSerializedLength();
+ return FinalizedInfo->SerializedLength;
+}
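+
+// For illustration (derived from the code above, not upstream
+// documentation): if the only mapping is {"/names", 5}, StringDataBytes is 7
+// (six characters plus the terminating NUL), so the serialized map is a
+// 4-byte length, 7 bytes of string data, and then the offset-to-index hash
+// table built here.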
+
+iterator_range<StringMapConstIterator<uint32_t>>
+NamedStreamMap::entries() const {
+ return make_range<StringMapConstIterator<uint32_t>>(Mapping.begin(),
+ Mapping.end());
+}
+
+uint32_t NamedStreamMap::size() const { return Mapping.size(); }
+
+bool NamedStreamMap::get(StringRef Stream, uint32_t &StreamNo) const {
+ auto Iter = Mapping.find(Stream);
+ if (Iter == Mapping.end())
+ return false;
+ StreamNo = Iter->second;
+ return true;
+}
+
+void NamedStreamMap::set(StringRef Stream, uint32_t StreamNo) {
+ FinalizedInfo.reset();
+ Mapping[Stream] = StreamNo;
+}
+
+void NamedStreamMap::remove(StringRef Stream) {
+ FinalizedInfo.reset();
+ Mapping.erase(Stream);
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeBuiltinSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeBuiltinSymbol.cpp
new file mode 100644
index 0000000..60416f6
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeBuiltinSymbol.cpp
@@ -0,0 +1,48 @@
+//===- NativeBuiltinSymbol.cpp -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeBuiltinSymbol.h"
+
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeBuiltinSymbol::NativeBuiltinSymbol(NativeSession &PDBSession,
+ SymIndexId Id, PDB_BuiltinType T,
+ uint64_t L)
+ : NativeRawSymbol(PDBSession, Id), Session(PDBSession), Type(T), Length(L) {
+}
+
+NativeBuiltinSymbol::~NativeBuiltinSymbol() = default;
+
+std::unique_ptr<NativeRawSymbol> NativeBuiltinSymbol::clone() const {
+ return llvm::make_unique<NativeBuiltinSymbol>(Session, SymbolId, Type, Length);
+}
+
+void NativeBuiltinSymbol::dump(raw_ostream &OS, int Indent) const {
+ // TODO: Apparently nothing needs this yet.
+}
+
+PDB_SymType NativeBuiltinSymbol::getSymTag() const {
+ return PDB_SymType::BuiltinType;
+}
+
+PDB_BuiltinType NativeBuiltinSymbol::getBuiltinType() const { return Type; }
+
+bool NativeBuiltinSymbol::isConstType() const { return false; }
+
+uint64_t NativeBuiltinSymbol::getLength() const { return Length; }
+
+bool NativeBuiltinSymbol::isUnalignedType() const { return false; }
+
+bool NativeBuiltinSymbol::isVolatileType() const { return false; }
+
+} // namespace pdb
+} // namespace llvm
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
new file mode 100644
index 0000000..7132a99
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -0,0 +1,50 @@
+//===- NativeCompilandSymbol.cpp - Native impl for compilands ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+
+#include "llvm/ADT/STLExtras.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeCompilandSymbol::NativeCompilandSymbol(NativeSession &Session,
+ SymIndexId SymbolId,
+ DbiModuleDescriptor MI)
+ : NativeRawSymbol(Session, SymbolId), Module(MI) {}
+
+PDB_SymType NativeCompilandSymbol::getSymTag() const {
+ return PDB_SymType::Compiland;
+}
+
+std::unique_ptr<NativeRawSymbol> NativeCompilandSymbol::clone() const {
+ return llvm::make_unique<NativeCompilandSymbol>(Session, SymbolId, Module);
+}
+
+bool NativeCompilandSymbol::isEditAndContinueEnabled() const {
+ return Module.hasECInfo();
+}
+
+uint32_t NativeCompilandSymbol::getLexicalParentId() const { return 0; }
+
+// The usage of getObjFileName for getLibraryName and getModuleName for getName
+// may seem backwards, but it is consistent with DIA, which is what this API
+// was modeled after. We may rename these methods later to try to eliminate
+// this potential confusion.
+
+std::string NativeCompilandSymbol::getLibraryName() const {
+ return Module.getObjFileName();
+}
+
+std::string NativeCompilandSymbol::getName() const {
+ return Module.getModuleName();
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
new file mode 100644
index 0000000..a65782e
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
@@ -0,0 +1,51 @@
+//==- NativeEnumModules.cpp - Native Symbol Enumerator impl ------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
+
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
+#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeEnumModules::NativeEnumModules(NativeSession &PDBSession,
+ const DbiModuleList &Modules,
+ uint32_t Index)
+ : Session(PDBSession), Modules(Modules), Index(Index) {}
+
+uint32_t NativeEnumModules::getChildCount() const {
+ return static_cast<uint32_t>(Modules.getModuleCount());
+}
+
+std::unique_ptr<PDBSymbol>
+NativeEnumModules::getChildAtIndex(uint32_t Index) const {
+ if (Index >= Modules.getModuleCount())
+ return nullptr;
+ return Session.createCompilandSymbol(Modules.getModuleDescriptor(Index));
+}
+
+std::unique_ptr<PDBSymbol> NativeEnumModules::getNext() {
+ if (Index >= Modules.getModuleCount())
+ return nullptr;
+ return getChildAtIndex(Index++);
+}
+
+void NativeEnumModules::reset() { Index = 0; }
+
+NativeEnumModules *NativeEnumModules::clone() const {
+ return new NativeEnumModules(Session, Modules, Index);
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
new file mode 100644
index 0000000..3241000
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
@@ -0,0 +1,84 @@
+//===- NativeExeSymbol.cpp - native impl for PDBSymbolExe -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeExeSymbol::NativeExeSymbol(NativeSession &Session, SymIndexId SymbolId)
+ : NativeRawSymbol(Session, SymbolId), File(Session.getPDBFile()) {}
+
+std::unique_ptr<NativeRawSymbol> NativeExeSymbol::clone() const {
+ return llvm::make_unique<NativeExeSymbol>(Session, SymbolId);
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeExeSymbol::findChildren(PDB_SymType Type) const {
+ switch (Type) {
+ case PDB_SymType::Compiland: {
+ auto Dbi = File.getPDBDbiStream();
+ if (Dbi) {
+ const DbiModuleList &Modules = Dbi->modules();
+ return std::unique_ptr<IPDBEnumSymbols>(
+ new NativeEnumModules(Session, Modules));
+ }
+ consumeError(Dbi.takeError());
+ break;
+ }
+ default:
+ break;
+ }
+ return nullptr;
+}
+
+uint32_t NativeExeSymbol::getAge() const {
+ auto IS = File.getPDBInfoStream();
+ if (IS)
+ return IS->getAge();
+ consumeError(IS.takeError());
+ return 0;
+}
+
+std::string NativeExeSymbol::getSymbolsFileName() const {
+ return File.getFilePath();
+}
+
+codeview::GUID NativeExeSymbol::getGuid() const {
+ auto IS = File.getPDBInfoStream();
+ if (IS)
+ return IS->getGuid();
+ consumeError(IS.takeError());
+ return codeview::GUID{{0}};
+}
+
+bool NativeExeSymbol::hasCTypes() const {
+ auto Dbi = File.getPDBDbiStream();
+ if (Dbi)
+ return Dbi->hasCTypes();
+ consumeError(Dbi.takeError());
+ return false;
+}
+
+bool NativeExeSymbol::hasPrivateSymbols() const {
+ auto Dbi = File.getPDBDbiStream();
+ if (Dbi)
+ return !Dbi->isStripped();
+ consumeError(Dbi.takeError());
+ return false;
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
new file mode 100644
index 0000000..df3f418
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
@@ -0,0 +1,694 @@
+//===- NativeRawSymbol.cpp - Native implementation of IPDBRawSymbol -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+NativeRawSymbol::NativeRawSymbol(NativeSession &PDBSession, SymIndexId SymbolId)
+ : Session(PDBSession), SymbolId(SymbolId) {}
+
+void NativeRawSymbol::dump(raw_ostream &OS, int Indent) const {}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildren(PDB_SymType Type) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildren(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
+ PDB_NameSearchFlags Flags, uint32_t RVA) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findInlineFramesByRVA(uint32_t RVA) const {
+ return nullptr;
+}
+
+void NativeRawSymbol::getDataBytes(SmallVector<uint8_t, 32> &bytes) const {
+ bytes.clear();
+}
+
+PDB_MemberAccess NativeRawSymbol::getAccess() const {
+ return PDB_MemberAccess::Private;
+}
+
+uint32_t NativeRawSymbol::getAddressOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getAddressSection() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getAge() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getArrayIndexTypeId() const {
+ return 0;
+}
+
+void NativeRawSymbol::getBackEndVersion(VersionInfo &Version) const {
+ Version.Major = 0;
+ Version.Minor = 0;
+ Version.Build = 0;
+ Version.QFE = 0;
+}
+
+uint32_t NativeRawSymbol::getBaseDataOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getBaseDataSlot() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getBaseSymbolId() const {
+ return 0;
+}
+
+PDB_BuiltinType NativeRawSymbol::getBuiltinType() const {
+ return PDB_BuiltinType::None;
+}
+
+uint32_t NativeRawSymbol::getBitPosition() const {
+ return 0;
+}
+
+PDB_CallingConv NativeRawSymbol::getCallingConvention() const {
+ return PDB_CallingConv::FarStdCall;
+}
+
+uint32_t NativeRawSymbol::getClassParentId() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getCompilerName() const {
+ return {};
+}
+
+uint32_t NativeRawSymbol::getCount() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getCountLiveRanges() const {
+ return 0;
+}
+
+void NativeRawSymbol::getFrontEndVersion(VersionInfo &Version) const {
+ Version.Major = 0;
+ Version.Minor = 0;
+ Version.Build = 0;
+ Version.QFE = 0;
+}
+
+PDB_Lang NativeRawSymbol::getLanguage() const {
+ return PDB_Lang::Cobol;
+}
+
+uint32_t NativeRawSymbol::getLexicalParentId() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getLibraryName() const {
+ return {};
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartAddressOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartAddressSection() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartRelativeVirtualAddress() const {
+ return 0;
+}
+
+codeview::RegisterId NativeRawSymbol::getLocalBasePointerRegisterId() const {
+ return codeview::RegisterId::EAX;
+}
+
+uint32_t NativeRawSymbol::getLowerBoundId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getMemorySpaceKind() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getName() const {
+ return {};
+}
+
+uint32_t NativeRawSymbol::getNumberOfAcceleratorPointerTags() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfColumns() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfModifiers() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfRegisterIndices() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfRows() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getObjectFileName() const {
+ return {};
+}
+
+uint32_t NativeRawSymbol::getOemId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getOemSymbolId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getOffsetInUdt() const {
+ return 0;
+}
+
+PDB_Cpu NativeRawSymbol::getPlatform() const {
+ return PDB_Cpu::Intel8080;
+}
+
+uint32_t NativeRawSymbol::getRank() const {
+ return 0;
+}
+
+codeview::RegisterId NativeRawSymbol::getRegisterId() const {
+ return codeview::RegisterId::EAX;
+}
+
+uint32_t NativeRawSymbol::getRegisterType() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getRelativeVirtualAddress() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSamplerSlot() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSignature() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSizeInUdt() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSlot() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getSourceFileName() const {
+ return {};
+}
+
+uint32_t NativeRawSymbol::getStride() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getSubTypeId() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getSymbolsFileName() const { return {}; }
+
+uint32_t NativeRawSymbol::getSymIndexId() const { return SymbolId; }
+
+uint32_t NativeRawSymbol::getTargetOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetRelativeVirtualAddress() const {
+ return 0;
+}
+
+uint64_t NativeRawSymbol::getTargetVirtualAddress() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetSection() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTextureSlot() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTimeStamp() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getToken() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getTypeId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getUavSlot() const {
+ return 0;
+}
+
+std::string NativeRawSymbol::getUndecoratedName() const {
+ return {};
+}
+
+uint32_t NativeRawSymbol::getUnmodifiedTypeId() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getUpperBoundId() const {
+ return 0;
+}
+
+Variant NativeRawSymbol::getValue() const {
+ return Variant();
+}
+
+uint32_t NativeRawSymbol::getVirtualBaseDispIndex() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getVirtualBaseOffset() const {
+ return 0;
+}
+
+uint32_t NativeRawSymbol::getVirtualTableShapeId() const {
+ return 0;
+}
+
+std::unique_ptr<PDBSymbolTypeBuiltin>
+NativeRawSymbol::getVirtualBaseTableType() const {
+ return nullptr;
+}
+
+PDB_DataKind NativeRawSymbol::getDataKind() const {
+ return PDB_DataKind::Unknown;
+}
+
+PDB_SymType NativeRawSymbol::getSymTag() const {
+ return PDB_SymType::None;
+}
+
+codeview::GUID NativeRawSymbol::getGuid() const { return codeview::GUID{{0}}; }
+
+int32_t NativeRawSymbol::getOffset() const {
+ return 0;
+}
+
+int32_t NativeRawSymbol::getThisAdjust() const {
+ return 0;
+}
+
+int32_t NativeRawSymbol::getVirtualBasePointerOffset() const {
+ return 0;
+}
+
+PDB_LocType NativeRawSymbol::getLocationType() const {
+ return PDB_LocType::Null;
+}
+
+PDB_Machine NativeRawSymbol::getMachineType() const {
+ return PDB_Machine::Invalid;
+}
+
+codeview::ThunkOrdinal NativeRawSymbol::getThunkOrdinal() const {
+ return codeview::ThunkOrdinal::Standard;
+}
+
+uint64_t NativeRawSymbol::getLength() const {
+ return 0;
+}
+
+uint64_t NativeRawSymbol::getLiveRangeLength() const {
+ return 0;
+}
+
+uint64_t NativeRawSymbol::getVirtualAddress() const {
+ return 0;
+}
+
+PDB_UdtType NativeRawSymbol::getUdtKind() const {
+ return PDB_UdtType::Struct;
+}
+
+bool NativeRawSymbol::hasConstructor() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasCustomCallingConvention() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasFarReturn() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCompilerGenerated() const {
+ return false;
+}
+
+bool NativeRawSymbol::isConstType() const {
+ return false;
+}
+
+bool NativeRawSymbol::isEditAndContinueEnabled() const {
+ return false;
+}
+
+bool NativeRawSymbol::isFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::getAddressTaken() const {
+ return false;
+}
+
+bool NativeRawSymbol::getNoStackOrdering() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasAlloca() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasAssignmentOperator() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasCTypes() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasCastOperator() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasDebugInfo() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasEH() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasEHa() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasInlAsm() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasInlineAttribute() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasInterruptReturn() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasFramePointer() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasLongJump() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasManagedCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasNestedTypes() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasNoInlineAttribute() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasNoReturnAttribute() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasOptimizedCodeDebugInfo() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasOverloadedOperator() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasSEH() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasSecurityChecks() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasSetJump() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasStrictGSCheck() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAcceleratorGroupSharedLocal() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAcceleratorPointerTagLiveRange() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAcceleratorStubFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::isAggregated() const {
+ return false;
+}
+
+bool NativeRawSymbol::isIntroVirtualFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCVTCIL() const {
+ return false;
+}
+
+bool NativeRawSymbol::isConstructorVirtualBase() const {
+ return false;
+}
+
+bool NativeRawSymbol::isCxxReturnUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isDataAligned() const {
+ return false;
+}
+
+bool NativeRawSymbol::isHLSLData() const {
+ return false;
+}
+
+bool NativeRawSymbol::isHotpatchable() const {
+ return false;
+}
+
+bool NativeRawSymbol::isIndirectVirtualBaseClass() const {
+ return false;
+}
+
+bool NativeRawSymbol::isInterfaceUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isIntrinsic() const {
+ return false;
+}
+
+bool NativeRawSymbol::isLTCG() const {
+ return false;
+}
+
+bool NativeRawSymbol::isLocationControlFlowDependent() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMSILNetmodule() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMatrixRowMajor() const {
+ return false;
+}
+
+bool NativeRawSymbol::isManagedCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMSILCode() const {
+ return false;
+}
+
+bool NativeRawSymbol::isMultipleInheritance() const {
+ return false;
+}
+
+bool NativeRawSymbol::isNaked() const {
+ return false;
+}
+
+bool NativeRawSymbol::isNested() const {
+ return false;
+}
+
+bool NativeRawSymbol::isOptimizedAway() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPacked() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPointerBasedOnSymbolValue() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPointerToDataMember() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPointerToMemberFunction() const {
+ return false;
+}
+
+bool NativeRawSymbol::isPureVirtual() const {
+ return false;
+}
+
+bool NativeRawSymbol::isRValueReference() const {
+ return false;
+}
+
+bool NativeRawSymbol::isRefUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isReference() const {
+ return false;
+}
+
+bool NativeRawSymbol::isRestrictedType() const {
+ return false;
+}
+
+bool NativeRawSymbol::isReturnValue() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSafeBuffers() const {
+ return false;
+}
+
+bool NativeRawSymbol::isScoped() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSdl() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSingleInheritance() const {
+ return false;
+}
+
+bool NativeRawSymbol::isSplitted() const {
+ return false;
+}
+
+bool NativeRawSymbol::isStatic() const {
+ return false;
+}
+
+bool NativeRawSymbol::hasPrivateSymbols() const {
+ return false;
+}
+
+bool NativeRawSymbol::isUnalignedType() const {
+ return false;
+}
+
+bool NativeRawSymbol::isUnreached() const {
+ return false;
+}
+
+bool NativeRawSymbol::isValueUdt() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVirtual() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVirtualBaseClass() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVirtualInheritance() const {
+ return false;
+}
+
+bool NativeRawSymbol::isVolatileType() const {
+ return false;
+}
+
+bool NativeRawSymbol::wasInlined() const {
+ return false;
+}
+
+std::string NativeRawSymbol::getUnused() const {
+ return {};
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
new file mode 100644
index 0000000..76de0d8
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -0,0 +1,218 @@
+//===- NativeSession.cpp - Native implementation of IPDBSession -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/NativeBuiltinSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+namespace {
+// Maps codeview::SimpleTypeKind of a built-in type to the parameters necessary
+// to instantiate a NativeBuiltinSymbol for that type.
+static const struct BuiltinTypeEntry {
+ codeview::SimpleTypeKind Kind;
+ PDB_BuiltinType Type;
+ uint32_t Size;
+} BuiltinTypes[] = {
+ {codeview::SimpleTypeKind::Int32, PDB_BuiltinType::Int, 4},
+ {codeview::SimpleTypeKind::UInt32, PDB_BuiltinType::UInt, 4},
+ {codeview::SimpleTypeKind::UInt32Long, PDB_BuiltinType::UInt, 4},
+ {codeview::SimpleTypeKind::UInt64Quad, PDB_BuiltinType::UInt, 8},
+ {codeview::SimpleTypeKind::NarrowCharacter, PDB_BuiltinType::Char, 1},
+ {codeview::SimpleTypeKind::SignedCharacter, PDB_BuiltinType::Char, 1},
+ {codeview::SimpleTypeKind::UnsignedCharacter, PDB_BuiltinType::UInt, 1},
+ {codeview::SimpleTypeKind::UInt16Short, PDB_BuiltinType::UInt, 2},
+ {codeview::SimpleTypeKind::Boolean8, PDB_BuiltinType::Bool, 1}
+ // This table can be grown as necessary, but these are the only types we've
+ // needed so far.
+};
+} // namespace
+
+NativeSession::NativeSession(std::unique_ptr<PDBFile> PdbFile,
+ std::unique_ptr<BumpPtrAllocator> Allocator)
+ : Pdb(std::move(PdbFile)), Allocator(std::move(Allocator)) {}
+
+NativeSession::~NativeSession() = default;
+
+Error NativeSession::createFromPdb(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> ErrorOrBuffer =
+ MemoryBuffer::getFileOrSTDIN(Path, /*FileSize=*/-1,
+ /*RequiresNullTerminator=*/false);
+ if (!ErrorOrBuffer)
+ return make_error<GenericError>(generic_error_code::invalid_path);
+
+ std::unique_ptr<MemoryBuffer> Buffer = std::move(*ErrorOrBuffer);
+ auto Stream = llvm::make_unique<MemoryBufferByteStream>(
+ std::move(Buffer), llvm::support::little);
+
+ auto Allocator = llvm::make_unique<BumpPtrAllocator>();
+ auto File = llvm::make_unique<PDBFile>(Path, std::move(Stream), *Allocator);
+ if (auto EC = File->parseFileHeaders())
+ return EC;
+ if (auto EC = File->parseStreamData())
+ return EC;
+
+ Session =
+ llvm::make_unique<NativeSession>(std::move(File), std::move(Allocator));
+
+ return Error::success();
+}
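+
+#if 0
+// Illustrative only (not part of the upstream file): open a PDB on disk and
+// enumerate its compilands through the generic IPDBSession interface.
+static Error openAndDump(StringRef Path, raw_ostream &OS) {
+ std::unique_ptr<IPDBSession> Session;
+ if (auto EC = NativeSession::createFromPdb(Path, Session))
+ return EC;
+ auto Exe = Session->getGlobalScope();
+ if (auto Compilands = Exe->findAllChildren<PDBSymbolCompiland>())
+ while (auto Compiland = Compilands->getNext())
+ OS << Compiland->getName() << "\n";
+ return Error::success();
+}
+#endif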
+
+Error NativeSession::createFromExe(StringRef Path,
+ std::unique_ptr<IPDBSession> &Session) {
+ return make_error<RawError>(raw_error_code::feature_unsupported);
+}
+
+std::unique_ptr<PDBSymbolCompiland>
+NativeSession::createCompilandSymbol(DbiModuleDescriptor MI) {
+ const auto Id = static_cast<SymIndexId>(SymbolCache.size());
+ SymbolCache.push_back(
+ llvm::make_unique<NativeCompilandSymbol>(*this, Id, MI));
+ return llvm::make_unique<PDBSymbolCompiland>(
+ *this, std::unique_ptr<IPDBRawSymbol>(SymbolCache[Id]->clone()));
+}
+
+SymIndexId NativeSession::findSymbolByTypeIndex(codeview::TypeIndex Index) {
+ // First see if it's already in our cache.
+ const auto Entry = TypeIndexToSymbolId.find(Index);
+ if (Entry != TypeIndexToSymbolId.end())
+ return Entry->second;
+
+ // Symbols for built-in types are created on the fly.
+ if (Index.isSimple()) {
+ // FIXME: We will eventually need to handle pointers to other simple types,
+ // which are still simple types in the world of CodeView TypeIndexes.
+ if (Index.getSimpleMode() != codeview::SimpleTypeMode::Direct)
+ return 0;
+ const auto Kind = Index.getSimpleKind();
+ const auto It =
+ std::find_if(std::begin(BuiltinTypes), std::end(BuiltinTypes),
+ [Kind](const BuiltinTypeEntry &Builtin) {
+ return Builtin.Kind == Kind;
+ });
+ if (It == std::end(BuiltinTypes))
+ return 0;
+ SymIndexId Id = SymbolCache.size();
+ SymbolCache.emplace_back(
+ llvm::make_unique<NativeBuiltinSymbol>(*this, Id, It->Type, It->Size));
+ TypeIndexToSymbolId[Index] = Id;
+ return Id;
+ }
+
+ // TODO: Look up PDB type by type index
+
+ return 0;
+}
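+
+// For example (illustrative): the first lookup of the simple TypeIndex for a
+// 32-bit int creates a NativeBuiltinSymbol and caches its SymIndexId; later
+// lookups of the same index return that id, and unsupported indexes yield 0.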
+
+uint64_t NativeSession::getLoadAddress() const { return 0; }
+
+void NativeSession::setLoadAddress(uint64_t Address) {}
+
+std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() {
+ const auto Id = static_cast<SymIndexId>(SymbolCache.size());
+ SymbolCache.push_back(llvm::make_unique<NativeExeSymbol>(*this, Id));
+ auto RawSymbol = SymbolCache[Id]->clone();
+ auto PdbSymbol(PDBSymbol::create(*this, std::move(RawSymbol)));
+ std::unique_ptr<PDBSymbolExe> ExeSymbol(
+ static_cast<PDBSymbolExe *>(PdbSymbol.release()));
+ return ExeSymbol;
+}
+
+std::unique_ptr<PDBSymbol>
+NativeSession::getSymbolById(uint32_t SymbolId) const {
+ // If the caller has a SymbolId, it'd better be in our SymbolCache.
+ return SymbolId < SymbolCache.size()
+ ? PDBSymbol::create(*this, SymbolCache[SymbolId]->clone())
+ : nullptr;
+}
+
+std::unique_ptr<PDBSymbol>
+NativeSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeSession::findLineNumbers(const PDBSymbolCompiland &Compiland,
+ const IPDBSourceFile &File) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+NativeSession::findLineNumbersByAddress(uint64_t Address,
+ uint32_t Length) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles>
+NativeSession::findSourceFiles(const PDBSymbolCompiland *Compiland,
+ StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBSourceFile>
+NativeSession::findOneSourceFile(const PDBSymbolCompiland *Compiland,
+ StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
+NativeSession::findCompilandsForSourceFile(StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbolCompiland>
+NativeSession::findOneCompilandForSourceFile(StringRef Pattern,
+ PDB_NameSearchFlags Flags) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> NativeSession::getAllSourceFiles() const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSourceFiles> NativeSession::getSourceFilesForCompiland(
+ const PDBSymbolCompiland &Compiland) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBSourceFile>
+NativeSession::getSourceFileById(uint32_t FileId) const {
+ return nullptr;
+}
+
+std::unique_ptr<IPDBEnumDataStreams> NativeSession::getDebugStreams() const {
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/PDBFile.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
index 5349151..0b6492e 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/PDBFile.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
@@ -7,24 +7,25 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
#include "llvm/DebugInfo/MSF/MSFCommon.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/GlobalsStream.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/PublicsStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -38,12 +39,18 @@ namespace {
typedef FixedStreamArray<support::ulittle32_t> ulittle_array;
} // end anonymous namespace
-PDBFile::PDBFile(std::unique_ptr<ReadableStream> PdbFileBuffer,
+PDBFile::PDBFile(StringRef Path, std::unique_ptr<BinaryStream> PdbFileBuffer,
BumpPtrAllocator &Allocator)
- : Allocator(Allocator), Buffer(std::move(PdbFileBuffer)) {}
+ : FilePath(Path), Allocator(Allocator), Buffer(std::move(PdbFileBuffer)) {}
PDBFile::~PDBFile() = default;
+StringRef PDBFile::getFilePath() const { return FilePath; }
+
+StringRef PDBFile::getFileDirectory() const {
+ return sys::path::parent_path(FilePath);
+}
+
uint32_t PDBFile::getBlockSize() const { return ContainerLayout.SB->BlockSize; }
uint32_t PDBFile::getFreeBlockMapBlock() const {
@@ -106,7 +113,7 @@ Error PDBFile::setBlockData(uint32_t BlockIndex, uint32_t Offset,
}
Error PDBFile::parseFileHeaders() {
- StreamReader Reader(*Buffer);
+ BinaryStreamReader Reader(*Buffer);
// Initialize SB.
const msf::SuperBlock *SB = nullptr;
@@ -139,8 +146,9 @@ Error PDBFile::parseFileHeaders() {
// at getBlockSize() intervals, so we have to be compatible.
// See the function fpmPn() for more information:
// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/msf/msf.cpp#L489
- auto FpmStream = MappedBlockStream::createFpmStream(ContainerLayout, *Buffer);
- StreamReader FpmReader(*FpmStream);
+ auto FpmStream =
+ MappedBlockStream::createFpmStream(ContainerLayout, *Buffer, Allocator);
+ BinaryStreamReader FpmReader(*FpmStream);
ArrayRef<uint8_t> FpmBytes;
if (auto EC = FpmReader.readBytes(FpmBytes,
msf::getFullFpmByteSize(ContainerLayout)))
@@ -177,8 +185,9 @@ Error PDBFile::parseStreamData() {
// is exactly what we are attempting to parse. By specifying a custom
// subclass of IPDBStreamData which only accesses the fields that have already
// been parsed, we can avoid this and reuse MappedBlockStream.
- auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer);
- StreamReader Reader(*DS);
+ auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer,
+ Allocator);
+ BinaryStreamReader Reader(*DS);
if (auto EC = Reader.readInteger(NumStreams))
return EC;
@@ -221,6 +230,14 @@ ArrayRef<support::ulittle32_t> PDBFile::getDirectoryBlockArray() const {
return ContainerLayout.DirectoryBlocks;
}
+MSFStreamLayout PDBFile::getStreamLayout(uint32_t StreamIdx) const {
+ MSFStreamLayout Result;
+ auto Blocks = getStreamBlockList(StreamIdx);
+ Result.Blocks.assign(Blocks.begin(), Blocks.end());
+ Result.Length = getStreamByteSize(StreamIdx);
+ return Result;
+}
+
Expected<GlobalsStream &> PDBFile::getPDBGlobalsStream() {
if (!Globals) {
auto DbiS = getPDBDbiStream();
@@ -229,7 +246,8 @@ Expected<GlobalsStream &> PDBFile::getPDBGlobalsStream() {
auto GlobalS = safelyCreateIndexedStream(
ContainerLayout, *Buffer, DbiS->getGlobalSymbolStreamIndex());
- if (!GlobalS) return GlobalS.takeError();
+ if (!GlobalS)
+ return GlobalS.takeError();
auto TempGlobals = llvm::make_unique<GlobalsStream>(std::move(*GlobalS));
if (auto EC = TempGlobals->reload())
return std::move(EC);
@@ -241,7 +259,8 @@ Expected<GlobalsStream &> PDBFile::getPDBGlobalsStream() {
Expected<InfoStream &> PDBFile::getPDBInfoStream() {
if (!Info) {
auto InfoS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamPDB);
- if (!InfoS) return InfoS.takeError();
+ if (!InfoS)
+ return InfoS.takeError();
auto TempInfo = llvm::make_unique<InfoStream>(std::move(*InfoS));
if (auto EC = TempInfo->reload())
return std::move(EC);
@@ -253,7 +272,8 @@ Expected<InfoStream &> PDBFile::getPDBInfoStream() {
Expected<DbiStream &> PDBFile::getPDBDbiStream() {
if (!Dbi) {
auto DbiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamDBI);
- if (!DbiS) return DbiS.takeError();
+ if (!DbiS)
+ return DbiS.takeError();
auto TempDbi = llvm::make_unique<DbiStream>(*this, std::move(*DbiS));
if (auto EC = TempDbi->reload())
return std::move(EC);
@@ -265,7 +285,8 @@ Expected<DbiStream &> PDBFile::getPDBDbiStream() {
Expected<TpiStream &> PDBFile::getPDBTpiStream() {
if (!Tpi) {
auto TpiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamTPI);
- if (!TpiS) return TpiS.takeError();
+ if (!TpiS)
+ return TpiS.takeError();
auto TempTpi = llvm::make_unique<TpiStream>(*this, std::move(*TpiS));
if (auto EC = TempTpi->reload())
return std::move(EC);
@@ -277,7 +298,8 @@ Expected<TpiStream &> PDBFile::getPDBTpiStream() {
Expected<TpiStream &> PDBFile::getPDBIpiStream() {
if (!Ipi) {
auto IpiS = safelyCreateIndexedStream(ContainerLayout, *Buffer, StreamIPI);
- if (!IpiS) return IpiS.takeError();
+ if (!IpiS)
+ return IpiS.takeError();
auto TempIpi = llvm::make_unique<TpiStream>(*this, std::move(*IpiS));
if (auto EC = TempIpi->reload())
return std::move(EC);
@@ -294,7 +316,8 @@ Expected<PublicsStream &> PDBFile::getPDBPublicsStream() {
auto PublicS = safelyCreateIndexedStream(
ContainerLayout, *Buffer, DbiS->getPublicSymbolStreamIndex());
- if (!PublicS) return PublicS.takeError();
+ if (!PublicS)
+ return PublicS.takeError();
auto TempPublics =
llvm::make_unique<PublicsStream>(*this, std::move(*PublicS));
if (auto EC = TempPublics->reload())
@@ -313,7 +336,8 @@ Expected<SymbolStream &> PDBFile::getPDBSymbolStream() {
uint32_t SymbolStreamNum = DbiS->getSymRecordStreamIndex();
auto SymbolS =
safelyCreateIndexedStream(ContainerLayout, *Buffer, SymbolStreamNum);
- if (!SymbolS) return SymbolS.takeError();
+ if (!SymbolS)
+ return SymbolS.takeError();
auto TempSymbols = llvm::make_unique<SymbolStream>(std::move(*SymbolS));
if (auto EC = TempSymbols->reload())
@@ -323,8 +347,8 @@ Expected<SymbolStream &> PDBFile::getPDBSymbolStream() {
return *Symbols;
}
-Expected<NameHashTable &> PDBFile::getStringTable() {
- if (!StringTable || !StringTableStream) {
+Expected<PDBStringTable &> PDBFile::getStringTable() {
+ if (!Strings) {
auto IS = getPDBInfoStream();
if (!IS)
return IS.takeError();
@@ -333,23 +357,39 @@ Expected<NameHashTable &> PDBFile::getStringTable() {
auto NS =
safelyCreateIndexedStream(ContainerLayout, *Buffer, NameStreamIndex);
- if (!NS) return NS.takeError();
+ if (!NS)
+ return NS.takeError();
- StreamReader Reader(**NS);
- auto N = llvm::make_unique<NameHashTable>();
- if (auto EC = N->load(Reader))
+ auto N = llvm::make_unique<PDBStringTable>();
+ BinaryStreamReader Reader(**NS);
+ if (auto EC = N->reload(Reader))
return std::move(EC);
- StringTable = std::move(N);
+ assert(Reader.bytesRemaining() == 0);
StringTableStream = std::move(*NS);
+ Strings = std::move(N);
}
- return *StringTable;
+ return *Strings;
+}
+
+uint32_t PDBFile::getPointerSize() {
+ auto DbiS = getPDBDbiStream();
+ if (!DbiS)
+ return 0;
+ PDB_Machine Machine = DbiS->getMachineType();
+ if (Machine == PDB_Machine::Amd64)
+ return 8;
+ return 4;
}
bool PDBFile::hasPDBDbiStream() const { return StreamDBI < getNumStreams(); }
bool PDBFile::hasPDBGlobalsStream() {
auto DbiS = getPDBDbiStream();
- if (!DbiS) return false;
+ if (!DbiS) {
+ consumeError(DbiS.takeError());
+ return false;
+ }
+
return DbiS->getGlobalSymbolStreamIndex() < getNumStreams();
}
@@ -359,33 +399,39 @@ bool PDBFile::hasPDBIpiStream() const { return StreamIPI < getNumStreams(); }
bool PDBFile::hasPDBPublicsStream() {
auto DbiS = getPDBDbiStream();
- if (!DbiS) return false;
+ if (!DbiS) {
+ consumeError(DbiS.takeError());
+ return false;
+ }
return DbiS->getPublicSymbolStreamIndex() < getNumStreams();
}
bool PDBFile::hasPDBSymbolStream() {
auto DbiS = getPDBDbiStream();
- if (!DbiS) return false;
+ if (!DbiS)
+ return false;
return DbiS->getSymRecordStreamIndex() < getNumStreams();
}
bool PDBFile::hasPDBTpiStream() const { return StreamTPI < getNumStreams(); }
-bool PDBFile::hasStringTable() {
+bool PDBFile::hasPDBStringTable() {
auto IS = getPDBInfoStream();
- if (!IS) return false;
+ if (!IS)
+ return false;
return IS->getNamedStreamIndex("/names") < getNumStreams();
}
-/// Wrapper around MappedBlockStream::createIndexedStream()
-/// that checks if a stream with that index actually exists.
-/// If it does not, the return value will have an MSFError with
-/// code msf_error_code::no_stream. Else, the return value will
-/// contain the stream returned by createIndexedStream().
-Expected<std::unique_ptr<MappedBlockStream>> PDBFile::safelyCreateIndexedStream(
- const MSFLayout &Layout, const ReadableStream &MsfData,
- uint32_t StreamIndex) const {
+/// Wrapper around MappedBlockStream::createIndexedStream() that checks if a
+/// stream with that index actually exists. If it does not, the return value
+/// will have an MSFError with code msf_error_code::no_stream. Else, the return
+/// value will contain the stream returned by createIndexedStream().
+Expected<std::unique_ptr<MappedBlockStream>>
+PDBFile::safelyCreateIndexedStream(const MSFLayout &Layout,
+ BinaryStreamRef MsfData,
+ uint32_t StreamIndex) const {
if (StreamIndex >= getNumStreams())
return make_error<RawError>(raw_error_code::no_stream);
- return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex);
+ return MappedBlockStream::createIndexedStream(Layout, MsfData, StreamIndex,
+ Allocator);
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
new file mode 100644
index 0000000..9f35fd7
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -0,0 +1,221 @@
+//===- PDBFileBuilder.cpp - PDB File Creation -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
+
+#include "llvm/ADT/BitVector.h"
+
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/PublicsStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+using namespace llvm::support;
+
+PDBFileBuilder::PDBFileBuilder(BumpPtrAllocator &Allocator)
+ : Allocator(Allocator) {}
+
+PDBFileBuilder::~PDBFileBuilder() {}
+
+Error PDBFileBuilder::initialize(uint32_t BlockSize) {
+ auto ExpectedMsf = MSFBuilder::create(Allocator, BlockSize);
+ if (!ExpectedMsf)
+ return ExpectedMsf.takeError();
+ Msf = llvm::make_unique<MSFBuilder>(std::move(*ExpectedMsf));
+ return Error::success();
+}
+
+MSFBuilder &PDBFileBuilder::getMsfBuilder() { return *Msf; }
+
+InfoStreamBuilder &PDBFileBuilder::getInfoBuilder() {
+ if (!Info)
+ Info = llvm::make_unique<InfoStreamBuilder>(*Msf, NamedStreams);
+ return *Info;
+}
+
+DbiStreamBuilder &PDBFileBuilder::getDbiBuilder() {
+ if (!Dbi)
+ Dbi = llvm::make_unique<DbiStreamBuilder>(*Msf);
+ return *Dbi;
+}
+
+TpiStreamBuilder &PDBFileBuilder::getTpiBuilder() {
+ if (!Tpi)
+ Tpi = llvm::make_unique<TpiStreamBuilder>(*Msf, StreamTPI);
+ return *Tpi;
+}
+
+TpiStreamBuilder &PDBFileBuilder::getIpiBuilder() {
+ if (!Ipi)
+ Ipi = llvm::make_unique<TpiStreamBuilder>(*Msf, StreamIPI);
+ return *Ipi;
+}
+
+PDBStringTableBuilder &PDBFileBuilder::getStringTableBuilder() {
+ return Strings;
+}
+
+PublicsStreamBuilder &PDBFileBuilder::getPublicsBuilder() {
+ if (!Publics)
+ Publics = llvm::make_unique<PublicsStreamBuilder>(*Msf);
+ return *Publics;
+}
+
+Error PDBFileBuilder::addNamedStream(StringRef Name, uint32_t Size) {
+ auto ExpectedStream = Msf->addStream(Size);
+ if (!ExpectedStream)
+ return ExpectedStream.takeError();
+ NamedStreams.set(Name, *ExpectedStream);
+ return Error::success();
+}
+
+Expected<msf::MSFLayout> PDBFileBuilder::finalizeMsfLayout() {
+
+ if (Ipi && Ipi->getRecordCount() > 0) {
+    // In theory newer PDBs always have an ID stream, but by only treating
+    // the ID stream as real when there is at least one ID record, we keep
+    // the ability to test older PDBs, such as those without an ID stream.
+ auto &Info = getInfoBuilder();
+ Info.addFeature(PdbRaw_FeatureSig::VC140);
+ }
+
+ uint32_t StringsLen = Strings.calculateSerializedSize();
+
+ if (auto EC = addNamedStream("/names", StringsLen))
+ return std::move(EC);
+ if (auto EC = addNamedStream("/LinkInfo", 0))
+ return std::move(EC);
+
+ if (Info) {
+ if (auto EC = Info->finalizeMsfLayout())
+ return std::move(EC);
+ }
+ if (Dbi) {
+ if (auto EC = Dbi->finalizeMsfLayout())
+ return std::move(EC);
+ }
+ if (Tpi) {
+ if (auto EC = Tpi->finalizeMsfLayout())
+ return std::move(EC);
+ }
+ if (Ipi) {
+ if (auto EC = Ipi->finalizeMsfLayout())
+ return std::move(EC);
+ }
+ if (Publics) {
+ if (auto EC = Publics->finalizeMsfLayout())
+ return std::move(EC);
+ if (Dbi) {
+ Dbi->setPublicsStreamIndex(Publics->getStreamIndex());
+ Dbi->setSymbolRecordStreamIndex(Publics->getRecordStreamIdx());
+ }
+ }
+
+ return Msf->build();
+}
+
+Expected<uint32_t> PDBFileBuilder::getNamedStreamIndex(StringRef Name) const {
+ uint32_t SN = 0;
+ if (!NamedStreams.get(Name, SN))
+ return llvm::make_error<pdb::RawError>(raw_error_code::no_stream);
+ return SN;
+}
+
+Error PDBFileBuilder::commit(StringRef Filename) {
+ assert(!Filename.empty());
+ auto ExpectedLayout = finalizeMsfLayout();
+ if (!ExpectedLayout)
+ return ExpectedLayout.takeError();
+ auto &Layout = *ExpectedLayout;
+
+ uint64_t Filesize = Layout.SB->BlockSize * Layout.SB->NumBlocks;
+ auto OutFileOrError = FileOutputBuffer::create(Filename, Filesize);
+ if (OutFileOrError.getError())
+ return llvm::make_error<pdb::GenericError>(generic_error_code::invalid_path,
+ Filename);
+ FileBufferByteStream Buffer(std::move(*OutFileOrError),
+ llvm::support::little);
+ BinaryStreamWriter Writer(Buffer);
+
+ if (auto EC = Writer.writeObject(*Layout.SB))
+ return EC;
+ uint32_t BlockMapOffset =
+ msf::blockToOffset(Layout.SB->BlockMapAddr, Layout.SB->BlockSize);
+ Writer.setOffset(BlockMapOffset);
+ if (auto EC = Writer.writeArray(Layout.DirectoryBlocks))
+ return EC;
+
+ auto DirStream = WritableMappedBlockStream::createDirectoryStream(
+ Layout, Buffer, Allocator);
+ BinaryStreamWriter DW(*DirStream);
+ if (auto EC = DW.writeInteger<uint32_t>(Layout.StreamSizes.size()))
+ return EC;
+
+ if (auto EC = DW.writeArray(Layout.StreamSizes))
+ return EC;
+
+ for (const auto &Blocks : Layout.StreamMap) {
+ if (auto EC = DW.writeArray(Blocks))
+ return EC;
+ }
+
+ auto ExpectedSN = getNamedStreamIndex("/names");
+ if (!ExpectedSN)
+ return ExpectedSN.takeError();
+
+ auto NS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, *ExpectedSN, Allocator);
+ BinaryStreamWriter NSWriter(*NS);
+ if (auto EC = Strings.commit(NSWriter))
+ return EC;
+
+ if (Info) {
+ if (auto EC = Info->commit(Layout, Buffer))
+ return EC;
+ }
+
+ if (Dbi) {
+ if (auto EC = Dbi->commit(Layout, Buffer))
+ return EC;
+ }
+
+ if (Tpi) {
+ if (auto EC = Tpi->commit(Layout, Buffer))
+ return EC;
+ }
+
+ if (Ipi) {
+ if (auto EC = Ipi->commit(Layout, Buffer))
+ return EC;
+ }
+
+ if (Publics) {
+ auto PS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, Publics->getStreamIndex(), Allocator);
+ BinaryStreamWriter PSWriter(*PS);
+ if (auto EC = Publics->commit(PSWriter))
+ return EC;
+ }
+
+ return Buffer.commit();
+}
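
For orientation, a minimal usage sketch of the builder above. This is not part of the change; it only strings together the entry points visible in this file (initialize, the get*Builder accessors, insert, and commit), assumes a 4096-byte MSF block size and the hypothetical output name "example.pdb", and collapses error handling into ExitOnError.

    #include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
    #include "llvm/Support/Allocator.h"
    #include "llvm/Support/Error.h"

    using namespace llvm;
    using namespace llvm::pdb;

    static ExitOnError ExitOnErr;

    int main() {
      BumpPtrAllocator Alloc;
      PDBFileBuilder Builder(Alloc);
      ExitOnErr(Builder.initialize(4096)); // MSF block size
      // Only the builders we touch get streams; commit() skips the rest.
      Builder.getInfoBuilder();
      Builder.getDbiBuilder();
      Builder.getStringTableBuilder().insert("example");
      // finalizeMsfLayout() runs inside commit(), then each stream is written.
      ExitOnErr(Builder.commit("example.pdb"));
      return 0;
    }
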
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
new file mode 100644
index 0000000..acd45f7
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTable.cpp
@@ -0,0 +1,139 @@
+//===- PDBStringTable.cpp - PDB String Table ---------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::support;
+using namespace llvm::pdb;
+
+uint32_t PDBStringTable::getByteSize() const { return Header->ByteSize; }
+uint32_t PDBStringTable::getNameCount() const { return NameCount; }
+uint32_t PDBStringTable::getHashVersion() const { return Header->HashVersion; }
+uint32_t PDBStringTable::getSignature() const { return Header->Signature; }
+
+Error PDBStringTable::readHeader(BinaryStreamReader &Reader) {
+ if (auto EC = Reader.readObject(Header))
+ return EC;
+
+ if (Header->Signature != PDBStringTableSignature)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid hash table signature");
+ if (Header->HashVersion != 1 && Header->HashVersion != 2)
+ return make_error<RawError>(raw_error_code::corrupt_file,
+ "Unsupported hash version");
+
+ assert(Reader.bytesRemaining() == 0);
+ return Error::success();
+}
+
+Error PDBStringTable::readStrings(BinaryStreamReader &Reader) {
+ BinaryStreamRef Stream;
+ if (auto EC = Reader.readStreamRef(Stream))
+ return EC;
+
+ if (auto EC = Strings.initialize(Stream)) {
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Invalid hash table byte length"));
+ }
+
+ assert(Reader.bytesRemaining() == 0);
+ return Error::success();
+}
+
+const codeview::DebugStringTableSubsectionRef &
+PDBStringTable::getStringTable() const {
+ return Strings;
+}
+
+Error PDBStringTable::readHashTable(BinaryStreamReader &Reader) {
+ const support::ulittle32_t *HashCount;
+ if (auto EC = Reader.readObject(HashCount))
+ return EC;
+
+ if (auto EC = Reader.readArray(IDs, *HashCount)) {
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read bucket array"));
+ }
+
+ return Error::success();
+}
+
+Error PDBStringTable::readEpilogue(BinaryStreamReader &Reader) {
+ if (auto EC = Reader.readInteger(NameCount))
+ return EC;
+
+ assert(Reader.bytesRemaining() == 0);
+ return Error::success();
+}
+
+Error PDBStringTable::reload(BinaryStreamReader &Reader) {
+
+ BinaryStreamReader SectionReader;
+
+ std::tie(SectionReader, Reader) = Reader.split(sizeof(PDBStringTableHeader));
+ if (auto EC = readHeader(SectionReader))
+ return EC;
+
+ std::tie(SectionReader, Reader) = Reader.split(Header->ByteSize);
+ if (auto EC = readStrings(SectionReader))
+ return EC;
+
+ // We don't know how long the hash table is until we parse it, so let the
+ // function responsible for doing that figure it out.
+ if (auto EC = readHashTable(Reader))
+ return EC;
+
+ std::tie(SectionReader, Reader) = Reader.split(sizeof(uint32_t));
+ if (auto EC = readEpilogue(SectionReader))
+ return EC;
+
+ assert(Reader.bytesRemaining() == 0);
+ return Error::success();
+}
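
The sequence of split() calls above fixes the on-disk layout of the /names stream; the following summary only restates what reload() consumes, in order:

    // /names stream layout, as consumed by PDBStringTable::reload():
    //
    //   [0]                PDBStringTableHeader                 (readHeader)
    //   [sizeof(Header)]   string buffer, Header->ByteSize bytes (readStrings)
    //   [... next]         ulittle32_t bucket count, then that many
    //                      ulittle32_t bucket entries           (readHashTable)
    //   [last 4 bytes]     ulittle32_t NameCount                (readEpilogue)
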
+
+Expected<StringRef> PDBStringTable::getStringForID(uint32_t ID) const {
+ return Strings.getString(ID);
+}
+
+Expected<uint32_t> PDBStringTable::getIDForString(StringRef Str) const {
+ uint32_t Hash =
+ (Header->HashVersion == 1) ? hashStringV1(Str) : hashStringV2(Str);
+ size_t Count = IDs.size();
+ uint32_t Start = Hash % Count;
+ for (size_t I = 0; I < Count; ++I) {
+    // The hash is only a starting point for the search; even if it doesn't
+    // match, we will still find the string if it is present, because we
+    // probe every bucket in the array.
+ uint32_t Index = (Start + I) % Count;
+
+ uint32_t ID = IDs[Index];
+ auto ExpectedStr = getStringForID(ID);
+ if (!ExpectedStr)
+ return ExpectedStr.takeError();
+
+ if (*ExpectedStr == Str)
+ return ID;
+ }
+ return make_error<RawError>(raw_error_code::no_entry);
+}
+
+FixedStreamArray<support::ulittle32_t> PDBStringTable::name_ids() const {
+ return IDs;
+}
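
As a standalone illustration of the lookup strategy in getIDForString: the hash only chooses the starting bucket, and a full wrap-around scan guarantees the string is found if present. This sketch uses std::hash in place of hashStringV1/V2 and plain pairs in place of the on-disk buckets; it is illustrative only.

    #include <cstdint>
    #include <functional>
    #include <optional>
    #include <string>
    #include <utility>
    #include <vector>

    std::optional<uint32_t>
    probeLookup(const std::vector<std::pair<std::string, uint32_t>> &Buckets,
                const std::string &Key) {
      if (Buckets.empty())
        return std::nullopt;
      size_t Count = Buckets.size();
      size_t Start = std::hash<std::string>{}(Key) % Count;
      for (size_t I = 0; I < Count; ++I) {
        // Probe Start, Start+1, ..., wrapping around to cover every bucket.
        const auto &Slot = Buckets[(Start + I) % Count];
        if (Slot.first == Key)
          return Slot.second;
      }
      return std::nullopt; // analogous to raw_error_code::no_entry
    }
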
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
new file mode 100644
index 0000000..90acfad
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PDBStringTableBuilder.cpp
@@ -0,0 +1,138 @@
+//===- PDBStringTableBuilder.cpp - PDB String Table -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+
+using namespace llvm;
+using namespace llvm::msf;
+using namespace llvm::support;
+using namespace llvm::support::endian;
+using namespace llvm::pdb;
+
+uint32_t PDBStringTableBuilder::insert(StringRef S) {
+ return Strings.insert(S);
+}
+
+static uint32_t computeBucketCount(uint32_t NumStrings) {
+ // The /names stream is basically an on-disk open-addressing hash table.
+ // Hash collisions are resolved by linear probing. We cannot make
+ // utilization 100% because it will make the linear probing extremely
+ // slow. But lower utilization wastes disk space. As a reasonable
+ // load factor, we choose 80%. We need +1 because slot 0 is reserved.
+ return (NumStrings + 1) * 1.25;
+}
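
A quick check of the sizing math above, with illustrative numbers:

    #include <cassert>
    #include <cstdint>

    // Mirrors computeBucketCount()/calculateHashTableSize() for one example.
    static uint32_t bucketCount(uint32_t NumStrings) {
      return (NumStrings + 1) * 1.25; // ~80% load factor, slot 0 reserved
    }

    int main() {
      assert(bucketCount(99) == 125);        // (99 + 1) * 1.25
      // Hash table bytes: 4-byte count field + one 4-byte offset per bucket.
      assert(4 + 4 * bucketCount(99) == 504);
      return 0;
    }
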
+
+uint32_t PDBStringTableBuilder::calculateHashTableSize() const {
+ uint32_t Size = sizeof(uint32_t); // Hash table begins with 4-byte size field.
+ Size += sizeof(uint32_t) * computeBucketCount(Strings.size());
+
+ return Size;
+}
+
+uint32_t PDBStringTableBuilder::calculateSerializedSize() const {
+ uint32_t Size = 0;
+ Size += sizeof(PDBStringTableHeader);
+ Size += Strings.calculateSerializedSize();
+ Size += calculateHashTableSize();
+ Size += sizeof(uint32_t); // The /names stream ends with the string count.
+ return Size;
+}
+
+void PDBStringTableBuilder::setStrings(
+ const codeview::DebugStringTableSubsection &Strings) {
+ this->Strings = Strings;
+}
+
+Error PDBStringTableBuilder::writeHeader(BinaryStreamWriter &Writer) const {
+  // Write the /names stream header.
+ PDBStringTableHeader H;
+ H.Signature = PDBStringTableSignature;
+ H.HashVersion = 1;
+ H.ByteSize = Strings.calculateSerializedSize();
+ if (auto EC = Writer.writeObject(H))
+ return EC;
+ assert(Writer.bytesRemaining() == 0);
+ return Error::success();
+}
+
+Error PDBStringTableBuilder::writeStrings(BinaryStreamWriter &Writer) const {
+ if (auto EC = Strings.commit(Writer))
+ return EC;
+
+ assert(Writer.bytesRemaining() == 0);
+ return Error::success();
+}
+
+Error PDBStringTableBuilder::writeHashTable(BinaryStreamWriter &Writer) const {
+ // Write a hash table.
+ uint32_t BucketCount = computeBucketCount(Strings.size());
+ if (auto EC = Writer.writeInteger(BucketCount))
+ return EC;
+ std::vector<ulittle32_t> Buckets(BucketCount);
+
+ for (auto &Pair : Strings) {
+ StringRef S = Pair.getKey();
+ uint32_t Offset = Pair.getValue();
+ uint32_t Hash = hashStringV1(S);
+
+ for (uint32_t I = 0; I != BucketCount; ++I) {
+ uint32_t Slot = (Hash + I) % BucketCount;
+ if (Slot == 0)
+ continue; // Skip reserved slot
+ if (Buckets[Slot] != 0)
+ continue;
+ Buckets[Slot] = Offset;
+ break;
+ }
+ }
+
+ if (auto EC = Writer.writeArray(ArrayRef<ulittle32_t>(Buckets)))
+ return EC;
+
+ assert(Writer.bytesRemaining() == 0);
+ return Error::success();
+}
+
+Error PDBStringTableBuilder::writeEpilogue(BinaryStreamWriter &Writer) const {
+ if (auto EC = Writer.writeInteger<uint32_t>(Strings.size()))
+ return EC;
+ assert(Writer.bytesRemaining() == 0);
+ return Error::success();
+}
+
+Error PDBStringTableBuilder::commit(BinaryStreamWriter &Writer) const {
+ BinaryStreamWriter SectionWriter;
+
+ std::tie(SectionWriter, Writer) = Writer.split(sizeof(PDBStringTableHeader));
+ if (auto EC = writeHeader(SectionWriter))
+ return EC;
+
+ std::tie(SectionWriter, Writer) =
+ Writer.split(Strings.calculateSerializedSize());
+ if (auto EC = writeStrings(SectionWriter))
+ return EC;
+
+ std::tie(SectionWriter, Writer) = Writer.split(calculateHashTableSize());
+ if (auto EC = writeHashTable(SectionWriter))
+ return EC;
+
+ std::tie(SectionWriter, Writer) = Writer.split(sizeof(uint32_t));
+ if (auto EC = writeEpilogue(SectionWriter))
+ return EC;
+
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/PublicsStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
index b31f605..9c3e654 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/PublicsStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PublicsStream.cpp
@@ -22,15 +22,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
#include "GSI.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/PublicsStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include <algorithm>
@@ -41,19 +41,6 @@ using namespace llvm::msf;
using namespace llvm::support;
using namespace llvm::pdb;
-// This is PSGSIHDR struct defined in
-// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.h
-struct PublicsStream::HeaderInfo {
- ulittle32_t SymHash;
- ulittle32_t AddrMap;
- ulittle32_t NumThunks;
- ulittle32_t SizeOfThunk;
- ulittle16_t ISectThunkTable;
- char Padding[2];
- ulittle32_t OffThunkTable;
- ulittle32_t NumSections;
-};
-
PublicsStream::PublicsStream(PDBFile &File,
std::unique_ptr<MappedBlockStream> Stream)
: Pdb(File), Stream(std::move(Stream)) {}
@@ -69,10 +56,11 @@ uint32_t PublicsStream::getAddrMap() const { return Header->AddrMap; }
// we skip over the hash table which we believe contains information about
// public symbols.
Error PublicsStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
// Check stream size.
- if (Reader.bytesRemaining() < sizeof(HeaderInfo) + sizeof(GSIHashHeader))
+ if (Reader.bytesRemaining() <
+ sizeof(PublicsStreamHeader) + sizeof(GSIHashHeader))
return make_error<RawError>(raw_error_code::corrupt_file,
"Publics Stream does not contain a header.");
@@ -105,10 +93,12 @@ Error PublicsStream::reload() {
"Could not read a thunk map."));
// Something called "section map" follows.
- if (auto EC = Reader.readArray(SectionOffsets, Header->NumSections))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Could not read a section map."));
+ if (Reader.bytesRemaining() > 0) {
+ if (auto EC = Reader.readArray(SectionOffsets, Header->NumSections))
+ return joinErrors(std::move(EC),
+ make_error<RawError>(raw_error_code::corrupt_file,
+ "Could not read a section map."));
+ }
if (Reader.bytesRemaining() > 0)
return make_error<RawError>(raw_error_code::corrupt_file,
@@ -128,4 +118,13 @@ PublicsStream::getSymbols(bool *HadError) const {
return SS.getSymbols(HadError);
}
+Expected<const codeview::CVSymbolArray &>
+PublicsStream::getSymbolArray() const {
+ auto SymbolS = Pdb.getPDBSymbolStream();
+ if (!SymbolS)
+ return SymbolS.takeError();
+
+ return SymbolS->getSymbolArray();
+}
+
Error PublicsStream::commit() { return Error::success(); }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/PublicsStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/PublicsStreamBuilder.cpp
new file mode 100644
index 0000000..28c4a8f
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/PublicsStreamBuilder.cpp
@@ -0,0 +1,89 @@
+//===- PublicsStreamBuilder.cpp - PDB Publics Stream Creation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/PublicsStreamBuilder.h"
+
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+
+#include "GSI.h"
+
+using namespace llvm;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+PublicsStreamBuilder::PublicsStreamBuilder(msf::MSFBuilder &Msf) : Msf(Msf) {}
+
+PublicsStreamBuilder::~PublicsStreamBuilder() {}
+
+uint32_t PublicsStreamBuilder::calculateSerializedLength() const {
+ uint32_t Size = 0;
+ Size += sizeof(PublicsStreamHeader);
+ Size += sizeof(GSIHashHeader);
+ Size += HashRecords.size() * sizeof(PSHashRecord);
+ size_t BitmapSizeInBits = alignTo(IPHR_HASH + 1, 32);
+ uint32_t NumBitmapEntries = BitmapSizeInBits / 8;
+ Size += NumBitmapEntries;
+
+  // FIXME: Account for hash buckets. For now, since we write a zero-bitmap
+  // indicating that no hash buckets are valid, we also write zero bytes of
+  // hash bucket data.
+ Size += 0;
+ return Size;
+}
+
+Error PublicsStreamBuilder::finalizeMsfLayout() {
+ Expected<uint32_t> Idx = Msf.addStream(calculateSerializedLength());
+ if (!Idx)
+ return Idx.takeError();
+ StreamIdx = *Idx;
+
+ Expected<uint32_t> RecordIdx = Msf.addStream(0);
+ if (!RecordIdx)
+ return RecordIdx.takeError();
+ RecordStreamIdx = *RecordIdx;
+ return Error::success();
+}
+
+Error PublicsStreamBuilder::commit(BinaryStreamWriter &PublicsWriter) {
+ PublicsStreamHeader PSH;
+ GSIHashHeader GSH;
+
+ // FIXME: Figure out what to put for these values.
+ PSH.AddrMap = 0;
+ PSH.ISectThunkTable = 0;
+ PSH.NumSections = 0;
+ PSH.NumThunks = 0;
+ PSH.OffThunkTable = 0;
+ PSH.SizeOfThunk = 0;
+ PSH.SymHash = 0;
+
+ GSH.VerSignature = GSIHashHeader::HdrSignature;
+ GSH.VerHdr = GSIHashHeader::HdrVersion;
+ GSH.HrSize = 0;
+ GSH.NumBuckets = 0;
+
+ if (auto EC = PublicsWriter.writeObject(PSH))
+ return EC;
+ if (auto EC = PublicsWriter.writeObject(GSH))
+ return EC;
+ if (auto EC = PublicsWriter.writeArray(makeArrayRef(HashRecords)))
+ return EC;
+
+ size_t BitmapSizeInBits = alignTo(IPHR_HASH + 1, 32);
+ uint32_t NumBitmapEntries = BitmapSizeInBits / 8;
+ std::vector<uint8_t> BitmapData(NumBitmapEntries);
+ // FIXME: Build an actual bitmap
+ if (auto EC = PublicsWriter.writeBytes(makeArrayRef(BitmapData)))
+ return EC;
+
+ // FIXME: Write actual hash buckets.
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/RawError.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/RawError.cpp
index f4a5057..548289f 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/RawError.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/RawError.cpp
@@ -1,4 +1,4 @@
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
@@ -38,6 +38,8 @@ public:
return "The entry does not exist.";
case raw_error_code::not_writable:
return "The PDB does not support writing.";
+ case raw_error_code::stream_too_long:
+ return "The stream was longer than expected.";
case raw_error_code::invalid_tpi_hash:
return "The Type record has an invalid hash value.";
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/SymbolStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp
index 2f3ac34..9e9ebd1 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/SymbolStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/SymbolStream.cpp
@@ -7,16 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/DebugInfo/PDB/Raw/SymbolStream.h"
+#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
using namespace llvm;
@@ -30,7 +29,7 @@ SymbolStream::SymbolStream(std::unique_ptr<MappedBlockStream> Stream)
SymbolStream::~SymbolStream() {}
Error SymbolStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
if (auto EC = Reader.readArray(SymbolRecords, Stream->getLength()))
return EC;
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp
new file mode 100644
index 0000000..77a2d57
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiHashing.cpp
@@ -0,0 +1,89 @@
+//===- TpiHashing.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
+
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/PDB/Native/Hash.h"
+#include "llvm/Support/JamCRC.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+// Corresponds to `fUDTAnon`.
+static bool isAnonymous(StringRef Name) {
+ return Name == "<unnamed-tag>" || Name == "__unnamed" ||
+ Name.endswith("::<unnamed-tag>") || Name.endswith("::__unnamed");
+}
+
+// Computes the hash for a user-defined type record. This could be a struct,
+// class, union, or enum.
+static uint32_t getHashForUdt(const TagRecord &Rec,
+ ArrayRef<uint8_t> FullRecord) {
+ ClassOptions Opts = Rec.getOptions();
+ bool ForwardRef = bool(Opts & ClassOptions::ForwardReference);
+ bool Scoped = bool(Opts & ClassOptions::Scoped);
+ bool HasUniqueName = bool(Opts & ClassOptions::HasUniqueName);
+ bool IsAnon = HasUniqueName && isAnonymous(Rec.getName());
+
+ if (!ForwardRef && !Scoped && !IsAnon)
+ return hashStringV1(Rec.getName());
+ if (!ForwardRef && HasUniqueName && !IsAnon)
+ return hashStringV1(Rec.getUniqueName());
+ return hashBufferV8(FullRecord);
+}
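
Read as a decision table (derived only from the branches above):

    //   !ForwardRef && !Scoped && !IsAnon        -> hashStringV1(Name)
    //   !ForwardRef && HasUniqueName && !IsAnon  -> hashStringV1(UniqueName)
    //   everything else (forward refs, anonymous
    //   UDTs, scoped UDTs lacking unique names)  -> hashBufferV8(FullRecord)
    //
    // For example, a forward reference to `struct Widget;` always takes the
    // full-record hash, while its complete unscoped definition hashes "Widget".
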
+
+template <typename T>
+static Expected<uint32_t> getHashForUdt(const CVType &Rec) {
+ T Deserialized;
+ if (auto E = TypeDeserializer::deserializeAs(const_cast<CVType &>(Rec),
+ Deserialized))
+ return std::move(E);
+ return getHashForUdt(Deserialized, Rec.data());
+}
+
+template <typename T>
+static Expected<uint32_t> getSourceLineHash(const CVType &Rec) {
+ T Deserialized;
+ if (auto E = TypeDeserializer::deserializeAs(const_cast<CVType &>(Rec),
+ Deserialized))
+ return std::move(E);
+ char Buf[4];
+ support::endian::write32le(Buf, Deserialized.getUDT().getIndex());
+ return hashStringV1(StringRef(Buf, 4));
+}
+
+Expected<uint32_t> llvm::pdb::hashTypeRecord(const CVType &Rec) {
+ switch (Rec.kind()) {
+ case LF_CLASS:
+ case LF_STRUCTURE:
+ case LF_INTERFACE:
+ return getHashForUdt<ClassRecord>(Rec);
+ case LF_UNION:
+ return getHashForUdt<UnionRecord>(Rec);
+ case LF_ENUM:
+ return getHashForUdt<EnumRecord>(Rec);
+
+ case LF_UDT_SRC_LINE:
+ return getSourceLineHash<UdtSourceLineRecord>(Rec);
+ case LF_UDT_MOD_SRC_LINE:
+ return getSourceLineHash<UdtModSourceLineRecord>(Rec);
+
+ default:
+ break;
+ }
+
+ // Run CRC32 over the bytes. This corresponds to `hashBufv8`.
+ JamCRC JC(/*Init=*/0U);
+ ArrayRef<char> Bytes(reinterpret_cast<const char *>(Rec.data().data()),
+ Rec.data().size());
+ JC.update(Bytes);
+ return JC.getCRC();
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp
index a1167cd..d3ef87d 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiStream.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -7,19 +7,18 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+
#include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
-#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiHashing.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
+#include "llvm/Support/BinaryStreamReader.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
#include <algorithm>
@@ -32,28 +31,13 @@ using namespace llvm::support;
using namespace llvm::msf;
using namespace llvm::pdb;
-TpiStream::TpiStream(const PDBFile &File,
- std::unique_ptr<MappedBlockStream> Stream)
+TpiStream::TpiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream)
: Pdb(File), Stream(std::move(Stream)) {}
TpiStream::~TpiStream() = default;
-// Verifies that a given type record matches with a given hash value.
-// Currently we only verify SRC_LINE records.
-Error TpiStream::verifyHashValues() {
- TpiHashVerifier Verifier(HashValues, Header->NumHashBuckets);
- TypeDeserializer Deserializer;
-
- TypeVisitorCallbackPipeline Pipeline;
- Pipeline.addCallbackToPipeline(Deserializer);
- Pipeline.addCallbackToPipeline(Verifier);
-
- CVTypeVisitor Visitor(Pipeline);
- return Visitor.visitTypeStream(TypeRecords);
-}
-
Error TpiStream::reload() {
- StreamReader Reader(*Stream);
+ BinaryStreamReader Reader(*Stream);
if (Reader.bytesRemaining() < sizeof(TpiStreamHeader))
return make_error<RawError>(raw_error_code::corrupt_file,
@@ -81,7 +65,13 @@ Error TpiStream::reload() {
"TPI Stream Invalid number of hash buckets.");
// The actual type records themselves come from this stream
- if (auto EC = Reader.readArray(TypeRecords, Header->TypeRecordBytes))
+ if (auto EC =
+ Reader.readSubstream(TypeRecordsSubstream, Header->TypeRecordBytes))
+ return EC;
+
+ BinaryStreamReader RecordReader(TypeRecordsSubstream.StreamData);
+ if (auto EC =
+ RecordReader.readArray(TypeRecords, TypeRecordsSubstream.size()))
return EC;
// Hash indices, hash values, etc come from the hash stream.
@@ -91,21 +81,20 @@ Error TpiStream::reload() {
"Invalid TPI hash stream index.");
auto HS = MappedBlockStream::createIndexedStream(
- Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex);
- StreamReader HSR(*HS);
+ Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex,
+ Pdb.getAllocator());
+ BinaryStreamReader HSR(*HS);
+ // There should be a hash value for every type record, or no hashes at all.
uint32_t NumHashValues =
Header->HashValueBuffer.Length / sizeof(ulittle32_t);
- if (NumHashValues != NumTypeRecords())
+ if (NumHashValues != getNumTypeRecords() && NumHashValues != 0)
return make_error<RawError>(
raw_error_code::corrupt_file,
"TPI hash count does not match with the number of type records.");
HSR.setOffset(Header->HashValueBuffer.Off);
if (auto EC = HSR.readArray(HashValues, NumHashValues))
return EC;
- std::vector<ulittle32_t> HashValueList;
- for (auto I : HashValues)
- HashValueList.push_back(I);
HSR.setOffset(Header->IndexOffsetBuffer.Off);
uint32_t NumTypeIndexOffsets =
@@ -113,20 +102,17 @@ Error TpiStream::reload() {
if (auto EC = HSR.readArray(TypeIndexOffsets, NumTypeIndexOffsets))
return EC;
- HSR.setOffset(Header->HashAdjBuffer.Off);
- uint32_t NumHashAdjustments =
- Header->HashAdjBuffer.Length / sizeof(TypeIndexOffset);
- if (auto EC = HSR.readArray(HashAdjustments, NumHashAdjustments))
- return EC;
+ if (Header->HashAdjBuffer.Length > 0) {
+ HSR.setOffset(Header->HashAdjBuffer.Off);
+ if (auto EC = HashAdjusters.load(HSR))
+ return EC;
+ }
HashStream = std::move(HS);
-
- // TPI hash table is a parallel array for the type records.
- // Verify that the hash values match with type records.
- if (auto EC = verifyHashValues())
- return EC;
}
+ Types = llvm::make_unique<LazyRandomTypeCollection>(
+ TypeRecords, getNumTypeRecords(), getTypeIndexOffsets());
return Error::success();
}
@@ -139,7 +125,7 @@ uint32_t TpiStream::TypeIndexBegin() const { return Header->TypeIndexBegin; }
uint32_t TpiStream::TypeIndexEnd() const { return Header->TypeIndexEnd; }
-uint32_t TpiStream::NumTypeRecords() const {
+uint32_t TpiStream::getNumTypeRecords() const {
return TypeIndexEnd() - TypeIndexBegin();
}
@@ -151,26 +137,24 @@ uint16_t TpiStream::getTypeHashStreamAuxIndex() const {
return Header->HashAuxStreamIndex;
}
-uint32_t TpiStream::NumHashBuckets() const { return Header->NumHashBuckets; }
+uint32_t TpiStream::getNumHashBuckets() const { return Header->NumHashBuckets; }
uint32_t TpiStream::getHashKeySize() const { return Header->HashKeySize; }
-FixedStreamArray<support::ulittle32_t>
-TpiStream::getHashValues() const {
+BinarySubstreamRef TpiStream::getTypeRecordsSubstream() const {
+ return TypeRecordsSubstream;
+}
+
+FixedStreamArray<support::ulittle32_t> TpiStream::getHashValues() const {
return HashValues;
}
-FixedStreamArray<TypeIndexOffset>
-TpiStream::getTypeIndexOffsets() const {
+FixedStreamArray<TypeIndexOffset> TpiStream::getTypeIndexOffsets() const {
return TypeIndexOffsets;
}
-FixedStreamArray<TypeIndexOffset>
-TpiStream::getHashAdjustments() const {
- return HashAdjustments;
-}
+HashTable &TpiStream::getHashAdjusters() { return HashAdjusters; }
-iterator_range<CVTypeArray::Iterator>
-TpiStream::types(bool *HadError) const {
+CVTypeRange TpiStream::types(bool *HadError) const {
return make_range(TypeRecords.begin(HadError), TypeRecords.end());
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
new file mode 100644
index 0000000..9e943c7
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -0,0 +1,177 @@
+//===- TpiStreamBuilder.cpp - TPI Stream Creation ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+
+using namespace llvm;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+using namespace llvm::support;
+
+TpiStreamBuilder::TpiStreamBuilder(MSFBuilder &Msf, uint32_t StreamIdx)
+ : Msf(Msf), Allocator(Msf.getAllocator()), Header(nullptr), Idx(StreamIdx) {
+}
+
+TpiStreamBuilder::~TpiStreamBuilder() = default;
+
+void TpiStreamBuilder::setVersionHeader(PdbRaw_TpiVer Version) {
+ VerHeader = Version;
+}
+
+void TpiStreamBuilder::addTypeRecord(ArrayRef<uint8_t> Record,
+ Optional<uint32_t> Hash) {
+ // If we just crossed an 8KB threshold, add a type index offset.
+ size_t NewSize = TypeRecordBytes + Record.size();
+ constexpr size_t EightKB = 8 * 1024;
+ if (NewSize / EightKB > TypeRecordBytes / EightKB || TypeRecords.empty()) {
+ TypeIndexOffsets.push_back(
+ {codeview::TypeIndex(codeview::TypeIndex::FirstNonSimpleIndex +
+ TypeRecords.size()),
+ ulittle32_t(TypeRecordBytes)});
+ }
+ TypeRecordBytes = NewSize;
+
+ TypeRecords.push_back(Record);
+ if (Hash)
+ TypeHashes.push_back(*Hash);
+}
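
To see the threshold arithmetic in action, suppose three 3000-byte records are appended to an empty builder (numbers illustrative):

    //   Rec 0: TypeRecords.empty()            -> entry {0x1000, offset 0}
    //   Rec 1: 6000/8192 == 0 == 3000/8192    -> no entry
    //   Rec 2: 9000/8192 == 1 >  6000/8192    -> entry {0x1002, offset 6000}
    //
    // Each entry pairs the type index of the record that crosses an 8KB
    // boundary (0x1000 is TypeIndex::FirstNonSimpleIndex) with the byte
    // offset at which that record starts, enabling fast random access later.
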
+
+Error TpiStreamBuilder::finalize() {
+ if (Header)
+ return Error::success();
+
+ TpiStreamHeader *H = Allocator.Allocate<TpiStreamHeader>();
+
+ uint32_t Count = TypeRecords.size();
+
+ H->Version = VerHeader;
+ H->HeaderSize = sizeof(TpiStreamHeader);
+ H->TypeIndexBegin = codeview::TypeIndex::FirstNonSimpleIndex;
+ H->TypeIndexEnd = H->TypeIndexBegin + Count;
+ H->TypeRecordBytes = TypeRecordBytes;
+
+ H->HashStreamIndex = HashStreamIndex;
+ H->HashAuxStreamIndex = kInvalidStreamIndex;
+ H->HashKeySize = sizeof(ulittle32_t);
+ H->NumHashBuckets = MinTpiHashBuckets;
+
+ // Recall that hash values go into a completely different stream identified by
+ // the `HashStreamIndex` field of the `TpiStreamHeader`. Therefore, the data
+ // begins at offset 0 of this independent stream.
+ H->HashValueBuffer.Off = 0;
+ H->HashValueBuffer.Length = calculateHashBufferSize();
+
+ // We never write any adjustments into our PDBs, so this is usually some
+ // offset with zero length.
+ H->HashAdjBuffer.Off = H->HashValueBuffer.Off + H->HashValueBuffer.Length;
+ H->HashAdjBuffer.Length = 0;
+
+ H->IndexOffsetBuffer.Off = H->HashAdjBuffer.Off + H->HashAdjBuffer.Length;
+ H->IndexOffsetBuffer.Length = calculateIndexOffsetSize();
+
+ Header = H;
+ return Error::success();
+}
+
+uint32_t TpiStreamBuilder::calculateSerializedLength() {
+ return sizeof(TpiStreamHeader) + TypeRecordBytes;
+}
+
+uint32_t TpiStreamBuilder::calculateHashBufferSize() const {
+ assert((TypeRecords.size() == TypeHashes.size() || TypeHashes.empty()) &&
+ "either all or no type records should have hashes");
+ return TypeHashes.size() * sizeof(ulittle32_t);
+}
+
+uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const {
+ return TypeIndexOffsets.size() * sizeof(codeview::TypeIndexOffset);
+}
+
+Error TpiStreamBuilder::finalizeMsfLayout() {
+ uint32_t Length = calculateSerializedLength();
+ if (auto EC = Msf.setStreamSize(Idx, Length))
+ return EC;
+
+ uint32_t HashStreamSize =
+ calculateHashBufferSize() + calculateIndexOffsetSize();
+
+ if (HashStreamSize == 0)
+ return Error::success();
+
+ auto ExpectedIndex = Msf.addStream(HashStreamSize);
+ if (!ExpectedIndex)
+ return ExpectedIndex.takeError();
+ HashStreamIndex = *ExpectedIndex;
+ if (!TypeHashes.empty()) {
+ ulittle32_t *H = Allocator.Allocate<ulittle32_t>(TypeHashes.size());
+ MutableArrayRef<ulittle32_t> HashBuffer(H, TypeHashes.size());
+ for (uint32_t I = 0; I < TypeHashes.size(); ++I) {
+ HashBuffer[I] = TypeHashes[I] % MinTpiHashBuckets;
+ }
+ ArrayRef<uint8_t> Bytes(
+ reinterpret_cast<const uint8_t *>(HashBuffer.data()),
+ calculateHashBufferSize());
+ HashValueStream =
+ llvm::make_unique<BinaryByteStream>(Bytes, llvm::support::little);
+ }
+ return Error::success();
+}
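
Note that what gets serialized for the hash stream is not the raw hash but the bucket index: each value is reduced modulo MinTpiHashBuckets before being written. A small worked example, assuming MinTpiHashBuckets is 0x1000:

    //   raw hash 0x8001ABCD  ->  0x8001ABCD % 0x1000  ==  0xBCD
    //
    // Readers can then use the stored value as a bucket directly, at the
    // cost of discarding the high bits of the original hash.
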
+
+Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
+ WritableBinaryStreamRef Buffer) {
+ if (auto EC = finalize())
+ return EC;
+
+ auto InfoS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
+ Idx, Allocator);
+
+ BinaryStreamWriter Writer(*InfoS);
+ if (auto EC = Writer.writeObject(*Header))
+ return EC;
+
+ for (auto Rec : TypeRecords)
+ if (auto EC = Writer.writeBytes(Rec))
+ return EC;
+
+ if (HashStreamIndex != kInvalidStreamIndex) {
+ auto HVS = WritableMappedBlockStream::createIndexedStream(
+ Layout, Buffer, HashStreamIndex, Allocator);
+ BinaryStreamWriter HW(*HVS);
+ if (HashValueStream) {
+ if (auto EC = HW.writeStreamRef(*HashValueStream))
+ return EC;
+ }
+
+ for (auto &IndexOffset : TypeIndexOffsets) {
+ if (auto EC = HW.writeObject(IndexOffset))
+ return EC;
+ }
+ }
+
+ return Error::success();
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp
index 0d72059..501d4f5 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDB.cpp
@@ -1,4 +1,4 @@
-//===- PDB.cpp - base header file for creating a PDB reader -----*- C++ -*-===//
+//===- PDB.cpp - base header file for creating a PDB reader ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,18 +8,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/PDB.h"
-
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
#include "llvm/DebugInfo/PDB/GenericError.h"
-#include "llvm/DebugInfo/PDB/IPDBSession.h"
-#include "llvm/DebugInfo/PDB/PDB.h"
#if LLVM_ENABLE_DIA_SDK
#include "llvm/DebugInfo/PDB/DIA/DIASession.h"
#endif
-#include "llvm/DebugInfo/PDB/Raw/RawSession.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/Support/Error.h"
using namespace llvm;
using namespace llvm::pdb;
@@ -27,25 +23,25 @@ using namespace llvm::pdb;
Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path,
std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
- if (Type == PDB_ReaderType::Raw)
- return RawSession::createFromPdb(Path, Session);
+ if (Type == PDB_ReaderType::Native)
+ return NativeSession::createFromPdb(Path, Session);
#if LLVM_ENABLE_DIA_SDK
return DIASession::createFromPdb(Path, Session);
#else
- return llvm::make_error<GenericError>("DIA is not installed on the system");
+ return make_error<GenericError>("DIA is not installed on the system");
#endif
}
Error llvm::pdb::loadDataForEXE(PDB_ReaderType Type, StringRef Path,
std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
- if (Type == PDB_ReaderType::Raw)
- return RawSession::createFromExe(Path, Session);
+ if (Type == PDB_ReaderType::Native)
+ return NativeSession::createFromExe(Path, Session);
#if LLVM_ENABLE_DIA_SDK
return DIASession::createFromExe(Path, Session);
#else
- return llvm::make_error<GenericError>("DIA is not installed on the system");
+ return make_error<GenericError>("DIA is not installed on the system");
#endif
}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
index 94b81ec..df0feac 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBContext.cpp
@@ -12,8 +12,8 @@
#include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
#include "llvm/DebugInfo/PDB/PDBSymbol.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
#include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
#include "llvm/Object/COFF.h"
@@ -29,8 +29,7 @@ PDBContext::PDBContext(const COFFObjectFile &Object,
Session->setLoadAddress(ImageBase.get());
}
-void PDBContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH,
- bool SummarizeTypes) {}
+void PDBContext::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {}
DILineInfo PDBContext::getLineInfoForAddress(uint64_t Address,
DILineInfoSpecifier Specifier) {
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
index b7eee6e..c291185 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -1,4 +1,4 @@
-//===- PDBExtras.cpp - helper functions and classes for PDBs -----*- C++-*-===//
+//===- PDBExtras.cpp - helper functions and classes for PDBs --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,8 +8,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/PDBExtras.h"
-
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::pdb;
@@ -269,25 +270,6 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
return OS;
}
-raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Id) {
- static const char *Lookup = "0123456789ABCDEF";
-
- static_assert(sizeof(PDB_UniqueId) == 16, "Expected 16-byte GUID");
- ArrayRef<uint8_t> GuidBytes(reinterpret_cast<const uint8_t*>(&Id), 16);
- OS << "{";
- for (int i=0; i < 16;) {
- uint8_t Byte = GuidBytes[i];
- uint8_t HighNibble = (Byte >> 4) & 0xF;
- uint8_t LowNibble = Byte & 0xF;
- OS << Lookup[HighNibble] << Lookup[LowNibble];
- ++i;
- if (i>=4 && i<=10 && i%2==0)
- OS << "-";
- }
- OS << "}";
- return OS;
-}
-
raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
const PDB_Machine &Machine) {
switch (Machine) {
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp
index 633e11a..74010c2 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbol.cpp
@@ -10,6 +10,7 @@
#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDBExtras.h"
#include "llvm/DebugInfo/PDB/PDBSymbolAnnotation.h"
#include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
@@ -53,6 +54,9 @@ PDBSymbol::PDBSymbol(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
: Session(PDBSession), RawSymbol(std::move(Symbol)) {}
+PDBSymbol::PDBSymbol(PDBSymbol &Symbol)
+ : Session(Symbol.Session), RawSymbol(std::move(Symbol.RawSymbol)) {}
+
PDBSymbol::~PDBSymbol() = default;
#define FACTORY_SYMTAG_CASE(Tag, Type) \
@@ -99,16 +103,30 @@ PDBSymbol::create(const IPDBSession &PDBSession,
}
}
-#define TRY_DUMP_TYPE(Type) \
- if (const Type *DerivedThis = dyn_cast<Type>(this)) \
- Dumper.dump(OS, Indent, *DerivedThis);
-
-#define ELSE_TRY_DUMP_TYPE(Type, Dumper) else TRY_DUMP_TYPE(Type, Dumper)
-
void PDBSymbol::defaultDump(raw_ostream &OS, int Indent) const {
RawSymbol->dump(OS, Indent);
}
+void PDBSymbol::dumpProperties() const {
+ outs() << "\n";
+ defaultDump(outs(), 0);
+ outs().flush();
+}
+
+void PDBSymbol::dumpChildStats() const {
+ TagStats Stats;
+ getChildStats(Stats);
+ outs() << "\n";
+ for (auto &Stat : Stats) {
+ outs() << Stat.first << ": " << Stat.second << "\n";
+ }
+ outs().flush();
+}
+
+std::unique_ptr<PDBSymbol> PDBSymbol::clone() const {
+ return Session.getSymbolById(getSymIndexId());
+}
+
PDB_SymType PDBSymbol::getSymTag() const { return RawSymbol->getSymTag(); }
uint32_t PDBSymbol::getSymIndexId() const { return RawSymbol->getSymIndexId(); }
@@ -141,6 +159,8 @@ PDBSymbol::findInlineFramesByRVA(uint32_t RVA) const {
std::unique_ptr<IPDBEnumSymbols>
PDBSymbol::getChildStats(TagStats &Stats) const {
std::unique_ptr<IPDBEnumSymbols> Result(findAllChildren());
+ if (!Result)
+ return nullptr;
Stats.clear();
while (auto Child = Result->getNext()) {
++Stats[Child->getSymTag()];
@@ -148,3 +168,7 @@ PDBSymbol::getChildStats(TagStats &Stats) const {
Result->reset();
return Result;
}
+
+std::unique_ptr<PDBSymbol> PDBSymbol::getSymbolByIdHelper(uint32_t Id) const {
+ return Session.getSymbolById(Id);
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
index cdb167b..3648272 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolAnnotation::PDBSymbolAnnotation(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Annotation);
+}
void PDBSymbolAnnotation::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
index fd5dc94..7076b4a 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,6 +19,8 @@ using namespace llvm::pdb;
PDBSymbolBlock::PDBSymbolBlock(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Block);
+}
void PDBSymbolBlock::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
index ebff088..854cf42 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolCompiland::PDBSymbolCompiland(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Compiland);
+}
void PDBSymbolCompiland::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
index 6dbd522..f73cd36 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolCompilandDetails::PDBSymbolCompilandDetails(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::CompilandDetails);
+}
void PDBSymbolCompilandDetails::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
index 9c7f0b1..df696fa 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
@@ -10,8 +10,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h"
#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
PDBSymbolCompilandEnv::PDBSymbolCompilandEnv(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::CompilandEnv);
+}
std::string PDBSymbolCompilandEnv::getValue() const {
Variant Value = RawSymbol->getValue();
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
index 0ea387a..a7b69a7 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
@@ -10,8 +10,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolCustom.h"
#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
PDBSymbolCustom::PDBSymbolCustom(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> CustomSymbol)
- : PDBSymbol(PDBSession, std::move(CustomSymbol)) {}
+ : PDBSymbol(PDBSession, std::move(CustomSymbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Custom);
+}
void PDBSymbolCustom::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) {
RawSymbol->getDataBytes(bytes);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
index 62bb6f3..6002668 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolData.cpp
@@ -19,10 +19,8 @@ using namespace llvm::pdb;
PDBSymbolData::PDBSymbolData(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> DataSymbol)
- : PDBSymbol(PDBSession, std::move(DataSymbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolData::getType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(DataSymbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Data);
}
void PDBSymbolData::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp
index 60101c1..7417167 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolExe.cpp
@@ -10,6 +10,7 @@
#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
#include <utility>
@@ -18,6 +19,18 @@ using namespace llvm::pdb;
PDBSymbolExe::PDBSymbolExe(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Exe);
+}
void PDBSymbolExe::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
+
+uint32_t PDBSymbolExe::getPointerByteSize() const {
+ auto Pointer = findOneChild<PDBSymbolTypePointer>();
+ if (Pointer)
+ return Pointer->getLength();
+
+ if (getMachineType() == PDB_Machine::x86)
+ return 4;
+ return 8;
+}
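The hunk above gives PDBSymbolExe a pointer-width query with a machine-type fallback. A minimal usage sketch, assuming a session already opened through some concrete IPDBSession implementation (the helper name pointerWidthOf is hypothetical):

    #include <cstdint>
    #include "llvm/DebugInfo/PDB/IPDBSession.h"
    #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"

    // Hypothetical helper: report the target pointer width of a loaded PDB.
    uint32_t pointerWidthOf(const llvm::pdb::IPDBSession &Session) {
      // getGlobalScope() yields the exe-scope symbol; getPointerByteSize()
      // prefers a pointer-type child and otherwise falls back to 4 bytes on
      // x86 and 8 elsewhere, exactly as in the hunk above.
      if (auto Exe = Session.getGlobalScope())
        return Exe->getPointerByteSize();
      return 0; // no global scope available
    }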
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
index 35251c0..5a5cb4c 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
@@ -12,10 +12,10 @@
#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
-#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
#include "llvm/DebugInfo/PDB/PDBTypes.h"
#include <unordered_set>
@@ -85,10 +85,8 @@ private:
PDBSymbolFunc::PDBSymbolFunc(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbolTypeFunctionSig> PDBSymbolFunc::getSignature() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeFunctionSig>(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Function);
}
std::unique_ptr<IPDBEnumChildren<PDBSymbolData>>
@@ -96,8 +94,15 @@ PDBSymbolFunc::getArguments() const {
return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
}
-std::unique_ptr<PDBSymbolTypeUDT> PDBSymbolFunc::getClassParent() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeUDT>(getClassParentId());
-}
-
void PDBSymbolFunc::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
+
+bool PDBSymbolFunc::isDestructor() const {
+ std::string Name = getName();
+ if (Name.empty())
+ return false;
+ if (Name[0] == '~')
+ return true;
+ if (Name == "__vecDelDtor")
+ return true;
+ return false;
+}
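The new isDestructor() accepts two spellings. A standalone mirror of the heuristic (looksLikeDestructor is an illustrative name; "__vecDelDtor" is MSVC's compiler-generated vector deleting destructor, whose name carries no '~'):

    #include <string>

    static bool looksLikeDestructor(const std::string &Name) {
      if (Name.empty())
        return false;
      // "~Foo" is an ordinary destructor; __vecDelDtor has no leading '~'.
      return Name[0] == '~' || Name == "__vecDelDtor";
    }
    // looksLikeDestructor("~Foo") == true, looksLikeDestructor("Foo") == false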
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
index 77e996f..4a4195b 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolFuncDebugEnd::PDBSymbolFuncDebugEnd(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FuncDebugEnd);
+}
void PDBSymbolFuncDebugEnd::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
index 9c65387..a448a40 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolFuncDebugStart::PDBSymbolFuncDebugStart(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FuncDebugStart);
+}
void PDBSymbolFuncDebugStart::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
index d2cfd11..a67a20d 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
PDBSymbolLabel::PDBSymbolLabel(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Label);
+}
void PDBSymbolLabel::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
index 97d6687..dbec16f 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolPublicSymbol::PDBSymbolPublicSymbol(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::PublicSymbol);
+}
void PDBSymbolPublicSymbol::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
index ef8897d..b264819 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
PDBSymbolThunk::PDBSymbolThunk(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Thunk);
+}
void PDBSymbolThunk::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
index c010cc5..a8054a4 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
@@ -19,12 +19,14 @@ using namespace llvm::pdb;
PDBSymbolTypeArray::PDBSymbolTypeArray(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypeArray::getElementType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::ArrayType);
}
void PDBSymbolTypeArray::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
+
+void PDBSymbolTypeArray::dumpRight(PDBSymDumper &Dumper) const {
+ Dumper.dumpRight(*this);
+}
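PDBSymbolTypeArray (and, further down, pointers and function signatures) gains a dumpRight() hook because C declarators wrap the declared name: part of the type prints to its left and part to its right, as in "int A[10]". A sketch of that split, independent of the LLVM classes:

    #include <iostream>
    #include <string>

    struct ArrayTypeSketch {   // illustrative stand-in, not an LLVM type
      std::string Element = "int";
      unsigned Count = 10;
      void dump() const { std::cout << Element << ' '; }           // left half
      void dumpRight() const { std::cout << '[' << Count << ']'; } // right half
    };

    int main() {
      ArrayTypeSketch T;
      T.dump();
      std::cout << 'A'; // the declared name sits between the two halves
      T.dumpRight();    // output so far: int A[10]
      std::cout << '\n';
    }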
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
index 382c397..0fdf8b6 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeBaseClass::PDBSymbolTypeBaseClass(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::BaseClass);
+}
void PDBSymbolTypeBaseClass::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
index e5d65bf..0bf563a 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeBuiltin::PDBSymbolTypeBuiltin(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::BuiltinType);
+}
void PDBSymbolTypeBuiltin::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
index 1d80c97..726e7e1 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeCustom.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeCustom::PDBSymbolTypeCustom(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::CustomType);
+}
void PDBSymbolTypeCustom::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
index 535d97d..6c84b98 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
@@ -10,8 +10,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeDimension.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
PDBSymbolTypeDimension::PDBSymbolTypeDimension(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Dimension);
+}
void PDBSymbolTypeDimension::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
index 788f2b7..2addea0 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
@@ -21,15 +21,8 @@ using namespace llvm::pdb;
PDBSymbolTypeEnum::PDBSymbolTypeEnum(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbolTypeUDT> PDBSymbolTypeEnum::getClassParent() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeUDT>(getClassParentId());
-}
-
-std::unique_ptr<PDBSymbolTypeBuiltin>
-PDBSymbolTypeEnum::getUnderlyingType() const {
- return Session.getConcreteSymbolById<PDBSymbolTypeBuiltin>(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Enum);
}
void PDBSymbolTypeEnum::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
index 5831bae..c018772 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeFriend::PDBSymbolTypeFriend(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Friend);
+}
void PDBSymbolTypeFriend::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
index c6f586d..4d5cd63 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeFunctionArg::PDBSymbolTypeFunctionArg(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FunctionArg);
+}
void PDBSymbolTypeFunctionArg::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
index 057ae26..0304c62 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
@@ -12,9 +12,9 @@
#include "llvm/DebugInfo/PDB/ConcreteSymbolEnumerator.h"
#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h"
-#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
#include <utility>
@@ -68,10 +68,8 @@ private:
PDBSymbolTypeFunctionSig::PDBSymbolTypeFunctionSig(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getReturnType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::FunctionSig);
}
std::unique_ptr<IPDBEnumSymbols>
@@ -79,13 +77,10 @@ PDBSymbolTypeFunctionSig::getArguments() const {
return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
}
-std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getClassParent() const {
- uint32_t ClassId = getClassParentId();
- if (ClassId == 0)
- return nullptr;
- return Session.getSymbolById(ClassId);
-}
-
void PDBSymbolTypeFunctionSig::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
+
+void PDBSymbolTypeFunctionSig::dumpRight(PDBSymDumper &Dumper) const {
+ Dumper.dumpRight(*this);
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
index 072d2cf..7cfba82 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeManaged.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeManaged::PDBSymbolTypeManaged(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::ManagedType);
+}
void PDBSymbolTypeManaged::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
index 6997714..6981981 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
@@ -19,12 +19,14 @@ using namespace llvm::pdb;
PDBSymbolTypePointer::PDBSymbolTypePointer(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypePointer::getPointeeType() const {
- return Session.getSymbolById(getTypeId());
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::PointerType);
}
void PDBSymbolTypePointer::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
}
+
+void PDBSymbolTypePointer::dumpRight(PDBSymDumper &Dumper) const {
+ Dumper.dumpRight(*this);
+}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
index 0f283b9..102b540 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeTypedef::PDBSymbolTypeTypedef(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::Typedef);
+}
void PDBSymbolTypeTypedef::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
index c71838c..15dc153 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
@@ -9,7 +9,15 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
+#include "llvm/DebugInfo/PDB/UDTLayout.h"
#include <utility>
@@ -18,6 +26,8 @@ using namespace llvm::pdb;
PDBSymbolTypeUDT::PDBSymbolTypeUDT(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::UDT);
+}
void PDBSymbolTypeUDT::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
index 6b76db5..9a21855 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
PDBSymbolTypeVTable::PDBSymbolTypeVTable(const IPDBSession &PDBSession,
std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::VTable);
+}
void PDBSymbolTypeVTable::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
index ef509d6..ddc0574 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolTypeVTableShape::PDBSymbolTypeVTableShape(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::VTableShape);
+}
void PDBSymbolTypeVTableShape::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
index dbbea9c..fdbe845 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUnknown.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolUnknown.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
diff --git a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
index 6a62d55..f40578f 100644
--- a/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
+++ b/contrib/llvm/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
@@ -9,8 +9,8 @@
#include "llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h"
-#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include "llvm/DebugInfo/PDB/PDBSymDumper.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
#include <utility>
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
PDBSymbolUsingNamespace::PDBSymbolUsingNamespace(
const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
- : PDBSymbol(PDBSession, std::move(Symbol)) {}
+ : PDBSymbol(PDBSession, std::move(Symbol)) {
+ assert(RawSymbol->getSymTag() == PDB_SymType::UsingNamespace);
+}
void PDBSymbolUsingNamespace::dump(PDBSymDumper &Dumper) const {
Dumper.dump(*this);
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStream.cpp
deleted file mode 100644
index f19535d..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStream.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===- InfoStream.cpp - PDB Info Stream (Stream 1) Access -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawConstants.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-InfoStream::InfoStream(std::unique_ptr<MappedBlockStream> Stream)
- : Stream(std::move(Stream)) {}
-
-Error InfoStream::reload() {
- StreamReader Reader(*Stream);
-
- const InfoStreamHeader *H;
- if (auto EC = Reader.readObject(H))
- return joinErrors(
- std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "PDB Stream does not contain a header."));
-
- switch (H->Version) {
- case PdbImplVC70:
- case PdbImplVC80:
- case PdbImplVC110:
- case PdbImplVC140:
- break;
- default:
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Unsupported PDB stream version.");
- }
-
- Version = H->Version;
- Signature = H->Signature;
- Age = H->Age;
- Guid = H->Guid;
-
- return NamedStreams.load(Reader);
-}
-
-uint32_t InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
- uint32_t Result;
- if (!NamedStreams.tryGetValue(Name, Result))
- return 0;
- return Result;
-}
-
-iterator_range<StringMapConstIterator<uint32_t>>
-InfoStream::named_streams() const {
- return NamedStreams.entries();
-}
-
-PdbRaw_ImplVer InfoStream::getVersion() const {
- return static_cast<PdbRaw_ImplVer>(Version);
-}
-
-uint32_t InfoStream::getSignature() const { return Signature; }
-
-uint32_t InfoStream::getAge() const { return Age; }
-
-PDB_UniqueId InfoStream::getGuid() const { return Guid; }
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp
deleted file mode 100644
index 73fbf85..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/InfoStreamBuilder.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//===- InfoStreamBuilder.cpp - PDB Info Stream Creation ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/Raw/InfoStreamBuilder.h"
-
-#include "llvm/DebugInfo/MSF/MSFBuilder.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-InfoStreamBuilder::InfoStreamBuilder(msf::MSFBuilder &Msf)
- : Msf(Msf), Ver(PdbRaw_ImplVer::PdbImplVC70), Sig(-1), Age(0) {}
-
-void InfoStreamBuilder::setVersion(PdbRaw_ImplVer V) { Ver = V; }
-
-void InfoStreamBuilder::setSignature(uint32_t S) { Sig = S; }
-
-void InfoStreamBuilder::setAge(uint32_t A) { Age = A; }
-
-void InfoStreamBuilder::setGuid(PDB_UniqueId G) { Guid = G; }
-
-NameMapBuilder &InfoStreamBuilder::getNamedStreamsBuilder() {
- return NamedStreams;
-}
-
-uint32_t InfoStreamBuilder::calculateSerializedLength() const {
- return sizeof(InfoStreamHeader) + NamedStreams.calculateSerializedLength();
-}
-
-Error InfoStreamBuilder::finalizeMsfLayout() {
- uint32_t Length = calculateSerializedLength();
- if (auto EC = Msf.setStreamSize(StreamPDB, Length))
- return EC;
- return Error::success();
-}
-
-Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
- const msf::WritableStream &Buffer) const {
- auto InfoS =
- WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamPDB);
- StreamWriter Writer(*InfoS);
-
- InfoStreamHeader H;
- H.Age = Age;
- H.Signature = Sig;
- H.Version = Ver;
- H.Guid = Guid;
- if (auto EC = Writer.writeObject(H))
- return EC;
-
- return NamedStreams.commit(Writer);
-}
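The deleted reader (InfoStream::reload, above) and this writer round-trip the same four header fields. A hypothetical mirror of the InfoStreamHeader layout inferred from those two functions; the field order is an assumption, not confirmed by this diff:

    #include "llvm/DebugInfo/PDB/PDBTypes.h"
    #include "llvm/Support/Endian.h"

    // Hypothetical mirror of InfoStreamHeader; order inferred from usage.
    struct InfoStreamHeaderSketch {
      llvm::support::ulittle32_t Version;   // PdbImplVC70/VC80/VC110/VC140
      llvm::support::ulittle32_t Signature;
      llvm::support::ulittle32_t Age;
      llvm::pdb::PDB_UniqueId Guid;         // 16-byte GUID
    };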
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/ModInfo.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/ModInfo.cpp
deleted file mode 100644
index b34d770..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/ModInfo.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===- ModInfo.cpp - PDB module information -------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/MathExtras.h"
-#include <cstdint>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-using namespace llvm::support;
-
-ModInfo::ModInfo() = default;
-
-ModInfo::ModInfo(const ModInfo &Info) = default;
-
-ModInfo::~ModInfo() = default;
-
-Error ModInfo::initialize(ReadableStreamRef Stream, ModInfo &Info) {
- StreamReader Reader(Stream);
- if (auto EC = Reader.readObject(Info.Layout))
- return EC;
-
- if (auto EC = Reader.readZeroString(Info.ModuleName))
- return EC;
-
- if (auto EC = Reader.readZeroString(Info.ObjFileName))
- return EC;
- return Error::success();
-}
-
-bool ModInfo::hasECInfo() const {
- return (Layout->Flags & ModInfoFlags::HasECFlagMask) != 0;
-}
-
-uint16_t ModInfo::getTypeServerIndex() const {
- return (Layout->Flags & ModInfoFlags::TypeServerIndexMask) >>
- ModInfoFlags::TypeServerIndexShift;
-}
-
-uint16_t ModInfo::getModuleStreamIndex() const { return Layout->ModDiStream; }
-
-uint32_t ModInfo::getSymbolDebugInfoByteSize() const {
- return Layout->SymBytes;
-}
-
-uint32_t ModInfo::getLineInfoByteSize() const { return Layout->LineBytes; }
-
-uint32_t ModInfo::getC13LineInfoByteSize() const { return Layout->C13Bytes; }
-
-uint32_t ModInfo::getNumberOfFiles() const { return Layout->NumFiles; }
-
-uint32_t ModInfo::getSourceFileNameIndex() const {
- return Layout->SrcFileNameNI;
-}
-
-uint32_t ModInfo::getPdbFilePathNameIndex() const {
- return Layout->PdbFilePathNI;
-}
-
-StringRef ModInfo::getModuleName() const { return ModuleName; }
-
-StringRef ModInfo::getObjFileName() const { return ObjFileName; }
-
-uint32_t ModInfo::getRecordLength() const {
- uint32_t M = ModuleName.str().size() + 1;
- uint32_t O = ObjFileName.str().size() + 1;
- uint32_t Size = sizeof(ModuleInfoHeader) + M + O;
- Size = alignTo(Size, 4);
- return Size;
-}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/ModStream.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/ModStream.cpp
deleted file mode 100644
index 0ffc5b7..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/ModStream.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-//===- ModStream.cpp - PDB Module Info Stream Access ----------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-#include "llvm/DebugInfo/PDB/Raw/ModInfo.h"
-#include "llvm/DebugInfo/PDB/Raw/ModStream.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <cstdint>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-ModStream::ModStream(const ModInfo &Module,
- std::unique_ptr<MappedBlockStream> Stream)
- : Mod(Module), Stream(std::move(Stream)) {}
-
-ModStream::~ModStream() = default;
-
-Error ModStream::reload() {
- StreamReader Reader(*Stream);
-
- uint32_t SymbolSize = Mod.getSymbolDebugInfoByteSize();
- uint32_t C11Size = Mod.getLineInfoByteSize();
- uint32_t C13Size = Mod.getC13LineInfoByteSize();
-
- if (C11Size > 0 && C13Size > 0)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Module has both C11 and C13 line info");
-
- ReadableStreamRef S;
-
- if (auto EC = Reader.readInteger(Signature))
- return EC;
- if (auto EC = Reader.readArray(SymbolsSubstream, SymbolSize - 4))
- return EC;
-
- if (auto EC = Reader.readStreamRef(LinesSubstream, C11Size))
- return EC;
- if (auto EC = Reader.readStreamRef(C13LinesSubstream, C13Size))
- return EC;
-
- StreamReader LineReader(C13LinesSubstream);
- if (auto EC = LineReader.readArray(LineInfo, LineReader.bytesRemaining()))
- return EC;
-
- uint32_t GlobalRefsSize;
- if (auto EC = Reader.readInteger(GlobalRefsSize))
- return EC;
- if (auto EC = Reader.readStreamRef(GlobalRefsSubstream, GlobalRefsSize))
- return EC;
- if (Reader.bytesRemaining() > 0)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Unexpected bytes in module stream.");
-
- return Error::success();
-}
-
-iterator_range<codeview::CVSymbolArray::Iterator>
-ModStream::symbols(bool *HadError) const {
- // It's OK if the stream is empty.
- if (SymbolsSubstream.getUnderlyingStream().getLength() == 0)
- return make_range(SymbolsSubstream.end(), SymbolsSubstream.end());
- return make_range(SymbolsSubstream.begin(HadError), SymbolsSubstream.end());
-}
-
-iterator_range<codeview::ModuleSubstreamArray::Iterator>
-ModStream::lines(bool *HadError) const {
- return make_range(LineInfo.begin(HadError), LineInfo.end());
-}
-
-Error ModStream::commit() { return Error::success(); }
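The reload() above fixes the on-disk layout of a module debug stream; condensed, with sizes taken from the owning ModInfo record (and at most one of the C11/C13 substreams may be non-empty):

    // uint32_t Signature;
    // byte     Symbols[SymbolDebugInfoByteSize - 4]; // CVSymbolArray
    // byte     C11LineInfo[LineInfoByteSize];
    // byte     C13LineInfo[C13LineInfoByteSize];     // ModuleSubstreamArray
    // uint32_t GlobalRefsSize;
    // byte     GlobalRefs[GlobalRefsSize];           // then end of stream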
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/NameHashTable.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/NameHashTable.cpp
deleted file mode 100644
index 84cccb35..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/NameHashTable.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-//===- NameHashTable.cpp - PDB Name Hash Table ------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/Raw/NameHashTable.h"
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/Hash.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/Support/Endian.h"
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::support;
-using namespace llvm::pdb;
-
-NameHashTable::NameHashTable() : Signature(0), HashVersion(0), NameCount(0) {}
-
-Error NameHashTable::load(StreamReader &Stream) {
- struct Header {
- support::ulittle32_t Signature;
- support::ulittle32_t HashVersion;
- support::ulittle32_t ByteSize;
- };
-
- const Header *H;
- if (auto EC = Stream.readObject(H))
- return EC;
-
- if (H->Signature != 0xEFFEEFFE)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Invalid hash table signature");
- if (H->HashVersion != 1 && H->HashVersion != 2)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Unsupported hash version");
-
- Signature = H->Signature;
- HashVersion = H->HashVersion;
- if (auto EC = Stream.readStreamRef(NamesBuffer, H->ByteSize))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Invalid hash table byte length"));
-
- const support::ulittle32_t *HashCount;
- if (auto EC = Stream.readObject(HashCount))
- return EC;
-
- if (auto EC = Stream.readArray(IDs, *HashCount))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Could not read bucket array"));
-
- if (Stream.bytesRemaining() < sizeof(support::ulittle32_t))
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Missing name count");
-
- if (auto EC = Stream.readInteger(NameCount))
- return EC;
- return Error::success();
-}
-
-StringRef NameHashTable::getStringForID(uint32_t ID) const {
- if (ID == IDs[0])
- return StringRef();
-
- // NamesBuffer is a buffer of null terminated strings back to back. ID is
- // the starting offset of the string we're looking for. So just seek into
- // the desired offset and read a null-terminated string from that offset.
- StringRef Result;
- StreamReader NameReader(NamesBuffer);
- NameReader.setOffset(ID);
- if (auto EC = NameReader.readZeroString(Result))
- consumeError(std::move(EC));
- return Result;
-}
-
-uint32_t NameHashTable::getIDForString(StringRef Str) const {
- uint32_t Hash = (HashVersion == 1) ? hashStringV1(Str) : hashStringV2(Str);
- size_t Count = IDs.size();
- uint32_t Start = Hash % Count;
- for (size_t I = 0; I < Count; ++I) {
- // The hash is just a starting point for the search, but if it
- // doesn't work we should find the string no matter what, because
- // we iterate the entire array.
- uint32_t Index = (Start + I) % Count;
-
- uint32_t ID = IDs[Index];
- StringRef S = getStringForID(ID);
- if (S == Str)
- return ID;
- }
- // IDs[0] contains the ID of the "invalid" entry.
- return IDs[0];
-}
-
-FixedStreamArray<support::ulittle32_t> NameHashTable::name_ids() const {
- return IDs;
-}
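getIDForString() above is a closed-table lookup: the hash only chooses a starting bucket, and the full linear scan guarantees a present string is found regardless. A self-contained sketch of that probing scheme (toyHash and findID are illustrative stand-ins, not the real V1/V2 hashes):

    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Toy stand-in for hashStringV1/hashStringV2 (the real ones live in Raw/Hash.h).
    static uint32_t toyHash(const std::string &S) {
      uint32_t H = 0;
      for (char C : S)
        H = H * 131 + static_cast<unsigned char>(C);
      return H;
    }

    // Each bucket pairs an ID with its string; bucket 0 is the "invalid" entry.
    static uint32_t
    findID(const std::vector<std::pair<uint32_t, std::string>> &Buckets,
           const std::string &Str) {
      size_t Count = Buckets.size();
      size_t Start = toyHash(Str) % Count;
      for (size_t I = 0; I < Count; ++I) { // probe every bucket exactly once
        const auto &Bucket = Buckets[(Start + I) % Count];
        if (Bucket.second == Str)
          return Bucket.first;
      }
      return Buckets[0].first;             // not found: return the invalid ID
    }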
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/NameMap.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/NameMap.cpp
deleted file mode 100644
index 0f55f58..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/NameMap.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-//===- NameMap.cpp - PDB Name Map -------------------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/PDB/Raw/NameMap.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <cstdint>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-NameMap::NameMap() = default;
-
-Error NameMap::load(StreamReader &Stream) {
- // This is some sort of weird string-set/hash table encoded in the stream.
- // It starts with the number of bytes in the table.
- uint32_t NumberOfBytes;
- if (auto EC = Stream.readInteger(NumberOfBytes))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map length"));
- if (Stream.bytesRemaining() < NumberOfBytes)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Invalid name map length");
-
- // Following that field is the starting offset of strings in the name table.
- uint32_t StringsOffset = Stream.getOffset();
- Stream.setOffset(StringsOffset + NumberOfBytes);
-
- // This appears to be equivalent to the total number of strings *actually*
- // in the name table.
- uint32_t HashSize;
- if (auto EC = Stream.readInteger(HashSize))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map hash size"));
-
- // This appears to be an upper bound on the number of strings in the name
- // table.
- uint32_t MaxNumberOfStrings;
- if (auto EC = Stream.readInteger(MaxNumberOfStrings))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map max strings"));
-
- if (MaxNumberOfStrings > (UINT32_MAX / sizeof(uint32_t)))
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Implausible number of strings");
-
- const uint32_t MaxNumberOfWords = UINT32_MAX / (sizeof(uint32_t) * 8);
-
- // This appears to be a hash table which uses bitfields to determine whether
- // or not a bucket is 'present'.
- uint32_t NumPresentWords;
- if (auto EC = Stream.readInteger(NumPresentWords))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map num words"));
-
- if (NumPresentWords > MaxNumberOfWords)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Number of present words is too large");
-
- SparseBitVector<> Present;
- for (uint32_t I = 0; I != NumPresentWords; ++I) {
- uint32_t Word;
- if (auto EC = Stream.readInteger(Word))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map word"));
- for (unsigned Idx = 0; Idx < 32; ++Idx)
- if (Word & (1U << Idx))
- Present.set((I * 32) + Idx);
- }
-
- // This appears to be a hash table which uses bitfields to determine whether
- // or not a bucket is 'deleted'.
- uint32_t NumDeletedWords;
- if (auto EC = Stream.readInteger(NumDeletedWords))
- return joinErrors(
- std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map num deleted words"));
-
- if (NumDeletedWords > MaxNumberOfWords)
- return make_error<RawError>(raw_error_code::corrupt_file,
- "Number of deleted words is too large");
-
- SparseBitVector<> Deleted;
- for (uint32_t I = 0; I != NumDeletedWords; ++I) {
- uint32_t Word;
- if (auto EC = Stream.readInteger(Word))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map word"));
- for (unsigned Idx = 0; Idx < 32; ++Idx)
- if (Word & (1U << Idx))
- Deleted.set((I * 32) + Idx);
- }
-
- for (unsigned I : Present) {
- // For all present entries, dump out their mapping.
- (void)I;
-
- // This appears to be an offset relative to the start of the strings.
- // It tells us where the null-terminated string begins.
- uint32_t NameOffset;
- if (auto EC = Stream.readInteger(NameOffset))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map name offset"));
-
- // This appears to be a stream number into the stream directory.
- uint32_t NameIndex;
- if (auto EC = Stream.readInteger(NameIndex))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map name index"));
-
- // Compute the offset of the start of the string relative to the stream.
- uint32_t StringOffset = StringsOffset + NameOffset;
- uint32_t OldOffset = Stream.getOffset();
- // Pump out our c-string from the stream.
- StringRef Str;
- Stream.setOffset(StringOffset);
- if (auto EC = Stream.readZeroString(Str))
- return joinErrors(std::move(EC),
- make_error<RawError>(raw_error_code::corrupt_file,
- "Expected name map name"));
-
- Stream.setOffset(OldOffset);
- // Add this to a string-map from name to stream number.
- Mapping.insert({Str, NameIndex});
- }
-
- return Error::success();
-}
-
-iterator_range<StringMapConstIterator<uint32_t>> NameMap::entries() const {
- return make_range<StringMapConstIterator<uint32_t>>(Mapping.begin(),
- Mapping.end());
-}
-
-bool NameMap::tryGetValue(StringRef Name, uint32_t &Value) const {
- auto Iter = Mapping.find(Name);
- if (Iter == Mapping.end())
- return false;
- Value = Iter->second;
- return true;
-}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp
deleted file mode 100644
index f570d59..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/NameMapBuilder.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-//===- NameMapBuilder.cpp - PDB Name Map Builder ----------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/NameMap.h"
-#include "llvm/DebugInfo/PDB/Raw/NameMapBuilder.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <cstdint>
-
-using namespace llvm;
-using namespace llvm::pdb;
-
-NameMapBuilder::NameMapBuilder() = default;
-
-void NameMapBuilder::addMapping(StringRef Name, uint32_t Mapping) {
- StringDataBytes += Name.size() + 1;
- Map.insert({Name, Mapping});
-}
-
-Expected<std::unique_ptr<NameMap>> NameMapBuilder::build() {
- auto Result = llvm::make_unique<NameMap>();
- Result->Mapping = Map;
- return std::move(Result);
-}
-
-uint32_t NameMapBuilder::calculateSerializedLength() const {
- uint32_t TotalLength = 0;
-
- TotalLength += sizeof(support::ulittle32_t); // StringDataBytes value
- TotalLength += StringDataBytes; // actual string data
-
- TotalLength += sizeof(support::ulittle32_t); // Hash Size
- TotalLength += sizeof(support::ulittle32_t); // Max Number of Strings
- TotalLength += sizeof(support::ulittle32_t); // Num Present Words
- // One bitmask word for each present entry
- TotalLength += Map.size() * sizeof(support::ulittle32_t);
- TotalLength += sizeof(support::ulittle32_t); // Num Deleted Words
-
- // For each present word, which we are treating as equivalent to the number of
- // entries in the table, we have a pair of integers: an offset into the
- // string data and a corresponding stream number.
- TotalLength += Map.size() * 2 * sizeof(support::ulittle32_t);
-
- return TotalLength;
-}
-
-Error NameMapBuilder::commit(msf::StreamWriter &Writer) const {
- // The first field is the number of bytes of string data. So add
- // up the length of all strings plus a null terminator for each
- // one.
- uint32_t NumBytes = 0;
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- NumBytes += B->getKeyLength() + 1;
- }
-
- if (auto EC = Writer.writeInteger(NumBytes)) // Number of bytes of string data
- return EC;
- // Now all of the string data itself.
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- if (auto EC = Writer.writeZeroString(B->getKey()))
- return EC;
- }
-
- if (auto EC = Writer.writeInteger(Map.size())) // Hash Size
- return EC;
-
- if (auto EC = Writer.writeInteger(Map.size())) // Max Number of Strings
- return EC;
-
- if (auto EC = Writer.writeInteger(Map.size())) // Num Present Words
- return EC;
-
- // For each entry in the mapping, write a bit mask which represents a bucket
- // to store it in. We don't use this, so the value we write isn't important
- // to us; it just has to be there.
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- if (auto EC = Writer.writeInteger(1U))
- return EC;
- }
-
- if (auto EC = Writer.writeInteger(0U)) // Num Deleted Words
- return EC;
-
- // Mappings of each word.
- uint32_t OffsetSoFar = 0;
- for (auto B = Map.begin(), E = Map.end(); B != E; ++B) {
- // This is a list of key value pairs where the key is the offset into the
- // strings buffer, and the value is a stream number. Write each pair.
- if (auto EC = Writer.writeInteger(OffsetSoFar))
- return EC;
-
- if (auto EC = Writer.writeInteger(B->second))
- return EC;
-
- OffsetSoFar += B->getKeyLength() + 1;
- }
-
- return Error::success();
-}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp
deleted file mode 100644
index 6fec0e3..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/PDBFileBuilder.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-//===- PDBFileBuilder.cpp - PDB File Creation -------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/Raw/PDBFileBuilder.h"
-
-#include "llvm/ADT/BitVector.h"
-
-#include "llvm/DebugInfo/MSF/MSFBuilder.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/GenericError.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/DbiStreamBuilder.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStream.h"
-#include "llvm/DebugInfo/PDB/Raw/InfoStreamBuilder.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStreamBuilder.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-using namespace llvm::support;
-
-PDBFileBuilder::PDBFileBuilder(BumpPtrAllocator &Allocator)
- : Allocator(Allocator) {}
-
-Error PDBFileBuilder::initialize(uint32_t BlockSize) {
- auto ExpectedMsf = MSFBuilder::create(Allocator, BlockSize);
- if (!ExpectedMsf)
- return ExpectedMsf.takeError();
- Msf = llvm::make_unique<MSFBuilder>(std::move(*ExpectedMsf));
- return Error::success();
-}
-
-MSFBuilder &PDBFileBuilder::getMsfBuilder() { return *Msf; }
-
-InfoStreamBuilder &PDBFileBuilder::getInfoBuilder() {
- if (!Info)
- Info = llvm::make_unique<InfoStreamBuilder>(*Msf);
- return *Info;
-}
-
-DbiStreamBuilder &PDBFileBuilder::getDbiBuilder() {
- if (!Dbi)
- Dbi = llvm::make_unique<DbiStreamBuilder>(*Msf);
- return *Dbi;
-}
-
-TpiStreamBuilder &PDBFileBuilder::getTpiBuilder() {
- if (!Tpi)
- Tpi = llvm::make_unique<TpiStreamBuilder>(*Msf, StreamTPI);
- return *Tpi;
-}
-
-TpiStreamBuilder &PDBFileBuilder::getIpiBuilder() {
- if (!Ipi)
- Ipi = llvm::make_unique<TpiStreamBuilder>(*Msf, StreamIPI);
- return *Ipi;
-}
-
-Expected<msf::MSFLayout> PDBFileBuilder::finalizeMsfLayout() const {
- if (Info) {
- if (auto EC = Info->finalizeMsfLayout())
- return std::move(EC);
- }
- if (Dbi) {
- if (auto EC = Dbi->finalizeMsfLayout())
- return std::move(EC);
- }
- if (Tpi) {
- if (auto EC = Tpi->finalizeMsfLayout())
- return std::move(EC);
- }
- if (Ipi) {
- if (auto EC = Ipi->finalizeMsfLayout())
- return std::move(EC);
- }
-
- return Msf->build();
-}
-
-Error PDBFileBuilder::commit(StringRef Filename) {
- auto ExpectedLayout = finalizeMsfLayout();
- if (!ExpectedLayout)
- return ExpectedLayout.takeError();
- auto &Layout = *ExpectedLayout;
-
- uint64_t Filesize = Layout.SB->BlockSize * Layout.SB->NumBlocks;
- auto OutFileOrError = FileOutputBuffer::create(Filename, Filesize);
- if (OutFileOrError.getError())
- return llvm::make_error<pdb::GenericError>(generic_error_code::invalid_path,
- Filename);
- FileBufferByteStream Buffer(std::move(*OutFileOrError));
- StreamWriter Writer(Buffer);
-
- if (auto EC = Writer.writeObject(*Layout.SB))
- return EC;
- uint32_t BlockMapOffset =
- msf::blockToOffset(Layout.SB->BlockMapAddr, Layout.SB->BlockSize);
- Writer.setOffset(BlockMapOffset);
- if (auto EC = Writer.writeArray(Layout.DirectoryBlocks))
- return EC;
-
- auto DirStream =
- WritableMappedBlockStream::createDirectoryStream(Layout, Buffer);
- StreamWriter DW(*DirStream);
- if (auto EC =
- DW.writeInteger(static_cast<uint32_t>(Layout.StreamSizes.size())))
- return EC;
-
- if (auto EC = DW.writeArray(Layout.StreamSizes))
- return EC;
-
- for (const auto &Blocks : Layout.StreamMap) {
- if (auto EC = DW.writeArray(Blocks))
- return EC;
- }
-
- if (Info) {
- if (auto EC = Info->commit(Layout, Buffer))
- return EC;
- }
-
- if (Dbi) {
- if (auto EC = Dbi->commit(Layout, Buffer))
- return EC;
- }
-
- if (Tpi) {
- if (auto EC = Tpi->commit(Layout, Buffer))
- return EC;
- }
-
- if (Ipi) {
- if (auto EC = Ipi->commit(Layout, Buffer))
- return EC;
- }
-
- return Buffer.commit();
-}
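For reference, the builder deleted here was driven roughly as below: a sketch using only the member functions visible above (the 4096-byte block size and the age are arbitrary choices):

    #include "llvm/DebugInfo/PDB/Raw/InfoStreamBuilder.h"
    #include "llvm/DebugInfo/PDB/Raw/PDBFileBuilder.h"
    #include "llvm/Support/Allocator.h"
    #include "llvm/Support/Error.h"

    using namespace llvm;
    using namespace llvm::pdb;

    Error buildMinimalPdb(StringRef Path) {
      BumpPtrAllocator Alloc;
      PDBFileBuilder Builder(Alloc);
      if (auto EC = Builder.initialize(4096)) // MSF block size
        return EC;
      // Ctor defaults (see InfoStreamBuilder above): VC70, Sig -1, Age 0.
      Builder.getInfoBuilder().setAge(1);
      return Builder.commit(Path); // finalize the MSF layout, write streams
    }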
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/RawSession.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/RawSession.cpp
deleted file mode 100644
index cd3a206..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/RawSession.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===- RawSession.cpp - Raw implementation of IPDBSession -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/PDB/GenericError.h"
-#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
-#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
-#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawSession.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include <algorithm>
-#include <memory>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-
-RawSession::RawSession(std::unique_ptr<PDBFile> PdbFile,
- std::unique_ptr<BumpPtrAllocator> Allocator)
- : Pdb(std::move(PdbFile)), Allocator(std::move(Allocator)) {}
-
-RawSession::~RawSession() = default;
-
-Error RawSession::createFromPdb(StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> ErrorOrBuffer =
- MemoryBuffer::getFileOrSTDIN(Path, /*FileSize=*/-1,
- /*RequiresNullTerminator=*/false);
- if (!ErrorOrBuffer)
- return make_error<GenericError>(generic_error_code::invalid_path);
-
- std::unique_ptr<MemoryBuffer> Buffer = std::move(*ErrorOrBuffer);
- auto Stream = llvm::make_unique<MemoryBufferByteStream>(std::move(Buffer));
-
- auto Allocator = llvm::make_unique<BumpPtrAllocator>();
- auto File = llvm::make_unique<PDBFile>(std::move(Stream), *Allocator);
- if (auto EC = File->parseFileHeaders())
- return EC;
- if (auto EC = File->parseStreamData())
- return EC;
-
- Session =
- llvm::make_unique<RawSession>(std::move(File), std::move(Allocator));
-
- return Error::success();
-}
-
-Error RawSession::createFromExe(StringRef Path,
- std::unique_ptr<IPDBSession> &Session) {
- return make_error<RawError>(raw_error_code::feature_unsupported);
-}
-
-uint64_t RawSession::getLoadAddress() const { return 0; }
-
-void RawSession::setLoadAddress(uint64_t Address) {}
-
-std::unique_ptr<PDBSymbolExe> RawSession::getGlobalScope() const {
- return nullptr;
-}
-
-std::unique_ptr<PDBSymbol> RawSession::getSymbolById(uint32_t SymbolId) const {
- return nullptr;
-}
-
-std::unique_ptr<PDBSymbol>
-RawSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumLineNumbers>
-RawSession::findLineNumbers(const PDBSymbolCompiland &Compiland,
- const IPDBSourceFile &File) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumLineNumbers>
-RawSession::findLineNumbersByAddress(uint64_t Address, uint32_t Length) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumSourceFiles>
-RawSession::findSourceFiles(const PDBSymbolCompiland *Compiland,
- StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBSourceFile>
-RawSession::findOneSourceFile(const PDBSymbolCompiland *Compiland,
- StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
-RawSession::findCompilandsForSourceFile(StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<PDBSymbolCompiland>
-RawSession::findOneCompilandForSourceFile(StringRef Pattern,
- PDB_NameSearchFlags Flags) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumSourceFiles> RawSession::getAllSourceFiles() const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumSourceFiles> RawSession::getSourceFilesForCompiland(
- const PDBSymbolCompiland &Compiland) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBSourceFile>
-RawSession::getSourceFileById(uint32_t FileId) const {
- return nullptr;
-}
-
-std::unique_ptr<IPDBEnumDataStreams> RawSession::getDebugStreams() const {
- return nullptr;
-}
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiHashing.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiHashing.cpp
deleted file mode 100644
index 6c3ddb3..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiHashing.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-//===- TpiHashing.cpp -----------------------------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/PDB/Raw/TpiHashing.h"
-
-#include "llvm/DebugInfo/PDB/Raw/Hash.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-using namespace llvm::pdb;
-
-// Corresponds to `fUDTAnon`.
-template <typename T> static bool isAnonymous(T &Rec) {
- StringRef Name = Rec.getName();
- return Name == "<unnamed-tag>" || Name == "__unnamed" ||
- Name.endswith("::<unnamed-tag>") || Name.endswith("::__unnamed");
-}
-
-// Computes a hash for a given TPI record.
-template <typename T>
-static uint32_t getTpiHash(T &Rec, ArrayRef<uint8_t> FullRecord) {
- auto Opts = static_cast<uint16_t>(Rec.getOptions());
-
- bool ForwardRef =
- Opts & static_cast<uint16_t>(ClassOptions::ForwardReference);
- bool Scoped = Opts & static_cast<uint16_t>(ClassOptions::Scoped);
- bool UniqueName = Opts & static_cast<uint16_t>(ClassOptions::HasUniqueName);
- bool IsAnon = UniqueName && isAnonymous(Rec);
-
- if (!ForwardRef && !Scoped && !IsAnon)
- return hashStringV1(Rec.getName());
- if (!ForwardRef && UniqueName && !IsAnon)
- return hashStringV1(Rec.getUniqueName());
- return hashBufferV8(FullRecord);
-}
-
-template <typename T> static uint32_t getSourceLineHash(T &Rec) {
- char Buf[4];
- support::endian::write32le(Buf, Rec.getUDT().getIndex());
- return hashStringV1(StringRef(Buf, 4));
-}
-
-void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR,
- UdtSourceLineRecord &Rec) {
- CVR.Hash = getSourceLineHash(Rec);
-}
-
-void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR,
- UdtModSourceLineRecord &Rec) {
- CVR.Hash = getSourceLineHash(Rec);
-}
-
-void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, ClassRecord &Rec) {
- CVR.Hash = getTpiHash(Rec, CVR.data());
-}
-
-void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, EnumRecord &Rec) {
- CVR.Hash = getTpiHash(Rec, CVR.data());
-}
-
-void TpiHashUpdater::visitKnownRecordImpl(CVType &CVR, UnionRecord &Rec) {
- CVR.Hash = getTpiHash(Rec, CVR.data());
-}
-
-Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UdtSourceLineRecord &Rec) {
- return verifySourceLine(Rec.getUDT());
-}
-
-Error TpiHashVerifier::visitKnownRecord(CVType &CVR,
- UdtModSourceLineRecord &Rec) {
- return verifySourceLine(Rec.getUDT());
-}
-
-Error TpiHashVerifier::visitKnownRecord(CVType &CVR, ClassRecord &Rec) {
- if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index])
- return errorInvalidHash();
- return Error::success();
-}
-Error TpiHashVerifier::visitKnownRecord(CVType &CVR, EnumRecord &Rec) {
- if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index])
- return errorInvalidHash();
- return Error::success();
-}
-Error TpiHashVerifier::visitKnownRecord(CVType &CVR, UnionRecord &Rec) {
- if (getTpiHash(Rec, CVR.data()) % NumHashBuckets != HashValues[Index])
- return errorInvalidHash();
- return Error::success();
-}
-
-Error TpiHashVerifier::verifySourceLine(codeview::TypeIndex TI) {
- char Buf[4];
- support::endian::write32le(Buf, TI.getIndex());
- uint32_t Hash = hashStringV1(StringRef(Buf, 4));
- if (Hash % NumHashBuckets != HashValues[Index])
- return errorInvalidHash();
- return Error::success();
-}
-
-Error TpiHashVerifier::visitTypeBegin(CVType &Rec) {
- ++Index;
- RawRecord = Rec;
- return Error::success();
-}
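
Note: the verifier logic deleted above reduces to recomputing each record's hash and comparing the bucket it lands in against the stored hash value. A minimal standalone sketch of that check (all names below are stand-ins, not the PDB API):

#include <cstdint>
#include <vector>

// RecordHash stands in for getTpiHash(...); ExpectedBuckets for the
// HashValues array read from the TPI hash stream.
bool verifyRecordHash(uint32_t RecordHash, uint32_t NumHashBuckets,
                      const std::vector<uint32_t> &ExpectedBuckets,
                      size_t Index) {
  // A record verifies if its hash falls into the bucket stored for it.
  return RecordHash % NumHashBuckets == ExpectedBuckets[Index];
}
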
diff --git a/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiStreamBuilder.cpp b/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiStreamBuilder.cpp
deleted file mode 100644
index c769321..0000000
--- a/contrib/llvm/lib/DebugInfo/PDB/Raw/TpiStreamBuilder.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//===- TpiStreamBuilder.cpp - -------------------------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/MSFBuilder.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-#include "llvm/DebugInfo/PDB/Raw/PDBFile.h"
-#include "llvm/DebugInfo/PDB/Raw/RawError.h"
-#include "llvm/DebugInfo/PDB/Raw/RawTypes.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStream.h"
-#include "llvm/DebugInfo/PDB/Raw/TpiStreamBuilder.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <cstdint>
-
-using namespace llvm;
-using namespace llvm::msf;
-using namespace llvm::pdb;
-using namespace llvm::support;
-
-TpiStreamBuilder::TpiStreamBuilder(MSFBuilder &Msf, uint32_t StreamIdx)
- : Msf(Msf), Allocator(Msf.getAllocator()), Header(nullptr), Idx(StreamIdx) {
-}
-
-TpiStreamBuilder::~TpiStreamBuilder() = default;
-
-void TpiStreamBuilder::setVersionHeader(PdbRaw_TpiVer Version) {
- VerHeader = Version;
-}
-
-void TpiStreamBuilder::addTypeRecord(const codeview::CVType &Record) {
- TypeRecords.push_back(Record);
- TypeRecordStream.setItems(TypeRecords);
-}
-
-Error TpiStreamBuilder::finalize() {
- if (Header)
- return Error::success();
-
- TpiStreamHeader *H = Allocator.Allocate<TpiStreamHeader>();
-
- uint32_t Count = TypeRecords.size();
- uint32_t HashBufferSize = calculateHashBufferSize();
-
- H->Version = *VerHeader;
- H->HeaderSize = sizeof(TpiStreamHeader);
- H->TypeIndexBegin = codeview::TypeIndex::FirstNonSimpleIndex;
- H->TypeIndexEnd = H->TypeIndexBegin + Count;
- H->TypeRecordBytes = TypeRecordStream.getLength();
-
- H->HashStreamIndex = HashStreamIndex;
- H->HashAuxStreamIndex = kInvalidStreamIndex;
- H->HashKeySize = sizeof(ulittle32_t);
- H->NumHashBuckets = MinTpiHashBuckets;
-
- // Recall that hash values go into a completely different stream identified by
- // the `HashStreamIndex` field of the `TpiStreamHeader`. Therefore, the data
- // begins at offset 0 of this independent stream.
- H->HashValueBuffer.Off = 0;
- H->HashValueBuffer.Length = HashBufferSize;
- H->HashAdjBuffer.Off = H->HashValueBuffer.Off + H->HashValueBuffer.Length;
- H->HashAdjBuffer.Length = 0;
- H->IndexOffsetBuffer.Off = H->HashAdjBuffer.Off + H->HashAdjBuffer.Length;
- H->IndexOffsetBuffer.Length = 0;
-
- Header = H;
- return Error::success();
-}
-
-uint32_t TpiStreamBuilder::calculateSerializedLength() const {
- return sizeof(TpiStreamHeader) + TypeRecordStream.getLength();
-}
-
-uint32_t TpiStreamBuilder::calculateHashBufferSize() const {
- if (TypeRecords.empty() || !TypeRecords[0].Hash.hasValue())
- return 0;
- return TypeRecords.size() * sizeof(ulittle32_t);
-}
-
-Error TpiStreamBuilder::finalizeMsfLayout() {
- uint32_t Length = calculateSerializedLength();
- if (auto EC = Msf.setStreamSize(Idx, Length))
- return EC;
-
- uint32_t HashBufferSize = calculateHashBufferSize();
-
- if (HashBufferSize == 0)
- return Error::success();
-
- auto ExpectedIndex = Msf.addStream(HashBufferSize);
- if (!ExpectedIndex)
- return ExpectedIndex.takeError();
- HashStreamIndex = *ExpectedIndex;
- ulittle32_t *H = Allocator.Allocate<ulittle32_t>(TypeRecords.size());
- MutableArrayRef<ulittle32_t> HashBuffer(H, TypeRecords.size());
- for (uint32_t I = 0; I < TypeRecords.size(); ++I) {
- HashBuffer[I] = *TypeRecords[I].Hash % MinTpiHashBuckets;
- }
- ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(HashBuffer.data()),
- HashBufferSize);
- HashValueStream = llvm::make_unique<ByteStream>(Bytes);
- return Error::success();
-}
-
-Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
- const msf::WritableStream &Buffer) {
- if (auto EC = finalize())
- return EC;
-
- auto InfoS =
- WritableMappedBlockStream::createIndexedStream(Layout, Buffer, Idx);
-
- StreamWriter Writer(*InfoS);
- if (auto EC = Writer.writeObject(*Header))
- return EC;
-
- auto RecordArray = VarStreamArray<codeview::CVType>(TypeRecordStream);
- if (auto EC = Writer.writeArray(RecordArray))
- return EC;
-
- if (HashStreamIndex != kInvalidStreamIndex) {
- auto HVS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
- HashStreamIndex);
- StreamWriter HW(*HVS);
- if (auto EC = HW.writeStreamRef(*HashValueStream))
- return EC;
- }
-
- return Error::success();
-}
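
Note: in finalize() above, the three hash-stream sub-buffers are chained end to end, starting at offset 0 of the separate hash stream. A small sketch of that layout rule (the struct and names are illustrative only):

#include <cstdint>

struct Slice {
  uint32_t Off;
  uint32_t Length;
};

// Chain the sub-buffers: each one begins where the previous one ends.
// HashAdj and IndexOffsets are empty here, exactly as in finalize().
void layoutHashStream(Slice &HashValues, Slice &HashAdj, Slice &IndexOffsets,
                      uint32_t HashBufferSize) {
  HashValues = {0, HashBufferSize};
  HashAdj = {HashValues.Off + HashValues.Length, 0};
  IndexOffsets = {HashAdj.Off + HashAdj.Length, 0};
}
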
diff --git a/contrib/llvm/lib/DebugInfo/PDB/UDTLayout.cpp b/contrib/llvm/lib/DebugInfo/PDB/UDTLayout.cpp
new file mode 100644
index 0000000..5f4390b
--- /dev/null
+++ b/contrib/llvm/lib/DebugInfo/PDB/UDTLayout.cpp
@@ -0,0 +1,303 @@
+//===- UDTLayout.cpp ------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/UDTLayout.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolData.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+static std::unique_ptr<PDBSymbol> getSymbolType(const PDBSymbol &Symbol) {
+ const IPDBSession &Session = Symbol.getSession();
+ const IPDBRawSymbol &RawSymbol = Symbol.getRawSymbol();
+ uint32_t TypeId = RawSymbol.getTypeId();
+ return Session.getSymbolById(TypeId);
+}
+
+static uint32_t getTypeLength(const PDBSymbol &Symbol) {
+ auto SymbolType = getSymbolType(Symbol);
+ const IPDBRawSymbol &RawType = SymbolType->getRawSymbol();
+
+ return RawType.getLength();
+}
+
+LayoutItemBase::LayoutItemBase(const UDTLayoutBase *Parent,
+ const PDBSymbol *Symbol, const std::string &Name,
+ uint32_t OffsetInParent, uint32_t Size,
+ bool IsElided)
+ : Symbol(Symbol), Parent(Parent), Name(Name),
+ OffsetInParent(OffsetInParent), SizeOf(Size), LayoutSize(Size),
+ IsElided(IsElided) {
+ UsedBytes.resize(SizeOf, true);
+}
+
+uint32_t LayoutItemBase::deepPaddingSize() const {
+ return UsedBytes.size() - UsedBytes.count();
+}
+
+uint32_t LayoutItemBase::tailPadding() const {
+ int Last = UsedBytes.find_last();
+
+ return UsedBytes.size() - (Last + 1);
+}
+
+DataMemberLayoutItem::DataMemberLayoutItem(
+ const UDTLayoutBase &Parent, std::unique_ptr<PDBSymbolData> Member)
+ : LayoutItemBase(&Parent, Member.get(), Member->getName(),
+ Member->getOffset(), getTypeLength(*Member), false),
+ DataMember(std::move(Member)) {
+ auto Type = DataMember->getType();
+ if (auto UDT = unique_dyn_cast<PDBSymbolTypeUDT>(Type)) {
+ UdtLayout = llvm::make_unique<ClassLayout>(std::move(UDT));
+ UsedBytes = UdtLayout->usedBytes();
+ }
+}
+
+VBPtrLayoutItem::VBPtrLayoutItem(const UDTLayoutBase &Parent,
+ std::unique_ptr<PDBSymbolTypeBuiltin> Sym,
+ uint32_t Offset, uint32_t Size)
+ : LayoutItemBase(&Parent, Sym.get(), "<vbptr>", Offset, Size, false),
+ Type(std::move(Sym)) {
+}
+
+const PDBSymbolData &DataMemberLayoutItem::getDataMember() {
+ return *dyn_cast<PDBSymbolData>(Symbol);
+}
+
+bool DataMemberLayoutItem::hasUDTLayout() const { return UdtLayout != nullptr; }
+
+const ClassLayout &DataMemberLayoutItem::getUDTLayout() const {
+ return *UdtLayout;
+}
+
+VTableLayoutItem::VTableLayoutItem(const UDTLayoutBase &Parent,
+ std::unique_ptr<PDBSymbolTypeVTable> VT)
+ : LayoutItemBase(&Parent, VT.get(), "<vtbl>", 0, getTypeLength(*VT), false),
+ VTable(std::move(VT)) {
+ auto VTableType = cast<PDBSymbolTypePointer>(VTable->getType());
+ ElementSize = VTableType->getLength();
+}
+
+UDTLayoutBase::UDTLayoutBase(const UDTLayoutBase *Parent, const PDBSymbol &Sym,
+ const std::string &Name, uint32_t OffsetInParent,
+ uint32_t Size, bool IsElided)
+ : LayoutItemBase(Parent, &Sym, Name, OffsetInParent, Size, IsElided) {
+ // UDT storage comes from a union of all the children's storage, so start out
+ // uninitialized.
+ UsedBytes.reset(0, Size);
+
+ initializeChildren(Sym);
+ if (LayoutSize < Size)
+ UsedBytes.resize(LayoutSize);
+}
+
+uint32_t UDTLayoutBase::tailPadding() const {
+ uint32_t Abs = LayoutItemBase::tailPadding();
+ if (!LayoutItems.empty()) {
+ const LayoutItemBase *Back = LayoutItems.back();
+ uint32_t ChildPadding = Back->LayoutItemBase::tailPadding();
+ if (Abs < ChildPadding)
+ Abs = 0;
+ else
+ Abs -= ChildPadding;
+ }
+ return Abs;
+}
+
+ClassLayout::ClassLayout(const PDBSymbolTypeUDT &UDT)
+ : UDTLayoutBase(nullptr, UDT, UDT.getName(), 0, UDT.getLength(), false),
+ UDT(UDT) {
+ ImmediateUsedBytes.resize(SizeOf, false);
+ for (auto &LI : LayoutItems) {
+ uint32_t Begin = LI->getOffsetInParent();
+ uint32_t End = Begin + LI->getLayoutSize();
+ End = std::min(SizeOf, End);
+ ImmediateUsedBytes.set(Begin, End);
+ }
+}
+
+ClassLayout::ClassLayout(std::unique_ptr<PDBSymbolTypeUDT> UDT)
+ : ClassLayout(*UDT) {
+ OwnedStorage = std::move(UDT);
+}
+
+uint32_t ClassLayout::immediatePadding() const {
+ return SizeOf - ImmediateUsedBytes.count();
+}
+
+BaseClassLayout::BaseClassLayout(const UDTLayoutBase &Parent,
+ uint32_t OffsetInParent, bool Elide,
+ std::unique_ptr<PDBSymbolTypeBaseClass> B)
+ : UDTLayoutBase(&Parent, *B, B->getName(), OffsetInParent, B->getLength(),
+ Elide),
+ Base(std::move(B)) {
+ if (isEmptyBase()) {
+ // Special case an empty base so that it doesn't get treated as padding.
+ UsedBytes.resize(1);
+ UsedBytes.set(0);
+ }
+ IsVirtualBase = Base->isVirtualBaseClass();
+}
+
+void UDTLayoutBase::initializeChildren(const PDBSymbol &Sym) {
+ // Handle bases first, followed by VTables, followed by data members,
+ // followed by functions, followed by other. This ordering is necessary
+ // so that bases and vtables get initialized before any functions which
+ // may override them.
+ UniquePtrVector<PDBSymbolTypeBaseClass> Bases;
+ UniquePtrVector<PDBSymbolTypeVTable> VTables;
+ UniquePtrVector<PDBSymbolData> Members;
+ UniquePtrVector<PDBSymbolTypeBaseClass> VirtualBaseSyms;
+
+ auto Children = Sym.findAllChildren();
+ while (auto Child = Children->getNext()) {
+ if (auto Base = unique_dyn_cast<PDBSymbolTypeBaseClass>(Child)) {
+ if (Base->isVirtualBaseClass())
+ VirtualBaseSyms.push_back(std::move(Base));
+ else
+ Bases.push_back(std::move(Base));
+ }
+ else if (auto Data = unique_dyn_cast<PDBSymbolData>(Child)) {
+ if (Data->getDataKind() == PDB_DataKind::Member)
+ Members.push_back(std::move(Data));
+ else
+ Other.push_back(std::move(Data));
+ } else if (auto VT = unique_dyn_cast<PDBSymbolTypeVTable>(Child))
+ VTables.push_back(std::move(VT));
+ else if (auto Func = unique_dyn_cast<PDBSymbolFunc>(Child))
+ Funcs.push_back(std::move(Func));
+ else {
+ Other.push_back(std::move(Child));
+ }
+ }
+
+ // We don't want any re-allocations in the list of bases, so reserve
+ // enough space up front to keep our ArrayRefs from being invalidated.
+ AllBases.reserve(Bases.size() + VirtualBaseSyms.size());
+
+ // Add the non-virtual bases to the class first. Virtual bases are added
+ // only at the end of the class, after all non-virtual bases and data
+ // members, so that the offsets are already correct by the time we go to
+ // lay out the virtual bases.
+ for (auto &Base : Bases) {
+ uint32_t Offset = Base->getOffset();
+ // Non-virtual bases never get elided.
+ auto BL = llvm::make_unique<BaseClassLayout>(*this, Offset, false,
+ std::move(Base));
+
+ AllBases.push_back(BL.get());
+ addChildToLayout(std::move(BL));
+ }
+ NonVirtualBases = AllBases;
+
+ assert(VTables.size() <= 1);
+ if (!VTables.empty()) {
+ auto VTLayout =
+ llvm::make_unique<VTableLayoutItem>(*this, std::move(VTables[0]));
+
+ VTable = VTLayout.get();
+
+ addChildToLayout(std::move(VTLayout));
+ }
+
+ for (auto &Data : Members) {
+ auto DM = llvm::make_unique<DataMemberLayoutItem>(*this, std::move(Data));
+
+ addChildToLayout(std::move(DM));
+ }
+
+ // Make sure to add virtual bases before adding functions, since functions
+ // may override virtual functions declared in a virtual base, and the VTables
+ // and virtual intros need to be correctly initialized first.
+ for (auto &VB : VirtualBaseSyms) {
+ int VBPO = VB->getVirtualBasePointerOffset();
+ if (!hasVBPtrAtOffset(VBPO)) {
+ if (auto VBP = VB->getRawSymbol().getVirtualBaseTableType()) {
+ auto VBPL = llvm::make_unique<VBPtrLayoutItem>(*this, std::move(VBP),
+ VBPO, VBP->getLength());
+ VBPtr = VBPL.get();
+ addChildToLayout(std::move(VBPL));
+ }
+ }
+
+ // Virtual bases always go at the end, so place this one right after the
+ // last byte that has been written so far.
+ // Note that virtual bases get elided unless this is a top-most derived
+ // class.
+ uint32_t Offset = UsedBytes.find_last() + 1;
+ bool Elide = (Parent != nullptr);
+ auto BL =
+ llvm::make_unique<BaseClassLayout>(*this, Offset, Elide, std::move(VB));
+ AllBases.push_back(BL.get());
+
+ // Keep track of this virtual base regardless, but only lay it out
+ // physically inside of *this* class if this is a top-most derived
+ // class.
+ addChildToLayout(std::move(BL));
+ }
+ VirtualBases = makeArrayRef(AllBases).drop_front(NonVirtualBases.size());
+
+ if (Parent != nullptr)
+ LayoutSize = UsedBytes.find_last() + 1;
+}
+
+bool UDTLayoutBase::hasVBPtrAtOffset(uint32_t Off) const {
+ if (VBPtr && VBPtr->getOffsetInParent() == Off)
+ return true;
+ for (BaseClassLayout *BL : AllBases) {
+ if (BL->hasVBPtrAtOffset(Off - BL->getOffsetInParent()))
+ return true;
+ }
+ return false;
+}
+
+void UDTLayoutBase::addChildToLayout(std::unique_ptr<LayoutItemBase> Child) {
+ uint32_t Begin = Child->getOffsetInParent();
+
+ if (!Child->isElided()) {
+ BitVector ChildBytes = Child->usedBytes();
+
+ // Suppose the child occupies 4 bytes starting at offset 12 in a 32-byte
+ // class. After the call to ChildBytes.resize(32), the Child's storage
+ // still begins at offset 0, so we need to shift it left by the offset
+ // to get it into the right position.
+ ChildBytes.resize(UsedBytes.size());
+ ChildBytes <<= Child->getOffsetInParent();
+ UsedBytes |= ChildBytes;
+
+ if (ChildBytes.count() > 0) {
+ auto Loc = std::upper_bound(LayoutItems.begin(), LayoutItems.end(), Begin,
+ [](uint32_t Off, const LayoutItemBase *Item) {
+ return (Off < Item->getOffsetInParent());
+ });
+
+ LayoutItems.insert(Loc, Child.get());
+ }
+ }
+
+ ChildStorage.push_back(std::move(Child));
+}
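
Note: the resize-then-shift trick in addChildToLayout() is worth seeing in isolation. A sketch using the example from the comment above (a 4-byte child at offset 12 of a 32-byte class; one bit per byte):

#include "llvm/ADT/BitVector.h"

llvm::BitVector markChildBytes(unsigned ClassSize, unsigned ChildOffset,
                               unsigned ChildSize) {
  llvm::BitVector ChildBytes(ChildSize, true); // child uses all of its bytes
  ChildBytes.resize(ClassSize); // grown, but still anchored at offset 0
  ChildBytes <<= ChildOffset;   // slide into place: bits [12, 16) end up set
  return ChildBytes;            // ready to be OR'ed into the parent's map
}
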
diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
index be5c603..c1e2536 100644
--- a/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
+++ b/contrib/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -78,8 +78,18 @@ void DIPrinter::print(const DILineInfo &Info, bool Inlined) {
std::string Filename = Info.FileName;
if (Filename == kDILineInfoBadString)
Filename = kBadString;
- OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n";
- printContext(Filename, Info.Line);
+ if (!Verbose) {
+ OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n";
+ printContext(Filename, Info.Line);
+ return;
+ }
+ OS << " Filename: " << Filename << "\n";
+ if (Info.StartLine)
+ OS << "Function start line: " << Info.StartLine << "\n";
+ OS << " Line: " << Info.Line << "\n";
+ OS << " Column: " << Info.Column << "\n";
+ if (Info.Discriminator)
+ OS << " Discriminator: " << Info.Discriminator << "\n";
}
DIPrinter &DIPrinter::operator<<(const DILineInfo &Info) {
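
Note: with the verbose path added above, a symbolized location prints one labeled field per line instead of the compact file:line:column form. Illustrative output (values invented; spacing follows the format strings above):

 Filename: /src/foo.c
Function start line: 10
 Line: 12
 Column: 7
 Discriminator: 1
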
diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index f694008..2a89faf 100644
--- a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -1,4 +1,4 @@
-//===-- SymbolizableObjectFile.cpp ----------------------------------------===//
+//===- SymbolizableObjectFile.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,15 +12,29 @@
//===----------------------------------------------------------------------===//
#include "SymbolizableObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/SymbolSize.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/DataExtractor.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-
-namespace llvm {
-namespace symbolize {
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
+using namespace llvm;
using namespace object;
+using namespace symbolize;
static DILineInfoSpecifier
getDILineInfoSpecifier(FunctionNameKind FNKind) {
@@ -73,14 +87,17 @@ SymbolizableObjectFile::SymbolizableObjectFile(ObjectFile *Obj,
: Module(Obj), DebugInfoContext(std::move(DICtx)) {}
namespace {
+
struct OffsetNamePair {
uint32_t Offset;
StringRef Name;
+
bool operator<(const OffsetNamePair &R) const {
return Offset < R.Offset;
}
};
-}
+
+} // end anonymous namespace
std::error_code SymbolizableObjectFile::addCoffExportSymbols(
const COFFObjectFile *CoffObj) {
@@ -147,7 +164,7 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
return errorToErrorCode(SymbolNameOrErr.takeError());
StringRef SymbolName = *SymbolNameOrErr;
// Mach-O symbol table names have leading underscore, skip it.
- if (Module->isMachO() && SymbolName.size() > 0 && SymbolName[0] == '_')
+ if (Module->isMachO() && !SymbolName.empty() && SymbolName[0] == '_')
SymbolName = SymbolName.drop_front();
// FIXME: If a function has alias, there are two entries in symbol table
// with same address size. Make sure we choose the correct one.
@@ -252,7 +269,3 @@ DIGlobal SymbolizableObjectFile::symbolizeData(uint64_t ModuleOffset) const {
Res.Size);
return Res;
}
-
-} // namespace symbolize
-} // namespace llvm
-
diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
index 8583b6a..216cca8 100644
--- a/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
+++ b/contrib/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -1,4 +1,4 @@
-//===-- SymbolizableObjectFile.h -------------------------------- C++ -----===//
+//===- SymbolizableObjectFile.h ---------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,14 +13,20 @@
#ifndef LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
#define LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DIContext.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/ErrorOr.h"
+#include <cstdint>
#include <map>
+#include <memory>
+#include <string>
+#include <system_error>
namespace llvm {
+
class DataExtractor;
-}
-namespace llvm {
namespace symbolize {
class SymbolizableObjectFile : public SymbolizableModule {
@@ -65,6 +71,7 @@ private:
// If size is 0, assume that symbol occupies the whole memory range up to
// the following symbol.
uint64_t Size;
+
friend bool operator<(const SymbolDesc &s1, const SymbolDesc &s2) {
return s1.Addr < s2.Addr;
}
@@ -76,7 +83,8 @@ private:
std::unique_ptr<DIContext> DICtx);
};
-} // namespace symbolize
-} // namespace llvm
+} // end namespace symbolize
+
+} // end namespace llvm
-#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
+#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
diff --git a/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 7e56859..9bfab10 100644
--- a/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/contrib/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -16,6 +16,7 @@
#include "SymbolizableObjectFile.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/Config/config.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/PDB/PDB.h"
@@ -24,7 +25,6 @@
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/MachOUniversal.h"
-#include "llvm/Support/COFF.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/DataExtractor.h"
@@ -39,6 +39,8 @@
#if defined(_MSC_VER)
#include <Windows.h>
+
+// This must be included after windows.h.
#include <DbgHelp.h>
#pragma comment(lib, "dbghelp.lib")
@@ -467,8 +469,9 @@ extern "C" char *__cxa_demangle(const char *mangled_name, char *output_buffer,
size_t *length, int *status);
#endif
-std::string LLVMSymbolizer::DemangleName(const std::string &Name,
- const SymbolizableModule *ModInfo) {
+std::string
+LLVMSymbolizer::DemangleName(const std::string &Name,
+ const SymbolizableModule *DbiModuleDescriptor) {
#if !defined(_MSC_VER)
// We can spoil names of symbols with C linkage, so use a heuristic
// approach to check if the name should be demangled.
@@ -496,7 +499,7 @@ std::string LLVMSymbolizer::DemangleName(const std::string &Name,
return (result == 0) ? Name : std::string(DemangledName);
}
#endif
- if (ModInfo && ModInfo->isWin32Module())
+ if (DbiModuleDescriptor && DbiModuleDescriptor->isWin32Module())
return std::string(demanglePE32ExternCFunc(Name));
return Name;
}
diff --git a/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp b/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp
index 097b6ca..34f4017 100644
--- a/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp
+++ b/contrib/llvm/lib/Demangle/ItaniumDemangle.cpp
@@ -36,6 +36,12 @@ enum {
success
};
+enum {
+ CV_const = (1 << 0),
+ CV_volatile = (1 << 1),
+ CV_restrict = (1 << 2),
+};
+
template <class C>
static const char *parse_type(const char *first, const char *last, C &db);
template <class C>
@@ -436,15 +442,15 @@ static const char *parse_cv_qualifiers(const char *first, const char *last,
cv = 0;
if (first != last) {
if (*first == 'r') {
- cv |= 4;
+ cv |= CV_restrict;
++first;
}
if (*first == 'V') {
- cv |= 2;
+ cv |= CV_volatile;
++first;
}
if (*first == 'K') {
- cv |= 1;
+ cv |= CV_const;
++first;
}
}
@@ -1396,7 +1402,8 @@ static const char *parse_function_type(const char *first, const char *last,
int ref_qual = 0;
while (true) {
if (t == last) {
- db.names.pop_back();
+ if (!db.names.empty())
+ db.names.pop_back();
return first;
}
if (*t == 'E') {
@@ -1663,27 +1670,30 @@ static const char *parse_type(const char *first, const char *last, C &db) {
db.subs.emplace_back();
for (size_t k = k0; k < k1; ++k) {
if (is_function) {
- size_t p = db.names[k].second.size();
- if (db.names[k].second[p - 2] == '&')
- p -= 3;
- else if (db.names[k].second.back() == '&')
+ auto &name = db.names[k].second;
+ size_t p = name.size();
+
+ if (name[p - 2] == '&' && name[p - 1] == '&')
p -= 2;
- if (cv & 1) {
- db.names[k].second.insert(p, " const");
+ else if (name.back() == '&')
+ p -= 1;
+
+ if (cv & CV_const) {
+ name.insert(p, " const");
p += 6;
}
- if (cv & 2) {
- db.names[k].second.insert(p, " volatile");
+ if (cv & CV_volatile) {
+ name.insert(p, " volatile");
p += 9;
}
- if (cv & 4)
- db.names[k].second.insert(p, " restrict");
+ if (cv & CV_restrict)
+ name.insert(p, " restrict");
} else {
- if (cv & 1)
+ if (cv & CV_const)
db.names[k].first.append(" const");
- if (cv & 2)
+ if (cv & CV_volatile)
db.names[k].first.append(" volatile");
- if (cv & 4)
+ if (cv & CV_restrict)
db.names[k].first.append(" restrict");
}
db.subs.back().push_back(db.names[k]);
@@ -1937,7 +1947,7 @@ static const char *parse_type(const char *first, const char *last, C &db) {
break;
}
}
- // drop through
+ // falls through
default:
// must check for builtin-types before class-enum-types to avoid
// ambiguities with operator-names
@@ -2515,6 +2525,9 @@ static std::string base_name(std::string &s) {
++p0;
break;
}
+ if (!isalpha(*p0) && !isdigit(*p0) && *p0 != '_') {
+ return std::string();
+ }
}
return std::string(p0, pe);
}
@@ -2602,39 +2615,45 @@ static const char *parse_unnamed_type_name(const char *first, const char *last,
first = t0 + 1;
} break;
case 'l': {
+ size_t lambda_pos = db.names.size();
db.names.push_back(std::string("'lambda'("));
const char *t0 = first + 2;
if (first[2] == 'v') {
db.names.back().first += ')';
++t0;
} else {
- const char *t1 = parse_type(t0, last, db);
- if (t1 == t0) {
- if (!db.names.empty())
- db.names.pop_back();
- return first;
- }
- if (db.names.size() < 2)
- return first;
- auto tmp = db.names.back().move_full();
- db.names.pop_back();
- db.names.back().first.append(tmp);
- t0 = t1;
+ bool is_first_it = true;
while (true) {
- t1 = parse_type(t0, last, db);
+ long k0 = static_cast<long>(db.names.size());
+ const char *t1 = parse_type(t0, last, db);
+ long k1 = static_cast<long>(db.names.size());
if (t1 == t0)
break;
- if (db.names.size() < 2)
+ if (k0 >= k1)
return first;
- tmp = db.names.back().move_full();
- db.names.pop_back();
- if (!tmp.empty()) {
- db.names.back().first.append(", ");
- db.names.back().first.append(tmp);
- }
+ // If the call to parse_type above found a pack expansion
+ // substitution, then multiple names could have been
+ // inserted into the name table. Walk through the names,
+ // appending each onto the lambda's parameter list.
+ std::for_each(db.names.begin() + k0, db.names.begin() + k1,
+ [&](typename C::sub_type::value_type &pair) {
+ if (pair.empty())
+ return;
+ auto &lambda = db.names[lambda_pos].first;
+ if (!is_first_it)
+ lambda.append(", ");
+ is_first_it = false;
+ lambda.append(pair.move_full());
+ });
+ db.names.erase(db.names.begin() + k0, db.names.end());
t0 = t1;
}
- if (db.names.empty())
+ if (is_first_it) {
+ if (!db.names.empty())
+ db.names.pop_back();
+ return first;
+ }
+ if (db.names.empty() || db.names.size() - 1 != lambda_pos)
return first;
db.names.back().first.append(")");
}
@@ -3826,6 +3845,8 @@ static const char *parse_call_offset(const char *first, const char *last) {
// ::= GV <object name> # Guard variable for one-time
// initialization
// # No <type>
+// ::= TW <object name> # Thread-local wrapper
+// ::= TH <object name> # Thread-local initialization
// extension ::= TC <first type> <number> _ <second type> # construction
// vtable for second-in-first
// extension ::= GR <object name> # reference temporary for object
@@ -3919,6 +3940,27 @@ static const char *parse_special_name(const char *first, const char *last,
}
}
break;
+ case 'W':
+ // TW <object name> # Thread-local wrapper
+ t = parse_name(first + 2, last, db);
+ if (t != first + 2) {
+ if (db.names.empty())
+ return first;
+ db.names.back().first.insert(0, "thread-local wrapper routine for ");
+ first = t;
+ }
+ break;
+ case 'H':
+ // TH <object name> # Thread-local initialization
+ t = parse_name(first + 2, last, db);
+ if (t != first + 2) {
+ if (db.names.empty())
+ return first;
+ db.names.back().first.insert(
+ 0, "thread-local initialization routine for ");
+ first = t;
+ }
+ break;
default:
// T <call-offset> <base encoding>
{
@@ -3997,6 +4039,8 @@ static const char *parse_encoding(const char *first, const char *last, C &db) {
save_value<decltype(db.tag_templates)> sb(db.tag_templates);
if (db.encoding_depth > 1)
db.tag_templates = true;
+ save_value<decltype(db.parsed_ctor_dtor_cv)> sp(db.parsed_ctor_dtor_cv);
+ db.parsed_ctor_dtor_cv = false;
switch (*first) {
case 'G':
case 'T':
@@ -4074,11 +4118,11 @@ static const char *parse_encoding(const char *first, const char *last, C &db) {
if (db.names.empty())
return first;
db.names.back().first += ')';
- if (cv & 1)
+ if (cv & CV_const)
db.names.back().first.append(" const");
- if (cv & 2)
+ if (cv & CV_volatile)
db.names.back().first.append(" volatile");
- if (cv & 4)
+ if (cv & CV_restrict)
db.names.back().first.append(" restrict");
if (ref == 1)
db.names.back().first.append(" &");
@@ -4196,6 +4240,7 @@ template <class StrT> struct string_pair {
template <size_t N> string_pair(const char (&s)[N]) : first(s, N - 1) {}
size_t size() const { return first.size() + second.size(); }
+ bool empty() const { return first.empty() && second.empty(); }
StrT full() const { return first + second; }
StrT move_full() { return std::move(first) + std::move(second); }
};
@@ -4225,20 +4270,11 @@ char *llvm::itaniumDemangle(const char *mangled_name, char *buf, size_t *n,
*status = invalid_args;
return nullptr;
}
-
- size_t len = std::strlen(mangled_name);
- if (len < 2 || strncmp(mangled_name, "_Z", 2)) {
- if (len < 4 || strncmp(mangled_name, "___Z", 4)) {
- if (status)
- *status = invalid_mangled_name;
- return nullptr;
- }
- }
-
size_t internal_size = buf != nullptr ? *n : 0;
Db db;
db.template_param.emplace_back();
int internal_status = success;
+ size_t len = std::strlen(mangled_name);
demangle(mangled_name, mangled_name + len, db, internal_status);
if (internal_status == success && db.fix_forward_references &&
!db.template_param.empty() && !db.template_param.front().empty()) {
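
Note: the CV_* constants introduced above replace the bare 1/2/4 masks that were scattered through the demangler. A tiny sketch of how a qualifier set decodes with the named flags (the helper name is hypothetical):

#include <string>

enum {
  CV_const = (1 << 0),
  CV_volatile = (1 << 1),
  CV_restrict = (1 << 2),
};

std::string cvQualifiersToString(unsigned cv) {
  std::string S;
  if (cv & CV_const)
    S += " const";
  if (cv & CV_volatile)
    S += " volatile";
  if (cv & CV_restrict)
    S += " restrict";
  return S; // e.g. cv == 3 -> " const volatile"
}
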
diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
index b4bed32..2ee72f9 100644
--- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -515,7 +515,7 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
// to the function tells DynamicLibrary to load the program, not a library.
if (sys::DynamicLibrary::LoadLibraryPermanently(nullptr, ErrorStr))
return nullptr;
-
+
// If the user specified a memory manager but didn't specify which engine to
// create, we assume they only want the JIT, and we fail if they only want
// the interpreter.
@@ -616,7 +616,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
for (unsigned int i = 0; i < elemNum; ++i) {
Type *ElemTy = STy->getElementType(i);
if (ElemTy->isIntegerTy())
- Result.AggregateVal[i].IntVal =
+ Result.AggregateVal[i].IntVal =
APInt(ElemTy->getPrimitiveSizeInBits(), 0);
else if (ElemTy->isAggregateType()) {
const Constant *ElemUndef = UndefValue::get(ElemTy);
@@ -727,7 +727,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
APFloat apf = APFloat(APFloat::x87DoubleExtended(), GV.IntVal);
uint64_t v;
bool ignored;
- (void)apf.convertToInteger(&v, BitWidth,
+ (void)apf.convertToInteger(makeMutableArrayRef(v), BitWidth,
CE->getOpcode()==Instruction::FPToSI,
APFloat::rmTowardZero, &ignored);
GV.IntVal = v; // endian?
@@ -979,7 +979,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
// Check if vector holds integers.
if (ElemTy->isIntegerTy()) {
if (CAZ) {
- GenericValue intZero;
+ GenericValue intZero;
intZero.IntVal = APInt(ElemTy->getScalarSizeInBits(), 0ull);
std::fill(Result.AggregateVal.begin(), Result.AggregateVal.end(),
intZero);
@@ -1079,7 +1079,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
*(((float*)Ptr)+i) = Val.AggregateVal[i].FloatVal;
if (cast<VectorType>(Ty)->getElementType()->isIntegerTy()) {
unsigned numOfBytes =(Val.AggregateVal[i].IntVal.getBitWidth()+7)/8;
- StoreIntToMemory(Val.AggregateVal[i].IntVal,
+ StoreIntToMemory(Val.AggregateVal[i].IntVal,
(uint8_t*)Ptr + numOfBytes*i, numOfBytes);
}
}
@@ -1186,7 +1186,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
DEBUG(Init->dump());
if (isa<UndefValue>(Init))
return;
-
+
if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) {
unsigned ElementSize =
getDataLayout().getTypeAllocSize(CP->getType()->getElementType());
@@ -1194,12 +1194,12 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize);
return;
}
-
+
if (isa<ConstantAggregateZero>(Init)) {
memset(Addr, 0, (size_t)getDataLayout().getTypeAllocSize(Init->getType()));
return;
}
-
+
if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) {
unsigned ElementSize =
getDataLayout().getTypeAllocSize(CPA->getType()->getElementType());
@@ -1207,7 +1207,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize);
return;
}
-
+
if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) {
const StructLayout *SL =
getDataLayout().getStructLayout(cast<StructType>(CPS->getType()));
diff --git a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 1d7c6e7..e956dbe 100644
--- a/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -188,7 +188,7 @@ LLVMBool LLVMCreateMCJITCompilerForModule(
for (auto &F : *Mod) {
auto Attrs = F.getAttributes();
StringRef Value(options.NoFramePointerElim ? "true" : "false");
- Attrs = Attrs.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+ Attrs = Attrs.addAttribute(F.getContext(), AttributeList::FunctionIndex,
"no-frame-pointer-elim", Value);
F.setAttributes(Attrs);
}
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 0051c69..a7b1fe2 100644
--- a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -12,10 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Config/config.h"
#include "IntelJITEventsWrapper.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Config/config.h"
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
diff --git a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
index e966889..f2d36a7 100644
--- a/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
+++ b/contrib/llvm/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
@@ -22,8 +22,8 @@
#include <windows.h>
#pragma optimize("", off)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#include <pthread.h>
#include <dlfcn.h>
+#include <pthread.h>
#include <stdint.h>
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#include <malloc.h>
diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
index 923f6e7..9684443 100644
--- a/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -899,10 +899,10 @@ void Interpreter::visitSwitchInst(SwitchInst &I) {
// Check to see if any of the cases match...
BasicBlock *Dest = nullptr;
- for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
- GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF);
+ for (auto Case : I.cases()) {
+ GenericValue CaseVal = getOperandValue(Case.getCaseValue(), SF);
if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
- Dest = cast<BasicBlock>(i.getCaseSuccessor());
+ Dest = cast<BasicBlock>(Case.getCaseSuccessor());
break;
}
}
@@ -1565,7 +1565,7 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
Tmp = Tmp.zext(SrcBitSize);
Tmp = TempSrc.AggregateVal[SrcElt++].IntVal;
Tmp = Tmp.zext(DstBitSize);
- Tmp = Tmp.shl(ShiftAmt);
+ Tmp <<= ShiftAmt;
ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize;
Elt.IntVal |= Tmp;
}
@@ -1580,7 +1580,7 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
GenericValue Elt;
Elt.IntVal = Elt.IntVal.zext(SrcBitSize);
Elt.IntVal = TempSrc.AggregateVal[i].IntVal;
- Elt.IntVal = Elt.IntVal.lshr(ShiftAmt);
+ Elt.IntVal.lshrInPlace(ShiftAmt);
// it could be DstBitSize == SrcBitSize, so check it
if (DstBitSize < SrcBitSize)
Elt.IntVal = Elt.IntVal.trunc(DstBitSize);
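
Note: the switch hunk above moves from explicit CaseIt iterators to the range-based SwitchInst::cases() API, where each case handle exposes its value and successor. The same pattern, sketched outside the interpreter:

#include "llvm/IR/Instructions.h"

llvm::BasicBlock *destinationFor(llvm::SwitchInst &SI, llvm::ConstantInt *V) {
  for (auto Case : SI.cases())
    if (Case.getCaseValue() == V) // ConstantInts are uniqued, so == suffices
      return Case.getCaseSuccessor();
  return SI.getDefaultDest();
}
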
diff --git a/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index ee75bee..64dca93 100644
--- a/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -22,7 +22,7 @@
#include "Interpreter.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Config/config.h" // Detect libffi
+#include "llvm/Config/config.h" // Detect libffi
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -33,8 +33,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/UniqueLock.h"
+#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cmath>
#include <csignal>
diff --git a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index ff8749f..1164d60 100644
--- a/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -317,7 +317,13 @@ uint64_t MCJIT::getSymbolAddress(const std::string &Name,
raw_string_ostream MangledNameStream(MangledName);
Mangler::getNameWithPrefix(MangledNameStream, Name, getDataLayout());
}
- return findSymbol(MangledName, CheckFunctionsOnly).getAddress();
+ if (auto Sym = findSymbol(MangledName, CheckFunctionsOnly)) {
+ if (auto AddrOrErr = Sym.getAddress())
+ return *AddrOrErr;
+ else
+ report_fatal_error(AddrOrErr.takeError());
+ } else
+ report_fatal_error(Sym.takeError());
}
JITSymbol MCJIT::findSymbol(const std::string &Name,
@@ -599,11 +605,12 @@ GenericValue MCJIT::runFunction(Function *F, ArrayRef<GenericValue> ArgValues) {
void *MCJIT::getPointerToNamedFunction(StringRef Name, bool AbortOnFailure) {
if (!isSymbolSearchingDisabled()) {
- void *ptr =
- reinterpret_cast<void*>(
- static_cast<uintptr_t>(Resolver.findSymbol(Name).getAddress()));
- if (ptr)
- return ptr;
+ if (auto Sym = Resolver.findSymbol(Name)) {
+ if (auto AddrOrErr = Sym.getAddress())
+ return reinterpret_cast<void*>(
+ static_cast<uintptr_t>(*AddrOrErr));
+ } else if (auto Err = Sym.takeError())
+ report_fatal_error(std::move(Err));
}
/// If a LazyFunctionCreator is installed, use it to get/create the function.
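
Note: both MCJIT hunks above exist because JITSymbol::getAddress() now returns Expected<JITTargetAddress> instead of a plain address. The generic handling shape, sketched on its own (the zero-on-failure policy is just for illustration):

#include "llvm/Support/Error.h"
#include <cstdint>

uint64_t addressOrZero(llvm::Expected<uint64_t> AddrOrErr) {
  if (!AddrOrErr) {
    // Expected<T> carries a checked Error: it must be consumed (or
    // reported), otherwise LLVM aborts in debug builds.
    llvm::consumeError(AddrOrErr.takeError());
    return 0;
  }
  return *AddrOrErr; // success: dereference to get the value
}
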
diff --git a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 57b5d85..3581d64 100644
--- a/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Config/config.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Config/config.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/OProfileWrapper.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index 711b887..e3a4568 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/IRBuilder.h"
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp
index 8dcd49a..de80cb1 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindings.cpp
@@ -12,6 +12,24 @@
using namespace llvm;
+LLVMSharedModuleRef LLVMOrcMakeSharedModule(LLVMModuleRef Mod) {
+ return wrap(new std::shared_ptr<Module>(unwrap(Mod)));
+}
+
+void LLVMOrcDisposeSharedModuleRef(LLVMSharedModuleRef SharedMod) {
+ delete unwrap(SharedMod);
+}
+
+LLVMSharedObjectBufferRef
+LLVMOrcMakeSharedObjectBuffer(LLVMMemoryBufferRef ObjBuffer) {
+ return wrap(new std::shared_ptr<MemoryBuffer>(unwrap(ObjBuffer)));
+}
+
+void
+LLVMOrcDisposeSharedObjectBufferRef(LLVMSharedObjectBufferRef SharedObjBuffer) {
+ delete unwrap(SharedObjBuffer);
+}
+
LLVMOrcJITStackRef LLVMOrcCreateInstance(LLVMTargetMachineRef TM) {
TargetMachine *TM2(unwrap(TM));
@@ -42,12 +60,13 @@ void LLVMOrcGetMangledSymbol(LLVMOrcJITStackRef JITStack, char **MangledName,
void LLVMOrcDisposeMangledSymbol(char *MangledName) { delete[] MangledName; }
-LLVMOrcTargetAddress
+LLVMOrcErrorCode
LLVMOrcCreateLazyCompileCallback(LLVMOrcJITStackRef JITStack,
+ LLVMOrcTargetAddress *RetAddr,
LLVMOrcLazyCompileCallbackFn Callback,
void *CallbackCtx) {
OrcCBindingsStack &J = *unwrap(JITStack);
- return J.createLazyCompileCallback(Callback, CallbackCtx);
+ return J.createLazyCompileCallback(*RetAddr, Callback, CallbackCtx);
}
LLVMOrcErrorCode LLVMOrcCreateIndirectStub(LLVMOrcJITStackRef JITStack,
@@ -64,36 +83,44 @@ LLVMOrcErrorCode LLVMOrcSetIndirectStubPointer(LLVMOrcJITStackRef JITStack,
return J.setIndirectStubPointer(StubName, NewAddr);
}
-LLVMOrcModuleHandle
-LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod,
+LLVMOrcErrorCode
+LLVMOrcAddEagerlyCompiledIR(LLVMOrcJITStackRef JITStack,
+ LLVMOrcModuleHandle *RetHandle,
+ LLVMSharedModuleRef Mod,
LLVMOrcSymbolResolverFn SymbolResolver,
void *SymbolResolverCtx) {
OrcCBindingsStack &J = *unwrap(JITStack);
- Module *M(unwrap(Mod));
- return J.addIRModuleEager(M, SymbolResolver, SymbolResolverCtx);
+ std::shared_ptr<Module> *M(unwrap(Mod));
+ return J.addIRModuleEager(*RetHandle, *M, SymbolResolver, SymbolResolverCtx);
}
-LLVMOrcModuleHandle
-LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack, LLVMModuleRef Mod,
+LLVMOrcErrorCode
+LLVMOrcAddLazilyCompiledIR(LLVMOrcJITStackRef JITStack,
+ LLVMOrcModuleHandle *RetHandle,
+ LLVMSharedModuleRef Mod,
LLVMOrcSymbolResolverFn SymbolResolver,
void *SymbolResolverCtx) {
OrcCBindingsStack &J = *unwrap(JITStack);
- Module *M(unwrap(Mod));
- return J.addIRModuleLazy(M, SymbolResolver, SymbolResolverCtx);
+ std::shared_ptr<Module> *M(unwrap(Mod));
+ return J.addIRModuleLazy(*RetHandle, *M, SymbolResolver, SymbolResolverCtx);
}
-void LLVMOrcRemoveModule(LLVMOrcJITStackRef JITStack, LLVMOrcModuleHandle H) {
+LLVMOrcErrorCode LLVMOrcRemoveModule(LLVMOrcJITStackRef JITStack,
+ LLVMOrcModuleHandle H) {
OrcCBindingsStack &J = *unwrap(JITStack);
- J.removeModule(H);
+ return J.removeModule(H);
}
-LLVMOrcTargetAddress LLVMOrcGetSymbolAddress(LLVMOrcJITStackRef JITStack,
- const char *SymbolName) {
+LLVMOrcErrorCode LLVMOrcGetSymbolAddress(LLVMOrcJITStackRef JITStack,
+ LLVMOrcTargetAddress *RetAddr,
+ const char *SymbolName) {
OrcCBindingsStack &J = *unwrap(JITStack);
- auto Sym = J.findSymbol(SymbolName, true);
- return Sym.getAddress();
+ return J.findSymbolAddress(*RetAddr, SymbolName, true);
}
-void LLVMOrcDisposeInstance(LLVMOrcJITStackRef JITStack) {
- delete unwrap(JITStack);
+LLVMOrcErrorCode LLVMOrcDisposeInstance(LLVMOrcJITStackRef JITStack) {
+ auto *J = unwrap(JITStack);
+ auto Err = J->shutdown();
+ delete J;
+ return Err;
}
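
Note: the reworked C bindings above follow one convention throughout: every entry point returns an LLVMOrcErrorCode and hands its real result back through an out-parameter. Caller-side shape (the helper and its policy are hypothetical):

#include "llvm-c/OrcBindings.h"

LLVMOrcTargetAddress lookupOrZero(LLVMOrcJITStackRef JIT, const char *Name) {
  LLVMOrcTargetAddress Addr = 0;
  if (LLVMOrcGetSymbolAddress(JIT, &Addr, Name) != LLVMOrcErrSuccess)
    return 0; // the stack retains the message; see getErrorMessage()
  return Addr;
}
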
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index a74fae7..e38decf 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -1,4 +1,4 @@
-//===--- OrcCBindingsStack.h - Orc JIT stack for C bindings ---*- C++ -*---===//
+//===- OrcCBindingsStack.h - Orc JIT stack for C bindings -----*- C++ -*---===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,47 +11,71 @@
#define LLVM_LIB_EXECUTIONENGINE_ORC_ORCCBINDINGSSTACK_H
#include "llvm-c/OrcBindings.h"
-#include "llvm/ADT/Triple.h"
+#include "llvm-c/TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CBindingWrapping.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
namespace llvm {
class OrcCBindingsStack;
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(std::shared_ptr<Module>,
+ LLVMSharedModuleRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(std::shared_ptr<MemoryBuffer>,
+ LLVMSharedObjectBufferRef)
DEFINE_SIMPLE_CONVERSION_FUNCTIONS(OrcCBindingsStack, LLVMOrcJITStackRef)
DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef)
class OrcCBindingsStack {
public:
- typedef orc::JITCompileCallbackManager CompileCallbackMgr;
- typedef orc::ObjectLinkingLayer<> ObjLayerT;
- typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT;
- typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>
- CODLayerT;
- typedef std::function<std::unique_ptr<CompileCallbackMgr>()>
- CallbackManagerBuilder;
+ using CompileCallbackMgr = orc::JITCompileCallbackManager;
+ using ObjLayerT = orc::RTDyldObjectLinkingLayer;
+ using CompileLayerT = orc::IRCompileLayer<ObjLayerT, orc::SimpleCompiler>;
+ using CODLayerT =
+ orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>;
- typedef CODLayerT::IndirectStubsManagerBuilderT IndirectStubsManagerBuilder;
+ using CallbackManagerBuilder =
+ std::function<std::unique_ptr<CompileCallbackMgr>()>;
+
+ using IndirectStubsManagerBuilder = CODLayerT::IndirectStubsManagerBuilderT;
private:
class GenericHandle {
public:
- virtual ~GenericHandle() {}
+ virtual ~GenericHandle() = default;
+
virtual JITSymbol findSymbolIn(const std::string &Name,
bool ExportedSymbolsOnly) = 0;
- virtual void removeModule() = 0;
+ virtual Error removeModule() = 0;
};
template <typename LayerT> class GenericHandleImpl : public GenericHandle {
public:
- GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleSetHandleT Handle)
+ GenericHandleImpl(LayerT &Layer, typename LayerT::ModuleHandleT Handle)
: Layer(Layer), Handle(std::move(Handle)) {}
JITSymbol findSymbolIn(const std::string &Name,
@@ -59,31 +83,32 @@ private:
return Layer.findSymbolIn(Handle, Name, ExportedSymbolsOnly);
}
- void removeModule() override { return Layer.removeModuleSet(Handle); }
+ Error removeModule() override { return Layer.removeModule(Handle); }
private:
LayerT &Layer;
- typename LayerT::ModuleSetHandleT Handle;
+ typename LayerT::ModuleHandleT Handle;
};
template <typename LayerT>
std::unique_ptr<GenericHandleImpl<LayerT>>
- createGenericHandle(LayerT &Layer, typename LayerT::ModuleSetHandleT Handle) {
+ createGenericHandle(LayerT &Layer, typename LayerT::ModuleHandleT Handle) {
return llvm::make_unique<GenericHandleImpl<LayerT>>(Layer,
std::move(Handle));
}
public:
- // We need a 'ModuleSetHandleT' to conform to the layer concept.
- typedef unsigned ModuleSetHandleT;
-
- typedef unsigned ModuleHandleT;
+ using ModuleHandleT = unsigned;
OrcCBindingsStack(TargetMachine &TM,
std::unique_ptr<CompileCallbackMgr> CCMgr,
IndirectStubsManagerBuilder IndirectStubsMgrBuilder)
: DL(TM.createDataLayout()), IndirectStubsMgr(IndirectStubsMgrBuilder()),
- CCMgr(std::move(CCMgr)), ObjectLayer(),
+ CCMgr(std::move(CCMgr)),
+ ObjectLayer(
+ []() {
+ return std::make_shared<SectionMemoryManager>();
+ }),
CompileLayer(ObjectLayer, orc::SimpleCompiler(TM)),
CODLayer(CompileLayer,
[](Function &F) { return std::set<Function *>({&F}); },
@@ -91,12 +116,14 @@ public:
CXXRuntimeOverrides(
[this](const std::string &S) { return mangle(S); }) {}
- ~OrcCBindingsStack() {
+ LLVMOrcErrorCode shutdown() {
// Run any destructors registered with __cxa_atexit.
CXXRuntimeOverrides.runDestructors();
// Run any IR destructors.
for (auto &DtorRunner : IRStaticDestructorRunners)
- DtorRunner.runViaLayer(*this);
+ if (auto Err = DtorRunner.runViaLayer(*this))
+ return mapError(std::move(Err));
+ return LLVMOrcErrSuccess;
}
std::string mangle(StringRef Name) {
@@ -113,14 +140,17 @@ public:
return reinterpret_cast<PtrTy>(static_cast<uintptr_t>(Addr));
}
- JITTargetAddress
- createLazyCompileCallback(LLVMOrcLazyCompileCallbackFn Callback,
+
+ LLVMOrcErrorCode
+ createLazyCompileCallback(JITTargetAddress &RetAddr,
+ LLVMOrcLazyCompileCallbackFn Callback,
void *CallbackCtx) {
auto CCInfo = CCMgr->getCompileCallback();
CCInfo.setCompileAction([=]() -> JITTargetAddress {
return Callback(wrap(this), CallbackCtx);
});
- return CCInfo.getAddress();
+ RetAddr = CCInfo.getAddress();
+ return LLVMOrcErrSuccess;
}
LLVMOrcErrorCode createIndirectStub(StringRef StubName,
@@ -134,12 +164,12 @@ public:
return mapError(IndirectStubsMgr->updatePointer(Name, Addr));
}
- std::unique_ptr<JITSymbolResolver>
+ std::shared_ptr<JITSymbolResolver>
createResolver(LLVMOrcSymbolResolverFn ExternalResolver,
void *ExternalResolverCtx) {
return orc::createLambdaResolver(
[this, ExternalResolver, ExternalResolverCtx](const std::string &Name)
- -> JITSymbol {
+ -> JITSymbol {
// Search order:
// 1. JIT'd symbols.
// 2. Runtime overrides.
@@ -147,26 +177,31 @@ public:
if (auto Sym = CODLayer.findSymbol(Name, true))
return Sym;
+ else if (auto Err = Sym.takeError())
+ return Sym.takeError();
+
if (auto Sym = CXXRuntimeOverrides.searchOverrides(Name))
return Sym;
if (ExternalResolver)
return JITSymbol(
ExternalResolver(Name.c_str(), ExternalResolverCtx),
- llvm::JITSymbolFlags::Exported);
+ JITSymbolFlags::Exported);
return JITSymbol(nullptr);
},
- [](const std::string &Name) {
+ [](const std::string &Name) -> JITSymbol {
return JITSymbol(nullptr);
});
}
template <typename LayerT>
- ModuleHandleT addIRModule(LayerT &Layer, Module *M,
- std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr,
- LLVMOrcSymbolResolverFn ExternalResolver,
- void *ExternalResolverCtx) {
+ LLVMOrcErrorCode
+ addIRModule(ModuleHandleT &RetHandle, LayerT &Layer,
+ std::shared_ptr<Module> M,
+ std::unique_ptr<RuntimeDyld::MemoryManager> MemMgr,
+ LLVMOrcSymbolResolverFn ExternalResolver,
+ void *ExternalResolverCtx) {
// Attach a data-layout if one isn't already present.
if (M->getDataLayout().isDefault())
@@ -184,46 +219,52 @@ public:
auto Resolver = createResolver(ExternalResolver, ExternalResolverCtx);
// Add the module to the JIT.
- std::vector<Module *> S;
- S.push_back(std::move(M));
-
- auto LH = Layer.addModuleSet(std::move(S), std::move(MemMgr),
- std::move(Resolver));
- ModuleHandleT H = createHandle(Layer, LH);
+ ModuleHandleT H;
+ if (auto LHOrErr = Layer.addModule(std::move(M), std::move(Resolver)))
+ H = createHandle(Layer, *LHOrErr);
+ else
+ return mapError(LHOrErr.takeError());
// Run the static constructors, and save the static destructor runner for
// execution when the JIT is torn down.
orc::CtorDtorRunner<OrcCBindingsStack> CtorRunner(std::move(CtorNames), H);
- CtorRunner.runViaLayer(*this);
+ if (auto Err = CtorRunner.runViaLayer(*this))
+ return mapError(std::move(Err));
IRStaticDestructorRunners.emplace_back(std::move(DtorNames), H);
- return H;
+ RetHandle = H;
+ return LLVMOrcErrSuccess;
}
- ModuleHandleT addIRModuleEager(Module *M,
- LLVMOrcSymbolResolverFn ExternalResolver,
- void *ExternalResolverCtx) {
- return addIRModule(CompileLayer, std::move(M),
+ LLVMOrcErrorCode addIRModuleEager(ModuleHandleT &RetHandle,
+ std::shared_ptr<Module> M,
+ LLVMOrcSymbolResolverFn ExternalResolver,
+ void *ExternalResolverCtx) {
+ return addIRModule(RetHandle, CompileLayer, std::move(M),
llvm::make_unique<SectionMemoryManager>(),
std::move(ExternalResolver), ExternalResolverCtx);
}
- ModuleHandleT addIRModuleLazy(Module *M,
- LLVMOrcSymbolResolverFn ExternalResolver,
- void *ExternalResolverCtx) {
- return addIRModule(CODLayer, std::move(M),
+ LLVMOrcErrorCode addIRModuleLazy(ModuleHandleT &RetHandle,
+ std::shared_ptr<Module> M,
+ LLVMOrcSymbolResolverFn ExternalResolver,
+ void *ExternalResolverCtx) {
+ return addIRModule(RetHandle, CODLayer, std::move(M),
llvm::make_unique<SectionMemoryManager>(),
std::move(ExternalResolver), ExternalResolverCtx);
}
- void removeModule(ModuleHandleT H) {
- GenericHandles[H]->removeModule();
+ LLVMOrcErrorCode removeModule(ModuleHandleT H) {
+ if (auto Err = GenericHandles[H]->removeModule())
+ return mapError(std::move(Err));
GenericHandles[H] = nullptr;
FreeHandleIndexes.push_back(H);
+ return LLVMOrcErrSuccess;
}
- JITSymbol findSymbol(const std::string &Name, bool ExportedSymbolsOnly) {
+ JITSymbol findSymbol(const std::string &Name,
+ bool ExportedSymbolsOnly) {
if (auto Sym = IndirectStubsMgr->findStub(Name, ExportedSymbolsOnly))
return Sym;
return CODLayer.findSymbol(mangle(Name), ExportedSymbolsOnly);
@@ -234,12 +275,31 @@ public:
return GenericHandles[H]->findSymbolIn(Name, ExportedSymbolsOnly);
}
+ LLVMOrcErrorCode findSymbolAddress(JITTargetAddress &RetAddr,
+ const std::string &Name,
+ bool ExportedSymbolsOnly) {
+ RetAddr = 0;
+ if (auto Sym = findSymbol(Name, ExportedSymbolsOnly)) {
+ // Successful lookup, non-null symbol:
+ if (auto AddrOrErr = Sym.getAddress()) {
+ RetAddr = *AddrOrErr;
+ return LLVMOrcErrSuccess;
+ } else
+ return mapError(AddrOrErr.takeError());
+ } else if (auto Err = Sym.takeError()) {
+ // Lookup failure - report error.
+ return mapError(std::move(Err));
+ }
+ // Otherwise we had a successful lookup but got a null result. We already
+ // set RetAddr to '0' above, so just return success.
+ return LLVMOrcErrSuccess;
+ }
+
const std::string &getErrorMessage() const { return ErrMsg; }
private:
template <typename LayerT>
- unsigned createHandle(LayerT &Layer,
- typename LayerT::ModuleSetHandleT Handle) {
+ unsigned createHandle(LayerT &Layer, typename LayerT::ModuleHandleT Handle) {
unsigned NewHandle;
if (!FreeHandleIndexes.empty()) {
NewHandle = FreeHandleIndexes.back();
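The net effect of these C-bindings changes is a uniform out-parameter/error-code convention: every fallible entry point now returns LLVMOrcErrorCode and writes its result through a reference, with details retrievable from getErrorMessage(). A minimal caller-side sketch using only names visible in this patch (the lookupMain wrapper itself is hypothetical):

static bool lookupMain(OrcCBindingsStack &Stack, JITTargetAddress &Addr) {
  // On failure the stack records a message retrievable via getErrorMessage().
  if (Stack.findSymbolAddress(Addr, "main", /*ExportedSymbolsOnly=*/true) !=
      LLVMOrcErrSuccess)
    return false;
  return Addr != 0; // success with Addr == 0 means "symbol not found"
}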
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp
index c531fe3..df2d320 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcError.cpp
@@ -39,14 +39,21 @@ public:
return "Remote indirect stubs owner does not exist";
case OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse:
return "Remote indirect stubs owner Id already in use";
+ case OrcErrorCode::RPCConnectionClosed:
+ return "RPC connection closed";
+ case OrcErrorCode::RPCCouldNotNegotiateFunction:
+ return "Could not negotiate RPC function";
case OrcErrorCode::RPCResponseAbandoned:
return "RPC response abandoned";
+ case OrcErrorCode::JITSymbolNotFound:
+ return "JIT symbol not found";
case OrcErrorCode::UnexpectedRPCCall:
return "Unexpected RPC call";
case OrcErrorCode::UnexpectedRPCResponse:
return "Unexpected RPC response";
- case OrcErrorCode::UnknownRPCFunction:
- return "Unknown RPC function";
+ case OrcErrorCode::UnknownErrorCodeFromRemote:
+ return "Unknown error returned from remote RPC function "
+ "(Use StringError to get error message)";
}
llvm_unreachable("Unhandled error code");
}
@@ -58,10 +65,29 @@ static ManagedStatic<OrcErrorCategory> OrcErrCat;
namespace llvm {
namespace orc {
-Error orcError(OrcErrorCode ErrCode) {
+char JITSymbolNotFound::ID = 0;
+
+std::error_code orcError(OrcErrorCode ErrCode) {
+ typedef std::underlying_type<OrcErrorCode>::type UT;
+ return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat);
+}
+
+JITSymbolNotFound::JITSymbolNotFound(std::string SymbolName)
+ : SymbolName(std::move(SymbolName)) {}
+
+std::error_code JITSymbolNotFound::convertToErrorCode() const {
typedef std::underlying_type<OrcErrorCode>::type UT;
- return errorCodeToError(
- std::error_code(static_cast<UT>(ErrCode), *OrcErrCat));
+ return std::error_code(static_cast<UT>(OrcErrorCode::JITSymbolNotFound),
+ *OrcErrCat);
}
+
+void JITSymbolNotFound::log(raw_ostream &OS) const {
+ OS << "Could not find symbol '" << SymbolName << "'";
+}
+
+const std::string &JITSymbolNotFound::getSymbolName() const {
+ return SymbolName;
+}
+
}
}
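With JITSymbolNotFound modelled as an ErrorInfo subclass (its declaration lives in OrcError.h, not shown in this hunk), producers and consumers can use the standard llvm::Error machinery. A hedged sketch, assuming the usual ErrorInfo boilerplate in the header:

#include "llvm/ExecutionEngine/Orc/OrcError.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

void demo() {
  llvm::Error Err = llvm::make_error<llvm::orc::JITSymbolNotFound>("bar");
  llvm::handleAllErrors(std::move(Err),
                        [](const llvm::orc::JITSymbolNotFound &J) {
                          // log() would print "Could not find symbol 'bar'".
                          llvm::errs() << "missing: " << J.getSymbolName()
                                       << "\n";
                        });
}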
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
index b7a68e0..f89f21a 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.cpp
@@ -124,5 +124,10 @@ OrcMCJITReplacement::runFunction(Function *F,
llvm_unreachable("Full-featured argument passing not supported yet!");
}
+void OrcMCJITReplacement::runStaticConstructorsDestructors(bool isDtors) {
+ for (auto &M : LocalModules)
+ ExecutionEngine::runStaticConstructorsDestructors(*M, isDtors);
+}
+
} // End namespace orc.
} // End namespace llvm.
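A short usage sketch of the newly implemented override, seen from the ExecutionEngine client's side (engine construction elided; EE is assumed to be an ExecutionEngine backed by OrcMCJITReplacement):

EE->runStaticConstructorsDestructors(/*isDtors=*/false); // global ctors
// ... call into JIT'd code ...
EE->runStaticConstructorsDestructors(/*isDtors=*/true);  // global dtors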
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index af70960..346a404 100644
--- a/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -1,4 +1,4 @@
-//===---- OrcMCJITReplacement.h - Orc based MCJIT replacement ---*- C++ -*-===//
+//===- OrcMCJITReplacement.h - Orc based MCJIT replacement ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -20,13 +20,16 @@
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/Binary.h"
#include "llvm/Object/ObjectFile.h"
@@ -34,10 +37,10 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <algorithm>
#include <map>
#include <memory>
#include <set>
@@ -45,6 +48,9 @@
#include <vector>
namespace llvm {
+
+class ObjectCache;
+
namespace orc {
class OrcMCJITReplacement : public ExecutionEngine {
@@ -94,9 +100,8 @@ class OrcMCJITReplacement : public ExecutionEngine {
return ClientMM->registerEHFrames(Addr, LoadAddr, Size);
}
- void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr,
- size_t Size) override {
- return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size);
+ void deregisterEHFrames() override {
+ return ClientMM->deregisterEHFrames();
}
void notifyObjectLoaded(RuntimeDyld &RTDyld,
@@ -152,7 +157,6 @@ class OrcMCJITReplacement : public ExecutionEngine {
};
private:
-
static ExecutionEngine *
createOrcMCJITReplacement(std::string *ErrorMsg,
std::shared_ptr<MCJITMemoryManager> MemMgr,
@@ -163,24 +167,26 @@ private:
}
public:
- static void Register() {
- OrcMCJITReplacementCtor = createOrcMCJITReplacement;
- }
-
OrcMCJITReplacement(
std::shared_ptr<MCJITMemoryManager> MemMgr,
std::shared_ptr<JITSymbolResolver> ClientResolver,
std::unique_ptr<TargetMachine> TM)
: ExecutionEngine(TM->createDataLayout()), TM(std::move(TM)),
- MemMgr(*this, std::move(MemMgr)), Resolver(*this),
+ MemMgr(std::make_shared<MCJITReplacementMemMgr>(*this,
+ std::move(MemMgr))),
+ Resolver(std::make_shared<LinkingResolver>(*this)),
ClientResolver(std::move(ClientResolver)), NotifyObjectLoaded(*this),
NotifyFinalized(*this),
- ObjectLayer(NotifyObjectLoaded, NotifyFinalized),
+ ObjectLayer([this]() { return this->MemMgr; }, NotifyObjectLoaded,
+ NotifyFinalized),
CompileLayer(ObjectLayer, SimpleCompiler(*this->TM)),
LazyEmitLayer(CompileLayer) {}
- void addModule(std::unique_ptr<Module> M) override {
+ static void Register() {
+ OrcMCJITReplacementCtor = createOrcMCJITReplacement;
+ }
+ void addModule(std::unique_ptr<Module> M) override {
// If this module doesn't have a DataLayout attached then attach the
// default.
if (M->getDataLayout().isDefault()) {
@@ -188,32 +194,47 @@ public:
} else {
assert(M->getDataLayout() == getDataLayout() && "DataLayout Mismatch");
}
- Modules.push_back(std::move(M));
- std::vector<Module *> Ms;
- Ms.push_back(&*Modules.back());
- LazyEmitLayer.addModuleSet(std::move(Ms), &MemMgr, &Resolver);
+ auto *MPtr = M.release();
+ ShouldDelete[MPtr] = true;
+ auto Deleter = [this](Module *Mod) {
+ auto I = ShouldDelete.find(Mod);
+ if (I != ShouldDelete.end() && I->second)
+ delete Mod;
+ };
+ LocalModules.push_back(std::shared_ptr<Module>(MPtr, std::move(Deleter)));
+ cantFail(LazyEmitLayer.addModule(LocalModules.back(), Resolver));
}
void addObjectFile(std::unique_ptr<object::ObjectFile> O) override {
- std::vector<std::unique_ptr<object::ObjectFile>> Objs;
- Objs.push_back(std::move(O));
- ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver);
+ auto Obj =
+ std::make_shared<object::OwningBinary<object::ObjectFile>>(std::move(O),
+ nullptr);
+ cantFail(ObjectLayer.addObject(std::move(Obj), Resolver));
}
void addObjectFile(object::OwningBinary<object::ObjectFile> O) override {
- std::vector<std::unique_ptr<object::OwningBinary<object::ObjectFile>>> Objs;
- Objs.push_back(
- llvm::make_unique<object::OwningBinary<object::ObjectFile>>(
- std::move(O)));
- ObjectLayer.addObjectSet(std::move(Objs), &MemMgr, &Resolver);
+ auto Obj =
+ std::make_shared<object::OwningBinary<object::ObjectFile>>(std::move(O));
+ cantFail(ObjectLayer.addObject(std::move(Obj), Resolver));
}
void addArchive(object::OwningBinary<object::Archive> A) override {
Archives.push_back(std::move(A));
}
+
+ bool removeModule(Module *M) override {
+ for (auto I = LocalModules.begin(), E = LocalModules.end(); I != E; ++I) {
+ if (I->get() == M) {
+ ShouldDelete[M] = false;
+ LocalModules.erase(I);
+ return true;
+ }
+ }
+ return false;
+ }
uint64_t getSymbolAddress(StringRef Name) {
- return findSymbol(Name).getAddress();
+ return cantFail(findSymbol(Name).getAddress());
}
JITSymbol findSymbol(StringRef Name) {
@@ -257,13 +278,15 @@ public:
ArrayRef<GenericValue> ArgValues) override;
void setObjectCache(ObjectCache *NewCache) override {
- CompileLayer.setObjectCache(NewCache);
+ CompileLayer.getCompiler().setObjectCache(NewCache);
}
void setProcessAllSections(bool ProcessAllSections) override {
ObjectLayer.setProcessAllSections(ProcessAllSections);
}
+ void runStaticConstructorsDestructors(bool isDtors) override;
+
private:
JITSymbol findMangledSymbol(StringRef Name) {
if (auto Sym = LazyEmitLayer.findSymbol(Name, false))
@@ -295,10 +318,12 @@ private:
}
std::unique_ptr<object::Binary> &ChildBin = ChildBinOrErr.get();
if (ChildBin->isObject()) {
- std::vector<std::unique_ptr<object::ObjectFile>> ObjSet;
- ObjSet.push_back(std::unique_ptr<object::ObjectFile>(
- static_cast<object::ObjectFile *>(ChildBin.release())));
- ObjectLayer.addObjectSet(std::move(ObjSet), &MemMgr, &Resolver);
+ std::unique_ptr<object::ObjectFile> ChildObj(
+ static_cast<object::ObjectFile*>(ChildBinOrErr->release()));
+ auto Obj =
+ std::make_shared<object::OwningBinary<object::ObjectFile>>(
+ std::move(ChildObj), nullptr);
+ cantFail(ObjectLayer.addObject(std::move(Obj), Resolver));
if (auto Sym = ObjectLayer.findSymbol(Name, true))
return Sym;
}
@@ -309,34 +334,19 @@ private:
class NotifyObjectLoadedT {
public:
- typedef std::vector<std::unique_ptr<RuntimeDyld::LoadedObjectInfo>>
- LoadedObjInfoListT;
+ using LoadedObjInfoListT =
+ std::vector<std::unique_ptr<RuntimeDyld::LoadedObjectInfo>>;
NotifyObjectLoadedT(OrcMCJITReplacement &M) : M(M) {}
- template <typename ObjListT>
- void operator()(ObjectLinkingLayerBase::ObjSetHandleT H,
- const ObjListT &Objects,
- const LoadedObjInfoListT &Infos) const {
+ void operator()(RTDyldObjectLinkingLayerBase::ObjHandleT H,
+ const RTDyldObjectLinkingLayer::ObjectPtr &Obj,
+ const LoadedObjectInfo &Info) const {
M.UnfinalizedSections[H] = std::move(M.SectionsAllocatedSinceLastLoad);
M.SectionsAllocatedSinceLastLoad = SectionAddrSet();
- assert(Objects.size() == Infos.size() &&
- "Incorrect number of Infos for Objects.");
- for (unsigned I = 0; I < Objects.size(); ++I)
- M.MemMgr.notifyObjectLoaded(&M, getObject(*Objects[I]));
+ M.MemMgr->notifyObjectLoaded(&M, *Obj->getBinary());
}
-
private:
- static const object::ObjectFile& getObject(const object::ObjectFile &Obj) {
- return Obj;
- }
-
- template <typename ObjT>
- static const object::ObjectFile&
- getObject(const object::OwningBinary<ObjT> &Obj) {
- return *Obj.getBinary();
- }
-
OrcMCJITReplacement &M;
};
@@ -344,7 +354,7 @@ private:
public:
NotifyFinalizedT(OrcMCJITReplacement &M) : M(M) {}
- void operator()(ObjectLinkingLayerBase::ObjSetHandleT H) {
+ void operator()(RTDyldObjectLinkingLayerBase::ObjHandleT H) {
M.UnfinalizedSections.erase(H);
}
@@ -361,13 +371,13 @@ private:
return MangledName;
}
- typedef ObjectLinkingLayer<NotifyObjectLoadedT> ObjectLayerT;
- typedef IRCompileLayer<ObjectLayerT> CompileLayerT;
- typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
+ using ObjectLayerT = RTDyldObjectLinkingLayer;
+ using CompileLayerT = IRCompileLayer<ObjectLayerT, orc::SimpleCompiler>;
+ using LazyEmitLayerT = LazyEmittingLayer<CompileLayerT>;
std::unique_ptr<TargetMachine> TM;
- MCJITReplacementMemMgr MemMgr;
- LinkingResolver Resolver;
+ std::shared_ptr<MCJITReplacementMemMgr> MemMgr;
+ std::shared_ptr<LinkingResolver> Resolver;
std::shared_ptr<JITSymbolResolver> ClientResolver;
Mangler Mang;
@@ -381,21 +391,24 @@ private:
// We need to store ObjLayerT::ObjHandles for each of the objects that
// have been emitted but not yet finalized so that we can forward the
// mapSectionAddress calls appropriately.
- typedef std::set<const void *> SectionAddrSet;
- struct ObjSetHandleCompare {
- bool operator()(ObjectLayerT::ObjSetHandleT H1,
- ObjectLayerT::ObjSetHandleT H2) const {
+ using SectionAddrSet = std::set<const void *>;
+ struct ObjHandleCompare {
+ bool operator()(ObjectLayerT::ObjHandleT H1,
+ ObjectLayerT::ObjHandleT H2) const {
return &*H1 < &*H2;
}
};
SectionAddrSet SectionsAllocatedSinceLastLoad;
- std::map<ObjectLayerT::ObjSetHandleT, SectionAddrSet, ObjSetHandleCompare>
+ std::map<ObjectLayerT::ObjHandleT, SectionAddrSet, ObjHandleCompare>
UnfinalizedSections;
+ std::map<Module*, bool> ShouldDelete;
+ std::vector<std::shared_ptr<Module>> LocalModules;
std::vector<object::OwningBinary<object::Archive>> Archives;
};
} // end namespace orc
+
} // end namespace llvm
#endif // LLVM_LIB_EXECUTIONENGINE_ORC_MCJITREPLACEMENT_H
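The removeModule support above hinges on one idiom: each shared_ptr<Module> is built with a custom deleter that consults the ShouldDelete map at destruction time, so ownership can be revoked after the fact. A self-contained sketch of the idiom (Module is a stand-in type here; in the patch the map is a member and the deleter captures this):

#include <map>
#include <memory>
#include <vector>

struct Module {}; // stand-in for llvm::Module

std::map<Module *, bool> ShouldDelete;
std::vector<std::shared_ptr<Module>> LocalModules;

void take(std::unique_ptr<Module> M) {
  Module *P = M.release();
  ShouldDelete[P] = true; // we own it, until removeModule() says otherwise
  LocalModules.push_back(std::shared_ptr<Module>(P, [](Module *Mod) {
    auto I = ShouldDelete.find(Mod);
    if (I != ShouldDelete.end() && I->second)
      delete Mod; // only delete if ownership was never handed back
  }));
}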
diff --git a/contrib/llvm/lib/ExecutionEngine/Orc/RPCUtils.cpp b/contrib/llvm/lib/ExecutionEngine/Orc/RPCUtils.cpp
new file mode 100644
index 0000000..2a7ab5c
--- /dev/null
+++ b/contrib/llvm/lib/ExecutionEngine/Orc/RPCUtils.cpp
@@ -0,0 +1,55 @@
+//===--------------- RPCUtils.cpp - RPCUtils implementation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// RPCUtils implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/RPCUtils.h"
+
+char llvm::orc::rpc::RPCFatalError::ID = 0;
+char llvm::orc::rpc::ConnectionClosed::ID = 0;
+char llvm::orc::rpc::ResponseAbandoned::ID = 0;
+char llvm::orc::rpc::CouldNotNegotiate::ID = 0;
+
+namespace llvm {
+namespace orc {
+namespace rpc {
+
+std::error_code ConnectionClosed::convertToErrorCode() const {
+ return orcError(OrcErrorCode::RPCConnectionClosed);
+}
+
+void ConnectionClosed::log(raw_ostream &OS) const {
+ OS << "RPC connection already closed";
+}
+
+std::error_code ResponseAbandoned::convertToErrorCode() const {
+ return orcError(OrcErrorCode::RPCResponseAbandoned);
+}
+
+void ResponseAbandoned::log(raw_ostream &OS) const {
+ OS << "RPC response abandoned";
+}
+
+CouldNotNegotiate::CouldNotNegotiate(std::string Signature)
+ : Signature(std::move(Signature)) {}
+
+std::error_code CouldNotNegotiate::convertToErrorCode() const {
+ return orcError(OrcErrorCode::RPCCouldNotNegotiateFunction);
+}
+
+void CouldNotNegotiate::log(raw_ostream &OS) const {
+ OS << "Could not negotiate RPC function " << Signature;
+}
+
+
+} // end namespace rpc
+} // end namespace orc
+} // end namespace llvm
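Since these RPC failure kinds are ordinary ErrorInfo subclasses (declared in RPCUtils.h), callers can dispatch on them with handleAllErrors. A hedged handling sketch, assuming the usual declarations in the header:

#include "llvm/ExecutionEngine/Orc/RPCUtils.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

void reportRPCError(llvm::Error Err) {
  llvm::handleAllErrors(
      std::move(Err),
      [](const llvm::orc::rpc::ConnectionClosed &) {
        llvm::errs() << "remote went away\n";
      },
      [](const llvm::orc::rpc::CouldNotNegotiate &E) {
        E.log(llvm::errs()); // prints the signature that failed to negotiate
        llvm::errs() << "\n";
      },
      [](const llvm::ErrorInfoBase &E) { // catch-all for other kinds
        E.log(llvm::errs());
        llvm::errs() << "\n";
      });
}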
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
index de73fbd..99e84b7 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp
@@ -134,6 +134,18 @@ void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr,
#endif
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
+ size_t Size) {
+ registerEHFramesInProcess(Addr, Size);
+ EHFrames.push_back({Addr, Size});
+}
+
+void RTDyldMemoryManager::deregisterEHFrames() {
+ for (auto &Frame : EHFrames)
+ deregisterEHFramesInProcess(Frame.Addr, Frame.Size);
+ EHFrames.clear();
+}
+
static int jit_noop() {
return 0;
}
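Because the register/deregister bookkeeping now lives in the RTDyldMemoryManager base class, subclasses can instrument the hooks and simply delegate. A hedged sketch (LoggingMemMgr is illustrative, not part of the patch):

#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/Support/raw_ostream.h"

class LoggingMemMgr : public llvm::SectionMemoryManager {
  void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr,
                        size_t Size) override {
    llvm::errs() << "EH frame: " << Size << " bytes\n";
    // The base records {Addr, Size}, so deregistration needs no arguments.
    llvm::RTDyldMemoryManager::registerEHFrames(Addr, LoadAddr, Size);
  }
  void deregisterEHFrames() override {
    llvm::RTDyldMemoryManager::deregisterEHFrames(); // walks the recorded list
  }
};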
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 63b56f7..8198836 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -12,13 +12,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "RuntimeDyldCheckerImpl.h"
#include "RuntimeDyldCOFF.h"
+#include "RuntimeDyldCheckerImpl.h"
#include "RuntimeDyldELF.h"
#include "RuntimeDyldImpl.h"
#include "RuntimeDyldMachO.h"
-#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MutexGuard.h"
@@ -73,7 +73,9 @@ namespace llvm {
void RuntimeDyldImpl::registerEHFrames() {}
-void RuntimeDyldImpl::deregisterEHFrames() {}
+void RuntimeDyldImpl::deregisterEHFrames() {
+ MemMgr.deregisterEHFrames();
+}
#ifndef NDEBUG
static void dumpSectionMemory(const SectionEntry &S, StringRef State) {
@@ -126,7 +128,10 @@ void RuntimeDyldImpl::resolveRelocations() {
);
// First, resolve relocations associated with external symbols.
- resolveExternalSymbols();
+ if (auto Err = resolveExternalSymbols()) {
+ HasError = true;
+ ErrorStr = toString(std::move(Err));
+ }
// Iterate over all outstanding relocations
for (auto it = Relocations.begin(), e = Relocations.end(); it != e; ++it) {
@@ -241,9 +246,11 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
continue;
// Then check the symbol resolver to see if there's a definition
// elsewhere in this logical dylib.
- if (auto Sym = Resolver.findSymbolInLogicalDylib(Name))
+ if (auto Sym = Resolver.findSymbolInLogicalDylib(Name)) {
if (Sym.getFlags().isStrongDefinition())
continue;
+ } else if (auto Err = Sym.takeError())
+ return std::move(Err);
// else
JITSymFlags &= ~JITSymbolFlags::Weak;
}
@@ -443,7 +450,7 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
SI != SE; ++SI) {
const SectionRef &Section = *SI;
- bool IsRequired = isRequiredForExecution(Section);
+ bool IsRequired = isRequiredForExecution(Section) || ProcessAllSections;
// Consider only the sections that are required to be loaded for execution
if (IsRequired) {
@@ -484,6 +491,14 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
}
}
+ // Compute the Global Offset Table size. If it is not zero, we
+ // also update the alignment, which equals the size of a single
+ // GOT entry.
+ if (unsigned GotSize = computeGOTSize(Obj)) {
+ RWSectionSizes.push_back(GotSize);
+ RWDataAlign = std::max<uint32_t>(RWDataAlign, getGOTEntrySize());
+ }
+
// Compute the size of all common symbols
uint64_t CommonSize = 0;
uint32_t CommonAlign = 1;
@@ -518,6 +533,24 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
return Error::success();
}
+// Compute the total GOT size required for this object.
+unsigned RuntimeDyldImpl::computeGOTSize(const ObjectFile &Obj) {
+ size_t GotEntrySize = getGOTEntrySize();
+ if (!GotEntrySize)
+ return 0;
+
+ size_t GotSize = 0;
+ for (section_iterator SI = Obj.section_begin(), SE = Obj.section_end();
+ SI != SE; ++SI) {
+
+ for (const RelocationRef &Reloc : SI->relocations())
+ if (relocationNeedsGot(Reloc))
+ GotSize += GotEntrySize;
+ }
+
+ return GotSize;
+}
+
// compute stub buffer size for the given section
unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj,
const SectionRef &Section) {
@@ -717,8 +750,8 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
Alignment = std::max(Alignment, getStubAlignment());
// Some sections, such as debug info, don't need to be loaded for execution.
- // Leave those where they are.
- if (IsRequired) {
+ // Process those only if explicitly requested.
+ if (IsRequired || ProcessAllSections) {
Allocate = DataSize + PaddingSize + StubBufSize;
if (!Allocate)
Allocate = 1;
@@ -762,6 +795,10 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
Sections.push_back(
SectionEntry(Name, Addr, DataSize, Allocate, (uintptr_t)pData));
+ // Debug info sections are linked as if their load address were zero
+ if (!IsRequired)
+ Sections.back().setLoadAddress(0);
+
if (Checker)
Checker->registerSection(Obj.getFileName(), SectionID);
@@ -921,7 +958,7 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs,
}
}
-void RuntimeDyldImpl::resolveExternalSymbols() {
+Error RuntimeDyldImpl::resolveExternalSymbols() {
while (!ExternalSymbolRelocations.empty()) {
StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin();
@@ -939,10 +976,24 @@ void RuntimeDyldImpl::resolveExternalSymbols() {
// This is an external symbol, try to get its address from the symbol
// resolver.
// First search for the symbol in this logical dylib.
- Addr = Resolver.findSymbolInLogicalDylib(Name.data()).getAddress();
+ if (auto Sym = Resolver.findSymbolInLogicalDylib(Name.data())) {
+ if (auto AddrOrErr = Sym.getAddress())
+ Addr = *AddrOrErr;
+ else
+ return AddrOrErr.takeError();
+ } else if (auto Err = Sym.takeError())
+ return Err;
+
// If that fails, try searching for an external symbol.
- if (!Addr)
- Addr = Resolver.findSymbol(Name.data()).getAddress();
+ if (!Addr) {
+ if (auto Sym = Resolver.findSymbol(Name.data())) {
+ if (auto AddrOrErr = Sym.getAddress())
+ Addr = *AddrOrErr;
+ else
+ return AddrOrErr.takeError();
+ } else if (auto Err = Sym.takeError())
+ return Err;
+ }
// The call to getSymbolAddress may have caused additional modules to
// be loaded, which may have added new entries to the
// ExternalSymbolRelocations map. Consequently, we need to update our
@@ -977,6 +1028,8 @@ void RuntimeDyldImpl::resolveExternalSymbols() {
ExternalSymbolRelocations.erase(i);
}
+
+ return Error::success();
}
//===----------------------------------------------------------------------===//
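The recurring pattern in the rewritten resolveExternalSymbols is worth isolating: a JITSymbol must be checked for a hard lookup error before its address is extracted, and the address itself now comes back as an Expected. A hedged helper sketch (the function name is illustrative):

#include "llvm/ExecutionEngine/JITSymbol.h"
using namespace llvm;

static Expected<JITTargetAddress> addressOf(JITSymbolResolver &R,
                                            const std::string &Name) {
  if (auto Sym = R.findSymbol(Name)) {
    // Lookup succeeded and the symbol is non-null; getAddress() may still
    // fail (e.g. if materialization fails), so it returns Expected.
    return Sym.getAddress();
  } else if (auto Err = Sym.takeError())
    return std::move(Err); // hard lookup failure
  return 0;                // clean "not found"
}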
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
index 1bd28ef..1c54ad6 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
@@ -27,9 +27,12 @@ using namespace llvm::object;
namespace {
class LoadedCOFFObjectInfo final
- : public RuntimeDyld::LoadedObjectInfoHelper<LoadedCOFFObjectInfo> {
+ : public LoadedObjectInfoHelper<LoadedCOFFObjectInfo,
+ RuntimeDyld::LoadedObjectInfo> {
public:
- LoadedCOFFObjectInfo(RuntimeDyldImpl &RTDyld, ObjSectionToIDMap ObjSecToIDMap)
+ LoadedCOFFObjectInfo(
+ RuntimeDyldImpl &RTDyld,
+ RuntimeDyld::LoadedObjectInfo::ObjSectionToIDMap ObjSecToIDMap)
: LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {}
OwningBinary<ObjectFile>
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 7bfa794..5bc7434 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -742,7 +742,7 @@ uint64_t RuntimeDyldCheckerImpl::getSymbolLocalAddr(StringRef Symbol) const {
uint64_t RuntimeDyldCheckerImpl::getSymbolRemoteAddr(StringRef Symbol) const {
if (auto InternalSymbol = getRTDyld().getSymbol(Symbol))
return InternalSymbol.getAddress();
- return getRTDyld().Resolver.findSymbol(Symbol).getAddress();
+ return cantFail(getRTDyld().Resolver.findSymbol(Symbol).getAddress());
}
uint64_t RuntimeDyldCheckerImpl::readMemoryAtAddr(uint64_t SrcAddr,
@@ -861,6 +861,15 @@ RuntimeDyldCheckerImpl::getSubsectionStartingAt(StringRef Name) const {
SymInfo.getOffset());
}
+Optional<uint64_t>
+RuntimeDyldCheckerImpl::getSectionLoadAddress(void *LocalAddress) const {
+ for (auto &S : getRTDyld().Sections) {
+ if (S.getAddress() == LocalAddress)
+ return S.getLoadAddress();
+ }
+ return Optional<uint64_t>();
+}
+
void RuntimeDyldCheckerImpl::registerSection(
StringRef FilePath, unsigned SectionID) {
StringRef FileName = sys::path::filename(FilePath);
@@ -935,3 +944,8 @@ RuntimeDyldChecker::getSectionAddr(StringRef FileName, StringRef SectionName,
bool LocalAddress) {
return Impl->getSectionAddr(FileName, SectionName, LocalAddress);
}
+
+Optional<uint64_t>
+RuntimeDyldChecker::getSectionLoadAddress(void *LocalAddress) const {
+ return Impl->getSectionLoadAddress(LocalAddress);
+}
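A small usage sketch of the new query (Checker and LocalPtr are assumed to exist in the surrounding test harness):

if (llvm::Optional<uint64_t> LoadAddr =
        Checker.getSectionLoadAddress(LocalPtr))
  llvm::errs() << "section loads at " << *LoadAddr << "\n";
else
  llvm::errs() << "no section starts at this local address\n";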
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h
index b7263be..b462ef2 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h
@@ -60,6 +60,8 @@ private:
bool IsInsideLoad) const;
StringRef getSubsectionStartingAt(StringRef Name) const;
+ Optional<uint64_t> getSectionLoadAddress(void *LocalAddr) const;
+
void registerSection(StringRef FilePath, unsigned SectionID);
void registerStubMap(StringRef FilePath, unsigned SectionID,
const RuntimeDyldImpl::StubMap &RTDyldStubs);
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 05615d3..77c9684 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -18,10 +18,10 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/TargetRegistry.h"
@@ -78,11 +78,11 @@ public:
void updateSymbolAddress(const SymbolRef &SymRef, uint64_t Addr);
// Methods for type inquiry through isa, cast and dyn_cast
- static inline bool classof(const Binary *v) {
+ static bool classof(const Binary *v) {
return (isa<ELFObjectFile<ELFT>>(v) &&
classof(cast<ELFObjectFile<ELFT>>(v)));
}
- static inline bool classof(const ELFObjectFile<ELFT> *v) {
+ static bool classof(const ELFObjectFile<ELFT> *v) {
return v->isDyldType();
}
};
@@ -123,7 +123,8 @@ void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef,
}
class LoadedELFObjectInfo final
- : public RuntimeDyld::LoadedObjectInfoHelper<LoadedELFObjectInfo> {
+ : public LoadedObjectInfoHelper<LoadedELFObjectInfo,
+ RuntimeDyld::LoadedObjectInfo> {
public:
LoadedELFObjectInfo(RuntimeDyldImpl &RTDyld, ObjSectionToIDMap ObjSecToIDMap)
: LoadedObjectInfoHelper(RTDyld, std::move(ObjSecToIDMap)) {}
@@ -221,22 +222,10 @@ void RuntimeDyldELF::registerEHFrames() {
uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
size_t EHFrameSize = Sections[EHFrameSID].getSize();
MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- RegisteredEHFrameSections.push_back(EHFrameSID);
}
UnregisteredEHFrameSections.clear();
}
-void RuntimeDyldELF::deregisterEHFrames() {
- for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) {
- SID EHFrameSID = RegisteredEHFrameSections[i];
- uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress();
- uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress();
- size_t EHFrameSize = Sections[EHFrameSID].getSize();
- MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
- }
- RegisteredEHFrameSections.clear();
-}
-
std::unique_ptr<RuntimeDyldELF>
llvm::RuntimeDyldELF::create(Triple::ArchType Arch,
RuntimeDyld::MemoryManager &MemMgr,
@@ -272,6 +261,8 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
default:
llvm_unreachable("Relocation type not implemented yet!");
break;
+ case ELF::R_X86_64_NONE:
+ break;
case ELF::R_X86_64_64: {
support::ulittle64_t::ref(Section.getAddressWithOffset(Offset)) =
Value + Addend;
@@ -419,6 +410,18 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
// from bits 11:0 of X
or32AArch64Imm(TargetPtr, Value + Addend);
break;
+ case ELF::R_AARCH64_LDST8_ABS_LO12_NC:
+ // Operation: S + A
+ // Immediate goes in bits 21:10 of LD/ST instruction, taken
+ // from bits 11:0 of X
+ or32AArch64Imm(TargetPtr, getBits(Value + Addend, 0, 11));
+ break;
+ case ELF::R_AARCH64_LDST16_ABS_LO12_NC:
+ // Operation: S + A
+ // Immediate goes in bits 21:10 of LD/ST instruction, taken
+ // from bits 11:1 of X
+ or32AArch64Imm(TargetPtr, getBits(Value + Addend, 1, 11));
+ break;
case ELF::R_AARCH64_LDST32_ABS_LO12_NC:
// Operation: S + A
// Immediate goes in bits 21:10 of LD/ST instruction, taken
@@ -431,6 +434,12 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
// from bits 11:3 of X
or32AArch64Imm(TargetPtr, getBits(Value + Addend, 3, 11));
break;
+ case ELF::R_AARCH64_LDST128_ABS_LO12_NC:
+ // Operation: S + A
+ // Immediate goes in bits 21:10 of LD/ST instruction, taken
+ // from bits 11:4 of X
+ or32AArch64Imm(TargetPtr, getBits(Value + Addend, 4, 11));
+ break;
}
}
@@ -729,23 +738,23 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
writeInt16BE(LocalAddress, applyPPCha(Delta));
} break;
case ELF::R_PPC64_ADDR32: {
- int32_t Result = static_cast<int32_t>(Value + Addend);
- if (SignExtend32<32>(Result) != Result)
+ int64_t Result = static_cast<int64_t>(Value + Addend);
+ if (SignExtend64<32>(Result) != Result)
llvm_unreachable("Relocation R_PPC64_ADDR32 overflow");
writeInt32BE(LocalAddress, Result);
} break;
case ELF::R_PPC64_REL24: {
uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
- int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend);
- if (SignExtend32<26>(delta) != delta)
+ int64_t delta = static_cast<int64_t>(Value - FinalAddress + Addend);
+ if (SignExtend64<26>(delta) != delta)
llvm_unreachable("Relocation R_PPC64_REL24 overflow");
// Generates a 'bl <address>' instruction
writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC));
} break;
case ELF::R_PPC64_REL32: {
uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset);
- int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend);
- if (SignExtend32<32>(delta) != delta)
+ int64_t delta = static_cast<int64_t>(Value - FinalAddress + Addend);
+ if (SignExtend64<32>(delta) != delta)
llvm_unreachable("Relocation R_PPC64_REL32 overflow");
writeInt32BE(LocalAddress, delta);
} break;
@@ -782,20 +791,63 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
writeInt32BE(LocalAddress, Delta / 2);
break;
}
+ case ELF::R_390_PC16: {
+ int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
+ assert(int16_t(Delta) == Delta && "R_390_PC16 overflow");
+ writeInt16BE(LocalAddress, Delta);
+ break;
+ }
case ELF::R_390_PC32: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
assert(int32_t(Delta) == Delta && "R_390_PC32 overflow");
writeInt32BE(LocalAddress, Delta);
break;
}
- case ELF::R_390_64:
- writeInt64BE(LocalAddress, Value + Addend);
- break;
case ELF::R_390_PC64: {
int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset);
writeInt64BE(LocalAddress, Delta);
break;
}
+ case ELF::R_390_8:
+ *LocalAddress = (uint8_t)(Value + Addend);
+ break;
+ case ELF::R_390_16:
+ writeInt16BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_32:
+ writeInt32BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_390_64:
+ writeInt64BE(LocalAddress, Value + Addend);
+ break;
+ }
+}
+
+void RuntimeDyldELF::resolveBPFRelocation(const SectionEntry &Section,
+ uint64_t Offset, uint64_t Value,
+ uint32_t Type, int64_t Addend) {
+ bool isBE = Arch == Triple::bpfeb;
+
+ switch (Type) {
+ default:
+ llvm_unreachable("Relocation type not implemented yet!");
+ break;
+ case ELF::R_BPF_NONE:
+ break;
+ case ELF::R_BPF_64_64: {
+ write(isBE, Section.getAddressWithOffset(Offset), Value + Addend);
+ DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend)) << " at "
+ << format("%p\n", Section.getAddressWithOffset(Offset)));
+ break;
+ }
+ case ELF::R_BPF_64_32: {
+ Value += Addend;
+ assert(Value <= UINT32_MAX);
+ write(isBE, Section.getAddressWithOffset(Offset), static_cast<uint32_t>(Value));
+ DEBUG(dbgs() << "Writing " << format("%p", Value) << " at "
+ << format("%p\n", Section.getAddressWithOffset(Offset)));
+ break;
+ }
}
}
@@ -859,6 +911,10 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
case Triple::systemz:
resolveSystemZRelocation(Section, Offset, Value, Type, Addend);
break;
+ case Triple::bpfel:
+ case Triple::bpfeb:
+ resolveBPFRelocation(Section, Offset, Value, Type, Addend);
+ break;
default:
llvm_unreachable("Unsupported CPU type!");
}
@@ -900,7 +956,7 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
}
// Sometimes we don't need to create thunk for a branch.
-// This typically happens when branch target is located
+// This typically happens when branch target is located
// in the same object file. In that case the target is either
// a weak symbol or a symbol in a different executable section.
// This function checks whether the branch target is located in the
@@ -941,6 +997,61 @@ bool RuntimeDyldELF::resolveAArch64ShortBranch(
return true;
}
+void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
+ const RelocationValueRef &Value,
+ relocation_iterator RelI,
+ StubMap &Stubs) {
+
+ DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
+ SectionEntry &Section = Sections[SectionID];
+
+ uint64_t Offset = RelI->getOffset();
+ unsigned RelType = RelI->getType();
+ // Look for an existing stub.
+ StubMap::const_iterator i = Stubs.find(Value);
+ if (i != Stubs.end()) {
+ resolveRelocation(Section, Offset,
+ (uint64_t)Section.getAddressWithOffset(i->second),
+ RelType, 0);
+ DEBUG(dbgs() << " Stub function found\n");
+ } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
+ // Create a new stub function.
+ DEBUG(dbgs() << " Create a new stub function\n");
+ Stubs[Value] = Section.getStubOffset();
+ uint8_t *StubTargetAddr = createStubFunction(
+ Section.getAddressWithOffset(Section.getStubOffset()));
+
+ RelocationEntry REmovz_g3(SectionID, StubTargetAddr - Section.getAddress(),
+ ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend);
+ RelocationEntry REmovk_g2(SectionID,
+ StubTargetAddr - Section.getAddress() + 4,
+ ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend);
+ RelocationEntry REmovk_g1(SectionID,
+ StubTargetAddr - Section.getAddress() + 8,
+ ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend);
+ RelocationEntry REmovk_g0(SectionID,
+ StubTargetAddr - Section.getAddress() + 12,
+ ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend);
+
+ if (Value.SymbolName) {
+ addRelocationForSymbol(REmovz_g3, Value.SymbolName);
+ addRelocationForSymbol(REmovk_g2, Value.SymbolName);
+ addRelocationForSymbol(REmovk_g1, Value.SymbolName);
+ addRelocationForSymbol(REmovk_g0, Value.SymbolName);
+ } else {
+ addRelocationForSection(REmovz_g3, Value.SectionID);
+ addRelocationForSection(REmovk_g2, Value.SectionID);
+ addRelocationForSection(REmovk_g1, Value.SectionID);
+ addRelocationForSection(REmovk_g0, Value.SectionID);
+ }
+ resolveRelocation(Section, Offset,
+ reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
+ Section.getStubOffset())),
+ RelType, 0);
+ Section.advanceStubOffset(getMaxStubSize());
+ }
+}
+
Expected<relocation_iterator>
RuntimeDyldELF::processRelocationRef(
unsigned SectionID, relocation_iterator RelI, const ObjectFile &O,
@@ -1035,55 +1146,22 @@ RuntimeDyldELF::processRelocationRef(
DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset
<< "\n");
- if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be) &&
- (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26)) {
- // This is an AArch64 branch relocation, need to use a stub function.
- DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
- SectionEntry &Section = Sections[SectionID];
-
- // Look for an existing stub.
- StubMap::const_iterator i = Stubs.find(Value);
- if (i != Stubs.end()) {
- resolveRelocation(Section, Offset,
- (uint64_t)Section.getAddressWithOffset(i->second),
- RelType, 0);
- DEBUG(dbgs() << " Stub function found\n");
- } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
- // Create a new stub function.
- DEBUG(dbgs() << " Create a new stub function\n");
- Stubs[Value] = Section.getStubOffset();
- uint8_t *StubTargetAddr = createStubFunction(
- Section.getAddressWithOffset(Section.getStubOffset()));
-
- RelocationEntry REmovz_g3(SectionID,
- StubTargetAddr - Section.getAddress(),
- ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend);
- RelocationEntry REmovk_g2(SectionID, StubTargetAddr -
- Section.getAddress() + 4,
- ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend);
- RelocationEntry REmovk_g1(SectionID, StubTargetAddr -
- Section.getAddress() + 8,
- ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend);
- RelocationEntry REmovk_g0(SectionID, StubTargetAddr -
- Section.getAddress() + 12,
- ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend);
-
- if (Value.SymbolName) {
- addRelocationForSymbol(REmovz_g3, Value.SymbolName);
- addRelocationForSymbol(REmovk_g2, Value.SymbolName);
- addRelocationForSymbol(REmovk_g1, Value.SymbolName);
- addRelocationForSymbol(REmovk_g0, Value.SymbolName);
- } else {
- addRelocationForSection(REmovz_g3, Value.SectionID);
- addRelocationForSection(REmovk_g2, Value.SectionID);
- addRelocationForSection(REmovk_g1, Value.SectionID);
- addRelocationForSection(REmovk_g0, Value.SectionID);
- }
- resolveRelocation(Section, Offset,
- reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
- Section.getStubOffset())),
- RelType, 0);
- Section.advanceStubOffset(getMaxStubSize());
+ if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be)) {
+ if (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26) {
+ resolveAArch64Branch(SectionID, Value, RelI, Stubs);
+ } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) {
+ // Create a new GOT entry or find an existing one. If a GOT entry is
+ // to be created, we also emit an ABS64 relocation for it.
+ uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64);
+ resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+ ELF::R_AARCH64_ADR_PREL_PG_HI21);
+
+ } else if (RelType == ELF::R_AARCH64_LD64_GOT_LO12_NC) {
+ uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64);
+ resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+ ELF::R_AARCH64_LDST64_ABS_LO12_NC);
+ } else {
+ processSimpleRelocation(SectionID, Offset, RelType, Value);
}
} else if (Arch == Triple::arm) {
if (RelType == ELF::R_ARM_PC24 || RelType == ELF::R_ARM_CALL ||
@@ -1232,7 +1310,7 @@ RuntimeDyldELF::processRelocationRef(
if (i != GOTSymbolOffsets.end())
RE.SymOffset = i->second;
else {
- RE.SymOffset = allocateGOTEntries(SectionID, 1);
+ RE.SymOffset = allocateGOTEntries(1);
GOTSymbolOffsets[TargetName] = RE.SymOffset;
}
}
@@ -1247,12 +1325,13 @@ RuntimeDyldELF::processRelocationRef(
Obj.getPlatformFlags(AbiVariant);
AbiVariant &= ELF::EF_PPC64_ABI;
// A PPC branch relocation will need a stub function if the target is
- // an external symbol (Symbol::ST_Unknown) or if the target address
- // is not within the signed 24-bits branch address.
+ // an external symbol (either Value.SymbolName is set, or SymType is
+ // Symbol::ST_Unknown) or if the target address is not within the
+ // signed 24-bits branch address.
SectionEntry &Section = Sections[SectionID];
uint8_t *Target = Section.getAddressWithOffset(Offset);
bool RangeOverflow = false;
- if (SymType != SymbolRef::ST_Unknown) {
+ if (!Value.SymbolName && SymType != SymbolRef::ST_Unknown) {
if (AbiVariant != 2) {
// In the ELFv1 ABI, a function call may point to the .opd entry,
// so the final symbol value is calculated based on the relocation
@@ -1267,21 +1346,19 @@ RuntimeDyldELF::processRelocationRef(
}
uint8_t *RelocTarget =
Sections[Value.SectionID].getAddressWithOffset(Value.Addend);
- int32_t delta = static_cast<int32_t>(Target - RelocTarget);
+ int64_t delta = static_cast<int64_t>(Target - RelocTarget);
// If it is within 26-bits branch range, just set the branch target
- if (SignExtend32<26>(delta) == delta) {
+ if (SignExtend64<26>(delta) == delta) {
RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
- if (Value.SymbolName)
- addRelocationForSymbol(RE, Value.SymbolName);
- else
- addRelocationForSection(RE, Value.SectionID);
+ addRelocationForSection(RE, Value.SectionID);
} else {
RangeOverflow = true;
}
}
- if (SymType == SymbolRef::ST_Unknown || RangeOverflow) {
- // It is an external symbol (SymbolRef::ST_Unknown) or within a range
- // larger than 24-bits.
+ if (Value.SymbolName || SymType == SymbolRef::ST_Unknown ||
+ RangeOverflow) {
+ // It is an external symbol (either Value.SymbolName is set, or
+ // SymType is SymbolRef::ST_Unknown) or out of range.
StubMap::const_iterator i = Stubs.find(Value);
if (i != Stubs.end()) {
// Symbol function stub already created, just relocate to it
@@ -1335,7 +1412,7 @@ RuntimeDyldELF::processRelocationRef(
RelType, 0);
Section.advanceStubOffset(getMaxStubSize());
}
- if (SymType == SymbolRef::ST_Unknown) {
+ if (Value.SymbolName || SymType == SymbolRef::ST_Unknown) {
// Restore the TOC for external calls
if (AbiVariant == 2)
writeInt32BE(Target + 4, 0xE8410018); // ld r2,28(r1)
@@ -1489,14 +1566,15 @@ RuntimeDyldELF::processRelocationRef(
Section.advanceStubOffset(getMaxStubSize());
// Allocate a GOT Entry
- uint64_t GOTOffset = allocateGOTEntries(SectionID, 1);
+ uint64_t GOTOffset = allocateGOTEntries(1);
// The load of the GOT address has an addend of -4
- resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4);
+ resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4,
+ ELF::R_X86_64_PC32);
// Fill in the value of the symbol we're targeting into the GOT
addRelocationForSymbol(
- computeGOTOffsetRE(SectionID, GOTOffset, 0, ELF::R_X86_64_64),
+ computeGOTOffsetRE(GOTOffset, 0, ELF::R_X86_64_64),
Value.SymbolName);
}
@@ -1511,11 +1589,13 @@ RuntimeDyldELF::processRelocationRef(
} else if (RelType == ELF::R_X86_64_GOTPCREL ||
RelType == ELF::R_X86_64_GOTPCRELX ||
RelType == ELF::R_X86_64_REX_GOTPCRELX) {
- uint64_t GOTOffset = allocateGOTEntries(SectionID, 1);
- resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend);
+ uint64_t GOTOffset = allocateGOTEntries(1);
+ resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+ ELF::R_X86_64_PC32);
// Fill in the value of the symbol we're targeting into the GOT
- RelocationEntry RE = computeGOTOffsetRE(SectionID, GOTOffset, Value.Offset, ELF::R_X86_64_64);
+ RelocationEntry RE =
+ computeGOTOffsetRE(GOTOffset, Value.Offset, ELF::R_X86_64_64);
if (Value.SymbolName)
addRelocationForSymbol(RE, Value.SymbolName);
else
@@ -1573,9 +1653,7 @@ size_t RuntimeDyldELF::getGOTEntrySize() {
return Result;
}
-uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no)
-{
- (void)SectionID; // The GOT Section is the same for all section in the object file
+uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned no) {
if (GOTSectionID == 0) {
GOTSectionID = Sections.size();
// Reserve a section id. We'll allocate the section later
@@ -1587,17 +1665,38 @@ uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no)
return StartOffset;
}
-void RuntimeDyldELF::resolveGOTOffsetRelocation(unsigned SectionID, uint64_t Offset, uint64_t GOTOffset)
-{
+uint64_t RuntimeDyldELF::findOrAllocGOTEntry(const RelocationValueRef &Value,
+ unsigned GOTRelType) {
+ auto E = GOTOffsetMap.insert({Value, 0});
+ if (E.second) {
+ uint64_t GOTOffset = allocateGOTEntries(1);
+
+ // Create relocation for newly created GOT entry
+ RelocationEntry RE =
+ computeGOTOffsetRE(GOTOffset, Value.Offset, GOTRelType);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+
+ E.first->second = GOTOffset;
+ }
+
+ return E.first->second;
+}
+
+void RuntimeDyldELF::resolveGOTOffsetRelocation(unsigned SectionID,
+ uint64_t Offset,
+ uint64_t GOTOffset,
+ uint32_t Type) {
// Fill in the relative address of the GOT Entry into the stub
- RelocationEntry GOTRE(SectionID, Offset, ELF::R_X86_64_PC32, GOTOffset);
+ RelocationEntry GOTRE(SectionID, Offset, Type, GOTOffset);
addRelocationForSection(GOTRE, GOTSectionID);
}
-RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(unsigned SectionID, uint64_t GOTOffset, uint64_t SymbolOffset,
- uint32_t Type)
-{
- (void)SectionID; // The GOT Section is the same for all section in the object file
+RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(uint64_t GOTOffset,
+ uint64_t SymbolOffset,
+ uint32_t Type) {
return RelocationEntry(GOTSectionID, GOTOffset, Type, SymbolOffset);
}
@@ -1663,6 +1762,19 @@ bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile &Obj) const {
return Obj.isELF();
}
+bool RuntimeDyldELF::relocationNeedsGot(const RelocationRef &R) const {
+ unsigned RelTy = R.getType();
+ if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be)
+ return RelTy == ELF::R_AARCH64_ADR_GOT_PAGE ||
+ RelTy == ELF::R_AARCH64_LD64_GOT_LO12_NC;
+
+ if (Arch == Triple::x86_64)
+ return RelTy == ELF::R_X86_64_GOTPCREL ||
+ RelTy == ELF::R_X86_64_GOTPCRELX ||
+ RelTy == ELF::R_X86_64_REX_GOTPCRELX;
+ return false;
+}
+
bool RuntimeDyldELF::relocationNeedsStub(const RelocationRef &R) const {
if (Arch != Triple::x86_64)
return true; // Conservative answer
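The findOrAllocGOTEntry added above relies on the single-lookup map idiom: insert a placeholder, and only pay for allocation when the insertion actually happened. The same idiom in isolation, with illustrative types:

#include <cstdint>
#include <map>

template <typename Key, typename AllocFn>
uint64_t findOrAlloc(std::map<Key, uint64_t> &M, const Key &K, AllocFn Alloc) {
  auto E = M.insert({K, 0}); // one lookup; E.second is true on first sight
  if (E.second)
    E.first->second = Alloc(); // allocate exactly once per key
  return E.first->second;
}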
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index d1867d0..fb5da6d 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -43,6 +43,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
bool resolveAArch64ShortBranch(unsigned SectionID, relocation_iterator RelI,
const RelocationValueRef &Value);
+ void resolveAArch64Branch(unsigned SectionID, const RelocationValueRef &Value,
+ relocation_iterator RelI, StubMap &Stubs);
+
void resolveARMRelocation(const SectionEntry &Section, uint64_t Offset,
uint32_t Value, uint32_t Type, int32_t Addend);
@@ -55,6 +58,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
void resolveSystemZRelocation(const SectionEntry &Section, uint64_t Offset,
uint64_t Value, uint32_t Type, int64_t Addend);
+ void resolveBPFRelocation(const SectionEntry &Section, uint64_t Offset,
+ uint64_t Value, uint32_t Type, int64_t Addend);
+
unsigned getMaxStubSize() override {
if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be)
return 20; // movz; movk; movk; movk; br
@@ -88,24 +94,26 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
ObjSectionToIDMap &LocalSections,
RelocationValueRef &Rel);
protected:
- size_t getGOTEntrySize();
+ size_t getGOTEntrySize() override;
private:
SectionEntry &getSection(unsigned SectionID) { return Sections[SectionID]; }
// Allocate 'no' (number of) GOT entries; the GOT is shared by the whole object.
- uint64_t allocateGOTEntries(unsigned SectionID, unsigned no);
+ uint64_t allocateGOTEntries(unsigned no);
+
+ // Find the GOT entry corresponding to the relocation, or create a new one.
+ uint64_t findOrAllocGOTEntry(const RelocationValueRef &Value,
+ unsigned GOTRelType);
// Resolve the relative address of GOTOffset in the section given by
// SectionID and place it at the given Offset
void resolveGOTOffsetRelocation(unsigned SectionID, uint64_t Offset,
- uint64_t GOTOffset);
+ uint64_t GOTOffset, uint32_t Type);
// For a GOT entry, compute a relocation entry that will place the
// final resolved value in the GOT slot
- RelocationEntry computeGOTOffsetRE(unsigned SectionID,
- uint64_t GOTOffset,
- uint64_t SymbolOffset,
+ RelocationEntry computeGOTOffsetRE(uint64_t GOTOffset, uint64_t SymbolOffset,
unsigned Type);
// Compute the address in memory where we can find the placeholder
@@ -144,8 +152,11 @@ private:
// in a table until we receive a request to register all unregistered
// EH frame sections with the memory manager.
SmallVector<SID, 2> UnregisteredEHFrameSections;
- SmallVector<SID, 2> RegisteredEHFrameSections;
+ // Map from a GOT relocation value to the corresponding GOT offset
+ std::map<RelocationValueRef, uint64_t> GOTOffsetMap;
+
+ bool relocationNeedsGot(const RelocationRef &R) const override;
bool relocationNeedsStub(const RelocationRef &R) const override;
public:
@@ -168,7 +179,6 @@ public:
StubMap &Stubs) override;
bool isCompatibleFile(const object::ObjectFile &Obj) const override;
void registerEHFrames() override;
- void deregisterEHFrames() override;
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override;
};
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 279d0de..95b04fd 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -28,8 +28,8 @@
#include "llvm/Support/Mutex.h"
#include "llvm/Support/SwapByteOrder.h"
#include <map>
-#include <unordered_map>
#include <system_error>
+#include <unordered_map>
using namespace llvm;
using namespace llvm::object;
@@ -213,7 +213,7 @@ public:
}
};
-/// @brief Symbol info for RuntimeDyld.
+/// @brief Symbol info for RuntimeDyld.
class SymbolTableEntry {
public:
SymbolTableEntry()
@@ -417,7 +417,7 @@ protected:
StubMap &Stubs) = 0;
/// \brief Resolve relocations to external symbols.
- void resolveExternalSymbols();
+ Error resolveExternalSymbols();
// \brief Compute an upper bound of the memory that is required to load all
// sections
@@ -426,6 +426,9 @@ protected:
uint64_t &RODataSize, uint32_t &RODataAlign,
uint64_t &RWDataSize, uint32_t &RWDataAlign);
+ // \brief Compute the total GOT size required for the object
+ unsigned computeGOTSize(const ObjectFile &Obj);
+
// \brief Compute the stub buffer size required for a section
unsigned computeSectionStubBufSize(const ObjectFile &Obj,
const SectionRef &Section);
@@ -433,6 +436,14 @@ protected:
// \brief Implementation of the generic part of the loadObject algorithm.
Expected<ObjSectionToIDMap> loadObjectImpl(const object::ObjectFile &Obj);
+ // \brief Return the size of a Global Offset Table (GOT) entry
+ virtual size_t getGOTEntrySize() { return 0; }
+
+ // \brief Return true if the relocation R may require allocating a GOT entry.
+ virtual bool relocationNeedsGot(const RelocationRef &R) const {
+ return false;
+ }
+
// \brief Return true if the relocation R may require allocating a stub.
virtual bool relocationNeedsStub(const RelocationRef &R) const {
return true; // Conservative answer
@@ -504,7 +515,7 @@ public:
virtual void registerEHFrames();
- virtual void deregisterEHFrames();
+ void deregisterEHFrames();
virtual Error finalizeLoad(const ObjectFile &ObjImg,
ObjSectionToIDMap &SectionMap) {
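The two new virtual hooks define a simple sizing contract: a backend reports a per-entry size (zero opts out) plus a predicate over relocations, and the generic computeGOTSize multiplies them out. The contract in isolation, as a hedged sketch with illustrative types:

#include <cstddef>

template <typename RelocRange, typename Pred>
size_t gotBytes(const RelocRange &Relocs, size_t EntrySize, Pred NeedsGot) {
  if (!EntrySize)
    return 0; // backend without GOT support (the default getGOTEntrySize())
  size_t N = 0;
  for (const auto &R : Relocs)
    if (NeedsGot(R))
      ++N;
  return N * EntrySize;
}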
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 00541e8..80e9c7a 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -27,7 +27,8 @@ using namespace llvm::object;
namespace {
class LoadedMachOObjectInfo final
- : public RuntimeDyld::LoadedObjectInfoHelper<LoadedMachOObjectInfo> {
+ : public LoadedObjectInfoHelper<LoadedMachOObjectInfo,
+ RuntimeDyld::LoadedObjectInfo> {
public:
LoadedMachOObjectInfo(RuntimeDyldImpl &RTDyld,
ObjSectionToIDMap ObjSecToIDMap)
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
index 0398413..901f778 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFI386_H
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFI386_H
-#include "llvm/Object/COFF.h"
-#include "llvm/Support/COFF.h"
#include "../RuntimeDyldCOFF.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
#define DEBUG_TYPE "dyld"
@@ -217,7 +217,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index 8c6af0b..3e4b0c8 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFTHUMB_H
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFFTHUMB_H
-#include "llvm/Object/COFF.h"
-#include "llvm/Support/COFF.h"
#include "../RuntimeDyldCOFF.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
#define DEBUG_TYPE "dyld"
@@ -316,7 +316,6 @@ public:
}
void registerEHFrames() override {}
- void deregisterEHFrames() override {}
};
}
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index 109beb3..7cbb438 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFF86_64_H
#define LLVM_LIB_EXECUTIONENGINE_RUNTIMEDYLD_TARGETS_RUNTIMEDYLDCOFF86_64_H
-#include "llvm/Object/COFF.h"
-#include "llvm/Support/COFF.h"
#include "../RuntimeDyldCOFF.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/COFF.h"
#define DEBUG_TYPE "dyld"
@@ -194,9 +194,6 @@ public:
}
UnregisteredEHFrameSections.clear();
}
- void deregisterEHFrames() override {
- // Stub
- }
Error finalizeLoad(const ObjectFile &Obj,
ObjSectionToIDMap &SectionMap) override {
// Look for and record the EH frame section IDs.
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
index cae4d69..926996d 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldELFMips.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "RuntimeDyldELFMips.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/BinaryFormat/ELF.h"
#define DEBUG_TYPE "dyld"
diff --git a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
index adca0ee..43461de 100644
--- a/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
+++ b/contrib/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldMachOARM.h
@@ -288,7 +288,6 @@ private:
HalfDiffKindBits);
addRelocationForSection(R, SectionAID);
- addRelocationForSection(R, SectionBID);
return ++RelI;
}
diff --git a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
index 50478ea..8904475 100644
--- a/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
+++ b/contrib/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Config/config.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Config/config.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Process.h"
diff --git a/contrib/llvm/lib/IR/AsmWriter.cpp b/contrib/llvm/lib/IR/AsmWriter.cpp
index eecef94..170bc54 100644
--- a/contrib/llvm/lib/IR/AsmWriter.cpp
+++ b/contrib/llvm/lib/IR/AsmWriter.cpp
@@ -1,5 +1,4 @@
-
-//===-- AsmWriter.cpp - Printing LLVM as an assembly file -----------------===//
+//===- AsmWriter.cpp - Printing LLVM as an assembly file ------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,62 +14,105 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Argument.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/Use.h"
#include "llvm/IR/UseListOrder.h"
-#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
using namespace llvm;
// Make virtual table appear in this compilation unit.
-AssemblyAnnotationWriter::~AssemblyAnnotationWriter() {}
+AssemblyAnnotationWriter::~AssemblyAnnotationWriter() = default;
//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//
namespace {
+
struct OrderMap {
DenseMap<const Value *, std::pair<unsigned, bool>> IDs;
unsigned size() const { return IDs.size(); }
std::pair<unsigned, bool> &operator[](const Value *V) { return IDs[V]; }
+
std::pair<unsigned, bool> lookup(const Value *V) const {
return IDs.lookup(V);
}
+
void index(const Value *V) {
// Explicitly sequence get-size and insert-value operations to avoid UB.
unsigned ID = IDs.size() + 1;
IDs[V].first = ID;
}
};
-}
+
+} // end anonymous namespace
static void orderValue(const Value *V, OrderMap &OM) {
if (OM.lookup(V).first)
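The `= default` rewrite of ~AssemblyAnnotationWriter earlier in this hunk keeps the comment above it honest: an out-of-line definition of the first virtual member still acts as the class's key function, so the vtable is emitted once in this object file rather than weakly in every includer. A generic sketch of the idiom (names are placeholders):

    // Header: declare the virtual destructor without an inline body.
    struct Annotator {
      virtual ~Annotator(); // key function: defined out of line
      virtual void note() {}
    };

    // Exactly one .cpp: the defaulted definition anchors the vtable here.
    Annotator::~Annotator() = default;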
@@ -138,7 +180,7 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
unsigned ID, const OrderMap &OM,
UseListOrderStack &Stack) {
// Predict use-list order for this one.
- typedef std::pair<const Use *, unsigned> Entry;
+ using Entry = std::pair<const Use *, unsigned>;
SmallVector<Entry, 64> List;
for (const Use &U : V->uses())
// Check if this user will be serialized.
@@ -323,7 +365,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break;
case CallingConv::PTX_Device: Out << "ptx_device"; break;
case CallingConv::X86_64_SysV: Out << "x86_64_sysvcc"; break;
- case CallingConv::X86_64_Win64: Out << "x86_64_win64cc"; break;
+ case CallingConv::Win64: Out << "win64cc"; break;
case CallingConv::SPIR_FUNC: Out << "spir_func"; break;
case CallingConv::SPIR_KERNEL: Out << "spir_kernel"; break;
case CallingConv::Swift: Out << "swiftcc"; break;
@@ -331,6 +373,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
case CallingConv::HHVM: Out << "hhvmcc"; break;
case CallingConv::HHVM_C: Out << "hhvm_ccc"; break;
case CallingConv::AMDGPU_VS: Out << "amdgpu_vs"; break;
+ case CallingConv::AMDGPU_HS: Out << "amdgpu_hs"; break;
case CallingConv::AMDGPU_GS: Out << "amdgpu_gs"; break;
case CallingConv::AMDGPU_PS: Out << "amdgpu_ps"; break;
case CallingConv::AMDGPU_CS: Out << "amdgpu_cs"; break;
@@ -419,13 +462,10 @@ static void PrintLLVMName(raw_ostream &OS, const Value *V) {
isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
}
-
namespace {
+
class TypePrinting {
- TypePrinting(const TypePrinting &) = delete;
- void operator=(const TypePrinting&) = delete;
public:
-
/// NamedTypes - The named types that are used by the current module.
TypeFinder NamedTypes;
@@ -433,6 +473,8 @@ public:
DenseMap<StructType*, unsigned> NumberedTypes;
TypePrinting() = default;
+ TypePrinting(const TypePrinting &) = delete;
+ TypePrinting &operator=(const TypePrinting &) = delete;
void incorporateTypes(const Module &M);
@@ -440,7 +482,8 @@ public:
void printStructBody(StructType *Ty, raw_ostream &OS);
};
-} // namespace
+
+} // end anonymous namespace
void TypePrinting::incorporateTypes(const Module &M) {
NamedTypes.run(M, false);
@@ -572,6 +615,7 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
}
namespace llvm {
+
//===----------------------------------------------------------------------===//
// SlotTracker Class: Enumerate slot numbers for unnamed values
//===----------------------------------------------------------------------===//
@@ -580,32 +624,33 @@ namespace llvm {
class SlotTracker {
public:
/// ValueMap - A mapping of Values to slot numbers.
- typedef DenseMap<const Value*, unsigned> ValueMap;
+ using ValueMap = DenseMap<const Value *, unsigned>;
private:
/// TheModule - The module for which we are holding slot numbers.
const Module* TheModule;
/// TheFunction - The function for which we are holding slot numbers.
- const Function* TheFunction;
- bool FunctionProcessed;
+ const Function* TheFunction = nullptr;
+ bool FunctionProcessed = false;
bool ShouldInitializeAllMetadata;
/// mMap - The slot map for the module level data.
ValueMap mMap;
- unsigned mNext;
+ unsigned mNext = 0;
/// fMap - The slot map for the function level data.
ValueMap fMap;
- unsigned fNext;
+ unsigned fNext = 0;
/// mdnMap - Map for MDNodes.
DenseMap<const MDNode*, unsigned> mdnMap;
- unsigned mdnNext;
+ unsigned mdnNext = 0;
/// asMap - The slot map for attribute sets.
DenseMap<AttributeSet, unsigned> asMap;
- unsigned asNext;
+ unsigned asNext = 0;
+
public:
/// Construct from a module.
///
@@ -614,6 +659,7 @@ public:
/// within a function (even if no functions have been initialized).
explicit SlotTracker(const Module *M,
bool ShouldInitializeAllMetadata = false);
+
/// Construct from a function, starting out in incorp state.
///
/// If \c ShouldInitializeAllMetadata, initializes all metadata in all
@@ -622,6 +668,9 @@ public:
explicit SlotTracker(const Function *F,
bool ShouldInitializeAllMetadata = false);
+ SlotTracker(const SlotTracker &) = delete;
+ SlotTracker &operator=(const SlotTracker &) = delete;
+
/// Return the slot number of the specified value in its type
/// plane. If something is not in the SlotTracker, return -1.
int getLocalSlot(const Value *V);
@@ -644,14 +693,16 @@ public:
void purgeFunction();
/// MDNode map iterators.
- typedef DenseMap<const MDNode*, unsigned>::iterator mdn_iterator;
+ using mdn_iterator = DenseMap<const MDNode*, unsigned>::iterator;
+
mdn_iterator mdn_begin() { return mdnMap.begin(); }
mdn_iterator mdn_end() { return mdnMap.end(); }
unsigned mdn_size() const { return mdnMap.size(); }
bool mdn_empty() const { return mdnMap.empty(); }
/// AttributeSet map iterators.
- typedef DenseMap<AttributeSet, unsigned>::iterator as_iterator;
+ using as_iterator = DenseMap<AttributeSet, unsigned>::iterator;
+
as_iterator as_begin() { return asMap.begin(); }
as_iterator as_end() { return asMap.end(); }
unsigned as_size() const { return asMap.size(); }
@@ -689,11 +740,9 @@ private:
/// Add all of the metadata from an instruction.
void processInstructionMetadata(const Instruction &I);
-
- SlotTracker(const SlotTracker &) = delete;
- void operator=(const SlotTracker &) = delete;
};
-} // namespace llvm
+
+} // end namespace llvm
ModuleSlotTracker::ModuleSlotTracker(SlotTracker &Machine, const Module *M,
const Function *F)
@@ -704,7 +753,7 @@ ModuleSlotTracker::ModuleSlotTracker(const Module *M,
: ShouldCreateStorage(M),
ShouldInitializeAllMetadata(ShouldInitializeAllMetadata), M(M) {}
-ModuleSlotTracker::~ModuleSlotTracker() {}
+ModuleSlotTracker::~ModuleSlotTracker() = default;
SlotTracker *ModuleSlotTracker::getMachine() {
if (!ShouldCreateStorage)
@@ -771,17 +820,13 @@ static SlotTracker *createSlotTracker(const Value *V) {
// Module level constructor. Causes the contents of the Module (sans functions)
// to be added to the slot table.
SlotTracker::SlotTracker(const Module *M, bool ShouldInitializeAllMetadata)
- : TheModule(M), TheFunction(nullptr), FunctionProcessed(false),
- ShouldInitializeAllMetadata(ShouldInitializeAllMetadata), mNext(0),
- fNext(0), mdnNext(0), asNext(0) {}
+ : TheModule(M), ShouldInitializeAllMetadata(ShouldInitializeAllMetadata) {}
// Function level constructor. Causes the contents of the Module and the one
// function provided to be added to the slot table.
SlotTracker::SlotTracker(const Function *F, bool ShouldInitializeAllMetadata)
: TheModule(F ? F->getParent() : nullptr), TheFunction(F),
- FunctionProcessed(false),
- ShouldInitializeAllMetadata(ShouldInitializeAllMetadata), mNext(0),
- fNext(0), mdnNext(0), asNext(0) {}
+ ShouldInitializeAllMetadata(ShouldInitializeAllMetadata) {}
inline void SlotTracker::initialize() {
if (TheModule) {
@@ -803,6 +848,9 @@ void SlotTracker::processModule() {
if (!Var.hasName())
CreateModuleSlot(&Var);
processGlobalObjectMetadata(Var);
+ auto Attrs = Var.getAttributes();
+ if (Attrs.hasAttributes())
+ CreateAttributeSetSlot(Attrs);
}
for (const GlobalAlias &A : TheModule->aliases()) {
@@ -832,7 +880,7 @@ void SlotTracker::processModule() {
// Add all the function attributes to the table.
// FIXME: Add attributes of other objects?
AttributeSet FnAttrs = F.getAttributes().getFnAttributes();
- if (FnAttrs.hasAttributes(AttributeSet::FunctionIndex))
+ if (FnAttrs.hasAttributes())
CreateAttributeSetSlot(FnAttrs);
}
@@ -867,15 +915,10 @@ void SlotTracker::processFunction() {
// We allow direct calls to any llvm.foo function here, because the
// target may not be linked into the optimizer.
- if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
- // Add all the call attributes to the table.
- AttributeSet Attrs = CI->getAttributes().getFnAttributes();
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
- CreateAttributeSetSlot(Attrs);
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ if (auto CS = ImmutableCallSite(&I)) {
// Add all the call attributes to the table.
- AttributeSet Attrs = II->getAttributes().getFnAttributes();
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+ AttributeSet Attrs = CS.getAttributes().getFnAttributes();
+ if (Attrs.hasAttributes())
CreateAttributeSetSlot(Attrs);
}
}
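The hunk above collapses the duplicated CallInst/InvokeInst branches: ImmutableCallSite wraps either instruction kind and evaluates to false for anything else, which is what makes the single `if (auto CS = ...)` test sufficient. A standalone sketch of the pattern, assuming the pre-CallBase CallSite API this tree uses (the helper is hypothetical):

    #include "llvm/IR/CallSite.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // True if I is a call or invoke carrying function-level attributes.
    static bool hasFnAttrs(const Instruction &I) {
      if (auto CS = ImmutableCallSite(&I)) // false for non-call instructions
        return CS.getAttributes().getFnAttributes().hasAttributes();
      return false;
    }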
@@ -949,7 +992,6 @@ int SlotTracker::getMetadataSlot(const MDNode *N) {
return MI == mdnMap.end() ? -1 : (int)MI->second;
}
-
/// getLocalSlot - Get the slot number for a value that is local to a function.
int SlotTracker::getLocalSlot(const Value *V) {
assert(!isa<Constant>(V) && "Can't get a constant or global slot with this!");
@@ -1016,8 +1058,7 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) {
}
void SlotTracker::CreateAttributeSetSlot(AttributeSet AS) {
- assert(AS.hasAttributes(AttributeSet::FunctionIndex) &&
- "Doesn't need a slot!");
+ assert(AS.hasAttributes() && "Doesn't need a slot!");
as_iterator I = asMap.find(AS);
if (I != asMap.end())
@@ -1073,6 +1114,8 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
Out << " nsz";
if (FPO->hasAllowReciprocal())
Out << " arcp";
+ if (FPO->hasAllowContract())
+ Out << " contract";
}
}
@@ -1106,35 +1149,34 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
- if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEsingle() ||
- &CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble()) {
+ const APFloat &APF = CFP->getValueAPF();
+ if (&APF.getSemantics() == &APFloat::IEEEsingle() ||
+ &APF.getSemantics() == &APFloat::IEEEdouble()) {
// We would like to output the FP constant value in exponential notation,
// but we cannot do this if doing so will lose precision. Check here to
// make sure that we only output it in exponential format if we can parse
// the value back and get the same value.
//
bool ignored;
- bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble();
- bool isInf = CFP->getValueAPF().isInfinity();
- bool isNaN = CFP->getValueAPF().isNaN();
+ bool isDouble = &APF.getSemantics() == &APFloat::IEEEdouble();
+ bool isInf = APF.isInfinity();
+ bool isNaN = APF.isNaN();
if (!isInf && !isNaN) {
- double Val = isDouble ? CFP->getValueAPF().convertToDouble() :
- CFP->getValueAPF().convertToFloat();
+ double Val = isDouble ? APF.convertToDouble() : APF.convertToFloat();
SmallString<128> StrVal;
- raw_svector_ostream(StrVal) << Val;
-
+ APF.toString(StrVal, 6, 0, false);
// Check to make sure that the stringized number is not some string like
// "Inf" or NaN, that atof will accept, but the lexer will not. Check
// that the string matches the "[-+]?[0-9]" regex.
//
- if ((StrVal[0] >= '0' && StrVal[0] <= '9') ||
- ((StrVal[0] == '-' || StrVal[0] == '+') &&
- (StrVal[1] >= '0' && StrVal[1] <= '9'))) {
- // Reparse stringized version!
- if (APFloat(APFloat::IEEEdouble(), StrVal).convertToDouble() == Val) {
- Out << StrVal;
- return;
- }
+ assert(((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+ ((StrVal[0] == '-' || StrVal[0] == '+') &&
+ (StrVal[1] >= '0' && StrVal[1] <= '9'))) &&
+ "[-+]?[0-9] regex does not match!");
+ // Reparse stringized version!
+ if (APFloat(APFloat::IEEEdouble(), StrVal).convertToDouble() == Val) {
+ Out << StrVal;
+ return;
}
}
// Otherwise we could not reparse it to exactly the same value, so we must
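The rewritten block prints a float or double in decimal only when the printed form survives a round trip; since APF.toString at six significant digits always produces the `[-+]?[0-9]` shape, the old string test becomes an assert and only the reparse comparison remains a runtime decision. A minimal sketch of that round-trip check, assuming the same APFloat/SmallString APIs (the helper name is mine):

    #include "llvm/ADT/APFloat.h"
    #include "llvm/ADT/SmallString.h"
    using namespace llvm;

    // Decimal output is safe only if reparsing the 6-digit string
    // reproduces the value bit-for-bit.
    static bool roundTripsAsDecimal(double Val) {
      SmallString<128> Str;
      APFloat(Val).toString(Str, /*FormatPrecision=*/6,
                            /*FormatMaxPadding=*/0, /*TruncateZero=*/false);
      return APFloat(APFloat::IEEEdouble(), Str).convertToDouble() == Val;
    }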
@@ -1143,7 +1185,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
// x86, so we must not use these types.
static_assert(sizeof(double) == sizeof(uint64_t),
"assuming that double is 64 bits!");
- APFloat apf = CFP->getValueAPF();
+ APFloat apf = APF;
// Floats are represented in ASCII IR as double, convert.
if (!isDouble)
apf.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
@@ -1156,27 +1198,27 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
// These appear as a magic letter identifying the type, then a
// fixed number of hex digits.
Out << "0x";
- APInt API = CFP->getValueAPF().bitcastToAPInt();
- if (&CFP->getValueAPF().getSemantics() == &APFloat::x87DoubleExtended()) {
+ APInt API = APF.bitcastToAPInt();
+ if (&APF.getSemantics() == &APFloat::x87DoubleExtended()) {
Out << 'K';
Out << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
/*Upper=*/true);
Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
/*Upper=*/true);
return;
- } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEquad()) {
+ } else if (&APF.getSemantics() == &APFloat::IEEEquad()) {
Out << 'L';
Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
/*Upper=*/true);
Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
/*Upper=*/true);
- } else if (&CFP->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble()) {
+ } else if (&APF.getSemantics() == &APFloat::PPCDoubleDouble()) {
Out << 'M';
Out << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
/*Upper=*/true);
Out << format_hex_no_prefix(API.getHiBits(64).getZExtValue(), 16,
/*Upper=*/true);
- } else if (&CFP->getValueAPF().getSemantics() == &APFloat::IEEEhalf()) {
+ } else if (&APF.getSemantics() == &APFloat::IEEEhalf()) {
Out << 'H';
Out << format_hex_no_prefix(API.getZExtValue(), 4,
/*Upper=*/true);
@@ -1248,7 +1290,6 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
return;
}
-
if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
if (CS->getType()->isPacked())
Out << '<';
@@ -1381,11 +1422,14 @@ static void writeMDTuple(raw_ostream &Out, const MDTuple *Node,
}
namespace {
+
struct FieldSeparator {
- bool Skip;
+ bool Skip = true;
const char *Sep;
- FieldSeparator(const char *Sep = ", ") : Skip(true), Sep(Sep) {}
+
+ FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
};
+
raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
if (FS.Skip) {
FS.Skip = false;
@@ -1393,19 +1437,20 @@ raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
}
return OS << FS.Sep;
}
+
struct MDFieldPrinter {
raw_ostream &Out;
FieldSeparator FS;
- TypePrinting *TypePrinter;
- SlotTracker *Machine;
- const Module *Context;
+ TypePrinting *TypePrinter = nullptr;
+ SlotTracker *Machine = nullptr;
+ const Module *Context = nullptr;
- explicit MDFieldPrinter(raw_ostream &Out)
- : Out(Out), TypePrinter(nullptr), Machine(nullptr), Context(nullptr) {}
+ explicit MDFieldPrinter(raw_ostream &Out) : Out(Out) {}
MDFieldPrinter(raw_ostream &Out, TypePrinting *TypePrinter,
SlotTracker *Machine, const Module *Context)
: Out(Out), TypePrinter(TypePrinter), Machine(Machine), Context(Context) {
}
+
void printTag(const DINode *N);
void printMacinfoType(const DIMacroNode *N);
void printChecksumKind(const DIFile *N);
@@ -1422,7 +1467,8 @@ struct MDFieldPrinter {
bool ShouldSkipZero = true);
void printEmissionKind(StringRef Name, DICompileUnit::DebugEmissionKind EK);
};
-} // end namespace
+
+} // end anonymous namespace
void MDFieldPrinter::printTag(const DINode *N) {
Out << FS << "tag: ";
@@ -1518,7 +1564,6 @@ void MDFieldPrinter::printEmissionKind(StringRef Name,
Out << FS << Name << ": " << DICompileUnit::EmissionKindString(EK);
}
-
template <class IntTy, class Stringifier>
void MDFieldPrinter::printDwarfEnum(StringRef Name, IntTy Value,
Stringifier toString, bool ShouldSkipZero) {
@@ -1614,6 +1659,9 @@ static void writeDIDerivedType(raw_ostream &Out, const DIDerivedType *N,
Printer.printInt("offset", N->getOffsetInBits());
Printer.printDIFlags("flags", N->getFlags());
Printer.printMetadata("extraData", N->getRawExtraData());
+ if (const auto &DWARFAddressSpace = N->getDWARFAddressSpace())
+ Printer.printInt("dwarfAddressSpace", *DWARFAddressSpace,
+ /* ShouldSkipZero */ false);
Out << ")";
}
@@ -1688,6 +1736,8 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
Printer.printMetadata("macros", N->getRawMacros());
Printer.printInt("dwoId", N->getDWOId());
Printer.printBool("splitDebugInlining", N->getSplitDebugInlining(), true);
+ Printer.printBool("debugInfoForProfiling", N->getDebugInfoForProfiling(),
+ false);
Out << ")";
}
@@ -1718,6 +1768,7 @@ static void writeDISubprogram(raw_ostream &Out, const DISubprogram *N,
Printer.printMetadata("templateParams", N->getRawTemplateParams());
Printer.printMetadata("declaration", N->getRawDeclaration());
Printer.printMetadata("variables", N->getRawVariables());
+ Printer.printMetadata("thrownTypes", N->getRawThrownTypes());
Out << ")";
}
@@ -1754,8 +1805,6 @@ static void writeDINamespace(raw_ostream &Out, const DINamespace *N,
MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
Printer.printString("name", N->getName());
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
- Printer.printMetadata("file", N->getRawFile());
- Printer.printInt("line", N->getLine());
Printer.printBool("exportSymbols", N->getExportSymbols(), false);
Out << ")";
}
@@ -1915,11 +1964,11 @@ static void writeDIImportedEntity(raw_ostream &Out, const DIImportedEntity *N,
Printer.printString("name", N->getName());
Printer.printMetadata("scope", N->getRawScope(), /* ShouldSkipNull */ false);
Printer.printMetadata("entity", N->getRawEntity());
+ Printer.printMetadata("file", N->getRawFile());
Printer.printInt("line", N->getLine());
Out << ")";
}
-
static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
TypePrinting *TypePrinter,
SlotTracker *Machine,
@@ -2058,6 +2107,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Metadata *MD,
}
namespace {
+
class AssemblyWriter {
formatted_raw_ostream &Out;
const Module *TheModule;
@@ -2070,6 +2120,8 @@ class AssemblyWriter {
bool ShouldPreserveUseListOrder;
UseListOrderStack UseListOrders;
SmallVector<StringRef, 8> MDNames;
+ /// Synchronization scope names registered with LLVMContext.
+ SmallVector<StringRef, 8> SSNs;
public:
/// Construct an AssemblyWriter with an external SlotTracker
@@ -2083,12 +2135,17 @@ public:
void printModule(const Module *M);
void writeOperand(const Value *Op, bool PrintType);
- void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
+ void writeParamOperand(const Value *Operand, AttributeSet Attrs);
void writeOperandBundles(ImmutableCallSite CS);
- void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
- void writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
+ void writeSyncScope(const LLVMContext &Context,
+ SyncScope::ID SSID);
+ void writeAtomic(const LLVMContext &Context,
+ AtomicOrdering Ordering,
+ SyncScope::ID SSID);
+ void writeAtomicCmpXchg(const LLVMContext &Context,
+ AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope);
+ SyncScope::ID SSID);
void writeAllMDNodes();
void writeMDNode(unsigned Slot, const MDNode *Node);
@@ -2099,7 +2156,7 @@ public:
void printIndirectSymbol(const GlobalIndirectSymbol *GIS);
void printComdat(const Comdat *C);
void printFunction(const Function *F);
- void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
+ void printArgument(const Argument *FA, AttributeSet Attrs);
void printBasicBlock(const BasicBlock *BB);
void printInstructionLine(const Instruction &I);
void printInstruction(const Instruction &I);
@@ -2121,7 +2178,8 @@ private:
// intrinsic indicating base and derived pointer names.
void printGCRelocateComment(const GCRelocateInst &Relocate);
};
-} // namespace
+
+} // end anonymous namespace
AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
const Module *M, AssemblyAnnotationWriter *AAW,
@@ -2149,36 +2207,48 @@ void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
}
-void AssemblyWriter::writeAtomic(AtomicOrdering Ordering,
- SynchronizationScope SynchScope) {
- if (Ordering == AtomicOrdering::NotAtomic)
- return;
+void AssemblyWriter::writeSyncScope(const LLVMContext &Context,
+ SyncScope::ID SSID) {
+ switch (SSID) {
+ case SyncScope::System: {
+ break;
+ }
+ default: {
+ if (SSNs.empty())
+ Context.getSyncScopeNames(SSNs);
- switch (SynchScope) {
- case SingleThread: Out << " singlethread"; break;
- case CrossThread: break;
+ Out << " syncscope(\"";
+ PrintEscapedString(SSNs[SSID], Out);
+ Out << "\")";
+ break;
+ }
}
+}
+
+void AssemblyWriter::writeAtomic(const LLVMContext &Context,
+ AtomicOrdering Ordering,
+ SyncScope::ID SSID) {
+ if (Ordering == AtomicOrdering::NotAtomic)
+ return;
+ writeSyncScope(Context, SSID);
Out << " " << toIRString(Ordering);
}
-void AssemblyWriter::writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
+void AssemblyWriter::writeAtomicCmpXchg(const LLVMContext &Context,
+ AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope) {
+ SyncScope::ID SSID) {
assert(SuccessOrdering != AtomicOrdering::NotAtomic &&
FailureOrdering != AtomicOrdering::NotAtomic);
- switch (SynchScope) {
- case SingleThread: Out << " singlethread"; break;
- case CrossThread: break;
- }
-
+ writeSyncScope(Context, SSID);
Out << " " << toIRString(SuccessOrdering);
Out << " " << toIRString(FailureOrdering);
}
void AssemblyWriter::writeParamOperand(const Value *Operand,
- AttributeSet Attrs, unsigned Idx) {
+ AttributeSet Attrs) {
if (!Operand) {
Out << "<null operand!>";
return;
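The writeSyncScope/writeAtomic rewrite above replaces the old two-value SynchronizationScope enum with sync scope IDs registered on the LLVMContext: SyncScope::System prints nothing, and any other ID is looked up by name and printed as syncscope("name"). A sketch of how an atomic operation picks up a scope under the new scheme, assuming the matching post-change IRBuilder overloads:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    void emitFences(IRBuilder<> &B) {
      // Default (system) scope: prints as plain "fence seq_cst".
      B.CreateFence(AtomicOrdering::SequentiallyConsistent);
      // Named scope: prints as "fence syncscope(\"singlethread\") seq_cst".
      B.CreateFence(AtomicOrdering::SequentiallyConsistent,
                    SyncScope::SingleThread);
    }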
@@ -2187,8 +2257,8 @@ void AssemblyWriter::writeParamOperand(const Value *Operand,
// Print the type
TypePrinter.print(Operand->getType(), Out);
// Print parameter attributes list
- if (Attrs.hasAttributes(Idx))
- Out << ' ' << Attrs.getAsString(Idx);
+ if (Attrs.hasAttributes())
+ Out << ' ' << Attrs.getAsString();
Out << ' ';
// Print the operand
WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
@@ -2501,6 +2571,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
GV->getAllMetadata(MDs);
printMetadataAttachments(MDs, ", ");
+ auto Attrs = GV->getAttributes();
+ if (Attrs.hasAttributes())
+ Out << " #" << Machine.getAttributeGroupSlot(Attrs);
+
printInfoComment(*GV);
}
@@ -2586,7 +2660,6 @@ void AssemblyWriter::printTypeIdentities() {
}
/// printFunction - Print all aspects of a function.
-///
void AssemblyWriter::printFunction(const Function *F) {
// Print out the return type and name.
Out << '\n';
@@ -2596,19 +2669,12 @@ void AssemblyWriter::printFunction(const Function *F) {
if (F->isMaterializable())
Out << "; Materializable\n";
- const AttributeSet &Attrs = F->getAttributes();
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) {
+ const AttributeList &Attrs = F->getAttributes();
+ if (Attrs.hasAttributes(AttributeList::FunctionIndex)) {
AttributeSet AS = Attrs.getFnAttributes();
std::string AttrStr;
- unsigned Idx = 0;
- for (unsigned E = AS.getNumSlots(); Idx != E; ++Idx)
- if (AS.getSlotIndex(Idx) == AttributeSet::FunctionIndex)
- break;
-
- for (AttributeSet::iterator I = AS.begin(Idx), E = AS.end(Idx);
- I != E; ++I) {
- Attribute Attr = *I;
+ for (const Attribute &Attr : AS) {
if (!Attr.isStringAttribute()) {
if (!AttrStr.empty()) AttrStr += ' ';
AttrStr += Attr.getAsString();
@@ -2641,8 +2707,8 @@ void AssemblyWriter::printFunction(const Function *F) {
}
FunctionType *FT = F->getFunctionType();
- if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
- Out << Attrs.getAsString(AttributeSet::ReturnIndex) << ' ';
+ if (Attrs.hasAttributes(AttributeList::ReturnIndex))
+ Out << Attrs.getAsString(AttributeList::ReturnIndex) << ' ';
TypePrinter.print(F->getReturnType(), Out);
Out << ' ';
WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
@@ -2658,17 +2724,17 @@ void AssemblyWriter::printFunction(const Function *F) {
// Output type...
TypePrinter.print(FT->getParamType(I), Out);
- if (Attrs.hasAttributes(I + 1))
- Out << ' ' << Attrs.getAsString(I + 1);
+ AttributeSet ArgAttrs = Attrs.getParamAttributes(I);
+ if (ArgAttrs.hasAttributes())
+ Out << ' ' << ArgAttrs.getAsString();
}
} else {
// The arguments are meaningful here, print them in detail.
- unsigned Idx = 1;
for (const Argument &Arg : F->args()) {
// Insert commas as we go... the first arg doesn't get a comma
- if (Idx != 1)
+ if (Arg.getArgNo() != 0)
Out << ", ";
- printArgument(&Arg, Attrs, Idx++);
+ printArgument(&Arg, Attrs.getParamAttributes(Arg.getArgNo()));
}
}
@@ -2681,7 +2747,7 @@ void AssemblyWriter::printFunction(const Function *F) {
StringRef UA = getUnnamedAddrEncoding(F->getUnnamedAddr());
if (!UA.empty())
Out << ' ' << UA;
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+ if (Attrs.hasAttributes(AttributeList::FunctionIndex))
Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes());
if (F->hasSection()) {
Out << " section \"";
@@ -2729,15 +2795,13 @@ void AssemblyWriter::printFunction(const Function *F) {
/// printArgument - This member is called for every argument that is passed into
/// the function. Simply print it out.
-///
-void AssemblyWriter::printArgument(const Argument *Arg,
- AttributeSet Attrs, unsigned Idx) {
+void AssemblyWriter::printArgument(const Argument *Arg, AttributeSet Attrs) {
// Output type...
TypePrinter.print(Arg->getType(), Out);
// Output parameter attributes list
- if (Attrs.hasAttributes(Idx))
- Out << ' ' << Attrs.getAsString(Idx);
+ if (Attrs.hasAttributes())
+ Out << ' ' << Attrs.getAsString();
// Output name, if available...
if (Arg->hasName()) {
@@ -2747,7 +2811,6 @@ void AssemblyWriter::printArgument(const Argument *Arg,
}
/// printBasicBlock - This member is called for each basic block in a method.
-///
void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
if (BB->hasName()) { // Print out the label if it exists...
Out << "\n";
@@ -2813,7 +2876,6 @@ void AssemblyWriter::printGCRelocateComment(const GCRelocateInst &Relocate) {
/// printInfoComment - Print a little comment after the instruction indicating
/// which slot it occupies.
-///
void AssemblyWriter::printInfoComment(const Value &V) {
if (const auto *Relocate = dyn_cast<GCRelocateInst>(&V))
printGCRelocateComment(*Relocate);
@@ -2901,12 +2963,11 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ", ";
writeOperand(SI.getDefaultDest(), true);
Out << " [";
- for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
- i != e; ++i) {
+ for (auto Case : SI.cases()) {
Out << "\n ";
- writeOperand(i.getCaseValue(), true);
+ writeOperand(Case.getCaseValue(), true);
Out << ", ";
- writeOperand(i.getCaseSuccessor(), true);
+ writeOperand(Case.getCaseSuccessor(), true);
}
Out << "\n ]";
} else if (isa<IndirectBrInst>(I)) {
@@ -3015,10 +3076,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Operand = CI->getCalledValue();
FunctionType *FTy = CI->getFunctionType();
Type *RetTy = FTy->getReturnType();
- const AttributeSet &PAL = CI->getAttributes();
+ const AttributeList &PAL = CI->getAttributes();
- if (PAL.hasAttributes(AttributeSet::ReturnIndex))
- Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+ if (PAL.hasAttributes(AttributeList::ReturnIndex))
+ Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
// If possible, print out the short form of the call instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -3032,7 +3093,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
for (unsigned op = 0, Eop = CI->getNumArgOperands(); op < Eop; ++op) {
if (op > 0)
Out << ", ";
- writeParamOperand(CI->getArgOperand(op), PAL, op + 1);
+ writeParamOperand(CI->getArgOperand(op), PAL.getParamAttributes(op));
}
// Emit an ellipsis if this is a musttail call in a vararg function. This
@@ -3043,16 +3104,15 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ", ...";
Out << ')';
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+ if (PAL.hasAttributes(AttributeList::FunctionIndex))
Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
writeOperandBundles(CI);
-
} else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
Operand = II->getCalledValue();
FunctionType *FTy = II->getFunctionType();
Type *RetTy = FTy->getReturnType();
- const AttributeSet &PAL = II->getAttributes();
+ const AttributeList &PAL = II->getAttributes();
// Print the calling convention being used.
if (II->getCallingConv() != CallingConv::C) {
@@ -3060,8 +3120,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
PrintCallingConv(II->getCallingConv(), Out);
}
- if (PAL.hasAttributes(AttributeSet::ReturnIndex))
- Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+ if (PAL.hasAttributes(AttributeList::ReturnIndex))
+ Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
// If possible, print out the short form of the invoke instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -3075,11 +3135,11 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
for (unsigned op = 0, Eop = II->getNumArgOperands(); op < Eop; ++op) {
if (op)
Out << ", ";
- writeParamOperand(II->getArgOperand(op), PAL, op + 1);
+ writeParamOperand(II->getArgOperand(op), PAL.getParamAttributes(op));
}
Out << ')';
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+ if (PAL.hasAttributes(AttributeList::FunctionIndex))
Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
writeOperandBundles(II);
@@ -3088,7 +3148,6 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
writeOperand(II->getNormalDest(), true);
Out << " unwind ";
writeOperand(II->getUnwindDest(), true);
-
} else if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
Out << ' ';
if (AI->isUsedWithInAlloca())
@@ -3109,6 +3168,11 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
if (AI->getAlignment()) {
Out << ", align " << AI->getAlignment();
}
+
+ unsigned AddrSpace = AI->getType()->getAddressSpace();
+ if (AddrSpace != 0) {
+ Out << ", addrspace(" << AddrSpace << ')';
+ }
} else if (isa<CastInst>(I)) {
if (Operand) {
Out << ' ';
@@ -3171,21 +3235,22 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
// Print atomic ordering/alignment for memory operations
if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (LI->isAtomic())
- writeAtomic(LI->getOrdering(), LI->getSynchScope());
+ writeAtomic(LI->getContext(), LI->getOrdering(), LI->getSyncScopeID());
if (LI->getAlignment())
Out << ", align " << LI->getAlignment();
} else if (const StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (SI->isAtomic())
- writeAtomic(SI->getOrdering(), SI->getSynchScope());
+ writeAtomic(SI->getContext(), SI->getOrdering(), SI->getSyncScopeID());
if (SI->getAlignment())
Out << ", align " << SI->getAlignment();
} else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&I)) {
- writeAtomicCmpXchg(CXI->getSuccessOrdering(), CXI->getFailureOrdering(),
- CXI->getSynchScope());
+ writeAtomicCmpXchg(CXI->getContext(), CXI->getSuccessOrdering(),
+ CXI->getFailureOrdering(), CXI->getSyncScopeID());
} else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&I)) {
- writeAtomic(RMWI->getOrdering(), RMWI->getSynchScope());
+ writeAtomic(RMWI->getContext(), RMWI->getOrdering(),
+ RMWI->getSyncScopeID());
} else if (const FenceInst *FI = dyn_cast<FenceInst>(&I)) {
- writeAtomic(FI->getOrdering(), FI->getSynchScope());
+ writeAtomic(FI->getContext(), FI->getOrdering(), FI->getSyncScopeID());
}
// Print Metadata info.
@@ -3242,7 +3307,7 @@ void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
}
void AssemblyWriter::writeAllAttributeGroups() {
- std::vector<std::pair<AttributeSet, unsigned> > asVec;
+ std::vector<std::pair<AttributeSet, unsigned>> asVec;
asVec.resize(Machine.as_size());
for (SlotTracker::as_iterator I = Machine.as_begin(), E = Machine.as_end();
@@ -3251,7 +3316,7 @@ void AssemblyWriter::writeAllAttributeGroups() {
for (const auto &I : asVec)
Out << "attributes #" << I.second << " = { "
- << I.first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
+ << I.first.getAsString(true) << " }\n";
}
void AssemblyWriter::printUseListOrder(const UseListOrder &Order) {
@@ -3535,6 +3600,7 @@ void Metadata::print(raw_ostream &OS, ModuleSlotTracker &MST,
printMetadataImpl(OS, *this, MST, M, /* OnlyAsOperand */ false);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Value::dump - allow easy printing of Values from the debugger.
LLVM_DUMP_METHOD
void Value::dump() const { print(dbgs(), /*IsForDebug=*/true); dbgs() << '\n'; }
@@ -3566,3 +3632,4 @@ void Metadata::dump(const Module *M) const {
print(dbgs(), M, /*IsForDebug=*/true);
dbgs() << '\n';
}
+#endif
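With the new guard, every dump() in this file vanishes from release builds unless LLVM_ENABLE_DUMP is defined, matching the LLVM_DUMP_METHOD attribute already on the declarations. Any new dump helper should follow the same shape; a sketch with a placeholder type:

    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD void MyType::dump() const { // MyType is hypothetical
      print(dbgs(), /*IsForDebug=*/true);
      dbgs() << '\n';
    }
    #endif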
diff --git a/contrib/llvm/lib/IR/AttributeImpl.h b/contrib/llvm/lib/IR/AttributeImpl.h
index d0d2710..9c7b61f 100644
--- a/contrib/llvm/lib/IR/AttributeImpl.h
+++ b/contrib/llvm/lib/IR/AttributeImpl.h
@@ -1,4 +1,4 @@
-//===-- AttributeImpl.h - Attribute Internals -------------------*- C++ -*-===//
+//===- AttributeImpl.h - Attribute Internals --------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,15 +16,12 @@
#ifndef LLVM_LIB_IR_ATTRIBUTEIMPL_H
#define LLVM_LIB_IR_ATTRIBUTEIMPL_H
-#include "AttributeSetNode.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/Support/TrailingObjects.h"
-#include <algorithm>
#include <cassert>
-#include <climits>
#include <cstddef>
#include <cstdint>
#include <string>
@@ -81,11 +78,13 @@ public:
else
Profile(ID, getKindAsString(), getValueAsString());
}
+
static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind,
uint64_t Val) {
ID.AddInteger(Kind);
if (Val) ID.AddInteger(Val);
}
+
static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) {
ID.AddString(Kind);
if (!Values.empty()) ID.AddString(Values);
@@ -101,6 +100,7 @@ public:
class EnumAttributeImpl : public AttributeImpl {
virtual void anchor();
+
Attribute::AttrKind Kind;
protected:
@@ -115,9 +115,10 @@ public:
};
class IntAttributeImpl : public EnumAttributeImpl {
- void anchor() override;
uint64_t Val;
+ void anchor() override;
+
public:
IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val)
: EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) {
@@ -133,6 +134,7 @@ public:
class StringAttributeImpl : public AttributeImpl {
virtual void anchor();
+
std::string Kind;
std::string Val;
@@ -144,122 +146,112 @@ public:
StringRef getStringValue() const { return Val; }
};
-typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair;
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief This class represents a group of attributes that apply to one
+/// element: function, return type, or parameter.
+class AttributeSetNode final
+ : public FoldingSetNode,
+ private TrailingObjects<AttributeSetNode, Attribute> {
+ friend TrailingObjects;
+
+ /// Bitset with a bit for each available attribute Attribute::AttrKind.
+ uint64_t AvailableAttrs;
+ unsigned NumAttrs; ///< Number of attributes in this node.
+
+ AttributeSetNode(ArrayRef<Attribute> Attrs);
+
+public:
+ // AttributeSetNode is uniqued; these should not be available.
+ AttributeSetNode(const AttributeSetNode &) = delete;
+ AttributeSetNode &operator=(const AttributeSetNode &) = delete;
+
+ void operator delete(void *p) { ::operator delete(p); }
+
+ static AttributeSetNode *get(LLVMContext &C, const AttrBuilder &B);
+
+ static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
+
+ /// \brief Return the number of attributes this AttributeSetNode contains.
+ unsigned getNumAttributes() const { return NumAttrs; }
+
+ bool hasAttribute(Attribute::AttrKind Kind) const {
+ return AvailableAttrs & ((uint64_t)1) << Kind;
+ }
+ bool hasAttribute(StringRef Kind) const;
+ bool hasAttributes() const { return NumAttrs != 0; }
+
+ Attribute getAttribute(Attribute::AttrKind Kind) const;
+ Attribute getAttribute(StringRef Kind) const;
+
+ unsigned getAlignment() const;
+ unsigned getStackAlignment() const;
+ uint64_t getDereferenceableBytes() const;
+ uint64_t getDereferenceableOrNullBytes() const;
+ std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
+ std::string getAsString(bool InAttrGrp) const;
+
+ using iterator = const Attribute *;
+
+ iterator begin() const { return getTrailingObjects<Attribute>(); }
+ iterator end() const { return begin() + NumAttrs; }
+
+ void Profile(FoldingSetNodeID &ID) const {
+ Profile(ID, makeArrayRef(begin(), end()));
+ }
+
+ static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
+ for (const auto &Attr : AttrList)
+ Attr.Profile(ID);
+ }
+};
+
+using IndexAttrPair = std::pair<unsigned, AttributeSet>;
//===----------------------------------------------------------------------===//
/// \class
/// \brief This class represents a set of attributes that apply to the function,
/// return type, and parameters.
-class AttributeSetImpl final
+class AttributeListImpl final
: public FoldingSetNode,
- private TrailingObjects<AttributeSetImpl, IndexAttrPair> {
- friend class AttributeSet;
+ private TrailingObjects<AttributeListImpl, AttributeSet> {
+ friend class AttributeList;
friend TrailingObjects;
private:
- LLVMContext &Context;
- unsigned NumSlots; ///< Number of entries in this set.
/// Bitset with a bit for each available attribute Attribute::AttrKind.
uint64_t AvailableFunctionAttrs;
+ LLVMContext &Context;
+ unsigned NumAttrSets; ///< Number of entries in this set.
// Helper fn for TrailingObjects class.
- size_t numTrailingObjects(OverloadToken<IndexAttrPair>) { return NumSlots; }
-
- /// \brief Return a pointer to the IndexAttrPair for the specified slot.
- const IndexAttrPair *getNode(unsigned Slot) const {
- return getTrailingObjects<IndexAttrPair>() + Slot;
- }
+ size_t numTrailingObjects(OverloadToken<AttributeSet>) { return NumAttrSets; }
public:
- AttributeSetImpl(LLVMContext &C,
- ArrayRef<std::pair<unsigned, AttributeSetNode *>> Slots)
- : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) {
- static_assert(Attribute::EndAttrKinds <=
- sizeof(AvailableFunctionAttrs) * CHAR_BIT,
- "Too many attributes");
-
-#ifndef NDEBUG
- if (Slots.size() >= 2) {
- for (const std::pair<unsigned, AttributeSetNode *> *i = Slots.begin() + 1,
- *e = Slots.end();
- i != e; ++i) {
- assert((i-1)->first <= i->first && "Attribute set not ordered!");
- }
- }
-#endif
- // There's memory after the node where we can store the entries in.
- std::copy(Slots.begin(), Slots.end(), getTrailingObjects<IndexAttrPair>());
-
- // Initialize AvailableFunctionAttrs summary bitset.
- if (NumSlots > 0) {
- static_assert(AttributeSet::FunctionIndex == ~0u,
- "FunctionIndex should be biggest possible index");
- const std::pair<unsigned, AttributeSetNode *> &Last = Slots.back();
- if (Last.first == AttributeSet::FunctionIndex) {
- const AttributeSetNode *Node = Last.second;
- for (Attribute I : *Node) {
- if (!I.isStringAttribute())
- AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum();
- }
- }
- }
- }
+ AttributeListImpl(LLVMContext &C, ArrayRef<AttributeSet> Sets);
// AttributeListImpl is uniqued; these should not be available.
- AttributeSetImpl(const AttributeSetImpl &) = delete;
- AttributeSetImpl &operator=(const AttributeSetImpl &) = delete;
+ AttributeListImpl(const AttributeListImpl &) = delete;
+ AttributeListImpl &operator=(const AttributeListImpl &) = delete;
void operator delete(void *p) { ::operator delete(p); }
- /// \brief Get the context that created this AttributeSetImpl.
+ /// \brief Get the context that created this AttributeListImpl.
LLVMContext &getContext() { return Context; }
- /// \brief Return the number of slots used in this attribute list. This is
- /// the number of arguments that have an attribute set on them (including the
- /// function itself).
- unsigned getNumSlots() const { return NumSlots; }
-
- /// \brief Get the index of the given "slot" in the AttrNodes list. This index
- /// is the index of the return, parameter, or function object that the
- /// attributes are applied to, not the index into the AttrNodes list where the
- /// attributes reside.
- unsigned getSlotIndex(unsigned Slot) const {
- return getNode(Slot)->first;
- }
-
- /// \brief Retrieve the attributes for the given "slot" in the AttrNode list.
- /// \p Slot is an index into the AttrNodes list, not the index of the return /
- /// parameter/ function which the attributes apply to.
- AttributeSet getSlotAttributes(unsigned Slot) const {
- return AttributeSet::get(Context, *getNode(Slot));
- }
-
- /// \brief Retrieve the attribute set node for the given "slot" in the
- /// AttrNode list.
- AttributeSetNode *getSlotNode(unsigned Slot) const {
- return getNode(Slot)->second;
- }
-
- /// \brief Return true if the AttributeSetNode for the FunctionIndex has an
+ /// \brief Return true if the AttributeSet for the FunctionIndex has an
/// enum attribute of the given kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const {
return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
}
- typedef AttributeSetNode::iterator iterator;
- iterator begin(unsigned Slot) const { return getSlotNode(Slot)->begin(); }
- iterator end(unsigned Slot) const { return getSlotNode(Slot)->end(); }
+ using iterator = const AttributeSet *;
- void Profile(FoldingSetNodeID &ID) const {
- Profile(ID, makeArrayRef(getNode(0), getNumSlots()));
- }
- static void Profile(FoldingSetNodeID &ID,
- ArrayRef<std::pair<unsigned, AttributeSetNode*>> Nodes) {
- for (const auto &Node : Nodes) {
- ID.AddInteger(Node.first);
- ID.AddPointer(Node.second);
- }
- }
+ iterator begin() const { return getTrailingObjects<AttributeSet>(); }
+ iterator end() const { return begin() + NumAttrSets; }
+
+ void Profile(FoldingSetNodeID &ID) const;
+ static void Profile(FoldingSetNodeID &ID, ArrayRef<AttributeSet> Nodes);
void dump() const;
};
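After this header rewrite the attribute machinery has three layers: Attribute is a single fact (e.g. nounwind), AttributeSet/AttributeSetNode holds every fact about one position, and AttributeList/AttributeListImpl holds one AttributeSet per position with the function's set in array slot 0. A sketch of building a list under the new model, assuming the three-set AttributeList::get overload introduced alongside this change:

    #include "llvm/IR/Attributes.h"
    using namespace llvm;

    AttributeList makeList(LLVMContext &C) {
      AttrBuilder FnB, RetB;
      FnB.addAttribute(Attribute::NoUnwind); // function-position fact
      RetB.addAttribute(Attribute::NonNull); // return-position fact
      AttributeSet FnAttrs = AttributeSet::get(C, FnB);
      AttributeSet RetAttrs = AttributeSet::get(C, RetB);
      return AttributeList::get(C, FnAttrs, RetAttrs, /*ArgAttrs=*/{});
    }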
diff --git a/contrib/llvm/lib/IR/AttributeSetNode.h b/contrib/llvm/lib/IR/AttributeSetNode.h
deleted file mode 100644
index 23ce371..0000000
--- a/contrib/llvm/lib/IR/AttributeSetNode.h
+++ /dev/null
@@ -1,106 +0,0 @@
-//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file defines the node class used internally by AttributeSet.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_ATTRIBUTESETNODE_H
-#define LLVM_IR_ATTRIBUTESETNODE_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/Support/TrailingObjects.h"
-#include <algorithm>
-#include <climits>
-#include <cstdint>
-#include <string>
-#include <utility>
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-/// \class
-/// \brief This class represents a group of attributes that apply to one
-/// element: function, return type, or parameter.
-class AttributeSetNode final
- : public FoldingSetNode,
- private TrailingObjects<AttributeSetNode, Attribute> {
- friend TrailingObjects;
-
- unsigned NumAttrs; ///< Number of attributes in this node.
- /// Bitset with a bit for each available attribute Attribute::AttrKind.
- uint64_t AvailableAttrs;
-
- AttributeSetNode(ArrayRef<Attribute> Attrs)
- : NumAttrs(Attrs.size()), AvailableAttrs(0) {
- static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
- "Too many attributes for AvailableAttrs");
- // There's memory after the node where we can store the entries in.
- std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
-
- for (Attribute I : *this) {
- if (!I.isStringAttribute()) {
- AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
- }
- }
- }
-
-public:
- // AttributesSetNode is uniqued, these should not be available.
- AttributeSetNode(const AttributeSetNode &) = delete;
- AttributeSetNode &operator=(const AttributeSetNode &) = delete;
-
- void operator delete(void *p) { ::operator delete(p); }
-
- static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
-
- static AttributeSetNode *get(AttributeSet AS, unsigned Index) {
- return AS.getAttributes(Index);
- }
-
- /// \brief Return the number of attributes this AttributeSet contains.
- unsigned getNumAttributes() const { return NumAttrs; }
-
- bool hasAttribute(Attribute::AttrKind Kind) const {
- return AvailableAttrs & ((uint64_t)1) << Kind;
- }
- bool hasAttribute(StringRef Kind) const;
- bool hasAttributes() const { return NumAttrs != 0; }
-
- Attribute getAttribute(Attribute::AttrKind Kind) const;
- Attribute getAttribute(StringRef Kind) const;
-
- unsigned getAlignment() const;
- unsigned getStackAlignment() const;
- uint64_t getDereferenceableBytes() const;
- uint64_t getDereferenceableOrNullBytes() const;
- std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
- std::string getAsString(bool InAttrGrp) const;
-
- typedef const Attribute *iterator;
- iterator begin() const { return getTrailingObjects<Attribute>(); }
- iterator end() const { return begin() + NumAttrs; }
-
- void Profile(FoldingSetNodeID &ID) const {
- Profile(ID, makeArrayRef(begin(), end()));
- }
- static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
- for (const auto &Attr : AttrList)
- Attr.Profile(ID);
- }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_IR_ATTRIBUTESETNODE_H
diff --git a/contrib/llvm/lib/IR/Attributes.cpp b/contrib/llvm/lib/IR/Attributes.cpp
index 1ec53cf..8f2e641 100644
--- a/contrib/llvm/lib/IR/Attributes.cpp
+++ b/contrib/llvm/lib/IR/Attributes.cpp
@@ -1,4 +1,4 @@
-//===-- Attributes.cpp - Implement AttributesList -------------------------===//
+//===- Attributes.cpp - Implement AttributesList --------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,23 +9,40 @@
//
// \file
// \brief This file implements the Attribute, AttributeImpl, AttrBuilder,
-// AttributeSetImpl, and AttributeSet classes.
+// AttributeListImpl, and AttributeList classes.
//
//===----------------------------------------------------------------------===//
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
#include "AttributeImpl.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
-#include "llvm/Support/Atomic.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <string>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -300,6 +317,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "returns_twice";
if (hasAttribute(Attribute::SExt))
return "signext";
+ if (hasAttribute(Attribute::Speculatable))
+ return "speculatable";
if (hasAttribute(Attribute::StackProtect))
return "ssp";
if (hasAttribute(Attribute::StackProtectReq))
@@ -411,9 +430,12 @@ bool Attribute::operator<(Attribute A) const {
//===----------------------------------------------------------------------===//
// Pin the vtables to this file.
-AttributeImpl::~AttributeImpl() {}
+AttributeImpl::~AttributeImpl() = default;
+
void EnumAttributeImpl::anchor() {}
+
void IntAttributeImpl::anchor() {}
+
void StringAttributeImpl::anchor() {}
bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
@@ -473,9 +495,152 @@ bool AttributeImpl::operator<(const AttributeImpl &AI) const {
}
//===----------------------------------------------------------------------===//
+// AttributeSet Definition
+//===----------------------------------------------------------------------===//
+
+AttributeSet AttributeSet::get(LLVMContext &C, const AttrBuilder &B) {
+ return AttributeSet(AttributeSetNode::get(C, B));
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
+ return AttributeSet(AttributeSetNode::get(C, Attrs));
+}
+
+AttributeSet AttributeSet::addAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind,
+ StringRef Value) const {
+ AttrBuilder B;
+ B.addAttribute(Kind, Value);
+ return addAttributes(C, AttributeSet::get(C, B));
+}
+
+AttributeSet AttributeSet::addAttributes(LLVMContext &C,
+ const AttributeSet AS) const {
+ if (!hasAttributes())
+ return AS;
+
+ if (!AS.hasAttributes())
+ return *this;
+
+ AttrBuilder B(AS);
+ for (Attribute I : *this)
+ B.addAttribute(I);
+
+ return get(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ Attribute::AttrKind Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
+ StringRef Kind) const {
+ if (!hasAttribute(Kind)) return *this;
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, B);
+}
+
+AttributeSet AttributeSet::removeAttributes(LLVMContext &C,
+ const AttrBuilder &Attrs) const {
+
+ // FIXME it is not obvious how this should work for alignment.
+ // For now, say we can't pass in alignment, which no current use does.
+ assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
+
+ AttrBuilder B(*this);
+ B.remove(Attrs);
+ return get(C, B);
+}
+
+unsigned AttributeSet::getNumAttributes() const {
+ return SetNode ? SetNode->getNumAttributes() : 0;
+}
+
+bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const {
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
+}
+
+bool AttributeSet::hasAttribute(StringRef Kind) const {
+ return SetNode ? SetNode->hasAttribute(Kind) : false;
+}
+
+Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const {
+ return SetNode ? SetNode->getAttribute(Kind) : Attribute();
+}
+
+Attribute AttributeSet::getAttribute(StringRef Kind) const {
+ return SetNode ? SetNode->getAttribute(Kind) : Attribute();
+}
+
+unsigned AttributeSet::getAlignment() const {
+ return SetNode ? SetNode->getAlignment() : 0;
+}
+
+unsigned AttributeSet::getStackAlignment() const {
+ return SetNode ? SetNode->getStackAlignment() : 0;
+}
+
+uint64_t AttributeSet::getDereferenceableBytes() const {
+ return SetNode ? SetNode->getDereferenceableBytes() : 0;
+}
+
+uint64_t AttributeSet::getDereferenceableOrNullBytes() const {
+ return SetNode ? SetNode->getDereferenceableOrNullBytes() : 0;
+}
+
+std::pair<unsigned, Optional<unsigned>> AttributeSet::getAllocSizeArgs() const {
+ return SetNode ? SetNode->getAllocSizeArgs()
+ : std::pair<unsigned, Optional<unsigned>>(0, 0);
+}
+
+std::string AttributeSet::getAsString(bool InAttrGrp) const {
+ return SetNode ? SetNode->getAsString(InAttrGrp) : "";
+}
+
+AttributeSet::iterator AttributeSet::begin() const {
+ return SetNode ? SetNode->begin() : nullptr;
+}
+
+AttributeSet::iterator AttributeSet::end() const {
+ return SetNode ? SetNode->end() : nullptr;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeSet::dump() const {
+ dbgs() << "AS =\n";
+ dbgs() << " { ";
+ dbgs() << getAsString(true) << " }\n";
+}
+#endif
+
+//===----------------------------------------------------------------------===//
// AttributeSetNode Definition
//===----------------------------------------------------------------------===//
+AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs)
+ : AvailableAttrs(0), NumAttrs(Attrs.size()) {
+ // There's memory after the node where we can store the entries in.
+ std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
+
+ for (Attribute I : *this) {
+ if (!I.isStringAttribute()) {
+ AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
+ }
+ }
+}
+
AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
ArrayRef<Attribute> Attrs) {
if (Attrs.empty())
@@ -504,10 +669,52 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint);
}
- // Return the AttributesListNode that we found or created.
+ // Return the AttributeSetNode that we found or created.
return PA;
}
+AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
+ // Add target-independent attributes.
+ SmallVector<Attribute, 8> Attrs;
+ for (Attribute::AttrKind Kind = Attribute::None;
+ Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
+ if (!B.contains(Kind))
+ continue;
+
+ Attribute Attr;
+ switch (Kind) {
+ case Attribute::Alignment:
+ Attr = Attribute::getWithAlignment(C, B.getAlignment());
+ break;
+ case Attribute::StackAlignment:
+ Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
+ break;
+ case Attribute::Dereferenceable:
+ Attr = Attribute::getWithDereferenceableBytes(
+ C, B.getDereferenceableBytes());
+ break;
+ case Attribute::DereferenceableOrNull:
+ Attr = Attribute::getWithDereferenceableOrNullBytes(
+ C, B.getDereferenceableOrNullBytes());
+ break;
+ case Attribute::AllocSize: {
+ auto A = B.getAllocSizeArgs();
+ Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
+ break;
+ }
+ default:
+ Attr = Attribute::get(C, Kind);
+ }
+ Attrs.push_back(Attr);
+ }
+
+ // Add target-dependent (string) attributes.
+ for (const auto &TDA : B.td_attrs())
+ Attrs.emplace_back(Attribute::get(C, TDA.first, TDA.second));
+
+ return get(C, Attrs);
+}
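// [Editorial sketch: not part of the patch] A minimal usage example of the
// AttrBuilder overload above, assuming the post-patch API in this diff.
// Enum attributes take the default branch of the switch; alignment takes the
// int-attribute branch.
static AttributeSet buildExampleSet(LLVMContext &C) {
  AttrBuilder B;
  B.addAttribute(Attribute::NoUnwind); // enum attribute, default case
  B.addAlignmentAttr(16);              // int attribute, Attribute::Alignment case
  return AttributeSet::get(C, B);      // forwards to AttributeSetNode::get
}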
+
bool AttributeSetNode::hasAttribute(StringRef Kind) const {
for (Attribute I : *this)
if (I.hasAttribute(Kind))
@@ -578,46 +785,91 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
}
//===----------------------------------------------------------------------===//
-// AttributeSetImpl Definition
+// AttributeListImpl Definition
//===----------------------------------------------------------------------===//
-LLVM_DUMP_METHOD void AttributeSetImpl::dump() const {
- AttributeSet(const_cast<AttributeSetImpl *>(this)).dump();
+/// Map from AttributeList index to the internal array index. Adding one works:
+/// FunctionIndex: ~0U -> 0
+/// ReturnIndex: 0 -> 1
+/// FirstArgIndex: 1.. -> 2..
+static constexpr unsigned attrIdxToArrayIdx(unsigned Index) {
+ // MSVC warns about '~0U + 1' wrapping around when this is called on
+ // FunctionIndex, so cast to int first.
+ return static_cast<int>(Index) + 1;
+}
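// [Editorial sketch: not part of the patch] The mapping above in concrete
// values; attrIdxToArrayIdx is file-local, so these checks only compile in
// this translation unit.
static_assert(attrIdxToArrayIdx(AttributeList::FunctionIndex) == 0,
              "function attributes live in array slot 0");
static_assert(attrIdxToArrayIdx(AttributeList::ReturnIndex) == 1,
              "return attributes live in array slot 1");
static_assert(attrIdxToArrayIdx(AttributeList::FirstArgIndex) == 2,
              "argument I lives in array slot I + 2");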
+
+AttributeListImpl::AttributeListImpl(LLVMContext &C,
+ ArrayRef<AttributeSet> Sets)
+ : AvailableFunctionAttrs(0), Context(C), NumAttrSets(Sets.size()) {
+ assert(!Sets.empty() && "pointless AttributeListImpl");
+
+  // There's memory after the node in which we can store the entries.
+ std::copy(Sets.begin(), Sets.end(), getTrailingObjects<AttributeSet>());
+
+ // Initialize AvailableFunctionAttrs summary bitset.
+ static_assert(Attribute::EndAttrKinds <=
+ sizeof(AvailableFunctionAttrs) * CHAR_BIT,
+ "Too many attributes");
+ static_assert(attrIdxToArrayIdx(AttributeList::FunctionIndex) == 0U,
+ "function should be stored in slot 0");
+ for (Attribute I : Sets[0]) {
+ if (!I.isStringAttribute())
+ AvailableFunctionAttrs |= 1ULL << I.getKindAsEnum();
+ }
}
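// [Editorial note: not part of the patch] The summary bitset lets function
// attribute queries test one bit instead of scanning Sets[0]. A plausible
// reading of the (unshown) query, assuming this shape:
//
//   bool AttributeListImpl::hasFnAttribute(Attribute::AttrKind Kind) const {
//     return AvailableFunctionAttrs & (1ULL << Kind);
//   }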
+void AttributeListImpl::Profile(FoldingSetNodeID &ID) const {
+ Profile(ID, makeArrayRef(begin(), end()));
+}
+
+void AttributeListImpl::Profile(FoldingSetNodeID &ID,
+ ArrayRef<AttributeSet> Sets) {
+ for (const auto &Set : Sets)
+ ID.AddPointer(Set.SetNode);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeListImpl::dump() const {
+ AttributeList(const_cast<AttributeListImpl *>(this)).dump();
+}
+#endif
+
//===----------------------------------------------------------------------===//
-// AttributeSet Construction and Mutation Methods
+// AttributeList Construction and Mutation Methods
//===----------------------------------------------------------------------===//
-AttributeSet
-AttributeSet::getImpl(LLVMContext &C,
- ArrayRef<std::pair<unsigned, AttributeSetNode*> > Attrs) {
+AttributeList AttributeList::getImpl(LLVMContext &C,
+ ArrayRef<AttributeSet> AttrSets) {
+ assert(!AttrSets.empty() && "pointless AttributeListImpl");
+
LLVMContextImpl *pImpl = C.pImpl;
FoldingSetNodeID ID;
- AttributeSetImpl::Profile(ID, Attrs);
+ AttributeListImpl::Profile(ID, AttrSets);
void *InsertPoint;
- AttributeSetImpl *PA = pImpl->AttrsLists.FindNodeOrInsertPos(ID, InsertPoint);
+ AttributeListImpl *PA =
+ pImpl->AttrsLists.FindNodeOrInsertPos(ID, InsertPoint);
// If we didn't find any existing attributes of the same shape then
// create a new one and insert it.
if (!PA) {
- // Coallocate entries after the AttributeSetImpl itself.
+  // Co-allocate entries after the AttributeListImpl itself.
void *Mem = ::operator new(
- AttributeSetImpl::totalSizeToAlloc<IndexAttrPair>(Attrs.size()));
- PA = new (Mem) AttributeSetImpl(C, Attrs);
+ AttributeListImpl::totalSizeToAlloc<AttributeSet>(AttrSets.size()));
+ PA = new (Mem) AttributeListImpl(C, AttrSets);
pImpl->AttrsLists.InsertNode(PA, InsertPoint);
}
  // Return the AttributeList that we found or created.
- return AttributeSet(PA);
+ return AttributeList(PA);
}
-AttributeSet AttributeSet::get(LLVMContext &C,
- ArrayRef<std::pair<unsigned, Attribute> > Attrs){
+AttributeList
+AttributeList::get(LLVMContext &C,
+ ArrayRef<std::pair<unsigned, Attribute>> Attrs) {
  // If there are no attributes then return an empty AttributeList.
if (Attrs.empty())
- return AttributeSet();
+ return AttributeList();
assert(std::is_sorted(Attrs.begin(), Attrs.end(),
[](const std::pair<unsigned, Attribute> &LHS,
@@ -632,8 +884,8 @@ AttributeSet AttributeSet::get(LLVMContext &C,
  // Create a vector of (unsigned, AttributeSet) pairs from the attributes
  // list.
- SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrPairVec;
- for (ArrayRef<std::pair<unsigned, Attribute> >::iterator I = Attrs.begin(),
+ SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrPairVec;
+ for (ArrayRef<std::pair<unsigned, Attribute>>::iterator I = Attrs.begin(),
E = Attrs.end(); I != E; ) {
unsigned Index = I->first;
SmallVector<Attribute, 4> AttrVec;
@@ -642,514 +894,427 @@ AttributeSet AttributeSet::get(LLVMContext &C,
++I;
}
- AttrPairVec.emplace_back(Index, AttributeSetNode::get(C, AttrVec));
+ AttrPairVec.emplace_back(Index, AttributeSet::get(C, AttrVec));
}
- return getImpl(C, AttrPairVec);
+ return get(C, AttrPairVec);
}
-AttributeSet AttributeSet::get(LLVMContext &C,
- ArrayRef<std::pair<unsigned,
- AttributeSetNode*> > Attrs) {
+AttributeList
+AttributeList::get(LLVMContext &C,
+ ArrayRef<std::pair<unsigned, AttributeSet>> Attrs) {
  // If there are no attributes then return an empty AttributeList.
if (Attrs.empty())
- return AttributeSet();
+ return AttributeList();
- return getImpl(C, Attrs);
-}
+ assert(std::is_sorted(Attrs.begin(), Attrs.end(),
+ [](const std::pair<unsigned, AttributeSet> &LHS,
+ const std::pair<unsigned, AttributeSet> &RHS) {
+ return LHS.first < RHS.first;
+ }) &&
+ "Misordered Attributes list!");
+ assert(none_of(Attrs,
+ [](const std::pair<unsigned, AttributeSet> &Pair) {
+ return !Pair.second.hasAttributes();
+ }) &&
+ "Pointless attribute!");
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
- const AttrBuilder &B) {
- if (!B.hasAttributes())
- return AttributeSet();
+ unsigned MaxIndex = Attrs.back().first;
- // Add target-independent attributes.
- SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
- for (Attribute::AttrKind Kind = Attribute::None;
- Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
- if (!B.contains(Kind))
- continue;
+ SmallVector<AttributeSet, 4> AttrVec(attrIdxToArrayIdx(MaxIndex) + 1);
+ for (auto Pair : Attrs)
+ AttrVec[attrIdxToArrayIdx(Pair.first)] = Pair.second;
- Attribute Attr;
- switch (Kind) {
- case Attribute::Alignment:
- Attr = Attribute::getWithAlignment(C, B.getAlignment());
- break;
- case Attribute::StackAlignment:
- Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
- break;
- case Attribute::Dereferenceable:
- Attr = Attribute::getWithDereferenceableBytes(
- C, B.getDereferenceableBytes());
- break;
- case Attribute::DereferenceableOrNull:
- Attr = Attribute::getWithDereferenceableOrNullBytes(
- C, B.getDereferenceableOrNullBytes());
- break;
- case Attribute::AllocSize: {
- auto A = B.getAllocSizeArgs();
- Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
+ return getImpl(C, AttrVec);
+}
+
+AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs,
+ AttributeSet RetAttrs,
+ ArrayRef<AttributeSet> ArgAttrs) {
+ // Scan from the end to find the last argument with attributes. Most
+ // arguments don't have attributes, so it's nice if we can have fewer unique
+ // AttributeListImpls by dropping empty attribute sets at the end of the list.
+ unsigned NumSets = 0;
+ for (size_t I = ArgAttrs.size(); I != 0; --I) {
+ if (ArgAttrs[I - 1].hasAttributes()) {
+ NumSets = I + 2;
break;
}
- default:
- Attr = Attribute::get(C, Kind);
- }
- Attrs.emplace_back(Index, Attr);
+ }
+ if (NumSets == 0) {
+ // Check function and return attributes if we didn't have argument
+ // attributes.
+ if (RetAttrs.hasAttributes())
+ NumSets = 2;
+ else if (FnAttrs.hasAttributes())
+ NumSets = 1;
}
- // Add target-dependent (string) attributes.
- for (const auto &TDA : B.td_attrs())
- Attrs.emplace_back(Index, Attribute::get(C, TDA.first, TDA.second));
+ // If all attribute sets were empty, we can use the empty attribute list.
+ if (NumSets == 0)
+ return AttributeList();
+
+ SmallVector<AttributeSet, 8> AttrSets;
+ AttrSets.reserve(NumSets);
+ // If we have any attributes, we always have function attributes.
+ AttrSets.push_back(FnAttrs);
+ if (NumSets > 1)
+ AttrSets.push_back(RetAttrs);
+ if (NumSets > 2) {
+ // Drop the empty argument attribute sets at the end.
+ ArgAttrs = ArgAttrs.take_front(NumSets - 2);
+ AttrSets.insert(AttrSets.end(), ArgAttrs.begin(), ArgAttrs.end());
+ }
- return get(C, Attrs);
+ return getImpl(C, AttrSets);
}
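// [Editorial example: not part of the patch] Assume a call with three
// arguments where only arg 0 carries attributes:
//
//   ArgAttrs = [ {nonnull}, {}, {} ]
//
// The backwards scan stops at I == 1, so NumSets = 1 + 2 = 3 and the two
// trailing empty sets are dropped. The stored list is
// [ FnAttrs, RetAttrs, {nonnull} ], so calls that differ only in trailing
// empty argument sets share one AttributeListImpl.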
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
- ArrayRef<Attribute::AttrKind> Kinds) {
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+ const AttrBuilder &B) {
+ if (!B.hasAttributes())
+ return AttributeList();
+ Index = attrIdxToArrayIdx(Index);
+ SmallVector<AttributeSet, 8> AttrSets(Index + 1);
+ AttrSets[Index] = AttributeSet::get(C, B);
+ return getImpl(C, AttrSets);
+}
+
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+ ArrayRef<Attribute::AttrKind> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
for (Attribute::AttrKind K : Kinds)
Attrs.emplace_back(Index, Attribute::get(C, K));
return get(C, Attrs);
}
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
- ArrayRef<StringRef> Kinds) {
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+ ArrayRef<StringRef> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
for (StringRef K : Kinds)
Attrs.emplace_back(Index, Attribute::get(C, K));
return get(C, Attrs);
}
-AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
- if (Attrs.empty()) return AttributeSet();
- if (Attrs.size() == 1) return Attrs[0];
-
- SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrNodeVec;
- AttributeSetImpl *A0 = Attrs[0].pImpl;
- if (A0)
- AttrNodeVec.append(A0->getNode(0), A0->getNode(A0->getNumSlots()));
- // Copy all attributes from Attrs into AttrNodeVec while keeping AttrNodeVec
- // ordered by index. Because we know that each list in Attrs is ordered by
- // index we only need to merge each successive list in rather than doing a
- // full sort.
- for (unsigned I = 1, E = Attrs.size(); I != E; ++I) {
- AttributeSetImpl *AS = Attrs[I].pImpl;
- if (!AS) continue;
- SmallVector<std::pair<unsigned, AttributeSetNode *>, 8>::iterator
- ANVI = AttrNodeVec.begin(), ANVE;
- for (const IndexAttrPair *AI = AS->getNode(0),
- *AE = AS->getNode(AS->getNumSlots());
- AI != AE; ++AI) {
- ANVE = AttrNodeVec.end();
- while (ANVI != ANVE && ANVI->first <= AI->first)
- ++ANVI;
- ANVI = AttrNodeVec.insert(ANVI, *AI) + 1;
- }
+AttributeList AttributeList::get(LLVMContext &C,
+ ArrayRef<AttributeList> Attrs) {
+ if (Attrs.empty())
+ return AttributeList();
+ if (Attrs.size() == 1)
+ return Attrs[0];
+
+ unsigned MaxSize = 0;
+ for (AttributeList List : Attrs)
+ MaxSize = std::max(MaxSize, List.getNumAttrSets());
+
+ // If every list was empty, there is no point in merging the lists.
+ if (MaxSize == 0)
+ return AttributeList();
+
+ SmallVector<AttributeSet, 8> NewAttrSets(MaxSize);
+ for (unsigned I = 0; I < MaxSize; ++I) {
+ AttrBuilder CurBuilder;
+ for (AttributeList List : Attrs)
+ CurBuilder.merge(List.getAttributes(I - 1));
+ NewAttrSets[I] = AttributeSet::get(C, CurBuilder);
}
- return getImpl(C, AttrNodeVec);
+ return getImpl(C, NewAttrSets);
}
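// [Editorial example: not part of the patch] Merging two lists, say
//
//   L1 = { fn: nounwind }                  (1 attribute set)
//   L2 = { ret: nonnull, arg0: zeroext }   (3 attribute sets)
//
// gives MaxSize == 3. Each array slot I merges getAttributes(I - 1) across
// all lists; for I == 0 the unsigned wraparound yields ~0U == FunctionIndex,
// so slot 0 correctly collects the function attributes. The result is
// { fn: nounwind, ret: nonnull, arg0: zeroext }.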
-AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const {
+AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
+ Attribute::AttrKind Kind) const {
if (hasAttribute(Index, Kind)) return *this;
- return addAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return addAttributes(C, Index, B);
}
-AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind, StringRef Value) const {
- llvm::AttrBuilder B;
+AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
+ StringRef Kind,
+ StringRef Value) const {
+ AttrBuilder B;
B.addAttribute(Kind, Value);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, B);
}
-AttributeSet AttributeSet::addAttribute(LLVMContext &C,
- ArrayRef<unsigned> Indices,
- Attribute A) const {
- unsigned I = 0, E = pImpl ? pImpl->getNumSlots() : 0;
- auto IdxI = Indices.begin(), IdxE = Indices.end();
- SmallVector<AttributeSet, 4> AttrSet;
-
- while (I != E && IdxI != IdxE) {
- if (getSlotIndex(I) < *IdxI)
- AttrSet.emplace_back(getSlotAttributes(I++));
- else if (getSlotIndex(I) > *IdxI)
- AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
- else {
- AttrBuilder B(getSlotAttributes(I), *IdxI);
- B.addAttribute(A);
- AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B));
- ++I;
- ++IdxI;
- }
- }
-
- while (I != E)
- AttrSet.emplace_back(getSlotAttributes(I++));
-
- while (IdxI != IdxE)
- AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
-
- return get(C, AttrSet);
+AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
+ Attribute A) const {
+ AttrBuilder B;
+ B.addAttribute(A);
+ return addAttributes(C, Index, B);
}
-AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
- AttributeSet Attrs) const {
- if (!pImpl) return Attrs;
- if (!Attrs.pImpl) return *this;
+AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
+ const AttrBuilder &B) const {
+ if (!B.hasAttributes())
+ return *this;
+
+ if (!pImpl)
+ return AttributeList::get(C, {{Index, AttributeSet::get(C, B)}});
#ifndef NDEBUG
// FIXME it is not obvious how this should work for alignment. For now, say
// we can't change a known alignment.
- unsigned OldAlign = getParamAlignment(Index);
- unsigned NewAlign = Attrs.getParamAlignment(Index);
+ unsigned OldAlign = getAttributes(Index).getAlignment();
+ unsigned NewAlign = B.getAlignment();
assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
"Attempt to change alignment!");
#endif
- // Add the attribute slots before the one we're trying to add.
- SmallVector<AttributeSet, 4> AttrSet;
- uint64_t NumAttrs = pImpl->getNumSlots();
- AttributeSet AS;
- uint64_t LastIndex = 0;
- for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
- if (getSlotIndex(I) >= Index) {
- if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
- break;
- }
- LastIndex = I + 1;
- AttrSet.push_back(getSlotAttributes(I));
- }
+ Index = attrIdxToArrayIdx(Index);
+ SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
+ if (Index >= AttrSets.size())
+ AttrSets.resize(Index + 1);
- // Now add the attribute into the correct slot. There may already be an
- // AttributeSet there.
- AttrBuilder B(AS, Index);
+ AttrBuilder Merged(AttrSets[Index]);
+ Merged.merge(B);
+ AttrSets[Index] = AttributeSet::get(C, Merged);
- for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
- if (Attrs.getSlotIndex(I) == Index) {
- for (AttributeSetImpl::iterator II = Attrs.pImpl->begin(I),
- IE = Attrs.pImpl->end(I); II != IE; ++II)
- B.addAttribute(*II);
- break;
- }
+ return getImpl(C, AttrSets);
+}
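// [Editorial sketch: not part of the patch] Typical use of addAttributes,
// assuming the post-patch API: attributes already at the index are merged,
// not replaced, and only a conflicting alignment trips the assertion above.
static AttributeList addReadOnlyToFn(LLVMContext &C, AttributeList AL) {
  AttrBuilder B;
  B.addAttribute(Attribute::ReadOnly);
  return AL.addAttributes(C, AttributeList::FunctionIndex, B);
}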
- AttrSet.push_back(AttributeSet::get(C, Index, B));
+AttributeList AttributeList::addParamAttribute(LLVMContext &C,
+ ArrayRef<unsigned> ArgNos,
+ Attribute A) const {
+ assert(std::is_sorted(ArgNos.begin(), ArgNos.end()));
- // Add the remaining attribute slots.
- for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
- AttrSet.push_back(getSlotAttributes(I));
+ SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
+ unsigned MaxIndex = attrIdxToArrayIdx(ArgNos.back() + FirstArgIndex);
+ if (MaxIndex >= AttrSets.size())
+ AttrSets.resize(MaxIndex + 1);
- return get(C, AttrSet);
+ for (unsigned ArgNo : ArgNos) {
+ unsigned Index = attrIdxToArrayIdx(ArgNo + FirstArgIndex);
+ AttrBuilder B(AttrSets[Index]);
+ B.addAttribute(A);
+ AttrSets[Index] = AttributeSet::get(C, B);
+ }
+
+ return getImpl(C, AttrSets);
}
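// [Editorial sketch: not part of the patch] Marking parameters 0 and 2
// nonnull in one call; ArgNos must be sorted, as asserted above.
static AttributeList markArgsNonNull(LLVMContext &C, AttributeList AL) {
  unsigned ArgNos[] = {0, 2};
  return AL.addParamAttribute(C, ArgNos,
                              Attribute::get(C, Attribute::NonNull));
}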
-AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
- Attribute::AttrKind Kind) const {
+AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
+ Attribute::AttrKind Kind) const {
if (!hasAttribute(Index, Kind)) return *this;
- return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, Index, B);
}
-AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
- StringRef Kind) const {
+AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
+ StringRef Kind) const {
if (!hasAttribute(Index, Kind)) return *this;
- return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+ AttrBuilder B;
+ B.addAttribute(Kind);
+ return removeAttributes(C, Index, B);
}
-AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
- AttributeSet Attrs) const {
- if (!pImpl) return AttributeSet();
- if (!Attrs.pImpl) return *this;
+AttributeList
+AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
+ const AttrBuilder &AttrsToRemove) const {
+ if (!pImpl)
+ return AttributeList();
// FIXME it is not obvious how this should work for alignment.
// For now, say we can't pass in alignment, which no current use does.
- assert(!Attrs.hasAttribute(Index, Attribute::Alignment) &&
- "Attempt to change alignment!");
-
- // Add the attribute slots before the one we're trying to add.
- SmallVector<AttributeSet, 4> AttrSet;
- uint64_t NumAttrs = pImpl->getNumSlots();
- AttributeSet AS;
- uint64_t LastIndex = 0;
- for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
- if (getSlotIndex(I) >= Index) {
- if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
- break;
- }
- LastIndex = I + 1;
- AttrSet.push_back(getSlotAttributes(I));
- }
-
- // Now remove the attribute from the correct slot. There may already be an
- // AttributeSet there.
- AttrBuilder B(AS, Index);
-
- for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
- if (Attrs.getSlotIndex(I) == Index) {
- B.removeAttributes(Attrs.pImpl->getSlotAttributes(I), Index);
- break;
- }
+ assert(!AttrsToRemove.hasAlignmentAttr() && "Attempt to change alignment!");
- AttrSet.push_back(AttributeSet::get(C, Index, B));
+ Index = attrIdxToArrayIdx(Index);
+ SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
+ if (Index >= AttrSets.size())
+ AttrSets.resize(Index + 1);
- // Add the remaining attribute slots.
- for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
- AttrSet.push_back(getSlotAttributes(I));
+ AttrBuilder B(AttrSets[Index]);
+ B.remove(AttrsToRemove);
+ AttrSets[Index] = AttributeSet::get(C, B);
- return get(C, AttrSet);
+ return getImpl(C, AttrSets);
}
-AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
- const AttrBuilder &Attrs) const {
- if (!pImpl) return AttributeSet();
-
- // FIXME it is not obvious how this should work for alignment.
- // For now, say we can't pass in alignment, which no current use does.
- assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
-
- // Add the attribute slots before the one we're trying to add.
- SmallVector<AttributeSet, 4> AttrSet;
- uint64_t NumAttrs = pImpl->getNumSlots();
- AttributeSet AS;
- uint64_t LastIndex = 0;
- for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
- if (getSlotIndex(I) >= Index) {
- if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
- break;
- }
- LastIndex = I + 1;
- AttrSet.push_back(getSlotAttributes(I));
- }
-
- // Now remove the attribute from the correct slot. There may already be an
- // AttributeSet there.
- AttrBuilder B(AS, Index);
- B.remove(Attrs);
-
- AttrSet.push_back(AttributeSet::get(C, Index, B));
-
- // Add the remaining attribute slots.
- for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
- AttrSet.push_back(getSlotAttributes(I));
-
- return get(C, AttrSet);
+AttributeList AttributeList::removeAttributes(LLVMContext &C,
+ unsigned WithoutIndex) const {
+ if (!pImpl)
+ return AttributeList();
+ WithoutIndex = attrIdxToArrayIdx(WithoutIndex);
+ if (WithoutIndex >= getNumAttrSets())
+ return *this;
+ SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
+ AttrSets[WithoutIndex] = AttributeSet();
+ return getImpl(C, AttrSets);
}
-AttributeSet AttributeSet::addDereferenceableAttr(LLVMContext &C, unsigned Index,
- uint64_t Bytes) const {
- llvm::AttrBuilder B;
+AttributeList AttributeList::addDereferenceableAttr(LLVMContext &C,
+ unsigned Index,
+ uint64_t Bytes) const {
+ AttrBuilder B;
B.addDereferenceableAttr(Bytes);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, B);
}
-AttributeSet AttributeSet::addDereferenceableOrNullAttr(LLVMContext &C,
- unsigned Index,
- uint64_t Bytes) const {
- llvm::AttrBuilder B;
+AttributeList
+AttributeList::addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
+ uint64_t Bytes) const {
+ AttrBuilder B;
B.addDereferenceableOrNullAttr(Bytes);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, B);
}
-AttributeSet
-AttributeSet::addAllocSizeAttr(LLVMContext &C, unsigned Index,
- unsigned ElemSizeArg,
- const Optional<unsigned> &NumElemsArg) {
- llvm::AttrBuilder B;
+AttributeList
+AttributeList::addAllocSizeAttr(LLVMContext &C, unsigned Index,
+ unsigned ElemSizeArg,
+ const Optional<unsigned> &NumElemsArg) {
+ AttrBuilder B;
B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
- return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+ return addAttributes(C, Index, B);
}
//===----------------------------------------------------------------------===//
-// AttributeSet Accessor Methods
+// AttributeList Accessor Methods
//===----------------------------------------------------------------------===//
-LLVMContext &AttributeSet::getContext() const {
- return pImpl->getContext();
-}
+LLVMContext &AttributeList::getContext() const { return pImpl->getContext(); }
-AttributeSet AttributeSet::getParamAttributes(unsigned Index) const {
- return pImpl && hasAttributes(Index) ?
- AttributeSet::get(pImpl->getContext(),
- ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
- std::make_pair(Index, getAttributes(Index)))) :
- AttributeSet();
+AttributeSet AttributeList::getParamAttributes(unsigned ArgNo) const {
+ return getAttributes(ArgNo + FirstArgIndex);
}
-AttributeSet AttributeSet::getRetAttributes() const {
- return pImpl && hasAttributes(ReturnIndex) ?
- AttributeSet::get(pImpl->getContext(),
- ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
- std::make_pair(ReturnIndex,
- getAttributes(ReturnIndex)))) :
- AttributeSet();
+AttributeSet AttributeList::getRetAttributes() const {
+ return getAttributes(ReturnIndex);
}
-AttributeSet AttributeSet::getFnAttributes() const {
- return pImpl && hasAttributes(FunctionIndex) ?
- AttributeSet::get(pImpl->getContext(),
- ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
- std::make_pair(FunctionIndex,
- getAttributes(FunctionIndex)))) :
- AttributeSet();
+AttributeSet AttributeList::getFnAttributes() const {
+ return getAttributes(FunctionIndex);
}
-bool AttributeSet::hasAttribute(unsigned Index, Attribute::AttrKind Kind) const{
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN && ASN->hasAttribute(Kind);
+bool AttributeList::hasAttribute(unsigned Index,
+ Attribute::AttrKind Kind) const {
+ return getAttributes(Index).hasAttribute(Kind);
}
-bool AttributeSet::hasAttribute(unsigned Index, StringRef Kind) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN && ASN->hasAttribute(Kind);
+bool AttributeList::hasAttribute(unsigned Index, StringRef Kind) const {
+ return getAttributes(Index).hasAttribute(Kind);
}
-bool AttributeSet::hasAttributes(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN && ASN->hasAttributes();
+bool AttributeList::hasAttributes(unsigned Index) const {
+ return getAttributes(Index).hasAttributes();
}
-bool AttributeSet::hasFnAttribute(Attribute::AttrKind Kind) const {
+bool AttributeList::hasFnAttribute(Attribute::AttrKind Kind) const {
return pImpl && pImpl->hasFnAttribute(Kind);
}
-bool AttributeSet::hasFnAttribute(StringRef Kind) const {
- return hasAttribute(AttributeSet::FunctionIndex, Kind);
+bool AttributeList::hasFnAttribute(StringRef Kind) const {
+ return hasAttribute(AttributeList::FunctionIndex, Kind);
+}
+
+bool AttributeList::hasParamAttribute(unsigned ArgNo,
+ Attribute::AttrKind Kind) const {
+ return hasAttribute(ArgNo + FirstArgIndex, Kind);
}
-bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr,
- unsigned *Index) const {
+bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr,
+ unsigned *Index) const {
if (!pImpl) return false;
- for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
- for (AttributeSetImpl::iterator II = pImpl->begin(I),
- IE = pImpl->end(I); II != IE; ++II)
- if (II->hasAttribute(Attr)) {
- if (Index) *Index = pImpl->getSlotIndex(I);
- return true;
- }
+ for (unsigned I = index_begin(), E = index_end(); I != E; ++I) {
+ if (hasAttribute(I, Attr)) {
+ if (Index)
+ *Index = I;
+ return true;
+ }
+ }
return false;
}
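// [Editorial sketch: not part of the patch] Typical query: the out-param
// comes back as FunctionIndex, ReturnIndex, or FirstArgIndex + ArgNo.
static bool findReturnedArg(AttributeList AL, unsigned &ArgNo) {
  unsigned Index;
  if (!AL.hasAttrSomewhere(Attribute::Returned, &Index) ||
      Index == AttributeList::FunctionIndex ||
      Index < AttributeList::FirstArgIndex)
    return false;
  ArgNo = Index - AttributeList::FirstArgIndex;
  return true;
}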
-Attribute AttributeSet::getAttribute(unsigned Index,
- Attribute::AttrKind Kind) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAttribute(Kind) : Attribute();
+Attribute AttributeList::getAttribute(unsigned Index,
+ Attribute::AttrKind Kind) const {
+ return getAttributes(Index).getAttribute(Kind);
}
-Attribute AttributeSet::getAttribute(unsigned Index,
- StringRef Kind) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAttribute(Kind) : Attribute();
+Attribute AttributeList::getAttribute(unsigned Index, StringRef Kind) const {
+ return getAttributes(Index).getAttribute(Kind);
}
-unsigned AttributeSet::getParamAlignment(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAlignment() : 0;
+unsigned AttributeList::getRetAlignment() const {
+ return getAttributes(ReturnIndex).getAlignment();
}
-unsigned AttributeSet::getStackAlignment(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getStackAlignment() : 0;
+unsigned AttributeList::getParamAlignment(unsigned ArgNo) const {
+ return getAttributes(ArgNo + FirstArgIndex).getAlignment();
}
-uint64_t AttributeSet::getDereferenceableBytes(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getDereferenceableBytes() : 0;
+unsigned AttributeList::getStackAlignment(unsigned Index) const {
+ return getAttributes(Index).getStackAlignment();
}
-uint64_t AttributeSet::getDereferenceableOrNullBytes(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getDereferenceableOrNullBytes() : 0;
+uint64_t AttributeList::getDereferenceableBytes(unsigned Index) const {
+ return getAttributes(Index).getDereferenceableBytes();
}
-std::pair<unsigned, Optional<unsigned>>
-AttributeSet::getAllocSizeArgs(unsigned Index) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAllocSizeArgs() : std::make_pair(0u, Optional<unsigned>(0u));
+uint64_t AttributeList::getDereferenceableOrNullBytes(unsigned Index) const {
+ return getAttributes(Index).getDereferenceableOrNullBytes();
}
-std::string AttributeSet::getAsString(unsigned Index, bool InAttrGrp) const {
- AttributeSetNode *ASN = getAttributes(Index);
- return ASN ? ASN->getAsString(InAttrGrp) : std::string("");
+std::pair<unsigned, Optional<unsigned>>
+AttributeList::getAllocSizeArgs(unsigned Index) const {
+ return getAttributes(Index).getAllocSizeArgs();
}
-AttributeSetNode *AttributeSet::getAttributes(unsigned Index) const {
- if (!pImpl) return nullptr;
-
- // Loop through to find the attribute node we want.
- for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
- if (pImpl->getSlotIndex(I) == Index)
- return pImpl->getSlotNode(I);
+std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const {
+ return getAttributes(Index).getAsString(InAttrGrp);
+}
- return nullptr;
+AttributeSet AttributeList::getAttributes(unsigned Index) const {
+ Index = attrIdxToArrayIdx(Index);
+ if (!pImpl || Index >= getNumAttrSets())
+ return AttributeSet();
+ return pImpl->begin()[Index];
}
-AttributeSet::iterator AttributeSet::begin(unsigned Slot) const {
- if (!pImpl)
- return ArrayRef<Attribute>().begin();
- return pImpl->begin(Slot);
+AttributeList::iterator AttributeList::begin() const {
+ return pImpl ? pImpl->begin() : nullptr;
}
-AttributeSet::iterator AttributeSet::end(unsigned Slot) const {
- if (!pImpl)
- return ArrayRef<Attribute>().end();
- return pImpl->end(Slot);
+AttributeList::iterator AttributeList::end() const {
+ return pImpl ? pImpl->end() : nullptr;
}
//===----------------------------------------------------------------------===//
-// AttributeSet Introspection Methods
+// AttributeList Introspection Methods
//===----------------------------------------------------------------------===//
-unsigned AttributeSet::getNumSlots() const {
- return pImpl ? pImpl->getNumSlots() : 0;
+unsigned AttributeList::getNumAttrSets() const {
+ return pImpl ? pImpl->NumAttrSets : 0;
}
-unsigned AttributeSet::getSlotIndex(unsigned Slot) const {
- assert(pImpl && Slot < pImpl->getNumSlots() &&
- "Slot # out of range!");
- return pImpl->getSlotIndex(Slot);
-}
-
-AttributeSet AttributeSet::getSlotAttributes(unsigned Slot) const {
- assert(pImpl && Slot < pImpl->getNumSlots() &&
- "Slot # out of range!");
- return pImpl->getSlotAttributes(Slot);
-}
-
-LLVM_DUMP_METHOD void AttributeSet::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void AttributeList::dump() const {
dbgs() << "PAL[\n";
- for (unsigned i = 0, e = getNumSlots(); i < e; ++i) {
- uint64_t Index = getSlotIndex(i);
- dbgs() << " { ";
- if (Index == ~0U)
- dbgs() << "~0U";
- else
- dbgs() << Index;
- dbgs() << " => " << getAsString(Index) << " }\n";
+ for (unsigned i = index_begin(), e = index_end(); i != e; ++i) {
+ if (getAttributes(i).hasAttributes())
+ dbgs() << " { " << i << " => " << getAsString(i) << " }\n";
}
dbgs() << "]\n";
}
+#endif
//===----------------------------------------------------------------------===//
// AttrBuilder Method Implementations
//===----------------------------------------------------------------------===//
-AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
- : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0),
- DerefOrNullBytes(0), AllocSizeArgs(0) {
- AttributeSetImpl *pImpl = AS.pImpl;
- if (!pImpl) return;
-
- for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) {
- if (pImpl->getSlotIndex(I) != Index) continue;
-
- for (AttributeSetImpl::iterator II = pImpl->begin(I),
- IE = pImpl->end(I); II != IE; ++II)
- addAttribute(*II);
+// FIXME: Remove this ctor, use AttributeSet.
+AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) {
+ AttributeSet AS = AL.getAttributes(Index);
+ for (const Attribute &A : AS)
+ addAttribute(A);
+}
- break;
- }
+AttrBuilder::AttrBuilder(AttributeSet AS) {
+ for (const Attribute &A : AS)
+ addAttribute(A);
}
void AttrBuilder::clear() {
@@ -1213,26 +1378,8 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
return *this;
}
-AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
- unsigned Slot = ~0U;
- for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
- if (A.getSlotIndex(I) == Index) {
- Slot = I;
- break;
- }
-
- assert(Slot != ~0U && "Couldn't find index in AttributeSet!");
-
- for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
- Attribute Attr = *I;
- if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
- removeAttribute(Attr.getKindAsEnum());
- } else {
- assert(Attr.isStringAttribute() && "Invalid attribute type!");
- removeAttribute(Attr.getKindAsString());
- }
- }
-
+AttrBuilder &AttrBuilder::removeAttributes(AttributeList A, uint64_t Index) {
+ remove(A.getAttributes(Index));
return *this;
}
@@ -1359,7 +1506,7 @@ bool AttrBuilder::overlaps(const AttrBuilder &B) const {
return true;
// Then check if any target dependent ones do.
- for (auto I : td_attrs())
+ for (const auto &I : td_attrs())
if (B.contains(I.first))
return true;
@@ -1374,24 +1521,16 @@ bool AttrBuilder::hasAttributes() const {
return !Attrs.none() || !TargetDepAttrs.empty();
}
-bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
- unsigned Slot = ~0U;
- for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
- if (A.getSlotIndex(I) == Index) {
- Slot = I;
- break;
- }
+bool AttrBuilder::hasAttributes(AttributeList AL, uint64_t Index) const {
+ AttributeSet AS = AL.getAttributes(Index);
- assert(Slot != ~0U && "Couldn't find the index!");
-
- for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
- Attribute Attr = *I;
+ for (Attribute Attr : AS) {
if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
- if (Attrs[I->getKindAsEnum()])
+ if (contains(Attr.getKindAsEnum()))
return true;
} else {
assert(Attr.isStringAttribute() && "Invalid attribute kind!");
- return TargetDepAttrs.find(Attr.getKindAsString())!=TargetDepAttrs.end();
+ return contains(Attr.getKindAsString());
}
}
@@ -1481,20 +1620,17 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
// If upgrading the SSP attribute, clear out the old SSP Attributes first.
// Having multiple SSP attributes doesn't actually hurt, but it adds useless
// clutter to the IR.
- AttrBuilder B;
- B.addAttribute(Attribute::StackProtect)
- .addAttribute(Attribute::StackProtectStrong)
- .addAttribute(Attribute::StackProtectReq);
- AttributeSet OldSSPAttr = AttributeSet::get(Caller.getContext(),
- AttributeSet::FunctionIndex,
- B);
+ AttrBuilder OldSSPAttr;
+ OldSSPAttr.addAttribute(Attribute::StackProtect)
+ .addAttribute(Attribute::StackProtectStrong)
+ .addAttribute(Attribute::StackProtectReq);
if (Callee.hasFnAttribute(Attribute::StackProtectReq)) {
- Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+ Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectReq);
} else if (Callee.hasFnAttribute(Attribute::StackProtectStrong) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq)) {
- Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+ Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
Caller.addFnAttr(Attribute::StackProtectStrong);
} else if (Callee.hasFnAttribute(Attribute::StackProtect) &&
!Caller.hasFnAttribute(Attribute::StackProtectReq) &&
@@ -1502,6 +1638,39 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
Caller.addFnAttr(Attribute::StackProtect);
}
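// [Editorial example: not part of the patch] In IR terms, inlining
//
//   define void @callee() sspstrong { ... }
//
// into
//
//   define void @caller() ssp { call void @callee() ... }
//
// strips the caller's weaker ssp/sspstrong/sspreq attributes and re-adds
// sspstrong, so the caller always ends up at the stronger of the two levels.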
+/// \brief If the inlined function required stack probes, then ensure that
+/// the calling function has those too.
+static void adjustCallerStackProbes(Function &Caller, const Function &Callee) {
+ if (!Caller.hasFnAttribute("probe-stack") &&
+ Callee.hasFnAttribute("probe-stack")) {
+ Caller.addFnAttr(Callee.getFnAttribute("probe-stack"));
+ }
+}
+
+/// \brief If the inlined function defines the size of the guard region
+/// on the stack, then ensure that the calling function defines a guard region
+/// that is no larger.
+static void
+adjustCallerStackProbeSize(Function &Caller, const Function &Callee) {
+ if (Callee.hasFnAttribute("stack-probe-size")) {
+ uint64_t CalleeStackProbeSize;
+ Callee.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, CalleeStackProbeSize);
+ if (Caller.hasFnAttribute("stack-probe-size")) {
+ uint64_t CallerStackProbeSize;
+ Caller.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, CallerStackProbeSize);
+ if (CallerStackProbeSize > CalleeStackProbeSize) {
+ Caller.addFnAttr(Callee.getFnAttribute("stack-probe-size"));
+ }
+ } else {
+ Caller.addFnAttr(Callee.getFnAttribute("stack-probe-size"));
+ }
+ }
+}
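// [Editorial example: not part of the patch] With caller
// "stack-probe-size"="8192" and callee "stack-probe-size"="4096", the caller
// is tightened to 4096; in the reverse case the caller keeps its 4096. Note
// that getAsInteger returns true on parse failure and its result is ignored
// here, so the logic assumes well-formed integer attribute values.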
+
#define GET_ATTR_COMPAT_FUNC
#include "AttributesCompatFunc.inc"
@@ -1510,7 +1679,6 @@ bool AttributeFuncs::areInlineCompatible(const Function &Caller,
return hasCompatibleFnAttrs(Caller, Callee);
}
-
void AttributeFuncs::mergeAttributesForInlining(Function &Caller,
const Function &Callee) {
mergeFnAttrs(Caller, Callee);
diff --git a/contrib/llvm/lib/IR/AutoUpgrade.cpp b/contrib/llvm/lib/IR/AutoUpgrade.cpp
index e3a7bae..a501799 100644
--- a/contrib/llvm/lib/IR/AutoUpgrade.cpp
+++ b/contrib/llvm/lib/IR/AutoUpgrade.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -33,10 +34,10 @@ using namespace llvm;
static void rename(GlobalValue *GV) { GV->setName(GV->getName() + ".old"); }
-// Upgrade the declarations of the SSE4.1 functions whose arguments have
+// Upgrade the declarations of the SSE4.1 ptest intrinsics whose arguments have
// changed their type from v4f32 to v2i64.
-static bool UpgradeSSE41Function(Function* F, Intrinsic::ID IID,
- Function *&NewFn) {
+static bool UpgradePTESTIntrinsic(Function* F, Intrinsic::ID IID,
+ Function *&NewFn) {
// Check whether this is an old version of the function, which received
// v4f32 arguments.
Type *Arg0Type = F->getFunctionType()->getParamType(0);
@@ -65,6 +66,270 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
return true;
}
+static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
+  // All of the intrinsic matches below should be marked with the LLVM
+  // version that started autoupgrading them. At some point in the future we
+  // would like to use this information to remove upgrade code for some older
+  // intrinsics. It is currently undecided how we will determine that future
+  // point.
+ if (Name.startswith("sse2.pcmpeq.") || // Added in 3.1
+ Name.startswith("sse2.pcmpgt.") || // Added in 3.1
+ Name.startswith("avx2.pcmpeq.") || // Added in 3.1
+ Name.startswith("avx2.pcmpgt.") || // Added in 3.1
+ Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
+ Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
+ Name == "sse.add.ss" || // Added in 4.0
+ Name == "sse2.add.sd" || // Added in 4.0
+ Name == "sse.sub.ss" || // Added in 4.0
+ Name == "sse2.sub.sd" || // Added in 4.0
+ Name == "sse.mul.ss" || // Added in 4.0
+ Name == "sse2.mul.sd" || // Added in 4.0
+ Name == "sse.div.ss" || // Added in 4.0
+ Name == "sse2.div.sd" || // Added in 4.0
+ Name == "sse41.pmaxsb" || // Added in 3.9
+ Name == "sse2.pmaxs.w" || // Added in 3.9
+ Name == "sse41.pmaxsd" || // Added in 3.9
+ Name == "sse2.pmaxu.b" || // Added in 3.9
+ Name == "sse41.pmaxuw" || // Added in 3.9
+ Name == "sse41.pmaxud" || // Added in 3.9
+ Name == "sse41.pminsb" || // Added in 3.9
+ Name == "sse2.pmins.w" || // Added in 3.9
+ Name == "sse41.pminsd" || // Added in 3.9
+ Name == "sse2.pminu.b" || // Added in 3.9
+ Name == "sse41.pminuw" || // Added in 3.9
+ Name == "sse41.pminud" || // Added in 3.9
+ Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0
+ Name.startswith("avx2.pmax") || // Added in 3.9
+ Name.startswith("avx2.pmin") || // Added in 3.9
+ Name.startswith("avx512.mask.pmax") || // Added in 4.0
+ Name.startswith("avx512.mask.pmin") || // Added in 4.0
+ Name.startswith("avx2.vbroadcast") || // Added in 3.8
+ Name.startswith("avx2.pbroadcast") || // Added in 3.8
+ Name.startswith("avx.vpermil.") || // Added in 3.1
+ Name.startswith("sse2.pshuf") || // Added in 3.9
+ Name.startswith("avx512.pbroadcast") || // Added in 3.9
+ Name.startswith("avx512.mask.broadcast.s") || // Added in 3.9
+ Name.startswith("avx512.mask.movddup") || // Added in 3.9
+ Name.startswith("avx512.mask.movshdup") || // Added in 3.9
+ Name.startswith("avx512.mask.movsldup") || // Added in 3.9
+ Name.startswith("avx512.mask.pshuf.d.") || // Added in 3.9
+ Name.startswith("avx512.mask.pshufl.w.") || // Added in 3.9
+ Name.startswith("avx512.mask.pshufh.w.") || // Added in 3.9
+ Name.startswith("avx512.mask.shuf.p") || // Added in 4.0
+ Name.startswith("avx512.mask.vpermil.p") || // Added in 3.9
+ Name.startswith("avx512.mask.perm.df.") || // Added in 3.9
+ Name.startswith("avx512.mask.perm.di.") || // Added in 3.9
+ Name.startswith("avx512.mask.punpckl") || // Added in 3.9
+ Name.startswith("avx512.mask.punpckh") || // Added in 3.9
+ Name.startswith("avx512.mask.unpckl.") || // Added in 3.9
+ Name.startswith("avx512.mask.unpckh.") || // Added in 3.9
+ Name.startswith("avx512.mask.pand.") || // Added in 3.9
+ Name.startswith("avx512.mask.pandn.") || // Added in 3.9
+ Name.startswith("avx512.mask.por.") || // Added in 3.9
+ Name.startswith("avx512.mask.pxor.") || // Added in 3.9
+ Name.startswith("avx512.mask.and.") || // Added in 3.9
+ Name.startswith("avx512.mask.andn.") || // Added in 3.9
+ Name.startswith("avx512.mask.or.") || // Added in 3.9
+ Name.startswith("avx512.mask.xor.") || // Added in 3.9
+ Name.startswith("avx512.mask.padd.") || // Added in 4.0
+ Name.startswith("avx512.mask.psub.") || // Added in 4.0
+ Name.startswith("avx512.mask.pmull.") || // Added in 4.0
+ Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0
+ Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0
+ Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0
+ Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0
+ Name.startswith("avx512.mask.packsswb.") || // Added in 5.0
+ Name.startswith("avx512.mask.packssdw.") || // Added in 5.0
+ Name.startswith("avx512.mask.packuswb.") || // Added in 5.0
+ Name.startswith("avx512.mask.packusdw.") || // Added in 5.0
+ Name.startswith("avx512.mask.cmp.b") || // Added in 5.0
+ Name.startswith("avx512.mask.cmp.d") || // Added in 5.0
+ Name.startswith("avx512.mask.cmp.q") || // Added in 5.0
+ Name.startswith("avx512.mask.cmp.w") || // Added in 5.0
+ Name.startswith("avx512.mask.ucmp.") || // Added in 5.0
+ Name == "avx512.mask.add.pd.128" || // Added in 4.0
+ Name == "avx512.mask.add.pd.256" || // Added in 4.0
+ Name == "avx512.mask.add.ps.128" || // Added in 4.0
+ Name == "avx512.mask.add.ps.256" || // Added in 4.0
+ Name == "avx512.mask.div.pd.128" || // Added in 4.0
+ Name == "avx512.mask.div.pd.256" || // Added in 4.0
+ Name == "avx512.mask.div.ps.128" || // Added in 4.0
+ Name == "avx512.mask.div.ps.256" || // Added in 4.0
+ Name == "avx512.mask.mul.pd.128" || // Added in 4.0
+ Name == "avx512.mask.mul.pd.256" || // Added in 4.0
+ Name == "avx512.mask.mul.ps.128" || // Added in 4.0
+ Name == "avx512.mask.mul.ps.256" || // Added in 4.0
+ Name == "avx512.mask.sub.pd.128" || // Added in 4.0
+ Name == "avx512.mask.sub.pd.256" || // Added in 4.0
+ Name == "avx512.mask.sub.ps.128" || // Added in 4.0
+ Name == "avx512.mask.sub.ps.256" || // Added in 4.0
+ Name == "avx512.mask.max.pd.128" || // Added in 5.0
+ Name == "avx512.mask.max.pd.256" || // Added in 5.0
+ Name == "avx512.mask.max.ps.128" || // Added in 5.0
+ Name == "avx512.mask.max.ps.256" || // Added in 5.0
+ Name == "avx512.mask.min.pd.128" || // Added in 5.0
+ Name == "avx512.mask.min.pd.256" || // Added in 5.0
+ Name == "avx512.mask.min.ps.128" || // Added in 5.0
+ Name == "avx512.mask.min.ps.256" || // Added in 5.0
+ Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
+ Name.startswith("avx512.mask.psll.d") || // Added in 4.0
+ Name.startswith("avx512.mask.psll.q") || // Added in 4.0
+ Name.startswith("avx512.mask.psll.w") || // Added in 4.0
+ Name.startswith("avx512.mask.psra.d") || // Added in 4.0
+ Name.startswith("avx512.mask.psra.q") || // Added in 4.0
+ Name.startswith("avx512.mask.psra.w") || // Added in 4.0
+ Name.startswith("avx512.mask.psrl.d") || // Added in 4.0
+ Name.startswith("avx512.mask.psrl.q") || // Added in 4.0
+ Name.startswith("avx512.mask.psrl.w") || // Added in 4.0
+ Name.startswith("avx512.mask.pslli") || // Added in 4.0
+ Name.startswith("avx512.mask.psrai") || // Added in 4.0
+ Name.startswith("avx512.mask.psrli") || // Added in 4.0
+ Name.startswith("avx512.mask.psllv") || // Added in 4.0
+ Name.startswith("avx512.mask.psrav") || // Added in 4.0
+ Name.startswith("avx512.mask.psrlv") || // Added in 4.0
+ Name.startswith("sse41.pmovsx") || // Added in 3.8
+ Name.startswith("sse41.pmovzx") || // Added in 3.9
+ Name.startswith("avx2.pmovsx") || // Added in 3.9
+ Name.startswith("avx2.pmovzx") || // Added in 3.9
+ Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
+ Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
+ Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
+ Name == "sse2.cvtdq2pd" || // Added in 3.9
+ Name == "sse2.cvtps2pd" || // Added in 3.9
+ Name == "avx.cvtdq2.pd.256" || // Added in 3.9
+ Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
+ Name.startswith("avx.vinsertf128.") || // Added in 3.7
+ Name == "avx2.vinserti128" || // Added in 3.7
+ Name.startswith("avx512.mask.insert") || // Added in 4.0
+ Name.startswith("avx.vextractf128.") || // Added in 3.7
+ Name == "avx2.vextracti128" || // Added in 3.7
+ Name.startswith("avx512.mask.vextract") || // Added in 4.0
+ Name.startswith("sse4a.movnt.") || // Added in 3.9
+ Name.startswith("avx.movnt.") || // Added in 3.2
+ Name.startswith("avx512.storent.") || // Added in 3.9
+ Name == "sse41.movntdqa" || // Added in 5.0
+ Name == "avx2.movntdqa" || // Added in 5.0
+ Name == "avx512.movntdqa" || // Added in 5.0
+ Name == "sse2.storel.dq" || // Added in 3.9
+ Name.startswith("sse.storeu.") || // Added in 3.9
+ Name.startswith("sse2.storeu.") || // Added in 3.9
+ Name.startswith("avx.storeu.") || // Added in 3.9
+ Name.startswith("avx512.mask.storeu.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.p") || // Added in 3.9
+ Name.startswith("avx512.mask.store.b.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.w.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.d.") || // Added in 3.9
+ Name.startswith("avx512.mask.store.q.") || // Added in 3.9
+ Name.startswith("avx512.mask.loadu.") || // Added in 3.9
+ Name.startswith("avx512.mask.load.") || // Added in 3.9
+ Name == "sse42.crc32.64.8" || // Added in 3.4
+ Name.startswith("avx.vbroadcast.s") || // Added in 3.5
+ Name.startswith("avx512.mask.palignr.") || // Added in 3.9
+ Name.startswith("avx512.mask.valign.") || // Added in 4.0
+ Name.startswith("sse2.psll.dq") || // Added in 3.7
+ Name.startswith("sse2.psrl.dq") || // Added in 3.7
+ Name.startswith("avx2.psll.dq") || // Added in 3.7
+ Name.startswith("avx2.psrl.dq") || // Added in 3.7
+ Name.startswith("avx512.psll.dq") || // Added in 3.9
+ Name.startswith("avx512.psrl.dq") || // Added in 3.9
+ Name == "sse41.pblendw" || // Added in 3.7
+ Name.startswith("sse41.blendp") || // Added in 3.7
+ Name.startswith("avx.blend.p") || // Added in 3.7
+ Name == "avx2.pblendw" || // Added in 3.7
+ Name.startswith("avx2.pblendd.") || // Added in 3.7
+ Name.startswith("avx.vbroadcastf128") || // Added in 4.0
+ Name == "avx2.vbroadcasti128" || // Added in 3.7
+ Name == "xop.vpcmov" || // Added in 3.8
+ Name == "xop.vpcmov.256" || // Added in 5.0
+ Name.startswith("avx512.mask.move.s") || // Added in 4.0
+ Name.startswith("avx512.cvtmask2") || // Added in 5.0
+ (Name.startswith("xop.vpcom") && // Added in 3.2
+ F->arg_size() == 2))
+ return true;
+
+ return false;
+}
+
+static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name,
+ Function *&NewFn) {
+ // Only handle intrinsics that start with "x86.".
+ if (!Name.startswith("x86."))
+ return false;
+ // Remove "x86." prefix.
+ Name = Name.substr(4);
+
+ if (ShouldUpgradeX86Intrinsic(F, Name)) {
+ NewFn = nullptr;
+ return true;
+ }
+
+ // SSE4.1 ptest functions may have an old signature.
+ if (Name.startswith("sse41.ptest")) { // Added in 3.2
+ if (Name.substr(11) == "c")
+ return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestc, NewFn);
+ if (Name.substr(11) == "z")
+ return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestz, NewFn);
+ if (Name.substr(11) == "nzc")
+ return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
+ }
+ // Several blend and other instructions with masks used the wrong number of
+ // bits.
+ if (Name == "sse41.insertps") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
+ NewFn);
+ if (Name == "sse41.dppd") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
+ NewFn);
+ if (Name == "sse41.dpps") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
+ NewFn);
+ if (Name == "sse41.mpsadbw") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
+ NewFn);
+ if (Name == "avx.dp.ps.256") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
+ NewFn);
+ if (Name == "avx2.mpsadbw") // Added in 3.6
+ return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
+ NewFn);
+
+ // frcz.ss/sd may need to have an argument dropped. Added in 3.2
+ if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::x86_xop_vfrcz_ss);
+ return true;
+ }
+ if (Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::x86_xop_vfrcz_sd);
+ return true;
+ }
+ // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
+ if (Name.startswith("xop.vpermil2")) { // Added in 3.9
+ auto Idx = F->getFunctionType()->getParamType(2);
+ if (Idx->isFPOrFPVectorTy()) {
+ rename(F);
+ unsigned IdxSize = Idx->getPrimitiveSizeInBits();
+ unsigned EltSize = Idx->getScalarSizeInBits();
+ Intrinsic::ID Permil2ID;
+ if (EltSize == 64 && IdxSize == 128)
+ Permil2ID = Intrinsic::x86_xop_vpermil2pd;
+ else if (EltSize == 32 && IdxSize == 128)
+ Permil2ID = Intrinsic::x86_xop_vpermil2ps;
+ else if (EltSize == 64 && IdxSize == 256)
+ Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
+ else
+ Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
+ return true;
+ }
+ }
+
+ return false;
+}
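// [Editorial note: not part of the patch] Two return shapes are used above:
// a true return with NewFn == nullptr means "expand the call in place in
// UpgradeIntrinsicCall" (the ShouldUpgradeX86Intrinsic path), while a
// non-null NewFn means "rewrite calls to target this new declaration" (the
// ptest/insertps/vfrcz/vpermil2 paths). For example, llvm.x86.sse2.pcmpeq.b
// takes the first path and an old-signature llvm.x86.sse41.ptestc the second.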
+
static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
assert(F && "Illegal to upgrade a non-existent Function.");
@@ -155,26 +420,31 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
}
- case 'i': {
- if (Name.startswith("invariant.start")) {
+ case 'i':
+ case 'l': {
+ bool IsLifetimeStart = Name.startswith("lifetime.start");
+ if (IsLifetimeStart || Name.startswith("invariant.start")) {
+ Intrinsic::ID ID = IsLifetimeStart ?
+ Intrinsic::lifetime_start : Intrinsic::invariant_start;
auto Args = F->getFunctionType()->params();
Type* ObjectPtr[1] = {Args[1]};
- if (F->getName() !=
- Intrinsic::getName(Intrinsic::invariant_start, ObjectPtr)) {
+ if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
rename(F);
- NewFn = Intrinsic::getDeclaration(
- F->getParent(), Intrinsic::invariant_start, ObjectPtr);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
return true;
}
}
- if (Name.startswith("invariant.end")) {
+
+ bool IsLifetimeEnd = Name.startswith("lifetime.end");
+ if (IsLifetimeEnd || Name.startswith("invariant.end")) {
+ Intrinsic::ID ID = IsLifetimeEnd ?
+ Intrinsic::lifetime_end : Intrinsic::invariant_end;
+
auto Args = F->getFunctionType()->params();
- Type* ObjectPtr[1] = {Args[2]};
- if (F->getName() !=
- Intrinsic::getName(Intrinsic::invariant_end, ObjectPtr)) {
+ Type* ObjectPtr[1] = {Args[IsLifetimeEnd ? 1 : 2]};
+ if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::invariant_end, ObjectPtr);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
return true;
}
}
@@ -202,18 +472,72 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return true;
}
}
+  // Rename gather/scatter intrinsics with no address space overloading
+  // to the new overload, which includes an address space.
+ if (Name.startswith("masked.gather.")) {
+ Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()};
+ if (F->getName() != Intrinsic::getName(Intrinsic::masked_gather, Tys)) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::masked_gather, Tys);
+ return true;
+ }
+ }
+ if (Name.startswith("masked.scatter.")) {
+ auto Args = F->getFunctionType()->params();
+ Type *Tys[] = {Args[0], Args[1]};
+ if (F->getName() != Intrinsic::getName(Intrinsic::masked_scatter, Tys)) {
+ rename(F);
+ NewFn = Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::masked_scatter, Tys);
+ return true;
+ }
+ }
break;
}
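// [Editorial example: not part of the patch] The gather/scatter rename above
// adds the pointer type to the mangled name, e.g. (spelling per
// Intrinsic::getName, shown here as an assumption):
//
//   llvm.masked.gather.v2f64          (old, no address-space overload)
//   llvm.masked.gather.v2f64.v2p0f64  (new, pointer type included)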
+ case 'n': {
+ if (Name.startswith("nvvm.")) {
+ Name = Name.substr(5);
+
+ // The following nvvm intrinsics correspond exactly to an LLVM intrinsic.
+ Intrinsic::ID IID = StringSwitch<Intrinsic::ID>(Name)
+ .Cases("brev32", "brev64", Intrinsic::bitreverse)
+ .Case("clz.i", Intrinsic::ctlz)
+ .Case("popc.i", Intrinsic::ctpop)
+ .Default(Intrinsic::not_intrinsic);
+ if (IID != Intrinsic::not_intrinsic && F->arg_size() == 1) {
+ NewFn = Intrinsic::getDeclaration(F->getParent(), IID,
+ {F->getReturnType()});
+ return true;
+ }
+ // The following nvvm intrinsics correspond exactly to an LLVM idiom, but
+ // not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
+ //
+ // TODO: We could add lohi.i2d.
+ bool Expand = StringSwitch<bool>(Name)
+ .Cases("abs.i", "abs.ll", true)
+ .Cases("clz.ll", "popc.ll", "h2f", true)
+ .Cases("max.i", "max.ll", "max.ui", "max.ull", true)
+ .Cases("min.i", "min.ll", "min.ui", "min.ull", true)
+ .Default(false);
+ if (Expand) {
+ NewFn = nullptr;
+ return true;
+ }
+ }
+ break;
+ }
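// [Editorial example: not part of the patch] Under the mapping above,
// llvm.nvvm.brev32 becomes llvm.bitreverse.i32 and llvm.nvvm.clz.i becomes
// llvm.ctlz.i32 (ctlz's extra i1 flag is presumably supplied when the call
// itself is rewritten). Names like llvm.nvvm.max.i set NewFn to null and are
// expanded in UpgradeIntrinsicCall, plausibly to an icmp plus select idiom.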
case 'o':
// We only need to change the name to match the mangling including the
// address space.
- if (F->arg_size() == 2 && Name.startswith("objectsize.")) {
+ if (Name.startswith("objectsize.")) {
Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
- if (F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
+ if (F->arg_size() == 2 ||
+ F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::objectsize, Tys);
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::objectsize,
+ Tys);
return true;
}
}
@@ -226,236 +550,15 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
}
break;
- case 'x': {
- bool IsX86 = Name.startswith("x86.");
- if (IsX86)
- Name = Name.substr(4);
-
- // All of the intrinsics matches below should be marked with which llvm
- // version started autoupgrading them. At some point in the future we would
- // like to use this information to remove upgrade code for some older
- // intrinsics. It is currently undecided how we will determine that future
- // point.
- if (IsX86 &&
- (Name.startswith("sse2.pcmpeq.") || // Added in 3.1
- Name.startswith("sse2.pcmpgt.") || // Added in 3.1
- Name.startswith("avx2.pcmpeq.") || // Added in 3.1
- Name.startswith("avx2.pcmpgt.") || // Added in 3.1
- Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
- Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
- Name == "sse.add.ss" || // Added in 4.0
- Name == "sse2.add.sd" || // Added in 4.0
- Name == "sse.sub.ss" || // Added in 4.0
- Name == "sse2.sub.sd" || // Added in 4.0
- Name == "sse.mul.ss" || // Added in 4.0
- Name == "sse2.mul.sd" || // Added in 4.0
- Name == "sse.div.ss" || // Added in 4.0
- Name == "sse2.div.sd" || // Added in 4.0
- Name == "sse41.pmaxsb" || // Added in 3.9
- Name == "sse2.pmaxs.w" || // Added in 3.9
- Name == "sse41.pmaxsd" || // Added in 3.9
- Name == "sse2.pmaxu.b" || // Added in 3.9
- Name == "sse41.pmaxuw" || // Added in 3.9
- Name == "sse41.pmaxud" || // Added in 3.9
- Name == "sse41.pminsb" || // Added in 3.9
- Name == "sse2.pmins.w" || // Added in 3.9
- Name == "sse41.pminsd" || // Added in 3.9
- Name == "sse2.pminu.b" || // Added in 3.9
- Name == "sse41.pminuw" || // Added in 3.9
- Name == "sse41.pminud" || // Added in 3.9
- Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0
- Name.startswith("avx2.pmax") || // Added in 3.9
- Name.startswith("avx2.pmin") || // Added in 3.9
- Name.startswith("avx512.mask.pmax") || // Added in 4.0
- Name.startswith("avx512.mask.pmin") || // Added in 4.0
- Name.startswith("avx2.vbroadcast") || // Added in 3.8
- Name.startswith("avx2.pbroadcast") || // Added in 3.8
- Name.startswith("avx.vpermil.") || // Added in 3.1
- Name.startswith("sse2.pshuf") || // Added in 3.9
- Name.startswith("avx512.pbroadcast") || // Added in 3.9
- Name.startswith("avx512.mask.broadcast.s") || // Added in 3.9
- Name.startswith("avx512.mask.movddup") || // Added in 3.9
- Name.startswith("avx512.mask.movshdup") || // Added in 3.9
- Name.startswith("avx512.mask.movsldup") || // Added in 3.9
- Name.startswith("avx512.mask.pshuf.d.") || // Added in 3.9
- Name.startswith("avx512.mask.pshufl.w.") || // Added in 3.9
- Name.startswith("avx512.mask.pshufh.w.") || // Added in 3.9
- Name.startswith("avx512.mask.shuf.p") || // Added in 4.0
- Name.startswith("avx512.mask.vpermil.p") || // Added in 3.9
- Name.startswith("avx512.mask.perm.df.") || // Added in 3.9
- Name.startswith("avx512.mask.perm.di.") || // Added in 3.9
- Name.startswith("avx512.mask.punpckl") || // Added in 3.9
- Name.startswith("avx512.mask.punpckh") || // Added in 3.9
- Name.startswith("avx512.mask.unpckl.") || // Added in 3.9
- Name.startswith("avx512.mask.unpckh.") || // Added in 3.9
- Name.startswith("avx512.mask.pand.") || // Added in 3.9
- Name.startswith("avx512.mask.pandn.") || // Added in 3.9
- Name.startswith("avx512.mask.por.") || // Added in 3.9
- Name.startswith("avx512.mask.pxor.") || // Added in 3.9
- Name.startswith("avx512.mask.and.") || // Added in 3.9
- Name.startswith("avx512.mask.andn.") || // Added in 3.9
- Name.startswith("avx512.mask.or.") || // Added in 3.9
- Name.startswith("avx512.mask.xor.") || // Added in 3.9
- Name.startswith("avx512.mask.padd.") || // Added in 4.0
- Name.startswith("avx512.mask.psub.") || // Added in 4.0
- Name.startswith("avx512.mask.pmull.") || // Added in 4.0
- Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0
- Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0
- Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0
- Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0
- Name == "avx512.mask.add.pd.128" || // Added in 4.0
- Name == "avx512.mask.add.pd.256" || // Added in 4.0
- Name == "avx512.mask.add.ps.128" || // Added in 4.0
- Name == "avx512.mask.add.ps.256" || // Added in 4.0
- Name == "avx512.mask.div.pd.128" || // Added in 4.0
- Name == "avx512.mask.div.pd.256" || // Added in 4.0
- Name == "avx512.mask.div.ps.128" || // Added in 4.0
- Name == "avx512.mask.div.ps.256" || // Added in 4.0
- Name == "avx512.mask.mul.pd.128" || // Added in 4.0
- Name == "avx512.mask.mul.pd.256" || // Added in 4.0
- Name == "avx512.mask.mul.ps.128" || // Added in 4.0
- Name == "avx512.mask.mul.ps.256" || // Added in 4.0
- Name == "avx512.mask.sub.pd.128" || // Added in 4.0
- Name == "avx512.mask.sub.pd.256" || // Added in 4.0
- Name == "avx512.mask.sub.ps.128" || // Added in 4.0
- Name == "avx512.mask.sub.ps.256" || // Added in 4.0
- Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
- Name.startswith("avx512.mask.psll.d") || // Added in 4.0
- Name.startswith("avx512.mask.psll.q") || // Added in 4.0
- Name.startswith("avx512.mask.psll.w") || // Added in 4.0
- Name.startswith("avx512.mask.psra.d") || // Added in 4.0
- Name.startswith("avx512.mask.psra.q") || // Added in 4.0
- Name.startswith("avx512.mask.psra.w") || // Added in 4.0
- Name.startswith("avx512.mask.psrl.d") || // Added in 4.0
- Name.startswith("avx512.mask.psrl.q") || // Added in 4.0
- Name.startswith("avx512.mask.psrl.w") || // Added in 4.0
- Name.startswith("avx512.mask.pslli") || // Added in 4.0
- Name.startswith("avx512.mask.psrai") || // Added in 4.0
- Name.startswith("avx512.mask.psrli") || // Added in 4.0
- Name.startswith("avx512.mask.psllv") || // Added in 4.0
- Name.startswith("avx512.mask.psrav") || // Added in 4.0
- Name.startswith("avx512.mask.psrlv") || // Added in 4.0
- Name.startswith("sse41.pmovsx") || // Added in 3.8
- Name.startswith("sse41.pmovzx") || // Added in 3.9
- Name.startswith("avx2.pmovsx") || // Added in 3.9
- Name.startswith("avx2.pmovzx") || // Added in 3.9
- Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
- Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
- Name == "sse2.cvtdq2pd" || // Added in 3.9
- Name == "sse2.cvtps2pd" || // Added in 3.9
- Name == "avx.cvtdq2.pd.256" || // Added in 3.9
- Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
- Name.startswith("avx.vinsertf128.") || // Added in 3.7
- Name == "avx2.vinserti128" || // Added in 3.7
- Name.startswith("avx512.mask.insert") || // Added in 4.0
- Name.startswith("avx.vextractf128.") || // Added in 3.7
- Name == "avx2.vextracti128" || // Added in 3.7
- Name.startswith("avx512.mask.vextract") || // Added in 4.0
- Name.startswith("sse4a.movnt.") || // Added in 3.9
- Name.startswith("avx.movnt.") || // Added in 3.2
- Name.startswith("avx512.storent.") || // Added in 3.9
- Name == "sse2.storel.dq" || // Added in 3.9
- Name.startswith("sse.storeu.") || // Added in 3.9
- Name.startswith("sse2.storeu.") || // Added in 3.9
- Name.startswith("avx.storeu.") || // Added in 3.9
- Name.startswith("avx512.mask.storeu.") || // Added in 3.9
- Name.startswith("avx512.mask.store.p") || // Added in 3.9
- Name.startswith("avx512.mask.store.b.") || // Added in 3.9
- Name.startswith("avx512.mask.store.w.") || // Added in 3.9
- Name.startswith("avx512.mask.store.d.") || // Added in 3.9
- Name.startswith("avx512.mask.store.q.") || // Added in 3.9
- Name.startswith("avx512.mask.loadu.") || // Added in 3.9
- Name.startswith("avx512.mask.load.") || // Added in 3.9
- Name == "sse42.crc32.64.8" || // Added in 3.4
- Name.startswith("avx.vbroadcast.s") || // Added in 3.5
- Name.startswith("avx512.mask.palignr.") || // Added in 3.9
- Name.startswith("avx512.mask.valign.") || // Added in 4.0
- Name.startswith("sse2.psll.dq") || // Added in 3.7
- Name.startswith("sse2.psrl.dq") || // Added in 3.7
- Name.startswith("avx2.psll.dq") || // Added in 3.7
- Name.startswith("avx2.psrl.dq") || // Added in 3.7
- Name.startswith("avx512.psll.dq") || // Added in 3.9
- Name.startswith("avx512.psrl.dq") || // Added in 3.9
- Name == "sse41.pblendw" || // Added in 3.7
- Name.startswith("sse41.blendp") || // Added in 3.7
- Name.startswith("avx.blend.p") || // Added in 3.7
- Name == "avx2.pblendw" || // Added in 3.7
- Name.startswith("avx2.pblendd.") || // Added in 3.7
- Name.startswith("avx.vbroadcastf128") || // Added in 4.0
- Name == "avx2.vbroadcasti128" || // Added in 3.7
- Name == "xop.vpcmov" || // Added in 3.8
- Name.startswith("avx512.mask.move.s") || // Added in 4.0
- (Name.startswith("xop.vpcom") && // Added in 3.2
- F->arg_size() == 2))) {
- NewFn = nullptr;
+ case 'x':
+ if (UpgradeX86IntrinsicFunction(F, Name, NewFn))
return true;
- }
- // SSE4.1 ptest functions may have an old signature.
- if (IsX86 && Name.startswith("sse41.ptest")) { // Added in 3.2
- if (Name.substr(11) == "c")
- return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn);
- if (Name.substr(11) == "z")
- return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn);
- if (Name.substr(11) == "nzc")
- return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
- }
- // Several blend and other instructions with masks used the wrong number of
- // bits.
- if (IsX86 && Name == "sse41.insertps") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
- NewFn);
- if (IsX86 && Name == "sse41.dppd") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
- NewFn);
- if (IsX86 && Name == "sse41.dpps") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
- NewFn);
- if (IsX86 && Name == "sse41.mpsadbw") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
- NewFn);
- if (IsX86 && Name == "avx.dp.ps.256") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
- NewFn);
- if (IsX86 && Name == "avx2.mpsadbw") // Added in 3.6
- return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
- NewFn);
-
- // frcz.ss/sd may need to have an argument dropped. Added in 3.2
- if (IsX86 && Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
- rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::x86_xop_vfrcz_ss);
- return true;
- }
- if (IsX86 && Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) {
- rename(F);
- NewFn = Intrinsic::getDeclaration(F->getParent(),
- Intrinsic::x86_xop_vfrcz_sd);
- return true;
- }
- // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
- if (IsX86 && Name.startswith("xop.vpermil2")) { // Added in 3.9
- auto Params = F->getFunctionType()->params();
- auto Idx = Params[2];
- if (Idx->getScalarType()->isFloatingPointTy()) {
- rename(F);
- unsigned IdxSize = Idx->getPrimitiveSizeInBits();
- unsigned EltSize = Idx->getScalarSizeInBits();
- Intrinsic::ID Permil2ID;
- if (EltSize == 64 && IdxSize == 128)
- Permil2ID = Intrinsic::x86_xop_vpermil2pd;
- else if (EltSize == 32 && IdxSize == 128)
- Permil2ID = Intrinsic::x86_xop_vpermil2ps;
- else if (EltSize == 64 && IdxSize == 256)
- Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
- else
- Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
- NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
- return true;
- }
- }
- break;
}
+ // Remangle our intrinsic since we upgraded the mangling.
+ auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F);
+ if (Result != None) {
+ NewFn = Result.getValue();
+ return true;
}
// This may not belong here. This function is effectively being overloaded
@@ -685,12 +788,30 @@ static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI,
}
static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
- ICmpInst::Predicate Pred) {
+ unsigned CC, bool Signed) {
Value *Op0 = CI.getArgOperand(0);
unsigned NumElts = Op0->getType()->getVectorNumElements();
- Value *Cmp = Builder.CreateICmp(Pred, Op0, CI.getArgOperand(1));
- Value *Mask = CI.getArgOperand(2);
+ Value *Cmp;
+ if (CC == 3) {
+ Cmp = Constant::getNullValue(
+ llvm::VectorType::get(Builder.getInt1Ty(), NumElts));
+ } else if (CC == 7) {
+ Cmp = Constant::getAllOnesValue(
+ llvm::VectorType::get(Builder.getInt1Ty(), NumElts));
+ } else {
+ ICmpInst::Predicate Pred;
+ switch (CC) {
+ default: llvm_unreachable("Unknown condition code");
+ case 0: Pred = ICmpInst::ICMP_EQ; break;
+ case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
+ case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
+ case 4: Pred = ICmpInst::ICMP_NE; break;
+ case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
+ case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
+ }
+ Cmp = Builder.CreateICmp(Pred, Op0, CI.getArgOperand(1));
+ }
+
+ Value *Mask = CI.getArgOperand(CI.getNumArgOperands() - 1);
const auto *C = dyn_cast<Constant>(Mask);
if (!C || !C->isAllOnesValue())
Cmp = Builder.CreateAnd(Cmp, getX86MaskVec(Builder, Mask, NumElts));
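
The immediate decoded above follows the x86 vector-compare condition-code
encoding: 0=eq, 1=lt, 2=le, 3=always-false, 4=ne, 5=ge, 6=gt, 7=always-true,
with 3 and 7 handled as constant masks rather than real compares. A minimal
sketch of the same mapping as a standalone helper (hypothetical, for
illustration only):

    // CC -> ICmp predicate; signed or unsigned flavor chosen by the caller.
    static ICmpInst::Predicate decodeX86CmpCC(unsigned CC, bool Signed) {
      switch (CC) {
      case 0: return ICmpInst::ICMP_EQ;
      case 1: return Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
      case 2: return Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
      case 4: return ICmpInst::ICMP_NE;
      case 5: return Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
      case 6: return Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
      default: llvm_unreachable("CC 3 and 7 are folded to constant masks");
      }
    }
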
@@ -733,6 +854,15 @@ static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
return Builder.CreateInsertElement(A, Select, (uint64_t)0);
}
+
+static Value *UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) {
+ Value *Op = CI.getArgOperand(0);
+ Type *ReturnTy = CI.getType();
+ unsigned NumElts = CI.getType()->getVectorNumElements();
+ Value *Mask = getX86MaskVec(Builder, Op, NumElts);
+ return Builder.CreateSExt(Mask, ReturnTy, "vpmovm2");
+}
+
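
UpgradeMaskToInt above expands the avx512.cvtmask2* intrinsics: the scalar
mask becomes an <N x i1> vector (via getX86MaskVec, defined earlier in this
file) and is then sign-extended so every set bit yields an all-ones lane. A
sketch of the effect in IR terms (hypothetical values):

    // i8 mask 0b00000101, old result type <8 x i16>:
    //   %vec = bitcast i8 %mask to <8 x i1>
    //   %rep = sext <8 x i1> %vec to <8 x i16> ; lanes 0,2 -> -1, rest -> 0
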
/// Upgrade a call to an old intrinsic. All argument and return casting must be
/// provided to seamlessly integrate with existing context.
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -753,6 +883,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
bool IsX86 = Name.startswith("x86.");
if (IsX86)
Name = Name.substr(4);
+ bool IsNVVM = Name.startswith("nvvm.");
+ if (IsNVVM)
+ Name = Name.substr(5);
if (IsX86 && Name.startswith("sse4a.movnt.")) {
Module *M = F->getParent();
@@ -838,18 +971,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
return;
}
- if (IsX86 && (Name.startswith("avx512.mask.storeu."))) {
+ if (IsX86 && (Name.startswith("avx512.mask.store"))) {
+ // "avx512.mask.storeu." or "avx512.mask.store."
+ bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), /*Aligned*/false);
-
- // Remove intrinsic.
- CI->eraseFromParent();
- return;
- }
-
- if (IsX86 && (Name.startswith("avx512.mask.store."))) {
- UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), /*Aligned*/true);
+ CI->getArgOperand(2), Aligned);
// Remove intrinsic.
CI->eraseFromParent();
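
The merged path distinguishes alignment with a single character test:
Name[17] is the first character past the shared prefix "avx512.mask.store"
(the "x86." prefix was already stripped above), so it is 'u' exactly for the
unaligned storeu variants. For example (hypothetical names):

    StringRef A = "avx512.mask.store.d.128";  // A[17] == '.' -> aligned
    StringRef U = "avx512.mask.storeu.d.128"; // U[17] == 'u' -> unaligned
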
@@ -858,15 +984,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Rep;
// Upgrade packed integer vector compare intrinsics to compare instructions.
- if (IsX86 && (Name.startswith("sse2.pcmpeq.") ||
- Name.startswith("avx2.pcmpeq."))) {
- Rep = Builder.CreateICmpEQ(CI->getArgOperand(0), CI->getArgOperand(1),
- "pcmpeq");
- Rep = Builder.CreateSExt(Rep, CI->getType(), "");
- } else if (IsX86 && (Name.startswith("sse2.pcmpgt.") ||
- Name.startswith("avx2.pcmpgt."))) {
- Rep = Builder.CreateICmpSGT(CI->getArgOperand(0), CI->getArgOperand(1),
- "pcmpgt");
+ if (IsX86 && (Name.startswith("sse2.pcmp") ||
+ Name.startswith("avx2.pcmp"))) {
+ // "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt."
+ bool CmpEq = Name[9] == 'e';
+ Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
+ CI->getArgOperand(0), CI->getArgOperand(1));
Rep = Builder.CreateSExt(Rep, CI->getType(), "");
} else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
Type *I32Ty = Type::getInt32Ty(C);
@@ -904,10 +1027,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateInsertElement(CI->getArgOperand(0),
Builder.CreateFDiv(Elt0, Elt1),
ConstantInt::get(I32Ty, 0));
- } else if (IsX86 && Name.startswith("avx512.mask.pcmpeq.")) {
- Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_EQ);
- } else if (IsX86 && Name.startswith("avx512.mask.pcmpgt.")) {
- Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_SGT);
+ } else if (IsX86 && Name.startswith("avx512.mask.pcmp")) {
+ // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
+ bool CmpEq = Name[16] == 'e';
+ Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 0 : 6, true);
+ } else if (IsX86 && Name.startswith("avx512.mask.cmp")) {
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+ Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
+ } else if (IsX86 && Name.startswith("avx512.mask.ucmp")) {
+ unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+ Rep = upgradeMaskedCompare(Builder, *CI, Imm, false);
} else if (IsX86 && (Name == "sse41.pmaxsb" ||
Name == "sse2.pmaxs.w" ||
Name == "sse41.pmaxsd" ||
@@ -1019,15 +1148,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep =
Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1),
Builder.getInt8(Imm)});
- } else if (IsX86 && Name == "xop.vpcmov") {
- Value *Arg0 = CI->getArgOperand(0);
- Value *Arg1 = CI->getArgOperand(1);
+ } else if (IsX86 && Name.startswith("xop.vpcmov")) {
Value *Sel = CI->getArgOperand(2);
- unsigned NumElts = CI->getType()->getVectorNumElements();
- Constant *MinusOne = ConstantVector::getSplat(NumElts, Builder.getInt64(-1));
- Value *NotSel = Builder.CreateXor(Sel, MinusOne);
- Value *Sel0 = Builder.CreateAnd(Arg0, Sel);
- Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);
+ Value *NotSel = Builder.CreateNot(Sel);
+ Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel);
+ Value *Sel1 = Builder.CreateAnd(CI->getArgOperand(1), NotSel);
Rep = Builder.CreateOr(Sel0, Sel1);
} else if (IsX86 && Name == "sse42.crc32.64.8") {
Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
@@ -1461,6 +1586,43 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::ctlz,
+ CI->getType()),
+ { CI->getArgOperand(0), Builder.getInt1(false) });
+ Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
+ } else if (IsX86 && (Name.startswith("avx512.mask.max.p") ||
+ Name.startswith("avx512.mask.min.p"))) {
+ bool IsMin = Name[13] == 'i';
+ VectorType *VecTy = cast<VectorType>(CI->getType());
+ unsigned VecWidth = VecTy->getPrimitiveSizeInBits();
+ unsigned EltWidth = VecTy->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (!IsMin && VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_sse_max_ps;
+ else if (!IsMin && VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_sse2_max_pd;
+ else if (!IsMin && VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_max_ps_256;
+ else if (!IsMin && VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_max_pd_256;
+ else if (IsMin && VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_sse_min_ps;
+ else if (IsMin && VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_sse2_min_pd;
+ else if (IsMin && VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_avx_min_ps_256;
+ else if (IsMin && VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_avx_min_pd_256;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1) });
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.mask.pshuf.b.")) {
VectorType *VecTy = cast<VectorType>(CI->getType());
Intrinsic::ID IID;
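
In the min/max branch above, Name[13] is the character after "avx512.mask.m",
distinguishing "max" ('a') from "min" ('i'); the unmasked SSE/AVX intrinsic is
then chosen purely from vector and element width, and the mask is applied
afterwards with EmitX86Select. One row of that dispatch (hypothetical query):

    // VecWidth == 256, EltWidth == 32, IsMin == false
    //   -> IID = Intrinsic::x86_avx_max_ps_256
    //   -> Rep = select(Mask, max_ps_256(A, B), Passthru)
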
@@ -1501,6 +1663,42 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
{ CI->getArgOperand(0), CI->getArgOperand(1) });
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && Name.startswith("avx512.mask.pack")) {
+ bool IsUnsigned = Name[16] == 'u';
+ bool IsDW = Name[18] == 'd';
+ VectorType *VecTy = cast<VectorType>(CI->getType());
+ Intrinsic::ID IID;
+ if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse2_packsswb_128;
+ else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packsswb;
+ else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packsswb_512;
+ else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse2_packssdw_128;
+ else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packssdw;
+ else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packssdw_512;
+ else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse2_packuswb_128;
+ else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packuswb;
+ else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packuswb_512;
+ else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+ IID = Intrinsic::x86_sse41_packusdw;
+ else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+ IID = Intrinsic::x86_avx2_packusdw;
+ else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+ IID = Intrinsic::x86_avx512_packusdw_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ { CI->getArgOperand(0), CI->getArgOperand(1) });
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
} else if (IsX86 && Name.startswith("avx512.mask.psll")) {
bool IsImmediate = Name[16] == 'i' ||
(Name.size() > 18 && Name[18] == 'i');
@@ -1705,6 +1903,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = UpgradeX86MaskedShift(Builder, *CI, IID);
} else if (IsX86 && Name.startswith("avx512.mask.move.s")) {
Rep = upgradeMaskedMove(Builder, *CI);
+ } else if (IsX86 && Name.startswith("avx512.cvtmask2")) {
+ Rep = UpgradeMaskToInt(Builder, *CI);
} else if (IsX86 && Name.startswith("avx512.mask.vpermilvar.")) {
Intrinsic::ID IID;
if (Name.endswith("ps.128"))
@@ -1727,6 +1927,64 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
{ CI->getArgOperand(0), CI->getArgOperand(1) });
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && Name.endswith(".movntdqa")) {
+ Module *M = F->getParent();
+ MDNode *Node = MDNode::get(
+ C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
+
+ Value *Ptr = CI->getArgOperand(0);
+ VectorType *VTy = cast<VectorType>(CI->getType());
+
+ // Convert the pointer type to a pointer to the loaded vector type.
+ Value *BC =
+ Builder.CreateBitCast(Ptr, PointerType::getUnqual(VTy), "cast");
+ LoadInst *LI = Builder.CreateAlignedLoad(BC, VTy->getBitWidth() / 8);
+ LI->setMetadata(M->getMDKindID("nontemporal"), Node);
+ Rep = LI;
+ } else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
+ Value *Arg = CI->getArgOperand(0);
+ Value *Neg = Builder.CreateNeg(Arg, "neg");
+ Value *Cmp = Builder.CreateICmpSGE(
+ Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
+ } else if (IsNVVM && (Name == "max.i" || Name == "max.ll" ||
+ Name == "max.ui" || Name == "max.ull")) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull")
+ ? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
+ : Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
+ } else if (IsNVVM && (Name == "min.i" || Name == "min.ll" ||
+ Name == "min.ui" || Name == "min.ull")) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ Value *Cmp = Name.endswith(".ui") || Name.endswith(".ull")
+ ? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
+ : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
+ Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
+ } else if (IsNVVM && Name == "clz.ll") {
+ // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
+ Value *Arg = CI->getArgOperand(0);
+ Value *Ctlz = Builder.CreateCall(
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ {Arg->getType()}),
+ {Arg, Builder.getFalse()}, "ctlz");
+ Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
+ } else if (IsNVVM && Name == "popc.ll") {
+ // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an i64.
+ Value *Arg = CI->getArgOperand(0);
+ Value *Popc = Builder.CreateCall(
+ Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
+ {Arg->getType()}),
+ Arg, "ctpop");
+ Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
+ } else if (IsNVVM && Name == "h2f") {
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(
+ F->getParent(), Intrinsic::convert_from_fp16,
+ {Builder.getFloatTy()}),
+ CI->getArgOperand(0), "h2f");
} else {
llvm_unreachable("Unknown function for CallInst upgrade.");
}
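
The NVVM branches above replace intrinsic calls with plain IR idioms: abs
becomes compare-plus-select, max/min become icmp-plus-select (signed or
unsigned by suffix), and clz/popc call the generic ctlz/ctpop and truncate the
i64 result to the old i32 return type. A minimal standalone sketch of the abs
expansion, assuming a builder positioned at the call site (emitNVVMAbs is a
hypothetical helper, not part of the patch):

    // abs(x) == select(x >= 0, x, -x), with no target intrinsic involved.
    static Value *emitNVVMAbs(IRBuilder<> &Builder, Value *X) {
      Value *Neg = Builder.CreateNeg(X, "neg");
      Value *IsNonNeg = Builder.CreateICmpSGE(
          X, Constant::getNullValue(X->getType()), "abs.cond");
      return Builder.CreateSelect(IsNonNeg, X, Neg, "abs");
    }
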
@@ -1737,13 +1995,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
return;
}
- std::string Name = CI->getName();
- if (!Name.empty())
- CI->setName(Name + ".old");
-
+ CallInst *NewCall = nullptr;
switch (NewFn->getIntrinsicID()) {
- default:
- llvm_unreachable("Unknown function for CallInst upgrade.");
+ default: {
+ // Handle a generic mangling change, but nothing else.
+ assert(
+ (CI->getCalledFunction()->getName() != NewFn->getName()) &&
+ "Unknown function for CallInst upgrade and isn't just a name change");
+ CI->setCalledFunction(NewFn);
+ return;
+ }
case Intrinsic::arm_neon_vld1:
case Intrinsic::arm_neon_vld2:
@@ -1761,43 +2022,43 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::arm_neon_vst4lane: {
SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
CI->arg_operands().end());
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
case Intrinsic::bitreverse:
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+ break;
case Intrinsic::ctlz:
case Intrinsic::cttz:
assert(CI->getNumArgOperands() == 1 &&
"Mismatch between function args and call args");
- CI->replaceAllUsesWith(Builder.CreateCall(
- NewFn, {CI->getArgOperand(0), Builder.getFalse()}, Name));
- CI->eraseFromParent();
- return;
-
- case Intrinsic::objectsize:
- CI->replaceAllUsesWith(Builder.CreateCall(
- NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)}, Name));
- CI->eraseFromParent();
- return;
+ NewCall =
+ Builder.CreateCall(NewFn, {CI->getArgOperand(0), Builder.getFalse()});
+ break;
- case Intrinsic::ctpop: {
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
- CI->eraseFromParent();
- return;
+ case Intrinsic::objectsize: {
+ Value *NullIsUnknownSize = CI->getNumArgOperands() == 2
+ ? Builder.getFalse()
+ : CI->getArgOperand(2);
+ NewCall = Builder.CreateCall(
+ NewFn, {CI->getArgOperand(0), CI->getArgOperand(1), NullIsUnknownSize});
+ break;
}
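
Old llvm.objectsize callers pass two arguments; the upgraded intrinsic takes a
third i1 operand saying whether a null pointer should be treated as having
unknown size. Legacy two-argument calls get false, preserving their previous
behavior. Before/after for a hypothetical caller (IR shown in comments):

    // Before: %n = call i64 @llvm.objectsize.i64.p0i8(i8* %p, i1 false)
    // After : %n = call i64 @llvm.objectsize.i64.p0i8(i8* %p, i1 false,
    //                                                 i1 false) ; null unknown
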
+ case Intrinsic::ctpop:
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+ break;
+
+ case Intrinsic::convert_from_fp16:
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+ break;
+
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd:
- CI->replaceAllUsesWith(
- Builder.CreateCall(NewFn, {CI->getArgOperand(1)}, Name));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(1)});
+ break;
case Intrinsic::x86_xop_vpermil2pd:
case Intrinsic::x86_xop_vpermil2ps:
@@ -1808,9 +2069,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
case Intrinsic::x86_sse41_ptestc:
@@ -1832,10 +2092,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *BC0 = Builder.CreateBitCast(Arg0, NewVecTy, "cast");
Value *BC1 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
- CallInst *NewCall = Builder.CreateCall(NewFn, {BC0, BC1}, Name);
- CI->replaceAllUsesWith(NewCall);
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {BC0, BC1});
+ break;
}
case Intrinsic::x86_sse41_insertps:
@@ -1851,30 +2109,36 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Replace the last argument with a trunc.
Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");
-
- CallInst *NewCall = Builder.CreateCall(NewFn, Args);
- CI->replaceAllUsesWith(NewCall);
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
case Intrinsic::thread_pointer: {
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {}));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, {});
+ break;
}
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::masked_load:
- case Intrinsic::masked_store: {
+ case Intrinsic::masked_store:
+ case Intrinsic::masked_gather:
+ case Intrinsic::masked_scatter: {
SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
CI->arg_operands().end());
- CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
- CI->eraseFromParent();
- return;
+ NewCall = Builder.CreateCall(NewFn, Args);
+ break;
}
}
+ assert(NewCall && "Should have either set this variable or returned through "
+ "the default case");
+ std::string Name = CI->getName();
+ if (!Name.empty()) {
+ CI->setName(Name + ".old");
+ NewCall->setName(Name);
+ }
+ CI->replaceAllUsesWith(NewCall);
+ CI->eraseFromParent();
}
void llvm::UpgradeCallsToIntrinsic(Function *F) {
@@ -1975,14 +2239,14 @@ bool llvm::UpgradeDebugInfo(Module &M) {
}
bool llvm::UpgradeModuleFlags(Module &M) {
- const NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
+ NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
if (!ModFlags)
return false;
- bool HasObjCFlag = false, HasClassProperties = false;
+ bool HasObjCFlag = false, HasClassProperties = false, Changed = false;
for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
MDNode *Op = ModFlags->getOperand(I);
- if (Op->getNumOperands() < 2)
+ if (Op->getNumOperands() != 3)
continue;
MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
if (!ID)
@@ -1991,7 +2255,24 @@ bool llvm::UpgradeModuleFlags(Module &M) {
HasObjCFlag = true;
if (ID->getString() == "Objective-C Class Properties")
HasClassProperties = true;
+ // Upgrade PIC/PIE module flags. The module flag behavior for these two
+ // flags was Error and is now Max.
+ if (ID->getString() == "PIC Level" || ID->getString() == "PIE Level") {
+ if (auto *Behavior =
+ mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(0))) {
+ if (Behavior->getLimitedValue() == Module::Error) {
+ Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ Metadata *Ops[3] = {
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Module::Max)),
+ MDString::get(M.getContext(), ID->getString()),
+ Op->getOperand(2)};
+ ModFlags->setOperand(I, MDNode::get(M.getContext(), Ops));
+ Changed = true;
+ }
+ }
+ }
}
+
// "Objective-C Class Properties" is recently added for Objective-C. We
// upgrade ObjC bitcodes to contain a "Objective-C Class Properties" module
// flag of value 0, so we can correclty downgrade this flag when trying to
@@ -2000,9 +2281,10 @@ bool llvm::UpgradeModuleFlags(Module &M) {
if (HasObjCFlag && !HasClassProperties) {
M.addModuleFlag(llvm::Module::Override, "Objective-C Class Properties",
(uint32_t)0);
- return true;
+ Changed = true;
}
- return false;
+
+ return Changed;
}
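
A module flag is the triple (behavior, key, value); the PIC/PIE rewrite above
only fires when the stored behavior is Module::Error and swaps it for
Module::Max, leaving the key and value operands untouched. Illustration (IR in
comments, hypothetical level):

    // Before: !0 = !{i32 1, !"PIC Level", i32 2}  ; 1 == Module::Error
    // After : !0 = !{i32 7, !"PIC Level", i32 2}  ; 7 == Module::Max
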
static bool isOldLoopArgument(Metadata *MD) {
diff --git a/contrib/llvm/lib/IR/BasicBlock.cpp b/contrib/llvm/lib/IR/BasicBlock.cpp
index 19e7849..2b780ad 100644
--- a/contrib/llvm/lib/IR/BasicBlock.cpp
+++ b/contrib/llvm/lib/IR/BasicBlock.cpp
@@ -117,28 +117,19 @@ const Module *BasicBlock::getModule() const {
return getParent()->getParent();
}
-Module *BasicBlock::getModule() {
- return getParent()->getParent();
-}
-
-TerminatorInst *BasicBlock::getTerminator() {
- if (InstList.empty()) return nullptr;
- return dyn_cast<TerminatorInst>(&InstList.back());
-}
-
const TerminatorInst *BasicBlock::getTerminator() const {
if (InstList.empty()) return nullptr;
return dyn_cast<TerminatorInst>(&InstList.back());
}
-CallInst *BasicBlock::getTerminatingMustTailCall() {
+const CallInst *BasicBlock::getTerminatingMustTailCall() const {
if (InstList.empty())
return nullptr;
- ReturnInst *RI = dyn_cast<ReturnInst>(&InstList.back());
+ const ReturnInst *RI = dyn_cast<ReturnInst>(&InstList.back());
if (!RI || RI == &InstList.front())
return nullptr;
- Instruction *Prev = RI->getPrevNode();
+ const Instruction *Prev = RI->getPrevNode();
if (!Prev)
return nullptr;
@@ -162,7 +153,7 @@ CallInst *BasicBlock::getTerminatingMustTailCall() {
return nullptr;
}
-CallInst *BasicBlock::getTerminatingDeoptimizeCall() {
+const CallInst *BasicBlock::getTerminatingDeoptimizeCall() const {
if (InstList.empty())
return nullptr;
auto *RI = dyn_cast<ReturnInst>(&InstList.back());
@@ -177,22 +168,22 @@ CallInst *BasicBlock::getTerminatingDeoptimizeCall() {
return nullptr;
}
-Instruction* BasicBlock::getFirstNonPHI() {
- for (Instruction &I : *this)
+const Instruction* BasicBlock::getFirstNonPHI() const {
+ for (const Instruction &I : *this)
if (!isa<PHINode>(I))
return &I;
return nullptr;
}
-Instruction* BasicBlock::getFirstNonPHIOrDbg() {
- for (Instruction &I : *this)
+const Instruction* BasicBlock::getFirstNonPHIOrDbg() const {
+ for (const Instruction &I : *this)
if (!isa<PHINode>(I) && !isa<DbgInfoIntrinsic>(I))
return &I;
return nullptr;
}
-Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
- for (Instruction &I : *this) {
+const Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() const {
+ for (const Instruction &I : *this) {
if (isa<PHINode>(I) || isa<DbgInfoIntrinsic>(I))
continue;
@@ -206,12 +197,12 @@ Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
return nullptr;
}
-BasicBlock::iterator BasicBlock::getFirstInsertionPt() {
- Instruction *FirstNonPHI = getFirstNonPHI();
+BasicBlock::const_iterator BasicBlock::getFirstInsertionPt() const {
+ const Instruction *FirstNonPHI = getFirstNonPHI();
if (!FirstNonPHI)
return end();
- iterator InsertPt = FirstNonPHI->getIterator();
+ const_iterator InsertPt = FirstNonPHI->getIterator();
if (InsertPt->isEHPad()) ++InsertPt;
return InsertPt;
}
@@ -223,10 +214,10 @@ void BasicBlock::dropAllReferences() {
/// If this basic block has a single predecessor block,
/// return the block, otherwise return a null pointer.
-BasicBlock *BasicBlock::getSinglePredecessor() {
- pred_iterator PI = pred_begin(this), E = pred_end(this);
+const BasicBlock *BasicBlock::getSinglePredecessor() const {
+ const_pred_iterator PI = pred_begin(this), E = pred_end(this);
if (PI == E) return nullptr; // No preds.
- BasicBlock *ThePred = *PI;
+ const BasicBlock *ThePred = *PI;
++PI;
return (PI == E) ? ThePred : nullptr /*multiple preds*/;
}
@@ -236,10 +227,10 @@ BasicBlock *BasicBlock::getSinglePredecessor() {
/// Note that unique predecessor doesn't mean single edge, there can be
/// multiple edges from the unique predecessor to this block (for example
/// a switch statement with multiple cases having the same destination).
-BasicBlock *BasicBlock::getUniquePredecessor() {
- pred_iterator PI = pred_begin(this), E = pred_end(this);
+const BasicBlock *BasicBlock::getUniquePredecessor() const {
+ const_pred_iterator PI = pred_begin(this), E = pred_end(this);
if (PI == E) return nullptr; // No preds.
- BasicBlock *PredBB = *PI;
+ const BasicBlock *PredBB = *PI;
++PI;
for (;PI != E; ++PI) {
if (*PI != PredBB)
@@ -250,18 +241,18 @@ BasicBlock *BasicBlock::getUniquePredecessor() {
return PredBB;
}
-BasicBlock *BasicBlock::getSingleSuccessor() {
- succ_iterator SI = succ_begin(this), E = succ_end(this);
+const BasicBlock *BasicBlock::getSingleSuccessor() const {
+ succ_const_iterator SI = succ_begin(this), E = succ_end(this);
if (SI == E) return nullptr; // no successors
- BasicBlock *TheSucc = *SI;
+ const BasicBlock *TheSucc = *SI;
++SI;
return (SI == E) ? TheSucc : nullptr /* multiple successors */;
}
-BasicBlock *BasicBlock::getUniqueSuccessor() {
- succ_iterator SI = succ_begin(this), E = succ_end(this);
+const BasicBlock *BasicBlock::getUniqueSuccessor() const {
+ succ_const_iterator SI = succ_begin(this), E = succ_end(this);
if (SI == E) return nullptr; // No successors
- BasicBlock *SuccBB = *SI;
+ const BasicBlock *SuccBB = *SI;
++SI;
for (;SI != E; ++SI) {
if (*SI != SuccBB)
@@ -272,6 +263,10 @@ BasicBlock *BasicBlock::getUniqueSuccessor() {
return SuccBB;
}
+iterator_range<BasicBlock::phi_iterator> BasicBlock::phis() {
+ return make_range<phi_iterator>(dyn_cast<PHINode>(&front()), nullptr);
+}
+
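
phis() yields a half-open range over the block's leading PHI nodes: it starts
at front() if that is a PHI (otherwise the range is empty) and stops at the
first non-PHI. Typical use (sketch):

    // Visit only the PHI prefix of a block, without manual dyn_cast loops.
    for (PHINode &PN : BB->phis())
      errs() << PN.getName() << " has " << PN.getNumIncomingValues()
             << " incoming values\n";
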
/// This method is used to notify a BasicBlock that the
/// specified Predecessor of the block is no longer able to reach it. This is
/// actually not used to update the Predecessor list, but is actually used to
@@ -360,6 +355,19 @@ bool BasicBlock::canSplitPredecessors() const {
return true;
}
+bool BasicBlock::isLegalToHoistInto() const {
+ auto *Term = getTerminator();
+ // No terminator means the block is under construction.
+ if (!Term)
+ return true;
+
+ // If the block has no successors, there can be no instructions to hoist.
+ assert(Term->getNumSuccessors() > 0);
+
+ // Instructions should not be hoisted across exception handling boundaries.
+ return !Term->isExceptional();
+}
+
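
A sketch of how a caller might consult the new predicate before hoisting
(hypothetical pass code, not part of the patch):

    // Refuse to hoist across EH boundaries; a missing terminator means the
    // block is still under construction, which isLegalToHoistInto() allows.
    if (BB->isLegalToHoistInto() && BB->getTerminator())
      I->moveBefore(BB->getTerminator());
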
/// This splits a basic block into two at the specified
/// instruction. Note that all instructions BEFORE the specified iterator stay
/// as part of the original basic block, an unconditional branch is added to
@@ -398,13 +406,11 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) {
// Loop over any phi nodes in the basic block, updating the BB field of
// incoming values...
BasicBlock *Successor = *I;
- PHINode *PN;
- for (BasicBlock::iterator II = Successor->begin();
- (PN = dyn_cast<PHINode>(II)); ++II) {
- int IDX = PN->getBasicBlockIndex(this);
- while (IDX != -1) {
- PN->setIncomingBlock((unsigned)IDX, New);
- IDX = PN->getBasicBlockIndex(this);
+ for (auto &PN : Successor->phis()) {
+ int Idx = PN.getBasicBlockIndex(this);
+ while (Idx != -1) {
+ PN.setIncomingBlock((unsigned)Idx, New);
+ Idx = PN.getBasicBlockIndex(this);
}
}
}
@@ -438,9 +444,6 @@ bool BasicBlock::isLandingPad() const {
}
/// Return the landingpad instruction associated with the landing pad.
-LandingPadInst *BasicBlock::getLandingPadInst() {
- return dyn_cast<LandingPadInst>(getFirstNonPHI());
-}
const LandingPadInst *BasicBlock::getLandingPadInst() const {
return dyn_cast<LandingPadInst>(getFirstNonPHI());
}
diff --git a/contrib/llvm/lib/IR/Comdat.cpp b/contrib/llvm/lib/IR/Comdat.cpp
index fc1b48d..c735f9b 100644
--- a/contrib/llvm/lib/IR/Comdat.cpp
+++ b/contrib/llvm/lib/IR/Comdat.cpp
@@ -1,4 +1,4 @@
-//===-- Comdat.cpp - Implement Metadata classes --------------------------===//
+//===- Comdat.cpp - Implement Comdat class --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,10 +13,12 @@
#include "llvm/IR/Comdat.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+
using namespace llvm;
Comdat::Comdat(Comdat &&C) : Name(C.Name), SK(C.SK) {}
-Comdat::Comdat() : Name(nullptr), SK(Comdat::Any) {}
+Comdat::Comdat() = default;
StringRef Comdat::getName() const { return Name->first(); }
diff --git a/contrib/llvm/lib/IR/ConstantFold.cpp b/contrib/llvm/lib/IR/ConstantFold.cpp
index 098ff90..311b0a7 100644
--- a/contrib/llvm/lib/IR/ConstantFold.cpp
+++ b/contrib/llvm/lib/IR/ConstantFold.cpp
@@ -18,6 +18,7 @@
//===----------------------------------------------------------------------===//
#include "ConstantFold.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -222,7 +223,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
APInt V = CI->getValue();
if (ByteStart)
- V = V.lshr(ByteStart*8);
+ V.lshrInPlace(ByteStart*8);
V = V.trunc(ByteSize*8);
return ConstantInt::get(CI->getContext(), V);
}
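
lshrInPlace shifts the APInt's own words instead of building and assigning a
shifted temporary, which avoids an allocation for wide integers. The two forms
are equivalent in value (sketch):

    APInt V(128, 0x1234);
    V = V.lshr(8);    // computes a shifted copy, then assigns it
    V.lshrInPlace(8); // shifts the existing storage, no temporary
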
@@ -241,7 +242,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
// X | -1 -> -1.
if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS))
- if (RHSC->isAllOnesValue())
+ if (RHSC->isMinusOne())
return RHSC;
Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize);
@@ -347,8 +348,7 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
/// factors factored out. If Folded is false, return null if no factoring was
/// possible, to avoid endlessly bouncing an unfoldable expression back into the
/// top-level folder.
-static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy,
- bool Folded) {
+static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded) {
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Constant *N = ConstantInt::get(DestTy, ATy->getNumElements());
Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true);
@@ -403,8 +403,7 @@ static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy,
/// factors factored out. If Folded is false, return null if no factoring was
/// possible, to avoid endlessly bouncing an unfoldable expression back into the
/// top-level folder.
-static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy,
- bool Folded) {
+static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy, bool Folded) {
// The alignment of an array is equal to the alignment of the
// array element. Note that this is not always true for vectors.
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
@@ -468,8 +467,7 @@ static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy,
/// any known factors factored out. If Folded is false, return null if no
/// factoring was possible, to avoid endlessly bouncing an unfoldable expression
/// back into the top-level folder.
-static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo,
- Type *DestTy,
+static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo, Type *DestTy,
bool Folded) {
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo, false,
@@ -606,17 +604,15 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
const APFloat &V = FPC->getValueAPF();
bool ignored;
- uint64_t x[2];
uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+ APSInt IntVal(DestBitWidth, opc == Instruction::FPToUI);
if (APFloat::opInvalidOp ==
- V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
- APFloat::rmTowardZero, &ignored)) {
+ V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored)) {
// Undefined behavior invoked - the destination type can't represent
// the input constant.
return UndefValue::get(DestTy);
}
- APInt Val(DestBitWidth, x);
- return ConstantInt::get(FPC->getContext(), Val);
+ return ConstantInt::get(FPC->getContext(), IntVal);
}
return nullptr; // Can't fold.
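
The APSInt overload of convertToInteger carries both the destination width and
the signedness in one object, replacing the old raw uint64_t buffer plus a
separate signed flag. Call shape (sketch, assuming an APFloat F and a 32-bit
FPToUI fold):

    bool Ignored;
    APSInt IntVal(/*BitWidth=*/32, /*isUnsigned=*/true);
    APFloat::opStatus S =
        F.convertToInteger(IntVal, APFloat::rmTowardZero, &Ignored);
    // opInvalidOp means the value is not representable; the folder above
    // returns undef in that case.
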
case Instruction::IntToPtr: //always treated as unsigned
@@ -1019,33 +1015,33 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
switch (Opcode) {
case Instruction::Add:
- if (CI2->equalsInt(0)) return C1; // X + 0 == X
+ if (CI2->isZero()) return C1; // X + 0 == X
break;
case Instruction::Sub:
- if (CI2->equalsInt(0)) return C1; // X - 0 == X
+ if (CI2->isZero()) return C1; // X - 0 == X
break;
case Instruction::Mul:
- if (CI2->equalsInt(0)) return C2; // X * 0 == 0
- if (CI2->equalsInt(1))
+ if (CI2->isZero()) return C2; // X * 0 == 0
+ if (CI2->isOne())
return C1; // X * 1 == X
break;
case Instruction::UDiv:
case Instruction::SDiv:
- if (CI2->equalsInt(1))
+ if (CI2->isOne())
return C1; // X / 1 == X
- if (CI2->equalsInt(0))
+ if (CI2->isZero())
return UndefValue::get(CI2->getType()); // X / 0 == undef
break;
case Instruction::URem:
case Instruction::SRem:
- if (CI2->equalsInt(1))
+ if (CI2->isOne())
return Constant::getNullValue(CI2->getType()); // X % 1 == 0
- if (CI2->equalsInt(0))
+ if (CI2->isZero())
return UndefValue::get(CI2->getType()); // X % 0 == undef
break;
case Instruction::And:
if (CI2->isZero()) return C2; // X & 0 == 0
- if (CI2->isAllOnesValue())
+ if (CI2->isMinusOne())
return C1; // X & -1 == X
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
@@ -1082,12 +1078,12 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
}
break;
case Instruction::Or:
- if (CI2->equalsInt(0)) return C1; // X | 0 == X
- if (CI2->isAllOnesValue())
+ if (CI2->isZero()) return C1; // X | 0 == X
+ if (CI2->isMinusOne())
return C2; // X | -1 == -1
break;
case Instruction::Xor:
- if (CI2->equalsInt(0)) return C1; // X ^ 0 == X
+ if (CI2->isZero()) return C1; // X ^ 0 == X
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
switch (CE1->getOpcode()) {
@@ -1095,7 +1091,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
case Instruction::ICmp:
case Instruction::FCmp:
// cmp pred ^ true -> cmp !pred
- assert(CI2->equalsInt(1));
+ assert(CI2->isOne());
CmpInst::Predicate pred = (CmpInst::Predicate)CE1->getPredicate();
pred = CmpInst::getInversePredicate(pred);
return ConstantExpr::getCompare(pred, CE1->getOperand(0),
@@ -1130,18 +1126,18 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
case Instruction::Mul:
return ConstantInt::get(CI1->getContext(), C1V * C2V);
case Instruction::UDiv:
- assert(!CI2->isNullValue() && "Div by zero handled above");
+ assert(!CI2->isZero() && "Div by zero handled above");
return ConstantInt::get(CI1->getContext(), C1V.udiv(C2V));
case Instruction::SDiv:
- assert(!CI2->isNullValue() && "Div by zero handled above");
+ assert(!CI2->isZero() && "Div by zero handled above");
if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
return UndefValue::get(CI1->getType()); // MIN_INT / -1 -> undef
return ConstantInt::get(CI1->getContext(), C1V.sdiv(C2V));
case Instruction::URem:
- assert(!CI2->isNullValue() && "Div by zero handled above");
+ assert(!CI2->isZero() && "Div by zero handled above");
return ConstantInt::get(CI1->getContext(), C1V.urem(C2V));
case Instruction::SRem:
- assert(!CI2->isNullValue() && "Div by zero handled above");
+ assert(!CI2->isZero() && "Div by zero handled above");
if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
return UndefValue::get(CI1->getType()); // MIN_INT % -1 -> undef
return ConstantInt::get(CI1->getContext(), C1V.srem(C2V));
@@ -1174,7 +1170,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
case Instruction::LShr:
case Instruction::AShr:
case Instruction::Shl:
- if (CI1->equalsInt(0)) return C1;
+ if (CI1->isZero()) return C1;
break;
default:
break;
@@ -1209,10 +1205,15 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
SmallVector<Constant*, 16> Result;
Type *Ty = IntegerType::get(VTy->getContext(), 32);
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
- Constant *LHS =
- ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
- Constant *RHS =
- ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
+ Constant *ExtractIdx = ConstantInt::get(Ty, i);
+ Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx);
+ Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx);
+
+ // If any element of a divisor vector is zero, the whole op is undef.
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
+ Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
+ RHS->isNullValue())
+ return UndefValue::get(VTy);
Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
}
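
With the added check, an elementwise constant division whose divisor vector
has any zero lane folds the entire result to undef, mirroring the scalar
X / 0 == undef rules earlier in this function. For example (IR in comments):

    // udiv <2 x i32> <i32 8, i32 9>, <i32 2, i32 0>
    //   -> <2 x i32> undef          ; second divisor lane is zero
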
@@ -2037,9 +2038,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
Optional<unsigned> InRangeIndex,
ArrayRef<Value *> Idxs) {
if (Idxs.empty()) return C;
- Constant *Idx0 = cast<Constant>(Idxs[0]);
- if ((Idxs.size() == 1 && Idx0->isNullValue()))
- return C;
if (isa<UndefValue>(C)) {
Type *GEPTy = GetElementPtrInst::getGEPReturnType(
@@ -2047,10 +2045,15 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
return UndefValue::get(GEPTy);
}
+ Constant *Idx0 = cast<Constant>(Idxs[0]);
+ if (Idxs.size() == 1 && (Idx0->isNullValue() || isa<UndefValue>(Idx0)))
+ return C;
+
if (C->isNullValue()) {
bool isNull = true;
for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
- if (!cast<Constant>(Idxs[i])->isNullValue()) {
+ if (!isa<UndefValue>(Idxs[i]) &&
+ !cast<Constant>(Idxs[i])->isNullValue()) {
isNull = false;
break;
}
@@ -2094,15 +2097,19 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
// Subsequent evaluation would get confused and produce erroneous results.
//
// The following prohibits such a GEP from being formed by checking to see
- // if the index is in-range with respect to an array or vector.
+ // if the index is in-range with respect to an array.
+ // TODO: This code may be extended to handle vectors as well.
bool PerformFold = false;
if (Idx0->isNullValue())
PerformFold = true;
else if (LastI.isSequential())
if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx0))
- PerformFold =
- !LastI.isBoundedSequential() ||
- isIndexInRangeOfArrayType(LastI.getSequentialNumElements(), CI);
+ PerformFold = (!LastI.isBoundedSequential() ||
+ isIndexInRangeOfArrayType(
+ LastI.getSequentialNumElements(), CI)) &&
+ !CE->getOperand(CE->getNumOperands() - 1)
+ ->getType()
+ ->isVectorTy();
if (PerformFold) {
SmallVector<Value*, 16> NewIndices;
@@ -2231,7 +2238,8 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
- Constant *PrevIdx = cast<Constant>(Idxs[i - 1]);
Constant *PrevIdx =
NewIdxs[i - 1] ? NewIdxs[i - 1] : cast<Constant>(Idxs[i - 1]);
Constant *Div = ConstantExpr::getSDiv(CI, Factor);
unsigned CommonExtendedWidth =
diff --git a/contrib/llvm/lib/IR/ConstantRange.cpp b/contrib/llvm/lib/IR/ConstantRange.cpp
index a85ad46..4bd1725 100644
--- a/contrib/llvm/lib/IR/ConstantRange.cpp
+++ b/contrib/llvm/lib/IR/ConstantRange.cpp
@@ -1,4 +1,4 @@
-//===-- ConstantRange.cpp - ConstantRange implementation ------------------===//
+//===- ConstantRange.cpp - ConstantRange implementation -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,29 +21,31 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Instruction.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
-#include "llvm/IR/ConstantRange.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
-/// Initialize a full (the default) or empty set for the specified type.
-///
-ConstantRange::ConstantRange(uint32_t BitWidth, bool Full) {
- if (Full)
- Lower = Upper = APInt::getMaxValue(BitWidth);
- else
- Lower = Upper = APInt::getMinValue(BitWidth);
-}
+ConstantRange::ConstantRange(uint32_t BitWidth, bool Full)
+ : Lower(Full ? APInt::getMaxValue(BitWidth) : APInt::getMinValue(BitWidth)),
+ Upper(Lower) {}
-/// Initialize a range to hold the single specified value.
-///
-ConstantRange::ConstantRange(APIntMoveTy V)
+ConstantRange::ConstantRange(APInt V)
: Lower(std::move(V)), Upper(Lower + 1) {}
-ConstantRange::ConstantRange(APIntMoveTy L, APIntMoveTy U)
+ConstantRange::ConstantRange(APInt L, APInt U)
: Lower(std::move(L)), Upper(std::move(U)) {
assert(Lower.getBitWidth() == Upper.getBitWidth() &&
"ConstantRange with unequal bit widths");
@@ -70,49 +72,49 @@ ConstantRange ConstantRange::makeAllowedICmpRegion(CmpInst::Predicate Pred,
APInt UMax(CR.getUnsignedMax());
if (UMax.isMinValue())
return ConstantRange(W, /* empty */ false);
- return ConstantRange(APInt::getMinValue(W), UMax);
+ return ConstantRange(APInt::getMinValue(W), std::move(UMax));
}
case CmpInst::ICMP_SLT: {
APInt SMax(CR.getSignedMax());
if (SMax.isMinSignedValue())
return ConstantRange(W, /* empty */ false);
- return ConstantRange(APInt::getSignedMinValue(W), SMax);
+ return ConstantRange(APInt::getSignedMinValue(W), std::move(SMax));
}
case CmpInst::ICMP_ULE: {
APInt UMax(CR.getUnsignedMax());
if (UMax.isMaxValue())
return ConstantRange(W);
- return ConstantRange(APInt::getMinValue(W), UMax + 1);
+ return ConstantRange(APInt::getMinValue(W), std::move(UMax) + 1);
}
case CmpInst::ICMP_SLE: {
APInt SMax(CR.getSignedMax());
if (SMax.isMaxSignedValue())
return ConstantRange(W);
- return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
+ return ConstantRange(APInt::getSignedMinValue(W), std::move(SMax) + 1);
}
case CmpInst::ICMP_UGT: {
APInt UMin(CR.getUnsignedMin());
if (UMin.isMaxValue())
return ConstantRange(W, /* empty */ false);
- return ConstantRange(UMin + 1, APInt::getNullValue(W));
+ return ConstantRange(std::move(UMin) + 1, APInt::getNullValue(W));
}
case CmpInst::ICMP_SGT: {
APInt SMin(CR.getSignedMin());
if (SMin.isMaxSignedValue())
return ConstantRange(W, /* empty */ false);
- return ConstantRange(SMin + 1, APInt::getSignedMinValue(W));
+ return ConstantRange(std::move(SMin) + 1, APInt::getSignedMinValue(W));
}
case CmpInst::ICMP_UGE: {
APInt UMin(CR.getUnsignedMin());
if (UMin.isMinValue())
return ConstantRange(W);
- return ConstantRange(UMin, APInt::getNullValue(W));
+ return ConstantRange(std::move(UMin), APInt::getNullValue(W));
}
case CmpInst::ICMP_SGE: {
APInt SMin(CR.getSignedMin());
if (SMin.isMinSignedValue())
return ConstantRange(W);
- return ConstantRange(SMin, APInt::getSignedMinValue(W));
+ return ConstantRange(std::move(SMin), APInt::getSignedMinValue(W));
}
}
}
@@ -177,7 +179,7 @@ ConstantRange
ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
const ConstantRange &Other,
unsigned NoWrapKind) {
- typedef OverflowingBinaryOperator OBO;
+ using OBO = OverflowingBinaryOperator;
// Computes the intersection of CR0 and CR1. It is different from
// intersectWith in that the ConstantRange returned will only contain elements
@@ -202,7 +204,7 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
return ConstantRange(BitWidth, false);
if (auto *C = Other.getSingleElement())
- if (C->isMinValue())
+ if (C->isNullValue())
// Full set: nothing signed / unsigned wraps when added to 0.
return ConstantRange(BitWidth);
@@ -214,8 +216,8 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
-Other.getUnsignedMax()));
if (NoWrapKind & OBO::NoSignedWrap) {
- APInt SignedMin = Other.getSignedMin();
- APInt SignedMax = Other.getSignedMax();
+ const APInt &SignedMin = Other.getSignedMin();
+ const APInt &SignedMax = Other.getSignedMax();
if (SignedMax.isStrictlyPositive())
Result = SubsetIntersect(
@@ -232,98 +234,76 @@ ConstantRange::makeGuaranteedNoWrapRegion(Instruction::BinaryOps BinOp,
return Result;
}
-/// isFullSet - Return true if this set contains all of the elements possible
-/// for this data-type
bool ConstantRange::isFullSet() const {
return Lower == Upper && Lower.isMaxValue();
}
-/// isEmptySet - Return true if this set contains no members.
-///
bool ConstantRange::isEmptySet() const {
return Lower == Upper && Lower.isMinValue();
}
-/// isWrappedSet - Return true if this set wraps around the top of the range,
-/// for example: [100, 8)
-///
bool ConstantRange::isWrappedSet() const {
return Lower.ugt(Upper);
}
-/// isSignWrappedSet - Return true if this set wraps around the INT_MIN of
-/// its bitwidth, for example: i8 [120, 140).
-///
bool ConstantRange::isSignWrappedSet() const {
return contains(APInt::getSignedMaxValue(getBitWidth())) &&
contains(APInt::getSignedMinValue(getBitWidth()));
}
-/// getSetSize - Return the number of elements in this set.
-///
APInt ConstantRange::getSetSize() const {
- if (isFullSet()) {
- APInt Size(getBitWidth()+1, 0);
- Size.setBit(getBitWidth());
- return Size;
- }
+ if (isFullSet())
+ return APInt::getOneBitSet(getBitWidth()+1, getBitWidth());
// This is also correct for wrapped sets.
return (Upper - Lower).zext(getBitWidth()+1);
}
-/// getUnsignedMax - Return the largest unsigned value contained in the
-/// ConstantRange.
-///
+bool
+ConstantRange::isSizeStrictlySmallerThan(const ConstantRange &Other) const {
+ assert(getBitWidth() == Other.getBitWidth());
+ if (isFullSet())
+ return false;
+ if (Other.isFullSet())
+ return true;
+ return (Upper - Lower).ult(Other.Upper - Other.Lower);
+}
+
+bool
+ConstantRange::isSizeLargerThan(uint64_t MaxSize) const {
+ assert(MaxSize && "MaxSize can't be 0.");
+ // If this is a full set, we need special handling to avoid needing an extra
+ // to represent the size.
+ if (isFullSet())
+ return APInt::getMaxValue(getBitWidth()).ugt(MaxSize - 1);
+
+ return (Upper - Lower).ugt(MaxSize);
+}
+
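
The new size predicates compare Upper - Lower at the native bit width, so
callers such as intersectWith below no longer materialize the (BitWidth+1)-wide
value from getSetSize just to break ties. Usage sketch (hypothetical ranges):

    ConstantRange A(APInt(8, 10), APInt(8, 20)); // [10, 20): 10 elements
    ConstantRange B(APInt(8, 0), APInt(8, 5));   // [0, 5):    5 elements
    assert(B.isSizeStrictlySmallerThan(A));
    assert(!A.isSizeLargerThan(10));             // exactly 10, not more
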
APInt ConstantRange::getUnsignedMax() const {
if (isFullSet() || isWrappedSet())
return APInt::getMaxValue(getBitWidth());
return getUpper() - 1;
}
-/// getUnsignedMin - Return the smallest unsigned value contained in the
-/// ConstantRange.
-///
APInt ConstantRange::getUnsignedMin() const {
- if (isFullSet() || (isWrappedSet() && getUpper() != 0))
+ if (isFullSet() || (isWrappedSet() && !getUpper().isNullValue()))
return APInt::getMinValue(getBitWidth());
return getLower();
}
-/// getSignedMax - Return the largest signed value contained in the
-/// ConstantRange.
-///
APInt ConstantRange::getSignedMax() const {
- APInt SignedMax(APInt::getSignedMaxValue(getBitWidth()));
- if (!isWrappedSet()) {
- if (getLower().sle(getUpper() - 1))
- return getUpper() - 1;
- return SignedMax;
- }
- if (getLower().isNegative() == getUpper().isNegative())
- return SignedMax;
+ if (isFullSet() || Lower.sgt(Upper))
+ return APInt::getSignedMaxValue(getBitWidth());
return getUpper() - 1;
}
-/// getSignedMin - Return the smallest signed value contained in the
-/// ConstantRange.
-///
APInt ConstantRange::getSignedMin() const {
- APInt SignedMin(APInt::getSignedMinValue(getBitWidth()));
- if (!isWrappedSet()) {
- if (getLower().sle(getUpper() - 1))
- return getLower();
- return SignedMin;
- }
- if ((getUpper() - 1).slt(getLower())) {
- if (getUpper() != SignedMin)
- return SignedMin;
- }
+ if (isFullSet() || (Lower.sgt(Upper) && !getUpper().isMinSignedValue()))
+ return APInt::getSignedMinValue(getBitWidth());
return getLower();
}
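Both signed accessors now key off a single signed comparison of the endpoints. For instance, i8 [120, 140) stores Upper as -116, so Lower >s Upper holds and the range is known to cross the signed boundary, containing both 127 and -128. A small check of that case (a sketch, not from the patch):

    #include "llvm/IR/ConstantRange.h"
    #include <cassert>
    using namespace llvm;

    void signedBoundsExample() {
      // i8 [120, 140): {120..127, -128..-117} when read as signed.
      ConstantRange CR(APInt(8, 120), APInt(8, 140));
      assert(CR.getSignedMax().getSExtValue() == 127);  // crosses INT8_MAX
      assert(CR.getSignedMin().getSExtValue() == -128); // and INT8_MIN
    }
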
-/// contains - Return true if the specified value is in the set.
-///
bool ConstantRange::contains(const APInt &V) const {
if (Lower == Upper)
return isFullSet();
@@ -333,10 +313,6 @@ bool ConstantRange::contains(const APInt &V) const {
return Lower.ule(V) || V.ult(Upper);
}
-/// contains - Return true if the argument is a subset of this range.
-/// Two equal sets contain each other. The empty set contained by all other
-/// sets.
-///
bool ConstantRange::contains(const ConstantRange &Other) const {
if (isFullSet() || Other.isEmptySet()) return true;
if (isEmptySet() || Other.isFullSet()) return false;
@@ -355,8 +331,6 @@ bool ConstantRange::contains(const ConstantRange &Other) const {
return Other.getUpper().ule(Upper) && Lower.ule(Other.getLower());
}
-/// subtract - Subtract the specified constant from the endpoints of this
-/// constant range.
ConstantRange ConstantRange::subtract(const APInt &Val) const {
assert(Val.getBitWidth() == getBitWidth() && "Wrong bit width");
// If the set is empty or full, don't modify the endpoints.
@@ -365,17 +339,10 @@ ConstantRange ConstantRange::subtract(const APInt &Val) const {
return ConstantRange(Lower - Val, Upper - Val);
}
-/// \brief Subtract the specified range from this range (aka relative complement
-/// of the sets).
ConstantRange ConstantRange::difference(const ConstantRange &CR) const {
return intersectWith(CR.inverse());
}
-/// intersectWith - Return the range that results from the intersection of this
-/// range with another range. The resultant range is guaranteed to include all
-/// elements contained in both input ranges, and to have the smallest possible
-/// set size that does so. Because there may be two intersections with the
-/// same set size, A.intersectWith(B) might not be equal to B.intersectWith(A).
ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
assert(getBitWidth() == CR.getBitWidth() &&
"ConstantRange types don't agree!");
@@ -414,7 +381,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
if (CR.Upper.ule(Lower))
return ConstantRange(CR.Lower, Upper);
- if (getSetSize().ult(CR.getSetSize()))
+ if (isSizeStrictlySmallerThan(CR))
return *this;
return CR;
}
@@ -429,7 +396,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
if (CR.Upper.ult(Upper)) {
if (CR.Lower.ult(Upper)) {
- if (getSetSize().ult(CR.getSetSize()))
+ if (isSizeStrictlySmallerThan(CR))
return *this;
return CR;
}
@@ -445,18 +412,11 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
return ConstantRange(CR.Lower, Upper);
}
- if (getSetSize().ult(CR.getSetSize()))
+ if (isSizeStrictlySmallerThan(CR))
return *this;
return CR;
}
-
-/// unionWith - Return the range that results from the union of this range with
-/// another range. The resultant range is guaranteed to include the elements of
-/// both sets, but may contain more. For example, [3, 9) union [12,15) is
-/// [3, 15), which includes 9, 10, and 11, which were not included in either
-/// set before.
-///
ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
assert(getBitWidth() == CR.getBitWidth() &&
"ConstantRange types don't agree!");
@@ -475,16 +435,13 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
return ConstantRange(CR.Lower, Upper);
}
- APInt L = Lower, U = Upper;
- if (CR.Lower.ult(L))
- L = CR.Lower;
- if ((CR.Upper - 1).ugt(U - 1))
- U = CR.Upper;
+ APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower;
+ APInt U = (CR.Upper - 1).ugt(Upper - 1) ? CR.Upper : Upper;
- if (L == 0 && U == 0)
+ if (L.isNullValue() && U.isNullValue())
return ConstantRange(getBitWidth());
- return ConstantRange(L, U);
+ return ConstantRange(std::move(L), std::move(U));
}
if (!CR.isWrappedSet()) {
@@ -525,13 +482,10 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const {
if (CR.Lower.ule(Upper) || Lower.ule(CR.Upper))
return ConstantRange(getBitWidth());
- APInt L = Lower, U = Upper;
- if (CR.Upper.ugt(U))
- U = CR.Upper;
- if (CR.Lower.ult(L))
- L = CR.Lower;
+ APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower;
+ APInt U = CR.Upper.ugt(Upper) ? CR.Upper : Upper;
- return ConstantRange(L, U);
+ return ConstantRange(std::move(L), std::move(U));
}
ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
@@ -558,14 +512,14 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
auto BW = getBitWidth();
APInt Min = APInt::getMinValue(BW).zextOrSelf(ResultBitWidth);
APInt Max = APInt::getMaxValue(BW).zextOrSelf(ResultBitWidth);
- return ConstantRange(Min, Max);
+ return ConstantRange(std::move(Min), std::move(Max));
}
case Instruction::SIToFP: {
// TODO: use input range if available
auto BW = getBitWidth();
APInt SMin = APInt::getSignedMinValue(BW).sextOrSelf(ResultBitWidth);
APInt SMax = APInt::getSignedMaxValue(BW).sextOrSelf(ResultBitWidth);
- return ConstantRange(SMin, SMax);
+ return ConstantRange(std::move(SMin), std::move(SMax));
}
case Instruction::FPTrunc:
case Instruction::FPExt:
@@ -577,10 +531,6 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
};
}
-/// zeroExtend - Return a new range in the specified integer type, which must
-/// be strictly larger than the current type. The returned range will
-/// correspond to the possible range of values as if the source range had been
-/// zero extended.
ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const {
if (isEmptySet()) return ConstantRange(DstTySize, /*isFullSet=*/false);
@@ -591,16 +541,13 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const {
APInt LowerExt(DstTySize, 0);
if (!Upper) // special case: [X, 0) -- not really wrapping around
LowerExt = Lower.zext(DstTySize);
- return ConstantRange(LowerExt, APInt::getOneBitSet(DstTySize, SrcTySize));
+ return ConstantRange(std::move(LowerExt),
+ APInt::getOneBitSet(DstTySize, SrcTySize));
}
return ConstantRange(Lower.zext(DstTySize), Upper.zext(DstTySize));
}
-/// signExtend - Return a new range in the specified integer type, which must
-/// be strictly larger than the current type. The returned range will
-/// correspond to the possible range of values as if the source range had been
-/// sign extended.
ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
if (isEmptySet()) return ConstantRange(DstTySize, /*isFullSet=*/false);
@@ -619,10 +566,6 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
return ConstantRange(Lower.sext(DstTySize), Upper.sext(DstTySize));
}
-/// truncate - Return a new range in the specified integer type, which must be
-/// strictly smaller than the current type. The returned range will
-/// correspond to the possible range of values as if the source range had been
-/// truncated to the specified type.
ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
assert(getBitWidth() > DstTySize && "Not a value truncation");
if (isEmptySet())
@@ -630,10 +573,6 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
if (isFullSet())
return ConstantRange(DstTySize, /*isFullSet=*/true);
- APInt MaxValue = APInt::getMaxValue(DstTySize).zext(getBitWidth());
- APInt MaxBitValue(getBitWidth(), 0);
- MaxBitValue.setBit(DstTySize);
-
APInt LowerDiv(Lower), UpperDiv(Upper);
ConstantRange Union(DstTySize, /*isFullSet=*/false);
@@ -641,41 +580,46 @@ ConstantRange ConstantRange::truncate(uint32_t DstTySize) const {
// We use the non-wrapped set code to analyze the [Lower, MaxValue) part, and
// then we do the union with [MaxValue, Upper)
if (isWrappedSet()) {
- // If Upper is greater than Max Value, it covers the whole truncated range.
- if (Upper.uge(MaxValue))
+ // If Upper is greater than or equal to MaxValue(DstTy), it covers the whole
+ // truncated range.
+ if (Upper.getActiveBits() > DstTySize ||
+ Upper.countTrailingOnes() == DstTySize)
return ConstantRange(DstTySize, /*isFullSet=*/true);
Union = ConstantRange(APInt::getMaxValue(DstTySize),Upper.trunc(DstTySize));
- UpperDiv = APInt::getMaxValue(getBitWidth());
+ UpperDiv.setAllBits();
// Union covers the MaxValue case, so return if the remaining range is just
- // MaxValue.
+ // MaxValue(DstTy).
if (LowerDiv == UpperDiv)
return Union;
}
// Chop off the most significant bits that are past the destination bitwidth.
- if (LowerDiv.uge(MaxValue)) {
- APInt Div(getBitWidth(), 0);
- APInt::udivrem(LowerDiv, MaxBitValue, Div, LowerDiv);
- UpperDiv = UpperDiv - MaxBitValue * Div;
+ if (LowerDiv.getActiveBits() > DstTySize) {
+ // Mask to just the significant bits and subtract from LowerDiv/UpperDiv.
+ APInt Adjust = LowerDiv & APInt::getBitsSetFrom(getBitWidth(), DstTySize);
+ LowerDiv -= Adjust;
+ UpperDiv -= Adjust;
}
- if (UpperDiv.ule(MaxValue))
+ unsigned UpperDivWidth = UpperDiv.getActiveBits();
+ if (UpperDivWidth <= DstTySize)
return ConstantRange(LowerDiv.trunc(DstTySize),
UpperDiv.trunc(DstTySize)).unionWith(Union);
// The truncated value wraps around. Check if we can do better than fullset.
- APInt UpperModulo = UpperDiv - MaxBitValue;
- if (UpperModulo.ult(LowerDiv))
- return ConstantRange(LowerDiv.trunc(DstTySize),
- UpperModulo.trunc(DstTySize)).unionWith(Union);
+ if (UpperDivWidth == DstTySize + 1) {
+ // Clear the MSB so that UpperDiv wraps around.
+ UpperDiv.clearBit(DstTySize);
+ if (UpperDiv.ult(LowerDiv))
+ return ConstantRange(LowerDiv.trunc(DstTySize),
+ UpperDiv.trunc(DstTySize)).unionWith(Union);
+ }
return ConstantRange(DstTySize, /*isFullSet=*/true);
}
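The masking above replaces the old udivrem against 2^DstTySize: subtracting `LowerDiv & getBitsSetFrom(DstTySize)` strips the same multiple of 2^DstTySize from both endpoints without a division, and an UpperDiv whose active width is exactly DstTySize + 1 has its MSB cleared so the wrap stays precise. A worked i16-to-i8 case (a sketch under those assumptions):

    #include "llvm/IR/ConstantRange.h"
    #include <cassert>
    using namespace llvm;

    void truncateExample() {
      // i16 [0xFF, 0x101) is {255, 256}; truncated to i8 it is {255, 0},
      // i.e. the wrapped range [255, 1).
      ConstantRange CR(APInt(16, 0xFF), APInt(16, 0x101));
      assert(CR.truncate(8) == ConstantRange(APInt(8, 0xFF), APInt(8, 1)));
    }
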
-/// zextOrTrunc - make this range have the bit width given by \p DstTySize. The
-/// value is zero extended, truncated, or left alone to make it that width.
ConstantRange ConstantRange::zextOrTrunc(uint32_t DstTySize) const {
unsigned SrcTySize = getBitWidth();
if (SrcTySize > DstTySize)
@@ -685,8 +629,6 @@ ConstantRange ConstantRange::zextOrTrunc(uint32_t DstTySize) const {
return *this;
}
-/// sextOrTrunc - make this range have the bit width given by \p DstTySize. The
-/// value is sign extended, truncated, or left alone to make it that width.
ConstantRange ConstantRange::sextOrTrunc(uint32_t DstTySize) const {
unsigned SrcTySize = getBitWidth();
if (SrcTySize > DstTySize)
@@ -739,17 +681,16 @@ ConstantRange::add(const ConstantRange &Other) const {
if (isFullSet() || Other.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
APInt NewLower = getLower() + Other.getLower();
APInt NewUpper = getUpper() + Other.getUpper() - 1;
if (NewLower == NewUpper)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- ConstantRange X = ConstantRange(NewLower, NewUpper);
- if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+ ConstantRange X = ConstantRange(std::move(NewLower), std::move(NewUpper));
+ if (X.isSizeStrictlySmallerThan(*this) ||
+ X.isSizeStrictlySmallerThan(Other))
// We've wrapped, therefore, full set.
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
-
return X;
}
@@ -773,17 +714,16 @@ ConstantRange::sub(const ConstantRange &Other) const {
if (isFullSet() || Other.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
APInt NewLower = getLower() - Other.getUpper() + 1;
APInt NewUpper = getUpper() - Other.getLower();
if (NewLower == NewUpper)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- ConstantRange X = ConstantRange(NewLower, NewUpper);
- if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+ ConstantRange X = ConstantRange(std::move(NewLower), std::move(NewUpper));
+ if (X.isSizeStrictlySmallerThan(*this) ||
+ X.isSizeStrictlySmallerThan(Other))
// We've wrapped, therefore, full set.
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
-
return X;
}
@@ -817,7 +757,8 @@ ConstantRange::multiply(const ConstantRange &Other) const {
// from one positive number to another which is as good as we can generate.
// In this case, skip the extra work of generating signed ranges which aren't
// going to be better than this range.
- if (!UR.isWrappedSet() && UR.getLower().isNonNegative())
+ if (!UR.isWrappedSet() &&
+ (UR.getUpper().isNonNegative() || UR.getUpper().isMinSignedValue()))
return UR;
// Now the signed range. Because we could be dealing with negative numbers
@@ -837,7 +778,7 @@ ConstantRange::multiply(const ConstantRange &Other) const {
ConstantRange Result_sext(std::min(L, Compare), std::max(L, Compare) + 1);
ConstantRange SR = Result_sext.truncate(getBitWidth());
- return UR.getSetSize().ult(SR.getSetSize()) ? UR : SR;
+ return UR.isSizeStrictlySmallerThan(SR) ? UR : SR;
}
ConstantRange
@@ -850,7 +791,7 @@ ConstantRange::smax(const ConstantRange &Other) const {
APInt NewU = APIntOps::smax(getSignedMax(), Other.getSignedMax()) + 1;
if (NewU == NewL)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(NewL, NewU);
+ return ConstantRange(std::move(NewL), std::move(NewU));
}
ConstantRange
@@ -863,7 +804,7 @@ ConstantRange::umax(const ConstantRange &Other) const {
APInt NewU = APIntOps::umax(getUnsignedMax(), Other.getUnsignedMax()) + 1;
if (NewU == NewL)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(NewL, NewU);
+ return ConstantRange(std::move(NewL), std::move(NewU));
}
ConstantRange
@@ -876,7 +817,7 @@ ConstantRange::smin(const ConstantRange &Other) const {
APInt NewU = APIntOps::smin(getSignedMax(), Other.getSignedMax()) + 1;
if (NewU == NewL)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(NewL, NewU);
+ return ConstantRange(std::move(NewL), std::move(NewU));
}
ConstantRange
@@ -889,12 +830,12 @@ ConstantRange::umin(const ConstantRange &Other) const {
APInt NewU = APIntOps::umin(getUnsignedMax(), Other.getUnsignedMax()) + 1;
if (NewU == NewL)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(NewL, NewU);
+ return ConstantRange(std::move(NewL), std::move(NewU));
}
ConstantRange
ConstantRange::udiv(const ConstantRange &RHS) const {
- if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0)
+ if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
if (RHS.isFullSet())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
@@ -902,13 +843,13 @@ ConstantRange::udiv(const ConstantRange &RHS) const {
APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax());
APInt RHS_umin = RHS.getUnsignedMin();
- if (RHS_umin == 0) {
+ if (RHS_umin.isNullValue()) {
// We want the lowest value in RHS excluding zero. Usually that would be 1
// except for a range in the form of [X, 1) in which case it would be X.
if (RHS.getUpper() == 1)
RHS_umin = RHS.getLower();
else
- RHS_umin = APInt(getBitWidth(), 1);
+ RHS_umin = 1;
}
APInt Upper = getUnsignedMax().udiv(RHS_umin) + 1;
@@ -918,7 +859,7 @@ ConstantRange::udiv(const ConstantRange &RHS) const {
if (Lower == Upper)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(Lower, Upper);
+ return ConstantRange(std::move(Lower), std::move(Upper));
}
ConstantRange
@@ -931,7 +872,7 @@ ConstantRange::binaryAnd(const ConstantRange &Other) const {
APInt umin = APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax());
if (umin.isAllOnesValue())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(APInt::getNullValue(getBitWidth()), umin + 1);
+ return ConstantRange(APInt::getNullValue(getBitWidth()), std::move(umin) + 1);
}
ConstantRange
@@ -942,9 +883,9 @@ ConstantRange::binaryOr(const ConstantRange &Other) const {
// TODO: replace this with something less conservative
APInt umax = APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin());
- if (umax.isMinValue())
+ if (umax.isNullValue())
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(umax, APInt::getNullValue(getBitWidth()));
+ return ConstantRange(std::move(umax), APInt::getNullValue(getBitWidth()));
}
ConstantRange
@@ -952,29 +893,33 @@ ConstantRange::shl(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
- APInt min = getUnsignedMin().shl(Other.getUnsignedMin());
- APInt max = getUnsignedMax().shl(Other.getUnsignedMax());
+ APInt max = getUnsignedMax();
+ APInt Other_umax = Other.getUnsignedMax();
- // there's no overflow!
- APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros());
- if (Zeros.ugt(Other.getUnsignedMax()))
- return ConstantRange(min, max + 1);
+ // there's overflow!
+ if (Other_umax.uge(max.countLeadingZeros()))
+ return ConstantRange(getBitWidth(), /*isFullSet=*/true);
// FIXME: implement the other tricky cases
- return ConstantRange(getBitWidth(), /*isFullSet=*/true);
+
+ APInt min = getUnsignedMin();
+ min <<= Other.getUnsignedMin();
+ max <<= Other_umax;
+
+ return ConstantRange(std::move(min), std::move(max) + 1);
}
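The reordered test bails to the full set as soon as the largest shift amount reaches the leading-zero count of the largest value, since such a shift can push set bits past the MSB; otherwise the bounds shift in place (the `<<=` forms reuse the APInt storage). A sketch under the same i8 assumptions:

    #include "llvm/IR/ConstantRange.h"
    #include <cassert>
    using namespace llvm;

    void shlExample() {
      ConstantRange V(APInt(8, 1), APInt(8, 3)); // values {1, 2}
      ConstantRange S(APInt(8, 1), APInt(8, 2)); // shift amount {1}
      // umax(S) = 1 < countLeadingZeros(2) = 6, so no overflow: [2, 5).
      assert(V.shl(S) == ConstantRange(APInt(8, 2), APInt(8, 5)));
    }
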
ConstantRange
ConstantRange::lshr(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return ConstantRange(getBitWidth(), /*isFullSet=*/false);
-
- APInt max = getUnsignedMax().lshr(Other.getUnsignedMin());
+
+ APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()) + 1;
APInt min = getUnsignedMin().lshr(Other.getUnsignedMax());
- if (min == max + 1)
+ if (min == max)
return ConstantRange(getBitWidth(), /*isFullSet=*/true);
- return ConstantRange(min, max + 1);
+ return ConstantRange(std::move(min), std::move(max));
}
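Computing `max + 1` up front lets the degenerate check compare `min == max` directly; the result is unchanged, e.g. (sketch):

    #include "llvm/IR/ConstantRange.h"
    #include <cassert>
    using namespace llvm;

    void lshrExample() {
      ConstantRange V(APInt(8, 16), APInt(8, 33)); // values 16..32
      ConstantRange S(APInt(8, 2), APInt(8, 3));   // shift amount {2}
      // [16 >> 2, (32 >> 2) + 1) = [4, 9).
      assert(V.lshr(S) == ConstantRange(APInt(8, 4), APInt(8, 9)));
    }
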
ConstantRange ConstantRange::inverse() const {
@@ -985,8 +930,6 @@ ConstantRange ConstantRange::inverse() const {
return ConstantRange(Upper, Lower);
}
-/// print - Print out the bounds to a stream...
-///
void ConstantRange::print(raw_ostream &OS) const {
if (isFullSet())
OS << "full-set";
@@ -996,11 +939,11 @@ void ConstantRange::print(raw_ostream &OS) const {
OS << "[" << Lower << "," << Upper << ")";
}
-/// dump - Allow printing from a debugger easily...
-///
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ConstantRange::dump() const {
print(dbgs());
}
+#endif
ConstantRange llvm::getConstantRangeFromMetadata(const MDNode &Ranges) {
const unsigned NumRanges = Ranges.getNumOperands() / 2;
diff --git a/contrib/llvm/lib/IR/Constants.cpp b/contrib/llvm/lib/IR/Constants.cpp
index 533b924..f56fe70 100644
--- a/contrib/llvm/lib/IR/Constants.cpp
+++ b/contrib/llvm/lib/IR/Constants.cpp
@@ -30,17 +30,13 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
-#include <cstdarg>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
// Constant Class
//===----------------------------------------------------------------------===//
-void Constant::anchor() { }
-
-void ConstantData::anchor() {}
-
bool Constant::isNegativeZeroValue() const {
// Floating point values have an explicit -0.0 value.
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
@@ -48,8 +44,8 @@ bool Constant::isNegativeZeroValue() const {
// Equivalent for a vector of -0.0's.
if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
- if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
- if (SplatCFP && SplatCFP->isZero() && SplatCFP->isNegative())
+ if (CV->getElementType()->isFloatingPointTy() && CV->isSplat())
+ if (CV->getElementAsAPFloat(0).isNegZero())
return true;
if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
@@ -74,8 +70,8 @@ bool Constant::isZeroValue() const {
// Equivalent for a vector of -0.0's.
if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
- if (ConstantFP *SplatCFP = dyn_cast_or_null<ConstantFP>(CV->getSplatValue()))
- if (SplatCFP && SplatCFP->isZero())
+ if (CV->getElementType()->isFloatingPointTy() && CV->isSplat())
+ if (CV->getElementAsAPFloat(0).isZero())
return true;
if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
@@ -117,9 +113,13 @@ bool Constant::isAllOnesValue() const {
return Splat->isAllOnesValue();
// Check for constant vectors which are splats of -1 values.
- if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
- if (Constant *Splat = CV->getSplatValue())
- return Splat->isAllOnesValue();
+ if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this)) {
+ if (CV->isSplat()) {
+ if (CV->getElementType()->isFloatingPointTy())
+ return CV->getElementAsAPFloat(0).bitcastToAPInt().isAllOnesValue();
+ return CV->getElementAsAPInt(0).isAllOnesValue();
+ }
+ }
return false;
}
@@ -131,7 +131,7 @@ bool Constant::isOneValue() const {
// Check for FP which are bitcasted from 1 integers
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
- return CFP->getValueAPF().bitcastToAPInt() == 1;
+ return CFP->getValueAPF().bitcastToAPInt().isOneValue();
// Check for constant vectors which are splats of 1 values.
if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
@@ -139,9 +139,13 @@ bool Constant::isOneValue() const {
return Splat->isOneValue();
// Check for constant vectors which are splats of 1 values.
- if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
- if (Constant *Splat = CV->getSplatValue())
- return Splat->isOneValue();
+ if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this)) {
+ if (CV->isSplat()) {
+ if (CV->getElementType()->isFloatingPointTy())
+ return CV->getElementAsAPFloat(0).bitcastToAPInt().isOneValue();
+ return CV->getElementAsAPInt(0).isOneValue();
+ }
+ }
return false;
}
@@ -161,9 +165,13 @@ bool Constant::isMinSignedValue() const {
return Splat->isMinSignedValue();
// Check for constant vectors which are splats of INT_MIN values.
- if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
- if (Constant *Splat = CV->getSplatValue())
- return Splat->isMinSignedValue();
+ if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this)) {
+ if (CV->isSplat()) {
+ if (CV->getElementType()->isFloatingPointTy())
+ return CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue();
+ return CV->getElementAsAPInt(0).isMinSignedValue();
+ }
+ }
return false;
}
@@ -183,9 +191,13 @@ bool Constant::isNotMinSignedValue() const {
return Splat->isNotMinSignedValue();
// Check for constant vectors which are splats of INT_MIN values.
- if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
- if (Constant *Splat = CV->getSplatValue())
- return Splat->isNotMinSignedValue();
+ if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this)) {
+ if (CV->isSplat()) {
+ if (CV->getElementType()->isFloatingPointTy())
+ return !CV->getElementAsAPFloat(0).bitcastToAPInt().isMinSignedValue();
+ return !CV->getElementAsAPInt(0).isMinSignedValue();
+ }
+ }
// It *may* contain INT_MIN, we can't tell.
return false;
@@ -496,8 +508,6 @@ void Constant::removeDeadConstantUsers() const {
// ConstantInt
//===----------------------------------------------------------------------===//
-void ConstantInt::anchor() { }
-
ConstantInt::ConstantInt(IntegerType *Ty, const APInt &V)
: ConstantData(Ty, ConstantIntVal), Val(V) {
assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
@@ -518,27 +528,19 @@ ConstantInt *ConstantInt::getFalse(LLVMContext &Context) {
}
Constant *ConstantInt::getTrue(Type *Ty) {
- VectorType *VTy = dyn_cast<VectorType>(Ty);
- if (!VTy) {
- assert(Ty->isIntegerTy(1) && "True must be i1 or vector of i1.");
- return ConstantInt::getTrue(Ty->getContext());
- }
- assert(VTy->getElementType()->isIntegerTy(1) &&
- "True must be vector of i1 or i1.");
- return ConstantVector::getSplat(VTy->getNumElements(),
- ConstantInt::getTrue(Ty->getContext()));
+ assert(Ty->isIntOrIntVectorTy(1) && "Type not i1 or vector of i1.");
+ ConstantInt *TrueC = ConstantInt::getTrue(Ty->getContext());
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ return ConstantVector::getSplat(VTy->getNumElements(), TrueC);
+ return TrueC;
}
Constant *ConstantInt::getFalse(Type *Ty) {
- VectorType *VTy = dyn_cast<VectorType>(Ty);
- if (!VTy) {
- assert(Ty->isIntegerTy(1) && "False must be i1 or vector of i1.");
- return ConstantInt::getFalse(Ty->getContext());
- }
- assert(VTy->getElementType()->isIntegerTy(1) &&
- "False must be vector of i1 or i1.");
- return ConstantVector::getSplat(VTy->getNumElements(),
- ConstantInt::getFalse(Ty->getContext()));
+ assert(Ty->isIntOrIntVectorTy(1) && "Type not i1 or vector of i1.");
+ ConstantInt *FalseC = ConstantInt::getFalse(Ty->getContext());
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ return ConstantVector::getSplat(VTy->getNumElements(), FalseC);
+ return FalseC;
}
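Both getters now assert isIntOrIntVectorTy(1) once and splat the scalar constant when handed a vector type. A minimal sketch, assuming an existing LLVMContext:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    void boolConstants(LLVMContext &Ctx) {
      Constant *T = ConstantInt::getTrue(Type::getInt1Ty(Ctx)); // i1 true
      Constant *V = ConstantInt::getTrue(
          VectorType::get(Type::getInt1Ty(Ctx), 4)); // <4 x i1> splat of true
      (void)T; (void)V;
    }
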
// Get a ConstantInt from an APInt.
@@ -618,8 +620,6 @@ static const fltSemantics *TypeToFloatSemantics(Type *Ty) {
return &APFloat::PPCDoubleDouble();
}
-void ConstantFP::anchor() { }
-
Constant *ConstantFP::get(Type *Ty, double V) {
LLVMContext &Context = Ty->getContext();
@@ -732,7 +732,7 @@ bool ConstantFP::isExactlyValue(const APFloat &V) const {
/// Remove the constant from the constant table.
void ConstantFP::destroyConstantImpl() {
- llvm_unreachable("You can't ConstantInt->destroyConstantImpl()!");
+ llvm_unreachable("You can't ConstantFP->destroyConstantImpl()!");
}
//===----------------------------------------------------------------------===//
@@ -974,16 +974,6 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
}
-Constant *ConstantStruct::get(StructType *T, ...) {
- va_list ap;
- SmallVector<Constant*, 8> Values;
- va_start(ap, T);
- while (Constant *Val = va_arg(ap, llvm::Constant*))
- Values.push_back(Val);
- va_end(ap);
- return get(T, Values);
-}
-
ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
: ConstantAggregate(T, ConstantVectorVal, V) {
assert(V.size() == T->getNumElements() &&
@@ -1027,7 +1017,7 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
return getSequenceIfElementsMatch<ConstantDataVector>(C, V);
// Otherwise, the element type isn't compatible with ConstantDataVector, or
- // the operand list constants a ConstantExpr or something else strange.
+ // the operand list contains a ConstantExpr or something else strange.
return nullptr;
}
@@ -1183,21 +1173,14 @@ bool ConstantInt::isValueValidForType(Type *Ty, uint64_t Val) {
unsigned NumBits = Ty->getIntegerBitWidth(); // assert okay
if (Ty->isIntegerTy(1))
return Val == 0 || Val == 1;
- if (NumBits >= 64)
- return true; // always true, has to fit in largest type
- uint64_t Max = (1ll << NumBits) - 1;
- return Val <= Max;
+ return isUIntN(NumBits, Val);
}
bool ConstantInt::isValueValidForType(Type *Ty, int64_t Val) {
unsigned NumBits = Ty->getIntegerBitWidth();
if (Ty->isIntegerTy(1))
return Val == 0 || Val == 1 || Val == -1;
- if (NumBits >= 64)
- return true; // always true, has to fit in largest type
- int64_t Min = -(1ll << (NumBits-1));
- int64_t Max = (1ll << (NumBits-1)) - 1;
- return (Val >= Min && Val <= Max);
+ return isIntN(NumBits, Val);
}
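isUIntN/isIntN from Support/MathExtras.h fold the old NumBits >= 64 guard and the hand-rolled Min/Max shifts into one call each; behaviour is unchanged. For example (sketch):

    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    #include <cstdint>
    using namespace llvm;

    void validityExamples() {
      assert(isUIntN(8, 255) && !isUIntN(8, 256)); // unsigned i8 bounds
      assert(isIntN(8, -128) && !isIntN(8, 128));  // signed i8 bounds
      assert(isUIntN(64, UINT64_MAX));             // 64-bit case still true
    }
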
bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) {
@@ -1668,9 +1651,9 @@ Constant *ConstantExpr::getFPToSI(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy,
bool OnlyIfReduced) {
- assert(C->getType()->getScalarType()->isPointerTy() &&
+ assert(C->getType()->isPtrOrPtrVectorTy() &&
"PtrToInt source must be pointer or pointer vector");
- assert(DstTy->getScalarType()->isIntegerTy() &&
+ assert(DstTy->isIntOrIntVectorTy() &&
"PtrToInt destination must be integer or integer vector");
assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
if (isa<VectorType>(C->getType()))
@@ -1681,9 +1664,9 @@ Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy,
Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy,
bool OnlyIfReduced) {
- assert(C->getType()->getScalarType()->isIntegerTy() &&
+ assert(C->getType()->isIntOrIntVectorTy() &&
"IntToPtr source must be integer or integer vector");
- assert(DstTy->getScalarType()->isPointerTy() &&
+ assert(DstTy->isPtrOrPtrVectorTy() &&
"IntToPtr destination must be a pointer or pointer vector");
assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
if (isa<VectorType>(C->getType()))
@@ -1818,8 +1801,7 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) {
Constant *ConstantExpr::getAlignOf(Type* Ty) {
// alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
// Note that a non-inbounds gep is used, as null isn't within any object.
- Type *AligningTy =
- StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr);
+ Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty);
Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0));
Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
@@ -1948,8 +1930,8 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS,
Constant *RHS, bool OnlyIfReduced) {
assert(LHS->getType() == RHS->getType());
- assert(pred >= ICmpInst::FIRST_ICMP_PREDICATE &&
- pred <= ICmpInst::LAST_ICMP_PREDICATE && "Invalid ICmp Predicate");
+ assert(CmpInst::isIntPredicate((CmpInst::Predicate)pred) &&
+ "Invalid ICmp Predicate");
if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
return FC; // Fold a few common cases...
@@ -1973,7 +1955,8 @@ Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS,
Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS,
Constant *RHS, bool OnlyIfReduced) {
assert(LHS->getType() == RHS->getType());
- assert(pred <= FCmpInst::LAST_FCMP_PREDICATE && "Invalid FCmp Predicate");
+ assert(CmpInst::isFPPredicate((CmpInst::Predicate)pred) &&
+ "Invalid FCmp Predicate");
if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
return FC; // Fold a few common cases...
@@ -2285,9 +2268,6 @@ Type *GetElementPtrConstantExpr::getResultElementType() const {
//===----------------------------------------------------------------------===//
// ConstantData* implementations
-void ConstantDataArray::anchor() {}
-void ConstantDataVector::anchor() {}
-
Type *ConstantDataSequential::getElementType() const {
return getType()->getElementType();
}
@@ -2416,32 +2396,32 @@ void ConstantDataSequential::destroyConstantImpl() {
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint8_t> Elts) {
Type *Ty = ArrayType::get(Type::getInt8Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 1), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
Type *Ty = ArrayType::get(Type::getInt16Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 2), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
Type *Ty = ArrayType::get(Type::getInt32Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
Type *Ty = ArrayType::get(Type::getInt64Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<float> Elts) {
Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataArray::get(LLVMContext &Context, ArrayRef<double> Elts) {
Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
/// getFP() constructors - Return a constant with array type with an element
@@ -2453,27 +2433,26 @@ Constant *ConstantDataArray::getFP(LLVMContext &Context,
ArrayRef<uint16_t> Elts) {
Type *Ty = ArrayType::get(Type::getHalfTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 2), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 2), Ty);
}
Constant *ConstantDataArray::getFP(LLVMContext &Context,
ArrayRef<uint32_t> Elts) {
Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 4), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataArray::getFP(LLVMContext &Context,
ArrayRef<uint64_t> Elts) {
Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
Constant *ConstantDataArray::getString(LLVMContext &Context,
StringRef Str, bool AddNull) {
if (!AddNull) {
const uint8_t *Data = reinterpret_cast<const uint8_t *>(Str.data());
- return get(Context, makeArrayRef(const_cast<uint8_t *>(Data),
- Str.size()));
+ return get(Context, makeArrayRef(Data, Str.size()));
}
SmallVector<uint8_t, 64> ElementVals;
@@ -2488,32 +2467,32 @@ Constant *ConstantDataArray::getString(LLVMContext &Context,
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint8_t> Elts){
Type *Ty = VectorType::get(Type::getInt8Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*1), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 1), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
Type *Ty = VectorType::get(Type::getInt16Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*2), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 2), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
Type *Ty = VectorType::get(Type::getInt32Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
Type *Ty = VectorType::get(Type::getInt64Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*8), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<float> Elts) {
Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size()*4), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) {
Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
/// getFP() constructors - Return a constant with vector type with an element
@@ -2525,19 +2504,19 @@ Constant *ConstantDataVector::getFP(LLVMContext &Context,
ArrayRef<uint16_t> Elts) {
Type *Ty = VectorType::get(Type::getHalfTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 2), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 2), Ty);
}
Constant *ConstantDataVector::getFP(LLVMContext &Context,
ArrayRef<uint32_t> Elts) {
Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 4), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataVector::getFP(LLVMContext &Context,
ArrayRef<uint64_t> Elts) {
Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
- return getImpl(StringRef(const_cast<char *>(Data), Elts.size() * 8), Ty);
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
@@ -2592,13 +2571,41 @@ uint64_t ConstantDataSequential::getElementAsInteger(unsigned Elt) const {
switch (getElementType()->getIntegerBitWidth()) {
default: llvm_unreachable("Invalid bitwidth for CDS");
case 8:
- return *const_cast<uint8_t *>(reinterpret_cast<const uint8_t *>(EltPtr));
+ return *reinterpret_cast<const uint8_t *>(EltPtr);
case 16:
- return *const_cast<uint16_t *>(reinterpret_cast<const uint16_t *>(EltPtr));
+ return *reinterpret_cast<const uint16_t *>(EltPtr);
case 32:
- return *const_cast<uint32_t *>(reinterpret_cast<const uint32_t *>(EltPtr));
+ return *reinterpret_cast<const uint32_t *>(EltPtr);
case 64:
- return *const_cast<uint64_t *>(reinterpret_cast<const uint64_t *>(EltPtr));
+ return *reinterpret_cast<const uint64_t *>(EltPtr);
+ }
+}
+
+APInt ConstantDataSequential::getElementAsAPInt(unsigned Elt) const {
+ assert(isa<IntegerType>(getElementType()) &&
+ "Accessor can only be used when element is an integer");
+ const char *EltPtr = getElementPointer(Elt);
+
+ // The data is stored in host byte order, make sure to cast back to the right
+ // type to load with the right endianness.
+ switch (getElementType()->getIntegerBitWidth()) {
+ default: llvm_unreachable("Invalid bitwidth for CDS");
+ case 8: {
+ auto EltVal = *reinterpret_cast<const uint8_t *>(EltPtr);
+ return APInt(8, EltVal);
+ }
+ case 16: {
+ auto EltVal = *reinterpret_cast<const uint16_t *>(EltPtr);
+ return APInt(16, EltVal);
+ }
+ case 32: {
+ auto EltVal = *reinterpret_cast<const uint32_t *>(EltPtr);
+ return APInt(32, EltVal);
+ }
+ case 64: {
+ auto EltVal = *reinterpret_cast<const uint64_t *>(EltPtr);
+ return APInt(64, EltVal);
+ }
}
}
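The new accessor returns the element at its native width, so callers that want an APInt no longer round-trip through getElementAsInteger()'s uint64_t. A usage sketch (the helper name is hypothetical):

    #include "llvm/IR/Constants.h"
    using namespace llvm;

    // Fetch element 0 at its exact bit width.
    APInt firstElement(const ConstantDataSequential *CDS) {
      return CDS->getElementAsAPInt(0); // result width == element bit width
    }
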
@@ -2626,16 +2633,13 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
float ConstantDataSequential::getElementAsFloat(unsigned Elt) const {
assert(getElementType()->isFloatTy() &&
"Accessor can only be used when element is a 'float'");
- const float *EltPtr = reinterpret_cast<const float *>(getElementPointer(Elt));
- return *const_cast<float *>(EltPtr);
+ return *reinterpret_cast<const float *>(getElementPointer(Elt));
}
double ConstantDataSequential::getElementAsDouble(unsigned Elt) const {
assert(getElementType()->isDoubleTy() &&
"Accessor can only be used when element is a 'float'");
- const double *EltPtr =
- reinterpret_cast<const double *>(getElementPointer(Elt));
- return *const_cast<double *>(EltPtr);
+ return *reinterpret_cast<const double *>(getElementPointer(Elt));
}
Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
@@ -2646,8 +2650,8 @@ Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
return ConstantInt::get(getElementType(), getElementAsInteger(Elt));
}
-bool ConstantDataSequential::isString() const {
- return isa<ArrayType>(getType()) && getElementType()->isIntegerTy(8);
+bool ConstantDataSequential::isString(unsigned CharSize) const {
+ return isa<ArrayType>(getType()) && getElementType()->isIntegerTy(CharSize);
}
bool ConstantDataSequential::isCString() const {
@@ -2663,17 +2667,21 @@ bool ConstantDataSequential::isCString() const {
return Str.drop_back().find(0) == StringRef::npos;
}
-Constant *ConstantDataVector::getSplatValue() const {
+bool ConstantDataVector::isSplat() const {
const char *Base = getRawDataValues().data();
// Compare elements 1+ to the 0'th element.
unsigned EltSize = getElementByteSize();
for (unsigned i = 1, e = getNumElements(); i != e; ++i)
if (memcmp(Base, Base+i*EltSize, EltSize))
- return nullptr;
+ return false;
+ return true;
+}
+
+Constant *ConstantDataVector::getSplatValue() const {
// If they're all the same, return the 0th one as a representative.
- return getElementAsConstant(0);
+ return isSplat() ? getElementAsConstant(0) : nullptr;
}
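Splitting the memcmp scan out as isSplat() lets predicates such as isNegativeZeroValue() above ask "are all elements equal?" without materialising a Constant; getSplatValue() keeps its old contract on top of it. Sketch:

    #include "llvm/IR/Constants.h"
    using namespace llvm;

    bool isAllSameBits(const ConstantDataVector *CDV) {
      return CDV->isSplat();       // raw byte comparison only
    }
    Constant *splatOrNull(const ConstantDataVector *CDV) {
      return CDV->getSplatValue(); // element 0, or nullptr if not a splat
    }
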
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/IR/ConstantsContext.h b/contrib/llvm/lib/IR/ConstantsContext.h
index eda751d..6585304 100644
--- a/contrib/llvm/lib/IR/ConstantsContext.h
+++ b/contrib/llvm/lib/IR/ConstantsContext.h
@@ -22,6 +22,7 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
@@ -43,8 +44,6 @@ namespace llvm {
/// UnaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement unary constant exprs.
class UnaryConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
UnaryConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
: ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
@@ -56,16 +55,12 @@ public:
return User::operator new(s, 1);
}
- void *operator new(size_t, unsigned) = delete;
-
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
};
/// BinaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement binary constant exprs.
class BinaryConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags)
@@ -80,8 +75,6 @@ public:
return User::operator new(s, 2);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
};
@@ -89,8 +82,6 @@ public:
/// SelectConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement select constant exprs.
class SelectConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
SelectConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(C2->getType(), Instruction::Select, &Op<0>(), 3) {
@@ -104,8 +95,6 @@ public:
return User::operator new(s, 3);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
};
@@ -114,8 +103,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// extractelement constant exprs.
class ExtractElementConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
ExtractElementConstantExpr(Constant *C1, Constant *C2)
: ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
@@ -129,8 +116,6 @@ public:
return User::operator new(s, 2);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
};
@@ -139,8 +124,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// insertelement constant exprs.
class InsertElementConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(C1->getType(), Instruction::InsertElement,
@@ -155,8 +138,6 @@ public:
return User::operator new(s, 3);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
};
@@ -165,8 +146,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// shufflevector constant exprs.
class ShuffleVectorConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
ShuffleVectorConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(VectorType::get(
@@ -184,8 +163,6 @@ public:
return User::operator new(s, 3);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
};
@@ -194,8 +171,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// extractvalue constant exprs.
class ExtractValueConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
ExtractValueConstantExpr(Constant *Agg, ArrayRef<unsigned> IdxList,
Type *DestTy)
@@ -209,8 +184,6 @@ public:
return User::operator new(s, 1);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Indices - These identify which value to extract.
const SmallVector<unsigned, 4> Indices;
@@ -229,8 +202,6 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// insertvalue constant exprs.
class InsertValueConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
InsertValueConstantExpr(Constant *Agg, Constant *Val,
ArrayRef<unsigned> IdxList, Type *DestTy)
@@ -245,8 +216,6 @@ public:
return User::operator new(s, 2);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Indices - These identify the position for the insertion.
const SmallVector<unsigned, 4> Indices;
@@ -270,8 +239,6 @@ class GetElementPtrConstantExpr : public ConstantExpr {
GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList, Type *DestTy);
- void anchor() override;
-
public:
static GetElementPtrConstantExpr *Create(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList,
@@ -300,8 +267,6 @@ public:
// behind the scenes to implement ICmp and FCmp constant expressions. This is
// needed in order to store the predicate value for these instructions.
class CompareConstantExpr : public ConstantExpr {
- void anchor() override;
-
public:
unsigned short predicate;
CompareConstantExpr(Type *ty, Instruction::OtherOps opc,
@@ -316,8 +281,6 @@ public:
return User::operator new(s, 2);
}
- void *operator new(size_t, unsigned) = delete;
-
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -387,31 +350,34 @@ struct ConstantExprKeyType;
template <class ConstantClass> struct ConstantInfo;
template <> struct ConstantInfo<ConstantExpr> {
- typedef ConstantExprKeyType ValType;
- typedef Type TypeClass;
+ using ValType = ConstantExprKeyType;
+ using TypeClass = Type;
};
template <> struct ConstantInfo<InlineAsm> {
- typedef InlineAsmKeyType ValType;
- typedef PointerType TypeClass;
+ using ValType = InlineAsmKeyType;
+ using TypeClass = PointerType;
};
template <> struct ConstantInfo<ConstantArray> {
- typedef ConstantAggrKeyType<ConstantArray> ValType;
- typedef ArrayType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantArray>;
+ using TypeClass = ArrayType;
};
template <> struct ConstantInfo<ConstantStruct> {
- typedef ConstantAggrKeyType<ConstantStruct> ValType;
- typedef StructType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantStruct>;
+ using TypeClass = StructType;
};
template <> struct ConstantInfo<ConstantVector> {
- typedef ConstantAggrKeyType<ConstantVector> ValType;
- typedef VectorType TypeClass;
+ using ValType = ConstantAggrKeyType<ConstantVector>;
+ using TypeClass = VectorType;
};
template <class ConstantClass> struct ConstantAggrKeyType {
ArrayRef<Constant *> Operands;
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands) : Operands(Operands) {}
+
ConstantAggrKeyType(ArrayRef<Constant *> Operands, const ConstantClass *)
: Operands(Operands) {}
+
ConstantAggrKeyType(const ConstantClass *C,
SmallVectorImpl<Constant *> &Storage) {
assert(Storage.empty() && "Expected empty storage");
@@ -437,7 +403,8 @@ template <class ConstantClass> struct ConstantAggrKeyType {
return hash_combine_range(Operands.begin(), Operands.end());
}
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+
ConstantClass *create(TypeClass *Ty) const {
return new (Operands.size()) ConstantClass(Ty, Operands);
}
@@ -457,6 +424,7 @@ struct InlineAsmKeyType {
: AsmString(AsmString), Constraints(Constraints), FTy(FTy),
HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack),
AsmDialect(AsmDialect) {}
+
InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl<Constant *> &)
: AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()),
FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()),
@@ -483,7 +451,8 @@ struct InlineAsmKeyType {
AsmDialect, FTy);
}
- typedef ConstantInfo<InlineAsm>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<InlineAsm>::TypeClass;
+
InlineAsm *create(TypeClass *Ty) const {
assert(PointerType::getUnqual(FTy) == Ty);
return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects,
@@ -507,11 +476,13 @@ struct ConstantExprKeyType {
: Opcode(Opcode), SubclassOptionalData(SubclassOptionalData),
SubclassData(SubclassData), Ops(Ops), Indexes(Indexes),
ExplicitTy(ExplicitTy) {}
+
ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE)
: Opcode(CE->getOpcode()),
SubclassOptionalData(CE->getRawSubclassOptionalData()),
SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands),
Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()) {}
+
ConstantExprKeyType(const ConstantExpr *CE,
SmallVectorImpl<Constant *> &Storage)
: Opcode(CE->getOpcode()),
@@ -553,7 +524,8 @@ struct ConstantExprKeyType {
hash_combine_range(Indexes.begin(), Indexes.end()));
}
- typedef ConstantInfo<ConstantExpr>::TypeClass TypeClass;
+ using TypeClass = ConstantInfo<ConstantExpr>::TypeClass;
+
ConstantExpr *create(TypeClass *Ty) const {
switch (Opcode) {
default:
@@ -594,16 +566,17 @@ struct ConstantExprKeyType {
template <class ConstantClass> class ConstantUniqueMap {
public:
- typedef typename ConstantInfo<ConstantClass>::ValType ValType;
- typedef typename ConstantInfo<ConstantClass>::TypeClass TypeClass;
- typedef std::pair<TypeClass *, ValType> LookupKey;
+ using ValType = typename ConstantInfo<ConstantClass>::ValType;
+ using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
+ using LookupKey = std::pair<TypeClass *, ValType>;
/// Key and hash together, so that we compute the hash only once and reuse it.
- typedef std::pair<unsigned, LookupKey> LookupKeyHashed;
+ using LookupKeyHashed = std::pair<unsigned, LookupKey>;
private:
struct MapInfo {
- typedef DenseMapInfo<ConstantClass *> ConstantClassInfo;
+ using ConstantClassInfo = DenseMapInfo<ConstantClass *>;
+
static inline ConstantClass *getEmptyKey() {
return ConstantClassInfo::getEmptyKey();
}
@@ -643,7 +616,7 @@ private:
};
public:
- typedef DenseSet<ConstantClass *, MapInfo> MapTy;
+ using MapTy = DenseSet<ConstantClass *, MapInfo>;
private:
MapTy Map;
diff --git a/contrib/llvm/lib/IR/Core.cpp b/contrib/llvm/lib/IR/Core.cpp
index 00bb476..aba7704 100644
--- a/contrib/llvm/lib/IR/Core.cpp
+++ b/contrib/llvm/lib/IR/Core.cpp
@@ -14,9 +14,7 @@
#include "llvm-c/Core.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Attributes.h"
-#include "AttributeSetNode.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -51,6 +49,7 @@ void llvm::initializeCore(PassRegistry &Registry) {
initializePrintModulePassWrapperPass(Registry);
initializePrintFunctionPassWrapperPass(Registry);
initializePrintBasicBlockPassPass(Registry);
+ initializeSafepointIRVerifierPass(Registry);
initializeVerifierLegacyPassPass(Registry);
}
@@ -259,7 +258,8 @@ void LLVMSetTarget(LLVMModuleRef M, const char *Triple) {
}
void LLVMDumpModule(LLVMModuleRef M) {
- unwrap(M)->dump();
+ unwrap(M)->print(errs(), nullptr,
+ /*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/true);
}
LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
@@ -358,9 +358,11 @@ LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
return wrap(&unwrap(Ty)->getContext());
}
-void LLVMDumpType(LLVMTypeRef Ty) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LLVMDumpType(LLVMTypeRef Ty) {
return unwrap(Ty)->dump();
}
+#endif
char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
std::string buf;
@@ -566,6 +568,14 @@ LLVMTypeRef LLVMGetTypeByName(LLVMModuleRef M, const char *Name) {
/*--.. Operations on array, pointer, and vector types (sequence types) .....--*/
+void LLVMGetSubtypes(LLVMTypeRef Tp, LLVMTypeRef *Arr) {
+ int i = 0;
+ for (auto *T : unwrap(Tp)->subtypes()) {
+ Arr[i] = wrap(T);
+ i++;
+ }
+}
+
LLVMTypeRef LLVMArrayType(LLVMTypeRef ElementType, unsigned ElementCount) {
return wrap(ArrayType::get(unwrap(ElementType), ElementCount));
}
@@ -585,6 +595,10 @@ LLVMTypeRef LLVMGetElementType(LLVMTypeRef WrappedTy) {
return wrap(cast<SequentialType>(Ty)->getElementType());
}
+unsigned LLVMGetNumContainedTypes(LLVMTypeRef Tp) {
+ return unwrap(Tp)->getNumContainedTypes();
+}
+
unsigned LLVMGetArrayLength(LLVMTypeRef ArrayTy) {
return unwrap<ArrayType>(ArrayTy)->getNumElements();
}
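LLVMGetSubtypes fills a caller-allocated array, and LLVMGetNumContainedTypes tells the caller how large that array must be. A sketch of the intended pairing (C++ caller, helper name hypothetical):

    #include "llvm-c/Core.h"
    #include <vector>

    std::vector<LLVMTypeRef> subtypesOf(LLVMTypeRef Ty) {
      std::vector<LLVMTypeRef> Subs(LLVMGetNumContainedTypes(Ty));
      LLVMGetSubtypes(Ty, Subs.data()); // fills Subs in declaration order
      return Subs;
    }
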
@@ -640,8 +654,8 @@ void LLVMSetValueName(LLVMValueRef Val, const char *Name) {
unwrap(Val)->setName(Name);
}
-void LLVMDumpValue(LLVMValueRef Val) {
- unwrap(Val)->dump();
+LLVM_DUMP_METHOD void LLVMDumpValue(LLVMValueRef Val) {
+ unwrap(Val)->print(errs(), /*IsForDebug=*/true);
}
char* LLVMPrintValueToString(LLVMValueRef Val) {
@@ -861,6 +875,19 @@ LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
return LLVMMDNodeInContext(LLVMGetGlobalContext(), Vals, Count);
}
+LLVMValueRef LLVMMetadataAsValue(LLVMContextRef C, LLVMMetadataRef MD) {
+ return wrap(MetadataAsValue::get(*unwrap(C), unwrap(MD)));
+}
+
+LLVMMetadataRef LLVMValueAsMetadata(LLVMValueRef Val) {
+ auto *V = unwrap(Val);
+ if (auto *C = dyn_cast<Constant>(V))
+ return wrap(ConstantAsMetadata::get(C));
+ if (auto *MAV = dyn_cast<MetadataAsValue>(V))
+ return wrap(MAV->getMetadata());
+ return wrap(ValueAsMetadata::get(V));
+}
+
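These two entry points bridge the Value and Metadata hierarchies for C clients; LLVMValueAsMetadata picks ConstantAsMetadata, the already-wrapped metadata, or a plain ValueAsMetadata as appropriate. A round-trip sketch (Ctx and Val assumed to exist):

    #include "llvm-c/Core.h"

    LLVMValueRef roundTrip(LLVMContextRef Ctx, LLVMValueRef Val) {
      LLVMMetadataRef MD = LLVMValueAsMetadata(Val);
      return LLVMMetadataAsValue(Ctx, MD); // a MetadataAsValue wrapper
    }
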
const char *LLVMGetMDString(LLVMValueRef V, unsigned *Length) {
if (const auto *MD = dyn_cast<MetadataAsValue>(unwrap(V)))
if (const MDString *S = dyn_cast<MDString>(MD->getMetadata())) {
@@ -1844,18 +1871,14 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
}
unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) {
- auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
- if (!ASN)
- return 0;
- return ASN->getNumAttributes();
+ auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
+ return AS.getNumAttributes();
}
void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs) {
- auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
- if (!ASN)
- return;
- for (auto A: make_range(ASN->begin(), ASN->end()))
+ auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
+ for (auto A : AS)
*Attrs++ = wrap(A);
}
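With AttributeSetNode gone, the count/fill pair reads straight off AttributeList::getAttributes(Idx), and an empty set simply reports zero attributes. The usual calling pattern is unchanged (sketch, Fn assumed to be a function value):

    #include "llvm-c/Core.h"
    #include <vector>

    std::vector<LLVMAttributeRef> fnAttrs(LLVMValueRef Fn) {
      unsigned N = LLVMGetAttributeCountAtIndex(Fn, LLVMAttributeFunctionIndex);
      std::vector<LLVMAttributeRef> Attrs(N);
      LLVMGetAttributesAtIndex(Fn, LLVMAttributeFunctionIndex, Attrs.data());
      return Attrs;
    }
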
@@ -1885,13 +1908,8 @@ void LLVMRemoveStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
void LLVMAddTargetDependentFunctionAttr(LLVMValueRef Fn, const char *A,
const char *V) {
Function *Func = unwrap<Function>(Fn);
- AttributeSet::AttrIndex Idx =
- AttributeSet::AttrIndex(AttributeSet::FunctionIndex);
- AttrBuilder B;
-
- B.addAttribute(A, V);
- AttributeSet Set = AttributeSet::get(Func->getContext(), Idx, B);
- Func->addAttributes(Idx, Set);
+ Attribute Attr = Attribute::get(Func->getContext(), A, V);
+ Func->addAttribute(AttributeList::FunctionIndex, Attr);
}
/*--.. Operations on parameters ............................................--*/
@@ -1910,10 +1928,8 @@ void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) {
}
LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) {
- Function::arg_iterator AI = unwrap<Function>(FnRef)->arg_begin();
- while (index --> 0)
- AI++;
- return wrap(&*AI);
+ Function *Fn = unwrap<Function>(FnRef);
+ return wrap(&Fn->arg_begin()[index]);
}
LLVMValueRef LLVMGetParamParent(LLVMValueRef V) {
@@ -1938,25 +1954,22 @@ LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) {
LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) {
Argument *A = unwrap<Argument>(Arg);
- Function::arg_iterator I(A);
- if (++I == A->getParent()->arg_end())
+ Function *Fn = A->getParent();
+ if (A->getArgNo() + 1 >= Fn->arg_size())
return nullptr;
- return wrap(&*I);
+ return wrap(&Fn->arg_begin()[A->getArgNo() + 1]);
}
LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) {
Argument *A = unwrap<Argument>(Arg);
- Function::arg_iterator I(A);
- if (I == A->getParent()->arg_begin())
+ if (A->getArgNo() == 0)
return nullptr;
- return wrap(&*--I);
+ return wrap(&A->getParent()->arg_begin()[A->getArgNo() - 1]);
}
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
Argument *A = unwrap<Argument>(Arg);
- AttrBuilder B;
- B.addAlignmentAttr(align);
- A->addAttr(AttributeSet::get(A->getContext(),A->getArgNo() + 1, B));
+ A->addAttr(Attribute::getWithAlignment(A->getContext(), align));
}
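The wrapper now builds a single Attribute instead of an AttrBuilder/AttributeSet pair; the C API surface itself is unchanged. For illustration only, with Fn an assumed function handle:

    // Mark the first parameter as 16-byte aligned.
    LLVMSetParamAlignment(LLVMGetParam(Fn, 0), 16);
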
/*--.. Operations on basic blocks ..........................................--*/
@@ -2163,12 +2176,8 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
unsigned align) {
CallSite Call = CallSite(unwrap<Instruction>(Instr));
- AttrBuilder B;
- B.addAlignmentAttr(align);
- Call.setAttributes(Call.getAttributes()
- .addAttributes(Call->getContext(), index,
- AttributeSet::get(Call->getContext(),
- index, B)));
+ Attribute AlignAttr = Attribute::getWithAlignment(Call->getContext(), align);
+ Call.addAttribute(index, AlignAttr);
}
void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
@@ -2179,19 +2188,15 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C,
LLVMAttributeIndex Idx) {
auto CS = CallSite(unwrap<Instruction>(C));
- auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
- if (!ASN)
- return 0;
- return ASN->getNumAttributes();
+ auto AS = CS.getAttributes().getAttributes(Idx);
+ return AS.getNumAttributes();
}
void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx,
LLVMAttributeRef *Attrs) {
auto CS = CallSite(unwrap<Instruction>(C));
- auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
- if (!ASN)
- return;
- for (auto A: make_range(ASN->begin(), ASN->end()))
+ auto AS = CS.getAttributes().getAttributes(Idx);
+ for (auto A : AS)
*Attrs++ = wrap(A);
}
@@ -2750,11 +2755,14 @@ static LLVMAtomicOrdering mapToLLVMOrdering(AtomicOrdering Ordering) {
llvm_unreachable("Invalid AtomicOrdering value!");
}
+// TODO: Should this and other atomic instructions support building with
+// "syncscope"?
LLVMValueRef LLVMBuildFence(LLVMBuilderRef B, LLVMAtomicOrdering Ordering,
LLVMBool isSingleThread, const char *Name) {
return wrap(
unwrap(B)->CreateFence(mapFromLLVMOrdering(Ordering),
- isSingleThread ? SingleThread : CrossThread,
+ isSingleThread ? SyncScope::SingleThread
+ : SyncScope::System,
Name));
}
@@ -3036,7 +3044,8 @@ LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
case LLVMAtomicRMWBinOpUMin: intop = AtomicRMWInst::UMin; break;
}
return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val),
- mapFromLLVMOrdering(ordering), singleThread ? SingleThread : CrossThread));
+ mapFromLLVMOrdering(ordering), singleThread ? SyncScope::SingleThread
+ : SyncScope::System));
}
LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr,
@@ -3048,7 +3057,7 @@ LLVMValueRef LLVMBuildAtomicCmpXchg(LLVMBuilderRef B, LLVMValueRef Ptr,
return wrap(unwrap(B)->CreateAtomicCmpXchg(unwrap(Ptr), unwrap(Cmp),
unwrap(New), mapFromLLVMOrdering(SuccessOrdering),
mapFromLLVMOrdering(FailureOrdering),
- singleThread ? SingleThread : CrossThread));
+ singleThread ? SyncScope::SingleThread : SyncScope::System));
}
@@ -3056,17 +3065,18 @@ LLVMBool LLVMIsAtomicSingleThread(LLVMValueRef AtomicInst) {
Value *P = unwrap<Value>(AtomicInst);
if (AtomicRMWInst *I = dyn_cast<AtomicRMWInst>(P))
- return I->getSynchScope() == SingleThread;
- return cast<AtomicCmpXchgInst>(P)->getSynchScope() == SingleThread;
+ return I->getSyncScopeID() == SyncScope::SingleThread;
+ return cast<AtomicCmpXchgInst>(P)->getSyncScopeID() ==
+ SyncScope::SingleThread;
}
void LLVMSetAtomicSingleThread(LLVMValueRef AtomicInst, LLVMBool NewValue) {
Value *P = unwrap<Value>(AtomicInst);
- SynchronizationScope Sync = NewValue ? SingleThread : CrossThread;
+ SyncScope::ID SSID = NewValue ? SyncScope::SingleThread : SyncScope::System;
if (AtomicRMWInst *I = dyn_cast<AtomicRMWInst>(P))
- return I->setSynchScope(Sync);
- return cast<AtomicCmpXchgInst>(P)->setSynchScope(Sync);
+ return I->setSyncScopeID(SSID);
+ return cast<AtomicCmpXchgInst>(P)->setSyncScopeID(SSID);
}
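Only the C++ spelling changes in these hunks; C API callers keep passing an LLVMBool, with nonzero mapping to SyncScope::SingleThread and zero to SyncScope::System. A sketch, assuming a builder B and operands Ptr and Val:

    LLVMValueRef RMW =
        LLVMBuildAtomicRMW(B, LLVMAtomicRMWBinOpAdd, Ptr, Val,
                           LLVMAtomicOrderingSequentiallyConsistent,
                           /*singleThread=*/1);
    LLVMSetAtomicSingleThread(RMW, 0);   // widen to SyncScope::System
    // LLVMIsAtomicSingleThread(RMW) now returns 0.
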
LLVMAtomicOrdering LLVMGetCmpXchgSuccessOrdering(LLVMValueRef CmpXchgInst) {
diff --git a/contrib/llvm/lib/IR/DIBuilder.cpp b/contrib/llvm/lib/IR/DIBuilder.cpp
index d061610..bce28ba 100644
--- a/contrib/llvm/lib/IR/DIBuilder.cpp
+++ b/contrib/llvm/lib/IR/DIBuilder.cpp
@@ -12,14 +12,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/DIBuilder.h"
+#include "LLVMContextImpl.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "LLVMContextImpl.h"
using namespace llvm;
using namespace llvm::dwarf;
@@ -39,6 +39,21 @@ void DIBuilder::trackIfUnresolved(MDNode *N) {
UnresolvedNodes.emplace_back(N);
}
+void DIBuilder::finalizeSubprogram(DISubprogram *SP) {
+ MDTuple *Temp = SP->getVariables().get();
+ if (!Temp || !Temp->isTemporary())
+ return;
+
+ SmallVector<Metadata *, 4> Variables;
+
+ auto PV = PreservedVariables.find(SP);
+ if (PV != PreservedVariables.end())
+ Variables.append(PV->second.begin(), PV->second.end());
+
+ DINodeArray AV = getOrCreateArray(Variables);
+ TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
+}
+
void DIBuilder::finalize() {
if (!CUNode) {
assert(!AllowUnresolvedNodes &&
@@ -62,25 +77,11 @@ void DIBuilder::finalize() {
CUNode->replaceRetainedTypes(MDTuple::get(VMContext, RetainValues));
DISubprogramArray SPs = MDTuple::get(VMContext, AllSubprograms);
- auto resolveVariables = [&](DISubprogram *SP) {
- MDTuple *Temp = SP->getVariables().get();
- if (!Temp)
- return;
-
- SmallVector<Metadata *, 4> Variables;
-
- auto PV = PreservedVariables.find(SP);
- if (PV != PreservedVariables.end())
- Variables.append(PV->second.begin(), PV->second.end());
-
- DINodeArray AV = getOrCreateArray(Variables);
- TempMDTuple(Temp)->replaceAllUsesWith(AV.get());
- };
for (auto *SP : SPs)
- resolveVariables(SP);
+ finalizeSubprogram(SP);
for (auto *N : RetainValues)
if (auto *SP = dyn_cast<DISubprogram>(N))
- resolveVariables(SP);
+ finalizeSubprogram(SP);
if (!AllGVs.empty())
CUNode->replaceGlobalVariables(MDTuple::get(VMContext, AllGVs));
@@ -126,7 +127,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
unsigned Lang, DIFile *File, StringRef Producer, bool isOptimized,
StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId,
- bool SplitDebugInlining) {
+ bool SplitDebugInlining, bool DebugInfoForProfiling) {
assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) ||
(Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
@@ -136,7 +137,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
CUNode = DICompileUnit::getDistinct(
VMContext, Lang, File, Producer, isOptimized, Flags, RunTimeVer,
SplitName, Kind, nullptr, nullptr, nullptr, nullptr, nullptr, DWOId,
- SplitDebugInlining);
+ SplitDebugInlining, DebugInfoForProfiling);
// Create a named metadata so that it is easier to find cu in a module.
NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
@@ -147,10 +148,13 @@ DICompileUnit *DIBuilder::createCompileUnit(
static DIImportedEntity *
createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context,
- Metadata *NS, unsigned Line, StringRef Name,
+ Metadata *NS, DIFile *File, unsigned Line, StringRef Name,
SmallVectorImpl<TrackingMDNodeRef> &AllImportedModules) {
+ if (Line)
+ assert(File && "Source location has line number but no file");
unsigned EntitiesCount = C.pImpl->DIImportedEntitys.size();
- auto *M = DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), Line, Name);
+ auto *M =
+ DIImportedEntity::get(C, Tag, Context, DINodeRef(NS), File, Line, Name);
if (EntitiesCount < C.pImpl->DIImportedEntitys.size())
// A new Imported Entity was just added to the context.
// Add it to the Imported Modules list.
@@ -159,33 +163,38 @@ createImportedModule(LLVMContext &C, dwarf::Tag Tag, DIScope *Context,
}
DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context,
- DINamespace *NS,
+ DINamespace *NS, DIFile *File,
unsigned Line) {
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module,
- Context, NS, Line, StringRef(), AllImportedModules);
+ Context, NS, File, Line, StringRef(),
+ AllImportedModules);
}
DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context,
DIImportedEntity *NS,
- unsigned Line) {
+ DIFile *File, unsigned Line) {
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module,
- Context, NS, Line, StringRef(), AllImportedModules);
+ Context, NS, File, Line, StringRef(),
+ AllImportedModules);
}
DIImportedEntity *DIBuilder::createImportedModule(DIScope *Context, DIModule *M,
- unsigned Line) {
+ DIFile *File, unsigned Line) {
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_module,
- Context, M, Line, StringRef(), AllImportedModules);
+ Context, M, File, Line, StringRef(),
+ AllImportedModules);
}
DIImportedEntity *DIBuilder::createImportedDeclaration(DIScope *Context,
DINode *Decl,
+ DIFile *File,
unsigned Line,
StringRef Name) {
// Make sure to use the unique identifier based metadata reference for
// types that have one.
return ::createImportedModule(VMContext, dwarf::DW_TAG_imported_declaration,
- Context, Decl, Line, Name, AllImportedModules);
+ Context, Decl, File, Line, Name,
+ AllImportedModules);
}
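Callers of the imported-entity builders now thread a DIFile through, and a nonzero line number asserts that the file is non-null. A hedged sketch, where DIB, Scope, NS, and File are assumed to exist:

    DIImportedEntity *IE =
        DIB.createImportedModule(Scope, NS, File, /*Line=*/42);
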
DIFile *DIBuilder::createFile(StringRef Filename, StringRef Directory,
@@ -241,17 +250,20 @@ DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) {
return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0,
- 0, 0, DINode::FlagZero);
+ 0, 0, None, DINode::FlagZero);
}
-DIDerivedType *DIBuilder::createPointerType(DIType *PointeeTy,
- uint64_t SizeInBits,
- uint32_t AlignInBits,
- StringRef Name) {
+DIDerivedType *DIBuilder::createPointerType(
+ DIType *PointeeTy,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ Optional<unsigned> DWARFAddressSpace,
+ StringRef Name) {
// FIXME: Why is there a name here?
return DIDerivedType::get(VMContext, dwarf::DW_TAG_pointer_type, Name,
nullptr, 0, nullptr, PointeeTy, SizeInBits,
- AlignInBits, 0, DINode::FlagZero);
+ AlignInBits, 0, DWARFAddressSpace,
+ DINode::FlagZero);
}
DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
@@ -261,15 +273,18 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
DINode::DIFlags Flags) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_ptr_to_member_type, "",
nullptr, 0, nullptr, PointeeTy, SizeInBits,
- AlignInBits, 0, Flags, Base);
+ AlignInBits, 0, None, Flags, Base);
}
-DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy,
- uint64_t SizeInBits,
- uint32_t AlignInBits) {
+DIDerivedType *DIBuilder::createReferenceType(
+ unsigned Tag, DIType *RTy,
+ uint64_t SizeInBits,
+ uint32_t AlignInBits,
+ Optional<unsigned> DWARFAddressSpace) {
assert(RTy && "Unable to create reference type");
return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, RTy,
- SizeInBits, AlignInBits, 0, DINode::FlagZero);
+ SizeInBits, AlignInBits, 0, DWARFAddressSpace,
+ DINode::FlagZero);
}
DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
@@ -277,14 +292,14 @@ DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
DIScope *Context) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File,
LineNo, getNonCompileUnitScope(Context), Ty, 0, 0,
- 0, DINode::FlagZero);
+ 0, None, DINode::FlagZero);
}
DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
assert(Ty && "Invalid type!");
assert(FriendTy && "Invalid friend type!");
return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0, Ty,
- FriendTy, 0, 0, 0, DINode::FlagZero);
+ FriendTy, 0, 0, 0, None, DINode::FlagZero);
}
DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
@@ -292,7 +307,7 @@ DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
DINode::DIFlags Flags) {
assert(Ty && "Unable to create inheritance");
return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
- 0, Ty, BaseTy, 0, 0, BaseOffset, Flags);
+ 0, Ty, BaseTy, 0, 0, BaseOffset, None, Flags);
}
DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
@@ -303,7 +318,7 @@ DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
DINode::DIFlags Flags, DIType *Ty) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
LineNumber, getNonCompileUnitScope(Scope), Ty,
- SizeInBits, AlignInBits, OffsetInBits, Flags);
+ SizeInBits, AlignInBits, OffsetInBits, None, Flags);
}
static ConstantAsMetadata *getConstantOrNull(Constant *C) {
@@ -320,7 +335,7 @@ DIDerivedType *DIBuilder::createBitFieldMemberType(
return DIDerivedType::get(
VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
getNonCompileUnitScope(Scope), Ty, SizeInBits, /* AlignInBits */ 0,
- OffsetInBits, Flags,
+ OffsetInBits, None, Flags,
ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(VMContext, 64),
StorageOffsetInBits)));
}
@@ -333,7 +348,8 @@ DIBuilder::createStaticMemberType(DIScope *Scope, StringRef Name, DIFile *File,
Flags |= DINode::FlagStaticMember;
return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
LineNumber, getNonCompileUnitScope(Scope), Ty, 0,
- AlignInBits, 0, Flags, getConstantOrNull(Val));
+ AlignInBits, 0, None, Flags,
+ getConstantOrNull(Val));
}
DIDerivedType *
@@ -343,7 +359,7 @@ DIBuilder::createObjCIVar(StringRef Name, DIFile *File, unsigned LineNumber,
DIType *Ty, MDNode *PropertyNode) {
return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
LineNumber, getNonCompileUnitScope(File), Ty,
- SizeInBits, AlignInBits, OffsetInBits, Flags,
+ SizeInBits, AlignInBits, OffsetInBits, None, Flags,
PropertyNode);
}
@@ -442,14 +458,6 @@ DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes,
return DISubroutineType::get(VMContext, Flags, CC, ParameterTypes);
}
-DICompositeType *DIBuilder::createExternalTypeRef(unsigned Tag, DIFile *File,
- StringRef UniqueIdentifier) {
- assert(!UniqueIdentifier.empty() && "external type ref without uid");
- return DICompositeType::get(VMContext, Tag, "", nullptr, 0, nullptr, nullptr,
- 0, 0, 0, DINode::FlagExternalTypeRef, nullptr, 0,
- nullptr, nullptr, UniqueIdentifier);
-}
-
DICompositeType *DIBuilder::createEnumerationType(
DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
uint64_t SizeInBits, uint32_t AlignInBits, DINodeArray Elements,
@@ -677,13 +685,14 @@ DISubprogram *DIBuilder::createFunction(
DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File,
unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
bool isDefinition, unsigned ScopeLine, DINode::DIFlags Flags,
- bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl) {
+ bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl,
+ DITypeArray ThrownTypes) {
auto *Node = getSubprogram(
/* IsDistinct = */ isDefinition, VMContext,
getNonCompileUnitScope(Context), Name, LinkageName, File, LineNo, Ty,
isLocalToUnit, isDefinition, ScopeLine, nullptr, 0, 0, 0, Flags,
isOptimized, isDefinition ? CUNode : nullptr, TParams, Decl,
- MDTuple::getTemporary(VMContext, None).release());
+ MDTuple::getTemporary(VMContext, None).release(), ThrownTypes);
if (isDefinition)
AllSubprograms.push_back(Node);
@@ -695,23 +704,22 @@ DISubprogram *DIBuilder::createTempFunctionFwdDecl(
DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *File,
unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
bool isDefinition, unsigned ScopeLine, DINode::DIFlags Flags,
- bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl) {
+ bool isOptimized, DITemplateParameterArray TParams, DISubprogram *Decl,
+ DITypeArray ThrownTypes) {
return DISubprogram::getTemporary(
VMContext, getNonCompileUnitScope(Context), Name, LinkageName,
File, LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine, nullptr,
0, 0, 0, Flags, isOptimized, isDefinition ? CUNode : nullptr,
- TParams, Decl, nullptr)
+ TParams, Decl, nullptr, ThrownTypes)
.release();
}
-DISubprogram *DIBuilder::createMethod(DIScope *Context, StringRef Name,
- StringRef LinkageName, DIFile *F,
- unsigned LineNo, DISubroutineType *Ty,
- bool isLocalToUnit, bool isDefinition,
- unsigned VK, unsigned VIndex,
- int ThisAdjustment, DIType *VTableHolder,
- DINode::DIFlags Flags, bool isOptimized,
- DITemplateParameterArray TParams) {
+DISubprogram *DIBuilder::createMethod(
+ DIScope *Context, StringRef Name, StringRef LinkageName, DIFile *F,
+ unsigned LineNo, DISubroutineType *Ty, bool isLocalToUnit,
+ bool isDefinition, unsigned VK, unsigned VIndex, int ThisAdjustment,
+ DIType *VTableHolder, DINode::DIFlags Flags, bool isOptimized,
+ DITemplateParameterArray TParams, DITypeArray ThrownTypes) {
assert(getNonCompileUnitScope(Context) &&
"Methods should have both a Context and a context that isn't "
"the compile unit.");
@@ -720,7 +728,7 @@ DISubprogram *DIBuilder::createMethod(DIScope *Context, StringRef Name,
/* IsDistinct = */ isDefinition, VMContext, cast<DIScope>(Context), Name,
LinkageName, F, LineNo, Ty, isLocalToUnit, isDefinition, LineNo,
VTableHolder, VK, VIndex, ThisAdjustment, Flags, isOptimized,
- isDefinition ? CUNode : nullptr, TParams, nullptr, nullptr);
+ isDefinition ? CUNode : nullptr, TParams, nullptr, nullptr, ThrownTypes);
if (isDefinition)
AllSubprograms.push_back(SP);
@@ -729,10 +737,15 @@ DISubprogram *DIBuilder::createMethod(DIScope *Context, StringRef Name,
}
DINamespace *DIBuilder::createNameSpace(DIScope *Scope, StringRef Name,
- DIFile *File, unsigned LineNo,
bool ExportSymbols) {
- return DINamespace::get(VMContext, getNonCompileUnitScope(Scope), File, Name,
- LineNo, ExportSymbols);
+
+ // It is okay to *not* make anonymous top-level namespaces distinct, because
+ // all nodes that have an anonymous namespace as their parent scope are
+ // guaranteed to be unique and/or are linked to their containing
+ // DICompileUnit. This decision is an explicit tradeoff of link time versus
+ // memory usage versus code simplicity and may get revisited in the future.
+ return DINamespace::get(VMContext, getNonCompileUnitScope(Scope), Name,
+ ExportSymbols);
}
DIModule *DIBuilder::createModule(DIScope *Scope, StringRef Name,
diff --git a/contrib/llvm/lib/IR/DataLayout.cpp b/contrib/llvm/lib/IR/DataLayout.cpp
index d15a34c..5de281a 100644
--- a/contrib/llvm/lib/IR/DataLayout.cpp
+++ b/contrib/llvm/lib/IR/DataLayout.cpp
@@ -1,4 +1,4 @@
-//===-- DataLayout.cpp - Data size & alignment routines --------------------==//
+//===- DataLayout.cpp - Data size & alignment routines ---------------------==//
//
// The LLVM Compiler Infrastructure
//
@@ -18,19 +18,25 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
#include <cstdlib>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -73,7 +79,6 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) {
}
}
-
/// getElementContainingOffset - Given a valid offset into the structure,
/// return the structure index that contains it.
unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const {
@@ -118,9 +123,6 @@ LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const {
&& TypeBitWidth == rhs.TypeBitWidth);
}
-const LayoutAlignElem
-DataLayout::InvalidAlignmentElem = { INVALID_ALIGN, 0, 0, 0 };
-
//===----------------------------------------------------------------------===//
// PointerAlignElem, PointerAlign support
//===----------------------------------------------------------------------===//
@@ -145,9 +147,6 @@ PointerAlignElem::operator==(const PointerAlignElem &rhs) const {
&& TypeByteWidth == rhs.TypeByteWidth);
}
-const PointerAlignElem
-DataLayout::InvalidPointerElem = { 0U, 0U, 0U, ~0U };
-
//===----------------------------------------------------------------------===//
// DataLayout Class Implementation
//===----------------------------------------------------------------------===//
@@ -180,6 +179,7 @@ void DataLayout::reset(StringRef Desc) {
LayoutMap = nullptr;
BigEndian = false;
+ AllocaAddrSpace = 0;
StackNaturalAlign = 0;
ManglingMode = MM_None;
NonIntegralAddressSpaces.clear();
@@ -307,7 +307,7 @@ void DataLayout::parseSpecifier(StringRef Desc) {
case 'a': {
AlignTypeEnum AlignType;
switch (Specifier) {
- default:
+ default: llvm_unreachable("Unexpected specifier!");
case 'i': AlignType = INTEGER_ALIGN; break;
case 'v': AlignType = VECTOR_ALIGN; break;
case 'f': AlignType = FLOAT_ALIGN; break;
@@ -343,7 +343,7 @@ void DataLayout::parseSpecifier(StringRef Desc) {
break;
}
case 'n': // Native integer types.
- for (;;) {
+ while (true) {
unsigned Width = getInt(Tok);
if (Width == 0)
report_fatal_error(
@@ -358,6 +358,12 @@ void DataLayout::parseSpecifier(StringRef Desc) {
StackNaturalAlign = inBytes(getInt(Tok));
break;
}
+ case 'A': { // Default stack/alloca address space.
+ AllocaAddrSpace = getInt(Tok);
+ if (!isUInt<24>(AllocaAddrSpace))
+ report_fatal_error("Invalid address space, must be a 24bit integer");
+ break;
+ }
case 'm':
if (!Tok.empty())
report_fatal_error("Unexpected trailing characters after mangling specifier in datalayout string");
@@ -392,7 +398,7 @@ void DataLayout::parseSpecifier(StringRef Desc) {
}
}
-DataLayout::DataLayout(const Module *M) : LayoutMap(nullptr) {
+DataLayout::DataLayout(const Module *M) {
init(M);
}
@@ -400,6 +406,7 @@ void DataLayout::init(const Module *M) { *this = M->getDataLayout(); }
bool DataLayout::operator==(const DataLayout &Other) const {
bool Ret = BigEndian == Other.BigEndian &&
+ AllocaAddrSpace == Other.AllocaAddrSpace &&
StackNaturalAlign == Other.StackNaturalAlign &&
ManglingMode == Other.ManglingMode &&
LegalIntWidths == Other.LegalIntWidths &&
@@ -408,6 +415,18 @@ bool DataLayout::operator==(const DataLayout &Other) const {
return Ret;
}
+DataLayout::AlignmentsTy::iterator
+DataLayout::findAlignmentLowerBound(AlignTypeEnum AlignType,
+ uint32_t BitWidth) {
+ auto Pair = std::make_pair((unsigned)AlignType, BitWidth);
+ return std::lower_bound(Alignments.begin(), Alignments.end(), Pair,
+ [](const LayoutAlignElem &LHS,
+ const std::pair<unsigned, uint32_t> &RHS) {
+ return std::tie(LHS.AlignType, LHS.TypeBitWidth) <
+ std::tie(RHS.first, RHS.second);
+ });
+}
+
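A standalone sketch of the same lookup pattern (needs <algorithm>, <tuple>, <utility>, <vector>), assuming the vector stays sorted by (AlignType, TypeBitWidth), which is the invariant setAlignment below maintains:

    struct Elem { unsigned AlignType; uint32_t TypeBitWidth; };

    std::vector<Elem>::iterator
    findLB(std::vector<Elem> &Elems, unsigned Type, uint32_t Bits) {
      auto Key = std::make_pair(Type, Bits);
      return std::lower_bound(
          Elems.begin(), Elems.end(), Key,
          [](const Elem &L, const std::pair<unsigned, uint32_t> &R) {
            return std::tie(L.AlignType, L.TypeBitWidth) <
                   std::tie(R.first, R.second);
          });
    }
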
void
DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
unsigned pref_align, uint32_t bit_width) {
@@ -426,18 +445,17 @@ DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
report_fatal_error(
"Preferred alignment cannot be less than the ABI alignment");
- for (LayoutAlignElem &Elem : Alignments) {
- if (Elem.AlignType == (unsigned)align_type &&
- Elem.TypeBitWidth == bit_width) {
- // Update the abi, preferred alignments.
- Elem.ABIAlign = abi_align;
- Elem.PrefAlign = pref_align;
- return;
- }
+ AlignmentsTy::iterator I = findAlignmentLowerBound(align_type, bit_width);
+ if (I != Alignments.end() &&
+ I->AlignType == (unsigned)align_type && I->TypeBitWidth == bit_width) {
+    // Update the ABI and preferred alignments.
+ I->ABIAlign = abi_align;
+ I->PrefAlign = pref_align;
+ } else {
+ // Insert before I to keep the vector sorted.
+ Alignments.insert(I, LayoutAlignElem::get(align_type, abi_align,
+ pref_align, bit_width));
}
-
- Alignments.push_back(LayoutAlignElem::get(align_type, abi_align,
- pref_align, bit_width));
}
DataLayout::PointersTy::iterator
@@ -471,45 +489,29 @@ void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
uint32_t BitWidth, bool ABIInfo,
Type *Ty) const {
- // Check to see if we have an exact match and remember the best match we see.
- int BestMatchIdx = -1;
- int LargestInt = -1;
- for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
- if (Alignments[i].AlignType == (unsigned)AlignType &&
- Alignments[i].TypeBitWidth == BitWidth)
- return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
-
- // The best match so far depends on what we're looking for.
- if (AlignType == INTEGER_ALIGN &&
- Alignments[i].AlignType == INTEGER_ALIGN) {
- // The "best match" for integers is the smallest size that is larger than
- // the BitWidth requested.
- if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 ||
- Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
- BestMatchIdx = i;
- // However, if there isn't one that's larger, then we must use the
- // largest one we have (see below)
- if (LargestInt == -1 ||
- Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
- LargestInt = i;
+ AlignmentsTy::const_iterator I = findAlignmentLowerBound(AlignType, BitWidth);
+  // See if we found an exact match, or, if we are looking for an integer type
+  // without an exact match, take the next largest integer. That is where
+  // lower_bound points when it fails to find an exact match.
+ if (I != Alignments.end() && I->AlignType == (unsigned)AlignType &&
+ (I->TypeBitWidth == BitWidth || AlignType == INTEGER_ALIGN))
+ return ABIInfo ? I->ABIAlign : I->PrefAlign;
+
+ if (AlignType == INTEGER_ALIGN) {
+    // If we didn't find a larger value, try the largest value we have.
+    if (I != Alignments.begin()) {
+      --I; // Go to the previous entry and see if it's an integer.
+ if (I->AlignType == INTEGER_ALIGN)
+ return ABIInfo ? I->ABIAlign : I->PrefAlign;
}
- }
-
- // Okay, we didn't find an exact solution. Fall back here depending on what
- // is being looked for.
- if (BestMatchIdx == -1) {
- // If we didn't find an integer alignment, fall back on most conservative.
- if (AlignType == INTEGER_ALIGN) {
- BestMatchIdx = LargestInt;
- } else if (AlignType == VECTOR_ALIGN) {
- // By default, use natural alignment for vector types. This is consistent
- // with what clang and llvm-gcc do.
- unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
- Align *= cast<VectorType>(Ty)->getNumElements();
- Align = PowerOf2Ceil(Align);
- return Align;
- }
- }
+ } else if (AlignType == VECTOR_ALIGN) {
+ // By default, use natural alignment for vector types. This is consistent
+ // with what clang and llvm-gcc do.
+ unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
+ Align *= cast<VectorType>(Ty)->getNumElements();
+ Align = PowerOf2Ceil(Align);
+ return Align;
+ }
// If we still couldn't find a reasonable default alignment, fall back
// to a simple heuristic that the alignment is the first power of two
@@ -517,21 +519,15 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
// approximation of reality, and if the user wanted something less
// conservative, they should have specified it explicitly in the data
// layout.
- if (BestMatchIdx == -1) {
- unsigned Align = getTypeStoreSize(Ty);
- Align = PowerOf2Ceil(Align);
- return Align;
- }
-
- // Since we got a "best match" index, just return it.
- return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
- : Alignments[BestMatchIdx].PrefAlign;
+ unsigned Align = getTypeStoreSize(Ty);
+ Align = PowerOf2Ceil(Align);
+ return Align;
}
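A worked instance of the vector fallback above, under the assumption of a <4 x i32> vector whose element alloc size is 4 bytes:

    unsigned Align = 4 /*elt alloc size*/ * 4 /*elements*/; // 16
    Align = PowerOf2Ceil(Align);                            // still 16
    // A <3 x i32> vector would give 12, rounded up to 16.
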
namespace {
class StructLayoutMap {
- typedef DenseMap<StructType*, StructLayout*> LayoutInfoTy;
+ using LayoutInfoTy = DenseMap<StructType*, StructLayout*>;
LayoutInfoTy LayoutInfo;
public:
@@ -586,7 +582,6 @@ const StructLayout *DataLayout::getStructLayout(StructType *Ty) const {
return L;
}
-
unsigned DataLayout::getPointerABIAlignment(unsigned AS) const {
PointersTy::const_iterator I = findPointerLowerBound(AS);
if (I == Pointers.end() || I->AddressSpace != AS) {
@@ -617,11 +612,8 @@ unsigned DataLayout::getPointerSize(unsigned AS) const {
unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const {
assert(Ty->isPtrOrPtrVectorTy() &&
"This should only be called with a pointer or pointer vector type");
-
- if (Ty->isPointerTy())
- return getTypeSizeInBits(Ty);
-
- return getTypeSizeInBits(Ty->getScalarType());
+ Ty = Ty->getScalarType();
+ return getPointerSizeInBits(cast<PointerType>(Ty)->getAddressSpace());
}
/*!
@@ -633,7 +625,7 @@ unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const {
== false) for the requested type \a Ty.
*/
unsigned DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
- int AlignType = -1;
+ AlignTypeEnum AlignType;
assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
switch (Ty->getTypeID()) {
@@ -682,8 +674,7 @@ unsigned DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
llvm_unreachable("Bad type for getAlignment!!!");
}
- return getAlignmentInfo((AlignTypeEnum)AlignType, getTypeSizeInBits(Ty),
- abi_or_pref, Ty);
+ return getAlignmentInfo(AlignType, getTypeSizeInBits(Ty), abi_or_pref, Ty);
}
unsigned DataLayout::getABITypeAlignment(Type *Ty) const {
@@ -791,4 +782,3 @@ unsigned DataLayout::getPreferredAlignment(const GlobalVariable *GV) const {
unsigned DataLayout::getPreferredAlignmentLog(const GlobalVariable *GV) const {
return Log2_32(getPreferredAlignment(GV));
}
-
diff --git a/contrib/llvm/lib/IR/DebugInfo.cpp b/contrib/llvm/lib/IR/DebugInfo.cpp
index 6b9bc68..56cec57 100644
--- a/contrib/llvm/lib/IR/DebugInfo.cpp
+++ b/contrib/llvm/lib/IR/DebugInfo.cpp
@@ -1,4 +1,4 @@
-//===--- DebugInfo.cpp - Debug Information Helper Classes -----------------===//
+//===- DebugInfo.cpp - Debug Information Helper Classes -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,21 +13,28 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/DebugInfo.h"
-#include "LLVMContextImpl.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GVMaterializer.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
using namespace llvm;
using namespace llvm::dwarf;
@@ -79,9 +86,19 @@ void DebugInfoFinder::processModule(const Module &M) {
processScope(M->getScope());
}
}
- for (auto &F : M.functions())
+ for (auto &F : M.functions()) {
if (auto *SP = cast_or_null<DISubprogram>(F.getSubprogram()))
processSubprogram(SP);
+ // There could be subprograms from inlined functions referenced from
+ // instructions only. Walk the function to find them.
+ for (const BasicBlock &BB : F) {
+ for (const Instruction &I : BB) {
+ if (!I.getDebugLoc())
+ continue;
+ processLocation(M, I.getDebugLoc().get());
+ }
+ }
+ }
}
void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
@@ -239,6 +256,38 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
return true;
}
+static MDNode *stripDebugLocFromLoopID(MDNode *N) {
+ assert(N->op_begin() != N->op_end() && "Missing self reference?");
+
+  // If there is no debug location, we do not have to rewrite this MDNode.
+ if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
+ return isa<DILocation>(Op.get());
+ }))
+ return N;
+
+ // If there is only the debug location without any actual loop metadata, we
+ // can remove the metadata.
+ if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
+ return !isa<DILocation>(Op.get());
+ }))
+ return nullptr;
+
+ SmallVector<Metadata *, 4> Args;
+ // Reserve operand 0 for loop id self reference.
+ auto TempNode = MDNode::getTemporary(N->getContext(), None);
+ Args.push_back(TempNode.get());
+ // Add all non-debug location operands back.
+ for (auto Op = N->op_begin() + 1; Op != N->op_end(); Op++) {
+ if (!isa<DILocation>(*Op))
+ Args.push_back(*Op);
+ }
+
+ // Set the first operand to itself.
+ MDNode *LoopID = MDNode::get(N->getContext(), Args);
+ LoopID->replaceOperandWith(0, LoopID);
+ return LoopID;
+}
+
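The self-referential operand-0 convention used above can be reproduced in isolation. A sketch, assuming a context Ctx and a list ExtraOps of non-DILocation operands:

    auto Temp = MDNode::getTemporary(Ctx, None);   // placeholder for op 0
    SmallVector<Metadata *, 4> Args{Temp.get()};
    Args.append(ExtraOps.begin(), ExtraOps.end());
    MDNode *LoopID = MDNode::get(Ctx, Args);
    LoopID->replaceOperandWith(0, LoopID);         // op 0 now points at itself
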
bool llvm::stripDebugInfo(Function &F) {
bool Changed = false;
if (F.getSubprogram()) {
@@ -246,6 +295,7 @@ bool llvm::stripDebugInfo(Function &F) {
F.setSubprogram(nullptr);
}
+ DenseMap<MDNode*, MDNode*> LoopIDsMap;
for (BasicBlock &BB : F) {
for (auto II = BB.begin(), End = BB.end(); II != End;) {
Instruction &I = *II++; // We may delete the instruction, increment now.
@@ -259,6 +309,15 @@ bool llvm::stripDebugInfo(Function &F) {
I.setDebugLoc(DebugLoc());
}
}
+
+ auto *TermInst = BB.getTerminator();
+ if (auto *LoopID = TermInst->getMetadata(LLVMContext::MD_loop)) {
+ auto *NewLoopID = LoopIDsMap.lookup(LoopID);
+ if (!NewLoopID)
+ NewLoopID = LoopIDsMap[LoopID] = stripDebugLocFromLoopID(LoopID);
+ if (NewLoopID != LoopID)
+ TermInst->setMetadata(LLVMContext::MD_loop, NewLoopID);
+ }
}
return Changed;
}
@@ -410,7 +469,8 @@ private:
CU->isOptimized(), CU->getFlags(), CU->getRuntimeVersion(),
CU->getSplitDebugFilename(), DICompileUnit::LineTablesOnly, EnumTypes,
RetainedTypes, GlobalVariables, ImportedEntities, CU->getMacros(),
- CU->getDWOId(), CU->getSplitDebugInlining());
+ CU->getDWOId(), CU->getSplitDebugInlining(),
+ CU->getDebugInfoForProfiling());
}
DILocation *getReplacementMDLocation(DILocation *MLD) {
@@ -472,7 +532,7 @@ private:
void traverse(MDNode *);
};
-} // Anonymous namespace.
+} // end anonymous namespace
void DebugTypeInfoRemoval::traverse(MDNode *N) {
if (!N || Replacements.count(N))
@@ -537,7 +597,7 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
GV.eraseMetadata(LLVMContext::MD_dbg);
DebugTypeInfoRemoval Mapper(M.getContext());
- auto remap = [&](llvm::MDNode *Node) -> llvm::MDNode * {
+ auto remap = [&](MDNode *Node) -> MDNode * {
if (!Node)
return nullptr;
Mapper.traverseAndRemap(Node);
@@ -558,17 +618,26 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
}
for (auto &BB : F) {
for (auto &I : BB) {
- if (I.getDebugLoc() == DebugLoc())
- continue;
-
- // Make a replacement.
- auto &DL = I.getDebugLoc();
- auto *Scope = DL.getScope();
- MDNode *InlinedAt = DL.getInlinedAt();
- Scope = remap(Scope);
- InlinedAt = remap(InlinedAt);
- I.setDebugLoc(
- DebugLoc::get(DL.getLine(), DL.getCol(), Scope, InlinedAt));
+ auto remapDebugLoc = [&](DebugLoc DL) -> DebugLoc {
+ auto *Scope = DL.getScope();
+ MDNode *InlinedAt = DL.getInlinedAt();
+ Scope = remap(Scope);
+ InlinedAt = remap(InlinedAt);
+ return DebugLoc::get(DL.getLine(), DL.getCol(), Scope, InlinedAt);
+ };
+
+ if (I.getDebugLoc() != DebugLoc())
+ I.setDebugLoc(remapDebugLoc(I.getDebugLoc()));
+
+ // Remap DILocations in untyped MDNodes (e.g., llvm.loop).
+ SmallVector<std::pair<unsigned, MDNode *>, 2> MDs;
+ I.getAllMetadata(MDs);
+ for (auto Attachment : MDs)
+ if (auto *T = dyn_cast_or_null<MDTuple>(Attachment.second))
+ for (unsigned N = 0; N < T->getNumOperands(); ++N)
+ if (auto *Loc = dyn_cast_or_null<DILocation>(T->getOperand(N)))
+ if (Loc != DebugLoc())
+ T->replaceOperandWith(N, remapDebugLoc(Loc));
}
}
}
diff --git a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp
index 8e21a90..c14940b 100644
--- a/contrib/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/contrib/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -15,6 +15,7 @@
#include "LLVMContextImpl.h"
#include "MetadataImpl.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
using namespace llvm;
@@ -214,6 +215,10 @@ void GenericDINode::recalculateHash() {
#define DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(CLASS, OPS) \
return storeImpl(new (array_lengthof(OPS)) CLASS(Context, Storage, OPS), \
Storage, Context.pImpl->CLASS##s)
+#define DEFINE_GETIMPL_STORE_N(CLASS, ARGS, OPS, NUM_OPS) \
+ return storeImpl(new (NUM_OPS) \
+ CLASS(Context, Storage, UNWRAP_ARGS(ARGS), OPS), \
+ Storage, Context.pImpl->CLASS##s)
DISubrange *DISubrange::getImpl(LLVMContext &Context, int64_t Count, int64_t Lo,
StorageType Storage, bool ShouldCreate) {
@@ -245,16 +250,18 @@ DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
DIDerivedType *DIDerivedType::getImpl(
LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
- uint32_t AlignInBits, uint64_t OffsetInBits, DIFlags Flags,
- Metadata *ExtraData, StorageType Storage, bool ShouldCreate) {
+ uint32_t AlignInBits, uint64_t OffsetInBits,
+ Optional<unsigned> DWARFAddressSpace, DIFlags Flags, Metadata *ExtraData,
+ StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(DIDerivedType,
(Tag, Name, File, Line, Scope, BaseType, SizeInBits,
- AlignInBits, OffsetInBits, Flags, ExtraData));
+ AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
+ ExtraData));
Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData};
DEFINE_GETIMPL_STORE(
- DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits, Flags),
- Ops);
+ DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
+ DWARFAddressSpace, Flags), Ops);
}
DICompositeType *DICompositeType::getImpl(
@@ -383,8 +390,8 @@ DICompileUnit *DICompileUnit::getImpl(
unsigned RuntimeVersion, MDString *SplitDebugFilename,
unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros,
- uint64_t DWOId, bool SplitDebugInlining, StorageType Storage,
- bool ShouldCreate) {
+ uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
+ StorageType Storage, bool ShouldCreate) {
assert(Storage != Uniqued && "Cannot unique DICompileUnit");
assert(isCanonical(Producer) && "Expected canonical MDString");
assert(isCanonical(Flags) && "Expected canonical MDString");
@@ -397,7 +404,8 @@ DICompileUnit *DICompileUnit::getImpl(
return storeImpl(new (array_lengthof(Ops))
DICompileUnit(Context, Storage, SourceLanguage,
IsOptimized, RuntimeVersion, EmissionKind,
- DWOId, SplitDebugInlining, Ops),
+ DWOId, SplitDebugInlining,
+ DebugInfoForProfiling, Ops),
Storage);
}
@@ -438,21 +446,30 @@ DISubprogram *DISubprogram::getImpl(
Metadata *ContainingType, unsigned Virtuality, unsigned VirtualIndex,
int ThisAdjustment, DIFlags Flags, bool IsOptimized, Metadata *Unit,
Metadata *TemplateParams, Metadata *Declaration, Metadata *Variables,
- StorageType Storage, bool ShouldCreate) {
+ Metadata *ThrownTypes, StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
assert(isCanonical(LinkageName) && "Expected canonical MDString");
DEFINE_GETIMPL_LOOKUP(
- DISubprogram,
- (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit, IsDefinition,
- ScopeLine, ContainingType, Virtuality, VirtualIndex, ThisAdjustment,
- Flags, IsOptimized, Unit, TemplateParams, Declaration, Variables));
- Metadata *Ops[] = {File, Scope, Name, Name,
- LinkageName, Type, ContainingType, Unit,
- TemplateParams, Declaration, Variables};
- DEFINE_GETIMPL_STORE(DISubprogram, (Line, ScopeLine, Virtuality, VirtualIndex,
- ThisAdjustment, Flags, IsLocalToUnit,
- IsDefinition, IsOptimized),
- Ops);
+ DISubprogram, (Scope, Name, LinkageName, File, Line, Type, IsLocalToUnit,
+ IsDefinition, ScopeLine, ContainingType, Virtuality,
+ VirtualIndex, ThisAdjustment, Flags, IsOptimized, Unit,
+ TemplateParams, Declaration, Variables, ThrownTypes));
+ SmallVector<Metadata *, 11> Ops = {
+ File, Scope, Name, LinkageName, Type, Unit,
+ Declaration, Variables, ContainingType, TemplateParams, ThrownTypes};
+ if (!ThrownTypes) {
+ Ops.pop_back();
+ if (!TemplateParams) {
+ Ops.pop_back();
+ if (!ContainingType)
+ Ops.pop_back();
+ }
+ }
+ DEFINE_GETIMPL_STORE_N(DISubprogram,
+ (Line, ScopeLine, Virtuality, VirtualIndex,
+ ThisAdjustment, Flags, IsLocalToUnit, IsDefinition,
+ IsOptimized),
+ Ops, Ops.size());
}
bool DISubprogram::describes(const Function *F) const {
@@ -490,13 +507,13 @@ DILexicalBlockFile *DILexicalBlockFile::getImpl(LLVMContext &Context,
}
DINamespace *DINamespace::getImpl(LLVMContext &Context, Metadata *Scope,
- Metadata *File, MDString *Name, unsigned Line,
- bool ExportSymbols, StorageType Storage,
- bool ShouldCreate) {
+ MDString *Name, bool ExportSymbols,
+ StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DINamespace, (Scope, File, Name, Line, ExportSymbols));
- Metadata *Ops[] = {File, Scope, Name};
- DEFINE_GETIMPL_STORE(DINamespace, (Line, ExportSymbols), Ops);
+ DEFINE_GETIMPL_LOOKUP(DINamespace, (Scope, Name, ExportSymbols));
+ // The nullptr is for DIScope's File operand. This should be refactored.
+ Metadata *Ops[] = {nullptr, Scope, Name};
+ DEFINE_GETIMPL_STORE(DINamespace, (ExportSymbols), Ops);
}
DIModule *DIModule::getImpl(LLVMContext &Context, Metadata *Scope,
@@ -581,8 +598,7 @@ unsigned DIExpression::ExprOperand::getSize() const {
case dwarf::DW_OP_LLVM_fragment:
return 3;
case dwarf::DW_OP_constu:
- case dwarf::DW_OP_plus:
- case dwarf::DW_OP_minus:
+ case dwarf::DW_OP_plus_uconst:
return 2;
default:
return 1;
@@ -611,10 +627,24 @@ bool DIExpression::isValid() const {
return false;
break;
}
+ case dwarf::DW_OP_swap: {
+ // Must be more than one implicit element on the stack.
+
+ // FIXME: A better way to implement this would be to add a local variable
+ // that keeps track of the stack depth and introduce something like a
+ // DW_LLVM_OP_implicit_location as a placeholder for the location this
+ // DIExpression is attached to, or else pass the number of implicit stack
+ // elements into isValid.
+ if (getNumElements() == 1)
+ return false;
+ break;
+ }
case dwarf::DW_OP_constu:
+ case dwarf::DW_OP_plus_uconst:
case dwarf::DW_OP_plus:
case dwarf::DW_OP_minus:
case dwarf::DW_OP_deref:
+ case dwarf::DW_OP_xderef:
break;
}
}
@@ -631,6 +661,69 @@ DIExpression::getFragmentInfo(expr_op_iterator Start, expr_op_iterator End) {
return None;
}
+void DIExpression::appendOffset(SmallVectorImpl<uint64_t> &Ops,
+ int64_t Offset) {
+ if (Offset > 0) {
+ Ops.push_back(dwarf::DW_OP_plus_uconst);
+ Ops.push_back(Offset);
+ } else if (Offset < 0) {
+ Ops.push_back(dwarf::DW_OP_constu);
+ Ops.push_back(-Offset);
+ Ops.push_back(dwarf::DW_OP_minus);
+ }
+}
+
+bool DIExpression::extractIfOffset(int64_t &Offset) const {
+ if (getNumElements() == 0) {
+ Offset = 0;
+ return true;
+ }
+
+ if (getNumElements() == 2 && Elements[0] == dwarf::DW_OP_plus_uconst) {
+ Offset = Elements[1];
+ return true;
+ }
+
+ if (getNumElements() == 3 && Elements[0] == dwarf::DW_OP_constu) {
+ if (Elements[2] == dwarf::DW_OP_plus) {
+ Offset = Elements[1];
+ return true;
+ }
+ if (Elements[2] == dwarf::DW_OP_minus) {
+ Offset = -Elements[1];
+ return true;
+ }
+ }
+
+ return false;
+}
+
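A round-trip sketch for the two helpers above (Ctx assumed): positive offsets become DW_OP_plus_uconst, negative ones a constu/minus pair, and extractIfOffset recovers the signed value:

    SmallVector<uint64_t, 4> Ops;
    DIExpression::appendOffset(Ops, -8);   // {DW_OP_constu, 8, DW_OP_minus}
    auto *Expr = DIExpression::get(Ctx, Ops);
    int64_t Off;
    bool OK = Expr->extractIfOffset(Off);  // true, Off == -8
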
+DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref,
+ int64_t Offset, bool StackValue) {
+ SmallVector<uint64_t, 8> Ops;
+ appendOffset(Ops, Offset);
+ if (Deref)
+ Ops.push_back(dwarf::DW_OP_deref);
+ if (Expr)
+ for (auto Op : Expr->expr_ops()) {
+ // A DW_OP_stack_value comes at the end, but before a DW_OP_LLVM_fragment.
+ if (StackValue) {
+ if (Op.getOp() == dwarf::DW_OP_stack_value)
+ StackValue = false;
+ else if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
+ Ops.push_back(dwarf::DW_OP_stack_value);
+ StackValue = false;
+ }
+ }
+ Ops.push_back(Op.getOp());
+ for (unsigned I = 0; I < Op.getNumArgs(); ++I)
+ Ops.push_back(Op.getArg(I));
+ }
+ if (StackValue)
+ Ops.push_back(dwarf::DW_OP_stack_value);
+ return DIExpression::get(Expr->getContext(), Ops);
+}
+
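A usage sketch for prepend, with Expr assumed non-null since getContext() is called on it: the offset ops come first, then the optional deref, then Expr's own ops, with DW_OP_stack_value re-inserted before any trailing fragment operator:

    auto *New = DIExpression::prepend(Expr, /*Deref=*/true, /*Offset=*/8,
                                      /*StackValue=*/false);
    // New's ops: DW_OP_plus_uconst, 8, DW_OP_deref, <Expr's ops...>
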
bool DIExpression::isConstant() const {
// Recognize DW_OP_constu C DW_OP_stack_value (DW_OP_LLVM_fragment Len Ofs)?.
if (getNumElements() != 3 && getNumElements() != 6)
@@ -667,12 +760,13 @@ DIObjCProperty *DIObjCProperty::getImpl(
DIImportedEntity *DIImportedEntity::getImpl(LLVMContext &Context, unsigned Tag,
Metadata *Scope, Metadata *Entity,
- unsigned Line, MDString *Name,
- StorageType Storage,
+ Metadata *File, unsigned Line,
+ MDString *Name, StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIImportedEntity, (Tag, Scope, Entity, Line, Name));
- Metadata *Ops[] = {Scope, Entity, Name};
+ DEFINE_GETIMPL_LOOKUP(DIImportedEntity,
+ (Tag, Scope, Entity, File, Line, Name));
+ Metadata *Ops[] = {Scope, Entity, Name, File};
DEFINE_GETIMPL_STORE(DIImportedEntity, (Tag, Line), Ops);
}
diff --git a/contrib/llvm/lib/IR/DebugLoc.cpp b/contrib/llvm/lib/IR/DebugLoc.cpp
index ffa7a6b..6297395 100644
--- a/contrib/llvm/lib/IR/DebugLoc.cpp
+++ b/contrib/llvm/lib/IR/DebugLoc.cpp
@@ -10,6 +10,7 @@
#include "llvm/IR/DebugLoc.h"
#include "LLVMContextImpl.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -66,8 +67,40 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope,
const_cast<MDNode *>(InlinedAt));
}
+DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
+ LLVMContext &Ctx,
+ DenseMap<const MDNode *, MDNode *> &Cache,
+ bool ReplaceLast) {
+ SmallVector<DILocation *, 3> InlinedAtLocations;
+ DILocation *Last = InlinedAt;
+ DILocation *CurInlinedAt = DL;
+
+ // Gather all the inlined-at nodes.
+ while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
+ // Skip any we've already built nodes for.
+ if (auto *Found = Cache[IA]) {
+ Last = cast<DILocation>(Found);
+ break;
+ }
+
+ if (ReplaceLast && !IA->getInlinedAt())
+ break;
+ InlinedAtLocations.push_back(IA);
+ CurInlinedAt = IA;
+ }
+
+ // Starting from the top, rebuild the nodes to point to the new inlined-at
+ // location (then rebuilding the rest of the chain behind it) and update the
+ // map of already-constructed inlined-at nodes.
+ for (const DILocation *MD : reverse(InlinedAtLocations))
+ Cache[MD] = Last = DILocation::getDistinct(
+ Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
+
+ return Last;
+}
+
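A hedged sketch of re-rooting an inlined-at chain with the new helper; Ctx, DL, and CalleeLoc are assumed, and the cache lets repeated calls share already-rebuilt nodes:

    DenseMap<const MDNode *, MDNode *> Cache;
    DebugLoc NewDL = DebugLoc::appendInlinedAt(DL, CalleeLoc, Ctx, Cache,
                                               /*ReplaceLast=*/false);
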
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void DebugLoc::dump() const {
-#ifndef NDEBUG
if (!Loc)
return;
@@ -79,8 +112,8 @@ LLVM_DUMP_METHOD void DebugLoc::dump() const {
InlinedAtDL.dump();
} else
dbgs() << "\n";
-#endif
}
+#endif
void DebugLoc::print(raw_ostream &OS) const {
if (!Loc)
diff --git a/contrib/llvm/lib/IR/DiagnosticInfo.cpp b/contrib/llvm/lib/IR/DiagnosticInfo.cpp
index ea71fde..5129d6b 100644
--- a/contrib/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/contrib/llvm/lib/IR/DiagnosticInfo.cpp
@@ -13,19 +13,30 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/DiagnosticInfo.h"
-#include "LLVMContextImpl.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
#include <atomic>
+#include <cassert>
+#include <memory>
#include <string>
using namespace llvm;
@@ -53,6 +64,8 @@ struct PassRemarksOpt {
}
};
+} // end anonymous namespace
+
static PassRemarksOpt PassRemarksOptLoc;
static PassRemarksOpt PassRemarksMissedOptLoc;
static PassRemarksOpt PassRemarksAnalysisOptLoc;
@@ -85,7 +98,6 @@ PassRemarksAnalysis(
"the given regular expression"),
cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired,
cl::ZeroOrMore);
-}
int llvm::getNextAvailablePluginDiagnosticKind() {
static std::atomic<int> PluginKindID(DK_FirstPluginKind);
@@ -97,8 +109,7 @@ const char *OptimizationRemarkAnalysis::AlwaysPrint = "";
DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I,
const Twine &MsgStr,
DiagnosticSeverity Severity)
- : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr),
- Instr(&I) {
+ : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr), Instr(&I) {
if (const MDNode *SrcLoc = I.getMetadata("srcloc")) {
if (SrcLoc->getNumOperands() != 0)
if (const auto *CI =
@@ -148,21 +159,31 @@ void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const {
DP << getMsg();
}
-bool DiagnosticInfoWithDebugLocBase::isLocationAvailable() const {
- return getDebugLoc();
+DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) {
+ if (!DL)
+ return;
+ Filename = DL->getFilename();
+ Line = DL->getLine();
+ Column = DL->getColumn();
+}
+
+DiagnosticLocation::DiagnosticLocation(const DISubprogram *SP) {
+ if (!SP)
+ return;
+ Filename = SP->getFilename();
+ Line = SP->getScopeLine();
+ Column = 0;
}
-void DiagnosticInfoWithDebugLocBase::getLocation(StringRef *Filename,
+void DiagnosticInfoWithLocationBase::getLocation(StringRef *Filename,
unsigned *Line,
unsigned *Column) const {
- DILocation *L = getDebugLoc();
- assert(L != nullptr && "debug location is invalid");
- *Filename = L->getFilename();
- *Line = L->getLine();
- *Column = L->getColumn();
+ *Filename = Loc.getFilename();
+ *Line = Loc.getLine();
+ *Column = Loc.getColumn();
}
-const std::string DiagnosticInfoWithDebugLocBase::getLocationStr() const {
+const std::string DiagnosticInfoWithLocationBase::getLocationStr() const {
StringRef Filename("<unknown>");
unsigned Line = 0;
unsigned Column = 0;
@@ -171,19 +192,19 @@ const std::string DiagnosticInfoWithDebugLocBase::getLocationStr() const {
return (Filename + ":" + Twine(Line) + ":" + Twine(Column)).str();
}
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Value *V)
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V)
: Key(Key) {
if (auto *F = dyn_cast<Function>(V)) {
if (DISubprogram *SP = F->getSubprogram())
- DLoc = DebugLoc::get(SP->getScopeLine(), 0, SP);
+ Loc = SP;
}
else if (auto *I = dyn_cast<Instruction>(V))
- DLoc = I->getDebugLoc();
+ Loc = I->getDebugLoc();
// Only include names that correspond to user variables. FIXME: we should use
// debug info if available to get the name of the user variable.
if (isa<llvm::Argument>(V) || isa<GlobalValue>(V))
- Val = GlobalValue::getRealLinkageName(V->getName());
+ Val = GlobalValue::dropLLVMManglingEscape(V->getName());
else if (isa<Constant>(V)) {
raw_string_ostream OS(Val);
V->printAsOperand(OS, /*PrintType=*/false);
@@ -191,7 +212,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Value *V)
Val = I->getOpcodeName();
}
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Type *T)
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Type *T)
: Key(Key) {
raw_string_ostream OS(Val);
OS << *T;
@@ -211,73 +232,83 @@ void DiagnosticInfoOptimizationBase::print(DiagnosticPrinter &DP) const {
OptimizationRemark::OptimizationRemark(const char *PassName,
StringRef RemarkName,
- const DebugLoc &DLoc, Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(
+ const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
DK_OptimizationRemark, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
OptimizationRemark::OptimizationRemark(const char *PassName,
- StringRef RemarkName, Instruction *Inst)
- : DiagnosticInfoOptimizationBase(DK_OptimizationRemark, DS_Remark, PassName,
- RemarkName,
- *Inst->getParent()->getParent(),
- Inst->getDebugLoc(), Inst->getParent()) {}
+ StringRef RemarkName,
+ const Instruction *Inst)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
+ RemarkName, *Inst->getParent()->getParent(),
+ Inst->getDebugLoc(), Inst->getParent()) {}
-bool OptimizationRemark::isEnabled() const {
+// Helper to allow for an assert before attempting to return an invalid
+// reference.
+static const BasicBlock &getFirstFunctionBlock(const Function *Func) {
+ assert(!Func->empty() && "Function does not have a body");
+ return Func->front();
+}
+
+OptimizationRemark::OptimizationRemark(const char *PassName,
+ StringRef RemarkName,
+ const Function *Func)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
+ RemarkName, *Func, Func->getSubprogram(),
+ &getFirstFunctionBlock(Func)) {}
+
+bool OptimizationRemark::isEnabled(StringRef PassName) {
return PassRemarksOptLoc.Pattern &&
- PassRemarksOptLoc.Pattern->match(getPassName());
+ PassRemarksOptLoc.Pattern->match(PassName);
}
-OptimizationRemarkMissed::OptimizationRemarkMissed(const char *PassName,
- StringRef RemarkName,
- const DebugLoc &DLoc,
- Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(
+OptimizationRemarkMissed::OptimizationRemarkMissed(
+ const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
DK_OptimizationRemarkMissed, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
OptimizationRemarkMissed::OptimizationRemarkMissed(const char *PassName,
StringRef RemarkName,
- Instruction *Inst)
- : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkMissed, DS_Remark,
- PassName, RemarkName,
- *Inst->getParent()->getParent(),
- Inst->getDebugLoc(), Inst->getParent()) {}
+ const Instruction *Inst)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemarkMissed, DS_Remark,
+ PassName, RemarkName,
+ *Inst->getParent()->getParent(),
+ Inst->getDebugLoc(), Inst->getParent()) {}
-bool OptimizationRemarkMissed::isEnabled() const {
+bool OptimizationRemarkMissed::isEnabled(StringRef PassName) {
return PassRemarksMissedOptLoc.Pattern &&
- PassRemarksMissedOptLoc.Pattern->match(getPassName());
+ PassRemarksMissedOptLoc.Pattern->match(PassName);
}
-OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(const char *PassName,
- StringRef RemarkName,
- const DebugLoc &DLoc,
- Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(
+OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(
+ const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
DK_OptimizationRemarkAnalysis, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(const char *PassName,
StringRef RemarkName,
- Instruction *Inst)
- : DiagnosticInfoOptimizationBase(DK_OptimizationRemarkAnalysis, DS_Remark,
- PassName, RemarkName,
- *Inst->getParent()->getParent(),
- Inst->getDebugLoc(), Inst->getParent()) {}
-
-OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(enum DiagnosticKind Kind,
- const char *PassName,
- StringRef RemarkName,
- const DebugLoc &DLoc,
- Value *CodeRegion)
- : DiagnosticInfoOptimizationBase(Kind, DS_Remark, PassName, RemarkName,
- *cast<BasicBlock>(CodeRegion)->getParent(),
- DLoc, CodeRegion) {}
+ const Instruction *Inst)
+ : DiagnosticInfoIROptimization(DK_OptimizationRemarkAnalysis, DS_Remark,
+ PassName, RemarkName,
+ *Inst->getParent()->getParent(),
+ Inst->getDebugLoc(), Inst->getParent()) {}
+
+OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(
+ enum DiagnosticKind Kind, const char *PassName, StringRef RemarkName,
+ const DiagnosticLocation &Loc, const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(Kind, DS_Remark, PassName, RemarkName,
+ *cast<BasicBlock>(CodeRegion)->getParent(),
+ Loc, CodeRegion) {}
-bool OptimizationRemarkAnalysis::isEnabled() const {
- return shouldAlwaysPrint() ||
- (PassRemarksAnalysisOptLoc.Pattern &&
- PassRemarksAnalysisOptLoc.Pattern->match(getPassName()));
+bool OptimizationRemarkAnalysis::isEnabled(StringRef PassName) {
+ return PassRemarksAnalysisOptLoc.Pattern &&
+ PassRemarksAnalysisOptLoc.Pattern->match(PassName);
}
void DiagnosticInfoMIRParser::print(DiagnosticPrinter &DP) const {
@@ -285,42 +316,48 @@ void DiagnosticInfoMIRParser::print(DiagnosticPrinter &DP) const {
}
void llvm::emitOptimizationRemark(LLVMContext &Ctx, const char *PassName,
- const Function &Fn, const DebugLoc &DLoc,
+ const Function &Fn,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemark(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemark(PassName, Fn, Loc, Msg));
}
void llvm::emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName,
const Function &Fn,
- const DebugLoc &DLoc,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkMissed(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemarkMissed(PassName, Fn, Loc, Msg));
}
void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx,
const char *PassName,
const Function &Fn,
- const DebugLoc &DLoc,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemarkAnalysis(PassName, Fn, Loc, Msg));
}
-void llvm::emitOptimizationRemarkAnalysisFPCommute(LLVMContext &Ctx,
- const char *PassName,
- const Function &Fn,
- const DebugLoc &DLoc,
- const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, Fn, DLoc, Msg));
+void llvm::emitOptimizationRemarkAnalysisFPCommute(
+ LLVMContext &Ctx, const char *PassName, const Function &Fn,
+ const DiagnosticLocation &Loc, const Twine &Msg) {
+ Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, Fn, Loc, Msg));
}
void llvm::emitOptimizationRemarkAnalysisAliasing(LLVMContext &Ctx,
const char *PassName,
const Function &Fn,
- const DebugLoc &DLoc,
+ const DiagnosticLocation &Loc,
const Twine &Msg) {
- Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, Fn, DLoc, Msg));
+ Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, Fn, Loc, Msg));
}
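
A minimal usage sketch (not part of this patch) of the new const Instruction * remark constructor; the pass name "my-pass" and the message are hypothetical:

    void emitExampleRemark(const Instruction &I) {
      OptimizationRemark R("my-pass", "Example", &I); // hypothetical pass/remark names
      R << "transformed an instruction";              // stream extra text into the remark
      I.getContext().diagnose(R);                     // shown when -pass-remarks=my-pass matches
    }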
+DiagnosticInfoOptimizationFailure::DiagnosticInfoOptimizationFailure(
+ const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+ const Value *CodeRegion)
+ : DiagnosticInfoIROptimization(
+ DK_OptimizationFailure, DS_Warning, PassName, RemarkName,
+ *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
+
bool DiagnosticInfoOptimizationFailure::isEnabled() const {
// Only print warnings.
return getSeverity() == DS_Warning;
@@ -336,18 +373,6 @@ void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
DP << Str;
}
-void llvm::emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
- const DebugLoc &DLoc, const Twine &Msg) {
- Ctx.diagnose(DiagnosticInfoOptimizationFailure(
- Fn, DLoc, Twine("loop not vectorized: " + Msg)));
-}
-
-void llvm::emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn,
- const DebugLoc &DLoc, const Twine &Msg) {
- Ctx.diagnose(DiagnosticInfoOptimizationFailure(
- Fn, DLoc, Twine("loop not interleaved: " + Msg)));
-}
-
void DiagnosticInfoISelFallback::print(DiagnosticPrinter &DP) const {
DP << "Instruction selection used fallback path for " << getFunction();
}
diff --git a/contrib/llvm/lib/IR/DiagnosticPrinter.cpp b/contrib/llvm/lib/IR/DiagnosticPrinter.cpp
index 659ff49..ee2df9e 100644
--- a/contrib/llvm/lib/IR/DiagnosticPrinter.cpp
+++ b/contrib/llvm/lib/IR/DiagnosticPrinter.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/Twine.h"
#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/IR/Dominators.cpp b/contrib/llvm/lib/IR/Dominators.cpp
index 1880807..4d7e304 100644
--- a/contrib/llvm/lib/IR/Dominators.cpp
+++ b/contrib/llvm/lib/IR/Dominators.cpp
@@ -29,9 +29,9 @@ using namespace llvm;
// Always verify dominfo if expensive checking is enabled.
#ifdef EXPENSIVE_CHECKS
-static bool VerifyDomInfo = true;
+bool llvm::VerifyDomInfo = true;
#else
-static bool VerifyDomInfo = false;
+bool llvm::VerifyDomInfo = false;
#endif
static cl::opt<bool,true>
VerifyDomInfoX("verify-dom-info", cl::location(VerifyDomInfo),
@@ -61,17 +61,39 @@ bool BasicBlockEdge::isSingleEdge() const {
//===----------------------------------------------------------------------===//
template class llvm::DomTreeNodeBase<BasicBlock>;
-template class llvm::DominatorTreeBase<BasicBlock>;
-
-template void llvm::Calculate<Function, BasicBlock *>(
- DominatorTreeBase<
- typename std::remove_pointer<GraphTraits<BasicBlock *>::NodeRef>::type>
- &DT,
- Function &F);
-template void llvm::Calculate<Function, Inverse<BasicBlock *>>(
- DominatorTreeBase<typename std::remove_pointer<
- GraphTraits<Inverse<BasicBlock *>>::NodeRef>::type> &DT,
- Function &F);
+template class llvm::DominatorTreeBase<BasicBlock, false>; // DomTreeBase
+template class llvm::DominatorTreeBase<BasicBlock, true>; // PostDomTreeBase
+
+template void
+llvm::DomTreeBuilder::Calculate<DomTreeBuilder::BBDomTree, Function>(
+ DomTreeBuilder::BBDomTree &DT, Function &F);
+template void
+llvm::DomTreeBuilder::Calculate<DomTreeBuilder::BBPostDomTree, Function>(
+ DomTreeBuilder::BBPostDomTree &DT, Function &F);
+
+template void llvm::DomTreeBuilder::InsertEdge<DomTreeBuilder::BBDomTree>(
+ DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To);
+template void llvm::DomTreeBuilder::InsertEdge<DomTreeBuilder::BBPostDomTree>(
+ DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To);
+
+template void llvm::DomTreeBuilder::DeleteEdge<DomTreeBuilder::BBDomTree>(
+ DomTreeBuilder::BBDomTree &DT, BasicBlock *From, BasicBlock *To);
+template void llvm::DomTreeBuilder::DeleteEdge<DomTreeBuilder::BBPostDomTree>(
+ DomTreeBuilder::BBPostDomTree &DT, BasicBlock *From, BasicBlock *To);
+
+template bool llvm::DomTreeBuilder::Verify<DomTreeBuilder::BBDomTree>(
+ const DomTreeBuilder::BBDomTree &DT);
+template bool llvm::DomTreeBuilder::Verify<DomTreeBuilder::BBPostDomTree>(
+ const DomTreeBuilder::BBPostDomTree &DT);
+
+bool DominatorTree::invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<DominatorTreeAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
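
Under the new pass manager, the invalidate() hook above lets a CFG-preserving pass keep the DominatorTree alive; a sketch with a hypothetical pass:

    PreservedAnalyses ExamplePass::run(Function &F, FunctionAnalysisManager &AM) {
      auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
      // ... rewrite instructions using DT, leaving the CFG untouched ...
      PreservedAnalyses PA;
      PA.preserveSet<CFGAnalyses>(); // invalidate() above then returns false
      return PA;
    }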
// dominates - Return true if Def dominates a use in User. This performs
// the special checks necessary if Def and User are in the same basic block.
@@ -141,12 +163,6 @@ bool DominatorTree::dominates(const Instruction *Def,
bool DominatorTree::dominates(const BasicBlockEdge &BBE,
const BasicBlock *UseBB) const {
- // Assert that we have a single edge. We could handle them by simply
- // returning false, but since isSingleEdge is linear on the number of
- // edges, the callers can normally handle them more efficiently.
- assert(BBE.isSingleEdge() &&
- "This function is not efficient in handling multiple edges");
-
// If the BB the edge ends in doesn't dominate the use BB, then the
// edge also doesn't.
const BasicBlock *Start = BBE.getStart();
@@ -179,11 +195,17 @@ bool DominatorTree::dominates(const BasicBlockEdge &BBE,
// trivially dominates itself, so we only have to find if it dominates the
// other predecessors. Since the only way out of X is via NormalDest, X can
// only properly dominate a node if NormalDest dominates that node too.
+ int IsDuplicateEdge = 0;
for (const_pred_iterator PI = pred_begin(End), E = pred_end(End);
PI != E; ++PI) {
const BasicBlock *BB = *PI;
- if (BB == Start)
+ if (BB == Start) {
+ // If there are multiple edges between Start and End, by definition they
+ // can't dominate anything.
+ if (IsDuplicateEdge++)
+ return false;
continue;
+ }
if (!dominates(End, BB))
return false;
@@ -192,12 +214,6 @@ bool DominatorTree::dominates(const BasicBlockEdge &BBE,
}
bool DominatorTree::dominates(const BasicBlockEdge &BBE, const Use &U) const {
- // Assert that we have a single edge. We could handle them by simply
- // returning false, but since isSingleEdge is linear on the number of
- // edges, the callers can normally handle them more efficiently.
- assert(BBE.isSingleEdge() &&
- "This function is not efficient in handling multiple edges");
-
Instruction *UserInst = cast<Instruction>(U.getUser());
// A PHI in the end of the edge is dominated by it.
PHINode *PN = dyn_cast<PHINode>(UserInst);
@@ -282,6 +298,13 @@ bool DominatorTree::isReachableFromEntry(const Use &U) const {
}
void DominatorTree::verifyDomTree() const {
+ // Perform the expensive checks only when VerifyDomInfo is set.
+ if (VerifyDomInfo && !verify()) {
+ errs() << "\n~~~~~~~~~~~\n\t\tDomTree verification failed!\n~~~~~~~~~~~\n";
+ print(errs());
+ abort();
+ }
+
Function &F = *getRoot()->getParent();
DominatorTree OtherDT;
diff --git a/contrib/llvm/lib/IR/Function.cpp b/contrib/llvm/lib/IR/Function.cpp
index 05419aa..85a0198 100644
--- a/contrib/llvm/lib/IR/Function.cpp
+++ b/contrib/llvm/lib/IR/Function.cpp
@@ -1,4 +1,4 @@
-//===-- Function.cpp - Implement the Global object classes ----------------===//
+//===- Function.cpp - Implement the Global object classes -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,37 +14,60 @@
#include "llvm/IR/Function.h"
#include "LLVMContextImpl.h"
#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
using namespace llvm;
// Explicit instantiations of SymbolTableListTraits since some of the methods
// are not in the public header file...
-template class llvm::SymbolTableListTraits<Argument>;
template class llvm::SymbolTableListTraits<BasicBlock>;
//===----------------------------------------------------------------------===//
// Argument Implementation
//===----------------------------------------------------------------------===//
-void Argument::anchor() { }
-
-Argument::Argument(Type *Ty, const Twine &Name, Function *Par)
- : Value(Ty, Value::ArgumentVal) {
- Parent = nullptr;
-
- if (Par)
- Par->getArgumentList().push_back(this);
+Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo)
+ : Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) {
setName(Name);
}
@@ -52,27 +75,9 @@ void Argument::setParent(Function *parent) {
Parent = parent;
}
-/// getArgNo - Return the index of this formal argument in its containing
-/// function. For example in "void foo(int a, float b)" a is 0 and b is 1.
-unsigned Argument::getArgNo() const {
- const Function *F = getParent();
- assert(F && "Argument is not in a function");
-
- Function::const_arg_iterator AI = F->arg_begin();
- unsigned ArgIdx = 0;
- for (; &*AI != this; ++AI)
- ++ArgIdx;
-
- return ArgIdx;
-}
-
-/// hasNonNullAttr - Return true if this argument has the nonnull attribute on
-/// it in its containing function. Also returns true if at least one byte is
-/// known to be dereferenceable and the pointer is in addrspace(0).
bool Argument::hasNonNullAttr() const {
if (!getType()->isPointerTy()) return false;
- if (getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::NonNull))
+ if (getParent()->hasParamAttribute(getArgNo(), Attribute::NonNull))
return true;
else if (getDereferenceableBytes() > 0 &&
getType()->getPointerAddressSpace() == 0)
@@ -80,25 +85,19 @@ bool Argument::hasNonNullAttr() const {
return false;
}
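
The getArgNo()+1 rewrites here and throughout this file reflect the AttributeSet-to-AttributeList migration; in sketch form (F and ArgNo are placeholders):

    // Old: one flat index space -- return value at 0, parameters starting at 1.
    F->getAttributes().hasAttribute(ArgNo + 1, Attribute::NonNull);
    // New: AttributeList addresses parameters by zero-based parameter number.
    F->hasParamAttribute(ArgNo, Attribute::NonNull);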
-/// hasByValAttr - Return true if this argument has the byval attribute on it
-/// in its containing function.
bool Argument::hasByValAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::ByVal);
}
bool Argument::hasSwiftSelfAttr() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::SwiftSelf);
+ return getParent()->hasParamAttribute(getArgNo(), Attribute::SwiftSelf);
}
bool Argument::hasSwiftErrorAttr() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::SwiftError);
+ return getParent()->hasParamAttribute(getArgNo(), Attribute::SwiftError);
}
-/// \brief Return true if this argument has the inalloca attribute on it in
-/// its containing function.
bool Argument::hasInAllocaAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::InAlloca);
@@ -106,139 +105,96 @@ bool Argument::hasInAllocaAttr() const {
bool Argument::hasByValOrInAllocaAttr() const {
if (!getType()->isPointerTy()) return false;
- AttributeSet Attrs = getParent()->getAttributes();
- return Attrs.hasAttribute(getArgNo() + 1, Attribute::ByVal) ||
- Attrs.hasAttribute(getArgNo() + 1, Attribute::InAlloca);
+ AttributeList Attrs = getParent()->getAttributes();
+ return Attrs.hasParamAttribute(getArgNo(), Attribute::ByVal) ||
+ Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca);
}
unsigned Argument::getParamAlignment() const {
assert(getType()->isPointerTy() && "Only pointers have alignments");
- return getParent()->getParamAlignment(getArgNo()+1);
-
+ return getParent()->getParamAlignment(getArgNo());
}
uint64_t Argument::getDereferenceableBytes() const {
assert(getType()->isPointerTy() &&
"Only pointers have dereferenceable bytes");
- return getParent()->getDereferenceableBytes(getArgNo()+1);
+ return getParent()->getParamDereferenceableBytes(getArgNo());
}
uint64_t Argument::getDereferenceableOrNullBytes() const {
assert(getType()->isPointerTy() &&
"Only pointers have dereferenceable bytes");
- return getParent()->getDereferenceableOrNullBytes(getArgNo()+1);
+ return getParent()->getParamDereferenceableOrNullBytes(getArgNo());
}
-/// hasNestAttr - Return true if this argument has the nest attribute on
-/// it in its containing function.
bool Argument::hasNestAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::Nest);
}
-/// hasNoAliasAttr - Return true if this argument has the noalias attribute on
-/// it in its containing function.
bool Argument::hasNoAliasAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::NoAlias);
}
-/// hasNoCaptureAttr - Return true if this argument has the nocapture attribute
-/// on it in its containing function.
bool Argument::hasNoCaptureAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::NoCapture);
}
-/// hasSRetAttr - Return true if this argument has the sret attribute on
-/// it in its containing function.
bool Argument::hasStructRetAttr() const {
if (!getType()->isPointerTy()) return false;
return hasAttribute(Attribute::StructRet);
}
-/// hasReturnedAttr - Return true if this argument has the returned attribute on
-/// it in its containing function.
bool Argument::hasReturnedAttr() const {
return hasAttribute(Attribute::Returned);
}
-/// hasZExtAttr - Return true if this argument has the zext attribute on it in
-/// its containing function.
bool Argument::hasZExtAttr() const {
return hasAttribute(Attribute::ZExt);
}
-/// hasSExtAttr Return true if this argument has the sext attribute on it in its
-/// containing function.
bool Argument::hasSExtAttr() const {
return hasAttribute(Attribute::SExt);
}
-/// Return true if this argument has the readonly or readnone attribute on it
-/// in its containing function.
bool Argument::onlyReadsMemory() const {
- return getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::ReadOnly) ||
- getParent()->getAttributes().
- hasAttribute(getArgNo()+1, Attribute::ReadNone);
+ AttributeList Attrs = getParent()->getAttributes();
+ return Attrs.hasParamAttribute(getArgNo(), Attribute::ReadOnly) ||
+ Attrs.hasParamAttribute(getArgNo(), Attribute::ReadNone);
+}
+
+void Argument::addAttrs(AttrBuilder &B) {
+ AttributeList AL = getParent()->getAttributes();
+ AL = AL.addParamAttributes(Parent->getContext(), getArgNo(), B);
+ getParent()->setAttributes(AL);
+}
+
+void Argument::addAttr(Attribute::AttrKind Kind) {
+ getParent()->addParamAttr(getArgNo(), Kind);
}
-/// addAttr - Add attributes to an argument.
-void Argument::addAttr(AttributeSet AS) {
- assert(AS.getNumSlots() <= 1 &&
- "Trying to add more than one attribute set to an argument!");
- AttrBuilder B(AS, AS.getSlotIndex(0));
- getParent()->addAttributes(getArgNo() + 1,
- AttributeSet::get(Parent->getContext(),
- getArgNo() + 1, B));
+void Argument::addAttr(Attribute Attr) {
+ getParent()->addParamAttr(getArgNo(), Attr);
}
-/// removeAttr - Remove attributes from an argument.
-void Argument::removeAttr(AttributeSet AS) {
- assert(AS.getNumSlots() <= 1 &&
- "Trying to remove more than one attribute set from an argument!");
- AttrBuilder B(AS, AS.getSlotIndex(0));
- getParent()->removeAttributes(getArgNo() + 1,
- AttributeSet::get(Parent->getContext(),
- getArgNo() + 1, B));
+void Argument::removeAttr(Attribute::AttrKind Kind) {
+ getParent()->removeParamAttr(getArgNo(), Kind);
}
-/// hasAttribute - Checks if an argument has a given attribute.
bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
- return getParent()->hasAttribute(getArgNo() + 1, Kind);
+ return getParent()->hasParamAttribute(getArgNo(), Kind);
}
//===----------------------------------------------------------------------===//
// Helper Methods in Function
//===----------------------------------------------------------------------===//
-bool Function::isMaterializable() const {
- return getGlobalObjectSubClassData() & (1 << IsMaterializableBit);
-}
-
-void Function::setIsMaterializable(bool V) {
- unsigned Mask = 1 << IsMaterializableBit;
- setGlobalObjectSubClassData((~Mask & getGlobalObjectSubClassData()) |
- (V ? Mask : 0u));
-}
-
LLVMContext &Function::getContext() const {
return getType()->getContext();
}
-FunctionType *Function::getFunctionType() const {
- return cast<FunctionType>(getValueType());
-}
-
-bool Function::isVarArg() const {
- return getFunctionType()->isVarArg();
-}
-
-Type *Function::getReturnType() const {
- return getFunctionType()->getReturnType();
-}
-
void Function::removeFromParent() {
getParent()->getFunctionList().remove(getIterator());
}
@@ -254,7 +210,8 @@ void Function::eraseFromParent() {
Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
Module *ParentModule)
: GlobalObject(Ty, Value::FunctionVal,
- OperandTraits<Function>::op_begin(this), 0, Linkage, name) {
+ OperandTraits<Function>::op_begin(this), 0, Linkage, name),
+ NumArgs(Ty->getNumParams()) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
"invalid return type");
setGlobalObjectSubClassData(0);
@@ -282,7 +239,8 @@ Function::~Function() {
dropAllReferences(); // After this it is safe to delete instructions.
// Delete all of the method arguments and unlink from symbol table...
- ArgumentList.clear();
+ if (Arguments)
+ clearArguments();
// Remove the function from the on-the-side GC table.
clearGC();
@@ -290,16 +248,33 @@ Function::~Function() {
void Function::BuildLazyArguments() const {
// Create the arguments vector, all arguments start out unnamed.
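 // so every argument starts out unnamed.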
- FunctionType *FT = getFunctionType();
- for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
- assert(!FT->getParamType(i)->isVoidTy() &&
- "Cannot have void typed arguments!");
- ArgumentList.push_back(new Argument(FT->getParamType(i)));
+ auto *FT = getFunctionType();
+ if (NumArgs > 0) {
+ Arguments = std::allocator<Argument>().allocate(NumArgs);
+ for (unsigned i = 0, e = NumArgs; i != e; ++i) {
+ Type *ArgTy = FT->getParamType(i);
+ assert(!ArgTy->isVoidTy() && "Cannot have void typed arguments!");
+ new (Arguments + i) Argument(ArgTy, "", const_cast<Function *>(this), i);
+ }
}
// Clear the lazy arguments bit.
unsigned SDC = getSubclassDataFromValue();
const_cast<Function*>(this)->setValueSubclassData(SDC &= ~(1<<0));
+ assert(!hasLazyArguments());
+}
+
+static MutableArrayRef<Argument> makeArgArray(Argument *Args, size_t Count) {
+ return MutableArrayRef<Argument>(Args, Count);
+}
+
+void Function::clearArguments() {
+ for (Argument &A : makeArgArray(Arguments, NumArgs)) {
+ A.setName("");
+ A.~Argument();
+ }
+ std::allocator<Argument>().deallocate(Arguments, NumArgs);
+ Arguments = nullptr;
}
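
Arguments now live in one contiguous allocation instead of an ilist; the allocate/construct/destroy cycle in miniature (a sketch, with N, FT, and F standing in for the function's own fields):

    Argument *Args = std::allocator<Argument>().allocate(N);
    for (unsigned i = 0; i != N; ++i)
      new (Args + i) Argument(FT->getParamType(i), "", F, i); // placement new
    // ... later, mirroring clearArguments():
    for (unsigned i = 0; i != N; ++i)
      Args[i].~Argument();                                    // explicit destructor call
    std::allocator<Argument>().deallocate(Args, N);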
void Function::stealArgumentListFrom(Function &Src) {
@@ -307,10 +282,10 @@ void Function::stealArgumentListFrom(Function &Src) {
// Drop the current arguments, if any, and set the lazy argument bit.
if (!hasLazyArguments()) {
- assert(llvm::all_of(ArgumentList,
+ assert(llvm::all_of(makeArgArray(Arguments, NumArgs),
[](const Argument &A) { return A.use_empty(); }) &&
"Expected arguments to be unused in declaration");
- ArgumentList.clear();
+ clearArguments();
setValueSubclassData(getSubclassDataFromValue() | (1 << 0));
}
@@ -319,18 +294,26 @@ void Function::stealArgumentListFrom(Function &Src) {
return;
// Steal arguments from Src, and fix the lazy argument bits.
- ArgumentList.splice(ArgumentList.end(), Src.ArgumentList);
+ assert(arg_size() == Src.arg_size());
+ Arguments = Src.Arguments;
+ Src.Arguments = nullptr;
+ for (Argument &A : makeArgArray(Arguments, NumArgs)) {
+ // FIXME: This does the work of transferNodesFromList inefficiently.
+ SmallString<128> Name;
+ if (A.hasName())
+ Name = A.getName();
+ if (!Name.empty())
+ A.setName("");
+ A.setParent(this);
+ if (!Name.empty())
+ A.setName(Name);
+ }
+
setValueSubclassData(getSubclassDataFromValue() & ~(1 << 0));
+ assert(!hasLazyArguments());
Src.setValueSubclassData(Src.getSubclassDataFromValue() | (1 << 0));
}
-size_t Function::arg_size() const {
- return getFunctionType()->getNumParams();
-}
-bool Function::arg_empty() const {
- return getFunctionType()->getNumParams() == 0;
-}
-
// dropAllReferences() - This function causes all the subinstructions to "let
// go" of all references that they are maintaining. This allows one to
// 'delete' a whole class at a time, even though there may be circular
@@ -362,53 +345,102 @@ void Function::dropAllReferences() {
}
void Function::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void Function::addAttribute(unsigned i, Attribute Attr) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Attr);
setAttributes(PAL);
}
-void Function::addAttributes(unsigned i, AttributeSet Attrs) {
- AttributeSet PAL = getAttributes();
+void Function::addAttributes(unsigned i, const AttrBuilder &Attrs) {
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttributes(getContext(), i, Attrs);
setAttributes(PAL);
}
+void Function::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
+void Function::addParamAttr(unsigned ArgNo, Attribute Attr) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
+ setAttributes(PAL);
+}
+
+void Function::addParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttributes(getContext(), ArgNo, Attrs);
+ setAttributes(PAL);
+}
+
void Function::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void Function::removeAttribute(unsigned i, StringRef Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
-void Function::removeAttributes(unsigned i, AttributeSet Attrs) {
- AttributeSet PAL = getAttributes();
+void Function::removeAttributes(unsigned i, const AttrBuilder &Attrs) {
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttributes(getContext(), i, Attrs);
setAttributes(PAL);
}
+void Function::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
+void Function::removeParamAttr(unsigned ArgNo, StringRef Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
+void Function::removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttributes(getContext(), ArgNo, Attrs);
+ setAttributes(PAL);
+}
+
void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
+void Function::addDereferenceableParamAttr(unsigned ArgNo, uint64_t Bytes) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addDereferenceableParamAttr(getContext(), ArgNo, Bytes);
+ setAttributes(PAL);
+}
+
void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
+void Function::addDereferenceableOrNullParamAttr(unsigned ArgNo,
+ uint64_t Bytes) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addDereferenceableOrNullParamAttr(getContext(), ArgNo, Bytes);
+ setAttributes(PAL);
+}
+
const std::string &Function::getGC() const {
assert(hasGC() && "Function has no collector");
return getContext().getGC(*this);
@@ -428,24 +460,20 @@ void Function::clearGC() {
/// Copy all additional attributes (those not needed to create a Function) from
/// the Function Src to this one.
-void Function::copyAttributesFrom(const GlobalValue *Src) {
+void Function::copyAttributesFrom(const Function *Src) {
GlobalObject::copyAttributesFrom(Src);
- const Function *SrcF = dyn_cast<Function>(Src);
- if (!SrcF)
- return;
-
- setCallingConv(SrcF->getCallingConv());
- setAttributes(SrcF->getAttributes());
- if (SrcF->hasGC())
- setGC(SrcF->getGC());
+ setCallingConv(Src->getCallingConv());
+ setAttributes(Src->getAttributes());
+ if (Src->hasGC())
+ setGC(Src->getGC());
else
clearGC();
- if (SrcF->hasPersonalityFn())
- setPersonalityFn(SrcF->getPersonalityFn());
- if (SrcF->hasPrefixData())
- setPrefixData(SrcF->getPrefixData());
- if (SrcF->hasPrologueData())
- setPrologueData(SrcF->getPrologueData());
+ if (Src->hasPersonalityFn())
+ setPersonalityFn(Src->getPersonalityFn());
+ if (Src->hasPrefixData())
+ setPrefixData(Src->getPrefixData());
+ if (Src->hasPrologueData())
+ setPrologueData(Src->getPrologueData());
}
/// Table of string intrinsic names indexed by enum value.
@@ -528,15 +556,23 @@ void Function::recalculateIntrinsicID() {
static std::string getMangledTypeStr(Type* Ty) {
std::string Result;
if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
- Result += "p" + llvm::utostr(PTyp->getAddressSpace()) +
+ Result += "p" + utostr(PTyp->getAddressSpace()) +
getMangledTypeStr(PTyp->getElementType());
} else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
- Result += "a" + llvm::utostr(ATyp->getNumElements()) +
+ Result += "a" + utostr(ATyp->getNumElements()) +
getMangledTypeStr(ATyp->getElementType());
- } else if (StructType* STyp = dyn_cast<StructType>(Ty)) {
- assert(!STyp->isLiteral() && "TODO: implement literal types");
- Result += STyp->getName();
- } else if (FunctionType* FT = dyn_cast<FunctionType>(Ty)) {
+ } else if (StructType *STyp = dyn_cast<StructType>(Ty)) {
+ if (!STyp->isLiteral()) {
+ Result += "s_";
+ Result += STyp->getName();
+ } else {
+ Result += "sl_";
+ for (auto Elem : STyp->elements())
+ Result += getMangledTypeStr(Elem);
+ }
+ // Ensure nested structs are distinguishable.
+ Result += "s";
+ } else if (FunctionType *FT = dyn_cast<FunctionType>(Ty)) {
Result += "f_" + getMangledTypeStr(FT->getReturnType());
for (size_t i = 0; i < FT->getNumParams(); i++)
Result += getMangledTypeStr(FT->getParamType(i));
@@ -568,7 +604,6 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
return Result;
}
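
Sketch of the names the new struct branch produces, assuming the usual i32/f32 element manglings:

    // Expected outputs (illustrative):
    //   getMangledTypeStr(StructType::get(Ctx, {I32Ty, FloatTy})) == "sl_i32f32s"
    //   getMangledTypeStr(NamedPairTy /* %pair */)                == "s_pairs"
    // The trailing "s" keeps nested struct manglings unambiguous.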
-
/// IIT_Info - These are enumerators that describe the entries returned by the
/// getIntrinsicInfoTableEntries function.
///
@@ -611,18 +646,18 @@ enum IIT_Info {
IIT_SAME_VEC_WIDTH_ARG = 31,
IIT_PTR_TO_ARG = 32,
IIT_PTR_TO_ELT = 33,
- IIT_VEC_OF_PTRS_TO_ELT = 34,
+ IIT_VEC_OF_ANYPTRS_TO_ELT = 34,
IIT_I128 = 35,
IIT_V512 = 36,
IIT_V1024 = 37
};
-
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
+ using namespace Intrinsic;
+
IIT_Info Info = IIT_Info(Infos[NextElt++]);
unsigned StructElts = 2;
- using namespace Intrinsic;
switch (Info) {
case IIT_Done:
@@ -753,10 +788,11 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToElt, ArgInfo));
return;
}
- case IIT_VEC_OF_PTRS_TO_ELT: {
- unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfPtrsToElt,
- ArgInfo));
+ case IIT_VEC_OF_ANYPTRS_TO_ELT: {
+ unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ unsigned short RefNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::VecOfAnyPtrsToElt, ArgNo, RefNo));
return;
}
case IIT_EMPTYSTRUCT:
@@ -776,7 +812,6 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
llvm_unreachable("unhandled");
}
-
#define GET_INTRINSIC_GENERATOR_GLOBAL
#include "llvm/IR/Intrinsics.gen"
#undef GET_INTRINSIC_GENERATOR_GLOBAL
@@ -814,10 +849,10 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id,
DecodeIITType(NextElt, IITEntries, T);
}
-
static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
ArrayRef<Type*> Tys, LLVMContext &Context) {
using namespace Intrinsic;
+
IITDescriptor D = Infos.front();
Infos = Infos.slice(1);
@@ -845,7 +880,6 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
Elts[i] = DecodeFixedType(Infos, Tys, Context);
return StructType::get(Context, makeArrayRef(Elts,D.Struct_NumElements));
}
-
case IITDescriptor::Argument:
return Tys[D.getArgumentNumber()];
case IITDescriptor::ExtendArgument: {
@@ -887,21 +921,13 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
Type *EltTy = VTy->getVectorElementType();
return PointerType::getUnqual(EltTy);
}
- case IITDescriptor::VecOfPtrsToElt: {
- Type *Ty = Tys[D.getArgumentNumber()];
- VectorType *VTy = dyn_cast<VectorType>(Ty);
- if (!VTy)
- llvm_unreachable("Expected an argument of Vector Type");
- Type *EltTy = VTy->getVectorElementType();
- return VectorType::get(PointerType::getUnqual(EltTy),
- VTy->getNumElements());
+ case IITDescriptor::VecOfAnyPtrsToElt:
+ // Return the overloaded type (which determines the pointer's address space).
+ return Tys[D.getOverloadArgNumber()];
}
- }
llvm_unreachable("unhandled");
}
-
-
FunctionType *Intrinsic::getType(LLVMContext &Context,
ID id, ArrayRef<Type*> Tys) {
SmallVector<IITDescriptor, 8> Table;
@@ -1091,11 +1117,22 @@ bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor>
return (!ThisArgType || !ReferenceType ||
ThisArgType->getElementType() != ReferenceType->getElementType());
}
- case IITDescriptor::VecOfPtrsToElt: {
- if (D.getArgumentNumber() >= ArgTys.size())
+ case IITDescriptor::VecOfAnyPtrsToElt: {
+ unsigned RefArgNumber = D.getRefArgNumber();
+
+ // This may only be used when referring to a previous argument.
+ if (RefArgNumber >= ArgTys.size())
return true;
- VectorType * ReferenceType =
- dyn_cast<VectorType> (ArgTys[D.getArgumentNumber()]);
+
+ // Record the overloaded type
+ assert(D.getOverloadArgNumber() == ArgTys.size() &&
+ "Table consistency error");
+ ArgTys.push_back(Ty);
+
+ // Verify the overloaded type "matches" the Ref type, i.e. Ty is a vector
+ // with the same width as Ref, composed of pointers to the same element
+ // type as Ref.
+ VectorType *ReferenceType = dyn_cast<VectorType>(ArgTys[RefArgNumber]);
VectorType *ThisArgVecTy = dyn_cast<VectorType>(Ty);
if (!ThisArgVecTy || !ReferenceType ||
(ReferenceType->getVectorNumElements() !=
@@ -1279,9 +1316,10 @@ void Function::setValueSubclassDataBit(unsigned Bit, bool On) {
setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit));
}
-void Function::setEntryCount(uint64_t Count) {
+void Function::setEntryCount(uint64_t Count,
+ const DenseSet<GlobalValue::GUID> *S) {
MDBuilder MDB(getContext());
- setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count));
+ setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count, S));
}
Optional<uint64_t> Function::getEntryCount() const {
@@ -1298,6 +1336,18 @@ Optional<uint64_t> Function::getEntryCount() const {
return None;
}
+DenseSet<GlobalValue::GUID> Function::getImportGUIDs() const {
+ DenseSet<GlobalValue::GUID> R;
+ if (MDNode *MD = getMetadata(LLVMContext::MD_prof))
+ if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0)))
+ if (MDS->getString().equals("function_entry_count"))
+ for (unsigned i = 2; i < MD->getNumOperands(); i++)
+ R.insert(mdconst::extract<ConstantInt>(MD->getOperand(i))
+ ->getValue()
+ .getZExtValue());
+ return R;
+}
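
Shape of the !prof metadata that setEntryCount() writes and getImportGUIDs() reads back, with invented values:

    // define void @f() !prof !0 { ... }
    // !0 = !{!"function_entry_count", i64 2500, i64 6699318081062747564}
    // Operand 1 is the count; operands 2..N are imported-function GUIDs,
    // which getImportGUIDs() gathers into the returned DenseSet.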
+
void Function::setSectionPrefix(StringRef Prefix) {
MDBuilder MDB(getContext());
setMetadata(LLVMContext::MD_section_prefix,
diff --git a/contrib/llvm/lib/IR/GCOV.cpp b/contrib/llvm/lib/IR/GCOV.cpp
index 3bbcf78..d4b4552 100644
--- a/contrib/llvm/lib/IR/GCOV.cpp
+++ b/contrib/llvm/lib/IR/GCOV.cpp
@@ -103,11 +103,17 @@ bool GCOVFile::readGCDA(GCOVBuffer &Buffer) {
return true;
}
+void GCOVFile::print(raw_ostream &OS) const {
+ for (const auto &FPtr : Functions)
+ FPtr->print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Dump GCOVFile content to dbgs() for debugging purposes.
LLVM_DUMP_METHOD void GCOVFile::dump() const {
- for (const auto &FPtr : Functions)
- FPtr->dump();
+ print(dbgs());
}
+#endif
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
@@ -343,13 +349,19 @@ uint64_t GCOVFunction::getExitCount() const {
return Blocks.back()->getCount();
}
+void GCOVFunction::print(raw_ostream &OS) const {
+ OS << "===== " << Name << " (" << Ident << ") @ " << Filename << ":"
+ << LineNumber << "\n";
+ for (const auto &Block : Blocks)
+ Block->print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Dump GCOVFunction content to dbgs() for debugging purposes.
LLVM_DUMP_METHOD void GCOVFunction::dump() const {
- dbgs() << "===== " << Name << " (" << Ident << ") @ " << Filename << ":"
- << LineNumber << "\n";
- for (const auto &Block : Blocks)
- Block->dump();
+ print(dbgs());
}
+#endif
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
@@ -400,29 +412,35 @@ void GCOVBlock::collectLineCounts(FileInfo &FI) {
FI.addBlockLine(Parent.getFilename(), N, this);
}
-/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
-LLVM_DUMP_METHOD void GCOVBlock::dump() const {
- dbgs() << "Block : " << Number << " Counter : " << Counter << "\n";
+void GCOVBlock::print(raw_ostream &OS) const {
+ OS << "Block : " << Number << " Counter : " << Counter << "\n";
if (!SrcEdges.empty()) {
- dbgs() << "\tSource Edges : ";
+ OS << "\tSource Edges : ";
for (const GCOVEdge *Edge : SrcEdges)
- dbgs() << Edge->Src.Number << " (" << Edge->Count << "), ";
- dbgs() << "\n";
+ OS << Edge->Src.Number << " (" << Edge->Count << "), ";
+ OS << "\n";
}
if (!DstEdges.empty()) {
- dbgs() << "\tDestination Edges : ";
+ OS << "\tDestination Edges : ";
for (const GCOVEdge *Edge : DstEdges)
- dbgs() << Edge->Dst.Number << " (" << Edge->Count << "), ";
- dbgs() << "\n";
+ OS << Edge->Dst.Number << " (" << Edge->Count << "), ";
+ OS << "\n";
}
if (!Lines.empty()) {
- dbgs() << "\tLines : ";
+ OS << "\tLines : ";
for (uint32_t N : Lines)
- dbgs() << (N) << ",";
- dbgs() << "\n";
+ OS << (N) << ",";
+ OS << "\n";
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
+LLVM_DUMP_METHOD void GCOVBlock::dump() const {
+ print(dbgs());
+}
+#endif
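
The GCOV classes now follow the usual LLVM print/dump split; the pattern in general form (T is any of the classes above):

    void T::print(raw_ostream &OS) const { /* all formatting lives here */ }
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    LLVM_DUMP_METHOD void T::dump() const { print(dbgs()); } // debugger hook only
    #endif

so dump() disappears from release builds while print() stays available to clients.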
+
//===----------------------------------------------------------------------===//
// FileInfo implementation.
@@ -571,8 +589,12 @@ FileInfo::openCoveragePath(StringRef CoveragePath) {
/// print - Print source files with collected line count information.
void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename,
StringRef GCNOFile, StringRef GCDAFile) {
- for (const auto &LI : LineInfo) {
- StringRef Filename = LI.first();
+ SmallVector<StringRef, 4> Filenames;
+ for (const auto &LI : LineInfo)
+ Filenames.push_back(LI.first());
+ std::sort(Filenames.begin(), Filenames.end());
+
+ for (StringRef Filename : Filenames) {
auto AllLines = LineConsumer(Filename);
std::string CoveragePath = getCoveragePath(Filename, MainFilename);
@@ -585,7 +607,7 @@ void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename,
CovOS << " -: 0:Runs:" << RunCount << "\n";
CovOS << " -: 0:Programs:" << ProgramCount << "\n";
- const LineData &Line = LI.second;
+ const LineData &Line = LineInfo[Filename];
GCOVCoverage FileCoverage(Filename);
for (uint32_t LineIndex = 0; LineIndex < Line.LastLine || !AllLines.empty();
++LineIndex) {
diff --git a/contrib/llvm/lib/IR/Globals.cpp b/contrib/llvm/lib/IR/Globals.cpp
index 6f73565..afd4a36 100644
--- a/contrib/llvm/lib/IR/Globals.cpp
+++ b/contrib/llvm/lib/IR/Globals.cpp
@@ -12,10 +12,11 @@
//
//===----------------------------------------------------------------------===//
+#include "LLVMContextImpl.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/IR/Constants.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
@@ -24,7 +25,6 @@
#include "llvm/IR/Operator.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
-#include "LLVMContextImpl.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -69,6 +69,30 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
setDLLStorageClass(Src->getDLLStorageClass());
}
+void GlobalValue::removeFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->removeFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
+
+void GlobalValue::eraseFromParent() {
+ switch (getValueID()) {
+#define HANDLE_GLOBAL_VALUE(NAME) \
+ case Value::NAME##Val: \
+ return static_cast<NAME *>(this)->eraseFromParent();
+#include "llvm/IR/Value.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a global");
+}
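
Value.def supplies one HANDLE_GLOBAL_VALUE(NAME) entry per concrete subclass, so each switch above expands roughly to:

    case Value::FunctionVal:
      return static_cast<Function *>(this)->eraseFromParent();
    case Value::GlobalVariableVal:
      return static_cast<GlobalVariable *>(this)->eraseFromParent();
    case Value::GlobalAliasVal:
      return static_cast<GlobalAlias *>(this)->eraseFromParent();
    // ... one case per global kind ...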
+
unsigned GlobalValue::getAlignment() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
@@ -93,24 +117,10 @@ void GlobalObject::setAlignment(unsigned Align) {
assert(getAlignment() == Align && "Alignment representation error!");
}
-unsigned GlobalObject::getGlobalObjectSubClassData() const {
- unsigned ValueData = getGlobalValueSubClassData();
- return ValueData >> GlobalObjectBits;
-}
-
-void GlobalObject::setGlobalObjectSubClassData(unsigned Val) {
- unsigned OldData = getGlobalValueSubClassData();
- setGlobalValueSubClassData((OldData & GlobalObjectMask) |
- (Val << GlobalObjectBits));
- assert(getGlobalObjectSubClassData() == Val && "representation error");
-}
-
-void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalObject::copyAttributesFrom(const GlobalObject *Src) {
GlobalValue::copyAttributesFrom(Src);
- if (const auto *GV = dyn_cast<GlobalObject>(Src)) {
- setAlignment(GV->getAlignment());
- setSection(GV->getSection());
- }
+ setAlignment(Src->getAlignment());
+ setSection(Src->getSection());
}
std::string GlobalValue::getGlobalIdentifier(StringRef Name,
@@ -152,7 +162,7 @@ StringRef GlobalValue::getSection() const {
return cast<GlobalObject>(this)->getSection();
}
-Comdat *GlobalValue::getComdat() {
+const Comdat *GlobalValue::getComdat() const {
if (auto *GA = dyn_cast<GlobalAlias>(this)) {
// In general we cannot compute this at the IR level, but we try.
if (const GlobalObject *GO = GA->getBaseObject())
@@ -177,7 +187,9 @@ void GlobalObject::setSection(StringRef S) {
// Get or create a stable section name string and put it in the table in the
// context.
- S = getContext().pImpl->SectionStrings.insert(S).first->first();
+ if (!S.empty()) {
+ S = getContext().pImpl->SectionStrings.insert(S).first->first();
+ }
getContext().pImpl->GlobalObjectSections[this] = S;
// Update the HasSectionHashEntryBit. Setting the section to the empty string
@@ -240,10 +252,10 @@ bool GlobalValue::canIncreaseAlignment() const {
return true;
}
-GlobalObject *GlobalValue::getBaseObject() {
+const GlobalObject *GlobalValue::getBaseObject() const {
if (auto *GO = dyn_cast<GlobalObject>(this))
return GO;
- if (auto *GA = dyn_cast<GlobalAlias>(this))
+ if (auto *GA = dyn_cast<GlobalIndirectSymbol>(this))
return GA->getBaseObject();
return nullptr;
}
@@ -281,6 +293,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
InitVal != nullptr, Link, Name, AddressSpace),
isConstantGlobal(constant),
isExternallyInitializedConstant(isExternallyInitialized) {
+ assert(!Ty->isFunctionTy() && PointerType::isValidElementType(Ty) &&
+ "invalid type for global variable");
setThreadLocalMode(TLMode);
if (InitVal) {
assert(InitVal->getType() == Ty &&
@@ -299,6 +313,8 @@ GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
InitVal != nullptr, Link, Name, AddressSpace),
isConstantGlobal(constant),
isExternallyInitializedConstant(isExternallyInitialized) {
+ assert(!Ty->isFunctionTy() && PointerType::isValidElementType(Ty) &&
+ "invalid type for global variable");
setThreadLocalMode(TLMode);
if (InitVal) {
assert(InitVal->getType() == Ty &&
@@ -343,12 +359,11 @@ void GlobalVariable::setInitializer(Constant *InitVal) {
/// Copy all additional attributes (those not needed to create a GlobalVariable)
/// from the GlobalVariable Src to this one.
-void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
+void GlobalVariable::copyAttributesFrom(const GlobalVariable *Src) {
GlobalObject::copyAttributesFrom(Src);
- if (const GlobalVariable *SrcVar = dyn_cast<GlobalVariable>(Src)) {
- setThreadLocalMode(SrcVar->getThreadLocalMode());
- setExternallyInitialized(SrcVar->isExternallyInitialized());
- }
+ setThreadLocalMode(Src->getThreadLocalMode());
+ setExternallyInitialized(Src->isExternallyInitialized());
+ setAttributes(Src->getAttributes());
}
void GlobalVariable::dropAllReferences() {
diff --git a/contrib/llvm/lib/IR/IRBuilder.cpp b/contrib/llvm/lib/IR/IRBuilder.cpp
index d3e410d..b7fa07c 100644
--- a/contrib/llvm/lib/IR/IRBuilder.cpp
+++ b/contrib/llvm/lib/IR/IRBuilder.cpp
@@ -12,9 +12,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Statepoint.h"
@@ -134,6 +134,37 @@ CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
return CI;
}
+CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(
+ Value *Dst, Value *Src, Value *Size, uint32_t ElementSize, MDNode *TBAATag,
+ MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) {
+ Dst = getCastedInt8PtrValue(Dst);
+ Src = getCastedInt8PtrValue(Src);
+
+ Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)};
+ Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
+ Module *M = BB->getParent()->getParent();
+ Value *TheFn = Intrinsic::getDeclaration(
+ M, Intrinsic::memcpy_element_unordered_atomic, Tys);
+
+ CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+ // Set the TBAA info if present.
+ if (TBAATag)
+ CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+ // Set the TBAA Struct info if present.
+ if (TBAAStructTag)
+ CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
+
+ if (ScopeTag)
+ CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+ if (NoAliasTag)
+ CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
+ return CI;
+}
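
A usage sketch for the new builder entry point (B, InsertPt, Dst, Src, and Len are hypothetical; Size counts bytes and must be a multiple of ElementSize):

    IRBuilder<> B(InsertPt);
    B.CreateElementUnorderedAtomicMemCpy(Dst, Src, B.getInt64(Len),
                                         /*ElementSize=*/4);
    // emits a call to @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64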
+
CallInst *IRBuilderBase::
CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
@@ -161,6 +192,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
return CI;
}
+static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
+ Value *Src) {
+ Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Src};
+ Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() };
+ auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
+ return createCallHelper(Decl, Ops, Builder);
+}
+
+CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fadd, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fmul, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax
+ : Intrinsic::experimental_vector_reduce_umax;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin
+ : Intrinsic::experimental_vector_reduce_umin;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmax, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
+CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmin, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
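
Usage sketch for the reduction builders, with V a hypothetical <4 x i32> value:

    Value *Sum = Builder.CreateAddReduce(V);
    // roughly: call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %V)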
+
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
@@ -172,7 +291,8 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
"lifetime.start requires the size to be an i64");
Value *Ops[] = { Size, Ptr };
Module *M = BB->getParent()->getParent();
- Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start);
+ Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start,
+ { Ptr->getType() });
return createCallHelper(TheFn, Ops, this);
}
@@ -187,7 +307,8 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
"lifetime.end requires the size to be an i64");
Value *Ops[] = { Size, Ptr };
Module *M = BB->getParent()->getParent();
- Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
+ Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end,
+ { Ptr->getType() });
return createCallHelper(TheFn, Ops, this);
}
@@ -291,11 +412,16 @@ CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
NumElts));
- Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)};
+ if (!PassThru)
+ PassThru = UndefValue::get(DataTy);
+
+ Type *OverloadedTypes[] = {DataTy, PtrsTy};
+ Value * Ops[] = {Ptrs, getInt32(Align), Mask, PassThru};
// We specify the overloaded types when we create this intrinsic. Types of
// the other arguments are derived from these types.
- return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, { DataTy }, Name);
+ return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, OverloadedTypes,
+ Name);
}
/// \brief Create a call to a Masked Scatter intrinsic.
@@ -321,11 +447,13 @@ CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
if (!Mask)
Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
NumElts));
+
+ Type *OverloadedTypes[] = {DataTy, PtrsTy};
Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask};
// We specify the overloaded types when we create this intrinsic. Types of
// the other arguments are derived from these types.
- return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, { DataTy });
+ return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes);
}
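
A gather sketch (Ptrs, Mask, and PassThru are hypothetical; Ptrs has type <4 x i32*>):

    Value *V = Builder.CreateMaskedGather(Ptrs, /*Align=*/4, Mask, PassThru);
    // now mangled on both result and pointer vector, roughly
    // @llvm.masked.gather.v4i32.v4p0i32, so the pointers may live in any
    // address space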
template <typename T0, typename T1, typename T2, typename T3>
@@ -482,3 +610,11 @@ CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint,
getInt32(DerivedOffset)};
return createCallHelper(FnGCRelocate, Args, this, Name);
}
+
+CallInst *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID,
+ Value *LHS, Value *RHS,
+ const Twine &Name) {
+ Module *M = BB->getParent()->getParent();
+ Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() });
+ return createCallHelper(Fn, { LHS, RHS }, this, Name);
+}
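
Sketch: the helper overloads only on the LHS type, so both operands must share it (A and B are hypothetical float values):

    Value *Max = Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, A, B);
    // declares or reuses @llvm.maxnum.f32 when A and B are float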
diff --git a/contrib/llvm/lib/IR/IRPrintingPasses.cpp b/contrib/llvm/lib/IR/IRPrintingPasses.cpp
index 05e206c..955fdc7 100644
--- a/contrib/llvm/lib/IR/IRPrintingPasses.cpp
+++ b/contrib/llvm/lib/IR/IRPrintingPasses.cpp
@@ -70,6 +70,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+
+ StringRef getPassName() const override { return "Print Module IR"; }
};
class PrintFunctionPassWrapper : public FunctionPass {
@@ -91,6 +93,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+
+ StringRef getPassName() const override { return "Print Function IR"; }
};
class PrintBasicBlockPass : public BasicBlockPass {
@@ -111,6 +115,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+
+ StringRef getPassName() const override { return "Print BasicBlock IR"; }
};
}
diff --git a/contrib/llvm/lib/IR/InlineAsm.cpp b/contrib/llvm/lib/IR/InlineAsm.cpp
index 5a91185..ad22efd 100644
--- a/contrib/llvm/lib/IR/InlineAsm.cpp
+++ b/contrib/llvm/lib/IR/InlineAsm.cpp
@@ -1,4 +1,4 @@
-//===-- InlineAsm.cpp - Implement the InlineAsm class ---------------------===//
+//===- InlineAsm.cpp - Implement the InlineAsm class ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,24 +14,19 @@
#include "llvm/IR/InlineAsm.h"
#include "ConstantsContext.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include <algorithm>
+#include <cassert>
#include <cctype>
-using namespace llvm;
-
-// Implement the first virtual method in this class in this file so the
-// InlineAsm vtable is emitted here.
-InlineAsm::~InlineAsm() {
-}
+#include <cstddef>
+#include <cstdlib>
-InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString,
- StringRef Constraints, bool hasSideEffects,
- bool isAlignStack, AsmDialect asmDialect) {
- InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects,
- isAlignStack, asmDialect);
- LLVMContextImpl *pImpl = FTy->getContext().pImpl;
- return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key);
-}
+using namespace llvm;
InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString,
const std::string &constraints, bool hasSideEffects,
@@ -40,12 +35,20 @@ InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString,
AsmString(asmString), Constraints(constraints), FTy(FTy),
HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack),
Dialect(asmDialect) {
-
// Do various checks on the constraint string and type.
assert(Verify(getFunctionType(), constraints) &&
"Function type not legal for constraints!");
}
+InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString,
+ StringRef Constraints, bool hasSideEffects,
+ bool isAlignStack, AsmDialect asmDialect) {
+ InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects,
+ isAlignStack, asmDialect);
+ LLVMContextImpl *pImpl = FTy->getContext().pImpl;
+ return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key);
+}
+
void InlineAsm::destroyConstant() {
getType()->getContext().pImpl->InlineAsms.remove(this);
delete this;
@@ -55,14 +58,6 @@ FunctionType *InlineAsm::getFunctionType() const {
return FTy;
}
-///Default constructor.
-InlineAsm::ConstraintInfo::ConstraintInfo() :
- Type(isInput), isEarlyClobber(false),
- MatchingInput(-1), isCommutative(false),
- isIndirect(false), isMultipleAlternative(false),
- currentAlternativeIndex(0) {
-}
-
/// Parse - Analyze the specified string (e.g. "==&{eax}") and fill in the
/// fields in this structure. If the constraint string is not understood,
/// return true, otherwise return false.
diff --git a/contrib/llvm/lib/IR/Instruction.cpp b/contrib/llvm/lib/IR/Instruction.cpp
index 2fa0348..365cb01 100644
--- a/contrib/llvm/lib/IR/Instruction.cpp
+++ b/contrib/llvm/lib/IR/Instruction.cpp
@@ -11,11 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
@@ -42,8 +43,6 @@ Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
InsertAtEnd->getInstList().push_back(this);
}
-
-// Out of line virtual method, so the vtable, etc has a home.
Instruction::~Instruction() {
assert(!Parent && "Instruction still linked in the program!");
if (hasMetadataHashEntry())
@@ -59,12 +58,6 @@ const Module *Instruction::getModule() const {
return getParent()->getModule();
}
-Module *Instruction::getModule() {
- return getParent()->getModule();
-}
-
-Function *Instruction::getFunction() { return getParent()->getParent(); }
-
const Function *Instruction::getFunction() const {
return getParent()->getParent();
}
@@ -122,6 +115,29 @@ bool Instruction::hasNoSignedWrap() const {
return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap();
}
+void Instruction::dropPoisonGeneratingFlags() {
+ switch (getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ cast<OverflowingBinaryOperator>(this)->setHasNoUnsignedWrap(false);
+ cast<OverflowingBinaryOperator>(this)->setHasNoSignedWrap(false);
+ break;
+
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ cast<PossiblyExactOperator>(this)->setIsExact(false);
+ break;
+
+ case Instruction::GetElementPtr:
+ cast<GetElementPtrInst>(this)->setIsInBounds(false);
+ break;
+ }
+}
+
bool Instruction::isExact() const {
return cast<PossiblyExactOperator>(this)->isExact();
}
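The new dropPoisonGeneratingFlags clears every flag whose violation yields poison (nuw/nsw on add/sub/mul/shl, exact on the divisions and right shifts, inbounds on GEP), which is what a transform needs before speculating an instruction into a context where those guarantees may no longer hold. A hedged sketch, with I and InsertPt assumed valid:

    // Sketch: hoist I above its guard; the poison-generating flags may
    // not hold at the new location, so drop them before moving.
    void hoistAbove(llvm::Instruction *I, llvm::Instruction *InsertPt) {
      I->dropPoisonGeneratingFlags();
      I->moveBefore(InsertPt);
    }
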
@@ -186,6 +202,11 @@ bool Instruction::hasAllowReciprocal() const {
return cast<FPMathOperator>(this)->hasAllowReciprocal();
}
+bool Instruction::hasAllowContract() const {
+ assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
+ return cast<FPMathOperator>(this)->hasAllowContract();
+}
+
FastMathFlags Instruction::getFastMathFlags() const {
assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
return cast<FPMathOperator>(this)->getFastMathFlags();
@@ -195,10 +216,10 @@ void Instruction::copyFastMathFlags(const Instruction *I) {
copyFastMathFlags(I->getFastMathFlags());
}
-void Instruction::copyIRFlags(const Value *V) {
+void Instruction::copyIRFlags(const Value *V, bool IncludeWrapFlags) {
// Copy the wrapping flags.
- if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
- if (isa<OverflowingBinaryOperator>(this)) {
+ if (IncludeWrapFlags && isa<OverflowingBinaryOperator>(this)) {
+ if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
setHasNoSignedWrap(OB->hasNoSignedWrap());
setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
}
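The new IncludeWrapFlags parameter lets a caller copy exact, inbounds, and fast-math flags from a source value while deliberately discarding nsw/nuw, e.g. after reassociating operands. Hedged sketch, with NewI and Src assumed in scope:

    // Sketch: propagate flags but not the wrapping flags, which the
    // reassociation done by this (hypothetical) pass invalidates.
    NewI->copyIRFlags(Src, /*IncludeWrapFlags=*/false);
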
@@ -341,13 +362,13 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2,
(LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() ||
IgnoreAlignment) &&
LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() &&
- LI->getSynchScope() == cast<LoadInst>(I2)->getSynchScope();
+ LI->getSyncScopeID() == cast<LoadInst>(I2)->getSyncScopeID();
if (const StoreInst *SI = dyn_cast<StoreInst>(I1))
return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() &&
(SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() ||
IgnoreAlignment) &&
SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() &&
- SI->getSynchScope() == cast<StoreInst>(I2)->getSynchScope();
+ SI->getSyncScopeID() == cast<StoreInst>(I2)->getSyncScopeID();
if (const CmpInst *CI = dyn_cast<CmpInst>(I1))
return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate();
if (const CallInst *CI = dyn_cast<CallInst>(I1))
@@ -365,7 +386,7 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2,
return EVI->getIndices() == cast<ExtractValueInst>(I2)->getIndices();
if (const FenceInst *FI = dyn_cast<FenceInst>(I1))
return FI->getOrdering() == cast<FenceInst>(I2)->getOrdering() &&
- FI->getSynchScope() == cast<FenceInst>(I2)->getSynchScope();
+ FI->getSyncScopeID() == cast<FenceInst>(I2)->getSyncScopeID();
if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I1))
return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I2)->isVolatile() &&
CXI->isWeak() == cast<AtomicCmpXchgInst>(I2)->isWeak() &&
@@ -373,12 +394,13 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2,
cast<AtomicCmpXchgInst>(I2)->getSuccessOrdering() &&
CXI->getFailureOrdering() ==
cast<AtomicCmpXchgInst>(I2)->getFailureOrdering() &&
- CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I2)->getSynchScope();
+ CXI->getSyncScopeID() ==
+ cast<AtomicCmpXchgInst>(I2)->getSyncScopeID();
if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I1))
return RMWI->getOperation() == cast<AtomicRMWInst>(I2)->getOperation() &&
RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() &&
RMWI->getOrdering() == cast<AtomicRMWInst>(I2)->getOrdering() &&
- RMWI->getSynchScope() == cast<AtomicRMWInst>(I2)->getSynchScope();
+ RMWI->getSyncScopeID() == cast<AtomicRMWInst>(I2)->getSyncScopeID();
return true;
}
@@ -511,6 +533,30 @@ bool Instruction::isAtomic() const {
}
}
+bool Instruction::hasAtomicLoad() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Load:
+ return true;
+ }
+}
+
+bool Instruction::hasAtomicStore() const {
+ assert(isAtomic());
+ switch (getOpcode()) {
+ default:
+ return false;
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ case Instruction::Store:
+ return true;
+ }
+}
+
bool Instruction::mayThrow() const {
if (const CallInst *CI = dyn_cast<CallInst>(this))
return !CI->doesNotThrow();
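hasAtomicLoad and hasAtomicStore split isAtomic() by direction; cmpxchg and atomicrmw report true for both, and each helper asserts isAtomic() first. A hedged guard a pass might write:

    // Sketch: conservative check for a hypothetical store-motion pass;
    // the isAtomic() test must come first (assertion in the helper).
    bool blocksStoreMotion(const llvm::Instruction &I) {
      return I.isAtomic() && I.hasAtomicStore();
    }
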
@@ -521,17 +567,6 @@ bool Instruction::mayThrow() const {
return isa<ResumeInst>(this);
}
-/// Return true if the instruction is associative:
-///
-/// Associative operators satisfy: x op (y op z) === (x op y) op z
-///
-/// In LLVM, the Add, Mul, And, Or, and Xor operators are associative.
-///
-bool Instruction::isAssociative(unsigned Opcode) {
- return Opcode == And || Opcode == Or || Opcode == Xor ||
- Opcode == Add || Opcode == Mul;
-}
-
bool Instruction::isAssociative() const {
unsigned Opcode = getOpcode();
if (isAssociative(Opcode))
@@ -546,51 +581,6 @@ bool Instruction::isAssociative() const {
}
}
-/// Return true if the instruction is commutative:
-///
-/// Commutative operators satisfy: (x op y) === (y op x)
-///
-/// In LLVM, these are the associative operators, plus SetEQ and SetNE, when
-/// applied to any type.
-///
-bool Instruction::isCommutative(unsigned op) {
- switch (op) {
- case Add:
- case FAdd:
- case Mul:
- case FMul:
- case And:
- case Or:
- case Xor:
- return true;
- default:
- return false;
- }
-}
-
-/// Return true if the instruction is idempotent:
-///
-/// Idempotent operators satisfy: x op x === x
-///
-/// In LLVM, the And and Or operators are idempotent.
-///
-bool Instruction::isIdempotent(unsigned Opcode) {
- return Opcode == And || Opcode == Or;
-}
-
-/// Return true if the instruction is nilpotent:
-///
-/// Nilpotent operators satisfy: x op x === Id,
-///
-/// where Id is the identity for the operator, i.e. a constant such that
-/// x op Id === x and Id op x === x for all x.
-///
-/// In LLVM, the Xor operator is nilpotent.
-///
-bool Instruction::isNilpotent(unsigned Opcode) {
- return Opcode == Xor;
-}
-
Instruction *Instruction::cloneImpl() const {
llvm_unreachable("Subclass of Instruction failed to implement cloneImpl");
}
@@ -651,3 +641,55 @@ Instruction *Instruction::clone() const {
New->copyMetadata(*this);
return New;
}
+
+void Instruction::updateProfWeight(uint64_t S, uint64_t T) {
+ auto *ProfileData = getMetadata(LLVMContext::MD_prof);
+ if (ProfileData == nullptr)
+ return;
+
+ auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
+ if (!ProfDataName || (!ProfDataName->getString().equals("branch_weights") &&
+ !ProfDataName->getString().equals("VP")))
+ return;
+
+ MDBuilder MDB(getContext());
+ SmallVector<Metadata *, 3> Vals;
+ Vals.push_back(ProfileData->getOperand(0));
+ APInt APS(128, S), APT(128, T);
+ if (ProfDataName->getString().equals("branch_weights"))
+ for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
+ // Using APInt::div may be expensive, but most cases should fit 64 bits.
+ APInt Val(128,
+ mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i))
+ ->getValue()
+ .getZExtValue());
+ Val *= APS;
+ Vals.push_back(MDB.createConstant(
+ ConstantInt::get(Type::getInt64Ty(getContext()),
+ Val.udiv(APT).getLimitedValue())));
+ }
+ else if (ProfDataName->getString().equals("VP"))
+ for (unsigned i = 1; i < ProfileData->getNumOperands(); i += 2) {
+ // The first value is the key of the value profile, which will not change.
+ Vals.push_back(ProfileData->getOperand(i));
+ // Using APInt::div may be expensive, but most cases should fit 64 bits.
+ APInt Val(128,
+ mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i + 1))
+ ->getValue()
+ .getZExtValue());
+ Val *= APS;
+ Vals.push_back(MDB.createConstant(
+ ConstantInt::get(Type::getInt64Ty(getContext()),
+ Val.udiv(APT).getLimitedValue())));
+ }
+ setMetadata(LLVMContext::MD_prof, MDNode::get(getContext(), Vals));
+}
+
+void Instruction::setProfWeight(uint64_t W) {
+ assert((isa<CallInst>(this) || isa<InvokeInst>(this)) &&
+         "Can only set weights for call and invoke instructions");
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(W);
+ MDBuilder MDB(getContext());
+ setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
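updateProfWeight rescales each branch_weights or VP operand by S/T, widening to 128-bit APInt so the intermediate product cannot overflow, then clamping back to 64 bits. The arithmetic in isolation, as a hedged standalone sketch:

    #include "llvm/ADT/APInt.h"
    #include <cstdint>

    // Sketch: scale one profile weight W by S/T exactly as the loops
    // above do, one weight at a time.
    uint64_t scaleWeight(uint64_t W, uint64_t S, uint64_t T) {
      llvm::APInt Val(128, W);
      Val *= llvm::APInt(128, S);           // 128-bit product, no overflow
      return Val.udiv(llvm::APInt(128, T)).getLimitedValue();
    }
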
diff --git a/contrib/llvm/lib/IR/Instructions.cpp b/contrib/llvm/lib/IR/Instructions.cpp
index b679269..2c49564 100644
--- a/contrib/llvm/lib/IR/Instructions.cpp
+++ b/contrib/llvm/lib/IR/Instructions.cpp
@@ -1,4 +1,4 @@
-//===-- Instructions.cpp - Implement the LLVM instructions ----------------===//
+//===- Instructions.cpp - Implement the LLVM instructions -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,16 +14,34 @@
#include "llvm/IR/Instructions.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -41,16 +59,40 @@ User::op_iterator CallSite::getCallee() const {
// TerminatorInst Class
//===----------------------------------------------------------------------===//
-// Out of line virtual method, so the vtable, etc has a home.
-TerminatorInst::~TerminatorInst() {
+unsigned TerminatorInst::getNumSuccessors() const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getNumSuccessors();
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
}
-//===----------------------------------------------------------------------===//
-// UnaryInstruction Class
-//===----------------------------------------------------------------------===//
+BasicBlock *TerminatorInst::getSuccessor(unsigned idx) const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->getSuccessor(idx);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
-// Out of line virtual method, so the vtable, etc has a home.
-UnaryInstruction::~UnaryInstruction() {
+void TerminatorInst::setSuccessor(unsigned idx, BasicBlock *B) {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<CLASS *>(this)->setSuccessor(idx, B);
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
}
//===----------------------------------------------------------------------===//
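These three methods replace the per-subclass getSuccessorV/getNumSuccessorsV/setSuccessorV virtuals with a switch generated from Instruction.def: each HANDLE_TERM_INST entry expands to one case that static_casts to the concrete terminator class. The X-macro pattern itself, as a self-contained stand-in (not the real Instruction.def; the opcode list and counts here are invented for illustration):

    #include <cassert>

    // Each X(OPC, N) line becomes one enumerator and one switch case.
    #define DEMO_TERM_INSTS(X) X(Br, 2) X(Ret, 0) X(CatchSwitch, 3)

    enum Opcode {
    #define AS_ENUM(OPC, N) OPC,
      DEMO_TERM_INSTS(AS_ENUM)
    #undef AS_ENUM
    };

    unsigned numSuccessors(Opcode Op) {
      switch (Op) {
    #define AS_CASE(OPC, N) case OPC: return N;
        DEMO_TERM_INSTS(AS_CASE)
    #undef AS_CASE
      }
      return 0;
    }

    int main() { assert(numSuccessors(Br) == 2); }
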
@@ -82,13 +124,10 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
return nullptr;
}
-
//===----------------------------------------------------------------------===//
// PHINode Class
//===----------------------------------------------------------------------===//
-void PHINode::anchor() {}
-
PHINode::PHINode(const PHINode &PN)
: Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()),
ReservedSpace(PN.getNumOperands()) {
@@ -242,9 +281,6 @@ void LandingPadInst::addClause(Constant *Val) {
// CallInst Implementation
//===----------------------------------------------------------------------===//
-CallInst::~CallInst() {
-}
-
void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr) {
this->FTy = FTy;
@@ -307,7 +343,7 @@ CallInst::CallInst(const CallInst &CI)
: Instruction(CI.getType(), Instruction::Call,
OperandTraits<CallInst>::op_end(this) - CI.getNumOperands(),
CI.getNumOperands()),
- AttributeList(CI.AttributeList), FTy(CI.FTy) {
+ Attrs(CI.Attrs), FTy(CI.FTy) {
setTailCallKind(CI.getTailCallKind());
setCallingConv(CI.getCallingConv());
@@ -334,59 +370,97 @@ CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB,
Value *CallInst::getReturnedArgOperand() const {
unsigned Index;
- if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
- return getArgOperand(Index-1);
+ if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ return getArgOperand(Index - AttributeList::FirstArgIndex);
if (const Function *F = getCalledFunction())
if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
Index)
- return getArgOperand(Index-1);
-
+ return getArgOperand(Index - AttributeList::FirstArgIndex);
+
return nullptr;
}
void CallInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void CallInst::addAttribute(unsigned i, Attribute Attr) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Attr);
setAttributes(PAL);
}
+void CallInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
+void CallInst::addParamAttr(unsigned ArgNo, Attribute Attr) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttribute(getContext(), ArgNo, Attr);
+ setAttributes(PAL);
+}
+
void CallInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void CallInst::removeAttribute(unsigned i, StringRef Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
+void CallInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
+void CallInst::removeParamAttr(unsigned ArgNo, StringRef Kind) {
+ assert(ArgNo < getNumArgOperands() && "Out of bounds");
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
void CallInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
+bool CallInst::hasRetAttr(Attribute::AttrKind Kind) const {
+ if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
+ return true;
+
+ // Look at the callee, if available.
+ if (const Function *F = getCalledFunction())
+ return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
+ return false;
+}
+
bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
- assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
+ assert(i < getNumArgOperands() && "Param index out of bounds!");
- if (AttributeList.hasAttribute(i, Kind))
+ if (Attrs.hasParamAttribute(i, Kind))
return true;
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(i, Kind);
+ return F->getAttributes().hasParamAttribute(i, Kind);
return false;
}
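Alongside the AttributeSet-to-AttributeList rename, the new addParamAttr/removeParamAttr helpers take a zero-based argument number and do the FirstArgIndex translation internally, removing a classic off-by-one (index 0 is the return value in the raw attribute index space). Hedged sketch, CI assumed to be a CallInst with at least one argument:

    // Sketch: ArgNo is zero-based; no manual '+ 1' adjustment needed.
    CI->addParamAttr(0, llvm::Attribute::NonNull);
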
@@ -400,8 +474,13 @@ bool CallInst::dataOperandHasImpliedAttr(unsigned i,
// question is a call argument; or be indirectly implied by the kind of its
// containing operand bundle, if the operand is a bundle operand.
+ if (i == AttributeList::ReturnIndex)
+ return hasRetAttr(Kind);
+
+ // FIXME: Avoid these i - 1 calculations and update the API to use zero-based
+ // indices.
if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i, Kind);
+ return paramHasAttr(i - 1, Kind);
assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
"Must be either a call argument or an operand bundle!");
@@ -466,7 +545,7 @@ static Instruction *createMalloc(Instruction *InsertBefore,
Value *MallocFunc = MallocF;
if (!MallocFunc)
// prototype malloc as "void *malloc(size_t)"
- MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, nullptr);
+ MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy);
PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
CallInst *MCall = nullptr;
Instruction *Result = nullptr;
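getOrInsertFunction dropped its C-varargs form, so the nullptr sentinel that used to terminate the parameter-type list goes away; the argument list is now just the return type followed by the parameter types. Before/after, hedged (M, BPTy, and IntPtrTy as in the surrounding code):

    // Old: M->getOrInsertFunction("malloc", BPTy, IntPtrTy, nullptr);
    // New: no trailing sentinel.
    llvm::Constant *Malloc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy);
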
@@ -489,7 +568,8 @@ static Instruction *createMalloc(Instruction *InsertBefore,
MCall->setTailCall();
if (Function *F = dyn_cast<Function>(MallocFunc)) {
MCall->setCallingConv(F->getCallingConv());
- if (!F->doesNotAlias(0)) F->setDoesNotAlias(0);
+ if (!F->returnDoesNotAlias())
+ F->setReturnDoesNotAlias();
}
assert(!MCall->getType()->isVoidTy() && "Malloc has void return type");
@@ -520,7 +600,6 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
ArraySize, OpB, MallocF, Name);
}
-
/// CreateMalloc - Generate the IR for a call to malloc:
/// 1. Compute the malloc call's argument as the specified type's size,
/// possibly multiplied by the array size if the array size is not
@@ -560,7 +639,7 @@ static Instruction *createFree(Value *Source,
Type *VoidTy = Type::getVoidTy(M->getContext());
Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
// prototype free as "void free(void*)"
- Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, nullptr);
+ Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy);
CallInst *Result = nullptr;
Value *PtrCast = Source;
if (InsertBefore) {
@@ -646,7 +725,7 @@ InvokeInst::InvokeInst(const InvokeInst &II)
OperandTraits<InvokeInst>::op_end(this) -
II.getNumOperands(),
II.getNumOperands()),
- AttributeList(II.AttributeList), FTy(II.FTy) {
+ Attrs(II.Attrs), FTy(II.FTy) {
setCallingConv(II.getCallingConv());
std::copy(II.op_begin(), II.op_end(), op_begin());
std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(),
@@ -668,36 +747,36 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
return NewII;
}
-BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const {
- return getSuccessor(idx);
-}
-unsigned InvokeInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
- return setSuccessor(idx, B);
-}
-
Value *InvokeInst::getReturnedArgOperand() const {
unsigned Index;
- if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
- return getArgOperand(Index-1);
+ if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+ return getArgOperand(Index - AttributeList::FirstArgIndex);
if (const Function *F = getCalledFunction())
if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
Index)
- return getArgOperand(Index-1);
-
+ return getArgOperand(Index - AttributeList::FirstArgIndex);
+
return nullptr;
}
+bool InvokeInst::hasRetAttr(Attribute::AttrKind Kind) const {
+ if (Attrs.hasAttribute(AttributeList::ReturnIndex, Kind))
+ return true;
+
+ // Look at the callee, if available.
+ if (const Function *F = getCalledFunction())
+ return F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Kind);
+ return false;
+}
+
bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
- assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
+ assert(i < getNumArgOperands() && "Param index out of bounds!");
- if (AttributeList.hasAttribute(i, Kind))
+ if (Attrs.hasParamAttribute(i, Kind))
return true;
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(i, Kind);
+ return F->getAttributes().hasParamAttribute(i, Kind);
return false;
}
@@ -711,8 +790,13 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
// question is an invoke argument; or be indirectly implied by the kind of its
// containing operand bundle, if the operand is a bundle operand.
+ if (i == AttributeList::ReturnIndex)
+ return hasRetAttr(Kind);
+
+ // FIXME: Avoid these i - 1 calculations and update the API to use zero-based
+ // indices.
if (i < (getNumArgOperands() + 1))
- return paramHasAttr(i, Kind);
+ return paramHasAttr(i - 1, Kind);
assert(hasOperandBundles() && i >= (getBundleOperandsStartIndex() + 1) &&
"Must be either an invoke argument or an operand bundle!");
@@ -720,37 +804,49 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
}
void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void InvokeInst::addAttribute(unsigned i, Attribute Attr) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addAttribute(getContext(), i, Attr);
setAttributes(PAL);
}
+void InvokeInst::addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.addParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
void InvokeInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
void InvokeInst::removeAttribute(unsigned i, StringRef Kind) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.removeAttribute(getContext(), i, Kind);
setAttributes(PAL);
}
+void InvokeInst::removeParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) {
+ AttributeList PAL = getAttributes();
+ PAL = PAL.removeParamAttribute(getContext(), ArgNo, Kind);
+ setAttributes(PAL);
+}
+
void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
void InvokeInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
- AttributeSet PAL = getAttributes();
+ AttributeList PAL = getAttributes();
PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
setAttributes(PAL);
}
@@ -780,6 +876,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(C), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
@@ -787,28 +884,12 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd)
if (retVal)
Op<0>() = retVal;
}
+
ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
: TerminatorInst(Type::getVoidTy(Context), Instruction::Ret,
OperandTraits<ReturnInst>::op_end(this), 0, InsertAtEnd) {
}
-unsigned ReturnInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-
-/// Out-of-line ReturnInst method, put here so the C++ compiler can choose to
-/// emit the vtable for the class in this translation unit.
-void ReturnInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
- llvm_unreachable("ReturnInst has no successors!");
-}
-
-BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const {
- llvm_unreachable("ReturnInst has no successors!");
-}
-
-ReturnInst::~ReturnInst() {
-}
-
//===----------------------------------------------------------------------===//
// ResumeInst Implementation
//===----------------------------------------------------------------------===//
@@ -831,18 +912,6 @@ ResumeInst::ResumeInst(Value *Exn, BasicBlock *InsertAtEnd)
Op<0>() = Exn;
}
-unsigned ResumeInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-
-void ResumeInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
- llvm_unreachable("ResumeInst has no successors!");
-}
-
-BasicBlock *ResumeInst::getSuccessorV(unsigned idx) const {
- llvm_unreachable("ResumeInst has no successors!");
-}
-
//===----------------------------------------------------------------------===//
// CleanupReturnInst Implementation
//===----------------------------------------------------------------------===//
@@ -885,18 +954,6 @@ CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
init(CleanupPad, UnwindBB);
}
-BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const {
- assert(Idx == 0);
- return getUnwindDest();
-}
-unsigned CleanupReturnInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
- assert(Idx == 0);
- setUnwindDest(B);
-}
-
//===----------------------------------------------------------------------===//
// CatchReturnInst Implementation
//===----------------------------------------------------------------------===//
@@ -928,18 +985,6 @@ CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
init(CatchPad, BB);
}
-BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const {
- assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
- return getSuccessor();
-}
-unsigned CatchReturnInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) {
- assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
- setSuccessor(B);
-}
-
//===----------------------------------------------------------------------===//
// CatchSwitchInst Implementation
//===----------------------------------------------------------------------===//
@@ -1023,16 +1068,6 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) {
setNumHungOffUseOperands(getNumOperands() - 1);
}
-BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const {
- return getSuccessor(idx);
-}
-unsigned CatchSwitchInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
- setSuccessor(idx, B);
-}
-
//===----------------------------------------------------------------------===//
// FuncletPadInst Implementation
//===----------------------------------------------------------------------===//
@@ -1085,18 +1120,6 @@ UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd)
nullptr, 0, InsertAtEnd) {
}
-unsigned UnreachableInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-
-void UnreachableInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
- llvm_unreachable("UnreachableInst has no successors!");
-}
-
-BasicBlock *UnreachableInst::getSuccessorV(unsigned idx) const {
- llvm_unreachable("UnreachableInst has no successors!");
-}
-
//===----------------------------------------------------------------------===//
// BranchInst Implementation
//===----------------------------------------------------------------------===//
@@ -1114,6 +1137,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore)
assert(IfTrue && "Branch destination may not be null!");
Op<-1>() = IfTrue;
}
+
BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
Instruction *InsertBefore)
: TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
@@ -1148,7 +1172,6 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
#endif
}
-
BranchInst::BranchInst(const BranchInst &BI) :
TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br,
OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
@@ -1172,17 +1195,6 @@ void BranchInst::swapSuccessors() {
swapProfMetadata();
}
-BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
- return getSuccessor(idx);
-}
-unsigned BranchInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
- setSuccessor(idx, B);
-}
-
-
//===----------------------------------------------------------------------===//
// AllocaInst Implementation
//===----------------------------------------------------------------------===//
@@ -1199,44 +1211,44 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
return Amt;
}
-AllocaInst::AllocaInst(Type *Ty, const Twine &Name, Instruction *InsertBefore)
- : AllocaInst(Ty, /*ArraySize=*/nullptr, Name, InsertBefore) {}
-
-AllocaInst::AllocaInst(Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
- : AllocaInst(Ty, /*ArraySize=*/nullptr, Name, InsertAtEnd) {}
-
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
Instruction *InsertBefore)
- : AllocaInst(Ty, ArraySize, /*Align=*/0, Name, InsertBefore) {}
+ : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertBefore) {}
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
BasicBlock *InsertAtEnd)
- : AllocaInst(Ty, ArraySize, /*Align=*/0, Name, InsertAtEnd) {}
+ : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertAtEnd) {}
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
const Twine &Name, Instruction *InsertBefore)
- : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
- getAISize(Ty->getContext(), ArraySize), InsertBefore),
- AllocatedType(Ty) {
+ : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/0, Name, InsertBefore) {}
+
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+ const Twine &Name, BasicBlock *InsertAtEnd)
+ : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/0, Name, InsertAtEnd) {}
+
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+ unsigned Align, const Twine &Name,
+ Instruction *InsertBefore)
+ : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
+ getAISize(Ty->getContext(), ArraySize), InsertBefore),
+ AllocatedType(Ty) {
setAlignment(Align);
assert(!Ty->isVoidTy() && "Cannot allocate void!");
setName(Name);
}
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
- const Twine &Name, BasicBlock *InsertAtEnd)
- : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
- getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+ unsigned Align, const Twine &Name,
+ BasicBlock *InsertAtEnd)
+ : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
+ getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
AllocatedType(Ty) {
setAlignment(Align);
assert(!Ty->isVoidTy() && "Cannot allocate void!");
setName(Name);
}
-// Out of line virtual method, so the vtable, etc has a home.
-AllocaInst::~AllocaInst() {
-}
-
void AllocaInst::setAlignment(unsigned Align) {
assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
assert(Align <= MaximumAlignment &&
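Every AllocaInst constructor now takes an explicit address space instead of hard-coding addrspace(0), so targets whose stack is not in the default space (AMDGPU, for instance) get correctly typed allocas. A minimal sketch, assuming Ctx, DL, and an insertion point InsertPt are in scope:

    // Sketch: ask the DataLayout which address space allocas live in
    // rather than hard-coding a number.
    unsigned AS = DL.getAllocaAddrSpace();
    auto *Slot = new llvm::AllocaInst(llvm::Type::getInt32Ty(Ctx), AS,
                                      /*ArraySize=*/nullptr, "slot", InsertPt);
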
@@ -1292,34 +1304,34 @@ LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
unsigned Align, Instruction *InsertBef)
: LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic,
- CrossThread, InsertBef) {}
+ SyncScope::System, InsertBef) {}
LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
unsigned Align, BasicBlock *InsertAE)
: LoadInst(Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic,
- CrossThread, InsertAE) {}
+ SyncScope::System, InsertAE) {}
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
unsigned Align, AtomicOrdering Order,
- SynchronizationScope SynchScope, Instruction *InsertBef)
+ SyncScope::ID SSID, Instruction *InsertBef)
: UnaryInstruction(Ty, Load, Ptr, InsertBef) {
assert(Ty == cast<PointerType>(Ptr->getType())->getElementType());
setVolatile(isVolatile);
setAlignment(Align);
- setAtomic(Order, SynchScope);
+ setAtomic(Order, SSID);
AssertOK();
setName(Name);
}
LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
unsigned Align, AtomicOrdering Order,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
BasicBlock *InsertAE)
: UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
Load, Ptr, InsertAE) {
setVolatile(isVolatile);
setAlignment(Align);
- setAtomic(Order, SynchScope);
+ setAtomic(Order, SSID);
AssertOK();
setName(Name);
}
@@ -1407,16 +1419,16 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, unsigned Align,
Instruction *InsertBefore)
: StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic,
- CrossThread, InsertBefore) {}
+ SyncScope::System, InsertBefore) {}
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, unsigned Align,
BasicBlock *InsertAtEnd)
: StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic,
- CrossThread, InsertAtEnd) {}
+ SyncScope::System, InsertAtEnd) {}
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
unsigned Align, AtomicOrdering Order,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
Instruction *InsertBefore)
: Instruction(Type::getVoidTy(val->getContext()), Store,
OperandTraits<StoreInst>::op_begin(this),
@@ -1426,13 +1438,13 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
Op<1>() = addr;
setVolatile(isVolatile);
setAlignment(Align);
- setAtomic(Order, SynchScope);
+ setAtomic(Order, SSID);
AssertOK();
}
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
unsigned Align, AtomicOrdering Order,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
BasicBlock *InsertAtEnd)
: Instruction(Type::getVoidTy(val->getContext()), Store,
OperandTraits<StoreInst>::op_begin(this),
@@ -1442,7 +1454,7 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
Op<1>() = addr;
setVolatile(isVolatile);
setAlignment(Align);
- setAtomic(Order, SynchScope);
+ setAtomic(Order, SSID);
AssertOK();
}
@@ -1462,13 +1474,13 @@ void StoreInst::setAlignment(unsigned Align) {
void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope) {
+ SyncScope::ID SSID) {
Op<0>() = Ptr;
Op<1>() = Cmp;
Op<2>() = NewVal;
setSuccessOrdering(SuccessOrdering);
setFailureOrdering(FailureOrdering);
- setSynchScope(SynchScope);
+ setSyncScopeID(SSID);
assert(getOperand(0) && getOperand(1) && getOperand(2) &&
"All operands must be non-null!");
@@ -1495,27 +1507,25 @@ void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
Instruction *InsertBefore)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
- Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
+ Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SSID);
}
AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
BasicBlock *InsertAtEnd)
: Instruction(
- StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()),
- nullptr),
+ StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertAtEnd) {
- Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope);
+ Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SSID);
}
//===----------------------------------------------------------------------===//
@@ -1524,12 +1534,12 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
AtomicOrdering Ordering,
- SynchronizationScope SynchScope) {
+ SyncScope::ID SSID) {
Op<0>() = Ptr;
Op<1>() = Val;
setOperation(Operation);
setOrdering(Ordering);
- setSynchScope(SynchScope);
+ setSyncScopeID(SSID);
assert(getOperand(0) && getOperand(1) &&
"All operands must be non-null!");
@@ -1544,24 +1554,24 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
AtomicOrdering Ordering,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
Instruction *InsertBefore)
: Instruction(Val->getType(), AtomicRMW,
OperandTraits<AtomicRMWInst>::op_begin(this),
OperandTraits<AtomicRMWInst>::operands(this),
InsertBefore) {
- Init(Operation, Ptr, Val, Ordering, SynchScope);
+ Init(Operation, Ptr, Val, Ordering, SSID);
}
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
AtomicOrdering Ordering,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
BasicBlock *InsertAtEnd)
: Instruction(Val->getType(), AtomicRMW,
OperandTraits<AtomicRMWInst>::op_begin(this),
OperandTraits<AtomicRMWInst>::operands(this),
InsertAtEnd) {
- Init(Operation, Ptr, Val, Ordering, SynchScope);
+ Init(Operation, Ptr, Val, Ordering, SSID);
}
//===----------------------------------------------------------------------===//
@@ -1569,27 +1579,25 @@ AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
//===----------------------------------------------------------------------===//
FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
Instruction *InsertBefore)
: Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) {
setOrdering(Ordering);
- setSynchScope(SynchScope);
+ setSyncScopeID(SSID);
}
FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
- SynchronizationScope SynchScope,
+ SyncScope::ID SSID,
BasicBlock *InsertAtEnd)
: Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertAtEnd) {
setOrdering(Ordering);
- setSynchScope(SynchScope);
+ setSyncScopeID(SSID);
}
//===----------------------------------------------------------------------===//
// GetElementPtrInst Implementation
//===----------------------------------------------------------------------===//
-void GetElementPtrInst::anchor() {}
-
void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
const Twine &Name) {
assert(getNumOperands() == 1 + IdxList.size() &&
@@ -1726,14 +1734,12 @@ ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
setName(Name);
}
-
bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy())
return false;
return true;
}
-
//===----------------------------------------------------------------------===//
// InsertElementInst Implementation
//===----------------------------------------------------------------------===//
@@ -1780,7 +1786,6 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
return true;
}
-
//===----------------------------------------------------------------------===//
// ShuffleVectorInst Implementation
//===----------------------------------------------------------------------===//
@@ -1827,7 +1832,7 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
return false;
// Mask must be vector of i32.
- VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
+ auto *MaskTy = dyn_cast<VectorType>(Mask->getType());
if (!MaskTy || !MaskTy->getElementType()->isIntegerTy(32))
return false;
@@ -1835,10 +1840,10 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
if (isa<UndefValue>(Mask) || isa<ConstantAggregateZero>(Mask))
return true;
- if (const ConstantVector *MV = dyn_cast<ConstantVector>(Mask)) {
+ if (const auto *MV = dyn_cast<ConstantVector>(Mask)) {
unsigned V1Size = cast<VectorType>(V1->getType())->getNumElements();
for (Value *Op : MV->operands()) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ if (auto *CI = dyn_cast<ConstantInt>(Op)) {
if (CI->uge(V1Size*2))
return false;
} else if (!isa<UndefValue>(Op)) {
@@ -1848,8 +1853,7 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
return true;
}
- if (const ConstantDataSequential *CDS =
- dyn_cast<ConstantDataSequential>(Mask)) {
+ if (const auto *CDS = dyn_cast<ConstantDataSequential>(Mask)) {
unsigned V1Size = cast<VectorType>(V1->getType())->getNumElements();
for (unsigned i = 0, e = MaskTy->getNumElements(); i != e; ++i)
if (CDS->getElementAsInteger(i) >= V1Size*2)
@@ -1861,7 +1865,7 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
// used as the shuffle mask. When this occurs, the shuffle mask will
// fall into this case and fail. To avoid this error, do this bit of
// ugliness to allow such a mask to pass.
- if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Mask))
+ if (const auto *CE = dyn_cast<ConstantExpr>(Mask))
if (CE->getOpcode() == Instruction::UserOp1)
return true;
@@ -1870,7 +1874,7 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
int ShuffleVectorInst::getMaskValue(Constant *Mask, unsigned i) {
assert(i < Mask->getType()->getVectorNumElements() && "Index out of range");
- if (ConstantDataSequential *CDS =dyn_cast<ConstantDataSequential>(Mask))
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(Mask))
return CDS->getElementAsInteger(i);
Constant *C = Mask->getAggregateElement(i);
if (isa<UndefValue>(C))
@@ -1882,7 +1886,7 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask,
SmallVectorImpl<int> &Result) {
unsigned NumElts = Mask->getType()->getVectorNumElements();
- if (ConstantDataSequential *CDS=dyn_cast<ConstantDataSequential>(Mask)) {
+ if (auto *CDS = dyn_cast<ConstantDataSequential>(Mask)) {
for (unsigned i = 0; i != NumElts; ++i)
Result.push_back(CDS->getElementAsInteger(i));
return;
@@ -1894,7 +1898,6 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask,
}
}
-
//===----------------------------------------------------------------------===//
// InsertValueInst Class
//===----------------------------------------------------------------------===//
@@ -1907,7 +1910,7 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
// (other than weirdness with &*IdxBegin being invalid; see
// getelementptr's init routine for example). But there's no
// present need to support it.
- assert(Idxs.size() > 0 && "InsertValueInst must have at least one index");
+ assert(!Idxs.empty() && "InsertValueInst must have at least one index");
assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) ==
Val->getType() && "Inserted value must match indexed type!");
@@ -1936,7 +1939,7 @@ void ExtractValueInst::init(ArrayRef<unsigned> Idxs, const Twine &Name) {
// There's no fundamental reason why we require at least one index.
// But there's no present need to support it.
- assert(Idxs.size() > 0 && "ExtractValueInst must have at least one index");
+ assert(!Idxs.empty() && "ExtractValueInst must have at least one index");
Indices.append(Idxs.begin(), Idxs.end());
setName(Name);
@@ -1992,8 +1995,8 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
InsertBefore) {
Op<0>() = S1;
Op<1>() = S2;
- init(iType);
setName(Name);
+ AssertOK();
}
BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
@@ -2005,18 +2008,17 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
InsertAtEnd) {
Op<0>() = S1;
Op<1>() = S2;
- init(iType);
setName(Name);
+ AssertOK();
}
-
-void BinaryOperator::init(BinaryOps iType) {
+void BinaryOperator::AssertOK() {
Value *LHS = getOperand(0), *RHS = getOperand(1);
(void)LHS; (void)RHS; // Silence warnings.
assert(LHS->getType() == RHS->getType() &&
"Binary operator operand types must match!");
#ifndef NDEBUG
- switch (iType) {
+ switch (getOpcode()) {
case Add: case Sub:
case Mul:
assert(getType() == LHS->getType() &&
@@ -2036,8 +2038,7 @@ void BinaryOperator::init(BinaryOps iType) {
case SDiv:
assert(getType() == LHS->getType() &&
"Arithmetic operation should return same type as operands!");
- assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
- cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+ assert(getType()->isIntOrIntVectorTy() &&
"Incorrect operand type (not integer) for S/UDIV");
break;
case FDiv:
@@ -2050,8 +2051,7 @@ void BinaryOperator::init(BinaryOps iType) {
case SRem:
assert(getType() == LHS->getType() &&
"Arithmetic operation should return same type as operands!");
- assert((getType()->isIntegerTy() || (getType()->isVectorTy() &&
- cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+ assert(getType()->isIntOrIntVectorTy() &&
"Incorrect operand type (not integer) for S/UREM");
break;
case FRem:
@@ -2065,22 +2065,17 @@ void BinaryOperator::init(BinaryOps iType) {
case AShr:
assert(getType() == LHS->getType() &&
"Shift operation should return same type as operands!");
- assert((getType()->isIntegerTy() ||
- (getType()->isVectorTy() &&
- cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+ assert(getType()->isIntOrIntVectorTy() &&
"Tried to create a shift operation on a non-integral type!");
break;
case And: case Or:
case Xor:
assert(getType() == LHS->getType() &&
"Logical operation should return same type as operands!");
- assert((getType()->isIntegerTy() ||
- (getType()->isVectorTy() &&
- cast<VectorType>(getType())->getElementType()->isIntegerTy())) &&
+ assert(getType()->isIntOrIntVectorTy() &&
"Tried to create a logical operation on a non-integral type!");
break;
- default:
- break;
+ default: llvm_unreachable("Invalid opcode provided");
}
#endif
}
@@ -2169,7 +2164,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
Op->getType(), Name, InsertAtEnd);
}
-
// isConstantAllOnes - Helper function for several functions below
static inline bool isConstantAllOnes(const Value *V) {
if (const Constant *C = dyn_cast<Constant>(V))
@@ -2235,7 +2229,6 @@ const Value *BinaryOperator::getNotArgument(const Value *BinOp) {
return getNotArgument(const_cast<Value*>(BinOp));
}
-
// Exchange the two operands to this instruction. This instruction is safe to
// use on any binary instruction and does not modify the semantics of the
// instruction. If the instruction is order-dependent (SetLT f.e.), the opcode
@@ -2247,7 +2240,6 @@ bool BinaryOperator::swapOperands() {
return false;
}
-
//===----------------------------------------------------------------------===//
// FPMathOperator Class
//===----------------------------------------------------------------------===//
@@ -2261,13 +2253,10 @@ float FPMathOperator::getFPAccuracy() const {
return Accuracy->getValueAPF().convertToFloat();
}
-
//===----------------------------------------------------------------------===//
// CastInst Class
//===----------------------------------------------------------------------===//
-void CastInst::anchor() {}
-
// Just determine if this cast only deals with integral->integral conversion.
bool CastInst::isIntegerCast() const {
switch (getOpcode()) {
@@ -2523,13 +2512,12 @@ unsigned CastInst::isEliminableCastPair(
return Instruction::BitCast;
return 0;
}
- case 12: {
+ case 12:
// addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
// addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS
if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
return Instruction::AddrSpaceCast;
return Instruction::BitCast;
- }
case 13:
// FIXME: this state can be merged with (1), but the following assert
// is useful to check the correctness of the sequence due to semantic
@@ -2550,7 +2538,6 @@ unsigned CastInst::isEliminableCastPair(
DstTy->getScalarType()->getPointerElementType())
return Instruction::AddrSpaceCast;
return 0;
-
case 15:
// FIXME: this state can be merged with (1), but the following assert
// is useful to check the correctness of the sequence due to semantic
@@ -3026,7 +3013,6 @@ CastInst::getCastOpcode(
/// of the types involved.
bool
CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
-
// Check for type sanity on the arguments
Type *SrcTy = S->getType();
@@ -3078,16 +3064,14 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
if (VectorType *VT = dyn_cast<VectorType>(SrcTy))
if (VT->getNumElements() != cast<VectorType>(DstTy)->getNumElements())
return false;
- return SrcTy->getScalarType()->isPointerTy() &&
- DstTy->getScalarType()->isIntegerTy();
+ return SrcTy->isPtrOrPtrVectorTy() && DstTy->isIntOrIntVectorTy();
case Instruction::IntToPtr:
if (isa<VectorType>(SrcTy) != isa<VectorType>(DstTy))
return false;
if (VectorType *VT = dyn_cast<VectorType>(SrcTy))
if (VT->getNumElements() != cast<VectorType>(DstTy)->getNumElements())
return false;
- return SrcTy->getScalarType()->isIntegerTy() &&
- DstTy->getScalarType()->isPointerTy();
+ return SrcTy->isIntOrIntVectorTy() && DstTy->isPtrOrPtrVectorTy();
case Instruction::BitCast: {
PointerType *SrcPtrTy = dyn_cast<PointerType>(SrcTy->getScalarType());
PointerType *DstPtrTy = dyn_cast<PointerType>(DstTy->getScalarType());
@@ -3299,8 +3283,6 @@ AddrSpaceCastInst::AddrSpaceCastInst(
// CmpInst Classes
//===----------------------------------------------------------------------===//
-void CmpInst::anchor() {}
-
CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
Value *RHS, const Twine &Name, Instruction *InsertBefore)
: Instruction(ty, op,
@@ -3375,7 +3357,6 @@ bool CmpInst::isEquality() const {
return cast<FCmpInst>(this)->isEquality();
}
-
CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
switch (pred) {
default: llvm_unreachable("Unknown cmp predicate!");
@@ -3441,8 +3422,6 @@ StringRef CmpInst::getPredicateName(Predicate Pred) {
}
}
-void ICmpInst::anchor() {}
-
ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
switch (pred) {
default: llvm_unreachable("Unknown icmp predicate!");
@@ -3655,16 +3634,16 @@ void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
// Initialize some new operands.
assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
setNumHungOffUseOperands(OpNo+2);
- CaseIt Case(this, NewCaseIdx);
+ CaseHandle Case(this, NewCaseIdx);
Case.setValue(OnVal);
Case.setSuccessor(Dest);
}
/// removeCase - This method removes the specified case and its successor
/// from the switch instruction.
-void SwitchInst::removeCase(CaseIt i) {
- unsigned idx = i.getCaseIndex();
-
+SwitchInst::CaseIt SwitchInst::removeCase(CaseIt I) {
+ unsigned idx = I->getCaseIndex();
+
assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
unsigned NumOps = getNumOperands();
@@ -3680,6 +3659,8 @@ void SwitchInst::removeCase(CaseIt i) {
OL[NumOps-2].set(nullptr);
OL[NumOps-2+1].set(nullptr);
setNumHungOffUseOperands(NumOps-2);
+
+ return CaseIt(this, idx);
}
/// growOperands - grow operands - This grows the operand list in response
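removeCase now returns the iterator to the case after the removed one, enabling the standard erase-while-iterating idiom instead of restarting the scan. Hedged sketch deleting every case that targets a dead block (SI and DeadBB assumed in scope):

    // Sketch: keep iterating across removals via the returned iterator.
    for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) {
      if (I->getCaseSuccessor() == DeadBB) {
        I = SI->removeCase(I);
        E = SI->case_end(); // operand list shrank; refresh the end
      } else {
        ++I;
      }
    }
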
@@ -3693,17 +3674,6 @@ void SwitchInst::growOperands() {
growHungoffUses(ReservedSpace);
}
-
-BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const {
- return getSuccessor(idx);
-}
-unsigned SwitchInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) {
- setSuccessor(idx, B);
-}
-
//===----------------------------------------------------------------------===//
// IndirectBrInst Implementation
//===----------------------------------------------------------------------===//
@@ -3783,16 +3753,6 @@ void IndirectBrInst::removeDestination(unsigned idx) {
setNumHungOffUseOperands(NumOps-1);
}
-BasicBlock *IndirectBrInst::getSuccessorV(unsigned idx) const {
- return getSuccessor(idx);
-}
-unsigned IndirectBrInst::getNumSuccessorsV() const {
- return getNumSuccessors();
-}
-void IndirectBrInst::setSuccessorV(unsigned idx, BasicBlock *B) {
- setSuccessor(idx, B);
-}
-
//===----------------------------------------------------------------------===//
// cloneImpl() implementations
//===----------------------------------------------------------------------===//
@@ -3826,6 +3786,7 @@ InsertValueInst *InsertValueInst::cloneImpl() const {
AllocaInst *AllocaInst::cloneImpl() const {
AllocaInst *Result = new AllocaInst(getAllocatedType(),
+ getType()->getAddressSpace(),
(Value *)getOperand(0), getAlignment());
Result->setUsedWithInAlloca(isUsedWithInAlloca());
Result->setSwiftError(isSwiftError());
@@ -3834,12 +3795,12 @@ AllocaInst *AllocaInst::cloneImpl() const {
LoadInst *LoadInst::cloneImpl() const {
return new LoadInst(getOperand(0), Twine(), isVolatile(),
- getAlignment(), getOrdering(), getSynchScope());
+ getAlignment(), getOrdering(), getSyncScopeID());
}
StoreInst *StoreInst::cloneImpl() const {
return new StoreInst(getOperand(0), getOperand(1), isVolatile(),
- getAlignment(), getOrdering(), getSynchScope());
+ getAlignment(), getOrdering(), getSyncScopeID());
}
@@ -3847,7 +3808,7 @@ AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
AtomicCmpXchgInst *Result =
new AtomicCmpXchgInst(getOperand(0), getOperand(1), getOperand(2),
getSuccessOrdering(), getFailureOrdering(),
- getSynchScope());
+ getSyncScopeID());
Result->setVolatile(isVolatile());
Result->setWeak(isWeak());
return Result;
@@ -3855,14 +3816,14 @@ AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
AtomicRMWInst *AtomicRMWInst::cloneImpl() const {
AtomicRMWInst *Result =
- new AtomicRMWInst(getOperation(),getOperand(0), getOperand(1),
- getOrdering(), getSynchScope());
+ new AtomicRMWInst(getOperation(), getOperand(0), getOperand(1),
+ getOrdering(), getSyncScopeID());
Result->setVolatile(isVolatile());
return Result;
}
FenceInst *FenceInst::cloneImpl() const {
- return new FenceInst(getContext(), getOrdering(), getSynchScope());
+ return new FenceInst(getContext(), getOrdering(), getSyncScopeID());
}
TruncInst *TruncInst::cloneImpl() const {
diff --git a/contrib/llvm/lib/IR/IntrinsicInst.cpp b/contrib/llvm/lib/IR/IntrinsicInst.cpp
index 2402506..8b12c55 100644
--- a/contrib/llvm/lib/IR/IntrinsicInst.cpp
+++ b/contrib/llvm/lib/IR/IntrinsicInst.cpp
@@ -22,6 +22,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Metadata.h"
@@ -93,3 +94,56 @@ Value *InstrProfIncrementInst::getStep() const {
LLVMContext &Context = M->getContext();
return ConstantInt::get(Type::getInt64Ty(Context), 1);
}
+
+ConstrainedFPIntrinsic::RoundingMode
+ConstrainedFPIntrinsic::getRoundingMode() const {
+ unsigned NumOperands = getNumArgOperands();
+ Metadata *MD =
+ cast<MetadataAsValue>(getArgOperand(NumOperands - 2))->getMetadata();
+ if (!MD || !isa<MDString>(MD))
+ return rmInvalid;
+ StringRef RoundingArg = cast<MDString>(MD)->getString();
+
+ // For dynamic rounding mode, we use round to nearest but we will set the
+ // 'exact' SDNodeFlag so that the value will not be rounded.
+ return StringSwitch<RoundingMode>(RoundingArg)
+ .Case("round.dynamic", rmDynamic)
+ .Case("round.tonearest", rmToNearest)
+ .Case("round.downward", rmDownward)
+ .Case("round.upward", rmUpward)
+ .Case("round.towardzero", rmTowardZero)
+ .Default(rmInvalid);
+}
+
+ConstrainedFPIntrinsic::ExceptionBehavior
+ConstrainedFPIntrinsic::getExceptionBehavior() const {
+ unsigned NumOperands = getNumArgOperands();
+ Metadata *MD =
+ cast<MetadataAsValue>(getArgOperand(NumOperands - 1))->getMetadata();
+ if (!MD || !isa<MDString>(MD))
+ return ebInvalid;
+ StringRef ExceptionArg = cast<MDString>(MD)->getString();
+ return StringSwitch<ExceptionBehavior>(ExceptionArg)
+ .Case("fpexcept.ignore", ebIgnore)
+ .Case("fpexcept.maytrap", ebMayTrap)
+ .Case("fpexcept.strict", ebStrict)
+ .Default(ebInvalid);
+}
+
+bool ConstrainedFPIntrinsic::isUnaryOp() const {
+ switch (getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::experimental_constrained_sqrt:
+ case Intrinsic::experimental_constrained_sin:
+ case Intrinsic::experimental_constrained_cos:
+ case Intrinsic::experimental_constrained_exp:
+ case Intrinsic::experimental_constrained_exp2:
+ case Intrinsic::experimental_constrained_log:
+ case Intrinsic::experimental_constrained_log10:
+ case Intrinsic::experimental_constrained_log2:
+ case Intrinsic::experimental_constrained_rint:
+ case Intrinsic::experimental_constrained_nearbyint:
+ return true;
+ }
+}
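// Illustrative sketch, not part of this patch: the accessors above decode
// the two trailing metadata operands of a constrained FP call, e.g.
//   call double @llvm.experimental.constrained.sqrt.f64(double %x,
//        metadata !"round.dynamic", metadata !"fpexcept.strict")
// Given an Instruction &I, a pass might query them like so:
if (auto *CFP = dyn_cast<ConstrainedFPIntrinsic>(&I)) {
  bool MustBeCareful =
      CFP->getRoundingMode() == ConstrainedFPIntrinsic::rmDynamic ||
      CFP->getExceptionBehavior() == ConstrainedFPIntrinsic::ebStrict;
  (void)MustBeCareful; // a transform would bail out or restrict itself here
}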
diff --git a/contrib/llvm/lib/IR/LLVMContext.cpp b/contrib/llvm/lib/IR/LLVMContext.cpp
index dd66f14..c58459d 100644
--- a/contrib/llvm/lib/IR/LLVMContext.cpp
+++ b/contrib/llvm/lib/IR/LLVMContext.cpp
@@ -13,11 +13,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/LLVMContext.h"
+#include "LLVMContextImpl.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
-#include "LLVMContextImpl.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Metadata.h"
@@ -58,6 +58,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
{MD_type, "type"},
{MD_section_prefix, "section_prefix"},
{MD_absolute_symbol, "absolute_symbol"},
+ {MD_associated, "associated"},
};
for (auto &MDKind : MDKinds) {
@@ -80,6 +81,18 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
assert(GCTransitionEntry->second == LLVMContext::OB_gc_transition &&
"gc-transition operand bundle id drifted!");
(void)GCTransitionEntry;
+
+ SyncScope::ID SingleThreadSSID =
+ pImpl->getOrInsertSyncScopeID("singlethread");
+ assert(SingleThreadSSID == SyncScope::SingleThread &&
+ "singlethread synchronization scope ID drifted!");
+ (void)SingleThreadSSID;
+
+ SyncScope::ID SystemSSID =
+ pImpl->getOrInsertSyncScopeID("");
+ assert(SystemSSID == SyncScope::System &&
+ "system synchronization scope ID drifted!");
+ (void)SystemSSID;
}
LLVMContext::~LLVMContext() { delete pImpl; }
@@ -124,11 +137,18 @@ void LLVMContext::setDiagnosticHandler(DiagnosticHandlerTy DiagnosticHandler,
pImpl->RespectDiagnosticFilters = RespectFilters;
}
-void LLVMContext::setDiagnosticHotnessRequested(bool Requested) {
- pImpl->DiagnosticHotnessRequested = Requested;
+void LLVMContext::setDiagnosticsHotnessRequested(bool Requested) {
+ pImpl->DiagnosticsHotnessRequested = Requested;
}
-bool LLVMContext::getDiagnosticHotnessRequested() const {
- return pImpl->DiagnosticHotnessRequested;
+bool LLVMContext::getDiagnosticsHotnessRequested() const {
+ return pImpl->DiagnosticsHotnessRequested;
+}
+
+void LLVMContext::setDiagnosticsHotnessThreshold(uint64_t Threshold) {
+ pImpl->DiagnosticsHotnessThreshold = Threshold;
+}
+uint64_t LLVMContext::getDiagnosticsHotnessThreshold() const {
+ return pImpl->DiagnosticsHotnessThreshold;
}
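// Illustrative sketch, not part of this patch: a client that wants hotness
// attached to optimization remarks, but only above a minimum profile count,
// sets both knobs (the threshold value here is made up; Ctx is assumed).
Ctx.setDiagnosticsHotnessRequested(true);
Ctx.setDiagnosticsHotnessThreshold(1000); // remarks on colder code are dropped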
yaml::Output *LLVMContext::getDiagnosticsOutputFile() {
@@ -247,6 +267,14 @@ uint32_t LLVMContext::getOperandBundleTagID(StringRef Tag) const {
return pImpl->getOperandBundleTagID(Tag);
}
+SyncScope::ID LLVMContext::getOrInsertSyncScopeID(StringRef SSN) {
+ return pImpl->getOrInsertSyncScopeID(SSN);
+}
+
+void LLVMContext::getSyncScopeNames(SmallVectorImpl<StringRef> &SSNs) const {
+ pImpl->getSyncScopeNames(SSNs);
+}
+
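// Illustrative sketch, not part of this patch: targets may now intern their
// own synchronization scopes by name; "agent" is a made-up example, while ""
// and "singlethread" map to the predefined System and SingleThread IDs.
SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
auto *Fence =
    new FenceInst(Ctx, AtomicOrdering::SequentiallyConsistent, AgentSSID);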
void LLVMContext::setGC(const Function &Fn, std::string GCName) {
auto It = pImpl->GCNames.find(&Fn);
diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.cpp b/contrib/llvm/lib/IR/LLVMContextImpl.cpp
index c43356c..57dd08b 100644
--- a/contrib/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/contrib/llvm/lib/IR/LLVMContextImpl.cpp
@@ -1,4 +1,4 @@
-//===-- LLVMContextImpl.cpp - Implement LLVMContextImpl -------------------===//
+//===- LLVMContextImpl.cpp - Implement LLVMContextImpl --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,17 @@
//===----------------------------------------------------------------------===//
#include "LLVMContextImpl.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OptBisect.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/ManagedStatic.h"
-#include <algorithm>
+#include <cassert>
+#include <utility>
+
using namespace llvm;
LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
- : TheTrueVal(nullptr), TheFalseVal(nullptr),
- VoidTy(C, Type::VoidTyID),
+ : VoidTy(C, Type::VoidTyID),
LabelTy(C, Type::LabelTyID),
HalfTy(C, Type::HalfTyID),
FloatTy(C, Type::FloatTyID),
@@ -39,17 +38,7 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
Int16Ty(C, 16),
Int32Ty(C, 32),
Int64Ty(C, 64),
- Int128Ty(C, 128) {
- InlineAsmDiagHandler = nullptr;
- InlineAsmDiagContext = nullptr;
- DiagnosticHandler = nullptr;
- DiagnosticContext = nullptr;
- RespectDiagnosticFilters = false;
- DiagnosticHotnessRequested = false;
- YieldCallback = nullptr;
- YieldOpaqueHandle = nullptr;
- NamedStructTypesUniqueID = 0;
-}
+ Int128Ty(C, 128) {}
LLVMContextImpl::~LLVMContextImpl() {
// NOTE: We need to delete the contents of OwnedModules, but Module's dtor
@@ -114,9 +103,10 @@ LLVMContextImpl::~LLVMContextImpl() {
}
// Destroy attribute lists.
- for (FoldingSetIterator<AttributeSetImpl> I = AttrsLists.begin(),
- E = AttrsLists.end(); I != E; ) {
- FoldingSetIterator<AttributeSetImpl> Elem = I++;
+ for (FoldingSetIterator<AttributeListImpl> I = AttrsLists.begin(),
+ E = AttrsLists.end();
+ I != E;) {
+ FoldingSetIterator<AttributeListImpl> Elem = I++;
delete &*Elem;
}
@@ -155,7 +145,6 @@ void LLVMContextImpl::dropTriviallyDeadConstantArrays() {
C->destroyConstant();
}
}
-
} while (Changed);
}
@@ -164,6 +153,7 @@ void Module::dropTriviallyDeadConstantArrays() {
}
namespace llvm {
+
/// \brief Make MDOperand transparent for hashing.
///
/// This overload of an implementation detail of the hashing library makes
@@ -178,7 +168,8 @@ namespace llvm {
/// does not cause MDOperand to be transparent. In particular, a bare pointer
/// doesn't get hashed before it's combined, whereas \a MDOperand would.
static const Metadata *get_hashable_data(const MDOperand &X) { return X.get(); }
-}
+
+} // end namespace llvm
unsigned MDNodeOpsKey::calculateHash(MDNode *N, unsigned Offset) {
unsigned Hash = hash_combine_range(N->op_begin() + Offset, N->op_end());
@@ -214,26 +205,19 @@ uint32_t LLVMContextImpl::getOperandBundleTagID(StringRef Tag) const {
return I->second;
}
-// ConstantsContext anchors
-void UnaryConstantExpr::anchor() { }
-
-void BinaryConstantExpr::anchor() { }
-
-void SelectConstantExpr::anchor() { }
-
-void ExtractElementConstantExpr::anchor() { }
-
-void InsertElementConstantExpr::anchor() { }
-
-void ShuffleVectorConstantExpr::anchor() { }
-
-void ExtractValueConstantExpr::anchor() { }
-
-void InsertValueConstantExpr::anchor() { }
-
-void GetElementPtrConstantExpr::anchor() { }
+SyncScope::ID LLVMContextImpl::getOrInsertSyncScopeID(StringRef SSN) {
+ auto NewSSID = SSC.size();
+ assert(NewSSID < std::numeric_limits<SyncScope::ID>::max() &&
+ "Hit the maximum number of synchronization scopes allowed!");
+ return SSC.insert(std::make_pair(SSN, SyncScope::ID(NewSSID))).first->second;
+}
-void CompareConstantExpr::anchor() { }
+void LLVMContextImpl::getSyncScopeNames(
+ SmallVectorImpl<StringRef> &SSNs) const {
+ SSNs.resize(SSC.size());
+ for (const auto &SSE : SSC)
+ SSNs[SSE.second] = SSE.first();
+}
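// Illustrative sketch, not part of this patch: since names are returned
// ordered by increasing ID, the vector index recovers each scope's
// SyncScope::ID (Ctx is an assumed LLVMContext).
SmallVector<StringRef, 8> SSNs;
Ctx.getSyncScopeNames(SSNs);
for (unsigned I = 0, E = SSNs.size(); I != E; ++I)
  errs() << "scope " << I << " = '" << SSNs[I] << "'\n";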
/// Singleton instance of the OptBisect class.
///
diff --git a/contrib/llvm/lib/IR/LLVMContextImpl.h b/contrib/llvm/lib/IR/LLVMContextImpl.h
index 850c81c..bea2c7a 100644
--- a/contrib/llvm/lib/IR/LLVMContextImpl.h
+++ b/contrib/llvm/lib/IR/LLVMContextImpl.h
@@ -1,4 +1,4 @@
-//===-- LLVMContextImpl.h - The LLVMContextImpl opaque class ----*- C++ -*-===//
+//===- LLVMContextImpl.h - The LLVMContextImpl opaque class -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,48 +21,61 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/IR/TrackingMDRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/YAMLTraits.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
#include <vector>
namespace llvm {
-class ConstantInt;
class ConstantFP;
-class DiagnosticInfoOptimizationRemark;
-class DiagnosticInfoOptimizationRemarkMissed;
-class DiagnosticInfoOptimizationRemarkAnalysis;
-class GCStrategy;
-class LLVMContext;
+class ConstantInt;
class Type;
class Value;
+class ValueHandleBase;
struct DenseMapAPIntKeyInfo {
static inline APInt getEmptyKey() {
APInt V(nullptr, 0);
- V.VAL = 0;
+ V.U.VAL = 0;
return V;
}
+
static inline APInt getTombstoneKey() {
APInt V(nullptr, 0);
- V.VAL = 1;
+ V.U.VAL = 1;
return V;
}
+
static unsigned getHashValue(const APInt &Key) {
return static_cast<unsigned>(hash_value(Key));
}
+
static bool isEqual(const APInt &LHS, const APInt &RHS) {
return LHS.getBitWidth() == RHS.getBitWidth() && LHS == RHS;
}
@@ -71,9 +84,11 @@ struct DenseMapAPIntKeyInfo {
struct DenseMapAPFloatKeyInfo {
static inline APFloat getEmptyKey() { return APFloat(APFloat::Bogus(), 1); }
static inline APFloat getTombstoneKey() { return APFloat(APFloat::Bogus(), 2); }
+
static unsigned getHashValue(const APFloat &Key) {
return static_cast<unsigned>(hash_value(Key));
}
+
static bool isEqual(const APFloat &LHS, const APFloat &RHS) {
return LHS.bitwiseIsEqual(RHS);
}
@@ -83,10 +98,13 @@ struct AnonStructTypeKeyInfo {
struct KeyTy {
ArrayRef<Type*> ETypes;
bool isPacked;
+
KeyTy(const ArrayRef<Type*>& E, bool P) :
ETypes(E), isPacked(P) {}
+
KeyTy(const StructType *ST)
: ETypes(ST->elements()), isPacked(ST->isPacked()) {}
+
bool operator==(const KeyTy& that) const {
if (isPacked != that.isPacked)
return false;
@@ -98,25 +116,31 @@ struct AnonStructTypeKeyInfo {
return !this->operator==(that);
}
};
+
static inline StructType* getEmptyKey() {
return DenseMapInfo<StructType*>::getEmptyKey();
}
+
static inline StructType* getTombstoneKey() {
return DenseMapInfo<StructType*>::getTombstoneKey();
}
+
static unsigned getHashValue(const KeyTy& Key) {
return hash_combine(hash_combine_range(Key.ETypes.begin(),
Key.ETypes.end()),
Key.isPacked);
}
+
static unsigned getHashValue(const StructType *ST) {
return getHashValue(KeyTy(ST));
}
+
static bool isEqual(const KeyTy& LHS, const StructType *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey())
return false;
return LHS == KeyTy(RHS);
}
+
static bool isEqual(const StructType *LHS, const StructType *RHS) {
return LHS == RHS;
}
@@ -127,11 +151,13 @@ struct FunctionTypeKeyInfo {
const Type *ReturnType;
ArrayRef<Type*> Params;
bool isVarArg;
+
KeyTy(const Type* R, const ArrayRef<Type*>& P, bool V) :
ReturnType(R), Params(P), isVarArg(V) {}
KeyTy(const FunctionType *FT)
: ReturnType(FT->getReturnType()), Params(FT->params()),
isVarArg(FT->isVarArg()) {}
+
bool operator==(const KeyTy& that) const {
if (ReturnType != that.ReturnType)
return false;
@@ -145,26 +171,32 @@ struct FunctionTypeKeyInfo {
return !this->operator==(that);
}
};
+
static inline FunctionType* getEmptyKey() {
return DenseMapInfo<FunctionType*>::getEmptyKey();
}
+
static inline FunctionType* getTombstoneKey() {
return DenseMapInfo<FunctionType*>::getTombstoneKey();
}
+
static unsigned getHashValue(const KeyTy& Key) {
return hash_combine(Key.ReturnType,
hash_combine_range(Key.Params.begin(),
Key.Params.end()),
Key.isVarArg);
}
+
static unsigned getHashValue(const FunctionType *FT) {
return getHashValue(KeyTy(FT));
}
+
static bool isEqual(const KeyTy& LHS, const FunctionType *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey())
return false;
return LHS == KeyTy(RHS);
}
+
static bool isEqual(const FunctionType *LHS, const FunctionType *RHS) {
return LHS == RHS;
}
@@ -174,7 +206,6 @@ struct FunctionTypeKeyInfo {
class MDNodeOpsKey {
ArrayRef<Metadata *> RawOps;
ArrayRef<MDOperand> Ops;
-
unsigned Hash;
protected:
@@ -212,14 +243,15 @@ public:
};
template <class NodeTy> struct MDNodeKeyImpl;
-template <class NodeTy> struct MDNodeInfo;
/// Configuration point for MDNodeInfo::isEqual().
template <class NodeTy> struct MDNodeSubsetEqualImpl {
- typedef MDNodeKeyImpl<NodeTy> KeyTy;
+ using KeyTy = MDNodeKeyImpl<NodeTy>;
+
static bool isSubsetEqual(const KeyTy &LHS, const NodeTy *RHS) {
return false;
}
+
static bool isSubsetEqual(const NodeTy *LHS, const NodeTy *RHS) {
return false;
}
@@ -252,7 +284,6 @@ template <> struct MDNodeKeyImpl<DILocation> {
MDNodeKeyImpl(unsigned Line, unsigned Column, Metadata *Scope,
Metadata *InlinedAt)
: Line(Line), Column(Column), Scope(Scope), InlinedAt(InlinedAt) {}
-
MDNodeKeyImpl(const DILocation *L)
: Line(L->getLine()), Column(L->getColumn()), Scope(L->getRawScope()),
InlinedAt(L->getRawInlinedAt()) {}
@@ -261,6 +292,7 @@ template <> struct MDNodeKeyImpl<DILocation> {
return Line == RHS->getLine() && Column == RHS->getColumn() &&
Scope == RHS->getRawScope() && InlinedAt == RHS->getRawInlinedAt();
}
+
unsigned getHashValue() const {
return hash_combine(Line, Column, Scope, InlinedAt);
}
@@ -270,6 +302,7 @@ template <> struct MDNodeKeyImpl<DILocation> {
template <> struct MDNodeKeyImpl<GenericDINode> : MDNodeOpsKey {
unsigned Tag;
MDString *Header;
+
MDNodeKeyImpl(unsigned Tag, MDString *Header, ArrayRef<Metadata *> DwarfOps)
: MDNodeOpsKey(DwarfOps), Tag(Tag), Header(Header) {}
MDNodeKeyImpl(const GenericDINode *N)
@@ -299,6 +332,7 @@ template <> struct MDNodeKeyImpl<DISubrange> {
bool isKeyOf(const DISubrange *RHS) const {
return Count == RHS->getCount() && LowerBound == RHS->getLowerBound();
}
+
unsigned getHashValue() const { return hash_combine(Count, LowerBound); }
};
@@ -313,6 +347,7 @@ template <> struct MDNodeKeyImpl<DIEnumerator> {
bool isKeyOf(const DIEnumerator *RHS) const {
return Value == RHS->getValue() && Name == RHS->getRawName();
}
+
unsigned getHashValue() const { return hash_combine(Value, Name); }
};
@@ -337,6 +372,7 @@ template <> struct MDNodeKeyImpl<DIBasicType> {
AlignInBits == RHS->getAlignInBits() &&
Encoding == RHS->getEncoding();
}
+
unsigned getHashValue() const {
return hash_combine(Tag, Name, SizeInBits, AlignInBits, Encoding);
}
@@ -352,22 +388,26 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
uint64_t SizeInBits;
uint64_t OffsetInBits;
uint32_t AlignInBits;
+ Optional<unsigned> DWARFAddressSpace;
unsigned Flags;
Metadata *ExtraData;
MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
- uint32_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+ uint32_t AlignInBits, uint64_t OffsetInBits,
+ Optional<unsigned> DWARFAddressSpace, unsigned Flags,
Metadata *ExtraData)
: Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
- AlignInBits(AlignInBits), Flags(Flags), ExtraData(ExtraData) {}
+ AlignInBits(AlignInBits), DWARFAddressSpace(DWARFAddressSpace),
+ Flags(Flags), ExtraData(ExtraData) {}
MDNodeKeyImpl(const DIDerivedType *N)
: Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()),
- Flags(N->getFlags()), ExtraData(N->getRawExtraData()) {}
+ DWARFAddressSpace(N->getDWARFAddressSpace()), Flags(N->getFlags()),
+ ExtraData(N->getRawExtraData()) {}
bool isKeyOf(const DIDerivedType *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -375,9 +415,12 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
Scope == RHS->getRawScope() && BaseType == RHS->getRawBaseType() &&
SizeInBits == RHS->getSizeInBits() &&
AlignInBits == RHS->getAlignInBits() &&
- OffsetInBits == RHS->getOffsetInBits() && Flags == RHS->getFlags() &&
+ OffsetInBits == RHS->getOffsetInBits() &&
+ DWARFAddressSpace == RHS->getDWARFAddressSpace() &&
+ Flags == RHS->getFlags() &&
ExtraData == RHS->getRawExtraData();
}
+
unsigned getHashValue() const {
// If this is a member inside an ODR type, only hash the type and the name.
// Otherwise the hash will be stronger than
@@ -396,10 +439,12 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
};
template <> struct MDNodeSubsetEqualImpl<DIDerivedType> {
- typedef MDNodeKeyImpl<DIDerivedType> KeyTy;
+ using KeyTy = MDNodeKeyImpl<DIDerivedType>;
+
static bool isSubsetEqual(const KeyTy &LHS, const DIDerivedType *RHS) {
return isODRMember(LHS.Tag, LHS.Scope, LHS.Name, RHS);
}
+
static bool isSubsetEqual(const DIDerivedType *LHS, const DIDerivedType *RHS) {
return isODRMember(LHS->getTag(), LHS->getRawScope(), LHS->getRawName(),
RHS);
@@ -474,6 +519,7 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
TemplateParams == RHS->getRawTemplateParams() &&
Identifier == RHS->getRawIdentifier();
}
+
unsigned getHashValue() const {
// Intentionally computes the hash on a subset of the operands for
// performance reason. The subset has to be significant enough to avoid
@@ -498,6 +544,7 @@ template <> struct MDNodeKeyImpl<DISubroutineType> {
return Flags == RHS->getFlags() && CC == RHS->getCC() &&
TypeArray == RHS->getRawTypeArray();
}
+
unsigned getHashValue() const { return hash_combine(Flags, CC, TypeArray); }
};
@@ -521,6 +568,7 @@ template <> struct MDNodeKeyImpl<DIFile> {
CSKind == RHS->getChecksumKind() &&
Checksum == RHS->getRawChecksum();
}
+
unsigned getHashValue() const {
return hash_combine(Filename, Directory, CSKind, Checksum);
}
@@ -546,6 +594,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Metadata *TemplateParams;
Metadata *Declaration;
Metadata *Variables;
+ Metadata *ThrownTypes;
MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *LinkageName,
Metadata *File, unsigned Line, Metadata *Type,
@@ -553,7 +602,8 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Metadata *ContainingType, unsigned Virtuality,
unsigned VirtualIndex, int ThisAdjustment, unsigned Flags,
bool IsOptimized, Metadata *Unit, Metadata *TemplateParams,
- Metadata *Declaration, Metadata *Variables)
+ Metadata *Declaration, Metadata *Variables,
+ Metadata *ThrownTypes)
: Scope(Scope), Name(Name), LinkageName(LinkageName), File(File),
Line(Line), Type(Type), IsLocalToUnit(IsLocalToUnit),
IsDefinition(IsDefinition), ScopeLine(ScopeLine),
@@ -561,7 +611,7 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
VirtualIndex(VirtualIndex), ThisAdjustment(ThisAdjustment),
Flags(Flags), IsOptimized(IsOptimized), Unit(Unit),
TemplateParams(TemplateParams), Declaration(Declaration),
- Variables(Variables) {}
+ Variables(Variables), ThrownTypes(ThrownTypes) {}
MDNodeKeyImpl(const DISubprogram *N)
: Scope(N->getRawScope()), Name(N->getRawName()),
LinkageName(N->getRawLinkageName()), File(N->getRawFile()),
@@ -572,7 +622,8 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
ThisAdjustment(N->getThisAdjustment()), Flags(N->getFlags()),
IsOptimized(N->isOptimized()), Unit(N->getRawUnit()),
TemplateParams(N->getRawTemplateParams()),
- Declaration(N->getRawDeclaration()), Variables(N->getRawVariables()) {}
+ Declaration(N->getRawDeclaration()), Variables(N->getRawVariables()),
+ ThrownTypes(N->getRawThrownTypes()) {}
bool isKeyOf(const DISubprogram *RHS) const {
return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
@@ -589,8 +640,10 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
Unit == RHS->getUnit() &&
TemplateParams == RHS->getRawTemplateParams() &&
Declaration == RHS->getRawDeclaration() &&
- Variables == RHS->getRawVariables();
+ Variables == RHS->getRawVariables() &&
+ ThrownTypes == RHS->getRawThrownTypes();
}
+
unsigned getHashValue() const {
// If this is a declaration inside an ODR type, only hash the type and the
// name. Otherwise the hash will be stronger than
@@ -609,20 +662,24 @@ template <> struct MDNodeKeyImpl<DISubprogram> {
};
template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
- typedef MDNodeKeyImpl<DISubprogram> KeyTy;
+ using KeyTy = MDNodeKeyImpl<DISubprogram>;
+
static bool isSubsetEqual(const KeyTy &LHS, const DISubprogram *RHS) {
return isDeclarationOfODRMember(LHS.IsDefinition, LHS.Scope,
- LHS.LinkageName, RHS);
+ LHS.LinkageName, LHS.TemplateParams, RHS);
}
+
static bool isSubsetEqual(const DISubprogram *LHS, const DISubprogram *RHS) {
return isDeclarationOfODRMember(LHS->isDefinition(), LHS->getRawScope(),
- LHS->getRawLinkageName(), RHS);
+ LHS->getRawLinkageName(),
+ LHS->getRawTemplateParams(), RHS);
}
/// Subprograms compare equal if they declare the same function in an ODR
/// type.
static bool isDeclarationOfODRMember(bool IsDefinition, const Metadata *Scope,
const MDString *LinkageName,
+ const Metadata *TemplateParams,
const DISubprogram *RHS) {
// Check whether the LHS is eligible.
if (IsDefinition || !Scope || !LinkageName)
@@ -633,8 +690,14 @@ template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
return false;
// Compare to the RHS.
+ // FIXME: We need to compare template parameters here to avoid incorrect
+ // collisions in mapMetadata when RF_MoveDistinctMDs is set and an ODR-DISubprogram
+ // has a non-ODR template parameter (i.e., a DICompositeType that does not
+ // have an identifier). Eventually we should decouple ODR logic from
+ // uniquing logic.
return IsDefinition == RHS->isDefinition() && Scope == RHS->getRawScope() &&
- LinkageName == RHS->getRawLinkageName();
+ LinkageName == RHS->getRawLinkageName() &&
+ TemplateParams == RHS->getRawTemplateParams();
}
};
@@ -654,6 +717,7 @@ template <> struct MDNodeKeyImpl<DILexicalBlock> {
return Scope == RHS->getRawScope() && File == RHS->getRawFile() &&
Line == RHS->getLine() && Column == RHS->getColumn();
}
+
unsigned getHashValue() const {
return hash_combine(Scope, File, Line, Column);
}
@@ -674,6 +738,7 @@ template <> struct MDNodeKeyImpl<DILexicalBlockFile> {
return Scope == RHS->getRawScope() && File == RHS->getRawFile() &&
Discriminator == RHS->getDiscriminator();
}
+
unsigned getHashValue() const {
return hash_combine(Scope, File, Discriminator);
}
@@ -681,26 +746,22 @@ template <> struct MDNodeKeyImpl<DILexicalBlockFile> {
template <> struct MDNodeKeyImpl<DINamespace> {
Metadata *Scope;
- Metadata *File;
MDString *Name;
- unsigned Line;
bool ExportSymbols;
- MDNodeKeyImpl(Metadata *Scope, Metadata *File, MDString *Name, unsigned Line,
- bool ExportSymbols)
- : Scope(Scope), File(File), Name(Name), Line(Line),
- ExportSymbols(ExportSymbols) {}
+ MDNodeKeyImpl(Metadata *Scope, MDString *Name, bool ExportSymbols)
+ : Scope(Scope), Name(Name), ExportSymbols(ExportSymbols) {}
MDNodeKeyImpl(const DINamespace *N)
- : Scope(N->getRawScope()), File(N->getRawFile()), Name(N->getRawName()),
- Line(N->getLine()), ExportSymbols(N->getExportSymbols()) {}
+ : Scope(N->getRawScope()), Name(N->getRawName()),
+ ExportSymbols(N->getExportSymbols()) {}
bool isKeyOf(const DINamespace *RHS) const {
- return Scope == RHS->getRawScope() && File == RHS->getRawFile() &&
- Name == RHS->getRawName() && Line == RHS->getLine() &&
+ return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
ExportSymbols == RHS->getExportSymbols();
}
+
unsigned getHashValue() const {
- return hash_combine(Scope, File, Name, Line);
+ return hash_combine(Scope, Name);
}
};
@@ -710,6 +771,7 @@ template <> struct MDNodeKeyImpl<DIModule> {
MDString *ConfigurationMacros;
MDString *IncludePath;
MDString *ISysRoot;
+
MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *ConfigurationMacros,
MDString *IncludePath, MDString *ISysRoot)
: Scope(Scope), Name(Name), ConfigurationMacros(ConfigurationMacros),
@@ -725,6 +787,7 @@ template <> struct MDNodeKeyImpl<DIModule> {
IncludePath == RHS->getRawIncludePath() &&
ISysRoot == RHS->getRawISysRoot();
}
+
unsigned getHashValue() const {
return hash_combine(Scope, Name,
ConfigurationMacros, IncludePath, ISysRoot);
@@ -742,6 +805,7 @@ template <> struct MDNodeKeyImpl<DITemplateTypeParameter> {
bool isKeyOf(const DITemplateTypeParameter *RHS) const {
return Name == RHS->getRawName() && Type == RHS->getRawType();
}
+
unsigned getHashValue() const { return hash_combine(Name, Type); }
};
@@ -761,6 +825,7 @@ template <> struct MDNodeKeyImpl<DITemplateValueParameter> {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
Type == RHS->getRawType() && Value == RHS->getValue();
}
+
unsigned getHashValue() const { return hash_combine(Tag, Name, Type, Value); }
};
@@ -803,6 +868,7 @@ template <> struct MDNodeKeyImpl<DIGlobalVariable> {
RHS->getRawStaticDataMemberDeclaration() &&
AlignInBits == RHS->getAlignInBits();
}
+
unsigned getHashValue() const {
// We do not use AlignInBits in hashing function here on purpose:
// in most cases this param for local variable is zero (for function param
@@ -843,6 +909,7 @@ template <> struct MDNodeKeyImpl<DILocalVariable> {
Type == RHS->getRawType() && Arg == RHS->getArg() &&
Flags == RHS->getFlags() && AlignInBits == RHS->getAlignInBits();
}
+
unsigned getHashValue() const {
// We do not use AlignInBits in hashing function here on purpose:
// in most cases this param for local variable is zero (for function param
@@ -864,6 +931,7 @@ template <> struct MDNodeKeyImpl<DIExpression> {
bool isKeyOf(const DIExpression *RHS) const {
return Elements == RHS->getElements();
}
+
unsigned getHashValue() const {
return hash_combine_range(Elements.begin(), Elements.end());
}
@@ -882,6 +950,7 @@ template <> struct MDNodeKeyImpl<DIGlobalVariableExpression> {
return Variable == RHS->getRawVariable() &&
Expression == RHS->getRawExpression();
}
+
unsigned getHashValue() const { return hash_combine(Variable, Expression); }
};
@@ -910,6 +979,7 @@ template <> struct MDNodeKeyImpl<DIObjCProperty> {
SetterName == RHS->getRawSetterName() &&
Attributes == RHS->getAttributes() && Type == RHS->getRawType();
}
+
unsigned getHashValue() const {
return hash_combine(Name, File, Line, GetterName, SetterName, Attributes,
Type);
@@ -920,23 +990,26 @@ template <> struct MDNodeKeyImpl<DIImportedEntity> {
unsigned Tag;
Metadata *Scope;
Metadata *Entity;
+ Metadata *File;
unsigned Line;
MDString *Name;
- MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, unsigned Line,
- MDString *Name)
- : Tag(Tag), Scope(Scope), Entity(Entity), Line(Line), Name(Name) {}
+ MDNodeKeyImpl(unsigned Tag, Metadata *Scope, Metadata *Entity, Metadata *File,
+ unsigned Line, MDString *Name)
+ : Tag(Tag), Scope(Scope), Entity(Entity), File(File), Line(Line),
+ Name(Name) {}
MDNodeKeyImpl(const DIImportedEntity *N)
: Tag(N->getTag()), Scope(N->getRawScope()), Entity(N->getRawEntity()),
- Line(N->getLine()), Name(N->getRawName()) {}
+ File(N->getRawFile()), Line(N->getLine()), Name(N->getRawName()) {}
bool isKeyOf(const DIImportedEntity *RHS) const {
return Tag == RHS->getTag() && Scope == RHS->getRawScope() &&
- Entity == RHS->getRawEntity() && Line == RHS->getLine() &&
- Name == RHS->getRawName();
+ Entity == RHS->getRawEntity() && File == RHS->getFile() &&
+ Line == RHS->getLine() && Name == RHS->getRawName();
}
+
unsigned getHashValue() const {
- return hash_combine(Tag, Scope, Entity, Line, Name);
+ return hash_combine(Tag, Scope, Entity, File, Line, Name);
}
};
@@ -956,6 +1029,7 @@ template <> struct MDNodeKeyImpl<DIMacro> {
return MIType == RHS->getMacinfoType() && Line == RHS->getLine() &&
Name == RHS->getRawName() && Value == RHS->getRawValue();
}
+
unsigned getHashValue() const {
return hash_combine(MIType, Line, Name, Value);
}
@@ -978,6 +1052,7 @@ template <> struct MDNodeKeyImpl<DIMacroFile> {
return MIType == RHS->getMacinfoType() && Line == RHS->getLine() &&
File == RHS->getRawFile() && Elements == RHS->getRawElements();
}
+
unsigned getHashValue() const {
return hash_combine(MIType, Line, File, Elements);
}
@@ -985,23 +1060,29 @@ template <> struct MDNodeKeyImpl<DIMacroFile> {
/// \brief DenseMapInfo for MDNode subclasses.
template <class NodeTy> struct MDNodeInfo {
- typedef MDNodeKeyImpl<NodeTy> KeyTy;
- typedef MDNodeSubsetEqualImpl<NodeTy> SubsetEqualTy;
+ using KeyTy = MDNodeKeyImpl<NodeTy>;
+ using SubsetEqualTy = MDNodeSubsetEqualImpl<NodeTy>;
+
static inline NodeTy *getEmptyKey() {
return DenseMapInfo<NodeTy *>::getEmptyKey();
}
+
static inline NodeTy *getTombstoneKey() {
return DenseMapInfo<NodeTy *>::getTombstoneKey();
}
+
static unsigned getHashValue(const KeyTy &Key) { return Key.getHashValue(); }
+
static unsigned getHashValue(const NodeTy *N) {
return KeyTy(N).getHashValue();
}
+
static bool isEqual(const KeyTy &LHS, const NodeTy *RHS) {
if (RHS == getEmptyKey() || RHS == getTombstoneKey())
return false;
return SubsetEqualTy::isSubsetEqual(LHS, RHS) || LHS.isKeyOf(RHS);
}
+
static bool isEqual(const NodeTy *LHS, const NodeTy *RHS) {
if (LHS == RHS)
return true;
@@ -1011,7 +1092,7 @@ template <class NodeTy> struct MDNodeInfo {
}
};
-#define HANDLE_MDNODE_LEAF(CLASS) typedef MDNodeInfo<CLASS> CLASS##Info;
+#define HANDLE_MDNODE_LEAF(CLASS) using CLASS##Info = MDNodeInfo<CLASS>;
#include "llvm/IR/Metadata.def"
/// \brief Map-like storage for metadata attachments.
@@ -1084,28 +1165,29 @@ public:
/// will be automatically deleted if this context is deleted.
SmallPtrSet<Module*, 4> OwnedModules;
- LLVMContext::InlineAsmDiagHandlerTy InlineAsmDiagHandler;
- void *InlineAsmDiagContext;
-
- LLVMContext::DiagnosticHandlerTy DiagnosticHandler;
- void *DiagnosticContext;
- bool RespectDiagnosticFilters;
- bool DiagnosticHotnessRequested;
+ LLVMContext::InlineAsmDiagHandlerTy InlineAsmDiagHandler = nullptr;
+ void *InlineAsmDiagContext = nullptr;
+
+ LLVMContext::DiagnosticHandlerTy DiagnosticHandler = nullptr;
+ void *DiagnosticContext = nullptr;
+ bool RespectDiagnosticFilters = false;
+ bool DiagnosticsHotnessRequested = false;
+ uint64_t DiagnosticsHotnessThreshold = 0;
std::unique_ptr<yaml::Output> DiagnosticsOutputFile;
- LLVMContext::YieldCallbackTy YieldCallback;
- void *YieldOpaqueHandle;
+ LLVMContext::YieldCallbackTy YieldCallback = nullptr;
+ void *YieldOpaqueHandle = nullptr;
- typedef DenseMap<APInt, std::unique_ptr<ConstantInt>, DenseMapAPIntKeyInfo>
- IntMapTy;
+ using IntMapTy =
+ DenseMap<APInt, std::unique_ptr<ConstantInt>, DenseMapAPIntKeyInfo>;
IntMapTy IntConstants;
- typedef DenseMap<APFloat, std::unique_ptr<ConstantFP>, DenseMapAPFloatKeyInfo>
- FPMapTy;
+ using FPMapTy =
+ DenseMap<APFloat, std::unique_ptr<ConstantFP>, DenseMapAPFloatKeyInfo>;
FPMapTy FPConstants;
FoldingSet<AttributeImpl> AttrsSet;
- FoldingSet<AttributeSetImpl> AttrsLists;
+ FoldingSet<AttributeListImpl> AttrsLists;
FoldingSet<AttributeSetNode> AttrsSetNodes;
StringMap<MDString, BumpPtrAllocator> MDStringCache;
@@ -1129,13 +1211,13 @@ public:
DenseMap<Type *, std::unique_ptr<ConstantAggregateZero>> CAZConstants;
- typedef ConstantUniqueMap<ConstantArray> ArrayConstantsTy;
+ using ArrayConstantsTy = ConstantUniqueMap<ConstantArray>;
ArrayConstantsTy ArrayConstants;
- typedef ConstantUniqueMap<ConstantStruct> StructConstantsTy;
+ using StructConstantsTy = ConstantUniqueMap<ConstantStruct>;
StructConstantsTy StructConstants;
- typedef ConstantUniqueMap<ConstantVector> VectorConstantsTy;
+ using VectorConstantsTy = ConstantUniqueMap<ConstantVector>;
VectorConstantsTy VectorConstants;
DenseMap<PointerType *, std::unique_ptr<ConstantPointerNull>> CPNConstants;
@@ -1150,8 +1232,8 @@ public:
ConstantUniqueMap<InlineAsm> InlineAsms;
- ConstantInt *TheTrueVal;
- ConstantInt *TheFalseVal;
+ ConstantInt *TheTrueVal = nullptr;
+ ConstantInt *TheFalseVal = nullptr;
std::unique_ptr<ConstantTokenNone> TheNoneToken;
@@ -1159,7 +1241,6 @@ public:
Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy, TokenTy;
Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy;
IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty;
-
/// TypeAllocator - All dynamically allocated types are allocated from this.
/// They live forever until the context is torn down.
@@ -1167,23 +1248,22 @@ public:
DenseMap<unsigned, IntegerType*> IntegerTypes;
- typedef DenseSet<FunctionType *, FunctionTypeKeyInfo> FunctionTypeSet;
+ using FunctionTypeSet = DenseSet<FunctionType *, FunctionTypeKeyInfo>;
FunctionTypeSet FunctionTypes;
- typedef DenseSet<StructType *, AnonStructTypeKeyInfo> StructTypeSet;
+ using StructTypeSet = DenseSet<StructType *, AnonStructTypeKeyInfo>;
StructTypeSet AnonStructTypes;
StringMap<StructType*> NamedStructTypes;
- unsigned NamedStructTypesUniqueID;
+ unsigned NamedStructTypesUniqueID = 0;
DenseMap<std::pair<Type *, uint64_t>, ArrayType*> ArrayTypes;
DenseMap<std::pair<Type *, unsigned>, VectorType*> VectorTypes;
DenseMap<Type*, PointerType*> PointerTypes; // Pointers in AddrSpace = 0
DenseMap<std::pair<Type*, unsigned>, PointerType*> ASPointerTypes;
-
/// ValueHandles - This map keeps track of all of the value handles that are
/// watching a Value*. The Value::HasValueHandle bit is used to know
/// whether or not a value has an entry in this map.
- typedef DenseMap<Value*, ValueHandleBase*> ValueHandlesTy;
+ using ValueHandlesTy = DenseMap<Value *, ValueHandleBase *>;
ValueHandlesTy ValueHandles;
/// CustomMDKindNames - Map to hold the metadata string to ID mapping.
@@ -1219,6 +1299,20 @@ public:
void getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const;
uint32_t getOperandBundleTagID(StringRef Tag) const;
+ /// A set of interned synchronization scopes. The StringMap maps
+ /// synchronization scope names to their respective synchronization scope IDs.
+ StringMap<SyncScope::ID> SSC;
+
+ /// getOrInsertSyncScopeID - Maps synchronization scope name to
+ /// synchronization scope ID. Every synchronization scope registered with
+ /// LLVMContext has a unique ID, except for the pre-defined ones.
+ SyncScope::ID getOrInsertSyncScopeID(StringRef SSN);
+
+ /// getSyncScopeNames - Populates client supplied SmallVector with
+ /// synchronization scope names registered with LLVMContext. Synchronization
+ /// scope names are ordered by increasing synchronization scope IDs.
+ void getSyncScopeNames(SmallVectorImpl<StringRef> &SSNs) const;
+
/// Maintain the GC name for each function.
///
/// This saves allocating an additional word in Function for programs which
@@ -1241,6 +1335,6 @@ public:
OptBisect &getOptBisect();
};
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_IR_LLVMCONTEXTIMPL_H
diff --git a/contrib/llvm/lib/IR/LegacyPassManager.cpp b/contrib/llvm/lib/IR/LegacyPassManager.cpp
index 628a67bd..995e1e5 100644
--- a/contrib/llvm/lib/IR/LegacyPassManager.cpp
+++ b/contrib/llvm/lib/IR/LegacyPassManager.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManagers.h"
@@ -465,6 +466,11 @@ public:
// null. It may be called multiple times.
static void createTheTimeInfo();
+ // print - Prints out timing information and then resets the timers.
+ void print() {
+ TG.print(*CreateInfoOutputFile());
+ }
+
/// getPassTimer - Return the timer for the specified pass if it exists.
Timer *getPassTimer(Pass *P) {
if (P->getAsPMDataManager())
@@ -587,7 +593,7 @@ AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
assert(Node && "cached analysis usage must be non null");
AnUsageMap[P] = &Node->AU;
- AnUsage = &Node->AU;;
+ AnUsage = &Node->AU;
}
return AnUsage;
}
@@ -619,21 +625,21 @@ void PMTopLevelManager::schedulePass(Pass *P) {
checkAnalysis = false;
const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
- for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
- E = RequiredSet.end(); I != E; ++I) {
+ for (const AnalysisID ID : RequiredSet) {
- Pass *AnalysisPass = findAnalysisPass(*I);
+ Pass *AnalysisPass = findAnalysisPass(ID);
if (!AnalysisPass) {
- const PassInfo *PI = findAnalysisPassInfo(*I);
+ const PassInfo *PI = findAnalysisPassInfo(ID);
if (!PI) {
// Pass P is not in the global PassRegistry
dbgs() << "Pass '" << P->getPassName() << "' is not initialized." << "\n";
dbgs() << "Verify if there is a pass dependency cycle." << "\n";
dbgs() << "Required Passes:" << "\n";
- for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
- E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
- Pass *AnalysisPass2 = findAnalysisPass(*I2);
+ for (const AnalysisID ID2 : RequiredSet) {
+ if (ID == ID2)
+ break;
+ Pass *AnalysisPass2 = findAnalysisPass(ID2);
if (AnalysisPass2) {
dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
} else {
@@ -1064,17 +1070,15 @@ void PMDataManager::collectRequiredAndUsedAnalyses(
void PMDataManager::initializeAnalysisImpl(Pass *P) {
AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
- for (AnalysisUsage::VectorType::const_iterator
- I = AnUsage->getRequiredSet().begin(),
- E = AnUsage->getRequiredSet().end(); I != E; ++I) {
- Pass *Impl = findAnalysisPass(*I, true);
+ for (const AnalysisID ID : AnUsage->getRequiredSet()) {
+ Pass *Impl = findAnalysisPass(ID, true);
if (!Impl)
// This may be analysis pass that is initialized on the fly.
// If that is not the case then it will raise an assert when it is used.
continue;
AnalysisResolver *AR = P->getResolver();
assert(AR && "Analysis Resolver is not set");
- AR->addAnalysisImplsPair(*I, Impl);
+ AR->addAnalysisImplsPair(ID, Impl);
}
}
@@ -1106,21 +1110,19 @@ void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
TPM->collectLastUses(LUses, P);
- for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
- E = LUses.end(); I != E; ++I) {
+ for (Pass *P : LUses) {
dbgs() << "--" << std::string(Offset*2, ' ');
- (*I)->dumpPassStructure(0);
+ P->dumpPassStructure(0);
}
}
void PMDataManager::dumpPassArguments() const {
- for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
- E = PassVector.end(); I != E; ++I) {
- if (PMDataManager *PMD = (*I)->getAsPMDataManager())
+ for (Pass *P : PassVector) {
+ if (PMDataManager *PMD = P->getAsPMDataManager())
PMD->dumpPassArguments();
else
if (const PassInfo *PI =
- TPM->findAnalysisPassInfo((*I)->getPassID()))
+ TPM->findAnalysisPassInfo(P->getPassID()))
if (!PI->isAnalysisGroup())
dbgs() << " -" << PI->getPassArgument();
}
@@ -1249,9 +1251,8 @@ Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
// Destructor
PMDataManager::~PMDataManager() {
- for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
- E = PassVector.end(); I != E; ++I)
- delete *I;
+ for (Pass *P : PassVector)
+ delete P;
}
//===----------------------------------------------------------------------===//
@@ -1278,35 +1279,35 @@ bool BBPassManager::runOnFunction(Function &F) {
bool Changed = doInitialization(F);
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ for (BasicBlock &BB : F)
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
BasicBlockPass *BP = getContainedPass(Index);
bool LocalChanged = false;
- dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
+ dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, BB.getName());
dumpRequiredSet(BP);
initializeAnalysisImpl(BP);
{
// If the pass crashes, remember this.
- PassManagerPrettyStackEntry X(BP, *I);
+ PassManagerPrettyStackEntry X(BP, BB);
TimeRegion PassTimer(getPassTimer(BP));
- LocalChanged |= BP->runOnBasicBlock(*I);
+ LocalChanged |= BP->runOnBasicBlock(BB);
}
Changed |= LocalChanged;
if (LocalChanged)
dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
- I->getName());
+ BB.getName());
dumpPreservedSet(BP);
dumpUsedSet(BP);
verifyPreservedAnalysis(BP);
removeNotPreservedAnalysis(BP);
recordAvailableAnalysis(BP);
- removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
+ removeDeadPasses(BP, BB.getName(), ON_BASICBLOCK_MSG);
}
return doFinalization(F) || Changed;
@@ -1752,6 +1753,13 @@ Timer *llvm::getPassTimer(Pass *P) {
return nullptr;
}
+/// If timing is enabled, report the times collected up to now and then reset
+/// them.
+void llvm::reportAndResetTimings() {
+ if (TheTimeInfo)
+ TheTimeInfo->print();
+}
+
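// Illustrative sketch, not part of this patch: a driver that runs several
// pipelines back to back can now flush -time-passes data between runs
// instead of getting a single aggregate dump at exit.
legacy::PassManager PM;
// ... populate PM with passes and run it over an assumed Module M ...
PM.run(M);
reportAndResetTimings();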
//===----------------------------------------------------------------------===//
// PMStack implementation
//
diff --git a/contrib/llvm/lib/IR/MDBuilder.cpp b/contrib/llvm/lib/IR/MDBuilder.cpp
index f4bfd59..b9c4f48 100644
--- a/contrib/llvm/lib/IR/MDBuilder.cpp
+++ b/contrib/llvm/lib/IR/MDBuilder.cpp
@@ -56,11 +56,16 @@ MDNode *MDBuilder::createUnpredictable() {
return MDNode::get(Context, None);
}
-MDNode *MDBuilder::createFunctionEntryCount(uint64_t Count) {
+MDNode *MDBuilder::createFunctionEntryCount(
+ uint64_t Count, const DenseSet<GlobalValue::GUID> *Imports) {
Type *Int64Ty = Type::getInt64Ty(Context);
- return MDNode::get(Context,
- {createString("function_entry_count"),
- createConstant(ConstantInt::get(Int64Ty, Count))});
+ SmallVector<Metadata *, 8> Ops;
+ Ops.push_back(createString("function_entry_count"));
+ Ops.push_back(createConstant(ConstantInt::get(Int64Ty, Count)));
+ if (Imports)
+ for (auto ID : *Imports)
+ Ops.push_back(createConstant(ConstantInt::get(Int64Ty, ID)));
+ return MDNode::get(Context, Ops);
}
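// Illustrative sketch, not part of this patch: the new optional Imports set
// lets ThinLTO record the GUIDs of functions imported into a hot function.
// F, Ctx, the count, and the GUIDs below are all made up for illustration.
MDBuilder MDB(Ctx);
DenseSet<GlobalValue::GUID> Imports = {0x1234, 0x5678};
F->setMetadata(LLVMContext::MD_prof,
               MDB.createFunctionEntryCount(/*Count=*/1000, &Imports));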
MDNode *MDBuilder::createFunctionSectionPrefix(StringRef Prefix) {
diff --git a/contrib/llvm/lib/IR/Mangler.cpp b/contrib/llvm/lib/IR/Mangler.cpp
index 41e11b3..03723bf 100644
--- a/contrib/llvm/lib/IR/Mangler.cpp
+++ b/contrib/llvm/lib/IR/Mangler.cpp
@@ -13,6 +13,7 @@
#include "llvm/IR/Mangler.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -172,3 +173,34 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
raw_svector_ostream OS(OutName);
getNameWithPrefix(OS, GV, CannotUsePrivateLabel);
}
+
+void llvm::emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV,
+ const Triple &TT, Mangler &Mangler) {
+ if (!GV->hasDLLExportStorageClass() || GV->isDeclaration())
+ return;
+
+ if (TT.isKnownWindowsMSVCEnvironment())
+ OS << " /EXPORT:";
+ else
+ OS << " -export:";
+
+ if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) {
+ std::string Flag;
+ raw_string_ostream FlagOS(Flag);
+ Mangler.getNameWithPrefix(FlagOS, GV, false);
+ FlagOS.flush();
+ if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix())
+ OS << Flag.substr(1);
+ else
+ OS << Flag;
+ } else {
+ Mangler.getNameWithPrefix(OS, GV, false);
+ }
+
+ if (!GV->getValueType()->isFunctionTy()) {
+ if (TT.isKnownWindowsMSVCEnvironment())
+ OS << ",DATA";
+ else
+ OS << ",data";
+ }
+}
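// Illustrative sketch, not part of this patch: for a dllexport'ed global
// variable the helper above appends roughly " /EXPORT:foo,DATA" on MSVC
// triples and " -export:foo,data" on MinGW/Cygwin. GV and M are assumed.
std::string Flags;
raw_string_ostream FlagsOS(Flags);
Mangler Mang;
emitLinkerFlagsForGlobalCOFF(FlagsOS, GV, Triple(M.getTargetTriple()), Mang);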
diff --git a/contrib/llvm/lib/IR/Metadata.cpp b/contrib/llvm/lib/IR/Metadata.cpp
index 1d19304..ac02ff7 100644
--- a/contrib/llvm/lib/IR/Metadata.cpp
+++ b/contrib/llvm/lib/IR/Metadata.cpp
@@ -11,20 +11,52 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Metadata.h"
#include "LLVMContextImpl.h"
#include "MetadataImpl.h"
#include "SymbolTableListTraitsImpl.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/TrackingMDRef.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -203,7 +235,7 @@ void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) {
return;
// Copy out uses since UseMap will get touched below.
- typedef std::pair<void *, std::pair<OwnerTy, uint64_t>> UseTy;
+ using UseTy = std::pair<void *, std::pair<OwnerTy, uint64_t>>;
SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end());
std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
return L.second.second < R.second.second;
@@ -256,7 +288,7 @@ void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) {
}
// Copy out uses since UseMap could get touched below.
- typedef std::pair<void *, std::pair<OwnerTy, uint64_t>> UseTy;
+ using UseTy = std::pair<void *, std::pair<OwnerTy, uint64_t>>;
SmallVector<UseTy, 8> Uses(UseMap.begin(), UseMap.end());
std::sort(Uses.begin(), Uses.end(), [](const UseTy &L, const UseTy &R) {
return L.second.second < R.second.second;
@@ -728,8 +760,8 @@ static T *uniquifyImpl(T *N, DenseSet<T *, InfoT> &Store) {
}
template <class NodeTy> struct MDNode::HasCachedHash {
- typedef char Yes[1];
- typedef char No[2];
+ using Yes = char[1];
+ using No = char[2];
template <class U, U Val> struct SFINAE {};
template <class U>
@@ -937,7 +969,7 @@ static void addRange(SmallVectorImpl<ConstantInt *> &EndPoints,
MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
// Given two ranges, we want to compute the union of the ranges. This
- // is slightly complitade by having to combine the intervals and merge
+ // is slightly complicated by having to combine the intervals and merge
// the ones that overlap.
if (!A || !B)
@@ -946,7 +978,7 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
if (A == B)
return A;
- // First, walk both lists in older of the lower boundary of each interval.
+ // First, walk both lists in order of the lower boundary of each interval.
// At each step, try to merge the new interval into the last one we added.
SmallVector<ConstantInt *, 4> EndPoints;
int AI = 0;
@@ -1027,8 +1059,7 @@ static SmallVector<TrackingMDRef, 4> &getNMDOps(void *Operands) {
}
NamedMDNode::NamedMDNode(const Twine &N)
- : Name(N.str()), Parent(nullptr),
- Operands(new SmallVector<TrackingMDRef, 4>()) {}
+ : Name(N.str()), Operands(new SmallVector<TrackingMDRef, 4>()) {}
NamedMDNode::~NamedMDNode() {
dropAllReferences();
@@ -1308,17 +1339,26 @@ bool Instruction::extractProfTotalWeight(uint64_t &TotalVal) const {
return false;
auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
- if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
+ if (!ProfDataName)
return false;
- TotalVal = 0;
- for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
- auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i));
- if (!V)
- return false;
- TotalVal += V->getValue().getZExtValue();
+ if (ProfDataName->getString().equals("branch_weights")) {
+ TotalVal = 0;
+ for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
+ auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i));
+ if (!V)
+ return false;
+ TotalVal += V->getValue().getZExtValue();
+ }
+ return true;
+ } else if (ProfDataName->getString().equals("VP") &&
+ ProfileData->getNumOperands() > 3) {
+ TotalVal = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2))
+ ->getValue()
+ .getZExtValue();
+ return true;
}
- return true;
+ return false;
}
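// Illustrative sketch, not part of this patch: callers stay the same, but
// the query now also succeeds on value-profile metadata, where operand 2 of
// !{!"VP", ...} carries the total count. I is an assumed Instruction.
uint64_t Total;
if (I.extractProfTotalWeight(Total))
  errs() << "total profile weight: " << Total << "\n";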
void Instruction::clearMetadataHashEntries() {
@@ -1432,7 +1472,7 @@ void GlobalObject::copyMetadata(const GlobalObject *Other, unsigned Offset) {
if (E)
OrigElements = E->getElements();
std::vector<uint64_t> Elements(OrigElements.size() + 2);
- Elements[0] = dwarf::DW_OP_plus;
+ Elements[0] = dwarf::DW_OP_plus_uconst;
Elements[1] = Offset;
std::copy(OrigElements.begin(), OrigElements.end(), Elements.begin() + 2);
E = DIExpression::get(getContext(), Elements);
@@ -1446,7 +1486,7 @@ void GlobalObject::addTypeMetadata(unsigned Offset, Metadata *TypeID) {
addMetadata(
LLVMContext::MD_type,
*MDTuple::get(getContext(),
- {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+ {ConstantAsMetadata::get(ConstantInt::get(
Type::getInt64Ty(getContext()), Offset)),
TypeID}));
}
@@ -1459,6 +1499,15 @@ DISubprogram *Function::getSubprogram() const {
return cast_or_null<DISubprogram>(getMetadata(LLVMContext::MD_dbg));
}
+bool Function::isDebugInfoForProfiling() const {
+ if (DISubprogram *SP = getSubprogram()) {
+ if (DICompileUnit *CU = SP->getUnit()) {
+ return CU->getDebugInfoForProfiling();
+ }
+ }
+ return false;
+}
+
void GlobalVariable::addDebugInfo(DIGlobalVariableExpression *GV) {
addMetadata(LLVMContext::MD_dbg, *GV);
}
diff --git a/contrib/llvm/lib/IR/Module.cpp b/contrib/llvm/lib/IR/Module.cpp
index 1911f84..c230a50 100644
--- a/contrib/llvm/lib/IR/Module.cpp
+++ b/contrib/llvm/lib/IR/Module.cpp
@@ -1,4 +1,4 @@
-//===-- Module.cpp - Implement the Module class ---------------------------===//
+//===- Module.cpp - Implement the Module class ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,25 +13,44 @@
#include "llvm/IR/Module.h"
#include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GVMaterializer.h"
-#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/SymbolTableListTraits.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/RandomNumberGenerator.h"
#include <algorithm>
-#include <cstdarg>
-#include <cstdlib>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -69,7 +88,7 @@ Module::~Module() {
delete static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab);
}
-RandomNumberGenerator *Module::createRNG(const Pass* P) const {
+std::unique_ptr<RandomNumberGenerator> Module::createRNG(const Pass* P) const {
SmallString<32> Salt(P->getPassName());
// This RNG is guaranteed to produce the same random stream only
@@ -84,7 +103,7 @@ RandomNumberGenerator *Module::createRNG(const Pass* P) const {
// store salt metadata from the Module constructor.
Salt += sys::path::filename(getModuleIdentifier());
- return new RandomNumberGenerator(Salt);
+ return std::unique_ptr<RandomNumberGenerator>(new RandomNumberGenerator(Salt));
}
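// Illustrative sketch, not part of this patch: ownership is now explicit at
// the call site, so the old manual delete disappears (P is the requesting
// Pass; the RNG models UniformRandomBitGenerator, hence operator()).
std::unique_ptr<RandomNumberGenerator> RNG = M.createRNG(P);
uint64_t Bits = (*RNG)();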
/// getNamedValue - Return the first global value in the module with
@@ -120,9 +139,8 @@ void Module::getOperandBundleTags(SmallVectorImpl<StringRef> &Result) const {
// it. This is nice because it allows most passes to get away with not handling
// the symbol table directly for this common task.
//
-Constant *Module::getOrInsertFunction(StringRef Name,
- FunctionType *Ty,
- AttributeSet AttributeList) {
+Constant *Module::getOrInsertFunction(StringRef Name, FunctionType *Ty,
+ AttributeList AttributeList) {
// See if we have a definition for the specified function already.
GlobalValue *F = getNamedValue(Name);
if (!F) {
@@ -145,49 +163,7 @@ Constant *Module::getOrInsertFunction(StringRef Name,
Constant *Module::getOrInsertFunction(StringRef Name,
FunctionType *Ty) {
- return getOrInsertFunction(Name, Ty, AttributeSet());
-}
-
-// getOrInsertFunction - Look up the specified function in the module symbol
-// table. If it does not exist, add a prototype for the function and return it.
-// This version of the method takes a null terminated list of function
-// arguments, which makes it easier for clients to use.
-//
-Constant *Module::getOrInsertFunction(StringRef Name,
- AttributeSet AttributeList,
- Type *RetTy, ...) {
- va_list Args;
- va_start(Args, RetTy);
-
- // Build the list of argument types...
- std::vector<Type*> ArgTys;
- while (Type *ArgTy = va_arg(Args, Type*))
- ArgTys.push_back(ArgTy);
-
- va_end(Args);
-
- // Build the function type and chain to the other getOrInsertFunction...
- return getOrInsertFunction(Name,
- FunctionType::get(RetTy, ArgTys, false),
- AttributeList);
-}
-
-Constant *Module::getOrInsertFunction(StringRef Name,
- Type *RetTy, ...) {
- va_list Args;
- va_start(Args, RetTy);
-
- // Build the list of argument types...
- std::vector<Type*> ArgTys;
- while (Type *ArgTy = va_arg(Args, Type*))
- ArgTys.push_back(ArgTy);
-
- va_end(Args);
-
- // Build the function type and chain to the other getOrInsertFunction...
- return getOrInsertFunction(Name,
- FunctionType::get(RetTy, ArgTys, false),
- AttributeSet());
+ return getOrInsertFunction(Name, Ty, AttributeList());
}
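// Illustrative sketch, not part of this patch: with the variadic overloads
// gone, callers build the FunctionType explicitly. The callee name below is
// made up.
FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx),
                                     {Type::getInt32Ty(Ctx)},
                                     /*isVarArg=*/false);
Constant *Callee = M.getOrInsertFunction("hypothetical_hook", FT);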
// getFunction - Look up the specified function in the module symbol table.
@@ -208,7 +184,8 @@ Function *Module::getFunction(StringRef Name) const {
/// If AllowLocal is set to true, this function will return globals that
/// have local linkage. By default, these are not returned.
///
-GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
+GlobalVariable *Module::getGlobalVariable(StringRef Name,
+ bool AllowLocal) const {
if (GlobalVariable *Result =
dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
if (AllowLocal || !Result->hasLocalLinkage())
@@ -465,6 +442,14 @@ void Module::dropAllReferences() {
GIF.dropAllReferences();
}
+unsigned Module::getNumberRegisterParameters() const {
+ auto *Val =
+ cast_or_null<ConstantAsMetadata>(getModuleFlag("NumRegisterParameters"));
+ if (!Val)
+ return 0;
+ return cast<ConstantInt>(Val->getValue())->getZExtValue();
+}
+
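// Illustrative sketch, not part of this patch: the accessor above reads a
// module flag that a front end would set like this (the value 3 is made up).
M.addModuleFlag(Module::Error, "NumRegisterParameters", 3);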
unsigned Module::getDwarfVersion() const {
auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("Dwarf Version"));
if (!Val)
@@ -496,7 +481,7 @@ PICLevel::Level Module::getPICLevel() const {
}
void Module::setPICLevel(PICLevel::Level PL) {
- addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL);
+ addModuleFlag(ModFlagBehavior::Max, "PIC Level", PL);
}
PIELevel::Level Module::getPIELevel() const {
@@ -510,7 +495,7 @@ PIELevel::Level Module::getPIELevel() const {
}
void Module::setPIELevel(PIELevel::Level PL) {
- addModuleFlag(ModFlagBehavior::Error, "PIE Level", PL);
+ addModuleFlag(ModFlagBehavior::Max, "PIE Level", PL);
}
void Module::setProfileSummary(Metadata *M) {
diff --git a/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp b/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp
index 9072f4b..51c4bae 100644
--- a/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/contrib/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -16,61 +16,13 @@
#include "llvm/ADT/StringMap.h"
using namespace llvm;
-// Create the combined module index/summary from multiple
-// per-module instances.
-void ModuleSummaryIndex::mergeFrom(std::unique_ptr<ModuleSummaryIndex> Other,
- uint64_t NextModuleId) {
- if (Other->modulePaths().empty())
- return;
-
- assert(Other->modulePaths().size() == 1 &&
- "Can only merge from an single-module index at that time");
-
- StringRef OtherModPath = Other->modulePaths().begin()->first();
- StringRef ModPath = addModulePath(OtherModPath, NextModuleId,
- Other->getModuleHash(OtherModPath))
- ->first();
-
- for (auto &OtherGlobalValSummaryLists : *Other) {
- GlobalValue::GUID ValueGUID = OtherGlobalValSummaryLists.first;
- GlobalValueSummaryList &List = OtherGlobalValSummaryLists.second;
-
- // Assert that the value summary list only has one entry, since we shouldn't
- // have duplicate names within a single per-module index.
- assert(List.size() == 1);
- std::unique_ptr<GlobalValueSummary> Summary = std::move(List.front());
-
- // Note the module path string ref was copied above and is still owned by
- // the original per-module index. Reset it to the new module path
- // string reference owned by the combined index.
- Summary->setModulePath(ModPath);
-
- // Add new value summary to existing list. There may be duplicates when
- // combining GlobalValueMap entries, due to COMDAT values. Any local
- // values were given unique global IDs.
- addGlobalValueSummary(ValueGUID, std::move(Summary));
- }
-}
-
-void ModuleSummaryIndex::removeEmptySummaryEntries() {
- for (auto MI = begin(), MIE = end(); MI != MIE;) {
- // Only expect this to be called on a per-module index, which has a single
- // entry per value entry list.
- assert(MI->second.size() == 1);
- if (!MI->second[0])
- MI = GlobalValueMap.erase(MI);
- else
- ++MI;
- }
-}
-
// Collect, for the given module, the list of functions it defines
// (GUID -> Summary).
void ModuleSummaryIndex::collectDefinedFunctionsForModule(
StringRef ModulePath, GVSummaryMapTy &GVSummaryMap) const {
for (auto &GlobalList : *this) {
auto GUID = GlobalList.first;
- for (auto &GlobSummary : GlobalList.second) {
+ for (auto &GlobSummary : GlobalList.second.SummaryList) {
auto *Summary = dyn_cast_or_null<FunctionSummary>(GlobSummary.get());
if (!Summary)
// Ignore global variable, focus on functions
@@ -88,7 +40,7 @@ void ModuleSummaryIndex::collectDefinedGVSummariesPerModule(
StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries) const {
for (auto &GlobalList : *this) {
auto GUID = GlobalList.first;
- for (auto &Summary : GlobalList.second) {
+ for (auto &Summary : GlobalList.second.SummaryList) {
ModuleToDefinedGVSummaries[Summary->modulePath()][GUID] = Summary.get();
}
}
@@ -97,10 +49,23 @@ void ModuleSummaryIndex::collectDefinedGVSummariesPerModule(
GlobalValueSummary *
ModuleSummaryIndex::getGlobalValueSummary(uint64_t ValueGUID,
bool PerModuleIndex) const {
- auto SummaryList = findGlobalValueSummaryList(ValueGUID);
- assert(SummaryList != end() && "GlobalValue not found in index");
- assert((!PerModuleIndex || SummaryList->second.size() == 1) &&
+ auto VI = getValueInfo(ValueGUID);
+ assert(VI && "GlobalValue not found in index");
+ assert((!PerModuleIndex || VI.getSummaryList().size() == 1) &&
"Expected a single entry per global value in per-module index");
- auto &Summary = SummaryList->second[0];
+ auto &Summary = VI.getSummaryList()[0];
return Summary.get();
}
+
+bool ModuleSummaryIndex::isGUIDLive(GlobalValue::GUID GUID) const {
+ auto VI = getValueInfo(GUID);
+ if (!VI)
+ return true;
+ const auto &SummaryList = VI.getSummaryList();
+ if (SummaryList.empty())
+ return true;
+ for (auto &I : SummaryList)
+ if (isGlobalValueLive(I.get()))
+ return true;
+ return false;
+}
diff --git a/contrib/llvm/lib/IR/Operator.cpp b/contrib/llvm/lib/IR/Operator.cpp
index 2fba24d..7d819f3 100644
--- a/contrib/llvm/lib/IR/Operator.cpp
+++ b/contrib/llvm/lib/IR/Operator.cpp
@@ -1,4 +1,18 @@
+//===-- Operator.cpp - Implement the LLVM operators -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the non-inline methods for the LLVM Operator classes.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/IR/Operator.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
diff --git a/contrib/llvm/lib/IR/OptBisect.cpp b/contrib/llvm/lib/IR/OptBisect.cpp
index e9574ca..f1c7005 100644
--- a/contrib/llvm/lib/IR/OptBisect.cpp
+++ b/contrib/llvm/lib/IR/OptBisect.cpp
@@ -13,11 +13,12 @@
///
//===----------------------------------------------------------------------===//
+#include "llvm/IR/OptBisect.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/OptBisect.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
@@ -39,14 +40,6 @@ static void printPassMessage(const StringRef &Name, int PassNum,
<< "(" << PassNum << ") " << Name << " on " << TargetDesc << "\n";
}
-static void printCaseMessage(int CaseNum, StringRef Msg, bool Running) {
- if (Running)
- errs() << "BISECT: running case (";
- else
- errs() << "BISECT: NOT running case (";
- errs() << CaseNum << "): " << Msg << "\n";
-}
-
static std::string getDescription(const Module &M) {
return "module (" + M.getName().str() + ")";
}
@@ -61,13 +54,20 @@ static std::string getDescription(const BasicBlock &BB) {
}
static std::string getDescription(const Loop &L) {
- // FIXME: I'd like to be able to provide a better description here, but
- // calling L->getHeader() would introduce a new dependency on the
- // LLVMCore library.
+ // FIXME: Move into LoopInfo so we can get a better description
+ // (and avoid a circular dependency between IR and Analysis).
return "loop";
}
+static std::string getDescription(const Region &R) {
+ // FIXME: Move into RegionInfo so we can get a better description
+ // (and avoid a circular dependency between IR and Analysis).
+ return "region";
+}
+
static std::string getDescription(const CallGraphSCC &SCC) {
+ // FIXME: Move into CallGraphSCCPass to avoid circular dependency between
+ // IR and Analysis.
std::string Desc = "SCC (";
bool First = true;
for (CallGraphNode *CGN : SCC) {
@@ -91,6 +91,7 @@ template bool OptBisect::shouldRunPass(const Pass *, const Function &);
template bool OptBisect::shouldRunPass(const Pass *, const BasicBlock &);
template bool OptBisect::shouldRunPass(const Pass *, const Loop &);
template bool OptBisect::shouldRunPass(const Pass *, const CallGraphSCC &);
+template bool OptBisect::shouldRunPass(const Pass *, const Region &);
template <class UnitT>
bool OptBisect::shouldRunPass(const Pass *P, const UnitT &U) {
@@ -108,13 +109,3 @@ bool OptBisect::checkPass(const StringRef PassName,
printPassMessage(PassName, CurBisectNum, TargetDesc, ShouldRun);
return ShouldRun;
}
-
-bool OptBisect::shouldRunCase(const Twine &Msg) {
- if (!BisectEnabled)
- return true;
- int CurFuelNum = ++LastBisectNum;
- bool ShouldRun = (OptBisectLimit == -1 || CurFuelNum <= OptBisectLimit);
- printCaseMessage(CurFuelNum, Msg.str(), ShouldRun);
- return ShouldRun;
-}
-
diff --git a/contrib/llvm/lib/IR/Pass.cpp b/contrib/llvm/lib/IR/Pass.cpp
index a42945e..f1b5f2f 100644
--- a/contrib/llvm/lib/IR/Pass.cpp
+++ b/contrib/llvm/lib/IR/Pass.cpp
@@ -118,10 +118,12 @@ void Pass::print(raw_ostream &O,const Module*) const {
O << "Pass::print not implemented for pass: '" << getPassName() << "'!\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// dump - call print(cerr);
LLVM_DUMP_METHOD void Pass::dump() const {
print(dbgs(), nullptr);
}
+#endif
//===----------------------------------------------------------------------===//
// ImmutablePass Implementation
diff --git a/contrib/llvm/lib/IR/PassManager.cpp b/contrib/llvm/lib/IR/PassManager.cpp
index 8f68bb1..47fdfed 100644
--- a/contrib/llvm/lib/IR/PassManager.cpp
+++ b/contrib/llvm/lib/IR/PassManager.cpp
@@ -91,4 +91,6 @@ bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
}
}
+AnalysisSetKey CFGAnalyses::SetKey;
+
AnalysisSetKey PreservedAnalyses::AllAnalysesKey;
diff --git a/contrib/llvm/lib/IR/PassRegistry.cpp b/contrib/llvm/lib/IR/PassRegistry.cpp
index 584dee2..c0f6f07 100644
--- a/contrib/llvm/lib/IR/PassRegistry.cpp
+++ b/contrib/llvm/lib/IR/PassRegistry.cpp
@@ -105,8 +105,6 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
ImplementationInfo->getNormalCtor() &&
"Cannot specify pass as default if it does not have a default ctor");
InterfaceInfo->setNormalCtor(ImplementationInfo->getNormalCtor());
- InterfaceInfo->setTargetMachineCtor(
- ImplementationInfo->getTargetMachineCtor());
}
}
diff --git a/contrib/llvm/lib/IR/SafepointIRVerifier.cpp b/contrib/llvm/lib/IR/SafepointIRVerifier.cpp
new file mode 100644
index 0000000..8b328c2
--- /dev/null
+++ b/contrib/llvm/lib/IR/SafepointIRVerifier.cpp
@@ -0,0 +1,437 @@
+//===-- SafepointIRVerifier.cpp - Verify gc.statepoint invariants ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Run a sanity check on the IR to ensure that Safepoints - if they've been
+// inserted - were inserted correctly. In particular, look for use of
+// non-relocated values after a safepoint. Its primary use is to check the
+// correctness of safepoint insertion immediately after insertion, but it can
+// also be used to verify that later transforms have not found a way to break
+// safepoint semantics.
+//
+// In its current form, this verifier checks a property which is sufficient,
+// but not necessary for correctness. There are some cases where an unrelocated
+// pointer can be used after the safepoint. Consider this example:
+//
+// a = ...
+// b = ...
+// (a',b') = safepoint(a,b)
+// c = cmp eq a b
+// br c, ..., ....
+//
+// Because it is valid to reorder 'c' above the safepoint, this is legal. In
+// practice, this is a somewhat uncommon transform, but CodeGenPrep does create
+// idioms like this. The verifier knows about these cases and avoids reporting
+// false positives.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/SafepointIRVerifier.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "safepoint-ir-verifier"
+
+using namespace llvm;
+
+/// This option is used for writing test cases. Instead of crashing the program
+/// when verification fails, report a message to the console (for FileCheck
+/// usage) and continue execution as if nothing happened.
+static cl::opt<bool> PrintOnly("safepoint-ir-verifier-print-only",
+ cl::init(false));
+
+static void Verify(const Function &F, const DominatorTree &DT);
+
+struct SafepointIRVerifier : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DominatorTree DT;
+ SafepointIRVerifier() : FunctionPass(ID) {
+ initializeSafepointIRVerifierPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ DT.recalculate(F);
+ Verify(F, DT);
+ return false; // no modifications
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ StringRef getPassName() const override { return "safepoint verifier"; }
+};
+
+void llvm::verifySafepointIR(Function &F) {
+ SafepointIRVerifier pass;
+ pass.runOnFunction(F);
+}
+
+char SafepointIRVerifier::ID = 0;
+
+FunctionPass *llvm::createSafepointIRVerifierPass() {
+ return new SafepointIRVerifier();
+}
+
+INITIALIZE_PASS_BEGIN(SafepointIRVerifier, "verify-safepoint-ir",
+ "Safepoint IR Verifier", false, true)
+INITIALIZE_PASS_END(SafepointIRVerifier, "verify-safepoint-ir",
+ "Safepoint IR Verifier", false, true)
+
+static bool isGCPointerType(Type *T) {
+ if (auto *PT = dyn_cast<PointerType>(T))
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does.
+ return (1 == PT->getAddressSpace());
+ return false;
+}
+
+static bool containsGCPtrType(Type *Ty) {
+ if (isGCPointerType(Ty))
+ return true;
+ if (VectorType *VT = dyn_cast<VectorType>(Ty))
+ return isGCPointerType(VT->getScalarType());
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+ return containsGCPtrType(AT->getElementType());
+ if (StructType *ST = dyn_cast<StructType>(Ty))
+ return std::any_of(ST->subtypes().begin(), ST->subtypes().end(),
+ containsGCPtrType);
+ return false;
+}
+
+// Debugging aid -- prints a [Begin, End) range of values.
+template<typename IteratorTy>
+static void PrintValueSet(raw_ostream &OS, IteratorTy Begin, IteratorTy End) {
+ OS << "[ ";
+ while (Begin != End) {
+ OS << **Begin << " ";
+ ++Begin;
+ }
+ OS << "]";
+}
+
+/// The verifier algorithm is phrased in terms of availability. The set of
+/// values "available" at a given point in the control flow graph is the set of
+/// correctly relocated values at that point, and is a subset of the set of
+/// definitions dominating that point.
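Restating the transfer functions below as equations (a summary of
TransferBlock and the fixed-point loop, not text from the source file):

    AvailableIn(BB)  = intersection of AvailableOut(P) over all P in preds(BB)
    AvailableOut(BB) = Contribution(BB)                        if Cleared(BB)
                     = Contribution(BB) union AvailableIn(BB)  otherwise

Both sets only shrink as iteration proceeds, which is why the worklist loop in
Verify() reaches a fixed point.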
+
+/// State we compute and track per basic block.
+struct BasicBlockState {
+ // Set of values available coming in, before the phi nodes
+ DenseSet<const Value *> AvailableIn;
+
+ // Set of values available going out
+ DenseSet<const Value *> AvailableOut;
+
+ // AvailableOut minus AvailableIn.
+ // All elements are Instructions
+ DenseSet<const Value *> Contribution;
+
+ // True if this block contains a safepoint and thus AvailableIn does not
+ // contribute to AvailableOut.
+ bool Cleared = false;
+};
+
+
+/// Gather all the definitions dominating the start of BB into Result. This is
+/// simply the Defs introduced by every dominating basic block and the function
+/// arguments.
+static void GatherDominatingDefs(const BasicBlock *BB,
+ DenseSet<const Value *> &Result,
+ const DominatorTree &DT,
+ DenseMap<const BasicBlock *, BasicBlockState *> &BlockMap) {
+ DomTreeNode *DTN = DT[const_cast<BasicBlock *>(BB)];
+
+ while (DTN->getIDom()) {
+ DTN = DTN->getIDom();
+ const auto &Defs = BlockMap[DTN->getBlock()]->Contribution;
+ Result.insert(Defs.begin(), Defs.end());
+ // If this block is 'Cleared', then nothing LiveIn to this block can be
+ // available after this block completes. Note: This turns out to be
+    // really important for reducing memory consumption of the initial available
+ // sets and thus peak memory usage by this verifier.
+ if (BlockMap[DTN->getBlock()]->Cleared)
+ return;
+ }
+
+ for (const Argument &A : BB->getParent()->args())
+ if (containsGCPtrType(A.getType()))
+ Result.insert(&A);
+}
+
+/// Model the effect of an instruction on the set of available values.
+static void TransferInstruction(const Instruction &I, bool &Cleared,
+ DenseSet<const Value *> &Available) {
+ if (isStatepoint(I)) {
+ Cleared = true;
+ Available.clear();
+ } else if (containsGCPtrType(I.getType()))
+ Available.insert(&I);
+}
+
+/// Compute the AvailableOut set for BB from its BasicBlockState BBS. FirstPass
+/// is set when the verifier computes the AvailableOut set for BB for the
+/// first time.
+static void TransferBlock(const BasicBlock *BB,
+ BasicBlockState &BBS, bool FirstPass) {
+
+ const DenseSet<const Value *> &AvailableIn = BBS.AvailableIn;
+ DenseSet<const Value *> &AvailableOut = BBS.AvailableOut;
+
+ if (BBS.Cleared) {
+ // AvailableOut does not change no matter how the input changes, just
+ // leave it be. We need to force this calculation the first time so that
+    // we have an AvailableOut at all.
+ if (FirstPass) {
+ AvailableOut = BBS.Contribution;
+ }
+ } else {
+ // Otherwise, we need to reduce the AvailableOut set by things which are no
+ // longer in our AvailableIn
+ DenseSet<const Value *> Temp = BBS.Contribution;
+ set_union(Temp, AvailableIn);
+ AvailableOut = std::move(Temp);
+ }
+
+  DEBUG(dbgs() << "Transferred block " << BB->getName() << " from ";
+ PrintValueSet(dbgs(), AvailableIn.begin(), AvailableIn.end());
+ dbgs() << " to ";
+ PrintValueSet(dbgs(), AvailableOut.begin(), AvailableOut.end());
+ dbgs() << "\n";);
+}
+
+/// A given derived pointer can have multiple base pointers through phi/selects.
+/// This type indicates when the base pointer is exclusively constant
+/// (ExclusivelySomeConstant), and if that constant is proven to be exclusively
+/// null, we record that as ExclusivelyNull. In all other cases, the BaseType is
+/// NonConstant.
+enum BaseType {
+  NonConstant = 1, // Base pointer is not exclusively constant.
+ ExclusivelyNull,
+  ExclusivelySomeConstant // Base pointers for a given derived pointer come
+                          // from a set of constants, but are not exclusively
+                          // null.
+};
+
+/// Return the BaseType for Val, which states whether Val is exclusively
+/// derived from constant/null or not exclusively derived from constant.
+/// Val is exclusively derived from a constant base when all operands of phis
+/// and selects are themselves derived from a constant base.
+static enum BaseType getBaseType(const Value *Val) {
+
+ SmallVector<const Value *, 32> Worklist;
+ DenseSet<const Value *> Visited;
+ bool isExclusivelyDerivedFromNull = true;
+ Worklist.push_back(Val);
+  // Strip through all the bitcasts and geps to get the base pointer. Also
+  // check for exclusivity when there can be multiple base pointers (through
+  // phis or selects).
+  while (!Worklist.empty()) {
+ const Value *V = Worklist.pop_back_val();
+ if (!Visited.insert(V).second)
+ continue;
+
+ if (const auto *CI = dyn_cast<CastInst>(V)) {
+ Worklist.push_back(CI->stripPointerCasts());
+ continue;
+ }
+ if (const auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ Worklist.push_back(GEP->getPointerOperand());
+ continue;
+ }
+ // Push all the incoming values of phi node into the worklist for
+ // processing.
+ if (const auto *PN = dyn_cast<PHINode>(V)) {
+ for (Value *InV: PN->incoming_values())
+ Worklist.push_back(InV);
+ continue;
+ }
+ if (const auto *SI = dyn_cast<SelectInst>(V)) {
+ // Push in the true and false values
+ Worklist.push_back(SI->getTrueValue());
+ Worklist.push_back(SI->getFalseValue());
+ continue;
+ }
+ if (isa<Constant>(V)) {
+ // We found at least one base pointer which is non-null, so this derived
+ // pointer is not exclusively derived from null.
+ if (V != Constant::getNullValue(V->getType()))
+ isExclusivelyDerivedFromNull = false;
+      // Continue processing the remaining values to make sure the base is
+      // exclusively constant.
+ continue;
+ }
+ // At this point, we know that the base pointer is not exclusively
+ // constant.
+ return BaseType::NonConstant;
+ }
+ // Now, we know that the base pointer is exclusively constant, but we need to
+ // differentiate between exclusive null constant and non-null constant.
+ return isExclusivelyDerivedFromNull ? BaseType::ExclusivelyNull
+ : BaseType::ExclusivelySomeConstant;
+}
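A hypothetical illustration of the three-way classification (the IR and the
names in it are invented for this sketch):

    %a = getelementptr i8, i8 addrspace(1)* null, i64 8
        ; getBaseType(%a) == ExclusivelyNull: the stripped base is null
    %b = select i1 %c, i8 addrspace(1)* @g, i8 addrspace(1)* null
        ; ExclusivelySomeConstant: all bases constant, @g is non-null
    %d = phi i8 addrspace(1)* [ %x, %bb1 ], [ null, %bb2 ]
        ; NonConstant: %x is not derived from a constant base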
+
+static void Verify(const Function &F, const DominatorTree &DT) {
+ SpecificBumpPtrAllocator<BasicBlockState> BSAllocator;
+ DenseMap<const BasicBlock *, BasicBlockState *> BlockMap;
+
+ DEBUG(dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n");
+ if (PrintOnly)
+ dbgs() << "Verifying gc pointers in function: " << F.getName() << "\n";
+
+
+ for (const BasicBlock &BB : F) {
+ BasicBlockState *BBS = new(BSAllocator.Allocate()) BasicBlockState;
+ for (const auto &I : BB)
+ TransferInstruction(I, BBS->Cleared, BBS->Contribution);
+ BlockMap[&BB] = BBS;
+ }
+
+ for (auto &BBI : BlockMap) {
+ GatherDominatingDefs(BBI.first, BBI.second->AvailableIn, DT, BlockMap);
+ TransferBlock(BBI.first, *BBI.second, true);
+ }
+
+ SetVector<const BasicBlock *> Worklist;
+ for (auto &BBI : BlockMap)
+ Worklist.insert(BBI.first);
+
+ // This loop iterates the AvailableIn and AvailableOut sets to a fixed point.
+ // The AvailableIn and AvailableOut sets decrease as we iterate.
+ while (!Worklist.empty()) {
+ const BasicBlock *BB = Worklist.pop_back_val();
+ BasicBlockState *BBS = BlockMap[BB];
+
+ size_t OldInCount = BBS->AvailableIn.size();
+ for (const BasicBlock *PBB : predecessors(BB))
+ set_intersect(BBS->AvailableIn, BlockMap[PBB]->AvailableOut);
+
+ if (OldInCount == BBS->AvailableIn.size())
+ continue;
+
+ assert(OldInCount > BBS->AvailableIn.size() && "invariant!");
+
+ size_t OldOutCount = BBS->AvailableOut.size();
+ TransferBlock(BB, *BBS, false);
+ if (OldOutCount != BBS->AvailableOut.size()) {
+ assert(OldOutCount > BBS->AvailableOut.size() && "invariant!");
+ Worklist.insert(succ_begin(BB), succ_end(BB));
+ }
+ }
+
+ // We now have all the information we need to decide if the use of a heap
+ // reference is legal or not, given our safepoint semantics.
+
+ bool AnyInvalidUses = false;
+
+ auto ReportInvalidUse = [&AnyInvalidUses](const Value &V,
+ const Instruction &I) {
+ errs() << "Illegal use of unrelocated value found!\n";
+ errs() << "Def: " << V << "\n";
+ errs() << "Use: " << I << "\n";
+ if (!PrintOnly)
+ abort();
+ AnyInvalidUses = true;
+ };
+
+ auto isNotExclusivelyConstantDerived = [](const Value *V) {
+ return getBaseType(V) == BaseType::NonConstant;
+ };
+
+ for (const BasicBlock &BB : F) {
+ // We destructively modify AvailableIn as we traverse the block instruction
+ // by instruction.
+ DenseSet<const Value *> &AvailableSet = BlockMap[&BB]->AvailableIn;
+ for (const Instruction &I : BB) {
+ if (const PHINode *PN = dyn_cast<PHINode>(&I)) {
+ if (containsGCPtrType(PN->getType()))
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ const BasicBlock *InBB = PN->getIncomingBlock(i);
+ const Value *InValue = PN->getIncomingValue(i);
+
+ if (isNotExclusivelyConstantDerived(InValue) &&
+ !BlockMap[InBB]->AvailableOut.count(InValue))
+ ReportInvalidUse(*InValue, *PN);
+ }
+ } else if (isa<CmpInst>(I) &&
+ containsGCPtrType(I.getOperand(0)->getType())) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ enum BaseType baseTyLHS = getBaseType(LHS),
+ baseTyRHS = getBaseType(RHS);
+
+ // Returns true if LHS and RHS are unrelocated pointers and they are
+ // valid unrelocated uses.
+ auto hasValidUnrelocatedUse = [&AvailableSet, baseTyLHS, baseTyRHS, &LHS, &RHS] () {
+ // A cmp instruction has valid unrelocated pointer operands only if
+ // both operands are unrelocated pointers.
+          // In a comparison between two pointers, if one is an unrelocated
+          // use, the other *should be* an unrelocated use as well for this
+          // instruction to contain valid unrelocated uses. That unrelocated
+          // use can be a null constant or another unrelocated pointer.
+ if (AvailableSet.count(LHS) || AvailableSet.count(RHS))
+ return false;
+ // Constant pointers (that are not exclusively null) may have
+ // meaning in different VMs, so we cannot reorder the compare
+ // against constant pointers before the safepoint. In other words,
+ // comparison of an unrelocated use against a non-null constant
+          // may be invalid.
+ if ((baseTyLHS == BaseType::ExclusivelySomeConstant &&
+ baseTyRHS == BaseType::NonConstant) ||
+ (baseTyLHS == BaseType::NonConstant &&
+ baseTyRHS == BaseType::ExclusivelySomeConstant))
+ return false;
+          // All other cases are valid and enumerated below:
+          //  1. Comparison between an exclusively derived null pointer and a
+          //     constant base pointer.
+          //  2. Comparison between an exclusively derived null pointer and a
+          //     non-constant unrelocated base pointer.
+          //  3. Comparison between two unrelocated pointers.
+ return true;
+ };
+ if (!hasValidUnrelocatedUse()) {
+ // Print out all non-constant derived pointers that are unrelocated
+ // uses, which are invalid.
+ if (baseTyLHS == BaseType::NonConstant && !AvailableSet.count(LHS))
+ ReportInvalidUse(*LHS, I);
+ if (baseTyRHS == BaseType::NonConstant && !AvailableSet.count(RHS))
+ ReportInvalidUse(*RHS, I);
+ }
+ } else {
+ for (const Value *V : I.operands())
+ if (containsGCPtrType(V->getType()) &&
+ isNotExclusivelyConstantDerived(V) && !AvailableSet.count(V))
+ ReportInvalidUse(*V, I);
+ }
+
+ bool Cleared = false;
+ TransferInstruction(I, Cleared, AvailableSet);
+ (void)Cleared;
+ }
+ }
+
+ if (PrintOnly && !AnyInvalidUses) {
+ dbgs() << "No illegal uses found by SafepointIRVerifier in: " << F.getName()
+ << "\n";
+ }
+}
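Driving the pass from the command line, using the names registered in this
file (a usage sketch; -disable-output is ordinary opt behavior, not part of
this diff):

    opt -verify-safepoint-ir -disable-output input.ll
    opt -verify-safepoint-ir -safepoint-ir-verifier-print-only \
        -disable-output input.ll   # report and continue instead of aborting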
diff --git a/contrib/llvm/lib/IR/Statepoint.cpp b/contrib/llvm/lib/IR/Statepoint.cpp
index 63be1e7..18efee21 100644
--- a/contrib/llvm/lib/IR/Statepoint.cpp
+++ b/contrib/llvm/lib/IR/Statepoint.cpp
@@ -44,27 +44,40 @@ bool llvm::isGCRelocate(ImmutableCallSite CS) {
return CS.getInstruction() && isa<GCRelocateInst>(CS.getInstruction());
}
+bool llvm::isGCRelocate(const Value *V) {
+ if (auto CS = ImmutableCallSite(V))
+ return isGCRelocate(CS);
+ return false;
+}
+
bool llvm::isGCResult(ImmutableCallSite CS) {
return CS.getInstruction() && isa<GCResultInst>(CS.getInstruction());
}
+bool llvm::isGCResult(const Value *V) {
+ if (auto CS = ImmutableCallSite(V))
+ return isGCResult(CS);
+ return false;
+}
+
bool llvm::isStatepointDirectiveAttr(Attribute Attr) {
return Attr.hasAttribute("statepoint-id") ||
Attr.hasAttribute("statepoint-num-patch-bytes");
}
-StatepointDirectives llvm::parseStatepointDirectivesFromAttrs(AttributeSet AS) {
+StatepointDirectives
+llvm::parseStatepointDirectivesFromAttrs(AttributeList AS) {
StatepointDirectives Result;
Attribute AttrID =
- AS.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
+ AS.getAttribute(AttributeList::FunctionIndex, "statepoint-id");
uint64_t StatepointID;
if (AttrID.isStringAttribute())
if (!AttrID.getValueAsString().getAsInteger(10, StatepointID))
Result.StatepointID = StatepointID;
uint32_t NumPatchBytes;
- Attribute AttrNumPatchBytes = AS.getAttribute(AttributeSet::FunctionIndex,
+ Attribute AttrNumPatchBytes = AS.getAttribute(AttributeList::FunctionIndex,
"statepoint-num-patch-bytes");
if (AttrNumPatchBytes.isStringAttribute())
if (!AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes))
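The directives travel as string function attributes on the call site. A hedged
producer-side sketch (CI, Ctx, and the addAttribute overload used here are
assumptions, not taken from this hunk):

    // Hypothetical: tag a call so parseStatepointDirectivesFromAttrs
    // reports StatepointID = 42.
    CI->addAttribute(AttributeList::FunctionIndex,
                     Attribute::get(Ctx, "statepoint-id", "42"));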
diff --git a/contrib/llvm/lib/IR/Type.cpp b/contrib/llvm/lib/IR/Type.cpp
index ca86673..20e9c2b 100644
--- a/contrib/llvm/lib/IR/Type.cpp
+++ b/contrib/llvm/lib/IR/Type.cpp
@@ -1,4 +1,4 @@
-//===-- Type.cpp - Implement the Type class -------------------------------===//
+//===- Type.cpp - Implement the Type class --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,10 +13,23 @@
#include "llvm/IR/Type.h"
#include "LLVMContextImpl.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include <algorithm>
-#include <cstdarg>
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <utility>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -41,12 +54,6 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
}
}
-Type *Type::getScalarType() const {
- if (auto *VTy = dyn_cast<VectorType>(this))
- return VTy->getElementType();
- return const_cast<Type*>(this);
-}
-
bool Type::isIntegerTy(unsigned Bitwidth) const {
return isIntegerTy() && cast<IntegerType>(this)->getBitWidth() == Bitwidth;
}
@@ -226,7 +233,6 @@ PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) {
return getInt64Ty(C)->getPointerTo(AS);
}
-
//===----------------------------------------------------------------------===//
// IntegerType Implementation
//===----------------------------------------------------------------------===//
@@ -368,7 +374,8 @@ void StructType::setName(StringRef Name) {
if (Name == getName()) return;
StringMap<StructType *> &SymbolTable = getContext().pImpl->NamedStructTypes;
- typedef StringMap<StructType *>::MapEntryTy EntryTy;
+
+ using EntryTy = StringMap<StructType *>::MapEntryTy;
// If this struct already had a name, remove its symbol table entry. Don't
// delete the data yet because it may be part of the new name.
@@ -425,21 +432,6 @@ StructType *StructType::get(LLVMContext &Context, bool isPacked) {
return get(Context, None, isPacked);
}
-StructType *StructType::get(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::get(Ctx, StructFields);
- va_end(ap);
- return Ret;
-}
-
StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements,
StringRef Name, bool isPacked) {
StructType *ST = create(Context, Name);
@@ -468,21 +460,6 @@ StructType *StructType::create(ArrayRef<Type*> Elements) {
return create(Elements[0]->getContext(), Elements, StringRef());
}
-StructType *StructType::create(StringRef Name, Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- LLVMContext &Ctx = type->getContext();
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- auto *Ret = llvm::StructType::create(Ctx, StructFields, Name);
- va_end(ap);
- return Ret;
-}
-
bool StructType::isSized(SmallPtrSetImpl<Type*> *Visited) const {
if ((getSubclassData() & SCDB_IsSized) != 0)
return true;
@@ -514,19 +491,6 @@ StringRef StructType::getName() const {
return ((StringMapEntry<StructType*> *)SymbolTableEntry)->getKey();
}
-void StructType::setBody(Type *type, ...) {
- assert(type && "Cannot create a struct type with no elements with this");
- va_list ap;
- SmallVector<llvm::Type*, 8> StructFields;
- va_start(ap, type);
- while (type) {
- StructFields.push_back(type);
- type = va_arg(ap, llvm::Type*);
- }
- setBody(StructFields);
- va_end(ap);
-}
-
bool StructType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
@@ -546,7 +510,6 @@ StructType *Module::getTypeByName(StringRef Name) const {
return getContext().pImpl->NamedStructTypes.lookup(Name);
}
-
//===----------------------------------------------------------------------===//
// CompositeType Implementation
//===----------------------------------------------------------------------===//
@@ -575,7 +538,7 @@ bool CompositeType::indexValid(const Value *V) const {
if (auto *STy = dyn_cast<StructType>(this)) {
// Structure indexes require (vectors of) 32-bit integer constants. In the
// vector case all of the indices must be equal.
- if (!V->getType()->getScalarType()->isIntegerTy(32))
+ if (!V->getType()->isIntOrIntVectorTy(32))
return false;
const Constant *C = dyn_cast<Constant>(V);
if (C && V->getType()->isVectorTy())
@@ -595,7 +558,6 @@ bool CompositeType::indexValid(unsigned Idx) const {
return true;
}
-
//===----------------------------------------------------------------------===//
// ArrayType Implementation
//===----------------------------------------------------------------------===//
@@ -667,7 +629,6 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
return Entry;
}
-
PointerType::PointerType(Type *E, unsigned AddrSpace)
: Type(E->getContext(), PointerTyID), PointeeTy(E) {
ContainedTys = &PointeeTy;
diff --git a/contrib/llvm/lib/IR/TypeFinder.cpp b/contrib/llvm/lib/IR/TypeFinder.cpp
index dc4c1cf..b39678a 100644
--- a/contrib/llvm/lib/IR/TypeFinder.cpp
+++ b/contrib/llvm/lib/IR/TypeFinder.cpp
@@ -1,4 +1,4 @@
-//===-- TypeFinder.cpp - Implement the TypeFinder class -------------------===//
+//===- TypeFinder.cpp - Implement the TypeFinder class --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,10 +14,19 @@
#include "llvm/IR/TypeFinder.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <utility>
+
using namespace llvm;
void TypeFinder::run(const Module &M, bool onlyNamed) {
diff --git a/contrib/llvm/lib/IR/User.cpp b/contrib/llvm/lib/IR/User.cpp
index 497b4aa..d460391 100644
--- a/contrib/llvm/lib/IR/User.cpp
+++ b/contrib/llvm/lib/IR/User.cpp
@@ -19,8 +19,6 @@ class BasicBlock;
// User Class
//===----------------------------------------------------------------------===//
-void User::anchor() {}
-
void User::replaceUsesOfWith(Value *From, Value *To) {
if (From == To) return; // Duh what?
@@ -193,12 +191,4 @@ void User::operator delete(void *Usr) {
}
}
-//===----------------------------------------------------------------------===//
-// Operator Class
-//===----------------------------------------------------------------------===//
-
-Operator::~Operator() {
- llvm_unreachable("should never destroy an Operator");
-}
-
} // End llvm namespace
diff --git a/contrib/llvm/lib/IR/Value.cpp b/contrib/llvm/lib/IR/Value.cpp
index 91a999b..51a7d42 100644
--- a/contrib/llvm/lib/IR/Value.cpp
+++ b/contrib/llvm/lib/IR/Value.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DerivedUser.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
@@ -59,7 +60,7 @@ Value::Value(Type *ty, unsigned scid)
(SubclassID < ConstantFirstVal || SubclassID > ConstantLastVal))
assert((VTy->isFirstClassType() || VTy->isVoidTy()) &&
"Cannot create non-first-class values except for constants!");
- static_assert(sizeof(Value) == 3 * sizeof(void *) + 2 * sizeof(unsigned),
+ static_assert(sizeof(Value) == 2 * sizeof(void *) + 2 * sizeof(unsigned),
"Value too big");
}
@@ -89,6 +90,32 @@ Value::~Value() {
destroyValueName();
}
+void Value::deleteValue() {
+ switch (getValueID()) {
+#define HANDLE_VALUE(Name) \
+ case Value::Name##Val: \
+ delete static_cast<Name *>(this); \
+ break;
+#define HANDLE_MEMORY_VALUE(Name) \
+ case Value::Name##Val: \
+ static_cast<DerivedUser *>(this)->DeleteValue( \
+ static_cast<DerivedUser *>(this)); \
+ break;
+#define HANDLE_INSTRUCTION(Name) /* nothing */
+#include "llvm/IR/Value.def"
+
+#define HANDLE_INST(N, OPC, CLASS) \
+ case Value::InstructionVal + Instruction::OPC: \
+ delete static_cast<CLASS *>(this); \
+ break;
+#define HANDLE_USER_INST(N, OPC, CLASS)
+#include "llvm/IR/Instruction.def"
+
+ default:
+ llvm_unreachable("attempting to delete unknown value kind");
+ }
+}
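The switch body is generated by X-macros from Value.def and Instruction.def. A
self-contained sketch of the same dispatch pattern, with toy names that are
not LLVM's:

    #include <cstdio>
    // Toy analogue of Value.def: one entry per concrete kind.
    #define TOY_VALUES(X) X(Argument) X(BasicBlock) X(GlobalVariable)
    enum ToyKind {
    #define HANDLE(Name) Name##Val,
      TOY_VALUES(HANDLE)
    #undef HANDLE
    };
    static void deleteToy(ToyKind K) {
      switch (K) {
    #define HANDLE(Name)                                                      \
      case Name##Val:                                                         \
        std::printf("delete as %s\n", #Name);                                 \
        break;
        TOY_VALUES(HANDLE)
    #undef HANDLE
      }
    }
    int main() { deleteToy(BasicBlockVal); } // prints "delete as BasicBlock"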
+
void Value::destroyValueName() {
ValueName *Name = getValueName();
if (Name)
@@ -320,7 +347,7 @@ void Value::takeName(Value *V) {
ST->reinsertValue(this);
}
-void Value::assertModuleIsMaterialized() const {
+void Value::assertModuleIsMaterializedImpl() const {
#ifndef NDEBUG
const GlobalValue *GV = dyn_cast<GlobalValue>(this);
if (!GV)
@@ -432,24 +459,26 @@ namespace {
enum PointerStripKind {
PSK_ZeroIndices,
PSK_ZeroIndicesAndAliases,
+ PSK_ZeroIndicesAndAliasesAndBarriers,
PSK_InBoundsConstantIndices,
PSK_InBounds
};
template <PointerStripKind StripKind>
-static Value *stripPointerCastsAndOffsets(Value *V) {
+static const Value *stripPointerCastsAndOffsets(const Value *V) {
if (!V->getType()->isPointerTy())
return V;
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
- SmallPtrSet<Value *, 4> Visited;
+ SmallPtrSet<const Value *, 4> Visited;
Visited.insert(V);
do {
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
switch (StripKind) {
case PSK_ZeroIndicesAndAliases:
+ case PSK_ZeroIndicesAndAliasesAndBarriers:
case PSK_ZeroIndices:
if (!GEP->hasAllZeroIndices())
return V;
@@ -467,17 +496,25 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
} else if (Operator::getOpcode(V) == Instruction::BitCast ||
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
V = cast<Operator>(V)->getOperand(0);
- } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+ } else if (auto *GA = dyn_cast<GlobalAlias>(V)) {
if (StripKind == PSK_ZeroIndices || GA->isInterposable())
return V;
V = GA->getAliasee();
} else {
- if (auto CS = CallSite(V))
- if (Value *RV = CS.getReturnedArgOperand()) {
+ if (auto CS = ImmutableCallSite(V)) {
+ if (const Value *RV = CS.getReturnedArgOperand()) {
V = RV;
continue;
}
-
+      // The result of invariant.group.barrier must alias its argument, but it
+      // cannot be marked with the returned attribute; that is why it needs a
+      // special case.
+ if (StripKind == PSK_ZeroIndicesAndAliasesAndBarriers &&
+ CS.getIntrinsicID() == Intrinsic::invariant_group_barrier) {
+ V = CS.getArgOperand(0);
+ continue;
+ }
+ }
return V;
}
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
@@ -487,20 +524,26 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
}
} // end anonymous namespace
-Value *Value::stripPointerCasts() {
+const Value *Value::stripPointerCasts() const {
return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndAliases>(this);
}
-Value *Value::stripPointerCastsNoFollowAliases() {
+const Value *Value::stripPointerCastsNoFollowAliases() const {
return stripPointerCastsAndOffsets<PSK_ZeroIndices>(this);
}
-Value *Value::stripInBoundsConstantOffsets() {
+const Value *Value::stripInBoundsConstantOffsets() const {
return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
}
-Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
- APInt &Offset) {
+const Value *Value::stripPointerCastsAndBarriers() const {
+ return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndAliasesAndBarriers>(
+ this);
+}
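Only the const-qualified variants live in this file; the mutable overloads are
presumably thin header-side wrappers (an assumption about Value.h, sketched
here rather than quoted):

    // Hypothetical non-const companion in llvm/IR/Value.h:
    Value *stripPointerCastsAndBarriers() {
      return const_cast<Value *>(
          static_cast<const Value *>(this)->stripPointerCastsAndBarriers());
    }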
+
+const Value *
+Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+ APInt &Offset) const {
if (!getType()->isPointerTy())
return this;
@@ -510,11 +553,11 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
// Even though we don't look through PHI nodes, we could be called on an
// instruction in an unreachable block, which may be on a cycle.
- SmallPtrSet<Value *, 4> Visited;
+ SmallPtrSet<const Value *, 4> Visited;
Visited.insert(this);
- Value *V = this;
+ const Value *V = this;
do {
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
if (!GEP->isInBounds())
return V;
APInt GEPOffset(Offset);
@@ -524,11 +567,11 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
V = cast<Operator>(V)->getOperand(0);
- } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+ } else if (auto *GA = dyn_cast<GlobalAlias>(V)) {
V = GA->getAliasee();
} else {
- if (auto CS = CallSite(V))
- if (Value *RV = CS.getReturnedArgOperand()) {
+ if (auto CS = ImmutableCallSite(V))
+ if (const Value *RV = CS.getReturnedArgOperand()) {
V = RV;
continue;
}
@@ -541,7 +584,7 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
return V;
}
-Value *Value::stripInBoundsOffsets() {
+const Value *Value::stripInBoundsOffsets() const {
return stripPointerCastsAndOffsets<PSK_InBounds>(this);
}
@@ -562,9 +605,9 @@ unsigned Value::getPointerDereferenceableBytes(const DataLayout &DL,
CanBeNull = true;
}
} else if (auto CS = ImmutableCallSite(this)) {
- DerefBytes = CS.getDereferenceableBytes(0);
+ DerefBytes = CS.getDereferenceableBytes(AttributeList::ReturnIndex);
if (DerefBytes == 0) {
- DerefBytes = CS.getDereferenceableOrNullBytes(0);
+ DerefBytes = CS.getDereferenceableOrNullBytes(AttributeList::ReturnIndex);
CanBeNull = true;
}
} else if (const LoadInst *LI = dyn_cast<LoadInst>(this)) {
@@ -633,7 +676,7 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const {
Align = DL.getPrefTypeAlignment(AllocatedType);
}
} else if (auto CS = ImmutableCallSite(this))
- Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex);
+ Align = CS.getAttributes().getRetAlignment();
else if (const LoadInst *LI = dyn_cast<LoadInst>(this))
if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) {
ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
@@ -643,9 +686,9 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const {
return Align;
}
-Value *Value::DoPHITranslation(const BasicBlock *CurBB,
- const BasicBlock *PredBB) {
- PHINode *PN = dyn_cast<PHINode>(this);
+const Value *Value::DoPHITranslation(const BasicBlock *CurBB,
+ const BasicBlock *PredBB) const {
+ auto *PN = dyn_cast<PHINode>(this);
if (PN && PN->getParent() == CurBB)
return PN->getIncomingValueForBlock(PredBB);
return this;
@@ -695,7 +738,7 @@ void ValueHandleBase::AddToExistingUseList(ValueHandleBase **List) {
setPrevPtr(List);
if (Next) {
Next->setPrevPtr(&Next);
- assert(V == Next->V && "Added to wrong list?");
+ assert(getValPtr() == Next->getValPtr() && "Added to wrong list?");
}
}
@@ -710,14 +753,14 @@ void ValueHandleBase::AddToExistingUseListAfter(ValueHandleBase *List) {
}
void ValueHandleBase::AddToUseList() {
- assert(V && "Null pointer doesn't have a use list!");
+ assert(getValPtr() && "Null pointer doesn't have a use list!");
- LLVMContextImpl *pImpl = V->getContext().pImpl;
+ LLVMContextImpl *pImpl = getValPtr()->getContext().pImpl;
- if (V->HasValueHandle) {
+ if (getValPtr()->HasValueHandle) {
// If this value already has a ValueHandle, then it must be in the
// ValueHandles map already.
- ValueHandleBase *&Entry = pImpl->ValueHandles[V];
+ ValueHandleBase *&Entry = pImpl->ValueHandles[getValPtr()];
assert(Entry && "Value doesn't have any handles?");
AddToExistingUseList(&Entry);
return;
@@ -731,10 +774,10 @@ void ValueHandleBase::AddToUseList() {
DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles;
const void *OldBucketPtr = Handles.getPointerIntoBucketsArray();
- ValueHandleBase *&Entry = Handles[V];
+ ValueHandleBase *&Entry = Handles[getValPtr()];
assert(!Entry && "Value really did already have handles?");
AddToExistingUseList(&Entry);
- V->HasValueHandle = true;
+ getValPtr()->HasValueHandle = true;
// If reallocation didn't happen or if this was the first insertion, don't
// walk the table.
@@ -746,14 +789,14 @@ void ValueHandleBase::AddToUseList() {
// Okay, reallocation did happen. Fix the Prev Pointers.
for (DenseMap<Value*, ValueHandleBase*>::iterator I = Handles.begin(),
E = Handles.end(); I != E; ++I) {
- assert(I->second && I->first == I->second->V &&
+ assert(I->second && I->first == I->second->getValPtr() &&
"List invariant broken!");
I->second->setPrevPtr(&I->second);
}
}
void ValueHandleBase::RemoveFromUseList() {
- assert(V && V->HasValueHandle &&
+ assert(getValPtr() && getValPtr()->HasValueHandle &&
"Pointer doesn't have a use list!");
// Unlink this from its use list.
@@ -770,11 +813,11 @@ void ValueHandleBase::RemoveFromUseList() {
// If the Next pointer was null, then it is possible that this was the last
// ValueHandle watching VP. If so, delete its entry from the ValueHandles
// map.
- LLVMContextImpl *pImpl = V->getContext().pImpl;
+ LLVMContextImpl *pImpl = getValPtr()->getContext().pImpl;
DenseMap<Value*, ValueHandleBase*> &Handles = pImpl->ValueHandles;
if (Handles.isPointerIntoBucketsArray(PrevPtr)) {
- Handles.erase(V);
- V->HasValueHandle = false;
+ Handles.erase(getValPtr());
+ getValPtr()->HasValueHandle = false;
}
}
@@ -804,13 +847,10 @@ void ValueHandleBase::ValueIsDeleted(Value *V) {
switch (Entry->getKind()) {
case Assert:
break;
- case Tracking:
- // Mark that this value has been deleted by setting it to an invalid Value
- // pointer.
- Entry->operator=(DenseMapInfo<Value *>::getTombstoneKey());
- break;
case Weak:
- // Weak just goes to null, which will unlink it from the list.
+ case WeakTracking:
+ // WeakTracking and Weak just go to null, which unlinks them
+ // from the list.
Entry->operator=(nullptr);
break;
case Callback:
@@ -858,16 +898,10 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
switch (Entry->getKind()) {
case Assert:
- // Asserting handle does not follow RAUW implicitly.
- break;
- case Tracking:
- // Tracking goes to new value like a WeakVH. Note that this may make it
- // something incompatible with its templated type. We don't want to have a
- // virtual (or inline) interface to handle this though, so instead we make
- // the TrackingVH accessors guarantee that a client never sees this value.
-
- LLVM_FALLTHROUGH;
case Weak:
+ // Asserting and Weak handles do not follow RAUW implicitly.
+ break;
+ case WeakTracking:
    // WeakTracking goes to the new value, which will unlink it from Old's list.
Entry->operator=(New);
break;
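The split is easiest to see side by side. A minimal sketch, assuming the
ValueHandle.h of this era:

    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/ValueHandle.h"
    using namespace llvm;

    void demo(Instruction *Old, Instruction *New) {
      WeakVH Plain(Old);         // nulls out if Old is deleted; ignores RAUW
      WeakTrackingVH Track(Old); // nulls out on deletion AND follows RAUW
      Old->replaceAllUsesWith(New);
      // Plain still holds Old; Track now holds New.
    }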
@@ -879,18 +913,17 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
}
#ifndef NDEBUG
- // If any new tracking or weak value handles were added while processing the
+ // If any new weak value handles were added while processing the
// list, then complain about it now.
if (Old->HasValueHandle)
for (Entry = pImpl->ValueHandles[Old]; Entry; Entry = Entry->Next)
switch (Entry->getKind()) {
- case Tracking:
- case Weak:
+ case WeakTracking:
dbgs() << "After RAUW from " << *Old->getType() << " %"
<< Old->getName() << " to " << *New->getType() << " %"
<< New->getName() << "\n";
- llvm_unreachable("A tracking or weak value handle still pointed to the"
- " old value!\n");
+ llvm_unreachable(
+ "A weak tracking value handle still pointed to the old value!\n");
default:
break;
}
diff --git a/contrib/llvm/lib/IR/ValueSymbolTable.cpp b/contrib/llvm/lib/IR/ValueSymbolTable.cpp
index 8a6a320..ccdabe0 100644
--- a/contrib/llvm/lib/IR/ValueSymbolTable.cpp
+++ b/contrib/llvm/lib/IR/ValueSymbolTable.cpp
@@ -1,4 +1,4 @@
-//===-- ValueSymbolTable.cpp - Implement the ValueSymbolTable class -------===//
+//===- ValueSymbolTable.cpp - Implement the ValueSymbolTable class --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,6 +15,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
@@ -99,13 +100,15 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) {
return makeUniqueName(V, UniqueName);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// dump - print out the symbol table
//
LLVM_DUMP_METHOD void ValueSymbolTable::dump() const {
- //DEBUG(dbgs() << "ValueSymbolTable:\n");
+ //dbgs() << "ValueSymbolTable:\n";
for (const auto &I : *this) {
- //DEBUG(dbgs() << " '" << I->getKeyData() << "' = ");
+ //dbgs() << " '" << I->getKeyData() << "' = ";
I.getValue()->dump();
- //DEBUG(dbgs() << "\n");
+ //dbgs() << "\n";
}
}
+#endif
diff --git a/contrib/llvm/lib/IR/ValueTypes.cpp b/contrib/llvm/lib/IR/ValueTypes.cpp
index 2132e16..cf6ee06 100644
--- a/contrib/llvm/lib/IR/ValueTypes.cpp
+++ b/contrib/llvm/lib/IR/ValueTypes.cpp
@@ -142,6 +142,7 @@ std::string EVT::getEVTString() const {
case MVT::Other: return "ch";
case MVT::Glue: return "glue";
case MVT::x86mmx: return "x86mmx";
+ case MVT::v1i1: return "v1i1";
case MVT::v2i1: return "v2i1";
case MVT::v4i1: return "v4i1";
case MVT::v8i1: return "v8i1";
@@ -220,6 +221,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::f128: return Type::getFP128Ty(Context);
case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
case MVT::x86mmx: return Type::getX86_MMXTy(Context);
+ case MVT::v1i1: return VectorType::get(Type::getInt1Ty(Context), 1);
case MVT::v2i1: return VectorType::get(Type::getInt1Ty(Context), 2);
case MVT::v4i1: return VectorType::get(Type::getInt1Ty(Context), 4);
case MVT::v8i1: return VectorType::get(Type::getInt1Ty(Context), 8);
diff --git a/contrib/llvm/lib/IR/Verifier.cpp b/contrib/llvm/lib/IR/Verifier.cpp
index 5855059..454a56a 100644
--- a/contrib/llvm/lib/IR/Verifier.cpp
+++ b/contrib/llvm/lib/IR/Verifier.cpp
@@ -49,7 +49,6 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/ilist.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
@@ -59,6 +58,8 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -81,10 +82,10 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
@@ -102,7 +103,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -267,6 +267,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// \brief Keep track of the metadata nodes that have been checked already.
SmallPtrSet<const Metadata *, 32> MDNodes;
+ /// Keep track which DISubprogram is attached to which function.
+ DenseMap<const DISubprogram *, const Function *> DISubprogramAttachments;
+
/// Track all DICompileUnits visited.
SmallPtrSet<const Metadata *, 2> CUVisited;
@@ -277,6 +280,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
/// already.
bool SawFrameEscape;
+ /// Whether the current function has a DISubprogram attached to it.
+ bool HasDebugInfo = false;
+
/// Stores the count of how many objects were passed to llvm.localescape for a
/// given function and the largest index passed to llvm.localrecover.
DenseMap<Function *, std::pair<unsigned, unsigned>> FrameEscapeInfo;
@@ -297,6 +303,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
// constant expressions, we can arrive at a particular user many times.
SmallPtrSet<const Value *, 32> GlobalValueVisited;
+ // Keeps track of duplicate function argument debug info.
+ SmallVector<const DILocalVariable *, 16> DebugFnArgs;
+
TBAAVerifier TBAAVerifyHelper;
void checkAtomicMemAccessSize(Type *Ty, const Instruction *I);
@@ -342,6 +351,7 @@ public:
visit(const_cast<Function &>(F));
verifySiblingFuncletUnwinds();
InstsInThisBlock.clear();
+ DebugFnArgs.clear();
LandingPadResultTy = nullptr;
SawFrameEscape = false;
SiblingFuncletInfo.clear();
@@ -379,7 +389,7 @@ public:
verifyCompileUnits();
verifyDeoptimizeCallingConvs();
-
+ DISubprogramAttachments.clear();
return !Broken;
}
@@ -457,6 +467,7 @@ private:
void visitUserOp1(Instruction &I);
void visitUserOp2(Instruction &I) { visitUserOp1(I); }
void visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS);
+ void visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI);
template <class DbgIntrinsicTy>
void visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII);
void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI);
@@ -481,12 +492,11 @@ private:
void verifyMustTailCall(CallInst &CI);
bool performTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT,
unsigned ArgNo, std::string &Suffix);
- bool verifyAttributeCount(AttributeSet Attrs, unsigned Params);
- void verifyAttributeTypes(AttributeSet Attrs, unsigned Idx, bool isFunction,
+ bool verifyAttributeCount(AttributeList Attrs, unsigned Params);
+ void verifyAttributeTypes(AttributeSet Attrs, bool IsFunction,
const Value *V);
- void verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
- bool isReturnValue, const Value *V);
- void verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+ void verifyParameterAttrs(AttributeSet Attrs, Type *Ty, const Value *V);
+ void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
const Value *V);
void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
@@ -497,6 +507,7 @@ private:
void verifySiblingFuncletUnwinds();
void verifyFragmentExpression(const DbgInfoIntrinsic &I);
+ void verifyFnArgs(const DbgInfoIntrinsic &I);
/// Module-level debug info verification...
void verifyCompileUnits();
@@ -652,7 +663,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
if (auto *GVE = dyn_cast<DIGlobalVariableExpression>(MD))
visitDIGlobalVariableExpression(*GVE);
else
- AssertDI(false, "!dbg attachment of global variable must be a DIGlobalVariableExpression");
+ AssertDI(false, "!dbg attachment of global variable must be a "
+ "DIGlobalVariableExpression");
}
if (!GV.hasInitializer()) {
@@ -822,28 +834,6 @@ static bool isType(const Metadata *MD) { return !MD || isa<DIType>(MD); }
static bool isScope(const Metadata *MD) { return !MD || isa<DIScope>(MD); }
static bool isDINode(const Metadata *MD) { return !MD || isa<DINode>(MD); }
-template <class Ty>
-static bool isValidMetadataArrayImpl(const MDTuple &N, bool AllowNull) {
- for (Metadata *MD : N.operands()) {
- if (MD) {
- if (!isa<Ty>(MD))
- return false;
- } else {
- if (!AllowNull)
- return false;
- }
- }
- return true;
-}
-
-template <class Ty> static bool isValidMetadataArray(const MDTuple &N) {
- return isValidMetadataArrayImpl<Ty>(N, /* AllowNull */ false);
-}
-
-template <class Ty> static bool isValidMetadataNullArray(const MDTuple &N) {
- return isValidMetadataArrayImpl<Ty>(N, /* AllowNull */ true);
-}
-
void Verifier::visitDILocation(const DILocation &N) {
AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
"location requires a valid scope", &N, N.getRawScope());
@@ -900,6 +890,13 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
AssertDI(isType(N.getRawBaseType()), "invalid base type", &N,
N.getRawBaseType());
+
+ if (N.getDWARFAddressSpace()) {
+ AssertDI(N.getTag() == dwarf::DW_TAG_pointer_type ||
+ N.getTag() == dwarf::DW_TAG_reference_type,
+ "DWARF address space only applies to pointer or reference types",
+ &N);
+ }
}
static bool hasConflictingReferenceFlags(unsigned Flags) {
@@ -1024,6 +1021,8 @@ void Verifier::visitDISubprogram(const DISubprogram &N) {
AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
if (auto *F = N.getRawFile())
AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+ else
+ AssertDI(N.getLine() == 0, "line specified with no file", &N, N.getLine());
if (auto *T = N.getRawType())
AssertDI(isa<DISubroutineType>(T), "invalid subroutine type", &N, T);
AssertDI(isType(N.getRawContainingType()), "invalid containing type", &N,
@@ -1054,6 +1053,14 @@ void Verifier::visitDISubprogram(const DISubprogram &N) {
// Subprogram declarations (part of the type hierarchy).
AssertDI(!Unit, "subprogram declarations must not have a compile unit", &N);
}
+
+ if (auto *RawThrownTypes = N.getRawThrownTypes()) {
+ auto *ThrownTypes = dyn_cast<MDTuple>(RawThrownTypes);
+ AssertDI(ThrownTypes, "invalid thrown types list", &N, RawThrownTypes);
+ for (Metadata *Op : ThrownTypes->operands())
+ AssertDI(Op && isa<DIType>(Op), "invalid thrown type", &N, ThrownTypes,
+ Op);
+ }
}
void Verifier::visitDILexicalBlockBase(const DILexicalBlockBase &N) {
@@ -1199,9 +1206,9 @@ void Verifier::visitComdat(const Comdat &C) {
void Verifier::visitModuleIdents(const Module &M) {
const NamedMDNode *Idents = M.getNamedMetadata("llvm.ident");
- if (!Idents)
+ if (!Idents)
return;
-
+
// llvm.ident takes a list of metadata entries. Each entry has only one string.
// Scan each llvm.ident entry and make sure that this requirement is met.
for (const MDNode *N : Idents->operands()) {
@@ -1211,7 +1218,7 @@ void Verifier::visitModuleIdents(const Module &M) {
("invalid value for llvm.ident metadata entry operand"
"(the operand should be a string)"),
N->getOperand(0));
- }
+ }
}
void Verifier::visitModuleFlags(const Module &M) {
@@ -1275,6 +1282,13 @@ Verifier::visitModuleFlag(const MDNode *Op,
// These behavior types accept any value.
break;
+ case Module::Max: {
+ Assert(mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2)),
+ "invalid value for 'max' module flag (expected constant integer)",
+ Op->getOperand(2));
+ break;
+ }
+
case Module::Require: {
// The value should itself be an MDNode with two operands, a flag ID (an
// MDString), and a value.
@@ -1310,73 +1324,90 @@ Verifier::visitModuleFlag(const MDNode *Op,
Assert(Inserted,
"module flag identifiers must be unique (or of 'require' type)", ID);
}
-}
-void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
- bool isFunction, const Value *V) {
- unsigned Slot = ~0U;
- for (unsigned I = 0, E = Attrs.getNumSlots(); I != E; ++I)
- if (Attrs.getSlotIndex(I) == Idx) {
- Slot = I;
- break;
- }
+ if (ID->getString() == "wchar_size") {
+ ConstantInt *Value
+ = mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
+ Assert(Value, "wchar_size metadata requires constant integer argument");
+ }
+
+ if (ID->getString() == "Linker Options") {
+ // If the llvm.linker.options named metadata exists, we assume that the
+ // bitcode reader has upgraded the module flag. Otherwise the flag might
+ // have been created by a client directly.
+ Assert(M.getNamedMetadata("llvm.linker.options"),
+ "'Linker Options' named metadata no longer supported");
+ }
+}
+
+/// Return true if this attribute kind only applies to functions.
+static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
+ switch (Kind) {
+ case Attribute::NoReturn:
+ case Attribute::NoUnwind:
+ case Attribute::NoInline:
+ case Attribute::AlwaysInline:
+ case Attribute::OptimizeForSize:
+ case Attribute::StackProtect:
+ case Attribute::StackProtectReq:
+ case Attribute::StackProtectStrong:
+ case Attribute::SafeStack:
+ case Attribute::NoRedZone:
+ case Attribute::NoImplicitFloat:
+ case Attribute::Naked:
+ case Attribute::InlineHint:
+ case Attribute::StackAlignment:
+ case Attribute::UWTable:
+ case Attribute::NonLazyBind:
+ case Attribute::ReturnsTwice:
+ case Attribute::SanitizeAddress:
+ case Attribute::SanitizeThread:
+ case Attribute::SanitizeMemory:
+ case Attribute::MinSize:
+ case Attribute::NoDuplicate:
+ case Attribute::Builtin:
+ case Attribute::NoBuiltin:
+ case Attribute::Cold:
+ case Attribute::OptimizeNone:
+ case Attribute::JumpTable:
+ case Attribute::Convergent:
+ case Attribute::ArgMemOnly:
+ case Attribute::NoRecurse:
+ case Attribute::InaccessibleMemOnly:
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ case Attribute::AllocSize:
+ case Attribute::Speculatable:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
- assert(Slot != ~0U && "Attribute set inconsistency!");
+/// Return true if this is a function attribute that can also appear on
+/// arguments.
+static bool isFuncOrArgAttr(Attribute::AttrKind Kind) {
+ return Kind == Attribute::ReadOnly || Kind == Attribute::WriteOnly ||
+ Kind == Attribute::ReadNone;
+}
- for (AttributeSet::iterator I = Attrs.begin(Slot), E = Attrs.end(Slot);
- I != E; ++I) {
- if (I->isStringAttribute())
+void Verifier::verifyAttributeTypes(AttributeSet Attrs, bool IsFunction,
+ const Value *V) {
+ for (Attribute A : Attrs) {
+ if (A.isStringAttribute())
continue;
- if (I->getKindAsEnum() == Attribute::NoReturn ||
- I->getKindAsEnum() == Attribute::NoUnwind ||
- I->getKindAsEnum() == Attribute::NoInline ||
- I->getKindAsEnum() == Attribute::AlwaysInline ||
- I->getKindAsEnum() == Attribute::OptimizeForSize ||
- I->getKindAsEnum() == Attribute::StackProtect ||
- I->getKindAsEnum() == Attribute::StackProtectReq ||
- I->getKindAsEnum() == Attribute::StackProtectStrong ||
- I->getKindAsEnum() == Attribute::SafeStack ||
- I->getKindAsEnum() == Attribute::NoRedZone ||
- I->getKindAsEnum() == Attribute::NoImplicitFloat ||
- I->getKindAsEnum() == Attribute::Naked ||
- I->getKindAsEnum() == Attribute::InlineHint ||
- I->getKindAsEnum() == Attribute::StackAlignment ||
- I->getKindAsEnum() == Attribute::UWTable ||
- I->getKindAsEnum() == Attribute::NonLazyBind ||
- I->getKindAsEnum() == Attribute::ReturnsTwice ||
- I->getKindAsEnum() == Attribute::SanitizeAddress ||
- I->getKindAsEnum() == Attribute::SanitizeThread ||
- I->getKindAsEnum() == Attribute::SanitizeMemory ||
- I->getKindAsEnum() == Attribute::MinSize ||
- I->getKindAsEnum() == Attribute::NoDuplicate ||
- I->getKindAsEnum() == Attribute::Builtin ||
- I->getKindAsEnum() == Attribute::NoBuiltin ||
- I->getKindAsEnum() == Attribute::Cold ||
- I->getKindAsEnum() == Attribute::OptimizeNone ||
- I->getKindAsEnum() == Attribute::JumpTable ||
- I->getKindAsEnum() == Attribute::Convergent ||
- I->getKindAsEnum() == Attribute::ArgMemOnly ||
- I->getKindAsEnum() == Attribute::NoRecurse ||
- I->getKindAsEnum() == Attribute::InaccessibleMemOnly ||
- I->getKindAsEnum() == Attribute::InaccessibleMemOrArgMemOnly ||
- I->getKindAsEnum() == Attribute::AllocSize) {
- if (!isFunction) {
- CheckFailed("Attribute '" + I->getAsString() +
- "' only applies to functions!", V);
- return;
- }
- } else if (I->getKindAsEnum() == Attribute::ReadOnly ||
- I->getKindAsEnum() == Attribute::WriteOnly ||
- I->getKindAsEnum() == Attribute::ReadNone) {
- if (Idx == 0) {
- CheckFailed("Attribute '" + I->getAsString() +
- "' does not apply to function returns");
+ if (isFuncOnlyAttr(A.getKindAsEnum())) {
+ if (!IsFunction) {
+ CheckFailed("Attribute '" + A.getAsString() +
+ "' only applies to functions!",
+ V);
return;
}
- } else if (isFunction) {
- CheckFailed("Attribute '" + I->getAsString() +
- "' does not apply to functions!", V);
+ } else if (IsFunction && !isFuncOrArgAttr(A.getKindAsEnum())) {
+ CheckFailed("Attribute '" + A.getAsString() +
+ "' does not apply to functions!",
+ V);
return;
}
}
@@ -1384,106 +1415,91 @@ void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
// VerifyParameterAttrs - Check the given attributes for an argument or return
// value of the specified type. The value V is printed in error messages.
-void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
- bool isReturnValue, const Value *V) {
- if (!Attrs.hasAttributes(Idx))
+void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
+ const Value *V) {
+ if (!Attrs.hasAttributes())
return;
- verifyAttributeTypes(Attrs, Idx, false, V);
-
- if (isReturnValue)
- Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) &&
- !Attrs.hasAttribute(Idx, Attribute::Nest) &&
- !Attrs.hasAttribute(Idx, Attribute::StructRet) &&
- !Attrs.hasAttribute(Idx, Attribute::NoCapture) &&
- !Attrs.hasAttribute(Idx, Attribute::Returned) &&
- !Attrs.hasAttribute(Idx, Attribute::InAlloca) &&
- !Attrs.hasAttribute(Idx, Attribute::SwiftSelf) &&
- !Attrs.hasAttribute(Idx, Attribute::SwiftError),
- "Attributes 'byval', 'inalloca', 'nest', 'sret', 'nocapture', "
- "'returned', 'swiftself', and 'swifterror' do not apply to return "
- "values!",
- V);
+ verifyAttributeTypes(Attrs, /*IsFunction=*/false, V);
// Check for mutually incompatible attributes. Only inreg is compatible with
// sret.
unsigned AttrCount = 0;
- AttrCount += Attrs.hasAttribute(Idx, Attribute::ByVal);
- AttrCount += Attrs.hasAttribute(Idx, Attribute::InAlloca);
- AttrCount += Attrs.hasAttribute(Idx, Attribute::StructRet) ||
- Attrs.hasAttribute(Idx, Attribute::InReg);
- AttrCount += Attrs.hasAttribute(Idx, Attribute::Nest);
+ AttrCount += Attrs.hasAttribute(Attribute::ByVal);
+ AttrCount += Attrs.hasAttribute(Attribute::InAlloca);
+ AttrCount += Attrs.hasAttribute(Attribute::StructRet) ||
+ Attrs.hasAttribute(Attribute::InReg);
+ AttrCount += Attrs.hasAttribute(Attribute::Nest);
Assert(AttrCount <= 1, "Attributes 'byval', 'inalloca', 'inreg', 'nest', "
"and 'sret' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::InAlloca) &&
- Attrs.hasAttribute(Idx, Attribute::ReadOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::InAlloca) &&
+ Attrs.hasAttribute(Attribute::ReadOnly)),
"Attributes "
"'inalloca and readonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::StructRet) &&
- Attrs.hasAttribute(Idx, Attribute::Returned)),
+ Assert(!(Attrs.hasAttribute(Attribute::StructRet) &&
+ Attrs.hasAttribute(Attribute::Returned)),
"Attributes "
"'sret and returned' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ZExt) &&
- Attrs.hasAttribute(Idx, Attribute::SExt)),
+ Assert(!(Attrs.hasAttribute(Attribute::ZExt) &&
+ Attrs.hasAttribute(Attribute::SExt)),
"Attributes "
"'zeroext and signext' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadNone) &&
- Attrs.hasAttribute(Idx, Attribute::ReadOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::ReadNone) &&
+ Attrs.hasAttribute(Attribute::ReadOnly)),
"Attributes "
"'readnone and readonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadNone) &&
- Attrs.hasAttribute(Idx, Attribute::WriteOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::ReadNone) &&
+ Attrs.hasAttribute(Attribute::WriteOnly)),
"Attributes "
"'readnone and writeonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::ReadOnly) &&
- Attrs.hasAttribute(Idx, Attribute::WriteOnly)),
+ Assert(!(Attrs.hasAttribute(Attribute::ReadOnly) &&
+ Attrs.hasAttribute(Attribute::WriteOnly)),
"Attributes "
"'readonly and writeonly' are incompatible!",
V);
- Assert(!(Attrs.hasAttribute(Idx, Attribute::NoInline) &&
- Attrs.hasAttribute(Idx, Attribute::AlwaysInline)),
+ Assert(!(Attrs.hasAttribute(Attribute::NoInline) &&
+ Attrs.hasAttribute(Attribute::AlwaysInline)),
"Attributes "
"'noinline and alwaysinline' are incompatible!",
V);
- Assert(
- !AttrBuilder(Attrs, Idx).overlaps(AttributeFuncs::typeIncompatible(Ty)),
- "Wrong types for attribute: " +
- AttributeSet::get(Context, Idx, AttributeFuncs::typeIncompatible(Ty))
- .getAsString(Idx),
- V);
+ AttrBuilder IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty);
+ Assert(!AttrBuilder(Attrs).overlaps(IncompatibleAttrs),
+ "Wrong types for attribute: " +
+ AttributeSet::get(Context, IncompatibleAttrs).getAsString(),
+ V);
if (PointerType *PTy = dyn_cast<PointerType>(Ty)) {
SmallPtrSet<Type*, 4> Visited;
if (!PTy->getElementType()->isSized(&Visited)) {
- Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal) &&
- !Attrs.hasAttribute(Idx, Attribute::InAlloca),
+ Assert(!Attrs.hasAttribute(Attribute::ByVal) &&
+ !Attrs.hasAttribute(Attribute::InAlloca),
"Attributes 'byval' and 'inalloca' do not support unsized types!",
V);
}
if (!isa<PointerType>(PTy->getElementType()))
- Assert(!Attrs.hasAttribute(Idx, Attribute::SwiftError),
+ Assert(!Attrs.hasAttribute(Attribute::SwiftError),
"Attribute 'swifterror' only applies to parameters "
"with pointer to pointer type!",
V);
} else {
- Assert(!Attrs.hasAttribute(Idx, Attribute::ByVal),
+ Assert(!Attrs.hasAttribute(Attribute::ByVal),
"Attribute 'byval' only applies to parameters with pointer type!",
V);
- Assert(!Attrs.hasAttribute(Idx, Attribute::SwiftError),
+ Assert(!Attrs.hasAttribute(Attribute::SwiftError),
"Attribute 'swifterror' only applies to parameters "
"with pointer type!",
V);
@@ -1492,7 +1508,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
// Check parameter attributes against a function type.
// The value V is printed in error messages.
-void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
const Value *V) {
if (Attrs.isEmpty())
return;
@@ -1503,122 +1519,124 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
bool SawSwiftSelf = false;
bool SawSwiftError = false;
- for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
- unsigned Idx = Attrs.getSlotIndex(i);
-
- Type *Ty;
- if (Idx == 0)
- Ty = FT->getReturnType();
- else if (Idx-1 < FT->getNumParams())
- Ty = FT->getParamType(Idx-1);
- else
- break; // VarArgs attributes, verified elsewhere.
+ // Verify return value attributes.
+ AttributeSet RetAttrs = Attrs.getRetAttributes();
+ Assert((!RetAttrs.hasAttribute(Attribute::ByVal) &&
+ !RetAttrs.hasAttribute(Attribute::Nest) &&
+ !RetAttrs.hasAttribute(Attribute::StructRet) &&
+ !RetAttrs.hasAttribute(Attribute::NoCapture) &&
+ !RetAttrs.hasAttribute(Attribute::Returned) &&
+ !RetAttrs.hasAttribute(Attribute::InAlloca) &&
+ !RetAttrs.hasAttribute(Attribute::SwiftSelf) &&
+ !RetAttrs.hasAttribute(Attribute::SwiftError)),
+ "Attributes 'byval', 'inalloca', 'nest', 'sret', 'nocapture', "
+ "'returned', 'swiftself', and 'swifterror' do not apply to return "
+ "values!",
+ V);
+ Assert((!RetAttrs.hasAttribute(Attribute::ReadOnly) &&
+ !RetAttrs.hasAttribute(Attribute::WriteOnly) &&
+ !RetAttrs.hasAttribute(Attribute::ReadNone)),
+ "Attribute '" + RetAttrs.getAsString() +
+ "' does not apply to function returns",
+ V);
+ verifyParameterAttrs(RetAttrs, FT->getReturnType(), V);
- verifyParameterAttrs(Attrs, Idx, Ty, Idx == 0, V);
+ // Verify parameter attributes.
+ for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
+ Type *Ty = FT->getParamType(i);
+ AttributeSet ArgAttrs = Attrs.getParamAttributes(i);
- if (Idx == 0)
- continue;
+ verifyParameterAttrs(ArgAttrs, Ty, V);
- if (Attrs.hasAttribute(Idx, Attribute::Nest)) {
+ if (ArgAttrs.hasAttribute(Attribute::Nest)) {
Assert(!SawNest, "More than one parameter has attribute nest!", V);
SawNest = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::Returned)) {
+ if (ArgAttrs.hasAttribute(Attribute::Returned)) {
Assert(!SawReturned, "More than one parameter has attribute returned!",
V);
Assert(Ty->canLosslesslyBitCastTo(FT->getReturnType()),
- "Incompatible "
- "argument and return types for 'returned' attribute",
+ "Incompatible argument and return types for 'returned' attribute",
V);
SawReturned = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::StructRet)) {
+ if (ArgAttrs.hasAttribute(Attribute::StructRet)) {
Assert(!SawSRet, "Cannot have multiple 'sret' parameters!", V);
- Assert(Idx == 1 || Idx == 2,
+ Assert(i == 0 || i == 1,
"Attribute 'sret' is not on first or second parameter!", V);
SawSRet = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::SwiftSelf)) {
+ if (ArgAttrs.hasAttribute(Attribute::SwiftSelf)) {
Assert(!SawSwiftSelf, "Cannot have multiple 'swiftself' parameters!", V);
SawSwiftSelf = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::SwiftError)) {
+ if (ArgAttrs.hasAttribute(Attribute::SwiftError)) {
Assert(!SawSwiftError, "Cannot have multiple 'swifterror' parameters!",
V);
SawSwiftError = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::InAlloca)) {
- Assert(Idx == FT->getNumParams(), "inalloca isn't on the last parameter!",
- V);
+ if (ArgAttrs.hasAttribute(Attribute::InAlloca)) {
+ Assert(i == FT->getNumParams() - 1,
+ "inalloca isn't on the last parameter!", V);
}
}
- if (!Attrs.hasAttributes(AttributeSet::FunctionIndex))
+ if (!Attrs.hasAttributes(AttributeList::FunctionIndex))
return;
- verifyAttributeTypes(Attrs, AttributeSet::FunctionIndex, true, V);
+ verifyAttributeTypes(Attrs.getFnAttributes(), /*IsFunction=*/true, V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly)),
- "Attributes 'readnone and readonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::ReadOnly)),
+ "Attributes 'readnone and readonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
- "Attributes 'readnone and writeonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::WriteOnly)),
+ "Attributes 'readnone and writeonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
- "Attributes 'readonly and writeonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadOnly) &&
+ Attrs.hasFnAttribute(Attribute::WriteOnly)),
+ "Attributes 'readonly and writeonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::InaccessibleMemOrArgMemOnly)),
- "Attributes 'readnone and inaccessiblemem_or_argmemonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::InaccessibleMemOrArgMemOnly)),
+ "Attributes 'readnone and inaccessiblemem_or_argmemonly' are "
+ "incompatible!",
+ V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::InaccessibleMemOnly)),
- "Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
+ Assert(!(Attrs.hasFnAttribute(Attribute::ReadNone) &&
+ Attrs.hasFnAttribute(Attribute::InaccessibleMemOnly)),
+ "Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
- Assert(
- !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline) &&
- Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::AlwaysInline)),
- "Attributes 'noinline and alwaysinline' are incompatible!", V);
-
- if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeNone)) {
- Assert(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline),
+ Assert(!(Attrs.hasFnAttribute(Attribute::NoInline) &&
+ Attrs.hasFnAttribute(Attribute::AlwaysInline)),
+ "Attributes 'noinline and alwaysinline' are incompatible!", V);
+
+ if (Attrs.hasFnAttribute(Attribute::OptimizeNone)) {
+ Assert(Attrs.hasFnAttribute(Attribute::NoInline),
"Attribute 'optnone' requires 'noinline'!", V);
- Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::OptimizeForSize),
+ Assert(!Attrs.hasFnAttribute(Attribute::OptimizeForSize),
"Attributes 'optsize and optnone' are incompatible!", V);
- Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize),
+ Assert(!Attrs.hasFnAttribute(Attribute::MinSize),
"Attributes 'minsize and optnone' are incompatible!", V);
}
- if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::JumpTable)) {
+ if (Attrs.hasFnAttribute(Attribute::JumpTable)) {
const GlobalValue *GV = cast<GlobalValue>(V);
Assert(GV->hasGlobalUnnamedAddr(),
"Attribute 'jumptable' requires 'unnamed_addr'", V);
}
- if (Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::AllocSize)) {
+ if (Attrs.hasFnAttribute(Attribute::AllocSize)) {
std::pair<unsigned, Optional<unsigned>> Args =
- Attrs.getAllocSizeArgs(AttributeSet::FunctionIndex);
+ Attrs.getAllocSizeArgs(AttributeList::FunctionIndex);
auto CheckParam = [&](StringRef Name, unsigned ParamNo) {
if (ParamNo >= FT->getNumParams()) {
@@ -1649,8 +1667,8 @@ void Verifier::verifyFunctionMetadata(
for (const auto &Pair : MDs) {
if (Pair.first == LLVMContext::MD_prof) {
MDNode *MD = Pair.second;
- Assert(MD->getNumOperands() == 2,
- "!prof annotations should have exactly 2 operands", MD);
+ Assert(MD->getNumOperands() >= 2,
+ "!prof annotations should have at least 2 operands", MD);
// Check first operand.
Assert(MD->getOperand(0) != nullptr, "first operand should not be null",
@@ -1725,18 +1743,10 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
}
}
-bool Verifier::verifyAttributeCount(AttributeSet Attrs, unsigned Params) {
- if (Attrs.getNumSlots() == 0)
- return true;
-
- unsigned LastSlot = Attrs.getNumSlots() - 1;
- unsigned LastIndex = Attrs.getSlotIndex(LastSlot);
- if (LastIndex <= Params
- || (LastIndex == AttributeSet::FunctionIndex
- && (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params)))
- return true;
-
- return false;
+bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
+ // There shouldn't be more attribute sets than there are parameters plus the
+ // function and return value.
+ return Attrs.getNumAttrSets() <= Params + 2;
}
/// Verify that statepoint intrinsic is well formed.
@@ -1844,7 +1854,7 @@ void Verifier::verifyStatepoint(ImmutableCallSite CS) {
Assert(ExpectedNumArgs <= (int)CS.arg_size(),
"gc.statepoint too few arguments according to length fields", &CI);
- // Check that the only uses of this gc.statepoint are gc.result or
+ // Check that the only uses of this gc.statepoint are gc.result or
// gc.relocate calls which are tied to this statepoint and thus part
// of the same statepoint sequence
for (const User *U : CI.users()) {
@@ -1963,7 +1973,7 @@ void Verifier::visitFunction(const Function &F) {
Assert(!F.hasStructRetAttr() || F.getReturnType()->isVoidTy(),
"Invalid struct return type!", &F);
- AttributeSet Attrs = F.getAttributes();
+ AttributeList Attrs = F.getAttributes();
Assert(verifyAttributeCount(Attrs, FT->getNumParams()),
"Attribute after last parameter!", &F);
@@ -1974,7 +1984,7 @@ void Verifier::visitFunction(const Function &F) {
// On function declarations/definitions, we do not support the builtin
// attribute. We do not check this in VerifyFunctionAttrs since that is
// checking for Attributes that can/can not ever be on functions.
- Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::Builtin),
+ Assert(!Attrs.hasFnAttribute(Attribute::Builtin),
"Attribute 'builtin' can only be applied to a callsite.", &F);
// Check that this function meets the restrictions on this calling convention.
@@ -1984,6 +1994,19 @@ void Verifier::visitFunction(const Function &F) {
default:
case CallingConv::C:
break;
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ Assert(F.getReturnType()->isVoidTy(),
+ "Calling convention requires void return type", &F);
+ LLVM_FALLTHROUGH;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ Assert(!F.hasStructRetAttr(),
+ "Calling convention does not allow sret", &F);
+ LLVM_FALLTHROUGH;
case CallingConv::Fast:
case CallingConv::Cold:
case CallingConv::Intel_OCL_BI:
@@ -2014,7 +2037,7 @@ void Verifier::visitFunction(const Function &F) {
}
// Check that swifterror argument is only used by loads and stores.
- if (Attrs.hasAttribute(i+1, Attribute::SwiftError)) {
+ if (Attrs.hasParamAttribute(i, Attribute::SwiftError)) {
verifySwiftErrorValue(&Arg);
}
++i;
@@ -2078,13 +2101,19 @@ void Verifier::visitFunction(const Function &F) {
switch (I.first) {
default:
break;
- case LLVMContext::MD_dbg:
+ case LLVMContext::MD_dbg: {
++NumDebugAttachments;
AssertDI(NumDebugAttachments == 1,
"function must have a single !dbg attachment", &F, I.second);
AssertDI(isa<DISubprogram>(I.second),
"function !dbg attachment must be a subprogram", &F, I.second);
+ auto *SP = cast<DISubprogram>(I.second);
+ const Function *&AttachedTo = DISubprogramAttachments[SP];
+ AssertDI(!AttachedTo || AttachedTo == &F,
+ "DISubprogram attached to more than one function", SP, &F);
+ AttachedTo = &F;
break;
+ }
case LLVMContext::MD_prof:
++NumProfAttachments;
Assert(NumProfAttachments == 1,
@@ -2113,11 +2142,10 @@ void Verifier::visitFunction(const Function &F) {
"Function is marked as dllimport, but not external.", &F);
auto *N = F.getSubprogram();
- if (!N)
+ HasDebugInfo = (N != nullptr);
+ if (!HasDebugInfo)
return;
- visitDISubprogram(*N);
-
// Check that all !dbg attachments lead back to N (or, at least, another
// subprogram that describes the same function).
//
@@ -2476,15 +2504,13 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
Type *SrcTy = I.getOperand(0)->getType();
Type *DestTy = I.getType();
- Assert(SrcTy->getScalarType()->isPointerTy(),
- "PtrToInt source must be pointer", &I);
+ Assert(SrcTy->isPtrOrPtrVectorTy(), "PtrToInt source must be pointer", &I);
if (auto *PTy = dyn_cast<PointerType>(SrcTy->getScalarType()))
Assert(!DL.isNonIntegralPointerType(PTy),
"ptrtoint not supported for non-integral pointers");
- Assert(DestTy->getScalarType()->isIntegerTy(),
- "PtrToInt result must be integral", &I);
+ Assert(DestTy->isIntOrIntVectorTy(), "PtrToInt result must be integral", &I);
Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(), "PtrToInt type mismatch",
&I);
@@ -2503,10 +2529,9 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
Type *SrcTy = I.getOperand(0)->getType();
Type *DestTy = I.getType();
- Assert(SrcTy->getScalarType()->isIntegerTy(),
+ Assert(SrcTy->isIntOrIntVectorTy(),
"IntToPtr source must be an integral", &I);
- Assert(DestTy->getScalarType()->isPointerTy(),
- "IntToPtr result must be a pointer", &I);
+ Assert(DestTy->isPtrOrPtrVectorTy(), "IntToPtr result must be a pointer", &I);
if (auto *PTy = dyn_cast<PointerType>(DestTy->getScalarType()))
Assert(!DL.isNonIntegralPointerType(PTy),
@@ -2601,11 +2626,20 @@ void Verifier::verifyCallSite(CallSite CS) {
"Call parameter type does not match function signature!",
CS.getArgument(i), FTy->getParamType(i), I);
- AttributeSet Attrs = CS.getAttributes();
+ AttributeList Attrs = CS.getAttributes();
Assert(verifyAttributeCount(Attrs, CS.arg_size()),
"Attribute after last parameter!", I);
+ if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::Speculatable)) {
+ // Don't allow speculatable on call sites, unless the underlying function
+ // declaration is also speculatable.
+ Function *Callee
+ = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+ Assert(Callee && Callee->isSpeculatable(),
+ "speculatable attribute may not apply to call sites", I);
+ }
+
// Verify call attributes.
verifyFunctionAttrs(FTy, Attrs, I);
@@ -2623,7 +2657,7 @@ void Verifier::verifyCallSite(CallSite CS) {
// make sure the underlying alloca/parameter it comes from has a swifterror as
// well.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
- if (CS.paramHasAttr(i+1, Attribute::SwiftError)) {
+ if (CS.paramHasAttr(i, Attribute::SwiftError)) {
Value *SwiftErrorArg = CS.getArgument(i);
if (auto AI = dyn_cast<AllocaInst>(SwiftErrorArg->stripInBoundsOffsets())) {
Assert(AI->isSwiftError(),
@@ -2641,24 +2675,25 @@ void Verifier::verifyCallSite(CallSite CS) {
bool SawNest = false;
bool SawReturned = false;
- for (unsigned Idx = 1; Idx < 1 + FTy->getNumParams(); ++Idx) {
- if (Attrs.hasAttribute(Idx, Attribute::Nest))
+ for (unsigned Idx = 0; Idx < FTy->getNumParams(); ++Idx) {
+ if (Attrs.hasParamAttribute(Idx, Attribute::Nest))
SawNest = true;
- if (Attrs.hasAttribute(Idx, Attribute::Returned))
+ if (Attrs.hasParamAttribute(Idx, Attribute::Returned))
SawReturned = true;
}
// Check attributes on the varargs part.
- for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) {
- Type *Ty = CS.getArgument(Idx-1)->getType();
- verifyParameterAttrs(Attrs, Idx, Ty, false, I);
+ for (unsigned Idx = FTy->getNumParams(); Idx < CS.arg_size(); ++Idx) {
+ Type *Ty = CS.getArgument(Idx)->getType();
+ AttributeSet ArgAttrs = Attrs.getParamAttributes(Idx);
+ verifyParameterAttrs(ArgAttrs, Ty, I);
- if (Attrs.hasAttribute(Idx, Attribute::Nest)) {
+ if (ArgAttrs.hasAttribute(Attribute::Nest)) {
Assert(!SawNest, "More than one parameter has attribute nest!", I);
SawNest = true;
}
- if (Attrs.hasAttribute(Idx, Attribute::Returned)) {
+ if (ArgAttrs.hasAttribute(Attribute::Returned)) {
Assert(!SawReturned, "More than one parameter has attribute returned!",
I);
Assert(Ty->canLosslesslyBitCastTo(FTy->getReturnType()),
@@ -2668,11 +2703,12 @@ void Verifier::verifyCallSite(CallSite CS) {
SawReturned = true;
}
- Assert(!Attrs.hasAttribute(Idx, Attribute::StructRet),
+ Assert(!ArgAttrs.hasAttribute(Attribute::StructRet),
"Attribute 'sret' cannot be used for vararg call arguments!", I);
- if (Attrs.hasAttribute(Idx, Attribute::InAlloca))
- Assert(Idx == CS.arg_size(), "inalloca isn't on the last argument!", I);
+ if (ArgAttrs.hasAttribute(Attribute::InAlloca))
+ Assert(Idx == CS.arg_size() - 1, "inalloca isn't on the last argument!",
+ I);
}
}
@@ -2726,9 +2762,9 @@ void Verifier::verifyCallSite(CallSite CS) {
// do so causes assertion failures when the inliner sets up inline scope info.
if (I->getFunction()->getSubprogram() && CS.getCalledFunction() &&
CS.getCalledFunction()->getSubprogram())
- Assert(I->getDebugLoc(), "inlinable function call in a function with debug "
- "info must have a !dbg location",
- I);
+ AssertDI(I->getDebugLoc(), "inlinable function call in a function with "
+ "debug info must have a !dbg location",
+ I);
visitInstruction(*I);
}
@@ -2745,18 +2781,18 @@ static bool isTypeCongruent(Type *L, Type *R) {
return PL->getAddressSpace() == PR->getAddressSpace();
}
-static AttrBuilder getParameterABIAttributes(int I, AttributeSet Attrs) {
+static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) {
static const Attribute::AttrKind ABIAttrs[] = {
Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
Attribute::InReg, Attribute::Returned, Attribute::SwiftSelf,
Attribute::SwiftError};
AttrBuilder Copy;
for (auto AK : ABIAttrs) {
- if (Attrs.hasAttribute(I + 1, AK))
+ if (Attrs.hasParamAttribute(I, AK))
Copy.addAttribute(AK);
}
- if (Attrs.hasAttribute(I + 1, Attribute::Alignment))
- Copy.addAlignmentAttr(Attrs.getParamAlignment(I + 1));
+ if (Attrs.hasParamAttribute(I, Attribute::Alignment))
+ Copy.addAlignmentAttr(Attrs.getParamAlignment(I));
return Copy;
}
@@ -2787,8 +2823,8 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
// - All ABI-impacting function attributes, such as sret, byval, inreg,
// returned, and inalloca, must match.
- AttributeSet CallerAttrs = F->getAttributes();
- AttributeSet CalleeAttrs = CI.getAttributes();
+ AttributeList CallerAttrs = F->getAttributes();
+ AttributeList CalleeAttrs = CI.getAttributes();
for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
AttrBuilder CallerABIAttrs = getParameterABIAttributes(I, CallerAttrs);
AttrBuilder CalleeABIAttrs = getParameterABIAttributes(I, CalleeAttrs);
@@ -2913,11 +2949,10 @@ void Verifier::visitICmpInst(ICmpInst &IC) {
Assert(Op0Ty == Op1Ty,
"Both operands to ICmp instruction are not of the same type!", &IC);
// Check that the operands are the right type
- Assert(Op0Ty->isIntOrIntVectorTy() || Op0Ty->getScalarType()->isPointerTy(),
+ Assert(Op0Ty->isIntOrIntVectorTy() || Op0Ty->isPtrOrPtrVectorTy(),
"Invalid operand types for ICmp instruction", &IC);
// Check that the predicate is valid.
- Assert(IC.getPredicate() >= CmpInst::FIRST_ICMP_PREDICATE &&
- IC.getPredicate() <= CmpInst::LAST_ICMP_PREDICATE,
+ Assert(IC.isIntPredicate(),
"Invalid predicate in ICmp instruction!", &IC);
visitInstruction(IC);
@@ -2933,8 +2968,7 @@ void Verifier::visitFCmpInst(FCmpInst &FC) {
Assert(Op0Ty->isFPOrFPVectorTy(),
"Invalid operand types for FCmp instruction", &FC);
// Check that the predicate is valid.
- Assert(FC.getPredicate() >= CmpInst::FIRST_FCMP_PREDICATE &&
- FC.getPredicate() <= CmpInst::LAST_FCMP_PREDICATE,
+ Assert(FC.isFPPredicate(),
"Invalid predicate in FCmp instruction!", &FC);
visitInstruction(FC);
@@ -2972,7 +3006,7 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
GetElementPtrInst::getIndexedType(GEP.getSourceElementType(), Idxs);
Assert(ElTy, "Invalid indices for GEP pointer type!", &GEP);
- Assert(GEP.getType()->getScalarType()->isPointerTy() &&
+ Assert(GEP.getType()->isPtrOrPtrVectorTy() &&
GEP.getResultElementType() == ElTy,
"GEP is not of right type for indices!", &GEP, ElTy);
@@ -2988,7 +3022,7 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
unsigned IndexWidth = IndexTy->getVectorNumElements();
Assert(IndexWidth == GEPWidth, "Invalid GEP index vector width", &GEP);
}
- Assert(IndexTy->getScalarType()->isIntegerTy(),
+ Assert(IndexTy->isIntOrIntVectorTy(),
"All GEP indices should be of integer type");
}
}
@@ -3074,7 +3108,7 @@ void Verifier::visitLoadInst(LoadInst &LI) {
ElTy, &LI);
checkAtomicMemAccessSize(ElTy, &LI);
} else {
- Assert(LI.getSynchScope() == CrossThread,
+ Assert(LI.getSyncScopeID() == SyncScope::System,
"Non-atomic load cannot have SynchronizationScope specified", &LI);
}
@@ -3103,7 +3137,7 @@ void Verifier::visitStoreInst(StoreInst &SI) {
ElTy, &SI);
checkAtomicMemAccessSize(ElTy, &SI);
} else {
- Assert(SI.getSynchScope() == CrossThread,
+ Assert(SI.getSyncScopeID() == SyncScope::System,
"Non-atomic store cannot have SynchronizationScope specified", &SI);
}
visitInstruction(SI);
@@ -3116,7 +3150,7 @@ void Verifier::verifySwiftErrorCallSite(CallSite CS,
for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
I != E; ++I, ++Idx) {
if (*I == SwiftErrorVal) {
- Assert(CS.paramHasAttr(Idx+1, Attribute::SwiftError),
+ Assert(CS.paramHasAttr(Idx, Attribute::SwiftError),
"swifterror value when used in a callsite should be marked "
"with swifterror attribute",
SwiftErrorVal, CS);
@@ -3148,8 +3182,9 @@ void Verifier::verifySwiftErrorValue(const Value *SwiftErrorVal) {
void Verifier::visitAllocaInst(AllocaInst &AI) {
SmallPtrSet<Type*, 4> Visited;
PointerType *PTy = AI.getType();
- Assert(PTy->getAddressSpace() == 0,
- "Allocation instruction pointer not in the generic address space!",
+ // TODO: Relax this restriction?
+ Assert(PTy->getAddressSpace() == DL.getAllocaAddrSpace(),
+ "Allocation instruction pointer not in the stack address space!",
&AI);
Assert(AI.getAllocatedType()->isSized(&Visited),
"Cannot allocate unsized type", &AI);
@@ -3901,7 +3936,7 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
// If the intrinsic takes MDNode arguments, verify that they are either global
// or are local to *this* function.
- for (Value *V : CS.args())
+ for (Value *V : CS.args())
if (auto *MD = dyn_cast<MetadataAsValue>(V))
visitMetadataAsValue(*MD, CS.getCaller());
@@ -3929,6 +3964,26 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"constant int",
CS);
break;
+ case Intrinsic::experimental_constrained_fadd:
+ case Intrinsic::experimental_constrained_fsub:
+ case Intrinsic::experimental_constrained_fmul:
+ case Intrinsic::experimental_constrained_fdiv:
+ case Intrinsic::experimental_constrained_frem:
+ case Intrinsic::experimental_constrained_sqrt:
+ case Intrinsic::experimental_constrained_pow:
+ case Intrinsic::experimental_constrained_powi:
+ case Intrinsic::experimental_constrained_sin:
+ case Intrinsic::experimental_constrained_cos:
+ case Intrinsic::experimental_constrained_exp:
+ case Intrinsic::experimental_constrained_exp2:
+ case Intrinsic::experimental_constrained_log:
+ case Intrinsic::experimental_constrained_log10:
+ case Intrinsic::experimental_constrained_log2:
+ case Intrinsic::experimental_constrained_rint:
+ case Intrinsic::experimental_constrained_nearbyint:
+ visitConstrainedFPIntrinsic(
+ cast<ConstrainedFPIntrinsic>(*CS.getInstruction()));
+ break;
case Intrinsic::dbg_declare: // llvm.dbg.declare
Assert(isa<MetadataAsValue>(CS.getArgOperand(0)),
"invalid llvm.dbg.declare intrinsic call 1", CS);
@@ -3952,10 +4007,16 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
CS);
break;
}
- case Intrinsic::memcpy_element_atomic: {
- ConstantInt *ElementSizeCI = dyn_cast<ConstantInt>(CS.getArgOperand(3));
- Assert(ElementSizeCI, "element size of the element-wise atomic memory "
- "intrinsic must be a constant int",
+ case Intrinsic::memcpy_element_unordered_atomic: {
+ const ElementUnorderedAtomicMemCpyInst *MI =
+ cast<ElementUnorderedAtomicMemCpyInst>(CS.getInstruction());
+
+ ConstantInt *ElementSizeCI =
+ dyn_cast<ConstantInt>(MI->getRawElementSizeInBytes());
+ Assert(ElementSizeCI,
+ "element size of the element-wise unordered atomic memory "
+ "intrinsic must be a constant int",
CS);
const APInt &ElementSizeVal = ElementSizeCI->getValue();
Assert(ElementSizeVal.isPowerOf2(),
@@ -3963,19 +4024,91 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
"must be a power of 2",
CS);
+ if (auto *LengthCI = dyn_cast<ConstantInt>(MI->getLength())) {
+ uint64_t Length = LengthCI->getZExtValue();
+ uint64_t ElementSize = MI->getElementSizeInBytes();
+ Assert((Length % ElementSize) == 0,
+ "constant length must be a multiple of the element size in the "
+ "element-wise atomic memory intrinsic",
+ CS);
+ }
+
auto IsValidAlignment = [&](uint64_t Alignment) {
return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
};
-
- uint64_t DstAlignment = CS.getParamAlignment(1),
- SrcAlignment = CS.getParamAlignment(2);
-
+ uint64_t DstAlignment = CS.getParamAlignment(0),
+ SrcAlignment = CS.getParamAlignment(1);
Assert(IsValidAlignment(DstAlignment),
- "incorrect alignment of the destination argument",
+ "incorrect alignment of the destination argument", CS);
+ Assert(IsValidAlignment(SrcAlignment),
+ "incorrect alignment of the source argument", CS);
+ break;
+ }
+ case Intrinsic::memmove_element_unordered_atomic: {
+ auto *MI = cast<ElementUnorderedAtomicMemMoveInst>(CS.getInstruction());
+
+ ConstantInt *ElementSizeCI =
+ dyn_cast<ConstantInt>(MI->getRawElementSizeInBytes());
+ Assert(ElementSizeCI,
+ "element size of the element-wise unordered atomic memory "
+ "intrinsic must be a constant int",
+ CS);
+ const APInt &ElementSizeVal = ElementSizeCI->getValue();
+ Assert(ElementSizeVal.isPowerOf2(),
+ "element size of the element-wise atomic memory intrinsic "
+ "must be a power of 2",
CS);
+
+ if (auto *LengthCI = dyn_cast<ConstantInt>(MI->getLength())) {
+ uint64_t Length = LengthCI->getZExtValue();
+ uint64_t ElementSize = MI->getElementSizeInBytes();
+ Assert((Length % ElementSize) == 0,
+ "constant length must be a multiple of the element size in the "
+ "element-wise atomic memory intrinsic",
+ CS);
+ }
+
+ auto IsValidAlignment = [&](uint64_t Alignment) {
+ return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
+ };
+ uint64_t DstAlignment = CS.getParamAlignment(0),
+ SrcAlignment = CS.getParamAlignment(1);
+ Assert(IsValidAlignment(DstAlignment),
+ "incorrect alignment of the destination argument", CS);
Assert(IsValidAlignment(SrcAlignment),
- "incorrect alignment of the source argument",
+ "incorrect alignment of the source argument", CS);
+ break;
+ }
+ case Intrinsic::memset_element_unordered_atomic: {
+ auto *MI = cast<ElementUnorderedAtomicMemSetInst>(CS.getInstruction());
+
+ ConstantInt *ElementSizeCI =
+ dyn_cast<ConstantInt>(MI->getRawElementSizeInBytes());
+ Assert(ElementSizeCI,
+ "element size of the element-wise unordered atomic memory "
+ "intrinsic must be a constant int",
+ CS);
+ const APInt &ElementSizeVal = ElementSizeCI->getValue();
+ Assert(ElementSizeVal.isPowerOf2(),
+ "element size of the element-wise atomic memory intrinsic "
+ "must be a power of 2",
CS);
+
+ if (auto *LengthCI = dyn_cast<ConstantInt>(MI->getLength())) {
+ uint64_t Length = LengthCI->getZExtValue();
+ uint64_t ElementSize = MI->getElementSizeInBytes();
+ Assert((Length % ElementSize) == 0,
+ "constant length must be a multiple of the element size in the "
+ "element-wise atomic memory intrinsic",
+ CS);
+ }
+
+ auto IsValidAlignment = [&](uint64_t Alignment) {
+ return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
+ };
+ uint64_t DstAlignment = CS.getParamAlignment(0);
+ Assert(IsValidAlignment(DstAlignment),
+ "incorrect alignment of the destination argument", CS);
break;
}
case Intrinsic::gcroot:
@@ -4182,7 +4315,7 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
// relocated pointer. It can be casted to the correct type later if it's
// desired. However, they must have the same address space and 'vectorness'
GCRelocateInst &Relocate = cast<GCRelocateInst>(*CS.getInstruction());
- Assert(Relocate.getDerivedPtr()->getType()->getScalarType()->isPointerTy(),
+ Assert(Relocate.getDerivedPtr()->getType()->isPtrOrPtrVectorTy(),
"gc.relocate: relocated value must be a gc pointer", CS);
auto ResultType = CS.getType();
@@ -4205,7 +4338,7 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
}
case Intrinsic::masked_load: {
Assert(CS.getType()->isVectorTy(), "masked_load: must return a vector", CS);
-
+
Value *Ptr = CS.getArgOperand(0);
//Value *Alignment = CS.getArgOperand(1);
Value *Mask = CS.getArgOperand(2);
@@ -4215,12 +4348,12 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
// DataTy is the overloaded type
Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
- Assert(DataTy == CS.getType(),
+ Assert(DataTy == CS.getType(),
"masked_load: return must match pointer type", CS);
Assert(PassThru->getType() == DataTy,
"masked_load: pass through and data type must match", CS);
Assert(Mask->getType()->getVectorNumElements() ==
- DataTy->getVectorNumElements(),
+ DataTy->getVectorNumElements(),
"masked_load: vector mask must be same length as data", CS);
break;
}
@@ -4234,10 +4367,10 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) {
// DataTy is the overloaded type
Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
- Assert(DataTy == Val->getType(),
+ Assert(DataTy == Val->getType(),
"masked_store: storee must match pointer type", CS);
Assert(Mask->getType()->getVectorNumElements() ==
- DataTy->getVectorNumElements(),
+ DataTy->getVectorNumElements(),
"masked_store: vector mask must be same length as data", CS);
break;
}
@@ -4294,6 +4427,20 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) {
return nullptr;
}
+void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
+ unsigned NumOperands = FPI.getNumArgOperands();
+ Assert(((NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)),
+ "invalid arguments for constrained FP intrinsic", &FPI);
+ Assert(isa<MetadataAsValue>(FPI.getArgOperand(NumOperands-1)),
+ "invalid exception behavior argument", &FPI);
+ Assert(isa<MetadataAsValue>(FPI.getArgOperand(NumOperands-2)),
+ "invalid rounding mode argument", &FPI);
+ Assert(FPI.getRoundingMode() != ConstrainedFPIntrinsic::rmInvalid,
+ "invalid rounding mode argument", &FPI);
+ Assert(FPI.getExceptionBehavior() != ConstrainedFPIntrinsic::ebInvalid,
+ "invalid exception behavior argument", &FPI);
+}
+
template <class DbgIntrinsicTy>
void Verifier::visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII) {
auto *MD = cast<MetadataAsValue>(DII.getArgOperand(0))->getMetadata();
@@ -4330,6 +4477,8 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII) {
" variable and !dbg attachment",
&DII, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
Loc->getScope()->getSubprogram());
+
+ verifyFnArgs(DII);
}
static uint64_t getVariableSize(const DILocalVariable &V) {
@@ -4398,15 +4547,49 @@ void Verifier::verifyFragmentExpression(const DbgInfoIntrinsic &I) {
AssertDI(FragSize != VarSize, "fragment covers entire variable", &I, V, E);
}
+void Verifier::verifyFnArgs(const DbgInfoIntrinsic &I) {
+ // This function does not take the scope of noninlined function arguments into
+ // account. Don't run it if the current function is nodebug, because it may
+ // contain inlined debug intrinsics.
+ if (!HasDebugInfo)
+ return;
+
+ DILocalVariable *Var;
+ if (auto *DV = dyn_cast<DbgValueInst>(&I)) {
+ // For performance reasons only check non-inlined ones.
+ if (DV->getDebugLoc()->getInlinedAt())
+ return;
+ Var = DV->getVariable();
+ } else {
+ auto *DD = cast<DbgDeclareInst>(&I);
+ if (DD->getDebugLoc()->getInlinedAt())
+ return;
+ Var = DD->getVariable();
+ }
+ AssertDI(Var, "dbg intrinsic without variable");
+
+ unsigned ArgNo = Var->getArg();
+ if (!ArgNo)
+ return;
+
+ // Verify there are no duplicate function argument debug info entries.
+ // These will cause hard-to-debug assertions in the DWARF backend.
+ if (DebugFnArgs.size() < ArgNo)
+ DebugFnArgs.resize(ArgNo, nullptr);
+
+ auto *Prev = DebugFnArgs[ArgNo - 1];
+ DebugFnArgs[ArgNo - 1] = Var;
+ AssertDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I,
+ Prev, Var);
+}
+
void Verifier::verifyCompileUnits() {
auto *CUs = M.getNamedMetadata("llvm.dbg.cu");
SmallPtrSet<const Metadata *, 2> Listed;
if (CUs)
Listed.insert(CUs->op_begin(), CUs->op_end());
- AssertDI(
- all_of(CUVisited,
- [&Listed](const Metadata *CU) { return Listed.count(CU); }),
- "All DICompileUnits must be listed in llvm.dbg.cu");
+ for (auto *CU : CUVisited)
+ AssertDI(Listed.count(CU), "DICompileUnit not listed in llvm.dbg.cu", CU);
CUVisited.clear();
}
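
The Verifier hunks above all follow a single migration: per-index queries against one monolithic AttributeSet (return value at index 0, parameters at 1..N, the function at ~0U) become queries against AttributeList and the per-position AttributeSet objects it hands out. As a minimal sketch of the new query style (using only accessors that appear in the hunks above; the function queryAttrs and its use of F are hypothetical):

  #include "llvm/IR/Attributes.h"
  #include "llvm/IR/Function.h"

  // Sketch, not verifier code: the post-AttributeList query style.
  static void queryAttrs(const llvm::Function &F) {
    llvm::AttributeList AL = F.getAttributes();

    // Function attributes no longer need AttributeList::FunctionIndex.
    bool NoInline = AL.hasFnAttribute(llvm::Attribute::NoInline);

    // Parameter queries are zero-based; the old interface used ArgNo + 1.
    bool SRetOnArg0 = AL.hasParamAttribute(0, llvm::Attribute::StructRet);

    // Each position exposes its own AttributeSet, so the index is not
    // repeated on every query.
    llvm::AttributeSet RetAttrs = AL.getRetAttributes();
    bool ZExtRet = RetAttrs.hasAttribute(llvm::Attribute::ZExt);

    (void)NoInline; (void)SRetOnArg0; (void)ZExtRet;
  }

The zero-based numbering is also why the sret position check changes from 'Idx == 1 || Idx == 2' to 'i == 0 || i == 1' and the inalloca check becomes 'i == FT->getNumParams() - 1'.
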
diff --git a/contrib/llvm/lib/LTO/Caching.cpp b/contrib/llvm/lib/LTO/Caching.cpp
index fd5bdb0..e32e46c 100644
--- a/contrib/llvm/lib/LTO/Caching.cpp
+++ b/contrib/llvm/lib/LTO/Caching.cpp
@@ -13,6 +13,7 @@
#include "llvm/LTO/Caching.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -21,70 +22,71 @@
using namespace llvm;
using namespace llvm::lto;
-static void commitEntry(StringRef TempFilename, StringRef EntryPath) {
- // Rename to final destination (hopefully race condition won't matter here)
- auto EC = sys::fs::rename(TempFilename, EntryPath);
- if (EC) {
- // Renaming failed, probably not the same filesystem, copy and delete.
- // FIXME: Avoid needing to do this by creating the temporary file in the
- // cache directory.
- {
- auto ReloadedBufferOrErr = MemoryBuffer::getFile(TempFilename);
- if (auto EC = ReloadedBufferOrErr.getError())
- report_fatal_error(Twine("Failed to open temp file '") + TempFilename +
- "': " + EC.message() + "\n");
+Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
+ AddBufferFn AddBuffer) {
+ if (std::error_code EC = sys::fs::create_directories(CacheDirectoryPath))
+ return errorCodeToError(EC);
- raw_fd_ostream OS(EntryPath, EC, sys::fs::F_None);
- if (EC)
- report_fatal_error(Twine("Failed to open ") + EntryPath +
- " to save cached entry\n");
- // I'm not sure what are the guarantee if two processes are doing this
- // at the same time.
- OS << (*ReloadedBufferOrErr)->getBuffer();
- }
- sys::fs::remove(TempFilename);
- }
-}
-
-NativeObjectCache lto::localCache(std::string CacheDirectoryPath,
- AddFileFn AddFile) {
return [=](unsigned Task, StringRef Key) -> AddStreamFn {
- // First, see if we have a cache hit.
+ // This choice of file name allows the cache to be pruned (see pruneCache()
+ // in include/llvm/Support/CachePruning.h).
SmallString<64> EntryPath;
- sys::path::append(EntryPath, CacheDirectoryPath, Key);
- if (sys::fs::exists(EntryPath)) {
- AddFile(Task, EntryPath);
+ sys::path::append(EntryPath, CacheDirectoryPath, "llvmcache-" + Key);
+ // First, see if we have a cache hit.
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getFile(EntryPath);
+ if (MBOrErr) {
+ AddBuffer(Task, std::move(*MBOrErr));
return AddStreamFn();
}
+ if (MBOrErr.getError() != errc::no_such_file_or_directory)
+ report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
+ ": " + MBOrErr.getError().message() + "\n");
+
// This native object stream is responsible for committing the resulting
- // file to the cache and calling AddFile to add it to the link.
+ // file to the cache and calling AddBuffer to add it to the link.
struct CacheStream : NativeObjectStream {
- AddFileFn AddFile;
+ AddBufferFn AddBuffer;
std::string TempFilename;
std::string EntryPath;
unsigned Task;
- CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddFileFn AddFile,
+ CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddBufferFn AddBuffer,
std::string TempFilename, std::string EntryPath,
unsigned Task)
- : NativeObjectStream(std::move(OS)), AddFile(AddFile),
- TempFilename(TempFilename), EntryPath(EntryPath), Task(Task) {}
+ : NativeObjectStream(std::move(OS)), AddBuffer(std::move(AddBuffer)),
+ TempFilename(std::move(TempFilename)),
+ EntryPath(std::move(EntryPath)), Task(Task) {}
~CacheStream() {
+ // FIXME: This code could race with the cache pruner, but it is unlikely
+ // that the cache pruner will choose to remove a newly created file.
+
// Make sure the file is closed before committing it.
OS.reset();
- commitEntry(TempFilename, EntryPath);
- AddFile(Task, EntryPath);
+ // This is atomic on POSIX systems.
+ if (auto EC = sys::fs::rename(TempFilename, EntryPath))
+ report_fatal_error(Twine("Failed to rename temporary file ") +
+ TempFilename + ": " + EC.message() + "\n");
+
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getFile(EntryPath);
+ if (!MBOrErr)
+ report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
+ ": " + MBOrErr.getError().message() + "\n");
+ AddBuffer(Task, std::move(*MBOrErr));
}
};
return [=](size_t Task) -> std::unique_ptr<NativeObjectStream> {
// Write to a temporary file to avoid a race condition
int TempFD;
- SmallString<64> TempFilename;
+ SmallString<64> TempFilenameModel, TempFilename;
+ sys::path::append(TempFilenameModel, CacheDirectoryPath, "Thin-%%%%%%.tmp.o");
std::error_code EC =
- sys::fs::createTemporaryFile("Thin", "tmp.o", TempFD, TempFilename);
+ sys::fs::createUniqueFile(TempFilenameModel, TempFD, TempFilename,
+ sys::fs::owner_read | sys::fs::owner_write);
if (EC) {
errs() << "Error: " << EC.message() << "\n";
report_fatal_error("ThinLTO: Can't get a temporary file");
@@ -93,7 +95,7 @@ NativeObjectCache lto::localCache(std::string CacheDirectoryPath,
// This CacheStream will move the temporary file into the cache when done.
return llvm::make_unique<CacheStream>(
llvm::make_unique<raw_fd_ostream>(TempFD, /* ShouldClose */ true),
- AddFile, TempFilename.str(), EntryPath.str(), Task);
+ AddBuffer, TempFilename.str(), EntryPath.str(), Task);
};
};
}
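
The Caching.cpp rewrite above swaps the AddFileFn path callback for AddBufferFn, which receives the cached native object as an in-memory MemoryBuffer, and localCache now returns Expected<NativeObjectCache> so a failure to create the cache directory surfaces as an error at setup time. A minimal caller sketch under those assumptions (openCache and the callback body are hypothetical; a real LTO client would add the buffer to the link rather than print its size):

  #include "llvm/LTO/Caching.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/MemoryBuffer.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  static Expected<lto::NativeObjectCache> openCache() {
    // The cache invokes the callback for hits and for freshly written
    // entries alike; Task identifies the codegen task the object is for.
    return lto::localCache("lto.cache", [](unsigned Task,
                                           std::unique_ptr<MemoryBuffer> MB) {
      outs() << "task " << Task << ": " << MB->getBufferSize() << " bytes\n";
    });
  }

Note the 'llvmcache-' prefix added to entry file names: per the comment in the hunk, it is what lets pruneCache() in Support/CachePruning.h identify and age out cache files.
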
diff --git a/contrib/llvm/lib/LTO/LTO.cpp b/contrib/llvm/lib/LTO/LTO.cpp
index e3e2f9f..1997394 100644
--- a/contrib/llvm/lib/LTO/LTO.cpp
+++ b/contrib/llvm/lib/LTO/LTO.cpp
@@ -20,9 +20,12 @@
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/LTO/LTOBackend.h"
#include "llvm/Linker/IRMover.h"
-#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
@@ -31,6 +34,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
+#include "llvm/Support/VCSRevision.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -46,6 +50,12 @@ using namespace object;
#define DEBUG_TYPE "lto"
+// The values are (type identifier, summary) pairs.
+typedef DenseMap<
+ GlobalValue::GUID,
+ TinyPtrVector<const std::pair<const std::string, TypeIdSummary> *>>
+ TypeIdSummariesByGuidTy;
+
// Returns a unique hash for the Module considering the current list of
// export/import and other global analysis results.
// The hash is produced in \p Key.
@@ -54,7 +64,8 @@ static void computeCacheKey(
StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList,
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
- const GVSummaryMapTy &DefinedGlobals) {
+ const GVSummaryMapTy &DefinedGlobals,
+ const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
// Compute the unique hash for this entry.
// This is based on the current compiler version, the module itself, the
// export list, the hash for every single module in the import list, the
@@ -63,7 +74,7 @@ static void computeCacheKey(
// Start with the compiler revision
Hasher.update(LLVM_VERSION_STRING);
-#ifdef HAVE_LLVM_REVISION
+#ifdef LLVM_REVISION
Hasher.update(LLVM_REVISION);
#endif
@@ -80,6 +91,18 @@ static void computeCacheKey(
Data[3] = I >> 24;
Hasher.update(ArrayRef<uint8_t>{Data, 4});
};
+ auto AddUint64 = [&](uint64_t I) {
+ uint8_t Data[8];
+ Data[0] = I;
+ Data[1] = I >> 8;
+ Data[2] = I >> 16;
+ Data[3] = I >> 24;
+ Data[4] = I >> 32;
+ Data[5] = I >> 40;
+ Data[6] = I >> 48;
+ Data[7] = I >> 56;
+ Hasher.update(ArrayRef<uint8_t>{Data, 8});
+ };
AddString(Conf.CPU);
// FIXME: Hash more of Options. For now all clients initialize Options from
// command-line flags (which is unsupported in production), but may set
@@ -91,10 +114,15 @@ static void computeCacheKey(
AddUnsigned((unsigned)Conf.Options.DebuggerTuning);
for (auto &A : Conf.MAttrs)
AddString(A);
- AddUnsigned(Conf.RelocModel);
+ if (Conf.RelocModel)
+ AddUnsigned(*Conf.RelocModel);
+ else
+ AddUnsigned(-1);
AddUnsigned(Conf.CodeModel);
AddUnsigned(Conf.CGOptLevel);
+ AddUnsigned(Conf.CGFileType);
AddUnsigned(Conf.OptLevel);
+ AddUnsigned(Conf.UseNewPM);
AddString(Conf.OptPipeline);
AddString(Conf.AAPipeline);
AddString(Conf.OverrideTriple);
@@ -107,10 +135,16 @@ static void computeCacheKey(
// The export list can impact the internalization, be conservative here
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&F, sizeof(F)));
- // Include the hash for every module we import functions from
+ // Include the hash for every module we import functions from. The set of
+ // imported symbols for each module may affect code generation and is
+ // sensitive to link order, so include that as well.
for (auto &Entry : ImportList) {
auto ModHash = Index.getModuleHash(Entry.first());
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
+
+ AddUint64(Entry.second.size());
+ for (auto &Fn : Entry.second)
+ AddUint64(Fn.first);
}
// Include the hash for the resolved ODR.
@@ -121,12 +155,68 @@ static void computeCacheKey(
sizeof(GlobalValue::LinkageTypes)));
}
+ std::set<GlobalValue::GUID> UsedTypeIds;
+
+ auto AddUsedTypeIds = [&](GlobalValueSummary *GS) {
+ auto *FS = dyn_cast_or_null<FunctionSummary>(GS);
+ if (!FS)
+ return;
+ for (auto &TT : FS->type_tests())
+ UsedTypeIds.insert(TT);
+ for (auto &TT : FS->type_test_assume_vcalls())
+ UsedTypeIds.insert(TT.GUID);
+ for (auto &TT : FS->type_checked_load_vcalls())
+ UsedTypeIds.insert(TT.GUID);
+ for (auto &TT : FS->type_test_assume_const_vcalls())
+ UsedTypeIds.insert(TT.VFunc.GUID);
+ for (auto &TT : FS->type_checked_load_const_vcalls())
+ UsedTypeIds.insert(TT.VFunc.GUID);
+ };
+
// Include the hash for the linkage type to reflect internalization and weak
- // resolution.
+ // resolution, and collect any used type identifier resolutions.
for (auto &GS : DefinedGlobals) {
GlobalValue::LinkageTypes Linkage = GS.second->linkage();
Hasher.update(
ArrayRef<uint8_t>((const uint8_t *)&Linkage, sizeof(Linkage)));
+ AddUsedTypeIds(GS.second);
+ }
+
+ // Imported functions may introduce new uses of type identifier resolutions,
+ // so we need to collect their used resolutions as well.
+ for (auto &ImpM : ImportList)
+ for (auto &ImpF : ImpM.second)
+ AddUsedTypeIds(Index.findSummaryInModule(ImpF.first, ImpM.first()));
+
+ auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) {
+ AddString(TId);
+
+ AddUnsigned(S.TTRes.TheKind);
+ AddUnsigned(S.TTRes.SizeM1BitWidth);
+
+ AddUint64(S.WPDRes.size());
+ for (auto &WPD : S.WPDRes) {
+ AddUnsigned(WPD.first);
+ AddUnsigned(WPD.second.TheKind);
+ AddString(WPD.second.SingleImplName);
+
+ AddUint64(WPD.second.ResByArg.size());
+ for (auto &ByArg : WPD.second.ResByArg) {
+ AddUint64(ByArg.first.size());
+ for (uint64_t Arg : ByArg.first)
+ AddUint64(Arg);
+ AddUnsigned(ByArg.second.TheKind);
+ AddUint64(ByArg.second.Info);
+ }
+ }
+ };
+
+ // Include the hash for all type identifiers used by this module.
+ for (GlobalValue::GUID TId : UsedTypeIds) {
+ auto SummariesI = TypeIdSummariesByGuid.find(TId);
+ if (SummariesI != TypeIdSummariesByGuid.end())
+ for (auto *Summary : SummariesI->second)
+ AddTypeIdSummary(Summary->first, Summary->second);
}
if (!Conf.SampleProfile.empty()) {
@@ -164,9 +254,7 @@ static void thinLTOResolveWeakForLinkerGUID(
}
// Alias and aliasee can't be turned into available_externally.
else if (!isa<AliasSummary>(S.get()) &&
- !GlobalInvolvedWithAlias.count(S.get()) &&
- (GlobalValue::isLinkOnceODRLinkage(OriginalLinkage) ||
- GlobalValue::isWeakODRLinkage(OriginalLinkage)))
+ !GlobalInvolvedWithAlias.count(S.get()))
S->setLinkage(GlobalValue::AvailableExternallyLinkage);
if (S->linkage() != OriginalLinkage)
recordNewLinkage(S->modulePath(), GUID, S->linkage());
@@ -190,13 +278,14 @@ void llvm::thinLTOResolveWeakForLinkerInIndex(
// when needed.
DenseSet<GlobalValueSummary *> GlobalInvolvedWithAlias;
for (auto &I : Index)
- for (auto &S : I.second)
+ for (auto &S : I.second.SummaryList)
if (auto AS = dyn_cast<AliasSummary>(S.get()))
GlobalInvolvedWithAlias.insert(&AS->getAliasee());
for (auto &I : Index)
- thinLTOResolveWeakForLinkerGUID(I.second, I.first, GlobalInvolvedWithAlias,
- isPrevailing, recordNewLinkage);
+ thinLTOResolveWeakForLinkerGUID(I.second.SummaryList, I.first,
+ GlobalInvolvedWithAlias, isPrevailing,
+ recordNewLinkage);
}
static void thinLTOInternalizeAndPromoteGUID(
@@ -217,92 +306,42 @@ void llvm::thinLTOInternalizeAndPromoteInIndex(
ModuleSummaryIndex &Index,
function_ref<bool(StringRef, GlobalValue::GUID)> isExported) {
for (auto &I : Index)
- thinLTOInternalizeAndPromoteGUID(I.second, I.first, isExported);
+ thinLTOInternalizeAndPromoteGUID(I.second.SummaryList, I.first, isExported);
}
-struct InputFile::InputModule {
- BitcodeModule BM;
- std::unique_ptr<Module> Mod;
-
- // The range of ModuleSymbolTable entries for this input module.
- size_t SymBegin, SymEnd;
-};
-
// Requires a destructor for std::vector<BitcodeModule>.
InputFile::~InputFile() = default;
Expected<std::unique_ptr<InputFile>> InputFile::create(MemoryBufferRef Object) {
std::unique_ptr<InputFile> File(new InputFile);
- ErrorOr<MemoryBufferRef> BCOrErr =
- IRObjectFile::findBitcodeInMemBuffer(Object);
- if (!BCOrErr)
- return errorCodeToError(BCOrErr.getError());
-
- Expected<std::vector<BitcodeModule>> BMsOrErr =
- getBitcodeModuleList(*BCOrErr);
- if (!BMsOrErr)
- return BMsOrErr.takeError();
-
- if (BMsOrErr->empty())
- return make_error<StringError>("Bitcode file does not contain any modules",
- inconvertibleErrorCode());
-
- // Create an InputModule for each module in the InputFile, and add it to the
- // ModuleSymbolTable.
- for (auto BM : *BMsOrErr) {
- Expected<std::unique_ptr<Module>> MOrErr =
- BM.getLazyModule(File->Ctx, /*ShouldLazyLoadMetadata*/ true,
- /*IsImporting*/ false);
- if (!MOrErr)
- return MOrErr.takeError();
-
- size_t SymBegin = File->SymTab.symbols().size();
- File->SymTab.addModule(MOrErr->get());
- size_t SymEnd = File->SymTab.symbols().size();
-
- for (const auto &C : (*MOrErr)->getComdatSymbolTable()) {
- auto P = File->ComdatMap.insert(
- std::make_pair(&C.second, File->Comdats.size()));
- assert(P.second);
- (void)P;
- File->Comdats.push_back(C.first());
- }
-
- File->Mods.push_back({BM, std::move(*MOrErr), SymBegin, SymEnd});
+ Expected<IRSymtabFile> FOrErr = readIRSymtab(Object);
+ if (!FOrErr)
+ return FOrErr.takeError();
+
+ File->TargetTriple = FOrErr->TheReader.getTargetTriple();
+ File->SourceFileName = FOrErr->TheReader.getSourceFileName();
+ File->COFFLinkerOpts = FOrErr->TheReader.getCOFFLinkerOpts();
+ File->ComdatTable = FOrErr->TheReader.getComdatTable();
+
+ for (unsigned I = 0; I != FOrErr->Mods.size(); ++I) {
+ size_t Begin = File->Symbols.size();
+ for (const irsymtab::Reader::SymbolRef &Sym :
+ FOrErr->TheReader.module_symbols(I))
+ // Skip symbols that are irrelevant to LTO. Note that this condition needs
+ // to match the one in Skip() in LTO::addRegularLTO().
+ if (Sym.isGlobal() && !Sym.isFormatSpecific())
+ File->Symbols.push_back(Sym);
+ File->ModuleSymIndices.push_back({Begin, File->Symbols.size()});
}
+ File->Mods = FOrErr->Mods;
+ File->Strtab = std::move(FOrErr->Strtab);
return std::move(File);
}
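The loop above records each module's symbols as a [Begin, End) range into one flat Symbols vector, which is what module_symbols() later hands out as a view. A standalone sketch of the pattern (hypothetical FlatSymtab type; the empty-name check stands in for the real LTO-relevance filter):

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Hypothetical flat symbol table shared by all modules of one input file.
struct FlatSymtab {
  std::vector<std::string> Symbols;
  std::vector<std::pair<size_t, size_t>> ModuleSymIndices; // [Begin, End)

  void addModule(const std::vector<std::string> &ModuleSyms) {
    size_t Begin = Symbols.size();
    for (const std::string &S : ModuleSyms)
      if (!S.empty()) // stands in for the LTO-relevance filter above
        Symbols.push_back(S);
    ModuleSymIndices.push_back({Begin, Symbols.size()});
  }

  // Analogue of InputFile::module_symbols(I): a view into the flat array.
  std::pair<const std::string *, const std::string *>
  moduleSymbols(size_t I) const {
    auto R = ModuleSymIndices[I];
    return {Symbols.data() + R.first, Symbols.data() + R.second};
  }
};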
-Expected<int> InputFile::Symbol::getComdatIndex() const {
- if (!isGV())
- return -1;
- const GlobalObject *GO = getGV()->getBaseObject();
- if (!GO)
- return make_error<StringError>("Unable to determine comdat of alias!",
- inconvertibleErrorCode());
- if (const Comdat *C = GO->getComdat()) {
- auto I = File->ComdatMap.find(C);
- assert(I != File->ComdatMap.end());
- return I->second;
- }
- return -1;
-}
-
StringRef InputFile::getName() const {
- return Mods[0].BM.getModuleIdentifier();
-}
-
-StringRef InputFile::getSourceFileName() const {
- return Mods[0].Mod->getSourceFileName();
-}
-
-iterator_range<InputFile::symbol_iterator>
-InputFile::module_symbols(InputModule &IM) {
- return llvm::make_range(
- symbol_iterator(SymTab.symbols().data() + IM.SymBegin, SymTab, this),
- symbol_iterator(SymTab.symbols().data() + IM.SymEnd, SymTab, this));
+ return Mods[0].getModuleIdentifier();
}
LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
@@ -325,33 +364,40 @@ LTO::LTO(Config Conf, ThinBackend Backend,
// Requires a destructor for MapVector<BitcodeModule>.
LTO::~LTO() = default;
-// Add the given symbol to the GlobalResolutions map, and resolve its partition.
-void LTO::addSymbolToGlobalRes(SmallPtrSet<GlobalValue *, 8> &Used,
- const InputFile::Symbol &Sym,
- SymbolResolution Res, unsigned Partition) {
- GlobalValue *GV = Sym.isGV() ? Sym.getGV() : nullptr;
+// Add the symbols in the given module to the GlobalResolutions map, and resolve
+// their partitions.
+void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
+ ArrayRef<SymbolResolution> Res,
+ unsigned Partition, bool InSummary) {
+ auto *ResI = Res.begin();
+ auto *ResE = Res.end();
+ (void)ResE;
+ for (const InputFile::Symbol &Sym : Syms) {
+ assert(ResI != ResE);
+ SymbolResolution Res = *ResI++;
- auto &GlobalRes = GlobalResolutions[Sym.getName()];
- if (GV) {
- GlobalRes.UnnamedAddr &= GV->hasGlobalUnnamedAddr();
+ auto &GlobalRes = GlobalResolutions[Sym.getName()];
+ GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr();
if (Res.Prevailing)
- GlobalRes.IRName = GV->getName();
+ GlobalRes.IRName = Sym.getIRName();
+
+    // Set the partition to external if we know it is re-defined by the
+    // linker with the -defsym or -wrap options, or is otherwise used
+    // elsewhere, e.g. it is visible to a regular object, is referenced from
+    // llvm.compiler.used, or was already recorded as being referenced from a
+    // different partition.
+ if (Res.LinkerRedefined || Res.VisibleToRegularObj || Sym.isUsed() ||
+ (GlobalRes.Partition != GlobalResolution::Unknown &&
+ GlobalRes.Partition != Partition)) {
+ GlobalRes.Partition = GlobalResolution::External;
+ } else
+ // First recorded reference, save the current partition.
+ GlobalRes.Partition = Partition;
+
+ // Flag as visible outside of summary if visible from a regular object or
+ // from a module that does not have a summary.
+ GlobalRes.VisibleOutsideSummary |=
+ (Res.VisibleToRegularObj || Sym.isUsed() || !InSummary);
}
- // Set the partition to external if we know it is used elsewhere, e.g.
- // it is visible to a regular object, is referenced from llvm.compiler_used,
- // or was already recorded as being referenced from a different partition.
- if (Res.VisibleToRegularObj || (GV && Used.count(GV)) ||
- (GlobalRes.Partition != GlobalResolution::Unknown &&
- GlobalRes.Partition != Partition)) {
- GlobalRes.Partition = GlobalResolution::External;
- } else
- // First recorded reference, save the current partition.
- GlobalRes.Partition = Partition;
-
- // Flag as visible outside of ThinLTO if visible from a regular object or
- // if this is a reference in the regular LTO partition.
- GlobalRes.VisibleOutsideThinLTO |=
- (Res.VisibleToRegularObj || (Partition == GlobalResolution::RegularLTO));
}
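The partition rules above reduce to a small decision function. A compact restatement, with hypothetical constants mirroring GlobalResolution::Unknown and GlobalResolution::External:

#include <cstdint>

// Hypothetical partition constants mirroring GlobalResolution::Unknown and
// GlobalResolution::External; partition 0 is the regular LTO partition.
const uint64_t Unknown = ~0ull;
const uint64_t External = ~0ull - 1;

// A symbol redefined by the linker, visible to a regular object, referenced
// from llvm.compiler.used, or already seen in a different partition must be
// treated as external; otherwise the first recorded partition sticks.
uint64_t resolvePartition(uint64_t Current, uint64_t New, bool LinkerRedefined,
                          bool VisibleToRegularObj, bool Used) {
  if (LinkerRedefined || VisibleToRegularObj || Used ||
      (Current != Unknown && Current != New))
    return External;
  return New;
}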
static void writeToResolutionFile(raw_ostream &OS, InputFile *Input,
@@ -370,8 +416,11 @@ static void writeToResolutionFile(raw_ostream &OS, InputFile *Input,
OS << 'l';
if (Res.VisibleToRegularObj)
OS << 'x';
+ if (Res.LinkerRedefined)
+ OS << 'r';
OS << '\n';
}
+ OS.flush();
assert(ResI == Res.end());
}
@@ -383,102 +432,174 @@ Error LTO::add(std::unique_ptr<InputFile> Input,
writeToResolutionFile(*Conf.ResolutionFile, Input.get(), Res);
const SymbolResolution *ResI = Res.begin();
- for (InputFile::InputModule &IM : Input->Mods)
- if (Error Err = addModule(*Input, IM, ResI, Res.end()))
+ for (unsigned I = 0; I != Input->Mods.size(); ++I)
+ if (Error Err = addModule(*Input, I, ResI, Res.end()))
return Err;
assert(ResI == Res.end());
return Error::success();
}
-Error LTO::addModule(InputFile &Input, InputFile::InputModule &IM,
+Error LTO::addModule(InputFile &Input, unsigned ModI,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
- // FIXME: move to backend
- Module &M = *IM.Mod;
-
- if (M.getDataLayoutStr().empty())
- return make_error<StringError>("input module has no datalayout",
- inconvertibleErrorCode());
-
- if (!Conf.OverrideTriple.empty())
- M.setTargetTriple(Conf.OverrideTriple);
- else if (M.getTargetTriple().empty())
- M.setTargetTriple(Conf.DefaultTriple);
-
- Expected<bool> HasThinLTOSummary = IM.BM.hasSummary();
- if (!HasThinLTOSummary)
- return HasThinLTOSummary.takeError();
+ Expected<BitcodeLTOInfo> LTOInfo = Input.Mods[ModI].getLTOInfo();
+ if (!LTOInfo)
+ return LTOInfo.takeError();
+
+ BitcodeModule BM = Input.Mods[ModI];
+ auto ModSyms = Input.module_symbols(ModI);
+ addModuleToGlobalRes(ModSyms, {ResI, ResE},
+ LTOInfo->IsThinLTO ? ThinLTO.ModuleMap.size() + 1 : 0,
+ LTOInfo->HasSummary);
+
+ if (LTOInfo->IsThinLTO)
+ return addThinLTO(BM, ModSyms, ResI, ResE);
+
+ Expected<RegularLTOState::AddedModule> ModOrErr =
+ addRegularLTO(BM, ModSyms, ResI, ResE);
+ if (!ModOrErr)
+ return ModOrErr.takeError();
+
+ if (!LTOInfo->HasSummary)
+ return linkRegularLTO(std::move(*ModOrErr), /*LivenessFromIndex=*/false);
+
+ // Regular LTO module summaries are added to a dummy module that represents
+ // the combined regular LTO module.
+ if (Error Err = BM.readSummary(ThinLTO.CombinedIndex, "", -1ull))
+ return Err;
+ RegularLTO.ModsWithSummaries.push_back(std::move(*ModOrErr));
+ return Error::success();
+}
- if (*HasThinLTOSummary)
- return addThinLTO(IM.BM, M, Input.module_symbols(IM), ResI, ResE);
- else
- return addRegularLTO(IM.BM, ResI, ResE);
+// Checks whether the given global value is in a non-prevailing comdat (a
+// comdat containing values the linker indicated were not prevailing, which
+// we then dropped to available_externally), and if so, removes it from the
+// comdat. This is called for all global values so that the comdat is emptied
+// rather than left incomplete. It is needed for regular LTO modules in case
+// of a mixed-mode LTO compilation (both regular and ThinLTO modules). Since
+// the regular LTO module will be linked first in the final native link, we
+// want to make sure the linker doesn't select any of these incomplete
+// comdats that would otherwise be left in the regular LTO module.
+static void
+handleNonPrevailingComdat(GlobalValue &GV,
+ std::set<const Comdat *> &NonPrevailingComdats) {
+ Comdat *C = GV.getComdat();
+ if (!C)
+ return;
+
+ if (!NonPrevailingComdats.count(C))
+ return;
+
+  // We also need to drop externally visible global values in the comdat to
+  // available_externally, so that there are no multiply-defined linker
+  // errors.
+ if (!GV.hasLocalLinkage())
+ GV.setLinkage(GlobalValue::AvailableExternallyLinkage);
+
+ if (auto GO = dyn_cast<GlobalObject>(&GV))
+ GO->setComdat(nullptr);
}
// Add a regular LTO object to the link.
-Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
- const SymbolResolution *ResE) {
- if (!RegularLTO.CombinedModule) {
- RegularLTO.CombinedModule =
- llvm::make_unique<Module>("ld-temp.o", RegularLTO.Ctx);
- RegularLTO.Mover = llvm::make_unique<IRMover>(*RegularLTO.CombinedModule);
- }
+// The resulting module needs to be linked into the combined LTO module with
+// linkRegularLTO.
+Expected<LTO::RegularLTOState::AddedModule>
+LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
+ const SymbolResolution *&ResI,
+ const SymbolResolution *ResE) {
+ RegularLTOState::AddedModule Mod;
Expected<std::unique_ptr<Module>> MOrErr =
BM.getLazyModule(RegularLTO.Ctx, /*ShouldLazyLoadMetadata*/ true,
/*IsImporting*/ false);
if (!MOrErr)
return MOrErr.takeError();
-
Module &M = **MOrErr;
+ Mod.M = std::move(*MOrErr);
+
if (Error Err = M.materializeMetadata())
- return Err;
+ return std::move(Err);
UpgradeDebugInfo(M);
ModuleSymbolTable SymTab;
SymTab.addModule(&M);
- SmallPtrSet<GlobalValue *, 8> Used;
- collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
-
- std::vector<GlobalValue *> Keep;
-
for (GlobalVariable &GV : M.globals())
if (GV.hasAppendingLinkage())
- Keep.push_back(&GV);
+ Mod.Keep.push_back(&GV);
+
+ DenseSet<GlobalObject *> AliasedGlobals;
+ for (auto &GA : M.aliases())
+ if (GlobalObject *GO = GA.getBaseObject())
+ AliasedGlobals.insert(GO);
+
+ // In this function we need IR GlobalValues matching the symbols in Syms
+ // (which is not backed by a module), so we need to enumerate them in the same
+ // order. The symbol enumeration order of a ModuleSymbolTable intentionally
+ // matches the order of an irsymtab, but when we read the irsymtab in
+ // InputFile::create we omit some symbols that are irrelevant to LTO. The
+ // Skip() function skips the same symbols from the module as InputFile does
+ // from the symbol table.
+ auto MsymI = SymTab.symbols().begin(), MsymE = SymTab.symbols().end();
+ auto Skip = [&]() {
+ while (MsymI != MsymE) {
+ auto Flags = SymTab.getSymbolFlags(*MsymI);
+ if ((Flags & object::BasicSymbolRef::SF_Global) &&
+ !(Flags & object::BasicSymbolRef::SF_FormatSpecific))
+ return;
+ ++MsymI;
+ }
+ };
+ Skip();
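The Skip() idiom above keeps two differently-filtered enumerations of the same symbols in lockstep. A standalone sketch with hypothetical MSym flags standing in for object::BasicSymbolRef's SF_Global and SF_FormatSpecific:

#include <cassert>
#include <string>
#include <vector>

// Hypothetical flags mirroring SF_Global and SF_FormatSpecific.
struct MSym {
  std::string Name;
  bool Global;
  bool FormatSpecific;
};

// Walk the full module symbol list in lockstep with a pre-filtered list,
// advancing past entries the filter would have dropped (the Skip() idiom).
void walkInLockstep(const std::vector<MSym> &All,
                    const std::vector<std::string> &Filtered) {
  auto I = All.begin(), E = All.end();
  auto Skip = [&] {
    while (I != E && !(I->Global && !I->FormatSpecific))
      ++I;
  };
  Skip();
  for (const std::string &Name : Filtered) {
    (void)Name;
    assert(I != E && I->Name == Name && "enumeration orders must match");
    ++I;
    Skip();
  }
  assert(I == E && "filtered list must cover all relevant symbols");
}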
- for (const InputFile::Symbol &Sym :
- make_range(InputFile::symbol_iterator(SymTab.symbols().begin(), SymTab,
- nullptr),
- InputFile::symbol_iterator(SymTab.symbols().end(), SymTab,
- nullptr))) {
+ std::set<const Comdat *> NonPrevailingComdats;
+ for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
- addSymbolToGlobalRes(Used, Sym, Res, 0);
- if (Sym.getFlags() & object::BasicSymbolRef::SF_Undefined)
- continue;
- if (Res.Prevailing && Sym.isGV()) {
- GlobalValue *GV = Sym.getGV();
- Keep.push_back(GV);
- switch (GV->getLinkage()) {
- default:
- break;
- case GlobalValue::LinkOnceAnyLinkage:
- GV->setLinkage(GlobalValue::WeakAnyLinkage);
- break;
- case GlobalValue::LinkOnceODRLinkage:
- GV->setLinkage(GlobalValue::WeakODRLinkage);
- break;
+ assert(MsymI != MsymE);
+ ModuleSymbolTable::Symbol Msym = *MsymI++;
+ Skip();
+
+ if (GlobalValue *GV = Msym.dyn_cast<GlobalValue *>()) {
+ if (Res.Prevailing) {
+ if (Sym.isUndefined())
+ continue;
+ Mod.Keep.push_back(GV);
+ // For symbols re-defined with linker -wrap and -defsym options,
+ // set the linkage to weak to inhibit IPO. The linkage will be
+ // restored by the linker.
+ if (Res.LinkerRedefined)
+ GV->setLinkage(GlobalValue::WeakAnyLinkage);
+
+ GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage();
+ if (GlobalValue::isLinkOnceLinkage(OriginalLinkage))
+ GV->setLinkage(GlobalValue::getWeakLinkage(
+ GlobalValue::isLinkOnceODRLinkage(OriginalLinkage)));
+ } else if (isa<GlobalObject>(GV) &&
+ (GV->hasLinkOnceODRLinkage() || GV->hasWeakODRLinkage() ||
+ GV->hasAvailableExternallyLinkage()) &&
+ !AliasedGlobals.count(cast<GlobalObject>(GV))) {
+ // Any of the above three types of linkage indicates that the
+ // chosen prevailing symbol will have the same semantics as this copy of
+ // the symbol, so we may be able to link it with available_externally
+ // linkage. We will decide later whether to do that when we link this
+ // module (in linkRegularLTO), based on whether it is undefined.
+ Mod.Keep.push_back(GV);
+ GV->setLinkage(GlobalValue::AvailableExternallyLinkage);
+ if (GV->hasComdat())
+ NonPrevailingComdats.insert(GV->getComdat());
+ cast<GlobalObject>(GV)->setComdat(nullptr);
}
}
// Common resolution: collect the maximum size/alignment over all commons.
// We also record if we see an instance of a common as prevailing, so that
// if none is prevailing we can ignore it later.
- if (Sym.getFlags() & object::BasicSymbolRef::SF_Common) {
+ if (Sym.isCommon()) {
// FIXME: We should figure out what to do about commons defined by asm.
// For now they aren't reported correctly by ModuleSymbolTable.
- auto &CommonRes = RegularLTO.Commons[Sym.getGV()->getName()];
+ auto &CommonRes = RegularLTO.Commons[Sym.getIRName()];
CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize());
CommonRes.Align = std::max(CommonRes.Align, Sym.getCommonAlignment());
CommonRes.Prevailing |= Res.Prevailing;
@@ -486,37 +607,75 @@ Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
// FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit.
}
+ if (!M.getComdatSymbolTable().empty())
+ for (GlobalValue &GV : M.global_values())
+ handleNonPrevailingComdat(GV, NonPrevailingComdats);
+ assert(MsymI == MsymE);
+ return std::move(Mod);
+}
+
+Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
+ bool LivenessFromIndex) {
+ if (!RegularLTO.CombinedModule) {
+ RegularLTO.CombinedModule =
+ llvm::make_unique<Module>("ld-temp.o", RegularLTO.Ctx);
+ RegularLTO.Mover = llvm::make_unique<IRMover>(*RegularLTO.CombinedModule);
+ }
+
+ std::vector<GlobalValue *> Keep;
+ for (GlobalValue *GV : Mod.Keep) {
+ if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID()))
+ continue;
- return RegularLTO.Mover->move(std::move(*MOrErr), Keep,
+ if (!GV->hasAvailableExternallyLinkage()) {
+ Keep.push_back(GV);
+ continue;
+ }
+
+ // Only link available_externally definitions if we don't already have a
+ // definition.
+ GlobalValue *CombinedGV =
+ RegularLTO.CombinedModule->getNamedValue(GV->getName());
+ if (CombinedGV && !CombinedGV->isDeclaration())
+ continue;
+
+ Keep.push_back(GV);
+ }
+
+ return RegularLTO.Mover->move(std::move(Mod.M), Keep,
[](GlobalValue &, IRMover::ValueAdder) {},
- /* LinkModuleInlineAsm */ true,
/* IsPerformingImport */ false);
}
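linkRegularLTO's filter above can be restated in isolation: strong definitions are always linked, while an available_externally copy is linked only when the combined module has no definition of that name yet. A sketch under those assumptions (hypothetical Def type):

#include <map>
#include <string>
#include <vector>

// Hypothetical stand-in for a global value selected by addRegularLTO.
struct Def {
  std::string Name;
  bool AvailableExternally;
};

// Keep every strong definition; keep an available_externally copy only if
// the combined module does not already define that name.
std::vector<Def> selectToLink(const std::vector<Def> &Keep,
                              std::map<std::string, bool> &CombinedDefs) {
  std::vector<Def> Out;
  for (const Def &D : Keep) {
    auto It = CombinedDefs.find(D.Name);
    if (D.AvailableExternally && It != CombinedDefs.end() && It->second)
      continue; // already have a definition; skip this copy
    CombinedDefs[D.Name] = true;
    Out.push_back(D);
  }
  return Out;
}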
-// Add a ThinLTO object to the link.
-// FIXME: This function should not need to take as many parameters once we have
-// a bitcode symbol table.
-Error LTO::addThinLTO(BitcodeModule BM, Module &M,
- iterator_range<InputFile::symbol_iterator> Syms,
+// Add a ThinLTO module to the link.
+Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
const SymbolResolution *&ResI,
const SymbolResolution *ResE) {
- SmallPtrSet<GlobalValue *, 8> Used;
- collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
-
- Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr = BM.getSummary();
- if (!SummaryOrErr)
- return SummaryOrErr.takeError();
- ThinLTO.CombinedIndex.mergeFrom(std::move(*SummaryOrErr),
- ThinLTO.ModuleMap.size());
+ if (Error Err =
+ BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
+ ThinLTO.ModuleMap.size()))
+ return Err;
for (const InputFile::Symbol &Sym : Syms) {
assert(ResI != ResE);
SymbolResolution Res = *ResI++;
- addSymbolToGlobalRes(Used, Sym, Res, ThinLTO.ModuleMap.size() + 1);
- if (Res.Prevailing && Sym.isGV())
- ThinLTO.PrevailingModuleForGUID[Sym.getGV()->getGUID()] =
- BM.getModuleIdentifier();
+ if (Res.Prevailing) {
+ if (!Sym.getIRName().empty()) {
+ auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+ Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+ ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+
+      // For symbols redefined via the linker's --wrap or --defsym options,
+      // we want to switch the linkage to `weak` to inhibit IPO. Find the
+      // summary for this specific GV in its module and record the new
+      // linkage so that we can switch it when we import the GV.
+ if (Res.LinkerRedefined)
+ if (auto S = ThinLTO.CombinedIndex.findSummaryInModule(
+ GUID, BM.getModuleIdentifier()))
+ S->setLinkage(GlobalValue::WeakAnyLinkage);
+ }
+ }
}
if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second)
@@ -533,10 +692,24 @@ unsigned LTO::getMaxTasks() const {
}
Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
+ // Compute "dead" symbols, we don't want to import/export these!
+ DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
+ for (auto &Res : GlobalResolutions) {
+ if (Res.second.VisibleOutsideSummary &&
+ // IRName will be defined if we have seen the prevailing copy of
+ // this value. If not, no need to preserve any ThinLTO copies.
+ !Res.second.IRName.empty())
+ GUIDPreservedSymbols.insert(GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
+ }
+
+ computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols);
+
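computeDeadSymbols itself is not shown here; conceptually it treats liveness as reachability over the summary reference graph, seeded by the preserved GUIDs collected above. A minimal sketch of that interpretation (an assumption about the approach, not the actual implementation), using plain GUID-to-references maps:

#include <cstdint>
#include <map>
#include <set>
#include <vector>

// Liveness as reachability over the summary reference graph, seeded by the
// preserved GUIDs: everything not reached is "dead" and is neither imported
// nor exported.
std::set<uint64_t>
computeLiveSet(const std::map<uint64_t, std::vector<uint64_t>> &Refs,
               const std::set<uint64_t> &PreservedRoots) {
  std::set<uint64_t> Live;
  std::vector<uint64_t> Work(PreservedRoots.begin(), PreservedRoots.end());
  while (!Work.empty()) {
    uint64_t GUID = Work.back();
    Work.pop_back();
    if (!Live.insert(GUID).second)
      continue; // already visited
    auto It = Refs.find(GUID);
    if (It != Refs.end())
      for (uint64_t Ref : It->second)
        Work.push_back(Ref);
  }
  return Live;
}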
// Save the status of having a regularLTO combined module, as
// this is needed for generating the ThinLTO Task ID, and
// the CombinedModule will be moved at the end of runRegularLTO.
- bool HasRegularLTO = RegularLTO.CombinedModule != nullptr;
+ bool HasRegularLTO = RegularLTO.CombinedModule != nullptr ||
+ !RegularLTO.ModsWithSummaries.empty();
// Invoke regular LTO if there was a regular LTO module to start with.
if (HasRegularLTO)
if (auto E = runRegularLTO(AddStream))
@@ -545,6 +718,11 @@ Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
}
Error LTO::runRegularLTO(AddStreamFn AddStream) {
+ for (auto &M : RegularLTO.ModsWithSummaries)
+ if (Error Err = linkRegularLTO(std::move(M),
+ /*LivenessFromIndex=*/true))
+ return Err;
+
// Make sure commons have the right size/alignment: we kept the largest from
// all the prevailing when adding the inputs, and we apply it here.
const DataLayout &DL = RegularLTO.CombinedModule->getDataLayout();
@@ -602,7 +780,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
return Error::success();
}
return backend(Conf, AddStream, RegularLTO.ParallelCodeGenParallelismLevel,
- std::move(RegularLTO.CombinedModule));
+ std::move(RegularLTO.CombinedModule), ThinLTO.CombinedIndex);
}
/// This class defines the interface to the ThinLTO backend.
@@ -633,6 +811,7 @@ class InProcessThinBackend : public ThinBackendProc {
ThreadPool BackendThreadPool;
AddStreamFn AddStream;
NativeObjectCache Cache;
+ TypeIdSummariesByGuidTy TypeIdSummariesByGuid;
Optional<Error> Err;
std::mutex ErrMu;
@@ -645,7 +824,14 @@ public:
AddStreamFn AddStream, NativeObjectCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
BackendThreadPool(ThinLTOParallelismLevel),
- AddStream(std::move(AddStream)), Cache(std::move(Cache)) {}
+ AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+ // Create a mapping from type identifier GUIDs to type identifier summaries.
+ // This allows backends to use the type identifier GUIDs stored in the
+ // function summaries to determine which type identifier summaries affect
+ // each function without needing to compute GUIDs in each backend.
+ for (auto &TId : CombinedIndex.typeIds())
+ TypeIdSummariesByGuid[GlobalValue::getGUID(TId.first)].push_back(&TId);
+ }
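A standalone sketch of the index the constructor builds, with std::hash standing in for the MD5-based GlobalValue::getGUID and a hypothetical TypeIdEntry type:

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical type identifier entry; the resolution data is elided.
struct TypeIdEntry {
  std::string Name;
};

// std::hash stands in for the MD5-based GlobalValue::getGUID; any stable
// 64-bit hash works for the sketch. Building the map once lets each backend
// thread look up resolutions without recomputing string hashes.
std::unordered_map<uint64_t, std::vector<const TypeIdEntry *>>
indexTypeIds(const std::vector<TypeIdEntry> &TypeIds) {
  std::unordered_map<uint64_t, std::vector<const TypeIdEntry *>> ByGuid;
  for (const TypeIdEntry &TId : TypeIds)
    ByGuid[std::hash<std::string>{}(TId.Name)].push_back(&TId);
  return ByGuid;
}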
Error runThinLTOBackendThread(
AddStreamFn AddStream, NativeObjectCache Cache, unsigned Task,
@@ -654,7 +840,8 @@ public:
const FunctionImporter::ExportSetTy &ExportList,
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
- MapVector<StringRef, BitcodeModule> &ModuleMap) {
+ MapVector<StringRef, BitcodeModule> &ModuleMap,
+ const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
auto RunThinBackend = [&](AddStreamFn AddStream) {
LTOLLVMContext BackendContext(Conf);
Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
@@ -677,7 +864,7 @@ public:
SmallString<40> Key;
// The module may be cached; compute the key that identifies it.
computeCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, ExportList,
- ResolvedODR, DefinedGlobals);
+ ResolvedODR, DefinedGlobals, TypeIdSummariesByGuid);
if (AddStreamFn CacheAddStream = Cache(Task, Key))
return RunThinBackend(CacheAddStream);
@@ -701,10 +888,11 @@ public:
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>
&ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
- MapVector<StringRef, BitcodeModule> &ModuleMap) {
+ MapVector<StringRef, BitcodeModule> &ModuleMap,
+ const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
Error E = runThinLTOBackendThread(
- AddStream, Cache, Task, BM, CombinedIndex, ImportList,
- ExportList, ResolvedODR, DefinedGlobals, ModuleMap);
+ AddStream, Cache, Task, BM, CombinedIndex, ImportList, ExportList,
+ ResolvedODR, DefinedGlobals, ModuleMap, TypeIdSummariesByGuid);
if (E) {
std::unique_lock<std::mutex> L(ErrMu);
if (Err)
@@ -713,9 +901,9 @@ public:
Err = std::move(E);
}
},
- BM, std::ref(CombinedIndex), std::ref(ImportList),
- std::ref(ExportList), std::ref(ResolvedODR), std::ref(DefinedGlobals),
- std::ref(ModuleMap));
+ BM, std::ref(CombinedIndex), std::ref(ImportList), std::ref(ExportList),
+ std::ref(ResolvedODR), std::ref(DefinedGlobals), std::ref(ModuleMap),
+ std::ref(TypeIdSummariesByGuid));
return Error::success();
}
@@ -842,7 +1030,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
// Collect for each module the list of function it defines (GUID ->
// Summary).
- StringMap<std::map<GlobalValue::GUID, GlobalValueSummary *>>
+ StringMap<GVSummaryMapTy>
ModuleToDefinedGVSummaries(ThinLTO.ModuleMap.size());
ThinLTO.CombinedIndex.collectDefinedGVSummariesPerModule(
ModuleToDefinedGVSummaries);
@@ -857,19 +1045,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
if (!ModuleToDefinedGVSummaries.count(Mod.first))
ModuleToDefinedGVSummaries.try_emplace(Mod.first);
- // Compute "dead" symbols, we don't want to import/export these!
- DenseSet<GlobalValue::GUID> GUIDPreservedSymbols;
- for (auto &Res : GlobalResolutions) {
- if (Res.second.VisibleOutsideThinLTO &&
- // IRName will be defined if we have seen the prevailing copy of
- // this value. If not, no need to preserve any ThinLTO copies.
- !Res.second.IRName.empty())
- GUIDPreservedSymbols.insert(GlobalValue::getGUID(Res.second.IRName));
- }
-
- auto DeadSymbols =
- computeDeadSymbols(ThinLTO.CombinedIndex, GUIDPreservedSymbols);
-
StringMap<FunctionImporter::ImportMapTy> ImportLists(
ThinLTO.ModuleMap.size());
StringMap<FunctionImporter::ExportSetTy> ExportLists(
@@ -878,7 +1053,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
if (Conf.OptLevel > 0) {
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
- ImportLists, ExportLists, &DeadSymbols);
+ ImportLists, ExportLists);
std::set<GlobalValue::GUID> ExportedGUIDs;
for (auto &Res : GlobalResolutions) {
@@ -890,16 +1065,13 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
// partition (and we can't get the GUID).
if (Res.second.IRName.empty())
continue;
- auto GUID = GlobalValue::getGUID(Res.second.IRName);
+ auto GUID = GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
// Mark exported unless index-based analysis determined it to be dead.
- if (!DeadSymbols.count(GUID))
- ExportedGUIDs.insert(GlobalValue::getGUID(Res.second.IRName));
+ if (ThinLTO.CombinedIndex.isGUIDLive(GUID))
+ ExportedGUIDs.insert(GUID);
}
- auto isPrevailing = [&](GlobalValue::GUID GUID,
- const GlobalValueSummary *S) {
- return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
- };
auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) {
const auto &ExportList = ExportLists.find(ModuleIdentifier);
return (ExportList != ExportLists.end() &&
@@ -907,17 +1079,20 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
ExportedGUIDs.count(GUID);
};
thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported);
-
- auto recordNewLinkage = [&](StringRef ModuleIdentifier,
- GlobalValue::GUID GUID,
- GlobalValue::LinkageTypes NewLinkage) {
- ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
- };
-
- thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing,
- recordNewLinkage);
}
+ auto isPrevailing = [&](GlobalValue::GUID GUID,
+ const GlobalValueSummary *S) {
+ return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
+ };
+ auto recordNewLinkage = [&](StringRef ModuleIdentifier,
+ GlobalValue::GUID GUID,
+ GlobalValue::LinkageTypes NewLinkage) {
+ ResolvedODR[ModuleIdentifier][GUID] = NewLinkage;
+ };
+ thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing,
+ recordNewLinkage);
+
std::unique_ptr<ThinBackendProc> BackendProc =
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
AddStream, Cache);
@@ -937,3 +1112,27 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
return BackendProc->wait();
}
+
+Expected<std::unique_ptr<tool_output_file>>
+lto::setupOptimizationRemarks(LLVMContext &Context,
+ StringRef LTORemarksFilename,
+ bool LTOPassRemarksWithHotness, int Count) {
+ if (LTORemarksFilename.empty())
+ return nullptr;
+
+ std::string Filename = LTORemarksFilename;
+ if (Count != -1)
+ Filename += ".thin." + llvm::utostr(Count) + ".yaml";
+
+ std::error_code EC;
+ auto DiagnosticFile =
+ llvm::make_unique<tool_output_file>(Filename, EC, sys::fs::F_None);
+ if (EC)
+ return errorCodeToError(EC);
+ Context.setDiagnosticsOutputFile(
+ llvm::make_unique<yaml::Output>(DiagnosticFile->os()));
+ if (LTOPassRemarksWithHotness)
+ Context.setDiagnosticsHotnessRequested(true);
+ DiagnosticFile->keep();
+ return std::move(DiagnosticFile);
+}
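The per-task file naming this shared helper applies for ThinLTO backends can be isolated as follows (a sketch; remarksFileName is a hypothetical name):

#include <string>

// Count == -1 selects the plain (regular LTO) file name; ThinLTO backends
// pass their task number to get a unique per-task remarks file.
std::string remarksFileName(std::string Base, int Count) {
  if (Count != -1)
    Base += ".thin." + std::to_string(Count) + ".yaml";
  return Base;
}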
diff --git a/contrib/llvm/lib/LTO/LTOBackend.cpp b/contrib/llvm/lib/LTO/LTOBackend.cpp
index 809db80..3f72e44 100644
--- a/contrib/llvm/lib/LTO/LTOBackend.cpp
+++ b/contrib/llvm/lib/LTO/LTOBackend.cpp
@@ -25,8 +25,8 @@
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/LTO/LTO.h"
-#include "llvm/LTO/legacy/UpdateCompilerUsed.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
@@ -112,18 +112,79 @@ Error Config::addSaveTemps(std::string OutputFileName,
namespace {
std::unique_ptr<TargetMachine>
-createTargetMachine(Config &Conf, StringRef TheTriple,
- const Target *TheTarget) {
+createTargetMachine(Config &Conf, const Target *TheTarget, Module &M) {
+ StringRef TheTriple = M.getTargetTriple();
SubtargetFeatures Features;
Features.getDefaultSubtargetFeatures(Triple(TheTriple));
for (const std::string &A : Conf.MAttrs)
Features.AddFeature(A);
+ Reloc::Model RelocModel;
+ if (Conf.RelocModel)
+ RelocModel = *Conf.RelocModel;
+ else
+ RelocModel =
+ M.getPICLevel() == PICLevel::NotPIC ? Reloc::Static : Reloc::PIC_;
+
return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
- TheTriple, Conf.CPU, Features.getString(), Conf.Options, Conf.RelocModel,
+ TheTriple, Conf.CPU, Features.getString(), Conf.Options, RelocModel,
Conf.CodeModel, Conf.CGOptLevel));
}
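The relocation-model default above reduces to a small decision function; a sketch with hypothetical enums in place of Reloc::Model and PICLevel::Level:

// Hypothetical enums in place of Reloc::Model and PICLevel::Level.
enum class RelocKind { Static, PIC };
enum class PicLevel { NotPIC, Small, Big };

// An explicitly configured model wins; otherwise the module's PIC-level
// metadata decides between static and PIC relocation.
RelocKind pickRelocModel(const RelocKind *Configured, PicLevel Level) {
  if (Configured)
    return *Configured;
  return Level == PicLevel::NotPIC ? RelocKind::Static : RelocKind::PIC;
}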
+static void runNewPMPasses(Module &Mod, TargetMachine *TM, unsigned OptLevel,
+ bool IsThinLTO) {
+ PassBuilder PB(TM);
+ AAManager AA;
+
+  // Parse the default AA pipeline. Do this outside of assert() so the call
+  // is not compiled away in release (NDEBUG) builds.
+  bool ParsedAA = PB.parseAAPipeline(AA, "default");
+  (void)ParsedAA;
+  assert(ParsedAA && "Failed to parse the default AA pipeline");
+
+ LoopAnalysisManager LAM;
+ FunctionAnalysisManager FAM;
+ CGSCCAnalysisManager CGAM;
+ ModuleAnalysisManager MAM;
+
+ // Register the AA manager first so that our version is the one used.
+ FAM.registerPass([&] { return std::move(AA); });
+
+ // Register all the basic analyses with the managers.
+ PB.registerModuleAnalyses(MAM);
+ PB.registerCGSCCAnalyses(CGAM);
+ PB.registerFunctionAnalyses(FAM);
+ PB.registerLoopAnalyses(LAM);
+ PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+ ModulePassManager MPM;
+ // FIXME (davide): verify the input.
+
+ PassBuilder::OptimizationLevel OL;
+
+ switch (OptLevel) {
+ default:
+ llvm_unreachable("Invalid optimization level");
+ case 0:
+ OL = PassBuilder::O0;
+ break;
+ case 1:
+ OL = PassBuilder::O1;
+ break;
+ case 2:
+ OL = PassBuilder::O2;
+ break;
+ case 3:
+ OL = PassBuilder::O3;
+ break;
+ }
+
+ if (IsThinLTO)
+ MPM = PB.buildThinLTODefaultPipeline(OL, false /* DebugLogging */);
+ else
+ MPM = PB.buildLTODefaultPipeline(OL, false /* DebugLogging */);
+ MPM.run(Mod, MAM);
+
+ // FIXME (davide): verify the output.
+}
+
static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
std::string PipelineDesc,
std::string AAPipelineDesc,
@@ -168,13 +229,16 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
}
static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
- bool IsThinLTO) {
+ bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
legacy::PassManager passes;
passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
PassManagerBuilder PMB;
PMB.LibraryInfo = new TargetLibraryInfoImpl(Triple(TM->getTargetTriple()));
PMB.Inliner = createFunctionInliningPass();
+ PMB.ExportSummary = ExportSummary;
+ PMB.ImportSummary = ImportSummary;
// Unconditionally verify input since it is not verified before this
// point and has unknown origin.
PMB.VerifyInput = true;
@@ -191,12 +255,16 @@ static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
}
bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
- bool IsThinLTO) {
- if (Conf.OptPipeline.empty())
- runOldPMPasses(Conf, Mod, TM, IsThinLTO);
- else
+ bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ // FIXME: Plumb the combined index into the new pass manager.
+ if (!Conf.OptPipeline.empty())
runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline,
Conf.DisableVerify);
+ else if (Conf.UseNewPM)
+ runNewPMPasses(Mod, TM, Conf.OptLevel, IsThinLTO);
+ else
+ runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary);
return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod);
}
@@ -207,8 +275,7 @@ void codegen(Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
auto Stream = AddStream(Task);
legacy::PassManager CodeGenPasses;
- if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
- TargetMachine::CGFT_ObjectFile))
+ if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS, Conf.CGFileType))
report_fatal_error("Failed to setup codegen");
CodeGenPasses.run(Mod);
}
@@ -245,7 +312,7 @@ void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream,
std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());
std::unique_ptr<TargetMachine> TM =
- createTargetMachine(C, MPartInCtx->getTargetTriple(), T);
+ createTargetMachine(C, T, *MPartInCtx);
codegen(C, TM.get(), AddStream, ThreadId, *MPartInCtx);
},
@@ -276,34 +343,40 @@ Expected<const Target *> initAndLookupTarget(Config &C, Module &Mod) {
}
-static void handleAsmUndefinedRefs(Module &Mod, TargetMachine &TM) {
- // Collect the list of undefined symbols used in asm and update
- // llvm.compiler.used to prevent optimization to drop these from the output.
- StringSet<> AsmUndefinedRefs;
- ModuleSymbolTable::CollectAsmSymbols(
- Triple(Mod.getTargetTriple()), Mod.getModuleInlineAsm(),
- [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
- if (Flags & object::BasicSymbolRef::SF_Undefined)
- AsmUndefinedRefs.insert(Name);
- });
- updateCompilerUsed(Mod, TM, AsmUndefinedRefs);
+static void
+finalizeOptimizationRemarks(std::unique_ptr<tool_output_file> DiagOutputFile) {
+ // Make sure we flush the diagnostic remarks file in case the linker doesn't
+ // call the global destructors before exiting.
+ if (!DiagOutputFile)
+ return;
+ DiagOutputFile->keep();
+ DiagOutputFile->os().flush();
}
Error lto::backend(Config &C, AddStreamFn AddStream,
unsigned ParallelCodeGenParallelismLevel,
- std::unique_ptr<Module> Mod) {
+ std::unique_ptr<Module> Mod,
+ ModuleSummaryIndex &CombinedIndex) {
Expected<const Target *> TOrErr = initAndLookupTarget(C, *Mod);
if (!TOrErr)
return TOrErr.takeError();
- std::unique_ptr<TargetMachine> TM =
- createTargetMachine(C, Mod->getTargetTriple(), *TOrErr);
+ std::unique_ptr<TargetMachine> TM = createTargetMachine(C, *TOrErr, *Mod);
- handleAsmUndefinedRefs(*Mod, *TM);
+ // Setup optimization remarks.
+ auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ Mod->getContext(), C.RemarksFilename, C.RemarksWithHotness);
+ if (!DiagFileOrErr)
+ return DiagFileOrErr.takeError();
+ auto DiagnosticOutputFile = std::move(*DiagFileOrErr);
- if (!C.CodeGenOnly)
- if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false))
+ if (!C.CodeGenOnly) {
+ if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false,
+ /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr)) {
+ finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
return Error::success();
+ }
+ }
if (ParallelCodeGenParallelismLevel == 1) {
codegen(C, TM.get(), AddStream, 0, *Mod);
@@ -311,11 +384,12 @@ Error lto::backend(Config &C, AddStreamFn AddStream,
splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel,
std::move(Mod));
}
+ finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
return Error::success();
}
Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
- Module &Mod, ModuleSummaryIndex &CombinedIndex,
+ Module &Mod, const ModuleSummaryIndex &CombinedIndex,
const FunctionImporter::ImportMapTy &ImportList,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) {
@@ -323,10 +397,7 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
if (!TOrErr)
return TOrErr.takeError();
- std::unique_ptr<TargetMachine> TM =
- createTargetMachine(Conf, Mod.getTargetTriple(), *TOrErr);
-
- handleAsmUndefinedRefs(Mod, *TM);
+ std::unique_ptr<TargetMachine> TM = createTargetMachine(Conf, *TOrErr, Mod);
if (Conf.CodeGenOnly) {
codegen(Conf, TM.get(), AddStream, Task, Mod);
@@ -367,7 +438,8 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod))
return Error::success();
- if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true))
+ if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true,
+ /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex))
return Error::success();
codegen(Conf, TM.get(), AddStream, Task, Mod);
diff --git a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp
index 6af31e6..6a27556 100644
--- a/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/contrib/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
+#include "llvm/LTO/LTO.h"
#include "llvm/LTO/legacy/LTOModule.h"
#include "llvm/LTO/legacy/UpdateCompilerUsed.h"
#include "llvm/Linker/Linker.h"
@@ -140,6 +141,7 @@ void LTOCodeGenerator::initializeLTOPasses() {
initializeMemCpyOptLegacyPassPass(R);
initializeDCELegacyPassPass(R);
initializeCFGSimplifyPassPass(R);
+ initializeLateCFGSimplifyPassPass(R);
}
void LTOCodeGenerator::setAsmUndefinedRefs(LTOModule *Mod) {
@@ -493,36 +495,14 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
return;
HasVerifiedInput = true;
- if (LTOStripInvalidDebugInfo) {
- bool BrokenDebugInfo = false;
- if (verifyModule(*MergedModule, &dbgs(), &BrokenDebugInfo))
- report_fatal_error("Broken module found, compilation aborted!");
- if (BrokenDebugInfo) {
- emitWarning("Invalid debug info found, debug info will be stripped");
- StripDebugInfo(*MergedModule);
- }
- }
- if (verifyModule(*MergedModule, &dbgs()))
+ bool BrokenDebugInfo = false;
+ if (verifyModule(*MergedModule, &dbgs(),
+ LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
report_fatal_error("Broken module found, compilation aborted!");
-}
-
-bool LTOCodeGenerator::setupOptimizationRemarks() {
- if (LTORemarksFilename != "") {
- std::error_code EC;
- DiagnosticOutputFile = llvm::make_unique<tool_output_file>(
- LTORemarksFilename, EC, sys::fs::F_None);
- if (EC) {
- emitError(EC.message());
- return false;
- }
- Context.setDiagnosticsOutputFile(
- llvm::make_unique<yaml::Output>(DiagnosticOutputFile->os()));
+ if (BrokenDebugInfo) {
+ emitWarning("Invalid debug info found, debug info will be stripped");
+ StripDebugInfo(*MergedModule);
}
-
- if (LTOPassRemarksWithHotness)
- Context.setDiagnosticHotnessRequested(true);
-
- return true;
}
void LTOCodeGenerator::finishOptimizationRemarks() {
@@ -540,8 +520,13 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
if (!this->determineTarget())
return false;
- if (!setupOptimizationRemarks())
- return false;
+ auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ Context, LTORemarksFilename, LTOPassRemarksWithHotness);
+ if (!DiagFileOrErr) {
+ errs() << "Error: " << toString(DiagFileOrErr.takeError()) << "\n";
+ report_fatal_error("Can't get an output file for the remarks");
+ }
+ DiagnosticOutputFile = std::move(*DiagFileOrErr);
// We always run the verifier once on the merged module, the `DisableVerify`
// parameter only applies to subsequent verify.
@@ -567,6 +552,8 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
if (!DisableInline)
PMB.Inliner = createFunctionInliningPass();
PMB.LibraryInfo = new TargetLibraryInfoImpl(TargetTriple);
+ if (Freestanding)
+ PMB.LibraryInfo->disableAllFunctions();
PMB.OptLevel = OptLevel;
PMB.VerifyInput = !DisableVerify;
PMB.VerifyOutput = !DisableVerify;
@@ -610,6 +597,7 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
// If statistics were requested, print them out after codegen.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
finishOptimizationRemarks();
diff --git a/contrib/llvm/lib/LTO/LTOModule.cpp b/contrib/llvm/lib/LTO/LTOModule.cpp
index 89aeb80..3cc8b7d 100644
--- a/contrib/llvm/lib/LTO/LTOModule.cpp
+++ b/contrib/llvm/lib/LTO/LTOModule.cpp
@@ -14,11 +14,12 @@
#include "llvm/LTO/legacy/LTOModule.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ObjectUtils.h"
#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCExpr.h"
@@ -76,14 +77,12 @@ bool LTOModule::isBitcodeFile(StringRef Path) {
}
bool LTOModule::isThinLTO() {
- // Right now the detection is only based on the summary presence. We may want
- // to add a dedicated flag at some point.
- Expected<bool> Result = hasGlobalValueSummary(MBRef);
+ Expected<BitcodeLTOInfo> Result = getBitcodeLTOInfo(MBRef);
if (!Result) {
logAllUnhandledErrors(Result.takeError(), errs(), "");
return false;
}
- return *Result;
+ return Result->IsThinLTO;
}
bool LTOModule::isBitcodeForTarget(MemoryBuffer *Buffer,
@@ -636,10 +635,10 @@ void LTOModule::parseMetadata() {
raw_string_ostream OS(LinkerOpts);
// Linker Options
- if (Metadata *Val = getModule().getModuleFlag("Linker Options")) {
- MDNode *LinkerOptions = cast<MDNode>(Val);
+ if (NamedMDNode *LinkerOptions =
+ getModule().getNamedMetadata("llvm.linker.options")) {
for (unsigned i = 0, e = LinkerOptions->getNumOperands(); i != e; ++i) {
- MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
+ MDNode *MDOptions = LinkerOptions->getOperand(i);
for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) {
MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii));
OS << " " << MDOption->getString();
@@ -647,11 +646,15 @@ void LTOModule::parseMetadata() {
}
}
- // Globals
+ // Globals - we only need to do this for COFF.
+ const Triple TT(_target->getTargetTriple());
+ if (!TT.isOSBinFormatCOFF())
+ return;
+ Mangler M;
for (const NameAndAttributes &Sym : _symbols) {
if (!Sym.symbol)
continue;
- _target->getObjFileLowering()->emitLinkerFlagsForGlobal(OS, Sym.symbol);
+ emitLinkerFlagsForGlobalCOFF(OS, Sym.symbol, TT, M);
}
// Add other interesting metadata here.
diff --git a/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 40537e4..1efd481 100644
--- a/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/contrib/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -14,10 +14,6 @@
#include "llvm/LTO/legacy/ThinLTOCodeGenerator.h"
-#ifdef HAVE_LLVM_REVISION
-#include "LLVMLTORevision.h"
-#endif
-
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
@@ -28,16 +24,16 @@
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Bitcode/BitcodeWriterPass.h"
#include "llvm/ExecutionEngine/ObjectMemoryBuffer.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/LTO/LTO.h"
-#include "llvm/Linker/Linker.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Object/IRObjectFile.h"
-#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
#include "llvm/Support/CachePruning.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Error.h"
@@ -47,6 +43,7 @@
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/VCSRevision.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
@@ -66,6 +63,7 @@ namespace llvm {
extern cl::opt<bool> LTODiscardValueNames;
extern cl::opt<std::string> LTORemarksFilename;
extern cl::opt<bool> LTOPassRemarksWithHotness;
+extern cl::opt<bool> LTOStripInvalidDebugInfo;
}
namespace {
@@ -73,27 +71,6 @@ namespace {
static cl::opt<int>
ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
-Expected<std::unique_ptr<tool_output_file>>
-setupOptimizationRemarks(LLVMContext &Ctx, int Count) {
- if (LTOPassRemarksWithHotness)
- Ctx.setDiagnosticHotnessRequested(true);
-
- if (LTORemarksFilename.empty())
- return nullptr;
-
- std::string FileName =
- LTORemarksFilename + ".thin." + llvm::utostr(Count) + ".yaml";
- std::error_code EC;
- auto DiagnosticOutputFile =
- llvm::make_unique<tool_output_file>(FileName, EC, sys::fs::F_None);
- if (EC)
- return errorCodeToError(EC);
- Ctx.setDiagnosticsOutputFile(
- llvm::make_unique<yaml::Output>(DiagnosticOutputFile->os()));
- DiagnosticOutputFile->keep();
- return std::move(DiagnosticOutputFile);
-}
-
// Simple helper to save temporary files for debug.
static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
unsigned count, StringRef Suffix) {
@@ -144,8 +121,9 @@ static void computePrevailingCopies(
};
for (auto &I : Index) {
- if (HasMultipleCopies(I.second))
- PrevailingCopy[I.first] = getFirstDefinitionForLinker(I.second);
+ if (HasMultipleCopies(I.second.SummaryList))
+ PrevailingCopy[I.first] =
+ getFirstDefinitionForLinker(I.second.SummaryList);
}
}
@@ -166,6 +144,30 @@ static void promoteModule(Module &TheModule, const ModuleSummaryIndex &Index) {
report_fatal_error("renameModuleForThinLTO failed");
}
+namespace {
+class ThinLTODiagnosticInfo : public DiagnosticInfo {
+ const Twine &Msg;
+public:
+ ThinLTODiagnosticInfo(const Twine &DiagMsg,
+ DiagnosticSeverity Severity = DS_Error)
+ : DiagnosticInfo(DK_Linker, Severity), Msg(DiagMsg) {}
+ void print(DiagnosticPrinter &DP) const override { DP << Msg; }
+};
+}
+
+/// Verify the module and strip broken debug info.
+static void verifyLoadedModule(Module &TheModule) {
+ bool BrokenDebugInfo = false;
+ if (verifyModule(TheModule, &dbgs(),
+ LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
+ report_fatal_error("Broken module found, compilation aborted!");
+ if (BrokenDebugInfo) {
+ TheModule.getContext().diagnose(ThinLTODiagnosticInfo(
+ "Invalid debug info found, debug info will be stripped", DS_Warning));
+ StripDebugInfo(TheModule);
+ }
+}
+
static std::unique_ptr<Module>
loadModuleFromBuffer(const MemoryBufferRef &Buffer, LLVMContext &Context,
bool Lazy, bool IsImporting) {
@@ -183,6 +185,8 @@ loadModuleFromBuffer(const MemoryBufferRef &Buffer, LLVMContext &Context,
});
report_fatal_error("Can't load module, abort.");
}
+ if (!Lazy)
+ verifyLoadedModule(*ModuleOrErr.get());
return std::move(ModuleOrErr.get());
}
@@ -205,19 +209,24 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
});
report_fatal_error("importFunctions failed");
}
+ // Verify again after cross-importing.
+ verifyLoadedModule(TheModule);
}
static void optimizeModule(Module &TheModule, TargetMachine &TM,
- unsigned OptLevel) {
+ unsigned OptLevel, bool Freestanding) {
// Populate the PassManager
PassManagerBuilder PMB;
PMB.LibraryInfo = new TargetLibraryInfoImpl(TM.getTargetTriple());
+ if (Freestanding)
+ PMB.LibraryInfo->disableAllFunctions();
PMB.Inliner = createFunctionInliningPass();
// FIXME: should get it from the bitcode?
PMB.OptLevel = OptLevel;
PMB.LoopVectorize = true;
PMB.SLPVectorize = true;
- PMB.VerifyInput = true;
+  // Input verification was already done in verifyLoadedModule().
+ PMB.VerifyInput = false;
PMB.VerifyOutput = false;
legacy::PassManager PM;
@@ -285,7 +294,7 @@ public:
const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
const GVSummaryMapTy &DefinedFunctions,
const DenseSet<GlobalValue::GUID> &PreservedSymbols, unsigned OptLevel,
- const TargetMachineBuilder &TMBuilder) {
+ bool Freestanding, const TargetMachineBuilder &TMBuilder) {
if (CachePath.empty())
return;
@@ -323,7 +332,7 @@ public:
// Start with the compiler revision
Hasher.update(LLVM_VERSION_STRING);
-#ifdef HAVE_LLVM_REVISION
+#ifdef LLVM_REVISION
Hasher.update(LLVM_REVISION);
#endif
@@ -342,6 +351,7 @@ public:
AddUnsigned(*TMBuilder.RelocModel);
AddUnsigned(TMBuilder.CGOptLevel);
AddUnsigned(OptLevel);
+ AddUnsigned(Freestanding);
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
for (auto F : ExportList)
@@ -369,7 +379,10 @@ public:
ArrayRef<uint8_t>((const uint8_t *)&Entry, sizeof(GlobalValue::GUID)));
}
- sys::path::append(EntryPath, CachePath, toHex(Hasher.result()));
+ // This choice of file name allows the cache to be pruned (see pruneCache()
+ // in include/llvm/Support/CachePruning.h).
+ sys::path::append(EntryPath, CachePath,
+ "llvmcache-" + toHex(Hasher.result()));
}
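The fixed "llvmcache-" prefix lets the pruner tell its own files apart from anything else in the cache directory. A sketch of the path construction (hypothetical helper; hex formatting via iostreams instead of toHex):

#include <cstddef>
#include <iomanip>
#include <sstream>
#include <string>

// Hypothetical helper: fixed "llvmcache-" prefix plus the hex digest, so the
// pruner can recognize which files in the cache directory it owns.
std::string cacheEntryPath(const std::string &CachePath,
                           const unsigned char *Digest, size_t Len) {
  std::ostringstream OS;
  OS << CachePath << "/llvmcache-" << std::hex << std::setfill('0');
  for (size_t I = 0; I != Len; ++I)
    OS << std::setw(2) << static_cast<unsigned>(Digest[I]);
  return OS.str();
}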
// Access the path to this entry in the cache.
@@ -422,7 +435,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
const GVSummaryMapTy &DefinedGlobals,
const ThinLTOCodeGenerator::CachingOptions &CacheOptions,
bool DisableCodeGen, StringRef SaveTempsDir,
- unsigned OptLevel, unsigned count) {
+ bool Freestanding, unsigned OptLevel, unsigned count) {
// "Benchmark"-like optimization: single-source case
bool SingleModule = (ModuleMap.size() == 1);
@@ -454,7 +467,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
saveTempBitcode(TheModule, SaveTempsDir, count, ".3.imported.bc");
}
- optimizeModule(TheModule, TM, OptLevel);
+ optimizeModule(TheModule, TM, OptLevel, Freestanding);
saveTempBitcode(TheModule, SaveTempsDir, count, ".4.opt.bc");
@@ -464,7 +477,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
{
raw_svector_ostream OS(OutputBuffer);
ProfileSummaryInfo PSI(TheModule);
- auto Index = buildModuleSummaryIndex(TheModule, nullptr, nullptr);
+ auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI);
WriteBitcodeToFile(&TheModule, OS, true, &Index);
}
return make_unique<ObjectMemoryBuffer>(std::move(OutputBuffer));
@@ -523,29 +536,25 @@ static void initTMBuilder(TargetMachineBuilder &TMBuilder,
void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
ThinLTOBuffer Buffer(Data, Identifier);
- if (Modules.empty()) {
- // First module added, so initialize the triple and some options
- LLVMContext Context;
- StringRef TripleStr;
- ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
- Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
- if (TripleOrErr)
- TripleStr = *TripleOrErr;
- Triple TheTriple(TripleStr);
+ LLVMContext Context;
+ StringRef TripleStr;
+ ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
+ Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
+
+ if (TripleOrErr)
+ TripleStr = *TripleOrErr;
+
+ Triple TheTriple(TripleStr);
+
+ if (Modules.empty())
initTMBuilder(TMBuilder, Triple(TheTriple));
+ else if (TMBuilder.TheTriple != TheTriple) {
+ if (!TMBuilder.TheTriple.isCompatibleWith(TheTriple))
+ report_fatal_error("ThinLTO modules with incompatible triples not "
+ "supported");
+ initTMBuilder(TMBuilder, Triple(TMBuilder.TheTriple.merge(TheTriple)));
}
-#ifndef NDEBUG
- else {
- LLVMContext Context;
- StringRef TripleStr;
- ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
- Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
- if (TripleOrErr)
- TripleStr = *TripleOrErr;
- assert(TMBuilder.TheTriple.str() == TripleStr &&
- "ThinLTO modules with different triple not supported");
- }
-#endif
+
Modules.push_back(Buffer);
}
@@ -584,25 +593,18 @@ std::unique_ptr<TargetMachine> TargetMachineBuilder::create() const {
* "thin-link".
*/
std::unique_ptr<ModuleSummaryIndex> ThinLTOCodeGenerator::linkCombinedIndex() {
- std::unique_ptr<ModuleSummaryIndex> CombinedIndex;
+ std::unique_ptr<ModuleSummaryIndex> CombinedIndex =
+ llvm::make_unique<ModuleSummaryIndex>();
uint64_t NextModuleId = 0;
for (auto &ModuleBuffer : Modules) {
- Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
- object::ModuleSummaryIndexObjectFile::create(
- ModuleBuffer.getMemBuffer());
- if (!ObjOrErr) {
+ if (Error Err = readModuleSummaryIndex(ModuleBuffer.getMemBuffer(),
+ *CombinedIndex, NextModuleId++)) {
// FIXME diagnose
logAllUnhandledErrors(
- ObjOrErr.takeError(), errs(),
- "error: can't create ModuleSummaryIndexObjectFile for buffer: ");
+ std::move(Err), errs(),
+ "error: can't create module summary index for buffer: ");
return nullptr;
}
- auto Index = (*ObjOrErr)->takeIndex();
- if (CombinedIndex) {
- CombinedIndex->mergeFrom(std::move(Index), ++NextModuleId);
- } else {
- CombinedIndex = std::move(Index);
- }
}
return CombinedIndex;
}
@@ -625,13 +627,13 @@ void ThinLTOCodeGenerator::promote(Module &TheModule,
PreservedSymbols, Triple(TheModule.getTargetTriple()));
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbols(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
// Resolve LinkOnce/Weak symbols.
StringMap<std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>> ResolvedODR;
@@ -670,13 +672,13 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
PreservedSymbols, Triple(TheModule.getTargetTriple()));
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbols(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
auto &ImportList = ImportLists[TheModule.getModuleIdentifier()];
crossImportIntoModule(TheModule, Index, ModuleMap, ImportList);
@@ -747,13 +749,13 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule,
Index.collectDefinedGVSummariesPerModule(ModuleToDefinedGVSummaries);
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(Index, GUIDPreservedSymbols);
+ computeDeadSymbols(Index, GUIDPreservedSymbols);
// Generate import/export list
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
auto &ExportList = ExportLists[ModuleIdentifier];
// Be friendly and don't totally nuke the module when the client didn't
@@ -780,7 +782,7 @@ void ThinLTOCodeGenerator::optimize(Module &TheModule) {
initTMBuilder(TMBuilder, Triple(TheModule.getTargetTriple()));
// Optimize now
- optimizeModule(TheModule, *TMBuilder.create(), OptLevel);
+ optimizeModule(TheModule, *TMBuilder.create(), OptLevel, Freestanding);
}
/**
@@ -899,14 +901,14 @@ void ThinLTOCodeGenerator::run() {
computeGUIDPreservedSymbols(PreservedSymbols, TMBuilder.TheTriple);
// Compute "dead" symbols, we don't want to import/export these!
- auto DeadSymbols = computeDeadSymbols(*Index, GUIDPreservedSymbols);
+ computeDeadSymbols(*Index, GUIDPreservedSymbols);
// Collect the import/export lists for all modules from the call-graph in the
// combined index.
StringMap<FunctionImporter::ImportMapTy> ImportLists(ModuleCount);
StringMap<FunctionImporter::ExportSetTy> ExportLists(ModuleCount);
ComputeCrossModuleImport(*Index, ModuleToDefinedGVSummaries, ImportLists,
- ExportLists, &DeadSymbols);
+ ExportLists);
// We use a std::map here to be able to have a defined ordering when
// producing a hash for the cache entry.
@@ -966,7 +968,7 @@ void ThinLTOCodeGenerator::run() {
ImportLists[ModuleIdentifier], ExportList,
ResolvedODR[ModuleIdentifier],
DefinedFunctions, GUIDPreservedSymbols,
- OptLevel, TMBuilder);
+ OptLevel, Freestanding, TMBuilder);
auto CacheEntryPath = CacheEntry.getEntryPath();
{
@@ -990,7 +992,8 @@ void ThinLTOCodeGenerator::run() {
LLVMContext Context;
Context.setDiscardValueNames(LTODiscardValueNames);
Context.enableDebugTypeODRUniquing();
- auto DiagFileOrErr = setupOptimizationRemarks(Context, count);
+ auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ Context, LTORemarksFilename, LTOPassRemarksWithHotness, count);
if (!DiagFileOrErr) {
errs() << "Error: " << toString(DiagFileOrErr.takeError()) << "\n";
report_fatal_error("ThinLTO: Can't get an output file for the "
@@ -1011,7 +1014,7 @@ void ThinLTOCodeGenerator::run() {
*TheModule, *Index, ModuleMap, *TMBuilder.create(), ImportList,
ExportList, GUIDPreservedSymbols,
ModuleToDefinedGVSummaries[ModuleIdentifier], CacheOptions,
- DisableCodeGen, SaveTempsDir, OptLevel, count);
+ DisableCodeGen, SaveTempsDir, Freestanding, OptLevel, count);
// Commit to the cache (if enabled)
CacheEntry.write(*OutputBuffer);
@@ -1043,13 +1046,10 @@ void ThinLTOCodeGenerator::run() {
}
}
- CachePruning(CacheOptions.Path)
- .setPruningInterval(std::chrono::seconds(CacheOptions.PruningInterval))
- .setEntryExpiration(std::chrono::seconds(CacheOptions.Expiration))
- .setMaxSize(CacheOptions.MaxPercentageOfAvailableSpace)
- .prune();
+ pruneCache(CacheOptions.Path, CacheOptions.Policy);
// If statistics were requested, print them out now.
if (llvm::AreStatisticsEnabled())
llvm::PrintStatistics();
+ reportAndResetTimings();
}
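Cache pruning collapses from a chained builder into one free function taking a policy struct. A hedged sketch of how a caller might fill that struct (field names as they appear in this revision's CachePruning.h; the values shown are illustrative):

    #include "llvm/Support/CachePruning.h"
    #include <chrono>

    using namespace llvm;

    void pruneThinLTOCache(StringRef CacheDir) {
      CachePruningPolicy Policy;
      Policy.Interval = std::chrono::seconds(1200);   // prune at most every 20 min
      Policy.Expiration = std::chrono::hours(7 * 24); // drop entries older than a week
      Policy.MaxSizePercentageOfAvailableSpace = 75;  // cap cache vs. free disk space
      pruneCache(CacheDir, Policy);                   // single call replaces the chain
    }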
diff --git a/contrib/llvm/lib/LTO/UpdateCompilerUsed.cpp b/contrib/llvm/lib/LTO/UpdateCompilerUsed.cpp
index b67d9ea..5165cc9 100644
--- a/contrib/llvm/lib/LTO/UpdateCompilerUsed.cpp
+++ b/contrib/llvm/lib/LTO/UpdateCompilerUsed.cpp
@@ -65,7 +65,7 @@ private:
// target.
for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
I != E; ++I) {
- LibFunc::Func F = static_cast<LibFunc::Func>(I);
+ LibFunc F = static_cast<LibFunc>(I);
if (TLI.has(F))
Libcalls.insert(TLI.getName(F));
}
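LibFunc stops being a nested enum (LibFunc::Func) and becomes a top-level enum whose enumerators are prefixed LibFunc_. A small hedged sketch of querying one libcall under the new spelling (assumes a TargetLibraryInfo already built for the module):

    #include "llvm/Analysis/TargetLibraryInfo.h"

    using namespace llvm;

    // Returns the target's name for memcpy if it is available as a libcall.
    StringRef memcpyName(const TargetLibraryInfo &TLI) {
      if (TLI.has(LibFunc_memcpy))          // enumerator spelling in this revision
        return TLI.getName(LibFunc_memcpy);
      return StringRef();
    }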
diff --git a/contrib/llvm/lib/Linker/IRMover.cpp b/contrib/llvm/lib/Linker/IRMover.cpp
index 9f3cfc0..f486e52 100644
--- a/contrib/llvm/lib/Linker/IRMover.cpp
+++ b/contrib/llvm/lib/Linker/IRMover.cpp
@@ -395,11 +395,12 @@ class IRLinker {
Worklist.push_back(GV);
}
- /// Flag whether the ModuleInlineAsm string in Src should be linked with
- /// (concatenated into) the ModuleInlineAsm string for the destination
- /// module. It should be true for full LTO, but not when importing for
- /// ThinLTO, otherwise we can have duplicate symbols.
- bool LinkModuleInlineAsm;
+ /// Whether we are importing globals for ThinLTO, as opposed to linking the
+ /// source module. If this flag is set, it means that we can rely on some
+ /// other object file to define any non-GlobalValue entities defined by the
+ /// source module. This currently causes us to not link retained types in
+ /// debug info metadata and module inline asm.
+ bool IsPerformingImport;
/// Set to true when all global value body linking is complete (including
/// lazy linking). Used to prevent metadata linking from creating new
@@ -491,10 +492,10 @@ public:
IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM,
ArrayRef<GlobalValue *> ValuesToLink,
std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor,
- bool LinkModuleInlineAsm, bool IsPerformingImport)
+ bool IsPerformingImport)
: DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)),
TypeMap(Set), GValMaterializer(*this), LValMaterializer(*this),
- SharedMDs(SharedMDs), LinkModuleInlineAsm(LinkModuleInlineAsm),
+ SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport),
Mapper(ValueMap, RF_MoveDistinctMDs | RF_IgnoreMissingLocals, &TypeMap,
&GValMaterializer),
AliasMCID(Mapper.registerAlternateMappingContext(AliasValueMap,
@@ -601,6 +602,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
/*insertbefore*/ nullptr, SGVar->getThreadLocalMode(),
SGVar->getType()->getAddressSpace());
NewDGV->setAlignment(SGVar->getAlignment());
+ NewDGV->copyAttributesFrom(SGVar);
return NewDGV;
}
@@ -609,8 +611,11 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) {
Function *IRLinker::copyFunctionProto(const Function *SF) {
// If there is no linkage to be performed or we are linking from the source,
// bring SF over.
- return Function::Create(TypeMap.get(SF->getFunctionType()),
- GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ auto *F =
+ Function::Create(TypeMap.get(SF->getFunctionType()),
+ GlobalValue::ExternalLinkage, SF->getName(), &DstM);
+ F->copyAttributesFrom(SF);
+ return F;
}
/// Set up prototypes for any aliases that come over from the source module.
@@ -618,9 +623,11 @@ GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) {
// If there is no linkage to be performed or we're linking from the source,
// bring over SGA.
auto *Ty = TypeMap.get(SGA->getValueType());
- return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
- GlobalValue::ExternalLinkage, SGA->getName(),
- &DstM);
+ auto *GA =
+ GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(),
+ GlobalValue::ExternalLinkage, SGA->getName(), &DstM);
+ GA->copyAttributesFrom(SGA);
+ return GA;
}
GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
@@ -647,8 +654,6 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV,
else if (SGV->hasExternalWeakLinkage())
NewGV->setLinkage(GlobalValue::ExternalWeakLinkage);
- NewGV->copyAttributesFrom(SGV);
-
if (auto *NewGO = dyn_cast<GlobalObject>(NewGV)) {
// Metadata for global variables and function declarations is copied eagerly.
if (isa<GlobalVariable>(SGV) || SGV->isDeclaration())
@@ -870,9 +875,6 @@ bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) {
if (DGV && !DGV->isDeclarationForLinker())
return false;
- if (SGV.hasAvailableExternallyLinkage())
- return true;
-
if (SGV.isDeclaration() || DoneLinkingBodies)
return false;
@@ -1155,6 +1157,11 @@ Error IRLinker::linkModuleFlagsMetadata() {
mdconst::extract<ConstantInt>(DstOp->getOperand(0));
unsigned DstBehaviorValue = DstBehavior->getZExtValue();
+ auto overrideDstValue = [&]() {
+ DstModFlags->setOperand(DstIndex, SrcOp);
+ Flags[ID].first = SrcOp;
+ };
+
// If either flag has override behavior, handle it first.
if (DstBehaviorValue == Module::Override) {
// Diagnose inconsistent flags which both have override behavior.
@@ -1165,8 +1172,7 @@ Error IRLinker::linkModuleFlagsMetadata() {
continue;
} else if (SrcBehaviorValue == Module::Override) {
// Update the destination flag to that of the source.
- DstModFlags->setOperand(DstIndex, SrcOp);
- Flags[ID].first = SrcOp;
+ overrideDstValue();
continue;
}
@@ -1202,6 +1208,15 @@ Error IRLinker::linkModuleFlagsMetadata() {
}
continue;
}
+ case Module::Max: {
+ ConstantInt *DstValue =
+ mdconst::extract<ConstantInt>(DstOp->getOperand(2));
+ ConstantInt *SrcValue =
+ mdconst::extract<ConstantInt>(SrcOp->getOperand(2));
+ if (SrcValue->getZExtValue() > DstValue->getZExtValue())
+ overrideDstValue();
+ break;
+ }
case Module::Append: {
MDNode *DstValue = cast<MDNode>(DstOp->getOperand(2));
MDNode *SrcValue = cast<MDNode>(SrcOp->getOperand(2));
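The new Module::Max behavior resolves a module-flag conflict by keeping the larger of the two values, which fits monotonic flags such as a feature or ABI level that may only grow. A hedged sketch of a producer opting into it (the flag name is made up for illustration):

    #include "llvm/IR/Module.h"
    #include <cstdint>

    using namespace llvm;

    void tagAbiLevel(Module &M, uint32_t Level) {
      // On conflict at link time, Max keeps max(dst, src) instead of erroring.
      M.addModuleFlag(Module::Max, "hypothetical-abi-level", Level);
    }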
@@ -1241,25 +1256,16 @@ Error IRLinker::linkModuleFlagsMetadata() {
return Error::success();
}
-// This function returns true if the triples match.
-static bool triplesMatch(const Triple &T0, const Triple &T1) {
- // If vendor is apple, ignore the version number.
- if (T0.getVendor() == Triple::Apple)
- return T0.getArch() == T1.getArch() && T0.getSubArch() == T1.getSubArch() &&
- T0.getVendor() == T1.getVendor() && T0.getOS() == T1.getOS();
-
- return T0 == T1;
-}
-
-// This function returns the merged triple.
-static std::string mergeTriples(const Triple &SrcTriple,
- const Triple &DstTriple) {
- // If vendor is apple, pick the triple with the larger version number.
- if (SrcTriple.getVendor() == Triple::Apple)
- if (DstTriple.isOSVersionLT(SrcTriple))
- return SrcTriple.str();
-
- return DstTriple.str();
+/// Return InlineAsm adjusted with target-specific directives if required.
+/// For ARM and Thumb, we have to add directives to select the appropriate ISA
+/// to support mixing module-level inline assembly from ARM and Thumb modules.
+static std::string adjustInlineAsm(const std::string &InlineAsm,
+ const Triple &Triple) {
+ if (Triple.getArch() == Triple::thumb || Triple.getArch() == Triple::thumbeb)
+ return ".text\n.balign 2\n.thumb\n" + InlineAsm;
+ if (Triple.getArch() == Triple::arm || Triple.getArch() == Triple::armeb)
+ return ".text\n.balign 4\n.arm\n" + InlineAsm;
+ return InlineAsm;
}
Error IRLinker::run() {
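The new adjustInlineAsm helper prefixes module-level asm with ISA-selection directives so ARM and Thumb fragments can be concatenated safely once modules are linked together. A standalone restatement of its logic (same directive strings, no LLVM dependencies):

    #include <string>

    enum class Arch { Arm, ArmEB, Thumb, ThumbEB, Other };

    std::string adjustInlineAsm(const std::string &InlineAsm, Arch A) {
      if (A == Arch::Thumb || A == Arch::ThumbEB)
        return ".text\n.balign 2\n.thumb\n" + InlineAsm; // Thumb: 2-byte alignment
      if (A == Arch::Arm || A == Arch::ArmEB)
        return ".text\n.balign 4\n.arm\n" + InlineAsm;   // ARM: 4-byte alignment
      return InlineAsm;                                  // other targets unchanged
    }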
@@ -1287,22 +1293,25 @@ Error IRLinker::run() {
Triple SrcTriple(SrcM->getTargetTriple()), DstTriple(DstM.getTargetTriple());
- if (!SrcM->getTargetTriple().empty() && !triplesMatch(SrcTriple, DstTriple))
+  if (!SrcM->getTargetTriple().empty() &&
+ !SrcTriple.isCompatibleWith(DstTriple))
emitWarning("Linking two modules of different target triples: " +
SrcM->getModuleIdentifier() + "' is '" +
SrcM->getTargetTriple() + "' whereas '" +
DstM.getModuleIdentifier() + "' is '" + DstM.getTargetTriple() +
"'\n");
- DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
+ DstM.setTargetTriple(SrcTriple.merge(DstTriple));
// Append the module inline asm string.
- if (LinkModuleInlineAsm && !SrcM->getModuleInlineAsm().empty()) {
+ if (!IsPerformingImport && !SrcM->getModuleInlineAsm().empty()) {
+ std::string SrcModuleInlineAsm = adjustInlineAsm(SrcM->getModuleInlineAsm(),
+ SrcTriple);
if (DstM.getModuleInlineAsm().empty())
- DstM.setModuleInlineAsm(SrcM->getModuleInlineAsm());
+ DstM.setModuleInlineAsm(SrcModuleInlineAsm);
else
DstM.setModuleInlineAsm(DstM.getModuleInlineAsm() + "\n" +
- SrcM->getModuleInlineAsm());
+ SrcModuleInlineAsm);
}
// Loop over all of the linked values to compute type mappings.
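The deleted triplesMatch/mergeTriples helpers move into the Triple class as isCompatibleWith and merge. A hedged sketch of the new API from a caller's side:

    #include "llvm/ADT/Triple.h"
    #include <string>

    using namespace llvm;

    // Returns the triple to stamp on the linked module; Warn is set when the
    // two triples are not considered compatible (the linker still proceeds).
    std::string pickLinkedTriple(const Triple &Src, const Triple &Dst, bool &Warn) {
      Warn = !Src.isCompatibleWith(Dst); // e.g. Apple triples ignore OS version
      return Src.merge(Dst);             // Apple: prefer the larger OS version
    }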
@@ -1436,10 +1445,10 @@ IRMover::IRMover(Module &M) : Composite(M) {
Error IRMover::move(
std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink,
std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor,
- bool LinkModuleInlineAsm, bool IsPerformingImport) {
+ bool IsPerformingImport) {
IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes,
std::move(Src), ValuesToLink, std::move(AddLazyFor),
- LinkModuleInlineAsm, IsPerformingImport);
+ IsPerformingImport);
Error E = TheIRLinker.run();
Composite.dropTriviallyDeadConstantArrays();
return E;
diff --git a/contrib/llvm/lib/Linker/LinkModules.cpp b/contrib/llvm/lib/Linker/LinkModules.cpp
index cf2c4cc..c0ce4bf 100644
--- a/contrib/llvm/lib/Linker/LinkModules.cpp
+++ b/contrib/llvm/lib/Linker/LinkModules.cpp
@@ -14,12 +14,13 @@
#include "LinkDiagnosticInfo.h"
#include "llvm-c/Linker.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/StringSet.h"
+#include "llvm/IR/Comdat.h"
#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/Error.h"
-#include "llvm/Transforms/Utils/FunctionImportUtils.h"
using namespace llvm;
namespace {
@@ -31,14 +32,17 @@ class ModuleLinker {
std::unique_ptr<Module> SrcM;
SetVector<GlobalValue *> ValuesToLink;
- StringSet<> Internalize;
/// For symbol clashes, prefer those from Src.
unsigned Flags;
- /// Functions to import from source module, all other functions are
- /// imported as declarations instead of definitions.
- DenseSet<const GlobalValue *> *GlobalsToImport;
+ /// List of global value names that should be internalized.
+ StringSet<> Internalize;
+
+ /// Function that will perform the actual internalization. The reason for a
+ /// callback is that the linker cannot call internalizeModule without
+ /// creating a circular dependency between IPO and the linker.
+ std::function<void(Module &, const StringSet<> &)> InternalizeCallback;
/// Used as the callback for lazy linking.
/// The mover has just hit GV and we have to decide if it, and other members
@@ -46,14 +50,8 @@ class ModuleLinker {
/// to Add.
void addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add);
- bool shouldLinkReferencedLinkOnce() {
- return !(Flags & Linker::DontForceLinkLinkonceODR);
- }
bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; }
bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; }
- bool shouldInternalizeLinkedSymbols() {
- return Flags & Linker::InternalizeLinkedSymbols;
- }
bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
const GlobalValue &Src);
@@ -108,31 +106,17 @@ class ModuleLinker {
bool linkIfNeeded(GlobalValue &GV);
- /// Helper method to check if we are importing from the current source
- /// module.
- bool isPerformingImport() const { return GlobalsToImport != nullptr; }
-
- /// If we are importing from the source module, checks if we should
- /// import SGV as a definition, otherwise import as a declaration.
- bool doImportAsDefinition(const GlobalValue *SGV);
-
public:
ModuleLinker(IRMover &Mover, std::unique_ptr<Module> SrcM, unsigned Flags,
- DenseSet<const GlobalValue *> *GlobalsToImport = nullptr)
+ std::function<void(Module &, const StringSet<> &)>
+ InternalizeCallback = {})
: Mover(Mover), SrcM(std::move(SrcM)), Flags(Flags),
- GlobalsToImport(GlobalsToImport) {}
+ InternalizeCallback(std::move(InternalizeCallback)) {}
bool run();
};
}
-bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) {
- if (!isPerformingImport())
- return false;
- return FunctionImportGlobalProcessing::doImportAsDefinition(SGV,
- GlobalsToImport);
-}
-
static GlobalValue::VisibilityTypes
getMinVisibility(GlobalValue::VisibilityTypes A,
GlobalValue::VisibilityTypes B) {
@@ -266,18 +250,10 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
// We always have to add Src if it has appending linkage.
if (Src.hasAppendingLinkage()) {
- // Should have prevented importing for appending linkage in linkIfNeeded.
- assert(!isPerformingImport());
LinkFromSrc = true;
return false;
}
- if (isPerformingImport()) {
- // LinkFromSrc iff this is a global requested for importing.
- LinkFromSrc = GlobalsToImport->count(&Src);
- return false;
- }
-
bool SrcIsDeclaration = Src.isDeclarationForLinker();
bool DestIsDeclaration = Dest.isDeclarationForLinker();
@@ -383,19 +359,9 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
GV.setUnnamedAddr(UnnamedAddr);
}
- // Don't want to append to global_ctors list, for example, when we
- // are importing for ThinLTO, otherwise the global ctors and dtors
- // get executed multiple times for local variables (the latter causing
- // double frees).
- if (GV.hasAppendingLinkage() && isPerformingImport())
- return false;
-
- if (isPerformingImport()) {
- if (!doImportAsDefinition(&GV))
- return false;
- } else if (!DGV && !shouldOverrideFromSrc() &&
- (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
- GV.hasAvailableExternallyLinkage()))
+ if (!DGV && !shouldOverrideFromSrc() &&
+ (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+ GV.hasAvailableExternallyLinkage()))
return false;
if (GV.isDeclaration())
@@ -418,17 +384,12 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
}
void ModuleLinker::addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add) {
- if (!shouldLinkReferencedLinkOnce())
- // For ThinLTO we don't import more than what was required.
-  // The client has to guarantee that the linkonce will be available at link
- // time (by promoting it to weak for instance).
- return;
-
// Add these to the internalize list
- if (!GV.hasLinkOnceLinkage() && !shouldLinkOnlyNeeded())
+ if (!GV.hasLinkOnceLinkage() && !GV.hasAvailableExternallyLinkage() &&
+ !shouldLinkOnlyNeeded())
return;
- if (shouldInternalizeLinkedSymbols())
+ if (InternalizeCallback)
Internalize.insert(GV.getName());
Add(GV);
@@ -442,7 +403,7 @@ void ModuleLinker::addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add) {
return;
if (!LinkFromSrc)
continue;
- if (shouldInternalizeLinkedSymbols())
+ if (InternalizeCallback)
Internalize.insert(GV2->getName());
Add(*GV2);
}
@@ -571,7 +532,7 @@ bool ModuleLinker::run() {
}
}
- if (shouldInternalizeLinkedSymbols()) {
+ if (InternalizeCallback) {
for (GlobalValue *GV : ValuesToLink)
Internalize.insert(GV->getName());
}
@@ -583,8 +544,7 @@ bool ModuleLinker::run() {
[this](GlobalValue &GV, IRMover::ValueAdder Add) {
addLazyFor(GV, Add);
},
- /* LinkModuleInlineAsm */ !isPerformingImport(),
- /* IsPerformingImport */ isPerformingImport())) {
+ /* IsPerformingImport */ false)) {
handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
DstM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, EIB.message()));
HasErrors = true;
@@ -593,19 +553,19 @@ bool ModuleLinker::run() {
if (HasErrors)
return true;
- for (auto &P : Internalize) {
- GlobalValue *GV = DstM.getNamedValue(P.first());
- GV->setLinkage(GlobalValue::InternalLinkage);
- }
+ if (InternalizeCallback)
+ InternalizeCallback(DstM, Internalize);
return false;
}
Linker::Linker(Module &M) : Mover(M) {}
-bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags,
- DenseSet<const GlobalValue *> *GlobalsToImport) {
- ModuleLinker ModLinker(Mover, std::move(Src), Flags, GlobalsToImport);
+bool Linker::linkInModule(
+ std::unique_ptr<Module> Src, unsigned Flags,
+ std::function<void(Module &, const StringSet<> &)> InternalizeCallback) {
+ ModuleLinker ModLinker(Mover, std::move(Src), Flags,
+ std::move(InternalizeCallback));
return ModLinker.run();
}
@@ -618,10 +578,11 @@ bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags,
/// true is returned and ErrorMsg (if not null) is set to indicate the problem.
/// Upon failure, the Dest module could be in a modified state, and shouldn't be
/// relied on to be consistent.
-bool Linker::linkModules(Module &Dest, std::unique_ptr<Module> Src,
- unsigned Flags) {
+bool Linker::linkModules(
+ Module &Dest, std::unique_ptr<Module> Src, unsigned Flags,
+ std::function<void(Module &, const StringSet<> &)> InternalizeCallback) {
Linker L(Dest);
- return L.linkInModule(std::move(Src), Flags);
+ return L.linkInModule(std::move(Src), Flags, std::move(InternalizeCallback));
}
//===----------------------------------------------------------------------===//
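LinkModules swaps its InternalizeLinkedSymbols flag and in-linker loop for a client-supplied callback, so the Linker library no longer needs IPO's internalizeModule and the circular dependency disappears. A hedged sketch of a caller using the new signature (the lambda body mirrors the removed loop; real clients would typically wrap internalizeModule instead):

    #include "llvm/ADT/StringSet.h"
    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Linker/Linker.h"
    #include <memory>
    #include <utility>

    using namespace llvm;

    bool linkAndInternalize(Module &Dst, std::unique_ptr<Module> Src) {
      return Linker::linkModules(
          Dst, std::move(Src), Linker::LinkOnlyNeeded,
          [](Module &M, const StringSet<> &ToInternalize) {
            // Minimal callback: what the old in-linker loop used to do.
            for (const auto &Entry : ToInternalize)
              if (GlobalValue *GV = M.getNamedValue(Entry.first()))
                GV->setLinkage(GlobalValue::InternalLinkage);
          });
    }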
diff --git a/contrib/llvm/lib/MC/ConstantPools.cpp b/contrib/llvm/lib/MC/ConstantPools.cpp
index bf2c39e..ca54402 100644
--- a/contrib/llvm/lib/MC/ConstantPools.cpp
+++ b/contrib/llvm/lib/MC/ConstantPools.cpp
@@ -1,4 +1,4 @@
-//===- ConstantPools.cpp - ConstantPool class --*- C++ -*---------===//
+//===- ConstantPools.cpp - ConstantPool class -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,13 +10,16 @@
// This file implements the ConstantPool and AssemblerConstantPools classes.
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/MapVector.h"
+
#include "llvm/MC/ConstantPools.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Casting.h"
using namespace llvm;
+
//
// ConstantPool implementation
//
diff --git a/contrib/llvm/lib/MC/ELFObjectWriter.cpp b/contrib/llvm/lib/MC/ELFObjectWriter.cpp
index a8c88dd..c8dd630 100644
--- a/contrib/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/ELFObjectWriter.cpp
@@ -11,29 +11,51 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Compression.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
#include <vector>
using namespace llvm;
@@ -42,7 +64,8 @@ using namespace llvm;
#define DEBUG_TYPE "reloc-info"
namespace {
-typedef DenseMap<const MCSectionELF *, uint32_t> SectionIndexMapTy;
+
+using SectionIndexMapTy = DenseMap<const MCSectionELF *, uint32_t>;
class ELFObjectWriter;
@@ -99,8 +122,7 @@ class ELFObjectWriter : public MCObjectWriter {
DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
- llvm::DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>>
- Relocations;
+ DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
/// @}
/// @name Symbol Table Data
@@ -144,6 +166,8 @@ public:
bool IsLittleEndian)
: MCObjectWriter(OS, IsLittleEndian), TargetObjectWriter(MOTW) {}
+ ~ELFObjectWriter() override = default;
+
void reset() override {
Renames.clear();
Relocations.clear();
@@ -152,8 +176,6 @@ public:
MCObjectWriter::reset();
}
- ~ELFObjectWriter() override;
-
void WriteWord(uint64_t W) {
if (is64Bit())
write64(W);
@@ -174,8 +196,8 @@ public:
ELFSymbolData &MSD, const MCAsmLayout &Layout);
// Start and end offset of each section
- typedef std::map<const MCSectionELF *, std::pair<uint64_t, uint64_t>>
- SectionOffsetsTy;
+ using SectionOffsetsTy =
+ std::map<const MCSectionELF *, std::pair<uint64_t, uint64_t>>;
bool shouldRelocateWithSymbol(const MCAssembler &Asm,
const MCSymbolRefExpr *RefA,
@@ -184,11 +206,10 @@ public:
void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, bool &IsPCRel,
- uint64_t &FixedValue) override;
+ MCValue Target, uint64_t &FixedValue) override;
// Map from a signature symbol to the group section index
- typedef DenseMap<const MCSymbol *, unsigned> RevGroupMapTy;
+ using RevGroupMapTy = DenseMap<const MCSymbol *, unsigned>;
/// Compute the symbol table data
///
@@ -222,18 +243,18 @@ public:
void writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec);
+ using MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl;
bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
const MCSymbol &SymA,
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
- bool isWeak(const MCSymbol &Sym) const override;
-
void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
void writeSection(const SectionIndexMapTy &SectionIndexMap,
uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
const MCSectionELF &Section);
};
+
} // end anonymous namespace
void ELFObjectWriter::align(unsigned Alignment) {
@@ -297,9 +318,6 @@ void SymbolTableWriter::writeSymbol(uint32_t name, uint8_t info, uint64_t value,
++NumWritten;
}
-ELFObjectWriter::~ELFObjectWriter()
-{}
-
// Emit the ELF header.
void ELFObjectWriter::writeHeader(const MCAssembler &Asm) {
// ELF Header
@@ -370,22 +388,6 @@ uint64_t ELFObjectWriter::SymbolValue(const MCSymbol &Sym,
void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
- // Section symbols are used as definitions for undefined symbols with matching
- // names. If there are multiple sections with the same name, the first one is
- // used.
- for (const MCSection &Sec : Asm) {
- const MCSymbol *Begin = Sec.getBeginSymbol();
- if (!Begin)
- continue;
-
- const MCSymbol *Alias = Asm.getContext().lookupSymbol(Begin->getName());
- if (!Alias || !Alias->isUndefined())
- continue;
-
- Renames.insert(
- std::make_pair(cast<MCSymbolELF>(Alias), cast<MCSymbolELF>(Begin)));
- }
-
// The presence of symbol versions causes undefined symbols and
// versions declared with @@@ to be renamed.
for (const MCSymbol &A : Asm.symbols()) {
@@ -625,16 +627,16 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
- bool &IsPCRel, uint64_t &FixedValue) {
+ uint64_t &FixedValue) {
+ MCAsmBackend &Backend = Asm.getBackend();
+ bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsPCRel;
const MCSectionELF &FixupSection = cast<MCSectionELF>(*Fragment->getParent());
uint64_t C = Target.getConstant();
uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
MCContext &Ctx = Asm.getContext();
if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
- assert(RefB->getKind() == MCSymbolRefExpr::VK_None &&
- "Should not have constructed this");
-
// Let A, B and C be the components of Target and R be the location of
// the fixup. If the fixup is not pcrel, we want to compute (A - B + C).
// If it is pcrel, we want to compute (A - B + C - R).
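recordRelocation loses its bool &IsPCRel parameter because PC-relativity is already encoded in the fixup kind's flags, so every consumer can recompute it locally. The derivation in isolation (the same expression appears at the top of the rewritten function):

    #include "llvm/MC/MCAsmBackend.h"
    #include "llvm/MC/MCFixup.h"
    #include "llvm/MC/MCFixupKindInfo.h"

    using namespace llvm;

    static bool isPCRelFixup(const MCAsmBackend &Backend, const MCFixup &Fixup) {
      // FKF_IsPCRel is a static property of the fixup kind, so no extra state
      // needs to be threaded through recordRelocation.
      return Backend.getFixupKindInfo(Fixup.getKind()).Flags &
             MCFixupKindInfo::FKF_IsPCRel;
    }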
@@ -900,6 +902,8 @@ void ELFObjectWriter::computeSymbolTable(
StrTabBuilder.finalize();
+ // File symbols are emitted first and handled separately from normal symbols,
+ // i.e. a non-STT_FILE symbol with the same name may appear.
for (const std::string &Name : FileNames)
Writer.writeSymbol(StrTabBuilder.getOffset(Name),
ELF::STT_FILE | ELF::STB_LOCAL, 0, 0, ELF::STV_DEFAULT,
@@ -1017,18 +1021,24 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
MCSectionELF &Section = static_cast<MCSectionELF &>(Sec);
StringRef SectionName = Section.getSectionName();
+ auto &MC = Asm.getContext();
+ const auto &MAI = MC.getAsmInfo();
+
// Compressing debug_frame requires handling alignment fragments which is
// more work (possibly generalizing MCAssembler.cpp:writeFragment to allow
// for writing to arbitrary buffers) for little benefit.
bool CompressionEnabled =
- Asm.getContext().getAsmInfo()->compressDebugSections() !=
- DebugCompressionType::DCT_None;
+ MAI->compressDebugSections() != DebugCompressionType::None;
if (!CompressionEnabled || !SectionName.startswith(".debug_") ||
SectionName == ".debug_frame") {
Asm.writeSectionData(&Section, Layout);
return;
}
+ assert((MAI->compressDebugSections() == DebugCompressionType::Z ||
+ MAI->compressDebugSections() == DebugCompressionType::GNU) &&
+ "expected zlib or zlib-gnu style compression");
+
SmallVector<char, 128> UncompressedData;
raw_svector_ostream VecOS(UncompressedData);
raw_pwrite_stream &OldStream = getStream();
@@ -1037,16 +1047,15 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
setStream(OldStream);
SmallVector<char, 128> CompressedContents;
- zlib::Status Success = zlib::compress(
- StringRef(UncompressedData.data(), UncompressedData.size()),
- CompressedContents);
- if (Success != zlib::StatusOK) {
+ if (Error E = zlib::compress(
+ StringRef(UncompressedData.data(), UncompressedData.size()),
+ CompressedContents)) {
+ consumeError(std::move(E));
getStream() << UncompressedData;
return;
}
- bool ZlibStyle = Asm.getContext().getAsmInfo()->compressDebugSections() ==
- DebugCompressionType::DCT_Zlib;
+ bool ZlibStyle = MAI->compressDebugSections() == DebugCompressionType::Z;
if (!maybeWriteCompression(UncompressedData.size(), CompressedContents,
ZlibStyle, Sec.getAlignment())) {
getStream() << UncompressedData;
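zlib::compress changes from returning a status code to returning llvm::Error, which must be consumed even on the failure path; the writer then falls back to the uncompressed bytes. The pattern in isolation (a hedged sketch against this revision's Compression.h):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Compression.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"
    #include <utility>

    using namespace llvm;

    void writeMaybeCompressed(raw_ostream &OS, StringRef Data) {
      SmallVector<char, 128> Compressed;
      if (Error E = zlib::compress(Data, Compressed)) {
        consumeError(std::move(E)); // mark the Error handled, then fall back
        OS << Data;                 // emit uncompressed rather than failing hard
        return;
      }
      OS << StringRef(Compressed.data(), Compressed.size());
    }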
@@ -1058,8 +1067,7 @@ void ELFObjectWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
Section.setFlags(Section.getFlags() | ELF::SHF_COMPRESSED);
else
// Add "z" prefix to section name. This is zlib-gnu style.
- Asm.getContext().renameELFSection(&Section,
- (".z" + SectionName.drop_front(1)).str());
+ MC.renameELFSection(&Section, (".z" + SectionName.drop_front(1)).str());
getStream() << CompressedContents;
}
@@ -1151,8 +1159,8 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
case ELF::SHT_RELA: {
sh_link = SymbolTableIndex;
assert(sh_link && ".symtab not found");
- const MCSectionELF *InfoSection = Section.getAssociatedSection();
- sh_info = SectionIndexMap.lookup(InfoSection);
+ const MCSection *InfoSection = Section.getAssociatedSection();
+ sh_info = SectionIndexMap.lookup(cast<MCSectionELF>(InfoSection));
break;
}
@@ -1172,9 +1180,11 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
break;
}
- if (TargetObjectWriter->getEMachine() == ELF::EM_ARM &&
- Section.getType() == ELF::SHT_ARM_EXIDX)
- sh_link = SectionIndexMap.lookup(Section.getAssociatedSection());
+ if (Section.getFlags() & ELF::SHF_LINK_ORDER) {
+ const MCSymbol *Sym = Section.getAssociatedSymbol();
+ const MCSectionELF *Sec = cast<MCSectionELF>(&Sym->getSection());
+ sh_link = SectionIndexMap.lookup(Sec);
+ }
WriteSecHdrEntry(StrTabBuilder.getOffset(Section.getSectionName()),
Section.getType(), Section.getFlags(), 0, Offset, Size,
@@ -1298,7 +1308,8 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm,
// Remember the offset into the file for this section.
uint64_t SecStart = getStream().tell();
- writeRelocations(Asm, *RelSection->getAssociatedSection());
+ writeRelocations(Asm,
+ cast<MCSectionELF>(*RelSection->getAssociatedSection()));
uint64_t SecEnd = getStream().tell();
SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd);
@@ -1351,34 +1362,13 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
const auto &SymA = cast<MCSymbolELF>(SA);
if (IsPCRel) {
assert(!InSet);
- if (::isWeak(SymA))
+ if (isWeak(SymA))
return false;
}
return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB,
InSet, IsPCRel);
}
-bool ELFObjectWriter::isWeak(const MCSymbol &S) const {
- const auto &Sym = cast<MCSymbolELF>(S);
- if (::isWeak(Sym))
- return true;
-
- // It is invalid to replace a reference to a global in a comdat
- // with a reference to a local since out of comdat references
- // to a local are forbidden.
- // We could try to return false for more cases, like the reference
- // being in the same comdat or Sym being an alias to another global,
- // but it is not clear if it is worth the effort.
- if (Sym.getBinding() != ELF::STB_GLOBAL)
- return false;
-
- if (!Sym.isInSection())
- return false;
-
- const auto &Sec = cast<MCSectionELF>(Sym.getSection());
- return Sec.getGroup();
-}
-
MCObjectWriter *llvm::createELFObjectWriter(MCELFObjectTargetWriter *MOTW,
raw_pwrite_stream &OS,
bool IsLittleEndian) {
diff --git a/contrib/llvm/lib/MC/MCAsmBackend.cpp b/contrib/llvm/lib/MC/MCAsmBackend.cpp
index 570f764..3642f37 100644
--- a/contrib/llvm/lib/MC/MCAsmBackend.cpp
+++ b/contrib/llvm/lib/MC/MCAsmBackend.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------==//
+//===- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,13 +8,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
using namespace llvm;
-MCAsmBackend::MCAsmBackend() {}
+MCAsmBackend::MCAsmBackend() = default;
-MCAsmBackend::~MCAsmBackend() {}
+MCAsmBackend::~MCAsmBackend() = default;
Optional<MCFixupKind> MCAsmBackend::getFixupKind(StringRef Name) const {
return None;
diff --git a/contrib/llvm/lib/MC/MCAsmInfo.cpp b/contrib/llvm/lib/MC/MCAsmInfo.cpp
index 3eb8f50..f059040 100644
--- a/contrib/llvm/lib/MC/MCAsmInfo.cpp
+++ b/contrib/llvm/lib/MC/MCAsmInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfo.cpp - Asm Info -------------------------------------------==//
+//===- MCAsmInfo.cpp - Asm Info -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,32 +13,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Dwarf.h"
-#include <cctype>
-#include <cstring>
+
using namespace llvm;
MCAsmInfo::MCAsmInfo() {
- PointerSize = 4;
- CalleeSaveStackSlotSize = 4;
-
- IsLittleEndian = true;
- StackGrowsUp = false;
- HasSubsectionsViaSymbols = false;
- HasMachoZeroFillDirective = false;
- HasMachoTBSSDirective = false;
- MaxInstLength = 4;
- MinInstAlignment = 1;
- DollarIsPC = false;
SeparatorString = ";";
CommentString = "#";
LabelSuffix = ":";
- UseAssignmentForEHBegin = false;
- NeedsLocalForSize = false;
PrivateGlobalPrefix = "L";
PrivateLabelPrefix = PrivateGlobalPrefix;
LinkerPrivateGlobalPrefix = "";
@@ -47,10 +32,6 @@ MCAsmInfo::MCAsmInfo() {
Code16Directive = ".code16";
Code32Directive = ".code32";
Code64Directive = ".code64";
- AssemblerDialect = 0;
- AllowAtInName = false;
- SupportsQuotedNames = true;
- UseDataRegionDirectives = false;
ZeroDirective = "\t.zero\t";
AsciiDirective = "\t.ascii\t";
AscizDirective = "\t.asciz\t";
@@ -58,40 +39,8 @@ MCAsmInfo::MCAsmInfo() {
Data16bitsDirective = "\t.short\t";
Data32bitsDirective = "\t.long\t";
Data64bitsDirective = "\t.quad\t";
- SunStyleELFSectionSwitchSyntax = false;
- UsesELFSectionDirectiveForBSS = false;
- AlignmentIsInBytes = true;
- TextAlignFillValue = 0;
- GPRel64Directive = nullptr;
- GPRel32Directive = nullptr;
GlobalDirective = "\t.globl\t";
- SetDirectiveSuppressesReloc = false;
- HasAggressiveSymbolFolding = true;
- COMMDirectiveAlignmentIsInBytes = true;
- LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
- HasFunctionAlignment = true;
- HasDotTypeDotSizeDirective = true;
- HasSingleParameterDotFile = true;
- HasIdentDirective = false;
- HasNoDeadStrip = false;
- HasAltEntry = false;
WeakDirective = "\t.weak\t";
- WeakRefDirective = nullptr;
- HasWeakDefDirective = false;
- HasWeakDefCanBeHiddenDirective = false;
- HasLinkOnceDirective = false;
- HiddenVisibilityAttr = MCSA_Hidden;
- HiddenDeclarationVisibilityAttr = MCSA_Hidden;
- ProtectedVisibilityAttr = MCSA_Protected;
- SupportsDebugInformation = false;
- ExceptionsType = ExceptionHandling::None;
- WinEHEncodingType = WinEH::EncodingType::Invalid;
- DwarfUsesRelocationsAcrossSections = true;
- DwarfFDESymbolsUseAbsDiff = false;
- DwarfRegNumForCFI = false;
- NeedsDwarfSectionOffsetDirective = false;
- UseParensForSymbolVariant = false;
- UseLogicalShr = true;
// FIXME: Clang's logic should be synced with the logic used to initialize
// this member and the two implementations should be merged.
@@ -107,12 +56,9 @@ MCAsmInfo::MCAsmInfo() {
// - The target subclasses for AArch64, ARM, and X86 handle these cases
UseIntegratedAssembler = false;
PreserveAsmComments = true;
-
- CompressDebugSections = DebugCompressionType::DCT_None;
}
-MCAsmInfo::~MCAsmInfo() {
-}
+MCAsmInfo::~MCAsmInfo() = default;
bool MCAsmInfo::isSectionAtomizableBySymbols(const MCSection &Section) const {
return false;
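Most of the deleted assignments did not vanish; this revision moves the defaults onto the member declarations in MCAsmInfo.h as C++11 in-class initializers, leaving the constructor to set only the handful of non-default values. The idiom in miniature (illustrative members, not the full class):

    // MCAsmInfo.h (shape only): defaults live with the declarations...
    struct AsmPropsSketch {
      unsigned PointerSize = 4;
      bool IsLittleEndian = true;
      unsigned MaxInstLength = 4;
      const char *SeparatorString = ";";
    };

    // ...so a constructor only overrides what differs from the default:
    // AsmPropsSketch() { SeparatorString = "!"; }  // e.g. a target quirk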
diff --git a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
index 5b9dd20..8510448 100644
--- a/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
+++ b/contrib/llvm/lib/MC/MCAsmInfoCOFF.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoCOFF.cpp - COFF asm properties -----------------*- C++ -*-===//
+//===- MCAsmInfoCOFF.cpp - COFF asm properties ----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,9 +13,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCDirectives.h"
+
using namespace llvm;
-void MCAsmInfoCOFF::anchor() { }
+void MCAsmInfoCOFF::anchor() {}
MCAsmInfoCOFF::MCAsmInfoCOFF() {
// MingW 4.5 and later support .comm with log2 alignment, but .lcomm uses byte
@@ -41,13 +43,10 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
UseLogicalShr = false;
}
-void MCAsmInfoMicrosoft::anchor() { }
-
-MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() {
-}
+void MCAsmInfoMicrosoft::anchor() {}
-void MCAsmInfoGNUCOFF::anchor() { }
+MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default;
-MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() {
+void MCAsmInfoGNUCOFF::anchor() {}
-}
+MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() = default;
diff --git a/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp
index e95cf48..c748409 100644
--- a/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp
+++ b/contrib/llvm/lib/MC/MCAsmInfoDarwin.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoDarwin.cpp - Darwin asm properties -------------*- C++ -*-===//
+//===- MCAsmInfoDarwin.cpp - Darwin asm properties ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,9 +13,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoDarwin.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCSectionMachO.h"
+
using namespace llvm;
bool MCAsmInfoDarwin::isSectionAtomizableBySymbols(
diff --git a/contrib/llvm/lib/MC/MCAsmInfoELF.cpp b/contrib/llvm/lib/MC/MCAsmInfoELF.cpp
index 26e5608..b0dc43c 100644
--- a/contrib/llvm/lib/MC/MCAsmInfoELF.cpp
+++ b/contrib/llvm/lib/MC/MCAsmInfoELF.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoELF.cpp - ELF asm properties -------------------*- C++ -*-===//
+//===- MCAsmInfoELF.cpp - ELF asm properties ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,12 +13,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/ELF.h"
+
using namespace llvm;
-void MCAsmInfoELF::anchor() { }
+void MCAsmInfoELF::anchor() {}
MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
if (!UsesNonexecutableStackSection)
@@ -31,5 +32,4 @@ MCAsmInfoELF::MCAsmInfoELF() {
WeakRefDirective = "\t.weak\t";
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
- UsesNonexecutableStackSection = true;
}
diff --git a/contrib/llvm/lib/MC/MCAsmInfoWasm.cpp b/contrib/llvm/lib/MC/MCAsmInfoWasm.cpp
new file mode 100644
index 0000000..aa26616
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCAsmInfoWasm.cpp
@@ -0,0 +1,27 @@
+//===-- MCAsmInfoWasm.cpp - Wasm asm properties -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related to what form asm statements
+// should take in general on Wasm-based targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoWasm.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionWasm.h"
+using namespace llvm;
+
+void MCAsmInfoWasm::anchor() { }
+
+MCAsmInfoWasm::MCAsmInfoWasm() {
+ HasIdentDirective = true;
+ WeakRefDirective = "\t.weak\t";
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+}
diff --git a/contrib/llvm/lib/MC/MCAsmStreamer.cpp b/contrib/llvm/lib/MC/MCAsmStreamer.cpp
index 817009a..9e5553f 100644
--- a/contrib/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCAsmStreamer.cpp
@@ -103,7 +103,10 @@ public:
void AddComment(const Twine &T, bool EOL = true) override;
/// AddEncodingComment - Add a comment showing the encoding of an instruction.
- void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &);
+  /// If PrintSchedInfo is true, then the comment sched:[x:y] should
+  /// be added to the output if the target supports it.
+ void AddEncodingComment(const MCInst &Inst, const MCSubtargetInfo &,
+ bool PrintSchedInfo);
/// GetCommentOS - Return a raw_ostream that comments can be written to.
/// Unlike AddComment, you are required to terminate comments with \n if you
@@ -130,7 +133,7 @@ public:
void ChangeSection(MCSection *Section, const MCExpr *Subsection) override;
void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override;
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
void EmitLinkerOptions(ArrayRef<std::string> Options) override;
@@ -278,7 +281,8 @@ public:
void EmitWinEHHandler(const MCSymbol *Sym, bool Unwind, bool Except) override;
void EmitWinEHHandlerData() override;
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool PrintSchedInfo) override;
void EmitBundleAlignMode(unsigned AlignPow2) override;
void EmitBundleLock(bool AlignToEnd) override;
@@ -392,12 +396,13 @@ void MCAsmStreamer::emitExplicitComments() {
void MCAsmStreamer::ChangeSection(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
- Section->PrintSwitchToSection(*MAI, OS, Subsection);
+ Section->PrintSwitchToSection(
+ *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS,
+ Subsection);
}
-void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
- MCStreamer::EmitLabel(Symbol);
+void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCStreamer::EmitLabel(Symbol, Loc);
Symbol->print(OS, MAI);
OS << MAI->getLabelSuffix();
@@ -1503,7 +1508,8 @@ void MCAsmStreamer::EmitWinCFIEndProlog() {
}
void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI,
+ bool PrintSchedInfo) {
raw_ostream &OS = GetCommentOS();
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
@@ -1576,7 +1582,11 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
}
}
}
- OS << "]\n";
+ OS << "]";
+  // If we are not going to add fixup or scheduling comments after this point
+  // then we have to end the current comment line with "\n".
+ if (Fixups.size() || !PrintSchedInfo)
+ OS << "\n";
for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
MCFixup &F = Fixups[i];
@@ -1587,16 +1597,19 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
}
void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI,
+ bool PrintSchedInfo) {
assert(getCurrentSectionOnly() &&
"Cannot emit contents before setting section!");
// Show the encoding in a comment if we have a code emitter.
if (Emitter)
- AddEncodingComment(Inst, STI);
+ AddEncodingComment(Inst, STI, PrintSchedInfo);
// Show the MCInst if enabled.
if (ShowInst) {
+ if (PrintSchedInfo)
+ GetCommentOS() << "\n";
Inst.dump_pretty(GetCommentOS(), InstPrinter.get(), "\n ");
GetCommentOS() << "\n";
}
@@ -1606,6 +1619,16 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
else
InstPrinter->printInst(&Inst, OS, "", STI);
+ if (PrintSchedInfo) {
+ std::string SI = STI.getSchedInfoStr(Inst);
+ if (!SI.empty())
+ GetCommentOS() << SI;
+ }
+
+ StringRef Comments = CommentToEmit;
+ if (Comments.size() && Comments.back() != '\n')
+ GetCommentOS() << "\n";
+
EmitEOL();
}
diff --git a/contrib/llvm/lib/MC/MCAssembler.cpp b/contrib/llvm/lib/MC/MCAssembler.cpp
index 83fcec9..eaf6f19 100644
--- a/contrib/llvm/lib/MC/MCAssembler.cpp
+++ b/contrib/llvm/lib/MC/MCAssembler.cpp
@@ -8,8 +8,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAssembler.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -19,24 +22,34 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "assembler"
namespace {
namespace stats {
+
STATISTIC(EmittedFragments, "Number of emitted assembler fragments - total");
STATISTIC(EmittedRelaxableFragments,
"Number of emitted assembler fragments - relaxable");
@@ -55,8 +68,9 @@ STATISTIC(FragmentLayouts, "Number of fragment layouts");
STATISTIC(ObjectBytes, "Number of emitted object file bytes");
STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps");
STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
-}
-}
+
+} // end namespace stats
+} // end anonymous namespace
// FIXME FIXME FIXME: There are number of places in this file where we convert
// what is a 64-bit assembler value used for computation into a value in the
@@ -73,8 +87,7 @@ MCAssembler::MCAssembler(MCContext &Context, MCAsmBackend &Backend,
VersionMinInfo.Major = 0; // Major version == 0 for "none specified"
}
-MCAssembler::~MCAssembler() {
-}
+MCAssembler::~MCAssembler() = default;
void MCAssembler::reset() {
Sections.clear();
@@ -114,10 +127,16 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const {
if (!Symbol->isVariable())
return false;
- // FIXME: It looks like gas supports some cases of the form "foo + 2". It
- // is not clear if that is a bug or a feature.
const MCExpr *Expr = Symbol->getVariableValue();
- const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(Expr);
+
+ MCValue V;
+ if (!Expr->evaluateAsRelocatable(V, nullptr, nullptr))
+ return false;
+
+ if (V.getSymB() || V.getRefKind() != MCSymbolRefExpr::VK_None)
+ return false;
+
+ const MCSymbolRefExpr *Ref = V.getSymA();
if (!Ref)
return false;
@@ -174,14 +193,23 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
// FIXME: This code has some duplication with recordRelocation. We should
// probably merge the two into a single callback that tries to evaluate a
// fixup and records a relocation if one is needed.
+
+ // On error claim to have completely evaluated the fixup, to prevent any
+ // further processing from being done.
const MCExpr *Expr = Fixup.getValue();
+ MCContext &Ctx = getContext();
+ Value = 0;
if (!Expr->evaluateAsRelocatable(Target, &Layout, &Fixup)) {
- getContext().reportError(Fixup.getLoc(), "expected relocatable expression");
- // Claim to have completely evaluated the fixup, to prevent any further
- // processing from being done.
- Value = 0;
+ Ctx.reportError(Fixup.getLoc(), "expected relocatable expression");
return true;
}
+ if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
+ if (RefB->getKind() != MCSymbolRefExpr::VK_None) {
+ Ctx.reportError(Fixup.getLoc(),
+ "unsupported subtraction of qualified symbol");
+ return true;
+ }
+ }
bool IsPCRel = Backend.getFixupKindInfo(
Fixup.getKind()).Flags & MCFixupKindInfo::FKF_IsPCRel;
@@ -219,7 +247,6 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
Value -= Layout.getSymbolOffset(Sym);
}
-
bool ShouldAlignPC = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
assert((ShouldAlignPC ? IsPCRel : true) &&
@@ -234,10 +261,9 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
Value -= Offset;
}
- // Let the backend adjust the fixup value if necessary, including whether
- // we need a relocation.
- Backend.processFixupValue(*this, Layout, Fixup, DF, Target, Value,
- IsResolved);
+ // Let the backend force a relocation if needed.
+ if (IsResolved && Backend.shouldForceRelocation(*this, Fixup, Target))
+ IsResolved = false;
return IsResolved;
}
@@ -621,27 +647,25 @@ void MCAssembler::writeSectionData(const MCSection *Sec,
Layout.getSectionAddressSize(Sec));
}
-std::pair<uint64_t, bool> MCAssembler::handleFixup(const MCAsmLayout &Layout,
- MCFragment &F,
- const MCFixup &Fixup) {
+std::tuple<MCValue, uint64_t, bool>
+MCAssembler::handleFixup(const MCAsmLayout &Layout, MCFragment &F,
+ const MCFixup &Fixup) {
// Evaluate the fixup.
MCValue Target;
uint64_t FixedValue;
- bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
- MCFixupKindInfo::FKF_IsPCRel;
- if (!evaluateFixup(Layout, Fixup, &F, Target, FixedValue)) {
+ bool IsResolved = evaluateFixup(Layout, Fixup, &F, Target, FixedValue);
+ if (!IsResolved) {
// The fixup was unresolved, we need a relocation. Inform the object
// writer of the relocation, and give it an opportunity to adjust the
// fixup value if need be.
- getWriter().recordRelocation(*this, Layout, &F, Fixup, Target, IsPCRel,
- FixedValue);
+ getWriter().recordRelocation(*this, Layout, &F, Fixup, Target, FixedValue);
}
- return std::make_pair(FixedValue, IsPCRel);
+ return std::make_tuple(Target, FixedValue, IsResolved);
}
void MCAssembler::layout(MCAsmLayout &Layout) {
DEBUG_WITH_TYPE("mc-dump", {
- llvm::errs() << "assembler backend - pre-layout\n--\n";
+ errs() << "assembler backend - pre-layout\n--\n";
dump(); });
// Create dummy fragments and assign section ordinals.
@@ -671,14 +695,14 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
return;
DEBUG_WITH_TYPE("mc-dump", {
- llvm::errs() << "assembler backend - post-relaxation\n--\n";
+ errs() << "assembler backend - post-relaxation\n--\n";
dump(); });
// Finalize the layout, including fragment lowering.
finishLayout(Layout);
DEBUG_WITH_TYPE("mc-dump", {
- llvm::errs() << "assembler backend - final-layout\n--\n";
+ errs() << "assembler backend - final-layout\n--\n";
dump(); });
// Allow the object writer a chance to perform post-layout binding (for
@@ -712,10 +736,12 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
llvm_unreachable("Unknown fragment with fixups!");
for (const MCFixup &Fixup : Fixups) {
uint64_t FixedValue;
- bool IsPCRel;
- std::tie(FixedValue, IsPCRel) = handleFixup(Layout, Frag, Fixup);
- getBackend().applyFixup(Fixup, Contents.data(),
- Contents.size(), FixedValue, IsPCRel);
+ bool IsResolved;
+ MCValue Target;
+ std::tie(Target, FixedValue, IsResolved) =
+ handleFixup(Layout, Frag, Fixup);
+ getBackend().applyFixup(*this, Fixup, Target, Contents, FixedValue,
+ IsResolved);
}
}
}
@@ -741,6 +767,10 @@ bool MCAssembler::fixupNeedsRelaxation(const MCFixup &Fixup,
MCValue Target;
uint64_t Value;
bool Resolved = evaluateFixup(Layout, Fixup, DF, Target, Value);
+ if (Target.getSymA() &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_X86_ABS8 &&
+ Fixup.getKind() == FK_Data_1)
+ return false;
return getBackend().fixupNeedsRelaxationAdvanced(Fixup, Resolved, Value, DF,
Layout);
}
diff --git a/contrib/llvm/lib/MC/MCCodeEmitter.cpp b/contrib/llvm/lib/MC/MCCodeEmitter.cpp
index c122763..ca69478 100644
--- a/contrib/llvm/lib/MC/MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/MC/MCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- MCCodeEmitter.cpp - Instruction Encoding --------------------------===//
+//===- MCCodeEmitter.cpp - Instruction Encoding ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,8 +11,6 @@
using namespace llvm;
-MCCodeEmitter::MCCodeEmitter() {
-}
+MCCodeEmitter::MCCodeEmitter() = default;
-MCCodeEmitter::~MCCodeEmitter() {
-}
+MCCodeEmitter::~MCCodeEmitter() = default;
diff --git a/contrib/llvm/lib/MC/MCCodeView.cpp b/contrib/llvm/lib/MC/MCCodeView.cpp
index 99a5c11..92b1e12 100644
--- a/contrib/llvm/lib/MC/MCCodeView.cpp
+++ b/contrib/llvm/lib/MC/MCCodeView.cpp
@@ -12,15 +12,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCCodeView.h"
-#include "llvm/MC/MCAsmLayout.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/Line.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/COFF.h"
#include "llvm/Support/EndianStream.h"
using namespace llvm;
@@ -145,7 +145,7 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
MCSymbol *StringBegin = Ctx.createTempSymbol("strtab_begin", false),
*StringEnd = Ctx.createTempSymbol("strtab_end", false);
- OS.EmitIntValue(unsigned(ModuleSubstreamKind::StringTable), 4);
+ OS.EmitIntValue(unsigned(DebugSubsectionKind::StringTable), 4);
OS.emitAbsoluteSymbolDiff(StringEnd, StringBegin, 4);
OS.EmitLabel(StringBegin);
@@ -172,7 +172,7 @@ void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
MCSymbol *FileBegin = Ctx.createTempSymbol("filechecksums_begin", false),
*FileEnd = Ctx.createTempSymbol("filechecksums_end", false);
- OS.EmitIntValue(unsigned(ModuleSubstreamKind::FileChecksums), 4);
+ OS.EmitIntValue(unsigned(DebugSubsectionKind::FileChecksums), 4);
OS.emitAbsoluteSymbolDiff(FileEnd, FileBegin, 4);
OS.EmitLabel(FileBegin);
@@ -197,7 +197,7 @@ void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
MCSymbol *LineBegin = Ctx.createTempSymbol("linetable_begin", false),
*LineEnd = Ctx.createTempSymbol("linetable_end", false);
- OS.EmitIntValue(unsigned(ModuleSubstreamKind::Lines), 4);
+ OS.EmitIntValue(unsigned(DebugSubsectionKind::Lines), 4);
OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4);
OS.EmitLabel(LineBegin);
OS.EmitCOFFSecRel32(FuncBegin, /*Offset=*/0);
@@ -208,7 +208,7 @@ void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
bool HaveColumns = any_of(Locs, [](const MCCVLineEntry &LineEntry) {
return LineEntry.getColumn() != 0;
});
- OS.EmitIntValue(HaveColumns ? int(LineFlags::HaveColumns) : 0, 2);
+ OS.EmitIntValue(HaveColumns ? int(LF_HaveColumns) : 0, 2);
OS.emitAbsoluteSymbolDiff(FuncEnd, FuncBegin, 4);
for (auto I = Locs.begin(), E = Locs.end(); I != E;) {
diff --git a/contrib/llvm/lib/MC/MCContext.cpp b/contrib/llvm/lib/MC/MCContext.cpp
index 9d2fac2..48ee84e 100644
--- a/contrib/llvm/lib/MC/MCContext.cpp
+++ b/contrib/llvm/lib/MC/MCContext.cpp
@@ -9,28 +9,41 @@
#include "llvm/MC/MCContext.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCLabel.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSymbolMachO.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
using namespace llvm;
@@ -40,19 +53,14 @@ AsSecureLogFileName("as-secure-log-file-name",
"AS_SECURE_LOG_FILE env variable)"),
cl::init(getenv("AS_SECURE_LOG_FILE")), cl::Hidden);
-
MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
const MCObjectFileInfo *mofi, const SourceMgr *mgr,
bool DoAutoReset)
- : SrcMgr(mgr), MAI(mai), MRI(mri), MOFI(mofi), Allocator(),
+ : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
Symbols(Allocator), UsedNames(Allocator),
- CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), DwarfLocSeen(false),
- GenDwarfForAssembly(false), GenDwarfFileNumber(0), DwarfVersion(4),
- AllowTemporaryLabels(true), DwarfCompileUnitID(0),
- AutoReset(DoAutoReset), HadError(false) {
+ CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
+ AutoReset(DoAutoReset) {
SecureLogFile = AsSecureLogFileName;
- SecureLog = nullptr;
- SecureLogUsed = false;
if (SrcMgr && SrcMgr->getNumBuffers())
MainFileName =
@@ -80,7 +88,6 @@ void MCContext::reset() {
MCSubtargetAllocator.DestroyAll();
UsedNames.clear();
Symbols.clear();
- SectionSymbols.clear();
Allocator.Reset();
Instances.clear();
CompilationDir.clear();
@@ -124,18 +131,6 @@ MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) {
return Sym;
}
-MCSymbolELF *MCContext::getOrCreateSectionSymbol(const MCSectionELF &Section) {
- MCSymbol *&Sym = SectionSymbols[&Section];
- if (Sym)
- return cast<MCSymbolELF>(Sym);
-
- StringRef Name = Section.getSectionName();
- auto NameIter = UsedNames.insert(std::make_pair(Name, false)).first;
- Sym = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
-
- return cast<MCSymbolELF>(Sym);
-}
-
MCSymbol *MCContext::getOrCreateFrameAllocSymbol(StringRef FuncName,
unsigned Idx) {
return getOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) + FuncName +
@@ -162,6 +157,8 @@ MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name,
return new (Name, *this) MCSymbolELF(Name, IsTemporary);
case MCObjectFileInfo::IsMachO:
return new (Name, *this) MCSymbolMachO(Name, IsTemporary);
+ case MCObjectFileInfo::IsWasm:
+ return new (Name, *this) MCSymbolWasm(Name, IsTemporary);
}
}
return new (Name, *this) MCSymbol(MCSymbol::SymbolKindUnset, Name,
@@ -182,7 +179,7 @@ MCSymbol *MCContext::createSymbol(StringRef Name, bool AlwaysAddSuffix,
SmallString<128> NewName = Name;
bool AddSuffix = AlwaysAddSuffix;
unsigned &NextUniqueID = NextID[Name];
- for (;;) {
+ while (true) {
if (AddSuffix) {
NewName.resize(Name.size());
raw_svector_ostream(NewName) << NextUniqueID++;
@@ -275,7 +272,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
unsigned TypeAndAttributes,
unsigned Reserved2, SectionKind Kind,
const char *BeginSymName) {
-
// We unique sections by their segment/section pair. The returned section
// may not have the same flags as the requested section; if so, this should
// be diagnosed by the client as an error.
@@ -316,18 +312,53 @@ void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
const_cast<MCSectionELF *>(Section)->setSectionName(CachedName);
}
+MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
+ unsigned Flags, SectionKind K,
+ unsigned EntrySize,
+ const MCSymbolELF *Group,
+ unsigned UniqueID,
+ const MCSymbolELF *Associated) {
+ MCSymbolELF *R;
+ MCSymbol *&Sym = Symbols[Section];
+ // A section symbol cannot redefine regular symbols. There may be multiple
+ // sections with the same name, in which case the first such section wins.
+ if (Sym && Sym->isDefined() &&
+ (!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym))
+ reportError(SMLoc(), "invalid symbol redefinition");
+ if (Sym && Sym->isUndefined()) {
+ R = cast<MCSymbolELF>(Sym);
+ } else {
+ auto NameIter = UsedNames.insert(std::make_pair(Section, false)).first;
+ R = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
+ if (!Sym)
+ Sym = R;
+ }
+ R->setBinding(ELF::STB_LOCAL);
+ R->setType(ELF::STT_SECTION);
+
+ auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
+ Section, Type, Flags, K, EntrySize, Group, UniqueID, R, Associated);
+
+ auto *F = new MCDataFragment();
+ Ret->getFragmentList().insert(Ret->begin(), F);
+ F->setParent(Ret);
+ R->setFragment(F);
+
+ return Ret;
+}
+
MCSectionELF *MCContext::createELFRelSection(const Twine &Name, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *Group,
- const MCSectionELF *Associated) {
+ const MCSectionELF *RelInfoSection) {
StringMap<bool>::iterator I;
bool Inserted;
std::tie(I, Inserted) =
- ELFRelSecNames.insert(std::make_pair(Name.str(), true));
+ RelSecNames.insert(std::make_pair(Name.str(), true));
- return new (ELFAllocator.Allocate())
- MCSectionELF(I->getKey(), Type, Flags, SectionKind::getReadOnly(),
- EntrySize, Group, true, nullptr, Associated);
+ return createELFSectionImpl(
+ I->getKey(), Type, Flags, SectionKind::getReadOnly(), EntrySize, Group,
+ true, cast<MCSymbolELF>(RelInfoSection->getBeginSymbol()));
}
MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
@@ -340,21 +371,20 @@ MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const Twine &Group, unsigned UniqueID,
- const char *BeginSymName) {
+ const MCSymbolELF *Associated) {
MCSymbolELF *GroupSym = nullptr;
if (!Group.isTriviallyEmpty() && !Group.str().empty())
GroupSym = cast<MCSymbolELF>(getOrCreateSymbol(Group));
return getELFSection(Section, Type, Flags, EntrySize, GroupSym, UniqueID,
- BeginSymName, nullptr);
+ Associated);
}
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *GroupSym,
unsigned UniqueID,
- const char *BeginSymName,
- const MCSectionELF *Associated) {
+ const MCSymbolELF *Associated) {
StringRef Group = "";
if (GroupSym)
Group = GroupSym->getName();
@@ -375,22 +405,16 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
else
Kind = SectionKind::getReadOnly();
- MCSymbol *Begin = nullptr;
- if (BeginSymName)
- Begin = createTempSymbol(BeginSymName, false);
-
- MCSectionELF *Result = new (ELFAllocator.Allocate())
- MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID,
- Begin, Associated);
+ MCSectionELF *Result = createELFSectionImpl(
+ CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID, Associated);
Entry.second = Result;
return Result;
}
MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) {
- MCSectionELF *Result = new (ELFAllocator.Allocate())
- MCSectionELF(".group", ELF::SHT_GROUP, 0, SectionKind::getReadOnly(), 4,
- Group, ~0, nullptr, nullptr);
- return Result;
+ return createELFSectionImpl(".group", ELF::SHT_GROUP, 0,
+ SectionKind::getReadOnly(), 4, Group, ~0,
+ nullptr);
}
MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
@@ -462,6 +486,80 @@ MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec,
"", 0, UniqueID);
}
+void MCContext::renameWasmSection(MCSectionWasm *Section, StringRef Name) {
+ StringRef GroupName;
+ assert(!Section->getGroup() && "not yet implemented");
+
+ unsigned UniqueID = Section->getUniqueID();
+ WasmUniquingMap.erase(
+ WasmSectionKey{Section->getSectionName(), GroupName, UniqueID});
+ auto I = WasmUniquingMap.insert(std::make_pair(
+ WasmSectionKey{Name, GroupName, UniqueID},
+ Section))
+ .first;
+ StringRef CachedName = I->first.SectionName;
+ const_cast<MCSectionWasm *>(Section)->setSectionName(CachedName);
+}
+
+MCSectionWasm *MCContext::createWasmRelSection(const Twine &Name, unsigned Type,
+ unsigned Flags,
+ const MCSymbolWasm *Group) {
+ StringMap<bool>::iterator I;
+ bool Inserted;
+ std::tie(I, Inserted) =
+ RelSecNames.insert(std::make_pair(Name.str(), true));
+
+ return new (WasmAllocator.Allocate())
+ MCSectionWasm(I->getKey(), Type, Flags, SectionKind::getReadOnly(),
+ Group, ~0, nullptr);
+}
+
+MCSectionWasm *MCContext::getWasmNamedSection(const Twine &Prefix,
+ const Twine &Suffix, unsigned Type,
+ unsigned Flags) {
+ return getWasmSection(Prefix + "." + Suffix, Type, Flags, Suffix);
+}
+
+MCSectionWasm *MCContext::getWasmSection(const Twine &Section, unsigned Type,
+ unsigned Flags,
+ const Twine &Group, unsigned UniqueID,
+ const char *BeginSymName) {
+ MCSymbolWasm *GroupSym = nullptr;
+ if (!Group.isTriviallyEmpty() && !Group.str().empty())
+ GroupSym = cast<MCSymbolWasm>(getOrCreateSymbol(Group));
+
+ return getWasmSection(Section, Type, Flags, GroupSym, UniqueID, BeginSymName);
+}
+
+MCSectionWasm *MCContext::getWasmSection(const Twine &Section, unsigned Type,
+ unsigned Flags,
+ const MCSymbolWasm *GroupSym,
+ unsigned UniqueID,
+ const char *BeginSymName) {
+ StringRef Group = "";
+ if (GroupSym)
+ Group = GroupSym->getName();
+ // Do the lookup; if we have a hit, return it.
+ auto IterBool = WasmUniquingMap.insert(
+ std::make_pair(WasmSectionKey{Section.str(), Group, UniqueID}, nullptr));
+ auto &Entry = *IterBool.first;
+ if (!IterBool.second)
+ return Entry.second;
+
+ StringRef CachedName = Entry.first.SectionName;
+
+ SectionKind Kind = SectionKind::getText();
+
+ MCSymbol *Begin = nullptr;
+ if (BeginSymName)
+ Begin = createTempSymbol(BeginSymName, false);
+
+ MCSectionWasm *Result = new (WasmAllocator.Allocate())
+ MCSectionWasm(CachedName, Type, Flags, Kind, GroupSym, UniqueID, Begin);
+ Entry.second = Result;
+ return Result;
+}
+
MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) {
return *new (MCSubtargetAllocator.Allocate()) MCSubtargetInfo(STI);
}
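The new createELFSectionImpl above centralizes a subtle rule for the section's begin symbol: an existing undefined symbol of the same name is promoted to the section symbol, a defined non-section symbol of that name is a redefinition error, and when several sections share a name the first one keeps the symbol-table entry. A standalone sketch of just that decision logic, with simplified stand-in types rather than the LLVM classes:

    #include <map>
    #include <string>

    struct Sym { bool Defined = false; bool IsSectionSym = false; };

    // Returns the symbol to use as the section's begin symbol, or null and
    // sets Error on an invalid redefinition.
    Sym *sectionBeginSymbol(std::map<std::string, Sym *> &Symbols,
                            const std::string &Name, bool &Error) {
      Sym *&S = Symbols[Name];
      if (S && S->Defined && !S->IsSectionSym) {
        Error = true;                  // "invalid symbol redefinition"
        return nullptr;
      }
      if (S && !S->Defined) {
        S->IsSectionSym = true;        // promote the undefined symbol
        return S;
      }
      Sym *R = new Sym{false, true};   // fresh section symbol
      if (!S)
        S = R;                         // first section with this name wins
      return R;
    }
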
diff --git a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
index aa50727..ef1d833 100644
--- a/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/contrib/llvm/lib/MC/MCDisassembler/Disassembler.cpp
@@ -27,8 +27,8 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
#include <cstring>
diff --git a/contrib/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/contrib/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
index 3a4f738..2f1275d 100644
--- a/contrib/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/contrib/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -1,4 +1,4 @@
-//===-- MCDisassembler.cpp - Disassembler interface -----------------------===//
+//===- MCDisassembler.cpp - Disassembler interface ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,13 +8,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
-#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
using namespace llvm;
-MCDisassembler::~MCDisassembler() {
-}
+MCDisassembler::~MCDisassembler() = default;
bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
uint64_t Address, bool IsBranch,
diff --git a/contrib/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp b/contrib/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp
index 1612562..8f932a3 100644
--- a/contrib/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp
+++ b/contrib/llvm/lib/MC/MCDisassembler/MCRelocationInfo.cpp
@@ -1,4 +1,4 @@
-//==-- MCRelocationInfo.cpp ------------------------------------------------==//
+//===-- MCRelocationInfo.cpp ----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,12 +13,9 @@
using namespace llvm;
-MCRelocationInfo::MCRelocationInfo(MCContext &Ctx)
- : Ctx(Ctx) {
-}
+MCRelocationInfo::MCRelocationInfo(MCContext &Ctx) : Ctx(Ctx) {}
-MCRelocationInfo::~MCRelocationInfo() {
-}
+MCRelocationInfo::~MCRelocationInfo() = default;
const MCExpr *
MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr,
diff --git a/contrib/llvm/lib/MC/MCDisassembler/MCSymbolizer.cpp b/contrib/llvm/lib/MC/MCDisassembler/MCSymbolizer.cpp
index c0f707d..78e611e 100644
--- a/contrib/llvm/lib/MC/MCDisassembler/MCSymbolizer.cpp
+++ b/contrib/llvm/lib/MC/MCDisassembler/MCSymbolizer.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCSymbolizer.cpp - MCSymbolizer class -----------*- C++ -*-===//
+//===-- llvm/MC/MCSymbolizer.cpp - MCSymbolizer class ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,5 +11,4 @@
using namespace llvm;
-MCSymbolizer::~MCSymbolizer() {
-}
+MCSymbolizer::~MCSymbolizer() = default;
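The destructor change repeated across the MCDisassembler files above swaps an empty out-of-line body for an explicitly defaulted definition, which documents intent and lets the compiler generate the destructor. In miniature (Analyzer is a hypothetical class):

    struct Analyzer {
      virtual ~Analyzer();  // still declared out of line to anchor the vtable
    };

    Analyzer::~Analyzer() = default;  // was: Analyzer::~Analyzer() {}
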
diff --git a/contrib/llvm/lib/MC/MCDwarf.cpp b/contrib/llvm/lib/MC/MCDwarf.cpp
index a7551a3..a2beee3 100644
--- a/contrib/llvm/lib/MC/MCDwarf.cpp
+++ b/contrib/llvm/lib/MC/MCDwarf.cpp
@@ -8,10 +8,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCDwarf.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Config/config.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -20,14 +26,22 @@
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -154,7 +168,7 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
// and the current Label.
const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo();
MCOS->EmitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label,
- asmInfo->getPointerSize());
+ asmInfo->getCodePointerSize());
Discriminator = 0;
LastLine = LineEntry.getLine();
@@ -174,7 +188,7 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
const MCAsmInfo *AsmInfo = Ctx.getAsmInfo();
MCOS->EmitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, SectionEnd,
- AsmInfo->getPointerSize());
+ AsmInfo->getCodePointerSize());
}
//
@@ -580,7 +594,7 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
// Figure the padding after the header before the table of address and size
// pairs who's values are PointerSize'ed.
const MCAsmInfo *asmInfo = context.getAsmInfo();
- int AddrSize = asmInfo->getPointerSize();
+ int AddrSize = asmInfo->getCodePointerSize();
int Pad = 2 * AddrSize - (Length & (2 * AddrSize - 1));
if (Pad == 2 * AddrSize)
Pad = 0;
@@ -592,7 +606,6 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
// And the pair of terminating zeros.
Length += 2 * AddrSize;
-
// Emit the header for this section.
// The 4 byte length not including the 4 byte value for the length.
MCOS->EmitIntValue(Length - 4, 4);
@@ -661,7 +674,14 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
// The 2 byte DWARF version.
MCOS->EmitIntValue(context.getDwarfVersion(), 2);
+ // The DWARF v5 header has unit type, address size, abbrev offset.
+ // Earlier versions have abbrev offset, address size.
const MCAsmInfo &AsmInfo = *context.getAsmInfo();
+ int AddrSize = AsmInfo.getCodePointerSize();
+ if (context.getDwarfVersion() >= 5) {
+ MCOS->EmitIntValue(dwarf::DW_UT_compile, 1);
+ MCOS->EmitIntValue(AddrSize, 1);
+ }
// The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev;
// it is at the start of that section, so this is zero.
if (AbbrevSectionSymbol == nullptr)
@@ -669,11 +689,8 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
else
MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4,
AsmInfo.needsDwarfSectionOffsetDirective());
-
- const MCAsmInfo *asmInfo = context.getAsmInfo();
- int AddrSize = asmInfo->getPointerSize();
- // The 1 byte size of an address.
- MCOS->EmitIntValue(AddrSize, 1);
+ if (context.getDwarfVersion() <= 4)
+ MCOS->EmitIntValue(AddrSize, 1);
// Second part: the compile_unit DIE.
@@ -806,7 +823,7 @@ static void EmitGenDwarfRanges(MCStreamer *MCOS) {
auto &Sections = context.getGenDwarfSectionSyms();
const MCAsmInfo *AsmInfo = context.getAsmInfo();
- int AddrSize = AsmInfo->getPointerSize();
+ int AddrSize = AsmInfo->getCodePointerSize();
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection());
@@ -885,7 +902,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
}
}
- assert((RangesSectionSymbol != NULL) || !UseRangesSection);
+ assert((RangesSectionSymbol != nullptr) || !UseRangesSection);
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
@@ -964,7 +981,7 @@ static unsigned getSizeForEncoding(MCStreamer &streamer,
default: llvm_unreachable("Unknown Encoding");
case dwarf::DW_EH_PE_absptr:
case dwarf::DW_EH_PE_signed:
- return context.getAsmInfo()->getPointerSize();
+ return context.getAsmInfo()->getCodePointerSize();
case dwarf::DW_EH_PE_udata2:
case dwarf::DW_EH_PE_sdata2:
return 2;
@@ -1003,6 +1020,7 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
}
namespace {
+
class FrameEmitterImpl {
int CFAOffset = 0;
int InitialCFAOffset = 0;
@@ -1050,10 +1068,10 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
Streamer.EmitULEB128IntValue(Reg2);
return;
}
- case MCCFIInstruction::OpWindowSave: {
+ case MCCFIInstruction::OpWindowSave:
Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
return;
- }
+
case MCCFIInstruction::OpUndefined: {
unsigned Reg = Instr.getRegister();
Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1);
@@ -1087,7 +1105,6 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
return;
}
-
case MCCFIInstruction::OpDefCfaRegister: {
unsigned Reg = Instr.getRegister();
if (!IsEH)
@@ -1097,7 +1114,6 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
return;
}
-
case MCCFIInstruction::OpOffset:
case MCCFIInstruction::OpRelOffset: {
const bool IsRelative =
@@ -1145,11 +1161,11 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
return;
}
- case MCCFIInstruction::OpGnuArgsSize: {
+ case MCCFIInstruction::OpGnuArgsSize:
Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1);
Streamer.EmitULEB128IntValue(Instr.getOffset());
return;
- }
+
case MCCFIInstruction::OpEscape:
Streamer.EmitBytes(Instr.getValues());
return;
@@ -1302,7 +1318,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCSymbol *personality,
if (CIEVersion >= 4) {
// Address Size
- Streamer.EmitIntValue(context.getAsmInfo()->getPointerSize(), 1);
+ Streamer.EmitIntValue(context.getAsmInfo()->getCodePointerSize(), 1);
// Segment Descriptor Size
Streamer.EmitIntValue(0, 1);
@@ -1368,7 +1384,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCSymbol *personality,
InitialCFAOffset = CFAOffset;
// Padding
- Streamer.EmitValueToAlignment(IsEH ? 4 : MAI->getPointerSize());
+ Streamer.EmitValueToAlignment(IsEH ? 4 : MAI->getCodePointerSize());
Streamer.EmitLabel(sectionEnd);
return *sectionStart;
@@ -1437,17 +1453,19 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart,
// The size of a .eh_frame section has to be a multiple of the alignment
// since a null CIE is interpreted as the end. Old systems overaligned
// .eh_frame, so we do too and account for it in the last FDE.
- unsigned Align = LastInSection ? asmInfo->getPointerSize() : PCSize;
+ unsigned Align = LastInSection ? asmInfo->getCodePointerSize() : PCSize;
Streamer.EmitValueToAlignment(Align);
Streamer.EmitLabel(fdeEnd);
}
namespace {
+
struct CIEKey {
static const CIEKey getEmptyKey() {
return CIEKey(nullptr, 0, -1, false, false);
}
+
static const CIEKey getTombstoneKey() {
return CIEKey(nullptr, -1, 0, false, false);
}
@@ -1457,23 +1475,28 @@ struct CIEKey {
: Personality(Personality), PersonalityEncoding(PersonalityEncoding),
LsdaEncoding(LsdaEncoding), IsSignalFrame(IsSignalFrame),
IsSimple(IsSimple) {}
+
const MCSymbol *Personality;
unsigned PersonalityEncoding;
unsigned LsdaEncoding;
bool IsSignalFrame;
bool IsSimple;
};
-} // anonymous namespace
+
+} // end anonymous namespace
namespace llvm {
+
template <> struct DenseMapInfo<CIEKey> {
static CIEKey getEmptyKey() { return CIEKey::getEmptyKey(); }
static CIEKey getTombstoneKey() { return CIEKey::getTombstoneKey(); }
+
static unsigned getHashValue(const CIEKey &Key) {
return static_cast<unsigned>(
hash_combine(Key.Personality, Key.PersonalityEncoding, Key.LsdaEncoding,
Key.IsSignalFrame, Key.IsSimple));
}
+
static bool isEqual(const CIEKey &LHS, const CIEKey &RHS) {
return LHS.Personality == RHS.Personality &&
LHS.PersonalityEncoding == RHS.PersonalityEncoding &&
@@ -1482,7 +1505,8 @@ template <> struct DenseMapInfo<CIEKey> {
LHS.IsSimple == RHS.IsSimple;
}
};
-} // namespace llvm
+
+} // end namespace llvm
void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
bool IsEH) {
@@ -1490,6 +1514,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
MCContext &Context = Streamer.getContext();
const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
+ const MCAsmInfo *AsmInfo = Context.getAsmInfo();
FrameEmitterImpl Emitter(IsEH, Streamer);
ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getDwarfFrameInfos();
@@ -1501,7 +1526,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
if (Frame.CompactUnwindEncoding == 0) continue;
if (!SectionEmitted) {
Streamer.SwitchSection(MOFI->getCompactUnwindSection());
- Streamer.EmitValueToAlignment(Context.getAsmInfo()->getPointerSize());
+ Streamer.EmitValueToAlignment(AsmInfo->getCodePointerSize());
SectionEmitted = true;
}
NeedsEHFrameSection |=
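The EmitGenDwarfInfo hunk above encodes a layout difference in the compile-unit header: DWARF v5 adds a one-byte unit type and moves the address size ahead of the abbrev offset. A standalone sketch of the ordering, where emit() is a hypothetical stand-in that prints size:value pairs instead of writing object-file bytes:

    #include <cstdint>
    #include <cstdio>

    static void emit(unsigned Size, uint64_t Value) {
      std::printf("%u bytes: %llu\n", Size, (unsigned long long)Value);
    }

    static const uint64_t DW_UT_compile = 0x01;  // unit type from the DWARF v5 spec

    void emitCUHeader(unsigned DwarfVersion, uint64_t AddrSize, uint64_t AbbrevOff) {
      emit(2, DwarfVersion);           // 2-byte version
      if (DwarfVersion >= 5) {
        emit(1, DW_UT_compile);        // 1-byte unit type, new in v5
        emit(1, AddrSize);             // 1-byte address size
        emit(4, AbbrevOff);            // 4-byte .debug_abbrev offset
      } else {
        emit(4, AbbrevOff);            // v4 and earlier: offset first,
        emit(1, AddrSize);             // then address size
      }
    }
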
diff --git a/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp
index de645ca..68fb5e7 100644
--- a/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/contrib/llvm/lib/MC/MCELFObjectTargetWriter.cpp
@@ -7,10 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/MC/MCELFStreamer.cpp b/contrib/llvm/lib/MC/MCELFStreamer.cpp
index 0ef1b2a..50c1f6e 100644
--- a/contrib/llvm/lib/MC/MCELFStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCELFStreamer.cpp
@@ -12,29 +12,30 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -42,9 +43,6 @@ bool MCELFStreamer::isBundleLocked() const {
return getCurrentSectionOnly()->isBundleLocked();
}
-MCELFStreamer::~MCELFStreamer() {
-}
-
void MCELFStreamer::mergeFragment(MCDataFragment *DF,
MCDataFragment *EF) {
MCAssembler &Assembler = getAssembler();
@@ -95,11 +93,19 @@ void MCELFStreamer::InitSections(bool NoExecStack) {
SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
}
-void MCELFStreamer::EmitLabel(MCSymbol *S) {
+void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
auto *Symbol = cast<MCSymbolELF>(S);
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+ MCObjectStreamer::EmitLabel(Symbol, Loc);
+
+ const MCSectionELF &Section =
+ static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
+ if (Section.getFlags() & ELF::SHF_TLS)
+ Symbol->setType(ELF::STT_TLS);
+}
- MCObjectStreamer::EmitLabel(Symbol);
+void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc, MCFragment *F) {
+ auto *Symbol = cast<MCSymbolELF>(S);
+ MCObjectStreamer::EmitLabel(Symbol, Loc, F);
const MCSectionELF &Section =
static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
@@ -147,17 +153,8 @@ void MCELFStreamer::ChangeSection(MCSection *Section,
if (Grp)
Asm.registerSymbol(*Grp);
- this->MCObjectStreamer::ChangeSection(Section, Subsection);
- MCContext &Ctx = getContext();
- auto *Begin = cast_or_null<MCSymbolELF>(Section->getBeginSymbol());
- if (!Begin) {
- Begin = Ctx.getOrCreateSectionSymbol(*SectionELF);
- Section->setBeginSymbol(Begin);
- }
- if (Begin->isUndefined()) {
- Asm.registerSymbol(*Begin);
- Begin->setType(ELF::STT_SECTION);
- }
+ changeSectionImpl(Section, Subsection);
+ Asm.registerSymbol(*Section->getBeginSymbol());
}
void MCELFStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
@@ -361,13 +358,6 @@ void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
ValueSize, MaxBytesToEmit);
}
-// Add a symbol for the file name of this module. They start after the
-// null symbol and don't count as normal symbol, i.e. a non-STT_FILE symbol
-// with the same name may appear.
-void MCELFStreamer::EmitFileDirective(StringRef Filename) {
- getAssembler().addFileName(Filename);
-}
-
void MCELFStreamer::EmitIdent(StringRef IdentString) {
MCSection *Comment = getAssembler().getContext().getELFSection(
".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
@@ -630,15 +620,6 @@ void MCELFStreamer::FinishImpl() {
this->MCObjectStreamer::FinishImpl();
}
-MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_pwrite_stream &OS, MCCodeEmitter *CE,
- bool RelaxAll) {
- MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
- if (RelaxAll)
- S->getAssembler().setRelaxAll(true);
- return S;
-}
-
void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) {
llvm_unreachable("Generic ELF doesn't support this directive");
}
@@ -647,22 +628,6 @@ void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
llvm_unreachable("ELF doesn't support this directive");
}
-void MCELFStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EmitCOFFSymbolType(int Type) {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EndCOFFSymbolDef() {
- llvm_unreachable("ELF doesn't support this directive");
-}
-
void MCELFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("ELF doesn't support this directive");
@@ -672,3 +637,12 @@ void MCELFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("ELF doesn't support this directive");
}
+
+MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE,
+ bool RelaxAll) {
+ MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ return S;
+}
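Both EmitLabel overloads above now share the same tail: once the base streamer has defined the label, a symbol landing in a section that carries SHF_TLS is typed STT_TLS. A minimal sketch of that tail, assuming the LLVM classes named in the hunk:

    static void typeLabelForTLS(MCSymbolELF *Symbol, const MCSectionELF &Section) {
      if (Section.getFlags() & ELF::SHF_TLS)
        Symbol->setType(ELF::STT_TLS);  // thread-local symbols must be STT_TLS
    }
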
diff --git a/contrib/llvm/lib/MC/MCExpr.cpp b/contrib/llvm/lib/MC/MCExpr.cpp
index bcc43a5..38a8af4 100644
--- a/contrib/llvm/lib/MC/MCExpr.cpp
+++ b/contrib/llvm/lib/MC/MCExpr.cpp
@@ -17,18 +17,25 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "mcexpr"
namespace {
namespace stats {
+
STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations");
-}
-}
+
+} // end namespace stats
+} // end anonymous namespace
void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
switch (getKind()) {
@@ -44,7 +51,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
// Parenthesize names that start with $ so that they don't look like
// absolute names.
bool UseParens =
- !InParens && Sym.getName().size() && Sym.getName()[0] == '$';
+ !InParens && !Sym.getName().empty() && Sym.getName()[0] == '$';
if (UseParens) {
OS << '(';
Sym.print(OS, MAI);
@@ -129,21 +136,24 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
llvm_unreachable("Invalid expression kind!");
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCExpr::dump() const {
dbgs() << *this;
dbgs() << '\n';
}
+#endif
/* *** */
const MCBinaryExpr *MCBinaryExpr::create(Opcode Opc, const MCExpr *LHS,
- const MCExpr *RHS, MCContext &Ctx) {
- return new (Ctx) MCBinaryExpr(Opc, LHS, RHS);
+ const MCExpr *RHS, MCContext &Ctx,
+ SMLoc Loc) {
+ return new (Ctx) MCBinaryExpr(Opc, LHS, RHS, Loc);
}
const MCUnaryExpr *MCUnaryExpr::create(Opcode Opc, const MCExpr *Expr,
- MCContext &Ctx) {
- return new (Ctx) MCUnaryExpr(Opc, Expr);
+ MCContext &Ctx, SMLoc Loc) {
+ return new (Ctx) MCUnaryExpr(Opc, Expr, Loc);
}
const MCConstantExpr *MCConstantExpr::create(int64_t Value, MCContext &Ctx) {
@@ -153,8 +163,8 @@ const MCConstantExpr *MCConstantExpr::create(int64_t Value, MCContext &Ctx) {
/* *** */
MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind,
- const MCAsmInfo *MAI)
- : MCExpr(MCExpr::SymbolRef), Kind(Kind),
+ const MCAsmInfo *MAI, SMLoc Loc)
+ : MCExpr(MCExpr::SymbolRef, Loc), Kind(Kind),
UseParensForSymbolVariant(MAI->useParensForSymbolVariant()),
HasSubsectionsViaSymbols(MAI->hasSubsectionsViaSymbols()),
Symbol(Symbol) {
@@ -163,8 +173,8 @@ MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind,
const MCSymbolRefExpr *MCSymbolRefExpr::create(const MCSymbol *Sym,
VariantKind Kind,
- MCContext &Ctx) {
- return new (Ctx) MCSymbolRefExpr(Sym, Kind, Ctx.getAsmInfo());
+ MCContext &Ctx, SMLoc Loc) {
+ return new (Ctx) MCSymbolRefExpr(Sym, Kind, Ctx.getAsmInfo(), Loc);
}
const MCSymbolRefExpr *MCSymbolRefExpr::create(StringRef Name, VariantKind Kind,
@@ -205,6 +215,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_SECREL: return "SECREL32";
case VK_SIZE: return "SIZE";
case VK_WEAKREF: return "WEAKREF";
+ case VK_X86_ABS8: return "ABS8";
case VK_ARM_NONE: return "none";
case VK_ARM_GOT_PREL: return "GOT_PREL";
case VK_ARM_TARGET1: return "target1";
@@ -275,6 +286,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_Hexagon_IE: return "IE";
case VK_Hexagon_IE_GOT: return "IEGOT";
case VK_WebAssembly_FUNCTION: return "FUNCTION";
+ case VK_WebAssembly_TYPEINDEX: return "TYPEINDEX";
case VK_AMDGPU_GOTPCREL32_LO: return "gotpcrel32@lo";
case VK_AMDGPU_GOTPCREL32_HI: return "gotpcrel32@hi";
case VK_AMDGPU_REL32_LO: return "rel32@lo";
@@ -314,6 +326,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("imgrel", VK_COFF_IMGREL32)
.Case("secrel32", VK_SECREL)
.Case("size", VK_SIZE)
+ .Case("abs8", VK_X86_ABS8)
.Case("l", VK_PPC_LO)
.Case("h", VK_PPC_HI)
.Case("ha", VK_PPC_HA)
@@ -642,8 +655,12 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
// the OS X assembler will completely drop the 4. We should probably
// include it in the relocation or produce an error if that is not
// possible.
+ // Allow constant expressions.
if (!A && !B)
return true;
+ // Allow aliases with zero offset.
+ if (Res.getConstant() == 0 && (!A || !B))
+ return true;
}
}
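The last hunk widens what evaluateAsRelocatableImpl accepts for a variable's value: a pure constant (no symbol on either side) and a zero-offset alias (exactly one symbol and a zero constant part). A standalone sketch of just those two tests, where A and B model Res.getSymA() and Res.getSymB():

    #include <cstdint>

    bool acceptVariableValue(const void *A, const void *B, int64_t Constant) {
      if (!A && !B)
        return true;                   // constant expression
      if (Constant == 0 && (!A || !B))
        return true;                   // alias with zero offset
      return false;                    // otherwise fall through to stricter checks
    }
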
diff --git a/contrib/llvm/lib/MC/MCFragment.cpp b/contrib/llvm/lib/MC/MCFragment.cpp
index 8ff8f8a..6e02493 100644
--- a/contrib/llvm/lib/MC/MCFragment.cpp
+++ b/contrib/llvm/lib/MC/MCFragment.cpp
@@ -8,29 +8,28 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCFragment.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
using namespace llvm;
-MCAsmLayout::MCAsmLayout(MCAssembler &Asm)
- : Assembler(Asm), LastValidFragment()
- {
+MCAsmLayout::MCAsmLayout(MCAssembler &Asm) : Assembler(Asm) {
// Compute the section layout order. Virtual sections must go last.
for (MCSection &Sec : Asm)
if (!Sec.isVirtualSection())
@@ -145,14 +144,14 @@ const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const {
MCValue Value;
if (!Expr->evaluateAsValue(Value, *this)) {
Assembler.getContext().reportError(
- SMLoc(), "expression could not be evaluated");
+ Expr->getLoc(), "expression could not be evaluated");
return nullptr;
}
const MCSymbolRefExpr *RefB = Value.getSymB();
if (RefB) {
Assembler.getContext().reportError(
- SMLoc(), Twine("symbol '") + RefB->getSymbol().getName() +
+ Expr->getLoc(), Twine("symbol '") + RefB->getSymbol().getName() +
"' could not be evaluated in a subtraction expression");
return nullptr;
}
@@ -164,8 +163,7 @@ const MCSymbol *MCAsmLayout::getBaseSymbol(const MCSymbol &Symbol) const {
const MCSymbol &ASym = A->getSymbol();
const MCAssembler &Asm = getAssembler();
if (ASym.isCommon()) {
- // FIXME: we should probably add a SMLoc to MCExpr.
- Asm.getContext().reportError(SMLoc(),
+ Asm.getContext().reportError(Expr->getLoc(),
"Common symbol '" + ASym.getName() +
"' cannot be used in assignment expr");
return nullptr;
@@ -234,7 +232,7 @@ uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler,
void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) { V->destroy(); }
-MCFragment::~MCFragment() { }
+MCFragment::~MCFragment() = default;
MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
uint8_t BundlePadding, MCSection *Parent)
@@ -295,8 +293,6 @@ void MCFragment::destroy() {
}
}
-/* *** */
-
// Debugging methods
namespace llvm {
@@ -308,10 +304,11 @@ raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
return OS;
}
-}
+} // end namespace llvm
-LLVM_DUMP_METHOD void MCFragment::dump() {
- raw_ostream &OS = llvm::errs();
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MCFragment::dump() const {
+ raw_ostream &OS = errs();
OS << "<";
switch (getKind()) {
@@ -331,9 +328,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() {
case MCFragment::FT_Dummy: OS << "MCDummyFragment"; break;
}
- OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder
+ OS << "<MCFragment " << (const void*) this << " LayoutOrder:" << LayoutOrder
<< " Offset:" << Offset
- << " HasInstructions:" << hasInstructions()
+ << " HasInstructions:" << hasInstructions()
<< " BundlePadding:" << static_cast<unsigned>(getBundlePadding()) << ">";
switch (getKind()) {
@@ -385,7 +382,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() {
}
case MCFragment::FT_Fill: {
const MCFillFragment *FF = cast<MCFillFragment>(this);
- OS << " Value:" << FF->getValue() << " Size:" << FF->getSize();
+ OS << " Value:" << static_cast<unsigned>(FF->getValue())
+ << " Size:" << FF->getSize();
break;
}
case MCFragment::FT_Relaxable: {
@@ -398,7 +396,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() {
case MCFragment::FT_Org: {
const MCOrgFragment *OF = cast<MCOrgFragment>(this);
OS << "\n ";
- OS << " Offset:" << OF->getOffset() << " Value:" << OF->getValue();
+ OS << " Offset:" << OF->getOffset()
+ << " Value:" << static_cast<unsigned>(OF->getValue());
break;
}
case MCFragment::FT_Dwarf: {
@@ -448,19 +447,19 @@ LLVM_DUMP_METHOD void MCFragment::dump() {
OS << ">";
}
-LLVM_DUMP_METHOD void MCAssembler::dump() {
- raw_ostream &OS = llvm::errs();
+LLVM_DUMP_METHOD void MCAssembler::dump() const {
+ raw_ostream &OS = errs();
OS << "<MCAssembler\n";
OS << " Sections:[\n ";
- for (iterator it = begin(), ie = end(); it != ie; ++it) {
+ for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
if (it != begin()) OS << ",\n ";
it->dump();
}
OS << "],\n";
OS << " Symbols:[";
- for (symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) {
+ for (const_symbol_iterator it = symbol_begin(), ie = symbol_end(); it != ie; ++it) {
if (it != symbol_begin()) OS << ",\n ";
OS << "(";
it->dump();
@@ -469,3 +468,4 @@ LLVM_DUMP_METHOD void MCAssembler::dump() {
}
OS << "]>\n";
}
+#endif
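The #if guard introduced around MCFragment::dump() and MCAssembler::dump() is the pattern applied throughout this commit: dump methods exist only in builds with assertions or with LLVM_ENABLE_DUMP defined, so release binaries carry no dead dumping code. In miniature (Widget is a hypothetical class):

    #include <cstdio>

    struct Widget {
      int Value = 0;
      void print() const { std::printf("Widget(%d)", Value); }
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Compiled out of release builds unless LLVM_ENABLE_DUMP is defined.
      void dump() const { print(); std::printf("\n"); }
    #endif
    };
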
diff --git a/contrib/llvm/lib/MC/MCInst.cpp b/contrib/llvm/lib/MC/MCInst.cpp
index 2da8ecc..f6d1d3c 100644
--- a/contrib/llvm/lib/MC/MCInst.cpp
+++ b/contrib/llvm/lib/MC/MCInst.cpp
@@ -10,6 +10,7 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -34,10 +35,12 @@ void MCOperand::print(raw_ostream &OS) const {
OS << ">";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCOperand::dump() const {
print(dbgs());
dbgs() << "\n";
}
+#endif
void MCInst::print(raw_ostream &OS) const {
OS << "<MCInst " << getOpcode();
@@ -63,7 +66,9 @@ void MCInst::dump_pretty(raw_ostream &OS, const MCInstPrinter *Printer,
OS << ">";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCInst::dump() const {
print(dbgs());
dbgs() << "\n";
}
+#endif
diff --git a/contrib/llvm/lib/MC/MCInstPrinter.cpp b/contrib/llvm/lib/MC/MCInstPrinter.cpp
index 23afe80..9296fce 100644
--- a/contrib/llvm/lib/MC/MCInstPrinter.cpp
+++ b/contrib/llvm/lib/MC/MCInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- MCInstPrinter.cpp - Convert an MCInst to target assembly syntax ---===//
+//===- MCInstPrinter.cpp - Convert an MCInst to target assembly syntax ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,12 +8,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+
using namespace llvm;
void llvm::dumpBytes(ArrayRef<uint8_t> bytes, raw_ostream &OS) {
@@ -25,8 +29,7 @@ void llvm::dumpBytes(ArrayRef<uint8_t> bytes, raw_ostream &OS) {
}
}
-MCInstPrinter::~MCInstPrinter() {
-}
+MCInstPrinter::~MCInstPrinter() = default;
/// getOpcodeName - Return the name of the specified opcode enum (e.g.
/// "MOV32ri") or empty if we can't resolve it.
@@ -68,7 +71,7 @@ StringRef MCInstPrinter::markup(StringRef a, StringRef b) const {
// For asm-style hex (e.g. 0ffh) the first digit always has to be a number.
static bool needsLeadingZero(uint64_t Value)
{
- while(Value)
+ while (Value)
{
uint64_t digit = (Value >> 60) & 0xf;
if (digit != 0)
diff --git a/contrib/llvm/lib/MC/MCInstrAnalysis.cpp b/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
index 2d8336d..280b5cf 100644
--- a/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
+++ b/contrib/llvm/lib/MC/MCInstrAnalysis.cpp
@@ -1,4 +1,4 @@
-//===-- MCInstrAnalysis.cpp - InstrDesc target hooks ------------*- C++ -*-===//
+//===- MCInstrAnalysis.cpp - InstrDesc target hooks -----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,6 +8,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include <cstdint>
+
using namespace llvm;
bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
diff --git a/contrib/llvm/lib/MC/MCLabel.cpp b/contrib/llvm/lib/MC/MCLabel.cpp
index b443cbb..db25a46 100644
--- a/contrib/llvm/lib/MC/MCLabel.cpp
+++ b/contrib/llvm/lib/MC/MCLabel.cpp
@@ -1,4 +1,4 @@
-//===- lib/MC/MCLabel.cpp - MCLabel implementation ----------------------===//
+//===- lib/MC/MCLabel.cpp - MCLabel implementation ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,14 +8,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCLabel.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
void MCLabel::print(raw_ostream &OS) const {
OS << '"' << getInstance() << '"';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCLabel::dump() const {
print(dbgs());
}
+#endif
diff --git a/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp b/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp
index f71fc78..97f9541 100644
--- a/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp
+++ b/contrib/llvm/lib/MC/MCLinkerOptimizationHint.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCLinkerOptimizationHint.cpp ----- LOH handling -*- C++ -*-===//
+//===- llvm/MC/MCLinkerOptimizationHint.cpp ----- LOH handling ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,9 +9,11 @@
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
using namespace llvm;
@@ -41,14 +43,14 @@ void MCLOHDirective::emit(MachObjectWriter &ObjWriter,
uint64_t MCLOHDirective::getEmitSize(const MachObjectWriter &ObjWriter,
const MCAsmLayout &Layout) const {
class raw_counting_ostream : public raw_ostream {
- uint64_t Count;
+ uint64_t Count = 0;
void write_impl(const char *, size_t size) override { Count += size; }
uint64_t current_pos() const override { return Count; }
public:
- raw_counting_ostream() : Count(0) {}
+ raw_counting_ostream() = default;
~raw_counting_ostream() override { flush(); }
};
diff --git a/contrib/llvm/lib/MC/MCMachOStreamer.cpp b/contrib/llvm/lib/MC/MCMachOStreamer.cpp
index bd425bb..674c7b9 100644
--- a/contrib/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCMachOStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- MCMachOStreamer.cpp - MachO Streamer ------------------------------===//
+//===- MCMachOStreamer.cpp - MachO Streamer -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,27 +7,35 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCStreamer.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <vector>
using namespace llvm;
@@ -70,7 +78,7 @@ public:
/// @{
void ChangeSection(MCSection *Sect, const MCExpr *Subsect) override;
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override;
void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
@@ -83,18 +91,7 @@ public:
void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
- void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {
- llvm_unreachable("macho doesn't support this directive");
- }
- void EmitCOFFSymbolStorageClass(int StorageClass) override {
- llvm_unreachable("macho doesn't support this directive");
- }
- void EmitCOFFSymbolType(int Type) override {
- llvm_unreachable("macho doesn't support this directive");
- }
- void EndCOFFSymbolDef() override {
- llvm_unreachable("macho doesn't support this directive");
- }
+
void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
@@ -102,13 +99,6 @@ public:
void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
- void EmitFileDirective(StringRef Filename) override {
- // FIXME: Just ignore the .file; it isn't important enough to fail the
- // entire assembly.
-
- // report_fatal_error("unsupported directive: '.file'");
- }
-
void EmitIdent(StringRef IdentString) override {
llvm_unreachable("macho doesn't support this directive");
}
@@ -152,7 +142,7 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
void MCMachOStreamer::ChangeSection(MCSection *Section,
const MCExpr *Subsection) {
// Change the section normally.
- bool Created = MCObjectStreamer::changeSectionImpl(Section, Subsection);
+ bool Created = changeSectionImpl(Section, Subsection);
const MCSectionMachO &MSec = *cast<MCSectionMachO>(Section);
StringRef SegName = MSec.getSegmentName();
if (SegName == "__DWARF")
@@ -181,15 +171,13 @@ void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
}
-void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
-
+void MCMachOStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
// We have to create a new fragment if this is an atom defining symbol,
// fragments cannot span atoms.
if (getAssembler().isSymbolLinkerVisible(*Symbol))
insert(new MCDataFragment());
- MCObjectStreamer::EmitLabel(Symbol);
+ MCObjectStreamer::EmitLabel(Symbol, Loc);
// This causes the reference type flag to be cleared. Darwin 'as' was "trying"
// to clear the weak reference and weak definition bits too, but the
diff --git a/contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp
index 4ffd6a7..8809a3c 100644
--- a/contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp
+++ b/contrib/llvm/lib/MC/MCMachObjectTargetWriter.cpp
@@ -1,4 +1,4 @@
-//===-- MCMachObjectTargetWriter.cpp - Mach-O Target Writer Subclass ------===//
+//===- MCMachObjectTargetWriter.cpp - Mach-O Target Writer Subclass -------===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,4 +16,4 @@ MCMachObjectTargetWriter::MCMachObjectTargetWriter(bool Is64Bit_,
uint32_t CPUSubtype_)
: Is64Bit(Is64Bit_), CPUType(CPUType_), CPUSubtype(CPUSubtype_) {}
-MCMachObjectTargetWriter::~MCMachObjectTargetWriter() {}
+MCMachObjectTargetWriter::~MCMachObjectTargetWriter() = default;
diff --git a/contrib/llvm/lib/MC/MCNullStreamer.cpp b/contrib/llvm/lib/MC/MCNullStreamer.cpp
index eb2d912..4db9a2c 100644
--- a/contrib/llvm/lib/MC/MCNullStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCNullStreamer.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;
@@ -34,6 +34,10 @@ namespace {
void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
void EmitGPRel32Value(const MCExpr *Value) override {}
+ void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+ void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+ void EmitCOFFSymbolType(int Type) override {}
+ void EndCOFFSymbolDef() override {}
};
}
diff --git a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
index 8fd71f6..21c5516 100644
--- a/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/contrib/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -10,13 +10,15 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/MC/MCSectionWasm.h"
using namespace llvm;
@@ -239,6 +241,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
DwarfStrSection =
Ctx->getMachOSection("__DWARF", "__debug_str", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata(), "info_string");
+ DwarfStrOffSection =
+ Ctx->getMachOSection("__DWARF", "__debug_str_offs", MachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata(), "section_str_off");
DwarfLocSection =
Ctx->getMachOSection("__DWARF", "__debug_loc", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata(), "section_debug_loc");
@@ -284,6 +289,10 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T) {
((CMModel == CodeModel::Large) ? dwarf::DW_EH_PE_sdata8
: dwarf::DW_EH_PE_sdata4);
break;
+ case Triple::bpfel:
+ case Triple::bpfeb:
+ FDECFIEncoding = dwarf::DW_EH_PE_sdata8;
+ break;
default:
FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
break;
@@ -505,68 +514,79 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T) {
COFFDebugSymbolsSection = nullptr;
COFFDebugTypesSection = nullptr;
+ unsigned DebugSecType = ELF::SHT_PROGBITS;
+
+ // MIPS .debug_* sections should have the SHT_MIPS_DWARF section type to
+ // distinguish sections containing DWARF from those containing the obsolete
+ // ECOFF debug format, which keep SHT_PROGBITS.
+ if (T.getArch() == Triple::mips || T.getArch() == Triple::mipsel ||
+ T.getArch() == Triple::mips64 || T.getArch() == Triple::mips64el)
+ DebugSecType = ELF::SHT_MIPS_DWARF;
+
// Debug Info Sections.
- DwarfAbbrevSection = Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
- "section_abbrev");
- DwarfInfoSection =
- Ctx->getELFSection(".debug_info", ELF::SHT_PROGBITS, 0, "section_info");
- DwarfLineSection = Ctx->getELFSection(".debug_line", ELF::SHT_PROGBITS, 0);
- DwarfFrameSection = Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0);
+ DwarfAbbrevSection =
+ Ctx->getELFSection(".debug_abbrev", DebugSecType, 0);
+ DwarfInfoSection = Ctx->getELFSection(".debug_info", DebugSecType, 0);
+ DwarfLineSection = Ctx->getELFSection(".debug_line", DebugSecType, 0);
+ DwarfFrameSection = Ctx->getELFSection(".debug_frame", DebugSecType, 0);
DwarfPubNamesSection =
- Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_pubnames", DebugSecType, 0);
DwarfPubTypesSection =
- Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_pubtypes", DebugSecType, 0);
DwarfGnuPubNamesSection =
- Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_gnu_pubnames", DebugSecType, 0);
DwarfGnuPubTypesSection =
- Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_gnu_pubtypes", DebugSecType, 0);
DwarfStrSection =
- Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
+ Ctx->getELFSection(".debug_str", DebugSecType,
ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
- DwarfLocSection = Ctx->getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0);
+ DwarfLocSection = Ctx->getELFSection(".debug_loc", DebugSecType, 0);
DwarfARangesSection =
- Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_aranges", DebugSecType, 0);
DwarfRangesSection =
- Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0, "debug_range");
- DwarfMacinfoSection = Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS,
- 0, "debug_macinfo");
+ Ctx->getELFSection(".debug_ranges", DebugSecType, 0);
+ DwarfMacinfoSection =
+ Ctx->getELFSection(".debug_macinfo", DebugSecType, 0);
// DWARF5 Experimental Debug Info
// Accelerator Tables
DwarfAccelNamesSection =
- Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0, "names_begin");
+ Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0);
DwarfAccelObjCSection =
- Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0, "objc_begin");
- DwarfAccelNamespaceSection = Ctx->getELFSection(
- ".apple_namespaces", ELF::SHT_PROGBITS, 0, "namespac_begin");
+ Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0);
+ DwarfAccelNamespaceSection =
+ Ctx->getELFSection(".apple_namespaces", ELF::SHT_PROGBITS, 0);
DwarfAccelTypesSection =
- Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0, "types_begin");
+ Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0);
+
+ // String Offset and Address Sections
+ DwarfStrOffSection =
+ Ctx->getELFSection(".debug_str_offsets", DebugSecType, 0);
+ DwarfAddrSection = Ctx->getELFSection(".debug_addr", DebugSecType, 0);
// Fission Sections
DwarfInfoDWOSection =
- Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_info.dwo", DebugSecType, 0);
DwarfTypesDWOSection =
- Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_types.dwo", DebugSecType, 0);
DwarfAbbrevDWOSection =
- Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_abbrev.dwo", DebugSecType, 0);
DwarfStrDWOSection =
- Ctx->getELFSection(".debug_str.dwo", ELF::SHT_PROGBITS,
+ Ctx->getELFSection(".debug_str.dwo", DebugSecType,
ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
DwarfLineDWOSection =
- Ctx->getELFSection(".debug_line.dwo", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_line.dwo", DebugSecType, 0);
DwarfLocDWOSection =
- Ctx->getELFSection(".debug_loc.dwo", ELF::SHT_PROGBITS, 0, "skel_loc");
+ Ctx->getELFSection(".debug_loc.dwo", DebugSecType, 0);
DwarfStrOffDWOSection =
- Ctx->getELFSection(".debug_str_offsets.dwo", ELF::SHT_PROGBITS, 0);
- DwarfAddrSection =
- Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec");
+ Ctx->getELFSection(".debug_str_offsets.dwo", DebugSecType, 0);
// DWP Sections
DwarfCUIndexSection =
- Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_cu_index", DebugSecType, 0);
DwarfTUIndexSection =
- Ctx->getELFSection(".debug_tu_index", ELF::SHT_PROGBITS, 0);
+ Ctx->getELFSection(".debug_tu_index", DebugSecType, 0);
StackMapSection =
Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
@@ -682,6 +702,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata(), "info_string");
+ DwarfStrOffSection = Ctx->getCOFFSection(
+ ".debug_str_offsets",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata(), "section_str_off");
DwarfLocSection = Ctx->getCOFFSection(
".debug_loc",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -736,7 +761,7 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
".debug_str_offsets.dwo",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
+ SectionKind::getMetadata(), "section_str_off_dwo");
DwarfAddrSection = Ctx->getCOFFSection(
".debug_addr",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -799,6 +824,30 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
SectionKind::getReadOnly());
}
+void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
+ // TODO: Set the section types and flags.
+ TextSection = Ctx->getWasmSection(".text", 0, 0);
+ DataSection = Ctx->getWasmSection(".data", 0, 0);
+
+ // TODO: Set the section types and flags.
+ DwarfLineSection = Ctx->getWasmSection(".debug_line", 0, 0);
+ DwarfStrSection = Ctx->getWasmSection(".debug_str", 0, 0);
+ DwarfLocSection = Ctx->getWasmSection(".debug_loc", 0, 0);
+ DwarfAbbrevSection = Ctx->getWasmSection(".debug_abbrev", 0, 0, "section_abbrev");
+ DwarfARangesSection = Ctx->getWasmSection(".debug_aranges", 0, 0);
+ DwarfRangesSection = Ctx->getWasmSection(".debug_ranges", 0, 0, "debug_range");
+ DwarfMacinfoSection = Ctx->getWasmSection(".debug_macinfo", 0, 0, "debug_macinfo");
+ DwarfAddrSection = Ctx->getWasmSection(".debug_addr", 0, 0);
+ DwarfCUIndexSection = Ctx->getWasmSection(".debug_cu_index", 0, 0);
+ DwarfTUIndexSection = Ctx->getWasmSection(".debug_tu_index", 0, 0);
+ DwarfInfoSection = Ctx->getWasmSection(".debug_info", 0, 0, "section_info");
+ DwarfFrameSection = Ctx->getWasmSection(".debug_frame", 0, 0);
+ DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", 0, 0);
+ DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", 0, 0);
+
+ // TODO: Define more sections.
+}
+
void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
CodeModel::Model cm,
MCContext &ctx) {
@@ -843,6 +892,10 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
Env = IsELF;
initELFMCObjectFileInfo(TT);
break;
+ case Triple::Wasm:
+ Env = IsWasm;
+ initWasmMCObjectFileInfo(TT);
+ break;
case Triple::UnknownObjectFormat:
report_fatal_error("Cannot initialize MC for unknown object file format.");
break;
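The Triple::Wasm case added above plugs the new initializer into the existing per-format dispatch. A toy version of that selection; the string matching is a deliberately naive stand-in for Triple's real object-format detection:

#include <cstdio>
#include <string>

enum class ObjFormat { MachO, COFF, ELF, Wasm };

// Deliberately naive stand-in for Triple's object-format detection.
static ObjFormat formatFor(const std::string &Triple) {
  if (Triple.find("wasm") != std::string::npos) return ObjFormat::Wasm;
  if (Triple.find("windows") != std::string::npos) return ObjFormat::COFF;
  if (Triple.find("apple") != std::string::npos) return ObjFormat::MachO;
  return ObjFormat::ELF;
}

static const char *initializerFor(ObjFormat F) {
  switch (F) {
  case ObjFormat::MachO: return "initMachOMCObjectFileInfo";
  case ObjFormat::COFF:  return "initCOFFMCObjectFileInfo";
  case ObjFormat::ELF:   return "initELFMCObjectFileInfo";
  case ObjFormat::Wasm:  return "initWasmMCObjectFileInfo";
  }
  return "unknown";
}

int main() {
  std::printf("%s\n", initializerFor(formatFor("wasm32-unknown-unknown")));
  return 0;
}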
diff --git a/contrib/llvm/lib/MC/MCObjectStreamer.cpp b/contrib/llvm/lib/MC/MCObjectStreamer.cpp
index cae5c1f..174397e 100644
--- a/contrib/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCObjectStreamer.cpp
@@ -133,6 +133,11 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
// Avoid fixups when possible.
int64_t AbsValue;
if (Value->evaluateAsAbsolute(AbsValue, getAssembler())) {
+ if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) {
+ getContext().reportError(
+ Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
+ return;
+ }
EmitIntValue(AbsValue, Size);
return;
}
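The new guard in EmitValueImpl rejects an absolute value that fits in neither the signed nor the unsigned interpretation of the requested width, turning silent truncation into a diagnostic. A self-contained sketch of the same check; the real isIntN/isUIntN helpers live in llvm/Support/MathExtras.h:

#include <cassert>
#include <cstdint>

// A value fits in SizeBytes if it is representable as either a signed or
// an unsigned integer of that width, mirroring the isUIntN/isIntN pair.
static bool fitsIn(int64_t V, unsigned SizeBytes) {
  unsigned N = 8 * SizeBytes;
  if (N >= 64)
    return true;
  bool Unsigned = static_cast<uint64_t>(V) < (UINT64_C(1) << N);
  bool Signed = V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
  return Unsigned || Signed;
}

int main() {
  assert(fitsIn(255, 1));   // fits as unsigned 8-bit
  assert(fitsIn(-128, 1));  // fits as signed 8-bit
  assert(!fitsIn(256, 1));  // emitted as a 1-byte value: now a hard error
  return 0;
}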
@@ -153,8 +158,8 @@ void MCObjectStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
EmitLabel(Frame.End);
}
-void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
- MCStreamer::EmitLabel(Symbol);
+void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCStreamer::EmitLabel(Symbol, Loc);
getAssembler().registerSymbol(*Symbol);
@@ -171,6 +176,16 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
}
}
+void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc, MCFragment *F) {
+ MCStreamer::EmitLabel(Symbol, Loc);
+ getAssembler().registerSymbol(*Symbol);
+ auto *DF = dyn_cast_or_null<MCDataFragment>(F);
+ if (DF)
+ Symbol->setFragment(F);
+ else
+ PendingLabels.push_back(Symbol);
+}
+
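The fragment-aware EmitLabel overload above implements the pending-label scheme: a label binds immediately when handed a data fragment and is otherwise queued until the next fragment materializes. A toy model of that behavior:

#include <iostream>
#include <string>
#include <vector>

struct Fragment { std::string Name; };

struct Streamer {
  std::vector<std::string> Pending;

  // Bind to a concrete data fragment if one is supplied; otherwise defer.
  void emitLabel(const std::string &Sym, Fragment *F) {
    if (F)
      std::cout << Sym << " -> " << F->Name << "\n";
    else
      Pending.push_back(Sym);
  }

  // Flush all deferred labels onto the next fragment that gets created.
  void flushPendingLabels(Fragment &F) {
    for (const auto &Sym : Pending)
      std::cout << Sym << " -> " << F.Name << " (flushed)\n";
    Pending.clear();
  }
};

int main() {
  Streamer S;
  Fragment DF0{"DF0"};
  S.emitLabel("a", &DF0);    // binds immediately
  S.emitLabel("b", nullptr); // deferred
  Fragment DF1{"DF1"};
  S.flushPendingLabels(DF1); // b -> DF1
  return 0;
}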
void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
int64_t IntValue;
if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
@@ -203,6 +218,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
flushPendingLabels(nullptr);
+ getContext().clearDwarfLocSeen();
bool Created = getAssembler().registerSection(*Section);
@@ -227,7 +243,7 @@ bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const {
}
void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI, bool) {
MCStreamer::EmitInstruction(Inst, STI);
MCSection *Sec = getCurrentSectionOnly();
@@ -490,8 +506,8 @@ void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
- DF->getFixups().push_back(MCFixup::create(DF->getContents().size(),
- Value, FK_GPRel_4));
+ DF->getFixups().push_back(
+ MCFixup::create(DF->getContents().size(), Value, FK_GPRel_4));
DF->getContents().resize(DF->getContents().size() + 4, 0);
}
@@ -500,8 +516,8 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
- DF->getFixups().push_back(MCFixup::create(DF->getContents().size(),
- Value, FK_GPRel_4));
+ DF->getFixups().push_back(
+ MCFixup::create(DF->getContents().size(), Value, FK_GPRel_4));
DF->getContents().resize(DF->getContents().size() + 8, 0);
}
@@ -572,6 +588,10 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
MCStreamer::emitFill(IntNumValues, Size, Expr);
}
+void MCObjectStreamer::EmitFileDirective(StringRef Filename) {
+ getAssembler().addFileName(Filename);
+}
+
void MCObjectStreamer::FinishImpl() {
// If we are generating dwarf for assembly source files, dump out the sections.
if (getContext().getGenDwarfForAssembly())
diff --git a/contrib/llvm/lib/MC/MCObjectWriter.cpp b/contrib/llvm/lib/MC/MCObjectWriter.cpp
index e84f74a..98ac48a 100644
--- a/contrib/llvm/lib/MC/MCObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/MCObjectWriter.cpp
@@ -7,15 +7,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;
-MCObjectWriter::~MCObjectWriter() {
-}
+MCObjectWriter::~MCObjectWriter() = default;
bool MCObjectWriter::isSymbolRefDifferenceFullyResolved(
const MCAssembler &Asm, const MCSymbolRefExpr *A, const MCSymbolRefExpr *B,
@@ -51,5 +51,3 @@ bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
// On ELF and COFF A - B is absolute if A and B are in the same section.
return &SecA == &SecB;
}
-
-bool MCObjectWriter::isWeak(const MCSymbol &) const { return false; }
diff --git a/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp
index 87ecf9e..2b96360 100644
--- a/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/contrib/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -30,15 +30,11 @@
using namespace llvm;
-AsmLexer::AsmLexer(const MCAsmInfo &MAI)
- : MAI(MAI), CurPtr(nullptr), IsAtStartOfLine(true),
- IsAtStartOfStatement(true), IsParsingMSInlineAsm(false),
- IsPeeking(false) {
+AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
}
-AsmLexer::~AsmLexer() {
-}
+AsmLexer::~AsmLexer() = default;
void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
CurBuf = Buf;
diff --git a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
index fd40baa..dad47e4 100644
--- a/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -15,12 +15,13 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCContext.h"
@@ -35,6 +36,7 @@
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCAsmParserUtils.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -42,10 +44,10 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -55,6 +57,7 @@
#include <algorithm>
#include <cassert>
#include <cctype>
+#include <climits>
#include <cstddef>
#include <cstdint>
#include <deque>
@@ -67,7 +70,7 @@
using namespace llvm;
-MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {}
+MCAsmParserSemaCallback::~MCAsmParserSemaCallback() = default;
static cl::opt<unsigned> AsmMacroMaxNestingDepth(
"asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
@@ -82,10 +85,10 @@ typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
struct MCAsmMacroParameter {
StringRef Name;
MCAsmMacroArgument Value;
- bool Required;
- bool Vararg;
+ bool Required = false;
+ bool Vararg = false;
- MCAsmMacroParameter() : Required(false), Vararg(false) {}
+ MCAsmMacroParameter() = default;
};
typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
@@ -124,23 +127,20 @@ struct ParseStatementInfo {
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;
/// \brief The opcode from the last parsed instruction.
- unsigned Opcode;
+ unsigned Opcode = ~0U;
/// \brief Was there an error parsing the inline assembly?
- bool ParseError;
+ bool ParseError = false;
- SmallVectorImpl<AsmRewrite> *AsmRewrites;
+ SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;
- ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(nullptr) {}
+ ParseStatementInfo() = delete;
ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
- : Opcode(~0), ParseError(false), AsmRewrites(rewrites) {}
+ : AsmRewrites(rewrites) {}
};
/// \brief The concrete assembly parser instance.
class AsmParser : public MCAsmParser {
- AsmParser(const AsmParser &) = delete;
- void operator=(const AsmParser &) = delete;
-
private:
AsmLexer Lexer;
MCContext &Ctx;
@@ -199,17 +199,19 @@ private:
unsigned LastQueryLine;
/// AssemblerDialect. ~0U means an unset value; use the value provided by MAI.
- unsigned AssemblerDialect;
+ unsigned AssemblerDialect = ~0U;
/// \brief is Darwin compatibility enabled?
- bool IsDarwin;
+ bool IsDarwin = false;
/// \brief Are we parsing ms-style inline assembly?
- bool ParsingInlineAsm;
+ bool ParsingInlineAsm = false;
public:
AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI, unsigned CB);
+ AsmParser(const AsmParser &) = delete;
+ AsmParser &operator=(const AsmParser &) = delete;
~AsmParser() override;
bool Run(bool NoInitialTextSection, bool NoFinalize = false) override;
@@ -223,7 +225,6 @@ public:
DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
}
-public:
/// @name MCAsmParser Interface
/// {
@@ -258,7 +259,7 @@ public:
bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
unsigned &NumOutputs, unsigned &NumInputs,
- SmallVectorImpl<std::pair<void *,bool> > &OpDecls,
+ SmallVectorImpl<std::pair<void *,bool>> &OpDecls,
SmallVectorImpl<std::string> &Constraints,
SmallVectorImpl<std::string> &Clobbers,
const MCInstrInfo *MII, const MCInstPrinter *IP,
@@ -286,6 +287,8 @@ public:
/// }
private:
+ bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc);
+ void altMacroString(StringRef AltMacroStr, std::string &Res);
bool parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI);
bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
@@ -411,7 +414,7 @@ private:
DK_CFI_REMEMBER_STATE, DK_CFI_RESTORE_STATE, DK_CFI_SAME_VALUE,
DK_CFI_RESTORE, DK_CFI_ESCAPE, DK_CFI_SIGNAL_FRAME, DK_CFI_UNDEFINED,
DK_CFI_REGISTER, DK_CFI_WINDOW_SAVE,
- DK_MACROS_ON, DK_MACROS_OFF,
+ DK_MACROS_ON, DK_MACROS_OFF, DK_ALTMACRO, DK_NOALTMACRO,
DK_MACRO, DK_EXITM, DK_ENDM, DK_ENDMACRO, DK_PURGEM,
DK_SLEB128, DK_ULEB128,
DK_ERR, DK_ERROR, DK_WARNING,
@@ -483,7 +486,8 @@ private:
bool parseDirectiveEndMacro(StringRef Directive);
bool parseDirectiveMacro(SMLoc DirectiveLoc);
bool parseDirectiveMacrosOnOff(StringRef Directive);
-
+ // alternate macro mode directives
+ bool parseDirectiveAltmacro(StringRef Directive);
// ".bundle_align_mode"
bool parseDirectiveBundleAlignMode();
// ".bundle_lock"
@@ -574,9 +578,7 @@ enum { DEFAULT_ADDRSPACE = 0 };
AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI, unsigned CB = 0)
: Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
- PlatformParser(nullptr), CurBuffer(CB ? CB : SM.getMainFileID()),
- MacrosEnabledFlag(true), CppHashInfo(), AssemblerDialect(~0U),
- IsDarwin(false), ParsingInlineAsm(false) {
+ CurBuffer(CB ? CB : SM.getMainFileID()), MacrosEnabledFlag(true) {
HadError = false;
// Save the old handler.
SavedDiagHandler = SrcMgr.getDiagHandler();
@@ -597,6 +599,9 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
case MCObjectFileInfo::IsELF:
PlatformParser.reset(createELFAsmParser());
break;
+ case MCObjectFileInfo::IsWasm:
+ llvm_unreachable("Wasm parsing not supported yet");
+ break;
}
PlatformParser->Initialize(*this);
@@ -698,7 +703,7 @@ const AsmToken &AsmParser::Lex() {
// if it's an end of statement with a comment in it
if (getTok().is(AsmToken::EndOfStatement)) {
// if this is a line comment output it.
- if (getTok().getString().front() != '\n' &&
+ if (!getTok().getString().empty() && getTok().getString().front() != '\n' &&
getTok().getString().front() != '\r' && MAI.preserveAsmComments())
Out.addExplicitComment(Twine(getTok().getString()));
}
@@ -735,6 +740,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
HadError = false;
AsmCond StartingCondState = TheCondState;
+ SmallVector<AsmRewrite, 4> AsmStrRewrites;
// If we are generating dwarf for assembly source files, save the initial
// text section and generate a .file directive.
@@ -754,7 +760,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// While we have input, parse each statement.
while (Lexer.isNot(AsmToken::Eof)) {
- ParseStatementInfo Info;
+ ParseStatementInfo Info(&AsmStrRewrites);
if (!parseStatement(Info, nullptr))
continue;
@@ -922,7 +928,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createLNot(Res, getContext());
+ Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Dollar:
case AsmToken::At:
@@ -983,7 +989,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
// Lookup the symbol variant if used.
- if (Split.second.size()) {
+ if (!Split.second.empty()) {
Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
if (Variant != MCSymbolRefExpr::VK_Invalid) {
SymbolName = Split.first;
@@ -1009,7 +1015,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
}
// Otherwise create a symbol ref.
- Res = MCSymbolRefExpr::create(Sym, Variant, getContext());
+ Res = MCSymbolRefExpr::create(Sym, Variant, getContext(), FirstTokenLoc);
return false;
}
case AsmToken::BigNum:
@@ -1075,19 +1081,19 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createMinus(Res, getContext());
+ Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Plus:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createPlus(Res, getContext());
+ Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc);
return false;
case AsmToken::Tilde:
Lex(); // Eat the operator.
if (parsePrimaryExpr(Res, EndLoc))
return true;
- Res = MCUnaryExpr::createNot(Res, getContext());
+ Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc);
return false;
// MIPS unary expression operators. The lexer won't generate these tokens if
// MCAsmInfo::HasMipsExpressions is false for the target.
@@ -1188,6 +1194,42 @@ AsmParser::applyModifierToExpr(const MCExpr *E,
llvm_unreachable("Invalid expression kind!");
}
+/// This function checks whether the next token is of <string> type or is
+/// arithmetic: a string that begins with '<' must end with '>', and anything
+/// else is treated as arithmetic. If the function returns true, the EndLoc
+/// argument is set to the location just past the closing '>' character.
+
+/// There is a gap between the AltMacro documentation and the single-quote
+/// implementation: GCC does not fully support this feature, so neither do we.
+/// TODO: Add support for single-quoted strings.
+bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
+ assert((StrLoc.getPointer() != nullptr) &&
+        "Argument to the function cannot be a null value");
+ const char *CharPtr = StrLoc.getPointer();
+ while ((*CharPtr != '>') && (*CharPtr != '\n') &&
+        (*CharPtr != '\r') && (*CharPtr != '\0')) {
+   if (*CharPtr == '!')
+ CharPtr++;
+ CharPtr++;
+ }
+ if (*CharPtr == '>') {
+ EndLoc = StrLoc.getFromPointer(CharPtr + 1);
+ return true;
+ }
+ return false;
+}
+
+/// \brief Creates a copy of the given string with the '!' escape
+/// characters removed.
+void AsmParser::altMacroString(StringRef AltMacroStr, std::string &Res) {
+ for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) {
+ if (AltMacroStr[Pos] == '!')
+ Pos++;
+ Res += AltMacroStr[Pos];
+ }
+}
+
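altMacroString walks the argument and drops each '!' escape while keeping the character it protects. A standalone version with an explicit boundary guard; the in-tree loop can rely on the fact that a '!' is never the final character, because a '>' always terminates the string:

#include <cassert>
#include <string>

// Strip altmacro '!' escapes: "a!>b" carries the payload "a>b".
static std::string stripAltmacroEscapes(const std::string &S) {
  std::string Res;
  for (size_t Pos = 0; Pos < S.size(); ++Pos) {
    if (S[Pos] == '!')
      ++Pos; // skip the escape, keep the escaped character
    if (Pos < S.size())
      Res += S[Pos];
  }
  return Res;
}

int main() {
  assert(stripAltmacroEscapes("a!>b") == "a>b");
  assert(stripAltmacroEscapes("plain") == "plain");
  return 0;
}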
/// \brief Parse an expression and return it.
///
/// expr ::= expr &&,|| expr -> lowest.
@@ -1440,6 +1482,7 @@ unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K,
/// Res contains the LHS of the expression on input.
bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
SMLoc &EndLoc) {
+ SMLoc StartLoc = Lexer.getLoc();
while (true) {
MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
unsigned TokPrec = getBinOpPrecedence(Lexer.getKind(), Kind);
@@ -1464,7 +1507,7 @@ bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
return true;
// Merge LHS and RHS according to operator.
- Res = MCBinaryExpr::create(Kind, Res, RHS, getContext());
+ Res = MCBinaryExpr::create(Kind, Res, RHS, getContext(), StartLoc);
}
}
@@ -1480,7 +1523,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
Lex();
if (Lexer.is(AsmToken::EndOfStatement)) {
// if this is a line comment we can drop it safely
- if (getTok().getString().front() == '\r' ||
+ if (getTok().getString().empty() || getTok().getString().front() == '\r' ||
getTok().getString().front() == '\n')
Out.AddBlankLine();
Lex();
@@ -1621,7 +1664,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
if (ParsingInlineAsm && SI) {
StringRef RewrittenLabel =
SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
- assert(RewrittenLabel.size() &&
+ assert(!RewrittenLabel.empty() &&
"We should have an internal name here.");
Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(),
RewrittenLabel);
@@ -1630,12 +1673,6 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
Sym = getContext().getOrCreateSymbol(IDVal);
} else
Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal);
-
- Sym->redefineIfPossible();
-
- if (!Sym->isUndefined() || Sym->isVariable())
- return Error(IDLoc, "invalid symbol redefinition");
-
// End of Labels should be treated as end of line for lexing
// purposes but that information is not available to the Lexer who
// does not understand Labels. This may cause us to see a Hash
@@ -1653,8 +1690,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
}
// Emit the label.
- if (!ParsingInlineAsm)
- Out.EmitLabel(Sym);
+ if (!getTargetParser().isParsingInlineAsm())
+ Out.EmitLabel(Sym, IDLoc);
// If we are generating dwarf for assembly source files then gather the
// info to make a dwarf label entry for this label if needed.
@@ -1758,8 +1795,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
case DK_8BYTE:
return parseDirectiveValue(IDVal, 8);
case DK_DC_A:
- return parseDirectiveValue(IDVal,
- getContext().getAsmInfo()->getPointerSize());
+ return parseDirectiveValue(
+ IDVal, getContext().getAsmInfo()->getCodePointerSize());
case DK_OCTA:
return parseDirectiveOctaValue(IDVal);
case DK_SINGLE:
@@ -1924,6 +1961,9 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
return parseDirectiveMacrosOnOff(IDVal);
case DK_MACRO:
return parseDirectiveMacro(IDLoc);
+ case DK_ALTMACRO:
+ case DK_NOALTMACRO:
+ return parseDirectiveAltmacro(IDVal);
case DK_EXITM:
return parseDirectiveExitMacro(IDVal);
case DK_ENDM:
@@ -1983,7 +2023,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
return parseDirectiveMSAlign(IDLoc, Info);
- if (ParsingInlineAsm && (IDVal == "even"))
+ if (ParsingInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
if (checkForValidSection())
return true;
@@ -2029,7 +2069,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If we previously parsed a cpp hash file line comment then make sure the
// current Dwarf File is for the CppHashFilename if not then emit the
// Dwarf File table for it and adjust the line number for the .loc.
- if (CppHashInfo.Filename.size()) {
+ if (!CppHashInfo.Filename.empty()) {
unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
0, StringRef(), CppHashInfo.Filename);
getContext().setGenDwarfFileNumber(FileNumber);
@@ -2060,9 +2100,9 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// If parsing succeeded, match the instruction.
if (!ParseHadError) {
uint64_t ErrorInfo;
- if (getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode,
- Info.ParsedOperands, Out,
- ErrorInfo, ParsingInlineAsm))
+ if (getTargetParser().MatchAndEmitInstruction(
+ IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
+ getTargetParser().isParsingInlineAsm()))
return true;
}
return false;
@@ -2272,9 +2312,27 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
} else {
bool VarargParameter = HasVararg && Index == (NParameters - 1);
for (const AsmToken &Token : A[Index])
+ // In altmacro mode an argument can be written as '%expr': the '%' prefix
+ // evaluates the expression and substitutes the result as a string
+ // (e.g. %(1+2) is replaced with the string "3"). Here we identify the
+ // integer token produced by that absolute-expression evaluation and
+ // emit its string representation.
+ if ((Lexer.IsaAltMacroMode()) &&
+ (*(Token.getString().begin()) == '%') && Token.is(AsmToken::Integer))
+ // Emit an integer value to the buffer.
+ OS << Token.getIntVal();
+ // Only a token that was validated as a string and begins with '<'
+ // is treated as an altmacro string.
+ else if ((Lexer.IsaAltMacroMode()) &&
+ (*(Token.getString().begin()) == '<') &&
+ Token.is(AsmToken::String)) {
+ std::string Res;
+ altMacroString(Token.getStringContents(), Res);
+ OS << Res;
+ }
// We expect no quotes around the string's contents when
// parsing for varargs.
- if (Token.getKind() != AsmToken::String || VarargParameter)
+ else if (Token.isNot(AsmToken::String) || VarargParameter)
OS << Token.getString();
else
OS << Token.getStringContents();
@@ -2445,13 +2503,37 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
NamedParametersFound = true;
}
+ bool Vararg = HasVararg && Parameter == (NParameters - 1);
if (NamedParametersFound && FA.Name.empty())
return Error(IDLoc, "cannot mix positional and keyword arguments");
- bool Vararg = HasVararg && Parameter == (NParameters - 1);
- if (parseMacroArgument(FA.Value, Vararg))
- return true;
+ SMLoc StrLoc = Lexer.getLoc();
+ SMLoc EndLoc;
+ if (Lexer.IsaAltMacroMode() && Lexer.is(AsmToken::Percent)) {
+ const MCExpr *AbsoluteExp;
+ int64_t Value;
+ // Eat the '%'.
+ Lex();
+ if (parseExpression(AbsoluteExp, EndLoc))
+ return false;
+ if (!AbsoluteExp->evaluateAsAbsolute(Value))
+ return Error(StrLoc, "expected absolute expression");
+ const char *StrChar = StrLoc.getPointer();
+ const char *EndChar = EndLoc.getPointer();
+ AsmToken newToken(AsmToken::Integer, StringRef(StrChar, EndChar - StrChar), Value);
+ FA.Value.push_back(newToken);
+ } else if (Lexer.IsaAltMacroMode() && Lexer.is(AsmToken::Less) &&
+ isAltmacroString(StrLoc, EndLoc)) {
+ const char *StrChar = StrLoc.getPointer();
+ const char *EndChar = EndLoc.getPointer();
+ jumpToLoc(EndLoc, CurBuffer);
+ // Eat from '<' to '>'.
+ Lex();
+ AsmToken newToken(AsmToken::String, StringRef(StrChar, EndChar - StrChar));
+ FA.Value.push_back(newToken);
+ } else if (parseMacroArgument(FA.Value, Vararg))
+ return true;
unsigned PI = Parameter;
if (!FA.Name.empty()) {
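In the '%' branch above, the parser folds the expression to an absolute value and stores a synthetic Integer token that records both the original spelling and the folded result, so a macro invoked as m %(1+2) expands as if it had been written m 3. A minimal model of that repackaging:

#include <cassert>
#include <string>

// Synthetic token produced for an altmacro '%expr' argument: the spelling
// is the source text of the expression, the value is the result of
// evaluating it as an absolute expression.
struct IntToken {
  std::string Spelling;
  long Value;
};

static IntToken makeAltmacroIntToken(const std::string &SourceText,
                                     long Folded) {
  return IntToken{SourceText, Folded};
}

int main() {
  // %(1+2) folds to 3; expansion emits "3", not "(1+2)".
  IntToken T = makeAltmacroIntToken("(1+2)", 3);
  assert(T.Value == 3 && T.Spelling == "(1+2)");
  return 0;
}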
@@ -3843,6 +3925,19 @@ bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
return false;
}
+/// parseDirectiveAltmacro
+/// ::= .altmacro
+/// ::= .noaltmacro
+bool AsmParser::parseDirectiveAltmacro(StringRef Directive) {
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '" + Directive + "' directive");
+ if (Directive == ".altmacro")
+ getLexer().SetAltMacroMode(true);
+ else
+ getLexer().SetAltMacroMode(false);
+ return false;
+}
+
/// parseDirectiveMacrosOnOff
/// ::= .macros_on
/// ::= .macros_off
@@ -3877,6 +3972,12 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
if (parseIdentifier(Parameter.Name))
return TokError("expected identifier in '.macro' directive");
+ // Emit an error if two (or more) named parameters share the same name
+ for (const MCAsmMacroParameter& CurrParam : Parameters)
+ if (CurrParam.Name.equals(Parameter.Name))
+ return TokError("macro '" + Name + "' has multiple parameters"
+ " named '" + Parameter.Name + "'");
+
if (Lexer.is(AsmToken::Colon)) {
Lex(); // consume ':'
@@ -4195,7 +4296,6 @@ bool AsmParser::parseDirectiveBundleUnlock() {
/// parseDirectiveSpace
/// ::= (.skip | .space) expression [ , expression ]
bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
-
SMLoc NumBytesLoc = Lexer.getLoc();
const MCExpr *NumBytes;
if (checkForValidSection() || parseExpression(NumBytes))
@@ -4291,7 +4391,6 @@ bool AsmParser::parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &Seman
/// parseDirectiveDS
/// ::= .ds.{b, d, l, p, s, w, x} expression
bool AsmParser::parseDirectiveDS(StringRef IDVal, unsigned Size) {
-
SMLoc NumValuesLoc = Lexer.getLoc();
int64_t NumValues;
if (checkForValidSection() || parseAbsoluteExpression(NumValues))
@@ -4420,6 +4519,7 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) {
return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
"alignment, can't be less than zero");
+ Sym->redefineIfPossible();
if (!Sym->isUndefined())
return Error(IDLoc, "invalid symbol redefinition");
@@ -4935,6 +5035,8 @@ void AsmParser::initializeDirectiveKindMap() {
DirectiveKindMap[".err"] = DK_ERR;
DirectiveKindMap[".error"] = DK_ERROR;
DirectiveKindMap[".warning"] = DK_WARNING;
+ DirectiveKindMap[".altmacro"] = DK_ALTMACRO;
+ DirectiveKindMap[".noaltmacro"] = DK_NOALTMACRO;
DirectiveKindMap[".reloc"] = DK_RELOC;
DirectiveKindMap[".dc"] = DK_DC;
DirectiveKindMap[".dc.a"] = DK_DC_A;
@@ -5212,7 +5314,7 @@ static int rewritesSort(const AsmRewrite *AsmRewriteA,
bool AsmParser::parseMSInlineAsm(
void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
- unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+ unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
SmallVectorImpl<std::string> &Constraints,
SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
diff --git a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
index f411479..b83d68d 100644
--- a/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/COFFAsmParser.cpp
@@ -7,19 +7,27 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -98,12 +106,14 @@ class COFFAsmParser : public MCAsmParserExtension {
| COFF::IMAGE_SCN_MEM_READ,
SectionKind::getText());
}
+
bool ParseSectionDirectiveData(StringRef, SMLoc) {
return ParseSectionSwitch(".data", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ |
COFF::IMAGE_SCN_MEM_WRITE,
SectionKind::getData());
}
+
bool ParseSectionDirectiveBSS(StringRef, SMLoc) {
return ParseSectionSwitch(".bss",
COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA
@@ -141,8 +151,9 @@ class COFFAsmParser : public MCAsmParserExtension {
bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except);
bool ParseSEHRegisterNumber(unsigned &RegNo);
bool ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc);
+
public:
- COFFAsmParser() {}
+ COFFAsmParser() = default;
};
} // end anonymous namespace
@@ -277,7 +288,7 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
.Default(MCSA_Invalid);
assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
StringRef Name;
if (getParser().parseIdentifier(Name))
@@ -466,10 +477,11 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in directive");
- if (Offset < 0 || Offset > UINT32_MAX)
- return Error(OffsetLoc,
- "invalid '.secrel32' directive offset, can't be less "
- "than zero or greater than UINT32_MAX");
+ if (Offset < 0 || Offset > std::numeric_limits<uint32_t>::max())
+ return Error(
+ OffsetLoc,
+ "invalid '.secrel32' directive offset, can't be less "
+ "than zero or greater than std::numeric_limits<uint32_t>::max()");
MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
@@ -817,4 +829,4 @@ MCAsmParserExtension *createCOFFAsmParser() {
return new COFFAsmParser;
}
-}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
index 94aa70e..f4152a9 100644
--- a/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -7,22 +7,35 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <system_error>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -44,7 +57,7 @@ class DarwinAsmParser : public MCAsmParserExtension {
SMLoc LastVersionMinDirective;
public:
- DarwinAsmParser() {}
+ DarwinAsmParser() = default;
void Initialize(MCAsmParser &Parser) override {
// Call the base implementation.
@@ -209,37 +222,47 @@ public:
bool parseSectionDirectiveConst(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__const");
}
+
bool parseSectionDirectiveStaticConst(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__static_const");
}
+
bool parseSectionDirectiveCString(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveLiteral4(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__literal4",
MachO::S_4BYTE_LITERALS, 4);
}
+
bool parseSectionDirectiveLiteral8(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__literal8",
MachO::S_8BYTE_LITERALS, 8);
}
+
bool parseSectionDirectiveLiteral16(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__literal16",
MachO::S_16BYTE_LITERALS, 16);
}
+
bool parseSectionDirectiveConstructor(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__constructor");
}
+
bool parseSectionDirectiveDestructor(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__destructor");
}
+
bool parseSectionDirectiveFVMLibInit0(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__fvmlib_init0");
}
+
bool parseSectionDirectiveFVMLibInit1(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__fvmlib_init1");
}
+
bool parseSectionDirectiveSymbolStub(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__symbol_stub",
MachO::S_SYMBOL_STUBS |
@@ -247,144 +270,178 @@ public:
// FIXME: Different on PPC and ARM.
0, 16);
}
+
bool parseSectionDirectivePICSymbolStub(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT","__picsymbol_stub",
MachO::S_SYMBOL_STUBS |
MachO::S_ATTR_PURE_INSTRUCTIONS, 0, 26);
}
+
bool parseSectionDirectiveData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__data");
}
+
bool parseSectionDirectiveStaticData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__static_data");
}
+
bool parseSectionDirectiveNonLazySymbolPointers(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__nl_symbol_ptr",
MachO::S_NON_LAZY_SYMBOL_POINTERS, 4);
}
+
bool parseSectionDirectiveLazySymbolPointers(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__la_symbol_ptr",
MachO::S_LAZY_SYMBOL_POINTERS, 4);
}
+
bool parseSectionDirectiveThreadLocalVariablePointers(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_ptr",
MachO::S_THREAD_LOCAL_VARIABLE_POINTERS, 4);
}
+
bool parseSectionDirectiveDyld(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__dyld");
}
+
bool parseSectionDirectiveModInitFunc(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__mod_init_func",
MachO::S_MOD_INIT_FUNC_POINTERS, 4);
}
+
bool parseSectionDirectiveModTermFunc(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__mod_term_func",
MachO::S_MOD_TERM_FUNC_POINTERS, 4);
}
+
bool parseSectionDirectiveConstData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__const");
}
+
bool parseSectionDirectiveObjCClass(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__class",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCMetaClass(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__meta_class",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCCatClsMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cat_cls_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCCatInstMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cat_inst_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCProtocol(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__protocol",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCStringObject(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__string_object",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClsMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cls_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCInstMeth(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__inst_meth",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClsRefs(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__cls_refs",
MachO::S_ATTR_NO_DEAD_STRIP |
MachO::S_LITERAL_POINTERS, 4);
}
+
bool parseSectionDirectiveObjCMessageRefs(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__message_refs",
MachO::S_ATTR_NO_DEAD_STRIP |
MachO::S_LITERAL_POINTERS, 4);
}
+
bool parseSectionDirectiveObjCSymbols(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__symbols",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCCategory(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__category",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClassVars(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__class_vars",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCInstanceVars(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__instance_vars",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCModuleInfo(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__module_info",
MachO::S_ATTR_NO_DEAD_STRIP);
}
+
bool parseSectionDirectiveObjCClassNames(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveObjCMethVarTypes(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveObjCMethVarNames(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__cstring",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveObjCSelectorStrs(StringRef, SMLoc) {
return parseSectionSwitch("__OBJC", "__selector_strs",
MachO::S_CSTRING_LITERALS);
}
+
bool parseSectionDirectiveTData(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_data",
MachO::S_THREAD_LOCAL_REGULAR);
}
+
bool parseSectionDirectiveText(StringRef, SMLoc) {
return parseSectionSwitch("__TEXT", "__text",
MachO::S_ATTR_PURE_INSTRUCTIONS);
}
+
bool parseSectionDirectiveTLV(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_vars",
MachO::S_THREAD_LOCAL_VARIABLES);
}
+
bool parseSectionDirectiveIdent(StringRef, SMLoc) {
// Darwin silently ignores the .ident directive.
getParser().eatToEndOfStatement();
return false;
}
+
bool parseSectionDirectiveThreadInitFunc(StringRef, SMLoc) {
return parseSectionSwitch("__DATA", "__thread_init",
MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS);
}
- bool parseVersionMin(StringRef, SMLoc);
+ bool parseVersionMin(StringRef, SMLoc);
};
} // end anonymous namespace
@@ -526,7 +583,7 @@ bool DarwinAsmParser::parseDirectiveDumpOrLoad(StringRef Directive,
/// ::= .linker_option "string" ( , "string" )*
bool DarwinAsmParser::parseDirectiveLinkerOption(StringRef IDVal, SMLoc) {
SmallVector<std::string, 4> Args;
- for (;;) {
+ while (true) {
if (getLexer().isNot(AsmToken::String))
return TokError("expected string in '" + Twine(IDVal) + "' directive");
@@ -604,7 +661,6 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) {
return TokError("unexpected token in '.section' directive");
Lex();
-
StringRef Segment, Section;
unsigned StubSize;
unsigned TAA;
diff --git a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index 8d7ba0d..a407691 100644
--- a/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -7,17 +7,29 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -142,9 +154,14 @@ private:
bool ParseSectionName(StringRef &SectionName);
bool ParseSectionArguments(bool IsPush, SMLoc loc);
unsigned parseSunStyleSectionFlags();
+ bool maybeParseSectionType(StringRef &TypeName);
+ bool parseMergeSize(int64_t &Size);
+ bool parseGroup(StringRef &GroupName);
+ bool parseMetadataSym(MCSymbolELF *&Associated);
+ bool maybeParseUniqueID(int64_t &UniqueID);
};
-}
+} // end anonymous namespace
/// ParseDirectiveSymbolAttribute
/// ::= { ".local", ".weak", ... } [ identifier ( , identifier )* ]
@@ -158,7 +175,7 @@ bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
.Default(MCSA_Invalid);
assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
StringRef Name;
if (getParser().parseIdentifier(Name))
@@ -230,8 +247,7 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
return false;
}
- for (;;) {
-
+ while (true) {
SMLoc PrevLoc = getLexer().getLoc();
if (getLexer().is(AsmToken::Comma) ||
getLexer().is(AsmToken::EndOfStatement))
@@ -282,6 +298,9 @@ static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
case 'w':
flags |= ELF::SHF_WRITE;
break;
+ case 'o':
+ flags |= ELF::SHF_LINK_ORDER;
+ break;
case 'M':
flags |= ELF::SHF_MERGE;
break;
@@ -366,6 +385,97 @@ bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) {
return ParseSectionArguments(/*IsPush=*/false, loc);
}
+bool ELFAsmParser::maybeParseSectionType(StringRef &TypeName) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return false;
+ Lex();
+ if (L.isNot(AsmToken::At) && L.isNot(AsmToken::Percent) &&
+ L.isNot(AsmToken::String)) {
+ if (L.getAllowAtInIdentifier())
+ return TokError("expected '@<type>', '%<type>' or \"<type>\"");
+ else
+ return TokError("expected '%<type>' or \"<type>\"");
+ }
+ if (!L.is(AsmToken::String))
+ Lex();
+ if (L.is(AsmToken::Integer)) {
+ TypeName = getTok().getString();
+ Lex();
+ } else if (getParser().parseIdentifier(TypeName))
+ return TokError("expected identifier in directive");
+ return false;
+}
+
+bool ELFAsmParser::parseMergeSize(int64_t &Size) {
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("expected the entry size");
+ Lex();
+ if (getParser().parseAbsoluteExpression(Size))
+ return true;
+ if (Size <= 0)
+ return TokError("entry size must be positive");
+ return false;
+}
+
+bool ELFAsmParser::parseGroup(StringRef &GroupName) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return TokError("expected group name");
+ Lex();
+ if (getParser().parseIdentifier(GroupName))
+ return true;
+ if (L.is(AsmToken::Comma)) {
+ Lex();
+ StringRef Linkage;
+ if (getParser().parseIdentifier(Linkage))
+ return true;
+ if (Linkage != "comdat")
+ return TokError("Linkage must be 'comdat'");
+ }
+ return false;
+}
+
+bool ELFAsmParser::parseMetadataSym(MCSymbolELF *&Associated) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return TokError("expected metadata symbol");
+ Lex();
+ StringRef Name;
+ if (getParser().parseIdentifier(Name))
+ return true;
+ Associated = dyn_cast_or_null<MCSymbolELF>(getContext().lookupSymbol(Name));
+ if (!Associated || !Associated->isInSection())
+ return TokError("symbol is not in a section: " + Name);
+ return false;
+}
+
+bool ELFAsmParser::maybeParseUniqueID(int64_t &UniqueID) {
+ MCAsmLexer &L = getLexer();
+ if (L.isNot(AsmToken::Comma))
+ return false;
+ Lex();
+ StringRef UniqueStr;
+ if (getParser().parseIdentifier(UniqueStr))
+ return TokError("expected identifier in directive");
+ if (UniqueStr != "unique")
+ return TokError("expected 'unique'");
+ if (L.isNot(AsmToken::Comma))
+ return TokError("expected commma");
+ Lex();
+ if (getParser().parseAbsoluteExpression(UniqueID))
+ return true;
+ if (UniqueID < 0)
+ return TokError("unique id must be positive");
+ if (!isUInt<32>(UniqueID) || UniqueID == ~0U)
+ return TokError("unique id is too large");
+ return false;
+}
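maybeParseUniqueID accepts only ids that are non-negative, fit in 32 bits, and avoid the ~0U sentinel that stands for "no unique id". The same predicate in isolation:

#include <cassert>
#include <cstdint>

// Valid unique ids are [0, 0xfffffffe]: non-negative, 32-bit, and not
// the 0xffffffff (~0U) sentinel reserved for "no unique id".
static bool isValidUniqueID(int64_t ID) {
  return ID >= 0 && ID <= INT64_C(0xfffffffe);
}

int main() {
  assert(isValidUniqueID(0));
  assert(isValidUniqueID(7));
  assert(!isValidUniqueID(-1));
  assert(!isValidUniqueID(INT64_C(1) << 33));
  assert(!isValidUniqueID(INT64_C(0xffffffff))); // the sentinel
  return 0;
}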
+
+static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
+ return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back();
+}
+
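hasPrefix deliberately matches the bare name as well as the dotted family: ".text." covers both ".text.hot" and plain ".text" (the prefix minus its trailing dot). A standalone equivalent:

#include <cassert>
#include <string>

// A section name matches either by carrying the dotted prefix or by being
// exactly the prefix without its trailing dot.
static bool hasPrefix(const std::string &SectionName,
                      const std::string &Prefix) {
  return SectionName.compare(0, Prefix.size(), Prefix) == 0 ||
         SectionName == Prefix.substr(0, Prefix.size() - 1);
}

int main() {
  assert(hasPrefix(".text.hot", ".text."));
  assert(hasPrefix(".text", ".text."));
  assert(!hasPrefix(".textual", ".text."));
  return 0;
}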
bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
StringRef SectionName;
@@ -379,14 +489,24 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
const MCExpr *Subsection = nullptr;
bool UseLastGroup = false;
StringRef UniqueStr;
+ MCSymbolELF *Associated = nullptr;
int64_t UniqueID = ~0;
// Set the defaults first.
- if (SectionName == ".fini" || SectionName == ".init" ||
- SectionName == ".rodata")
+ if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1")
Flags |= ELF::SHF_ALLOC;
- if (SectionName == ".fini" || SectionName == ".init")
- Flags |= ELF::SHF_EXECINSTR;
+ if (SectionName == ".fini" || SectionName == ".init" ||
+ hasPrefix(SectionName, ".text."))
+ Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
+ if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" ||
+ hasPrefix(SectionName, ".bss.") ||
+ hasPrefix(SectionName, ".init_array.") ||
+ hasPrefix(SectionName, ".fini_array.") ||
+ hasPrefix(SectionName, ".preinit_array."))
+ Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE;
+ if (hasPrefix(SectionName, ".tdata.") ||
+ hasPrefix(SectionName, ".tbss."))
+ Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS;
if (getLexer().is(AsmToken::Comma)) {
Lex();
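The widened defaults above give GNU-as-compatible flags to the dotted per-function and per-object section families rather than only the classic fixed names. A compact sketch using the standard SHF_* bit values; the .init_array/.fini_array/.preinit_array families, which behave like .data.*, are omitted for brevity:

#include <cstdio>
#include <string>

constexpr unsigned SHF_WRITE = 0x1, SHF_ALLOC = 0x2, SHF_EXECINSTR = 0x4,
                   SHF_TLS = 0x400;

static bool hasPrefix(const std::string &Name, const std::string &Prefix) {
  return Name.compare(0, Prefix.size(), Prefix) == 0 ||
         Name == Prefix.substr(0, Prefix.size() - 1);
}

// Name-based flag defaults mirroring the hunk above (abridged).
static unsigned defaultFlags(const std::string &Name) {
  unsigned Flags = 0;
  if (hasPrefix(Name, ".rodata.") || Name == ".rodata1")
    Flags |= SHF_ALLOC;
  if (Name == ".fini" || Name == ".init" || hasPrefix(Name, ".text."))
    Flags |= SHF_ALLOC | SHF_EXECINSTR;
  if (hasPrefix(Name, ".data.") || Name == ".data1" ||
      hasPrefix(Name, ".bss."))
    Flags |= SHF_ALLOC | SHF_WRITE;
  if (hasPrefix(Name, ".tdata.") || hasPrefix(Name, ".tbss."))
    Flags |= SHF_ALLOC | SHF_WRITE | SHF_TLS;
  return Flags;
}

int main() {
  std::printf(".text.hot -> 0x%x\n", defaultFlags(".text.hot")); // 0x6
  std::printf(".tbss.x   -> 0x%x\n", defaultFlags(".tbss.x"));   // 0x403
  return 0;
}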
@@ -422,65 +542,30 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
return TokError("Section cannot specifiy a group name while also acting "
"as a member of the last group");
- if (getLexer().isNot(AsmToken::Comma)) {
+ if (maybeParseSectionType(TypeName))
+ return true;
+
+ MCAsmLexer &L = getLexer();
+ if (TypeName.empty()) {
if (Mergeable)
return TokError("Mergeable section must specify the type");
if (Group)
return TokError("Group section must specify the type");
- } else {
- Lex();
- if (getLexer().is(AsmToken::At) || getLexer().is(AsmToken::Percent) ||
- getLexer().is(AsmToken::String)) {
- if (!getLexer().is(AsmToken::String))
- Lex();
- } else
- return TokError("expected '@<type>', '%<type>' or \"<type>\"");
-
- if (getParser().parseIdentifier(TypeName))
- return TokError("expected identifier in directive");
-
- if (Mergeable) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected the entry size");
- Lex();
- if (getParser().parseAbsoluteExpression(Size))
- return true;
- if (Size <= 0)
- return TokError("entry size must be positive");
- }
-
- if (Group) {
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected group name");
- Lex();
- if (getParser().parseIdentifier(GroupName))
- return true;
- if (getLexer().is(AsmToken::Comma)) {
- Lex();
- StringRef Linkage;
- if (getParser().parseIdentifier(Linkage))
- return true;
- if (Linkage != "comdat")
- return TokError("Linkage must be 'comdat'");
- }
- }
- if (getLexer().is(AsmToken::Comma)) {
- Lex();
- if (getParser().parseIdentifier(UniqueStr))
- return TokError("expected identifier in directive");
- if (UniqueStr != "unique")
- return TokError("expected 'unique'");
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected commma");
- Lex();
- if (getParser().parseAbsoluteExpression(UniqueID))
- return true;
- if (UniqueID < 0)
- return TokError("unique id must be positive");
- if (!isUInt<32>(UniqueID) || UniqueID == ~0U)
- return TokError("unique id is too large");
- }
+ if (L.isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
}
+
+ if (Mergeable)
+ if (parseMergeSize(Size))
+ return true;
+ if (Group)
+ if (parseGroup(GroupName))
+ return true;
+ if (Flags & ELF::SHF_LINK_ORDER)
+ if (parseMetadataSym(Associated))
+ return true;
+ if (maybeParseUniqueID(UniqueID))
+ return true;
}
EndStmt:
@@ -493,11 +578,15 @@ EndStmt:
if (TypeName.empty()) {
if (SectionName.startswith(".note"))
Type = ELF::SHT_NOTE;
- else if (SectionName == ".init_array")
+ else if (hasPrefix(SectionName, ".init_array."))
Type = ELF::SHT_INIT_ARRAY;
- else if (SectionName == ".fini_array")
+ else if (hasPrefix(SectionName, ".bss."))
+ Type = ELF::SHT_NOBITS;
+ else if (hasPrefix(SectionName, ".tbss."))
+ Type = ELF::SHT_NOBITS;
+ else if (hasPrefix(SectionName, ".fini_array."))
Type = ELF::SHT_FINI_ARRAY;
- else if (SectionName == ".preinit_array")
+ else if (hasPrefix(SectionName, ".preinit_array."))
Type = ELF::SHT_PREINIT_ARRAY;
} else {
if (TypeName == "init_array")
@@ -514,7 +603,9 @@ EndStmt:
Type = ELF::SHT_NOTE;
else if (TypeName == "unwind")
Type = ELF::SHT_X86_64_UNWIND;
- else
+ else if (TypeName == "llvm_odrtab")
+ Type = ELF::SHT_LLVM_ODRTAB;
+ else if (TypeName.getAsInteger(0, Type))
return TokError("unknown section type");
}
@@ -528,8 +619,9 @@ EndStmt:
}
}
- MCSection *ELFSection = getContext().getELFSection(SectionName, Type, Flags,
- Size, GroupName, UniqueID);
+ MCSection *ELFSection =
+ getContext().getELFSection(SectionName, Type, Flags, Size, GroupName,
+ UniqueID, Associated);
getStreamer().SwitchSection(ELFSection, Subsection);
if (getContext().getGenDwarfForAssembly()) {
@@ -677,6 +769,7 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) {
const MCExpr *Value = MCSymbolRefExpr::create(Sym, getContext());
getStreamer().EmitAssignment(Alias, Value);
+ getStreamer().emitELFSymverDirective(Alias, Sym);
return false;
}
@@ -752,4 +845,4 @@ MCAsmParserExtension *createELFAsmParser() {
return new ELFAsmParser;
}
-}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
index 63c0dab..8f845ee 100644
--- a/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmLexer.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmLexer.cpp - Abstract Asm Lexer Interface ---------------------===//
+//===- MCAsmLexer.cpp - Abstract Asm Lexer Interface ----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,18 +8,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/SMLoc.h"
using namespace llvm;
-MCAsmLexer::MCAsmLexer()
- : TokStart(nullptr), SkipSpace(true), IsAtStartOfStatement(true),
- CommentConsumer(nullptr) {
+MCAsmLexer::MCAsmLexer() : AltMacroMode(false) {
CurTok.emplace_back(AsmToken::Space, StringRef());
}
-MCAsmLexer::~MCAsmLexer() {
-}
+MCAsmLexer::~MCAsmLexer() = default;
SMLoc MCAsmLexer::getLoc() const {
return SMLoc::getFromPointer(TokStart);
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
index 98f4daf..ea36b3b 100644
--- a/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmParser.cpp
@@ -8,21 +8,21 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
using namespace llvm;
-MCAsmParser::MCAsmParser()
- : TargetParser(nullptr), ShowParsedOperands(0), HadError(false),
- PendingErrors() {}
+MCAsmParser::MCAsmParser() : ShowParsedOperands(0) {}
-MCAsmParser::~MCAsmParser() {
-}
+MCAsmParser::~MCAsmParser() = default;
void MCAsmParser::setTargetParser(MCTargetAsmParser &P) {
assert(!TargetParser && "Target parser is already initialized!");
@@ -118,10 +118,10 @@ bool MCAsmParser::addErrorSuffix(const Twine &Suffix) {
return true;
}
-bool MCAsmParser::parseMany(std::function<bool()> parseOne, bool hasComma) {
+bool MCAsmParser::parseMany(function_ref<bool()> parseOne, bool hasComma) {
if (parseOptionalToken(AsmToken::EndOfStatement))
return false;
- while (1) {
+ while (true) {
if (parseOne())
return true;
if (parseOptionalToken(AsmToken::EndOfStatement))
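parseMany now takes llvm::function_ref instead of std::function: a non-owning, trivially copyable view of a callable, appropriate because the callback never outlives the call. A minimal sketch in the spirit of the real class in llvm/ADT/STLExtras.h:

#include <cstdio>
#include <type_traits>

// Non-owning callable view: a pointer to the callable plus a trampoline.
// The caller must keep the callable alive for the duration of the call,
// which parseMany-style APIs guarantee by construction.
template <typename Fn> class FunctionRef;

template <typename Ret, typename... Args>
class FunctionRef<Ret(Args...)> {
  void *Callable;
  Ret (*Trampoline)(void *, Args...);

public:
  template <typename F>
  FunctionRef(F &&Callee)
      : Callable(&Callee), Trampoline([](void *C, Args... As) -> Ret {
          using Fn = typename std::remove_reference<F>::type;
          return (*static_cast<Fn *>(C))(As...);
        }) {}
  Ret operator()(Args... As) const { return Trampoline(Callable, As...); }
};

static bool runUntilFailure(FunctionRef<bool(int)> ParseOne, int N) {
  for (int I = 0; I < N; ++I)
    if (ParseOne(I))
      return true; // propagate the first failure, as parseMany does
  return false;
}

int main() {
  bool Failed = runUntilFailure(
      [](int I) { std::printf("item %d\n", I); return false; }, 3);
  std::printf("failed: %d\n", Failed);
  return 0;
}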
@@ -137,6 +137,9 @@ bool MCAsmParser::parseExpression(const MCExpr *&Res) {
return parseExpression(Res, L);
}
-LLVM_DUMP_METHOD void MCParsedAsmOperand::dump() const {
+void MCParsedAsmOperand::dump() const {
+ // Cannot completely remove virtual function even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
+#endif
}
diff --git a/contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp b/contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
index 3f25a14..031f473 100644
--- a/contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
+++ b/contrib/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmParserExtension.cpp - Asm Parser Hooks -----------------------===//
+//===- MCAsmParserExtension.cpp - Asm Parser Hooks ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,14 +8,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+
using namespace llvm;
-MCAsmParserExtension::MCAsmParserExtension() :
- BracketExpressionsSupported(false) {
-}
+MCAsmParserExtension::MCAsmParserExtension() = default;
-MCAsmParserExtension::~MCAsmParserExtension() {
-}
+MCAsmParserExtension::~MCAsmParserExtension() = default;
void MCAsmParserExtension::Initialize(MCAsmParser &Parser) {
this->Parser = &Parser;
diff --git a/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
index 14a22c6..64ac82a 100644
--- a/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/contrib/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- MCTargetAsmParser.cpp - Target Assembly Parser ---------------------==//
+//===-- MCTargetAsmParser.cpp - Target Assembly Parser --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,17 +9,14 @@
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCContext.h"
+
using namespace llvm;
MCTargetAsmParser::MCTargetAsmParser(MCTargetOptions const &MCOptions,
const MCSubtargetInfo &STI)
- : AvailableFeatures(0), ParsingInlineAsm(false), MCOptions(MCOptions),
- STI(&STI)
-{
-}
+ : MCOptions(MCOptions), STI(&STI) {}
-MCTargetAsmParser::~MCTargetAsmParser() {
-}
+MCTargetAsmParser::~MCTargetAsmParser() = default;
MCSubtargetInfo &MCTargetAsmParser::copySTI() {
MCSubtargetInfo &STICopy = getContext().getSubtargetCopy(getSTI());
diff --git a/contrib/llvm/lib/MC/MCRegisterInfo.cpp b/contrib/llvm/lib/MC/MCRegisterInfo.cpp
index ea117f3..0f76c18 100644
--- a/contrib/llvm/lib/MC/MCRegisterInfo.cpp
+++ b/contrib/llvm/lib/MC/MCRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//=== MC/MCRegisterInfo.cpp - Target Register Description -------*- C++ -*-===//
+//===- MC/MCRegisterInfo.cpp - Target Register Description ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,8 +12,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
diff --git a/contrib/llvm/lib/MC/MCSection.cpp b/contrib/llvm/lib/MC/MCSection.cpp
index 9064cdf..d141dd6 100644
--- a/contrib/llvm/lib/MC/MCSection.cpp
+++ b/contrib/llvm/lib/MC/MCSection.cpp
@@ -8,16 +8,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
+#include <algorithm>
+#include <utility>
-//===----------------------------------------------------------------------===//
-// MCSection
-//===----------------------------------------------------------------------===//
+using namespace llvm;
MCSection::MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin)
: Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
@@ -31,8 +32,7 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
bool MCSection::hasEnded() const { return End && End->isInSection(); }
-MCSection::~MCSection() {
-}
+MCSection::~MCSection() = default;
void MCSection::setBundleLockState(BundleLockStateType NewState) {
if (NewState == NotBundleLocked) {
@@ -85,8 +85,9 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) {
return IP;
}
-LLVM_DUMP_METHOD void MCSection::dump() {
- raw_ostream &OS = llvm::errs();
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MCSection::dump() const {
+ raw_ostream &OS = errs();
OS << "<MCSection";
OS << " Fragments:[\n ";
@@ -97,3 +98,4 @@ LLVM_DUMP_METHOD void MCSection::dump() {
}
OS << "]>";
}
+#endif
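The new #if guard compiles dump() out of release builds unless LLVM_ENABLE_DUMP is defined, matching the declaration-side guard used in the headers. A minimal sketch of the same pattern on a hypothetical class:

    #include "llvm/Support/Compiler.h"
    #include "llvm/Support/raw_ostream.h"

    struct ThingLike { // hypothetical class, for illustration only
      void print(llvm::raw_ostream &OS) const { OS << "<ThingLike>"; }
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Present in asserts builds, or anywhere dumps are explicitly enabled.
      LLVM_DUMP_METHOD void dump() const { print(llvm::errs()); }
    #endif
    };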
diff --git a/contrib/llvm/lib/MC/MCSectionCOFF.cpp b/contrib/llvm/lib/MC/MCSectionCOFF.cpp
index f2dd47d..72a7fc3 100644
--- a/contrib/llvm/lib/MC/MCSectionCOFF.cpp
+++ b/contrib/llvm/lib/MC/MCSectionCOFF.cpp
@@ -8,14 +8,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/COFF.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
using namespace llvm;
-MCSectionCOFF::~MCSectionCOFF() {} // anchor.
+MCSectionCOFF::~MCSectionCOFF() = default; // anchor.
// ShouldOmitSectionDirective - Decides whether a '.section' directive
// should be printed before the section name
@@ -37,10 +37,9 @@ void MCSectionCOFF::setSelection(int Selection) const {
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
}
-void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
+void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
-
// standard sections don't require the '.section'
if (ShouldOmitSectionDirective(SectionName, MAI)) {
OS << '\t' << getSectionName() << '\n';
@@ -94,7 +93,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << "newest,";
break;
default:
- assert (0 && "unsupported COFF selection type");
+ assert(false && "unsupported COFF selection type");
break;
}
assert(COMDATSymbol);
diff --git a/contrib/llvm/lib/MC/MCSectionELF.cpp b/contrib/llvm/lib/MC/MCSectionELF.cpp
index 587b28f..2f4f61a 100644
--- a/contrib/llvm/lib/MC/MCSectionELF.cpp
+++ b/contrib/llvm/lib/MC/MCSectionELF.cpp
@@ -8,22 +8,22 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
-MCSectionELF::~MCSectionELF() {} // anchor.
+MCSectionELF::~MCSectionELF() = default; // anchor.
// Decides whether a '.section' directive
// should be printed before the section name.
bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
const MCAsmInfo &MAI) const {
-
if (isUnique())
return false;
@@ -53,10 +53,9 @@ static void printName(raw_ostream &OS, StringRef Name) {
OS << '"';
}
-void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
+void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
-
if (ShouldOmitSectionDirective(SectionName, MAI)) {
OS << '\t' << getSectionName();
if (Subsection) {
@@ -104,14 +103,21 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << 'S';
if (Flags & ELF::SHF_TLS)
OS << 'T';
+ if (Flags & ELF::SHF_LINK_ORDER)
+ OS << 'o';
// If there are target-specific flags, print them.
- if (Flags & ELF::XCORE_SHF_CP_SECTION)
- OS << 'c';
- if (Flags & ELF::XCORE_SHF_DP_SECTION)
- OS << 'd';
- if (Flags & ELF::SHF_ARM_PURECODE)
- OS << 'y';
+ Triple::ArchType Arch = T.getArch();
+ if (Arch == Triple::xcore) {
+ if (Flags & ELF::XCORE_SHF_CP_SECTION)
+ OS << 'c';
+ if (Flags & ELF::XCORE_SHF_DP_SECTION)
+ OS << 'd';
+ } else if (Arch == Triple::arm || Arch == Triple::armeb ||
+ Arch == Triple::thumb || Arch == Triple::thumbeb) {
+ if (Flags & ELF::SHF_ARM_PURECODE)
+ OS << 'y';
+ }
OS << '"';
@@ -137,6 +143,15 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << "progbits";
else if (Type == ELF::SHT_X86_64_UNWIND)
OS << "unwind";
+ else if (Type == ELF::SHT_MIPS_DWARF)
+    // Print the hex value of the type while we do not have
+    // any standard symbolic representation for it.
+ OS << "0x7000001e";
+ else if (Type == ELF::SHT_LLVM_ODRTAB)
+ OS << "llvm_odrtab";
+ else
+ report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) +
+ " for section " + getSectionName());
if (EntrySize) {
assert(Flags & ELF::SHF_MERGE);
@@ -149,6 +164,12 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
OS << ",comdat";
}
+ if (Flags & ELF::SHF_LINK_ORDER) {
+ assert(AssociatedSymbol);
+ OS << ",";
+ printName(OS, AssociatedSymbol->getName());
+ }
+
if (isUnique())
OS << ",unique," << UniqueID;
diff --git a/contrib/llvm/lib/MC/MCSectionMachO.cpp b/contrib/llvm/lib/MC/MCSectionMachO.cpp
index c2a772f..f402372 100644
--- a/contrib/llvm/lib/MC/MCSectionMachO.cpp
+++ b/contrib/llvm/lib/MC/MCSectionMachO.cpp
@@ -16,45 +16,57 @@ using namespace llvm;
/// SectionTypeDescriptors - These are strings that describe the various section
/// types. This *must* stay in the same order as, and synchronized with, the
/// section type list.
-static const struct {
- StringRef AssemblerName, EnumName;
-} SectionTypeDescriptors[MachO::LAST_KNOWN_SECTION_TYPE+1] = {
- { "regular", "S_REGULAR" }, // 0x00
- { StringRef(), "S_ZEROFILL" }, // 0x01
- { "cstring_literals", "S_CSTRING_LITERALS" }, // 0x02
- { "4byte_literals", "S_4BYTE_LITERALS" }, // 0x03
- { "8byte_literals", "S_8BYTE_LITERALS" }, // 0x04
- { "literal_pointers", "S_LITERAL_POINTERS" }, // 0x05
- { "non_lazy_symbol_pointers", "S_NON_LAZY_SYMBOL_POINTERS" }, // 0x06
- { "lazy_symbol_pointers", "S_LAZY_SYMBOL_POINTERS" }, // 0x07
- { "symbol_stubs", "S_SYMBOL_STUBS" }, // 0x08
- { "mod_init_funcs", "S_MOD_INIT_FUNC_POINTERS" }, // 0x09
- { "mod_term_funcs", "S_MOD_TERM_FUNC_POINTERS" }, // 0x0A
- { "coalesced", "S_COALESCED" }, // 0x0B
- { StringRef(), /*FIXME??*/ "S_GB_ZEROFILL" }, // 0x0C
- { "interposing", "S_INTERPOSING" }, // 0x0D
- { "16byte_literals", "S_16BYTE_LITERALS" }, // 0x0E
- { StringRef(), /*FIXME??*/ "S_DTRACE_DOF" }, // 0x0F
- { StringRef(), /*FIXME??*/ "S_LAZY_DYLIB_SYMBOL_POINTERS" }, // 0x10
- { "thread_local_regular", "S_THREAD_LOCAL_REGULAR" }, // 0x11
- { "thread_local_zerofill", "S_THREAD_LOCAL_ZEROFILL" }, // 0x12
- { "thread_local_variables", "S_THREAD_LOCAL_VARIABLES" }, // 0x13
- { "thread_local_variable_pointers",
- "S_THREAD_LOCAL_VARIABLE_POINTERS" }, // 0x14
- { "thread_local_init_function_pointers",
- "S_THREAD_LOCAL_INIT_FUNCTION_POINTERS"}, // 0x15
+static constexpr struct {
+ StringLiteral AssemblerName, EnumName;
+} SectionTypeDescriptors[MachO::LAST_KNOWN_SECTION_TYPE + 1] = {
+ {StringLiteral("regular"), StringLiteral("S_REGULAR")}, // 0x00
+ {StringLiteral(""), StringLiteral("S_ZEROFILL")}, // 0x01
+ {StringLiteral("cstring_literals"),
+ StringLiteral("S_CSTRING_LITERALS")}, // 0x02
+ {StringLiteral("4byte_literals"),
+ StringLiteral("S_4BYTE_LITERALS")}, // 0x03
+ {StringLiteral("8byte_literals"),
+ StringLiteral("S_8BYTE_LITERALS")}, // 0x04
+ {StringLiteral("literal_pointers"),
+ StringLiteral("S_LITERAL_POINTERS")}, // 0x05
+ {StringLiteral("non_lazy_symbol_pointers"),
+ StringLiteral("S_NON_LAZY_SYMBOL_POINTERS")}, // 0x06
+ {StringLiteral("lazy_symbol_pointers"),
+ StringLiteral("S_LAZY_SYMBOL_POINTERS")}, // 0x07
+ {StringLiteral("symbol_stubs"), StringLiteral("S_SYMBOL_STUBS")}, // 0x08
+ {StringLiteral("mod_init_funcs"),
+ StringLiteral("S_MOD_INIT_FUNC_POINTERS")}, // 0x09
+ {StringLiteral("mod_term_funcs"),
+ StringLiteral("S_MOD_TERM_FUNC_POINTERS")}, // 0x0A
+ {StringLiteral("coalesced"), StringLiteral("S_COALESCED")}, // 0x0B
+ {StringLiteral("") /*FIXME??*/, StringLiteral("S_GB_ZEROFILL")}, // 0x0C
+ {StringLiteral("interposing"), StringLiteral("S_INTERPOSING")}, // 0x0D
+ {StringLiteral("16byte_literals"),
+ StringLiteral("S_16BYTE_LITERALS")}, // 0x0E
+ {StringLiteral("") /*FIXME??*/, StringLiteral("S_DTRACE_DOF")}, // 0x0F
+ {StringLiteral("") /*FIXME??*/,
+ StringLiteral("S_LAZY_DYLIB_SYMBOL_POINTERS")}, // 0x10
+ {StringLiteral("thread_local_regular"),
+ StringLiteral("S_THREAD_LOCAL_REGULAR")}, // 0x11
+ {StringLiteral("thread_local_zerofill"),
+ StringLiteral("S_THREAD_LOCAL_ZEROFILL")}, // 0x12
+ {StringLiteral("thread_local_variables"),
+ StringLiteral("S_THREAD_LOCAL_VARIABLES")}, // 0x13
+ {StringLiteral("thread_local_variable_pointers"),
+ StringLiteral("S_THREAD_LOCAL_VARIABLE_POINTERS")}, // 0x14
+ {StringLiteral("thread_local_init_function_pointers"),
+ StringLiteral("S_THREAD_LOCAL_INIT_FUNCTION_POINTERS")}, // 0x15
};
-
/// SectionAttrDescriptors - This is an array of descriptors for section
/// attributes. Unlike the SectionTypeDescriptors, this is not directly indexed
/// by attribute, instead it is searched.
-static const struct {
+static constexpr struct {
unsigned AttrFlag;
- StringRef AssemblerName, EnumName;
+ StringLiteral AssemblerName, EnumName;
} SectionAttrDescriptors[] = {
#define ENTRY(ASMNAME, ENUM) \
- { MachO::ENUM, ASMNAME, #ENUM },
+ { MachO::ENUM, StringLiteral(ASMNAME), StringLiteral(#ENUM) },
ENTRY("pure_instructions", S_ATTR_PURE_INSTRUCTIONS)
ENTRY("no_toc", S_ATTR_NO_TOC)
ENTRY("strip_static_syms", S_ATTR_STRIP_STATIC_SYMS)
@@ -62,11 +74,11 @@ ENTRY("no_dead_strip", S_ATTR_NO_DEAD_STRIP)
ENTRY("live_support", S_ATTR_LIVE_SUPPORT)
ENTRY("self_modifying_code", S_ATTR_SELF_MODIFYING_CODE)
ENTRY("debug", S_ATTR_DEBUG)
-ENTRY(StringRef() /*FIXME*/, S_ATTR_SOME_INSTRUCTIONS)
-ENTRY(StringRef() /*FIXME*/, S_ATTR_EXT_RELOC)
-ENTRY(StringRef() /*FIXME*/, S_ATTR_LOC_RELOC)
+ENTRY("" /*FIXME*/, S_ATTR_SOME_INSTRUCTIONS)
+ENTRY("" /*FIXME*/, S_ATTR_EXT_RELOC)
+ENTRY("" /*FIXME*/, S_ATTR_LOC_RELOC)
#undef ENTRY
- { 0, "none", StringRef() }, // used if section has no attributes but has a stub size
+ { 0, StringLiteral("none"), StringLiteral("") }, // used if section has no attributes but has a stub size
};
MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
@@ -89,7 +101,7 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
}
}
-void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI,
+void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
OS << "\t.section\t" << getSegmentName() << ',' << getSectionName();
diff --git a/contrib/llvm/lib/MC/MCSectionWasm.cpp b/contrib/llvm/lib/MC/MCSectionWasm.cpp
new file mode 100644
index 0000000..c61f28e
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCSectionWasm.cpp
@@ -0,0 +1,97 @@
+//===- lib/MC/MCSectionWasm.cpp - Wasm Code Section Representation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCSectionWasm::~MCSectionWasm() = default; // anchor.
+
+// Decides whether a '.section' directive
+// should be printed before the section name.
+bool MCSectionWasm::ShouldOmitSectionDirective(StringRef Name,
+ const MCAsmInfo &MAI) const {
+ return MAI.shouldOmitSectionDirective(Name);
+}
+
+static void printName(raw_ostream &OS, StringRef Name) {
+ if (Name.find_first_not_of("0123456789_."
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+ OS << Name;
+ return;
+ }
+ OS << '"';
+ for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+ if (*B == '"') // Unquoted "
+ OS << "\\\"";
+    else if (*B != '\\') // Neither quote nor backslash
+ OS << *B;
+ else if (B + 1 == E) // Trailing backslash
+ OS << "\\\\";
+ else {
+ OS << B[0] << B[1]; // Quoted character
+ ++B;
+ }
+ }
+ OS << '"';
+}
+
+void MCSectionWasm::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
+ raw_ostream &OS,
+ const MCExpr *Subsection) const {
+ if (ShouldOmitSectionDirective(SectionName, MAI)) {
+ OS << '\t' << getSectionName();
+ if (Subsection) {
+ OS << '\t';
+ Subsection->print(OS, &MAI);
+ }
+ OS << '\n';
+ return;
+ }
+
+ OS << "\t.section\t";
+ printName(OS, getSectionName());
+ OS << ",\"";
+
+ // TODO: Print section flags.
+
+ OS << '"';
+
+ OS << ',';
+
+  // If the comment string is '@' (e.g. on ARM), use '%' instead.
+ if (MAI.getCommentString()[0] == '@')
+ OS << '%';
+ else
+ OS << '@';
+
+ // TODO: Print section type.
+
+ if (isUnique())
+ OS << ",unique," << UniqueID;
+
+ OS << '\n';
+
+ if (Subsection) {
+ OS << "\t.subsection\t";
+ Subsection->print(OS, &MAI);
+ OS << '\n';
+ }
+}
+
+bool MCSectionWasm::UseCodeAlign() const { return false; }
+
+bool MCSectionWasm::isVirtualSection() const { return false; }
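printName above prints a name bare when it contains only [0-9A-Za-z_.] and quotes it otherwise, escaping a bare '"', doubling a lone trailing backslash, and copying an existing backslash escape through unchanged. Hypothetical inputs and the output these rules produce:

    // Illustrative only; these strings are made up.
    //   foo.bar      ->  foo.bar          (safe characters only: printed bare)
    //   my-section   ->  "my-section"     ('-' forces quoting)
    //   say"hi"      ->  "say\"hi\""      (bare quotes are escaped)
    //   trail\       ->  "trail\\"        (lone trailing backslash is doubled)
    //   esc\"aped    ->  "esc\"aped"      (existing escape copied through)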
diff --git a/contrib/llvm/lib/MC/MCStreamer.cpp b/contrib/llvm/lib/MC/MCStreamer.cpp
index fb28f85..2bfb9a6 100644
--- a/contrib/llvm/lib/MC/MCStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCStreamer.cpp
@@ -9,34 +9,42 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCWin64EH.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/MC/MCWinEH.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
#include <cstdlib>
-using namespace llvm;
+#include <utility>
-// Pin the vtables to this file.
-MCTargetStreamer::~MCTargetStreamer() {}
+using namespace llvm;
MCTargetStreamer::MCTargetStreamer(MCStreamer &S) : Streamer(S) {
S.setTargetStreamer(this);
}
+// Pin the vtables to this file.
+MCTargetStreamer::~MCTargetStreamer() = default;
+
void MCTargetStreamer::emitLabel(MCSymbol *Symbol) {}
void MCTargetStreamer::finish() {}
@@ -290,10 +298,17 @@ void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) {
SymbolOrdering[Symbol] = 1 + SymbolOrdering.size();
}
-void MCStreamer::EmitLabel(MCSymbol *Symbol) {
+void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ Symbol->redefineIfPossible();
+
+ if (!Symbol->isUndefined() || Symbol->isVariable())
+ return getContext().reportError(Loc, "invalid symbol redefinition");
+
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
assert(getCurrentSectionOnly() && "Cannot emit before setting section!");
assert(!Symbol->getFragment() && "Unexpected fragment on symbol data!");
+ assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
Symbol->setFragment(&getCurrentSectionOnly()->getDummyFragment());
MCTargetStreamer *TS = getTargetStreamer();
@@ -666,7 +681,7 @@ void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) {
void MCStreamer::EmitWinCFIPushFrame(bool Code) {
EnsureValidWinFrameInfo();
- if (CurrentWinFrameInfo->Instructions.size() > 0)
+ if (!CurrentWinFrameInfo->Instructions.empty())
report_fatal_error("If present, PushMachFrame must be the first UOP");
MCSymbol *Label = EmitCFILabel();
@@ -762,8 +777,8 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) {
}
}
-void MCStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+void MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) {
// Scan for values.
for (unsigned i = Inst.getNumOperands(); i--;)
if (Inst.getOperand(i).isExpr())
@@ -792,12 +807,22 @@ void MCStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
void MCStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {}
void MCStreamer::EmitThumbFunc(MCSymbol *Func) {}
void MCStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
-void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
-void MCStreamer::EndCOFFSymbolDef() {}
+void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
+void MCStreamer::EndCOFFSymbolDef() {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
void MCStreamer::EmitFileDirective(StringRef Filename) {}
-void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {}
-void MCStreamer::EmitCOFFSymbolType(int Type) {}
+void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
+void MCStreamer::EmitCOFFSymbolType(int Type) {
+ llvm_unreachable("this directive only supported on COFF targets");
+}
void MCStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+void MCStreamer::emitELFSymverDirective(MCSymbol *Alias,
+ const MCSymbol *Aliasee) {}
void MCStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {}
void MCStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
diff --git a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
index 1b59250..385cdcc 100644
--- a/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
+++ b/contrib/llvm/lib/MC/MCSubtargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MCSubtargetInfo.cpp - Subtarget Information -----------------------===//
+//===- MCSubtargetInfo.cpp - Subtarget Information ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,12 +8,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <cstring>
using namespace llvm;
diff --git a/contrib/llvm/lib/MC/MCSymbol.cpp b/contrib/llvm/lib/MC/MCSymbol.cpp
index 20d985d..9abaaef 100644
--- a/contrib/llvm/lib/MC/MCSymbol.cpp
+++ b/contrib/llvm/lib/MC/MCSymbol.cpp
@@ -8,12 +8,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+
using namespace llvm;
// Only the address of this fragment is ever actually used.
@@ -75,4 +81,8 @@ void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << '"';
}
-LLVM_DUMP_METHOD void MCSymbol::dump() const { dbgs() << *this; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MCSymbol::dump() const {
+ dbgs() << *this;
+}
+#endif
diff --git a/contrib/llvm/lib/MC/MCSymbolELF.cpp b/contrib/llvm/lib/MC/MCSymbolELF.cpp
index ec7ef44..67449eb 100644
--- a/contrib/llvm/lib/MC/MCSymbolELF.cpp
+++ b/contrib/llvm/lib/MC/MCSymbolELF.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCFixupKindInfo.h"
-#include "llvm/Support/ELF.h"
namespace llvm {
@@ -42,6 +42,8 @@ enum {
void MCSymbolELF::setBinding(unsigned Binding) const {
setIsBindingSet();
+ if (getType() == ELF::STT_SECTION && Binding != ELF::STB_LOCAL)
+ setType(ELF::STT_NOTYPE);
unsigned Val;
switch (Binding) {
default:
@@ -93,6 +95,8 @@ unsigned MCSymbolELF::getBinding() const {
void MCSymbolELF::setType(unsigned Type) const {
unsigned Val;
+ if (Type == ELF::STT_SECTION && getBinding() != ELF::STB_LOCAL)
+ return;
switch (Type) {
default:
llvm_unreachable("Unsupported Binding");
diff --git a/contrib/llvm/lib/MC/MCTargetOptions.cpp b/contrib/llvm/lib/MC/MCTargetOptions.cpp
index 4192105..b85e53d 100644
--- a/contrib/llvm/lib/MC/MCTargetOptions.cpp
+++ b/contrib/llvm/lib/MC/MCTargetOptions.cpp
@@ -1,4 +1,4 @@
-//===- lib/MC/MCTargetOptions.cpp - MC Target Options --------------------===//
+//===- lib/MC/MCTargetOptions.cpp - MC Target Options ---------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,22 +7,19 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/ADT/StringRef.h"
-namespace llvm {
+using namespace llvm;
MCTargetOptions::MCTargetOptions()
: SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false),
MCFatalWarnings(false), MCNoWarn(false), MCNoDeprecatedWarn(false),
- MCSaveTempLabels(false),
- MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false),
- MCPIECopyRelocations(false), ShowMCEncoding(false),
- ShowMCInst(false), AsmVerbose(false),
- PreserveAsmComments(true), DwarfVersion(0), ABIName() {}
+ MCSaveTempLabels(false), MCUseDwarfDirectory(false),
+ MCIncrementalLinkerCompatible(false), MCPIECopyRelocations(false),
+ ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
+ PreserveAsmComments(true) {}
StringRef MCTargetOptions::getABIName() const {
return ABIName;
}
-
-} // end namespace llvm
diff --git a/contrib/llvm/lib/MC/MCValue.cpp b/contrib/llvm/lib/MC/MCValue.cpp
index c1336d6..32a6adb 100644
--- a/contrib/llvm/lib/MC/MCValue.cpp
+++ b/contrib/llvm/lib/MC/MCValue.cpp
@@ -37,9 +37,11 @@ void MCValue::print(raw_ostream &OS) const {
OS << " + " << getConstant();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MCValue::dump() const {
print(dbgs());
}
+#endif
MCSymbolRefExpr::VariantKind MCValue::getAccessVariant() const {
const MCSymbolRefExpr *B = getSymB();
diff --git a/contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp b/contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp
new file mode 100644
index 0000000..301f30d
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCWasmObjectTargetWriter.cpp
@@ -0,0 +1,21 @@
+//===-- MCWasmObjectTargetWriter.cpp - Wasm Target Writer Subclass --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+
+using namespace llvm;
+
+MCWasmObjectTargetWriter::MCWasmObjectTargetWriter(bool Is64Bit_)
+ : Is64Bit(Is64Bit_) {}
+
+// Pin the vtable to this object file
+MCWasmObjectTargetWriter::~MCWasmObjectTargetWriter() = default;
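Defining the defaulted destructor here, out of line, is what pins the vtable: the destructor acts as the class's key function, so the compiler emits the vtable once in this object file instead of weakly in every translation unit that uses the class. The idiom in miniature:

    // Header (sketch; WriterLike is a hypothetical name):
    class WriterLike {
    public:
      virtual unsigned version() const { return 1; }
      virtual ~WriterLike(); // declared only; no inline definition
    };

    // Exactly one .cpp file (sketch):
    WriterLike::~WriterLike() = default; // vtable and type info emitted here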
diff --git a/contrib/llvm/lib/MC/MCWasmStreamer.cpp b/contrib/llvm/lib/MC/MCWasmStreamer.cpp
new file mode 100644
index 0000000..02fa070
--- /dev/null
+++ b/contrib/llvm/lib/MC/MCWasmStreamer.cpp
@@ -0,0 +1,228 @@
+//===- lib/MC/MCWasmStreamer.cpp - Wasm Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits Wasm .o object files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCWasmStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCWasmStreamer::~MCWasmStreamer() = default;
+
+void MCWasmStreamer::mergeFragment(MCDataFragment *DF, MCDataFragment *EF) {
+ flushPendingLabels(DF, DF->getContents().size());
+
+ for (unsigned i = 0, e = EF->getFixups().size(); i != e; ++i) {
+ EF->getFixups()[i].setOffset(EF->getFixups()[i].getOffset() +
+ DF->getContents().size());
+ DF->getFixups().push_back(EF->getFixups()[i]);
+ }
+ DF->setHasInstructions(true);
+ DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
+}
+
+void MCWasmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+ // Let the target do whatever target specific stuff it needs to do.
+ getAssembler().getBackend().handleAssemblerFlag(Flag);
+
+ // Do any generic stuff we need to do.
+ llvm_unreachable("invalid assembler flag!");
+}
+
+void MCWasmStreamer::ChangeSection(MCSection *Section,
+ const MCExpr *Subsection) {
+ MCAssembler &Asm = getAssembler();
+ auto *SectionWasm = static_cast<const MCSectionWasm *>(Section);
+ const MCSymbol *Grp = SectionWasm->getGroup();
+ if (Grp)
+ Asm.registerSymbol(*Grp);
+
+ this->MCObjectStreamer::ChangeSection(Section, Subsection);
+}
+
+void MCWasmStreamer::EmitWeakReference(MCSymbol *Alias,
+ const MCSymbol *Symbol) {
+ getAssembler().registerSymbol(*Symbol);
+ const MCExpr *Value = MCSymbolRefExpr::create(
+ Symbol, MCSymbolRefExpr::VK_WEAKREF, getContext());
+ Alias->setVariableValue(Value);
+}
+
+bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
+ assert(Attribute != MCSA_IndirectSymbol && "indirect symbols not supported");
+
+ auto *Symbol = cast<MCSymbolWasm>(S);
+
+  // Adding a symbol attribute always introduces the symbol; the important
+  // side effect of calling registerSymbol here is that the symbol becomes
+  // known to the assembler.
+ getAssembler().registerSymbol(*Symbol);
+
+ switch (Attribute) {
+ case MCSA_LazyReference:
+ case MCSA_Reference:
+ case MCSA_SymbolResolver:
+ case MCSA_PrivateExtern:
+ case MCSA_WeakDefinition:
+ case MCSA_WeakDefAutoPrivate:
+ case MCSA_Invalid:
+ case MCSA_IndirectSymbol:
+ case MCSA_Hidden:
+ return false;
+
+ case MCSA_Weak:
+ case MCSA_WeakReference:
+ Symbol->setWeak(true);
+ Symbol->setExternal(true);
+ break;
+
+ case MCSA_Global:
+ Symbol->setExternal(true);
+ break;
+
+ case MCSA_ELF_TypeFunction:
+ Symbol->setIsFunction(true);
+ break;
+
+ case MCSA_ELF_TypeObject:
+ Symbol->setIsFunction(false);
+ break;
+
+ default:
+ // unrecognized directive
+ llvm_unreachable("unexpected MCSymbolAttr");
+ return false;
+ }
+
+ return true;
+}
+
+void MCWasmStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
+ unsigned ByteAlignment) {
+ llvm_unreachable("Common symbols are not yet implemented for Wasm");
+}
+
+void MCWasmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+ cast<MCSymbolWasm>(Symbol)->setSize(Value);
+}
+
+void MCWasmStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
+ unsigned ByteAlignment) {
+ llvm_unreachable("Local common symbols are not yet implemented for Wasm");
+}
+
+void MCWasmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+ SMLoc Loc) {
+ MCObjectStreamer::EmitValueImpl(Value, Size, Loc);
+}
+
+void MCWasmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+ unsigned ValueSize,
+ unsigned MaxBytesToEmit) {
+ MCObjectStreamer::EmitValueToAlignment(ByteAlignment, Value, ValueSize,
+ MaxBytesToEmit);
+}
+
+void MCWasmStreamer::EmitIdent(StringRef IdentString) {
+ MCSection *Comment = getAssembler().getContext().getWasmSection(
+ ".comment", 0, 0);
+ PushSection();
+ SwitchSection(Comment);
+ if (!SeenIdent) {
+ EmitIntValue(0, 1);
+ SeenIdent = true;
+ }
+ EmitBytes(IdentString);
+ EmitIntValue(0, 1);
+ PopSection();
+}
+
+void MCWasmStreamer::EmitInstToFragment(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ this->MCObjectStreamer::EmitInstToFragment(Inst, STI);
+}
+
+void MCWasmStreamer::EmitInstToData(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ MCAssembler &Assembler = getAssembler();
+ SmallVector<MCFixup, 4> Fixups;
+ SmallString<256> Code;
+ raw_svector_ostream VecOS(Code);
+ Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);
+
+ // Append the encoded instruction to the current data fragment (or create a
+ // new such fragment if the current fragment is not a data fragment).
+ MCDataFragment *DF = getOrCreateDataFragment();
+
+ // Add the fixups and data.
+ for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+ Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
+ DF->getFixups().push_back(Fixups[i]);
+ }
+ DF->setHasInstructions(true);
+ DF->getContents().append(Code.begin(), Code.end());
+}
+
+void MCWasmStreamer::FinishImpl() {
+ EmitFrames(nullptr);
+
+ this->MCObjectStreamer::FinishImpl();
+}
+
+MCStreamer *llvm::createWasmStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE,
+ bool RelaxAll) {
+ MCWasmStreamer *S = new MCWasmStreamer(Context, MAB, OS, CE);
+ if (RelaxAll)
+ S->getAssembler().setRelaxAll(true);
+ return S;
+}
+
+void MCWasmStreamer::EmitThumbFunc(MCSymbol *Func) {
+ llvm_unreachable("Generic Wasm doesn't support this directive");
+}
+
+void MCWasmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+ llvm_unreachable("Wasm doesn't support this directive");
+}
+
+void MCWasmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size, unsigned ByteAlignment) {
+ llvm_unreachable("Wasm doesn't support this directive");
+}
+
+void MCWasmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+ uint64_t Size, unsigned ByteAlignment) {
+ llvm_unreachable("Wasm doesn't support this directive");
+}
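EmitInstToData and mergeFragment above both append encoded bytes to a growing data fragment, so fixups recorded relative to an instruction's own encoding must be rebased against the bytes already in the destination. A self-contained sketch of that rebasing (the helper name is made up):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/MC/MCFixup.h"

    // Sketch: append Code to Dst, shifting each fixup from instruction-relative
    // offsets to fragment-relative offsets.
    static void appendWithFixups(llvm::SmallVectorImpl<char> &Dst,
                                 llvm::SmallVectorImpl<llvm::MCFixup> &DstFixups,
                                 llvm::ArrayRef<char> Code,
                                 llvm::MutableArrayRef<llvm::MCFixup> Fixups) {
      for (llvm::MCFixup &F : Fixups) {
        F.setOffset(F.getOffset() + Dst.size()); // rebase before the append
        DstFixups.push_back(F);
      }
      Dst.append(Code.begin(), Code.end());
    }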
diff --git a/contrib/llvm/lib/MC/WinCOFFStreamer.cpp b/contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp
index 6383d87..bf341bb 100644
--- a/contrib/llvm/lib/MC/WinCOFFStreamer.cpp
+++ b/contrib/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/WinCOFFStreamer.cpp -----------------------------*- C++ -*-===//
+//===- llvm/MC/MCWinCOFFStreamer.cpp --------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,32 +11,36 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolCOFF.h"
-#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
#define DEBUG_TYPE "WinCOFFStreamer"
-namespace llvm {
MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
MCCodeEmitter &CE, raw_pwrite_stream &OS)
: MCObjectStreamer(Context, MAB, OS, &CE), CurSymbol(nullptr) {}
@@ -75,10 +79,9 @@ void MCWinCOFFStreamer::InitSections(bool NoExecStack) {
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
}
-void MCWinCOFFStreamer::EmitLabel(MCSymbol *S) {
+void MCWinCOFFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
auto *Symbol = cast<MCSymbolCOFF>(S);
- assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
- MCObjectStreamer::EmitLabel(Symbol);
+ MCObjectStreamer::EmitLabel(Symbol, Loc);
}
void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
@@ -187,7 +190,8 @@ void MCWinCOFFStreamer::EmitCOFFSafeSEH(MCSymbol const *Symbol) {
<< COFF::SCT_COMPLEX_TYPE_SHIFT);
}
-void MCWinCOFFStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) {
+void MCWinCOFFStreamer::EmitCOFFSectionIndex(const MCSymbol *Symbol) {
+ visitUsedSymbol(*Symbol);
MCDataFragment *DF = getOrCreateDataFragment();
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(Symbol, getContext());
MCFixup Fixup = MCFixup::create(DF->getContents().size(), SRE, FK_SecRel_2);
@@ -195,8 +199,9 @@ void MCWinCOFFStreamer::EmitCOFFSectionIndex(MCSymbol const *Symbol) {
DF->getContents().resize(DF->getContents().size() + 2, 0);
}
-void MCWinCOFFStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol,
+void MCWinCOFFStreamer::EmitCOFFSecRel32(const MCSymbol *Symbol,
uint64_t Offset) {
+ visitUsedSymbol(*Symbol);
MCDataFragment *DF = getOrCreateDataFragment();
// Create Symbol A for the relocation relative reference.
const MCExpr *MCE = MCSymbolRefExpr::create(Symbol, getContext());
@@ -275,10 +280,6 @@ void MCWinCOFFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
llvm_unreachable("not implemented");
}
-void MCWinCOFFStreamer::EmitFileDirective(StringRef Filename) {
- getAssembler().addFileName(Filename);
-}
-
// TODO: Implement this if you want to emit .comment section in COFF obj files.
void MCWinCOFFStreamer::EmitIdent(StringRef IdentString) {
llvm_unreachable("not implemented");
@@ -295,5 +296,3 @@ void MCWinCOFFStreamer::FinishImpl() {
void MCWinCOFFStreamer::Error(const Twine &Msg) const {
getContext().reportError(SMLoc(), Msg);
}
-}
-
diff --git a/contrib/llvm/lib/MC/MCWinEH.cpp b/contrib/llvm/lib/MC/MCWinEH.cpp
index 21a9139..a5d0f5a 100644
--- a/contrib/llvm/lib/MC/MCWinEH.cpp
+++ b/contrib/llvm/lib/MC/MCWinEH.cpp
@@ -7,14 +7,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/MCWinEH.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCWinEH.h"
-#include "llvm/Support/COFF.h"
namespace llvm {
namespace WinEH {
diff --git a/contrib/llvm/lib/MC/MachObjectWriter.cpp b/contrib/llvm/lib/MC/MachObjectWriter.cpp
index c4b35f5..62bf0a5 100644
--- a/contrib/llvm/lib/MC/MachObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/MachObjectWriter.cpp
@@ -7,23 +7,36 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolMachO.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "mc"
@@ -436,7 +449,7 @@ void MachObjectWriter::recordRelocation(MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
- bool &IsPCRel, uint64_t &FixedValue) {
+ uint64_t &FixedValue) {
TargetObjectWriter->recordRelocation(this, Asm, Layout, Fragment, Fixup,
Target, FixedValue);
}
diff --git a/contrib/llvm/lib/MC/StringTableBuilder.cpp b/contrib/llvm/lib/MC/StringTableBuilder.cpp
index 1a501bc..6025a20 100644
--- a/contrib/llvm/lib/MC/StringTableBuilder.cpp
+++ b/contrib/llvm/lib/MC/StringTableBuilder.cpp
@@ -1,4 +1,4 @@
-//===-- StringTableBuilder.cpp - String table building utility ------------===//
+//===- StringTableBuilder.cpp - String table building utility -------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,17 +8,23 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/StringTableBuilder.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/CachedHashString.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Support/COFF.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
#include <vector>
using namespace llvm;
-StringTableBuilder::~StringTableBuilder() {}
+StringTableBuilder::~StringTableBuilder() = default;
void StringTableBuilder::initSize() {
// Account for leading bytes in table so that offsets returned from add are
@@ -48,11 +54,11 @@ void StringTableBuilder::write(raw_ostream &OS) const {
assert(isFinalized());
SmallString<0> Data;
Data.resize(getSize());
- write((uint8_t *)&Data[0]);
+ write((uint8_t *)Data.data());
OS << Data;
}
-typedef std::pair<CachedHashStringRef, size_t> StringPair;
+using StringPair = std::pair<CachedHashStringRef, size_t>;
void StringTableBuilder::write(uint8_t *Buf) const {
assert(isFinalized());
diff --git a/contrib/llvm/lib/MC/SubtargetFeature.cpp b/contrib/llvm/lib/MC/SubtargetFeature.cpp
index 32f06f8..b68e88c 100644
--- a/contrib/llvm/lib/MC/SubtargetFeature.cpp
+++ b/contrib/llvm/lib/MC/SubtargetFeature.cpp
@@ -7,28 +7,31 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the SubtargetFeature interface.
+/// \file Implements the SubtargetFeature interface.
//
//===----------------------------------------------------------------------===//
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
-#include <cctype>
-#include <cstdlib>
-using namespace llvm;
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <vector>
-//===----------------------------------------------------------------------===//
-// Static Helper Functions
-//===----------------------------------------------------------------------===//
+using namespace llvm;
-/// hasFlag - Determine if a feature has a flag; '+' or '-'
-///
+/// Determine if a feature has a flag; '+' or '-'
static inline bool hasFlag(StringRef Feature) {
assert(!Feature.empty() && "Empty string");
// Get first character
@@ -37,14 +40,12 @@ static inline bool hasFlag(StringRef Feature) {
  return Ch == '+' || Ch == '-';
}
-/// StripFlag - Return string stripped of flag.
-///
+/// Return string stripped of flag.
static inline std::string StripFlag(StringRef Feature) {
return hasFlag(Feature) ? Feature.substr(1) : Feature;
}
-/// isEnabled - Return true if enable flag; '+'.
-///
+/// Return true if enable flag; '+'.
static inline bool isEnabled(StringRef Feature) {
assert(!Feature.empty() && "Empty string");
// Get first character
@@ -53,15 +54,13 @@ static inline bool isEnabled(StringRef Feature) {
return Ch == '+';
}
-/// Split - Splits a string of comma separated items in to a vector of strings.
-///
+/// Splits a string of comma-separated items into a vector of strings.
static void Split(std::vector<std::string> &V, StringRef S) {
SmallVector<StringRef, 3> Tmp;
S.split(Tmp, ',', -1, false /* KeepEmpty */);
V.assign(Tmp.begin(), Tmp.end());
}
-/// Adding features.
void SubtargetFeatures::AddFeature(StringRef String, bool Enable) {
// Don't add empty features.
if (!String.empty())
@@ -81,8 +80,7 @@ static const SubtargetFeatureKV *Find(StringRef S,
return F;
}
-/// getLongestEntryLength - Return the length of the longest entry in the table.
-///
+/// Return the length of the longest entry in the table.
static size_t getLongestEntryLength(ArrayRef<SubtargetFeatureKV> Table) {
size_t MaxLen = 0;
for (auto &I : Table)
@@ -91,7 +89,6 @@ static size_t getLongestEntryLength(ArrayRef<SubtargetFeatureKV> Table) {
}
/// Display help for feature choices.
-///
static void Help(ArrayRef<SubtargetFeatureKV> CPUTable,
ArrayRef<SubtargetFeatureKV> FeatTable) {
// Determine the length of the longest CPU and Feature entries.
@@ -114,58 +111,47 @@ static void Help(ArrayRef<SubtargetFeatureKV> CPUTable,
"For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
}
-//===----------------------------------------------------------------------===//
-// SubtargetFeatures Implementation
-//===----------------------------------------------------------------------===//
-
SubtargetFeatures::SubtargetFeatures(StringRef Initial) {
// Break up string into separate features
Split(Features, Initial);
}
-
std::string SubtargetFeatures::getString() const {
return join(Features.begin(), Features.end(), ",");
}
-/// SetImpliedBits - For each feature that is (transitively) implied by this
-/// feature, set it.
-///
+/// For each feature that is (transitively) implied by this feature, set it.
static
-void SetImpliedBits(FeatureBitset &Bits, const SubtargetFeatureKV *FeatureEntry,
+void SetImpliedBits(FeatureBitset &Bits, const SubtargetFeatureKV &FeatureEntry,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
- for (auto &FE : FeatureTable) {
- if (FeatureEntry->Value == FE.Value) continue;
+ for (const SubtargetFeatureKV &FE : FeatureTable) {
+ if (FeatureEntry.Value == FE.Value) continue;
- if ((FeatureEntry->Implies & FE.Value).any()) {
+ if ((FeatureEntry.Implies & FE.Value).any()) {
Bits |= FE.Value;
- SetImpliedBits(Bits, &FE, FeatureTable);
+ SetImpliedBits(Bits, FE, FeatureTable);
}
}
}
-/// ClearImpliedBits - For each feature that (transitively) implies this
-/// feature, clear it.
-///
+/// For each feature that (transitively) implies this feature, clear it.
static
-void ClearImpliedBits(FeatureBitset &Bits,
- const SubtargetFeatureKV *FeatureEntry,
+void ClearImpliedBits(FeatureBitset &Bits,
+ const SubtargetFeatureKV &FeatureEntry,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
- for (auto &FE : FeatureTable) {
- if (FeatureEntry->Value == FE.Value) continue;
+ for (const SubtargetFeatureKV &FE : FeatureTable) {
+ if (FeatureEntry.Value == FE.Value) continue;
- if ((FE.Implies & FeatureEntry->Value).any()) {
+ if ((FE.Implies & FeatureEntry.Value).any()) {
Bits &= ~FE.Value;
- ClearImpliedBits(Bits, &FE, FeatureTable);
+ ClearImpliedBits(Bits, FE, FeatureTable);
}
}
}
-/// ToggleFeature - Toggle a feature and update the feature bits.
void
SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
// Find feature in table.
const SubtargetFeatureKV *FeatureEntry =
Find(StripFlag(Feature), FeatureTable);
@@ -174,23 +160,21 @@ SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
if ((Bits & FeatureEntry->Value) == FeatureEntry->Value) {
Bits &= ~FeatureEntry->Value;
// For each feature that implies this, clear it.
- ClearImpliedBits(Bits, FeatureEntry, FeatureTable);
+ ClearImpliedBits(Bits, *FeatureEntry, FeatureTable);
} else {
Bits |= FeatureEntry->Value;
// For each feature that this implies, set it.
- SetImpliedBits(Bits, FeatureEntry, FeatureTable);
+ SetImpliedBits(Bits, *FeatureEntry, FeatureTable);
}
} else {
- errs() << "'" << Feature
- << "' is not a recognized feature for this target"
+ errs() << "'" << Feature << "' is not a recognized feature for this target"
<< " (ignoring feature)\n";
}
}
void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
assert(hasFlag(Feature));
// Find feature in table.
@@ -203,37 +187,30 @@ void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
Bits |= FeatureEntry->Value;
// For each feature that this implies, set it.
- SetImpliedBits(Bits, FeatureEntry, FeatureTable);
+ SetImpliedBits(Bits, *FeatureEntry, FeatureTable);
} else {
Bits &= ~FeatureEntry->Value;
// For each feature that implies this, clear it.
- ClearImpliedBits(Bits, FeatureEntry, FeatureTable);
+ ClearImpliedBits(Bits, *FeatureEntry, FeatureTable);
}
} else {
- errs() << "'" << Feature
- << "' is not a recognized feature for this target"
+ errs() << "'" << Feature << "' is not a recognized feature for this target"
<< " (ignoring feature)\n";
}
}
-
-/// getFeatureBits - Get feature bits a CPU.
-///
FeatureBitset
SubtargetFeatures::getFeatureBits(StringRef CPU,
ArrayRef<SubtargetFeatureKV> CPUTable,
ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
if (CPUTable.empty() || FeatureTable.empty())
return FeatureBitset();
-#ifndef NDEBUG
assert(std::is_sorted(std::begin(CPUTable), std::end(CPUTable)) &&
"CPU table is not sorted");
assert(std::is_sorted(std::begin(FeatureTable), std::end(FeatureTable)) &&
"CPU features table is not sorted");
-#endif
// Resulting bits
FeatureBitset Bits;
@@ -253,17 +230,16 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
// Set the feature implied by this CPU feature, if any.
for (auto &FE : FeatureTable) {
if ((CPUEntry->Value & FE.Value).any())
- SetImpliedBits(Bits, &FE, FeatureTable);
+ SetImpliedBits(Bits, FE, FeatureTable);
}
} else {
- errs() << "'" << CPU
- << "' is not a recognized processor for this target"
+ errs() << "'" << CPU << "' is not a recognized processor for this target"
<< " (ignoring processor)\n";
}
}
// Iterate through each feature
- for (auto &Feature : Features) {
+ for (const std::string &Feature : Features) {
// Check for help
if (Feature == "+help")
Help(CPUTable, FeatureTable);
@@ -274,27 +250,22 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
return Bits;
}
-/// print - Print feature string.
-///
void SubtargetFeatures::print(raw_ostream &OS) const {
for (auto &F : Features)
OS << F << " ";
OS << "\n";
}
-/// dump - Dump feature info.
-///
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SubtargetFeatures::dump() const {
print(dbgs());
}
+#endif
-/// Adds the default features for the specified target triple.
-///
-/// FIXME: This is an inelegant way of specifying the features of a
-/// subtarget. It would be better if we could encode this information
-/// into the IR. See <rdar://5972456>.
-///
void SubtargetFeatures::getDefaultSubtargetFeatures(const Triple& Triple) {
+ // FIXME: This is an inelegant way of specifying the features of a
+ // subtarget. It would be better if we could encode this information
+ // into the IR. See <rdar://5972456>.
if (Triple.getVendor() == Triple::Apple) {
if (Triple.getArch() == Triple::ppc) {
// powerpc-apple-*
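SetImpliedBits and ClearImpliedBits, now taking the feature entry by reference, walk the Implies edges of the feature table transitively. A toy model of the forward walk, with an illustrative three-feature table and an extra visited check that the real code leaves out:

    #include <cstdint>
    #include <vector>

    // Toy model; FeatureKV here is not the real SubtargetFeatureKV layout.
    struct FeatureKV { uint64_t Value, Implies; };

    static void setImplied(uint64_t &Bits, const FeatureKV &Entry,
                           const std::vector<FeatureKV> &Table) {
      for (const FeatureKV &FE : Table) {
        if (Entry.Value == FE.Value)
          continue;
        if ((Entry.Implies & FE.Value) && !(Bits & FE.Value)) {
          Bits |= FE.Value;            // Entry implies FE: turn it on ...
          setImplied(Bits, FE, Table); // ... plus everything FE implies.
        }
      }
    }

    // With sse={1,0}, avx={2,1}, avx2={4,2}: after Bits |= 4 (the entry's own
    // bit, set at the call site in the real code), setImplied also sets avx
    // and, through avx, sse.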
diff --git a/contrib/llvm/lib/MC/WasmObjectWriter.cpp b/contrib/llvm/lib/MC/WasmObjectWriter.cpp
new file mode 100644
index 0000000..0d31f65
--- /dev/null
+++ b/contrib/llvm/lib/MC/WasmObjectWriter.cpp
@@ -0,0 +1,1295 @@
+//===- lib/MC/WasmObjectWriter.cpp - Wasm File Writer ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Wasm object file writer information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/StringSaver.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mc"
+
+namespace {
+
+// For patching purposes, we need to remember where each section starts, both
+// for patching up the section size field, and for patching up references to
+// locations within the section.
+struct SectionBookkeeping {
+ // Where the size of the section is written.
+ uint64_t SizeOffset;
+ // Where the contents of the section starts (after the header).
+ uint64_t ContentsOffset;
+};
+
+// The signature of a wasm function, in a struct capable of being used as a
+// DenseMap key.
+struct WasmFunctionType {
+ // Support empty and tombstone instances, needed by DenseMap.
+ enum { Plain, Empty, Tombstone } State;
+
+ // The return types of the function.
+ SmallVector<wasm::ValType, 1> Returns;
+
+ // The parameter types of the function.
+ SmallVector<wasm::ValType, 4> Params;
+
+ WasmFunctionType() : State(Plain) {}
+
+ bool operator==(const WasmFunctionType &Other) const {
+ return State == Other.State && Returns == Other.Returns &&
+ Params == Other.Params;
+ }
+};
+
+// Traits for using WasmFunctionType in a DenseMap.
+struct WasmFunctionTypeDenseMapInfo {
+ static WasmFunctionType getEmptyKey() {
+ WasmFunctionType FuncTy;
+ FuncTy.State = WasmFunctionType::Empty;
+ return FuncTy;
+ }
+ static WasmFunctionType getTombstoneKey() {
+ WasmFunctionType FuncTy;
+ FuncTy.State = WasmFunctionType::Tombstone;
+ return FuncTy;
+ }
+ static unsigned getHashValue(const WasmFunctionType &FuncTy) {
+ uintptr_t Value = FuncTy.State;
+ for (wasm::ValType Ret : FuncTy.Returns)
+ Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret));
+ for (wasm::ValType Param : FuncTy.Params)
+ Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param));
+ return Value;
+ }
+ static bool isEqual(const WasmFunctionType &LHS,
+ const WasmFunctionType &RHS) {
+ return LHS == RHS;
+ }
+};
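+// Usage sketch: the Empty/Tombstone sentinels above exist only so DenseMap
+// has two reserved keys; they never describe a real signature. E.g.:
+//   DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo> Map;
+//   WasmFunctionType FT;                      // State == Plain
+//   FT.Returns.push_back(wasm::ValType::I32);
+//   Map[FT] = 0;  // hashed from State plus each Return/Param value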
+
+// A wasm import to be written into the import section.
+struct WasmImport {
+ StringRef ModuleName;
+ StringRef FieldName;
+ unsigned Kind;
+ int32_t Type;
+};
+
+// A wasm function to be written into the function section.
+struct WasmFunction {
+ int32_t Type;
+ const MCSymbolWasm *Sym;
+};
+
+// A wasm export to be written into the export section.
+struct WasmExport {
+ StringRef FieldName;
+ unsigned Kind;
+ uint32_t Index;
+};
+
+// A wasm global to be written into the global section.
+struct WasmGlobal {
+ wasm::ValType Type;
+ bool IsMutable;
+ bool HasImport;
+ uint64_t InitialValue;
+ uint32_t ImportIndex;
+};
+
+// Information about a single relocation.
+struct WasmRelocationEntry {
+  uint64_t Offset;                   // Where the relocation is applied.
+  const MCSymbolWasm *Symbol;        // The symbol to relocate with.
+  int64_t Addend;                    // A value to add to the symbol.
+  unsigned Type;                     // The type of the relocation.
+  const MCSectionWasm *FixupSection; // The section the relocation targets.
+
+ WasmRelocationEntry(uint64_t Offset, const MCSymbolWasm *Symbol,
+ int64_t Addend, unsigned Type,
+ const MCSectionWasm *FixupSection)
+ : Offset(Offset), Symbol(Symbol), Addend(Addend), Type(Type),
+ FixupSection(FixupSection) {}
+
+ bool hasAddend() const {
+ switch (Type) {
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ void print(raw_ostream &Out) const {
+ Out << "Off=" << Offset << ", Sym=" << *Symbol << ", Addend=" << Addend
+ << ", Type=" << Type << ", FixupSection=" << FixupSection;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
+};
+
+#if !defined(NDEBUG)
+raw_ostream &operator<<(raw_ostream &OS, const WasmRelocationEntry &Rel) {
+ Rel.print(OS);
+ return OS;
+}
+#endif
+
+class WasmObjectWriter : public MCObjectWriter {
+ /// Helper struct for containing some precomputed information on symbols.
+ struct WasmSymbolData {
+ const MCSymbolWasm *Symbol;
+ StringRef Name;
+
+ // Support lexicographic sorting.
+ bool operator<(const WasmSymbolData &RHS) const { return Name < RHS.Name; }
+ };
+
+ /// The target specific Wasm writer instance.
+ std::unique_ptr<MCWasmObjectTargetWriter> TargetObjectWriter;
+
+ // Relocations for fixing up references in the code section.
+ std::vector<WasmRelocationEntry> CodeRelocations;
+
+ // Relocations for fixing up references in the data section.
+ std::vector<WasmRelocationEntry> DataRelocations;
+
+ // Index values to use for fixing up call_indirect type indices.
+ // Maps function symbols to the index of the type of the function
+ DenseMap<const MCSymbolWasm *, uint32_t> TypeIndices;
+ // Maps function symbols to the table element index space. Used
+ // for TABLE_INDEX relocation types (i.e. address taken functions).
+ DenseMap<const MCSymbolWasm *, uint32_t> IndirectSymbolIndices;
+ // Maps function/global symbols to the function/global index space.
+ DenseMap<const MCSymbolWasm *, uint32_t> SymbolIndices;
+
+ DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo>
+ FunctionTypeIndices;
+ SmallVector<WasmFunctionType, 4> FunctionTypes;
+
+ // TargetObjectWriter wrappers.
+ bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
+ unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup) const {
+ return TargetObjectWriter->getRelocType(Target, Fixup);
+ }
+
+ void startSection(SectionBookkeeping &Section, unsigned SectionId,
+ const char *Name = nullptr);
+ void endSection(SectionBookkeeping &Section);
+
+public:
+ WasmObjectWriter(MCWasmObjectTargetWriter *MOTW, raw_pwrite_stream &OS)
+ : MCObjectWriter(OS, /*IsLittleEndian=*/true), TargetObjectWriter(MOTW) {}
+
+private:
+ ~WasmObjectWriter() override;
+
+ void reset() override {
+ CodeRelocations.clear();
+ DataRelocations.clear();
+ TypeIndices.clear();
+ SymbolIndices.clear();
+ IndirectSymbolIndices.clear();
+ FunctionTypeIndices.clear();
+ FunctionTypes.clear();
+ MCObjectWriter::reset();
+ }
+
+ void writeHeader(const MCAssembler &Asm);
+
+ void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFragment *Fragment, const MCFixup &Fixup,
+ MCValue Target, uint64_t &FixedValue) override;
+
+ void executePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) override;
+
+ void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+
+ void writeString(const StringRef Str) {
+ encodeULEB128(Str.size(), getStream());
+ writeBytes(Str);
+ }
+
+ void writeValueType(wasm::ValType Ty) {
+ encodeSLEB128(int32_t(Ty), getStream());
+ }
+
+ void writeTypeSection(const SmallVector<WasmFunctionType, 4> &FunctionTypes);
+ void writeImportSection(const SmallVector<WasmImport, 4> &Imports);
+ void writeFunctionSection(const SmallVector<WasmFunction, 4> &Functions);
+ void writeTableSection(uint32_t NumElements);
+ void writeMemorySection(const SmallVector<char, 0> &DataBytes);
+ void writeGlobalSection(const SmallVector<WasmGlobal, 4> &Globals);
+ void writeExportSection(const SmallVector<WasmExport, 4> &Exports);
+ void writeElemSection(const SmallVector<uint32_t, 4> &TableElems);
+ void writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const SmallVector<WasmFunction, 4> &Functions);
+ uint64_t writeDataSection(const SmallVector<char, 0> &DataBytes);
+ void writeNameSection(const SmallVector<WasmFunction, 4> &Functions,
+ const SmallVector<WasmImport, 4> &Imports,
+ uint32_t NumFuncImports);
+ void writeCodeRelocSection();
+ void writeDataRelocSection(uint64_t DataSectionHeaderSize);
+ void writeLinkingMetaDataSection(uint32_t DataSize, uint32_t DataAlignment,
+ ArrayRef<StringRef> WeakSymbols,
+ bool HasStackPointer,
+ uint32_t StackPointerGlobal);
+
+ void applyRelocations(ArrayRef<WasmRelocationEntry> Relocations,
+ uint64_t ContentsOffset);
+
+ void writeRelocations(ArrayRef<WasmRelocationEntry> Relocations,
+ uint64_t HeaderSize);
+ uint32_t getRelocationIndexValue(const WasmRelocationEntry &RelEntry);
+ uint32_t getFunctionType(const MCSymbolWasm& Symbol);
+ uint32_t registerFunctionType(const MCSymbolWasm& Symbol);
+};
+
+} // end anonymous namespace
+
+WasmObjectWriter::~WasmObjectWriter() {}
+
+// Return the padding size to write a 32-bit value into a 5-byte ULEB128.
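+// A ULEB128 byte carries 7 value bits, so a 32-bit value encodes in
+// ceil(bits/7) bytes (one byte for zero); the padding is whatever remains
+// of the fixed 5-byte field.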
+static unsigned PaddingFor5ByteULEB128(uint32_t X) {
+ return X == 0 ? 4 : (4u - (31u - countLeadingZeros(X)) / 7u);
+}
+
+// Return the padding size to write a 32-bit value into a 5-byte SLEB128.
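+// getSLEB128Size already accounts for the sign bit, so the padding is simply
+// the shortfall from the fixed 5 bytes.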
+static unsigned PaddingFor5ByteSLEB128(int32_t X) {
+ return 5 - getSLEB128Size(X);
+}
+
+// Write out a section header and a patchable section size field.
+void WasmObjectWriter::startSection(SectionBookkeeping &Section,
+ unsigned SectionId,
+ const char *Name) {
+ assert((Name != nullptr) == (SectionId == wasm::WASM_SEC_CUSTOM) &&
+ "Only custom sections can have names");
+
+ DEBUG(dbgs() << "startSection " << SectionId << ": " << Name << "\n");
+ encodeULEB128(SectionId, getStream());
+
+ Section.SizeOffset = getStream().tell();
+
+ // The section size. We don't know the size yet, so reserve enough space
+ // for any 32-bit value; we'll patch it later.
+ encodeULEB128(UINT32_MAX, getStream());
+
+ // The position where the section starts, for measuring its size.
+ Section.ContentsOffset = getStream().tell();
+
+ // Custom sections in wasm also have a string identifier.
+ if (SectionId == wasm::WASM_SEC_CUSTOM) {
+ assert(Name);
+ writeString(StringRef(Name));
+ }
+}
+
+// Now that the section is complete and we know how big it is, patch up the
+// section size field at the start of the section.
+void WasmObjectWriter::endSection(SectionBookkeeping &Section) {
+ uint64_t Size = getStream().tell() - Section.ContentsOffset;
+ if (uint32_t(Size) != Size)
+ report_fatal_error("section size does not fit in a uint32_t");
+
+ DEBUG(dbgs() << "endSection size=" << Size << "\n");
+ unsigned Padding = PaddingFor5ByteULEB128(Size);
+
+ // Write the final section size to the payload_len field, which follows
+ // the section id byte.
+ uint8_t Buffer[16];
+ unsigned SizeLen = encodeULEB128(Size, Buffer, Padding);
+ assert(SizeLen == 5);
+ getStream().pwrite((char *)Buffer, SizeLen, Section.SizeOffset);
+}
+
+// Emit the Wasm header.
+void WasmObjectWriter::writeHeader(const MCAssembler &Asm) {
+ writeBytes(StringRef(wasm::WasmMagic, sizeof(wasm::WasmMagic)));
+ writeLE32(wasm::WasmVersion);
+}
+
+void WasmObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+}
+
+void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ MCAsmBackend &Backend = Asm.getBackend();
+ bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsPCRel;
+ const auto &FixupSection = cast<MCSectionWasm>(*Fragment->getParent());
+ uint64_t C = Target.getConstant();
+ uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+ MCContext &Ctx = Asm.getContext();
+
+ if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
+ assert(RefB->getKind() == MCSymbolRefExpr::VK_None &&
+ "Should not have constructed this");
+
+ // Let A, B and C be the components of Target and R be the location of
+ // the fixup. If the fixup is not pcrel, we want to compute (A - B + C).
+ // If it is pcrel, we want to compute (A - B + C - R).
+
+ // In general, Wasm has no relocations for -B. It can only represent (A + C)
+ // or (A + C - R). If B = R + K and the relocation is not pcrel, we can
+ // replace B to implement it: (A - R - K + C)
+ if (IsPCRel) {
+ Ctx.reportError(
+ Fixup.getLoc(),
+ "No relocation available to represent this relative expression");
+ return;
+ }
+
+ const auto &SymB = cast<MCSymbolWasm>(RefB->getSymbol());
+
+ if (SymB.isUndefined()) {
+ Ctx.reportError(Fixup.getLoc(),
+ Twine("symbol '") + SymB.getName() +
+ "' can not be undefined in a subtraction expression");
+ return;
+ }
+
+ assert(!SymB.isAbsolute() && "Should have been folded");
+ const MCSection &SecB = SymB.getSection();
+ if (&SecB != &FixupSection) {
+ Ctx.reportError(Fixup.getLoc(),
+ "Cannot represent a difference across sections");
+ return;
+ }
+
+ uint64_t SymBOffset = Layout.getSymbolOffset(SymB);
+ uint64_t K = SymBOffset - FixupOffset;
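+ // Fold B into the addend and treat the fixup as pcrel:
+ // (A - B + C) == (A + (C - K)) - R, given B == R + K.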
+ IsPCRel = true;
+ C -= K;
+ }
+
+ // We either rejected the fixup or folded B into C at this point.
+ const MCSymbolRefExpr *RefA = Target.getSymA();
+ const auto *SymA = RefA ? cast<MCSymbolWasm>(&RefA->getSymbol()) : nullptr;
+
+ if (SymA && SymA->isVariable()) {
+ const MCExpr *Expr = SymA->getVariableValue();
+ const auto *Inner = cast<MCSymbolRefExpr>(Expr);
+ if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF)
+ llvm_unreachable("weakref used in reloc not yet implemented");
+ }
+
+ // Put any constant offset in an addend. Offsets can be negative, and
+ // LLVM expects wrapping, in contrast to wasm's immediates which can't
+ // be negative and don't wrap.
+ FixedValue = 0;
+
+ if (SymA)
+ SymA->setUsedInReloc();
+
+ assert(!IsPCRel);
+ assert(SymA);
+
+ unsigned Type = getRelocType(Target, Fixup);
+
+ WasmRelocationEntry Rec(FixupOffset, SymA, C, Type, &FixupSection);
+ DEBUG(dbgs() << "WasmReloc: " << Rec << "\n");
+
+ if (FixupSection.hasInstructions())
+ CodeRelocations.push_back(Rec);
+ else
+ DataRelocations.push_back(Rec);
+}
+
+// Write X as an (unsigned) LEB value at offset Offset in Stream, padded
+// to allow patching.
+static void
+WritePatchableLEB(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+ uint8_t Buffer[5];
+ unsigned Padding = PaddingFor5ByteULEB128(X);
+ unsigned SizeLen = encodeULEB128(X, Buffer, Padding);
+ assert(SizeLen == 5);
+ Stream.pwrite((char *)Buffer, SizeLen, Offset);
+}
+
+// Write X as a signed LEB value at offset Offset in Stream, padded
+// to allow patching.
+static void
+WritePatchableSLEB(raw_pwrite_stream &Stream, int32_t X, uint64_t Offset) {
+ uint8_t Buffer[5];
+ unsigned Padding = PaddingFor5ByteSLEB128(X);
+ unsigned SizeLen = encodeSLEB128(X, Buffer, Padding);
+ assert(SizeLen == 5);
+ Stream.pwrite((char *)Buffer, SizeLen, Offset);
+}
+
+// Write X as a plain integer value at offset Offset in Stream.
+static void WriteI32(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+ uint8_t Buffer[4];
+ support::endian::write32le(Buffer, X);
+ Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset);
+}
+
+// Compute a value to write into the code at the location covered
+// by RelEntry. This value isn't used by the static linker, since
+// we have addends; it just serves to make the code more readable
+// and to make standalone wasm modules directly usable.
+static uint32_t ProvisionalValue(const WasmRelocationEntry &RelEntry) {
+ const MCSymbolWasm *Sym = RelEntry.Symbol;
+
+ // For undefined symbols, use a hopefully invalid value.
+ if (!Sym->isDefined(/*SetUsed=*/false))
+ return UINT32_MAX;
+
+ const auto &Section = cast<MCSectionWasm>(RelEntry.Symbol->getSection(false));
+ uint64_t Address = Section.getSectionOffset() + RelEntry.Addend;
+
+ // Ignore overflow. LLVM allows address arithmetic to silently wrap.
+ uint32_t Value = Address;
+
+ return Value;
+}
+
+uint32_t WasmObjectWriter::getRelocationIndexValue(
+ const WasmRelocationEntry &RelEntry) {
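+ // Each relocation type resolves against a different index space: the table
+ // element space, the function/global index space, or the type index space.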
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ if (!IndirectSymbolIndices.count(RelEntry.Symbol))
+ report_fatal_error("symbol not found table index space: " +
+ RelEntry.Symbol->getName());
+ return IndirectSymbolIndices[RelEntry.Symbol];
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+ if (!SymbolIndices.count(RelEntry.Symbol))
+ report_fatal_error("symbol not found function/global index space: " +
+ RelEntry.Symbol->getName());
+ return SymbolIndices[RelEntry.Symbol];
+ case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
+ if (!TypeIndices.count(RelEntry.Symbol))
+ report_fatal_error("symbol not found in type index space: " +
+ RelEntry.Symbol->getName());
+ return TypeIndices[RelEntry.Symbol];
+ default:
+ llvm_unreachable("invalid relocation type");
+ }
+}
+
+// Apply the portions of the relocation records that we can handle ourselves
+// directly.
+void WasmObjectWriter::applyRelocations(
+ ArrayRef<WasmRelocationEntry> Relocations, uint64_t ContentsOffset) {
+ raw_pwrite_stream &Stream = getStream();
+ for (const WasmRelocationEntry &RelEntry : Relocations) {
+ uint64_t Offset = ContentsOffset +
+ RelEntry.FixupSection->getSectionOffset() +
+ RelEntry.Offset;
+
+ DEBUG(dbgs() << "applyRelocation: " << RelEntry << "\n");
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB: {
+ uint32_t Index = getRelocationIndexValue(RelEntry);
+ WritePatchableSLEB(Stream, Index, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: {
+ uint32_t Index = getRelocationIndexValue(RelEntry);
+ WriteI32(Stream, Index, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB: {
+ uint32_t Value = ProvisionalValue(RelEntry);
+ WritePatchableSLEB(Stream, Value, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB: {
+ uint32_t Value = ProvisionalValue(RelEntry);
+ WritePatchableLEB(Stream, Value, Offset);
+ break;
+ }
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32: {
+ uint32_t Value = ProvisionalValue(RelEntry);
+ WriteI32(Stream, Value, Offset);
+ break;
+ }
+ default:
+ llvm_unreachable("invalid relocation type");
+ }
+ }
+}
+
+// Write out the portions of the relocation records that the linker will
+// need to handle.
+void WasmObjectWriter::writeRelocations(
+ ArrayRef<WasmRelocationEntry> Relocations, uint64_t HeaderSize) {
+ raw_pwrite_stream &Stream = getStream();
+ for (const WasmRelocationEntry& RelEntry : Relocations) {
+
+ uint64_t Offset = RelEntry.Offset +
+ RelEntry.FixupSection->getSectionOffset() + HeaderSize;
+ uint32_t Index = getRelocationIndexValue(RelEntry);
+
+ encodeULEB128(RelEntry.Type, Stream);
+ encodeULEB128(Offset, Stream);
+ encodeULEB128(Index, Stream);
+ if (RelEntry.hasAddend())
+ encodeSLEB128(RelEntry.Addend, Stream);
+ }
+}
+
+void WasmObjectWriter::writeTypeSection(
+ const SmallVector<WasmFunctionType, 4> &FunctionTypes) {
+ if (FunctionTypes.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_TYPE);
+
+ encodeULEB128(FunctionTypes.size(), getStream());
+
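+ // Each entry is the `func` form: the param count and types followed by the
+ // return count and types.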
+ for (const WasmFunctionType &FuncTy : FunctionTypes) {
+ encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream());
+ encodeULEB128(FuncTy.Params.size(), getStream());
+ for (wasm::ValType Ty : FuncTy.Params)
+ writeValueType(Ty);
+ encodeULEB128(FuncTy.Returns.size(), getStream());
+ for (wasm::ValType Ty : FuncTy.Returns)
+ writeValueType(Ty);
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeImportSection(
+ const SmallVector<WasmImport, 4> &Imports) {
+ if (Imports.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_IMPORT);
+
+ encodeULEB128(Imports.size(), getStream());
+ for (const WasmImport &Import : Imports) {
+ writeString(Import.ModuleName);
+ writeString(Import.FieldName);
+
+ encodeULEB128(Import.Kind, getStream());
+
+ switch (Import.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ encodeULEB128(Import.Type, getStream());
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ encodeSLEB128(int32_t(Import.Type), getStream());
+ encodeULEB128(0, getStream()); // mutability
+ break;
+ default:
+ llvm_unreachable("unsupported import kind");
+ }
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeFunctionSection(
+ const SmallVector<WasmFunction, 4> &Functions) {
+ if (Functions.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_FUNCTION);
+
+ encodeULEB128(Functions.size(), getStream());
+ for (const WasmFunction &Func : Functions)
+ encodeULEB128(Func.Type, getStream());
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeTableSection(uint32_t NumElements) {
+ // For now, always emit the table section, since indirect calls are not
+ // valid without it. In the future, we could perhaps be more clever and omit
+ // it if there are no indirect calls.
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_TABLE);
+
+ encodeULEB128(1, getStream()); // The number of tables, fixed to 1 for now.
+ encodeSLEB128(wasm::WASM_TYPE_ANYFUNC, getStream()); // Type of table
+ encodeULEB128(0, getStream()); // flags
+ encodeULEB128(NumElements, getStream()); // initial
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeMemorySection(
+ const SmallVector<char, 0> &DataBytes) {
+ // For now, always emit the memory section, since loads and stores are not
+ // valid without it. In the future, we could perhaps be more clever and omit
+ // it if there are no loads or stores.
+ SectionBookkeeping Section;
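+ // Round the initialized-data size up to whole wasm pages.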
+ uint32_t NumPages =
+ (DataBytes.size() + wasm::WasmPageSize - 1) / wasm::WasmPageSize;
+
+ startSection(Section, wasm::WASM_SEC_MEMORY);
+ encodeULEB128(1, getStream()); // number of memory spaces
+
+ encodeULEB128(0, getStream()); // flags
+ encodeULEB128(NumPages, getStream()); // initial
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeGlobalSection(
+ const SmallVector<WasmGlobal, 4> &Globals) {
+ if (Globals.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_GLOBAL);
+
+ encodeULEB128(Globals.size(), getStream());
+ for (const WasmGlobal &Global : Globals) {
+ writeValueType(Global.Type);
+ write8(Global.IsMutable);
+
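+ // Emit the init expr: either a get_global of the imported global or an
+ // i32.const of the variable's address.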
+ if (Global.HasImport) {
+ assert(Global.InitialValue == 0);
+ write8(wasm::WASM_OPCODE_GET_GLOBAL);
+ encodeULEB128(Global.ImportIndex, getStream());
+ } else {
+ assert(Global.ImportIndex == 0);
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(Global.InitialValue, getStream()); // offset
+ }
+ write8(wasm::WASM_OPCODE_END);
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeExportSection(
+ const SmallVector<WasmExport, 4> &Exports) {
+ if (Exports.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_EXPORT);
+
+ encodeULEB128(Exports.size(), getStream());
+ for (const WasmExport &Export : Exports) {
+ writeString(Export.FieldName);
+ encodeSLEB128(Export.Kind, getStream());
+ encodeULEB128(Export.Index, getStream());
+ }
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeElemSection(
+ const SmallVector<uint32_t, 4> &TableElems) {
+ if (TableElems.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_ELEM);
+
+ encodeULEB128(1, getStream()); // number of "segments"
+ encodeULEB128(0, getStream()); // the table index
+
+ // init expr for starting offset
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(0, getStream());
+ write8(wasm::WASM_OPCODE_END);
+
+ encodeULEB128(TableElems.size(), getStream());
+ for (uint32_t Elem : TableElems)
+ encodeULEB128(Elem, getStream());
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeCodeSection(
+ const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const SmallVector<WasmFunction, 4> &Functions) {
+ if (Functions.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CODE);
+
+ encodeULEB128(Functions.size(), getStream());
+
+ for (const WasmFunction &Func : Functions) {
+ auto &FuncSection = static_cast<MCSectionWasm &>(Func.Sym->getSection());
+
+ int64_t Size = 0;
+ if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout))
+ report_fatal_error(".size expression must be evaluatable");
+
+ encodeULEB128(Size, getStream());
+
+ FuncSection.setSectionOffset(getStream().tell() - Section.ContentsOffset);
+
+ Asm.writeSectionData(&FuncSection, Layout);
+ }
+
+ // Apply fixups.
+ applyRelocations(CodeRelocations, Section.ContentsOffset);
+
+ endSection(Section);
+}
+
+uint64_t WasmObjectWriter::writeDataSection(
+ const SmallVector<char, 0> &DataBytes) {
+ if (DataBytes.empty())
+ return 0;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_DATA);
+
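+ // Emit a single segment at offset 0; everything before the raw bytes is
+ // the segment header, whose size the relocation sections need.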
+ encodeULEB128(1, getStream()); // count
+ encodeULEB128(0, getStream()); // memory index
+ write8(wasm::WASM_OPCODE_I32_CONST);
+ encodeSLEB128(0, getStream()); // offset
+ write8(wasm::WASM_OPCODE_END);
+ encodeULEB128(DataBytes.size(), getStream()); // size
+ uint32_t HeaderSize = getStream().tell() - Section.ContentsOffset;
+ writeBytes(DataBytes); // data
+
+ // Apply fixups.
+ applyRelocations(DataRelocations, Section.ContentsOffset + HeaderSize);
+
+ endSection(Section);
+ return HeaderSize;
+}
+
+void WasmObjectWriter::writeNameSection(
+ const SmallVector<WasmFunction, 4> &Functions,
+ const SmallVector<WasmImport, 4> &Imports,
+ uint32_t NumFuncImports) {
+ uint32_t TotalFunctions = NumFuncImports + Functions.size();
+ if (TotalFunctions == 0)
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "name");
+ SectionBookkeeping SubSection;
+ startSection(SubSection, wasm::WASM_NAMES_FUNCTION);
+
+ encodeULEB128(TotalFunctions, getStream());
+ uint32_t Index = 0;
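+ // Imported functions occupy the low end of the function index space, so
+ // name them before the functions defined in this module.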
+ for (const WasmImport &Import : Imports) {
+ if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+ encodeULEB128(Index, getStream());
+ writeString(Import.FieldName);
+ ++Index;
+ }
+ }
+ for (const WasmFunction &Func : Functions) {
+ encodeULEB128(Index, getStream());
+ writeString(Func.Sym->getName());
+ ++Index;
+ }
+
+ endSection(SubSection);
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeCodeRelocSection() {
+ // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+ // for descriptions of the reloc sections.
+
+ if (CodeRelocations.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE");
+
+ encodeULEB128(wasm::WASM_SEC_CODE, getStream());
+ encodeULEB128(CodeRelocations.size(), getStream());
+
+ writeRelocations(CodeRelocations, 0);
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeDataRelocSection(uint64_t DataSectionHeaderSize) {
+ // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+ // for descriptions of the reloc sections.
+
+ if (DataRelocations.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA");
+
+ encodeULEB128(wasm::WASM_SEC_DATA, getStream());
+ encodeULEB128(DataRelocations.size(), getStream());
+
+ writeRelocations(DataRelocations, DataSectionHeaderSize);
+
+ endSection(Section);
+}
+
+void WasmObjectWriter::writeLinkingMetaDataSection(
+ uint32_t DataSize, uint32_t DataAlignment, ArrayRef<StringRef> WeakSymbols,
+ bool HasStackPointer, uint32_t StackPointerGlobal) {
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_CUSTOM, "linking");
+ SectionBookkeeping SubSection;
+
+ if (HasStackPointer) {
+ startSection(SubSection, wasm::WASM_STACK_POINTER);
+ encodeULEB128(StackPointerGlobal, getStream()); // id
+ endSection(SubSection);
+ }
+
+ if (WeakSymbols.size() != 0) {
+ startSection(SubSection, wasm::WASM_SYMBOL_INFO);
+ encodeULEB128(WeakSymbols.size(), getStream());
+ for (const StringRef Export : WeakSymbols) {
+ writeString(Export);
+ encodeULEB128(wasm::WASM_SYMBOL_FLAG_WEAK, getStream());
+ }
+ endSection(SubSection);
+ }
+
+ if (DataSize > 0) {
+ startSection(SubSection, wasm::WASM_DATA_SIZE);
+ encodeULEB128(DataSize, getStream());
+ endSection(SubSection);
+
+ startSection(SubSection, wasm::WASM_DATA_ALIGNMENT);
+ encodeULEB128(DataAlignment, getStream());
+ endSection(SubSection);
+ }
+
+ endSection(Section);
+}
+
+uint32_t WasmObjectWriter::getFunctionType(const MCSymbolWasm& Symbol) {
+ assert(Symbol.isFunction());
+ assert(TypeIndices.count(&Symbol));
+ return TypeIndices[&Symbol];
+}
+
+uint32_t WasmObjectWriter::registerFunctionType(const MCSymbolWasm& Symbol) {
+ assert(Symbol.isFunction());
+
+ WasmFunctionType F;
+ if (Symbol.isVariable()) {
+ const MCExpr *Expr = Symbol.getVariableValue();
+ auto *Inner = cast<MCSymbolRefExpr>(Expr);
+ const auto *ResolvedSym = cast<MCSymbolWasm>(&Inner->getSymbol());
+ F.Returns = ResolvedSym->getReturns();
+ F.Params = ResolvedSym->getParams();
+ } else {
+ F.Returns = Symbol.getReturns();
+ F.Params = Symbol.getParams();
+ }
+
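+ // Deduplicate signatures: insert() only allocates a new type index the
+ // first time a given signature is seen.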
+ auto Pair =
+ FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+ if (Pair.second)
+ FunctionTypes.push_back(F);
+ TypeIndices[&Symbol] = Pair.first->second;
+
+ DEBUG(dbgs() << "registerFunctionType: " << Symbol << " new:" << Pair.second << "\n");
+ DEBUG(dbgs() << " -> type index: " << Pair.first->second << "\n");
+ return Pair.first->second;
+}
+
+void WasmObjectWriter::writeObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ DEBUG(dbgs() << "WasmObjectWriter::writeObject\n");
+ MCContext &Ctx = Asm.getContext();
+ wasm::ValType PtrType = is64Bit() ? wasm::ValType::I64 : wasm::ValType::I32;
+
+ // Collect information from the available symbols.
+ SmallVector<WasmFunction, 4> Functions;
+ SmallVector<uint32_t, 4> TableElems;
+ SmallVector<WasmGlobal, 4> Globals;
+ SmallVector<WasmImport, 4> Imports;
+ SmallVector<WasmExport, 4> Exports;
+ SmallVector<StringRef, 4> WeakSymbols;
+ SmallPtrSet<const MCSymbolWasm *, 4> IsAddressTaken;
+ unsigned NumFuncImports = 0;
+ unsigned NumGlobalImports = 0;
+ SmallVector<char, 0> DataBytes;
+ uint32_t DataAlignment = 1;
+ uint32_t StackPointerGlobal = 0;
+ bool HasStackPointer = false;
+
+ // Populate the IsAddressTaken set.
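+ // Symbols referenced by table-index or address-style relocations have
+ // their address taken and may need a slot in the indirect function table.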
+ for (const WasmRelocationEntry &RelEntry : CodeRelocations) {
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+ IsAddressTaken.insert(RelEntry.Symbol);
+ break;
+ default:
+ break;
+ }
+ }
+ for (const WasmRelocationEntry &RelEntry : DataRelocations) {
+ switch (RelEntry.Type) {
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+ IsAddressTaken.insert(RelEntry.Symbol);
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Populate the Imports set.
+ for (const MCSymbol &S : Asm.symbols()) {
+ const auto &WS = static_cast<const MCSymbolWasm &>(S);
+
+ if (WS.isTemporary())
+ continue;
+
+ if (WS.isFunction())
+ registerFunctionType(WS);
+
+ // If the symbol is not defined in this translation unit, import it.
+ if (!WS.isDefined(/*SetUsed=*/false) || WS.isVariable()) {
+ WasmImport Import;
+ Import.ModuleName = WS.getModuleName();
+ Import.FieldName = WS.getName();
+
+ if (WS.isFunction()) {
+ Import.Kind = wasm::WASM_EXTERNAL_FUNCTION;
+ Import.Type = getFunctionType(WS);
+ SymbolIndices[&WS] = NumFuncImports;
+ ++NumFuncImports;
+ } else {
+ Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+ Import.Type = int32_t(PtrType);
+ SymbolIndices[&WS] = NumGlobalImports;
+ ++NumGlobalImports;
+ }
+
+ Imports.push_back(Import);
+ }
+ }
+
+ // In the special .global_variables section, we've encoded global
+ // variables used by the function. Translate them into the Globals
+ // list.
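+ // Each record is: a value type byte, a mutability byte, and a has-import
+ // byte, followed either by two nul-terminated names (module, then base)
+ // for imports or by an SLEB128-encoded initial value.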
+ MCSectionWasm *GlobalVars = Ctx.getWasmSection(".global_variables", 0, 0);
+ if (!GlobalVars->getFragmentList().empty()) {
+ if (GlobalVars->getFragmentList().size() != 1)
+ report_fatal_error("only one .global_variables fragment supported");
+ const MCFragment &Frag = *GlobalVars->begin();
+ if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
+ report_fatal_error("only data supported in .global_variables");
+ const auto &DataFrag = cast<MCDataFragment>(Frag);
+ if (!DataFrag.getFixups().empty())
+ report_fatal_error("fixups not supported in .global_variables");
+ const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+ for (const uint8_t *p = (const uint8_t *)Contents.data(),
+ *end = (const uint8_t *)Contents.data() + Contents.size();
+ p != end; ) {
+ WasmGlobal G;
+ if (end - p < 3)
+ report_fatal_error("truncated global variable encoding");
+ G.Type = wasm::ValType(int8_t(*p++));
+ G.IsMutable = bool(*p++);
+ G.HasImport = bool(*p++);
+ if (G.HasImport) {
+ G.InitialValue = 0;
+
+ WasmImport Import;
+ Import.ModuleName = (const char *)p;
+ const uint8_t *nul = (const uint8_t *)memchr(p, '\0', end - p);
+ if (!nul)
+ report_fatal_error("global module name must be nul-terminated");
+ p = nul + 1;
+ nul = (const uint8_t *)memchr(p, '\0', end - p);
+ if (!nul)
+ report_fatal_error("global base name must be nul-terminated");
+ Import.FieldName = (const char *)p;
+ p = nul + 1;
+
+ Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+ Import.Type = int32_t(G.Type);
+
+ G.ImportIndex = NumGlobalImports;
+ ++NumGlobalImports;
+
+ Imports.push_back(Import);
+ } else {
+ unsigned n;
+ G.InitialValue = decodeSLEB128(p, &n);
+ G.ImportIndex = 0;
+ if ((ptrdiff_t)n > end - p)
+ report_fatal_error("global initial value must be valid SLEB128");
+ p += n;
+ }
+ Globals.push_back(G);
+ }
+ }
+
+ // In the special .stack_pointer section, we've encoded the stack pointer
+ // index.
+ MCSectionWasm *StackPtr = Ctx.getWasmSection(".stack_pointer", 0, 0);
+ if (!StackPtr->getFragmentList().empty()) {
+ if (StackPtr->getFragmentList().size() != 1)
+ report_fatal_error("only one .stack_pointer fragment supported");
+ const MCFragment &Frag = *StackPtr->begin();
+ if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
+ report_fatal_error("only data supported in .stack_pointer");
+ const auto &DataFrag = cast<MCDataFragment>(Frag);
+ if (!DataFrag.getFixups().empty())
+ report_fatal_error("fixups not supported in .stack_pointer");
+ const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+ if (Contents.size() != 4)
+ report_fatal_error("only one entry supported in .stack_pointer");
+ HasStackPointer = true;
+ StackPointerGlobal = NumGlobalImports + *(const int32_t *)Contents.data();
+ }
+
+ // Handle regular defined and undefined symbols.
+ for (const MCSymbol &S : Asm.symbols()) {
+ // Ignore unnamed temporary symbols, which aren't ever exported, imported,
+ // or used in relocations.
+ if (S.isTemporary() && S.getName().empty())
+ continue;
+
+ const auto &WS = static_cast<const MCSymbolWasm &>(S);
+ DEBUG(dbgs() << "MCSymbol: '" << S << "'"
+ << " isDefined=" << S.isDefined() << " isExternal="
+ << S.isExternal() << " isTemporary=" << S.isTemporary()
+ << " isFunction=" << WS.isFunction()
+ << " isWeak=" << WS.isWeak()
+ << " isVariable=" << WS.isVariable() << "\n");
+
+ if (WS.isWeak())
+ WeakSymbols.push_back(WS.getName());
+
+ if (WS.isVariable())
+ continue;
+
+ unsigned Index;
+
+ if (WS.isFunction()) {
+ if (WS.isDefined(/*SetUsed=*/false)) {
+ if (WS.getOffset() != 0)
+ report_fatal_error(
+ "function sections must contain one function each");
+
+ if (WS.getSize() == 0)
+ report_fatal_error(
+ "function symbols must have a size set with .size");
+
+ // A definition. Take the next available index.
+ Index = NumFuncImports + Functions.size();
+
+ // Prepare the function.
+ WasmFunction Func;
+ Func.Type = getFunctionType(WS);
+ Func.Sym = &WS;
+ SymbolIndices[&WS] = Index;
+ Functions.push_back(Func);
+ } else {
+ // An import; the index was assigned above.
+ Index = SymbolIndices.find(&WS)->second;
+ }
+
+ DEBUG(dbgs() << " -> function index: " << Index << "\n");
+
+ // If needed, prepare the function to be called indirectly.
+ if (IsAddressTaken.count(&WS) != 0) {
+ IndirectSymbolIndices[&WS] = TableElems.size();
+ DEBUG(dbgs() << " -> adding to table: " << TableElems.size() << "\n");
+ TableElems.push_back(Index);
+ }
+ } else {
+ if (WS.isTemporary() && !WS.getSize())
+ continue;
+
+ if (!WS.isDefined(/*SetUsed=*/false))
+ continue;
+
+ if (WS.getOffset() != 0)
+ report_fatal_error("data sections must contain one variable each: " +
+ WS.getName());
+ if (!WS.getSize())
+ report_fatal_error("data symbols must have a size set with .size: " +
+ WS.getName());
+
+ int64_t Size = 0;
+ if (!WS.getSize()->evaluateAsAbsolute(Size, Layout))
+ report_fatal_error(".size expression must be evaluatable");
+
+ auto &DataSection = static_cast<MCSectionWasm &>(WS.getSection());
+
+ if (uint64_t(Size) != Layout.getSectionFileSize(&DataSection))
+ report_fatal_error("data sections must contain at most one variable");
+
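+ // Zero-pad the accumulated data so this section starts at its required
+ // alignment.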
+ DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment()));
+ DataAlignment = std::max(DataAlignment, DataSection.getAlignment());
+
+ DataSection.setSectionOffset(DataBytes.size());
+
+ for (const MCFragment &Frag : DataSection) {
+ if (Frag.hasInstructions())
+ report_fatal_error("only data supported in data sections");
+
+ if (auto *Align = dyn_cast<MCAlignFragment>(&Frag)) {
+ if (Align->getValueSize() != 1)
+ report_fatal_error("only byte values supported for alignment");
+ // If nops are requested, use zeros, as this is the data section.
+ uint8_t Value = Align->hasEmitNops() ? 0 : Align->getValue();
+ uint64_t Size = std::min<uint64_t>(alignTo(DataBytes.size(),
+ Align->getAlignment()),
+ DataBytes.size() +
+ Align->getMaxBytesToEmit());
+ DataBytes.resize(Size, Value);
+ } else if (auto *Fill = dyn_cast<MCFillFragment>(&Frag)) {
+ DataBytes.insert(DataBytes.end(), Fill->getSize(), Fill->getValue());
+ } else {
+ const auto &DataFrag = cast<MCDataFragment>(Frag);
+ const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+
+ DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
+ }
+ }
+
+ // For each global, prepare a corresponding wasm global holding its
+ // address. For externals these will also be named exports.
+ Index = NumGlobalImports + Globals.size();
+
+ WasmGlobal Global;
+ Global.Type = PtrType;
+ Global.IsMutable = false;
+ Global.HasImport = false;
+ Global.InitialValue = DataSection.getSectionOffset();
+ Global.ImportIndex = 0;
+ SymbolIndices[&WS] = Index;
+ DEBUG(dbgs() << " -> global index: " << Index << "\n");
+ Globals.push_back(Global);
+ }
+
+ // If the symbol is visible outside this translation unit, export it.
+ if ((WS.isExternal() && WS.isDefined(/*SetUsed=*/false))) {
+ WasmExport Export;
+ Export.FieldName = WS.getName();
+ Export.Index = Index;
+ if (WS.isFunction())
+ Export.Kind = wasm::WASM_EXTERNAL_FUNCTION;
+ else
+ Export.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+ DEBUG(dbgs() << " -> export " << Exports.size() << "\n");
+ Exports.push_back(Export);
+ }
+ }
+
+ // Handle weak aliases. We need to process these in a separate pass because
+ // we need to have processed the target of the alias before the alias
+ // itself, and the symbols are not necessarily ordered in this way.
+ for (const MCSymbol &S : Asm.symbols()) {
+ if (!S.isVariable())
+ continue;
+ assert(S.isDefined(/*SetUsed=*/false));
+
+ const auto &WS = static_cast<const MCSymbolWasm &>(S);
+ // Find the target symbol of this weak alias and export that index
+ const MCExpr *Expr = WS.getVariableValue();
+ auto *Inner = cast<MCSymbolRefExpr>(Expr);
+ const auto *ResolvedSym = cast<MCSymbolWasm>(&Inner->getSymbol());
+ DEBUG(dbgs() << WS.getName() << ": weak alias of '" << *ResolvedSym << "'\n");
+ assert(SymbolIndices.count(ResolvedSym) > 0);
+ uint32_t Index = SymbolIndices.find(ResolvedSym)->second;
+ DEBUG(dbgs() << " -> index:" << Index << "\n");
+
+ WasmExport Export;
+ Export.FieldName = WS.getName();
+ Export.Index = Index;
+ if (WS.isFunction())
+ Export.Kind = wasm::WASM_EXTERNAL_FUNCTION;
+ else
+ Export.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+ DEBUG(dbgs() << " -> export " << Exports.size() << "\n");
+ Exports.push_back(Export);
+ }
+
+ // Add types for indirect function calls.
+ for (const WasmRelocationEntry &Fixup : CodeRelocations) {
+ if (Fixup.Type != wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB)
+ continue;
+
+ registerFunctionType(*Fixup.Symbol);
+ }
+
+ // Write out the Wasm header.
+ writeHeader(Asm);
+
+ writeTypeSection(FunctionTypes);
+ writeImportSection(Imports);
+ writeFunctionSection(Functions);
+ writeTableSection(TableElems.size());
+ writeMemorySection(DataBytes);
+ writeGlobalSection(Globals);
+ writeExportSection(Exports);
+ // TODO: Start Section
+ writeElemSection(TableElems);
+ writeCodeSection(Asm, Layout, Functions);
+ uint64_t DataSectionHeaderSize = writeDataSection(DataBytes);
+ writeNameSection(Functions, Imports, NumFuncImports);
+ writeCodeRelocSection();
+ writeDataRelocSection(DataSectionHeaderSize);
+ writeLinkingMetaDataSection(DataBytes.size(), DataAlignment, WeakSymbols,
+ HasStackPointer, StackPointerGlobal);
+
+ // TODO: Translate the .comment section to the output.
+ // TODO: Translate debug sections to the output.
+}
+
+MCObjectWriter *llvm::createWasmObjectWriter(MCWasmObjectTargetWriter *MOTW,
+ raw_pwrite_stream &OS) {
+ return new WasmObjectWriter(MOTW, OS);
+}
diff --git a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
index afc5c6a..956ae70 100644
--- a/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/WinCOFFObjectWriter.cpp -------------------------*- C++ -*-===//
+//===- llvm/MC/WinCOFFObjectWriter.cpp ------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,38 +11,51 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/Config/config.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolCOFF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/MC/StringTableBuilder.h"
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/JamCRC.h"
-#include <cstdio>
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
#include <ctime>
+#include <memory>
+#include <string>
+#include <vector>
using namespace llvm;
+using llvm::support::endian::write32le;
#define DEBUG_TYPE "WinCOFFObjectWriter"
namespace {
-typedef SmallString<COFF::NameSize> name;
+
+using name = SmallString<COFF::NameSize>;
enum AuxiliaryType {
ATFunctionDefinition,
@@ -57,25 +70,24 @@ struct AuxSymbol {
COFF::Auxiliary Aux;
};
-class COFFSymbol;
class COFFSection;
class COFFSymbol {
public:
- COFF::symbol Data;
+ COFF::symbol Data = {};
- typedef SmallVector<AuxSymbol, 1> AuxiliarySymbols;
+ using AuxiliarySymbols = SmallVector<AuxSymbol, 1>;
name Name;
int Index;
AuxiliarySymbols Aux;
- COFFSymbol *Other;
- COFFSection *Section;
- int Relocations;
+ COFFSymbol *Other = nullptr;
+ COFFSection *Section = nullptr;
+ int Relocations = 0;
+ const MCSymbol *MC = nullptr;
- const MCSymbol *MC;
+ COFFSymbol(StringRef Name) : Name(Name) {}
- COFFSymbol(StringRef name);
void set_name_offset(uint32_t Offset);
int64_t getIndex() const { return Index; }
@@ -89,39 +101,40 @@ public:
// This class contains staging data for a COFF relocation entry.
struct COFFRelocation {
COFF::relocation Data;
- COFFSymbol *Symb;
+ COFFSymbol *Symb = nullptr;
+
+ COFFRelocation() = default;
- COFFRelocation() : Symb(nullptr) {}
static size_t size() { return COFF::RelocationSize; }
};
-typedef std::vector<COFFRelocation> relocations;
+using relocations = std::vector<COFFRelocation>;
class COFFSection {
public:
- COFF::section Header;
+ COFF::section Header = {};
std::string Name;
int Number;
- MCSectionCOFF const *MCSection;
- COFFSymbol *Symbol;
+ MCSectionCOFF const *MCSection = nullptr;
+ COFFSymbol *Symbol = nullptr;
relocations Relocations;
- COFFSection(StringRef name);
+ COFFSection(StringRef Name) : Name(Name) {}
};
class WinCOFFObjectWriter : public MCObjectWriter {
public:
- typedef std::vector<std::unique_ptr<COFFSymbol>> symbols;
- typedef std::vector<std::unique_ptr<COFFSection>> sections;
+ using symbols = std::vector<std::unique_ptr<COFFSymbol>>;
+ using sections = std::vector<std::unique_ptr<COFFSection>>;
- typedef DenseMap<MCSymbol const *, COFFSymbol *> symbol_map;
- typedef DenseMap<MCSection const *, COFFSection *> section_map;
+ using symbol_map = DenseMap<MCSymbol const *, COFFSymbol *>;
+ using section_map = DenseMap<MCSection const *, COFFSection *>;
std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
// Root level file contents.
- COFF::header Header;
+ COFF::header Header = {};
sections Sections;
symbols Symbols;
StringTableBuilder Strings{StringTableBuilder::WinCOFF};
@@ -149,9 +162,6 @@ public:
COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol *Symbol);
COFFSection *createSection(StringRef Name);
- template <typename object_t, typename list_t>
- object_t *createCOFFEntity(StringRef Name, list_t &List);
-
void defineSection(MCSectionCOFF const &Sec);
COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol);
@@ -168,8 +178,12 @@ public:
void WriteFileHeader(const COFF::header &Header);
void WriteSymbol(const COFFSymbol &S);
void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S);
- void writeSectionHeader(const COFF::section &S);
+ void writeSectionHeaders();
void WriteRelocation(const COFF::relocation &R);
+ uint32_t writeSectionContents(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCSection &MCSec);
+ void writeSection(MCAssembler &Asm, const MCAsmLayout &Layout,
+ const COFFSection &Sec, const MCSection &MCSec);
// MCObjectWriter interface implementation.
@@ -181,45 +195,28 @@ public:
const MCFragment &FB, bool InSet,
bool IsPCRel) const override;
- bool isWeak(const MCSymbol &Sym) const override;
-
void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, bool &IsPCRel,
- uint64_t &FixedValue) override;
+ MCValue Target, uint64_t &FixedValue) override;
+
+ void createFileSymbols(MCAssembler &Asm);
+ void assignSectionNumbers();
+ void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout);
void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
};
-}
-static inline void write_uint32_le(void *Data, uint32_t Value) {
- support::endian::write<uint32_t, support::little, support::unaligned>(Data,
- Value);
-}
+} // end anonymous namespace
//------------------------------------------------------------------------------
// Symbol class implementation
-COFFSymbol::COFFSymbol(StringRef name)
- : Name(name.begin(), name.end()), Other(nullptr), Section(nullptr),
- Relocations(0), MC(nullptr) {
- memset(&Data, 0, sizeof(Data));
-}
-
// In the case that the name does not fit within 8 bytes, the offset
// into the string table is stored in the last 4 bytes instead, leaving
// the first 4 bytes as 0.
void COFFSymbol::set_name_offset(uint32_t Offset) {
- write_uint32_le(Data.Name + 0, 0);
- write_uint32_le(Data.Name + 4, Offset);
-}
-
-//------------------------------------------------------------------------------
-// Section class implementation
-
-COFFSection::COFFSection(StringRef name)
- : Name(name), MCSection(nullptr), Symbol(nullptr) {
- memset(&Header, 0, sizeof(Header));
+ write32le(Data.Name + 0, 0);
+ write32le(Data.Name + 4, Offset);
}
//------------------------------------------------------------------------------
@@ -228,115 +225,92 @@ COFFSection::COFFSection(StringRef name)
WinCOFFObjectWriter::WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
raw_pwrite_stream &OS)
: MCObjectWriter(OS, true), TargetObjectWriter(MOTW) {
- memset(&Header, 0, sizeof(Header));
-
Header.Machine = TargetObjectWriter->getMachine();
}
COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
- return createCOFFEntity<COFFSymbol>(Name, Symbols);
+ Symbols.push_back(make_unique<COFFSymbol>(Name));
+ return Symbols.back().get();
}
COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol *Symbol) {
- symbol_map::iterator i = SymbolMap.find(Symbol);
- if (i != SymbolMap.end())
- return i->second;
- COFFSymbol *RetSymbol =
- createCOFFEntity<COFFSymbol>(Symbol->getName(), Symbols);
- SymbolMap[Symbol] = RetSymbol;
- return RetSymbol;
+ COFFSymbol *&Ret = SymbolMap[Symbol];
+ if (!Ret)
+ Ret = createSymbol(Symbol->getName());
+ return Ret;
}
COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) {
- return createCOFFEntity<COFFSection>(Name, Sections);
+ Sections.emplace_back(make_unique<COFFSection>(Name));
+ return Sections.back().get();
}
-/// A template used to lookup or create a symbol/section, and initialize it if
-/// needed.
-template <typename object_t, typename list_t>
-object_t *WinCOFFObjectWriter::createCOFFEntity(StringRef Name, list_t &List) {
- List.push_back(make_unique<object_t>(Name));
-
- return List.back().get();
-}
-
-/// This function takes a section data object from the assembler
-/// and creates the associated COFF section staging object.
-void WinCOFFObjectWriter::defineSection(MCSectionCOFF const &Sec) {
- COFFSection *coff_section = createSection(Sec.getSectionName());
- COFFSymbol *coff_symbol = createSymbol(Sec.getSectionName());
- if (Sec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
- if (const MCSymbol *S = Sec.getCOMDATSymbol()) {
- COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
- if (COMDATSymbol->Section)
- report_fatal_error("two sections have the same comdat");
- COMDATSymbol->Section = coff_section;
- }
- }
-
- coff_section->Symbol = coff_symbol;
- coff_symbol->Section = coff_section;
- coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
-
- // In this case the auxiliary symbol is a Section Definition.
- coff_symbol->Aux.resize(1);
- memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
- coff_symbol->Aux[0].AuxType = ATSectionDefinition;
- coff_symbol->Aux[0].Aux.SectionDefinition.Selection = Sec.getSelection();
-
- coff_section->Header.Characteristics = Sec.getCharacteristics();
-
- uint32_t &Characteristics = coff_section->Header.Characteristics;
+static uint32_t getAlignment(const MCSectionCOFF &Sec) {
switch (Sec.getAlignment()) {
case 1:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_1BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_1BYTES;
case 2:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_2BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_2BYTES;
case 4:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_4BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_4BYTES;
case 8:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_8BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_8BYTES;
case 16:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_16BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_16BYTES;
case 32:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_32BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_32BYTES;
case 64:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_64BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_64BYTES;
case 128:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_128BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_128BYTES;
case 256:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_256BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_256BYTES;
case 512:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_512BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_512BYTES;
case 1024:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_1024BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_1024BYTES;
case 2048:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_2048BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_2048BYTES;
case 4096:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_4096BYTES;
- break;
+ return COFF::IMAGE_SCN_ALIGN_4096BYTES;
case 8192:
- Characteristics |= COFF::IMAGE_SCN_ALIGN_8192BYTES;
- break;
- default:
- llvm_unreachable("unsupported section alignment");
+ return COFF::IMAGE_SCN_ALIGN_8192BYTES;
+ }
+ llvm_unreachable("unsupported section alignment");
+}
+
+/// This function takes a section data object from the assembler
+/// and creates the associated COFF section staging object.
+void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) {
+ COFFSection *Section = createSection(MCSec.getSectionName());
+ COFFSymbol *Symbol = createSymbol(MCSec.getSectionName());
+ Section->Symbol = Symbol;
+ Symbol->Section = Section;
+ Symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+
+ // Create a COMDAT symbol if needed.
+ if (MCSec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+ if (const MCSymbol *S = MCSec.getCOMDATSymbol()) {
+ COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
+ if (COMDATSymbol->Section)
+ report_fatal_error("two sections have the same comdat");
+ COMDATSymbol->Section = Section;
+ }
}
+ // In this case the auxiliary symbol is a Section Definition.
+ Symbol->Aux.resize(1);
+ Symbol->Aux[0] = {};
+ Symbol->Aux[0].AuxType = ATSectionDefinition;
+ Symbol->Aux[0].Aux.SectionDefinition.Selection = MCSec.getSelection();
+
+ // Set section alignment.
+ Section->Header.Characteristics = MCSec.getCharacteristics();
+ Section->Header.Characteristics |= getAlignment(MCSec);
+
// Bind internal COFF section to MC section.
- coff_section->MCSection = &Sec;
- SectionMap[&Sec] = coff_section;
+ Section->MCSection = &MCSec;
+ SectionMap[&MCSec] = Section;
}
static uint64_t getSymbolValue(const MCSymbol &Symbol,
@@ -368,25 +342,25 @@ COFFSymbol *WinCOFFObjectWriter::getLinkedSymbol(const MCSymbol &Symbol) {
/// This function takes a symbol data object from the assembler
/// and creates the associated COFF symbol staging object.
-void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
+void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym,
MCAssembler &Assembler,
const MCAsmLayout &Layout) {
- COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
- const MCSymbol *Base = Layout.getBaseSymbol(Symbol);
+ COFFSymbol *Sym = GetOrCreateCOFFSymbol(&MCSym);
+ const MCSymbol *Base = Layout.getBaseSymbol(MCSym);
COFFSection *Sec = nullptr;
if (Base && Base->getFragment()) {
Sec = SectionMap[Base->getFragment()->getParent()];
- if (coff_symbol->Section && coff_symbol->Section != Sec)
+ if (Sym->Section && Sym->Section != Sec)
report_fatal_error("conflicting sections for symbol");
}
COFFSymbol *Local = nullptr;
- if (cast<MCSymbolCOFF>(Symbol).isWeakExternal()) {
- coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+ if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) {
+ Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
- COFFSymbol *WeakDefault = getLinkedSymbol(Symbol);
+ COFFSymbol *WeakDefault = getLinkedSymbol(MCSym);
if (!WeakDefault) {
- std::string WeakName = (".weak." + Symbol.getName() + ".default").str();
+ std::string WeakName = (".weak." + MCSym.getName() + ".default").str();
WeakDefault = createSymbol(WeakName);
if (!Sec)
WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
@@ -395,41 +369,41 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
Local = WeakDefault;
}
- coff_symbol->Other = WeakDefault;
+ Sym->Other = WeakDefault;
// Setup the Weak External auxiliary symbol.
- coff_symbol->Aux.resize(1);
- memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
- coff_symbol->Aux[0].AuxType = ATWeakExternal;
- coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = 0;
- coff_symbol->Aux[0].Aux.WeakExternal.Characteristics =
+ Sym->Aux.resize(1);
+ memset(&Sym->Aux[0], 0, sizeof(Sym->Aux[0]));
+ Sym->Aux[0].AuxType = ATWeakExternal;
+ Sym->Aux[0].Aux.WeakExternal.TagIndex = 0;
+ Sym->Aux[0].Aux.WeakExternal.Characteristics =
COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY;
} else {
if (!Base)
- coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+ Sym->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
else
- coff_symbol->Section = Sec;
- Local = coff_symbol;
+ Sym->Section = Sec;
+ Local = Sym;
}
if (Local) {
- Local->Data.Value = getSymbolValue(Symbol, Layout);
+ Local->Data.Value = getSymbolValue(MCSym, Layout);
- const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(Symbol);
+ const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(MCSym);
Local->Data.Type = SymbolCOFF.getType();
Local->Data.StorageClass = SymbolCOFF.getClass();
// If no storage class was specified in the streamer, define it here.
if (Local->Data.StorageClass == COFF::IMAGE_SYM_CLASS_NULL) {
- bool IsExternal = Symbol.isExternal() ||
- (!Symbol.getFragment() && !Symbol.isVariable());
+ bool IsExternal = MCSym.isExternal() ||
+ (!MCSym.getFragment() && !MCSym.isVariable());
Local->Data.StorageClass = IsExternal ? COFF::IMAGE_SYM_CLASS_EXTERNAL
: COFF::IMAGE_SYM_CLASS_STATIC;
}
}
- coff_symbol->MC = &Symbol;
+ Sym->MC = &MCSym;
}
// Maximum offsets for different string table entry encodings.
@@ -459,24 +433,25 @@ static void encodeBase64StringEntry(char *Buffer, uint64_t Value) {
}
void WinCOFFObjectWriter::SetSectionName(COFFSection &S) {
- if (S.Name.size() > COFF::NameSize) {
- uint64_t StringTableEntry = Strings.getOffset(S.Name);
-
- if (StringTableEntry <= Max7DecimalOffset) {
- SmallVector<char, COFF::NameSize> Buffer;
- Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
- assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
-
- std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
- } else if (StringTableEntry <= MaxBase64Offset) {
- // Starting with 10,000,000, offsets are encoded as base64.
- encodeBase64StringEntry(S.Header.Name, StringTableEntry);
- } else {
- report_fatal_error("COFF string table is greater than 64 GB.");
- }
- } else {
+ if (S.Name.size() <= COFF::NameSize) {
std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
+ return;
+ }
+
+ uint64_t StringTableEntry = Strings.getOffset(S.Name);
+ if (StringTableEntry <= Max7DecimalOffset) {
+ SmallVector<char, COFF::NameSize> Buffer;
+ Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
+ assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
+ std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
+ return;
}
+ if (StringTableEntry <= MaxBase64Offset) {
+ // Starting with 10,000,000, offsets are encoded as base64.
+ encodeBase64StringEntry(S.Header.Name, StringTableEntry);
+ return;
+ }
+ report_fatal_error("COFF string table is greater than 64 GB.");
}
void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) {
@@ -583,18 +558,37 @@ void WinCOFFObjectWriter::WriteAuxiliarySymbols(
}
}
-void WinCOFFObjectWriter::writeSectionHeader(const COFF::section &S) {
- writeBytes(StringRef(S.Name, COFF::NameSize));
-
- writeLE32(S.VirtualSize);
- writeLE32(S.VirtualAddress);
- writeLE32(S.SizeOfRawData);
- writeLE32(S.PointerToRawData);
- writeLE32(S.PointerToRelocations);
- writeLE32(S.PointerToLineNumbers);
- writeLE16(S.NumberOfRelocations);
- writeLE16(S.NumberOfLineNumbers);
- writeLE32(S.Characteristics);
+// Write the section header.
+void WinCOFFObjectWriter::writeSectionHeaders() {
+ // Section numbers must be monotonically increasing in the section
+ // header, but our Sections array is not sorted by section number,
+ // so make a copy of Sections and sort it.
+ std::vector<COFFSection *> Arr;
+ for (auto &Section : Sections)
+ Arr.push_back(Section.get());
+ std::sort(Arr.begin(), Arr.end(),
+ [](const COFFSection *A, const COFFSection *B) {
+ return A->Number < B->Number;
+ });
+
+ for (auto &Section : Arr) {
+ if (Section->Number == -1)
+ continue;
+
+ COFF::section &S = Section->Header;
+ if (Section->Relocations.size() >= 0xffff)
+ S.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
+ writeBytes(StringRef(S.Name, COFF::NameSize));
+ writeLE32(S.VirtualSize);
+ writeLE32(S.VirtualAddress);
+ writeLE32(S.SizeOfRawData);
+ writeLE32(S.PointerToRawData);
+ writeLE32(S.PointerToRelocations);
+ writeLE32(S.PointerToLineNumbers);
+ writeLE16(S.NumberOfRelocations);
+ writeLE16(S.NumberOfLineNumbers);
+ writeLE32(S.Characteristics);
+ }
}
void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
@@ -603,6 +597,87 @@ void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
writeLE16(R.Type);
}
+// Write MCSec's contents. What this function does is essentially
+// "Asm.writeSectionData(&MCSec, Layout)", but it's a bit complicated
+// because it needs to compute a CRC.
+uint32_t WinCOFFObjectWriter::writeSectionContents(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCSection &MCSec) {
+ // Save the contents of the section to a temporary buffer; we need this
+ // to CRC the data before we dump it into the object file.
+ SmallVector<char, 128> Buf;
+ raw_svector_ostream VecOS(Buf);
+ raw_pwrite_stream &OldStream = getStream();
+
+ // Redirect the output stream to our buffer and fill our buffer with
+ // the section data.
+ setStream(VecOS);
+ Asm.writeSectionData(&MCSec, Layout);
+
+ // Reset the stream back to what it was before.
+ setStream(OldStream);
+
+ // Write the section contents to the object file.
+ getStream() << Buf;
+
+ // Calculate our CRC with an initial value of '0'; this is not how JamCRC
+ // is specified, but it aligns with the expected output.
+ JamCRC JC(/*Init=*/0);
+ JC.update(Buf);
+ return JC.getCRC();
+}
+
+void WinCOFFObjectWriter::writeSection(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const COFFSection &Sec,
+ const MCSection &MCSec) {
+ if (Sec.Number == -1)
+ return;
+
+ // Write the section contents.
+ if (Sec.Header.PointerToRawData != 0) {
+ assert(getStream().tell() <= Sec.Header.PointerToRawData &&
+ "Section::PointerToRawData is insane!");
+
+ unsigned PaddingSize = Sec.Header.PointerToRawData - getStream().tell();
+ assert(PaddingSize < 4 &&
+ "Should only need at most three bytes of padding!");
+ WriteZeros(PaddingSize);
+
+ uint32_t CRC = writeSectionContents(Asm, Layout, MCSec);
+
+ // Update the section definition auxiliary symbol to record the CRC.
+ COFFSection *Sec = SectionMap[&MCSec];
+ COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux;
+ assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition);
+ AuxSymbol &SecDef = AuxSyms[0];
+ SecDef.Aux.SectionDefinition.CheckSum = CRC;
+ }
+
+ // Write relocations for this section.
+ if (Sec.Relocations.empty()) {
+ assert(Sec.Header.PointerToRelocations == 0 &&
+ "Section::PointerToRelocations is insane!");
+ return;
+ }
+
+ assert(getStream().tell() == Sec.Header.PointerToRelocations &&
+ "Section::PointerToRelocations is insane!");
+
+ if (Sec.Relocations.size() >= 0xffff) {
+ // In case of overflow, write the actual relocation count as the first
+ // relocation, including the synthetic reloc itself (+ 1).
+ COFF::relocation R;
+ R.VirtualAddress = Sec.Relocations.size() + 1;
+ R.SymbolTableIndex = 0;
+ R.Type = 0;
+ WriteRelocation(R);
+ }
+
+ for (const auto &Relocation : Sec.Relocations)
+ WriteRelocation(Relocation.Data);
+}
+
////////////////////////////////////////////////////////////////////////////////
// MCObjectWriter interface implementations
@@ -632,26 +707,11 @@ bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
InSet, IsPCRel);
}
-bool WinCOFFObjectWriter::isWeak(const MCSymbol &Sym) const {
- if (!Sym.isExternal())
- return false;
-
- if (!Sym.isInSection())
- return false;
-
- const auto &Sec = cast<MCSectionCOFF>(Sym.getSection());
- if (!Sec.getCOMDATSymbol())
- return false;
-
- // It looks like for COFF it is invalid to replace a reference to a global
- // in a comdat with a reference to a local.
- // FIXME: Add a specification reference if available.
- return true;
-}
-
-void WinCOFFObjectWriter::recordRelocation(
- MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) {
+void WinCOFFObjectWriter::recordRelocation(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
assert(Target.getSymA() && "Relocation must reference a symbol!");
const MCSymbol &A = Target.getSymA()->getSymbol();
@@ -668,15 +728,14 @@ void WinCOFFObjectWriter::recordRelocation(
return;
}
- MCSection *Section = Fragment->getParent();
+ MCSection *MCSec = Fragment->getParent();
// Mark this symbol as requiring an entry in the symbol table.
- assert(SectionMap.find(Section) != SectionMap.end() &&
+ assert(SectionMap.find(MCSec) != SectionMap.end() &&
"Section must already have been defined in executePostLayoutBinding!");
- COFFSection *coff_section = SectionMap[Section];
+ COFFSection *Sec = SectionMap[MCSec];
const MCSymbolRefExpr *SymB = Target.getSymB();
- bool CrossSection = false;
if (SymB) {
const MCSymbol *B = &SymB->getSymbol();
@@ -688,28 +747,9 @@ void WinCOFFObjectWriter::recordRelocation(
return;
}
- if (!A.getFragment()) {
- Asm.getContext().reportError(
- Fixup.getLoc(),
- Twine("symbol '") + A.getName() +
- "' can not be undefined in a subtraction expression");
- return;
- }
-
- CrossSection = &A.getSection() != &B->getSection();
-
// Offset of the symbol in the section
int64_t OffsetOfB = Layout.getSymbolOffset(*B);
- // In the case where we have SymbA and SymB, we just need to store the delta
- // between the two symbols. Update FixedValue to account for the delta, and
- // skip recording the relocation.
- if (!CrossSection) {
- int64_t OffsetOfA = Layout.getSymbolOffset(A);
- FixedValue = (OffsetOfA - OffsetOfB) + Target.getConstant();
- return;
- }
-
// Offset of the relocation in the section
int64_t OffsetOfRelocation =
Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
@@ -725,7 +765,7 @@ void WinCOFFObjectWriter::recordRelocation(
Reloc.Data.VirtualAddress = Layout.getFragmentOffset(Fragment);
// Turn relocations for temporary symbols into section relocations.
- if (A.isTemporary() || CrossSection) {
+ if (A.isTemporary()) {
MCSection *TargetSection = &A.getSection();
assert(
SectionMap.find(TargetSection) != SectionMap.end() &&
@@ -743,7 +783,7 @@ void WinCOFFObjectWriter::recordRelocation(
Reloc.Data.VirtualAddress += Fixup.getOffset();
Reloc.Data.Type = TargetObjectWriter->getRelocType(
- Target, Fixup, CrossSection, Asm.getBackend());
+ Asm.getContext(), Target, Fixup, SymB, Asm.getBackend());
// FIXME: Can anyone explain what this does other than adjust for the size
// of the offset?
@@ -796,46 +836,31 @@ void WinCOFFObjectWriter::recordRelocation(
FixedValue = 0;
if (TargetObjectWriter->recordRelocation(Fixup))
- coff_section->Relocations.push_back(Reloc);
+ Sec->Relocations.push_back(Reloc);
}
-void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
- const MCAsmLayout &Layout) {
- size_t SectionsSize = Sections.size();
- if (SectionsSize > static_cast<size_t>(INT32_MAX))
- report_fatal_error(
- "PE COFF object files can't have more than 2147483647 sections");
-
- // Assign symbol and section indexes and offsets.
- int32_t NumberOfSections = static_cast<int32_t>(SectionsSize);
-
- UseBigObj = NumberOfSections > COFF::MaxNumberOfSections16;
-
- // Assign section numbers.
- size_t Number = 1;
- for (const auto &Section : Sections) {
- Section->Number = Number;
- Section->Symbol->Data.SectionNumber = Number;
- Section->Symbol->Aux[0].Aux.SectionDefinition.Number = Number;
- ++Number;
- }
-
- Header.NumberOfSections = NumberOfSections;
- Header.NumberOfSymbols = 0;
+static std::time_t getTime() {
+ std::time_t Now = time(nullptr);
+ if (Now < 0 || !isUInt<32>(Now))
+ return UINT32_MAX;
+ return Now;
+}
+// Create .file symbols.
+void WinCOFFObjectWriter::createFileSymbols(MCAssembler &Asm) {
for (const std::string &Name : Asm.getFileNames()) {
// round up to calculate the number of auxiliary symbols required
unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
unsigned Count = (Name.size() + SymbolSize - 1) / SymbolSize;
- COFFSymbol *file = createSymbol(".file");
- file->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
- file->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
- file->Aux.resize(Count);
+ COFFSymbol *File = createSymbol(".file");
+ File->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
+ File->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
+ File->Aux.resize(Count);
unsigned Offset = 0;
unsigned Length = Name.size();
- for (auto &Aux : file->Aux) {
+ for (auto &Aux : File->Aux) {
Aux.AuxType = ATFile;
if (Length > SymbolSize) {
@@ -850,6 +875,109 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
Offset += SymbolSize;
}
}
+}
+
+static bool isAssociative(const COFFSection &Section) {
+ return Section.Symbol->Aux[0].Aux.SectionDefinition.Selection ==
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+}
+
+void WinCOFFObjectWriter::assignSectionNumbers() {
+ size_t I = 1;
+ auto Assign = [&](COFFSection &Section) {
+ Section.Number = I;
+ Section.Symbol->Data.SectionNumber = I;
+ Section.Symbol->Aux[0].Aux.SectionDefinition.Number = I;
+ ++I;
+ };
+
+ // Although it is not explicitly requested by the Microsoft COFF spec,
+ // we should avoid emitting forward associative section references,
+ // because MSVC link.exe as of 2017 cannot handle that.
+ for (const std::unique_ptr<COFFSection> &Section : Sections)
+ if (!isAssociative(*Section))
+ Assign(*Section);
+ for (const std::unique_ptr<COFFSection> &Section : Sections)
+ if (isAssociative(*Section))
+ Assign(*Section);
+}
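
One way to state the property this two-pass numbering guarantees: an associative section never receives a lower number than the section it is associated with. A hypothetical checker for that invariant (Sec and its fields are invented stand-ins, and the reading of "forward reference" is inferred from the comment above):

    #include <vector>

    // Invented stand-in: an associative section records its target's number.
    struct Sec { bool Associative; int Number; int TargetNumber; };

    // Verify no associative section precedes its target, i.e. link.exe never
    // sees a forward associative reference.
    static bool noForwardAssociativeRefs(const std::vector<Sec> &Secs) {
      for (const Sec &S : Secs)
        if (S.Associative && S.Number < S.TargetNumber)
          return false;
      return true;
    }

    int main() {
      std::vector<Sec> Secs = {{false, 1, 0}, {true, 2, 1}};
      return noForwardAssociativeRefs(Secs) ? 0 : 1;
    }
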
+
+// Assign file offsets to COFF object file structures.
+void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ unsigned Offset = getInitialOffset();
+
+ Offset += UseBigObj ? COFF::Header32Size : COFF::Header16Size;
+ Offset += COFF::SectionSize * Header.NumberOfSections;
+
+ for (const auto &Section : Asm) {
+ COFFSection *Sec = SectionMap[&Section];
+
+ if (Sec->Number == -1)
+ continue;
+
+ Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
+
+ if (IsPhysicalSection(Sec)) {
+ // Align the section data to a four byte boundary.
+ Offset = alignTo(Offset, 4);
+ Sec->Header.PointerToRawData = Offset;
+
+ Offset += Sec->Header.SizeOfRawData;
+ }
+
+ if (!Sec->Relocations.empty()) {
+ bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
+
+ if (RelocationsOverflow) {
+ // Signal overflow by setting NumberOfRelocations to max value. Actual
+ // size is found in reloc #0. Microsoft tools understand this.
+ Sec->Header.NumberOfRelocations = 0xffff;
+ } else {
+ Sec->Header.NumberOfRelocations = Sec->Relocations.size();
+ }
+ Sec->Header.PointerToRelocations = Offset;
+
+ if (RelocationsOverflow) {
+ // Reloc #0 will contain actual count, so make room for it.
+ Offset += COFF::RelocationSize;
+ }
+
+ Offset += COFF::RelocationSize * Sec->Relocations.size();
+
+ for (auto &Relocation : Sec->Relocations) {
+ assert(Relocation.Symb->getIndex() != -1);
+ Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
+ }
+ }
+
+ assert(Sec->Symbol->Aux.size() == 1 &&
+ "Section's symbol must have one aux!");
+ AuxSymbol &Aux = Sec->Symbol->Aux[0];
+ assert(Aux.AuxType == ATSectionDefinition &&
+ "Section's symbol's aux symbol must be a Section Definition!");
+ Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
+ Aux.Aux.SectionDefinition.NumberOfRelocations =
+ Sec->Header.NumberOfRelocations;
+ Aux.Aux.SectionDefinition.NumberOfLinenumbers =
+ Sec->Header.NumberOfLineNumbers;
+ }
+
+ Header.PointerToSymbolTable = Offset;
+}
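
The arithmetic above fixes the whole file layout before anything is written: header, section-header table, each section's 4-byte-aligned raw data plus its relocation records, then the symbol table. A reduced sketch of the same computation; the constants passed in main match the usual COFF sizes but are placeholders here:

    #include <cstdint>
    #include <vector>

    struct SecSizes { uint64_t RawData; uint64_t NumRelocs; };

    // Compute where the symbol table lands, mirroring assignFileOffsets:
    // each section contributes 4-byte-aligned raw data plus its relocations.
    static uint64_t symbolTableOffset(uint64_t HeaderSize, uint64_t SecHdrSize,
                                      uint64_t RelocSize,
                                      const std::vector<SecSizes> &Secs) {
      uint64_t Off = HeaderSize + SecHdrSize * Secs.size();
      for (const SecSizes &S : Secs) {
        Off = (Off + 3) & ~uint64_t(3); // align raw data to a 4-byte boundary
        Off += S.RawData + RelocSize * S.NumRelocs;
      }
      return Off;
    }

    int main() {
      // 20-byte header, 40-byte section headers, 10-byte relocs (placeholders).
      return symbolTableOffset(20, 40, 10, {{10, 2}}) == 90 ? 0 : 1;
    }
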
+
+void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ if (Sections.size() > INT32_MAX)
+ report_fatal_error(
+ "PE COFF object files can't have more than 2147483647 sections");
+
+ UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
+ Header.NumberOfSections = Sections.size();
+ Header.NumberOfSymbols = 0;
+
+ assignSectionNumbers();
+ createFileSymbols(Asm);
for (auto &Symbol : Symbols) {
// Update section number & offset for symbols that have them.
@@ -912,78 +1040,12 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
Section->Symbol->Aux[0].Aux.SectionDefinition.Number = Assoc->Number;
}
- // Assign file offsets to COFF object file structures.
-
- unsigned offset = getInitialOffset();
-
- if (UseBigObj)
- offset += COFF::Header32Size;
- else
- offset += COFF::Header16Size;
- offset += COFF::SectionSize * Header.NumberOfSections;
-
- for (const auto &Section : Asm) {
- COFFSection *Sec = SectionMap[&Section];
-
- if (Sec->Number == -1)
- continue;
-
- Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
-
- if (IsPhysicalSection(Sec)) {
- // Align the section data to a four byte boundary.
- offset = alignTo(offset, 4);
- Sec->Header.PointerToRawData = offset;
-
- offset += Sec->Header.SizeOfRawData;
- }
-
- if (Sec->Relocations.size() > 0) {
- bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
-
- if (RelocationsOverflow) {
- // Signal overflow by setting NumberOfRelocations to max value. Actual
- // size is found in reloc #0. Microsoft tools understand this.
- Sec->Header.NumberOfRelocations = 0xffff;
- } else {
- Sec->Header.NumberOfRelocations = Sec->Relocations.size();
- }
- Sec->Header.PointerToRelocations = offset;
-
- if (RelocationsOverflow) {
- // Reloc #0 will contain actual count, so make room for it.
- offset += COFF::RelocationSize;
- }
-
- offset += COFF::RelocationSize * Sec->Relocations.size();
-
- for (auto &Relocation : Sec->Relocations) {
- assert(Relocation.Symb->getIndex() != -1);
- Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
- }
- }
-
- assert(Sec->Symbol->Aux.size() == 1 &&
- "Section's symbol must have one aux!");
- AuxSymbol &Aux = Sec->Symbol->Aux[0];
- assert(Aux.AuxType == ATSectionDefinition &&
- "Section's symbol's aux symbol must be a Section Definition!");
- Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
- Aux.Aux.SectionDefinition.NumberOfRelocations =
- Sec->Header.NumberOfRelocations;
- Aux.Aux.SectionDefinition.NumberOfLinenumbers =
- Sec->Header.NumberOfLineNumbers;
- }
-
- Header.PointerToSymbolTable = offset;
+ assignFileOffsets(Asm, Layout);
// MS LINK expects to be able to use this timestamp to implement their
// /INCREMENTAL feature.
if (Asm.isIncrementalLinkerCompatible()) {
- std::time_t Now = time(nullptr);
- if (Now < 0 || !isUInt<32>(Now))
- Now = UINT32_MAX;
- Header.TimeDateStamp = Now;
+ Header.TimeDateStamp = getTime();
} else {
// Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU.
Header.TimeDateStamp = 0;
@@ -991,96 +1053,25 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
// Write it all to disk...
WriteFileHeader(Header);
+ writeSectionHeaders();
- {
- sections::iterator i, ie;
- MCAssembler::iterator j, je;
-
- for (auto &Section : Sections) {
- if (Section->Number != -1) {
- if (Section->Relocations.size() >= 0xffff)
- Section->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
- writeSectionHeader(Section->Header);
- }
- }
-
- SmallVector<char, 128> SectionContents;
- for (i = Sections.begin(), ie = Sections.end(), j = Asm.begin(),
- je = Asm.end();
- (i != ie) && (j != je); ++i, ++j) {
-
- if ((*i)->Number == -1)
- continue;
-
- if ((*i)->Header.PointerToRawData != 0) {
- assert(getStream().tell() <= (*i)->Header.PointerToRawData &&
- "Section::PointerToRawData is insane!");
-
- unsigned SectionDataPadding =
- (*i)->Header.PointerToRawData - getStream().tell();
- assert(SectionDataPadding < 4 &&
- "Should only need at most three bytes of padding!");
-
- WriteZeros(SectionDataPadding);
-
- // Save the contents of the section to a temporary buffer, we need this
- // to CRC the data before we dump it into the object file.
- SectionContents.clear();
- raw_svector_ostream VecOS(SectionContents);
- raw_pwrite_stream &OldStream = getStream();
- // Redirect the output stream to our buffer.
- setStream(VecOS);
- // Fill our buffer with the section data.
- Asm.writeSectionData(&*j, Layout);
- // Reset the stream back to what it was before.
- setStream(OldStream);
-
- // Calculate our CRC with an initial value of '0', this is not how
- // JamCRC is specified but it aligns with the expected output.
- JamCRC JC(/*Init=*/0x00000000U);
- JC.update(SectionContents);
-
- // Write the section contents to the object file.
- getStream() << SectionContents;
-
- // Update the section definition auxiliary symbol to record the CRC.
- COFFSection *Sec = SectionMap[&*j];
- COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux;
- assert(AuxSyms.size() == 1 &&
- AuxSyms[0].AuxType == ATSectionDefinition);
- AuxSymbol &SecDef = AuxSyms[0];
- SecDef.Aux.SectionDefinition.CheckSum = JC.getCRC();
- }
-
- if ((*i)->Relocations.size() > 0) {
- assert(getStream().tell() == (*i)->Header.PointerToRelocations &&
- "Section::PointerToRelocations is insane!");
-
- if ((*i)->Relocations.size() >= 0xffff) {
- // In case of overflow, write actual relocation count as first
- // relocation. Including the synthetic reloc itself (+ 1).
- COFF::relocation r;
- r.VirtualAddress = (*i)->Relocations.size() + 1;
- r.SymbolTableIndex = 0;
- r.Type = 0;
- WriteRelocation(r);
- }
-
- for (const auto &Relocation : (*i)->Relocations)
- WriteRelocation(Relocation.Data);
- } else
- assert((*i)->Header.PointerToRelocations == 0 &&
- "Section::PointerToRelocations is insane!");
- }
- }
+ // Write section contents.
+ sections::iterator I = Sections.begin();
+ sections::iterator IE = Sections.end();
+ MCAssembler::iterator J = Asm.begin();
+ MCAssembler::iterator JE = Asm.end();
+ for (; I != IE && J != JE; ++I, ++J)
+ writeSection(Asm, Layout, **I, *J);
assert(getStream().tell() == Header.PointerToSymbolTable &&
"Header::PointerToSymbolTable is insane!");
+ // Write a symbol table.
for (auto &Symbol : Symbols)
if (Symbol->getIndex() != -1)
WriteSymbol(*Symbol);
+ // Write a string table, which completes the entire COFF file.
Strings.write(getStream());
}
diff --git a/contrib/llvm/lib/Object/Archive.cpp b/contrib/llvm/lib/Object/Archive.cpp
index f2021f7..977cccc 100644
--- a/contrib/llvm/lib/Object/Archive.cpp
+++ b/contrib/llvm/lib/Object/Archive.cpp
@@ -1,4 +1,4 @@
-//===- Archive.cpp - ar File Format implementation --------------*- C++ -*-===//
+//===- Archive.cpp - ar File Format implementation ------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,11 +12,28 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/Archive.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Support/Chrono.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <system_error>
using namespace llvm;
using namespace object;
@@ -25,7 +42,7 @@ using namespace llvm::support::endian;
static const char *const Magic = "!<arch>\n";
static const char *const ThinMagic = "!<thin>\n";
-void Archive::anchor() { }
+void Archive::anchor() {}
static Error
malformedError(Twine Msg) {
@@ -61,8 +78,8 @@ ArchiveMemberHeader::ArchiveMemberHeader(const Archive *Parent,
if (Err) {
std::string Buf;
raw_string_ostream OS(Buf);
- OS.write_escaped(llvm::StringRef(ArMemHdr->Terminator,
- sizeof(ArMemHdr->Terminator)));
+ OS.write_escaped(StringRef(ArMemHdr->Terminator,
+ sizeof(ArMemHdr->Terminator)));
OS.flush();
std::string Msg("terminator characters in archive member \"" + Buf +
"\" not the correct \"`\\n\" values for the archive "
@@ -97,13 +114,13 @@ Expected<StringRef> ArchiveMemberHeader::getRawName() const {
EndCond = ' ';
else
EndCond = '/';
- llvm::StringRef::size_type end =
- llvm::StringRef(ArMemHdr->Name, sizeof(ArMemHdr->Name)).find(EndCond);
- if (end == llvm::StringRef::npos)
+ StringRef::size_type end =
+ StringRef(ArMemHdr->Name, sizeof(ArMemHdr->Name)).find(EndCond);
+ if (end == StringRef::npos)
end = sizeof(ArMemHdr->Name);
assert(end <= sizeof(ArMemHdr->Name) && end > 0);
// Don't include the EndCond if there is one.
- return llvm::StringRef(ArMemHdr->Name, end);
+ return StringRef(ArMemHdr->Name, end);
}
// This gets the name looking up long names. Size is the size of the archive
@@ -205,12 +222,12 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const {
Expected<uint32_t> ArchiveMemberHeader::getSize() const {
uint32_t Ret;
- if (llvm::StringRef(ArMemHdr->Size,
- sizeof(ArMemHdr->Size)).rtrim(" ").getAsInteger(10, Ret)) {
+ if (StringRef(ArMemHdr->Size,
+ sizeof(ArMemHdr->Size)).rtrim(" ").getAsInteger(10, Ret)) {
std::string Buf;
raw_string_ostream OS(Buf);
- OS.write_escaped(llvm::StringRef(ArMemHdr->Size,
- sizeof(ArMemHdr->Size)).rtrim(" "));
+ OS.write_escaped(StringRef(ArMemHdr->Size,
+ sizeof(ArMemHdr->Size)).rtrim(" "));
OS.flush();
uint64_t Offset = reinterpret_cast<const char *>(ArMemHdr) -
Parent->getData().data();
@@ -227,8 +244,8 @@ Expected<sys::fs::perms> ArchiveMemberHeader::getAccessMode() const {
sizeof(ArMemHdr->AccessMode)).rtrim(' ').getAsInteger(8, Ret)) {
std::string Buf;
raw_string_ostream OS(Buf);
- OS.write_escaped(llvm::StringRef(ArMemHdr->AccessMode,
- sizeof(ArMemHdr->AccessMode)).rtrim(" "));
+ OS.write_escaped(StringRef(ArMemHdr->AccessMode,
+ sizeof(ArMemHdr->AccessMode)).rtrim(" "));
OS.flush();
uint64_t Offset = reinterpret_cast<const char *>(ArMemHdr) -
Parent->getData().data();
@@ -247,8 +264,8 @@ ArchiveMemberHeader::getLastModified() const {
.getAsInteger(10, Seconds)) {
std::string Buf;
raw_string_ostream OS(Buf);
- OS.write_escaped(llvm::StringRef(ArMemHdr->LastModified,
- sizeof(ArMemHdr->LastModified)).rtrim(" "));
+ OS.write_escaped(StringRef(ArMemHdr->LastModified,
+ sizeof(ArMemHdr->LastModified)).rtrim(" "));
OS.flush();
uint64_t Offset = reinterpret_cast<const char *>(ArMemHdr) -
Parent->getData().data();
diff --git a/contrib/llvm/lib/Object/ArchiveWriter.cpp b/contrib/llvm/lib/Object/ArchiveWriter.cpp
index f8e3c5a..b052c76 100644
--- a/contrib/llvm/lib/Object/ArchiveWriter.cpp
+++ b/contrib/llvm/lib/Object/ArchiveWriter.cpp
@@ -14,6 +14,7 @@
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Magic.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ObjectFile.h"
@@ -35,7 +36,8 @@
using namespace llvm;
NewArchiveMember::NewArchiveMember(MemoryBufferRef BufRef)
- : Buf(MemoryBuffer::getMemBuffer(BufRef, false)) {}
+ : Buf(MemoryBuffer::getMemBuffer(BufRef, false)),
+ MemberName(BufRef.getBufferIdentifier()) {}
Expected<NewArchiveMember>
NewArchiveMember::getOldMember(const object::Archive::Child &OldMember,
@@ -47,6 +49,7 @@ NewArchiveMember::getOldMember(const object::Archive::Child &OldMember,
NewArchiveMember M;
assert(M.IsNew == false);
M.Buf = MemoryBuffer::getMemBuffer(*BufOrErr, false);
+ M.MemberName = M.Buf->getBufferIdentifier();
if (!Deterministic) {
auto ModTimeOrErr = OldMember.getLastModified();
if (!ModTimeOrErr)
@@ -96,6 +99,7 @@ Expected<NewArchiveMember> NewArchiveMember::getFile(StringRef FileName,
NewArchiveMember M;
M.IsNew = true;
M.Buf = std::move(*MemberBufferOrErr);
+ M.MemberName = M.Buf->getBufferIdentifier();
if (!Deterministic) {
M.ModTime = std::chrono::time_point_cast<std::chrono::seconds>(
Status.getLastModificationTime());
@@ -122,12 +126,27 @@ static void printWithSpacePadding(raw_fd_ostream &OS, T Data, unsigned Size,
}
}
+static bool isBSDLike(object::Archive::Kind Kind) {
+ switch (Kind) {
+ case object::Archive::K_GNU:
+ return false;
+ case object::Archive::K_BSD:
+ case object::Archive::K_DARWIN:
+ return true;
+ case object::Archive::K_MIPS64:
+ case object::Archive::K_DARWIN64:
+ case object::Archive::K_COFF:
+ break;
+ }
+  llvm_unreachable("not supported for writing");
+}
+
static void print32(raw_ostream &Out, object::Archive::Kind Kind,
uint32_t Val) {
- if (Kind == object::Archive::K_GNU)
- support::endian::Writer<support::big>(Out).write(Val);
- else
+ if (isBSDLike(Kind))
support::endian::Writer<support::little>(Out).write(Val);
+ else
+ support::endian::Writer<support::big>(Out).write(Val);
}
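
isBSDLike replaces scattered K_GNU/K_BSD checks and makes the endianness rule explicit: GNU archive symbol tables are big-endian, BSD-like ones little-endian. A standalone equivalent of print32 using iostreams instead of LLVM's endian Writer:

    #include <cstdint>
    #include <iostream>

    // Write V in the byte order the archive flavor calls for: little-endian
    // for BSD-like archives, big-endian for GNU ones.
    static void write32(std::ostream &OS, uint32_t V, bool BSDLike) {
      char B[4];
      for (int I = 0; I < 4; ++I) {
        int Shift = BSDLike ? 8 * I : 8 * (3 - I);
        B[I] = char((V >> Shift) & 0xff);
      }
      OS.write(B, 4);
    }

    int main() {
      write32(std::cout, 0x01020304, /*BSDLike=*/false); // emits 01 02 03 04
    }
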
static void printRestOfMemberHeader(
@@ -169,7 +188,7 @@ printBSDMemberHeader(raw_fd_ostream &Out, StringRef Name,
}
static bool useStringTable(bool Thin, StringRef Name) {
- return Thin || Name.size() >= 16;
+ return Thin || Name.size() >= 16 || Name.contains('/');
}
static void
@@ -178,7 +197,7 @@ printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind, bool Thin,
std::vector<unsigned>::iterator &StringMapIndexIter,
const sys::TimePoint<std::chrono::seconds> &ModTime,
unsigned UID, unsigned GID, unsigned Perms, unsigned Size) {
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
return printBSDMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
if (!useStringTable(Thin, Name))
return printGNUSmallMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
@@ -223,7 +242,7 @@ static void writeStringTable(raw_fd_ostream &Out, StringRef ArcName,
unsigned StartOffset = 0;
for (const NewArchiveMember &M : Members) {
StringRef Path = M.Buf->getBufferIdentifier();
- StringRef Name = sys::path::filename(Path);
+ StringRef Name = M.MemberName;
if (!useStringTable(Thin, Name))
continue;
if (StartOffset == 0) {
@@ -275,7 +294,7 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
MemoryBufferRef MemberBuffer = Members[MemberNum].Buf->getMemBufferRef();
Expected<std::unique_ptr<object::SymbolicFile>> ObjOrErr =
object::SymbolicFile::createSymbolicFile(
- MemberBuffer, sys::fs::file_magic::unknown, &Context);
+ MemberBuffer, llvm::file_magic::unknown, &Context);
if (!ObjOrErr) {
// FIXME: check only for "not an object file" errors.
consumeError(ObjOrErr.takeError());
@@ -285,10 +304,10 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
if (!HeaderStartOffset) {
HeaderStartOffset = Out.tell();
- if (Kind == object::Archive::K_GNU)
- printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, 0);
- else
+ if (isBSDLike(Kind))
printBSDMemberHeader(Out, "__.SYMDEF", now(Deterministic), 0, 0, 0, 0);
+ else
+ printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, 0);
BodyStartOffset = Out.tell();
print32(Out, Kind, 0); // number of entries or bytes
}
@@ -299,7 +318,8 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
continue;
if (!(Symflags & object::SymbolRef::SF_Global))
continue;
- if (Symflags & object::SymbolRef::SF_Undefined)
+ if (Symflags & object::SymbolRef::SF_Undefined &&
+ !(Symflags & object::SymbolRef::SF_Indirect))
continue;
unsigned NameOffset = NameOS.tell();
@@ -307,7 +327,7 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
return EC;
NameOS << '\0';
MemberOffsetRefs.push_back(MemberNum);
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
print32(Out, Kind, NameOffset);
print32(Out, Kind, 0); // member offset
}
@@ -316,10 +336,21 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
if (HeaderStartOffset == 0)
return 0;
+  // ld64 prefers the cctools-type archive, which pads its string table to a
+  // boundary of sizeof(int32_t).
+ if (isBSDLike(Kind))
+ for (unsigned P = OffsetToAlignment(NameOS.tell(), sizeof(int32_t)); P--;)
+ NameOS << '\0';
+
StringRef StringTable = NameOS.str();
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
print32(Out, Kind, StringTable.size()); // byte count of the string table
Out << StringTable;
+ // If there are no symbols, emit an empty symbol table, to satisfy Solaris
+ // tools, older versions of which expect a symbol table in a non-empty
+ // archive, regardless of whether there are any symbols in it.
+ if (StringTable.size() == 0)
+ print32(Out, Kind, 0);
// ld64 requires the next member header to start at an offset that is
// 4 bytes aligned.
@@ -336,10 +367,10 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
// Patch up the number of symbols.
Out.seek(BodyStartOffset);
unsigned NumSyms = MemberOffsetRefs.size();
- if (Kind == object::Archive::K_GNU)
- print32(Out, Kind, NumSyms);
- else
+ if (isBSDLike(Kind))
print32(Out, Kind, NumSyms * 8);
+ else
+ print32(Out, Kind, NumSyms);
Out.seek(Pos);
return BodyStartOffset + 4;
@@ -351,8 +382,7 @@ llvm::writeArchive(StringRef ArcName,
bool WriteSymtab, object::Archive::Kind Kind,
bool Deterministic, bool Thin,
std::unique_ptr<MemoryBuffer> OldArchiveBuf) {
- assert((!Thin || Kind == object::Archive::K_GNU) &&
- "Only the gnu format has a thin mode");
+ assert((!Thin || !isBSDLike(Kind)) && "Only the gnu format has a thin mode");
SmallString<128> TmpArchive;
int TmpArchiveFD;
if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
@@ -368,10 +398,6 @@ llvm::writeArchive(StringRef ArcName,
std::vector<unsigned> MemberOffsetRefs;
- std::vector<std::unique_ptr<MemoryBuffer>> Buffers;
- std::vector<MemoryBufferRef> Members;
- std::vector<sys::fs::file_status> NewMemberStatus;
-
unsigned MemberReferenceOffset = 0;
if (WriteSymtab) {
ErrorOr<unsigned> MemberReferenceOffsetOrErr = writeSymbolTable(
@@ -382,25 +408,34 @@ llvm::writeArchive(StringRef ArcName,
}
std::vector<unsigned> StringMapIndexes;
- if (Kind != object::Archive::K_BSD)
+ if (!isBSDLike(Kind))
writeStringTable(Out, ArcName, NewMembers, StringMapIndexes, Thin);
std::vector<unsigned>::iterator StringMapIndexIter = StringMapIndexes.begin();
std::vector<unsigned> MemberOffset;
for (const NewArchiveMember &M : NewMembers) {
MemoryBufferRef File = M.Buf->getMemBufferRef();
+ unsigned Padding = 0;
unsigned Pos = Out.tell();
MemberOffset.push_back(Pos);
- printMemberHeader(Out, Kind, Thin,
- sys::path::filename(M.Buf->getBufferIdentifier()),
- StringMapIndexIter, M.ModTime, M.UID, M.GID, M.Perms,
- M.Buf->getBufferSize());
+ // ld64 expects the members to be 8-byte aligned for 64-bit content and at
+ // least 4-byte aligned for 32-bit content. Opt for the larger encoding
+ // uniformly. This matches the behaviour with cctools and ensures that ld64
+ // is happy with archives that we generate.
+ if (Kind == object::Archive::K_DARWIN)
+ Padding = OffsetToAlignment(M.Buf->getBufferSize(), 8);
+
+ printMemberHeader(Out, Kind, Thin, M.MemberName, StringMapIndexIter,
+ M.ModTime, M.UID, M.GID, M.Perms,
+ M.Buf->getBufferSize() + Padding);
if (!Thin)
Out << File.getBuffer();
+ while (Padding--)
+ Out << '\n';
if (Out.tell() % 2)
Out << '\n';
}
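
The Darwin member padding above is OffsetToAlignment(Size, 8): the byte count needed to reach the next 8-byte boundary, written out as '\n' characters after the member data. The same quantity, standalone:

    #include <cstdint>

    // Padding appended after a K_DARWIN member so the next member header
    // starts 8-byte aligned; equivalent to OffsetToAlignment(MemberSize, 8).
    static uint64_t darwinPadding(uint64_t MemberSize) {
      return (8 - MemberSize % 8) % 8;
    }

    int main() { return darwinPadding(13) == 3 ? 0 : 1; }
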
@@ -408,7 +443,7 @@ llvm::writeArchive(StringRef ArcName,
if (MemberReferenceOffset) {
Out.seek(MemberReferenceOffset);
for (unsigned MemberNum : MemberOffsetRefs) {
- if (Kind == object::Archive::K_BSD)
+ if (isBSDLike(Kind))
Out.seek(Out.tell() + 4); // skip over the string offset
print32(Out, Kind, MemberOffset[MemberNum]);
}
diff --git a/contrib/llvm/lib/Object/Binary.cpp b/contrib/llvm/lib/Object/Binary.cpp
index 8467d34..c4565db 100644
--- a/contrib/llvm/lib/Object/Binary.cpp
+++ b/contrib/llvm/lib/Object/Binary.cpp
@@ -1,4 +1,4 @@
-//===- Binary.cpp - A generic binary file -----------------------*- C++ -*-===//
+//===- Binary.cpp - A generic binary file ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,19 +13,25 @@
#include "llvm/Object/Binary.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
-
-// Include headers for createBinary.
+#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Object/Archive.h"
+#include "llvm/Object/Error.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/WindowsResource.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <algorithm>
+#include <memory>
+#include <system_error>
using namespace llvm;
using namespace object;
-Binary::~Binary() {}
+Binary::~Binary() = default;
Binary::Binary(unsigned int Type, MemoryBufferRef Source)
: TypeID(Type), Data(Source) {}
@@ -38,40 +44,41 @@ MemoryBufferRef Binary::getMemoryBufferRef() const { return Data; }
Expected<std::unique_ptr<Binary>> object::createBinary(MemoryBufferRef Buffer,
LLVMContext *Context) {
- sys::fs::file_magic Type = sys::fs::identify_magic(Buffer.getBuffer());
+ file_magic Type = identify_magic(Buffer.getBuffer());
switch (Type) {
- case sys::fs::file_magic::archive:
- return Archive::create(Buffer);
- case sys::fs::file_magic::elf:
- case sys::fs::file_magic::elf_relocatable:
- case sys::fs::file_magic::elf_executable:
- case sys::fs::file_magic::elf_shared_object:
- case sys::fs::file_magic::elf_core:
- case sys::fs::file_magic::macho_object:
- case sys::fs::file_magic::macho_executable:
- case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
- case sys::fs::file_magic::macho_core:
- case sys::fs::file_magic::macho_preload_executable:
- case sys::fs::file_magic::macho_dynamically_linked_shared_lib:
- case sys::fs::file_magic::macho_dynamic_linker:
- case sys::fs::file_magic::macho_bundle:
- case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
- case sys::fs::file_magic::macho_dsym_companion:
- case sys::fs::file_magic::macho_kext_bundle:
- case sys::fs::file_magic::coff_object:
- case sys::fs::file_magic::coff_import_library:
- case sys::fs::file_magic::pecoff_executable:
- case sys::fs::file_magic::bitcode:
- case sys::fs::file_magic::wasm_object:
- return ObjectFile::createSymbolicFile(Buffer, Type, Context);
- case sys::fs::file_magic::macho_universal_binary:
- return MachOUniversalBinary::create(Buffer);
- case sys::fs::file_magic::unknown:
- case sys::fs::file_magic::coff_cl_gl_object:
- case sys::fs::file_magic::windows_resource:
- // Unrecognized object file format.
- return errorCodeToError(object_error::invalid_file_type);
+ case file_magic::archive:
+ return Archive::create(Buffer);
+ case file_magic::elf:
+ case file_magic::elf_relocatable:
+ case file_magic::elf_executable:
+ case file_magic::elf_shared_object:
+ case file_magic::elf_core:
+ case file_magic::macho_object:
+ case file_magic::macho_executable:
+ case file_magic::macho_fixed_virtual_memory_shared_lib:
+ case file_magic::macho_core:
+ case file_magic::macho_preload_executable:
+ case file_magic::macho_dynamically_linked_shared_lib:
+ case file_magic::macho_dynamic_linker:
+ case file_magic::macho_bundle:
+ case file_magic::macho_dynamically_linked_shared_lib_stub:
+ case file_magic::macho_dsym_companion:
+ case file_magic::macho_kext_bundle:
+ case file_magic::coff_object:
+ case file_magic::coff_import_library:
+ case file_magic::pecoff_executable:
+ case file_magic::bitcode:
+ case file_magic::wasm_object:
+ return ObjectFile::createSymbolicFile(Buffer, Type, Context);
+ case file_magic::macho_universal_binary:
+ return MachOUniversalBinary::create(Buffer);
+ case file_magic::windows_resource:
+ return WindowsResource::createWindowsResource(Buffer);
+ case file_magic::unknown:
+ case file_magic::coff_cl_gl_object:
+ // Unrecognized object file format.
+ return errorCodeToError(object_error::invalid_file_type);
}
llvm_unreachable("Unexpected Binary File Type");
}
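
Beyond the sys::fs::file_magic to llvm::file_magic migration, the functional change here is that windows_resource buffers now route to WindowsResource::createWindowsResource instead of failing. The function's shape is sniff-then-dispatch; a reduced standalone sketch using the two archive signatures visible in Archive.cpp above:

    #include <string>

    enum class FileKind { Archive, ThinArchive, Other };

    // Identify a buffer by its leading magic bytes and route accordingly,
    // a two-case miniature of the switch in createBinary.
    static FileKind identify(const std::string &Buf) {
      if (Buf.compare(0, 8, "!<arch>\n") == 0)
        return FileKind::Archive;
      if (Buf.compare(0, 8, "!<thin>\n") == 0)
        return FileKind::ThinArchive;
      return FileKind::Other;
    }

    int main() {
      return identify("!<thin>\n...") == FileKind::ThinArchive ? 0 : 1;
    }
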
diff --git a/contrib/llvm/lib/Object/COFFImportFile.cpp b/contrib/llvm/lib/Object/COFFImportFile.cpp
new file mode 100644
index 0000000..ff03946
--- /dev/null
+++ b/contrib/llvm/lib/Object/COFFImportFile.cpp
@@ -0,0 +1,612 @@
+//===- COFFImportFile.cpp - COFF short import file implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the writeImportLibrary function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/COFFImportFile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
+
+#include <cstdint>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+using namespace llvm::COFF;
+using namespace llvm::object;
+using namespace llvm;
+
+namespace llvm {
+namespace object {
+
+static bool is32bit(MachineTypes Machine) {
+ switch (Machine) {
+ default:
+ llvm_unreachable("unsupported machine");
+ case IMAGE_FILE_MACHINE_AMD64:
+ return false;
+ case IMAGE_FILE_MACHINE_ARMNT:
+ case IMAGE_FILE_MACHINE_I386:
+ return true;
+ }
+}
+
+static uint16_t getImgRelRelocation(MachineTypes Machine) {
+ switch (Machine) {
+ default:
+ llvm_unreachable("unsupported machine");
+ case IMAGE_FILE_MACHINE_AMD64:
+ return IMAGE_REL_AMD64_ADDR32NB;
+ case IMAGE_FILE_MACHINE_ARMNT:
+ return IMAGE_REL_ARM_ADDR32NB;
+ case IMAGE_FILE_MACHINE_I386:
+ return IMAGE_REL_I386_DIR32NB;
+ }
+}
+
+template <class T> static void append(std::vector<uint8_t> &B, const T &Data) {
+ size_t S = B.size();
+ B.resize(S + sizeof(T));
+ memcpy(&B[S], &Data, sizeof(T));
+}
+
+static void writeStringTable(std::vector<uint8_t> &B,
+ ArrayRef<const std::string> Strings) {
+ // The COFF string table consists of a 4-byte value which is the size of the
+ // table, including the length field itself. This value is followed by the
+ // string content itself, which is an array of null-terminated C-style
+  // strings.  The termination is important, as the strings are referenced by
+  // offset from the symbol entries in the file format.
+
+ size_t Pos = B.size();
+ size_t Offset = B.size();
+
+  // Skip over the length field; we will fill it in later, once we have
+  // computed the length while emitting the string content itself.
+ Pos += sizeof(uint32_t);
+
+ for (const auto &S : Strings) {
+ B.resize(Pos + S.length() + 1);
+ strcpy(reinterpret_cast<char *>(&B[Pos]), S.c_str());
+ Pos += S.length() + 1;
+ }
+
+ // Backfill the length of the table now that it has been computed.
+ support::ulittle32_t Length(B.size() - Offset);
+ support::endian::write32le(&B[Offset], Length);
+}
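
A standalone sketch of the layout just described: a 4-byte length that counts itself, followed by NUL-terminated strings that symbols reference by byte offset. This version memcpys the length and so assumes a little-endian host, where the code above uses an explicit ulittle32_t:

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    static std::vector<uint8_t>
    makeStringTable(const std::vector<std::string> &Strs) {
      std::vector<uint8_t> B(4); // leave room for the length field
      for (const std::string &S : Strs) {
        B.insert(B.end(), S.begin(), S.end());
        B.push_back('\0'); // symbols reference these strings by offset
      }
      uint32_t Len = uint32_t(B.size()); // length includes the field itself
      std::memcpy(B.data(), &Len, 4);    // little-endian host assumed
      return B;
    }

    int main() {
      return makeStringTable({"foo", "bar"}).size() == 12 ? 0 : 1;
    }
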
+
+static ImportNameType getNameType(StringRef Sym, StringRef ExtName,
+ MachineTypes Machine) {
+ if (Sym != ExtName)
+ return IMPORT_NAME_UNDECORATE;
+ if (Machine == IMAGE_FILE_MACHINE_I386 && Sym.startswith("_"))
+ return IMPORT_NAME_NOPREFIX;
+ return IMPORT_NAME;
+}
+
+static Expected<std::string> replace(StringRef S, StringRef From,
+ StringRef To) {
+ size_t Pos = S.find(From);
+
+ // From and To may be mangled, but substrings in S may not.
+ if (Pos == StringRef::npos && From.startswith("_") && To.startswith("_")) {
+ From = From.substr(1);
+ To = To.substr(1);
+ Pos = S.find(From);
+ }
+
+ if (Pos == StringRef::npos) {
+ return make_error<StringError>(
+ StringRef(Twine(S + ": replacing '" + From +
+ "' with '" + To + "' failed").str()), object_error::parse_failed);
+ }
+
+ return (Twine(S.substr(0, Pos)) + To + S.substr(Pos + From.size())).str();
+}
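
replace() has one non-obvious wrinkle: on i386, From and To arrive underscore-decorated while the substring inside S may not be, so a failed search retries with the leading underscores stripped. A standalone sketch of that retry logic using std::string (the real code returns an Expected error on the second miss):

    #include <string>

    static std::string replaceName(std::string S, std::string From,
                                   std::string To) {
      size_t Pos = S.find(From);
      // From/To may carry i386 '_' decoration that substrings of S lack;
      // retry once without it.
      if (Pos == std::string::npos && !From.empty() && From[0] == '_' &&
          !To.empty() && To[0] == '_') {
        From.erase(0, 1);
        To.erase(0, 1);
        Pos = S.find(From);
      }
      if (Pos == std::string::npos)
        return S; // the real code reports an error here
      return S.substr(0, Pos) + To + S.substr(Pos + From.size());
    }

    int main() {
      return replaceName("_foo@4", "_foo", "_bar") == "_bar@4" ? 0 : 1;
    }
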
+
+static const std::string NullImportDescriptorSymbolName =
+ "__NULL_IMPORT_DESCRIPTOR";
+
+namespace {
+// This class constructs various small object files necessary to support linking
+// symbols imported from a DLL. The contents are pretty strictly defined and
+// nearly entirely static. The details of the structures are defined in
+// WINNT.h and the PE/COFF specification.
+class ObjectFactory {
+ using u16 = support::ulittle16_t;
+ using u32 = support::ulittle32_t;
+ MachineTypes Machine;
+ BumpPtrAllocator Alloc;
+ StringRef ImportName;
+ StringRef Library;
+ std::string ImportDescriptorSymbolName;
+ std::string NullThunkSymbolName;
+
+public:
+ ObjectFactory(StringRef S, MachineTypes M)
+ : Machine(M), ImportName(S), Library(S.drop_back(4)),
+ ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()),
+ NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {}
+
+ // Creates an Import Descriptor. This is a small object file which contains a
+ // reference to the terminators and contains the library name (entry) for the
+ // import name table. It will force the linker to construct the necessary
+ // structure to import symbols from the DLL.
+ NewArchiveMember createImportDescriptor(std::vector<uint8_t> &Buffer);
+
+  // Creates a NULL import descriptor. This is a small object file which
+ // contains a NULL import descriptor. It is used to terminate the imports
+ // from a specific DLL.
+ NewArchiveMember createNullImportDescriptor(std::vector<uint8_t> &Buffer);
+
+ // Create a NULL Thunk Entry. This is a small object file which contains a
+ // NULL Import Address Table entry and a NULL Import Lookup Table Entry. It
+ // is used to terminate the IAT and ILT.
+ NewArchiveMember createNullThunk(std::vector<uint8_t> &Buffer);
+
+  // Create a short import file, as described in the PE/COFF spec, section 7:
+  // Import Library Format.
+ NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal,
+ ImportType Type, ImportNameType NameType);
+
+ // Create a weak external file which is described in PE/COFF Aux Format 3.
+ NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp);
+};
+} // namespace
+
+NewArchiveMember
+ObjectFactory::createImportDescriptor(std::vector<uint8_t> &Buffer) {
+ const uint32_t NumberOfSections = 2;
+ const uint32_t NumberOfSymbols = 7;
+ const uint32_t NumberOfRelocations = 3;
+
+ // COFF Header
+ coff_file_header Header{
+ u16(Machine),
+ u16(NumberOfSections),
+ u32(0),
+ u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
+ // .idata$2
+ sizeof(coff_import_directory_table_entry) +
+ NumberOfRelocations * sizeof(coff_relocation) +
+ // .idata$4
+ (ImportName.size() + 1)),
+ u32(NumberOfSymbols),
+ u16(0),
+ u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
+ };
+ append(Buffer, Header);
+
+ // Section Header Table
+ const coff_section SectionTable[NumberOfSections] = {
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '2'},
+ u32(0),
+ u32(0),
+ u32(sizeof(coff_import_directory_table_entry)),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
+ sizeof(coff_import_directory_table_entry)),
+ u32(0),
+ u16(NumberOfRelocations),
+ u16(0),
+ u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
+ IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '6'},
+ u32(0),
+ u32(0),
+ u32(ImportName.size() + 1),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
+ sizeof(coff_import_directory_table_entry) +
+ NumberOfRelocations * sizeof(coff_relocation)),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32(IMAGE_SCN_ALIGN_2BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
+ IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
+ };
+ append(Buffer, SectionTable);
+
+ // .idata$2
+ const coff_import_directory_table_entry ImportDescriptor{
+ u32(0), u32(0), u32(0), u32(0), u32(0),
+ };
+ append(Buffer, ImportDescriptor);
+
+ const coff_relocation RelocationTable[NumberOfRelocations] = {
+ {u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2),
+ u16(getImgRelRelocation(Machine))},
+ {u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)),
+ u32(3), u16(getImgRelRelocation(Machine))},
+ {u32(offsetof(coff_import_directory_table_entry, ImportAddressTableRVA)),
+ u32(4), u16(getImgRelRelocation(Machine))},
+ };
+ append(Buffer, RelocationTable);
+
+ // .idata$6
+ auto S = Buffer.size();
+ Buffer.resize(S + ImportName.size() + 1);
+ memcpy(&Buffer[S], ImportName.data(), ImportName.size());
+ Buffer[S + ImportName.size()] = '\0';
+
+ // Symbol Table
+ coff_symbol16 SymbolTable[NumberOfSymbols] = {
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '2'}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_SECTION,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '6'}},
+ u32(0),
+ u16(2),
+ u16(0),
+ IMAGE_SYM_CLASS_STATIC,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '4'}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_SECTION,
+ 0},
+ {{{'.', 'i', 'd', 'a', 't', 'a', '$', '5'}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_SECTION,
+ 0},
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ };
+  // TODO: Name.Offset.Offset here and in all the similar places below
+  // suggests a naming refactoring. Maybe StringTableOffset.Value?
+  SymbolTable[0].Name.Offset.Offset = sizeof(uint32_t);
+ SymbolTable[5].Name.Offset.Offset =
+ sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1;
+ SymbolTable[6].Name.Offset.Offset =
+ sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1 +
+ NullImportDescriptorSymbolName.length() + 1;
+ append(Buffer, SymbolTable);
+
+ // String Table
+ writeStringTable(Buffer,
+ {ImportDescriptorSymbolName, NullImportDescriptorSymbolName,
+ NullThunkSymbolName});
+
+ StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
+ return {MemoryBufferRef(F, ImportName)};
+}
+
+NewArchiveMember
+ObjectFactory::createNullImportDescriptor(std::vector<uint8_t> &Buffer) {
+ const uint32_t NumberOfSections = 1;
+ const uint32_t NumberOfSymbols = 1;
+
+ // COFF Header
+ coff_file_header Header{
+ u16(Machine),
+ u16(NumberOfSections),
+ u32(0),
+ u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
+ // .idata$3
+ sizeof(coff_import_directory_table_entry)),
+ u32(NumberOfSymbols),
+ u16(0),
+ u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
+ };
+ append(Buffer, Header);
+
+ // Section Header Table
+ const coff_section SectionTable[NumberOfSections] = {
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '3'},
+ u32(0),
+ u32(0),
+ u32(sizeof(coff_import_directory_table_entry)),
+ u32(sizeof(coff_file_header) +
+ (NumberOfSections * sizeof(coff_section))),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
+ IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
+ };
+ append(Buffer, SectionTable);
+
+ // .idata$3
+ const coff_import_directory_table_entry ImportDescriptor{
+ u32(0), u32(0), u32(0), u32(0), u32(0),
+ };
+ append(Buffer, ImportDescriptor);
+
+ // Symbol Table
+ coff_symbol16 SymbolTable[NumberOfSymbols] = {
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ };
+ SymbolTable[0].Name.Offset.Offset = sizeof(uint32_t);
+ append(Buffer, SymbolTable);
+
+ // String Table
+ writeStringTable(Buffer, {NullImportDescriptorSymbolName});
+
+ StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
+ return {MemoryBufferRef(F, ImportName)};
+}
+
+NewArchiveMember ObjectFactory::createNullThunk(std::vector<uint8_t> &Buffer) {
+ const uint32_t NumberOfSections = 2;
+ const uint32_t NumberOfSymbols = 1;
+ uint32_t VASize = is32bit(Machine) ? 4 : 8;
+
+ // COFF Header
+ coff_file_header Header{
+ u16(Machine),
+ u16(NumberOfSections),
+ u32(0),
+ u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
+ // .idata$5
+ VASize +
+ // .idata$4
+ VASize),
+ u32(NumberOfSymbols),
+ u16(0),
+ u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
+ };
+ append(Buffer, Header);
+
+ // Section Header Table
+ const coff_section SectionTable[NumberOfSections] = {
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '5'},
+ u32(0),
+ u32(0),
+ u32(VASize),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
+ : IMAGE_SCN_ALIGN_8BYTES) |
+ IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
+ IMAGE_SCN_MEM_WRITE)},
+ {{'.', 'i', 'd', 'a', 't', 'a', '$', '4'},
+ u32(0),
+ u32(0),
+ u32(VASize),
+ u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
+ VASize),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
+ : IMAGE_SCN_ALIGN_8BYTES) |
+ IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
+ IMAGE_SCN_MEM_WRITE)},
+ };
+ append(Buffer, SectionTable);
+
+ // .idata$5, ILT
+ append(Buffer, u32(0));
+ if (!is32bit(Machine))
+ append(Buffer, u32(0));
+
+ // .idata$4, IAT
+ append(Buffer, u32(0));
+ if (!is32bit(Machine))
+ append(Buffer, u32(0));
+
+ // Symbol Table
+ coff_symbol16 SymbolTable[NumberOfSymbols] = {
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(1),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ };
+ SymbolTable[0].Name.Offset.Offset = sizeof(uint32_t);
+ append(Buffer, SymbolTable);
+
+ // String Table
+ writeStringTable(Buffer, {NullThunkSymbolName});
+
+ StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
+ return {MemoryBufferRef{F, ImportName}};
+}
+
+NewArchiveMember ObjectFactory::createShortImport(StringRef Sym,
+ uint16_t Ordinal,
+ ImportType ImportType,
+ ImportNameType NameType) {
+ size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs
+ size_t Size = sizeof(coff_import_header) + ImpSize;
+ char *Buf = Alloc.Allocate<char>(Size);
+ memset(Buf, 0, Size);
+ char *P = Buf;
+
+ // Write short import library.
+ auto *Imp = reinterpret_cast<coff_import_header *>(P);
+ P += sizeof(*Imp);
+ Imp->Sig2 = 0xFFFF;
+ Imp->Machine = Machine;
+ Imp->SizeOfData = ImpSize;
+ if (Ordinal > 0)
+ Imp->OrdinalHint = Ordinal;
+ Imp->TypeInfo = (NameType << 2) | ImportType;
+
+ // Write symbol name and DLL name.
+ memcpy(P, Sym.data(), Sym.size());
+ P += Sym.size() + 1;
+ memcpy(P, ImportName.data(), ImportName.size());
+
+ return {MemoryBufferRef(StringRef(Buf, Size), ImportName)};
+}
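
The TypeInfo expression above packs two enums into one 16-bit field: the import type (code/data/const) in the low two bits, the name type in the bits above them. Packing and unpacking, standalone:

    #include <cstdint>

    // Pack/unpack TypeInfo as createShortImport does: ImportType occupies
    // bits 0-1, NameType the bits above, per (NameType << 2) | ImportType.
    static uint16_t packTypeInfo(uint16_t NameType, uint16_t ImportType) {
      return uint16_t((NameType << 2) | ImportType);
    }
    static uint16_t importTypeOf(uint16_t TypeInfo) { return TypeInfo & 3; }
    static uint16_t nameTypeOf(uint16_t TypeInfo) { return TypeInfo >> 2; }

    int main() {
      uint16_t TI = packTypeInfo(/*NameType=*/2, /*ImportType=*/1);
      return (importTypeOf(TI) == 1 && nameTypeOf(TI) == 2) ? 0 : 1;
    }
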
+
+NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
+ StringRef Weak, bool Imp) {
+ std::vector<uint8_t> Buffer;
+ const uint32_t NumberOfSections = 1;
+ const uint32_t NumberOfSymbols = 5;
+
+ // COFF Header
+ coff_file_header Header{
+ u16(0),
+ u16(NumberOfSections),
+ u32(0),
+ u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section))),
+ u32(NumberOfSymbols),
+ u16(0),
+ u16(0),
+ };
+ append(Buffer, Header);
+
+ // Section Header Table
+ const coff_section SectionTable[NumberOfSections] = {
+ {{'.', 'd', 'r', 'e', 'c', 't', 'v', 'e'},
+ u32(0),
+ u32(0),
+ u32(0),
+ u32(0),
+ u32(0),
+ u32(0),
+ u16(0),
+ u16(0),
+ u32(IMAGE_SCN_LNK_INFO | IMAGE_SCN_LNK_REMOVE)}};
+ append(Buffer, SectionTable);
+
+ // Symbol Table
+ coff_symbol16 SymbolTable[NumberOfSymbols] = {
+ {{{'@', 'c', 'o', 'm', 'p', '.', 'i', 'd'}},
+ u32(0),
+ u16(0xFFFF),
+ u16(0),
+ IMAGE_SYM_CLASS_STATIC,
+ 0},
+ {{{'@', 'f', 'e', 'a', 't', '.', '0', '0'}},
+ u32(0),
+ u16(0xFFFF),
+ u16(0),
+ IMAGE_SYM_CLASS_STATIC,
+ 0},
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_EXTERNAL,
+ 0},
+ {{{0, 0, 0, 0, 0, 0, 0, 0}},
+ u32(0),
+ u16(0),
+ u16(0),
+ IMAGE_SYM_CLASS_WEAK_EXTERNAL,
+ 1},
+ {{{2, 0, 0, 0, 3, 0, 0, 0}}, u32(0), u16(0), u16(0), uint8_t(0), 0},
+ };
+ SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t);
+
+  // __imp_ String Table
+ StringRef Prefix = Imp ? "__imp_" : "";
+ SymbolTable[3].Name.Offset.Offset =
+ sizeof(uint32_t) + Sym.size() + Prefix.size() + 1;
+ append(Buffer, SymbolTable);
+ writeStringTable(Buffer, {(Prefix + Sym).str(),
+ (Prefix + Weak).str()});
+
+ // Copied here so we can still use writeStringTable
+ char *Buf = Alloc.Allocate<char>(Buffer.size());
+ memcpy(Buf, Buffer.data(), Buffer.size());
+ return {MemoryBufferRef(StringRef(Buf, Buffer.size()), ImportName)};
+}
+
+std::error_code writeImportLibrary(StringRef ImportName, StringRef Path,
+ ArrayRef<COFFShortExport> Exports,
+ MachineTypes Machine, bool MakeWeakAliases) {
+
+ std::vector<NewArchiveMember> Members;
+ ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine);
+
+ std::vector<uint8_t> ImportDescriptor;
+ Members.push_back(OF.createImportDescriptor(ImportDescriptor));
+
+ std::vector<uint8_t> NullImportDescriptor;
+ Members.push_back(OF.createNullImportDescriptor(NullImportDescriptor));
+
+ std::vector<uint8_t> NullThunk;
+ Members.push_back(OF.createNullThunk(NullThunk));
+
+ for (COFFShortExport E : Exports) {
+ if (E.Private)
+ continue;
+
+ if (E.isWeak() && MakeWeakAliases) {
+ Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false));
+ Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true));
+ continue;
+ }
+
+ ImportType ImportType = IMPORT_CODE;
+ if (E.Data)
+ ImportType = IMPORT_DATA;
+ if (E.Constant)
+ ImportType = IMPORT_CONST;
+
+ StringRef SymbolName = E.SymbolName.empty() ? E.Name : E.SymbolName;
+ ImportNameType NameType = getNameType(SymbolName, E.Name, Machine);
+ Expected<std::string> Name = E.ExtName.empty()
+ ? SymbolName
+ : replace(SymbolName, E.Name, E.ExtName);
+
+ if (!Name) {
+ return errorToErrorCode(Name.takeError());
+ }
+
+ Members.push_back(
+ OF.createShortImport(*Name, E.Ordinal, ImportType, NameType));
+ }
+
+ std::pair<StringRef, std::error_code> Result =
+ writeArchive(Path, Members, /*WriteSymtab*/ true, object::Archive::K_GNU,
+ /*Deterministic*/ true, /*Thin*/ false);
+
+ return Result.second;
+}
+
+} // namespace object
+} // namespace llvm
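
A hypothetical driver showing the call shape of the new writeImportLibrary entry point; the DLL name, output path, and export are invented, and only the fields the export loop above reads are set:

    #include "llvm/Object/COFFImportFile.h"
    #include <system_error>
    #include <vector>

    using namespace llvm;
    using namespace llvm::object;

    int main() {
      COFFShortExport E;
      E.Name = "MyFunc"; // exported as code, no ordinal, no alias
      std::vector<COFFShortExport> Exports = {E};
      std::error_code EC =
          writeImportLibrary("mylib.dll", "mylib.lib", Exports,
                             COFF::IMAGE_FILE_MACHINE_AMD64,
                             /*MakeWeakAliases=*/true);
      return EC ? 1 : 0;
    }
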
diff --git a/contrib/llvm/lib/Object/COFFModuleDefinition.cpp b/contrib/llvm/lib/Object/COFFModuleDefinition.cpp
new file mode 100644
index 0000000..510eac8
--- /dev/null
+++ b/contrib/llvm/lib/Object/COFFModuleDefinition.cpp
@@ -0,0 +1,337 @@
+//===--- COFFModuleDefinition.cpp - Simple DEF parser ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Windows-specific.
+// A parser for the module-definition file (.def file).
+//
+// The format of module-definition files is described in this document:
+// https://msdn.microsoft.com/en-us/library/28d6s79h.aspx
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/COFFModuleDefinition.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/COFFImportFile.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm::COFF;
+using namespace llvm;
+
+namespace llvm {
+namespace object {
+
+enum Kind {
+ Unknown,
+ Eof,
+ Identifier,
+ Comma,
+ Equal,
+ KwBase,
+ KwConstant,
+ KwData,
+ KwExports,
+ KwHeapsize,
+ KwLibrary,
+ KwName,
+ KwNoname,
+ KwPrivate,
+ KwStacksize,
+ KwVersion,
+};
+
+struct Token {
+ explicit Token(Kind T = Unknown, StringRef S = "") : K(T), Value(S) {}
+ Kind K;
+ StringRef Value;
+};
+
+static bool isDecorated(StringRef Sym, bool MingwDef) {
+ // mingw does not prepend "_".
+ return (!MingwDef && Sym.startswith("_")) || Sym.startswith("@") ||
+ Sym.startswith("?");
+}
+
+static Error createError(const Twine &Err) {
+ return make_error<StringError>(StringRef(Err.str()),
+ object_error::parse_failed);
+}
+
+class Lexer {
+public:
+ Lexer(StringRef S) : Buf(S) {}
+
+ Token lex() {
+ Buf = Buf.trim();
+ if (Buf.empty())
+ return Token(Eof);
+
+ switch (Buf[0]) {
+ case '\0':
+ return Token(Eof);
+ case ';': {
+ size_t End = Buf.find('\n');
+ Buf = (End == Buf.npos) ? "" : Buf.drop_front(End);
+ return lex();
+ }
+ case '=':
+ Buf = Buf.drop_front();
+ // GNU dlltool accepts both = and ==.
+ if (Buf.startswith("="))
+ Buf = Buf.drop_front();
+ return Token(Equal, "=");
+ case ',':
+ Buf = Buf.drop_front();
+ return Token(Comma, ",");
+ case '"': {
+ StringRef S;
+ std::tie(S, Buf) = Buf.substr(1).split('"');
+ return Token(Identifier, S);
+ }
+ default: {
+ size_t End = Buf.find_first_of("=,\r\n \t\v");
+ StringRef Word = Buf.substr(0, End);
+ Kind K = llvm::StringSwitch<Kind>(Word)
+ .Case("BASE", KwBase)
+ .Case("CONSTANT", KwConstant)
+ .Case("DATA", KwData)
+ .Case("EXPORTS", KwExports)
+ .Case("HEAPSIZE", KwHeapsize)
+ .Case("LIBRARY", KwLibrary)
+ .Case("NAME", KwName)
+ .Case("NONAME", KwNoname)
+ .Case("PRIVATE", KwPrivate)
+ .Case("STACKSIZE", KwStacksize)
+ .Case("VERSION", KwVersion)
+ .Default(Identifier);
+ Buf = (End == Buf.npos) ? "" : Buf.drop_front(End);
+ return Token(K, Word);
+ }
+ }
+ }
+
+private:
+ StringRef Buf;
+};
+
+class Parser {
+public:
+ explicit Parser(StringRef S, MachineTypes M, bool B)
+ : Lex(S), Machine(M), MingwDef(B) {}
+
+ Expected<COFFModuleDefinition> parse() {
+ do {
+ if (Error Err = parseOne())
+ return std::move(Err);
+ } while (Tok.K != Eof);
+ return Info;
+ }
+
+private:
+ void read() {
+ if (Stack.empty()) {
+ Tok = Lex.lex();
+ return;
+ }
+ Tok = Stack.back();
+ Stack.pop_back();
+ }
+
+ Error readAsInt(uint64_t *I) {
+ read();
+ if (Tok.K != Identifier || Tok.Value.getAsInteger(10, *I))
+ return createError("integer expected");
+ return Error::success();
+ }
+
+ Error expect(Kind Expected, StringRef Msg) {
+ read();
+ if (Tok.K != Expected)
+ return createError(Msg);
+ return Error::success();
+ }
+
+ void unget() { Stack.push_back(Tok); }
+
+ Error parseOne() {
+ read();
+ switch (Tok.K) {
+ case Eof:
+ return Error::success();
+ case KwExports:
+ for (;;) {
+ read();
+ if (Tok.K != Identifier) {
+ unget();
+ return Error::success();
+ }
+ if (Error Err = parseExport())
+ return Err;
+ }
+ case KwHeapsize:
+ return parseNumbers(&Info.HeapReserve, &Info.HeapCommit);
+ case KwStacksize:
+ return parseNumbers(&Info.StackReserve, &Info.StackCommit);
+ case KwLibrary:
+ case KwName: {
+ bool IsDll = Tok.K == KwLibrary; // Check before parseName.
+ std::string Name;
+ if (Error Err = parseName(&Name, &Info.ImageBase))
+ return Err;
+
+ Info.ImportName = Name;
+
+ // Set the output file, but don't override /out if it was already passed.
+ if (Info.OutputFile.empty()) {
+ Info.OutputFile = Name;
+ // Append the appropriate file extension if not already present.
+ if (!sys::path::has_extension(Name))
+ Info.OutputFile += IsDll ? ".dll" : ".exe";
+ }
+
+ return Error::success();
+ }
+ case KwVersion:
+ return parseVersion(&Info.MajorImageVersion, &Info.MinorImageVersion);
+ default:
+ return createError("unknown directive: " + Tok.Value);
+ }
+ }
+
+ Error parseExport() {
+ COFFShortExport E;
+ E.Name = Tok.Value;
+ read();
+ if (Tok.K == Equal) {
+ read();
+ if (Tok.K != Identifier)
+ return createError("identifier expected, but got " + Tok.Value);
+ E.ExtName = E.Name;
+ E.Name = Tok.Value;
+ } else {
+ unget();
+ }
+
+ if (Machine == IMAGE_FILE_MACHINE_I386) {
+ if (!isDecorated(E.Name, MingwDef))
+ E.Name = (std::string("_").append(E.Name));
+ if (!E.ExtName.empty() && !isDecorated(E.ExtName, MingwDef))
+ E.ExtName = (std::string("_").append(E.ExtName));
+ }
+
+ for (;;) {
+ read();
+ if (Tok.K == Identifier && Tok.Value[0] == '@') {
+ if (Tok.Value.drop_front().getAsInteger(10, E.Ordinal)) {
+ // Not an ordinal modifier at all, but the next export (fastcall
+ // decorated) - complete the current one.
+ unget();
+ Info.Exports.push_back(E);
+ return Error::success();
+ }
+ read();
+ if (Tok.K == KwNoname) {
+ E.Noname = true;
+ } else {
+ unget();
+ }
+ continue;
+ }
+ if (Tok.K == KwData) {
+ E.Data = true;
+ continue;
+ }
+ if (Tok.K == KwConstant) {
+ E.Constant = true;
+ continue;
+ }
+ if (Tok.K == KwPrivate) {
+ E.Private = true;
+ continue;
+ }
+ unget();
+ Info.Exports.push_back(E);
+ return Error::success();
+ }
+ }
+
+ // HEAPSIZE/STACKSIZE reserve[,commit]
+ Error parseNumbers(uint64_t *Reserve, uint64_t *Commit) {
+ if (Error Err = readAsInt(Reserve))
+ return Err;
+ read();
+ if (Tok.K != Comma) {
+ unget();
+ Commit = nullptr;
+ return Error::success();
+ }
+ if (Error Err = readAsInt(Commit))
+ return Err;
+ return Error::success();
+ }
+
+ // NAME outputPath [BASE=address]
+ Error parseName(std::string *Out, uint64_t *Baseaddr) {
+ read();
+ if (Tok.K == Identifier) {
+ *Out = Tok.Value;
+ } else {
+ *Out = "";
+ unget();
+ return Error::success();
+ }
+ read();
+ if (Tok.K == KwBase) {
+ if (Error Err = expect(Equal, "'=' expected"))
+ return Err;
+ if (Error Err = readAsInt(Baseaddr))
+ return Err;
+ } else {
+ unget();
+ *Baseaddr = 0;
+ }
+ return Error::success();
+ }
+
+ // VERSION major[.minor]
+ Error parseVersion(uint32_t *Major, uint32_t *Minor) {
+ read();
+ if (Tok.K != Identifier)
+ return createError("identifier expected, but got " + Tok.Value);
+ StringRef V1, V2;
+ std::tie(V1, V2) = Tok.Value.split('.');
+ if (V1.getAsInteger(10, *Major))
+ return createError("integer expected, but got " + Tok.Value);
+ if (V2.empty())
+ *Minor = 0;
+ else if (V2.getAsInteger(10, *Minor))
+ return createError("integer expected, but got " + Tok.Value);
+ return Error::success();
+ }
+
+ Lexer Lex;
+ Token Tok;
+ std::vector<Token> Stack;
+ MachineTypes Machine;
+ COFFModuleDefinition Info;
+ bool MingwDef;
+};
+
+Expected<COFFModuleDefinition> parseCOFFModuleDefinition(MemoryBufferRef MB,
+ MachineTypes Machine,
+ bool MingwDef) {
+ return Parser(MB.getBuffer(), Machine, MingwDef).parse();
+}
+
+} // namespace object
+} // namespace llvm
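
For illustration (not part of the patch): a minimal sketch of how a caller
might drive the new .def parser. The machine constant and the printing are
arbitrary choices here; only parseCOFFModuleDefinition() and the
COFFModuleDefinition/COFFShortExport fields come from the code above.

    #include "llvm/Object/COFFModuleDefinition.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;
    using namespace llvm::object;

    static Error printDefExports(MemoryBufferRef MB) {
      Expected<COFFModuleDefinition> Def = parseCOFFModuleDefinition(
          MB, COFF::IMAGE_FILE_MACHINE_AMD64, /*MingwDef=*/false);
      if (!Def)
        return Def.takeError();
      for (const COFFShortExport &E : Def->Exports)
        outs() << E.Name << " @" << E.Ordinal << "\n";
      return Error::success();
    }
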
diff --git a/contrib/llvm/lib/Object/COFFObjectFile.cpp b/contrib/llvm/lib/Object/COFFObjectFile.cpp
index a2d8f12..0a20534 100644
--- a/contrib/llvm/lib/Object/COFFObjectFile.cpp
+++ b/contrib/llvm/lib/Object/COFFObjectFile.cpp
@@ -1,4 +1,4 @@
-//===- COFFObjectFile.cpp - COFF object file implementation -----*- C++ -*-===//
+//===- COFFObjectFile.cpp - COFF object file implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,16 +11,29 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/COFF.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cctype>
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
#include <limits>
+#include <memory>
+#include <system_error>
using namespace llvm;
using namespace object;
@@ -116,7 +129,7 @@ const coff_symbol_type *COFFObjectFile::toSymb(DataRefImpl Ref) const {
const coff_section *COFFObjectFile::toSec(DataRefImpl Ref) const {
const coff_section *Addr = reinterpret_cast<const coff_section*>(Ref.p);
-# ifndef NDEBUG
+#ifndef NDEBUG
// Verify that the section points to a valid entry in the section table.
if (Addr < SectionTable || Addr >= (SectionTable + getNumberOfSections()))
report_fatal_error("Section was outside of section table.");
@@ -124,7 +137,7 @@ const coff_section *COFFObjectFile::toSec(DataRefImpl Ref) const {
uintptr_t Offset = uintptr_t(Addr) - uintptr_t(SectionTable);
assert(Offset % sizeof(coff_section) == 0 &&
"Section did not point to the beginning of a section");
-# endif
+#endif
return Addr;
}
@@ -147,8 +160,7 @@ void COFFObjectFile::moveSymbolNext(DataRefImpl &Ref) const {
Expected<StringRef> COFFObjectFile::getSymbolName(DataRefImpl Ref) const {
COFFSymbolRef Symb = getCOFFSymbol(Ref);
StringRef Result;
- std::error_code EC = getSymbolName(Symb, Result);
- if (EC)
+ if (std::error_code EC = getSymbolName(Symb, Result))
return errorCodeToError(EC);
return Result;
}
@@ -215,8 +227,11 @@ uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const {
if (Symb.isExternal() || Symb.isWeakExternal())
Result |= SymbolRef::SF_Global;
- if (Symb.isWeakExternal())
+ if (Symb.isWeakExternal()) {
Result |= SymbolRef::SF_Weak;
+ // We use indirect to allow the archiver to write weak externs
+ Result |= SymbolRef::SF_Indirect;
+ }
if (Symb.getSectionNumber() == COFF::IMAGE_SYM_ABSOLUTE)
Result |= SymbolRef::SF_Absolute;
@@ -281,6 +296,10 @@ uint64_t COFFObjectFile::getSectionAddress(DataRefImpl Ref) const {
return Result;
}
+uint64_t COFFObjectFile::getSectionIndex(DataRefImpl Sec) const {
+ return toSec(Sec) - SectionTable;
+}
+
uint64_t COFFObjectFile::getSectionSize(DataRefImpl Ref) const {
return getSectionSize(toSec(Ref));
}
@@ -634,6 +653,23 @@ std::error_code COFFObjectFile::initDebugDirectoryPtr() {
return std::error_code();
}
+std::error_code COFFObjectFile::initLoadConfigPtr() {
+  // Get the RVA of the load config directory. Do nothing if it does not exist.
+ const data_directory *DataEntry;
+ if (getDataDirectory(COFF::LOAD_CONFIG_TABLE, DataEntry))
+ return std::error_code();
+
+ // Do nothing if the RVA is NULL.
+ if (DataEntry->RelativeVirtualAddress == 0)
+ return std::error_code();
+ uintptr_t IntPtr = 0;
+ if (std::error_code EC = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
+ return EC;
+
+ LoadConfig = (const void *)IntPtr;
+ return std::error_code();
+}
+
COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
: ObjectFile(Binary::ID_COFF, Object), COFFHeader(nullptr),
COFFBigObjHeader(nullptr), PE32Header(nullptr), PE32PlusHeader(nullptr),
@@ -768,6 +804,9 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
if ((EC = initDebugDirectoryPtr()))
return;
+ if ((EC = initLoadConfigPtr()))
+ return;
+
EC = std::error_code();
}
@@ -847,7 +886,7 @@ base_reloc_iterator COFFObjectFile::base_reloc_end() const {
}
uint8_t COFFObjectFile::getBytesInAddress() const {
- return getArch() == Triple::x86_64 ? 8 : 4;
+ return getArch() == Triple::x86_64 || getArch() == Triple::aarch64 ? 8 : 4;
}
StringRef COFFObjectFile::getFileFormatName() const {
@@ -985,7 +1024,7 @@ COFFObjectFile::getSymbolAuxData(COFFSymbolRef Symbol) const {
if (Symbol.getNumberOfAuxSymbols() > 0) {
// AUX data comes immediately after the symbol in COFF
Aux = reinterpret_cast<const uint8_t *>(Symbol.getRawPtr()) + SymbolSize;
-# ifndef NDEBUG
+#ifndef NDEBUG
// Verify that the Aux symbol points to a valid entry in the symbol table.
uintptr_t Offset = uintptr_t(Aux) - uintptr_t(base());
if (Offset < getPointerToSymbolTable() ||
@@ -995,7 +1034,7 @@ COFFObjectFile::getSymbolAuxData(COFFSymbolRef Symbol) const {
assert((Offset - getPointerToSymbolTable()) % SymbolSize == 0 &&
"Aux Symbol data did not point to the beginning of a symbol");
-# endif
+#endif
}
return makeArrayRef(Aux, Symbol.getNumberOfAuxSymbols() * SymbolSize);
}
@@ -1050,7 +1089,7 @@ COFFObjectFile::getSectionContents(const coff_section *Sec,
// In COFF, a virtual section won't have any in-file
// content, so the file pointer to the content will be zero.
if (Sec->PointerToRawData == 0)
- return object_error::parse_failed;
+ return std::error_code();
   // The only thing that we need to verify is that the contents are contained
// within the file bounds. We don't need to make sure it doesn't cover other
// data, as there's nothing that says that is not allowed.
@@ -1180,6 +1219,29 @@ void COFFObjectFile::getRelocationTypeName(
Res = "Unknown";
}
break;
+ case COFF::IMAGE_FILE_MACHINE_ARM64:
+ switch (Reloc->Type) {
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ABSOLUTE);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR32);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR32NB);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH26);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEBASE_REL21);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_REL21);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEOFFSET_12A);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_PAGEOFFSET_12L);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_LOW12A);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_HIGH12A);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECREL_LOW12L);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_TOKEN);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_SECTION);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_ADDR64);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH19);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_ARM64_BRANCH14);
+ default:
+ Res = "Unknown";
+ }
+ break;
case COFF::IMAGE_FILE_MACHINE_I386:
switch (Reloc->Type) {
LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_ABSOLUTE);
@@ -1579,3 +1641,42 @@ std::error_code BaseRelocRef::getRVA(uint32_t &Result) const {
Result = Header->PageRVA + Entry[Index].getOffset();
return std::error_code();
}
+
+#define RETURN_IF_ERROR(X) \
+ if (auto EC = errorToErrorCode(X)) \
+ return EC;
+
+ErrorOr<ArrayRef<UTF16>>
+ResourceSectionRef::getDirStringAtOffset(uint32_t Offset) {
+  BinaryStreamReader Reader(BBS);
+ Reader.setOffset(Offset);
+ uint16_t Length;
+ RETURN_IF_ERROR(Reader.readInteger(Length));
+ ArrayRef<UTF16> RawDirString;
+ RETURN_IF_ERROR(Reader.readArray(RawDirString, Length));
+ return RawDirString;
+}
+
+ErrorOr<ArrayRef<UTF16>>
+ResourceSectionRef::getEntryNameString(const coff_resource_dir_entry &Entry) {
+ return getDirStringAtOffset(Entry.Identifier.getNameOffset());
+}
+
+ErrorOr<const coff_resource_dir_table &>
+ResourceSectionRef::getTableAtOffset(uint32_t Offset) {
+ const coff_resource_dir_table *Table = nullptr;
+
+ BinaryStreamReader Reader(BBS);
+ Reader.setOffset(Offset);
+ RETURN_IF_ERROR(Reader.readObject(Table));
+ assert(Table != nullptr);
+ return *Table;
+}
+
+ErrorOr<const coff_resource_dir_table &>
+ResourceSectionRef::getEntrySubDir(const coff_resource_dir_entry &Entry) {
+ return getTableAtOffset(Entry.Offset.value());
+}
+
+ErrorOr<const coff_resource_dir_table &> ResourceSectionRef::getBaseTable() {
+ return getTableAtOffset(0);
+}
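
For illustration (not part of the patch): reading the root resource directory
through the new ResourceSectionRef accessors. A sketch; it assumes the ref was
constructed over the raw .rsrc contents, and the NumberOfNameEntries /
NumberOfIDEntries fields come from the coff_resource_dir_table definition in
COFF.h, which this hunk does not show.

    static ErrorOr<unsigned> countRootEntries(object::ResourceSectionRef &RSF) {
      ErrorOr<const object::coff_resource_dir_table &> Base = RSF.getBaseTable();
      if (std::error_code EC = Base.getError())
        return EC;
      // Named entries precede ID entries in a resource directory table.
      return unsigned(Base->NumberOfNameEntries) +
             unsigned(Base->NumberOfIDEntries);
    }
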
diff --git a/contrib/llvm/lib/Object/Decompressor.cpp b/contrib/llvm/lib/Object/Decompressor.cpp
index bca41fd..53f084d 100644
--- a/contrib/llvm/lib/Object/Decompressor.cpp
+++ b/contrib/llvm/lib/Object/Decompressor.cpp
@@ -8,11 +8,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/Decompressor.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/Compression.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/ELF.h"
using namespace llvm;
using namespace llvm::support::endian;
@@ -88,15 +88,7 @@ bool Decompressor::isCompressedELFSection(uint64_t Flags, StringRef Name) {
return (Flags & ELF::SHF_COMPRESSED) || isGnuStyle(Name);
}
-Error Decompressor::decompress(SmallString<32> &Out) {
- Out.resize(DecompressedSize);
- return decompress({Out.data(), (size_t)DecompressedSize});
-}
-
Error Decompressor::decompress(MutableArrayRef<char> Buffer) {
size_t Size = Buffer.size();
- zlib::Status Status = zlib::uncompress(SectionData, Buffer.data(), Size);
- if (Status != zlib::StatusOK)
- return createError("decompression failed");
- return Error::success();
+ return zlib::uncompress(SectionData, Buffer.data(), Size);
}
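
For illustration (not part of the patch): with the SmallString overload of
decompress() removed, callers size the output themselves. A sketch, assuming
Decompressor::create() and the resizeAndDecompress() helper declared in the
class's header (not shown in this hunk):

    static Expected<SmallString<32>>
    readCompressed(StringRef Name, StringRef Data, bool IsLE, bool Is64Bit) {
      Expected<Decompressor> D = Decompressor::create(Name, Data, IsLE, Is64Bit);
      if (!D)
        return D.takeError();
      SmallString<32> Out;
      if (Error E = D->resizeAndDecompress(Out)) // resizes, then decompresses
        return std::move(E);
      return std::move(Out);
    }
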
diff --git a/contrib/llvm/lib/Object/ELF.cpp b/contrib/llvm/lib/Object/ELF.cpp
index 23682e1..448fb1b 100644
--- a/contrib/llvm/lib/Object/ELF.cpp
+++ b/contrib/llvm/lib/Object/ELF.cpp
@@ -1,4 +1,4 @@
-//===- ELF.cpp - ELF object file implementation -----------------*- C++ -*-===//
+//===- ELF.cpp - ELF object file implementation ---------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,19 +8,23 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/ELF.h"
+#include "llvm/BinaryFormat/ELF.h"
-namespace llvm {
-namespace object {
+using namespace llvm;
+using namespace object;
-#define ELF_RELOC(name, value) \
- case ELF::name: \
- return #name; \
+#define STRINGIFY_ENUM_CASE(ns, name) \
+ case ns::name: \
+ return #name;
-StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
+#define ELF_RELOC(name, value) STRINGIFY_ENUM_CASE(ELF, name)
+
+StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
+ uint32_t Type) {
switch (Machine) {
case ELF::EM_X86_64:
switch (Type) {
-#include "llvm/Support/ELFRelocs/x86_64.def"
+#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
default:
break;
}
@@ -28,77 +32,77 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
case ELF::EM_386:
case ELF::EM_IAMCU:
switch (Type) {
-#include "llvm/Support/ELFRelocs/i386.def"
+#include "llvm/BinaryFormat/ELFRelocs/i386.def"
default:
break;
}
break;
case ELF::EM_MIPS:
switch (Type) {
-#include "llvm/Support/ELFRelocs/Mips.def"
+#include "llvm/BinaryFormat/ELFRelocs/Mips.def"
default:
break;
}
break;
case ELF::EM_AARCH64:
switch (Type) {
-#include "llvm/Support/ELFRelocs/AArch64.def"
+#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
default:
break;
}
break;
case ELF::EM_ARM:
switch (Type) {
-#include "llvm/Support/ELFRelocs/ARM.def"
+#include "llvm/BinaryFormat/ELFRelocs/ARM.def"
default:
break;
}
break;
case ELF::EM_AVR:
switch (Type) {
-#include "llvm/Support/ELFRelocs/AVR.def"
+#include "llvm/BinaryFormat/ELFRelocs/AVR.def"
default:
break;
}
break;
case ELF::EM_HEXAGON:
switch (Type) {
-#include "llvm/Support/ELFRelocs/Hexagon.def"
+#include "llvm/BinaryFormat/ELFRelocs/Hexagon.def"
default:
break;
}
break;
case ELF::EM_LANAI:
switch (Type) {
-#include "llvm/Support/ELFRelocs/Lanai.def"
+#include "llvm/BinaryFormat/ELFRelocs/Lanai.def"
default:
break;
}
break;
case ELF::EM_PPC:
switch (Type) {
-#include "llvm/Support/ELFRelocs/PowerPC.def"
+#include "llvm/BinaryFormat/ELFRelocs/PowerPC.def"
default:
break;
}
break;
case ELF::EM_PPC64:
switch (Type) {
-#include "llvm/Support/ELFRelocs/PowerPC64.def"
+#include "llvm/BinaryFormat/ELFRelocs/PowerPC64.def"
default:
break;
}
break;
case ELF::EM_RISCV:
switch (Type) {
-#include "llvm/Support/ELFRelocs/RISCV.def"
+#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
default:
break;
}
break;
case ELF::EM_S390:
switch (Type) {
-#include "llvm/Support/ELFRelocs/SystemZ.def"
+#include "llvm/BinaryFormat/ELFRelocs/SystemZ.def"
default:
break;
}
@@ -107,27 +111,27 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
case ELF::EM_SPARC32PLUS:
case ELF::EM_SPARCV9:
switch (Type) {
-#include "llvm/Support/ELFRelocs/Sparc.def"
+#include "llvm/BinaryFormat/ELFRelocs/Sparc.def"
default:
break;
}
break;
case ELF::EM_WEBASSEMBLY:
switch (Type) {
-#include "llvm/Support/ELFRelocs/WebAssembly.def"
+#include "llvm/BinaryFormat/ELFRelocs/WebAssembly.def"
default:
break;
}
break;
case ELF::EM_AMDGPU:
switch (Type) {
-#include "llvm/Support/ELFRelocs/AMDGPU.def"
+#include "llvm/BinaryFormat/ELFRelocs/AMDGPU.def"
default:
break;
    }
+    break;
case ELF::EM_BPF:
switch (Type) {
-#include "llvm/Support/ELFRelocs/BPF.def"
+#include "llvm/BinaryFormat/ELFRelocs/BPF.def"
default:
break;
}
@@ -140,5 +144,61 @@ StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
#undef ELF_RELOC
-} // end namespace object
-} // end namespace llvm
+StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
+ switch (Machine) {
+ case ELF::EM_ARM:
+ switch (Type) {
+ STRINGIFY_ENUM_CASE(ELF, SHT_ARM_EXIDX);
+ STRINGIFY_ENUM_CASE(ELF, SHT_ARM_PREEMPTMAP);
+ STRINGIFY_ENUM_CASE(ELF, SHT_ARM_ATTRIBUTES);
+ STRINGIFY_ENUM_CASE(ELF, SHT_ARM_DEBUGOVERLAY);
+ STRINGIFY_ENUM_CASE(ELF, SHT_ARM_OVERLAYSECTION);
+ }
+ break;
+ case ELF::EM_HEXAGON:
+ switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_HEX_ORDERED); }
+ break;
+ case ELF::EM_X86_64:
+ switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_X86_64_UNWIND); }
+ break;
+ case ELF::EM_MIPS:
+ case ELF::EM_MIPS_RS3_LE:
+ switch (Type) {
+ STRINGIFY_ENUM_CASE(ELF, SHT_MIPS_REGINFO);
+ STRINGIFY_ENUM_CASE(ELF, SHT_MIPS_OPTIONS);
+ STRINGIFY_ENUM_CASE(ELF, SHT_MIPS_ABIFLAGS);
+ STRINGIFY_ENUM_CASE(ELF, SHT_MIPS_DWARF);
+ }
+ break;
+ default:
+ break;
+ }
+
+ switch (Type) {
+ STRINGIFY_ENUM_CASE(ELF, SHT_NULL);
+ STRINGIFY_ENUM_CASE(ELF, SHT_PROGBITS);
+ STRINGIFY_ENUM_CASE(ELF, SHT_SYMTAB);
+ STRINGIFY_ENUM_CASE(ELF, SHT_STRTAB);
+ STRINGIFY_ENUM_CASE(ELF, SHT_RELA);
+ STRINGIFY_ENUM_CASE(ELF, SHT_HASH);
+ STRINGIFY_ENUM_CASE(ELF, SHT_DYNAMIC);
+ STRINGIFY_ENUM_CASE(ELF, SHT_NOTE);
+ STRINGIFY_ENUM_CASE(ELF, SHT_NOBITS);
+ STRINGIFY_ENUM_CASE(ELF, SHT_REL);
+ STRINGIFY_ENUM_CASE(ELF, SHT_SHLIB);
+ STRINGIFY_ENUM_CASE(ELF, SHT_DYNSYM);
+ STRINGIFY_ENUM_CASE(ELF, SHT_INIT_ARRAY);
+ STRINGIFY_ENUM_CASE(ELF, SHT_FINI_ARRAY);
+ STRINGIFY_ENUM_CASE(ELF, SHT_PREINIT_ARRAY);
+ STRINGIFY_ENUM_CASE(ELF, SHT_GROUP);
+ STRINGIFY_ENUM_CASE(ELF, SHT_SYMTAB_SHNDX);
+ STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_ODRTAB);
+ STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES);
+ STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH);
+ STRINGIFY_ENUM_CASE(ELF, SHT_GNU_verdef);
+ STRINGIFY_ENUM_CASE(ELF, SHT_GNU_verneed);
+ STRINGIFY_ENUM_CASE(ELF, SHT_GNU_versym);
+ default:
+ return "Unknown";
+ }
+}
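
For illustration (not part of the patch): the new section-type lookup tries the
machine-specific switch first and falls through to the generic one, so both of
the following resolve. A sketch:

    #include "llvm/BinaryFormat/ELF.h"
    #include "llvm/Object/ELF.h"
    using namespace llvm;

    static void demo() {
      // Resolved by the EM_X86_64-specific table:
      StringRef A = object::getELFSectionTypeName(ELF::EM_X86_64,
                                                  ELF::SHT_X86_64_UNWIND);
      // Resolved by the generic table:
      StringRef B = object::getELFSectionTypeName(ELF::EM_X86_64,
                                                  ELF::SHT_PROGBITS);
      // A == "SHT_X86_64_UNWIND", B == "SHT_PROGBITS"
      (void)A; (void)B;
    }
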
diff --git a/contrib/llvm/lib/Object/ELFObjectFile.cpp b/contrib/llvm/lib/Object/ELFObjectFile.cpp
index 4bd69e3..fa136d7 100644
--- a/contrib/llvm/lib/Object/ELFObjectFile.cpp
+++ b/contrib/llvm/lib/Object/ELFObjectFile.cpp
@@ -1,4 +1,4 @@
-//===- ELFObjectFile.cpp - ELF object file implementation -------*- C++ -*-===//
+//===- ELFObjectFile.cpp - ELF object file implementation -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,9 +12,26 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Support/ARMAttributeParser.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
-namespace llvm {
+using namespace llvm;
using namespace object;
ELFObjectFileBase::ELFObjectFileBase(unsigned int Type, MemoryBufferRef Source)
@@ -55,71 +72,245 @@ ObjectFile::createELFObjectFile(MemoryBufferRef Obj) {
return std::move(R);
}
-SubtargetFeatures ELFObjectFileBase::getFeatures() const {
- switch (getEMachine()) {
- case ELF::EM_MIPS: {
- SubtargetFeatures Features;
- unsigned PlatformFlags;
- getPlatformFlags(PlatformFlags);
+SubtargetFeatures ELFObjectFileBase::getMIPSFeatures() const {
+ SubtargetFeatures Features;
+ unsigned PlatformFlags;
+ getPlatformFlags(PlatformFlags);
+
+ switch (PlatformFlags & ELF::EF_MIPS_ARCH) {
+ case ELF::EF_MIPS_ARCH_1:
+ break;
+ case ELF::EF_MIPS_ARCH_2:
+ Features.AddFeature("mips2");
+ break;
+ case ELF::EF_MIPS_ARCH_3:
+ Features.AddFeature("mips3");
+ break;
+ case ELF::EF_MIPS_ARCH_4:
+ Features.AddFeature("mips4");
+ break;
+ case ELF::EF_MIPS_ARCH_5:
+ Features.AddFeature("mips5");
+ break;
+ case ELF::EF_MIPS_ARCH_32:
+ Features.AddFeature("mips32");
+ break;
+ case ELF::EF_MIPS_ARCH_64:
+ Features.AddFeature("mips64");
+ break;
+ case ELF::EF_MIPS_ARCH_32R2:
+ Features.AddFeature("mips32r2");
+ break;
+ case ELF::EF_MIPS_ARCH_64R2:
+ Features.AddFeature("mips64r2");
+ break;
+ case ELF::EF_MIPS_ARCH_32R6:
+ Features.AddFeature("mips32r6");
+ break;
+ case ELF::EF_MIPS_ARCH_64R6:
+ Features.AddFeature("mips64r6");
+ break;
+ default:
+ llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ }
+
+ switch (PlatformFlags & ELF::EF_MIPS_MACH) {
+ case ELF::EF_MIPS_MACH_NONE:
+ // No feature associated with this value.
+ break;
+ case ELF::EF_MIPS_MACH_OCTEON:
+ Features.AddFeature("cnmips");
+ break;
+ default:
+ llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ }
+
+ if (PlatformFlags & ELF::EF_MIPS_ARCH_ASE_M16)
+ Features.AddFeature("mips16");
+ if (PlatformFlags & ELF::EF_MIPS_MICROMIPS)
+ Features.AddFeature("micromips");
+
+ return Features;
+}
+
+SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
+ SubtargetFeatures Features;
+ ARMAttributeParser Attributes;
+ std::error_code EC = getBuildAttributes(Attributes);
+ if (EC)
+ return SubtargetFeatures();
+
+  // Both ARMv7-M and ARMv7-R have to support Thumb hardware divide.
+ bool isV7 = false;
+ if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch))
+ isV7 = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch)
+ == ARMBuildAttrs::v7;
- switch (PlatformFlags & ELF::EF_MIPS_ARCH) {
- case ELF::EF_MIPS_ARCH_1:
+ if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch_profile)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile)) {
+ case ARMBuildAttrs::ApplicationProfile:
+ Features.AddFeature("aclass");
break;
- case ELF::EF_MIPS_ARCH_2:
- Features.AddFeature("mips2");
+ case ARMBuildAttrs::RealTimeProfile:
+ Features.AddFeature("rclass");
+ if (isV7)
+ Features.AddFeature("hwdiv");
break;
- case ELF::EF_MIPS_ARCH_3:
- Features.AddFeature("mips3");
+ case ARMBuildAttrs::MicroControllerProfile:
+ Features.AddFeature("mclass");
+ if (isV7)
+ Features.AddFeature("hwdiv");
break;
- case ELF::EF_MIPS_ARCH_4:
- Features.AddFeature("mips4");
+ }
+ }
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::THUMB_ISA_use)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use)) {
+ default:
break;
- case ELF::EF_MIPS_ARCH_5:
- Features.AddFeature("mips5");
+ case ARMBuildAttrs::Not_Allowed:
+ Features.AddFeature("thumb", false);
+ Features.AddFeature("thumb2", false);
break;
- case ELF::EF_MIPS_ARCH_32:
- Features.AddFeature("mips32");
+ case ARMBuildAttrs::AllowThumb32:
+ Features.AddFeature("thumb2");
break;
- case ELF::EF_MIPS_ARCH_64:
- Features.AddFeature("mips64");
+ }
+ }
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::FP_arch)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::FP_arch)) {
+ default:
break;
- case ELF::EF_MIPS_ARCH_32R2:
- Features.AddFeature("mips32r2");
+ case ARMBuildAttrs::Not_Allowed:
+ Features.AddFeature("vfp2", false);
+ Features.AddFeature("vfp3", false);
+ Features.AddFeature("vfp4", false);
break;
- case ELF::EF_MIPS_ARCH_64R2:
- Features.AddFeature("mips64r2");
+ case ARMBuildAttrs::AllowFPv2:
+ Features.AddFeature("vfp2");
break;
- case ELF::EF_MIPS_ARCH_32R6:
- Features.AddFeature("mips32r6");
+ case ARMBuildAttrs::AllowFPv3A:
+ case ARMBuildAttrs::AllowFPv3B:
+ Features.AddFeature("vfp3");
break;
- case ELF::EF_MIPS_ARCH_64R6:
- Features.AddFeature("mips64r6");
+ case ARMBuildAttrs::AllowFPv4A:
+ case ARMBuildAttrs::AllowFPv4B:
+ Features.AddFeature("vfp4");
break;
- default:
- llvm_unreachable("Unknown EF_MIPS_ARCH value");
}
+ }
- switch (PlatformFlags & ELF::EF_MIPS_MACH) {
- case ELF::EF_MIPS_MACH_NONE:
- // No feature associated with this value.
+ if (Attributes.hasAttribute(ARMBuildAttrs::Advanced_SIMD_arch)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::Advanced_SIMD_arch)) {
+ default:
+ break;
+ case ARMBuildAttrs::Not_Allowed:
+ Features.AddFeature("neon", false);
+ Features.AddFeature("fp16", false);
break;
- case ELF::EF_MIPS_MACH_OCTEON:
- Features.AddFeature("cnmips");
+ case ARMBuildAttrs::AllowNeon:
+ Features.AddFeature("neon");
break;
+ case ARMBuildAttrs::AllowNeon2:
+ Features.AddFeature("neon");
+ Features.AddFeature("fp16");
+ break;
+ }
+ }
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::DIV_use)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::DIV_use)) {
default:
- llvm_unreachable("Unknown EF_MIPS_ARCH value");
+ break;
+ case ARMBuildAttrs::DisallowDIV:
+ Features.AddFeature("hwdiv", false);
+ Features.AddFeature("hwdiv-arm", false);
+ break;
+ case ARMBuildAttrs::AllowDIVExt:
+ Features.AddFeature("hwdiv");
+ Features.AddFeature("hwdiv-arm");
+ break;
}
+ }
- if (PlatformFlags & ELF::EF_MIPS_ARCH_ASE_M16)
- Features.AddFeature("mips16");
- if (PlatformFlags & ELF::EF_MIPS_MICROMIPS)
- Features.AddFeature("micromips");
+ return Features;
+}
- return Features;
- }
+SubtargetFeatures ELFObjectFileBase::getFeatures() const {
+ switch (getEMachine()) {
+ case ELF::EM_MIPS:
+ return getMIPSFeatures();
+ case ELF::EM_ARM:
+ return getARMFeatures();
default:
return SubtargetFeatures();
}
}
-} // end namespace llvm
+// FIXME Encode from a tablegen description or target parser.
+void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const {
+ if (TheTriple.getSubArch() != Triple::NoSubArch)
+ return;
+
+ ARMAttributeParser Attributes;
+ std::error_code EC = getBuildAttributes(Attributes);
+ if (EC)
+ return;
+
+ std::string Triple;
+ // Default to ARM, but use the triple if it's been set.
+ if (TheTriple.getArch() == Triple::thumb ||
+ TheTriple.getArch() == Triple::thumbeb)
+ Triple = "thumb";
+ else
+ Triple = "arm";
+
+ if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch)) {
+ switch(Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch)) {
+ case ARMBuildAttrs::v4:
+ Triple += "v4";
+ break;
+ case ARMBuildAttrs::v4T:
+ Triple += "v4t";
+ break;
+ case ARMBuildAttrs::v5T:
+ Triple += "v5t";
+ break;
+ case ARMBuildAttrs::v5TE:
+ Triple += "v5te";
+ break;
+ case ARMBuildAttrs::v5TEJ:
+ Triple += "v5tej";
+ break;
+ case ARMBuildAttrs::v6:
+ Triple += "v6";
+ break;
+ case ARMBuildAttrs::v6KZ:
+ Triple += "v6kz";
+ break;
+ case ARMBuildAttrs::v6T2:
+ Triple += "v6t2";
+ break;
+ case ARMBuildAttrs::v6K:
+ Triple += "v6k";
+ break;
+ case ARMBuildAttrs::v7:
+ Triple += "v7";
+ break;
+ case ARMBuildAttrs::v6_M:
+ Triple += "v6m";
+ break;
+ case ARMBuildAttrs::v6S_M:
+ Triple += "v6sm";
+ break;
+ case ARMBuildAttrs::v7E_M:
+ Triple += "v7em";
+ break;
+ }
+ }
+ if (!isLittleEndian())
+ Triple += "eb";
+
+ TheTriple.setArchName(Triple);
+}
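
For illustration (not part of the patch): how a disassembler-style driver might
consume the refactored hooks. A sketch; the MC plumbing that would normally use
the resulting strings is elided.

    #include "llvm/ADT/Triple.h"
    #include "llvm/MC/SubtargetFeature.h"
    #include "llvm/Object/ELFObjectFile.h"
    using namespace llvm;

    static std::string describeTarget(const object::ELFObjectFileBase &Obj) {
      SubtargetFeatures Features = Obj.getFeatures(); // MIPS or ARM, else empty
      Triple TT;
      TT.setArch(Triple::ArchType(Obj.getArch()));
      // For ARM objects this refines the triple from build attributes;
      // otherwise (or if a sub-arch is already set) it is a no-op.
      Obj.setARMSubArch(TT);
      return TT.str() + " [" + Features.getString() + "]";
    }
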
diff --git a/contrib/llvm/lib/Object/IRObjectFile.cpp b/contrib/llvm/lib/Object/IRObjectFile.cpp
index adbf0de..e7807b0 100644
--- a/contrib/llvm/lib/Object/IRObjectFile.cpp
+++ b/contrib/llvm/lib/Object/IRObjectFile.cpp
@@ -14,6 +14,7 @@
#include "llvm/Object/IRObjectFile.h"
#include "RecordStreamer.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/LLVMContext.h"
@@ -95,13 +96,13 @@ ErrorOr<MemoryBufferRef> IRObjectFile::findBitcodeInObject(const ObjectFile &Obj
}
ErrorOr<MemoryBufferRef> IRObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) {
- sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer());
+ file_magic Type = identify_magic(Object.getBuffer());
switch (Type) {
- case sys::fs::file_magic::bitcode:
+ case file_magic::bitcode:
return Object;
- case sys::fs::file_magic::elf_relocatable:
- case sys::fs::file_magic::macho_object:
- case sys::fs::file_magic::coff_object: {
+ case file_magic::elf_relocatable:
+ case file_magic::macho_object:
+ case file_magic::coff_object: {
Expected<std::unique_ptr<ObjectFile>> ObjFile =
ObjectFile::createObjectFile(Object, Type);
if (!ObjFile)
@@ -138,3 +139,25 @@ IRObjectFile::create(MemoryBufferRef Object, LLVMContext &Context) {
return std::unique_ptr<IRObjectFile>(
new IRObjectFile(*BCOrErr, std::move(Mods)));
}
+
+Expected<IRSymtabFile> object::readIRSymtab(MemoryBufferRef MBRef) {
+ IRSymtabFile F;
+ ErrorOr<MemoryBufferRef> BCOrErr =
+ IRObjectFile::findBitcodeInMemBuffer(MBRef);
+ if (!BCOrErr)
+ return errorCodeToError(BCOrErr.getError());
+
+ Expected<BitcodeFileContents> BFCOrErr = getBitcodeFileContents(*BCOrErr);
+ if (!BFCOrErr)
+ return BFCOrErr.takeError();
+
+ Expected<irsymtab::FileContents> FCOrErr = irsymtab::readBitcode(*BFCOrErr);
+ if (!FCOrErr)
+ return FCOrErr.takeError();
+
+ F.Mods = std::move(BFCOrErr->Mods);
+ F.Symtab = std::move(FCOrErr->Symtab);
+ F.Strtab = std::move(FCOrErr->Strtab);
+ F.TheReader = std::move(FCOrErr->TheReader);
+ return std::move(F);
+}
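
For illustration (not part of the patch): the new readIRSymtab() entry point
bundles the lazy modules, the symbol/string tables, and a reader over them. A
sketch of a caller; the printing is purely illustrative.

    #include "llvm/Object/IRObjectFile.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static Error dumpModuleCount(MemoryBufferRef MB) {
      Expected<object::IRSymtabFile> F = object::readIRSymtab(MB);
      if (!F)
        return F.takeError();
      outs() << F->Mods.size() << " bitcode module(s), "
             << F->TheReader.getNumModules() << " symtab module(s)\n";
      return Error::success();
    }
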
diff --git a/contrib/llvm/lib/Object/IRSymtab.cpp b/contrib/llvm/lib/Object/IRSymtab.cpp
new file mode 100644
index 0000000..7a6424a
--- /dev/null
+++ b/contrib/llvm/lib/Object/IRSymtab.cpp
@@ -0,0 +1,348 @@
+//===- IRSymtab.cpp - implementation of IR symbol tables ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/IRSymtab.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ObjectUtils.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/VCSRevision.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace irsymtab;
+
+namespace {
+
+const char *getExpectedProducerName() {
+ static char DefaultName[] = LLVM_VERSION_STRING
+#ifdef LLVM_REVISION
+ " " LLVM_REVISION
+#endif
+ ;
+ // Allows for testing of the irsymtab writer and upgrade mechanism. This
+ // environment variable should not be set by users.
+ if (char *OverrideName = getenv("LLVM_OVERRIDE_PRODUCER"))
+ return OverrideName;
+ return DefaultName;
+}
+
+const char *kExpectedProducerName = getExpectedProducerName();
+
+/// Stores the temporary state that is required to build an IR symbol table.
+struct Builder {
+ SmallVector<char, 0> &Symtab;
+ StringTableBuilder &StrtabBuilder;
+ StringSaver Saver;
+
+ // This ctor initializes a StringSaver using the passed in BumpPtrAllocator.
+ // The StringTableBuilder does not create a copy of any strings added to it,
+ // so this provides somewhere to store any strings that we create.
+ Builder(SmallVector<char, 0> &Symtab, StringTableBuilder &StrtabBuilder,
+ BumpPtrAllocator &Alloc)
+ : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {}
+
+ DenseMap<const Comdat *, unsigned> ComdatMap;
+ Mangler Mang;
+ Triple TT;
+
+ std::vector<storage::Comdat> Comdats;
+ std::vector<storage::Module> Mods;
+ std::vector<storage::Symbol> Syms;
+ std::vector<storage::Uncommon> Uncommons;
+
+ std::string COFFLinkerOpts;
+ raw_string_ostream COFFLinkerOptsOS{COFFLinkerOpts};
+
+ void setStr(storage::Str &S, StringRef Value) {
+ S.Offset = StrtabBuilder.add(Value);
+ S.Size = Value.size();
+ }
+
+ template <typename T>
+ void writeRange(storage::Range<T> &R, const std::vector<T> &Objs) {
+ R.Offset = Symtab.size();
+ R.Size = Objs.size();
+ Symtab.insert(Symtab.end(), reinterpret_cast<const char *>(Objs.data()),
+ reinterpret_cast<const char *>(Objs.data() + Objs.size()));
+ }
+
+ Error addModule(Module *M);
+ Error addSymbol(const ModuleSymbolTable &Msymtab,
+ const SmallPtrSet<GlobalValue *, 8> &Used,
+ ModuleSymbolTable::Symbol Sym);
+
+ Error build(ArrayRef<Module *> Mods);
+};
+
+Error Builder::addModule(Module *M) {
+ if (M->getDataLayoutStr().empty())
+ return make_error<StringError>("input module has no datalayout",
+ inconvertibleErrorCode());
+
+ SmallPtrSet<GlobalValue *, 8> Used;
+ collectUsedGlobalVariables(*M, Used, /*CompilerUsed*/ false);
+
+ ModuleSymbolTable Msymtab;
+ Msymtab.addModule(M);
+
+ storage::Module Mod;
+ Mod.Begin = Syms.size();
+ Mod.End = Syms.size() + Msymtab.symbols().size();
+ Mod.UncBegin = Uncommons.size();
+ Mods.push_back(Mod);
+
+ if (TT.isOSBinFormatCOFF()) {
+ if (auto E = M->materializeMetadata())
+ return E;
+ if (NamedMDNode *LinkerOptions =
+ M->getNamedMetadata("llvm.linker.options")) {
+ for (MDNode *MDOptions : LinkerOptions->operands())
+ for (const MDOperand &MDOption : cast<MDNode>(MDOptions)->operands())
+ COFFLinkerOptsOS << " " << cast<MDString>(MDOption)->getString();
+ }
+ }
+
+ for (ModuleSymbolTable::Symbol Msym : Msymtab.symbols())
+ if (Error Err = addSymbol(Msymtab, Used, Msym))
+ return Err;
+
+ return Error::success();
+}
+
+Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
+ const SmallPtrSet<GlobalValue *, 8> &Used,
+ ModuleSymbolTable::Symbol Msym) {
+ Syms.emplace_back();
+ storage::Symbol &Sym = Syms.back();
+ Sym = {};
+
+ storage::Uncommon *Unc = nullptr;
+ auto Uncommon = [&]() -> storage::Uncommon & {
+ if (Unc)
+ return *Unc;
+ Sym.Flags |= 1 << storage::Symbol::FB_has_uncommon;
+ Uncommons.emplace_back();
+ Unc = &Uncommons.back();
+ *Unc = {};
+ setStr(Unc->COFFWeakExternFallbackName, "");
+ return *Unc;
+ };
+
+ SmallString<64> Name;
+ {
+ raw_svector_ostream OS(Name);
+ Msymtab.printSymbolName(OS, Msym);
+ }
+ setStr(Sym.Name, Saver.save(StringRef(Name)));
+
+ auto Flags = Msymtab.getSymbolFlags(Msym);
+ if (Flags & object::BasicSymbolRef::SF_Undefined)
+ Sym.Flags |= 1 << storage::Symbol::FB_undefined;
+ if (Flags & object::BasicSymbolRef::SF_Weak)
+ Sym.Flags |= 1 << storage::Symbol::FB_weak;
+ if (Flags & object::BasicSymbolRef::SF_Common)
+ Sym.Flags |= 1 << storage::Symbol::FB_common;
+ if (Flags & object::BasicSymbolRef::SF_Indirect)
+ Sym.Flags |= 1 << storage::Symbol::FB_indirect;
+ if (Flags & object::BasicSymbolRef::SF_Global)
+ Sym.Flags |= 1 << storage::Symbol::FB_global;
+ if (Flags & object::BasicSymbolRef::SF_FormatSpecific)
+ Sym.Flags |= 1 << storage::Symbol::FB_format_specific;
+ if (Flags & object::BasicSymbolRef::SF_Executable)
+ Sym.Flags |= 1 << storage::Symbol::FB_executable;
+
+ Sym.ComdatIndex = -1;
+ auto *GV = Msym.dyn_cast<GlobalValue *>();
+ if (!GV) {
+ // Undefined module asm symbols act as GC roots and are implicitly used.
+ if (Flags & object::BasicSymbolRef::SF_Undefined)
+ Sym.Flags |= 1 << storage::Symbol::FB_used;
+ setStr(Sym.IRName, "");
+ return Error::success();
+ }
+
+ setStr(Sym.IRName, GV->getName());
+
+ if (Used.count(GV))
+ Sym.Flags |= 1 << storage::Symbol::FB_used;
+ if (GV->isThreadLocal())
+ Sym.Flags |= 1 << storage::Symbol::FB_tls;
+ if (GV->hasGlobalUnnamedAddr())
+ Sym.Flags |= 1 << storage::Symbol::FB_unnamed_addr;
+ if (canBeOmittedFromSymbolTable(GV))
+ Sym.Flags |= 1 << storage::Symbol::FB_may_omit;
+ Sym.Flags |= unsigned(GV->getVisibility()) << storage::Symbol::FB_visibility;
+
+ if (Flags & object::BasicSymbolRef::SF_Common) {
+ Uncommon().CommonSize = GV->getParent()->getDataLayout().getTypeAllocSize(
+ GV->getType()->getElementType());
+ Uncommon().CommonAlign = GV->getAlignment();
+ }
+
+ const GlobalObject *Base = GV->getBaseObject();
+ if (!Base)
+ return make_error<StringError>("Unable to determine comdat of alias!",
+ inconvertibleErrorCode());
+ if (const Comdat *C = Base->getComdat()) {
+ auto P = ComdatMap.insert(std::make_pair(C, Comdats.size()));
+ Sym.ComdatIndex = P.first->second;
+
+ if (P.second) {
+ storage::Comdat Comdat;
+ setStr(Comdat.Name, C->getName());
+ Comdats.push_back(Comdat);
+ }
+ }
+
+ if (TT.isOSBinFormatCOFF()) {
+ emitLinkerFlagsForGlobalCOFF(COFFLinkerOptsOS, GV, TT, Mang);
+
+ if ((Flags & object::BasicSymbolRef::SF_Weak) &&
+ (Flags & object::BasicSymbolRef::SF_Indirect)) {
+ std::string FallbackName;
+ raw_string_ostream OS(FallbackName);
+ Msymtab.printSymbolName(
+ OS, cast<GlobalValue>(
+ cast<GlobalAlias>(GV)->getAliasee()->stripPointerCasts()));
+ OS.flush();
+ setStr(Uncommon().COFFWeakExternFallbackName, Saver.save(FallbackName));
+ }
+ }
+
+ return Error::success();
+}
+
+Error Builder::build(ArrayRef<Module *> IRMods) {
+ storage::Header Hdr;
+
+ assert(!IRMods.empty());
+ Hdr.Version = storage::Header::kCurrentVersion;
+ setStr(Hdr.Producer, kExpectedProducerName);
+ setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple());
+ setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName());
+ TT = Triple(IRMods[0]->getTargetTriple());
+
+ for (auto *M : IRMods)
+ if (Error Err = addModule(M))
+ return Err;
+
+ COFFLinkerOptsOS.flush();
+ setStr(Hdr.COFFLinkerOpts, Saver.save(COFFLinkerOpts));
+
+ // We are about to fill in the header's range fields, so reserve space for it
+ // and copy it in afterwards.
+ Symtab.resize(sizeof(storage::Header));
+ writeRange(Hdr.Modules, Mods);
+ writeRange(Hdr.Comdats, Comdats);
+ writeRange(Hdr.Symbols, Syms);
+ writeRange(Hdr.Uncommons, Uncommons);
+
+ *reinterpret_cast<storage::Header *>(Symtab.data()) = Hdr;
+ return Error::success();
+}
+
+} // end anonymous namespace
+
+Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab,
+ StringTableBuilder &StrtabBuilder,
+ BumpPtrAllocator &Alloc) {
+ return Builder(Symtab, StrtabBuilder, Alloc).build(Mods);
+}
+
+// Upgrade a vector of bitcode modules created by an old version of LLVM by
+// creating an irsymtab for them in the current format.
+static Expected<FileContents> upgrade(ArrayRef<BitcodeModule> BMs) {
+ FileContents FC;
+
+ LLVMContext Ctx;
+ std::vector<Module *> Mods;
+ std::vector<std::unique_ptr<Module>> OwnedMods;
+ for (auto BM : BMs) {
+ Expected<std::unique_ptr<Module>> MOrErr =
+ BM.getLazyModule(Ctx, /*ShouldLazyLoadMetadata*/ true,
+ /*IsImporting*/ false);
+ if (!MOrErr)
+ return MOrErr.takeError();
+
+ Mods.push_back(MOrErr->get());
+ OwnedMods.push_back(std::move(*MOrErr));
+ }
+
+ StringTableBuilder StrtabBuilder(StringTableBuilder::RAW);
+ BumpPtrAllocator Alloc;
+ if (Error E = build(Mods, FC.Symtab, StrtabBuilder, Alloc))
+ return std::move(E);
+
+ StrtabBuilder.finalizeInOrder();
+ FC.Strtab.resize(StrtabBuilder.getSize());
+ StrtabBuilder.write((uint8_t *)FC.Strtab.data());
+
+ FC.TheReader = {{FC.Symtab.data(), FC.Symtab.size()},
+ {FC.Strtab.data(), FC.Strtab.size()}};
+ return std::move(FC);
+}
+
+Expected<FileContents> irsymtab::readBitcode(const BitcodeFileContents &BFC) {
+ if (BFC.Mods.empty())
+ return make_error<StringError>("Bitcode file does not contain any modules",
+ inconvertibleErrorCode());
+
+ if (BFC.StrtabForSymtab.empty() ||
+ BFC.Symtab.size() < sizeof(storage::Header))
+ return upgrade(BFC.Mods);
+
+ // We cannot use the regular reader to read the version and producer, because
+ // it will expect the header to be in the current format. The only thing we
+ // can rely on is that the version and producer will be present as the first
+ // struct elements.
+ auto *Hdr = reinterpret_cast<const storage::Header *>(BFC.Symtab.data());
+ unsigned Version = Hdr->Version;
+ StringRef Producer = Hdr->Producer.get(BFC.StrtabForSymtab);
+ if (Version != storage::Header::kCurrentVersion ||
+ Producer != kExpectedProducerName)
+ return upgrade(BFC.Mods);
+
+ FileContents FC;
+ FC.TheReader = {{BFC.Symtab.data(), BFC.Symtab.size()},
+ {BFC.StrtabForSymtab.data(), BFC.StrtabForSymtab.size()}};
+
+ // Finally, make sure that the number of modules in the symbol table matches
+ // the number of modules in the bitcode file. If they differ, it may mean that
+ // the bitcode file was created by binary concatenation, so we need to create
+ // a new symbol table from scratch.
+ if (FC.TheReader.getNumModules() != BFC.Mods.size())
+    return upgrade(BFC.Mods);
+
+ return std::move(FC);
+}
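
For illustration (not part of the patch): the consumer-side flow. Because
readBitcode() falls back to upgrade() whenever the embedded table is missing,
stale, or inconsistent with the module list, callers only ever see a valid
reader. A sketch, with getBitcodeFileContents() from BitcodeReader.h:

    static Expected<size_t> numSymtabModules(MemoryBufferRef MB) {
      Expected<BitcodeFileContents> BFC = getBitcodeFileContents(MB);
      if (!BFC)
        return BFC.takeError();
      Expected<irsymtab::FileContents> FC = irsymtab::readBitcode(*BFC);
      if (!FC)
        return FC.takeError();
      return FC->TheReader.getNumModules();
    }
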
diff --git a/contrib/llvm/lib/Object/MachOObjectFile.cpp b/contrib/llvm/lib/Object/MachOObjectFile.cpp
index 5b01867..2e4da9f 100644
--- a/contrib/llvm/lib/Object/MachOObjectFile.cpp
+++ b/contrib/llvm/lib/Object/MachOObjectFile.cpp
@@ -1,4 +1,4 @@
-//===- MachOObjectFile.cpp - Mach-O object file binding ---------*- C++ -*-===//
+//===- MachOObjectFile.cpp - Mach-O object file binding -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,32 +12,52 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/MachO.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/SymbolicFile.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/MachO.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/raw_ostream.h"
-#include <cctype>
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
#include <cstring>
#include <limits>
#include <list>
+#include <memory>
+#include <string>
+#include <system_error>
using namespace llvm;
using namespace object;
namespace {
+
struct section_base {
char sectname[16];
char segname[16];
};
-}
+
+} // end anonymous namespace
static Error
malformedError(Twine Msg) {
@@ -106,13 +126,6 @@ static StringRef parseSegmentOrSectionName(const char *P) {
return StringRef(P, 16);
}
-// Helper to advance a section or symbol iterator multiple increments at a time.
-template<class T>
-static void advance(T &it, size_t Val) {
- while (Val--)
- ++it;
-}
-
static unsigned getCPUType(const MachOObjectFile &O) {
return O.getHeader().cputype;
}
@@ -368,7 +381,7 @@ static Error parseSegmentLoadCommand(
CmdName + " extends past the end of the file");
if (S.vmsize != 0 && S.filesize > S.vmsize)
return malformedError("load command " + Twine(LoadCommandIndex) +
- " fileoff field in " + CmdName +
+ " filesize field in " + CmdName +
" greater than vmsize field");
IsPageZeroSegment |= StringRef("__PAGEZERO").equals(S.segname);
} else
@@ -784,6 +797,52 @@ static Error checkVersCommand(const MachOObjectFile &Obj,
return Error::success();
}
+static Error checkNoteCommand(const MachOObjectFile &Obj,
+ const MachOObjectFile::LoadCommandInfo &Load,
+ uint32_t LoadCommandIndex,
+ std::list<MachOElement> &Elements) {
+ if (Load.C.cmdsize != sizeof(MachO::note_command))
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " LC_NOTE has incorrect cmdsize");
+ MachO::note_command Nt = getStruct<MachO::note_command>(Obj, Load.Ptr);
+ uint64_t FileSize = Obj.getData().size();
+ if (Nt.offset > FileSize)
+ return malformedError("offset field of LC_NOTE command " +
+ Twine(LoadCommandIndex) + " extends "
+ "past the end of the file");
+ uint64_t BigSize = Nt.offset;
+ BigSize += Nt.size;
+ if (BigSize > FileSize)
+ return malformedError("size field plus offset field of LC_NOTE command " +
+ Twine(LoadCommandIndex) + " extends past the end of "
+ "the file");
+ if (Error Err = checkOverlappingElement(Elements, Nt.offset, Nt.size,
+ "LC_NOTE data"))
+ return Err;
+ return Error::success();
+}
+
+static Error
+parseBuildVersionCommand(const MachOObjectFile &Obj,
+ const MachOObjectFile::LoadCommandInfo &Load,
+ SmallVectorImpl<const char*> &BuildTools,
+ uint32_t LoadCommandIndex) {
+ MachO::build_version_command BVC =
+ getStruct<MachO::build_version_command>(Obj, Load.Ptr);
+ if (Load.C.cmdsize !=
+ sizeof(MachO::build_version_command) +
+ BVC.ntools * sizeof(MachO::build_tool_version))
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " LC_BUILD_VERSION_COMMAND has incorrect cmdsize");
+
+ auto Start = Load.Ptr + sizeof(MachO::build_version_command);
+ BuildTools.resize(BVC.ntools);
+ for (unsigned i = 0; i < BVC.ntools; ++i)
+ BuildTools[i] = Start + i * sizeof(MachO::build_tool_version);
+
+ return Error::success();
+}
+
static Error checkRpathCommand(const MachOObjectFile &Obj,
const MachOObjectFile::LoadCommandInfo &Load,
uint32_t LoadCommandIndex) {
@@ -931,7 +990,26 @@ static Error checkThreadCommand(const MachOObjectFile &Obj,
sys::swapByteOrder(count);
state += sizeof(uint32_t);
- if (cputype == MachO::CPU_TYPE_X86_64) {
+ if (cputype == MachO::CPU_TYPE_I386) {
+ if (flavor == MachO::x86_THREAD_STATE32) {
+ if (count != MachO::x86_THREAD_STATE32_COUNT)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " count not x86_THREAD_STATE32_COUNT for "
+ "flavor number " + Twine(nflavor) + " which is "
+ "a x86_THREAD_STATE32 flavor in " + CmdName +
+ " command");
+ if (state + sizeof(MachO::x86_thread_state32_t) > end)
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " x86_THREAD_STATE32 extends past end of "
+ "command in " + CmdName + " command");
+ state += sizeof(MachO::x86_thread_state32_t);
+ } else {
+ return malformedError("load command " + Twine(LoadCommandIndex) +
+ " unknown flavor (" + Twine(flavor) + ") for "
+ "flavor number " + Twine(nflavor) + " in " +
+ CmdName + " command");
+ }
+ } else if (cputype == MachO::CPU_TYPE_X86_64) {
if (flavor == MachO::x86_THREAD_STATE64) {
if (count != MachO::x86_THREAD_STATE64_COUNT)
return malformedError("load command " + Twine(LoadCommandIndex) +
@@ -1086,11 +1164,7 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
bool Is64bits, Error &Err,
uint32_t UniversalCputype,
uint32_t UniversalIndex)
- : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object),
- SymtabLoadCmd(nullptr), DysymtabLoadCmd(nullptr),
- DataInCodeLoadCmd(nullptr), LinkOptHintsLoadCmd(nullptr),
- DyldInfoLoadCmd(nullptr), UuidLoadCmd(nullptr),
- HasPageZeroSegment(false) {
+ : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object) {
ErrorAsOutParameter ErrAsOutParam(&Err);
uint64_t SizeOfHeaders;
uint32_t cputype;
@@ -1280,6 +1354,12 @@ MachOObjectFile::MachOObjectFile(MemoryBufferRef Object, bool IsLittleEndian,
if ((Err = checkVersCommand(*this, Load, I, &VersLoadCmd,
"LC_VERSION_MIN_WATCHOS")))
return;
+ } else if (Load.C.cmd == MachO::LC_NOTE) {
+ if ((Err = checkNoteCommand(*this, Load, I, Elements)))
+ return;
+ } else if (Load.C.cmd == MachO::LC_BUILD_VERSION) {
+ if ((Err = parseBuildVersionCommand(*this, Load, BuildTools, I)))
+ return;
} else if (Load.C.cmd == MachO::LC_RPATH) {
if ((Err = checkRpathCommand(*this, Load, I)))
return;
@@ -1740,6 +1820,10 @@ uint64_t MachOObjectFile::getSectionAddress(DataRefImpl Sec) const {
return getSection(Sec).addr;
}
+uint64_t MachOObjectFile::getSectionIndex(DataRefImpl Sec) const {
+ return Sec.d.a;
+}
+
uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const {
  // In the case of a malformed Mach-O file where the section offset is past
// the end of the file or some part of the section size is past the end of
@@ -1867,13 +1951,29 @@ MachOObjectFile::section_rel_end(DataRefImpl Sec) const {
return relocation_iterator(RelocationRef(Ret, this));
}
+relocation_iterator MachOObjectFile::extrel_begin() const {
+ DataRefImpl Ret;
+ Ret.d.a = 0; // Would normally be a section index.
+ Ret.d.b = 0; // Index into the external relocations
+ return relocation_iterator(RelocationRef(Ret, this));
+}
+
+relocation_iterator MachOObjectFile::extrel_end() const {
+ MachO::dysymtab_command DysymtabLoadCmd = getDysymtabLoadCommand();
+ DataRefImpl Ret;
+ Ret.d.a = 0; // Would normally be a section index.
+ Ret.d.b = DysymtabLoadCmd.nextrel; // Index into the external relocations
+ return relocation_iterator(RelocationRef(Ret, this));
+}
+
void MachOObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
++Rel.d.b;
}
uint64_t MachOObjectFile::getRelocationOffset(DataRefImpl Rel) const {
- assert(getHeader().filetype == MachO::MH_OBJECT &&
- "Only implemented for MH_OBJECT");
+ assert((getHeader().filetype == MachO::MH_OBJECT ||
+ getHeader().filetype == MachO::MH_KEXT_BUNDLE) &&
+ "Only implemented for MH_OBJECT && MH_KEXT_BUNDLE");
MachO::any_relocation_info RE = getRelocation(Rel);
return getAnyRelocationAddress(RE);
}
@@ -2201,6 +2301,10 @@ std::error_code MachOObjectFile::getLibraryShortNameByIndex(unsigned Index,
return std::error_code();
}
+uint32_t MachOObjectFile::getLibraryCount() const {
+ return Libraries.size();
+}
+
section_iterator
MachOObjectFile::getRelocationRelocatedSection(relocation_iterator Rel) const {
DataRefImpl Sec;
@@ -2275,11 +2379,11 @@ StringRef MachOObjectFile::getFileFormatName() const {
unsigned CPUType = getCPUType(*this);
if (!is64Bit()) {
switch (CPUType) {
- case llvm::MachO::CPU_TYPE_I386:
+ case MachO::CPU_TYPE_I386:
return "Mach-O 32-bit i386";
- case llvm::MachO::CPU_TYPE_ARM:
+ case MachO::CPU_TYPE_ARM:
return "Mach-O arm";
- case llvm::MachO::CPU_TYPE_POWERPC:
+ case MachO::CPU_TYPE_POWERPC:
return "Mach-O 32-bit ppc";
default:
return "Mach-O 32-bit unknown";
@@ -2287,11 +2391,11 @@ StringRef MachOObjectFile::getFileFormatName() const {
}
switch (CPUType) {
- case llvm::MachO::CPU_TYPE_X86_64:
+ case MachO::CPU_TYPE_X86_64:
return "Mach-O 64-bit x86-64";
- case llvm::MachO::CPU_TYPE_ARM64:
+ case MachO::CPU_TYPE_ARM64:
return "Mach-O arm64";
- case llvm::MachO::CPU_TYPE_POWERPC64:
+ case MachO::CPU_TYPE_POWERPC64:
return "Mach-O 64-bit ppc64";
default:
return "Mach-O 64-bit unknown";
@@ -2300,17 +2404,17 @@ StringRef MachOObjectFile::getFileFormatName() const {
Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) {
switch (CPUType) {
- case llvm::MachO::CPU_TYPE_I386:
+ case MachO::CPU_TYPE_I386:
return Triple::x86;
- case llvm::MachO::CPU_TYPE_X86_64:
+ case MachO::CPU_TYPE_X86_64:
return Triple::x86_64;
- case llvm::MachO::CPU_TYPE_ARM:
+ case MachO::CPU_TYPE_ARM:
return Triple::arm;
- case llvm::MachO::CPU_TYPE_ARM64:
+ case MachO::CPU_TYPE_ARM64:
return Triple::aarch64;
- case llvm::MachO::CPU_TYPE_POWERPC:
+ case MachO::CPU_TYPE_POWERPC:
return Triple::ppc;
- case llvm::MachO::CPU_TYPE_POWERPC64:
+ case MachO::CPU_TYPE_POWERPC64:
return Triple::ppc64;
default:
return Triple::UnknownArch;
@@ -2383,6 +2487,8 @@ Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
*ArchFlag = "armv7em";
return Triple("thumbv7em-apple-darwin");
case MachO::CPU_SUBTYPE_ARM_V7K:
+ if (McpuDefault)
+ *McpuDefault = "cortex-a7";
if (ArchFlag)
*ArchFlag = "armv7k";
return Triple("armv7k-apple-darwin");
@@ -2393,6 +2499,8 @@ Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
*ArchFlag = "armv7m";
return Triple("thumbv7m-apple-darwin");
case MachO::CPU_SUBTYPE_ARM_V7S:
+ if (McpuDefault)
+ *McpuDefault = "cortex-a7";
if (ArchFlag)
*ArchFlag = "armv7s";
return Triple("armv7s-apple-darwin");
@@ -2402,6 +2510,8 @@ Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
case MachO::CPU_TYPE_ARM64:
switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
case MachO::CPU_SUBTYPE_ARM64_ALL:
+ if (McpuDefault)
+ *McpuDefault = "cyclone";
if (ArchFlag)
*ArchFlag = "arm64";
return Triple("arm64-apple-darwin");
@@ -2497,8 +2607,7 @@ dice_iterator MachOObjectFile::end_dices() const {
return dice_iterator(DiceRef(DRI, this));
}
-ExportEntry::ExportEntry(ArrayRef<uint8_t> T)
- : Trie(T), Malformed(false), Done(false) {}
+ExportEntry::ExportEntry(ArrayRef<uint8_t> T) : Trie(T) {}
void ExportEntry::moveToFirst() {
pushNode(0);
@@ -2567,9 +2676,7 @@ uint32_t ExportEntry::nodeOffset() const {
}
ExportEntry::NodeState::NodeState(const uint8_t *Ptr)
- : Start(Ptr), Current(Ptr), Flags(0), Address(0), Other(0),
- ImportName(nullptr), ChildCount(0), NextChildIndex(0),
- ParentStringLength(0), IsExportNode(false) {}
+ : Start(Ptr), Current(Ptr) {}
void ExportEntry::pushNode(uint64_t offset) {
const uint8_t *Ptr = Trie.begin() + offset;
@@ -2659,7 +2766,7 @@ void ExportEntry::moveNext() {
iterator_range<export_iterator>
MachOObjectFile::exports(ArrayRef<uint8_t> Trie) {
ExportEntry Start(Trie);
- if (Trie.size() == 0)
+ if (Trie.empty())
Start.moveToEnd();
else
Start.moveToFirst();
@@ -2674,10 +2781,10 @@ iterator_range<export_iterator> MachOObjectFile::exports() const {
return exports(getDyldInfoExportsTrie());
}
-MachORebaseEntry::MachORebaseEntry(ArrayRef<uint8_t> Bytes, bool is64Bit)
- : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
- RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0),
- PointerSize(is64Bit ? 8 : 4), Malformed(false), Done(false) {}
+MachORebaseEntry::MachORebaseEntry(Error *E, const MachOObjectFile *O,
+ ArrayRef<uint8_t> Bytes, bool is64Bit)
+ : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()),
+ PointerSize(is64Bit ? 8 : 4) {}
void MachORebaseEntry::moveToFirst() {
Ptr = Opcodes.begin();
@@ -2691,122 +2798,307 @@ void MachORebaseEntry::moveToEnd() {
}
void MachORebaseEntry::moveNext() {
+ ErrorAsOutParameter ErrAsOutParam(E);
// If in the middle of some loop, move to next rebasing in loop.
SegmentOffset += AdvanceAmount;
if (RemainingLoopCount) {
--RemainingLoopCount;
return;
}
+ // REBASE_OPCODE_DONE is only used for padding if we are not aligned to
+ // pointer size. Therefore it is possible to reach the end without ever having
+ // seen REBASE_OPCODE_DONE.
if (Ptr == Opcodes.end()) {
Done = true;
return;
}
bool More = true;
- while (More && !Malformed) {
+ while (More) {
// Parse next opcode and set up next loop.
+ const uint8_t *OpcodeStart = Ptr;
uint8_t Byte = *Ptr++;
uint8_t ImmValue = Byte & MachO::REBASE_IMMEDIATE_MASK;
uint8_t Opcode = Byte & MachO::REBASE_OPCODE_MASK;
+ uint32_t Count, Skip;
+ const char *error = nullptr;
switch (Opcode) {
case MachO::REBASE_OPCODE_DONE:
More = false;
Done = true;
moveToEnd();
- DEBUG_WITH_TYPE("mach-o-rebase", llvm::dbgs() << "REBASE_OPCODE_DONE\n");
+ DEBUG_WITH_TYPE("mach-o-rebase", dbgs() << "REBASE_OPCODE_DONE\n");
break;
case MachO::REBASE_OPCODE_SET_TYPE_IMM:
RebaseType = ImmValue;
+ if (RebaseType > MachO::REBASE_TYPE_TEXT_PCREL32) {
+ *E = malformedError("for REBASE_OPCODE_SET_TYPE_IMM bad bind type: " +
+ Twine((int)RebaseType) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_SET_TYPE_IMM: "
- << "RebaseType=" << (int) RebaseType << "\n");
+ dbgs() << "REBASE_OPCODE_SET_TYPE_IMM: "
+ << "RebaseType=" << (int) RebaseType << "\n");
break;
case MachO::REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
SegmentIndex = ImmValue;
- SegmentOffset = readULEB128();
+ SegmentOffset = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
- << "SegmentIndex=" << SegmentIndex << ", "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << "\n");
+ dbgs() << "REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
+ << "SegmentIndex=" << SegmentIndex << ", "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << "\n");
break;
case MachO::REBASE_OPCODE_ADD_ADDR_ULEB:
- SegmentOffset += readULEB128();
+ SegmentOffset += readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_ULEB: "
- << format("SegmentOffset=0x%06X",
- SegmentOffset) << "\n");
+ dbgs() << "REBASE_OPCODE_ADD_ADDR_ULEB: "
+ << format("SegmentOffset=0x%06X",
+ SegmentOffset) << "\n");
break;
case MachO::REBASE_OPCODE_ADD_ADDR_IMM_SCALED:
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_IMM_SCALED " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
SegmentOffset += ImmValue * PointerSize;
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ false);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_ADD_ADDR_IMM_SCALED "
+ " (after adding immediate times the pointer size) " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: "
- << format("SegmentOffset=0x%06X",
- SegmentOffset) << "\n");
+ dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: "
+ << format("SegmentOffset=0x%06X",
+ SegmentOffset) << "\n");
break;
case MachO::REBASE_OPCODE_DO_REBASE_IMM_TIMES:
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_IMM_TIMES " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
AdvanceAmount = PointerSize;
- RemainingLoopCount = ImmValue - 1;
+ Skip = 0;
+ Count = ImmValue;
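+ // An immediate count of zero would otherwise wrap around to a huge
+ // unsigned RemainingLoopCount, so clamp it explicitly.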
+ if (ImmValue != 0)
+ RemainingLoopCount = ImmValue - 1;
+ else
+ RemainingLoopCount = 0;
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_IMM_TIMES "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_IMM_TIMES: "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << ", AdvanceAmount=" << AdvanceAmount
- << ", RemainingLoopCount=" << RemainingLoopCount
- << "\n");
+ dbgs() << "REBASE_OPCODE_DO_REBASE_IMM_TIMES: "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << ", AdvanceAmount=" << AdvanceAmount
+ << ", RemainingLoopCount=" << RemainingLoopCount
+ << "\n");
return;
case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES:
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
AdvanceAmount = PointerSize;
- RemainingLoopCount = readULEB128() - 1;
+ Skip = 0;
+ Count = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (Count != 0)
+ RemainingLoopCount = Count - 1;
+ else
+ RemainingLoopCount = 0;
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES: "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << ", AdvanceAmount=" << AdvanceAmount
- << ", RemainingLoopCount=" << RemainingLoopCount
- << "\n");
+ dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES: "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << ", AdvanceAmount=" << AdvanceAmount
+ << ", RemainingLoopCount=" << RemainingLoopCount
+ << "\n");
return;
case MachO::REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB:
- AdvanceAmount = readULEB128() + PointerSize;
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Skip = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ AdvanceAmount = Skip + PointerSize;
+ Count = 1;
RemainingLoopCount = 0;
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << ", AdvanceAmount=" << AdvanceAmount
- << ", RemainingLoopCount=" << RemainingLoopCount
- << "\n");
+ dbgs() << "REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << ", AdvanceAmount=" << AdvanceAmount
+ << ", RemainingLoopCount=" << RemainingLoopCount
+ << "\n");
return;
case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB:
- RemainingLoopCount = readULEB128() - 1;
- AdvanceAmount = readULEB128() + PointerSize;
+ error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+ true);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Count = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (Count != 0)
+ RemainingLoopCount = Count - 1;
+ else
+ RemainingLoopCount = 0;
+ Skip = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ AdvanceAmount = Skip + PointerSize;
+
+ error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+ "ULEB " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-rebase",
- llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << ", AdvanceAmount=" << AdvanceAmount
- << ", RemainingLoopCount=" << RemainingLoopCount
- << "\n");
+ dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << ", AdvanceAmount=" << AdvanceAmount
+ << ", RemainingLoopCount=" << RemainingLoopCount
+ << "\n");
return;
default:
- Malformed = true;
+ *E = malformedError("bad rebase info (bad opcode value 0x" +
+ utohexstr(Opcode) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
}
}
}
-uint64_t MachORebaseEntry::readULEB128() {
+uint64_t MachORebaseEntry::readULEB128(const char **error) {
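+ // Decode one ULEB128 from the opcode stream; malformed encodings (for
+ // example, ones that run past the end of the opcodes) are reported
+ // through *error, and Ptr is clamped so later reads stay in bounds.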
unsigned Count;
- uint64_t Result = decodeULEB128(Ptr, &Count);
+ uint64_t Result = decodeULEB128(Ptr, &Count, Opcodes.end(), error);
Ptr += Count;
- if (Ptr > Opcodes.end()) {
+ if (Ptr > Opcodes.end())
Ptr = Opcodes.end();
- Malformed = true;
- }
return Result;
}
-uint32_t MachORebaseEntry::segmentIndex() const { return SegmentIndex; }
+int32_t MachORebaseEntry::segmentIndex() const { return SegmentIndex; }
uint64_t MachORebaseEntry::segmentOffset() const { return SegmentOffset; }
@@ -2822,6 +3114,24 @@ StringRef MachORebaseEntry::typeName() const {
return "unknown";
}
+// For use with the SegIndex of a checked Mach-O Rebase entry
+// to get the segment name.
+StringRef MachORebaseEntry::segmentName() const {
+ return O->BindRebaseSegmentName(SegmentIndex);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Rebase entry
+// to get the section name.
+StringRef MachORebaseEntry::sectionName() const {
+ return O->BindRebaseSectionName(SegmentIndex, SegmentOffset);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Rebase entry
+// to get the address.
+uint64_t MachORebaseEntry::address() const {
+ return O->BindRebaseAddress(SegmentIndex, SegmentOffset);
+}
+
bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
@@ -2834,25 +3144,27 @@ bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
}
iterator_range<rebase_iterator>
-MachOObjectFile::rebaseTable(ArrayRef<uint8_t> Opcodes, bool is64) {
- MachORebaseEntry Start(Opcodes, is64);
+MachOObjectFile::rebaseTable(Error &Err, MachOObjectFile *O,
+ ArrayRef<uint8_t> Opcodes, bool is64) {
+ if (O->BindRebaseSectionTable == nullptr)
+ O->BindRebaseSectionTable = llvm::make_unique<BindRebaseSegInfo>(O);
+ MachORebaseEntry Start(&Err, O, Opcodes, is64);
Start.moveToFirst();
- MachORebaseEntry Finish(Opcodes, is64);
+ MachORebaseEntry Finish(&Err, O, Opcodes, is64);
Finish.moveToEnd();
return make_range(rebase_iterator(Start), rebase_iterator(Finish));
}
-iterator_range<rebase_iterator> MachOObjectFile::rebaseTable() const {
- return rebaseTable(getDyldInfoRebaseOpcodes(), is64Bit());
+iterator_range<rebase_iterator> MachOObjectFile::rebaseTable(Error &Err) {
+ return rebaseTable(Err, this, getDyldInfoRebaseOpcodes(), is64Bit());
}
-MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK)
- : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
- Ordinal(0), Flags(0), Addend(0), RemainingLoopCount(0), AdvanceAmount(0),
- BindType(0), PointerSize(is64Bit ? 8 : 4),
- TableKind(BK), Malformed(false), Done(false) {}
+MachOBindEntry::MachOBindEntry(Error *E, const MachOObjectFile *O,
+ ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK)
+ : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()),
+ PointerSize(is64Bit ? 8 : 4), TableKind(BK) {}
void MachOBindEntry::moveToFirst() {
Ptr = Opcodes.begin();
@@ -2866,24 +3178,31 @@ void MachOBindEntry::moveToEnd() {
}
void MachOBindEntry::moveNext() {
+ ErrorAsOutParameter ErrAsOutParam(E);
// If in the middle of some loop, move to next binding in loop.
SegmentOffset += AdvanceAmount;
if (RemainingLoopCount) {
--RemainingLoopCount;
return;
}
+ // BIND_OPCODE_DONE is only used for padding if we are not aligned to
+ // pointer size. Therefore it is possible to reach the end without ever having
+ // seen BIND_OPCODE_DONE.
if (Ptr == Opcodes.end()) {
Done = true;
return;
}
bool More = true;
- while (More && !Malformed) {
+ while (More) {
// Parse next opcode and set up next loop.
+ const uint8_t *OpcodeStart = Ptr;
uint8_t Byte = *Ptr++;
uint8_t ImmValue = Byte & MachO::BIND_IMMEDIATE_MASK;
uint8_t Opcode = Byte & MachO::BIND_OPCODE_MASK;
int8_t SignExtended;
const uint8_t *SymStart;
+ uint32_t Count, Skip;
+ const char *error = nullptr;
switch (Opcode) {
case MachO::BIND_OPCODE_DONE:
if (TableKind == Kind::Lazy) {
@@ -2899,48 +3218,108 @@ void MachOBindEntry::moveNext() {
break;
}
More = false;
- Done = true;
moveToEnd();
- DEBUG_WITH_TYPE("mach-o-bind", llvm::dbgs() << "BIND_OPCODE_DONE\n");
+ DEBUG_WITH_TYPE("mach-o-bind", dbgs() << "BIND_OPCODE_DONE\n");
break;
case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_IMM:
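+ // Weak binds are resolved by symbol name across all loaded images, so
+ // dylib ordinals have no meaning in the weak bind table.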
+ if (TableKind == Kind::Weak) {
+ *E = malformedError("BIND_OPCODE_SET_DYLIB_ORDINAL_IMM not allowed in "
+ "weak bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
Ordinal = ImmValue;
+ LibraryOrdinalSet = true;
+ if (ImmValue > O->getLibraryCount()) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad "
+ "library ordinal: " + Twine((int)ImmValue) + " (max " +
+ Twine((int)O->getLibraryCount()) + ") for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: "
- << "Ordinal=" << Ordinal << "\n");
+ dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: "
+ << "Ordinal=" << Ordinal << "\n");
break;
case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB:
- Ordinal = readULEB128();
+ if (TableKind == Kind::Weak) {
+ *E = malformedError("BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB not allowed in "
+ "weak bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Ordinal = readULEB128(&error);
+ LibraryOrdinalSet = true;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (Ordinal > (int)O->getLibraryCount()) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad "
+ "library ordinal: " + Twine((int)Ordinal) + " (max " +
+ Twine((int)O->getLibraryCount()) + ") for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: "
- << "Ordinal=" << Ordinal << "\n");
+ dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: "
+ << "Ordinal=" << Ordinal << "\n");
break;
case MachO::BIND_OPCODE_SET_DYLIB_SPECIAL_IMM:
+ if (TableKind == Kind::Weak) {
+ *E = malformedError("BIND_OPCODE_SET_DYLIB_SPECIAL_IMM not allowed in "
+ "weak bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
if (ImmValue) {
SignExtended = MachO::BIND_OPCODE_MASK | ImmValue;
Ordinal = SignExtended;
+ if (Ordinal < MachO::BIND_SPECIAL_DYLIB_FLAT_LOOKUP) {
+ *E = malformedError("for BIND_OPCODE_SET_DYLIB_SPECIAL_IMM unknown "
+ "special ordinal: " + Twine((int)Ordinal) + " for opcode at: "
+ "0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
} else
Ordinal = 0;
+ LibraryOrdinalSet = true;
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: "
- << "Ordinal=" << Ordinal << "\n");
+ dbgs() << "BIND_OPCODE_SET_DYLIB_SPECIAL_IMM: "
+ << "Ordinal=" << Ordinal << "\n");
break;
case MachO::BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM:
Flags = ImmValue;
SymStart = Ptr;
- while (*Ptr) {
+ while (*Ptr && (Ptr < Opcodes.end())) {
++Ptr;
}
+ if (Ptr == Opcodes.end()) {
+ *E = malformedError("for BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM "
+ "symbol name extends past opcodes for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
SymbolName = StringRef(reinterpret_cast<const char*>(SymStart),
Ptr-SymStart);
++Ptr;
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM: "
- << "SymbolName=" << SymbolName << "\n");
+ dbgs() << "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM: "
+ << "SymbolName=" << SymbolName << "\n");
if (TableKind == Kind::Weak) {
if (ImmValue & MachO::BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION)
return;
@@ -2948,111 +3327,311 @@ void MachOBindEntry::moveNext() {
break;
case MachO::BIND_OPCODE_SET_TYPE_IMM:
BindType = ImmValue;
+ if (ImmValue > MachO::BIND_TYPE_TEXT_PCREL32) {
+ *E = malformedError("for BIND_OPCODE_SET_TYPE_IMM bad bind type: " +
+ Twine((int)ImmValue) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_SET_TYPE_IMM: "
- << "BindType=" << (int)BindType << "\n");
+ dbgs() << "BIND_OPCODE_SET_TYPE_IMM: "
+ << "BindType=" << (int)BindType << "\n");
break;
case MachO::BIND_OPCODE_SET_ADDEND_SLEB:
- Addend = readSLEB128();
- if (TableKind == Kind::Lazy)
- Malformed = true;
+ Addend = readSLEB128(&error);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_ADDEND_SLEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_SET_ADDEND_SLEB: "
- << "Addend=" << Addend << "\n");
+ dbgs() << "BIND_OPCODE_SET_ADDEND_SLEB: "
+ << "Addend=" << Addend << "\n");
break;
case MachO::BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
SegmentIndex = ImmValue;
- SegmentOffset = readULEB128();
+ SegmentOffset = readULEB128(&error);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
- << "SegmentIndex=" << SegmentIndex << ", "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << "\n");
+ dbgs() << "BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
+ << "SegmentIndex=" << SegmentIndex << ", "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << "\n");
break;
case MachO::BIND_OPCODE_ADD_ADDR_ULEB:
- SegmentOffset += readULEB128();
+ SegmentOffset += readULEB128(&error);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_ADD_ADDR_ULEB: "
- << format("SegmentOffset=0x%06X",
- SegmentOffset) << "\n");
+ dbgs() << "BIND_OPCODE_ADD_ADDR_ULEB: "
+ << format("SegmentOffset=0x%06X",
+ SegmentOffset) << "\n");
break;
case MachO::BIND_OPCODE_DO_BIND:
AdvanceAmount = PointerSize;
RemainingLoopCount = 0;
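+ // A bind can only be performed once a segment/offset, a symbol name,
+ // and (for non-weak tables) a dylib ordinal have all been set by
+ // earlier opcodes; check all three before emitting this entry.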
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND " + Twine(error) +
+ " for opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND missing preceding "
+ "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND missing preceding "
+ "BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_DO_BIND: "
- << format("SegmentOffset=0x%06X",
- SegmentOffset) << "\n");
+ dbgs() << "BIND_OPCODE_DO_BIND: "
+ << format("SegmentOffset=0x%06X",
+ SegmentOffset) << "\n");
return;
case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB:
- AdvanceAmount = readULEB128() + PointerSize;
+ if (TableKind == Kind::Lazy) {
+ *E = malformedError("BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB not allowed in "
+ "lazy bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing "
+ "preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for opcode "
+ "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing "
+ "preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ AdvanceAmount = readULEB128(&error) + PointerSize;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ // Note: this is not really an error until the next bind, but it makes no
+ // sense for a BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB not to be followed by
+ // another bind operation.
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset +
+ AdvanceAmount, false);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB (after adding "
+ "ULEB) " + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
RemainingLoopCount = 0;
- if (TableKind == Kind::Lazy)
- Malformed = true;
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << ", AdvanceAmount=" << AdvanceAmount
- << ", RemainingLoopCount=" << RemainingLoopCount
- << "\n");
+ dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << ", AdvanceAmount=" << AdvanceAmount
+ << ", RemainingLoopCount=" << RemainingLoopCount
+ << "\n");
return;
case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED:
+ if (TableKind == Kind::Lazy) {
+ *E = malformedError("BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED not "
+ "allowed in lazy bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+ "missing preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for "
+ "opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+ "missing preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode "
+ "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
AdvanceAmount = ImmValue * PointerSize + PointerSize;
RemainingLoopCount = 0;
- if (TableKind == Kind::Lazy)
- Malformed = true;
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset +
+ AdvanceAmount, false);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+ " (after adding immediate times the pointer size) " +
+ Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE("mach-o-bind",
- llvm::dbgs()
+ dbgs()
<< "BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED: "
- << format("SegmentOffset=0x%06X",
- SegmentOffset) << "\n");
+ << format("SegmentOffset=0x%06X", SegmentOffset) << "\n");
return;
case MachO::BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB:
- RemainingLoopCount = readULEB128() - 1;
- AdvanceAmount = readULEB128() + PointerSize;
- if (TableKind == Kind::Lazy)
- Malformed = true;
+ if (TableKind == Kind::Lazy) {
+ *E = malformedError("BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB not "
+ "allowed in lazy bind table for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Count = readULEB128(&error);
+ if (Count != 0)
+ RemainingLoopCount = Count - 1;
+ else
+ RemainingLoopCount = 0;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ " (count value) " + Twine(error) + " for opcode at"
+ ": 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ Skip = readULEB128(&error);
+ AdvanceAmount = Skip + PointerSize;
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ " (skip value) " + Twine(error) + " for opcode at"
+ ": 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (SymbolName == StringRef()) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ "missing preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for "
+ "opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ "missing preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode "
+ "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
+ error = O->BindEntryCheckCountAndSkip(Count, Skip, PointerSize,
+ SegmentIndex, SegmentOffset);
+ if (error) {
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+ + Twine(error) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
+ }
DEBUG_WITH_TYPE(
"mach-o-bind",
- llvm::dbgs() << "BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: "
- << format("SegmentOffset=0x%06X", SegmentOffset)
- << ", AdvanceAmount=" << AdvanceAmount
- << ", RemainingLoopCount=" << RemainingLoopCount
- << "\n");
+ dbgs() << "BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: "
+ << format("SegmentOffset=0x%06X", SegmentOffset)
+ << ", AdvanceAmount=" << AdvanceAmount
+ << ", RemainingLoopCount=" << RemainingLoopCount
+ << "\n");
return;
default:
- Malformed = true;
+ *E = malformedError("bad bind info (bad opcode value 0x" +
+ utohexstr(Opcode) + " for opcode at: 0x" +
+ utohexstr(OpcodeStart - Opcodes.begin()));
+ moveToEnd();
+ return;
}
}
}
-uint64_t MachOBindEntry::readULEB128() {
+uint64_t MachOBindEntry::readULEB128(const char **error) {
unsigned Count;
- uint64_t Result = decodeULEB128(Ptr, &Count);
+ uint64_t Result = decodeULEB128(Ptr, &Count, Opcodes.end(), error);
Ptr += Count;
- if (Ptr > Opcodes.end()) {
+ if (Ptr > Opcodes.end())
Ptr = Opcodes.end();
- Malformed = true;
- }
return Result;
}
-int64_t MachOBindEntry::readSLEB128() {
+int64_t MachOBindEntry::readSLEB128(const char **error) {
unsigned Count;
- int64_t Result = decodeSLEB128(Ptr, &Count);
+ int64_t Result = decodeSLEB128(Ptr, &Count, Opcodes.end(), error);
Ptr += Count;
- if (Ptr > Opcodes.end()) {
+ if (Ptr > Opcodes.end())
Ptr = Opcodes.end();
- Malformed = true;
- }
return Result;
}
-uint32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; }
+int32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; }
uint64_t MachOBindEntry::segmentOffset() const { return SegmentOffset; }
@@ -3076,6 +3655,24 @@ uint32_t MachOBindEntry::flags() const { return Flags; }
int MachOBindEntry::ordinal() const { return Ordinal; }
+// For use with the SegIndex of a checked Mach-O Bind entry
+// to get the segment name.
+StringRef MachOBindEntry::segmentName() const {
+ return O->BindRebaseSegmentName(SegmentIndex);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind entry
+// to get the section name.
+StringRef MachOBindEntry::sectionName() const {
+ return O->BindRebaseSectionName(SegmentIndex, SegmentOffset);
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind entry
+// to get the address.
+uint64_t MachOBindEntry::address() const {
+ return O->BindRebaseAddress(SegmentIndex, SegmentOffset);
+}
+
bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
@@ -3087,30 +3684,149 @@ bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
(Done == Other.Done);
}
+// Build table of sections so SegIndex/SegOffset pairs can be translated.
+BindRebaseSegInfo::BindRebaseSegInfo(const object::MachOObjectFile *Obj) {
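+ // __PAGEZERO contains no sections but still occupies segment index 0,
+ // so when it is present the first section-bearing segment has index 1.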
+ uint32_t CurSegIndex = Obj->hasPageZeroSegment() ? 1 : 0;
+ StringRef CurSegName;
+ uint64_t CurSegAddress;
+ for (const SectionRef &Section : Obj->sections()) {
+ SectionInfo Info;
+ Section.getName(Info.SectionName);
+ Info.Address = Section.getAddress();
+ Info.Size = Section.getSize();
+ Info.SegmentName =
+ Obj->getSectionFinalSegmentName(Section.getRawDataRefImpl());
+ if (!Info.SegmentName.equals(CurSegName)) {
+ ++CurSegIndex;
+ CurSegName = Info.SegmentName;
+ CurSegAddress = Info.Address;
+ }
+ Info.SegmentIndex = CurSegIndex - 1;
+ Info.OffsetInSegment = Info.Address - CurSegAddress;
+ Info.SegmentStartAddress = CurSegAddress;
+ Sections.push_back(Info);
+ }
+ MaxSegIndex = CurSegIndex;
+}
+
+// For use with a SegIndex,SegOffset pair in MachOBindEntry::moveNext() or
+// MachORebaseEntry::moveNext() to validate a MachOBindEntry or
+// MachORebaseEntry.
+const char * BindRebaseSegInfo::checkSegAndOffset(int32_t SegIndex,
+ uint64_t SegOffset,
+ bool endInvalid) {
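+ // endInvalid selects whether an offset exactly at the end of a section
+ // is rejected: a location about to be fixed up must lie strictly inside
+ // a section, but an offset left one past the end by a previous advance
+ // is still acceptable.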
+ if (SegIndex == -1)
+ return "missing preceding *_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB";
+ if (SegIndex >= MaxSegIndex)
+ return "bad segIndex (too large)";
+ for (const SectionInfo &SI : Sections) {
+ if (SI.SegmentIndex != SegIndex)
+ continue;
+ if (SI.OffsetInSegment > SegOffset)
+ continue;
+ if (SegOffset > (SI.OffsetInSegment + SI.Size))
+ continue;
+ if (endInvalid && SegOffset >= (SI.OffsetInSegment + SI.Size))
+ continue;
+ return nullptr;
+ }
+ return "bad segOffset, too large";
+}
+
+// For use in MachOBindEntry::moveNext() to validate a MachOBindEntry for
+// the BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB opcode and for use in
+// MachORebaseEntry::moveNext() to validate a MachORebaseEntry for
+// REBASE_OPCODE_DO_*_TIMES* opcodes. The SegIndex and SegOffset must have
+// been already checked.
+const char * BindRebaseSegInfo::checkCountAndSkip(uint32_t Count, uint32_t Skip,
+ uint8_t PointerSize,
+ int32_t SegIndex,
+ uint64_t SegOffset) {
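+ // Walk the loop arithmetic forward from the already-validated starting
+ // offset; if the final location falls outside this section, rebase
+ // opcodes may still legally step into an adjacent section, which the
+ // checkSegAndOffset() call below re-validates.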
+ const SectionInfo &SI = findSection(SegIndex, SegOffset);
+ uint64_t addr = SI.SegmentStartAddress + SegOffset;
+ if (addr >= SI.Address + SI.Size)
+ return "bad segOffset, too large";
+ uint64_t i = 0;
+ if (Count > 1)
+ i = (Skip + PointerSize) * (Count - 1);
+ else if (Count == 1)
+ i = Skip + PointerSize;
+ if (addr + i >= SI.Address + SI.Size) {
+ // For rebase opcodes they can step from one section to another.
+ uint64_t TrailingSegOffset = (addr + i) - SI.SegmentStartAddress;
+ const char *error = checkSegAndOffset(SegIndex, TrailingSegOffset, false);
+ if (error)
+ return "bad count and skip, too large";
+ }
+ return nullptr;
+}
+
+// For use with the SegIndex of a checked Mach-O Bind or Rebase entry
+// to get the segment name.
+StringRef BindRebaseSegInfo::segmentName(int32_t SegIndex) {
+ for (const SectionInfo &SI : Sections) {
+ if (SI.SegmentIndex == SegIndex)
+ return SI.SegmentName;
+ }
+ llvm_unreachable("invalid SegIndex");
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or Rebase
+// to get the SectionInfo.
+const BindRebaseSegInfo::SectionInfo &BindRebaseSegInfo::findSection(
+ int32_t SegIndex, uint64_t SegOffset) {
+ for (const SectionInfo &SI : Sections) {
+ if (SI.SegmentIndex != SegIndex)
+ continue;
+ if (SI.OffsetInSegment > SegOffset)
+ continue;
+ if (SegOffset >= (SI.OffsetInSegment + SI.Size))
+ continue;
+ return SI;
+ }
+ llvm_unreachable("SegIndex and SegOffset not in any section");
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or Rebase
+// entry to get the section name.
+StringRef BindRebaseSegInfo::sectionName(int32_t SegIndex,
+ uint64_t SegOffset) {
+ return findSection(SegIndex, SegOffset).SectionName;
+}
+
+// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or Rebase
+// entry to get the address.
+uint64_t BindRebaseSegInfo::address(uint32_t SegIndex, uint64_t OffsetInSeg) {
+ const SectionInfo &SI = findSection(SegIndex, OffsetInSeg);
+ return SI.SegmentStartAddress + OffsetInSeg;
+}
+
iterator_range<bind_iterator>
-MachOObjectFile::bindTable(ArrayRef<uint8_t> Opcodes, bool is64,
+MachOObjectFile::bindTable(Error &Err, MachOObjectFile *O,
+ ArrayRef<uint8_t> Opcodes, bool is64,
MachOBindEntry::Kind BKind) {
- MachOBindEntry Start(Opcodes, is64, BKind);
+ if (O->BindRebaseSectionTable == nullptr)
+ O->BindRebaseSectionTable = llvm::make_unique<BindRebaseSegInfo>(O);
+ MachOBindEntry Start(&Err, O, Opcodes, is64, BKind);
Start.moveToFirst();
- MachOBindEntry Finish(Opcodes, is64, BKind);
+ MachOBindEntry Finish(&Err, O, Opcodes, is64, BKind);
Finish.moveToEnd();
return make_range(bind_iterator(Start), bind_iterator(Finish));
}
-iterator_range<bind_iterator> MachOObjectFile::bindTable() const {
- return bindTable(getDyldInfoBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::bindTable(Error &Err) {
+ return bindTable(Err, this, getDyldInfoBindOpcodes(), is64Bit(),
MachOBindEntry::Kind::Regular);
}
-iterator_range<bind_iterator> MachOObjectFile::lazyBindTable() const {
- return bindTable(getDyldInfoLazyBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::lazyBindTable(Error &Err) {
+ return bindTable(Err, this, getDyldInfoLazyBindOpcodes(), is64Bit(),
MachOBindEntry::Kind::Lazy);
}
-iterator_range<bind_iterator> MachOObjectFile::weakBindTable() const {
- return bindTable(getDyldInfoWeakBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::weakBindTable(Error &Err) {
+ return bindTable(Err, this, getDyldInfoWeakBindOpcodes(), is64Bit(),
MachOBindEntry::Kind::Weak);
}
@@ -3289,6 +4005,21 @@ MachOObjectFile::getVersionMinLoadCommand(const LoadCommandInfo &L) const {
return getStruct<MachO::version_min_command>(*this, L.Ptr);
}
+MachO::note_command
+MachOObjectFile::getNoteLoadCommand(const LoadCommandInfo &L) const {
+ return getStruct<MachO::note_command>(*this, L.Ptr);
+}
+
+MachO::build_version_command
+MachOObjectFile::getBuildVersionLoadCommand(const LoadCommandInfo &L) const {
+ return getStruct<MachO::build_version_command>(*this, L.Ptr);
+}
+
+MachO::build_tool_version
+MachOObjectFile::getBuildToolVersion(unsigned index) const {
+ return getStruct<MachO::build_tool_version>(*this, BuildTools[index]);
+}
+
MachO::dylib_command
MachOObjectFile::getDylibIDLoadCommand(const LoadCommandInfo &L) const {
return getStruct<MachO::dylib_command>(*this, L.Ptr);
@@ -3371,15 +4102,20 @@ MachOObjectFile::getThreadCommand(const LoadCommandInfo &L) const {
MachO::any_relocation_info
MachOObjectFile::getRelocation(DataRefImpl Rel) const {
- DataRefImpl Sec;
- Sec.d.a = Rel.d.a;
uint32_t Offset;
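+ // In MH_OBJECT files relocations hang off each section; linked images
+ // instead describe them in the dysymtab load command, from which the
+ // external relocations are read here.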
- if (is64Bit()) {
- MachO::section_64 Sect = getSection64(Sec);
- Offset = Sect.reloff;
+ if (getHeader().filetype == MachO::MH_OBJECT) {
+ DataRefImpl Sec;
+ Sec.d.a = Rel.d.a;
+ if (is64Bit()) {
+ MachO::section_64 Sect = getSection64(Sec);
+ Offset = Sect.reloff;
+ } else {
+ MachO::section Sect = getSection(Sec);
+ Offset = Sect.reloff;
+ }
} else {
- MachO::section Sect = getSection(Sec);
- Offset = Sect.reloff;
+ MachO::dysymtab_command DysymtabLoadCmd = getDysymtabLoadCommand();
+ Offset = DysymtabLoadCmd.extreloff; // Offset to the external relocations
}
auto P = reinterpret_cast<const MachO::any_relocation_info *>(
@@ -3599,3 +4335,9 @@ ObjectFile::createMachOObjectFile(MemoryBufferRef Buffer,
return make_error<GenericBinaryError>("Unrecognized MachO magic number",
object_error::invalid_file_type);
}
+
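+// Mach-O section names are limited to 16 characters, so longer DWARF
+// section names are truncated on disk; map the truncated forms back to
+// the full names that DWARF consumers expect.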
+StringRef MachOObjectFile::mapDebugSectionName(StringRef Name) const {
+ return StringSwitch<StringRef>(Name)
+ .Case("debug_str_offs", "debug_str_offsets")
+ .Default(Name);
+}
diff --git a/contrib/llvm/lib/Object/ModuleSummaryIndexObjectFile.cpp b/contrib/llvm/lib/Object/ModuleSummaryIndexObjectFile.cpp
deleted file mode 100644
index 11ace84..0000000
--- a/contrib/llvm/lib/Object/ModuleSummaryIndexObjectFile.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//===- ModuleSummaryIndexObjectFile.cpp - Summary index file implementation ==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Part of the ModuleSummaryIndexObjectFile class implementation.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-using namespace object;
-
-static llvm::cl::opt<bool> IgnoreEmptyThinLTOIndexFile(
- "ignore-empty-index-file", llvm::cl::ZeroOrMore,
- llvm::cl::desc(
- "Ignore an empty index file and perform non-ThinLTO compilation"),
- llvm::cl::init(false));
-
-ModuleSummaryIndexObjectFile::ModuleSummaryIndexObjectFile(
- MemoryBufferRef Object, std::unique_ptr<ModuleSummaryIndex> I)
- : SymbolicFile(Binary::ID_ModuleSummaryIndex, Object), Index(std::move(I)) {
-}
-
-ModuleSummaryIndexObjectFile::~ModuleSummaryIndexObjectFile() {}
-
-std::unique_ptr<ModuleSummaryIndex> ModuleSummaryIndexObjectFile::takeIndex() {
- return std::move(Index);
-}
-
-ErrorOr<MemoryBufferRef>
-ModuleSummaryIndexObjectFile::findBitcodeInObject(const ObjectFile &Obj) {
- for (const SectionRef &Sec : Obj.sections()) {
- if (Sec.isBitcode()) {
- StringRef SecContents;
- if (std::error_code EC = Sec.getContents(SecContents))
- return EC;
- return MemoryBufferRef(SecContents, Obj.getFileName());
- }
- }
-
- return object_error::bitcode_section_not_found;
-}
-
-ErrorOr<MemoryBufferRef>
-ModuleSummaryIndexObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) {
- sys::fs::file_magic Type = sys::fs::identify_magic(Object.getBuffer());
- switch (Type) {
- case sys::fs::file_magic::bitcode:
- return Object;
- case sys::fs::file_magic::elf_relocatable:
- case sys::fs::file_magic::macho_object:
- case sys::fs::file_magic::coff_object: {
- Expected<std::unique_ptr<ObjectFile>> ObjFile =
- ObjectFile::createObjectFile(Object, Type);
- if (!ObjFile)
- return errorToErrorCode(ObjFile.takeError());
- return findBitcodeInObject(*ObjFile->get());
- }
- default:
- return object_error::invalid_file_type;
- }
-}
-
-// Parse module summary index in the given memory buffer.
-// Return new ModuleSummaryIndexObjectFile instance containing parsed
-// module summary/index.
-Expected<std::unique_ptr<ModuleSummaryIndexObjectFile>>
-ModuleSummaryIndexObjectFile::create(MemoryBufferRef Object) {
- ErrorOr<MemoryBufferRef> BCOrErr = findBitcodeInMemBuffer(Object);
- if (!BCOrErr)
- return errorCodeToError(BCOrErr.getError());
-
- Expected<std::unique_ptr<ModuleSummaryIndex>> IOrErr =
- getModuleSummaryIndex(BCOrErr.get());
-
- if (!IOrErr)
- return IOrErr.takeError();
-
- std::unique_ptr<ModuleSummaryIndex> Index = std::move(IOrErr.get());
- return llvm::make_unique<ModuleSummaryIndexObjectFile>(Object,
- std::move(Index));
-}
-
-// Parse the module summary index out of an IR file and return the summary
-// index object if found, or nullptr if not.
-Expected<std::unique_ptr<ModuleSummaryIndex>>
-llvm::getModuleSummaryIndexForFile(StringRef Path) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
- MemoryBuffer::getFileOrSTDIN(Path);
- std::error_code EC = FileOrErr.getError();
- if (EC)
- return errorCodeToError(EC);
- MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
- if (IgnoreEmptyThinLTOIndexFile && !BufferRef.getBufferSize())
- return nullptr;
- Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
- object::ModuleSummaryIndexObjectFile::create(BufferRef);
- if (!ObjOrErr)
- return ObjOrErr.takeError();
-
- object::ModuleSummaryIndexObjectFile &Obj = **ObjOrErr;
- return Obj.takeIndex();
-}
diff --git a/contrib/llvm/lib/Object/ModuleSymbolTable.cpp b/contrib/llvm/lib/Object/ModuleSymbolTable.cpp
index 9048800..f2e7a21 100644
--- a/contrib/llvm/lib/Object/ModuleSymbolTable.cpp
+++ b/contrib/llvm/lib/Object/ModuleSymbolTable.cpp
@@ -1,4 +1,4 @@
-//===- ModuleSymbolTable.cpp - symbol table for in-memory IR ----*- C++ -*-===//
+//===- ModuleSymbolTable.cpp - symbol table for in-memory IR --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,27 +13,45 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ModuleSymbolTable.h"
#include "RecordStreamer.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/IR/GVMaterializer.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Object/ObjectFile.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+
using namespace llvm;
using namespace object;
@@ -43,27 +61,98 @@ void ModuleSymbolTable::addModule(Module *M) {
else
FirstMod = M;
- for (Function &F : *M)
- SymTab.push_back(&F);
- for (GlobalVariable &GV : M->globals())
+ for (GlobalValue &GV : M->global_values())
SymTab.push_back(&GV);
- for (GlobalAlias &GA : M->aliases())
- SymTab.push_back(&GA);
-
- CollectAsmSymbols(Triple(M->getTargetTriple()), M->getModuleInlineAsm(),
- [this](StringRef Name, BasicSymbolRef::Flags Flags) {
- SymTab.push_back(new (AsmSymbols.Allocate())
- AsmSymbol(Name, Flags));
- });
+
+ CollectAsmSymbols(*M, [this](StringRef Name, BasicSymbolRef::Flags Flags) {
+ SymTab.push_back(new (AsmSymbols.Allocate()) AsmSymbol(Name, Flags));
+ });
+}
+
+// Ensure ELF .symver aliases get the same binding as the defined symbol
+// they alias with.
+static void handleSymverAliases(const Module &M, RecordStreamer &Streamer) {
+ if (Streamer.symverAliases().empty())
+ return;
+
+ // The name in the assembler will be mangled, but the name in the IR
+ // might not, so we first compute a mapping from mangled name to GV.
+ Mangler Mang;
+ SmallString<64> MangledName;
+ StringMap<const GlobalValue *> MangledNameMap;
+ auto GetMangledName = [&](const GlobalValue &GV) {
+ if (!GV.hasName())
+ return;
+
+ MangledName.clear();
+ MangledName.reserve(GV.getName().size() + 1);
+ Mang.getNameWithPrefix(MangledName, &GV, /*CannotUsePrivateLabel=*/false);
+ MangledNameMap[MangledName] = &GV;
+ };
+ for (const Function &F : M)
+ GetMangledName(F);
+ for (const GlobalVariable &GV : M.globals())
+ GetMangledName(GV);
+ for (const GlobalAlias &GA : M.aliases())
+ GetMangledName(GA);
+
+ // Walk all the recorded .symver aliases, and set up the binding
+ // for each alias.
+ for (auto &Symver : Streamer.symverAliases()) {
+ const MCSymbol *Aliasee = Symver.first;
+ MCSymbolAttr Attr = MCSA_Invalid;
+
+ // First check if the aliasee binding was recorded in the asm.
+ RecordStreamer::State state = Streamer.getSymbolState(Aliasee);
+ switch (state) {
+ case RecordStreamer::Global:
+ case RecordStreamer::DefinedGlobal:
+ Attr = MCSA_Global;
+ break;
+ case RecordStreamer::UndefinedWeak:
+ case RecordStreamer::DefinedWeak:
+ Attr = MCSA_Weak;
+ break;
+ default:
+ break;
+ }
+
+ // If we don't have a symbol attribute from assembly, then check if
+ // the aliasee was defined in the IR.
+ if (Attr == MCSA_Invalid) {
+ const auto *GV = M.getNamedValue(Aliasee->getName());
+ if (!GV) {
+ auto MI = MangledNameMap.find(Aliasee->getName());
+ if (MI != MangledNameMap.end())
+ GV = MI->second;
+ else
+ continue;
+ }
+ if (GV->hasExternalLinkage())
+ Attr = MCSA_Global;
+ else if (GV->hasLocalLinkage())
+ Attr = MCSA_Local;
+ else if (GV->isWeakForLinker())
+ Attr = MCSA_Weak;
+ }
+ if (Attr == MCSA_Invalid)
+ continue;
+
+ // Set the detected binding on each alias with this aliasee.
+ for (auto &Alias : Symver.second)
+ Streamer.EmitSymbolAttribute(Alias, Attr);
+ }
}
void ModuleSymbolTable::CollectAsmSymbols(
- const Triple &TT, StringRef InlineAsm,
+ const Module &M,
function_ref<void(StringRef, BasicSymbolRef::Flags)> AsmSymbol) {
+ StringRef InlineAsm = M.getModuleInlineAsm();
if (InlineAsm.empty())
return;
std::string Err;
+ const Triple TT(M.getTargetTriple());
const Target *T = TargetRegistry::lookupTarget(TT.str(), Err);
assert(T && T->hasMCAsmParser());
@@ -106,6 +195,8 @@ void ModuleSymbolTable::CollectAsmSymbols(
if (Parser->Run(false))
return;
+ handleSymverAliases(M, Streamer);
+
for (auto &KV : Streamer) {
StringRef Key = KV.first();
RecordStreamer::State Value = KV.second;
diff --git a/contrib/llvm/lib/Object/Object.cpp b/contrib/llvm/lib/Object/Object.cpp
index 6df481b..1d2859c 100644
--- a/contrib/llvm/lib/Object/Object.cpp
+++ b/contrib/llvm/lib/Object/Object.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/SmallVector.h"
#include "llvm-c/Object.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Object/ObjectFile.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Object/ObjectFile.cpp b/contrib/llvm/lib/Object/ObjectFile.cpp
index f36388b..8377dd0 100644
--- a/contrib/llvm/lib/Object/ObjectFile.cpp
+++ b/contrib/llvm/lib/Object/ObjectFile.cpp
@@ -1,4 +1,4 @@
-//===- ObjectFile.cpp - File format independent object file -----*- C++ -*-===//
+//===- ObjectFile.cpp - File format independent object file ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,19 +12,28 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/ObjectFile.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/Object/Binary.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Object/Error.h"
#include "llvm/Object/MachO.h"
#include "llvm/Object/Wasm.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
#include <system_error>
using namespace llvm;
using namespace object;
-void ObjectFile::anchor() { }
+void ObjectFile::anchor() {}
ObjectFile::ObjectFile(unsigned int Type, MemoryBufferRef Source)
: SymbolicFile(Type, Source) {}
@@ -71,42 +80,42 @@ section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const {
}
Expected<std::unique_ptr<ObjectFile>>
-ObjectFile::createObjectFile(MemoryBufferRef Object, sys::fs::file_magic Type) {
+ObjectFile::createObjectFile(MemoryBufferRef Object, file_magic Type) {
StringRef Data = Object.getBuffer();
- if (Type == sys::fs::file_magic::unknown)
- Type = sys::fs::identify_magic(Data);
+ if (Type == file_magic::unknown)
+ Type = identify_magic(Data);
switch (Type) {
- case sys::fs::file_magic::unknown:
- case sys::fs::file_magic::bitcode:
- case sys::fs::file_magic::coff_cl_gl_object:
- case sys::fs::file_magic::archive:
- case sys::fs::file_magic::macho_universal_binary:
- case sys::fs::file_magic::windows_resource:
+ case file_magic::unknown:
+ case file_magic::bitcode:
+ case file_magic::coff_cl_gl_object:
+ case file_magic::archive:
+ case file_magic::macho_universal_binary:
+ case file_magic::windows_resource:
return errorCodeToError(object_error::invalid_file_type);
- case sys::fs::file_magic::elf:
- case sys::fs::file_magic::elf_relocatable:
- case sys::fs::file_magic::elf_executable:
- case sys::fs::file_magic::elf_shared_object:
- case sys::fs::file_magic::elf_core:
+ case file_magic::elf:
+ case file_magic::elf_relocatable:
+ case file_magic::elf_executable:
+ case file_magic::elf_shared_object:
+ case file_magic::elf_core:
return errorOrToExpected(createELFObjectFile(Object));
- case sys::fs::file_magic::macho_object:
- case sys::fs::file_magic::macho_executable:
- case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
- case sys::fs::file_magic::macho_core:
- case sys::fs::file_magic::macho_preload_executable:
- case sys::fs::file_magic::macho_dynamically_linked_shared_lib:
- case sys::fs::file_magic::macho_dynamic_linker:
- case sys::fs::file_magic::macho_bundle:
- case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
- case sys::fs::file_magic::macho_dsym_companion:
- case sys::fs::file_magic::macho_kext_bundle:
+ case file_magic::macho_object:
+ case file_magic::macho_executable:
+ case file_magic::macho_fixed_virtual_memory_shared_lib:
+ case file_magic::macho_core:
+ case file_magic::macho_preload_executable:
+ case file_magic::macho_dynamically_linked_shared_lib:
+ case file_magic::macho_dynamic_linker:
+ case file_magic::macho_bundle:
+ case file_magic::macho_dynamically_linked_shared_lib_stub:
+ case file_magic::macho_dsym_companion:
+ case file_magic::macho_kext_bundle:
return createMachOObjectFile(Object);
- case sys::fs::file_magic::coff_object:
- case sys::fs::file_magic::coff_import_library:
- case sys::fs::file_magic::pecoff_executable:
+ case file_magic::coff_object:
+ case file_magic::coff_import_library:
+ case file_magic::pecoff_executable:
return errorOrToExpected(createCOFFObjectFile(Object));
- case sys::fs::file_magic::wasm_object:
+ case file_magic::wasm_object:
return createWasmObjectFile(Object);
}
llvm_unreachable("Unexpected Object File Type");
diff --git a/contrib/llvm/lib/Object/RecordStreamer.cpp b/contrib/llvm/lib/Object/RecordStreamer.cpp
index 572b960..e94e9cf 100644
--- a/contrib/llvm/lib/Object/RecordStreamer.cpp
+++ b/contrib/llvm/lib/Object/RecordStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- RecordStreamer.cpp - Record asm definde and used symbols ----------===//
+//===-- RecordStreamer.cpp - Record asm defined and used symbols ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -9,6 +9,7 @@
#include "RecordStreamer.h"
#include "llvm/MC/MCSymbol.h"
+
using namespace llvm;
void RecordStreamer::markDefined(const MCSymbol &Symbol) {
@@ -69,20 +70,20 @@ void RecordStreamer::markUsed(const MCSymbol &Symbol) {
void RecordStreamer::visitUsedSymbol(const MCSymbol &Sym) { markUsed(Sym); }
+RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {}
+
RecordStreamer::const_iterator RecordStreamer::begin() {
return Symbols.begin();
}
RecordStreamer::const_iterator RecordStreamer::end() { return Symbols.end(); }
-RecordStreamer::RecordStreamer(MCContext &Context) : MCStreamer(Context) {}
-
void RecordStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI, bool) {
MCStreamer::EmitInstruction(Inst, STI);
}
-void RecordStreamer::EmitLabel(MCSymbol *Symbol) {
+void RecordStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
MCStreamer::EmitLabel(Symbol);
markDefined(*Symbol);
}
@@ -110,3 +111,8 @@ void RecordStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
markDefined(*Symbol);
}
+
+void RecordStreamer::emitELFSymverDirective(MCSymbol *Alias,
+ const MCSymbol *Aliasee) {
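+ // Only record the alias here; its symbol binding is resolved later by
+ // handleSymverAliases() once the inline assembly has been fully parsed.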
+ SymverAliasMap[Aliasee].push_back(Alias);
+}
diff --git a/contrib/llvm/lib/Object/RecordStreamer.h b/contrib/llvm/lib/Object/RecordStreamer.h
index 617d8a4..4d11909 100644
--- a/contrib/llvm/lib/Object/RecordStreamer.h
+++ b/contrib/llvm/lib/Object/RecordStreamer.h
@@ -1,4 +1,4 @@
-//===-- RecordStreamer.h - Record asm defined and used symbols ---*- C++ -*===//
+//===- RecordStreamer.h - Record asm defined and used symbols ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,9 +10,16 @@
#ifndef LLVM_LIB_OBJECT_RECORDSTREAMER_H
#define LLVM_LIB_OBJECT_RECORDSTREAMER_H
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/SMLoc.h"
+#include <vector>
namespace llvm {
+
class RecordStreamer : public MCStreamer {
public:
enum State { NeverSeen, Global, Defined, DefinedGlobal, DefinedWeak, Used,
@@ -20,24 +27,49 @@ public:
private:
StringMap<State> Symbols;
+ // Map of aliases created by .symver directives, saved so we can update
+ // their symbol binding after parsing is complete. This maps from each
+ // aliasee to its list of aliases.
+ DenseMap<const MCSymbol *, std::vector<MCSymbol *>> SymverAliasMap;
+
void markDefined(const MCSymbol &Symbol);
void markGlobal(const MCSymbol &Symbol, MCSymbolAttr Attribute);
void markUsed(const MCSymbol &Symbol);
void visitUsedSymbol(const MCSymbol &Sym) override;
public:
- typedef StringMap<State>::const_iterator const_iterator;
+ RecordStreamer(MCContext &Context);
+
+ using const_iterator = StringMap<State>::const_iterator;
+
const_iterator begin();
const_iterator end();
- RecordStreamer(MCContext &Context);
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
void EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
+ /// Record .symver aliases for later processing.
+ void emitELFSymverDirective(MCSymbol *Alias,
+ const MCSymbol *Aliasee) override;
+ /// Return the map of .symver aliasee to associated aliases.
+ DenseMap<const MCSymbol *, std::vector<MCSymbol *>> &symverAliases() {
+ return SymverAliasMap;
+ }
+
+ /// Get the state recorded for the given symbol.
+ State getSymbolState(const MCSymbol *Sym) {
+ auto SI = Symbols.find(Sym->getName());
+ if (SI == Symbols.end())
+ return NeverSeen;
+ return SI->second;
+ }
};
-}
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_OBJECT_RECORDSTREAMER_H
diff --git a/contrib/llvm/lib/Object/SymbolicFile.cpp b/contrib/llvm/lib/Object/SymbolicFile.cpp
index 4b51a49..1042d29 100644
--- a/contrib/llvm/lib/Object/SymbolicFile.cpp
+++ b/contrib/llvm/lib/Object/SymbolicFile.cpp
@@ -1,4 +1,4 @@
-//===- SymbolicFile.cpp - Interface that only provides symbols --*- C++ -*-===//
+//===- SymbolicFile.cpp - Interface that only provides symbols ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,21 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Object/COFF.h"
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Object/COFFImportFile.h"
+#include "llvm/Object/Error.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
+#include <algorithm>
+#include <memory>
using namespace llvm;
using namespace object;
@@ -24,47 +33,48 @@ using namespace object;
SymbolicFile::SymbolicFile(unsigned int Type, MemoryBufferRef Source)
: Binary(Type, Source) {}
-SymbolicFile::~SymbolicFile() {}
+SymbolicFile::~SymbolicFile() = default;
-Expected<std::unique_ptr<SymbolicFile>> SymbolicFile::createSymbolicFile(
- MemoryBufferRef Object, sys::fs::file_magic Type, LLVMContext *Context) {
+Expected<std::unique_ptr<SymbolicFile>>
+SymbolicFile::createSymbolicFile(MemoryBufferRef Object, file_magic Type,
+ LLVMContext *Context) {
StringRef Data = Object.getBuffer();
- if (Type == sys::fs::file_magic::unknown)
- Type = sys::fs::identify_magic(Data);
+ if (Type == file_magic::unknown)
+ Type = identify_magic(Data);
switch (Type) {
- case sys::fs::file_magic::bitcode:
+ case file_magic::bitcode:
if (Context)
return IRObjectFile::create(Object, *Context);
LLVM_FALLTHROUGH;
- case sys::fs::file_magic::unknown:
- case sys::fs::file_magic::archive:
- case sys::fs::file_magic::coff_cl_gl_object:
- case sys::fs::file_magic::macho_universal_binary:
- case sys::fs::file_magic::windows_resource:
+ case file_magic::unknown:
+ case file_magic::archive:
+ case file_magic::coff_cl_gl_object:
+ case file_magic::macho_universal_binary:
+ case file_magic::windows_resource:
return errorCodeToError(object_error::invalid_file_type);
- case sys::fs::file_magic::elf:
- case sys::fs::file_magic::elf_executable:
- case sys::fs::file_magic::elf_shared_object:
- case sys::fs::file_magic::elf_core:
- case sys::fs::file_magic::macho_executable:
- case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
- case sys::fs::file_magic::macho_core:
- case sys::fs::file_magic::macho_preload_executable:
- case sys::fs::file_magic::macho_dynamically_linked_shared_lib:
- case sys::fs::file_magic::macho_dynamic_linker:
- case sys::fs::file_magic::macho_bundle:
- case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
- case sys::fs::file_magic::macho_dsym_companion:
- case sys::fs::file_magic::macho_kext_bundle:
- case sys::fs::file_magic::pecoff_executable:
- case sys::fs::file_magic::wasm_object:
+ case file_magic::elf:
+ case file_magic::elf_executable:
+ case file_magic::elf_shared_object:
+ case file_magic::elf_core:
+ case file_magic::macho_executable:
+ case file_magic::macho_fixed_virtual_memory_shared_lib:
+ case file_magic::macho_core:
+ case file_magic::macho_preload_executable:
+ case file_magic::macho_dynamically_linked_shared_lib:
+ case file_magic::macho_dynamic_linker:
+ case file_magic::macho_bundle:
+ case file_magic::macho_dynamically_linked_shared_lib_stub:
+ case file_magic::macho_dsym_companion:
+ case file_magic::macho_kext_bundle:
+ case file_magic::pecoff_executable:
+ case file_magic::wasm_object:
return ObjectFile::createObjectFile(Object, Type);
- case sys::fs::file_magic::coff_import_library:
+ case file_magic::coff_import_library:
return std::unique_ptr<SymbolicFile>(new COFFImportFile(Object));
- case sys::fs::file_magic::elf_relocatable:
- case sys::fs::file_magic::macho_object:
- case sys::fs::file_magic::coff_object: {
+ case file_magic::elf_relocatable:
+ case file_magic::macho_object:
+ case file_magic::coff_object: {
Expected<std::unique_ptr<ObjectFile>> Obj =
ObjectFile::createObjectFile(Object, Type);
if (!Obj || !Context)
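A minimal usage sketch of the factory under its new BinaryFormat-based signature (the file name is illustrative; headers and error handling are abbreviated):

// Hypothetical driver code; MemoryBuffer, LLVMContext, and Error headers
// are assumed to be included.
ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
    MemoryBuffer::getFile("input.o");
if (std::error_code EC = BufOrErr.getError())
  report_fatal_error(EC.message());
LLVMContext Context; // lets bitcode inputs resolve to an IRObjectFile
Expected<std::unique_ptr<SymbolicFile>> FileOrErr =
    SymbolicFile::createSymbolicFile(BufOrErr.get()->getMemBufferRef(),
                                     file_magic::unknown, &Context);
if (!FileOrErr)
  report_fatal_error(toString(FileOrErr.takeError()));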
diff --git a/contrib/llvm/lib/Object/WasmObjectFile.cpp b/contrib/llvm/lib/Object/WasmObjectFile.cpp
index 2b61a8a0..7f80bf0 100644
--- a/contrib/llvm/lib/Object/WasmObjectFile.cpp
+++ b/contrib/llvm/lib/Object/WasmObjectFile.cpp
@@ -1,4 +1,4 @@
-//===- WasmObjectFile.cpp - Wasm object file implementation -----*- C++ -*-===//
+//===- WasmObjectFile.cpp - Wasm object file implementation ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,12 +7,31 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/SymbolicFile.h"
#include "llvm/Object/Wasm.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <system_error>
-namespace llvm {
-namespace object {
+#define DEBUG_TYPE "wasm-object"
+
+using namespace llvm;
+using namespace object;
Expected<std::unique_ptr<WasmObjectFile>>
ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
@@ -24,34 +43,146 @@ ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
return std::move(ObjectFile);
}
-namespace {
+#define VARINT7_MAX ((1<<7)-1)
+#define VARINT7_MIN (-(1<<7))
+#define VARUINT7_MAX (1<<7)
+#define VARUINT1_MAX (1)
+
+static uint8_t readUint8(const uint8_t *&Ptr) { return *Ptr++; }
-uint32_t readUint32(const uint8_t *&Ptr) {
+static uint32_t readUint32(const uint8_t *&Ptr) {
uint32_t Result = support::endian::read32le(Ptr);
Ptr += sizeof(Result);
return Result;
}
-uint64_t readULEB128(const uint8_t *&Ptr) {
+static int32_t readFloat32(const uint8_t *&Ptr) {
+ int32_t Result = 0;
+ memcpy(&Result, Ptr, sizeof(Result));
+ Ptr += sizeof(Result);
+ return Result;
+}
+
+static int64_t readFloat64(const uint8_t *&Ptr) {
+ int64_t Result = 0;
+ memcpy(&Result, Ptr, sizeof(Result));
+ Ptr += sizeof(Result);
+ return Result;
+}
+
+static uint64_t readULEB128(const uint8_t *&Ptr) {
unsigned Count;
uint64_t Result = decodeULEB128(Ptr, &Count);
Ptr += Count;
return Result;
}
-StringRef readString(const uint8_t *&Ptr) {
+static StringRef readString(const uint8_t *&Ptr) {
uint32_t StringLen = readULEB128(Ptr);
StringRef Return = StringRef(reinterpret_cast<const char *>(Ptr), StringLen);
Ptr += StringLen;
return Return;
}
-Error readSection(wasm::WasmSection &Section, const uint8_t *&Ptr,
- const uint8_t *Start) {
+static int64_t readLEB128(const uint8_t *&Ptr) {
+ unsigned Count;
+ uint64_t Result = decodeSLEB128(Ptr, &Count);
+ Ptr += Count;
+ return Result;
+}
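For readers unfamiliar with LEB128, a standalone sketch of the unsigned scheme these readers rely on (LLVM's decodeULEB128 does the same with extra bounds checking); the bytes {0xE5, 0x8E, 0x26}, for instance, decode to 624485:

static uint64_t decodeULEB128Sketch(const uint8_t *Ptr, unsigned *Count) {
  const uint8_t *Start = Ptr;
  uint64_t Value = 0;
  unsigned Shift = 0;
  do {
    Value |= uint64_t(*Ptr & 0x7f) << Shift; // low 7 bits carry payload
    Shift += 7;
  } while (*Ptr++ & 0x80);                   // high bit set => more bytes
  if (Count)
    *Count = Ptr - Start;
  return Value;
}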
+
+static uint8_t readVaruint1(const uint8_t *&Ptr) {
+ int64_t result = readLEB128(Ptr);
+ assert(result <= VARUINT1_MAX && result >= 0);
+ return result;
+}
+
+static int8_t readVarint7(const uint8_t *&Ptr) {
+ int64_t result = readLEB128(Ptr);
+ assert(result <= VARINT7_MAX && result >= VARINT7_MIN);
+ return result;
+}
+
+static uint8_t readVaruint7(const uint8_t *&Ptr) {
+ uint64_t result = readULEB128(Ptr);
+ assert(result <= VARUINT7_MAX);
+ return result;
+}
+
+static int32_t readVarint32(const uint8_t *&Ptr) {
+ int64_t result = readLEB128(Ptr);
+ assert(result <= INT32_MAX && result >= INT32_MIN);
+ return result;
+}
+
+static uint32_t readVaruint32(const uint8_t *&Ptr) {
+ uint64_t result = readULEB128(Ptr);
+ assert(result <= UINT32_MAX);
+ return result;
+}
+
+static int64_t readVarint64(const uint8_t *&Ptr) {
+ return readLEB128(Ptr);
+}
+
+static uint8_t readOpcode(const uint8_t *&Ptr) {
+ return readUint8(Ptr);
+}
+
+static Error readInitExpr(wasm::WasmInitExpr &Expr, const uint8_t *&Ptr) {
+ Expr.Opcode = readOpcode(Ptr);
+
+ switch (Expr.Opcode) {
+ case wasm::WASM_OPCODE_I32_CONST:
+ Expr.Value.Int32 = readVarint32(Ptr);
+ break;
+ case wasm::WASM_OPCODE_I64_CONST:
+ Expr.Value.Int64 = readVarint64(Ptr);
+ break;
+ case wasm::WASM_OPCODE_F32_CONST:
+ Expr.Value.Float32 = readFloat32(Ptr);
+ break;
+ case wasm::WASM_OPCODE_F64_CONST:
+ Expr.Value.Float64 = readFloat64(Ptr);
+ break;
+ case wasm::WASM_OPCODE_GET_GLOBAL:
+ Expr.Value.Global = readULEB128(Ptr);
+ break;
+ default:
+ return make_error<GenericBinaryError>("Invalid opcode in init_expr",
+ object_error::parse_failed);
+ }
+
+ uint8_t EndOpcode = readOpcode(Ptr);
+ if (EndOpcode != wasm::WASM_OPCODE_END) {
+ return make_error<GenericBinaryError>("Invalid init_expr",
+ object_error::parse_failed);
+ }
+ return Error::success();
+}
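As a concrete, hand-assembled instance of what readInitExpr consumes (illustrative bytes, not taken from the diff):

// i32.const 42: 0x41 is WASM_OPCODE_I32_CONST, 0x2A is the SLEB128
// encoding of 42, and 0x0B is the mandatory WASM_OPCODE_END terminator.
static const uint8_t InitExprI32Const42[] = {0x41, 0x2A, 0x0B};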
+
+static wasm::WasmLimits readLimits(const uint8_t *&Ptr) {
+ wasm::WasmLimits Result;
+ Result.Flags = readVaruint1(Ptr);
+ Result.Initial = readVaruint32(Ptr);
+ if (Result.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
+ Result.Maximum = readVaruint32(Ptr);
+ return Result;
+}
+
+static wasm::WasmTable readTable(const uint8_t *&Ptr) {
+ wasm::WasmTable Table;
+ Table.ElemType = readVarint7(Ptr);
+ Table.Limits = readLimits(Ptr);
+ return Table;
+}
+
+static Error readSection(WasmSection &Section, const uint8_t *&Ptr,
+ const uint8_t *Start) {
// TODO(sbc): Avoid reading past EOF in the case of malformed files.
Section.Offset = Ptr - Start;
- Section.Type = readULEB128(Ptr);
- uint32_t Size = readULEB128(Ptr);
+ Section.Type = readVaruint7(Ptr);
+ uint32_t Size = readVaruint32(Ptr);
if (Size == 0)
return make_error<StringError>("Zero length section",
object_error::parse_failed);
@@ -59,10 +190,12 @@ Error readSection(wasm::WasmSection &Section, const uint8_t *&Ptr,
Ptr += Size;
return Error::success();
}
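By way of a worked example, the header bytes readSection would decode for a five-byte type section are shown below; values above 127 would simply span additional LEB128 bytes:

// Section id 1 (WASM_SEC_TYPE), content length 5; the 5 content bytes follow.
static const uint8_t TypeSectionHeader[] = {0x01, 0x05};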
-}
WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
: ObjectFile(Binary::ID_Wasm, Buffer) {
+ LinkingData.DataAlignment = 0;
+ LinkingData.DataSize = 0;
+
ErrorAsOutParameter ErrAsOutParam(&Err);
Header.Magic = getData().substr(0, 4);
if (Header.Magic != StringRef("\0asm", 4)) {
@@ -79,21 +212,486 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
}
const uint8_t *Eof = getPtr(getData().size());
- wasm::WasmSection Sec;
+ WasmSection Sec;
while (Ptr < Eof) {
if ((Err = readSection(Sec, Ptr, getPtr(0))))
return;
- if (Sec.Type == wasm::WASM_SEC_USER) {
- if ((Err = parseUserSection(Sec, Sec.Content.data(), Sec.Content.size())))
- return;
- }
+ if ((Err = parseSection(Sec)))
+ return;
+
Sections.push_back(Sec);
}
}
-Error WasmObjectFile::parseUserSection(wasm::WasmSection &Sec,
- const uint8_t *Ptr, size_t Length) {
+Error WasmObjectFile::parseSection(WasmSection &Sec) {
+ const uint8_t* Start = Sec.Content.data();
+ const uint8_t* End = Start + Sec.Content.size();
+ switch (Sec.Type) {
+ case wasm::WASM_SEC_CUSTOM:
+ return parseCustomSection(Sec, Start, End);
+ case wasm::WASM_SEC_TYPE:
+ return parseTypeSection(Start, End);
+ case wasm::WASM_SEC_IMPORT:
+ return parseImportSection(Start, End);
+ case wasm::WASM_SEC_FUNCTION:
+ return parseFunctionSection(Start, End);
+ case wasm::WASM_SEC_TABLE:
+ return parseTableSection(Start, End);
+ case wasm::WASM_SEC_MEMORY:
+ return parseMemorySection(Start, End);
+ case wasm::WASM_SEC_GLOBAL:
+ return parseGlobalSection(Start, End);
+ case wasm::WASM_SEC_EXPORT:
+ return parseExportSection(Start, End);
+ case wasm::WASM_SEC_START:
+ return parseStartSection(Start, End);
+ case wasm::WASM_SEC_ELEM:
+ return parseElemSection(Start, End);
+ case wasm::WASM_SEC_CODE:
+ return parseCodeSection(Start, End);
+ case wasm::WASM_SEC_DATA:
+ return parseDataSection(Start, End);
+ default:
+ return make_error<GenericBinaryError>("Bad section type",
+ object_error::parse_failed);
+ }
+}
+
+Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) {
+ while (Ptr < End) {
+ uint8_t Type = readVarint7(Ptr);
+ uint32_t Size = readVaruint32(Ptr);
+ const uint8_t *SubSectionEnd = Ptr + Size;
+ switch (Type) {
+ case wasm::WASM_NAMES_FUNCTION: {
+ uint32_t Count = readVaruint32(Ptr);
+ while (Count--) {
+ uint32_t Index = readVaruint32(Ptr);
+ StringRef Name = readString(Ptr);
+ if (!Name.empty())
+ Symbols.emplace_back(Name,
+ WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME,
+ Sections.size(), Index);
+ }
+ break;
+ }
+ // Ignore local names for now
+ case wasm::WASM_NAMES_LOCAL:
+ default:
+ Ptr += Size;
+ break;
+ }
+ if (Ptr != SubSectionEnd)
+ return make_error<GenericBinaryError>("Name sub-section ended prematurely",
+ object_error::parse_failed);
+ }
+
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Name section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
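A worked, hypothetical payload that parseNameSection accepts, naming function 0 "hi":

// Subsection type 1 (WASM_NAMES_FUNCTION), subsection size 5, then
// count 1, function index 0, and the length-prefixed name "hi".
static const uint8_t NamePayload[] = {0x01, 0x05, 0x01, 0x00, 0x02, 'h', 'i'};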
+
+Error WasmObjectFile::parseLinkingSection(const uint8_t *Ptr,
+ const uint8_t *End) {
+ HasLinkingSection = true;
+ while (Ptr < End) {
+ uint8_t Type = readVarint7(Ptr);
+ uint32_t Size = readVaruint32(Ptr);
+ const uint8_t *SubSectionEnd = Ptr + Size;
+ switch (Type) {
+ case wasm::WASM_SYMBOL_INFO: {
+ uint32_t Count = readVaruint32(Ptr);
+ while (Count--) {
+ StringRef Symbol = readString(Ptr);
+ DEBUG(dbgs() << "reading syminfo: " << Symbol << "\n");
+ uint32_t Flags = readVaruint32(Ptr);
+ auto iter = SymbolMap.find(Symbol);
+ if (iter == SymbolMap.end()) {
+ return make_error<GenericBinaryError>(
+ "Invalid symbol name in linking section: " + Symbol,
+ object_error::parse_failed);
+ }
+ uint32_t SymIndex = iter->second;
+ assert(SymIndex < Symbols.size());
+ Symbols[SymIndex].Flags = Flags;
+ DEBUG(dbgs() << "Set symbol flags index:"
+ << SymIndex << " name:"
+ << Symbols[SymIndex].Name << " expected:"
+ << Symbol << " flags: " << Flags << "\n");
+ }
+ break;
+ }
+ case wasm::WASM_DATA_SIZE:
+ LinkingData.DataSize = readVaruint32(Ptr);
+ break;
+ case wasm::WASM_DATA_ALIGNMENT:
+ LinkingData.DataAlignment = readVaruint32(Ptr);
+ break;
+ case wasm::WASM_STACK_POINTER:
+ default:
+ Ptr += Size;
+ break;
+ }
+ if (Ptr != SubSectionEnd)
+ return make_error<GenericBinaryError>(
+ "Linking sub-section ended prematurely", object_error::parse_failed);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Linking section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+WasmSection* WasmObjectFile::findCustomSectionByName(StringRef Name) {
+ for (WasmSection& Section : Sections) {
+ if (Section.Type == wasm::WASM_SEC_CUSTOM && Section.Name == Name)
+ return &Section;
+ }
+ return nullptr;
+}
+
+WasmSection* WasmObjectFile::findSectionByType(uint32_t Type) {
+ assert(Type != wasm::WASM_SEC_CUSTOM);
+ for (WasmSection& Section : Sections) {
+ if (Section.Type == Type)
+ return &Section;
+ }
+ return nullptr;
+}
+
+Error WasmObjectFile::parseRelocSection(StringRef Name, const uint8_t *Ptr,
+ const uint8_t *End) {
+ uint8_t SectionCode = readVarint7(Ptr);
+ WasmSection* Section = nullptr;
+ if (SectionCode == wasm::WASM_SEC_CUSTOM) {
+ StringRef Name = readString(Ptr);
+ Section = findCustomSectionByName(Name);
+ } else {
+ Section = findSectionByType(SectionCode);
+ }
+ if (!Section)
+ return make_error<GenericBinaryError>("Invalid section code",
+ object_error::parse_failed);
+ uint32_t RelocCount = readVaruint32(Ptr);
+ while (RelocCount--) {
+ wasm::WasmRelocation Reloc;
+ memset(&Reloc, 0, sizeof(Reloc));
+ Reloc.Type = readVaruint32(Ptr);
+ Reloc.Offset = readVaruint32(Ptr);
+ Reloc.Index = readVaruint32(Ptr);
+ switch (Reloc.Type) {
+ case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+ case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+ case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB:
+ break;
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+ case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+ Reloc.Addend = readVarint32(Ptr);
+ break;
+ default:
+ return make_error<GenericBinaryError>("Bad relocation type: " +
+ Twine(Reloc.Type),
+ object_error::parse_failed);
+ }
+ Section->Relocations.push_back(Reloc);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Reloc section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseCustomSection(WasmSection &Sec,
+ const uint8_t *Ptr, const uint8_t *End) {
Sec.Name = readString(Ptr);
+ if (Sec.Name == "name") {
+ if (Error Err = parseNameSection(Ptr, End))
+ return Err;
+ } else if (Sec.Name == "linking") {
+ if (Error Err = parseLinkingSection(Ptr, End))
+ return Err;
+ } else if (Sec.Name.startswith("reloc.")) {
+ if (Error Err = parseRelocSection(Sec.Name, Ptr, End))
+ return Err;
+ }
+ return Error::success();
+}
+
+Error WasmObjectFile::parseTypeSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Signatures.reserve(Count);
+ while (Count--) {
+ wasm::WasmSignature Sig;
+ Sig.ReturnType = wasm::WASM_TYPE_NORESULT;
+ int8_t Form = readVarint7(Ptr);
+ if (Form != wasm::WASM_TYPE_FUNC) {
+ return make_error<GenericBinaryError>("Invalid signature type",
+ object_error::parse_failed);
+ }
+ uint32_t ParamCount = readVaruint32(Ptr);
+ Sig.ParamTypes.reserve(ParamCount);
+ while (ParamCount--) {
+ uint32_t ParamType = readVarint7(Ptr);
+ Sig.ParamTypes.push_back(ParamType);
+ }
+ uint32_t ReturnCount = readVaruint32(Ptr);
+ if (ReturnCount) {
+ if (ReturnCount != 1) {
+ return make_error<GenericBinaryError>(
+ "Multiple return types not supported", object_error::parse_failed);
+ }
+ Sig.ReturnType = readVarint7(Ptr);
+ }
+ Signatures.push_back(Sig);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Type section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Imports.reserve(Count);
+ for (uint32_t i = 0; i < Count; i++) {
+ wasm::WasmImport Im;
+ Im.Module = readString(Ptr);
+ Im.Field = readString(Ptr);
+ Im.Kind = readUint8(Ptr);
+ switch (Im.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ Im.SigIndex = readVaruint32(Ptr);
+ SymbolMap.try_emplace(Im.Field, Symbols.size());
+ Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::FUNCTION_IMPORT,
+ Sections.size(), i);
+ DEBUG(dbgs() << "Adding import: " << Symbols.back()
+ << " sym index:" << Symbols.size() << "\n");
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ Im.Global.Type = readVarint7(Ptr);
+ Im.Global.Mutable = readVaruint1(Ptr);
+ SymbolMap.try_emplace(Im.Field, Symbols.size());
+ Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT,
+ Sections.size(), i);
+ DEBUG(dbgs() << "Adding import: " << Symbols.back()
+ << " sym index:" << Symbols.size() << "\n");
+ break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ Im.Memory = readLimits(Ptr);
+ break;
+ case wasm::WASM_EXTERNAL_TABLE:
+ Im.Table = readTable(Ptr);
+ if (Im.Table.ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ return make_error<GenericBinaryError>("Invalid table element type",
+ object_error::parse_failed);
+ }
+ break;
+ default:
+ return make_error<GenericBinaryError>(
+ "Unexpected import kind", object_error::parse_failed);
+ }
+ Imports.push_back(Im);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Import section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseFunctionSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ FunctionTypes.reserve(Count);
+ while (Count--) {
+ FunctionTypes.push_back(readVaruint32(Ptr));
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Function section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Tables.reserve(Count);
+ while (Count--) {
+ Tables.push_back(readTable(Ptr));
+ if (Tables.back().ElemType != wasm::WASM_TYPE_ANYFUNC) {
+ return make_error<GenericBinaryError>("Invalid table element type",
+ object_error::parse_failed);
+ }
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Table section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseMemorySection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Memories.reserve(Count);
+ while (Count--) {
+ Memories.push_back(readLimits(Ptr));
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Memory section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseGlobalSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Globals.reserve(Count);
+ while (Count--) {
+ wasm::WasmGlobal Global;
+ Global.Type = readVarint7(Ptr);
+ Global.Mutable = readVaruint1(Ptr);
+ if (Error Err = readInitExpr(Global.InitExpr, Ptr))
+ return Err;
+ Globals.push_back(Global);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Global section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ Exports.reserve(Count);
+ for (uint32_t i = 0; i < Count; i++) {
+ wasm::WasmExport Ex;
+ Ex.Name = readString(Ptr);
+ Ex.Kind = readUint8(Ptr);
+ Ex.Index = readVaruint32(Ptr);
+ WasmSymbol::SymbolType ExportType;
+ bool MakeSymbol = false;
+ switch (Ex.Kind) {
+ case wasm::WASM_EXTERNAL_FUNCTION:
+ ExportType = WasmSymbol::SymbolType::FUNCTION_EXPORT;
+ MakeSymbol = true;
+ break;
+ case wasm::WASM_EXTERNAL_GLOBAL:
+ ExportType = WasmSymbol::SymbolType::GLOBAL_EXPORT;
+ MakeSymbol = true;
+ break;
+ case wasm::WASM_EXTERNAL_MEMORY:
+ case wasm::WASM_EXTERNAL_TABLE:
+ break;
+ default:
+ return make_error<GenericBinaryError>(
+ "Unexpected export kind", object_error::parse_failed);
+ }
+ if (MakeSymbol) {
+ auto Pair = SymbolMap.try_emplace(Ex.Name, Symbols.size());
+ if (Pair.second) {
+ Symbols.emplace_back(Ex.Name, ExportType,
+ Sections.size(), i);
+ DEBUG(dbgs() << "Adding export: " << Symbols.back()
+ << " sym index:" << Symbols.size() << "\n");
+ } else {
+ uint32_t SymIndex = Pair.first->second;
+ Symbols[SymIndex] = WasmSymbol(Ex.Name, ExportType, Sections.size(), i);
+ DEBUG(dbgs() << "Replacing existing symbol: " << Symbols[SymIndex]
+ << " sym index:" << SymIndex << "\n");
+ }
+ }
+ Exports.push_back(Ex);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Export section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) {
+ StartFunction = readVaruint32(Ptr);
+ if (StartFunction >= FunctionTypes.size())
+ return make_error<GenericBinaryError>("Invalid start function",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseCodeSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t FunctionCount = readVaruint32(Ptr);
+ if (FunctionCount != FunctionTypes.size()) {
+ return make_error<GenericBinaryError>("Invalid function count",
+ object_error::parse_failed);
+ }
+
+ CodeSection = ArrayRef<uint8_t>(Ptr, End - Ptr);
+
+ while (FunctionCount--) {
+ wasm::WasmFunction Function;
+ uint32_t FunctionSize = readVaruint32(Ptr);
+ const uint8_t *FunctionEnd = Ptr + FunctionSize;
+
+ uint32_t NumLocalDecls = readVaruint32(Ptr);
+ Function.Locals.reserve(NumLocalDecls);
+ while (NumLocalDecls--) {
+ wasm::WasmLocalDecl Decl;
+ Decl.Count = readVaruint32(Ptr);
+ Decl.Type = readVarint7(Ptr);
+ Function.Locals.push_back(Decl);
+ }
+
+ uint32_t BodySize = FunctionEnd - Ptr;
+ Function.Body = ArrayRef<uint8_t>(Ptr, BodySize);
+ Ptr += BodySize;
+ assert(Ptr == FunctionEnd);
+ Functions.push_back(Function);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Code section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
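A minimal, hand-built code-section payload that satisfies parseCodeSection, assuming FunctionTypes holds exactly one entry:

// Function count 1; function 0 occupies 2 bytes: zero local
// declarations followed by the end opcode.
static const uint8_t CodePayload[] = {0x01, 0x02, 0x00, 0x0B};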
+
+Error WasmObjectFile::parseElemSection(const uint8_t *Ptr, const uint8_t *End) {
+ uint32_t Count = readVaruint32(Ptr);
+ ElemSegments.reserve(Count);
+ while (Count--) {
+ wasm::WasmElemSegment Segment;
+ Segment.TableIndex = readVaruint32(Ptr);
+ if (Segment.TableIndex != 0) {
+ return make_error<GenericBinaryError>("Invalid TableIndex",
+ object_error::parse_failed);
+ }
+ if (Error Err = readInitExpr(Segment.Offset, Ptr))
+ return Err;
+ uint32_t NumElems = readVaruint32(Ptr);
+ while (NumElems--) {
+ Segment.Functions.push_back(readVaruint32(Ptr));
+ }
+ ElemSegments.push_back(Segment);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Elem section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
+Error WasmObjectFile::parseDataSection(const uint8_t *Ptr, const uint8_t *End) {
+ const uint8_t *Start = Ptr;
+ uint32_t Count = readVaruint32(Ptr);
+ DataSegments.reserve(Count);
+ while (Count--) {
+ WasmSegment Segment;
+ Segment.Data.MemoryIndex = readVaruint32(Ptr);
+ if (Error Err = readInitExpr(Segment.Data.Offset, Ptr))
+ return Err;
+ uint32_t Size = readVaruint32(Ptr);
+ Segment.Data.Content = ArrayRef<uint8_t>(Ptr, Size);
+ Segment.SectionOffset = Ptr - Start;
+ Ptr += Size;
+ DataSegments.push_back(Segment);
+ }
+ if (Ptr != End)
+ return make_error<GenericBinaryError>("Data section ended prematurely",
+ object_error::parse_failed);
return Error::success();
}
@@ -105,42 +703,79 @@ const wasm::WasmObjectHeader &WasmObjectFile::getHeader() const {
return Header;
}
-void WasmObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
- llvm_unreachable("not yet implemented");
-}
-
-std::error_code WasmObjectFile::printSymbolName(raw_ostream &OS,
- DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return object_error::invalid_symbol_index;
-}
+void WasmObjectFile::moveSymbolNext(DataRefImpl &Symb) const { Symb.d.a++; }
uint32_t WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return 0;
+ uint32_t Result = SymbolRef::SF_None;
+ const WasmSymbol &Sym = getWasmSymbol(Symb);
+
+ DEBUG(dbgs() << "getSymbolFlags: ptr=" << &Sym << " " << Sym << "\n");
+ if (Sym.Flags & wasm::WASM_SYMBOL_FLAG_WEAK)
+ Result |= SymbolRef::SF_Weak;
+
+ switch (Sym.Type) {
+ case WasmSymbol::SymbolType::FUNCTION_IMPORT:
+ Result |= SymbolRef::SF_Undefined | SymbolRef::SF_Executable;
+ break;
+ case WasmSymbol::SymbolType::FUNCTION_EXPORT:
+ Result |= SymbolRef::SF_Global | SymbolRef::SF_Executable;
+ break;
+ case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
+ Result |= SymbolRef::SF_Executable;
+ Result |= SymbolRef::SF_FormatSpecific;
+ break;
+ case WasmSymbol::SymbolType::GLOBAL_IMPORT:
+ Result |= SymbolRef::SF_Undefined;
+ break;
+ case WasmSymbol::SymbolType::GLOBAL_EXPORT:
+ Result |= SymbolRef::SF_Global;
+ break;
+ }
+
+ return Result;
}
basic_symbol_iterator WasmObjectFile::symbol_begin() const {
- return BasicSymbolRef(DataRefImpl(), this);
+ DataRefImpl Ref;
+ Ref.d.a = 0;
+ return BasicSymbolRef(Ref, this);
}
basic_symbol_iterator WasmObjectFile::symbol_end() const {
- return BasicSymbolRef(DataRefImpl(), this);
+ DataRefImpl Ref;
+ Ref.d.a = Symbols.size();
+ return BasicSymbolRef(Ref, this);
+}
+
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const DataRefImpl &Symb) const {
+ return Symbols[Symb.d.a];
+}
+
+const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const {
+ return getWasmSymbol(Symb.getRawDataRefImpl());
}
Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return errorCodeToError(object_error::invalid_symbol_index);
+ return getWasmSymbol(Symb).Name;
}
Expected<uint64_t> WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return errorCodeToError(object_error::invalid_symbol_index);
+ return getSymbolValue(Symb);
}
uint64_t WasmObjectFile::getSymbolValueImpl(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return 0;
+ const WasmSymbol& Sym = getWasmSymbol(Symb);
+ switch (Sym.Type) {
+ case WasmSymbol::SymbolType::FUNCTION_IMPORT:
+ case WasmSymbol::SymbolType::GLOBAL_IMPORT:
+ return 0;
+ case WasmSymbol::SymbolType::FUNCTION_EXPORT:
+ case WasmSymbol::SymbolType::GLOBAL_EXPORT:
+ return Exports[Sym.ElementIndex].Index;
+ case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
+ return Sym.ElementIndex;
+ }
+ llvm_unreachable("invalid symbol type");
}
uint32_t WasmObjectFile::getSymbolAlignment(DataRefImpl Symb) const {
@@ -155,21 +790,34 @@ uint64_t WasmObjectFile::getCommonSymbolSizeImpl(DataRefImpl Symb) const {
Expected<SymbolRef::Type>
WasmObjectFile::getSymbolType(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return errorCodeToError(object_error::invalid_symbol_index);
+ const WasmSymbol &Sym = getWasmSymbol(Symb);
+
+ switch (Sym.Type) {
+ case WasmSymbol::SymbolType::FUNCTION_IMPORT:
+ case WasmSymbol::SymbolType::FUNCTION_EXPORT:
+ case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
+ return SymbolRef::ST_Function;
+ case WasmSymbol::SymbolType::GLOBAL_IMPORT:
+ case WasmSymbol::SymbolType::GLOBAL_EXPORT:
+ return SymbolRef::ST_Data;
+ }
+
+ llvm_unreachable("Unknown WasmSymbol::SymbolType");
+ return SymbolRef::ST_Other;
}
Expected<section_iterator>
WasmObjectFile::getSymbolSection(DataRefImpl Symb) const {
- llvm_unreachable("not yet implemented");
- return errorCodeToError(object_error::invalid_symbol_index);
+ DataRefImpl Ref;
+ Ref.d.a = getWasmSymbol(Symb).Section;
+ return section_iterator(SectionRef(Ref, this));
}
void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; }
std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
StringRef &Res) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
+ const WasmSection &S = Sections[Sec.d.a];
#define ECase(X) \
case wasm::WASM_SEC_##X: \
Res = #X; \
@@ -186,7 +834,7 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
ECase(ELEM);
ECase(CODE);
ECase(DATA);
- case wasm::WASM_SEC_USER:
+ case wasm::WASM_SEC_CUSTOM:
Res = S.Name;
break;
default:
@@ -198,14 +846,18 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
uint64_t WasmObjectFile::getSectionAddress(DataRefImpl Sec) const { return 0; }
+uint64_t WasmObjectFile::getSectionIndex(DataRefImpl Sec) const {
+ return Sec.d.a;
+}
+
uint64_t WasmObjectFile::getSectionSize(DataRefImpl Sec) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
+ const WasmSection &S = Sections[Sec.d.a];
return S.Content.size();
}
std::error_code WasmObjectFile::getSectionContents(DataRefImpl Sec,
StringRef &Res) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
+ const WasmSection &S = Sections[Sec.d.a];
// This will never fail since wasm sections can never be empty (custom sections
// must have a name and non-custom sections each have a defined structure).
Res = StringRef(reinterpret_cast<const char *>(S.Content.data()),
@@ -222,13 +874,11 @@ bool WasmObjectFile::isSectionCompressed(DataRefImpl Sec) const {
}
bool WasmObjectFile::isSectionText(DataRefImpl Sec) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
- return S.Type == wasm::WASM_SEC_CODE;
+ return getWasmSection(Sec).Type == wasm::WASM_SEC_CODE;
}
bool WasmObjectFile::isSectionData(DataRefImpl Sec) const {
- const wasm::WasmSection &S = Sections[Sec.d.a];
- return S.Type == wasm::WASM_SEC_DATA;
+ return getWasmSection(Sec).Type == wasm::WASM_SEC_DATA;
}
bool WasmObjectFile::isSectionBSS(DataRefImpl Sec) const { return false; }
@@ -237,31 +887,28 @@ bool WasmObjectFile::isSectionVirtual(DataRefImpl Sec) const { return false; }
bool WasmObjectFile::isSectionBitcode(DataRefImpl Sec) const { return false; }
-relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Sec) const {
- llvm_unreachable("not yet implemented");
- RelocationRef Rel;
- return relocation_iterator(Rel);
-}
-
-relocation_iterator WasmObjectFile::section_rel_end(DataRefImpl Sec) const {
- llvm_unreachable("not yet implemented");
- RelocationRef Rel;
- return relocation_iterator(Rel);
+relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Ref) const {
+ DataRefImpl RelocRef;
+ RelocRef.d.a = Ref.d.a;
+ RelocRef.d.b = 0;
+ return relocation_iterator(RelocationRef(RelocRef, this));
}
-section_iterator WasmObjectFile::getRelocatedSection(DataRefImpl Sec) const {
- llvm_unreachable("not yet implemented");
- SectionRef Ref;
- return section_iterator(Ref);
+relocation_iterator WasmObjectFile::section_rel_end(DataRefImpl Ref) const {
+ const WasmSection &Sec = getWasmSection(Ref);
+ DataRefImpl RelocRef;
+ RelocRef.d.a = Ref.d.a;
+ RelocRef.d.b = Sec.Relocations.size();
+ return relocation_iterator(RelocationRef(RelocRef, this));
}
void WasmObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
- llvm_unreachable("not yet implemented");
+ Rel.d.b++;
}
-uint64_t WasmObjectFile::getRelocationOffset(DataRefImpl Rel) const {
- llvm_unreachable("not yet implemented");
- return 0;
+uint64_t WasmObjectFile::getRelocationOffset(DataRefImpl Ref) const {
+ const wasm::WasmRelocation &Rel = getWasmRelocation(Ref);
+ return Rel.Offset;
}
symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
@@ -270,14 +917,28 @@ symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
return symbol_iterator(Ref);
}
-uint64_t WasmObjectFile::getRelocationType(DataRefImpl Rel) const {
- llvm_unreachable("not yet implemented");
- return 0;
+uint64_t WasmObjectFile::getRelocationType(DataRefImpl Ref) const {
+ const wasm::WasmRelocation &Rel = getWasmRelocation(Ref);
+ return Rel.Type;
}
void WasmObjectFile::getRelocationTypeName(
- DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
- llvm_unreachable("not yet implemented");
+ DataRefImpl Ref, SmallVectorImpl<char> &Result) const {
+ const wasm::WasmRelocation& Rel = getWasmRelocation(Ref);
+ StringRef Res = "Unknown";
+
+#define WASM_RELOC(name, value) \
+ case wasm::name: \
+ Res = #name; \
+ break;
+
+ switch (Rel.Type) {
+#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def"
+ }
+
+#undef WASM_RELOC
+
+ Result.append(Res.begin(), Res.end());
}
section_iterator WasmObjectFile::section_begin() const {
@@ -302,12 +963,29 @@ SubtargetFeatures WasmObjectFile::getFeatures() const {
return SubtargetFeatures();
}
-bool WasmObjectFile::isRelocatableObject() const { return false; }
+bool WasmObjectFile::isRelocatableObject() const {
+ return HasLinkingSection;
+}
+
+const WasmSection &WasmObjectFile::getWasmSection(DataRefImpl Ref) const {
+ assert(Ref.d.a < Sections.size());
+ return Sections[Ref.d.a];
+}
-const wasm::WasmSection *
+const WasmSection &
WasmObjectFile::getWasmSection(const SectionRef &Section) const {
- return &Sections[Section.getRawDataRefImpl().d.a];
+ return getWasmSection(Section.getRawDataRefImpl());
}
-} // end namespace object
-} // end namespace llvm
+const wasm::WasmRelocation &
+WasmObjectFile::getWasmRelocation(const RelocationRef &Ref) const {
+ return getWasmRelocation(Ref.getRawDataRefImpl());
+}
+
+const wasm::WasmRelocation &
+WasmObjectFile::getWasmRelocation(DataRefImpl Ref) const {
+ assert(Ref.d.a < Sections.size());
+ const WasmSection& Sec = Sections[Ref.d.a];
+ assert(Ref.d.b < Sec.Relocations.size());
+ return Sec.Relocations[Ref.d.b];
+}
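With the stubs replaced, the generic ObjectFile relocation API becomes usable on wasm inputs; a sketch, where Obj is a parsed WasmObjectFile and the SmallString/raw_ostream headers are assumed:

for (const SectionRef &Sec : Obj.sections()) {
  for (const RelocationRef &Reloc : Sec.relocations()) {
    SmallString<32> TypeName;
    Reloc.getTypeName(TypeName); // names come from WebAssembly.def
    outs() << "reloc @" << Reloc.getOffset() << " " << TypeName << "\n";
  }
}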
diff --git a/contrib/llvm/lib/Object/WindowsResource.cpp b/contrib/llvm/lib/Object/WindowsResource.cpp
new file mode 100644
index 0000000..246eee5
--- /dev/null
+++ b/contrib/llvm/lib/Object/WindowsResource.cpp
@@ -0,0 +1,720 @@
+//===-- WindowsResource.cpp -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the .res file class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/WindowsResource.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MathExtras.h"
+#include <ctime>
+#include <queue>
+#include <sstream>
+#include <system_error>
+
+using namespace llvm;
+using namespace object;
+
+namespace llvm {
+namespace object {
+
+#define RETURN_IF_ERROR(X) \
+ if (auto EC = X) \
+ return EC;
+
+const uint32_t MIN_HEADER_SIZE = 7 * sizeof(uint32_t) + 2 * sizeof(uint16_t);
+
+// COFF files seem to be inconsistent with alignment between sections; just use
+// 8-byte alignment because it makes everyone happy.
+const uint32_t SECTION_ALIGNMENT = sizeof(uint64_t);
+
+uint32_t WindowsResourceParser::TreeNode::StringCount = 0;
+uint32_t WindowsResourceParser::TreeNode::DataCount = 0;
+
+WindowsResource::WindowsResource(MemoryBufferRef Source)
+ : Binary(Binary::ID_WinRes, Source) {
+ size_t LeadingSize = WIN_RES_MAGIC_SIZE + WIN_RES_NULL_ENTRY_SIZE;
+ BBS = BinaryByteStream(Data.getBuffer().drop_front(LeadingSize),
+ support::little);
+}
+
+Expected<std::unique_ptr<WindowsResource>>
+WindowsResource::createWindowsResource(MemoryBufferRef Source) {
+ if (Source.getBufferSize() < WIN_RES_MAGIC_SIZE + WIN_RES_NULL_ENTRY_SIZE)
+ return make_error<GenericBinaryError>(
+ "File too small to be a resource file",
+ object_error::invalid_file_type);
+ std::unique_ptr<WindowsResource> Ret(new WindowsResource(Source));
+ return std::move(Ret);
+}
+
+Expected<ResourceEntryRef> WindowsResource::getHeadEntry() {
+ Error Err = Error::success();
+ auto Ref = ResourceEntryRef(BinaryStreamRef(BBS), this, Err);
+ if (Err)
+ return std::move(Err);
+ return Ref;
+}
+
+ResourceEntryRef::ResourceEntryRef(BinaryStreamRef Ref,
+ const WindowsResource *Owner, Error &Err)
+ : Reader(Ref), OwningRes(Owner) {
+ if (loadNext())
+ Err = make_error<GenericBinaryError>("Could not read first entry.\n",
+ object_error::unexpected_eof);
+}
+
+Error ResourceEntryRef::moveNext(bool &End) {
+ // Reached end of all the entries.
+ if (Reader.bytesRemaining() == 0) {
+ End = true;
+ return Error::success();
+ }
+ RETURN_IF_ERROR(loadNext());
+
+ return Error::success();
+}
+
+static Error readStringOrId(BinaryStreamReader &Reader, uint16_t &ID,
+ ArrayRef<UTF16> &Str, bool &IsString) {
+ uint16_t IDFlag;
+ RETURN_IF_ERROR(Reader.readInteger(IDFlag));
+ IsString = IDFlag != 0xffff;
+
+ if (IsString) {
+ Reader.setOffset(
+ Reader.getOffset() -
+ sizeof(uint16_t)); // Re-read the bytes which we used to check the flag.
+ RETURN_IF_ERROR(Reader.readWideString(Str));
+ } else
+ RETURN_IF_ERROR(Reader.readInteger(ID));
+
+ return Error::success();
+}
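A worked instance of the encoding readStringOrId handles (values are illustrative):

// The 16-bit words {0xFFFF, 0x0005} denote numeric ID 5; a leading word
// other than 0xFFFF instead begins an inline null-terminated UTF-16
// string such as {'I', 'C', 'O', 'N', 0}.
static const uint16_t IdFive[] = {0xFFFF, 0x0005};
static const uint16_t NameIcon[] = {'I', 'C', 'O', 'N', 0};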
+
+Error ResourceEntryRef::loadNext() {
+ const WinResHeaderPrefix *Prefix;
+ RETURN_IF_ERROR(Reader.readObject(Prefix));
+
+ if (Prefix->HeaderSize < MIN_HEADER_SIZE)
+ return make_error<GenericBinaryError>("Header size is too small.",
+ object_error::parse_failed);
+
+ RETURN_IF_ERROR(readStringOrId(Reader, TypeID, Type, IsStringType));
+
+ RETURN_IF_ERROR(readStringOrId(Reader, NameID, Name, IsStringName));
+
+ RETURN_IF_ERROR(Reader.padToAlignment(WIN_RES_HEADER_ALIGNMENT));
+
+ RETURN_IF_ERROR(Reader.readObject(Suffix));
+
+ RETURN_IF_ERROR(Reader.readArray(Data, Prefix->DataSize));
+
+ RETURN_IF_ERROR(Reader.padToAlignment(WIN_RES_DATA_ALIGNMENT));
+
+ return Error::success();
+}
+
+WindowsResourceParser::WindowsResourceParser() : Root(false) {}
+
+Error WindowsResourceParser::parse(WindowsResource *WR) {
+ auto EntryOrErr = WR->getHeadEntry();
+ if (!EntryOrErr)
+ return EntryOrErr.takeError();
+
+ ResourceEntryRef Entry = EntryOrErr.get();
+ bool End = false;
+ while (!End) {
+ Data.push_back(Entry.getData());
+
+ bool IsNewTypeString = false;
+ bool IsNewNameString = false;
+
+ Root.addEntry(Entry, IsNewTypeString, IsNewNameString);
+
+ if (IsNewTypeString)
+ StringTable.push_back(Entry.getTypeString());
+
+ if (IsNewNameString)
+ StringTable.push_back(Entry.getNameString());
+
+ RETURN_IF_ERROR(Entry.moveNext(End));
+ }
+
+ return Error::success();
+}
+
+void WindowsResourceParser::printTree(raw_ostream &OS) const {
+ ScopedPrinter Writer(OS);
+ Root.print(Writer, "Resource Tree");
+}
+
+void WindowsResourceParser::TreeNode::addEntry(const ResourceEntryRef &Entry,
+ bool &IsNewTypeString,
+ bool &IsNewNameString) {
+ TreeNode &TypeNode = addTypeNode(Entry, IsNewTypeString);
+ TreeNode &NameNode = TypeNode.addNameNode(Entry, IsNewNameString);
+ NameNode.addLanguageNode(Entry);
+}
+
+WindowsResourceParser::TreeNode::TreeNode(bool IsStringNode) {
+ if (IsStringNode)
+ StringIndex = StringCount++;
+}
+
+WindowsResourceParser::TreeNode::TreeNode(uint16_t MajorVersion,
+ uint16_t MinorVersion,
+ uint32_t Characteristics)
+ : IsDataNode(true), MajorVersion(MajorVersion), MinorVersion(MinorVersion),
+ Characteristics(Characteristics) {
+ DataIndex = DataCount++;
+}
+
+std::unique_ptr<WindowsResourceParser::TreeNode>
+WindowsResourceParser::TreeNode::createStringNode() {
+ return std::unique_ptr<TreeNode>(new TreeNode(true));
+}
+
+std::unique_ptr<WindowsResourceParser::TreeNode>
+WindowsResourceParser::TreeNode::createIDNode() {
+ return std::unique_ptr<TreeNode>(new TreeNode(false));
+}
+
+std::unique_ptr<WindowsResourceParser::TreeNode>
+WindowsResourceParser::TreeNode::createDataNode(uint16_t MajorVersion,
+ uint16_t MinorVersion,
+ uint32_t Characteristics) {
+ return std::unique_ptr<TreeNode>(
+ new TreeNode(MajorVersion, MinorVersion, Characteristics));
+}
+
+WindowsResourceParser::TreeNode &
+WindowsResourceParser::TreeNode::addTypeNode(const ResourceEntryRef &Entry,
+ bool &IsNewTypeString) {
+ if (Entry.checkTypeString())
+ return addChild(Entry.getTypeString(), IsNewTypeString);
+ else
+ return addChild(Entry.getTypeID());
+}
+
+WindowsResourceParser::TreeNode &
+WindowsResourceParser::TreeNode::addNameNode(const ResourceEntryRef &Entry,
+ bool &IsNewNameString) {
+ if (Entry.checkNameString())
+ return addChild(Entry.getNameString(), IsNewNameString);
+ else
+ return addChild(Entry.getNameID());
+}
+
+WindowsResourceParser::TreeNode &
+WindowsResourceParser::TreeNode::addLanguageNode(
+ const ResourceEntryRef &Entry) {
+ return addChild(Entry.getLanguage(), true, Entry.getMajorVersion(),
+ Entry.getMinorVersion(), Entry.getCharacteristics());
+}
+
+WindowsResourceParser::TreeNode &WindowsResourceParser::TreeNode::addChild(
+ uint32_t ID, bool IsDataNode, uint16_t MajorVersion, uint16_t MinorVersion,
+ uint32_t Characteristics) {
+ auto Child = IDChildren.find(ID);
+ if (Child == IDChildren.end()) {
+ auto NewChild =
+ IsDataNode ? createDataNode(MajorVersion, MinorVersion, Characteristics)
+ : createIDNode();
+ WindowsResourceParser::TreeNode &Node = *NewChild;
+ IDChildren.emplace(ID, std::move(NewChild));
+ return Node;
+ } else
+ return *(Child->second);
+}
+
+WindowsResourceParser::TreeNode &
+WindowsResourceParser::TreeNode::addChild(ArrayRef<UTF16> NameRef,
+ bool &IsNewString) {
+ std::string NameString;
+ ArrayRef<UTF16> CorrectedName;
+ std::vector<UTF16> EndianCorrectedName;
+ if (sys::IsBigEndianHost) {
+ EndianCorrectedName.resize(NameRef.size() + 1);
+ std::copy(NameRef.begin(), NameRef.end(), EndianCorrectedName.begin() + 1);
+ EndianCorrectedName[0] = UNI_UTF16_BYTE_ORDER_MARK_SWAPPED;
+ CorrectedName = makeArrayRef(EndianCorrectedName);
+ } else
+ CorrectedName = NameRef;
+ convertUTF16ToUTF8String(CorrectedName, NameString);
+
+ auto Child = StringChildren.find(NameString);
+ if (Child == StringChildren.end()) {
+ auto NewChild = createStringNode();
+ IsNewString = true;
+ WindowsResourceParser::TreeNode &Node = *NewChild;
+ StringChildren.emplace(NameString, std::move(NewChild));
+ return Node;
+ } else
+ return *(Child->second);
+}
+
+void WindowsResourceParser::TreeNode::print(ScopedPrinter &Writer,
+ StringRef Name) const {
+ ListScope NodeScope(Writer, Name);
+ for (auto const &Child : StringChildren) {
+ Child.second->print(Writer, Child.first);
+ }
+ for (auto const &Child : IDChildren) {
+ Child.second->print(Writer, to_string(Child.first));
+ }
+}
+
+// This function returns the size of the entire resource tree, including
+// directory tables, directory entries, and data entries. It does not include
+// the directory strings or the relocations of the .rsrc section.
+uint32_t WindowsResourceParser::TreeNode::getTreeSize() const {
+ uint32_t Size = (IDChildren.size() + StringChildren.size()) *
+ sizeof(coff_resource_dir_entry);
+
+ // Reached a node pointing to a data entry.
+ if (IsDataNode) {
+ Size += sizeof(coff_resource_data_entry);
+ return Size;
+ }
+
+ // If the node does not point to data, it must have a directory table pointing
+ // to other nodes.
+ Size += sizeof(coff_resource_dir_table);
+
+ for (auto const &Child : StringChildren) {
+ Size += Child.second->getTreeSize();
+ }
+ for (auto const &Child : IDChildren) {
+ Size += Child.second->getTreeSize();
+ }
+ return Size;
+}
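As a worked size computation (assuming the usual on-disk COFF resource sizes of 16 bytes per directory table, 8 per directory entry, and 16 per data entry): a tree with one type node, one name node, and one language leaf yields three tables, three entries, and one data entry, i.e. 3*16 + 3*8 + 16 = 88 bytes.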
+
+class WindowsResourceCOFFWriter {
+public:
+ WindowsResourceCOFFWriter(COFF::MachineTypes MachineType,
+ const WindowsResourceParser &Parser, Error &E);
+ std::unique_ptr<MemoryBuffer> write();
+
+private:
+ void performFileLayout();
+ void performSectionOneLayout();
+ void performSectionTwoLayout();
+ void writeCOFFHeader();
+ void writeFirstSectionHeader();
+ void writeSecondSectionHeader();
+ void writeFirstSection();
+ void writeSecondSection();
+ void writeSymbolTable();
+ void writeStringTable();
+ void writeDirectoryTree();
+ void writeDirectoryStringTable();
+ void writeFirstSectionRelocations();
+ std::unique_ptr<MemoryBuffer> OutputBuffer;
+ char *BufferStart;
+ uint64_t CurrentOffset = 0;
+ COFF::MachineTypes MachineType;
+ const WindowsResourceParser::TreeNode &Resources;
+ const ArrayRef<std::vector<uint8_t>> Data;
+ uint64_t FileSize;
+ uint32_t SymbolTableOffset;
+ uint32_t SectionOneSize;
+ uint32_t SectionOneOffset;
+ uint32_t SectionOneRelocations;
+ uint32_t SectionTwoSize;
+ uint32_t SectionTwoOffset;
+ const ArrayRef<std::vector<UTF16>> StringTable;
+ std::vector<uint32_t> StringTableOffsets;
+ std::vector<uint32_t> DataOffsets;
+ std::vector<uint32_t> RelocationAddresses;
+};
+
+WindowsResourceCOFFWriter::WindowsResourceCOFFWriter(
+ COFF::MachineTypes MachineType, const WindowsResourceParser &Parser,
+ Error &E)
+ : MachineType(MachineType), Resources(Parser.getTree()),
+ Data(Parser.getData()), StringTable(Parser.getStringTable()) {
+ performFileLayout();
+
+ OutputBuffer = MemoryBuffer::getNewMemBuffer(FileSize);
+}
+
+void WindowsResourceCOFFWriter::performFileLayout() {
+ // Add size of COFF header.
+ FileSize = COFF::Header16Size;
+
+ // One .rsrc section header for the directory tree, another for resource data.
+ FileSize += 2 * COFF::SectionSize;
+
+ performSectionOneLayout();
+ performSectionTwoLayout();
+
+ // We have reached the address of the symbol table.
+ SymbolTableOffset = FileSize;
+
+ FileSize += COFF::Symbol16Size; // size of the @feat.00 symbol.
+ FileSize += 4 * COFF::Symbol16Size; // symbol + aux for each section.
+ FileSize += Data.size() * COFF::Symbol16Size; // 1 symbol per resource.
+ FileSize += 4; // four null bytes for the string table.
+}
+
+void WindowsResourceCOFFWriter::performSectionOneLayout() {
+ SectionOneOffset = FileSize;
+
+ SectionOneSize = Resources.getTreeSize();
+ uint32_t CurrentStringOffset = SectionOneSize;
+ uint32_t TotalStringTableSize = 0;
+ for (auto const &String : StringTable) {
+ StringTableOffsets.push_back(CurrentStringOffset);
+ uint32_t StringSize = String.size() * sizeof(UTF16) + sizeof(uint16_t);
+ CurrentStringOffset += StringSize;
+ TotalStringTableSize += StringSize;
+ }
+ SectionOneSize += alignTo(TotalStringTableSize, sizeof(uint32_t));
+
+ // Account for the relocations of section one.
+ SectionOneRelocations = FileSize + SectionOneSize;
+ FileSize += SectionOneSize;
+ FileSize +=
+ Data.size() * COFF::RelocationSize; // one relocation for each resource.
+ FileSize = alignTo(FileSize, SECTION_ALIGNMENT);
+}
+
+void WindowsResourceCOFFWriter::performSectionTwoLayout() {
+ // Add the size of the .rsrc$02 section, which contains all resource data on
+ // 8-byte alignment.
+ SectionTwoOffset = FileSize;
+ SectionTwoSize = 0;
+ for (auto const &Entry : Data) {
+ DataOffsets.push_back(SectionTwoSize);
+ SectionTwoSize += alignTo(Entry.size(), sizeof(uint64_t));
+ }
+ FileSize += SectionTwoSize;
+ FileSize = alignTo(FileSize, SECTION_ALIGNMENT);
+}
+
+static std::time_t getTime() {
+ std::time_t Now = time(nullptr);
+ if (Now < 0 || !isUInt<32>(Now))
+ return UINT32_MAX;
+ return Now;
+}
+
+std::unique_ptr<MemoryBuffer> WindowsResourceCOFFWriter::write() {
+ BufferStart = const_cast<char *>(OutputBuffer->getBufferStart());
+
+ writeCOFFHeader();
+ writeFirstSectionHeader();
+ writeSecondSectionHeader();
+ writeFirstSection();
+ writeSecondSection();
+ writeSymbolTable();
+ writeStringTable();
+
+ return std::move(OutputBuffer);
+}
+
+void WindowsResourceCOFFWriter::writeCOFFHeader() {
+ // Write the COFF header.
+ auto *Header = reinterpret_cast<coff_file_header *>(BufferStart);
+ switch (MachineType) {
+ case COFF::IMAGE_FILE_MACHINE_ARMNT:
+ Header->Machine = COFF::IMAGE_FILE_MACHINE_ARMNT;
+ break;
+ case COFF::IMAGE_FILE_MACHINE_AMD64:
+ Header->Machine = COFF::IMAGE_FILE_MACHINE_AMD64;
+ break;
+ case COFF::IMAGE_FILE_MACHINE_I386:
+ Header->Machine = COFF::IMAGE_FILE_MACHINE_I386;
+ break;
+ default:
+ Header->Machine = COFF::IMAGE_FILE_MACHINE_UNKNOWN;
+ }
+ Header->NumberOfSections = 2;
+ Header->TimeDateStamp = getTime();
+ Header->PointerToSymbolTable = SymbolTableOffset;
+ // One symbol for every resource, plus two for each section, plus @feat.00.
+ Header->NumberOfSymbols = Data.size() + 5;
+ Header->SizeOfOptionalHeader = 0;
+ Header->Characteristics = COFF::IMAGE_FILE_32BIT_MACHINE;
+}
+
+void WindowsResourceCOFFWriter::writeFirstSectionHeader() {
+ // Write the first section header.
+ CurrentOffset += sizeof(coff_file_header);
+ auto *SectionOneHeader =
+ reinterpret_cast<coff_section *>(BufferStart + CurrentOffset);
+ strncpy(SectionOneHeader->Name, ".rsrc$01", (size_t)COFF::NameSize);
+ SectionOneHeader->VirtualSize = 0;
+ SectionOneHeader->VirtualAddress = 0;
+ SectionOneHeader->SizeOfRawData = SectionOneSize;
+ SectionOneHeader->PointerToRawData = SectionOneOffset;
+ SectionOneHeader->PointerToRelocations = SectionOneRelocations;
+ SectionOneHeader->PointerToLinenumbers = 0;
+ SectionOneHeader->NumberOfRelocations = Data.size();
+ SectionOneHeader->NumberOfLinenumbers = 0;
+ SectionOneHeader->Characteristics += COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
+ SectionOneHeader->Characteristics += COFF::IMAGE_SCN_MEM_READ;
+}
+
+void WindowsResourceCOFFWriter::writeSecondSectionHeader() {
+ // Write the second section header.
+ CurrentOffset += sizeof(coff_section);
+ auto *SectionTwoHeader =
+ reinterpret_cast<coff_section *>(BufferStart + CurrentOffset);
+ strncpy(SectionTwoHeader->Name, ".rsrc$02", (size_t)COFF::NameSize);
+ SectionTwoHeader->VirtualSize = 0;
+ SectionTwoHeader->VirtualAddress = 0;
+ SectionTwoHeader->SizeOfRawData = SectionTwoSize;
+ SectionTwoHeader->PointerToRawData = SectionTwoOffset;
+ SectionTwoHeader->PointerToRelocations = 0;
+ SectionTwoHeader->PointerToLinenumbers = 0;
+ SectionTwoHeader->NumberOfRelocations = 0;
+ SectionTwoHeader->NumberOfLinenumbers = 0;
+ SectionTwoHeader->Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
+ SectionTwoHeader->Characteristics += COFF::IMAGE_SCN_MEM_READ;
+}
+
+void WindowsResourceCOFFWriter::writeFirstSection() {
+ // Write section one.
+ CurrentOffset += sizeof(coff_section);
+
+ writeDirectoryTree();
+ writeDirectoryStringTable();
+ writeFirstSectionRelocations();
+
+ CurrentOffset = alignTo(CurrentOffset, SECTION_ALIGNMENT);
+}
+
+void WindowsResourceCOFFWriter::writeSecondSection() {
+ // Now write the .rsrc$02 section.
+ for (auto const &RawDataEntry : Data) {
+ std::copy(RawDataEntry.begin(), RawDataEntry.end(),
+ BufferStart + CurrentOffset);
+ CurrentOffset += alignTo(RawDataEntry.size(), sizeof(uint64_t));
+ }
+
+ CurrentOffset = alignTo(CurrentOffset, SECTION_ALIGNMENT);
+}
+
+void WindowsResourceCOFFWriter::writeSymbolTable() {
+ // Now write the symbol table.
+ // First, the feat symbol.
+ auto *Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset);
+ strncpy(Symbol->Name.ShortName, "@feat.00", (size_t)COFF::NameSize);
+ Symbol->Value = 0x11;
+ Symbol->SectionNumber = 0xffff;
+ Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL;
+ Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+ Symbol->NumberOfAuxSymbols = 0;
+ CurrentOffset += sizeof(coff_symbol16);
+
+ // Now write the .rsrc1 symbol + aux.
+ Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset);
+ strncpy(Symbol->Name.ShortName, ".rsrc$01", (size_t)COFF::NameSize);
+ Symbol->Value = 0;
+ Symbol->SectionNumber = 1;
+ Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL;
+ Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+ Symbol->NumberOfAuxSymbols = 1;
+ CurrentOffset += sizeof(coff_symbol16);
+ auto *Aux = reinterpret_cast<coff_aux_section_definition *>(BufferStart +
+ CurrentOffset);
+ Aux->Length = SectionOneSize;
+ Aux->NumberOfRelocations = Data.size();
+ Aux->NumberOfLinenumbers = 0;
+ Aux->CheckSum = 0;
+ Aux->NumberLowPart = 0;
+ Aux->Selection = 0;
+ CurrentOffset += sizeof(coff_aux_section_definition);
+
+ // Now write the .rsrc2 symbol + aux.
+ Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset);
+ strncpy(Symbol->Name.ShortName, ".rsrc$02", (size_t)COFF::NameSize);
+ Symbol->Value = 0;
+ Symbol->SectionNumber = 2;
+ Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL;
+ Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+ Symbol->NumberOfAuxSymbols = 1;
+ CurrentOffset += sizeof(coff_symbol16);
+ Aux = reinterpret_cast<coff_aux_section_definition *>(BufferStart +
+ CurrentOffset);
+ Aux->Length = SectionTwoSize;
+ Aux->NumberOfRelocations = 0;
+ Aux->NumberOfLinenumbers = 0;
+ Aux->CheckSum = 0;
+ Aux->NumberLowPart = 0;
+ Aux->Selection = 0;
+ CurrentOffset += sizeof(coff_aux_section_definition);
+
+ // Now write a symbol for each relocation.
+ for (unsigned i = 0; i < Data.size(); i++) {
+ char RelocationName[9];
+ sprintf(RelocationName, "$R%06X", DataOffsets[i]);
+ Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset);
+ strncpy(Symbol->Name.ShortName, RelocationName, (size_t)COFF::NameSize);
+ Symbol->Value = DataOffsets[i];
+ Symbol->SectionNumber = 2;
+ Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL;
+ Symbol->StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+ Symbol->NumberOfAuxSymbols = 0;
+ CurrentOffset += sizeof(coff_symbol16);
+ }
+}
+
+void WindowsResourceCOFFWriter::writeStringTable() {
+ // Just 4 null bytes for the string table.
+ auto COFFStringTable = reinterpret_cast<void *>(BufferStart + CurrentOffset);
+ memset(COFFStringTable, 0, 4);
+}
+
+void WindowsResourceCOFFWriter::writeDirectoryTree() {
+ // Traverse parsed resource tree breadth-first and write the corresponding
+ // COFF objects.
+ std::queue<const WindowsResourceParser::TreeNode *> Queue;
+ Queue.push(&Resources);
+ uint32_t NextLevelOffset =
+ sizeof(coff_resource_dir_table) + (Resources.getStringChildren().size() +
+ Resources.getIDChildren().size()) *
+ sizeof(coff_resource_dir_entry);
+ std::vector<const WindowsResourceParser::TreeNode *> DataEntriesTreeOrder;
+ uint32_t CurrentRelativeOffset = 0;
+
+ while (!Queue.empty()) {
+ auto CurrentNode = Queue.front();
+ Queue.pop();
+ auto *Table = reinterpret_cast<coff_resource_dir_table *>(BufferStart +
+ CurrentOffset);
+ Table->Characteristics = CurrentNode->getCharacteristics();
+ Table->TimeDateStamp = 0;
+ Table->MajorVersion = CurrentNode->getMajorVersion();
+ Table->MinorVersion = CurrentNode->getMinorVersion();
+ auto &IDChildren = CurrentNode->getIDChildren();
+ auto &StringChildren = CurrentNode->getStringChildren();
+ Table->NumberOfNameEntries = StringChildren.size();
+ Table->NumberOfIDEntries = IDChildren.size();
+ CurrentOffset += sizeof(coff_resource_dir_table);
+ CurrentRelativeOffset += sizeof(coff_resource_dir_table);
+
+ // Write the directory entries immediately following each directory table.
+ for (auto const &Child : StringChildren) {
+ auto *Entry = reinterpret_cast<coff_resource_dir_entry *>(BufferStart +
+ CurrentOffset);
+ Entry->Identifier.setNameOffset(
+ StringTableOffsets[Child.second->getStringIndex()]);
+ if (Child.second->checkIsDataNode()) {
+ Entry->Offset.DataEntryOffset = NextLevelOffset;
+ NextLevelOffset += sizeof(coff_resource_data_entry);
+ DataEntriesTreeOrder.push_back(Child.second.get());
+ } else {
+ Entry->Offset.SubdirOffset = NextLevelOffset + (1 << 31);
+ NextLevelOffset += sizeof(coff_resource_dir_table) +
+ (Child.second->getStringChildren().size() +
+ Child.second->getIDChildren().size()) *
+ sizeof(coff_resource_dir_entry);
+ Queue.push(Child.second.get());
+ }
+ CurrentOffset += sizeof(coff_resource_dir_entry);
+ CurrentRelativeOffset += sizeof(coff_resource_dir_entry);
+ }
+ for (auto const &Child : IDChildren) {
+ auto *Entry = reinterpret_cast<coff_resource_dir_entry *>(BufferStart +
+ CurrentOffset);
+ Entry->Identifier.ID = Child.first;
+ if (Child.second->checkIsDataNode()) {
+ Entry->Offset.DataEntryOffset = NextLevelOffset;
+ NextLevelOffset += sizeof(coff_resource_data_entry);
+ DataEntriesTreeOrder.push_back(Child.second.get());
+ } else {
+ Entry->Offset.SubdirOffset = NextLevelOffset + (1 << 31);
+ NextLevelOffset += sizeof(coff_resource_dir_table) +
+ (Child.second->getStringChildren().size() +
+ Child.second->getIDChildren().size()) *
+ sizeof(coff_resource_dir_entry);
+ Queue.push(Child.second.get());
+ }
+ CurrentOffset += sizeof(coff_resource_dir_entry);
+ CurrentRelativeOffset += sizeof(coff_resource_dir_entry);
+ }
+ }
+
+ RelocationAddresses.resize(Data.size());
+ // Now write all the resource data entries.
+ for (auto DataNodes : DataEntriesTreeOrder) {
+ auto *Entry = reinterpret_cast<coff_resource_data_entry *>(BufferStart +
+ CurrentOffset);
+ RelocationAddresses[DataNodes->getDataIndex()] = CurrentRelativeOffset;
+ Entry->DataRVA = 0; // Set to zero because it is a relocation.
+ Entry->DataSize = Data[DataNodes->getDataIndex()].size();
+ Entry->Codepage = 0;
+ Entry->Reserved = 0;
+ CurrentOffset += sizeof(coff_resource_data_entry);
+ CurrentRelativeOffset += sizeof(coff_resource_data_entry);
+ }
+}
+
+void WindowsResourceCOFFWriter::writeDirectoryStringTable() {
+ // Now write the directory string table for .rsrc$01
+ uint32_t TotalStringTableSize = 0;
+ for (auto &String : StringTable) {
+ uint16_t Length = String.size();
+ support::endian::write16le(BufferStart + CurrentOffset, Length);
+ CurrentOffset += sizeof(uint16_t);
+ auto *Start = reinterpret_cast<UTF16 *>(BufferStart + CurrentOffset);
+ std::copy(String.begin(), String.end(), Start);
+ CurrentOffset += Length * sizeof(UTF16);
+ TotalStringTableSize += Length * sizeof(UTF16) + sizeof(uint16_t);
+ }
+ CurrentOffset +=
+ alignTo(TotalStringTableSize, sizeof(uint32_t)) - TotalStringTableSize;
+}
+
+void WindowsResourceCOFFWriter::writeFirstSectionRelocations() {
+
+  // Now write the relocations for .rsrc$01.
+  // Five symbols are already in the table before we start: @feat.00, plus two
+  // for each .rsrc section.
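+  // (Assumed layout: index 0 = @feat.00; 1-2 = the .rsrc$01 section symbol
+  // and its aux record; 3-4 = the same for .rsrc$02.)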
+ uint32_t NextSymbolIndex = 5;
+ for (unsigned i = 0; i < Data.size(); i++) {
+ auto *Reloc =
+ reinterpret_cast<coff_relocation *>(BufferStart + CurrentOffset);
+ Reloc->VirtualAddress = RelocationAddresses[i];
+ Reloc->SymbolTableIndex = NextSymbolIndex++;
+ switch (MachineType) {
+ case COFF::IMAGE_FILE_MACHINE_ARMNT:
+ Reloc->Type = COFF::IMAGE_REL_ARM_ADDR32NB;
+ break;
+ case COFF::IMAGE_FILE_MACHINE_AMD64:
+ Reloc->Type = COFF::IMAGE_REL_AMD64_ADDR32NB;
+ break;
+ case COFF::IMAGE_FILE_MACHINE_I386:
+ Reloc->Type = COFF::IMAGE_REL_I386_DIR32NB;
+ break;
+ default:
+ Reloc->Type = 0;
+ }
+ CurrentOffset += sizeof(coff_relocation);
+ }
+}
+
+Expected<std::unique_ptr<MemoryBuffer>>
+writeWindowsResourceCOFF(COFF::MachineTypes MachineType,
+ const WindowsResourceParser &Parser) {
+ Error E = Error::success();
+ WindowsResourceCOFFWriter Writer(MachineType, Parser, E);
+ if (E)
+ return std::move(E);
+ return Writer.write();
+}
+
+} // namespace object
+} // namespace llvm
diff --git a/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp b/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp
index 7f9f4c1..1103159 100644
--- a/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/COFFYAML.cpp
@@ -12,17 +12,25 @@
//===----------------------------------------------------------------------===//
#include "llvm/ObjectYAML/COFFYAML.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <cstdint>
+#include <cstring>
#define ECase(X) IO.enumCase(Value, #X, COFF::X);
+
namespace llvm {
namespace COFFYAML {
+
Section::Section() { memset(&Header, 0, sizeof(COFF::section)); }
Symbol::Symbol() { memset(&Header, 0, sizeof(COFF::symbol)); }
Object::Object() { memset(&Header, 0, sizeof(COFF::header)); }
-}
+
+} // end namespace COFFYAML
namespace yaml {
+
void ScalarEnumerationTraits<COFFYAML::COMDATType>::enumeration(
IO &IO, COFFYAML::COMDATType &Value) {
IO.enumCase(Value, "0", 0);
@@ -172,20 +180,20 @@ void ScalarEnumerationTraits<COFF::RelocationTypeAMD64>::enumeration(
void ScalarEnumerationTraits<COFF::WindowsSubsystem>::enumeration(
IO &IO, COFF::WindowsSubsystem &Value) {
- ECase(IMAGE_SUBSYSTEM_UNKNOWN);
- ECase(IMAGE_SUBSYSTEM_NATIVE);
- ECase(IMAGE_SUBSYSTEM_WINDOWS_GUI);
- ECase(IMAGE_SUBSYSTEM_WINDOWS_CUI);
- ECase(IMAGE_SUBSYSTEM_OS2_CUI);
- ECase(IMAGE_SUBSYSTEM_POSIX_CUI);
- ECase(IMAGE_SUBSYSTEM_NATIVE_WINDOWS);
- ECase(IMAGE_SUBSYSTEM_WINDOWS_CE_GUI);
- ECase(IMAGE_SUBSYSTEM_EFI_APPLICATION);
- ECase(IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER);
- ECase(IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER);
- ECase(IMAGE_SUBSYSTEM_EFI_ROM);
- ECase(IMAGE_SUBSYSTEM_XBOX);
- ECase(IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION);
+ ECase(IMAGE_SUBSYSTEM_UNKNOWN);
+ ECase(IMAGE_SUBSYSTEM_NATIVE);
+ ECase(IMAGE_SUBSYSTEM_WINDOWS_GUI);
+ ECase(IMAGE_SUBSYSTEM_WINDOWS_CUI);
+ ECase(IMAGE_SUBSYSTEM_OS2_CUI);
+ ECase(IMAGE_SUBSYSTEM_POSIX_CUI);
+ ECase(IMAGE_SUBSYSTEM_NATIVE_WINDOWS);
+ ECase(IMAGE_SUBSYSTEM_WINDOWS_CE_GUI);
+ ECase(IMAGE_SUBSYSTEM_EFI_APPLICATION);
+ ECase(IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER);
+ ECase(IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER);
+ ECase(IMAGE_SUBSYSTEM_EFI_ROM);
+ ECase(IMAGE_SUBSYSTEM_XBOX);
+ ECase(IMAGE_SUBSYSTEM_WINDOWS_BOOT_APPLICATION);
}
#undef ECase
@@ -252,12 +260,15 @@ void ScalarBitSetTraits<COFF::DLLCharacteristics>::bitset(
#undef BCase
namespace {
+
struct NSectionSelectionType {
NSectionSelectionType(IO &)
: SelectionType(COFFYAML::COMDATType(0)) {}
NSectionSelectionType(IO &, uint8_t C)
: SelectionType(COFFYAML::COMDATType(C)) {}
+
uint8_t denormalize(IO &) { return SelectionType; }
+
COFFYAML::COMDATType SelectionType;
};
@@ -266,7 +277,9 @@ struct NWeakExternalCharacteristics {
: Characteristics(COFFYAML::WeakExternalCharacteristics(0)) {}
NWeakExternalCharacteristics(IO &, uint32_t C)
: Characteristics(COFFYAML::WeakExternalCharacteristics(C)) {}
+
uint32_t denormalize(IO &) { return Characteristics; }
+
COFFYAML::WeakExternalCharacteristics Characteristics;
};
@@ -275,7 +288,9 @@ struct NSectionCharacteristics {
: Characteristics(COFF::SectionCharacteristics(0)) {}
NSectionCharacteristics(IO &, uint32_t C)
: Characteristics(COFF::SectionCharacteristics(C)) {}
+
uint32_t denormalize(IO &) { return Characteristics; }
+
COFF::SectionCharacteristics Characteristics;
};
@@ -284,13 +299,16 @@ struct NAuxTokenType {
: AuxType(COFFYAML::AuxSymbolType(0)) {}
NAuxTokenType(IO &, uint8_t C)
: AuxType(COFFYAML::AuxSymbolType(C)) {}
+
uint32_t denormalize(IO &) { return AuxType; }
+
COFFYAML::AuxSymbolType AuxType;
};
struct NStorageClass {
NStorageClass(IO &) : StorageClass(COFF::SymbolStorageClass(0)) {}
NStorageClass(IO &, uint8_t S) : StorageClass(COFF::SymbolStorageClass(S)) {}
+
uint8_t denormalize(IO &) { return StorageClass; }
COFF::SymbolStorageClass StorageClass;
@@ -299,7 +317,9 @@ struct NStorageClass {
struct NMachine {
NMachine(IO &) : Machine(COFF::MachineTypes(0)) {}
NMachine(IO &, uint16_t M) : Machine(COFF::MachineTypes(M)) {}
+
uint16_t denormalize(IO &) { return Machine; }
+
COFF::MachineTypes Machine;
};
@@ -307,6 +327,7 @@ struct NHeaderCharacteristics {
NHeaderCharacteristics(IO &) : Characteristics(COFF::Characteristics(0)) {}
NHeaderCharacteristics(IO &, uint16_t C)
: Characteristics(COFF::Characteristics(C)) {}
+
uint16_t denormalize(IO &) { return Characteristics; }
COFF::Characteristics Characteristics;
@@ -316,13 +337,16 @@ template <typename RelocType>
struct NType {
NType(IO &) : Type(RelocType(0)) {}
NType(IO &, uint16_t T) : Type(RelocType(T)) {}
+
uint16_t denormalize(IO &) { return Type; }
+
RelocType Type;
};
struct NWindowsSubsystem {
NWindowsSubsystem(IO &) : Subsystem(COFF::WindowsSubsystem(0)) {}
NWindowsSubsystem(IO &, uint16_t C) : Subsystem(COFF::WindowsSubsystem(C)) {}
+
uint16_t denormalize(IO &) { return Subsystem; }
COFF::WindowsSubsystem Subsystem;
@@ -332,12 +356,13 @@ struct NDLLCharacteristics {
NDLLCharacteristics(IO &) : Characteristics(COFF::DLLCharacteristics(0)) {}
NDLLCharacteristics(IO &, uint16_t C)
: Characteristics(COFF::DLLCharacteristics(C)) {}
+
uint16_t denormalize(IO &) { return Characteristics; }
COFF::DLLCharacteristics Characteristics;
};
-}
+} // end anonymous namespace
void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO,
COFFYAML::Relocation &Rel) {
@@ -488,7 +513,16 @@ void MappingTraits<COFFYAML::Section>::mapping(IO &IO, COFFYAML::Section &Sec) {
IO.mapOptional("VirtualAddress", Sec.Header.VirtualAddress, 0U);
IO.mapOptional("VirtualSize", Sec.Header.VirtualSize, 0U);
IO.mapOptional("Alignment", Sec.Alignment, 0U);
- IO.mapRequired("SectionData", Sec.SectionData);
+
+  // If this is a .debug$S or .debug$T section, parse the semantic
+  // representation of the symbols/types. If it is any other kind of section,
+  // just deal in raw bytes.
+ IO.mapOptional("SectionData", Sec.SectionData);
+ if (Sec.Name == ".debug$S")
+ IO.mapOptional("Subsections", Sec.DebugS);
+ else if (Sec.Name == ".debug$T")
+ IO.mapOptional("Types", Sec.DebugT);
+
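+  // For example, a .debug$S section can now carry structured subsections
+  // (hypothetical sketch):
+  //   - Name:        '.debug$S'
+  //     Subsections:
+  //       - !StringTable
+  //         Strings:   [ 'cl.exe' ]
+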
IO.mapOptional("Relocations", Sec.Relocations);
}
@@ -500,5 +534,6 @@ void MappingTraits<COFFYAML::Object>::mapping(IO &IO, COFFYAML::Object &Obj) {
IO.mapRequired("symbols", Obj.Symbols);
}
-}
-}
+} // end namespace yaml
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
new file mode 100644
index 0000000..60b0ea2
--- /dev/null
+++ b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
@@ -0,0 +1,958 @@
+//===- CodeViewYAMLDebugSections.cpp - CodeView YAMLIO debug sections -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of CodeView
+// Debug Info.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/CodeViewYAMLDebugSections.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugCrossExSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugCrossImpSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h"
+#include "llvm/DebugInfo/CodeView/DebugSymbolRVASubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h"
+#include "llvm/DebugInfo/CodeView/Line.h"
+#include "llvm/DebugInfo/CodeView/StringsAndChecksums.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/ObjectYAML/CodeViewYAMLSymbols.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::CodeViewYAML;
+using namespace llvm::CodeViewYAML::detail;
+using namespace llvm::yaml;
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(SourceFileChecksumEntry)
+LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineEntry)
+LLVM_YAML_IS_SEQUENCE_VECTOR(SourceColumnEntry)
+LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineBlock)
+LLVM_YAML_IS_SEQUENCE_VECTOR(SourceLineInfo)
+LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeSite)
+LLVM_YAML_IS_SEQUENCE_VECTOR(InlineeInfo)
+LLVM_YAML_IS_SEQUENCE_VECTOR(CrossModuleExport)
+LLVM_YAML_IS_SEQUENCE_VECTOR(YAMLCrossModuleImport)
+LLVM_YAML_IS_SEQUENCE_VECTOR(YAMLFrameData)
+
+LLVM_YAML_DECLARE_SCALAR_TRAITS(HexFormattedString, false)
+LLVM_YAML_DECLARE_ENUM_TRAITS(DebugSubsectionKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(FileChecksumKind)
+LLVM_YAML_DECLARE_BITSET_TRAITS(LineFlags)
+
+LLVM_YAML_DECLARE_MAPPING_TRAITS(CrossModuleExport)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(YAMLFrameData)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(YAMLCrossModuleImport)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(CrossModuleImportItem)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceLineEntry)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceColumnEntry)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceFileChecksumEntry)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(SourceLineBlock)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(InlineeSite)
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
+
+struct YAMLSubsectionBase {
+ explicit YAMLSubsectionBase(DebugSubsectionKind Kind) : Kind(Kind) {}
+ virtual ~YAMLSubsectionBase() = default;
+
+ virtual void map(IO &IO) = 0;
+ virtual std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const = 0;
+
+ DebugSubsectionKind Kind;
+};
+
+} // end namespace detail
+} // end namespace CodeViewYAML
+} // end namespace llvm
+
+namespace {
+
+struct YAMLChecksumsSubsection : public YAMLSubsectionBase {
+ YAMLChecksumsSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::FileChecksums) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLChecksumsSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &FC);
+
+ std::vector<SourceFileChecksumEntry> Checksums;
+};
+
+struct YAMLLinesSubsection : public YAMLSubsectionBase {
+ YAMLLinesSubsection() : YAMLSubsectionBase(DebugSubsectionKind::Lines) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLLinesSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugLinesSubsectionRef &Lines);
+
+ SourceLineInfo Lines;
+};
+
+struct YAMLInlineeLinesSubsection : public YAMLSubsectionBase {
+ YAMLInlineeLinesSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::InlineeLines) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLInlineeLinesSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugInlineeLinesSubsectionRef &Lines);
+
+ InlineeInfo InlineeLines;
+};
+
+struct YAMLCrossModuleExportsSubsection : public YAMLSubsectionBase {
+ YAMLCrossModuleExportsSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::CrossScopeExports) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLCrossModuleExportsSubsection>>
+ fromCodeViewSubsection(const DebugCrossModuleExportsSubsectionRef &Exports);
+
+ std::vector<CrossModuleExport> Exports;
+};
+
+struct YAMLCrossModuleImportsSubsection : public YAMLSubsectionBase {
+ YAMLCrossModuleImportsSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::CrossScopeImports) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLCrossModuleImportsSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugCrossModuleImportsSubsectionRef &Imports);
+
+ std::vector<YAMLCrossModuleImport> Imports;
+};
+
+struct YAMLSymbolsSubsection : public YAMLSubsectionBase {
+ YAMLSymbolsSubsection() : YAMLSubsectionBase(DebugSubsectionKind::Symbols) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLSymbolsSubsection>>
+ fromCodeViewSubsection(const DebugSymbolsSubsectionRef &Symbols);
+
+ std::vector<CodeViewYAML::SymbolRecord> Symbols;
+};
+
+struct YAMLStringTableSubsection : public YAMLSubsectionBase {
+ YAMLStringTableSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::StringTable) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLStringTableSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings);
+
+ std::vector<StringRef> Strings;
+};
+
+struct YAMLFrameDataSubsection : public YAMLSubsectionBase {
+ YAMLFrameDataSubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::FrameData) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLFrameDataSubsection>>
+ fromCodeViewSubsection(const DebugStringTableSubsectionRef &Strings,
+ const DebugFrameDataSubsectionRef &Frames);
+
+ std::vector<YAMLFrameData> Frames;
+};
+
+struct YAMLCoffSymbolRVASubsection : public YAMLSubsectionBase {
+ YAMLCoffSymbolRVASubsection()
+ : YAMLSubsectionBase(DebugSubsectionKind::CoffSymbolRVA) {}
+
+ void map(IO &IO) override;
+ std::shared_ptr<DebugSubsection>
+ toCodeViewSubsection(BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const override;
+ static Expected<std::shared_ptr<YAMLCoffSymbolRVASubsection>>
+ fromCodeViewSubsection(const DebugSymbolRVASubsectionRef &RVAs);
+
+ std::vector<uint32_t> RVAs;
+};
+
+} // end anonymous namespace
+
+void ScalarBitSetTraits<LineFlags>::bitset(IO &io, LineFlags &Flags) {
+ io.bitSetCase(Flags, "HasColumnInfo", LF_HaveColumns);
+ io.enumFallback<Hex16>(Flags);
+}
+
+void ScalarEnumerationTraits<FileChecksumKind>::enumeration(
+ IO &io, FileChecksumKind &Kind) {
+ io.enumCase(Kind, "None", FileChecksumKind::None);
+ io.enumCase(Kind, "MD5", FileChecksumKind::MD5);
+ io.enumCase(Kind, "SHA1", FileChecksumKind::SHA1);
+ io.enumCase(Kind, "SHA256", FileChecksumKind::SHA256);
+}
+
+void ScalarTraits<HexFormattedString>::output(const HexFormattedString &Value,
+ void *ctx, raw_ostream &Out) {
+ StringRef Bytes(reinterpret_cast<const char *>(Value.Bytes.data()),
+ Value.Bytes.size());
+ Out << toHex(Bytes);
+}
+
+StringRef ScalarTraits<HexFormattedString>::input(StringRef Scalar, void *ctxt,
+ HexFormattedString &Value) {
+ std::string H = fromHex(Scalar);
+ Value.Bytes.assign(H.begin(), H.end());
+ return StringRef();
+}
+
+void MappingTraits<SourceLineEntry>::mapping(IO &IO, SourceLineEntry &Obj) {
+ IO.mapRequired("Offset", Obj.Offset);
+ IO.mapRequired("LineStart", Obj.LineStart);
+ IO.mapRequired("IsStatement", Obj.IsStatement);
+ IO.mapRequired("EndDelta", Obj.EndDelta);
+}
+
+void MappingTraits<SourceColumnEntry>::mapping(IO &IO, SourceColumnEntry &Obj) {
+ IO.mapRequired("StartColumn", Obj.StartColumn);
+ IO.mapRequired("EndColumn", Obj.EndColumn);
+}
+
+void MappingTraits<SourceLineBlock>::mapping(IO &IO, SourceLineBlock &Obj) {
+ IO.mapRequired("FileName", Obj.FileName);
+ IO.mapRequired("Lines", Obj.Lines);
+ IO.mapRequired("Columns", Obj.Columns);
+}
+
+void MappingTraits<CrossModuleExport>::mapping(IO &IO, CrossModuleExport &Obj) {
+ IO.mapRequired("LocalId", Obj.Local);
+ IO.mapRequired("GlobalId", Obj.Global);
+}
+
+void MappingTraits<YAMLCrossModuleImport>::mapping(IO &IO,
+ YAMLCrossModuleImport &Obj) {
+ IO.mapRequired("Module", Obj.ModuleName);
+ IO.mapRequired("Imports", Obj.ImportIds);
+}
+
+void MappingTraits<SourceFileChecksumEntry>::mapping(
+ IO &IO, SourceFileChecksumEntry &Obj) {
+ IO.mapRequired("FileName", Obj.FileName);
+ IO.mapRequired("Kind", Obj.Kind);
+ IO.mapRequired("Checksum", Obj.ChecksumBytes);
+}
+
+void MappingTraits<InlineeSite>::mapping(IO &IO, InlineeSite &Obj) {
+ IO.mapRequired("FileName", Obj.FileName);
+ IO.mapRequired("LineNum", Obj.SourceLineNum);
+ IO.mapRequired("Inlinee", Obj.Inlinee);
+ IO.mapOptional("ExtraFiles", Obj.ExtraFiles);
+}
+
+void MappingTraits<YAMLFrameData>::mapping(IO &IO, YAMLFrameData &Obj) {
+ IO.mapRequired("CodeSize", Obj.CodeSize);
+ IO.mapRequired("FrameFunc", Obj.FrameFunc);
+ IO.mapRequired("LocalSize", Obj.LocalSize);
+ IO.mapOptional("MaxStackSize", Obj.MaxStackSize);
+ IO.mapOptional("ParamsSize", Obj.ParamsSize);
+ IO.mapOptional("PrologSize", Obj.PrologSize);
+ IO.mapOptional("RvaStart", Obj.RvaStart);
+ IO.mapOptional("SavedRegsSize", Obj.SavedRegsSize);
+}
+
+void YAMLChecksumsSubsection::map(IO &IO) {
+ IO.mapTag("!FileChecksums", true);
+ IO.mapRequired("Checksums", Checksums);
+}
+
+void YAMLLinesSubsection::map(IO &IO) {
+ IO.mapTag("!Lines", true);
+ IO.mapRequired("CodeSize", Lines.CodeSize);
+
+ IO.mapRequired("Flags", Lines.Flags);
+ IO.mapRequired("RelocOffset", Lines.RelocOffset);
+ IO.mapRequired("RelocSegment", Lines.RelocSegment);
+ IO.mapRequired("Blocks", Lines.Blocks);
+}
+
+void YAMLInlineeLinesSubsection::map(IO &IO) {
+ IO.mapTag("!InlineeLines", true);
+ IO.mapRequired("HasExtraFiles", InlineeLines.HasExtraFiles);
+ IO.mapRequired("Sites", InlineeLines.Sites);
+}
+
+void YAMLCrossModuleExportsSubsection::map(IO &IO) {
+ IO.mapTag("!CrossModuleExports", true);
+ IO.mapOptional("Exports", Exports);
+}
+
+void YAMLCrossModuleImportsSubsection::map(IO &IO) {
+ IO.mapTag("!CrossModuleImports", true);
+ IO.mapOptional("Imports", Imports);
+}
+
+void YAMLSymbolsSubsection::map(IO &IO) {
+ IO.mapTag("!Symbols", true);
+ IO.mapRequired("Records", Symbols);
+}
+
+void YAMLStringTableSubsection::map(IO &IO) {
+ IO.mapTag("!StringTable", true);
+ IO.mapRequired("Strings", Strings);
+}
+
+void YAMLFrameDataSubsection::map(IO &IO) {
+ IO.mapTag("!FrameData", true);
+ IO.mapRequired("Frames", Frames);
+}
+
+void YAMLCoffSymbolRVASubsection::map(IO &IO) {
+ IO.mapTag("!COFFSymbolRVAs", true);
+ IO.mapRequired("RVAs", RVAs);
+}
+
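+// On input, the YAML tag (e.g. "!Lines") selects which concrete subsection
+// type to instantiate; on output, the existing subsection maps itself.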
+void MappingTraits<YAMLDebugSubsection>::mapping(
+ IO &IO, YAMLDebugSubsection &Subsection) {
+ if (!IO.outputting()) {
+ if (IO.mapTag("!FileChecksums")) {
+ auto SS = std::make_shared<YAMLChecksumsSubsection>();
+ Subsection.Subsection = SS;
+ } else if (IO.mapTag("!Lines")) {
+ Subsection.Subsection = std::make_shared<YAMLLinesSubsection>();
+ } else if (IO.mapTag("!InlineeLines")) {
+ Subsection.Subsection = std::make_shared<YAMLInlineeLinesSubsection>();
+ } else if (IO.mapTag("!CrossModuleExports")) {
+ Subsection.Subsection =
+ std::make_shared<YAMLCrossModuleExportsSubsection>();
+ } else if (IO.mapTag("!CrossModuleImports")) {
+ Subsection.Subsection =
+ std::make_shared<YAMLCrossModuleImportsSubsection>();
+ } else if (IO.mapTag("!Symbols")) {
+ Subsection.Subsection = std::make_shared<YAMLSymbolsSubsection>();
+ } else if (IO.mapTag("!StringTable")) {
+ Subsection.Subsection = std::make_shared<YAMLStringTableSubsection>();
+ } else if (IO.mapTag("!FrameData")) {
+ Subsection.Subsection = std::make_shared<YAMLFrameDataSubsection>();
+ } else if (IO.mapTag("!COFFSymbolRVAs")) {
+ Subsection.Subsection = std::make_shared<YAMLCoffSymbolRVASubsection>();
+ } else {
+ llvm_unreachable("Unexpected subsection tag!");
+ }
+ }
+ Subsection.Subsection->map(IO);
+}
+
+std::shared_ptr<DebugSubsection> YAMLChecksumsSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ assert(SC.hasStrings());
+ auto Result = std::make_shared<DebugChecksumsSubsection>(*SC.strings());
+ for (const auto &CS : Checksums) {
+ Result->addChecksum(CS.FileName, CS.Kind, CS.ChecksumBytes.Bytes);
+ }
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection> YAMLLinesSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ assert(SC.hasStrings() && SC.hasChecksums());
+ auto Result =
+ std::make_shared<DebugLinesSubsection>(*SC.checksums(), *SC.strings());
+ Result->setCodeSize(Lines.CodeSize);
+ Result->setRelocationAddress(Lines.RelocSegment, Lines.RelocOffset);
+ Result->setFlags(Lines.Flags);
+ for (const auto &LC : Lines.Blocks) {
+ Result->createBlock(LC.FileName);
+ if (Result->hasColumnInfo()) {
+ for (const auto &Item : zip(LC.Lines, LC.Columns)) {
+ auto &L = std::get<0>(Item);
+ auto &C = std::get<1>(Item);
+ uint32_t LE = L.LineStart + L.EndDelta;
+ Result->addLineAndColumnInfo(L.Offset,
+ LineInfo(L.LineStart, LE, L.IsStatement),
+ C.StartColumn, C.EndColumn);
+ }
+ } else {
+ for (const auto &L : LC.Lines) {
+ uint32_t LE = L.LineStart + L.EndDelta;
+ Result->addLineInfo(L.Offset, LineInfo(L.LineStart, LE, L.IsStatement));
+ }
+ }
+ }
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection>
+YAMLInlineeLinesSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ assert(SC.hasChecksums());
+ auto Result = std::make_shared<DebugInlineeLinesSubsection>(
+ *SC.checksums(), InlineeLines.HasExtraFiles);
+
+ for (const auto &Site : InlineeLines.Sites) {
+ Result->addInlineSite(TypeIndex(Site.Inlinee), Site.FileName,
+ Site.SourceLineNum);
+ if (!InlineeLines.HasExtraFiles)
+ continue;
+
+ for (auto EF : Site.ExtraFiles) {
+ Result->addExtraFile(EF);
+ }
+ }
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection>
+YAMLCrossModuleExportsSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ auto Result = std::make_shared<DebugCrossModuleExportsSubsection>();
+ for (const auto &M : Exports)
+ Result->addMapping(M.Local, M.Global);
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection>
+YAMLCrossModuleImportsSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ assert(SC.hasStrings());
+
+ auto Result =
+ std::make_shared<DebugCrossModuleImportsSubsection>(*SC.strings());
+ for (const auto &M : Imports) {
+ for (const auto Id : M.ImportIds)
+ Result->addImport(M.ModuleName, Id);
+ }
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection> YAMLSymbolsSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ auto Result = std::make_shared<DebugSymbolsSubsection>();
+ for (const auto &Sym : Symbols)
+ Result->addSymbol(
+ Sym.toCodeViewSymbol(Allocator, CodeViewContainer::ObjectFile));
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection>
+YAMLStringTableSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ auto Result = std::make_shared<DebugStringTableSubsection>();
+ for (const auto &Str : this->Strings)
+ Result->insert(Str);
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection> YAMLFrameDataSubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ assert(SC.hasStrings());
+
+ auto Result = std::make_shared<DebugFrameDataSubsection>();
+ for (const auto &YF : Frames) {
+ codeview::FrameData F;
+ F.CodeSize = YF.CodeSize;
+ F.Flags = YF.Flags;
+ F.LocalSize = YF.LocalSize;
+ F.MaxStackSize = YF.MaxStackSize;
+ F.ParamsSize = YF.ParamsSize;
+ F.PrologSize = YF.PrologSize;
+ F.RvaStart = YF.RvaStart;
+ F.SavedRegsSize = YF.SavedRegsSize;
+ F.FrameFunc = SC.strings()->insert(YF.FrameFunc);
+ Result->addFrameData(F);
+ }
+ return Result;
+}
+
+std::shared_ptr<DebugSubsection>
+YAMLCoffSymbolRVASubsection::toCodeViewSubsection(
+ BumpPtrAllocator &Allocator,
+ const codeview::StringsAndChecksums &SC) const {
+ auto Result = std::make_shared<DebugSymbolRVASubsection>();
+ for (const auto &RVA : RVAs)
+ Result->addRVA(RVA);
+ return Result;
+}
+
+static Expected<SourceFileChecksumEntry>
+convertOneChecksum(const DebugStringTableSubsectionRef &Strings,
+ const FileChecksumEntry &CS) {
+ auto ExpectedString = Strings.getString(CS.FileNameOffset);
+ if (!ExpectedString)
+ return ExpectedString.takeError();
+
+ SourceFileChecksumEntry Result;
+ Result.ChecksumBytes.Bytes = CS.Checksum;
+ Result.Kind = CS.Kind;
+ Result.FileName = *ExpectedString;
+ return Result;
+}
+
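+// Map a file ID to its name: the ID indexes the checksum array, whose entry
+// holds the file name's offset into the string table.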
+static Expected<StringRef>
+getFileName(const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums, uint32_t FileID) {
+ auto Iter = Checksums.getArray().at(FileID);
+ if (Iter == Checksums.getArray().end())
+ return make_error<CodeViewError>(cv_error_code::no_records);
+ uint32_t Offset = Iter->FileNameOffset;
+ return Strings.getString(Offset);
+}
+
+Expected<std::shared_ptr<YAMLChecksumsSubsection>>
+YAMLChecksumsSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &FC) {
+ auto Result = std::make_shared<YAMLChecksumsSubsection>();
+
+ for (const auto &CS : FC) {
+ auto ConvertedCS = convertOneChecksum(Strings, CS);
+ if (!ConvertedCS)
+ return ConvertedCS.takeError();
+ Result->Checksums.push_back(*ConvertedCS);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLLinesSubsection>>
+YAMLLinesSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugLinesSubsectionRef &Lines) {
+ auto Result = std::make_shared<YAMLLinesSubsection>();
+ Result->Lines.CodeSize = Lines.header()->CodeSize;
+ Result->Lines.RelocOffset = Lines.header()->RelocOffset;
+ Result->Lines.RelocSegment = Lines.header()->RelocSegment;
+ Result->Lines.Flags = static_cast<LineFlags>(uint16_t(Lines.header()->Flags));
+ for (const auto &L : Lines) {
+ SourceLineBlock Block;
+ auto EF = getFileName(Strings, Checksums, L.NameIndex);
+ if (!EF)
+ return EF.takeError();
+ Block.FileName = *EF;
+ if (Lines.hasColumnInfo()) {
+ for (const auto &C : L.Columns) {
+ SourceColumnEntry SCE;
+ SCE.EndColumn = C.EndColumn;
+ SCE.StartColumn = C.StartColumn;
+ Block.Columns.push_back(SCE);
+ }
+ }
+ for (const auto &LN : L.LineNumbers) {
+ SourceLineEntry SLE;
+ LineInfo LI(LN.Flags);
+ SLE.Offset = LN.Offset;
+ SLE.LineStart = LI.getStartLine();
+ SLE.EndDelta = LI.getLineDelta();
+ SLE.IsStatement = LI.isStatement();
+ Block.Lines.push_back(SLE);
+ }
+ Result->Lines.Blocks.push_back(Block);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLInlineeLinesSubsection>>
+YAMLInlineeLinesSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugChecksumsSubsectionRef &Checksums,
+ const DebugInlineeLinesSubsectionRef &Lines) {
+ auto Result = std::make_shared<YAMLInlineeLinesSubsection>();
+
+ Result->InlineeLines.HasExtraFiles = Lines.hasExtraFiles();
+ for (const auto &IL : Lines) {
+ InlineeSite Site;
+ auto ExpF = getFileName(Strings, Checksums, IL.Header->FileID);
+ if (!ExpF)
+ return ExpF.takeError();
+ Site.FileName = *ExpF;
+ Site.Inlinee = IL.Header->Inlinee.getIndex();
+ Site.SourceLineNum = IL.Header->SourceLineNum;
+ if (Lines.hasExtraFiles()) {
+ for (const auto EF : IL.ExtraFiles) {
+ auto ExpF2 = getFileName(Strings, Checksums, EF);
+ if (!ExpF2)
+ return ExpF2.takeError();
+ Site.ExtraFiles.push_back(*ExpF2);
+ }
+ }
+ Result->InlineeLines.Sites.push_back(Site);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLCrossModuleExportsSubsection>>
+YAMLCrossModuleExportsSubsection::fromCodeViewSubsection(
+ const DebugCrossModuleExportsSubsectionRef &Exports) {
+ auto Result = std::make_shared<YAMLCrossModuleExportsSubsection>();
+ Result->Exports.assign(Exports.begin(), Exports.end());
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLCrossModuleImportsSubsection>>
+YAMLCrossModuleImportsSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugCrossModuleImportsSubsectionRef &Imports) {
+ auto Result = std::make_shared<YAMLCrossModuleImportsSubsection>();
+ for (const auto &CMI : Imports) {
+ YAMLCrossModuleImport YCMI;
+ auto ExpectedStr = Strings.getString(CMI.Header->ModuleNameOffset);
+ if (!ExpectedStr)
+ return ExpectedStr.takeError();
+ YCMI.ModuleName = *ExpectedStr;
+ YCMI.ImportIds.assign(CMI.Imports.begin(), CMI.Imports.end());
+ Result->Imports.push_back(YCMI);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLSymbolsSubsection>>
+YAMLSymbolsSubsection::fromCodeViewSubsection(
+ const DebugSymbolsSubsectionRef &Symbols) {
+ auto Result = std::make_shared<YAMLSymbolsSubsection>();
+ for (const auto &Sym : Symbols) {
+ auto S = CodeViewYAML::SymbolRecord::fromCodeViewSymbol(Sym);
+ if (!S)
+ return joinErrors(make_error<CodeViewError>(
+ cv_error_code::corrupt_record,
+ "Invalid CodeView Symbol Record in SymbolRecord "
+ "subsection of .debug$S while converting to YAML!"),
+ S.takeError());
+
+ Result->Symbols.push_back(*S);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLStringTableSubsection>>
+YAMLStringTableSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings) {
+ auto Result = std::make_shared<YAMLStringTableSubsection>();
+ BinaryStreamReader Reader(Strings.getBuffer());
+ StringRef S;
+  // The first item is a single null string; skip it.
+ if (auto EC = Reader.readCString(S))
+ return std::move(EC);
+ assert(S.empty());
+ while (Reader.bytesRemaining() > 0) {
+ if (auto EC = Reader.readCString(S))
+ return std::move(EC);
+ Result->Strings.push_back(S);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLFrameDataSubsection>>
+YAMLFrameDataSubsection::fromCodeViewSubsection(
+ const DebugStringTableSubsectionRef &Strings,
+ const DebugFrameDataSubsectionRef &Frames) {
+ auto Result = std::make_shared<YAMLFrameDataSubsection>();
+ for (const auto &F : Frames) {
+ YAMLFrameData YF;
+ YF.CodeSize = F.CodeSize;
+ YF.Flags = F.Flags;
+ YF.LocalSize = F.LocalSize;
+ YF.MaxStackSize = F.MaxStackSize;
+ YF.ParamsSize = F.ParamsSize;
+ YF.PrologSize = F.PrologSize;
+ YF.RvaStart = F.RvaStart;
+ YF.SavedRegsSize = F.SavedRegsSize;
+
+ auto ES = Strings.getString(F.FrameFunc);
+ if (!ES)
+ return joinErrors(
+ make_error<CodeViewError>(
+ cv_error_code::no_records,
+ "Could not find string for string id while mapping FrameData!"),
+ ES.takeError());
+ YF.FrameFunc = *ES;
+ Result->Frames.push_back(YF);
+ }
+ return Result;
+}
+
+Expected<std::shared_ptr<YAMLCoffSymbolRVASubsection>>
+YAMLCoffSymbolRVASubsection::fromCodeViewSubsection(
+ const DebugSymbolRVASubsectionRef &Section) {
+ auto Result = std::make_shared<YAMLCoffSymbolRVASubsection>();
+ for (const auto &RVA : Section) {
+ Result->RVAs.push_back(RVA);
+ }
+ return Result;
+}
+
+Expected<std::vector<std::shared_ptr<DebugSubsection>>>
+llvm::CodeViewYAML::toCodeViewSubsectionList(
+ BumpPtrAllocator &Allocator, ArrayRef<YAMLDebugSubsection> Subsections,
+ const codeview::StringsAndChecksums &SC) {
+ std::vector<std::shared_ptr<DebugSubsection>> Result;
+ if (Subsections.empty())
+ return std::move(Result);
+
+ for (const auto &SS : Subsections) {
+ std::shared_ptr<DebugSubsection> CVS;
+ CVS = SS.Subsection->toCodeViewSubsection(Allocator, SC);
+ assert(CVS != nullptr);
+ Result.push_back(std::move(CVS));
+ }
+ return std::move(Result);
+}
+
+namespace {
+
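+// Converts each CodeView subsection it visits into the corresponding YAML
+// representation, leaving the result in `Subsection`.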
+struct SubsectionConversionVisitor : public DebugSubsectionVisitor {
+ SubsectionConversionVisitor() = default;
+
+ Error visitUnknown(DebugUnknownSubsectionRef &Unknown) override;
+ Error visitLines(DebugLinesSubsectionRef &Lines,
+ const StringsAndChecksumsRef &State) override;
+ Error visitFileChecksums(DebugChecksumsSubsectionRef &Checksums,
+ const StringsAndChecksumsRef &State) override;
+ Error visitInlineeLines(DebugInlineeLinesSubsectionRef &Inlinees,
+ const StringsAndChecksumsRef &State) override;
+ Error visitCrossModuleExports(DebugCrossModuleExportsSubsectionRef &Checksums,
+ const StringsAndChecksumsRef &State) override;
+ Error visitCrossModuleImports(DebugCrossModuleImportsSubsectionRef &Inlinees,
+ const StringsAndChecksumsRef &State) override;
+ Error visitStringTable(DebugStringTableSubsectionRef &ST,
+ const StringsAndChecksumsRef &State) override;
+ Error visitSymbols(DebugSymbolsSubsectionRef &Symbols,
+ const StringsAndChecksumsRef &State) override;
+ Error visitFrameData(DebugFrameDataSubsectionRef &Symbols,
+ const StringsAndChecksumsRef &State) override;
+ Error visitCOFFSymbolRVAs(DebugSymbolRVASubsectionRef &Symbols,
+ const StringsAndChecksumsRef &State) override;
+
+ YAMLDebugSubsection Subsection;
+};
+
+} // end anonymous namespace
+
+Error SubsectionConversionVisitor::visitUnknown(
+ DebugUnknownSubsectionRef &Unknown) {
+ return make_error<CodeViewError>(cv_error_code::operation_unsupported);
+}
+
+Error SubsectionConversionVisitor::visitLines(
+ DebugLinesSubsectionRef &Lines, const StringsAndChecksumsRef &State) {
+ auto Result = YAMLLinesSubsection::fromCodeViewSubsection(
+ State.strings(), State.checksums(), Lines);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitFileChecksums(
+ DebugChecksumsSubsectionRef &Checksums,
+ const StringsAndChecksumsRef &State) {
+ auto Result = YAMLChecksumsSubsection::fromCodeViewSubsection(State.strings(),
+ Checksums);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitInlineeLines(
+ DebugInlineeLinesSubsectionRef &Inlinees,
+ const StringsAndChecksumsRef &State) {
+ auto Result = YAMLInlineeLinesSubsection::fromCodeViewSubsection(
+ State.strings(), State.checksums(), Inlinees);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitCrossModuleExports(
+ DebugCrossModuleExportsSubsectionRef &Exports,
+ const StringsAndChecksumsRef &State) {
+ auto Result =
+ YAMLCrossModuleExportsSubsection::fromCodeViewSubsection(Exports);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitCrossModuleImports(
+ DebugCrossModuleImportsSubsectionRef &Imports,
+ const StringsAndChecksumsRef &State) {
+ auto Result = YAMLCrossModuleImportsSubsection::fromCodeViewSubsection(
+ State.strings(), Imports);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitStringTable(
+ DebugStringTableSubsectionRef &Strings,
+ const StringsAndChecksumsRef &State) {
+ auto Result = YAMLStringTableSubsection::fromCodeViewSubsection(Strings);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitSymbols(
+ DebugSymbolsSubsectionRef &Symbols, const StringsAndChecksumsRef &State) {
+ auto Result = YAMLSymbolsSubsection::fromCodeViewSubsection(Symbols);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitFrameData(
+ DebugFrameDataSubsectionRef &Frames, const StringsAndChecksumsRef &State) {
+ auto Result =
+ YAMLFrameDataSubsection::fromCodeViewSubsection(State.strings(), Frames);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Error SubsectionConversionVisitor::visitCOFFSymbolRVAs(
+ DebugSymbolRVASubsectionRef &RVAs, const StringsAndChecksumsRef &State) {
+ auto Result = YAMLCoffSymbolRVASubsection::fromCodeViewSubsection(RVAs);
+ if (!Result)
+ return Result.takeError();
+ Subsection.Subsection = *Result;
+ return Error::success();
+}
+
+Expected<YAMLDebugSubsection>
+YAMLDebugSubsection::fromCodeViewSubection(const StringsAndChecksumsRef &SC,
+ const DebugSubsectionRecord &SS) {
+ SubsectionConversionVisitor V;
+ if (auto EC = visitDebugSubsection(SS, V, SC))
+ return std::move(EC);
+
+ return V.Subsection;
+}
+
+std::vector<YAMLDebugSubsection>
+llvm::CodeViewYAML::fromDebugS(ArrayRef<uint8_t> Data,
+ const StringsAndChecksumsRef &SC) {
+ BinaryStreamReader Reader(Data, support::little);
+ uint32_t Magic;
+
+ ExitOnError Err("Invalid .debug$S section!");
+ Err(Reader.readInteger(Magic));
+ assert(Magic == COFF::DEBUG_SECTION_MAGIC && "Invalid .debug$S section!");
+
+ DebugSubsectionArray Subsections;
+ Err(Reader.readArray(Subsections, Reader.bytesRemaining()));
+
+ std::vector<YAMLDebugSubsection> Result;
+
+ for (const auto &SS : Subsections) {
+ auto YamlSS = Err(YAMLDebugSubsection::fromCodeViewSubection(SC, SS));
+ Result.push_back(YamlSS);
+ }
+ return Result;
+}
+
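+// Typical usage (assumed yaml2obj-style flow): run this once over all parsed
+// sections so that later toCodeViewSubsection() calls can resolve string
+// table and checksum references.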
+void llvm::CodeViewYAML::initializeStringsAndChecksums(
+ ArrayRef<YAMLDebugSubsection> Sections, codeview::StringsAndChecksums &SC) {
+ // String Table and Checksums subsections don't use the allocator.
+ BumpPtrAllocator Allocator;
+
+  // It's possible for checksums and strings to appear in different .debug$S
+  // sections, so this has to be a stateful function that can build up the
+  // strings and checksums fields over multiple iterations.
+
+  // File checksums require the string table, but may come before it, so we
+  // have to scan for strings first, then scan for checksums again from the
+  // beginning.
+ if (!SC.hasStrings()) {
+ for (const auto &SS : Sections) {
+ if (SS.Subsection->Kind != DebugSubsectionKind::StringTable)
+ continue;
+
+ auto Result = SS.Subsection->toCodeViewSubsection(Allocator, SC);
+ SC.setStrings(
+ std::static_pointer_cast<DebugStringTableSubsection>(Result));
+ break;
+ }
+ }
+
+ if (SC.hasStrings() && !SC.hasChecksums()) {
+ for (const auto &SS : Sections) {
+ if (SS.Subsection->Kind != DebugSubsectionKind::FileChecksums)
+ continue;
+
+ auto Result = SS.Subsection->toCodeViewSubsection(Allocator, SC);
+ SC.setChecksums(
+ std::static_pointer_cast<DebugChecksumsSubsection>(Result));
+ break;
+ }
+ }
+}
diff --git a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
new file mode 100644
index 0000000..dbe4e2a
--- /dev/null
+++ b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLSymbols.cpp
@@ -0,0 +1,590 @@
+//===- CodeViewYAMLSymbols.cpp - CodeView YAMLIO Symbol implementation ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of CodeView
+// Debug Info.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/CodeViewYAMLSymbols.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/EnumTables.h"
+#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
+#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::CodeViewYAML;
+using namespace llvm::CodeViewYAML::detail;
+using namespace llvm::yaml;
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex)
+
+// We only need to declare these, the definitions are in CodeViewYAMLTypes.cpp
+LLVM_YAML_DECLARE_SCALAR_TRAITS(APSInt, false)
+LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeIndex, false)
+
+LLVM_YAML_DECLARE_ENUM_TRAITS(SymbolKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(FrameCookieKind)
+
+LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym2Flags)
+LLVM_YAML_DECLARE_BITSET_TRAITS(CompileSym3Flags)
+LLVM_YAML_DECLARE_BITSET_TRAITS(ExportFlags)
+LLVM_YAML_DECLARE_BITSET_TRAITS(PublicSymFlags)
+LLVM_YAML_DECLARE_BITSET_TRAITS(LocalSymFlags)
+LLVM_YAML_DECLARE_BITSET_TRAITS(ProcSymFlags)
+LLVM_YAML_DECLARE_BITSET_TRAITS(FrameProcedureOptions)
+LLVM_YAML_DECLARE_ENUM_TRAITS(CPUType)
+LLVM_YAML_DECLARE_ENUM_TRAITS(RegisterId)
+LLVM_YAML_DECLARE_ENUM_TRAITS(TrampolineType)
+LLVM_YAML_DECLARE_ENUM_TRAITS(ThunkOrdinal)
+
+LLVM_YAML_STRONG_TYPEDEF(StringRef, TypeName)
+
+LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeName, true)
+
+StringRef ScalarTraits<TypeName>::input(StringRef S, void *V, TypeName &T) {
+ return ScalarTraits<StringRef>::input(S, V, T.value);
+}
+
+void ScalarTraits<TypeName>::output(const TypeName &T, void *V,
+ raw_ostream &R) {
+ ScalarTraits<StringRef>::output(T.value, V, R);
+}
+
+void ScalarEnumerationTraits<SymbolKind>::enumeration(IO &io,
+ SymbolKind &Value) {
+ auto SymbolNames = getSymbolTypeNames();
+ for (const auto &E : SymbolNames)
+ io.enumCase(Value, E.Name.str().c_str(), E.Value);
+}
+
+void ScalarBitSetTraits<CompileSym2Flags>::bitset(IO &io,
+ CompileSym2Flags &Flags) {
+ auto FlagNames = getCompileSym2FlagNames();
+ for (const auto &E : FlagNames) {
+ io.bitSetCase(Flags, E.Name.str().c_str(),
+ static_cast<CompileSym2Flags>(E.Value));
+ }
+}
+
+void ScalarBitSetTraits<CompileSym3Flags>::bitset(IO &io,
+ CompileSym3Flags &Flags) {
+ auto FlagNames = getCompileSym3FlagNames();
+ for (const auto &E : FlagNames) {
+ io.bitSetCase(Flags, E.Name.str().c_str(),
+ static_cast<CompileSym3Flags>(E.Value));
+ }
+}
+
+void ScalarBitSetTraits<ExportFlags>::bitset(IO &io, ExportFlags &Flags) {
+ auto FlagNames = getExportSymFlagNames();
+ for (const auto &E : FlagNames) {
+ io.bitSetCase(Flags, E.Name.str().c_str(),
+ static_cast<ExportFlags>(E.Value));
+ }
+}
+
+void ScalarBitSetTraits<PublicSymFlags>::bitset(IO &io, PublicSymFlags &Flags) {
+ auto FlagNames = getProcSymFlagNames();
+ for (const auto &E : FlagNames) {
+ io.bitSetCase(Flags, E.Name.str().c_str(),
+ static_cast<PublicSymFlags>(E.Value));
+ }
+}
+
+void ScalarBitSetTraits<LocalSymFlags>::bitset(IO &io, LocalSymFlags &Flags) {
+ auto FlagNames = getLocalFlagNames();
+ for (const auto &E : FlagNames) {
+ io.bitSetCase(Flags, E.Name.str().c_str(),
+ static_cast<LocalSymFlags>(E.Value));
+ }
+}
+
+void ScalarBitSetTraits<ProcSymFlags>::bitset(IO &io, ProcSymFlags &Flags) {
+ auto FlagNames = getProcSymFlagNames();
+ for (const auto &E : FlagNames) {
+ io.bitSetCase(Flags, E.Name.str().c_str(),
+ static_cast<ProcSymFlags>(E.Value));
+ }
+}
+
+void ScalarBitSetTraits<FrameProcedureOptions>::bitset(
+ IO &io, FrameProcedureOptions &Flags) {
+ auto FlagNames = getFrameProcSymFlagNames();
+ for (const auto &E : FlagNames) {
+ io.bitSetCase(Flags, E.Name.str().c_str(),
+ static_cast<FrameProcedureOptions>(E.Value));
+ }
+}
+
+void ScalarEnumerationTraits<CPUType>::enumeration(IO &io, CPUType &Cpu) {
+ auto CpuNames = getCPUTypeNames();
+ for (const auto &E : CpuNames) {
+ io.enumCase(Cpu, E.Name.str().c_str(), static_cast<CPUType>(E.Value));
+ }
+}
+
+void ScalarEnumerationTraits<RegisterId>::enumeration(IO &io, RegisterId &Reg) {
+ auto RegNames = getRegisterNames();
+ for (const auto &E : RegNames) {
+ io.enumCase(Reg, E.Name.str().c_str(), static_cast<RegisterId>(E.Value));
+ }
+ io.enumFallback<Hex16>(Reg);
+}
+
+void ScalarEnumerationTraits<TrampolineType>::enumeration(
+ IO &io, TrampolineType &Tramp) {
+ auto TrampNames = getTrampolineNames();
+ for (const auto &E : TrampNames) {
+ io.enumCase(Tramp, E.Name.str().c_str(),
+ static_cast<TrampolineType>(E.Value));
+ }
+}
+
+void ScalarEnumerationTraits<ThunkOrdinal>::enumeration(IO &io,
+ ThunkOrdinal &Ord) {
+ auto ThunkNames = getThunkOrdinalNames();
+ for (const auto &E : ThunkNames) {
+ io.enumCase(Ord, E.Name.str().c_str(), static_cast<ThunkOrdinal>(E.Value));
+ }
+}
+
+void ScalarEnumerationTraits<FrameCookieKind>::enumeration(
+ IO &io, FrameCookieKind &FC) {
+ auto ThunkNames = getFrameCookieKindNames();
+ for (const auto &E : ThunkNames) {
+ io.enumCase(FC, E.Name.str().c_str(),
+ static_cast<FrameCookieKind>(E.Value));
+ }
+}
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
+
+struct SymbolRecordBase {
+ codeview::SymbolKind Kind;
+
+ explicit SymbolRecordBase(codeview::SymbolKind K) : Kind(K) {}
+ virtual ~SymbolRecordBase() = default;
+
+ virtual void map(yaml::IO &io) = 0;
+ virtual codeview::CVSymbol
+ toCodeViewSymbol(BumpPtrAllocator &Allocator,
+ CodeViewContainer Container) const = 0;
+ virtual Error fromCodeViewSymbol(codeview::CVSymbol Type) = 0;
+};
+
+template <typename T> struct SymbolRecordImpl : public SymbolRecordBase {
+ explicit SymbolRecordImpl(codeview::SymbolKind K)
+ : SymbolRecordBase(K), Symbol(static_cast<SymbolRecordKind>(K)) {}
+
+ void map(yaml::IO &io) override;
+
+ codeview::CVSymbol
+ toCodeViewSymbol(BumpPtrAllocator &Allocator,
+ CodeViewContainer Container) const override {
+ return SymbolSerializer::writeOneSymbol(Symbol, Allocator, Container);
+ }
+
+ Error fromCodeViewSymbol(codeview::CVSymbol CVS) override {
+ return SymbolDeserializer::deserializeAs<T>(CVS, Symbol);
+ }
+
+ mutable T Symbol;
+};
+
+struct UnknownSymbolRecord : public SymbolRecordBase {
+ explicit UnknownSymbolRecord(codeview::SymbolKind K) : SymbolRecordBase(K) {}
+
+ void map(yaml::IO &io) override;
+
+ CVSymbol toCodeViewSymbol(BumpPtrAllocator &Allocator,
+ CodeViewContainer Container) const override {
+ RecordPrefix Prefix;
+ uint32_t TotalLen = sizeof(RecordPrefix) + Data.size();
+ Prefix.RecordKind = Kind;
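+    // RecordLen does not include the 2-byte length field itself.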
+ Prefix.RecordLen = TotalLen - 2;
+ uint8_t *Buffer = Allocator.Allocate<uint8_t>(TotalLen);
+ ::memcpy(Buffer, &Prefix, sizeof(RecordPrefix));
+ ::memcpy(Buffer + sizeof(RecordPrefix), Data.data(), Data.size());
+ return CVSymbol(Kind, ArrayRef<uint8_t>(Buffer, TotalLen));
+ }
+
+ Error fromCodeViewSymbol(CVSymbol CVS) override {
+ this->Kind = CVS.kind();
+ Data = CVS.RecordData.drop_front(sizeof(RecordPrefix));
+ return Error::success();
+ }
+
+ std::vector<uint8_t> Data;
+};
+
+template <> void SymbolRecordImpl<ScopeEndSym>::map(IO &IO) {}
+
+void UnknownSymbolRecord::map(yaml::IO &io) {
+ yaml::BinaryRef Binary;
+ if (io.outputting())
+ Binary = yaml::BinaryRef(Data);
+ io.mapRequired("Data", Binary);
+ if (!io.outputting()) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ Binary.writeAsBinary(OS);
+ OS.flush();
+ Data.assign(Str.begin(), Str.end());
+ }
+}
+
+template <> void SymbolRecordImpl<Thunk32Sym>::map(IO &IO) {
+ IO.mapRequired("Parent", Symbol.Parent);
+ IO.mapRequired("End", Symbol.End);
+ IO.mapRequired("Next", Symbol.Next);
+ IO.mapRequired("Off", Symbol.Offset);
+ IO.mapRequired("Seg", Symbol.Segment);
+ IO.mapRequired("Len", Symbol.Length);
+ IO.mapRequired("Ordinal", Symbol.Thunk);
+}
+
+template <> void SymbolRecordImpl<TrampolineSym>::map(IO &IO) {
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapRequired("Size", Symbol.Size);
+ IO.mapRequired("ThunkOff", Symbol.ThunkOffset);
+ IO.mapRequired("TargetOff", Symbol.TargetOffset);
+ IO.mapRequired("ThunkSection", Symbol.ThunkSection);
+ IO.mapRequired("TargetSection", Symbol.TargetSection);
+}
+
+template <> void SymbolRecordImpl<SectionSym>::map(IO &IO) {
+ IO.mapRequired("SectionNumber", Symbol.SectionNumber);
+ IO.mapRequired("Alignment", Symbol.Alignment);
+ IO.mapRequired("Rva", Symbol.Rva);
+ IO.mapRequired("Length", Symbol.Length);
+ IO.mapRequired("Characteristics", Symbol.Characteristics);
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<CoffGroupSym>::map(IO &IO) {
+ IO.mapRequired("Size", Symbol.Size);
+ IO.mapRequired("Characteristics", Symbol.Characteristics);
+ IO.mapRequired("Offset", Symbol.Offset);
+ IO.mapRequired("Segment", Symbol.Segment);
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<ExportSym>::map(IO &IO) {
+ IO.mapRequired("Ordinal", Symbol.Ordinal);
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<ProcSym>::map(IO &IO) {
+ IO.mapOptional("PtrParent", Symbol.Parent, 0U);
+ IO.mapOptional("PtrEnd", Symbol.End, 0U);
+ IO.mapOptional("PtrNext", Symbol.Next, 0U);
+ IO.mapRequired("CodeSize", Symbol.CodeSize);
+ IO.mapRequired("DbgStart", Symbol.DbgStart);
+ IO.mapRequired("DbgEnd", Symbol.DbgEnd);
+ IO.mapRequired("FunctionType", Symbol.FunctionType);
+ IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapRequired("DisplayName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<RegisterSym>::map(IO &IO) {
+ IO.mapRequired("Type", Symbol.Index);
+ IO.mapRequired("Seg", Symbol.Register);
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<PublicSym32>::map(IO &IO) {
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapOptional("Offset", Symbol.Offset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<ProcRefSym>::map(IO &IO) {
+ IO.mapRequired("SumName", Symbol.SumName);
+ IO.mapRequired("SymOffset", Symbol.SymOffset);
+ IO.mapRequired("Mod", Symbol.Module);
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<EnvBlockSym>::map(IO &IO) {
+ IO.mapRequired("Entries", Symbol.Fields);
+}
+
+template <> void SymbolRecordImpl<InlineSiteSym>::map(IO &IO) {
+ IO.mapOptional("PtrParent", Symbol.Parent, 0U);
+ IO.mapOptional("PtrEnd", Symbol.End, 0U);
+ IO.mapRequired("Inlinee", Symbol.Inlinee);
+ // TODO: The binary annotations
+}
+
+template <> void SymbolRecordImpl<LocalSym>::map(IO &IO) {
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapRequired("Flags", Symbol.Flags);
+
+ IO.mapRequired("VarName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<DefRangeSym>::map(IO &IO) {
+ // TODO: Print the subfields
+}
+
+template <> void SymbolRecordImpl<DefRangeSubfieldSym>::map(IO &IO) {
+ // TODO: Print the subfields
+}
+
+template <> void SymbolRecordImpl<DefRangeRegisterSym>::map(IO &IO) {
+ // TODO: Print the subfields
+}
+
+template <> void SymbolRecordImpl<DefRangeFramePointerRelSym>::map(IO &IO) {
+ // TODO: Print the subfields
+}
+
+template <> void SymbolRecordImpl<DefRangeSubfieldRegisterSym>::map(IO &IO) {
+ // TODO: Print the subfields
+}
+
+template <>
+void SymbolRecordImpl<DefRangeFramePointerRelFullScopeSym>::map(IO &IO) {
+ // TODO: Print the subfields
+}
+
+template <> void SymbolRecordImpl<DefRangeRegisterRelSym>::map(IO &IO) {
+ // TODO: Print the subfields
+}
+
+template <> void SymbolRecordImpl<BlockSym>::map(IO &IO) {
+ IO.mapOptional("PtrParent", Symbol.Parent, 0U);
+ IO.mapOptional("PtrEnd", Symbol.End, 0U);
+ IO.mapRequired("CodeSize", Symbol.CodeSize);
+ IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("BlockName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<LabelSym>::map(IO &IO) {
+ IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapRequired("DisplayName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<ObjNameSym>::map(IO &IO) {
+ IO.mapRequired("Signature", Symbol.Signature);
+ IO.mapRequired("ObjectName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<Compile2Sym>::map(IO &IO) {
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapRequired("Machine", Symbol.Machine);
+ IO.mapRequired("FrontendMajor", Symbol.VersionFrontendMajor);
+ IO.mapRequired("FrontendMinor", Symbol.VersionFrontendMinor);
+ IO.mapRequired("FrontendBuild", Symbol.VersionFrontendBuild);
+ IO.mapRequired("BackendMajor", Symbol.VersionBackendMajor);
+ IO.mapRequired("BackendMinor", Symbol.VersionBackendMinor);
+ IO.mapRequired("BackendBuild", Symbol.VersionBackendBuild);
+ IO.mapRequired("Version", Symbol.Version);
+}
+
+template <> void SymbolRecordImpl<Compile3Sym>::map(IO &IO) {
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapRequired("Machine", Symbol.Machine);
+ IO.mapRequired("FrontendMajor", Symbol.VersionFrontendMajor);
+ IO.mapRequired("FrontendMinor", Symbol.VersionFrontendMinor);
+ IO.mapRequired("FrontendBuild", Symbol.VersionFrontendBuild);
+ IO.mapRequired("FrontendQFE", Symbol.VersionFrontendQFE);
+ IO.mapRequired("BackendMajor", Symbol.VersionBackendMajor);
+ IO.mapRequired("BackendMinor", Symbol.VersionBackendMinor);
+ IO.mapRequired("BackendBuild", Symbol.VersionBackendBuild);
+ IO.mapRequired("BackendQFE", Symbol.VersionBackendQFE);
+ IO.mapRequired("Version", Symbol.Version);
+}
+
+template <> void SymbolRecordImpl<FrameProcSym>::map(IO &IO) {
+ IO.mapRequired("TotalFrameBytes", Symbol.TotalFrameBytes);
+ IO.mapRequired("PaddingFrameBytes", Symbol.PaddingFrameBytes);
+ IO.mapRequired("OffsetToPadding", Symbol.OffsetToPadding);
+ IO.mapRequired("BytesOfCalleeSavedRegisters",
+ Symbol.BytesOfCalleeSavedRegisters);
+ IO.mapRequired("OffsetOfExceptionHandler", Symbol.OffsetOfExceptionHandler);
+ IO.mapRequired("SectionIdOfExceptionHandler",
+ Symbol.SectionIdOfExceptionHandler);
+ IO.mapRequired("Flags", Symbol.Flags);
+}
+
+template <> void SymbolRecordImpl<CallSiteInfoSym>::map(IO &IO) {
+ IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("Type", Symbol.Type);
+}
+
+template <> void SymbolRecordImpl<FileStaticSym>::map(IO &IO) {
+ IO.mapRequired("Index", Symbol.Index);
+ IO.mapRequired("ModFilenameOffset", Symbol.ModFilenameOffset);
+ IO.mapRequired("Flags", Symbol.Flags);
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<HeapAllocationSiteSym>::map(IO &IO) {
+ IO.mapOptional("Offset", Symbol.CodeOffset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("CallInstructionSize", Symbol.CallInstructionSize);
+ IO.mapRequired("Type", Symbol.Type);
+}
+
+template <> void SymbolRecordImpl<FrameCookieSym>::map(IO &IO) {
+ IO.mapRequired("Register", Symbol.Register);
+ IO.mapRequired("CookieKind", Symbol.CookieKind);
+ IO.mapRequired("Flags", Symbol.Flags);
+}
+
+template <> void SymbolRecordImpl<CallerSym>::map(IO &IO) {
+ IO.mapRequired("FuncID", Symbol.Indices);
+}
+
+template <> void SymbolRecordImpl<UDTSym>::map(IO &IO) {
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapRequired("UDTName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<BuildInfoSym>::map(IO &IO) {
+ IO.mapRequired("BuildId", Symbol.BuildId);
+}
+
+template <> void SymbolRecordImpl<BPRelativeSym>::map(IO &IO) {
+ IO.mapRequired("Offset", Symbol.Offset);
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapRequired("VarName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<RegRelativeSym>::map(IO &IO) {
+ IO.mapRequired("Offset", Symbol.Offset);
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapRequired("Register", Symbol.Register);
+ IO.mapRequired("VarName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<ConstantSym>::map(IO &IO) {
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapRequired("Value", Symbol.Value);
+ IO.mapRequired("Name", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<DataSym>::map(IO &IO) {
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapOptional("Offset", Symbol.DataOffset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("DisplayName", Symbol.Name);
+}
+
+template <> void SymbolRecordImpl<ThreadLocalDataSym>::map(IO &IO) {
+ IO.mapRequired("Type", Symbol.Type);
+ IO.mapOptional("Offset", Symbol.DataOffset, 0U);
+ IO.mapOptional("Segment", Symbol.Segment, uint16_t(0));
+ IO.mapRequired("DisplayName", Symbol.Name);
+}
+
+} // end namespace detail
+} // end namespace CodeViewYAML
+} // end namespace llvm
+
+CVSymbol CodeViewYAML::SymbolRecord::toCodeViewSymbol(
+ BumpPtrAllocator &Allocator, CodeViewContainer Container) const {
+ return Symbol->toCodeViewSymbol(Allocator, Container);
+}
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<SymbolRecordBase> {
+ static void mapping(IO &io, SymbolRecordBase &Record) { Record.map(io); }
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+template <typename SymbolType>
+static inline Expected<CodeViewYAML::SymbolRecord>
+fromCodeViewSymbolImpl(CVSymbol Symbol) {
+ CodeViewYAML::SymbolRecord Result;
+
+ auto Impl = std::make_shared<SymbolType>(Symbol.kind());
+ if (auto EC = Impl->fromCodeViewSymbol(Symbol))
+ return std::move(EC);
+ Result.Symbol = Impl;
+ return Result;
+}
+
+Expected<CodeViewYAML::SymbolRecord>
+CodeViewYAML::SymbolRecord::fromCodeViewSymbol(CVSymbol Symbol) {
+#define SYMBOL_RECORD(EnumName, EnumVal, ClassName) \
+ case EnumName: \
+ return fromCodeViewSymbolImpl<SymbolRecordImpl<ClassName>>(Symbol);
+#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \
+ SYMBOL_RECORD(EnumName, EnumVal, ClassName)
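+ // Each SYMBOL_RECORD entry in CodeViewSymbols.def expands to a case below,
+ // dispatching to the SymbolRecordImpl specialization for that record kind.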
+ switch (Symbol.kind()) {
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
+ default:
+ return fromCodeViewSymbolImpl<UnknownSymbolRecord>(Symbol);
+ }
+ return make_error<CodeViewError>(cv_error_code::corrupt_record);
+}
+
+template <typename ConcreteType>
+static void mapSymbolRecordImpl(IO &IO, const char *Class, SymbolKind Kind,
+ CodeViewYAML::SymbolRecord &Obj) {
+ if (!IO.outputting())
+ Obj.Symbol = std::make_shared<ConcreteType>(Kind);
+
+ IO.mapRequired(Class, *Obj.Symbol);
+}
+
+void MappingTraits<CodeViewYAML::SymbolRecord>::mapping(
+ IO &IO, CodeViewYAML::SymbolRecord &Obj) {
+ SymbolKind Kind;
+ if (IO.outputting())
+ Kind = Obj.Symbol->Kind;
+ IO.mapRequired("Kind", Kind);
+
+#define SYMBOL_RECORD(EnumName, EnumVal, ClassName) \
+ case EnumName: \
+ mapSymbolRecordImpl<SymbolRecordImpl<ClassName>>(IO, #ClassName, Kind, \
+ Obj); \
+ break;
+#define SYMBOL_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \
+ SYMBOL_RECORD(EnumName, EnumVal, ClassName)
+ switch (Kind) {
+#include "llvm/DebugInfo/CodeView/CodeViewSymbols.def"
+ default:
+ mapSymbolRecordImpl<UnknownSymbolRecord>(IO, "UnknownSym", Kind, Obj);
+ }
+}
diff --git a/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
new file mode 100644
index 0000000..81046b2
--- /dev/null
+++ b/contrib/llvm/lib/ObjectYAML/CodeViewYAMLTypes.cpp
@@ -0,0 +1,806 @@
+//===- CodeViewYAMLTypes.cpp - CodeView YAMLIO types implementation -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of CodeView
+// Debug Info.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/CodeViewYAMLTypes.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::CodeViewYAML;
+using namespace llvm::CodeViewYAML::detail;
+using namespace llvm::yaml;
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(OneMethodRecord)
+LLVM_YAML_IS_SEQUENCE_VECTOR(VFTableSlotKind)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TypeIndex)
+
+LLVM_YAML_DECLARE_SCALAR_TRAITS(TypeIndex, false)
+LLVM_YAML_DECLARE_SCALAR_TRAITS(APSInt, false)
+
+LLVM_YAML_DECLARE_ENUM_TRAITS(TypeLeafKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(PointerToMemberRepresentation)
+LLVM_YAML_DECLARE_ENUM_TRAITS(VFTableSlotKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(CallingConvention)
+LLVM_YAML_DECLARE_ENUM_TRAITS(PointerKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(PointerMode)
+LLVM_YAML_DECLARE_ENUM_TRAITS(HfaKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(MemberAccess)
+LLVM_YAML_DECLARE_ENUM_TRAITS(MethodKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(WindowsRTClassKind)
+LLVM_YAML_DECLARE_ENUM_TRAITS(LabelType)
+
+LLVM_YAML_DECLARE_BITSET_TRAITS(PointerOptions)
+LLVM_YAML_DECLARE_BITSET_TRAITS(ModifierOptions)
+LLVM_YAML_DECLARE_BITSET_TRAITS(FunctionOptions)
+LLVM_YAML_DECLARE_BITSET_TRAITS(ClassOptions)
+LLVM_YAML_DECLARE_BITSET_TRAITS(MethodOptions)
+
+LLVM_YAML_DECLARE_MAPPING_TRAITS(OneMethodRecord)
+LLVM_YAML_DECLARE_MAPPING_TRAITS(MemberPointerInfo)
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
+
+struct LeafRecordBase {
+ TypeLeafKind Kind;
+
+ explicit LeafRecordBase(TypeLeafKind K) : Kind(K) {}
+ virtual ~LeafRecordBase() = default;
+
+ virtual void map(yaml::IO &io) = 0;
+ virtual CVType toCodeViewRecord(TypeTableBuilder &TTB) const = 0;
+ virtual Error fromCodeViewRecord(CVType Type) = 0;
+};
+
+template <typename T> struct LeafRecordImpl : public LeafRecordBase {
+ explicit LeafRecordImpl(TypeLeafKind K)
+ : LeafRecordBase(K), Record(static_cast<TypeRecordKind>(K)) {}
+
+ void map(yaml::IO &io) override;
+
+ Error fromCodeViewRecord(CVType Type) override {
+ return TypeDeserializer::deserializeAs<T>(Type, Record);
+ }
+
+ CVType toCodeViewRecord(TypeTableBuilder &TTB) const override {
+ TTB.writeKnownType(Record);
+ return CVType(Kind, TTB.records().back());
+ }
+
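+ // Mutable so that the const toCodeViewRecord() above can hand the record to
+ // TypeTableBuilder::writeKnownType(), which presumably takes it by
+ // non-const reference.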
+ mutable T Record;
+};
+
+template <> struct LeafRecordImpl<FieldListRecord> : public LeafRecordBase {
+ explicit LeafRecordImpl(TypeLeafKind K) : LeafRecordBase(K) {}
+
+ void map(yaml::IO &io) override;
+ CVType toCodeViewRecord(TypeTableBuilder &TTB) const override;
+ Error fromCodeViewRecord(CVType Type) override;
+
+ std::vector<MemberRecord> Members;
+};
+
+struct MemberRecordBase {
+ TypeLeafKind Kind;
+
+ explicit MemberRecordBase(TypeLeafKind K) : Kind(K) {}
+ virtual ~MemberRecordBase() = default;
+
+ virtual void map(yaml::IO &io) = 0;
+ virtual void writeTo(FieldListRecordBuilder &FLRB) = 0;
+};
+
+template <typename T> struct MemberRecordImpl : public MemberRecordBase {
+ explicit MemberRecordImpl(TypeLeafKind K)
+ : MemberRecordBase(K), Record(static_cast<TypeRecordKind>(K)) {}
+
+ void map(yaml::IO &io) override;
+
+ void writeTo(FieldListRecordBuilder &FLRB) override {
+ FLRB.writeMemberType(Record);
+ }
+
+ mutable T Record;
+};
+
+} // end namespace detail
+} // end namespace CodeViewYAML
+} // end namespace llvm
+
+void ScalarTraits<GUID>::output(const GUID &G, void *, llvm::raw_ostream &OS) {
+ OS << G;
+}
+
+StringRef ScalarTraits<GUID>::input(StringRef Scalar, void *Ctx, GUID &S) {
+ if (Scalar.size() != 38)
+ return "GUID strings are 38 characters long";
+ if (Scalar[0] != '{' || Scalar[37] != '}')
+ return "GUID is not enclosed in {}";
+ if (Scalar[9] != '-' || Scalar[14] != '-' || Scalar[19] != '-' ||
+ Scalar[24] != '-')
+ return "GUID sections are not properly delineated with dashes";
+
+ uint8_t *OutBuffer = S.Guid;
+ for (auto Iter = Scalar.begin(); Iter != Scalar.end();) {
+ if (*Iter == '-' || *Iter == '{' || *Iter == '}') {
+ ++Iter;
+ continue;
+ }
+ uint8_t Value = (llvm::hexDigitValue(*Iter++) << 4);
+ Value |= llvm::hexDigitValue(*Iter++);
+ *OutBuffer++ = Value;
+ }
+
+ return "";
+}
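+
+// Example (illustrative, not from the original source): a scalar accepted by
+// the parser above looks like
+//   '{01234567-89AB-CDEF-0123-456789ABCDEF}'
+// i.e. 38 characters, braces at positions 0 and 37, dashes at 9, 14, 19, and
+// 24, and 32 hex digits that decode into the 16 bytes of GUID::Guid.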
+
+void ScalarTraits<TypeIndex>::output(const TypeIndex &S, void *,
+ raw_ostream &OS) {
+ OS << S.getIndex();
+}
+
+StringRef ScalarTraits<TypeIndex>::input(StringRef Scalar, void *Ctx,
+ TypeIndex &S) {
+ uint32_t I;
+ StringRef Result = ScalarTraits<uint32_t>::input(Scalar, Ctx, I);
+ S.setIndex(I);
+ return Result;
+}
+
+void ScalarTraits<APSInt>::output(const APSInt &S, void *, raw_ostream &OS) {
+ S.print(OS, S.isSigned());
+}
+
+StringRef ScalarTraits<APSInt>::input(StringRef Scalar, void *Ctx, APSInt &S) {
+ S = APSInt(Scalar);
+ return "";
+}
+
+void ScalarEnumerationTraits<TypeLeafKind>::enumeration(IO &io,
+ TypeLeafKind &Value) {
+#define CV_TYPE(name, val) io.enumCase(Value, #name, name);
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+#undef CV_TYPE
+}
+
+void ScalarEnumerationTraits<PointerToMemberRepresentation>::enumeration(
+ IO &IO, PointerToMemberRepresentation &Value) {
+ IO.enumCase(Value, "Unknown", PointerToMemberRepresentation::Unknown);
+ IO.enumCase(Value, "SingleInheritanceData",
+ PointerToMemberRepresentation::SingleInheritanceData);
+ IO.enumCase(Value, "MultipleInheritanceData",
+ PointerToMemberRepresentation::MultipleInheritanceData);
+ IO.enumCase(Value, "VirtualInheritanceData",
+ PointerToMemberRepresentation::VirtualInheritanceData);
+ IO.enumCase(Value, "GeneralData", PointerToMemberRepresentation::GeneralData);
+ IO.enumCase(Value, "SingleInheritanceFunction",
+ PointerToMemberRepresentation::SingleInheritanceFunction);
+ IO.enumCase(Value, "MultipleInheritanceFunction",
+ PointerToMemberRepresentation::MultipleInheritanceFunction);
+ IO.enumCase(Value, "VirtualInheritanceFunction",
+ PointerToMemberRepresentation::VirtualInheritanceFunction);
+ IO.enumCase(Value, "GeneralFunction",
+ PointerToMemberRepresentation::GeneralFunction);
+}
+
+void ScalarEnumerationTraits<VFTableSlotKind>::enumeration(
+ IO &IO, VFTableSlotKind &Kind) {
+ IO.enumCase(Kind, "Near16", VFTableSlotKind::Near16);
+ IO.enumCase(Kind, "Far16", VFTableSlotKind::Far16);
+ IO.enumCase(Kind, "This", VFTableSlotKind::This);
+ IO.enumCase(Kind, "Outer", VFTableSlotKind::Outer);
+ IO.enumCase(Kind, "Meta", VFTableSlotKind::Meta);
+ IO.enumCase(Kind, "Near", VFTableSlotKind::Near);
+ IO.enumCase(Kind, "Far", VFTableSlotKind::Far);
+}
+
+void ScalarEnumerationTraits<CallingConvention>::enumeration(
+ IO &IO, CallingConvention &Value) {
+ IO.enumCase(Value, "NearC", CallingConvention::NearC);
+ IO.enumCase(Value, "FarC", CallingConvention::FarC);
+ IO.enumCase(Value, "NearPascal", CallingConvention::NearPascal);
+ IO.enumCase(Value, "FarPascal", CallingConvention::FarPascal);
+ IO.enumCase(Value, "NearFast", CallingConvention::NearFast);
+ IO.enumCase(Value, "FarFast", CallingConvention::FarFast);
+ IO.enumCase(Value, "NearStdCall", CallingConvention::NearStdCall);
+ IO.enumCase(Value, "FarStdCall", CallingConvention::FarStdCall);
+ IO.enumCase(Value, "NearSysCall", CallingConvention::NearSysCall);
+ IO.enumCase(Value, "FarSysCall", CallingConvention::FarSysCall);
+ IO.enumCase(Value, "ThisCall", CallingConvention::ThisCall);
+ IO.enumCase(Value, "MipsCall", CallingConvention::MipsCall);
+ IO.enumCase(Value, "Generic", CallingConvention::Generic);
+ IO.enumCase(Value, "AlphaCall", CallingConvention::AlphaCall);
+ IO.enumCase(Value, "PpcCall", CallingConvention::PpcCall);
+ IO.enumCase(Value, "SHCall", CallingConvention::SHCall);
+ IO.enumCase(Value, "ArmCall", CallingConvention::ArmCall);
+ IO.enumCase(Value, "AM33Call", CallingConvention::AM33Call);
+ IO.enumCase(Value, "TriCall", CallingConvention::TriCall);
+ IO.enumCase(Value, "SH5Call", CallingConvention::SH5Call);
+ IO.enumCase(Value, "M32RCall", CallingConvention::M32RCall);
+ IO.enumCase(Value, "ClrCall", CallingConvention::ClrCall);
+ IO.enumCase(Value, "Inline", CallingConvention::Inline);
+ IO.enumCase(Value, "NearVector", CallingConvention::NearVector);
+}
+
+void ScalarEnumerationTraits<PointerKind>::enumeration(IO &IO,
+ PointerKind &Kind) {
+ IO.enumCase(Kind, "Near16", PointerKind::Near16);
+ IO.enumCase(Kind, "Far16", PointerKind::Far16);
+ IO.enumCase(Kind, "Huge16", PointerKind::Huge16);
+ IO.enumCase(Kind, "BasedOnSegment", PointerKind::BasedOnSegment);
+ IO.enumCase(Kind, "BasedOnValue", PointerKind::BasedOnValue);
+ IO.enumCase(Kind, "BasedOnSegmentValue", PointerKind::BasedOnSegmentValue);
+ IO.enumCase(Kind, "BasedOnAddress", PointerKind::BasedOnAddress);
+ IO.enumCase(Kind, "BasedOnSegmentAddress",
+ PointerKind::BasedOnSegmentAddress);
+ IO.enumCase(Kind, "BasedOnType", PointerKind::BasedOnType);
+ IO.enumCase(Kind, "BasedOnSelf", PointerKind::BasedOnSelf);
+ IO.enumCase(Kind, "Near32", PointerKind::Near32);
+ IO.enumCase(Kind, "Far32", PointerKind::Far32);
+ IO.enumCase(Kind, "Near64", PointerKind::Near64);
+}
+
+void ScalarEnumerationTraits<PointerMode>::enumeration(IO &IO,
+ PointerMode &Mode) {
+ IO.enumCase(Mode, "Pointer", PointerMode::Pointer);
+ IO.enumCase(Mode, "LValueReference", PointerMode::LValueReference);
+ IO.enumCase(Mode, "PointerToDataMember", PointerMode::PointerToDataMember);
+ IO.enumCase(Mode, "PointerToMemberFunction",
+ PointerMode::PointerToMemberFunction);
+ IO.enumCase(Mode, "RValueReference", PointerMode::RValueReference);
+}
+
+void ScalarEnumerationTraits<HfaKind>::enumeration(IO &IO, HfaKind &Value) {
+ IO.enumCase(Value, "None", HfaKind::None);
+ IO.enumCase(Value, "Float", HfaKind::Float);
+ IO.enumCase(Value, "Double", HfaKind::Double);
+ IO.enumCase(Value, "Other", HfaKind::Other);
+}
+
+void ScalarEnumerationTraits<MemberAccess>::enumeration(IO &IO,
+ MemberAccess &Access) {
+ IO.enumCase(Access, "None", MemberAccess::None);
+ IO.enumCase(Access, "Private", MemberAccess::Private);
+ IO.enumCase(Access, "Protected", MemberAccess::Protected);
+ IO.enumCase(Access, "Public", MemberAccess::Public);
+}
+
+void ScalarEnumerationTraits<MethodKind>::enumeration(IO &IO,
+ MethodKind &Kind) {
+ IO.enumCase(Kind, "Vanilla", MethodKind::Vanilla);
+ IO.enumCase(Kind, "Virtual", MethodKind::Virtual);
+ IO.enumCase(Kind, "Static", MethodKind::Static);
+ IO.enumCase(Kind, "Friend", MethodKind::Friend);
+ IO.enumCase(Kind, "IntroducingVirtual", MethodKind::IntroducingVirtual);
+ IO.enumCase(Kind, "PureVirtual", MethodKind::PureVirtual);
+ IO.enumCase(Kind, "PureIntroducingVirtual",
+ MethodKind::PureIntroducingVirtual);
+}
+
+void ScalarEnumerationTraits<WindowsRTClassKind>::enumeration(
+ IO &IO, WindowsRTClassKind &Value) {
+ IO.enumCase(Value, "None", WindowsRTClassKind::None);
+ IO.enumCase(Value, "Ref", WindowsRTClassKind::RefClass);
+ IO.enumCase(Value, "Value", WindowsRTClassKind::ValueClass);
+ IO.enumCase(Value, "Interface", WindowsRTClassKind::Interface);
+}
+
+void ScalarEnumerationTraits<LabelType>::enumeration(IO &IO, LabelType &Value) {
+ IO.enumCase(Value, "Near", LabelType::Near);
+ IO.enumCase(Value, "Far", LabelType::Far);
+}
+
+void ScalarBitSetTraits<PointerOptions>::bitset(IO &IO,
+ PointerOptions &Options) {
+ IO.bitSetCase(Options, "None", PointerOptions::None);
+ IO.bitSetCase(Options, "Flat32", PointerOptions::Flat32);
+ IO.bitSetCase(Options, "Volatile", PointerOptions::Volatile);
+ IO.bitSetCase(Options, "Const", PointerOptions::Const);
+ IO.bitSetCase(Options, "Unaligned", PointerOptions::Unaligned);
+ IO.bitSetCase(Options, "Restrict", PointerOptions::Restrict);
+ IO.bitSetCase(Options, "WinRTSmartPointer",
+ PointerOptions::WinRTSmartPointer);
+}
+
+void ScalarBitSetTraits<ModifierOptions>::bitset(IO &IO,
+ ModifierOptions &Options) {
+ IO.bitSetCase(Options, "None", ModifierOptions::None);
+ IO.bitSetCase(Options, "Const", ModifierOptions::Const);
+ IO.bitSetCase(Options, "Volatile", ModifierOptions::Volatile);
+ IO.bitSetCase(Options, "Unaligned", ModifierOptions::Unaligned);
+}
+
+void ScalarBitSetTraits<FunctionOptions>::bitset(IO &IO,
+ FunctionOptions &Options) {
+ IO.bitSetCase(Options, "None", FunctionOptions::None);
+ IO.bitSetCase(Options, "CxxReturnUdt", FunctionOptions::CxxReturnUdt);
+ IO.bitSetCase(Options, "Constructor", FunctionOptions::Constructor);
+ IO.bitSetCase(Options, "ConstructorWithVirtualBases",
+ FunctionOptions::ConstructorWithVirtualBases);
+}
+
+void ScalarBitSetTraits<ClassOptions>::bitset(IO &IO, ClassOptions &Options) {
+ IO.bitSetCase(Options, "None", ClassOptions::None);
+ IO.bitSetCase(Options, "HasConstructorOrDestructor",
+ ClassOptions::HasConstructorOrDestructor);
+ IO.bitSetCase(Options, "HasOverloadedOperator",
+ ClassOptions::HasOverloadedOperator);
+ IO.bitSetCase(Options, "Nested", ClassOptions::Nested);
+ IO.bitSetCase(Options, "ContainsNestedClass",
+ ClassOptions::ContainsNestedClass);
+ IO.bitSetCase(Options, "HasOverloadedAssignmentOperator",
+ ClassOptions::HasOverloadedAssignmentOperator);
+ IO.bitSetCase(Options, "HasConversionOperator",
+ ClassOptions::HasConversionOperator);
+ IO.bitSetCase(Options, "ForwardReference", ClassOptions::ForwardReference);
+ IO.bitSetCase(Options, "Scoped", ClassOptions::Scoped);
+ IO.bitSetCase(Options, "HasUniqueName", ClassOptions::HasUniqueName);
+ IO.bitSetCase(Options, "Sealed", ClassOptions::Sealed);
+ IO.bitSetCase(Options, "Intrinsic", ClassOptions::Intrinsic);
+}
+
+void ScalarBitSetTraits<MethodOptions>::bitset(IO &IO, MethodOptions &Options) {
+ IO.bitSetCase(Options, "None", MethodOptions::None);
+ IO.bitSetCase(Options, "Pseudo", MethodOptions::Pseudo);
+ IO.bitSetCase(Options, "NoInherit", MethodOptions::NoInherit);
+ IO.bitSetCase(Options, "NoConstruct", MethodOptions::NoConstruct);
+ IO.bitSetCase(Options, "CompilerGenerated", MethodOptions::CompilerGenerated);
+ IO.bitSetCase(Options, "Sealed", MethodOptions::Sealed);
+}
+
+void MappingTraits<MemberPointerInfo>::mapping(IO &IO, MemberPointerInfo &MPI) {
+ IO.mapRequired("ContainingType", MPI.ContainingType);
+ IO.mapRequired("Representation", MPI.Representation);
+}
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
+
+template <> void LeafRecordImpl<ModifierRecord>::map(IO &IO) {
+ IO.mapRequired("ModifiedType", Record.ModifiedType);
+ IO.mapRequired("Modifiers", Record.Modifiers);
+}
+
+template <> void LeafRecordImpl<ProcedureRecord>::map(IO &IO) {
+ IO.mapRequired("ReturnType", Record.ReturnType);
+ IO.mapRequired("CallConv", Record.CallConv);
+ IO.mapRequired("Options", Record.Options);
+ IO.mapRequired("ParameterCount", Record.ParameterCount);
+ IO.mapRequired("ArgumentList", Record.ArgumentList);
+}
+
+template <> void LeafRecordImpl<MemberFunctionRecord>::map(IO &IO) {
+ IO.mapRequired("ReturnType", Record.ReturnType);
+ IO.mapRequired("ClassType", Record.ClassType);
+ IO.mapRequired("ThisType", Record.ThisType);
+ IO.mapRequired("CallConv", Record.CallConv);
+ IO.mapRequired("Options", Record.Options);
+ IO.mapRequired("ParameterCount", Record.ParameterCount);
+ IO.mapRequired("ArgumentList", Record.ArgumentList);
+ IO.mapRequired("ThisPointerAdjustment", Record.ThisPointerAdjustment);
+}
+
+template <> void LeafRecordImpl<LabelRecord>::map(IO &IO) {
+ IO.mapRequired("Mode", Record.Mode);
+}
+
+template <> void LeafRecordImpl<MemberFuncIdRecord>::map(IO &IO) {
+ IO.mapRequired("ClassType", Record.ClassType);
+ IO.mapRequired("FunctionType", Record.FunctionType);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void LeafRecordImpl<ArgListRecord>::map(IO &IO) {
+ IO.mapRequired("ArgIndices", Record.ArgIndices);
+}
+
+template <> void LeafRecordImpl<StringListRecord>::map(IO &IO) {
+ IO.mapRequired("StringIndices", Record.StringIndices);
+}
+
+template <> void LeafRecordImpl<PointerRecord>::map(IO &IO) {
+ IO.mapRequired("ReferentType", Record.ReferentType);
+ IO.mapRequired("Attrs", Record.Attrs);
+ IO.mapOptional("MemberInfo", Record.MemberInfo);
+}
+
+template <> void LeafRecordImpl<ArrayRecord>::map(IO &IO) {
+ IO.mapRequired("ElementType", Record.ElementType);
+ IO.mapRequired("IndexType", Record.IndexType);
+ IO.mapRequired("Size", Record.Size);
+ IO.mapRequired("Name", Record.Name);
+}
+
+void LeafRecordImpl<FieldListRecord>::map(IO &IO) {
+ IO.mapRequired("FieldList", Members);
+}
+
+} // end namespace detail
+} // end namespace CodeViewYAML
+} // end namespace llvm
+
+namespace {
+
+class MemberRecordConversionVisitor : public TypeVisitorCallbacks {
+public:
+ explicit MemberRecordConversionVisitor(std::vector<MemberRecord> &Records)
+ : Records(Records) {}
+
+#define TYPE_RECORD(EnumName, EnumVal, Name)
+#define MEMBER_RECORD(EnumName, EnumVal, Name) \
+ Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override { \
+ return visitKnownMemberImpl(Record); \
+ }
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+private:
+ template <typename T> Error visitKnownMemberImpl(T &Record) {
+ TypeLeafKind K = static_cast<TypeLeafKind>(Record.getKind());
+ auto Impl = std::make_shared<MemberRecordImpl<T>>(K);
+ Impl->Record = Record;
+ Records.push_back(MemberRecord{Impl});
+ return Error::success();
+ }
+
+ std::vector<MemberRecord> &Records;
+};
+
+} // end anonymous namespace
+
+Error LeafRecordImpl<FieldListRecord>::fromCodeViewRecord(CVType Type) {
+ MemberRecordConversionVisitor V(Members);
+ return visitMemberRecordStream(Type.content(), V);
+}
+
+CVType
+LeafRecordImpl<FieldListRecord>::toCodeViewRecord(TypeTableBuilder &TTB) const {
+ FieldListRecordBuilder FLRB(TTB);
+ FLRB.begin();
+ for (const auto &Member : Members) {
+ Member.Member->writeTo(FLRB);
+ }
+ FLRB.end(true);
+ return CVType(Kind, TTB.records().back());
+}
+
+void MappingTraits<OneMethodRecord>::mapping(IO &io, OneMethodRecord &Record) {
+ io.mapRequired("Type", Record.Type);
+ io.mapRequired("Attrs", Record.Attrs.Attrs);
+ io.mapRequired("VFTableOffset", Record.VFTableOffset);
+ io.mapRequired("Name", Record.Name);
+}
+
+namespace llvm {
+namespace CodeViewYAML {
+namespace detail {
+
+template <> void LeafRecordImpl<ClassRecord>::map(IO &IO) {
+ IO.mapRequired("MemberCount", Record.MemberCount);
+ IO.mapRequired("Options", Record.Options);
+ IO.mapRequired("FieldList", Record.FieldList);
+ IO.mapRequired("Name", Record.Name);
+ IO.mapRequired("UniqueName", Record.UniqueName);
+ IO.mapRequired("DerivationList", Record.DerivationList);
+ IO.mapRequired("VTableShape", Record.VTableShape);
+ IO.mapRequired("Size", Record.Size);
+}
+
+template <> void LeafRecordImpl<UnionRecord>::map(IO &IO) {
+ IO.mapRequired("MemberCount", Record.MemberCount);
+ IO.mapRequired("Options", Record.Options);
+ IO.mapRequired("FieldList", Record.FieldList);
+ IO.mapRequired("Name", Record.Name);
+ IO.mapRequired("UniqueName", Record.UniqueName);
+ IO.mapRequired("Size", Record.Size);
+}
+
+template <> void LeafRecordImpl<EnumRecord>::map(IO &IO) {
+ IO.mapRequired("NumEnumerators", Record.MemberCount);
+ IO.mapRequired("Options", Record.Options);
+ IO.mapRequired("FieldList", Record.FieldList);
+ IO.mapRequired("Name", Record.Name);
+ IO.mapRequired("UniqueName", Record.UniqueName);
+ IO.mapRequired("UnderlyingType", Record.UnderlyingType);
+}
+
+template <> void LeafRecordImpl<BitFieldRecord>::map(IO &IO) {
+ IO.mapRequired("Type", Record.Type);
+ IO.mapRequired("BitSize", Record.BitSize);
+ IO.mapRequired("BitOffset", Record.BitOffset);
+}
+
+template <> void LeafRecordImpl<VFTableShapeRecord>::map(IO &IO) {
+ IO.mapRequired("Slots", Record.Slots);
+}
+
+template <> void LeafRecordImpl<TypeServer2Record>::map(IO &IO) {
+ IO.mapRequired("Guid", Record.Guid);
+ IO.mapRequired("Age", Record.Age);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void LeafRecordImpl<StringIdRecord>::map(IO &IO) {
+ IO.mapRequired("Id", Record.Id);
+ IO.mapRequired("String", Record.String);
+}
+
+template <> void LeafRecordImpl<FuncIdRecord>::map(IO &IO) {
+ IO.mapRequired("ParentScope", Record.ParentScope);
+ IO.mapRequired("FunctionType", Record.FunctionType);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void LeafRecordImpl<UdtSourceLineRecord>::map(IO &IO) {
+ IO.mapRequired("UDT", Record.UDT);
+ IO.mapRequired("SourceFile", Record.SourceFile);
+ IO.mapRequired("LineNumber", Record.LineNumber);
+}
+
+template <> void LeafRecordImpl<UdtModSourceLineRecord>::map(IO &IO) {
+ IO.mapRequired("UDT", Record.UDT);
+ IO.mapRequired("SourceFile", Record.SourceFile);
+ IO.mapRequired("LineNumber", Record.LineNumber);
+ IO.mapRequired("Module", Record.Module);
+}
+
+template <> void LeafRecordImpl<BuildInfoRecord>::map(IO &IO) {
+ IO.mapRequired("ArgIndices", Record.ArgIndices);
+}
+
+template <> void LeafRecordImpl<VFTableRecord>::map(IO &IO) {
+ IO.mapRequired("CompleteClass", Record.CompleteClass);
+ IO.mapRequired("OverriddenVFTable", Record.OverriddenVFTable);
+ IO.mapRequired("VFPtrOffset", Record.VFPtrOffset);
+ IO.mapRequired("MethodNames", Record.MethodNames);
+}
+
+template <> void LeafRecordImpl<MethodOverloadListRecord>::map(IO &IO) {
+ IO.mapRequired("Methods", Record.Methods);
+}
+
+template <> void MemberRecordImpl<OneMethodRecord>::map(IO &IO) {
+ MappingTraits<OneMethodRecord>::mapping(IO, Record);
+}
+
+template <> void MemberRecordImpl<OverloadedMethodRecord>::map(IO &IO) {
+ IO.mapRequired("NumOverloads", Record.NumOverloads);
+ IO.mapRequired("MethodList", Record.MethodList);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<NestedTypeRecord>::map(IO &IO) {
+ IO.mapRequired("Type", Record.Type);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<DataMemberRecord>::map(IO &IO) {
+ IO.mapRequired("Attrs", Record.Attrs.Attrs);
+ IO.mapRequired("Type", Record.Type);
+ IO.mapRequired("FieldOffset", Record.FieldOffset);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<StaticDataMemberRecord>::map(IO &IO) {
+ IO.mapRequired("Attrs", Record.Attrs.Attrs);
+ IO.mapRequired("Type", Record.Type);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<EnumeratorRecord>::map(IO &IO) {
+ IO.mapRequired("Attrs", Record.Attrs.Attrs);
+ IO.mapRequired("Value", Record.Value);
+ IO.mapRequired("Name", Record.Name);
+}
+
+template <> void MemberRecordImpl<VFPtrRecord>::map(IO &IO) {
+ IO.mapRequired("Type", Record.Type);
+}
+
+template <> void MemberRecordImpl<BaseClassRecord>::map(IO &IO) {
+ IO.mapRequired("Attrs", Record.Attrs.Attrs);
+ IO.mapRequired("Type", Record.Type);
+ IO.mapRequired("Offset", Record.Offset);
+}
+
+template <> void MemberRecordImpl<VirtualBaseClassRecord>::map(IO &IO) {
+ IO.mapRequired("Attrs", Record.Attrs.Attrs);
+ IO.mapRequired("BaseType", Record.BaseType);
+ IO.mapRequired("VBPtrType", Record.VBPtrType);
+ IO.mapRequired("VBPtrOffset", Record.VBPtrOffset);
+ IO.mapRequired("VTableIndex", Record.VTableIndex);
+}
+
+template <> void MemberRecordImpl<ListContinuationRecord>::map(IO &IO) {
+ IO.mapRequired("ContinuationIndex", Record.ContinuationIndex);
+}
+
+} // end namespace detail
+} // end namespace CodeViewYAML
+} // end namespace llvm
+
+template <typename T>
+static inline Expected<LeafRecord> fromCodeViewRecordImpl(CVType Type) {
+ LeafRecord Result;
+
+ auto Impl = std::make_shared<LeafRecordImpl<T>>(Type.kind());
+ if (auto EC = Impl->fromCodeViewRecord(Type))
+ return std::move(EC);
+ Result.Leaf = Impl;
+ return Result;
+}
+
+Expected<LeafRecord> LeafRecord::fromCodeViewRecord(CVType Type) {
+#define TYPE_RECORD(EnumName, EnumVal, ClassName) \
+ case EnumName: \
+ return fromCodeViewRecordImpl<ClassName##Record>(Type);
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \
+ TYPE_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)
+ switch (Type.kind()) {
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+ default:
+ llvm_unreachable("Unknown leaf kind!");
+ }
+ return make_error<CodeViewError>(cv_error_code::corrupt_record);
+}
+
+CVType LeafRecord::toCodeViewRecord(BumpPtrAllocator &Alloc) const {
+ TypeTableBuilder TTB(Alloc);
+ return Leaf->toCodeViewRecord(TTB);
+}
+
+CVType LeafRecord::toCodeViewRecord(TypeTableBuilder &TTB) const {
+ return Leaf->toCodeViewRecord(TTB);
+}
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<LeafRecordBase> {
+ static void mapping(IO &io, LeafRecordBase &Record) { Record.map(io); }
+};
+
+template <> struct MappingTraits<MemberRecordBase> {
+ static void mapping(IO &io, MemberRecordBase &Record) { Record.map(io); }
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+template <typename ConcreteType>
+static void mapLeafRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind,
+ LeafRecord &Obj) {
+ if (!IO.outputting())
+ Obj.Leaf = std::make_shared<LeafRecordImpl<ConcreteType>>(Kind);
+
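+ // FieldList is special: its members are mapped inline as a sequence rather
+ // than nested under a per-class key.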
+ if (Kind == LF_FIELDLIST)
+ Obj.Leaf->map(IO);
+ else
+ IO.mapRequired(Class, *Obj.Leaf);
+}
+
+void MappingTraits<LeafRecord>::mapping(IO &IO, LeafRecord &Obj) {
+ TypeLeafKind Kind;
+ if (IO.outputting())
+ Kind = Obj.Leaf->Kind;
+ IO.mapRequired("Kind", Kind);
+
+#define TYPE_RECORD(EnumName, EnumVal, ClassName) \
+ case EnumName: \
+ mapLeafRecordImpl<ClassName##Record>(IO, #ClassName, Kind, Obj); \
+ break;
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \
+ TYPE_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD(EnumName, EnumVal, ClassName)
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)
+ switch (Kind) {
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+ default: { llvm_unreachable("Unknown leaf kind!"); }
+ }
+}
+
+template <typename ConcreteType>
+static void mapMemberRecordImpl(IO &IO, const char *Class, TypeLeafKind Kind,
+ MemberRecord &Obj) {
+ if (!IO.outputting())
+ Obj.Member = std::make_shared<MemberRecordImpl<ConcreteType>>(Kind);
+
+ IO.mapRequired(Class, *Obj.Member);
+}
+
+void MappingTraits<MemberRecord>::mapping(IO &IO, MemberRecord &Obj) {
+ TypeLeafKind Kind;
+ if (IO.outputting())
+ Kind = Obj.Member->Kind;
+ IO.mapRequired("Kind", Kind);
+
+#define MEMBER_RECORD(EnumName, EnumVal, ClassName) \
+ case EnumName: \
+ mapMemberRecordImpl<ClassName##Record>(IO, #ClassName, Kind, Obj); \
+ break;
+#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName) \
+ MEMBER_RECORD(EnumName, EnumVal, ClassName)
+#define TYPE_RECORD(EnumName, EnumVal, ClassName)
+#define TYPE_RECORD_ALIAS(EnumName, EnumVal, AliasName, ClassName)
+ switch (Kind) {
+#include "llvm/DebugInfo/CodeView/CodeViewTypes.def"
+ default: { llvm_unreachable("Unknown member kind!"); }
+ }
+}
+
+std::vector<LeafRecord>
+llvm::CodeViewYAML::fromDebugT(ArrayRef<uint8_t> DebugT) {
+ ExitOnError Err("Invalid .debug$T section!");
+ BinaryStreamReader Reader(DebugT, support::little);
+ CVTypeArray Types;
+ uint32_t Magic;
+
+ Err(Reader.readInteger(Magic));
+ assert(Magic == COFF::DEBUG_SECTION_MAGIC && "Invalid .debug$T section!");
+
+ std::vector<LeafRecord> Result;
+ Err(Reader.readArray(Types, Reader.bytesRemaining()));
+ for (const auto &T : Types) {
+ auto CVT = Err(LeafRecord::fromCodeViewRecord(T));
+ Result.push_back(CVT);
+ }
+ return Result;
+}
+
+ArrayRef<uint8_t> llvm::CodeViewYAML::toDebugT(ArrayRef<LeafRecord> Leafs,
+ BumpPtrAllocator &Alloc) {
+ TypeTableBuilder TTB(Alloc, false);
+ uint32_t Size = sizeof(uint32_t);
+ for (const auto &Leaf : Leafs) {
+ CVType T = Leaf.toCodeViewRecord(TTB);
+ Size += T.length();
+ assert(T.length() % 4 == 0 && "Improper type record alignment!");
+ }
+ uint8_t *ResultBuffer = Alloc.Allocate<uint8_t>(Size);
+ MutableArrayRef<uint8_t> Output(ResultBuffer, Size);
+ BinaryStreamWriter Writer(Output, support::little);
+ ExitOnError Err("Error writing type record to .debug$T section");
+ Err(Writer.writeInteger<uint32_t>(COFF::DEBUG_SECTION_MAGIC));
+ for (const auto &R : TTB.records()) {
+ Err(Writer.writeBytes(R));
+ }
+ assert(Writer.bytesRemaining() == 0 && "Didn't write all type record bytes!");
+ return Output;
+}
diff --git a/contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp
new file mode 100644
index 0000000..89fc652
--- /dev/null
+++ b/contrib/llvm/lib/ObjectYAML/DWARFEmitter.cpp
@@ -0,0 +1,333 @@
+//===- DWARFEmitter - Convert YAML to DWARF binary data -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief The DWARF component of yaml2obj. Provided as library code for tests.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/DWARFEmitter.h"
+#include "DWARFVisitor.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+template <typename T>
+static void writeInteger(T Integer, raw_ostream &OS, bool IsLittleEndian) {
+ if (IsLittleEndian != sys::IsLittleEndianHost)
+ sys::swapByteOrder(Integer);
+ OS.write(reinterpret_cast<char *>(&Integer), sizeof(T));
+}
+
+static void writeVariableSizedInteger(uint64_t Integer, size_t Size,
+ raw_ostream &OS, bool IsLittleEndian) {
+ if (8 == Size)
+ writeInteger((uint64_t)Integer, OS, IsLittleEndian);
+ else if (4 == Size)
+ writeInteger((uint32_t)Integer, OS, IsLittleEndian);
+ else if (2 == Size)
+ writeInteger((uint16_t)Integer, OS, IsLittleEndian);
+ else if (1 == Size)
+ writeInteger((uint8_t)Integer, OS, IsLittleEndian);
+ else
+ assert(false && "Invalid integer write size.");
+}
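+
+// For illustration (not part of the original source): with Size == 2 on a
+// little-endian output, writeVariableSizedInteger(0x1234, 2, OS, true)
+// emits the bytes 0x34 0x12.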
+
+static void ZeroFillBytes(raw_ostream &OS, size_t Size) {
+ // Write Size zero bytes to the stream.
+ std::vector<uint8_t> FillData(Size, 0);
+ OS.write(reinterpret_cast<char *>(FillData.data()), Size);
+}
+
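+// Per the DWARF spec, a 32-bit initial-length field of 0xffffffff marks a
+// DWARF64 unit, with the true 64-bit length following; isDWARF64() below is
+// assumed to test for exactly that escape value.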
+static void writeInitialLength(const DWARFYAML::InitialLength &Length,
+ raw_ostream &OS, bool IsLittleEndian) {
+ writeInteger((uint32_t)Length.TotalLength, OS, IsLittleEndian);
+ if (Length.isDWARF64())
+ writeInteger((uint64_t)Length.TotalLength64, OS, IsLittleEndian);
+}
+
+void DWARFYAML::EmitDebugStr(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (auto Str : DI.DebugStrings) {
+ OS.write(Str.data(), Str.size());
+ OS.write('\0');
+ }
+}
+
+void DWARFYAML::EmitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (auto AbbrevDecl : DI.AbbrevDecls) {
+ encodeULEB128(AbbrevDecl.Code, OS);
+ encodeULEB128(AbbrevDecl.Tag, OS);
+ OS.write(AbbrevDecl.Children);
+ for (auto Attr : AbbrevDecl.Attributes) {
+ encodeULEB128(Attr.Attribute, OS);
+ encodeULEB128(Attr.Form, OS);
+ if (Attr.Form == dwarf::DW_FORM_implicit_const)
+ encodeSLEB128(Attr.Value, OS);
+ }
+ encodeULEB128(0, OS);
+ encodeULEB128(0, OS);
+ }
+}
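+
+// A minimal abbreviation declaration in the YAML input might look like the
+// following (a sketch; the Code/Tag/Children key names are assumed from the
+// fields consumed above, and Attribute/Form match DWARFYAML.cpp):
+//   - Code:     0x1
+//     Tag:      DW_TAG_compile_unit
+//     Children: DW_CHILDREN_yes
+//     Attributes:
+//       - Attribute: DW_AT_producer
+//         Form:      DW_FORM_string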
+
+void DWARFYAML::EmitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (auto Range : DI.ARanges) {
+ auto HeaderStart = OS.tell();
+ writeInitialLength(Range.Length, OS, DI.IsLittleEndian);
+ writeInteger((uint16_t)Range.Version, OS, DI.IsLittleEndian);
+ writeInteger((uint32_t)Range.CuOffset, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)Range.AddrSize, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)Range.SegSize, OS, DI.IsLittleEndian);
+
+ auto HeaderSize = OS.tell() - HeaderStart;
+ auto FirstDescriptor = alignTo(HeaderSize, Range.AddrSize * 2);
+ ZeroFillBytes(OS, FirstDescriptor - HeaderSize);
+
+ for (auto Descriptor : Range.Descriptors) {
+ writeVariableSizedInteger(Descriptor.Address, Range.AddrSize, OS,
+ DI.IsLittleEndian);
+ writeVariableSizedInteger(Descriptor.Length, Range.AddrSize, OS,
+ DI.IsLittleEndian);
+ }
+ ZeroFillBytes(OS, Range.AddrSize * 2);
+ }
+}
+
+void DWARFYAML::EmitPubSection(raw_ostream &OS,
+ const DWARFYAML::PubSection &Sect,
+ bool IsLittleEndian) {
+ writeInitialLength(Sect.Length, OS, IsLittleEndian);
+ writeInteger((uint16_t)Sect.Version, OS, IsLittleEndian);
+ writeInteger((uint32_t)Sect.UnitOffset, OS, IsLittleEndian);
+ writeInteger((uint32_t)Sect.UnitSize, OS, IsLittleEndian);
+ for (auto Entry : Sect.Entries) {
+ writeInteger((uint32_t)Entry.DieOffset, OS, IsLittleEndian);
+ if (Sect.IsGNUStyle)
+ writeInteger((uint32_t)Entry.Descriptor, OS, IsLittleEndian);
+ OS.write(Entry.Name.data(), Entry.Name.size());
+ OS.write('\0');
+ }
+}
+
+/// \brief An extension of the DWARFYAML::ConstVisitor which writes compile
+/// units and DIEs to a stream.
+class DumpVisitor : public DWARFYAML::ConstVisitor {
+ raw_ostream &OS;
+
+protected:
+ void onStartCompileUnit(const DWARFYAML::Unit &CU) override {
+ writeInitialLength(CU.Length, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint16_t)CU.Version, OS, DebugInfo.IsLittleEndian);
+ if (CU.Version >= 5) {
+ writeInteger((uint8_t)CU.Type, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
+ } else {
+ writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
+ writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
+ }
+ }
+
+ void onStartDIE(const DWARFYAML::Unit &CU,
+ const DWARFYAML::Entry &DIE) override {
+ encodeULEB128(DIE.AbbrCode, OS);
+ }
+
+ void onValue(const uint8_t U) override {
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+
+ void onValue(const uint16_t U) override {
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+
+ void onValue(const uint32_t U) override {
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+
+ void onValue(const uint64_t U, const bool LEB = false) override {
+ if (LEB)
+ encodeULEB128(U, OS);
+ else
+ writeInteger(U, OS, DebugInfo.IsLittleEndian);
+ }
+
+ void onValue(const int64_t S, const bool LEB = false) override {
+ if (LEB)
+ encodeSLEB128(S, OS);
+ else
+ writeInteger(S, OS, DebugInfo.IsLittleEndian);
+ }
+
+ void onValue(const StringRef String) override {
+ OS.write(String.data(), String.size());
+ OS.write('\0');
+ }
+
+ void onValue(const MemoryBufferRef MBR) override {
+ OS.write(MBR.getBufferStart(), MBR.getBufferSize());
+ }
+
+public:
+ DumpVisitor(const DWARFYAML::Data &DI, raw_ostream &Out)
+ : DWARFYAML::ConstVisitor(DI), OS(Out) {}
+};
+
+void DWARFYAML::EmitDebugInfo(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ DumpVisitor Visitor(DI, OS);
+ Visitor.traverseDebugInfo();
+}
+
+static void EmitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
+ OS.write(File.Name.data(), File.Name.size());
+ OS.write('\0');
+ encodeULEB128(File.DirIdx, OS);
+ encodeULEB128(File.ModTime, OS);
+ encodeULEB128(File.Length, OS);
+}
+
+void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ for (const auto &LineTable : DI.DebugLines) {
+ writeInitialLength(LineTable.Length, OS, DI.IsLittleEndian);
+ uint64_t SizeOfPrologueLength = LineTable.Length.isDWARF64() ? 8 : 4;
+ writeInteger((uint16_t)LineTable.Version, OS, DI.IsLittleEndian);
+ writeVariableSizedInteger(LineTable.PrologueLength, SizeOfPrologueLength,
+ OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.MinInstLength, OS, DI.IsLittleEndian);
+ if (LineTable.Version >= 4)
+ writeInteger((uint8_t)LineTable.MaxOpsPerInst, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.DefaultIsStmt, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.LineBase, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.LineRange, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)LineTable.OpcodeBase, OS, DI.IsLittleEndian);
+
+ for (auto OpcodeLength : LineTable.StandardOpcodeLengths)
+ writeInteger((uint8_t)OpcodeLength, OS, DI.IsLittleEndian);
+
+ for (auto IncludeDir : LineTable.IncludeDirs) {
+ OS.write(IncludeDir.data(), IncludeDir.size());
+ OS.write('\0');
+ }
+ OS.write('\0');
+
+ for (auto File : LineTable.Files)
+ EmitFileEntry(OS, File);
+ OS.write('\0');
+
+ for (auto Op : LineTable.Opcodes) {
+ writeInteger((uint8_t)Op.Opcode, OS, DI.IsLittleEndian);
+ if (Op.Opcode == 0) {
+ encodeULEB128(Op.ExtLen, OS);
+ writeInteger((uint8_t)Op.SubOpcode, OS, DI.IsLittleEndian);
+ switch (Op.SubOpcode) {
+ case dwarf::DW_LNE_set_address:
+ case dwarf::DW_LNE_set_discriminator:
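+ // The operand width is taken from the first compile unit's address size;
+ // this assumes DI.CompileUnits is non-empty when these opcodes appear.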
+ writeVariableSizedInteger(Op.Data, DI.CompileUnits[0].AddrSize, OS,
+ DI.IsLittleEndian);
+ break;
+ case dwarf::DW_LNE_define_file:
+ EmitFileEntry(OS, Op.FileEntry);
+ break;
+ case dwarf::DW_LNE_end_sequence:
+ break;
+ default:
+ for (auto OpByte : Op.UnknownOpcodeData)
+ writeInteger((uint8_t)OpByte, OS, DI.IsLittleEndian);
+ }
+ } else if (Op.Opcode < LineTable.OpcodeBase) {
+ switch (Op.Opcode) {
+ case dwarf::DW_LNS_copy:
+ case dwarf::DW_LNS_negate_stmt:
+ case dwarf::DW_LNS_set_basic_block:
+ case dwarf::DW_LNS_const_add_pc:
+ case dwarf::DW_LNS_set_prologue_end:
+ case dwarf::DW_LNS_set_epilogue_begin:
+ break;
+
+ case dwarf::DW_LNS_advance_pc:
+ case dwarf::DW_LNS_set_file:
+ case dwarf::DW_LNS_set_column:
+ case dwarf::DW_LNS_set_isa:
+ encodeULEB128(Op.Data, OS);
+ break;
+
+ case dwarf::DW_LNS_advance_line:
+ encodeSLEB128(Op.SData, OS);
+ break;
+
+ case dwarf::DW_LNS_fixed_advance_pc:
+ writeInteger((uint16_t)Op.Data, OS, DI.IsLittleEndian);
+ break;
+
+ default:
+ for (auto OpData : Op.StandardOpcodeData) {
+ encodeULEB128(OpData, OS);
+ }
+ }
+ }
+ }
+ }
+}
+
+using EmitFuncType = void (*)(raw_ostream &, const DWARFYAML::Data &);
+
+static void
+EmitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc,
+ StringRef Sec,
+ StringMap<std::unique_ptr<MemoryBuffer>> &OutputBuffers) {
+ std::string Data;
+ raw_string_ostream DebugInfoStream(Data);
+ EmitFunc(DebugInfoStream, DI);
+ DebugInfoStream.flush();
+ if (!Data.empty())
+ OutputBuffers[Sec] = MemoryBuffer::getMemBufferCopy(Data);
+}
+
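+// Typical use (a sketch, assuming a yaml2obj-style caller):
+//   auto Sections = DWARFYAML::EmitDebugSections(Yaml, /*IsLittleEndian=*/true);
+//   if (!Sections)
+//     return Sections.takeError();
+//   MemoryBuffer &Info = *(*Sections)["debug_info"];
+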
+Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
+DWARFYAML::EmitDebugSections(StringRef YAMLString,
+ bool IsLittleEndian) {
+ StringMap<std::unique_ptr<MemoryBuffer>> DebugSections;
+
+ yaml::Input YIn(YAMLString);
+
+ DWARFYAML::Data DI;
+ DI.IsLittleEndian = IsLittleEndian;
+ YIn >> DI;
+ if (YIn.error())
+ return errorCodeToError(YIn.error());
+
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugInfo, "debug_info",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugLine, "debug_line",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugStr, "debug_str",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugAbbrev, "debug_abbrev",
+ DebugSections);
+ EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugAranges, "debug_aranges",
+ DebugSections);
+ return std::move(DebugSections);
+}
diff --git a/contrib/llvm/lib/ObjectYAML/DWARFVisitor.cpp b/contrib/llvm/lib/ObjectYAML/DWARFVisitor.cpp
new file mode 100644
index 0000000..36a9f76
--- /dev/null
+++ b/contrib/llvm/lib/ObjectYAML/DWARFVisitor.cpp
@@ -0,0 +1,178 @@
+//===--- DWARFVisitor.cpp ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the DWARFYAML visitors, which walk the compile units,
+// DIEs, and attribute form values of a YAML-described DWARF section.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFVisitor.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
+
+using namespace llvm;
+
+template <typename T>
+void DWARFYAML::VisitorImpl<T>::onVariableSizeValue(uint64_t U, unsigned Size) {
+ switch (Size) {
+ case 8:
+ onValue((uint64_t)U);
+ break;
+ case 4:
+ onValue((uint32_t)U);
+ break;
+ case 2:
+ onValue((uint16_t)U);
+ break;
+ case 1:
+ onValue((uint8_t)U);
+ break;
+ default:
+ llvm_unreachable("Invalid integer write size.");
+ }
+}
+
+static unsigned getOffsetSize(const DWARFYAML::Unit &Unit) {
+ return Unit.Length.isDWARF64() ? 8 : 4;
+}
+
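+// DWARF v2 defined DW_FORM_ref_addr operands to be address-sized; v3 and
+// later use the offset size, which is what the version check below encodes.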
+static unsigned getRefSize(const DWARFYAML::Unit &Unit) {
+ if (Unit.Version == 2)
+ return Unit.AddrSize;
+ return getOffsetSize(Unit);
+}
+
+template <typename T> void DWARFYAML::VisitorImpl<T>::traverseDebugInfo() {
+ for (auto &Unit : DebugInfo.CompileUnits) {
+ onStartCompileUnit(Unit);
+ if (Unit.Entries.empty()) {
+ onEndCompileUnit(Unit);
+ continue;
+ }
+ auto FirstAbbrevCode = Unit.Entries[0].AbbrCode;
+
+ for (auto &Entry : Unit.Entries) {
+ onStartDIE(Unit, Entry);
+ if (Entry.AbbrCode == 0u)
+ continue;
+ auto &Abbrev = DebugInfo.AbbrevDecls[Entry.AbbrCode - FirstAbbrevCode];
+ auto FormVal = Entry.Values.begin();
+ auto AbbrForm = Abbrev.Attributes.begin();
+ for (;
+ FormVal != Entry.Values.end() && AbbrForm != Abbrev.Attributes.end();
+ ++FormVal, ++AbbrForm) {
+ onForm(*AbbrForm, *FormVal);
+ dwarf::Form Form = AbbrForm->Form;
+ bool Indirect;
+ do {
+ Indirect = false;
+ switch (Form) {
+ case dwarf::DW_FORM_addr:
+ onVariableSizeValue(FormVal->Value, Unit.AddrSize);
+ break;
+ case dwarf::DW_FORM_ref_addr:
+ onVariableSizeValue(FormVal->Value, getRefSize(Unit));
+ break;
+ case dwarf::DW_FORM_exprloc:
+ case dwarf::DW_FORM_block:
+ onValue((uint64_t)FormVal->BlockData.size(), true);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ case dwarf::DW_FORM_block1: {
+ auto writeSize = FormVal->BlockData.size();
+ onValue((uint8_t)writeSize);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ }
+ case dwarf::DW_FORM_block2: {
+ auto writeSize = FormVal->BlockData.size();
+ onValue((uint16_t)writeSize);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ }
+ case dwarf::DW_FORM_block4: {
+ auto writeSize = FormVal->BlockData.size();
+ onValue((uint32_t)writeSize);
+ onValue(
+ MemoryBufferRef(StringRef((const char *)&FormVal->BlockData[0],
+ FormVal->BlockData.size()),
+ ""));
+ break;
+ }
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_flag:
+ case dwarf::DW_FORM_strx1:
+ case dwarf::DW_FORM_addrx1:
+ onValue((uint8_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_strx2:
+ case dwarf::DW_FORM_addrx2:
+ onValue((uint16_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_ref_sup4:
+ case dwarf::DW_FORM_strx4:
+ case dwarf::DW_FORM_addrx4:
+ onValue((uint32_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_ref8:
+ case dwarf::DW_FORM_ref_sup8:
+ onValue((uint64_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_sdata:
+ onValue((int64_t)FormVal->Value, true);
+ break;
+ case dwarf::DW_FORM_udata:
+ case dwarf::DW_FORM_ref_udata:
+ onValue((uint64_t)FormVal->Value, true);
+ break;
+ case dwarf::DW_FORM_string:
+ onValue(FormVal->CStr);
+ break;
+ case dwarf::DW_FORM_indirect:
+ onValue((uint64_t)FormVal->Value, true);
+ Indirect = true;
+ Form = static_cast<dwarf::Form>((uint64_t)FormVal->Value);
+ ++FormVal;
+ break;
+ case dwarf::DW_FORM_strp:
+ case dwarf::DW_FORM_sec_offset:
+ case dwarf::DW_FORM_GNU_ref_alt:
+ case dwarf::DW_FORM_GNU_strp_alt:
+ case dwarf::DW_FORM_line_strp:
+ case dwarf::DW_FORM_strp_sup:
+ onVariableSizeValue(FormVal->Value, getOffsetSize(Unit));
+ break;
+ case dwarf::DW_FORM_ref_sig8:
+ onValue((uint64_t)FormVal->Value);
+ break;
+ case dwarf::DW_FORM_GNU_addr_index:
+ case dwarf::DW_FORM_GNU_str_index:
+ onValue((uint64_t)FormVal->Value, true);
+ break;
+ default:
+ break;
+ }
+ } while (Indirect);
+ }
+ onEndDIE(Unit, Entry);
+ }
+ onEndCompileUnit(Unit);
+ }
+}
+
+// Explicitly instantiate the two template expansions.
+template class DWARFYAML::VisitorImpl<DWARFYAML::Data>;
+template class DWARFYAML::VisitorImpl<const DWARFYAML::Data>;
diff --git a/contrib/llvm/lib/ObjectYAML/DWARFVisitor.h b/contrib/llvm/lib/ObjectYAML/DWARFVisitor.h
new file mode 100644
index 0000000..81ef412
--- /dev/null
+++ b/contrib/llvm/lib/ObjectYAML/DWARFVisitor.h
@@ -0,0 +1,97 @@
+//===--- DWARFVisitor.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the DWARFYAML preorder visitor interface used when
+// converting between YAML and DWARF binary data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECTYAML_DWARFVISITOR_H
+#define LLVM_OBJECTYAML_DWARFVISITOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+
+namespace DWARFYAML {
+
+struct Data;
+struct Unit;
+struct Entry;
+struct FormValue;
+struct AttributeAbbrev;
+
+/// \brief A class that visits DWARFYAML compile units and DIEs in preorder.
+///
+/// Extensions of this class can either maintain const or non-const references
+/// to the DWARFYAML::Data object.
+template <typename T> class VisitorImpl {
+protected:
+ T &DebugInfo;
+
+ /// Visitor Functions
+ /// @{
+ virtual void onStartCompileUnit(Unit &CU) {}
+ virtual void onEndCompileUnit(Unit &CU) {}
+ virtual void onStartDIE(Unit &CU, Entry &DIE) {}
+ virtual void onEndDIE(Unit &CU, Entry &DIE) {}
+ virtual void onForm(AttributeAbbrev &AttAbbrev, FormValue &Value) {}
+ /// @}
+
+ /// Const Visitor Functions
+ /// @{
+ virtual void onStartCompileUnit(const Unit &CU) {}
+ virtual void onEndCompileUnit(const Unit &CU) {}
+ virtual void onStartDIE(const Unit &CU, const Entry &DIE) {}
+ virtual void onEndDIE(const Unit &CU, const Entry &DIE) {}
+ virtual void onForm(const AttributeAbbrev &AttAbbrev,
+ const FormValue &Value) {}
+ /// @}
+
+ /// Value visitors
+ /// @{
+ virtual void onValue(const uint8_t U) {}
+ virtual void onValue(const uint16_t U) {}
+ virtual void onValue(const uint32_t U) {}
+ virtual void onValue(const uint64_t U, const bool LEB = false) {}
+ virtual void onValue(const int64_t S, const bool LEB = false) {}
+ virtual void onValue(const StringRef String) {}
+ virtual void onValue(const MemoryBufferRef MBR) {}
+ /// @}
+
+public:
+ VisitorImpl(T &DI) : DebugInfo(DI) {}
+
+ virtual ~VisitorImpl() = default;
+
+ void traverseDebugInfo();
+
+private:
+ void onVariableSizeValue(uint64_t U, unsigned Size);
+};
+
+// The visitor instantiations are declared extern here and instantiated
+// explicitly in the .cpp file; this keeps them from being instantiated in
+// every compilation unit that uses the visitors.
+extern template class VisitorImpl<DWARFYAML::Data>;
+extern template class VisitorImpl<const DWARFYAML::Data>;
+
+class Visitor : public VisitorImpl<Data> {
+public:
+ Visitor(Data &DI) : VisitorImpl<Data>(DI) {}
+};
+
+class ConstVisitor : public VisitorImpl<const Data> {
+public:
+ ConstVisitor(const Data &DI) : VisitorImpl<const Data>(DI) {}
+};
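+
+// Example (illustrative): a client subclasses one of these wrappers and
+// overrides only the hooks it needs, e.g. to count DIEs:
+//   struct DIECounter : ConstVisitor {
+//     unsigned Count = 0;
+//     DIECounter(const Data &DI) : ConstVisitor(DI) {}
+//     void onStartDIE(const Unit &, const Entry &) override { ++Count; }
+//   };
+// then drives it with traverseDebugInfo().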
+
+} // namespace DWARFYAML
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/ObjectYAML/DWARFYAML.cpp b/contrib/llvm/lib/ObjectYAML/DWARFYAML.cpp
index 014e63f..d6c09e1 100644
--- a/contrib/llvm/lib/ObjectYAML/DWARFYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/DWARFYAML.cpp
@@ -54,6 +54,8 @@ void MappingTraits<DWARFYAML::AttributeAbbrev>::mapping(
IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev) {
IO.mapRequired("Attribute", AttAbbrev.Attribute);
IO.mapRequired("Form", AttAbbrev.Form);
+ if (AttAbbrev.Form == dwarf::DW_FORM_implicit_const)
+ IO.mapRequired("Value", AttAbbrev.Value);
}
void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping(
@@ -97,6 +99,8 @@ void MappingTraits<DWARFYAML::PubSection>::mapping(
void MappingTraits<DWARFYAML::Unit>::mapping(IO &IO, DWARFYAML::Unit &Unit) {
IO.mapRequired("Length", Unit.Length);
IO.mapRequired("Version", Unit.Version);
+ if (Unit.Version >= 5)
+ IO.mapRequired("UnitType", Unit.Type);
IO.mapRequired("AbbrOffset", Unit.AbbrOffset);
IO.mapRequired("AddrSize", Unit.AddrSize);
IO.mapOptional("Entries", Unit.Entries);
@@ -144,9 +148,7 @@ void MappingTraits<DWARFYAML::LineTableOpcode>::mapping(
void MappingTraits<DWARFYAML::LineTable>::mapping(
IO &IO, DWARFYAML::LineTable &LineTable) {
- IO.mapRequired("TotalLength", LineTable.TotalLength);
- if (LineTable.TotalLength == UINT32_MAX)
- IO.mapRequired("TotalLength64", LineTable.TotalLength64);
+ IO.mapRequired("Length", LineTable.Length);
IO.mapRequired("Version", LineTable.Version);
IO.mapRequired("PrologueLength", LineTable.PrologueLength);
IO.mapRequired("MinInstLength", LineTable.MinInstLength);
@@ -162,6 +164,13 @@ void MappingTraits<DWARFYAML::LineTable>::mapping(
IO.mapRequired("Opcodes", LineTable.Opcodes);
}
-} // namespace llvm::yaml
+void MappingTraits<DWARFYAML::InitialLength>::mapping(
+ IO &IO, DWARFYAML::InitialLength &InitialLength) {
+ IO.mapRequired("TotalLength", InitialLength.TotalLength);
+ if (InitialLength.isDWARF64())
+ IO.mapRequired("TotalLength64", InitialLength.TotalLength64);
+}
+
+} // end namespace yaml
-} // namespace llvm
+} // end namespace llvm
diff --git a/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp b/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp
index fe9af9f..39741da 100644
--- a/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -12,240 +12,244 @@
//===----------------------------------------------------------------------===//
#include "llvm/ObjectYAML/ELFYAML.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MipsABIFlags.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <cassert>
+#include <cstdint>
namespace llvm {
-ELFYAML::Section::~Section() {}
+ELFYAML::Section::~Section() = default;
namespace yaml {
-void
-ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(IO &IO,
- ELFYAML::ELF_ET &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(ET_NONE)
- ECase(ET_REL)
- ECase(ET_EXEC)
- ECase(ET_DYN)
- ECase(ET_CORE)
+void ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(
+ IO &IO, ELFYAML::ELF_ET &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(ET_NONE);
+ ECase(ET_REL);
+ ECase(ET_EXEC);
+ ECase(ET_DYN);
+ ECase(ET_CORE);
#undef ECase
IO.enumFallback<Hex16>(Value);
}
-void
-ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(IO &IO,
- ELFYAML::ELF_EM &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(EM_NONE)
- ECase(EM_M32)
- ECase(EM_SPARC)
- ECase(EM_386)
- ECase(EM_68K)
- ECase(EM_88K)
- ECase(EM_IAMCU)
- ECase(EM_860)
- ECase(EM_MIPS)
- ECase(EM_S370)
- ECase(EM_MIPS_RS3_LE)
- ECase(EM_PARISC)
- ECase(EM_VPP500)
- ECase(EM_SPARC32PLUS)
- ECase(EM_960)
- ECase(EM_PPC)
- ECase(EM_PPC64)
- ECase(EM_S390)
- ECase(EM_SPU)
- ECase(EM_V800)
- ECase(EM_FR20)
- ECase(EM_RH32)
- ECase(EM_RCE)
- ECase(EM_ARM)
- ECase(EM_ALPHA)
- ECase(EM_SH)
- ECase(EM_SPARCV9)
- ECase(EM_TRICORE)
- ECase(EM_ARC)
- ECase(EM_H8_300)
- ECase(EM_H8_300H)
- ECase(EM_H8S)
- ECase(EM_H8_500)
- ECase(EM_IA_64)
- ECase(EM_MIPS_X)
- ECase(EM_COLDFIRE)
- ECase(EM_68HC12)
- ECase(EM_MMA)
- ECase(EM_PCP)
- ECase(EM_NCPU)
- ECase(EM_NDR1)
- ECase(EM_STARCORE)
- ECase(EM_ME16)
- ECase(EM_ST100)
- ECase(EM_TINYJ)
- ECase(EM_X86_64)
- ECase(EM_PDSP)
- ECase(EM_PDP10)
- ECase(EM_PDP11)
- ECase(EM_FX66)
- ECase(EM_ST9PLUS)
- ECase(EM_ST7)
- ECase(EM_68HC16)
- ECase(EM_68HC11)
- ECase(EM_68HC08)
- ECase(EM_68HC05)
- ECase(EM_SVX)
- ECase(EM_ST19)
- ECase(EM_VAX)
- ECase(EM_CRIS)
- ECase(EM_JAVELIN)
- ECase(EM_FIREPATH)
- ECase(EM_ZSP)
- ECase(EM_MMIX)
- ECase(EM_HUANY)
- ECase(EM_PRISM)
- ECase(EM_AVR)
- ECase(EM_FR30)
- ECase(EM_D10V)
- ECase(EM_D30V)
- ECase(EM_V850)
- ECase(EM_M32R)
- ECase(EM_MN10300)
- ECase(EM_MN10200)
- ECase(EM_PJ)
- ECase(EM_OPENRISC)
- ECase(EM_ARC_COMPACT)
- ECase(EM_XTENSA)
- ECase(EM_VIDEOCORE)
- ECase(EM_TMM_GPP)
- ECase(EM_NS32K)
- ECase(EM_TPC)
- ECase(EM_SNP1K)
- ECase(EM_ST200)
- ECase(EM_IP2K)
- ECase(EM_MAX)
- ECase(EM_CR)
- ECase(EM_F2MC16)
- ECase(EM_MSP430)
- ECase(EM_BLACKFIN)
- ECase(EM_SE_C33)
- ECase(EM_SEP)
- ECase(EM_ARCA)
- ECase(EM_UNICORE)
- ECase(EM_EXCESS)
- ECase(EM_DXP)
- ECase(EM_ALTERA_NIOS2)
- ECase(EM_CRX)
- ECase(EM_XGATE)
- ECase(EM_C166)
- ECase(EM_M16C)
- ECase(EM_DSPIC30F)
- ECase(EM_CE)
- ECase(EM_M32C)
- ECase(EM_TSK3000)
- ECase(EM_RS08)
- ECase(EM_SHARC)
- ECase(EM_ECOG2)
- ECase(EM_SCORE7)
- ECase(EM_DSP24)
- ECase(EM_VIDEOCORE3)
- ECase(EM_LATTICEMICO32)
- ECase(EM_SE_C17)
- ECase(EM_TI_C6000)
- ECase(EM_TI_C2000)
- ECase(EM_TI_C5500)
- ECase(EM_MMDSP_PLUS)
- ECase(EM_CYPRESS_M8C)
- ECase(EM_R32C)
- ECase(EM_TRIMEDIA)
- ECase(EM_HEXAGON)
- ECase(EM_8051)
- ECase(EM_STXP7X)
- ECase(EM_NDS32)
- ECase(EM_ECOG1)
- ECase(EM_ECOG1X)
- ECase(EM_MAXQ30)
- ECase(EM_XIMO16)
- ECase(EM_MANIK)
- ECase(EM_CRAYNV2)
- ECase(EM_RX)
- ECase(EM_METAG)
- ECase(EM_MCST_ELBRUS)
- ECase(EM_ECOG16)
- ECase(EM_CR16)
- ECase(EM_ETPU)
- ECase(EM_SLE9X)
- ECase(EM_L10M)
- ECase(EM_K10M)
- ECase(EM_AARCH64)
- ECase(EM_AVR32)
- ECase(EM_STM8)
- ECase(EM_TILE64)
- ECase(EM_TILEPRO)
- ECase(EM_CUDA)
- ECase(EM_TILEGX)
- ECase(EM_CLOUDSHIELD)
- ECase(EM_COREA_1ST)
- ECase(EM_COREA_2ND)
- ECase(EM_ARC_COMPACT2)
- ECase(EM_OPEN8)
- ECase(EM_RL78)
- ECase(EM_VIDEOCORE5)
- ECase(EM_78KOR)
- ECase(EM_56800EX)
- ECase(EM_AMDGPU)
- ECase(EM_RISCV)
- ECase(EM_LANAI)
- ECase(EM_BPF)
+void ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(
+ IO &IO, ELFYAML::ELF_EM &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(EM_NONE);
+ ECase(EM_M32);
+ ECase(EM_SPARC);
+ ECase(EM_386);
+ ECase(EM_68K);
+ ECase(EM_88K);
+ ECase(EM_IAMCU);
+ ECase(EM_860);
+ ECase(EM_MIPS);
+ ECase(EM_S370);
+ ECase(EM_MIPS_RS3_LE);
+ ECase(EM_PARISC);
+ ECase(EM_VPP500);
+ ECase(EM_SPARC32PLUS);
+ ECase(EM_960);
+ ECase(EM_PPC);
+ ECase(EM_PPC64);
+ ECase(EM_S390);
+ ECase(EM_SPU);
+ ECase(EM_V800);
+ ECase(EM_FR20);
+ ECase(EM_RH32);
+ ECase(EM_RCE);
+ ECase(EM_ARM);
+ ECase(EM_ALPHA);
+ ECase(EM_SH);
+ ECase(EM_SPARCV9);
+ ECase(EM_TRICORE);
+ ECase(EM_ARC);
+ ECase(EM_H8_300);
+ ECase(EM_H8_300H);
+ ECase(EM_H8S);
+ ECase(EM_H8_500);
+ ECase(EM_IA_64);
+ ECase(EM_MIPS_X);
+ ECase(EM_COLDFIRE);
+ ECase(EM_68HC12);
+ ECase(EM_MMA);
+ ECase(EM_PCP);
+ ECase(EM_NCPU);
+ ECase(EM_NDR1);
+ ECase(EM_STARCORE);
+ ECase(EM_ME16);
+ ECase(EM_ST100);
+ ECase(EM_TINYJ);
+ ECase(EM_X86_64);
+ ECase(EM_PDSP);
+ ECase(EM_PDP10);
+ ECase(EM_PDP11);
+ ECase(EM_FX66);
+ ECase(EM_ST9PLUS);
+ ECase(EM_ST7);
+ ECase(EM_68HC16);
+ ECase(EM_68HC11);
+ ECase(EM_68HC08);
+ ECase(EM_68HC05);
+ ECase(EM_SVX);
+ ECase(EM_ST19);
+ ECase(EM_VAX);
+ ECase(EM_CRIS);
+ ECase(EM_JAVELIN);
+ ECase(EM_FIREPATH);
+ ECase(EM_ZSP);
+ ECase(EM_MMIX);
+ ECase(EM_HUANY);
+ ECase(EM_PRISM);
+ ECase(EM_AVR);
+ ECase(EM_FR30);
+ ECase(EM_D10V);
+ ECase(EM_D30V);
+ ECase(EM_V850);
+ ECase(EM_M32R);
+ ECase(EM_MN10300);
+ ECase(EM_MN10200);
+ ECase(EM_PJ);
+ ECase(EM_OPENRISC);
+ ECase(EM_ARC_COMPACT);
+ ECase(EM_XTENSA);
+ ECase(EM_VIDEOCORE);
+ ECase(EM_TMM_GPP);
+ ECase(EM_NS32K);
+ ECase(EM_TPC);
+ ECase(EM_SNP1K);
+ ECase(EM_ST200);
+ ECase(EM_IP2K);
+ ECase(EM_MAX);
+ ECase(EM_CR);
+ ECase(EM_F2MC16);
+ ECase(EM_MSP430);
+ ECase(EM_BLACKFIN);
+ ECase(EM_SE_C33);
+ ECase(EM_SEP);
+ ECase(EM_ARCA);
+ ECase(EM_UNICORE);
+ ECase(EM_EXCESS);
+ ECase(EM_DXP);
+ ECase(EM_ALTERA_NIOS2);
+ ECase(EM_CRX);
+ ECase(EM_XGATE);
+ ECase(EM_C166);
+ ECase(EM_M16C);
+ ECase(EM_DSPIC30F);
+ ECase(EM_CE);
+ ECase(EM_M32C);
+ ECase(EM_TSK3000);
+ ECase(EM_RS08);
+ ECase(EM_SHARC);
+ ECase(EM_ECOG2);
+ ECase(EM_SCORE7);
+ ECase(EM_DSP24);
+ ECase(EM_VIDEOCORE3);
+ ECase(EM_LATTICEMICO32);
+ ECase(EM_SE_C17);
+ ECase(EM_TI_C6000);
+ ECase(EM_TI_C2000);
+ ECase(EM_TI_C5500);
+ ECase(EM_MMDSP_PLUS);
+ ECase(EM_CYPRESS_M8C);
+ ECase(EM_R32C);
+ ECase(EM_TRIMEDIA);
+ ECase(EM_HEXAGON);
+ ECase(EM_8051);
+ ECase(EM_STXP7X);
+ ECase(EM_NDS32);
+ ECase(EM_ECOG1);
+ ECase(EM_ECOG1X);
+ ECase(EM_MAXQ30);
+ ECase(EM_XIMO16);
+ ECase(EM_MANIK);
+ ECase(EM_CRAYNV2);
+ ECase(EM_RX);
+ ECase(EM_METAG);
+ ECase(EM_MCST_ELBRUS);
+ ECase(EM_ECOG16);
+ ECase(EM_CR16);
+ ECase(EM_ETPU);
+ ECase(EM_SLE9X);
+ ECase(EM_L10M);
+ ECase(EM_K10M);
+ ECase(EM_AARCH64);
+ ECase(EM_AVR32);
+ ECase(EM_STM8);
+ ECase(EM_TILE64);
+ ECase(EM_TILEPRO);
+ ECase(EM_CUDA);
+ ECase(EM_TILEGX);
+ ECase(EM_CLOUDSHIELD);
+ ECase(EM_COREA_1ST);
+ ECase(EM_COREA_2ND);
+ ECase(EM_ARC_COMPACT2);
+ ECase(EM_OPEN8);
+ ECase(EM_RL78);
+ ECase(EM_VIDEOCORE5);
+ ECase(EM_78KOR);
+ ECase(EM_56800EX);
+ ECase(EM_AMDGPU);
+ ECase(EM_RISCV);
+ ECase(EM_LANAI);
+ ECase(EM_BPF);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::ELF_ELFCLASS>::enumeration(
IO &IO, ELFYAML::ELF_ELFCLASS &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
// Since the semantics of ELFCLASSNONE is "invalid", just don't accept it
// here.
- ECase(ELFCLASS32)
- ECase(ELFCLASS64)
+ ECase(ELFCLASS32);
+ ECase(ELFCLASS64);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::ELF_ELFDATA>::enumeration(
IO &IO, ELFYAML::ELF_ELFDATA &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
// Since the semantics of ELFDATANONE is "invalid", just don't accept it
// here.
- ECase(ELFDATA2LSB)
- ECase(ELFDATA2MSB)
+ ECase(ELFDATA2LSB);
+ ECase(ELFDATA2MSB);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::ELF_ELFOSABI>::enumeration(
IO &IO, ELFYAML::ELF_ELFOSABI &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(ELFOSABI_NONE)
- ECase(ELFOSABI_HPUX)
- ECase(ELFOSABI_NETBSD)
- ECase(ELFOSABI_GNU)
- ECase(ELFOSABI_GNU)
- ECase(ELFOSABI_HURD)
- ECase(ELFOSABI_SOLARIS)
- ECase(ELFOSABI_AIX)
- ECase(ELFOSABI_IRIX)
- ECase(ELFOSABI_FREEBSD)
- ECase(ELFOSABI_TRU64)
- ECase(ELFOSABI_MODESTO)
- ECase(ELFOSABI_OPENBSD)
- ECase(ELFOSABI_OPENVMS)
- ECase(ELFOSABI_NSK)
- ECase(ELFOSABI_AROS)
- ECase(ELFOSABI_FENIXOS)
- ECase(ELFOSABI_CLOUDABI)
- ECase(ELFOSABI_C6000_ELFABI)
- ECase(ELFOSABI_AMDGPU_HSA)
- ECase(ELFOSABI_C6000_LINUX)
- ECase(ELFOSABI_ARM)
- ECase(ELFOSABI_STANDALONE)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(ELFOSABI_NONE);
+ ECase(ELFOSABI_HPUX);
+ ECase(ELFOSABI_NETBSD);
+ ECase(ELFOSABI_GNU);
+ ECase(ELFOSABI_GNU);
+ ECase(ELFOSABI_HURD);
+ ECase(ELFOSABI_SOLARIS);
+ ECase(ELFOSABI_AIX);
+ ECase(ELFOSABI_IRIX);
+ ECase(ELFOSABI_FREEBSD);
+ ECase(ELFOSABI_TRU64);
+ ECase(ELFOSABI_MODESTO);
+ ECase(ELFOSABI_OPENBSD);
+ ECase(ELFOSABI_OPENVMS);
+ ECase(ELFOSABI_NSK);
+ ECase(ELFOSABI_AROS);
+ ECase(ELFOSABI_FENIXOS);
+ ECase(ELFOSABI_CLOUDABI);
+ ECase(ELFOSABI_C6000_ELFABI);
+ ECase(ELFOSABI_AMDGPU_HSA);
+ ECase(ELFOSABI_C6000_LINUX);
+ ECase(ELFOSABI_ARM);
+ ECase(ELFOSABI_STANDALONE);
#undef ECase
}
@@ -253,92 +257,92 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
ELFYAML::ELF_EF &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
assert(Object && "The IO context is not initialized");
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
-#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M);
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
+#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M)
switch (Object->Header.Machine) {
case ELF::EM_ARM:
- BCase(EF_ARM_SOFT_FLOAT)
- BCase(EF_ARM_VFP_FLOAT)
- BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK)
- BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK)
+ BCase(EF_ARM_SOFT_FLOAT);
+ BCase(EF_ARM_VFP_FLOAT);
+ BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK);
+ BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK);
break;
case ELF::EM_MIPS:
- BCase(EF_MIPS_NOREORDER)
- BCase(EF_MIPS_PIC)
- BCase(EF_MIPS_CPIC)
- BCase(EF_MIPS_ABI2)
- BCase(EF_MIPS_32BITMODE)
- BCase(EF_MIPS_FP64)
- BCase(EF_MIPS_NAN2008)
- BCase(EF_MIPS_MICROMIPS)
- BCase(EF_MIPS_ARCH_ASE_M16)
- BCase(EF_MIPS_ARCH_ASE_MDMX)
- BCaseMask(EF_MIPS_ABI_O32, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_ABI_O64, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_ABI_EABI32, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_ABI_EABI64, EF_MIPS_ABI)
- BCaseMask(EF_MIPS_MACH_3900, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4010, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4100, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4650, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4120, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_4111, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_SB1, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_OCTEON, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_XLR, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_OCTEON2, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_OCTEON3, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_5400, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_5900, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_5500, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_9000, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_LS2E, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_LS2F, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_MACH_LS3A, EF_MIPS_MACH)
- BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH)
- BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH)
+ BCase(EF_MIPS_NOREORDER);
+ BCase(EF_MIPS_PIC);
+ BCase(EF_MIPS_CPIC);
+ BCase(EF_MIPS_ABI2);
+ BCase(EF_MIPS_32BITMODE);
+ BCase(EF_MIPS_FP64);
+ BCase(EF_MIPS_NAN2008);
+ BCase(EF_MIPS_MICROMIPS);
+ BCase(EF_MIPS_ARCH_ASE_M16);
+ BCase(EF_MIPS_ARCH_ASE_MDMX);
+ BCaseMask(EF_MIPS_ABI_O32, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_ABI_O64, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_ABI_EABI32, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_ABI_EABI64, EF_MIPS_ABI);
+ BCaseMask(EF_MIPS_MACH_3900, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4010, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4100, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4650, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4120, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_4111, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_SB1, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_OCTEON, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_XLR, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_OCTEON2, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_OCTEON3, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_5400, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_5900, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_5500, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_9000, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_LS2E, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_LS2F, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_MACH_LS3A, EF_MIPS_MACH);
+ BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH);
+ BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH);
break;
case ELF::EM_HEXAGON:
- BCase(EF_HEXAGON_MACH_V2)
- BCase(EF_HEXAGON_MACH_V3)
- BCase(EF_HEXAGON_MACH_V4)
- BCase(EF_HEXAGON_MACH_V5)
- BCase(EF_HEXAGON_ISA_V2)
- BCase(EF_HEXAGON_ISA_V3)
- BCase(EF_HEXAGON_ISA_V4)
- BCase(EF_HEXAGON_ISA_V5)
+ BCase(EF_HEXAGON_MACH_V2);
+ BCase(EF_HEXAGON_MACH_V3);
+ BCase(EF_HEXAGON_MACH_V4);
+ BCase(EF_HEXAGON_MACH_V5);
+ BCase(EF_HEXAGON_ISA_V2);
+ BCase(EF_HEXAGON_ISA_V3);
+ BCase(EF_HEXAGON_ISA_V4);
+ BCase(EF_HEXAGON_ISA_V5);
break;
case ELF::EM_AVR:
- BCase(EF_AVR_ARCH_AVR1)
- BCase(EF_AVR_ARCH_AVR2)
- BCase(EF_AVR_ARCH_AVR25)
- BCase(EF_AVR_ARCH_AVR3)
- BCase(EF_AVR_ARCH_AVR31)
- BCase(EF_AVR_ARCH_AVR35)
- BCase(EF_AVR_ARCH_AVR4)
- BCase(EF_AVR_ARCH_AVR51)
- BCase(EF_AVR_ARCH_AVR6)
- BCase(EF_AVR_ARCH_AVRTINY)
- BCase(EF_AVR_ARCH_XMEGA1)
- BCase(EF_AVR_ARCH_XMEGA2)
- BCase(EF_AVR_ARCH_XMEGA3)
- BCase(EF_AVR_ARCH_XMEGA4)
- BCase(EF_AVR_ARCH_XMEGA5)
- BCase(EF_AVR_ARCH_XMEGA6)
- BCase(EF_AVR_ARCH_XMEGA7)
+ BCase(EF_AVR_ARCH_AVR1);
+ BCase(EF_AVR_ARCH_AVR2);
+ BCase(EF_AVR_ARCH_AVR25);
+ BCase(EF_AVR_ARCH_AVR3);
+ BCase(EF_AVR_ARCH_AVR31);
+ BCase(EF_AVR_ARCH_AVR35);
+ BCase(EF_AVR_ARCH_AVR4);
+ BCase(EF_AVR_ARCH_AVR51);
+ BCase(EF_AVR_ARCH_AVR6);
+ BCase(EF_AVR_ARCH_AVRTINY);
+ BCase(EF_AVR_ARCH_XMEGA1);
+ BCase(EF_AVR_ARCH_XMEGA2);
+ BCase(EF_AVR_ARCH_XMEGA3);
+ BCase(EF_AVR_ARCH_XMEGA4);
+ BCase(EF_AVR_ARCH_XMEGA5);
+ BCase(EF_AVR_ARCH_XMEGA6);
+ BCase(EF_AVR_ARCH_XMEGA7);
break;
case ELF::EM_AMDGPU:
case ELF::EM_X86_64:
@@ -354,51 +358,52 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
IO &IO, ELFYAML::ELF_SHT &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
assert(Object && "The IO context is not initialized");
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(SHT_NULL)
- ECase(SHT_PROGBITS)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(SHT_NULL);
+ ECase(SHT_PROGBITS);
// No SHT_SYMTAB. Use the top-level `Symbols` key instead.
// FIXME: Issue a diagnostic with this information.
- ECase(SHT_STRTAB)
- ECase(SHT_RELA)
- ECase(SHT_HASH)
- ECase(SHT_DYNAMIC)
- ECase(SHT_NOTE)
- ECase(SHT_NOBITS)
- ECase(SHT_REL)
- ECase(SHT_SHLIB)
- ECase(SHT_DYNSYM)
- ECase(SHT_INIT_ARRAY)
- ECase(SHT_FINI_ARRAY)
- ECase(SHT_PREINIT_ARRAY)
- ECase(SHT_GROUP)
- ECase(SHT_SYMTAB_SHNDX)
- ECase(SHT_LOOS)
- ECase(SHT_GNU_ATTRIBUTES)
- ECase(SHT_GNU_HASH)
- ECase(SHT_GNU_verdef)
- ECase(SHT_GNU_verneed)
- ECase(SHT_GNU_versym)
- ECase(SHT_HIOS)
- ECase(SHT_LOPROC)
+ ECase(SHT_STRTAB);
+ ECase(SHT_RELA);
+ ECase(SHT_HASH);
+ ECase(SHT_DYNAMIC);
+ ECase(SHT_NOTE);
+ ECase(SHT_NOBITS);
+ ECase(SHT_REL);
+ ECase(SHT_SHLIB);
+ ECase(SHT_DYNSYM);
+ ECase(SHT_INIT_ARRAY);
+ ECase(SHT_FINI_ARRAY);
+ ECase(SHT_PREINIT_ARRAY);
+ ECase(SHT_GROUP);
+ ECase(SHT_SYMTAB_SHNDX);
+ ECase(SHT_LOOS);
+ ECase(SHT_LLVM_ODRTAB);
+ ECase(SHT_GNU_ATTRIBUTES);
+ ECase(SHT_GNU_HASH);
+ ECase(SHT_GNU_verdef);
+ ECase(SHT_GNU_verneed);
+ ECase(SHT_GNU_versym);
+ ECase(SHT_HIOS);
+ ECase(SHT_LOPROC);
switch (Object->Header.Machine) {
case ELF::EM_ARM:
- ECase(SHT_ARM_EXIDX)
- ECase(SHT_ARM_PREEMPTMAP)
- ECase(SHT_ARM_ATTRIBUTES)
- ECase(SHT_ARM_DEBUGOVERLAY)
- ECase(SHT_ARM_OVERLAYSECTION)
+ ECase(SHT_ARM_EXIDX);
+ ECase(SHT_ARM_PREEMPTMAP);
+ ECase(SHT_ARM_ATTRIBUTES);
+ ECase(SHT_ARM_DEBUGOVERLAY);
+ ECase(SHT_ARM_OVERLAYSECTION);
break;
case ELF::EM_HEXAGON:
- ECase(SHT_HEX_ORDERED)
+ ECase(SHT_HEX_ORDERED);
break;
case ELF::EM_X86_64:
- ECase(SHT_X86_64_UNWIND)
+ ECase(SHT_X86_64_UNWIND);
break;
case ELF::EM_MIPS:
- ECase(SHT_MIPS_REGINFO)
- ECase(SHT_MIPS_OPTIONS)
- ECase(SHT_MIPS_ABIFLAGS)
+ ECase(SHT_MIPS_REGINFO);
+ ECase(SHT_MIPS_OPTIONS);
+ ECase(SHT_MIPS_ABIFLAGS);
break;
default:
// Nothing to do.
@@ -410,43 +415,37 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
ELFYAML::ELF_SHF &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
- BCase(SHF_WRITE)
- BCase(SHF_ALLOC)
- BCase(SHF_EXCLUDE)
- BCase(SHF_EXECINSTR)
- BCase(SHF_MERGE)
- BCase(SHF_STRINGS)
- BCase(SHF_INFO_LINK)
- BCase(SHF_LINK_ORDER)
- BCase(SHF_OS_NONCONFORMING)
- BCase(SHF_GROUP)
- BCase(SHF_TLS)
- switch(Object->Header.Machine) {
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
+ BCase(SHF_WRITE);
+ BCase(SHF_ALLOC);
+ BCase(SHF_EXCLUDE);
+ BCase(SHF_EXECINSTR);
+ BCase(SHF_MERGE);
+ BCase(SHF_STRINGS);
+ BCase(SHF_INFO_LINK);
+ BCase(SHF_LINK_ORDER);
+ BCase(SHF_OS_NONCONFORMING);
+ BCase(SHF_GROUP);
+ BCase(SHF_TLS);
+ switch (Object->Header.Machine) {
case ELF::EM_ARM:
- BCase(SHF_ARM_PURECODE)
- break;
- case ELF::EM_AMDGPU:
- BCase(SHF_AMDGPU_HSA_GLOBAL)
- BCase(SHF_AMDGPU_HSA_READONLY)
- BCase(SHF_AMDGPU_HSA_CODE)
- BCase(SHF_AMDGPU_HSA_AGENT)
+ BCase(SHF_ARM_PURECODE);
break;
case ELF::EM_HEXAGON:
- BCase(SHF_HEX_GPREL)
+ BCase(SHF_HEX_GPREL);
break;
case ELF::EM_MIPS:
- BCase(SHF_MIPS_NODUPES)
- BCase(SHF_MIPS_NAMES)
- BCase(SHF_MIPS_LOCAL)
- BCase(SHF_MIPS_NOSTRIP)
- BCase(SHF_MIPS_GPREL)
- BCase(SHF_MIPS_MERGE)
- BCase(SHF_MIPS_ADDR)
- BCase(SHF_MIPS_STRING)
+ BCase(SHF_MIPS_NODUPES);
+ BCase(SHF_MIPS_NAMES);
+ BCase(SHF_MIPS_LOCAL);
+ BCase(SHF_MIPS_NOSTRIP);
+ BCase(SHF_MIPS_GPREL);
+ BCase(SHF_MIPS_MERGE);
+ BCase(SHF_MIPS_ADDR);
+ BCase(SHF_MIPS_STRING);
break;
case ELF::EM_X86_64:
- BCase(SHF_X86_64_LARGE)
+ BCase(SHF_X86_64_LARGE);
break;
default:
// Nothing to do.
@@ -457,25 +456,25 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
void ScalarEnumerationTraits<ELFYAML::ELF_STT>::enumeration(
IO &IO, ELFYAML::ELF_STT &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(STT_NOTYPE)
- ECase(STT_OBJECT)
- ECase(STT_FUNC)
- ECase(STT_SECTION)
- ECase(STT_FILE)
- ECase(STT_COMMON)
- ECase(STT_TLS)
- ECase(STT_GNU_IFUNC)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(STT_NOTYPE);
+ ECase(STT_OBJECT);
+ ECase(STT_FUNC);
+ ECase(STT_SECTION);
+ ECase(STT_FILE);
+ ECase(STT_COMMON);
+ ECase(STT_TLS);
+ ECase(STT_GNU_IFUNC);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::ELF_STV>::enumeration(
IO &IO, ELFYAML::ELF_STV &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(STV_DEFAULT)
- ECase(STV_INTERNAL)
- ECase(STV_HIDDEN)
- ECase(STV_PROTECTED)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(STV_DEFAULT);
+ ECase(STV_INTERNAL);
+ ECase(STV_HIDDEN);
+ ECase(STV_PROTECTED);
#undef ECase
}
@@ -483,13 +482,13 @@ void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
ELFYAML::ELF_STO &Value) {
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
assert(Object && "The IO context is not initialized");
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
switch (Object->Header.Machine) {
case ELF::EM_MIPS:
- BCase(STO_MIPS_OPTIONAL)
- BCase(STO_MIPS_PLT)
- BCase(STO_MIPS_PIC)
- BCase(STO_MIPS_MICROMIPS)
+ BCase(STO_MIPS_OPTIONAL);
+ BCase(STO_MIPS_PLT);
+ BCase(STO_MIPS_PIC);
+ BCase(STO_MIPS_MICROMIPS);
break;
default:
break; // Nothing to do
@@ -500,11 +499,11 @@ void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
void ScalarEnumerationTraits<ELFYAML::ELF_RSS>::enumeration(
IO &IO, ELFYAML::ELF_RSS &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
- ECase(RSS_UNDEF)
- ECase(RSS_GP)
- ECase(RSS_GP0)
- ECase(RSS_LOC)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+ ECase(RSS_UNDEF);
+ ECase(RSS_GP);
+ ECase(RSS_GP0);
+ ECase(RSS_LOC);
#undef ECase
}
@@ -515,89 +514,90 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
#define ELF_RELOC(X, Y) IO.enumCase(Value, #X, ELF::X);
switch (Object->Header.Machine) {
case ELF::EM_X86_64:
-#include "llvm/Support/ELFRelocs/x86_64.def"
+#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
break;
case ELF::EM_MIPS:
-#include "llvm/Support/ELFRelocs/Mips.def"
+#include "llvm/BinaryFormat/ELFRelocs/Mips.def"
break;
case ELF::EM_HEXAGON:
-#include "llvm/Support/ELFRelocs/Hexagon.def"
+#include "llvm/BinaryFormat/ELFRelocs/Hexagon.def"
break;
case ELF::EM_386:
case ELF::EM_IAMCU:
-#include "llvm/Support/ELFRelocs/i386.def"
+#include "llvm/BinaryFormat/ELFRelocs/i386.def"
break;
case ELF::EM_AARCH64:
-#include "llvm/Support/ELFRelocs/AArch64.def"
+#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
break;
case ELF::EM_ARM:
-#include "llvm/Support/ELFRelocs/ARM.def"
+#include "llvm/BinaryFormat/ELFRelocs/ARM.def"
break;
case ELF::EM_RISCV:
-#include "llvm/Support/ELFRelocs/RISCV.def"
+#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
break;
case ELF::EM_LANAI:
-#include "llvm/Support/ELFRelocs/Lanai.def"
+#include "llvm/BinaryFormat/ELFRelocs/Lanai.def"
break;
case ELF::EM_AMDGPU:
-#include "llvm/Support/ELFRelocs/AMDGPU.def"
+#include "llvm/BinaryFormat/ELFRelocs/AMDGPU.def"
break;
case ELF::EM_BPF:
-#include "llvm/Support/ELFRelocs/BPF.def"
+#include "llvm/BinaryFormat/ELFRelocs/BPF.def"
break;
default:
llvm_unreachable("Unsupported architecture");
}
#undef ELF_RELOC
+ IO.enumFallback<Hex32>(Value);
}
void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_REG>::enumeration(
IO &IO, ELFYAML::MIPS_AFL_REG &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X);
- ECase(REG_NONE)
- ECase(REG_32)
- ECase(REG_64)
- ECase(REG_128)
+#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X)
+ ECase(REG_NONE);
+ ECase(REG_32);
+ ECase(REG_64);
+ ECase(REG_128);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::MIPS_ABI_FP>::enumeration(
IO &IO, ELFYAML::MIPS_ABI_FP &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::Val_GNU_MIPS_ABI_##X);
- ECase(FP_ANY)
- ECase(FP_DOUBLE)
- ECase(FP_SINGLE)
- ECase(FP_SOFT)
- ECase(FP_OLD_64)
- ECase(FP_XX)
- ECase(FP_64)
- ECase(FP_64A)
+#define ECase(X) IO.enumCase(Value, #X, Mips::Val_GNU_MIPS_ABI_##X)
+ ECase(FP_ANY);
+ ECase(FP_DOUBLE);
+ ECase(FP_SINGLE);
+ ECase(FP_SOFT);
+ ECase(FP_OLD_64);
+ ECase(FP_XX);
+ ECase(FP_64);
+ ECase(FP_64A);
#undef ECase
}
void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_EXT>::enumeration(
IO &IO, ELFYAML::MIPS_AFL_EXT &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X);
- ECase(EXT_NONE)
- ECase(EXT_XLR)
- ECase(EXT_OCTEON2)
- ECase(EXT_OCTEONP)
- ECase(EXT_LOONGSON_3A)
- ECase(EXT_OCTEON)
- ECase(EXT_5900)
- ECase(EXT_4650)
- ECase(EXT_4010)
- ECase(EXT_4100)
- ECase(EXT_3900)
- ECase(EXT_10000)
- ECase(EXT_SB1)
- ECase(EXT_4111)
- ECase(EXT_4120)
- ECase(EXT_5400)
- ECase(EXT_5500)
- ECase(EXT_LOONGSON_2E)
- ECase(EXT_LOONGSON_2F)
- ECase(EXT_OCTEON3)
+#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X)
+ ECase(EXT_NONE);
+ ECase(EXT_XLR);
+ ECase(EXT_OCTEON2);
+ ECase(EXT_OCTEONP);
+ ECase(EXT_LOONGSON_3A);
+ ECase(EXT_OCTEON);
+ ECase(EXT_5900);
+ ECase(EXT_4650);
+ ECase(EXT_4010);
+ ECase(EXT_4100);
+ ECase(EXT_3900);
+ ECase(EXT_10000);
+ ECase(EXT_SB1);
+ ECase(EXT_4111);
+ ECase(EXT_4120);
+ ECase(EXT_5400);
+ ECase(EXT_5500);
+ ECase(EXT_LOONGSON_2E);
+ ECase(EXT_LOONGSON_2F);
+ ECase(EXT_OCTEON3);
#undef ECase
}
@@ -614,27 +614,27 @@ void ScalarEnumerationTraits<ELFYAML::MIPS_ISA>::enumeration(
void ScalarBitSetTraits<ELFYAML::MIPS_AFL_ASE>::bitset(
IO &IO, ELFYAML::MIPS_AFL_ASE &Value) {
-#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_ASE_##X);
- BCase(DSP)
- BCase(DSPR2)
- BCase(EVA)
- BCase(MCU)
- BCase(MDMX)
- BCase(MIPS3D)
- BCase(MT)
- BCase(SMARTMIPS)
- BCase(VIRT)
- BCase(MSA)
- BCase(MIPS16)
- BCase(MICROMIPS)
- BCase(XPA)
+#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_ASE_##X)
+ BCase(DSP);
+ BCase(DSPR2);
+ BCase(EVA);
+ BCase(MCU);
+ BCase(MDMX);
+ BCase(MIPS3D);
+ BCase(MT);
+ BCase(SMARTMIPS);
+ BCase(VIRT);
+ BCase(MSA);
+ BCase(MIPS16);
+ BCase(MICROMIPS);
+ BCase(XPA);
#undef BCase
}
void ScalarBitSetTraits<ELFYAML::MIPS_AFL_FLAGS1>::bitset(
IO &IO, ELFYAML::MIPS_AFL_FLAGS1 &Value) {
-#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_FLAGS1_##X);
- BCase(ODDSPREG)
+#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_FLAGS1_##X)
+ BCase(ODDSPREG);
#undef BCase
}
@@ -650,6 +650,7 @@ void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO,
}
namespace {
+
struct NormalizedOther {
NormalizedOther(IO &)
: Visibility(ELFYAML::ELF_STV(0)), Other(ELFYAML::ELF_STO(0)) {}
@@ -661,7 +662,8 @@ struct NormalizedOther {
ELFYAML::ELF_STV Visibility;
ELFYAML::ELF_STO Other;
};
-}
+
+} // end anonymous namespace
void MappingTraits<ELFYAML::Symbol>::mapping(IO &IO, ELFYAML::Symbol &Symbol) {
IO.mapOptional("Name", Symbol.Name, StringRef());
@@ -784,6 +786,7 @@ StringRef MappingTraits<std::unique_ptr<ELFYAML::Section>>::validate(
}
namespace {
+
struct NormalizedMips64RelType {
NormalizedMips64RelType(IO &)
: Type(ELFYAML::ELF_REL(ELF::R_MIPS_NONE)),
@@ -804,7 +807,8 @@ struct NormalizedMips64RelType {
ELFYAML::ELF_REL Type3;
ELFYAML::ELF_RSS SpecSym;
};
-}
+
+} // end anonymous namespace
void MappingTraits<ELFYAML::Relocation>::mapping(IO &IO,
ELFYAML::Relocation &Rel) {
@@ -845,4 +849,5 @@ LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_ASE)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_FLAGS1)
} // end namespace yaml
+
} // end namespace llvm
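
One behavioral change buried in the ELF_REL hunk above is the trailing IO.enumFallback<Hex32>(Value): relocation numbers without a symbolic name now round-trip as raw hex instead of being rejected. The same pattern, reduced to a self-contained toy with an invented RelType typedef and made-up case names:

#include "llvm/Support/YAMLTraits.h"

LLVM_YAML_STRONG_TYPEDEF(uint32_t, RelType)

namespace llvm {
namespace yaml {

template <> struct ScalarEnumerationTraits<RelType> {
  static void enumeration(IO &IO, RelType &Value) {
    IO.enumCase(Value, "R_EXAMPLE_NONE", 0u); // invented names
    IO.enumCase(Value, "R_EXAMPLE_32", 1u);
    IO.enumFallback<Hex32>(Value); // anything else serializes as 0x...
  }
};

} // end namespace yaml
} // end namespace llvm
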
diff --git a/contrib/llvm/lib/ObjectYAML/MachOYAML.cpp b/contrib/llvm/lib/ObjectYAML/MachOYAML.cpp
index a033a79..ab452a7 100644
--- a/contrib/llvm/lib/ObjectYAML/MachOYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/MachOYAML.cpp
@@ -12,16 +12,19 @@
//===----------------------------------------------------------------------===//
#include "llvm/ObjectYAML/MachOYAML.h"
-#include "llvm/Support/Casting.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/MachO.h"
-
-#include <string.h> // For memcpy, memset and strnlen.
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
namespace llvm {
-MachOYAML::LoadCommand::~LoadCommand() {}
+MachOYAML::LoadCommand::~LoadCommand() = default;
bool MachOYAML::LinkEditData::isEmpty() const {
return 0 ==
@@ -33,7 +36,7 @@ bool MachOYAML::LinkEditData::isEmpty() const {
namespace yaml {
void ScalarTraits<char_16>::output(const char_16 &Val, void *,
- llvm::raw_ostream &Out) {
+ raw_ostream &Out) {
auto Len = strnlen(&Val[0], 16);
Out << StringRef(&Val[0], Len);
}
@@ -51,8 +54,7 @@ StringRef ScalarTraits<char_16>::input(StringRef Scalar, void *, char_16 &Val) {
bool ScalarTraits<char_16>::mustQuote(StringRef S) { return needsQuotes(S); }
-void ScalarTraits<uuid_t>::output(const uuid_t &Val, void *,
- llvm::raw_ostream &Out) {
+void ScalarTraits<uuid_t>::output(const uuid_t &Val, void *, raw_ostream &Out) {
for (int Idx = 0; Idx < 16; ++Idx) {
Out << format("%02" PRIX32, Val[Idx]);
if (Idx == 3 || Idx == 5 || Idx == 7 || Idx == 9)
@@ -154,7 +156,7 @@ void MappingTraits<MachOYAML::LinkEditData>::mapping(
IO.mapOptional("BindOpcodes", LinkEditData.BindOpcodes);
IO.mapOptional("WeakBindOpcodes", LinkEditData.WeakBindOpcodes);
IO.mapOptional("LazyBindOpcodes", LinkEditData.LazyBindOpcodes);
- if(LinkEditData.ExportTrie.Children.size() > 0 || !IO.outputting())
+ if (!LinkEditData.ExportTrie.Children.empty() || !IO.outputting())
IO.mapOptional("ExportTrie", LinkEditData.ExportTrie);
IO.mapOptional("NameList", LinkEditData.NameList);
IO.mapOptional("StringTable", LinkEditData.StringTable);
@@ -230,6 +232,12 @@ void mapLoadCommandData<MachO::dylinker_command>(
IO.mapOptional("PayloadString", LoadCommand.PayloadString);
}
+template <>
+void mapLoadCommandData<MachO::build_version_command>(
+ IO &IO, MachOYAML::LoadCommand &LoadCommand) {
+ IO.mapOptional("Tools", LoadCommand.Tools);
+}
+
void MappingTraits<MachOYAML::LoadCommand>::mapping(
IO &IO, MachOYAML::LoadCommand &LoadCommand) {
MachO::LoadCommandType TempCmd = static_cast<MachO::LoadCommandType>(
@@ -246,7 +254,7 @@ void MappingTraits<MachOYAML::LoadCommand>::mapping(
break;
switch (LoadCommand.Data.load_command_data.cmd) {
-#include "llvm/Support/MachO.def"
+#include "llvm/BinaryFormat/MachO.def"
}
IO.mapOptional("PayloadBytes", LoadCommand.PayloadBytes);
IO.mapOptional("ZeroPadBytes", LoadCommand.ZeroPadBytes, (uint64_t)0ull);
@@ -282,6 +290,12 @@ void MappingTraits<MachOYAML::Section>::mapping(IO &IO,
IO.mapOptional("reserved3", Section.reserved3);
}
+void MappingTraits<MachO::build_tool_version>::mapping(
+ IO &IO, MachO::build_tool_version &tool) {
+ IO.mapRequired("tool", tool.tool);
+ IO.mapRequired("version", tool.version);
+}
+
void MappingTraits<MachO::dylib>::mapping(IO &IO, MachO::dylib &DylibStruct) {
IO.mapRequired("name", DylibStruct.name);
IO.mapRequired("timestamp", DylibStruct.timestamp);
@@ -296,13 +310,11 @@ void MappingTraits<MachO::dylib_command>::mapping(
void MappingTraits<MachO::dylinker_command>::mapping(
IO &IO, MachO::dylinker_command &LoadCommand) {
-
IO.mapRequired("name", LoadCommand.name);
}
void MappingTraits<MachO::dysymtab_command>::mapping(
IO &IO, MachO::dysymtab_command &LoadCommand) {
-
IO.mapRequired("ilocalsym", LoadCommand.ilocalsym);
IO.mapRequired("nlocalsym", LoadCommand.nlocalsym);
IO.mapRequired("iextdefsym", LoadCommand.iextdefsym);
@@ -325,7 +337,6 @@ void MappingTraits<MachO::dysymtab_command>::mapping(
void MappingTraits<MachO::encryption_info_command>::mapping(
IO &IO, MachO::encryption_info_command &LoadCommand) {
-
IO.mapRequired("cryptoff", LoadCommand.cryptoff);
IO.mapRequired("cryptsize", LoadCommand.cryptsize);
IO.mapRequired("cryptid", LoadCommand.cryptid);
@@ -333,7 +344,6 @@ void MappingTraits<MachO::encryption_info_command>::mapping(
void MappingTraits<MachO::encryption_info_command_64>::mapping(
IO &IO, MachO::encryption_info_command_64 &LoadCommand) {
-
IO.mapRequired("cryptoff", LoadCommand.cryptoff);
IO.mapRequired("cryptsize", LoadCommand.cryptsize);
IO.mapRequired("cryptid", LoadCommand.cryptid);
@@ -342,14 +352,12 @@ void MappingTraits<MachO::encryption_info_command_64>::mapping(
void MappingTraits<MachO::entry_point_command>::mapping(
IO &IO, MachO::entry_point_command &LoadCommand) {
-
IO.mapRequired("entryoff", LoadCommand.entryoff);
IO.mapRequired("stacksize", LoadCommand.stacksize);
}
void MappingTraits<MachO::fvmfile_command>::mapping(
IO &IO, MachO::fvmfile_command &LoadCommand) {
-
IO.mapRequired("name", LoadCommand.name);
IO.mapRequired("header_addr", LoadCommand.header_addr);
}
@@ -362,7 +370,6 @@ void MappingTraits<MachO::fvmlib>::mapping(IO &IO, MachO::fvmlib &FVMLib) {
void MappingTraits<MachO::fvmlib_command>::mapping(
IO &IO, MachO::fvmlib_command &LoadCommand) {
-
IO.mapRequired("fvmlib", LoadCommand.fvmlib);
}
@@ -371,20 +378,17 @@ void MappingTraits<MachO::ident_command>::mapping(
void MappingTraits<MachO::linkedit_data_command>::mapping(
IO &IO, MachO::linkedit_data_command &LoadCommand) {
-
IO.mapRequired("dataoff", LoadCommand.dataoff);
IO.mapRequired("datasize", LoadCommand.datasize);
}
void MappingTraits<MachO::linker_option_command>::mapping(
IO &IO, MachO::linker_option_command &LoadCommand) {
-
IO.mapRequired("count", LoadCommand.count);
}
void MappingTraits<MachO::prebind_cksum_command>::mapping(
IO &IO, MachO::prebind_cksum_command &LoadCommand) {
-
IO.mapRequired("cksum", LoadCommand.cksum);
}
@@ -393,7 +397,6 @@ void MappingTraits<MachO::load_command>::mapping(
void MappingTraits<MachO::prebound_dylib_command>::mapping(
IO &IO, MachO::prebound_dylib_command &LoadCommand) {
-
IO.mapRequired("name", LoadCommand.name);
IO.mapRequired("nmodules", LoadCommand.nmodules);
IO.mapRequired("linked_modules", LoadCommand.linked_modules);
@@ -401,7 +404,6 @@ void MappingTraits<MachO::prebound_dylib_command>::mapping(
void MappingTraits<MachO::routines_command>::mapping(
IO &IO, MachO::routines_command &LoadCommand) {
-
IO.mapRequired("init_address", LoadCommand.init_address);
IO.mapRequired("init_module", LoadCommand.init_module);
IO.mapRequired("reserved1", LoadCommand.reserved1);
@@ -414,7 +416,6 @@ void MappingTraits<MachO::routines_command>::mapping(
void MappingTraits<MachO::routines_command_64>::mapping(
IO &IO, MachO::routines_command_64 &LoadCommand) {
-
IO.mapRequired("init_address", LoadCommand.init_address);
IO.mapRequired("init_module", LoadCommand.init_module);
IO.mapRequired("reserved1", LoadCommand.reserved1);
@@ -427,7 +428,6 @@ void MappingTraits<MachO::routines_command_64>::mapping(
void MappingTraits<MachO::rpath_command>::mapping(
IO &IO, MachO::rpath_command &LoadCommand) {
-
IO.mapRequired("path", LoadCommand.path);
}
@@ -463,7 +463,6 @@ void MappingTraits<MachO::section_64>::mapping(IO &IO,
void MappingTraits<MachO::segment_command>::mapping(
IO &IO, MachO::segment_command &LoadCommand) {
-
IO.mapRequired("segname", LoadCommand.segname);
IO.mapRequired("vmaddr", LoadCommand.vmaddr);
IO.mapRequired("vmsize", LoadCommand.vmsize);
@@ -477,7 +476,6 @@ void MappingTraits<MachO::segment_command>::mapping(
void MappingTraits<MachO::segment_command_64>::mapping(
IO &IO, MachO::segment_command_64 &LoadCommand) {
-
IO.mapRequired("segname", LoadCommand.segname);
IO.mapRequired("vmaddr", LoadCommand.vmaddr);
IO.mapRequired("vmsize", LoadCommand.vmsize);
@@ -491,44 +489,37 @@ void MappingTraits<MachO::segment_command_64>::mapping(
void MappingTraits<MachO::source_version_command>::mapping(
IO &IO, MachO::source_version_command &LoadCommand) {
-
IO.mapRequired("version", LoadCommand.version);
}
void MappingTraits<MachO::sub_client_command>::mapping(
IO &IO, MachO::sub_client_command &LoadCommand) {
-
IO.mapRequired("client", LoadCommand.client);
}
void MappingTraits<MachO::sub_framework_command>::mapping(
IO &IO, MachO::sub_framework_command &LoadCommand) {
-
IO.mapRequired("umbrella", LoadCommand.umbrella);
}
void MappingTraits<MachO::sub_library_command>::mapping(
IO &IO, MachO::sub_library_command &LoadCommand) {
-
IO.mapRequired("sub_library", LoadCommand.sub_library);
}
void MappingTraits<MachO::sub_umbrella_command>::mapping(
IO &IO, MachO::sub_umbrella_command &LoadCommand) {
-
IO.mapRequired("sub_umbrella", LoadCommand.sub_umbrella);
}
void MappingTraits<MachO::symseg_command>::mapping(
IO &IO, MachO::symseg_command &LoadCommand) {
-
IO.mapRequired("offset", LoadCommand.offset);
IO.mapRequired("size", LoadCommand.size);
}
void MappingTraits<MachO::symtab_command>::mapping(
IO &IO, MachO::symtab_command &LoadCommand) {
-
IO.mapRequired("symoff", LoadCommand.symoff);
IO.mapRequired("nsyms", LoadCommand.nsyms);
IO.mapRequired("stroff", LoadCommand.stroff);
@@ -540,24 +531,36 @@ void MappingTraits<MachO::thread_command>::mapping(
void MappingTraits<MachO::twolevel_hints_command>::mapping(
IO &IO, MachO::twolevel_hints_command &LoadCommand) {
-
IO.mapRequired("offset", LoadCommand.offset);
IO.mapRequired("nhints", LoadCommand.nhints);
}
void MappingTraits<MachO::uuid_command>::mapping(
IO &IO, MachO::uuid_command &LoadCommand) {
-
IO.mapRequired("uuid", LoadCommand.uuid);
}
void MappingTraits<MachO::version_min_command>::mapping(
IO &IO, MachO::version_min_command &LoadCommand) {
-
IO.mapRequired("version", LoadCommand.version);
IO.mapRequired("sdk", LoadCommand.sdk);
}
-} // namespace llvm::yaml
+void MappingTraits<MachO::note_command>::mapping(
+ IO &IO, MachO::note_command &LoadCommand) {
+ IO.mapRequired("data_owner", LoadCommand.data_owner);
+ IO.mapRequired("offset", LoadCommand.offset);
+ IO.mapRequired("size", LoadCommand.size);
+}
+
+void MappingTraits<MachO::build_version_command>::mapping(
+ IO &IO, MachO::build_version_command &LoadCommand) {
+ IO.mapRequired("platform", LoadCommand.platform);
+ IO.mapRequired("minos", LoadCommand.minos);
+ IO.mapRequired("sdk", LoadCommand.sdk);
+ IO.mapRequired("ntools", LoadCommand.ntools);
+}
+
+} // end namespace yaml
-} // namespace llvm
+} // end namespace llvm
diff --git a/contrib/llvm/lib/ObjectYAML/ObjectYAML.cpp b/contrib/llvm/lib/ObjectYAML/ObjectYAML.cpp
index cbbaac6..850c1a5 100644
--- a/contrib/llvm/lib/ObjectYAML/ObjectYAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/ObjectYAML.cpp
@@ -11,8 +11,11 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ObjectYAML/YAML.h"
#include "llvm/ObjectYAML/ObjectYAML.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <string>
using namespace llvm;
using namespace yaml;
@@ -43,6 +46,9 @@ void MappingTraits<YamlObjectFile>::mapping(IO &IO,
ObjectFile.FatMachO.reset(new MachOYAML::UniversalBinary());
MappingTraits<MachOYAML::UniversalBinary>::mapping(IO,
*ObjectFile.FatMachO);
+ } else if (IO.mapTag("!WASM")) {
+ ObjectFile.Wasm.reset(new WasmYAML::Object());
+ MappingTraits<WasmYAML::Object>::mapping(IO, *ObjectFile.Wasm);
} else {
Input &In = (Input &)IO;
std::string Tag = In.getCurrentNode()->getRawTag();
@@ -50,8 +56,8 @@ void MappingTraits<YamlObjectFile>::mapping(IO &IO,
IO.setError("YAML Object File missing document type tag!");
else
IO.setError(
- llvm::Twine("YAML Object File unsupported document type tag '") +
- llvm::Twine(Tag) + llvm::Twine("'!"));
+ Twine("YAML Object File unsupported document type tag '") +
+ Twine(Tag) + Twine("'!"));
}
}
}
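
With the mapping change above, a yaml2obj-style consumer can dispatch on a !WASM document tag alongside the ELF and Mach-O tags. A hypothetical driver sketch (error handling elided; it assumes only the YamlObjectFile mapping shown above and follows this file's own using-directives):

#include "llvm/ADT/StringRef.h"
#include "llvm/ObjectYAML/ObjectYAML.h"
#include "llvm/Support/YAMLTraits.h"

using namespace llvm;
using namespace yaml;

// Input reads the document tag, and MappingTraits<YamlObjectFile>::mapping
// fills the matching unique_ptr member (Elf, MachO, FatMachO, or Wasm).
bool loadObjectDoc(StringRef Buf, YamlObjectFile &Doc) {
  Input Yin(Buf);
  Yin >> Doc;
  return !Yin.error();
}
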
diff --git a/contrib/llvm/lib/ObjectYAML/WasmYAML.cpp b/contrib/llvm/lib/ObjectYAML/WasmYAML.cpp
new file mode 100644
index 0000000..6a68cd2
--- /dev/null
+++ b/contrib/llvm/lib/ObjectYAML/WasmYAML.cpp
@@ -0,0 +1,411 @@
+//===- WasmYAML.cpp - Wasm YAMLIO implementation --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of wasm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/WasmYAML.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/YAMLTraits.h"
+
+namespace llvm {
+
+namespace WasmYAML {
+
+// Declared here rather than in the header to comply with:
+// http://llvm.org/docs/CodingStandards.html#provide-a-virtual-method-anchor-for-classes-in-headers
+Section::~Section() = default;
+
+} // end namespace WasmYAML
+
+namespace yaml {
+
+void MappingTraits<WasmYAML::FileHeader>::mapping(
+ IO &IO, WasmYAML::FileHeader &FileHdr) {
+ IO.mapRequired("Version", FileHdr.Version);
+}
+
+void MappingTraits<WasmYAML::Object>::mapping(IO &IO,
+ WasmYAML::Object &Object) {
+ IO.setContext(&Object);
+ IO.mapTag("!WASM", true);
+ IO.mapRequired("FileHeader", Object.Header);
+ IO.mapOptional("Sections", Object.Sections);
+ IO.setContext(nullptr);
+}
+
+static void commonSectionMapping(IO &IO, WasmYAML::Section &Section) {
+ IO.mapRequired("Type", Section.Type);
+ IO.mapOptional("Relocations", Section.Relocations);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::NameSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Name", Section.Name);
+ IO.mapOptional("FunctionNames", Section.FunctionNames);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::LinkingSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Name", Section.Name);
+ IO.mapRequired("DataSize", Section.DataSize);
+ IO.mapRequired("DataAlignment", Section.DataAlignment);
+ IO.mapRequired("SymbolInfo", Section.SymbolInfos);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::CustomSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Name", Section.Name);
+ IO.mapRequired("Payload", Section.Payload);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::TypeSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Signatures", Section.Signatures);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ImportSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Imports", Section.Imports);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::FunctionSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("FunctionTypes", Section.FunctionTypes);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::TableSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Tables", Section.Tables);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::MemorySection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Memories", Section.Memories);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Globals", Section.Globals);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ExportSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Exports", Section.Exports);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::StartSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("StartFunction", Section.StartFunction);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ElemSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Segments", Section.Segments);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::CodeSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Functions", Section.Functions);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::DataSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapRequired("Segments", Section.Segments);
+}
+
+void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping(
+ IO &IO, std::unique_ptr<WasmYAML::Section> &Section) {
+ WasmYAML::SectionType SectionType;
+ if (IO.outputting())
+ SectionType = Section->Type;
+ else
+ IO.mapRequired("Type", SectionType);
+
+ switch (SectionType) {
+ case wasm::WASM_SEC_CUSTOM: {
+ StringRef SectionName;
+ if (IO.outputting()) {
+ auto CustomSection = cast<WasmYAML::CustomSection>(Section.get());
+ SectionName = CustomSection->Name;
+ } else {
+ IO.mapRequired("Name", SectionName);
+ }
+ if (SectionName == "linking") {
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::LinkingSection());
+ sectionMapping(IO, *cast<WasmYAML::LinkingSection>(Section.get()));
+ } else if (SectionName == "name") {
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::NameSection());
+ sectionMapping(IO, *cast<WasmYAML::NameSection>(Section.get()));
+ } else {
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::CustomSection(SectionName));
+ sectionMapping(IO, *cast<WasmYAML::CustomSection>(Section.get()));
+ }
+ break;
+ }
+ case wasm::WASM_SEC_TYPE:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::TypeSection());
+ sectionMapping(IO, *cast<WasmYAML::TypeSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_IMPORT:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::ImportSection());
+ sectionMapping(IO, *cast<WasmYAML::ImportSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_FUNCTION:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::FunctionSection());
+ sectionMapping(IO, *cast<WasmYAML::FunctionSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_TABLE:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::TableSection());
+ sectionMapping(IO, *cast<WasmYAML::TableSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_MEMORY:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::MemorySection());
+ sectionMapping(IO, *cast<WasmYAML::MemorySection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_GLOBAL:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::GlobalSection());
+ sectionMapping(IO, *cast<WasmYAML::GlobalSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_EXPORT:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::ExportSection());
+ sectionMapping(IO, *cast<WasmYAML::ExportSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_START:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::StartSection());
+ sectionMapping(IO, *cast<WasmYAML::StartSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_ELEM:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::ElemSection());
+ sectionMapping(IO, *cast<WasmYAML::ElemSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_CODE:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::CodeSection());
+ sectionMapping(IO, *cast<WasmYAML::CodeSection>(Section.get()));
+ break;
+ case wasm::WASM_SEC_DATA:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::DataSection());
+ sectionMapping(IO, *cast<WasmYAML::DataSection>(Section.get()));
+ break;
+ default:
+ llvm_unreachable("Unknown section type");
+ }
+}
+
+void ScalarEnumerationTraits<WasmYAML::SectionType>::enumeration(
+ IO &IO, WasmYAML::SectionType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_SEC_##X);
+ ECase(CUSTOM);
+ ECase(TYPE);
+ ECase(IMPORT);
+ ECase(FUNCTION);
+ ECase(TABLE);
+ ECase(MEMORY);
+ ECase(GLOBAL);
+ ECase(EXPORT);
+ ECase(START);
+ ECase(ELEM);
+ ECase(CODE);
+ ECase(DATA);
+#undef ECase
+}
+
+void MappingTraits<WasmYAML::Signature>::mapping(
+ IO &IO, WasmYAML::Signature &Signature) {
+ IO.mapOptional("Index", Signature.Index);
+ IO.mapRequired("ReturnType", Signature.ReturnType);
+ IO.mapRequired("ParamTypes", Signature.ParamTypes);
+}
+
+void MappingTraits<WasmYAML::Table>::mapping(IO &IO, WasmYAML::Table &Table) {
+ IO.mapRequired("ElemType", Table.ElemType);
+ IO.mapRequired("Limits", Table.TableLimits);
+}
+
+void MappingTraits<WasmYAML::Function>::mapping(IO &IO,
+ WasmYAML::Function &Function) {
+ IO.mapRequired("Locals", Function.Locals);
+ IO.mapRequired("Body", Function.Body);
+}
+
+void MappingTraits<WasmYAML::Relocation>::mapping(
+ IO &IO, WasmYAML::Relocation &Relocation) {
+ IO.mapRequired("Type", Relocation.Type);
+ IO.mapRequired("Index", Relocation.Index);
+ IO.mapRequired("Offset", Relocation.Offset);
+ IO.mapOptional("Addend", Relocation.Addend, 0);
+}
+
+void MappingTraits<WasmYAML::NameEntry>::mapping(
+ IO &IO, WasmYAML::NameEntry &NameEntry) {
+ IO.mapRequired("Index", NameEntry.Index);
+ IO.mapRequired("Name", NameEntry.Name);
+}
+
+void MappingTraits<WasmYAML::LocalDecl>::mapping(
+ IO &IO, WasmYAML::LocalDecl &LocalDecl) {
+ IO.mapRequired("Type", LocalDecl.Type);
+ IO.mapRequired("Count", LocalDecl.Count);
+}
+
+void MappingTraits<WasmYAML::Limits>::mapping(IO &IO,
+ WasmYAML::Limits &Limits) {
+ if (!IO.outputting() || Limits.Flags)
+ IO.mapOptional("Flags", Limits.Flags);
+ IO.mapRequired("Initial", Limits.Initial);
+ if (!IO.outputting() || Limits.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
+ IO.mapOptional("Maximum", Limits.Maximum);
+}
+
+void MappingTraits<WasmYAML::ElemSegment>::mapping(
+ IO &IO, WasmYAML::ElemSegment &Segment) {
+ IO.mapRequired("Offset", Segment.Offset);
+ IO.mapRequired("Functions", Segment.Functions);
+}
+
+void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
+ WasmYAML::Import &Import) {
+ IO.mapRequired("Module", Import.Module);
+ IO.mapRequired("Field", Import.Field);
+ IO.mapRequired("Kind", Import.Kind);
+ if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+ IO.mapRequired("SigIndex", Import.SigIndex);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
+ IO.mapRequired("GlobalType", Import.GlobalImport.Type);
+ IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) {
+ IO.mapRequired("Table", Import.TableImport);
+ } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY) {
+ IO.mapRequired("Memory", Import.Memory);
+ } else {
+ llvm_unreachable("unhandled import type");
+ }
+}
+
+void MappingTraits<WasmYAML::Export>::mapping(IO &IO,
+ WasmYAML::Export &Export) {
+ IO.mapRequired("Name", Export.Name);
+ IO.mapRequired("Kind", Export.Kind);
+ IO.mapRequired("Index", Export.Index);
+}
+
+void MappingTraits<WasmYAML::Global>::mapping(IO &IO,
+ WasmYAML::Global &Global) {
+ IO.mapRequired("Type", Global.Type);
+ IO.mapRequired("Mutable", Global.Mutable);
+ IO.mapRequired("InitExpr", Global.InitExpr);
+}
+
+void MappingTraits<wasm::WasmInitExpr>::mapping(IO &IO,
+ wasm::WasmInitExpr &Expr) {
+ WasmYAML::Opcode Op = Expr.Opcode;
+ IO.mapRequired("Opcode", Op);
+ Expr.Opcode = Op;
+ switch (Expr.Opcode) {
+ case wasm::WASM_OPCODE_I32_CONST:
+ IO.mapRequired("Value", Expr.Value.Int32);
+ break;
+ case wasm::WASM_OPCODE_I64_CONST:
+ IO.mapRequired("Value", Expr.Value.Int64);
+ break;
+ case wasm::WASM_OPCODE_F32_CONST:
+ IO.mapRequired("Value", Expr.Value.Float32);
+ break;
+ case wasm::WASM_OPCODE_F64_CONST:
+ IO.mapRequired("Value", Expr.Value.Float64);
+ break;
+ case wasm::WASM_OPCODE_GET_GLOBAL:
+ IO.mapRequired("Index", Expr.Value.Global);
+ break;
+ }
+}
+
+void MappingTraits<WasmYAML::DataSegment>::mapping(
+ IO &IO, WasmYAML::DataSegment &Segment) {
+ IO.mapOptional("SectionOffset", Segment.SectionOffset);
+ IO.mapRequired("MemoryIndex", Segment.MemoryIndex);
+ IO.mapRequired("Offset", Segment.Offset);
+ IO.mapRequired("Content", Segment.Content);
+}
+
+void MappingTraits<WasmYAML::SymbolInfo>::mapping(IO &IO,
+ WasmYAML::SymbolInfo &Info) {
+ IO.mapRequired("Name", Info.Name);
+ IO.mapRequired("Flags", Info.Flags);
+}
+
+void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration(
+ IO &IO, WasmYAML::ValueType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X);
+ ECase(I32);
+ ECase(I64);
+ ECase(F32);
+ ECase(F64);
+ ECase(ANYFUNC);
+ ECase(FUNC);
+ ECase(NORESULT);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::ExportKind>::enumeration(
+ IO &IO, WasmYAML::ExportKind &Kind) {
+#define ECase(X) IO.enumCase(Kind, #X, wasm::WASM_EXTERNAL_##X);
+ ECase(FUNCTION);
+ ECase(TABLE);
+ ECase(MEMORY);
+ ECase(GLOBAL);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::Opcode>::enumeration(
+ IO &IO, WasmYAML::Opcode &Code) {
+#define ECase(X) IO.enumCase(Code, #X, wasm::WASM_OPCODE_##X);
+ ECase(END);
+ ECase(I32_CONST);
+ ECase(I64_CONST);
+ ECase(F64_CONST);
+ ECase(F32_CONST);
+ ECase(GET_GLOBAL);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::TableType>::enumeration(
+ IO &IO, WasmYAML::TableType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X);
+ ECase(ANYFUNC);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration(
+ IO &IO, WasmYAML::RelocType &Type) {
+#define WASM_RELOC(name, value) IO.enumCase(Type, #name, wasm::name);
+#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def"
+#undef WASM_RELOC
+}
+
+} // end namespace yaml
+
+} // end namespace llvm
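
Since WasmYAML.cpp is new in this import, a round-trip example may help. The document below is an invented minimal module whose tag and keys come from the mappings above, parsed through the standard yaml::Input machinery:

#include "llvm/ObjectYAML/WasmYAML.h"
#include "llvm/Support/YAMLTraits.h"

// Invented example document: one TYPE section declaring an (i32) -> i32
// signature. Keys match the sectionMapping overloads defined above.
static const char WasmDoc[] = R"(--- !WASM
FileHeader:
  Version: 0x00000001
Sections:
  - Type: TYPE
    Signatures:
      - Index: 0
        ReturnType: I32
        ParamTypes:
          - I32
...
)";

bool parsesCleanly() {
  llvm::WasmYAML::Object Obj;
  llvm::yaml::Input Yin(WasmDoc);
  Yin >> Obj; // drives MappingTraits<WasmYAML::Object>::mapping
  return !Yin.error();
}
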
diff --git a/contrib/llvm/lib/ObjectYAML/YAML.cpp b/contrib/llvm/lib/ObjectYAML/YAML.cpp
index 75cf1fb..67b5764 100644
--- a/contrib/llvm/lib/ObjectYAML/YAML.cpp
+++ b/contrib/llvm/lib/ObjectYAML/YAML.cpp
@@ -16,11 +16,12 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
+#include <cstdint>
using namespace llvm;
void yaml::ScalarTraits<yaml::BinaryRef>::output(
- const yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) {
+ const yaml::BinaryRef &Val, void *, raw_ostream &Out) {
Val.writeAsHex(Out);
}
@@ -34,7 +35,7 @@ StringRef yaml::ScalarTraits<yaml::BinaryRef>::input(StringRef Scalar, void *,
if (!isxdigit(Scalar[I]))
return "BinaryRef hex string must contain only hex digits.";
Val = yaml::BinaryRef(Scalar);
- return StringRef();
+ return {};
}
void yaml::BinaryRef::writeAsBinary(raw_ostream &OS) const {
diff --git a/contrib/llvm/lib/Option/Arg.cpp b/contrib/llvm/lib/Option/Arg.cpp
index c3de2d1..e581fee 100644
--- a/contrib/llvm/lib/Option/Arg.cpp
+++ b/contrib/llvm/lib/Option/Arg.cpp
@@ -1,4 +1,4 @@
-//===--- Arg.cpp - Argument Implementations -------------------------------===//
+//===- Arg.cpp - Argument Implementations ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,13 +7,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Option/Arg.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Twine.h"
+#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::opt;
@@ -61,11 +61,13 @@ void Arg::print(raw_ostream& O) const {
O << "]>\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Arg::dump() const { print(dbgs()); }
+#endif
std::string Arg::getAsString(const ArgList &Args) const {
SmallString<256> Res;
- llvm::raw_svector_ostream OS(Res);
+ raw_svector_ostream OS(Res);
ArgStringList ASL;
render(Args, ASL);
@@ -96,7 +98,7 @@ void Arg::render(const ArgList &Args, ArgStringList &Output) const {
case Option::RenderCommaJoinedStyle: {
SmallString<256> Res;
- llvm::raw_svector_ostream OS(Res);
+ raw_svector_ostream OS(Res);
OS << getSpelling();
for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
if (i) OS << ',';
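
The Arg::dump() hunk above adopts LLVM's convention of compiling dump methods only in asserts or LLVM_ENABLE_DUMP builds. The idiom, reduced to a standalone sketch with a stubbed macro and a made-up struct:

#include <iostream>

// LLVM_DUMP_METHOD normally expands to noinline/used attributes; stub it
// so this sketch compiles on its own.
#ifndef LLVM_DUMP_METHOD
#define LLVM_DUMP_METHOD
#endif

struct ExampleArg {
  void print(std::ostream &OS) const { OS << "<example-arg>\n"; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Only debug and dump-enabled builds carry the otherwise-unreferenced
  // dump symbol.
  LLVM_DUMP_METHOD void dump() const { print(std::cerr); }
#endif
};
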
diff --git a/contrib/llvm/lib/Option/ArgList.cpp b/contrib/llvm/lib/Option/ArgList.cpp
index f94de86..cbccc19 100644
--- a/contrib/llvm/lib/Option/ArgList.cpp
+++ b/contrib/llvm/lib/Option/ArgList.cpp
@@ -1,4 +1,4 @@
-//===--- ArgList.cpp - Argument List Management ---------------------------===//
+//===- ArgList.cpp - Argument List Management -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,215 +7,67 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Option/ArgList.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
+#include "llvm/Option/OptSpecifier.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace llvm::opt;
-void arg_iterator::SkipToNextArg() {
- for (; Current != Args.end(); ++Current) {
- // Done if there are no filters.
- if (!Id0.isValid())
- break;
-
- // Otherwise require a match.
- const Option &O = (*Current)->getOption();
- if (O.matches(Id0) ||
- (Id1.isValid() && O.matches(Id1)) ||
- (Id2.isValid() && O.matches(Id2)))
- break;
- }
-}
-
void ArgList::append(Arg *A) {
Args.push_back(A);
-}
-
-void ArgList::eraseArg(OptSpecifier Id) {
- Args.erase(
- remove_if(*this, [=](Arg *A) { return A->getOption().matches(Id); }),
- end());
-}
-
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id))
- return *it;
- return nullptr;
-}
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1))
- return *it;
- return nullptr;
-}
-
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2))
- return *it;
- return nullptr;
-}
-
-Arg *ArgList::getLastArgNoClaim(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3) const {
- // FIXME: Make search efficient?
- for (const_reverse_iterator it = rbegin(), ie = rend(); it != ie; ++it)
- if ((*it)->getOption().matches(Id0) || (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) || (*it)->getOption().matches(Id3))
- return *it;
- return nullptr;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id)) {
- Res = *it;
- Res->claim();
- }
+ // Update ranges for the option and all of its groups.
+ for (Option O = A->getOption().getUnaliasedOption(); O.isValid();
+ O = O.getGroup()) {
+ auto &R =
+ OptRanges.insert(std::make_pair(O.getID(), emptyRange())).first->second;
+ R.first = std::min<unsigned>(R.first, Args.size() - 1);
+ R.second = Args.size();
}
-
- return Res;
}
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1)) {
- Res = *it;
- Res->claim();
-
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4, OptSpecifier Id5) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4) ||
- (*it)->getOption().matches(Id5)) {
- Res = *it;
- Res->claim();
- }
+void ArgList::eraseArg(OptSpecifier Id) {
+ // Zero out the removed entries but keep them around so that we don't
+ // need to invalidate OptRanges.
+ for (Arg *const &A : filtered(Id)) {
+ // Avoid the need for a non-const filtered iterator variant.
+ Arg **ArgsBegin = Args.data();
+ ArgsBegin[&A - ArgsBegin] = nullptr;
}
-
- return Res;
+ OptRanges.erase(Id.getID());
}
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4, OptSpecifier Id5,
- OptSpecifier Id6) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4) ||
- (*it)->getOption().matches(Id5) ||
- (*it)->getOption().matches(Id6)) {
- Res = *it;
- Res->claim();
+ArgList::OptRange
+ArgList::getRange(std::initializer_list<OptSpecifier> Ids) const {
+ OptRange R = emptyRange();
+ for (auto Id : Ids) {
+ auto I = OptRanges.find(Id.getID());
+ if (I != OptRanges.end()) {
+ R.first = std::min(R.first, I->second.first);
+ R.second = std::max(R.second, I->second.second);
}
}
-
- return Res;
-}
-
-Arg *ArgList::getLastArg(OptSpecifier Id0, OptSpecifier Id1,
- OptSpecifier Id2, OptSpecifier Id3,
- OptSpecifier Id4, OptSpecifier Id5,
- OptSpecifier Id6, OptSpecifier Id7) const {
- Arg *Res = nullptr;
- for (const_iterator it = begin(), ie = end(); it != ie; ++it) {
- if ((*it)->getOption().matches(Id0) ||
- (*it)->getOption().matches(Id1) ||
- (*it)->getOption().matches(Id2) ||
- (*it)->getOption().matches(Id3) ||
- (*it)->getOption().matches(Id4) ||
- (*it)->getOption().matches(Id5) ||
- (*it)->getOption().matches(Id6) ||
- (*it)->getOption().matches(Id7)) {
- Res = *it;
- Res->claim();
- }
- }
-
- return Res;
+ // Map an empty {-1, 0} range to {0, 0} so it can be used to form iterators.
+ if (R.first == -1u)
+ R.first = 0;
+ return R;
}
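Editor's note: the OptRanges bookkeeping above is the heart of this rewrite. append() records, per option ID (and each of its groups), the half-open index range of matching arguments, and getRange() merges those ranges so a filtered scan only visits the relevant slice of Args. A minimal, self-contained sketch of the idea, with illustrative stand-in types rather than the real LLVM API:

  #include <algorithm>
  #include <initializer_list>
  #include <map>
  #include <utility>
  #include <vector>

  using OptRange = std::pair<unsigned, unsigned>; // [first, last) indices into Args
  static OptRange emptyRange() { return {(unsigned)-1, 0u}; }

  struct MiniArgList {
    std::vector<int> Args;             // stand-in for the Arg* entries
    std::map<int, OptRange> OptRanges; // option ID -> index range

    void append(int Arg, int ID) {
      Args.push_back(Arg);
      OptRange &R = OptRanges.emplace(ID, emptyRange()).first->second;
      R.first = std::min<unsigned>(R.first, Args.size() - 1);
      R.second = Args.size();
    }

    // Merge the ranges of several IDs; a filtered scan then only needs to
    // visit Args[R.first, R.second) instead of the whole argument list.
    OptRange getRange(std::initializer_list<int> IDs) const {
      OptRange R = emptyRange();
      for (int ID : IDs) {
        auto I = OptRanges.find(ID);
        if (I != OptRanges.end()) {
          R.first = std::min(R.first, I->second.first);
          R.second = std::max(R.second, I->second.second);
        }
      }
      if (R.first == (unsigned)-1)
        R.first = 0; // map the empty {-1u, 0} range to {0, 0}
      return R;
    }
  };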
bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const {
@@ -231,8 +83,7 @@ bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier PosAlias, OptSpecifier Neg,
return Default;
}
-StringRef ArgList::getLastArgValue(OptSpecifier Id,
- StringRef Default) const {
+StringRef ArgList::getLastArgValue(OptSpecifier Id, StringRef Default) const {
if (Arg *A = getLastArg(Id))
return A->getValue();
return Default;
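Editor's note: for readers unfamiliar with these accessors, a typical driver query looks roughly like this, given an ArgList Args and hypothetical OPT_* IDs from a TableGen'd option table (not part of this patch):

  // hasFlag answers -ffoo / -fno-foo style questions; the last flag wins.
  bool UseFoo = Args.hasFlag(OPT_ffoo, OPT_fno_foo, /*Default=*/true);
  // getLastArgValue yields the value of the last -o, or the fallback.
  StringRef Output = Args.getLastArgValue(OPT_o, /*Default=*/"a.out");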
@@ -262,7 +113,7 @@ void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id0,
void ArgList::AddAllArgsExcept(ArgStringList &Output,
ArrayRef<OptSpecifier> Ids,
ArrayRef<OptSpecifier> ExcludeIds) const {
- for (const Arg *Arg : Args) {
+ for (const Arg *Arg : *this) {
bool Excluded = false;
for (OptSpecifier Id : ExcludeIds) {
if (Arg->getOption().matches(Id)) {
@@ -325,14 +176,14 @@ void ArgList::AddAllArgsTranslated(ArgStringList &Output, OptSpecifier Id0,
}
void ArgList::ClaimAllArgs(OptSpecifier Id0) const {
- for (auto Arg : filtered(Id0))
+ for (auto *Arg : filtered(Id0))
Arg->claim();
}
void ArgList::ClaimAllArgs() const {
- for (const_iterator it = begin(), ie = end(); it != ie; ++it)
- if (!(*it)->isClaimed())
- (*it)->claim();
+ for (auto *Arg : *this)
+ if (!Arg->isClaimed())
+ Arg->claim();
}
const char *ArgList::GetOrMakeJoinedArgString(unsigned Index,
@@ -353,9 +204,9 @@ void ArgList::print(raw_ostream &O) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ArgList::dump() const { print(dbgs()); }
-
-//
+#endif
void InputArgList::releaseMemory() {
// An InputArgList always owns its arguments.
@@ -392,8 +243,6 @@ const char *InputArgList::MakeArgStringRef(StringRef Str) const {
return getArgString(MakeIndex(Str));
}
-//
-
DerivedArgList::DerivedArgList(const InputArgList &BaseArgs)
: BaseArgs(BaseArgs) {}
diff --git a/contrib/llvm/lib/Option/OptTable.cpp b/contrib/llvm/lib/Option/OptTable.cpp
index 7eafb00..51c62d3 100644
--- a/contrib/llvm/lib/Option/OptTable.cpp
+++ b/contrib/llvm/lib/Option/OptTable.cpp
@@ -1,4 +1,4 @@
-//===--- OptTable.cpp - Option Table Implementation -----------------------===//
+//===- OptTable.cpp - Option Table Implementation -------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,16 +7,25 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Option/OptTable.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
+#include "llvm/Option/OptSpecifier.h"
+#include "llvm/Option/OptTable.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
#include <cctype>
+#include <cstring>
#include <map>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace llvm::opt;
@@ -80,14 +89,14 @@ static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
static inline bool operator<(const OptTable::Info &I, const char *Name) {
return StrCmpOptionNameIgnoreCase(I.Name, Name) < 0;
}
-}
-}
+
+} // end namespace opt
+} // end namespace llvm
OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {}
OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase)
- : OptionInfos(OptionInfos), IgnoreCase(IgnoreCase), TheInputOptionID(0),
- TheUnknownOptionID(0), FirstSearchableIndex(0) {
+ : OptionInfos(OptionInfos), IgnoreCase(IgnoreCase) {
// Explicitly zero initialize the error to work around a bug in array
// value-initialization on MinGW with gcc 4.3.5.
@@ -138,8 +147,8 @@ OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase)
}
// Build prefix chars.
- for (llvm::StringSet<>::const_iterator I = PrefixesUnion.begin(),
- E = PrefixesUnion.end(); I != E; ++I) {
+ for (StringSet<>::const_iterator I = PrefixesUnion.begin(),
+ E = PrefixesUnion.end(); I != E; ++I) {
StringRef Prefix = I->getKey();
for (StringRef::const_iterator C = Prefix.begin(), CE = Prefix.end();
C != CE; ++C)
@@ -148,8 +157,7 @@ OptTable::OptTable(ArrayRef<Info> OptionInfos, bool IgnoreCase)
}
}
-OptTable::~OptTable() {
-}
+OptTable::~OptTable() = default;
const Option OptTable::getOption(OptSpecifier Opt) const {
unsigned id = Opt.getID();
@@ -159,11 +167,11 @@ const Option OptTable::getOption(OptSpecifier Opt) const {
return Option(&getInfo(id), this);
}
-static bool isInput(const llvm::StringSet<> &Prefixes, StringRef Arg) {
+static bool isInput(const StringSet<> &Prefixes, StringRef Arg) {
if (Arg == "-")
return true;
- for (llvm::StringSet<>::const_iterator I = Prefixes.begin(),
- E = Prefixes.end(); I != E; ++I)
+ for (StringSet<>::const_iterator I = Prefixes.begin(),
+ E = Prefixes.end(); I != E; ++I)
if (Arg.startswith(I->getKey()))
return false;
return true;
@@ -186,6 +194,57 @@ static unsigned matchOption(const OptTable::Info *I, StringRef Str,
return 0;
}
+// Returns true if one of the Prefixes + In.Name combinations matches Option.
+static bool optionMatches(const OptTable::Info &In, StringRef Option) {
+ if (In.Values && In.Prefixes)
+ for (size_t I = 0; In.Prefixes[I]; I++)
+ if (Option == std::string(In.Prefixes[I]) + In.Name)
+ return true;
+ return false;
+}
+
+// This function is for flag value completion.
+// E.g., when "-stdlib=" and "l" are passed to this function, it returns the
+// possible values of -stdlib= that start with "l".
+std::vector<std::string>
+OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const {
+ // Search all options and return possible values.
+ for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) {
+ if (!optionMatches(In, Option))
+ continue;
+
+ SmallVector<StringRef, 8> Candidates;
+ StringRef(In.Values).split(Candidates, ",", -1, false);
+
+ std::vector<std::string> Result;
+ for (StringRef Val : Candidates)
+ if (Val.startswith(Arg))
+ Result.push_back(Val);
+ return Result;
+ }
+ return {};
+}
+
+std::vector<std::string>
+OptTable::findByPrefix(StringRef Cur, unsigned short DisableFlags) const {
+ std::vector<std::string> Ret;
+ for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) {
+ if (!In.Prefixes || (!In.HelpText && !In.GroupID))
+ continue;
+ if (In.Flags & DisableFlags)
+ continue;
+
+ for (int I = 0; In.Prefixes[I]; I++) {
+ std::string S = std::string(In.Prefixes[I]) + std::string(In.Name) + "\t";
+ if (In.HelpText)
+ S += In.HelpText;
+ if (StringRef(S).startswith(Cur))
+ Ret.push_back(S);
+ }
+ }
+ return Ret;
+}
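Editor's note: these two new helpers back shell autocompletion: suggestValueCompletions() completes the values of a ValuesClass option, while findByPrefix() completes option names, each with a tab-separated help string attached. A hedged usage sketch, assuming an OptTable instance Opts whose table declares Values for -stdlib=:

  // Complete "-stdlib=l" -> the declared values of -stdlib= beginning with "l".
  std::vector<std::string> Vals = Opts.suggestValueCompletions("-stdlib=", "l");
  // Complete "-std" -> entries like "-stdlib=\t<help text>", "-std=\t<help text>".
  std::vector<std::string> Flags = Opts.findByPrefix("-std", /*DisableFlags=*/0);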
+
Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
unsigned FlagsToInclude,
unsigned FlagsToExclude) const {
@@ -314,6 +373,9 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) {
case Option::FlagClass:
break;
+ case Option::ValuesClass:
+ break;
+
case Option::SeparateClass: case Option::JoinedOrSeparateClass:
case Option::RemainingArgsClass: case Option::RemainingArgsJoinedClass:
Name += ' ';
@@ -330,27 +392,29 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) {
return Name;
}
+namespace {
+struct OptionInfo {
+ std::string Name;
+ StringRef HelpText;
+};
+} // namespace
+
static void PrintHelpOptionList(raw_ostream &OS, StringRef Title,
- std::vector<std::pair<std::string,
- const char*> > &OptionHelp) {
+ std::vector<OptionInfo> &OptionHelp) {
OS << Title << ":\n";
// Find the maximum option length.
unsigned OptionFieldWidth = 0;
for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) {
- // Skip titles.
- if (!OptionHelp[i].second)
- continue;
-
// Limit the amount of padding we are willing to give up for alignment.
- unsigned Length = OptionHelp[i].first.size();
+ unsigned Length = OptionHelp[i].Name.size();
if (Length <= 23)
OptionFieldWidth = std::max(OptionFieldWidth, Length);
}
const unsigned InitialPad = 2;
for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) {
- const std::string &Option = OptionHelp[i].first;
+ const std::string &Option = OptionHelp[i].Name;
int Pad = OptionFieldWidth - int(Option.size());
OS.indent(InitialPad) << Option;
@@ -359,7 +423,7 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title,
OS << "\n";
Pad = OptionFieldWidth + InitialPad;
}
- OS.indent(Pad + 1) << OptionHelp[i].second << '\n';
+ OS.indent(Pad + 1) << OptionHelp[i].HelpText << '\n';
}
}
@@ -398,8 +462,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title,
// Render help text into a map of group-name to a list of (option, help)
// pairs.
- typedef std::map<std::string,
- std::vector<std::pair<std::string, const char*> > > helpmap_ty;
+ using helpmap_ty = std::map<std::string, std::vector<OptionInfo>>;
helpmap_ty GroupedOptionHelp;
for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
@@ -418,7 +481,7 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title,
if (const char *Text = getOptionHelpText(Id)) {
const char *HelpGroup = getOptionHelpGroup(*this, Id);
const std::string &OptName = getOptionHelpName(*this, Id);
- GroupedOptionHelp[HelpGroup].push_back(std::make_pair(OptName, Text));
+ GroupedOptionHelp[HelpGroup].push_back({OptName, Text});
}
}
diff --git a/contrib/llvm/lib/Option/Option.cpp b/contrib/llvm/lib/Option/Option.cpp
index 5eb179f..bf9f040 100644
--- a/contrib/llvm/lib/Option/Option.cpp
+++ b/contrib/llvm/lib/Option/Option.cpp
@@ -1,4 +1,4 @@
-//===--- Option.cpp - Abstract Driver Options -----------------------------===//
+//===- Option.cpp - Abstract Driver Options -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,22 +7,24 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Option/Option.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Option/OptTable.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cassert>
+#include <cstring>
using namespace llvm;
using namespace llvm::opt;
Option::Option(const OptTable::Info *info, const OptTable *owner)
: Info(info), Owner(owner) {
-
// Multi-level aliases are not supported. This just simplifies option
// tracking, it is not an inherent limitation.
assert((!Info || !getAlias().isValid() || !getAlias().getAlias().isValid()) &&
@@ -45,6 +47,7 @@ void Option::print(raw_ostream &O) const {
P(UnknownClass);
P(FlagClass);
P(JoinedClass);
+ P(ValuesClass);
P(SeparateClass);
P(CommaJoinedClass);
P(MultiArgClass);
@@ -83,7 +86,9 @@ void Option::print(raw_ostream &O) const {
O << ">\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Option::dump() const { print(dbgs()); }
+#endif
bool Option::matches(OptSpecifier Opt) const {
// Aliases are never considered in matching, look through them.
diff --git a/contrib/llvm/lib/Passes/PassBuilder.cpp b/contrib/llvm/lib/Passes/PassBuilder.cpp
index 2994a07..9e0cf27 100644
--- a/contrib/llvm/lib/Passes/PassBuilder.cpp
+++ b/contrib/llvm/lib/Passes/PassBuilder.cpp
@@ -39,6 +39,7 @@
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/PostDominators.h"
@@ -61,6 +62,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/GCOVProfiler.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/Transforms/IPO/ConstantMerge.h"
#include "llvm/Transforms/IPO/CrossDSOCFI.h"
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
@@ -104,9 +106,12 @@
#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Scalar/LoopPredication.h"
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"
@@ -120,6 +125,7 @@
#include "llvm/Transforms/Scalar/Reassociate.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Scalar/Sink.h"
#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
@@ -131,8 +137,8 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LowerInvoke.h"
#include "llvm/Transforms/Utils/Mem2Reg.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/Transforms/Utils/SimplifyInstructions.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
@@ -142,7 +148,32 @@
using namespace llvm;
-static Regex DefaultAliasRegex("^(default|lto-pre-link|lto)<(O[0123sz])>$");
+static cl::opt<unsigned> MaxDevirtIterations("pm-max-devirt-iterations",
+ cl::ReallyHidden, cl::init(4));
+static cl::opt<bool>
+ RunPartialInlining("enable-npm-partial-inlining", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Run Partial inlinining pass"));
+
+static cl::opt<bool>
+ RunNewGVN("enable-npm-newgvn", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Run NewGVN instead of GVN"));
+
+static cl::opt<bool> EnableEarlyCSEMemSSA(
+ "enable-npm-earlycse-memssa", cl::init(true), cl::Hidden,
+ cl::desc("Enable the EarlyCSE w/ MemorySSA pass for the new PM (default = on)"));
+
+static cl::opt<bool> EnableGVNHoist(
+ "enable-npm-gvn-hoist", cl::init(false), cl::Hidden,
+ cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
+
+static cl::opt<bool> EnableGVNSink(
+ "enable-npm-gvn-sink", cl::init(false), cl::Hidden,
+ cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
+
+static Regex DefaultAliasRegex(
+ "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) {
switch (Level) {
@@ -250,33 +281,52 @@ AnalysisKey NoOpLoopAnalysis::Key;
} // End anonymous namespace.
+void PassBuilder::invokePeepholeEPCallbacks(
+ FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
+ for (auto &C : PeepholeEPCallbacks)
+ C(FPM, Level);
+}
+
void PassBuilder::registerModuleAnalyses(ModuleAnalysisManager &MAM) {
#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
MAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
+
+ for (auto &C : ModuleAnalysisRegistrationCallbacks)
+ C(MAM);
}
void PassBuilder::registerCGSCCAnalyses(CGSCCAnalysisManager &CGAM) {
#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
CGAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
+
+ for (auto &C : CGSCCAnalysisRegistrationCallbacks)
+ C(CGAM);
}
void PassBuilder::registerFunctionAnalyses(FunctionAnalysisManager &FAM) {
#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
FAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
+
+ for (auto &C : FunctionAnalysisRegistrationCallbacks)
+ C(FAM);
}
void PassBuilder::registerLoopAnalyses(LoopAnalysisManager &LAM) {
#define LOOP_ANALYSIS(NAME, CREATE_PASS) \
LAM.registerPass([&] { return CREATE_PASS; });
#include "PassRegistry.def"
+
+ for (auto &C : LoopAnalysisRegistrationCallbacks)
+ C(LAM);
}
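Editor's note: each register*Analyses() hook now also drains a callback list, which is how out-of-tree code injects custom analyses and extension-point passes. A hedged sketch of what a client might do, assuming the PassBuilder registration helpers that correspond to the callback lists used above (MyAnalysis is hypothetical):

  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/InstCombine/InstCombine.h"
  using namespace llvm;

  void configure(PassBuilder &PB) {
    // Make a custom analysis available to every FunctionAnalysisManager
    // this builder populates.
    PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
      // FAM.registerPass([] { return MyAnalysis(); });
    });
    // Add an extra peephole pass at the peephole extension point.
    PB.registerPeepholeEPCallback(
        [](FunctionPassManager &FPM, PassBuilder::OptimizationLevel /*Level*/) {
          FPM.addPass(InstCombinePass());
        });
  }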
FunctionPassManager
PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
- bool DebugLogging) {
+ bool DebugLogging,
+ bool PrepareForThinLTO) {
assert(Level != O0 && "Must request optimizations!");
FunctionPassManager FPM(DebugLogging);
@@ -285,7 +335,17 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(SROA());
// Catch trivial redundancies
- FPM.addPass(EarlyCSEPass());
+ FPM.addPass(EarlyCSEPass(EnableEarlyCSEMemSSA));
+
+ // Hoisting of scalars and load expressions.
+ if (EnableGVNHoist)
+ FPM.addPass(GVNHoistPass());
+
+ // Global value numbering based sinking.
+ if (EnableGVNSink) {
+ FPM.addPass(GVNSinkPass());
+ FPM.addPass(SimplifyCFGPass());
+ }
// Speculative execution if the target has divergent branches; otherwise nop.
FPM.addPass(SpeculativeExecutionPass());
@@ -299,6 +359,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
if (!isOptimizingForSize(Level))
FPM.addPass(LibCallsShrinkWrapPass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
FPM.addPass(TailCallElimPass());
FPM.addPass(SimplifyCFGPass());
@@ -316,19 +378,29 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// the other we have is `LoopInstSimplify`.
LoopPassManager LPM1(DebugLogging), LPM2(DebugLogging);
- // FIXME: Enable these when the loop pass manager can support enforcing loop
- // simplified and LCSSA form as well as updating the loop nest after
- // transformations and we finsih porting the loop passes.
-#if 0
// Rotate Loop - disable header duplication at -Oz
LPM1.addPass(LoopRotatePass(Level != Oz));
LPM1.addPass(LICMPass());
- LPM1.addPass(LoopUnswitchPass(/* OptimizeForSize */ Level != O3));
+ LPM1.addPass(SimpleLoopUnswitchPass());
LPM2.addPass(IndVarSimplifyPass());
- LPM2.addPass(LoopIdiomPass());
+ LPM2.addPass(LoopIdiomRecognizePass());
+
+ for (auto &C : LateLoopOptimizationsEPCallbacks)
+ C(LPM2, Level);
+
LPM2.addPass(LoopDeletionPass());
- LPM2.addPass(SimpleLoopUnrollPass());
-#endif
+ // Do not enable unrolling in the PrepareForThinLTO phase during sample PGO,
+ // because it changes the IR, which makes profile annotation in the backend
+ // compile inaccurate.
+ if (!PrepareForThinLTO || !PGOOpt || PGOOpt->SampleProfileFile.empty())
+ LPM2.addPass(LoopUnrollPass::createFull(Level));
+
+ for (auto &C : LoopOptimizerEndEPCallbacks)
+ C(LPM2, Level);
+
+ // We provide the opt remark emitter pass for LICM to use. We only need to do
+ // this once as it is immutable.
+ FPM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1)));
FPM.addPass(SimplifyCFGPass());
FPM.addPass(InstCombinePass());
@@ -338,7 +410,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
if (Level != O1) {
// These passes add substantial compile time so skip them at O1.
FPM.addPass(MergedLoadStoreMotionPass());
- FPM.addPass(GVN());
+ if (RunNewGVN)
+ FPM.addPass(NewGVNPass());
+ else
+ FPM.addPass(GVN());
}
// Specially optimize memory movement as it doesn't look like dataflow in SSA.
@@ -357,37 +432,102 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// Run instcombine after redundancy and dead bit elimination to exploit
// opportunities opened up by them.
FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
// Re-consider control flow based optimizations after redundancy elimination,
// redo DCE, etc.
FPM.addPass(JumpThreadingPass());
FPM.addPass(CorrelatedValuePropagationPass());
FPM.addPass(DSEPass());
- // FIXME: Enable this when the loop pass manager can support enforcing loop
- // simplified and LCSSA form as well as updating the loop nest after
- // transformations and we finsih porting the loop passes.
-#if 0
FPM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
-#endif
+
+ for (auto &C : ScalarOptimizerLateEPCallbacks)
+ C(FPM, Level);
// Finally, do an expensive DCE pass to catch all the dead code exposed by
// the simplifications and basic cleanup after all the simplifications.
FPM.addPass(ADCEPass());
FPM.addPass(SimplifyCFGPass());
FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
return FPM;
}
+void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
+ PassBuilder::OptimizationLevel Level,
+ bool RunProfileGen,
+ std::string ProfileGenFile,
+ std::string ProfileUseFile) {
+ // Generally, running simplification passes and the inliner with a high
+ // threshold results in smaller executables, but there may be cases where
+ // the size grows, so let's be conservative here and skip this simplification
+ // at -Os/Oz.
+ if (!isOptimizingForSize(Level)) {
+ InlineParams IP;
+
+ // In the old pass manager, this is a cl::opt. Should this still be one?
+ IP.DefaultThreshold = 75;
+
+ // FIXME: The hint threshold has the same value used by the regular inliner.
+ // This should probably be lowered after performance testing.
+ // FIXME: This comment is cargo-culted from the old pass manager; revisit.
+ IP.HintThreshold = 325;
+
+ CGSCCPassManager CGPipeline(DebugLogging);
+
+ CGPipeline.addPass(InlinerPass(IP));
+
+ FunctionPassManager FPM;
+ FPM.addPass(SROA());
+ FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
+ FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks.
+ FPM.addPass(InstCombinePass()); // Combine silly sequences.
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPipeline)));
+ }
+
+ // Delete anything that is now dead to make sure that we don't instrument
+ // dead code. Instrumentation can end up keeping dead code around and
+ // dramatically increase code size.
+ MPM.addPass(GlobalDCEPass());
+
+ if (RunProfileGen) {
+ MPM.addPass(PGOInstrumentationGen());
+
+ FunctionPassManager FPM;
+ FPM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass()));
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ if (!ProfileGenFile.empty())
+ Options.InstrProfileOutput = ProfileGenFile;
+ Options.DoCounterPromotion = true;
+ MPM.addPass(InstrProfiling(Options));
+ }
+
+ if (!ProfileUseFile.empty())
+ MPM.addPass(PGOInstrumentationUse(ProfileUseFile));
+}
+
+static InlineParams
+getInlineParamsFromOptLevel(PassBuilder::OptimizationLevel Level) {
+ auto O3 = PassBuilder::O3;
+ unsigned OptLevel = Level > O3 ? 2 : Level;
+ unsigned SizeLevel = Level > O3 ? Level - O3 : 0;
+ return getInlineParams(OptLevel, SizeLevel);
+}
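Editor's note: concretely, and assuming the OptimizationLevel enumerators are ordered O0, O1, O2, O3, Os, Oz, the mapping above works out to (illustrative only; the function is file-local):

  getInlineParamsFromOptLevel(PassBuilder::O1); // == getInlineParams(1, 0)
  getInlineParamsFromOptLevel(PassBuilder::O3); // == getInlineParams(3, 0)
  getInlineParamsFromOptLevel(PassBuilder::Os); // == getInlineParams(2, 1)
  getInlineParamsFromOptLevel(PassBuilder::Oz); // == getInlineParams(2, 2)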
+
ModulePassManager
-PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
- bool DebugLogging) {
- assert(Level != O0 && "Must request optimizations for the default pipeline!");
+PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
+ bool DebugLogging,
+ bool PrepareForThinLTO) {
ModulePassManager MPM(DebugLogging);
- // Force any function attributes we want the rest of the pipeline te observe.
- MPM.addPass(ForceFunctionAttrsPass());
-
// Do basic inference of function attributes from known properties of system
// libraries and other oracles.
MPM.addPass(InferFunctionAttrsPass());
@@ -399,7 +539,6 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
EarlyFPM.addPass(SROA());
EarlyFPM.addPass(EarlyCSEPass());
EarlyFPM.addPass(LowerExpectIntrinsicPass());
- EarlyFPM.addPass(GVNHoistPass());
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
// Interprocedural constant propagation now that basic cleanup has occurred
@@ -426,13 +565,36 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// optimizations.
FunctionPassManager GlobalCleanupPM(DebugLogging);
GlobalCleanupPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(GlobalCleanupPM, Level);
+
GlobalCleanupPM.addPass(SimplifyCFGPass());
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM)));
- // FIXME: Enable this when cross-IR-unit analysis invalidation is working.
-#if 0
- MPM.addPass(RequireAnalysisPass<GlobalsAA>());
-#endif
+ // Add all the requested passes for PGO, if requested.
+ if (PGOOpt) {
+ assert(PGOOpt->RunProfileGen || !PGOOpt->SampleProfileFile.empty() ||
+ !PGOOpt->ProfileUseFile.empty());
+ if (PGOOpt->SampleProfileFile.empty())
+ addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen,
+ PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile);
+ else
+ MPM.addPass(SampleProfileLoaderPass(PGOOpt->SampleProfileFile));
+
+ // Indirect call promotion that promotes intra-module targets only.
+ // Do not enable it in the PrepareForThinLTO phase during sample PGO, because
+ // it changes the IR, making profile annotation in the backend inaccurate.
+ if (!PrepareForThinLTO || PGOOpt->SampleProfileFile.empty())
+ MPM.addPass(PGOIndirectCallPromotion(
+ false, PGOOpt && !PGOOpt->SampleProfileFile.empty()));
+ }
+
+ // Require the GlobalsAA analysis for the module so we can query it within
+ // the CGSCC pipeline.
+ MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
+
+ // Require the ProfileSummaryAnalysis for the module so we can query it within
+ // the inliner pass.
+ MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
// Now begin the main postorder CGSCC pipeline.
// FIXME: The current CGSCC pipeline has its origins in the legacy pass
@@ -448,26 +610,63 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// Run the inliner first. The theory is that we are walking bottom-up and so
// the callees have already been fully optimized, and we want to inline them
// into the callers so that our optimizations can reflect that.
- // FIXME; Customize the threshold based on optimization level.
- MainCGPipeline.addPass(InlinerPass());
+ // For the PrepareForThinLTO phase, we disable the hot-caller heuristic for
+ // sample PGO because it makes profile annotation in the backend inaccurate.
+ InlineParams IP = getInlineParamsFromOptLevel(Level);
+ if (PrepareForThinLTO && PGOOpt && !PGOOpt->SampleProfileFile.empty())
+ IP.HotCallSiteThreshold = 0;
+ MainCGPipeline.addPass(InlinerPass(IP));
// Now deduce any function attributes based on the current code.
MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
+ // When at O3 add argument promotion to the pass pipeline.
+ // FIXME: It isn't at all clear why this should be limited to O3.
+ if (Level == O3)
+ MainCGPipeline.addPass(ArgumentPromotionPass());
+
// Lastly, add the core function simplification pipeline nested inside the
// CGSCC walk.
MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
- buildFunctionSimplificationPipeline(Level, DebugLogging)));
+ buildFunctionSimplificationPipeline(Level, DebugLogging,
+ PrepareForThinLTO)));
+
+ for (auto &C : CGSCCOptimizerLateEPCallbacks)
+ C(MainCGPipeline, Level);
+ // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
+ // to detect when we devirtualize indirect calls and iterate the SCC passes
+ // in that case to try and catch knock-on inlining or function attrs
+ // opportunities. Then we add it to the module pipeline by walking the SCCs
+ // in postorder (or bottom-up).
MPM.addPass(
- createModuleToPostOrderCGSCCPassAdaptor(std::move(MainCGPipeline)));
+ createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass(
+ std::move(MainCGPipeline), MaxDevirtIterations, DebugLogging)));
- // This ends the canonicalization and simplification phase of the pipeline.
- // At this point, we expect to have canonical and simple IR which we begin
- // *optimizing* for efficient execution going forward.
+ return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
+ bool DebugLogging) {
+ ModulePassManager MPM(DebugLogging);
+
+ // Optimize globals now that the module is fully simplified.
+ MPM.addPass(GlobalOptPass());
- // Eliminate externally available functions now that inlining is over -- we
- // won't emit these anyways.
+ // Run partial inlining pass to partially inline functions that have
+ // large bodies.
+ if (RunPartialInlining)
+ MPM.addPass(PartialInlinerPass());
+
+ // Remove available-externally function and global definitions, since we
+ // aren't compiling an object file for later LTO. For LTO we want to preserve
+ // these so they are eligible for inlining at link time. Note that if they are
+ // unreferenced they will be removed by GlobalDCE later, so this only impacts
+ // referenced available-externally globals. Eventually they will be suppressed
+ // during codegen, but eliminating them here enables more opportunities for
+ // GlobalDCE, as it may make globals referenced by available-externally
+ // functions dead, and it saves running the remaining passes on the
+ // eliminated functions.
MPM.addPass(EliminateAvailableExternallyPass());
// Do RPO function attribute inference across the module to forward-propagate
@@ -475,17 +674,14 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// FIXME: Is this really an optimization rather than a canonicalization?
MPM.addPass(ReversePostOrderFunctionAttrsPass());
- // Recompute GloblasAA here prior to function passes. This is particularly
+ // Re-require GlobalsAA here prior to function passes. This is particularly
// useful as the above will have inlined, DCE'ed, and function-attr
// propagated everything. We should at this point have a reasonably minimal
// and richly annotated call graph. By computing aliasing and mod/ref
// information for all local globals here, the late loop passes and notably
// the vectorizer will be able to use them to help recognize vectorizable
// memory operations.
- // FIXME: Enable this once analysis invalidation is fully supported.
-#if 0
- MPM.addPass(Require<GlobalsAA>());
-#endif
+ MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
FunctionPassManager OptimizePM(DebugLogging);
OptimizePM.addPass(Float2IntPass());
@@ -495,36 +691,70 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
// Optimize the loop execution. These passes operate on entire loop nests
// rather than on each loop in an inside-out manner, and so they are actually
// function passes.
+
+ for (auto &C : VectorizerStartEPCallbacks)
+ C(OptimizePM, Level);
+
+ // First rotate loops that may have been un-rotated by prior passes.
+ OptimizePM.addPass(createFunctionToLoopPassAdaptor(LoopRotatePass()));
+
+ // Distribute loops to allow partial vectorization, i.e. isolate dependences
+ // into a separate loop that would otherwise inhibit vectorization. This is
+ // currently only performed for loops marked with the metadata
+ // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
OptimizePM.addPass(LoopDistributePass());
-#if 0
- // FIXME: LoopVectorize relies on "requiring" LCSSA which isn't supported in
- // the new PM.
+
+ // Now run the core loop vectorizer.
OptimizePM.addPass(LoopVectorizePass());
-#endif
- // FIXME: Need to port Loop Load Elimination and add it here.
+
+ // Eliminate loads by forwarding stores from the previous iteration to loads
+ // of the current iteration.
+ OptimizePM.addPass(LoopLoadEliminationPass());
+
+ // Cleanup after the loop optimization passes.
OptimizePM.addPass(InstCombinePass());
+
+ // Now that we've formed fast-to-execute loop structures, we do further
+ // optimizations. These are run afterward because they might block complex
+ // analyses and transforms, such as those needed for loop vectorization.
+
// Optimize parallel scalar instruction chains into SIMD instructions.
OptimizePM.addPass(SLPVectorizerPass());
- // Cleanup after vectorizers.
+ // Cleanup after all of the vectorizers.
OptimizePM.addPass(SimplifyCFGPass());
OptimizePM.addPass(InstCombinePass());
// Unroll small loops to hide loop backedge latency and saturate any parallel
- // execution resources of an out-of-order processor.
- // FIXME: Need to add once loop pass pipeline is available.
-
- // FIXME: Add the loop sink pass when ported.
-
- // FIXME: Add cleanup from the loop pass manager when we're forming LCSSA
- // here.
+ // execution resources of an out-of-order processor. We also then need to
+ // clean up redundancies and loop invariant code.
+ // FIXME: It would be really good to use a loop-integrated instruction
+ // combiner for cleanup here so that the unrolling and LICM can be pipelined
+ // across the loop nests.
+ OptimizePM.addPass(createFunctionToLoopPassAdaptor(LoopUnrollPass::create(Level)));
+ OptimizePM.addPass(InstCombinePass());
+ OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
// Now that we've vectorized and unrolled loops, we may have more refined
// alignment information, try to re-derive it here.
OptimizePM.addPass(AlignmentFromAssumptionsPass());
- // ADd the core optimizing pipeline.
+ // The LoopSink pass sinks instructions hoisted by LICM, which serves as a
+ // canonicalization pass that enables other optimizations. As a result,
+ // LoopSink needs to run very late in the IR pipeline to avoid undoing the
+ // LICM result too early.
+ OptimizePM.addPass(LoopSinkPass());
+
+ // And finally clean up LCSSA form before generating code.
+ OptimizePM.addPass(InstSimplifierPass());
+
+ // LoopSink (and other loop passes since the last simplifyCFG) might have
+ // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
+ OptimizePM.addPass(SimplifyCFGPass());
+
+ // Add the core optimizing pipeline.
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
// Now we need to do some global optimization transforms.
@@ -538,6 +768,87 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
}
ModulePassManager
+PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
+ bool DebugLogging) {
+ assert(Level != O0 && "Must request optimizations for the default pipeline!");
+
+ ModulePassManager MPM(DebugLogging);
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ // Add the core simplification pipeline.
+ MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging,
+ /*PrepareForThinLTO=*/false));
+
+ // Now add the optimization pipeline.
+ MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging));
+
+ return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
+ bool DebugLogging) {
+ assert(Level != O0 && "Must request optimizations for the default pipeline!");
+
+ ModulePassManager MPM(DebugLogging);
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ // If we are planning to perform ThinLTO later, we don't bloat the code with
+ // unrolling/vectorization/... now. Just simplify the module as much as we
+ // can.
+ MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging,
+ /*PrepareForThinLTO=*/true));
+
+ // Run partial inlining pass to partially inline functions that have
+ // large bodies.
+ // FIXME: It isn't clear whether this is really the right place to run this
+ // in ThinLTO. Because there is another canonicalization and simplification
+ // phase that will run after the thin link, running this here ends up with
+ // less information than will be available later and it may grow functions in
+ // ways that aren't beneficial.
+ if (RunPartialInlining)
+ MPM.addPass(PartialInlinerPass());
+
+ // Reduce the size of the IR as much as possible.
+ MPM.addPass(GlobalOptPass());
+
+ return MPM;
+}
+
+ModulePassManager
+PassBuilder::buildThinLTODefaultPipeline(OptimizationLevel Level,
+ bool DebugLogging) {
+ // FIXME: The summary index is not hooked into the new pass manager yet.
+ // Once it is, enable WholeProgramDevirt and LowerTypeTests here.
+
+ ModulePassManager MPM(DebugLogging);
+
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+
+ // During the ThinLTO backend phase we perform early indirect call promotion
+ // here, before globalopt. Otherwise imported available_externally functions
+ // look unreferenced and are removed.
+ MPM.addPass(PGOIndirectCallPromotion(
+ true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty() &&
+ !PGOOpt->ProfileUseFile.empty()));
+
+ // Add the core simplification pipeline.
+ MPM.addPass(buildModuleSimplificationPipeline(Level, DebugLogging,
+ /*PrepareForThinLTO=*/false));
+
+ // Now add the optimization pipeline.
+ MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging));
+
+ return MPM;
+}
+
+ModulePassManager
PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level,
bool DebugLogging) {
assert(Level != O0 && "Must request optimizations for the default pipeline!");
@@ -550,13 +861,174 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
assert(Level != O0 && "Must request optimizations for the default pipeline!");
ModulePassManager MPM(DebugLogging);
- // FIXME: Finish fleshing this out to match the legacy LTO pipelines.
- FunctionPassManager LateFPM(DebugLogging);
- LateFPM.addPass(InstCombinePass());
- LateFPM.addPass(SimplifyCFGPass());
+ // Remove unused virtual tables to improve the quality of code generated by
+ // whole-program devirtualization and bitset lowering.
+ MPM.addPass(GlobalDCEPass());
- MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));
+ // Force any function attributes we want the rest of the pipeline to observe.
+ MPM.addPass(ForceFunctionAttrsPass());
+ // Do basic inference of function attributes from known properties of system
+ // libraries and other oracles.
+ MPM.addPass(InferFunctionAttrsPass());
+
+ if (Level > 1) {
+ // Indirect call promotion. This should promote all the targets that are
+ // left by the earlier promotion pass that promotes intra-module targets.
+ // This two-step promotion saves compile time. For LTO, it should produce
+ // the same result as if we only did promotion here.
+ MPM.addPass(PGOIndirectCallPromotion(
+ true /* InLTO */, PGOOpt && !PGOOpt->SampleProfileFile.empty()));
+
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ MPM.addPass(IPSCCPPass());
+ }
+
+ // Now deduce any function attributes based on the current code.
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ PostOrderFunctionAttrsPass()));
+
+ // Do RPO function attribute inference across the module to forward-propagate
+ // attributes where applicable.
+ // FIXME: Is this really an optimization rather than a canonicalization?
+ MPM.addPass(ReversePostOrderFunctionAttrsPass());
+
+ // Use inrange annotations on GEP indices to split globals where beneficial.
+ MPM.addPass(GlobalSplitPass());
+
+ // Run whole-program optimization of virtual calls when the list of callees
+ // is fixed.
+ MPM.addPass(WholeProgramDevirtPass());
+
+ // Stop here at -O1.
+ if (Level == 1)
+ return MPM;
+
+ // Optimize globals to try and fold them into constants.
+ MPM.addPass(GlobalOptPass());
+
+ // Promote any localized globals to SSA registers.
+ MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
+
+ // Linking modules together can lead to duplicate global constants; only
+ // keep one copy of each constant.
+ MPM.addPass(ConstantMergePass());
+
+ // Remove unused arguments from functions.
+ MPM.addPass(DeadArgumentEliminationPass());
+
+ // Reduce the code after globalopt and ipsccp. Both can open up significant
+ // simplification opportunities, and both can propagate functions through
+ // function pointers. When this happens, we often have to resolve varargs
+ // calls, etc, so let instcombine do this.
+ FunctionPassManager PeepholeFPM(DebugLogging);
+ PeepholeFPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(PeepholeFPM, Level);
+
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM)));
+
+ // Note: historically, the PruneEH pass was run first to deduce nounwind and
+ // generally clean up exception handling overhead. It isn't clear this is
+ // valuable as the inliner doesn't currently care whether it is inlining an
+ // invoke or a call.
+ // Run the inliner now.
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ InlinerPass(getInlineParamsFromOptLevel(Level))));
+
+ // Optimize globals again after we ran the inliner.
+ MPM.addPass(GlobalOptPass());
+
+ // Garbage collect dead functions.
+ // FIXME: Add ArgumentPromotion pass after once it's ported.
+ MPM.addPass(GlobalDCEPass());
+
+ FunctionPassManager FPM(DebugLogging);
+ // The IPO Passes may leave cruft around. Clean up after them.
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ FPM.addPass(JumpThreadingPass());
+
+ // Break up allocas
+ FPM.addPass(SROA());
+
+ // Run a few AA-driven optimizations here and now to clean up the code.
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ PostOrderFunctionAttrsPass()));
+ // FIXME: here we run IP alias analysis in the legacy PM.
+
+ FunctionPassManager MainFPM;
+
+ // FIXME: once we fix the LoopPassManager, add LICM here.
+ // FIXME: once we provide support for enabling MLSM, add it here.
+ if (RunNewGVN)
+ MainFPM.addPass(NewGVNPass());
+ else
+ MainFPM.addPass(GVN());
+
+ // Remove dead memcpy() calls.
+ MainFPM.addPass(MemCpyOptPass());
+
+ // Nuke dead stores.
+ MainFPM.addPass(DSEPass());
+
+ // FIXME: at this point, we run a bunch of loop passes:
+ // indVarSimplify, loopDeletion, loopInterchange, loopUnroll,
+ // loopVectorize. Enable them once the remaining issues with the LPM
+ // are sorted out.
+
+ MainFPM.addPass(InstCombinePass());
+ MainFPM.addPass(SimplifyCFGPass());
+ MainFPM.addPass(SCCPPass());
+ MainFPM.addPass(InstCombinePass());
+ MainFPM.addPass(BDCEPass());
+
+ // FIXME: We may want to run SLPVectorizer here.
+ // After vectorization, assume intrinsics may tell us more
+ // about pointer alignments.
+#if 0
+ MainFPM.add(AlignmentFromAssumptionsPass());
+#endif
+
+ // FIXME: Conditionally run LoadCombine here, after it's ported
+ // (in case we still have this pass, given its questionable usefulness).
+
+ MainFPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(MainFPM, Level);
+ MainFPM.addPass(JumpThreadingPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM)));
+
+ // Create a function that performs CFI checks for cross-DSO calls with
+ // targets in the current module.
+ MPM.addPass(CrossDSOCFIPass());
+
+ // Lower type metadata and the type.test intrinsic. This pass supports
+ // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
+ // to be run at link time if CFI is enabled. This pass does nothing if
+ // CFI is disabled.
+ // Enable once we add support for the summary in the new PM.
+#if 0
+ MPM.addPass(LowerTypeTestsPass(Summary ? PassSummaryAction::Export :
+ PassSummaryAction::None,
+ Summary));
+#endif
+
+ // Add late LTO optimization passes.
+ // Delete basic blocks, which optimization passes may have killed.
+ MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass()));
+
+ // Drop bodies of available-externally objects to improve GlobalDCE.
+ MPM.addPass(EliminateAvailableExternallyPass());
+
+ // Now that we have optimized the program, discard unreachable functions.
+ MPM.addPass(GlobalDCEPass());
+
+ // FIXME: Maybe enable MergeFuncs conditionally, once it's ported.
return MPM;
}
@@ -579,12 +1051,8 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
// Add support for querying global aliasing information when available.
// Because the `AAManager` is a function analysis and `GlobalsAA` is a module
// analysis, all that the `AAManager` can do is query for any *cached*
- // results from `GlobalsAA` through a readonly proxy..
-#if 0
- // FIXME: Enable once the invalidation logic supports this. Currently, the
- // `AAManager` will hold stale references to the module analyses.
+ // results from `GlobalsAA` through a readonly proxy.
AA.registerModuleAnalysis<GlobalsAA>();
-#endif
return AA;
}
@@ -607,9 +1075,36 @@ static Optional<int> parseDevirtPassName(StringRef Name) {
return Count;
}
-static bool isModulePassName(StringRef Name) {
+/// Tests whether a pass name starts with a valid prefix for a default pipeline
+/// alias.
+static bool startsWithDefaultPipelineAliasPrefix(StringRef Name) {
+ return Name.startswith("default") || Name.startswith("thinlto") ||
+ Name.startswith("lto");
+}
+
+/// Tests whether registered callbacks will accept a given pass name.
+///
+/// When parsing a pipeline text, the type of the outermost pipeline may be
+/// omitted, in which case the type is automatically determined from the first
+/// pass name in the text. This may be a name that is handled through one of the
+/// callbacks. We check this through the ordinary parsing callbacks by setting
+/// up a dummy PassManager in order to not force the client to also handle this
+/// type of query.
+template <typename PassManagerT, typename CallbacksT>
+static bool callbacksAcceptPassName(StringRef Name, CallbacksT &Callbacks) {
+ if (!Callbacks.empty()) {
+ PassManagerT DummyPM;
+ for (auto &CB : Callbacks)
+ if (CB(Name, DummyPM, {}))
+ return true;
+ }
+ return false;
+}
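Editor's note: the dummy-PassManager trick above lets pipeline type inference consult the same parsing callbacks that clients register. A hedged registration sketch, where "my-pass" and MyPass are hypothetical:

  PB.registerPipelineParsingCallback(
      [](StringRef Name, FunctionPassManager &FPM,
         ArrayRef<PassBuilder::PipelineElement> /*InnerPipeline*/) {
        if (Name != "my-pass")
          return false;
        FPM.addPass(MyPass()); // MyPass is hypothetical
        return true;
      });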
+
+template <typename CallbacksT>
+static bool isModulePassName(StringRef Name, CallbacksT &Callbacks) {
// Manually handle aliases for pre-configured pipeline fragments.
- if (Name.startswith("default") || Name.startswith("lto"))
+ if (startsWithDefaultPipelineAliasPrefix(Name))
return DefaultAliasRegex.match(Name);
// Explicitly handle pass manager names.
@@ -632,10 +1127,11 @@ static bool isModulePassName(StringRef Name) {
return true;
#include "PassRegistry.def"
- return false;
+ return callbacksAcceptPassName<ModulePassManager>(Name, Callbacks);
}
-static bool isCGSCCPassName(StringRef Name) {
+template <typename CallbacksT>
+static bool isCGSCCPassName(StringRef Name, CallbacksT &Callbacks) {
// Explicitly handle pass manager names.
if (Name == "cgscc")
return true;
@@ -656,10 +1152,11 @@ static bool isCGSCCPassName(StringRef Name) {
return true;
#include "PassRegistry.def"
- return false;
+ return callbacksAcceptPassName<CGSCCPassManager>(Name, Callbacks);
}
-static bool isFunctionPassName(StringRef Name) {
+template <typename CallbacksT>
+static bool isFunctionPassName(StringRef Name, CallbacksT &Callbacks) {
// Explicitly handle pass manager names.
if (Name == "function")
return true;
@@ -678,10 +1175,11 @@ static bool isFunctionPassName(StringRef Name) {
return true;
#include "PassRegistry.def"
- return false;
+ return callbacksAcceptPassName<FunctionPassManager>(Name, Callbacks);
}
-static bool isLoopPassName(StringRef Name) {
+template <typename CallbacksT>
+static bool isLoopPassName(StringRef Name, CallbacksT &Callbacks) {
// Explicitly handle pass manager names.
if (Name == "loop")
return true;
@@ -698,7 +1196,7 @@ static bool isLoopPassName(StringRef Name) {
return true;
#include "PassRegistry.def"
- return false;
+ return callbacksAcceptPassName<LoopPassManager>(Name, Callbacks);
}
Optional<std::vector<PassBuilder::PipelineElement>>
@@ -799,30 +1297,39 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
MPM.addPass(createRepeatedPass(*Count, std::move(NestedMPM)));
return true;
}
+
+ for (auto &C : ModulePipelineParsingCallbacks)
+ if (C(Name, MPM, InnerPipeline))
+ return true;
+
// Normal passes can't have pipelines.
return false;
}
// Manually handle aliases for pre-configured pipeline fragments.
- if (Name.startswith("default") || Name.startswith("lto")) {
+ if (startsWithDefaultPipelineAliasPrefix(Name)) {
SmallVector<StringRef, 3> Matches;
if (!DefaultAliasRegex.match(Name, &Matches))
return false;
assert(Matches.size() == 3 && "Must capture two matched strings!");
OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2])
- .Case("O0", O0)
- .Case("O1", O1)
- .Case("O2", O2)
- .Case("O3", O3)
- .Case("Os", Os)
- .Case("Oz", Oz);
+ .Case("O0", O0)
+ .Case("O1", O1)
+ .Case("O2", O2)
+ .Case("O3", O3)
+ .Case("Os", Os)
+ .Case("Oz", Oz);
if (L == O0)
// At O0 we do nothing at all!
return true;
if (Matches[1] == "default") {
MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
+ } else if (Matches[1] == "thinlto-pre-link") {
+ MPM.addPass(buildThinLTOPreLinkDefaultPipeline(L, DebugLogging));
+ } else if (Matches[1] == "thinlto") {
+ MPM.addPass(buildThinLTODefaultPipeline(L, DebugLogging));
} else if (Matches[1] == "lto-pre-link") {
MPM.addPass(buildLTOPreLinkDefaultPipeline(L, DebugLogging));
} else {
@@ -852,6 +1359,9 @@ bool PassBuilder::parseModulePass(ModulePassManager &MPM,
}
#include "PassRegistry.def"
+ for (auto &C : ModulePipelineParsingCallbacks)
+ if (C(Name, MPM, InnerPipeline))
+ return true;
return false;
}
@@ -899,11 +1409,16 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
*MaxRepetitions, DebugLogging));
return true;
}
+
+ for (auto &C : CGSCCPipelineParsingCallbacks)
+ if (C(Name, CGPM, InnerPipeline))
+ return true;
+
// Normal passes can't have pipelines.
return false;
}
- // Now expand the basic registered passes from the .inc file.
+// Now expand the basic registered passes from the .inc file.
#define CGSCC_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
CGPM.addPass(CREATE_PASS); \
@@ -924,6 +1439,9 @@ bool PassBuilder::parseCGSCCPass(CGSCCPassManager &CGPM,
}
#include "PassRegistry.def"
+ for (auto &C : CGSCCPipelineParsingCallbacks)
+ if (C(Name, CGPM, InnerPipeline))
+ return true;
return false;
}
@@ -961,11 +1479,16 @@ bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
FPM.addPass(createRepeatedPass(*Count, std::move(NestedFPM)));
return true;
}
+
+ for (auto &C : FunctionPipelineParsingCallbacks)
+ if (C(Name, FPM, InnerPipeline))
+ return true;
+
// Normal passes can't have pipelines.
return false;
}
- // Now expand the basic registered passes from the .inc file.
+// Now expand the basic registered passes from the .inc file.
#define FUNCTION_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
FPM.addPass(CREATE_PASS); \
@@ -985,6 +1508,9 @@ bool PassBuilder::parseFunctionPass(FunctionPassManager &FPM,
}
#include "PassRegistry.def"
+ for (auto &C : FunctionPipelineParsingCallbacks)
+ if (C(Name, FPM, InnerPipeline))
+ return true;
return false;
}
@@ -1012,11 +1538,16 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
LPM.addPass(createRepeatedPass(*Count, std::move(NestedLPM)));
return true;
}
+
+ for (auto &C : LoopPipelineParsingCallbacks)
+ if (C(Name, LPM, InnerPipeline))
+ return true;
+
// Normal passes can't have pipelines.
return false;
}
- // Now expand the basic registered passes from the .inc file.
+// Now expand the basic registered passes from the .inc file.
#define LOOP_PASS(NAME, CREATE_PASS) \
if (Name == NAME) { \
LPM.addPass(CREATE_PASS); \
@@ -1037,6 +1568,9 @@ bool PassBuilder::parseLoopPass(LoopPassManager &LPM, const PipelineElement &E,
}
#include "PassRegistry.def"
+ for (auto &C : LoopPipelineParsingCallbacks)
+ if (C(Name, LPM, InnerPipeline))
+ return true;
return false;
}
@@ -1055,6 +1589,9 @@ bool PassBuilder::parseAAPassName(AAManager &AA, StringRef Name) {
}
#include "PassRegistry.def"
+ for (auto &C : AAParsingCallbacks)
+ if (C(Name, AA))
+ return true;
return false;
}
@@ -1121,7 +1658,7 @@ bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM,
return true;
}
-// Primary pass pipeline description parsing routine.
+// Primary pass pipeline description parsing routine for a \c ModulePassManager
// FIXME: Should this routine accept a TargetMachine or require the caller to
// pre-populate the analysis managers with target-specific stuff?
bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
@@ -1135,21 +1672,70 @@ bool PassBuilder::parsePassPipeline(ModulePassManager &MPM,
// automatically.
StringRef FirstName = Pipeline->front().Name;
- if (!isModulePassName(FirstName)) {
- if (isCGSCCPassName(FirstName))
+ if (!isModulePassName(FirstName, ModulePipelineParsingCallbacks)) {
+ if (isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks)) {
Pipeline = {{"cgscc", std::move(*Pipeline)}};
- else if (isFunctionPassName(FirstName))
+ } else if (isFunctionPassName(FirstName,
+ FunctionPipelineParsingCallbacks)) {
Pipeline = {{"function", std::move(*Pipeline)}};
- else if (isLoopPassName(FirstName))
+ } else if (isLoopPassName(FirstName, LoopPipelineParsingCallbacks)) {
Pipeline = {{"function", {{"loop", std::move(*Pipeline)}}}};
- else
+ } else {
+ for (auto &C : TopLevelPipelineParsingCallbacks)
+ if (C(MPM, *Pipeline, VerifyEachPass, DebugLogging))
+ return true;
+
// Unknown pass name!
return false;
+ }
}
return parseModulePassPipeline(MPM, *Pipeline, VerifyEachPass, DebugLogging);
}
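Editor's note: because "instcombine" names a function pass, the auto-nesting above means the following two calls parse the same pipeline (a sketch; error handling omitted):

  #include "llvm/Passes/PassBuilder.h"
  using namespace llvm;

  void parseExample() {
    PassBuilder PB;
    ModulePassManager MPM;
    // The first string is implicitly wrapped as the second before parsing.
    PB.parsePassPipeline(MPM, "instcombine,simplify-cfg",
                         /*VerifyEachPass=*/false, /*DebugLogging=*/false);
    PB.parsePassPipeline(MPM, "function(instcombine,simplify-cfg)",
                         /*VerifyEachPass=*/false, /*DebugLogging=*/false);
  }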
+// Primary pass pipeline description parsing routine for a \c CGSCCPassManager
+bool PassBuilder::parsePassPipeline(CGSCCPassManager &CGPM,
+ StringRef PipelineText, bool VerifyEachPass,
+ bool DebugLogging) {
+ auto Pipeline = parsePipelineText(PipelineText);
+ if (!Pipeline || Pipeline->empty())
+ return false;
+
+ StringRef FirstName = Pipeline->front().Name;
+ if (!isCGSCCPassName(FirstName, CGSCCPipelineParsingCallbacks))
+ return false;
+
+ return parseCGSCCPassPipeline(CGPM, *Pipeline, VerifyEachPass, DebugLogging);
+}
+
+// Primary pass pipeline description parsing routine for a \c
+// FunctionPassManager
+bool PassBuilder::parsePassPipeline(FunctionPassManager &FPM,
+ StringRef PipelineText, bool VerifyEachPass,
+ bool DebugLogging) {
+ auto Pipeline = parsePipelineText(PipelineText);
+ if (!Pipeline || Pipeline->empty())
+ return false;
+
+ StringRef FirstName = Pipeline->front().Name;
+ if (!isFunctionPassName(FirstName, FunctionPipelineParsingCallbacks))
+ return false;
+
+ return parseFunctionPassPipeline(FPM, *Pipeline, VerifyEachPass,
+ DebugLogging);
+}
+
+// Primary pass pipeline description parsing routine for a \c LoopPassManager
+bool PassBuilder::parsePassPipeline(LoopPassManager &LPM,
+                                    StringRef PipelineText, bool VerifyEachPass,
+                                    bool DebugLogging) {
+  auto Pipeline = parsePipelineText(PipelineText);
+  if (!Pipeline || Pipeline->empty())
+    return false;
+
+  return parseLoopPassPipeline(LPM, *Pipeline, VerifyEachPass, DebugLogging);
+}
+
bool PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
// If the pipeline just consists of the word 'default' just replace the AA
// manager with our default one.
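
The same extension pattern on the function-pass side, combined with the new
parsePassPipeline overloads added above. A sketch assuming the registration API
implied by FunctionPipelineParsingCallbacks (pass name and type are
hypothetical):

    PassBuilder PB;
    PB.registerPipelineParsingCallback(
        [](StringRef Name, FunctionPassManager &FPM,
           ArrayRef<PassBuilder::PipelineElement>) {
          if (Name == "my-pass") {   // hypothetical out-of-tree pass
            FPM.addPass(MyPass());   // hypothetical pass type
            return true;
          }
          return false;
        });
    FunctionPassManager FPM;
    PB.parsePassPipeline(FPM, "my-pass", /*VerifyEachPass=*/false,
                         /*DebugLogging=*/false);
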
diff --git a/contrib/llvm/lib/Passes/PassRegistry.def b/contrib/llvm/lib/Passes/PassRegistry.def
index a9939fd..d59ec7f 100644
--- a/contrib/llvm/lib/Passes/PassRegistry.def
+++ b/contrib/llvm/lib/Passes/PassRegistry.def
@@ -85,6 +85,7 @@ CGSCC_ANALYSIS("fam-proxy", FunctionAnalysisManagerCGSCCProxy())
#ifndef CGSCC_PASS
#define CGSCC_PASS(NAME, CREATE_PASS)
#endif
+CGSCC_PASS("argpromotion", ArgumentPromotionPass())
CGSCC_PASS("invalidate<all>", InvalidateAllAnalysesPass())
CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass())
CGSCC_PASS("inline", InlinerPass())
@@ -159,6 +160,7 @@ FUNCTION_PASS("lower-guard-intrinsic", LowerGuardIntrinsicPass())
FUNCTION_PASS("guard-widening", GuardWideningPass())
FUNCTION_PASS("gvn", GVN())
FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
+FUNCTION_PASS("loop-sink", LoopSinkPass())
FUNCTION_PASS("lowerinvoke", LowerInvokePass())
FUNCTION_PASS("mem2reg", PromotePass())
FUNCTION_PASS("memcpyopt", MemCpyOptPass())
@@ -169,8 +171,10 @@ FUNCTION_PASS("jump-threading", JumpThreadingPass())
FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
FUNCTION_PASS("lcssa", LCSSAPass())
FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
+FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
FUNCTION_PASS("loop-distribute", LoopDistributePass())
FUNCTION_PASS("loop-vectorize", LoopVectorizePass())
+FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(dbgs()))
@@ -223,7 +227,10 @@ LOOP_PASS("loop-deletion", LoopDeletionPass())
LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass())
LOOP_PASS("strength-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
-LOOP_PASS("unroll", LoopUnrollPass())
+LOOP_PASS("unroll", LoopUnrollPass::create())
+LOOP_PASS("unroll-full", LoopUnrollPass::createFull())
+LOOP_PASS("unswitch", SimpleLoopUnswitchPass())
LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
LOOP_PASS("print<ivusers>", IVUsersPrinterPass(dbgs()))
+LOOP_PASS("loop-predication", LoopPredicationPass())
#undef LOOP_PASS
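
Each new entry makes the quoted string a valid element of a textual pipeline.
For example, the loop passes registered above can be driven through the new
LoopPassManager overload of parsePassPipeline; a sketch with analysis-manager
setup elided:

    PassBuilder PB;
    LoopPassManager LPM;
    PB.parsePassPipeline(LPM, "unroll-full,loop-predication",
                         /*VerifyEachPass=*/false, /*DebugLogging=*/false);
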
diff --git a/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 6d907c7..8c5f136 100644
--- a/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/contrib/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMapping.cpp - Code coverage mapping support ---------*- C++ -*-=//
+//===- CoverageMapping.cpp - Code coverage mapping support ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,17 +13,31 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/Coverage/CoverageMapping.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace coverage;
@@ -40,26 +54,26 @@ Counter CounterExpressionBuilder::get(const CounterExpression &E) {
return Counter::getExpression(I);
}
-void CounterExpressionBuilder::extractTerms(
- Counter C, int Sign, SmallVectorImpl<std::pair<unsigned, int>> &Terms) {
+void CounterExpressionBuilder::extractTerms(Counter C, int Factor,
+ SmallVectorImpl<Term> &Terms) {
switch (C.getKind()) {
case Counter::Zero:
break;
case Counter::CounterValueReference:
- Terms.push_back(std::make_pair(C.getCounterID(), Sign));
+ Terms.emplace_back(C.getCounterID(), Factor);
break;
case Counter::Expression:
const auto &E = Expressions[C.getExpressionID()];
- extractTerms(E.LHS, Sign, Terms);
- extractTerms(E.RHS, E.Kind == CounterExpression::Subtract ? -Sign : Sign,
- Terms);
+ extractTerms(E.LHS, Factor, Terms);
+ extractTerms(
+ E.RHS, E.Kind == CounterExpression::Subtract ? -Factor : Factor, Terms);
break;
}
}
Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) {
// Gather constant terms.
- llvm::SmallVector<std::pair<unsigned, int>, 32> Terms;
+ SmallVector<Term, 32> Terms;
extractTerms(ExpressionTree, +1, Terms);
// If there are no terms, this is just a zero. The algorithm below assumes at
@@ -68,17 +82,15 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) {
return Counter::getZero();
// Group the terms by counter ID.
- std::sort(Terms.begin(), Terms.end(),
- [](const std::pair<unsigned, int> &LHS,
- const std::pair<unsigned, int> &RHS) {
- return LHS.first < RHS.first;
+ std::sort(Terms.begin(), Terms.end(), [](const Term &LHS, const Term &RHS) {
+ return LHS.CounterID < RHS.CounterID;
});
// Combine terms by counter ID to eliminate counters that sum to zero.
auto Prev = Terms.begin();
for (auto I = Prev + 1, E = Terms.end(); I != E; ++I) {
- if (I->first == Prev->first) {
- Prev->second += I->second;
+ if (I->CounterID == Prev->CounterID) {
+ Prev->Factor += I->Factor;
continue;
}
++Prev;
@@ -89,24 +101,24 @@ Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) {
Counter C;
// Create additions. We do this before subtractions to avoid constructs like
// ((0 - X) + Y), as opposed to (Y - X).
- for (auto Term : Terms) {
- if (Term.second <= 0)
+ for (auto T : Terms) {
+ if (T.Factor <= 0)
continue;
- for (int I = 0; I < Term.second; ++I)
+ for (int I = 0; I < T.Factor; ++I)
if (C.isZero())
- C = Counter::getCounter(Term.first);
+ C = Counter::getCounter(T.CounterID);
else
C = get(CounterExpression(CounterExpression::Add, C,
- Counter::getCounter(Term.first)));
+ Counter::getCounter(T.CounterID)));
}
// Create subtractions.
- for (auto Term : Terms) {
- if (Term.second >= 0)
+ for (auto T : Terms) {
+ if (T.Factor >= 0)
continue;
- for (int I = 0; I < -Term.second; ++I)
+ for (int I = 0; I < -T.Factor; ++I)
C = get(CounterExpression(CounterExpression::Subtract, C,
- Counter::getCounter(Term.first)));
+ Counter::getCounter(T.CounterID)));
}
return C;
}
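
The switch from Sign to Factor does not change the algorithm, but it makes the
cancellation easy to trace. A worked sketch (add and subtract both run the
result through simplify, as the subtract hunk below shows):

    CounterExpressionBuilder Builder;
    Counter C0 = Counter::getCounter(0);
    Counter C1 = Counter::getCounter(1);
    // (C0 + C1) - C1: extractTerms yields {0,+1}, {1,+1}, {1,-1}; after
    // sorting and merging by CounterID, C1's factors sum to zero and drop out.
    Counter R = Builder.subtract(Builder.add(C0, C1), C1);
    // R is plain Counter::getCounter(0), not an expression tree.
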
@@ -120,8 +132,7 @@ Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS) {
get(CounterExpression(CounterExpression::Subtract, LHS, RHS)));
}
-void CounterMappingContext::dump(const Counter &C,
- llvm::raw_ostream &OS) const {
+void CounterMappingContext::dump(const Counter &C, raw_ostream &OS) const {
switch (C.getKind()) {
case Counter::Zero:
OS << '0';
@@ -145,7 +156,7 @@ void CounterMappingContext::dump(const Counter &C,
return;
Expected<int64_t> Value = evaluate(C);
if (auto E = Value.takeError()) {
- llvm::consumeError(std::move(E));
+ consumeError(std::move(E));
return;
}
OS << '[' << *Value << ']';
@@ -187,6 +198,9 @@ Error CoverageMapping::loadFunctionRecord(
const CoverageMappingRecord &Record,
IndexedInstrProfReader &ProfileReader) {
StringRef OrigFuncName = Record.FunctionName;
+ if (OrigFuncName.empty())
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+
if (Record.Filenames.empty())
OrigFuncName = getFuncNameWithoutPrefix(OrigFuncName);
else
@@ -217,7 +231,7 @@ Error CoverageMapping::loadFunctionRecord(
for (const auto &Region : Record.MappingRegions) {
Expected<int64_t> ExecutionCount = Ctx.evaluate(Region.Count);
if (auto E = ExecutionCount.takeError()) {
- llvm::consumeError(std::move(E));
+ consumeError(std::move(E));
return Error::success();
}
Function.pushRegion(Region, *ExecutionCount);
@@ -231,18 +245,6 @@ Error CoverageMapping::loadFunctionRecord(
return Error::success();
}
-Expected<std::unique_ptr<CoverageMapping>>
-CoverageMapping::load(CoverageMappingReader &CoverageReader,
- IndexedInstrProfReader &ProfileReader) {
- auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
-
- for (const auto &Record : CoverageReader)
- if (Error E = Coverage->loadFunctionRecord(Record, ProfileReader))
- return std::move(E);
-
- return std::move(Coverage);
-}
-
Expected<std::unique_ptr<CoverageMapping>> CoverageMapping::load(
ArrayRef<std::unique_ptr<CoverageMappingReader>> CoverageReaders,
IndexedInstrProfReader &ProfileReader) {
@@ -281,13 +283,14 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
}
namespace {
+
/// \brief Distributes functions into instantiation sets.
///
/// An instantiation set is a collection of functions that have the same source
/// code, i.e., template function specializations.
class FunctionInstantiationSetCollector {
- typedef DenseMap<std::pair<unsigned, unsigned>,
- std::vector<const FunctionRecord *>> MapT;
+ using MapT = DenseMap<std::pair<unsigned, unsigned>,
+ std::vector<const FunctionRecord *>>;
MapT InstantiatedFunctions;
public:
@@ -301,7 +304,6 @@ public:
}
MapT::iterator begin() { return InstantiatedFunctions.begin(); }
-
MapT::iterator end() { return InstantiatedFunctions.end(); }
};
@@ -326,7 +328,7 @@ class SegmentBuilder {
Segments.pop_back();
DEBUG(dbgs() << "Segment at " << Line << ":" << Col);
// Set this region's count.
- if (Region.Kind != coverage::CounterMappingRegion::SkippedRegion) {
+ if (Region.Kind != CounterMappingRegion::SkippedRegion) {
DEBUG(dbgs() << " with count " << Region.ExecutionCount);
Segments.emplace_back(Line, Col, Region.ExecutionCount, IsRegionEntry);
} else
@@ -380,10 +382,10 @@ class SegmentBuilder {
// in combineRegions(). Because we accumulate counter values only from
// regions of the same kind as the first region of the area, prefer
// CodeRegion to ExpansionRegion and ExpansionRegion to SkippedRegion.
- static_assert(coverage::CounterMappingRegion::CodeRegion <
- coverage::CounterMappingRegion::ExpansionRegion &&
- coverage::CounterMappingRegion::ExpansionRegion <
- coverage::CounterMappingRegion::SkippedRegion,
+ static_assert(CounterMappingRegion::CodeRegion <
+ CounterMappingRegion::ExpansionRegion &&
+ CounterMappingRegion::ExpansionRegion <
+ CounterMappingRegion::SkippedRegion,
"Unexpected order of region kind values");
return LHS.Kind < RHS.Kind;
});
@@ -437,7 +439,8 @@ public:
return Segments;
}
};
-}
+
+} // end anonymous namespace
std::vector<StringRef> CoverageMapping::getUniqueSourceFiles() const {
std::vector<StringRef> Filenames;
@@ -487,7 +490,7 @@ static bool isExpansion(const CountedRegion &R, unsigned FileID) {
CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const {
CoverageData FileCoverage(Filename);
- std::vector<coverage::CountedRegion> Regions;
+ std::vector<CountedRegion> Regions;
for (const auto &Function : Functions) {
auto MainFileID = findMainViewFileID(Filename, Function);
@@ -533,7 +536,7 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const {
return CoverageData();
CoverageData FunctionCoverage(Function.Filenames[*MainFileID]);
- std::vector<coverage::CountedRegion> Regions;
+ std::vector<CountedRegion> Regions;
for (const auto &CR : Function.CountedRegions)
if (CR.FileID == *MainFileID) {
Regions.push_back(CR);
@@ -551,7 +554,7 @@ CoverageData CoverageMapping::getCoverageForExpansion(
const ExpansionRecord &Expansion) const {
CoverageData ExpansionCoverage(
Expansion.Function.Filenames[Expansion.FileID]);
- std::vector<coverage::CountedRegion> Regions;
+ std::vector<CountedRegion> Regions;
for (const auto &CR : Expansion.Function.CountedRegions)
if (CR.FileID == Expansion.FileID) {
Regions.push_back(CR);
@@ -566,8 +569,7 @@ CoverageData CoverageMapping::getCoverageForExpansion(
return ExpansionCoverage;
}
-namespace {
-std::string getCoverageMapErrString(coveragemap_error Err) {
+static std::string getCoverageMapErrString(coveragemap_error Err) {
switch (Err) {
case coveragemap_error::success:
return "Success";
@@ -585,6 +587,8 @@ std::string getCoverageMapErrString(coveragemap_error Err) {
llvm_unreachable("A value of coveragemap_error has no message.");
}
+namespace {
+
// FIXME: This class is only here to support the transition to llvm::Error. It
// will be removed once this transition is complete. Clients should prefer to
// deal with the Error value directly, rather than converting to error_code.
@@ -594,6 +598,7 @@ class CoverageMappingErrorCategoryType : public std::error_category {
return getCoverageMapErrString(static_cast<coveragemap_error>(IE));
}
};
+
} // end anonymous namespace
std::string CoverageMapError::message() const {
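
With the single-reader CoverageMapping::load removed above, all loading goes
through the ArrayRef-of-readers overload. A minimal sketch, assuming Reader and
ProfileReader were created earlier:

    std::vector<std::unique_ptr<CoverageMappingReader>> Readers;
    Readers.push_back(std::move(Reader));
    auto CoverageOrErr = CoverageMapping::load(Readers, ProfileReader);
    if (!CoverageOrErr)
      return CoverageOrErr.takeError();   // propagate the load failure
    std::unique_ptr<CoverageMapping> Coverage = std::move(*CoverageOrErr);
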
diff --git a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index a6c7031..fff0a03 100644
--- a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMappingReader.cpp - Code coverage mapping reader ----*- C++ -*-=//
+//===- CoverageMappingReader.cpp - Code coverage mapping reader -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,14 +13,34 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/Error.h"
#include "llvm/Object/MachOUniversal.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace coverage;
@@ -42,7 +62,7 @@ void CoverageMappingIterator::increment() {
}
Error RawCoverageReader::readULEB128(uint64_t &Result) {
- if (Data.size() < 1)
+ if (Data.empty())
return make_error<CoverageMapError>(coveragemap_error::truncated);
unsigned N = 0;
Result = decodeULEB128(reinterpret_cast<const uint8_t *>(Data.data()), &N);
@@ -226,9 +246,8 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
}
Error RawCoverageMappingReader::read() {
-
// Read the virtual file mapping.
- llvm::SmallVector<unsigned, 8> VirtualFileMapping;
+ SmallVector<unsigned, 8> VirtualFileMapping;
uint64_t NumFileMappings;
if (auto Err = readSize(NumFileMappings))
return Err;
@@ -349,7 +368,10 @@ static Expected<bool> isCoverageMappingDummy(uint64_t Hash, StringRef Mapping) {
}
namespace {
+
struct CovMapFuncRecordReader {
+ virtual ~CovMapFuncRecordReader() = default;
+
// The interface to read coverage mapping function records for a module.
//
// \p Buf points to the buffer containing the \c CovHeader of the coverage
@@ -359,26 +381,24 @@ struct CovMapFuncRecordReader {
// greater than \p End if not.
virtual Expected<const char *> readFunctionRecords(const char *Buf,
const char *End) = 0;
- virtual ~CovMapFuncRecordReader() {}
+
template <class IntPtrT, support::endianness Endian>
static Expected<std::unique_ptr<CovMapFuncRecordReader>>
- get(coverage::CovMapVersion Version, InstrProfSymtab &P,
+ get(CovMapVersion Version, InstrProfSymtab &P,
std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
std::vector<StringRef> &F);
};
// A class for reading coverage mapping function records for a module.
-template <coverage::CovMapVersion Version, class IntPtrT,
- support::endianness Endian>
+template <CovMapVersion Version, class IntPtrT, support::endianness Endian>
class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
- typedef typename coverage::CovMapTraits<
- Version, IntPtrT>::CovMapFuncRecordType FuncRecordType;
- typedef typename coverage::CovMapTraits<Version, IntPtrT>::NameRefType
- NameRefType;
+ using FuncRecordType =
+ typename CovMapTraits<Version, IntPtrT>::CovMapFuncRecordType;
+ using NameRefType = typename CovMapTraits<Version, IntPtrT>::NameRefType;
// Maps function's name references to the indexes of their records
// in \c Records.
- llvm::DenseMap<NameRefType, size_t> FunctionRecords;
+ DenseMap<NameRefType, size_t> FunctionRecords;
InstrProfSymtab &ProfileNames;
std::vector<StringRef> &Filenames;
std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records;
@@ -399,6 +419,8 @@ class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
StringRef FuncName;
if (Error Err = CFR->template getFuncName<Endian>(ProfileNames, FuncName))
return Err;
+ if (FuncName.empty())
+ return make_error<InstrProfError>(instrprof_error::malformed);
Records.emplace_back(Version, FuncName, FuncHash, Mapping, FilenamesBegin,
Filenames.size() - FilenamesBegin);
return Error::success();
@@ -432,14 +454,16 @@ public:
std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
std::vector<StringRef> &F)
: ProfileNames(P), Filenames(F), Records(R) {}
- ~VersionedCovMapFuncRecordReader() override {}
+
+ ~VersionedCovMapFuncRecordReader() override = default;
Expected<const char *> readFunctionRecords(const char *Buf,
const char *End) override {
using namespace support;
+
if (Buf + sizeof(CovMapHeader) > End)
return make_error<CoverageMapError>(coveragemap_error::malformed);
- auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf);
+ auto CovHeader = reinterpret_cast<const CovMapHeader *>(Buf);
uint32_t NRecords = CovHeader->getNRecords<Endian>();
uint32_t FilenamesSize = CovHeader->getFilenamesSize<Endian>();
uint32_t CoverageSize = CovHeader->getCoverageSize<Endian>();
@@ -490,14 +514,16 @@ public:
return Buf;
}
};
+
} // end anonymous namespace
template <class IntPtrT, support::endianness Endian>
Expected<std::unique_ptr<CovMapFuncRecordReader>> CovMapFuncRecordReader::get(
- coverage::CovMapVersion Version, InstrProfSymtab &P,
+ CovMapVersion Version, InstrProfSymtab &P,
std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
std::vector<StringRef> &F) {
using namespace coverage;
+
switch (Version) {
case CovMapVersion::Version1:
return llvm::make_unique<VersionedCovMapFuncRecordReader<
@@ -518,11 +544,12 @@ static Error readCoverageMappingData(
std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records,
std::vector<StringRef> &Filenames) {
using namespace coverage;
+
// Read the records in the coverage data section.
auto CovHeader =
- reinterpret_cast<const coverage::CovMapHeader *>(Data.data());
+ reinterpret_cast<const CovMapHeader *>(Data.data());
CovMapVersion Version = (CovMapVersion)CovHeader->getVersion<Endian>();
- if (Version > coverage::CovMapVersion::CurrentVersion)
+ if (Version > CovMapVersion::CurrentVersion)
return make_error<CoverageMapError>(coveragemap_error::unsupported_version);
Expected<std::unique_ptr<CovMapFuncRecordReader>> ReaderExpected =
CovMapFuncRecordReader::get<T, Endian>(Version, ProfileNames, Records,
@@ -538,6 +565,7 @@ static Error readCoverageMappingData(
}
return Error::success();
}
+
static const char *TestingFormatMagic = "llvmcovmtestdata";
static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames,
@@ -548,7 +576,7 @@ static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames,
Endian = support::endianness::little;
Data = Data.substr(StringRef(TestingFormatMagic).size());
- if (Data.size() < 1)
+ if (Data.empty())
return make_error<CoverageMapError>(coveragemap_error::truncated);
unsigned N = 0;
auto ProfileNamesSize =
@@ -556,7 +584,7 @@ static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames,
if (N > Data.size())
return make_error<CoverageMapError>(coveragemap_error::malformed);
Data = Data.substr(N);
- if (Data.size() < 1)
+ if (Data.empty())
return make_error<CoverageMapError>(coveragemap_error::truncated);
N = 0;
uint64_t Address =
@@ -570,7 +598,7 @@ static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames,
return E;
CoverageMapping = Data.substr(ProfileNamesSize);
// Skip the padding bytes because coverage map data has an alignment of 8.
- if (CoverageMapping.size() < 1)
+ if (CoverageMapping.empty())
return make_error<CoverageMapError>(coveragemap_error::truncated);
size_t Pad = alignmentAdjustment(CoverageMapping.data(), 8);
if (CoverageMapping.size() < Pad)
@@ -595,21 +623,21 @@ static Error loadBinaryFormat(MemoryBufferRef ObjectBuffer,
StringRef &CoverageMapping,
uint8_t &BytesInAddress,
support::endianness &Endian, StringRef Arch) {
- auto BinOrErr = object::createBinary(ObjectBuffer);
+ auto BinOrErr = createBinary(ObjectBuffer);
if (!BinOrErr)
return BinOrErr.takeError();
auto Bin = std::move(BinOrErr.get());
std::unique_ptr<ObjectFile> OF;
- if (auto *Universal = dyn_cast<object::MachOUniversalBinary>(Bin.get())) {
+ if (auto *Universal = dyn_cast<MachOUniversalBinary>(Bin.get())) {
// If we have a universal binary, try to look up the object for the
// appropriate architecture.
auto ObjectFileOrErr = Universal->getObjectForArch(Arch);
if (!ObjectFileOrErr)
return ObjectFileOrErr.takeError();
OF = std::move(ObjectFileOrErr.get());
- } else if (isa<object::ObjectFile>(Bin.get())) {
+ } else if (isa<ObjectFile>(Bin.get())) {
// For any other object file, upcast and take ownership.
- OF.reset(cast<object::ObjectFile>(Bin.release()));
+ OF.reset(cast<ObjectFile>(Bin.release()));
// If we've asked for a particular arch, make sure they match.
if (!Arch.empty() && OF->getArch() != Triple(Arch).getArch())
return errorCodeToError(object_error::arch_not_found);
@@ -623,11 +651,15 @@ static Error loadBinaryFormat(MemoryBufferRef ObjectBuffer,
: support::endianness::big;
// Look for the sections that we are interested in.
- auto NamesSection = lookupSection(*OF, getInstrProfNameSectionName(false));
+ auto ObjFormat = OF->getTripleObjectFormat();
+ auto NamesSection =
+ lookupSection(*OF, getInstrProfSectionName(IPSK_name, ObjFormat,
+ /*AddSegmentInfo=*/false));
if (auto E = NamesSection.takeError())
return E;
auto CoverageSection =
- lookupSection(*OF, getInstrProfCoverageSectionName(false));
+ lookupSection(*OF, getInstrProfSectionName(IPSK_covmap, ObjFormat,
+ /*AddSegmentInfo=*/false));
if (auto E = CoverageSection.takeError())
return E;
diff --git a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index 8235633..6fe9353 100644
--- a/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/contrib/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMappingWriter.cpp - Code coverage mapping writer -------------=//
+//===- CoverageMappingWriter.cpp - Code coverage mapping writer -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,7 +13,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/Coverage/CoverageMappingWriter.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <vector>
using namespace llvm;
using namespace coverage;
@@ -27,14 +34,25 @@ void CoverageFilenamesSectionWriter::write(raw_ostream &OS) {
}
namespace {
+
/// \brief Gather only the expressions that are used by the mapping
/// regions in this function.
class CounterExpressionsMinimizer {
ArrayRef<CounterExpression> Expressions;
- llvm::SmallVector<CounterExpression, 16> UsedExpressions;
+ SmallVector<CounterExpression, 16> UsedExpressions;
std::vector<unsigned> AdjustedExpressionIDs;
public:
+ CounterExpressionsMinimizer(ArrayRef<CounterExpression> Expressions,
+ ArrayRef<CounterMappingRegion> MappingRegions)
+ : Expressions(Expressions) {
+ AdjustedExpressionIDs.resize(Expressions.size(), 0);
+ for (const auto &I : MappingRegions)
+ mark(I.Count);
+ for (const auto &I : MappingRegions)
+ gatherUsed(I.Count);
+ }
+
void mark(Counter C) {
if (!C.isExpression())
return;
@@ -54,16 +72,6 @@ public:
gatherUsed(E.RHS);
}
- CounterExpressionsMinimizer(ArrayRef<CounterExpression> Expressions,
- ArrayRef<CounterMappingRegion> MappingRegions)
- : Expressions(Expressions) {
- AdjustedExpressionIDs.resize(Expressions.size(), 0);
- for (const auto &I : MappingRegions)
- mark(I.Count);
- for (const auto &I : MappingRegions)
- gatherUsed(I.Count);
- }
-
ArrayRef<CounterExpression> getExpressions() const { return UsedExpressions; }
/// \brief Adjust the given counter to correctly transition from the old
@@ -74,7 +82,8 @@ public:
return C;
}
};
-}
+
+} // end anonymous namespace
/// \brief Encode the counter.
///
diff --git a/contrib/llvm/lib/ProfileData/InstrProf.cpp b/contrib/llvm/lib/ProfileData/InstrProf.cpp
index 74acd9e..48c1643 100644
--- a/contrib/llvm/lib/ProfileData/InstrProf.cpp
+++ b/contrib/llvm/lib/ProfileData/InstrProf.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProf.cpp - Instrumented profiling format support -----------------=//
+//===- InstrProf.cpp - Instrumented profiling format support --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,28 +13,67 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Compression.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
using namespace llvm;
static cl::opt<bool> StaticFuncFullModulePrefix(
- "static-func-full-module-prefix", cl::init(false),
+ "static-func-full-module-prefix", cl::init(true),
cl::desc("Use full module build paths in the profile counter names for "
"static functions."));
-namespace {
-std::string getInstrProfErrString(instrprof_error Err) {
+// This option is tailored to users whose top-level directories differ
+// between the profile-gen and profile-use compilations. Users need to
+// specify the number of levels to strip. A value larger than the number of
+// directories in the source file path strips all directory names and leaves
+// only the basename.
+//
+// Note that current ThinLTO module importing for indirect calls assumes the
+// source directory name is not stripped. A non-zero option value here can
+// potentially prevent some inter-module indirect-call promotions.
+static cl::opt<unsigned> StaticFuncStripDirNamePrefix(
+ "static-func-strip-dirname-prefix", cl::init(0),
+ cl::desc("Strip specified level of directory name from source path in "
+ "the profile counter name for static functions."));
+
+static std::string getInstrProfErrString(instrprof_error Err) {
switch (Err) {
case instrprof_error::success:
return "Success";
@@ -76,15 +115,19 @@ std::string getInstrProfErrString(instrprof_error Err) {
llvm_unreachable("A value of instrprof_error has no message.");
}
+namespace {
+
// FIXME: This class is only here to support the transition to llvm::Error. It
// will be removed once this transition is complete. Clients should prefer to
// deal with the Error value directly, rather than converting to error_code.
class InstrProfErrorCategoryType : public std::error_category {
const char *name() const noexcept override { return "llvm.instrprof"; }
+
std::string message(int IE) const override {
return getInstrProfErrString(static_cast<instrprof_error>(IE));
}
};
+
} // end anonymous namespace
static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory;
@@ -93,8 +136,49 @@ const std::error_category &llvm::instrprof_category() {
return *ErrorCategory;
}
+namespace {
+
+const char *InstrProfSectNameCommon[] = {
+#define INSTR_PROF_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) \
+ SectNameCommon,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+const char *InstrProfSectNameCoff[] = {
+#define INSTR_PROF_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) \
+ SectNameCoff,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+const char *InstrProfSectNamePrefix[] = {
+#define INSTR_PROF_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) \
+ Prefix,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+} // namespace
+
namespace llvm {
+std::string getInstrProfSectionName(InstrProfSectKind IPSK,
+ Triple::ObjectFormatType OF,
+ bool AddSegmentInfo) {
+ std::string SectName;
+
+ if (OF == Triple::MachO && AddSegmentInfo)
+ SectName = InstrProfSectNamePrefix[IPSK];
+
+ if (OF == Triple::COFF)
+ SectName += InstrProfSectNameCoff[IPSK];
+ else
+ SectName += InstrProfSectNameCommon[IPSK];
+
+ if (OF == Triple::MachO && IPSK == IPSK_data && AddSegmentInfo)
+ SectName += ",regular,live_support";
+
+ return SectName;
+}
+
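
The tables above feed the new unified helper; a quick illustration of the
strings it produces (values follow the .inc entries; treat the exact names as
the upstream defaults rather than guarantees):

    std::string Cnts = getInstrProfSectionName(IPSK_cnts, Triple::ELF,
                                               /*AddSegmentInfo=*/false);
    // Cnts == "__llvm_prf_cnts"
    std::string Data = getInstrProfSectionName(IPSK_data, Triple::MachO,
                                               /*AddSegmentInfo=*/true);
    // Data == "__DATA,__llvm_prf_data,regular,live_support"
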
void SoftInstrProfErrors::addError(instrprof_error IE) {
if (IE == instrprof_error::success)
return;
@@ -133,6 +217,24 @@ std::string getPGOFuncName(StringRef RawFuncName,
return GlobalValue::getGlobalIdentifier(RawFuncName, Linkage, FileName);
}
+// Strip NumPrefix levels of directory names from PathNameStr. If the number
+// of directory separators is less than NumPrefix, strip all the directories
+// and leave only the base file name.
+static StringRef stripDirPrefix(StringRef PathNameStr, uint32_t NumPrefix) {
+ uint32_t Count = NumPrefix;
+ uint32_t Pos = 0, LastPos = 0;
+  for (auto &CI : PathNameStr) {
+ ++Pos;
+ if (llvm::sys::path::is_separator(CI)) {
+ LastPos = Pos;
+ --Count;
+ }
+ if (Count == 0)
+ break;
+ }
+ return PathNameStr.substr(LastPos);
+}
+
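
A quick behavior sketch for the helper above (it is file-local, so this
assumes code in the same translation unit):

    assert(stripDirPrefix("home/user/project/main.c", 1) ==
           "user/project/main.c");
    assert(stripDirPrefix("home/user/project/main.c", 2) == "project/main.c");
    // More levels than separators: everything but the basename is stripped.
    assert(stripDirPrefix("home/user/project/main.c", 9) == "main.c");
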
// Return the PGOFuncName. This function has some special handling when called
// in LTO optimization. The following only applies when calling in LTO passes
// (when \c InLTO is true): LTO's internalization privatizes many global linkage
@@ -151,6 +253,8 @@ std::string getPGOFuncName(const Function &F, bool InLTO, uint64_t Version) {
StringRef FileName = (StaticFuncFullModulePrefix
? F.getParent()->getName()
: sys::path::filename(F.getParent()->getName()));
+ if (StaticFuncFullModulePrefix && StaticFuncStripDirNamePrefix != 0)
+ FileName = stripDirPrefix(FileName, StaticFuncStripDirNamePrefix);
return getPGOFuncName(F.getName(), F.getLinkage(), FileName, Version);
}
@@ -198,7 +302,6 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
GlobalVariable *createPGOFuncNameVar(Module &M,
GlobalValue::LinkageTypes Linkage,
StringRef PGOFuncName) {
-
// We generally want to match the function's linkage, but available_externally
// and extern_weak both have the wrong semantics, and anything that doesn't
// need to link across compilation units doesn't need to be visible at all.
@@ -227,23 +330,37 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef PGOFuncName) {
return createPGOFuncNameVar(*F.getParent(), F.getLinkage(), PGOFuncName);
}
-void InstrProfSymtab::create(Module &M, bool InLTO) {
+Error InstrProfSymtab::create(Module &M, bool InLTO) {
for (Function &F : M) {
    // A function may not have a name, e.g. when asm("") is used to overwrite it.
// Ignore in this case.
if (!F.hasName())
continue;
const std::string &PGOFuncName = getPGOFuncName(F, InLTO);
- addFuncName(PGOFuncName);
+ if (Error E = addFuncName(PGOFuncName))
+ return E;
MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F);
+    // In ThinLTO, a local function may have been promoted to a global and
+    // had a suffix added to its name. We need to add the stripped function
+    // name to the symbol table so that we can find a match from the profile.
+ if (InLTO) {
+ auto pos = PGOFuncName.find('.');
+ if (pos != std::string::npos) {
+ const std::string &OtherFuncName = PGOFuncName.substr(0, pos);
+ if (Error E = addFuncName(OtherFuncName))
+ return E;
+ MD5FuncMap.emplace_back(Function::getGUID(OtherFuncName), &F);
+ }
+ }
}
finalizeSymtab();
+ return Error::success();
}
-Error collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
+Error collectPGOFuncNameStrings(ArrayRef<std::string> NameStrs,
bool doCompression, std::string &Result) {
- assert(NameStrs.size() && "No name data to emit");
+ assert(!NameStrs.empty() && "No name data to emit");
uint8_t Header[16], *P = Header;
std::string UncompressedNameStrings =
@@ -271,12 +388,12 @@ Error collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
}
SmallString<128> CompressedNameStrings;
- zlib::Status Success =
- zlib::compress(StringRef(UncompressedNameStrings), CompressedNameStrings,
- zlib::BestSizeCompression);
-
- if (Success != zlib::StatusOK)
+ Error E = zlib::compress(StringRef(UncompressedNameStrings),
+ CompressedNameStrings, zlib::BestSizeCompression);
+ if (E) {
+ consumeError(std::move(E));
return make_error<InstrProfError>(instrprof_error::compress_failed);
+ }
return WriteStringToResult(CompressedNameStrings.size(),
CompressedNameStrings);
@@ -289,7 +406,7 @@ StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar) {
return NameStr;
}
-Error collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
+Error collectPGOFuncNameStrings(ArrayRef<GlobalVariable *> NameVars,
std::string &Result, bool doCompression) {
std::vector<std::string> NameStrs;
for (auto *NameVar : NameVars) {
@@ -315,9 +432,12 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
if (isCompressed) {
StringRef CompressedNameStrings(reinterpret_cast<const char *>(P),
CompressedSize);
- if (zlib::uncompress(CompressedNameStrings, UncompressedNameStrings,
- UncompressedSize) != zlib::StatusOK)
+ if (Error E =
+ zlib::uncompress(CompressedNameStrings, UncompressedNameStrings,
+ UncompressedSize)) {
+ consumeError(std::move(E));
return make_error<InstrProfError>(instrprof_error::uncompress_failed);
+ }
P += CompressedSize;
NameStrings = StringRef(UncompressedNameStrings.data(),
UncompressedNameStrings.size());
@@ -330,7 +450,8 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
SmallVector<StringRef, 0> Names;
NameStrings.split(Names, getInstrProfNameSeparator());
for (StringRef &Name : Names)
- Symtab.addFuncName(Name);
+ if (Error E = Symtab.addFuncName(Name))
+ return E;
while (P < EndP && *P == 0)
P++;
@@ -339,9 +460,9 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) {
return Error::success();
}
-void InstrProfValueSiteRecord::merge(SoftInstrProfErrors &SIPE,
- InstrProfValueSiteRecord &Input,
- uint64_t Weight) {
+void InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input,
+ uint64_t Weight,
+ function_ref<void(instrprof_error)> Warn) {
this->sortByTargetValues();
Input.sortByTargetValues();
auto I = ValueData.begin();
@@ -354,7 +475,7 @@ void InstrProfValueSiteRecord::merge(SoftInstrProfErrors &SIPE,
bool Overflowed;
I->Count = SaturatingMultiplyAdd(J->Count, Weight, I->Count, &Overflowed);
if (Overflowed)
- SIPE.addError(instrprof_error::counter_overflow);
+ Warn(instrprof_error::counter_overflow);
++I;
continue;
}
@@ -362,40 +483,43 @@ void InstrProfValueSiteRecord::merge(SoftInstrProfErrors &SIPE,
}
}
-void InstrProfValueSiteRecord::scale(SoftInstrProfErrors &SIPE,
- uint64_t Weight) {
+void InstrProfValueSiteRecord::scale(uint64_t Weight,
+ function_ref<void(instrprof_error)> Warn) {
for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) {
bool Overflowed;
I->Count = SaturatingMultiply(I->Count, Weight, &Overflowed);
if (Overflowed)
- SIPE.addError(instrprof_error::counter_overflow);
+ Warn(instrprof_error::counter_overflow);
}
}
// Merge Value Profile data from Src record to this record for ValueKind.
// Scale merged value counts by \p Weight.
-void InstrProfRecord::mergeValueProfData(uint32_t ValueKind,
- InstrProfRecord &Src,
- uint64_t Weight) {
+void InstrProfRecord::mergeValueProfData(
+ uint32_t ValueKind, InstrProfRecord &Src, uint64_t Weight,
+ function_ref<void(instrprof_error)> Warn) {
uint32_t ThisNumValueSites = getNumValueSites(ValueKind);
uint32_t OtherNumValueSites = Src.getNumValueSites(ValueKind);
if (ThisNumValueSites != OtherNumValueSites) {
- SIPE.addError(instrprof_error::value_site_count_mismatch);
+ Warn(instrprof_error::value_site_count_mismatch);
return;
}
+ if (!ThisNumValueSites)
+ return;
std::vector<InstrProfValueSiteRecord> &ThisSiteRecords =
- getValueSitesForKind(ValueKind);
- std::vector<InstrProfValueSiteRecord> &OtherSiteRecords =
+ getOrCreateValueSitesForKind(ValueKind);
+ MutableArrayRef<InstrProfValueSiteRecord> OtherSiteRecords =
Src.getValueSitesForKind(ValueKind);
for (uint32_t I = 0; I < ThisNumValueSites; I++)
- ThisSiteRecords[I].merge(SIPE, OtherSiteRecords[I], Weight);
+ ThisSiteRecords[I].merge(OtherSiteRecords[I], Weight, Warn);
}
-void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight) {
+void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight,
+ function_ref<void(instrprof_error)> Warn) {
// If the number of counters doesn't match we either have bad data
// or a hash collision.
if (Counts.size() != Other.Counts.size()) {
- SIPE.addError(instrprof_error::count_mismatch);
+ Warn(instrprof_error::count_mismatch);
return;
}
@@ -404,30 +528,30 @@ void InstrProfRecord::merge(InstrProfRecord &Other, uint64_t Weight) {
Counts[I] =
SaturatingMultiplyAdd(Other.Counts[I], Weight, Counts[I], &Overflowed);
if (Overflowed)
- SIPE.addError(instrprof_error::counter_overflow);
+ Warn(instrprof_error::counter_overflow);
}
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- mergeValueProfData(Kind, Other, Weight);
+ mergeValueProfData(Kind, Other, Weight, Warn);
}
-void InstrProfRecord::scaleValueProfData(uint32_t ValueKind, uint64_t Weight) {
- uint32_t ThisNumValueSites = getNumValueSites(ValueKind);
- std::vector<InstrProfValueSiteRecord> &ThisSiteRecords =
- getValueSitesForKind(ValueKind);
- for (uint32_t I = 0; I < ThisNumValueSites; I++)
- ThisSiteRecords[I].scale(SIPE, Weight);
+void InstrProfRecord::scaleValueProfData(
+ uint32_t ValueKind, uint64_t Weight,
+ function_ref<void(instrprof_error)> Warn) {
+ for (auto &R : getValueSitesForKind(ValueKind))
+ R.scale(Weight, Warn);
}
-void InstrProfRecord::scale(uint64_t Weight) {
+void InstrProfRecord::scale(uint64_t Weight,
+ function_ref<void(instrprof_error)> Warn) {
for (auto &Count : this->Counts) {
bool Overflowed;
Count = SaturatingMultiply(Count, Weight, &Overflowed);
if (Overflowed)
- SIPE.addError(instrprof_error::counter_overflow);
+ Warn(instrprof_error::counter_overflow);
}
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- scaleValueProfData(Kind, Weight);
+ scaleValueProfData(Kind, Weight, Warn);
}
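
The SIPE member is gone; callers now decide what a soft error means by passing
a callback. A sketch of merging two records under the new API (records assumed
populated elsewhere):

    InstrProfRecord A, B;
    A.merge(B, /*Weight=*/1, [](instrprof_error E) {
      // Soft failure such as count_mismatch or counter_overflow:
      // log or count it instead of aborting the merge.
    });
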
// Map indirect call target name hash to name string.
@@ -462,7 +586,7 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
VData[I].Value = remapValue(VData[I].Value, ValueKind, ValueMap);
}
std::vector<InstrProfValueSiteRecord> &ValueSites =
- getValueSitesForKind(ValueKind);
+ getOrCreateValueSitesForKind(ValueKind);
if (N == 0)
ValueSites.emplace_back();
else
@@ -521,8 +645,9 @@ static ValueProfRecordClosure InstrProfRecordClosure = {
// Wrapper implementation using the closure mechanism.
uint32_t ValueProfData::getSize(const InstrProfRecord &Record) {
- InstrProfRecordClosure.Record = &Record;
- return getValueProfDataSize(&InstrProfRecordClosure);
+ auto Closure = InstrProfRecordClosure;
+ Closure.Record = &Record;
+ return getValueProfDataSize(&Closure);
}
// Wrapper implementation using the closure mechanism.
@@ -553,6 +678,7 @@ void ValueProfRecord::deserializeTo(InstrProfRecord &Record,
void ValueProfRecord::swapBytes(support::endianness Old,
support::endianness New) {
using namespace support;
+
if (Old == New)
return;
@@ -589,6 +715,7 @@ void ValueProfData::deserializeTo(InstrProfRecord &Record,
template <class T>
static T swapToHostOrder(const unsigned char *&D, support::endianness Orig) {
using namespace support;
+
if (Orig == little)
return endian::readNext<T, little, unaligned>(D);
else
@@ -623,6 +750,7 @@ ValueProfData::getValueProfData(const unsigned char *D,
const unsigned char *const BufferEnd,
support::endianness Endianness) {
using namespace support;
+
if (D + sizeof(ValueProfData) > BufferEnd)
return make_error<InstrProfError>(instrprof_error::truncated);
@@ -645,6 +773,7 @@ ValueProfData::getValueProfData(const unsigned char *D,
void ValueProfData::swapBytesToHost(support::endianness Endianness) {
using namespace support;
+
if (Endianness == getHostEndianness())
return;
@@ -660,6 +789,7 @@ void ValueProfData::swapBytesToHost(support::endianness Endianness) {
void ValueProfData::swapBytesFromHost(support::endianness Endianness) {
using namespace support;
+
if (Endianness == getHostEndianness())
return;
@@ -791,7 +921,7 @@ bool needsComdatForCounter(const Function &F, const Module &M) {
return true;
Triple TT(M.getTargetTriple());
- if (!TT.isOSBinFormatELF())
+ if (!TT.isOSBinFormatELF() && !TT.isOSBinFormatWasm())
return false;
// See createPGOFuncNameVar for more details. To avoid link errors, profile
@@ -854,4 +984,26 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) {
}
return true;
}
+
+// Parse the memop size range option ("start:last", or just "last").
+void getMemOPSizeRangeFromOption(StringRef MemOPSizeRange, int64_t &RangeStart,
+ int64_t &RangeLast) {
+ static const int64_t DefaultMemOPSizeRangeStart = 0;
+ static const int64_t DefaultMemOPSizeRangeLast = 8;
+ RangeStart = DefaultMemOPSizeRangeStart;
+ RangeLast = DefaultMemOPSizeRangeLast;
+
+ if (!MemOPSizeRange.empty()) {
+ auto Pos = MemOPSizeRange.find(':');
+ if (Pos != std::string::npos) {
+ if (Pos > 0)
+ MemOPSizeRange.substr(0, Pos).getAsInteger(10, RangeStart);
+ if (Pos < MemOPSizeRange.size() - 1)
+ MemOPSizeRange.substr(Pos + 1).getAsInteger(10, RangeLast);
+ } else
+ MemOPSizeRange.getAsInteger(10, RangeLast);
+ }
+ assert(RangeLast >= RangeStart);
+}
+
} // end namespace llvm
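
Behavior of the option parser above, for the accepted "start:last" and bare
"last" forms:

    int64_t Start, Last;
    getMemOPSizeRangeFromOption("4:64", Start, Last); // Start == 4, Last == 64
    getMemOPSizeRangeFromOption("16", Start, Last);   // Start == 0, Last == 16
    getMemOPSizeRangeFromOption("", Start, Last);     // defaults: 0 and 8
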
diff --git a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp
index ad407f0..1b39a06 100644
--- a/contrib/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/contrib/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProfReader.cpp - Instrumented profiling reader -------------------=//
+//===- InstrProfReader.cpp - Instrumented profiling reader ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,8 +13,26 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include <cassert>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <system_error>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -78,7 +96,6 @@ IndexedInstrProfReader::create(const Twine &Path) {
return IndexedInstrProfReader::create(std::move(BufferOrError.get()));
}
-
Expected<std::unique_ptr<IndexedInstrProfReader>>
IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
// Sanity check the buffer.
@@ -182,8 +199,9 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
CHECK_LINE_END(Line);
std::pair<StringRef, StringRef> VD = Line->rsplit(':');
uint64_t TakenCount, Value;
- if (VK == IPVK_IndirectCallTarget) {
- Symtab->addFuncName(VD.first);
+ if (ValueKind == IPVK_IndirectCallTarget) {
+ if (Error E = Symtab->addFuncName(VD.first))
+ return E;
Value = IndexedInstrProf::ComputeHash(VD.first);
} else {
READ_NUM(VD.first, Value);
@@ -192,7 +210,8 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
CurrentValues.push_back({Value, TakenCount});
Line++;
}
- Record.addValueData(VK, S, CurrentValues.data(), NumValueData, nullptr);
+ Record.addValueData(ValueKind, S, CurrentValues.data(), NumValueData,
+ nullptr);
}
}
return success();
@@ -202,7 +221,7 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
#undef VP_READ_ADVANCE
}
-Error TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
+Error TextInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) {
// Skip empty lines and comments.
while (!Line.is_at_end() && (Line->empty() || Line->startswith("#")))
++Line;
@@ -214,7 +233,8 @@ Error TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
// Read the function name.
Record.Name = *Line++;
- Symtab->addFuncName(Record.Name);
+ if (Error E = Symtab->addFuncName(Record.Name))
+ return E;
// Read the function hash.
if (Line.is_at_end())
@@ -232,7 +252,7 @@ Error TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
return error(instrprof_error::malformed);
// Read each counter and fill our internal storage with the values.
- Record.Counts.clear();
+ Record.Clear();
Record.Counts.reserve(NumCounters);
for (uint64_t I = 0; I < NumCounters; ++I) {
if (Line.is_at_end())
@@ -357,13 +377,13 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
}
template <class IntPtrT>
-Error RawInstrProfReader<IntPtrT>::readName(InstrProfRecord &Record) {
+Error RawInstrProfReader<IntPtrT>::readName(NamedInstrProfRecord &Record) {
Record.Name = getName(Data->NameRef);
return success();
}
template <class IntPtrT>
-Error RawInstrProfReader<IntPtrT>::readFuncHash(InstrProfRecord &Record) {
+Error RawInstrProfReader<IntPtrT>::readFuncHash(NamedInstrProfRecord &Record) {
Record.Hash = swap(Data->FuncHash);
return success();
}
@@ -398,7 +418,6 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts(
template <class IntPtrT>
Error RawInstrProfReader<IntPtrT>::readValueProfilingData(
InstrProfRecord &Record) {
-
Record.clearValueData();
CurValueDataSize = 0;
// Need to match the logic in value profile dumper code in compiler-rt:
@@ -426,7 +445,7 @@ Error RawInstrProfReader<IntPtrT>::readValueProfilingData(
}
template <class IntPtrT>
-Error RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) {
+Error RawInstrProfReader<IntPtrT>::readNextRecord(NamedInstrProfRecord &Record) {
if (atEnd())
// At this point, ValueDataStart field points to the next header.
if (Error E = readNextHeader(getNextHeaderPos()))
@@ -454,17 +473,19 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) {
}
namespace llvm {
+
template class RawInstrProfReader<uint32_t>;
template class RawInstrProfReader<uint64_t>;
-}
+
+} // end namespace llvm
InstrProfLookupTrait::hash_value_type
InstrProfLookupTrait::ComputeHash(StringRef K) {
return IndexedInstrProf::ComputeHash(HashType, K);
}
-typedef InstrProfLookupTrait::data_type data_type;
-typedef InstrProfLookupTrait::offset_type offset_type;
+using data_type = InstrProfLookupTrait::data_type;
+using offset_type = InstrProfLookupTrait::offset_type;
bool InstrProfLookupTrait::readValueProfilingData(
const unsigned char *&D, const unsigned char *const End) {
@@ -482,6 +503,8 @@ bool InstrProfLookupTrait::readValueProfilingData(
data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
offset_type N) {
+ using namespace support;
+
// Check if the data is corrupt. If so, don't try to read it.
if (N % sizeof(uint64_t))
return data_type();
@@ -489,7 +512,6 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
DataBuffer.clear();
std::vector<uint64_t> CounterBuffer;
- using namespace support;
const unsigned char *End = D + N;
while (D < End) {
// Read hash.
@@ -528,7 +550,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
template <typename HashTableImpl>
Error InstrProfReaderIndex<HashTableImpl>::getRecords(
- StringRef FuncName, ArrayRef<InstrProfRecord> &Data) {
+ StringRef FuncName, ArrayRef<NamedInstrProfRecord> &Data) {
auto Iter = HashTable->find(FuncName);
if (Iter == HashTable->end())
return make_error<InstrProfError>(instrprof_error::unknown_function);
@@ -542,7 +564,7 @@ Error InstrProfReaderIndex<HashTableImpl>::getRecords(
template <typename HashTableImpl>
Error InstrProfReaderIndex<HashTableImpl>::getRecords(
- ArrayRef<InstrProfRecord> &Data) {
+ ArrayRef<NamedInstrProfRecord> &Data) {
if (atEnd())
return make_error<InstrProfError>(instrprof_error::eof);
@@ -567,9 +589,10 @@ InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex(
}
bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
+ using namespace support;
+
if (DataBuffer.getBufferSize() < 8)
return false;
- using namespace support;
uint64_t Magic =
endian::read<uint64_t, little, aligned>(DataBuffer.getBufferStart());
// Verify that it's magical.
@@ -581,6 +604,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
const unsigned char *Cur) {
using namespace IndexedInstrProf;
using namespace support;
+
if (Version >= IndexedInstrProf::Version4) {
const IndexedInstrProf::Summary *SummaryInLE =
reinterpret_cast<const IndexedInstrProf::Summary *>(Cur);
@@ -598,7 +622,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
for (unsigned I = 0; I < SummarySize / sizeof(uint64_t); I++)
Dst[I] = endian::byte_swap<uint64_t, little>(Src[I]);
- llvm::SummaryEntryVector DetailedSummary;
+ SummaryEntryVector DetailedSummary;
for (unsigned I = 0; I < SummaryData->NumCutoffEntries; I++) {
const IndexedInstrProf::Summary::Entry &Ent = SummaryData->getEntry(I);
DetailedSummary.emplace_back((uint32_t)Ent.Cutoff, Ent.MinBlockCount,
@@ -617,23 +641,24 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
} else {
// For older version of profile data, we need to compute on the fly:
using namespace IndexedInstrProf;
+
InstrProfSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
// FIXME: This only computes an empty summary. Need to call addRecord for
- // all InstrProfRecords to get the correct summary.
+ // all NamedInstrProfRecords to get the correct summary.
this->Summary = Builder.getSummary();
return Cur;
}
}
Error IndexedInstrProfReader::readHeader() {
+ using namespace support;
+
const unsigned char *Start =
(const unsigned char *)DataBuffer->getBufferStart();
const unsigned char *Cur = Start;
if ((const unsigned char *)DataBuffer->getBufferEnd() - Cur < 24)
return error(instrprof_error::truncated);
- using namespace support;
-
auto *Header = reinterpret_cast<const IndexedInstrProf::Header *>(Cur);
Cur += sizeof(IndexedInstrProf::Header);
@@ -671,7 +696,9 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() {
return *Symtab.get();
std::unique_ptr<InstrProfSymtab> NewSymtab = make_unique<InstrProfSymtab>();
- Index->populateSymtab(*NewSymtab.get());
+ if (Error E = Index->populateSymtab(*NewSymtab.get())) {
+ consumeError(error(InstrProfError::take(std::move(E))));
+ }
Symtab = std::move(NewSymtab);
return *Symtab.get();
@@ -680,7 +707,7 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() {
Expected<InstrProfRecord>
IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName,
uint64_t FuncHash) {
- ArrayRef<InstrProfRecord> Data;
+ ArrayRef<NamedInstrProfRecord> Data;
Error Err = Index->getRecords(FuncName, Data);
if (Err)
return std::move(Err);
@@ -705,10 +732,10 @@ Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName,
return success();
}
-Error IndexedInstrProfReader::readNextRecord(InstrProfRecord &Record) {
+Error IndexedInstrProfReader::readNextRecord(NamedInstrProfRecord &Record) {
static unsigned RecordIndex = 0;
- ArrayRef<InstrProfRecord> Data;
+ ArrayRef<NamedInstrProfRecord> Data;
Error E = Index->getRecords(Data);
if (E)
diff --git a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp
index 029d756..ce3f880 100644
--- a/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/contrib/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProfWriter.cpp - Instrumented profiling writer -------------------=//
+//===- InstrProfWriter.cpp - Instrumented profiling writer ----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,14 +13,20 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/InstrProfWriter.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/OnDiskHashTable.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cstdint>
+#include <memory>
#include <string>
#include <tuple>
#include <utility>
@@ -41,10 +47,9 @@ namespace llvm {
// A wrapper class to abstract writer stream with support of bytes
// back patching.
class ProfOStream {
-
public:
- ProfOStream(llvm::raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
- ProfOStream(llvm::raw_string_ostream &STR)
+ ProfOStream(raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
+ ProfOStream(raw_string_ostream &STR)
: IsFDOStream(false), OS(STR), LE(STR) {}
uint64_t tell() { return OS.tell(); }
@@ -55,16 +60,16 @@ public:
// directly and it won't be reflected in the stream's internal buffer.
void patch(PatchItem *P, int NItems) {
using namespace support;
+
if (IsFDOStream) {
- llvm::raw_fd_ostream &FDOStream = static_cast<llvm::raw_fd_ostream &>(OS);
+ raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
for (int K = 0; K < NItems; K++) {
FDOStream.seek(P[K].Pos);
for (int I = 0; I < P[K].N; I++)
write(P[K].D[I]);
}
} else {
- llvm::raw_string_ostream &SOStream =
- static_cast<llvm::raw_string_ostream &>(OS);
+ raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
std::string &Data = SOStream.str(); // with flush
for (int K = 0; K < NItems; K++) {
for (int I = 0; I < P[K].N; I++) {
@@ -85,26 +90,28 @@ public:
class InstrProfRecordWriterTrait {
public:
- typedef StringRef key_type;
- typedef StringRef key_type_ref;
+ using key_type = StringRef;
+ using key_type_ref = StringRef;
- typedef const InstrProfWriter::ProfilingData *const data_type;
- typedef const InstrProfWriter::ProfilingData *const data_type_ref;
+ using data_type = const InstrProfWriter::ProfilingData *const;
+ using data_type_ref = const InstrProfWriter::ProfilingData *const;
- typedef uint64_t hash_value_type;
- typedef uint64_t offset_type;
+ using hash_value_type = uint64_t;
+ using offset_type = uint64_t;
- support::endianness ValueProfDataEndianness;
+ support::endianness ValueProfDataEndianness = support::little;
InstrProfSummaryBuilder *SummaryBuilder;
- InstrProfRecordWriterTrait() : ValueProfDataEndianness(support::little) {}
+ InstrProfRecordWriterTrait() = default;
+
static hash_value_type ComputeHash(key_type_ref K) {
return IndexedInstrProf::ComputeHash(K);
}
static std::pair<offset_type, offset_type>
EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
- using namespace llvm::support;
+ using namespace support;
+
endian::Writer<little> LE(Out);
offset_type N = K.size();
@@ -130,7 +137,8 @@ public:
}
void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, offset_type) {
- using namespace llvm::support;
+ using namespace support;
+
endian::Writer<little> LE(Out);
for (const auto &ProfileData : *V) {
const InstrProfRecord &ProfRecord = ProfileData.second;
@@ -154,8 +162,7 @@ public:
} // end namespace llvm
InstrProfWriter::InstrProfWriter(bool Sparse)
- : Sparse(Sparse), FunctionData(), ProfileKind(PF_Unknown),
- InfoObj(new InstrProfRecordWriterTrait()) {}
+ : Sparse(Sparse), InfoObj(new InstrProfRecordWriterTrait()) {}
InstrProfWriter::~InstrProfWriter() { delete InfoObj; }
@@ -169,38 +176,46 @@ void InstrProfWriter::setOutputSparse(bool Sparse) {
this->Sparse = Sparse;
}
-Error InstrProfWriter::addRecord(InstrProfRecord &&I, uint64_t Weight) {
- auto &ProfileDataMap = FunctionData[I.Name];
+void InstrProfWriter::addRecord(NamedInstrProfRecord &&I, uint64_t Weight,
+ function_ref<void(Error)> Warn) {
+ auto Name = I.Name;
+ auto Hash = I.Hash;
+ addRecord(Name, Hash, std::move(I), Weight, Warn);
+}
+
+void InstrProfWriter::addRecord(StringRef Name, uint64_t Hash,
+ InstrProfRecord &&I, uint64_t Weight,
+ function_ref<void(Error)> Warn) {
+ auto &ProfileDataMap = FunctionData[Name];
bool NewFunc;
ProfilingData::iterator Where;
std::tie(Where, NewFunc) =
- ProfileDataMap.insert(std::make_pair(I.Hash, InstrProfRecord()));
+ ProfileDataMap.insert(std::make_pair(Hash, InstrProfRecord()));
InstrProfRecord &Dest = Where->second;
+ auto MapWarn = [&](instrprof_error E) {
+ Warn(make_error<InstrProfError>(E));
+ };
+
if (NewFunc) {
// We've never seen a function with this name and hash, add it.
Dest = std::move(I);
- // Fix up the name to avoid dangling reference.
- Dest.Name = FunctionData.find(Dest.Name)->getKey();
if (Weight > 1)
- Dest.scale(Weight);
+ Dest.scale(Weight, MapWarn);
} else {
// We're updating a function we've seen before.
- Dest.merge(I, Weight);
+ Dest.merge(I, Weight, MapWarn);
}
Dest.sortValueData();
-
- return Dest.takeError();
}
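The writer now reports merge and scale problems through a caller-supplied callback rather than a returned Error. A hedged usage sketch, assuming NamedInstrProfRecord's (Name, Hash, Counts) constructor:

#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/InstrProfWriter.h"

using namespace llvm;

void addOneRecord(InstrProfWriter &Writer) {
  NamedInstrProfRecord Rec("main", /*Hash=*/0x1234, /*Counts=*/{1, 2, 3});
  // The callback owns each Error it receives; here we simply discard them.
  Writer.addRecord(std::move(Rec), /*Weight=*/1,
                   [](Error E) { consumeError(std::move(E)); });
}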
-Error InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW) {
+void InstrProfWriter::mergeRecordsFromWriter(InstrProfWriter &&IPW,
+ function_ref<void(Error)> Warn) {
for (auto &I : IPW.FunctionData)
for (auto &Func : I.getValue())
- if (Error E = addRecord(std::move(Func.second), 1))
- return E;
- return Error::success();
+ addRecord(I.getKey(), Func.first, std::move(Func.second), 1, Warn);
}
bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
@@ -208,7 +223,7 @@ bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
return true;
for (const auto &Func : PD) {
const InstrProfRecord &IPR = Func.second;
- if (any_of(IPR.Counts, [](uint64_t Count) { return Count > 0; }))
+ if (llvm::any_of(IPR.Counts, [](uint64_t Count) { return Count > 0; }))
return true;
}
return false;
@@ -217,6 +232,7 @@ bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
static void setSummary(IndexedInstrProf::Summary *TheSummary,
ProfileSummary &PS) {
using namespace IndexedInstrProf;
+
std::vector<ProfileSummaryEntry> &Res = PS.getDetailedSummary();
TheSummary->NumSummaryFields = Summary::NumKinds;
TheSummary->NumCutoffEntries = Res.size();
@@ -231,9 +247,10 @@ static void setSummary(IndexedInstrProf::Summary *TheSummary,
}
void InstrProfWriter::writeImpl(ProfOStream &OS) {
+ using namespace IndexedInstrProf;
+
OnDiskChainedHashTableGenerator<InstrProfRecordWriterTrait> Generator;
- using namespace IndexedInstrProf;
InstrProfSummaryBuilder ISB(ProfileSummaryBuilder::DefaultCutoffs);
InfoObj->SummaryBuilder = &ISB;
@@ -301,7 +318,7 @@ void InstrProfWriter::write(raw_fd_ostream &OS) {
std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() {
std::string Data;
- llvm::raw_string_ostream OS(Data);
+ raw_string_ostream OS(Data);
ProfOStream POS(OS);
// Write the hash table.
writeImpl(POS);
@@ -314,11 +331,12 @@ static const char *ValueProfKindStr[] = {
#include "llvm/ProfileData/InstrProfData.inc"
};
-void InstrProfWriter::writeRecordInText(const InstrProfRecord &Func,
+void InstrProfWriter::writeRecordInText(StringRef Name, uint64_t Hash,
+ const InstrProfRecord &Func,
InstrProfSymtab &Symtab,
raw_fd_ostream &OS) {
- OS << Func.Name << "\n";
- OS << "# Func Hash:\n" << Func.Hash << "\n";
+ OS << Name << "\n";
+ OS << "# Func Hash:\n" << Hash << "\n";
OS << "# Num Counters:\n" << Func.Counts.size() << "\n";
OS << "# Counter Values:\n";
for (uint64_t Count : Func.Counts)
@@ -353,17 +371,19 @@ void InstrProfWriter::writeRecordInText(const InstrProfRecord &Func,
OS << "\n";
}
-void InstrProfWriter::writeText(raw_fd_ostream &OS) {
+Error InstrProfWriter::writeText(raw_fd_ostream &OS) {
if (ProfileKind == PF_IRLevel)
OS << "# IR level Instrumentation Flag\n:ir\n";
InstrProfSymtab Symtab;
for (const auto &I : FunctionData)
if (shouldEncodeData(I.getValue()))
- Symtab.addFuncName(I.getKey());
+ if (Error E = Symtab.addFuncName(I.getKey()))
+ return E;
Symtab.finalizeSymtab();
for (const auto &I : FunctionData)
if (shouldEncodeData(I.getValue()))
for (const auto &Func : I.getValue())
- writeRecordInText(Func.second, Symtab, OS);
+ writeRecordInText(I.getKey(), Func.first, Func.second, Symtab, OS);
+ return Error::success();
}
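writeText now returns Error as well; a caller sketch (the path handling and F_Text flag are assumptions of this sketch):

#include "llvm/ProfileData/InstrProfWriter.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"

static llvm::Error dumpText(llvm::InstrProfWriter &Writer,
                            llvm::StringRef Path) {
  std::error_code EC;
  llvm::raw_fd_ostream OS(Path, EC, llvm::sys::fs::F_Text);
  if (EC)
    return llvm::errorCodeToError(EC); // propagate the open failure
  return Writer.writeText(OS);         // returns Error instead of void now
}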
diff --git a/contrib/llvm/lib/ProfileData/SampleProf.cpp b/contrib/llvm/lib/ProfileData/SampleProf.cpp
index 5bcfff0..eafdd21 100644
--- a/contrib/llvm/lib/ProfileData/SampleProf.cpp
+++ b/contrib/llvm/lib/ProfileData/SampleProf.cpp
@@ -13,18 +13,25 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <system_error>
-using namespace llvm::sampleprof;
using namespace llvm;
+using namespace sampleprof;
namespace {
+
// FIXME: This class is only here to support the transition to llvm::Error. It
// will be removed once this transition is complete. Clients should prefer to
// deal with the Error value directly, rather than converting to error_code.
class SampleProfErrorCategoryType : public std::error_category {
const char *name() const noexcept override { return "llvm.sampleprof"; }
+
std::string message(int IE) const override {
sampleprof_error E = static_cast<sampleprof_error>(IE);
switch (E) {
@@ -54,7 +61,8 @@ class SampleProfErrorCategoryType : public std::error_category {
llvm_unreachable("A value of sampleprof_error has no message.");
}
};
-}
+
+} // end anonymous namespace
static ManagedStatic<SampleProfErrorCategoryType> ErrorCategory;
@@ -74,7 +82,9 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void LineLocation::dump() const { print(dbgs()); }
+#endif
/// \brief Print the sample record to the stream \p OS indented by \p Indent.
void SampleRecord::print(raw_ostream &OS, unsigned Indent) const {
@@ -87,7 +97,9 @@ void SampleRecord::print(raw_ostream &OS, unsigned Indent) const {
OS << "\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SampleRecord::dump() const { print(dbgs(), 0); }
+#endif
raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
const SampleRecord &Sample) {
@@ -101,7 +113,7 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
<< " sampled lines\n";
OS.indent(Indent);
- if (BodySamples.size() > 0) {
+ if (!BodySamples.empty()) {
OS << "Samples collected in the function's body {\n";
SampleSorter<LineLocation, SampleRecord> SortedBodySamples(BodySamples);
for (const auto &SI : SortedBodySamples.get()) {
@@ -115,14 +127,16 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
}
OS.indent(Indent);
- if (CallsiteSamples.size() > 0) {
+ if (!CallsiteSamples.empty()) {
OS << "Samples collected in inlined callsites {\n";
- SampleSorter<LineLocation, FunctionSamples> SortedCallsiteSamples(
+ SampleSorter<LineLocation, FunctionSamplesMap> SortedCallsiteSamples(
CallsiteSamples);
for (const auto &CS : SortedCallsiteSamples.get()) {
- OS.indent(Indent + 2);
- OS << CS->first << ": inlined callee: " << CS->second.getName() << ": ";
- CS->second.print(OS, Indent + 4);
+ for (const auto &FS : CS->second) {
+ OS.indent(Indent + 2);
+ OS << CS->first << ": inlined callee: " << FS.second.getName() << ": ";
+ FS.second.print(OS, Indent + 4);
+ }
}
OS << "}\n";
} else {
@@ -136,4 +150,6 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
return OS;
}
-void FunctionSamples::dump(void) const { print(dbgs(), 0); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); }
+#endif
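The printing changes reflect the new shape of the data: each callsite LineLocation now holds a FunctionSamplesMap keyed by callee name, so traversals gain one level of nesting. A sketch:

#include "llvm/ProfileData/SampleProf.h"

using namespace llvm::sampleprof;

// Walk every inlined callee profile; mirrors the nested loops above.
void visitCallees(const FunctionSamples &FS) {
  for (const auto &CS : FS.getCallsiteSamples())   // LineLocation -> map
    for (const auto &NameFS : CS.second) {         // callee name -> samples
      const FunctionSamples &Callee = NameFS.second;
      (void)Callee.getTotalSamples();              // e.g. inspect hotness
    }
}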
diff --git a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp
index af80b03..234fe02 100644
--- a/contrib/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/contrib/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -23,14 +23,25 @@
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <system_error>
+#include <vector>
-using namespace llvm::sampleprof;
using namespace llvm;
+using namespace sampleprof;
/// \brief Dump the function profile for \p FName.
///
@@ -200,7 +211,7 @@ std::error_code SampleProfileReaderText::read() {
InlineStack.pop_back();
}
FunctionSamples &FSamples = InlineStack.back()->functionSamplesAt(
- LineLocation(LineOffset, Discriminator));
+ LineLocation(LineOffset, Discriminator))[FName];
FSamples.setName(FName);
MergeResult(Result, FSamples.addTotalSamples(NumSamples));
InlineStack.push_back(&FSamples);
@@ -352,8 +363,8 @@ SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) {
if (std::error_code EC = FName.getError())
return EC;
- FunctionSamples &CalleeProfile =
- FProfile.functionSamplesAt(LineLocation(*LineOffset, *Discriminator));
+ FunctionSamples &CalleeProfile = FProfile.functionSamplesAt(
+ LineLocation(*LineOffset, *Discriminator))[*FName];
CalleeProfile.setName(*FName);
if (std::error_code EC = readProfile(CalleeProfile))
return EC;
@@ -625,7 +636,7 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
uint32_t LineOffset = Offset >> 16;
uint32_t Discriminator = Offset & 0xffff;
FProfile = &CallerProfile->functionSamplesAt(
- LineLocation(LineOffset, Discriminator));
+ LineLocation(LineOffset, Discriminator))[Name];
}
FProfile->setName(Name);
@@ -681,11 +692,9 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
if (!GcovBuffer.readInt64(TargetCount))
return sampleprof_error::truncated;
- if (Update) {
- FunctionSamples &TargetProfile = Profiles[TargetName];
- TargetProfile.addCalledTargetSamples(LineOffset, Discriminator,
- TargetName, TargetCount);
- }
+ if (Update)
+ FProfile->addCalledTargetSamples(LineOffset, Discriminator,
+ TargetName, TargetCount);
}
}
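On the reader side, the same change means a callee slot is selected by name after indexing by location. A small helper sketch of the insert-or-get pattern the three readers now share:

#include "llvm/ProfileData/SampleProf.h"

using namespace llvm;
using namespace llvm::sampleprof;

FunctionSamples &calleeAt(FunctionSamples &Parent, uint32_t LineOffset,
                          uint32_t Discriminator, StringRef FName) {
  // operator[] on the per-callsite map default-constructs missing entries.
  FunctionSamples &Callee =
      Parent.functionSamplesAt(LineLocation(LineOffset, Discriminator))[FName];
  Callee.setName(FName);
  return Callee;
}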
diff --git a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp
index 4fa7128..b450261 100644
--- a/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/contrib/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -19,15 +19,49 @@
//===----------------------------------------------------------------------===//
#include "llvm/ProfileData/SampleProfWriter.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/LEB128.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <system_error>
+#include <utility>
+#include <vector>
-using namespace llvm::sampleprof;
using namespace llvm;
+using namespace sampleprof;
+
+std::error_code
+SampleProfileWriter::write(const StringMap<FunctionSamples> &ProfileMap) {
+ if (std::error_code EC = writeHeader(ProfileMap))
+ return EC;
+
+ // Sort the ProfileMap by total samples.
+ typedef std::pair<StringRef, const FunctionSamples *> NameFunctionSamples;
+ std::vector<NameFunctionSamples> V;
+ for (const auto &I : ProfileMap)
+ V.push_back(std::make_pair(I.getKey(), &I.second));
+
+ std::stable_sort(
+ V.begin(), V.end(),
+ [](const NameFunctionSamples &A, const NameFunctionSamples &B) {
+ if (A.second->getTotalSamples() == B.second->getTotalSamples())
+ return A.first > B.first;
+ return A.second->getTotalSamples() > B.second->getTotalSamples();
+ });
+
+ for (const auto &I : V) {
+ if (std::error_code EC = write(*I.second))
+ return EC;
+ }
+ return sampleprof_error::success;
+}
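The comparator orders the hottest functions first and breaks total-sample ties by name, so the output order is deterministic. A standalone illustration of the same comparator:

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

int main() {
  using Entry = std::pair<std::string, unsigned>; // (name, total samples)
  std::vector<Entry> V = {{"a", 10}, {"b", 10}, {"c", 42}};
  std::stable_sort(V.begin(), V.end(), [](const Entry &A, const Entry &B) {
    if (A.second == B.second)
      return A.first > B.first; // tie-break on name, like the code above
    return A.second > B.second; // descending by total samples
  });
  // V is now {"c",42}, {"b",10}, {"a",10}.
  return 0;
}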
/// \brief Write samples to a text file.
///
@@ -61,20 +95,21 @@ std::error_code SampleProfileWriterText::write(const FunctionSamples &S) {
OS << "\n";
}
- SampleSorter<LineLocation, FunctionSamples> SortedCallsiteSamples(
+ SampleSorter<LineLocation, FunctionSamplesMap> SortedCallsiteSamples(
S.getCallsiteSamples());
Indent += 1;
- for (const auto &I : SortedCallsiteSamples.get()) {
- LineLocation Loc = I->first;
- const FunctionSamples &CalleeSamples = I->second;
- OS.indent(Indent);
- if (Loc.Discriminator == 0)
- OS << Loc.LineOffset << ": ";
- else
- OS << Loc.LineOffset << "." << Loc.Discriminator << ": ";
- if (std::error_code EC = write(CalleeSamples))
- return EC;
- }
+ for (const auto &I : SortedCallsiteSamples.get())
+ for (const auto &FS : I->second) {
+ LineLocation Loc = I->first;
+ const FunctionSamples &CalleeSamples = FS.second;
+ OS.indent(Indent);
+ if (Loc.Discriminator == 0)
+ OS << Loc.LineOffset << ": ";
+ else
+ OS << Loc.LineOffset << "." << Loc.Discriminator << ": ";
+ if (std::error_code EC = write(CalleeSamples))
+ return EC;
+ }
Indent -= 1;
return sampleprof_error::success;
@@ -89,8 +124,7 @@ std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) {
}
void SampleProfileWriterBinary::addName(StringRef FName) {
- auto NextIdx = NameTable.size();
- NameTable.insert(std::make_pair(FName, NextIdx));
+ NameTable.insert(std::make_pair(FName, 0));
}
void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
@@ -102,11 +136,12 @@ void SampleProfileWriterBinary::addNames(const FunctionSamples &S) {
}
// Recursively add all the names for inlined callsites.
- for (const auto &J : S.getCallsiteSamples()) {
- const FunctionSamples &CalleeSamples = J.second;
- addName(CalleeSamples.getName());
- addNames(CalleeSamples);
- }
+ for (const auto &J : S.getCallsiteSamples())
+ for (const auto &FS : J.second) {
+ const FunctionSamples &CalleeSamples = FS.second;
+ addName(CalleeSamples.getName());
+ addNames(CalleeSamples);
+ }
}
std::error_code SampleProfileWriterBinary::writeHeader(
@@ -127,10 +162,18 @@ std::error_code SampleProfileWriterBinary::writeHeader(
addNames(I.second);
}
+ // Sort the names so that the NameTable order is deterministic.
+ std::set<StringRef> V;
+ for (const auto &I : NameTable)
+ V.insert(I.first);
+ int i = 0;
+ for (const StringRef &N : V)
+ NameTable[N] = i++;
+
// Write out the name table.
encodeULEB128(NameTable.size(), OS);
- for (auto N : NameTable) {
- OS << N.first;
+ for (auto N : V) {
+ OS << N;
encodeULEB128(0, OS);
}
return sampleprof_error::success;
@@ -180,14 +223,15 @@ std::error_code SampleProfileWriterBinary::writeBody(const FunctionSamples &S) {
// Recursively emit all the callsite samples.
encodeULEB128(S.getCallsiteSamples().size(), OS);
- for (const auto &J : S.getCallsiteSamples()) {
- LineLocation Loc = J.first;
- const FunctionSamples &CalleeSamples = J.second;
- encodeULEB128(Loc.LineOffset, OS);
- encodeULEB128(Loc.Discriminator, OS);
- if (std::error_code EC = writeBody(CalleeSamples))
- return EC;
- }
+ for (const auto &J : S.getCallsiteSamples())
+ for (const auto &FS : J.second) {
+ LineLocation Loc = J.first;
+ const FunctionSamples &CalleeSamples = FS.second;
+ encodeULEB128(Loc.LineOffset, OS);
+ encodeULEB128(Loc.Discriminator, OS);
+ if (std::error_code EC = writeBody(CalleeSamples))
+ return EC;
+ }
return sampleprof_error::success;
}
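The name-table hunk above assigns indices only after collecting every name into a sorted std::set, making the emitted table independent of hash-map iteration order. The pattern in isolation:

#include <map>
#include <set>
#include <string>

// Assign dense indices in sorted-name order; deterministic across runs
// because iteration over std::set is always in sorted order.
std::map<std::string, int> numberNames(const std::set<std::string> &Sorted) {
  std::map<std::string, int> Table;
  int Idx = 0;
  for (const std::string &N : Sorted)
    Table[N] = Idx++;
  return Table;
}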
diff --git a/contrib/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp b/contrib/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp
new file mode 100644
index 0000000..863093a
--- /dev/null
+++ b/contrib/llvm/lib/Support/AMDGPUCodeObjectMetadata.cpp
@@ -0,0 +1,216 @@
+//===--- AMDGPUCodeObjectMetadata.cpp ---------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata definitions and in-memory
+/// representations.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/AMDGPUCodeObjectMetadata.h"
+#include "llvm/Support/YAMLTraits.h"
+
+using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::CodeObject;
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Arg::Metadata)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
+
+namespace llvm {
+namespace yaml {
+
+template <>
+struct ScalarEnumerationTraits<AccessQualifier> {
+ static void enumeration(IO &YIO, AccessQualifier &EN) {
+ YIO.enumCase(EN, "Default", AccessQualifier::Default);
+ YIO.enumCase(EN, "ReadOnly", AccessQualifier::ReadOnly);
+ YIO.enumCase(EN, "WriteOnly", AccessQualifier::WriteOnly);
+ YIO.enumCase(EN, "ReadWrite", AccessQualifier::ReadWrite);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<AddressSpaceQualifier> {
+ static void enumeration(IO &YIO, AddressSpaceQualifier &EN) {
+ YIO.enumCase(EN, "Private", AddressSpaceQualifier::Private);
+ YIO.enumCase(EN, "Global", AddressSpaceQualifier::Global);
+ YIO.enumCase(EN, "Constant", AddressSpaceQualifier::Constant);
+ YIO.enumCase(EN, "Local", AddressSpaceQualifier::Local);
+ YIO.enumCase(EN, "Generic", AddressSpaceQualifier::Generic);
+ YIO.enumCase(EN, "Region", AddressSpaceQualifier::Region);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<ValueKind> {
+ static void enumeration(IO &YIO, ValueKind &EN) {
+ YIO.enumCase(EN, "ByValue", ValueKind::ByValue);
+ YIO.enumCase(EN, "GlobalBuffer", ValueKind::GlobalBuffer);
+ YIO.enumCase(EN, "DynamicSharedPointer", ValueKind::DynamicSharedPointer);
+ YIO.enumCase(EN, "Sampler", ValueKind::Sampler);
+ YIO.enumCase(EN, "Image", ValueKind::Image);
+ YIO.enumCase(EN, "Pipe", ValueKind::Pipe);
+ YIO.enumCase(EN, "Queue", ValueKind::Queue);
+ YIO.enumCase(EN, "HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX);
+ YIO.enumCase(EN, "HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY);
+ YIO.enumCase(EN, "HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ);
+ YIO.enumCase(EN, "HiddenNone", ValueKind::HiddenNone);
+ YIO.enumCase(EN, "HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer);
+ YIO.enumCase(EN, "HiddenDefaultQueue", ValueKind::HiddenDefaultQueue);
+ YIO.enumCase(EN, "HiddenCompletionAction",
+ ValueKind::HiddenCompletionAction);
+ }
+};
+
+template <>
+struct ScalarEnumerationTraits<ValueType> {
+ static void enumeration(IO &YIO, ValueType &EN) {
+ YIO.enumCase(EN, "Struct", ValueType::Struct);
+ YIO.enumCase(EN, "I8", ValueType::I8);
+ YIO.enumCase(EN, "U8", ValueType::U8);
+ YIO.enumCase(EN, "I16", ValueType::I16);
+ YIO.enumCase(EN, "U16", ValueType::U16);
+ YIO.enumCase(EN, "F16", ValueType::F16);
+ YIO.enumCase(EN, "I32", ValueType::I32);
+ YIO.enumCase(EN, "U32", ValueType::U32);
+ YIO.enumCase(EN, "F32", ValueType::F32);
+ YIO.enumCase(EN, "I64", ValueType::I64);
+ YIO.enumCase(EN, "U64", ValueType::U64);
+ YIO.enumCase(EN, "F64", ValueType::F64);
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::Attrs::Metadata> {
+ static void mapping(IO &YIO, Kernel::Attrs::Metadata &MD) {
+ YIO.mapOptional(Kernel::Attrs::Key::ReqdWorkGroupSize,
+ MD.mReqdWorkGroupSize, std::vector<uint32_t>());
+ YIO.mapOptional(Kernel::Attrs::Key::WorkGroupSizeHint,
+ MD.mWorkGroupSizeHint, std::vector<uint32_t>());
+ YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint,
+ MD.mVecTypeHint, std::string());
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::Arg::Metadata> {
+ static void mapping(IO &YIO, Kernel::Arg::Metadata &MD) {
+ YIO.mapRequired(Kernel::Arg::Key::Size, MD.mSize);
+ YIO.mapRequired(Kernel::Arg::Key::Align, MD.mAlign);
+ YIO.mapRequired(Kernel::Arg::Key::ValueKind, MD.mValueKind);
+ YIO.mapRequired(Kernel::Arg::Key::ValueType, MD.mValueType);
+ YIO.mapOptional(Kernel::Arg::Key::PointeeAlign, MD.mPointeeAlign,
+ uint32_t(0));
+ YIO.mapOptional(Kernel::Arg::Key::AccQual, MD.mAccQual,
+ AccessQualifier::Unknown);
+ YIO.mapOptional(Kernel::Arg::Key::AddrSpaceQual, MD.mAddrSpaceQual,
+ AddressSpaceQualifier::Unknown);
+ YIO.mapOptional(Kernel::Arg::Key::IsConst, MD.mIsConst, false);
+ YIO.mapOptional(Kernel::Arg::Key::IsPipe, MD.mIsPipe, false);
+ YIO.mapOptional(Kernel::Arg::Key::IsRestrict, MD.mIsRestrict, false);
+ YIO.mapOptional(Kernel::Arg::Key::IsVolatile, MD.mIsVolatile, false);
+ YIO.mapOptional(Kernel::Arg::Key::Name, MD.mName, std::string());
+ YIO.mapOptional(Kernel::Arg::Key::TypeName, MD.mTypeName, std::string());
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::CodeProps::Metadata> {
+ static void mapping(IO &YIO, Kernel::CodeProps::Metadata &MD) {
+ YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentSize,
+ MD.mKernargSegmentSize, uint64_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WorkgroupGroupSegmentSize,
+ MD.mWorkgroupGroupSegmentSize, uint32_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WorkitemPrivateSegmentSize,
+ MD.mWorkitemPrivateSegmentSize, uint32_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WavefrontNumSGPRs,
+ MD.mWavefrontNumSGPRs, uint16_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WorkitemNumVGPRs,
+ MD.mWorkitemNumVGPRs, uint16_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentAlign,
+ MD.mKernargSegmentAlign, uint8_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::GroupSegmentAlign,
+ MD.mGroupSegmentAlign, uint8_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::PrivateSegmentAlign,
+ MD.mPrivateSegmentAlign, uint8_t(0));
+ YIO.mapOptional(Kernel::CodeProps::Key::WavefrontSize,
+ MD.mWavefrontSize, uint8_t(0));
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::DebugProps::Metadata> {
+ static void mapping(IO &YIO, Kernel::DebugProps::Metadata &MD) {
+ YIO.mapOptional(Kernel::DebugProps::Key::DebuggerABIVersion,
+ MD.mDebuggerABIVersion, std::vector<uint32_t>());
+ YIO.mapOptional(Kernel::DebugProps::Key::ReservedNumVGPRs,
+ MD.mReservedNumVGPRs, uint16_t(0));
+ YIO.mapOptional(Kernel::DebugProps::Key::ReservedFirstVGPR,
+ MD.mReservedFirstVGPR, uint16_t(-1));
+ YIO.mapOptional(Kernel::DebugProps::Key::PrivateSegmentBufferSGPR,
+ MD.mPrivateSegmentBufferSGPR, uint16_t(-1));
+ YIO.mapOptional(Kernel::DebugProps::Key::WavefrontPrivateSegmentOffsetSGPR,
+ MD.mWavefrontPrivateSegmentOffsetSGPR, uint16_t(-1));
+ }
+};
+
+template <>
+struct MappingTraits<Kernel::Metadata> {
+ static void mapping(IO &YIO, Kernel::Metadata &MD) {
+ YIO.mapRequired(Kernel::Key::Name, MD.mName);
+ YIO.mapOptional(Kernel::Key::Language, MD.mLanguage, std::string());
+ YIO.mapOptional(Kernel::Key::LanguageVersion, MD.mLanguageVersion,
+ std::vector<uint32_t>());
+ if (!MD.mAttrs.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::Attrs, MD.mAttrs);
+ if (!MD.mArgs.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::Args, MD.mArgs);
+ if (!MD.mCodeProps.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::CodeProps, MD.mCodeProps);
+ if (!MD.mDebugProps.empty() || !YIO.outputting())
+ YIO.mapOptional(Kernel::Key::DebugProps, MD.mDebugProps);
+ }
+};
+
+template <>
+struct MappingTraits<CodeObject::Metadata> {
+ static void mapping(IO &YIO, CodeObject::Metadata &MD) {
+ YIO.mapRequired(Key::Version, MD.mVersion);
+ YIO.mapOptional(Key::Printf, MD.mPrintf, std::vector<std::string>());
+ if (!MD.mKernels.empty() || !YIO.outputting())
+ YIO.mapOptional(Key::Kernels, MD.mKernels);
+ }
+};
+
+} // end namespace yaml
+
+namespace AMDGPU {
+namespace CodeObject {
+
+/* static */
+std::error_code Metadata::fromYamlString(
+ std::string YamlString, Metadata &CodeObjectMetadata) {
+ yaml::Input YamlInput(YamlString);
+ YamlInput >> CodeObjectMetadata;
+ return YamlInput.error();
+}
+
+/* static */
+std::error_code Metadata::toYamlString(
+ Metadata CodeObjectMetadata, std::string &YamlString) {
+ raw_string_ostream YamlStream(YamlString);
+ yaml::Output YamlOutput(YamlStream, nullptr, std::numeric_limits<int>::max());
+ YamlOutput << CodeObjectMetadata;
+ return std::error_code();
+}
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
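A round-trip sketch using the two entry points defined above (the input YAML text is whatever the caller supplies):

#include "llvm/Support/AMDGPUCodeObjectMetadata.h"
#include <string>

using namespace llvm::AMDGPU;

// Parse, then re-serialize; returns true when both directions succeed.
bool roundTrip(const std::string &In, std::string &Out) {
  CodeObject::Metadata MD;
  if (CodeObject::Metadata::fromYamlString(In, MD))
    return false;
  return !CodeObject::Metadata::toYamlString(MD, Out);
}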
diff --git a/contrib/llvm/lib/Support/APFloat.cpp b/contrib/llvm/lib/Support/APFloat.cpp
index 4cfbbf8..deb76cb 100644
--- a/contrib/llvm/lib/Support/APFloat.cpp
+++ b/contrib/llvm/lib/Support/APFloat.cpp
@@ -26,6 +26,15 @@
#include <cstring>
#include <limits.h>
+#define APFLOAT_DISPATCH_ON_SEMANTICS(METHOD_CALL) \
+ do { \
+ if (usesLayout<IEEEFloat>(getSemantics())) \
+ return U.IEEE.METHOD_CALL; \
+ if (usesLayout<DoubleAPFloat>(getSemantics())) \
+ return U.Double.METHOD_CALL; \
+ llvm_unreachable("Unexpected semantics"); \
+ } while (false)
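The macro centralizes the IEEEFloat/DoubleAPFloat dispatch that APFloat methods otherwise repeat. A hypothetical forwarding body (isNegative merely illustrates the shape; the patch's real uses appear elsewhere):

// Hypothetical example, not part of the patch:
bool APFloat::isNegative() const {
  APFLOAT_DISPATCH_ON_SEMANTICS(isNegative());
}
// ...expands to the usesLayout checks above, returning either
// U.IEEE.isNegative() or U.Double.isNegative().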
+
using namespace llvm;
/// A macro used to combine two fcCategory enums into one key which can be used
@@ -38,7 +47,7 @@ using namespace llvm;
/* Assumed in hexadecimal significand parsing, and conversion to
hexadecimal strings. */
-static_assert(integerPartWidth % 4 == 0, "Part width must be divisible by 4!");
+static_assert(APFloatBase::integerPartWidth % 4 == 0, "Part width must be divisible by 4!");
namespace llvm {
/* Represents floating point arithmetic semantics. */
@@ -66,33 +75,43 @@ namespace llvm {
static const fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
static const fltSemantics semBogus = {0, 0, 0, 0};
- /* The PowerPC format consists of two doubles. It does not map cleanly
- onto the usual format above. It is approximated using twice the
- mantissa bits. Note that for exponents near the double minimum,
- we no longer can represent the full 106 mantissa bits, so those
- will be treated as denormal numbers.
-
- FIXME: While this approximation is equivalent to what GCC uses for
- compile-time arithmetic on PPC double-double numbers, it is not able
- to represent all possible values held by a PPC double-double number,
- for example: (long double) 1.0 + (long double) 0x1p-106
- Should this be replaced by a full emulation of PPC double-double?
+ /* The IBM double-double semantics. Such a number consists of a pair of IEEE
+ 64-bit doubles (Hi, Lo), where |Hi| > |Lo|, and if normal,
+ (double)(Hi + Lo) == Hi. The numeric value it's modeling is Hi + Lo.
+ Therefore it has two 53-bit mantissa parts that aren't necessarily adjacent
+ to each other, and two 11-bit exponents.
Note: we need to make the value different from semBogus as otherwise
an unsafe optimization may collapse both values to a single address,
and we heavily rely on them having distinct addresses. */
static const fltSemantics semPPCDoubleDouble = {-1, 0, 0, 0};
- /* There are temporary semantics for the real PPCDoubleDouble implementation.
- Currently, APFloat of PPCDoubleDouble holds one PPCDoubleDoubleImpl as the
- high part of double double, and one IEEEdouble as the low part, so that
- the old operations operate on PPCDoubleDoubleImpl, while the newly added
- operations also populate the IEEEdouble.
+ /* These are legacy semantics for the fallback, inaccurate implementation of
+ IBM double-double, if the accurate semPPCDoubleDouble doesn't handle the
+ operation. It's equivalent to having an IEEE number with consecutive 106
+ bits of mantissa and 11 bits of exponent.
+
+ It's not equivalent to IBM double-double. For example, a valid IBM
+ double-double, 1 + epsilon:
+
+ 1 + epsilon = 1 + (1 >> 1076)
+
+ is not representable by a consecutive 106 bits of mantissa.
- TODO: Once all functions support DoubleAPFloat mode, we'll change all
- PPCDoubleDoubleImpl to IEEEdouble and remove PPCDoubleDoubleImpl. */
- static const fltSemantics semPPCDoubleDoubleImpl = {1023, -1022 + 53, 53 + 53,
- 128};
+ Currently, these semantics are used in the following way:
+
+ semPPCDoubleDouble -> (IEEEdouble, IEEEdouble) ->
+ (64-bit APInt, 64-bit APInt) -> (128-bit APInt) ->
+ semPPCDoubleDoubleLegacy -> IEEE operations
+
+ We use bitcastToAPInt() to get the bit representation (in APInt) of the
+ underlying IEEEdouble, then use the APInt constructor to construct the
+ legacy IEEE float.
+
+ TODO: Implement all operations in semPPCDoubleDouble, and delete these
+ semantics. */
+ static const fltSemantics semPPCDoubleDoubleLegacy = {1023, -1022 + 53,
+ 53 + 53, 128};
const fltSemantics &APFloatBase::IEEEhalf() {
return semIEEEhalf;
@@ -130,8 +149,7 @@ namespace llvm {
const unsigned int maxExponent = 16383;
const unsigned int maxPrecision = 113;
const unsigned int maxPowerOfFiveExponent = maxExponent + maxPrecision - 1;
- const unsigned int maxPowerOfFiveParts = 2 + ((maxPowerOfFiveExponent * 815)
- / (351 * integerPartWidth));
+ const unsigned int maxPowerOfFiveParts = 2 + ((maxPowerOfFiveExponent * 815) / (351 * APFloatBase::integerPartWidth));
unsigned int APFloatBase::semanticsPrecision(const fltSemantics &semantics) {
return semantics.precision;
@@ -157,7 +175,7 @@ namespace llvm {
static inline unsigned int
partCountForBits(unsigned int bits)
{
- return ((bits) + integerPartWidth - 1) / integerPartWidth;
+ return ((bits) + APFloatBase::integerPartWidth - 1) / APFloatBase::integerPartWidth;
}
/* Returns 0U-9U. Return values >= 10U are not digits. */
@@ -397,7 +415,7 @@ trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end,
/* Return the fraction lost were a bignum truncated losing the least
significant BITS bits. */
static lostFraction
-lostFractionThroughTruncation(const integerPart *parts,
+lostFractionThroughTruncation(const APFloatBase::integerPart *parts,
unsigned int partCount,
unsigned int bits)
{
@@ -410,7 +428,7 @@ lostFractionThroughTruncation(const integerPart *parts,
return lfExactlyZero;
if (bits == lsb + 1)
return lfExactlyHalf;
- if (bits <= partCount * integerPartWidth &&
+ if (bits <= partCount * APFloatBase::integerPartWidth &&
APInt::tcExtractBit(parts, bits - 1))
return lfMoreThanHalf;
@@ -419,7 +437,7 @@ lostFractionThroughTruncation(const integerPart *parts,
/* Shift DST right BITS bits noting lost fraction. */
static lostFraction
-shiftRight(integerPart *dst, unsigned int parts, unsigned int bits)
+shiftRight(APFloatBase::integerPart *dst, unsigned int parts, unsigned int bits)
{
lostFraction lost_fraction;
@@ -466,22 +484,22 @@ HUerrBound(bool inexactMultiply, unsigned int HUerr1, unsigned int HUerr2)
/* The number of ulps from the boundary (zero, or half if ISNEAREST)
when the least significant BITS are truncated. BITS cannot be
zero. */
-static integerPart
-ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest)
-{
+static APFloatBase::integerPart
+ulpsFromBoundary(const APFloatBase::integerPart *parts, unsigned int bits,
+ bool isNearest) {
unsigned int count, partBits;
- integerPart part, boundary;
+ APFloatBase::integerPart part, boundary;
assert(bits != 0);
bits--;
- count = bits / integerPartWidth;
- partBits = bits % integerPartWidth + 1;
+ count = bits / APFloatBase::integerPartWidth;
+ partBits = bits % APFloatBase::integerPartWidth + 1;
- part = parts[count] & (~(integerPart) 0 >> (integerPartWidth - partBits));
+ part = parts[count] & (~(APFloatBase::integerPart) 0 >> (APFloatBase::integerPartWidth - partBits));
if (isNearest)
- boundary = (integerPart) 1 << (partBits - 1);
+ boundary = (APFloatBase::integerPart) 1 << (partBits - 1);
else
boundary = 0;
@@ -495,32 +513,30 @@ ulpsFromBoundary(const integerPart *parts, unsigned int bits, bool isNearest)
if (part == boundary) {
while (--count)
if (parts[count])
- return ~(integerPart) 0; /* A lot. */
+ return ~(APFloatBase::integerPart) 0; /* A lot. */
return parts[0];
} else if (part == boundary - 1) {
while (--count)
if (~parts[count])
- return ~(integerPart) 0; /* A lot. */
+ return ~(APFloatBase::integerPart) 0; /* A lot. */
return -parts[0];
}
- return ~(integerPart) 0; /* A lot. */
+ return ~(APFloatBase::integerPart) 0; /* A lot. */
}
/* Place pow(5, power) in DST, and return the number of parts used.
DST must be at least one part larger than size of the answer. */
static unsigned int
-powerOf5(integerPart *dst, unsigned int power)
-{
- static const integerPart firstEightPowers[] = { 1, 5, 25, 125, 625, 3125,
- 15625, 78125 };
- integerPart pow5s[maxPowerOfFiveParts * 2 + 5];
+powerOf5(APFloatBase::integerPart *dst, unsigned int power) {
+ static const APFloatBase::integerPart firstEightPowers[] = { 1, 5, 25, 125, 625, 3125, 15625, 78125 };
+ APFloatBase::integerPart pow5s[maxPowerOfFiveParts * 2 + 5];
pow5s[0] = 78125 * 5;
unsigned int partsCount[16] = { 1 };
- integerPart scratch[maxPowerOfFiveParts], *p1, *p2, *pow5;
+ APFloatBase::integerPart scratch[maxPowerOfFiveParts], *p1, *p2, *pow5;
unsigned int result;
assert(power <= maxExponent);
@@ -549,7 +565,7 @@ powerOf5(integerPart *dst, unsigned int power)
}
if (power & 1) {
- integerPart *tmp;
+ APFloatBase::integerPart *tmp;
APInt::tcFullMultiply(p2, p1, pow5, result, pc);
result += pc;
@@ -585,14 +601,14 @@ static const char NaNU[] = "NAN";
significant nibble. Write out exactly COUNT hexdigits, return
COUNT. */
static unsigned int
-partAsHex (char *dst, integerPart part, unsigned int count,
+partAsHex (char *dst, APFloatBase::integerPart part, unsigned int count,
const char *hexDigitChars)
{
unsigned int result = count;
- assert(count != 0 && count <= integerPartWidth / 4);
+ assert(count != 0 && count <= APFloatBase::integerPartWidth / 4);
- part >>= (integerPartWidth - 4 * count);
+ part >>= (APFloatBase::integerPartWidth - 4 * count);
while (count--) {
dst[count] = hexDigitChars[part & 0xf];
part >>= 4;
@@ -742,7 +758,7 @@ IEEEFloat &IEEEFloat::operator=(IEEEFloat &&rhs) {
bool IEEEFloat::isDenormal() const {
return isFiniteNonZero() && (exponent == semantics->minExponent) &&
- (APInt::tcExtractBit(significandParts(),
+ (APInt::tcExtractBit(significandParts(),
semantics->precision - 1) == 0);
}
@@ -862,20 +878,15 @@ IEEEFloat::IEEEFloat(IEEEFloat &&rhs) : semantics(&semBogus) {
IEEEFloat::~IEEEFloat() { freeSignificand(); }
-// Profile - This method 'profiles' an APFloat for use with FoldingSet.
-void IEEEFloat::Profile(FoldingSetNodeID &ID) const {
- ID.Add(bitcastToAPInt());
-}
-
unsigned int IEEEFloat::partCount() const {
return partCountForBits(semantics->precision + 1);
}
-const integerPart *IEEEFloat::significandParts() const {
+const IEEEFloat::integerPart *IEEEFloat::significandParts() const {
return const_cast<IEEEFloat *>(this)->significandParts();
}
-integerPart *IEEEFloat::significandParts() {
+IEEEFloat::integerPart *IEEEFloat::significandParts() {
if (partCount() > 1)
return significand.parts;
else
@@ -898,7 +909,7 @@ void IEEEFloat::incrementSignificand() {
}
/* Add the significand of the RHS. Returns the carry flag. */
-integerPart IEEEFloat::addSignificand(const IEEEFloat &rhs) {
+IEEEFloat::integerPart IEEEFloat::addSignificand(const IEEEFloat &rhs) {
integerPart *parts;
parts = significandParts();
@@ -911,8 +922,8 @@ integerPart IEEEFloat::addSignificand(const IEEEFloat &rhs) {
/* Subtract the significand of the RHS with a borrow flag. Returns
the borrow flag. */
-integerPart IEEEFloat::subtractSignificand(const IEEEFloat &rhs,
- integerPart borrow) {
+IEEEFloat::integerPart IEEEFloat::subtractSignificand(const IEEEFloat &rhs,
+ integerPart borrow) {
integerPart *parts;
parts = significandParts();
@@ -966,14 +977,14 @@ lostFraction IEEEFloat::multiplySignificand(const IEEEFloat &rhs,
// rhs = b23 . b22 ... b0 * 2^e2
// the result of multiplication is:
// *this = c48 c47 c46 . c45 ... c0 * 2^(e1+e2)
- // Note that there are three significant bits at the left-hand side of the
+ // Note that there are three significant bits at the left-hand side of the
// radix point: two for the multiplication, and an overflow bit for the
// addition (that will always be zero at this point). Move the radix point
// toward left by two bits, and adjust exponent accordingly.
exponent += 2;
if (addend && addend->isNonZero()) {
- // The intermediate result of the multiplication has "2 * precision"
+ // The intermediate result of the multiplication has "2 * precision"
// significant bits; adjust the addend to be consistent with the mul result.
//
Significand savedSignificand = significand;
@@ -1025,7 +1036,7 @@ lostFraction IEEEFloat::multiplySignificand(const IEEEFloat &rhs,
}
// Convert the result having "2 * precision" significant-bits back to the one
- // having "precision" significant-bits. First, move the radix point from
+ // having "precision" significant-bits. First, move the radix point from
// position "2*precision - 1" to "precision - 1". The exponent needs to be
// adjusted by "2*precision - 1" - "precision - 1" = "precision".
exponent -= precision + 1;
@@ -1541,11 +1552,13 @@ IEEEFloat::opStatus IEEEFloat::divideSpecials(const IEEEFloat &rhs) {
case PackCategoriesIntoKey(fcInfinity, fcNaN):
category = fcNaN;
copySignificand(rhs);
+ LLVM_FALLTHROUGH;
case PackCategoriesIntoKey(fcNaN, fcZero):
case PackCategoriesIntoKey(fcNaN, fcNormal):
case PackCategoriesIntoKey(fcNaN, fcInfinity):
case PackCategoriesIntoKey(fcNaN, fcNaN):
sign = false;
+ LLVM_FALLTHROUGH;
case PackCategoriesIntoKey(fcInfinity, fcZero):
case PackCategoriesIntoKey(fcInfinity, fcNormal):
case PackCategoriesIntoKey(fcZero, fcInfinity):
@@ -1611,16 +1624,6 @@ void IEEEFloat::changeSign() {
sign = !sign;
}
-void IEEEFloat::clearSign() {
- /* So is this one. */
- sign = 0;
-}
-
-void IEEEFloat::copySign(const IEEEFloat &rhs) {
- /* And this one. */
- sign = rhs.sign;
-}
-
/* Normalized addition or subtraction. */
IEEEFloat::opStatus IEEEFloat::addOrSubtract(const IEEEFloat &rhs,
roundingMode rounding_mode,
@@ -1712,9 +1715,10 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
int parts = partCount();
integerPart *x = new integerPart[parts];
bool ignored;
- fs = V.convertToInteger(x, parts * integerPartWidth, true,
- rmNearestTiesToEven, &ignored);
- if (fs==opInvalidOp) {
+ fs = V.convertToInteger(makeMutableArrayRef(x, parts),
+ parts * integerPartWidth, true, rmNearestTiesToEven,
+ &ignored);
+ if (fs == opInvalidOp) {
delete[] x;
return fs;
}
@@ -1735,43 +1739,20 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
return fs;
}
-/* Normalized llvm frem (C fmod).
- This is not currently correct in all cases. */
+/* Normalized llvm frem (C fmod). */
IEEEFloat::opStatus IEEEFloat::mod(const IEEEFloat &rhs) {
opStatus fs;
fs = modSpecials(rhs);
- if (isFiniteNonZero() && rhs.isFiniteNonZero()) {
- IEEEFloat V = *this;
- unsigned int origSign = sign;
-
- fs = V.divide(rhs, rmNearestTiesToEven);
- if (fs == opDivByZero)
- return fs;
-
- int parts = partCount();
- integerPart *x = new integerPart[parts];
- bool ignored;
- fs = V.convertToInteger(x, parts * integerPartWidth, true,
- rmTowardZero, &ignored);
- if (fs==opInvalidOp) {
- delete[] x;
- return fs;
- }
-
- fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
- rmNearestTiesToEven);
- assert(fs==opOK); // should always work
-
- fs = V.multiply(rhs, rmNearestTiesToEven);
- assert(fs==opOK || fs==opInexact); // should not overflow or underflow
-
+ while (isFiniteNonZero() && rhs.isFiniteNonZero() &&
+ compareAbsoluteValue(rhs) != cmpLessThan) {
+ IEEEFloat V = scalbn(rhs, ilogb(*this) - ilogb(rhs), rmNearestTiesToEven);
+ if (compareAbsoluteValue(V) == cmpLessThan)
+ V = scalbn(V, -1, rmNearestTiesToEven);
+ V.sign = sign;
+
fs = subtract(V, rmNearestTiesToEven);
- assert(fs==opOK || fs==opInexact); // likewise
-
- if (isZero())
- sign = origSign; // IEEE754 requires this
- delete[] x;
+ assert(fs==opOK);
}
return fs;
}
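The rewritten mod is the textbook shift-and-subtract fmod: each iteration scales rhs into this value's binade and subtracts exactly. A plain-double analogue for intuition (illustrative only, not the APFloat code path):

#include <cmath>

double fmodSketch(double X, double Y) {
  while (std::isfinite(X) && X != 0.0 && std::isfinite(Y) && Y != 0.0 &&
         std::fabs(X) >= std::fabs(Y)) {
    // Scale Y to X's binade; step down one power of two if we overshot.
    double V = std::scalbn(Y, std::ilogb(X) - std::ilogb(Y));
    if (std::fabs(V) > std::fabs(X))
      V = std::scalbn(V, -1);
    V = std::copysign(V, X); // subtract in X's direction, like V.sign = sign
    X -= V;                  // exact: |X|/2 <= |V| <= |X| (Sterbenz lemma)
  }
  return X;
}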
@@ -1840,7 +1821,7 @@ IEEEFloat::opStatus IEEEFloat::roundToIntegral(roundingMode rounding_mode) {
IEEEFloat MagicConstant(*semantics);
fs = MagicConstant.convertFromAPInt(IntegerConstant, false,
rmNearestTiesToEven);
- MagicConstant.copySign(*this);
+ MagicConstant.sign = sign;
if (fs != opOK)
return fs;
@@ -2047,7 +2028,7 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
Note that for conversions to integer type the C standard requires
round-to-zero to always be used. */
IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
- integerPart *parts, unsigned int width, bool isSigned,
+ MutableArrayRef<integerPart> parts, unsigned int width, bool isSigned,
roundingMode rounding_mode, bool *isExact) const {
lostFraction lost_fraction;
const integerPart *src;
@@ -2060,9 +2041,10 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
return opInvalidOp;
dstPartsCount = partCountForBits(width);
+ assert(dstPartsCount <= parts.size() && "Integer too big");
if (category == fcZero) {
- APInt::tcSet(parts, 0, dstPartsCount);
+ APInt::tcSet(parts.data(), 0, dstPartsCount);
// Negative zero can't be represented as an int.
*isExact = !sign;
return opOK;
@@ -2074,7 +2056,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
the destination. */
if (exponent < 0) {
/* Our absolute value is less than one; truncate everything. */
- APInt::tcSet(parts, 0, dstPartsCount);
+ APInt::tcSet(parts.data(), 0, dstPartsCount);
/* For exponent -1 the integer bit represents .5, look at that.
For smaller exponents leftmost truncated bit is 0. */
truncatedBits = semantics->precision -1U - exponent;
@@ -2090,11 +2072,13 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
if (bits < semantics->precision) {
/* We truncate (semantics->precision - bits) bits. */
truncatedBits = semantics->precision - bits;
- APInt::tcExtract(parts, dstPartsCount, src, bits, truncatedBits);
+ APInt::tcExtract(parts.data(), dstPartsCount, src, bits, truncatedBits);
} else {
/* We want at least as many bits as are available. */
- APInt::tcExtract(parts, dstPartsCount, src, semantics->precision, 0);
- APInt::tcShiftLeft(parts, dstPartsCount, bits - semantics->precision);
+ APInt::tcExtract(parts.data(), dstPartsCount, src, semantics->precision,
+ 0);
+ APInt::tcShiftLeft(parts.data(), dstPartsCount,
+ bits - semantics->precision);
truncatedBits = 0;
}
}
@@ -2106,7 +2090,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
truncatedBits);
if (lost_fraction != lfExactlyZero &&
roundAwayFromZero(rounding_mode, lost_fraction, truncatedBits)) {
- if (APInt::tcIncrement(parts, dstPartsCount))
+ if (APInt::tcIncrement(parts.data(), dstPartsCount))
return opInvalidOp; /* Overflow. */
}
} else {
@@ -2114,7 +2098,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
}
/* Step 3: check if we fit in the destination. */
- unsigned int omsb = APInt::tcMSB(parts, dstPartsCount) + 1;
+ unsigned int omsb = APInt::tcMSB(parts.data(), dstPartsCount) + 1;
if (sign) {
if (!isSigned) {
@@ -2125,7 +2109,8 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
/* It takes omsb bits to represent the unsigned integer value.
We lose a bit for the sign, but care is needed as the
maximally negative integer is a special case. */
- if (omsb == width && APInt::tcLSB(parts, dstPartsCount) + 1 != omsb)
+ if (omsb == width &&
+ APInt::tcLSB(parts.data(), dstPartsCount) + 1 != omsb)
return opInvalidOp;
/* This case can happen because of rounding. */
@@ -2133,7 +2118,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
return opInvalidOp;
}
- APInt::tcNegate (parts, dstPartsCount);
+ APInt::tcNegate (parts.data(), dstPartsCount);
} else {
if (omsb >= width + !isSigned)
return opInvalidOp;
@@ -2155,11 +2140,10 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
the original value. This is almost equivalent to result==opOK,
except for negative zeroes.
*/
-IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
- unsigned int width,
- bool isSigned,
- roundingMode rounding_mode,
- bool *isExact) const {
+IEEEFloat::opStatus
+IEEEFloat::convertToInteger(MutableArrayRef<integerPart> parts,
+ unsigned int width, bool isSigned,
+ roundingMode rounding_mode, bool *isExact) const {
opStatus fs;
fs = convertToSignExtendedInteger(parts, width, isSigned, rounding_mode,
@@ -2169,6 +2153,7 @@ IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
unsigned int bits, dstPartsCount;
dstPartsCount = partCountForBits(width);
+ assert(dstPartsCount <= parts.size() && "Integer too big");
if (category == fcNaN)
bits = 0;
@@ -2177,30 +2162,14 @@ IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
else
bits = width - isSigned;
- APInt::tcSetLeastSignificantBits(parts, dstPartsCount, bits);
+ APInt::tcSetLeastSignificantBits(parts.data(), dstPartsCount, bits);
if (sign && isSigned)
- APInt::tcShiftLeft(parts, dstPartsCount, width - 1);
+ APInt::tcShiftLeft(parts.data(), dstPartsCount, width - 1);
}
return fs;
}
-/* Same as convertToInteger(integerPart*, ...), except the result is returned in
- an APSInt, whose initial bit-width and signed-ness are used to determine the
- precision of the conversion.
- */
-IEEEFloat::opStatus IEEEFloat::convertToInteger(APSInt &result,
- roundingMode rounding_mode,
- bool *isExact) const {
- unsigned bitWidth = result.getBitWidth();
- SmallVector<uint64_t, 4> parts(result.getNumWords());
- opStatus status = convertToInteger(
- parts.data(), bitWidth, result.isSigned(), rounding_mode, isExact);
- // Keeps the original signed-ness.
- result = APInt(bitWidth, parts);
- return status;
-}
-
/* Convert an unsigned integer SRC to a floating point number,
rounding according to ROUNDING_MODE. The sign of the floating
point number is not modified. */
@@ -2484,7 +2453,7 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) {
// Test if we have a zero number allowing for strings with no null terminators
// and zero decimals with non-zero exponents.
- //
+ //
// We computed firstSigDigit by ignoring all zeros and dots. Thus if
// D->firstSigDigit equals str.end(), every digit must be a zero and there can
// be at most one dot. On the other hand, if we have a zero with a non-zero
@@ -2852,7 +2821,7 @@ APInt IEEEFloat::convertF80LongDoubleAPFloatToAPInt() const {
}
APInt IEEEFloat::convertPPCDoubleDoubleAPFloatToAPInt() const {
- assert(semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleImpl);
+ assert(semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy);
assert(partCount()==2);
uint64_t words[2];
@@ -3033,7 +3002,7 @@ APInt IEEEFloat::bitcastToAPInt() const {
if (semantics == (const llvm::fltSemantics*)&semIEEEquad)
return convertQuadrupleAPFloatToAPInt();
- if (semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleImpl)
+ if (semantics == (const llvm::fltSemantics *)&semPPCDoubleDoubleLegacy)
return convertPPCDoubleDoubleAPFloatToAPInt();
assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
@@ -3103,14 +3072,14 @@ void IEEEFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) {
// Get the first double and convert to our format.
initFromDoubleAPInt(APInt(64, i1));
- fs = convert(semPPCDoubleDoubleImpl, rmNearestTiesToEven, &losesInfo);
+ fs = convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
// Unless we have a special case, add in second double.
if (isFiniteNonZero()) {
IEEEFloat v(semIEEEdouble, APInt(64, i2));
- fs = v.convert(semPPCDoubleDoubleImpl, rmNearestTiesToEven, &losesInfo);
+ fs = v.convert(semPPCDoubleDoubleLegacy, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
@@ -3264,7 +3233,7 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
return initFromF80LongDoubleAPInt(api);
if (Sem == &semIEEEquad)
return initFromQuadrupleAPInt(api);
- if (Sem == &semPPCDoubleDoubleImpl)
+ if (Sem == &semPPCDoubleDoubleLegacy)
return initFromPPCDoubleDoubleAPInt(api);
llvm_unreachable(nullptr);
@@ -3419,7 +3388,7 @@ namespace {
}
void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
- unsigned FormatMaxPadding) const {
+ unsigned FormatMaxPadding, bool TruncateZero) const {
switch (category) {
case fcInfinity:
if (isNegative())
@@ -3433,9 +3402,16 @@ void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
if (isNegative())
Str.push_back('-');
- if (!FormatMaxPadding)
- append(Str, "0.0E+0");
- else
+ if (!FormatMaxPadding) {
+ if (TruncateZero)
+ append(Str, "0.0E+0");
+ else {
+ append(Str, "0.0");
+ if (FormatPrecision > 1)
+ Str.append(FormatPrecision - 1, '0');
+ append(Str, "e+00");
+ }
+ } else
Str.push_back('0');
return;
@@ -3468,7 +3444,7 @@ void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
// Ignore trailing binary zeros.
int trailingZeros = significand.countTrailingZeros();
exp += trailingZeros;
- significand = significand.lshr(trailingZeros);
+ significand.lshrInPlace(trailingZeros);
// Change the exponent from 2^e to 10^e.
if (exp == 0) {
@@ -3569,12 +3545,16 @@ void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
Str.push_back(buffer[NDigits-1]);
Str.push_back('.');
- if (NDigits == 1)
+ if (NDigits == 1 && TruncateZero)
Str.push_back('0');
else
for (unsigned I = 1; I != NDigits; ++I)
Str.push_back(buffer[NDigits-1-I]);
- Str.push_back('E');
+ // Fill with zeros up to FormatPrecision.
+ if (!TruncateZero && FormatPrecision > NDigits - 1)
+ Str.append(FormatPrecision - NDigits + 1, '0');
+ // For !TruncateZero we use a lowercase 'e'.
+ Str.push_back(TruncateZero ? 'E' : 'e');
Str.push_back(exp >= 0 ? '+' : '-');
if (exp < 0) exp = -exp;
@@ -3583,6 +3563,9 @@ void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
expbuf.push_back((char) ('0' + (exp % 10)));
exp /= 10;
} while (exp);
+ // The exponent is always at least two digits if we do not truncate zeros.
+ if (!TruncateZero && expbuf.size() < 2)
+ expbuf.push_back('0');
for (unsigned I = 0, E = expbuf.size(); I != E; ++I)
Str.push_back(expbuf[E-1-I]);
return;
@@ -3620,7 +3603,7 @@ void IEEEFloat::toString(SmallVectorImpl<char> &Str, unsigned FormatPrecision,
Str.push_back(buffer[NDigits-I-1]);
}
-bool IEEEFloat::getExactInverse(IEEEFloat *inv) const {
+bool IEEEFloat::getExactInverse(APFloat *inv) const {
// Special floats and denormals have no exact inverse.
if (!isFiniteNonZero())
return false;
@@ -3644,7 +3627,7 @@ bool IEEEFloat::getExactInverse(IEEEFloat *inv) const {
reciprocal.significandLSB() == reciprocal.semantics->precision - 1);
if (inv)
- *inv = reciprocal;
+ *inv = APFloat(reciprocal, *semantics);
return true;
}
@@ -3856,28 +3839,29 @@ IEEEFloat frexp(const IEEEFloat &Val, int &Exp, IEEEFloat::roundingMode RM) {
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S)
- : Semantics(&S), Floats(new APFloat[2]{APFloat(semPPCDoubleDoubleImpl),
- APFloat(semIEEEdouble)}) {
+ : Semantics(&S),
+ Floats(new APFloat[2]{APFloat(semIEEEdouble), APFloat(semIEEEdouble)}) {
assert(Semantics == &semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, uninitializedTag)
: Semantics(&S),
- Floats(new APFloat[2]{APFloat(semPPCDoubleDoubleImpl, uninitialized),
+ Floats(new APFloat[2]{APFloat(semIEEEdouble, uninitialized),
APFloat(semIEEEdouble, uninitialized)}) {
assert(Semantics == &semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, integerPart I)
- : Semantics(&S), Floats(new APFloat[2]{APFloat(semPPCDoubleDoubleImpl, I),
+ : Semantics(&S), Floats(new APFloat[2]{APFloat(semIEEEdouble, I),
APFloat(semIEEEdouble)}) {
assert(Semantics == &semPPCDoubleDouble);
}
DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, const APInt &I)
- : Semantics(&S), Floats(new APFloat[2]{
- APFloat(semPPCDoubleDoubleImpl, I),
- APFloat(semIEEEdouble, APInt(64, I.getRawData()[1]))}) {
+ : Semantics(&S),
+ Floats(new APFloat[2]{
+ APFloat(semIEEEdouble, APInt(64, I.getRawData()[0])),
+ APFloat(semIEEEdouble, APInt(64, I.getRawData()[1]))}) {
assert(Semantics == &semPPCDoubleDouble);
}
@@ -3886,9 +3870,7 @@ DoubleAPFloat::DoubleAPFloat(const fltSemantics &S, APFloat &&First,
: Semantics(&S),
Floats(new APFloat[2]{std::move(First), std::move(Second)}) {
assert(Semantics == &semPPCDoubleDouble);
- // TODO Check for First == &IEEEdouble once the transition is done.
- assert(&Floats[0].getSemantics() == &semPPCDoubleDoubleImpl ||
- &Floats[0].getSemantics() == &semIEEEdouble);
+ assert(&Floats[0].getSemantics() == &semIEEEdouble);
assert(&Floats[1].getSemantics() == &semIEEEdouble);
}
@@ -3917,6 +3899,7 @@ DoubleAPFloat &DoubleAPFloat::operator=(const DoubleAPFloat &RHS) {
return *this;
}
+// Implement addition, subtraction, multiplication and division based on:
// "Software for Doubled-Precision Floating-Point Computations",
// by Seppo Linnainmaa, ACM TOMS vol 7 no 3, September 1981, pages 272-283.
APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
@@ -3928,7 +3911,7 @@ APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
if (!z.isFinite()) {
if (!z.isInfinity()) {
Floats[0] = std::move(z);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return (opStatus)Status;
}
Status = opOK;
@@ -3946,7 +3929,7 @@ APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
}
if (!z.isFinite()) {
Floats[0] = std::move(z);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return (opStatus)Status;
}
Floats[0] = z;
@@ -3982,13 +3965,13 @@ APFloat::opStatus DoubleAPFloat::addImpl(const APFloat &a, const APFloat &aa,
Status |= zz.add(cc, RM);
if (zz.isZero() && !zz.isNegative()) {
Floats[0] = std::move(z);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return opOK;
}
Floats[0] = z;
Status |= Floats[0].add(zz, RM);
if (!Floats[0].isFinite()) {
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
return (opStatus)Status;
}
Floats[1] = std::move(z);
@@ -4033,25 +4016,15 @@ APFloat::opStatus DoubleAPFloat::addWithSpecial(const DoubleAPFloat &LHS,
}
assert(LHS.getCategory() == fcNormal && RHS.getCategory() == fcNormal);
- // These conversions will go away once PPCDoubleDoubleImpl goes away.
- // (PPCDoubleDoubleImpl, IEEEDouble) -> (IEEEDouble, IEEEDouble)
- APFloat A(semIEEEdouble,
- APInt(64, LHS.Floats[0].bitcastToAPInt().getRawData()[0])),
- AA(LHS.Floats[1]),
- C(semIEEEdouble, APInt(64, RHS.Floats[0].bitcastToAPInt().getRawData()[0])),
+ APFloat A(LHS.Floats[0]), AA(LHS.Floats[1]), C(RHS.Floats[0]),
CC(RHS.Floats[1]);
+ assert(&A.getSemantics() == &semIEEEdouble);
assert(&AA.getSemantics() == &semIEEEdouble);
+ assert(&C.getSemantics() == &semIEEEdouble);
assert(&CC.getSemantics() == &semIEEEdouble);
- Out.Floats[0] = APFloat(semIEEEdouble);
+ assert(&Out.Floats[0].getSemantics() == &semIEEEdouble);
assert(&Out.Floats[1].getSemantics() == &semIEEEdouble);
-
- auto Ret = Out.addImpl(A, AA, C, CC, RM);
-
- // (IEEEDouble, IEEEDouble) -> (PPCDoubleDoubleImpl, IEEEDouble)
- uint64_t Buffer[] = {Out.Floats[0].bitcastToAPInt().getRawData()[0],
- Out.Floats[1].bitcastToAPInt().getRawData()[0]};
- Out.Floats[0] = APFloat(semPPCDoubleDoubleImpl, APInt(128, 2, Buffer));
- return Ret;
+ return Out.addImpl(A, AA, C, CC, RM);
}
APFloat::opStatus DoubleAPFloat::add(const DoubleAPFloat &RHS,
@@ -4067,6 +4040,140 @@ APFloat::opStatus DoubleAPFloat::subtract(const DoubleAPFloat &RHS,
return Ret;
}
+APFloat::opStatus DoubleAPFloat::multiply(const DoubleAPFloat &RHS,
+ APFloat::roundingMode RM) {
+ const auto &LHS = *this;
+ auto &Out = *this;
+ /* Interesting observation: For special categories, finding the lowest
+ common ancestor of the following layered graph gives the correct
+ return category:
+
+ NaN
+ / \
+ Zero Inf
+ \ /
+ Normal
+
+ e.g. NaN * NaN = NaN
+ Zero * Inf = NaN
+ Normal * Zero = Zero
+ Normal * Inf = Inf
+ */
+ if (LHS.getCategory() == fcNaN) {
+ Out = LHS;
+ return opOK;
+ }
+ if (RHS.getCategory() == fcNaN) {
+ Out = RHS;
+ return opOK;
+ }
+ if ((LHS.getCategory() == fcZero && RHS.getCategory() == fcInfinity) ||
+ (LHS.getCategory() == fcInfinity && RHS.getCategory() == fcZero)) {
+ Out.makeNaN(false, false, nullptr);
+ return opOK;
+ }
+ if (LHS.getCategory() == fcZero || LHS.getCategory() == fcInfinity) {
+ Out = LHS;
+ return opOK;
+ }
+ if (RHS.getCategory() == fcZero || RHS.getCategory() == fcInfinity) {
+ Out = RHS;
+ return opOK;
+ }
+ assert(LHS.getCategory() == fcNormal && RHS.getCategory() == fcNormal &&
+ "Special cases not handled exhaustively");
+
+ int Status = opOK;
+ APFloat A = Floats[0], B = Floats[1], C = RHS.Floats[0], D = RHS.Floats[1];
+ // t = a * c
+ APFloat T = A;
+ Status |= T.multiply(C, RM);
+ if (!T.isFiniteNonZero()) {
+ Floats[0] = T;
+ Floats[1].makeZero(/* Neg = */ false);
+ return (opStatus)Status;
+ }
+
+ // tau = fmsub(a, c, t), that is -fmadd(-a, c, t).
+ APFloat Tau = A;
+ T.changeSign();
+ Status |= Tau.fusedMultiplyAdd(C, T, RM);
+ T.changeSign();
+ {
+ // v = a * d
+ APFloat V = A;
+ Status |= V.multiply(D, RM);
+ // w = b * c
+ APFloat W = B;
+ Status |= W.multiply(C, RM);
+ Status |= V.add(W, RM);
+ // tau += v + w
+ Status |= Tau.add(V, RM);
+ }
+ // u = t + tau
+ APFloat U = T;
+ Status |= U.add(Tau, RM);
+
+ Floats[0] = U;
+ if (!U.isFinite()) {
+ Floats[1].makeZero(/* Neg = */ false);
+ } else {
+ // Floats[1] = (t - u) + tau
+ Status |= T.subtract(U, RM);
+ Status |= T.add(Tau, RM);
+ Floats[1] = T;
+ }
+ return (opStatus)Status;
+}
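+// A minimal sketch of the error-free "two-product" step used above, assuming
+// a correctly rounded FMA (illustrative, not part of this change):
+//   double t   = a * c;         // rounded product
+//   double tau = fma(a, c, -t); // exact error, so a * c == t + tau
+// The cross terms a*d and b*c only perturb the low-order word, which is why
+// they are folded into tau before the final renormalizing add.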
+
+APFloat::opStatus DoubleAPFloat::divide(const DoubleAPFloat &RHS,
+ APFloat::roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret =
+ Tmp.divide(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()), RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::remainder(const DoubleAPFloat &RHS) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret =
+ Tmp.remainder(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::mod(const DoubleAPFloat &RHS) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.mod(APFloat(semPPCDoubleDoubleLegacy, RHS.bitcastToAPInt()));
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::fusedMultiplyAdd(const DoubleAPFloat &Multiplicand,
+ const DoubleAPFloat &Addend,
+ APFloat::roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.fusedMultiplyAdd(
+ APFloat(semPPCDoubleDoubleLegacy, Multiplicand.bitcastToAPInt()),
+ APFloat(semPPCDoubleDoubleLegacy, Addend.bitcastToAPInt()), RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::roundToIntegral(APFloat::roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.roundToIntegral(RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
void DoubleAPFloat::changeSign() {
Floats[0].changeSign();
Floats[1].changeSign();
@@ -4101,12 +4208,201 @@ bool DoubleAPFloat::isNegative() const { return Floats[0].isNegative(); }
void DoubleAPFloat::makeInf(bool Neg) {
Floats[0].makeInf(Neg);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+void DoubleAPFloat::makeZero(bool Neg) {
+ Floats[0].makeZero(Neg);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+void DoubleAPFloat::makeLargest(bool Neg) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x7fefffffffffffffull));
+ Floats[1] = APFloat(semIEEEdouble, APInt(64, 0x7c8ffffffffffffeull));
+ if (Neg)
+ changeSign();
+}
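+// Note on the constants above: 0x7fefffffffffffff is the largest finite IEEE
+// double; 0x7c8ffffffffffffe, roughly half an ulp of it, appears to be the
+// largest low part that still leaves the pair canonical.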
+
+void DoubleAPFloat::makeSmallest(bool Neg) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ Floats[0].makeSmallest(Neg);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+void DoubleAPFloat::makeSmallestNormalized(bool Neg) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ Floats[0] = APFloat(semIEEEdouble, APInt(64, 0x0360000000000000ull));
+ if (Neg)
+ Floats[0].changeSign();
+ Floats[1].makeZero(/* Neg = */ false);
}
void DoubleAPFloat::makeNaN(bool SNaN, bool Neg, const APInt *fill) {
Floats[0].makeNaN(SNaN, Neg, fill);
- Floats[1].makeZero(false);
+ Floats[1].makeZero(/* Neg = */ false);
+}
+
+APFloat::cmpResult DoubleAPFloat::compare(const DoubleAPFloat &RHS) const {
+ auto Result = Floats[0].compare(RHS.Floats[0]);
+ // |Floats[0]| >= |Floats[1]|, so the high doubles order the values; fall
+ // through to the low doubles only on a tie.
+ if (Result == APFloat::cmpEqual)
+ return Floats[1].compare(RHS.Floats[1]);
+ return Result;
+}
+
+bool DoubleAPFloat::bitwiseIsEqual(const DoubleAPFloat &RHS) const {
+ return Floats[0].bitwiseIsEqual(RHS.Floats[0]) &&
+ Floats[1].bitwiseIsEqual(RHS.Floats[1]);
+}
+
+hash_code hash_value(const DoubleAPFloat &Arg) {
+ if (Arg.Floats)
+ return hash_combine(hash_value(Arg.Floats[0]), hash_value(Arg.Floats[1]));
+ return hash_combine(Arg.Semantics);
+}
+
+APInt DoubleAPFloat::bitcastToAPInt() const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ uint64_t Data[] = {
+ Floats[0].bitcastToAPInt().getRawData()[0],
+ Floats[1].bitcastToAPInt().getRawData()[0],
+ };
+ return APInt(128, 2, Data);
+}
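+// Layout note: the high double lands in the low 64 bits of the APInt. As an
+// illustration, 1.0 as ppc_fp128 bitcasts to low word 0x3ff0000000000000
+// (Hi == 1.0) and high word 0 (Lo == +0.0).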
+
+APFloat::opStatus DoubleAPFloat::convertFromString(StringRef S,
+ roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromString(S, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus DoubleAPFloat::next(bool nextDown) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ auto Ret = Tmp.next(nextDown);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input,
+ unsigned int Width, bool IsSigned,
+ roundingMode RM, bool *IsExact) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ .convertToInteger(Input, Width, IsSigned, RM, IsExact);
+}
+
+APFloat::opStatus DoubleAPFloat::convertFromAPInt(const APInt &Input,
+ bool IsSigned,
+ roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromAPInt(Input, IsSigned, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::convertFromSignExtendedInteger(const integerPart *Input,
+ unsigned int InputSize,
+ bool IsSigned, roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromSignExtendedInteger(Input, InputSize, IsSigned, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+APFloat::opStatus
+DoubleAPFloat::convertFromZeroExtendedInteger(const integerPart *Input,
+ unsigned int InputSize,
+ bool IsSigned, roundingMode RM) {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.convertFromZeroExtendedInteger(Input, InputSize, IsSigned, RM);
+ *this = DoubleAPFloat(semPPCDoubleDouble, Tmp.bitcastToAPInt());
+ return Ret;
+}
+
+unsigned int DoubleAPFloat::convertToHexString(char *DST,
+ unsigned int HexDigits,
+ bool UpperCase,
+ roundingMode RM) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ .convertToHexString(DST, HexDigits, UpperCase, RM);
+}
+
+bool DoubleAPFloat::isDenormal() const {
+ return getCategory() == fcNormal &&
+ (Floats[0].isDenormal() || Floats[1].isDenormal() ||
+ // The pair is canonical ("normal") only when (double)(Hi + Lo) == Hi.
+ Floats[0].compare(Floats[0] + Floats[1]) != cmpEqual);
+}
+
+bool DoubleAPFloat::isSmallest() const {
+ if (getCategory() != fcNormal)
+ return false;
+ DoubleAPFloat Tmp(*this);
+ Tmp.makeSmallest(this->isNegative());
+ return Tmp.compare(*this) == cmpEqual;
+}
+
+bool DoubleAPFloat::isLargest() const {
+ if (getCategory() != fcNormal)
+ return false;
+ DoubleAPFloat Tmp(*this);
+ Tmp.makeLargest(this->isNegative());
+ return Tmp.compare(*this) == cmpEqual;
+}
+
+bool DoubleAPFloat::isInteger() const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy);
+ (void)Tmp.add(Floats[0], rmNearestTiesToEven);
+ (void)Tmp.add(Floats[1], rmNearestTiesToEven);
+ return Tmp.isInteger();
+}
+
+void DoubleAPFloat::toString(SmallVectorImpl<char> &Str,
+ unsigned FormatPrecision,
+ unsigned FormatMaxPadding,
+ bool TruncateZero) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
+ .toString(Str, FormatPrecision, FormatMaxPadding, TruncateZero);
+}
+
+bool DoubleAPFloat::getExactInverse(APFloat *inv) const {
+ assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat Tmp(semPPCDoubleDoubleLegacy, bitcastToAPInt());
+ if (!inv)
+ return Tmp.getExactInverse(nullptr);
+ APFloat Inv(semPPCDoubleDoubleLegacy);
+ auto Ret = Tmp.getExactInverse(&Inv);
+ *inv = APFloat(semPPCDoubleDouble, Inv.bitcastToAPInt());
+ return Ret;
+}
+
+DoubleAPFloat scalbn(DoubleAPFloat Arg, int Exp, APFloat::roundingMode RM) {
+ assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ return DoubleAPFloat(semPPCDoubleDouble, scalbn(Arg.Floats[0], Exp, RM),
+ scalbn(Arg.Floats[1], Exp, RM));
+}
+
+DoubleAPFloat frexp(const DoubleAPFloat &Arg, int &Exp,
+ APFloat::roundingMode RM) {
+ assert(Arg.Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
+ APFloat First = frexp(Arg.Floats[0], Exp, RM);
+ APFloat Second = Arg.Floats[1];
+ if (Arg.getCategory() == APFloat::fcNormal)
+ Second = scalbn(Second, -Exp, RM);
+ return DoubleAPFloat(semPPCDoubleDouble, std::move(First), std::move(Second));
}
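// Illustration: for a normal value v = f * 2^Exp, frexp extracts f from the
// high double and rescales the low double by 2^-Exp, so that
// (First + Second) * 2^Exp still reconstructs v.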
} // End detail namespace
@@ -4126,10 +4422,16 @@ APFloat::Storage::Storage(IEEEFloat F, const fltSemantics &Semantics) {
}
APFloat::opStatus APFloat::convertFromString(StringRef Str, roundingMode RM) {
- return getIEEE().convertFromString(Str, RM);
+ APFLOAT_DISPATCH_ON_SEMANTICS(convertFromString(Str, RM));
}
-hash_code hash_value(const APFloat &Arg) { return hash_value(Arg.getIEEE()); }
+hash_code hash_value(const APFloat &Arg) {
+ if (APFloat::usesLayout<detail::IEEEFloat>(Arg.getSemantics()))
+ return hash_value(Arg.U.IEEE);
+ if (APFloat::usesLayout<detail::DoubleAPFloat>(Arg.getSemantics()))
+ return hash_value(Arg.U.Double);
+ llvm_unreachable("Unexpected semantics");
+}
APFloat::APFloat(const fltSemantics &Semantics, StringRef S)
: APFloat(Semantics) {
@@ -4146,10 +4448,8 @@ APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics,
if (usesLayout<IEEEFloat>(getSemantics()) &&
usesLayout<DoubleAPFloat>(ToSemantics)) {
assert(&ToSemantics == &semPPCDoubleDouble);
- auto Ret = U.IEEE.convert(semPPCDoubleDoubleImpl, RM, losesInfo);
- *this = APFloat(DoubleAPFloat(semPPCDoubleDouble, std::move(*this),
- APFloat(semIEEEdouble)),
- ToSemantics);
+ auto Ret = U.IEEE.convert(semPPCDoubleDoubleLegacy, RM, losesInfo);
+ *this = APFloat(ToSemantics, U.IEEE.bitcastToAPInt());
return Ret;
}
if (usesLayout<DoubleAPFloat>(getSemantics()) &&
@@ -4189,6 +4489,30 @@ void APFloat::print(raw_ostream &OS) const {
OS << Buffer << "\n";
}
-void APFloat::dump() const { print(dbgs()); }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void APFloat::dump() const { print(dbgs()); }
+#endif
+
+void APFloat::Profile(FoldingSetNodeID &NID) const {
+ NID.Add(bitcastToAPInt());
+}
+
+/* Same as convertToInteger(integerPart*, ...), except the result is returned in
+ an APSInt, whose initial bit-width and signed-ness are used to determine the
+ precision of the conversion.
+ */
+APFloat::opStatus APFloat::convertToInteger(APSInt &result,
+ roundingMode rounding_mode,
+ bool *isExact) const {
+ unsigned bitWidth = result.getBitWidth();
+ SmallVector<uint64_t, 4> parts(result.getNumWords());
+ opStatus status = convertToInteger(parts, bitWidth, result.isSigned(),
+ rounding_mode, isExact);
+ // Keep the original signedness.
+ result = APInt(bitWidth, parts);
+ return status;
+}
} // End llvm namespace
+
+#undef APFLOAT_DISPATCH_ON_SEMANTICS
diff --git a/contrib/llvm/lib/Support/APInt.cpp b/contrib/llvm/lib/Support/APInt.cpp
index fb8b451..c558ddd 100644
--- a/contrib/llvm/lib/Support/APInt.cpp
+++ b/contrib/llvm/lib/Support/APInt.cpp
@@ -63,7 +63,7 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
r = cdigit - 'a';
if (r <= radix - 11U)
return r + 10;
-
+
radix = 10;
}
@@ -76,91 +76,84 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
void APInt::initSlowCase(uint64_t val, bool isSigned) {
- pVal = getClearedMemory(getNumWords());
- pVal[0] = val;
+ U.pVal = getClearedMemory(getNumWords());
+ U.pVal[0] = val;
if (isSigned && int64_t(val) < 0)
for (unsigned i = 1; i < getNumWords(); ++i)
- pVal[i] = -1ULL;
+ U.pVal[i] = WORD_MAX;
+ clearUnusedBits();
}
void APInt::initSlowCase(const APInt& that) {
- pVal = getMemory(getNumWords());
- memcpy(pVal, that.pVal, getNumWords() * APINT_WORD_SIZE);
+ U.pVal = getMemory(getNumWords());
+ memcpy(U.pVal, that.U.pVal, getNumWords() * APINT_WORD_SIZE);
}
void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
assert(BitWidth && "Bitwidth too small");
assert(bigVal.data() && "Null pointer detected!");
if (isSingleWord())
- VAL = bigVal[0];
+ U.VAL = bigVal[0];
else {
// Get memory, cleared to 0
- pVal = getClearedMemory(getNumWords());
+ U.pVal = getClearedMemory(getNumWords());
// Calculate the number of words to copy
unsigned words = std::min<unsigned>(bigVal.size(), getNumWords());
// Copy the words from bigVal to pVal
- memcpy(pVal, bigVal.data(), words * APINT_WORD_SIZE);
+ memcpy(U.pVal, bigVal.data(), words * APINT_WORD_SIZE);
}
// Make sure unused high bits are cleared
clearUnusedBits();
}
APInt::APInt(unsigned numBits, ArrayRef<uint64_t> bigVal)
- : BitWidth(numBits), VAL(0) {
+ : BitWidth(numBits) {
initFromArray(bigVal);
}
APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
- : BitWidth(numBits), VAL(0) {
+ : BitWidth(numBits) {
initFromArray(makeArrayRef(bigVal, numWords));
}
APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
- : BitWidth(numbits), VAL(0) {
+ : BitWidth(numbits) {
assert(BitWidth && "Bitwidth too small");
fromString(numbits, Str, radix);
}
-APInt& APInt::AssignSlowCase(const APInt& RHS) {
- // Don't do anything for X = X
- if (this == &RHS)
- return *this;
-
- if (BitWidth == RHS.getBitWidth()) {
- // assume same bit-width single-word case is already handled
- assert(!isSingleWord());
- memcpy(pVal, RHS.pVal, getNumWords() * APINT_WORD_SIZE);
- return *this;
+void APInt::reallocate(unsigned NewBitWidth) {
+ // If the number of words is the same we can just change the width and stop.
+ if (getNumWords() == getNumWords(NewBitWidth)) {
+ BitWidth = NewBitWidth;
+ return;
}
- if (isSingleWord()) {
- // assume case where both are single words is already handled
- assert(!RHS.isSingleWord());
- VAL = 0;
- pVal = getMemory(RHS.getNumWords());
- memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- } else if (getNumWords() == RHS.getNumWords())
- memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- else if (RHS.isSingleWord()) {
- delete [] pVal;
- VAL = RHS.VAL;
- } else {
- delete [] pVal;
- pVal = getMemory(RHS.getNumWords());
- memcpy(pVal, RHS.pVal, RHS.getNumWords() * APINT_WORD_SIZE);
- }
- BitWidth = RHS.BitWidth;
- return clearUnusedBits();
+ // If we have an allocation, delete it.
+ if (!isSingleWord())
+ delete [] U.pVal;
+
+ // Update BitWidth.
+ BitWidth = NewBitWidth;
+
+ // If we are supposed to have an allocation, create it.
+ if (!isSingleWord())
+ U.pVal = getMemory(getNumWords());
}
-APInt& APInt::operator=(uint64_t RHS) {
+void APInt::AssignSlowCase(const APInt& RHS) {
+ // Don't do anything for X = X
+ if (this == &RHS)
+ return;
+
+ // Adjust the bit width and handle allocations as necessary.
+ reallocate(RHS.getBitWidth());
+
+ // Copy the data.
if (isSingleWord())
- VAL = RHS;
- else {
- pVal[0] = RHS;
- memset(pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
- }
- return clearUnusedBits();
+ U.VAL = RHS.U.VAL;
+ else
+ memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE);
}
/// This method 'profiles' an APInt for use with FoldingSet.
@@ -168,374 +161,132 @@ void APInt::Profile(FoldingSetNodeID& ID) const {
ID.AddInteger(BitWidth);
if (isSingleWord()) {
- ID.AddInteger(VAL);
+ ID.AddInteger(U.VAL);
return;
}
unsigned NumWords = getNumWords();
for (unsigned i = 0; i < NumWords; ++i)
- ID.AddInteger(pVal[i]);
-}
-
-/// This function adds a single "digit" integer, y, to the multiple
-/// "digit" integer array, x[]. x[] is modified to reflect the addition and
-/// 1 is returned if there is a carry out, otherwise 0 is returned.
-/// @returns the carry of the addition.
-static bool add_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
- for (unsigned i = 0; i < len; ++i) {
- dest[i] = y + x[i];
- if (dest[i] < y)
- y = 1; // Carry one to next digit.
- else {
- y = 0; // No need to carry so exit early
- break;
- }
- }
- return y;
+ ID.AddInteger(U.pVal[i]);
}
/// @brief Prefix increment operator. Increments the APInt by one.
APInt& APInt::operator++() {
if (isSingleWord())
- ++VAL;
+ ++U.VAL;
else
- add_1(pVal, pVal, getNumWords(), 1);
+ tcIncrement(U.pVal, getNumWords());
return clearUnusedBits();
}
-/// This function subtracts a single "digit" (64-bit word), y, from
-/// the multi-digit integer array, x[], propagating the borrowed 1 value until
-/// no further borrowing is needed or it runs out of "digits" in x. The result
-/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted.
-/// In other words, if y > x then this function returns 1, otherwise 0.
-/// @returns the borrow out of the subtraction
-static bool sub_1(uint64_t x[], unsigned len, uint64_t y) {
- for (unsigned i = 0; i < len; ++i) {
- uint64_t X = x[i];
- x[i] -= y;
- if (y > X)
- y = 1; // We have to "borrow 1" from next "digit"
- else {
- y = 0; // No need to borrow
- break; // Remaining digits are unchanged so exit early
- }
- }
- return bool(y);
-}
-
/// @brief Prefix decrement operator. Decrements the APInt by one.
APInt& APInt::operator--() {
if (isSingleWord())
- --VAL;
+ --U.VAL;
else
- sub_1(pVal, getNumWords(), 1);
+ tcDecrement(U.pVal, getNumWords());
return clearUnusedBits();
}
-/// This function adds the integer array x to the integer array Y and
-/// places the result in dest.
-/// @returns the carry out from the addition
-/// @brief General addition of 64-bit integer arrays
-static bool add(uint64_t *dest, const uint64_t *x, const uint64_t *y,
- unsigned len) {
- bool carry = false;
- for (unsigned i = 0; i< len; ++i) {
- uint64_t limit = std::min(x[i],y[i]); // must come first in case dest == x
- dest[i] = x[i] + y[i] + carry;
- carry = dest[i] < limit || (carry && dest[i] == limit);
- }
- return carry;
-}
-
/// Adds the RHS APint to this APInt.
/// @returns this, after addition of RHS.
/// @brief Addition assignment operator.
APInt& APInt::operator+=(const APInt& RHS) {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord())
- VAL += RHS.VAL;
- else {
- add(pVal, pVal, RHS.pVal, getNumWords());
- }
+ U.VAL += RHS.U.VAL;
+ else
+ tcAdd(U.pVal, RHS.U.pVal, 0, getNumWords());
return clearUnusedBits();
}
APInt& APInt::operator+=(uint64_t RHS) {
if (isSingleWord())
- VAL += RHS;
+ U.VAL += RHS;
else
- add_1(pVal, pVal, getNumWords(), RHS);
+ tcAddPart(U.pVal, RHS, getNumWords());
return clearUnusedBits();
}
-/// Subtracts the integer array y from the integer array x
-/// @returns returns the borrow out.
-/// @brief Generalized subtraction of 64-bit integer arrays.
-static bool sub(uint64_t *dest, const uint64_t *x, const uint64_t *y,
- unsigned len) {
- bool borrow = false;
- for (unsigned i = 0; i < len; ++i) {
- uint64_t x_tmp = borrow ? x[i] - 1 : x[i];
- borrow = y[i] > x_tmp || (borrow && x[i] == 0);
- dest[i] = x_tmp - y[i];
- }
- return borrow;
-}
-
/// Subtracts the RHS APInt from this APInt
/// @returns this, after subtraction
/// @brief Subtraction assignment operator.
APInt& APInt::operator-=(const APInt& RHS) {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord())
- VAL -= RHS.VAL;
+ U.VAL -= RHS.U.VAL;
else
- sub(pVal, pVal, RHS.pVal, getNumWords());
+ tcSubtract(U.pVal, RHS.U.pVal, 0, getNumWords());
return clearUnusedBits();
}
APInt& APInt::operator-=(uint64_t RHS) {
if (isSingleWord())
- VAL -= RHS;
+ U.VAL -= RHS;
else
- sub_1(pVal, getNumWords(), RHS);
+ tcSubtractPart(U.pVal, RHS, getNumWords());
return clearUnusedBits();
}
-/// Multiplies an integer array, x, by a uint64_t integer and places the result
-/// into dest.
-/// @returns the carry out of the multiplication.
-/// @brief Multiply a multi-digit APInt by a single digit (64-bit) integer.
-static uint64_t mul_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
- // Split y into high 32-bit part (hy) and low 32-bit part (ly)
- uint64_t ly = y & 0xffffffffULL, hy = y >> 32;
- uint64_t carry = 0;
-
- // For each digit of x.
- for (unsigned i = 0; i < len; ++i) {
- // Split x into high and low words
- uint64_t lx = x[i] & 0xffffffffULL;
- uint64_t hx = x[i] >> 32;
- // hasCarry - A flag to indicate if there is a carry to the next digit.
- // hasCarry == 0, no carry
- // hasCarry == 1, has carry
- // hasCarry == 2, no carry and the calculation result == 0.
- uint8_t hasCarry = 0;
- dest[i] = carry + lx * ly;
- // Determine if the add above introduces carry.
- hasCarry = (dest[i] < carry) ? 1 : 0;
- carry = hx * ly + (dest[i] >> 32) + (hasCarry ? (1ULL << 32) : 0);
- // The upper limit of carry can be (2^32 - 1)(2^32 - 1) +
- // (2^32 - 1) + 2^32 = 2^64.
- hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
-
- carry += (lx * hy) & 0xffffffffULL;
- dest[i] = (carry << 32) | (dest[i] & 0xffffffffULL);
- carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) +
- (carry >> 32) + ((lx * hy) >> 32) + hx * hy;
- }
- return carry;
-}
-
-/// Multiplies integer array x by integer array y and stores the result into
-/// the integer array dest. Note that dest's size must be >= xlen + ylen.
-/// @brief Generalized multiplicate of integer arrays.
-static void mul(uint64_t dest[], uint64_t x[], unsigned xlen, uint64_t y[],
- unsigned ylen) {
- dest[xlen] = mul_1(dest, x, xlen, y[0]);
- for (unsigned i = 1; i < ylen; ++i) {
- uint64_t ly = y[i] & 0xffffffffULL, hy = y[i] >> 32;
- uint64_t carry = 0, lx = 0, hx = 0;
- for (unsigned j = 0; j < xlen; ++j) {
- lx = x[j] & 0xffffffffULL;
- hx = x[j] >> 32;
- // hasCarry - A flag to indicate if has carry.
- // hasCarry == 0, no carry
- // hasCarry == 1, has carry
- // hasCarry == 2, no carry and the calculation result == 0.
- uint8_t hasCarry = 0;
- uint64_t resul = carry + lx * ly;
- hasCarry = (resul < carry) ? 1 : 0;
- carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + (resul >> 32);
- hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
-
- carry += (lx * hy) & 0xffffffffULL;
- resul = (carry << 32) | (resul & 0xffffffffULL);
- dest[i+j] += resul;
- carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0)+
- (carry >> 32) + (dest[i+j] < resul ? 1 : 0) +
- ((lx * hy) >> 32) + hx * hy;
- }
- dest[i+xlen] = carry;
- }
-}
-
-APInt& APInt::operator*=(const APInt& RHS) {
+APInt APInt::operator*(const APInt& RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
- if (isSingleWord()) {
- VAL *= RHS.VAL;
- clearUnusedBits();
- return *this;
- }
-
- // Get some bit facts about LHS and check for zero
- unsigned lhsBits = getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
- if (!lhsWords)
- // 0 * X ===> 0
- return *this;
+ if (isSingleWord())
+ return APInt(BitWidth, U.VAL * RHS.U.VAL);
- // Get some bit facts about RHS and check for zero
- unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
- if (!rhsWords) {
- // X * 0 ===> 0
- clearAllBits();
- return *this;
- }
+ APInt Result(getMemory(getNumWords()), getBitWidth());
- // Allocate space for the result
- unsigned destWords = rhsWords + lhsWords;
- uint64_t *dest = getMemory(destWords);
+ tcMultiply(Result.U.pVal, U.pVal, RHS.U.pVal, getNumWords());
- // Perform the long multiply
- mul(dest, pVal, lhsWords, RHS.pVal, rhsWords);
+ Result.clearUnusedBits();
+ return Result;
+}
- // Copy result back into *this
- clearAllBits();
- unsigned wordsToCopy = destWords >= getNumWords() ? getNumWords() : destWords;
- memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
- clearUnusedBits();
+void APInt::AndAssignSlowCase(const APInt& RHS) {
+ tcAnd(U.pVal, RHS.U.pVal, getNumWords());
+}
- // delete dest array and return
- delete[] dest;
- return *this;
+void APInt::OrAssignSlowCase(const APInt& RHS) {
+ tcOr(U.pVal, RHS.U.pVal, getNumWords());
}
-APInt& APInt::operator&=(const APInt& RHS) {
- assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
- if (isSingleWord()) {
- VAL &= RHS.VAL;
- return *this;
- }
- unsigned numWords = getNumWords();
- for (unsigned i = 0; i < numWords; ++i)
- pVal[i] &= RHS.pVal[i];
- return *this;
+void APInt::XorAssignSlowCase(const APInt& RHS) {
+ tcXor(U.pVal, RHS.U.pVal, getNumWords());
}
-APInt& APInt::operator|=(const APInt& RHS) {
+APInt& APInt::operator*=(const APInt& RHS) {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
- if (isSingleWord()) {
- VAL |= RHS.VAL;
- return *this;
- }
- unsigned numWords = getNumWords();
- for (unsigned i = 0; i < numWords; ++i)
- pVal[i] |= RHS.pVal[i];
+ *this = *this * RHS;
return *this;
}
-APInt& APInt::operator^=(const APInt& RHS) {
- assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+APInt& APInt::operator*=(uint64_t RHS) {
if (isSingleWord()) {
- VAL ^= RHS.VAL;
- this->clearUnusedBits();
- return *this;
+ U.VAL *= RHS;
+ } else {
+ unsigned NumWords = getNumWords();
+ tcMultiplyPart(U.pVal, U.pVal, RHS, 0, NumWords, NumWords, false);
}
- unsigned numWords = getNumWords();
- for (unsigned i = 0; i < numWords; ++i)
- pVal[i] ^= RHS.pVal[i];
return clearUnusedBits();
}
-APInt APInt::AndSlowCase(const APInt& RHS) const {
- unsigned numWords = getNumWords();
- uint64_t* val = getMemory(numWords);
- for (unsigned i = 0; i < numWords; ++i)
- val[i] = pVal[i] & RHS.pVal[i];
- return APInt(val, getBitWidth());
-}
-
-APInt APInt::OrSlowCase(const APInt& RHS) const {
- unsigned numWords = getNumWords();
- uint64_t *val = getMemory(numWords);
- for (unsigned i = 0; i < numWords; ++i)
- val[i] = pVal[i] | RHS.pVal[i];
- return APInt(val, getBitWidth());
-}
-
-APInt APInt::XorSlowCase(const APInt& RHS) const {
- unsigned numWords = getNumWords();
- uint64_t *val = getMemory(numWords);
- for (unsigned i = 0; i < numWords; ++i)
- val[i] = pVal[i] ^ RHS.pVal[i];
-
- APInt Result(val, getBitWidth());
- // 0^0==1 so clear the high bits in case they got set.
- Result.clearUnusedBits();
- return Result;
-}
-
-APInt APInt::operator*(const APInt& RHS) const {
- assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
- if (isSingleWord())
- return APInt(BitWidth, VAL * RHS.VAL);
- APInt Result(*this);
- Result *= RHS;
- return Result;
-}
-
bool APInt::EqualSlowCase(const APInt& RHS) const {
- return std::equal(pVal, pVal + getNumWords(), RHS.pVal);
+ return std::equal(U.pVal, U.pVal + getNumWords(), RHS.U.pVal);
}
-bool APInt::EqualSlowCase(uint64_t Val) const {
- unsigned n = getActiveBits();
- if (n <= APINT_BITS_PER_WORD)
- return pVal[0] == Val;
- else
- return false;
-}
-
-bool APInt::ult(const APInt& RHS) const {
+int APInt::compare(const APInt& RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison");
if (isSingleWord())
- return VAL < RHS.VAL;
-
- // Get active bit length of both operands
- unsigned n1 = getActiveBits();
- unsigned n2 = RHS.getActiveBits();
+ return U.VAL < RHS.U.VAL ? -1 : U.VAL > RHS.U.VAL;
- // If magnitude of LHS is less than RHS, return true.
- if (n1 < n2)
- return true;
-
- // If magnitude of RHS is greather than LHS, return false.
- if (n2 < n1)
- return false;
-
- // If they bot fit in a word, just compare the low order word
- if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
- return pVal[0] < RHS.pVal[0];
-
- // Otherwise, compare all words
- unsigned topWord = whichWord(std::max(n1,n2)-1);
- for (int i = topWord; i >= 0; --i) {
- if (pVal[i] > RHS.pVal[i])
- return false;
- if (pVal[i] < RHS.pVal[i])
- return true;
- }
- return false;
+ return tcCompare(U.pVal, RHS.U.pVal, getNumWords());
}
-bool APInt::slt(const APInt& RHS) const {
+int APInt::compareSigned(const APInt& RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be same for comparison");
if (isSingleWord()) {
- int64_t lhsSext = SignExtend64(VAL, BitWidth);
- int64_t rhsSext = SignExtend64(RHS.VAL, BitWidth);
- return lhsSext < rhsSext;
+ int64_t lhsSext = SignExtend64(U.VAL, BitWidth);
+ int64_t rhsSext = SignExtend64(RHS.U.VAL, BitWidth);
+ return lhsSext < rhsSext ? -1 : lhsSext > rhsSext;
}
bool lhsNeg = isNegative();
@@ -543,30 +294,45 @@ bool APInt::slt(const APInt& RHS) const {
// If the sign bits don't match, then (LHS < RHS) if LHS is negative
if (lhsNeg != rhsNeg)
- return lhsNeg;
+ return lhsNeg ? -1 : 1;
- // Otherwise we can just use an unsigned comparision, because even negative
+ // Otherwise we can just use an unsigned comparison, because even negative
// numbers compare correctly this way if both have the same signed-ness.
- return ult(RHS);
+ return tcCompare(U.pVal, RHS.U.pVal, getNumWords());
}
-void APInt::setBit(unsigned bitPosition) {
- if (isSingleWord())
- VAL |= maskBit(bitPosition);
- else
- pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
-}
+void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) {
+ unsigned loWord = whichWord(loBit);
+ unsigned hiWord = whichWord(hiBit);
-/// Set the given bit to 0 whose position is given as "bitPosition".
-/// @brief Set a given bit to 0.
-void APInt::clearBit(unsigned bitPosition) {
- if (isSingleWord())
- VAL &= ~maskBit(bitPosition);
- else
- pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition);
+ // Create an initial mask for the low word with zeros below loBit.
+ uint64_t loMask = WORD_MAX << whichBit(loBit);
+
+ // If hiBit is not aligned, we need a high mask.
+ unsigned hiShiftAmt = whichBit(hiBit);
+ if (hiShiftAmt != 0) {
+ // Create a high mask with zeros above hiBit.
+ uint64_t hiMask = WORD_MAX >> (APINT_BITS_PER_WORD - hiShiftAmt);
+ // If loWord and hiWord are equal, then we combine the masks. Otherwise,
+ // set the bits in hiWord.
+ if (hiWord == loWord)
+ loMask &= hiMask;
+ else
+ U.pVal[hiWord] |= hiMask;
+ }
+ // Apply the mask to the low word.
+ U.pVal[loWord] |= loMask;
+
+ // Fill any words between loWord and hiWord with all ones.
+ for (unsigned word = loWord + 1; word < hiWord; ++word)
+ U.pVal[word] = WORD_MAX;
}
/// @brief Toggle every bit to its opposite value.
+void APInt::flipAllBitsSlowCase() {
+ tcComplement(U.pVal, getNumWords());
+ clearUnusedBits();
+}
/// Toggle a given bit to its opposite value whose position is given
/// as "bitPosition".
@@ -577,9 +343,104 @@ void APInt::flipBit(unsigned bitPosition) {
else setBit(bitPosition);
}
+void APInt::insertBits(const APInt &subBits, unsigned bitPosition) {
+ unsigned subBitWidth = subBits.getBitWidth();
+ assert(0 < subBitWidth && (subBitWidth + bitPosition) <= BitWidth &&
+ "Illegal bit insertion");
+
+ // Insertion is a direct copy.
+ if (subBitWidth == BitWidth) {
+ *this = subBits;
+ return;
+ }
+
+ // Single word result can be done as a direct bitmask.
+ if (isSingleWord()) {
+ uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
+ U.VAL &= ~(mask << bitPosition);
+ U.VAL |= (subBits.U.VAL << bitPosition);
+ return;
+ }
+
+ unsigned loBit = whichBit(bitPosition);
+ unsigned loWord = whichWord(bitPosition);
+ unsigned hi1Word = whichWord(bitPosition + subBitWidth - 1);
+
+ // Insertion within a single word can be done as a direct bitmask.
+ if (loWord == hi1Word) {
+ uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
+ U.pVal[loWord] &= ~(mask << loBit);
+ U.pVal[loWord] |= (subBits.U.VAL << loBit);
+ return;
+ }
+
+ // Insert on word boundaries.
+ if (loBit == 0) {
+ // Direct copy whole words.
+ unsigned numWholeSubWords = subBitWidth / APINT_BITS_PER_WORD;
+ memcpy(U.pVal + loWord, subBits.getRawData(),
+ numWholeSubWords * APINT_WORD_SIZE);
+
+ // Mask+insert remaining bits.
+ unsigned remainingBits = subBitWidth % APINT_BITS_PER_WORD;
+ if (remainingBits != 0) {
+ uint64_t mask = WORD_MAX >> (APINT_BITS_PER_WORD - remainingBits);
+ U.pVal[hi1Word] &= ~mask;
+ U.pVal[hi1Word] |= subBits.getWord(subBitWidth - 1);
+ }
+ return;
+ }
+
+ // General case - set/clear individual bits in dst based on src.
+ // TODO - there is scope for optimization here, but at the moment this code
+ // path is barely used so prefer readability over performance.
+ for (unsigned i = 0; i != subBitWidth; ++i) {
+ if (subBits[i])
+ setBit(bitPosition + i);
+ else
+ clearBit(bitPosition + i);
+ }
+}
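+// Usage sketch with assumed values: inserting 8 bits at position 4 of a
+// 16-bit zero takes the single-word path above and gives
+//   APInt(16, 0).insertBits(APInt(8, 0xFF), 4) == APInt(16, 0x0FF0)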
+
+APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const {
+ assert(numBits > 0 && "Can't extract zero bits");
+ assert(bitPosition < BitWidth && (numBits + bitPosition) <= BitWidth &&
+ "Illegal bit extraction");
+
+ if (isSingleWord())
+ return APInt(numBits, U.VAL >> bitPosition);
+
+ unsigned loBit = whichBit(bitPosition);
+ unsigned loWord = whichWord(bitPosition);
+ unsigned hiWord = whichWord(bitPosition + numBits - 1);
+
+ // Single word result extracting bits from a single word source.
+ if (loWord == hiWord)
+ return APInt(numBits, U.pVal[loWord] >> loBit);
+
+ // Extracting bits that start on a source word boundary can be done
+ // as a fast memory copy.
+ if (loBit == 0)
+ return APInt(numBits, makeArrayRef(U.pVal + loWord, 1 + hiWord - loWord));
+
+ // General case - shift + copy source words directly into place.
+ APInt Result(numBits, 0);
+ unsigned NumSrcWords = getNumWords();
+ unsigned NumDstWords = Result.getNumWords();
+
+ for (unsigned word = 0; word < NumDstWords; ++word) {
+ uint64_t w0 = U.pVal[loWord + word];
+ uint64_t w1 =
+ (loWord + word + 1) < NumSrcWords ? U.pVal[loWord + word + 1] : 0;
+ Result.U.pVal[word] = (w0 >> loBit) | (w1 << (APINT_BITS_PER_WORD - loBit));
+ }
+
+ return Result.clearUnusedBits();
+}
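+// Usage sketch with assumed values: extracting 8 bits at position 4 shifts
+// and masks, so
+//   APInt(16, 0xABCD).extractBits(8, 4) == APInt(8, 0xBC)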
+
unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
assert(!str.empty() && "Invalid string length");
- assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
+ assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
radix == 36) &&
"Radix should be 2, 8, 10, 16, or 36!");
@@ -604,7 +465,7 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
return slen * 4 + isNegative;
// FIXME: base 36
-
+
// This is grossly inefficient but accurate. We could probably do something
// with a computation of roughly slen*64/20 and then adjust by the value of
// the first few digits. But, I'm not sure how accurate that could be.
@@ -613,7 +474,7 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
// be too large. This avoids the assertion in the constructor. This
// calculation doesn't work appropriately for the numbers 0-9, so just use 4
// bits in that case.
- unsigned sufficient
+ unsigned sufficient
= radix == 10? (slen == 1 ? 4 : slen * 64/18)
: (slen == 1 ? 7 : slen * 16/3);
@@ -632,9 +493,9 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
hash_code llvm::hash_value(const APInt &Arg) {
if (Arg.isSingleWord())
- return hash_combine(Arg.VAL);
+ return hash_combine(Arg.U.VAL);
- return hash_combine_range(Arg.pVal, Arg.pVal + Arg.getNumWords());
+ return hash_combine_range(Arg.U.pVal, Arg.U.pVal + Arg.getNumWords());
}
bool APInt::isSplat(unsigned SplatSizeInBits) const {
@@ -647,19 +508,31 @@ bool APInt::isSplat(unsigned SplatSizeInBits) const {
/// This function returns the high "numBits" bits of this APInt.
APInt APInt::getHiBits(unsigned numBits) const {
- return APIntOps::lshr(*this, BitWidth - numBits);
+ return this->lshr(BitWidth - numBits);
}
/// This function returns the low "numBits" bits of this APInt.
APInt APInt::getLoBits(unsigned numBits) const {
- return APIntOps::lshr(APIntOps::shl(*this, BitWidth - numBits),
- BitWidth - numBits);
+ APInt Result(getLowBitsSet(BitWidth, numBits));
+ Result &= *this;
+ return Result;
+}
+
+/// Return a value containing V broadcasted over NewLen bits.
+APInt APInt::getSplat(unsigned NewLen, const APInt &V) {
+ assert(NewLen >= V.getBitWidth() && "Can't splat to smaller bit width!");
+
+ APInt Val = V.zextOrSelf(NewLen);
+ for (unsigned I = V.getBitWidth(); I < NewLen; I <<= 1)
+ Val |= Val << I;
+
+ return Val;
}
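// Worked example (illustrative): getSplat(16, APInt(4, 0x5)) starts from
// 0x0005, becomes 0x0055 after I = 4 and 0x5555 after I = 8, doubling the
// replicated span on every iteration.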
unsigned APInt::countLeadingZerosSlowCase() const {
unsigned Count = 0;
for (int i = getNumWords()-1; i >= 0; --i) {
- integerPart V = pVal[i];
+ uint64_t V = U.pVal[i];
if (V == 0)
Count += APINT_BITS_PER_WORD;
else {
@@ -673,10 +546,7 @@ unsigned APInt::countLeadingZerosSlowCase() const {
return Count;
}
-unsigned APInt::countLeadingOnes() const {
- if (isSingleWord())
- return llvm::countLeadingOnes(VAL << (APINT_BITS_PER_WORD - BitWidth));
-
+unsigned APInt::countLeadingOnesSlowCase() const {
unsigned highWordBits = BitWidth % APINT_BITS_PER_WORD;
unsigned shift;
if (!highWordBits) {
@@ -686,13 +556,13 @@ unsigned APInt::countLeadingOnes() const {
shift = APINT_BITS_PER_WORD - highWordBits;
}
int i = getNumWords() - 1;
- unsigned Count = llvm::countLeadingOnes(pVal[i] << shift);
+ unsigned Count = llvm::countLeadingOnes(U.pVal[i] << shift);
if (Count == highWordBits) {
for (i--; i >= 0; --i) {
- if (pVal[i] == -1ULL)
+ if (U.pVal[i] == WORD_MAX)
Count += APINT_BITS_PER_WORD;
else {
- Count += llvm::countLeadingOnes(pVal[i]);
+ Count += llvm::countLeadingOnes(U.pVal[i]);
break;
}
}
@@ -700,69 +570,71 @@ unsigned APInt::countLeadingOnes() const {
return Count;
}
-unsigned APInt::countTrailingZeros() const {
- if (isSingleWord())
- return std::min(unsigned(llvm::countTrailingZeros(VAL)), BitWidth);
+unsigned APInt::countTrailingZerosSlowCase() const {
unsigned Count = 0;
unsigned i = 0;
- for (; i < getNumWords() && pVal[i] == 0; ++i)
+ for (; i < getNumWords() && U.pVal[i] == 0; ++i)
Count += APINT_BITS_PER_WORD;
if (i < getNumWords())
- Count += llvm::countTrailingZeros(pVal[i]);
+ Count += llvm::countTrailingZeros(U.pVal[i]);
return std::min(Count, BitWidth);
}
unsigned APInt::countTrailingOnesSlowCase() const {
unsigned Count = 0;
unsigned i = 0;
- for (; i < getNumWords() && pVal[i] == -1ULL; ++i)
+ for (; i < getNumWords() && U.pVal[i] == WORD_MAX; ++i)
Count += APINT_BITS_PER_WORD;
if (i < getNumWords())
- Count += llvm::countTrailingOnes(pVal[i]);
- return std::min(Count, BitWidth);
+ Count += llvm::countTrailingOnes(U.pVal[i]);
+ assert(Count <= BitWidth);
+ return Count;
}
unsigned APInt::countPopulationSlowCase() const {
unsigned Count = 0;
for (unsigned i = 0; i < getNumWords(); ++i)
- Count += llvm::countPopulation(pVal[i]);
+ Count += llvm::countPopulation(U.pVal[i]);
return Count;
}
-/// Perform a logical right-shift from Src to Dst, which must be equal or
-/// non-overlapping, of Words words, by Shift, which must be less than 64.
-static void lshrNear(uint64_t *Dst, uint64_t *Src, unsigned Words,
- unsigned Shift) {
- uint64_t Carry = 0;
- for (int I = Words - 1; I >= 0; --I) {
- uint64_t Tmp = Src[I];
- Dst[I] = (Tmp >> Shift) | Carry;
- Carry = Tmp << (64 - Shift);
- }
+bool APInt::intersectsSlowCase(const APInt &RHS) const {
+ for (unsigned i = 0, e = getNumWords(); i != e; ++i)
+ if ((U.pVal[i] & RHS.U.pVal[i]) != 0)
+ return true;
+
+ return false;
+}
+
+bool APInt::isSubsetOfSlowCase(const APInt &RHS) const {
+ for (unsigned i = 0, e = getNumWords(); i != e; ++i)
+ if ((U.pVal[i] & ~RHS.U.pVal[i]) != 0)
+ return false;
+
+ return true;
}
APInt APInt::byteSwap() const {
assert(BitWidth >= 16 && BitWidth % 16 == 0 && "Cannot byteswap!");
if (BitWidth == 16)
- return APInt(BitWidth, ByteSwap_16(uint16_t(VAL)));
+ return APInt(BitWidth, ByteSwap_16(uint16_t(U.VAL)));
if (BitWidth == 32)
- return APInt(BitWidth, ByteSwap_32(unsigned(VAL)));
+ return APInt(BitWidth, ByteSwap_32(unsigned(U.VAL)));
if (BitWidth == 48) {
- unsigned Tmp1 = unsigned(VAL >> 16);
+ unsigned Tmp1 = unsigned(U.VAL >> 16);
Tmp1 = ByteSwap_32(Tmp1);
- uint16_t Tmp2 = uint16_t(VAL);
+ uint16_t Tmp2 = uint16_t(U.VAL);
Tmp2 = ByteSwap_16(Tmp2);
return APInt(BitWidth, (uint64_t(Tmp2) << 32) | Tmp1);
}
if (BitWidth == 64)
- return APInt(BitWidth, ByteSwap_64(VAL));
+ return APInt(BitWidth, ByteSwap_64(U.VAL));
APInt Result(getNumWords() * APINT_BITS_PER_WORD, 0);
for (unsigned I = 0, N = getNumWords(); I != N; ++I)
- Result.pVal[I] = ByteSwap_64(pVal[N - I - 1]);
+ Result.U.pVal[I] = ByteSwap_64(U.pVal[N - I - 1]);
if (Result.BitWidth != BitWidth) {
- lshrNear(Result.pVal, Result.pVal, getNumWords(),
- Result.BitWidth - BitWidth);
+ Result.lshrInPlace(Result.BitWidth - BitWidth);
Result.BitWidth = BitWidth;
}
return Result;
@@ -771,26 +643,24 @@ APInt APInt::byteSwap() const {
APInt APInt::reverseBits() const {
switch (BitWidth) {
case 64:
- return APInt(BitWidth, llvm::reverseBits<uint64_t>(VAL));
+ return APInt(BitWidth, llvm::reverseBits<uint64_t>(U.VAL));
case 32:
- return APInt(BitWidth, llvm::reverseBits<uint32_t>(VAL));
+ return APInt(BitWidth, llvm::reverseBits<uint32_t>(U.VAL));
case 16:
- return APInt(BitWidth, llvm::reverseBits<uint16_t>(VAL));
+ return APInt(BitWidth, llvm::reverseBits<uint16_t>(U.VAL));
case 8:
- return APInt(BitWidth, llvm::reverseBits<uint8_t>(VAL));
+ return APInt(BitWidth, llvm::reverseBits<uint8_t>(U.VAL));
default:
break;
}
APInt Val(*this);
- APInt Reversed(*this);
- int S = BitWidth - 1;
-
- const APInt One(BitWidth, 1);
+ APInt Reversed(BitWidth, 0);
+ unsigned S = BitWidth;
- for ((Val = Val.lshr(1)); Val != 0; (Val = Val.lshr(1))) {
+ for (; Val != 0; Val.lshrInPlace(1)) {
Reversed <<= 1;
- Reversed |= (Val & One);
+ Reversed |= Val[0];
--S;
}
@@ -798,14 +668,46 @@ APInt APInt::reverseBits() const {
return Reversed;
}
-APInt llvm::APIntOps::GreatestCommonDivisor(const APInt& API1,
- const APInt& API2) {
- APInt A = API1, B = API2;
- while (!!B) {
- APInt T = B;
- B = APIntOps::urem(A, B);
- A = T;
+APInt llvm::APIntOps::GreatestCommonDivisor(APInt A, APInt B) {
+ // Fast-path a common case.
+ if (A == B) return A;
+
+ // Corner cases: if either operand is zero, the other is the gcd.
+ if (!A) return B;
+ if (!B) return A;
+
+ // Count common powers of 2 and remove all other powers of 2.
+ unsigned Pow2;
+ {
+ unsigned Pow2_A = A.countTrailingZeros();
+ unsigned Pow2_B = B.countTrailingZeros();
+ if (Pow2_A > Pow2_B) {
+ A.lshrInPlace(Pow2_A - Pow2_B);
+ Pow2 = Pow2_B;
+ } else if (Pow2_B > Pow2_A) {
+ B.lshrInPlace(Pow2_B - Pow2_A);
+ Pow2 = Pow2_A;
+ } else {
+ Pow2 = Pow2_A;
+ }
+ }
+
+ // Both operands are odd multiples of 2^Pow2:
+ //
+ // gcd(a, b) = gcd(|a - b| / 2^i, min(a, b))
+ //
+ // This is a modified version of Stein's algorithm, taking advantage of
+ // efficient countTrailingZeros().
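+ // Worked example (illustrative): gcd(12, 18). Pow2_A = 2 and Pow2_B = 1,
+ // so A becomes 6 with Pow2 = 1; then B = 18 - 6 = 12 is shifted down to 6,
+ // A == B, and the result is 6.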
+ while (A != B) {
+ if (A.ugt(B)) {
+ A -= B;
+ A.lshrInPlace(A.countTrailingZeros() - Pow2);
+ } else {
+ B -= A;
+ B.lshrInPlace(B.countTrailingZeros() - Pow2);
+ }
}
+
return A;
}
@@ -841,7 +743,7 @@ APInt llvm::APIntOps::RoundDoubleToAPInt(double Double, unsigned width) {
// Otherwise, we have to shift the mantissa bits up to the right location
APInt Tmp(width, mantissa);
- Tmp = Tmp.shl((unsigned)exp - 52);
+ Tmp <<= (unsigned)exp - 52;
return isNeg ? -Tmp : Tmp;
}
@@ -892,13 +794,13 @@ double APInt::roundToDouble(bool isSigned) const {
uint64_t mantissa;
unsigned hiWord = whichWord(n-1);
if (hiWord == 0) {
- mantissa = Tmp.pVal[0];
+ mantissa = Tmp.U.pVal[0];
if (n > 52)
mantissa >>= n - 52; // shift down, we want the top 52 bits.
} else {
assert(hiWord > 0 && "huh?");
- uint64_t hibits = Tmp.pVal[hiWord] << (52 - n % APINT_BITS_PER_WORD);
- uint64_t lobits = Tmp.pVal[hiWord-1] >> (11 + n % APINT_BITS_PER_WORD);
+ uint64_t hibits = Tmp.U.pVal[hiWord] << (52 - n % APINT_BITS_PER_WORD);
+ uint64_t lobits = Tmp.U.pVal[hiWord-1] >> (11 + n % APINT_BITS_PER_WORD);
mantissa = hibits | lobits;
}
@@ -925,54 +827,37 @@ APInt APInt::trunc(unsigned width) const {
// Copy full words.
unsigned i;
for (i = 0; i != width / APINT_BITS_PER_WORD; i++)
- Result.pVal[i] = pVal[i];
+ Result.U.pVal[i] = U.pVal[i];
// Truncate and copy any partial word.
unsigned bits = (0 - width) % APINT_BITS_PER_WORD;
if (bits != 0)
- Result.pVal[i] = pVal[i] << bits >> bits;
+ Result.U.pVal[i] = U.pVal[i] << bits >> bits;
return Result;
}
// Sign extend to a new width.
-APInt APInt::sext(unsigned width) const {
- assert(width > BitWidth && "Invalid APInt SignExtend request");
-
- if (width <= APINT_BITS_PER_WORD) {
- uint64_t val = VAL << (APINT_BITS_PER_WORD - BitWidth);
- val = (int64_t)val >> (width - BitWidth);
- return APInt(width, val >> (APINT_BITS_PER_WORD - width));
- }
+APInt APInt::sext(unsigned Width) const {
+ assert(Width > BitWidth && "Invalid APInt SignExtend request");
- APInt Result(getMemory(getNumWords(width)), width);
+ if (Width <= APINT_BITS_PER_WORD)
+ return APInt(Width, SignExtend64(U.VAL, BitWidth));
- // Copy full words.
- unsigned i;
- uint64_t word = 0;
- for (i = 0; i != BitWidth / APINT_BITS_PER_WORD; i++) {
- word = getRawData()[i];
- Result.pVal[i] = word;
- }
+ APInt Result(getMemory(getNumWords(Width)), Width);
- // Read and sign-extend any partial word.
- unsigned bits = (0 - BitWidth) % APINT_BITS_PER_WORD;
- if (bits != 0)
- word = (int64_t)getRawData()[i] << bits >> bits;
- else
- word = (int64_t)word >> (APINT_BITS_PER_WORD - 1);
-
- // Write remaining full words.
- for (; i != width / APINT_BITS_PER_WORD; i++) {
- Result.pVal[i] = word;
- word = (int64_t)word >> (APINT_BITS_PER_WORD - 1);
- }
+ // Copy words.
+ std::memcpy(Result.U.pVal, getRawData(), getNumWords() * APINT_WORD_SIZE);
- // Write any partial word.
- bits = (0 - width) % APINT_BITS_PER_WORD;
- if (bits != 0)
- Result.pVal[i] = word << bits >> bits;
+ // Sign extend the last word since there may be unused bits in the input.
+ Result.U.pVal[getNumWords() - 1] =
+ SignExtend64(Result.U.pVal[getNumWords() - 1],
+ ((BitWidth - 1) % APINT_BITS_PER_WORD) + 1);
+ // Fill with sign bits.
+ std::memset(Result.U.pVal + getNumWords(), isNegative() ? -1 : 0,
+ (Result.getNumWords() - getNumWords()) * APINT_WORD_SIZE);
+ Result.clearUnusedBits();
return Result;
}
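// Illustration: APInt(4, 0b1010) encodes -6, so sext(8) yields 0b11111010;
// the single-word path obtains this directly from SignExtend64(0xA, 4).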
@@ -981,17 +866,16 @@ APInt APInt::zext(unsigned width) const {
assert(width > BitWidth && "Invalid APInt ZeroExtend request");
if (width <= APINT_BITS_PER_WORD)
- return APInt(width, VAL);
+ return APInt(width, U.VAL);
APInt Result(getMemory(getNumWords(width)), width);
// Copy words.
- unsigned i;
- for (i = 0; i != getNumWords(); i++)
- Result.pVal[i] = getRawData()[i];
+ std::memcpy(Result.U.pVal, getRawData(), getNumWords() * APINT_WORD_SIZE);
// Zero remaining words.
- memset(&Result.pVal[i], 0, (Result.getNumWords() - i) * APINT_WORD_SIZE);
+ std::memset(Result.U.pVal + getNumWords(), 0,
+ (Result.getNumWords() - getNumWords()) * APINT_WORD_SIZE);
return Result;
}
@@ -1026,226 +910,93 @@ APInt APInt::sextOrSelf(unsigned width) const {
/// Arithmetic right-shift this APInt by shiftAmt.
/// @brief Arithmetic right-shift function.
-APInt APInt::ashr(const APInt &shiftAmt) const {
- return ashr((unsigned)shiftAmt.getLimitedValue(BitWidth));
+void APInt::ashrInPlace(const APInt &shiftAmt) {
+ ashrInPlace((unsigned)shiftAmt.getLimitedValue(BitWidth));
}
/// Arithmetic right-shift this APInt by shiftAmt.
/// @brief Arithmetic right-shift function.
-APInt APInt::ashr(unsigned shiftAmt) const {
- assert(shiftAmt <= BitWidth && "Invalid shift amount");
- // Handle a degenerate case
- if (shiftAmt == 0)
- return *this;
+void APInt::ashrSlowCase(unsigned ShiftAmt) {
+ // Don't bother performing a no-op shift.
+ if (!ShiftAmt)
+ return;
- // Handle single word shifts with built-in ashr
- if (isSingleWord()) {
- if (shiftAmt == BitWidth)
- return APInt(BitWidth, 0); // undefined
- return APInt(BitWidth, SignExtend64(VAL, BitWidth) >> shiftAmt);
- }
+ // Save the original sign bit for later.
+ bool Negative = isNegative();
- // If all the bits were shifted out, the result is, technically, undefined.
- // We return -1 if it was negative, 0 otherwise. We check this early to avoid
- // issues in the algorithm below.
- if (shiftAmt == BitWidth) {
- if (isNegative())
- return APInt(BitWidth, -1ULL, true);
- else
- return APInt(BitWidth, 0);
- }
-
- // Create some space for the result.
- uint64_t * val = new uint64_t[getNumWords()];
-
- // Compute some values needed by the following shift algorithms
- unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word
- unsigned offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift
- unsigned breakWord = getNumWords() - 1 - offset; // last word affected
- unsigned bitsInWord = whichBit(BitWidth); // how many bits in last word?
- if (bitsInWord == 0)
- bitsInWord = APINT_BITS_PER_WORD;
-
- // If we are shifting whole words, just move whole words
- if (wordShift == 0) {
- // Move the words containing significant bits
- for (unsigned i = 0; i <= breakWord; ++i)
- val[i] = pVal[i+offset]; // move whole word
-
- // Adjust the top significant word for sign bit fill, if negative
- if (isNegative())
- if (bitsInWord < APINT_BITS_PER_WORD)
- val[breakWord] |= ~0ULL << bitsInWord; // set high bits
- } else {
- // Shift the low order words
- for (unsigned i = 0; i < breakWord; ++i) {
- // This combines the shifted corresponding word with the low bits from
- // the next word (shifted into this word's high bits).
- val[i] = (pVal[i+offset] >> wordShift) |
- (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift));
- }
+ // WordShift is the inter-part shift; BitShift is the intra-part shift.
+ unsigned WordShift = ShiftAmt / APINT_BITS_PER_WORD;
+ unsigned BitShift = ShiftAmt % APINT_BITS_PER_WORD;
+
+ unsigned WordsToMove = getNumWords() - WordShift;
+ if (WordsToMove != 0) {
+ // Sign extend the last word to fill in the unused bits.
+ U.pVal[getNumWords() - 1] = SignExtend64(
+ U.pVal[getNumWords() - 1], ((BitWidth - 1) % APINT_BITS_PER_WORD) + 1);
- // Shift the break word. In this case there are no bits from the next word
- // to include in this word.
- val[breakWord] = pVal[breakWord+offset] >> wordShift;
-
- // Deal with sign extension in the break word, and possibly the word before
- // it.
- if (isNegative()) {
- if (wordShift > bitsInWord) {
- if (breakWord > 0)
- val[breakWord-1] |=
- ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord));
- val[breakWord] |= ~0ULL;
- } else
- val[breakWord] |= (~0ULL << (bitsInWord - wordShift));
+ // Fastpath for moving by whole words.
+ if (BitShift == 0) {
+ std::memmove(U.pVal, U.pVal + WordShift, WordsToMove * APINT_WORD_SIZE);
+ } else {
+ // Move the words containing significant bits.
+ for (unsigned i = 0; i != WordsToMove - 1; ++i)
+ U.pVal[i] = (U.pVal[i + WordShift] >> BitShift) |
+ (U.pVal[i + WordShift + 1] << (APINT_BITS_PER_WORD - BitShift));
+
+ // Handle the last word which has no high bits to copy.
+ U.pVal[WordsToMove - 1] = U.pVal[WordShift + WordsToMove - 1] >> BitShift;
+ // Sign extend one more time.
+ U.pVal[WordsToMove - 1] =
+ SignExtend64(U.pVal[WordsToMove - 1], APINT_BITS_PER_WORD - BitShift);
}
}
- // Remaining words are 0 or -1, just assign them.
- uint64_t fillValue = (isNegative() ? -1ULL : 0);
- for (unsigned i = breakWord+1; i < getNumWords(); ++i)
- val[i] = fillValue;
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
+ // Fill in the remainder based on the original sign.
+ std::memset(U.pVal + WordsToMove, Negative ? -1 : 0,
+ WordShift * APINT_WORD_SIZE);
+ clearUnusedBits();
}
/// Logical right-shift this APInt by shiftAmt.
/// @brief Logical right-shift function.
-APInt APInt::lshr(const APInt &shiftAmt) const {
- return lshr((unsigned)shiftAmt.getLimitedValue(BitWidth));
+void APInt::lshrInPlace(const APInt &shiftAmt) {
+ lshrInPlace((unsigned)shiftAmt.getLimitedValue(BitWidth));
}
/// Logical right-shift this APInt by shiftAmt.
/// @brief Logical right-shift function.
-APInt APInt::lshr(unsigned shiftAmt) const {
- if (isSingleWord()) {
- if (shiftAmt >= BitWidth)
- return APInt(BitWidth, 0);
- else
- return APInt(BitWidth, this->VAL >> shiftAmt);
- }
-
- // If all the bits were shifted out, the result is 0. This avoids issues
- // with shifting by the size of the integer type, which produces undefined
- // results. We define these "undefined results" to always be 0.
- if (shiftAmt >= BitWidth)
- return APInt(BitWidth, 0);
-
- // If none of the bits are shifted out, the result is *this. This avoids
- // issues with shifting by the size of the integer type, which produces
- // undefined results in the code below. This is also an optimization.
- if (shiftAmt == 0)
- return *this;
-
- // Create some space for the result.
- uint64_t * val = new uint64_t[getNumWords()];
-
- // If we are shifting less than a word, compute the shift with a simple carry
- if (shiftAmt < APINT_BITS_PER_WORD) {
- lshrNear(val, pVal, getNumWords(), shiftAmt);
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
- }
-
- // Compute some values needed by the remaining shift algorithms
- unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD;
- unsigned offset = shiftAmt / APINT_BITS_PER_WORD;
-
- // If we are shifting whole words, just move whole words
- if (wordShift == 0) {
- for (unsigned i = 0; i < getNumWords() - offset; ++i)
- val[i] = pVal[i+offset];
- for (unsigned i = getNumWords()-offset; i < getNumWords(); i++)
- val[i] = 0;
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
- }
-
- // Shift the low order words
- unsigned breakWord = getNumWords() - offset -1;
- for (unsigned i = 0; i < breakWord; ++i)
- val[i] = (pVal[i+offset] >> wordShift) |
- (pVal[i+offset+1] << (APINT_BITS_PER_WORD - wordShift));
- // Shift the break word.
- val[breakWord] = pVal[breakWord+offset] >> wordShift;
-
- // Remaining words are 0
- for (unsigned i = breakWord+1; i < getNumWords(); ++i)
- val[i] = 0;
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
+void APInt::lshrSlowCase(unsigned ShiftAmt) {
+ tcShiftRight(U.pVal, getNumWords(), ShiftAmt);
}
/// Left-shift this APInt by shiftAmt.
/// @brief Left-shift function.
-APInt APInt::shl(const APInt &shiftAmt) const {
+APInt &APInt::operator<<=(const APInt &shiftAmt) {
// It's undefined behavior in C to shift by BitWidth or greater.
- return shl((unsigned)shiftAmt.getLimitedValue(BitWidth));
+ *this <<= (unsigned)shiftAmt.getLimitedValue(BitWidth);
+ return *this;
}
-APInt APInt::shlSlowCase(unsigned shiftAmt) const {
- // If all the bits were shifted out, the result is 0. This avoids issues
- // with shifting by the size of the integer type, which produces undefined
- // results. We define these "undefined results" to always be 0.
- if (shiftAmt == BitWidth)
- return APInt(BitWidth, 0);
-
- // If none of the bits are shifted out, the result is *this. This avoids a
- // lshr by the words size in the loop below which can produce incorrect
- // results. It also avoids the expensive computation below for a common case.
- if (shiftAmt == 0)
- return *this;
-
- // Create some space for the result.
- uint64_t * val = new uint64_t[getNumWords()];
+void APInt::shlSlowCase(unsigned ShiftAmt) {
+ tcShiftLeft(U.pVal, getNumWords(), ShiftAmt);
+ clearUnusedBits();
+}
- // If we are shifting less than a word, do it the easy way
- if (shiftAmt < APINT_BITS_PER_WORD) {
- uint64_t carry = 0;
- for (unsigned i = 0; i < getNumWords(); i++) {
- val[i] = pVal[i] << shiftAmt | carry;
- carry = pVal[i] >> (APINT_BITS_PER_WORD - shiftAmt);
- }
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
- }
-
- // Compute some values needed by the remaining shift algorithms
- unsigned wordShift = shiftAmt % APINT_BITS_PER_WORD;
- unsigned offset = shiftAmt / APINT_BITS_PER_WORD;
-
- // If we are shifting whole words, just move whole words
- if (wordShift == 0) {
- for (unsigned i = 0; i < offset; i++)
- val[i] = 0;
- for (unsigned i = offset; i < getNumWords(); i++)
- val[i] = pVal[i-offset];
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
- }
-
- // Copy whole words from this to Result.
- unsigned i = getNumWords() - 1;
- for (; i > offset; --i)
- val[i] = pVal[i-offset] << wordShift |
- pVal[i-offset-1] >> (APINT_BITS_PER_WORD - wordShift);
- val[offset] = pVal[0] << wordShift;
- for (i = 0; i < offset; ++i)
- val[i] = 0;
- APInt Result(val, BitWidth);
- Result.clearUnusedBits();
- return Result;
+// Calculate the rotate amount modulo the bit width.
+static unsigned rotateModulo(unsigned BitWidth, const APInt &rotateAmt) {
+ unsigned rotBitWidth = rotateAmt.getBitWidth();
+ APInt rot = rotateAmt;
+ if (rotBitWidth < BitWidth) {
+ // Extend the rotate APInt, so that the urem doesn't divide by 0.
+ // e.g. APInt(1, 32) would give APInt(1, 0).
+ rot = rotateAmt.zext(BitWidth);
+ }
+ rot = rot.urem(APInt(rot.getBitWidth(), BitWidth));
+ return rot.getLimitedValue(BitWidth);
}
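For illustration only (not part of the patch), the new modulo semantics with assumed values:

  APInt X(32, 0x80000001);
  APInt Big(64, 40);        // rotate amount wider than the value
  APInt R = X.rotl(Big);    // 40 % 32 == 8, so R == 0x00000180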
APInt APInt::rotl(const APInt &rotateAmt) const {
- return rotl((unsigned)rotateAmt.getLimitedValue(BitWidth));
+ return rotl(rotateModulo(BitWidth, rotateAmt));
}
APInt APInt::rotl(unsigned rotateAmt) const {
@@ -1256,7 +1007,7 @@ APInt APInt::rotl(unsigned rotateAmt) const {
}
APInt APInt::rotr(const APInt &rotateAmt) const {
- return rotr((unsigned)rotateAmt.getLimitedValue(BitWidth));
+ return rotr(rotateModulo(BitWidth, rotateAmt));
}
APInt APInt::rotr(unsigned rotateAmt) const {
@@ -1290,7 +1041,7 @@ APInt APInt::sqrt() const {
/* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
/* 31 */ 6
};
- return APInt(BitWidth, results[ (isSingleWord() ? VAL : pVal[0]) ]);
+ return APInt(BitWidth, results[ (isSingleWord() ? U.VAL : U.pVal[0]) ]);
}
// If the magnitude of the value fits in less than 52 bits (the precision of
@@ -1299,7 +1050,8 @@ APInt APInt::sqrt() const {
// This should be faster than the algorithm below.
if (magnitude < 52) {
return APInt(BitWidth,
- uint64_t(::round(::sqrt(double(isSingleWord()?VAL:pVal[0])))));
+ uint64_t(::round(::sqrt(double(isSingleWord() ? U.VAL
+ : U.pVal[0])))));
}
// Okay, all the short cuts are exhausted. We must compute it. The following
@@ -1384,10 +1136,13 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const {
return APInt(BitWidth, 0);
// The next-to-last t is the multiplicative inverse. However, we are
- // interested in a positive inverse. Calcuate a positive one from a negative
+ // interested in a positive inverse. Calculate a positive one from a negative
// one if necessary. A simple addition of the modulo suffices because
// abs(t[i]) is known to be less than *this/2 (see the link above).
- return t[i].isNegative() ? t[i] + modulo : t[i];
+ if (t[i].isNegative())
+ t[i] += modulo;
+
+ return std::move(t[i]);
}
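A quick usage sketch with assumed values: since 3 * 5 == 15 and 15 % 7 == 1, the inverse of 3 modulo 7 is 5.

  APInt Three(8, 3), Seven(8, 7);
  APInt Inv = Three.multiplicativeInverse(Seven);  // Inv == 5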
/// Calculate the magic numbers required to implement a signed integer division
@@ -1486,7 +1241,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const {
/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
/// variables here have the same names as in the algorithm. Comments explain
/// the algorithm and any deviation from it.
-static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
+static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r,
unsigned m, unsigned n) {
assert(u && "Must provide dividend");
assert(v && "Must provide divisor");
@@ -1512,16 +1267,16 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// overflow. Note that this can require an extra word in u so that u must
// be of length m+n+1.
unsigned shift = countLeadingZeros(v[n-1]);
- unsigned v_carry = 0;
- unsigned u_carry = 0;
+ uint32_t v_carry = 0;
+ uint32_t u_carry = 0;
if (shift) {
for (unsigned i = 0; i < m+n; ++i) {
- unsigned u_tmp = u[i] >> (32 - shift);
+ uint32_t u_tmp = u[i] >> (32 - shift);
u[i] = (u[i] << shift) | u_carry;
u_carry = u_tmp;
}
for (unsigned i = 0; i < n; ++i) {
- unsigned v_tmp = v[i] >> (32 - shift);
+ uint32_t v_tmp = v[i] >> (32 - shift);
v[i] = (v[i] << shift) | v_carry;
v_carry = v_tmp;
}
@@ -1542,11 +1297,11 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
// Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
// Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
- // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test
+ // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test
// on v[n-2] determines at high speed most of the cases in which the trial
// value qp is one too large, and it eliminates all cases where qp is two
// too large.
- uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]);
+ uint64_t dividend = Make_64(u[j+n], u[j+n-1]);
DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n');
uint64_t qp = dividend / v[n-1];
uint64_t rp = dividend % v[n-1];
@@ -1569,14 +1324,14 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
int64_t borrow = 0;
for (unsigned i = 0; i < n; ++i) {
uint64_t p = uint64_t(qp) * uint64_t(v[i]);
- int64_t subres = int64_t(u[j+i]) - borrow - (unsigned)p;
- u[j+i] = (unsigned)subres;
- borrow = (p >> 32) - (subres >> 32);
+ int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p);
+ u[j+i] = Lo_32(subres);
+ borrow = Hi_32(p) - Hi_32(subres);
DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j+i]
<< ", borrow = " << borrow << '\n');
}
bool isNeg = u[j+n] < borrow;
- u[j+n] -= (unsigned)borrow;
+ u[j+n] -= Lo_32(borrow);
DEBUG(dbgs() << "KnuthDiv: after subtraction:");
DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]);
@@ -1584,7 +1339,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was
// negative, go to step D6; otherwise go on to step D7.
- q[j] = (unsigned)qp;
+ q[j] = Lo_32(qp);
if (isNeg) {
// D6. [Add back]. The probability that this step is necessary is very
// small, on the order of only 2/b. Make sure that test data accounts for
@@ -1595,7 +1350,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
// since it cancels with the borrow that occurred in D4.
bool carry = false;
for (unsigned i = 0; i < n; i++) {
- unsigned limit = std::min(u[j+i],v[i]);
+ uint32_t limit = std::min(u[j+i],v[i]);
u[j+i] += v[i] + carry;
carry = u[j+i] < limit || (carry && u[j+i] == limit);
}
@@ -1618,9 +1373,9 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
if (r) {
// The value d is expressed by the "shift" value above since we avoided
// multiplication by d by using a shift left. So, all we have to do is
- // shift right here. In order to mak
+ // shift right here.
if (shift) {
- unsigned carry = 0;
+ uint32_t carry = 0;
DEBUG(dbgs() << "KnuthDiv: remainder:");
for (int i = n-1; i >= 0; i--) {
r[i] = (u[i] >> shift) | carry;
@@ -1638,8 +1393,8 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
DEBUG(dbgs() << '\n');
}
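For reference, the MathExtras.h helpers the rewritten KnuthDiv leans on behave as sketched here (paraphrased from their uses in this patch; treat the exact signatures as approximate):

  uint32_t Lo_32(uint64_t V) { return static_cast<uint32_t>(V); }
  uint32_t Hi_32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
  uint64_t Make_64(uint32_t Hi, uint32_t Lo) {
    return (static_cast<uint64_t>(Hi) << 32) | Lo;
  }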
-void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
- unsigned rhsWords, APInt *Quotient, APInt *Remainder) {
+void APInt::divide(const WordType *LHS, unsigned lhsWords, const WordType *RHS,
+ unsigned rhsWords, WordType *Quotient, WordType *Remainder) {
assert(lhsWords >= rhsWords && "Fractional result");
// First, compose the values into an array of 32-bit words instead of
@@ -1649,17 +1404,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// can't use 64-bit operands here because we don't have native results of
// 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
// work on large-endian machines.
- uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT);
unsigned n = rhsWords * 2;
unsigned m = (lhsWords * 2) - n;
// Allocate space for the temporary values we need either on the stack, if
// it will fit, or on the heap if it won't.
- unsigned SPACE[128];
- unsigned *U = nullptr;
- unsigned *V = nullptr;
- unsigned *Q = nullptr;
- unsigned *R = nullptr;
+ uint32_t SPACE[128];
+ uint32_t *U = nullptr;
+ uint32_t *V = nullptr;
+ uint32_t *Q = nullptr;
+ uint32_t *R = nullptr;
if ((Remainder?4:3)*n+2*m+1 <= 128) {
U = &SPACE[0];
V = &SPACE[m+n+1];
@@ -1667,34 +1421,34 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
if (Remainder)
R = &SPACE[(m+n+1) + n + (m+n)];
} else {
- U = new unsigned[m + n + 1];
- V = new unsigned[n];
- Q = new unsigned[m+n];
+ U = new uint32_t[m + n + 1];
+ V = new uint32_t[n];
+ Q = new uint32_t[m+n];
if (Remainder)
- R = new unsigned[n];
+ R = new uint32_t[n];
}
// Initialize the dividend
- memset(U, 0, (m+n+1)*sizeof(unsigned));
+ memset(U, 0, (m+n+1)*sizeof(uint32_t));
for (unsigned i = 0; i < lhsWords; ++i) {
- uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.VAL : LHS.pVal[i]);
- U[i * 2] = (unsigned)(tmp & mask);
- U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = LHS[i];
+ U[i * 2] = Lo_32(tmp);
+ U[i * 2 + 1] = Hi_32(tmp);
}
U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm.
// Initialize the divisor
- memset(V, 0, (n)*sizeof(unsigned));
+ memset(V, 0, (n)*sizeof(uint32_t));
for (unsigned i = 0; i < rhsWords; ++i) {
- uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.VAL : RHS.pVal[i]);
- V[i * 2] = (unsigned)(tmp & mask);
- V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT));
+ uint64_t tmp = RHS[i];
+ V[i * 2] = Lo_32(tmp);
+ V[i * 2 + 1] = Hi_32(tmp);
}
// initialize the quotient and remainder
- memset(Q, 0, (m+n) * sizeof(unsigned));
+ memset(Q, 0, (m+n) * sizeof(uint32_t));
if (Remainder)
- memset(R, 0, n * sizeof(unsigned));
+ memset(R, 0, n * sizeof(uint32_t));
// Now, adjust m and n for the Knuth division. n is the number of words in
// the divisor. m is the number of words by which the dividend exceeds the
@@ -1715,22 +1469,22 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// are using base 2^32 instead of base 10.
assert(n != 0 && "Divide by zero?");
if (n == 1) {
- unsigned divisor = V[0];
- unsigned remainder = 0;
- for (int i = m+n-1; i >= 0; i--) {
- uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i];
+ uint32_t divisor = V[0];
+ uint32_t remainder = 0;
+ for (int i = m; i >= 0; i--) {
+ uint64_t partial_dividend = Make_64(remainder, U[i]);
if (partial_dividend == 0) {
Q[i] = 0;
remainder = 0;
} else if (partial_dividend < divisor) {
Q[i] = 0;
- remainder = (unsigned)partial_dividend;
+ remainder = Lo_32(partial_dividend);
} else if (partial_dividend == divisor) {
Q[i] = 1;
remainder = 0;
} else {
- Q[i] = (unsigned)(partial_dividend / divisor);
- remainder = (unsigned)(partial_dividend - (Q[i] * divisor));
+ Q[i] = Lo_32(partial_dividend / divisor);
+ remainder = Lo_32(partial_dividend - (Q[i] * divisor));
}
}
if (R)
@@ -1743,66 +1497,14 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
// If the caller wants the quotient
if (Quotient) {
- // Set up the Quotient value's memory.
- if (Quotient->BitWidth != LHS.BitWidth) {
- if (Quotient->isSingleWord())
- Quotient->VAL = 0;
- else
- delete [] Quotient->pVal;
- Quotient->BitWidth = LHS.BitWidth;
- if (!Quotient->isSingleWord())
- Quotient->pVal = getClearedMemory(Quotient->getNumWords());
- } else
- Quotient->clearAllBits();
-
- // The quotient is in Q. Reconstitute the quotient into Quotient's low
- // order words.
- // This case is currently dead as all users of divide() handle trivial cases
- // earlier.
- if (lhsWords == 1) {
- uint64_t tmp =
- uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2));
- if (Quotient->isSingleWord())
- Quotient->VAL = tmp;
- else
- Quotient->pVal[0] = tmp;
- } else {
- assert(!Quotient->isSingleWord() && "Quotient APInt not large enough");
- for (unsigned i = 0; i < lhsWords; ++i)
- Quotient->pVal[i] =
- uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2));
- }
+ for (unsigned i = 0; i < lhsWords; ++i)
+ Quotient[i] = Make_64(Q[i*2+1], Q[i*2]);
}
// If the caller wants the remainder
if (Remainder) {
- // Set up the Remainder value's memory.
- if (Remainder->BitWidth != RHS.BitWidth) {
- if (Remainder->isSingleWord())
- Remainder->VAL = 0;
- else
- delete [] Remainder->pVal;
- Remainder->BitWidth = RHS.BitWidth;
- if (!Remainder->isSingleWord())
- Remainder->pVal = getClearedMemory(Remainder->getNumWords());
- } else
- Remainder->clearAllBits();
-
- // The remainder is in R. Reconstitute the remainder into Remainder's low
- // order words.
- if (rhsWords == 1) {
- uint64_t tmp =
- uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2));
- if (Remainder->isSingleWord())
- Remainder->VAL = tmp;
- else
- Remainder->pVal[0] = tmp;
- } else {
- assert(!Remainder->isSingleWord() && "Remainder APInt not large enough");
- for (unsigned i = 0; i < rhsWords; ++i)
- Remainder->pVal[i] =
- uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2));
- }
+ for (unsigned i = 0; i < rhsWords; ++i)
+ Remainder[i] = Make_64(R[i*2+1], R[i*2]);
}
// Clean up the memory we allocated.
@@ -1814,40 +1516,74 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS,
}
}
-APInt APInt::udiv(const APInt& RHS) const {
+APInt APInt::udiv(const APInt &RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
// First, deal with the easy case
if (isSingleWord()) {
- assert(RHS.VAL != 0 && "Divide by zero?");
- return APInt(BitWidth, VAL / RHS.VAL);
+ assert(RHS.U.VAL != 0 && "Divide by zero?");
+ return APInt(BitWidth, U.VAL / RHS.U.VAL);
}
// Get some facts about the LHS and RHS number of bits and words
- unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
+ unsigned rhsBits = RHS.getActiveBits();
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Divided by zero???");
- unsigned lhsBits = this->getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
// Deal with some degenerate cases
if (!lhsWords)
// 0 / X ===> 0
return APInt(BitWidth, 0);
- else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X / 1 ===> X
+ return *this;
+ if (lhsWords < rhsWords || this->ult(RHS))
// X / Y ===> 0, iff X < Y
return APInt(BitWidth, 0);
- } else if (*this == RHS) {
+ if (*this == RHS)
// X / X ===> 1
return APInt(BitWidth, 1);
- } else if (lhsWords == 1 && rhsWords == 1) {
+ if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1.
// All high words are zero, just use native divide
- return APInt(BitWidth, this->pVal[0] / RHS.pVal[0]);
- }
+ return APInt(BitWidth, this->U.pVal[0] / RHS.U.pVal[0]);
+
+ // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+ APInt Quotient(BitWidth, 0); // to hold result.
+ divide(U.pVal, lhsWords, RHS.U.pVal, rhsWords, Quotient.U.pVal, nullptr);
+ return Quotient;
+}
+
+APInt APInt::udiv(uint64_t RHS) const {
+ assert(RHS != 0 && "Divide by zero?");
+
+ // First, deal with the easy case
+ if (isSingleWord())
+ return APInt(BitWidth, U.VAL / RHS);
+
+ // Get some facts about the LHS words.
+ unsigned lhsWords = getNumWords(getActiveBits());
+
+ // Deal with some degenerate cases
+ if (!lhsWords)
+ // 0 / X ===> 0
+ return APInt(BitWidth, 0);
+ if (RHS == 1)
+ // X / 1 ===> X
+ return *this;
+ if (this->ult(RHS))
+ // X / Y ===> 0, iff X < Y
+ return APInt(BitWidth, 0);
+ if (*this == RHS)
+ // X / X ===> 1
+ return APInt(BitWidth, 1);
+ if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1.
+ // All high words are zero, just use native divide
+ return APInt(BitWidth, this->U.pVal[0] / RHS);
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Quotient(1,0); // to hold result.
- divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr);
+ APInt Quotient(BitWidth, 0); // to hold result.
+ divide(U.pVal, lhsWords, &RHS, 1, Quotient.U.pVal, nullptr);
return Quotient;
}
@@ -1862,40 +1598,84 @@ APInt APInt::sdiv(const APInt &RHS) const {
return this->udiv(RHS);
}
-APInt APInt::urem(const APInt& RHS) const {
+APInt APInt::sdiv(int64_t RHS) const {
+ if (isNegative()) {
+ if (RHS < 0)
+ return (-(*this)).udiv(-RHS);
+ return -((-(*this)).udiv(RHS));
+ }
+ if (RHS < 0)
+ return -(this->udiv(-RHS));
+ return this->udiv(RHS);
+}
+
+APInt APInt::urem(const APInt &RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord()) {
- assert(RHS.VAL != 0 && "Remainder by zero?");
- return APInt(BitWidth, VAL % RHS.VAL);
+ assert(RHS.U.VAL != 0 && "Remainder by zero?");
+ return APInt(BitWidth, U.VAL % RHS.U.VAL);
}
// Get some facts about the LHS
- unsigned lhsBits = getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(getActiveBits());
// Get some facts about the RHS
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
assert(rhsWords && "Performing remainder operation by zero ???");
// Check the degenerate cases
- if (lhsWords == 0) {
+ if (lhsWords == 0)
// 0 % Y ===> 0
return APInt(BitWidth, 0);
- } else if (lhsWords < rhsWords || this->ult(RHS)) {
+ if (rhsBits == 1)
+ // X % 1 ===> 0
+ return APInt(BitWidth, 0);
+ if (lhsWords < rhsWords || this->ult(RHS))
// X % Y ===> X, iff X < Y
return *this;
- } else if (*this == RHS) {
+ if (*this == RHS)
// X % X == 0;
return APInt(BitWidth, 0);
- } else if (lhsWords == 1) {
+ if (lhsWords == 1)
// All high words are zero, just use native remainder
- return APInt(BitWidth, pVal[0] % RHS.pVal[0]);
- }
+ return APInt(BitWidth, U.pVal[0] % RHS.U.pVal[0]);
+
+ // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+ APInt Remainder(BitWidth, 0);
+ divide(U.pVal, lhsWords, RHS.U.pVal, rhsWords, nullptr, Remainder.U.pVal);
+ return Remainder;
+}
+
+uint64_t APInt::urem(uint64_t RHS) const {
+ assert(RHS != 0 && "Remainder by zero?");
+
+ if (isSingleWord())
+ return U.VAL % RHS;
+
+ // Get some facts about the LHS
+ unsigned lhsWords = getNumWords(getActiveBits());
+
+ // Check the degenerate cases
+ if (lhsWords == 0)
+ // 0 % Y ===> 0
+ return 0;
+ if (RHS == 1)
+ // X % 1 ===> 0
+ return 0;
+ if (this->ult(RHS))
+ // X % Y ===> X, iff X < Y
+ return getZExtValue();
+ if (*this == RHS)
+ // X % X == 0;
+ return 0;
+ if (lhsWords == 1)
+ // All high words are zero, just use native remainder
+ return U.pVal[0] % RHS;
// We have to compute it the hard way. Invoke the Knuth divide algorithm.
- APInt Remainder(1,0);
- divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder);
+ uint64_t Remainder;
+ divide(U.pVal, lhsWords, &RHS, 1, nullptr, &Remainder);
return Remainder;
}
@@ -1910,25 +1690,37 @@ APInt APInt::srem(const APInt &RHS) const {
return this->urem(RHS);
}
+int64_t APInt::srem(int64_t RHS) const {
+ if (isNegative()) {
+ if (RHS < 0)
+ return -((-(*this)).urem(-RHS));
+ return -((-(*this)).urem(RHS));
+ }
+ if (RHS < 0)
+ return this->urem(-RHS);
+ return this->urem(RHS);
+}
+
void APInt::udivrem(const APInt &LHS, const APInt &RHS,
APInt &Quotient, APInt &Remainder) {
assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
+ unsigned BitWidth = LHS.BitWidth;
// First, deal with the easy case
if (LHS.isSingleWord()) {
- assert(RHS.VAL != 0 && "Divide by zero?");
- uint64_t QuotVal = LHS.VAL / RHS.VAL;
- uint64_t RemVal = LHS.VAL % RHS.VAL;
- Quotient = APInt(LHS.BitWidth, QuotVal);
- Remainder = APInt(LHS.BitWidth, RemVal);
+ assert(RHS.U.VAL != 0 && "Divide by zero?");
+ uint64_t QuotVal = LHS.U.VAL / RHS.U.VAL;
+ uint64_t RemVal = LHS.U.VAL % RHS.U.VAL;
+ Quotient = APInt(BitWidth, QuotVal);
+ Remainder = APInt(BitWidth, RemVal);
return;
}
// Get some size facts about the dividend and divisor
- unsigned lhsBits = LHS.getActiveBits();
- unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+ unsigned lhsWords = getNumWords(LHS.getActiveBits());
unsigned rhsBits = RHS.getActiveBits();
- unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+ unsigned rhsWords = getNumWords(rhsBits);
+ assert(rhsWords && "Performing divrem operation by zero ???");
// Check the degenerate cases
if (lhsWords == 0) {
@@ -1937,6 +1729,11 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
+  if (rhsBits == 1) {
+    Quotient = LHS;                   // X / 1 ===> X
+    Remainder = 0;                    // X % 1 ===> 0
+    return;
+  }
+
if (lhsWords < rhsWords || LHS.ult(RHS)) {
Remainder = LHS; // X % Y ===> X, iff X < Y
Quotient = 0; // X / Y ===> 0, iff X < Y
@@ -1949,17 +1746,90 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
return;
}
- if (lhsWords == 1 && rhsWords == 1) {
+ // Make sure there is enough space to hold the results.
+ // NOTE: This assumes that reallocate won't affect any bits if it doesn't
+ // change the size. This is necessary if Quotient or Remainder is aliased
+ // with LHS or RHS.
+ Quotient.reallocate(BitWidth);
+ Remainder.reallocate(BitWidth);
+
+ if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
+ // There is only one word to consider so use the native versions.
+ uint64_t lhsValue = LHS.U.pVal[0];
+ uint64_t rhsValue = RHS.U.pVal[0];
+ Quotient = lhsValue / rhsValue;
+ Remainder = lhsValue % rhsValue;
+ return;
+ }
+
+ // Okay, lets do it the long way
+ divide(LHS.U.pVal, lhsWords, RHS.U.pVal, rhsWords, Quotient.U.pVal,
+ Remainder.U.pVal);
+ // Clear the rest of the Quotient and Remainder.
+ std::memset(Quotient.U.pVal + lhsWords, 0,
+ (getNumWords(BitWidth) - lhsWords) * APINT_WORD_SIZE);
+ std::memset(Remainder.U.pVal + rhsWords, 0,
+ (getNumWords(BitWidth) - rhsWords) * APINT_WORD_SIZE);
+}
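A short sketch of the aliasing guarantee the NOTE above documents (values assumed): the dividend may double as the quotient output.

  APInt A(128, 1000), B(128, 7), R(128, 0);
  APInt::udivrem(A, B, /*Quotient=*/A, /*Remainder=*/R);
  // A == 142 and R == 6; the quotient safely overwrote the dividend.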
+
+void APInt::udivrem(const APInt &LHS, uint64_t RHS, APInt &Quotient,
+ uint64_t &Remainder) {
+ assert(RHS != 0 && "Divide by zero?");
+ unsigned BitWidth = LHS.BitWidth;
+
+ // First, deal with the easy case
+ if (LHS.isSingleWord()) {
+ uint64_t QuotVal = LHS.U.VAL / RHS;
+ Remainder = LHS.U.VAL % RHS;
+ Quotient = APInt(BitWidth, QuotVal);
+ return;
+ }
+
+ // Get some size facts about the dividend and divisor
+ unsigned lhsWords = getNumWords(LHS.getActiveBits());
+
+ // Check the degenerate cases
+ if (lhsWords == 0) {
+ Quotient = 0; // 0 / Y ===> 0
+ Remainder = 0; // 0 % Y ===> 0
+ return;
+ }
+
+  if (RHS == 1) {
+    Quotient = LHS;    // X / 1 ===> X
+    Remainder = 0;     // X % 1 ===> 0
+    return;
+  }
+
+ if (LHS.ult(RHS)) {
+ Remainder = LHS.getZExtValue(); // X % Y ===> X, iff X < Y
+ Quotient = 0; // X / Y ===> 0, iff X < Y
+ return;
+ }
+
+ if (LHS == RHS) {
+ Quotient = 1; // X / X ===> 1
+ Remainder = 0; // X % X ===> 0;
+ return;
+ }
+
+ // Make sure there is enough space to hold the results.
+ // NOTE: This assumes that reallocate won't affect any bits if it doesn't
+ // change the size. This is necessary if Quotient is aliased with LHS.
+ Quotient.reallocate(BitWidth);
+
+ if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
// There is only one word to consider so use the native versions.
- uint64_t lhsValue = LHS.isSingleWord() ? LHS.VAL : LHS.pVal[0];
- uint64_t rhsValue = RHS.isSingleWord() ? RHS.VAL : RHS.pVal[0];
- Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
- Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+ uint64_t lhsValue = LHS.U.pVal[0];
+ Quotient = lhsValue / RHS;
+ Remainder = lhsValue % RHS;
return;
}
// Okay, lets do it the long way
- divide(LHS, lhsWords, RHS, rhsWords, &Quotient, &Remainder);
+ divide(LHS.U.pVal, lhsWords, &RHS, 1, Quotient.U.pVal, &Remainder);
+ // Clear the rest of the Quotient.
+ std::memset(Quotient.U.pVal + lhsWords, 0,
+ (getNumWords(BitWidth) - lhsWords) * APINT_WORD_SIZE);
}
void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
@@ -1969,17 +1839,37 @@ void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
else {
APInt::udivrem(-LHS, RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
}
- Remainder = -Remainder;
+ Remainder.negate();
} else if (RHS.isNegative()) {
APInt::udivrem(LHS, -RHS, Quotient, Remainder);
- Quotient = -Quotient;
+ Quotient.negate();
} else {
APInt::udivrem(LHS, RHS, Quotient, Remainder);
}
}
+void APInt::sdivrem(const APInt &LHS, int64_t RHS,
+ APInt &Quotient, int64_t &Remainder) {
+ uint64_t R = Remainder;
+ if (LHS.isNegative()) {
+ if (RHS < 0)
+ APInt::udivrem(-LHS, -RHS, Quotient, R);
+ else {
+ APInt::udivrem(-LHS, RHS, Quotient, R);
+ Quotient.negate();
+ }
+ R = -R;
+ } else if (RHS < 0) {
+ APInt::udivrem(LHS, -RHS, Quotient, R);
+ Quotient.negate();
+ } else {
+ APInt::udivrem(LHS, RHS, Quotient, R);
+ }
+ Remainder = R;
+}
+
APInt APInt::sadd_ov(const APInt &RHS, bool &Overflow) const {
APInt Res = *this+RHS;
Overflow = isNonNegative() == RHS.isNonNegative() &&
@@ -2014,7 +1904,7 @@ APInt APInt::sdiv_ov(const APInt &RHS, bool &Overflow) const {
APInt APInt::smul_ov(const APInt &RHS, bool &Overflow) const {
APInt Res = *this * RHS;
-
+
if (*this != 0 && RHS != 0)
Overflow = Res.sdiv(RHS) != *this || Res.sdiv(*this) != RHS;
else
@@ -2041,7 +1931,7 @@ APInt APInt::sshl_ov(const APInt &ShAmt, bool &Overflow) const {
Overflow = ShAmt.uge(countLeadingZeros());
else
Overflow = ShAmt.uge(countLeadingOnes());
-
+
return *this << ShAmt;
}
@@ -2061,7 +1951,7 @@ APInt APInt::ushl_ov(const APInt &ShAmt, bool &Overflow) const {
void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
// Check our assumptions here
assert(!str.empty() && "Invalid string length");
- assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
+ assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
radix == 36) &&
"Radix should be 2, 8, 10, 16, or 36!");
@@ -2079,18 +1969,15 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
assert((((slen-1)*64)/22 <= numbits || radix != 10) &&
"Insufficient bit width");
- // Allocate memory
- if (!isSingleWord())
- pVal = getClearedMemory(getNumWords());
+ // Allocate memory if needed
+ if (isSingleWord())
+ U.VAL = 0;
+ else
+ U.pVal = getClearedMemory(getNumWords());
// Figure out if we can shift instead of multiply
unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
- // Set up an APInt for the digit to add outside the loop so we don't
- // constantly construct/destruct it.
- APInt apdigit(getBitWidth(), 0);
- APInt apradix(getBitWidth(), radix);
-
// Enter digit traversal loop
for (StringRef::iterator e = str.end(); p != e; ++p) {
unsigned digit = getDigit(*p, radix);
@@ -2101,26 +1988,20 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
if (shift)
*this <<= shift;
else
- *this *= apradix;
+ *this *= radix;
}
// Add in the digit we just interpreted
- if (apdigit.isSingleWord())
- apdigit.VAL = digit;
- else
- apdigit.pVal[0] = digit;
- *this += apdigit;
+ *this += digit;
}
// If its negative, put it in two's complement form
- if (isNeg) {
- --(*this);
- this->flipAllBits();
- }
+ if (isNeg)
+ this->negate();
}
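Worked trace of the simplified digit loop for APInt(16, "-123", 10), an assumed input:

  // start             0
  // '1':   0*10 + 1 =   1
  // '2':   1*10 + 2 =  12
  // '3':  12*10 + 3 = 123
  // isNeg is set, so negate() leaves the two's complement value 0xFF85.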
void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
bool Signed, bool formatAsCLiteral) const {
- assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 ||
+ assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 ||
Radix == 36) &&
"Radix should be 2, 8, 10, 16, or 36!");
@@ -2159,7 +2040,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
if (isSingleWord()) {
char Buffer[65];
- char *BufPtr = Buffer+65;
+ char *BufPtr = std::end(Buffer);
uint64_t N;
if (!Signed) {
@@ -2183,7 +2064,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
*--BufPtr = Digits[N % Radix];
N /= Radix;
}
- Str.append(BufPtr, Buffer+65);
+ Str.append(BufPtr, std::end(Buffer));
return;
}
@@ -2193,8 +2074,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
// They want to print the signed version and it is a negative value
// Flip the bits and add one to turn it into the equivalent positive
// value and put a '-' in the result.
- Tmp.flipAllBits();
- ++Tmp;
+ Tmp.negate();
Str.push_back('-');
}
@@ -2208,28 +2088,23 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
// For the 2, 8 and 16 bit cases, we can just shift instead of divide
// because the number of bits per digit (1, 3 and 4 respectively) divides
- // equaly. We just shift until the value is zero.
+ // equally. We just shift until the value is zero.
if (Radix == 2 || Radix == 8 || Radix == 16) {
// Just shift tmp right for each digit width until it becomes zero
unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
unsigned MaskAmt = Radix - 1;
- while (Tmp != 0) {
+ while (Tmp.getBoolValue()) {
unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt;
Str.push_back(Digits[Digit]);
- Tmp = Tmp.lshr(ShiftAmt);
+ Tmp.lshrInPlace(ShiftAmt);
}
} else {
- APInt divisor(Radix == 10? 4 : 8, Radix);
- while (Tmp != 0) {
- APInt APdigit(1, 0);
- APInt tmp2(Tmp.getBitWidth(), 0);
- divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2,
- &APdigit);
- unsigned Digit = (unsigned)APdigit.getZExtValue();
+ while (Tmp.getBoolValue()) {
+ uint64_t Digit;
+ udivrem(Tmp, Radix, Tmp, Digit);
assert(Digit < Radix && "divide failed");
Str.push_back(Digits[Digit]);
- Tmp = tmp2;
}
}
@@ -2245,14 +2120,15 @@ std::string APInt::toString(unsigned Radix = 10, bool Signed = true) const {
return S.str();
}
-
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void APInt::dump() const {
SmallString<40> S, U;
this->toStringUnsigned(U);
this->toStringSigned(S);
dbgs() << "APInt(" << BitWidth << "b, "
- << U << "u " << S << "s)";
+ << U << "u " << S << "s)\n";
}
+#endif
void APInt::print(raw_ostream &OS, bool isSigned) const {
SmallString<40> S;
@@ -2265,83 +2141,60 @@ void APInt::print(raw_ostream &OS, bool isSigned) const {
// Assumed by lowHalf, highHalf, partMSB and partLSB. A fairly safe
// and unrestricting assumption.
-static_assert(integerPartWidth % 2 == 0, "Part width must be divisible by 2!");
+static_assert(APInt::APINT_BITS_PER_WORD % 2 == 0,
+ "Part width must be divisible by 2!");
/* Some handy functions local to this file. */
-namespace {
- /* Returns the integer part with the least significant BITS set.
- BITS cannot be zero. */
- static inline integerPart
- lowBitMask(unsigned int bits)
- {
- assert(bits != 0 && bits <= integerPartWidth);
+/* Returns the integer part with the least significant BITS set.
+ BITS cannot be zero. */
+static inline APInt::WordType lowBitMask(unsigned bits) {
+ assert(bits != 0 && bits <= APInt::APINT_BITS_PER_WORD);
- return ~(integerPart) 0 >> (integerPartWidth - bits);
- }
+ return ~(APInt::WordType) 0 >> (APInt::APINT_BITS_PER_WORD - bits);
+}
- /* Returns the value of the lower half of PART. */
- static inline integerPart
- lowHalf(integerPart part)
- {
- return part & lowBitMask(integerPartWidth / 2);
- }
+/* Returns the value of the lower half of PART. */
+static inline APInt::WordType lowHalf(APInt::WordType part) {
+ return part & lowBitMask(APInt::APINT_BITS_PER_WORD / 2);
+}
- /* Returns the value of the upper half of PART. */
- static inline integerPart
- highHalf(integerPart part)
- {
- return part >> (integerPartWidth / 2);
- }
+/* Returns the value of the upper half of PART. */
+static inline APInt::WordType highHalf(APInt::WordType part) {
+ return part >> (APInt::APINT_BITS_PER_WORD / 2);
+}
- /* Returns the bit number of the most significant set bit of a part.
- If the input number has no bits set -1U is returned. */
- static unsigned int
- partMSB(integerPart value)
- {
- return findLastSet(value, ZB_Max);
- }
+/* Returns the bit number of the most significant set bit of a part.
+ If the input number has no bits set -1U is returned. */
+static unsigned partMSB(APInt::WordType value) {
+ return findLastSet(value, ZB_Max);
+}
- /* Returns the bit number of the least significant set bit of a
- part. If the input number has no bits set -1U is returned. */
- static unsigned int
- partLSB(integerPart value)
- {
- return findFirstSet(value, ZB_Max);
- }
+/* Returns the bit number of the least significant set bit of a
+ part. If the input number has no bits set -1U is returned. */
+static unsigned partLSB(APInt::WordType value) {
+ return findFirstSet(value, ZB_Max);
}
/* Sets the least significant part of a bignum to the input value, and
zeroes out higher parts. */
-void
-APInt::tcSet(integerPart *dst, integerPart part, unsigned int parts)
-{
- unsigned int i;
-
+void APInt::tcSet(WordType *dst, WordType part, unsigned parts) {
assert(parts > 0);
dst[0] = part;
- for (i = 1; i < parts; i++)
+ for (unsigned i = 1; i < parts; i++)
dst[i] = 0;
}
/* Assign one bignum to another. */
-void
-APInt::tcAssign(integerPart *dst, const integerPart *src, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcAssign(WordType *dst, const WordType *src, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] = src[i];
}
/* Returns true if a bignum is zero, false otherwise. */
-bool
-APInt::tcIsZero(const integerPart *src, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+bool APInt::tcIsZero(const WordType *src, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
if (src[i])
return false;
@@ -2349,41 +2202,29 @@ APInt::tcIsZero(const integerPart *src, unsigned int parts)
}
/* Extract the given bit of a bignum; returns 0 or 1. */
-int
-APInt::tcExtractBit(const integerPart *parts, unsigned int bit)
-{
- return (parts[bit / integerPartWidth] &
- ((integerPart) 1 << bit % integerPartWidth)) != 0;
+int APInt::tcExtractBit(const WordType *parts, unsigned bit) {
+ return (parts[whichWord(bit)] & maskBit(bit)) != 0;
}
/* Set the given bit of a bignum. */
-void
-APInt::tcSetBit(integerPart *parts, unsigned int bit)
-{
- parts[bit / integerPartWidth] |= (integerPart) 1 << (bit % integerPartWidth);
+void APInt::tcSetBit(WordType *parts, unsigned bit) {
+ parts[whichWord(bit)] |= maskBit(bit);
}
/* Clears the given bit of a bignum. */
-void
-APInt::tcClearBit(integerPart *parts, unsigned int bit)
-{
- parts[bit / integerPartWidth] &=
- ~((integerPart) 1 << (bit % integerPartWidth));
+void APInt::tcClearBit(WordType *parts, unsigned bit) {
+ parts[whichWord(bit)] &= ~maskBit(bit);
}
/* Returns the bit number of the least significant set bit of a
number. If the input number has no bits set -1U is returned. */
-unsigned int
-APInt::tcLSB(const integerPart *parts, unsigned int n)
-{
- unsigned int i, lsb;
-
- for (i = 0; i < n; i++) {
- if (parts[i] != 0) {
- lsb = partLSB(parts[i]);
+unsigned APInt::tcLSB(const WordType *parts, unsigned n) {
+ for (unsigned i = 0; i < n; i++) {
+ if (parts[i] != 0) {
+ unsigned lsb = partLSB(parts[i]);
- return lsb + i * integerPartWidth;
- }
+ return lsb + i * APINT_BITS_PER_WORD;
+ }
}
return -1U;
@@ -2391,18 +2232,14 @@ APInt::tcLSB(const integerPart *parts, unsigned int n)
/* Returns the bit number of the most significant set bit of a number.
If the input number has no bits set -1U is returned. */
-unsigned int
-APInt::tcMSB(const integerPart *parts, unsigned int n)
-{
- unsigned int msb;
-
+unsigned APInt::tcMSB(const WordType *parts, unsigned n) {
do {
--n;
if (parts[n] != 0) {
- msb = partMSB(parts[n]);
+ unsigned msb = partMSB(parts[n]);
- return msb + n * integerPartWidth;
+ return msb + n * APINT_BITS_PER_WORD;
}
} while (n);
@@ -2414,31 +2251,28 @@ APInt::tcMSB(const integerPart *parts, unsigned int n)
the least significant bit of DST. All high bits above srcBITS in
DST are zero-filled. */
void
-APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
- unsigned int srcBits, unsigned int srcLSB)
-{
- unsigned int firstSrcPart, dstParts, shift, n;
-
- dstParts = (srcBits + integerPartWidth - 1) / integerPartWidth;
+APInt::tcExtract(WordType *dst, unsigned dstCount, const WordType *src,
+ unsigned srcBits, unsigned srcLSB) {
+ unsigned dstParts = (srcBits + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD;
assert(dstParts <= dstCount);
- firstSrcPart = srcLSB / integerPartWidth;
+ unsigned firstSrcPart = srcLSB / APINT_BITS_PER_WORD;
tcAssign (dst, src + firstSrcPart, dstParts);
- shift = srcLSB % integerPartWidth;
+ unsigned shift = srcLSB % APINT_BITS_PER_WORD;
tcShiftRight (dst, dstParts, shift);
- /* We now have (dstParts * integerPartWidth - shift) bits from SRC
+ /* We now have (dstParts * APINT_BITS_PER_WORD - shift) bits from SRC
in DST. If this is less that srcBits, append the rest, else
clear the high bits. */
- n = dstParts * integerPartWidth - shift;
+ unsigned n = dstParts * APINT_BITS_PER_WORD - shift;
if (n < srcBits) {
- integerPart mask = lowBitMask (srcBits - n);
+ WordType mask = lowBitMask (srcBits - n);
dst[dstParts - 1] |= ((src[firstSrcPart + dstParts] & mask)
- << n % integerPartWidth);
+ << n % APINT_BITS_PER_WORD);
} else if (n > srcBits) {
- if (srcBits % integerPartWidth)
- dst[dstParts - 1] &= lowBitMask (srcBits % integerPartWidth);
+ if (srcBits % APINT_BITS_PER_WORD)
+ dst[dstParts - 1] &= lowBitMask (srcBits % APINT_BITS_PER_WORD);
}
/* Clear high parts. */
@@ -2447,18 +2281,12 @@ APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
}
/* DST += RHS + C where C is zero or one. Returns the carry flag. */
-integerPart
-APInt::tcAdd(integerPart *dst, const integerPart *rhs,
- integerPart c, unsigned int parts)
-{
- unsigned int i;
-
+APInt::WordType APInt::tcAdd(WordType *dst, const WordType *rhs,
+ WordType c, unsigned parts) {
assert(c <= 1);
- for (i = 0; i < parts; i++) {
- integerPart l;
-
- l = dst[i];
+ for (unsigned i = 0; i < parts; i++) {
+ WordType l = dst[i];
if (c) {
dst[i] += rhs[i] + 1;
c = (dst[i] <= l);
@@ -2471,19 +2299,29 @@ APInt::tcAdd(integerPart *dst, const integerPart *rhs,
return c;
}
-/* DST -= RHS + C where C is zero or one. Returns the carry flag. */
-integerPart
-APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
- integerPart c, unsigned int parts)
-{
- unsigned int i;
+/// This function adds a single "word" integer, src, to the multiple
+/// "word" integer array, dst[]. dst[] is modified to reflect the addition and
+/// 1 is returned if there is a carry out, otherwise 0 is returned.
+/// @returns the carry of the addition.
+APInt::WordType APInt::tcAddPart(WordType *dst, WordType src,
+ unsigned parts) {
+ for (unsigned i = 0; i < parts; ++i) {
+ dst[i] += src;
+ if (dst[i] >= src)
+ return 0; // No need to carry so exit early.
+ src = 1; // Carry one to next digit.
+ }
- assert(c <= 1);
+ return 1;
+}
- for (i = 0; i < parts; i++) {
- integerPart l;
+/* DST -= RHS + C where C is zero or one. Returns the carry flag. */
+APInt::WordType APInt::tcSubtract(WordType *dst, const WordType *rhs,
+ WordType c, unsigned parts) {
+ assert(c <= 1);
- l = dst[i];
+ for (unsigned i = 0; i < parts; i++) {
+ WordType l = dst[i];
if (c) {
dst[i] -= rhs[i] + 1;
c = (dst[i] >= l);
@@ -2496,10 +2334,28 @@ APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
return c;
}
+/// This function subtracts a single "word" (64-bit word), src, from
+/// the multi-word integer array, dst[], propagating the borrowed 1 value until
+/// no further borrowing is needed or it runs out of "words" in dst. The result
+/// is 1 if "borrowing" exhausted the digits in dst, or 0 if dst was not
+/// exhausted. In other words, if src > dst then this function returns 1,
+/// otherwise 0.
+/// @returns the borrow out of the subtraction
+APInt::WordType APInt::tcSubtractPart(WordType *dst, WordType src,
+ unsigned parts) {
+ for (unsigned i = 0; i < parts; ++i) {
+ WordType Dst = dst[i];
+ dst[i] -= src;
+ if (src <= Dst)
+ return 0; // No need to borrow so exit early.
+ src = 1; // We have to "borrow 1" from next "word"
+ }
+
+ return 1;
+}
+
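An example of the carry/borrow propagation, with assumed values; dst = { ~0, 0 } represents 2^64 - 1 in two 64-bit parts.

  APInt::WordType dst[2] = { ~0ULL, 0 };
  APInt::tcAddPart(dst, 1, 2);       // dst == { 0, 1 }, returns 0 (no carry out)
  APInt::tcSubtractPart(dst, 1, 2);  // dst == { ~0ULL, 0 }, returns 0 (no borrow)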
/* Negate a bignum in-place. */
-void
-APInt::tcNegate(integerPart *dst, unsigned int parts)
-{
+void APInt::tcNegate(WordType *dst, unsigned parts) {
tcComplement(dst, parts);
tcIncrement(dst, parts);
}
@@ -2515,23 +2371,19 @@ APInt::tcNegate(integerPart *dst, unsigned int parts)
DSTPARTS parts of the result, and if all of the omitted higher
parts were zero return zero, otherwise overflow occurred and
return one. */
-int
-APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
- integerPart multiplier, integerPart carry,
- unsigned int srcParts, unsigned int dstParts,
- bool add)
-{
- unsigned int i, n;
-
+int APInt::tcMultiplyPart(WordType *dst, const WordType *src,
+ WordType multiplier, WordType carry,
+ unsigned srcParts, unsigned dstParts,
+ bool add) {
/* Otherwise our writes of DST kill our later reads of SRC. */
assert(dst <= src || dst >= src + srcParts);
assert(dstParts <= srcParts + 1);
/* N loops; minimum of dstParts and srcParts. */
- n = dstParts < srcParts ? dstParts: srcParts;
+ unsigned n = std::min(dstParts, srcParts);
- for (i = 0; i < n; i++) {
- integerPart low, mid, high, srcPart;
+ for (unsigned i = 0; i < n; i++) {
+ WordType low, mid, high, srcPart;
/* [ LOW, HIGH ] = MULTIPLIER * SRC[i] + DST[i] + CARRY.
@@ -2543,7 +2395,7 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
srcPart = src[i];
- if (multiplier == 0 || srcPart == 0) {
+ if (multiplier == 0 || srcPart == 0) {
low = carry;
high = 0;
} else {
@@ -2552,14 +2404,14 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
mid = lowHalf(srcPart) * highHalf(multiplier);
high += highHalf(mid);
- mid <<= integerPartWidth / 2;
+ mid <<= APINT_BITS_PER_WORD / 2;
if (low + mid < low)
high++;
low += mid;
mid = highHalf(srcPart) * lowHalf(multiplier);
high += highHalf(mid);
- mid <<= integerPartWidth / 2;
+ mid <<= APINT_BITS_PER_WORD / 2;
if (low + mid < low)
high++;
low += mid;
@@ -2581,78 +2433,62 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
carry = high;
}
- if (i < dstParts) {
+ if (srcParts < dstParts) {
/* Full multiplication, there is no overflow. */
- assert(i + 1 == dstParts);
- dst[i] = carry;
- return 0;
- } else {
- /* We overflowed if there is carry. */
- if (carry)
- return 1;
-
- /* We would overflow if any significant unwritten parts would be
- non-zero. This is true if any remaining src parts are non-zero
- and the multiplier is non-zero. */
- if (multiplier)
- for (; i < srcParts; i++)
- if (src[i])
- return 1;
-
- /* We fitted in the narrow destination. */
+ assert(srcParts + 1 == dstParts);
+ dst[srcParts] = carry;
return 0;
}
+
+ /* We overflowed if there is carry. */
+ if (carry)
+ return 1;
+
+ /* We would overflow if any significant unwritten parts would be
+ non-zero. This is true if any remaining src parts are non-zero
+ and the multiplier is non-zero. */
+ if (multiplier)
+ for (unsigned i = dstParts; i < srcParts; i++)
+ if (src[i])
+ return 1;
+
+ /* We fitted in the narrow destination. */
+ return 0;
}
/* DST = LHS * RHS, where DST has the same width as the operands and
is filled with the least significant parts of the result. Returns
one if overflow occurred, otherwise zero. DST must be disjoint
from both operands. */
-int
-APInt::tcMultiply(integerPart *dst, const integerPart *lhs,
- const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
- int overflow;
-
+int APInt::tcMultiply(WordType *dst, const WordType *lhs,
+ const WordType *rhs, unsigned parts) {
assert(dst != lhs && dst != rhs);
- overflow = 0;
+ int overflow = 0;
tcSet(dst, 0, parts);
- for (i = 0; i < parts; i++)
+ for (unsigned i = 0; i < parts; i++)
overflow |= tcMultiplyPart(&dst[i], lhs, rhs[i], 0, parts,
parts - i, true);
return overflow;
}
-/* DST = LHS * RHS, where DST has width the sum of the widths of the
- operands. No overflow occurs. DST must be disjoint from both
- operands. Returns the number of parts required to hold the
- result. */
-unsigned int
-APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
- const integerPart *rhs, unsigned int lhsParts,
- unsigned int rhsParts)
-{
+/// DST = LHS * RHS, where DST has width the sum of the widths of the
+/// operands. No overflow occurs. DST must be disjoint from both operands.
+void APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
+ const WordType *rhs, unsigned lhsParts,
+ unsigned rhsParts) {
/* Put the narrower number on the LHS for less loops below. */
- if (lhsParts > rhsParts) {
+ if (lhsParts > rhsParts)
return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts);
- } else {
- unsigned int n;
-
- assert(dst != lhs && dst != rhs);
- tcSet(dst, 0, rhsParts);
-
- for (n = 0; n < lhsParts; n++)
- tcMultiplyPart(&dst[n], rhs, lhs[n], 0, rhsParts, rhsParts + 1, true);
+ assert(dst != lhs && dst != rhs);
- n = lhsParts + rhsParts;
+ tcSet(dst, 0, rhsParts);
- return n - (dst[n - 1] == 0);
- }
+ for (unsigned i = 0; i < lhsParts; i++)
+ tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true);
}
/* If RHS is zero LHS and REMAINDER are left unchanged, return one.
@@ -2665,23 +2501,18 @@ APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
use by the routine; its contents need not be initialized and are
destroyed. LHS, REMAINDER and SCRATCH must be distinct.
*/
-int
-APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
- integerPart *remainder, integerPart *srhs,
- unsigned int parts)
-{
- unsigned int n, shiftCount;
- integerPart mask;
-
+int APInt::tcDivide(WordType *lhs, const WordType *rhs,
+ WordType *remainder, WordType *srhs,
+ unsigned parts) {
assert(lhs != remainder && lhs != srhs && remainder != srhs);
- shiftCount = tcMSB(rhs, parts) + 1;
+ unsigned shiftCount = tcMSB(rhs, parts) + 1;
if (shiftCount == 0)
return true;
- shiftCount = parts * integerPartWidth - shiftCount;
- n = shiftCount / integerPartWidth;
- mask = (integerPart) 1 << (shiftCount % integerPartWidth);
+ shiftCount = parts * APINT_BITS_PER_WORD - shiftCount;
+ unsigned n = shiftCount / APINT_BITS_PER_WORD;
+ WordType mask = (WordType) 1 << (shiftCount % APINT_BITS_PER_WORD);
tcAssign(srhs, rhs, parts);
tcShiftLeft(srhs, parts, shiftCount);
@@ -2691,196 +2522,127 @@ APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
/* Loop, subtracting SRHS if REMAINDER is greater and adding that to
the total. */
for (;;) {
- int compare;
-
- compare = tcCompare(remainder, srhs, parts);
- if (compare >= 0) {
- tcSubtract(remainder, srhs, 0, parts);
- lhs[n] |= mask;
- }
+ int compare = tcCompare(remainder, srhs, parts);
+ if (compare >= 0) {
+ tcSubtract(remainder, srhs, 0, parts);
+ lhs[n] |= mask;
+ }
- if (shiftCount == 0)
- break;
- shiftCount--;
- tcShiftRight(srhs, parts, 1);
- if ((mask >>= 1) == 0) {
- mask = (integerPart) 1 << (integerPartWidth - 1);
- n--;
- }
+ if (shiftCount == 0)
+ break;
+ shiftCount--;
+ tcShiftRight(srhs, parts, 1);
+ if ((mask >>= 1) == 0) {
+ mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
+ n--;
+ }
}
return false;
}
-/* Shift a bignum left COUNT bits in-place. Shifted in bits are zero.
- There are no restrictions on COUNT. */
-void
-APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
-{
- if (count) {
- unsigned int jump, shift;
-
- /* Jump is the inter-part jump; shift is is intra-part shift. */
- jump = count / integerPartWidth;
- shift = count % integerPartWidth;
-
- while (parts > jump) {
- integerPart part;
-
- parts--;
-
- /* dst[i] comes from the two parts src[i - jump] and, if we have
- an intra-part shift, src[i - jump - 1]. */
- part = dst[parts - jump];
- if (shift) {
- part <<= shift;
- if (parts >= jump + 1)
- part |= dst[parts - jump - 1] >> (integerPartWidth - shift);
- }
+/// Shift a bignum left Count bits in-place. Shifted in bits are zero. There are
+/// no restrictions on Count.
+void APInt::tcShiftLeft(WordType *Dst, unsigned Words, unsigned Count) {
+ // Don't bother performing a no-op shift.
+ if (!Count)
+ return;
- dst[parts] = part;
- }
+ // WordShift is the inter-part shift; BitShift is the intra-part shift.
+ unsigned WordShift = std::min(Count / APINT_BITS_PER_WORD, Words);
+ unsigned BitShift = Count % APINT_BITS_PER_WORD;
- while (parts > 0)
- dst[--parts] = 0;
+ // Fastpath for moving by whole words.
+ if (BitShift == 0) {
+ std::memmove(Dst + WordShift, Dst, (Words - WordShift) * APINT_WORD_SIZE);
+ } else {
+ while (Words-- > WordShift) {
+ Dst[Words] = Dst[Words - WordShift] << BitShift;
+ if (Words > WordShift)
+ Dst[Words] |=
+ Dst[Words - WordShift - 1] >> (APINT_BITS_PER_WORD - BitShift);
+ }
}
+
+ // Fill in the remainder with 0s.
+ std::memset(Dst, 0, WordShift * APINT_WORD_SIZE);
}
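Decomposition example with assumed values: on a 64-bit-word build, a shift by 68 splits into WordShift == 1 whole word and BitShift == 4 bits (68 = 1*64 + 4).

  APInt::WordType V[3] = { 0xF, 0, 0 };
  APInt::tcShiftLeft(V, 3, 68);   // V == { 0, 0xF0, 0 }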
-/* Shift a bignum right COUNT bits in-place. Shifted in bits are
- zero. There are no restrictions on COUNT. */
-void
-APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
-{
- if (count) {
- unsigned int i, jump, shift;
-
- /* Jump is the inter-part jump; shift is is intra-part shift. */
- jump = count / integerPartWidth;
- shift = count % integerPartWidth;
-
- /* Perform the shift. This leaves the most significant COUNT bits
- of the result at zero. */
- for (i = 0; i < parts; i++) {
- integerPart part;
-
- if (i + jump >= parts) {
- part = 0;
- } else {
- part = dst[i + jump];
- if (shift) {
- part >>= shift;
- if (i + jump + 1 < parts)
- part |= dst[i + jump + 1] << (integerPartWidth - shift);
- }
- }
+/// Shift a bignum right Count bits in-place. Shifted in bits are zero. There
+/// are no restrictions on Count.
+void APInt::tcShiftRight(WordType *Dst, unsigned Words, unsigned Count) {
+ // Don't bother performing a no-op shift.
+ if (!Count)
+ return;
+
+ // WordShift is the inter-part shift; BitShift is the intra-part shift.
+ unsigned WordShift = std::min(Count / APINT_BITS_PER_WORD, Words);
+ unsigned BitShift = Count % APINT_BITS_PER_WORD;
- dst[i] = part;
+ unsigned WordsToMove = Words - WordShift;
+ // Fastpath for moving by whole words.
+ if (BitShift == 0) {
+ std::memmove(Dst, Dst + WordShift, WordsToMove * APINT_WORD_SIZE);
+ } else {
+ for (unsigned i = 0; i != WordsToMove; ++i) {
+ Dst[i] = Dst[i + WordShift] >> BitShift;
+ if (i + 1 != WordsToMove)
+ Dst[i] |= Dst[i + WordShift + 1] << (APINT_BITS_PER_WORD - BitShift);
}
}
+
+ // Fill in the remainder with 0s.
+ std::memset(Dst + WordsToMove, 0, WordShift * APINT_WORD_SIZE);
}
/* Bitwise and of two bignums. */
-void
-APInt::tcAnd(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcAnd(WordType *dst, const WordType *rhs, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] &= rhs[i];
}
/* Bitwise inclusive or of two bignums. */
-void
-APInt::tcOr(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcOr(WordType *dst, const WordType *rhs, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] |= rhs[i];
}
/* Bitwise exclusive or of two bignums. */
-void
-APInt::tcXor(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcXor(WordType *dst, const WordType *rhs, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] ^= rhs[i];
}
/* Complement a bignum in-place. */
-void
-APInt::tcComplement(integerPart *dst, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
+void APInt::tcComplement(WordType *dst, unsigned parts) {
+ for (unsigned i = 0; i < parts; i++)
dst[i] = ~dst[i];
}
/* Comparison (unsigned) of two bignums. */
-int
-APInt::tcCompare(const integerPart *lhs, const integerPart *rhs,
- unsigned int parts)
-{
+int APInt::tcCompare(const WordType *lhs, const WordType *rhs,
+ unsigned parts) {
while (parts) {
- parts--;
- if (lhs[parts] == rhs[parts])
- continue;
-
- if (lhs[parts] > rhs[parts])
- return 1;
- else
- return -1;
- }
+ parts--;
+ if (lhs[parts] != rhs[parts])
+ return (lhs[parts] > rhs[parts]) ? 1 : -1;
+ }
return 0;
}
-/* Increment a bignum in-place, return the carry flag. */
-integerPart
-APInt::tcIncrement(integerPart *dst, unsigned int parts)
-{
- unsigned int i;
-
- for (i = 0; i < parts; i++)
- if (++dst[i] != 0)
- break;
-
- return i == parts;
-}
-
-/* Decrement a bignum in-place, return the borrow flag. */
-integerPart
-APInt::tcDecrement(integerPart *dst, unsigned int parts) {
- for (unsigned int i = 0; i < parts; i++) {
- // If the current word is non-zero, then the decrement has no effect on the
- // higher-order words of the integer and no borrow can occur. Exit early.
- if (dst[i]--)
- return 0;
- }
- // If every word was zero, then there is a borrow.
- return 1;
-}
-
-
/* Set the least significant BITS bits of a bignum, clear the
rest. */
-void
-APInt::tcSetLeastSignificantBits(integerPart *dst, unsigned int parts,
- unsigned int bits)
-{
- unsigned int i;
-
- i = 0;
- while (bits > integerPartWidth) {
- dst[i++] = ~(integerPart) 0;
- bits -= integerPartWidth;
+void APInt::tcSetLeastSignificantBits(WordType *dst, unsigned parts,
+ unsigned bits) {
+ unsigned i = 0;
+ while (bits > APINT_BITS_PER_WORD) {
+ dst[i++] = ~(WordType) 0;
+ bits -= APINT_BITS_PER_WORD;
}
if (bits)
- dst[i++] = ~(integerPart) 0 >> (integerPartWidth - bits);
+ dst[i++] = ~(WordType) 0 >> (APINT_BITS_PER_WORD - bits);
while (i < parts)
dst[i++] = 0;
diff --git a/contrib/llvm/lib/Support/ARMAttributeParser.cpp b/contrib/llvm/lib/Support/ARMAttributeParser.cpp
new file mode 100644
index 0000000..a9a0c1d
--- /dev/null
+++ b/contrib/llvm/lib/Support/ARMAttributeParser.cpp
@@ -0,0 +1,708 @@
+//===--- ARMAttributeParser.cpp - ARM Attribute Information Printer -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ARMAttributeParser.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+using namespace llvm::ARMBuildAttrs;
+
+static const EnumEntry<unsigned> TagNames[] = {
+ { "Tag_File", ARMBuildAttrs::File },
+ { "Tag_Section", ARMBuildAttrs::Section },
+ { "Tag_Symbol", ARMBuildAttrs::Symbol },
+};
+
+namespace llvm {
+#define ATTRIBUTE_HANDLER(Attr_) \
+ { ARMBuildAttrs::Attr_, &ARMAttributeParser::Attr_ }
+
+const ARMAttributeParser::DisplayHandler
+ARMAttributeParser::DisplayRoutines[] = {
+ { ARMBuildAttrs::CPU_raw_name, &ARMAttributeParser::StringAttribute, },
+ { ARMBuildAttrs::CPU_name, &ARMAttributeParser::StringAttribute },
+ ATTRIBUTE_HANDLER(CPU_arch),
+ ATTRIBUTE_HANDLER(CPU_arch_profile),
+ ATTRIBUTE_HANDLER(ARM_ISA_use),
+ ATTRIBUTE_HANDLER(THUMB_ISA_use),
+ ATTRIBUTE_HANDLER(FP_arch),
+ ATTRIBUTE_HANDLER(WMMX_arch),
+ ATTRIBUTE_HANDLER(Advanced_SIMD_arch),
+ ATTRIBUTE_HANDLER(PCS_config),
+ ATTRIBUTE_HANDLER(ABI_PCS_R9_use),
+ ATTRIBUTE_HANDLER(ABI_PCS_RW_data),
+ ATTRIBUTE_HANDLER(ABI_PCS_RO_data),
+ ATTRIBUTE_HANDLER(ABI_PCS_GOT_use),
+ ATTRIBUTE_HANDLER(ABI_PCS_wchar_t),
+ ATTRIBUTE_HANDLER(ABI_FP_rounding),
+ ATTRIBUTE_HANDLER(ABI_FP_denormal),
+ ATTRIBUTE_HANDLER(ABI_FP_exceptions),
+ ATTRIBUTE_HANDLER(ABI_FP_user_exceptions),
+ ATTRIBUTE_HANDLER(ABI_FP_number_model),
+ ATTRIBUTE_HANDLER(ABI_align_needed),
+ ATTRIBUTE_HANDLER(ABI_align_preserved),
+ ATTRIBUTE_HANDLER(ABI_enum_size),
+ ATTRIBUTE_HANDLER(ABI_HardFP_use),
+ ATTRIBUTE_HANDLER(ABI_VFP_args),
+ ATTRIBUTE_HANDLER(ABI_WMMX_args),
+ ATTRIBUTE_HANDLER(ABI_optimization_goals),
+ ATTRIBUTE_HANDLER(ABI_FP_optimization_goals),
+ ATTRIBUTE_HANDLER(compatibility),
+ ATTRIBUTE_HANDLER(CPU_unaligned_access),
+ ATTRIBUTE_HANDLER(FP_HP_extension),
+ ATTRIBUTE_HANDLER(ABI_FP_16bit_format),
+ ATTRIBUTE_HANDLER(MPextension_use),
+ ATTRIBUTE_HANDLER(DIV_use),
+ ATTRIBUTE_HANDLER(DSP_extension),
+ ATTRIBUTE_HANDLER(T2EE_use),
+ ATTRIBUTE_HANDLER(Virtualization_use),
+ ATTRIBUTE_HANDLER(nodefaults)
+};
+
+#undef ATTRIBUTE_HANDLER
+
+uint64_t ARMAttributeParser::ParseInteger(const uint8_t *Data,
+ uint32_t &Offset) {
+ unsigned Length;
+ uint64_t Value = decodeULEB128(Data + Offset, &Length);
+ Offset = Offset + Length;
+ return Value;
+}
+
+StringRef ARMAttributeParser::ParseString(const uint8_t *Data,
+ uint32_t &Offset) {
+ const char *String = reinterpret_cast<const char*>(Data + Offset);
+ size_t Length = std::strlen(String);
+ Offset = Offset + Length + 1;
+ return StringRef(String, Length);
+}
+
+void ARMAttributeParser::IntegerAttribute(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ Attributes.insert(std::make_pair(Tag, Value));
+
+ if (SW)
+ SW->printNumber(ARMBuildAttrs::AttrTypeAsString(Tag), Value);
+}
+
+void ARMAttributeParser::StringAttribute(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ StringRef TagName = ARMBuildAttrs::AttrTypeAsString(Tag, /*TagPrefix*/false);
+ StringRef ValueDesc = ParseString(Data, Offset);
+
+ if (SW) {
+ DictScope AS(*SW, "Attribute");
+ SW->printNumber("Tag", Tag);
+ if (!TagName.empty())
+ SW->printString("TagName", TagName);
+ SW->printString("Value", ValueDesc);
+ }
+}
+
+void ARMAttributeParser::PrintAttribute(unsigned Tag, unsigned Value,
+ StringRef ValueDesc) {
+ Attributes.insert(std::make_pair(Tag, Value));
+
+ if (SW) {
+ StringRef TagName = ARMBuildAttrs::AttrTypeAsString(Tag,
+ /*TagPrefix*/false);
+ DictScope AS(*SW, "Attribute");
+ SW->printNumber("Tag", Tag);
+ SW->printNumber("Value", Value);
+ if (!TagName.empty())
+ SW->printString("TagName", TagName);
+ if (!ValueDesc.empty())
+ SW->printString("Description", ValueDesc);
+ }
+}
+
+void ARMAttributeParser::CPU_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Pre-v4", "ARM v4", "ARM v4T", "ARM v5T", "ARM v5TE", "ARM v5TEJ", "ARM v6",
+ "ARM v6KZ", "ARM v6T2", "ARM v6K", "ARM v7", "ARM v6-M", "ARM v6S-M",
+ "ARM v7E-M", "ARM v8"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::CPU_arch_profile(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ uint64_t Encoded = ParseInteger(Data, Offset);
+
+ StringRef Profile;
+ switch (Encoded) {
+ default: Profile = "Unknown"; break;
+ case 'A': Profile = "Application"; break;
+ case 'R': Profile = "Real-time"; break;
+ case 'M': Profile = "Microcontroller"; break;
+ case 'S': Profile = "Classic"; break;
+ case 0: Profile = "None"; break;
+ }
+
+ PrintAttribute(Tag, Encoded, Profile);
+}
+
+void ARMAttributeParser::ARM_ISA_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::THUMB_ISA_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Thumb-1", "Thumb-2" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::FP_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "VFPv1", "VFPv2", "VFPv3", "VFPv3-D16", "VFPv4",
+ "VFPv4-D16", "ARMv8-a FP", "ARMv8-a FP-D16"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::WMMX_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "WMMXv1", "WMMXv2" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::Advanced_SIMD_arch(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "NEONv1", "NEONv2+FMA", "ARMv8-a NEON", "ARMv8.1-a NEON"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::PCS_config(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "None", "Bare Platform", "Linux Application", "Linux DSO", "Palm OS 2004",
+ "Reserved (Palm OS)", "Symbian OS 2004", "Reserved (Symbian OS)"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_R9_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "v6", "Static Base", "TLS", "Unused" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_RW_data(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Absolute", "PC-relative", "SB-relative", "Not Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_RO_data(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Absolute", "PC-relative", "Not Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_GOT_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Direct", "GOT-Indirect"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_PCS_wchar_t(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Unknown", "2-byte", "Unknown", "4-byte"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_rounding(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "IEEE-754", "Runtime" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_denormal(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Unsupported", "IEEE-754", "Sign Only"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_exceptions(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "IEEE-754" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_user_exceptions(AttrType Tag,
+ const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "IEEE-754" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_number_model(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Finite Only", "RTABI", "IEEE-754"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_align_needed(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "8-byte alignment", "4-byte alignment", "Reserved"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+
+ std::string Description;
+ if (Value < array_lengthof(Strings))
+ Description = std::string(Strings[Value]);
+ else if (Value <= 12)
+ Description = std::string("8-byte alignment, ") + utostr(1ULL << Value)
+ + std::string("-byte extended alignment");
+ else
+ Description = "Invalid";
+
+ PrintAttribute(Tag, Value, Description);
+}
+
+void ARMAttributeParser::ABI_align_preserved(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Required", "8-byte data alignment", "8-byte data and code alignment",
+ "Reserved"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+
+ std::string Description;
+ if (Value < array_lengthof(Strings))
+ Description = std::string(Strings[Value]);
+ else if (Value <= 12)
+ Description = std::string("8-byte stack alignment, ") +
+ utostr(1ULL << Value) + std::string("-byte data alignment");
+ else
+ Description = "Invalid";
+
+ PrintAttribute(Tag, Value, Description);
+}
+
+void ARMAttributeParser::ABI_enum_size(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "Packed", "Int32", "External Int32"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_HardFP_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Tag_FP_arch", "Single-Precision", "Reserved", "Tag_FP_arch (deprecated)"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_VFP_args(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "AAPCS", "AAPCS VFP", "Custom", "Not Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_WMMX_args(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "AAPCS", "iWMMX", "Custom" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_optimization_goals(AttrType Tag,
+ const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "None", "Speed", "Aggressive Speed", "Size", "Aggressive Size", "Debugging",
+ "Best Debugging"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_optimization_goals(AttrType Tag,
+ const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "None", "Speed", "Aggressive Speed", "Size", "Aggressive Size", "Accuracy",
+ "Best Accuracy"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::compatibility(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ uint64_t Integer = ParseInteger(Data, Offset);
+ StringRef String = ParseString(Data, Offset);
+
+ if (SW) {
+ DictScope AS(*SW, "Attribute");
+ SW->printNumber("Tag", Tag);
+ SW->startLine() << "Value: " << Integer << ", " << String << '\n';
+ SW->printString("TagName", AttrTypeAsString(Tag, /*TagPrefix*/false));
+ switch (Integer) {
+ case 0:
+ SW->printString("Description", StringRef("No Specific Requirements"));
+ break;
+ case 1:
+ SW->printString("Description", StringRef("AEABI Conformant"));
+ break;
+ default:
+ SW->printString("Description", StringRef("AEABI Non-Conformant"));
+ break;
+ }
+ }
+}
+
+void ARMAttributeParser::CPU_unaligned_access(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "v6-style" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::FP_HP_extension(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "If Available", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::ABI_FP_16bit_format(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "IEEE-754", "VFPv3" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::MPextension_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::DIV_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "If Available", "Not Permitted", "Permitted"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::DSP_extension(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::T2EE_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = { "Not Permitted", "Permitted" };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::Virtualization_use(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ static const char *const Strings[] = {
+ "Not Permitted", "TrustZone", "Virtualization Extensions",
+ "TrustZone + Virtualization Extensions"
+ };
+
+ uint64_t Value = ParseInteger(Data, Offset);
+ StringRef ValueDesc =
+ (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
+ PrintAttribute(Tag, Value, ValueDesc);
+}
+
+void ARMAttributeParser::nodefaults(AttrType Tag, const uint8_t *Data,
+ uint32_t &Offset) {
+ uint64_t Value = ParseInteger(Data, Offset);
+ PrintAttribute(Tag, Value, "Unspecified Tags UNDEFINED");
+}
+
+void ARMAttributeParser::ParseIndexList(const uint8_t *Data, uint32_t &Offset,
+ SmallVectorImpl<uint8_t> &IndexList) {
+ for (;;) {
+ unsigned Length;
+ uint64_t Value = decodeULEB128(Data + Offset, &Length);
+ Offset = Offset + Length;
+ if (Value == 0)
+ break;
+ IndexList.push_back(Value);
+ }
+}
+
+void ARMAttributeParser::ParseAttributeList(const uint8_t *Data,
+ uint32_t &Offset, uint32_t Length) {
+ while (Offset < Length) {
+ unsigned Length;
+ uint64_t Tag = decodeULEB128(Data + Offset, &Length);
+ Offset += Length;
+
+ bool Handled = false;
+ for (unsigned AHI = 0, AHE = array_lengthof(DisplayRoutines);
+ AHI != AHE && !Handled; ++AHI) {
+ if (DisplayRoutines[AHI].Attribute == Tag) {
+ (this->*DisplayRoutines[AHI].Routine)(ARMBuildAttrs::AttrType(Tag),
+ Data, Offset);
+ Handled = true;
+ break;
+ }
+ }
+ if (!Handled) {
+ if (Tag < 32) {
+ errs() << "unhandled AEABI Tag " << Tag
+ << " (" << ARMBuildAttrs::AttrTypeAsString(Tag) << ")\n";
+ continue;
+ }
+
+ if (Tag % 2 == 0)
+ IntegerAttribute(ARMBuildAttrs::AttrType(Tag), Data, Offset);
+ else
+ StringAttribute(ARMBuildAttrs::AttrType(Tag), Data, Offset);
+ }
+ }
+}
+
+void ARMAttributeParser::ParseSubsection(const uint8_t *Data, uint32_t Length) {
+ uint32_t Offset = sizeof(uint32_t); /* SectionLength */
+
+ const char *VendorName = reinterpret_cast<const char*>(Data + Offset);
+ size_t VendorNameLength = std::strlen(VendorName);
+ Offset = Offset + VendorNameLength + 1;
+
+ if (SW) {
+ SW->printNumber("SectionLength", Length);
+ SW->printString("Vendor", StringRef(VendorName, VendorNameLength));
+ }
+
+ if (StringRef(VendorName, VendorNameLength).lower() != "aeabi") {
+ return;
+ }
+
+ while (Offset < Length) {
+ /// Tag_File | Tag_Section | Tag_Symbol uleb128:byte-size
+ uint8_t Tag = Data[Offset];
+ Offset = Offset + sizeof(Tag);
+
+ uint32_t Size =
+ *reinterpret_cast<const support::ulittle32_t*>(Data + Offset);
+ Offset = Offset + sizeof(Size);
+
+ if (SW) {
+ SW->printEnum("Tag", Tag, makeArrayRef(TagNames));
+ SW->printNumber("Size", Size);
+ }
+
+ if (Size > Length) {
+ errs() << "subsection length greater than section length\n";
+ return;
+ }
+
+ StringRef ScopeName, IndexName;
+    SmallVector<uint8_t, 8> Indices;
+ switch (Tag) {
+ case ARMBuildAttrs::File:
+ ScopeName = "FileAttributes";
+ break;
+ case ARMBuildAttrs::Section:
+ ScopeName = "SectionAttributes";
+ IndexName = "Sections";
+      ParseIndexList(Data, Offset, Indices);
+ break;
+ case ARMBuildAttrs::Symbol:
+ ScopeName = "SymbolAttributes";
+ IndexName = "Symbols";
+      ParseIndexList(Data, Offset, Indices);
+ break;
+ default:
+ errs() << "unrecognised tag: 0x" << utohexstr(Tag) << '\n';
+ return;
+ }
+
+ if (SW) {
+ DictScope ASS(*SW, ScopeName);
+      if (!Indices.empty())
+        SW->printList(IndexName, Indices);
+ ParseAttributeList(Data, Offset, Length);
+ } else {
+ ParseAttributeList(Data, Offset, Length);
+ }
+ }
+}
+
+void ARMAttributeParser::Parse(ArrayRef<uint8_t> Section, bool isLittle) {
+ size_t Offset = 1;
+ unsigned SectionNumber = 0;
+
+ while (Offset < Section.size()) {
+ uint32_t SectionLength = isLittle ?
+ support::endian::read32le(Section.data() + Offset) :
+ support::endian::read32be(Section.data() + Offset);
+
+ if (SW) {
+ SW->startLine() << "Section " << ++SectionNumber << " {\n";
+ SW->indent();
+ }
+
+ ParseSubsection(Section.data() + Offset, SectionLength);
+ Offset = Offset + SectionLength;
+
+ if (SW) {
+ SW->unindent();
+ SW->startLine() << "}\n";
+ }
+ }
+}
+}
+
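
The parser above leans on two encodings: attribute values are either ULEB128
integers or NUL-terminated strings, and for unrecognised tags
ParseAttributeList falls back on the AEABI convention that tags below 32 are
reported and skipped while even tags carry integers and odd tags carry
strings. A minimal sketch of the ULEB128 decoding that ParseInteger delegates
to llvm::decodeULEB128 (this shows the wire format, not LLVM's
implementation):

    #include <cstdint>
    #include <cstdio>

    // ULEB128: 7 value bits per byte, least significant group first; the
    // high bit is set on every byte except the last.
    static uint64_t decodeULEB128Sketch(const uint8_t *Data, unsigned *Length) {
      uint64_t Value = 0;
      unsigned Shift = 0, N = 0;
      uint8_t Byte;
      do {
        Byte = Data[N++];
        Value |= uint64_t(Byte & 0x7f) << Shift;
        Shift += 7;
      } while (Byte & 0x80);
      *Length = N;
      return Value;
    }

    int main() {
      const uint8_t Buf[] = {0xe5, 0x8e, 0x26}; // encodes 624485
      unsigned Len;
      std::printf("%llu (%u bytes)\n",
                  (unsigned long long)decodeULEB128Sketch(Buf, &Len), Len);
      return 0;
    }
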
diff --git a/contrib/llvm/lib/Support/ARMBuildAttrs.cpp b/contrib/llvm/lib/Support/ARMBuildAttrs.cpp
index 134ef8b..8f18e9e 100644
--- a/contrib/llvm/lib/Support/ARMBuildAttrs.cpp
+++ b/contrib/llvm/lib/Support/ARMBuildAttrs.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ARMBuildAttributes.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Support/Atomic.cpp b/contrib/llvm/lib/Support/Atomic.cpp
index 80550e2..55910c4 100644
--- a/contrib/llvm/lib/Support/Atomic.cpp
+++ b/contrib/llvm/lib/Support/Atomic.cpp
@@ -18,6 +18,8 @@ using namespace llvm;
#if defined(_MSC_VER)
#include <Intrin.h>
+
+// We must include windows.h after Intrin.h.
#include <windows.h>
#undef MemoryFence
#endif
diff --git a/contrib/llvm/lib/Support/BinaryStreamError.cpp b/contrib/llvm/lib/Support/BinaryStreamError.cpp
new file mode 100644
index 0000000..60f5e21
--- /dev/null
+++ b/contrib/llvm/lib/Support/BinaryStreamError.cpp
@@ -0,0 +1,56 @@
+//===- BinaryStreamError.cpp - Error extensions for streams -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+char BinaryStreamError::ID = 0;
+
+BinaryStreamError::BinaryStreamError(stream_error_code C)
+ : BinaryStreamError(C, "") {}
+
+BinaryStreamError::BinaryStreamError(StringRef Context)
+ : BinaryStreamError(stream_error_code::unspecified, Context) {}
+
+BinaryStreamError::BinaryStreamError(stream_error_code C, StringRef Context)
+ : Code(C) {
+ ErrMsg = "Stream Error: ";
+ switch (C) {
+ case stream_error_code::unspecified:
+ ErrMsg += "An unspecified error has occurred.";
+ break;
+ case stream_error_code::stream_too_short:
+ ErrMsg += "The stream is too short to perform the requested operation.";
+ break;
+ case stream_error_code::invalid_array_size:
+ ErrMsg += "The buffer size is not a multiple of the array element size.";
+ break;
+ case stream_error_code::invalid_offset:
+ ErrMsg += "The specified offset is invalid for the current stream.";
+ break;
+ case stream_error_code::filesystem_error:
+ ErrMsg += "An I/O error occurred on the file system.";
+ break;
+ }
+
+ if (!Context.empty()) {
+ ErrMsg += " ";
+ ErrMsg += Context;
+ }
+}
+
+void BinaryStreamError::log(raw_ostream &OS) const { OS << ErrMsg << "\n"; }
+
+StringRef BinaryStreamError::getErrorMessage() const { return ErrMsg; }
+
+std::error_code BinaryStreamError::convertToErrorCode() const {
+ return inconvertibleErrorCode();
+}
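
Since BinaryStreamError is an llvm::Error payload, callers consume it with the
usual Error machinery. A short usage sketch, not part of the commit (assumes
an LLVM tree with these headers):

    #include "llvm/Support/BinaryStreamError.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      // Build a typed stream error, then handle it by concrete type.
      Error E = make_error<BinaryStreamError>(
          stream_error_code::stream_too_short, "while reading header");
      handleAllErrors(std::move(E), [](const BinaryStreamError &BSE) {
        errs() << BSE.getErrorMessage() << '\n';
      });
      return 0;
    }
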
diff --git a/contrib/llvm/lib/Support/BinaryStreamReader.cpp b/contrib/llvm/lib/Support/BinaryStreamReader.cpp
new file mode 100644
index 0000000..e00527f
--- /dev/null
+++ b/contrib/llvm/lib/Support/BinaryStreamReader.cpp
@@ -0,0 +1,149 @@
+//===- BinaryStreamReader.cpp - Reads objects from a binary stream --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamReader.h"
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+using endianness = llvm::support::endianness;
+
+BinaryStreamReader::BinaryStreamReader(BinaryStreamRef Ref) : Stream(Ref) {}
+
+BinaryStreamReader::BinaryStreamReader(BinaryStream &Stream) : Stream(Stream) {}
+
+BinaryStreamReader::BinaryStreamReader(ArrayRef<uint8_t> Data,
+ endianness Endian)
+ : Stream(Data, Endian) {}
+
+BinaryStreamReader::BinaryStreamReader(StringRef Data, endianness Endian)
+ : Stream(Data, Endian) {}
+
+Error BinaryStreamReader::readLongestContiguousChunk(
+ ArrayRef<uint8_t> &Buffer) {
+ if (auto EC = Stream.readLongestContiguousChunk(Offset, Buffer))
+ return EC;
+ Offset += Buffer.size();
+ return Error::success();
+}
+
+Error BinaryStreamReader::readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size) {
+ if (auto EC = Stream.readBytes(Offset, Size, Buffer))
+ return EC;
+ Offset += Size;
+ return Error::success();
+}
+
+Error BinaryStreamReader::readCString(StringRef &Dest) {
+ uint32_t OriginalOffset = getOffset();
+ uint32_t FoundOffset = 0;
+ while (true) {
+ uint32_t ThisOffset = getOffset();
+ ArrayRef<uint8_t> Buffer;
+ if (auto EC = readLongestContiguousChunk(Buffer))
+ return EC;
+ StringRef S(reinterpret_cast<const char *>(Buffer.begin()), Buffer.size());
+ size_t Pos = S.find_first_of('\0');
+ if (LLVM_LIKELY(Pos != StringRef::npos)) {
+ FoundOffset = Pos + ThisOffset;
+ break;
+ }
+ }
+ assert(FoundOffset >= OriginalOffset);
+
+ setOffset(OriginalOffset);
+ size_t Length = FoundOffset - OriginalOffset;
+
+ if (auto EC = readFixedString(Dest, Length))
+ return EC;
+
+ // Now set the offset back to after the null terminator.
+ setOffset(FoundOffset + 1);
+ return Error::success();
+}
+
+Error BinaryStreamReader::readWideString(ArrayRef<UTF16> &Dest) {
+ uint32_t Length = 0;
+ uint32_t OriginalOffset = getOffset();
+ const UTF16 *C;
+ while (true) {
+ if (auto EC = readObject(C))
+ return EC;
+ if (*C == 0x0000)
+ break;
+ ++Length;
+ }
+ uint32_t NewOffset = getOffset();
+ setOffset(OriginalOffset);
+
+ if (auto EC = readArray(Dest, Length))
+ return EC;
+ setOffset(NewOffset);
+ return Error::success();
+}
+
+Error BinaryStreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
+ ArrayRef<uint8_t> Bytes;
+ if (auto EC = readBytes(Bytes, Length))
+ return EC;
+ Dest = StringRef(reinterpret_cast<const char *>(Bytes.begin()), Bytes.size());
+ return Error::success();
+}
+
+Error BinaryStreamReader::readStreamRef(BinaryStreamRef &Ref) {
+ return readStreamRef(Ref, bytesRemaining());
+}
+
+Error BinaryStreamReader::readStreamRef(BinaryStreamRef &Ref, uint32_t Length) {
+ if (bytesRemaining() < Length)
+ return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+ Ref = Stream.slice(Offset, Length);
+ Offset += Length;
+ return Error::success();
+}
+
+Error BinaryStreamReader::readSubstream(BinarySubstreamRef &Stream,
+ uint32_t Size) {
+ Stream.Offset = getOffset();
+ return readStreamRef(Stream.StreamData, Size);
+}
+
+Error BinaryStreamReader::skip(uint32_t Amount) {
+ if (Amount > bytesRemaining())
+ return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+ Offset += Amount;
+ return Error::success();
+}
+
+Error BinaryStreamReader::padToAlignment(uint32_t Align) {
+ uint32_t NewOffset = alignTo(Offset, Align);
+ return skip(NewOffset - Offset);
+}
+
+uint8_t BinaryStreamReader::peek() const {
+ ArrayRef<uint8_t> Buffer;
+ auto EC = Stream.readBytes(Offset, 1, Buffer);
+ assert(!EC && "Cannot peek an empty buffer!");
+ llvm::consumeError(std::move(EC));
+ return Buffer[0];
+}
+
+std::pair<BinaryStreamReader, BinaryStreamReader>
+BinaryStreamReader::split(uint32_t Off) const {
+ assert(getLength() >= Off);
+
+ BinaryStreamRef First = Stream.drop_front(Offset);
+
+ BinaryStreamRef Second = First.drop_front(Off);
+ First = First.keep_front(Off);
+ BinaryStreamReader W1{First};
+ BinaryStreamReader W2{Second};
+ return std::make_pair(W1, W2);
+}
\ No newline at end of file
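
A short usage sketch of the reader over an in-memory little-endian buffer
(illustrative only; cantFail keeps the example brief, real callers should
propagate the Error):

    #include "llvm/Support/BinaryStreamReader.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      const uint8_t Bytes[] = {0x2a, 0x00, 0x00, 0x00, // uint32_t 42
                               'h', 'i', '\0'};        // "hi"
      BinaryStreamReader Reader(makeArrayRef(Bytes), support::little);

      uint32_t Num;
      cantFail(Reader.readInteger(Num));   // advances past the 4 bytes
      StringRef Str;
      cantFail(Reader.readCString(Str));   // stops after the terminator

      outs() << Num << ' ' << Str << '\n'; // 42 hi
      return 0;
    }
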
diff --git a/contrib/llvm/lib/Support/BinaryStreamRef.cpp b/contrib/llvm/lib/Support/BinaryStreamRef.cpp
new file mode 100644
index 0000000..fe9a817
--- /dev/null
+++ b/contrib/llvm/lib/Support/BinaryStreamRef.cpp
@@ -0,0 +1,137 @@
+//===- BinaryStreamRef.cpp - ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/BinaryByteStream.h"
+
+using namespace llvm;
+using namespace llvm::support;
+
+namespace {
+
+class ArrayRefImpl : public BinaryStream {
+public:
+ ArrayRefImpl(ArrayRef<uint8_t> Data, endianness Endian) : BBS(Data, Endian) {}
+
+ llvm::support::endianness getEndian() const override {
+ return BBS.getEndian();
+ }
+ Error readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readBytes(Offset, Size, Buffer);
+ }
+ Error readLongestContiguousChunk(uint32_t Offset,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readLongestContiguousChunk(Offset, Buffer);
+ }
+ uint32_t getLength() override { return BBS.getLength(); }
+
+private:
+ BinaryByteStream BBS;
+};
+
+class MutableArrayRefImpl : public WritableBinaryStream {
+public:
+ MutableArrayRefImpl(MutableArrayRef<uint8_t> Data, endianness Endian)
+ : BBS(Data, Endian) {}
+
+ // Inherited via WritableBinaryStream
+ llvm::support::endianness getEndian() const override {
+ return BBS.getEndian();
+ }
+ Error readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readBytes(Offset, Size, Buffer);
+ }
+ Error readLongestContiguousChunk(uint32_t Offset,
+ ArrayRef<uint8_t> &Buffer) override {
+ return BBS.readLongestContiguousChunk(Offset, Buffer);
+ }
+ uint32_t getLength() override { return BBS.getLength(); }
+
+ Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) override {
+ return BBS.writeBytes(Offset, Data);
+ }
+ Error commit() override { return BBS.commit(); }
+
+private:
+ MutableBinaryByteStream BBS;
+};
+}
+
+BinaryStreamRef::BinaryStreamRef(BinaryStream &Stream)
+ : BinaryStreamRef(Stream, 0, Stream.getLength()) {}
+BinaryStreamRef::BinaryStreamRef(BinaryStream &Stream, uint32_t Offset,
+ uint32_t Length)
+ : BinaryStreamRefBase(Stream, Offset, Length) {}
+BinaryStreamRef::BinaryStreamRef(ArrayRef<uint8_t> Data, endianness Endian)
+ : BinaryStreamRefBase(std::make_shared<ArrayRefImpl>(Data, Endian), 0,
+ Data.size()) {}
+BinaryStreamRef::BinaryStreamRef(StringRef Data, endianness Endian)
+ : BinaryStreamRef(makeArrayRef(Data.bytes_begin(), Data.bytes_end()),
+ Endian) {}
+
+BinaryStreamRef::BinaryStreamRef(const BinaryStreamRef &Other)
+ : BinaryStreamRefBase(Other) {}
+
+Error BinaryStreamRef::readBytes(uint32_t Offset, uint32_t Size,
+ ArrayRef<uint8_t> &Buffer) const {
+ if (auto EC = checkOffset(Offset, Size))
+ return EC;
+ return BorrowedImpl->readBytes(ViewOffset + Offset, Size, Buffer);
+}
+
+Error BinaryStreamRef::readLongestContiguousChunk(
+ uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+ if (auto EC = checkOffset(Offset, 1))
+ return EC;
+
+ if (auto EC =
+ BorrowedImpl->readLongestContiguousChunk(ViewOffset + Offset, Buffer))
+ return EC;
+ // This StreamRef might refer to a smaller window over a larger stream. In
+ // that case we will have read out more bytes than we should return, because
+ // we should not read past the end of the current view.
+ uint32_t MaxLength = Length - Offset;
+ if (Buffer.size() > MaxLength)
+ Buffer = Buffer.slice(0, MaxLength);
+ return Error::success();
+}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(WritableBinaryStream &Stream)
+ : WritableBinaryStreamRef(Stream, 0, Stream.getLength()) {}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(WritableBinaryStream &Stream,
+ uint32_t Offset,
+ uint32_t Length)
+ : BinaryStreamRefBase(Stream, Offset, Length) {}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(MutableArrayRef<uint8_t> Data,
+ endianness Endian)
+ : BinaryStreamRefBase(std::make_shared<MutableArrayRefImpl>(Data, Endian),
+ 0, Data.size()) {}
+
+WritableBinaryStreamRef::WritableBinaryStreamRef(
+ const WritableBinaryStreamRef &Other)
+ : BinaryStreamRefBase(Other) {}
+
+Error WritableBinaryStreamRef::writeBytes(uint32_t Offset,
+ ArrayRef<uint8_t> Data) const {
+ if (auto EC = checkOffset(Offset, Data.size()))
+ return EC;
+
+ return BorrowedImpl->writeBytes(ViewOffset + Offset, Data);
+}
+
+WritableBinaryStreamRef::operator BinaryStreamRef() const {
+ return BinaryStreamRef(*BorrowedImpl, ViewOffset, Length);
+}
+
+/// \brief For buffered streams, commits changes to the backing store.
+Error WritableBinaryStreamRef::commit() { return BorrowedImpl->commit(); }
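
The clamping in readLongestContiguousChunk is what makes sliced refs safe: a
view never hands back bytes past its own length, even when the underlying
stream continues. A sketch of that behaviour (illustrative, not part of the
commit):

    #include "llvm/Support/BinaryStreamRef.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      const uint8_t Bytes[] = {0, 1, 2, 3, 4, 5, 6, 7};
      BinaryStreamRef Whole(makeArrayRef(Bytes), support::little);
      BinaryStreamRef Window = Whole.slice(2, 4); // a view of bytes 2..5

      ArrayRef<uint8_t> Chunk;
      cantFail(Window.readLongestContiguousChunk(0, Chunk));
      outs() << Chunk.size() << '\n'; // 4, not 6: reads stop at the view's end
      return 0;
    }
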
diff --git a/contrib/llvm/lib/Support/BinaryStreamWriter.cpp b/contrib/llvm/lib/Support/BinaryStreamWriter.cpp
new file mode 100644
index 0000000..c427651
--- /dev/null
+++ b/contrib/llvm/lib/Support/BinaryStreamWriter.cpp
@@ -0,0 +1,90 @@
+//===- BinaryStreamWriter.cpp - Writes objects to a BinaryStream ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamWriter.h"
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+
+BinaryStreamWriter::BinaryStreamWriter(WritableBinaryStreamRef Ref)
+ : Stream(Ref) {}
+
+BinaryStreamWriter::BinaryStreamWriter(WritableBinaryStream &Stream)
+ : Stream(Stream) {}
+
+BinaryStreamWriter::BinaryStreamWriter(MutableArrayRef<uint8_t> Data,
+ llvm::support::endianness Endian)
+ : Stream(Data, Endian) {}
+
+Error BinaryStreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
+ if (auto EC = Stream.writeBytes(Offset, Buffer))
+ return EC;
+ Offset += Buffer.size();
+ return Error::success();
+}
+
+Error BinaryStreamWriter::writeCString(StringRef Str) {
+ if (auto EC = writeFixedString(Str))
+ return EC;
+ if (auto EC = writeObject('\0'))
+ return EC;
+
+ return Error::success();
+}
+
+Error BinaryStreamWriter::writeFixedString(StringRef Str) {
+ return writeBytes(ArrayRef<uint8_t>(Str.bytes_begin(), Str.bytes_end()));
+}
+
+Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref) {
+ return writeStreamRef(Ref, Ref.getLength());
+}
+
+Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref, uint32_t Length) {
+ BinaryStreamReader SrcReader(Ref.slice(0, Length));
+ // This is a bit tricky. If we just call readBytes, we are requiring that it
+ // return us the entire stream as a contiguous buffer. There is no guarantee
+ // this can be satisfied by returning a reference straight from the buffer, as
+ // an implementation may not store all data in a single contiguous buffer. So
+ // we iterate over each contiguous chunk, writing each one in succession.
+ while (SrcReader.bytesRemaining() > 0) {
+ ArrayRef<uint8_t> Chunk;
+ if (auto EC = SrcReader.readLongestContiguousChunk(Chunk))
+ return EC;
+ if (auto EC = writeBytes(Chunk))
+ return EC;
+ }
+ return Error::success();
+}
+
+std::pair<BinaryStreamWriter, BinaryStreamWriter>
+BinaryStreamWriter::split(uint32_t Off) const {
+ assert(getLength() >= Off);
+
+ WritableBinaryStreamRef First = Stream.drop_front(Offset);
+
+ WritableBinaryStreamRef Second = First.drop_front(Off);
+ First = First.keep_front(Off);
+ BinaryStreamWriter W1{First};
+ BinaryStreamWriter W2{Second};
+ return std::make_pair(W1, W2);
+}
+
+Error BinaryStreamWriter::padToAlignment(uint32_t Align) {
+ uint32_t NewOffset = alignTo(Offset, Align);
+ if (NewOffset > getLength())
+ return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+ while (Offset < NewOffset)
+ if (auto EC = writeInteger('\0'))
+ return EC;
+ return Error::success();
+}
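
Writer and reader compose naturally over the same buffer. A round-trip sketch
(illustrative; errors are cantFail'ed only for brevity):

    #include "llvm/Support/BinaryStreamReader.h"
    #include "llvm/Support/BinaryStreamWriter.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      uint8_t Buffer[16] = {};
      BinaryStreamWriter Writer(MutableArrayRef<uint8_t>(Buffer),
                                support::little);
      cantFail(Writer.writeInteger<uint32_t>(42)); // 4 bytes, little-endian
      cantFail(Writer.writeCString("ok"));         // bytes plus terminator

      BinaryStreamReader Reader(makeArrayRef(Buffer), support::little);
      uint32_t Value;
      StringRef Str;
      cantFail(Reader.readInteger(Value));
      cantFail(Reader.readCString(Str));
      outs() << Value << ' ' << Str << '\n'; // 42 ok
      return 0;
    }
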
diff --git a/contrib/llvm/lib/Support/BranchProbability.cpp b/contrib/llvm/lib/Support/BranchProbability.cpp
index 1c41659..44ad110 100644
--- a/contrib/llvm/lib/Support/BranchProbability.cpp
+++ b/contrib/llvm/lib/Support/BranchProbability.cpp
@@ -32,7 +32,9 @@ raw_ostream &BranchProbability::print(raw_ostream &OS) const {
Percent);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void BranchProbability::dump() const { print(dbgs()) << '\n'; }
+#endif
BranchProbability::BranchProbability(uint32_t Numerator, uint32_t Denominator) {
assert(Denominator > 0 && "Denominator cannot be 0!");
diff --git a/contrib/llvm/lib/Support/CachePruning.cpp b/contrib/llvm/lib/Support/CachePruning.cpp
index 3831625..60d0964 100644
--- a/contrib/llvm/lib/Support/CachePruning.cpp
+++ b/contrib/llvm/lib/Support/CachePruning.cpp
@@ -15,6 +15,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
@@ -33,8 +34,96 @@ static void writeTimestampFile(StringRef TimestampFile) {
raw_fd_ostream Out(TimestampFile.str(), EC, sys::fs::F_None);
}
+static Expected<std::chrono::seconds> parseDuration(StringRef Duration) {
+ if (Duration.empty())
+ return make_error<StringError>("Duration must not be empty",
+ inconvertibleErrorCode());
+
+ StringRef NumStr = Duration.slice(0, Duration.size()-1);
+ uint64_t Num;
+ if (NumStr.getAsInteger(0, Num))
+ return make_error<StringError>("'" + NumStr + "' not an integer",
+ inconvertibleErrorCode());
+
+ switch (Duration.back()) {
+ case 's':
+ return std::chrono::seconds(Num);
+ case 'm':
+ return std::chrono::minutes(Num);
+ case 'h':
+ return std::chrono::hours(Num);
+ default:
+ return make_error<StringError>("'" + Duration +
+ "' must end with one of 's', 'm' or 'h'",
+ inconvertibleErrorCode());
+ }
+}
+
+Expected<CachePruningPolicy>
+llvm::parseCachePruningPolicy(StringRef PolicyStr) {
+ CachePruningPolicy Policy;
+ std::pair<StringRef, StringRef> P = {"", PolicyStr};
+ while (!P.second.empty()) {
+ P = P.second.split(':');
+
+ StringRef Key, Value;
+ std::tie(Key, Value) = P.first.split('=');
+ if (Key == "prune_interval") {
+ auto DurationOrErr = parseDuration(Value);
+ if (!DurationOrErr)
+ return DurationOrErr.takeError();
+ Policy.Interval = *DurationOrErr;
+ } else if (Key == "prune_after") {
+ auto DurationOrErr = parseDuration(Value);
+ if (!DurationOrErr)
+ return DurationOrErr.takeError();
+ Policy.Expiration = *DurationOrErr;
+ } else if (Key == "cache_size") {
+ if (Value.back() != '%')
+ return make_error<StringError>("'" + Value + "' must be a percentage",
+ inconvertibleErrorCode());
+ StringRef SizeStr = Value.drop_back();
+ uint64_t Size;
+ if (SizeStr.getAsInteger(0, Size))
+ return make_error<StringError>("'" + SizeStr + "' not an integer",
+ inconvertibleErrorCode());
+ if (Size > 100)
+ return make_error<StringError>("'" + SizeStr +
+ "' must be between 0 and 100",
+ inconvertibleErrorCode());
+ Policy.MaxSizePercentageOfAvailableSpace = Size;
+ } else if (Key == "cache_size_bytes") {
+ uint64_t Mult = 1;
+ switch (tolower(Value.back())) {
+ case 'k':
+ Mult = 1024;
+ Value = Value.drop_back();
+ break;
+ case 'm':
+ Mult = 1024 * 1024;
+ Value = Value.drop_back();
+ break;
+ case 'g':
+ Mult = 1024 * 1024 * 1024;
+ Value = Value.drop_back();
+ break;
+ }
+ uint64_t Size;
+ if (Value.getAsInteger(0, Size))
+ return make_error<StringError>("'" + Value + "' not an integer",
+ inconvertibleErrorCode());
+ Policy.MaxSizeBytes = Size * Mult;
+ } else {
+ return make_error<StringError>("Unknown key: '" + Key + "'",
+ inconvertibleErrorCode());
+ }
+ }
+
+ return Policy;
+}
+
/// Prune the cache of files that haven't been accessed in a long time.
-bool CachePruning::prune() {
+bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
using namespace std::chrono;
if (Path.empty())
@@ -47,7 +136,12 @@ bool CachePruning::prune() {
if (!isPathDir)
return false;
- if (Expiration == seconds(0) && PercentageOfAvailableSpace == 0) {
+ Policy.MaxSizePercentageOfAvailableSpace =
+ std::min(Policy.MaxSizePercentageOfAvailableSpace, 100u);
+
+ if (Policy.Expiration == seconds(0) &&
+ Policy.MaxSizePercentageOfAvailableSpace == 0 &&
+ Policy.MaxSizeBytes == 0) {
DEBUG(dbgs() << "No pruning settings set, exit early\n");
// Nothing will be pruned, early exit
return false;
@@ -67,12 +161,12 @@ bool CachePruning::prune() {
return false;
}
} else {
- if (Interval == seconds(0)) {
+ if (Policy.Interval == seconds(0)) {
// Check whether the time stamp is older than our pruning interval.
// If not, do nothing.
const auto TimeStampModTime = FileStatus.getLastModificationTime();
auto TimeStampAge = CurrentTime - TimeStampModTime;
- if (TimeStampAge <= Interval) {
+ if (TimeStampAge <= Policy.Interval) {
DEBUG(dbgs() << "Timestamp file too recent ("
<< duration_cast<seconds>(TimeStampAge).count()
<< "s old), do not prune.\n");
@@ -85,7 +179,8 @@ bool CachePruning::prune() {
writeTimestampFile(TimestampFile);
}
- bool ShouldComputeSize = (PercentageOfAvailableSpace > 0);
+ bool ShouldComputeSize =
+ (Policy.MaxSizePercentageOfAvailableSpace > 0 || Policy.MaxSizeBytes > 0);
// Keep track of space
std::set<std::pair<uint64_t, std::string>> FileSizes;
@@ -108,8 +203,11 @@ bool CachePruning::prune() {
// Walk all of the files within this directory.
for (sys::fs::directory_iterator File(CachePathNative, EC), FileEnd;
File != FileEnd && !EC; File.increment(EC)) {
- // Do not touch the timestamp.
- if (File->path() == TimestampFile)
+ // Ignore any files not beginning with the string "llvmcache-". This
+ // includes the timestamp file as well as any files created by the user.
+ // This acts as a safeguard against data loss if the user specifies the
+ // wrong directory as their cache directory.
+ if (!sys::path::filename(File->path()).startswith("llvmcache-"))
continue;
// Look at this file. If we can't stat it, there's nothing interesting
@@ -122,7 +220,7 @@ bool CachePruning::prune() {
// If the file hasn't been used recently enough, delete it
const auto FileAccessTime = FileStatus.getLastAccessedTime();
auto FileAge = CurrentTime - FileAccessTime;
- if (FileAge > Expiration) {
+ if (FileAge > Policy.Expiration) {
DEBUG(dbgs() << "Remove " << File->path() << " ("
<< duration_cast<seconds>(FileAge).count() << "s old)\n");
sys::fs::remove(File->path());
@@ -141,12 +239,22 @@ bool CachePruning::prune() {
}
sys::fs::space_info SpaceInfo = ErrOrSpaceInfo.get();
auto AvailableSpace = TotalSize + SpaceInfo.free;
- auto FileAndSize = FileSizes.rbegin();
+
+ if (Policy.MaxSizePercentageOfAvailableSpace == 0)
+ Policy.MaxSizePercentageOfAvailableSpace = 100;
+ if (Policy.MaxSizeBytes == 0)
+ Policy.MaxSizeBytes = AvailableSpace;
+ auto TotalSizeTarget = std::min<uint64_t>(
+ AvailableSpace * Policy.MaxSizePercentageOfAvailableSpace / 100ull,
+ Policy.MaxSizeBytes);
+
DEBUG(dbgs() << "Occupancy: " << ((100 * TotalSize) / AvailableSpace)
- << "% target is: " << PercentageOfAvailableSpace << "\n");
+ << "% target is: " << Policy.MaxSizePercentageOfAvailableSpace
+ << "%, " << Policy.MaxSizeBytes << " bytes\n");
+
+ auto FileAndSize = FileSizes.rbegin();
// Remove the oldest accessed files first, till we get below the threshold
- while (((100 * TotalSize) / AvailableSpace) > PercentageOfAvailableSpace &&
- FileAndSize != FileSizes.rend()) {
+ while (TotalSize > TotalSizeTarget && FileAndSize != FileSizes.rend()) {
// Remove the file.
sys::fs::remove(FileAndSize->second);
// Update size
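
The policy string parsed above is a colon-separated list of key=value pairs:
prune_interval and prune_after take durations suffixed with s/m/h, cache_size
takes a percentage, and cache_size_bytes takes a byte count with an optional
k/m/g multiplier. A driving sketch (the cache path here is illustrative):

    #include "llvm/Support/CachePruning.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    int main() {
      Expected<CachePruningPolicy> P = parseCachePruningPolicy(
          "prune_interval=30m:prune_after=24h:cache_size=75%");
      if (!P) {
        logAllUnhandledErrors(P.takeError(), errs(), "policy: ");
        return 1;
      }
      // Prunes files named llvmcache-* under the given directory.
      pruneCache("/tmp/my-llvm-cache", *P);
      return 0;
    }
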
diff --git a/contrib/llvm/lib/Support/Chrono.cpp b/contrib/llvm/lib/Support/Chrono.cpp
index cdadbd8..daccaf1 100644
--- a/contrib/llvm/lib/Support/Chrono.cpp
+++ b/contrib/llvm/lib/Support/Chrono.cpp
@@ -16,6 +16,13 @@ namespace llvm {
using namespace sys;
+const char llvm::detail::unit<std::ratio<3600>>::value[] = "h";
+const char llvm::detail::unit<std::ratio<60>>::value[] = "m";
+const char llvm::detail::unit<std::ratio<1>>::value[] = "s";
+const char llvm::detail::unit<std::milli>::value[] = "ms";
+const char llvm::detail::unit<std::micro>::value[] = "us";
+const char llvm::detail::unit<std::nano>::value[] = "ns";
+
static inline struct tm getStructTM(TimePoint<> TP) {
struct tm Storage;
std::time_t OurTime = toTimeT(TP);
diff --git a/contrib/llvm/lib/Support/CommandLine.cpp b/contrib/llvm/lib/Support/CommandLine.cpp
index 3889902..8eeb685 100644
--- a/contrib/llvm/lib/Support/CommandLine.cpp
+++ b/contrib/llvm/lib/Support/CommandLine.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h"
@@ -123,7 +124,7 @@ public:
void ResetAllOptionOccurrences();
bool ParseCommandLineOptions(int argc, const char *const *argv,
- StringRef Overview, bool IgnoreErrors);
+ StringRef Overview, raw_ostream *Errs = nullptr);
void addLiteralOption(Option &Opt, SubCommand *SC, StringRef Name) {
if (Opt.hasArgStr())
@@ -1013,9 +1014,9 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
}
bool cl::ParseCommandLineOptions(int argc, const char *const *argv,
- StringRef Overview, bool IgnoreErrors) {
+ StringRef Overview, raw_ostream *Errs) {
return GlobalParser->ParseCommandLineOptions(argc, argv, Overview,
- IgnoreErrors);
+ Errs);
}
void CommandLineParser::ResetAllOptionOccurrences() {
@@ -1030,7 +1031,7 @@ void CommandLineParser::ResetAllOptionOccurrences() {
bool CommandLineParser::ParseCommandLineOptions(int argc,
const char *const *argv,
StringRef Overview,
- bool IgnoreErrors) {
+ raw_ostream *Errs) {
assert(hasOptions() && "No options specified!");
// Expand response files.
@@ -1045,6 +1046,9 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
ProgramName = sys::path::filename(StringRef(argv[0]));
ProgramOverview = Overview;
+ bool IgnoreErrors = Errs;
+ if (!Errs)
+ Errs = &errs();
bool ErrorParsing = false;
// Check out the positional arguments to collect information about them.
@@ -1097,15 +1101,14 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// not specified after an option that eats all extra arguments, or this
// one will never get any!
//
- if (!IgnoreErrors) {
+ if (!IgnoreErrors)
Opt->error("error - option can never match, because "
"another positional argument will match an "
"unbounded number of values, and this option"
" does not require a value!");
- errs() << ProgramName << ": CommandLine Error: Option '"
- << Opt->ArgStr << "' is all messed up!\n";
- errs() << PositionalOpts.size();
- }
+ *Errs << ProgramName << ": CommandLine Error: Option '" << Opt->ArgStr
+ << "' is all messed up!\n";
+ *Errs << PositionalOpts.size();
ErrorParsing = true;
}
UnboundedFound |= EatsUnboundedNumberOfValues(Opt);
@@ -1200,15 +1203,13 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
if (!Handler) {
if (SinkOpts.empty()) {
- if (!IgnoreErrors) {
- errs() << ProgramName << ": Unknown command line argument '"
- << argv[i] << "'. Try: '" << argv[0] << " -help'\n";
-
- if (NearestHandler) {
- // If we know a near match, report it as well.
- errs() << ProgramName << ": Did you mean '-" << NearestHandlerString
- << "'?\n";
- }
+ *Errs << ProgramName << ": Unknown command line argument '" << argv[i]
+ << "'. Try: '" << argv[0] << " -help'\n";
+
+ if (NearestHandler) {
+ // If we know a near match, report it as well.
+ *Errs << ProgramName << ": Did you mean '-" << NearestHandlerString
+ << "'?\n";
}
ErrorParsing = true;
@@ -1231,22 +1232,18 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
// Check and handle positional arguments now...
if (NumPositionalRequired > PositionalVals.size()) {
- if (!IgnoreErrors) {
- errs() << ProgramName
+ *Errs << ProgramName
<< ": Not enough positional command line arguments specified!\n"
<< "Must specify at least " << NumPositionalRequired
<< " positional argument" << (NumPositionalRequired > 1 ? "s" : "")
- << ": See: " << argv[0] << " - help\n";
- }
+ << ": See: " << argv[0] << " -help\n";
ErrorParsing = true;
} else if (!HasUnlimitedPositionals &&
PositionalVals.size() > PositionalOpts.size()) {
- if (!IgnoreErrors) {
- errs() << ProgramName << ": Too many positional arguments specified!\n"
- << "Can specify at most " << PositionalOpts.size()
- << " positional arguments: See: " << argv[0] << " -help\n";
- }
+ *Errs << ProgramName << ": Too many positional arguments specified!\n"
+ << "Can specify at most " << PositionalOpts.size()
+ << " positional arguments: See: " << argv[0] << " -help\n";
ErrorParsing = true;
} else if (!ConsumeAfterOpt) {
@@ -1404,8 +1401,8 @@ static StringRef getValueStr(const Option &O, StringRef DefaultMsg) {
// Return the width of the option tag for printing...
size_t alias::getOptionWidth() const { return ArgStr.size() + 6; }
-static void printHelpStr(StringRef HelpStr, size_t Indent,
- size_t FirstLineIndentedBy) {
+void Option::printHelpStr(StringRef HelpStr, size_t Indent,
+ size_t FirstLineIndentedBy) {
std::pair<StringRef, StringRef> Split = HelpStr.split('\n');
outs().indent(Indent - FirstLineIndentedBy) << " - " << Split.first << "\n";
while (!Split.second.empty()) {
@@ -1448,7 +1445,7 @@ void basic_parser_impl::printOptionInfo(const Option &O,
if (!ValName.empty())
outs() << "=<" << getValueStr(O, ValName) << '>';
- printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
+ Option::printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
}
void basic_parser_impl::printOptionName(const Option &O,
@@ -1526,13 +1523,9 @@ bool parser<unsigned long long>::parse(Option &O, StringRef ArgName,
// parser<double>/parser<float> implementation
//
static bool parseDouble(Option &O, StringRef Arg, double &Value) {
- SmallString<32> TmpStr(Arg.begin(), Arg.end());
- const char *ArgStart = TmpStr.c_str();
- char *End;
- Value = strtod(ArgStart, &End);
- if (*End != 0)
- return O.error("'" + Arg + "' value invalid for floating point argument!");
- return false;
+ if (to_float(Arg, Value))
+ return false;
+ return O.error("'" + Arg + "' value invalid for floating point argument!");
}
bool parser<double>::parse(Option &O, StringRef ArgName, StringRef Arg,
@@ -1587,7 +1580,7 @@ void generic_parser_base::printOptionInfo(const Option &O,
size_t GlobalWidth) const {
if (O.hasArgStr()) {
outs() << " -" << O.ArgStr;
- printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6);
+ Option::printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6);
for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
size_t NumSpaces = GlobalWidth - getOption(i).size() - 8;
@@ -1600,7 +1593,7 @@ void generic_parser_base::printOptionInfo(const Option &O,
for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
auto Option = getOption(i);
outs() << " -" << Option;
- printHelpStr(getDescription(i), GlobalWidth, Option.size() + 8);
+ Option::printHelpStr(getDescription(i), GlobalWidth, Option.size() + 8);
}
}
}
@@ -1856,10 +1849,11 @@ public:
// Helper function for printOptions().
// It shall return a negative value if A's name should be lexicographically
- // ordered before B's name. It returns a value greater equal zero otherwise.
+ // ordered before B's name. It returns a value greater than zero if B's name
+ // should be ordered before A's name, and it returns 0 otherwise.
static int OptionCategoryCompare(OptionCategory *const *A,
OptionCategory *const *B) {
- return (*A)->getName() == (*B)->getName();
+ return (*A)->getName().compare((*B)->getName());
}
// Make sure we inherit our base class's operator=()
@@ -2072,12 +2066,15 @@ public:
#ifndef NDEBUG
OS << " with assertions";
#endif
+#if LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO
std::string CPU = sys::getHostCPUName();
if (CPU == "generic")
CPU = "(unknown)";
OS << ".\n"
<< " Default target: " << sys::getDefaultTargetTriple() << '\n'
- << " Host CPU: " << CPU << '\n';
+ << " Host CPU: " << CPU;
+#endif
+ OS << '\n';
}
void operator=(bool OptionWasSpecified) {
if (!OptionWasSpecified)
@@ -2182,5 +2179,6 @@ void cl::ResetAllOptionOccurrences() {
void LLVMParseCommandLineOptions(int argc, const char *const *argv,
const char *Overview) {
- llvm::cl::ParseCommandLineOptions(argc, argv, StringRef(Overview), true);
+ llvm::cl::ParseCommandLineOptions(argc, argv, StringRef(Overview),
+ &llvm::nulls());
}
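
The old IgnoreErrors flag is thus folded into the optional stream: passing a
raw_ostream both redirects diagnostics and makes parse failures non-fatal
(the C API above passes nulls() to keep its old silent behaviour). A sketch
that captures diagnostics into a string instead of printing them:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    static cl::opt<std::string> Input(cl::Positional, cl::desc("<input>"));

    int main(int argc, const char **argv) {
      SmallString<128> Buf;
      raw_svector_ostream Diags(Buf); // collects any parse diagnostics
      if (!cl::ParseCommandLineOptions(argc, argv, "example", &Diags)) {
        errs() << "argument parsing failed:\n" << Diags.str();
        return 1;
      }
      outs() << "input: " << Input << '\n';
      return 0;
    }
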
diff --git a/contrib/llvm/lib/Support/Compression.cpp b/contrib/llvm/lib/Support/Compression.cpp
index 5d55646..c279d10 100644
--- a/contrib/llvm/lib/Support/Compression.cpp
+++ b/contrib/llvm/lib/Support/Compression.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#if LLVM_ENABLE_ZLIB == 1 && HAVE_ZLIB_H
#include <zlib.h>
@@ -24,6 +25,10 @@
using namespace llvm;
#if LLVM_ENABLE_ZLIB == 1 && HAVE_LIBZ
+static Error createError(StringRef Err) {
+ return make_error<StringError>(Err, inconvertibleErrorCode());
+}
+
static int encodeZlibCompressionLevel(zlib::CompressionLevel Level) {
switch (Level) {
case zlib::NoCompression: return 0;
@@ -34,53 +39,59 @@ static int encodeZlibCompressionLevel(zlib::CompressionLevel Level) {
llvm_unreachable("Invalid zlib::CompressionLevel!");
}
-static zlib::Status encodeZlibReturnValue(int ReturnValue) {
- switch (ReturnValue) {
- case Z_OK: return zlib::StatusOK;
- case Z_MEM_ERROR: return zlib::StatusOutOfMemory;
- case Z_BUF_ERROR: return zlib::StatusBufferTooShort;
- case Z_STREAM_ERROR: return zlib::StatusInvalidArg;
- case Z_DATA_ERROR: return zlib::StatusInvalidData;
- default: llvm_unreachable("unknown zlib return status!");
+static StringRef convertZlibCodeToString(int Code) {
+ switch (Code) {
+ case Z_MEM_ERROR:
+ return "zlib error: Z_MEM_ERROR";
+ case Z_BUF_ERROR:
+ return "zlib error: Z_BUF_ERROR";
+ case Z_STREAM_ERROR:
+ return "zlib error: Z_STREAM_ERROR";
+ case Z_DATA_ERROR:
+ return "zlib error: Z_DATA_ERROR";
+ case Z_OK:
+ default:
+ llvm_unreachable("unknown or unexpected zlib status code");
}
}
bool zlib::isAvailable() { return true; }
-zlib::Status zlib::compress(StringRef InputBuffer,
- SmallVectorImpl<char> &CompressedBuffer,
- CompressionLevel Level) {
+
+Error zlib::compress(StringRef InputBuffer,
+ SmallVectorImpl<char> &CompressedBuffer,
+ CompressionLevel Level) {
unsigned long CompressedSize = ::compressBound(InputBuffer.size());
CompressedBuffer.resize(CompressedSize);
int CLevel = encodeZlibCompressionLevel(Level);
- Status Res = encodeZlibReturnValue(::compress2(
- (Bytef *)CompressedBuffer.data(), &CompressedSize,
- (const Bytef *)InputBuffer.data(), InputBuffer.size(), CLevel));
+ int Res = ::compress2((Bytef *)CompressedBuffer.data(), &CompressedSize,
+ (const Bytef *)InputBuffer.data(), InputBuffer.size(),
+ CLevel);
// Tell MemorySanitizer that zlib output buffer is fully initialized.
// This avoids a false report when running LLVM with uninstrumented ZLib.
__msan_unpoison(CompressedBuffer.data(), CompressedSize);
CompressedBuffer.resize(CompressedSize);
- return Res;
+ return Res ? createError(convertZlibCodeToString(Res)) : Error::success();
}
-zlib::Status zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
- size_t &UncompressedSize) {
- Status Res = encodeZlibReturnValue(
+Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
+ size_t &UncompressedSize) {
+ int Res =
::uncompress((Bytef *)UncompressedBuffer, (uLongf *)&UncompressedSize,
- (const Bytef *)InputBuffer.data(), InputBuffer.size()));
+ (const Bytef *)InputBuffer.data(), InputBuffer.size());
// Tell MemorySanitizer that zlib output buffer is fully initialized.
// This avoids a false report when running LLVM with uninstrumented ZLib.
__msan_unpoison(UncompressedBuffer, UncompressedSize);
- return Res;
+ return Res ? createError(convertZlibCodeToString(Res)) : Error::success();
}
-zlib::Status zlib::uncompress(StringRef InputBuffer,
- SmallVectorImpl<char> &UncompressedBuffer,
- size_t UncompressedSize) {
+Error zlib::uncompress(StringRef InputBuffer,
+ SmallVectorImpl<char> &UncompressedBuffer,
+ size_t UncompressedSize) {
UncompressedBuffer.resize(UncompressedSize);
- Status Res =
+ Error E =
uncompress(InputBuffer, UncompressedBuffer.data(), UncompressedSize);
UncompressedBuffer.resize(UncompressedSize);
- return Res;
+ return E;
}
uint32_t zlib::crc32(StringRef Buffer) {
@@ -89,19 +100,19 @@ uint32_t zlib::crc32(StringRef Buffer) {
#else
bool zlib::isAvailable() { return false; }
-zlib::Status zlib::compress(StringRef InputBuffer,
- SmallVectorImpl<char> &CompressedBuffer,
- CompressionLevel Level) {
- return zlib::StatusUnsupported;
+Error zlib::compress(StringRef InputBuffer,
+ SmallVectorImpl<char> &CompressedBuffer,
+ CompressionLevel Level) {
+ llvm_unreachable("zlib::compress is unavailable");
}
-zlib::Status zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
- size_t &UncompressedSize) {
- return zlib::StatusUnsupported;
+Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
+ size_t &UncompressedSize) {
+ llvm_unreachable("zlib::uncompress is unavailable");
}
-zlib::Status zlib::uncompress(StringRef InputBuffer,
- SmallVectorImpl<char> &UncompressedBuffer,
- size_t UncompressedSize) {
- return zlib::StatusUnsupported;
+Error zlib::uncompress(StringRef InputBuffer,
+ SmallVectorImpl<char> &UncompressedBuffer,
+ size_t UncompressedSize) {
+ llvm_unreachable("zlib::uncompress is unavailable");
}
uint32_t zlib::crc32(StringRef Buffer) {
llvm_unreachable("zlib::crc32 is unavailable");
diff --git a/contrib/llvm/lib/Support/ConvertUTF.cpp b/contrib/llvm/lib/Support/ConvertUTF.cpp
index 39fd218..e56854a 100644
--- a/contrib/llvm/lib/Support/ConvertUTF.cpp
+++ b/contrib/llvm/lib/Support/ConvertUTF.cpp
@@ -46,13 +46,40 @@
------------------------------------------------------------------------ */
-
#include "llvm/Support/ConvertUTF.h"
#ifdef CVTUTF_DEBUG
#include <stdio.h>
#endif
#include <assert.h>
+/*
+ * This code extensively uses fall-through switches.
+ * Keep the compiler from warning about that.
+ */
+#if defined(__clang__) && defined(__has_warning)
+# if __has_warning("-Wimplicit-fallthrough")
+# define ConvertUTF_DISABLE_WARNINGS \
+ _Pragma("clang diagnostic push") \
+ _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
+# define ConvertUTF_RESTORE_WARNINGS \
+ _Pragma("clang diagnostic pop")
+# endif
+#elif defined(__GNUC__) && __GNUC__ > 6
+# define ConvertUTF_DISABLE_WARNINGS \
+ _Pragma("GCC diagnostic push") \
+ _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
+# define ConvertUTF_RESTORE_WARNINGS \
+ _Pragma("GCC diagnostic pop")
+#endif
+#ifndef ConvertUTF_DISABLE_WARNINGS
+# define ConvertUTF_DISABLE_WARNINGS
+#endif
+#ifndef ConvertUTF_RESTORE_WARNINGS
+# define ConvertUTF_RESTORE_WARNINGS
+#endif
+
+ConvertUTF_DISABLE_WARNINGS
+
namespace llvm {
static const int halfShift = 10; /* used for shifting by 10 bits */
@@ -708,3 +735,5 @@ ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
--------------------------------------------------------------------- */
} // namespace llvm
+
+ConvertUTF_RESTORE_WARNINGS
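
The block above defines compiler-conditional pragmas so the file's intentional switch fall-throughs do not trip -Wimplicit-fallthrough, expanding to nothing on compilers without that warning. A self-contained sketch of the same pattern; the DEMO_* names and the foldBytes() helper are illustrative, not part of ConvertUTF.cpp:

#include <cstdio>

#if defined(__clang__) && defined(__has_warning)
# if __has_warning("-Wimplicit-fallthrough")
#  define DEMO_DISABLE_FALLTHROUGH_WARNING \
     _Pragma("clang diagnostic push") \
     _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define DEMO_RESTORE_WARNINGS _Pragma("clang diagnostic pop")
# endif
#elif defined(__GNUC__) && __GNUC__ > 6
# define DEMO_DISABLE_FALLTHROUGH_WARNING \
    _Pragma("GCC diagnostic push") \
    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
# define DEMO_RESTORE_WARNINGS _Pragma("GCC diagnostic pop")
#endif
#ifndef DEMO_DISABLE_FALLTHROUGH_WARNING
# define DEMO_DISABLE_FALLTHROUGH_WARNING
# define DEMO_RESTORE_WARNINGS
#endif

DEMO_DISABLE_FALLTHROUGH_WARNING

// Deliberate fall-through: each case folds one more byte into the result,
// the raw accumulation step ConvertUTF performs before offset subtraction.
static unsigned foldBytes(int extraBytes, const unsigned char *p) {
  unsigned acc = 0;
  switch (extraBytes) {
  case 2: acc += *p++; acc <<= 6; // falls through
  case 1: acc += *p++; acc <<= 6; // falls through
  case 0: acc += *p++;
  }
  return acc;
}

DEMO_RESTORE_WARNINGS

int main() {
  const unsigned char Bytes[] = {0xE2, 0x82, 0xAC}; // UTF-8 for U+20AC
  std::printf("%u\n", foldBytes(2, Bytes));
  return 0;
}
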
diff --git a/contrib/llvm/lib/Support/ConvertUTFWrapper.cpp b/contrib/llvm/lib/Support/ConvertUTFWrapper.cpp
index 217cedb..6cb4f63 100644
--- a/contrib/llvm/lib/Support/ConvertUTFWrapper.cpp
+++ b/contrib/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/ConvertUTF.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <string>
diff --git a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
index 98865f5..bd38dd8 100644
--- a/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
+++ b/contrib/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -78,6 +78,9 @@ static bool gCrashRecoveryEnabled = false;
static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContext>>
tlIsRecoveringFromCrash;
+static void installExceptionOrSignalHandlers();
+static void uninstallExceptionOrSignalHandlers();
+
CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
CrashRecoveryContext::~CrashRecoveryContext() {
@@ -113,6 +116,23 @@ CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
return CRCI->CRC;
}
+void CrashRecoveryContext::Enable() {
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
+ // FIXME: Shouldn't this be a refcount or something?
+ if (gCrashRecoveryEnabled)
+ return;
+ gCrashRecoveryEnabled = true;
+ installExceptionOrSignalHandlers();
+}
+
+void CrashRecoveryContext::Disable() {
+ sys::ScopedLock L(*gCrashRecoveryContextMutex);
+ if (!gCrashRecoveryEnabled)
+ return;
+ gCrashRecoveryEnabled = false;
+ uninstallExceptionOrSignalHandlers();
+}
+
void CrashRecoveryContext::registerCleanup(CrashRecoveryContextCleanup *cleanup)
{
if (!cleanup)
@@ -140,30 +160,70 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
delete cleanup;
}
-#ifdef LLVM_ON_WIN32
+#if defined(_MSC_VER)
+// If _MSC_VER is defined, we must have SEH. Use it if it's available. It's way
+// better than VEH. Vectored exception handling catches all exceptions happening
+// on the thread with installed exception handlers, so it can interfere with
+// internal exception handling of other libraries on that thread. SEH works
+// exactly as you would expect normal exception handling to work: it only
+// catches exceptions if they would bubble out from the stack frame with __try /
+// __except.
-#include "Windows/WindowsSupport.h"
+static void installExceptionOrSignalHandlers() {}
+static void uninstallExceptionOrSignalHandlers() {}
-// On Windows, we can make use of vectored exception handling to
-// catch most crashing situations. Note that this does mean
-// we will be alerted of exceptions *before* structured exception
-// handling has the opportunity to catch it. But that isn't likely
-// to cause problems because nowhere in the project is SEH being
-// used.
+bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
+ if (!gCrashRecoveryEnabled) {
+ Fn();
+ return true;
+ }
+
+ bool Result = true;
+ __try {
+ Fn();
+ } __except (1) { // Catch any exception.
+ Result = false;
+ }
+ return Result;
+}
+
+#else // !_MSC_VER
+
+#if defined(LLVM_ON_WIN32)
+// This is a non-MSVC compiler, probably mingw gcc or clang without
+// -fms-extensions. Use vectored exception handling (VEH).
+//
+// On Windows, we can make use of vectored exception handling to catch most
+// crashing situations. Note that this does mean we will be alerted of
+// exceptions *before* structured exception handling has the opportunity to
+// catch it. Unfortunately, this causes problems in practice with other code
+// running on threads with LLVM crash recovery contexts, so we would like to
+// eventually move away from VEH.
//
-// Vectored exception handling is built on top of SEH, and so it
-// works on a per-thread basis.
+// Vectored exception handling works on a per-thread basis, which is an
+// advantage over SetUnhandledExceptionFilter. SetUnhandledExceptionFilter
+// also has no native support for chaining exception handlers, whereas VEH
+// allows more than one.
//
// The vectored exception handler functionality was added in Windows
// XP, so if support for older versions of Windows is required,
// it will have to be added.
-//
-// If we want to support as far back as Win2k, we could use the
-// SetUnhandledExceptionFilter API, but there's a risk of that
-// being entirely overwritten (it's not a chain).
+
+#include "Windows/WindowsSupport.h"
static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
{
+ // DBG_PRINTEXCEPTION_WIDE_C is not properly defined on all supported
+ // compilers and platforms, so we define it manually.
+ constexpr ULONG DbgPrintExceptionWideC = 0x4001000AL;
+ switch (ExceptionInfo->ExceptionRecord->ExceptionCode)
+ {
+ case DBG_PRINTEXCEPTION_C:
+ case DbgPrintExceptionWideC:
+ case 0x406D1388: // set debugger thread name
+ return EXCEPTION_CONTINUE_EXECUTION;
+ }
+
// Lookup the current thread local recovery object.
const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
@@ -192,14 +252,7 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
// non-NULL, valid VEH handles, or NULL.
static sys::ThreadLocal<const void> sCurrentExceptionHandle;
-void CrashRecoveryContext::Enable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = true;
-
+static void installExceptionOrSignalHandlers() {
// We can set up vectored exception handling now. We will install our
// handler as the front of the list, though there's no assurances that
// it will remain at the front (another call could install itself before
@@ -208,14 +261,7 @@ void CrashRecoveryContext::Enable() {
sCurrentExceptionHandle.set(handle);
}
-void CrashRecoveryContext::Disable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (!gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = false;
-
+static void uninstallExceptionOrSignalHandlers() {
PVOID currentHandle = const_cast<PVOID>(sCurrentExceptionHandle.get());
if (currentHandle) {
// Now we can remove the vectored exception handler from the chain
@@ -226,7 +272,7 @@ void CrashRecoveryContext::Disable() {
}
}
-#else
+#else // !LLVM_ON_WIN32
// Generic POSIX implementation.
//
@@ -278,14 +324,7 @@ static void CrashRecoverySignalHandler(int Signal) {
const_cast<CrashRecoveryContextImpl*>(CRCI)->HandleCrash();
}
-void CrashRecoveryContext::Enable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = true;
-
+static void installExceptionOrSignalHandlers() {
// Setup the signal handler.
struct sigaction Handler;
Handler.sa_handler = CrashRecoverySignalHandler;
@@ -297,20 +336,13 @@ void CrashRecoveryContext::Enable() {
}
}
-void CrashRecoveryContext::Disable() {
- sys::ScopedLock L(*gCrashRecoveryContextMutex);
-
- if (!gCrashRecoveryEnabled)
- return;
-
- gCrashRecoveryEnabled = false;
-
+static void uninstallExceptionOrSignalHandlers() {
// Restore the previous signal handlers.
for (unsigned i = 0; i != NumSignals; ++i)
sigaction(Signals[i], &PrevActions[i], nullptr);
}
-#endif
+#endif // !LLVM_ON_WIN32
bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
// If crash recovery is disabled, do nothing.
@@ -328,6 +360,8 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
return true;
}
+#endif // !_MSC_VER
+
void CrashRecoveryContext::HandleCrash() {
CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
assert(CRCI && "Crash recovery context never initialized!");
diff --git a/contrib/llvm/lib/Support/DataExtractor.cpp b/contrib/llvm/lib/Support/DataExtractor.cpp
index 5d6d60a..0199b30 100644
--- a/contrib/llvm/lib/Support/DataExtractor.cpp
+++ b/contrib/llvm/lib/Support/DataExtractor.cpp
@@ -68,6 +68,13 @@ uint16_t *DataExtractor::getU16(uint32_t *offset_ptr, uint16_t *dst,
Data.data());
}
+uint32_t DataExtractor::getU24(uint32_t *offset_ptr) const {
+ uint24_t ExtractedVal =
+ getU<uint24_t>(offset_ptr, this, IsLittleEndian, Data.data());
+ // The 3 bytes are in the correct byte order for the host.
+ return ExtractedVal.getAsUint32(sys::IsLittleEndianHost);
+}
+
uint32_t DataExtractor::getU32(uint32_t *offset_ptr) const {
return getU<uint32_t>(offset_ptr, this, IsLittleEndian, Data.data());
}
@@ -128,6 +135,16 @@ const char *DataExtractor::getCStr(uint32_t *offset_ptr) const {
return nullptr;
}
+StringRef DataExtractor::getCStrRef(uint32_t *OffsetPtr) const {
+ uint32_t Start = *OffsetPtr;
+ StringRef::size_type Pos = Data.find('\0', Start);
+ if (Pos != StringRef::npos) {
+ *OffsetPtr = Pos + 1;
+ return StringRef(Data.data() + Start, Pos - Start);
+ }
+ return StringRef();
+}
+
uint64_t DataExtractor::getULEB128(uint32_t *offset_ptr) const {
uint64_t result = 0;
if (Data.empty())
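
The new getU24() reads three bytes honoring the extractor's endianness, and getCStrRef() returns a StringRef that points into the underlying data instead of copying it. A small sketch under those assumptions; the byte values are illustrative:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // A 3-byte little-endian value 0x030201 followed by a NUL-terminated name.
  const char Bytes[] = {'\x01', '\x02', '\x03', 'h', 'i', '\0'};
  DataExtractor DE(StringRef(Bytes, sizeof(Bytes)),
                   /*IsLittleEndian=*/true, /*AddressSize=*/8);
  uint32_t Offset = 0;
  uint32_t V = DE.getU24(&Offset);      // reads 3 bytes, Offset advances to 3
  StringRef S = DE.getCStrRef(&Offset); // points into Bytes, Offset -> 6
  outs() << format_hex(V, 8) << " " << S << "\n"; // 0x030201 hi
  return 0;
}
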
diff --git a/contrib/llvm/lib/Support/DebugCounter.cpp b/contrib/llvm/lib/Support/DebugCounter.cpp
new file mode 100644
index 0000000..1d46de0
--- /dev/null
+++ b/contrib/llvm/lib/Support/DebugCounter.cpp
@@ -0,0 +1,114 @@
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Options.h"
+
+using namespace llvm;
+
+namespace {
+// This class overrides the default list implementation of printing so we
+// can pretty print the list of debug counter options. This type of
+// dynamic option is pretty rare (basically this and pass lists).
+class DebugCounterList : public cl::list<std::string, DebugCounter> {
+private:
+ using Base = cl::list<std::string, DebugCounter>;
+
+public:
+ template <class... Mods>
+ explicit DebugCounterList(Mods &&... Ms) : Base(std::forward<Mods>(Ms)...) {}
+
+private:
+ void printOptionInfo(size_t GlobalWidth) const override {
+ // This is a variant of generic_parser_base::printOptionInfo. Sadly,
+ // it's not easy to make it more usable. We could get it to print these as
+ // options if we were a cl::opt and registered them, but lists don't have
+ // options, nor does the parser for std::string. The other mechanisms for
+ // options are global and would pollute the global namespace with our
+ // counters. Rather than go that route, we have just overridden the
+ // printing, which only a few things call anyway.
+ outs() << " -" << ArgStr;
+ // All of the other options in CommandLine.cpp use ArgStr.size() + 6 for
+ // width, so we do the same.
+ Option::printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6);
+ const auto &CounterInstance = DebugCounter::instance();
+ for (auto Name : CounterInstance) {
+ const auto Info =
+ CounterInstance.getCounterInfo(CounterInstance.getCounterId(Name));
+ size_t NumSpaces = GlobalWidth - Info.first.size() - 8;
+ outs() << " =" << Info.first;
+ outs().indent(NumSpaces) << " - " << Info.second << '\n';
+ }
+ }
+};
+} // namespace
+
+// Create our command line option.
+static DebugCounterList DebugCounterOption(
+ "debug-counter",
+ cl::desc("Comma separated list of debug counter skip and count"),
+ cl::CommaSeparated, cl::ZeroOrMore, cl::location(DebugCounter::instance()));
+
+static ManagedStatic<DebugCounter> DC;
+
+DebugCounter &DebugCounter::instance() { return *DC; }
+
+// This is called by the command line parser when it sees a value for the
+// debug-counter option defined above.
+void DebugCounter::push_back(const std::string &Val) {
+ if (Val.empty())
+ return;
+ // The strings should come in as counter=value
+ auto CounterPair = StringRef(Val).split('=');
+ if (CounterPair.second.empty()) {
+ errs() << "DebugCounter Error: " << Val << " does not have an = in it\n";
+ return;
+ }
+ // Now we have counter=value.
+ // First, process value.
+ long CounterVal;
+ if (CounterPair.second.getAsInteger(0, CounterVal)) {
+ errs() << "DebugCounter Error: " << CounterPair.second
+ << " is not a number\n";
+ return;
+ }
+ // Now determine whether this is the skip or the count, strip that suffix,
+ // and record the value for the named counter.
+ if (CounterPair.first.endswith("-skip")) {
+ auto CounterName = CounterPair.first.drop_back(5);
+ unsigned CounterID = RegisteredCounters.idFor(CounterName);
+ if (!CounterID) {
+ errs() << "DebugCounter Error: " << CounterName
+ << " is not a registered counter\n";
+ return;
+ }
+
+ auto Res = Counters.insert({CounterID, {0, -1}});
+ Res.first->second.first = CounterVal;
+ } else if (CounterPair.first.endswith("-count")) {
+ auto CounterName = CounterPair.first.drop_back(6);
+ unsigned CounterID = RegisteredCounters.idFor(CounterName);
+ if (!CounterID) {
+ errs() << "DebugCounter Error: " << CounterName
+ << " is not a registered counter\n";
+ return;
+ }
+
+ auto Res = Counters.insert({CounterID, {0, -1}});
+ Res.first->second.second = CounterVal;
+ } else {
+ errs() << "DebugCounter Error: " << CounterPair.first
+ << " does not end with -skip or -count\n";
+ }
+}
+
+void DebugCounter::print(raw_ostream &OS) const {
+ OS << "Counters and values:\n";
+ for (const auto &KV : Counters)
+ OS << left_justify(RegisteredCounters[KV.first], 32) << ": {"
+ << KV.second.first << "," << KV.second.second << "}\n";
+}
+
+LLVM_DUMP_METHOD void DebugCounter::dump() const {
+ print(dbgs());
+}
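
DebugCounter.cpp only parses the -debug-counter option; the counters themselves are declared in client code. A sketch of how a pass might guard a transformation, assuming the DEBUG_COUNTER registration macro and DebugCounter::shouldExecute() from the accompanying header; the counter name is illustrative:

#include "llvm/Support/DebugCounter.h"

using namespace llvm;

// Registers a counter named "my-xform" (name and description are made up).
DEBUG_COUNTER(MyXformCounter, "my-xform",
              "Controls which candidate transformations execute");

bool tryTransform(/* pass state elided */) {
  // With -debug-counter=my-xform-skip=2,my-xform-count=1 on the command
  // line, the first two queries return false, the third returns true, and
  // everything afterwards returns false: handy for bisecting miscompiles.
  if (!DebugCounter::shouldExecute(MyXformCounter))
    return false;
  // ... perform the transformation ...
  return true;
}
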
diff --git a/contrib/llvm/lib/Support/Dwarf.cpp b/contrib/llvm/lib/Support/Dwarf.cpp
deleted file mode 100644
index 8950e8c..0000000
--- a/contrib/llvm/lib/Support/Dwarf.cpp
+++ /dev/null
@@ -1,385 +0,0 @@
-//===-- llvm/Support/Dwarf.cpp - Dwarf Framework ----------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains support for generic dwarf information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/Dwarf.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-using namespace dwarf;
-
-StringRef llvm::dwarf::TagString(unsigned Tag) {
- switch (Tag) {
- default:
- return StringRef();
-#define HANDLE_DW_TAG(ID, NAME) \
- case DW_TAG_##NAME: \
- return "DW_TAG_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-unsigned llvm::dwarf::getTag(StringRef TagString) {
- return StringSwitch<unsigned>(TagString)
-#define HANDLE_DW_TAG(ID, NAME) .Case("DW_TAG_" #NAME, DW_TAG_##NAME)
-#include "llvm/Support/Dwarf.def"
- .Default(DW_TAG_invalid);
-}
-
-StringRef llvm::dwarf::ChildrenString(unsigned Children) {
- switch (Children) {
- case DW_CHILDREN_no: return "DW_CHILDREN_no";
- case DW_CHILDREN_yes: return "DW_CHILDREN_yes";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::AttributeString(unsigned Attribute) {
- switch (Attribute) {
- default:
- return StringRef();
-#define HANDLE_DW_AT(ID, NAME) \
- case DW_AT_##NAME: \
- return "DW_AT_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-StringRef llvm::dwarf::FormEncodingString(unsigned Encoding) {
- switch (Encoding) {
- default:
- return StringRef();
-#define HANDLE_DW_FORM(ID, NAME) \
- case DW_FORM_##NAME: \
- return "DW_FORM_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-StringRef llvm::dwarf::OperationEncodingString(unsigned Encoding) {
- switch (Encoding) {
- default:
- return StringRef();
-#define HANDLE_DW_OP(ID, NAME) \
- case DW_OP_##NAME: \
- return "DW_OP_" #NAME;
-#include "llvm/Support/Dwarf.def"
- case DW_OP_LLVM_fragment:
- return "DW_OP_LLVM_fragment";
- }
-}
-
-unsigned llvm::dwarf::getOperationEncoding(StringRef OperationEncodingString) {
- return StringSwitch<unsigned>(OperationEncodingString)
-#define HANDLE_DW_OP(ID, NAME) .Case("DW_OP_" #NAME, DW_OP_##NAME)
-#include "llvm/Support/Dwarf.def"
- .Case("DW_OP_LLVM_fragment", DW_OP_LLVM_fragment)
- .Default(0);
-}
-
-StringRef llvm::dwarf::AttributeEncodingString(unsigned Encoding) {
- switch (Encoding) {
- default:
- return StringRef();
-#define HANDLE_DW_ATE(ID, NAME) \
- case DW_ATE_##NAME: \
- return "DW_ATE_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-unsigned llvm::dwarf::getAttributeEncoding(StringRef EncodingString) {
- return StringSwitch<unsigned>(EncodingString)
-#define HANDLE_DW_ATE(ID, NAME) .Case("DW_ATE_" #NAME, DW_ATE_##NAME)
-#include "llvm/Support/Dwarf.def"
- .Default(0);
-}
-
-StringRef llvm::dwarf::DecimalSignString(unsigned Sign) {
- switch (Sign) {
- case DW_DS_unsigned: return "DW_DS_unsigned";
- case DW_DS_leading_overpunch: return "DW_DS_leading_overpunch";
- case DW_DS_trailing_overpunch: return "DW_DS_trailing_overpunch";
- case DW_DS_leading_separate: return "DW_DS_leading_separate";
- case DW_DS_trailing_separate: return "DW_DS_trailing_separate";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::EndianityString(unsigned Endian) {
- switch (Endian) {
- case DW_END_default: return "DW_END_default";
- case DW_END_big: return "DW_END_big";
- case DW_END_little: return "DW_END_little";
- case DW_END_lo_user: return "DW_END_lo_user";
- case DW_END_hi_user: return "DW_END_hi_user";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::AccessibilityString(unsigned Access) {
- switch (Access) {
- // Accessibility codes
- case DW_ACCESS_public: return "DW_ACCESS_public";
- case DW_ACCESS_protected: return "DW_ACCESS_protected";
- case DW_ACCESS_private: return "DW_ACCESS_private";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::VisibilityString(unsigned Visibility) {
- switch (Visibility) {
- case DW_VIS_local: return "DW_VIS_local";
- case DW_VIS_exported: return "DW_VIS_exported";
- case DW_VIS_qualified: return "DW_VIS_qualified";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::VirtualityString(unsigned Virtuality) {
- switch (Virtuality) {
- default:
- return StringRef();
-#define HANDLE_DW_VIRTUALITY(ID, NAME) \
- case DW_VIRTUALITY_##NAME: \
- return "DW_VIRTUALITY_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-unsigned llvm::dwarf::getVirtuality(StringRef VirtualityString) {
- return StringSwitch<unsigned>(VirtualityString)
-#define HANDLE_DW_VIRTUALITY(ID, NAME) \
- .Case("DW_VIRTUALITY_" #NAME, DW_VIRTUALITY_##NAME)
-#include "llvm/Support/Dwarf.def"
- .Default(DW_VIRTUALITY_invalid);
-}
-
-StringRef llvm::dwarf::LanguageString(unsigned Language) {
- switch (Language) {
- default:
- return StringRef();
-#define HANDLE_DW_LANG(ID, NAME) \
- case DW_LANG_##NAME: \
- return "DW_LANG_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-unsigned llvm::dwarf::getLanguage(StringRef LanguageString) {
- return StringSwitch<unsigned>(LanguageString)
-#define HANDLE_DW_LANG(ID, NAME) .Case("DW_LANG_" #NAME, DW_LANG_##NAME)
-#include "llvm/Support/Dwarf.def"
- .Default(0);
-}
-
-StringRef llvm::dwarf::CaseString(unsigned Case) {
- switch (Case) {
- case DW_ID_case_sensitive: return "DW_ID_case_sensitive";
- case DW_ID_up_case: return "DW_ID_up_case";
- case DW_ID_down_case: return "DW_ID_down_case";
- case DW_ID_case_insensitive: return "DW_ID_case_insensitive";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::ConventionString(unsigned CC) {
- switch (CC) {
- default:
- return StringRef();
-#define HANDLE_DW_CC(ID, NAME) \
- case DW_CC_##NAME: \
- return "DW_CC_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-unsigned llvm::dwarf::getCallingConvention(StringRef CCString) {
- return StringSwitch<unsigned>(CCString)
-#define HANDLE_DW_CC(ID, NAME) .Case("DW_CC_" #NAME, DW_CC_##NAME)
-#include "llvm/Support/Dwarf.def"
- .Default(0);
-}
-
-StringRef llvm::dwarf::InlineCodeString(unsigned Code) {
- switch (Code) {
- case DW_INL_not_inlined: return "DW_INL_not_inlined";
- case DW_INL_inlined: return "DW_INL_inlined";
- case DW_INL_declared_not_inlined: return "DW_INL_declared_not_inlined";
- case DW_INL_declared_inlined: return "DW_INL_declared_inlined";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::ArrayOrderString(unsigned Order) {
- switch (Order) {
- case DW_ORD_row_major: return "DW_ORD_row_major";
- case DW_ORD_col_major: return "DW_ORD_col_major";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::DiscriminantString(unsigned Discriminant) {
- switch (Discriminant) {
- case DW_DSC_label: return "DW_DSC_label";
- case DW_DSC_range: return "DW_DSC_range";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::LNStandardString(unsigned Standard) {
- switch (Standard) {
- default:
- return StringRef();
-#define HANDLE_DW_LNS(ID, NAME) \
- case DW_LNS_##NAME: \
- return "DW_LNS_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-StringRef llvm::dwarf::LNExtendedString(unsigned Encoding) {
- switch (Encoding) {
- default:
- return StringRef();
-#define HANDLE_DW_LNE(ID, NAME) \
- case DW_LNE_##NAME: \
- return "DW_LNE_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-StringRef llvm::dwarf::MacinfoString(unsigned Encoding) {
- switch (Encoding) {
- // Macinfo Type Encodings
- case DW_MACINFO_define: return "DW_MACINFO_define";
- case DW_MACINFO_undef: return "DW_MACINFO_undef";
- case DW_MACINFO_start_file: return "DW_MACINFO_start_file";
- case DW_MACINFO_end_file: return "DW_MACINFO_end_file";
- case DW_MACINFO_vendor_ext: return "DW_MACINFO_vendor_ext";
- case DW_MACINFO_invalid: return "DW_MACINFO_invalid";
- }
- return StringRef();
-}
-
-unsigned llvm::dwarf::getMacinfo(StringRef MacinfoString) {
- return StringSwitch<unsigned>(MacinfoString)
- .Case("DW_MACINFO_define", DW_MACINFO_define)
- .Case("DW_MACINFO_undef", DW_MACINFO_undef)
- .Case("DW_MACINFO_start_file", DW_MACINFO_start_file)
- .Case("DW_MACINFO_end_file", DW_MACINFO_end_file)
- .Case("DW_MACINFO_vendor_ext", DW_MACINFO_vendor_ext)
- .Default(DW_MACINFO_invalid);
-}
-
-StringRef llvm::dwarf::CallFrameString(unsigned Encoding) {
- switch (Encoding) {
- default:
- return StringRef();
-#define HANDLE_DW_CFA(ID, NAME) \
- case DW_CFA_##NAME: \
- return "DW_CFA_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-StringRef llvm::dwarf::ApplePropertyString(unsigned Prop) {
- switch (Prop) {
- default:
- return StringRef();
-#define HANDLE_DW_APPLE_PROPERTY(ID, NAME) \
- case DW_APPLE_PROPERTY_##NAME: \
- return "DW_APPLE_PROPERTY_" #NAME;
-#include "llvm/Support/Dwarf.def"
- }
-}
-
-StringRef llvm::dwarf::AtomTypeString(unsigned AT) {
- switch (AT) {
- case dwarf::DW_ATOM_null:
- return "DW_ATOM_null";
- case dwarf::DW_ATOM_die_offset:
- return "DW_ATOM_die_offset";
- case DW_ATOM_cu_offset:
- return "DW_ATOM_cu_offset";
- case DW_ATOM_die_tag:
- return "DW_ATOM_die_tag";
- case DW_ATOM_type_flags:
- return "DW_ATOM_type_flags";
- }
- return StringRef();
-}
-
-StringRef llvm::dwarf::GDBIndexEntryKindString(GDBIndexEntryKind Kind) {
- switch (Kind) {
- case GIEK_NONE:
- return "NONE";
- case GIEK_TYPE:
- return "TYPE";
- case GIEK_VARIABLE:
- return "VARIABLE";
- case GIEK_FUNCTION:
- return "FUNCTION";
- case GIEK_OTHER:
- return "OTHER";
- case GIEK_UNUSED5:
- return "UNUSED5";
- case GIEK_UNUSED6:
- return "UNUSED6";
- case GIEK_UNUSED7:
- return "UNUSED7";
- }
- llvm_unreachable("Unknown GDBIndexEntryKind value");
-}
-
-StringRef
-llvm::dwarf::GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage) {
- switch (Linkage) {
- case GIEL_EXTERNAL:
- return "EXTERNAL";
- case GIEL_STATIC:
- return "STATIC";
- }
- llvm_unreachable("Unknown GDBIndexEntryLinkage value");
-}
-
-StringRef llvm::dwarf::AttributeValueString(uint16_t Attr, unsigned Val) {
- switch (Attr) {
- case DW_AT_accessibility:
- return AccessibilityString(Val);
- case DW_AT_virtuality:
- return VirtualityString(Val);
- case DW_AT_language:
- return LanguageString(Val);
- case DW_AT_encoding:
- return AttributeEncodingString(Val);
- case DW_AT_decimal_sign:
- return DecimalSignString(Val);
- case DW_AT_endianity:
- return EndianityString(Val);
- case DW_AT_visibility:
- return VisibilityString(Val);
- case DW_AT_identifier_case:
- return CaseString(Val);
- case DW_AT_calling_convention:
- return ConventionString(Val);
- case DW_AT_inline:
- return InlineCodeString(Val);
- case DW_AT_ordering:
- return ArrayOrderString(Val);
- case DW_AT_discr_value:
- return DiscriminantString(Val);
- }
-
- return StringRef();
-}
diff --git a/contrib/llvm/lib/Support/DynamicLibrary.cpp b/contrib/llvm/lib/Support/DynamicLibrary.cpp
index ced21e4..d842211 100644
--- a/contrib/llvm/lib/Support/DynamicLibrary.cpp
+++ b/contrib/llvm/lib/Support/DynamicLibrary.cpp
@@ -9,173 +9,201 @@
//
// This file implements the operating system DynamicLibrary concept.
//
-// FIXME: This file leaks ExplicitSymbols and OpenedHandles!
-//
//===----------------------------------------------------------------------===//
#include "llvm/Support/DynamicLibrary.h"
#include "llvm-c/Support.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Config/config.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include <cstdio>
#include <cstring>
+#include <vector>
-// Collection of symbol name/value pairs to be searched prior to any libraries.
-static llvm::ManagedStatic<llvm::StringMap<void *> > ExplicitSymbols;
-static llvm::ManagedStatic<llvm::sys::SmartMutex<true> > SymbolsMutex;
-
-void llvm::sys::DynamicLibrary::AddSymbol(StringRef symbolName,
- void *symbolValue) {
- SmartScopedLock<true> lock(*SymbolsMutex);
- (*ExplicitSymbols)[symbolName] = symbolValue;
-}
-
-char llvm::sys::DynamicLibrary::Invalid = 0;
-
-#ifdef LLVM_ON_WIN32
-
-#include "Windows/DynamicLibrary.inc"
-
-#else
-
-#if defined(HAVE_DLFCN_H) && defined(HAVE_DLOPEN)
-#include <dlfcn.h>
using namespace llvm;
using namespace llvm::sys;
-//===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only TRULY operating system
-//=== independent code.
-//===----------------------------------------------------------------------===//
+// All methods for HandleSet should be used holding SymbolsMutex.
+class DynamicLibrary::HandleSet {
+ typedef std::vector<void *> HandleList;
+ HandleList Handles;
+ void *Process;
-static DenseSet<void *> *OpenedHandles = nullptr;
+public:
+ static void *DLOpen(const char *Filename, std::string *Err);
+ static void DLClose(void *Handle);
+ static void *DLSym(void *Handle, const char *Symbol);
-DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
- std::string *errMsg) {
- SmartScopedLock<true> lock(*SymbolsMutex);
+ HandleSet() : Process(nullptr) {}
+ ~HandleSet();
- void *handle = dlopen(filename, RTLD_LAZY|RTLD_GLOBAL);
- if (!handle) {
- if (errMsg) *errMsg = dlerror();
- return DynamicLibrary();
+ HandleList::iterator Find(void *Handle) {
+ return std::find(Handles.begin(), Handles.end(), Handle);
}
-#ifdef __CYGWIN__
- // Cygwin searches symbols only in the main
- // with the handle of dlopen(NULL, RTLD_GLOBAL).
- if (!filename)
- handle = RTLD_DEFAULT;
+ bool Contains(void *Handle) {
+ return Handle == Process || Find(Handle) != Handles.end();
+ }
+
+ bool AddLibrary(void *Handle, bool IsProcess = false, bool CanClose = true) {
+#ifdef LLVM_ON_WIN32
+ assert((Handle == this ? IsProcess : !IsProcess) && "Bad Handle.");
#endif
- if (!OpenedHandles)
- OpenedHandles = new DenseSet<void *>();
+ if (LLVM_LIKELY(!IsProcess)) {
+ if (Find(Handle) != Handles.end()) {
+ if (CanClose)
+ DLClose(Handle);
+ return false;
+ }
+ Handles.push_back(Handle);
+ } else {
+#ifndef LLVM_ON_WIN32
+ if (Process) {
+ if (CanClose)
+ DLClose(Process);
+ if (Process == Handle)
+ return false;
+ }
+#endif
+ Process = Handle;
+ }
+ return true;
+ }
- // If we've already loaded this library, dlclose() the handle in order to
- // keep the internal refcount at +1.
- if (!OpenedHandles->insert(handle).second)
- dlclose(handle);
+ void *LibLookup(const char *Symbol, DynamicLibrary::SearchOrdering Order) {
+ if (Order & SO_LoadOrder) {
+ for (void *Handle : Handles) {
+ if (void *Ptr = DLSym(Handle, Symbol))
+ return Ptr;
+ }
+ } else {
+ for (void *Handle : llvm::reverse(Handles)) {
+ if (void *Ptr = DLSym(Handle, Symbol))
+ return Ptr;
+ }
+ }
+ return nullptr;
+ }
- return DynamicLibrary(handle);
-}
+ void *Lookup(const char *Symbol, DynamicLibrary::SearchOrdering Order) {
+ assert(!((Order & SO_LoadedFirst) && (Order & SO_LoadedLast)) &&
+ "Invalid Ordering");
-void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
- if (!isValid())
+ if (!Process || (Order & SO_LoadedFirst)) {
+ if (void *Ptr = LibLookup(Symbol, Order))
+ return Ptr;
+ }
+ if (Process) {
+ // Use OS facilities to search the current binary and all loaded libs.
+ if (void *Ptr = DLSym(Process, Symbol))
+ return Ptr;
+
+ // Search any libs that might have been skipped because of RTLD_LOCAL.
+ if (Order & SO_LoadedLast) {
+ if (void *Ptr = LibLookup(Symbol, Order))
+ return Ptr;
+ }
+ }
return nullptr;
- return dlsym(Data, symbolName);
+ }
+};
+
+namespace {
+// Collection of symbol name/value pairs to be searched prior to any libraries.
+static llvm::ManagedStatic<llvm::StringMap<void *>> ExplicitSymbols;
+// Collection of known library handles.
+static llvm::ManagedStatic<DynamicLibrary::HandleSet> OpenedHandles;
+// Lock for ExplicitSymbols and OpenedHandles.
+static llvm::ManagedStatic<llvm::sys::SmartMutex<true>> SymbolsMutex;
}
-#else
+#ifdef LLVM_ON_WIN32
-using namespace llvm;
-using namespace llvm::sys;
+#include "Windows/DynamicLibrary.inc"
-DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
- std::string *errMsg) {
- if (errMsg) *errMsg = "dlopen() not supported on this platform";
- return DynamicLibrary();
-}
+#else
-void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
- return NULL;
-}
+#include "Unix/DynamicLibrary.inc"
#endif
+char DynamicLibrary::Invalid;
+DynamicLibrary::SearchOrdering DynamicLibrary::SearchOrder =
+ DynamicLibrary::SO_Linker;
+
namespace llvm {
-void *SearchForAddressOfSpecialSymbol(const char* symbolName);
+void *SearchForAddressOfSpecialSymbol(const char *SymbolName) {
+ return DoSearch(SymbolName); // DynamicLibrary.inc
+}
}
-void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
+void DynamicLibrary::AddSymbol(StringRef SymbolName, void *SymbolValue) {
SmartScopedLock<true> Lock(*SymbolsMutex);
+ (*ExplicitSymbols)[SymbolName] = SymbolValue;
+}
- // First check symbols added via AddSymbol().
- if (ExplicitSymbols.isConstructed()) {
- StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
+DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *FileName,
+ std::string *Err) {
+ // Force OpenedHandles to be added into the ManagedStatic list before any
+ // ManagedStatic can be added from static constructors in HandleSet::DLOpen.
+ HandleSet& HS = *OpenedHandles;
- if (i != ExplicitSymbols->end())
- return i->second;
+ void *Handle = HandleSet::DLOpen(FileName, Err);
+ if (Handle != &Invalid) {
+ SmartScopedLock<true> Lock(*SymbolsMutex);
+ HS.AddLibrary(Handle, /*IsProcess*/ FileName == nullptr);
}
-#if defined(HAVE_DLFCN_H) && defined(HAVE_DLOPEN)
- // Now search the libraries.
- if (OpenedHandles) {
- for (DenseSet<void *>::iterator I = OpenedHandles->begin(),
- E = OpenedHandles->end(); I != E; ++I) {
- //lt_ptr ptr = lt_dlsym(*I, symbolName);
- void *ptr = dlsym(*I, symbolName);
- if (ptr) {
- return ptr;
- }
- }
- }
-#endif
+ return DynamicLibrary(Handle);
+}
- if (void *Result = llvm::SearchForAddressOfSpecialSymbol(symbolName))
- return Result;
+DynamicLibrary DynamicLibrary::addPermanentLibrary(void *Handle,
+ std::string *Err) {
+ SmartScopedLock<true> Lock(*SymbolsMutex);
+ // If we've already loaded this library, tell the caller.
+ if (!OpenedHandles->AddLibrary(Handle, /*IsProcess*/false, /*CanClose*/false))
+ *Err = "Library already loaded";
-// This macro returns the address of a well-known, explicit symbol
-#define EXPLICIT_SYMBOL(SYM) \
- if (!strcmp(symbolName, #SYM)) return &SYM
+ return DynamicLibrary(Handle);
+}
-// On linux we have a weird situation. The stderr/out/in symbols are both
-// macros and global variables because of standards requirements. So, we
-// boldly use the EXPLICIT_SYMBOL macro without checking for a #define first.
-#if defined(__linux__) and !defined(__ANDROID__)
- {
- EXPLICIT_SYMBOL(stderr);
- EXPLICIT_SYMBOL(stdout);
- EXPLICIT_SYMBOL(stdin);
- }
-#else
- // For everything else, we want to check to make sure the symbol isn't defined
- // as a macro before using EXPLICIT_SYMBOL.
+void *DynamicLibrary::getAddressOfSymbol(const char *SymbolName) {
+ if (!isValid())
+ return nullptr;
+ return HandleSet::DLSym(Data, SymbolName);
+}
+
+void *DynamicLibrary::SearchForAddressOfSymbol(const char *SymbolName) {
{
-#ifndef stdin
- EXPLICIT_SYMBOL(stdin);
-#endif
-#ifndef stdout
- EXPLICIT_SYMBOL(stdout);
-#endif
-#ifndef stderr
- EXPLICIT_SYMBOL(stderr);
-#endif
+ SmartScopedLock<true> Lock(*SymbolsMutex);
+
+ // First check symbols added via AddSymbol().
+ if (ExplicitSymbols.isConstructed()) {
+ StringMap<void *>::iterator i = ExplicitSymbols->find(SymbolName);
+
+ if (i != ExplicitSymbols->end())
+ return i->second;
+ }
+
+ // Now search the libraries.
+ if (OpenedHandles.isConstructed()) {
+ if (void *Ptr = OpenedHandles->Lookup(SymbolName, SearchOrder))
+ return Ptr;
+ }
}
-#endif
-#undef EXPLICIT_SYMBOL
- return nullptr;
+ return llvm::SearchForAddressOfSpecialSymbol(SymbolName);
}
-#endif // LLVM_ON_WIN32
-
//===----------------------------------------------------------------------===//
// C API.
//===----------------------------------------------------------------------===//
-LLVMBool LLVMLoadLibraryPermanently(const char* Filename) {
+LLVMBool LLVMLoadLibraryPermanently(const char *Filename) {
return llvm::sys::DynamicLibrary::LoadLibraryPermanently(Filename);
}
@@ -186,4 +214,3 @@ void *LLVMSearchForAddressOfSymbol(const char *symbolName) {
void LLVMAddSymbol(const char *symbolName, void *symbolValue) {
return llvm::sys::DynamicLibrary::AddSymbol(symbolName, symbolValue);
}
-
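
After this rewrite, every opened handle lives in one mutex-guarded HandleSet, and symbol lookup follows the configurable DynamicLibrary::SearchOrder instead of iterating a DenseSet in unspecified order. A caller-side sketch; the symbol name is illustrative and resolves on typical POSIX hosts:

#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

using namespace llvm;
using namespace llvm::sys;

int main() {
  std::string Err;
  // nullptr opens the current process; the handle is recorded in the
  // process-wide HandleSet guarded by SymbolsMutex.
  DynamicLibrary Self = DynamicLibrary::getPermanentLibrary(nullptr, &Err);
  if (!Self.isValid()) {
    errs() << "open failed: " << Err << "\n";
    return 1;
  }

  // With SO_LoadedFirst, explicitly loaded libraries are searched before
  // the process image, per the Lookup() logic in the hunk above.
  DynamicLibrary::SearchOrder = DynamicLibrary::SO_LoadedFirst;
  if (void *P = DynamicLibrary::SearchForAddressOfSymbol("malloc"))
    outs() << "found malloc at " << P << "\n";
  return 0;
}
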
diff --git a/contrib/llvm/lib/Support/Errno.cpp b/contrib/llvm/lib/Support/Errno.cpp
index 3ba2a12..10be9b3 100644
--- a/contrib/llvm/lib/Support/Errno.cpp
+++ b/contrib/llvm/lib/Support/Errno.cpp
@@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Errno.h"
-#include "llvm/Config/config.h" // Get autoconf configuration settings
+#include "llvm/Config/config.h" // Get autoconf configuration settings
#include "llvm/Support/raw_ostream.h"
#include <string.h>
diff --git a/contrib/llvm/lib/Support/Error.cpp b/contrib/llvm/lib/Support/Error.cpp
index 4730c0b..bb02c03 100644
--- a/contrib/llvm/lib/Support/Error.cpp
+++ b/contrib/llvm/lib/Support/Error.cpp
@@ -13,7 +13,6 @@
#include "llvm/Support/ManagedStatic.h"
#include <system_error>
-
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Support/ErrorHandling.cpp b/contrib/llvm/lib/Support/ErrorHandling.cpp
index a7d3a18..fb8ae4c 100644
--- a/contrib/llvm/lib/Support/ErrorHandling.cpp
+++ b/contrib/llvm/lib/Support/ErrorHandling.cpp
@@ -20,15 +20,14 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/MutexGuard.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/WindowsError.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdlib>
+#include <mutex>
+#include <new>
#if defined(HAVE_UNISTD_H)
# include <unistd.h>
@@ -43,18 +42,39 @@ using namespace llvm;
static fatal_error_handler_t ErrorHandler = nullptr;
static void *ErrorHandlerUserData = nullptr;
-static ManagedStatic<sys::Mutex> ErrorHandlerMutex;
+static fatal_error_handler_t BadAllocErrorHandler = nullptr;
+static void *BadAllocErrorHandlerUserData = nullptr;
+
+#if LLVM_ENABLE_THREADS == 1
+// Mutexes to synchronize installing error handlers and calling error handlers.
+// Do not use ManagedStatic, or that may allocate memory while attempting to
+// report an OOM.
+//
+// This usage of std::mutex has to be conditionalized behind ifdefs because
+// of this script:
+// compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
+// That script attempts to statically link the LLVM symbolizer library with the
+// STL and hide all of its symbols with 'opt -internalize'. To reduce size, it
+// cuts out the threading portions of the hermetic copy of libc++ that it
+// builds. We can remove these ifdefs if that script goes away.
+static std::mutex ErrorHandlerMutex;
+static std::mutex BadAllocErrorHandlerMutex;
+#endif
void llvm::install_fatal_error_handler(fatal_error_handler_t handler,
void *user_data) {
- llvm::MutexGuard Lock(*ErrorHandlerMutex);
+#if LLVM_ENABLE_THREADS == 1
+ std::lock_guard<std::mutex> Lock(ErrorHandlerMutex);
+#endif
assert(!ErrorHandler && "Error handler already registered!\n");
ErrorHandler = handler;
ErrorHandlerUserData = user_data;
}
void llvm::remove_fatal_error_handler() {
- llvm::MutexGuard Lock(*ErrorHandlerMutex);
+#if LLVM_ENABLE_THREADS == 1
+ std::lock_guard<std::mutex> Lock(ErrorHandlerMutex);
+#endif
ErrorHandler = nullptr;
ErrorHandlerUserData = nullptr;
}
@@ -77,7 +97,9 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) {
{
// Only acquire the mutex while reading the handler, so as not to invoke a
// user-supplied callback under a lock.
- llvm::MutexGuard Lock(*ErrorHandlerMutex);
+#if LLVM_ENABLE_THREADS == 1
+ std::lock_guard<std::mutex> Lock(ErrorHandlerMutex);
+#endif
handler = ErrorHandler;
handlerData = ErrorHandlerUserData;
}
@@ -104,6 +126,55 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) {
exit(1);
}
+void llvm::install_bad_alloc_error_handler(fatal_error_handler_t handler,
+ void *user_data) {
+#if LLVM_ENABLE_THREADS == 1
+ std::lock_guard<std::mutex> Lock(BadAllocErrorHandlerMutex);
+#endif
+ assert(!ErrorHandler && "Bad alloc error handler already registered!\n");
+ BadAllocErrorHandler = handler;
+ BadAllocErrorHandlerUserData = user_data;
+}
+
+void llvm::remove_bad_alloc_error_handler() {
+#if LLVM_ENABLE_THREADS == 1
+ std::lock_guard<std::mutex> Lock(BadAllocErrorHandlerMutex);
+#endif
+ BadAllocErrorHandler = nullptr;
+ BadAllocErrorHandlerUserData = nullptr;
+}
+
+void llvm::report_bad_alloc_error(const char *Reason, bool GenCrashDiag) {
+ fatal_error_handler_t Handler = nullptr;
+ void *HandlerData = nullptr;
+ {
+ // Only acquire the mutex while reading the handler, so as not to invoke a
+ // user-supplied callback under a lock.
+#if LLVM_ENABLE_THREADS == 1
+ std::lock_guard<std::mutex> Lock(BadAllocErrorHandlerMutex);
+#endif
+ Handler = BadAllocErrorHandler;
+ HandlerData = BadAllocErrorHandlerUserData;
+ }
+
+ if (Handler) {
+ Handler(HandlerData, Reason, GenCrashDiag);
+ llvm_unreachable("bad alloc handler should not return");
+ }
+
+#ifdef LLVM_ENABLE_EXCEPTIONS
+ // If exceptions are enabled, make OOM in malloc look like OOM in new.
+ throw std::bad_alloc();
+#else
+ // Don't call the normal error handler. It may allocate memory. Directly write
+ // an OOM to stderr and abort.
+ char OOMMessage[] = "LLVM ERROR: out of memory\n";
+ ssize_t written = ::write(2, OOMMessage, strlen(OOMMessage));
+ (void)written;
+ abort();
+#endif
+}
+
void llvm::llvm_unreachable_internal(const char *msg, const char *file,
unsigned line) {
// This code intentionally doesn't call the ErrorHandler callback, because
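
The new bad-alloc handler mirrors the fatal-error handler but deliberately avoids ManagedStatic and any allocation on the reporting path. A POSIX-only sketch of installing one, assuming the fatal_error_handler_t signature (void *, const std::string &, bool) from ErrorHandling.h; the tool name and message are illustrative:

#include "llvm/Support/ErrorHandling.h"
#include <cstdlib>
#include <string>
#include <unistd.h>

using namespace llvm;

// Runs under OOM pressure, so it must not allocate: a static message and
// write(2) are safe here.
static void badAllocHandler(void *UserData, const std::string &Reason,
                            bool GenCrashDiag) {
  static const char Msg[] = "mytool: out of memory\n";
  (void)::write(2, Msg, sizeof(Msg) - 1);
  std::_Exit(1);
}

int main() {
  install_bad_alloc_error_handler(badAllocHandler);
  // ... tool work; on allocation failure, report_bad_alloc_error() invokes
  // badAllocHandler instead of the generic fatal-error path ...
  remove_bad_alloc_error_handler();
  return 0;
}
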
diff --git a/contrib/llvm/lib/Support/FileOutputBuffer.cpp b/contrib/llvm/lib/Support/FileOutputBuffer.cpp
index 57e5a8d..731740d 100644
--- a/contrib/llvm/lib/Support/FileOutputBuffer.cpp
+++ b/contrib/llvm/lib/Support/FileOutputBuffer.cpp
@@ -57,6 +57,8 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) {
// FIXME: In posix, you use the access() call to check this.
}
break;
+ case sys::fs::file_type::directory_file:
+ return errc::is_a_directory;
default:
if (EC)
return EC;
diff --git a/contrib/llvm/lib/Support/FoldingSet.cpp b/contrib/llvm/lib/Support/FoldingSet.cpp
index c9bca7f..4496d06 100644
--- a/contrib/llvm/lib/Support/FoldingSet.cpp
+++ b/contrib/llvm/lib/Support/FoldingSet.cpp
@@ -26,7 +26,7 @@ using namespace llvm;
// FoldingSetNodeIDRef Implementation
/// ComputeHash - Compute a strong hash value for this FoldingSetNodeIDRef,
-/// used to lookup the node in the FoldingSetImpl.
+/// used to lookup the node in the FoldingSetBase.
unsigned FoldingSetNodeIDRef::ComputeHash() const {
return static_cast<unsigned>(hash_combine_range(Data, Data+Size));
}
@@ -142,7 +142,7 @@ void FoldingSetNodeID::AddNodeID(const FoldingSetNodeID &ID) {
}
/// ComputeHash - Compute a strong hash value for this FoldingSetNodeID, used to
-/// lookup the node in the FoldingSetImpl.
+/// lookup the node in the FoldingSetBase.
unsigned FoldingSetNodeID::ComputeHash() const {
return FoldingSetNodeIDRef(Bits.data(), Bits.size()).ComputeHash();
}
@@ -180,7 +180,7 @@ FoldingSetNodeID::Intern(BumpPtrAllocator &Allocator) const {
}
//===----------------------------------------------------------------------===//
-/// Helper functions for FoldingSetImpl.
+/// Helper functions for FoldingSetBase.
/// GetNextPtr - In order to save space, each bucket is a
/// singly-linked-list. In order to make deletion more efficient, we make
@@ -188,12 +188,12 @@ FoldingSetNodeID::Intern(BumpPtrAllocator &Allocator) const {
/// The problem with this is that the start of the hash buckets are not
/// Nodes. If NextInBucketPtr is a bucket pointer, this method returns null:
/// use GetBucketPtr when this happens.
-static FoldingSetImpl::Node *GetNextPtr(void *NextInBucketPtr) {
+static FoldingSetBase::Node *GetNextPtr(void *NextInBucketPtr) {
// The low bit is set if this is the pointer back to the bucket.
if (reinterpret_cast<intptr_t>(NextInBucketPtr) & 1)
return nullptr;
- return static_cast<FoldingSetImpl::Node*>(NextInBucketPtr);
+ return static_cast<FoldingSetBase::Node*>(NextInBucketPtr);
}
@@ -221,11 +221,11 @@ static void **AllocateBuckets(unsigned NumBuckets) {
}
//===----------------------------------------------------------------------===//
-// FoldingSetImpl Implementation
+// FoldingSetBase Implementation
-void FoldingSetImpl::anchor() {}
+void FoldingSetBase::anchor() {}
-FoldingSetImpl::FoldingSetImpl(unsigned Log2InitSize) {
+FoldingSetBase::FoldingSetBase(unsigned Log2InitSize) {
assert(5 < Log2InitSize && Log2InitSize < 32 &&
"Initial hash table size out of range");
NumBuckets = 1 << Log2InitSize;
@@ -233,14 +233,14 @@ FoldingSetImpl::FoldingSetImpl(unsigned Log2InitSize) {
NumNodes = 0;
}
-FoldingSetImpl::FoldingSetImpl(FoldingSetImpl &&Arg)
+FoldingSetBase::FoldingSetBase(FoldingSetBase &&Arg)
: Buckets(Arg.Buckets), NumBuckets(Arg.NumBuckets), NumNodes(Arg.NumNodes) {
Arg.Buckets = nullptr;
Arg.NumBuckets = 0;
Arg.NumNodes = 0;
}
-FoldingSetImpl &FoldingSetImpl::operator=(FoldingSetImpl &&RHS) {
+FoldingSetBase &FoldingSetBase::operator=(FoldingSetBase &&RHS) {
free(Buckets); // This may be null if the set is in a moved-from state.
Buckets = RHS.Buckets;
NumBuckets = RHS.NumBuckets;
@@ -251,11 +251,11 @@ FoldingSetImpl &FoldingSetImpl::operator=(FoldingSetImpl &&RHS) {
return *this;
}
-FoldingSetImpl::~FoldingSetImpl() {
+FoldingSetBase::~FoldingSetBase() {
free(Buckets);
}
-void FoldingSetImpl::clear() {
+void FoldingSetBase::clear() {
// Set all but the last bucket to null pointers.
memset(Buckets, 0, NumBuckets*sizeof(void*));
@@ -266,7 +266,7 @@ void FoldingSetImpl::clear() {
NumNodes = 0;
}
-void FoldingSetImpl::GrowBucketCount(unsigned NewBucketCount) {
+void FoldingSetBase::GrowBucketCount(unsigned NewBucketCount) {
assert((NewBucketCount > NumBuckets) && "Can't shrink a folding set with GrowBucketCount");
assert(isPowerOf2_32(NewBucketCount) && "Bad bucket count!");
void **OldBuckets = Buckets;
@@ -300,11 +300,11 @@ void FoldingSetImpl::GrowBucketCount(unsigned NewBucketCount) {
/// GrowHashTable - Double the size of the hash table and rehash everything.
///
-void FoldingSetImpl::GrowHashTable() {
+void FoldingSetBase::GrowHashTable() {
GrowBucketCount(NumBuckets * 2);
}
-void FoldingSetImpl::reserve(unsigned EltCount) {
+void FoldingSetBase::reserve(unsigned EltCount) {
// This will give us somewhere between EltCount / 2 and
// EltCount buckets. This puts us in the load factor
// range of 1.0 - 2.0.
@@ -316,9 +316,9 @@ void FoldingSetImpl::reserve(unsigned EltCount) {
/// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
/// return it. If not, return the insertion token that will make insertion
/// faster.
-FoldingSetImpl::Node
-*FoldingSetImpl::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
- void *&InsertPos) {
+FoldingSetBase::Node *
+FoldingSetBase::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+ void *&InsertPos) {
unsigned IDHash = ID.ComputeHash();
void **Bucket = GetBucketFor(IDHash, Buckets, NumBuckets);
void *Probe = *Bucket;
@@ -342,7 +342,7 @@ FoldingSetImpl::Node
/// InsertNode - Insert the specified node into the folding set, knowing that it
/// is not already in the map. InsertPos must be obtained from
/// FindNodeOrInsertPos.
-void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) {
+void FoldingSetBase::InsertNode(Node *N, void *InsertPos) {
assert(!N->getNextInBucket());
// Do we need to grow the hashtable?
if (NumNodes+1 > capacity()) {
@@ -371,7 +371,7 @@ void FoldingSetImpl::InsertNode(Node *N, void *InsertPos) {
/// RemoveNode - Remove a node from the folding set, returning true if one was
/// removed or false if the node was not in the folding set.
-bool FoldingSetImpl::RemoveNode(Node *N) {
+bool FoldingSetBase::RemoveNode(Node *N) {
// Because each bucket is a circular list, we don't need to compute N's hash
// to remove it.
void *Ptr = N->getNextInBucket();
@@ -412,7 +412,7 @@ bool FoldingSetImpl::RemoveNode(Node *N) {
/// GetOrInsertNode - If there is an existing simple Node exactly
/// equal to the specified node, return it. Otherwise, insert 'N' and it
/// instead.
-FoldingSetImpl::Node *FoldingSetImpl::GetOrInsertNode(FoldingSetImpl::Node *N) {
+FoldingSetBase::Node *FoldingSetBase::GetOrInsertNode(FoldingSetBase::Node *N) {
FoldingSetNodeID ID;
GetNodeProfile(N, ID);
void *IP;
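
The rename from FoldingSetImpl to FoldingSetBase is mechanical; the probe-then-insert protocol implemented by FindNodeOrInsertPos/InsertNode is unchanged. A sketch of that protocol from the client side; IntPairNode is an illustrative node type:

#include "llvm/ADT/FoldingSet.h"

using namespace llvm;

// Minimal node type; Profile() defines the identity used for uniquing.
struct IntPairNode : FoldingSetNode {
  int A, B;
  IntPairNode(int A, int B) : A(A), B(B) {}
  void Profile(FoldingSetNodeID &ID) const {
    ID.AddInteger(A);
    ID.AddInteger(B);
  }
};

IntPairNode *getOrCreate(FoldingSet<IntPairNode> &Set, int A, int B) {
  FoldingSetNodeID ID;
  ID.AddInteger(A);
  ID.AddInteger(B);
  void *InsertPos = nullptr;
  // Probe first, then insert at the token the failed probe returned, so the
  // hash is not recomputed.
  if (IntPairNode *N = Set.FindNodeOrInsertPos(ID, InsertPos))
    return N;
  IntPairNode *N = new IntPairNode(A, B); // leaked in this sketch
  Set.InsertNode(N, InsertPos);
  return N;
}

int main() {
  FoldingSet<IntPairNode> Set;
  IntPairNode *A = getOrCreate(Set, 1, 2);
  IntPairNode *B = getOrCreate(Set, 1, 2);
  return A == B ? 0 : 1; // uniqued: same node both times
}
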
diff --git a/contrib/llvm/lib/Support/FormattedStream.cpp b/contrib/llvm/lib/Support/FormattedStream.cpp
index 2ed71c7..a9f4409 100644
--- a/contrib/llvm/lib/Support/FormattedStream.cpp
+++ b/contrib/llvm/lib/Support/FormattedStream.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -32,6 +32,7 @@ static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *
switch (*Ptr) {
case '\n':
Line += 1;
+ LLVM_FALLTHROUGH;
case '\r':
Column = 0;
break;
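
LLVM_FALLTHROUGH annotates the deliberate '\n' to '\r' fall-through so -Wimplicit-fallthrough stays quiet on newer compilers. The same pattern in isolation; updatePosition() is a simplified stand-in for the function in the hunk above:

#include "llvm/Support/Compiler.h"

// '\n' bumps the line and then deliberately falls into the '\r' case, which
// resets the column either way.
static void updatePosition(unsigned &Line, unsigned &Column, char C) {
  switch (C) {
  case '\n':
    Line += 1;
    LLVM_FALLTHROUGH;
  case '\r':
    Column = 0;
    break;
  default:
    Column += 1;
    break;
  }
}
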
diff --git a/contrib/llvm/lib/Support/GraphWriter.cpp b/contrib/llvm/lib/Support/GraphWriter.cpp
index d0e1d50..e04bd8b 100644
--- a/contrib/llvm/lib/Support/GraphWriter.cpp
+++ b/contrib/llvm/lib/Support/GraphWriter.cpp
@@ -1,4 +1,4 @@
-//===-- GraphWriter.cpp - Implements GraphWriter support routines ---------===//
+//===- GraphWriter.cpp - Implements GraphWriter support routines ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,10 +12,22 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/GraphWriter.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Program.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <system_error>
+#include <string>
+#include <vector>
+
using namespace llvm;
static cl::opt<bool> ViewBackground("view-background", cl::Hidden,
@@ -43,6 +55,7 @@ std::string llvm::DOT::EscapeString(const std::string &Label) {
Str.erase(Str.begin()+i); continue;
default: break;
}
+ LLVM_FALLTHROUGH;
case '{': case '}':
case '<': case '>':
case '|': case '"':
@@ -98,8 +111,10 @@ static bool ExecGraphViewer(StringRef ExecPath, std::vector<const char *> &args,
}
namespace {
+
struct GraphSession {
std::string LogBuffer;
+
bool TryFindProgram(StringRef Names, std::string &ProgramPath) {
raw_string_ostream Log(LogBuffer);
SmallVector<StringRef, 8> parts;
@@ -114,7 +129,8 @@ struct GraphSession {
return false;
}
};
-} // namespace
+
+} // end anonymous namespace
static const char *getProgramName(GraphProgram::Name program) {
switch (program) {
diff --git a/contrib/llvm/lib/Support/Host.cpp b/contrib/llvm/lib/Support/Host.cpp
index d1b4041..5cf0316 100644
--- a/contrib/llvm/lib/Support/Host.cpp
+++ b/contrib/llvm/lib/Support/Host.cpp
@@ -52,25 +52,220 @@
using namespace llvm;
-#if defined(__linux__)
-static ssize_t LLVM_ATTRIBUTE_UNUSED readCpuInfo(void *Buf, size_t Size) {
- // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
- // memory buffer because the 'file' has 0 size (it can be read from only
- // as a stream).
-
- int FD;
- std::error_code EC = sys::fs::openFileForRead("/proc/cpuinfo", FD);
- if (EC) {
- DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << EC.message() << "\n");
- return -1;
+static std::unique_ptr<llvm::MemoryBuffer>
+ LLVM_ATTRIBUTE_UNUSED getProcCpuinfoContent() {
+ llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
+ llvm::MemoryBuffer::getFileAsStream("/proc/cpuinfo");
+ if (std::error_code EC = Text.getError()) {
+ llvm::errs() << "Can't read "
+ << "/proc/cpuinfo: " << EC.message() << "\n";
+ return nullptr;
}
- int Ret = read(FD, Buf, Size);
- int CloseStatus = close(FD);
- if (CloseStatus)
- return -1;
- return Ret;
+ return std::move(*Text);
+}
+
+StringRef sys::detail::getHostCPUNameForPowerPC(
+ const StringRef &ProcCpuinfoContent) {
+ // Access to the Processor Version Register (PVR) on PowerPC is privileged,
+ // and so we must use an operating-system interface to determine the current
+ // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
+ const char *generic = "generic";
+
+ // The cpu line is second (after the 'processor: 0' line), so if this
+ // buffer is too small then something has changed (or is wrong).
+ StringRef::const_iterator CPUInfoStart = ProcCpuinfoContent.begin();
+ StringRef::const_iterator CPUInfoEnd = ProcCpuinfoContent.end();
+
+ StringRef::const_iterator CIP = CPUInfoStart;
+
+ StringRef::const_iterator CPUStart = 0;
+ size_t CPULen = 0;
+
+ // We need to find the first line which starts with cpu, spaces, and a colon.
+ // After the colon, there may be some additional spaces and then the cpu type.
+ while (CIP < CPUInfoEnd && CPUStart == 0) {
+ if (CIP < CPUInfoEnd && *CIP == '\n')
+ ++CIP;
+
+ if (CIP < CPUInfoEnd && *CIP == 'c') {
+ ++CIP;
+ if (CIP < CPUInfoEnd && *CIP == 'p') {
+ ++CIP;
+ if (CIP < CPUInfoEnd && *CIP == 'u') {
+ ++CIP;
+ while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+ ++CIP;
+
+ if (CIP < CPUInfoEnd && *CIP == ':') {
+ ++CIP;
+ while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+ ++CIP;
+
+ if (CIP < CPUInfoEnd) {
+ CPUStart = CIP;
+ while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
+ *CIP != ',' && *CIP != '\n'))
+ ++CIP;
+ CPULen = CIP - CPUStart;
+ }
+ }
+ }
+ }
+ }
+
+ if (CPUStart == 0)
+ while (CIP < CPUInfoEnd && *CIP != '\n')
+ ++CIP;
+ }
+
+ if (CPUStart == 0)
+ return generic;
+
+ return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
+ .Case("604e", "604e")
+ .Case("604", "604")
+ .Case("7400", "7400")
+ .Case("7410", "7400")
+ .Case("7447", "7400")
+ .Case("7455", "7450")
+ .Case("G4", "g4")
+ .Case("POWER4", "970")
+ .Case("PPC970FX", "970")
+ .Case("PPC970MP", "970")
+ .Case("G5", "g5")
+ .Case("POWER5", "g5")
+ .Case("A2", "a2")
+ .Case("POWER6", "pwr6")
+ .Case("POWER7", "pwr7")
+ .Case("POWER8", "pwr8")
+ .Case("POWER8E", "pwr8")
+ .Case("POWER8NVL", "pwr8")
+ .Case("POWER9", "pwr9")
+ .Default(generic);
+}
+
+StringRef sys::detail::getHostCPUNameForARM(
+ const StringRef &ProcCpuinfoContent) {
+ // The cpuid register on arm is not accessible from user space. On Linux,
+ // it is exposed through the /proc/cpuinfo file.
+
+ // Read 32 lines from /proc/cpuinfo, which should contain the CPU part line
+ // in all cases.
+ SmallVector<StringRef, 32> Lines;
+ ProcCpuinfoContent.split(Lines, "\n");
+
+ // Look for the CPU implementer line.
+ StringRef Implementer;
+ StringRef Hardware;
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+ if (Lines[I].startswith("CPU implementer"))
+ Implementer = Lines[I].substr(15).ltrim("\t :");
+ if (Lines[I].startswith("Hardware"))
+ Hardware = Lines[I].substr(8).ltrim("\t :");
+ }
+
+ if (Implementer == "0x41") { // ARM Ltd.
+ // MSM8992/8994 may report the CPU part for the core the kernel happens to
+ // be running on, which is nondeterministic and wrong. Always return
+ // cortex-a53 for these SoCs.
+ if (Hardware.endswith("MSM8994") || Hardware.endswith("MSM8996"))
+ return "cortex-a53";
+
+ // Look for the CPU part line.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("CPU part"))
+ // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+ // values correspond to the "Part number" in the CP15/c0 register. The
+ // contents are specified in the various processor manuals.
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+ .Case("0x926", "arm926ej-s")
+ .Case("0xb02", "mpcore")
+ .Case("0xb36", "arm1136j-s")
+ .Case("0xb56", "arm1156t2-s")
+ .Case("0xb76", "arm1176jz-s")
+ .Case("0xc08", "cortex-a8")
+ .Case("0xc09", "cortex-a9")
+ .Case("0xc0f", "cortex-a15")
+ .Case("0xc20", "cortex-m0")
+ .Case("0xc23", "cortex-m3")
+ .Case("0xc24", "cortex-m4")
+ .Case("0xd04", "cortex-a35")
+ .Case("0xd03", "cortex-a53")
+ .Case("0xd07", "cortex-a57")
+ .Case("0xd08", "cortex-a72")
+ .Case("0xd09", "cortex-a73")
+ .Default("generic");
+ }
+
+ if (Implementer == "0x51") // Qualcomm Technologies, Inc.
+ // Look for the CPU part line.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("CPU part"))
+ // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+ // values correspond to the "Part number" in the CP15/c0 register. The
+ // contents are specified in the various processor manuals.
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+ .Case("0x06f", "krait") // APQ8064
+ .Case("0x201", "kryo")
+ .Case("0x205", "kryo")
+ .Default("generic");
+
+ return "generic";
+}
+
+StringRef sys::detail::getHostCPUNameForS390x(
+ const StringRef &ProcCpuinfoContent) {
+ // STIDP is a privileged operation, so use /proc/cpuinfo instead.
+
+ // The "processor 0:" line comes after a fair amount of other information,
+ // including a cache breakdown, but this should be plenty.
+ SmallVector<StringRef, 32> Lines;
+ ProcCpuinfoContent.split(Lines, "\n");
+
+ // Look for the CPU features.
+ SmallVector<StringRef, 32> CPUFeatures;
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+ if (Lines[I].startswith("features")) {
+ size_t Pos = Lines[I].find(":");
+ if (Pos != StringRef::npos) {
+ Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' ');
+ break;
+ }
+ }
+
+ // We need to check for the presence of vector support independently of
+ // the machine type, since we may only use the vector register set when
+ // supported by the kernel (and hypervisor).
+ bool HaveVectorSupport = false;
+ for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
+ if (CPUFeatures[I] == "vx")
+ HaveVectorSupport = true;
+ }
+
+ // Now check the processor machine type.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+ if (Lines[I].startswith("processor ")) {
+ size_t Pos = Lines[I].find("machine = ");
+ if (Pos != StringRef::npos) {
+ Pos += sizeof("machine = ") - 1;
+ unsigned int Id;
+ if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
+ if (Id >= 3906 && HaveVectorSupport)
+ return "z14";
+ if (Id >= 2964 && HaveVectorSupport)
+ return "z13";
+ if (Id >= 2827)
+ return "zEC12";
+ if (Id >= 2817)
+ return "z196";
+ }
+ }
+ break;
+ }
+ }
+
+ return "generic";
}
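+
+// Worked example (assumed cpuinfo values, not part of this change): a line
+// "processor 0: version = FF, identification = 123456, machine = 2964"
+// together with "vx" in the features line parses Id == 2964 and yields
+// "z13"; without "vx" the same machine id falls through to "zEC12".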
-#endif
#if defined(__i386__) || defined(_M_IX86) || \
defined(__x86_64__) || defined(_M_X64)
@@ -88,11 +283,17 @@ enum ProcessorVendors {
};
enum ProcessorTypes {
- INTEL_ATOM = 1,
+ INTEL_BONNELL = 1,
INTEL_CORE2,
INTEL_COREI7,
AMDFAM10H,
AMDFAM15H,
+ INTEL_SILVERMONT,
+ INTEL_KNL,
+ AMD_BTVER1,
+ AMD_BTVER2,
+ AMDFAM17H,
+ // Entries below this are not in libgcc/compiler-rt.
INTEL_i386,
INTEL_i486,
INTEL_PENTIUM,
@@ -102,16 +303,13 @@ enum ProcessorTypes {
INTEL_PENTIUM_IV,
INTEL_PENTIUM_M,
INTEL_CORE_DUO,
- INTEL_XEONPHI,
INTEL_X86_64,
INTEL_NOCONA,
INTEL_PRESCOTT,
AMD_i486,
AMDPENTIUM,
AMDATHLON,
- AMDFAM14H,
- AMDFAM16H,
- AMDFAM17H,
+ INTEL_GOLDMONT,
CPU_TYPE_MAX
};
@@ -124,33 +322,26 @@ enum ProcessorSubtypes {
AMDFAM10H_ISTANBUL,
AMDFAM15H_BDVER1,
AMDFAM15H_BDVER2,
- INTEL_PENTIUM_MMX,
- INTEL_CORE2_65,
- INTEL_CORE2_45,
+ AMDFAM15H_BDVER3,
+ AMDFAM15H_BDVER4,
+ AMDFAM17H_ZNVER1,
INTEL_COREI7_IVYBRIDGE,
INTEL_COREI7_HASWELL,
INTEL_COREI7_BROADWELL,
INTEL_COREI7_SKYLAKE,
INTEL_COREI7_SKYLAKE_AVX512,
- INTEL_ATOM_BONNELL,
- INTEL_ATOM_SILVERMONT,
- INTEL_KNIGHTS_LANDING,
+ // Entries below this are not in libgcc/compiler-rt.
+ INTEL_PENTIUM_MMX,
+ INTEL_CORE2_65,
+ INTEL_CORE2_45,
AMDPENTIUM_K6,
AMDPENTIUM_K62,
AMDPENTIUM_K63,
AMDPENTIUM_GEODE,
- AMDATHLON_TBIRD,
- AMDATHLON_MP,
+ AMDATHLON_CLASSIC,
AMDATHLON_XP,
+ AMDATHLON_K8,
AMDATHLON_K8SSE3,
- AMDATHLON_OPTERON,
- AMDATHLON_FX,
- AMDATHLON_64,
- AMD_BTVER1,
- AMD_BTVER2,
- AMDFAM15H_BDVER3,
- AMDFAM15H_BDVER4,
- AMDFAM17H_ZNVER1,
CPU_SUBTYPE_MAX
};
@@ -166,9 +357,28 @@ enum ProcessorFeatures {
FEATURE_SSE4_2,
FEATURE_AVX,
FEATURE_AVX2,
- FEATURE_AVX512,
- FEATURE_AVX512SAVE,
- FEATURE_MOVBE,
+ FEATURE_SSE4_A,
+ FEATURE_FMA4,
+ FEATURE_XOP,
+ FEATURE_FMA,
+ FEATURE_AVX512F,
+ FEATURE_BMI,
+ FEATURE_BMI2,
+ FEATURE_AES,
+ FEATURE_PCLMUL,
+ FEATURE_AVX512VL,
+ FEATURE_AVX512BW,
+ FEATURE_AVX512DQ,
+ FEATURE_AVX512CD,
+ FEATURE_AVX512ER,
+ FEATURE_AVX512PF,
+ FEATURE_AVX512VBMI,
+ FEATURE_AVX512IFMA,
+ FEATURE_AVX5124VNNIW,
+ FEATURE_AVX5124FMAPS,
+ FEATURE_AVX512VPOPCNTDQ,
+  // Only one free bit is left in the first 32 features.
+ FEATURE_MOVBE = 32,
FEATURE_ADX,
FEATURE_EM64T
};
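+
+// Sketch (illustrative): features numbered 32 and above live in a second
+// 32-bit word, so callers rebase the index by 32 and mask the second word:
+//
+//   bool HasMovbe = Features2 & (1 << (FEATURE_MOVBE - 32));
+//   bool HasSse42 = Features & (1 << FEATURE_SSE4_2);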
@@ -212,7 +422,6 @@ static bool isCpuIdSupported() {
/// the specified arguments. If we can't run cpuid on the host, return true.
static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
unsigned *rECX, unsigned *rEDX) {
-#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
#if defined(__GNUC__) || defined(__clang__)
#if defined(__x86_64__)
// gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
@@ -222,14 +431,16 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
"xchgq\t%%rbx, %%rsi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value));
+ return false;
#elif defined(__i386__)
__asm__("movl\t%%ebx, %%esi\n\t"
"cpuid\n\t"
"xchgl\t%%ebx, %%esi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value));
+ return false;
#else
- assert(0 && "This method is defined only for x86.");
+ return true;
#endif
#elif defined(_MSC_VER)
// The MSVC intrinsic is portable across x86 and x64.
@@ -239,7 +450,6 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
*rEBX = registers[1];
*rECX = registers[2];
*rEDX = registers[3];
-#endif
return false;
#else
return true;
@@ -252,55 +462,40 @@ static bool getX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
static bool getX86CpuIDAndInfoEx(unsigned value, unsigned subleaf,
unsigned *rEAX, unsigned *rEBX, unsigned *rECX,
unsigned *rEDX) {
-#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
-#if defined(__x86_64__) || defined(_M_X64)
#if defined(__GNUC__) || defined(__clang__)
- // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
+#if defined(__x86_64__)
+ // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
// FIXME: should we save this for Clang?
__asm__("movq\t%%rbx, %%rsi\n\t"
"cpuid\n\t"
"xchgq\t%%rbx, %%rsi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value), "c"(subleaf));
-#elif defined(_MSC_VER)
- int registers[4];
- __cpuidex(registers, value, subleaf);
- *rEAX = registers[0];
- *rEBX = registers[1];
- *rECX = registers[2];
- *rEDX = registers[3];
-#endif
-#elif defined(__i386__) || defined(_M_IX86)
-#if defined(__GNUC__) || defined(__clang__)
+ return false;
+#elif defined(__i386__)
__asm__("movl\t%%ebx, %%esi\n\t"
"cpuid\n\t"
"xchgl\t%%ebx, %%esi\n\t"
: "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
: "a"(value), "c"(subleaf));
-#elif defined(_MSC_VER)
- __asm {
- mov eax,value
- mov ecx,subleaf
- cpuid
- mov esi,rEAX
- mov dword ptr [esi],eax
- mov esi,rEBX
- mov dword ptr [esi],ebx
- mov esi,rECX
- mov dword ptr [esi],ecx
- mov esi,rEDX
- mov dword ptr [esi],edx
- }
-#endif
+ return false;
#else
- assert(0 && "This method is defined only for x86.");
+ return true;
#endif
+#elif defined(_MSC_VER)
+ int registers[4];
+ __cpuidex(registers, value, subleaf);
+ *rEAX = registers[0];
+ *rEBX = registers[1];
+ *rECX = registers[2];
+ *rEDX = registers[3];
return false;
#else
return true;
#endif
}
+// Read extended control register 0 (XCR0). Used to detect features such as
+// AVX.
static bool getX86XCR0(unsigned *rEAX, unsigned *rEDX) {
#if defined(__GNUC__) || defined(__clang__)
// Check xgetbv; this uses a .byte sequence instead of the instruction
@@ -332,9 +527,10 @@ static void detectX86FamilyModel(unsigned EAX, unsigned *Family,
}
static void
-getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
- unsigned int Brand_id, unsigned int Features,
- unsigned *Type, unsigned *Subtype) {
+getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
+ unsigned Brand_id, unsigned Features,
+ unsigned Features2, unsigned *Type,
+ unsigned *Subtype) {
if (Brand_id != 0)
return;
switch (Family) {
@@ -487,12 +683,7 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
// Skylake Xeon:
case 0x55:
*Type = INTEL_COREI7;
- // Check that we really have AVX512
- if (Features & (1 << FEATURE_AVX512)) {
- *Subtype = INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512"
- } else {
- *Subtype = INTEL_COREI7_SKYLAKE; // "skylake"
- }
+ *Subtype = INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512"
break;
case 0x1c: // Most 45 nm Intel Atom processors
@@ -500,8 +691,7 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
case 0x27: // 32 nm Atom Medfield
case 0x35: // 32 nm Atom Midview
case 0x36: // 32 nm Atom Midview
- *Type = INTEL_ATOM;
- *Subtype = INTEL_ATOM_BONNELL;
+ *Type = INTEL_BONNELL;
break; // "bonnell"
// Atom Silvermont codes from the Intel software optimization guide.
@@ -511,22 +701,23 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
case 0x5a:
case 0x5d:
case 0x4c: // really airmont
- *Type = INTEL_ATOM;
- *Subtype = INTEL_ATOM_SILVERMONT;
+ *Type = INTEL_SILVERMONT;
break; // "silvermont"
-
+ // Goldmont:
+ case 0x5c:
+ case 0x5f:
+ *Type = INTEL_GOLDMONT;
+ break; // "goldmont"
case 0x57:
- *Type = INTEL_XEONPHI; // knl
- *Subtype = INTEL_KNIGHTS_LANDING;
+ *Type = INTEL_KNL; // knl
break;
default: // Unknown family 6 CPU, try to guess.
- if (Features & (1 << FEATURE_AVX512)) {
- *Type = INTEL_XEONPHI; // knl
- *Subtype = INTEL_KNIGHTS_LANDING;
+ if (Features & (1 << FEATURE_AVX512F)) {
+ *Type = INTEL_KNL; // knl
break;
}
- if (Features & (1 << FEATURE_ADX)) {
+ if (Features2 & (1 << (FEATURE_ADX - 32))) {
*Type = INTEL_COREI7;
*Subtype = INTEL_COREI7_BROADWELL;
break;
@@ -542,9 +733,8 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
break;
}
if (Features & (1 << FEATURE_SSE4_2)) {
- if (Features & (1 << FEATURE_MOVBE)) {
- *Type = INTEL_ATOM;
- *Subtype = INTEL_ATOM_SILVERMONT;
+ if (Features2 & (1 << (FEATURE_MOVBE - 32))) {
+ *Type = INTEL_SILVERMONT;
} else {
*Type = INTEL_COREI7;
*Subtype = INTEL_COREI7_NEHALEM;
@@ -557,16 +747,15 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
break;
}
if (Features & (1 << FEATURE_SSSE3)) {
- if (Features & (1 << FEATURE_MOVBE)) {
- *Type = INTEL_ATOM;
- *Subtype = INTEL_ATOM_BONNELL; // "bonnell"
+ if (Features2 & (1 << (FEATURE_MOVBE - 32))) {
+ *Type = INTEL_BONNELL; // "bonnell"
} else {
*Type = INTEL_CORE2; // "core2"
*Subtype = INTEL_CORE2_65;
}
break;
}
- if (Features & (1 << FEATURE_EM64T)) {
+ if (Features2 & (1 << (FEATURE_EM64T - 32))) {
*Type = INTEL_X86_64;
break; // x86-64
}
@@ -597,8 +786,8 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
// Intel Xeon processor, Intel Xeon processor MP, Intel Celeron
// processor, and Mobile Intel Celeron processor. All processors
// are model 02h and manufactured using the 0.13 micron process.
- *Type =
- ((Features & (1 << FEATURE_EM64T)) ? INTEL_X86_64 : INTEL_PENTIUM_IV);
+ *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64
+ : INTEL_PENTIUM_IV);
break;
case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D
@@ -612,13 +801,13 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
// Extreme Edition, Intel Xeon processor, Intel Xeon processor
// MP, Intel Celeron D processor. All processors are model 06h
// and manufactured using the 65 nm process.
- *Type =
- ((Features & (1 << FEATURE_EM64T)) ? INTEL_NOCONA : INTEL_PRESCOTT);
+ *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_NOCONA
+ : INTEL_PRESCOTT);
break;
default:
- *Type =
- ((Features & (1 << FEATURE_EM64T)) ? INTEL_X86_64 : INTEL_PENTIUM_IV);
+ *Type = ((Features2 & (1 << (FEATURE_EM64T - 32))) ? INTEL_X86_64
+ : INTEL_PENTIUM_IV);
break;
}
break;
@@ -628,10 +817,8 @@ getIntelProcessorTypeAndSubtype(unsigned int Family, unsigned int Model,
}
}
-static void getAMDProcessorTypeAndSubtype(unsigned int Family,
- unsigned int Model,
- unsigned int Features,
- unsigned *Type,
+static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
+ unsigned Features, unsigned *Type,
unsigned *Subtype) {
// FIXME: this poorly matches the generated SubtargetFeatureKV table. There
// appears to be no way to generate the wide variety of AMD-specific targets
@@ -661,38 +848,20 @@ static void getAMDProcessorTypeAndSubtype(unsigned int Family,
break;
case 6:
*Type = AMDATHLON;
- switch (Model) {
- case 4:
- *Subtype = AMDATHLON_TBIRD;
- break; // "athlon-tbird"
- case 6:
- case 7:
- case 8:
- *Subtype = AMDATHLON_MP;
- break; // "athlon-mp"
- case 10:
+ if (Features & (1 << FEATURE_SSE)) {
*Subtype = AMDATHLON_XP;
break; // "athlon-xp"
}
- break;
+ *Subtype = AMDATHLON_CLASSIC;
+ break; // "athlon"
case 15:
*Type = AMDATHLON;
if (Features & (1 << FEATURE_SSE3)) {
*Subtype = AMDATHLON_K8SSE3;
break; // "k8-sse3"
}
- switch (Model) {
- case 1:
- *Subtype = AMDATHLON_OPTERON;
- break; // "opteron"
- case 5:
- *Subtype = AMDATHLON_FX;
- break; // "athlon-fx"; also opteron
- default:
- *Subtype = AMDATHLON_64;
- break; // "athlon64"
- }
- break;
+ *Subtype = AMDATHLON_K8;
+ break; // "k8"
case 16:
*Type = AMDFAM10H; // "amdfam10"
switch (Model) {
@@ -708,19 +877,13 @@ static void getAMDProcessorTypeAndSubtype(unsigned int Family,
}
break;
case 20:
- *Type = AMDFAM14H;
- *Subtype = AMD_BTVER1;
+ *Type = AMD_BTVER1;
break; // "btver1"
case 21:
*Type = AMDFAM15H;
- if (!(Features &
- (1 << FEATURE_AVX))) { // If no AVX support, provide a sane fallback.
- *Subtype = AMD_BTVER1;
- break; // "btver1"
- }
- if (Model >= 0x50 && Model <= 0x6f) {
+ if (Model >= 0x60 && Model <= 0x7f) {
*Subtype = AMDFAM15H_BDVER4;
- break; // "bdver4"; 50h-6Fh: Excavator
+ break; // "bdver4"; 60h-7Fh: Excavator
}
if (Model >= 0x30 && Model <= 0x3f) {
*Subtype = AMDFAM15H_BDVER3;
@@ -736,39 +899,52 @@ static void getAMDProcessorTypeAndSubtype(unsigned int Family,
}
break;
case 22:
- *Type = AMDFAM16H;
- if (!(Features &
- (1 << FEATURE_AVX))) { // If no AVX support provide a sane fallback.
- *Subtype = AMD_BTVER1;
- break; // "btver1";
- }
- *Subtype = AMD_BTVER2;
+ *Type = AMD_BTVER2;
break; // "btver2"
case 23:
*Type = AMDFAM17H;
- if (Features & (1 << FEATURE_ADX)) {
- *Subtype = AMDFAM17H_ZNVER1;
- break; // "znver1"
- }
- *Subtype = AMD_BTVER1;
+ *Subtype = AMDFAM17H_ZNVER1;
break;
default:
break; // "generic"
}
}
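+
+// Worked example (assumed inputs, not part of this change): Family == 21 with
+// Model == 0x62 selects AMDFAM15H/AMDFAM15H_BDVER4 above, which
+// getHostCPUName() below maps to "bdver4".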
-static unsigned getAvailableFeatures(unsigned int ECX, unsigned int EDX,
- unsigned MaxLeaf) {
+static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
+ unsigned *FeaturesOut,
+ unsigned *Features2Out) {
unsigned Features = 0;
- unsigned int EAX, EBX;
- Features |= (((EDX >> 23) & 1) << FEATURE_MMX);
- Features |= (((EDX >> 25) & 1) << FEATURE_SSE);
- Features |= (((EDX >> 26) & 1) << FEATURE_SSE2);
- Features |= (((ECX >> 0) & 1) << FEATURE_SSE3);
- Features |= (((ECX >> 9) & 1) << FEATURE_SSSE3);
- Features |= (((ECX >> 19) & 1) << FEATURE_SSE4_1);
- Features |= (((ECX >> 20) & 1) << FEATURE_SSE4_2);
- Features |= (((ECX >> 22) & 1) << FEATURE_MOVBE);
+ unsigned Features2 = 0;
+ unsigned EAX, EBX;
+
+ if ((EDX >> 15) & 1)
+ Features |= 1 << FEATURE_CMOV;
+ if ((EDX >> 23) & 1)
+ Features |= 1 << FEATURE_MMX;
+ if ((EDX >> 25) & 1)
+ Features |= 1 << FEATURE_SSE;
+ if ((EDX >> 26) & 1)
+ Features |= 1 << FEATURE_SSE2;
+
+ if ((ECX >> 0) & 1)
+ Features |= 1 << FEATURE_SSE3;
+ if ((ECX >> 1) & 1)
+ Features |= 1 << FEATURE_PCLMUL;
+ if ((ECX >> 9) & 1)
+ Features |= 1 << FEATURE_SSSE3;
+ if ((ECX >> 12) & 1)
+ Features |= 1 << FEATURE_FMA;
+ if ((ECX >> 19) & 1)
+ Features |= 1 << FEATURE_SSE4_1;
+ if ((ECX >> 20) & 1)
+ Features |= 1 << FEATURE_SSE4_2;
+ if ((ECX >> 23) & 1)
+ Features |= 1 << FEATURE_POPCNT;
+ if ((ECX >> 25) & 1)
+ Features |= 1 << FEATURE_AES;
+
+ if ((ECX >> 22) & 1)
+ Features2 |= 1 << (FEATURE_MOVBE - 32);
// If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
// indicates that the AVX registers will be saved and restored on context
@@ -777,20 +953,65 @@ static unsigned getAvailableFeatures(unsigned int ECX, unsigned int EDX,
bool HasAVX = ((ECX & AVXBits) == AVXBits) && !getX86XCR0(&EAX, &EDX) &&
((EAX & 0x6) == 0x6);
bool HasAVX512Save = HasAVX && ((EAX & 0xe0) == 0xe0);
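+  // (In XCR0, bit 1 covers SSE state and bit 2 AVX state, so 0x6 requires
+  // both; 0xe0 additionally requires the AVX-512 opmask and upper-ZMM state.)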
+
+ if (HasAVX)
+ Features |= 1 << FEATURE_AVX;
+
bool HasLeaf7 =
MaxLeaf >= 0x7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
- bool HasADX = HasLeaf7 && ((EBX >> 19) & 1);
- bool HasAVX2 = HasAVX && HasLeaf7 && (EBX & 0x20);
- bool HasAVX512 = HasLeaf7 && HasAVX512Save && ((EBX >> 16) & 1);
- Features |= (HasAVX << FEATURE_AVX);
- Features |= (HasAVX2 << FEATURE_AVX2);
- Features |= (HasAVX512 << FEATURE_AVX512);
- Features |= (HasAVX512Save << FEATURE_AVX512SAVE);
- Features |= (HasADX << FEATURE_ADX);
-
- getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
- Features |= (((EDX >> 29) & 0x1) << FEATURE_EM64T);
- return Features;
+
+ if (HasLeaf7 && ((EBX >> 3) & 1))
+ Features |= 1 << FEATURE_BMI;
+ if (HasLeaf7 && ((EBX >> 5) & 1) && HasAVX)
+ Features |= 1 << FEATURE_AVX2;
+ if (HasLeaf7 && ((EBX >> 9) & 1))
+ Features |= 1 << FEATURE_BMI2;
+ if (HasLeaf7 && ((EBX >> 16) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512F;
+ if (HasLeaf7 && ((EBX >> 17) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512DQ;
+ if (HasLeaf7 && ((EBX >> 19) & 1))
+ Features2 |= 1 << (FEATURE_ADX - 32);
+ if (HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512IFMA;
+ if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512PF;
+ if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512ER;
+ if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512CD;
+ if (HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512BW;
+ if (HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512VL;
+
+ if (HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512VBMI;
+ if (HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX512VPOPCNTDQ;
+
+ if (HasLeaf7 && ((EDX >> 2) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX5124VNNIW;
+ if (HasLeaf7 && ((EDX >> 3) & 1) && HasAVX512Save)
+ Features |= 1 << FEATURE_AVX5124FMAPS;
+
+ unsigned MaxExtLevel;
+ getX86CpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
+
+ bool HasExtLeaf1 = MaxExtLevel >= 0x80000001 &&
+ !getX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ if (HasExtLeaf1 && ((ECX >> 6) & 1))
+ Features |= 1 << FEATURE_SSE4_A;
+ if (HasExtLeaf1 && ((ECX >> 11) & 1))
+ Features |= 1 << FEATURE_XOP;
+ if (HasExtLeaf1 && ((ECX >> 16) & 1))
+ Features |= 1 << FEATURE_FMA4;
+
+ if (HasExtLeaf1 && ((EDX >> 29) & 1))
+ Features2 |= 1 << (FEATURE_EM64T - 32);
+
+ *FeaturesOut = Features;
+ *Features2Out = Features2;
}
StringRef sys::getHostCPUName() {
@@ -805,23 +1026,22 @@ StringRef sys::getHostCPUName() {
if(!isCpuIdSupported())
return "generic";
#endif
- if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX))
- return "generic";
- if (getX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+ if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1)
return "generic";
+ getX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
unsigned Brand_id = EBX & 0xff;
unsigned Family = 0, Model = 0;
- unsigned Features = 0;
+ unsigned Features = 0, Features2 = 0;
detectX86FamilyModel(EAX, &Family, &Model);
- Features = getAvailableFeatures(ECX, EDX, MaxLeaf);
+ getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2);
unsigned Type;
unsigned Subtype;
if (Vendor == SIG_INTEL) {
- getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features, &Type,
- &Subtype);
+ getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features,
+ Features2, &Type, &Subtype);
switch (Type) {
case INTEL_i386:
return "i386";
@@ -850,7 +1070,7 @@ StringRef sys::getHostCPUName() {
case INTEL_CORE2_45:
return "penryn";
default:
- return "core2";
+ llvm_unreachable("Unexpected subtype!");
}
case INTEL_COREI7:
switch (Subtype) {
@@ -871,19 +1091,16 @@ StringRef sys::getHostCPUName() {
case INTEL_COREI7_SKYLAKE_AVX512:
return "skylake-avx512";
default:
- return "corei7";
- }
- case INTEL_ATOM:
- switch (Subtype) {
- case INTEL_ATOM_BONNELL:
- return "bonnell";
- case INTEL_ATOM_SILVERMONT:
- return "silvermont";
- default:
- return "atom";
+ llvm_unreachable("Unexpected subtype!");
}
- case INTEL_XEONPHI:
- return "knl"; /*update for more variants added*/
+ case INTEL_BONNELL:
+ return "bonnell";
+ case INTEL_SILVERMONT:
+ return "silvermont";
+ case INTEL_GOLDMONT:
+ return "goldmont";
+ case INTEL_KNL:
+ return "knl";
case INTEL_X86_64:
return "x86-64";
case INTEL_NOCONA:
@@ -891,7 +1108,7 @@ StringRef sys::getHostCPUName() {
case INTEL_PRESCOTT:
return "prescott";
default:
- return "generic";
+ break;
}
} else if (Vendor == SIG_AMD) {
getAMDProcessorTypeAndSubtype(Family, Model, Features, &Type, &Subtype);
@@ -913,31 +1130,24 @@ StringRef sys::getHostCPUName() {
}
case AMDATHLON:
switch (Subtype) {
- case AMDATHLON_TBIRD:
- return "athlon-tbird";
- case AMDATHLON_MP:
- return "athlon-mp";
+ case AMDATHLON_CLASSIC:
+ return "athlon";
case AMDATHLON_XP:
return "athlon-xp";
+ case AMDATHLON_K8:
+ return "k8";
case AMDATHLON_K8SSE3:
return "k8-sse3";
- case AMDATHLON_OPTERON:
- return "opteron";
- case AMDATHLON_FX:
- return "athlon-fx";
- case AMDATHLON_64:
- return "athlon64";
default:
- return "athlon";
+ llvm_unreachable("Unexpected subtype!");
}
case AMDFAM10H:
- if(Subtype == AMDFAM10H_BARCELONA)
- return "barcelona";
return "amdfam10";
- case AMDFAM14H:
+ case AMD_BTVER1:
return "btver1";
case AMDFAM15H:
switch (Subtype) {
+ default: // There are gaps in the subtype detection.
case AMDFAM15H_BDVER1:
return "bdver1";
case AMDFAM15H_BDVER2:
@@ -946,31 +1156,13 @@ StringRef sys::getHostCPUName() {
return "bdver3";
case AMDFAM15H_BDVER4:
return "bdver4";
- case AMD_BTVER1:
- return "btver1";
- default:
- return "amdfam15";
- }
- case AMDFAM16H:
- switch (Subtype) {
- case AMD_BTVER1:
- return "btver1";
- case AMD_BTVER2:
- return "btver2";
- default:
- return "amdfam16";
}
+ case AMD_BTVER2:
+ return "btver2";
case AMDFAM17H:
- switch (Subtype) {
- case AMD_BTVER1:
- return "btver1";
- case AMDFAM17H_ZNVER1:
- return "znver1";
- default:
- return "amdfam17";
- }
+ return "znver1";
default:
- return "generic";
+ break;
}
}
return "generic";
@@ -1020,201 +1212,21 @@ StringRef sys::getHostCPUName() {
}
#elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__))
StringRef sys::getHostCPUName() {
- // Access to the Processor Version Register (PVR) on PowerPC is privileged,
- // and so we must use an operating-system interface to determine the current
- // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
- const char *generic = "generic";
-
- // The cpu line is second (after the 'processor: 0' line), so if this
- // buffer is too small then something has changed (or is wrong).
- char buffer[1024];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
- return generic;
-
- const char *CPUInfoStart = buffer;
- const char *CPUInfoEnd = buffer + CPUInfoSize;
-
- const char *CIP = CPUInfoStart;
-
- const char *CPUStart = 0;
- size_t CPULen = 0;
-
- // We need to find the first line which starts with cpu, spaces, and a colon.
- // After the colon, there may be some additional spaces and then the cpu type.
- while (CIP < CPUInfoEnd && CPUStart == 0) {
- if (CIP < CPUInfoEnd && *CIP == '\n')
- ++CIP;
-
- if (CIP < CPUInfoEnd && *CIP == 'c') {
- ++CIP;
- if (CIP < CPUInfoEnd && *CIP == 'p') {
- ++CIP;
- if (CIP < CPUInfoEnd && *CIP == 'u') {
- ++CIP;
- while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
- ++CIP;
-
- if (CIP < CPUInfoEnd && *CIP == ':') {
- ++CIP;
- while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
- ++CIP;
-
- if (CIP < CPUInfoEnd) {
- CPUStart = CIP;
- while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
- *CIP != ',' && *CIP != '\n'))
- ++CIP;
- CPULen = CIP - CPUStart;
- }
- }
- }
- }
- }
-
- if (CPUStart == 0)
- while (CIP < CPUInfoEnd && *CIP != '\n')
- ++CIP;
- }
-
- if (CPUStart == 0)
- return generic;
-
- return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
- .Case("604e", "604e")
- .Case("604", "604")
- .Case("7400", "7400")
- .Case("7410", "7400")
- .Case("7447", "7400")
- .Case("7455", "7450")
- .Case("G4", "g4")
- .Case("POWER4", "970")
- .Case("PPC970FX", "970")
- .Case("PPC970MP", "970")
- .Case("G5", "g5")
- .Case("POWER5", "g5")
- .Case("A2", "a2")
- .Case("POWER6", "pwr6")
- .Case("POWER7", "pwr7")
- .Case("POWER8", "pwr8")
- .Case("POWER8E", "pwr8")
- .Case("POWER8NVL", "pwr8")
- .Case("POWER9", "pwr9")
- .Default(generic);
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ const StringRef& Content = P ? P->getBuffer() : "";
+ return detail::getHostCPUNameForPowerPC(Content);
}
-#elif defined(__linux__) && defined(__arm__)
+#elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
StringRef sys::getHostCPUName() {
- // The cpuid register on arm is not accessible from user space. On Linux,
- // it is exposed through the /proc/cpuinfo file.
-
- // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line
- // in all cases.
- char buffer[1024];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
- return "generic";
-
- StringRef Str(buffer, CPUInfoSize);
-
- SmallVector<StringRef, 32> Lines;
- Str.split(Lines, "\n");
-
- // Look for the CPU implementer line.
- StringRef Implementer;
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("CPU implementer"))
- Implementer = Lines[I].substr(15).ltrim("\t :");
-
- if (Implementer == "0x41") // ARM Ltd.
- // Look for the CPU part line.
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("CPU part"))
- // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
- // values correspond to the "Part number" in the CP15/c0 register. The
- // contents are specified in the various processor manuals.
- return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
- .Case("0x926", "arm926ej-s")
- .Case("0xb02", "mpcore")
- .Case("0xb36", "arm1136j-s")
- .Case("0xb56", "arm1156t2-s")
- .Case("0xb76", "arm1176jz-s")
- .Case("0xc08", "cortex-a8")
- .Case("0xc09", "cortex-a9")
- .Case("0xc0f", "cortex-a15")
- .Case("0xc20", "cortex-m0")
- .Case("0xc23", "cortex-m3")
- .Case("0xc24", "cortex-m4")
- .Default("generic");
-
- if (Implementer == "0x51") // Qualcomm Technologies, Inc.
- // Look for the CPU part line.
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("CPU part"))
- // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
- // values correspond to the "Part number" in the CP15/c0 register. The
- // contents are specified in the various processor manuals.
- return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
- .Case("0x06f", "krait") // APQ8064
- .Default("generic");
-
- return "generic";
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ const StringRef& Content = P ? P->getBuffer() : "";
+ return detail::getHostCPUNameForARM(Content);
}
#elif defined(__linux__) && defined(__s390x__)
StringRef sys::getHostCPUName() {
- // STIDP is a privileged operation, so use /proc/cpuinfo instead.
-
- // The "processor 0:" line comes after a fair amount of other information,
- // including a cache breakdown, but this should be plenty.
- char buffer[2048];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
- return "generic";
-
- StringRef Str(buffer, CPUInfoSize);
- SmallVector<StringRef, 32> Lines;
- Str.split(Lines, "\n");
-
- // Look for the CPU features.
- SmallVector<StringRef, 32> CPUFeatures;
- for (unsigned I = 0, E = Lines.size(); I != E; ++I)
- if (Lines[I].startswith("features")) {
- size_t Pos = Lines[I].find(":");
- if (Pos != StringRef::npos) {
- Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' ');
- break;
- }
- }
-
- // We need to check for the presence of vector support independently of
- // the machine type, since we may only use the vector register set when
- // supported by the kernel (and hypervisor).
- bool HaveVectorSupport = false;
- for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
- if (CPUFeatures[I] == "vx")
- HaveVectorSupport = true;
- }
-
- // Now check the processor machine type.
- for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
- if (Lines[I].startswith("processor ")) {
- size_t Pos = Lines[I].find("machine = ");
- if (Pos != StringRef::npos) {
- Pos += sizeof("machine = ") - 1;
- unsigned int Id;
- if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
- if (Id >= 2964 && HaveVectorSupport)
- return "z13";
- if (Id >= 2827)
- return "zEC12";
- if (Id >= 2817)
- return "z196";
- }
- }
- break;
- }
- }
-
- return "generic";
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ const StringRef& Content = P ? P->getBuffer() : "";
+ return detail::getHostCPUNameForS390x(Content);
}
#else
StringRef sys::getHostCPUName() { return "generic"; }
@@ -1232,6 +1244,7 @@ static int computeHostNumPhysicalCores() {
if (std::error_code EC = Text.getError()) {
llvm::errs() << "Can't read "
<< "/proc/cpuinfo: " << EC.message() << "\n";
+ return -1;
}
SmallVector<StringRef, 8> strs;
(*Text)->getBuffer().split(strs, "\n", /*MaxSplit=*/-1,
@@ -1349,10 +1362,15 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["sse4a"] = HasExtLeaf1 && ((ECX >> 6) & 1);
Features["prfchw"] = HasExtLeaf1 && ((ECX >> 8) & 1);
Features["xop"] = HasExtLeaf1 && ((ECX >> 11) & 1) && HasAVXSave;
+ Features["lwp"] = HasExtLeaf1 && ((ECX >> 15) & 1);
Features["fma4"] = HasExtLeaf1 && ((ECX >> 16) & 1) && HasAVXSave;
Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1);
Features["mwaitx"] = HasExtLeaf1 && ((ECX >> 29) & 1);
+  bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 &&
+                     !getX86CpuIDAndInfoEx(0x80000008, 0x0, &EAX, &EBX, &ECX, &EDX);
+ Features["clzero"] = HasExtLeaf8 && ((EBX >> 0) & 1);
+
bool HasLeaf7 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
@@ -1362,14 +1380,10 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["fsgsbase"] = HasLeaf7 && ((EBX >> 0) & 1);
Features["sgx"] = HasLeaf7 && ((EBX >> 2) & 1);
Features["bmi"] = HasLeaf7 && ((EBX >> 3) & 1);
- Features["hle"] = HasLeaf7 && ((EBX >> 4) & 1);
Features["bmi2"] = HasLeaf7 && ((EBX >> 8) & 1);
- Features["invpcid"] = HasLeaf7 && ((EBX >> 10) & 1);
Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1);
Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1);
Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1);
- Features["smap"] = HasLeaf7 && ((EBX >> 20) & 1);
- Features["pcommit"] = HasLeaf7 && ((EBX >> 22) & 1);
Features["clflushopt"] = HasLeaf7 && ((EBX >> 23) & 1);
Features["clwb"] = HasLeaf7 && ((EBX >> 24) & 1);
Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
@@ -1386,6 +1400,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["prefetchwt1"] = HasLeaf7 && (ECX & 1);
Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save;
+ Features["avx512vpopcntdq"] = HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save;
// Enable protection keys
Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
@@ -1401,17 +1416,12 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
}
#elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
- // Read 1024 bytes from /proc/cpuinfo, which should contain the Features line
- // in all cases.
- char buffer[1024];
- ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
- if (CPUInfoSize == -1)
+ std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+ if (!P)
return false;
- StringRef Str(buffer, CPUInfoSize);
-
SmallVector<StringRef, 32> Lines;
- Str.split(Lines, "\n");
+ P->getBuffer().split(Lines, "\n");
SmallVector<StringRef, 32> CPUFeatures;
@@ -1475,7 +1485,8 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) { return false; }
#endif
std::string sys::getProcessTriple() {
- Triple PT(Triple::normalize(LLVM_HOST_TRIPLE));
+ std::string TargetTripleString = updateTripleOSVersion(LLVM_HOST_TRIPLE);
+ Triple PT(Triple::normalize(TargetTripleString));
if (sizeof(void *) == 8 && PT.isArch32Bit())
PT = PT.get64BitArchVariant();
diff --git a/contrib/llvm/lib/Support/LockFileManager.cpp b/contrib/llvm/lib/Support/LockFileManager.cpp
index 444aaa3..3ee3af7 100644
--- a/contrib/llvm/lib/Support/LockFileManager.cpp
+++ b/contrib/llvm/lib/Support/LockFileManager.cpp
@@ -15,15 +15,15 @@
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Signals.h"
+#include "llvm/Support/raw_ostream.h"
#include <cerrno>
#include <ctime>
#include <memory>
-#include <tuple>
-#include <system_error>
#include <sys/stat.h>
#include <sys/types.h>
+#include <system_error>
+#include <tuple>
#if LLVM_ON_WIN32
#include <windows.h>
#endif
@@ -304,9 +304,9 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
Interval.tv_sec = 0;
Interval.tv_nsec = 1000000;
#endif
- // Don't wait more than five minutes per iteration. Total timeout for the file
- // to appear is ~8.5 mins.
- const unsigned MaxSeconds = 5*60;
+ // Don't wait more than 40s per iteration. Total timeout for the file
+ // to appear is ~1.5 minutes.
+ const unsigned MaxSeconds = 40;
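+  // (The wait interval doubles from one millisecond up to this cap, so the
+  // doubling series sums to roughly twice the cap, hence the ~1.5 minutes.)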
do {
// Sleep for the designated interval, to allow the owning process time to
// finish up and remove the lock file.
diff --git a/contrib/llvm/lib/Support/LowLevelType.cpp b/contrib/llvm/lib/Support/LowLevelType.cpp
new file mode 100644
index 0000000..0ee3f1d
--- /dev/null
+++ b/contrib/llvm/lib/Support/LowLevelType.cpp
@@ -0,0 +1,56 @@
+//===-- llvm/Support/LowLevelType.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the more header-heavy bits of the LLT class to
+/// avoid polluting users' namespaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+LLT::LLT(MVT VT) {
+ if (VT.isVector()) {
+ init(/*isPointer=*/false, VT.getVectorNumElements() > 1,
+ VT.getVectorNumElements(), VT.getVectorElementType().getSizeInBits(),
+ /*AddressSpace=*/0);
+ } else if (VT.isValid()) {
+ // Aggregates are no different from real scalars as far as GlobalISel is
+ // concerned.
+ assert(VT.getSizeInBits() != 0 && "invalid zero-sized type");
+ init(/*isPointer=*/false, /*isVector=*/false, /*NumElements=*/0,
+ VT.getSizeInBits(), /*AddressSpace=*/0);
+ } else {
+ IsPointer = false;
+ IsVector = false;
+ RawData = 0;
+ }
+}
+
+void LLT::print(raw_ostream &OS) const {
+ if (isVector())
+ OS << "<" << getNumElements() << " x " << getElementType() << ">";
+ else if (isPointer())
+ OS << "p" << getAddressSpace();
+ else if (isValid()) {
+ assert(isScalar() && "unexpected type");
+ OS << "s" << getScalarSizeInBits();
+ } else
+ llvm_unreachable("trying to print an invalid type");
+}
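+
+// Illustrative outputs of the printer above (assuming the usual LLT factory
+// functions):
+//   LLT::vector(2, 32) prints as "<2 x s32>"
+//   LLT::pointer(0, 64) prints as "p0"
+//   LLT::scalar(64)     prints as "s64"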
+
+const constexpr LLT::BitFieldInfo LLT::ScalarSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerAddressSpaceFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::VectorElementsFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::VectorSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerVectorElementsFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerVectorSizeFieldInfo;
+const constexpr LLT::BitFieldInfo LLT::PointerVectorAddressSpaceFieldInfo;
diff --git a/contrib/llvm/lib/Support/MD5.cpp b/contrib/llvm/lib/Support/MD5.cpp
index 942571e..545a64c 100644
--- a/contrib/llvm/lib/Support/MD5.cpp
+++ b/contrib/llvm/lib/Support/MD5.cpp
@@ -37,10 +37,14 @@
* compile-time configuration.
*/
+#include "llvm/Support/MD5.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
+#include <array>
+#include <cstdint>
#include <cstring>
// The basic MD5 functions.
@@ -68,7 +72,7 @@
((MD5_u32plus) ptr[(n) * 4 + 3] << 24))
#define GET(n) (block[(n)])
-namespace llvm {
+using namespace llvm;
/// \brief This processes one or more 64-byte data blocks, but does NOT update
/// the bit counters. There are no alignment requirements.
@@ -179,9 +183,7 @@ const uint8_t *MD5::body(ArrayRef<uint8_t> Data) {
return ptr;
}
-MD5::MD5()
- : a(0x67452301), b(0xefcdab89), c(0x98badcfe), d(0x10325476), hi(0), lo(0) {
-}
+MD5::MD5() = default;
/// Incrementally add the bytes in \p Data to the hash.
void MD5::update(ArrayRef<uint8_t> Data) {
@@ -259,10 +261,16 @@ void MD5::final(MD5Result &Result) {
support::endian::write32le(&Result[12], d);
}
-void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
+SmallString<32> MD5::MD5Result::digest() const {
+ SmallString<32> Str;
raw_svector_ostream Res(Str);
for (int i = 0; i < 16; ++i)
- Res << format("%.2x", Result[i]);
+ Res << format("%.2x", Bytes[i]);
+ return Str;
+}
+
+void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
+ Str = Result.digest();
}
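+
+// Sketch of the new API in use (illustrative, not part of this change):
+//   MD5 Hash;
+//   Hash.update("abc");
+//   MD5::MD5Result R;
+//   Hash.final(R);
+//   SmallString<32> Hex = R.digest(); // lowercase hex digest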
std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) {
@@ -271,8 +279,5 @@ std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) {
MD5::MD5Result Res;
Hash.final(Res);
- std::array<uint8_t, 16> Arr;
- memcpy(Arr.data(), Res, sizeof(Res));
- return Arr;
-}
+ return Res;
}
diff --git a/contrib/llvm/lib/Support/ManagedStatic.cpp b/contrib/llvm/lib/Support/ManagedStatic.cpp
index 7dd3131..fb7cd07 100644
--- a/contrib/llvm/lib/Support/ManagedStatic.cpp
+++ b/contrib/llvm/lib/Support/ManagedStatic.cpp
@@ -21,7 +21,7 @@ using namespace llvm;
static const ManagedStaticBase *StaticList = nullptr;
static sys::Mutex *ManagedStaticMutex = nullptr;
-LLVM_DEFINE_ONCE_FLAG(mutex_init_flag);
+static llvm::once_flag mutex_init_flag;
static void initializeMutex() {
ManagedStaticMutex = new sys::Mutex();
diff --git a/contrib/llvm/lib/Support/MemoryBuffer.cpp b/contrib/llvm/lib/Support/MemoryBuffer.cpp
index a3a18c9..85e782b 100644
--- a/contrib/llvm/lib/Support/MemoryBuffer.cpp
+++ b/contrib/llvm/lib/Support/MemoryBuffer.cpp
@@ -103,7 +103,7 @@ public:
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
- uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize);
+ uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile);
std::unique_ptr<MemoryBuffer>
MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName,
@@ -178,8 +178,8 @@ MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize,
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize,
- uint64_t Offset) {
- return getFileAux(FilePath, -1, MapSize, Offset, false, false);
+ uint64_t Offset, bool IsVolatile) {
+ return getFileAux(FilePath, -1, MapSize, Offset, false, IsVolatile);
}
@@ -240,11 +240,9 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) {
// Read into Buffer until we hit EOF.
do {
Buffer.reserve(Buffer.size() + ChunkSize);
- ReadBytes = read(FD, Buffer.end(), ChunkSize);
- if (ReadBytes == -1) {
- if (errno == EINTR) continue;
+ ReadBytes = sys::RetryAfterSignal(-1, read, FD, Buffer.end(), ChunkSize);
+ if (ReadBytes == -1)
return std::error_code(errno, std::generic_category());
- }
Buffer.set_size(Buffer.size() + ReadBytes);
} while (ReadBytes != 0);
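+  // (sys::RetryAfterSignal re-issues the call for as long as it fails with
+  // EINTR, replacing the hand-written continue-on-EINTR loop removed above.)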
@@ -254,19 +252,19 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) {
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,
- bool RequiresNullTerminator, bool IsVolatileSize) {
+ bool RequiresNullTerminator, bool IsVolatile) {
return getFileAux(Filename, FileSize, FileSize, 0,
- RequiresNullTerminator, IsVolatileSize);
+ RequiresNullTerminator, IsVolatile);
}
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
- bool IsVolatileSize);
+ bool IsVolatile);
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
- uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize) {
+ uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) {
int FD;
std::error_code EC = sys::fs::openFileForRead(Filename, FD);
if (EC)
@@ -274,7 +272,7 @@ getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset,
- RequiresNullTerminator, IsVolatileSize);
+ RequiresNullTerminator, IsVolatile);
close(FD);
return Ret;
}
@@ -285,11 +283,11 @@ static bool shouldUseMmap(int FD,
off_t Offset,
bool RequiresNullTerminator,
int PageSize,
- bool IsVolatileSize) {
+ bool IsVolatile) {
// mmap may leave the buffer without null terminator if the file size changed
// by the time the last page is mapped in, so avoid it if the file size is
// likely to change.
- if (IsVolatileSize)
+ if (IsVolatile)
return false;
// We don't use mmap for small files because this can severely fragment our
@@ -300,7 +298,6 @@ static bool shouldUseMmap(int FD,
if (!RequiresNullTerminator)
return true;
-
// If we don't know the file size, use fstat to find out. fstat on an open
// file descriptor is cheaper than stat on a random path.
// FIXME: this chunk of code is duplicated, but it avoids a fstat when
@@ -338,7 +335,7 @@ static bool shouldUseMmap(int FD,
static ErrorOr<std::unique_ptr<MemoryBuffer>>
getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
- bool IsVolatileSize) {
+ bool IsVolatile) {
static int PageSize = sys::Process::getPageSize();
// Default is to map the full file.
@@ -365,7 +362,7 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
}
if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,
- PageSize, IsVolatileSize)) {
+ PageSize, IsVolatile)) {
std::error_code EC;
std::unique_ptr<MemoryBuffer> Result(
new (NamedBufferAlloc(Filename))
@@ -392,13 +389,12 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
while (BytesLeft) {
#ifdef HAVE_PREAD
- ssize_t NumRead = ::pread(FD, BufPtr, BytesLeft, MapSize-BytesLeft+Offset);
+ ssize_t NumRead = sys::RetryAfterSignal(-1, ::pread, FD, BufPtr, BytesLeft,
+ MapSize - BytesLeft + Offset);
#else
- ssize_t NumRead = ::read(FD, BufPtr, BytesLeft);
+ ssize_t NumRead = sys::RetryAfterSignal(-1, ::read, FD, BufPtr, BytesLeft);
#endif
if (NumRead == -1) {
- if (errno == EINTR)
- continue;
// Error while reading.
return std::error_code(errno, std::generic_category());
}
@@ -415,17 +411,16 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
- bool RequiresNullTerminator, bool IsVolatileSize) {
+ bool RequiresNullTerminator, bool IsVolatile) {
return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
- RequiresNullTerminator, IsVolatileSize);
+ RequiresNullTerminator, IsVolatile);
}
ErrorOr<std::unique_ptr<MemoryBuffer>>
MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
- int64_t Offset) {
+ int64_t Offset, bool IsVolatile) {
assert(MapSize != uint64_t(-1));
- return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false,
- /*IsVolatileSize*/ false);
+ return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false, IsVolatile);
}
ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() {
diff --git a/contrib/llvm/lib/Support/Mutex.cpp b/contrib/llvm/lib/Support/Mutex.cpp
index c8d3844..b1d5e7c 100644
--- a/contrib/llvm/lib/Support/Mutex.cpp
+++ b/contrib/llvm/lib/Support/Mutex.cpp
@@ -11,8 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Config/config.h"
#include "llvm/Support/Mutex.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Config/config.h"
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only TRULY operating system
@@ -47,6 +48,10 @@ MutexImpl::MutexImpl( bool recursive)
// Declare the pthread_mutex data structures
pthread_mutex_t* mutex =
static_cast<pthread_mutex_t*>(malloc(sizeof(pthread_mutex_t)));
+
+ if (mutex == nullptr)
+ report_bad_alloc_error("Mutex allocation failed");
+
pthread_mutexattr_t attr;
// Initialize the mutex attributes
diff --git a/contrib/llvm/lib/Support/Parallel.cpp b/contrib/llvm/lib/Support/Parallel.cpp
new file mode 100644
index 0000000..ab2cfde
--- /dev/null
+++ b/contrib/llvm/lib/Support/Parallel.cpp
@@ -0,0 +1,138 @@
+//===- llvm/Support/Parallel.cpp - Parallel algorithms --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Parallel.h"
+#include "llvm/Config/llvm-config.h"
+
+#include <atomic>
+#include <stack>
+#include <thread>
+
+using namespace llvm;
+
+namespace {
+
+/// \brief An abstract class that takes closures and runs them asynchronously.
+class Executor {
+public:
+ virtual ~Executor() = default;
+ virtual void add(std::function<void()> func) = 0;
+
+ static Executor *getDefaultExecutor();
+};
+
+#if !LLVM_ENABLE_THREADS
+class SyncExecutor : public Executor {
+public:
+ virtual void add(std::function<void()> F) { F(); }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static SyncExecutor Exec;
+ return &Exec;
+}
+
+#elif defined(_MSC_VER)
+/// \brief An Executor that runs tasks via ConcRT.
+class ConcRTExecutor : public Executor {
+ struct Taskish {
+ Taskish(std::function<void()> Task) : Task(Task) {}
+
+ std::function<void()> Task;
+
+ static void run(void *P) {
+ Taskish *Self = static_cast<Taskish *>(P);
+ Self->Task();
+ concurrency::Free(Self);
+ }
+ };
+
+public:
+ virtual void add(std::function<void()> F) {
+ Concurrency::CurrentScheduler::ScheduleTask(
+ Taskish::run, new (concurrency::Alloc(sizeof(Taskish))) Taskish(F));
+ }
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ConcRTExecutor exec;
+ return &exec;
+}
+
+#else
+/// \brief An implementation of an Executor that runs closures on a thread pool
+/// in FILO order.
+class ThreadPoolExecutor : public Executor {
+public:
+ explicit ThreadPoolExecutor(
+ unsigned ThreadCount = std::thread::hardware_concurrency())
+ : Done(ThreadCount) {
+    // Spawn all but one of the threads from a helper thread, since spawning
+    // threads can take a while.
+ std::thread([&, ThreadCount] {
+ for (size_t i = 1; i < ThreadCount; ++i) {
+ std::thread([=] { work(); }).detach();
+ }
+ work();
+ }).detach();
+ }
+
+ ~ThreadPoolExecutor() override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Stop = true;
+ Lock.unlock();
+ Cond.notify_all();
+ // Wait for ~Latch.
+ }
+
+ void add(std::function<void()> F) override {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ WorkStack.push(F);
+ Lock.unlock();
+ Cond.notify_one();
+ }
+
+private:
+ void work() {
+ while (true) {
+ std::unique_lock<std::mutex> Lock(Mutex);
+ Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
+ if (Stop)
+ break;
+ auto Task = WorkStack.top();
+ WorkStack.pop();
+ Lock.unlock();
+ Task();
+ }
+ Done.dec();
+ }
+
+ std::atomic<bool> Stop{false};
+ std::stack<std::function<void()>> WorkStack;
+ std::mutex Mutex;
+ std::condition_variable Cond;
+ parallel::detail::Latch Done;
+};
+
+Executor *Executor::getDefaultExecutor() {
+ static ThreadPoolExecutor exec;
+ return &exec;
+}
+#endif
+}
+
+#if LLVM_ENABLE_THREADS
+void parallel::detail::TaskGroup::spawn(std::function<void()> F) {
+ L.inc();
+ Executor::getDefaultExecutor()->add([&, F] {
+ F();
+ L.dec();
+ });
+}
+#endif
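+
+// Sketch of intended usage (hypothetical, assuming the TaskGroup's latch
+// waits in its destructor):
+//   parallel::detail::TaskGroup TG;
+//   TG.spawn([] { /* some work */ });
+//   // Leaving scope blocks until every spawned closure has run.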
diff --git a/contrib/llvm/lib/Support/Path.cpp b/contrib/llvm/lib/Support/Path.cpp
index 4bb035e..ea59ba6 100644
--- a/contrib/llvm/lib/Support/Path.cpp
+++ b/contrib/llvm/lib/Support/Path.cpp
@@ -11,13 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/MachO.h"
+#include "llvm/Support/Path.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include <cctype>
#include <cstring>
@@ -34,16 +33,29 @@ using namespace llvm::support::endian;
namespace {
using llvm::StringRef;
using llvm::sys::path::is_separator;
+ using llvm::sys::path::Style;
+ inline Style real_style(Style style) {
#ifdef LLVM_ON_WIN32
- const char *separators = "\\/";
- const char preferred_separator = '\\';
+ return (style == Style::posix) ? Style::posix : Style::windows;
#else
- const char separators = '/';
- const char preferred_separator = '/';
+ return (style == Style::windows) ? Style::windows : Style::posix;
#endif
+ }
+
+ inline const char *separators(Style style) {
+ if (real_style(style) == Style::windows)
+ return "\\/";
+ return "/";
+ }
+
+ inline char preferred_separator(Style style) {
+ if (real_style(style) == Style::windows)
+ return '\\';
+ return '/';
+ }
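+
+  // For instance (illustrative): separators(Style::windows) yields "\\/", so
+  // both '\\' and '/' split components, while posix-style paths split only
+  // on '/'.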
- StringRef find_first_component(StringRef path) {
+ StringRef find_first_component(StringRef path, Style style) {
// Look for this first component in the following order.
// * empty (in this case we return an empty string)
// * either C: or {//,\\}net.
@@ -53,96 +65,85 @@ namespace {
if (path.empty())
return path;
-#ifdef LLVM_ON_WIN32
- // C:
- if (path.size() >= 2 && std::isalpha(static_cast<unsigned char>(path[0])) &&
- path[1] == ':')
- return path.substr(0, 2);
-#endif
+ if (real_style(style) == Style::windows) {
+ // C:
+ if (path.size() >= 2 &&
+ std::isalpha(static_cast<unsigned char>(path[0])) && path[1] == ':')
+ return path.substr(0, 2);
+ }
// //net
- if ((path.size() > 2) &&
- is_separator(path[0]) &&
- path[0] == path[1] &&
- !is_separator(path[2])) {
+ if ((path.size() > 2) && is_separator(path[0], style) &&
+ path[0] == path[1] && !is_separator(path[2], style)) {
// Find the next directory separator.
- size_t end = path.find_first_of(separators, 2);
+ size_t end = path.find_first_of(separators(style), 2);
return path.substr(0, end);
}
// {/,\}
- if (is_separator(path[0]))
+ if (is_separator(path[0], style))
return path.substr(0, 1);
// * {file,directory}name
- size_t end = path.find_first_of(separators);
+ size_t end = path.find_first_of(separators(style));
return path.substr(0, end);
}
- size_t filename_pos(StringRef str) {
- if (str.size() == 2 &&
- is_separator(str[0]) &&
- str[0] == str[1])
+ size_t filename_pos(StringRef str, Style style) {
+ if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
return 0;
- if (str.size() > 0 && is_separator(str[str.size() - 1]))
+ if (str.size() > 0 && is_separator(str[str.size() - 1], style))
return str.size() - 1;
- size_t pos = str.find_last_of(separators, str.size() - 1);
+ size_t pos = str.find_last_of(separators(style), str.size() - 1);
-#ifdef LLVM_ON_WIN32
- if (pos == StringRef::npos)
- pos = str.find_last_of(':', str.size() - 2);
-#endif
+ if (real_style(style) == Style::windows) {
+ if (pos == StringRef::npos)
+ pos = str.find_last_of(':', str.size() - 2);
+ }
- if (pos == StringRef::npos ||
- (pos == 1 && is_separator(str[0])))
+ if (pos == StringRef::npos || (pos == 1 && is_separator(str[0], style)))
return 0;
return pos + 1;
}
- size_t root_dir_start(StringRef str) {
+ size_t root_dir_start(StringRef str, Style style) {
// case "c:/"
-#ifdef LLVM_ON_WIN32
- if (str.size() > 2 &&
- str[1] == ':' &&
- is_separator(str[2]))
- return 2;
-#endif
+ if (real_style(style) == Style::windows) {
+ if (str.size() > 2 && str[1] == ':' && is_separator(str[2], style))
+ return 2;
+ }
// case "//"
- if (str.size() == 2 &&
- is_separator(str[0]) &&
- str[0] == str[1])
+ if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
return StringRef::npos;
// case "//net"
- if (str.size() > 3 &&
- is_separator(str[0]) &&
- str[0] == str[1] &&
- !is_separator(str[2])) {
- return str.find_first_of(separators, 2);
+ if (str.size() > 3 && is_separator(str[0], style) && str[0] == str[1] &&
+ !is_separator(str[2], style)) {
+ return str.find_first_of(separators(style), 2);
}
// case "/"
- if (str.size() > 0 && is_separator(str[0]))
+ if (str.size() > 0 && is_separator(str[0], style))
return 0;
return StringRef::npos;
}
- size_t parent_path_end(StringRef path) {
- size_t end_pos = filename_pos(path);
+ size_t parent_path_end(StringRef path, Style style) {
+ size_t end_pos = filename_pos(path, style);
- bool filename_was_sep = path.size() > 0 && is_separator(path[end_pos]);
+ bool filename_was_sep =
+ path.size() > 0 && is_separator(path[end_pos], style);
// Skip separators except for root dir.
- size_t root_dir_pos = root_dir_start(path.substr(0, end_pos));
+ size_t root_dir_pos = root_dir_start(path.substr(0, end_pos), style);
- while(end_pos > 0 &&
- (end_pos - 1) != root_dir_pos &&
- is_separator(path[end_pos - 1]))
+ while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
+ is_separator(path[end_pos - 1], style))
--end_pos;
if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
@@ -230,11 +231,12 @@ namespace llvm {
namespace sys {
namespace path {
-const_iterator begin(StringRef path) {
+const_iterator begin(StringRef path, Style style) {
const_iterator i;
i.Path = path;
- i.Component = find_first_component(path);
+ i.Component = find_first_component(path, style);
i.Position = 0;
+ i.S = style;
return i;
}
@@ -259,27 +261,21 @@ const_iterator &const_iterator::operator++() {
// Both POSIX and Windows treat paths that begin with exactly two separators
// specially.
- bool was_net = Component.size() > 2 &&
- is_separator(Component[0]) &&
- Component[1] == Component[0] &&
- !is_separator(Component[2]);
+ bool was_net = Component.size() > 2 && is_separator(Component[0], S) &&
+ Component[1] == Component[0] && !is_separator(Component[2], S);
// Handle separators.
- if (is_separator(Path[Position])) {
+ if (is_separator(Path[Position], S)) {
// Root dir.
- if (was_net
-#ifdef LLVM_ON_WIN32
+ if (was_net ||
// c:/
- || Component.endswith(":")
-#endif
- ) {
+ (real_style(S) == Style::windows && Component.endswith(":"))) {
Component = Path.substr(Position, 1);
return *this;
}
// Skip extra separators.
- while (Position != Path.size() &&
- is_separator(Path[Position])) {
+ while (Position != Path.size() && is_separator(Path[Position], S)) {
++Position;
}
@@ -292,7 +288,7 @@ const_iterator &const_iterator::operator++() {
}
// Find next component.
- size_t end_pos = Path.find_first_of(separators, Position);
+ size_t end_pos = Path.find_first_of(separators(S), Position);
Component = Path.slice(Position, end_pos);
return *this;
@@ -306,10 +302,11 @@ ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
return Position - RHS.Position;
}
-reverse_iterator rbegin(StringRef Path) {
+reverse_iterator rbegin(StringRef Path, Style style) {
reverse_iterator I;
I.Path = Path;
I.Position = Path.size();
+ I.S = style;
return ++I;
}
@@ -324,10 +321,9 @@ reverse_iterator rend(StringRef Path) {
reverse_iterator &reverse_iterator::operator++() {
// If we're at the end and the previous char was a '/', return '.' unless
// we are the root path.
- size_t root_dir_pos = root_dir_start(Path);
- if (Position == Path.size() &&
- Path.size() > root_dir_pos + 1 &&
- is_separator(Path[Position - 1])) {
+ size_t root_dir_pos = root_dir_start(Path, S);
+ if (Position == Path.size() && Path.size() > root_dir_pos + 1 &&
+ is_separator(Path[Position - 1], S)) {
--Position;
Component = ".";
return *this;
@@ -336,13 +332,12 @@ reverse_iterator &reverse_iterator::operator++() {
// Skip separators unless it's the root directory.
size_t end_pos = Position;
- while(end_pos > 0 &&
- (end_pos - 1) != root_dir_pos &&
- is_separator(Path[end_pos - 1]))
+ while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
+ is_separator(Path[end_pos - 1], S))
--end_pos;
// Find next separator.
- size_t start_pos = filename_pos(Path.substr(0, end_pos));
+ size_t start_pos = filename_pos(Path.substr(0, end_pos), S);
Component = Path.slice(start_pos, end_pos);
Position = start_pos;
return *this;
@@ -357,21 +352,15 @@ ptrdiff_t reverse_iterator::operator-(const reverse_iterator &RHS) const {
return Position - RHS.Position;
}
-StringRef root_path(StringRef path) {
- const_iterator b = begin(path),
- pos = b,
- e = end(path);
+StringRef root_path(StringRef path, Style style) {
+ const_iterator b = begin(path, style), pos = b, e = end(path);
if (b != e) {
- bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
- bool has_drive =
-#ifdef LLVM_ON_WIN32
- b->endswith(":");
-#else
- false;
-#endif
+ bool has_net =
+ b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+ bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
if (has_net || has_drive) {
- if ((++pos != e) && is_separator((*pos)[0])) {
+ if ((++pos != e) && is_separator((*pos)[0], style)) {
// {C:/,//net/}, so get the first two components.
return path.substr(0, b->size() + pos->size());
} else {
@@ -381,7 +370,7 @@ StringRef root_path(StringRef path) {
}
// POSIX style root directory.
- if (is_separator((*b)[0])) {
+ if (is_separator((*b)[0], style)) {
return *b;
}
}
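
A hedged example of the now style-sensitive root queries; the same bytes parse differently depending on the style requested:

```cpp
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  StringRef P = "C:\\dir\\file";
  // The Windows rules see a drive plus a root directory; the POSIX rules see
  // one relative component, so the root is empty.
  outs() << sys::path::root_path(P, sys::path::Style::windows) << "\n"; // "C:\"
  outs() << sys::path::root_path(P, sys::path::Style::posix) << "\n";   // ""
  return 0;
}
```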
@@ -389,17 +378,12 @@ StringRef root_path(StringRef path) {
return StringRef();
}
-StringRef root_name(StringRef path) {
- const_iterator b = begin(path),
- e = end(path);
+StringRef root_name(StringRef path, Style style) {
+ const_iterator b = begin(path, style), e = end(path);
if (b != e) {
- bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
- bool has_drive =
-#ifdef LLVM_ON_WIN32
- b->endswith(":");
-#else
- false;
-#endif
+ bool has_net =
+ b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+ bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
if (has_net || has_drive) {
// just {C:,//net}, return the first component.
@@ -411,27 +395,21 @@ StringRef root_name(StringRef path) {
return StringRef();
}
-StringRef root_directory(StringRef path) {
- const_iterator b = begin(path),
- pos = b,
- e = end(path);
+StringRef root_directory(StringRef path, Style style) {
+ const_iterator b = begin(path, style), pos = b, e = end(path);
if (b != e) {
- bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
- bool has_drive =
-#ifdef LLVM_ON_WIN32
- b->endswith(":");
-#else
- false;
-#endif
+ bool has_net =
+ b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+ bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
if ((has_net || has_drive) &&
// {C:,//net}, skip to the next component.
- (++pos != e) && is_separator((*pos)[0])) {
+ (++pos != e) && is_separator((*pos)[0], style)) {
return *pos;
}
// POSIX style root directory.
- if (!has_net && is_separator((*b)[0])) {
+ if (!has_net && is_separator((*b)[0], style)) {
return *b;
}
}
@@ -440,15 +418,13 @@ StringRef root_directory(StringRef path) {
return StringRef();
}
-StringRef relative_path(StringRef path) {
- StringRef root = root_path(path);
+StringRef relative_path(StringRef path, Style style) {
+ StringRef root = root_path(path, style);
return path.substr(root.size());
}
-void append(SmallVectorImpl<char> &path, const Twine &a,
- const Twine &b,
- const Twine &c,
- const Twine &d) {
+void append(SmallVectorImpl<char> &path, Style style, const Twine &a,
+ const Twine &b, const Twine &c, const Twine &d) {
SmallString<32> a_storage;
SmallString<32> b_storage;
SmallString<32> c_storage;
@@ -461,13 +437,15 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage));
for (auto &component : components) {
- bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]);
- bool component_has_sep = !component.empty() && is_separator(component[0]);
- bool is_root_name = has_root_name(component);
+ bool path_has_sep =
+ !path.empty() && is_separator(path[path.size() - 1], style);
+ bool component_has_sep =
+ !component.empty() && is_separator(component[0], style);
+ bool is_root_name = has_root_name(component, style);
if (path_has_sep) {
// Strip separators from beginning of component.
- size_t loc = component.find_first_not_of(separators);
+ size_t loc = component.find_first_not_of(separators(style));
StringRef c = component.substr(loc);
// Append it.
@@ -477,41 +455,47 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
if (!component_has_sep && !(path.empty() || is_root_name)) {
// Add a separator.
- path.push_back(preferred_separator);
+ path.push_back(preferred_separator(style));
}
path.append(component.begin(), component.end());
}
}
-void append(SmallVectorImpl<char> &path,
- const_iterator begin, const_iterator end) {
+void append(SmallVectorImpl<char> &path, const Twine &a, const Twine &b,
+ const Twine &c, const Twine &d) {
+ append(path, Style::native, a, b, c, d);
+}
+
+void append(SmallVectorImpl<char> &path, const_iterator begin,
+ const_iterator end, Style style) {
for (; begin != end; ++begin)
- path::append(path, *begin);
+ path::append(path, style, *begin);
}
-StringRef parent_path(StringRef path) {
- size_t end_pos = parent_path_end(path);
+StringRef parent_path(StringRef path, Style style) {
+ size_t end_pos = parent_path_end(path, style);
if (end_pos == StringRef::npos)
return StringRef();
else
return path.substr(0, end_pos);
}
-void remove_filename(SmallVectorImpl<char> &path) {
- size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()));
+void remove_filename(SmallVectorImpl<char> &path, Style style) {
+ size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()), style);
if (end_pos != StringRef::npos)
path.set_size(end_pos);
}
-void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension,
+ Style style) {
StringRef p(path.begin(), path.size());
SmallString<32> ext_storage;
StringRef ext = extension.toStringRef(ext_storage);
// Erase existing extension.
size_t pos = p.find_last_of('.');
- if (pos != StringRef::npos && pos >= filename_pos(p))
+ if (pos != StringRef::npos && pos >= filename_pos(p, style))
path.set_size(pos);
// Append '.' if needed.
@@ -523,8 +507,8 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
}
void replace_path_prefix(SmallVectorImpl<char> &Path,
- const StringRef &OldPrefix,
- const StringRef &NewPrefix) {
+ const StringRef &OldPrefix, const StringRef &NewPrefix,
+ Style style) {
if (OldPrefix.empty() && NewPrefix.empty())
return;
@@ -540,53 +524,58 @@ void replace_path_prefix(SmallVectorImpl<char> &Path,
StringRef RelPath = OrigPath.substr(OldPrefix.size());
SmallString<256> NewPath;
- path::append(NewPath, NewPrefix);
- path::append(NewPath, RelPath);
+ path::append(NewPath, style, NewPrefix);
+ path::append(NewPath, style, RelPath);
Path.swap(NewPath);
}
-void native(const Twine &path, SmallVectorImpl<char> &result) {
+void native(const Twine &path, SmallVectorImpl<char> &result, Style style) {
assert((!path.isSingleStringRef() ||
path.getSingleStringRef().data() != result.data()) &&
"path and result are not allowed to overlap!");
// Clear result.
result.clear();
path.toVector(result);
- native(result);
+ native(result, style);
}
-void native(SmallVectorImpl<char> &Path) {
-#ifdef LLVM_ON_WIN32
- std::replace(Path.begin(), Path.end(), '/', '\\');
-#else
- for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
- if (*PI == '\\') {
- auto PN = PI + 1;
- if (PN < PE && *PN == '\\')
- ++PI; // increment once, the for loop will move over the escaped slash
- else
- *PI = '/';
+void native(SmallVectorImpl<char> &Path, Style style) {
+ if (Path.empty())
+ return;
+ if (real_style(style) == Style::windows) {
+ std::replace(Path.begin(), Path.end(), '/', '\\');
+ if (Path[0] == '~' && (Path.size() == 1 || is_separator(Path[1], style))) {
+ SmallString<128> PathHome;
+ home_directory(PathHome);
+ PathHome.append(Path.begin() + 1, Path.end());
+ Path = PathHome;
+ }
+ } else {
+ for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
+ if (*PI == '\\') {
+ auto PN = PI + 1;
+ if (PN < PE && *PN == '\\')
+ ++PI; // increment once, the for loop will move over the escaped slash
+ else
+ *PI = '/';
+ }
}
}
-#endif
}
-std::string convert_to_slash(StringRef path) {
-#ifdef LLVM_ON_WIN32
+std::string convert_to_slash(StringRef path, Style style) {
+ if (real_style(style) != Style::windows)
+ return path;
+
std::string s = path.str();
std::replace(s.begin(), s.end(), '\\', '/');
return s;
-#else
- return path;
-#endif
}
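
A sketch of the rewritten `native()` and `convert_to_slash()` under an explicit style. Note the new `~` expansion in the Windows branch additionally calls `home_directory()`, which depends on the environment and is not exercised here:

```cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  SmallString<64> P("a/b\\c");
  sys::path::native(P, sys::path::Style::windows);
  outs() << P << "\n"; // "a\b\c"

  SmallString<64> Q("a\\b");
  sys::path::native(Q, sys::path::Style::posix);
  outs() << Q << "\n"; // "a/b" (a lone backslash becomes '/')

  // convert_to_slash only rewrites when the style resolves to windows.
  outs() << sys::path::convert_to_slash("a\\b", sys::path::Style::windows)
         << "\n"; // "a/b"
  outs() << sys::path::convert_to_slash("a\\b", sys::path::Style::posix)
         << "\n"; // "a\b", unchanged
  return 0;
}
```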
-StringRef filename(StringRef path) {
- return *rbegin(path);
-}
+StringRef filename(StringRef path, Style style) { return *rbegin(path, style); }
-StringRef stem(StringRef path) {
- StringRef fname = filename(path);
+StringRef stem(StringRef path, Style style) {
+ StringRef fname = filename(path, style);
size_t pos = fname.find_last_of('.');
if (pos == StringRef::npos)
return fname;
@@ -598,8 +587,8 @@ StringRef stem(StringRef path) {
return fname.substr(0, pos);
}
-StringRef extension(StringRef path) {
- StringRef fname = filename(path);
+StringRef extension(StringRef path, Style style) {
+ StringRef fname = filename(path, style);
size_t pos = fname.find_last_of('.');
if (pos == StringRef::npos)
return StringRef();
@@ -611,110 +600,109 @@ StringRef extension(StringRef path) {
return fname.substr(pos);
}
-bool is_separator(char value) {
- switch(value) {
-#ifdef LLVM_ON_WIN32
- case '\\': // fall through
-#endif
- case '/': return true;
- default: return false;
- }
+bool is_separator(char value, Style style) {
+ if (value == '/')
+ return true;
+ if (real_style(style) == Style::windows)
+ return value == '\\';
+ return false;
}
-static const char preferred_separator_string[] = { preferred_separator, '\0' };
-
-StringRef get_separator() {
- return preferred_separator_string;
+StringRef get_separator(Style style) {
+ if (real_style(style) == Style::windows)
+ return "\\";
+ return "/";
}
-bool has_root_name(const Twine &path) {
+bool has_root_name(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !root_name(p).empty();
+ return !root_name(p, style).empty();
}
-bool has_root_directory(const Twine &path) {
+bool has_root_directory(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !root_directory(p).empty();
+ return !root_directory(p, style).empty();
}
-bool has_root_path(const Twine &path) {
+bool has_root_path(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !root_path(p).empty();
+ return !root_path(p, style).empty();
}
-bool has_relative_path(const Twine &path) {
+bool has_relative_path(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !relative_path(p).empty();
+ return !relative_path(p, style).empty();
}
-bool has_filename(const Twine &path) {
+bool has_filename(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !filename(p).empty();
+ return !filename(p, style).empty();
}
-bool has_parent_path(const Twine &path) {
+bool has_parent_path(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !parent_path(p).empty();
+ return !parent_path(p, style).empty();
}
-bool has_stem(const Twine &path) {
+bool has_stem(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !stem(p).empty();
+ return !stem(p, style).empty();
}
-bool has_extension(const Twine &path) {
+bool has_extension(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- return !extension(p).empty();
+ return !extension(p, style).empty();
}
-bool is_absolute(const Twine &path) {
+bool is_absolute(const Twine &path, Style style) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
- bool rootDir = has_root_directory(p),
-#ifdef LLVM_ON_WIN32
- rootName = has_root_name(p);
-#else
- rootName = true;
-#endif
+ bool rootDir = has_root_directory(p, style);
+ bool rootName =
+ (real_style(style) != Style::windows) || has_root_name(p, style);
return rootDir && rootName;
}
-bool is_relative(const Twine &path) { return !is_absolute(path); }
+bool is_relative(const Twine &path, Style style) {
+ return !is_absolute(path, style);
+}
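
An illustration of the reworked absoluteness rules: under the Windows style a path needs both a root name (drive or UNC share) and a root directory, while under POSIX the root directory alone suffices. A small sketch using only the functions changed above:

```cpp
#include "llvm/Support/Path.h"
#include <cassert>

int main() {
  using namespace llvm::sys::path;
  assert(is_absolute("/usr/lib", Style::posix));
  assert(is_absolute("C:\\Windows", Style::windows));
  // Drive-relative: has a root name but no root directory.
  assert(!is_absolute("C:file.txt", Style::windows));
  // Rooted but drive-less: has a root directory but no root name.
  assert(!is_absolute("\\temp", Style::windows));
  return 0;
}
```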
-StringRef remove_leading_dotslash(StringRef Path) {
+StringRef remove_leading_dotslash(StringRef Path, Style style) {
// Remove leading "./" (or ".//" or "././" etc.)
- while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1])) {
+ while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1], style)) {
Path = Path.substr(2);
- while (Path.size() > 0 && is_separator(Path[0]))
+ while (Path.size() > 0 && is_separator(Path[0], style))
Path = Path.substr(1);
}
return Path;
}
-static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) {
+static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot,
+ Style style) {
SmallVector<StringRef, 16> components;
// Skip the root path, then look for traversal in the components.
- StringRef rel = path::relative_path(path);
- for (StringRef C : llvm::make_range(path::begin(rel), path::end(rel))) {
+ StringRef rel = path::relative_path(path, style);
+ for (StringRef C :
+ llvm::make_range(path::begin(rel, style), path::end(rel))) {
if (C == ".")
continue;
// Leading ".." will remain in the path unless it's at the root.
@@ -723,22 +711,23 @@ static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) {
components.pop_back();
continue;
}
- if (path::is_absolute(path))
+ if (path::is_absolute(path, style))
continue;
}
components.push_back(C);
}
- SmallString<256> buffer = path::root_path(path);
+ SmallString<256> buffer = path::root_path(path, style);
for (StringRef C : components)
- path::append(buffer, C);
+ path::append(buffer, style, C);
return buffer;
}
-bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot) {
+bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot,
+ Style style) {
StringRef p(path.data(), path.size());
- SmallString<256> result = remove_dots(p, remove_dot_dot);
+ SmallString<256> result = remove_dots(p, remove_dot_dot, style);
if (result == path)
return false;
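
A usage sketch for the style-aware `remove_dots()`; with `remove_dot_dot` set, non-leading ".." components cancel the preceding component:

```cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::SmallString<64> P("/a/./b/../c");
  llvm::sys::path::remove_dots(P, /*remove_dot_dot=*/true,
                               llvm::sys::path::Style::posix);
  llvm::outs() << P << "\n"; // "/a/c"
  return 0;
}
```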
@@ -776,7 +765,7 @@ createTemporaryFile(const Twine &Model, int &ResultFD,
llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
SmallString<128> Storage;
StringRef P = Model.toNullTerminatedStringRef(Storage);
- assert(P.find_first_of(separators) == StringRef::npos &&
+ assert(P.find_first_of(separators(Style::native)) == StringRef::npos &&
"Model must be a simple filename.");
// Use P.begin() so that createUniqueEntity doesn't need to recreate Storage.
return createUniqueEntity(P.begin(), ResultFD, ResultPath,
@@ -818,12 +807,9 @@ static std::error_code make_absolute(const Twine &current_directory,
bool use_current_directory) {
StringRef p(path.data(), path.size());
- bool rootDirectory = path::has_root_directory(p),
-#ifdef LLVM_ON_WIN32
- rootName = path::has_root_name(p);
-#else
- rootName = true;
-#endif
+ bool rootDirectory = path::has_root_directory(p);
+ bool rootName =
+ (real_style(Style::native) != Style::windows) || path::has_root_name(p);
// Already absolute.
if (rootName && rootDirectory)
@@ -937,6 +923,36 @@ std::error_code copy_file(const Twine &From, const Twine &To) {
return std::error_code();
}
+ErrorOr<MD5::MD5Result> md5_contents(int FD) {
+ MD5 Hash;
+
+ constexpr size_t BufSize = 4096;
+ std::vector<uint8_t> Buf(BufSize);
+ int BytesRead = 0;
+ for (;;) {
+ BytesRead = read(FD, Buf.data(), BufSize);
+ if (BytesRead <= 0)
+ break;
+ Hash.update(makeArrayRef(Buf.data(), BytesRead));
+ }
+
+ if (BytesRead < 0)
+ return std::error_code(errno, std::generic_category());
+ MD5::MD5Result Result;
+ Hash.final(Result);
+ return Result;
+}
+
+ErrorOr<MD5::MD5Result> md5_contents(const Twine &Path) {
+ int FD;
+ if (auto EC = openFileForRead(Path, FD))
+ return EC;
+
+ auto Result = md5_contents(FD);
+ close(FD);
+ return Result;
+}
+
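A hedged usage sketch for the new `md5_contents` helpers; it assumes `MD5::MD5Result::digest()` from the same Support release, and the path used is purely illustrative:

```cpp
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  auto MD5OrErr = llvm::sys::fs::md5_contents("/etc/hosts");
  if (!MD5OrErr) {
    llvm::errs() << "error: " << MD5OrErr.getError().message() << "\n";
    return 1;
  }
  // digest() renders the 16-byte result as lowercase hex.
  llvm::outs() << MD5OrErr->digest() << "\n";
  return 0;
}
```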
bool exists(file_status status) {
return status_known(status) && status.type() != file_type::file_not_found;
}
@@ -945,6 +961,13 @@ bool status_known(file_status s) {
return s.type() != file_type::status_error;
}
+file_type get_file_type(const Twine &Path, bool Follow) {
+ file_status st;
+ if (status(Path, st, Follow))
+ return file_type::status_error;
+ return st.type();
+}
+
bool is_directory(file_status status) {
return status.type() == file_type::directory_file;
}
@@ -969,6 +992,18 @@ std::error_code is_regular_file(const Twine &path, bool &result) {
return std::error_code();
}
+bool is_symlink_file(file_status status) {
+ return status.type() == file_type::symlink_file;
+}
+
+std::error_code is_symlink_file(const Twine &path, bool &result) {
+ file_status st;
+ if (std::error_code ec = status(path, st, false))
+ return ec;
+ result = is_symlink_file(st);
+ return std::error_code();
+}
+
bool is_other(file_status status) {
return exists(status) &&
!is_regular_file(status) &&
@@ -990,179 +1025,16 @@ void directory_entry::replace_filename(const Twine &filename, file_status st) {
Status = st;
}
-template <size_t N>
-static bool startswith(StringRef Magic, const char (&S)[N]) {
- return Magic.startswith(StringRef(S, N - 1));
-}
-
-/// @brief Identify the magic in magic.
-file_magic identify_magic(StringRef Magic) {
- if (Magic.size() < 4)
- return file_magic::unknown;
- switch ((unsigned char)Magic[0]) {
- case 0x00: {
- // COFF bigobj, CL.exe's LTO object file, or short import library file
- if (startswith(Magic, "\0\0\xFF\xFF")) {
- size_t MinSize = offsetof(COFF::BigObjHeader, UUID) + sizeof(COFF::BigObjMagic);
- if (Magic.size() < MinSize)
- return file_magic::coff_import_library;
-
- const char *Start = Magic.data() + offsetof(COFF::BigObjHeader, UUID);
- if (memcmp(Start, COFF::BigObjMagic, sizeof(COFF::BigObjMagic)) == 0)
- return file_magic::coff_object;
- if (memcmp(Start, COFF::ClGlObjMagic, sizeof(COFF::BigObjMagic)) == 0)
- return file_magic::coff_cl_gl_object;
- return file_magic::coff_import_library;
- }
- // Windows resource file
- if (startswith(Magic, "\0\0\0\0\x20\0\0\0\xFF"))
- return file_magic::windows_resource;
- // 0x0000 = COFF unknown machine type
- if (Magic[1] == 0)
- return file_magic::coff_object;
- if (startswith(Magic, "\0asm"))
- return file_magic::wasm_object;
- break;
- }
- case 0xDE: // 0x0B17C0DE = BC wrapper
- if (startswith(Magic, "\xDE\xC0\x17\x0B"))
- return file_magic::bitcode;
- break;
- case 'B':
- if (startswith(Magic, "BC\xC0\xDE"))
- return file_magic::bitcode;
- break;
- case '!':
- if (startswith(Magic, "!<arch>\n") || startswith(Magic, "!<thin>\n"))
- return file_magic::archive;
- break;
-
- case '\177':
- if (startswith(Magic, "\177ELF") && Magic.size() >= 18) {
- bool Data2MSB = Magic[5] == 2;
- unsigned high = Data2MSB ? 16 : 17;
- unsigned low = Data2MSB ? 17 : 16;
- if (Magic[high] == 0) {
- switch (Magic[low]) {
- default: return file_magic::elf;
- case 1: return file_magic::elf_relocatable;
- case 2: return file_magic::elf_executable;
- case 3: return file_magic::elf_shared_object;
- case 4: return file_magic::elf_core;
- }
- }
- // It's still some type of ELF file.
- return file_magic::elf;
- }
- break;
-
- case 0xCA:
- if (startswith(Magic, "\xCA\xFE\xBA\xBE") ||
- startswith(Magic, "\xCA\xFE\xBA\xBF")) {
- // This is complicated by an overlap with Java class files.
- // See the Mach-O section in /usr/share/file/magic for details.
- if (Magic.size() >= 8 && Magic[7] < 43)
- return file_magic::macho_universal_binary;
- }
- break;
-
- // The two magic numbers for mach-o are:
- // 0xfeedface - 32-bit mach-o
- // 0xfeedfacf - 64-bit mach-o
- case 0xFE:
- case 0xCE:
- case 0xCF: {
- uint16_t type = 0;
- if (startswith(Magic, "\xFE\xED\xFA\xCE") ||
- startswith(Magic, "\xFE\xED\xFA\xCF")) {
- /* Native endian */
- size_t MinSize;
- if (Magic[3] == char(0xCE))
- MinSize = sizeof(MachO::mach_header);
- else
- MinSize = sizeof(MachO::mach_header_64);
- if (Magic.size() >= MinSize)
- type = Magic[12] << 24 | Magic[13] << 12 | Magic[14] << 8 | Magic[15];
- } else if (startswith(Magic, "\xCE\xFA\xED\xFE") ||
- startswith(Magic, "\xCF\xFA\xED\xFE")) {
- /* Reverse endian */
- size_t MinSize;
- if (Magic[0] == char(0xCE))
- MinSize = sizeof(MachO::mach_header);
- else
- MinSize = sizeof(MachO::mach_header_64);
- if (Magic.size() >= MinSize)
- type = Magic[15] << 24 | Magic[14] << 12 | Magic[13] << 8 | Magic[12];
- }
- switch (type) {
- default: break;
- case 1: return file_magic::macho_object;
- case 2: return file_magic::macho_executable;
- case 3: return file_magic::macho_fixed_virtual_memory_shared_lib;
- case 4: return file_magic::macho_core;
- case 5: return file_magic::macho_preload_executable;
- case 6: return file_magic::macho_dynamically_linked_shared_lib;
- case 7: return file_magic::macho_dynamic_linker;
- case 8: return file_magic::macho_bundle;
- case 9: return file_magic::macho_dynamically_linked_shared_lib_stub;
- case 10: return file_magic::macho_dsym_companion;
- case 11: return file_magic::macho_kext_bundle;
- }
- break;
- }
- case 0xF0: // PowerPC Windows
- case 0x83: // Alpha 32-bit
- case 0x84: // Alpha 64-bit
- case 0x66: // MIPS R4000 Windows
- case 0x50: // mc68K
- case 0x4c: // 80386 Windows
- case 0xc4: // ARMNT Windows
- if (Magic[1] == 0x01)
- return file_magic::coff_object;
-
- case 0x90: // PA-RISC Windows
- case 0x68: // mc68K Windows
- if (Magic[1] == 0x02)
- return file_magic::coff_object;
- break;
-
- case 'M': // Possible MS-DOS stub on Windows PE file
- if (startswith(Magic, "MZ")) {
- uint32_t off = read32le(Magic.data() + 0x3c);
- // PE/COFF file, either EXE or DLL.
- if (off < Magic.size() &&
- memcmp(Magic.data()+off, COFF::PEMagic, sizeof(COFF::PEMagic)) == 0)
- return file_magic::pecoff_executable;
- }
- break;
-
- case 0x64: // x86-64 Windows.
- if (Magic[1] == char(0x86))
- return file_magic::coff_object;
- break;
-
- default:
- break;
- }
- return file_magic::unknown;
+std::error_code directory_entry::status(file_status &result) const {
+ return fs::status(Path, result, FollowSymlinks);
}
-std::error_code identify_magic(const Twine &Path, file_magic &Result) {
- int FD;
- if (std::error_code EC = openFileForRead(Path, FD))
+ErrorOr<perms> getPermissions(const Twine &Path) {
+ file_status Status;
+ if (std::error_code EC = status(Path, Status))
return EC;
- char Buffer[32];
- int Length = read(FD, Buffer, sizeof(Buffer));
- if (close(FD) != 0 || Length < 0)
- return std::error_code(errno, std::generic_category());
-
- Result = identify_magic(StringRef(Buffer, Length));
- return std::error_code();
-}
-
-std::error_code directory_entry::status(file_status &result) const {
- return fs::status(Path, result);
+ return Status.permissions();
}
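
A minimal sketch for the new `getPermissions()` accessor; the path queried is illustrative only:

```cpp
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  auto PermsOrErr = llvm::sys::fs::getPermissions("/tmp");
  if (!PermsOrErr)
    return 1;
  if (*PermsOrErr & llvm::sys::fs::perms::owner_write)
    llvm::outs() << "/tmp is writable by its owner\n";
  return 0;
}
```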
} // end namespace fs
diff --git a/contrib/llvm/lib/Support/PrettyStackTrace.cpp b/contrib/llvm/lib/Support/PrettyStackTrace.cpp
index 5b079ff..a18e9cc 100644
--- a/contrib/llvm/lib/Support/PrettyStackTrace.cpp
+++ b/contrib/llvm/lib/Support/PrettyStackTrace.cpp
@@ -15,13 +15,14 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm-c/ErrorHandling.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Config/config.h" // Get autoconf configuration settings
+#include "llvm/Config/config.h" // Get autoconf configuration settings
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/Watchdog.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdarg>
+#include <cstdio>
#include <tuple>
#ifdef HAVE_CRASHREPORTERCLIENT_H
diff --git a/contrib/llvm/lib/Support/Process.cpp b/contrib/llvm/lib/Support/Process.cpp
index 290c30f..caec993 100644
--- a/contrib/llvm/lib/Support/Process.cpp
+++ b/contrib/llvm/lib/Support/Process.cpp
@@ -11,11 +11,11 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Process.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/config.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Support/RWMutex.cpp b/contrib/llvm/lib/Support/RWMutex.cpp
index 3b6309c..83c6d1d 100644
--- a/contrib/llvm/lib/Support/RWMutex.cpp
+++ b/contrib/llvm/lib/Support/RWMutex.cpp
@@ -11,9 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Config/config.h"
#include "llvm/Support/RWMutex.h"
-#include <cstring>
+#include "llvm/Config/config.h"
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only TRULY operating system
@@ -22,29 +21,31 @@
#if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
// Define all methods as no-ops if threading is explicitly disabled
-namespace llvm {
+
+using namespace llvm;
using namespace sys;
-RWMutexImpl::RWMutexImpl() { }
-RWMutexImpl::~RWMutexImpl() { }
+
+RWMutexImpl::RWMutexImpl() = default;
+RWMutexImpl::~RWMutexImpl() = default;
+
bool RWMutexImpl::reader_acquire() { return true; }
bool RWMutexImpl::reader_release() { return true; }
bool RWMutexImpl::writer_acquire() { return true; }
bool RWMutexImpl::writer_release() { return true; }
-}
+
#else
#if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_RWLOCK_INIT)
#include <cassert>
+#include <cstdlib>
#include <pthread.h>
-#include <stdlib.h>
-namespace llvm {
+using namespace llvm;
using namespace sys;
// Construct a RWMutex using pthread calls
RWMutexImpl::RWMutexImpl()
- : data_(nullptr)
{
// Declare the pthread_rwlock data structures
pthread_rwlock_t* rwlock =
@@ -113,8 +114,6 @@ RWMutexImpl::writer_release()
return errorcode == 0;
}
-}
-
#elif defined(LLVM_ON_UNIX)
#include "Unix/RWMutex.inc"
#elif defined( LLVM_ON_WIN32)
diff --git a/contrib/llvm/lib/Support/Regex.cpp b/contrib/llvm/lib/Support/Regex.cpp
index 68ba79e..b1087fd 100644
--- a/contrib/llvm/lib/Support/Regex.cpp
+++ b/contrib/llvm/lib/Support/Regex.cpp
@@ -48,7 +48,7 @@ Regex::~Regex() {
}
}
-bool Regex::isValid(std::string &Error) {
+bool Regex::isValid(std::string &Error) const {
if (!error)
return true;
diff --git a/contrib/llvm/lib/Support/SHA1.cpp b/contrib/llvm/lib/Support/SHA1.cpp
index 0eefd99..20f41c5 100644
--- a/contrib/llvm/lib/Support/SHA1.cpp
+++ b/contrib/llvm/lib/Support/SHA1.cpp
@@ -15,9 +15,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/Host.h"
#include "llvm/Support/SHA1.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Host.h"
using namespace llvm;
#include <stdint.h>
diff --git a/contrib/llvm/lib/Support/ScopedPrinter.cpp b/contrib/llvm/lib/Support/ScopedPrinter.cpp
index d8ee1ef..537ff62 100644
--- a/contrib/llvm/lib/Support/ScopedPrinter.cpp
+++ b/contrib/llvm/lib/Support/ScopedPrinter.cpp
@@ -21,7 +21,8 @@ const std::string to_hexString(uint64_t Value, bool UpperCase) {
}
void ScopedPrinter::printBinaryImpl(StringRef Label, StringRef Str,
- ArrayRef<uint8_t> Data, bool Block) {
+ ArrayRef<uint8_t> Data, bool Block,
+ uint32_t StartOffset) {
if (Data.size() > 16)
Block = true;
@@ -31,7 +32,8 @@ void ScopedPrinter::printBinaryImpl(StringRef Label, StringRef Str,
OS << ": " << Str;
OS << " (\n";
if (!Data.empty())
- OS << format_bytes_with_ascii(Data, 0, 16, 4, (IndentLevel + 1) * 2, true)
+ OS << format_bytes_with_ascii(Data, StartOffset, 16, 4,
+ (IndentLevel + 1) * 2, true)
<< "\n";
startLine() << ")\n";
} else {
diff --git a/contrib/llvm/lib/Support/SearchForAddressOfSpecialSymbol.cpp b/contrib/llvm/lib/Support/SearchForAddressOfSpecialSymbol.cpp
deleted file mode 100644
index 55f3320..0000000
--- a/contrib/llvm/lib/Support/SearchForAddressOfSpecialSymbol.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===- SearchForAddressOfSpecialSymbol.cpp - Function addresses -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file pulls the addresses of certain symbols out of the linker. It must
-// include as few header files as possible because it declares the symbols as
-// void*, which would conflict with the actual symbol type if any header
-// declared it.
-//
-//===----------------------------------------------------------------------===//
-
-#include <string.h>
-
-// Must declare the symbols in the global namespace.
-static void *DoSearch(const char* symbolName) {
-#define EXPLICIT_SYMBOL(SYM) \
- extern void *SYM; if (!strcmp(symbolName, #SYM)) return &SYM
-
- // If this is darwin, it has some funky issues, try to solve them here. Some
- // important symbols are marked 'private external' which doesn't allow
- // SearchForAddressOfSymbol to find them. As such, we special case them here,
- // there is only a small handful of them.
-
-#ifdef __APPLE__
- {
- // __eprintf is sometimes used for assert() handling on x86.
- //
- // FIXME: Currently disabled when using Clang, as we don't always have our
- // runtime support libraries available.
-#ifndef __clang__
-#ifdef __i386__
- EXPLICIT_SYMBOL(__eprintf);
-#endif
-#endif
- }
-#endif
-
-#ifdef __CYGWIN__
- {
- EXPLICIT_SYMBOL(_alloca);
- EXPLICIT_SYMBOL(__main);
- }
-#endif
-
-#undef EXPLICIT_SYMBOL
- return nullptr;
-}
-
-namespace llvm {
-void *SearchForAddressOfSpecialSymbol(const char* symbolName) {
- return DoSearch(symbolName);
-}
-} // namespace llvm
diff --git a/contrib/llvm/lib/Support/Signals.cpp b/contrib/llvm/lib/Support/Signals.cpp
index e5e38f5..256a22d 100644
--- a/contrib/llvm/lib/Support/Signals.cpp
+++ b/contrib/llvm/lib/Support/Signals.cpp
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Signals.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
@@ -23,19 +24,23 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/Program.h"
-#include "llvm/Support/Signals.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Options.h"
#include <vector>
-namespace llvm {
-using namespace sys;
-
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only TRULY operating system
//=== independent code.
//===----------------------------------------------------------------------===//
+using namespace llvm;
+
+static cl::opt<bool>
+ DisableSymbolication("disable-symbolication",
+ cl::desc("Disable symbolizing crash backtraces."),
+ cl::init(false), cl::Hidden);
+
static ManagedStatic<std::vector<std::pair<void (*)(void *), void *>>>
CallBacksToRun;
void sys::RunSignalHandlers() {
@@ -45,9 +50,6 @@ void sys::RunSignalHandlers() {
I.first(I.second);
CallBacksToRun->clear();
}
-}
-
-using namespace llvm;
static bool findModulesAndOffsets(void **StackTrace, int Depth,
const char **Modules, intptr_t *Offsets,
@@ -71,6 +73,9 @@ static bool printSymbolizedStackTrace(StringRef Argv0,
static bool printSymbolizedStackTrace(StringRef Argv0,
void **StackTrace, int Depth,
llvm::raw_ostream &OS) {
+ if (DisableSymbolication)
+ return false;
+
// Don't recursively invoke the llvm-symbolizer binary.
if (Argv0.find("llvm-symbolizer") != std::string::npos)
return false;
diff --git a/contrib/llvm/lib/Support/SourceMgr.cpp b/contrib/llvm/lib/Support/SourceMgr.cpp
index 4cb9b2f..b0609d4 100644
--- a/contrib/llvm/lib/Support/SourceMgr.cpp
+++ b/contrib/llvm/lib/Support/SourceMgr.cpp
@@ -14,33 +14,44 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/Locale.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <utility>
+
using namespace llvm;
static const size_t TabStop = 8;
namespace {
+
struct LineNoCacheTy {
const char *LastQuery;
unsigned LastQueryBufferID;
unsigned LineNoOfQuery;
};
-}
+
+} // end anonymous namespace
static LineNoCacheTy *getCache(void *Ptr) {
return (LineNoCacheTy*)Ptr;
}
-
SourceMgr::~SourceMgr() {
- // Delete the line # cache if allocated.
- if (LineNoCacheTy *Cache = getCache(LineNoCache))
- delete Cache;
+ delete getCache(LineNoCache);
}
unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
@@ -132,12 +143,10 @@ void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
<< ":" << FindLineNumber(IncludeLoc, CurBuf) << ":\n";
}
-
SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
const Twine &Msg,
ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts) const {
-
// First thing to do: find the current buffer containing the specified
// location to pull out the source line.
SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges;
@@ -223,7 +232,7 @@ void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
const Twine &Msg, ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
- PrintMessage(llvm::errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
+ PrintMessage(errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
}
//===----------------------------------------------------------------------===//
@@ -233,7 +242,7 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
int Line, int Col, SourceMgr::DiagKind Kind,
StringRef Msg, StringRef LineStr,
- ArrayRef<std::pair<unsigned,unsigned> > Ranges,
+ ArrayRef<std::pair<unsigned,unsigned>> Ranges,
ArrayRef<SMFixIt> Hints)
: SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind),
Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()),
@@ -286,7 +295,7 @@ static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
// FIXME: This assertion is intended to catch unintended use of multibyte
// characters in fixits. If we decide to do this, we'll have to track
// separate byte widths for the source and fixit lines.
- assert((size_t)llvm::sys::locale::columnWidth(I->getText()) ==
+ assert((size_t)sys::locale::columnWidth(I->getText()) ==
I->getText().size());
// This relies on one byte per column in our fixit hints.
diff --git a/contrib/llvm/lib/Support/SpecialCaseList.cpp b/contrib/llvm/lib/Support/SpecialCaseList.cpp
index df524b3..05886ea 100644
--- a/contrib/llvm/lib/Support/SpecialCaseList.cpp
+++ b/contrib/llvm/lib/Support/SpecialCaseList.cpp
@@ -15,12 +15,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/SpecialCaseList.h"
-#include "llvm/Support/TrigramIndex.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Regex.h"
+#include "llvm/Support/TrigramIndex.h"
#include <string>
#include <system_error>
#include <utility>
diff --git a/contrib/llvm/lib/Support/Statistic.cpp b/contrib/llvm/lib/Support/Statistic.cpp
index 0c50dfd..72ca228 100644
--- a/contrib/llvm/lib/Support/Statistic.cpp
+++ b/contrib/llvm/lib/Support/Statistic.cpp
@@ -30,8 +30,8 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/Timer.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstring>
using namespace llvm;
diff --git a/contrib/llvm/lib/Support/StringExtras.cpp b/contrib/llvm/lib/Support/StringExtras.cpp
index 3e2420f..b2f42df 100644
--- a/contrib/llvm/lib/Support/StringExtras.cpp
+++ b/contrib/llvm/lib/Support/StringExtras.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallVector.h"
using namespace llvm;
/// StrInStrNoCase - Portable version of strcasestr. Locates the first
diff --git a/contrib/llvm/lib/Support/StringRef.cpp b/contrib/llvm/lib/Support/StringRef.cpp
index d81250e..9b7cc1c 100644
--- a/contrib/llvm/lib/Support/StringRef.cpp
+++ b/contrib/llvm/lib/Support/StringRef.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/edit_distance.h"
@@ -595,6 +596,18 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
return false;
}
+bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
+ APFloat F(0.0);
+ APFloat::opStatus Status =
+ F.convertFromString(*this, APFloat::rmNearestTiesToEven);
+ if (Status != APFloat::opOK) {
+ if (!AllowInexact || Status != APFloat::opInexact)
+ return true;
+ }
+
+ Result = F.convertToDouble();
+ return false;
+}
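A sketch of the new `StringRef::getAsDouble`; like `getAsInteger` it returns true on failure, and `AllowInexact` controls whether an inexact conversion counts as one:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  double D;
  if (!llvm::StringRef("2.5").getAsDouble(D))
    llvm::outs() << D << "\n"; // 2.5, exactly representable
  // 0.1 is not exactly representable in binary, so this conversion fails
  // when inexact results are disallowed.
  bool Failed = llvm::StringRef("0.1").getAsDouble(D, /*AllowInexact=*/false);
  llvm::outs() << (Failed ? "inexact\n" : "exact\n");
  return 0;
}
```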
// Implementation of StringRef hashing.
hash_code llvm::hash_value(StringRef S) {
diff --git a/contrib/llvm/lib/Support/TargetParser.cpp b/contrib/llvm/lib/Support/TargetParser.cpp
index 42fab67..e8ef1d2 100644
--- a/contrib/llvm/lib/Support/TargetParser.cpp
+++ b/contrib/llvm/lib/Support/TargetParser.cpp
@@ -210,7 +210,7 @@ bool llvm::ARM::getHWDivFeatures(unsigned HWDivKind,
else
Features.push_back("-hwdiv-arm");
- if (HWDivKind & ARM::AEK_HWDIV)
+ if (HWDivKind & ARM::AEK_HWDIVTHUMB)
Features.push_back("+hwdiv");
else
Features.push_back("-hwdiv");
@@ -422,8 +422,10 @@ unsigned llvm::AArch64::getDefaultExtensions(StringRef CPU, unsigned ArchKind) {
return AArch64ARCHNames[ArchKind].ArchBaseExtensions;
return StringSwitch<unsigned>(CPU)
-#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
- .Case(NAME, DEFAULT_EXT)
+#define AARCH64_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
+ .Case(NAME, \
+ AArch64ARCHNames[(unsigned)AArch64::ArchKind::ID].ArchBaseExtensions | \
+ DEFAULT_EXT)
#include "llvm/Support/AArch64TargetParser.def"
.Default(AArch64::AEK_INVALID);
}
@@ -448,6 +450,10 @@ bool llvm::AArch64::getExtensionFeatures(unsigned Extensions,
Features.push_back("+spe");
if (Extensions & AArch64::AEK_RAS)
Features.push_back("+ras");
+ if (Extensions & AArch64::AEK_LSE)
+ Features.push_back("+lse");
+ if (Extensions & AArch64::AEK_SVE)
+ Features.push_back("+sve");
return true;
}
@@ -725,6 +731,7 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) {
case ARM::AK_ARMV8R:
return ARM::PK_R;
case ARM::AK_ARMV7A:
+ case ARM::AK_ARMV7VE:
case ARM::AK_ARMV7K:
case ARM::AK_ARMV8A:
case ARM::AK_ARMV8_1A:
@@ -761,6 +768,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
case ARM::AK_ARMV6M:
return 6;
case ARM::AK_ARMV7A:
+ case ARM::AK_ARMV7VE:
case ARM::AK_ARMV7R:
case ARM::AK_ARMV7M:
case ARM::AK_ARMV7S:
@@ -778,6 +786,42 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
return 0;
}
+StringRef llvm::ARM::computeDefaultTargetABI(const Triple &TT, StringRef CPU) {
+ StringRef ArchName =
+ CPU.empty() ? TT.getArchName() : ARM::getArchName(ARM::parseCPUArch(CPU));
+
+ if (TT.isOSBinFormatMachO()) {
+ if (TT.getEnvironment() == Triple::EABI ||
+ TT.getOS() == Triple::UnknownOS ||
+ llvm::ARM::parseArchProfile(ArchName) == ARM::PK_M)
+ return "aapcs";
+ if (TT.isWatchABI())
+ return "aapcs16";
+ return "apcs-gnu";
+ } else if (TT.isOSWindows())
+ // FIXME: this is invalid for WindowsCE.
+ return "aapcs";
+
+ // Select the default based on the platform.
+ switch (TT.getEnvironment()) {
+ case Triple::Android:
+ case Triple::GNUEABI:
+ case Triple::GNUEABIHF:
+ case Triple::MuslEABI:
+ case Triple::MuslEABIHF:
+ return "aapcs-linux";
+ case Triple::EABIHF:
+ case Triple::EABI:
+ return "aapcs";
+ default:
+ if (TT.isOSNetBSD())
+ return "apcs-gnu";
+ if (TT.isOSOpenBSD())
+ return "aapcs-linux";
+ return "aapcs";
+ }
+}
+
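A hedged example of the new `ARM::computeDefaultTargetABI` helper; the triple is illustrative, and an empty CPU falls back to the arch name from the triple:

```cpp
#include "llvm/ADT/Triple.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::Triple T("armv7a-unknown-linux-gnueabihf");
  // GNUEABIHF selects the "aapcs-linux" default per the switch above.
  llvm::outs() << llvm::ARM::computeDefaultTargetABI(T, /*CPU=*/"") << "\n";
  return 0;
}
```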
StringRef llvm::AArch64::getCanonicalArchName(StringRef Arch) {
return ARM::getCanonicalArchName(Arch);
}
diff --git a/contrib/llvm/lib/Support/ThreadLocal.cpp b/contrib/llvm/lib/Support/ThreadLocal.cpp
index 9da1603..9a75c02 100644
--- a/contrib/llvm/lib/Support/ThreadLocal.cpp
+++ b/contrib/llvm/lib/Support/ThreadLocal.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/ThreadLocal.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ThreadLocal.h"
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only TRULY operating system
diff --git a/contrib/llvm/lib/Support/ThreadPool.cpp b/contrib/llvm/lib/Support/ThreadPool.cpp
index db03a4d..22b7550 100644
--- a/contrib/llvm/lib/Support/ThreadPool.cpp
+++ b/contrib/llvm/lib/Support/ThreadPool.cpp
@@ -53,11 +53,7 @@ ThreadPool::ThreadPool(unsigned ThreadCount)
Tasks.pop();
}
// Run the task we just grabbed
-#ifndef _MSC_VER
Task();
-#else
- Task(/* unused */ false);
-#endif
{
// Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait()
@@ -82,7 +78,7 @@ void ThreadPool::wait() {
[&] { return !ActiveThreads && Tasks.empty(); });
}
-std::shared_future<ThreadPool::VoidTy> ThreadPool::asyncImpl(TaskTy Task) {
+std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) {
/// Wrap the Task in a packaged_task to return a future object.
PackagedTaskTy PackagedTask(std::move(Task));
auto Future = PackagedTask.get_future();
@@ -128,25 +124,16 @@ void ThreadPool::wait() {
while (!Tasks.empty()) {
auto Task = std::move(Tasks.front());
Tasks.pop();
-#ifndef _MSC_VER
- Task();
-#else
- Task(/* unused */ false);
-#endif
+ Task();
}
}
-std::shared_future<ThreadPool::VoidTy> ThreadPool::asyncImpl(TaskTy Task) {
-#ifndef _MSC_VER
+std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) {
// Get a Future with launch::deferred execution using std::async
auto Future = std::async(std::launch::deferred, std::move(Task)).share();
// Wrap the future so that both ThreadPool::wait() can operate and the
// returned future can be sync'ed on.
PackagedTaskTy PackagedTask([Future]() { Future.get(); });
-#else
- auto Future = std::async(std::launch::deferred, std::move(Task), false).share();
- PackagedTaskTy PackagedTask([Future](bool) -> bool { Future.get(); return false; });
-#endif
Tasks.push(std::move(PackagedTask));
return Future;
}
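
With the MSVC workaround gone, `asyncImpl` uniformly traffics in `std::shared_future<void>` on every host compiler. A small usage sketch of the public `async()`/`wait()` pair this backs:

```cpp
#include "llvm/Support/ThreadPool.h"
#include <atomic>
#include <cassert>

int main() {
  std::atomic<int> Sum(0);
  llvm::ThreadPool Pool(4); // four worker threads
  for (int I = 1; I <= 10; ++I)
    Pool.async([&Sum, I] { Sum += I; });
  Pool.wait(); // blocks until all queued tasks have run
  assert(Sum == 55);
  return 0;
}
```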
diff --git a/contrib/llvm/lib/Support/Threading.cpp b/contrib/llvm/lib/Support/Threading.cpp
index 760f9e2..6a10b98 100644
--- a/contrib/llvm/lib/Support/Threading.cpp
+++ b/contrib/llvm/lib/Support/Threading.cpp
@@ -14,14 +14,20 @@
#include "llvm/Support/Threading.h"
#include "llvm/Config/config.h"
-#include "llvm/Support/Atomic.h"
#include "llvm/Support/Host.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/thread.h"
+
#include <cassert>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
using namespace llvm;
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//=== independent code.
+//===----------------------------------------------------------------------===//
+
bool llvm::llvm_is_multithreaded() {
#if LLVM_ENABLE_THREADS != 0
return true;
@@ -30,100 +36,47 @@ bool llvm::llvm_is_multithreaded() {
#endif
}
-#if LLVM_ENABLE_THREADS != 0 && defined(HAVE_PTHREAD_H)
-#include <pthread.h>
-
-struct ThreadInfo {
- void (*UserFn)(void *);
- void *UserData;
-};
-static void *ExecuteOnThread_Dispatch(void *Arg) {
- ThreadInfo *TI = reinterpret_cast<ThreadInfo*>(Arg);
- TI->UserFn(TI->UserData);
- return nullptr;
-}
-
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
+#if LLVM_ENABLE_THREADS == 0 || \
+ (!defined(LLVM_ON_WIN32) && !defined(HAVE_PTHREAD_H))
+// Support for non-Win32, non-pthread implementation.
+void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData,
unsigned RequestedStackSize) {
- ThreadInfo Info = { Fn, UserData };
- pthread_attr_t Attr;
- pthread_t Thread;
-
- // Construct the attributes object.
- if (::pthread_attr_init(&Attr) != 0)
- return;
-
- // Set the requested stack size, if given.
- if (RequestedStackSize != 0) {
- if (::pthread_attr_setstacksize(&Attr, RequestedStackSize) != 0)
- goto error;
- }
-
- // Construct and execute the thread.
- if (::pthread_create(&Thread, &Attr, ExecuteOnThread_Dispatch, &Info) != 0)
- goto error;
-
- // Wait for the thread and clean up.
- ::pthread_join(Thread, nullptr);
-
- error:
- ::pthread_attr_destroy(&Attr);
+ (void)RequestedStackSize;
+ Fn(UserData);
}
-#elif LLVM_ENABLE_THREADS!=0 && defined(LLVM_ON_WIN32)
-#include "Windows/WindowsSupport.h"
-#include <process.h>
-// Windows will at times define MemoryFence.
-#ifdef MemoryFence
-#undef MemoryFence
-#endif
+unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
-struct ThreadInfo {
- void (*func)(void*);
- void *param;
-};
+uint64_t llvm::get_threadid() { return 0; }
-static unsigned __stdcall ThreadCallback(void *param) {
- struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
- info->func(info->param);
+uint32_t llvm::get_max_thread_name_length() { return 0; }
- return 0;
-}
+void llvm::set_thread_name(const Twine &Name) {}
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
- unsigned RequestedStackSize) {
- struct ThreadInfo param = { Fn, UserData };
-
- HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
- RequestedStackSize, ThreadCallback,
- &param, 0, NULL);
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }
- if (hThread) {
- // We actually don't care whether the wait succeeds or fails, in
- // the same way we don't care whether the pthread_join call succeeds
- // or fails. There's not much we could do if this were to fail. But
- // on success, this call will wait until the thread finishes executing
- // before returning.
- (void)::WaitForSingleObject(hThread, INFINITE);
- ::CloseHandle(hThread);
- }
-}
#else
-// Support for non-Win32, non-pthread implementation.
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
- unsigned RequestedStackSize) {
- (void) RequestedStackSize;
- Fn(UserData);
-}
-
-#endif
+#include <thread>
unsigned llvm::heavyweight_hardware_concurrency() {
-#if !LLVM_ENABLE_THREADS
- return 1;
-#endif
+ // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
+ // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
+ // allows us to not define `thread` in the llvm namespace, which conflicts
+ // with some platforms such as FreeBSD whose headers also define a struct
+ // called `thread` in the global namespace which can cause ambiguity due to
+ // ADL).
int NumPhysical = sys::getHostNumPhysicalCores();
if (NumPhysical == -1)
- return thread::hardware_concurrency();
+ return std::thread::hardware_concurrency();
return NumPhysical;
}
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Threading.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Threading.inc"
+#endif
+
+#endif
diff --git a/contrib/llvm/lib/Support/Timer.cpp b/contrib/llvm/lib/Support/Timer.cpp
index fbd73d0..3386f26 100644
--- a/contrib/llvm/lib/Support/Timer.cpp
+++ b/contrib/llvm/lib/Support/Timer.cpp
@@ -20,8 +20,8 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/Process.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
// This ugly hack is brought to you courtesy of constructor/destructor ordering
@@ -72,23 +72,15 @@ std::unique_ptr<raw_fd_ostream> llvm::CreateInfoOutputFile() {
return llvm::make_unique<raw_fd_ostream>(2, false); // stderr.
}
-
-static TimerGroup *DefaultTimerGroup = nullptr;
-static TimerGroup *getDefaultTimerGroup() {
- TimerGroup *tmp = DefaultTimerGroup;
- sys::MemoryFence();
- if (tmp) return tmp;
-
- sys::SmartScopedLock<true> Lock(*TimerLock);
- tmp = DefaultTimerGroup;
- if (!tmp) {
- tmp = new TimerGroup("misc", "Miscellaneous Ungrouped Timers");
- sys::MemoryFence();
- DefaultTimerGroup = tmp;
+namespace {
+struct CreateDefaultTimerGroup {
+ static void *call() {
+ return new TimerGroup("misc", "Miscellaneous Ungrouped Timers");
}
-
- return tmp;
-}
+};
+} // namespace
+static ManagedStatic<TimerGroup, CreateDefaultTimerGroup> DefaultTimerGroup;
+static TimerGroup *getDefaultTimerGroup() { return &*DefaultTimerGroup; }
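
The timer change above replaces hand-rolled double-checked locking with `ManagedStatic` plus a custom creator. The same pattern in isolation, with `CreateWidget` as a made-up name for illustration:

```cpp
#include "llvm/Support/ManagedStatic.h"

namespace {
// Hypothetical creator: any class exposing a static void *call() that
// returns the freshly constructed object.
struct CreateWidget {
  static void *call() { return new int(42); }
};
} // namespace

static llvm::ManagedStatic<int, CreateWidget> TheWidget;

int main() {
  // The first dereference runs CreateWidget::call() exactly once,
  // thread-safely; llvm_shutdown() tears the object down again.
  int V = *TheWidget;
  llvm::llvm_shutdown();
  return V == 42 ? 0 : 1;
}
```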
//===----------------------------------------------------------------------===//
// Timer Implementation
@@ -309,7 +301,7 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
// If this is not a collection of ungrouped times, print the total time.
// Ungrouped timers don't really make sense to add up. We still print the
// TOTAL line to make the percentages make sense.
- if (this != DefaultTimerGroup)
+ if (this != getDefaultTimerGroup())
OS << format(" Total Execution Time: %5.4f seconds (%5.4f wall clock)\n",
Total.getProcessTime(), Total.getWallTime());
OS << '\n';
diff --git a/contrib/llvm/lib/Support/TrigramIndex.cpp b/contrib/llvm/lib/Support/TrigramIndex.cpp
index 85ab528..721763c 100644
--- a/contrib/llvm/lib/Support/TrigramIndex.cpp
+++ b/contrib/llvm/lib/Support/TrigramIndex.cpp
@@ -18,9 +18,9 @@
#include "llvm/Support/TrigramIndex.h"
#include "llvm/ADT/SmallVector.h"
-#include <unordered_map>
#include <set>
#include <string>
+#include <unordered_map>
using namespace llvm;
diff --git a/contrib/llvm/lib/Support/Triple.cpp b/contrib/llvm/lib/Support/Triple.cpp
index 6783b40..2687a67 100644
--- a/contrib/llvm/lib/Support/Triple.cpp
+++ b/contrib/llvm/lib/Support/Triple.cpp
@@ -12,8 +12,8 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetParser.h"
#include <cstring>
using namespace llvm;
@@ -34,6 +34,7 @@ StringRef Triple::getArchTypeName(ArchType Kind) {
case mips64: return "mips64";
case mips64el: return "mips64el";
case msp430: return "msp430";
+ case nios2: return "nios2";
case ppc64: return "powerpc64";
case ppc64le: return "powerpc64le";
case ppc: return "powerpc";
@@ -98,6 +99,8 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) {
case mips64:
case mips64el: return "mips";
+ case nios2: return "nios2";
+
case hexagon: return "hexagon";
case amdgcn: return "amdgcn";
@@ -161,6 +164,7 @@ StringRef Triple::getVendorTypeName(VendorType Kind) {
case Myriad: return "myriad";
case AMD: return "amd";
case Mesa: return "mesa";
+ case SUSE: return "suse";
}
llvm_unreachable("Invalid VendorType!");
@@ -170,6 +174,7 @@ StringRef Triple::getOSTypeName(OSType Kind) {
switch (Kind) {
case UnknownOS: return "unknown";
+ case Ananas: return "ananas";
case CloudABI: return "cloudabi";
case Darwin: return "darwin";
case DragonFly: return "dragonfly";
@@ -261,6 +266,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
.Case("mips64", mips64)
.Case("mips64el", mips64el)
.Case("msp430", msp430)
+ .Case("nios2", nios2)
.Case("ppc64", ppc64)
.Case("ppc32", ppc)
.Case("ppc", ppc)
@@ -383,6 +389,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Cases("mipsel", "mipsallegrexel", Triple::mipsel)
.Cases("mips64", "mips64eb", Triple::mips64)
.Case("mips64el", Triple::mips64el)
+ .Case("nios2", Triple::nios2)
.Case("r600", Triple::r600)
.Case("amdgcn", Triple::amdgcn)
.Case("riscv32", Triple::riscv32)
@@ -443,11 +450,13 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("myriad", Triple::Myriad)
.Case("amd", Triple::AMD)
.Case("mesa", Triple::Mesa)
+ .Case("suse", Triple::SUSE)
.Default(Triple::UnknownVendor);
}
static Triple::OSType parseOS(StringRef OSName) {
return StringSwitch<Triple::OSType>(OSName)
+ .StartsWith("ananas", Triple::Ananas)
.StartsWith("cloudabi", Triple::CloudABI)
.StartsWith("darwin", Triple::Darwin)
.StartsWith("dragonfly", Triple::DragonFly)
@@ -457,7 +466,7 @@ static Triple::OSType parseOS(StringRef OSName) {
.StartsWith("kfreebsd", Triple::KFreeBSD)
.StartsWith("linux", Triple::Linux)
.StartsWith("lv2", Triple::Lv2)
- .StartsWith("macosx", Triple::MacOSX)
+ .StartsWith("macos", Triple::MacOSX)
.StartsWith("netbsd", Triple::NetBSD)
.StartsWith("openbsd", Triple::OpenBSD)
.StartsWith("solaris", Triple::Solaris)
@@ -510,6 +519,7 @@ static Triple::ObjectFormatType parseFormat(StringRef EnvironmentName) {
.EndsWith("coff", Triple::COFF)
.EndsWith("elf", Triple::ELF)
.EndsWith("macho", Triple::MachO)
+ .EndsWith("wasm", Triple::Wasm)
.Default(Triple::UnknownObjectFormat);
}
@@ -550,6 +560,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
case ARM::AK_ARMV7A:
case ARM::AK_ARMV7R:
return Triple::ARMSubArch_v7;
+ case ARM::AK_ARMV7VE:
+ return Triple::ARMSubArch_v7ve;
case ARM::AK_ARMV7K:
return Triple::ARMSubArch_v7k;
case ARM::AK_ARMV7M:
@@ -581,6 +593,7 @@ static StringRef getObjectFormatTypeName(Triple::ObjectFormatType Kind) {
case Triple::COFF: return "coff";
case Triple::ELF: return "elf";
case Triple::MachO: return "macho";
+ case Triple::Wasm: return "wasm";
}
llvm_unreachable("unknown object format type");
}
@@ -619,6 +632,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
case Triple::mips64el:
case Triple::mipsel:
case Triple::msp430:
+ case Triple::nios2:
case Triple::nvptx:
case Triple::nvptx64:
case Triple::ppc64le:
@@ -865,6 +879,10 @@ std::string Triple::normalize(StringRef Str) {
}
}
+ // SUSE uses "gnueabi" to mean "gnueabihf"
+ if (Vendor == Triple::SUSE && Environment == llvm::Triple::GNUEABI)
+ Components[3] = "gnueabihf";
+
if (OS == Triple::Win32) {
Components.resize(4);
Components[2] = "windows";
@@ -978,6 +996,8 @@ void Triple::getOSVersion(unsigned &Major, unsigned &Minor,
StringRef OSTypeName = getOSTypeName(getOS());
if (OSName.startswith(OSTypeName))
OSName = OSName.substr(OSTypeName.size());
+ else if (getOS() == MacOSX)
+ OSName.consume_front("macos");
parseVersionFromName(OSName, Major, Minor, Micro);
}
@@ -1152,6 +1172,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::le32:
case llvm::Triple::mips:
case llvm::Triple::mipsel:
+ case llvm::Triple::nios2:
case llvm::Triple::nvptx:
case llvm::Triple::ppc:
case llvm::Triple::r600:
@@ -1235,6 +1256,7 @@ Triple Triple::get32BitArchVariant() const {
case Triple::le32:
case Triple::mips:
case Triple::mipsel:
+ case Triple::nios2:
case Triple::nvptx:
case Triple::ppc:
case Triple::r600:
@@ -1282,6 +1304,7 @@ Triple Triple::get64BitArchVariant() const {
case Triple::kalimba:
case Triple::lanai:
case Triple::msp430:
+ case Triple::nios2:
case Triple::r600:
case Triple::tce:
case Triple::tcele:
@@ -1353,6 +1376,7 @@ Triple Triple::getBigEndianArchVariant() const {
case Triple::le32:
case Triple::le64:
case Triple::msp430:
+ case Triple::nios2:
case Triple::nvptx64:
case Triple::nvptx:
case Triple::r600:
@@ -1439,6 +1463,7 @@ bool Triple::isLittleEndian() const {
case Triple::mips64el:
case Triple::mipsel:
case Triple::msp430:
+ case Triple::nios2:
case Triple::nvptx64:
case Triple::nvptx:
case Triple::ppc64le:
@@ -1464,6 +1489,39 @@ bool Triple::isLittleEndian() const {
}
}
+bool Triple::isCompatibleWith(const Triple &Other) const {
+ // ARM and Thumb triples are compatible, if subarch, vendor and OS match.
+ if ((getArch() == Triple::thumb && Other.getArch() == Triple::arm) ||
+ (getArch() == Triple::arm && Other.getArch() == Triple::thumb) ||
+ (getArch() == Triple::thumbeb && Other.getArch() == Triple::armeb) ||
+ (getArch() == Triple::armeb && Other.getArch() == Triple::thumbeb)) {
+ if (getVendor() == Triple::Apple)
+ return getSubArch() == Other.getSubArch() &&
+ getVendor() == Other.getVendor() && getOS() == Other.getOS();
+ else
+ return getSubArch() == Other.getSubArch() &&
+ getVendor() == Other.getVendor() && getOS() == Other.getOS() &&
+ getEnvironment() == Other.getEnvironment() &&
+ getObjectFormat() == Other.getObjectFormat();
+ }
+
+ // If vendor is apple, ignore the version number.
+ if (getVendor() == Triple::Apple)
+ return getArch() == Other.getArch() && getSubArch() == Other.getSubArch() &&
+ getVendor() == Other.getVendor() && getOS() == Other.getOS();
+
+ return *this == Other;
+}
+
+std::string Triple::merge(const Triple &Other) const {
+ // If vendor is apple, pick the triple with the larger version number.
+ if (getVendor() == Triple::Apple)
+ if (Other.isOSVersionLT(*this))
+ return str();
+
+ return Other.str();
+}
+
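A hedged illustration of the new compatibility and merge rules; the triples are illustrative:

```cpp
#include "llvm/ADT/Triple.h"
#include <cassert>

int main() {
  // ARM and Thumb variants are mutually compatible when subarch, vendor,
  // and OS match; Apple triples additionally ignore the OS version.
  llvm::Triple A("armv7-apple-ios9.0"), T("thumbv7-apple-ios10.0");
  assert(A.isCompatibleWith(T));
  // For Apple triples, merge() keeps the larger OS version.
  assert(A.merge(T) == "thumbv7-apple-ios10.0");
  return 0;
}
```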
StringRef Triple::getARMCPUForArch(StringRef MArch) const {
if (MArch.empty())
MArch = getArchName();
@@ -1511,6 +1569,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const {
return "strongarm";
}
case llvm::Triple::NaCl:
+ case llvm::Triple::OpenBSD:
return "cortex-a8";
default:
switch (getEnvironment()) {
diff --git a/contrib/llvm/lib/Support/Twine.cpp b/contrib/llvm/lib/Support/Twine.cpp
index 465c6e6..d17cd4e 100644
--- a/contrib/llvm/lib/Support/Twine.cpp
+++ b/contrib/llvm/lib/Support/Twine.cpp
@@ -173,10 +173,12 @@ void Twine::printRepr(raw_ostream &OS) const {
OS << ")";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Twine::dump() const {
print(dbgs());
}
-void Twine::dumpRepr() const {
+LLVM_DUMP_METHOD void Twine::dumpRepr() const {
printRepr(dbgs());
}
+#endif
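The guard above keeps dump methods out of release binaries unless LLVM_ENABLE_DUMP is set. A sketch of the same idiom for an arbitrary class (MyType is illustrative, not from the patch):

    #include "llvm/Support/Compiler.h"
    #include "llvm/Support/Debug.h"
    #include "llvm/Support/raw_ostream.h"

    struct MyType {
      void print(llvm::raw_ostream &OS) const { OS << "MyType"; }
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Compiled only in asserts builds or when dumps are explicitly enabled.
      LLVM_DUMP_METHOD void dump() const { print(llvm::dbgs()); }
    #endif
    };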
diff --git a/contrib/llvm/lib/Support/Unix/DynamicLibrary.inc b/contrib/llvm/lib/Support/Unix/DynamicLibrary.inc
new file mode 100644
index 0000000..f05103c
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/DynamicLibrary.inc
@@ -0,0 +1,135 @@
+//===- Unix/DynamicLibrary.inc - Unix DL Implementation ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the UNIX specific implementation of DynamicLibrary.
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(HAVE_DLFCN_H) && defined(HAVE_DLOPEN)
+#include <dlfcn.h>
+
+DynamicLibrary::HandleSet::~HandleSet() {
+ // Close the libraries in reverse order.
+ for (void *Handle : llvm::reverse(Handles))
+ ::dlclose(Handle);
+ if (Process)
+ ::dlclose(Process);
+
+ // llvm_shutdown was called; return to the default search order.
+ DynamicLibrary::SearchOrder = DynamicLibrary::SO_Linker;
+}
+
+void *DynamicLibrary::HandleSet::DLOpen(const char *File, std::string *Err) {
+ void *Handle = ::dlopen(File, RTLD_LAZY|RTLD_GLOBAL);
+ if (!Handle) {
+ if (Err) *Err = ::dlerror();
+ return &DynamicLibrary::Invalid;
+ }
+
+#ifdef __CYGWIN__
+ // Cygwin searches symbols only in the main executable when given the
+ // handle from dlopen(NULL, RTLD_GLOBAL).
+ if (!File)
+ Handle = RTLD_DEFAULT;
+#endif
+
+ return Handle;
+}
+
+void DynamicLibrary::HandleSet::DLClose(void *Handle) {
+ ::dlclose(Handle);
+}
+
+void *DynamicLibrary::HandleSet::DLSym(void *Handle, const char *Symbol) {
+ return ::dlsym(Handle, Symbol);
+}
+
+#else // !HAVE_DLOPEN
+
+DynamicLibrary::HandleSet::~HandleSet() {}
+
+void *DynamicLibrary::HandleSet::DLOpen(const char *File, std::string *Err) {
+ if (Err) *Err = "dlopen() not supported on this platform";
+ return &Invalid;
+}
+
+void DynamicLibrary::HandleSet::DLClose(void *Handle) {
+}
+
+void *DynamicLibrary::HandleSet::DLSym(void *Handle, const char *Symbol) {
+ return nullptr;
+}
+
+#endif
+
+// Must declare the symbols in the global namespace.
+static void *DoSearch(const char* SymbolName) {
+#define EXPLICIT_SYMBOL(SYM) \
+ extern void *SYM; if (!strcmp(SymbolName, #SYM)) return &SYM
+
+ // Darwin has some quirks to work around here. Some important symbols are
+ // marked 'private external', which prevents SearchForAddressOfSymbol from
+ // finding them. As such, we special-case them here; there is only a small
+ // handful of them.
+
+#ifdef __APPLE__
+ {
+ // __eprintf is sometimes used for assert() handling on x86.
+ //
+ // FIXME: Currently disabled when using Clang, as we don't always have our
+ // runtime support libraries available.
+#ifndef __clang__
+#ifdef __i386__
+ EXPLICIT_SYMBOL(__eprintf);
+#endif
+#endif
+ }
+#endif
+
+#ifdef __CYGWIN__
+ {
+ EXPLICIT_SYMBOL(_alloca);
+ EXPLICIT_SYMBOL(__main);
+ }
+#endif
+
+#undef EXPLICIT_SYMBOL
+
+// This macro returns the address of a well-known, explicit symbol
+#define EXPLICIT_SYMBOL(SYM) \
+ if (!strcmp(SymbolName, #SYM)) return &SYM
+
+// Under glibc we have a weird situation. The stderr/out/in symbols are both
+// macros and global variables because of standards requirements. So, we
+// boldly use the EXPLICIT_SYMBOL macro without checking for a #define first.
+#if defined(__GLIBC__)
+ {
+ EXPLICIT_SYMBOL(stderr);
+ EXPLICIT_SYMBOL(stdout);
+ EXPLICIT_SYMBOL(stdin);
+ }
+#else
+ // For everything else, we want to check to make sure the symbol isn't defined
+ // as a macro before using EXPLICIT_SYMBOL.
+ {
+#ifndef stdin
+ EXPLICIT_SYMBOL(stdin);
+#endif
+#ifndef stdout
+ EXPLICIT_SYMBOL(stdout);
+#endif
+#ifndef stderr
+ EXPLICIT_SYMBOL(stderr);
+#endif
+ }
+#endif
+#undef EXPLICIT_SYMBOL
+
+ return nullptr;
+}
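For reference, a minimal sketch of the plain dlopen/dlsym flow this file wraps, assuming a POSIX system with <dlfcn.h>; libm and cos are only illustrative:

    #include <dlfcn.h>
    #include <cstdio>

    int main() {
      // RTLD_LAZY|RTLD_GLOBAL mirrors the flags DLOpen uses above.
      void *Handle = ::dlopen("libm.so.6", RTLD_LAZY | RTLD_GLOBAL);
      if (!Handle) {
        std::fprintf(stderr, "dlopen: %s\n", ::dlerror());
        return 1;
      }
      double (*Cos)(double) = (double (*)(double))::dlsym(Handle, "cos");
      if (Cos)
        std::printf("cos(0) = %f\n", Cos(0.0));
      ::dlclose(Handle);
      return 0;
    }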
diff --git a/contrib/llvm/lib/Support/Unix/Host.inc b/contrib/llvm/lib/Support/Unix/Host.inc
index 4572171..5580e63 100644
--- a/contrib/llvm/lib/Support/Unix/Host.inc
+++ b/contrib/llvm/lib/Support/Unix/Host.inc
@@ -34,16 +34,35 @@ static std::string getOSVersion() {
return info.release;
}
-std::string sys::getDefaultTargetTriple() {
- std::string TargetTripleString(LLVM_DEFAULT_TARGET_TRIPLE);
-
- // On darwin, we want to update the version to match that of the
- // target.
+static std::string updateTripleOSVersion(std::string TargetTripleString) {
+ // On darwin, we want to update the version to match that of the target.
std::string::size_type DarwinDashIdx = TargetTripleString.find("-darwin");
if (DarwinDashIdx != std::string::npos) {
TargetTripleString.resize(DarwinDashIdx + strlen("-darwin"));
TargetTripleString += getOSVersion();
+ return TargetTripleString;
+ }
+ std::string::size_type MacOSDashIdx = TargetTripleString.find("-macos");
+ if (MacOSDashIdx != std::string::npos) {
+ TargetTripleString.resize(MacOSDashIdx);
+ // Reset the OS to darwin as the OS version from `uname` doesn't use the
+ // macOS version scheme.
+ TargetTripleString += "-darwin";
+ TargetTripleString += getOSVersion();
}
+ return TargetTripleString;
+}
+
+std::string sys::getDefaultTargetTriple() {
+ std::string TargetTripleString =
+ updateTripleOSVersion(LLVM_DEFAULT_TARGET_TRIPLE);
+
+ // Override the default target with an environment variable named by
+ // LLVM_TARGET_TRIPLE_ENV.
+#if defined(LLVM_TARGET_TRIPLE_ENV)
+ if (const char *EnvTriple = std::getenv(LLVM_TARGET_TRIPLE_ENV))
+ TargetTripleString = EnvTriple;
+#endif
return Triple::normalize(TargetTripleString);
}
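A reduced model of the override added here, assuming LLVM_TARGET_TRIPLE_ENV is a CMake-time macro naming the environment variable to consult (e.g. "LLVM_TARGET_TRIPLE"):

    #include <cstdlib>
    #include <string>

    static std::string defaultTriple(const char *Configured) {
      std::string Result = Configured;
    #if defined(LLVM_TARGET_TRIPLE_ENV)
      // The environment variable, when set, wins over the configured default.
      if (const char *EnvTriple = std::getenv(LLVM_TARGET_TRIPLE_ENV))
        Result = EnvTriple;
    #endif
      return Result;
    }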
diff --git a/contrib/llvm/lib/Support/Unix/Memory.inc b/contrib/llvm/lib/Support/Unix/Memory.inc
index edbc793..dd39ef9 100644
--- a/contrib/llvm/lib/Support/Unix/Memory.inc
+++ b/contrib/llvm/lib/Support/Unix/Memory.inc
@@ -195,6 +195,10 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
#if defined(__APPLE__) && (defined(__arm__) || defined(__arm64__))
void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_EXEC,
flags, fd, 0);
+#elif defined(__NetBSD__) && defined(PROT_MPROTECT)
+ void *pa =
+ ::mmap(start, PageSize * NumPages,
+ PROT_READ | PROT_WRITE | PROT_MPROTECT(PROT_EXEC), flags, fd, 0);
#else
void *pa = ::mmap(start, PageSize*NumPages, PROT_READ|PROT_WRITE|PROT_EXEC,
flags, fd, 0);
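On NetBSD with PaX MPROTECT, a mapping cannot later gain PROT_EXEC unless that right was reserved at mmap time; a sketch of the reserve-then-enable pattern, assuming NetBSD's PROT_MPROTECT macro from <sys/mman.h>:

    #include <cstddef>
    #include <sys/mman.h>

    void *allocCodePages(std::size_t Size) {
    #if defined(__NetBSD__) && defined(PROT_MPROTECT)
      // Reserve the right to flip the pages to executable later.
      void *P = ::mmap(nullptr, Size,
                       PROT_READ | PROT_WRITE | PROT_MPROTECT(PROT_EXEC),
                       MAP_PRIVATE | MAP_ANON, -1, 0);
    #else
      void *P = ::mmap(nullptr, Size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANON, -1, 0);
    #endif
      if (P == MAP_FAILED)
        return nullptr;
      // ... emit code into P, then make it executable ...
      if (::mprotect(P, Size, PROT_READ | PROT_EXEC) != 0) {
        ::munmap(P, Size);
        return nullptr;
      }
      return P;
    }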
diff --git a/contrib/llvm/lib/Support/Unix/Path.inc b/contrib/llvm/lib/Support/Unix/Path.inc
index e0b11aa..45097eb 100644
--- a/contrib/llvm/lib/Support/Unix/Path.inc
+++ b/contrib/llvm/lib/Support/Unix/Path.inc
@@ -48,6 +48,8 @@
# endif
#endif
+#include <pwd.h>
+
#ifdef __APPLE__
#include <mach-o/dyld.h>
#include <sys/attr.h>
@@ -65,23 +67,41 @@
#endif
#include <sys/types.h>
-#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__ANDROID__)
+#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__FreeBSD__) && \
+ !defined(__linux__)
#include <sys/statvfs.h>
#define STATVFS statvfs
+#define FSTATVFS fstatvfs
#define STATVFS_F_FRSIZE(vfs) vfs.f_frsize
#else
-#ifdef __OpenBSD__
-#include <sys/param.h>
+#if defined(__OpenBSD__) || defined(__FreeBSD__)
#include <sys/mount.h>
-#elif defined(__ANDROID__)
+#include <sys/param.h>
+#elif defined(__linux__)
+#if defined(HAVE_LINUX_MAGIC_H)
+#include <linux/magic.h>
+#else
+#if defined(HAVE_LINUX_NFS_FS_H)
+#include <linux/nfs_fs.h>
+#endif
+#if defined(HAVE_LINUX_SMB_H)
+#include <linux/smb.h>
+#endif
+#endif
#include <sys/vfs.h>
#else
#include <sys/mount.h>
#endif
#define STATVFS statfs
+#define FSTATVFS fstatfs
#define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
#endif
+#if defined(__NetBSD__)
+#define STATVFS_F_FLAG(vfs) (vfs).f_flag
+#else
+#define STATVFS_F_FLAG(vfs) (vfs).f_flags
+#endif
using namespace llvm;
@@ -180,7 +200,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
if (getprogpath(exe_path, argv0))
return exe_path;
}
-#elif defined(HAVE_DLFCN_H)
+#elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
// Use dladdr to get executable path if available.
Dl_info DLInfo;
int err = dladdr(MainAddr, &DLInfo);
@@ -210,6 +230,10 @@ UniqueID file_status::getUniqueID() const {
return UniqueID(fs_st_dev, fs_st_ino);
}
+uint32_t file_status::getLinkCount() const {
+ return fs_st_nlinks;
+}
+
ErrorOr<space_info> disk_space(const Twine &Path) {
struct STATVFS Vfs;
if (::STATVFS(Path.str().c_str(), &Vfs))
@@ -257,6 +281,16 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
return std::error_code();
}
+std::error_code set_current_path(const Twine &path) {
+ SmallString<128> path_storage;
+ StringRef p = path.toNullTerminatedStringRef(path_storage);
+
+ if (::chdir(p.begin()) == -1)
+ return std::error_code(errno, std::generic_category());
+
+ return std::error_code();
+}
+
std::error_code create_directory(const Twine &path, bool IgnoreExisting,
perms Perms) {
SmallString<128> path_storage;
@@ -325,6 +359,56 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
return std::error_code();
}
+static bool is_local_impl(struct STATVFS &Vfs) {
+#if defined(__linux__)
+#ifndef NFS_SUPER_MAGIC
+#define NFS_SUPER_MAGIC 0x6969
+#endif
+#ifndef SMB_SUPER_MAGIC
+#define SMB_SUPER_MAGIC 0x517B
+#endif
+#ifndef CIFS_MAGIC_NUMBER
+#define CIFS_MAGIC_NUMBER 0xFF534D42
+#endif
+ switch ((uint32_t)Vfs.f_type) {
+ case NFS_SUPER_MAGIC:
+ case SMB_SUPER_MAGIC:
+ case CIFS_MAGIC_NUMBER:
+ return false;
+ default:
+ return true;
+ }
+#elif defined(__CYGWIN__)
+ // Cygwin doesn't expose this information; would need to use Win32 API.
+ return false;
+#elif defined(__sun)
+ // statvfs::f_basetype contains a null-terminated FSType name of the mounted target
+ StringRef fstype(Vfs.f_basetype);
+ // NFS is the only non-local fstype??
+ return !fstype.equals("nfs");
+#else
+ return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL);
+#endif
+}
+
+std::error_code is_local(const Twine &Path, bool &Result) {
+ struct STATVFS Vfs;
+ if (::STATVFS(Path.str().c_str(), &Vfs))
+ return std::error_code(errno, std::generic_category());
+
+ Result = is_local_impl(Vfs);
+ return std::error_code();
+}
+
+std::error_code is_local(int FD, bool &Result) {
+ struct STATVFS Vfs;
+ if (::FSTATVFS(FD, &Vfs))
+ return std::error_code(errno, std::generic_category());
+
+ Result = is_local_impl(Vfs);
+ return std::error_code();
+}
+
std::error_code rename(const Twine &from, const Twine &to) {
// Get arguments.
SmallString<128> from_storage;
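A short usage sketch for the new is_local entry points; the path is illustrative:

    #include "llvm/Support/FileSystem.h"
    #include "llvm/Support/raw_ostream.h"

    void reportLocality() {
      bool Local = false;
      // Distinguishes local filesystems from NFS/SMB/CIFS mounts.
      if (std::error_code EC = llvm::sys::fs::is_local("/tmp/cache", Local))
        llvm::errs() << "is_local failed: " << EC.message() << "\n";
      else
        llvm::outs() << (Local ? "local\n" : "remote\n");
    }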
@@ -342,14 +426,15 @@ std::error_code resize_file(int FD, uint64_t Size) {
#if defined(HAVE_POSIX_FALLOCATE)
// If we have posix_fallocate use it. Unlike ftruncate it always allocates
// space, so we get an error if the disk is full.
- if (int Err = ::posix_fallocate(FD, 0, Size))
- return std::error_code(Err, std::generic_category());
-#else
+ if (int Err = ::posix_fallocate(FD, 0, Size)) {
+ if (Err != EOPNOTSUPP)
+ return std::error_code(Err, std::generic_category());
+ }
+#endif
// Use ftruncate as a fallback. It may or may not allocate space. At least on
// OS X with HFS+ it does.
if (::ftruncate(FD, Size) == -1)
return std::error_code(errno, std::generic_category());
-#endif
return std::error_code();
}
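A reduced sketch of the fallocate-with-fallback logic above, outside LLVM; HAVE_POSIX_FALLOCATE stands in for the build system's feature test:

    #include <cerrno>
    #include <fcntl.h>
    #include <system_error>
    #include <unistd.h>

    std::error_code resizeFile(int FD, off_t Size) {
    #if defined(HAVE_POSIX_FALLOCATE)
      // posix_fallocate really allocates, so a full disk is reported; some
      // filesystems return EOPNOTSUPP, in which case fall through.
      if (int Err = ::posix_fallocate(FD, 0, Size))
        if (Err != EOPNOTSUPP)
          return std::error_code(Err, std::generic_category());
    #endif
      // ftruncate may leave the file sparse rather than allocated.
      if (::ftruncate(FD, Size) == -1)
        return std::error_code(errno, std::generic_category());
      return std::error_code();
    }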
@@ -405,6 +490,46 @@ std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
return std::error_code();
}
+static void expandTildeExpr(SmallVectorImpl<char> &Path) {
+ StringRef PathStr(Path.begin(), Path.size());
+ if (PathStr.empty() || !PathStr.startswith("~"))
+ return;
+
+ PathStr = PathStr.drop_front();
+ StringRef Expr =
+ PathStr.take_until([](char c) { return path::is_separator(c); });
+ StringRef Remainder = PathStr.substr(Expr.size() + 1);
+ SmallString<128> Storage;
+ if (Expr.empty()) {
+ // This is just ~/..., resolve it to the current user's home dir.
+ if (!path::home_directory(Storage)) {
+ // For some reason we couldn't get the home directory. Just exit.
+ return;
+ }
+
+ // Overwrite the first character and insert the rest.
+ Path[0] = Storage[0];
+ Path.insert(Path.begin() + 1, Storage.begin() + 1, Storage.end());
+ return;
+ }
+
+ // This is a string of the form ~username/, look up this user's entry in the
+ // password database.
+ struct passwd *Entry = nullptr;
+ std::string User = Expr.str();
+ Entry = ::getpwnam(User.c_str());
+
+ if (!Entry) {
+ // Unable to look up the entry, just return back the original path.
+ return;
+ }
+
+ Storage = Remainder;
+ Path.clear();
+ Path.append(Entry->pw_dir, Entry->pw_dir + strlen(Entry->pw_dir));
+ llvm::sys::path::append(Path, Storage);
+}
+
static std::error_code fillStatus(int StatRet, const struct stat &Status,
file_status &Result) {
if (StatRet != 0) {
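A sketch of the ~username lookup that expandTildeExpr performs, using getpwnam directly; the helper is illustrative:

    #include <pwd.h>
    #include <string>

    // Resolve "~user" to user's home directory; empty when no such entry.
    std::string homeOf(const std::string &User) {
      if (struct passwd *Entry = ::getpwnam(User.c_str()))
        if (Entry->pw_dir)
          return Entry->pw_dir;
      return std::string();
    }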
@@ -430,22 +555,23 @@ static std::error_code fillStatus(int StatRet, const struct stat &Status,
Type = file_type::fifo_file;
else if (S_ISSOCK(Status.st_mode))
Type = file_type::socket_file;
+ else if (S_ISLNK(Status.st_mode))
+ Type = file_type::symlink_file;
- perms Perms = static_cast<perms>(Status.st_mode);
- Result =
- file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_atime,
- Status.st_mtime, Status.st_uid, Status.st_gid,
- Status.st_size);
+ perms Perms = static_cast<perms>(Status.st_mode) & all_perms;
+ Result = file_status(Type, Perms, Status.st_dev, Status.st_nlink,
+ Status.st_ino, Status.st_atime, Status.st_mtime,
+ Status.st_uid, Status.st_gid, Status.st_size);
return std::error_code();
}
-std::error_code status(const Twine &Path, file_status &Result) {
+std::error_code status(const Twine &Path, file_status &Result, bool Follow) {
SmallString<128> PathStorage;
StringRef P = Path.toNullTerminatedStringRef(PathStorage);
struct stat Status;
- int StatRet = ::stat(P.begin(), &Status);
+ int StatRet = (Follow ? ::stat : ::lstat)(P.begin(), &Status);
return fillStatus(StatRet, Status, Result);
}
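With the new Follow parameter, status() can describe a symlink itself instead of its target; a usage sketch (the helper is illustrative):

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/FileSystem.h"

    bool isSymlink(const llvm::Twine &Path) {
      llvm::sys::fs::file_status St;
      // Follow = false maps to lstat(), so the link is not dereferenced.
      if (llvm::sys::fs::status(Path, St, /*Follow=*/false))
        return false;
      return St.type() == llvm::sys::fs::file_type::symlink_file;
    }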
@@ -455,6 +581,15 @@ std::error_code status(int FD, file_status &Result) {
return fillStatus(StatRet, Status, Result);
}
+std::error_code setPermissions(const Twine &Path, perms Permissions) {
+ SmallString<128> PathStorage;
+ StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+
+ if (::chmod(P.begin(), Permissions))
+ return std::error_code(errno, std::generic_category());
+ return std::error_code();
+}
+
std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time) {
#if defined(HAVE_FUTIMENS)
timespec Times[2];
@@ -481,6 +616,26 @@ std::error_code mapped_file_region::init(int FD, uint64_t Offset,
int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
+#if defined(__APPLE__)
+ //----------------------------------------------------------------------
+ // Newer versions of MacOSX have a flag that will allow us to read from
+ // binaries whose code signature is invalid without crashing by using
+ // the MAP_RESILIENT_CODESIGN flag. Also if a file from removable media
+ // is mapped we can avoid crashing and return zeroes to any pages we try
+ // to read if the media becomes unavailable by using the
+ // MAP_RESILIENT_MEDIA flag. These flags are only usable when mapping
+ // with PROT_READ, so take care not to specify them otherwise.
+ //----------------------------------------------------------------------
+ if (Mode == readonly) {
+#if defined(MAP_RESILIENT_CODESIGN)
+ flags |= MAP_RESILIENT_CODESIGN;
+#endif
+#if defined(MAP_RESILIENT_MEDIA)
+ flags |= MAP_RESILIENT_MEDIA;
+#endif
+ }
+#endif // #if defined (__APPLE__)
+
Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset);
if (Mapping == MAP_FAILED)
return std::error_code(errno, std::generic_category());
@@ -526,7 +681,8 @@ int mapped_file_region::alignment() {
}
std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
- StringRef path){
+ StringRef path,
+ bool follow_symlinks) {
SmallString<128> path_null(path);
DIR *directory = ::opendir(path_null.c_str());
if (!directory)
@@ -535,7 +691,7 @@ std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
it.IterationHandle = reinterpret_cast<intptr_t>(directory);
// Add something for replace_filename to replace.
path::append(path_null, ".");
- it.CurrentEntry = directory_entry(path_null.str());
+ it.CurrentEntry = directory_entry(path_null.str(), follow_symlinks);
return directory_iterator_increment(it);
}
@@ -577,10 +733,17 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
SmallVectorImpl<char> *RealPath) {
SmallString<128> Storage;
StringRef P = Name.toNullTerminatedStringRef(Storage);
- while ((ResultFD = open(P.begin(), O_RDONLY)) < 0) {
- if (errno != EINTR)
- return std::error_code(errno, std::generic_category());
- }
+ int OpenFlags = O_RDONLY;
+#ifdef O_CLOEXEC
+ OpenFlags |= O_CLOEXEC;
+#endif
+ if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags)) < 0)
+ return std::error_code(errno, std::generic_category());
+#ifndef O_CLOEXEC
+ int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
+ (void)r;
+ assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
+#endif
// Attempt to get the real name of the file, if the user asked
if(!RealPath)
return std::error_code();
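RetryAfterSignal re-invokes a call while it fails with EINTR; a reduced sketch of the open pattern above, with the retry loop written out (the wrapper is an illustration, not LLVM's implementation):

    #include <cerrno>
    #include <fcntl.h>

    // Keep retrying the syscall while it is interrupted by a signal.
    template <typename Fun, typename... Args>
    static auto retryAfterSignal(int FailValue, Fun F, Args... As)
        -> decltype(F(As...)) {
      decltype(F(As...)) Res;
      do
        Res = F(As...);
      while (Res == FailValue && errno == EINTR);
      return Res;
    }

    int openCloexec(const char *Path) {
      int Flags = O_RDONLY;
    #ifdef O_CLOEXEC
      Flags |= O_CLOEXEC; // Atomic close-on-exec where the OS supports it.
    #endif
      int FD = retryAfterSignal(-1, ::open, Path, Flags);
    #ifndef O_CLOEXEC
      if (FD >= 0)
        ::fcntl(FD, F_SETFD, FD_CLOEXEC); // Non-atomic fallback.
    #endif
      return FD;
    }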
@@ -616,6 +779,10 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
int OpenFlags = O_CREAT;
+#ifdef O_CLOEXEC
+ OpenFlags |= O_CLOEXEC;
+#endif
+
if (Flags & F_RW)
OpenFlags |= O_RDWR;
else
@@ -631,10 +798,13 @@ std::error_code openFileForWrite(const Twine &Name, int &ResultFD,
SmallString<128> Storage;
StringRef P = Name.toNullTerminatedStringRef(Storage);
- while ((ResultFD = open(P.begin(), OpenFlags, Mode)) < 0) {
- if (errno != EINTR)
- return std::error_code(errno, std::generic_category());
- }
+ if ((ResultFD = sys::RetryAfterSignal(-1, open, P.begin(), OpenFlags, Mode)) < 0)
+ return std::error_code(errno, std::generic_category());
+#ifndef O_CLOEXEC
+ int r = fcntl(ResultFD, F_SETFD, FD_CLOEXEC);
+ (void)r;
+ assert(r == 0 && "fcntl(F_SETFD, FD_CLOEXEC) failed");
+#endif
return std::error_code();
}
@@ -685,18 +855,85 @@ std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
return std::error_code();
}
+template <typename T>
+static std::error_code remove_directories_impl(const T &Entry,
+ bool IgnoreErrors) {
+ std::error_code EC;
+ directory_iterator Begin(Entry, EC, false);
+ directory_iterator End;
+ while (Begin != End) {
+ auto &Item = *Begin;
+ file_status st;
+ EC = Item.status(st);
+ if (EC && !IgnoreErrors)
+ return EC;
+
+ if (is_directory(st)) {
+ EC = remove_directories_impl(Item, IgnoreErrors);
+ if (EC && !IgnoreErrors)
+ return EC;
+ }
+
+ EC = fs::remove(Item.path(), true);
+ if (EC && !IgnoreErrors)
+ return EC;
+
+ Begin.increment(EC);
+ if (EC && !IgnoreErrors)
+ return EC;
+ }
+ return std::error_code();
+}
+
+std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
+ auto EC = remove_directories_impl(path, IgnoreErrors);
+ if (EC && !IgnoreErrors)
+ return EC;
+ EC = fs::remove(path, true);
+ if (EC && !IgnoreErrors)
+ return EC;
+ return std::error_code();
+}
+
+std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
+ bool expand_tilde) {
+ dest.clear();
+ if (path.isTriviallyEmpty())
+ return std::error_code();
+
+ if (expand_tilde) {
+ SmallString<128> Storage;
+ path.toVector(Storage);
+ expandTildeExpr(Storage);
+ return real_path(Storage, dest, false);
+ }
+
+ int fd;
+ std::error_code EC = openFileForRead(path, fd, &dest);
+
+ if (EC)
+ return EC;
+ ::close(fd);
+ return std::error_code();
+}
+
} // end namespace fs
namespace path {
bool home_directory(SmallVectorImpl<char> &result) {
- if (char *RequestedDir = getenv("HOME")) {
- result.clear();
- result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
- return true;
+ char *RequestedDir = getenv("HOME");
+ if (!RequestedDir) {
+ struct passwd *pw = getpwuid(getuid());
+ if (pw && pw->pw_dir)
+ RequestedDir = pw->pw_dir;
}
+ if (!RequestedDir)
+ return false;
- return false;
+ result.clear();
+ result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+ return true;
}
static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
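home_directory() now consults the password database when $HOME is unset; a short usage sketch:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Path.h"
    #include "llvm/Support/raw_ostream.h"

    void printHome() {
      llvm::SmallString<128> Home;
      // Succeeds via $HOME, or via getpwuid(getuid()) when $HOME is unset.
      if (llvm::sys::path::home_directory(Home))
        llvm::outs() << Home << "\n";
    }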
diff --git a/contrib/llvm/lib/Support/Unix/Process.inc b/contrib/llvm/lib/Support/Unix/Process.inc
index 16f8f5a..2d46620 100644
--- a/contrib/llvm/lib/Support/Unix/Process.inc
+++ b/contrib/llvm/lib/Support/Unix/Process.inc
@@ -207,13 +207,10 @@ std::error_code Process::FixupStandardFileDescriptors() {
for (int StandardFD : StandardFDs) {
struct stat st;
errno = 0;
- while (fstat(StandardFD, &st) < 0) {
+ if (RetryAfterSignal(-1, fstat, StandardFD, &st) < 0) {
assert(errno && "expected errno to be set if fstat failed!");
// fstat should return EBADF if the file descriptor is closed.
- if (errno == EBADF)
- break;
- // retry fstat if we got EINTR, otherwise bubble up the failure.
- if (errno != EINTR)
+ if (errno != EBADF)
return std::error_code(errno, std::generic_category());
}
// if fstat succeeds, move on to the next FD.
@@ -222,11 +219,8 @@ std::error_code Process::FixupStandardFileDescriptors() {
assert(errno == EBADF && "expected errno to have EBADF at this point!");
if (NullFD < 0) {
- while ((NullFD = open("/dev/null", O_RDWR)) < 0) {
- if (errno == EINTR)
- continue;
+ if ((NullFD = RetryAfterSignal(-1, open, "/dev/null", O_RDWR)) < 0)
return std::error_code(errno, std::generic_category());
- }
}
if (NullFD == StandardFD)
@@ -347,7 +341,7 @@ static bool terminalHasColors(int fd) {
MutexGuard G(*TermColorMutex);
int errret = 0;
- if (setupterm((char *)nullptr, fd, &errret) != 0)
+ if (setupterm(nullptr, fd, &errret) != 0)
// Regardless of why, if we can't get terminfo, we shouldn't try to print
// colors.
return false;
@@ -369,7 +363,7 @@ static bool terminalHasColors(int fd) {
// Now extract the structure allocated by setupterm and free its memory
// through a really silly dance.
- struct term *termp = set_curterm((struct term *)nullptr);
+ struct term *termp = set_curterm(nullptr);
(void)del_curterm(termp); // Drop any errors here.
// Return true if we found color capabilities for the current terminal.
diff --git a/contrib/llvm/lib/Support/Unix/Program.inc b/contrib/llvm/lib/Support/Unix/Program.inc
index 7d3537e..c866d5b 100644
--- a/contrib/llvm/lib/Support/Unix/Program.inc
+++ b/contrib/llvm/lib/Support/Unix/Program.inc
@@ -40,9 +40,6 @@
#include <unistd.h>
#endif
#ifdef HAVE_POSIX_SPAWN
-#ifdef __sun__
-#define _RESTRICT_KYWD
-#endif
#include <spawn.h>
#if defined(__APPLE__)
@@ -163,16 +160,6 @@ static void SetMemoryLimits (unsigned size)
r.rlim_cur = limit;
setrlimit (RLIMIT_RSS, &r);
#endif
-#ifdef RLIMIT_AS // e.g. NetBSD doesn't have it.
- // Don't set virtual memory limit if built with any Sanitizer. They need 80Tb
- // of virtual memory for shadow memory mapping.
-#if !LLVM_MEMORY_SANITIZER_BUILD && !LLVM_ADDRESS_SANITIZER_BUILD
- // Virtual memory.
- getrlimit (RLIMIT_AS, &r);
- r.rlim_cur = limit;
- setrlimit (RLIMIT_AS, &r);
-#endif
-#endif
#endif
}
@@ -459,11 +446,22 @@ bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program, ArrayRef<co
size_t ArgLength = Program.size() + 1;
for (ArrayRef<const char*>::iterator I = Args.begin(), E = Args.end();
I != E; ++I) {
- ArgLength += strlen(*I) + 1;
+ size_t length = strlen(*I);
+
+ // Ensure that we do not exceed the MAX_ARG_STRLEN limit on Linux, which,
+ // contrary to what the man pages might suggest, is not exposed as a
+ // constant to user space. Since this limit is pretty high, perform the
+ // check unconditionally rather than trying to be aggressive and limiting
+ // it to Linux only.
+ if (length >= (32 * 4096))
+ return false;
+
+ ArgLength += length + 1;
if (ArgLength > size_t(HalfArgMax)) {
return false;
}
}
+
return true;
}
}
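A reduced sketch of the per-argument cap added above; 32 * 4096 matches the 32 pages of 4 KiB that Linux's MAX_ARG_STRLEN allows, and the helper is illustrative:

    #include <cstring>

    // Reject any single argument at or above Linux's MAX_ARG_STRLEN.
    static bool argWithinLimit(const char *Arg) {
      return std::strlen(Arg) < 32u * 4096u;
    }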
diff --git a/contrib/llvm/lib/Support/Unix/Signals.inc b/contrib/llvm/lib/Support/Unix/Signals.inc
index 9752b70..aaf760c 100644
--- a/contrib/llvm/lib/Support/Unix/Signals.inc
+++ b/contrib/llvm/lib/Support/Unix/Signals.inc
@@ -15,9 +15,9 @@
#include "Unix.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Demangle/Demangle.h"
-#include "llvm/Support/Format.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/Program.h"
@@ -25,8 +25,8 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <string>
-#if HAVE_EXECINFO_H
-# include <execinfo.h> // For backtrace().
+#ifdef HAVE_BACKTRACE
+# include BACKTRACE_HEADER // For backtrace().
#endif
#if HAVE_SIGNAL_H
#include <signal.h>
@@ -59,7 +59,7 @@ using namespace llvm;
static RETSIGTYPE SignalHandler(int Sig); // defined below.
-static ManagedStatic<SmartMutex<true> > SignalsMutex;
+static ManagedStatic<sys::SmartMutex<true> > SignalsMutex;
/// InterruptFunction - The function to call if ctrl-c is pressed.
static void (*InterruptFunction)() = nullptr;
@@ -149,11 +149,7 @@ static void CreateSigAltStack() {}
#endif
static void RegisterHandlers() {
- // We need to dereference the signals mutex during handler registration so
- // that we force its construction. This is to prevent the first use being
- // during handling an actual signal because you can't safely call new in a
- // signal handler.
- *SignalsMutex;
+ sys::SmartScopedLock<true> Guard(*SignalsMutex);
// If the handlers are already registered, we're done.
if (NumRegisteredSignals != 0) return;
@@ -223,7 +219,7 @@ static RETSIGTYPE SignalHandler(int Sig) {
sigprocmask(SIG_UNBLOCK, &SigMask, nullptr);
{
- unique_lock<SmartMutex<true>> Guard(*SignalsMutex);
+ unique_lock<sys::SmartMutex<true>> Guard(*SignalsMutex);
RemoveFilesToRemove();
if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig)
@@ -412,7 +408,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
if (printSymbolizedStackTrace(Argv0, StackTrace, depth, OS))
return;
-#if HAVE_DLFCN_H && __GNUG__ && !defined(__CYGWIN__)
+#if HAVE_DLFCN_H && HAVE_DLADDR
int width = 0;
for (int i = 0; i < depth; ++i) {
Dl_info dlinfo;
@@ -462,7 +458,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
}
static void PrintStackTraceSignalHandler(void *) {
- PrintStackTrace(llvm::errs());
+ sys::PrintStackTrace(llvm::errs());
}
void llvm::sys::DisableSystemDialogsOnCrash() {}
diff --git a/contrib/llvm/lib/Support/Unix/Threading.inc b/contrib/llvm/lib/Support/Unix/Threading.inc
new file mode 100644
index 0000000..267af38
--- /dev/null
+++ b/contrib/llvm/lib/Support/Unix/Threading.inc
@@ -0,0 +1,215 @@
+//===- Unix/Threading.inc - Unix Threading Implementation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Unix specific implementation of Threading functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+
+#if defined(__APPLE__)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#endif
+
+#include <pthread.h>
+
+#if defined(__FreeBSD__)
+#include <pthread_np.h> // For pthread_getthreadid_np()
+#endif
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+#include <errno.h>
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <unistd.h>
+#endif
+
+#if defined(__NetBSD__)
+#include <lwp.h> // For _lwp_self()
+#endif
+
+#if defined(__linux__)
+#include <sys/syscall.h> // For syscall codes
+#include <unistd.h> // For syscall()
+#endif
+
+namespace {
+ struct ThreadInfo {
+ void(*UserFn)(void *);
+ void *UserData;
+ };
+}
+
+static void *ExecuteOnThread_Dispatch(void *Arg) {
+ ThreadInfo *TI = reinterpret_cast<ThreadInfo*>(Arg);
+ TI->UserFn(TI->UserData);
+ return nullptr;
+}
+
+void llvm::llvm_execute_on_thread(void(*Fn)(void*), void *UserData,
+ unsigned RequestedStackSize) {
+ ThreadInfo Info = { Fn, UserData };
+ pthread_attr_t Attr;
+ pthread_t Thread;
+
+ // Construct the attributes object.
+ if (::pthread_attr_init(&Attr) != 0)
+ return;
+
+ // Set the requested stack size, if given.
+ if (RequestedStackSize != 0) {
+ if (::pthread_attr_setstacksize(&Attr, RequestedStackSize) != 0)
+ goto error;
+ }
+
+ // Construct and execute the thread.
+ if (::pthread_create(&Thread, &Attr, ExecuteOnThread_Dispatch, &Info) != 0)
+ goto error;
+
+ // Wait for the thread and clean up.
+ ::pthread_join(Thread, nullptr);
+
+error:
+ ::pthread_attr_destroy(&Attr);
+}
+
+
+uint64_t llvm::get_threadid() {
+#if defined(__APPLE__)
+ // Calling "mach_thread_self()" bumps the reference count on the thread
+ // port, so we need to deallocate it. mach_task_self() doesn't bump the ref
+ // count.
+ thread_port_t Self = mach_thread_self();
+ mach_port_deallocate(mach_task_self(), Self);
+ return Self;
+#elif defined(__FreeBSD__)
+ return uint64_t(pthread_getthreadid_np());
+#elif defined(__NetBSD__)
+ return uint64_t(_lwp_self());
+#elif defined(__ANDROID__)
+ return uint64_t(gettid());
+#elif defined(__linux__)
+ return uint64_t(syscall(SYS_gettid));
+#elif defined(LLVM_ON_WIN32)
+ return uint64_t(::GetCurrentThreadId());
+#else
+ return uint64_t(pthread_self());
+#endif
+}
+
+
+static constexpr uint32_t get_max_thread_name_length_impl() {
+#if defined(__NetBSD__)
+ return PTHREAD_MAX_NAMELEN_NP;
+#elif defined(__APPLE__)
+ return 64;
+#elif defined(__linux__)
+#if HAVE_PTHREAD_SETNAME_NP
+ return 16;
+#else
+ return 0;
+#endif
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ return 16;
+#else
+ return 0;
+#endif
+}
+
+uint32_t llvm::get_max_thread_name_length() {
+ return get_max_thread_name_length_impl();
+}
+
+void llvm::set_thread_name(const Twine &Name) {
+ // Make sure the input is null terminated.
+ SmallString<64> Storage;
+ StringRef NameStr = Name.toNullTerminatedStringRef(Storage);
+
+ // Truncate from the beginning, not the end, if the specified name is too
+ // long. For one, this ensures that the resulting string is still null
+ // terminated, but additionally the end of a long thread name will usually
+ // be more unique than the beginning, since a common pattern is for similar
+ // threads to share a common prefix.
+ if (get_max_thread_name_length() > 0)
+ NameStr = NameStr.take_back(get_max_thread_name_length());
+ (void)NameStr;
+#if defined(__linux__)
+#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
+#if HAVE_PTHREAD_SETNAME_NP
+ ::pthread_setname_np(::pthread_self(), NameStr.data());
+#endif
+#endif
+#elif defined(__FreeBSD__)
+ ::pthread_set_name_np(::pthread_self(), NameStr.data());
+#elif defined(__NetBSD__)
+ ::pthread_setname_np(::pthread_self(), "%s",
+ const_cast<char *>(NameStr.data()));
+#elif defined(__APPLE__)
+ ::pthread_setname_np(NameStr.data());
+#endif
+}
+
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
+ Name.clear();
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+ int pid = ::getpid();
+ uint64_t tid = get_threadid();
+
+ struct kinfo_proc *kp = nullptr, *nkp;
+ size_t len = 0;
+ int error;
+ int ctl[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID | KERN_PROC_INC_THREAD,
+ (int)pid };
+
+ while (1) {
+ error = sysctl(ctl, 4, kp, &len, nullptr, 0);
+ if (kp == nullptr || (error != 0 && errno == ENOMEM)) {
+ // Add extra space in case threads are added before next call.
+ len += sizeof(*kp) + len / 10;
+ nkp = (struct kinfo_proc *)realloc(kp, len);
+ if (nkp == nullptr) {
+ free(kp);
+ return;
+ }
+ kp = nkp;
+ continue;
+ }
+ if (error != 0)
+ len = 0;
+ break;
+ }
+
+ for (size_t i = 0; i < len / sizeof(*kp); i++) {
+ if (kp[i].ki_tid == (lwpid_t)tid) {
+ Name.append(kp[i].ki_tdname, kp[i].ki_tdname + strlen(kp[i].ki_tdname));
+ break;
+ }
+ }
+ free(kp);
+ return;
+#elif defined(__NetBSD__)
+ constexpr uint32_t len = get_max_thread_name_length_impl();
+ char buf[len];
+ ::pthread_getname_np(::pthread_self(), buf, len);
+
+ Name.append(buf, buf + strlen(buf));
+#elif defined(__linux__)
+#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
+#if HAVE_PTHREAD_GETNAME_NP
+ constexpr uint32_t len = get_max_thread_name_length_impl();
+ char Buffer[len];
+ if (0 == ::pthread_getname_np(::pthread_self(), Buffer, len))
+ Name.append(Buffer, Buffer + strlen(Buffer));
+#endif
+#endif
+#endif
+}
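A usage sketch for the new threading helpers; set_thread_name truncates long names from the front on platforms with short limits (the worker name is illustrative):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/Threading.h"
    #include "llvm/Support/raw_ostream.h"

    void nameCurrentThread() {
      llvm::set_thread_name("llvm-worker-0");
      llvm::SmallString<64> Name;
      llvm::get_thread_name(Name); // Empty on platforms without support.
      llvm::outs() << "tid " << llvm::get_threadid()
                   << " name " << Name << "\n";
    }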
diff --git a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
index 0506894..083ea90 100644
--- a/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
+++ b/contrib/llvm/lib/Support/Windows/DynamicLibrary.inc
@@ -12,92 +12,142 @@
//===----------------------------------------------------------------------===//
#include "WindowsSupport.h"
+#include "llvm/Support/raw_ostream.h"
-#ifdef __MINGW32__
- #include <imagehlp.h>
-#else
- #include <dbghelp.h>
-#endif
-
-#ifdef _MSC_VER
- #include <ntverp.h>
-#endif
-
-namespace llvm {
-using namespace sys;
+#include <psapi.h>
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only Win32 specific code
//=== and must not be UNIX code.
//===----------------------------------------------------------------------===//
-typedef BOOL (WINAPI *fpEnumerateLoadedModules)(HANDLE,PENUMLOADED_MODULES_CALLBACK64,PVOID);
-static fpEnumerateLoadedModules fEnumerateLoadedModules;
-static DenseSet<HMODULE> *OpenedHandles;
-static bool loadDebugHelp(void) {
- HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
- if (hLib) {
- fEnumerateLoadedModules = (fpEnumerateLoadedModules)
- ::GetProcAddress(hLib, "EnumerateLoadedModules64");
- }
- return fEnumerateLoadedModules != 0;
-}
+DynamicLibrary::HandleSet::~HandleSet() {
+ for (void *Handle : llvm::reverse(Handles))
+ FreeLibrary(HMODULE(Handle));
-static BOOL CALLBACK
-ELM_Callback(PCSTR ModuleName, DWORD64 ModuleBase,
- ULONG ModuleSize, PVOID UserContext) {
- OpenedHandles->insert((HMODULE)ModuleBase);
- return TRUE;
+ // 'Process' should not be released on Windows.
+ assert((!Process || Process==this) && "Bad Handle");
+ // llvm_shutdown was called; return to the default search order.
+ DynamicLibrary::SearchOrder = DynamicLibrary::SO_Linker;
}
-DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
- std::string *errMsg) {
- SmartScopedLock<true> lock(*SymbolsMutex);
+void *DynamicLibrary::HandleSet::DLOpen(const char *File, std::string *Err) {
+ // Create the instance and return it as the *Process* handle, similar to
+ // dlopen(NULL, RTLD_LAZY|RTLD_GLOBAL)
+ if (!File)
+ return &(*OpenedHandles);
- if (!filename) {
- // When no file is specified, enumerate all DLLs and EXEs in the process.
- if (OpenedHandles == 0)
- OpenedHandles = new DenseSet<HMODULE>();
-
- if (!fEnumerateLoadedModules) {
- if (!loadDebugHelp()) {
- assert(false && "These APIs should always be available");
- return DynamicLibrary();
- }
- }
-
- fEnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0);
- // Dummy library that represents "search all handles".
- // This is mostly to ensure that the return value still shows up as "valid".
- return DynamicLibrary(&OpenedHandles);
- }
-
- SmallVector<wchar_t, MAX_PATH> filenameUnicode;
- if (std::error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) {
+ SmallVector<wchar_t, MAX_PATH> FileUnicode;
+ if (std::error_code ec = windows::UTF8ToUTF16(File, FileUnicode)) {
SetLastError(ec.value());
- MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16");
- return DynamicLibrary();
+ MakeErrMsg(Err, std::string(File) + ": Can't convert to UTF-16");
+ return &DynamicLibrary::Invalid;
}
-
- HMODULE a_handle = LoadLibraryW(filenameUnicode.data());
- if (a_handle == 0) {
- MakeErrMsg(errMsg, std::string(filename) + ": Can't open");
- return DynamicLibrary();
+ HMODULE Handle = LoadLibraryW(FileUnicode.data());
+ if (Handle == NULL) {
+ MakeErrMsg(Err, std::string(File) + ": Can't open");
+ return &DynamicLibrary::Invalid;
}
- if (OpenedHandles == 0)
- OpenedHandles = new DenseSet<HMODULE>();
+ return reinterpret_cast<void*>(Handle);
+}
+
+static DynamicLibrary::HandleSet *IsOpenedHandlesInstance(void *Handle) {
+ if (!OpenedHandles.isConstructed())
+ return nullptr;
+ DynamicLibrary::HandleSet &Inst = *OpenedHandles;
+ return Handle == &Inst ? &Inst : nullptr;
+}
+
+void DynamicLibrary::HandleSet::DLClose(void *Handle) {
+ if (HandleSet* HS = IsOpenedHandlesInstance(Handle))
+ HS->Process = nullptr; // Just drop the *Process* handle.
+ else
+ FreeLibrary((HMODULE)Handle);
+}
- // If we've already loaded this library, FreeLibrary() the handle in order to
- // keep the internal refcount at +1.
- if (!OpenedHandles->insert(a_handle).second)
- FreeLibrary(a_handle);
+static bool GetProcessModules(HANDLE H, DWORD &Bytes, HMODULE *Data = nullptr) {
+ // EnumProcessModules will fail on Windows 64 while some versions of
+ // MinGW-32 don't have EnumProcessModulesEx.
+ if (
+#ifdef _WIN64
+ !EnumProcessModulesEx(H, Data, Bytes, &Bytes, LIST_MODULES_64BIT)
+#else
+ !EnumProcessModules(H, Data, Bytes, &Bytes)
+#endif
+ ) {
+ std::string Err;
+ if (MakeErrMsg(&Err, "EnumProcessModules failure"))
+ llvm::errs() << Err << "\n";
+ return false;
+ }
+ return true;
+}
- return DynamicLibrary(a_handle);
+void *DynamicLibrary::HandleSet::DLSym(void *Handle, const char *Symbol) {
+ HandleSet* HS = IsOpenedHandlesInstance(Handle);
+ if (!HS)
+ return (void *)uintptr_t(GetProcAddress((HMODULE)Handle, Symbol));
+
+ // A dlclose may already have dropped the *Process* handle.
+ if (!HS->Process)
+ return nullptr;
+
+ // Trials indicate EnumProcessModulesEx is consistently faster than using
+ // EnumerateLoadedModules64 or CreateToolhelp32Snapshot.
+ //
+ // | Handles | DbgHelp.dll | CreateSnapshot | EnumProcessModulesEx
+ // |=========|=============|================|=====================
+ // | 37 | 0.0000585 * | 0.0003031 | 0.0000152
+ // | 1020 | 0.0026310 * | 0.0121598 | 0.0002683
+ // | 2084 | 0.0149418 * | 0.0369936 | 0.0005610
+ //
+ // * Not including the load time of Dbghelp.dll (~.005 sec)
+ //
+ // There is still a case for somehow caching the result of
+ // EnumProcessModulesEx across invocations, but doing that correctly is
+ // complicated. Possibly LdrRegisterDllNotification could be used to
+ // invalidate the cache?
+
+ DWORD Bytes = 0;
+ HMODULE Self = HMODULE(GetCurrentProcess());
+ if (!GetProcessModules(Self, Bytes))
+ return nullptr;
+
+ // Get the most recent list, in case any modules were added or removed
+ // between the call to EnumProcessModulesEx that sizes the buffer and the
+ // one that copies the HMODULEs. MSDN is pretty clear that if the module
+ // list changes during the call to EnumProcessModulesEx, the results should
+ // not be used.
+ std::vector<HMODULE> Handles;
+ do {
+ assert(Bytes && ((Bytes % sizeof(HMODULE)) == 0) &&
+ "Should have at least one module and be aligned");
+ Handles.resize(Bytes / sizeof(HMODULE));
+ if (!GetProcessModules(Self, Bytes, Handles.data()))
+ return nullptr;
+ } while (Bytes != (Handles.size() * sizeof(HMODULE)));
+
+ // Try EXE first, mirroring what dlsym(dlopen(NULL)) does.
+ if (FARPROC Ptr = GetProcAddress(HMODULE(Handles.front()), Symbol))
+ return (void *) uintptr_t(Ptr);
+
+ if (Handles.size() > 1) {
+ // This is different behaviour from what POSIX dlsym(dlopen(NULL)) does.
+ // Doing that here causes real problems for the JIT, where msvc.dll and
+ // ucrt.dll can define the same symbols. The runtime linker chooses symbols
+ // from ucrt.dll first, so iterating forward rather than in reverse here
+ // would return the msvc.dll versions.
+
+ for (auto I = Handles.rbegin(), E = Handles.rend()-1; I != E; ++I) {
+ if (FARPROC Ptr = GetProcAddress(HMODULE(*I), Symbol))
+ return (void *) uintptr_t(Ptr);
+ }
+ }
+ return nullptr;
}
+
// Stack probing routines are in the support library (e.g. libgcc), but we don't
// have dynamic linking on windows. Provide a hook.
#define EXPLICIT_SYMBOL(SYM) \
@@ -123,38 +173,18 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
#undef INLINE_DEF_SYMBOL1
#undef INLINE_DEF_SYMBOL2
-void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
- SmartScopedLock<true> Lock(*SymbolsMutex);
-
- // First check symbols added via AddSymbol().
- if (ExplicitSymbols.isConstructed()) {
- StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
-
- if (i != ExplicitSymbols->end())
- return i->second;
- }
-
- // Now search the libraries.
- if (OpenedHandles) {
- for (DenseSet<HMODULE>::iterator I = OpenedHandles->begin(),
- E = OpenedHandles->end(); I != E; ++I) {
- FARPROC ptr = GetProcAddress((HMODULE)*I, symbolName);
- if (ptr) {
- return (void *)(intptr_t)ptr;
- }
- }
- }
+static void *DoSearch(const char *SymbolName) {
#define EXPLICIT_SYMBOL(SYM) \
- if (!strcmp(symbolName, #SYM)) \
+ if (!strcmp(SymbolName, #SYM)) \
return (void *)&SYM;
#define EXPLICIT_SYMBOL2(SYMFROM, SYMTO) \
- if (!strcmp(symbolName, #SYMFROM)) \
+ if (!strcmp(SymbolName, #SYMFROM)) \
return (void *)&SYMTO;
#ifdef _M_IX86
#define INLINE_DEF_SYMBOL1(TYP, SYM) \
- if (!strcmp(symbolName, #SYM)) \
+ if (!strcmp(SymbolName, #SYM)) \
return (void *)&inline_##SYM;
#define INLINE_DEF_SYMBOL2(TYP, SYM) INLINE_DEF_SYMBOL1(TYP, SYM)
#endif
@@ -168,15 +198,5 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
#undef INLINE_DEF_SYMBOL1
#undef INLINE_DEF_SYMBOL2
- return 0;
-}
-
-void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
- if (!isValid())
- return NULL;
- if (Data == &OpenedHandles)
- return SearchForAddressOfSymbol(symbolName);
- return (void *)(intptr_t)GetProcAddress((HMODULE)Data, symbolName);
-}
-
+ return nullptr;
}
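A usage sketch of the public API this backend serves; passing nullptr opens the *Process* handle, whose lookups walk the module list via EnumProcessModulesEx as above:

    #include "llvm/Support/DynamicLibrary.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    void *findInProcess(const char *Symbol) {
      std::string Err;
      llvm::sys::DynamicLibrary DL =
          llvm::sys::DynamicLibrary::getPermanentLibrary(nullptr, &Err);
      if (!DL.isValid()) {
        llvm::errs() << Err << "\n";
        return nullptr;
      }
      // On Windows this tries the EXE first, then loaded DLLs in reverse.
      return DL.getAddressOfSymbol(Symbol);
    }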
diff --git a/contrib/llvm/lib/Support/Windows/Host.inc b/contrib/llvm/lib/Support/Windows/Host.inc
index fe89fe0..90a6fb3 100644
--- a/contrib/llvm/lib/Support/Windows/Host.inc
+++ b/contrib/llvm/lib/Support/Windows/Host.inc
@@ -17,6 +17,18 @@
using namespace llvm;
+static std::string updateTripleOSVersion(std::string Triple) {
+ return Triple;
+}
+
std::string sys::getDefaultTargetTriple() {
- return Triple::normalize(LLVM_DEFAULT_TARGET_TRIPLE);
+ const char *Triple = LLVM_DEFAULT_TARGET_TRIPLE;
+
+ // Override the default target with an environment variable named by LLVM_TARGET_TRIPLE_ENV.
+#if defined(LLVM_TARGET_TRIPLE_ENV)
+ if (const char *EnvTriple = std::getenv(LLVM_TARGET_TRIPLE_ENV))
+ Triple = EnvTriple;
+#endif
+
+ return Triple::normalize(Triple);
}
diff --git a/contrib/llvm/lib/Support/Windows/Mutex.inc b/contrib/llvm/lib/Support/Windows/Mutex.inc
index ab79d07..0af145e 100644
--- a/contrib/llvm/lib/Support/Windows/Mutex.inc
+++ b/contrib/llvm/lib/Support/Windows/Mutex.inc
@@ -20,15 +20,14 @@
#include "llvm/Support/Mutex.h"
namespace llvm {
-using namespace sys;
-MutexImpl::MutexImpl(bool /*recursive*/)
+sys::MutexImpl::MutexImpl(bool /*recursive*/)
{
data_ = new CRITICAL_SECTION;
InitializeCriticalSection((LPCRITICAL_SECTION)data_);
}
-MutexImpl::~MutexImpl()
+sys::MutexImpl::~MutexImpl()
{
DeleteCriticalSection((LPCRITICAL_SECTION)data_);
delete (LPCRITICAL_SECTION)data_;
@@ -36,21 +35,21 @@ MutexImpl::~MutexImpl()
}
bool
-MutexImpl::acquire()
+sys::MutexImpl::acquire()
{
EnterCriticalSection((LPCRITICAL_SECTION)data_);
return true;
}
bool
-MutexImpl::release()
+sys::MutexImpl::release()
{
LeaveCriticalSection((LPCRITICAL_SECTION)data_);
return true;
}
bool
-MutexImpl::tryacquire()
+sys::MutexImpl::tryacquire()
{
return TryEnterCriticalSection((LPCRITICAL_SECTION)data_);
}
diff --git a/contrib/llvm/lib/Support/Windows/Path.inc b/contrib/llvm/lib/Support/Windows/Path.inc
index 27b250b..b00d390 100644
--- a/contrib/llvm/lib/Support/Windows/Path.inc
+++ b/contrib/llvm/lib/Support/Windows/Path.inc
@@ -26,6 +26,7 @@
// These two headers must be included last, and make sure shlobj is required
// after Windows.h to make sure it picks up our definition of _WIN32_WINNT
#include "WindowsSupport.h"
+#include <shellapi.h>
#include <shlobj.h>
#undef max
@@ -178,6 +179,10 @@ TimePoint<> file_status::getLastModificationTime() const {
return toTimePoint(Time);
}
+uint32_t file_status::getLinkCount() const {
+ return NumLinks;
+}
+
std::error_code current_path(SmallVectorImpl<char> &result) {
SmallVector<wchar_t, MAX_PATH> cur_path;
DWORD len = MAX_PATH;
@@ -200,6 +205,18 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result);
}
+std::error_code set_current_path(const Twine &path) {
+ // Convert to utf-16.
+ SmallVector<wchar_t, 128> wide_path;
+ if (std::error_code ec = widenPath(path, wide_path))
+ return ec;
+
+ if (!::SetCurrentDirectoryW(wide_path.begin()))
+ return mapWindowsError(::GetLastError());
+
+ return std::error_code();
+}
+
std::error_code create_directory(const Twine &path, bool IgnoreExisting,
perms Perms) {
SmallVector<wchar_t, 128> path_utf16;
@@ -265,6 +282,80 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
return std::error_code();
}
+static std::error_code is_local_internal(SmallVectorImpl<wchar_t> &Path,
+ bool &Result) {
+ SmallVector<wchar_t, 128> VolumePath;
+ size_t Len = 128;
+ while (true) {
+ VolumePath.resize(Len);
+ BOOL Success =
+ ::GetVolumePathNameW(Path.data(), VolumePath.data(), VolumePath.size());
+
+ if (Success)
+ break;
+
+ DWORD Err = ::GetLastError();
+ if (Err != ERROR_INSUFFICIENT_BUFFER)
+ return mapWindowsError(Err);
+
+ Len *= 2;
+ }
+ // If the output buffer has exactly enough space for the path name, but not
+ // the null terminator, it will leave the output unterminated. Push a null
+ // terminator onto the end to ensure that this never happens.
+ VolumePath.push_back(L'\0');
+ VolumePath.set_size(wcslen(VolumePath.data()));
+ const wchar_t *P = VolumePath.data();
+
+ UINT Type = ::GetDriveTypeW(P);
+ switch (Type) {
+ case DRIVE_FIXED:
+ Result = true;
+ return std::error_code();
+ case DRIVE_REMOTE:
+ case DRIVE_CDROM:
+ case DRIVE_RAMDISK:
+ case DRIVE_REMOVABLE:
+ Result = false;
+ return std::error_code();
+ default:
+ return make_error_code(errc::no_such_file_or_directory);
+ }
+ llvm_unreachable("Unreachable!");
+}
+
+std::error_code is_local(const Twine &path, bool &result) {
+ if (!llvm::sys::fs::exists(path) || !llvm::sys::path::has_root_path(path))
+ return make_error_code(errc::no_such_file_or_directory);
+
+ SmallString<128> Storage;
+ StringRef P = path.toStringRef(Storage);
+
+ // Convert to utf-16.
+ SmallVector<wchar_t, 128> WidePath;
+ if (std::error_code ec = widenPath(P, WidePath))
+ return ec;
+ return is_local_internal(WidePath, result);
+}
+
+std::error_code is_local(int FD, bool &Result) {
+ SmallVector<wchar_t, 128> FinalPath;
+ HANDLE Handle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+
+ size_t Len = 128;
+ do {
+ FinalPath.reserve(Len);
+ Len = ::GetFinalPathNameByHandleW(Handle, FinalPath.data(),
+ FinalPath.capacity() - 1, VOLUME_NAME_NT);
+ if (Len == 0)
+ return mapWindowsError(::GetLastError());
+ } while (Len > FinalPath.capacity());
+
+ FinalPath.set_size(Len);
+
+ return is_local_internal(FinalPath, Result);
+}
+
std::error_code rename(const Twine &from, const Twine &to) {
// Convert to utf-16.
SmallVector<wchar_t, 128> wide_from;
@@ -443,13 +534,16 @@ static std::error_code getStatus(HANDLE FileHandle, file_status &Result) {
file_type Type = (Info.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
? file_type::directory_file
: file_type::regular_file;
- Result =
- file_status(Type, Info.ftLastAccessTime.dwHighDateTime,
- Info.ftLastAccessTime.dwLowDateTime,
- Info.ftLastWriteTime.dwHighDateTime,
- Info.ftLastWriteTime.dwLowDateTime,
- Info.dwVolumeSerialNumber, Info.nFileSizeHigh,
- Info.nFileSizeLow, Info.nFileIndexHigh, Info.nFileIndexLow);
+ perms Permissions = (Info.dwFileAttributes & FILE_ATTRIBUTE_READONLY)
+ ? (all_read | all_exe)
+ : all_all;
+ Result = file_status(
+ Type, Permissions, Info.nNumberOfLinks,
+ Info.ftLastAccessTime.dwHighDateTime,
+ Info.ftLastAccessTime.dwLowDateTime,
+ Info.ftLastWriteTime.dwHighDateTime, Info.ftLastWriteTime.dwLowDateTime,
+ Info.dwVolumeSerialNumber, Info.nFileSizeHigh, Info.nFileSizeLow,
+ Info.nFileIndexHigh, Info.nFileIndexLow);
return std::error_code();
}
@@ -465,7 +559,7 @@ handle_status_error:
return mapWindowsError(LastError);
}
-std::error_code status(const Twine &path, file_status &result) {
+std::error_code status(const Twine &path, file_status &result, bool Follow) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
@@ -482,28 +576,19 @@ std::error_code status(const Twine &path, file_status &result) {
if (attr == INVALID_FILE_ATTRIBUTES)
return getStatus(INVALID_HANDLE_VALUE, result);
+ DWORD Flags = FILE_FLAG_BACKUP_SEMANTICS;
// Handle reparse points.
- if (attr & FILE_ATTRIBUTE_REPARSE_POINT) {
- ScopedFileHandle h(
- ::CreateFileW(path_utf16.begin(),
- 0, // Attributes only.
- FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
- NULL,
- OPEN_EXISTING,
- FILE_FLAG_BACKUP_SEMANTICS,
- 0));
- if (!h)
- return getStatus(INVALID_HANDLE_VALUE, result);
- }
+ if (!Follow && (attr & FILE_ATTRIBUTE_REPARSE_POINT))
+ Flags |= FILE_FLAG_OPEN_REPARSE_POINT;
ScopedFileHandle h(
::CreateFileW(path_utf16.begin(), 0, // Attributes only.
FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
- NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, 0));
- if (!h)
- return getStatus(INVALID_HANDLE_VALUE, result);
+ NULL, OPEN_EXISTING, Flags, 0));
+ if (!h)
+ return getStatus(INVALID_HANDLE_VALUE, result);
- return getStatus(h, result);
+ return getStatus(h, result);
}
std::error_code status(int FD, file_status &Result) {
@@ -511,6 +596,37 @@ std::error_code status(int FD, file_status &Result) {
return getStatus(FileHandle, Result);
}
+std::error_code setPermissions(const Twine &Path, perms Permissions) {
+ SmallVector<wchar_t, 128> PathUTF16;
+ if (std::error_code EC = widenPath(Path, PathUTF16))
+ return EC;
+
+ DWORD Attributes = ::GetFileAttributesW(PathUTF16.begin());
+ if (Attributes == INVALID_FILE_ATTRIBUTES)
+ return mapWindowsError(GetLastError());
+
+ // There are many Windows file attributes that have nothing to do with the
+ // file permissions (e.g. FILE_ATTRIBUTE_HIDDEN). We need to be careful to
+ // preserve them.
+ if (Permissions & all_write) {
+ Attributes &= ~FILE_ATTRIBUTE_READONLY;
+ if (Attributes == 0)
+ // FILE_ATTRIBUTE_NORMAL indicates no other attributes are set.
+ Attributes |= FILE_ATTRIBUTE_NORMAL;
+ }
+ else {
+ Attributes |= FILE_ATTRIBUTE_READONLY;
+ // FILE_ATTRIBUTE_NORMAL is not compatible with any other attributes, so
+ // remove it, if it is present.
+ Attributes &= ~FILE_ATTRIBUTE_NORMAL;
+ }
+
+ if (!::SetFileAttributesW(PathUTF16.begin(), Attributes))
+ return mapWindowsError(GetLastError());
+
+ return std::error_code();
+}
+
std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time) {
FILETIME FT = toFILETIME(Time);
HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
@@ -616,7 +732,8 @@ int mapped_file_region::alignment() {
}
std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
- StringRef path){
+ StringRef path,
+ bool follow_symlinks) {
SmallVector<wchar_t, 128> path_utf16;
if (std::error_code ec = widenPath(path, path_utf16))
@@ -661,7 +778,7 @@ std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
it.IterationHandle = intptr_t(FindHandle.take());
SmallString<128> directory_entry_path(path);
path::append(directory_entry_path, directory_entry_name_utf8);
- it.CurrentEntry = directory_entry(directory_entry_path);
+ it.CurrentEntry = directory_entry(directory_entry_path, follow_symlinks);
return std::error_code();
}
@@ -701,6 +818,52 @@ std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
return std::error_code();
}
+static std::error_code realPathFromHandle(HANDLE H,
+ SmallVectorImpl<char> &RealPath) {
+ RealPath.clear();
+ llvm::SmallVector<wchar_t, MAX_PATH> Buffer;
+ DWORD CountChars = ::GetFinalPathNameByHandleW(
+ H, Buffer.begin(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED);
+ if (CountChars > Buffer.capacity()) {
+ // The buffer wasn't big enough, try again. In this case the returned count
+ // *does* include the null terminator.
+ Buffer.reserve(CountChars);
+ CountChars = ::GetFinalPathNameByHandleW(
+ H, Buffer.data(), Buffer.capacity() - 1, FILE_NAME_NORMALIZED);
+ }
+ if (CountChars == 0)
+ return mapWindowsError(GetLastError());
+
+ const wchar_t *Data = Buffer.data();
+ if (CountChars >= 4) {
+ if (0 == ::memcmp(Data, L"\\\\?\\", 8)) {
+ CountChars -= 4;
+ Data += 4;
+ }
+ }
+
+ // Convert the result from UTF-16 to UTF-8.
+ return UTF16ToUTF8(Data, CountChars, RealPath);
+}
+
+static std::error_code directoryRealPath(const Twine &Name,
+ SmallVectorImpl<char> &RealPath) {
+ SmallVector<wchar_t, 128> PathUTF16;
+
+ if (std::error_code EC = widenPath(Name, PathUTF16))
+ return EC;
+
+ HANDLE H =
+ ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
+ FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+ NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+ if (H == INVALID_HANDLE_VALUE)
+ return mapWindowsError(GetLastError());
+ std::error_code EC = realPathFromHandle(H, RealPath);
+ ::CloseHandle(H);
+ return EC;
+}
+
std::error_code openFileForRead(const Twine &Name, int &ResultFD,
SmallVectorImpl<char> *RealPath) {
SmallVector<wchar_t, 128> PathUTF16;
@@ -732,20 +895,8 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
}
// Fetch the real name of the file, if the user asked
- if (RealPath) {
- RealPath->clear();
- wchar_t RealPathUTF16[MAX_PATH];
- DWORD CountChars =
- ::GetFinalPathNameByHandleW(H, RealPathUTF16, MAX_PATH,
- FILE_NAME_NORMALIZED);
- if (CountChars > 0 && CountChars < MAX_PATH) {
- // Convert the result from UTF-16 to UTF-8.
- SmallString<MAX_PATH> RealPathUTF8;
- if (!UTF16ToUTF8(RealPathUTF16, CountChars, RealPathUTF8))
- RealPath->append(RealPathUTF8.data(),
- RealPathUTF8.data() + strlen(RealPathUTF8.data()));
- }
- }
+ if (RealPath)
+ realPathFromHandle(H, *RealPath);
ResultFD = FD;
return std::error_code();
@@ -843,6 +994,81 @@ std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
return windows::UTF16ToUTF8(TempPath.data(), CharCount, ResultPath);
}
+
+std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
+ // Convert to utf-16.
+ SmallVector<wchar_t, 128> Path16;
+ std::error_code EC = widenPath(path, Path16);
+ if (EC && !IgnoreErrors)
+ return EC;
+
+ // SHFileOperation() accepts a list of paths, and so must be double null-
+ // terminated to indicate the end of the list. The buffer is already null
+ // terminated, but since that null character is not considered part of the
+ // vector's size, pushing another one will just consume that byte. So we
+ // need to push 2 null terminators.
+ Path16.push_back(0);
+ Path16.push_back(0);
+
+ SHFILEOPSTRUCTW shfos = {};
+ shfos.wFunc = FO_DELETE;
+ shfos.pFrom = Path16.data();
+ shfos.fFlags = FOF_NO_UI;
+
+ int result = ::SHFileOperationW(&shfos);
+ if (result != 0 && !IgnoreErrors)
+ return mapWindowsError(result);
+ return std::error_code();
+}
+
+static void expandTildeExpr(SmallVectorImpl<char> &Path) {
+ // Path does not begin with a tilde expression.
+ if (Path.empty() || Path[0] != '~')
+ return;
+
+ StringRef PathStr(Path.begin(), Path.size());
+ PathStr = PathStr.drop_front();
+ StringRef Expr = PathStr.take_until([](char c) { return path::is_separator(c); });
+
+ if (!Expr.empty()) {
+ // This is probably a ~username/ expression, which is not supported on Windows.
+ return;
+ }
+
+ SmallString<128> HomeDir;
+ if (!path::home_directory(HomeDir)) {
+ // For some reason we couldn't get the home directory. Just exit.
+ return;
+ }
+
+ // Overwrite the first character and insert the rest.
+ Path[0] = HomeDir[0];
+ Path.insert(Path.begin() + 1, HomeDir.begin() + 1, HomeDir.end());
+}
+
+std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
+ bool expand_tilde) {
+ dest.clear();
+ if (path.isTriviallyEmpty())
+ return std::error_code();
+
+ if (expand_tilde) {
+ SmallString<128> Storage;
+ path.toVector(Storage);
+ expandTildeExpr(Storage);
+ return real_path(Storage, dest, false);
+ }
+
+ if (is_directory(path))
+ return directoryRealPath(path, dest);
+
+ int fd;
+ if (std::error_code EC = llvm::sys::fs::openFileForRead(path, fd, &dest))
+ return EC;
+ ::close(fd);
+ return std::error_code();
+}
+
} // end namespace fs
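For reference, a sketch of the double-null-terminated path list SHFileOperationW expects; the helper and path handling are illustrative:

    #include <string>
    #include <windows.h>
    #include <shellapi.h>

    // pFrom is a list of paths, so it must end with TWO L'\0' characters;
    // std::wstring::c_str() supplies the second one after the push_back.
    bool deleteTree(const wchar_t *Path) {
      std::wstring From(Path);
      From.push_back(L'\0');
      SHFILEOPSTRUCTW Op = {};
      Op.wFunc = FO_DELETE;
      Op.pFrom = From.c_str();
      Op.fFlags = FOF_NO_UI;
      return ::SHFileOperationW(&Op) == 0;
    }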
namespace path {
diff --git a/contrib/llvm/lib/Support/Windows/Process.inc b/contrib/llvm/lib/Support/Windows/Process.inc
index 8d646b3..18aef61 100644
--- a/contrib/llvm/lib/Support/Windows/Process.inc
+++ b/contrib/llvm/lib/Support/Windows/Process.inc
@@ -47,7 +47,6 @@
#endif
using namespace llvm;
-using namespace sys;
// This function retrieves the page size using GetNativeSystemInfo() and is
// present solely so it can be called once to initialize the self_process member
diff --git a/contrib/llvm/lib/Support/Windows/Program.inc b/contrib/llvm/lib/Support/Windows/Program.inc
index 78fc538..721167d 100644
--- a/contrib/llvm/lib/Support/Windows/Program.inc
+++ b/contrib/llvm/lib/Support/Windows/Program.inc
@@ -29,7 +29,6 @@
//===----------------------------------------------------------------------===//
namespace llvm {
-using namespace sys;
ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
diff --git a/contrib/llvm/lib/Support/Windows/RWMutex.inc b/contrib/llvm/lib/Support/Windows/RWMutex.inc
index 2d1d25f..ac60c2f 100644
--- a/contrib/llvm/lib/Support/Windows/RWMutex.inc
+++ b/contrib/llvm/lib/Support/Windows/RWMutex.inc
@@ -19,7 +19,6 @@
#include "WindowsSupport.h"
namespace llvm {
-using namespace sys;
// Windows has slim read-writer lock support on Vista and higher, so we
// will attempt to load the APIs. If they exist, we will use them, and
@@ -73,7 +72,7 @@ static bool loadSRW() {
return sHasSRW;
}
-RWMutexImpl::RWMutexImpl() {
+sys::RWMutexImpl::RWMutexImpl() {
if (loadSRW()) {
data_ = calloc(1, sizeof(SRWLOCK));
fpInitializeSRWLock(static_cast<PSRWLOCK>(data_));
@@ -83,14 +82,14 @@ RWMutexImpl::RWMutexImpl() {
}
}
-RWMutexImpl::~RWMutexImpl() {
+sys::RWMutexImpl::~RWMutexImpl() {
if (!sHasSRW)
DeleteCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
// Nothing to do in the case of slim reader/writers except free the memory.
free(data_);
}
-bool RWMutexImpl::reader_acquire() {
+bool sys::RWMutexImpl::reader_acquire() {
if (sHasSRW) {
fpAcquireSRWLockShared(static_cast<PSRWLOCK>(data_));
} else {
@@ -99,7 +98,7 @@ bool RWMutexImpl::reader_acquire() {
return true;
}
-bool RWMutexImpl::reader_release() {
+bool sys::RWMutexImpl::reader_release() {
if (sHasSRW) {
fpReleaseSRWLockShared(static_cast<PSRWLOCK>(data_));
} else {
@@ -108,7 +107,7 @@ bool RWMutexImpl::reader_release() {
return true;
}
-bool RWMutexImpl::writer_acquire() {
+bool sys::RWMutexImpl::writer_acquire() {
if (sHasSRW) {
fpAcquireSRWLockExclusive(static_cast<PSRWLOCK>(data_));
} else {
@@ -117,7 +116,7 @@ bool RWMutexImpl::writer_acquire() {
return true;
}
-bool RWMutexImpl::writer_release() {
+bool sys::RWMutexImpl::writer_release() {
if (sHasSRW) {
fpReleaseSRWLockExclusive(static_cast<PSRWLOCK>(data_));
} else {
diff --git a/contrib/llvm/lib/Support/Windows/Signals.inc b/contrib/llvm/lib/Support/Windows/Signals.inc
index f739421..1ef5188 100644
--- a/contrib/llvm/lib/Support/Windows/Signals.inc
+++ b/contrib/llvm/lib/Support/Windows/Signals.inc
@@ -776,7 +776,7 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
// the nasty sorts of crashes that aren't 100% reproducible from a set of
// inputs (or in the event that the user is unable or unwilling to provide a
// reproducible case).
- if (!llvm::Process::AreCoreFilesPrevented()) {
+ if (!llvm::sys::Process::AreCoreFilesPrevented()) {
MINIDUMP_EXCEPTION_INFORMATION ExceptionInfo;
ExceptionInfo.ThreadId = ::GetCurrentThreadId();
ExceptionInfo.ExceptionPointers = ep;
diff --git a/contrib/llvm/lib/Support/Windows/ThreadLocal.inc b/contrib/llvm/lib/Support/Windows/ThreadLocal.inc
index b9cb8ff..8be1c3e 100644
--- a/contrib/llvm/lib/Support/Windows/ThreadLocal.inc
+++ b/contrib/llvm/lib/Support/Windows/ThreadLocal.inc
@@ -20,33 +20,32 @@
#include "llvm/Support/ThreadLocal.h"
namespace llvm {
-using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() : data() {
+sys::ThreadLocalImpl::ThreadLocalImpl() : data() {
static_assert(sizeof(DWORD) <= sizeof(data), "size too big");
DWORD* tls = reinterpret_cast<DWORD*>(&data);
*tls = TlsAlloc();
assert(*tls != TLS_OUT_OF_INDEXES);
}
-ThreadLocalImpl::~ThreadLocalImpl() {
+sys::ThreadLocalImpl::~ThreadLocalImpl() {
DWORD* tls = reinterpret_cast<DWORD*>(&data);
TlsFree(*tls);
}
-void *ThreadLocalImpl::getInstance() {
+void *sys::ThreadLocalImpl::getInstance() {
DWORD* tls = reinterpret_cast<DWORD*>(&data);
return TlsGetValue(*tls);
}
-void ThreadLocalImpl::setInstance(const void* d){
+void sys::ThreadLocalImpl::setInstance(const void* d){
DWORD* tls = reinterpret_cast<DWORD*>(&data);
int errorcode = TlsSetValue(*tls, const_cast<void*>(d));
assert(errorcode != 0);
(void)errorcode;
}
-void ThreadLocalImpl::removeInstance() {
+void sys::ThreadLocalImpl::removeInstance() {
setInstance(0);
}
diff --git a/contrib/llvm/lib/Support/Windows/Threading.inc b/contrib/llvm/lib/Support/Windows/Threading.inc
new file mode 100644
index 0000000..decb488
--- /dev/null
+++ b/contrib/llvm/lib/Support/Windows/Threading.inc
@@ -0,0 +1,109 @@
+//===- Windows/Threading.inc - Win32 Threading Implementation - -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of Threading functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+
+#include "Windows/WindowsSupport.h"
+#include <process.h>
+
+// Windows will at times define MemoryFence.
+#ifdef MemoryFence
+#undef MemoryFence
+#endif
+
+namespace {
+ struct ThreadInfo {
+ void(*func)(void*);
+ void *param;
+ };
+}
+
+static unsigned __stdcall ThreadCallback(void *param) {
+ struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
+ info->func(info->param);
+
+ return 0;
+}
+
+void llvm::llvm_execute_on_thread(void(*Fn)(void*), void *UserData,
+ unsigned RequestedStackSize) {
+ struct ThreadInfo param = { Fn, UserData };
+
+ HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
+ RequestedStackSize, ThreadCallback,
+ &param, 0, NULL);
+
+ if (hThread) {
+ // We actually don't care whether the wait succeeds or fails, in
+ // the same way we don't care whether the pthread_join call succeeds
+ // or fails. There's not much we could do if this were to fail. But
+ // on success, this call will wait until the thread finishes executing
+ // before returning.
+ (void)::WaitForSingleObject(hThread, INFINITE);
+ ::CloseHandle(hThread);
+ }
+}
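
A small caller-side sketch (an illustration, not from the patch; computeOnBigStack is an invented name). Because of the WaitForSingleObject above, the call does not return until the worker finishes:

  static void computeOnBigStack(void *UserData) {
    *static_cast<int *>(UserData) = 42; // Stand-in for stack-hungry work.
  }

  int Result = 0;
  llvm::llvm_execute_on_thread(computeOnBigStack, &Result,
                               /*RequestedStackSize=*/8 * 1024 * 1024);
  // Result == 42 here; the thread has already been joined and closed.
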
+
+uint64_t llvm::get_threadid() {
+ return uint64_t(::GetCurrentThreadId());
+}
+
+uint32_t llvm::get_max_thread_name_length() { return 0; }
+
+#if defined(_MSC_VER)
+static void SetThreadName(DWORD Id, LPCSTR Name) {
+ constexpr DWORD MS_VC_EXCEPTION = 0x406D1388;
+
+#pragma pack(push, 8)
+ struct THREADNAME_INFO {
+ DWORD dwType; // Must be 0x1000.
+ LPCSTR szName; // Pointer to thread name
+ DWORD dwThreadId; // Thread ID (-1 == current thread)
+ DWORD dwFlags; // Reserved. Do not use.
+ };
+#pragma pack(pop)
+
+ THREADNAME_INFO info;
+ info.dwType = 0x1000;
+ info.szName = Name;
+ info.dwThreadId = Id;
+ info.dwFlags = 0;
+
+ __try {
+ ::RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR),
+ (ULONG_PTR *)&info);
+ }
+ __except (EXCEPTION_EXECUTE_HANDLER) {
+ }
+}
+#endif
+
+void llvm::set_thread_name(const Twine &Name) {
+#if defined(_MSC_VER)
+ // Make sure the input is null terminated.
+ SmallString<64> Storage;
+ StringRef NameStr = Name.toNullTerminatedStringRef(Storage);
+ SetThreadName(::GetCurrentThreadId(), NameStr.data());
+#endif
+}
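
Caller-side, this is simply (the label is arbitrary):

  llvm::set_thread_name("llvm-yaml-worker");

Under a debugger the thread list then shows the label; with no debugger attached, the RaiseException round trip is swallowed by the __except handler above and the call is effectively a no-op.
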
+
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
+ // "Name" is not an inherent property of a thread on Windows. In fact, when
+ // you "set" the name, you are only firing a one-time message to a debugger
+  // which it interprets as a program setting its thread's name. We may be
+ // able to get fancy by creating a TLS entry when someone calls
+ // set_thread_name so that subsequent calls to get_thread_name return this
+ // value.
+ Name.clear();
+}
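
The TLS idea floated in the comment could look roughly like this (purely an assumption, not what the patch implements; cachingSetThreadName and cachingGetThreadName are invented names):

  #include "llvm/ADT/SmallString.h"
  #include "llvm/Support/Compiler.h"
  #include <algorithm>
  #include <cstring>

  static LLVM_THREAD_LOCAL char CachedName[64];

  static void cachingSetThreadName(const llvm::Twine &Name) {
    llvm::SmallString<64> Storage;
    llvm::StringRef S = Name.toStringRef(Storage);
    size_t N = std::min(S.size(), sizeof(CachedName) - 1);
    std::memcpy(CachedName, S.data(), N);
    CachedName[N] = '\0';
    llvm::set_thread_name(Name); // Still fire the debugger notification.
  }

  static void cachingGetThreadName(llvm::SmallVectorImpl<char> &Out) {
    Out.clear();
    Out.append(CachedName, CachedName + std::strlen(CachedName));
  }
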
diff --git a/contrib/llvm/lib/Support/Windows/WindowsSupport.h b/contrib/llvm/lib/Support/Windows/WindowsSupport.h
index c358b99..d4599dc 100644
--- a/contrib/llvm/lib/Support/Windows/WindowsSupport.h
+++ b/contrib/llvm/lib/Support/Windows/WindowsSupport.h
@@ -45,7 +45,9 @@
#include <string>
#include <system_error>
#include <windows.h>
-#include <wincrypt.h> // Must be included after windows.h
+
+// Must be included after windows.h
+#include <wincrypt.h>
/// Determines if the program is running on Windows 8 or newer. This
/// reimplements one of the helpers in the Windows 8.1 SDK, which are intended
diff --git a/contrib/llvm/lib/Support/YAMLParser.cpp b/contrib/llvm/lib/Support/YAMLParser.cpp
index c17a6f6..e2f21a5 100644
--- a/contrib/llvm/lib/Support/YAMLParser.cpp
+++ b/contrib/llvm/lib/Support/YAMLParser.cpp
@@ -1,4 +1,4 @@
-//===--- YAMLParser.cpp - Simple YAML parser ------------------------------===//
+//===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,16 +12,30 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/YAMLParser.h"
+#include "llvm/ADT/AllocatorList.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/AllocatorList.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
using namespace llvm;
using namespace yaml;
@@ -37,7 +51,7 @@ enum UnicodeEncodingForm {
/// EncodingInfo - Holds the encoding type and length of the byte order mark if
/// it exists. Length is in {0, 2, 3, 4}.
-typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
+using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
/// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
/// encoding form of \a Input.
@@ -46,7 +60,7 @@ typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo;
/// @returns An EncodingInfo indicating the Unicode encoding form of the input
/// and how long the byte order mark is if one exists.
static EncodingInfo getUnicodeEncoding(StringRef Input) {
- if (Input.size() == 0)
+ if (Input.empty())
return std::make_pair(UEF_Unknown, 0);
switch (uint8_t(Input[0])) {
@@ -95,8 +109,6 @@ static EncodingInfo getUnicodeEncoding(StringRef Input) {
return std::make_pair(UEF_UTF8, 0);
}
-namespace llvm {
-namespace yaml {
/// Pin the vtables to this file.
void Node::anchor() {}
void NullNode::anchor() {}
@@ -107,6 +119,9 @@ void MappingNode::anchor() {}
void SequenceNode::anchor() {}
void AliasNode::anchor() {}
+namespace llvm {
+namespace yaml {
+
/// Token - A single YAML token.
struct Token {
enum TokenKind {
@@ -133,7 +148,7 @@ struct Token {
TK_Alias,
TK_Anchor,
TK_Tag
- } Kind;
+ } Kind = TK_Error;
/// A string of length 0 or more whose begin() points to the logical location
/// of the token in the input.
@@ -142,14 +157,16 @@ struct Token {
/// The value of a block scalar node.
std::string Value;
- Token() : Kind(TK_Error) {}
+ Token() = default;
};
-}
-}
-typedef llvm::BumpPtrList<Token> TokenQueueT;
+} // end namespace yaml
+} // end namespace llvm
+
+using TokenQueueT = BumpPtrList<Token>;
namespace {
+
/// @brief This struct is used to track simple keys.
///
/// Simple keys are handled by creating an entry in SimpleKeys for each Token
@@ -170,12 +187,13 @@ struct SimpleKey {
return Tok == Other.Tok;
}
};
-}
+
+} // end anonymous namespace
/// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
/// subsequence and the subsequence's length in code units (uint8_t).
/// A length of 0 represents an error.
-typedef std::pair<uint32_t, unsigned> UTF8Decoded;
+using UTF8Decoded = std::pair<uint32_t, unsigned>;
static UTF8Decoded decodeUTF8(StringRef Range) {
StringRef::iterator Position= Range.begin();
@@ -229,6 +247,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
namespace llvm {
namespace yaml {
+
/// @brief Scans YAML tokens from a MemoryBuffer.
class Scanner {
public:
@@ -350,7 +369,8 @@ private:
/// ns-char.
StringRef::iterator skip_ns_char(StringRef::iterator Position);
- typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator);
+ using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
+
/// @brief Skip minimal well-formed code unit subsequences until Func
/// returns its input.
///
@@ -655,10 +675,10 @@ bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
}
bool yaml::scanTokens(StringRef Input) {
- llvm::SourceMgr SM;
- llvm::yaml::Scanner scanner(Input, SM);
- for (;;) {
- llvm::yaml::Token T = scanner.getNext();
+ SourceMgr SM;
+ Scanner scanner(Input, SM);
+ while (true) {
+ Token T = scanner.getNext();
if (T.Kind == Token::TK_StreamEnd)
break;
else if (T.Kind == Token::TK_Error)
@@ -1744,7 +1764,7 @@ Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
std::error_code *EC)
: scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {}
-Stream::~Stream() {}
+Stream::~Stream() = default;
bool Stream::failed() { return scanner->failed(); }
@@ -1851,8 +1871,6 @@ bool Node::failed() const {
return Doc->failed();
}
-
-
StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
// TODO: Handle newlines properly. We need to remove leading whitespace.
if (Value[0] == '"') { // Double quoted.
@@ -2116,6 +2134,7 @@ void MappingNode::increment() {
break;
default:
setError("Unexpected token. Expected Key or Block End", T);
+ LLVM_FALLTHROUGH;
case Token::TK_Error:
IsAtEnd = true;
CurrentEntry = nullptr;
@@ -2128,6 +2147,7 @@ void MappingNode::increment() {
return increment();
case Token::TK_FlowMappingEnd:
getNext();
+ LLVM_FALLTHROUGH;
case Token::TK_Error:
// Set this to end iterator.
IsAtEnd = true;
@@ -2170,6 +2190,7 @@ void SequenceNode::increment() {
default:
setError( "Unexpected token. Expected Block Entry or Block End."
, T);
+ LLVM_FALLTHROUGH;
case Token::TK_Error:
IsAtEnd = true;
CurrentEntry = nullptr;
@@ -2198,6 +2219,7 @@ void SequenceNode::increment() {
return increment();
case Token::TK_FlowSequenceEnd:
getNext();
+ LLVM_FALLTHROUGH;
case Token::TK_Error:
// Set this to end iterator.
IsAtEnd = true;
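
These hunks only annotate fallthroughs that were already intentional. A self-contained sketch of the pattern (illustrative; newlineWidth is an invented helper):

  #include "llvm/Support/Compiler.h"

  static int newlineWidth(char C, char Next) {
    switch (C) {
    case '\r':
      if (Next == '\n')
        return 2;       // CRLF pair.
      LLVM_FALLTHROUGH; // A bare CR counts as a newline too.
    case '\n':
      return 1;
    default:
      return 0;
    }
  }

LLVM_FALLTHROUGH expands to [[fallthrough]] (or a compiler-specific equivalent) when available, so -Wimplicit-fallthrough stays quiet on exactly these deliberate drop-throughs.
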
diff --git a/contrib/llvm/lib/Support/YAMLTraits.cpp b/contrib/llvm/lib/Support/YAMLTraits.cpp
index 9849b3a..65eda24 100644
--- a/contrib/llvm/lib/Support/YAMLTraits.cpp
+++ b/contrib/llvm/lib/Support/YAMLTraits.cpp
@@ -8,17 +8,27 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/YAMLTraits.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
-#include <cctype>
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
#include <cstring>
+#include <string>
+#include <vector>
+
using namespace llvm;
using namespace yaml;
@@ -26,11 +36,9 @@ using namespace yaml;
// IO
//===----------------------------------------------------------------------===//
-IO::IO(void *Context) : Ctxt(Context) {
-}
+IO::IO(void *Context) : Ctxt(Context) {}
-IO::~IO() {
-}
+IO::~IO() = default;
void *IO::getContext() {
return Ctxt;
@@ -46,16 +54,22 @@ void IO::setContext(void *Context) {
Input::Input(StringRef InputContent, void *Ctxt,
SourceMgr::DiagHandlerTy DiagHandler, void *DiagHandlerCtxt)
- : IO(Ctxt), Strm(new Stream(InputContent, SrcMgr, false, &EC)),
- CurrentNode(nullptr) {
+ : IO(Ctxt), Strm(new Stream(InputContent, SrcMgr, false, &EC)) {
if (DiagHandler)
SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt);
DocIterator = Strm->begin();
}
-Input::~Input() {
+Input::Input(MemoryBufferRef Input, void *Ctxt,
+ SourceMgr::DiagHandlerTy DiagHandler, void *DiagHandlerCtxt)
+ : IO(Ctxt), Strm(new Stream(Input, SrcMgr, false, &EC)) {
+ if (DiagHandler)
+ SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt);
+ DocIterator = Strm->begin();
}
+Input::~Input() = default;
+
std::error_code Input::error() { return EC; }
// Pin the vtables to this file.
@@ -398,20 +412,9 @@ bool Input::canElideEmptySequence() {
//===----------------------------------------------------------------------===//
Output::Output(raw_ostream &yout, void *context, int WrapColumn)
- : IO(context),
- Out(yout),
- WrapColumn(WrapColumn),
- Column(0),
- ColumnAtFlowStart(0),
- ColumnAtMapFlowStart(0),
- NeedBitValueComma(false),
- NeedFlowSequenceComma(false),
- EnumerationMatchFound(false),
- NeedsNewLine(false) {
-}
+ : IO(context), Out(yout), WrapColumn(WrapColumn) {}
-Output::~Output() {
-}
+Output::~Output() = default;
bool Output::outputting() {
return true;
@@ -462,7 +465,7 @@ std::vector<StringRef> Output::keys() {
bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
bool &UseDefault, void *&) {
UseDefault = false;
- if (Required || !SameAsDefault) {
+ if (Required || !SameAsDefault || WriteDefaultValues) {
auto State = StateStack.back();
if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) {
flowKey(Key);
@@ -918,12 +921,9 @@ void ScalarTraits<double>::output(const double &Val, void *, raw_ostream &Out) {
}
StringRef ScalarTraits<double>::input(StringRef Scalar, void *, double &Val) {
- SmallString<32> buff(Scalar.begin(), Scalar.end());
- char *end;
- Val = strtod(buff.c_str(), &end);
- if (*end != '\0')
- return "invalid floating point number";
- return StringRef();
+ if (to_float(Scalar, Val))
+ return StringRef();
+ return "invalid floating point number";
}
void ScalarTraits<float>::output(const float &Val, void *, raw_ostream &Out) {
@@ -931,12 +931,9 @@ void ScalarTraits<float>::output(const float &Val, void *, raw_ostream &Out) {
}
StringRef ScalarTraits<float>::input(StringRef Scalar, void *, float &Val) {
- SmallString<32> buff(Scalar.begin(), Scalar.end());
- char *end;
- Val = strtod(buff.c_str(), &end);
- if (*end != '\0')
- return "invalid floating point number";
- return StringRef();
+ if (to_float(Scalar, Val))
+ return StringRef();
+ return "invalid floating point number";
}
void ScalarTraits<Hex8>::output(const Hex8 &Val, void *, raw_ostream &Out) {
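
A quick sketch of the helper both inputs now share, llvm::to_float from StringExtras.h (hedged illustration):

  #include "llvm/ADT/StringExtras.h"

  float F;
  bool OK = llvm::to_float("2.5e3", F); // true; F == 2500.0f
  bool Bad = llvm::to_float("2.5x", F); // false; trailing garbage is rejected

Like the strtod code it replaces, it fails on partial parses, but it hides the scratch buffer and the end-pointer check.
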
diff --git a/contrib/llvm/lib/Support/raw_ostream.cpp b/contrib/llvm/lib/Support/raw_ostream.cpp
index d073802..dd58ecc 100644
--- a/contrib/llvm/lib/Support/raw_ostream.cpp
+++ b/contrib/llvm/lib/Support/raw_ostream.cpp
@@ -326,13 +326,30 @@ raw_ostream &raw_ostream::operator<<(const formatv_object_base &Obj) {
}
raw_ostream &raw_ostream::operator<<(const FormattedString &FS) {
- unsigned Len = FS.Str.size();
- int PadAmount = FS.Width - Len;
- if (FS.RightJustify && (PadAmount > 0))
- this->indent(PadAmount);
- this->operator<<(FS.Str);
- if (!FS.RightJustify && (PadAmount > 0))
+ if (FS.Str.size() >= FS.Width || FS.Justify == FormattedString::JustifyNone) {
+ this->operator<<(FS.Str);
+ return *this;
+ }
+ const size_t Difference = FS.Width - FS.Str.size();
+ switch (FS.Justify) {
+ case FormattedString::JustifyLeft:
+ this->operator<<(FS.Str);
+ this->indent(Difference);
+ break;
+ case FormattedString::JustifyRight:
+ this->indent(Difference);
+ this->operator<<(FS.Str);
+ break;
+ case FormattedString::JustifyCenter: {
+ int PadAmount = Difference / 2;
this->indent(PadAmount);
+ this->operator<<(FS.Str);
+ this->indent(Difference - PadAmount);
+ break;
+ }
+ default:
+ llvm_unreachable("Bad Justification");
+ }
return *this;
}
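
For context, a hedged usage sketch of the three modes this rewrite handles, using the helpers from llvm/Support/Format.h:

  #include "llvm/Support/Format.h"
  #include "llvm/Support/raw_ostream.h"

  llvm::outs() << '[' << llvm::left_justify("ab", 6) << "]\n";   // [ab    ]
  llvm::outs() << '[' << llvm::right_justify("ab", 6) << "]\n";  // [    ab]
  llvm::outs() << '[' << llvm::center_justify("ab", 6) << "]\n"; // [  ab  ]

With width 6 and a 2-character string, Difference is 4; centering indents Difference / 2 = 2 columns before the text and the remaining 2 after it, so any odd padding falls to the right side.
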
@@ -465,8 +482,7 @@ void format_object_base::home() {
static int getFD(StringRef Filename, std::error_code &EC,
sys::fs::OpenFlags Flags) {
// Handle "-" as stdout. Note that when we do this, we consider ourself
- // the owner of stdout. This means that we can do things like close the
- // file descriptor when we're done and set the "binary" flag globally.
+ // the owner of stdout and may set the "binary" flag globally based on Flags.
if (Filename == "-") {
EC = std::error_code();
// If user requested binary then put stdout into binary mode if
@@ -497,6 +513,13 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
ShouldClose = false;
return;
}
+  // We do not want to close STDOUT, as there may have been several uses of it
+  // (e.g. the case: llc %s -o=- -pass-remarks-output=- -filetype=asm), which
+  // would cause multiple closes of STDOUT_FILENO and/or use-after-close of it.
+  // Using dup() in getFD doesn't work, as we would still end up with the
+  // original STDOUT_FILENO open anyhow.
+ if (FD <= STDERR_FILENO)
+ ShouldClose = false;
// Get the starting position.
off_t loc = ::lseek(FD, 0, SEEK_CUR);
@@ -542,7 +565,11 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
pos += Size;
#ifndef LLVM_ON_WIN32
+#if defined(__linux__)
+ bool ShouldWriteInChunks = true;
+#else
bool ShouldWriteInChunks = false;
+#endif
#else
// Writing a large size of output to Windows console returns ENOMEM. It seems
// that, prior to Windows 8, WriteFile() is redirecting to WriteConsole(), and
diff --git a/contrib/llvm/lib/TableGen/Record.cpp b/contrib/llvm/lib/TableGen/Record.cpp
index ea9c9a1..b2636e1 100644
--- a/contrib/llvm/lib/TableGen/Record.cpp
+++ b/contrib/llvm/lib/TableGen/Record.cpp
@@ -11,20 +11,28 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/TableGen/Record.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
#include <cassert>
#include <cstdint>
-#include <new>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -40,7 +48,9 @@ IntRecTy IntRecTy::Shared;
StringRecTy StringRecTy::Shared;
DagRecTy DagRecTy::Shared;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RecTy::dump() const { print(errs()); }
+#endif
ListRecTy *RecTy::getListTy() {
if (!ListTy)
@@ -160,8 +170,11 @@ RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
// Initializer implementations
//===----------------------------------------------------------------------===//
-void Init::anchor() { }
+void Init::anchor() {}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Init::dump() const { return print(errs()); }
+#endif
UnsetInit *UnsetInit::get() {
static UnsetInit TheInit;
@@ -215,7 +228,6 @@ ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef<Init *> Range) {
BitsInit *BitsInit::get(ArrayRef<Init *> Range) {
static FoldingSet<BitsInit> ThePool;
- static std::vector<BitsInit*> TheActualPool;
FoldingSetNodeID ID;
ProfileBitsInit(ID, Range);
@@ -230,7 +242,6 @@ BitsInit *BitsInit::get(ArrayRef<Init *> Range) {
std::uninitialized_copy(Range.begin(), Range.end(),
I->getTrailingObjects<Init *>());
ThePool.InsertNode(I, IP);
- TheActualPool.push_back(I);
return I;
}
@@ -299,7 +310,6 @@ static Init *fixBitInit(const RecordVal *RV, Init *Before, Init *After) {
// resolveReferences - If there are any field references that refer to fields
// that have been filled in, we can propagate the values now.
-//
Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
bool Changed = false;
SmallVector<Init *, 16> NewBits(getNumBits());
@@ -403,27 +413,21 @@ IntInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
}
CodeInit *CodeInit::get(StringRef V) {
- static DenseMap<StringRef, CodeInit*> ThePool;
+ static StringMap<CodeInit*, BumpPtrAllocator &> ThePool(Allocator);
- auto I = ThePool.insert(std::make_pair(V, nullptr));
- if (I.second) {
- StringRef VCopy = V.copy(Allocator);
- I.first->first = VCopy;
- I.first->second = new(Allocator) CodeInit(VCopy);
- }
- return I.first->second;
+ auto &Entry = *ThePool.insert(std::make_pair(V, nullptr)).first;
+ if (!Entry.second)
+ Entry.second = new(Allocator) CodeInit(Entry.getKey());
+ return Entry.second;
}
StringInit *StringInit::get(StringRef V) {
- static DenseMap<StringRef, StringInit*> ThePool;
+ static StringMap<StringInit*, BumpPtrAllocator &> ThePool(Allocator);
- auto I = ThePool.insert(std::make_pair(V, nullptr));
- if (I.second) {
- StringRef VCopy = V.copy(Allocator);
- I.first->first = VCopy;
- I.first->second = new(Allocator) StringInit(VCopy);
- }
- return I.first->second;
+ auto &Entry = *ThePool.insert(std::make_pair(V, nullptr)).first;
+ if (!Entry.second)
+ Entry.second = new(Allocator) StringInit(Entry.getKey());
+ return Entry.second;
}
Init *StringInit::convertInitializerTo(RecTy *Ty) const {
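
The uniquing pattern adopted for CodeInit and StringInit above, distilled into a hedged standalone sketch (intern is an invented name): StringMap copies each key into the BumpPtrAllocator, so the cached value can simply reuse the map-owned key bytes instead of making a second copy.

  #include "llvm/ADT/StringMap.h"
  #include "llvm/Support/Allocator.h"

  static llvm::BumpPtrAllocator Alloc;
  static llvm::StringMap<const char *, llvm::BumpPtrAllocator &> Pool(Alloc);

  static const char *intern(llvm::StringRef S) {
    auto &Entry = *Pool.insert(std::make_pair(S, nullptr)).first;
    if (!Entry.second)
      Entry.second = Entry.getKey().data(); // Stable, allocator-owned bytes.
    return Entry.second;
  }
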
@@ -452,7 +456,6 @@ static void ProfileListInit(FoldingSetNodeID &ID,
ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
static FoldingSet<ListInit> ThePool;
- static std::vector<ListInit*> TheActualPool;
FoldingSetNodeID ID;
ProfileListInit(ID, Range, EltTy);
@@ -467,7 +470,6 @@ ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
std::uninitialized_copy(Range.begin(), Range.end(),
I->getTrailingObjects<Init *>());
ThePool.InsertNode(I, IP);
- TheActualPool.push_back(I);
return I;
}
@@ -602,7 +604,6 @@ ProfileUnOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *Op, RecTy *Type) {
UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) {
static FoldingSet<UnOpInit> ThePool;
- static std::vector<UnOpInit*> TheActualPool;
FoldingSetNodeID ID;
ProfileUnOpInit(ID, Opc, LHS, Type);
@@ -613,7 +614,6 @@ UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) {
UnOpInit *I = new(Allocator) UnOpInit(Opc, LHS, Type);
ThePool.InsertNode(I, IP);
- TheActualPool.push_back(I);
return I;
}
@@ -623,7 +623,7 @@ void UnOpInit::Profile(FoldingSetNodeID &ID) const {
Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
switch (getOpcode()) {
- case CAST: {
+ case CAST:
if (isa<StringRecTy>(getType())) {
if (StringInit *LHSs = dyn_cast<StringInit>(LHS))
return LHSs;
@@ -688,15 +688,15 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
}
}
break;
- }
- case HEAD: {
+
+ case HEAD:
if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
assert(!LHSl->empty() && "Empty list in head");
return LHSl->getElement(0);
}
break;
- }
- case TAIL: {
+
+ case TAIL:
if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
assert(!LHSl->empty() && "Empty list in tail");
// Note the +1. We can't just pass the result of getValues()
@@ -704,16 +704,14 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
return ListInit::get(LHSl->getValues().slice(1), LHSl->getType());
}
break;
- }
- case EMPTY: {
+
+ case EMPTY:
if (ListInit *LHSl = dyn_cast<ListInit>(LHS))
return IntInit::get(LHSl->empty());
if (StringInit *LHSs = dyn_cast<StringInit>(LHS))
return IntInit::get(LHSs->getValue().empty());
-
break;
}
- }
return const_cast<UnOpInit *>(this);
}
@@ -748,7 +746,6 @@ ProfileBinOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *RHS,
BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS,
Init *RHS, RecTy *Type) {
static FoldingSet<BinOpInit> ThePool;
- static std::vector<BinOpInit*> TheActualPool;
FoldingSetNodeID ID;
ProfileBinOpInit(ID, Opc, LHS, RHS, Type);
@@ -759,7 +756,6 @@ BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS,
BinOpInit *I = new(Allocator) BinOpInit(Opc, LHS, RHS, Type);
ThePool.InsertNode(I, IP);
- TheActualPool.push_back(I);
return I;
}
@@ -906,7 +902,6 @@ ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *MHS,
TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS,
RecTy *Type) {
static FoldingSet<TernOpInit> ThePool;
- static std::vector<TernOpInit*> TheActualPool;
FoldingSetNodeID ID;
ProfileTernOpInit(ID, Opc, LHS, MHS, RHS, Type);
@@ -917,7 +912,6 @@ TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS,
TernOpInit *I = new(Allocator) TernOpInit(Opc, LHS, MHS, RHS, Type);
ThePool.InsertNode(I, IP);
- TheActualPool.push_back(I);
return I;
}
@@ -960,7 +954,6 @@ static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg,
static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
Record *CurRec, MultiClass *CurMultiClass) {
-
OpInit *RHSo = dyn_cast<OpInit>(RHS);
if (!RHSo)
@@ -1257,7 +1250,7 @@ VarInit *VarInit::get(StringRef VN, RecTy *T) {
}
VarInit *VarInit::get(Init *VN, RecTy *T) {
- typedef std::pair<RecTy *, Init *> Key;
+ using Key = std::pair<RecTy *, Init *>;
static DenseMap<Key, VarInit*> ThePool;
Key TheKey(std::make_pair(T, VN));
@@ -1332,7 +1325,7 @@ Init *VarInit::resolveReferences(Record &R, const RecordVal *RV) const {
}
VarBitInit *VarBitInit::get(TypedInit *T, unsigned B) {
- typedef std::pair<TypedInit *, unsigned> Key;
+ using Key = std::pair<TypedInit *, unsigned>;
static DenseMap<Key, VarBitInit*> ThePool;
Key TheKey(std::make_pair(T, B));
@@ -1364,7 +1357,7 @@ Init *VarBitInit::resolveReferences(Record &R, const RecordVal *RV) const {
VarListElementInit *VarListElementInit::get(TypedInit *T,
unsigned E) {
- typedef std::pair<TypedInit *, unsigned> Key;
+ using Key = std::pair<TypedInit *, unsigned>;
static DenseMap<Key, VarListElementInit*> ThePool;
Key TheKey(std::make_pair(T, E));
@@ -1434,7 +1427,7 @@ std::string DefInit::getAsString() const {
}
FieldInit *FieldInit::get(Init *R, StringInit *FN) {
- typedef std::pair<Init *, StringInit *> Key;
+ using Key = std::pair<Init *, StringInit *>;
static DenseMap<Key, FieldInit*> ThePool;
Key TheKey(std::make_pair(R, FN));
@@ -1499,7 +1492,6 @@ DagInit *
DagInit::get(Init *V, StringInit *VN, ArrayRef<Init *> ArgRange,
ArrayRef<StringInit *> NameRange) {
static FoldingSet<DagInit> ThePool;
- static std::vector<DagInit*> TheActualPool;
FoldingSetNodeID ID;
ProfileDagInit(ID, V, VN, ArgRange, NameRange);
@@ -1508,9 +1500,13 @@ DagInit::get(Init *V, StringInit *VN, ArrayRef<Init *> ArgRange,
if (DagInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
return I;
- DagInit *I = new(Allocator) DagInit(V, VN, ArgRange, NameRange);
+  void *Mem = Allocator.Allocate(totalSizeToAlloc<Init *, StringInit *>(
+                                     ArgRange.size(), NameRange.size()),
+                                 alignof(BitsInit));
+ DagInit *I = new(Mem) DagInit(V, VN, ArgRange.size(), NameRange.size());
+ std::uninitialized_copy(ArgRange.begin(), ArgRange.end(),
+ I->getTrailingObjects<Init *>());
+ std::uninitialized_copy(NameRange.begin(), NameRange.end(),
+ I->getTrailingObjects<StringInit *>());
ThePool.InsertNode(I, IP);
- TheActualPool.push_back(I);
return I;
}
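
The switch above to trailing storage follows LLVM's TrailingObjects idiom. A reduced, hedged sketch with invented names (Node, create) shows the shape: one allocation carries the object header plus its two variable-length arrays.

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Support/Allocator.h"
  #include "llvm/Support/TrailingObjects.h"
  #include <memory>
  #include <new>

  class Node final : private llvm::TrailingObjects<Node, int, char> {
    friend TrailingObjects;

    unsigned NumInts, NumChars;
    // Needed for every trailing type except the last one.
    size_t numTrailingObjects(OverloadToken<int>) const { return NumInts; }

    Node(unsigned NI, unsigned NC) : NumInts(NI), NumChars(NC) {}

  public:
    static Node *create(llvm::BumpPtrAllocator &A, llvm::ArrayRef<int> Is,
                        llvm::ArrayRef<char> Cs) {
      void *Mem = A.Allocate(
          totalSizeToAlloc<int, char>(Is.size(), Cs.size()), alignof(Node));
      Node *N = new (Mem) Node(Is.size(), Cs.size());
      std::uninitialized_copy(Is.begin(), Is.end(),
                              N->getTrailingObjects<int>());
      std::uninitialized_copy(Cs.begin(), Cs.end(),
                              N->getTrailingObjects<char>());
      return N;
    }

    llvm::ArrayRef<int> ints() const {
      return llvm::makeArrayRef(getTrailingObjects<int>(), NumInts);
    }
  };
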
@@ -1529,7 +1525,7 @@ DagInit::get(Init *V, StringInit *VN,
}
void DagInit::Profile(FoldingSetNodeID &ID) const {
- ProfileDagInit(ID, Val, ValName, Args, ArgNames);
+  ProfileDagInit(ID, Val, ValName,
+                 makeArrayRef(getTrailingObjects<Init *>(), NumArgs),
+                 makeArrayRef(getTrailingObjects<StringInit *>(), NumArgNames));
}
Init *DagInit::convertInitializerTo(RecTy *Ty) const {
@@ -1541,9 +1537,9 @@ Init *DagInit::convertInitializerTo(RecTy *Ty) const {
Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const {
SmallVector<Init*, 8> NewArgs;
- NewArgs.reserve(Args.size());
+ NewArgs.reserve(arg_size());
bool ArgsChanged = false;
- for (const Init *Arg : Args) {
+ for (const Init *Arg : getArgs()) {
Init *NewArg = Arg->resolveReferences(R, RV);
NewArgs.push_back(NewArg);
ArgsChanged |= NewArg != Arg;
@@ -1551,7 +1547,7 @@ Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const {
Init *Op = Val->resolveReferences(R, RV);
if (Op != Val || ArgsChanged)
- return DagInit::get(Op, ValName, NewArgs, ArgNames);
+ return DagInit::get(Op, ValName, NewArgs, getArgNames());
return const_cast<DagInit *>(this);
}
@@ -1560,12 +1556,12 @@ std::string DagInit::getAsString() const {
std::string Result = "(" + Val->getAsString();
if (ValName)
Result += ":" + ValName->getAsUnquotedString();
- if (!Args.empty()) {
- Result += " " + Args[0]->getAsString();
- if (ArgNames[0]) Result += ":$" + ArgNames[0]->getAsUnquotedString();
- for (unsigned i = 1, e = Args.size(); i != e; ++i) {
- Result += ", " + Args[i]->getAsString();
- if (ArgNames[i]) Result += ":$" + ArgNames[i]->getAsUnquotedString();
+ if (!arg_empty()) {
+ Result += " " + getArg(0)->getAsString();
+ if (getArgName(0)) Result += ":$" + getArgName(0)->getAsUnquotedString();
+ for (unsigned i = 1, e = getNumArgs(); i != e; ++i) {
+ Result += ", " + getArg(i)->getAsString();
+ if (getArgName(i)) Result += ":$" + getArgName(i)->getAsUnquotedString();
}
}
return Result + ")";
@@ -1581,17 +1577,13 @@ RecordVal::RecordVal(Init *N, RecTy *T, bool P)
assert(Value && "Cannot create unset value for current type!");
}
-RecordVal::RecordVal(StringRef N, RecTy *T, bool P)
- : Name(StringInit::get(N)), TyAndPrefix(T, P) {
- Value = UnsetInit::get()->convertInitializerTo(T);
- assert(Value && "Cannot create unset value for current type!");
-}
-
StringRef RecordVal::getName() const {
return cast<StringInit>(getNameInit())->getValue();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void RecordVal::dump() const { errs() << *this; }
+#endif
void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
if (getPrefix()) OS << "field ";
@@ -1610,8 +1602,7 @@ void Record::init() {
// Every record potentially has a def at the top. This value is
// replaced with the top-level def name at instantiation time.
- RecordVal DN("NAME", StringRecTy::get(), false);
- addValue(DN);
+ addValue(RecordVal(StringInit::get("NAME"), StringRecTy::get(), false));
}
void Record::checkName() {
@@ -1647,10 +1638,6 @@ void Record::setName(Init *NewName) {
// this. See TGParser::ParseDef and TGParser::ParseDefm.
}
-void Record::setName(StringRef Name) {
- setName(StringInit::get(Name));
-}
-
void Record::resolveReferencesTo(const RecordVal *RV) {
for (RecordVal &Value : Values) {
if (RV == &Value) // Skip resolve the same field as the given one
@@ -1673,7 +1660,9 @@ void Record::resolveReferencesTo(const RecordVal *RV) {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Record::dump() const { errs() << *this; }
+#endif
raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
OS << R.getNameInitAsString();
@@ -1719,7 +1708,7 @@ Init *Record::getValueInit(StringRef FieldName) const {
return R->getValue();
}
-std::string Record::getValueAsString(StringRef FieldName) const {
+StringRef Record::getValueAsString(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (!R || !R->getValue())
PrintFatalError(getLoc(), "Record `" + getName() +
@@ -1798,10 +1787,10 @@ Record::getValueAsListOfInts(StringRef FieldName) const {
return Ints;
}
-std::vector<std::string>
+std::vector<StringRef>
Record::getValueAsListOfStrings(StringRef FieldName) const {
ListInit *List = getValueAsListInit(FieldName);
- std::vector<std::string> Strings;
+ std::vector<StringRef> Strings;
for (Init *I : List->getValues()) {
if (StringInit *SI = dyn_cast<StringInit>(I))
Strings.push_back(SI->getValue());
@@ -1865,6 +1854,7 @@ DagInit *Record::getValueAsDag(StringRef FieldName) const {
FieldName + "' does not have a dag initializer!");
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void MultiClass::dump() const {
errs() << "Record:\n";
Rec.dump();
@@ -1875,6 +1865,7 @@ LLVM_DUMP_METHOD void MultiClass::dump() const {
}
LLVM_DUMP_METHOD void RecordKeeper::dump() const { errs() << *this; }
+#endif
raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
OS << "------------- Classes -----------------\n";
diff --git a/contrib/llvm/lib/TableGen/SetTheory.cpp b/contrib/llvm/lib/TableGen/SetTheory.cpp
index a4d3305..733e0ae 100644
--- a/contrib/llvm/lib/TableGen/SetTheory.cpp
+++ b/contrib/llvm/lib/TableGen/SetTheory.cpp
@@ -12,18 +12,29 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/TableGen/SetTheory.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/SetTheory.h"
+#include <algorithm>
+#include <cstdint>
+#include <string>
+#include <utility>
using namespace llvm;
// Define the standard operators.
namespace {
-typedef SetTheory::RecSet RecSet;
-typedef SetTheory::RecVec RecVec;
+using RecSet = SetTheory::RecSet;
+using RecVec = SetTheory::RecVec;
// (add a, b, ...) Evaluate and union all arguments.
struct AddOp : public SetTheory::Operator {
@@ -237,13 +248,13 @@ struct FieldExpander : public SetTheory::Expander {
ST.evaluate(Def->getValueInit(FieldName), Elts, Def->getLoc());
}
};
+
} // end anonymous namespace
// Pin the vtables to this file.
void SetTheory::Operator::anchor() {}
void SetTheory::Expander::anchor() {}
-
SetTheory::SetTheory() {
addOperator("add", llvm::make_unique<AddOp>());
addOperator("sub", llvm::make_unique<SubOp>());
@@ -321,4 +332,3 @@ const RecVec *SetTheory::expand(Record *Set) {
// Set is not expandable.
return nullptr;
}
-
diff --git a/contrib/llvm/lib/TableGen/StringMatcher.cpp b/contrib/llvm/lib/TableGen/StringMatcher.cpp
index 0c83da6..7e510f0 100644
--- a/contrib/llvm/lib/TableGen/StringMatcher.cpp
+++ b/contrib/llvm/lib/TableGen/StringMatcher.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/TableGen/StringMatcher.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/TableGen/StringMatcher.h"
#include <cassert>
#include <map>
#include <string>
diff --git a/contrib/llvm/lib/TableGen/TGParser.cpp b/contrib/llvm/lib/TableGen/TGParser.cpp
index 1a91b37..b492cf9 100644
--- a/contrib/llvm/lib/TableGen/TGParser.cpp
+++ b/contrib/llvm/lib/TableGen/TGParser.cpp
@@ -54,6 +54,7 @@ struct SubMultiClassReference {
void dump() const;
};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
errs() << "Multiclass:\n";
@@ -63,6 +64,7 @@ LLVM_DUMP_METHOD void SubMultiClassReference::dump() const {
for (Init *TA : TemplateArgs)
TA->dump();
}
+#endif
} // end namespace llvm
@@ -337,7 +339,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
if (!IVal)
return Error(Loc, "foreach iterator value is untyped");
- IterRec->addValue(RecordVal(IterVar->getName(), IVal->getType(), false));
+ IterRec->addValue(RecordVal(IterVar->getNameInit(), IVal->getType(), false));
if (SetValue(IterRec.get(), Loc, IterVar->getNameInit(), None, IVal))
return Error(Loc, "when instantiating this def");
@@ -376,8 +378,8 @@ static bool isObjectStart(tgtok::TokKind K) {
/// GetNewAnonymousName - Generate a unique anonymous name that can be used as
/// an identifier.
-std::string TGParser::GetNewAnonymousName() {
- return "anonymous_" + utostr(AnonCounter++);
+Init *TGParser::GetNewAnonymousName() {
+ return StringInit::get("anonymous_" + utostr(AnonCounter++));
}
/// ParseObjectName - If an object name is specified, return it. Otherwise,
@@ -945,7 +947,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
else if (ListInit *Arg0 = dyn_cast<ListInit>(InitList[0]))
Type = Arg0->getType();
else {
- InitList[0]->dump();
+ InitList[0]->print(errs());
Error(OpLoc, "expected a list");
return nullptr;
}
@@ -2348,7 +2350,7 @@ Record *TGParser::InstantiateMulticlassDef(MultiClass &MC, Record *DefProto,
bool IsAnonymous = false;
if (!DefmPrefix) {
- DefmPrefix = StringInit::get(GetNewAnonymousName());
+ DefmPrefix = GetNewAnonymousName();
IsAnonymous = true;
}
diff --git a/contrib/llvm/lib/TableGen/TGParser.h b/contrib/llvm/lib/TableGen/TGParser.h
index 76f7d8f..1b2966c 100644
--- a/contrib/llvm/lib/TableGen/TGParser.h
+++ b/contrib/llvm/lib/TableGen/TGParser.h
@@ -110,7 +110,7 @@ private: // Semantic analysis methods.
bool AddSubMultiClass(MultiClass *CurMC,
SubMultiClassReference &SubMultiClass);
- std::string GetNewAnonymousName();
+ Init *GetNewAnonymousName();
// IterRecord: Map an iterator name to a value.
struct IterRecord {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
index fd106a8..1dda746 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -22,12 +22,16 @@
namespace llvm {
+class AArch64RegisterBankInfo;
+class AArch64Subtarget;
class AArch64TargetMachine;
class FunctionPass;
+class InstructionSelector;
class MachineFunctionPass;
FunctionPass *createAArch64DeadRegisterDefinitions();
FunctionPass *createAArch64RedundantCopyEliminationPass();
+FunctionPass *createAArch64CondBrTuning();
FunctionPass *createAArch64ConditionalCompares();
FunctionPass *createAArch64AdvSIMDScalar();
FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
@@ -38,19 +42,23 @@ FunctionPass *createAArch64LoadStoreOptimizationPass();
FunctionPass *createAArch64VectorByElementOptPass();
ModulePass *createAArch64PromoteConstantPass();
FunctionPass *createAArch64ConditionOptimizerPass();
-FunctionPass *createAArch64AddressTypePromotionPass();
FunctionPass *createAArch64A57FPLoadBalancing();
FunctionPass *createAArch64A53Fix835769();
+FunctionPass *createFalkorHWPFFixPass();
+FunctionPass *createFalkorMarkStridedAccessesPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &,
+ AArch64Subtarget &, AArch64RegisterBankInfo &);
void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
-void initializeAArch64AddressTypePromotionPass(PassRegistry&);
void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
void initializeAArch64CollectLOHPass(PassRegistry&);
+void initializeAArch64CondBrTuningPass(PassRegistry &);
void initializeAArch64ConditionalComparesPass(PassRegistry&);
void initializeAArch64ConditionOptimizerPass(PassRegistry&);
void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
@@ -60,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&);
void initializeAArch64PromoteConstantPass(PassRegistry&);
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
void initializeAArch64StorePairSuppressPass(PassRegistry&);
+void initializeFalkorHWPFFixPass(PassRegistry&);
+void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
index 91c335f..436bf11 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -27,7 +27,7 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
- "Enable cryptographic instructions">;
+ "Enable cryptographic instructions", [FeatureNEON]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
@@ -38,6 +38,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
+ "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
+
def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable ARMv8 PMUv3 Performance Monitors extension">;
@@ -47,6 +50,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
"Enable Statistical Profiling extension">;
+def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
+ "Enable Scalable Vector Extension (SVE) instructions">;
+
/// Cyclone has register move instructions which are "free".
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
@@ -100,6 +106,14 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
"CPU fuses arithmetic + cbz/cbnz operations">;
+def FeatureFuseAES : SubtargetFeature<
+ "fuse-aes", "HasFuseAES", "true",
+ "CPU fuses AES crypto operations">;
+
+def FeatureFuseLiterals : SubtargetFeature<
+ "fuse-literals", "HasFuseLiterals", "true",
+ "CPU fuses literal generation operations">;
+
def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
"Disable latency scheduling heuristic">;
@@ -108,12 +122,22 @@ def FeatureUseRSqrt : SubtargetFeature<
"use-reciprocal-square-root", "UseRSqrt", "true",
"Use the reciprocal square root approximation">;
+def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
+ "NegativeImmediates", "false",
+ "Convert immediates and instructions "
+ "to their negated or complemented "
+ "equivalent when the immediate does "
+ "not fit in the encoding.">;
+
+def FeatureLSLFast : SubtargetFeature<
+ "lsl-fast", "HasLSLFast", "true",
+ "CPU has a fastpath logical shift of up to 3 places">;
//===----------------------------------------------------------------------===//
// Architectures.
//
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
- "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>;
+ "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
@@ -123,6 +147,7 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
//===----------------------------------------------------------------------===//
include "AArch64RegisterInfo.td"
+include "AArch64RegisterBanks.td"
include "AArch64CallingConvention.td"
//===----------------------------------------------------------------------===//
@@ -149,7 +174,8 @@ include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
include "AArch64SchedKryo.td"
include "AArch64SchedM1.td"
-include "AArch64SchedVulcan.td"
+include "AArch64SchedThunderX.td"
+include "AArch64SchedThunderX2T99.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", [
@@ -167,6 +193,7 @@ def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -180,6 +207,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureFuseLiterals,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -191,6 +220,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
@@ -200,6 +230,7 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
@@ -226,6 +257,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -240,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -256,7 +289,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing
+ FeatureZCZeroing,
+ FeatureLSLFast
]>;
def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -269,33 +303,80 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing
+ FeatureRDM,
+ FeatureZCZeroing,
+ FeatureLSLFast
]>;
-def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
- "Broadcom Vulcan processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureArithmeticBccFusion,
- FeatureNEON,
- FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- HasV8_1aOps]>;
+def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
+ "ThunderX2T99",
+ "Cavium ThunderX2 processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureArithmeticBccFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureLSE,
+ HasV8_1aOps]>;
+
+def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
+ "ThunderXT88",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
+ "ThunderXT81",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
+ "ThunderXT83",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
def : ProcessorModel<"generic", NoSchedModel, [
- FeatureCRC,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler
]>;
-// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+// FIXME: Cortex-A35 is currently modeled as a Cortex-A53.
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57.
+// FIXME: Cortex-A72 and Cortex-A73 are currently modeled as a Cortex-A57.
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
@@ -304,7 +385,13 @@ def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
-def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
+// Cavium ThunderX/ThunderX T8X Processors
+def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>;
+def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>;
+def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
+def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
+// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
+def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
//===----------------------------------------------------------------------===//
// Assembly parser
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 0aa597b..db1fbe0 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -493,43 +493,30 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
MachineBasicBlock &MBB) {
- RegScavenger RS;
- RS.enterBasicBlock(MBB);
- RS.forward(MachineBasicBlock::iterator(G->getStart()));
-
// Can we find an appropriate register that is available throughout the life
- // of the chain?
- unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
- BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
- for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
- RS.forward(I);
- AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
-
- // Remove any registers clobbered by a regmask or any def register that is
- // immediately dead.
- for (auto J : I->operands()) {
- if (J.isRegMask())
- AvailableRegs.clearBitsNotInMask(J.getRegMask());
-
- if (J.isReg() && J.isDef()) {
- MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true);
- if (J.isDead())
- for (; AI.isValid(); ++AI)
- AvailableRegs.reset(*AI);
-#ifndef NDEBUG
- else
- for (; AI.isValid(); ++AI)
- assert(!AvailableRegs[*AI] &&
- "Non-dead def should have been removed by now!");
-#endif
- }
- }
+ // of the chain? Simulate liveness backwards until the end of the chain.
+ LiveRegUnits Units(*TRI);
+ Units.addLiveOuts(MBB);
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator ChainEnd = G->end();
+ while (I != ChainEnd) {
+ --I;
+ Units.stepBackward(*I);
}
+ // Check which register units are alive throughout the chain.
+ MachineBasicBlock::iterator ChainBegin = G->begin();
+ assert(ChainBegin != ChainEnd && "Chain should contain instructions");
+ do {
+ --I;
+ Units.accumulate(*I);
+ } while (I != ChainBegin);
+
// Make sure we allocate in-order, to get the cheapest registers first.
+ unsigned RegClassID = ChainBegin->getDesc().OpInfo[0].RegClass;
auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
for (auto Reg : Ord) {
- if (!AvailableRegs[Reg])
+ if (!Units.available(Reg))
continue;
if (C == getColor(Reg))
return Reg;
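
The scan above is the standard LiveRegUnits recipe. As a distilled, hedged illustration (isFreeAcross is an invented helper): a register may be reused across a range [B, E) only if none of its units is live anywhere in it.

  #include "llvm/CodeGen/LiveRegUnits.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/Target/TargetRegisterInfo.h"

  static bool isFreeAcross(unsigned Reg, llvm::MachineBasicBlock &MBB,
                           llvm::MachineBasicBlock::iterator B,
                           llvm::MachineBasicBlock::iterator E,
                           const llvm::TargetRegisterInfo &TRI) {
    llvm::LiveRegUnits Units(TRI);
    Units.addLiveOuts(MBB);
    auto I = MBB.end();
    while (I != E) // Rewind liveness from the block end down to E.
      Units.stepBackward(*--I);
    while (I != B) // Fold in every unit the range itself reads or writes.
      Units.accumulate(*--I);
    return Units.available(Reg);
  }
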
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
deleted file mode 100644
index 0cbb2db..0000000
--- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ /dev/null
@@ -1,484 +0,0 @@
-//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass tries to promote the computations used to obtain a sign-extended
-// value used in memory accesses.
-// E.g.
-// a = add nsw i32 b, 3
-// d = sext i32 a to i64
-// e = getelementptr ..., i64 d
-//
-// =>
-// f = sext i32 b to i64
-// a = add nsw i64 f, 3
-// e = getelementptr ..., i64 a
-//
-// This is legal to do if the computations are marked with either nsw or nuw
-// markers. Moreover, the current heuristic is simple: it does not create new
-// sext operations, i.e., it gives up when a sext would have forked (e.g., if a
-// = add i32 b, c, two sexts are required to promote the computation).
-//
-// FIXME: This pass may be useful for other targets too.
-// ===---------------------------------------------------------------------===//
-
-#include "AArch64.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "aarch64-type-promotion"
-
-static cl::opt<bool>
-EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
- cl::desc("Enable merging of redundant sexts when one is dominating"
- " the other."),
- cl::init(true));
-
-#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion"
-
-//===----------------------------------------------------------------------===//
-// AArch64AddressTypePromotion
-//===----------------------------------------------------------------------===//
-
-namespace {
-class AArch64AddressTypePromotion : public FunctionPass {
-
-public:
- static char ID;
- AArch64AddressTypePromotion()
- : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
- initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return AARCH64_TYPE_PROMO_NAME; }
-
- /// Iterate over the function and promote the computation of interesting
- /// sext instructions.
- bool runOnFunction(Function &F) override;
-
-private:
- /// The current function.
- Function *Func;
- /// Filter out all sexts that do not have this type.
- /// Currently initialized with Int64Ty.
- Type *ConsideredSExtType;
-
- // This transformation requires dominator info.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
- typedef SmallVector<Instruction *, 16> Instructions;
- typedef DenseMap<Value *, Instructions> ValueToInsts;
-
- /// Check if it is profitable to move a sext through this instruction.
- /// Currently, we consider it profitable if:
- /// - Inst is used only once (no need to insert a truncate).
- /// - Inst has only one operand that will require a sext operation (we do
- ///   not create new sext operations).
- bool shouldGetThrough(const Instruction *Inst);
-
- /// Check if it is possible and legal to move a sext through this
- /// instruction.
- /// Current heuristic considers that we can get through:
- /// - Arithmetic operation marked with the nsw or nuw flag.
- /// - Other sext operation.
- /// - Truncate operation if it was just dropping sign extended bits.
- bool canGetThrough(const Instruction *Inst);
-
- /// Move sext operations through safe-to-sext instructions.
- bool propagateSignExtension(Instructions &SExtInsts);
-
- /// Check whether this sext should be considered for code motion.
- /// We look for sexts with ConsideredSExtType and uses in at least one
- /// GetElementPtrInst.
- bool shouldConsiderSExt(const Instruction *SExt) const;
-
- /// Collect all interesting sext operations, i.e., the ones with the right
- /// type and used in memory accesses.
- /// More precisely, a sext instruction is considered interesting if it
- /// is used in a "complex" getelementptr or if there exists at least one
- /// other sext instruction that sign extends the same initial value.
- /// A getelementptr is considered "complex" if it has more than 2
- /// operands.
- void analyzeSExtension(Instructions &SExtInsts);
-
- /// Merge redundant sign extension operations in common dominator.
- void mergeSExts(ValueToInsts &ValToSExtendedUses,
- SetOfInstructions &ToRemove);
-};
-} // end anonymous namespace.
-
-char AArch64AddressTypePromotion::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
- AARCH64_TYPE_PROMO_NAME, false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
- AARCH64_TYPE_PROMO_NAME, false, false)
-
-FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
- return new AArch64AddressTypePromotion();
-}
-
-bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
- if (isa<SExtInst>(Inst))
- return true;
-
- const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
- (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
- return true;
-
- // sext(trunc(sext)) --> sext
- if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) {
- const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0));
- // Check that the truncate just drops the sign-extended bits.
- if (Inst->getType()->getIntegerBitWidth() >=
- Opnd->getOperand(0)->getType()->getIntegerBitWidth() &&
- Inst->getOperand(0)->getType()->getIntegerBitWidth() <=
- ConsideredSExtType->getIntegerBitWidth())
- return true;
- }
-
- return false;
-}
-
-bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
- // If the type of the sext is the same as the considered one, this sext
- // will become useless.
- // Otherwise, we will have to do something to preserve the original value,
- // unless it is used only once.
- if (isa<SExtInst>(Inst) &&
- (Inst->getType() == ConsideredSExtType || Inst->hasOneUse()))
- return true;
-
- // If the Inst is used more than once, we may need to insert truncate
- // operations, and we don't do that at the moment.
- if (!Inst->hasOneUse())
- return false;
-
- // This truncate is used only once, thus if we can get through it, it will
- // become useless.
- if (isa<TruncInst>(Inst))
- return true;
-
- // If both operands are not constant, a new sext will be created here.
- // The current heuristic is: each step should be profitable.
- // Therefore we don't allow increasing the number of sexts, even if it may
- // be profitable later on.
- if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1)))
- return true;
-
- return false;
-}
-
-static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
- return !(isa<SelectInst>(Inst) && OpIdx == 0);
-}
-
-bool
-AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
- if (SExt->getType() != ConsideredSExtType)
- return false;
-
- for (const User *U : SExt->users()) {
- if (isa<GetElementPtrInst>(U))
- return true;
- }
-
- return false;
-}
-
-// Input:
-// - SExtInsts contains all the sext instructions that are used directly in
- // GetElementPtrInst, i.e., accesses to memory.
-// Algorithm:
-// - For each sext operation in SExtInsts:
-// Let var be the operand of sext.
-// while it is profitable (see shouldGetThrough), legal, and safe
-// (see canGetThrough) to move sext through var's definition:
-// * promote the type of var's definition.
-// * fold var into sext uses.
-// * move sext above var's definition.
-// * update sext operand to use the operand of var that should be sign
-// extended (by construction there is only one).
-//
-// E.g.,
-// a = ... i32 c, 3
-// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
-// ...
-// = b
-// => Yes, update the code
-// b = sext i32 c to i64
-// a = ... i64 b, 3
-// ...
-// = a
-// Iterate on 'c'.
-bool
-AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
- DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
-
- bool LocalChange = false;
- SetOfInstructions ToRemove;
- ValueToInsts ValToSExtendedUses;
- while (!SExtInsts.empty()) {
- // Get through simple chain.
- Instruction *SExt = SExtInsts.pop_back_val();
-
- DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
-
- // If this SExt has already been merged, continue.
- if (SExt->use_empty() && ToRemove.count(SExt)) {
- DEBUG(dbgs() << "No uses => marked as delete\n");
- continue;
- }
-
- // Now try to get through the chain of definitions.
- while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) {
- DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
- if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
- // We cannot get through something that is not an Instruction
- // or not safe to SExt.
- DEBUG(dbgs() << "Cannot get through\n");
- break;
- }
-
- LocalChange = true;
- // If this is a sign extend, it becomes useless.
- if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
- DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
- // We cannot use replaceAllUsesWith here because we may trigger some
- // assertion on the type, as not all involved sext operations may have
- // been moved yet.
- while (!Inst->use_empty()) {
- Use &U = *Inst->use_begin();
- Instruction *User = dyn_cast<Instruction>(U.getUser());
- assert(User && "User of sext is not an Instruction!");
- User->setOperand(U.getOperandNo(), SExt);
- }
- ToRemove.insert(Inst);
- SExt->setOperand(0, Inst->getOperand(0));
- SExt->moveBefore(Inst);
- continue;
- }
-
- // Get through the Instruction:
- // 1. Update its type.
- // 2. Replace the uses of SExt by Inst.
- // 3. Sign extend each operand that needs to be sign extended.
-
- // Step #1.
- Inst->mutateType(SExt->getType());
- // Step #2.
- SExt->replaceAllUsesWith(Inst);
- // Step #3.
- Instruction *SExtForOpnd = SExt;
-
- DEBUG(dbgs() << "Propagate SExt to operands\n");
- for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
- ++OpIdx) {
- DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
- if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
- !shouldSExtOperand(Inst, OpIdx)) {
- DEBUG(dbgs() << "No need to propagate\n");
- continue;
- }
- // Check if we can statically sign extend the operand.
- Value *Opnd = Inst->getOperand(OpIdx);
- if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
- DEBUG(dbgs() << "Statically sign extend\n");
- Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
- Cst->getSExtValue()));
- continue;
- }
- // UndefValue operands are typed, so we have to statically sign extend them.
- if (isa<UndefValue>(Opnd)) {
- DEBUG(dbgs() << "Statically sign extend\n");
- Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
- continue;
- }
-
- // Otherwise we have to explicitly sign extend it.
- assert(SExtForOpnd &&
- "Only one operand should have been sign extended");
-
- SExtForOpnd->setOperand(0, Opnd);
-
- DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
- // Move the sign extension before the insertion point.
- SExtForOpnd->moveBefore(Inst);
- Inst->setOperand(OpIdx, SExtForOpnd);
- // If more sexts are required, new instructions will have to be created.
- SExtForOpnd = nullptr;
- }
- if (SExtForOpnd == SExt) {
- DEBUG(dbgs() << "Sign extension is useless now\n");
- ToRemove.insert(SExt);
- break;
- }
- }
-
- // If the sext is already of the right type, connect its uses to its
- // argument and delete it.
- // This can happen for an Instruction whose uses are all sign extended.
- if (!ToRemove.count(SExt) &&
- SExt->getType() == SExt->getOperand(0)->getType()) {
- DEBUG(dbgs() << "Sign extension is useless, attach its use to "
- "its argument\n");
- SExt->replaceAllUsesWith(SExt->getOperand(0));
- ToRemove.insert(SExt);
- } else
- ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
- }
-
- if (EnableMerge)
- mergeSExts(ValToSExtendedUses, ToRemove);
-
- // Remove all instructions marked as ToRemove.
- for (Instruction *I: ToRemove)
- I->eraseFromParent();
- return LocalChange;
-}
-
-void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
- SetOfInstructions &ToRemove) {
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- for (auto &Entry : ValToSExtendedUses) {
- Instructions &Insts = Entry.second;
- Instructions CurPts;
- for (Instruction *Inst : Insts) {
- if (ToRemove.count(Inst))
- continue;
- bool inserted = false;
- for (auto &Pt : CurPts) {
- if (DT.dominates(Inst, Pt)) {
- DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
- << *Inst << '\n');
- Pt->replaceAllUsesWith(Inst);
- ToRemove.insert(Pt);
- Pt = Inst;
- inserted = true;
- break;
- }
- if (!DT.dominates(Pt, Inst))
- // Give up if we need to merge in a common dominator as the
- // experiments show it is not profitable.
- continue;
-
- DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n"
- << *Pt << '\n');
- Inst->replaceAllUsesWith(Pt);
- ToRemove.insert(Inst);
- inserted = true;
- break;
- }
- if (!inserted)
- CurPts.push_back(Inst);
- }
- }
-}
-
-void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
- DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
-
- DenseMap<Value *, Instruction *> SeenChains;
-
- for (auto &BB : *Func) {
- for (auto &II : BB) {
- Instruction *SExt = &II;
-
- // Collect all sext operations of the considered type.
- if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
- continue;
-
- DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
-
- // Cases where we actually perform the optimization:
- // 1. SExt is used in a getelementptr with more than 2 operands =>
- //    likely we can merge some computations if they are done on 64 bits.
- // 2. The beginning of the SExt chain is sign extended several times. =>
- //    code sharing is possible.
-
- bool insert = false;
- // #1.
- for (const User *U : SExt->users()) {
- const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
- if (Inst && Inst->getNumOperands() > 2) {
- DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
- << '\n');
- insert = true;
- break;
- }
- }
-
- // #2.
- // Check the head of the chain.
- Instruction *Inst = SExt;
- Value *Last;
- do {
- int OpdIdx = 0;
- const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
- OpdIdx = 1;
- Last = Inst->getOperand(OpdIdx);
- Inst = dyn_cast<Instruction>(Last);
- } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
-
- DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
- DenseMap<Value *, Instruction *>::iterator AlreadySeen =
- SeenChains.find(Last);
- if (insert || AlreadySeen != SeenChains.end()) {
- DEBUG(dbgs() << "Insert\n");
- SExtInsts.push_back(SExt);
- if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) {
- DEBUG(dbgs() << "Insert chain member\n");
- SExtInsts.push_back(AlreadySeen->second);
- SeenChains[Last] = nullptr;
- }
- } else {
- DEBUG(dbgs() << "Record its chain membership\n");
- SeenChains[Last] = SExt;
- }
- }
- }
-}
-
-bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- if (F.isDeclaration())
- return false;
- Func = &F;
- ConsideredSExtType = Type::getInt64Ty(Func->getContext());
-
- DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
-
- Instructions SExtInsts;
- analyzeSExtension(SExtInsts);
- return propagateSignExtension(SExtInsts);
-}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index efc2218..5ce5792 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -12,13 +12,13 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64.h"
#include "AArch64MCInstLower.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "InstPrinter/AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
@@ -35,11 +35,11 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -320,6 +320,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
switch (ExtraCode[0]) {
default:
return true; // Unknown modifier.
+ case 'a': // Print 'a' modifier
+ PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+ return false;
case 'w': // Print W register
case 'x': // Print X register
if (MO.isReg())
@@ -388,7 +391,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
- if (ExtraCode && ExtraCode[0])
+ if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a')
return true; // Unknown modifier.
const MachineOperand &MO = MI->getOperand(OpNum);
@@ -580,8 +583,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineOperand &MO_Sym = MI->getOperand(0);
MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
- MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
+ MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
MCInstLowering.lowerOperand(MO_Sym, Sym);
MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
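The new 'a' case above routes the operand through PrintAsmMemoryOperand, so inline assembly can print a pointer operand as an address reference instead of a bare register. A hedged usage sketch; the prfm mnemonic and "r" constraint are illustrative, not taken from this commit:

// With the 'a' modifier the operand prints as an address, e.g. "[x0]",
// which is the form prfm expects; plain %0 would print just "x0".
static inline void prefetchLine(const void *P) {
  asm volatile("prfm pldl1keep, %a0" : : "r"(P));
}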
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
index a4950af..29f6d57 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//===--- AArch64CallLowering.cpp - Call lowering --------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,15 +15,36 @@
#include "AArch64CallLowering.h"
#include "AArch64ISelLowering.h"
-
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -31,12 +52,12 @@ using namespace llvm;
#endif
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
- : CallLowering(&TLI) {
-}
+ : CallLowering(&TLI) {}
struct IncomingArgHandler : public CallLowering::ValueHandler {
- IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : ValueHandler(MIRBuilder, MRI) {}
+ IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -45,6 +66,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
MIRBuilder.buildFrameIndex(AddrReg, FI);
+ StackUsed = std::max(StackUsed, Size + Offset);
return AddrReg;
}
@@ -67,11 +89,14 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// parameters (it's a basic-block live-in), and a call instruction
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+ uint64_t StackUsed;
};
struct FormalArgHandler : public IncomingArgHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : IncomingArgHandler(MIRBuilder, MRI) {}
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMBB().addLiveIn(PhysReg);
@@ -80,8 +105,8 @@ struct FormalArgHandler : public IncomingArgHandler {
struct CallReturnHandler : public IncomingArgHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB)
- : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -92,8 +117,10 @@ struct CallReturnHandler : public IncomingArgHandler {
struct OutgoingArgHandler : public CallLowering::ValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB)
- : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn,
+ CCAssignFn *AssignFnVarArg)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+ AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -126,14 +153,29 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info,
+ CCState &State) override {
+ bool Res;
+ if (Info.IsFixed)
+ Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ else
+ Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+
+ StackSize = State.getNextStackOffset();
+ return Res;
+ }
+
MachineInstrBuilder MIB;
+ CCAssignFn *AssignFnVarArg;
+ uint64_t StackSize;
};
-void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- MachineRegisterInfo &MRI,
- SplitArgTy PerformArgSplit) const {
+void AArch64CallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ const SplitArgTy &PerformArgSplit) const {
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
@@ -145,7 +187,7 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags);
+ OrigArg.Flags, OrigArg.IsFixed);
return;
}
@@ -154,19 +196,12 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// FIXME: set split flags if they're actually used (e.g. i128 on AAPCS).
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
SplitArgs.push_back(
- ArgInfo{MRI.createGenericVirtualRegister(LLT{*SplitTy, DL}), SplitTy,
- OrigArg.Flags});
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
+ SplitTy, OrigArg.Flags, OrigArg.IsFixed});
}
- SmallVector<uint64_t, 4> BitOffsets;
- for (auto Offset : Offsets)
- BitOffsets.push_back(Offset * 8);
-
- SmallVector<unsigned, 8> SplitRegs;
- for (auto I = &SplitArgs[FirstRegIdx]; I != SplitArgs.end(); ++I)
- SplitRegs.push_back(I->Reg);
-
- PerformArgSplit(SplitRegs, BitOffsets);
+ for (unsigned i = 0; i < Offsets.size(); ++i)
+ PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
}
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -184,16 +219,16 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
auto &DL = F.getParent()->getDataLayout();
ArgInfo OrigArg{VReg, Val->getType()};
- setArgFlags(OrigArg, AttributeSet::ReturnIndex, DL, F);
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
SmallVector<ArgInfo, 8> SplitArgs;
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildExtract(Regs, Offsets, VReg);
+ [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, VReg, Offset);
});
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
- Success = handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler);
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
+ Success = handleAssignments(MIRBuilder, SplitArgs, Handler);
}
MIRBuilder.insertInstr(MIB);
@@ -203,7 +238,6 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
- auto &Args = F.getArgumentList();
MachineFunction &MF = MIRBuilder.getMF();
MachineBasicBlock &MBB = MIRBuilder.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -211,13 +245,27 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
unsigned i = 0;
- for (auto &Arg : Args) {
+ for (auto &Arg : F.args()) {
ArgInfo OrigArg{VRegs[i], Arg.getType()};
- setArgFlags(OrigArg, i + 1, DL, F);
+ setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
+ bool Split = false;
+ LLT Ty = MRI.getType(VRegs[i]);
+ unsigned Dst = VRegs[i];
+
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildSequence(VRegs[i], Regs, Offsets);
+ [&](unsigned Reg, uint64_t Offset) {
+ if (!Split) {
+ Split = true;
+ Dst = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Dst);
+ }
+ unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
+ Dst = Tmp;
});
+
+ if (Dst != VRegs[i])
+ MIRBuilder.buildCopy(VRegs[i], Dst);
++i;
}
@@ -228,10 +276,25 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
- FormalArgHandler Handler(MIRBuilder, MRI);
- if (!handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler))
+ FormalArgHandler Handler(MIRBuilder, MRI, AssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
+ if (F.isVarArg()) {
+ if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
+ // FIXME: we need to reimplement saveVarArgsRegisters from
+ // AArch64ISelLowering.
+ return false;
+ }
+
+ // We currently pass all varargs at 8-byte alignment.
+ uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+ }
+
// Move back to the end of the basic block.
MIRBuilder.setMBB(MBB);
@@ -239,6 +302,7 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
}
bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
const MachineOperand &Callee,
const ArgInfo &OrigRet,
ArrayRef<ArgInfo> OrigArgs) const {
@@ -250,21 +314,25 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
for (auto &OrigArg : OrigArgs) {
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg);
+ [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset);
});
}
// Find out which ABI gets to decide where things go.
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- CCAssignFn *CallAssignFn =
- TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+ CCAssignFn *AssignFnFixed =
+ TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ CCAssignFn *AssignFnVarArg =
+ TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+
+ auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
: AArch64::BL);
- MIB.addOperand(Callee);
+ MIB.add(Callee);
// Tell the call which registers are clobbered.
auto TRI = MF.getSubtarget().getRegisterInfo();
@@ -272,8 +340,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Do the actual argument marshalling.
SmallVector<unsigned, 8> PhysRegs;
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
- if (!handleAssignments(MIRBuilder, CallAssignFn, SplitArgs, Handler))
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+ AssignFnVarArg);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
// Now we can add the actual call instruction to the correct basic block.
@@ -298,20 +367,23 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<uint64_t, 8> RegOffsets;
SmallVector<unsigned, 8> SplitRegs;
splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- std::copy(Offsets.begin(), Offsets.end(),
- std::back_inserter(RegOffsets));
- std::copy(Regs.begin(), Regs.end(),
- std::back_inserter(SplitRegs));
+ [&](unsigned Reg, uint64_t Offset) {
+ RegOffsets.push_back(Offset);
+ SplitRegs.push_back(Reg);
});
- CallReturnHandler Handler(MIRBuilder, MRI, MIB);
- if (!handleAssignments(MIRBuilder, RetAssignFn, SplitArgs, Handler))
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
if (!RegOffsets.empty())
MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
}
+ CallSeqStart.addImm(Handler.StackSize).addImm(0);
+ MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
+ .addImm(Handler.StackSize)
+ .addImm(0);
+
return true;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
index ce66762..d96ce95 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//===--- AArch64CallLowering.h - Call lowering ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,20 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
-#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include <cstdint>
+#include <functional>
namespace llvm {
class AArch64TargetLowering;
class AArch64CallLowering: public CallLowering {
- public:
+public:
AArch64CallLowering(const AArch64TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
@@ -32,8 +34,8 @@ class AArch64CallLowering: public CallLowering {
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee,
- const ArgInfo &OrigRet,
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
ArrayRef<ArgInfo> OrigArgs) const override;
private:
@@ -44,13 +46,14 @@ private:
typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)>
MemHandler;
- typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)>
- SplitArgTy;
+ typedef std::function<void(unsigned, uint64_t)> SplitArgTy;
void splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
- SplitArgTy SplitArg) const;
+ const SplitArgTy &SplitArg) const;
};
-} // End of namespace llvm;
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 938779d..291bc5e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
]>;
+// Vararg functions on Windows pass floats in integer registers.
+def CC_AArch64_Win64_VarArg : CallingConv<[
+ CCIfType<[f16, f32], CCPromoteToType<f64>>,
+ CCIfType<[f64], CCBitConvertToType<i64>>,
+ CCDelegateTo<CC_AArch64_AAPCS>
+]>;
+
// Darwin uses a calling convention which differs in only two ways
// from the standard one at this level:
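The new CC_AArch64_Win64_VarArg convention above makes floating-point variadic arguments travel in integer registers: f16/f32 are promoted to f64, and the f64 bits are then passed as an i64. A hedged illustration; the exact register assignment is an assumption based on the usual argument ordering:

#include <cstdio>
// On a Windows AArch64 target, V undergoes the default promotion to double
// and its bit pattern is then passed in an integer register (x1 here, after
// the format string in x0) rather than in d0.
void report(float V) { std::printf("%f\n", V); }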
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 6f8dd3e..b3b7385 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -113,7 +113,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
return Copy;
}
- // Create a virtal register in *TLSBaseAddrReg, and populate it by
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
MachineFunction *MF = I.getParent()->getParent();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
new file mode 100644
index 0000000..51700f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -0,0 +1,339 @@
+//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file contains a pass that transforms CBZ/CBNZ/TBZ/TBNZ instructions
+/// into a conditional branch (B.cond), when the NZCV flags can be set for
+/// "free". This is preferred on targets that have more flexibility when
+/// scheduling B.cond instructions as compared to CBZ/CBNZ/TBZ/TBNZ (assuming
+/// all other variables are equal). This can also reduce register pressure.
+///
+/// A few examples:
+///
+/// 1) add w8, w0, w1 -> cmn w0, w1 ; CMN is an alias of ADDS.
+/// cbz w8, .LBB_2 -> b.eq .LBB0_2
+///
+/// 2) add w8, w0, w1 -> adds w8, w0, w1 ; w8 has multiple uses.
+/// cbz w8, .LBB1_2 -> b.eq .LBB1_2
+///
+/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses.
+/// tbz w8, #31, .LBB6_2 -> b.pl .LBB6_2
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-cond-br-tuning"
+#define AARCH64_CONDBR_TUNING_NAME "AArch64 Conditional Branch Tuning"
+
+namespace {
+class AArch64CondBrTuning : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ MachineRegisterInfo *MRI;
+
+public:
+ static char ID;
+ AArch64CondBrTuning() : MachineFunctionPass(ID) {
+ initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; }
+
+private:
+ MachineInstr *getOperandDef(const MachineOperand &MO);
+ MachineInstr *convertToFlagSetting(MachineInstr &MI, bool IsFlagSetting);
+ MachineInstr *convertToCondBr(MachineInstr &MI);
+ bool tryToTuneBranch(MachineInstr &MI, MachineInstr &DefMI);
+};
+} // end anonymous namespace
+
+char AArch64CondBrTuning::ID = 0;
+
+INITIALIZE_PASS(AArch64CondBrTuning, "aarch64-cond-br-tuning",
+ AARCH64_CONDBR_TUNING_NAME, false, false)
+
+void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) {
+ if (!TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ return nullptr;
+ return MRI->getUniqueVRegDef(MO.getReg());
+}
+
+MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI,
+ bool IsFlagSetting) {
+ // If this is already the flag-setting version of the instruction (e.g.,
+ // SUBS), just make sure the implicit-def of NZCV isn't marked dead.
+ if (IsFlagSetting) {
+ for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands();
+ I != E; ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV)
+ MO.setIsDead(false);
+ }
+ return &MI;
+ }
+ bool Is64Bit;
+ unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit);
+ unsigned NewDestReg = MI.getOperand(0).getReg();
+ if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg()))
+ NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(NewOpc), NewDestReg);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+
+ return MIB;
+}
+
+MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) {
+ AArch64CC::CondCode CC;
+ MachineBasicBlock *TargetMBB = TII->getBranchDestBlock(MI);
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ CC = AArch64CC::NE;
+ break;
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ CC = AArch64CC::PL;
+ break;
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ CC = AArch64CC::MI;
+ break;
+ }
+ return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TargetMBB);
+}
+
+bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
+ MachineInstr &DefMI) {
+ // We don't want NZCV bits live across blocks.
+ if (MI.getParent() != DefMI.getParent())
+ return false;
+
+ bool IsFlagSetting = true;
+ unsigned MIOpc = MI.getOpcode();
+ MachineInstr *NewCmp = nullptr, *NewBr = nullptr;
+ switch (DefMI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDWrs:
+ case AArch64::ADDWrx:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDWrs:
+ case AArch64::BICWrr:
+ case AArch64::BICWrs:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBWrs:
+ case AArch64::SUBWrx:
+ IsFlagSetting = false;
+ LLVM_FALLTHROUGH;
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSWrx:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSWrs:
+ case AArch64::BICSWrr:
+ case AArch64::BICSWrs:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSWrx:
+ switch (MIOpc) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::TBZW:
+ case AArch64::TBNZW:
+ // Check to see if the TBZ/TBNZ is checking the sign bit.
+ if ((MIOpc == AArch64::TBZW || MIOpc == AArch64::TBNZW) &&
+ MI.getOperand(1).getImm() != 31)
+ return false;
+
+ // There must not be any instruction between DefMI and MI that clobbers or
+ // reads NZCV.
+ MachineBasicBlock::iterator I(DefMI), E(MI);
+ for (I = std::next(I); I != E; ++I) {
+ if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+ I->readsRegister(AArch64::NZCV, TRI))
+ return false;
+ }
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(DefMI.print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MI.print(dbgs()));
+
+ NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+ NewBr = convertToCondBr(MI);
+ break;
+ }
+ break;
+
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ADDXrs:
+ case AArch64::ADDXrx:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::ANDXrs:
+ case AArch64::BICXrr:
+ case AArch64::BICXrs:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ case AArch64::SUBXrs:
+ case AArch64::SUBXrx:
+ IsFlagSetting = false;
+ LLVM_FALLTHROUGH;
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXrs:
+ case AArch64::ADDSXrx:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::ANDSXrs:
+ case AArch64::BICSXrr:
+ case AArch64::BICSXrs:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSXrs:
+ case AArch64::SUBSXrx:
+ switch (MIOpc) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ case AArch64::TBZX:
+ case AArch64::TBNZX: {
+ // Check to see if the TBZ/TBNZ is checking the sign bit.
+ if ((MIOpc == AArch64::TBZX || MIOpc == AArch64::TBNZX) &&
+ MI.getOperand(1).getImm() != 63)
+ return false;
+ // There must not be any instruction between DefMI and MI that clobbers or
+ // reads NZCV.
+ MachineBasicBlock::iterator I(DefMI), E(MI);
+ for (I = std::next(I); I != E; ++I) {
+ if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+ I->readsRegister(AArch64::NZCV, TRI))
+ return false;
+ }
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(DefMI.print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MI.print(dbgs()));
+
+ NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+ NewBr = convertToCondBr(MI);
+ break;
+ }
+ }
+ break;
+ }
+ (void)NewCmp; (void)NewBr;
+ assert(NewCmp && NewBr && "Expected new instructions.");
+
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(NewCmp->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(NewBr->print(dbgs()));
+
+ // If this was already a flag-setting version of the instruction, we reused
+ // the original instruction and just cleared the dead flag on the
+ // implicit-def of NZCV. Therefore, we should not erase this instruction.
+ if (!IsFlagSetting)
+ DefMI.eraseFromParent();
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ bool LocalChange = false;
+ for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(),
+ E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ MachineInstr *DefMI = getOperandDef(MI.getOperand(0));
+ LocalChange = (DefMI && tryToTuneBranch(MI, *DefMI));
+ break;
+ }
+ // If the optimization was successful, we can't optimize any other
+ // branches because doing so would clobber the NZCV flags.
+ if (LocalChange) {
+ Changed = true;
+ break;
+ }
+ }
+ }
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64CondBrTuning() {
+ return new AArch64CondBrTuning();
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 8b18632..2dfcd2d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -265,10 +265,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// Change immediate in comparison instruction (ADDS or SUBS).
BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
- .addOperand(CmpMI->getOperand(0))
- .addOperand(CmpMI->getOperand(1))
+ .add(CmpMI->getOperand(0))
+ .add(CmpMI->getOperand(1))
.addImm(Imm)
- .addOperand(CmpMI->getOperand(3));
+ .add(CmpMI->getOperand(3));
CmpMI->eraseFromParent();
// The fact that this comparison was picked ensures that it's related to the
@@ -278,7 +278,7 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// Change condition in branch instruction.
BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(Cmp)
- .addOperand(BrMI.getOperand(1));
+ .add(BrMI.getOperand(1));
BrMI.eraseFromParent();
MBB->updateTerminator();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index da09b36..9eda56c 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -139,6 +140,7 @@ class SSACCmpConv {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
+ const MachineBranchProbabilityInfo *MBPI;
public:
/// The first block containing a conditional branch, dominating everything
@@ -186,8 +188,10 @@ private:
public:
/// runOnMachineFunction - Initialize per-function data structures.
- void runOnMachineFunction(MachineFunction &MF) {
+ void runOnMachineFunction(MachineFunction &MF,
+ const MachineBranchProbabilityInfo *MBPI) {
this->MF = &MF;
+ this->MBPI = MBPI;
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -564,8 +568,40 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
// Update the CFG first.
updateTailPHIs();
- Head->removeSuccessor(CmpBB, true);
- CmpBB->removeSuccessor(Tail, true);
+
+ // Save successor probabilities before removing CmpBB and Tail from their
+ // parents.
+ BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Head, CmpBB);
+ BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(CmpBB, Tail);
+
+ Head->removeSuccessor(CmpBB);
+ CmpBB->removeSuccessor(Tail);
+
+ // If Head and CmpBB had successor probabilities, update the probabilities
+ // to reflect the ccmp-conversion.
+ if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) {
+
+ // Head is allowed two successors. We've removed CmpBB, so the remaining
+ // successor is Tail. We need to increase the successor probability for
+ // Tail to account for the CmpBB path we removed.
+ //
+ // Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB).
+ assert(*Head->succ_begin() == Tail && "Head successor is not Tail");
+ BranchProbability Head2Tail = MBPI->getEdgeProbability(Head, Tail);
+ Head->setSuccProbability(Head->succ_begin(),
+ Head2Tail + Head2CmpBB * CmpBB2Tail);
+
+ // We will transfer successors of CmpBB to Head in a moment without
+ // normalizing the successor probabilities. Set the successor probabilities
+ // before doing so.
+ //
+ // Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB).
+ for (auto I = CmpBB->succ_begin(), E = CmpBB->succ_end(); I != E; ++I) {
+ BranchProbability CmpBB2I = MBPI->getEdgeProbability(CmpBB, *I);
+ CmpBB->setSuccProbability(I, Head2CmpBB * CmpBB2I);
+ }
+ }
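As a worked check of the update above: with Pr(CmpBB|Head) = 0.5, Pr(Tail|Head) = 0.5 and Pr(Tail|CmpBB) = 0.4, Tail's probability becomes 0.5 + 0.5 * 0.4 = 0.7, and a remaining CmpBB successor I with Pr(I|CmpBB) = 0.6 is transferred with Pr(I|Head) = 0.5 * 0.6 = 0.3, so Head's outgoing probabilities still sum to 1.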
+
Head->transferSuccessorsAndUpdatePHIs(CmpBB);
DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
TII->removeBranch(*Head);
@@ -594,7 +630,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
BuildMI(*Head, Head->end(), TermDL, MCID)
.addReg(DestReg, RegState::Define | RegState::Dead)
- .addOperand(HeadCond[2])
+ .add(HeadCond[2])
.addImm(0)
.addImm(0);
// SUBS uses the GPR*sp register classes.
@@ -650,13 +686,12 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
if (CmpMI->getOperand(FirstOp + 1).isReg())
MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
TII->getRegClass(MCID, 1, TRI, *MF));
- MachineInstrBuilder MIB =
- BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
- .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+ MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+ .add(CmpMI->getOperand(FirstOp)); // Register Rn
if (isZBranch)
MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
else
- MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+ MIB.add(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
MIB.addImm(NZCV).addImm(HeadCmpBBCC);
// If CmpMI was a terminator, we need a new conditional branch to replace it.
@@ -666,7 +701,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
CmpMI->getOpcode() == AArch64::CBNZX;
BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
- .addOperand(CmpMI->getOperand(1)); // Branch target.
+ .add(CmpMI->getOperand(1)); // Branch target.
}
CmpMI->eraseFromParent();
Head->updateTerminator();
@@ -718,6 +753,7 @@ int SSACCmpConv::expectedCodeSizeDelta() const {
namespace {
class AArch64ConditionalCompares : public MachineFunctionPass {
+ const MachineBranchProbabilityInfo *MBPI;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MCSchedModel SchedModel;
@@ -754,6 +790,7 @@ char AArch64ConditionalCompares::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
@@ -764,6 +801,7 @@ FunctionPass *llvm::createAArch64ConditionalCompares() {
}
void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -893,12 +931,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
MinSize = MF.getFunction()->optForMinSize();
bool Changed = false;
- CmpConv.runOnMachineFunction(MF);
+ CmpConv.runOnMachineFunction(MF, MBPI);
// Visit blocks in dominator tree pre-order. The pre-order enables multiple
// cmp-conversions from the same head block.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 30e2b23..b72f23b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -13,15 +13,17 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-dead-defs"
@@ -84,6 +86,55 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
continue;
}
+ if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) {
+ // XZR/WZR for LSE can only be used when acquire semantics are not used;
+ // LDOPAL WZR is an invalid opcode.
+ switch (MI.getOpcode()) {
+ case AArch64::CASALb:
+ case AArch64::CASALh:
+ case AArch64::CASALs:
+ case AArch64::CASALd:
+ case AArch64::SWPALb:
+ case AArch64::SWPALh:
+ case AArch64::SWPALs:
+ case AArch64::SWPALd:
+ case AArch64::LDADDALb:
+ case AArch64::LDADDALh:
+ case AArch64::LDADDALs:
+ case AArch64::LDADDALd:
+ case AArch64::LDCLRALb:
+ case AArch64::LDCLRALh:
+ case AArch64::LDCLRALs:
+ case AArch64::LDCLRALd:
+ case AArch64::LDEORALb:
+ case AArch64::LDEORALh:
+ case AArch64::LDEORALs:
+ case AArch64::LDEORALd:
+ case AArch64::LDSETALb:
+ case AArch64::LDSETALh:
+ case AArch64::LDSETALs:
+ case AArch64::LDSETALd:
+ case AArch64::LDSMINALb:
+ case AArch64::LDSMINALh:
+ case AArch64::LDSMINALs:
+ case AArch64::LDSMINALd:
+ case AArch64::LDSMAXALb:
+ case AArch64::LDSMAXALh:
+ case AArch64::LDSMAXALs:
+ case AArch64::LDSMAXALd:
+ case AArch64::LDUMINALb:
+ case AArch64::LDUMINALh:
+ case AArch64::LDUMINALs:
+ case AArch64::LDUMINALd:
+ case AArch64::LDUMAXALb:
+ case AArch64::LDUMAXALh:
+ case AArch64::LDUMAXALs:
+ case AArch64::LDUMAXALd:
+ continue;
+ default:
+ break;
+ }
+ }
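The opcode list above exists because rewriting a dead result register to WZR/XZR would turn a legal acquire-release atomic into an invalid encoding. A hedged illustration of how such a dead def arises, assuming a target with LSE:

#include <atomic>
// The fetch_add result is unused, so its def is dead; with LSE this lowers
// to LDADDAL, whose destination must stay a real register, not wzr/xzr.
void bump(std::atomic<int> &Counter) {
  Counter.fetch_add(1, std::memory_order_acq_rel);
}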
const MCInstrDesc &Desc = MI.getDesc();
for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) {
MachineOperand &MO = MI.getOperand(I);
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index fe1c0be..d52cd84 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -14,9 +14,10 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -70,9 +71,9 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
const MachineOperand &MO = OldMI.getOperand(i);
assert(MO.isReg() && MO.getReg());
if (MO.isUse())
- UseMI.addOperand(MO);
+ UseMI.add(MO);
else
- DefMI.addOperand(MO);
+ DefMI.add(MO);
}
}
@@ -112,7 +113,7 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
// Create the ORR-immediate instruction.
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -179,7 +180,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
// Create the ORR-immediate instruction.
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -362,7 +363,7 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -425,7 +426,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
.addImm(Encoding);
transferImpOps(MI, MIB, MIB);
@@ -539,15 +540,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
if (Imm != 0) {
unsigned LZ = countLeadingZeros(Imm);
unsigned TZ = countTrailingZeros(Imm);
- Shift = ((63 - LZ) / 16) * 16;
- LastShift = (TZ / 16) * 16;
+ Shift = (TZ / 16) * 16;
+ LastShift = ((63 - LZ) / 16) * 16;
}
unsigned Imm16 = (Imm >> Shift) & Mask;
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
.addReg(DstReg, RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
+ getDeadRegState(DstIsDead && Shift == LastShift))
.addImm(Imm16)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
@@ -564,15 +565,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineInstrBuilder MIB2;
unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
- while (Shift != LastShift) {
- Shift -= 16;
+ while (Shift < LastShift) {
+ Shift += 16;
Imm16 = (Imm >> Shift) & Mask;
if (Imm16 == (isNeg ? Mask : 0))
continue; // This 16-bit portion is already set correctly.
MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
.addReg(DstReg,
RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
+ getDeadRegState(DstIsDead && Shift == LastShift))
.addReg(DstReg)
.addImm(Imm16)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
@@ -583,27 +584,21 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
-static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- MBB->addLiveIn(*I);
-}
-
bool AArch64ExpandPseudo::expandCMP_SWAP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Dest = MI.getOperand(0);
unsigned StatusReg = MI.getOperand(1).getReg();
- MachineOperand &Addr = MI.getOperand(2);
- MachineOperand &Desired = MI.getOperand(3);
- MachineOperand &New = MI.getOperand(4);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ bool StatusDead = MI.getOperand(1).isDead();
+  // Duplicating undef operands into two instructions does not guarantee the
+  // same value on both; however, undef should be replaced by XZR anyway.
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(2).getReg();
+ unsigned DesiredReg = MI.getOperand(3).getReg();
+ unsigned NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -615,19 +610,18 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
MF->insert(++StoreBB->getIterator(), DoneBB);
// .Lloadcmp:
+ // mov wStatus, 0
// ldaxr xDest, [xAddr]
// cmp xDest, xDesired
// b.ne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(Dest.getReg());
- LoadCmpBB->addLiveIn(Desired.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
+ if (!StatusDead)
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg)
+ .addImm(0).addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
- .addReg(Addr.getReg());
+ .addReg(AddrReg);
BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
.addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
- .addOperand(Desired)
+ .addReg(DesiredReg)
.addImm(ExtendImm);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
.addImm(AArch64CC::NE)
@@ -639,27 +633,35 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
// .Lstore:
// stlxr wStatus, xNew, [xAddr]
// cbnz wStatus, .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(New.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
-
BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
- .addOperand(New)
- .addOperand(Addr);
+ .addReg(NewReg)
+ .addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
- .addReg(StatusReg, RegState::Kill)
+ .addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute livein lists.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+  // Do an extra pass around the loop to get the loop-carried registers right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
@@ -672,16 +674,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
MachineOperand &DestLo = MI.getOperand(0);
MachineOperand &DestHi = MI.getOperand(1);
unsigned StatusReg = MI.getOperand(2).getReg();
- MachineOperand &Addr = MI.getOperand(3);
- MachineOperand &DesiredLo = MI.getOperand(4);
- MachineOperand &DesiredHi = MI.getOperand(5);
- MachineOperand &NewLo = MI.getOperand(6);
- MachineOperand &NewHi = MI.getOperand(7);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ bool StatusDead = MI.getOperand(2).isDead();
+  // Duplicating undef operands into two instructions does not guarantee the
+  // same value on both; however, undef should be replaced by XZR anyway.
+ assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(3).getReg();
+ unsigned DesiredLoReg = MI.getOperand(4).getReg();
+ unsigned DesiredHiReg = MI.getOperand(5).getReg();
+ unsigned NewLoReg = MI.getOperand(6).getReg();
+ unsigned NewHiReg = MI.getOperand(7).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -697,20 +698,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// cmp xDestLo, xDesiredLo
// sbcs xDestHi, xDesiredHi
// b.ne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(DestLo.getReg());
- LoadCmpBB->addLiveIn(DestHi.getReg());
- LoadCmpBB->addLiveIn(DesiredLo.getReg());
- LoadCmpBB->addLiveIn(DesiredHi.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
.addReg(DestLo.getReg(), RegState::Define)
.addReg(DestHi.getReg(), RegState::Define)
- .addReg(Addr.getReg());
+ .addReg(AddrReg);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
- .addOperand(DesiredLo)
+ .addReg(DesiredLoReg)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(AArch64::WZR)
@@ -718,14 +712,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
- .addOperand(DesiredHi)
+ .addReg(DesiredHiReg)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(StatusReg, RegState::Kill)
.addUse(StatusReg, RegState::Kill)
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
- .addUse(StatusReg, RegState::Kill)
+ .addUse(StatusReg, getKillRegState(StatusDead))
.addMBB(DoneBB);
LoadCmpBB->addSuccessor(DoneBB);
LoadCmpBB->addSuccessor(StoreBB);
@@ -733,28 +727,36 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// .Lstore:
// stlxp wStatus, xNewLo, xNewHi, [xAddr]
// cbnz wStatus, .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(NewLo.getReg());
- StoreBB->addLiveIn(NewHi.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
- .addOperand(NewLo)
- .addOperand(NewHi)
- .addOperand(Addr);
+ .addReg(NewLoReg)
+ .addReg(NewHiReg)
+ .addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
- .addReg(StatusReg, RegState::Kill)
+ .addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute liveness bottom up.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+  // Do an extra pass around the loop to get the loop-carried dependencies
+  // right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
@@ -825,8 +827,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
transferImpOps(MI, MIB1, MIB1);
MI.eraseFromParent();
@@ -842,7 +844,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(DstReg);
if (MO1.isGlobal()) {
@@ -878,19 +880,31 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
unsigned DstReg = MI.getOperand(0).getReg();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
- .addOperand(MI.getOperand(1));
+ .add(MI.getOperand(1));
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(DstReg)
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(2))
.addImm(0);
transferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
return true;
}
+ case AArch64::MOVbaseTLS: {
+ unsigned DstReg = MI.getOperand(0).getReg();
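+    // The TLS base pointer normally lives in TPIDR_EL0; the Fuchsia kernel
+    // code model is the one exception and reads TPIDR_EL1 instead.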
+ auto SysReg = AArch64SysReg::TPIDR_EL0;
+ MachineFunction *MF = MBB.getParent();
+ if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
+ MF->getTarget().getCodeModel() == CodeModel::Kernel)
+ SysReg = AArch64SysReg::TPIDR_EL1;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
+ .addImm(SysReg);
+ MI.eraseFromParent();
+ return true;
+ }
case AArch64::MOVi32imm:
return expandMOVImm(MBB, MBBI, 32);
@@ -931,6 +945,19 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
AArch64::XZR, NextMBBI);
case AArch64::CMP_SWAP_128:
return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
+
+ case AArch64::AESMCrrTied:
+ case AArch64::AESIMCrrTied: {
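+    // The tied pseudos only exist to constrain register allocation (likely
+    // so an AES result stays in place for instruction fusion); expansion
+    // simply emits the plain AESMC/AESIMC with the same operands.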
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
+ AArch64::AESIMCrr))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
}
return false;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
new file mode 100644
index 0000000..c0e2235
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -0,0 +1,790 @@
+//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
+/// that may inhibit the HW prefetching. This is done in two steps. Before
+/// ISel, we mark strided loads (i.e. those that will likely benefit from
+/// prefetching) with metadata. Then, after opcodes have been finalized, we
+/// insert MOVs and rewrite loads to prevent unintentional tag collisions.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "falkor-hwpf-fix"
+
+STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
+STATISTIC(NumCollisionsAvoided,
+ "Number of HW prefetch tag collisions avoided");
+STATISTIC(NumCollisionsNotAvoided,
+          "Number of HW prefetch tag collisions not avoided due to lack of "
+          "registers");
+
+namespace {
+
+class FalkorMarkStridedAccesses {
+public:
+ FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
+ : LI(LI), SE(SE) {}
+
+ bool run();
+
+private:
+ bool runOnLoop(Loop &L);
+
+ LoopInfo &LI;
+ ScalarEvolution &SE;
+};
+
+class FalkorMarkStridedAccessesLegacy : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
+ initializeFalkorMarkStridedAccessesLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ // FIXME: For some reason, preserving SE here breaks LSR (even if
+ // this pass changes nothing).
+ // AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char FalkorMarkStridedAccessesLegacy::ID = 0;
+INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+ "Falkor HW Prefetch Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+ "Falkor HW Prefetch Fix", false, false)
+
+FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
+ return new FalkorMarkStridedAccessesLegacy();
+}
+
+bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
+ TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const AArch64Subtarget *ST =
+ TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
+ if (ST->getProcFamily() != AArch64Subtarget::Falkor)
+ return false;
+
+ if (skipFunction(F))
+ return false;
+
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ FalkorMarkStridedAccesses LDP(LI, SE);
+ return LDP.run();
+}
+
+bool FalkorMarkStridedAccesses::run() {
+ bool MadeChange = false;
+
+ for (Loop *L : LI)
+ for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt)
+ MadeChange |= runOnLoop(**LIt);
+
+ return MadeChange;
+}
+
+bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
+  // Only mark strided loads in the innermost loop.
+ if (!L.empty())
+ return false;
+
+ bool MadeChange = false;
+
+ for (BasicBlock *BB : L.blocks()) {
+ for (Instruction &I : *BB) {
+ LoadInst *LoadI = dyn_cast<LoadInst>(&I);
+ if (!LoadI)
+ continue;
+
+ Value *PtrValue = LoadI->getPointerOperand();
+ if (L.isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE.getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+ continue;
+
+ LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
+ MDNode::get(LoadI->getContext(), {}));
+ ++NumStridedLoadsMarked;
+ DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+namespace {
+
+class FalkorHWPFFix : public MachineFunctionPass {
+public:
+ static char ID;
+
+ FalkorHWPFFix() : MachineFunctionPass(ID) {
+ initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ void runOnLoop(MachineLoop &L, MachineFunction &Fn);
+
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
+ bool Modified;
+};
+
+/// The load operands (destination, base, and offset) used to compute the HW
+/// prefetcher instruction tag.
+struct LoadInfo {
+ LoadInfo()
+ : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr),
+ IsPrePost(false) {}
+ unsigned DestReg;
+ unsigned BaseReg;
+ int BaseRegIdx;
+ const MachineOperand *OffsetOpnd;
+ bool IsPrePost;
+};
+
+} // namespace
+
+char FalkorHWPFFix::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late",
+ "Falkor HW Prefetch Fix Late Phase", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late",
+ "Falkor HW Prefetch Fix Late Phase", false, false)
+
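+/// Compose a HW prefetcher tag from the destination, base, and offset
+/// encodings; e.g. makeTag(1, 2, 3) == 0x321, since the destination occupies
+/// bits [3:0], the base bits [7:4], and the offset bits [13:8].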
+static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
+ return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
+}
+
+static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
+ int DestRegIdx;
+ int BaseRegIdx;
+ int OffsetIdx;
+ bool IsPrePost;
+
+ switch (MI.getOpcode()) {
+ default:
+ return None;
+
+ case AArch64::LD1i8:
+ case AArch64::LD1i16:
+ case AArch64::LD1i32:
+ case AArch64::LD1i64:
+ case AArch64::LD2i8:
+ case AArch64::LD2i16:
+ case AArch64::LD2i32:
+ case AArch64::LD2i64:
+ case AArch64::LD3i8:
+ case AArch64::LD3i16:
+ case AArch64::LD3i32:
+ case AArch64::LD4i8:
+ case AArch64::LD4i16:
+ case AArch64::LD4i32:
+ DestRegIdx = 0;
+ BaseRegIdx = 3;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD3i64:
+ case AArch64::LD4i64:
+ DestRegIdx = -1;
+ BaseRegIdx = 3;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Onev1d:
+ case AArch64::LD1Onev2s:
+ case AArch64::LD1Onev4h:
+ case AArch64::LD1Onev8b:
+ case AArch64::LD1Onev2d:
+ case AArch64::LD1Onev4s:
+ case AArch64::LD1Onev8h:
+ case AArch64::LD1Onev16b:
+ case AArch64::LD1Rv1d:
+ case AArch64::LD1Rv2s:
+ case AArch64::LD1Rv4h:
+ case AArch64::LD1Rv8b:
+ case AArch64::LD1Rv2d:
+ case AArch64::LD1Rv4s:
+ case AArch64::LD1Rv8h:
+ case AArch64::LD1Rv16b:
+ case AArch64::LD1Twov1d:
+ case AArch64::LD1Twov2s:
+ case AArch64::LD1Twov4h:
+ case AArch64::LD1Twov8b:
+ case AArch64::LD2Twov2s:
+ case AArch64::LD2Twov4s:
+ case AArch64::LD2Twov8b:
+ case AArch64::LD2Rv1d:
+ case AArch64::LD2Rv2s:
+ case AArch64::LD2Rv4s:
+ case AArch64::LD2Rv8b:
+ DestRegIdx = 0;
+ BaseRegIdx = 1;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Twov2d:
+ case AArch64::LD1Twov4s:
+ case AArch64::LD1Twov8h:
+ case AArch64::LD1Twov16b:
+ case AArch64::LD1Threev1d:
+ case AArch64::LD1Threev2s:
+ case AArch64::LD1Threev4h:
+ case AArch64::LD1Threev8b:
+ case AArch64::LD1Threev2d:
+ case AArch64::LD1Threev4s:
+ case AArch64::LD1Threev8h:
+ case AArch64::LD1Threev16b:
+ case AArch64::LD1Fourv1d:
+ case AArch64::LD1Fourv2s:
+ case AArch64::LD1Fourv4h:
+ case AArch64::LD1Fourv8b:
+ case AArch64::LD1Fourv2d:
+ case AArch64::LD1Fourv4s:
+ case AArch64::LD1Fourv8h:
+ case AArch64::LD1Fourv16b:
+ case AArch64::LD2Twov2d:
+ case AArch64::LD2Twov4h:
+ case AArch64::LD2Twov8h:
+ case AArch64::LD2Twov16b:
+ case AArch64::LD2Rv2d:
+ case AArch64::LD2Rv4h:
+ case AArch64::LD2Rv8h:
+ case AArch64::LD2Rv16b:
+ case AArch64::LD3Threev2s:
+ case AArch64::LD3Threev4h:
+ case AArch64::LD3Threev8b:
+ case AArch64::LD3Threev2d:
+ case AArch64::LD3Threev4s:
+ case AArch64::LD3Threev8h:
+ case AArch64::LD3Threev16b:
+ case AArch64::LD3Rv1d:
+ case AArch64::LD3Rv2s:
+ case AArch64::LD3Rv4h:
+ case AArch64::LD3Rv8b:
+ case AArch64::LD3Rv2d:
+ case AArch64::LD3Rv4s:
+ case AArch64::LD3Rv8h:
+ case AArch64::LD3Rv16b:
+ case AArch64::LD4Fourv2s:
+ case AArch64::LD4Fourv4h:
+ case AArch64::LD4Fourv8b:
+ case AArch64::LD4Fourv2d:
+ case AArch64::LD4Fourv4s:
+ case AArch64::LD4Fourv8h:
+ case AArch64::LD4Fourv16b:
+ case AArch64::LD4Rv1d:
+ case AArch64::LD4Rv2s:
+ case AArch64::LD4Rv4h:
+ case AArch64::LD4Rv8b:
+ case AArch64::LD4Rv2d:
+ case AArch64::LD4Rv4s:
+ case AArch64::LD4Rv8h:
+ case AArch64::LD4Rv16b:
+ DestRegIdx = -1;
+ BaseRegIdx = 1;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1i8_POST:
+ case AArch64::LD1i16_POST:
+ case AArch64::LD1i32_POST:
+ case AArch64::LD1i64_POST:
+ case AArch64::LD2i8_POST:
+ case AArch64::LD2i16_POST:
+ case AArch64::LD2i32_POST:
+ case AArch64::LD2i64_POST:
+ case AArch64::LD3i8_POST:
+ case AArch64::LD3i16_POST:
+ case AArch64::LD3i32_POST:
+ case AArch64::LD4i8_POST:
+ case AArch64::LD4i16_POST:
+ case AArch64::LD4i32_POST:
+ DestRegIdx = 1;
+ BaseRegIdx = 4;
+ OffsetIdx = 5;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD3i64_POST:
+ case AArch64::LD4i64_POST:
+ DestRegIdx = -1;
+ BaseRegIdx = 4;
+ OffsetIdx = 5;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Onev1d_POST:
+ case AArch64::LD1Onev2s_POST:
+ case AArch64::LD1Onev4h_POST:
+ case AArch64::LD1Onev8b_POST:
+ case AArch64::LD1Onev2d_POST:
+ case AArch64::LD1Onev4s_POST:
+ case AArch64::LD1Onev8h_POST:
+ case AArch64::LD1Onev16b_POST:
+ case AArch64::LD1Rv1d_POST:
+ case AArch64::LD1Rv2s_POST:
+ case AArch64::LD1Rv4h_POST:
+ case AArch64::LD1Rv8b_POST:
+ case AArch64::LD1Rv2d_POST:
+ case AArch64::LD1Rv4s_POST:
+ case AArch64::LD1Rv8h_POST:
+ case AArch64::LD1Rv16b_POST:
+ case AArch64::LD1Twov1d_POST:
+ case AArch64::LD1Twov2s_POST:
+ case AArch64::LD1Twov4h_POST:
+ case AArch64::LD1Twov8b_POST:
+ case AArch64::LD2Twov2s_POST:
+ case AArch64::LD2Twov4s_POST:
+ case AArch64::LD2Twov8b_POST:
+ case AArch64::LD2Rv1d_POST:
+ case AArch64::LD2Rv2s_POST:
+ case AArch64::LD2Rv4s_POST:
+ case AArch64::LD2Rv8b_POST:
+ DestRegIdx = 1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Twov2d_POST:
+ case AArch64::LD1Twov4s_POST:
+ case AArch64::LD1Twov8h_POST:
+ case AArch64::LD1Twov16b_POST:
+ case AArch64::LD1Threev1d_POST:
+ case AArch64::LD1Threev2s_POST:
+ case AArch64::LD1Threev4h_POST:
+ case AArch64::LD1Threev8b_POST:
+ case AArch64::LD1Threev2d_POST:
+ case AArch64::LD1Threev4s_POST:
+ case AArch64::LD1Threev8h_POST:
+ case AArch64::LD1Threev16b_POST:
+ case AArch64::LD1Fourv1d_POST:
+ case AArch64::LD1Fourv2s_POST:
+ case AArch64::LD1Fourv4h_POST:
+ case AArch64::LD1Fourv8b_POST:
+ case AArch64::LD1Fourv2d_POST:
+ case AArch64::LD1Fourv4s_POST:
+ case AArch64::LD1Fourv8h_POST:
+ case AArch64::LD1Fourv16b_POST:
+ case AArch64::LD2Twov2d_POST:
+ case AArch64::LD2Twov4h_POST:
+ case AArch64::LD2Twov8h_POST:
+ case AArch64::LD2Twov16b_POST:
+ case AArch64::LD2Rv2d_POST:
+ case AArch64::LD2Rv4h_POST:
+ case AArch64::LD2Rv8h_POST:
+ case AArch64::LD2Rv16b_POST:
+ case AArch64::LD3Threev2s_POST:
+ case AArch64::LD3Threev4h_POST:
+ case AArch64::LD3Threev8b_POST:
+ case AArch64::LD3Threev2d_POST:
+ case AArch64::LD3Threev4s_POST:
+ case AArch64::LD3Threev8h_POST:
+ case AArch64::LD3Threev16b_POST:
+ case AArch64::LD3Rv1d_POST:
+ case AArch64::LD3Rv2s_POST:
+ case AArch64::LD3Rv4h_POST:
+ case AArch64::LD3Rv8b_POST:
+ case AArch64::LD3Rv2d_POST:
+ case AArch64::LD3Rv4s_POST:
+ case AArch64::LD3Rv8h_POST:
+ case AArch64::LD3Rv16b_POST:
+ case AArch64::LD4Fourv2s_POST:
+ case AArch64::LD4Fourv4h_POST:
+ case AArch64::LD4Fourv8b_POST:
+ case AArch64::LD4Fourv2d_POST:
+ case AArch64::LD4Fourv4s_POST:
+ case AArch64::LD4Fourv8h_POST:
+ case AArch64::LD4Fourv16b_POST:
+ case AArch64::LD4Rv1d_POST:
+ case AArch64::LD4Rv2s_POST:
+ case AArch64::LD4Rv4h_POST:
+ case AArch64::LD4Rv8b_POST:
+ case AArch64::LD4Rv2d_POST:
+ case AArch64::LD4Rv4s_POST:
+ case AArch64::LD4Rv8h_POST:
+ case AArch64::LD4Rv16b_POST:
+ DestRegIdx = -1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDRBBroW:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRBBui:
+ case AArch64::LDRBroW:
+ case AArch64::LDRBroX:
+ case AArch64::LDRBui:
+ case AArch64::LDRDl:
+ case AArch64::LDRDroW:
+ case AArch64::LDRDroX:
+ case AArch64::LDRDui:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRHHui:
+ case AArch64::LDRHroW:
+ case AArch64::LDRHroX:
+ case AArch64::LDRHui:
+ case AArch64::LDRQl:
+ case AArch64::LDRQroW:
+ case AArch64::LDRQroX:
+ case AArch64::LDRQui:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWl:
+ case AArch64::LDRSWroW:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSWui:
+ case AArch64::LDRSl:
+ case AArch64::LDRSroW:
+ case AArch64::LDRSroX:
+ case AArch64::LDRSui:
+ case AArch64::LDRWl:
+ case AArch64::LDRWroW:
+ case AArch64::LDRWroX:
+ case AArch64::LDRWui:
+ case AArch64::LDRXl:
+ case AArch64::LDRXroW:
+ case AArch64::LDRXroX:
+ case AArch64::LDRXui:
+ case AArch64::LDURBBi:
+ case AArch64::LDURBi:
+ case AArch64::LDURDi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURHi:
+ case AArch64::LDURQi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ DestRegIdx = 0;
+ BaseRegIdx = 1;
+ OffsetIdx = 2;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDRBBpost:
+ case AArch64::LDRBBpre:
+ case AArch64::LDRBpost:
+ case AArch64::LDRBpre:
+ case AArch64::LDRDpost:
+ case AArch64::LDRDpre:
+ case AArch64::LDRHHpost:
+ case AArch64::LDRHHpre:
+ case AArch64::LDRHpost:
+ case AArch64::LDRHpre:
+ case AArch64::LDRQpost:
+ case AArch64::LDRQpre:
+ case AArch64::LDRSBWpost:
+ case AArch64::LDRSBWpre:
+ case AArch64::LDRSBXpost:
+ case AArch64::LDRSBXpre:
+ case AArch64::LDRSHWpost:
+ case AArch64::LDRSHWpre:
+ case AArch64::LDRSHXpost:
+ case AArch64::LDRSHXpre:
+ case AArch64::LDRSWpost:
+ case AArch64::LDRSWpre:
+ case AArch64::LDRSpost:
+ case AArch64::LDRSpre:
+ case AArch64::LDRWpost:
+ case AArch64::LDRWpre:
+ case AArch64::LDRXpost:
+ case AArch64::LDRXpre:
+ DestRegIdx = 1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = true;
+ break;
+
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ DestRegIdx = -1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDPSWi:
+ case AArch64::LDPSi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ DestRegIdx = 0;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDPQpost:
+ case AArch64::LDPQpre:
+ DestRegIdx = -1;
+ BaseRegIdx = 3;
+ OffsetIdx = 4;
+ IsPrePost = true;
+ break;
+
+ case AArch64::LDPDpost:
+ case AArch64::LDPDpre:
+ case AArch64::LDPSWpost:
+ case AArch64::LDPSWpre:
+ case AArch64::LDPSpost:
+ case AArch64::LDPSpre:
+ case AArch64::LDPWpost:
+ case AArch64::LDPWpre:
+ case AArch64::LDPXpost:
+ case AArch64::LDPXpre:
+ DestRegIdx = 1;
+ BaseRegIdx = 3;
+ OffsetIdx = 4;
+ IsPrePost = true;
+ break;
+ }
+
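+  // An index of -1 means the operand does not exist or should not take part
+  // in the tag: the destination is then encoded as 0 and the offset operand
+  // is left null.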
+ LoadInfo LI;
+ LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg();
+ LI.BaseReg = MI.getOperand(BaseRegIdx).getReg();
+ LI.BaseRegIdx = BaseRegIdx;
+ LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
+ LI.IsPrePost = IsPrePost;
+ return LI;
+}
+
+static Optional<unsigned> getTag(const TargetRegisterInfo *TRI,
+ const MachineInstr &MI, const LoadInfo &LI) {
+ unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
+ unsigned Base = TRI->getEncodingValue(LI.BaseReg);
+ unsigned Off;
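+  // Global/symbol/constant-pool offsets have no fixed encoding, so no tag
+  // can be computed for them; register offsets get bit 5 set to keep them
+  // distinct from scaled immediate offsets.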
+ if (LI.OffsetOpnd == nullptr)
+ Off = 0;
+ else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
+ LI.OffsetOpnd->isCPI())
+ return None;
+ else if (LI.OffsetOpnd->isReg())
+ Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
+ else
+ Off = LI.OffsetOpnd->getImm() >> 2;
+
+ return makeTag(Dest, Base, Off);
+}
+
+void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
+ // Build the initial tag map for the whole loop.
+ TagMap.clear();
+ for (MachineBasicBlock *MBB : L.getBlocks())
+ for (MachineInstr &MI : *MBB) {
+ Optional<LoadInfo> LInfo = getLoadInfo(MI);
+ if (!LInfo)
+ continue;
+ Optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
+ if (!Tag)
+ continue;
+ TagMap[*Tag].push_back(&MI);
+ }
+
+ bool AnyCollisions = false;
+ for (auto &P : TagMap) {
+ auto Size = P.second.size();
+ if (Size > 1) {
+ for (auto *MI : P.second) {
+ if (TII->isStridedAccess(*MI)) {
+ AnyCollisions = true;
+ break;
+ }
+ }
+ }
+ if (AnyCollisions)
+ break;
+ }
+ // Nothing to fix.
+ if (!AnyCollisions)
+ return;
+
+ MachineRegisterInfo &MRI = Fn.getRegInfo();
+
+  // Go through all the basic blocks in the current loop and fix any strided
+  // loads so they avoid tag collisions with any other loads.
+ LiveRegUnits LR(*TRI);
+ for (MachineBasicBlock *MBB : L.getBlocks()) {
+ LR.clear();
+ LR.addLiveOuts(*MBB);
+ for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
+ MachineInstr &MI = *I;
+ if (!TII->isStridedAccess(MI))
+ continue;
+
+ LoadInfo LdI = *getLoadInfo(MI);
+ unsigned OldTag = *getTag(TRI, MI, LdI);
+ auto &OldCollisions = TagMap[OldTag];
+ if (OldCollisions.size() <= 1)
+ continue;
+
+ bool Fixed = false;
+ DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
+
+ for (unsigned ScratchReg : AArch64::GPR64RegClass) {
+ if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
+ continue;
+
+ LoadInfo NewLdI(LdI);
+ NewLdI.BaseReg = ScratchReg;
+ unsigned NewTag = *getTag(TRI, MI, NewLdI);
+ // Scratch reg tag would collide too, so don't use it.
+ if (TagMap.count(NewTag))
+ continue;
+
+ DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI)
+ << '\n');
+
+ // Rewrite:
+ // Xd = LOAD Xb, off
+ // to:
+ // Xc = MOV Xb
+ // Xd = LOAD Xc, off
+ DebugLoc DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
+ .addReg(AArch64::XZR)
+ .addReg(LdI.BaseReg)
+ .addImm(0);
+ MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
+ BaseOpnd.setReg(ScratchReg);
+
+ // If the load does a pre/post increment, then insert a MOV after as
+ // well to update the real base register.
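+      // The net rewrite for the pre/post form is:
+      //   Xc = MOV Xb
+      //   Xd, Xc = LOAD Xc, off   ; writeback retargeted to Xc
+      //   Xb = MOV Xc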
+ if (LdI.IsPrePost) {
+ DEBUG(dbgs() << "Doing post MOV of incremented reg: "
+ << PrintReg(ScratchReg, TRI) << '\n');
+        // Retarget the tied pre/post writeback destination as well.
+        MI.getOperand(0).setReg(ScratchReg);
+ BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
+ TII->get(AArch64::ORRXrs), LdI.BaseReg)
+ .addReg(AArch64::XZR)
+ .addReg(ScratchReg)
+ .addImm(0);
+ }
+
+ for (int I = 0, E = OldCollisions.size(); I != E; ++I)
+ if (OldCollisions[I] == &MI) {
+ std::swap(OldCollisions[I], OldCollisions[E - 1]);
+ OldCollisions.pop_back();
+ break;
+ }
+
+ // Update TagMap to reflect instruction changes to reduce the number
+ // of later MOVs to be inserted. This needs to be done after
+ // OldCollisions is updated since it may be relocated by this
+ // insertion.
+ TagMap[NewTag].push_back(&MI);
+ ++NumCollisionsAvoided;
+ Fixed = true;
+ Modified = true;
+ break;
+ }
+ if (!Fixed)
+ ++NumCollisionsNotAvoided;
+ }
+ }
+}
+
+bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
+ auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ if (ST.getProcFamily() != AArch64Subtarget::Falkor)
+ return false;
+
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
+ TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+ TRI = ST.getRegisterInfo();
+
+ assert(TRI->trackLivenessAfterRegAlloc(Fn) &&
+ "Register liveness not available!");
+
+ MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
+
+ Modified = false;
+
+ for (MachineLoop *I : LI)
+ for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+      // Only process innermost loops.
+ if (L->empty())
+ runOnLoop(**L, Fn);
+
+ return Modified;
+}
+
+FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index fe2c2d4..9739605 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -15,28 +15,62 @@
#include "AArch64.h"
#include "AArch64CallingConvention.h"
+#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -50,48 +84,55 @@ class AArch64FastISel final : public FastISel {
} BaseKind;
private:
- BaseKind Kind;
- AArch64_AM::ShiftExtendType ExtType;
+ BaseKind Kind = RegBase;
+ AArch64_AM::ShiftExtendType ExtType = AArch64_AM::InvalidShiftExtend;
union {
unsigned Reg;
int FI;
} Base;
- unsigned OffsetReg;
- unsigned Shift;
- int64_t Offset;
- const GlobalValue *GV;
+ unsigned OffsetReg = 0;
+ unsigned Shift = 0;
+ int64_t Offset = 0;
+ const GlobalValue *GV = nullptr;
public:
- Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
- OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
+ Address() { Base.Reg = 0; }
+
void setKind(BaseKind K) { Kind = K; }
BaseKind getKind() const { return Kind; }
void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
+
void setReg(unsigned Reg) {
assert(isRegBase() && "Invalid base register access!");
Base.Reg = Reg;
}
+
unsigned getReg() const {
assert(isRegBase() && "Invalid base register access!");
return Base.Reg;
}
+
void setOffsetReg(unsigned Reg) {
OffsetReg = Reg;
}
+
unsigned getOffsetReg() const {
return OffsetReg;
}
+
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
Base.FI = FI;
}
+
unsigned getFI() const {
assert(isFIBase() && "Invalid base frame index access!");
return Base.FI;
}
+
void setOffset(int64_t O) { Offset = O; }
int64_t getOffset() { return Offset; }
void setShift(unsigned S) { Shift = S; }
@@ -417,7 +458,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
// MachO still uses GOT for large code-model accesses, but ELF requires
// movz/movk sequences, which FastISel doesn't handle yet.
- if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+ if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
return 0;
unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
@@ -531,23 +572,23 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
switch (Opcode) {
default:
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
// Look through bitcasts.
return computeAddress(U->getOperand(0), Addr, Ty);
- }
- case Instruction::IntToPtr: {
+
+ case Instruction::IntToPtr:
// Look past no-op inttoptrs.
if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
- }
- case Instruction::PtrToInt: {
+
+ case Instruction::PtrToInt:
// Look past no-op ptrtoints.
if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
- }
+
case Instruction::GetElementPtr: {
Address SavedAddr = Addr;
uint64_t TmpOffset = Addr.getOffset();
@@ -563,7 +604,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
+ while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
@@ -1241,6 +1282,10 @@ unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ if (LHSReg == AArch64::SP || LHSReg == AArch64::WSP ||
+ RHSReg == AArch64::SP || RHSReg == AArch64::WSP)
+ return 0;
+
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -1321,6 +1366,8 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ assert(LHSReg != AArch64::SP && LHSReg != AArch64::WSP &&
+ RHSReg != AArch64::SP && RHSReg != AArch64::WSP);
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -1362,6 +1409,8 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ assert(LHSReg != AArch64::XZR && LHSReg != AArch64::WZR &&
+ RHSReg != AArch64::XZR && RHSReg != AArch64::WZR);
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -2065,7 +2114,7 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr,
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type.");
- case MVT::i1: VTIsi1 = true;
+ case MVT::i1: VTIsi1 = true; LLVM_FALLTHROUGH;
case MVT::i8: Opc = OpcTable[Idx][0]; break;
case MVT::i16: Opc = OpcTable[Idx][1]; break;
case MVT::i32: Opc = OpcTable[Idx][2]; break;
@@ -2786,7 +2835,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
return false;
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
- if (SrcVT == MVT::f128)
+ if (SrcVT == MVT::f128 || SrcVT == MVT::f16)
return false;
unsigned Opc;
@@ -2813,8 +2862,12 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
MVT DestVT;
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
- assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
- "Unexpected value type.");
+ // Let regular ISEL handle FP16
+ if (DestVT == MVT::f16)
+ return false;
+
+ assert((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+ "Unexpected value type.");
unsigned SrcReg = getRegForValue(I->getOperand(0));
if (!SrcReg)
@@ -2866,16 +2919,13 @@ bool AArch64FastISel::fastLowerArguments() {
// Only handle simple cases of up to 8 GPR and FPR each.
unsigned GPRCnt = 0;
unsigned FPRCnt = 0;
- unsigned Idx = 0;
for (auto const &Arg : F->args()) {
- // The first argument is at index 1.
- ++Idx;
- if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
- F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
- F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
- F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ if (Arg.hasAttribute(Attribute::ByVal) ||
+ Arg.hasAttribute(Attribute::InReg) ||
+ Arg.hasAttribute(Attribute::StructRet) ||
+ Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
@@ -2976,7 +3026,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Process the args.
for (CCValAssign &VA : ArgLocs) {
@@ -3106,8 +3156,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
CodeModel::Model CM = TM.getCodeModel();
- // Only support the small and large code model.
- if (CM != CodeModel::Small && CM != CodeModel::Large)
+ // Only support the small-addressing and large code models.
+ if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
return false;
// FIXME: Add large code model support for ELF.
@@ -3158,7 +3208,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue the call.
MachineInstrBuilder MIB;
- if (CM == CodeModel::Small) {
+ if (Subtarget->useSmallAddressing()) {
const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
if (Symbol)
@@ -3369,8 +3419,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
MFI.setFrameAddressIsTaken(true);
- const AArch64RegisterInfo *RegInfo =
- static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
+ const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -3521,11 +3570,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
updateValueMap(II, ResultReg);
return true;
}
- case Intrinsic::trap: {
+ case Intrinsic::trap:
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
.addImm(1);
return true;
- }
+
case Intrinsic::sqrt: {
Type *RetTy = II->getCalledFunction()->getReturnType();
@@ -5089,11 +5138,14 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
return selectOperator(I, I->getOpcode());
// Silence warnings.
(void)&CC_AArch64_DarwinPCS_VarArg;
+ (void)&CC_AArch64_Win64_VarArg;
}
namespace llvm {
-llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+
+FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) {
return new AArch64FastISel(FuncInfo, LibInfo);
}
-}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f5b8c35..7c6a999 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -41,6 +41,10 @@
// | |
// |-----------------------------------|
// | |
+// | (Win64 only) varargs from reg |
+// | |
+// |-----------------------------------|
+// | |
// | prev_fp, prev_lr |
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
@@ -90,21 +94,42 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
using namespace llvm;
@@ -116,6 +141,34 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+/// Look at each instruction that references stack frames and return the stack
+/// size limit beyond which some of these instructions will require a scratch
+/// register during their expansion later.
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
+  // FIXME: For now, just conservatively guesstimate based on unscaled indexing
+ // range. We'll end up allocating an unnecessary spill slot a lot, but
+ // realistically that's not a big deal at this stage of the game.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() || MI.isPseudo() ||
+ MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::ADDSXri)
+ continue;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
+ continue;
+
+ int Offset = 0;
+ if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
+ AArch64FrameOffsetCannotUpdate)
+ return 0;
+ }
+ }
+ }
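+  // 255 is the largest positive offset reachable with unscaled (LDUR/STUR)
+  // addressing, so frames at or below that size never need a scratch
+  // register for frame-index expansion.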
+ return 255;
+}
+
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
@@ -245,14 +298,13 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
if (&MF->front() == MBB)
return AArch64::X9;
- const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
- LivePhysRegs LiveRegs(&TRI);
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
LiveRegs.addLiveIns(*MBB);
// Mark callee saved registers as used so we will not choose them.
- const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
@@ -319,7 +371,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
-
unsigned NewOpc;
bool NewIsUnscaled = false;
switch (MBBI->getOpcode()) {
@@ -362,7 +413,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
unsigned OpndIdx = 0;
for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
++OpndIdx)
- MIB.addOperand(MBBI->getOperand(OpndIdx));
+ MIB.add(MBBI->getOperand(OpndIdx));
assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
"Unexpected immediate offset in first/last callee-save save/restore "
@@ -455,19 +506,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
return;
}
- auto CSStackSize = AFI->getCalleeSavedStackSize();
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
- AFI->setLocalStackSize(NumBytes - CSStackSize);
+ AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
if (CombineSPBump) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
NumBytes = 0;
- } else if (CSStackSize != 0) {
+ } else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
- -CSStackSize);
- NumBytes -= CSStackSize;
+ -PrologueSaveSize);
+ NumBytes -= PrologueSaveSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -481,8 +536,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
++MBBI;
}
if (HasFP) {
- // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
- int FPOffset = CSStackSize - 16;
+ // Only set up FP if we actually need to. Frame pointer is fp =
+ // sp - fixedobject - 16.
+ int FPOffset = AFI->getCalleeSavedStackSize() - 16;
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
@@ -621,8 +677,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, Reg, 2 * StackGrowth - FixedObject));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -708,12 +764,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of AArch64ISD::TC_RETURN.
- auto CSStackSize = AFI->getCalleeSavedStackSize();
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
- if (!CombineSPBump && CSStackSize != 0)
+ if (!CombineSPBump && PrologueSaveSize != 0)
convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
+ MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
@@ -735,7 +795,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
return;
}
- NumBytes -= CSStackSize;
+ NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
if (!hasFP(MF)) {
@@ -745,7 +805,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
if (RedZone && ArgumentPopSize == 0)
return;
- bool NoCalleeSaveRestore = CSStackSize == 0;
+ bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
StackRestoreBytes += ArgumentPopSize;
@@ -764,7 +824,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// be able to save any instructions.
if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
+ -AFI->getCalleeSavedStackSize() + 16, TII,
+ MachineInstr::FrameDestroy);
else if (NumBytes)
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
MachineInstr::FrameDestroy);
@@ -794,7 +855,11 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- int FPOffset = MFI.getObjectOffset(FI) + 16;
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
bool isFixed = MFI.isFixedObjectIndex(FI);
@@ -863,22 +928,26 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
static bool produceCompactUnwindFrame(MachineFunction &MF) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- AttributeSet Attrs = MF.getFunction()->getAttributes();
+ AttributeList Attrs = MF.getFunction()->getAttributes();
return Subtarget.isTargetMachO() &&
!(Subtarget.getTargetLowering()->supportSwiftError() &&
Attrs.hasAttrSomewhere(Attribute::SwiftError));
}
namespace {
+
struct RegPairInfo {
- RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {}
- unsigned Reg1;
- unsigned Reg2;
+ unsigned Reg1 = AArch64::NoRegister;
+ unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
bool IsGPR;
+
+ RegPairInfo() = default;
+
bool isPaired() const { return Reg2 != AArch64::NoRegister; }
};
+
} // end anonymous namespace
static void computeCalleeSaveRegisterPairs(
@@ -899,7 +968,7 @@ static void computeCalleeSaveRegisterPairs(
CC == CallingConv::PreserveMost ||
(Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
- unsigned Offset = AFI->getCalleeSavedStackSize();
+ int Offset = AFI->getCalleeSavedStackSize();
for (unsigned i = 0; i < Count; ++i) {
RegPairInfo RPI;
@@ -968,6 +1037,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
SmallVector<RegPairInfo, 8> RegPairs;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
@@ -999,9 +1069,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
dbgs() << ")\n");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- MBB.addLiveIn(Reg1);
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
- MBB.addLiveIn(Reg2);
+ if (!MRI.isReserved(Reg2))
+ MBB.addLiveIn(Reg2);
MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
@@ -1102,7 +1174,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
if (RegInfo->hasBasePointer(MF))
BasePointerReg = RegInfo->getBaseRegister();
- bool ExtraCSSpill = false;
+ unsigned ExtraCSSpill = 0;
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
@@ -1130,13 +1202,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(PairedReg);
if (AArch64::GPR64RegClass.contains(PairedReg) &&
!RegInfo->isReservedReg(MF, PairedReg))
- ExtraCSSpill = true;
+ ExtraCSSpill = PairedReg;
}
}
DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
- for (int Reg = SavedRegs.find_first(); Reg != -1;
- Reg = SavedRegs.find_next(Reg))
+ for (unsigned Reg : SavedRegs.set_bits())
dbgs() << ' ' << PrintReg(Reg, RegInfo);
dbgs() << "\n";);
@@ -1144,16 +1215,13 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned NumRegsSpilled = SavedRegs.count();
bool CanEliminateFrame = NumRegsSpilled == 0;
- // FIXME: Set BigStack if any stack slot references may be out of range.
- // For now, just conservatively guestimate based on unscaled indexing
- // range. We'll end up allocating an unnecessary spill slot a lot, but
- // realistically that's not a big deal at this stage of the game.
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
- bool BigStack = (CFSize >= 256);
+ unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
+ bool BigStack = (CFSize > EstimatedStackSizeLimit);
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
AFI->setHasStackFrame(true);
@@ -1163,8 +1231,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// register scavenging. If we already spilled an extra callee-saved register
// above to keep the number of spills even, we don't need to do anything else
// here.
- if (BigStack && !ExtraCSSpill) {
- if (UnspilledCSGPR != AArch64::NoRegister) {
+ if (BigStack) {
+ if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
<< " to get a scratch register.\n");
SavedRegs.set(UnspilledCSGPR);
@@ -1173,15 +1241,18 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// store the pair.
if (produceCompactUnwindFrame(MF))
SavedRegs.set(UnspilledCSGPRPaired);
- ExtraCSSpill = true;
+ ExtraCSSpill = UnspilledCSGPRPaired;
NumRegsSpilled = SavedRegs.count();
}
// If we didn't find an extra callee-saved register to spill, create
// an emergency spill slot.
- if (!ExtraCSSpill) {
- const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
- int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+ if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass &RC = AArch64::GPR64RegClass;
+ unsigned Size = TRI->getSpillSize(RC);
+ unsigned Align = TRI->getSpillAlignment(RC);
+ int FI = MFI.CreateStackObject(Size, Align, false);
RS->addScavengingFrameIndex(FI);
DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
<< " as the emergency spill slot.\n");
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index d472a54..8b1c974 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -16,281 +16,198 @@
#endif
namespace llvm {
-namespace AArch64 {
-
-const uint32_t GPRCoverageData[] = {
- // Classes 0-31
- (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) |
- (1u << AArch64::GPR32spRegClassID) |
- (1u << AArch64::GPR32commonRegClassID) |
- (1u << AArch64::GPR32sponlyRegClassID) |
- (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) |
- (1u << AArch64::GPR64spRegClassID) |
- (1u << AArch64::GPR64commonRegClassID) |
- (1u << AArch64::tcGPR64RegClassID) |
- (1u << AArch64::GPR64sponlyRegClassID),
- // Classes 32-63
- 0,
- // FIXME: The entries below this point can be safely removed once this is
- // tablegenerated. It's only needed because of the hardcoded register class
- // limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-const uint32_t FPRCoverageData[] = {
- // Classes 0-31
- (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) |
- (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) |
- (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) |
- (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) |
- (1u << AArch64::DDDDRegClassID),
- // Classes 32-63
- (1u << (AArch64::QQRegClassID - 32)) |
- (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u << (AArch64::QQQQRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID -
- 32)) |
- (1u << (AArch64::QQQRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID -
- 32)),
- // FIXME: The entries below this point can be safely removed once this
- // is tablegenerated. It's only needed because of the hardcoded register
- // class limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-const uint32_t CCRCoverageData[] = {
- // Classes 0-31
- 1u << AArch64::CCRRegClassID,
- // Classes 32-63
- 0,
- // FIXME: The entries below this point can be safely removed once this
- // is tablegenerated. It's only needed because of the hardcoded register
- // class limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData);
-RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData);
-RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData);
-
-RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank};
-
-// PartialMappings.
-enum PartialMappingIdx {
- PMI_None = -1,
- PMI_GPR32 = 1,
- PMI_GPR64,
- PMI_FPR32,
- PMI_FPR64,
- PMI_FPR128,
- PMI_FPR256,
- PMI_FPR512,
- PMI_FirstGPR = PMI_GPR32,
- PMI_LastGPR = PMI_GPR64,
- PMI_FirstFPR = PMI_FPR32,
- PMI_LastFPR = PMI_FPR512,
- PMI_Min = PMI_FirstGPR,
-};
-
-static unsigned getRegBankBaseIdxOffset(unsigned Size) {
- assert(Size && "0-sized type!!");
- // Make anything smaller than 32 gets 32
- Size = ((Size + 31) / 32) * 32;
- // 32 is 0, 64 is 1, 128 is 2, and so on.
- return Log2_32(Size) - /*Log2_32(32)=*/ 5;
-}
-
-RegisterBankInfo::PartialMapping PartMappings[] {
- /* StartIdx, Length, RegBank */
- // 0: GPR 32-bit value.
- {0, 32, GPRRegBank},
- // 1: GPR 64-bit value.
- {0, 64, GPRRegBank},
- // 2: FPR 32-bit value.
- {0, 32, FPRRegBank},
- // 3: FPR 64-bit value.
- {0, 64, FPRRegBank},
- // 4: FPR 128-bit value.
- {0, 128, FPRRegBank},
- // 5: FPR 256-bit value.
- {0, 256, FPRRegBank},
- // 6: FPR 512-bit value.
- {0, 512, FPRRegBank}
-};
-
-enum ValueMappingIdx {
- First3OpsIdx = 0,
- Last3OpsIdx = 18,
- DistanceBetweenRegBanks = 3,
- FirstCrossRegCpyIdx = 21,
- LastCrossRegCpyIdx = 27,
- DistanceBetweenCrossRegCpy = 2
+RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
+ /* StartIdx, Length, RegBank */
+ // 0: FPR 32-bit value.
+ {0, 32, AArch64::FPRRegBank},
+ // 1: FPR 64-bit value.
+ {0, 64, AArch64::FPRRegBank},
+ // 2: FPR 128-bit value.
+ {0, 128, AArch64::FPRRegBank},
+ // 3: FPR 256-bit value.
+ {0, 256, AArch64::FPRRegBank},
+ // 4: FPR 512-bit value.
+ {0, 512, AArch64::FPRRegBank},
+ // 5: GPR 32-bit value.
+ {0, 32, AArch64::GPRRegBank},
+ // 6: GPR 64-bit value.
+ {0, 64, AArch64::GPRRegBank},
};
// ValueMappings.
-RegisterBankInfo::ValueMapping ValMappings[]{
+RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
/* BreakDown, NumBreakDowns */
+ // 0: invalid
+ {nullptr, 0},
// 3-operands instructions (all binary operations should end up with one of
// those mapping).
- // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 3: GPR 64-bit value.
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- // 6: FPR 32-bit value.
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 9: FPR 64-bit value.
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 12: FPR 128-bit value.
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- // 15: FPR 256-bit value.
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+ // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 4: FPR 64-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ // 7: FPR 128-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ // 10: FPR 256-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ // 13: FPR 512-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ // 16: GPR 32-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 19: GPR 64-bit value. <-- This must match Last3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
// Cross register bank copies.
- // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match
+ // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match
// FirstCrossRegCpyIdx.
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 23: GPR 64-bit value to FPR 64-bit value.
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 25: FPR 32-bit value to GPR 32-bit value.
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 24: FPR 64-bit value to GPR 64-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ // 26: FPR 128-bit value to GPR 128-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 28: FPR 256-bit value to GPR 256-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 30: FPR 512-bit value to GPR 512-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 32: GPR 32-bit value to FPR 32-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match
// LastCrossRegCpyIdx.
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1}
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
};
-/// Get the pointer to the ValueMapping representing the RegisterBank
-/// at \p RBIdx with a size of \p Size.
-///
-/// The returned mapping works for instructions with the same kind of
-/// operands for up to 3 operands.
-///
-/// \pre \p RBIdx != PartialMappingIdx::None
+bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx,
+ unsigned ValStartIdx,
+ unsigned ValLength,
+ const RegisterBank &RB) {
+ const PartialMapping &Map = PartMappings[Idx - PartialMappingIdx::PMI_Min];
+ return Map.StartIdx == ValStartIdx && Map.Length == ValLength &&
+ Map.RegBank == &RB;
+}
+
+bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx,
+ unsigned FirstInBank,
+ unsigned Size,
+ unsigned Offset) {
+ unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min;
+ const ValueMapping &Map =
+ AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset];
+ return Map.BreakDown == &PartMappings[PartialMapBaseIdx] &&
+ Map.NumBreakDowns == 1;
+}
+
+bool AArch64GenRegisterBankInfo::checkPartialMappingIdx(
+ PartialMappingIdx FirstAlias, PartialMappingIdx LastAlias,
+ ArrayRef<PartialMappingIdx> Order) {
+ if (Order.front() != FirstAlias)
+ return false;
+ if (Order.back() != LastAlias)
+ return false;
+ if (Order.front() > Order.back())
+ return false;
+
+ PartialMappingIdx Previous = Order.front();
+ bool First = true;
+ for (const auto &Current : Order) {
+ if (First) {
+ First = false;
+ continue;
+ }
+ if (Previous + 1 != Current)
+ return false;
+ Previous = Current;
+ }
+ return true;
+}
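
checkPartialMappingIdx accepts exactly a contiguous ascending run from FirstAlias to LastAlias; any gap or reordering returns false. A hedged usage sketch (enum values and member accessibility assumed; the checker is intended for asserts):

assert(AArch64GenRegisterBankInfo::checkPartialMappingIdx(
           PMI_FirstFPR, PMI_LastFPR,
           {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) &&
       "FPR aliases must be contiguous for PartMappings indexing");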
+
+unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
+ unsigned Size) {
+ if (RBIdx == PMI_FirstGPR) {
+ if (Size <= 32)
+ return 0;
+ if (Size <= 64)
+ return 1;
+ return -1;
+ }
+ if (RBIdx == PMI_FirstFPR) {
+ if (Size <= 32)
+ return 0;
+ if (Size <= 64)
+ return 1;
+ if (Size <= 128)
+ return 2;
+ if (Size <= 256)
+ return 3;
+ if (Size <= 512)
+ return 4;
+ return -1;
+ }
+ return -1;
+}
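
The if-chain above encodes a round-up-to-next-width table. A standalone restatement of the FPR row (a sketch, not part of the patch; the GPR row is the same loop capped at 64):

static unsigned fprBaseIdxOffset(unsigned Size) {
  unsigned Idx = 0;
  for (unsigned Width = 32; Width <= 512; Width *= 2, ++Idx)
    if (Size <= Width)
      return Idx; // 32 -> 0, 64 -> 1, 128 -> 2, 256 -> 3, 512 -> 4
  return -1u;     // wider than 512 bits: no FPR partial mapping
}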
+
const RegisterBankInfo::ValueMapping *
-getValueMapping(PartialMappingIdx RBIdx, unsigned Size) {
+AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx,
+ unsigned Size) {
assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
- unsigned ValMappingIdx = First3OpsIdx +
- (RBIdx - AArch64::PartialMappingIdx::PMI_Min +
- getRegBankBaseIdxOffset(Size)) *
- ValueMappingIdx::DistanceBetweenRegBanks;
- assert(ValMappingIdx >= AArch64::First3OpsIdx &&
- ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
+ unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size);
+ if (BaseIdxOffset == -1u)
+ return &ValMappings[InvalidIdx];
+
+ unsigned ValMappingIdx =
+ First3OpsIdx + (RBIdx - PartialMappingIdx::PMI_Min + BaseIdxOffset) *
+ ValueMappingIdx::DistanceBetweenRegBanks;
+ assert(ValMappingIdx >= First3OpsIdx && ValMappingIdx <= Last3OpsIdx &&
+ "Mapping out of bound");
return &ValMappings[ValMappingIdx];
}
-/// Get the pointer to the ValueMapping of the operands of a copy
-/// instruction from a GPR or FPR register to a GPR or FPR register
-/// with a size of \p Size.
-///
-/// If \p DstIsGPR is true, the destination of the copy is on GPR,
-/// otherwise it is on FPR. Same thing for \p SrcIsGPR.
+AArch64GenRegisterBankInfo::PartialMappingIdx
+ AArch64GenRegisterBankInfo::BankIDToCopyMapIdx[]{
+ PMI_None, // CCR
+ PMI_FirstFPR, // FPR
+ PMI_FirstGPR, // GPR
+ };
+
const RegisterBankInfo::ValueMapping *
-getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) {
- PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
- PartialMappingIdx SrcRBIdx = SrcIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
+AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID,
+ unsigned SrcBankID, unsigned Size) {
+ assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
+ assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
+ PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID];
+ PartialMappingIdx SrcRBIdx = BankIDToCopyMapIdx[SrcBankID];
+ assert(DstRBIdx != PMI_None && "No such mapping");
+ assert(SrcRBIdx != PMI_None && "No such mapping");
+
if (DstRBIdx == SrcRBIdx)
return getValueMapping(DstRBIdx, Size);
+
assert(Size <= 64 && "GPR cannot handle that size");
unsigned ValMappingIdx =
FirstCrossRegCpyIdx +
- (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size)) *
+ (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(DstRBIdx, Size)) *
ValueMappingIdx::DistanceBetweenCrossRegCpy;
- assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx &&
- ValMappingIdx <= AArch64::LastCrossRegCpyIdx &&
- "Mapping out of bound");
+ assert(ValMappingIdx >= FirstCrossRegCpyIdx &&
+ ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound");
return &ValMappings[ValMappingIdx];
}
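
A worked check of the cross-copy index arithmetic, with values read off the ValMappings comments above (FirstCrossRegCpyIdx == 22, DistanceBetweenCrossRegCpy == 2, and the five FPR entries preceding the two GPR entries, so PMI_FPR32 - PMI_Min == 0 and PMI_GPR32 - PMI_Min == 5):

// Dst bank FPR, Size 32: 22 + (0 + 0) * 2 = 22   // entry "22:" above
// Dst bank FPR, Size 64: 22 + (0 + 1) * 2 = 24   // entry "24:"
// Dst bank GPR, Size 32: 22 + (5 + 0) * 2 = 32   // entry "32:"
// Dst bank GPR, Size 64: 22 + (5 + 1) * 2 = 34   // entry "34:"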
-
-} // End AArch64 namespace.
} // End llvm namespace.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3099383..06005f6 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -200,7 +201,7 @@ private:
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
- void SelectCMP_SWAP(SDNode *N);
+ bool SelectCMP_SWAP(SDNode *N);
};
} // end anonymous namespace
@@ -238,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
case InlineAsm::Constraint_i:
case InlineAsm::Constraint_m:
case InlineAsm::Constraint_Q:
- // Require the address to be in a register. That is safe for all AArch64
- // variants and it is hard to do anything much smarter without knowing
- // how the operand is used.
- OutOps.push_back(Op);
+ // We need to make sure that this one operand does not end up in XZR, thus
+ // require the address to be in a PointerRegClass register.
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
+ SDLoc dl(Op);
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
+ SDValue NewOp =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, Op.getValueType(),
+ Op, RC), 0);
+ OutOps.push_back(NewOp);
return false;
}
return true;
@@ -328,11 +336,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
}
}
+/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// mode.
+static bool isWorthFoldingSHL(SDValue V) {
+ assert(V.getOpcode() == ISD::SHL && "invalid opcode");
+ // It is worth folding a logical shift of up to three places.
+ auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!CSD)
+ return false;
+ unsigned ShiftVal = CSD->getZExtValue();
+ if (ShiftVal > 3)
+ return false;
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = V.getNode();
+ for (SDNode *UI : Node->uses())
+ if (!isa<MemSDNode>(*UI))
+ for (SDNode *UII : UI->uses())
+ if (!isa<MemSDNode>(*UII))
+ return false;
+ return true;
+}
+
/// \brief Determine whether it is worth folding V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
- // it hurts if the value is used at least twice, unless we are optimizing
- // for code size.
- return ForCodeSize || V.hasOneUse();
+ // Trivial if we are optimizing for code size or if there is only
+ // one use of the value.
+ if (ForCodeSize || V.hasOneUse())
+ return true;
+ // If a subtarget has a fastpath LSL we can fold a logical shift into
+ // the addressing mode and save a cycle.
+ if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+ isWorthFoldingSHL(V))
+ return true;
+ if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+ const SDValue LHS = V.getOperand(0);
+ const SDValue RHS = V.getOperand(1);
+ if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
+ return true;
+ if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
+ return true;
+ }
+
+ // It hurts otherwise, since the value will be reused.
+ return false;
}
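
An illustration of the profitability change (assumed codegen; exact register choices are hypothetical): with the LSLFast subtarget feature, a shift that also feeds another memory access can still be folded into each addressing mode.

//   ldr w8, [x0, x1, lsl #2]
//   str w9, [x0, x1, lsl #2]
// Without the feature, reuse makes folding unprofitable, so the shifted
// address is materialized once instead:
//   add x10, x0, x1, lsl #2
//   ldr w8, [x10]
//   str w9, [x10]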
/// SelectShiftedRegister - Select a "shifted register" operand. If the value
@@ -1811,20 +1860,20 @@ static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
OpUsefulBits = 1;
if (MSB >= Imm) {
- OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+ OpUsefulBits <<= MSB - Imm + 1;
--OpUsefulBits;
// The interesting part will be in the lower part of the result
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was starting at Imm in the argument
- OpUsefulBits = OpUsefulBits.shl(Imm);
+ OpUsefulBits <<= Imm;
} else {
- OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+ OpUsefulBits <<= MSB + 1;
--OpUsefulBits;
// The interesting part will be shifted in the result
- OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+ OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was at zero in the argument
- OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+ OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
}
UsefulBits &= OpUsefulBits;
@@ -1851,17 +1900,17 @@ static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
// Shift Left
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
- Mask = Mask.shl(ShiftAmt);
+ Mask <<= ShiftAmt;
getUsefulBits(Op, Mask, Depth + 1);
- Mask = Mask.lshr(ShiftAmt);
+ Mask.lshrInPlace(ShiftAmt);
} else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
// Shift Right
// We do not handle AArch64_AM::ASR, because the sign will change the
// number of useful bits
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
- Mask = Mask.lshr(ShiftAmt);
+ Mask.lshrInPlace(ShiftAmt);
getUsefulBits(Op, Mask, Depth + 1);
- Mask = Mask.shl(ShiftAmt);
+ Mask <<= ShiftAmt;
} else
return;
@@ -1889,13 +1938,13 @@ static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
uint64_t Width = MSB - Imm + 1;
uint64_t LSB = Imm;
- OpUsefulBits = OpUsefulBits.shl(Width);
+ OpUsefulBits <<= Width;
--OpUsefulBits;
if (Op.getOperand(1) == Orig) {
// Copy the low bits from the result to bits starting from LSB.
Mask = ResultUsefulBits & OpUsefulBits;
- Mask = Mask.shl(LSB);
+ Mask <<= LSB;
}
if (Op.getOperand(0) == Orig)
@@ -1906,14 +1955,14 @@ static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
uint64_t Width = MSB + 1;
uint64_t LSB = UsefulBits.getBitWidth() - Imm;
- OpUsefulBits = OpUsefulBits.shl(Width);
+ OpUsefulBits <<= Width;
--OpUsefulBits;
- OpUsefulBits = OpUsefulBits.shl(LSB);
+ OpUsefulBits <<= LSB;
if (Op.getOperand(1) == Orig) {
// Copy the bits from the result to the zero bits.
Mask = ResultUsefulBits & OpUsefulBits;
- Mask = Mask.lshr(LSB);
+ Mask.lshrInPlace(LSB);
}
if (Op.getOperand(0) == Orig)
@@ -2037,18 +2086,18 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
(void)BitWidth;
assert(BitWidth == 32 || BitWidth == 64);
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(Op, KnownZero, KnownOne);
+ KnownBits Known;
+ CurDAG->computeKnownBits(Op, Known);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value
- uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+ uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
// Discard a constant AND mask if present. It's safe because the node will
// already have been factored into the computeKnownBits calculation above.
uint64_t AndImm;
if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
- assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+ assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
Op = Op.getOperand(0);
}
@@ -2117,15 +2166,15 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
// Compute the Known Zero for the AND as this allows us to catch more general
// cases than just looking for AND with imm.
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(And, KnownZero, KnownOne);
+ KnownBits Known;
+ CurDAG->computeKnownBits(And, Known);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value.
- uint64_t NotKnownZero = (~KnownZero).getZExtValue();
+ uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
// The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
- if (!isShiftedMask(KnownZero.getZExtValue(), VT))
+ if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
return false;
// The bits being inserted must only set those bits that are known to be zero.
@@ -2259,15 +2308,15 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
// This allows to catch more general case than just looking for
// AND with imm. Indeed, simplify-demanded-bits may have removed
// the AND instruction because it proves it was useless.
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne);
+ KnownBits Known;
+ CurDAG->computeKnownBits(OrOpd1Val, Known);
// Check if there is enough room for the second operand to appear
// in the first one
APInt BitsToBeInserted =
- APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+ APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
- if ((BitsToBeInserted & ~KnownZero) != 0)
+ if ((BitsToBeInserted & ~Known.Zero) != 0)
continue;
// Set the first operand
@@ -2524,7 +2573,7 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
// pstatefield for the MSR (immediate) instruction, we also require that an
// immediate value has been provided as an argument, we know that this is
// the case as it has been ensured by semantic checking.
- auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());;
+ auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
if (PMapper) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
@@ -2567,9 +2616,13 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
}
/// We've got special pseudo-instructions for these
-void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+
+ // Leave the node as-is for LSE if the subtarget supports it.
+ if (Subtarget->hasLSE()) return false;
+
if (MemTy == MVT::i8)
Opcode = AArch64::CMP_SWAP_8;
else if (MemTy == MVT::i16)
@@ -2595,6 +2648,8 @@ void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
CurDAG->RemoveDeadNode(N);
+
+ return true;
}
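
A sketch of why the early return pays off (LSE selection is assumed to happen elsewhere in the backend): with +lse an i32 ISD::ATOMIC_CMP_SWAP can match a single compare-and-swap, whereas the CMP_SWAP_32 pseudo expands to a load-/store-exclusive loop.

//   casal w0, w1, [x2]          ; single LSE instruction
// versus the pseudo expansion:
//   .Lretry: ldaxr w8, [x2]
//            cmp   w8, w0
//            b.ne  .Ldone
//            stlxr w9, w1, [x2]
//            cbnz  w9, .Lretry
//   .Ldone: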
void AArch64DAGToDAGISel::Select(SDNode *Node) {
@@ -2618,8 +2673,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
case ISD::ATOMIC_CMP_SWAP:
- SelectCMP_SWAP(Node);
- return;
+ if (SelectCMP_SWAP(Node))
+ return;
+ break;
case ISD::READ_REGISTER:
if (tryReadRegister(Node))
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 849058b..9d87988 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
-#include "AArch64ISelLowering.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
@@ -22,13 +22,14 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,10 +51,10 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
@@ -66,6 +67,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
@@ -90,6 +92,7 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
@@ -104,6 +107,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+static cl::opt<bool>
+EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
+ cl::desc("Enable AArch64 logical imm instruction "
+ "optimization"),
+ cl::init(true));
+
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
@@ -372,7 +381,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
- setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
setOperationAction(ISD::FREM, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
@@ -404,7 +412,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
- setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
setOperationAction(ISD::FREM, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
@@ -544,7 +551,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -554,8 +560,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::Hybrid);
- // Enable TBZ/TBNZ
- MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
// Set required alignment.
@@ -652,6 +656,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ // Vector reductions
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ }
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ }
+
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
@@ -707,7 +724,6 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
@@ -751,6 +767,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ if (!VT.isFloatingPoint())
+ setOperationAction(ISD::ABS, VT, Legal);
+
// [SU][MIN|MAX] are available for all NEON types apart from i64.
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
@@ -788,21 +807,157 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
return VT.changeVectorElementTypeToInteger();
}
+static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
+ const APInt &Demanded,
+ TargetLowering::TargetLoweringOpt &TLO,
+ unsigned NewOpc) {
+ uint64_t OldImm = Imm, NewImm, Enc;
+ uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
+
+ // Return if the immediate is already all zeros, all ones, a bimm32 or a
+ // bimm64.
+ if (Imm == 0 || Imm == Mask ||
+ AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
+ return false;
+
+ unsigned EltSize = Size;
+ uint64_t DemandedBits = Demanded.getZExtValue();
+
+ // Clear bits that are not demanded.
+ Imm &= DemandedBits;
+
+ while (true) {
+ // The goal here is to set the non-demanded bits in a way that minimizes
+ // the number of transitions between 0 and 1. To achieve this, we set the
+ // non-demanded bits to the value of the preceding demanded bits.
+ // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
+ // non-demanded bit), we copy bit0 (1) to the least significant 'x',
+ // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
+ // The final result is 0b11000011.
+ uint64_t NonDemandedBits = ~DemandedBits;
+ uint64_t InvertedImm = ~Imm & DemandedBits;
+ uint64_t RotatedImm =
+ ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
+ NonDemandedBits;
+ uint64_t Sum = RotatedImm + NonDemandedBits;
+ bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
+ uint64_t Ones = (Sum + Carry) & NonDemandedBits;
+ NewImm = (Imm | Ones) & Mask;
+
+ // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+ // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+ // we halve the element size and continue the search.
+ if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
+ break;
+
+ // We cannot shrink the element size any further if it is already 2 bits.
+ if (EltSize == 2)
+ return false;
+
+ EltSize /= 2;
+ Mask >>= EltSize;
+ uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+ // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+ if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+ return false;
+
+ // Merge the upper and lower halves of Imm and DemandedBits.
+ Imm |= Hi;
+ DemandedBits |= DemandedBitsHi;
+ }
+
+ ++NumOptimizedImms;
+
+ // Replicate the element across the register width.
+ while (EltSize < Size) {
+ NewImm |= NewImm << EltSize;
+ EltSize *= 2;
+ }
+
+ (void)OldImm;
+ assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+ "demanded bits should never be altered");
+ assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
+
+ // Create the new constant immediate node.
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue New;
+
+ // If the new constant immediate is all-zeros or all-ones, let the target
+ // independent DAG combine optimize this node.
+ if (NewImm == 0 || NewImm == OrigMask) {
+ New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
+ TLO.DAG.getConstant(NewImm, DL, VT));
+ // Otherwise, create a machine node so that target independent DAG combine
+ // doesn't undo this optimization.
+ } else {
+ Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+ SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+ New = SDValue(
+ TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+ }
+
+ return TLO.CombineTo(Op, New);
+}
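
The comment's 0bx10xx0x1 example can be checked by hand. A self-contained sketch of one fill iteration (EltSize 8 is for illustration only; the real code starts at 32 or 64 and loops):

#include <cassert>
#include <cstdint>

int main() {
  unsigned EltSize = 8;
  uint64_t DemandedBits = 0b01100101; // the 'x' positions are the 0 bits
  uint64_t Imm = 0b01000001;          // demanded values, already masked
  uint64_t Mask = (1ULL << EltSize) - 1;

  uint64_t NonDemandedBits = ~DemandedBits;
  uint64_t InvertedImm = ~Imm & DemandedBits;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
      NonDemandedBits;
  uint64_t Sum = RotatedImm + NonDemandedBits;
  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
  uint64_t NewImm = (Imm | Ones) & Mask;

  assert(NewImm == 0b11000011); // each 'x' copied the demanded bit below it
  return 0;
}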
+
+bool AArch64TargetLowering::targetShrinkDemandedConstant(
+ SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
+ // Delay this optimization to as late as possible.
+ if (!TLO.LegalOps)
+ return false;
+
+ if (!EnableOptimizeLogicalImm)
+ return false;
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector())
+ return false;
+
+ unsigned Size = VT.getSizeInBits();
+ assert((Size == 32 || Size == 64) &&
+ "i32 or i64 is expected after legalization.");
+
+ // Exit early if we demand all bits.
+ if (Demanded.countPopulation() == Size)
+ return false;
+
+ unsigned NewOpc;
+ switch (Op.getOpcode()) {
+ default:
+ return false;
+ case ISD::AND:
+ NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
+ break;
+ case ISD::OR:
+ NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
+ break;
+ case ISD::XOR:
+ NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
+ break;
+ }
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C)
+ return false;
+ uint64_t Imm = C->getZExtValue();
+ return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+}
+
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
-/// Mask are known to be either zero or one and return them in the
-/// KnownZero/KnownOne bitsets.
+/// Mask are known to be either zero or one and return them in Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
- const SDValue Op, APInt &KnownZero, APInt &KnownOne,
- const SelectionDAG &DAG, unsigned Depth) const {
+ const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case AArch64ISD::CSEL: {
- APInt KnownZero2, KnownOne2;
- DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
- DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
- KnownZero &= KnownZero2;
- KnownOne &= KnownOne2;
+ KnownBits Known2;
+ DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
break;
}
case ISD::INTRINSIC_W_CHAIN: {
@@ -812,10 +967,10 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
default: return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
- unsigned BitWidth = KnownOne.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
@@ -834,15 +989,15 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
// bits larger than the element datatype. 32-bit or larger doesn't need
// this as those are legal types and will be handled by isel directly.
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 8 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
- KnownZero |= Mask;
+ Known.Zero |= Mask;
} else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
assert(BitWidth >= 16 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
- KnownZero |= Mask;
+ Known.Zero |= Mask;
}
break;
} break;
@@ -2113,8 +2268,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
const char *LibcallName =
@@ -2122,10 +2277,11 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
- StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2231,19 +2387,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::SIGN_EXTEND)
- return true;
- if (isExtendedBUILD_VECTOR(N, DAG, true))
- return true;
- return false;
+ return N->getOpcode() == ISD::SIGN_EXTEND ||
+ isExtendedBUILD_VECTOR(N, DAG, true);
}
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::ZERO_EXTEND)
- return true;
- if (isExtendedBUILD_VECTOR(N, DAG, false))
- return true;
- return false;
+ return N->getOpcode() == ISD::ZERO_EXTEND ||
+ isExtendedBUILD_VECTOR(N, DAG, false);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
@@ -2347,6 +2497,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
+ case Intrinsic::aarch64_neon_abs:
+ return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
+ Op.getOperand(1));
case Intrinsic::aarch64_neon_smax:
return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
@@ -2465,6 +2618,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ return LowerVECREDUCE(Op, DAG);
}
}
@@ -2489,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
case CallingConv::Swift:
+ if (Subtarget->isTargetWindows() && IsVarArg)
+ return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
+ case CallingConv::Win64:
+ return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
}
}
@@ -2507,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2663,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
- if (!Subtarget->isTargetDarwin()) {
+ if (!Subtarget->isTargetDarwin() || IsWin64) {
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
+ // Win64 variadic functions also pass arguments in registers, but all float
+ // arguments are passed in integer registers.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
@@ -2708,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
+ bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv());
SmallVector<SDValue, 8> MemOps;
@@ -2720,7 +2889,13 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
- GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
+ if (IsWin64) {
+ GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
+ if (GPRSaveSize & 15)
+ // The extra size here, if triggered, will always be 8.
+ MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
+ } else
+ GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
@@ -2729,7 +2904,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
+ IsWin64
+ ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
+ GPRIdx,
+ (i - FirstVariadicGPR) * 8)
+ : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2738,7 +2917,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
FuncInfo->setVarArgsGPRIndex(GPRIdx);
FuncInfo->setVarArgsGPRSize(GPRSaveSize);
- if (Subtarget->hasFPARMv8()) {
+ if (Subtarget->hasFPARMv8() && !IsWin64) {
static const MCPhysReg FPRArgRegs[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
@@ -3108,9 +3287,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
- true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
@@ -3245,30 +3422,26 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
- if (getTargetMachine().getCodeModel() == CodeModel::Large &&
- Subtarget->isTargetMachO()) {
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ auto GV = G->getGlobal();
+ if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
+ AArch64II::MO_GOT) {
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+ } else {
const GlobalValue *GV = G->getGlobal();
- bool InternalLinkage = GV->hasInternalLinkage();
- if (InternalLinkage)
- Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
- else {
- Callee =
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
- Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
- }
- } else if (ExternalSymbolSDNode *S =
- dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
+ }
+ } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ Subtarget->isTargetMachO()) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+ } else {
+ const char *Sym = S->getSymbol();
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = G->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- const char *Sym = S->getSymbol();
- Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
// We don't usually want to end the call-sequence here because we would tidy
@@ -3428,11 +3601,75 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Other Lowering Code
//===----------------------------------------------------------------------===//
+SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+}
+
+SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
+}
+
+SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ N->getOffset(), Flag);
+}
+
+SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
+}
+
+// (loadGOT sym)
+template <class NodeTy>
+SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const {
+ DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT);
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into two nodes instead of using a wrapper node.
+ return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
+}
+
+// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
+template <class NodeTy>
+SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG)
+ const {
+ DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ const unsigned char MO_NC = AArch64II::MO_NC;
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, DL, Ty,
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G3),
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC),
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC),
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC));
+}
+
+// (addlow (adrp %hi(sym)) %lo(sym))
+template <class NodeTy>
+SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const {
+ DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE);
+ SDValue Lo = getTargetNode(N, Ty, DAG,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
+ return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
+}
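
For reference, the three helpers correspond to the standard AArch64 materialization sequences (illustrative only; `sym` is a placeholder symbol):

//   getAddr       adrp x0, sym
//                 add  x0, x0, :lo12:sym
//   getGOT        adrp x0, :got:sym
//                 ldr  x0, [x0, :got_lo12:sym]
//   getAddrLarge  movz x0, #:abs_g3:sym
//                 movk x0, #:abs_g2_nc:sym
//                 movk x0, #:abs_g1_nc:sym
//                 movk x0, #:abs_g0_nc:sym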
+
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
- const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+ GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
@@ -3440,32 +3677,15 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
"unexpected offset in global node");
- // This also catched the large code model case for Darwin.
+ // This also catches the large code model case for Darwin.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
- SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
- // FIXME: Once remat is capable of dealing with instructions with register
- // operands, expand this into two nodes instead of using a wrapper node.
- return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+ return getGOT(GN, DAG);
}
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(GN, DAG);
} else {
- // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
- // the only correct model on Darwin.
- SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
- OpFlags | AArch64II::MO_PAGE);
- unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
- SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
-
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(GN, DAG);
}
}
@@ -3578,7 +3798,7 @@ SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
- assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+ assert(Subtarget->useSmallAddressing() &&
"ELF TLS only supported in small memory model");
// Different choices can be made for the maximum size of the TLS area for a
// module. For the small address model, the default TLS size is 16MiB and the
@@ -3679,7 +3899,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
- else if (Subtarget->isTargetELF())
+ if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
@@ -4242,90 +4462,37 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
- AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(JT, DAG);
}
-
- SDValue Hi =
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
- SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(JT, DAG);
}
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
// Use the GOT for the large code model on iOS.
if (Subtarget->isTargetMachO()) {
- SDValue GotAddr = DAG.getTargetConstantPool(
- CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
- AArch64II::MO_GOT);
- return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+ return getGOT(CP, DAG);
}
-
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G3),
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G2 | MO_NC),
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G1 | MO_NC),
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(CP, DAG);
} else {
- // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
- // ELF, the only valid one on Darwin.
- SDValue Hi =
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_PAGE);
- SDValue Lo = DAG.getTargetConstantPool(
- CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(CP, DAG);
}
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
- const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
+ BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(BA, DAG);
} else {
- SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
- SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(BA, DAG);
}
}
@@ -4342,6 +4509,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
MachinePointerInfo(SV));
}
+SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ AArch64FunctionInfo *FuncInfo =
+ DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
+ ? FuncInfo->getVarArgsGPRIndex()
+ : FuncInfo->getVarArgsStackIndex(),
+ getPointerTy(DAG.getDataLayout()));
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
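
The new Win64 path is simpler than AAPCS because every variadic argument on Win64 is GPR-sized and spilled contiguously, so va_start only records a single frame address. A source-level sketch of what this lowers (standard varargs, nothing AArch64-specific assumed):

#include <cstdarg>

int sumInts(int N, ...) {
  va_list Ap;
  va_start(Ap, N); // LowerWin64_VASTART: one store of a frame address
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += va_arg(Ap, int);
  va_end(Ap);
  return S;
}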
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
@@ -4413,8 +4595,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
SelectionDAG &DAG) const {
- return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
- : LowerAAPCS_VASTART(Op, DAG);
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ return LowerWin64_VASTART(Op, DAG);
+ else if (Subtarget->isTargetDarwin())
+ return LowerDarwin_VASTART(Op, DAG);
+ else
+ return LowerAAPCS_VASTART(Op, DAG);
}
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
@@ -4422,7 +4610,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
// AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
// pointer.
SDLoc DL(Op);
- unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
+ unsigned VaListSize =
+ Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
@@ -4516,7 +4705,12 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", AArch64::SP)
+ .Case("x18", AArch64::X18)
+ .Case("w18", AArch64::W18)
.Default(0);
+ if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
+ !Subtarget->isX18Reserved())
+ Reg = 0;
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
@@ -4717,9 +4911,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
- &Flags);
- Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags);
- Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+ Flags);
+ Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
if (!Reciprocal) {
@@ -4728,7 +4922,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
- Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
// Correct the result if the operand is 0.0.
Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
VT, Eq, Operand, Estimate);
@@ -4757,8 +4951,8 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
// AArch64 reciprocal iteration instruction: (2 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
- Estimate, &Flags);
- Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+ Estimate, Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
ExtraSteps = 0;
@@ -6591,21 +6785,20 @@ FailedModImm:
if (!isConstant && !usesOnlyOneValue) {
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
- unsigned ElemSize = VT.getScalarSizeInBits();
unsigned i = 0;
- // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+
+ // Use SCALAR_TO_VECTOR for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
- // value is already in an S or D register.
- // Do not do this for UNDEF/LOAD nodes because we have better patterns
- // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
- if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
- (ElemSize == 32 || ElemSize == 64)) {
- unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
- MachineSDNode *N =
- DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
- DAG.getTargetConstant(SubIdx, dl, MVT::i32));
- Vec = SDValue(N, 0);
+ // value is already in an S or D register, and we're forced to emit an
+ // INSERT_SUBREG that we can't fold anywhere.
+ //
+ // We also allow types like i8 and i16 which are illegal scalar but legal
+ // vector element types. After type-legalization the inserted value is
+ // extended (i32) and it is safe to cast them to the vector type by ignoring
+ // the upper bits of the lowest lane (e.g. v8i8, v4i16).
+ if (!Op0.isUndef()) {
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
++i;
}
for (; i < NumElts; ++i) {
@@ -6995,6 +7188,47 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return Cmp;
}
+static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
+ SelectionDAG &DAG) {
+ SDValue VecOp = ScalarOp.getOperand(0);
+ auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
+ DAG.getConstant(0, DL, MVT::i64));
+}
+
+SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ switch (Op.getOpcode()) {
+ case ISD::VECREDUCE_ADD:
+ return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
+ case ISD::VECREDUCE_SMAX:
+ return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
+ case ISD::VECREDUCE_SMIN:
+ return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
+ case ISD::VECREDUCE_UMAX:
+ return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
+ case ISD::VECREDUCE_UMIN:
+ return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
+ case ISD::VECREDUCE_FMAX: {
+ assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
+ Op.getOperand(0));
+ }
+ case ISD::VECREDUCE_FMIN: {
+ assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
+ Op.getOperand(0));
+ }
+ default:
+ llvm_unreachable("Unhandled reduction");
+ }
+}
+
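
A sketch of the source that typically reaches LowerVECREDUCE once the vectorizers emit reduction nodes, using a clang-style vector extension; the codegen noted in the comments is the expected outcome, not a guarantee (and the FMAX/FMIN cases additionally require the no-NaNs flag, per the asserts above):

typedef int v4i32 __attribute__((vector_size(16)));

int reduceAdd(v4i32 V) {
  int Sum = 0;
  for (int I = 0; I < 4; ++I)
    Sum += V[I]; // recognized as an add reduction -> ISD::VECREDUCE_ADD
  return Sum;    // expected: addv s0, v0.4s ; fmov w0, s0
}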
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -7132,7 +7366,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
- if (I->getNumUses() != 1)
+ if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
@@ -7249,6 +7483,41 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
return NumBits == 32 || NumBits == 64;
}
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+unsigned
+AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const {
+ return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
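
The helper is a ceiling division by 128: each 128-bit (Q-register) chunk of a wide vector becomes one ldN/stN call. A standalone restatement with worked values (illustrative names, not LLVM API):

unsigned numInterleavedAccesses(unsigned VecBits) {
  return (VecBits + 127) / 128;
}
// numInterleavedAccesses(512) == 4: <16 x i32> splits into four accesses.
// numInterleavedAccesses(64)  == 1: a 64-bit vector fits one D register.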
+MachineMemOperand::Flags
+AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
+ if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
+ I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
+ return MOStridedAccess;
+ return MachineMemOperand::MONone;
+}
+
+bool AArch64TargetLowering::isLegalInterleavedAccessType(
+ VectorType *VecTy, const DataLayout &DL) const {
+
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+ unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+ // Ensure the number of vector elements is greater than 1.
+ if (VecTy->getNumElements() < 2)
+ return false;
+
+ // Ensure the element type is legal.
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+ return false;
+
+ // Ensure the total vector size is 64 or a multiple of 128. Types larger than
+ // 128 will be split into multiple interleaved accesses.
+ return VecSize == 64 || VecSize % 128 == 0;
+}
+
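
The same legality rule restated over raw sizes (illustrative, not LLVM API): at least two lanes, a NEON-sized element, and a total width of exactly 64 bits or a multiple of 128 bits, with wider types split as described above.

bool isLegalInterleaved(unsigned NumElts, unsigned EltBits) {
  if (NumElts < 2)
    return false;
  if (EltBits != 8 && EltBits != 16 && EltBits != 32 && EltBits != 64)
    return false;
  unsigned VecBits = NumElts * EltBits;
  return VecBits == 64 || VecBits % 128 == 0;
}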
/// \brief Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -7272,12 +7541,15 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
- unsigned VecSize = DL.getTypeSizeInBits(VecTy);
- // Skip if we do not have NEON and skip illegal vector types.
- if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+ // Skip if we do not have NEON, and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long
+ // as the total vector size is divisible by 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
+ unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
// A pointer vector cannot be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = VecTy->getVectorElementType();
@@ -7285,6 +7557,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ IRBuilder<> Builder(LI);
+
+ // The base address of the load.
+ Value *BaseAddr = LI->getPointerOperand();
+
+ if (NumLoads > 1) {
+ // If we're going to generate more than one load, reset the sub-vector type
+ // to something legal.
+ VecTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / NumLoads);
+
+ // We will compute the pointer operand of each load from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace()));
+ }
+
Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {VecTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
@@ -7293,39 +7584,50 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- IRBuilder<> Builder(LI);
- Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+ // Holds sub-vectors extracted from the load intrinsic return values. The
+ // sub-vectors are associated with the shufflevector instructions they will
+ // replace.
+ DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
- CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+ for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
- // Replace uses of each shufflevector with the corresponding vector loaded
- // by ldN.
- for (unsigned i = 0; i < Shuffles.size(); i++) {
- ShuffleVectorInst *SVI = Shuffles[i];
- unsigned Index = Indices[i];
+ // If we're generating more than one load, compute the base address of
+ // subsequent loads as an offset from the previous.
+ if (LoadCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(
+ BaseAddr, VecTy->getVectorNumElements() * Factor);
- Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+ CallInst *LdN = Builder.CreateCall(
+ LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
- // Convert the integer vector to pointer vector if the element is pointer.
- if (EltTy->isPointerTy())
- SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+ // Extract and store the sub-vectors returned by the load intrinsic.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SVI = Shuffles[i];
+ unsigned Index = Indices[i];
- SVI->replaceAllUsesWith(SubVec);
- }
+ Value *SubVec = Builder.CreateExtractValue(LdN, Index);
- return true;
-}
+ // Convert the integer vector to a pointer vector if the element is a
+ // pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(
+ SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
+ VecTy->getVectorNumElements()));
+ SubVecs[SVI].push_back(SubVec);
+ }
+ }
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
- unsigned NumElts) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumElts; i++)
- Mask.push_back(Builder.getInt32(Start + i));
+ // Replace uses of the shufflevector instructions with the sub-vectors
+ // returned by the load intrinsic. If a shufflevector instruction is
+ // associated with more than one sub-vector, those sub-vectors will be
+ // concatenated into a single wide vector.
+ for (ShuffleVectorInst *SVI : Shuffles) {
+ auto &SubVec = SubVecs[SVI];
+ auto *WideVec =
+ SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+ SVI->replaceAllUsesWith(WideVec);
+ }
- return ConstantVector::get(Mask);
+ return true;
}
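
Worked numbers for the splitting above, assuming a factor-2 load whose shufflevectors produce <8 x i32> results: the 256-bit type splits into two <4 x i32> ld2 calls, and the base pointer advances by NumElts * Factor scalar elements between calls. Illustrative offset math only, not LLVM API:

unsigned loadBaseOffset(unsigned LoadCount, unsigned SubVecElts,
                        unsigned Factor) {
  // Matches the cumulative CreateConstGEP1_32 above: 0, then 8 for the
  // factor-2 <4 x i32> case.
  return LoadCount * SubVecElts * Factor;
}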
/// \brief Lower an interleaved store into a stN intrinsic.
@@ -7369,12 +7671,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
- // Skip if we do not have NEON and skip illegal vector types.
- if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+ // Skip if we do not have NEON, and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long
+ // as the total vector size is divisible by 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
return false;
+ unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
@@ -7394,6 +7699,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = VectorType::get(IntTy, LaneLen);
}
+ // The base address of the store.
+ Value *BaseAddr = SI->getPointerOperand();
+
+ if (NumStores > 1) {
+ // If we're going to generate more than one store, reset the lane length
+ // and sub-vector type to something legal.
+ LaneLen /= NumStores;
+ SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+ // We will compute the pointer operand of each store from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace()));
+ }
+
+ auto Mask = SVI->getShuffleMask();
+
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
Type *Tys[2] = {SubVecTy, PtrTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
@@ -7402,34 +7726,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
Function *StNFunc =
Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
- SmallVector<Value *, 5> Ops;
+ for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
- // Split the shufflevector operands into sub vectors for the new stN call.
- auto Mask = SVI->getShuffleMask();
- for (unsigned i = 0; i < Factor; i++) {
- if (Mask[i] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
- } else {
- unsigned StartMask = 0;
- for (unsigned j = 1; j < LaneLen; j++) {
- if (Mask[j*Factor + i] >= 0) {
- StartMask = Mask[j*Factor + i] - j;
- break;
+ SmallVector<Value *, 5> Ops;
+
+ // Split the shufflevector operands into sub vectors for the new stN call.
+ for (unsigned i = 0; i < Factor; i++) {
+ unsigned IdxI = StoreCount * LaneLen * Factor + i;
+ if (Mask[IdxI] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+ if (Mask[IdxJ * Factor + IdxI] >= 0) {
+ StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+ break;
+ }
}
+ // Note: Filling undef gaps with random elements is ok, since
+ // those elements were being written anyway (with undefs).
+ // In the case of all undefs we default to using elements from 0.
+ // Note: StartMask cannot be negative; it is checked in
+ // isReInterleaveMask.
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
- // Note: If all elements in a chunk are undefs, StartMask=0!
- // Note: Filling undef gaps with random elements is ok, since
- // those elements were being written anyway (with undefs).
- // In the case of all undefs we're defaulting to using elems from 0
- // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
}
- }
- Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
- Builder.CreateCall(StNFunc, Ops);
+ // If we're generating more than one store, compute the base address of
+ // subsequent stores as an offset from the previous one.
+ if (StoreCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ }
return true;
}
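
The mask indexing in the split-store loop, restated with worked numbers (illustrative, not LLVM API): sub-shuffle i of store StoreCount starts at wide-mask slot StoreCount * LaneLen * Factor + i, so with Factor = 2 and a post-split LaneLen of 4, store 0 consumes slots 0..7 and store 1 consumes slots 8..15.

unsigned maskSlot(unsigned StoreCount, unsigned LaneLen, unsigned Factor,
                  unsigned I) {
  return StoreCount * LaneLen * Factor + I;
}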
@@ -7690,7 +8023,7 @@ SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
@@ -8079,9 +8412,9 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
-/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
-/// EXTR. Can't quite be done in TableGen because the two immediates aren't
-/// independent.
+/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
+/// with an EXTR. Can't quite be done in TableGen because the two immediates
+/// aren't independent.
static SDValue tryCombineToEXTR(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
@@ -8935,16 +9268,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
// instructions (stp).
SDLoc DL(&St);
SDValue BasePtr = St.getBasePtr();
+ uint64_t BaseOffset = 0;
+
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
SDValue NewST1 =
DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
OrigAlignment, St.getMemOperand()->getFlags());
+ // As this is in ISel, we will not merge this add, which may degrade
+ // results.
+ if (BasePtr->getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(BasePtr->getOperand(1))) {
+ BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
+ BasePtr = BasePtr->getOperand(0);
+ }
+
unsigned Offset = EltOffset;
while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
- SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
- DAG.getConstant(Offset, DL, MVT::i64));
+ SDValue OffsetPtr =
+ DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
PtrInfo.getWithOffset(Offset), Alignment,
St.getMemOperand()->getFlags());
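
Illustrative address math for the change above, assuming BasePtr is (add X, C): each element store now addresses X at the flat offset C + Offset instead of stacking a second ADD on top of the first, since nothing after ISel would merge the two.

uint64_t elementOffset(uint64_t BaseOffset, unsigned EltOffset,
                       unsigned EltIndex) {
  return BaseOffset + uint64_t(EltOffset) * EltIndex; // one flat ADD each
}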
@@ -9072,7 +9415,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
StoreSDNode *S = cast<StoreSDNode>(N);
- if (S->isVolatile())
+ if (S->isVolatile() || S->isIndexed())
return SDValue();
SDValue StVal = S->getValue();
@@ -9236,17 +9579,17 @@ static SDValue performPostLD1Combine(SDNode *N,
return SDValue();
}
-/// Simplify \Addr given that the top byte of it is ignored by HW during
+/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
static bool performTBISimplification(SDValue Addr,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
APInt DemandedMask = APInt::getLowBitsSet(64, 56);
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
- DCI.isBeforeLegalizeOps());
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
+ if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
return true;
}
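
A sketch of the pattern this combine removes. Under TBI the MMU ignores address bits [63:56] on loads and stores, so an AND that only clears the tag byte demands no bit the hardware inspects, and SimplifyDemandedBits can delete it:

#include <cstdint>

uint64_t untagForAccess(uint64_t TaggedPtr) {
  return TaggedPtr & 0x00FFFFFFFFFFFFFFull; // redundant before a load/store
}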
@@ -9267,266 +9610,6 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
-/// This function handles the log2-shuffle pattern produced by the
-/// LoopVectorizer for the across vector reduction. It consists of
-/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
-/// are reduced, where s is an induction variable from 0 to
-/// log2(NumVectorElements).
-static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
- unsigned Op,
- SelectionDAG &DAG) {
- EVT VTy = OpV->getOperand(0).getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- int NumVecElts = VTy.getVectorNumElements();
- if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
- if (NumVecElts != 4)
- return SDValue();
- } else {
- if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
- return SDValue();
- }
-
- int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
- SDValue PreOp = OpV;
- // Iterate over each step of the across vector reduction.
- for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
- SDValue CurOp = PreOp.getOperand(0);
- SDValue Shuffle = PreOp.getOperand(1);
- if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
- // Try to swap the 1st and 2nd operand as add and min/max instructions
- // are commutative.
- CurOp = PreOp.getOperand(1);
- Shuffle = PreOp.getOperand(0);
- if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
- return SDValue();
- }
-
- // Check if the input vector is fed by the operator we want to handle,
- // except the last step; the very first input vector is not necessarily
- // the same operator we are handling.
- if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
- return SDValue();
-
- // Check if it forms one step of the across vector reduction.
- // E.g.,
- // %cur = add %1, %0
- // %shuffle = vector_shuffle %cur, <2, 3, u, u>
- // %pre = add %cur, %shuffle
- if (Shuffle.getOperand(0) != CurOp)
- return SDValue();
-
- int NumMaskElts = 1 << CurStep;
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
- // Check mask values in each step.
- // We expect the shuffle mask in each step follows a specific pattern
- // denoted here by the <M, U> form, where M is a sequence of integers
- // starting from NumMaskElts, increasing by 1, and the number integers
- // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
- // of undef in U should be NumVecElts - NumMaskElts.
- // E.g., for <8 x i16>, mask values in each step should be :
- // step 0 : <1,u,u,u,u,u,u,u>
- // step 1 : <2,3,u,u,u,u,u,u>
- // step 2 : <4,5,6,7,u,u,u,u>
- for (int i = 0; i < NumVecElts; ++i)
- if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
- (i >= NumMaskElts && !(Mask[i] < 0)))
- return SDValue();
-
- PreOp = CurOp;
- }
- unsigned Opcode;
- bool IsIntrinsic = false;
-
- switch (Op) {
- default:
- llvm_unreachable("Unexpected operator for across vector reduction");
- case ISD::ADD:
- Opcode = AArch64ISD::UADDV;
- break;
- case ISD::SMAX:
- Opcode = AArch64ISD::SMAXV;
- break;
- case ISD::UMAX:
- Opcode = AArch64ISD::UMAXV;
- break;
- case ISD::SMIN:
- Opcode = AArch64ISD::SMINV;
- break;
- case ISD::UMIN:
- Opcode = AArch64ISD::UMINV;
- break;
- case ISD::FMAXNUM:
- Opcode = Intrinsic::aarch64_neon_fmaxnmv;
- IsIntrinsic = true;
- break;
- case ISD::FMINNUM:
- Opcode = Intrinsic::aarch64_neon_fminnmv;
- IsIntrinsic = true;
- break;
- }
- SDLoc DL(N);
-
- return IsIntrinsic
- ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
- DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
- : DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
- DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
- DAG.getConstant(0, DL, MVT::i64));
-}
-
-/// Target-specific DAG combine for the across vector min/max reductions.
-/// This function specifically handles the final clean-up step of the vector
-/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
-/// pattern, which narrows down and finds the final min/max value from all
-/// elements of the vector.
-/// For example, for a <16 x i8> vector :
-/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
-/// %smax0 = smax %arr, svn0
-/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %smax1 = smax %smax0, %svn1
-/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %smax2 = smax %smax1, svn2
-/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %sc = setcc %smax2, %svn3, gt
-/// %n0 = extract_vector_elt %sc, #0
-/// %n1 = extract_vector_elt %smax2, #0
-/// %n2 = extract_vector_elt $smax2, #1
-/// %result = select %n0, %n1, n2
-/// becomes :
-/// %1 = smaxv %0
-/// %result = extract_vector_elt %1, 0
-static SDValue
-performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- if (!Subtarget->hasNEON())
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- SDValue IfTrue = N->getOperand(1);
- SDValue IfFalse = N->getOperand(2);
-
- // Check if the SELECT merges up the final result of the min/max
- // from a vector.
- if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- // Expect N0 is fed by SETCC.
- SDValue SetCC = N0.getOperand(0);
- EVT SetCCVT = SetCC.getValueType();
- if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
- SetCCVT.getVectorElementType() != MVT::i1)
- return SDValue();
-
- SDValue VectorOp = SetCC.getOperand(0);
- unsigned Op = VectorOp->getOpcode();
- // Check if the input vector is fed by the operator we want to handle.
- if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
- Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
- return SDValue();
-
- EVT VTy = VectorOp.getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- if (VTy.getSizeInBits() < 64)
- return SDValue();
-
- EVT EltTy = VTy.getVectorElementType();
- if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
- if (EltTy != MVT::f32)
- return SDValue();
- } else {
- if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
- return SDValue();
- }
-
- // Check if extracting from the same vector.
- // For example,
- // %sc = setcc %vector, %svn1, gt
- // %n0 = extract_vector_elt %sc, #0
- // %n1 = extract_vector_elt %vector, #0
- // %n2 = extract_vector_elt $vector, #1
- if (!(VectorOp == IfTrue->getOperand(0) &&
- VectorOp == IfFalse->getOperand(0)))
- return SDValue();
-
- // Check if the condition code is matched with the operator type.
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
- if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
- (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
- (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
- (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
- (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
- CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
- CC != ISD::SETGE) ||
- (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
- CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
- CC != ISD::SETLE))
- return SDValue();
-
- // Expect to check only lane 0 from the vector SETCC.
- if (!isNullConstant(N0.getOperand(1)))
- return SDValue();
-
- // Expect to extract the true value from lane 0.
- if (!isNullConstant(IfTrue.getOperand(1)))
- return SDValue();
-
- // Expect to extract the false value from lane 1.
- if (!isOneConstant(IfFalse.getOperand(1)))
- return SDValue();
-
- return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
-}
-
-/// Target-specific DAG combine for the across vector add reduction.
-/// This function specifically handles the final clean-up step of the vector
-/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
-/// pattern, which adds all elements of a vector together.
-/// For example, for a <4 x i32> vector :
-/// %1 = vector_shuffle %0, <2,3,u,u>
-/// %2 = add %0, %1
-/// %3 = vector_shuffle %2, <1,u,u,u>
-/// %4 = add %2, %3
-/// %result = extract_vector_elt %4, 0
-/// becomes :
-/// %0 = uaddv %0
-/// %result = extract_vector_elt %0, 0
-static SDValue
-performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- if (!Subtarget->hasNEON())
- return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- // Check if the input vector is fed by the ADD.
- if (N0->getOpcode() != ISD::ADD)
- return SDValue();
-
- // The vector extract idx must constant zero because we only expect the final
- // result of the reduction is placed in lane 0.
- if (!isNullConstant(N1))
- return SDValue();
-
- EVT VTy = N0.getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- EVT EltTy = VTy.getVectorElementType();
- if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
- return SDValue();
-
- if (VTy.getSizeInBits() < 64)
- return SDValue();
-
- return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
-}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
@@ -10205,12 +10288,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
- case ISD::SELECT: {
- SDValue RV = performSelectCombine(N, DCI);
- if (!RV.getNode())
- RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
- return RV;
- }
+ case ISD::SELECT:
+ return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::LOAD:
@@ -10232,8 +10311,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
- case ISD::EXTRACT_VECTOR_ELT:
- return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -10307,7 +10384,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
-bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
}
@@ -10453,6 +10530,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
+ return;
+
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
@@ -10483,9 +10568,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
- if (!Subtarget->isTargetAndroid())
- return true;
- return TargetLowering::useLoadStackGuardNode();
+ if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
+ return TargetLowering::useLoadStackGuardNode();
+ return true;
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10527,11 +10612,17 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
- return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+ if (Size > 128)
+   return AtomicExpansionKind::None;
+ // Nand is not supported in LSE.
+ if (AI->getOperation() == AtomicRMWInst::Nand)
+   return AtomicExpansionKind::LLSC;
+ // Leave 128-bit operations to LL/SC.
+ return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None
+                                            : AtomicExpansionKind::LLSC;
}
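
The policy above restated as a standalone table (illustrative names): LSE handles sub-128-bit read-modify-write operations natively, NAND has no LSE encoding, and 128-bit operations always take the LL/SC path.

enum class ExpansionKind { None, LLSC };

ExpansionKind expandRMW(unsigned SizeBits, bool IsNand, bool HasLSE) {
  if (SizeBits > 128)
    return ExpansionKind::None; // left to later legalization / libcalls
  if (IsNand)
    return ExpansionKind::LLSC; // no LSE instruction for NAND
  return (HasLSE && SizeBits < 128) ? ExpansionKind::None
                                    : ExpansionKind::LLSC;
}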
bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
+ // If subtarget has LSE, leave cmpxchg intact for codegen.
+ if (Subtarget->hasLSE()) return false;
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
@@ -10623,36 +10714,56 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
return false;
}
-Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
- return TargetLowering::getIRStackGuard(IRB);
-
- // Android provides a fixed TLS slot for the stack cookie. See the definition
- // of TLS_SLOT_STACK_GUARD in
- // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- const unsigned TlsOffset = 0x28;
+static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
}
-Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
- return TargetLowering::getSafeStackPointerLocation(IRB);
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // Android provides a fixed TLS slot for the stack cookie. See the definition
+ // of TLS_SLOT_STACK_GUARD in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ if (Subtarget->isTargetAndroid())
+ return UseTlsOffset(IRB, 0x28);
+ // Fuchsia is similar.
+ // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ if (Subtarget->isTargetFuchsia())
+ return UseTlsOffset(IRB, -0x10);
+
+ return TargetLowering::getIRStackGuard(IRB);
+}
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- const unsigned TlsOffset = 0x48;
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
- return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+ if (Subtarget->isTargetAndroid())
+ return UseTlsOffset(IRB, 0x48);
+
+ // Fuchsia is similar.
+ // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ if (Subtarget->isTargetFuchsia())
+ return UseTlsOffset(IRB, -0x8);
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+}
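
The address computation UseTlsOffset materializes, written as plain pointer arithmetic (a sketch; the real code emits the llvm.thread.pointer intrinsic plus a constant GEP and a pointer cast):

char *tlsSlot(char *ThreadPtr, long Offset) {
  // Android: stack guard at +0x28, safestack pointer at +0x48.
  // Fuchsia: stack guard at -0x10, unsafe stack pointer at -0x8.
  return ThreadPtr + Offset;
}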
+
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ // Only sink the 'and' mask to the cmp use block if it is masking a single
+ // bit, since that likely lets the and/cmp/br fold into a single tbz
+ // instruction. It may be beneficial to sink in other cases, but we would
+ // have to check that the cmp would not get folded into the br to form a
+ // cbz for these to be beneficial.
+ ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ if (!Mask)
+ return false;
+ return Mask->getUniqueInteger().isPowerOf2();
}
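
The profitable case in source form; the codegen in the comment is the expected outcome, not a guarantee:

void useBit(unsigned X); // assumed external

void sinkExample(unsigned X) {
  if (X & 0x10) // single-bit mask: and+icmp+br folds to "tbnz w0, #4, ..."
    useBit(X);
}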
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
@@ -10702,7 +10813,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
}
}
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on AArch64 is expensive. However, when aggressively
// optimizing for code size, we prefer to use a div instruction, as it is
// usually smaller than the alternative sequence.
@@ -10711,6 +10822,14 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
- Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
+
+unsigned
+AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+ return getPointerTy(DL).getSizeInBits();
+
+ return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
+}
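
Worked numbers for 64-bit pointers: Darwin and Win64 use a single-pointer va_list (64 bits), while the AAPCS va_list holds three pointers and two ints, matching the 32-byte VaListSize used in LowerVACOPY above:

constexpr unsigned PtrBits = 64;
constexpr unsigned AAPCSVaListBits = 3 * PtrBits + 2 * 32; // 256 bits
static_assert(AAPCSVaListBits / 8 == 32, "AAPCS va_list is 32 bytes");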
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 054ccc3..3b0e0f1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -250,10 +250,14 @@ public:
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the Known bitset.
- void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
- APInt &KnownOne, const SelectionDAG &DAG,
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ TargetLoweringOpt &TLO) const override;
+
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
/// Returns true if the target allows unaligned memory accesses of the
@@ -402,7 +406,20 @@ public:
return AArch64::X1;
}
- bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
+
+ bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const SelectionDAG &DAG) const override {
+ // Do not merge to a float value size (128 bits) if no implicit float
+ // attribute is set.
+
+ bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::NoImplicitFloat);
+
+ if (NoFloat)
+ return (MemVT.getSizeInBits() <= 64);
+ return true;
+ }
bool isCheapToSpeculateCttz() const override {
return true;
@@ -412,6 +429,8 @@ public:
return true;
}
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
bool hasAndNotCompare(SDValue) const override {
// 'bics'
return true;
@@ -435,6 +454,22 @@ public:
return true;
}
+ /// Returns the size of the platform's va_list object.
+ unsigned getVaListSizeInBits(const DataLayout &DL) const override;
+
+ /// Returns true if \p VecTy is a legal interleaved access type. This
+ /// function checks the vector element type and the overall width of the
+ /// vector.
+ bool isLegalInterleavedAccessType(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ /// Returns the number of interleaved accesses that will be generated when
+ /// lowering accesses of the given type.
+ unsigned getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+
private:
bool isExtFreeImpl(const Instruction *Ext) const override;
@@ -491,6 +526,18 @@ private:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ template <class NodeTy> SDValue getGOT(NodeTy *N, SelectionDAG &DAG) const;
+ template <class NodeTy>
+ SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG) const;
+ template <class NodeTy> SDValue getAddr(NodeTy *N, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -509,6 +556,7 @@ private:
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
@@ -536,6 +584,7 @@ private:
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
@@ -576,7 +625,7 @@ private:
}
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
- bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM, bool &IsInc,
SelectionDAG &DAG) const;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 867074c..eec41dd 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -14,6 +14,9 @@
//===----------------------------------
// Atomic fences
//===----------------------------------
+let AddedComplexity = 15, Size = 0 in
+def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering),
+ [(atomic_fence imm:$ordering, 0)]>, Sched<[]>;
def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
@@ -402,3 +405,59 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
(ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
GPR64:$newLo, GPR64:$newHi), []>,
Sched<[WriteAtomic]>;
+
+// v8.1 Atomic instructions:
+def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>;
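
The sub/and patterns above insert SUBWrr/ORNWrr because LSE provides an atomic add (LDADDAL) and bit-clear (LDCLRAL) but no direct subtract or and, so the operand is negated or inverted first. A sketch at the source level; the assembly in the comments is the expected -march=armv8.1-a outcome, not a guarantee:

#include <atomic>

void lseDemo(std::atomic<long> &A, long V) {
  A.fetch_sub(V); // expected: neg x8, x1 ; ldaddal x8, x8, [x0]
  A.fetch_and(V); // expected: mvn x8, x1 ; ldclral x8, x8, [x0]
}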
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index cefdf51..c44daf3 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -39,6 +39,9 @@ class AArch64Inst<Format f, string cstr> : Instruction {
let Constraints = cstr;
}
+class InstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>;
+
// Pseudo instructions (don't have encoding information)
class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
: AArch64Inst<PseudoFrm, cstr> {
@@ -257,6 +260,7 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
+ let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
}
def Imm1_8Operand : AsmImmRange<1, 8>;
@@ -264,6 +268,20 @@ def Imm1_16Operand : AsmImmRange<1, 16>;
def Imm1_32Operand : AsmImmRange<1, 32>;
def Imm1_64Operand : AsmImmRange<1, 64>;
+class BranchTarget<int N> : AsmOperandClass {
+ let Name = "BranchTarget" # N;
+ let DiagnosticType = "InvalidLabel";
+ let PredicateMethod = "isBranchTarget<" # N # ">";
+}
+
+class PCRelLabel<int N> : BranchTarget<N> {
+ let Name = "PCRelLabel" # N;
+}
+
+def BranchTarget14Operand : BranchTarget<14>;
+def BranchTarget26Operand : BranchTarget<26>;
+def PCRelLabel19Operand : PCRelLabel<19>;
+
def MovZSymbolG3AsmOperand : AsmOperandClass {
let Name = "MovZSymbolG3";
let RenderMethod = "addImmOperands";
@@ -500,7 +518,8 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
-def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def Imm0_255Operand : AsmImmRange<0,255>;
+
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 256;
}]> {
@@ -673,6 +692,14 @@ def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
+def gi_addsub_shifted_imm32 :
+ GIComplexOperandMatcher<s32, "selectArithImmed">,
+ GIComplexPatternEquiv<addsub_shifted_imm32>;
+
+def gi_addsub_shifted_imm64 :
+ GIComplexOperandMatcher<s64, "selectArithImmed">,
+ GIComplexPatternEquiv<addsub_shifted_imm64>;
+
class neg_addsub_shifted_imm<ValueType Ty>
: Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
let PrintMethod = "printAddSubImm";
@@ -1094,10 +1121,6 @@ def inv_ccode : Operand<i32> {
// Conditional branch target. 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
-def PCRelLabel19Operand : AsmOperandClass {
- let Name = "PCRelLabel19";
- let DiagnosticType = "InvalidLabel";
-}
def am_brcond : Operand<OtherVT> {
let EncoderMethod = "getCondBranchTargetOpValue";
let DecoderMethod = "DecodePCRelLabel19";
@@ -1154,9 +1177,6 @@ multiclass CmpBranch<bit op, string asm, SDNode node> {
//---
// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
// the target offset are implied zero and so are not part of the immediate.
-def BranchTarget14Operand : AsmOperandClass {
- let Name = "BranchTarget14";
-}
def am_tbrcond : Operand<OtherVT> {
let EncoderMethod = "getTestBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
@@ -1166,11 +1186,12 @@ def am_tbrcond : Operand<OtherVT> {
// AsmOperand classes to emit (or not) special diagnostics
def TBZImm0_31Operand : AsmOperandClass {
let Name = "TBZImm0_31";
- let PredicateMethod = "isImm0_31";
+ let PredicateMethod = "isImmInRange<0,31>";
let RenderMethod = "addImm0_31Operands";
}
def TBZImm32_63Operand : AsmOperandClass {
let Name = "Imm32_63";
+ let PredicateMethod = "isImmInRange<32,63>";
let DiagnosticType = "InvalidImm0_63";
}
@@ -1232,10 +1253,6 @@ multiclass TestBranch<bit op, string asm, SDNode node> {
//---
// Unconditional branch (immediate) instructions.
//---
-def BranchTarget26Operand : AsmOperandClass {
- let Name = "BranchTarget26";
- let DiagnosticType = "InvalidLabel";
-}
def am_b_target : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
@@ -1784,10 +1801,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
}
// add Rd, Rb, -imm -> sub Rd, Rn, imm
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1859,10 +1876,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
} // Defs = [NZCV]
// Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1883,9 +1900,9 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
- def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+ def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+ def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
// Compare shorthands
@@ -2100,10 +2117,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
let Inst{31} = 1;
}
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2122,10 +2139,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
}
} // end Defs = [NZCV]
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2454,7 +2471,7 @@ class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
// Load literal address: 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
-def am_ldrlit : Operand<OtherVT> {
+def am_ldrlit : Operand<iPTR> {
let EncoderMethod = "getLoadLiteralOpValue";
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
@@ -9060,7 +9077,7 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
//----------------------------------------------------------------------------
-let Predicates = [HasNEON, HasV8_1a] in {
+let Predicates = [HasNEON, HasRDM] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
@@ -9221,7 +9238,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
let Inst{21} = idx{0};
}
}
-} // let Predicates = [HasNeon, HasV8_1a]
+} // let Predicates = [HasNEON, HasRDM]
//----------------------------------------------------------------------------
// Crypto extensions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4c78992..c0c6055 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,12 +12,13 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -51,9 +52,6 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
-static const MachineMemOperand::Flags MOSuppressPair =
- MachineMemOperand::MOTargetFlag1;
-
static cl::opt<unsigned>
TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
@@ -369,7 +367,7 @@ void AArch64InstrInfo::instantiateCondBranch(
// Folded compare-and-branch
-  // Note that we use addOperand instead of addReg to keep the flags.
+  // Note that we use MIB.add() instead of addReg() to keep the flags.
const MachineInstrBuilder MIB =
- BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]);
+ BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
if (Cond.size() > 3)
MIB.addImm(Cond[3].getImm());
MIB.addMBB(TBB);
@@ -762,6 +760,128 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
+bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ if (ShiftVal == 0)
+ return true;
+ return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
+ }
+
+ case AArch64::ADDWrx:
+ case AArch64::ADDXrx:
+ case AArch64::ADDXrx64:
+ case AArch64::ADDSWrx:
+ case AArch64::ADDSXrx:
+ case AArch64::ADDSXrx64: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ switch (AArch64_AM::getArithExtendType(Imm)) {
+ default:
+ return false;
+ case AArch64_AM::UXTB:
+ case AArch64_AM::UXTH:
+ case AArch64_AM::UXTW:
+ case AArch64_AM::UXTX:
+ return AArch64_AM::getArithShiftValue(Imm) <= 4;
+ }
+ }
+
+ case AArch64::SUBWrs:
+ case AArch64::SUBSWrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ return ShiftVal == 0 ||
+ (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
+ }
+
+ case AArch64::SUBXrs:
+ case AArch64::SUBSXrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ return ShiftVal == 0 ||
+ (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
+ }
+
+ case AArch64::SUBWrx:
+ case AArch64::SUBXrx:
+ case AArch64::SUBXrx64:
+ case AArch64::SUBSWrx:
+ case AArch64::SUBSXrx:
+ case AArch64::SUBSXrx64: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ switch (AArch64_AM::getArithExtendType(Imm)) {
+ default:
+ return false;
+ case AArch64_AM::UXTB:
+ case AArch64_AM::UXTH:
+ case AArch64_AM::UXTW:
+ case AArch64_AM::UXTX:
+ return AArch64_AM::getArithShiftValue(Imm) == 0;
+ }
+ }
+
+ case AArch64::LDRBBroW:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRBroW:
+ case AArch64::LDRBroX:
+ case AArch64::LDRDroW:
+ case AArch64::LDRDroX:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRHroW:
+ case AArch64::LDRHroX:
+ case AArch64::LDRQroW:
+ case AArch64::LDRQroX:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSWroW:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSroW:
+ case AArch64::LDRSroX:
+ case AArch64::LDRWroW:
+ case AArch64::LDRWroX:
+ case AArch64::LDRXroW:
+ case AArch64::LDRXroX:
+ case AArch64::PRFMroW:
+ case AArch64::PRFMroX:
+ case AArch64::STRBBroW:
+ case AArch64::STRBBroX:
+ case AArch64::STRBroW:
+ case AArch64::STRBroX:
+ case AArch64::STRDroW:
+ case AArch64::STRDroX:
+ case AArch64::STRHHroW:
+ case AArch64::STRHHroX:
+ case AArch64::STRHroW:
+ case AArch64::STRHroX:
+ case AArch64::STRQroW:
+ case AArch64::STRQroX:
+ case AArch64::STRSroW:
+ case AArch64::STRSroX:
+ case AArch64::STRWroW:
+ case AArch64::STRWroX:
+ case AArch64::STRXroW:
+ case AArch64::STRXroX: {
+ unsigned IsSigned = MI.getOperand(3).getImm();
+ return !IsSigned;
+ }
+ }
+}
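+// For instance, "add x0, x1, x2, lsl #3" (ADDXrs with an LSL amount <= 5) is
+// reported fast here, while "add x0, x1, x2, lsl #6" is not.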
+
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
@@ -913,7 +1033,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
/// \brief Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible to do the actual
/// substitution and legality checking.
-static unsigned convertFlagSettingOpcode(const MachineInstr &MI) {
+static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
// Don't convert all compare instructions, because for some the zero register
// encoding becomes the sp register.
bool MIDefinesZeroReg = false;
@@ -1022,7 +1142,7 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return true;
}
unsigned Opc = CmpInstr.getOpcode();
- unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+ unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
@@ -1159,6 +1279,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
case AArch64CC::HI: // Z clear and C set
case AArch64CC::LS: // Z set or C clear
UsedFlags.Z = true;
+ LLVM_FALLTHROUGH;
case AArch64CC::HS: // C set
case AArch64CC::LO: // C clear
UsedFlags.C = true;
@@ -1177,6 +1298,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
case AArch64CC::GT: // Z clear, N and V the same
case AArch64CC::LE: // Z set, N and V differ
UsedFlags.Z = true;
+ LLVM_FALLTHROUGH;
case AArch64CC::GE: // N and V the same
case AArch64CC::LT: // N and V differ
UsedFlags.N = true;
@@ -1299,16 +1421,16 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Large) {
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
- .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
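+    // The expansion materializes the address bottom-up, roughly
+    // (relocation spellings illustrative):
+    //   movz xN, #:abs_g0_nc:sym
+    //   movk xN, #:abs_g1_nc:sym
+    //   movk xN, #:abs_g2_nc:sym
+    //   movk xN, #:abs_g3:sym
+    // before the final load below.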
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
@@ -1345,14 +1467,6 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
case AArch64::BICSXrs:
case AArch64::BICWrs:
case AArch64::BICXrs:
- case AArch64::CRC32Brr:
- case AArch64::CRC32CBrr:
- case AArch64::CRC32CHrr:
- case AArch64::CRC32CWrr:
- case AArch64::CRC32CXrr:
- case AArch64::CRC32Hrr:
- case AArch64::CRC32Wrr:
- case AArch64::CRC32Xrr:
case AArch64::EONWrs:
case AArch64::EONXrs:
case AArch64::EORWrs:
@@ -1598,6 +1712,13 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
}
+/// Check all MachineMemOperands for a hint that the load/store is strided.
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const {
+ return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
+ return MMO->getFlags() & MOStridedAccess;
+ });
+}
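+// MOStridedAccess is the target MMO flag serialized in MIR as
+// "aarch64-strided-access"; see getSerializableMachineMemOperandTargetFlags
+// later in this file.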
+
bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
switch (Opc) {
default:
@@ -1691,16 +1812,59 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
} else
return false;
- // Offset is calculated as the immediate operand multiplied by the scaling factor.
- // Unscaled instructions have scaling factor set to 1.
+ // Get the scaling factor for the instruction and set the width for the
+ // instruction.
unsigned Scale = 0;
- switch (LdSt.getOpcode()) {
+ int64_t Dummy1, Dummy2;
+
+ // If this returns false, then it's an instruction we don't want to handle.
+ if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
+ return false;
+
+ // Compute the offset. Offset is calculated as the immediate operand
+ // multiplied by the scaling factor. Unscaled instructions have scaling factor
+ // set to 1.
+ if (LdSt.getNumExplicitOperands() == 3) {
+ BaseReg = LdSt.getOperand(1).getReg();
+ Offset = LdSt.getOperand(2).getImm() * Scale;
+ } else {
+ assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+ BaseReg = LdSt.getOperand(2).getReg();
+ Offset = LdSt.getOperand(3).getImm() * Scale;
+ }
+ return true;
+}
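+
+// Illustration (not from the source): a plain "ldr x0, [x1, #1]" (LDRXui) has
+// three explicit operands (Rt, Rn, imm), while pre/post-indexed forms carry an
+// extra write-back def in front, giving the four-operand case handled above.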
+
+MachineOperand&
+AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
+ assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
+ MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands()-1);
+ assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
+ return OfsOp;
+}
+
+bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
+ unsigned &Width, int64_t &MinOffset,
+ int64_t &MaxOffset) const {
+ switch (Opcode) {
+  // Not a memory operation, or not one we want to handle.
default:
+ Scale = Width = 0;
+ MinOffset = MaxOffset = 0;
return false;
+ case AArch64::STRWpost:
+ case AArch64::LDRWpost:
+ Width = 32;
+ Scale = 4;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURXi:
case AArch64::LDURDi:
@@ -1708,6 +1872,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURDi:
Width = 8;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURWi:
case AArch64::LDURSi:
@@ -1716,6 +1882,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURSi:
Width = 4;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURHi:
case AArch64::LDURHHi:
@@ -1725,6 +1893,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURHHi:
Width = 2;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURBi:
case AArch64::LDURBBi:
@@ -1734,6 +1904,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURBBi:
Width = 1;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDPQi:
case AArch64::LDNPQi:
@@ -1741,10 +1913,14 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPQi:
Scale = 16;
Width = 32;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = Width = 16;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDPXi:
case AArch64::LDPDi:
@@ -1756,12 +1932,16 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPDi:
Scale = 8;
Width = 16;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = Width = 8;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDPWi:
case AArch64::LDPSi:
@@ -1773,6 +1953,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPSi:
Scale = 4;
Width = 8;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRWui:
case AArch64::LDRSui:
@@ -1780,29 +1962,27 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDRHui:
case AArch64::LDRHHui:
case AArch64::STRHui:
case AArch64::STRHHui:
Scale = Width = 2;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDRBui:
case AArch64::LDRBBui:
case AArch64::STRBui:
case AArch64::STRBBui:
Scale = Width = 1;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
}
- if (LdSt.getNumExplicitOperands() == 3) {
- BaseReg = LdSt.getOperand(1).getReg();
- Offset = LdSt.getOperand(2).getImm() * Scale;
- } else {
- assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
- BaseReg = LdSt.getOperand(2).getReg();
- Offset = LdSt.getOperand(3).getImm() * Scale;
- }
return true;
}
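
// Usage sketch (illustrative, caller spelling hypothetical):
//   unsigned Scale, Width; int64_t Min, Max;
//   TII->getMemOpInfo(AArch64::LDRXui, Scale, Width, Min, Max);
//   // Scale == 8, Width == 8, Min == 0, Max == 4095 (byte offsets 0..32760)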
@@ -1903,88 +2083,6 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return Offset1 + 1 == Offset2;
}
-bool AArch64InstrInfo::shouldScheduleAdjacent(
- const MachineInstr &First, const MachineInstr &Second) const {
- if (Subtarget.hasArithmeticBccFusion()) {
- // Fuse CMN, CMP, TST followed by Bcc.
- unsigned SecondOpcode = Second.getOpcode();
- if (SecondOpcode == AArch64::Bcc) {
- switch (First.getOpcode()) {
- default:
- return false;
- case AArch64::ADDSWri:
- case AArch64::ADDSWrr:
- case AArch64::ADDSXri:
- case AArch64::ADDSXrr:
- case AArch64::ANDSWri:
- case AArch64::ANDSWrr:
- case AArch64::ANDSXri:
- case AArch64::ANDSXrr:
- case AArch64::SUBSWri:
- case AArch64::SUBSWrr:
- case AArch64::SUBSXri:
- case AArch64::SUBSXrr:
- case AArch64::BICSWrr:
- case AArch64::BICSXrr:
- return true;
- case AArch64::ADDSWrs:
- case AArch64::ADDSXrs:
- case AArch64::ANDSWrs:
- case AArch64::ANDSXrs:
- case AArch64::SUBSWrs:
- case AArch64::SUBSXrs:
- case AArch64::BICSWrs:
- case AArch64::BICSXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !hasShiftedReg(Second);
- }
- }
- }
- if (Subtarget.hasArithmeticCbzFusion()) {
- // Fuse ALU operations followed by CBZ/CBNZ.
- unsigned SecondOpcode = Second.getOpcode();
- if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
- SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
- switch (First.getOpcode()) {
- default:
- return false;
- case AArch64::ADDWri:
- case AArch64::ADDWrr:
- case AArch64::ADDXri:
- case AArch64::ADDXrr:
- case AArch64::ANDWri:
- case AArch64::ANDWrr:
- case AArch64::ANDXri:
- case AArch64::ANDXrr:
- case AArch64::EORWri:
- case AArch64::EORWrr:
- case AArch64::EORXri:
- case AArch64::EORXrr:
- case AArch64::ORRWri:
- case AArch64::ORRWrr:
- case AArch64::ORRXri:
- case AArch64::ORRXrr:
- case AArch64::SUBWri:
- case AArch64::SUBWrr:
- case AArch64::SUBXri:
- case AArch64::SUBXrr:
- return true;
- case AArch64::ADDWrs:
- case AArch64::ADDXrs:
- case AArch64::ANDWrs:
- case AArch64::ANDXrs:
- case AArch64::SUBWrs:
- case AArch64::SUBXrs:
- case AArch64::BICWrs:
- case AArch64::BICXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !hasShiftedReg(Second);
- }
- }
- }
- return false;
-}
-
MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
const MDNode *Expr, const DebugLoc &DL) const {
@@ -2339,7 +2437,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
unsigned Opc = 0;
bool Offset = true;
- switch (RC->getSize()) {
+ switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::STRBui;
@@ -2443,7 +2541,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
unsigned Opc = 0;
bool Offset = true;
- switch (RC->getSize()) {
+ switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRBui;
@@ -2668,7 +2766,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
};
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
- assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() &&
+ assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
+ TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
"Mismatched register size in non subreg COPY");
if (IsSpill)
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
@@ -2754,7 +2853,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
}
if (FillRC) {
- assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() &&
+ assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
+ TRI.getRegSizeInBits(*FillRC) &&
"Mismatched regclass size on folded subreg COPY");
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
MachineInstr &LoadMI = *--InsertPt;
@@ -3044,7 +3144,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
return false;
}
-void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::createImm(0));
}
@@ -3224,7 +3324,7 @@ static bool getMaddPatterns(MachineInstr &Root,
// When NZCV is live bail out.
if (Cmp_NZCV == -1)
return false;
- unsigned NewOpc = convertFlagSettingOpcode(Root);
+ unsigned NewOpc = convertToNonFlagSettingOpc(Root);
// When opcode can't change bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
@@ -3444,6 +3544,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBDrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
@@ -3458,6 +3562,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
@@ -3512,6 +3620,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
@@ -3565,12 +3675,17 @@ enum class FMAInstKind { Default, Indexed, Accumulator };
/// F|MUL I=A,B,0
/// F|ADD R,I,C
/// ==> F|MADD R,A,B,C
+/// \param MF Containing MachineFunction
+/// \param MRI Register information
+/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is the index of the operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
+/// \param RC Register class of operands
+/// \param kind The kind of fma instruction (addressing mode) to be generated
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
@@ -3629,6 +3744,9 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
/// ADD R,I,Imm
/// ==> ORR V, ZR, Imm
/// ==> MADD R,A,B,V
+/// \param MF Containing MachineFunction
+/// \param MRI Register information
+/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
@@ -3637,6 +3755,7 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
+/// \param RC Register class of operands
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -3793,7 +3912,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
.addReg(ZeroReg)
- .addOperand(Root.getOperand(2));
+ .add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
@@ -4013,6 +4132,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
}
+
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1: {
+ // FNMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMADD R,A,B,C // = -A*B - C
+ // --- Create(FNMADD);
+ if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
+ Opc = AArch64::FNMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
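+  // For example, "fnmul s1, s2, s3" (s1 = -(s2*s3)) followed by
+  // "fsub s0, s1, s4" is rewritten to "fnmadd s0, s2, s3, s4",
+  // i.e. -(s2*s3) - s4 in one instruction.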
+
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULSUBD_OP2: {
// FMUL I=A,B,0
@@ -4028,6 +4165,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
+ }
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
@@ -4084,7 +4222,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Accumulator);
}
break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
@@ -4094,26 +4231,36 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
/// \brief Replace csincr-branch sequence by simple conditional branch
///
/// Examples:
-/// 1.
+/// 1. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbnz w9, #0, 0x44
+/// \endcode
/// to
+/// \code
/// b.<inverted condition code>
+/// \endcode
///
-/// 2.
+/// 2. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbz w9, #0, 0x44
+/// \endcode
/// to
+/// \code
/// b.<condition code>
+/// \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is a power of 2.
///
/// Examples:
+/// \code
/// and w8, w8, #0x400
/// cbnz w8, L1
+/// \endcode
/// to
+/// \code
/// tbnz w8, #10, L1
+/// \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
@@ -4286,3 +4433,207 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
{MO_TLS, "aarch64-tls"}};
return makeArrayRef(TargetFlags);
}
+
+ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
+ static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
+ {{MOSuppressPair, "aarch64-suppress-pair"},
+ {MOStridedAccess, "aarch64-strided-access"}};
+ return makeArrayRef(TargetFlags);
+}
+
+unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize,
+ size_t Occurrences,
+ bool CanBeTailCall) const {
+ unsigned NotOutlinedSize = SequenceSize * Occurrences;
+ unsigned OutlinedSize;
+
+ // Is this candidate something we can outline as a tail call?
+ if (CanBeTailCall) {
+ // If yes, then we just outline the sequence and replace each of its
+ // occurrences with a branch instruction.
+ OutlinedSize = SequenceSize + Occurrences;
+ } else {
+ // If no, then we outline the sequence (SequenceSize), add a return (+1),
+    // If no, then we outline the sequence (SequenceSize), add a return (+1),
+    // and replace each occurrence with a save/restore of LR and a call
+    // (3 * Occurrences).
+ OutlinedSize = (SequenceSize + 1) + (3 * Occurrences);
+ }
+
+ // Return the number of instructions saved by outlining this sequence.
+ return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0;
+}
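+// Worked example (arithmetic only): SequenceSize = 10, Occurrences = 3.
+// Tail call:      OutlinedSize = 10 + 3 = 13,           benefit = 30 - 13 = 17.
+// Call + save LR: OutlinedSize = (10 + 1) + 3 * 3 = 20, benefit = 30 - 20 = 10.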
+
+bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+ return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+AArch64GenInstrInfo::MachineOutlinerInstrType
+AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+ // Don't outline LOHs.
+ if (FuncInfo->getLOHRelated().count(&MI))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugValue() || MI.isIndirectDebugValue())
+ return MachineOutlinerInstrType::Invisible;
+
+ // Is this a terminator for a basic block?
+ if (MI.isTerminator()) {
+
+ // Is this the end of a function?
+ if (MI.getParent()->succ_empty())
+ return MachineOutlinerInstrType::Legal;
+
+ // It's not, so don't outline it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+  // Don't outline position instructions (labels, CFI directives).
+ if (MI.isPosition())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands())
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Don't outline anything that uses the link register.
+ if (MI.modifiesRegister(AArch64::LR, &RI) ||
+ MI.readsRegister(AArch64::LR, &RI))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Does this use the stack?
+ if (MI.modifiesRegister(AArch64::SP, &RI) ||
+ MI.readsRegister(AArch64::SP, &RI)) {
+
+ // Is it a memory operation?
+ if (MI.mayLoadOrStore()) {
+      unsigned Base; // Filled with the base register of MI.
+ int64_t Offset; // Filled with the offset of MI.
+ unsigned DummyWidth;
+
+ // Does it allow us to offset the base register and is the base SP?
+ if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) ||
+ Base != AArch64::SP)
+ return MachineOutlinerInstrType::Illegal;
+
+ // Find the minimum/maximum offset for this instruction and check if
+ // fixing it up would be in range.
+ int64_t MinOffset, MaxOffset;
+ unsigned DummyScale;
+ getMemOpInfo(MI.getOpcode(), DummyScale, DummyWidth, MinOffset,
+ MaxOffset);
+
+ // TODO: We should really test what happens if an instruction overflows.
+ // This is tricky to test with IR tests, but when the outliner is moved
+ // to a MIR test, it really ought to be checked.
+ if (Offset + 16 < MinOffset || Offset + 16 > MaxOffset)
+ return MachineOutlinerInstrType::Illegal;
+
+ // It's in range, so we can outline it.
+ return MachineOutlinerInstrType::Legal;
+ }
+
+ // We can't fix it up, so don't outline it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+ return MachineOutlinerInstrType::Legal;
+}
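+// Informal restatement of the SP check above: an access such as
+// "ldr x0, [sp, #8]" is only outlinable if its post-outlining form
+// "ldr x0, [sp, #24]" is still encodable, because insertOutlinedCall parks
+// the saved LR in the 16 bytes below the original SP.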
+
+void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
+ for (MachineInstr &MI : MBB) {
+ unsigned Base, Width;
+ int64_t Offset;
+
+ // Is this a load or store with an immediate offset with SP as the base?
+ if (!MI.mayLoadOrStore() ||
+ !getMemOpBaseRegImmOfsWidth(MI, Base, Offset, Width, &RI) ||
+ Base != AArch64::SP)
+ continue;
+
+ // It is, so we have to fix it up.
+ unsigned Scale;
+ int64_t Dummy1, Dummy2;
+
+ MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
+ assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
+ getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
+ assert(Scale != 0 && "Unexpected opcode!");
+
+ // We've pushed the return address to the stack, so add 16 to the offset.
+ // This is safe, since we already checked if it would overflow when we
+ // checked if this instruction was legal to outline.
+ int64_t NewImm = (Offset + 16)/Scale;
+ StackOffsetOperand.setImm(NewImm);
+ }
+}
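+// Worked example (illustrative): "str x0, [sp, #16]" is STRXui with imm = 2
+// and Scale = 8, so Offset = 16 and NewImm = (16 + 16) / 8 = 4, turning the
+// instruction into "str x0, [sp, #32]".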
+
+void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+
+ // If this is a tail call outlined function, then there's already a return.
+ if (IsTailCall)
+ return;
+
+ // It's not a tail call, so we have to insert the return ourselves.
+ MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
+ .addReg(AArch64::LR, RegState::Undef);
+ MBB.insert(MBB.end(), ret);
+
+ // Walk over the basic block and fix up all the stack accesses.
+ fixupPostOutline(MBB);
+}
+
+void AArch64InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {}
+
+MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
+ Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
+ MachineFunction &MF, bool IsTailCall) const {
+
+ // Are we tail calling?
+ if (IsTailCall) {
+ // If yes, then we can just branch to the label.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(AArch64::B))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ return It;
+ }
+
+ // We're not tail calling, so we have to save LR before the call and restore
+ // it after.
+ MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP)
+ .addImm(-16);
+ It = MBB.insert(It, STRXpre);
+ It++;
+
+ // Insert the call.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(AArch64::BL))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+
+ It++;
+
+ // Restore the link register.
+ MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP)
+ .addImm(16);
+ It = MBB.insert(It, LDRXpost);
+
+ return It;
+}
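+// The emitted call site thus corresponds roughly to:
+//   str x30, [sp, #-16]!     // STRXpre: save LR, allocate 16 bytes
+//   bl  <outlined function>  // BL through the global named after MF
+//   ldr x30, [sp], #16       // LDRXpost: restore LR, deallocate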
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 5037866..1765a02 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -27,6 +27,13 @@ namespace llvm {
class AArch64Subtarget;
class AArch64TargetMachine;
+static const MachineMemOperand::Flags MOSuppressPair =
+ MachineMemOperand::MOTargetFlag1;
+static const MachineMemOperand::Flags MOStridedAccess =
+ MachineMemOperand::MOTargetFlag2;
+
+#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access"
+
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
const AArch64Subtarget &Subtarget;
@@ -81,6 +88,9 @@ public:
/// unprofitable.
bool isLdStPairSuppressed(const MachineInstr &MI) const;
+ /// Return true if the given load or store is a strided memory access.
+ bool isStridedAccess(const MachineInstr &MI) const;
+
/// Return true if this is an unscaled load/store.
bool isUnscaledLdSt(unsigned Opc) const;
@@ -119,6 +129,44 @@ public:
}
}
+  /// \brief Return the opcode that sets flags when possible. The caller is
+  /// responsible for ensuring the opc has a flag setting equivalent.
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no flag setting equivalent!");
+ // 32-bit cases:
+ case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri;
+ case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr;
+ case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs;
+ case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx;
+ case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri;
+ case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr;
+ case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs;
+ case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr;
+ case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs;
+ case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri;
+ case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr;
+ case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs;
+ case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx;
+ // 64-bit cases:
+ case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri;
+ case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr;
+ case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs;
+ case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx;
+ case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri;
+ case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr;
+ case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs;
+ case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr;
+ case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs;
+ case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri;
+ case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr;
+ case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs;
+ case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx;
+ }
+ }
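+  // Usage sketch (hypothetical caller):
+  //   bool Is64Bit;
+  //   unsigned NewOpc = convertToFlagSettingOpc(AArch64::ADDWri, Is64Bit);
+  //   // NewOpc == AArch64::ADDSWri, Is64Bit == false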
+
/// Return true if this is a load/store that can be potentially paired/merged.
bool isCandidateToMergeOrPair(MachineInstr &MI) const;
@@ -133,12 +181,19 @@ public:
int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
+  /// Return the immediate offset operand of the load/store \p LdSt.
+ MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const;
+
+ /// \brief Returns true if opcode \p Opc is a memory operation. If it is, set
+ /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
+ ///
+ /// For unscaled instructions, \p Scale is set to 1.
+ bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
+ int64_t &MinOffset, int64_t &MaxOffset) const;
+
bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(const MachineInstr &First,
- const MachineInstr &Second) const override;
-
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
uint64_t Offset, const MDNode *Var,
const MDNode *Expr,
@@ -198,7 +253,7 @@ public:
const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
unsigned FalseReg) const override;
- void getNoopForMachoTarget(MCInst &NopInst) const override;
+ void getNoop(MCInst &NopInst) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
@@ -218,8 +273,8 @@ public:
/// \param Pattern - combiner pattern
bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
/// Return true when there is potentially a faster code sequence
- /// for an instruction chain ending in <Root>. All potential patterns are
- /// listed in the <Patterns> array.
+ /// for an instruction chain ending in ``Root``. All potential patterns are
+ /// listed in the ``Patterns`` array.
bool getMachineCombinerPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns)
const override;
@@ -244,8 +299,36 @@ public:
getSerializableDirectMachineOperandTargetFlags() const override;
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
-
+ ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+ getSerializableMachineMemOperandTargetFlags() const override;
+
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+ unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences,
+ bool CanBeTailCall) const override;
+ AArch64GenInstrInfo::MachineOutlinerInstrType
+ getOutliningType(MachineInstr &MI) const override;
+ void insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ void insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool isTailCall) const override;
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ /// Returns true if the instruction has a shift by immediate that can be
+ /// executed in one cycle less.
+ bool isFalkorShiftExtFast(const MachineInstr &MI) const;
private:
+
+ /// \brief Sets the offsets on outlined instructions in \p MBB which use SP
+ /// so that they will be valid post-outlining.
+ ///
+ /// \param MBB A \p MachineBasicBlock in an outlined function.
+ void fixupPostOutline(MachineBasicBlock &MBB) const;
+
void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const;
@@ -283,7 +366,7 @@ enum AArch64FrameOffsetStatus {
/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
/// use an offset.
/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be
-/// rewriten in @p MI.
+/// rewritten in @p MI.
/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the
/// amount that is off the limit of the legal offset.
/// If set, @p OutUseUnscaledOp will contain whether @p MI should be
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2244baa..5049a39 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -30,17 +30,29 @@ def HasLSE : Predicate<"Subtarget->hasLSE()">,
AssemblerPredicate<"FeatureLSE", "lse">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
AssemblerPredicate<"FeatureRAS", "ras">;
+def HasRDM : Predicate<"Subtarget->hasRDM()">,
+ AssemblerPredicate<"FeatureRDM", "rdm">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
AssemblerPredicate<"FeatureSPE", "spe">;
+def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
+ AssemblerPredicate<"FeatureFuseAES",
+ "fuse-aes">;
+def HasSVE : Predicate<"Subtarget->hasSVE()">,
+ AssemblerPredicate<"FeatureSVE", "sve">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
+def UseNegativeImmediates
+ : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ "NegativeImmediates">;
+
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
//
@@ -149,7 +161,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
- SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]>;
def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -305,10 +318,13 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
//===----------------------------------------------------------------------===//
// AArch64 Instruction Predicate Definitions.
-def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
-def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
-def ForCodeSize : Predicate<"ForCodeSize">;
-def NotForCodeSize : Predicate<"!ForCodeSize">;
+// We could compute these on a per-module basis but doing so requires accessing
+// the Function object through the <Target>Subtarget and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">;
+ def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">;
+}
include "AArch64InstrFormats.td"
@@ -321,8 +337,9 @@ include "AArch64InstrFormats.td"
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
// We set Sched to empty list because we expect these instructions to simply get
// removed in most cases.
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
Sched<[]>;
@@ -424,8 +441,10 @@ def MSRpstateImm1 : MSRpstateImm0_1;
def MSRpstateImm4 : MSRpstateImm0_15;
// The thread pointer (on Linux, at least, where this has been implemented) is
-// TPIDR_EL0.
-def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+// TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects.
+let hasSideEffects = 0 in
+def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
+ [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;
// The cycle counter PMC register is PMCCNTR_EL0.
let Predicates = [HasPerfMon] in
@@ -574,31 +593,31 @@ def : Pat<(f64 fpimm:$in),
// sequences.
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
tglobaladdr:$g1, tglobaladdr:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
- tglobaladdr:$g2, 32),
- tglobaladdr:$g1, 16),
- tglobaladdr:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0),
+ tglobaladdr:$g1, 16),
+ tglobaladdr:$g2, 32),
+ tglobaladdr:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
tblockaddress:$g1, tblockaddress:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
- tblockaddress:$g2, 32),
- tblockaddress:$g1, 16),
- tblockaddress:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0),
+ tblockaddress:$g1, 16),
+ tblockaddress:$g2, 32),
+ tblockaddress:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
tconstpool:$g1, tconstpool:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
- tconstpool:$g2, 32),
- tconstpool:$g1, 16),
- tconstpool:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0),
+ tconstpool:$g1, 16),
+ tconstpool:$g2, 32),
+ tconstpool:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
tjumptable:$g1, tjumptable:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
- tjumptable:$g2, 32),
- tjumptable:$g1, 16),
- tjumptable:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0),
+ tjumptable:$g1, 16),
+ tjumptable:$g2, 32),
+ tjumptable:$g3, 48)>;
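+
+// These patterns mirror the bottom-up MOVZ/MOVK expansion in
+// AArch64InstrInfo.cpp: MOVZ writes the g0 chunk first and MOVK merges in
+// g1..g3.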
//===----------------------------------------------------------------------===//
@@ -697,10 +716,10 @@ def : InstAlias<"negs $dst, $src$shift",
defm UDIV : Div<0, "udiv", udiv>;
defm SDIV : Div<1, "sdiv", sdiv>;
-def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>;
-def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>;
-def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>;
-def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>;
+def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>;
// Variable shift
defm ASRV : Shift<0b10, "asr", sra>;
@@ -718,7 +737,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>;
def : ShiftAlias<"rorv", RORVXr, GPR64>;
// Multiply-add
-let AddedComplexity = 7 in {
+let AddedComplexity = 5 in {
defm MADD : MulAccum<0, "madd", add>;
defm MSUB : MulAccum<1, "msub", sub>;
@@ -735,7 +754,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
-} // AddedComplexity = 7
+} // AddedComplexity = 5
let AddedComplexity = 5 in {
def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
@@ -2577,6 +2596,11 @@ def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
Sched<[WriteF]>;
}
+// Similarly add aliases mapping "fmov $Rd, #0.0" to the zero-register moves.
+def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>,
+ Requires<[HasFullFP16]>;
+def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>;
+def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>;
//===----------------------------------------------------------------------===//
// Floating point conversion instruction.
@@ -2720,60 +2744,36 @@ defm FMOV : FPMoveImmediate<"fmov">;
defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
int_aarch64_neon_uabd>;
// Match UABDL in log2-shuffle patterns.
+def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
+ (zext (v8i8 V64:$opB))))),
+ (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)),
+ (zext (extract_high_v16i8 V128:$opB))))),
+ (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
(zext (extract_high_v16i8 V128:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
-def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
- (v4i32 (add (sub (zext (v4i16 V64:$opA)),
- (zext (v4i16 V64:$opB))),
- (AArch64vashr v4i32:$src, (i32 31))))),
+def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
+ (zext (v4i16 V64:$opB))))),
(UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
-def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
- (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
- (zext (extract_high_v8i16 V128:$opB))),
- (AArch64vashr v4i32:$src, (i32 31))))),
+def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)),
+ (zext (extract_high_v8i16 V128:$opB))))),
(UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
-def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
- (v2i64 (add (sub (zext (v2i32 V64:$opA)),
- (zext (v2i32 V64:$opB))),
- (AArch64vashr v2i64:$src, (i32 63))))),
+def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
+ (zext (v2i32 V64:$opB))))),
(UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
-def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
- (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
- (zext (extract_high_v4i32 V128:$opB))),
- (AArch64vashr v2i64:$src, (i32 63))))),
+def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)),
+ (zext (extract_high_v4i32 V128:$opB))))),
(UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
-defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
-def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
- (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
- (ABSv8i8 V64:$src)>;
-def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
- (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
- (ABSv4i16 V64:$src)>;
-def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
- (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
- (ABSv2i32 V64:$src)>;
-def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
- (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
- (ABSv16i8 V128:$src)>;
-def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
- (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
- (ABSv8i16 V128:$src)>;
-def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
- (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
- (ABSv4i32 V128:$src)>;
-def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
- (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
- (ABSv2i64 V128:$src)>;
-
+defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
@@ -3284,7 +3284,7 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
-let Predicates = [HasV8_1a] in {
+let Predicates = [HasRDM] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
def : Pat<(i32 (int_aarch64_neon_sqadd
@@ -3345,7 +3345,7 @@ def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
// Advanced SIMD two scalar instructions.
//===----------------------------------------------------------------------===//
-defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>;
defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
@@ -5029,7 +5029,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)))>,
Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
-
+
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
@@ -5307,6 +5307,31 @@ def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
+// for AES fusion on some CPUs.
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
+def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+ Sched<[WriteV]>;
+def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+ Sched<[WriteV]>;
+}
+
+// Only use constrained versions of AES(I)MC instructions if they are paired with
+// AESE/AESD.
+def : Pat<(v16i8 (int_aarch64_crypto_aesmc
+ (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
+ (v16i8 V128:$src2))))),
+ (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
+ (v16i8 V128:$src2)))))>,
+ Requires<[HasFuseAES]>;
+
+def : Pat<(v16i8 (int_aarch64_crypto_aesimc
+ (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
+ (v16i8 V128:$src2))))),
+ (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
+ (v16i8 V128:$src2)))))>,
+ Requires<[HasFuseAES]>;
+
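+// Sketch of the intended effect: with +fuse-aes, "aese v0, v1" followed by
+// "aesmc v0, v0" selects AESErr + AESMCrrTied, and the $Rn = $Rd tie keeps
+// AESMC consuming exactly the register AESE wrote -- the arrangement the
+// fusion logic on these CPUs expects.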
def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index b514735..7e275e4 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -12,17 +12,20 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#include "AArch64InstructionSelector.h"
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
@@ -30,19 +33,79 @@
#define DEBUG_TYPE "aarch64-isel"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
#error "You shouldn't build this"
#endif
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class AArch64InstructionSelector : public InstructionSelector {
+public:
+ AArch64InstructionSelector(const AArch64TargetMachine &TM,
+ const AArch64Subtarget &STI,
+ const AArch64RegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+
+private:
+ /// tblgen-erated 'select' implementation, used as the initial selector for
+ /// the patterns that don't require complex C++.
+ bool selectImpl(MachineInstr &I) const;
+
+ bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+ bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+
+ bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+
+ ComplexRendererFn selectArithImmed(MachineOperand &Root) const;
+
+ const AArch64TargetMachine &TM;
+ const AArch64Subtarget &STI;
+ const AArch64InstrInfo &TII;
+ const AArch64RegisterInfo &TRI;
+ const AArch64RegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+// We declare the temporaries used by selectImpl() in the class to minimize the
+// cost of constructing placeholder values.
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
AArch64InstructionSelector::AArch64InstructionSelector(
const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI)
- : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
@@ -119,71 +182,39 @@ static bool unsupportedBinOp(const MachineInstr &I,
}
/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
-/// (such as G_OR or G_ADD), appropriate for the register bank \p RegBankID
+/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
unsigned OpSize) {
switch (RegBankID) {
case AArch64::GPRRegBankID:
- if (OpSize <= 32) {
- assert((OpSize == 32 || (GenericOpc != TargetOpcode::G_SDIV &&
- GenericOpc != TargetOpcode::G_UDIV &&
- GenericOpc != TargetOpcode::G_LSHR &&
- GenericOpc != TargetOpcode::G_ASHR)) &&
- "operation should have been legalized before now");
-
+ if (OpSize == 32) {
switch (GenericOpc) {
- case TargetOpcode::G_OR:
- return AArch64::ORRWrr;
- case TargetOpcode::G_XOR:
- return AArch64::EORWrr;
- case TargetOpcode::G_AND:
- return AArch64::ANDWrr;
- case TargetOpcode::G_ADD:
- assert(OpSize != 32 && "s32 G_ADD should have been selected");
- return AArch64::ADDWrr;
- case TargetOpcode::G_SUB:
- return AArch64::SUBWrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVWr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVWr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVWr;
- case TargetOpcode::G_SDIV:
- return AArch64::SDIVWr;
- case TargetOpcode::G_UDIV:
- return AArch64::UDIVWr;
default:
return GenericOpc;
}
} else if (OpSize == 64) {
switch (GenericOpc) {
- case TargetOpcode::G_OR:
- return AArch64::ORRXrr;
- case TargetOpcode::G_XOR:
- return AArch64::EORXrr;
- case TargetOpcode::G_AND:
- return AArch64::ANDXrr;
case TargetOpcode::G_GEP:
return AArch64::ADDXrr;
- case TargetOpcode::G_SUB:
- return AArch64::SUBXrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVXr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVXr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVXr;
- case TargetOpcode::G_SDIV:
- return AArch64::SDIVXr;
- case TargetOpcode::G_UDIV:
- return AArch64::UDIVXr;
default:
return GenericOpc;
}
}
+ break;
case AArch64::FPRRegBankID:
switch (OpSize) {
case 32:
@@ -215,7 +246,8 @@ static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
return GenericOpc;
}
}
- };
+ break;
+ }
return GenericOpc;
}
@@ -239,6 +271,7 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
case 64:
return isStore ? AArch64::STRXui : AArch64::LDRXui;
}
+ break;
case AArch64::FPRRegBankID:
switch (OpSize) {
case 8:
@@ -250,7 +283,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
case 64:
return isStore ? AArch64::STRDui : AArch64::LDRDui;
}
- };
+ break;
+ }
return GenericOpc;
}
@@ -473,6 +507,82 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
}
}
+bool AArch64InstructionSelector::selectCompareBranch(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+
+ const unsigned CondReg = I.getOperand(0).getReg();
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ MachineInstr *CCMI = MRI.getVRegDef(CondReg);
+ if (CCMI->getOpcode() != TargetOpcode::G_ICMP)
+ return false;
+
+ unsigned LHS = CCMI->getOperand(2).getReg();
+ unsigned RHS = CCMI->getOperand(3).getReg();
+ if (!getConstantVRegVal(RHS, MRI))
+ std::swap(RHS, LHS);
+
+ const auto RHSImm = getConstantVRegVal(RHS, MRI);
+ if (!RHSImm || *RHSImm != 0)
+ return false;
+
+ const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
+ if (RB.getID() != AArch64::GPRRegBankID)
+ return false;
+
+ const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
+ if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
+ return false;
+
+ const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits();
+ unsigned CBOpc = 0;
+ if (CmpWidth <= 32)
+ CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW);
+ else if (CmpWidth == 64)
+ CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX);
+ else
+ return false;
+
+ auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
+ .addUse(LHS)
+ .addMBB(DestMBB);
+
+ constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
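
A hedged aside on the case handled above (editorial, not part of the patch): a G_BRCOND whose condition is a G_ICMP equality/inequality against zero becomes a single compare-and-branch, and the opcode choice reduces to a pure function of the predicate and compare width. Names below are illustrative only.

// Editorial sketch of selectCompareBranch's opcode choice.
enum class EqPred { EQ, NE };

const char *compareBranchOpcName(EqPred P, unsigned WidthBits) {
  if (WidthBits <= 32)
    return P == EqPred::EQ ? "CBZW" : "CBNZW"; // 32-bit compare-and-branch
  if (WidthBits == 64)
    return P == EqPred::EQ ? "CBZX" : "CBNZX"; // 64-bit compare-and-branch
  return nullptr; // unsupported width: leave the branch to generic selection
}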
+
+bool AArch64InstructionSelector::selectVaStartAAPCS(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ return false;
+}
+
+bool AArch64InstructionSelector::selectVaStartDarwin(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ unsigned ListReg = I.getOperand(0).getReg();
+
+ unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+
+ auto MIB =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
+ .addDef(ArgsAddrReg)
+ .addFrameIndex(FuncInfo->getVarArgsStackIndex())
+ .addImm(0)
+ .addImm(0);
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+ MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
+ .addUse(ArgsAddrReg)
+ .addUse(ListReg)
+ .addImm(0)
+ .addMemOperand(*I.memoperands_begin());
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::select(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -549,6 +659,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const unsigned CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ if (selectCompareBranch(I, MF, MRI))
+ return true;
+
auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
.addUse(CondReg)
.addImm(/*bit offset=*/0)
@@ -558,6 +671,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
}
+ case TargetOpcode::G_BRINDIRECT: {
+ I.setDesc(TII.get(AArch64::BR));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_CONSTANT: {
const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
@@ -629,9 +747,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
// FIXME: Is going through int64_t always correct?
ImmOp.ChangeToImmediate(
ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
- } else {
+ } else if (I.getOperand(1).isCImm()) {
uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
I.getOperand(1).ChangeToImmediate(Val);
+ } else if (I.getOperand(1).isImm()) {
+ uint64_t Val = I.getOperand(1).getImm();
+ I.getOperand(1).ChangeToImmediate(Val);
}
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -686,10 +807,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return false;
}
-#ifndef NDEBUG
- // Sanity-check the pointer register.
+ auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
const unsigned PtrReg = I.getOperand(1).getReg();
+#ifndef NDEBUG
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
+ // Sanity-check the pointer register.
assert(PtrRB.getID() == AArch64::GPRRegBankID &&
"Load/Store pointer operand isn't a GPR");
assert(MRI.getType(PtrReg).isPointer() &&
@@ -706,11 +833,46 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
I.setDesc(TII.get(NewOpc));
- I.addOperand(MachineOperand::CreateImm(0));
+ uint64_t Offset = 0;
+ auto *PtrMI = MRI.getVRegDef(PtrReg);
+
+ // Try to fold a GEP into our unsigned immediate addressing mode.
+ if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ const unsigned Size = MemTy.getSizeInBits() / 8;
+ const unsigned Scale = Log2_32(Size);
+ if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
+ unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
+ I.getOperand(1).setReg(Ptr2Reg);
+ PtrMI = MRI.getVRegDef(Ptr2Reg);
+ Offset = Imm / Size;
+ }
+ }
+ }
+
+ // If we haven't folded anything into our addressing mode yet, try to fold
+ // a frame index into the base+offset.
+ if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX)
+ I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex());
+
+ I.addOperand(MachineOperand::CreateImm(Offset));
+
+ // If we're storing a 0, use WZR/XZR.
+ if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
+ if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
+ if (I.getOpcode() == AArch64::STRWui)
+ I.getOperand(0).setReg(AArch64::WZR);
+ else if (I.getOpcode() == AArch64::STRXui)
+ I.getOperand(0).setReg(AArch64::XZR);
+ }
+ }
+
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
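
As an editorial note on the GEP fold above: the unsigned-immediate addressing mode holds a 12-bit offset scaled by the access size, so the fold applies exactly when the byte offset is a non-negative multiple of the access size whose scaled value fits in 12 bits. A minimal sketch, assuming a power-of-two access size (helper name hypothetical):

#include <cstdint>
#include <optional>

// Returns the value for the uimm12 field if the byte offset can be folded
// into the load/store's scaled unsigned-immediate addressing mode.
std::optional<uint64_t> foldGEPOffset(int64_t ByteOff, unsigned AccessBytes) {
  if ((ByteOff & (AccessBytes - 1)) == 0 && ByteOff >= 0 &&
      ByteOff < int64_t(0x1000) * AccessBytes) // 0x1000 << log2(AccessBytes)
    return uint64_t(ByteOff) / AccessBytes;    // scaled offset
  return std::nullopt; // can't fold; keep the address computation separate
}
// e.g. an 8-byte access at byte offset 24 folds as uimm12 = 3.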
- case TargetOpcode::G_MUL: {
+ case TargetOpcode::G_SMULH:
+ case TargetOpcode::G_UMULH: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
@@ -719,48 +881,33 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n");
+ DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
return false;
}
- unsigned ZeroReg;
- unsigned NewOpc;
- if (Ty.isScalar() && Ty.getSizeInBits() <= 32) {
- NewOpc = AArch64::MADDWrrr;
- ZeroReg = AArch64::WZR;
- } else if (Ty == LLT::scalar(64)) {
- NewOpc = AArch64::MADDXrrr;
- ZeroReg = AArch64::XZR;
- } else {
- DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: "
- << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n');
+ if (Ty != LLT::scalar(64)) {
+ DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
+ << ", expected: " << LLT::scalar(64) << '\n');
return false;
}
+ unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
+ : AArch64::UMULHrr;
I.setDesc(TII.get(NewOpc));
- I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false));
-
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
-
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_OR:
- case TargetOpcode::G_XOR:
- case TargetOpcode::G_AND:
case TargetOpcode::G_SHL:
case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR:
- case TargetOpcode::G_SDIV:
- case TargetOpcode::G_UDIV:
- case TargetOpcode::G_ADD:
- case TargetOpcode::G_SUB:
case TargetOpcode::G_GEP: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
@@ -783,6 +930,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_PTR_MASK: {
+ uint64_t Align = I.getOperand(2).getImm();
+ if (Align >= 64 || Align == 0)
+ return false;
+
+ uint64_t Mask = ~((1ULL << Align) - 1);
+ I.setDesc(TII.get(AArch64::ANDXri));
+ I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
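
For reference, a minimal sketch of the mask built for G_PTR_MASK above (editorial; the real code additionally re-encodes it as an AArch64 logical immediate for ANDXri): clearing the low Align bits rounds a pointer down to a 2^Align-byte boundary.

#include <cstdint>

// Mask that clears the low AlignBits bits; requires 0 < AlignBits < 64,
// matching the guard in the G_PTR_MASK case above.
uint64_t pointerAlignMask(uint64_t AlignBits) {
  return ~((1ULL << AlignBits) - 1);
}
// pointerAlignMask(4) == 0xFFFFFFFFFFFFFFF0, i.e. align down to 16 bytes.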
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_TRUNC: {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -795,7 +953,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
- DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+ DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
return false;
}
@@ -812,16 +970,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
return false;
}
if (DstRC == SrcRC) {
// Nothing to be done
+ } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
+ SrcTy == LLT::scalar(64)) {
+ llvm_unreachable("TableGen can import this case");
+ return false;
} else if (DstRC == &AArch64::GPR32RegClass &&
SrcRC == &AArch64::GPR64RegClass) {
I.getOperand(1).setSubReg(AArch64::sub_32);
} else {
+ DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
return false;
}
@@ -1026,7 +1189,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
if (Ty == LLT::scalar(32)) {
CSelOpc = AArch64::CSELWr;
- } else if (Ty == LLT::scalar(64)) {
+ } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
CSelOpc = AArch64::CSELXr;
} else {
return false;
@@ -1134,7 +1297,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
.addDef(Def1Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
- .addImm(CC1);
+ .addImm(getInvertedCondCode(CC1));
if (CC2 != AArch64CC::AL) {
unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
@@ -1143,7 +1306,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
.addDef(Def2Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
- .addImm(CC2);
+ .addImm(getInvertedCondCode(CC2));
MachineInstr &OrMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
.addDef(DefReg)
@@ -1159,7 +1322,67 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
I.eraseFromParent();
return true;
}
+ case TargetOpcode::G_VASTART:
+ return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
+ : selectVaStartAAPCS(I, MF, MRI);
+ case TargetOpcode::G_IMPLICIT_DEF:
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ return true;
}
return false;
}
+
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return a renderer
+/// that adds the 12-bit value and the shifter operand; otherwise nullptr.
+InstructionSelector::ComplexRendererFn
+AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
+ MachineInstr &MI = *Root.getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+ // which lists [imm] as the list of opcodes it is interested in. However,
+ // we still need to check here whether the operand is actually an
+ // immediate, because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ uint64_t Immed;
+ if (Root.isImm())
+ Immed = Root.getImm();
+ else if (Root.isCImm())
+ Immed = Root.getCImm()->getZExtValue();
+ else if (Root.isReg()) {
+ MachineInstr *Def = MRI.getVRegDef(Root.getReg());
+ if (Def->getOpcode() != TargetOpcode::G_CONSTANT)
+ return nullptr;
+ MachineOperand &Op1 = Def->getOperand(1);
+ if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64)
+ return nullptr;
+ Immed = Op1.getCImm()->getZExtValue();
+ } else
+ return nullptr;
+
+ unsigned ShiftAmt;
+
+ if (Immed >> 12 == 0) {
+ ShiftAmt = 0;
+ } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+ ShiftAmt = 12;
+ Immed = Immed >> 12;
+ } else
+ return nullptr;
+
+ unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+ return [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed).addImm(ShVal); };
+}
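
The split performed above can be stated standalone: AArch64 arithmetic immediates are a 12-bit value optionally shifted left by 12, so a constant either fits directly, fits after shifting, or must be materialized separately. A minimal editorial sketch:

#include <cstdint>
#include <optional>

struct ArithImm { uint64_t Imm12; unsigned ShiftAmt; };

std::optional<ArithImm> splitArithImmed(uint64_t Immed) {
  if (Immed >> 12 == 0)
    return ArithImm{Immed, 0};        // fits in the low 12 bits
  if ((Immed & 0xfff) == 0 && Immed >> 24 == 0)
    return ArithImm{Immed >> 12, 12}; // low 12 bits clear: use LSL #12
  return std::nullopt;                // needs separate materialization
}
// e.g. 0x123000 splits into Imm12 = 0x123 with ShiftAmt = 12.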
+
+namespace llvm {
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &TM,
+ AArch64Subtarget &Subtarget,
+ AArch64RegisterBankInfo &RBI) {
+ return new AArch64InstructionSelector(TM, Subtarget, RBI);
+}
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
deleted file mode 100644
index 2c6e5a9..0000000
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- AArch64InstructionSelector --------------------------------*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file declares the targeting of the InstructionSelector class for
-/// AArch64.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-
-namespace llvm {
-
-class AArch64InstrInfo;
-class AArch64RegisterBankInfo;
-class AArch64RegisterInfo;
-class AArch64Subtarget;
-class AArch64TargetMachine;
-
-class AArch64InstructionSelector : public InstructionSelector {
-public:
- AArch64InstructionSelector(const AArch64TargetMachine &TM,
- const AArch64Subtarget &STI,
- const AArch64RegisterBankInfo &RBI);
-
- bool select(MachineInstr &I) const override;
-
-private:
- /// tblgen-erated 'select' implementation, used as the initial selector for
- /// the patterns that don't require complex C++.
- bool selectImpl(MachineInstr &I) const;
-
- const AArch64TargetMachine &TM;
- const AArch64Subtarget &STI;
- const AArch64InstrInfo &TII;
- const AArch64RegisterInfo &TRI;
- const AArch64RegisterBankInfo &RBI;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 83f276a..ffb2783 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -13,9 +13,12 @@
//===----------------------------------------------------------------------===//
#include "AArch64LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Type.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;
@@ -36,11 +39,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
- for (auto BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
+ for (auto Ty : {p0, s1, s8, s16, s32, s64})
+ setAction({G_IMPLICIT_DEF, Ty}, Legal);
+
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
// These operations naturally get the right answer when used on
// GPR32, even if the actual type is narrower.
- for (auto Ty : {s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
+ for (auto Ty : {s32, s64, v2s32, v4s32, v2s64})
setAction({BinOp, Ty}, Legal);
+
+ for (auto Ty : {s1, s8, s16})
+ setAction({BinOp, Ty}, WidenScalar);
}
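
The "naturally get the right answer" remark above can be modelled directly (editorial sketch): for these bitwise/arithmetic ops, the low bits of the widened operation equal the narrow operation, so truncating the 32-bit result back recovers the s8/s16 value.

#include <cstdint>

// Model of widening an s8 G_ADD to s32: the low 8 bits of a 32-bit add
// equal the 8-bit add of the low bytes.
uint8_t add8ViaWiden(uint8_t A, uint8_t B) {
  uint32_t Wide = uint32_t(A) + uint32_t(B); // widened op on a GPR32
  return uint8_t(Wide);                      // truncate back to s8
}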
setAction({G_GEP, p0}, Legal);
@@ -49,7 +58,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
for (auto Ty : {s1, s8, s16, s32})
setAction({G_GEP, 1, Ty}, WidenScalar);
- for (auto BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
+ setAction({G_PTR_MASK, p0}, Legal);
+
+ for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
for (auto Ty : {s32, s64})
setAction({BinOp, Ty}, Legal);
@@ -57,25 +68,47 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({BinOp, Ty}, WidenScalar);
}
- for (auto BinOp : { G_SREM, G_UREM })
+ for (unsigned BinOp : {G_SREM, G_UREM})
for (auto Ty : { s1, s8, s16, s32, s64 })
setAction({BinOp, Ty}, Lower);
- for (auto Op : { G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO }) {
+ for (unsigned Op : {G_SMULO, G_UMULO})
+ setAction({Op, s64}, Lower);
+
+ for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
for (auto Ty : { s32, s64 })
setAction({Op, Ty}, Legal);
setAction({Op, 1, s1}, Legal);
}
- for (auto BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV})
for (auto Ty : {s32, s64})
setAction({BinOp, Ty}, Legal);
- setAction({G_FREM, s32}, Libcall);
- setAction({G_FREM, s64}, Libcall);
+ for (unsigned BinOp : {G_FREM, G_FPOW}) {
+ setAction({BinOp, s32}, Libcall);
+ setAction({BinOp, s64}, Libcall);
+ }
+
+ for (auto Ty : {s32, s64, p0}) {
+ setAction({G_INSERT, Ty}, Legal);
+ setAction({G_INSERT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {s1, s8, s16}) {
+ setAction({G_INSERT, Ty}, WidenScalar);
+ setAction({G_INSERT, 1, Ty}, Legal);
+ // FIXME: Can't widen the sources because that violates the constraints on
+ // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
+ }
+
+ for (auto Ty : {s1, s8, s16, s32, s64, p0})
+ setAction({G_EXTRACT, Ty}, Legal);
+
+ for (auto Ty : {s32, s64})
+ setAction({G_EXTRACT, 1, Ty}, Legal);
- for (auto MemOp : {G_LOAD, G_STORE}) {
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
setAction({MemOp, Ty}, Legal);
@@ -141,12 +174,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_TRUNC, 1, Ty}, Legal);
// Conversions
- for (auto Ty : { s1, s8, s16, s32, s64 }) {
+ for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 0, Ty}, Legal);
setAction({G_FPTOUI, 0, Ty}, Legal);
setAction({G_SITOFP, 1, Ty}, Legal);
setAction({G_UITOFP, 1, Ty}, Legal);
}
+ for (auto Ty : { s1, s8, s16 }) {
+ setAction({G_FPTOSI, 0, Ty}, WidenScalar);
+ setAction({G_FPTOUI, 0, Ty}, WidenScalar);
+ setAction({G_SITOFP, 1, Ty}, WidenScalar);
+ setAction({G_UITOFP, 1, Ty}, WidenScalar);
+ }
for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 1, Ty}, Legal);
@@ -158,9 +197,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
// Control-flow
for (auto Ty : {s1, s8, s16, s32})
setAction({G_BRCOND, Ty}, Legal);
+ setAction({G_BRINDIRECT, p0}, Legal);
// Select
- for (auto Ty : {s1, s8, s16, s32, s64})
+ for (auto Ty : {s1, s8, s16})
+ setAction({G_SELECT, Ty}, WidenScalar);
+
+ for (auto Ty : {s32, s64, p0})
setAction({G_SELECT, Ty}, Legal);
setAction({G_SELECT, 1, s1}, Legal);
@@ -200,5 +243,81 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
}
+ setAction({G_VASTART, p0}, Legal);
+
+ // va_list must be a pointer, but most sized types are pretty easy to handle
+ // as the destination.
+ setAction({G_VAARG, 1, p0}, Legal);
+
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({G_VAARG, Ty}, Custom);
+
computeTables();
}
+
+bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (MI.getOpcode()) {
+ default:
+ // No idea what to do.
+ return false;
+ case TargetOpcode::G_VAARG:
+ return legalizeVaArg(MI, MRI, MIRBuilder);
+ }
+
+ llvm_unreachable("expected switch to return");
+}
+
+bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MIRBuilder.setInstr(MI);
+ MachineFunction &MF = MIRBuilder.getMF();
+ unsigned Align = MI.getOperand(2).getImm();
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned ListPtr = MI.getOperand(1).getReg();
+
+ LLT PtrTy = MRI.getType(ListPtr);
+ LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
+ unsigned List = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildLoad(
+ List, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+ PtrSize, /* Align = */ PtrSize));
+
+ unsigned DstPtr;
+ if (Align > PtrSize) {
+ // Realign the list to the actual required alignment.
+ auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1);
+
+ unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(ListTmp, List, AlignMinus1->getOperand(0).getReg());
+
+ DstPtr = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
+ } else
+ DstPtr = List;
+
+ uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8;
+ MIRBuilder.buildLoad(
+ Dst, DstPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+ ValSize, std::max(Align, PtrSize)));
+
+ unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy);
+ MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize));
+
+ unsigned NewList = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(NewList, DstPtr, SizeReg);
+
+ MIRBuilder.buildStore(
+ NewList, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
+ PtrSize, /* Align = */ PtrSize));
+
+ MI.eraseFromParent();
+ return true;
+}
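
A plain-C++ model of the sequence legalizeVaArg emits above may help (editorial; names hypothetical, a sketch of the generated code rather than the generator): load the current list pointer, realign it if the value needs more than pointer alignment, load the value, then store back the pointer advanced by the value size rounded up to the 8-byte slot size.

#include <cstdint>
#include <cstring>

template <typename T>
T vaArgModel(char *&ListPtr, unsigned AlignBytes) {
  char *P = ListPtr;
  if (AlignBytes > 8) { // realign past the 8-byte slot (the GEP + ptr-mask)
    uintptr_t V = (uintptr_t)P + AlignBytes - 1;
    P = (char *)(V & ~(uintptr_t)(AlignBytes - 1));
  }
  T Val;
  std::memcpy(&Val, P, sizeof(T));              // load the argument value
  ListPtr = P + ((sizeof(T) + 7) & ~size_t(7)); // advance by alignTo(size, 8)
  return Val;
}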
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
index feacbef..42d4ac1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -25,6 +25,13 @@ class LLVMContext;
class AArch64LegalizerInfo : public LegalizerInfo {
public:
AArch64LegalizerInfo();
+
+ bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
+private:
+ bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
};
} // End llvm namespace.
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 8e312dc..9a7f45b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -18,17 +18,27 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+
using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
@@ -58,15 +68,15 @@ typedef struct LdStPairFlags {
// If a matching instruction is found, MergeForward is set to true if the
// merge is to remove the first instruction and replace the second with
// a pair-wise insn, and false if the reverse is true.
- bool MergeForward;
+ bool MergeForward = false;
// SExtIdx gives the index of the result of the load pair that must be
// extended. The value of SExtIdx assumes that the paired load produces the
// value in this order: (I, returned iterator), i.e., -1 means no value has
// to be extended, 0 means I, and 1 means the returned iterator.
- int SExtIdx;
+ int SExtIdx = -1;
- LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
+ LdStPairFlags() = default;
void setMergeForward(bool V = true) { MergeForward = V; }
bool getMergeForward() const { return MergeForward; }
@@ -78,10 +88,12 @@ typedef struct LdStPairFlags {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;
+
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}
+ AliasAnalysis *AA;
const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
@@ -89,6 +101,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Track which registers have been modified and used.
BitVector ModifiedRegs, UsedRegs;
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
@@ -162,8 +179,10 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};
+
char AArch64LoadStoreOpt::ID = 0;
-} // namespace
+
+} // end anonymous namespace
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
AARCH64_LOAD_STORE_OPT_NAME, false, false)
@@ -246,7 +265,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
default:
if (IsValidLdStrOpc)
*IsValidLdStrOpc = false;
- return UINT_MAX;
+ return std::numeric_limits<unsigned>::max();
case AArch64::STRDui:
case AArch64::STURDi:
case AArch64::STRQui:
@@ -369,6 +388,10 @@ static unsigned isMatchingStore(MachineInstr &LoadInst,
}
static unsigned getPreIndexedOpcode(unsigned Opc) {
+ // FIXME: We don't currently support creating pre-indexed loads/stores when
+ // the load or store is the unscaled version. If we decide to perform such an
+ // optimization in the future, the cases for the unscaled loads/stores will
+ // need to be added here.
switch (Opc) {
default:
llvm_unreachable("Opcode has no pre-indexed equivalent!");
@@ -432,32 +455,42 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
default:
llvm_unreachable("Opcode has no post-indexed wise equivalent!");
case AArch64::STRSui:
+ case AArch64::STURSi:
return AArch64::STRSpost;
case AArch64::STRDui:
+ case AArch64::STURDi:
return AArch64::STRDpost;
case AArch64::STRQui:
+ case AArch64::STURQi:
return AArch64::STRQpost;
case AArch64::STRBBui:
return AArch64::STRBBpost;
case AArch64::STRHHui:
return AArch64::STRHHpost;
case AArch64::STRWui:
+ case AArch64::STURWi:
return AArch64::STRWpost;
case AArch64::STRXui:
+ case AArch64::STURXi:
return AArch64::STRXpost;
case AArch64::LDRSui:
+ case AArch64::LDURSi:
return AArch64::LDRSpost;
case AArch64::LDRDui:
+ case AArch64::LDURDi:
return AArch64::LDRDpost;
case AArch64::LDRQui:
+ case AArch64::LDURQi:
return AArch64::LDRQpost;
case AArch64::LDRBBui:
return AArch64::LDRBBpost;
case AArch64::LDRHHui:
return AArch64::LDRHHpost;
case AArch64::LDRWui:
+ case AArch64::LDURWi:
return AArch64::LDRWpost;
case AArch64::LDRXui:
+ case AArch64::LDURXi:
return AArch64::LDRXpost;
case AArch64::LDRSWui:
return AArch64::LDRSWpost;
@@ -595,7 +628,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
MachineInstrBuilder MIB;
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
- .addOperand(BaseRegOp)
+ .add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*MergeMI));
(void)MIB;
@@ -709,9 +742,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
}
}
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
- .addOperand(RegOp0)
- .addOperand(RegOp1)
- .addOperand(BaseRegOp)
+ .add(RegOp0)
+ .add(RegOp1)
+ .add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*Paired));
@@ -776,6 +809,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
int LoadSize = getMemScale(*LoadI);
int StoreSize = getMemScale(*StoreI);
unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+ const MachineOperand &StMO = getLdStRegOp(*StoreI);
unsigned StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
@@ -788,7 +822,13 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// Remove the load, if the destination register of the loads is the same
// register for stored value.
if (StRt == LdRt && LoadSize == 8) {
- StoreI->clearRegisterKills(StRt, TRI);
+ for (MachineInstr &MI : make_range(StoreI->getIterator(),
+ LoadI->getIterator())) {
+ if (MI.killsRegister(StRt, TRI)) {
+ MI.clearRegisterKills(StRt, TRI);
+ break;
+ }
+ }
DEBUG(dbgs() << "Remove load instruction:\n ");
DEBUG(LoadI->print(dbgs()));
DEBUG(dbgs() << "\n");
@@ -800,7 +840,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
- .addReg(StRt)
+ .add(StMO)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
// FIXME: Currently we disable this transformation in big-endian targets as
@@ -841,14 +881,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
DestReg)
- .addReg(StRt)
+ .add(StMO)
.addImm(AndMaskEncoded);
} else {
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
DestReg)
- .addReg(StRt)
+ .add(StMO)
.addImm(Immr)
.addImm(Imms);
}
@@ -857,7 +897,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// Clear kill flags between store and load.
for (MachineInstr &MI : make_range(StoreI->getIterator(),
BitExtMI->getIterator()))
- MI.clearRegisterKills(StRt, TRI);
+ if (MI.killsRegister(StRt, TRI)) {
+ MI.clearRegisterKills(StRt, TRI);
+ break;
+ }
DEBUG(dbgs() << "Promoting load by replacing :\n ");
DEBUG(StoreI->print(dbgs()));
@@ -923,7 +966,7 @@ static int alignTo(int Num, int PowOf2) {
}
static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
- const AArch64InstrInfo *TII) {
+ AliasAnalysis *AA) {
// One of the instructions must modify memory.
if (!MIa.mayStore() && !MIb.mayStore())
return false;
@@ -932,14 +975,14 @@ static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
return false;
- return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
+ return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
}
static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
- const AArch64InstrInfo *TII) {
+ AliasAnalysis *AA) {
for (MachineInstr *MIb : MemInsns)
- if (mayAlias(MIa, *MIb, TII))
+ if (mayAlias(MIa, *MIb, AA))
return true;
return false;
@@ -997,7 +1040,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
return false;
// If we encounter a store aliased with the load, return early.
- if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
+ if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
return false;
} while (MBBI != B && Count < Limit);
return false;
@@ -1167,7 +1210,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// first.
if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
!(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
- !mayAlias(MI, MemInsns, TII)) {
+ !mayAlias(MI, MemInsns, AA)) {
Flags.setMergeForward(false);
return MBBI;
}
@@ -1178,7 +1221,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// into the second.
if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
!(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
- !mayAlias(FirstMI, MemInsns, TII)) {
+ !mayAlias(FirstMI, MemInsns, AA)) {
Flags.setMergeForward(true);
return MBBI;
}
@@ -1233,19 +1276,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(*Update))
- .addOperand(getLdStRegOp(*I))
- .addOperand(getLdStBaseOp(*I))
+ .add(getLdStRegOp(*Update))
+ .add(getLdStRegOp(*I))
+ .add(getLdStBaseOp(*I))
.addImm(Value)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
} else {
// Paired instruction.
int Scale = getMemScale(*I);
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(*Update))
- .addOperand(getLdStRegOp(*I, 0))
- .addOperand(getLdStRegOp(*I, 1))
- .addOperand(getLdStBaseOp(*I))
+ .add(getLdStRegOp(*Update))
+ .add(getLdStRegOp(*I, 0))
+ .add(getLdStRegOp(*I, 1))
+ .add(getLdStBaseOp(*I))
.addImm(Value / Scale)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
}
@@ -1545,7 +1588,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
case AArch64::LDURBBi:
case AArch64::LDURHHi:
case AArch64::LDURWi:
- case AArch64::LDURXi: {
+ case AArch64::LDURXi:
if (tryToPromoteLoadFromStore(MBBI)) {
Modified = true;
break;
@@ -1553,7 +1596,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
break;
}
- }
}
// 2) Merge adjacent zero stores into a wider store.
// e.g.,
@@ -1666,8 +1708,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++NumPostFolded;
break;
}
- // Don't know how to handle pre/post-index versions, so move to the next
- // instruction.
+
+ // Don't know how to handle unscaled pre/post-index versions below, so
+ // move to the next instruction.
if (TII->isUnscaledLdSt(Opc)) {
++MBBI;
break;
@@ -1722,6 +1765,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
TRI = Subtarget->getRegisterInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
// Resize the modified and used register bitfield trackers. We do this once
// per function and then clear the bitfield each time we optimize a load or
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 45083df..f82b9db 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -151,13 +151,24 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
return MCOperand::createExpr(Expr);
}
+MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+ const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ return MCOperand::createExpr(Expr);
+}
+
MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
if (Printer.TM.getTargetTriple().isOSDarwin())
return lowerSymbolOperandDarwin(MO, Sym);
+ if (Printer.TM.getTargetTriple().isOSBinFormatCOFF())
+ return lowerSymbolOperandCOFF(MO, Sym);
- assert(Printer.TM.getTargetTriple().isOSBinFormatELF() &&
- "Expect Darwin or ELF target");
+ assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && "Invalid target");
return lowerSymbolOperandELF(MO, Sym);
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
index 1e29b80..aa30fe1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
@@ -42,6 +42,8 @@ public:
MCSymbol *Sym) const;
MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
MCSymbol *Sym) const;
+ MCOperand lowerSymbolOperandCOFF(const MachineOperand &MO,
+ MCSymbol *Sym) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
new file mode 100644
index 0000000..963cfad
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -0,0 +1,166 @@
+//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AArch64 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MacroFusion.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Check if the instruction pair, FirstMI and SecondMI, should be
+/// fused together. When FirstMI is unspecified, check whether SecondMI may
+/// be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
+ const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
+
+ // Assume wildcards for unspecified instrs.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ if (ST.hasArithmeticBccFusion())
+ // Fuse CMN, CMP, TST followed by Bcc.
+ if (SecondOpcode == AArch64::Bcc)
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ return true;
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXrs:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSXrs:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
+ // Shift value can be 0, making these behave like the "rr" variant...
+ return !II.hasShiftedReg(*FirstMI);
+ case AArch64::INSTRUCTION_LIST_END:
+ return true;
+ }
+
+ if (ST.hasArithmeticCbzFusion())
+ // Fuse ALU operations followed by CBZ/CBNZ.
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::EORWri:
+ case AArch64::EORWrr:
+ case AArch64::EORXri:
+ case AArch64::EORXrr:
+ case AArch64::ORRWri:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXri:
+ case AArch64::ORRXrr:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ return true;
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ // Shift value can be 0, making these behave like the "rr" variant...
+ return !II.hasShiftedReg(*FirstMI);
+ case AArch64::INSTRUCTION_LIST_END:
+ return true;
+ }
+
+ if (ST.hasFuseAES())
+ // Fuse AES crypto operations.
+ switch(SecondOpcode) {
+ // AES encode.
+ case AArch64::AESMCrr:
+ case AArch64::AESMCrrTied:
+ return FirstOpcode == AArch64::AESErr ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ // AES decode.
+ case AArch64::AESIMCrr:
+ case AArch64::AESIMCrrTied:
+ return FirstOpcode == AArch64::AESDrr ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ }
+
+ if (ST.hasFuseLiterals())
+ // Fuse literal generation operations.
+ switch (SecondOpcode) {
+ // PC relative address.
+ case AArch64::ADDXri:
+ return FirstOpcode == AArch64::ADRP ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ // 32 bit immediate.
+ case AArch64::MOVKWi:
+ return (FirstOpcode == AArch64::MOVZWi &&
+ SecondMI.getOperand(3).getImm() == 16) ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ // Lower and upper half of 64 bit immediate.
+ case AArch64::MOVKXi:
+ return FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ (FirstOpcode == AArch64::MOVZXi &&
+ SecondMI.getOperand(3).getImm() == 16) ||
+ (FirstOpcode == AArch64::MOVKXi &&
+ FirstMI->getOperand(3).getImm() == 32 &&
+ SecondMI.getOperand(3).getImm() == 48);
+ }
+
+ return false;
+}
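
An editorial sketch of the 64-bit literal-fusion rule above, under the assumption that it pairs a MOVK with whichever instruction produced the previous 16-bit chunk of the same immediate (movz / movk lsl 16, then movk lsl 32 / movk lsl 48):

enum class PrevOpc { MOVZXi, MOVKXi };

bool movkXiPairs(PrevOpc Prev, int PrevShift, int ThisShift) {
  if (Prev == PrevOpc::MOVZXi)
    return ThisShift == 16;                  // movz #lo16; movk #..., lsl 16
  return PrevShift == 32 && ThisShift == 48; // movk lsl 32; movk lsl 48
}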
+
+} // end anonymous namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h
new file mode 100644
index 0000000..32d90d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h
@@ -0,0 +1,24 @@
+//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AArch64 definition of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createAArch64MacroFusionDAGMutation());
+/// to AArch64PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation();
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index 038162c..fe4ef4b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -17,8 +17,8 @@
#define DEBUG_TYPE "aarch64-pbqp"
-#include "AArch64.h"
#include "AArch64PBQPRegAlloc.h"
+#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
index 4f656f9..b99c1d1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -1,4 +1,4 @@
-//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -15,6 +15,8 @@
namespace llvm {
+class TargetRegisterInfo;
+
/// Add the accumulator chaining constraint to a PBQP graph
class A57ChainingConstraint : public PBQPRAConstraint {
public:
@@ -33,6 +35,7 @@ private:
// Add constraints between existing chains
void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
};
-}
+
+} // end namespace llvm
#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index 8f45e6a..4e65c0a 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -12,13 +12,14 @@
// CBZW %W0, <BB#2>
// BB#2:
// %W0 = COPY %WZR
-// This pass should be run after register allocation.
+// Similarly, this pass also handles non-zero copies.
+// BB#0:
+// cmp x0, #1
+// b.eq .LBB0_1
+// .LBB0_1:
+// orr x0, xzr, #0x1
//
-// FIXME: This should be extended to handle any constant other than zero. E.g.,
-// cmp w0, #1
-// b.eq .BB1
-// BB1:
-// mov w0, #1
+// This pass should be run after register allocation.
//
// FIXME: This could also be extended to check the whole dominance subtree below
// the comparison if the compile time regression is acceptable.
@@ -26,6 +27,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
@@ -43,6 +45,7 @@ namespace {
class AArch64RedundantCopyElimination : public MachineFunctionPass {
const MachineRegisterInfo *MRI;
const TargetRegisterInfo *TRI;
+ BitVector ClobberedRegs;
public:
static char ID;
@@ -50,6 +53,16 @@ public:
initializeAArch64RedundantCopyEliminationPass(
*PassRegistry::getPassRegistry());
}
+
+ struct RegImm {
+ MCPhysReg Reg;
+ int32_t Imm;
+ RegImm(MCPhysReg Reg, int32_t Imm) : Reg(Reg), Imm(Imm) {}
+ };
+
+ Optional<RegImm> knownRegValInBlock(MachineInstr &CondBr,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &FirstUse);
bool optimizeCopy(MachineBasicBlock *MBB);
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -66,18 +79,121 @@ char AArch64RedundantCopyElimination::ID = 0;
INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
"AArch64 redundant copy elimination pass", false, false)
-static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
- unsigned Opc = MI.getOpcode();
+/// Remember what registers the specified instruction modifies.
+static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs,
+ const TargetRegisterInfo *TRI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask()) {
+ ClobberedRegs.setBitsNotInMask(MO.getRegMask());
+ continue;
+ }
+
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (!MO.isDef())
+ continue;
+
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ ClobberedRegs.set(*AI);
+ }
+}
+
+/// It's possible to determine the value of a register based on a dominating
+/// condition. To do so, this function checks to see if the basic block \p MBB
+/// is the target to which a conditional branch \p CondBr jumps and whose
+/// equality comparison is against a constant. If so, return a known physical
+/// register and constant value pair. Otherwise, return None.
+Optional<AArch64RedundantCopyElimination::RegImm>
+AArch64RedundantCopyElimination::knownRegValInBlock(
+ MachineInstr &CondBr, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &FirstUse) {
+ unsigned Opc = CondBr.getOpcode();
+
// Check if the current basic block is the target block to which the
// CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
- if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
- MBB == MI.getOperand(1).getMBB())
- return true;
- else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
- MBB != MI.getOperand(1).getMBB())
- return true;
-
- return false;
+ if (((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+ MBB == CondBr.getOperand(1).getMBB()) ||
+ ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+ MBB != CondBr.getOperand(1).getMBB())) {
+ FirstUse = CondBr;
+ return RegImm(CondBr.getOperand(0).getReg(), 0);
+ }
+
+ // Otherwise, must be a conditional branch.
+ if (Opc != AArch64::Bcc)
+ return None;
+
+ // Must be an equality check (i.e., == or !=).
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm();
+ if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+ return None;
+
+ MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB();
+ if ((CC == AArch64CC::EQ && BrTarget != MBB) ||
+ (CC == AArch64CC::NE && BrTarget == MBB))
+ return None;
+
+ // Stop if we get to the beginning of PredMBB.
+ MachineBasicBlock *PredMBB = *MBB->pred_begin();
+ assert(PredMBB == CondBr.getParent() &&
+ "Conditional branch not in predecessor block!");
+ if (CondBr == PredMBB->begin())
+ return None;
+
+ // Registers clobbered in PredMBB between the CondBr instruction and the
+ // current instruction being checked in the loop.
+ ClobberedRegs.reset();
+
+ // Find compare instruction that sets NZCV used by CondBr.
+ MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator();
+ for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) {
+
+ // Track clobbered registers.
+ trackRegDefs(PredI, ClobberedRegs, TRI);
+
+ bool IsCMN = false;
+ switch (PredI.getOpcode()) {
+ default:
+ break;
+
+ // CMN is an alias for ADDS with a dead destination register.
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ IsCMN = true;
+ LLVM_FALLTHROUGH;
+ // CMP is an alias for SUBS with a dead destination register.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri: {
+ MCPhysReg SrcReg = PredI.getOperand(1).getReg();
+
+ // Must not be a symbolic immediate.
+ if (!PredI.getOperand(2).isImm())
+ return None;
+
+ // The src register must not be modified between the cmp and conditional
+ // branch. This includes a self-clobbering compare.
+ if (ClobberedRegs[SrcReg])
+ return None;
+
+ // We've found the Cmp that sets NZCV.
+ int32_t KnownImm = PredI.getOperand(2).getImm();
+ int32_t Shift = PredI.getOperand(3).getImm();
+ KnownImm <<= Shift;
+ if (IsCMN)
+ KnownImm = -KnownImm;
+ FirstUse = PredI;
+ return RegImm(SrcReg, KnownImm);
+ }
+ }
+
+ // Bail if we see an instruction that defines NZCV that we don't handle.
+ if (PredI.definesRegister(AArch64::NZCV))
+ return None;
+ }
+ return None;
}
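
The immediate recovered above reduces to a two-line computation (editorial sketch): after "cmp reg, #imm" (SUBS) an equality branch pins reg to imm << shift in the taken block, while "cmn reg, #imm" (ADDS) pins it to the negated value.

#include <cstdint>

int32_t knownValueAtTarget(int32_t Imm, int32_t Shift, bool IsCMN) {
  int32_t Known = Imm << Shift;
  return IsCMN ? -Known : Known;
}
// e.g. "cmn w0, #4; b.eq L" implies w0 == -4 on entry to L.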
bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
@@ -85,79 +201,187 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
if (MBB->pred_size() != 1)
return false;
+ // Check if the predecessor has two successors, implying the block ends in a
+ // conditional branch.
MachineBasicBlock *PredMBB = *MBB->pred_begin();
- MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
- if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+ if (PredMBB->succ_size() != 2)
+ return false;
+
+ MachineBasicBlock::iterator CondBr = PredMBB->getLastNonDebugInstr();
+ if (CondBr == PredMBB->end())
return false;
- ++CompBr;
+ // Keep track of the earliest point in the PredMBB block where kill markers
+ // need to be removed if a COPY is removed.
+ MachineBasicBlock::iterator FirstUse;
+ // After calling knownRegValInBlock, FirstUse will either point to a CBZ/CBNZ
+ // or a compare (i.e., SUBS). In the latter case, we must take care when
+ // updating FirstUse when scanning for COPY instructions. In particular, if
+ // there's a COPY in between the compare and branch the COPY should not
+ // update FirstUse.
+ bool SeenFirstUse = false;
+ // Registers that contain a known value at the start of MBB.
+ SmallVector<RegImm, 4> KnownRegs;
+
+ MachineBasicBlock::iterator Itr = std::next(CondBr);
do {
- --CompBr;
- if (guaranteesZeroRegInBlock(*CompBr, MBB))
- break;
- } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+ --Itr;
- // We've not found a CBZ/CBNZ, time to bail out.
- if (!guaranteesZeroRegInBlock(*CompBr, MBB))
- return false;
+ Optional<RegImm> KnownRegImm = knownRegValInBlock(*Itr, MBB, FirstUse);
+ if (KnownRegImm == None)
+ continue;
- unsigned TargetReg = CompBr->getOperand(0).getReg();
- if (!TargetReg)
- return false;
- assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
- "Expect physical register");
+ KnownRegs.push_back(*KnownRegImm);
+
+ // Reset the clobber list, which is used by knownRegValInBlock.
+ ClobberedRegs.reset();
+
+ // Look backward in PredMBB for COPYs from the known reg to find other
+ // registers that are known to be a constant value.
+ for (auto PredI = Itr;; --PredI) {
+ if (FirstUse == PredI)
+ SeenFirstUse = true;
+
+ if (PredI->isCopy()) {
+ MCPhysReg CopyDstReg = PredI->getOperand(0).getReg();
+ MCPhysReg CopySrcReg = PredI->getOperand(1).getReg();
+ for (auto &KnownReg : KnownRegs) {
+ if (ClobberedRegs[KnownReg.Reg])
+ continue;
+ // If we have X = COPY Y, and Y is known to be zero, then now X is
+ // known to be zero.
+ if (CopySrcReg == KnownReg.Reg && !ClobberedRegs[CopyDstReg]) {
+ KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm));
+ if (SeenFirstUse)
+ FirstUse = PredI;
+ break;
+ }
+ // If we have X = COPY Y, and X is known to be zero, then now Y is
+ // known to be zero.
+ if (CopyDstReg == KnownReg.Reg && !ClobberedRegs[CopySrcReg]) {
+ KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm));
+ if (SeenFirstUse)
+ FirstUse = PredI;
+ break;
+ }
+ }
+ }
+
+ // Stop if we get to the beginning of PredMBB.
+ if (PredI == PredMBB->begin())
+ break;
+
+ trackRegDefs(*PredI, ClobberedRegs, TRI);
+ // Stop if all of the known-zero regs have been clobbered.
+ if (all_of(KnownRegs, [&](RegImm KnownReg) {
+ return ClobberedRegs[KnownReg.Reg];
+ }))
+ break;
+ }
+ break;
+
+ } while (Itr != PredMBB->begin() && Itr->isTerminator());
- // Remember all registers aliasing with TargetReg.
- SmallSetVector<unsigned, 8> TargetRegs;
- for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
- TargetRegs.insert(*AI);
+ // We've not found any register with a known value; time to bail out.
+ if (KnownRegs.empty())
+ return false;
bool Changed = false;
+ // UsedKnownRegs is the set of KnownRegs that have had uses added to MBB.
+ SmallSetVector<unsigned, 4> UsedKnownRegs;
MachineBasicBlock::iterator LastChange = MBB->begin();
- unsigned SmallestDef = TargetReg;
- // Remove redundant Copy instructions unless TargetReg is modified.
+ // Remove redundant Copy instructions unless KnownReg is modified.
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
MachineInstr *MI = &*I;
++I;
- if (MI->isCopy() && MI->getOperand(0).isReg() &&
- MI->getOperand(1).isReg()) {
-
- unsigned DefReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
-
- if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
- !MRI->isReserved(DefReg) &&
- (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
- DEBUG(dbgs() << "Remove redundant Copy : ");
- DEBUG((MI)->print(dbgs()));
-
- MI->eraseFromParent();
- Changed = true;
- LastChange = I;
- NumCopiesRemoved++;
- SmallestDef =
- TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
- continue;
+ bool RemovedMI = false;
+ bool IsCopy = MI->isCopy();
+ bool IsMoveImm = MI->isMoveImmediate();
+ if (IsCopy || IsMoveImm) {
+ MCPhysReg DefReg = MI->getOperand(0).getReg();
+ MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0;
+ int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0;
+ if (!MRI->isReserved(DefReg) &&
+ ((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) ||
+ IsMoveImm)) {
+ for (RegImm &KnownReg : KnownRegs) {
+ if (KnownReg.Reg != DefReg &&
+ !TRI->isSuperRegister(DefReg, KnownReg.Reg))
+ continue;
+
+          // For a copy, the known value must be zero.
+ if (IsCopy && KnownReg.Imm != 0)
+ continue;
+
+ if (IsMoveImm) {
+ // For a move immediate, the known immediate must match the source
+ // immediate.
+ if (KnownReg.Imm != SrcImm)
+ continue;
+
+ // Don't remove a move immediate that implicitly defines the upper
+ // bits when only the lower 32 bits are known.
+ MCPhysReg CmpReg = KnownReg.Reg;
+ if (any_of(MI->implicit_operands(), [CmpReg](MachineOperand &O) {
+ return !O.isDead() && O.isReg() && O.isDef() &&
+ O.getReg() != CmpReg;
+ }))
+ continue;
+ }
+
+ if (IsCopy)
+ DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
+ else
+ DEBUG(dbgs() << "Remove redundant Move : " << *MI);
+
+ MI->eraseFromParent();
+ Changed = true;
+ LastChange = I;
+ NumCopiesRemoved++;
+ UsedKnownRegs.insert(KnownReg.Reg);
+ RemovedMI = true;
+ break;
+ }
}
}
- if (MI->modifiesRegister(TargetReg, TRI))
+ // Skip to the next instruction if we removed the COPY/MovImm.
+ if (RemovedMI)
+ continue;
+
+    // Remove any regs the MI clobbers from the KnownRegs set.
+ for (unsigned RI = 0; RI < KnownRegs.size();)
+ if (MI->modifiesRegister(KnownRegs[RI].Reg, TRI)) {
+ std::swap(KnownRegs[RI], KnownRegs[KnownRegs.size() - 1]);
+ KnownRegs.pop_back();
+        // Don't increment RI since we now need to check the swapped-in
+        // KnownRegs[RI].
+ } else {
+ ++RI;
+ }
+
+ // Continue until the KnownRegs set is empty.
+ if (KnownRegs.empty())
break;
}
if (!Changed)
return false;
- // Otherwise, we have to fixup the use-def chain, starting with the
- // CBZ/CBNZ. Conservatively mark as much as we can live.
- CompBr->clearRegisterKills(SmallestDef, TRI);
+ // Add newly used regs to the block's live-in list if they aren't there
+ // already.
+ for (MCPhysReg KnownReg : UsedKnownRegs)
+ if (!MBB->isLiveIn(KnownReg))
+ MBB->addLiveIn(KnownReg);
- if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
- MBB->addLiveIn(TargetReg);
-
- // Clear any kills of TargetReg between CompBr and the last removed COPY.
+ // Clear kills in the range where changes were made. This is conservative,
+ // but should be okay since kill markers are being phased out.
+ DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
+ << "\tLastChange: " << *LastChange);
+ for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end()))
+ MMI.clearKillInfo();
for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
- MMI.clearRegisterKills(SmallestDef, TRI);
+ MMI.clearKillInfo();
return true;
}
@@ -168,6 +392,11 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
return false;
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
+
+ // Resize the clobber register bitfield tracker. We do this once per
+ // function and then clear the bitfield each time we optimize a copy.
+ ClobberedRegs.resize(TRI->getNumRegs());
+
bool Changed = false;
for (MachineBasicBlock &MBB : MF)
Changed |= optimizeCopy(&MBB);
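For illustration, a minimal hypothetical MIR-style fragment (schematic, not
taken from this patch) of what the extended pass can now remove beyond the
old CBZ/CBNZ-plus-COPY pattern:

    ; PredMBB ends in a compare and conditional branch, so on the
    ; fall-through edge %w0 is known to hold the immediate 7:
    ;   $wzr = SUBSWri %w0, 7, 0, implicit-def $nzcv
    ;   Bcc 1, %bb.2, implicit $nzcv    ; branch away if %w0 != 7
    ; bb.1:
    ;   %w0 = MOVZWi 7, 0               ; redundant move immediate: erased
    ;
    ; knownRegValInBlock recognizes the SUBS/Bcc pair, the backward scan
    ; follows COPYs to propagate the known value to other registers, and the
    ; rewritten removal loop handles move immediates as well as zero copies.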
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index b292c9c..69124db 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -1,4 +1,4 @@
-//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//===- AArch64RegisterBankInfo.cpp ----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,13 +13,24 @@
//===----------------------------------------------------------------------===//
#include "AArch64RegisterBankInfo.h"
-#include "AArch64InstrInfo.h" // For XXXRegClassID.
-#include "llvm/CodeGen/LowLevelType.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+
+#define GET_TARGET_REGBANK_IMPL
+#include "AArch64GenRegisterBank.inc"
// This file will be TableGen'ed at some point.
#include "AArch64GenRegisterBankInfo.def"
@@ -31,7 +42,7 @@ using namespace llvm;
#endif
AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
- : RegisterBankInfo(AArch64::RegBanks, AArch64::NumRegisterBanks) {
+ : AArch64GenRegisterBankInfo() {
static bool AlreadyInit = false;
// We have only one set of register banks, whatever the subtarget
// is. Therefore, the initialization of the RegBanks table should be
@@ -78,44 +89,21 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
  // Check that the TableGen'ed-like file is in sync with our expectations.
// First, the Idx.
- assert(AArch64::PartialMappingIdx::PMI_GPR32 ==
- AArch64::PartialMappingIdx::PMI_FirstGPR &&
- "GPR32 index not first in the GPR list");
- assert(AArch64::PartialMappingIdx::PMI_GPR64 ==
- AArch64::PartialMappingIdx::PMI_LastGPR &&
- "GPR64 index not last in the GPR list");
- assert(AArch64::PartialMappingIdx::PMI_FirstGPR <=
- AArch64::PartialMappingIdx::PMI_LastGPR &&
- "GPR list is backward");
- assert(AArch64::PartialMappingIdx::PMI_FPR32 ==
- AArch64::PartialMappingIdx::PMI_FirstFPR &&
- "FPR32 index not first in the FPR list");
- assert(AArch64::PartialMappingIdx::PMI_FPR512 ==
- AArch64::PartialMappingIdx::PMI_LastFPR &&
- "FPR512 index not last in the FPR list");
- assert(AArch64::PartialMappingIdx::PMI_FirstFPR <=
- AArch64::PartialMappingIdx::PMI_LastFPR &&
- "FPR list is backward");
- assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR64 &&
- AArch64::PartialMappingIdx::PMI_FPR64 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR128 &&
- AArch64::PartialMappingIdx::PMI_FPR128 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR256 &&
- AArch64::PartialMappingIdx::PMI_FPR256 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR512 &&
- "FPR indices not properly ordered");
+ assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
+ {PMI_GPR32, PMI_GPR64}) &&
+ "PartialMappingIdx's are incorrectly ordered");
+ assert(checkPartialMappingIdx(
+ PMI_FirstFPR, PMI_LastFPR,
+ {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) &&
+ "PartialMappingIdx's are incorrectly ordered");
// Now, the content.
// Check partial mapping.
#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \
do { \
- const PartialMapping &Map = \
- AArch64::PartMappings[AArch64::PartialMappingIdx::Idx - \
- AArch64::PartialMappingIdx::PMI_Min]; \
- (void)Map; \
- assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength && \
- Map.RegBank == &RB && #Idx " is incorrectly initialized"); \
- } while (0)
+ assert( \
+ checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \
+ #Idx " is incorrectly initialized"); \
+ } while (false)
CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
@@ -128,17 +116,11 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
// Check value mapping.
#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \
do { \
- unsigned PartialMapBaseIdx = \
- AArch64::PartialMappingIdx::PMI_##RBName##Size - \
- AArch64::PartialMappingIdx::PMI_Min; \
- (void)PartialMapBaseIdx; \
- const ValueMapping &Map = AArch64::getValueMapping( \
- AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset]; \
- (void)Map; \
- assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] && \
- Map.NumBreakDowns == 1 && #RBName #Size \
- " " #Offset " is incorrectly initialized"); \
- } while (0)
+ assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \
+ PartialMappingIdx::PMI_First##RBName, Size, \
+ Offset) && \
+ #RBName #Size " " #Offset " is incorrectly initialized"); \
+ } while (false)
#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
@@ -157,7 +139,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_VALUEMAP_IMPL(RBName, Size, 0); \
CHECK_VALUEMAP_IMPL(RBName, Size, 1); \
CHECK_VALUEMAP_IMPL(RBName, Size, 2); \
- } while (0)
+ } while (false)
CHECK_VALUEMAP_3OPS(GPR, 32);
CHECK_VALUEMAP_3OPS(GPR, 64);
@@ -169,24 +151,23 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \
do { \
- unsigned PartialMapDstIdx = \
- AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min; \
- unsigned PartialMapSrcIdx = \
- AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min; \
- (void) PartialMapDstIdx; \
- (void) PartialMapSrcIdx; \
- const ValueMapping *Map = AArch64::getCopyMapping( \
- AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR, \
- AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size); \
- (void) Map; \
- assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] && \
+ unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \
+ unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \
+ (void)PartialMapDstIdx; \
+ (void)PartialMapSrcIdx; \
+ const ValueMapping *Map = getCopyMapping( \
+ AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \
+ (void)Map; \
+ assert(Map[0].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \
Map[0].NumBreakDowns == 1 && #RBNameDst #Size \
" Dst is incorrectly initialized"); \
- assert(Map[1].BreakDown == &AArch64::PartMappings[PartialMapSrcIdx] && \
+ assert(Map[1].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \
Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \
" Src is incorrectly initialized"); \
\
- } while (0)
+ } while (false)
CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
@@ -279,17 +260,15 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
if (MI.getNumOperands() != 3)
break;
InstructionMappings AltMappings;
- InstructionMapping GPRMapping(
- /*ID*/ 1, /*Cost*/ 1,
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+ const InstructionMapping &GPRMapping = getInstructionMapping(
+ /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size),
/*NumOperands*/ 3);
- InstructionMapping FPRMapping(
- /*ID*/ 2, /*Cost*/ 1,
- AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+ const InstructionMapping &FPRMapping = getInstructionMapping(
+ /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size),
/*NumOperands*/ 3);
- AltMappings.emplace_back(std::move(GPRMapping));
- AltMappings.emplace_back(std::move(FPRMapping));
+ AltMappings.push_back(&GPRMapping);
+ AltMappings.push_back(&FPRMapping);
return AltMappings;
}
case TargetOpcode::G_BITCAST: {
@@ -303,29 +282,29 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
break;
InstructionMappings AltMappings;
- InstructionMapping GPRMapping(
+ const InstructionMapping &GPRMapping = getInstructionMapping(
/*ID*/ 1, /*Cost*/ 1,
- AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ true, Size),
+ getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size),
/*NumOperands*/ 2);
- InstructionMapping FPRMapping(
+ const InstructionMapping &FPRMapping = getInstructionMapping(
/*ID*/ 2, /*Cost*/ 1,
- AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size),
+ getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size),
/*NumOperands*/ 2);
- InstructionMapping GPRToFPRMapping(
+ const InstructionMapping &GPRToFPRMapping = getInstructionMapping(
/*ID*/ 3,
/*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
- AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size),
+ getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size),
/*NumOperands*/ 2);
- InstructionMapping FPRToGPRMapping(
+ const InstructionMapping &FPRToGPRMapping = getInstructionMapping(
         /*ID*/ 4,
/*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
- AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size),
+ getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size),
/*NumOperands*/ 2);
- AltMappings.emplace_back(std::move(GPRMapping));
- AltMappings.emplace_back(std::move(FPRMapping));
- AltMappings.emplace_back(std::move(GPRToFPRMapping));
- AltMappings.emplace_back(std::move(FPRToGPRMapping));
+ AltMappings.push_back(&GPRMapping);
+ AltMappings.push_back(&FPRMapping);
+ AltMappings.push_back(&GPRToFPRMapping);
+ AltMappings.push_back(&FPRToGPRMapping);
return AltMappings;
}
case TargetOpcode::G_LOAD: {
@@ -339,23 +318,21 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
break;
InstructionMappings AltMappings;
- InstructionMapping GPRMapping(
+ const InstructionMapping &GPRMapping = getInstructionMapping(
/*ID*/ 1, /*Cost*/ 1,
- getOperandsMapping(
- {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
- // Addresses are GPR 64-bit.
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+ getOperandsMapping({getValueMapping(PMI_FirstGPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, 64)}),
/*NumOperands*/ 2);
- InstructionMapping FPRMapping(
+ const InstructionMapping &FPRMapping = getInstructionMapping(
/*ID*/ 2, /*Cost*/ 1,
- getOperandsMapping(
- {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
- // Addresses are GPR 64-bit.
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+ getOperandsMapping({getValueMapping(PMI_FirstFPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, 64)}),
/*NumOperands*/ 2);
- AltMappings.emplace_back(std::move(GPRMapping));
- AltMappings.emplace_back(std::move(FPRMapping));
+ AltMappings.push_back(&GPRMapping);
+ AltMappings.push_back(&FPRMapping);
return AltMappings;
}
default:
@@ -369,13 +346,12 @@ void AArch64RegisterBankInfo::applyMappingImpl(
switch (OpdMapper.getMI().getOpcode()) {
case TargetOpcode::G_OR:
case TargetOpcode::G_BITCAST:
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
     // These IDs must match getInstrAlternativeMappings.
assert((OpdMapper.getInstrMapping().getID() >= 1 &&
OpdMapper.getInstrMapping().getID() <= 4) &&
"Don't know how to handle that ID");
return applyDefaultMapping(OpdMapper);
- }
default:
llvm_unreachable("Don't know how to handle that operation");
}
@@ -397,8 +373,9 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) {
return false;
}
-RegisterBankInfo::InstructionMapping
-AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
+const RegisterBankInfo::InstructionMapping &
+AArch64RegisterBankInfo::getSameKindOfOperandsMapping(
+ const MachineInstr &MI) const {
const unsigned Opc = MI.getOpcode();
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -411,6 +388,8 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
unsigned Size = Ty.getSizeInBits();
bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
+ PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR;
+
#ifndef NDEBUG
// Make sure all the operands are using similar size and type.
// Should probably be checked by the machine verifier.
@@ -422,23 +401,22 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
// for each types.
for (unsigned Idx = 1; Idx != NumOperands; ++Idx) {
LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
- assert(AArch64::getRegBankBaseIdxOffset(OpTy.getSizeInBits()) ==
- AArch64::getRegBankBaseIdxOffset(Size) &&
- "Operand has incompatible size");
+ assert(
+ AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(
+ RBIdx, OpTy.getSizeInBits()) ==
+ AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) &&
+ "Operand has incompatible size");
bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
(void)OpIsFPR;
assert(IsFPR == OpIsFPR && "Operand has incompatible type");
}
#endif // End NDEBUG.
- AArch64::PartialMappingIdx RBIdx =
- IsFPR ? AArch64::PMI_FirstFPR : AArch64::PMI_FirstGPR;
-
- return InstructionMapping{DefaultMappingID, 1,
- AArch64::getValueMapping(RBIdx, Size), NumOperands};
+ return getInstructionMapping(DefaultMappingID, 1,
+ getValueMapping(RBIdx, Size), NumOperands);
}
-RegisterBankInfo::InstructionMapping
+const RegisterBankInfo::InstructionMapping &
AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const unsigned Opc = MI.getOpcode();
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -447,7 +425,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Try the default logic for non-generic instructions that are either copies
// or already have some operands assigned to banks.
if (!isPreISelGenericOpcode(Opc)) {
- RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+ const RegisterBankInfo::InstructionMapping &Mapping =
+ getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
}
@@ -485,14 +464,11 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
const RegisterBank &SrcRB =
SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
- return InstructionMapping{DefaultMappingID, copyCost(DstRB, SrcRB, Size),
- AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size),
- /*NumOperands*/ 2};
+ return getInstructionMapping(
+ DefaultMappingID, copyCost(DstRB, SrcRB, Size),
+ getCopyMapping(DstRB.getID(), SrcRB.getID(), Size),
+ /*NumOperands*/ 2);
}
- case TargetOpcode::G_SEQUENCE:
- // FIXME: support this, but the generic code is really not going to do
- // anything sane.
- return InstructionMapping();
default:
break;
}
@@ -501,10 +477,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Track the size and bank of each register. We don't do partial mappings.
SmallVector<unsigned, 4> OpSize(NumOperands);
- SmallVector<AArch64::PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
auto &MO = MI.getOperand(Idx);
- if (!MO.isReg())
+ if (!MO.isReg() || !MO.getReg())
continue;
LLT Ty = MRI.getType(MO.getReg());
@@ -513,9 +489,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
// For floating-point instructions, scalars go in FPRs.
if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
- OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR;
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
else
- OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR;
+ OpRegBankIdx[Idx] = PMI_FirstGPR;
}
unsigned Cost = 1;
@@ -523,50 +499,74 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// fine-tune the computed mapping.
switch (Opc) {
case TargetOpcode::G_SITOFP:
- case TargetOpcode::G_UITOFP: {
- OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR};
+ case TargetOpcode::G_UITOFP:
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
break;
- }
case TargetOpcode::G_FPTOSI:
- case TargetOpcode::G_FPTOUI: {
- OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR};
+ case TargetOpcode::G_FPTOUI:
+ OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
break;
- }
- case TargetOpcode::G_FCMP: {
- OpRegBankIdx = {AArch64::PMI_FirstGPR,
- /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR,
- AArch64::PMI_FirstFPR};
+ case TargetOpcode::G_FCMP:
+ OpRegBankIdx = {PMI_FirstGPR,
+ /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR};
break;
- }
- case TargetOpcode::G_BITCAST: {
+ case TargetOpcode::G_BITCAST:
// This is going to be a cross register bank copy and this is expensive.
if (OpRegBankIdx[0] != OpRegBankIdx[1])
- Cost =
- copyCost(*AArch64::PartMappings[OpRegBankIdx[0]].RegBank,
- *AArch64::PartMappings[OpRegBankIdx[1]].RegBank, OpSize[0]);
+ Cost = copyCost(
+ *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank,
+ *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank,
+ OpSize[0]);
break;
- }
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
// Loading in vector unit is slightly more expensive.
// This is actually only true for the LD1R and co instructions,
// but anyway for the fast mode this number does not matter and
// for the greedy mode the cost of the cross bank copy will
// offset this number.
// FIXME: Should be derived from the scheduling model.
- if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR)
+ if (OpRegBankIdx[0] != PMI_FirstGPR)
Cost = 2;
- }
+ else
+ // Check if that load feeds fp instructions.
+ // In that case, we want the default mapping to be on FPR
+      // instead of blindly mapping every scalar to GPR.
+ for (const MachineInstr &UseMI :
+ MRI.use_instructions(MI.getOperand(0).getReg()))
+ // If we have at least one direct use in a FP instruction,
+ // assume this was a floating point load in the IR.
+ // If it was not, we would have had a bitcast before
+ // reaching that instruction.
+ if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
+ break;
+ case TargetOpcode::G_STORE:
+ // Check if that store is fed by fp instructions.
+ if (OpRegBankIdx[0] == PMI_FirstGPR) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (!VReg)
+ break;
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode()))
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
}
// Finally construct the computed mapping.
- RegisterBankInfo::InstructionMapping Mapping =
- InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands};
SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
- for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
- if (MI.getOperand(Idx).isReg())
- OpdsMapping[Idx] =
- AArch64::getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ if (MI.getOperand(Idx).isReg() && MI.getOperand(Idx).getReg()) {
+ auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+ if (!Mapping->isValid())
+ return getInvalidInstructionMapping();
+
+ OpdsMapping[Idx] = Mapping;
+ }
+ }
- Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
- return Mapping;
+ return getInstructionMapping(DefaultMappingID, Cost,
+ getOperandsMapping(OpdsMapping), NumOperands);
}
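The G_LOAD case above is the most substantial piece of the fine-tuning
switch; a minimal hypothetical generic-MIR example (virtual register names
invented for illustration) of the pattern it targets:

    ; %1:_(s32) = G_LOAD %0(p0)    ; scalar load; the coarse guess is GPR
    ; %2:_(s32) = G_FADD %1, %1    ; direct floating-point use of the load
    ;
    ; Because the loaded value has at least one direct FP use,
    ; getInstrMapping now assigns the load result to the FPR bank up front,
    ; avoiding a GPR-to-FPR cross-bank copy later. G_STORE gets the mirror
    ; treatment by inspecting the instruction that defines the stored value.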
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
index f763235..6d74a47 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -16,25 +16,78 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "AArch64GenRegisterBank.inc"
+
namespace llvm {
class TargetRegisterInfo;
-namespace AArch64 {
-enum {
- GPRRegBankID = 0, /// General Purpose Registers: W, X.
- FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q.
- CCRRegBankID = 2, /// Conditional register: NZCV.
- NumRegisterBanks
-};
+class AArch64GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+
+ enum PartialMappingIdx {
+ PMI_None = -1,
+ PMI_FPR32 = 1,
+ PMI_FPR64,
+ PMI_FPR128,
+ PMI_FPR256,
+ PMI_FPR512,
+ PMI_GPR32,
+ PMI_GPR64,
+ PMI_FirstGPR = PMI_GPR32,
+ PMI_LastGPR = PMI_GPR64,
+ PMI_FirstFPR = PMI_FPR32,
+ PMI_LastFPR = PMI_FPR512,
+ PMI_Min = PMI_FirstFPR,
+ };
+
+ static RegisterBankInfo::PartialMapping PartMappings[];
+ static RegisterBankInfo::ValueMapping ValMappings[];
+ static PartialMappingIdx BankIDToCopyMapIdx[];
+
+ enum ValueMappingIdx {
+ InvalidIdx = 0,
+ First3OpsIdx = 1,
+ Last3OpsIdx = 19,
+ DistanceBetweenRegBanks = 3,
+ FirstCrossRegCpyIdx = 22,
+ LastCrossRegCpyIdx = 34,
+ DistanceBetweenCrossRegCpy = 2
+ };
+
+ static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx,
+ unsigned ValLength, const RegisterBank &RB);
+ static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank,
+ unsigned Size, unsigned Offset);
+ static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias,
+ PartialMappingIdx LastAlias,
+ ArrayRef<PartialMappingIdx> Order);
-extern RegisterBank GPRRegBank;
-extern RegisterBank FPRRegBank;
-extern RegisterBank CCRRegBank;
-} // End AArch64 namespace.
+ static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size);
+
+ /// Get the pointer to the ValueMapping representing the RegisterBank
+ /// at \p RBIdx with a size of \p Size.
+ ///
+ /// The returned mapping works for instructions with the same kind of
+ /// operands for up to 3 operands.
+ ///
+  /// \pre \p RBIdx != PartialMappingIdx::PMI_None
+ static const RegisterBankInfo::ValueMapping *
+ getValueMapping(PartialMappingIdx RBIdx, unsigned Size);
+
+ /// Get the pointer to the ValueMapping of the operands of a copy
+ /// instruction from the \p SrcBankID register bank to the \p DstBankID
+ /// register bank with a size of \p Size.
+ static const RegisterBankInfo::ValueMapping *
+ getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size);
+
+#define GET_TARGET_REGBANK_CLASS
+#include "AArch64GenRegisterBank.inc"
+};
/// This class provides the information for the target register banks.
-class AArch64RegisterBankInfo final : public RegisterBankInfo {
+class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
@@ -45,8 +98,8 @@ class AArch64RegisterBankInfo final : public RegisterBankInfo {
///
/// \return An InstructionMappings with a statically allocated
/// OperandsMapping.
- static InstructionMapping
- getSameKindOfOperandsMapping(const MachineInstr &MI);
+ const InstructionMapping &
+ getSameKindOfOperandsMapping(const MachineInstr &MI) const;
public:
AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
@@ -60,7 +113,8 @@ public:
InstructionMappings
getInstrAlternativeMappings(const MachineInstr &MI) const override;
- InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
};
} // End llvm namespace.
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
new file mode 100644
index 0000000..c2b6c0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
@@ -0,0 +1,20 @@
+//=- AArch64RegisterBanks.td - Describe the AArch64 Banks ----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the register banks available on AArch64.
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: W, X.
+def GPRRegBank : RegisterBank<"GPR", [GPR64all]>;
+
+/// Floating Point/Vector Registers: B, H, S, D, Q.
+def FPRRegBank : RegisterBank<"FPR", [QQQQ]>;
+
+/// Conditional register: NZCV.
+def CCRRegBank : RegisterBank<"CCR", [CCR]>;
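These three defs replace the hand-written bank enum and the extern
RegisterBank declarations removed from AArch64RegisterBankInfo.h; TableGen
now emits both the IDs and the bank objects into AArch64GenRegisterBank.inc.
A minimal sketch of the consuming side, using only names that appear in this
patch:

    // Both identifiers are provided by the TableGen'ed
    // AArch64GenRegisterBank.inc rather than written by hand.
    const RegisterBank &RB = AArch64::GPRRegBank; // the bank object
    unsigned ID = AArch64::GPRRegBankID;          // its numeric ID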
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 98fad71..9f7dcb3 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -74,7 +74,7 @@ const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
if (CC == CallingConv::GHC)
- // This is academic becase all GHC calls are (supposed to be) tail calls
+ // This is academic because all GHC calls are (supposed to be) tail calls
return CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_RegMask;
@@ -94,7 +94,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
return CSR_AArch64_TLS_Darwin_RegMask;
- assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS");
+ assert(TT.isOSBinFormatELF() && "Invalid target");
return CSR_AArch64_TLS_ELF_RegMask;
}
@@ -118,25 +118,17 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
- markSuperRegs(Reserved, AArch64::SP);
- markSuperRegs(Reserved, AArch64::XZR);
markSuperRegs(Reserved, AArch64::WSP);
markSuperRegs(Reserved, AArch64::WZR);
- if (TFI->hasFP(MF) || TT.isOSDarwin()) {
- markSuperRegs(Reserved, AArch64::FP);
+ if (TFI->hasFP(MF) || TT.isOSDarwin())
markSuperRegs(Reserved, AArch64::W29);
- }
- if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
- markSuperRegs(Reserved, AArch64::X18); // Platform register
- markSuperRegs(Reserved, AArch64::W18);
- }
+ if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
+ markSuperRegs(Reserved, AArch64::W18); // Platform register
- if (hasBasePointer(MF)) {
- markSuperRegs(Reserved, AArch64::X19);
+ if (hasBasePointer(MF))
markSuperRegs(Reserved, AArch64::W19);
- }
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
@@ -175,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
const TargetRegisterClass *
AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const {
- return &AArch64::GPR64RegClass;
+ return &AArch64::GPR64spRegClass;
}
const TargetRegisterClass *
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
index 93ca079..18d000a 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -13,7 +13,7 @@
// ===---------------------------------------------------------------------===//
// The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedModel.h for details.
+// This works with MachineScheduler. See MCSchedule.h for details.
// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
def CortexA53Model : SchedMachineModel {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
index 99c48d0..5d1608e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -13,7 +13,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// The Cortex-A57 is a traditional superscaler microprocessor with a
+// The Cortex-A57 is a traditional superscalar microprocessor with a
// conservative 3-wide in-order stage for decode and dispatch. Combined with the
// much wider out-of-order issue stage, this produced a need to carefully
// schedule micro-ops so that all three decoded each cycle are successfully
@@ -162,7 +162,9 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>;
// Cryptography Extensions
// -----------------------------------------------------------------------------
-def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def A57ReadAES : SchedReadAdvance<3, [A57Write_3cyc_1W]>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES[DE]")>;
+def : InstRW<[A57Write_3cyc_1W, A57ReadAES], (instregex "^AESI?MC")>;
def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
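The new A57ReadAES entry models the AES forwarding path: an AESMC/AESIMC
whose input is produced by an AESE/AESD sees that operand three cycles
early, cancelling the producer's three-cycle latency. A worked reading of
the defs above for a dependent pair:

    AESE  V0, V1   ; result available at cycle 3 (A57Write_3cyc_1W)
    AESMC V0, V0   ; A57ReadAES advances the read by 3 cycles, so the
                   ; effective dependent latency is 0 and the fused pair
                   ; can issue back to back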
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index 19a6d6f..44fd94f 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -17,10 +17,98 @@
// instruction cost model.
def FalkorModel : SchedMachineModel {
- let IssueWidth = 4; // 4-wide issue for expanded uops.
+ let IssueWidth = 8; // 8 uops are dispatched per cycle.
let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer.
let LoopMicroOpBufferSize = 16;
let LoadLatency = 3; // Optimistic load latency.
let MispredictPenalty = 11; // Minimum branch misprediction penalty.
- let CompleteModel = 0;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Falkor.
+
+let SchedModel = FalkorModel in {
+
+ def FalkorUnitB : ProcResource<1>; // Branch
+ def FalkorUnitLD : ProcResource<1>; // Load pipe
+ def FalkorUnitSD : ProcResource<1>; // Store data
+ def FalkorUnitST : ProcResource<1>; // Store pipe
+ def FalkorUnitX : ProcResource<1>; // Complex arithmetic
+ def FalkorUnitY : ProcResource<1>; // Simple arithmetic
+ def FalkorUnitZ : ProcResource<1>; // Simple arithmetic
+
+ def FalkorUnitVSD : ProcResource<1>; // Vector store data
+ def FalkorUnitVX : ProcResource<1>; // Vector X-pipe
+ def FalkorUnitVY : ProcResource<1>; // Vector Y-pipe
+
+ def FalkorUnitGTOV : ProcResource<1>; // Scalar to Vector
+ def FalkorUnitVTOG : ProcResource<1>; // Vector to Scalar
+
+ // Define the resource groups.
+ def FalkorUnitXY : ProcResGroup<[FalkorUnitX, FalkorUnitY]>;
+ def FalkorUnitXYZ : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ]>;
+ def FalkorUnitXYZB : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ,
+ FalkorUnitB]>;
+ def FalkorUnitZB : ProcResGroup<[FalkorUnitZ, FalkorUnitB]>;
+ def FalkorUnitVXVY : ProcResGroup<[FalkorUnitVX, FalkorUnitVY]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Falkor.
+
+let SchedModel = FalkorModel in {
+
+// These WriteRes entries are not used in the Falkor sched model.
+def : WriteRes<WriteImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteI, []> { let Unsupported = 1; }
+def : WriteRes<WriteISReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteIEReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteExtr, []> { let Unsupported = 1; }
+def : WriteRes<WriteIS, []> { let Unsupported = 1; }
+def : WriteRes<WriteID32, []> { let Unsupported = 1; }
+def : WriteRes<WriteID64, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM32, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM64, []> { let Unsupported = 1; }
+def : WriteRes<WriteBr, []> { let Unsupported = 1; }
+def : WriteRes<WriteBrReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTP, []> { let Unsupported = 1; }
+def : WriteRes<WriteAdr, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteF, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCmp, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCvt, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCopy, []> { let Unsupported = 1; }
+def : WriteRes<WriteFImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteFMul, []> { let Unsupported = 1; }
+def : WriteRes<WriteFDiv, []> { let Unsupported = 1; }
+def : WriteRes<WriteV, []> { let Unsupported = 1; }
+def : WriteRes<WriteVLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteVST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSys, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Unsupported = 1; }
+def : WriteRes<WriteHint, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDHi, []> { let Unsupported = 1; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// These ReadAdvance entries are not used in the Falkor sched model.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedFalkorDetails.td"
+
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
new file mode 100644
index 0000000..0aeb1f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -0,0 +1,1288 @@
+//==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Falkor subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+// Contains all of the Falkor specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: FalkorWr
+// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD)
+// Latency: #cyc
+//
+// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued
+// down one Z pipe, six SD pipes, four VX pipes and the total latency is
+// six cycles.
+//
+// Contains all of the Falkor specific ReadAdvance types for forwarding logic.
+//
+// Contains all of the Falkor specific WriteVariant types for immediate zero
+// and LSLFast.
+//===----------------------------------------------------------------------===//
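+//
+// Another worked decoding, using a def that appears later in this file:
+// FalkorWr_2VX_2VY_14cyc is four micro-ops, two issued down the VX pipe and
+// two down the VY pipe, with an overall latency of 14 cycles.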
+
+//===----------------------------------------------------------------------===//
+// Define 0 micro-op types
+def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+def FalkorWr_none_3cyc : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+def FalkorWr_none_4cyc : SchedWriteRes<[]> {
+ let Latency = 4;
+ let NumMicroOps = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 1 micro-op types
+
+def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
+def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
+def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
+def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
+def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
+def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; }
+def FalkorWr_1XYZ_0cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 0; }
+def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }
+def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; }
+def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; }
+def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; }
+def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; }
+
+def FalkorWr_1VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 0; }
+def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
+def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
+def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
+def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+
+def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
+def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
+def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; }
+
+def FalkorWr_1GTOV_0cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 0; }
+def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; }
+def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; }
+def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Define 2 micro-op types
+
+def FalkorWr_2VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_12cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 14;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 8];
+}
+
+def FalkorWr_1X_1Z_11cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 11];
+}
+
+def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitVSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 3 micro-op types
+
+def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
+ FalkorUnitLD]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
+ FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1XYZ_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+//===----------------------------------------------------------------------===//
+// Define 4 micro-op types
+
+def FalkorWr_2VX_2VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_20cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 20;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 21;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_24cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 24;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST,
+ FalkorUnitSD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VSD_2ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 5 micro-op types
+
+def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+}
+def FalkorWr_1XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitST,
+ FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 5;
+}
+def FalkorWr_1VXVY_2ST_2VSD_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitST,
+ FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 5;
+}
+//===----------------------------------------------------------------------===//
+// Define 6 micro-op types
+
+def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitXYZ,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_2VXVY_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_3VSD_3ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 8 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 8;
+}
+
+def FalkorWr_4VSD_4ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 9 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitXYZ,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitXYZ,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 10 micro-op types
+
+def FalkorWr_2VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 12 micro-op types
+
+def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 12;
+}
+
+// Forwarding logic is modeled for multiply add/accumulate and
+// load/store base register increment.
+// -----------------------------------------------------------------------------
+def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
+def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
+def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>;
+def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
+def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
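+//
+// Worked example of the forwarding above: a 32-bit MUL scheduled as
+// FalkorWr_IMUL32_1X_2cyc has a latency of 4 cycles, but a dependent MADD
+// reading its accumulator operand through FalkorReadIMA32 sees the value
+// 3 cycles early, so a chain of 32-bit multiply-accumulates runs with an
+// effective recurrence latency of 1 cycle.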
+
+def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>;
+def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>;
+
+// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast
+// -----------------------------------------------------------------------------
+def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).isImm() &&
+ MI->getOperand(1).getImm() == 0}]>;
+def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR ||
+                                      MI->getOperand(1).getReg() == AArch64::XZR}]>;
+def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>;
+
+def FalkorWr_FMOV : SchedWriteVariant<[
+ SchedVar<FalkorOp1ZrReg, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>;
+
+def FalkorWr_MOVZ : SchedWriteVariant<[
+ SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZB_0cyc]>]>; // imm fwd
+
+def FalkorWr_ADDSUBsx : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
+
+def FalkorWr_LDRro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>;
+
+def FalkorWr_LDRSro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>;
+
+def FalkorWr_ORRi : SchedWriteVariant<[
+ SchedVar<FalkorOp1ZrReg, [FalkorWr_1XYZ_0cyc]>, // imm fwd
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1cyc]>]>;
+
+def FalkorWr_PRFMro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>;
+
+def FalkorWr_STRVro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1VSD_1ST_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1VSD_1ST_0cyc]>]>;
+
+def FalkorWr_STRQro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_2ST_2VSD_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_2XYZ_2ST_2VSD_0cyc]>]>;
+
+def FalkorWr_STRro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1SD_1ST_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1SD_1ST_0cyc]>]>;
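+
+// Worked example of the variants above: "MOVZ w0, #0" satisfies
+// FalkorImmZPred and is modeled as FalkorWr_1none_0cyc (zero micro-ops,
+// handled by immediate forwarding), while any other MOVZ costs a single
+// XYZB-pipe micro-op (FalkorWr_1XYZB_0cyc).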
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the earlier mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+// FIXME: This could be better modeled by looking at the regclasses of the operands.
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs COPY)>;
+
+// SIMD Floating-point Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)v2f32$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v2i32p)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(32|64|v2f32|v2i32)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64|v2i32)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)v2f32$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?Vv4i32v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)v2f32$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i32p|v2i64p|v2f32)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v2f32)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)v2i32(_shift)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instregex "^(FMUL|FMULX)(v2f32|(v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instrs FMULX32)>;
+
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instrs FMULX64)>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v2i64p)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVv2f32)>;
+def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTv2f32)>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs FCVTLv8i16, FCVTLv4i32)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc],
+ (instregex "^(FMUL|FMULX)(v2f64|v4f32|v4i32_indexed)$")>;
+
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc],
+ (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+
+def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>;
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>;
+
+def : InstRW<[FalkorWr_2VX_2VY_14cyc],(instrs FDIVv2f64)>;
+def : InstRW<[FalkorWr_2VX_2VY_20cyc],(instrs FDIVv4f32)>;
+def : InstRW<[FalkorWr_2VX_2VY_21cyc],(instrs FSQRTv2f64)>;
+def : InstRW<[FalkorWr_2VX_2VY_24cyc],(instrs FSQRTv4f32)>;
+
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32],
+ (instregex "^FML(A|S)(v2f32|(v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64],
+ (instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32],
+ (instregex "^FML(A|S)(v4f32|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64],
+ (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>;
+
+// SIMD Integer Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs ADDPv2i64p)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIC|ORR)(v2i32|v4i16)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^NEG(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHRd$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHLd$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(ABD|ADALP)(v8i8|v4i16|v2i32)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)ADDLVv4i16v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(s|h|b)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHRd$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^R?SHRN(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQABS(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs ADDVv4i32v)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs ADDVv8i16v)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(ADD|SUB)HNv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)ABA(v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs ADDVv16i8v)>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32)_shift?$")>;
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^R(ADD|SUB)HNv.*$")>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^ADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs ADDPv2i64)>; // sz==11
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIC|ORR)(v8i16|v4i32)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(NEG|SUB)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)ADDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v16i8|v2i64|v4i32|v8i16)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHR(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SUBLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; // sz!=11
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL(v8i8|v16i8)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16|v4i32|v2i64)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^R?SHRN(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL(v1i64|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(S|U)ADDLVv8i16v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_6cyc], (instregex "^(S|U)ADDLVv16i8v$")>;
+
+def : InstRW<[FalkorWr_4VXVY_2cyc], (instregex "^(S|U)(ADD|SUB)Wv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
+
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQD(MLAL|MLSL)v[248].*$")>;
+
+// SIMD Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instrs LD2i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD1i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instrs LD3i64_POST)>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instrs LD4i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD2i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instrs LD3Threev2d_POST)>;
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+ (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instrs LD4Fourv2d_POST)>;
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(8b|4h|2s)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(16b|8h|4s)_POST$")>;
+
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+
+// Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADC(S)?(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADD(S)?(W|X)r(r|i)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_ORRi], (instregex "^ORR(W|X)ri$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>;
+def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>;
+def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>;
+
+// SIMD Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instrs FRECPS64, FRSQRTS64)>;
+
+def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],
+ (instregex "^INSv(i32|i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>;
+
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc],
+ (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc],
+ (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>;
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>;
+
+def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBL(v8i8Four|v16i8Three)$")>;
+def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBX(v8i8Three|v16i8Three)$")>;
+
+def : InstRW<[FalkorWr_5VXVY_7cyc], (instrs TBLv16i8Four)>;
+def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>;
+
+// SIMD Store Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(Q|D|S|H|B)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(D|S|H|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STPQi$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STPQ(post|pre)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(D|S)(i)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(D|S)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STRQro(W|X)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STUR(Q|D|S|B|H)i$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPDi, STNPSi)>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPQi)>;
+
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))_POST$")>;
+
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4(i8|i16|i32|i64)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3(i8|i16|i32|i64)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4(i8|i16|i32|i64)_POST$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v8b|v4h|v2s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v8b|v4h|v2s)_POST$")>;
+
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST3Threev2d)>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST3Threev2d_POST)>;
+
+def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v8b|v4h|v2s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v8b|v4h|v2s)_POST$")>;
+
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST4Fourv2d)>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST4Fourv2d_POST)>;
+
+def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v16b|v8h|v4s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>;
+
+def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v16b|v8h|v4s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>;
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1none_0cyc], (instrs B, TCRETURNdi)>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instrs RET_ReallyLR, TCRETURNri)>;
+def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>;
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>;
+def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>;
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs SHA1Hrr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs AESIMCrr, AESMCrr)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs AESDrr, AESErr)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+def : InstRW<[FalkorWr_1VX_1VY_4cyc], (instregex "^SHA1(C|M|P)rrr$")>;
+def : InstRW<[FalkorWr_1VX_1VY_5cyc], (instrs SHA256H2rrr, SHA256Hrrr)>;
+def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>;
+
+// FP Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDUR(Q|D|S|H|B)i$")>;
+def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
+ (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instrs LDNPQi)>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instrs LDPQi)>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "LDNP(D|S)i$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "LDP(D|S)i$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "LDP(D|S)(pre|post)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDPQ(pre|post)$")>;
+
+// FP Data Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(S|D)r(r|i)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(S|D)rrr$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(32|64)p$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTSHr, FCVTDHr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(S|D)r$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTHSr, FCVTHDr)>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instregex "^F(N)?MULSrr$")>;
+
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instregex "^F(N)?MULDrr$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVSrr)>;
+def : InstRW<[FalkorWr_1VX_1VY_14cyc],(instrs FDIVDrr)>;
+def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTSr)>;
+def : InstRW<[FalkorWr_1VX_1VY_21cyc],(instrs FSQRTDr)>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32],
+ (instregex "^F(N)?M(ADD|SUB)Srrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64],
+ (instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+
+// FP Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>;
+def : InstRW<[FalkorWr_1GTOV_0cyc], (instregex "^FMOV(S|D)i$")>; // imm fwd
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; // imm fwd
+// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr
+def : InstRW<[FalkorWr_2VXVY_0cyc], (instrs FMOVD0, FMOVS0)>; // imm fwd
+
+def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>;
+
+// Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>;
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDNP(W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDP(W|X)i$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(W|X)l$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDUR(BB|HH|W|X)i$")>;
+def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
+ (instrs LDPSWi)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
+ (instregex "^LDPSW(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
+def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instrs LDRSWl)>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
+
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>;
+def : InstRW<[FalkorWr_1X_2cyc], (instregex "^CRC32.*$")>;
+def : InstRW<[FalkorWr_1XYZ_2cyc], (instregex "^(CLS|CLZ|RBIT|REV|REV16|REV32)(W|X)r$")>;
+def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>;
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64],
+ (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32],
+ (instregex "^M(ADD|SUB)Wrrr$")>;
+
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64],
+ (instregex "^M(ADD|SUB)Xrrr$")>;
+
+def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
+def : InstRW<[FalkorWr_1X_1Z_11cyc], (instregex "^(S|U)DIVXr$")>;
+
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^(S|U)MULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^(S|U)(MLAL|MLSL)v.*$")>;
+
+// Move and Shift Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV)(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_0cyc], (instregex "^MOVK(W|X)i$")>; // imm fwd
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^ADRP?$")>; // imm fwd
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^MOVN(W|X)i$")>; // imm fwd
+def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>;
+def : InstRW<[FalkorWr_1XYZ_0cyc], (instrs MOVi32imm, MOVi64imm)>; // imm fwd (approximation)
+def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>],
+ (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>;
+def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>],
+ (instrs LOADgot)>;
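// Illustrative sketch (not from this commit): WriteSequence chains its element
// writes into one write whose latency is the sum of the parts, which is why the
// MOVaddr pseudos (ADRP+ADD pairs) above are costed as two dependent 1-cycle
// XYZ ops and LOADgot as a load feeding an ALU op. A hypothetical equivalent:
def ExampleWr_AddrPair : WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>;
// 2 micro-ops, total latency 2, both on the XYZ pipes.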
+
+// Other Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_0cyc], (instrs CLREX, DMB, DSB)>;
+def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HLT, HVC, ISB, SMC, SVC)>;
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>;
+
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^(LDAR(B|H|W|X)|LDAXR(B|H|W|X)|LDXR(B|H|W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^(LDAXP(W|X)|LDXP(W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>;
+
+def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>;
+
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPWi, STNPXi)>;
+def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>;
+
+def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STXP(W|X)$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STXR(B|H|W|X)$")>;
+
+def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLXP(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLXR(B|H|W|X)$")>;
+
+// Store Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(W|X)i$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STUR(BB|HH|W|X)i$")>;
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
index 426ae61..cf4cdab 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -776,23 +776,29 @@ def KryoWrite_4cyc_X_X_115ln :
}
def : InstRW<[KryoWrite_4cyc_X_X_115ln],
(instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>;
-def KryoWrite_1cyc_XA_Y_noRSV_43ln :
+def KryoWrite_10cyc_XA_Y_noRSV_43ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 10; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln],
- (instrs FDIVDrr, FDIVSrr)>;
-def KryoWrite_1cyc_XA_Y_noRSV_121ln :
+def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVSrr)>;
+def KryoWrite_14cyc_XA_Y_noRSV_43ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 14; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln],
+def : InstRW<[KryoWrite_14cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVDrr)>;
+def KryoWrite_10cyc_XA_Y_noRSV_121ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 10; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_121ln],
(instrs FDIVv2f32)>;
-def KryoWrite_1cyc_XA_Y_XA_Y_123ln :
+def KryoWrite_14cyc_XA_Y_XA_Y_123ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 4;
+ let Latency = 14; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln],
+def : InstRW<[KryoWrite_14cyc_XA_Y_XA_Y_123ln],
(instrs FDIVv2f64, FDIVv4f32)>;
def KryoWrite_5cyc_X_noRSV_55ln :
SchedWriteRes<[KryoUnitX]> {
@@ -968,24 +974,36 @@ def KryoWrite_2cyc_XY_XY_109ln :
}
def : InstRW<[KryoWrite_2cyc_XY_XY_109ln],
(instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>;
-def KryoWrite_1cyc_XA_Y_noRSV_42ln :
+def KryoWrite_12cyc_XA_Y_noRSV_42ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 12; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln],
- (instregex "FSQRT(S|D)r")>;
-def KryoWrite_1cyc_XA_Y_noRSV_120ln :
+def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_42ln],
+ (instrs FSQRTSr)>;
+def KryoWrite_21cyc_XA_Y_noRSV_42ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 21; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_21cyc_XA_Y_noRSV_42ln],
+ (instrs FSQRTDr)>;
+def KryoWrite_12cyc_XA_Y_noRSV_120ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 12; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_120ln],
+ (instrs FSQRTv2f32)>;
+def KryoWrite_21cyc_XA_Y_XA_Y_122ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 21; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln],
- (instregex "FSQRTv2f32")>;
-def KryoWrite_1cyc_XA_Y_XA_Y_122ln :
+def : InstRW<[KryoWrite_21cyc_XA_Y_XA_Y_122ln],
+ (instrs FSQRTv4f32)>;
+def KryoWrite_36cyc_XA_Y_XA_Y_122ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 4;
+ let Latency = 36; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
- (instregex "FSQRT(v2f64|v4f32)")>;
+def : InstRW<[KryoWrite_36cyc_XA_Y_XA_Y_122ln],
+ (instrs FSQRTv2f64)>;
def KryoWrite_1cyc_X_201ln :
SchedWriteRes<[KryoUnitX]> {
let Latency = 1; let NumMicroOps = 1;
@@ -1356,7 +1374,9 @@ def KryoWrite_3cyc_LS_LS_400ln :
let Latency = 3; let NumMicroOps = 2;
}
def : InstRW<[KryoWrite_3cyc_LS_LS_400ln],
- (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>;
+ (instregex "LDAX?R(B|H|W|X)")>;
+def : InstRW<[KryoWrite_3cyc_LS_LS_400ln, WriteLDHi],
+ (instregex "LDAXP(W|X)")>;
def KryoWrite_3cyc_LS_LS_401ln :
SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
let Latency = 3; let NumMicroOps = 2;
@@ -1547,7 +1567,7 @@ def KryoWrite_3cyc_LS_258ln :
SchedWriteRes<[KryoUnitLS]> {
let Latency = 3; let NumMicroOps = 1;
}
-def : InstRW<[KryoWrite_3cyc_LS_258ln],
+def : InstRW<[KryoWrite_3cyc_LS_258ln, WriteLDHi],
(instregex "LDXP(W|X)")>;
def KryoWrite_3cyc_LS_258_1ln :
SchedWriteRes<[KryoUnitLS]> {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
index 14d6891..3b71cf8 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
@@ -23,7 +23,7 @@ def ExynosM1Model : SchedMachineModel {
let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
let LoadLatency = 4; // Optimistic load cases.
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
- let CompleteModel = 0; // Use the default model otherwise.
+ let CompleteModel = 1; // Use the default model otherwise.
}
//===----------------------------------------------------------------------===//
@@ -72,14 +72,14 @@ def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; }
def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
-def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
+def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
M1WriteA1]>,
SchedVar<NoSchedPred, [M1WriteL5]>]>;
def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; }
def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; }
def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
-def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2,
+def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2,
M1WriteA1]>,
SchedVar<NoSchedPred, [M1WriteS1]>]>;
@@ -125,13 +125,13 @@ def : WriteRes<WriteAdr, []> { let Latency = 0; }
// Load instructions.
def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; }
def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; }
-def : SchedAlias<WriteLDIdx, M1WriteLA>;
+def : SchedAlias<WriteLDIdx, M1WriteLX>;
// Store instructions.
def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; }
def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; }
def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; }
-def : SchedAlias<WriteSTIdx, M1WriteSA>;
+def : SchedAlias<WriteSTIdx, M1WriteSX>;
// FP data instructions.
def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; }
@@ -231,6 +231,111 @@ def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; }
def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; }
def M1WriteTB : SchedWriteRes<[M1UnitC,
M1UnitALU]> { let Latency = 2; }
+def M1WriteVLDA : SchedWriteRes<[M1UnitL,
+ M1UnitL]> { let Latency = 6; }
+def M1WriteVLDB : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 7; }
+def M1WriteVLDC : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 8; }
+def M1WriteVLDD : SchedWriteRes<[M1UnitL,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDE : SchedWriteRes<[M1UnitL,
+ M1UnitNALU]> { let Latency = 6; }
+def M1WriteVLDF : SchedWriteRes<[M1UnitL,
+ M1UnitL]> { let Latency = 10;
+ let ResourceCycles = [5]; }
+def M1WriteVLDG : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDH : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 6; }
+def M1WriteVLDI : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 12;
+ let ResourceCycles = [6]; }
+def M1WriteVLDJ : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 9;
+ let ResourceCycles = [4]; }
+def M1WriteVLDK : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 9;
+ let ResourceCycles = [4]; }
+def M1WriteVLDL : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDM : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDN : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 14;
+ let ResourceCycles = [7]; }
+
+def M1WriteVSTA : WriteSequence<[WriteVST], 2>;
+def M1WriteVSTB : WriteSequence<[WriteVST], 3>;
+def M1WriteVSTC : WriteSequence<[WriteVST], 4>;
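// Illustrative sketch (not from this commit): the second WriteSequence operand
// is a repeat count, so M1WriteVSTA above issues the generic WriteVST twice
// back to back; spelled out, it is equivalent to:
def ExampleVSTTwice : WriteSequence<[WriteVST, WriteVST]>;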
+def M1WriteVSTD : SchedWriteRes<[M1UnitS,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 7;
+ let ResourceCycles = [7]; }
+def M1WriteVSTE : SchedWriteRes<[M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 8;
+ let ResourceCycles = [8]; }
+def M1WriteVSTF : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 15;
+ let ResourceCycles = [15]; }
+def M1WriteVSTG : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 16;
+ let ResourceCycles = [16]; }
+def M1WriteVSTH : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 14;
+ let ResourceCycles = [14]; }
+def M1WriteVSTI : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 17;
+ let ResourceCycles = [17]; }
// Branch instructions
def : InstRW<[M1WriteB1], (instrs Bcc)>;
@@ -360,13 +465,239 @@ def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64
def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
// ASIMD load instructions.
+def : InstRW<[M1WriteVLDD], (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[M1WriteVLDD,
+ WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>;
+def : InstRW<[M1WriteVLDE,
+ WriteAdr], (instregex "LD1i(64)_POST$")>;
+
+def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDG], (instregex "LD2i(8|16)$")>;
+def : InstRW<[M1WriteVLDG,
+ WriteAdr], (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDG], (instregex "LD2i(32)$")>;
+def : InstRW<[M1WriteVLDG,
+ WriteAdr], (instregex "LD2i(32)_POST$")>;
+def : InstRW<[M1WriteVLDH], (instregex "LD2i(64)$")>;
+def : InstRW<[M1WriteVLDH,
+ WriteAdr], (instregex "LD2i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(1d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(2d)$")>;
+def : InstRW<[M1WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDJ], (instregex "LD3i(8|16)$")>;
+def : InstRW<[M1WriteVLDJ,
+ WriteAdr], (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDJ], (instregex "LD3i(32)$")>;
+def : InstRW<[M1WriteVLDJ,
+ WriteAdr], (instregex "LD3i(32)_POST$")>;
+def : InstRW<[M1WriteVLDL], (instregex "LD3i(64)$")>;
+def : InstRW<[M1WriteVLDL,
+ WriteAdr], (instregex "LD3i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(1d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(2d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[M1WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDK], (instregex "LD4i(8|16)$")>;
+def : InstRW<[M1WriteVLDK,
+ WriteAdr], (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDK], (instregex "LD4i(32)$")>;
+def : InstRW<[M1WriteVLDK,
+ WriteAdr], (instregex "LD4i(32)_POST$")>;
+def : InstRW<[M1WriteVLDM], (instregex "LD4i(64)$")>;
+def : InstRW<[M1WriteVLDM,
+ WriteAdr], (instregex "LD4i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(1d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(2d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[M1WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
// ASIMD store instructions.
+def : InstRW<[M1WriteVSTD], (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVSTD], (instregex "ST1i(64)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST1i(64)_POST$")>;
+
+def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTD], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVSTD], (instregex "ST2i(64)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST2i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTE,
+ WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(2d)$")>;
+def : InstRW<[M1WriteVSTE,
+ WriteAdr], (instregex "ST2Twov(2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTH], (instregex "ST3i(8|16)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[M1WriteVSTH], (instregex "ST3i(32)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST3i(32)_POST$")>;
+def : InstRW<[M1WriteVSTF], (instregex "ST3i(64)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST3i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTG,
+ WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[M1WriteVSTG,
+ WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTH], (instregex "ST4i(8|16)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[M1WriteVSTH], (instregex "ST4i(32)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[M1WriteVSTF], (instregex "ST4i(64)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTI,
+ WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[M1WriteVSTI,
+ WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
// Cryptography instructions.
def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
-def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
+def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
new file mode 100644
index 0000000..9a0cb70
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -0,0 +1,352 @@
+//==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling machine model for the Cavium ThunderX T8X
+// (T88, T81, T83) processors.
+// Loosely based on Cortex-A53 which is somewhat similar.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details.
+
+// Cavium ThunderX T8X scheduling machine model.
+def ThunderXT8XModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops dispatched per cycle.
+ let MicroOpBufferSize = 0; // ThunderX T88/T81/T83 are in-order.
+ let LoadLatency = 3; // Optimistic load latency.
+ let MispredictPenalty = 8; // Branch mispredict penalty.
+ let PostRAScheduler = 1; // Use PostRA scheduler.
+ let CompleteModel = 1;
+}
+
+// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
+def THXT8XUnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def THXT8XUnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def THXT8XUnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
+def THXT8XUnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def THXT8XUnitBr : ProcResource<1> { let BufferSize = 0; } // Branch
+def THXT8XUnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def THXT8XUnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mul/Div/Sqrt
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types mapping the ProcResources and
+// latencies.
+
+let SchedModel = ThunderXT8XModel in {
+
+// ALU
+def : WriteRes<WriteImm, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteI, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIEReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIS, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [THXT8XUnitALU]> { let Latency = 2; }
+
+// MAC
+def : WriteRes<WriteIM32, [THXT8XUnitMAC]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
+
+def : WriteRes<WriteIM64, [THXT8XUnitMAC]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
+
+// Div
+def : WriteRes<WriteID32, [THXT8XUnitDiv]> {
+ let Latency = 12;
+ let ResourceCycles = [6];
+}
+
+def : WriteRes<WriteID64, [THXT8XUnitDiv]> {
+ let Latency = 14;
+ let ResourceCycles = [8];
+}
+
+// Load
+def : WriteRes<WriteLD, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDIdx, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDHi, [THXT8XUnitLdSt]> { let Latency = 3; }
+
+// Vector Load
+def : WriteRes<WriteVLD, [THXT8XUnitLdSt]> {
+ let Latency = 8;
+ let ResourceCycles = [3];
+}
+
+def THXT8XWriteVLD1 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 6;
+ let ResourceCycles = [1];
+}
+
+def THXT8XWriteVLD2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 11;
+ let ResourceCycles = [7];
+}
+
+def THXT8XWriteVLD3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 12;
+ let ResourceCycles = [8];
+}
+
+def THXT8XWriteVLD4 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 13;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteVLD5 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 13;
+ let ResourceCycles = [9];
+}
+
+// Pre/Post Indexing
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTX, [THXT8XUnitLdSt]> { let Latency = 1; }
+
+// Vector Store
+def : WriteRes<WriteVST, [THXT8XUnitLdSt]>;
+def THXT8XWriteVST1 : SchedWriteRes<[THXT8XUnitLdSt]>;
+
+def THXT8XWriteVST2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 10;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteVST3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 11;
+ let ResourceCycles = [10];
+}
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [THXT8XUnitBr]>;
+def THXT8XWriteBR : SchedWriteRes<[THXT8XUnitBr]>;
+def : WriteRes<WriteBrReg, [THXT8XUnitBr]>;
+def THXT8XWriteBRR : SchedWriteRes<[THXT8XUnitBr]>;
+def THXT8XWriteRET : SchedWriteRes<[THXT8XUnitALU]>;
+def : WriteRes<WriteSys, [THXT8XUnitBr]>;
+def : WriteRes<WriteBarrier, [THXT8XUnitBr]>;
+def : WriteRes<WriteHint, [THXT8XUnitBr]>;
+
+// FP ALU
+def : WriteRes<WriteF, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [THXT8XUnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [THXT8XUnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [THXT8XUnitFPMDS]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THXT8XWriteFMAC : SchedWriteRes<[THXT8XUnitFPMDS]> { let Latency = 10; }
+
+def THXT8XWriteFDivSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 12;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteFDivDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THXT8XWriteFSqrtSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 17;
+ let ResourceCycles = [14];
+}
+
+def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 31;
+ let ResourceCycles = [28];
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// Forwarding is modeled for these reads: the operands are needed one or two
+// cycles late, reducing the effective producer latency.
+def : ReadAdvance<ReadExtrHi, 1>;
+def : ReadAdvance<ReadAdrBase, 2>;
+def : ReadAdvance<ReadVLD, 2>;
+
+// FIXME: This needs more targeted benchmarking.
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+// operands are needed one cycle later if and only if they are to be
+// shifted. Otherwise, they too are needed two cycles later. This same
+// ReadAdvance applies to Extended registers as well, even though there is
+// a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadShifted : SchedReadAdvance<1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadNotShifted : SchedReadAdvance<2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadISReg : SchedReadVariant<[
+ SchedVar<RegShiftedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, THXT8XReadISReg>;
+
+def THXT8XReadIEReg : SchedReadVariant<[
+ SchedVar<RegExtendedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, THXT8XReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+// Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRW.
+
+//---
+// Branch
+//---
+def : InstRW<[THXT8XWriteBR], (instregex "^B")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^BL")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^B.*")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBZ")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BR")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BLR")>;
+
+//---
+// Ret
+//---
+def : InstRW<[THXT8XWriteRET], (instregex "^RET")>;
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[THXT8XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THXT8XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THXT8XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THXT8XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THXT8XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THXT8XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
new file mode 100644
index 0000000..10df50b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -0,0 +1,1745 @@
+//=- AArch64SchedThunderX2T99.td - Cavium ThunderX2 T99 --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for Cavium ThunderX2T99
+// processors.
+// Based on Broadcom Vulcan.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 2. Pipeline Description.
+
+def ThunderX2T99Model : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 32;
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+}
+
+// Define the issue ports.
+
+// Port 0: ALU, FP/SIMD.
+def THX2T99P0 : ProcResource<1>;
+
+// Port 1: ALU, FP/SIMD, integer mul/div.
+def THX2T99P1 : ProcResource<1>;
+
+// Port 2: ALU, Branch.
+def THX2T99P2 : ProcResource<1>;
+
+// Port 3: Store data.
+def THX2T99P3 : ProcResource<1>;
+
+// Port 4: Load/store.
+def THX2T99P4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def THX2T99P5 : ProcResource<1>;
+
+let SchedModel = ThunderX2T99Model in {
+
+// Define groups for the functional units on each issue port. Each group
+// created will be used by a WriteRes later on.
+//
+// NOTE: Some groups only contain one member. This is a way to create names for
+// the various functional units that share a single issue port. For example,
+// "THX2T99I1" for ALU ops on port 1 and "THX2T99F1" for FP ops on port 1.
+
+// Integer divide and multiply micro-ops only on port 1.
+def THX2T99I1 : ProcResGroup<[THX2T99P1]>;
+
+// Branch micro-ops only on port 2.
+def THX2T99I2 : ProcResGroup<[THX2T99P2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def THX2T99I012 : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def THX2T99F1 : ProcResGroup<[THX2T99P1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def THX2T99F01 : ProcResGroup<[THX2T99P0, THX2T99P1]>;
+
+// Store data micro-ops only on port 3.
+def THX2T99SD : ProcResGroup<[THX2T99P3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>;
+
+// 60 entry unified scheduler.
+def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2,
+ THX2T99P3, THX2T99P4, THX2T99P5]> {
+ let BufferSize = 60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on I2.
+def THX2T99Write_1Cyc_I2 : SchedWriteRes<[THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I1.
+def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 23 cycles on I1.
+def THX2T99Write_23Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
+}
+
+// 39 cycles on I1.
+def THX2T99Write_39Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
+}
+
+// 1 cycle on I0, I1, or I2.
+def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on I0, I1, or I2.
+def THX2T99Write_2Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I0, I1, or I2.
+def THX2T99Write_4Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on I0, I1, or I2.
+def THX2T99Write_5Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on F1.
+def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 7 cycles on F1.
+def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on F0 or F1.
+def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on F0 or F1.
+def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on F0 or F1.
+def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on F0 or F1.
+def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on F0 or F1.
+def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 10 cycles on F0 or F1.
+def THX2T99Write_10Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+// 16 cycles on F0 or F1.
+def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+ let ResourceCycles = [8];
+}
+
+// 23 cycles on F0 or F1.
+def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+ let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0 or LS1.
+def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 1;
+}
+
+// 1 cycle on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_1Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_1Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// 2 cycles on LS0 or LS1.
+def THX2T99Write_2Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0 or LS1.
+def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// 5 cycles on LS0 or LS1.
+def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0 or LS1.
+def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_4Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_4Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_5Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_6Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_6Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0 or LS1 and F0 or F1.
+def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+let SchedModel = ThunderX2T99Model in {
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> {
+ let Unsupported = 1;
+ let NumMicroOps = 2;
+}
+
+//---
+// Branch
+//---
+def : InstRW<[THX2T99Write_1Cyc_I2], (instrs B, BL, BR, BLR)>;
+def : InstRW<[THX2T99Write_1Cyc_I2], (instrs RET)>;
+def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B.*")>;
+def : InstRW<[THX2T99Write_1Cyc_I2],
+ (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>;
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [THX2T99I012]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 3];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteI],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [THX2T99I012]> {
+ let Latency = 2;
+ let ResourceCycles = [2, 3];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteISReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
+def : WriteRes<WriteIEReg, [THX2T99I012]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 3];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteIEReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
+// Move immed
+def : WriteRes<WriteImm, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>;
+
+// Variable shift
+def : WriteRes<WriteIS, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23 (W-form) / 13-39 (X-form).
+def : WriteRes<WriteID32, [THX2T99I1]> {
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
+}
+
+// Divide, X-form
+def : WriteRes<WriteID64, [THX2T99I1]> {
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
+}
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX2T99Write_5Cyc_I012],
+// (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[THX2T99Write_5Cyc_I012],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+
+def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>;
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Multiply high
+def : InstRW<[THX2T99Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>;
+
+// Miscellaneous Data-Processing Instructions
+// Bitfield extract
+def : InstRW<[THX2T99Write_1Cyc_I012], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move - basic
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>;
+
+// Bitfield move, insert
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "^BFM")>;
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "(S|U)?BFM.*")>;
+
+// Count leading
+def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$",
+ "^CLZ(W|X)r$")>;
+
+// Reverse bits
+def : InstRW<[THX2T99Write_1Cyc_I012], (instrs RBITWr, RBITXr)>;
+
+// Cryptography Extensions
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES[DE]")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AESI?MC")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1SU0")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1[CMP]")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256SU0")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256(H|H2|SU1)")>;
+
+// CRC Instructions
+// def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>;
+def : InstRW<[THX2T99Write_4Cyc_I1],
+ (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>;
+
+def : InstRW<[THX2T99Write_4Cyc_I1],
+ (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [THX2T99LS01]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+}
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def THX2T99WriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [THX2T99Write_6Cyc_LS01_I012_I012]>,
+ SchedVar<NoSchedPred, [THX2T99Write_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, THX2T99WriteLDIdx>;
+
+def THX2T99ReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>;
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPXi)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPXi)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRBui)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDui)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRHui)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRQui)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRSui)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRQl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRWl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRXl)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRXi)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSWi)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpost)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpost)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRBpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRHpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRBpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRHpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroW)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroX)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRDroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRQroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRDroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRQroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHXroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRXroX)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURDi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURQi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSWi)>;
+
+//---
+// Prefetch
+//---
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMl)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFUMi)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMui)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroW)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroX)>;
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [THX2T99LS01, THX2T99SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [THX2T99LS01, THX2T99SD, THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBBi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHHi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURSi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURWi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURXi)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRBi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRHi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRWi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRXi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPXi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPWi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPXi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPWi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRBui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRBui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRDui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRDui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRHui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRHui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRQui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRQui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRXui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRXui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRWui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRWui)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
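+// A WriteRes entry like the one above maps a generic write class onto this
+// model's resources: anything scheduled as WriteF occupies the F0/F1 group,
+// costs 2 micro-ops of issue bandwidth, and produces its result 5 cycles
+// after issue.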
+
+// FP arithmetic
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// FP divide, square root
+def : WriteRes<WriteFDiv, [THX2T99F01]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THX2T99XWriteFDiv : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFDivSP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFDivDP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFSqrtSP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFSqrtDP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
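+// The THX2T99XWrite* definitions above only introduce named write types;
+// they take effect once bound to concrete instructions by the InstRW
+// entries below.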
+
+// FP divide, S-form
+// FP square root, S-form
+def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>;
+def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THX2T99XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSrr")>;
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>;
+def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDrr")>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX2T99XWriteFMul : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX2T99XWriteFMulAcc : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def : InstRW<[THX2T99XWriteFMul], (instregex "^FMUL", "^FNMUL")>;
+def : InstRW<[THX2T99XWriteFMulAcc],
+ (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>;
+
+// FP round to integral
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
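+// Transfers involving the high half of a 128-bit register cost an extra
+// cycle over the 4-cycle WriteFCopy default, hence the override below.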
+def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4, 23];
+}
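+// NOTE: ResourceCycles is normally parallel to the WriteRes resource list;
+// a two-entry list against the single THX2T99F01 group above may be
+// unintended.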
+
+// ASIMD logical (MOV, MVN, ORN, ORR)
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv",
+ "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD multiply long
+// ASIMD multiply accumulate long
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABAL")>;
+// ASIMD arith, reduce, 8B/4H/2S
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8H/4S
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU]?ADDL?Vv16i8v$")>;
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+// ASIMD multiply, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^(P?MUL|SQR?DMULH)" #
+ "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" #
+ "(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD shift accumulate
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv",
+ "SQSHRNv","SQSHRUNv", "UQRSHRNv",
+ "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+// ASIMD shift by immed, complex
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU][QR]{1,2}SHL" #
+ "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD Arithmetic
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(RADD|RSUB)HNv.*")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD",
+ "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" #
+ "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADALP","^UADALP")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLPv","^UADDLPv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLV","^UADDLV")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SUQADDv","^USQADDv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv",
+ "^SQABS", "^SQADD", "^SQNEG", "^SQSUB",
+ "^SRHADD", "^SUBHNv", "^SUQADD",
+ "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^CMEQv","^CMGEv","^CMGTv",
+ "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv",
+ "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv2f32")>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv4f32")>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "FDIVv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP negate
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>;
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^XTNv")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>;
+
+// ASIMD insert, element to element
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Four")>;
+
+// ASIMD table lookup, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transpose
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1", "^TRN2")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or dword
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "(S|U)MOVv.*")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+ "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
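+// Throughout this section the _POST (post-increment) forms pair the base
+// write type with WriteAdr for the address-register write-back.
+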
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX2T99Write_4Cyc_LS01],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX2T99Write_4Cyc_LS01],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX2T99Write_5Cyc_LS01],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX2T99Write_6Cyc_LS01],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+} // SchedModel = ThunderX2T99Model
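+
+// A compact sketch of the pattern this file follows throughout (hypothetical
+// "Toy" names; comment-only illustration, not additional definitions):
+//
+//   def ToyModel : SchedMachineModel {
+//     let IssueWidth = 2;      // dispatch width
+//     let CompleteModel = 0;   // unmatched instructions keep their defaults
+//   }
+//   def ToyALU : ProcResource<1>;
+//   let SchedModel = ToyModel in {
+//     // Map a generic write class onto the resource.
+//     def : WriteRes<WriteI, [ToyALU]> { let Latency = 1; }
+//     // Introduce a named write type, then bind it to instructions.
+//     def ToyWrite_4Cyc : SchedWriteRes<[ToyALU]> { let Latency = 4; }
+//     def : InstRW<[ToyWrite_4Cyc], (instregex "^MADD[WX]rrr$")>;
+//   }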
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
deleted file mode 100644
index 35a40c3..0000000
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
+++ /dev/null
@@ -1,852 +0,0 @@
-//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// 1. Introduction
-//
-// This file defines the machine model for Broadcom Vulcan to support
-// instruction scheduling and other instruction cost heuristics.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// 2. Pipeline Description.
-
-def VulcanModel : SchedMachineModel {
- let IssueWidth = 4; // 4 micro-ops dispatched at a time.
- let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
- let LoadLatency = 4; // Optimistic load latency.
- let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
- // Determined via a mix of micro-arch details and experimentation.
- let LoopMicroOpBufferSize = 32;
- let PostRAScheduler = 1; // Using PostRA sched.
- let CompleteModel = 1;
-}
-
-// Define the issue ports.
-
-// Port 0: ALU, FP/SIMD.
-def VulcanP0 : ProcResource<1>;
-
-// Port 1: ALU, FP/SIMD, integer mul/div.
-def VulcanP1 : ProcResource<1>;
-
-// Port 2: ALU, Branch.
-def VulcanP2 : ProcResource<1>;
-
-// Port 3: Store data.
-def VulcanP3 : ProcResource<1>;
-
-// Port 4: Load/store.
-def VulcanP4 : ProcResource<1>;
-
-// Port 5: Load/store.
-def VulcanP5 : ProcResource<1>;
-
-let SchedModel = VulcanModel in {
-
-// Define groups for the functional units on each issue port. Each group
-// created will be used by a WriteRes later on.
-//
-// NOTE: Some groups only contain one member. This is a way to create names for
-// the various functional units that share a single issue port. For example,
-// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1.
-
-// Integer divide and multiply micro-ops only on port 1.
-def VulcanI1 : ProcResGroup<[VulcanP1]>;
-
-// Branch micro-ops only on port 2.
-def VulcanI2 : ProcResGroup<[VulcanP2]>;
-
-// ALU micro-ops on ports 0, 1, and 2.
-def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
-
-// Crypto FP/SIMD micro-ops only on port 1.
-def VulcanF1 : ProcResGroup<[VulcanP1]>;
-
-// FP/SIMD micro-ops on ports 0 and 1.
-def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
-
-// Store data micro-ops only on port 3.
-def VulcanSD : ProcResGroup<[VulcanP3]>;
-
-// Load/store micro-ops on ports 4 and 5.
-def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
-
-// 60 entry unified scheduler.
-def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
- VulcanP3, VulcanP4, VulcanP5]> {
- let BufferSize=60;
-}
-
-// Define commonly used write types for InstRW specializations.
-// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
-
-// 3 cycles on I1.
-def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
-
-// 4 cycles on I1.
-def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
-
-// 1 cycle on I0, I1, or I2.
-def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
-
-// 5 cycles on F1.
-def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
-
-// 7 cycles on F1.
-def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
-
-// 4 cycles on F0 or F1.
-def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
-
-// 5 cycles on F0 or F1.
-def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
-
-// 6 cycles on F0 or F1.
-def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
-
-// 7 cycles on F0 or F1.
-def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
-
-// 8 cycles on F0 or F1.
-def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
-
-// 16 cycles on F0 or F1.
-def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
- let Latency = 16;
- let ResourceCycles = [8];
-}
-
-// 23 cycles on F0 or F1.
-def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
- let Latency = 23;
- let ResourceCycles = [11];
-}
-
-// 1 cycle on LS0 or LS1.
-def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
-
-// 4 cycles on LS0 or LS1.
-def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
-
-// 5 cycles on LS0 or LS1.
-def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
-
-// 6 cycles on LS0 or LS1.
-def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
-
-// 5 cycles on LS0 or LS1 and I0, I1, or I2.
-def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-
-// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
-def VulcanWrite_6Cyc_LS01_I012_I012 :
- SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
- let Latency = 6;
- let NumMicroOps = 3;
-}
-
-// 1 cycle on LS0 or LS1 and F0 or F1.
-def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-
-// 5 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-
-// 6 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 6;
- let NumMicroOps = 2;
-}
-
-// 7 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 7;
- let NumMicroOps = 2;
-}
-
-// 8 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 8;
- let NumMicroOps = 2;
-}
-
-// Define commonly used read types.
-
-// No forwarding is provided for these types.
-def : ReadAdvance<ReadI, 0>;
-def : ReadAdvance<ReadISReg, 0>;
-def : ReadAdvance<ReadIEReg, 0>;
-def : ReadAdvance<ReadIM, 0>;
-def : ReadAdvance<ReadIMA, 0>;
-def : ReadAdvance<ReadID, 0>;
-def : ReadAdvance<ReadExtrHi, 0>;
-def : ReadAdvance<ReadAdrBase, 0>;
-def : ReadAdvance<ReadVLD, 0>;
-
-}
-
-
-//===----------------------------------------------------------------------===//
-// 3. Instruction Tables.
-
-let SchedModel = VulcanModel in {
-
-//---
-// 3.1 Branch Instructions
-//---
-
-// Branch, immed
-// Branch and link, immed
-// Compare and branch
-def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; }
-
-def : WriteRes<WriteSys, []> { let Latency = 1; }
-def : WriteRes<WriteBarrier, []> { let Latency = 1; }
-def : WriteRes<WriteHint, []> { let Latency = 1; }
-
-def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
-
-// Branch, register
-// Branch and link, register != LR
-// Branch and link, register = LR
-def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; }
-
-//---
-// 3.2 Arithmetic and Logical Instructions
-// 3.3 Move and Shift Instructions
-//---
-
-// ALU, basic
-// Conditional compare
-// Conditional select
-// Address generation
-def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; }
-def : InstRW<[WriteI], (instrs COPY)>;
-
-// ALU, extend and/or shift
-def : WriteRes<WriteISReg, [VulcanI012]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-
-def : WriteRes<WriteIEReg, [VulcanI012]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-
-// Move immed
-def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; }
-
-// Variable shift
-def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; }
-
-//---
-// 3.4 Divide and Multiply Instructions
-//---
-
-// Divide, W-form
-// Latency range of 13-23. Take the average.
-def : WriteRes<WriteID32, [VulcanI1]> {
- let Latency = 18;
- let ResourceCycles = [18];
-}
-
-// Divide, X-form
-// Latency range of 13-39. Take the average.
-def : WriteRes<WriteID64, [VulcanI1]> {
- let Latency = 26;
- let ResourceCycles = [26];
-}
-
-// Multiply accumulate, W-form
-def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; }
-
-// Multiply accumulate, X-form
-def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; }
-
-// Bitfield extract, two reg
-def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; }
-
-// Bitfield move, basic
-// Bitfield move, insert
-// NOTE: Handled by WriteIS.
-
-// Count leading
-def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
- "^CLZ(W|X)r$")>;
-
-// Reverse bits/bytes
-// NOTE: Handled by WriteI.
-
-//---
-// 3.6 Load Instructions
-// 3.10 FP Load Instructions
-//---
-
-// Load register, literal
-// Load register, unscaled immed
-// Load register, immed unprivileged
-// Load register, unsigned immed
-def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; }
-
-// Load register, immed post-index
-// NOTE: Handled by WriteLD, WriteI.
-// Load register, immed pre-index
-// NOTE: Handled by WriteLD, WriteAdr.
-def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
-
-// Load register offset, basic
-// Load register, register offset, scale by 4/8
-// Load register, register offset, scale by 2
-// Load register offset, extend
-// Load register, register offset, extend, scale by 4/8
-// Load register, register offset, extend, scale by 2
-def VulcanWriteLDIdx : SchedWriteVariant<[
- SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
- SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>;
-def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
-
-def VulcanReadAdrBase : SchedReadVariant<[
- SchedVar<ScaledIdxPred, [ReadDefault]>,
- SchedVar<NoSchedPred, [ReadDefault]>]>;
-def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
-
-// Load pair, immed offset, normal
-// Load pair, immed offset, signed words, base != SP
-// Load pair, immed offset, signed words, base = SP
-// LDP only breaks into *one* LS micro-op. Thus
-// the resources are handled by WriteLD.
-def : WriteRes<WriteLDHi, []> {
- let Latency = 5;
-}
-
-// Load pair, immed pre-index, normal
-// Load pair, immed pre-index, signed words
-// Load pair, immed post-index, normal
-// Load pair, immed post-index, signed words
-// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
-
-//--
-// 3.7 Store Instructions
-// 3.11 FP Store Instructions
-//--
-
-// Store register, unscaled immed
-// Store register, immed unprivileged
-// Store register, unsigned immed
-def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-
-// Store register, immed post-index
-// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
-
-// Store register, immed pre-index
-// NOTE: Handled by WriteAdr, WriteST
-
-// Store register, register offset, basic
-// Store register, register offset, scaled by 4/8
-// Store register, register offset, scaled by 2
-// Store register, register offset, extend
-// Store register, register offset, extend, scale by 4/8
-// Store register, register offset, extend, scale by 1
-def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
- let Latency = 1;
- let NumMicroOps = 3;
-}
-
-// Store pair, immed offset, W-form
-// Store pair, immed offset, X-form
-def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-
-// Store pair, immed post-index, W-form
-// Store pair, immed post-index, X-form
-// Store pair, immed pre-index, W-form
-// Store pair, immed pre-index, X-form
-// NOTE: Handled by WriteAdr, WriteSTP.
-
-//---
-// 3.8 FP Data Processing Instructions
-//---
-
-// FP absolute value
-// FP min/max
-// FP negate
-def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; }
-
-// FP arithmetic
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
-
-// FP compare
-def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; }
-
-// FP divide, S-form
-// FP square root, S-form
-def : WriteRes<WriteFDiv, [VulcanF01]> {
- let Latency = 16;
- let ResourceCycles = [8];
-}
-
-// FP divide, D-form
-// FP square root, D-form
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
-
-// FP multiply
-// FP multiply accumulate
-def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
-
-// FP round to integral
-def : InstRW<[VulcanWrite_7Cyc_F01],
- (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
-
-// FP select
-def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
-
-//---
-// 3.9 FP Miscellaneous Instructions
-//---
-
-// FP convert, from vec to vec reg
-// FP convert, from gen to vec reg
-// FP convert, from vec to gen reg
-def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
-
-// FP move, immed
-// FP move, register
-def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
-
-// FP transfer, from gen to vec reg
-// FP transfer, from vec to gen reg
-def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
-
-//---
-// 3.12 ASIMD Integer Instructions
-//---
-
-// ASIMD absolute diff, D-form
-// ASIMD absolute diff, Q-form
-// ASIMD absolute diff accum, D-form
-// ASIMD absolute diff accum, Q-form
-// ASIMD absolute diff accum long
-// ASIMD absolute diff long
-// ASIMD arith, basic
-// ASIMD arith, complex
-// ASIMD compare
-// ASIMD logical (AND, BIC, EOR)
-// ASIMD max/min, basic
-// ASIMD max/min, reduce, 4H/4S
-// ASIMD max/min, reduce, 8B/8H
-// ASIMD max/min, reduce, 16B
-// ASIMD multiply, D-form
-// ASIMD multiply, Q-form
-// ASIMD multiply accumulate long
-// ASIMD multiply accumulate saturating long
-// ASIMD multiply long
-// ASIMD pairwise add and accumulate
-// ASIMD shift accumulate
-// ASIMD shift by immed, basic
-// ASIMD shift by immed and insert, basic, D-form
-// ASIMD shift by immed and insert, basic, Q-form
-// ASIMD shift by immed, complex
-// ASIMD shift by register, basic, D-form
-// ASIMD shift by register, basic, Q-form
-// ASIMD shift by register, complex, D-form
-// ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
-
-// ASIMD arith, reduce, 4H/4S
-// ASIMD arith, reduce, 8B/8H
-// ASIMD arith, reduce, 16B
-def : InstRW<[VulcanWrite_5Cyc_F01],
- (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
-
-// ASIMD logical (MOV, MVN, ORN, ORR)
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
-
-// ASIMD polynomial (8x8) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
-
-//---
-// 3.13 ASIMD Floating-point Instructions
-//---
-
-// ASIMD FP absolute value
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
-
-// ASIMD FP arith, normal, D-form
-// ASIMD FP arith, normal, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
-
-// ASIMD FP arith, pairwise, D-form
-// ASIMD FP arith, pairwise, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
-
-// ASIMD FP compare, D-form
-// ASIMD FP compare, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
- "^FCMGTv", "^FCMLEv",
- "^FCMLTv")>;
-
-// ASIMD FP convert, long
-// ASIMD FP convert, narrow
-// ASIMD FP convert, other, D-form
-// ASIMD FP convert, other, Q-form
-// NOTE: Handled by WriteV.
-
-// ASIMD FP divide, D-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
-
-// ASIMD FP divide, Q-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
-
-// ASIMD FP divide, Q-form, F64
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
-
-// ASIMD FP max/min, normal, D-form
-// ASIMD FP max/min, normal, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
- "^FMINv", "^FMINNMv")>;
-
-// ASIMD FP max/min, pairwise, D-form
-// ASIMD FP max/min, pairwise, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
- "^FMINPv", "^FMINNMPv")>;
-
-// ASIMD FP max/min, reduce
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
- "^FMINVv", "^FMINNMVv")>;
-
-// ASIMD FP multiply, D-form, FZ
-// ASIMD FP multiply, D-form, no FZ
-// ASIMD FP multiply, Q-form, FZ
-// ASIMD FP multiply, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
-
-// ASIMD FP multiply accumulate, D-form, FZ
-// ASIMD FP multiply accumulate, D-form, no FZ
-// ASIMD FP multiply accumulate, Q-form, FZ
-// ASIMD FP multiply accumulate, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
-
-// ASIMD FP negate
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
-
-// ASIMD FP round, D-form
-// ASIMD FP round, Q-form
-// NOTE: Handled by WriteV.
-
-//--
-// 3.14 ASIMD Miscellaneous Instructions
-//--
-
-// ASIMD bit reverse
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
-
-// ASIMD bitwise insert, D-form
-// ASIMD bitwise insert, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
-
-// ASIMD count, D-form
-// ASIMD count, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
-
-// ASIMD duplicate, gen reg
-// ASIMD duplicate, element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
-
-// ASIMD extract
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
-
-// ASIMD extract narrow
-// ASIMD extract narrow, saturating
-// NOTE: Handled by WriteV.
-
-// ASIMD insert, element to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
-
-// ASIMD move, integer immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
-
-// ASIMD move, FP immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
-
-// ASIMD reciprocal estimate, D-form
-// ASIMD reciprocal estimate, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01],
- (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
- "^FRSQRTEv", "^URSQRTEv")>;
-
-// ASIMD reciprocal step, D-form, FZ
-// ASIMD reciprocal step, D-form, no FZ
-// ASIMD reciprocal step, Q-form, FZ
-// ASIMD reciprocal step, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
-
-// ASIMD reverse
-def : InstRW<[VulcanWrite_5Cyc_F01],
- (instregex "^REV16v", "^REV32v", "^REV64v")>;
-
-// ASIMD table lookup, D-form
-// ASIMD table lookup, Q-form
-def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
-
-// ASIMD transfer, element to word or dword
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
-
-// ASIMD transfer, element to gen reg
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
-
-// ASIMD transfer gen reg to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
-
-// ASIMD transpose
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
- "^UZP1v", "^UZP2v")>;
-
-// ASIMD unzip/zip
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
-
-//--
-// 3.15 ASIMD Load Instructions
-//--
-
-// ASIMD load, 1 element, multiple, 1 reg, D-form
-// ASIMD load, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01],
- (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
- (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, multiple, 2 reg, D-form
-// ASIMD load, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01],
- (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
- (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, multiple, 3 reg, D-form
-// ASIMD load, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01],
- (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr],
- (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, multiple, 4 reg, D-form
-// ASIMD load, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_6Cyc_LS01],
- (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr],
- (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, one lane, B/H/S
-// ASIMD load, 1 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD1i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 1 element, all lanes, D-form, B/H/S
-// ASIMD load, 1 element, all lanes, D-form, D
-// ASIMD load, 1 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
- (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 2 element, multiple, D-form, B/H/S
-// ASIMD load, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
- (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 2 element, one lane, B/H
-// ASIMD load, 2 element, one lane, S
-// ASIMD load, 2 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD2i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 2 element, all lanes, D-form, B/H/S
-// ASIMD load, 2 element, all lanes, D-form, D
-// ASIMD load, 2 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
- (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 3 element, multiple, D-form, B/H/S
-// ASIMD load, 3 element, multiple, Q-form, B/H/S
-// ASIMD load, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
- (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
- (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 3 element, one lane, B/H
-// ASIMD load, 3 element, one lane, S
-// ASIMD load, 3 element, one lane, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
- (instregex "^LD3i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 3 element, all lanes, D-form, B/H/S
-// ASIMD load, 3 element, all lanes, D-form, D
-// ASIMD load, 3 element, all lanes, Q-form, B/H/S
-// ASIMD load, 3 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01],
- (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
- (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 4 element, multiple, D-form, B/H/S
-// ASIMD load, 4 element, multiple, Q-form, B/H/S
-// ASIMD load, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
- (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
- (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 4 element, one lane, B/H
-// ASIMD load, 4 element, one lane, S
-// ASIMD load, 4 element, one lane, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
- (instregex "^LD4i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 4 element, all lanes, D-form, B/H/S
-// ASIMD load, 4 element, all lanes, D-form, D
-// ASIMD load, 4 element, all lanes, Q-form, B/H/S
-// ASIMD load, 4 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01],
- (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
- (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-//--
-// 3.16 ASIMD Store Instructions
-//--
-
-// ASIMD store, 1 element, multiple, 1 reg, D-form
-// ASIMD store, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, multiple, 2 reg, D-form
-// ASIMD store, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, multiple, 3 reg, D-form
-// ASIMD store, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, multiple, 4 reg, D-form
-// ASIMD store, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, one lane, B/H/S
-// ASIMD store, 1 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST1i(8|16|32|64)_POST$")>;
-
-// ASIMD store, 2 element, multiple, D-form, B/H/S
-// ASIMD store, 2 element, multiple, Q-form, B/H/S
-// ASIMD store, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 2 element, one lane, B/H/S
-// ASIMD store, 2 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST2i(8|16|32|64)_POST$")>;
-
-// ASIMD store, 3 element, multiple, D-form, B/H/S
-// ASIMD store, 3 element, multiple, Q-form, B/H/S
-// ASIMD store, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 3 element, one lane, B/H
-// ASIMD store, 3 element, one lane, S
-// ASIMD store, 3 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST3i(8|16|32|64)_POST$")>;
-
-// ASIMD store, 4 element, multiple, D-form, B/H/S
-// ASIMD store, 4 element, multiple, Q-form, B/H/S
-// ASIMD store, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 4 element, one lane, B/H
-// ASIMD store, 4 element, one lane, S
-// ASIMD store, 4 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST4i(8|16|32|64)_POST$")>;
-
-//--
-// 3.17 Cryptography Extensions
-//--
-
-// Crypto AES ops
-def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
-
-// Crypto polynomial (64x64) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
-
-// Crypto SHA1 xor ops
-// Crypto SHA1 schedule acceleration ops
-// Crypto SHA256 schedule acceleration op (1 u-op)
-// Crypto SHA256 schedule acceleration op (2 u-ops)
-// Crypto SHA256 hash acceleration ops
-def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
-
-//--
-// 3.18 CRC
-//--
-
-// CRC checksum ops
-def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
-
-} // SchedModel = VulcanModel
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 66a8f33..7f55073 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -42,10 +42,12 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
Entry.Node = Size;
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
- .setDiscardResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr),
+ std::move(Args))
+ .setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
}
@@ -53,7 +55,5 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
}
bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
CodeGenOpt::Level OptLevel) const {
- if (OptLevel >= CodeGenOpt::Aggressive)
- return true;
- return false;
+ return OptLevel >= CodeGenOpt::Aggressive;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 03e0132..ea61124 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -12,8 +12,22 @@
//===----------------------------------------------------------------------===//
#include "AArch64Subtarget.h"
+
+#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
+#include "AArch64TargetMachine.h"
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "AArch64CallLowering.h"
+#include "AArch64LegalizerInfo.h"
+#include "AArch64RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#endif
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetRegistry.h"
@@ -35,6 +49,11 @@ static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
"an address is ignored"), cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ UseNonLazyBind("aarch64-enable-nonlazybind",
+ cl::desc("Call nonlazybind functions via direct GOT load"),
+ cl::init(false), cl::Hidden);
+
AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
StringRef CPUString) {
@@ -62,6 +81,7 @@ void AArch64Subtarget::initializeProperties() {
break;
case CortexA57:
MaxInterleaveFactor = 4;
+ PrefFunctionAlignment = 4;
break;
case ExynosM1:
MaxInterleaveFactor = 4;
@@ -71,7 +91,12 @@ void AArch64Subtarget::initializeProperties() {
break;
case Falkor:
MaxInterleaveFactor = 4;
- VectorInsertExtractBaseCost = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
+ CacheLineSize = 128;
+ PrefetchDistance = 820;
+ MinPrefetchStride = 2048;
+ MaxPrefetchIterationsAhead = 8;
break;
case Kryo:
MaxInterleaveFactor = 4;
@@ -80,25 +105,99 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 740;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 11;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
- case Vulcan:
+ case ThunderX2T99:
+ CacheLineSize = 64;
+ PrefFunctionAlignment = 3;
+ PrefLoopAlignment = 2;
MaxInterleaveFactor = 4;
+ PrefetchDistance = 128;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 4;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
+ break;
+ case ThunderX:
+ case ThunderXT88:
+ case ThunderXT81:
+ case ThunderXT83:
+ CacheLineSize = 128;
+ PrefFunctionAlignment = 3;
+ PrefLoopAlignment = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case CortexA35: break;
case CortexA53: break;
- case CortexA72: break;
- case CortexA73: break;
+ case CortexA72:
+ PrefFunctionAlignment = 4;
+ break;
+ case CortexA73:
+ PrefFunctionAlignment = 4;
+ break;
case Others: break;
}
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct AArch64GISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
- : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
+ : AArch64GenSubtargetInfo(TT, CPU, FS),
+ ReserveX18(TT.isOSDarwin() || TT.isOSWindows()),
IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
- TLInfo(TM, *this), GISel() {}
+ TLInfo(TM, *this), GISel() {
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *AArch64GISel = new GISelAccessor();
+#else
+ AArch64GISelActualAccessor *AArch64GISel = new AArch64GISelActualAccessor();
+ AArch64GISel->CallLoweringInfo.reset(
+ new AArch64CallLowering(*getTargetLowering()));
+ AArch64GISel->Legalizer.reset(new AArch64LegalizerInfo());
+
+ auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
+
+ // FIXME: At this point, we can't rely on Subtarget having RBI.
+ // It's awkward to mix passing RBI and the Subtarget; should we pass
+ // TII/TRI as well?
+ AArch64GISel->InstSelector.reset(createAArch64InstructionSelector(
+ *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
+
+ AArch64GISel->RegBankInfo.reset(RBI);
+#endif
+ setGISelAccessor(*AArch64GISel);
+}
const CallLowering *AArch64Subtarget::getCallLowering() const {
assert(GISel && "Access to GlobalISel APIs not set");
@@ -133,9 +232,26 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
return AArch64II::MO_GOT;
- // The small code mode's direct accesses use ADRP, which cannot necessarily
- // produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage())
+ // The small code model's direct accesses use ADRP, which cannot
+ // necessarily produce the value 0 (if the code is above 4GB).
+ if (useSmallAddressing() && GV->hasExternalWeakLinkage())
+ return AArch64II::MO_GOT;
+
+ return AArch64II::MO_NO_FLAG;
+}
+
+unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
+ const GlobalValue *GV, const TargetMachine &TM) const {
+ // MachO large model always goes via a GOT, because we don't have the
+ // relocations available to do anything else.
+ if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
+ !GV->hasInternalLinkage())
+ return AArch64II::MO_GOT;
+
+ // NonLazyBind goes via GOT unless we know it's available locally.
+ auto *F = dyn_cast<Function>(GV);
+ if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
+ !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
return AArch64II::MO_GOT;
return AArch64II::MO_NO_FLAG;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
index a993402..5a1f45e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -45,7 +45,11 @@ public:
ExynosM1,
Falkor,
Kryo,
- Vulcan
+ ThunderX2T99,
+ ThunderX,
+ ThunderXT81,
+ ThunderXT83,
+ ThunderXT88
};
protected:
@@ -61,9 +65,12 @@ protected:
bool HasCRC = false;
bool HasLSE = false;
bool HasRAS = false;
+ bool HasRDM = false;
bool HasPerfMon = false;
bool HasFullFP16 = false;
bool HasSPE = false;
+ bool HasLSLFast = false;
+ bool HasSVE = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -73,6 +80,13 @@ protected:
// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
+
+ // NegativeImmediates - Transform instructions with negative immediates.
+ bool NegativeImmediates = true;
+
+ // Enable 64-bit vectorization in SLP.
+ unsigned MinVectorRegisterBitWidth = 64;
+
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
@@ -83,6 +97,8 @@ protected:
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasArithmeticBccFusion = false;
bool HasArithmeticCbzFusion = false;
+ bool HasFuseAES = false;
+ bool HasFuseLiterals = false;
bool DisableLatencySchedHeuristic = false;
bool UseRSqrt = false;
uint8_t MaxInterleaveFactor = 2;
@@ -94,6 +110,7 @@ protected:
unsigned PrefFunctionAlignment = 0;
unsigned PrefLoopAlignment = 0;
unsigned MaxJumpTableSize = 0;
+ unsigned WideningBaseCost = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -176,6 +193,10 @@ public:
bool isXRaySupported() const override { return true; }
+ unsigned getMinVectorRegisterBitWidth() const {
+ return MinVectorRegisterBitWidth;
+ }
+
bool isX18Reserved() const { return ReserveX18; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
@@ -183,6 +204,7 @@ public:
bool hasCRC() const { return HasCRC; }
bool hasLSE() const { return HasLSE; }
bool hasRAS() const { return HasRAS; }
+ bool hasRDM() const { return HasRDM; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
@@ -195,6 +217,15 @@ public:
}
bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+ bool hasFuseAES() const { return HasFuseAES; }
+ bool hasFuseLiterals() const { return HasFuseLiterals; }
+
+ /// \brief Return true if the CPU supports any kind of instruction fusion.
+ bool hasFusion() const {
+ return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
+ hasFuseAES() || hasFuseLiterals();
+ }
+
bool useRSqrt() const { return UseRSqrt; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const {
@@ -211,6 +242,8 @@ public:
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
+ unsigned getWideningBaseCost() const { return WideningBaseCost; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -218,6 +251,8 @@ public:
bool hasPerfMon() const { return HasPerfMon; }
bool hasFullFP16() const { return HasFullFP16; }
bool hasSPE() const { return HasSPE; }
+ bool hasLSLFast() const { return HasLSLFast; }
+ bool hasSVE() const { return HasSVE; }
bool isLittleEndian() const { return IsLittle; }
@@ -226,6 +261,7 @@ public:
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+ bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
@@ -233,9 +269,17 @@ public:
bool useAA() const override { return UseAA; }
- /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
- /// that still makes it profitable to inline the call.
- unsigned getMaxInlineSizeThreshold() const { return 64; }
+ bool useSmallAddressing() const {
+ switch (TLInfo.getTargetMachine().getCodeModel()) {
+ case CodeModel::Kernel:
+ // Kernel is currently allowed only for Fuchsia targets,
+ // where it is the same as Small for almost all purposes.
+ case CodeModel::Small:
+ return true;
+ default:
+ return false;
+ }
+ }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -246,6 +290,9 @@ public:
unsigned char ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const;
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
+
/// This function returns the name of a function which has an interface
/// like the non-standard bzero function, if such a function exists on
/// the current subtarget and it is considered preferable over
@@ -259,6 +306,17 @@ public:
bool enableEarlyIfConversion() const override;
std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
+
+ bool isCallingConvWin64(CallingConv::ID CC) const {
+ switch (CC) {
+ case CallingConv::C:
+ return isTargetWindows();
+ case CallingConv::Win64:
+ return true;
+ default:
+ return false;
+ }
+ }
};
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index a3736c0..7c5dcb0 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -18,35 +18,37 @@ include "llvm/TableGen/SearchableTable.td"
// AT (address translate) instruction options.
//===----------------------------------------------------------------------===//
-class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
+ code Requires = [{ {} }];
}
-def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>;
-def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>;
-def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
-def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
-def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
-def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
-def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
-def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
-
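+// For example, S1E1R below (op1 = 0b000, crn = 0b0111, crm = 0b1000,
+// op2 = 0b000) packs into the 14-bit encoding 0b00001111000000.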
+def : AT<"S1E1R", 0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R", 0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R", 0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W", 0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W", 0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W", 0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R", 0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W", 0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in {
+def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>;
+}
//===----------------------------------------------------------------------===//
// DMB/DSB (data barrier) instruction options.
@@ -77,28 +79,31 @@ def : DB<"sy", 0xf>;
// DC (data cache maintenance) instruction options.
//===----------------------------------------------------------------------===//
-class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
+ code Requires = [{ {} }];
}
-def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>;
-def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>;
-def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>;
-def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>;
-def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>;
-def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>;
-def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
-def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>;
+def : DC<"ZVA", 0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC", 0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW", 0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC", 0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW", 0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU", 0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW", 0b000, 0b0111, 0b1110, 0b010>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : DC<"CVAP", 0b011, 0b0111, 0b1100, 0b001>;
//===----------------------------------------------------------------------===//
// IC (instruction cache maintenance) instruction options.
@@ -120,7 +125,7 @@ class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>;
-def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>;
+def : IC<"IVAU", 0b011, 0b0111, 0b0101, 0b001, 1>;
//===----------------------------------------------------------------------===//
// ISB (instruction-fetch barrier) instruction options.
@@ -213,14 +218,13 @@ def : PSB<"csync", 0x11>;
// TLBI (translation lookaside buffer invalidate) instruction options.
//===----------------------------------------------------------------------===//
-class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2, bit needsreg = 1> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
@@ -228,38 +232,38 @@ class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
bit NeedsReg = needsreg;
}
-def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>;
-def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>;
-def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>;
-def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>;
-def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>;
-def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
-def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
-def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>;
-def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>;
-def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>;
-def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>;
-def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>;
-def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>;
-def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
-def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
-def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>;
+def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index d288394..ba28c01 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -10,23 +10,20 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64TargetMachine.h"
#include "AArch64.h"
-#include "AArch64CallLowering.h"
-#include "AArch64InstructionSelector.h"
-#include "AArch64LegalizerInfo.h"
-#include "AArch64RegisterBankInfo.h"
+#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
@@ -50,6 +47,11 @@ static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp",
cl::desc("Enable the CCMP formation pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableCondBrTuning("aarch64-enable-cond-br-tune",
+ cl::desc("Enable the conditional branch tuning pass"),
+ cl::init(true), cl::Hidden);
+
static cl::opt<bool> EnableMCR("aarch64-enable-mcr",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
@@ -113,11 +115,6 @@ EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
cl::init(false));
static cl::opt<bool>
- EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden,
- cl::desc("Enable the type promotion pass"),
- cl::init(true));
-
-static cl::opt<bool>
EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
cl::desc("Enable optimizations on complex GEPs"),
cl::init(false));
@@ -136,6 +133,14 @@ static cl::opt<bool>
cl::desc("Enable the loop data prefetch pass"),
cl::init(true));
+static cl::opt<int> EnableGlobalISelAtO(
+ "aarch64-enable-global-isel-at-O", cl::Hidden,
+ cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
+ cl::init(-1));
+
+static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
+ cl::init(true), cl::Hidden);
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -145,7 +150,6 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeGlobalISel(*PR);
initializeAArch64A53Fix835769Pass(*PR);
initializeAArch64A57FPLoadBalancingPass(*PR);
- initializeAArch64AddressTypePromotionPass(*PR);
initializeAArch64AdvSIMDScalarPass(*PR);
initializeAArch64CollectLOHPass(*PR);
initializeAArch64ConditionalComparesPass(*PR);
@@ -157,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeAArch64PromoteConstantPass(*PR);
initializeAArch64RedundantCopyEliminationPass(*PR);
initializeAArch64StorePairSuppressPass(*PR);
+ initializeFalkorHWPFFixPass(*PR);
+ initializeFalkorMarkStridedAccessesLegacyPass(*PR);
initializeLDTLSCleanupPass(*PR);
}
@@ -166,6 +172,8 @@ extern "C" void LLVMInitializeAArch64Target() {
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
return llvm::make_unique<AArch64_MachoTargetObjectFile>();
+ if (TT.isOSBinFormatCOFF())
+ return llvm::make_unique<AArch64_COFFTargetObjectFile>();
return llvm::make_unique<AArch64_ELFTargetObjectFile>();
}
@@ -178,6 +186,8 @@ static std::string computeDataLayout(const Triple &TT,
return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
if (TT.isOSBinFormatMachO())
return "e-m:o-i64:64-i128:128-n32:64-S128";
+ if (TT.isOSBinFormatCOFF())
+ return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128";
if (LittleEndian)
return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
@@ -215,35 +225,6 @@ AArch64TargetMachine::AArch64TargetMachine(
AArch64TargetMachine::~AArch64TargetMachine() = default;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-
-struct AArch64GISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-
-} // end anonymous namespace
-#endif
-
const AArch64Subtarget *
AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -264,25 +245,6 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
isLittle);
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- AArch64GISelActualAccessor *GISel =
- new AArch64GISelActualAccessor();
- GISel->CallLoweringInfo.reset(
- new AArch64CallLowering(*I->getTargetLowering()));
- GISel->Legalizer.reset(new AArch64LegalizerInfo());
-
- auto *RBI = new AArch64RegisterBankInfo(*I->getRegisterInfo());
-
- // FIXME: At this point, we can't rely on Subtarget having RBI.
- // It's awkward to mix passing RBI and the Subtarget; should we pass
- // TII/TRI as well?
- GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI));
-
- GISel->RegBankInfo.reset(RBI);
-#endif
- I->setGISelAccessor(*GISel);
}
return I.get();
}
@@ -308,9 +270,9 @@ namespace {
/// AArch64 Code Generator Pass Configuration Options.
class AArch64PassConfig : public TargetPassConfig {
public:
- AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+ AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM.getOptLevel() != CodeGenOpt::None)
substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
}
@@ -320,13 +282,29 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
+ const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+ if (ST.hasFusion())
+ DAG->addMutation(createAArch64MacroFusionDAGMutation());
return DAG;
}
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
+ if (ST.hasFusion()) {
+ // Run the Macro Fusion after RA again since literals are expanded from
+ // pseudos then (see addPreSched2()).
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createAArch64MacroFusionDAGMutation());
+ return DAG;
+ }
+
+ return nullptr;
+ }
+
void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
@@ -334,6 +312,7 @@ public:
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
+ void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
#endif
bool addILPOpts() override;
@@ -341,6 +320,8 @@ public:
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+
+ bool isGlobalISelEnabled() const override;
};
} // end anonymous namespace
@@ -352,13 +333,13 @@ TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
}
TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new AArch64PassConfig(this, PM);
+ return new AArch64PassConfig(*this, PM);
}
void AArch64PassConfig::addIRPasses() {
// Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
// ourselves.
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
@@ -370,14 +351,18 @@ void AArch64PassConfig::addIRPasses() {
//
// Run this before LSR to remove the multiplies involved in computing the
// pointer values N iterations ahead.
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
- addPass(createLoopDataPrefetchPass());
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ if (EnableLoopDataPrefetch)
+ addPass(createLoopDataPrefetchPass());
+ if (EnableFalkorHWPFFix)
+ addPass(createFalkorMarkStridedAccessesPass());
+ }
TargetPassConfig::addIRPasses();
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
@@ -410,9 +395,6 @@ bool AArch64PassConfig::addPreISel() {
addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize));
}
- if (TM->getOptLevel() != CodeGenOpt::None && EnableAddressTypePromotion)
- addPass(createAArch64AddressTypePromotionPass());
-
return false;
}
@@ -444,12 +426,22 @@ bool AArch64PassConfig::addRegBankSelect() {
return false;
}
+void AArch64PassConfig::addPreGlobalInstructionSelect() {
+ // Work around a deficiency of the fast register allocator.
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ addPass(new Localizer());
+}
+
bool AArch64PassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
return false;
}
#endif
+bool AArch64PassConfig::isGlobalISelEnabled() const {
+ return TM->getOptLevel() <= EnableGlobalISelAtO;
+}
+
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
@@ -457,6 +449,8 @@ bool AArch64PassConfig::addILPOpts() {
addPass(createAArch64ConditionalCompares());
if (EnableMCR)
addPass(&MachineCombinerID);
+ if (EnableCondBrTuning)
+ addPass(createAArch64CondBrTuning());
if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);
if (EnableStPairSuppress)
@@ -493,8 +487,12 @@ void AArch64PassConfig::addPreSched2() {
// Expand some pseudo instructions to allow proper scheduling.
addPass(createAArch64ExpandPseudoPass());
// Use load/store pair instructions when possible.
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
- addPass(createAArch64LoadStoreOptimizationPass());
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ if (EnableLoadStoreOpt)
+ addPass(createAArch64LoadStoreOptimizationPass());
+ if (EnableFalkorHWPFFix)
+ addPass(createFalkorHWPFFixPass());
+ }
}
void AArch64PassConfig::addPreEmitPass() {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 6fa5e83..85de02e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -21,6 +21,8 @@
namespace llvm {
+class AArch64RegisterBankInfo;
+
class AArch64TargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
@@ -34,6 +36,9 @@ public:
~AArch64TargetMachine() override;
const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
+ // The no-argument getSubtargetImpl, while it exists on some targets, is
+ // deprecated and should not be used.
+ const AArch64Subtarget *getSubtargetImpl() const = delete;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 8875f9b..4bc2c06 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -9,12 +9,12 @@
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetMachine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Dwarf.h"
using namespace llvm;
using namespace dwarf;
@@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
+
+void AArch64_MachoTargetObjectFile::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ // AArch64 does not use section-relative relocations, so any global symbol must
+ // be accessed via at least a linker-private symbol.
+ getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
index 05e1dfa..9077eb7 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -40,8 +40,14 @@ public:
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
+
+ void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const override;
};
+/// This implementation is used for AArch64 COFF targets.
+class AArch64_COFFTargetObjectFile : public TargetLoweringObjectFileCOFF {};
+
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8833e5..a76f080 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,8 +9,8 @@
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
@@ -20,6 +20,23 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64tti"
+static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
+ cl::init(true), cl::Hidden);
+
+bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // Inline a callee if its target-features are a subset of the caller's
+ // target-features.
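+ // For example (hypothetical feature sets): a caller compiled with
+ // {neon, crc, lse} can inline a callee requiring {neon, crc}, since
+ // (CallerBits & CalleeBits) == CalleeBits, but not one requiring {sve}.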
+ return (CallerBits & CalleeBits) == CalleeBits;
+}
+
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
@@ -176,10 +193,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+ ArrayRef<const Value *> Args) {
+
+ // A helper that returns a vector type from the given type. The number of
+ // elements in type Ty determines the vector width.
+ auto toVectorTy = [&](Type *ArgTy) {
+ return VectorType::get(ArgTy->getScalarType(),
+ DstTy->getVectorNumElements());
+ };
+
+ // Exit early if DstTy is not a vector type whose elements are at least
+ // 16 bits wide.
+ if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
+ return false;
+
+ // Determine if the operation has a widening variant. We consider both the
+ // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
+ // instructions.
+ //
+ // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+ // verify that their extending operands are eliminated during code
+ // generation.
+ switch (Opcode) {
+ case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
+ case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ break;
+ default:
+ return false;
+ }
+
+ // To be a widening instruction (either the "wide" or "long" versions), the
+ // second operand must be a sign- or zero-extend having a single user. We
+ // only consider extends having a single user because they may otherwise not
+ // be eliminated.
+ if (Args.size() != 2 ||
+ (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
+ !Args[1]->hasOneUse())
+ return false;
+ auto *Extend = cast<CastInst>(Args[1]);
+
+ // Legalize the destination type and ensure it can be used in a widening
+ // operation.
+ auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
+ unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
+ if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
+ return false;
+
+ // Legalize the source type and ensure it can be used in a widening
+ // operation.
+ Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
+ unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
+ if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
+ return false;
+
+ // Get the total number of vector elements in the legalized types.
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
+ unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+
+ // Return true if the legalized types have the same number of vector elements
+ // and the destination element type size is twice that of the source type.
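+ // For example, a <4 x i32> destination fed by a <4 x i16> extend satisfies
+ // this: 4 == 4 elements and 32 == 2 * 16 bits, matching saddl/uaddl and
+ // saddw/uaddw.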
+ return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // If the cast is observable, and it is used by a widening instruction (e.g.,
+ // uaddl, saddw, etc.), it may be free.
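+ // For example, in the IR below the sext is free because it feeds the second
+ // operand of an add that can be selected as saddw:
+ //   %e = sext <4 x i16> %x to <4 x i32>
+ //   %r = add <4 x i32> %y, %e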
+ if (I && I->hasOneUse()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
+ if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+ // If the cast is the second operand, it is free. We will generate either
+ // a "wide" or "long" version of the widening instruction.
+ if (I == SingleUser->getOperand(1))
+ return 0;
+ // If the cast is not the second operand, it will be free if it looks the
+ // same as the second operand. In this case, we will generate a "long"
+ // version of the widening instruction.
+ if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
+ if (I->getOpcode() == Cast->getOpcode() &&
+ cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
+ return 0;
+ }
+ }
+
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
@@ -378,6 +480,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
+ // add in the widening overhead specified by the sub-target. Since the
+ // extends feeding widening instructions are performed automatically, they
+ // aren't present in the generated code and have a zero cost. By adding a
+ // widening overhead here, we attach the total cost of the combined operation
+ // to the widening instruction.
+ int Cost = 0;
+ if (isWideningInstruction(Ty, Opcode, Args))
+ Cost += ST->getWideningBaseCost();
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (ISD == ISD::SDIV &&
@@ -387,9 +499,9 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the previous
// operation; conservatively assume OP_None.
- int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -404,8 +516,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -413,7 +525,7 @@ int AArch64TTIImpl::getArithmeticInstrCost(
case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
- return 1 * LT.first;
+ return (Cost + 1) * LT.first;
}
}
@@ -436,7 +548,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+ Type *CondTy, const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
@@ -463,11 +575,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
- unsigned Alignment, unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
@@ -505,12 +618,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
- Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
- return Factor;
+ // Accesses having vector types that are a multiple of 128 bits can be
+ // matched to more than one ldN/stN instruction.
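+ // For example, an interleaved access with a <8 x i64> vector type and
+ // Factor == 2 has a 256-bit <4 x i64> subvector type, which lowers to
+ // two ld2/st2 instructions (2 * 128 bits).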
+ if (NumElts % Factor == 0 &&
+ TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -533,10 +648,62 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return ST->getMaxInterleaveFactor();
}
-void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+// For Falkor, we want to avoid having too many strided loads in a loop since
+// that can exhaust the HW prefetcher resources. We adjust the unroller
+// MaxCount preference below to attempt to ensure unrolling doesn't create too
+// many strided loads.
+static void
+getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ enum { MaxStridedLoads = 7 };
+ auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
+ int StridedLoads = 0;
+ // FIXME? We could make this more precise by looking at the CFG and
+ // e.g. not counting loads in each side of an if-then-else diamond.
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ LoadInst *LMemI = dyn_cast<LoadInst>(&I);
+ if (!LMemI)
+ continue;
+
+ Value *PtrValue = LMemI->getPointerOperand();
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE.getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+ continue;
+
+ // FIXME? We could take pairing of unrolled load copies into account
+ // by looking at the AddRec, but we would probably have to limit this
+ // to loops with no stores or other memory optimization barriers.
+ ++StridedLoads;
+ // We've seen enough strided loads that seeing more won't make a
+ // difference.
+ if (StridedLoads > MaxStridedLoads / 2)
+ return StridedLoads;
+ }
+ }
+ return StridedLoads;
+ };
+
+ int StridedLoads = countStridedLoads(L, SE);
+ DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+ << " strided loads\n");
+ // Pick the largest power of 2 unroll count that won't result in too many
+ // strided loads.
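+ // For example, with MaxStridedLoads = 7 and StridedLoads = 3, the ratio is
+ // 7 / 3 = 2 and Log2_32(2) = 1, so MaxCount becomes 1 << 1 = 2.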
+ if (StridedLoads) {
+ UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
+ DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
+ << '\n');
+ }
+}
+
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Enable partial unrolling and runtime unrolling.
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
// For inner loop, it is more likely to be a hot one, and the runtime check
// can be promoted out from LICM pass, so the overhead is less, let's try
@@ -546,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+
+ if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+ EnableFalkorHWPFUnrollFix)
+ getFalkorUnrollingPreferences(L, SE, UP);
}
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
@@ -594,8 +765,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
case Intrinsic::aarch64_neon_st2:
@@ -603,8 +772,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
}
@@ -628,6 +795,38 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
return true;
}
+/// See if \p I should be considered for address type promotion. We check if \p
+/// I is a sext with the right type that is used in memory accesses. If it is
+/// used in a "complex" getelementptr, we allow it to be promoted without
+/// finding other sext instructions that sign-extended the same initial value.
+/// A getelementptr is considered "complex" if it has more than 2 operands.
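+/// For example, "getelementptr i32, i32* %base, i64 %idx" has only 2 operands,
+/// while "getelementptr [8 x i32], [8 x i32]* %a, i64 %i, i64 %j" has 3 and is
+/// treated as complex.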
+bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
+ const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
+ bool Considerable = false;
+ AllowPromotionWithoutCommonHeader = false;
+ if (!isa<SExtInst>(&I))
+ return false;
+ Type *ConsideredSExtType =
+ Type::getInt64Ty(I.getParent()->getParent()->getContext());
+ if (I.getType() != ConsideredSExtType)
+ return false;
+ // See if the sext is the one with the right type and used in at least one
+ // GetElementPtrInst.
+ for (const User *U : I.users()) {
+ if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
+ Considerable = true;
+ // A getelementptr is considered as "complex" if it has more than 2
+ // operands. We will promote a SExt used in such complex GEP as we
+ // expect some computation to be merged if they are done on 64 bits.
+ if (GEPInst->getNumOperands() > 2) {
+ AllowPromotionWithoutCommonHeader = true;
+ break;
+ }
+ }
+ }
+ return Considerable;
+}
+
unsigned AArch64TTIImpl::getCacheLineSize() {
return ST->getCacheLineSize();
}
@@ -643,3 +842,28 @@ unsigned AArch64TTIImpl::getMinPrefetchStride() {
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
return ST->getMaxPrefetchIterationsAhead();
}
+
+bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ switch (Opcode) {
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Mul:
+ return false;
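+ // E.g., an add reduction of <4 x i32> (4 * 32 == 128 bits) fills a full
+ // 128-bit vector register and can use the ADDV instruction.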
+ case Instruction::Add:
+ return ScalarBits * Ty->getVectorNumElements() >= 128;
+ case Instruction::ICmp:
+ return (ScalarBits < 64) &&
+ (ScalarBits * Ty->getVectorNumElements() >= 128);
+ case Instruction::FCmp:
+ return Flags.NoNaN;
+ default:
+ llvm_unreachable("Unhandled reduction opcode");
+ }
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 18287ed..31c0373 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -34,10 +34,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
const AArch64Subtarget *ST;
const AArch64TargetLowering *TLI;
- /// Estimate the overhead of scalarizing an instruction. Insert and Extract
- /// are set if the result needs to be inserted and/or extracted from vectors.
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
const AArch64Subtarget *getST() const { return ST; }
const AArch64TargetLowering *getTLI() const { return TLI; }
@@ -47,11 +43,17 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
VECTOR_LDST_FOUR_ELEMENTS
};
+ bool isWideningInstruction(Type *Ty, unsigned Opcode,
+ ArrayRef<const Value *> Args);
+
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+
/// \name Scalar TTI Implementations
/// @{
@@ -79,7 +81,7 @@ public:
return 31;
}
- unsigned getRegisterBitWidth(bool Vector) {
+ unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasNEON())
return 128;
@@ -88,9 +90,14 @@ public:
return 64;
}
+ unsigned getMinVectorRegisterBitWidth() {
+ return ST->getMinVectorRegisterBitWidth();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
@@ -107,14 +114,16 @@ public:
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType);
@@ -125,6 +134,10 @@ public:
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace);
+ bool
+ shouldConsiderAddressTypePromotion(const Instruction &I,
+ bool &AllowPromotionWithoutCommonHeader);
+
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
@@ -132,6 +145,13 @@ public:
unsigned getMinPrefetchStride();
unsigned getMaxPrefetchIterationsAhead();
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return false;
+ }
+
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
/// @}
};
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
index e3b1d7c..f53af23 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -19,13 +19,27 @@
// is rewritten into
// dup v3.4s, v2.s[1]
// fmla v0.4s, v1.4s, v3.4s
+//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
using namespace llvm;
@@ -41,14 +55,15 @@ namespace {
struct AArch64VectorByElementOpt : public MachineFunctionPass {
static char ID;
- AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
- initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
- }
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
TargetSchedModel SchedModel;
+ AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+ initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+ }
+
/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the two instructions InstDescRep1
/// and InstDescRep2.
@@ -90,8 +105,10 @@ struct AArch64VectorByElementOpt : public MachineFunctionPass {
return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
}
};
+
char AArch64VectorByElementOpt::ID = 0;
-} // namespace
+
+} // end anonymous namespace
INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index b86a283..a79d518 100644
--- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -15,8 +15,8 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -74,6 +74,7 @@ private:
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+ void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
@@ -85,7 +86,7 @@ private:
bool parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode);
- bool showMatchError(SMLoc Loc, unsigned ErrCode);
+ bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands);
bool parseDirectiveArch(SMLoc L);
bool parseDirectiveCPU(SMLoc L);
@@ -537,154 +538,15 @@ public:
return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
}
- bool isImm0_1() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 2);
- }
-
- bool isImm0_7() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 8);
- }
-
- bool isImm1_8() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val > 0 && Val < 9);
- }
-
- bool isImm0_15() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 16);
- }
-
- bool isImm1_16() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val > 0 && Val < 17);
- }
-
- bool isImm0_31() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 32);
- }
-
- bool isImm1_31() const {
+ template <int N, int M>
+ bool isImmInRange() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 32);
- }
-
- bool isImm1_32() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 33);
- }
-
- bool isImm0_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 64);
- }
-
- bool isImm1_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 64);
- }
-
- bool isImm1_64() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 65);
- }
-
- bool isImm0_127() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 128);
- }
-
- bool isImm0_255() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 256);
- }
-
- bool isImm0_65535() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 65536);
- }
-
- bool isImm32_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 32 && Val < 64);
+ return (Val >= N && Val <= M);
}
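The template above folds a dozen near-identical predicates into one. The bounds are inclusive on both ends, so every old exclusive upper bound drops by one at the instantiation site: the former isImm1_31 (Val >= 1 && Val < 32) becomes isImmInRange<1, 31>, isImm0_65535 becomes isImmInRange<0, 65535>, and so on. A standalone sketch of the same idea (the real version first unwraps an MCConstantExpr, as shown above):

    #include <cstdint>

    // Inclusive-range immediate check, mirroring isImmInRange<N, M> above.
    template <int N, int M> bool isInRange(int64_t Val) {
      static_assert(N <= M, "empty range");
      return Val >= N && Val <= M;
    }

    // Old predicate            -> new instantiation
    // isImm0_31  ([0, 32))     -> isInRange<0, 31>
    // isImm1_64  ([1, 65))     -> isInRange<1, 64>
    // isImm32_63 ([32, 64))    -> isInRange<32, 63>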
bool isLogicalImm32() const {
@@ -804,19 +666,8 @@ public:
return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
}
- bool isBranchTarget26() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return true;
- int64_t Val = MCE->getValue();
- if (Val & 0x3)
- return false;
- return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
- }
-
- bool isPCRelLabel19() const {
+ template<int N>
+ bool isBranchTarget() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
@@ -825,19 +676,8 @@ public:
int64_t Val = MCE->getValue();
if (Val & 0x3)
return false;
- return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
- }
-
- bool isBranchTarget14() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return true;
- int64_t Val = MCE->getValue();
- if (Val & 0x3)
- return false;
- return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+ assert(N > 0 && "Branch target immediate cannot be 0 bits!");
+ return (Val >= -((1<<(N-1)) << 2) && Val <= (((1<<(N-1))-1) << 2));
}
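isBranchTarget<N> captures the shared shape of the deleted checks: an N-bit, word-aligned, PC-relative byte offset spans [-(2^(N-1)) << 2, ((2^(N-1)) - 1) << 2]. Plugging in the old widths reproduces the deleted constants, e.g. N = 26 yields the isBranchTarget26 range (roughly +/-128 MiB) and N = 14 the isBranchTarget14 one. A small sketch checking the arithmetic:

    #include <cstdint>

    // Byte-offset bounds for an N-bit branch immediate whose low two bits
    // are implicit (4-byte alignment), as in the template above.
    constexpr int64_t branchMin(int N) { return -((int64_t{1} << (N - 1)) << 2); }
    constexpr int64_t branchMax(int N) { return ((int64_t{1} << (N - 1)) - 1) << 2; }

    static_assert(branchMin(26) == -(0x2000000LL << 2), "old isBranchTarget26 min");
    static_assert(branchMax(26) == (0x1ffffffLL << 2), "old isBranchTarget26 max");
    static_assert(branchMin(19) == -(0x40000LL << 2), "old isPCRelLabel19 min");
    static_assert(branchMax(14) == (0x1fffLL << 2), "old isBranchTarget14 max");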
bool
@@ -2260,27 +2100,9 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
bool isNegative = parseOptionalToken(AsmToken::Minus);
const AsmToken &Tok = Parser.getTok();
- if (Tok.is(AsmToken::Real)) {
- APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
- if (isNegative)
- RealVal.changeSign();
-
- uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
- int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
- Parser.Lex(); // Eat the token.
- // Check for out of range values. As an exception, we let Zero through,
- // as we handle that special case in post-processing before matching in
- // order to use the zero register for it.
- if (Val == -1 && !RealVal.isPosZero()) {
- TokError("expected compatible register or floating-point constant");
- return MatchOperand_ParseFail;
- }
- Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
- return MatchOperand_Success;
- }
- if (Tok.is(AsmToken::Integer)) {
+ if (Tok.is(AsmToken::Real) || Tok.is(AsmToken::Integer)) {
int64_t Val;
- if (!isNegative && Tok.getString().startswith("0x")) {
+ if (Tok.is(AsmToken::Integer) && !isNegative && Tok.getString().startswith("0x")) {
Val = Tok.getIntVal();
if (Val > 255 || Val < 0) {
TokError("encoded floating point value out of range");
@@ -2288,10 +2110,24 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
}
} else {
APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+ if (isNegative)
+ RealVal.changeSign();
+
uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
- // If we had a '-' in front, toggle the sign bit.
- IntVal ^= (uint64_t)isNegative << 63;
Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+
+ // Check for out of range values. As an exception, we let Zero through,
+ // but as tokens instead of an FPImm so that it can be matched by the
+ // appropriate alias if one exists.
+ if (RealVal.isPosZero()) {
+ Parser.Lex(); // Eat the token.
+ Operands.push_back(AArch64Operand::CreateToken("#0", false, S, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(".0", false, S, getContext()));
+ return MatchOperand_Success;
+ } else if (Val == -1) {
+ TokError("expected compatible register or floating-point constant");
+ return MatchOperand_ParseFail;
+ }
}
Parser.Lex(); // Eat the token.
Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
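After this rewrite the Real and Integer token paths share one body: a hex "0x" literal is taken as the raw 8-bit encoding directly, and anything else is parsed as an IEEE double and passed to getFP64Imm, which the code above relies on to return -1 when the value has no AdvSIMD 8-bit encoding. Positive zero is deliberately re-emitted as the tokens "#0" and ".0" rather than an FPImm, so a zero-register alias can match it later. A control-flow sketch with the LLVM types stubbed out (illustrative; getFP64ImmStub stands in for AArch64_AM::getFP64Imm):

    #include <cmath>
    #include <cstdint>

    int getFP64ImmStub(double D); // stand-in: 8-bit encoding, or -1 if none

    enum class FPImmKind { Encoded, ZeroTokens, NotEncodable };

    FPImmKind classifyFPImm(bool IsNegative, double D, int64_t &Val) {
      if (IsNegative)
        D = -D;
      Val = getFP64ImmStub(D);
      if (D == 0.0 && !std::signbit(D))
        return FPImmKind::ZeroTokens;   // emit "#0" and ".0" tokens instead
      if (Val == -1)
        return FPImmKind::NotEncodable; // diagnose to the user
      return FPImmKind::Encoded;        // becomes an FPImm operand
    }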
@@ -2494,6 +2330,35 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
return MatchOperand_Success;
}
+static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
+ if (FBS[AArch64::HasV8_1aOps])
+ Str += "ARMv8.1a";
+ else if (FBS[AArch64::HasV8_2aOps])
+ Str += "ARMv8.2a";
+ else
+ Str += "(unknown)";
+}
+
+void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands,
+ SMLoc S) {
+ const uint16_t Op2 = Encoding & 7;
+ const uint16_t Cm = (Encoding & 0x78) >> 3;
+ const uint16_t Cn = (Encoding & 0x780) >> 7;
+ const uint16_t Op1 = (Encoding & 0x3800) >> 11;
+
+ const MCExpr *Expr = MCConstantExpr::create(Op1, getContext());
+
+ Operands.push_back(
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));
+ Expr = MCConstantExpr::create(Op2, getContext());
+ Operands.push_back(
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+}
+
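createSysAlias and the printer change further down agree on one 14-bit packing for SYS aliases: op2 in bits [2:0], CRm in [6:3], CRn in [10:7], op1 in [13:11]. A round-trip sketch of the packing (the "ic ivau" encoding, SYS #3, C7, C5, #1, comes from the deleted table below):

    #include <cstdint>

    // Pack side, as in printSysAlias; the masks in createSysAlias above
    // are the exact inverse.
    constexpr uint16_t packSys(uint16_t Op1, uint16_t Cn, uint16_t Cm,
                               uint16_t Op2) {
      return Op2 | (Cm << 3) | (Cn << 7) | (Op1 << 11);
    }

    static_assert(packSys(3, 7, 5, 1) ==
                      ((3 << 11) | (7 << 7) | (5 << 3) | 1),
                  "ic ivau round-trips");
    // Unpack: Op2 = E & 7; Cm = (E >> 3) & 0xf; Cn = (E >> 7) & 0xf;
    //         Op1 = (E >> 11) & 7;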
/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
@@ -2510,228 +2375,48 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
StringRef Op = Tok.getString();
SMLoc S = Tok.getLoc();
- const MCExpr *Expr = nullptr;
-
-#define SYS_ALIAS(op1, Cn, Cm, op2) \
- do { \
- Expr = MCConstantExpr::create(op1, getContext()); \
- Operands.push_back( \
- AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
- Operands.push_back( \
- AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \
- Operands.push_back( \
- AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \
- Expr = MCConstantExpr::create(op2, getContext()); \
- Operands.push_back( \
- AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
- } while (false)
-
if (Mnemonic == "ic") {
- if (!Op.compare_lower("ialluis")) {
- // SYS #0, C7, C1, #0
- SYS_ALIAS(0, 7, 1, 0);
- } else if (!Op.compare_lower("iallu")) {
- // SYS #0, C7, C5, #0
- SYS_ALIAS(0, 7, 5, 0);
- } else if (!Op.compare_lower("ivau")) {
- // SYS #3, C7, C5, #1
- SYS_ALIAS(3, 7, 5, 1);
- } else {
+ const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op);
+ if (!IC)
return TokError("invalid operand for IC instruction");
+ else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("IC " + std::string(IC->Name) + " requires ");
+ setRequiredFeatureString(IC->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(IC->Encoding, Operands, S);
} else if (Mnemonic == "dc") {
- if (!Op.compare_lower("zva")) {
- // SYS #3, C7, C4, #1
- SYS_ALIAS(3, 7, 4, 1);
- } else if (!Op.compare_lower("ivac")) {
- // SYS #3, C7, C6, #1
- SYS_ALIAS(0, 7, 6, 1);
- } else if (!Op.compare_lower("isw")) {
- // SYS #0, C7, C6, #2
- SYS_ALIAS(0, 7, 6, 2);
- } else if (!Op.compare_lower("cvac")) {
- // SYS #3, C7, C10, #1
- SYS_ALIAS(3, 7, 10, 1);
- } else if (!Op.compare_lower("csw")) {
- // SYS #0, C7, C10, #2
- SYS_ALIAS(0, 7, 10, 2);
- } else if (!Op.compare_lower("cvau")) {
- // SYS #3, C7, C11, #1
- SYS_ALIAS(3, 7, 11, 1);
- } else if (!Op.compare_lower("civac")) {
- // SYS #3, C7, C14, #1
- SYS_ALIAS(3, 7, 14, 1);
- } else if (!Op.compare_lower("cisw")) {
- // SYS #0, C7, C14, #2
- SYS_ALIAS(0, 7, 14, 2);
- } else if (!Op.compare_lower("cvap")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #3, C7, C12, #1
- SYS_ALIAS(3, 7, 12, 1);
- } else {
- return TokError("DC CVAP requires ARMv8.2a");
- }
- } else {
+ const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op);
+ if (!DC)
return TokError("invalid operand for DC instruction");
+ else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("DC " + std::string(DC->Name) + " requires ");
+ setRequiredFeatureString(DC->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(DC->Encoding, Operands, S);
} else if (Mnemonic == "at") {
- if (!Op.compare_lower("s1e1r")) {
- // SYS #0, C7, C8, #0
- SYS_ALIAS(0, 7, 8, 0);
- } else if (!Op.compare_lower("s1e2r")) {
- // SYS #4, C7, C8, #0
- SYS_ALIAS(4, 7, 8, 0);
- } else if (!Op.compare_lower("s1e3r")) {
- // SYS #6, C7, C8, #0
- SYS_ALIAS(6, 7, 8, 0);
- } else if (!Op.compare_lower("s1e1w")) {
- // SYS #0, C7, C8, #1
- SYS_ALIAS(0, 7, 8, 1);
- } else if (!Op.compare_lower("s1e2w")) {
- // SYS #4, C7, C8, #1
- SYS_ALIAS(4, 7, 8, 1);
- } else if (!Op.compare_lower("s1e3w")) {
- // SYS #6, C7, C8, #1
- SYS_ALIAS(6, 7, 8, 1);
- } else if (!Op.compare_lower("s1e0r")) {
- // SYS #0, C7, C8, #3
- SYS_ALIAS(0, 7, 8, 2);
- } else if (!Op.compare_lower("s1e0w")) {
- // SYS #0, C7, C8, #3
- SYS_ALIAS(0, 7, 8, 3);
- } else if (!Op.compare_lower("s12e1r")) {
- // SYS #4, C7, C8, #4
- SYS_ALIAS(4, 7, 8, 4);
- } else if (!Op.compare_lower("s12e1w")) {
- // SYS #4, C7, C8, #5
- SYS_ALIAS(4, 7, 8, 5);
- } else if (!Op.compare_lower("s12e0r")) {
- // SYS #4, C7, C8, #6
- SYS_ALIAS(4, 7, 8, 6);
- } else if (!Op.compare_lower("s12e0w")) {
- // SYS #4, C7, C8, #7
- SYS_ALIAS(4, 7, 8, 7);
- } else if (!Op.compare_lower("s1e1rp")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #0, C7, C9, #0
- SYS_ALIAS(0, 7, 9, 0);
- } else {
- return TokError("AT S1E1RP requires ARMv8.2a");
- }
- } else if (!Op.compare_lower("s1e1wp")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #0, C7, C9, #1
- SYS_ALIAS(0, 7, 9, 1);
- } else {
- return TokError("AT S1E1WP requires ARMv8.2a");
- }
- } else {
+ const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op);
+ if (!AT)
return TokError("invalid operand for AT instruction");
+ else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("AT " + std::string(AT->Name) + " requires ");
+ setRequiredFeatureString(AT->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(AT->Encoding, Operands, S);
} else if (Mnemonic == "tlbi") {
- if (!Op.compare_lower("vmalle1is")) {
- // SYS #0, C8, C3, #0
- SYS_ALIAS(0, 8, 3, 0);
- } else if (!Op.compare_lower("alle2is")) {
- // SYS #4, C8, C3, #0
- SYS_ALIAS(4, 8, 3, 0);
- } else if (!Op.compare_lower("alle3is")) {
- // SYS #6, C8, C3, #0
- SYS_ALIAS(6, 8, 3, 0);
- } else if (!Op.compare_lower("vae1is")) {
- // SYS #0, C8, C3, #1
- SYS_ALIAS(0, 8, 3, 1);
- } else if (!Op.compare_lower("vae2is")) {
- // SYS #4, C8, C3, #1
- SYS_ALIAS(4, 8, 3, 1);
- } else if (!Op.compare_lower("vae3is")) {
- // SYS #6, C8, C3, #1
- SYS_ALIAS(6, 8, 3, 1);
- } else if (!Op.compare_lower("aside1is")) {
- // SYS #0, C8, C3, #2
- SYS_ALIAS(0, 8, 3, 2);
- } else if (!Op.compare_lower("vaae1is")) {
- // SYS #0, C8, C3, #3
- SYS_ALIAS(0, 8, 3, 3);
- } else if (!Op.compare_lower("alle1is")) {
- // SYS #4, C8, C3, #4
- SYS_ALIAS(4, 8, 3, 4);
- } else if (!Op.compare_lower("vale1is")) {
- // SYS #0, C8, C3, #5
- SYS_ALIAS(0, 8, 3, 5);
- } else if (!Op.compare_lower("vaale1is")) {
- // SYS #0, C8, C3, #7
- SYS_ALIAS(0, 8, 3, 7);
- } else if (!Op.compare_lower("vmalle1")) {
- // SYS #0, C8, C7, #0
- SYS_ALIAS(0, 8, 7, 0);
- } else if (!Op.compare_lower("alle2")) {
- // SYS #4, C8, C7, #0
- SYS_ALIAS(4, 8, 7, 0);
- } else if (!Op.compare_lower("vale2is")) {
- // SYS #4, C8, C3, #5
- SYS_ALIAS(4, 8, 3, 5);
- } else if (!Op.compare_lower("vale3is")) {
- // SYS #6, C8, C3, #5
- SYS_ALIAS(6, 8, 3, 5);
- } else if (!Op.compare_lower("alle3")) {
- // SYS #6, C8, C7, #0
- SYS_ALIAS(6, 8, 7, 0);
- } else if (!Op.compare_lower("vae1")) {
- // SYS #0, C8, C7, #1
- SYS_ALIAS(0, 8, 7, 1);
- } else if (!Op.compare_lower("vae2")) {
- // SYS #4, C8, C7, #1
- SYS_ALIAS(4, 8, 7, 1);
- } else if (!Op.compare_lower("vae3")) {
- // SYS #6, C8, C7, #1
- SYS_ALIAS(6, 8, 7, 1);
- } else if (!Op.compare_lower("aside1")) {
- // SYS #0, C8, C7, #2
- SYS_ALIAS(0, 8, 7, 2);
- } else if (!Op.compare_lower("vaae1")) {
- // SYS #0, C8, C7, #3
- SYS_ALIAS(0, 8, 7, 3);
- } else if (!Op.compare_lower("alle1")) {
- // SYS #4, C8, C7, #4
- SYS_ALIAS(4, 8, 7, 4);
- } else if (!Op.compare_lower("vale1")) {
- // SYS #0, C8, C7, #5
- SYS_ALIAS(0, 8, 7, 5);
- } else if (!Op.compare_lower("vale2")) {
- // SYS #4, C8, C7, #5
- SYS_ALIAS(4, 8, 7, 5);
- } else if (!Op.compare_lower("vale3")) {
- // SYS #6, C8, C7, #5
- SYS_ALIAS(6, 8, 7, 5);
- } else if (!Op.compare_lower("vaale1")) {
- // SYS #0, C8, C7, #7
- SYS_ALIAS(0, 8, 7, 7);
- } else if (!Op.compare_lower("ipas2e1")) {
- // SYS #4, C8, C4, #1
- SYS_ALIAS(4, 8, 4, 1);
- } else if (!Op.compare_lower("ipas2le1")) {
- // SYS #4, C8, C4, #5
- SYS_ALIAS(4, 8, 4, 5);
- } else if (!Op.compare_lower("ipas2e1is")) {
- // SYS #4, C8, C4, #1
- SYS_ALIAS(4, 8, 0, 1);
- } else if (!Op.compare_lower("ipas2le1is")) {
- // SYS #4, C8, C4, #5
- SYS_ALIAS(4, 8, 0, 5);
- } else if (!Op.compare_lower("vmalls12e1")) {
- // SYS #4, C8, C7, #6
- SYS_ALIAS(4, 8, 7, 6);
- } else if (!Op.compare_lower("vmalls12e1is")) {
- // SYS #4, C8, C3, #6
- SYS_ALIAS(4, 8, 3, 6);
- } else {
+ const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op);
+ if (!TLBI)
return TokError("invalid operand for TLBI instruction");
+ else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
+ setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(TLBI->Encoding, Operands, S);
}
-#undef SYS_ALIAS
-
Parser.Lex(); // Eat operand.
bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
@@ -2744,12 +2429,10 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
HasRegister = true;
}
- if (ExpectRegister && !HasRegister) {
+ if (ExpectRegister && !HasRegister)
return TokError("specified " + Mnemonic + " op requires a register");
- }
- else if (!ExpectRegister && HasRegister) {
+ else if (!ExpectRegister && HasRegister)
return TokError("specified " + Mnemonic + " op does not use a register");
- }
if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
return true;
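All four mnemonics now follow the same shape: a tablegen'ed lookup by name, a feature check that names the missing architecture level, then createSysAlias on the entry's encoding; the register requirement is still inferred from the operand name (ops containing "all" act on everything and take no register). The repeated branch could be written once, roughly like this (an illustrative helper, not part of the patch; Entry stands in for the AArch64IC::IC / DC / AT / TLBI record types):

    template <typename Entry>
    bool checkSysAliasEntry(const Entry *E, StringRef Mnemonic,
                            const FeatureBitset &Features, std::string &Err) {
      if (!E) {
        Err = "invalid operand for " + Mnemonic.upper() + " instruction";
        return false;
      }
      if (!E->haveFeatures(Features)) {
        Err = Mnemonic.upper() + " " + std::string(E->Name) + " requires ";
        setRequiredFeatureString(E->getRequiredFeatures(), Err);
        return false;
      }
      return true; // caller then calls createSysAlias(E->Encoding, ...)
    }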
@@ -2790,16 +2473,14 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- auto DB = AArch64DB::lookupDBByName(Tok.getString());
- if (!DB) {
- TokError("invalid barrier option name");
- return MatchOperand_ParseFail;
- }
-
// The only valid named option for ISB is 'sy'
- if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) {
+ auto DB = AArch64DB::lookupDBByName(Tok.getString());
+ if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) {
TokError("'sy' or #imm operand expected");
return MatchOperand_ParseFail;
+ } else if (!DB) {
+ TokError("invalid barrier option name");
+ return MatchOperand_ParseFail;
}
Operands.push_back(AArch64Operand::CreateBarrier(
@@ -2884,7 +2565,6 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
/// parseRegister - Parse a non-vector register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
// Try for a vector register.
if (!tryParseVectorRegister(Operands))
@@ -2897,30 +2577,6 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
- // A small number of instructions (FMOVXDhighr, for example) have "[1]"
- // as a string token in the instruction itself.
- SMLoc LBracS = getLoc();
- const AsmToken &Tok = Parser.getTok();
- if (parseOptionalToken(AsmToken::LBrac)) {
- if (Tok.is(AsmToken::Integer)) {
- SMLoc IntS = getLoc();
- int64_t Val = Tok.getIntVal();
- if (Val == 1) {
- Parser.Lex();
- SMLoc RBracS = getLoc();
- if (parseOptionalToken(AsmToken::RBrac)) {
- Operands.push_back(
- AArch64Operand::CreateToken("[", false, LBracS, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken("1", false, IntS, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken("]", false, RBracS, getContext()));
- return false;
- }
- }
- }
- }
-
return false;
}
@@ -3601,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
}
}
-bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS);
+
+bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
+ OperandVector &Operands) {
switch (ErrCode) {
case Match_MissingFeature:
return Error(Loc,
@@ -3696,6 +3355,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
return Error(Loc, "immediate must be an integer in range [0, 63].");
case Match_InvalidImm0_127:
return Error(Loc, "immediate must be an integer in range [0, 127].");
+ case Match_InvalidImm0_255:
+ return Error(Loc, "immediate must be an integer in range [0, 255].");
case Match_InvalidImm0_65535:
return Error(Loc, "immediate must be an integer in range [0, 65535].");
case Match_InvalidImm1_8:
@@ -3722,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
return Error(Loc, "expected readable system register");
case Match_MSR:
return Error(Loc, "expected writable system register or pstate");
- case Match_MnemonicFail:
- return Error(Loc, "unrecognized instruction mnemonic");
+ case Match_MnemonicFail: {
+ std::string Suggestion = AArch64MnemonicSpellCheck(
+ ((AArch64Operand &)*Operands[0]).getToken(),
+ ComputeAvailableFeatures(STI->getFeatureBits()));
+ return Error(Loc, "unrecognized instruction mnemonic" + Suggestion);
+ }
default:
llvm_unreachable("unexpected error code!");
}
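Match_MnemonicFail now appends a suggestion from AArch64MnemonicSpellCheck, a tablegen-emitted helper (hence the bare declaration above) that also filters candidates by the available features. A hedged sketch of an edit-distance suggester in the same spirit (illustrative; the generated one differs in detail):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/Twine.h"
    #include <string>

    std::string suggestMnemonic(llvm::StringRef Bad,
                                llvm::ArrayRef<llvm::StringRef> Known) {
      llvm::StringRef Best;
      unsigned BestDist = 4; // cap: don't suggest wildly different names
      for (llvm::StringRef K : Known) {
        unsigned D = Bad.edit_distance(K);
        if (D < BestDist) {
          BestDist = D;
          Best = K;
        }
      }
      if (Best.empty())
        return std::string();
      return (llvm::Twine(", did you mean '") + Best + "'?").str();
    }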
@@ -3991,21 +3656,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
}
- // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
- if (NumOperands == 3 && Tok == "fmov") {
- AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]);
- AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
- if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
- unsigned zreg =
- !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(
- RegOp.getReg())
- ? AArch64::WZR
- : AArch64::XZR;
- Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(),
- Op.getEndLoc(), getContext());
- }
- }
-
MCInst Inst;
// First try to match against the secondary set of tables containing the
// short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
@@ -4064,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, Msg);
}
case Match_MnemonicFail:
- return showMatchError(IDLoc, MatchResult);
+ return showMatchError(IDLoc, MatchResult, Operands);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
@@ -4083,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
MatchResult = Match_InvalidSuffix;
- return showMatchError(ErrorLoc, MatchResult);
+ return showMatchError(ErrorLoc, MatchResult, Operands);
}
case Match_InvalidMemoryIndexed1:
case Match_InvalidMemoryIndexed2:
@@ -4120,6 +3770,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidImm0_31:
case Match_InvalidImm0_63:
case Match_InvalidImm0_127:
+ case Match_InvalidImm0_255:
case Match_InvalidImm0_65535:
case Match_InvalidImm1_8:
case Match_InvalidImm1_16:
@@ -4140,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
- return showMatchError(ErrorLoc, MatchResult);
+ return showMatchError(ErrorLoc, MatchResult, Operands);
}
}
@@ -4260,10 +3911,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
return false;
}
+static SMLoc incrementLoc(SMLoc L, int Offset) {
+ return SMLoc::getFromPointer(L.getPointer() + Offset);
+}
+
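incrementLoc exists because SMLoc is just a pointer into the source buffer, so walking a ".cpu name+ext1+ext2" operand is plain pointer arithmetic. The loop below steps past the CPU name, then past each '+' and extension name, so the new "unsupported architectural extension" diagnostic can point at the exact offending token:

    // Cursor movement over ".cpu cortex-a57+crypto+bogus" (names illustrative):
    //
    //   cortex-a57+crypto+bogus
    //   ^ CurLoc = getLoc()
    //              ^ after incrementLoc(CurLoc, CPU.size()) and +1 for '+'
    //                     ^ after incrementLoc(CurLoc, Name.size()) and +1
    //                       ... an error here underlines "bogus"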
/// parseDirectiveCPU
/// ::= .cpu id
bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
- SMLoc CPULoc = getLoc();
+ SMLoc CurLoc = getLoc();
StringRef CPU, ExtensionString;
std::tie(CPU, ExtensionString) =
@@ -4279,15 +3934,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
// FIXME This is using tablegen data, but should be moved to ARMTargetParser
// once that is tablegen'ed
if (!getSTI().isCPUStringValid(CPU)) {
- Error(CPULoc, "unknown CPU name");
+ Error(CurLoc, "unknown CPU name");
return false;
}
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures(CPU, "");
+ CurLoc = incrementLoc(CurLoc, CPU.size());
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
+ // Advance source location past '+'.
+ CurLoc = incrementLoc(CurLoc, 1);
+
bool EnableFeature = true;
if (Name.startswith_lower("no")) {
@@ -4295,6 +3954,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
Name = Name.substr(2);
}
+ bool FoundExtension = false;
for (const auto &Extension : ExtensionMap) {
if (Extension.Name != Name)
continue;
@@ -4308,9 +3968,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
uint64_t Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
+ FoundExtension = true;
break;
}
+
+ if (!FoundExtension)
+ Error(CurLoc, "unsupported architectural extension");
+
+ CurLoc = incrementLoc(CurLoc, Name.size());
}
return false;
}
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 0d860a7..7870dce 100644
--- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -756,7 +756,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
// if shift == '11' then ReservedValue()
if (shiftHi == 0x3)
return Fail;
- // Deliberate fallthrough
+ LLVM_FALLTHROUGH;
case AArch64::ANDWrs:
case AArch64::ANDSWrs:
case AArch64::BICWrs:
@@ -780,7 +780,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
// if shift == '11' then ReservedValue()
if (shiftHi == 0x3)
return Fail;
- // Deliberate fallthrough
+ LLVM_FALLTHROUGH;
case AArch64::ANDXrs:
case AArch64::ANDSXrs:
case AArch64::BICXrs:
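Replacing the "// Deliberate fallthrough" comments with LLVM_FALLTHROUGH lets compilers verify the intent: the macro expands to [[fallthrough]] (or a vendor equivalent) where supported and to nothing otherwise, keeping -Wimplicit-fallthrough quiet only at annotated sites. A minimal usage sketch (predicate name illustrative):

    #include "llvm/Support/Compiler.h"

    bool reservedShift(unsigned ShiftHi); // illustrative predicate

    int decode(unsigned Opcode, unsigned ShiftHi) {
      switch (Opcode) {
      case 0: // shifted-register form needs an extra reserved-value check
        if (reservedShift(ShiftHi))
          return -1;
        LLVM_FALLTHROUGH; // deliberate: shares the common handling below
      case 1:
        return 1;
      default:
        return 0;
      }
    }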
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index b4f8520..fc89657 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,12 +16,21 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -267,6 +276,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
}
+ if (Opcode == AArch64::CompilerBarrier) {
+ O << '\t' << MAI.getCommentString() << " COMPILER BARRIER";
+ printAnnotation(O, Annot);
+ return;
+ }
+
if (!printAliasInstr(MI, STI, O))
printInstruction(MI, STI, O);
@@ -451,8 +466,8 @@ static const LdStNInstrDesc LdStNInstInfo[] = {
{ AArch64::LD3i64, "ld3", ".d", 1, true, 0 },
{ AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 },
{ AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 },
- { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
- { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
+ { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
+ { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
{ AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 },
{ AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 },
{ AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 },
@@ -731,7 +746,6 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
#endif
- const char *Asm = nullptr;
const MCOperand &Op1 = MI->getOperand(0);
const MCOperand &Cn = MI->getOperand(1);
const MCOperand &Cm = MI->getOperand(2);
@@ -742,230 +756,74 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
unsigned CmVal = Cm.getImm();
unsigned Op2Val = Op2.getImm();
+ uint16_t Encoding = Op2Val;
+ Encoding |= CmVal << 3;
+ Encoding |= CnVal << 7;
+ Encoding |= Op1Val << 11;
+
+ bool NeedsReg;
+ std::string Ins;
+ std::string Name;
+
if (CnVal == 7) {
switch (CmVal) {
- default:
- break;
-
+ default: return false;
// IC aliases
- case 1:
- if (Op1Val == 0 && Op2Val == 0)
- Asm = "ic\tialluis";
- break;
- case 5:
- if (Op1Val == 0 && Op2Val == 0)
- Asm = "ic\tiallu";
- else if (Op1Val == 3 && Op2Val == 1)
- Asm = "ic\tivau";
- break;
-
+ case 1: case 5: {
+ const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding);
+ if (!IC || !IC->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = IC->NeedsReg;
+ Ins = "ic\t";
+ Name = std::string(IC->Name);
+ }
+ break;
// DC aliases
- case 4:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tzva";
- break;
- case 6:
- if (Op1Val == 0 && Op2Val == 1)
- Asm = "dc\tivac";
- if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tisw";
- break;
- case 10:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcvac";
- else if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tcsw";
- break;
- case 11:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcvau";
- break;
- case 12:
- if (Op1Val == 3 && Op2Val == 1 &&
- (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
- Asm = "dc\tcvap";
- break;
- case 14:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcivac";
- else if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tcisw";
- break;
-
+ case 4: case 6: case 10: case 11: case 12: case 14:
+ {
+ const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding);
+ if (!DC || !DC->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = true;
+ Ins = "dc\t";
+ Name = std::string(DC->Name);
+ }
+ break;
// AT aliases
- case 8:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e1r"; break;
- case 1: Asm = "at\ts1e1w"; break;
- case 2: Asm = "at\ts1e0r"; break;
- case 3: Asm = "at\ts1e0w"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e2r"; break;
- case 1: Asm = "at\ts1e2w"; break;
- case 4: Asm = "at\ts12e1r"; break;
- case 5: Asm = "at\ts12e1w"; break;
- case 6: Asm = "at\ts12e0r"; break;
- case 7: Asm = "at\ts12e0w"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e3r"; break;
- case 1: Asm = "at\ts1e3w"; break;
- }
- break;
- }
- break;
- case 9:
- switch (Op1Val) {
- default:
- break;
- case 0:
- if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e1rp"; break;
- case 1: Asm = "at\ts1e1wp"; break;
- }
- }
- break;
- }
+ case 8: case 9: {
+ const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding);
+ if (!AT || !AT->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = true;
+ Ins = "at\t";
+ Name = std::string(AT->Name);
+ }
+ break;
}
} else if (CnVal == 8) {
// TLBI aliases
- switch (CmVal) {
- default:
- break;
- case 3:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\tvmalle1is"; break;
- case 1: Asm = "tlbi\tvae1is"; break;
- case 2: Asm = "tlbi\taside1is"; break;
- case 3: Asm = "tlbi\tvaae1is"; break;
- case 5: Asm = "tlbi\tvale1is"; break;
- case 7: Asm = "tlbi\tvaale1is"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle2is"; break;
- case 1: Asm = "tlbi\tvae2is"; break;
- case 4: Asm = "tlbi\talle1is"; break;
- case 5: Asm = "tlbi\tvale2is"; break;
- case 6: Asm = "tlbi\tvmalls12e1is"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle3is"; break;
- case 1: Asm = "tlbi\tvae3is"; break;
- case 5: Asm = "tlbi\tvale3is"; break;
- }
- break;
- }
- break;
- case 0:
- switch (Op1Val) {
- default:
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 1: Asm = "tlbi\tipas2e1is"; break;
- case 5: Asm = "tlbi\tipas2le1is"; break;
- }
- break;
- }
- break;
- case 4:
- switch (Op1Val) {
- default:
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 1: Asm = "tlbi\tipas2e1"; break;
- case 5: Asm = "tlbi\tipas2le1"; break;
- }
- break;
- }
- break;
- case 7:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\tvmalle1"; break;
- case 1: Asm = "tlbi\tvae1"; break;
- case 2: Asm = "tlbi\taside1"; break;
- case 3: Asm = "tlbi\tvaae1"; break;
- case 5: Asm = "tlbi\tvale1"; break;
- case 7: Asm = "tlbi\tvaale1"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle2"; break;
- case 1: Asm = "tlbi\tvae2"; break;
- case 4: Asm = "tlbi\talle1"; break;
- case 5: Asm = "tlbi\tvale2"; break;
- case 6: Asm = "tlbi\tvmalls12e1"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle3"; break;
- case 1: Asm = "tlbi\tvae3"; break;
- case 5: Asm = "tlbi\tvale3"; break;
- }
- break;
- }
- break;
- }
+ const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding);
+ if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = TLBI->NeedsReg;
+ Ins = "tlbi\t";
+ Name = std::string(TLBI->Name);
}
+ else
+ return false;
- if (Asm) {
- unsigned Reg = MI->getOperand(4).getReg();
+ std::string Str = Ins + Name;
+ std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower);
- O << '\t' << Asm;
- if (StringRef(Asm).lower().find("all") == StringRef::npos)
- O << ", " << getRegisterName(Reg);
- }
+ O << '\t' << Str;
+ if (NeedsReg)
+ O << ", " << getRegisterName(MI->getOperand(4).getReg());
- return Asm != nullptr;
+ return true;
}
void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 65dca99..a45258c 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -37,9 +38,11 @@ public:
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
+
virtual StringRef getRegName(unsigned RegNo) const {
return getRegisterName(RegNo);
}
+
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = AArch64::NoRegAltName);
@@ -177,12 +180,15 @@ public:
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O) override;
+
StringRef getRegName(unsigned RegNo) const override {
return getRegisterName(RegNo);
}
+
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = AArch64::NoRegAltName);
};
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 14c0327..2bd0cbf 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -11,8 +11,9 @@
#include "AArch64RegisterInfo.h"
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -22,7 +23,6 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
using namespace llvm;
namespace {
@@ -43,26 +43,25 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
- // This table *must* be in the order that the fixup_* kinds are defined in
- // AArch64FixupKinds.h.
- //
- // Name Offset (bits) Size (bits) Flags
- { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
- { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
- { "fixup_aarch64_add_imm12", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
- { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
- { "fixup_aarch64_movw", 5, 16, 0 },
- { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
- { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
- { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
- { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
- { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
- };
+ // This table *must* be in the order that the fixup_* kinds are defined
+ // in AArch64FixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal},
+ {"fixup_aarch64_add_imm12", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0},
+ {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal},
+ {"fixup_aarch64_movw", 5, 16, 0},
+ {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal},
+ {"fixup_aarch64_tlsdesc_call", 0, 0, 0}};
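Each row gives the bit offset and width of the fixup field within the instruction word: fixup_aarch64_add_imm12 occupies 12 bits starting at bit 10 (the imm12 field of ADD), while the two ADR fixups claim all 32 bits because AdrImmBits in adjustFixupValue already scatters the immediate into its split encoding. applyFixup consumes a row roughly like this (condensed from the little-endian path further down):

    // For fixup_aarch64_add_imm12 (TargetOffset = 10, TargetSize = 12):
    Value = adjustFixupValue(Fixup, Value, Ctx); // range-checked 12-bit value
    Value <<= Info.TargetOffset;                 // shift into bits [21:10]
    for (unsigned i = 0; i != NumBytes; ++i)
      Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);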
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -72,8 +71,9 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
bool mayNeedRelaxation(const MCInst &Inst) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case FK_Data_1:
return 1;
- case FK_Data_2:
case AArch64::fixup_aarch64_movw:
+ case FK_Data_2:
+ case FK_SecRel_2:
return 2;
case AArch64::fixup_aarch64_pcrel_branch14:
@@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
case FK_Data_4:
+ case FK_SecRel_4:
return 4;
case FK_Data_8:
@@ -138,15 +140,15 @@ static unsigned AdrImmBits(unsigned Value) {
}
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- MCContext *Ctx) {
+ MCContext &Ctx) {
unsigned Kind = Fixup.getKind();
int64_t SignedValue = static_cast<int64_t>(Value);
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
- if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
return AdrImmBits(Value & 0x1fffffULL);
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
@@ -154,71 +156,72 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case AArch64::fixup_aarch64_pcrel_branch19:
// Signed 21-bit immediate
if (SignedValue > 2097151 || SignedValue < -2097152)
- if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
// Low two bits are not encoded.
return (Value >> 2) & 0x7ffff;
case AArch64::fixup_aarch64_add_imm12:
case AArch64::fixup_aarch64_ldst_imm12_scale1:
// Unsigned 12-bit immediate
- if (Ctx && Value >= 0x1000)
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value >= 0x1000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
return Value;
case AArch64::fixup_aarch64_ldst_imm12_scale2:
// Unsigned 12-bit immediate which gets multiplied by 2
- if (Ctx && (Value >= 0x2000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x1))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
+ if (Value >= 0x2000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x1)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
return Value >> 1;
case AArch64::fixup_aarch64_ldst_imm12_scale4:
// Unsigned 12-bit immediate which gets multiplied by 4
- if (Ctx && (Value >= 0x4000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
+ if (Value >= 0x4000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
return Value >> 2;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
// Unsigned 12-bit immediate which gets multiplied by 8
- if (Ctx && (Value >= 0x8000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x7))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
+ if (Value >= 0x8000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x7)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
return Value >> 3;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
// Unsigned 12-bit immediate which gets multiplied by 16
- if (Ctx && (Value >= 0x10000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0xf))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
+ if (Value >= 0x10000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0xf)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
return Value >> 4;
case AArch64::fixup_aarch64_movw:
- if (Ctx)
- Ctx->reportError(Fixup.getLoc(),
- "no resolvable MOVZ/MOVK fixups supported yet");
+ Ctx.reportError(Fixup.getLoc(),
+ "no resolvable MOVZ/MOVK fixups supported yet");
return Value;
case AArch64::fixup_aarch64_pcrel_branch14:
// Signed 16-bit immediate
- if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 32767 || SignedValue < -32768)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3fff;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
// Signed 28-bit immediate
- if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 134217727 || SignedValue < -134217728)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3ffffff;
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
case FK_Data_8:
+ case FK_SecRel_2:
+ case FK_SecRel_4:
return Value;
}
}
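The bounds in adjustFixupValue are the byte-offset ranges implied by the encodings: branch26/call26 hold 26 encoded bits plus two implicit zero bits, a signed 28-bit byte offset (about +/-128 MiB); adr_imm21 and branch19 are signed 21-bit quantities. A sketch verifying the constants used above:

    #include <cstdint>

    static_assert((int64_t{1} << 27) - 1 == 134217727, "branch26/call26 max");
    static_assert(-(int64_t{1} << 27) == -134217728, "branch26/call26 min");
    static_assert((int64_t{1} << 20) - 1 == 2097151, "adr_imm21/branch19 max");
    static_assert(-(int64_t{1} << 20) == -2097152, "adr_imm21/branch19 min");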
@@ -262,21 +265,23 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
}
}
-void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+ MCContext &Ctx = Asm.getContext();
// Apply any target-specific value adjustments.
- Value = adjustFixupValue(Fixup, Value, nullptr);
+ Value = adjustFixupValue(Fixup, Value, Ctx);
// Shift the value into position.
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind());
@@ -290,7 +295,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
} else {
// Handle as big-endian
- assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!");
+ assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = FulleSizeInBytes - 1 - i;
@@ -521,17 +526,6 @@ public:
return CompactUnwindEncoding;
}
-
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
- // if the value is invalid.
- if (IsResolved)
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
- }
};
} // end anonymous namespace
@@ -551,16 +545,13 @@ public:
return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
}
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
};
-void ELFAArch64AsmBackend::processFixupValue(
- const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
- const MCFragment *DF, const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+bool ELFAArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
// ~0xfff. This means that the required offset to reach a symbol can vary by
// up to one step depending on where the ADRP is in memory. For example:
@@ -574,15 +565,22 @@ void ELFAArch64AsmBackend::processFixupValue(
// section isn't 0x1000-aligned, we therefore need to delegate this decision
// to the linker -- a relocation!
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
- IsResolved = false;
+ return true;
+ return false;
+}
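shouldForceRelocation replaces the processFixupValue hook: returning true makes the assembler emit a relocation even when the fixup looks resolvable. For ADRP that is mandatory, since the instruction materializes (PC & ~0xfff) plus a page multiple, and the page delta between the ADRP site and its target depends on final layout. A worked example (addresses illustrative):

    // ADRP at 0x1000, symbol at 0x2004: pages 0x1000 and 0x2000, delta 1 page.
    // ADRP at 0x0ffc, symbol at 0x2000: pages 0x0000 and 0x2000, delta 2 pages.
    // A 4-byte shift of the ADRP site changed the required immediate, so only
    // the linker, which knows the final addresses, can fill it in.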
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
- // if the value is invalid.
- if (IsResolved)
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
}
+namespace {
+class COFFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+ COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple)
+ : AArch64AsmBackend(T, /*IsLittleEndian*/true) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createAArch64WinCOFFObjectWriter(OS);
+ }
+};
}
MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
@@ -593,7 +591,11 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
if (TheTriple.isOSBinFormatMachO())
return new DarwinAArch64AsmBackend(T, MRI);
- assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+ if (TheTriple.isOSBinFormatCOFF())
+ return new COFFAArch64AsmBackend(T, TheTriple);
+
+ assert(TheTriple.isOSBinFormatELF() && "Invalid target");
+
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
bool IsILP32 = Options.getABIName() == "ilp32";
return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32);
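Backend selection now keys off the object-file format of the target triple, with COFF routed to the new Windows writer. A small sketch of the dispatch from a triple string (illustrative):

    #include "llvm/ADT/StringRef.h"
    #include "llvm/ADT/Triple.h"

    // aarch64-apple-ios       -> MachO (DarwinAArch64AsmBackend)
    // aarch64-pc-windows-msvc -> COFF  (COFFAArch64AsmBackend)
    // aarch64-linux-gnu       -> ELF   (ELFAArch64AsmBackend)
    bool selectsCOFFBackend(llvm::StringRef TripleStr) {
      return llvm::Triple(TripleStr).isOSBinFormatCOFF();
    }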
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index c954c0e..89c3e5b 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -15,11 +15,11 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstdint>
@@ -49,10 +49,11 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
/*HasRelocationAddend*/ true),
IsILP32(IsILP32) {}
-#define R_CLS(rtype) \
- IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
-#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\
- "supported (LP64 eqv: " #lp64rtype ")"
+#define R_CLS(rtype) \
+ IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
+#define BAD_ILP32_MOV(lp64rtype) \
+ "ILP32 absolute MOV relocation not " \
+ "supported (LP64 eqv: " #lp64rtype ")"
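R_CLS picks the ILP32 or LP64 flavor of a relocation by token pasting, which keeps every class-agnostic return site a one-liner:

    // Expansion sketch: with IsILP32 in scope,
    //   return R_CLS(ABS32);
    // becomes
    //   return IsILP32 ? ELF::R_AARCH64_P32_ABS32 : ELF::R_AARCH64_ABS32;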
// assumes IsILP32 is true
static bool isNonILP32reloc(const MCFixup &Fixup,
@@ -60,44 +61,45 @@ static bool isNonILP32reloc(const MCFixup &Fixup,
MCContext &Ctx) {
if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw)
return false;
- switch(RefKind) {
- case AArch64MCExpr::VK_ABS_G3:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3));
- return true;
- case AArch64MCExpr::VK_ABS_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2));
- return true;
- case AArch64MCExpr::VK_ABS_G2_S:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_ABS_G2_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_ABS_G1_S:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_ABS_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_DTPREL_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_DTPREL_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_TPREL_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_TPREL_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_GOTTPREL_G1:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_GOTTPREL_G0_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC));
- return ELF::R_AARCH64_NONE;
- default: return false;
+ switch (RefKind) {
+ case AArch64MCExpr::VK_ABS_G3:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2_S:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC));
+ return true;
+ case AArch64MCExpr::VK_ABS_G1_S:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1));
+ return true;
+ case AArch64MCExpr::VK_ABS_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_DTPREL_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2));
+ return true;
+ case AArch64MCExpr::VK_DTPREL_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_TPREL_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2));
+ return true;
+ case AArch64MCExpr::VK_TPREL_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_GOTTPREL_G1:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1));
+ return true;
+ case AArch64MCExpr::VK_GOTTPREL_G0_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC));
+ return true;
+ default:
+ return false;
}
return false;
}
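This rewrite is not just reformatting: the deleted error cases returned ELF::R_AARCH64_NONE from a function whose return type is bool, and since R_AARCH64_NONE is 0, those paths reported an error yet told the caller the fixup was not an ILP32 problem. Returning true makes the diagnosis stick. The pitfall, distilled:

    enum Reloc { R_AARCH64_NONE = 0 };

    bool oldPath() { return R_AARCH64_NONE; } // enum 0 converts to false
    bool newPath() { return true; }           // error is actually propagated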
@@ -130,7 +132,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(PREL32);
case FK_Data_8:
if (IsILP32) {
- Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data "
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 8 byte PC relative data "
"relocation not supported (LP64 eqv: PREL64)");
return ELF::R_AARCH64_NONE;
} else
@@ -141,6 +144,16 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC)
return R_CLS(ADR_PREL_PG_HI21);
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) {
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 32-bit pcrel ADRP instruction "
+ "VK_ABS VK_NC");
+ return ELF::R_AARCH64_NONE;
+ } else {
+ return ELF::R_AARCH64_ADR_PREL_PG_HI21_NC;
+ }
+ }
if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
return R_CLS(ADR_GOT_PAGE);
if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
@@ -168,7 +181,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
}
} else {
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
- return ELF::R_AARCH64_NONE;
+ return ELF::R_AARCH64_NONE;
switch ((unsigned)Fixup.getKind()) {
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
@@ -179,7 +192,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(ABS32);
case FK_Data_8:
if (IsILP32) {
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(ABS64));
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 8 byte absolute data "
+ "relocation not supported (LP64 eqv: ABS64)");
return ELF::R_AARCH64_NONE;
} else
return ELF::R_AARCH64_ABS64;
@@ -197,7 +212,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (RefKind == AArch64MCExpr::VK_TPREL_LO12)
return R_CLS(TLSLE_ADD_TPREL_LO12);
if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
- return R_CLS(TLSDESC_ADD_LO12_NC);
+ return R_CLS(TLSDESC_ADD_LO12);
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(ADD_ABS_LO12_NC);
@@ -245,15 +260,68 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(TLSLE_LDST32_TPREL_LO12);
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return R_CLS(TLSLE_LDST32_TPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) {
+ if (IsILP32) {
+ return ELF::R_AARCH64_P32_LD32_GOT_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 4 byte unchecked GOT load/store relocation "
+ "not supported (ILP32 eqv: LD32_GOT_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
+ if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) {
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 4 byte checked GOT load/store relocation "
+ "not supported (unchecked eqv: LD32_GOT_LO12_NC)");
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 4 byte checked GOT load/store relocation "
+ "not supported (unchecked/ILP32 eqv: "
+ "LD32_GOT_LO12_NC)");
+ }
+ return ELF::R_AARCH64_NONE;
+ }
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) {
+ if (IsILP32) {
+ return ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 32-bit load/store "
+ "relocation not supported (ILP32 eqv: "
+ "TLSIE_LD32_GOTTPREL_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) {
+ if (IsILP32) {
+ return ELF::R_AARCH64_P32_TLSDESC_LD32_LO12;
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 4 byte TLSDESC load/store relocation "
+ "not supported (ILP32 eqv: TLSDESC_LD64_LO12)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
Ctx.reportError(Fixup.getLoc(),
- "invalid fixup for 32-bit load/store instruction");
+ "invalid fixup for 32-bit load/store instruction "
+ "fixup_aarch64_ldst_imm12_scale4");
return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(LDST64_ABS_LO12_NC);
- if (SymLoc == AArch64MCExpr::VK_GOT && IsNC)
- return R_CLS(LD64_GOT_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) {
+ if (!IsILP32) {
+ return ELF::R_AARCH64_LD64_GOT_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
+ "relocation not supported (LP64 eqv: "
+ "LD64_GOT_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
return R_CLS(TLSLD_LDST64_DTPREL_LO12);
if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
@@ -262,19 +330,40 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(TLSLE_LDST64_TPREL_LO12);
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return R_CLS(TLSLE_LDST64_TPREL_LO12_NC);
- if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC)
- return IsILP32 ? ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC
- : ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
- if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
- return IsILP32 ? ELF::R_AARCH64_P32_TLSDESC_LD32_LO12_NC
- : ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
-
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) {
+ if (!IsILP32) {
+ return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
+ "relocation not supported (LP64 eqv: "
+ "TLSIE_LD64_GOTTPREL_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC) {
+ if (!IsILP32) {
+ return ELF::R_AARCH64_TLSDESC_LD64_LO12;
+ } else {
+ Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
+ "relocation not supported (LP64 eqv: "
+ "TLSDESC_LD64_LO12)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
Ctx.reportError(Fixup.getLoc(),
"invalid fixup for 64-bit load/store instruction");
return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(LDST128_ABS_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return R_CLS(TLSLD_LDST128_DTPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return R_CLS(TLSLD_LDST128_DTPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return R_CLS(TLSLE_LDST128_TPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return R_CLS(TLSLE_LDST128_TPREL_LO12_NC);
Ctx.reportError(Fixup.getLoc(),
"invalid fixup for 128-bit load/store instruction");
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 685907a..a0de3c3 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -14,27 +14,25 @@
//===----------------------------------------------------------------------===//
#include "AArch64TargetStreamer.h"
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringExtras.h"
+#include "AArch64WinCOFFStreamer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
@@ -106,8 +104,8 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) override {
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override {
EmitA64MappingSymbol();
MCELFStreamer::EmitInstruction(Inst, STI);
}
@@ -180,6 +178,7 @@ private:
DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
ElfMappingSymbol LastEMS;
};
+
} // end anonymous namespace
AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
@@ -191,6 +190,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
}
namespace llvm {
+
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
MCInstPrinter *InstPrint,
@@ -212,6 +212,9 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
if (TT.isOSBinFormatELF())
return new AArch64TargetELFStreamer(S);
+ if (TT.isOSBinFormatCOFF())
+ return new AArch64TargetWinCOFFStreamer(S);
return nullptr;
}
-}
+
+} // end namespace llvm
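
createAArch64ObjectTargetStreamer now also recognizes COFF; the dispatch reduces to a switch on the triple's object format. A stand-in sketch (placeholder types, not the LLVM classes):

#include <memory>

enum class ObjFormat { ELF, COFF, MachO };
struct TargetStreamer { virtual ~TargetStreamer() = default; };
struct ELFTargetStreamer : TargetStreamer {};
struct WinCOFFTargetStreamer : TargetStreamer {};

std::unique_ptr<TargetStreamer> createObjectTargetStreamer(ObjFormat F) {
  if (F == ObjFormat::ELF)
    return std::make_unique<ELFTargetStreamer>();
  if (F == ObjFormat::COFF) // new with the Windows-on-ARM64 support
    return std::make_unique<WinCOFFTargetStreamer>();
  return nullptr; // Mach-O registers no target streamer here
}
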
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index 0f5b765..4293dcb 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -16,53 +16,47 @@ namespace llvm {
namespace AArch64 {
enum Fixups {
- // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
- // an ADR instruction.
+ // A 21-bit pc-relative immediate inserted into an ADR instruction.
fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
- // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
- // an ADRP instruction.
+ // A 21-bit pc-relative immediate inserted into an ADRP instruction.
fixup_aarch64_pcrel_adrp_imm21,
- // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions.
- // No alignment adjustment. All value bits are encoded.
+ // 12-bit fixup for add/sub instructions. No alignment adjustment. All value
+ // bits are encoded.
fixup_aarch64_add_imm12,
- // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
- // store instructions.
+ // unsigned 12-bit fixups for load and store instructions.
fixup_aarch64_ldst_imm12_scale1,
fixup_aarch64_ldst_imm12_scale2,
fixup_aarch64_ldst_imm12_scale4,
fixup_aarch64_ldst_imm12_scale8,
fixup_aarch64_ldst_imm12_scale16,
- // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
- // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
- // pc-relative loads and generates relocations directly when necessary.
+ // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as
+ // fixup_aarch64_pcrel_adrhi, except this is used by pc-relative loads and
+ // generates relocations directly when necessary.
fixup_aarch64_ldr_pcrel_imm19,
// FIXME: comment
fixup_aarch64_movw,
- // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
- // immediate.
+ // The high 14 bits of a 21-bit pc-relative immediate.
fixup_aarch64_pcrel_branch14,
- // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
- // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by
- // b.cc and generates relocations directly when necessary.
+ // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as
+  // fixup_aarch64_pcrel_adrhi, except this is used by b.cc and generates
+ // relocations directly when necessary.
fixup_aarch64_pcrel_branch19,
- // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
- // immediate.
+ // The high 26 bits of a 28-bit pc-relative immediate.
fixup_aarch64_pcrel_branch26,
- // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
- // immediate. Distinguished from branch26 only on ELF.
+ // The high 26 bits of a 28-bit pc-relative immediate. Distinguished from
+ // branch26 only on ELF.
fixup_aarch64_pcrel_call26,
- // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
- // R_AARCH64_TLSDESC_CALL relocation.
+ // zero-space placeholder for the ELF R_AARCH64_TLSDESC_CALL relocation.
fixup_aarch64_tlsdesc_call,
// Marker
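
The kinds above begin at FirstTargetFixupKind and travel with the instruction as MCFixup records until an object writer maps them to relocations; the MCCodeEmitter hunk further down creates one with MCFixup::create. A stand-in for the record shape, not the LLVM class:

#include <cstdint>

// Offset within the fragment, the expression to resolve, and one of the
// fixup kinds above (or a generic FK_Data_* kind).
struct FixupRecord {
  uint32_t Offset;
  const void *Expr;   // opaque stand-in for const MCExpr *
  unsigned Kind;
};

FixupRecord makeFixup(uint32_t Off, const void *E, unsigned K) {
  return {Off, E, K}; // mirrors MCFixup::create(Off, E, K)
}
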
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 8fc8223..c25bd8c 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -32,14 +32,15 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the short, Apple-specific
+ // form when targeting Darwin.
+ AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
PrivateGlobalPrefix = "L";
PrivateLabelPrefix = "L";
SeparatorString = "%%";
CommentString = ";";
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
AlignmentIsInBytes = false;
UsesELFSectionDirectiveForBSS = true;
@@ -68,10 +69,11 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
if (T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the generic form when
+ // targeting ELF.
+ AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant;
- PointerSize = 8;
+ CodePointerSize = 8;
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
@@ -98,3 +100,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
HasIdentDirective = true;
}
+
+AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
+ CommentString = ";";
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 253cd30..2d7107a 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#include "llvm/MC/MCAsmInfoCOFF.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
@@ -33,6 +34,10 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
explicit AArch64MCAsmInfoELF(const Triple &T);
};
+struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF {
+ explicit AArch64MCAsmInfoCOFF();
+};
+
} // namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 62dfa59..33698d2 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -565,6 +565,9 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup));
return;
+ } else if (MI.getOpcode() == AArch64::CompilerBarrier) {
+    // This just prevents the compiler from reordering accesses; no actual
+    // code is emitted.
+ return;
}
uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
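
CompilerBarrier is a zero-width pseudo-instruction: the early return happens before getBinaryCodeForInstr, so nothing is written to the stream. A minimal sketch of the pattern with stand-in types:

#include <cstdint>
#include <vector>

enum class Opc { Add, CompilerBarrier };

void encode(Opc Op, std::vector<uint8_t> &OS) {
  if (Op == Opc::CompilerBarrier)
    return;               // ordering-only: must emit exactly zero bytes
  uint32_t Word = 0;      // placeholder for the real 4-byte encoding
  for (int I = 0; I != 4; ++I)
    OS.push_back(uint8_t(Word >> (8 * I))); // emit LSB first
}
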
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index a540f49..97c92fa 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -62,6 +62,7 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:";
case VK_TLSDESC_LO12: return ":tlsdesc_lo12:";
case VK_ABS_PAGE: return "";
+ case VK_ABS_PAGE_NC: return ":pg_hi21_nc:";
case VK_GOT_PAGE: return ":got:";
case VK_GOT_LO12: return ":got_lo12:";
case VK_GOTTPREL_PAGE: return ":gottprel:";
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index db36a65..3dbf0f8 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -62,6 +62,7 @@ public:
// since a user would write ":lo12:").
VK_CALL = VK_ABS,
VK_ABS_PAGE = VK_ABS | VK_PAGE,
+ VK_ABS_PAGE_NC = VK_ABS | VK_PAGE | VK_NC,
VK_ABS_G3 = VK_ABS | VK_G3,
VK_ABS_G2 = VK_ABS | VK_G2,
VK_ABS_G2_S = VK_SABS | VK_G2,
@@ -95,7 +96,7 @@ public:
VK_TPREL_HI12 = VK_TPREL | VK_HI12,
VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF,
VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
- VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF,
VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,
VK_INVALID = 0xfff
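
The VK_* kinds are bitmask compositions of a symbol-location field and modifier bits, which is why adding VK_ABS_PAGE_NC and dropping VK_NC from VK_TLSDESC_LO12 are one-line edits. An illustrative model (values are not the real AArch64MCExpr constants):

#include <cstdint>

enum : uint32_t { VK_ABS = 0x001, VK_TLSDESC = 0x002,
                  VK_PAGE = 0x100, VK_PAGEOFF = 0x200, VK_NC = 0x400 };

constexpr uint32_t VK_ABS_PAGE     = VK_ABS | VK_PAGE;
constexpr uint32_t VK_ABS_PAGE_NC  = VK_ABS | VK_PAGE | VK_NC; // added above
constexpr uint32_t VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF;  // NC bit gone

static_assert((VK_ABS_PAGE_NC & ~VK_NC) == VK_ABS_PAGE,
              "NC variant differs from the checked one only in the NC bit");
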
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index e9d38d3..a255549 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -14,6 +14,7 @@
#include "AArch64MCTargetDesc.h"
#include "AArch64ELFStreamer.h"
#include "AArch64MCAsmInfo.h"
+#include "AArch64WinCOFFStreamer.h"
#include "InstPrinter/AArch64InstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -59,8 +60,10 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI;
if (TheTriple.isOSBinFormatMachO())
MAI = new AArch64MCAsmInfoDarwin();
+ else if (TheTriple.isOSBinFormatCOFF())
+ MAI = new AArch64MCAsmInfoCOFF();
else {
- assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+ assert(TheTriple.isOSBinFormatELF() && "Invalid target");
MAI = new AArch64MCAsmInfoELF(TheTriple);
}
@@ -74,8 +77,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
CodeModel::Model &CM) {
- assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
- "Only expect Darwin and ELF targets");
+ assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() ||
+ TT.isOSBinFormatCOFF()) && "Invalid target");
if (CM == CodeModel::Default)
CM = CodeModel::Small;
@@ -84,9 +87,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
// no matter how far away they are.
else if (CM == CodeModel::JITDefault)
CM = CodeModel::Large;
- else if (CM != CodeModel::Small && CM != CodeModel::Large)
- report_fatal_error(
- "Only small and large code models are allowed on AArch64");
+ else if (CM != CodeModel::Small && CM != CodeModel::Large) {
+ if (!TT.isOSFuchsia())
+ report_fatal_error(
+ "Only small and large code models are allowed on AArch64");
+ else if (CM != CodeModel::Kernel)
+ report_fatal_error(
+ "Only small, kernel, and large code models are allowed on AArch64");
+ }
}
static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
@@ -117,6 +125,14 @@ static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
/*LabelSections*/ true);
}
+static MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ return createAArch64WinCOFFStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+ IncrementalLinkerCompatible);
+}
+
static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
return new MCInstrAnalysis(Info);
}
@@ -149,6 +165,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
// Register the obj streamers.
TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer);
+ TargetRegistry::RegisterCOFFStreamer(*T, createWinCOFFStreamer);
// Register the obj target streamer.
TargetRegistry::RegisterObjectTargetStreamer(
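
The code-model hunk above relaxes the old small/large restriction only for Fuchsia, which additionally accepts CodeModel::Kernel. A sketch of the resulting predicate with stand-in types:

#include <stdexcept>

enum class CM { Small, Kernel, Large };

void checkAArch64CodeModel(CM Model, bool IsFuchsia) {
  if (Model == CM::Small || Model == CM::Large)
    return;                            // always allowed
  if (IsFuchsia && Model == CM::Kernel)
    return;                            // Fuchsia-only extension
  throw std::runtime_error(IsFuchsia
      ? "Only small, kernel, and large code models are allowed on AArch64"
      : "Only small and large code models are allowed on AArch64");
}
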
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 615d7da..1404926 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -60,6 +60,8 @@ MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS,
uint32_t CPUType,
uint32_t CPUSubtype);
+MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS);
+
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
MCInstPrinter *InstPrint,
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 53a6852..19b2576 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -10,20 +10,28 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
namespace {
+
class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
const MCSymbolRefExpr *Sym,
@@ -38,7 +46,8 @@ public:
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) override;
};
-}
+
+} // end anonymous namespace
bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
@@ -51,18 +60,18 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return false;
case FK_Data_1:
- Log2Size = llvm::Log2_32(1);
+ Log2Size = Log2_32(1);
return true;
case FK_Data_2:
- Log2Size = llvm::Log2_32(2);
+ Log2Size = Log2_32(2);
return true;
case FK_Data_4:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
return true;
case FK_Data_8:
- Log2Size = llvm::Log2_32(8);
+ Log2Size = Log2_32(8);
if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
return true;
@@ -72,7 +81,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
case AArch64::fixup_aarch64_ldst_imm12_scale4:
case AArch64::fixup_aarch64_ldst_imm12_scale8:
case AArch64::fixup_aarch64_ldst_imm12_scale16:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
switch (Sym->getKind()) {
default:
return false;
@@ -87,14 +96,13 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return true;
}
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
// This encompasses the relocation for the whole 21-bit value.
switch (Sym->getKind()) {
- default: {
+ default:
Asm.getContext().reportError(Fixup.getLoc(),
"ADR/ADRP relocations must be GOT relative");
return false;
- }
case MCSymbolRefExpr::VK_PAGE:
RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
return true;
@@ -108,7 +116,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return true;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
return true;
}
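
The llvm:: qualifier drops are cleanup (the file already has using namespace llvm). Log2_32 computes floor(log2(v)), which the writer uses to turn an access size in bytes into the Mach-O relocation size class (1, 2, 4, 8 bytes map to 0-3). A minimal equivalent:

#include <cassert>
#include <cstdint>

unsigned log2_32(uint32_t V) {
  assert(V != 0 && "undefined for zero");
  unsigned L = 0;
  while (V >>= 1)
    ++L;
  return L;   // log2_32(1)==0, (2)==1, (4)==2, (8)==3
}
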
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..31762b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -0,0 +1,104 @@
+//= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+ AArch64WinCOFFObjectWriter()
+ : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {}
+
+ ~AArch64WinCOFFObjectWriter() override = default;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+
+ bool recordRelocation(const MCFixup &) const override;
+};
+
+} // end anonymous namespace
+
+unsigned AArch64WinCOFFObjectWriter::getRelocType(
+ MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup,
+ bool IsCrossSection, const MCAsmBackend &MAB) const {
+ auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+ : Target.getSymA()->getKind();
+
+ switch (static_cast<unsigned>(Fixup.getKind())) {
+ default: {
+ const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
+ report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+ }
+
+ case FK_Data_4:
+ switch (Modifier) {
+ default:
+ return COFF::IMAGE_REL_ARM64_ADDR32;
+ case MCSymbolRefExpr::VK_COFF_IMGREL32:
+ return COFF::IMAGE_REL_ARM64_ADDR32NB;
+ case MCSymbolRefExpr::VK_SECREL:
+ return COFF::IMAGE_REL_ARM64_SECREL;
+ }
+
+ case FK_Data_8:
+ return COFF::IMAGE_REL_ARM64_ADDR64;
+
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_ARM64_SECTION;
+
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_ARM64_SECREL;
+
+ case AArch64::fixup_aarch64_add_imm12:
+ return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A;
+
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L;
+
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21;
+
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ return COFF::IMAGE_REL_ARM64_BRANCH26;
+ }
+}
+
+bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
+ return true;
+}
+
+namespace llvm {
+
+MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) {
+ MCWinCOFFObjectTargetWriter *MOTW = new AArch64WinCOFFObjectWriter();
+ return createWinCOFFObjectWriter(MOTW, OS);
+}
+
+} // end namespace llvm
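
For FK_Data_4 the symbol modifier picks among three 32-bit COFF relocations: plain virtual address, image-relative (RVA), or section-relative. A condensed sketch (illustrative enumerators; the real values are the COFF::IMAGE_REL_ARM64_* constants):

enum class Mod { None, ImgRel32, SecRel };
enum Rel { ADDR32, ADDR32NB, SECREL };

Rel relocFor32BitData(Mod M) {
  switch (M) {
  case Mod::ImgRel32: return ADDR32NB; // image-relative (RVA)
  case Mod::SecRel:   return SECREL;   // offset within the section
  default:            return ADDR32;   // plain 32-bit VA
  }
}
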
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
new file mode 100644
index 0000000..6c8da27
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -0,0 +1,37 @@
+//===-- AArch64WinCOFFStreamer.cpp - AArch64 WinCOFF Streamer --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64WinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+
+class AArch64WinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+ friend class AArch64TargetWinCOFFStreamer;
+
+ AArch64WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+ raw_pwrite_stream &OS)
+ : MCWinCOFFStreamer(C, AB, CE, OS) {}
+};
+} // end anonymous namespace
+
+namespace llvm {
+MCWinCOFFStreamer
+*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ auto *S = new AArch64WinCOFFStreamer(Context, MAB, *Emitter, OS);
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+ return S;
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
new file mode 100644
index 0000000..1b4fcd6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -0,0 +1,43 @@
+//===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements WinCOFF streamer information for the AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H
+
+#include "AArch64TargetStreamer.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+namespace {
+class AArch64WinCOFFStreamer;
+
+class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer {
+private:
+ AArch64WinCOFFStreamer &getStreamer();
+
+public:
+ AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S)
+ : AArch64TargetStreamer(S) {}
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+MCWinCOFFStreamer
+*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible);
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index dcc3917..5d76681 100644
--- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -266,82 +266,86 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
}
} // end namespace AArch64CC
+struct SysAlias {
+ const char *Name;
+ uint16_t Encoding;
+ FeatureBitset FeaturesRequired;
+
+  SysAlias(const char *N, uint16_t E) : Name(N), Encoding(E) {}
+  SysAlias(const char *N, uint16_t E, FeatureBitset F)
+      : Name(N), Encoding(E), FeaturesRequired(F) {}
+
+ bool haveFeatures(FeatureBitset ActiveFeatures) const {
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
+
+ FeatureBitset getRequiredFeatures() const { return FeaturesRequired; }
+};
+
+struct SysAliasReg : SysAlias {
+ bool NeedsReg;
+  SysAliasReg(const char *N, uint16_t E, bool R)
+      : SysAlias(N, E), NeedsReg(R) {}
+};
+
namespace AArch64AT{
- struct AT {
- const char *Name;
- uint16_t Encoding;
+ struct AT : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_AT_DECL
#include "AArch64GenSystemOperands.inc"
-
}
+
namespace AArch64DB {
- struct DB {
- const char *Name;
- uint16_t Encoding;
+ struct DB : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_DB_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64DC {
- struct DC {
- const char *Name;
- uint16_t Encoding;
+ struct DC : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_DC_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64IC {
- struct IC {
- const char *Name;
- uint16_t Encoding;
- bool NeedsReg;
+ struct IC : SysAliasReg {
+ using SysAliasReg::SysAliasReg;
};
#define GET_IC_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64ISB {
- struct ISB {
- const char *Name;
- uint16_t Encoding;
+ struct ISB : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_ISB_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PRFM {
- struct PRFM {
- const char *Name;
- uint16_t Encoding;
+ struct PRFM : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PRFM_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PState {
- struct PState {
- const char *Name;
- uint16_t Encoding;
- FeatureBitset FeaturesRequired;
-
- bool haveFeatures(FeatureBitset ActiveFeatures) const {
- return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
- }
+  struct PState : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PSTATE_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PSBHint {
- struct PSB {
- const char *Name;
- uint16_t Encoding;
+ struct PSB : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PSB_DECL
#include "AArch64GenSystemOperands.inc"
@@ -451,10 +455,8 @@ namespace AArch64SysReg {
}
namespace AArch64TLBI {
- struct TLBI {
- const char *Name;
- uint16_t Encoding;
- bool NeedsReg;
+ struct TLBI : SysAliasReg {
+ using SysAliasReg::SysAliasReg;
};
#define GET_TLBI_DECL
#include "AArch64GenSystemOperands.inc"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
index 7b0a7f4..5686828 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -11,6 +11,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -23,43 +24,61 @@ class Pass;
class Target;
class TargetMachine;
class PassRegistry;
+class Module;
// R600 Passes
-FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
-FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
+FunctionPass *createR600VectorRegMerger();
+FunctionPass *createR600ExpandSpecialInstrsPass();
FunctionPass *createR600EmitClauseMarkers();
-FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
-FunctionPass *createR600Packetizer(TargetMachine &tm);
-FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createR600ClauseMergePass();
+FunctionPass *createR600Packetizer();
+FunctionPass *createR600ControlFlowFinalizer();
FunctionPass *createAMDGPUCFGStructurizerPass();
// SI Passes
-FunctionPass *createSITypeRewriter();
FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
FunctionPass *createSIShrinkInstructionsPass();
-FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIDebuggerInsertNopsPass();
FunctionPass *createSIInsertWaitsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createSIInsertWaitcntsPass();
+FunctionPass *createAMDGPUCodeGenPreparePass();
+FunctionPass *createAMDGPUMachineCFGStructurizerPass();
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
+extern char &AMDGPUMachineCFGStructurizerID;
+
+void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
+
+Pass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
+ModulePass *createAMDGPULowerIntrinsicsPass();
+void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
+extern char &AMDGPULowerIntrinsicsID;
+
void initializeSIFoldOperandsPass(PassRegistry &);
extern char &SIFoldOperandsID;
+void initializeSIPeepholeSDWAPass(PassRegistry &);
+extern char &SIPeepholeSDWAID;
+
void initializeSIShrinkInstructionsPass(PassRegistry&);
extern char &SIShrinkInstructionsID;
void initializeSIFixSGPRCopiesPass(PassRegistry &);
extern char &SIFixSGPRCopiesID;
+void initializeSIFixVGPRCopiesPass(PassRegistry &);
+extern char &SIFixVGPRCopiesID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -79,18 +98,18 @@ void initializeSIOptimizeExecMaskingPass(PassRegistry &);
extern char &SIOptimizeExecMaskingID;
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
CodeGenOpt::Level OptLevel);
-ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
-FunctionPass* createAMDGPUUnifyMetadataPass();
+ModulePass* createAMDGPUUnifyMetadataPass();
void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
extern char &AMDGPUUnifyMetadataID;
@@ -112,6 +131,15 @@ extern char &SIDebuggerInsertNopsID;
void initializeSIInsertWaitsPass(PassRegistry&);
extern char &SIInsertWaitsID;
+void initializeSIInsertWaitcntsPass(PassRegistry&);
+extern char &SIInsertWaitcntsID;
+
+void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
+extern char &AMDGPUUnifyDivergentExitNodesID;
+
+ImmutablePass *createAMDGPUAAWrapperPass();
+void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+
Target &getTheAMDGPUTarget();
Target &getTheGCNTarget();
@@ -133,43 +161,53 @@ enum TargetIndex {
/// however on the GPU, each address space points to
/// a separate piece of memory that is unique from other
/// memory locations.
-namespace AMDGPUAS {
-enum AddressSpaces : unsigned {
- PRIVATE_ADDRESS = 0, ///< Address space for private memory.
- GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
- LOCAL_ADDRESS = 3, ///< Address space for local memory.
- FLAT_ADDRESS = 4, ///< Address space for flat memory.
- REGION_ADDRESS = 5, ///< Address space for region memory.
- PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0)
- PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1)
+struct AMDGPUAS {
+ // The following address space values depend on the triple environment.
+ unsigned PRIVATE_ADDRESS; ///< Address space for private memory.
+ unsigned FLAT_ADDRESS; ///< Address space for flat memory.
+ unsigned REGION_ADDRESS; ///< Address space for region memory.
+
+  // The maximum of the flat, generic, local, private, constant and region
+  // address space values.
+ const static unsigned MAX_COMMON_ADDRESS = 5;
+
+ const static unsigned GLOBAL_ADDRESS = 1; ///< Address space for global memory (RAT0, VTX0).
+ const static unsigned CONSTANT_ADDRESS = 2; ///< Address space for constant memory (VTX2)
+ const static unsigned LOCAL_ADDRESS = 3; ///< Address space for local memory.
+  const static unsigned PARAM_D_ADDRESS = 6;  ///< Address space for direct addressable parameter memory (CONST0)
+  const static unsigned PARAM_I_ADDRESS = 7;  ///< Address space for indirect addressable parameter memory (VTX1)
// Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this
// order to be able to dynamically index a constant buffer, for example:
//
// ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
- CONSTANT_BUFFER_0 = 8,
- CONSTANT_BUFFER_1 = 9,
- CONSTANT_BUFFER_2 = 10,
- CONSTANT_BUFFER_3 = 11,
- CONSTANT_BUFFER_4 = 12,
- CONSTANT_BUFFER_5 = 13,
- CONSTANT_BUFFER_6 = 14,
- CONSTANT_BUFFER_7 = 15,
- CONSTANT_BUFFER_8 = 16,
- CONSTANT_BUFFER_9 = 17,
- CONSTANT_BUFFER_10 = 18,
- CONSTANT_BUFFER_11 = 19,
- CONSTANT_BUFFER_12 = 20,
- CONSTANT_BUFFER_13 = 21,
- CONSTANT_BUFFER_14 = 22,
- CONSTANT_BUFFER_15 = 23,
+ const static unsigned CONSTANT_BUFFER_0 = 8;
+ const static unsigned CONSTANT_BUFFER_1 = 9;
+ const static unsigned CONSTANT_BUFFER_2 = 10;
+ const static unsigned CONSTANT_BUFFER_3 = 11;
+ const static unsigned CONSTANT_BUFFER_4 = 12;
+ const static unsigned CONSTANT_BUFFER_5 = 13;
+ const static unsigned CONSTANT_BUFFER_6 = 14;
+ const static unsigned CONSTANT_BUFFER_7 = 15;
+ const static unsigned CONSTANT_BUFFER_8 = 16;
+ const static unsigned CONSTANT_BUFFER_9 = 17;
+ const static unsigned CONSTANT_BUFFER_10 = 18;
+ const static unsigned CONSTANT_BUFFER_11 = 19;
+ const static unsigned CONSTANT_BUFFER_12 = 20;
+ const static unsigned CONSTANT_BUFFER_13 = 21;
+ const static unsigned CONSTANT_BUFFER_14 = 22;
+ const static unsigned CONSTANT_BUFFER_15 = 23;
// Some places use this if the address space can't be determined.
- UNKNOWN_ADDRESS_SPACE = ~0u
+ const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u;
};
-} // namespace AMDGPUAS
+namespace llvm {
+namespace AMDGPU {
+AMDGPUAS getAMDGPUAS(const Module &M);
+AMDGPUAS getAMDGPUAS(const TargetMachine &TM);
+AMDGPUAS getAMDGPUAS(Triple T);
+} // namespace AMDGPU
+} // namespace llvm
#endif
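
AMDGPUAS replaces the fixed enum with a struct because the private/flat/region numbering now depends on the triple; the two layouts asserted in AMDGPUAliasAnalysis.cpp below are the only ones in play. A stand-in sketch:

struct AddrSpaces {                 // stand-in, not the real AMDGPUAS
  unsigned Private, Flat, Region;   // triple-dependent
  static const unsigned Global = 1; // fixed: global/constant/local stay put
};

AddrSpaces getAddrSpaces(bool FlatIsZero) {
  // New "generic is zero" layout vs. the legacy "private is zero" layout.
  return FlatIsZero ? AddrSpaces{5, 0, 4}  // Private=5, Flat=0, Region=4
                    : AddrSpaces{0, 4, 5}; // Private=0, Flat=4, Region=5
}
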
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
index 1302200..f1d899c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -61,18 +61,48 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space",
"Support flat address space"
>;
+def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
+ "FlatInstOffsets",
+ "true",
+ "Flat instructions have immediate offset addressing mode"
+>;
+
+def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
+ "FlatGlobalInsts",
+ "true",
+ "Have global_* flat memory instructions"
+>;
+
+def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
+ "FlatScratchInsts",
+ "true",
+ "Have scratch_* flat memory instructions"
+>;
+
def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
"UnalignedBufferAccess",
"true",
"Support unaligned global loads and stores"
>;
+def FeatureTrapHandler : SubtargetFeature<"trap-handler",
+ "TrapHandler",
+ "true",
+ "Trap handler support"
+>;
+
def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
"UnalignedScratchAccess",
"true",
"Support unaligned scratch loads and stores"
>;
+def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
+ "HasApertureRegs",
+ "true",
+ "Has Memory Aperture Base and Size Registers"
+>;
+
// XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
// XNACK. The current default kernel driver setting is:
// - graphics ring: XNACK disabled
@@ -154,6 +184,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts",
"Additional intstructions for CI+"
>;
+def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
+ "GFX9Insts",
+ "true",
+  "Additional instructions for GFX9+"
+>;
+
def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
"HasSMemRealTime",
"true",
@@ -172,6 +208,12 @@ def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
"Has i16/f16 instructions"
>;
+def FeatureVOP3P : SubtargetFeature<"vop3p",
+ "HasVOP3PInsts",
+ "true",
+ "Has VOP3P packed instructions"
+>;
+
def FeatureMovrel : SubtargetFeature<"movrel",
"HasMovrel",
"true",
@@ -190,16 +232,52 @@ def FeatureScalarStores : SubtargetFeature<"scalar-stores",
"Has store scalar memory instructions"
>;
-//===------------------------------------------------------------===//
-// Subtarget Features (options and debugging)
-//===------------------------------------------------------------===//
+def FeatureSDWA : SubtargetFeature<"sdwa",
+ "HasSDWA",
+ "true",
+ "Support SDWA (Sub-DWORD Addressing) extension"
+>;
-def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
- "FP16Denormals",
+def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod",
+ "HasSDWAOmod",
+ "true",
+ "Support OMod with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar",
+ "HasSDWAScalar",
+ "true",
+ "Support scalar register with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst",
+ "HasSDWASdst",
+ "true",
+ "Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureSDWAMac : SubtargetFeature<"sdwa-mav",
+ "HasSDWAMac",
"true",
- "Enable half precision denormal handling"
+ "Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension"
>;
+def FeatureSDWAOutModsVOPC : SubtargetFeature<"sdwa-out-mods-vopc",
+ "HasSDWAOutModsVOPC",
+ "true",
+ "Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension"
+>;
+
+def FeatureDPP : SubtargetFeature<"dpp",
+ "HasDPP",
+ "true",
+ "Support DPP (Data Parallel Primitives) extension"
+>;
+
+//===------------------------------------------------------------===//
+// Subtarget Features (options and debugging)
+//===------------------------------------------------------------===//
+
// Some instructions do not support denormals despite this flag. Using
// fp32 denormals also causes instructions to run at the double
// precision rate for the device.
@@ -209,13 +287,36 @@ def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
"Enable single precision denormal handling"
>;
-def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
- "FP64Denormals",
+// Denormal handling for fp64 and fp16 is controlled by the same
+// config register when fp16 supported.
+// TODO: Do we need a separate f16 setting when not legal?
+def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals",
+ "FP64FP16Denormals",
"true",
- "Enable double precision denormal handling",
+ "Enable double and half precision denormal handling",
[FeatureFP64]
>;
+def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
+ "FP64FP16Denormals",
+ "true",
+ "Enable double and half precision denormal handling",
+ [FeatureFP64, FeatureFP64FP16Denormals]
+>;
+
+def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
+ "FP64FP16Denormals",
+ "true",
+ "Enable half precision denormal handling",
+ [FeatureFP64FP16Denormals]
+>;
+
+def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
+ "DX10Clamp",
+ "true",
+ "clamp modifier clamps NaNs to 0.0"
+>;
+
def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
"FPExceptions",
"true",
@@ -294,6 +395,13 @@ def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
"Force to generate flat instruction for global"
>;
+def FeatureAutoWaitcntBeforeBarrier : SubtargetFeature <
+ "auto-waitcnt-before-barrier",
+ "AutoWaitcntBeforeBarrier",
+ "true",
+ "Hardware automatically inserts waitcnt before barrier"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -343,7 +451,20 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
- FeatureScalarStores, FeatureInv2PiInlineImm
+ FeatureScalarStores, FeatureInv2PiInlineImm,
+ FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP
+ ]
+>;
+
+def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
+ [FeatureFP64, FeatureLocalMemorySize65536,
+ FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+ FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+ FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
+ FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
+ FeatureFastFMAF32, FeatureDPP,
+ FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
]
>;
@@ -357,6 +478,16 @@ class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
Implies
>;
+def FeatureISAVersion6_0_0 : SubtargetFeatureISAVersion <6,0,0,
+ [FeatureSouthernIslands,
+ FeatureFastFMAF32,
+ HalfRate64Ops,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion6_0_1 : SubtargetFeatureISAVersion <6,0,1,
+ [FeatureSouthernIslands,
+ FeatureLDSBankCount32]>;
+
def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0,
[FeatureSeaIslands,
FeatureLDSBankCount32]>;
@@ -371,6 +502,10 @@ def FeatureISAVersion7_0_2 : SubtargetFeatureISAVersion <7,0,2,
[FeatureSeaIslands,
FeatureLDSBankCount16]>;
+def FeatureISAVersion7_0_3 : SubtargetFeatureISAVersion <7,0,3,
+ [FeatureSeaIslands,
+ FeatureLDSBankCount16]>;
+
def FeatureISAVersion8_0_0 : SubtargetFeatureISAVersion <8,0,0,
[FeatureVolcanicIslands,
FeatureLDSBankCount32,
@@ -399,6 +534,24 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
FeatureLDSBankCount16,
FeatureXNACK]>;
+def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
+ [FeatureGFX9,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1,
+ [FeatureGFX9,
+ FeatureLDSBankCount32,
+ FeatureXNACK]>;
+
+def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
+ [FeatureGFX9,
+ FeatureLDSBankCount32]>;
+
+def FeatureISAVersion9_0_3 : SubtargetFeatureISAVersion <9,0,3,
+ [FeatureGFX9,
+ FeatureLDSBankCount32,
+ FeatureXNACK]>;
+
//===----------------------------------------------------------------------===//
// Debugger related subtarget features.
//===----------------------------------------------------------------------===//
@@ -448,10 +601,12 @@ def AMDGPUAsmVariants {
int VOP3_ID = 1;
string SDWA = "SDWA";
int SDWA_ID = 2;
+ string SDWA9 = "SDWA9";
+ int SDWA9_ID = 3;
string DPP = "DPP";
- int DPP_ID = 3;
+ int DPP_ID = 4;
string Disable = "Disable";
- int Disable_ID = 4;
+ int Disable_ID = 5;
}
def DefaultAMDGPUAsmParserVariant : AsmParserVariant {
@@ -469,6 +624,12 @@ def SDWAAsmParserVariant : AsmParserVariant {
let Name = AMDGPUAsmVariants.SDWA;
}
+def SDWA9AsmParserVariant : AsmParserVariant {
+ let Variant = AMDGPUAsmVariants.SDWA9_ID;
+ let Name = AMDGPUAsmVariants.SDWA9;
+}
+
def DPPAsmParserVariant : AsmParserVariant {
let Variant = AMDGPUAsmVariants.DPP_ID;
let Name = AMDGPUAsmVariants.DPP;
@@ -481,6 +642,7 @@ def AMDGPU : Target {
let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant,
VOP3AsmParserVariant,
SDWAAsmParserVariant,
+ SDWA9AsmParserVariant,
DPPAsmParserVariant];
let AssemblyWriters = [AMDGPUAsmWriter];
}
@@ -504,14 +666,34 @@ def isVI : Predicate <
"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
AssemblerPredicate<"FeatureGCN3Encoding">;
+def isGFX9 : Predicate <
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
+ AssemblerPredicate<"FeatureGFX9Insts">;
+
+// TODO: Either change the name or simply use IsCI!
def isCIVI : Predicate <
- "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
- "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
->, AssemblerPredicate<"FeatureCIInsts">;
+ "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
+ AssemblerPredicate<"FeatureCIInsts">;
+
+def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
+ AssemblerPredicate<"FeatureFlatAddressSpace">;
+
+def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
+ AssemblerPredicate<"FeatureFlatGlobalInsts">;
+
+def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
+ AssemblerPredicate<"Feature16BitInsts">;
+def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
+ AssemblerPredicate<"FeatureVOP3P">;
+
+def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
-def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
+def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">,
+ AssemblerPredicate<"FeatureSDWA,FeatureGFX9">;
-def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
+def HasDPP : Predicate<"Subtarget->hasDPP()">,
+ AssemblerPredicate<"FeatureDPP">;
class PredicateControl {
Predicate SubtargetPredicate;
@@ -532,5 +714,6 @@ include "Processors.td"
include "AMDGPUInstrInfo.td"
include "AMDGPUIntrinsics.td"
include "AMDGPURegisterInfo.td"
+include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
include "AMDGPUCallingConv.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
new file mode 100644
index 0000000..faa424e
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -0,0 +1,147 @@
+//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is the AMDGPU address space based alias analysis pass.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUAliasAnalysis.h"
+#include "AMDGPU.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-aa"
+
+// Register this pass...
+char AMDGPUAAWrapperPass::ID = 0;
+INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
+ "AMDGPU Address space based Alias Analysis", false, true)
+
+ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
+ return new AMDGPUAAWrapperPass();
+}
+
+void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+}
+
+// Must match the table in getAliasResult.
+AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_,
+                                               Triple::ArchType Arch_)
+ : Arch(Arch_), AS(AS_) {
+  // These arrays are indexed by address space values 0 through
+  // MAX_COMMON_ADDRESS (5).
+ static const AliasResult ASAliasRulesPrivIsZero[6][6] = {
+ /* Private Global Constant Group Flat Region*/
+ /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
+ /* Global */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias},
+ /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias},
+ /* Group */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias},
+ /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+ /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
+ };
+ static const AliasResult ASAliasRulesGenIsZero[6][6] = {
+    /*             Flat      Global    Constant  Group     Region    Private */
+    /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+    /* Global   */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias },
+    /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias , NoAlias },
+    /* Group    */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias },
+    /* Region   */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias },
+    /* Private  */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias}
+ };
+ assert(AS.MAX_COMMON_ADDRESS <= 5);
+ if (AS.FLAT_ADDRESS == 0) {
+ assert(AS.GLOBAL_ADDRESS == 1 &&
+ AS.REGION_ADDRESS == 4 &&
+ AS.LOCAL_ADDRESS == 3 &&
+ AS.CONSTANT_ADDRESS == 2 &&
+ AS.PRIVATE_ADDRESS == 5);
+ ASAliasRules = &ASAliasRulesGenIsZero;
+ } else {
+ assert(AS.PRIVATE_ADDRESS == 0 &&
+ AS.GLOBAL_ADDRESS == 1 &&
+ AS.CONSTANT_ADDRESS == 2 &&
+ AS.LOCAL_ADDRESS == 3 &&
+ AS.FLAT_ADDRESS == 4 &&
+ AS.REGION_ADDRESS == 5);
+ ASAliasRules = &ASAliasRulesPrivIsZero;
+ }
+}
+
+AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1,
+ unsigned AS2) const {
+ if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) {
+ if (Arch == Triple::amdgcn)
+ report_fatal_error("Pointer address space out of range");
+ return AS1 == AS2 ? MayAlias : NoAlias;
+ }
+
+ return (*ASAliasRules)[AS1][AS2];
+}
+
+AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
+ const MemoryLocation &LocB) {
+ unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
+ unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
+
+ AliasResult Result = ASAliasRules.getAliasResult(asA, asB);
+ if (Result == NoAlias) return Result;
+
+ // Forward the query to the next alias analysis.
+ return AAResultBase::alias(LocA, LocB);
+}
+
+bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
+ bool OrLocal) {
+ const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
+
+ if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+ return true;
+ }
+
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
+ if (GV->isConstant())
+ return true;
+ } else if (const Argument *Arg = dyn_cast<Argument>(Base)) {
+ const Function *F = Arg->getParent();
+
+ // Only assume constant memory for arguments on kernels.
+ switch (F->getCallingConv()) {
+ default:
+ return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ break;
+ }
+
+ unsigned ArgNo = Arg->getArgNo();
+ /* On an argument, ReadOnly attribute indicates that the function does
+ not write through this pointer argument, even though it may write
+ to the memory that the pointer points to.
+ On an argument, ReadNone attribute indicates that the function does
+ not dereference that pointer argument, even though it may read or write
+ the memory that the pointer points to if accessed through other pointers.
+ */
+ if (F->hasParamAttribute(ArgNo, Attribute::NoAlias) &&
+ (F->hasParamAttribute(ArgNo, Attribute::ReadNone) ||
+ F->hasParamAttribute(ArgNo, Attribute::ReadOnly))) {
+ return true;
+ }
+ }
+ return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+}
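
getAliasResult is a straight table lookup, and the tables must be symmetric since alias(A, B) and alias(B, A) have to agree. A tiny self-check over the Flat/Global/Constant corner of the generic-is-zero table above:

#include <cassert>

enum Res { No, May };

int main() {
  // Flat/Global/Constant corner of ASAliasRulesGenIsZero above.
  const Res Rules[3][3] = {
      /* Flat     */ {May, May, May},
      /* Global   */ {May, May, No },
      /* Constant */ {May, No,  May},
  };
  for (int I = 0; I != 3; ++I)
    for (int J = 0; J != 3; ++J)
      assert(Rules[I][J] == Rules[J][I] && "alias relation is symmetric");
}
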
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
new file mode 100644
index 0000000..5f8ed9b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -0,0 +1,102 @@
+//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is the AMDGPU address space based alias analysis pass.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// A simple AA result that answers queries based on AMDGPU address spaces.
+class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
+ friend AAResultBase<AMDGPUAAResult>;
+
+ const DataLayout &DL;
+ AMDGPUAS AS;
+
+public:
+ explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(),
+ DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {}
+ AMDGPUAAResult(AMDGPUAAResult &&Arg)
+ : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS),
+      ASAliasRules(Arg.ASAliasRules) {}
+
+ /// Handle invalidation events from the new pass manager.
+ ///
+ /// By definition, this result is stateless and so remains valid.
+ bool invalidate(Function &, const PreservedAnalyses &) { return false; }
+
+ AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
+ bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal);
+
+private:
+ bool Aliases(const MDNode *A, const MDNode *B) const;
+ bool PathAliases(const MDNode *A, const MDNode *B) const;
+
+ class ASAliasRulesTy {
+ public:
+ ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_);
+ AliasResult getAliasResult(unsigned AS1, unsigned AS2) const;
+ private:
+ Triple::ArchType Arch;
+ AMDGPUAS AS;
+ const AliasResult (*ASAliasRules)[6][6];
+ } ASAliasRules;
+};
+
+/// Analysis pass providing a never-invalidated alias analysis result.
+class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> {
+ friend AnalysisInfoMixin<AMDGPUAA>;
+ static char PassID;
+
+public:
+ typedef AMDGPUAAResult Result;
+
+ AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) {
+ return AMDGPUAAResult(F.getParent()->getDataLayout(),
+ Triple(F.getParent()->getTargetTriple()));
+ }
+};
+
+/// Legacy wrapper pass to provide the AMDGPUAAResult object.
+class AMDGPUAAWrapperPass : public ImmutablePass {
+ std::unique_ptr<AMDGPUAAResult> Result;
+
+public:
+ static char ID;
+
+ AMDGPUAAWrapperPass() : ImmutablePass(ID) {
+ initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ AMDGPUAAResult &getResult() { return *Result; }
+ const AMDGPUAAResult &getResult() const { return *Result; }
+
+ bool doInitialization(Module &M) override {
+ Result.reset(new AMDGPUAAResult(M.getDataLayout(),
+ Triple(M.getTargetTriple())));
+ return false;
+ }
+ bool doFinalization(Module &M) override {
+ Result.reset();
+ return false;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // end namespace llvm
+#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
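getAliasResult boils down to a constant-time lookup in a 6x6 matrix indexed by the two address-space numbers. A stripped-down sketch of that idea with placeholder table contents (the real rules are filled in per target by ASAliasRulesTy's constructor; the NoAlias pair below is illustrative only):

    #include "llvm/Analysis/AliasAnalysis.h"
    using namespace llvm;

    // Sketch of the matrix lookup behind ASAliasRulesTy::getAliasResult.
    static AliasResult lookupASAlias(unsigned AS1, unsigned AS2) {
      if (AS1 > 5 || AS2 > 5)   // Unknown address space: be conservative.
        return MayAlias;
      static bool Init = false;
      static AliasResult Rules[6][6];
      if (!Init) {
        for (auto &Row : Rules)
          for (auto &R : Row)
            R = MayAlias;
        Rules[3][5] = Rules[5][3] = NoAlias; // e.g. local vs. private
        Init = true;
      }
      return Rules[AS1][AS2];
    }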
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 067a16a..6f3742e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -9,7 +9,7 @@
//
/// \file
/// This pass marks all internal functions as always_inline and creates
-/// duplicates of all other functions a marks the duplicates as always_inline.
+/// duplicates of all other functions and marks the duplicates as always_inline.
//
//===----------------------------------------------------------------------===//
@@ -22,16 +22,22 @@ using namespace llvm;
namespace {
class AMDGPUAlwaysInline : public ModulePass {
- static char ID;
+ bool GlobalOpt;
public:
- AMDGPUAlwaysInline() : ModulePass(ID) { }
+ static char ID;
+
+ AMDGPUAlwaysInline(bool GlobalOpt = false) :
+ ModulePass(ID), GlobalOpt(GlobalOpt) { }
bool runOnModule(Module &M) override;
StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
};
} // End anonymous namespace
+INITIALIZE_PASS(AMDGPUAlwaysInline, "amdgpu-always-inline",
+ "AMDGPU Inline All Functions", false, false)
+
char AMDGPUAlwaysInline::ID = 0;
bool AMDGPUAlwaysInline::runOnModule(Module &M) {
@@ -45,8 +51,10 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
}
}
- for (GlobalAlias* A : AliasesToRemove) {
- A->eraseFromParent();
+ if (GlobalOpt) {
+ for (GlobalAlias* A : AliasesToRemove) {
+ A->eraseFromParent();
+ }
}
for (Function &F : M) {
@@ -70,6 +78,6 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
return false;
}
-ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
- return new AMDGPUAlwaysInline();
+ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
+ return new AMDGPUAlwaysInline(GlobalOpt);
}
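With the new flag, alias erasure happens only when the pass is created for the global-opt pipeline. A sketch of scheduling it under the legacy pass manager; the declaration shape is assumed from the .cpp above (the header is not shown in this patch), and addExampleIPOPasses is illustrative:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Pass.h"

    // Assumed declaration (lives in AMDGPU.h; default argument not shown):
    namespace llvm { ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt); }

    void addExampleIPOPasses(llvm::legacy::PassManagerBase &PM) {
      // Optimizing (global-opt) build: also erase the collected aliases.
      PM.add(llvm::createAMDGPUAlwaysInlinePass(/*GlobalOpt=*/true));
    }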
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index c98d25e2..c68e586 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -13,8 +13,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -24,31 +28,34 @@ using namespace llvm;
namespace {
-class AMDGPUAnnotateKernelFeatures : public ModulePass {
+class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
- static bool hasAddrSpaceCast(const Function &F);
+ const TargetMachine *TM = nullptr;
+ AMDGPUAS AS;
- void addAttrToCallers(Function *Intrin, StringRef AttrName);
- bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
+ bool addFeatureAttributes(Function &F);
public:
static char ID;
- AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
- bool runOnModule(Module &M) override;
+ AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
+
+ bool doInitialization(CallGraph &CG) override;
+ bool runOnSCC(CallGraphSCC &SCC) override;
StringRef getPassName() const override {
return "AMDGPU Annotate Kernel Features";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
- ModulePass::getAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
}
- static bool visitConstantExpr(const ConstantExpr *CE);
+ static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+ AMDGPUAS AS);
};
}
@@ -62,18 +69,20 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
// The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS) {
- return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
+ return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
}
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
- return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
+ const AMDGPUAS &AS) {
+ return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
}
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
+ AMDGPUAS AS) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
- return castRequiresQueuePtr(SrcAS);
+ return castRequiresQueuePtr(SrcAS, AS);
}
return false;
@@ -81,7 +90,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+ AMDGPUAS AS) {
if (!ConstantExprVisited.insert(EntryC).second)
return false;
@@ -94,7 +104,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
- if (visitConstantExpr(CE))
+ if (visitConstantExpr(CE, AS))
return true;
}
@@ -114,15 +124,130 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
return false;
}
-// Return true if an addrspacecast is used that requires the queue ptr.
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+// We do not need to note the x workitem or workgroup id because they are always
+// initialized.
+//
+// TODO: We should not add the attributes if the known compile time workgroup
+// size is 1 for y/z.
+static StringRef intrinsicToAttrName(Intrinsic::ID ID,
+ bool &NonKernelOnly,
+ bool &IsQueuePtr) {
+ switch (ID) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-item-id-x";
+ case Intrinsic::amdgcn_workgroup_id_x:
+ NonKernelOnly = true;
+ return "amdgpu-work-group-id-x";
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ return "amdgpu-work-item-id-y";
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ return "amdgpu-work-item-id-z";
+ case Intrinsic::amdgcn_workgroup_id_y:
+ case Intrinsic::r600_read_tgid_y:
+ return "amdgpu-work-group-id-y";
+ case Intrinsic::amdgcn_workgroup_id_z:
+ case Intrinsic::r600_read_tgid_z:
+ return "amdgpu-work-group-id-z";
+ case Intrinsic::amdgcn_dispatch_ptr:
+ return "amdgpu-dispatch-ptr";
+ case Intrinsic::amdgcn_dispatch_id:
+ return "amdgpu-dispatch-id";
+ case Intrinsic::amdgcn_kernarg_segment_ptr:
+ case Intrinsic::amdgcn_implicitarg_ptr:
+ return "amdgpu-kernarg-segment-ptr";
+ case Intrinsic::amdgcn_queue_ptr:
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ IsQueuePtr = true;
+ return "amdgpu-queue-ptr";
+ default:
+ return "";
+ }
+}
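A worked example of the mapping; the results follow directly from the cases in the switch above:

    bool NonKernelOnly = false;
    bool IsQueuePtr = false;

    // workitem.id.x is always initialized for kernels, so its attribute is
    // only interesting on callable (non-entry) functions.
    StringRef A = intrinsicToAttrName(Intrinsic::amdgcn_workitem_id_x,
                                      NonKernelOnly, IsQueuePtr);
    // A == "amdgpu-work-item-id-x", NonKernelOnly == true, IsQueuePtr == false

    // trap implies a queue-pointer requirement.
    A = intrinsicToAttrName(Intrinsic::trap, NonKernelOnly, IsQueuePtr);
    // A == "amdgpu-queue-ptr", IsQueuePtr is now true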
+
+static bool handleAttr(Function &Parent, const Function &Callee,
+ StringRef Name) {
+ if (Callee.hasFnAttribute(Name)) {
+ Parent.addFnAttr(Name);
+ return true;
+ }
+
+ return false;
+}
+
+static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
+ bool &NeedQueuePtr) {
+  // Note that the x ids are unnecessarily propagated to kernels here.
+ static const StringRef AttrNames[] = {
+ { "amdgpu-work-item-id-x" },
+ { "amdgpu-work-item-id-y" },
+ { "amdgpu-work-item-id-z" },
+ { "amdgpu-work-group-id-x" },
+ { "amdgpu-work-group-id-y" },
+ { "amdgpu-work-group-id-z" },
+ { "amdgpu-dispatch-ptr" },
+ { "amdgpu-dispatch-id" },
+ { "amdgpu-kernarg-segment-ptr" }
+ };
+
+ if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
+ NeedQueuePtr = true;
+
+ for (StringRef AttrName : AttrNames)
+ handleAttr(Parent, Callee, AttrName);
+}
+
+bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ bool HasFlat = ST.hasFlatAddressSpace();
+ bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
+ bool Changed = false;
+ bool NeedQueuePtr = false;
+ bool HaveCall = false;
+ bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
+
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ CallSite CS(&I);
+ if (CS) {
+ Function *Callee = CS.getCalledFunction();
+
+ // TODO: Do something with indirect calls.
+ if (!Callee) {
+ if (!CS.isInlineAsm())
+ HaveCall = true;
+ continue;
+ }
+
+ Intrinsic::ID IID = Callee->getIntrinsicID();
+ if (IID == Intrinsic::not_intrinsic) {
+ HaveCall = true;
+ copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
+ Changed = true;
+ } else {
+ bool NonKernelOnly = false;
+ StringRef AttrName = intrinsicToAttrName(IID,
+ NonKernelOnly, NeedQueuePtr);
+ if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
+ F.addFnAttr(AttrName);
+ Changed = true;
+ }
+ }
+ }
+
+ if (NeedQueuePtr || HasApertureRegs)
+ continue;
+
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (castRequiresQueuePtr(ASC))
- return true;
+ if (castRequiresQueuePtr(ASC, AS)) {
+ NeedQueuePtr = true;
+ continue;
+ }
}
for (const Use &U : I.operands()) {
@@ -130,93 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
if (!OpC)
continue;
- if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
- return true;
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
+ NeedQueuePtr = true;
+ break;
+ }
}
}
}
- return false;
-}
-
-void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin,
- StringRef AttrName) {
- SmallPtrSet<Function *, 4> SeenFuncs;
-
- for (User *U : Intrin->users()) {
- // CallInst is the only valid user for an intrinsic.
- CallInst *CI = cast<CallInst>(U);
-
- Function *CallingFunction = CI->getParent()->getParent();
- if (SeenFuncs.insert(CallingFunction).second)
- CallingFunction->addFnAttr(AttrName);
+ if (NeedQueuePtr) {
+ F.addFnAttr("amdgpu-queue-ptr");
+ Changed = true;
}
-}
-
-bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
- Module &M,
- ArrayRef<StringRef[2]> IntrinsicToAttr) {
- bool Changed = false;
- for (const StringRef *Arr : IntrinsicToAttr) {
- if (Function *Fn = M.getFunction(Arr[0])) {
- addAttrToCallers(Fn, Arr[1]);
- Changed = true;
- }
+ // TODO: We could refine this to captured pointers that could possibly be
+ // accessed by flat instructions. For now this is mostly a poor way of
+ // estimating whether there are calls before argument lowering.
+ if (HasFlat && !IsFunc && HaveCall) {
+ F.addFnAttr("amdgpu-flat-scratch");
+ Changed = true;
}
return Changed;
}
-bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
+bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
+ Module &M = SCC.getCallGraph().getModule();
Triple TT(M.getTargetTriple());
- static const StringRef IntrinsicToAttr[][2] = {
- // .x omitted
- { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" },
- { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" },
-
- { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" },
- { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" },
-
- { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" },
- { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" },
-
- // .x omitted
- { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" },
- { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" }
- };
-
- static const StringRef HSAIntrinsicToAttr[][2] = {
- { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
- { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
- { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }
- };
-
- // TODO: We should not add the attributes if the known compile time workgroup
- // size is 1 for y/z.
-
- // TODO: Intrinsics that require queue ptr.
+ bool Changed = false;
+ for (CallGraphNode *I : SCC) {
+ Function *F = I->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
- // We do not need to note the x workitem or workgroup id because they are
- // always initialized.
+ Changed |= addFeatureAttributes(*F);
+ }
- bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr);
- if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) {
- Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr);
- for (Function &F : M) {
- if (F.hasFnAttribute("amdgpu-queue-ptr"))
- continue;
+ return Changed;
+}
- if (hasAddrSpaceCast(F))
- F.addFnAttr("amdgpu-queue-ptr");
- }
- }
+bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ report_fatal_error("TargetMachine is required");
- return Changed;
+ AS = AMDGPU::getAMDGPUAS(CG.getModule());
+ TM = &TPC->getTM<TargetMachine>();
+ return false;
}
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
+Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
return new AMDGPUAnnotateKernelFeatures();
}
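Because CallGraphSCCPass visits SCCs bottom-up, a callee's attributes are already final when its callers are processed, so one pass over the graph suffices. A worked illustration calling the file-local helpers (the driver function and the call graph it assumes are hypothetical):

    // Assumed call graph: kernel() --> helper() --> @llvm.amdgcn.workitem.id.y
    void illustratePropagation(Function &Kernel, Function &Helper) {
      // Bottom-up SCC order processes helper() first; its direct intrinsic
      // use adds the attribute there.
      Helper.addFnAttr("amdgpu-work-item-id-y");

      // When kernel() is processed later, copyFeaturesToFunction unions the
      // callee's attributes into the caller.
      bool NeedQueuePtr = false;
      copyFeaturesToFunction(Kernel, Helper, NeedQueuePtr);
      // Kernel now carries "amdgpu-work-item-id-y"; NeedQueuePtr stays false
      // because helper() never acquired "amdgpu-queue-ptr".
    }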
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index c011be6..ed53708 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -19,8 +19,8 @@
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -37,6 +37,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
LoopInfo *LI;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
bool isKernelFunc;
+ AMDGPUAS AMDGPUASI;
public:
static char ID;
@@ -106,11 +107,12 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
DFS(Start, Checklist);
for (auto &BB : Checklist) {
- BasicBlock::iterator StartIt = (BB == Load->getParent()) ?
- BasicBlock::iterator(Load) : BB->end();
- if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr),
- true, StartIt, BB, Load).isClobber())
- return true;
+ BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
+ BasicBlock::iterator(Load) : BB->end();
+ auto Q = MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true,
+ StartIt, BB, Load);
+ if (Q.isClobber() || Q.isUnknown())
+ return true;
}
return false;
}
@@ -130,8 +132,8 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
- auto isGlobalLoad = [](LoadInst &Load)->bool {
- return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ auto isGlobalLoad = [&](LoadInst &Load)->bool {
+ return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
};
   // We're only tracking dependencies up to the Function boundary; we cannot
   // go beyond it because of FunctionPass restrictions.
@@ -166,6 +168,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(M);
return false;
}
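The updated walk is stricter in two ways: an Unknown dependency now counts as a clobber, and the scan starts at the load itself only in its own block when no loop is involved. A condensed per-block sketch (the helper name is illustrative; the calls mirror the code above):

    static bool clobberedInBlock(MemoryDependenceResults &MDR, LoadInst *Load,
                                 Value *Ptr, BasicBlock *BB, bool InLoop) {
      BasicBlock::iterator StartIt =
          (!InLoop && BB == Load->getParent()) ? BasicBlock::iterator(Load)
                                               : BB->end();
      MemDepResult Q = MDR.getPointerDependencyFrom(MemoryLocation(Ptr),
                                                    /*isLoad=*/true, StartIt,
                                                    BB, Load);
      // Unknown means "can't analyze" -- treat it as a clobber too.
      return Q.isClobber() || Q.isUnknown();
    }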
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 974e79f..2247814 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -17,25 +17,25 @@
//
#include "AMDGPUAsmPrinter.h"
-#include "MCTargetDesc/AMDGPUTargetStreamer.h"
-#include "InstPrinter/AMDGPUInstPrinter.h"
-#include "Utils/AMDGPUBaseInfo.h"
#include "AMDGPU.h"
-#include "AMDKernelCodeT.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
-#include "SIMachineFunctionInfo.h"
#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -93,33 +93,40 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
+ : AsmPrinter(TM, std::move(Streamer)) {
+ AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
+ }
StringRef AMDGPUAsmPrinter::getPassName() const {
return "AMDGPU Assembly Printer";
}
+const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
+ return TM.getMCSubtargetInfo();
+}
+
+AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const {
+ return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer());
+}
+
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
- // Need to construct an MCSubtargetInfo here in case we have no functions
- // in the module.
- std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
- TM.getTargetTriple().str(), TM.getTargetCPU(),
- TM.getTargetFeatureString()));
-
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
- TS->EmitDirectiveHSACodeObjectVersion(2, 1);
+ getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1);
+ getTargetStreamer().EmitDirectiveHSACodeObjectISA(
+ ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
+ getTargetStreamer().EmitStartOfCodeObjectMetadata(M);
+}
- AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
- TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
- "AMD", "AMDGPU");
+void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
- // Emit runtime metadata.
- TS->EmitRuntimeMetadata(M);
+ getTargetStreamer().EmitEndOfCodeObjectMetadata();
}
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
@@ -136,25 +143,34 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
-
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
+ const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
+ if (!MFI->isEntryFunction())
+ return;
+
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- SIProgramInfo KernelInfo;
+ amd_kernel_code_t KernelCode;
if (STM.isAmdCodeObjectV2(*MF)) {
- getSIProgramInfo(KernelInfo, *MF);
- EmitAmdKernelCodeT(*MF, KernelInfo);
+ getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
+
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+ getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
}
+
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+ return;
+ getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(),
+ KernelCode);
}
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+ if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
SmallString<128> SymbolName;
     getNameWithPrefix(SymbolName, MF->getFunction());
- TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
+ getTargetStreamer().EmitAMDGPUSymbolType(
+ SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
}
AsmPrinter::EmitFunctionEntryLabel();
@@ -163,16 +179,37 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// Group segment variables aren't emitted in HSA.
- if (AMDGPU::isGroupSegment(GV))
+ if (AMDGPU::isGroupSegment(GV, AMDGPUASI))
return;
AsmPrinter::EmitGlobalVariable(GV);
}
+bool AMDGPUAsmPrinter::doFinalization(Module &M) {
+ CallGraphResourceInfo.clear();
+ return AsmPrinter::doFinalization(M);
+}
+
+// Print comments that apply to both callable functions and entry points.
+void AMDGPUAsmPrinter::emitCommonFunctionComments(
+ uint32_t NumVGPR,
+ uint32_t NumSGPR,
+ uint32_t ScratchSize,
+ uint64_t CodeSize) {
+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
+ OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
+ OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+ OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+}
+
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ CurrentProgramInfo = SIProgramInfo();
+
+ const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
   // The starting address of all shader programs must be 256-byte aligned.
- MF.setAlignment(8);
+ // Regular functions just need the basic required instruction alignment.
+ MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);
SetupMachineFunction(MF);
@@ -184,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->SwitchSection(ConfigSection);
}
- SIProgramInfo KernelInfo;
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- getSIProgramInfo(KernelInfo, MF);
+ if (MFI->isEntryFunction()) {
+ getSIProgramInfo(CurrentProgramInfo, MF);
+ } else {
+ auto I = CallGraphResourceInfo.insert(
+ std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = I.first->second;
+ assert(I.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF);
+ }
+
if (!STM.isAmdHsaOS()) {
- EmitProgramInfoSI(MF, KernelInfo);
+ EmitProgramInfoSI(MF, CurrentProgramInfo);
}
} else {
EmitProgramInfoR600(MF);
@@ -206,60 +251,78 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->SwitchSection(CommentSection);
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (!MFI->isEntryFunction()) {
+ OutStreamer->emitRawComment(" Function info:", false);
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
+ emitCommonFunctionComments(
+ Info.NumVGPR,
+ Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
+ Info.PrivateSegmentSize,
+ getFunctionCodeSize(MF));
+ return false;
+ }
+
OutStreamer->emitRawComment(" Kernel info:", false);
- OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
- false);
- OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
- false);
- OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
- false);
- OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
- false);
- OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
- false);
- OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
- false);
- OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
- " bytes/workgroup (compile time only)", false);
-
- OutStreamer->emitRawComment(" SGPRBlocks: " +
- Twine(KernelInfo.SGPRBlocks), false);
- OutStreamer->emitRawComment(" VGPRBlocks: " +
- Twine(KernelInfo.VGPRBlocks), false);
-
- OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
- Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
- OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
- Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
-
- OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
- false);
- OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
- false);
+ emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize,
+ getFunctionCodeSize(MF));
+
+ OutStreamer->emitRawComment(
+ " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
+ OutStreamer->emitRawComment(
+ " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
+ OutStreamer->emitRawComment(
+ " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
+
+ OutStreamer->emitRawComment(
+ " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+ OutStreamer->emitRawComment(
+ " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+
+ OutStreamer->emitRawComment(
+ " NumSGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(
+ " NumVGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+
+ OutStreamer->emitRawComment(
+ " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
+ false);
+ OutStreamer->emitRawComment(
+ " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
+ false);
if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
- OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
- Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
- OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
- Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+ OutStreamer->emitRawComment(
+ " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+ OutStreamer->emitRawComment(
+ " DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
}
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
- Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
- Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
- Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
- Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
- Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
- false);
-
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+ Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
+ false);
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer->emitRawComment(
@@ -300,7 +363,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
const MachineOperand &MO = MI.getOperand(op_idx);
if (!MO.isReg())
continue;
- unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
+ unsigned HWReg = RI->getHWRegIndex(MO.getReg());
     // Registers with an encoding value > 127 aren't GPRs.
if (HWReg > 127)
@@ -343,18 +406,12 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
}
}
-void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
- const MachineFunction &MF) const {
+uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- uint64_t CodeSize = 0;
- unsigned MaxSGPR = 0;
- unsigned MaxVGPR = 0;
- bool VCCUsed = false;
- bool FlatUsed = false;
- const SIRegisterInfo *RI = STM.getRegisterInfo();
const SIInstrInfo *TII = STM.getInstrInfo();
+ uint64_t CodeSize = 0;
+
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
// TODO: CodeSize should account for multiple functions.
@@ -363,196 +420,191 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (MI.isDebugValue())
continue;
- if (isVerbose())
- CodeSize += TII->getInstSizeInBytes(MI);
-
- unsigned numOperands = MI.getNumOperands();
- for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
- const MachineOperand &MO = MI.getOperand(op_idx);
- unsigned width = 0;
- bool isSGPR = false;
-
- if (!MO.isReg())
- continue;
-
- unsigned reg = MO.getReg();
- switch (reg) {
- case AMDGPU::EXEC:
- case AMDGPU::EXEC_LO:
- case AMDGPU::EXEC_HI:
- case AMDGPU::SCC:
- case AMDGPU::M0:
- continue;
-
- case AMDGPU::VCC:
- case AMDGPU::VCC_LO:
- case AMDGPU::VCC_HI:
- VCCUsed = true;
- continue;
+ CodeSize += TII->getInstSizeInBytes(MI);
+ }
+ }
- case AMDGPU::FLAT_SCR:
- case AMDGPU::FLAT_SCR_LO:
- case AMDGPU::FLAT_SCR_HI:
- // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
- // instructions aren't used to access the scratch buffer.
- if (MFI->hasFlatScratchInit())
- FlatUsed = true;
- continue;
+ return CodeSize;
+}
- case AMDGPU::TBA:
- case AMDGPU::TBA_LO:
- case AMDGPU::TBA_HI:
- case AMDGPU::TMA:
- case AMDGPU::TMA_LO:
- case AMDGPU::TMA_HI:
- llvm_unreachable("trap handler registers should not be used");
-
- default:
- break;
- }
-
- if (AMDGPU::SReg_32RegClass.contains(reg)) {
- assert(!AMDGPU::TTMP_32RegClass.contains(reg) &&
- "trap handler registers should not be used");
- isSGPR = true;
- width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
- isSGPR = false;
- width = 1;
- } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
- assert(!AMDGPU::TTMP_64RegClass.contains(reg) &&
- "trap handler registers should not be used");
- isSGPR = true;
- width = 2;
- } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
- isSGPR = false;
- width = 2;
- } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
- isSGPR = false;
- width = 3;
- } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
- isSGPR = true;
- width = 4;
- } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
- isSGPR = false;
- width = 4;
- } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
- isSGPR = true;
- width = 8;
- } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
- isSGPR = false;
- width = 8;
- } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
- isSGPR = true;
- width = 16;
- } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
- isSGPR = false;
- width = 16;
- } else {
- llvm_unreachable("Unknown register class");
- }
- unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
- unsigned maxUsed = hwReg + width - 1;
- if (isSGPR) {
- MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
- } else {
- MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
- }
- }
- }
+static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII,
+ unsigned Reg) {
+ for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
+ if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
+ return true;
}
- unsigned ExtraSGPRs = 0;
+ return false;
+}
+static unsigned getNumExtraSGPRs(const SISubtarget &ST,
+ bool VCCUsed,
+ bool FlatScrUsed) {
+ unsigned ExtraSGPRs = 0;
if (VCCUsed)
ExtraSGPRs = 2;
- if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
- if (FlatUsed)
+ if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
+ if (FlatScrUsed)
ExtraSGPRs = 4;
} else {
- if (STM.isXNACKEnabled())
+ if (ST.isXNACKEnabled())
ExtraSGPRs = 4;
- if (FlatUsed)
+ if (FlatScrUsed)
ExtraSGPRs = 6;
}
- // Record first reserved register and reserved register count fields, and
- // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
- // requested.
- ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
- ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
+ return ExtraSGPRs;
+}
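Worked cases for the reservation rules above; PreVIST and VIPlusST stand in for hypothetical subtargets on either side of VOLCANIC_ISLANDS:

    //   Pre-VI, VCC used, no flat scratch -> 2
    //   Pre-VI, flat scratch used         -> 4
    //   VI+,    XNACK on, no flat scratch -> 4
    //   VI+,    flat scratch used         -> 6 (highest requirement wins)
    assert(getNumExtraSGPRs(PreVIST, /*VCCUsed=*/true,
                            /*FlatScrUsed=*/false) == 2);
    assert(getNumExtraSGPRs(VIPlusST, /*VCCUsed=*/true,
                            /*FlatScrUsed=*/true) == 6);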
- // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
- // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
- // attribute was requested.
- if (STM.debuggerEmitPrologue()) {
- ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
- RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
- ProgInfo.DebuggerPrivateSegmentBufferSGPR =
- RI->getHWRegIndex(MFI->getScratchRSrcReg());
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
+ const SISubtarget &ST) const {
+ return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
+}
+
+AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
+ const MachineFunction &MF) const {
+ SIFunctionResourceInfo Info;
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
+
+ // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+ // instructions aren't used to access the scratch buffer. Inline assembly may
+ // need it though.
+ //
+ // If we only have implicit uses of flat_scr on flat instructions, it is not
+ // really needed.
+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+ (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
+ !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
+ Info.UsesFlatScratch = false;
+ }
+
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ Info.PrivateSegmentSize = FrameInfo.getStackSize();
+
+ Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
+
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
}
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
+ }
+
+ // We found the maximum register index. They start at 0, so add one to get the
+ // number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
+}
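The two register scans above share one shape; a generic sketch of that pattern, a refactoring this patch does not actually perform (llvm::reverse comes from ADT/STLExtras.h):

    template <typename RegRange>
    static unsigned countUsedRegs(const MachineRegisterInfo &MRI,
                                  const RegRange &Regs,
                                  const SIRegisterInfo &TRI) {
      // Walk from the highest register down; the first one the function
      // actually used bounds the count.
      for (MCPhysReg Reg : reverse(Regs))
        if (MRI.isPhysRegUsed(Reg))
          return TRI.getHWRegIndex(Reg) + 1; // HW indices are 0-based
      return 0;                              // no register of this class used
    }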
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+ const MachineFunction &MF) {
+ SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+
+ ProgInfo.NumVGPR = Info.NumVGPR;
+ ProgInfo.NumSGPR = Info.NumExplicitSGPR;
+ ProgInfo.ScratchSize = Info.PrivateSegmentSize;
+ ProgInfo.VCCUsed = Info.UsesVCC;
+ ProgInfo.FlatUsed = Info.UsesFlatScratch;
+ ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const SIInstrInfo *TII = STM.getInstrInfo();
+ const SIRegisterInfo *RI = &TII->getRegisterInfo();
+
+ unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
+ ProgInfo.VCCUsed,
+ ProgInfo.FlatUsed);
+ unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
+
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
!STM.hasSGPRInitBug()) {
- unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs();
- if (MaxSGPR + 1 > MaxAddressableNumSGPRs) {
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+ if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm.
LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
"addressable scalar registers",
- MaxSGPR + 1, DS_Error,
- DK_ResourceLimit, MaxAddressableNumSGPRs);
+ ProgInfo.NumSGPR, DS_Error,
+ DK_ResourceLimit,
+ MaxAddressableNumSGPRs);
Ctx.diagnose(Diag);
- MaxSGPR = MaxAddressableNumSGPRs - 1;
+ ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
}
}
// Account for extra SGPRs and VGPRs reserved for debugger use.
- MaxSGPR += ExtraSGPRs;
- MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
-
- // We found the maximum register index. They start at 0, so add one to get the
- // number of registers.
- ProgInfo.NumVGPR = MaxVGPR + 1;
- ProgInfo.NumSGPR = MaxSGPR + 1;
+ ProgInfo.NumSGPR += ExtraSGPRs;
+ ProgInfo.NumVGPR += ExtraVGPRs;
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
ProgInfo.NumSGPRsForWavesPerEU = std::max(
- ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
+ std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
ProgInfo.NumVGPRsForWavesPerEU = std::max(
- ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+ std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
- unsigned MaxNumSGPRs = STM.getMaxNumSGPRs();
- if (ProgInfo.NumSGPR > MaxNumSGPRs) {
- // This can happen due to a compiler bug or when using inline asm to use the
- // registers which are usually reserved for vcc etc.
-
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+ if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+ // This can happen due to a compiler bug or when using inline asm to use
+ // the registers which are usually reserved for vcc etc.
LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
"scalar registers",
ProgInfo.NumSGPR, DS_Error,
- DK_ResourceLimit, MaxNumSGPRs);
+ DK_ResourceLimit,
+ MaxAddressableNumSGPRs);
Ctx.diagnose(Diag);
- ProgInfo.NumSGPR = MaxNumSGPRs;
- ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs;
+ ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
+ ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
}
}
if (STM.hasSGPRInitBug()) {
- ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
- ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ ProgInfo.NumSGPR =
+ AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ ProgInfo.NumSGPRsForWavesPerEU =
+ AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
}
- if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
+ if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
LLVMContext &Ctx = MF.getFunction()->getContext();
DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
- MFI->NumUserSGPRs, DS_Error);
+ MFI->getNumUserSGPRs(), DS_Error);
Ctx.diagnose(Diag);
}
@@ -565,13 +617,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// SGPRBlocks is actual number of SGPR blocks minus 1.
ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
- RI->getSGPRAllocGranule());
- ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
+ STM.getSGPREncodingGranule());
+ ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;
// VGPRBlocks is actual number of VGPR blocks minus 1.
ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
- RI->getVGPRAllocGranule());
- ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
+ STM.getVGPREncodingGranule());
+ ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;
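The SGPRBlocks/VGPRBlocks fields encode "number of granules minus one". A small worked example, assuming a VGPR encoding granule of 4:

    #include "llvm/Support/MathExtras.h"

    unsigned exampleVGPRBlocks() {
      unsigned Granule  = 4;
      unsigned NumVGPRs = 41;
      // alignTo(41, 4) == 44; 44 / 4 - 1 == 10, i.e. 11 granules = 44 VGPRs.
      return llvm::alignTo(NumVGPRs, Granule) / Granule - 1; // == 10
    }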
+
+ // Record first reserved VGPR and number of reserved VGPRs.
+ ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0;
+ ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);
+
+ // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+ // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+ // attribute was requested.
+ if (STM.debuggerEmitPrologue()) {
+ ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+ ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+ RI->getHWRegIndex(MFI->getScratchRSrcReg());
+ }
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
@@ -580,14 +646,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
// Make clamp modifier on NaN input returns 0.
- ProgInfo.DX10Clamp = 1;
-
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- ProgInfo.ScratchSize = FrameInfo.getStackSize();
-
- ProgInfo.FlatUsed = FlatUsed;
- ProgInfo.VCCUsed = VCCUsed;
- ProgInfo.CodeLen = CodeSize;
+ ProgInfo.DX10Clamp = STM.enableDX10Clamp();
unsigned LDSAlignShift;
if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
@@ -599,7 +658,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
unsigned LDSSpillSize =
- MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize();
+ MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();
ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
ProgInfo.LDSBlocks =
@@ -635,13 +694,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+ S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
S_00B84C_EXCP_EN_MSB(0) |
- S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
+ // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
+ S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
}
@@ -649,6 +710,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
switch (CallConv) {
default: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
+ case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
@@ -656,7 +718,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) {
+ const SIProgramInfo &CurrentProgramInfo) {
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
@@ -664,31 +726,31 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
- OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
+ OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
+ OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+ OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
- S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+ OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+ S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+ OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
}
}
if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
+ OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+ OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
}
@@ -713,97 +775,91 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
}
}
-void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) const {
+void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
+ const SIProgramInfo &CurrentProgramInfo,
+ const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- amd_kernel_code_t header;
- AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
- header.compute_pgm_resource_registers =
- KernelInfo.ComputePGMRSrc1 |
- (KernelInfo.ComputePGMRSrc2 << 32);
- header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+ Out.compute_pgm_resource_registers =
+ CurrentProgramInfo.ComputePGMRSrc1 |
+ (CurrentProgramInfo.ComputePGMRSrc2 << 32);
+ Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+ if (CurrentProgramInfo.DynamicCallStack)
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
- AMD_HSA_BITS_SET(header.code_properties,
+ AMD_HSA_BITS_SET(Out.code_properties,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
getElementByteSizeValue(STM.getMaxPrivateElementSize()));
if (MFI->hasPrivateSegmentBuffer()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
if (MFI->hasDispatchPtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
if (MFI->hasQueuePtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
if (MFI->hasKernargSegmentPtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
if (MFI->hasDispatchID())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
if (MFI->hasFlatScratchInit())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
-
- // TODO: Private segment size
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
if (MFI->hasGridWorkgroupCountX()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
}
if (MFI->hasGridWorkgroupCountY()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
}
if (MFI->hasGridWorkgroupCountZ()) {
- header.code_properties |=
+ Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
}
if (MFI->hasDispatchPtr())
- header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
if (STM.debuggerSupported())
- header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
if (STM.isXNACKEnabled())
- header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
// FIXME: Should use getKernArgSize
- header.kernarg_segment_byte_size =
+ Out.kernarg_segment_byte_size =
STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
- header.wavefront_sgpr_count = KernelInfo.NumSGPR;
- header.workitem_vgpr_count = KernelInfo.NumVGPR;
- header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
- header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
- header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
- header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+ Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
+ Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
+ Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+ Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
+ Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
+ Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
- header.kernarg_segment_alignment = std::max((size_t)4,
+ Out.kernarg_segment_alignment = std::max((size_t)4,
countTrailingZeros(MFI->getMaxKernArgAlign()));
if (STM.debuggerEmitPrologue()) {
- header.debug_wavefront_private_segment_offset_sgpr =
- KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
- header.debug_private_segment_buffer_sgpr =
- KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+ Out.debug_wavefront_private_segment_offset_sgpr =
+ CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ Out.debug_private_segment_buffer_sgpr =
+ CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
}
-
- AMDGPUTargetStreamer *TS =
- static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
-
- OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
- TS->EmitAMDKernelCodeT(header);
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
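One detail worth a worked example: kernarg_segment_alignment stores an exponent n meaning 2^n bytes, clamped to a minimum of 2^4 = 16. A sketch recomputing it (the function name is illustrative):

    #include "llvm/Support/MathExtras.h"
    #include <algorithm>
    #include <cstddef>

    // Recomputes the exponent stored in kernarg_segment_alignment above.
    size_t alignmentExponent(size_t MaxKernArgAlign) {
      return std::max((size_t)4,
                      (size_t)llvm::countTrailingZeros(MaxKernArgAlign));
    }
    // alignmentExponent(8)  == 4 -> 2^4 = 16 bytes (the enforced minimum)
    // alignmentExponent(64) == 6 -> 2^6 = 64 bytes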
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 9a4bafe..0a58ce0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -15,95 +15,110 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
-#include "AMDGPUMCInstLower.h"
-
+#include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/AsmPrinter.h"
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
#include <vector>
namespace llvm {
+
+class AMDGPUTargetStreamer;
class MCOperand;
+class SISubtarget;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
- struct SIProgramInfo {
- SIProgramInfo() :
- VGPRBlocks(0),
- SGPRBlocks(0),
- Priority(0),
- FloatMode(0),
- Priv(0),
- DX10Clamp(0),
- DebugMode(0),
- IEEEMode(0),
- ScratchSize(0),
- ComputePGMRSrc1(0),
- LDSBlocks(0),
- ScratchBlocks(0),
- ComputePGMRSrc2(0),
- NumVGPR(0),
- NumSGPR(0),
- FlatUsed(false),
- NumSGPRsForWavesPerEU(0),
- NumVGPRsForWavesPerEU(0),
- ReservedVGPRFirst(0),
- ReservedVGPRCount(0),
- DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
- DebuggerPrivateSegmentBufferSGPR((uint16_t)-1),
- VCCUsed(false),
- CodeLen(0) {}
+ // Track resource usage for callee functions.
+ struct SIFunctionResourceInfo {
+ // Track the number of explicitly used VGPRs. Special registers reserved at
+ // the end are tracked separately.
+ int32_t NumVGPR = 0;
+ int32_t NumExplicitSGPR = 0;
+ uint32_t PrivateSegmentSize = 0;
+ bool UsesVCC = false;
+ bool UsesFlatScratch = false;
+ bool HasDynamicallySizedStack = false;
+ bool HasRecursion = false;
+
+ int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
+ };
+ // Track resource usage for kernels / entry functions.
+ struct SIProgramInfo {
// Fields set in PGM_RSRC1 pm4 packet.
- uint32_t VGPRBlocks;
- uint32_t SGPRBlocks;
- uint32_t Priority;
- uint32_t FloatMode;
- uint32_t Priv;
- uint32_t DX10Clamp;
- uint32_t DebugMode;
- uint32_t IEEEMode;
- uint32_t ScratchSize;
-
- uint64_t ComputePGMRSrc1;
+ uint32_t VGPRBlocks = 0;
+ uint32_t SGPRBlocks = 0;
+ uint32_t Priority = 0;
+ uint32_t FloatMode = 0;
+ uint32_t Priv = 0;
+ uint32_t DX10Clamp = 0;
+ uint32_t DebugMode = 0;
+ uint32_t IEEEMode = 0;
+ uint32_t ScratchSize = 0;
+
+ uint64_t ComputePGMRSrc1 = 0;
// Fields set in PGM_RSRC2 pm4 packet.
- uint32_t LDSBlocks;
- uint32_t ScratchBlocks;
+ uint32_t LDSBlocks = 0;
+ uint32_t ScratchBlocks = 0;
- uint64_t ComputePGMRSrc2;
+ uint64_t ComputePGMRSrc2 = 0;
- uint32_t NumVGPR;
- uint32_t NumSGPR;
- uint32_t LDSSize;
- bool FlatUsed;
+ uint32_t NumVGPR = 0;
+ uint32_t NumSGPR = 0;
+ uint32_t LDSSize = 0;
+ bool FlatUsed = false;
// Number of SGPRs that meets number of waves per execution unit request.
- uint32_t NumSGPRsForWavesPerEU;
+ uint32_t NumSGPRsForWavesPerEU = 0;
// Number of VGPRs that meets number of waves per execution unit request.
- uint32_t NumVGPRsForWavesPerEU;
+ uint32_t NumVGPRsForWavesPerEU = 0;
// If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
// fixed VGPR number reserved.
- uint16_t ReservedVGPRFirst;
+ uint16_t ReservedVGPRFirst = 0;
// The number of consecutive VGPRs reserved.
- uint16_t ReservedVGPRCount;
+ uint16_t ReservedVGPRCount = 0;
// Fixed SGPR number used to hold wave scratch offset for entire kernel
- // execution, or uint16_t(-1) if the register is not used or not known.
- uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ // execution, or std::numeric_limits<uint16_t>::max() if the register is not
+ // used or not known.
+ uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR =
+ std::numeric_limits<uint16_t>::max();
// Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
- // kernel execution, or uint16_t(-1) if the register is not used or not
- // known.
- uint16_t DebuggerPrivateSegmentBufferSGPR;
+ // kernel execution, or std::numeric_limits<uint16_t>::max() if the register
+ // is not used or not known.
+ uint16_t DebuggerPrivateSegmentBufferSGPR =
+ std::numeric_limits<uint16_t>::max();
+
+ // Whether there is recursion, dynamic allocas, indirect calls or some other
+ // reason there may be statically unknown stack usage.
+ bool DynamicCallStack = false;
// Bonus information for debugging.
- bool VCCUsed;
- uint64_t CodeLen;
+ bool VCCUsed = false;
+
+ SIProgramInfo() = default;
};
- void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+ SIProgramInfo CurrentProgramInfo;
+ DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+
+ uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
+ SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
+
+ void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
+ void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
+ const MachineFunction &MF) const;
void findNumUsedRegistersSI(const MachineFunction &MF,
unsigned &NumSGPR,
unsigned &NumVGPR) const;
@@ -112,21 +127,33 @@ private:
/// can correctly setup the GPU state.
void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
- void EmitAmdKernelCodeT(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) const;
+ void emitCommonFunctionComments(uint32_t NumVGPR,
+ uint32_t NumSGPR,
+ uint32_t ScratchSize,
+ uint64_t CodeSize);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
- bool runOnMachineFunction(MachineFunction &MF) override;
-
StringRef getPassName() const override;
+ const MCSubtargetInfo *getSTI() const;
+
+ AMDGPUTargetStreamer &getTargetStreamer() const;
+
+ bool doFinalization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
/// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
/// pseudo lowering.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
+ /// \brief Lower the specified LLVM Constant to an MCExpr.
+ /// AsmPrinter::lowerConstant does not know how to lower an addrspacecast,
+ /// so such constants are lowered by this override instead.
+ const MCExpr *lowerConstant(const Constant *CV) override;
+
/// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
/// instructions.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
@@ -143,6 +170,8 @@ public:
void EmitStartOfAsmFile(Module &M) override;
+ void EmitEndOfAsmFile(Module &M) override;
+
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock *MBB) const override;
@@ -153,8 +182,9 @@ public:
protected:
std::vector<std::string> DisasmLines, HexLines;
size_t DisasmLineMaxLen;
+ AMDGPUAS AMDGPUASI;
};
-} // End anonymous llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index d53cc15..515cc07 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -14,8 +14,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUCallLowering.h"
+#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
-
+#include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -26,17 +31,138 @@ using namespace llvm;
#endif
AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
- : CallLowering(&TLI) {
+ : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) {
}
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
- const Value *Val, unsigned VReg) const {
+ const Value *Val, unsigned VReg) const {
+ MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
return true;
}
+unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
+ Type *ParamTy,
+ unsigned Offset) const {
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = *MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+ LLT PtrType = getLLTForType(*PtrTy, DL);
+ unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
+ unsigned KernArgSegmentPtr =
+ TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
+
+ unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+ MIRBuilder.buildConstant(OffsetReg, Offset);
+
+ MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
+
+ return DstReg;
+}
+
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
+ Type *ParamTy, unsigned Offset,
+ unsigned DstReg) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = *MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
+ unsigned Align = DL.getABITypeAlignment(ParamTy);
+ unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
+ MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant,
+ TypeSize, Align);
+
+ MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
+}
+
bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
- // TODO: Implement once there are generic loads/stores.
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+ if (Info->hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ CCInfo.AllocateReg(PrivateSegmentBufferReg);
+ }
+
+ if (Info->hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(DispatchPtrReg);
+ }
+
+ if (Info->hasQueuePtr()) {
+ unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(QueuePtrReg);
+ }
+
+ if (Info->hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ const LLT P2 = LLT::pointer(2, 64);
+ unsigned VReg = MRI.createGenericVirtualRegister(P2);
+ MRI.addLiveIn(InputPtrReg, VReg);
+ MIRBuilder.getMBB().addLiveIn(InputPtrReg);
+ MIRBuilder.buildCopy(VReg, InputPtrReg);
+ CCInfo.AllocateReg(InputPtrReg);
+ }
+
+ if (Info->hasDispatchID()) {
+ unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(DispatchIDReg);
+ }
+
+ if (Info->hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ // FIXME: Need to add reg as live-in
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
+ unsigned NumArgs = F.arg_size();
+ Function::const_arg_iterator CurOrigArg = F.arg_begin();
+ const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+ for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
+ MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT();
+ ISD::ArgFlagsTy Flags;
+ Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
+ CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
+ /*IsVarArg=*/false);
+ bool Res =
+ AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo);
+ assert(!Res && "Call operand has unhandled type");
+ (void)Res;
+ }
+
+ Function::const_arg_iterator Arg = F.arg_begin();
+ for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
+ // FIXME: We should be getting DebugInfo from the arguments somehow.
+ CCValAssign &VA = ArgLocs[i];
+ lowerParameter(MIRBuilder, Arg->getType(),
+ VA.getLocMemOffset() +
+ Subtarget->getExplicitKernelArgOffset(MF), VRegs[i]);
+ }
+
return true;
}
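
The loop above computes each argument's address as the kernarg segment base plus the CC-assigned offset plus the subtarget's explicit kernarg offset. A minimal sketch of that address arithmetic, with an assumed base value for illustration (the real value comes from getExplicitKernelArgOffset, not from this patch):

  #include <cstdint>

  // Hypothetical model of lowerParameterPtr's offset math; the 36-byte
  // explicit kernarg offset is an assumed example value.
  uint64_t kernArgAddress(uint64_t KernArgSegmentPtr, uint64_t LocMemOffset,
                          uint64_t ExplicitKernArgOffset = 36) {
    return KernArgSegmentPtr + ExplicitKernArgOffset + LocMemOffset;
  }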
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 9ae87c9..251cb7a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
+#include "AMDGPU.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
namespace llvm {
@@ -22,6 +23,14 @@ namespace llvm {
class AMDGPUTargetLowering;
class AMDGPUCallLowering: public CallLowering {
+ AMDGPUAS AMDGPUASI;
+
+ unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ unsigned Offset) const;
+
+ void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ unsigned Offset, unsigned DstReg) const;
+
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
@@ -29,6 +38,8 @@ class AMDGPUCallLowering: public CallLowering {
unsigned VReg) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
+ static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+ static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 47dfa49..4bef7a8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -13,11 +13,13 @@
// Inversion of CCIfInReg
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+class CCIfExtend<CCAction A>
+ : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
// Calling convention for SI
def CC_SI : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -25,17 +27,13 @@ def CC_SI : CallingConv<[
SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
]>>>,
- CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
- SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
- SGPR32, SGPR34, SGPR36, SGPR38 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
- SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
- SGPR33, SGPR35, SGPR37, SGPR39 ]
- >>>,
+ // We have no way of referring to the generated register tuples
+ // here, so use a custom function.
+ CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
+ CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -53,20 +51,10 @@ def CC_SI : CallingConv<[
VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
- ]>>>,
-
- CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
- [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
- SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
- SGPR32, SGPR34, SGPR36, SGPR38 ],
- [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
- SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
- SGPR33, SGPR35, SGPR37, SGPR39 ]
- >>>
-
+ ]>>>
]>;
-def RetCC_SI : CallingConv<[
+def RetCC_SI_Shader : CallingConv<[
CCIfType<[i32] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -76,7 +64,7 @@ def RetCC_SI : CallingConv<[
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
- CCIfType<[f32] , CCAssignToReg<[
+ CCIfType<[f32, f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -113,6 +101,52 @@ def CC_AMDGPU_Kernel : CallingConv<[
CCCustom<"allocateKernArg">
]>;
+def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
+ (sequence "VGPR%u", 24, 255)
+>;
+
+def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
+ (sequence "VGPR%u", 32, 255)
+>;
+
+def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
+ (sequence "SGPR%u", 32, 103)
+>;
+
+def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
+ (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+>;
+
+// Calling convention for leaf functions
+def CC_AMDGPU_Func : CallingConv<[
+ CCIfByVal<CCPassByVal<4, 4>>,
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+ CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+ CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+ CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+ CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+ CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+// Return value calling convention for leaf functions
+def RetCC_AMDGPU_Func : CallingConv<[
+ CCIfType<[i1], CCPromoteToType<i32>>,
+ CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+ CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+ VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+ VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+ VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+ CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+]>;
+
def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >="
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e623054..31ee920 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,16 +14,32 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
-
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/InstVisitor.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <iterator>
#define DEBUG_TYPE "amdgpu-codegenprepare"
@@ -33,18 +49,15 @@ namespace {
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
- const GCNTargetMachine *TM;
- const SISubtarget *ST;
- DivergenceAnalysis *DA;
- Module *Mod;
- bool HasUnsafeFPMath;
+ const SISubtarget *ST = nullptr;
+ DivergenceAnalysis *DA = nullptr;
+ Module *Mod = nullptr;
+ bool HasUnsafeFPMath = false;
/// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
///
/// \returns Binary operation \p V.
- Value *copyFlags(const BinaryOperator &I, Value *V) const;
-
/// \returns \p T's base element bit width.
unsigned getBaseElementBitWidth(const Type *T) const;
@@ -113,13 +126,8 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
public:
static char ID;
- AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
- FunctionPass(ID),
- TM(static_cast<const GCNTargetMachine *>(TM)),
- ST(nullptr),
- DA(nullptr),
- Mod(nullptr),
- HasUnsafeFPMath(false) { }
+
+ AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
bool visitFDiv(BinaryOperator &I);
@@ -142,22 +150,7 @@ public:
}
};
-} // End anonymous namespace
-
-Value *AMDGPUCodeGenPrepare::copyFlags(
- const BinaryOperator &I, Value *V) const {
- BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
- if (!BinOp) // Possibly constant expression.
- return V;
-
- if (isa<OverflowingBinaryOperator>(BinOp)) {
- BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
- BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- } else if (isa<PossiblyExactOperator>(BinOp))
- BinOp->setIsExact(I.isExact());
-
- return V;
-}
+} // end anonymous namespace
unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
assert(needsPromotionToI32(T) && "T does not need promotion to i32");
@@ -186,12 +179,48 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
}
bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
- if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
- T->getIntegerBitWidth() <= 16)
+ const IntegerType *IntTy = dyn_cast<IntegerType>(T);
+ if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
+ return true;
+
+ if (const VectorType *VT = dyn_cast<VectorType>(T)) {
+ // TODO: The set of packed operations is more limited, so may want to
+ // promote some anyway.
+ if (ST->hasVOP3PInsts())
+ return false;
+
+ return needsPromotionToI32(VT->getElementType());
+ }
+
+ return false;
+}
+
+// Return true if the op promoted to i32 should have nsw set.
+static bool promotedOpIsNSW(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::Add:
+ case Instruction::Sub:
return true;
- if (!T->isVectorTy())
+ case Instruction::Mul:
+ return I.hasNoUnsignedWrap();
+ default:
+ return false;
+ }
+}
+
+// Return true if the op promoted to i32 should have nuw set.
+static bool promotedOpIsNUW(const Instruction &I) {
+ switch (I.getOpcode()) {
+ case Instruction::Shl:
+ case Instruction::Add:
+ case Instruction::Mul:
+ return true;
+ case Instruction::Sub:
+ return I.hasNoUnsignedWrap();
+ default:
return false;
- return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+ }
}
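
The two tables above rest on a width argument: operands that fit in 16 bits cannot wrap a 32-bit add, sub, or shl once widened, so the promoted operation can safely carry nsw/nuw. A quick sanity check of the add case (illustrative only, not part of this patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    // Two maximal i16 values, widened to i32 as promoteUniformOpToI32 does.
    int32_t A = INT16_MAX, B = INT16_MAX;
    assert(A + B == 65534); // far below i32 limits: no signed/unsigned wrap
  }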
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -218,7 +247,19 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
}
- ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+
+ ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
+ if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
+ if (promotedOpIsNSW(cast<Instruction>(I)))
+ Inst->setHasNoSignedWrap();
+
+ if (promotedOpIsNUW(cast<Instruction>(I)))
+ Inst->setHasNoUnsignedWrap();
+
+ if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
+ Inst->setIsExact(ExactOp->isExact());
+ }
+
TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
I.replaceAllUsesWith(TruncRes);
@@ -339,16 +380,16 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
FastMathFlags FMF = FPOp->getFastMathFlags();
bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
FMF.allowReciprocal();
- if (ST->hasFP32Denormals() && !UnsafeDiv)
+
+ // With UnsafeDiv, the node will be optimized to just rcp and mul.
+ if (ST->hasFP32Denormals() || UnsafeDiv)
return false;
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
Builder.setFastMathFlags(FMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
- const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
- Function *Decl
- = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+ Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
Value *Num = FDiv.getOperand(0);
Value *Den = FDiv.getOperand(1);
@@ -447,10 +488,15 @@ bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
- if (!TM || skipFunction(F))
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
return false;
- ST = &TM->getSubtarget<SISubtarget>(F);
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ ST = &TM.getSubtarget<SISubtarget>(F);
DA = &getAnalysis<DivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -467,14 +513,14 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
return MadeChange;
}
-INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
-INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
- "AMDGPU IR optimizations", false, false)
+INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
+ false, false)
char AMDGPUCodeGenPrepare::ID = 0;
-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
- return new AMDGPUCodeGenPrepare(TM);
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
+ return new AMDGPUCodeGenPrepare();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 805fb71..e32ca96 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -12,11 +12,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Support/MathExtras.h"
using namespace llvm;
AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
@@ -69,34 +64,3 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
// T1.W = stack[1].w
return 1;
}
-
-/// \returns The number of registers allocated for \p FI.
-int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- const AMDGPURegisterInfo *RI
- = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
-
- // Fill in FrameReg output argument.
- FrameReg = RI->getFrameRegister(MF);
-
- // Start the offset at 2 so we don't overwrite work group information.
- // XXX: We should only do this when the shader actually uses this
- // information.
- unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
- int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
-
- for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
- OffsetBytes += MFI.getObjectSize(i);
- // Each register holds 4 bytes, so we must always align the offset to at
- // least 4 bytes, so that 2 frame objects won't share the same register.
- OffsetBytes = alignTo(OffsetBytes, 4);
- }
-
- if (FI != -1)
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
-
- return OffsetBytes / (getStackWidth(MF) * 4);
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 5d51351..8e187c7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -34,9 +34,6 @@ public:
/// values to the stack.
unsigned getStackWidth(const MachineFunction &MF) const;
- int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
-
bool hasFP(const MachineFunction &MF) const override {
return false;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
new file mode 100644
index 0000000..5cb9036
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -0,0 +1,62 @@
+//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by AMDGPURegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace llvm {
+namespace AMDGPU {
+
+enum PartialMappingIdx {
+ None = -1,
+ PM_SGPR32 = 0,
+ PM_SGPR64 = 1,
+ PM_VGPR32 = 2,
+ PM_VGPR64 = 3
+};
+
+const RegisterBankInfo::PartialMapping PartMappings[] {
+ // StartIdx, Length, RegBank
+ {0, 32, SGPRRegBank},
+ {0, 64, SGPRRegBank},
+ {0, 32, VGPRRegBank},
+ {0, 64, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappings[] {
+ // SGPR 32-bit
+ {&PartMappings[0], 1},
+ // SGPR 64-bit
+ {&PartMappings[1], 1},
+ // VGPR 32-bit
+ {&PartMappings[2], 1},
+ // VGPR 64-bit
+ {&PartMappings[3], 1}
+};
+
+enum ValueMappingIdx {
+ SGPRStartIdx = 0,
+ VGPRStartIdx = 2
+};
+
+const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
+ unsigned Size) {
+ assert(Size % 32 == 0);
+ unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx;
+ Idx += (Size / 32) - 1;
+ return &ValMappings[Idx];
+}
+
+} // end namespace AMDGPU
+} // end namespace llvm
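
To make the lookup concrete: getValueMapping starts the SGPR mappings at index 0 and the VGPR mappings at index 2, then adds one slot per extra 32 bits. For BankID = SGPRRegBankID and Size = 64, Idx = 0 + (64/32) - 1 = 1, the SGPR 64-bit entry; for a 64-bit VGPR value, Idx = 2 + (64/32) - 1 = 3. (Worked by hand from the tables above, not executed.)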
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5bf347e..f235313 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -13,15 +13,15 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterInfo.h"
-#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIRegisterInfo.h"
#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -67,10 +67,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for different targets.
const AMDGPUSubtarget *Subtarget;
+ AMDGPUAS AMDGPUASI;
public:
explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(TM, OptLevel) {}
+ : SelectionDAGISel(TM, OptLevel) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
+ }
~AMDGPUDAGToDAGISel() override = default;
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -79,7 +82,8 @@ public:
void PostprocessISelDAG() override;
private:
- SDValue foldFrameIndex(SDValue N) const;
+ std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
+ bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
const R600InstrInfo *TII);
@@ -112,8 +116,13 @@ private:
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
- bool SelectMUBUFScratch(SDValue Addr, SDValue &RSrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &ImmOffset) const;
+ bool SelectMUBUFScratchOffen(SDNode *Root,
+ SDValue Addr, SDValue &RSrc, SDValue &VAddr,
+ SDValue &SOffset, SDValue &ImmOffset) const;
+ bool SelectMUBUFScratchOffset(SDNode *Root,
+ SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
+ SDValue &Offset) const;
+
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
SDValue &Offset, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;
@@ -129,8 +138,10 @@ private:
bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset,
SDValue &ImmOffset, SDValue &VOffset) const;
- bool SelectFlat(SDValue Addr, SDValue &VAddr,
- SDValue &SLC, SDValue &TFE) const;
+ bool SelectFlatAtomic(SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, SDValue &SLC) const;
+ bool SelectFlatOffset(SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, SDValue &SLC) const;
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
@@ -143,20 +154,28 @@ private:
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+
+ bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp, SDValue &Omod) const;
- bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Omod) const;
bool SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, SDValue &SrcMods,
SDValue &Clamp,
SDValue &Omod) const;
+ bool SelectVOP3OMods(SDValue In, SDValue &Src,
+ SDValue &Clamp, SDValue &Omod) const;
+
+ bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+ SDValue &Clamp) const;
+
void SelectADD_SUB_I64(SDNode *N);
+ void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -187,6 +206,17 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
+ if (TM.Options.NoNaNsFPMath)
+ return true;
+
+ // TODO: Move into isKnownNeverNaN
+ if (N->getFlags().isDefined())
+ return N->getFlags().hasNoNaNs();
+
+ return CurDAG->isKnownNeverNaN(N);
+}
+
bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
const SIInstrInfo *TII
= static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
@@ -250,7 +280,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
return N;
const SITargetLowering& Lowering =
@@ -290,6 +320,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
llvm_unreachable("invalid vector size");
}
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+ Out = C->getAPIntValue().getZExtValue();
+ return true;
+ }
+
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+ Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
+ return true;
+ }
+
+ return false;
+}
+
void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
@@ -319,6 +363,11 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectADD_SUB_I64(N);
return;
}
+ case ISD::UADDO:
+ case ISD::USUBO: {
+ SelectUADDO_USUBO(N);
+ return;
+ }
case AMDGPUISD::FMUL_W_CHAIN: {
SelectFMUL_W_CHAIN(N);
return;
@@ -336,7 +385,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
+
+ if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+ if (Opc == ISD::BUILD_VECTOR) {
+ uint32_t LHSVal, RHSVal;
+ if (getConstantValue(N->getOperand(0), LHSVal) &&
+ getConstantValue(N->getOperand(1), RHSVal)) {
+ uint32_t K = LHSVal | (RHSVal << 16);
+ CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
+ CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+ return;
+ }
+ }
+
+ break;
+ }
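      // Worked example (illustrative, not part of this patch): for
      // build_vector (i16 1), (i16 2), LHSVal = 0x0001 and RHSVal = 0x0002,
      // so K = 0x0001 | (0x0002 << 16) = 0x00020001, emitted as a single
      // s_mov_b32.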
+
assert(EltVT.bitsEq(MVT::i32));
+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
} else {
@@ -502,7 +568,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::CopyToReg: {
const SITargetLowering& Lowering =
*static_cast<const SITargetLowering*>(getTargetLowering());
- Lowering.legalizeTargetIndependentNode(N, *CurDAG);
+ N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
case ISD::AND:
@@ -531,9 +597,9 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
if (!N->readMem())
return false;
if (CbId == -1)
- return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
- return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
+ return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
}
bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
@@ -689,6 +755,17 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
+void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
+ // The names of these opcodes are misleading. v_add_i32/v_sub_i32 have an
+ // unsigned carry out despite the _i32 name; they were renamed to _U32 in VI.
+ // FIXME: We should probably rename the opcodes here.
+ unsigned Opc = N->getOpcode() == ISD::UADDO ?
+ AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
+ { N->getOperand(0), N->getOperand(1) });
+}
+
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
SDLoc SL(N);
// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
@@ -881,8 +958,12 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
return true;
}
+static bool isLegalMUBUFImmOffset(unsigned Imm) {
+ return isUInt<12>(Imm);
+}
+
static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) {
- return isUInt<12>(Imm->getZExtValue());
+ return isLegalMUBUFImmOffset(Imm->getZExtValue());
}
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
@@ -998,43 +1079,111 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
}
-SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
- if (auto FI = dyn_cast<FrameIndexSDNode>(N))
- return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
- return N;
+static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
+ auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
+ return PSV && PSV->isStack();
+}
+
+std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ const MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (auto FI = dyn_cast<FrameIndexSDNode>(N)) {
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+ FI->getValueType(0));
+
+ // If we can resolve this to a frame index access, this is relative to the
+ // frame pointer SGPR.
+ return std::make_pair(TFI, CurDAG->getRegister(Info->getFrameOffsetReg(),
+ MVT::i32));
+ }
+
+ // If we don't know this private access is a local stack object, it needs to
+ // be relative to the entry point's scratch wave offset register.
+ return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
+ MVT::i32));
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Root,
+ SDValue Addr, SDValue &Rsrc,
+ SDValue &VAddr, SDValue &SOffset,
+ SDValue &ImmOffset) const {
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
- // (add n0, c1)
+ if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
+ unsigned Imm = CAddr->getZExtValue();
+ assert(!isLegalMUBUFImmOffset(Imm) &&
+ "should have been selected by other pattern");
+
+ SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
+ MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ DL, MVT::i32, HighBits);
+ VAddr = SDValue(MovHighBits, 0);
+
+ // In a call sequence, stores to the argument stack area are relative to the
+ // stack pointer.
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+ Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+ SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+ ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
+ return true;
+ }
+
if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ // (add n0, c1)
+
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
// Offsets in vaddr must be positive.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {
- VAddr = foldFrameIndex(N0);
+ std::tie(VAddr, SOffset) = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;
}
}
// (node)
- VAddr = foldFrameIndex(Addr);
+ std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}
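
The constant-address path above splits an offset that cannot fit the 12-bit MUBUF immediate into a 4 KiB-aligned base materialized in a VGPR and a small remainder. A worked check of that split (illustrative only):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t Imm = 0x12345;            // too large for a 12-bit offset field
    uint32_t HighBits = Imm & ~4095u;  // 0x12000, moved into VAddr
    uint32_t ImmOffset = Imm & 4095u;  // 0x345, encoded in the instruction
    assert(HighBits + ImmOffset == Imm);
  }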
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Root,
+ SDValue Addr,
+ SDValue &SRsrc,
+ SDValue &SOffset,
+ SDValue &Offset) const {
+ ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
+ if (!CAddr || !isLegalMUBUFImmOffset(CAddr))
+ return false;
+
+ SDLoc DL(Addr);
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+
+ const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Root)->getPointerInfo();
+ unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
+ Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
+
+ // FIXME: Get from MachinePointerInfo? We should only be using the frame
+ // offset if we know this is in a call sequence.
+ SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+
+ Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset,
SDValue &GLC, SDValue &SLC,
@@ -1167,23 +1316,35 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr,
- SDValue &VAddr,
- SDValue &SLC,
- SDValue &TFE) const {
+bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
+ int64_t OffsetVal = 0;
+
+ if (Subtarget->hasFlatInstOffsets() &&
+ CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getZExtValue();
+ if (isUInt<12>(COffsetVal)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ }
+ }
+
VAddr = Addr;
- TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+ SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+
return true;
}
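
SelectFlatOffset only peels an addend off the address when it fits the flat instruction's unsigned 12-bit offset field. A sketch of that legality test, mirroring the isUInt<12> guard above (illustrative only):

  #include <cstdint>

  bool fitsFlatOffset(uint64_t COffsetVal) {
    return COffsetVal < (1u << 12); // 0..4095 encodes directly; else keep add
  }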
-///
-/// \param EncodedOffset This is the immediate value that will be encoded
-/// directly into the instruction. On SI/CI the \p EncodedOffset
-/// will be in units of dwords and on VI+ it will be units of bytes.
-static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST,
- int64_t EncodedOffset) {
- return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
- isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
+bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
+ return SelectFlatOffset(Addr, VAddr, Offset, SLC);
}
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
@@ -1197,10 +1358,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDLoc SL(ByteOffsetNode);
AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
int64_t ByteOffset = C->getSExtValue();
- int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
- ByteOffset >> 2 : ByteOffset;
+ int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
- if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
+ if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
Imm = true;
return true;
@@ -1481,7 +1641,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
MemSDNode *Mem = cast<MemSDNode>(N);
unsigned AS = Mem->getAddressSpace();
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ if (AS == AMDGPUASI.FLAT_ADDRESS) {
SelectCode(N);
return;
}
@@ -1545,7 +1705,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
-
Src = In;
if (Src.getOpcode() == ISD::FNEG) {
@@ -1559,42 +1718,29 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
-
return true;
}
-bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
- SDValue &SrcMods) const {
- bool Res = SelectVOP3Mods(In, Src, SrcMods);
- return Res && cast<ConstantSDNode>(SrcMods)->isNullValue();
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ SelectVOP3Mods(In, Src, SrcMods);
+ return isNoNanSrc(Src);
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
+ if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
+ return false;
+
+ Src = In;
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
SDValue &SrcMods, SDValue &Clamp,
SDValue &Omod) const {
SDLoc DL(In);
- // FIXME: Handle Clamp and Omod
- Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
- Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
-
- return SelectVOP3Mods(In, Src, SrcMods);
-}
-
-bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods, SDValue &Clamp,
- SDValue &Omod) const {
- bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod);
-
- return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() &&
- cast<ConstantSDNode>(Clamp)->isNullValue() &&
- cast<ConstantSDNode>(Omod)->isNullValue();
-}
-
-bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Omod) const {
- // FIXME: Handle Omod
- Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
+ Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
return SelectVOP3Mods(In, Src, SrcMods);
}
@@ -1607,6 +1753,117 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
+ SDValue &Clamp, SDValue &Omod) const {
+ Src = In;
+
+ SDLoc DL(In);
+ Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ return true;
+}
+
+static SDValue stripBitcast(SDValue Val) {
+ return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
+}
+
+// Figure out if this is really an extract of the high 16 bits of a dword.
+static bool isExtractHiElt(SDValue In, SDValue &Out) {
+ In = stripBitcast(In);
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ SDValue Srl = In.getOperand(0);
+ if (Srl.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ShiftAmt->getZExtValue() == 16) {
+ Out = stripBitcast(Srl.getOperand(0));
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
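
In DAG terms, isExtractHiElt accepts the shape (trunc (srl x, 16)), possibly wrapped in bitcasts. A scalar model of the same extraction (illustrative only):

  #include <cstdint>

  uint16_t extractHiElt(uint32_t X) {
    return static_cast<uint16_t>(X >> 16); // the high half of a dword
  }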
+
+// Look through operations that obscure what is really just a use of the low
+// 16 bits of the same register.
+static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::TRUNCATE) {
+ SDValue Src = In.getOperand(0);
+ if (Src.getValueType().getSizeInBits() == 32)
+ return stripBitcast(Src);
+ }
+
+ return In;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ Src = In;
+
+ if (Src.getOpcode() == ISD::FNEG) {
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Src = Src.getOperand(0);
+ }
+
+ if (Src.getOpcode() == ISD::BUILD_VECTOR) {
+ unsigned VecMods = Mods;
+
+ SDValue Lo = stripBitcast(Src.getOperand(0));
+ SDValue Hi = stripBitcast(Src.getOperand(1));
+
+ if (Lo.getOpcode() == ISD::FNEG) {
+ Lo = stripBitcast(Lo.getOperand(0));
+ Mods ^= SISrcMods::NEG;
+ }
+
+ if (Hi.getOpcode() == ISD::FNEG) {
+ Hi = stripBitcast(Hi.getOperand(0));
+ Mods ^= SISrcMods::NEG_HI;
+ }
+
+ if (isExtractHiElt(Lo, Lo))
+ Mods |= SISrcMods::OP_SEL_0;
+
+ if (isExtractHiElt(Hi, Hi))
+ Mods |= SISrcMods::OP_SEL_1;
+
+ Lo = stripExtractLoElt(Lo);
+ Hi = stripExtractLoElt(Hi);
+
+ if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
+ // Really a scalar input. Just select from the low half of the register to
+ // avoid packing.
+
+ Src = Lo;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+
+ Mods = VecMods;
+ }
+
+ // Packed instructions do not have abs modifiers.
+ Mods |= SISrcMods::OP_SEL_1;
+
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
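
The modifier bookkeeping above is XOR-based, so nested negations cancel per lane: an fneg of the whole vector toggles NEG and NEG_HI, and an fneg on one element toggles only that lane's bit. A small model of the flag math, with assumed illustrative bit values (the real constants live in SISrcMods):

  #include <cstdint>

  enum : uint32_t { NEG = 1, NEG_HI = 2 }; // assumed values for illustration

  uint32_t exampleMods() {
    uint32_t Mods = 0;
    Mods ^= (NEG | NEG_HI); // fneg of the whole v2f16
    Mods ^= NEG;            // fneg of the low element: cancels the low bit
    return Mods;            // NEG_HI remains: only the high lane is negated
  }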
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
+ SDValue &SrcMods,
+ SDValue &Clamp) const {
+ SDLoc SL(In);
+
+ // FIXME: Handle clamp and op_sel
+ Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+ return SelectVOP3PMods(In, Src, SrcMods);
+}
+
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =
*static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 54caa2c..258b173 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,11 +15,13 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600MachineFunctionInfo.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -28,7 +30,7 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "SIInstrInfo.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
@@ -43,6 +45,76 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
return true;
}
+static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State,
+ const TargetRegisterClass *RC,
+ unsigned NumRegs) {
+ ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
+ unsigned RegResult = State.AllocateReg(RegList);
+ if (RegResult == AMDGPU::NoRegister)
+ return false;
+
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
+ return true;
+}
+
+static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ switch (LocVT.SimpleTy) {
+ case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v2f32: {
+ // Up to SGPR0-SGPR39
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::SGPR_64RegClass, 20);
+ }
+ default:
+ return false;
+ }
+}
+
+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignment requirements, would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ switch (LocVT.SimpleTy) {
+ case MVT::i64:
+ case MVT::f64:
+ case MVT::v2i32:
+ case MVT::v2f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_64RegClass, 31);
+ }
+ case MVT::v4i32:
+ case MVT::v4f32:
+ case MVT::v2i64:
+ case MVT::v2f64: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_128RegClass, 29);
+ }
+ case MVT::v8i32:
+ case MVT::v8f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_256RegClass, 25);
+ }
+ case MVT::v16i32:
+ case MVT::v16f32: {
+ return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+ &AMDGPU::VReg_512RegClass, 17);
+ }
+ default:
+ return false;
+ }
+}
+
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@@ -55,9 +127,33 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
+bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG,
+                                               SDValue Op) {
+ assert(Op.getOpcode() == ISD::OR);
+
+ SDValue N0 = Op->getOperand(0);
+ SDValue N1 = Op->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ if (VT.isInteger() && !VT.isVector()) {
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(N0, LHSKnown);
+
+ if (LHSKnown.Zero.getBoolValue()) {
+ DAG.computeKnownBits(N1, RHSKnown);
+
+ if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
+ return true;
+ }
+ }
+
+ return false;
+}
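
The known-bits test above proves the two operands never share a set bit, in which case OR produces no carries and is interchangeable with ADD. A concrete instance (illustrative only):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t LHS = 0x100; // low bits known zero, e.g. a value shifted left
    uint32_t RHS = 0x003; // fits entirely inside those known-zero bits
    assert((LHS | RHS) == LHS + RHS); // disjoint bits: or behaves as add
  }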
+
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
+ AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::LOAD, MVT::f32, Promote);
@@ -211,10 +307,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// This is totally unsupported, just custom lower to produce an error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
- // We need to custom lower some of the intrinsics
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
-
// Library functions. These default to Expand, but we have instructions
// for them.
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
@@ -270,6 +362,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
@@ -460,10 +553,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// N > 4 stores on the same chain.
GatherAllAliasesMaxDepth = 16;
- // FIXME: Need to really handle these.
- MaxStoresPerMemcpy = 4096;
- MaxStoresPerMemmove = 4096;
- MaxStoresPerMemset = 4096;
+ // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
+ // about these during lowering.
+ MaxStoresPerMemcpy = 0xffffffff;
+ MaxStoresPerMemmove = 0xffffffff;
+ MaxStoresPerMemset = 0xffffffff;
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::SHL);
@@ -478,12 +572,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
+ setTargetDAGCombine(ISD::FABS);
+ setTargetDAGCombine(ISD::AssertZext);
+ setTargetDAGCombine(ISD::AssertSext);
}
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
+LLVM_READNONE
static bool fnegFoldsIntoOp(unsigned Opc) {
switch (Opc) {
case ISD::FADD:
@@ -491,17 +589,83 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FMUL:
case ISD::FMA:
case ISD::FMAD:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
case ISD::FSIN:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::SIN_HW:
case AMDGPUISD::FMUL_LEGACY:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY:
return true;
default:
return false;
}
}
+/// \returns true if the operation will definitely need to use a 64-bit
+/// encoding, and thus will use a VOP3 encoding regardless of the source
+/// modifiers.
+LLVM_READONLY
+static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
+ return N->getNumOperands() > 2 || VT == MVT::f64;
+}
+
+// Most FP instructions support source modifiers, but this could be refined
+// slightly.
+LLVM_READONLY
+static bool hasSourceMods(const SDNode *N) {
+ if (isa<MemSDNode>(N))
+ return false;
+
+ switch (N->getOpcode()) {
+ case ISD::CopyToReg:
+ case ISD::SELECT:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::INLINEASM:
+ case AMDGPUISD::INTERP_P1:
+ case AMDGPUISD::INTERP_P2:
+ case AMDGPUISD::DIV_SCALE:
+
+ // TODO: Should really be looking at the users of the bitcast. These are
+ // problematic because bitcasts are used to legalize all stores to integer
+ // types.
+ case ISD::BITCAST:
+ return false;
+ default:
+ return true;
+ }
+}
+
+bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold) {
+ // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
+ // it is truly free to use a source modifier in all cases. If there are
+ // multiple users, and using a source modifier would force each of them into
+ // a VOP3 encoding, there will be a code size increase. Try to avoid
+ // increasing code size unless we know it
+ // will save on the instruction count.
+ unsigned NumMayIncreaseSize = 0;
+ MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
+
+ // XXX - Should this limit the number of uses to check?
+ for (const SDNode *U : N->uses()) {
+ if (!hasSourceMods(U))
+ return false;
+
+ if (!opMustUseVOP3Encoding(U, VT)) {
+ if (++NumMayIncreaseSize > CostThreshold)
+ return false;
+ }
+ }
+
+ return true;
+}
+
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -580,12 +744,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
- VT == MVT::f16);
+
+ // Packed operations do not have a fabs modifier.
+ return VT == MVT::f32 || VT == MVT::f64 ||
+ (Subtarget->has16BitInsts() && VT == MVT::f16);
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
- return isFAbsFree(VT);
+ assert(VT.isFloatingPoint());
+ return VT == MVT::f32 || VT == MVT::f64 ||
+ (Subtarget->has16BitInsts() && VT == MVT::f16) ||
+ (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -667,6 +836,46 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return CC_AMDGPU_Kernel;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return CC_AMDGPU;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CC_AMDGPU_Func;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool IsVarArg) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return CC_AMDGPU_Kernel;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return RetCC_SI_Shader;
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return RetCC_AMDGPU_Func;
+ default:
+ report_fatal_error("Unsupported calling convention.");
+ }
+}
+
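+// A caller typically plugs one of these into CCState, e.g.
+//   State.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CC, /*IsVarArg=*/false));
+// (a sketch only, mirroring the AnalyzeFormalArguments call removed further
+// down).
+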
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types. However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
@@ -676,7 +885,7 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
/// input values across multiple registers. Each item in the Ins array
-/// represents a single value that will be stored in regsters. Ins[x].VT is
+/// represents a single value that will be stored in registers. Ins[x].VT is
/// the value type of the value that will be stored in the register, so
/// whatever SDNode we lower the argument to needs to be this type.
///
@@ -764,23 +973,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
}
}
-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const {
- State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
-}
-
-void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
- const SmallVectorImpl<ISD::OutputArg> &Outs) const {
-
- State.AnalyzeReturn(Outs, RetCC_SI);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &DL, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerReturn(
+ SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ // FIXME: Fails for r600 tests
+ //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
+ // "wave terminate should not have return values");
return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
@@ -788,6 +989,17 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Target specific lowering
//===---------------------------------------------------------------------===//
+/// Selects the correct CCAssignFn for a given CallingConvention value.
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+ bool IsVarArg) {
+ return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
+}
+
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+ bool IsVarArg) {
+ return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
+}
+
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SDValue Callee = CLI.Callee;
@@ -829,14 +1041,13 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
- Op->dump(&DAG);
+ Op->print(errs(), &DAG);
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
- case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
case ISD::FREM: return LowerFREM(Op, DAG);
@@ -892,19 +1103,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
- switch (G->getAddressSpace()) {
- case AMDGPUAS::LOCAL_ADDRESS: {
+ if (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
"Do not know what to do with an non-zero offset");
// TODO: We could emit code to handle the initialization somewhere.
- if (hasDefinedInitializer(GV))
- break;
-
- unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
- return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
- }
+ if (!hasDefinedInitializer(GV)) {
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+ return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+ }
}
const Function &Fn = *DAG.getMachineFunction().getFunction();
@@ -936,41 +1144,12 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
-SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
- SelectionDAG &DAG) const {
- unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
-
- switch (IntrinsicID) {
- default: return Op;
- case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
- return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfe_i32:
- return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfe_u32:
- return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
- }
-}
-
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
SDValue LHS, SDValue RHS,
SDValue True, SDValue False,
SDValue CC,
DAGCombinerInfo &DCI) const {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return SDValue();
-
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();
@@ -1228,7 +1407,10 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
// float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
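+ // mac/mad flushes denormals regardless of the FP mode, so when f32
+ // denormals are enabled a plain ISD::FMAD is illegal and the explicitly
+ // flushing FMAD_FTZ node must be used instead.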
+ unsigned OpCode = Subtarget->hasFP32Denormals() ?
+ (unsigned)AMDGPUISD::FMAD_FTZ :
+ (unsigned)ISD::FMAD;
+ SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
@@ -1662,32 +1844,37 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
}
// XXX - May require not supporting f32 denormals?
-SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+
+// Don't handle v2f16. The extra instructions to scalarize and repack around the
+// compare and vselect end up producing worse code than scalarizing the whole
+// operation.
+SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
+ EVT VT = Op.getValueType();
- SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+ SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
// TODO: Should this propagate fast-math-flags?
- SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+ SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
- SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+ SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
- const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
- const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
+ const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
+ const SDValue One = DAG.getConstantFP(1.0, SL, VT);
+ const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
- SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+ SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
- SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+ SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
- return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+ return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
}
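// Worked example for the lowering above: x = -2.5 gives T = ftrunc(x) = -2.0,
// Diff = -0.5, AbsDiff = 0.5 >= 0.5, so Sel = copysign(1.0, x) = -1.0 and the
// result is T + Sel = -3.0, i.e. round-half-away-from-zero.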
SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
@@ -1750,8 +1937,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT == MVT::f32)
- return LowerFROUND32(Op, DAG);
+ if (VT == MVT::f32 || VT == MVT::f16)
+ return LowerFROUND32_16(Op, DAG);
if (VT == MVT::f64)
return LowerFROUND64(Op, DAG);
@@ -2030,15 +2217,19 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
}
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDValue N0 = Op.getOperand(0);
+
+ // Convert to target node to get known bits
+ if (N0.getValueType() == MVT::f32)
+ return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
if (getTargetMachine().Options.UnsafeFPMath) {
// There is a generic expand for FP_TO_FP16 with unsafe fast math.
return SDValue();
}
- SDLoc DL(Op);
- SDValue N0 = Op.getOperand(0);
- assert (N0.getSimpleValueType() == MVT::f64);
+ assert(N0.getSimpleValueType() == MVT::f64);
// f64 -> f16 conversion using round-to-nearest-even rounding mode.
const unsigned ExpMask = 0x7ff;
@@ -2198,11 +2389,11 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
//===----------------------------------------------------------------------===//
static bool isU24(SDValue Op, SelectionDAG &DAG) {
- APInt KnownZero, KnownOne;
+ KnownBits Known;
EVT VT = Op.getValueType();
- DAG.computeKnownBits(Op, KnownZero, KnownOne);
+ DAG.computeKnownBits(Op, Known);
- return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
+ return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
}
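// isU24 example: an i32 value produced by (and x, 0xffffff) has at least 8
// known leading zero bits, so 32 - 8 <= 24 holds and the value qualifies for
// the 24-bit multiply path.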
static bool isI24(SDValue Op, SelectionDAG &DAG) {
@@ -2220,12 +2411,13 @@ static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
SelectionDAG &DAG = DCI.DAG;
SDValue Op = Node24->getOperand(OpIdx);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Op.getValueType();
APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
- if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
+ if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
return true;
return false;
@@ -2379,6 +2571,53 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SN->getBasePtr(), SN->getMemOperand());
}
+SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CSrc)
+ return SDValue();
+
+ const APFloat &F = CSrc->getValueAPF();
+ APFloat Zero = APFloat::getZero(F.getSemantics());
+ APFloat::cmpResult Cmp0 = F.compare(Zero);
+ if (Cmp0 == APFloat::cmpLessThan ||
+ (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+ }
+
+ APFloat One(F.getSemantics(), "1.0");
+ APFloat::cmpResult Cmp1 = F.compare(One);
+ if (Cmp1 == APFloat::cmpGreaterThan)
+ return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+ return SDValue(CSrc, 0);
+}
+
+// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
+// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
+// issues.
+SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+
+ // (vt2 (assertzext (truncate vt0:x), vt1)) ->
+ // (vt2 (truncate (assertzext vt0:x, vt1)))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ SDValue N1 = N->getOperand(1);
+ EVT ExtVT = cast<VTSDNode>(N1)->getVT();
+ SDLoc SL(N);
+
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.bitsGE(ExtVT)) {
+ SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
+ return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
+ }
+ }
+
+ return SDValue();
+}
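+
+// e.g. (i16 (assertzext (truncate i32:x), i8)) becomes
+// (i16 (truncate (assertzext i32:x, i8))): the assert stays on the wider
+// value and the truncate is then free to fold away.
+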
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -2406,7 +2645,57 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- if (N->getValueType(0) != MVT::i64)
+ EVT VT = N->getValueType(0);
+
+ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ unsigned RHSVal = RHS->getZExtValue();
+ if (!RHSVal)
+ return LHS;
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+
+ switch (LHS->getOpcode()) {
+ default:
+ break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND: {
+ // shl (ext x) => zext (shl x), if shift does not overflow int
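+ // e.g. (i64 (shl (zext i32:x), 4)) -> (zext (i32 (shl x, 4))) when the
+ // top 4 bits of x are known zero.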
+ if (VT != MVT::i64)
+ break;
+ KnownBits Known;
+ SDValue X = LHS->getOperand(0);
+ DAG.computeKnownBits(X, Known);
+ unsigned LZ = Known.countMinLeadingZeros();
+ if (LZ < RHSVal)
+ break;
+ EVT XVT = X.getValueType();
+ SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
+ return DAG.getZExtOrTrunc(Shl, SL, VT);
+ }
+ case ISD::OR:
+ if (!isOrEquivalentToAdd(DAG, LHS))
+ break;
+ LLVM_FALLTHROUGH;
+ case ISD::ADD: {
+ // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
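+ // e.g. (shl (add x, 3), 2) -> (add (shl x, 2), 12), exposing the shifted
+ // constant to further folding.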
+ if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+ SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
+ SDValue(RHS, 0));
+ SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
+ SDLoc(C2), VT);
+ return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
+ }
+ break;
+ }
+ }
+
+ if (VT != MVT::i64)
return SDValue();
// i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
@@ -2414,19 +2703,9 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
// On some subtargets, 64-bit shift is a quarter rate instruction. In the
// common case, splitting this into a move and a 32-bit shift is faster and
// the same code size.
- const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS)
- return SDValue();
-
- unsigned RHSVal = RHS->getZExtValue();
if (RHSVal < 32)
return SDValue();
- SDValue LHS = N->getOperand(0);
-
- SDLoc SL(N);
- SelectionDAG &DAG = DCI.DAG;
-
SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
@@ -2821,20 +3100,41 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
}
- }
- if (VT == MVT::f32 && Cond.hasOneUse()) {
- SDValue MinMax
- = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
- // Revisit this node so we can catch min3/max3/med3 patterns.
- //DCI.AddToWorklist(MinMax.getNode());
- return MinMax;
+ if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
+ SDValue MinMax
+ = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ // Revisit this node so we can catch min3/max3/med3 patterns.
+ //DCI.AddToWorklist(MinMax.getNode());
+ return MinMax;
+ }
}
// There's no reason to not do this if the condition has other uses.
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
}
+static bool isConstantFPZero(SDValue N) {
+ if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+ return C->isZero() && !C->isNegative();
+ return false;
+}
+
+static unsigned inverseMinMax(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FMAXNUM:
+ return ISD::FMINNUM;
+ case ISD::FMINNUM:
+ return ISD::FMAXNUM;
+ case AMDGPUISD::FMAX_LEGACY:
+ return AMDGPUISD::FMIN_LEGACY;
+ case AMDGPUISD::FMIN_LEGACY:
+ return AMDGPUISD::FMAX_LEGACY;
+ default:
+ llvm_unreachable("invalid min/max opcode");
+ }
+}
+
SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -2847,10 +3147,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
// the other uses cannot, give up. This both prevents unprofitable
// transformations and infinite loops: we won't repeatedly try to fold around
// a negate that has no 'good' form.
- //
- // TODO: Check users can fold
- if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
- return SDValue();
+ if (N0.hasOneUse()) {
+ // The fneg may be able to fold into its source, but only at a code size
+ // cost. Don't fold it there if folding into the user is free instead.
+ if (allUsesHaveSourceMods(N, 0))
+ return SDValue();
+ } else {
+ if (fnegFoldsIntoOp(Opc) &&
+ (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
+ return SDValue();
+ }
SDLoc SL(N);
switch (Opc) {
@@ -2872,7 +3178,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
else
RHS = RHS.getOperand(0);
- SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -2891,7 +3197,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
else
RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
if (!N0.hasOneUse())
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
@@ -2923,10 +3229,40 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
return Res;
}
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ case AMDGPUISD::FMAX_LEGACY:
+ case AMDGPUISD::FMIN_LEGACY: {
+ // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
+ // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
+ // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
+ // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
+
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+
+ // 0 doesn't have a negated inline immediate.
+ // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
+ // operations.
+ if (isConstantFPZero(RHS))
+ return SDValue();
+
+ SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+ SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ unsigned Opposite = inverseMinMax(Opc);
+
+ SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+ return Res;
+ }
case ISD::FP_EXTEND:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT: // XXX - Should fround be handled?
+ case ISD::FSIN:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
- case ISD::FSIN:
case AMDGPUISD::SIN_HW: {
SDValue CvtSrc = N0.getOperand(0);
if (CvtSrc.getOpcode() == ISD::FNEG) {
@@ -2941,7 +3277,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
// (fneg (fp_extend x)) -> (fp_extend (fneg x))
// (fneg (rcp x)) -> (rcp (fneg x))
SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
- return DAG.getNode(Opc, SL, VT, Neg);
+ return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
}
case ISD::FP_ROUND: {
SDValue CvtSrc = N0.getOperand(0);
@@ -2959,6 +3295,45 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
}
+ case ISD::FP16_TO_FP: {
+ // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
+ // f16, but legalization of f16 fneg ends up pulling it out of the source.
+ // Put the fneg back as a legal source operation that can be matched later.
+ SDLoc SL(N);
+
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
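+ // (0x8000 is the sign bit of the i16 half encoding; e.g. 0x3C00 (1.0)
+ // becomes 0xBC00 (-1.0).)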
+ SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
+ DAG.getConstant(0x8000, SL, SrcVT));
+ return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
+ }
+ default:
+ return SDValue();
+ }
+}
+
+SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ switch (N0.getOpcode()) {
+ case ISD::FP16_TO_FP: {
+ assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+ SDLoc SL(N);
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
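+ // (0x7fff masks off the sign bit; e.g. 0xBC00 (-1.0) becomes 0x3C00 (1.0).)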
+ SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
+ DAG.getConstant(0x7fff, SL, SrcVT));
+ return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
+ }
default:
return SDValue();
}
@@ -3071,6 +3446,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performSelectCombine(N, DCI);
case ISD::FNEG:
return performFNegCombine(N, DCI);
+ case ISD::FABS:
+ return performFAbsCombine(N, DCI);
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
assert(!N->getValueType(0).isVector() &&
@@ -3131,7 +3508,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
DL);
}
- if ((OffsetVal + WidthVal) >= 32) {
+ if ((OffsetVal + WidthVal) >= 32 &&
+ !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
BitsFrom, ShiftVal);
@@ -3142,13 +3520,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
OffsetVal,
OffsetVal + WidthVal);
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
- TLI.SimplifyDemandedBits(BitsFrom, Demanded,
- KnownZero, KnownOne, TLO)) {
+ if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
+ TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
}
}
@@ -3159,6 +3536,21 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
+ case AMDGPUISD::CLAMP:
+ return performClampCombine(N, DCI);
+ case AMDGPUISD::RCP: {
+ if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
+ // XXX - Should this flush denormals?
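+ // e.g. (AMDGPUISD::RCP (f32 2.0)) folds here to the constant 0.5.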
+ const APFloat &Val = CFP->getValueAPF();
+ APFloat One(Val.getSemantics(), "1.0");
+ return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+ }
+
+ break;
+ }
+ case ISD::AssertZext:
+ case ISD::AssertSext:
+ return performAssertSZExtCombine(N, DCI);
}
return SDValue();
}
@@ -3168,18 +3560,25 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
//===----------------------------------------------------------------------===//
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT,
+ const SDLoc &SL,
+ bool RawReg) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned VirtualRegister;
+ unsigned VReg;
+
if (!MRI.isLiveIn(Reg)) {
- VirtualRegister = MRI.createVirtualRegister(RC);
- MRI.addLiveIn(Reg, VirtualRegister);
+ VReg = MRI.createVirtualRegister(RC);
+ MRI.addLiveIn(Reg, VReg);
} else {
- VirtualRegister = MRI.getLiveInVirtReg(Reg);
+ VReg = MRI.getLiveInVirtReg(Reg);
}
- return DAG.getRegister(VirtualRegister, VT);
+
+ if (RawReg)
+ return DAG.getRegister(VReg, VT);
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
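// The RawReg form above returns the bare RegisterSDNode (for callers that
// emit their own copy); otherwise the value comes back as a CopyFromReg
// chained to the entry node.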
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
@@ -3201,13 +3600,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AMDGPUISD::NodeType)Opcode) {
case AMDGPUISD::FIRST_NUMBER: break;
// AMDIL DAG nodes
- NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
+ NODE_NAME_CASE(IF)
+ NODE_NAME_CASE(ELSE)
+ NODE_NAME_CASE(LOOP)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(TRAP)
+ NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
- NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
@@ -3232,6 +3636,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
NODE_NAME_CASE(DIV_FIXUP)
+ NODE_NAME_CASE(FMAD_FTZ)
NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
@@ -3265,7 +3670,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
- NODE_NAME_CASE(LOAD_INPUT)
NODE_NAME_CASE(SAMPLE)
NODE_NAME_CASE(SAMPLEB)
NODE_NAME_CASE(SAMPLED)
@@ -3274,12 +3678,17 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE1)
NODE_NAME_CASE(CVT_F32_UBYTE2)
NODE_NAME_CASE(CVT_F32_UBYTE3)
+ NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+ NODE_NAME_CASE(FP_TO_FP16)
+ NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
+ NODE_NAME_CASE(INIT_EXEC)
+ NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(SENDMSGHALT)
NODE_NAME_CASE(INTERP_MOV)
@@ -3288,6 +3697,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STORE_MSKOR)
NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
+ NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
NODE_NAME_CASE(ATOMIC_INC)
NODE_NAME_CASE(ATOMIC_DEC)
@@ -3338,16 +3749,12 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
}
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
- const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) const {
+ const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
- KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+ Known.resetAll(); // Don't know anything.
- APInt KnownZero2;
- APInt KnownOne2;
+ KnownBits Known2;
unsigned Opc = Op.getOpcode();
switch (Opc) {
@@ -3355,7 +3762,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW: {
- KnownZero = APInt::getHighBitsSet(32, 31);
+ Known.Zero = APInt::getHighBitsSet(32, 31);
break;
}
@@ -3365,21 +3772,27 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
if (!CWidth)
return;
- unsigned BitWidth = 32;
uint32_t Width = CWidth->getZExtValue() & 0x1f;
if (Opc == AMDGPUISD::BFE_U32)
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+ Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
+
+ break;
+ }
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::FP16_ZEXT: {
+ unsigned BitWidth = Known.getBitWidth();
+ // High bits are zero.
+ Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
break;
}
}
}
unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op,
- const SelectionDAG &DAG,
- unsigned Depth) const {
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
switch (Op.getOpcode()) {
case AMDGPUISD::BFE_I32: {
ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -3403,7 +3816,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW:
return 31;
-
+ case AMDGPUISD::FP_TO_FP16:
+ case AMDGPUISD::FP16_ZEXT:
+ return 16;
default:
return 1;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f6adcea..d85aada 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -16,6 +16,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
+#include "AMDGPU.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/Target/TargetLowering.h"
namespace llvm {
@@ -32,12 +34,15 @@ private:
/// compare.
SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const;
+public:
+ static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op);
+
protected:
const AMDGPUSubtarget *Subtarget;
+ AMDGPUAS AMDGPUASI;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
/// \brief Split a vector store into multiple scalar stores.
/// \returns The resulting chain.
@@ -47,7 +52,7 @@ protected:
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
@@ -70,6 +75,8 @@ protected:
bool shouldCombineMemoryType(EVT VT) const;
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,
@@ -85,6 +92,7 @@ protected:
SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
@@ -111,24 +119,22 @@ protected:
SmallVectorImpl<SDValue> &Results) const;
void analyzeFormalArgumentsCompute(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
- void AnalyzeFormalArguments(CCState &State,
- const SmallVectorImpl<ISD::InputArg> &Ins) const;
- void AnalyzeReturn(CCState &State,
- const SmallVectorImpl<ISD::OutputArg> &Outs) const;
-
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
bool mayIgnoreSignedZero(SDValue Op) const {
- if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only
+ if (getTargetMachine().Options.NoSignedZerosFPMath)
return true;
- if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op))
- return BO->Flags.hasNoSignedZeros();
+ const auto Flags = Op.getNode()->getFlags();
+ if (Flags.isDefined())
+ return Flags.hasNoSignedZeros();
return false;
}
+ static bool allUsesHaveSourceMods(const SDNode *N,
+ unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;
bool isTruncateFree(EVT Src, EVT Dest) const override;
@@ -158,6 +164,9 @@ public:
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+ static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
@@ -174,7 +183,7 @@ public:
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
- SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
SDValue CC, DAGCombinerInfo &DCI) const;
@@ -196,21 +205,37 @@ public:
/// either zero or one and return them in the \p Known bitset.
void computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+ unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
+ const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
- /// \returns a RegisterSDNode representing Reg.
- virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const;
+ /// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
+ /// a copy from the register.
+ SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT,
+ const SDLoc &SL,
+ bool RawReg = false) const;
+ SDValue CreateLiveInRegister(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
+ }
+
+ // Returns the raw live-in register rather than a copy from it.
+ SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
+ const TargetRegisterClass *RC,
+ unsigned Reg, EVT VT) const {
+ return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
+ }
enum ImplicitParameter {
FIRST_IMPLICIT,
@@ -222,6 +247,14 @@ public:
/// type of implicit parameter.
uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
const ImplicitParameter Param) const;
+
+ AMDGPUAS getAMDGPUAS() const {
+ return AMDGPUASI;
+ }
+
+ MVT getFenceOperandTy(const DataLayout &DL) const override {
+ return MVT::i32;
+ }
};
namespace AMDGPUISD {
@@ -229,15 +262,35 @@ namespace AMDGPUISD {
enum NodeType : unsigned {
// AMDIL ISD Opcodes
FIRST_NUMBER = ISD::BUILTIN_OP_END,
- CALL, // Function call based on a single integer
UMUL, // 32bit unsigned multiplication
BRANCH_COND,
// End AMDIL ISD Opcodes
+
+ // Function call.
+ CALL,
+ TRAP,
+
+ // Masked control flow nodes.
+ IF,
+ ELSE,
+ LOOP,
+
+ // A uniform kernel return that terminates the wavefront.
ENDPGM,
- RETURN,
+
+ // Return to a shader part's epilog code.
+ RETURN_TO_EPILOG,
+
+ // Return with values from a non-entry function.
+ RET_FLAG,
+
DWORDADDR,
FRACT,
+
+ /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
+ /// modifier behavior with dx10_enable.
CLAMP,
+
// This is SETCC with the full mask result which is used for a compare with a
// result bit per item in the wavefront.
SETCC,
@@ -265,6 +318,9 @@ enum NodeType : unsigned {
DIV_SCALE,
DIV_FMAS,
DIV_FIXUP,
+ // Used in place of ISD::FMAD when f32 denormals are enabled, because the
+ // hardware mac/mad always flushes and plain FMAD is then illegal.
+ FMAD_FTZ,
TRIG_PREOP, // 1 ULP max error for f64
// RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
@@ -301,7 +357,6 @@ enum NodeType : unsigned {
CONST_ADDRESS,
REGISTER_LOAD,
REGISTER_STORE,
- LOAD_INPUT,
SAMPLE,
SAMPLEB,
SAMPLED,
@@ -312,6 +367,18 @@ enum NodeType : unsigned {
CVT_F32_UBYTE1,
CVT_F32_UBYTE2,
CVT_F32_UBYTE3,
+
+ // Convert two f32 values into a single register holding two packed f16
+ // values, rounding to zero.
+ CVT_PKRTZ_F16_F32,
+
+ // Same as the standard node, except the high bits of the resulting integer
+ // are known 0.
+ FP_TO_FP16,
+
+ // Wrapper around fp16 results that are known to zero the high bits.
+ FP16_ZEXT,
+
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:
@@ -323,6 +390,8 @@ enum NodeType : unsigned {
BUILD_VERTICAL_VECTOR,
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
+ INIT_EXEC,
+ INIT_EXEC_FROM_INPUT,
SENDMSG,
SENDMSGHALT,
INTERP_MOV,
@@ -335,6 +404,8 @@ enum NodeType : unsigned {
STORE_MSKOR,
LOAD_CONSTANT,
TBUFFER_STORE_FORMAT,
+ TBUFFER_STORE_FORMAT_X3,
+ TBUFFER_LOAD_FORMAT,
ATOMIC_CMP_SWAP,
ATOMIC_INC,
ATOMIC_DEC,
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index e4dc659..69dc529 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -30,7 +30,7 @@ using namespace llvm;
void AMDGPUInstrInfo::anchor() {}
AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
- : AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
+ : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
@@ -66,7 +66,9 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
enum SIEncodingFamily {
SI = 0,
- VI = 1
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3
};
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
@@ -86,6 +88,7 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
case AMDGPUSubtarget::SEA_ISLANDS:
return SIEncodingFamily::SI;
case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+ case AMDGPUSubtarget::GFX9:
return SIEncodingFamily::VI;
// FIXME: This should never be called for r600 GPUs.
@@ -100,7 +103,12 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
}
int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
- int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST));
+ SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+ if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+ Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
+ : SIEncodingFamily::SDWA;
+
+ int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index bd8e389..41cc7d7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -16,11 +16,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
-#include "llvm/Target/TargetInstrInfo.h"
+#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
-#define GET_INSTRINFO_ENUM
#include "AMDGPUGenInstrInfo.inc"
namespace llvm {
@@ -35,6 +35,8 @@ private:
const AMDGPUSubtarget &ST;
virtual void anchor();
+protected:
+ AMDGPUAS AMDGPUASI;
public:
explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index d7fa28b..bcf89bb 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -31,6 +31,10 @@ def AMDGPUFPClassOp : SDTypeProfile<1, 2,
[SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
>;
+def AMDGPUFPPackOp : SDTypeProfile<1, 2,
+ [SDTCisFP<1>, SDTCisSameAs<1, 2>]
+>;
+
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
>;
@@ -42,10 +46,47 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def AMDGPUIfOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPUElseOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPULoopOp : SDTypeProfile<0, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
+>;
+
+def AMDGPUBreakOp : SDTypeProfile<1, 1,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
+>;
+
+def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
+>;
+
+def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
+ [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
+>;
+
+def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
+>;
+
//===----------------------------------------------------------------------===//
// AMDGPU DAG Nodes
//
+def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
+def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
+def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+
+def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
+ SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
+ [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
+>;
+
def AMDGPUconstdata_ptr : SDNode<
"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
SDTCisVT<0, iPTR>]>
@@ -78,6 +119,11 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16", SDTFPToIntOp>;
+def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT", SDTFPToIntOp>;
+
+
def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
// out = max(a, b) a and b are floats, where a nan comparison fails.
@@ -92,17 +138,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
-def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
-
-// out = max(a, b) a and b are signed ints
-def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]
->;
-
-// out = max(a, b) a and b are unsigned ints
-def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
- [SDNPCommutative, SDNPAssociative]
->;
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
// out = min(a, b) a and b are floats, where a nan comparison fails.
def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
@@ -147,6 +183,12 @@ def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
// out = (src1 > src0) ? 1 : 0
def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;
+// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own
+// nodes in TargetSelectionDAG.td.
+def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>;
+
+def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>;
+
def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
]>;
@@ -194,6 +236,8 @@ def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
// Denominator, src2 = Numerator).
def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
+def AMDGPUfmad_ftz : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;
+
// Look Up 2.0 / pi src0 with segment select src1[4:0]
def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
@@ -265,6 +309,15 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
+def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
+ SDTypeProfile<0, 2,
+ [SDTCisInt<0>, SDTCisInt<1>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
@@ -291,15 +344,16 @@ def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
- SDTCisInt<0>, // i8 en
- SDTCisInt<1>, // i1 vm
+ SDTCisInt<0>, // i8 tgt
+ SDTCisInt<1>, // i8 en
+ // i32 or f32 src0
+ SDTCisSameAs<3, 2>, // f32 src1
+ SDTCisSameAs<4, 2>, // f32 src2
+ SDTCisSameAs<5, 2>, // f32 src3
+ SDTCisInt<6>, // i1 compr
// skip done
- SDTCisInt<2>, // i8 tgt
- SDTCisSameAs<3, 1>, // i1 compr
- SDTCisFP<4>, // f32 src0
- SDTCisSameAs<5, 4>, // f32 src1
- SDTCisSameAs<6, 4>, // f32 src2
- SDTCisSameAs<7, 4> // f32 src3
+ SDTCisInt<1> // i1 vm
+
]>;
def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
@@ -333,5 +387,9 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
-def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
+def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
new file mode 100644
index 0000000..e54c887
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -0,0 +1,425 @@
+//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-isel"
+
+using namespace llvm;
+
+AMDGPUInstructionSelector::AMDGPUInstructionSelector(
+ const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI)
+ : InstructionSelector(), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {}
+
+MachineOperand
+AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
+ unsigned SubIdx) const {
+
+ MachineInstr *MI = MO.getParent();
+ MachineBasicBlock *BB = MO.getParent()->getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+ if (MO.isReg()) {
+ unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
+ unsigned Reg = MO.getReg();
+ BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
+ .addReg(Reg, 0, ComposedSubIdx);
+
+ return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
+ MO.isKill(), MO.isDead(), MO.isUndef(),
+ MO.isEarlyClobber(), 0, MO.isDebug(),
+ MO.isInternalRead());
+ }
+
+ assert(MO.isImm());
+
+ APInt Imm(64, MO.getImm());
+
+ switch (SubIdx) {
+ default:
+ llvm_unreachable("do not know to split immediate with this sub index.");
+ case AMDGPU::sub0:
+ return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
+ case AMDGPU::sub1:
+ return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
+ unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ if (Size != 64)
+ return false;
+
+ DebugLoc DL = I.getDebugLoc();
+
+ MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0));
+ MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0));
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
+ .add(Lo1)
+ .add(Lo2);
+
+ MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1));
+ MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1));
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
+ .add(Hi1)
+ .add(Hi2);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg())
+ .addReg(DstLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(DstHi)
+ .addImm(AMDGPU::sub1);
+
+ for (MachineOperand &MO : I.explicit_operands()) {
+ if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+ RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI);
+ }
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
+ return selectG_ADD(I);
+}
+
+bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ DebugLoc DL = I.getDebugLoc();
+
+ // FIXME: Select store instruction based on address space
+ MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD))
+ .add(I.getOperand(1))
+ .add(I.getOperand(0))
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addImm(0); // slc
+
+
+ // Now that we selected an opcode, we need to constrain the register
+ // operands to use appropriate classes.
+ bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return Ret;
+}
+
+bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+ if (Size == 32) {
+ I.setDesc(TII.get(AMDGPU::S_MOV_B32));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ assert(Size == 64);
+
+ DebugLoc DL = I.getDebugLoc();
+ unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ const APInt &Imm = I.getOperand(1).getCImm()->getValue();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg)
+ .addImm(Imm.trunc(32).getZExtValue());
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
+ .addImm(Imm.ashr(32).getZExtValue());
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ // We can't call constrainSelectedInstRegOperands here, because it doesn't
+ // work for target-independent opcodes.
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+}
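+
+// e.g. a 64-bit G_CONSTANT of 0x100000002 is selected as two S_MOV_B32s
+// (lo = 0x2, hi = 0x1) combined by a REG_SEQUENCE into an SReg_64.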
+
+static bool isConstant(const MachineInstr &MI) {
+ return MI.getOpcode() == TargetOpcode::G_CONSTANT;
+}
+
+void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
+ const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
+
+ const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
+
+ assert(PtrMI);
+
+ if (PtrMI->getOpcode() != TargetOpcode::G_GEP)
+ return;
+
+ GEPInfo GEPInfo(*PtrMI);
+
+ for (unsigned i = 1, e = 3; i < e; ++i) {
+ const MachineOperand &GEPOp = PtrMI->getOperand(i);
+ const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
+ assert(OpDef);
+ if (isConstant(*OpDef)) {
+ // FIXME: Is it possible to have multiple Imm parts? Maybe if we
+ // are lacking other optimizations.
+ assert(GEPInfo.Imm == 0);
+ GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
+ continue;
+ }
+ const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
+ if (OpBank->getID() == AMDGPU::SGPRRegBankID)
+ GEPInfo.SgprParts.push_back(GEPOp.getReg());
+ else
+ GEPInfo.VgprParts.push_back(GEPOp.getReg());
+ }
+
+ AddrInfo.push_back(GEPInfo);
+ getAddrModeInfo(*PtrMI, MRI, AddrInfo);
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ const Value *Ptr = MMO->getValue();
+
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+ return true;
+
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
+
+static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) {
+
+ if (LoadSize == 32)
+ return BaseOpcode;
+
+ switch (BaseOpcode) {
+ case AMDGPU::S_LOAD_DWORD_IMM:
+ switch (LoadSize) {
+ case 64:
+ return AMDGPU::S_LOAD_DWORDX2_IMM;
+ case 128:
+ return AMDGPU::S_LOAD_DWORDX4_IMM;
+ case 256:
+ return AMDGPU::S_LOAD_DWORDX8_IMM;
+ case 512:
+ return AMDGPU::S_LOAD_DWORDX16_IMM;
+ }
+ break;
+ case AMDGPU::S_LOAD_DWORD_IMM_ci:
+ switch (LoadSize) {
+ case 64:
+ return AMDGPU::S_LOAD_DWORDX2_IMM_ci;
+ case 128:
+ return AMDGPU::S_LOAD_DWORDX4_IMM_ci;
+ case 256:
+ return AMDGPU::S_LOAD_DWORDX8_IMM_ci;
+ case 512:
+ return AMDGPU::S_LOAD_DWORDX16_IMM_ci;
+ }
+ break;
+ case AMDGPU::S_LOAD_DWORD_SGPR:
+ switch (LoadSize) {
+ case 64:
+ return AMDGPU::S_LOAD_DWORDX2_SGPR;
+ case 128:
+ return AMDGPU::S_LOAD_DWORDX4_SGPR;
+ case 256:
+ return AMDGPU::S_LOAD_DWORDX8_SGPR;
+ case 512:
+ return AMDGPU::S_LOAD_DWORDX16_SGPR;
+ }
+ break;
+ }
+ llvm_unreachable("Invalid base smrd opcode or size");
+}
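+
+// e.g. getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, 128) yields
+// AMDGPU::S_LOAD_DWORDX4_IMM for a uniform 128-bit load.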
+
+bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
+ for (const GEPInfo &GEPInfo : AddrInfo) {
+ if (!GEPInfo.VgprParts.empty())
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
+ ArrayRef<GEPInfo> AddrInfo) const {
+
+ if (!I.hasOneMemOperand())
+ return false;
+
+ if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+ return false;
+
+ if (!isInstrUniform(I))
+ return false;
+
+ if (hasVgprParts(AddrInfo))
+ return false;
+
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = I.getOperand(0).getReg();
+ const DebugLoc &DL = I.getDebugLoc();
+ unsigned Opcode;
+ unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+ if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) {
+
+ const GEPInfo &GEPInfo = AddrInfo[0];
+
+ unsigned PtrReg = GEPInfo.SgprParts[0];
+ int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(Subtarget, GEPInfo.Imm);
+ if (AMDGPU::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) {
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
+
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addImm(EncodedImm)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ }
+
+ if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS &&
+ isUInt<32>(EncodedImm)) {
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize);
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addImm(EncodedImm)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ }
+
+ if (isUInt<32>(GEPInfo.Imm)) {
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize);
+ unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+ .addImm(GEPInfo.Imm);
+
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addReg(OffsetReg)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+ }
+ }
+
+ unsigned PtrReg = I.getOperand(1).getReg();
+ Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
+ MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+ .addReg(PtrReg)
+ .addImm(0)
+ .addImm(0); // glc
+ return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+}
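+
+// For example, a uniform 32-bit load from the constant address space with a
+// small GEP offset selects S_LOAD_DWORD_IMM with the encoded immediate;
+// larger offsets fall back to the _ci form on CI, or to an SGPR offset
+// register.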
+
+
+bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ DebugLoc DL = I.getDebugLoc();
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned PtrReg = I.getOperand(1).getReg();
+ unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned Opcode;
+
+ SmallVector<GEPInfo, 4> AddrInfo;
+
+ getAddrModeInfo(I, MRI, AddrInfo);
+
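+ // Prefer a scalar (SMRD) load when the address is uniform; otherwise
+ // fall back to a FLAT load below.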
+ if (selectSMRD(I, AddrInfo)) {
+ I.eraseFromParent();
+ return true;
+ }
+
+ switch (LoadSize) {
+ default:
+ llvm_unreachable("Load size not supported");
+ case 32:
+ Opcode = AMDGPU::FLAT_LOAD_DWORD;
+ break;
+ case 64:
+ Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
+ break;
+ }
+
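+ // No SMRD form matched, so emit a FLAT load with a zero offset.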
+ MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
+ .add(I.getOperand(0))
+ .addReg(PtrReg)
+ .addImm(0) // offset
+ .addImm(0) // glc
+ .addImm(0); // slc
+
+ bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
+}
+
+bool AMDGPUInstructionSelector::select(MachineInstr &I) const {
+
+ if (!isPreISelGenericOpcode(I.getOpcode()))
+ return true;
+
+ switch (I.getOpcode()) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ return selectG_ADD(I);
+ case TargetOpcode::G_CONSTANT:
+ return selectG_CONSTANT(I);
+ case TargetOpcode::G_GEP:
+ return selectG_GEP(I);
+ case TargetOpcode::G_LOAD:
+ return selectG_LOAD(I);
+ case TargetOpcode::G_STORE:
+ return selectG_STORE(I);
+ }
+ return false;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
new file mode 100644
index 0000000..ef845f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -0,0 +1,67 @@
+//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the InstructionSelector class for
+/// AMDGPU.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
+
+#include "AMDGPU.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+
+namespace llvm {
+
+class AMDGPUInstrInfo;
+class AMDGPURegisterBankInfo;
+class MachineInstr;
+class MachineOperand;
+class MachineRegisterInfo;
+class SIInstrInfo;
+class SIRegisterInfo;
+class SISubtarget;
+
+class AMDGPUInstructionSelector : public InstructionSelector {
+public:
+ AMDGPUInstructionSelector(const SISubtarget &STI,
+ const AMDGPURegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+private:
+ struct GEPInfo {
+ const MachineInstr &GEP;
+ SmallVector<unsigned, 2> SgprParts;
+ SmallVector<unsigned, 2> VgprParts;
+ int64_t Imm;
+ GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
+ };
+
+ MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+ bool selectG_CONSTANT(MachineInstr &I) const;
+ bool selectG_ADD(MachineInstr &I) const;
+ bool selectG_GEP(MachineInstr &I) const;
+ bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
+ void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
+ SmallVectorImpl<GEPInfo> &AddrInfo) const;
+ bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
+ bool selectG_LOAD(MachineInstr &I) const;
+ bool selectG_STORE(MachineInstr &I) const;
+
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+ const AMDGPURegisterBankInfo &RBI;
+protected:
+ AMDGPUAS AMDGPUASI;
+};
+
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 59cba63..4e688ab 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -50,6 +50,16 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
+def u16ImmTarget : AsmOperandClass {
+ let Name = "U16Imm";
+ let RenderMethod = "addImmOperands";
+}
+
+def s16ImmTarget : AsmOperandClass {
+ let Name = "S16Imm";
+ let RenderMethod = "addImmOperands";
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def u32imm : Operand<i32> {
@@ -58,6 +68,12 @@ def u32imm : Operand<i32> {
def u16imm : Operand<i16> {
let PrintMethod = "printU16ImmOperand";
+ let ParserMatchClass = u16ImmTarget;
+}
+
+def s16imm : Operand<i16> {
+ let PrintMethod = "printU16ImmOperand";
+ let ParserMatchClass = s16ImmTarget;
}
def u8imm : Operand<i8> {
@@ -72,6 +88,49 @@ def u8imm : Operand<i8> {
def brtarget : Operand<OtherVT>;
//===----------------------------------------------------------------------===//
+// Misc. PatFrags
+//===----------------------------------------------------------------------===//
+
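+// Match an operation only when its result has a single use, so that
+// folding it into a larger pattern cannot duplicate work.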
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0),
+ (op $src0),
+ [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1),
+ (op $src0, $src1),
+ [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
+ (ops node:$src0, node:$src1, node:$src2),
+ (op $src0, $src1, $src2),
+ [{ return N->hasOneUse(); }]
+>;
+
+def trunc_oneuse : HasOneUseUnaryOp<trunc>;
+
+let Properties = [SDNPCommutative, SDNPAssociative] in {
+def smax_oneuse : HasOneUseBinOp<smax>;
+def smin_oneuse : HasOneUseBinOp<smin>;
+def umax_oneuse : HasOneUseBinOp<umax>;
+def umin_oneuse : HasOneUseBinOp<umin>;
+def fminnum_oneuse : HasOneUseBinOp<fminnum>;
+def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+def and_oneuse : HasOneUseBinOp<and>;
+def or_oneuse : HasOneUseBinOp<or>;
+def xor_oneuse : HasOneUseBinOp<xor>;
+} // Properties = [SDNPCommutative, SDNPAssociative]
+
+def sub_oneuse : HasOneUseBinOp<sub>;
+
+def srl_oneuse : HasOneUseBinOp<srl>;
+def shl_oneuse : HasOneUseBinOp<shl>;
+
+def select_oneuse : HasOneUseTernaryOp<select>;
+
+//===----------------------------------------------------------------------===//
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
@@ -157,27 +216,11 @@ def COND_NULL : PatLeaf <
//===----------------------------------------------------------------------===//
-// Misc. PatFrags
-//===----------------------------------------------------------------------===//
-
-class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
- (ops node:$src0, node:$src1),
- (op $src0, $src1),
- [{ return N->hasOneUse(); }]
->;
-
-class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
- (ops node:$src0, node:$src1, node:$src2),
- (op $src0, $src1, $src2),
- [{ return N->hasOneUse(); }]
->;
-
-//===----------------------------------------------------------------------===//
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS;
}]>;
class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
@@ -195,7 +238,7 @@ def truncstorei16_private : PrivateStore <truncstorei16>;
def store_private : PrivateStore <store>;
class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
// Global address space loads
@@ -215,7 +258,7 @@ def global_store_atomic : GlobalStore<atomic_store>;
class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
// Constant address space loads
@@ -226,7 +269,7 @@ class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
def constant_load : ConstantLoad<load>;
class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
// Local address space loads
@@ -239,7 +282,7 @@ class LocalStore <SDPatternOperator op> : LocalMemOp <
>;
class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;
}]>;
class FlatLoad <SDPatternOperator op> : FlatMemOp <
@@ -321,7 +364,7 @@ def local_store_aligned8bytes : Aligned8Bytes <
class local_binary_atomic_op<SDNode atomic_op> :
PatFrag<(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value), [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
@@ -339,7 +382,7 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
- return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+ return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
@@ -349,7 +392,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
AtomicSDNode *AN = cast<AtomicSDNode>(N);
return AN->getMemoryVT() == MVT::i32 &&
- AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
def _64_local : PatFrag<
@@ -357,7 +400,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
(cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
AtomicSDNode *AN = cast<AtomicSDNode>(N);
return AN->getMemoryVT() == MVT::i64 &&
- AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
}
@@ -367,17 +410,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> {
def "" : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
def _noret : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
def _ret : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
}
defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
@@ -395,22 +438,22 @@ defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
def AMDGPUatomic_cmp_swap_global : PatFrag<
(ops node:$ptr, node:$value),
(AMDGPUatomic_cmp_swap node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
def atomic_cmp_swap_global : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
def atomic_cmp_swap_global_noret : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
def atomic_cmp_swap_global_ret : PatFrag<
(ops node:$ptr, node:$cmp, node:$value),
(atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
//===----------------------------------------------------------------------===//
// Misc Pattern Fragments
@@ -422,6 +465,7 @@ int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
int FP16_ONE = 0x3C00;
+int V2FP16_ONE = 0x3C003C00;
int FP32_ONE = 0x3f800000;
int FP32_NEG_ONE = 0xbf800000;
int FP64_ONE = 0x3ff0000000000000;
@@ -452,7 +496,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
(outs rc:$dst),
(ins rc:$src0),
"CLAMP $dst, $src0",
- [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+ [(set f32:$dst, (AMDGPUclamp f32:$src0))]
>;
class FABS <RegisterClass rc> : AMDGPUShaderInst <
@@ -565,6 +609,12 @@ multiclass BFIPatterns <Instruction BFI_INT,
>;
def : Pat <
+ (f32 (fcopysign f32:$src0, f64:$src1)),
+ (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0,
+ (i32 (EXTRACT_SUBREG $src1, sub1)))
+ >;
+
+ def : Pat <
(f64 (fcopysign f64:$src0, f64:$src1)),
(REG_SEQUENCE RC64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
@@ -602,10 +652,22 @@ def IMMPopCount : SDNodeXForm<imm, [{
MVT::i32);
}]>;
-class BFEPattern <Instruction BFE, Instruction MOV> : Pat <
- (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
- (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
->;
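+// Select bitfield extract: either a shift-then-mask sequence, or the
+// shl followed by a shift-right idiom that extracts the low $width bits
+// (srl selects the unsigned UBFE form, sra the signed SBFE form).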
+multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
+ def : Pat <
+ (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
+ (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
+ >;
+
+ def : Pat <
+ (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+ (UBFE $src, (i32 0), $width)
+ >;
+
+ def : Pat <
+ (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+ (SBFE $src, (i32 0), $width)
+ >;
+}
// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : Pat <
@@ -618,23 +680,13 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
class IntMed3Pat<Instruction med3Inst,
SDPatternOperator max,
SDPatternOperator max_oneuse,
- SDPatternOperator min_oneuse> : Pat<
- (max (min_oneuse i32:$src0, i32:$src1),
- (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+ SDPatternOperator min_oneuse,
+ ValueType vt = i32> : Pat<
+ (max (min_oneuse vt:$src0, vt:$src1),
+ (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst $src0, $src1, $src2)
>;
-let Properties = [SDNPCommutative, SDNPAssociative] in {
-def smax_oneuse : HasOneUseBinOp<smax>;
-def smin_oneuse : HasOneUseBinOp<smin>;
-def umax_oneuse : HasOneUseBinOp<umax>;
-def umin_oneuse : HasOneUseBinOp<umin>;
-} // Properties = [SDNPCommutative, SDNPAssociative]
-
-def sub_oneuse : HasOneUseBinOp<sub>;
-
-def select_oneuse : HasOneUseTernaryOp<select>;
-
// Special conversion patterns
def cvt_rpi_i32_f32 : PatFrag <
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 8e3471b..86dc9bd 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -54,14 +54,7 @@ std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
ArrayRef<Type*> Tys) const {
// FIXME: Re-use Intrinsic::getType machinery
- switch (ID) {
- case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
- Type *F32Ty = Type::getFloatTy(Context);
- return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
- }
- default:
- llvm_unreachable("unhandled intrinsic");
- }
+ llvm_unreachable("unhandled intrinsic");
}
unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -97,8 +90,8 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
Function *F
= cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
- AttributeSet AS = getAttributes(M->getContext(),
- static_cast<AMDGPUIntrinsic::ID>(IntrID));
+ AttributeList AS =
+ getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID));
F->setAttributes(AS);
return F;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index ceae0b5..18c9bd9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -12,25 +12,8 @@
//===----------------------------------------------------------------------===//
let TargetPrefix = "AMDGPU", isTarget = 1 in {
- def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
-
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
-
- // Deprecated in favor of llvm.amdgcn.sffbh
- def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-
- // Deprecated in favor of separate int_amdgcn_cube* intrinsics.
- def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-
- // Deprecated in favor of expanded bit operations
- def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-
- // Deprecated in favor of llvm.amdgcn.rsq
- def int_AMDGPU_rsq : Intrinsic<
- [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
- >;
}
include "SIIntrinsics.td"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
new file mode 100644
index 0000000..cc56216
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -0,0 +1,88 @@
+//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
+ using namespace TargetOpcode;
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT V2S16 = LLT::vector(2, 16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ const LLT P1 = LLT::pointer(1, 64);
+ const LLT P2 = LLT::pointer(2, 64);
+
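+ // Only the operations and types the AMDGPU GlobalISel pipeline currently
+ // exercises are marked legal here; everything else is still unsupported.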
+ setAction({G_ADD, S32}, Legal);
+ setAction({G_AND, S32}, Legal);
+
+ setAction({G_BITCAST, V2S16}, Legal);
+ setAction({G_BITCAST, 1, S32}, Legal);
+
+ setAction({G_BITCAST, S32}, Legal);
+ setAction({G_BITCAST, 1, V2S16}, Legal);
+
+ // FIXME: i1 operands to intrinsics should always be legal, but other i1
+ // values may not be legal. We need to figure out how to distinguish
+ // between these two scenarios.
+ setAction({G_CONSTANT, S1}, Legal);
+ setAction({G_CONSTANT, S32}, Legal);
+ setAction({G_CONSTANT, S64}, Legal);
+
+ setAction({G_FCONSTANT, S32}, Legal);
+
+ setAction({G_GEP, P1}, Legal);
+ setAction({G_GEP, P2}, Legal);
+ setAction({G_GEP, 1, S64}, Legal);
+
+ setAction({G_ICMP, S1}, Legal);
+ setAction({G_ICMP, 1, S32}, Legal);
+
+ setAction({G_LOAD, P1}, Legal);
+ setAction({G_LOAD, P2}, Legal);
+ setAction({G_LOAD, S32}, Legal);
+ setAction({G_LOAD, 1, P1}, Legal);
+ setAction({G_LOAD, 1, P2}, Legal);
+
+ setAction({G_SELECT, S32}, Legal);
+ setAction({G_SELECT, 1, S1}, Legal);
+
+ setAction({G_SHL, S32}, Legal);
+
+ setAction({G_STORE, S32}, Legal);
+ setAction({G_STORE, 1, P1}, Legal);
+
+ // FIXME: When RegBankSelect inserts copies, it will only create new
+ // registers with scalar types. This means we can end up with
+ // G_LOAD/G_STORE/G_GEP instruction with scalar types for their pointer
+ // operands. In assert builds, the instruction selector will assert
+ // if it sees a generic instruction which isn't legal, so we need to
+ // tell it that scalar types are legal for pointer operands
+ setAction({G_GEP, S64}, Legal);
+ setAction({G_LOAD, 1, S64}, Legal);
+ setAction({G_STORE, 1, S64}, Legal);
+
+ computeTables();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
new file mode 100644
index 0000000..291e336
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -0,0 +1,30 @@
+//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class LLVMContext;
+
+/// This class provides the legalization rules for the AMDGPU target.
+class AMDGPULegalizerInfo : public LegalizerInfo {
+public:
+ AMDGPULegalizerInfo();
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 0000000..7e0e980
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,170 @@
+//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
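+// Memory intrinsics with a known constant length of at most this many
+// bytes are left untouched here; anything larger, or with a variable
+// length, is expanded into an explicit loop.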
+const unsigned MaxStaticSize = 1024;
+
+class AMDGPULowerIntrinsics : public ModulePass {
+private:
+ bool makeLIDRangeMetadata(Function &F) const;
+
+public:
+ static char ID;
+
+ AMDGPULowerIntrinsics() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+ bool expandMemIntrinsicUses(Function &F);
+ StringRef getPassName() const override {
+ return "AMDGPU Lower Intrinsics";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char AMDGPULowerIntrinsics::ID = 0;
+
+char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
+
+INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
+ false)
+
+// TODO: Should refine based on estimated number of accesses (e.g. does it
+// require splitting based on alignment)
+static bool shouldExpandOperationWithSize(Value *Size) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(Size);
+ return !CI || (CI->getZExtValue() > MaxStaticSize);
+}
+
+bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
+ Intrinsic::ID ID = F.getIntrinsicID();
+ bool Changed = false;
+
+ for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+ Instruction *Inst = cast<Instruction>(*I);
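+ // Advance the use iterator now; expanding the intrinsic below erases
+ // Inst, which would otherwise invalidate I.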
+ ++I;
+
+ switch (ID) {
+ case Intrinsic::memcpy: {
+ auto *Memcpy = cast<MemCpyInst>(Inst);
+ if (shouldExpandOperationWithSize(Memcpy->getLength())) {
+ Function *ParentFunc = Memcpy->getParent()->getParent();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
+ expandMemCpyAsLoop(Memcpy, TTI);
+ Changed = true;
+ Memcpy->eraseFromParent();
+ }
+
+ break;
+ }
+ case Intrinsic::memmove: {
+ auto *Memmove = cast<MemMoveInst>(Inst);
+ if (shouldExpandOperationWithSize(Memmove->getLength())) {
+ expandMemMoveAsLoop(Memmove);
+ Changed = true;
+ Memmove->eraseFromParent();
+ }
+
+ break;
+ }
+ case Intrinsic::memset: {
+ auto *Memset = cast<MemSetInst>(Inst);
+ if (shouldExpandOperationWithSize(Memset->getLength())) {
+ expandMemSetAsLoop(Memset);
+ Changed = true;
+ Memset->eraseFromParent();
+ }
+
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(F);
+ bool Changed = false;
+
+ for (auto *U : F.users()) {
+ auto *CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ Changed |= ST.makeLIDRangeMetadata(CI);
+ }
+ return Changed;
+}
+
+bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
+ bool Changed = false;
+
+ for (Function &F : M) {
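+ // Intrinsics are always declarations; skip function definitions.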
+ if (!F.isDeclaration())
+ continue;
+
+ switch (F.getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ case Intrinsic::memmove:
+ case Intrinsic::memset:
+ if (expandMemIntrinsicUses(F))
+ Changed = true;
+ break;
+
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::r600_read_tidig_x:
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::r600_read_local_size_x:
+ case Intrinsic::r600_read_local_size_y:
+ case Intrinsic::r600_read_local_size_z:
+ Changed |= makeLIDRangeMetadata(F);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
+ return new AMDGPULowerIntrinsics();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 7d56355..63dd0d7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -38,7 +38,6 @@ using namespace llvm;
#include "AMDGPUGenMCPseudoLowering.inc"
-
AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st,
const AsmPrinter &ap):
Ctx(ctx), ST(st), AP(ap) { }
@@ -126,9 +125,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
- int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
+ // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
+ // need to select it to the subtarget specific version, and there's no way to
+ // do that with a single pseudo source operation.
+ if (Opcode == AMDGPU::S_SETPC_B64_return)
+ Opcode = AMDGPU::S_SETPC_B64;
+ int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
@@ -151,6 +156,28 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
return MCInstLowering.lowerOperand(MO, MCOp);
}
+const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+ // TargetMachine does not support llvm-style cast. Use C++-style cast.
+ // This is safe since TM is always of type AMDGPUTargetMachine or its
+ // derived class.
+ auto *AT = static_cast<AMDGPUTargetMachine*>(&TM);
+ auto *CE = dyn_cast<ConstantExpr>(CV);
+
+ // Lower null pointers in private and local address space.
+ // Clang generates addrspacecast for null pointers in private and local
+ // address space, which needs to be lowered.
+ if (CE && CE->getOpcode() == Instruction::AddrSpaceCast) {
+ auto Op = CE->getOperand(0);
+ auto SrcAddr = Op->getType()->getPointerAddressSpace();
+ if (Op->isNullValue() && AT->getNullPointerValue(SrcAddr) == 0) {
+ auto DstAddr = CE->getType()->getPointerAddressSpace();
+ return MCConstantExpr::create(AT->getNullPointerValue(DstAddr),
+ OutContext);
+ }
+ }
+ return AsmPrinter::lowerConstant(CV);
+}
+
void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
@@ -162,7 +189,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) {
LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
C.emitError("Illegal instruction detected: " + Err);
- MI->dump();
+ MI->print(errs());
}
if (MI->isBundle()) {
@@ -173,8 +200,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
++I;
}
} else {
- // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
- // terminator instructions and should only be printed as comments.
+ // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are
+ // placeholder terminator instructions and should only be printed as
+ // comments.
if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
if (isVerbose()) {
SmallVector<char, 16> BBStr;
@@ -190,9 +218,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
- if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+ if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
if (isVerbose())
- OutStreamer->emitRawComment(" return");
+ OutStreamer->emitRawComment(" return to shader part epilog");
return;
}
@@ -202,6 +230,12 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == AMDGPU::SI_MASKED_UNREACHABLE) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" divergent unreachable");
+ return;
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
new file mode 100644
index 0000000..9a391d0
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -0,0 +1,2881 @@
+//===- AMDGPUMachineCFGStructurizer.cpp - Machine CFG structurizer pass --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level CFG structurizer pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <tuple>
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpucfgstructurizer"
+
+namespace {
+class PHILinearizeDestIterator;
+
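+// Maps each PHI destination register to its set of (source register,
+// predecessor block) pairs, so PHIs can be tracked and rewritten while
+// the CFG is being linearized.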
+class PHILinearize {
+ friend class PHILinearizeDestIterator;
+
+public:
+ typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT;
+
+private:
+ typedef DenseSet<PHISourceT> PHISourcesT;
+ typedef struct {
+ unsigned DestReg;
+ DebugLoc DL;
+ PHISourcesT Sources;
+ } PHIInfoElementT;
+ typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT;
+ PHIInfoT PHIInfo;
+
+ static unsigned phiInfoElementGetDest(PHIInfoElementT *Info);
+ static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef);
+ static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info);
+ static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ static void phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ PHIInfoElementT *findPHIInfoElement(unsigned DestReg);
+ PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+
+public:
+ bool findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources);
+ void addDest(unsigned DestReg, const DebugLoc &DL);
+ void replaceDef(unsigned OldDestReg, unsigned NewDestReg);
+ void deleteDef(unsigned DestReg);
+ void addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB);
+ void removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB = nullptr);
+ bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg);
+ bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr);
+ unsigned getNumSources(unsigned DestReg);
+ void dump(MachineRegisterInfo *MRI);
+ void clear();
+
+ typedef PHISourcesT::iterator source_iterator;
+ typedef PHILinearizeDestIterator dest_iterator;
+
+ dest_iterator dests_begin();
+ dest_iterator dests_end();
+
+ source_iterator sources_begin(unsigned Reg);
+ source_iterator sources_end(unsigned Reg);
+};
+
+class PHILinearizeDestIterator {
+private:
+ PHILinearize::PHIInfoT::iterator Iter;
+
+public:
+ unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); }
+ PHILinearizeDestIterator &operator++() {
+ ++Iter;
+ return *this;
+ }
+ bool operator==(const PHILinearizeDestIterator &I) const {
+ return I.Iter == Iter;
+ }
+ bool operator!=(const PHILinearizeDestIterator &I) const {
+ return I.Iter != Iter;
+ }
+
+ PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {}
+};
+
+unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) {
+ return Info->DestReg;
+}
+
+void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info,
+ unsigned NewDef) {
+ Info->DestReg = NewDef;
+}
+
+PHILinearize::PHISourcesT &
+PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) {
+ return Info->Sources;
+}
+
+void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ // Assertion ensures we don't use the same SourceMBB for the
+ // sources, because we cannot have different registers with
+ // identical predecessors, but we can have the same register for
+ // multiple predecessors.
+#if !defined(NDEBUG)
+ for (auto SI : phiInfoElementGetSources(Info)) {
+ assert((SI.second != SourceMBB || SourceReg == SI.first));
+ }
+#endif
+
+ phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB));
+}
+
+void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info,
+ unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ auto &Sources = phiInfoElementGetSources(Info);
+ SmallVector<PHISourceT, 4> EliminatedSources;
+ for (auto SI : Sources) {
+ if (SI.first == SourceReg &&
+ (SI.second == nullptr || SI.second == SourceMBB)) {
+ EliminatedSources.push_back(PHISourceT(SI.first, SI.second));
+ }
+ }
+
+ for (auto &Source : EliminatedSources) {
+ Sources.erase(Source);
+ }
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElement(unsigned DestReg) {
+ for (auto I : PHIInfo) {
+ if (phiInfoElementGetDest(I) == DestReg) {
+ return I;
+ }
+ }
+ return nullptr;
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.first == SourceReg &&
+ (SI.second == nullptr || SI.second == SourceMBB)) {
+ return I;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 4> &Sources) {
+ bool FoundSource = false;
+ for (auto I : PHIInfo) {
+ for (auto SI : phiInfoElementGetSources(I)) {
+ if (SI.second == SourceMBB) {
+ FoundSource = true;
+ Sources.push_back(SI.first);
+ }
+ }
+ }
+ return FoundSource;
+}
+
+void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) {
+ assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists");
+ PHISourcesT EmptySet;
+ PHIInfoElementT *NewElement = new PHIInfoElementT();
+ NewElement->DestReg = DestReg;
+ NewElement->DL = DL;
+ NewElement->Sources = EmptySet;
+ PHIInfo.insert(NewElement);
+}
+
+void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) {
+ phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg);
+}
+
+void PHILinearize::deleteDef(unsigned DestReg) {
+ PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg);
+ PHIInfo.erase(InfoElement);
+ delete InfoElement;
+}
+
+void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg,
+ MachineBasicBlock *SourceMBB) {
+ phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+ unsigned &DestReg) {
+ PHIInfoElementT *InfoElement =
+ findPHIInfoElementFromSource(SourceReg, SourceMBB);
+ if (InfoElement != nullptr) {
+ DestReg = phiInfoElementGetDest(InfoElement);
+ return true;
+ }
+ return false;
+}
+
+bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) {
+ unsigned DestReg;
+ return findDest(Reg, SourceMBB, DestReg);
+}
+
+unsigned PHILinearize::getNumSources(unsigned DestReg) {
+ return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size();
+}
+
+void PHILinearize::dump(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ dbgs() << "=PHIInfo Start=\n";
+ for (auto PII : this->PHIInfo) {
+ PHIInfoElementT &Element = *PII;
+ dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+ << " Sources: {";
+ for (auto &SI : Element.Sources) {
+ dbgs() << PrintReg(SI.first, TRI) << "(BB#"
+ << SI.second->getNumber() << "),";
+ }
+ dbgs() << "}\n";
+ }
+ dbgs() << "=PHIInfo End=\n";
+}
+
+void PHILinearize::clear() { PHIInfo = PHIInfoT(); }
+
+PHILinearize::dest_iterator PHILinearize::dests_begin() {
+ return PHILinearizeDestIterator(PHIInfo.begin());
+}
+
+PHILinearize::dest_iterator PHILinearize::dests_end() {
+ return PHILinearizeDestIterator(PHIInfo.end());
+}
+
+PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).begin();
+}
+PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) {
+ auto InfoElement = findPHIInfoElement(Reg);
+ return phiInfoElementGetSources(InfoElement).end();
+}
+
+class RegionMRT;
+class MBBMRT;
+
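+// Machine-level PHIs have the operand layout:
+// dest = PHI reg0, mbb0, reg1, mbb1, ...
+// so input i occupies operands (2 * i + 1) and (2 * i + 2).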
+static unsigned getPHINumInputs(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return (PHI.getNumOperands() - 1) / 2;
+}
+
+static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 2).getMBB();
+}
+
+static void setPhiPred(MachineInstr &PHI, unsigned Index,
+ MachineBasicBlock *NewPred) {
+ PHI.getOperand(Index * 2 + 2).setMBB(NewPred);
+}
+
+static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(Index * 2 + 1).getReg();
+}
+
+static unsigned getPHIDestReg(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+ return PHI.getOperand(0).getReg();
+}
+
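+// A LinearizedRegion collects the blocks of a structurized region together
+// with its live-out registers and the block-select registers used to pick
+// the successor.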
+class LinearizedRegion {
+protected:
+ MachineBasicBlock *Entry;
+ // The exit block is part of the region, and is the last
+ // merge block before exiting the region.
+ MachineBasicBlock *Exit;
+ DenseSet<unsigned> LiveOuts;
+ SmallPtrSet<MachineBasicBlock *, 1> MBBs;
+ bool HasLoop;
+ LinearizedRegion *Parent;
+ RegionMRT *RMRT;
+
+ void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo);
+
+ void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion);
+
+ void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo,
+ RegionMRT *TopRegion = nullptr);
+
+public:
+ void setRegionMRT(RegionMRT *Region) { RMRT = Region; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ void setParent(LinearizedRegion *P) { Parent = P; }
+
+ LinearizedRegion *getParent() { return Parent; }
+
+ void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr);
+
+ void setBBSelectRegIn(unsigned Reg);
+
+ unsigned getBBSelectRegIn();
+
+ void setBBSelectRegOut(unsigned Reg, bool IsLiveOut);
+
+ unsigned getBBSelectRegOut();
+
+ void setHasLoop(bool Value);
+
+ bool getHasLoop();
+
+ void addLiveOut(unsigned VReg);
+
+ void removeLiveOut(unsigned Reg);
+
+ void replaceLiveOut(unsigned OldReg, unsigned NewReg);
+
+ void replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI, bool ReplaceInside,
+ bool ReplaceOutside, bool IncludeLoopPHIs);
+
+ void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ void replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI);
+
+ DenseSet<unsigned> *getLiveOuts();
+
+ void setEntry(MachineBasicBlock *NewEntry);
+
+ MachineBasicBlock *getEntry();
+
+ void setExit(MachineBasicBlock *NewExit);
+
+ MachineBasicBlock *getExit();
+
+ void addMBB(MachineBasicBlock *MBB);
+
+ void addMBBs(LinearizedRegion *InnerRegion);
+
+ bool contains(MachineBasicBlock *MBB);
+
+ bool isLiveOut(unsigned Reg);
+
+ bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI);
+
+ void removeFalseRegisterKills(MachineRegisterInfo *MRI);
+
+ void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
+
+ LinearizedRegion();
+
+ ~LinearizedRegion();
+};
+
+class MRT {
+protected:
+ RegionMRT *Parent;
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+
+public:
+ unsigned getBBSelectRegIn() { return BBSelectRegIn; }
+
+ unsigned getBBSelectRegOut() { return BBSelectRegOut; }
+
+ void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; }
+
+ void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; }
+
+ virtual RegionMRT *getRegionMRT() { return nullptr; }
+
+ virtual MBBMRT *getMBBMRT() { return nullptr; }
+
+ bool isRegion() { return getRegionMRT() != nullptr; }
+
+ bool isMBB() { return getMBBMRT() != nullptr; }
+
+ bool isRoot() { return Parent == nullptr; }
+
+ void setParent(RegionMRT *Region) { Parent = Region; }
+
+ RegionMRT *getParent() { return Parent; }
+
+ static MachineBasicBlock *
+ initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap);
+
+ static RegionMRT *buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI);
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0;
+
+ void dumpDepth(int depth) {
+ for (int i = depth; i > 0; --i) {
+ dbgs() << " ";
+ }
+ }
+
+ virtual ~MRT() {}
+};
+
+class MBBMRT : public MRT {
+ MachineBasicBlock *MBB;
+
+public:
+ virtual MBBMRT *getMBBMRT() { return this; }
+
+ MachineBasicBlock *getMBB() { return MBB; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "MBB: " << getMBB()->getNumber();
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+ }
+
+ MBBMRT(MachineBasicBlock *BB) : MBB(BB) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+};
+
+class RegionMRT : public MRT {
+protected:
+ MachineRegion *Region;
+ LinearizedRegion *LRegion;
+ MachineBasicBlock *Succ;
+
+ SetVector<MRT *> Children;
+
+public:
+ virtual RegionMRT *getRegionMRT() { return this; }
+
+ void setLinearizedRegion(LinearizedRegion *LinearizeRegion) {
+ LRegion = LinearizeRegion;
+ }
+
+ LinearizedRegion *getLinearizedRegion() { return LRegion; }
+
+ MachineRegion *getMachineRegion() { return Region; }
+
+ unsigned getInnerOutputRegister() {
+ return (*(Children.begin()))->getBBSelectRegOut();
+ }
+
+ void addChild(MRT *Tree) { Children.insert(Tree); }
+
+ SetVector<MRT *> *getChildren() { return &Children; }
+
+ virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) {
+ dumpDepth(depth);
+ dbgs() << "Region: " << (void *)Region;
+ dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI);
+ dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n";
+
+ dumpDepth(depth);
+ if (getSucc())
+ dbgs() << "Succ: " << getSucc()->getNumber() << "\n";
+ else
+ dbgs() << "Succ: none \n";
+ for (auto MRTI : Children) {
+ MRTI->dump(TRI, depth + 1);
+ }
+ }
+
+ MRT *getEntryTree() { return Children.back(); }
+
+ MRT *getExitTree() { return Children.front(); }
+
+ MachineBasicBlock *getEntry() {
+ MRT *Tree = Children.back();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ MachineBasicBlock *getExit() {
+ MRT *Tree = Children.front();
+ return (Tree->isRegion()) ? Tree->getRegionMRT()->getExit()
+ : Tree->getMBBMRT()->getMBB();
+ }
+
+ void setSucc(MachineBasicBlock *MBB) { Succ = MBB; }
+
+ MachineBasicBlock *getSucc() { return Succ; }
+
+ bool contains(MachineBasicBlock *MBB) {
+ for (auto CI : Children) {
+ if (CI->isMBB()) {
+ if (MBB == CI->getMBBMRT()->getMBB()) {
+ return true;
+ }
+ } else {
+ if (CI->getRegionMRT()->contains(MBB)) {
+ return true;
+ } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr &&
+ CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ void replaceLiveOutReg(unsigned Register, unsigned NewRegister) {
+ LinearizedRegion *LRegion = getLinearizedRegion();
+ LRegion->replaceLiveOut(Register, NewRegister);
+ for (auto &CI : Children) {
+ if (CI->isRegion()) {
+ CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+ }
+ }
+ }
+
+ RegionMRT(MachineRegion *MachineRegion)
+ : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) {
+ setParent(nullptr);
+ setBBSelectRegOut(0);
+ setBBSelectRegIn(0);
+ }
+
+ virtual ~RegionMRT() {
+ if (LRegion) {
+ delete LRegion;
+ }
+
+ for (auto CI : Children) {
+ delete &(*CI);
+ }
+ }
+};
+
+static unsigned createBBSelectReg(const SIInstrInfo *TII,
+ MachineRegisterInfo *MRI) {
+ return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32));
+}
+
+MachineBasicBlock *
+MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo,
+ DenseMap<MachineRegion *, RegionMRT *> &RegionMap) {
+ for (auto &MFI : MF) {
+ MachineBasicBlock *ExitMBB = &MFI;
+ if (ExitMBB->succ_size() == 0) {
+ return ExitMBB;
+ }
+ }
+ llvm_unreachable("CFG has no exit block");
+ return nullptr;
+}
+
+RegionMRT *MRT::buildMRT(MachineFunction &MF,
+ const MachineRegionInfo *RegionInfo,
+ const SIInstrInfo *TII, MachineRegisterInfo *MRI) {
+ SmallPtrSet<MachineRegion *, 4> PlacedRegions;
+ DenseMap<MachineRegion *, RegionMRT *> RegionMap;
+ MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion();
+ RegionMRT *Result = new RegionMRT(TopLevelRegion);
+ RegionMap[TopLevelRegion] = Result;
+
+ // Insert the exit block first, we need it to be the merge node
+ // for the top level region.
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap);
+
+ unsigned BBSelectRegIn = createBBSelectReg(TII, MRI);
+ MBBMRT *ExitMRT = new MBBMRT(Exit);
+ RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT);
+ ExitMRT->setBBSelectRegIn(BBSelectRegIn);
+
+ for (auto MBBI : post_order(&(MF.front()))) {
+ MachineBasicBlock *MBB = &(*MBBI);
+
+ // Skip Exit since we already added it
+ if (MBB == Exit) {
+ continue;
+ }
+
+ DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n");
+ MBBMRT *NewMBB = new MBBMRT(MBB);
+ MachineRegion *Region = RegionInfo->getRegionFor(MBB);
+
+ // Ensure we have the MRT region
+ if (RegionMap.count(Region) == 0) {
+ RegionMRT *NewMRTRegion = new RegionMRT(Region);
+ RegionMap[Region] = NewMRTRegion;
+
+ // Ensure all parents are in the RegionMap
+ MachineRegion *Parent = Region->getParent();
+ while (RegionMap.count(Parent) == 0) {
+ RegionMRT *NewMRTParent = new RegionMRT(Parent);
+ NewMRTParent->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(NewMRTParent);
+ RegionMap[Parent] = NewMRTParent;
+ NewMRTRegion = NewMRTParent;
+ Parent = Parent->getParent();
+ }
+ RegionMap[Parent]->addChild(NewMRTRegion);
+ NewMRTRegion->setParent(RegionMap[Parent]);
+ }
+
+ // Add MBB to Region MRT
+ RegionMap[Region]->addChild(NewMBB);
+ NewMBB->setParent(RegionMap[Region]);
+ RegionMap[Region]->setSucc(Region->getExit());
+ }
+ return Result;
+}
+
+void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ // If this is a source register to a PHI we are chaining, it
+ // must be live out.
+ if (PHIInfo.isSource(Reg)) {
+ DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+ // If this is live out of the MBB
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (UI.getParent()->getParent() != MBB) {
+ DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber()
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ } else {
+ // If the use is in the same MBB we have to make sure
+ // it is after the def, otherwise it is live out in a loop
+ MachineInstr *UseInstr = UI.getParent();
+ for (MachineBasicBlock::instr_iterator
+ MII = UseInstr->getIterator(),
+ MIE = UseInstr->getParent()->instr_end();
+ MII != MIE; ++MII) {
+ if ((&(*MII)) == DefInstr) {
+ DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI)
+ << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+ MachineInstr *DefInstr,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ if (TRI->isVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+ for (auto &UI : MRI->use_operands(Reg)) {
+ if (!Region->contains(UI.getParent()->getParent())) {
+ DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+ << "): " << PrintReg(Reg, TRI) << "\n");
+ addLiveOut(Reg);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n");
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo);
+ }
+ }
+ }
+
+ // If a successor has a PHI with a source coming from this MBB, we have
+ // to add that register as live out.
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ E = MBB->succ_end();
+ SI != E; ++SI) {
+ for (auto &II : *(*SI)) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (getPHIPred(PHI, i) == MBB) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber()
+ << " -> BB#" << (*SI)->getNumber()
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "-Store Live Outs Endn-\n");
+}
+
+void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *TopRegion) {
+ for (auto &II : *MBB) {
+ for (auto &RI : II.defs()) {
+ storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI,
+ PHIInfo);
+ }
+ for (auto &IRI : II.implicit_operands()) {
+ if (IRI.isDef()) {
+ storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI,
+ TRI, PHIInfo);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo,
+ RegionMRT *CurrentTopRegion) {
+ MachineBasicBlock *Exit = Region->getSucc();
+
+ RegionMRT *TopRegion =
+ CurrentTopRegion == nullptr ? Region : CurrentTopRegion;
+
+ // Check if the exit is the end of the function; if so, there are no
+ // live outs.
+ if (Exit == nullptr)
+ return;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isMBB()) {
+ auto MBB = CI->getMBBMRT()->getMBB();
+ storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion);
+ } else {
+ LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
+ // We only need to store registers that are live out of the linearized
+ // region.
+ for (auto MBBI : SubRegion->MBBs) {
+ storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
+ }
+ }
+ }
+
+ if (CurrentTopRegion == nullptr) {
+ auto Succ = Region->getSucc();
+ for (auto &II : *Succ) {
+ if (II.isPHI()) {
+ MachineInstr &PHI = II;
+ int numPreds = getPHINumInputs(PHI);
+ for (int i = 0; i < numPreds; ++i) {
+ if (Region->contains(getPHIPred(PHI, i))) {
+ unsigned PHIReg = getPHISourceReg(PHI, i);
+ DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+ << "): " << PrintReg(PHIReg, TRI) << "\n");
+ addLiveOut(PHIReg);
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+ OS << "Linearized Region {";
+ bool IsFirst = true;
+ for (const auto &MBB : MBBs) {
+ if (IsFirst) {
+ IsFirst = false;
+ } else {
+ OS << " ,";
+ }
+ OS << MBB->getNumber();
+ }
+ OS << "} (" << Entry->getNumber() << ", "
+ << (Exit == nullptr ? -1 : Exit->getNumber())
+ << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
+ << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+ for (auto &LI : LiveOuts) {
+ OS << PrintReg(LI, TRI) << " ";
+ }
+ OS << "} \n";
+}
+
+unsigned LinearizedRegion::getBBSelectRegIn() {
+ return getRegionMRT()->getBBSelectRegIn();
+}
+
+unsigned LinearizedRegion::getBBSelectRegOut() {
+ return getRegionMRT()->getBBSelectRegOut();
+}
+
+void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; }
+
+bool LinearizedRegion::getHasLoop() { return HasLoop; }
+
+void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
+
+void LinearizedRegion::removeLiveOut(unsigned Reg) {
+ if (isLiveOut(Reg))
+ LiveOuts.erase(Reg);
+}
+
+void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
+ if (isLiveOut(OldReg)) {
+ removeLiveOut(OldReg);
+ addLiveOut(NewReg);
+ }
+}
+
+void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+ MachineRegisterInfo *MRI,
+ bool ReplaceInside, bool ReplaceOutside,
+ bool IncludeLoopPHI) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ DEBUG(dbgs() << "Pepareing to replace register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+
+ // If we are replacing outside, we also need to update the LiveOuts
+ if (ReplaceOutside &&
+ (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
+ LinearizedRegion *Current = this;
+ while (Current != nullptr && Current->getEntry() != nullptr) {
+ DEBUG(dbgs() << "Region before register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current->replaceLiveOut(Register, NewRegister);
+ DEBUG(dbgs() << "Region after register replace\n");
+ DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+ Current = Current->getParent();
+ }
+ }
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+
+ // We don't rewrite defs.
+ if (O.isDef())
+ continue;
+
+ bool IsInside = contains(O.getParent()->getParent());
+ bool IsLoopPHI = IsInside && (O.getParent()->isPHI() &&
+ O.getParent()->getParent() == getEntry());
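+ // A use is rewritten when it is inside the region and ReplaceInside is
+ // set, outside the region and ReplaceOutside is set, or a loop PHI use
+ // in the entry block and IncludeLoopPHI is set.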
+ bool ShouldReplace = (IsInside && ReplaceInside) ||
+ (!IsInside && ReplaceOutside) ||
+ (IncludeLoopPHI && IsLoopPHI);
+ if (ShouldReplace) {
+
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+ } else {
+ DEBUG(dbgs() << "Replacing register (region): "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ }
+}
+
+void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs);
+}
+
+void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register,
+ unsigned NewRegister,
+ bool IncludeLoopPHIs,
+ MachineRegisterInfo *MRI) {
+ replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs);
+}
+
+DenseSet<unsigned> *LinearizedRegion::getLiveOuts() { return &LiveOuts; }
+
+void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) {
+ Entry = NewEntry;
+}
+
+MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; }
+
+void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; }
+
+MachineBasicBlock *LinearizedRegion::getExit() { return Exit; }
+
+void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); }
+
+void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) {
+ for (const auto &MBB : InnerRegion->MBBs) {
+ addMBB(MBB);
+ }
+}
+
+bool LinearizedRegion::contains(MachineBasicBlock *MBB) {
+ return MBBs.count(MBB) == 1;
+}
+
+bool LinearizedRegion::isLiveOut(unsigned Reg) {
+ return LiveOuts.count(Reg) == 1;
+}
+
+bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) {
+ return MRI->def_begin(Reg) == MRI->def_end();
+}
+
+// After the code has been structurized, operands that were flagged as
+// kills before are no longer necessarily register kills.
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
+ const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ for (auto MBBI : MBBs) {
+ MachineBasicBlock *MBB = MBBI;
+ for (auto &II : *MBB) {
+ for (auto &RI : II.uses()) {
+ if (RI.isReg()) {
+ unsigned Reg = RI.getReg();
+ if (TRI->isVirtualRegister(Reg)) {
+ if (hasNoDef(Reg, MRI))
+ continue;
+ if (!MRI->hasOneDef(Reg)) {
+ DEBUG(this->getEntry()->getParent()->dump());
+ DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n");
+ }
+
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register "
+ << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ MachineOperand *Def = &(*(MRI->def_begin(Reg)));
+ MachineOperand *UseOperand = &(RI);
+ bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB;
+ if (UseIsOutsideDefMBB && UseOperand->isKill()) {
+ DEBUG(dbgs() << "Removing kill flag on register: "
+ << PrintReg(Reg, TRI) << "\n");
+ UseOperand->setIsKill(false);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void LinearizedRegion::initLiveOut(RegionMRT *Region,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ storeLiveOuts(Region, MRI, TRI, PHIInfo);
+}
+
+LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI,
+ PHILinearize &PHIInfo) {
+ setEntry(MBB);
+ setExit(MBB);
+ storeLiveOuts(MBB, MRI, TRI, PHIInfo);
+ MBBs.insert(MBB);
+ Parent = nullptr;
+}
+
+LinearizedRegion::LinearizedRegion() {
+ setEntry(nullptr);
+ setExit(nullptr);
+ Parent = nullptr;
+}
+
+LinearizedRegion::~LinearizedRegion() {}
+
+class AMDGPUMachineCFGStructurizer : public MachineFunctionPass {
+private:
+ const MachineRegionInfo *Regions;
+ const SIInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ unsigned BBSelectRegister;
+ PHILinearize PHIInfo;
+ DenseMap<MachineBasicBlock *, MachineBasicBlock *> FallthroughMap;
+
+ void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &RegionIndices);
+ void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices);
+
+ void storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices = nullptr);
+
+ unsigned storePHILinearizationInfo(MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices);
+
+ void extractKilledPHIs(MachineBasicBlock *MBB);
+
+ bool shrinkPHI(MachineInstr &PHI, SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg);
+
+ bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices, unsigned *ReplaceReg);
+
+ void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg,
+ MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices);
+ void replaceLiveOutRegs(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg,
+ LinearizedRegion *LRegion);
+ void rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ MachineInstr &PHI, LinearizedRegion *LRegion);
+
+ void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion);
+ void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB,
+ MachineInstr &PHI);
+ void rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB);
+
+ bool regionIsSimpleIf(RegionMRT *Region);
+
+ void transformSimpleIfRegion(RegionMRT *Region);
+
+ void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II);
+
+ void insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL = DebugLoc());
+
+ MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region);
+
+ void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB, unsigned DestRegister,
+ unsigned IfSourceRegister, unsigned CodeSourceRegister,
+ bool IsUndefIfSource = false);
+
+ MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd,
+ MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds);
+
+ void prunePHIInfo(MachineBasicBlock *MBB);
+ void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg);
+
+ void createEntryPHIs(LinearizedRegion *CurrentRegion);
+ void resolvePHIInfos(MachineBasicBlock *FunctionEntry);
+
+ void replaceRegisterWith(unsigned Register, unsigned NewRegister);
+
+ MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB,
+ MachineBasicBlock *CodeBB,
+ LinearizedRegion *LRegion,
+ unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut);
+
+ MachineBasicBlock *
+ createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut);
+ void ensureCondIsNotKilled(SmallVector<MachineOperand, 1> Cond);
+
+ void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg);
+
+ MachineInstr *getDefInstr(unsigned Reg);
+ void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion, unsigned DestReg,
+ unsigned SourceReg);
+ bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion,
+ unsigned Register);
+ void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion);
+
+ void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion);
+ void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitExit(LinearizedRegion *LRegion);
+
+ MachineBasicBlock *splitEntry(LinearizedRegion *LRegion);
+
+ LinearizedRegion *initLinearizedRegion(RegionMRT *Region);
+
+ bool structurizeComplexRegion(RegionMRT *Region);
+
+ bool structurizeRegion(RegionMRT *Region);
+
+ bool structurizeRegions(RegionMRT *Region, bool isTopRegion);
+
+public:
+ static char ID;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineRegionInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) {
+ initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void initFallthroughMap(MachineFunction &MF);
+
+ void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut);
+
+ unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII);
+
+ RegionMRT *RMRT;
+ void setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; }
+
+ RegionMRT *getRegionMRT() { return RMRT; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+char AMDGPUMachineCFGStructurizer::ID = 0;
+
+bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Succ = Region->getSucc();
+ bool FoundBypass = false;
+ bool FoundIf = false;
+
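+ // A simple if-region has an entry with exactly two successors: one edge
+ // that bypasses directly to the region successor, and one through an
+ // if-body block whose single successor is that same block.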
+ if (Entry->succ_size() != 2) {
+ return false;
+ }
+
+ for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
+ E = Entry->succ_end();
+ SI != E; ++SI) {
+ MachineBasicBlock *Current = *SI;
+
+ if (Current == Succ) {
+ FoundBypass = true;
+ } else if ((Current->succ_size() == 1) &&
+ *(Current->succ_begin()) == Succ) {
+ FoundIf = true;
+ }
+ }
+
+ return FoundIf && FoundBypass;
+}
+
+void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) {
+ MachineBasicBlock *Entry = Region->getEntry();
+ MachineBasicBlock *Exit = Region->getExit();
+ TII->convertNonUniformIfRegion(Entry, Exit);
+}
+
+static void fixMBBTerminator(MachineBasicBlock *MBB) {
+
+ if (MBB->succ_size() == 1) {
+ auto *Succ = *(MBB->succ_begin());
+ for (auto &TI : MBB->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB() && UI.getMBB() != Succ) {
+ UI.setMBB(Succ);
+ }
+ }
+ }
+ }
+}
+
+static void fixRegionTerminator(RegionMRT *Region) {
+ MachineBasicBlock *InternalSucc = nullptr;
+ MachineBasicBlock *ExternalSucc = nullptr;
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ auto Exit = LRegion->getExit();
+
+ SmallPtrSet<MachineBasicBlock *, 2> Successors;
+ for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(),
+ SE = Exit->succ_end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *Succ = *SI;
+ if (LRegion->contains(Succ)) {
+ // Do not allow re-assign
+ assert(InternalSucc == nullptr);
+ InternalSucc = Succ;
+ } else {
+ // Do not allow re-assign
+ assert(ExternalSucc == nullptr);
+ ExternalSucc = Succ;
+ }
+ }
+
+ for (auto &TI : Exit->terminators()) {
+ for (auto &UI : TI.uses()) {
+ if (UI.isMBB()) {
+ auto Target = UI.getMBB();
+ if (Target != InternalSucc && Target != ExternalSucc) {
+ UI.setMBB(ExternalSucc);
+ }
+ }
+ }
+ }
+}
+
+// If a region is just a sequence of regions (and the exit
+// block in the case of the top level region), we can simply skip
+// linearizing it, because it is already linear.
+bool regionIsSequence(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ if (CI->getMBBMRT()->getMBB()->succ_size() > 1) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void fixupRegionExits(RegionMRT *Region) {
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (!CI->isRegion()) {
+ fixMBBTerminator(CI->getMBBMRT()->getMBB());
+ } else {
+ fixRegionTerminator(CI->getRegionMRT());
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ RegionMRT *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (Region->contains(Pred)) {
+ PHIRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices(
+ LinearizedRegion *Region, MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHINonRegionIndices) {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ MachineBasicBlock *Pred = getPHIPred(PHI, i);
+ if (!Region->contains(Pred)) {
+ PHINonRegionIndices.push_back(i);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
+ unsigned LDestReg, MachineInstr &PHI,
+ SmallVector<unsigned, 2> *RegionIndices) {
+ if (RegionIndices) {
+ for (auto i : *RegionIndices) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ } else {
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+ }
+ }
+}
+
+unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
+ MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
+ unsigned DestReg = getPHIDestReg(PHI);
+ unsigned LinearizeDestReg =
+ MRI->createVirtualRegister(MRI->getRegClass(DestReg));
+ PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
+ storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
+ return LinearizeDestReg;
+}
+
+void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
+ // We need to create a new chain for the killed phi, but there is no
+ // need to do the renaming outside or inside the block.
+ SmallPtrSet<MachineInstr *, 2> PHIs;
+ for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+ E = MBB->instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ unsigned PHIDestReg = getPHIDestReg(Instr);
+ DEBUG(dbgs() << "Extractking killed phi:\n");
+ DEBUG(Instr.dump());
+ PHIs.insert(&Instr);
+ PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
+ storePHILinearizationInfoDest(PHIDestReg, Instr);
+ }
+ }
+
+ for (auto PI : PHIs) {
+ PI->eraseFromParent();
+ }
+}
+
+static bool isPHIRegionIndex(const SmallVectorImpl<unsigned> &PHIRegionIndices,
+ unsigned Index) {
+ for (auto i : PHIRegionIndices) {
+ if (i == Index)
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg);
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+ unsigned CombinedSourceReg,
+ MachineBasicBlock *SourceMBB,
+ SmallVector<unsigned, 2> &PHIIndices,
+ unsigned *ReplaceReg) {
+ DEBUG(dbgs() << "Shrink PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool Replaced = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ int SingleExternalEntryIndex = -1;
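+ // SingleExternalEntryIndex is -1 while no external (non-region) input has
+ // been seen, -2 once more than one has been seen, and otherwise holds the
+ // index of the single external input.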
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIIndices, i)) {
+ if (SingleExternalEntryIndex == -1) {
+ // Single entry
+ SingleExternalEntryIndex = i;
+ } else {
+ // Multiple entries
+ SingleExternalEntryIndex = -2;
+ }
+ }
+ }
+
+ if (SingleExternalEntryIndex > -1) {
+ *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex);
+ // We should not rewrite the code; we should only pick up the single
+ // value that represents the shrunk PHI.
+ Replaced = true;
+ } else {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ if (SourceMBB) {
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(SourceMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << SourceMBB->getNumber());
+ }
+
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ }
+ PHI.eraseFromParent();
+ return Replaced;
+}
+
+void AMDGPUMachineCFGStructurizer::replacePHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+ DEBUG(dbgs() << "Replace PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI)
+ << "<def> = PHI(");
+
+ bool HasExternalEdge = false;
+ unsigned NumInputs = getPHINumInputs(PHI);
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (!isPHIRegionIndex(PHIRegionIndices, i)) {
+ HasExternalEdge = true;
+ }
+ }
+
+ if (HasExternalEdge) {
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(LastMerge);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << LastMerge->getNumber());
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ } else {
+ replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg);
+ }
+ PHI.eraseFromParent();
+}
+
+void AMDGPUMachineCFGStructurizer::replaceEntryPHI(
+ MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB,
+ SmallVector<unsigned, 2> &PHIRegionIndices) {
+
+ DEBUG(dbgs() << "Replace entry PHI: ");
+ DEBUG(PHI.dump());
+ DEBUG(dbgs() << " with ");
+
+ unsigned NumInputs = getPHINumInputs(PHI);
+ unsigned NumNonRegionInputs = NumInputs;
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ NumNonRegionInputs--;
+ }
+ }
+
+ if (NumNonRegionInputs == 0) {
+ auto DestReg = getPHIDestReg(PHI);
+ replaceRegisterWith(DestReg, CombinedSourceReg);
+ DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n");
+ PHI.eraseFromParent();
+ } else {
+ DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << "<def> = PHI(");
+ MachineBasicBlock *MBB = PHI.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI),
+ getPHIDestReg(PHI));
+ MIB.addReg(CombinedSourceReg);
+ MIB.addMBB(IfMBB);
+ DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#"
+ << IfMBB->getNumber());
+ for (unsigned i = 0; i < NumInputs; ++i) {
+ if (isPHIRegionIndex(PHIRegionIndices, i)) {
+ continue;
+ }
+ unsigned SourceReg = getPHISourceReg(PHI, i);
+ MachineBasicBlock *SourcePred = getPHIPred(PHI, i);
+ MIB.addReg(SourceReg);
+ MIB.addMBB(SourcePred);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << SourcePred->getNumber());
+ }
+ DEBUG(dbgs() << ")\n");
+ PHI.eraseFromParent();
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs(
+ MachineInstr &PHI, SmallVector<unsigned, 2> &PHIRegionIndices,
+ unsigned CombinedSourceReg, LinearizedRegion *LRegion) {
+ bool WasLiveOut = false;
+ for (auto PII : PHIRegionIndices) {
+ unsigned Reg = getPHISourceReg(PHI, PII);
+ if (LRegion->isLiveOut(Reg)) {
+ bool IsDead = true;
+
+ // Check if register is live out of the basic block
+ MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent();
+ for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) {
+ if ((*UI).getParent()->getParent() != DefMBB) {
+ IsDead = false;
+ }
+ }
+
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is "
+ << (IsDead ? "dead" : "alive") << " after PHI replace\n");
+ if (IsDead) {
+ LRegion->removeLiveOut(Reg);
+ }
+ WasLiveOut = true;
+ }
+ }
+
+ if (WasLiveOut)
+ LRegion->addLiveOut(CombinedSourceReg);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ MachineInstr &PHI,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(Region, PHI, PHIRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHIRegionIndices);
+
+ replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices);
+ replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion);
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB,
+ MachineInstr &PHI) {
+ SmallVector<unsigned, 2> PHINonRegionIndices;
+ getPHINonRegionIndices(Region, PHI, PHINonRegionIndices);
+ unsigned LinearizedSourceReg =
+ storePHILinearizationInfo(PHI, &PHINonRegionIndices);
+ replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices);
+}
+
+static void collectPHIs(MachineBasicBlock *MBB,
+ SmallVector<MachineInstr *, 2> &PHIs) {
+ for (auto &BBI : *MBB) {
+ if (BBI.isPHI()) {
+ PHIs.push_back(&BBI);
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region,
+ MachineBasicBlock *LastMerge,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Exit = Region->getSucc();
+ if (Exit == nullptr)
+ return;
+
+ collectPHIs(Exit, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region,
+ MachineBasicBlock *IfMBB) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ auto Entry = Region->getEntry();
+
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ rewriteRegionEntryPHI(Region, IfMBB, *PHII);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB,
+ MachineBasicBlock *Dest,
+ const DebugLoc &DL) {
+ DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber()
+ << " -> " << Dest->getNumber() << "\n");
+ MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator();
+ bool HasTerminator = Terminator != MBB->instr_end();
+ if (HasTerminator) {
+ TII->ReplaceTailWithBranchTo(Terminator, Dest);
+ }
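+ // Only emit an explicit branch if Dest is not the layout fall-through of
+ // MBB.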
+ if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) {
+ TII->insertUnconditionalBranch(*MBB, Dest, DL);
+ }
+}
+
+static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) {
+ MachineBasicBlock *result = nullptr;
+ for (auto &MFI : MF) {
+ if (MFI.succ_size() == 0) {
+ if (result == nullptr) {
+ result = &MFI;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ return result;
+}
+
+static bool hasOneExitNode(MachineFunction &MF) {
+ return getSingleExitNode(MF) != nullptr;
+}
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
+ auto Exit = Region->getSucc();
+
+ // If the exit is the end of the function, we just use the existing last block.
+ MachineFunction *MF = Region->getEntry()->getParent();
+ if (Exit == nullptr && hasOneExitNode(*MF)) {
+ return &(*(--(Region->getEntry()->getParent()->end())));
+ }
+
+ MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock();
+ if (Exit == nullptr) {
+ MachineFunction::iterator ExitIter = MF->end();
+ MF->insert(ExitIter, LastMerge);
+ } else {
+ MachineFunction::iterator ExitIter = Exit->getIterator();
+ MF->insert(ExitIter, LastMerge);
+ LastMerge->addSuccessor(Exit);
+ insertUnconditionalBranch(LastMerge, Exit);
+ DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+ }
+ return LastMerge;
+}
+
+void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned DestRegister,
+ unsigned IfSourceRegister,
+ unsigned CodeSourceRegister,
+ bool IsUndefIfSource) {
+ // If this is the function exit block, we don't need a phi.
+ if (MergeBB->succ_begin() == MergeBB->succ_end()) {
+ return;
+ }
+ DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber()
+ << "): " << PrintReg(DestRegister, TRI) << "<def> = PHI("
+ << PrintReg(IfSourceRegister, TRI) << ", BB#"
+ << IfBB->getNumber() << ", " << PrintReg(CodeSourceRegister, TRI)
+ << ", BB#" << CodeBB->getNumber() << ")\n");
+ const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
+ MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestRegister);
+ if (IsUndefIfSource && false) { // Undef if-source path currently disabled.
+ MIB.addReg(IfSourceRegister, RegState::Undef);
+ } else {
+ MIB.addReg(IfSourceRegister);
+ }
+ MIB.addMBB(IfBB);
+ MIB.addReg(CodeSourceRegister);
+ MIB.addMBB(CodeBB);
+}
+
+static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) {
+ // Drop every successor edge except a self-loop. Iterate over a copy,
+ // since removeSuccessor invalidates the successor iterators.
+ SmallVector<MachineBasicBlock *, 4> Succs(MBB->succ_begin(), MBB->succ_end());
+ for (MachineBasicBlock *Succ : Succs) {
+ if (Succ != MBB) {
+ MBB->removeSuccessor(Succ);
+ }
+ }
+}
+
+static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
+ MachineBasicBlock *EndMBB) {
+
+ // We have to check against the StartMBB successor because a
+ // structurized region with a loop will have the entry block split,
+ // and the backedge will go to the entry successor.
+ DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs;
+ unsigned SuccSize = StartMBB->succ_size();
+ if (SuccSize > 0) {
+ MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin());
+ for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(),
+ E = EndMBB->succ_end();
+ PI != E; ++PI) {
+ // Either we have a back-edge to the entry block, or a back-edge to the
+ // successor of the entry block since the block may be split.
+ if ((*PI) != StartMBB &&
+ !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI));
+ }
+ }
+ }
+
+ for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(),
+ E = StartMBB->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != EndMBB) {
+ Succs.insert(
+ std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB));
+ }
+ }
+
+ for (auto SI : Succs) {
+ std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
+ DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#"
+ << Edge.second->getNumber() << "\n");
+ Edge.first->removeSuccessor(Edge.second);
+ }
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart,
+ MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg,
+ bool InheritPreds) {
+ MachineFunction *MF = MergeBB->getParent();
+ MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock();
+
+ if (InheritPreds) {
+ for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(),
+ E = CodeBBStart->pred_end();
+ PI != E; ++PI) {
+ if ((*PI) != CodeBBEnd) {
+ MachineBasicBlock *Pred = (*PI);
+ Pred->addSuccessor(IfBB);
+ }
+ }
+ }
+
+ removeExternalCFGEdges(CodeBBStart, CodeBBEnd);
+
+ auto CodeBBStartI = CodeBBStart->getIterator();
+ auto CodeBBEndI = CodeBBEnd->getIterator();
+ auto MergeIter = MergeBB->getIterator();
+ MF->insert(MergeIter, IfBB);
+ MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI);
+ IfBB->addSuccessor(MergeBB);
+ IfBB->addSuccessor(CodeBBStart);
+
+ DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+ // Ensure that the MergeBB is a successor of the CodeEndBB.
+ if (!CodeBBEnd->isSuccessor(MergeBB))
+ CodeBBEnd->addSuccessor(MergeBB);
+
+ DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#"
+ << CodeBBEnd->getNumber() << "\n");
+
+ // If we have a single predecessor we can find a reasonable debug location
+ MachineBasicBlock *SinglePred =
+ CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr;
+ const DebugLoc &DL = SinglePred
+ ? SinglePred->findDebugLoc(SinglePred->getFirstTerminator())
+ : DebugLoc();
+
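+ // insertEQ compares the incoming block-select value in IfReg against
+ // SelectBB's number; the resulting condition register steers the branch
+ // between MergeBB and the code block. If IfBB is the function entry,
+ // IfReg has no incoming definition, so it is initialized with an
+ // immediate below.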
+ unsigned Reg =
+ TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg,
+ SelectBB->getNumber() /* CodeBBStart->getNumber() */);
+ if (&(*(IfBB->getParent()->begin())) == IfBB) {
+ TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg,
+ CodeBBStart->getNumber());
+ }
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL);
+
+ return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
+ SmallVector<MachineOperand, 1> Cond) {
+ if (Cond.size() != 1)
+ return;
+ if (!Cond[0].isReg())
+ return;
+
+ unsigned CondReg = Cond[0].getReg();
+ for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
+ (*UI).setIsKill(false);
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ unsigned BBSelectReg) {
+ MachineBasicBlock *TrueBB = nullptr;
+ MachineBasicBlock *FalseBB = nullptr;
+ SmallVector<MachineOperand, 1> Cond;
+ MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB];
+ TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond);
+
+ const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator());
+
+ if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) {
+ // This is an exit block, hence no successors. We will assign the
+ // bb select register to the entry block's number.
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg,
+ CodeBB->getParent()->begin()->getNumber());
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+ return;
+ }
+
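+ // Fold the recorded fall-through block into the analyzed branch targets:
+ // with no explicit targets it becomes the true target, and with only a
+ // true target it supplies the missing false target.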
+ if (FalseBB == nullptr && TrueBB == nullptr) {
+ TrueBB = FallthroughBB;
+ } else if (TrueBB != nullptr) {
+ FalseBB =
+ (FallthroughBB && (FallthroughBB != TrueBB)) ? FallthroughBB : FalseBB;
+ }
+
+ if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) {
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, TrueBB->getNumber());
+ } else {
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg);
+ unsigned TrueBBReg = MRI->createVirtualRegister(RegClass);
+ unsigned FalseBBReg = MRI->createVirtualRegister(RegClass);
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ TrueBBReg, TrueBB->getNumber());
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ FalseBBReg, FalseBB->getNumber());
+ ensureCondIsNotKilled(Cond);
+ TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL,
+ BBSelectReg, Cond, TrueBBReg, FalseBBReg);
+ }
+
+ insertUnconditionalBranch(CodeBB, MergeBB, DL);
+}
+
+MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) {
+ if (MRI->def_begin(Reg) == MRI->def_end()) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has NO defs\n");
+ } else if (!MRI->hasOneDef(Reg)) {
+ DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
+ << " has multiple defs\n");
+ DEBUG(dbgs() << "DEFS BEGIN:\n");
+ for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) {
+ DEBUG(DI->getParent()->dump());
+ }
+ DEBUG(dbgs() << "DEFS END\n");
+ }
+
+ assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
+ return (*(MRI->def_begin(Reg))).getParent();
+}
+
+void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ unsigned DestReg,
+ unsigned SourceReg) {
+ // In this function we know we are part of a chain already, so we need
+ // to add the registers to the existing chain, and rename the register
+ // inside the region.
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ MachineInstr *DefInstr = getDefInstr(SourceReg);
+ if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) {
+ // Handle the case where the def is a PHI-def inside a basic
+ // block, then we only need to do renaming. Special care needs to
+ // be taken if the PHI-def is part of an existing chain, or if a
+ // new one needs to be created.
+ InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI);
+
+ // We collect all PHI Information, and if we are at the region entry,
+ // all PHIs will be removed, and then re-introduced if needed.
+ storePHILinearizationInfoDest(DestReg, *DefInstr);
+ // We have picked up all the information we need now and can remove
+ // the PHI
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ DefInstr->eraseFromParent();
+ } else {
+ // If this is not a phi-def, or it is a phi-def but from a linearized region
+ if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) {
+ // If this is a single BB and the definition is in this block we
+ // need to replace any uses outside the region.
+ InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
+ }
+ const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
+ unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+ bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
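+ // If this is the last remaining source for DestReg, the chain ends here:
+ // the if-side input is materialized as a dummy immediate below and the
+ // PHI destination is deleted instead of renamed.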
+ DEBUG(dbgs() << "Insert Chained PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
+ SourceReg, IsLastDef);
+
+ PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+ if (IsLastDef) {
+ const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
+ NextDestReg, 0);
+ PHIInfo.deleteDef(DestReg);
+ } else {
+ PHIInfo.replaceDef(DestReg, NextDestReg);
+ }
+ }
+}
+
+bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB,
+ LinearizedRegion *InnerRegion,
+ unsigned Register) {
+ return getDefInstr(Register)->getParent() == MBB ||
+ InnerRegion->contains(getDefInstr(Register)->getParent());
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
+ MachineBasicBlock *CodeBB,
+ MachineBasicBlock *MergeBB,
+ LinearizedRegion *InnerRegion,
+ LinearizedRegion *LRegion) {
+ DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts();
+ SmallVector<unsigned, 4> OldLiveOuts;
+ bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+ for (auto OLI : *LiveOuts) {
+ OldLiveOuts.push_back(OLI);
+ }
+
+ for (auto LI : OldLiveOuts) {
+ DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+ if (!containsDef(CodeBB, InnerRegion, LI) ||
+ (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
+ // If the register simply lives through the CodeBB, we don't have
+ // to rewrite anything since the register is not defined in this
+ // part of the code.
+ DEBUG(dbgs() << "- through");
+ continue;
+ }
+ DEBUG(dbgs() << "\n");
+ unsigned Reg = LI;
+ if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
+ // If the register is live out, we do want to create a phi,
+ // unless it is from the Exit block, because in that case there
+ // is already a PHI, and no need to create a new one.
+
+ // If the register is just a live out def and not part of a phi
+ // chain, we need to create a PHI node to handle the if region,
+ // and replace all uses outside of the region with the new dest
+ // register, unless it is the outgoing BB select register. We have
+ // already created phi nodes for these.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
+ unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+ // Create initializer, this value is never used, but is needed
+ // to satisfy SSA.
+ DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+ TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
+ IfSourceReg, 0);
+
+ InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
+ DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+ insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
+ IfSourceReg, Reg, true);
+ }
+ }
+
+ // Handle the chained definitions in PHIInfo, checking if this basic block
+ // is a source block for a definition.
+ SmallVector<unsigned, 4> Sources;
+ if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
+ DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber()
+ << "\n");
+ for (auto SI : Sources) {
+ unsigned DestReg;
+ PHIInfo.findDest(SI, CodeBB, DestReg);
+ insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
+ }
+ DEBUG(dbgs() << "Insertion done.\n");
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Before PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+ SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
+ EliminatedSources;
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+
+ unsigned DestReg = *DRI;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ bool MBBContainsPHISource = false;
+ // Check if there is a PHI source in this MBB
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() == MBB) {
+ MBBContainsPHISource = true;
+ }
+ }
+
+ // If so, all other sources are useless since we know this block
+ // is always executed when the region is executed.
+ if (MBBContainsPHISource) {
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ PHILinearize::PHISourceT Source = *SRI;
+ unsigned SourceReg = Source.first;
+ MachineBasicBlock *SourceMBB = Source.second;
+ MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+ if (Def->getParent()->getParent() != MBB) {
+ EliminatedSources.push_back(
+ std::make_tuple(DestReg, SourceReg, SourceMBB));
+ }
+ }
+ }
+ }
+
+ // Remove the PHI sources made redundant by a source in the given MBB.
+ for (auto &SourceInfo : EliminatedSources) {
+ PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
+ std::get<2>(SourceInfo));
+ }
+ DEBUG(dbgs() << "After PHI Prune\n");
+ DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
+ unsigned DestReg) {
+ MachineBasicBlock *Entry = CurrentRegion->getEntry();
+ MachineBasicBlock *Exit = CurrentRegion->getExit();
+
+ DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
+ << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+
+ int NumSources = 0;
+ auto SE = PHIInfo.sources_end(DestReg);
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ NumSources++;
+ }
+
+ if (NumSources == 1) {
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ replaceRegisterWith(DestReg, SourceReg);
+ } else {
+ const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
+ MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), DestReg);
+ DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << "<def> = PHI(");
+
+ unsigned CurrentBackedgeReg = 0;
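+ // Sources defined inside the region form the loop backedge. Multiple
+ // backedge sources are merged pairwise through PHIs placed in their
+ // defining blocks; the final merged register is added as the Exit
+ // input of the entry PHI below.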
+
+ for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+ unsigned SourceReg = (*SRI).first;
+
+ if (CurrentRegion->contains((*SRI).second)) {
+ if (CurrentBackedgeReg == 0) {
+ CurrentBackedgeReg = SourceReg;
+ } else {
+ MachineInstr *PHIDefInstr = getDefInstr(SourceReg);
+ MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
+ const TargetRegisterClass *RegClass =
+ MRI->getRegClass(CurrentBackedgeReg);
+ unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+ MachineInstrBuilder BackedgePHI =
+ BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
+ TII->get(TargetOpcode::PHI), NewBackedgeReg);
+ BackedgePHI.addReg(CurrentBackedgeReg);
+ BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0));
+ BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
+ BackedgePHI.addMBB((*SRI).second);
+ DEBUG(dbgs() << "Inserting backedge PHI: "
+ << PrintReg(NewBackedgeReg, TRI) << "<def> = PHI("
+ << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+ << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", "
+ << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
+ << ", BB#" << (*SRI).second->getNumber() << ")\n");
+ CurrentBackedgeReg = NewBackedgeReg;
+ }
+ } else {
+ MIB.addReg(SourceReg);
+ MIB.addMBB((*SRI).second);
+ DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+ << (*SRI).second->getNumber() << ", ");
+ }
+ }
+
+ // Add the final backedge register source to the entry phi
+ if (CurrentBackedgeReg != 0) {
+ MIB.addReg(CurrentBackedgeReg);
+ MIB.addMBB(Exit);
+ DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+ << Exit->getNumber() << ")\n");
+ } else {
+ DEBUG(dbgs() << ")\n");
+ }
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
+ DEBUG(PHIInfo.dump(MRI));
+
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+
+ unsigned DestReg = *DRI;
+ createEntryPHI(CurrentRegion, DestReg);
+ }
+ PHIInfo.clear();
+}
+
+void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
+ unsigned NewRegister) {
+ assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+ E = MRI->reg_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+ if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ DEBUG(dbgs() << "Trying to substitute physical register: "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ llvm_unreachable("Cannot substitute physical registers");
+ // We don't handle physical registers, but if we need to
+ // in the future, this is how we would do it:
+ // O.substPhysReg(NewRegister, *TRI);
+ } else {
+ DEBUG(dbgs() << "Replacing register: "
+ << PrintReg(Register, MRI->getTargetRegisterInfo())
+ << " with "
+ << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+ << "\n");
+ O.setReg(NewRegister);
+ }
+ }
+ PHIInfo.deleteDef(Register);
+
+ getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+
+ DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
+ DEBUG(dbgs() << "Resolve PHI Infos\n");
+ DEBUG(PHIInfo.dump(MRI));
+ for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+ ++DRI) {
+ unsigned DestReg = *DRI;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n");
+ auto SRI = PHIInfo.sources_begin(DestReg);
+ unsigned SourceReg = (*SRI).first;
+ DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI)
+ << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n");
+
+ assert(PHIInfo.sources_end(DestReg) == ++SRI &&
+ "More than one phi source in entry node");
+ replaceRegisterWith(DestReg, SourceReg);
+ }
+}
+
+static bool isFunctionEntryBlock(MachineBasicBlock *MBB) {
+ return ((&(*(MBB->getParent()->begin()))) == MBB);
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB,
+ LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn,
+ unsigned BBSelectRegOut) {
+ if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) {
+ // Handle non-loop function entry block.
+ // We need to allow loops to the entry block and then
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ resolvePHIInfos(CodeBB);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ }
+ if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) {
+ // Handle non-loop region entry block.
+ MachineFunction *MF = MergeBB->getParent();
+ auto MergeIter = MergeBB->getIterator();
+ auto CodeBBStartIter = CodeBB->getIterator();
+ auto CodeBBEndIter = ++(CodeBB->getIterator());
+ if (CodeBBEndIter != MergeIter) {
+ MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter);
+ }
+ rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut);
+ prunePHIInfo(CodeBB);
+ createEntryPHIs(CurrentRegion);
+ removeExternalCFGSuccessors(CodeBB);
+ CodeBB->addSuccessor(MergeBB);
+ CurrentRegion->addMBB(CodeBB);
+ return nullptr;
+ } else {
+ // Handle internal block.
+ const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn);
+ unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
+ rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg);
+ bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB;
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB,
+ BBSelectRegIn, IsRegionEntryBB);
+ CurrentRegion->addMBB(IfBB);
+ // If this is the entry block we need to make the If block the new
+ // linearized region entry.
+ if (IsRegionEntryBB) {
+ CurrentRegion->setEntry(IfBB);
+
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg = TII->insertNE(
+ RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp =
+ MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(CurrentRegion->getEntry());
+ }
+ }
+ CurrentRegion->addMBB(CodeBB);
+ LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo);
+
+ InnerRegion.setParent(CurrentRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (BB)\n");
+ insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+ InnerRegion.addMBB(MergeBB);
+
+ DEBUG(InnerRegion.print(dbgs(), TRI));
+ rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion);
+ extractKilledPHIs(CodeBB);
+ if (IsRegionEntryBB) {
+ createEntryPHIs(CurrentRegion);
+ }
+ return IfBB;
+ }
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+ MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion,
+ LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB,
+ unsigned BBSelectRegIn, unsigned BBSelectRegOut) {
+ unsigned CodeBBSelectReg =
+ InnerRegion->getRegionMRT()->getInnerOutputRegister();
+ MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry();
+ MachineBasicBlock *CodeExitBB = InnerRegion->getExit();
+ MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB,
+ SelectBB, BBSelectRegIn, true);
+ CurrentRegion->addMBB(IfBB);
+ bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry();
+ if (isEntry) {
+
+ if (CurrentRegion->getHasLoop()) {
+ MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+ MachineBasicBlock *ETrueBB = nullptr;
+ MachineBasicBlock *EFalseBB = nullptr;
+ SmallVector<MachineOperand, 1> ECond;
+
+ const DebugLoc &DL = DebugLoc();
+ TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+ TII->removeBranch(*RegionExit);
+
+ // We need to create a backedge if there is a loop
+ unsigned Reg =
+ TII->insertNE(RegionExit, RegionExit->instr_end(), DL,
+ CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+ CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+ MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+ ArrayRef<MachineOperand> Cond(RegOp);
+ DEBUG(dbgs() << "RegionExitReg: ");
+ DEBUG(Cond[0].print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+ TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+ Cond, DebugLoc());
+ RegionExit->addSuccessor(IfBB);
+ }
+ }
+ CurrentRegion->addMBBs(InnerRegion);
+ DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+ insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+ CodeBBSelectReg);
+
+ rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion,
+ CurrentRegion);
+
+ rewriteRegionEntryPHIs(InnerRegion, IfBB);
+
+ if (isEntry) {
+ CurrentRegion->setEntry(IfBB);
+ createEntryPHIs(CurrentRegion);
+ }
+
+ return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
+ MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<unsigned, 2> PHIRegionIndices;
+ getPHIRegionIndices(LRegion, PHI, PHIRegionIndices);
+
+ assert(PHIRegionIndices.size() == 1);
+
+ unsigned RegionIndex = PHIRegionIndices[0];
+ unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex);
+ MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex);
+ unsigned PHIDest = getPHIDestReg(PHI);
+ unsigned PHISource = PHIDest;
+ unsigned ReplaceReg;
+
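+ // If shrinking leaves a single external input, that value replaces the
+ // PHI and becomes the entry-side source of the new PHI; otherwise the
+ // original PHI destination is kept as the entry-side source.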
+ if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) {
+ PHISource = ReplaceReg;
+ }
+
+ const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
+ unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+ LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
+ MachineInstrBuilder MIB =
+ BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
+ TII->get(TargetOpcode::PHI), NewDestReg);
+ DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI)
+ << "<def> = PHI(");
+ MIB.addReg(PHISource);
+ MIB.addMBB(Entry);
+ DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber());
+ MIB.addReg(RegionSourceReg);
+ MIB.addMBB(RegionSourceMBB);
+ DEBUG(dbgs() << " ," << PrintReg(RegionSourceReg, TRI) << ", BB#"
+ << RegionSourceMBB->getNumber() << ")\n");
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
+ MachineBasicBlock *EntrySucc,
+ LinearizedRegion *LRegion) {
+ SmallVector<MachineInstr *, 2> PHIs;
+ collectPHIs(Entry, PHIs);
+
+ for (auto PHII : PHIs) {
+ splitLoopPHI(*PHII, Entry, EntrySucc, LRegion);
+ }
+}
+
+// Split the exit block so that we can insert an end control flow
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
+ auto MRTRegion = LRegion->getRegionMRT();
+ auto Exit = LRegion->getExit();
+ auto MF = Exit->getParent();
+ auto Succ = MRTRegion->getSucc();
+
+ auto NewExit = MF->CreateMachineBasicBlock();
+ auto AfterExitIter = Exit->getIterator();
+ AfterExitIter++;
+ MF->insert(AfterExitIter, NewExit);
+ Exit->removeSuccessor(Succ);
+ Exit->addSuccessor(NewExit);
+ NewExit->addSuccessor(Succ);
+ insertUnconditionalBranch(NewExit, Succ);
+ LRegion->addMBB(NewExit);
+ LRegion->setExit(NewExit);
+
+ DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+
+ // Replace any PHI Predecessors in the successor with NewExit
+ for (auto &II : *Succ) {
+ MachineInstr &Instr = II;
+
+ // If we are past the PHI instructions we are done
+ if (!Instr.isPHI())
+ break;
+
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ auto Pred = getPHIPred(Instr, i);
+ if (Pred == Exit) {
+ setPhiPred(Instr, i, NewExit);
+ }
+ }
+ }
+
+ return NewExit;
+}
+
+static MachineBasicBlock *split(MachineBasicBlock::iterator I) {
+ // Create the fall-through block.
+ MachineBasicBlock *MBB = (*I).getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+ auto MBBIter = ++(MBB->getIterator());
+ MF->insert(MBBIter, SuccMBB);
+ SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(SuccMBB);
+
+ // Splice the code over.
+ SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end());
+
+ return SuccMBB;
+}
+
+// Split the entry block, separating the PHI nodes from the rest of the
+// code. This is needed to insert an initializer for the bb select register
+// in loop regions.
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
+ MachineBasicBlock *Entry = LRegion->getEntry();
+ MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
+ MachineBasicBlock *Exit = LRegion->getExit();
+
+ DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#"
+ << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber()
+ << "\n");
+ LRegion->addMBB(EntrySucc);
+
+ // Make the backedge go to Entry Succ
+ if (Exit->isSuccessor(Entry)) {
+ Exit->removeSuccessor(Entry);
+ }
+ Exit->addSuccessor(EntrySucc);
+ MachineInstr &Branch = *(Exit->instr_rbegin());
+ for (auto &UI : Branch.uses()) {
+ if (UI.isMBB() && UI.getMBB() == Entry) {
+ UI.setMBB(EntrySucc);
+ }
+ }
+
+ splitLoopPHIs(Entry, EntrySucc, LRegion);
+
+ return EntrySucc;
+}
+
+LinearizedRegion *
+AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) {
+ LinearizedRegion *LRegion = Region->getLinearizedRegion();
+ LRegion->initLiveOut(Region, MRI, TRI, PHIInfo);
+ LRegion->setEntry(Region->getEntry());
+ return LRegion;
+}
+
+static void removeOldExitPreds(RegionMRT *Region) {
+ MachineBasicBlock *Exit = Region->getSucc();
+ if (Exit == nullptr) {
+ return;
+ }
+ // Iterate over a copy of the predecessor list, since removing the
+ // successor edge invalidates Exit's predecessor iterators.
+ SmallVector<MachineBasicBlock *, 4> Preds(Exit->pred_begin(), Exit->pred_end());
+ for (MachineBasicBlock *Pred : Preds) {
+ if (Region->contains(Pred)) {
+ Pred->removeSuccessor(Exit);
+ }
+ }
+}
+
+static bool mbbHasBackEdge(MachineBasicBlock *MBB,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+ if (MBBs.count(*SI) != 0) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(MRT *Tree,
+ SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+ // Need to traverse this in reverse since it is in post order.
+ if (Tree == nullptr)
+ return false;
+
+ if (Tree->isMBB()) {
+ MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB();
+ MBBs.insert(MBB);
+ if (mbbHasBackEdge(MBB, MBBs)) {
+ return true;
+ }
+ } else {
+ RegionMRT *Region = Tree->getRegionMRT();
+ SetVector<MRT *> *Children = Region->getChildren();
+ for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) {
+ if (containsNewBackedge(*CI, MBBs))
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool containsNewBackedge(RegionMRT *Region) {
+ SmallPtrSet<MachineBasicBlock *, 8> MBBs;
+ return containsNewBackedge(Region, MBBs);
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
+ auto *LRegion = initLinearizedRegion(Region);
+ LRegion->setHasLoop(containsNewBackedge(Region));
+ MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region);
+ MachineBasicBlock *CurrentMerge = LastMerge;
+ LRegion->addMBB(LastMerge);
+ LRegion->setExit(LastMerge);
+
+ rewriteRegionExitPHIs(Region, LastMerge, LRegion);
+ removeOldExitPreds(Region);
+
+ DEBUG(PHIInfo.dump(MRI));
+
+ SetVector<MRT *> *Children = Region->getChildren();
+ DEBUG(dbgs() << "===========If Region Start===============\n");
+ if (LRegion->getHasLoop()) {
+ DEBUG(dbgs() << "Has Backedge: Yes\n");
+ } else {
+ DEBUG(dbgs() << "Has Backedge: No\n");
+ }
+
+ unsigned BBSelectRegIn;
+ unsigned BBSelectRegOut;
+ for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
+ DEBUG(dbgs() << "CurrentRegion: \n");
+ DEBUG(LRegion->print(dbgs(), TRI));
+
+ auto CNI = CI;
+ ++CNI;
+
+ MRT *Child = (*CI);
+
+ if (Child->isRegion()) {
+
+ LinearizedRegion *InnerLRegion =
+ Child->getRegionMRT()->getLinearizedRegion();
+ // We found that this block is the exit of an inner region, so we
+ // need to put it in the current linearized region.
+
+ DEBUG(dbgs() << "Linearizing region: ");
+ DEBUG(InnerLRegion->print(dbgs(), TRI));
+ DEBUG(dbgs() << "\n");
+
+ MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
+ if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
+ // Entry has already been linearized; no need to process this region.
+ unsigned OuterSelect = InnerLRegion->getBBSelectRegOut();
+ unsigned InnerSelectReg =
+ InnerLRegion->getRegionMRT()->getInnerOutputRegister();
+ replaceRegisterWith(InnerSelectReg, OuterSelect);
+ resolvePHIInfos(InnerEntry);
+ if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge))
+ InnerLRegion->getExit()->addSuccessor(CurrentMerge);
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+ CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
+ Child->getRegionMRT()->getEntry(),
+ BBSelectRegIn, BBSelectRegOut);
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ } else {
+ MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
+ DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+
+ if (MBB == getSingleExitNode(*(MBB->getParent()))) {
+ // If this is the exit block, skip it; the "in" register will be
+ // transferred to "out" in the next iteration.
+ continue;
+ }
+
+ BBSelectRegOut = Child->getBBSelectRegOut();
+ BBSelectRegIn = Child->getBBSelectRegIn();
+
+ DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+ << "\n");
+ DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+ << "\n");
+
+ MachineBasicBlock *IfEnd = CurrentMerge;
+ // This is a basic block that is not part of an inner region, we
+ // need to put it in the current linearized region.
+ CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn,
+ BBSelectRegOut);
+ if (CurrentMerge) {
+ TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+ }
+
+ DEBUG(PHIInfo.dump(MRI));
+ }
+ }
+
+ LRegion->removeFalseRegisterKills(MRI);
+
+ if (LRegion->getHasLoop()) {
+ MachineBasicBlock *NewSucc = splitEntry(LRegion);
+ if (isFunctionEntryBlock(LRegion->getEntry())) {
+ resolvePHIInfos(LRegion->getEntry());
+ }
+ const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI());
+ unsigned InReg = LRegion->getBBSelectRegIn();
+ unsigned InnerSelectReg =
+ MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ TII->materializeImmediate(*(LRegion->getEntry()),
+ LRegion->getEntry()->getFirstTerminator(), DL,
+ NewInReg, Region->getEntry()->getNumber());
+ // Need to be careful about updating the registers inside the region.
+ LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
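+ // Roughly, the merge PHI inserted below has the shape:
+ //   EntrySucc: InnerSelectReg = PHI [ NewInReg, Entry ],
+ //                                   [ InnerOutputRegister, Exit ]
+ // so the select register is initialized on entry and carried around
+ // the backedge on each iteration.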
+ DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
+ insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc,
+ InnerSelectReg, NewInReg,
+ LRegion->getRegionMRT()->getInnerOutputRegister());
+ splitExit(LRegion);
+ TII->convertNonUniformLoopRegion(NewSucc, LastMerge);
+ }
+
+ if (Region->isRoot()) {
+ TII->insertReturn(*LastMerge);
+ }
+
+ DEBUG(Region->getEntry()->getParent()->dump());
+ DEBUG(LRegion->print(dbgs(), TRI));
+ DEBUG(PHIInfo.dump(MRI));
+
+ DEBUG(dbgs() << "===========If Region End===============\n");
+
+ Region->setLinearizedRegion(LRegion);
+ return true;
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) {
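+ // Note: the simple-if transformation below is currently disabled by the
+ // 'false &&' guard; only the sequence and complex-region paths are live.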
+ if (false && regionIsSimpleIf(Region)) {
+ transformSimpleIfRegion(Region);
+ return true;
+ } else if (regionIsSequence(Region)) {
+ fixupRegionExits(Region);
+ return false;
+ } else {
+ structurizeComplexRegion(Region);
+ }
+ return false;
+}
+
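+// Debugging aid: 'structurize_once' counts structurized regions, but the
+// '|| true' in the guard below defeats the limit, so every region is
+// currently processed.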
+static int structurize_once = 0;
+
+bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region,
+ bool isTopRegion) {
+ bool Changed = false;
+
+ auto Children = Region->getChildren();
+ for (auto CI : *Children) {
+ if (CI->isRegion()) {
+ Changed |= structurizeRegions(CI->getRegionMRT(), false);
+ }
+ }
+
+ if (structurize_once < 2 || true) {
+ Changed |= structurizeRegion(Region);
+ structurize_once++;
+ }
+ return Changed;
+}
+
+void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) {
+ DEBUG(dbgs() << "Fallthrough Map:\n");
+ for (auto &MBBI : MF) {
+ MachineBasicBlock *MBB = MBBI.getFallThrough();
+ if (MBB != nullptr) {
+ DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> "
+ << MBB->getNumber() << "\n");
+ }
+ FallthroughMap[&MBBI] = MBB;
+ }
+}
+
+void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region,
+ unsigned SelectOut) {
+ LinearizedRegion *LRegion = new LinearizedRegion();
+ if (SelectOut) {
+ LRegion->addLiveOut(SelectOut);
+ DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI)
+ << "\n");
+ }
+ LRegion->setRegionMRT(Region);
+ Region->setLinearizedRegion(LRegion);
+ LRegion->setParent(Region->getParent()
+ ? Region->getParent()->getLinearizedRegion()
+ : nullptr);
+}
+
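+// Walk the MRT assigning bb-select registers: each region creates a fresh
+// inner select register and threads it through its children, so (roughly)
+// one child's "in" register serves as the "out" register of the child
+// visited after it.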
+unsigned
+AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut,
+ MachineRegisterInfo *MRI,
+ const SIInstrInfo *TII) {
+ if (MRT->isRegion()) {
+ RegionMRT *Region = MRT->getRegionMRT();
+ Region->setBBSelectRegOut(SelectOut);
+ unsigned InnerSelectOut = createBBSelectReg(TII, MRI);
+
+ // FIXME: Move linearized-region creation to the original spot.
+ createLinearizedRegion(Region, SelectOut);
+
+ for (auto CI = Region->getChildren()->begin(),
+ CE = Region->getChildren()->end();
+ CI != CE; ++CI) {
+ InnerSelectOut =
+ initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII);
+ }
+ MRT->setBBSelectRegIn(InnerSelectOut);
+ return InnerSelectOut;
+ } else {
+ MRT->setBBSelectRegOut(SelectOut);
+ unsigned NewSelectIn = createBBSelectReg(TII, MRI);
+ MRT->setBBSelectRegIn(NewSelectIn);
+ return NewSelectIn;
+ }
+}
+
+static void checkRegOnlyPHIInputs(MachineFunction &MF) {
+ for (auto &MBBI : MF) {
+ for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(),
+ E = MBBI.instr_end();
+ I != E; ++I) {
+ MachineInstr &Instr = *I;
+ if (Instr.isPHI()) {
+ int numPreds = getPHINumInputs(Instr);
+ for (int i = 0; i < numPreds; ++i) {
+ assert(Instr.getOperand(i * 2 + 1).isReg() &&
+ "PHI Operand not a register");
+ }
+ }
+ }
+ }
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass)
+INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer",
+ "AMDGPU Machine CFG Structurizer", false, false)
+
+char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID;
+
+bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MRI = &(MF.getRegInfo());
+ initFallthroughMap(MF);
+
+ checkRegOnlyPHIInputs(MF);
+ DEBUG(dbgs() << "----STRUCTURIZER START----\n");
+ DEBUG(MF.dump());
+
+ Regions = &(getAnalysis<MachineRegionInfoPass>().getRegionInfo());
+ DEBUG(Regions->dump());
+
+ RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI);
+ setRegionMRT(RTree);
+ initializeSelectRegisters(RTree, 0, MRI, TII);
+ DEBUG(RTree->dump(TRI));
+ bool result = structurizeRegions(RTree, true);
+ delete RTree;
+ DEBUG(dbgs() << "----STRUCTURIZER END----\n");
+ initFallthroughMap(MF);
+ return result;
+}
+
+FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() {
+ return new AMDGPUMachineCFGStructurizer();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 40c3327..9fb7f5f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -19,8 +19,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
- IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
- MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) {
+ IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 5d0640b..99bb61b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
@@ -30,7 +30,11 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
/// Start of implicit kernel args
unsigned ABIArgOffset;
- bool IsKernel;
+ // Kernels + shaders, i.e. functions called by the driver and not
+ // called by other functions.
+ bool IsEntryFunction;
+
+ bool NoSignedZerosFPMath;
public:
AMDGPUMachineFunction(const MachineFunction &MF);
@@ -66,8 +70,12 @@ public:
return LDSSize;
}
- bool isKernel() const {
- return IsKernel;
+ bool isEntryFunction() const {
+ return IsEntryFunction;
+ }
+
+ bool hasNoSignedZerosFPMath() const {
+ return NoSignedZerosFPMath;
}
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
new file mode 100644
index 0000000..7263ba7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -0,0 +1,64 @@
+//===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMacroFusion.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+
+#include "llvm/CodeGen/MacroFusion.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Check if the instruction pair, FirstMI and SecondMI, should be
+/// fused together. If FirstMI is unspecified, check whether SecondMI may
+/// be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const SIInstrInfo &TII = static_cast<const SIInstrInfo&>(TII_);
+
+ switch (SecondMI.getOpcode()) {
+ case AMDGPU::V_ADDC_U32_e64:
+ case AMDGPU::V_SUBB_U32_e64:
+ case AMDGPU::V_CNDMASK_B32_e64: {
+ // Try to cluster defs of condition registers to their uses. This improves
+ // the chance VCC will be available which will allow shrinking to VOP2
+ // encodings.
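+ // For example, a VALU compare that defines VCC scheduled back-to-back
+ // with the V_ADDC/V_SUBB/V_CNDMASK that reads it keeps VCC's live
+ // range short, so VCC is more likely to be free where the shrink pass
+ // needs it.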
+ if (!FirstMI)
+ return true;
+
+ const MachineOperand *Src2 = TII.getNamedOperand(SecondMI,
+ AMDGPU::OpName::src2);
+ return FirstMI->definesRegister(Src2->getReg());
+ }
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+} // end anonymous namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation() {
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
new file mode 100644
index 0000000..8449585
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.h
@@ -0,0 +1,19 @@
+//===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createAMDGPUMacroFusionDAGMutation());
+/// to AMDGPUPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation();
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
index 947d45b..71b9ab6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -19,12 +19,13 @@
namespace AMDGPU {
-namespace PT_NOTE {
+namespace ElfNote {
const char SectionName[] = ".note";
const char NoteName[] = "AMD";
+// TODO: Move this enum to include/llvm/Support so it can be used in tools?
enum NoteType{
NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
NT_AMDGPU_HSA_HSAIL = 2,
@@ -32,7 +33,7 @@ enum NoteType{
NT_AMDGPU_HSA_PRODUCER = 4,
NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
NT_AMDGPU_HSA_EXTENSION = 6,
- NT_AMDGPU_HSA_RUNTIME_METADATA = 7,
+ NT_AMDGPU_HSA_CODE_OBJECT_METADATA = 10,
NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
};
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index baa28de..625c9b7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -14,12 +14,50 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <tuple>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "amdgpu-promote-alloca"
@@ -31,16 +69,16 @@ namespace {
class AMDGPUPromoteAlloca : public FunctionPass {
private:
const TargetMachine *TM;
- Module *Mod;
- const DataLayout *DL;
- MDNode *MaxWorkGroupSizeRange;
+ Module *Mod = nullptr;
+ const DataLayout *DL = nullptr;
+ AMDGPUAS AS;
// FIXME: This should be per-kernel.
- uint32_t LocalMemLimit;
- uint32_t CurrentLocalMemUsage;
+ uint32_t LocalMemLimit = 0;
+ uint32_t CurrentLocalMemUsage = 0;
- bool IsAMDGCN;
- bool IsAMDHSA;
+ bool IsAMDGCN = false;
+ bool IsAMDHSA = false;
std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
@@ -59,26 +97,20 @@ private:
Instruction *UseInst,
int OpIdx0, int OpIdx1) const;
+ /// Check whether we have enough local memory for promotion.
+ bool hasSufficientLocalMem(const Function &F);
+
public:
static char ID;
- AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
- FunctionPass(ID),
- TM(TM_),
- Mod(nullptr),
- DL(nullptr),
- MaxWorkGroupSizeRange(nullptr),
- LocalMemLimit(0),
- CurrentLocalMemUsage(0),
- IsAMDGCN(false),
- IsAMDHSA(false) { }
+ AMDGPUPromoteAlloca() : FunctionPass(ID) {}
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
- void handleAlloca(AllocaInst &I);
+ bool handleAlloca(AllocaInst &I, bool SufficientLDS);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -86,146 +118,60 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
-INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
- "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
+ "AMDGPU promote alloca to vector or LDS", false, false)
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
-
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
- if (!TM)
- return false;
-
Mod = &M;
DL = &Mod->getDataLayout();
- // The maximum workitem id.
- //
- // FIXME: Should get as subtarget property. Usually runtime enforced max is
- // 256.
- MDBuilder MDB(Mod->getContext());
- MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
-
- const Triple &TT = TM->getTargetTriple();
-
- IsAMDGCN = TT.getArch() == Triple::amdgcn;
- IsAMDHSA = TT.getOS() == Triple::AMDHSA;
-
return false;
}
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
- if (!TM || skipFunction(F))
+ if (skipFunction(F))
return false;
- const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
- if (!ST.isPromoteAllocaEnabled())
- return false;
-
- FunctionType *FTy = F.getFunctionType();
-
- // If the function has any arguments in the local address space, then it's
- // possible these arguments require the entire local memory space, so
- // we cannot use local memory in the pass.
- for (Type *ParamTy : FTy->params()) {
- PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
- if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- LocalMemLimit = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
- "local memory disabled.\n");
- return false;
- }
- }
-
- LocalMemLimit = ST.getLocalMemorySize();
- if (LocalMemLimit == 0)
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ TM = &TPC->getTM<TargetMachine>();
+ else
return false;
- const DataLayout &DL = Mod->getDataLayout();
-
- // Check how much local memory is being used by global objects
- CurrentLocalMemUsage = 0;
- for (GlobalVariable &GV : Mod->globals()) {
- if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
- continue;
-
- for (const User *U : GV.users()) {
- const Instruction *Use = dyn_cast<Instruction>(U);
- if (!Use)
- continue;
-
- if (Use->getParent()->getParent() == &F) {
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
-
- // FIXME: Try to account for padding here. The padding is currently
- // determined from the inverse order of uses in the function. I'm not
- // sure if the use list order is in any way connected to this, so the
- // total reported size is likely incorrect.
- uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
- CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
- CurrentLocalMemUsage += AllocSize;
- break;
- }
- }
- }
-
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
-
- // Restrict local memory usage so that we don't drastically reduce occupancy,
- // unless it is already significantly reduced.
-
- // TODO: Have some sort of hint or other heuristics to guess occupancy based
- // on other factors..
- unsigned OccupancyHint = ST.getWavesPerEU(F).second;
- if (OccupancyHint == 0)
- OccupancyHint = 7;
-
- // Clamp to max value.
- OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
- // Check the hint but ignore it if it's obviously wrong from the existing LDS
- // usage.
- MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
- // Round up to the next tier of usage.
- unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+ const Triple &TT = TM->getTargetTriple();
+ IsAMDGCN = TT.getArch() == Triple::amdgcn;
+ IsAMDHSA = TT.getOS() == Triple::AMDHSA;
- // Program is possibly broken by using more local mem than available.
- if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+ if (!ST.isPromoteAllocaEnabled())
return false;
- LocalMemLimit = MaxSizeWithWaveCount;
-
- DEBUG(
- dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
- << " Rounding size to " << MaxSizeWithWaveCount
- << " with a maximum occupancy of " << MaxOccupancy << '\n'
- << " and " << (LocalMemLimit - CurrentLocalMemUsage)
- << " available for promotion\n"
- );
+ AS = AMDGPU::getAMDGPUAS(*F.getParent());
+ bool SufficientLDS = hasSufficientLocalMem(F);
+ bool Changed = false;
BasicBlock &EntryBB = *F.begin();
for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
AllocaInst *AI = dyn_cast<AllocaInst>(I);
++I;
if (AI)
- handleAlloca(*AI);
+ Changed |= handleAlloca(*AI, SufficientLDS);
}
- return true;
+ return Changed;
}
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+ *Builder.GetInsertBlock()->getParent());
+
if (!IsAMDHSA) {
Function *LocalSizeYFn
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
@@ -235,8 +181,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
- LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
- LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(LocalSizeY);
+ ST.makeLIDRangeMetadata(LocalSizeZ);
return std::make_pair(LocalSizeY, LocalSizeZ);
}
@@ -279,15 +225,15 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
= Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
- DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
- DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
// Size of the dispatch packet struct.
- DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
+ DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
Value *CastDispatchPtr = Builder.CreateBitCast(
- DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
+ DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
// We could do a single 64-bit load here, but it's likely that the basic
// 32-bit and extract sequence is already present, and it is probably easier
@@ -298,10 +244,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
- MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
+ MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
- LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(LoadZU);
// Extract y component. Upper half of LoadZU should be zero already.
Value *Y = Builder.CreateLShr(LoadXY, 16);
@@ -310,6 +256,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
}
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+ *Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
switch (N) {
@@ -332,7 +280,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
CallInst *CI = Builder.CreateCall(WorkitemIdFn);
- CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ ST.makeLIDRangeMetadata(CI);
return CI;
}
@@ -369,29 +317,37 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
switch (Inst->getOpcode()) {
- case Instruction::Load:
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(Inst);
+ // Currently we only handle the case where the pointer operand is a
+ // GEP, so check for that.
+ return isa<GetElementPtrInst>(LI->getPointerOperand()) &&
+        !LI->isVolatile();
+ }
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
- // Must be the stored pointer operand, not a stored value.
+ // Must be the stored pointer operand, not a stored value; and since
+ // the IR should be in canonical form, the user should be a GEP.
StoreInst *SI = cast<StoreInst>(Inst);
- return SI->getPointerOperand() == User;
+ return (SI->getPointerOperand() == User) &&
+        isa<GetElementPtrInst>(User) && !SI->isVolatile();
}
default:
return false;
}
}
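+// Illustrative IR shape this promotion targets (a small static array
+// alloca whose loads/stores go through GEPs):
+//   %a = alloca [4 x i32]
+//   %p = getelementptr [4 x i32], [4 x i32]* %a, i32 0, i32 %i
+//   store i32 %v, i32* %p
+// which can then be rewritten as insert/extractelement on a <4 x i32>.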
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
DEBUG(dbgs() << "Alloca candidate for vectorization\n");
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
+ // FIXME: We also reject allocas of the form [2 x [2 x i32]] or
+ // equivalent. Potentially these could also be promoted, but we don't
+ // currently handle this case.
if (!AllocaTy ||
AllocaTy->getElementType()->isVectorTy() ||
+ AllocaTy->getElementType()->isArrayTy() ||
AllocaTy->getNumElements() > 4 ||
AllocaTy->getNumElements() < 2) {
DEBUG(dbgs() << " Cannot convert type to vector\n");
@@ -438,8 +394,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
- Value *Ptr = Inst->getOperand(0);
+ Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
+ Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
@@ -450,14 +406,15 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
break;
}
case Instruction::Store: {
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+ Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
- Value *Ptr = Inst->getOperand(1);
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
- Inst->getOperand(0),
+ SI->getValueOperand(),
Index);
Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();
@@ -580,6 +537,9 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
}
if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
+ // Give up if the pointer may be captured.
+ if (PointerMayBeCaptured(UseInst, true, true))
+ return false;
// Don't collect the users of this.
WorkList.push_back(User);
continue;
@@ -626,12 +586,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
return true;
}
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+ FunctionType *FTy = F.getFunctionType();
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+ // If the function has any arguments in the local address space, then it's
+ // possible these arguments require the entire local memory space, so
+ // we cannot use local memory in the pass.
+ for (Type *ParamTy : FTy->params()) {
+ PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+ if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+ LocalMemLimit = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
+ return false;
+ }
+ }
+
+ LocalMemLimit = ST.getLocalMemorySize();
+ if (LocalMemLimit == 0)
+ return false;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // Check how much local memory is being used by global objects
+ CurrentLocalMemUsage = 0;
+ for (GlobalVariable &GV : Mod->globals()) {
+ if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+ continue;
+
+ for (const User *U : GV.users()) {
+ const Instruction *Use = dyn_cast<Instruction>(U);
+ if (!Use)
+ continue;
+
+ if (Use->getParent()->getParent() == &F) {
+ unsigned Align = GV.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV.getValueType());
+
+ // FIXME: Try to account for padding here. The padding is currently
+ // determined from the inverse order of uses in the function. I'm not
+ // sure if the use list order is in any way connected to this, so the
+ // total reported size is likely incorrect.
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage += AllocSize;
+ break;
+ }
+ }
+ }
+
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+ F);
+
+ // Restrict local memory usage so that we don't drastically reduce occupancy,
+ // unless it is already significantly reduced.
+
+ // TODO: Have some sort of hint or other heuristics to guess occupancy based
+ // on other factors..
+ unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+ if (OccupancyHint == 0)
+ OccupancyHint = 7;
+
+ // Clamp to max value.
+ OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+ // Check the hint but ignore it if it's obviously wrong from the existing LDS
+ // usage.
+ MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+ // Round up to the next tier of usage.
+ unsigned MaxSizeWithWaveCount
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
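+ // Illustrative numbers: if globals already use 20 KB of LDS and the
+ // usage tier for the clamped occupancy allows 32 KB, promotion has
+ // roughly 12 KB (LocalMemLimit - CurrentLocalMemUsage) left to spend.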
+
+ // Program is possibly broken by using more local mem than available.
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+ return false;
+
+ LocalMemLimit = MaxSizeWithWaveCount;
+
+ DEBUG(
+ dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n"
+ );
+
+ return true;
+}
+
// FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// Array allocations are probably not worth handling, since an allocation of
// the array type is the canonical form.
if (!I.isStaticAlloca() || I.isArrayAllocation())
- return;
+ return false;
IRBuilder<> Builder(&I);
@@ -640,23 +693,30 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I)) {
- DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
- return;
- }
+ if (tryPromoteAllocaToVector(&I, AS))
+ return true; // Promoted to vector.
const Function &ContainingFunction = *I.getParent()->getParent();
+ CallingConv::ID CC = ContainingFunction.getCallingConv();
// Don't promote the alloca to LDS for shader calling conventions as the work
// item ID intrinsics are not supported for these calling conventions.
// Furthermore not all LDS is available for some of the stages.
- if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
- return;
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ break;
+ default:
+ DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
+ return false;
+ }
+
+ // Not likely to have sufficient local memory for promotion.
+ if (!SufficientLDS)
+ return false;
const AMDGPUSubtarget &ST =
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
- // FIXME: We should also try to get this value from the reqd_work_group_size
- // function attribute if it is available.
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();
@@ -678,7 +738,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (NewSize > LocalMemLimit) {
DEBUG(dbgs() << " " << AllocSize
<< " bytes of local memory not available to promote\n");
- return;
+ return false;
}
CurrentLocalMemUsage = NewSize;
@@ -687,7 +747,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
- return;
+ return false;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -701,7 +761,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
Twine(F->getName()) + Twine('.') + I.getName(),
nullptr,
GlobalVariable::NotThreadLocal,
- AMDGPUAS::LOCAL_ADDRESS);
+ AS.LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
GV->setAlignment(I.getAlignment());
@@ -734,7 +794,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
Value *Src0 = CI->getOperand(0);
Type *EltTy = Src0->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
if (isa<ConstantPointerNull>(CI->getOperand(0)))
CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -751,7 +811,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
continue;
Type *EltTy = V->getType()->getPointerElementType();
- PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
@@ -819,22 +879,23 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
Type *SrcTy = Src->getType()->getPointerElementType();
Function *ObjectSize = Intrinsic::getDeclaration(Mod,
Intrinsic::objectsize,
- { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+ { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
);
- CallInst *NewCall
- = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+ CallInst *NewCall = Builder.CreateCall(
+ ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
Intr->replaceAllUsesWith(NewCall);
Intr->eraseFromParent();
continue;
}
default:
- Intr->dump();
+ Intr->print(errs());
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
+ return true;
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
- return new AMDGPUPromoteAlloca(TM);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+ return new AMDGPUPromoteAlloca();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
new file mode 100644
index 0000000..36d88f5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegAsmNames.inc.cpp
@@ -0,0 +1,353 @@
+//===-- AMDGPURegAsmNames.inc - Register asm names ----------*- C++ -*-----===//
+
+#ifdef AMDGPU_REG_ASM_NAMES
+
+static const char *const VGPR32RegNames[] = {
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
+ "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27", "v28", "v29", "v30", "v31", "v32", "v33", "v34", "v35",
+ "v36", "v37", "v38", "v39", "v40", "v41", "v42", "v43", "v44",
+ "v45", "v46", "v47", "v48", "v49", "v50", "v51", "v52", "v53",
+ "v54", "v55", "v56", "v57", "v58", "v59", "v60", "v61", "v62",
+ "v63", "v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71",
+ "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", "v80",
+ "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89",
+ "v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98",
+ "v99", "v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107",
+ "v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", "v116",
+ "v117", "v118", "v119", "v120", "v121", "v122", "v123", "v124", "v125",
+ "v126", "v127", "v128", "v129", "v130", "v131", "v132", "v133", "v134",
+ "v135", "v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143",
+ "v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151", "v152",
+ "v153", "v154", "v155", "v156", "v157", "v158", "v159", "v160", "v161",
+ "v162", "v163", "v164", "v165", "v166", "v167", "v168", "v169", "v170",
+ "v171", "v172", "v173", "v174", "v175", "v176", "v177", "v178", "v179",
+ "v180", "v181", "v182", "v183", "v184", "v185", "v186", "v187", "v188",
+ "v189", "v190", "v191", "v192", "v193", "v194", "v195", "v196", "v197",
+ "v198", "v199", "v200", "v201", "v202", "v203", "v204", "v205", "v206",
+ "v207", "v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215",
+ "v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223", "v224",
+ "v225", "v226", "v227", "v228", "v229", "v230", "v231", "v232", "v233",
+ "v234", "v235", "v236", "v237", "v238", "v239", "v240", "v241", "v242",
+ "v243", "v244", "v245", "v246", "v247", "v248", "v249", "v250", "v251",
+ "v252", "v253", "v254", "v255"
+};
+
+static const char *const SGPR32RegNames[] = {
+ "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9",
+ "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19",
+ "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29",
+ "s30", "s31", "s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39",
+ "s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47", "s48", "s49",
+ "s50", "s51", "s52", "s53", "s54", "s55", "s56", "s57", "s58", "s59",
+ "s60", "s61", "s62", "s63", "s64", "s65", "s66", "s67", "s68", "s69",
+ "s70", "s71", "s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79",
+ "s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87", "s88", "s89",
+ "s90", "s91", "s92", "s93", "s94", "s95", "s96", "s97", "s98", "s99",
+ "s100", "s101", "s102", "s103"
+};
+
+static const char *const VGPR64RegNames[] = {
+ "v[0:1]", "v[1:2]", "v[2:3]", "v[3:4]", "v[4:5]",
+ "v[5:6]", "v[6:7]", "v[7:8]", "v[8:9]", "v[9:10]",
+ "v[10:11]", "v[11:12]", "v[12:13]", "v[13:14]", "v[14:15]",
+ "v[15:16]", "v[16:17]", "v[17:18]", "v[18:19]", "v[19:20]",
+ "v[20:21]", "v[21:22]", "v[22:23]", "v[23:24]", "v[24:25]",
+ "v[25:26]", "v[26:27]", "v[27:28]", "v[28:29]", "v[29:30]",
+ "v[30:31]", "v[31:32]", "v[32:33]", "v[33:34]", "v[34:35]",
+ "v[35:36]", "v[36:37]", "v[37:38]", "v[38:39]", "v[39:40]",
+ "v[40:41]", "v[41:42]", "v[42:43]", "v[43:44]", "v[44:45]",
+ "v[45:46]", "v[46:47]", "v[47:48]", "v[48:49]", "v[49:50]",
+ "v[50:51]", "v[51:52]", "v[52:53]", "v[53:54]", "v[54:55]",
+ "v[55:56]", "v[56:57]", "v[57:58]", "v[58:59]", "v[59:60]",
+ "v[60:61]", "v[61:62]", "v[62:63]", "v[63:64]", "v[64:65]",
+ "v[65:66]", "v[66:67]", "v[67:68]", "v[68:69]", "v[69:70]",
+ "v[70:71]", "v[71:72]", "v[72:73]", "v[73:74]", "v[74:75]",
+ "v[75:76]", "v[76:77]", "v[77:78]", "v[78:79]", "v[79:80]",
+ "v[80:81]", "v[81:82]", "v[82:83]", "v[83:84]", "v[84:85]",
+ "v[85:86]", "v[86:87]", "v[87:88]", "v[88:89]", "v[89:90]",
+ "v[90:91]", "v[91:92]", "v[92:93]", "v[93:94]", "v[94:95]",
+ "v[95:96]", "v[96:97]", "v[97:98]", "v[98:99]", "v[99:100]",
+ "v[100:101]", "v[101:102]", "v[102:103]", "v[103:104]", "v[104:105]",
+ "v[105:106]", "v[106:107]", "v[107:108]", "v[108:109]", "v[109:110]",
+ "v[110:111]", "v[111:112]", "v[112:113]", "v[113:114]", "v[114:115]",
+ "v[115:116]", "v[116:117]", "v[117:118]", "v[118:119]", "v[119:120]",
+ "v[120:121]", "v[121:122]", "v[122:123]", "v[123:124]", "v[124:125]",
+ "v[125:126]", "v[126:127]", "v[127:128]", "v[128:129]", "v[129:130]",
+ "v[130:131]", "v[131:132]", "v[132:133]", "v[133:134]", "v[134:135]",
+ "v[135:136]", "v[136:137]", "v[137:138]", "v[138:139]", "v[139:140]",
+ "v[140:141]", "v[141:142]", "v[142:143]", "v[143:144]", "v[144:145]",
+ "v[145:146]", "v[146:147]", "v[147:148]", "v[148:149]", "v[149:150]",
+ "v[150:151]", "v[151:152]", "v[152:153]", "v[153:154]", "v[154:155]",
+ "v[155:156]", "v[156:157]", "v[157:158]", "v[158:159]", "v[159:160]",
+ "v[160:161]", "v[161:162]", "v[162:163]", "v[163:164]", "v[164:165]",
+ "v[165:166]", "v[166:167]", "v[167:168]", "v[168:169]", "v[169:170]",
+ "v[170:171]", "v[171:172]", "v[172:173]", "v[173:174]", "v[174:175]",
+ "v[175:176]", "v[176:177]", "v[177:178]", "v[178:179]", "v[179:180]",
+ "v[180:181]", "v[181:182]", "v[182:183]", "v[183:184]", "v[184:185]",
+ "v[185:186]", "v[186:187]", "v[187:188]", "v[188:189]", "v[189:190]",
+ "v[190:191]", "v[191:192]", "v[192:193]", "v[193:194]", "v[194:195]",
+ "v[195:196]", "v[196:197]", "v[197:198]", "v[198:199]", "v[199:200]",
+ "v[200:201]", "v[201:202]", "v[202:203]", "v[203:204]", "v[204:205]",
+ "v[205:206]", "v[206:207]", "v[207:208]", "v[208:209]", "v[209:210]",
+ "v[210:211]", "v[211:212]", "v[212:213]", "v[213:214]", "v[214:215]",
+ "v[215:216]", "v[216:217]", "v[217:218]", "v[218:219]", "v[219:220]",
+ "v[220:221]", "v[221:222]", "v[222:223]", "v[223:224]", "v[224:225]",
+ "v[225:226]", "v[226:227]", "v[227:228]", "v[228:229]", "v[229:230]",
+ "v[230:231]", "v[231:232]", "v[232:233]", "v[233:234]", "v[234:235]",
+ "v[235:236]", "v[236:237]", "v[237:238]", "v[238:239]", "v[239:240]",
+ "v[240:241]", "v[241:242]", "v[242:243]", "v[243:244]", "v[244:245]",
+ "v[245:246]", "v[246:247]", "v[247:248]", "v[248:249]", "v[249:250]",
+ "v[250:251]", "v[251:252]", "v[252:253]", "v[253:254]", "v[254:255]"
+};
+
+static const char *const VGPR96RegNames[] = {
+ "v[0:2]", "v[1:3]", "v[2:4]", "v[3:5]", "v[4:6]",
+ "v[5:7]", "v[6:8]", "v[7:9]", "v[8:10]", "v[9:11]",
+ "v[10:12]", "v[11:13]", "v[12:14]", "v[13:15]", "v[14:16]",
+ "v[15:17]", "v[16:18]", "v[17:19]", "v[18:20]", "v[19:21]",
+ "v[20:22]", "v[21:23]", "v[22:24]", "v[23:25]", "v[24:26]",
+ "v[25:27]", "v[26:28]", "v[27:29]", "v[28:30]", "v[29:31]",
+ "v[30:32]", "v[31:33]", "v[32:34]", "v[33:35]", "v[34:36]",
+ "v[35:37]", "v[36:38]", "v[37:39]", "v[38:40]", "v[39:41]",
+ "v[40:42]", "v[41:43]", "v[42:44]", "v[43:45]", "v[44:46]",
+ "v[45:47]", "v[46:48]", "v[47:49]", "v[48:50]", "v[49:51]",
+ "v[50:52]", "v[51:53]", "v[52:54]", "v[53:55]", "v[54:56]",
+ "v[55:57]", "v[56:58]", "v[57:59]", "v[58:60]", "v[59:61]",
+ "v[60:62]", "v[61:63]", "v[62:64]", "v[63:65]", "v[64:66]",
+ "v[65:67]", "v[66:68]", "v[67:69]", "v[68:70]", "v[69:71]",
+ "v[70:72]", "v[71:73]", "v[72:74]", "v[73:75]", "v[74:76]",
+ "v[75:77]", "v[76:78]", "v[77:79]", "v[78:80]", "v[79:81]",
+ "v[80:82]", "v[81:83]", "v[82:84]", "v[83:85]", "v[84:86]",
+ "v[85:87]", "v[86:88]", "v[87:89]", "v[88:90]", "v[89:91]",
+ "v[90:92]", "v[91:93]", "v[92:94]", "v[93:95]", "v[94:96]",
+ "v[95:97]", "v[96:98]", "v[97:99]", "v[98:100]", "v[99:101]",
+ "v[100:102]", "v[101:103]", "v[102:104]", "v[103:105]", "v[104:106]",
+ "v[105:107]", "v[106:108]", "v[107:109]", "v[108:110]", "v[109:111]",
+ "v[110:112]", "v[111:113]", "v[112:114]", "v[113:115]", "v[114:116]",
+ "v[115:117]", "v[116:118]", "v[117:119]", "v[118:120]", "v[119:121]",
+ "v[120:122]", "v[121:123]", "v[122:124]", "v[123:125]", "v[124:126]",
+ "v[125:127]", "v[126:128]", "v[127:129]", "v[128:130]", "v[129:131]",
+ "v[130:132]", "v[131:133]", "v[132:134]", "v[133:135]", "v[134:136]",
+ "v[135:137]", "v[136:138]", "v[137:139]", "v[138:140]", "v[139:141]",
+ "v[140:142]", "v[141:143]", "v[142:144]", "v[143:145]", "v[144:146]",
+ "v[145:147]", "v[146:148]", "v[147:149]", "v[148:150]", "v[149:151]",
+ "v[150:152]", "v[151:153]", "v[152:154]", "v[153:155]", "v[154:156]",
+ "v[155:157]", "v[156:158]", "v[157:159]", "v[158:160]", "v[159:161]",
+ "v[160:162]", "v[161:163]", "v[162:164]", "v[163:165]", "v[164:166]",
+ "v[165:167]", "v[166:168]", "v[167:169]", "v[168:170]", "v[169:171]",
+ "v[170:172]", "v[171:173]", "v[172:174]", "v[173:175]", "v[174:176]",
+ "v[175:177]", "v[176:178]", "v[177:179]", "v[178:180]", "v[179:181]",
+ "v[180:182]", "v[181:183]", "v[182:184]", "v[183:185]", "v[184:186]",
+ "v[185:187]", "v[186:188]", "v[187:189]", "v[188:190]", "v[189:191]",
+ "v[190:192]", "v[191:193]", "v[192:194]", "v[193:195]", "v[194:196]",
+ "v[195:197]", "v[196:198]", "v[197:199]", "v[198:200]", "v[199:201]",
+ "v[200:202]", "v[201:203]", "v[202:204]", "v[203:205]", "v[204:206]",
+ "v[205:207]", "v[206:208]", "v[207:209]", "v[208:210]", "v[209:211]",
+ "v[210:212]", "v[211:213]", "v[212:214]", "v[213:215]", "v[214:216]",
+ "v[215:217]", "v[216:218]", "v[217:219]", "v[218:220]", "v[219:221]",
+ "v[220:222]", "v[221:223]", "v[222:224]", "v[223:225]", "v[224:226]",
+ "v[225:227]", "v[226:228]", "v[227:229]", "v[228:230]", "v[229:231]",
+ "v[230:232]", "v[231:233]", "v[232:234]", "v[233:235]", "v[234:236]",
+ "v[235:237]", "v[236:238]", "v[237:239]", "v[238:240]", "v[239:241]",
+ "v[240:242]", "v[241:243]", "v[242:244]", "v[243:245]", "v[244:246]",
+ "v[245:247]", "v[246:248]", "v[247:249]", "v[248:250]", "v[249:251]",
+ "v[250:252]", "v[251:253]", "v[252:254]", "v[253:255]"
+};
+
+static const char *const VGPR128RegNames[] = {
+ "v[0:3]", "v[1:4]", "v[2:5]", "v[3:6]", "v[4:7]",
+ "v[5:8]", "v[6:9]", "v[7:10]", "v[8:11]", "v[9:12]",
+ "v[10:13]", "v[11:14]", "v[12:15]", "v[13:16]", "v[14:17]",
+ "v[15:18]", "v[16:19]", "v[17:20]", "v[18:21]", "v[19:22]",
+ "v[20:23]", "v[21:24]", "v[22:25]", "v[23:26]", "v[24:27]",
+ "v[25:28]", "v[26:29]", "v[27:30]", "v[28:31]", "v[29:32]",
+ "v[30:33]", "v[31:34]", "v[32:35]", "v[33:36]", "v[34:37]",
+ "v[35:38]", "v[36:39]", "v[37:40]", "v[38:41]", "v[39:42]",
+ "v[40:43]", "v[41:44]", "v[42:45]", "v[43:46]", "v[44:47]",
+ "v[45:48]", "v[46:49]", "v[47:50]", "v[48:51]", "v[49:52]",
+ "v[50:53]", "v[51:54]", "v[52:55]", "v[53:56]", "v[54:57]",
+ "v[55:58]", "v[56:59]", "v[57:60]", "v[58:61]", "v[59:62]",
+ "v[60:63]", "v[61:64]", "v[62:65]", "v[63:66]", "v[64:67]",
+ "v[65:68]", "v[66:69]", "v[67:70]", "v[68:71]", "v[69:72]",
+ "v[70:73]", "v[71:74]", "v[72:75]", "v[73:76]", "v[74:77]",
+ "v[75:78]", "v[76:79]", "v[77:80]", "v[78:81]", "v[79:82]",
+ "v[80:83]", "v[81:84]", "v[82:85]", "v[83:86]", "v[84:87]",
+ "v[85:88]", "v[86:89]", "v[87:90]", "v[88:91]", "v[89:92]",
+ "v[90:93]", "v[91:94]", "v[92:95]", "v[93:96]", "v[94:97]",
+ "v[95:98]", "v[96:99]", "v[97:100]", "v[98:101]", "v[99:102]",
+ "v[100:103]", "v[101:104]", "v[102:105]", "v[103:106]", "v[104:107]",
+ "v[105:108]", "v[106:109]", "v[107:110]", "v[108:111]", "v[109:112]",
+ "v[110:113]", "v[111:114]", "v[112:115]", "v[113:116]", "v[114:117]",
+ "v[115:118]", "v[116:119]", "v[117:120]", "v[118:121]", "v[119:122]",
+ "v[120:123]", "v[121:124]", "v[122:125]", "v[123:126]", "v[124:127]",
+ "v[125:128]", "v[126:129]", "v[127:130]", "v[128:131]", "v[129:132]",
+ "v[130:133]", "v[131:134]", "v[132:135]", "v[133:136]", "v[134:137]",
+ "v[135:138]", "v[136:139]", "v[137:140]", "v[138:141]", "v[139:142]",
+ "v[140:143]", "v[141:144]", "v[142:145]", "v[143:146]", "v[144:147]",
+ "v[145:148]", "v[146:149]", "v[147:150]", "v[148:151]", "v[149:152]",
+ "v[150:153]", "v[151:154]", "v[152:155]", "v[153:156]", "v[154:157]",
+ "v[155:158]", "v[156:159]", "v[157:160]", "v[158:161]", "v[159:162]",
+ "v[160:163]", "v[161:164]", "v[162:165]", "v[163:166]", "v[164:167]",
+ "v[165:168]", "v[166:169]", "v[167:170]", "v[168:171]", "v[169:172]",
+ "v[170:173]", "v[171:174]", "v[172:175]", "v[173:176]", "v[174:177]",
+ "v[175:178]", "v[176:179]", "v[177:180]", "v[178:181]", "v[179:182]",
+ "v[180:183]", "v[181:184]", "v[182:185]", "v[183:186]", "v[184:187]",
+ "v[185:188]", "v[186:189]", "v[187:190]", "v[188:191]", "v[189:192]",
+ "v[190:193]", "v[191:194]", "v[192:195]", "v[193:196]", "v[194:197]",
+ "v[195:198]", "v[196:199]", "v[197:200]", "v[198:201]", "v[199:202]",
+ "v[200:203]", "v[201:204]", "v[202:205]", "v[203:206]", "v[204:207]",
+ "v[205:208]", "v[206:209]", "v[207:210]", "v[208:211]", "v[209:212]",
+ "v[210:213]", "v[211:214]", "v[212:215]", "v[213:216]", "v[214:217]",
+ "v[215:218]", "v[216:219]", "v[217:220]", "v[218:221]", "v[219:222]",
+ "v[220:223]", "v[221:224]", "v[222:225]", "v[223:226]", "v[224:227]",
+ "v[225:228]", "v[226:229]", "v[227:230]", "v[228:231]", "v[229:232]",
+ "v[230:233]", "v[231:234]", "v[232:235]", "v[233:236]", "v[234:237]",
+ "v[235:238]", "v[236:239]", "v[237:240]", "v[238:241]", "v[239:242]",
+ "v[240:243]", "v[241:244]", "v[242:245]", "v[243:246]", "v[244:247]",
+ "v[245:248]", "v[246:249]", "v[247:250]", "v[248:251]", "v[249:252]",
+ "v[250:253]", "v[251:254]", "v[252:255]"
+};
+
+static const char *const VGPR256RegNames[] = {
+ "v[0:7]", "v[1:8]", "v[2:9]", "v[3:10]", "v[4:11]",
+ "v[5:12]", "v[6:13]", "v[7:14]", "v[8:15]", "v[9:16]",
+ "v[10:17]", "v[11:18]", "v[12:19]", "v[13:20]", "v[14:21]",
+ "v[15:22]", "v[16:23]", "v[17:24]", "v[18:25]", "v[19:26]",
+ "v[20:27]", "v[21:28]", "v[22:29]", "v[23:30]", "v[24:31]",
+ "v[25:32]", "v[26:33]", "v[27:34]", "v[28:35]", "v[29:36]",
+ "v[30:37]", "v[31:38]", "v[32:39]", "v[33:40]", "v[34:41]",
+ "v[35:42]", "v[36:43]", "v[37:44]", "v[38:45]", "v[39:46]",
+ "v[40:47]", "v[41:48]", "v[42:49]", "v[43:50]", "v[44:51]",
+ "v[45:52]", "v[46:53]", "v[47:54]", "v[48:55]", "v[49:56]",
+ "v[50:57]", "v[51:58]", "v[52:59]", "v[53:60]", "v[54:61]",
+ "v[55:62]", "v[56:63]", "v[57:64]", "v[58:65]", "v[59:66]",
+ "v[60:67]", "v[61:68]", "v[62:69]", "v[63:70]", "v[64:71]",
+ "v[65:72]", "v[66:73]", "v[67:74]", "v[68:75]", "v[69:76]",
+ "v[70:77]", "v[71:78]", "v[72:79]", "v[73:80]", "v[74:81]",
+ "v[75:82]", "v[76:83]", "v[77:84]", "v[78:85]", "v[79:86]",
+ "v[80:87]", "v[81:88]", "v[82:89]", "v[83:90]", "v[84:91]",
+ "v[85:92]", "v[86:93]", "v[87:94]", "v[88:95]", "v[89:96]",
+ "v[90:97]", "v[91:98]", "v[92:99]", "v[93:100]", "v[94:101]",
+ "v[95:102]", "v[96:103]", "v[97:104]", "v[98:105]", "v[99:106]",
+ "v[100:107]", "v[101:108]", "v[102:109]", "v[103:110]", "v[104:111]",
+ "v[105:112]", "v[106:113]", "v[107:114]", "v[108:115]", "v[109:116]",
+ "v[110:117]", "v[111:118]", "v[112:119]", "v[113:120]", "v[114:121]",
+ "v[115:122]", "v[116:123]", "v[117:124]", "v[118:125]", "v[119:126]",
+ "v[120:127]", "v[121:128]", "v[122:129]", "v[123:130]", "v[124:131]",
+ "v[125:132]", "v[126:133]", "v[127:134]", "v[128:135]", "v[129:136]",
+ "v[130:137]", "v[131:138]", "v[132:139]", "v[133:140]", "v[134:141]",
+ "v[135:142]", "v[136:143]", "v[137:144]", "v[138:145]", "v[139:146]",
+ "v[140:147]", "v[141:148]", "v[142:149]", "v[143:150]", "v[144:151]",
+ "v[145:152]", "v[146:153]", "v[147:154]", "v[148:155]", "v[149:156]",
+ "v[150:157]", "v[151:158]", "v[152:159]", "v[153:160]", "v[154:161]",
+ "v[155:162]", "v[156:163]", "v[157:164]", "v[158:165]", "v[159:166]",
+ "v[160:167]", "v[161:168]", "v[162:169]", "v[163:170]", "v[164:171]",
+ "v[165:172]", "v[166:173]", "v[167:174]", "v[168:175]", "v[169:176]",
+ "v[170:177]", "v[171:178]", "v[172:179]", "v[173:180]", "v[174:181]",
+ "v[175:182]", "v[176:183]", "v[177:184]", "v[178:185]", "v[179:186]",
+ "v[180:187]", "v[181:188]", "v[182:189]", "v[183:190]", "v[184:191]",
+ "v[185:192]", "v[186:193]", "v[187:194]", "v[188:195]", "v[189:196]",
+ "v[190:197]", "v[191:198]", "v[192:199]", "v[193:200]", "v[194:201]",
+ "v[195:202]", "v[196:203]", "v[197:204]", "v[198:205]", "v[199:206]",
+ "v[200:207]", "v[201:208]", "v[202:209]", "v[203:210]", "v[204:211]",
+ "v[205:212]", "v[206:213]", "v[207:214]", "v[208:215]", "v[209:216]",
+ "v[210:217]", "v[211:218]", "v[212:219]", "v[213:220]", "v[214:221]",
+ "v[215:222]", "v[216:223]", "v[217:224]", "v[218:225]", "v[219:226]",
+ "v[220:227]", "v[221:228]", "v[222:229]", "v[223:230]", "v[224:231]",
+ "v[225:232]", "v[226:233]", "v[227:234]", "v[228:235]", "v[229:236]",
+ "v[230:237]", "v[231:238]", "v[232:239]", "v[233:240]", "v[234:241]",
+ "v[235:242]", "v[236:243]", "v[237:244]", "v[238:245]", "v[239:246]",
+ "v[240:247]", "v[241:248]", "v[242:249]", "v[243:250]", "v[244:251]",
+ "v[245:252]", "v[246:253]", "v[247:254]", "v[248:255]"
+};
+
+static const char *const VGPR512RegNames[] = {
+ "v[0:15]", "v[1:16]", "v[2:17]", "v[3:18]", "v[4:19]",
+ "v[5:20]", "v[6:21]", "v[7:22]", "v[8:23]", "v[9:24]",
+ "v[10:25]", "v[11:26]", "v[12:27]", "v[13:28]", "v[14:29]",
+ "v[15:30]", "v[16:31]", "v[17:32]", "v[18:33]", "v[19:34]",
+ "v[20:35]", "v[21:36]", "v[22:37]", "v[23:38]", "v[24:39]",
+ "v[25:40]", "v[26:41]", "v[27:42]", "v[28:43]", "v[29:44]",
+ "v[30:45]", "v[31:46]", "v[32:47]", "v[33:48]", "v[34:49]",
+ "v[35:50]", "v[36:51]", "v[37:52]", "v[38:53]", "v[39:54]",
+ "v[40:55]", "v[41:56]", "v[42:57]", "v[43:58]", "v[44:59]",
+ "v[45:60]", "v[46:61]", "v[47:62]", "v[48:63]", "v[49:64]",
+ "v[50:65]", "v[51:66]", "v[52:67]", "v[53:68]", "v[54:69]",
+ "v[55:70]", "v[56:71]", "v[57:72]", "v[58:73]", "v[59:74]",
+ "v[60:75]", "v[61:76]", "v[62:77]", "v[63:78]", "v[64:79]",
+ "v[65:80]", "v[66:81]", "v[67:82]", "v[68:83]", "v[69:84]",
+ "v[70:85]", "v[71:86]", "v[72:87]", "v[73:88]", "v[74:89]",
+ "v[75:90]", "v[76:91]", "v[77:92]", "v[78:93]", "v[79:94]",
+ "v[80:95]", "v[81:96]", "v[82:97]", "v[83:98]", "v[84:99]",
+ "v[85:100]", "v[86:101]", "v[87:102]", "v[88:103]", "v[89:104]",
+ "v[90:105]", "v[91:106]", "v[92:107]", "v[93:108]", "v[94:109]",
+ "v[95:110]", "v[96:111]", "v[97:112]", "v[98:113]", "v[99:114]",
+ "v[100:115]", "v[101:116]", "v[102:117]", "v[103:118]", "v[104:119]",
+ "v[105:120]", "v[106:121]", "v[107:122]", "v[108:123]", "v[109:124]",
+ "v[110:125]", "v[111:126]", "v[112:127]", "v[113:128]", "v[114:129]",
+ "v[115:130]", "v[116:131]", "v[117:132]", "v[118:133]", "v[119:134]",
+ "v[120:135]", "v[121:136]", "v[122:137]", "v[123:138]", "v[124:139]",
+ "v[125:140]", "v[126:141]", "v[127:142]", "v[128:143]", "v[129:144]",
+ "v[130:145]", "v[131:146]", "v[132:147]", "v[133:148]", "v[134:149]",
+ "v[135:150]", "v[136:151]", "v[137:152]", "v[138:153]", "v[139:154]",
+ "v[140:155]", "v[141:156]", "v[142:157]", "v[143:158]", "v[144:159]",
+ "v[145:160]", "v[146:161]", "v[147:162]", "v[148:163]", "v[149:164]",
+ "v[150:165]", "v[151:166]", "v[152:167]", "v[153:168]", "v[154:169]",
+ "v[155:170]", "v[156:171]", "v[157:172]", "v[158:173]", "v[159:174]",
+ "v[160:175]", "v[161:176]", "v[162:177]", "v[163:178]", "v[164:179]",
+ "v[165:180]", "v[166:181]", "v[167:182]", "v[168:183]", "v[169:184]",
+ "v[170:185]", "v[171:186]", "v[172:187]", "v[173:188]", "v[174:189]",
+ "v[175:190]", "v[176:191]", "v[177:192]", "v[178:193]", "v[179:194]",
+ "v[180:195]", "v[181:196]", "v[182:197]", "v[183:198]", "v[184:199]",
+ "v[185:200]", "v[186:201]", "v[187:202]", "v[188:203]", "v[189:204]",
+ "v[190:205]", "v[191:206]", "v[192:207]", "v[193:208]", "v[194:209]",
+ "v[195:210]", "v[196:211]", "v[197:212]", "v[198:213]", "v[199:214]",
+ "v[200:215]", "v[201:216]", "v[202:217]", "v[203:218]", "v[204:219]",
+ "v[205:220]", "v[206:221]", "v[207:222]", "v[208:223]", "v[209:224]",
+ "v[210:225]", "v[211:226]", "v[212:227]", "v[213:228]", "v[214:229]",
+ "v[215:230]", "v[216:231]", "v[217:232]", "v[218:233]", "v[219:234]",
+ "v[220:235]", "v[221:236]", "v[222:237]", "v[223:238]", "v[224:239]",
+ "v[225:240]", "v[226:241]", "v[227:242]", "v[228:243]", "v[229:244]",
+ "v[230:245]", "v[231:246]", "v[232:247]", "v[233:248]", "v[234:249]",
+ "v[235:250]", "v[236:251]", "v[237:252]", "v[238:253]", "v[239:254]",
+ "v[240:255]"
+};
+
+static const char *const SGPR64RegNames[] = {
+ "s[0:1]", "s[2:3]", "s[4:5]", "s[6:7]", "s[8:9]", "s[10:11]",
+ "s[12:13]", "s[14:15]", "s[16:17]", "s[18:19]", "s[20:21]", "s[22:23]",
+ "s[24:25]", "s[26:27]", "s[28:29]", "s[30:31]", "s[32:33]", "s[34:35]",
+ "s[36:37]", "s[38:39]", "s[40:41]", "s[42:43]", "s[44:45]", "s[46:47]",
+ "s[48:49]", "s[50:51]", "s[52:53]", "s[54:55]", "s[56:57]", "s[58:59]",
+ "s[60:61]", "s[62:63]", "s[64:65]", "s[66:67]", "s[68:69]", "s[70:71]",
+ "s[72:73]", "s[74:75]", "s[76:77]", "s[78:79]", "s[80:81]", "s[82:83]",
+ "s[84:85]", "s[86:87]", "s[88:89]", "s[90:91]", "s[92:93]", "s[94:95]",
+ "s[96:97]", "s[98:99]", "s[100:101]", "s[102:103]"
+};
+
+static const char *const SGPR128RegNames[] = {
+ "s[0:3]", "s[4:7]", "s[8:11]", "s[12:15]", "s[16:19]", "s[20:23]",
+ "s[24:27]", "s[28:31]", "s[32:35]", "s[36:39]", "s[40:43]", "s[44:47]",
+ "s[48:51]", "s[52:55]", "s[56:59]", "s[60:63]", "s[64:67]", "s[68:71]",
+ "s[72:75]", "s[76:79]", "s[80:83]", "s[84:87]", "s[88:91]", "s[92:95]",
+ "s[96:99]", "s[100:103]"
+};
+
+static const char *const SGPR256RegNames[] = {
+ "s[0:7]", "s[4:11]", "s[8:15]", "s[12:19]", "s[16:23]",
+ "s[20:27]", "s[24:31]", "s[28:35]", "s[32:39]", "s[36:43]",
+ "s[40:47]", "s[44:51]", "s[48:55]", "s[52:59]", "s[56:63]",
+ "s[60:67]", "s[64:71]", "s[68:75]", "s[72:79]", "s[76:83]",
+ "s[80:87]", "s[84:91]", "s[88:95]", "s[92:99]", "s[96:103]"
+};
+
+static const char *const SGPR512RegNames[] = {
+ "s[0:15]", "s[4:19]", "s[8:23]", "s[12:27]", "s[16:31]", "s[20:35]",
+ "s[24:39]", "s[28:43]", "s[32:47]", "s[36:51]", "s[40:55]", "s[44:59]",
+ "s[48:63]", "s[52:67]", "s[56:71]", "s[60:75]", "s[64:79]", "s[68:83]",
+ "s[72:87]", "s[76:91]", "s[80:95]", "s[84:99]", "s[88:103]"
+};
+
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
new file mode 100644
index 0000000..623b2c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -0,0 +1,231 @@
+//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPUInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "AMDGPUGenRegisterBank.inc"
+
+// This file will be TableGen'ed at some point.
+#include "AMDGPUGenRegisterBankInfo.def"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
+ : AMDGPUGenRegisterBankInfo(),
+ TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
+
+ // HACK: Until this is fully tablegen'd
+ static bool AlreadyInit = false;
+ if (AlreadyInit)
+ return;
+
+ AlreadyInit = true;
+
+ const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
+ (void)RBSGPR;
+ assert(&RBSGPR == &AMDGPU::SGPRRegBank);
+
+ const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
+ (void)RBVGPR;
+ assert(&RBVGPR == &AMDGPU::VGPRRegBank);
+
+}
+
+unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A,
+ const RegisterBank &B,
+ unsigned Size) const {
+ return RegisterBankInfo::copyCost(A, B, Size);
+}
+
+const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+
+ if (TRI->isSGPRClass(&RC))
+ return getRegBank(AMDGPU::SGPRRegBankID);
+
+ return getRegBank(AMDGPU::VGPRRegBankID);
+}
+
+RegisterBankInfo::InstructionMappings
+AMDGPURegisterBankInfo::getInstrAlternativeMappings(
+ const MachineInstr &MI) const {
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ InstructionMappings AltMappings;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD: {
+ // FIXME: Should we be hard coding the size for these mappings?
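+    // Three alternatives are offered below: SSMapping (scalar value and
+    // scalar pointer), VVMapping (vector value and vector pointer), and
+    // VSMapping (vector value and scalar pointer).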
+ const InstructionMapping &SSMapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.push_back(&SSMapping);
+
+ const InstructionMapping &VVMapping = getInstructionMapping(
+ 2, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.push_back(&VVMapping);
+
+    // FIXME: Should this be the pointer size (64 bits) or the size of the
+    // register that will hold the buffer resource (128 bits)?
+ const InstructionMapping &VSMapping = getInstructionMapping(
+ 3, 1, getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ 2); // Num Operands
+ AltMappings.push_back(&VSMapping);
+
+ return AltMappings;
+
+ }
+ default:
+ break;
+ }
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
+
+void AMDGPURegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ return applyDefaultMapping(OpdMapper);
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+ if (!MI.hasOneMemOperand())
+ return false;
+
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ return AMDGPU::isUniformMMO(MMO);
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ const ValueMapping *ValMapping;
+ const ValueMapping *PtrMapping;
+
+ if (isInstrUniform(MI)) {
+    // We have a uniform instruction, so we want to use an SMRD load.
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
+ } else {
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ // FIXME: What would happen if we used SGPRRegBankID here?
+ PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
+ }
+
+ OpdsMapping[0] = ValMapping;
+ OpdsMapping[1] = PtrMapping;
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
+ 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
+ return Mapping;
+
+ // FIXME: Do we want to add a mapping for FLAT load, or should we just
+ // handle that during instruction selection?
+}
+
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
+
+ if (Mapping.isValid())
+ return Mapping;
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+
+ bool IsComplete = true;
+ switch (MI.getOpcode()) {
+ default:
+ IsComplete = false;
+ break;
+ case AMDGPU::G_CONSTANT: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ case AMDGPU::G_GEP: {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isReg())
+ continue;
+
+ unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits();
+ OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ }
+ break;
+ }
+ case AMDGPU::G_STORE: {
+ assert(MI.getOperand(0).isReg());
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ // FIXME: We need to specify a different reg bank once scalar stores
+ // are supported.
+ const ValueMapping *ValMapping =
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ // FIXME: Depending on the type of store, the pointer could be in
+ // the SGPR Reg bank.
+ // FIXME: Pointer size should be based on the address space.
+ const ValueMapping *PtrMapping =
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
+
+ OpdsMapping[0] = ValMapping;
+ OpdsMapping[1] = PtrMapping;
+ break;
+ }
+
+ case AMDGPU::G_LOAD:
+ return getInstrMappingForLoad(MI);
+ }
+
+ if (!IsComplete) {
+ unsigned BankID = AMDGPU::SGPRRegBankID;
+
+ unsigned Size = 0;
+ for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
+      // If the operand is not a register, default to the size of the
+      // previous operand.
+      // FIXME: Can't we pull the types from the MachineInstr rather than the
+      // operands?
+ if (MI.getOperand(Idx).isReg())
+ Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI);
+ OpdsMapping.push_back(AMDGPU::getValueMapping(BankID, Size));
+ }
+ }
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
new file mode 100644
index 0000000..201fdc1
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -0,0 +1,65 @@
+//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+class TargetRegisterInfo;
+
+namespace AMDGPU {
+enum {
+ SGPRRegBankID = 0,
+ VGPRRegBankID = 1,
+ NumRegisterBanks
+};
+} // End AMDGPU namespace.
+
+/// This class provides the information for the target register banks.
+class AMDGPUGenRegisterBankInfo : public RegisterBankInfo {
+
+protected:
+
+#define GET_TARGET_REGBANK_CLASS
+#include "AMDGPUGenRegisterBank.inc"
+};
+class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
+ const SIRegisterInfo *TRI;
+
+ /// See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+ const RegisterBankInfo::InstructionMapping &
+ getInstrMappingForLoad(const MachineInstr &MI) const;
+
+public:
+ AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+ unsigned Size) const override;
+
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
new file mode 100644
index 0000000..f4428e5
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -0,0 +1,16 @@
+//=- AMDGPURegisterBank.td - Describe the AMDGPU Banks -------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def SGPRRegBank : RegisterBank<"SGPR",
+ [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512]
+>;
+
+def VGPRRegBank : RegisterBank<"VGPR",
+ [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
+>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 941f2d8..ff58aa5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -14,6 +14,7 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "SIRegisterInfo.h"
using namespace llvm;
@@ -24,18 +25,6 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
-
-const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
- const MachineFunction *) const {
- return &CalleeSavedReg;
-}
-
-unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- return AMDGPU::NoRegister;
-}
-
unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
static const unsigned SubRegs[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
@@ -50,3 +39,34 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
+
+// Forced to live here because one of the generated .inc files requires it.
+const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *MF) const {
+ CallingConv::ID CC = MF->getFunction()->getCallingConv();
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CSR_AMDGPU_HighRegs_SaveList;
+ default: {
+ // Dummy to not crash RegisterClassInfo.
+ static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
+ return &NoCalleeSavedReg;
+ }
+ }
+}
+
+const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ return CSR_AMDGPU_HighRegs_RegMask;
+ default:
+ return nullptr;
+ }
+}
+
+unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return AMDGPU::NoRegister;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index ef51aad..d8604d2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -16,10 +16,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-#include "llvm/Target/TargetRegisterInfo.h"
-
#define GET_REGINFO_HEADER
-#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
namespace llvm {
@@ -33,9 +30,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;
-
- const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
- unsigned getFrameRegister(const MachineFunction &MF) const override;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
deleted file mode 100644
index ecd2ac7..0000000
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
+++ /dev/null
@@ -1,193 +0,0 @@
-//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Enums and structure types used by runtime metadata.
-///
-/// Runtime requests certain information (metadata) about kernels to be able
-/// to execute the kernels and answer the queries about the kernels.
-/// The metadata is represented as a note element in the .note ELF section of a
-/// binary (code object). The desc field of the note element is a YAML string
-/// consisting of key-value pairs. Each key is a string. Each value can be
-/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps.
-/// At the beginning of the YAML string is the module level YAML map. A
-/// kernel-level YAML map is in the amd.Kernels sequence. A
-/// kernel-argument-level map is in the amd.Args sequence.
-///
-/// The format should be kept backward compatible. New enum values and bit
-/// fields should be appended at the end. It is suggested to bump up the
-/// revision number whenever the format changes and document the change
-/// in the revision in this header.
-///
-//
-//===----------------------------------------------------------------------===//
-//
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
-
-#include <cstdint>
-#include <vector>
-#include <string>
-
-namespace AMDGPU {
-
-namespace RuntimeMD {
-
- // Version and revision of runtime metadata
- const unsigned char MDVersion = 2;
- const unsigned char MDRevision = 0;
-
- // Name of keys for runtime metadata.
- namespace KeyName {
- const char MDVersion[] = "amd.MDVersion"; // Runtime metadata version
- const char Language[] = "amd.Language"; // Language
- const char LanguageVersion[] = "amd.LanguageVersion"; // Language version
- const char Kernels[] = "amd.Kernels"; // Kernels
- const char KernelName[] = "amd.KernelName"; // Kernel name
- const char Args[] = "amd.Args"; // Kernel arguments
- const char ArgSize[] = "amd.ArgSize"; // Kernel arg size
- const char ArgAlign[] = "amd.ArgAlign"; // Kernel arg alignment
- const char ArgTypeName[] = "amd.ArgTypeName"; // Kernel type name
- const char ArgName[] = "amd.ArgName"; // Kernel name
- const char ArgKind[] = "amd.ArgKind"; // Kernel argument kind
- const char ArgValueType[] = "amd.ArgValueType"; // Kernel argument value type
- const char ArgAddrQual[] = "amd.ArgAddrQual"; // Kernel argument address qualifier
- const char ArgAccQual[] = "amd.ArgAccQual"; // Kernel argument access qualifier
- const char ArgIsConst[] = "amd.ArgIsConst"; // Kernel argument is const qualified
- const char ArgIsRestrict[] = "amd.ArgIsRestrict"; // Kernel argument is restrict qualified
- const char ArgIsVolatile[] = "amd.ArgIsVolatile"; // Kernel argument is volatile qualified
- const char ArgIsPipe[] = "amd.ArgIsPipe"; // Kernel argument is pipe qualified
- const char ReqdWorkGroupSize[] = "amd.ReqdWorkGroupSize"; // Required work group size
- const char WorkGroupSizeHint[] = "amd.WorkGroupSizeHint"; // Work group size hint
- const char VecTypeHint[] = "amd.VecTypeHint"; // Vector type hint
- const char KernelIndex[] = "amd.KernelIndex"; // Kernel index for device enqueue
- const char NoPartialWorkGroups[] = "amd.NoPartialWorkGroups"; // No partial work groups
- const char PrintfInfo[] = "amd.PrintfInfo"; // Prinf function call information
- const char ArgActualAcc[] = "amd.ArgActualAcc"; // The actual kernel argument access qualifier
- const char ArgPointeeAlign[] = "amd.ArgPointeeAlign"; // Alignment of pointee type
- }
-
- namespace KernelArg {
- enum Kind : uint8_t {
- ByValue = 0,
- GlobalBuffer = 1,
- DynamicSharedPointer = 2,
- Sampler = 3,
- Image = 4,
- Pipe = 5,
- Queue = 6,
- HiddenGlobalOffsetX = 7,
- HiddenGlobalOffsetY = 8,
- HiddenGlobalOffsetZ = 9,
- HiddenNone = 10,
- HiddenPrintfBuffer = 11,
- HiddenDefaultQueue = 12,
- HiddenCompletionAction = 13,
- };
-
- enum ValueType : uint16_t {
- Struct = 0,
- I8 = 1,
- U8 = 2,
- I16 = 3,
- U16 = 4,
- F16 = 5,
- I32 = 6,
- U32 = 7,
- F32 = 8,
- I64 = 9,
- U64 = 10,
- F64 = 11,
- };
-
- // Avoid using 'None' since it conflicts with a macro in X11 header file.
- enum AccessQualifer : uint8_t {
- AccNone = 0,
- ReadOnly = 1,
- WriteOnly = 2,
- ReadWrite = 3,
- };
-
- enum AddressSpaceQualifer : uint8_t {
- Private = 0,
- Global = 1,
- Constant = 2,
- Local = 3,
- Generic = 4,
- Region = 5,
- };
- } // namespace KernelArg
-
- // Invalid values are used to indicate an optional key should not be emitted.
- const uint8_t INVALID_ADDR_QUAL = 0xff;
- const uint8_t INVALID_ACC_QUAL = 0xff;
- const uint32_t INVALID_KERNEL_INDEX = ~0U;
-
- namespace KernelArg {
- // In-memory representation of kernel argument information.
- struct Metadata {
- uint32_t Size;
- uint32_t Align;
- uint32_t PointeeAlign;
- uint8_t Kind;
- uint16_t ValueType;
- std::string TypeName;
- std::string Name;
- uint8_t AddrQual;
- uint8_t AccQual;
- uint8_t IsVolatile;
- uint8_t IsConst;
- uint8_t IsRestrict;
- uint8_t IsPipe;
- Metadata() : Size(0), Align(0), PointeeAlign(0), Kind(0), ValueType(0),
- AddrQual(INVALID_ADDR_QUAL), AccQual(INVALID_ACC_QUAL), IsVolatile(0),
- IsConst(0), IsRestrict(0), IsPipe(0) {}
- };
- }
-
- namespace Kernel {
- // In-memory representation of kernel information.
- struct Metadata {
- std::string Name;
- std::string Language;
- std::vector<uint8_t> LanguageVersion;
- std::vector<uint32_t> ReqdWorkGroupSize;
- std::vector<uint32_t> WorkGroupSizeHint;
- std::string VecTypeHint;
- uint32_t KernelIndex;
- uint8_t NoPartialWorkGroups;
- std::vector<KernelArg::Metadata> Args;
- Metadata() : KernelIndex(INVALID_KERNEL_INDEX), NoPartialWorkGroups(0) {}
- };
- }
-
- namespace Program {
- // In-memory representation of program information.
- struct Metadata {
- std::vector<uint8_t> MDVersionSeq;
- std::vector<std::string> PrintfInfo;
- std::vector<Kernel::Metadata> Kernels;
-
- explicit Metadata(){}
-
- // Construct from an YAML string.
- explicit Metadata(const std::string &YAML);
-
- // Convert to YAML string.
- std::string toYAML();
-
- // Convert from YAML string.
- static Metadata fromYAML(const std::string &S);
- };
- }
-} // namespace RuntimeMD
-} // namespace AMDGPU
-
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c35a67d..7796176 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,8 +13,18 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPULegalizerInfo.h"
+#include "AMDGPURegisterBankInfo.h"
+#endif
+#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>
@@ -22,7 +32,6 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-subtarget"
-#define GET_SUBTARGETINFO_ENUM
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "AMDGPUGenSubtargetInfo.inc"
@@ -41,9 +50,10 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// for SI has the unhelpful behavior that it unsets everything else if you
// disable it.
- SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
+ SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
- FullFS += "+flat-for-global,+unaligned-buffer-access,";
+ FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
+
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
@@ -59,9 +69,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
// denormals, but should be checked. Should we issue a warning somewhere
// if someone tries to enable these?
if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- FP16Denormals = false;
+ FP64FP16Denormals = false;
FP32Denormals = false;
- FP64Denormals = false;
}
// Set defaults if needed.
@@ -71,6 +80,31 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
return *this;
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
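+// Concrete GISelAccessor that owns the GlobalISel components for the SI
+// subtarget and hands out const pointers to them; the members are populated
+// in the SISubtarget constructor below.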
+struct SIGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ const AMDGPUCallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM)
: AMDGPUGenSubtargetInfo(TT, GPU, FS),
@@ -85,15 +119,18 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FastFMAF32(false),
HalfRate64Ops(false),
- FP16Denormals(false),
FP32Denormals(false),
- FP64Denormals(false),
+ FP64FP16Denormals(false),
FPExceptions(false),
+ DX10Clamp(false),
FlatForGlobal(false),
+ AutoWaitcntBeforeBarrier(false),
UnalignedScratchAccess(false),
UnalignedBufferAccess(false),
+ HasApertureRegs(false),
EnableXNACK(false),
+ TrapHandler(false),
DebuggerInsertNops(false),
DebuggerReserveRegs(false),
DebuggerEmitPrologue(false),
@@ -110,14 +147,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
GCN1Encoding(false),
GCN3Encoding(false),
CIInsts(false),
+ GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
Has16BitInsts(false),
+ HasVOP3PInsts(false),
HasMovrel(false),
HasVGPRIndexMode(false),
HasScalarStores(false),
HasInv2PiInlineImm(false),
+ HasSDWA(false),
+ HasSDWAOmod(false),
+ HasSDWAScalar(false),
+ HasSDWASdst(false),
+ HasSDWAMac(false),
+ HasSDWAOutModsVOPC(false),
+ HasDPP(false),
FlatAddressSpace(false),
+ FlatInstOffsets(false),
+ FlatGlobalInsts(false),
+ FlatScratchInsts(false),
R600ALUInst(false),
CaymanISA(false),
@@ -128,65 +177,30 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FeatureDisable(false),
InstrItins(getInstrItineraryForCPU(GPU)) {
+ AS = AMDGPU::getAMDGPUAS(TT);
initializeSubtargetDependencies(TT, GPU, FS);
}
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
- switch (NWaves) {
- case 10:
- return 1638;
- case 9:
- return 1820;
- case 8:
- return 2048;
- case 7:
- return 2340;
- case 6:
- return 2730;
- case 5:
- return 3276;
- case 4:
- return 4096;
- case 3:
- return 5461;
- case 2:
- return 8192;
- default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+ const Function &F) const {
+ if (NWaves == 1)
return getLocalMemorySize();
- }
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
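+  // Worked example (illustrative numbers only, not from any real subtarget):
+  // with 65536 bytes of LDS, MaxWaves = 10, WorkGroupsPerCu = 5, and
+  // NWaves = 4, this returns 65536 * 10 / 5 / 4 = 32768 bytes.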
+ return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
- if (Bytes <= 1638)
- return 10;
-
- if (Bytes <= 1820)
- return 9;
-
- if (Bytes <= 2048)
- return 8;
-
- if (Bytes <= 2340)
- return 7;
-
- if (Bytes <= 2730)
- return 6;
-
- if (Bytes <= 3276)
- return 5;
-
- if (Bytes <= 4096)
- return 4;
-
- if (Bytes <= 5461)
- return 3;
-
- if (Bytes <= 8192)
- return 2;
-
- return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+ const Function &F) const {
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+ unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
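+  // Worked example (illustrative numbers only): with Limit = 65536 and a
+  // kernel using Bytes = 16384 of LDS, NumWaves starts at 4 before the
+  // clamps below.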
+ NumWaves = std::min(NumWaves, MaxWaves);
+ NumWaves = std::max(NumWaves, 1u);
+ return NumWaves;
}
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
@@ -224,7 +238,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
const Function &F) const {
// Default minimum/maximum number of waves per execution unit.
- std::pair<unsigned, unsigned> Default(1, 0);
+ std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
// Default/requested minimum/maximum flat work group sizes.
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
@@ -263,12 +277,74 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// Make sure requested values are compatible with values implied by requested
// minimum/maximum flat work group sizes.
if (RequestedFlatWorkGroupSize &&
- Requested.first > MinImpliedByFlatWorkGroupSize)
+ Requested.first < MinImpliedByFlatWorkGroupSize)
return Default;
return Requested;
}
+bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
+ Function *Kernel = I->getParent()->getParent();
+ unsigned MinSize = 0;
+ unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
+ bool IdQuery = false;
+
+ // If reqd_work_group_size is present it narrows value down.
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ const Function *F = CI->getCalledFunction();
+ if (F) {
+ unsigned Dim = UINT_MAX;
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ case Intrinsic::r600_read_tidig_x:
+ IdQuery = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::r600_read_local_size_x:
+ Dim = 0;
+ break;
+ case Intrinsic::amdgcn_workitem_id_y:
+ case Intrinsic::r600_read_tidig_y:
+ IdQuery = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::r600_read_local_size_y:
+ Dim = 1;
+ break;
+ case Intrinsic::amdgcn_workitem_id_z:
+ case Intrinsic::r600_read_tidig_z:
+ IdQuery = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::r600_read_local_size_z:
+ Dim = 2;
+ break;
+ default:
+ break;
+ }
+ if (Dim <= 3) {
+ if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
+ if (Node->getNumOperands() == 3)
+ MinSize = MaxSize = mdconst::extract<ConstantInt>(
+ Node->getOperand(Dim))->getZExtValue();
+ }
+ }
+ }
+
+ if (!MaxSize)
+ return false;
+
+ // Range metadata is [Lo, Hi). For ID query we need to pass max size
+ // as Hi. For size query we need to pass Hi + 1.
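+  // Worked example (hypothetical kernel): with reqd_work_group_size of
+  // {64, 1, 1}, an ID query in dimension 0 gets the range [0, 64), while a
+  // size query gets [64, 65), i.e. exactly 64.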
+ if (IdQuery)
+ MinSize = 0;
+ else
+ ++MaxSize;
+
+ MDBuilder MDB(I->getContext());
+ MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
+ APInt(32, MaxSize));
+ I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+ return true;
+}
+
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
AMDGPUSubtarget(TT, GPU, FS, TM),
@@ -277,11 +353,23 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
TLInfo(TM, *this) {}
SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS,
- const TargetMachine &TM) :
- AMDGPUSubtarget(TT, GPU, FS, TM),
- InstrInfo(*this),
- FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {}
+ const TargetMachine &TM)
+ : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this),
+ FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
+ TLInfo(TM, *this) {
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ GISel->Legalizer.reset(new AMDGPULegalizerInfo());
+
+ GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ GISel->InstSelector.reset(new AMDGPUInstructionSelector(
+ *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
+#endif
+ setGISelAccessor(*GISel);
+}
void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const {
@@ -305,7 +393,7 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
}
unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
- unsigned ExplicitArgBytes) const {
+ unsigned ExplicitArgBytes) const {
unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
if (ImplicitBytes == 0)
return ExplicitArgBytes;
@@ -359,12 +447,100 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
return 1;
}
-unsigned SISubtarget::getMaxNumSGPRs() const {
+unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+ if (MFI.hasFlatScratchInit()) {
+ if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
+ if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
+ return 4; // FLAT_SCRATCH, VCC (in that order).
+ }
+
+ if (isXNACKEnabled())
+ return 4; // XNACK, VCC (in that order).
+ return 2; // VCC.
+}
+
+unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
+ const Function &F = *MF.getFunction();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+ // Compute maximum number of SGPRs function can use using default/requested
+ // minimum number of waves per execution unit.
+ std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+ unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
+ unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
+
+ // Check if maximum number of SGPRs was explicitly requested using
+ // "amdgpu-num-sgpr" attribute.
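+  // An illustrative way to request this in IR (example values, not from this
+  // change):
+  //   define void @f() #0 { ... }
+  //   attributes #0 = { "amdgpu-num-sgpr"="32" }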
+ if (F.hasFnAttribute("amdgpu-num-sgpr")) {
+ unsigned Requested = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-num-sgpr", MaxNumSGPRs);
+
+ // Make sure requested value does not violate subtarget's specifications.
+ if (Requested && (Requested <= getReservedNumSGPRs(MF)))
+ Requested = 0;
+
+ // If more SGPRs are required to support the input user/system SGPRs,
+ // increase to accommodate them.
+ //
+ // FIXME: This really ends up using the requested number of SGPRs + number
+ // of reserved special registers in total. Theoretically you could re-use
+ // the last input registers for these special registers, but this would
+ // require a lot of complexity to deal with the weird aliasing.
+ unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
+ if (Requested && Requested < InputNumSGPRs)
+ Requested = InputNumSGPRs;
+
+ // Make sure requested value is compatible with values implied by
+ // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
+ Requested = 0;
+ if (WavesPerEU.second &&
+ Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
+ Requested = 0;
+
+ if (Requested)
+ MaxNumSGPRs = Requested;
+ }
+
if (hasSGPRInitBug())
- return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+ MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
- if (getGeneration() >= VOLCANIC_ISLANDS)
- return 102;
+ return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
+ MaxAddressableNumSGPRs);
+}
+
+unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+ const Function &F = *MF.getFunction();
+ const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+ // Compute maximum number of VGPRs function can use using default/requested
+ // minimum number of waves per execution unit.
+ std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+ unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+
+ // Check if maximum number of VGPRs was explicitly requested using
+ // "amdgpu-num-vgpr" attribute.
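+  // Illustrative IR (example value, not from this change):
+  //   attributes #0 = { "amdgpu-num-vgpr"="64" }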
+ if (F.hasFnAttribute("amdgpu-num-vgpr")) {
+ unsigned Requested = AMDGPU::getIntegerAttribute(
+ F, "amdgpu-num-vgpr", MaxNumVGPRs);
+
+ // Make sure requested value does not violate subtarget's specifications.
+ if (Requested && Requested <= getReservedNumVGPRs(MF))
+ Requested = 0;
+
+ // Make sure requested value is compatible with values implied by
+ // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
+ Requested = 0;
+ if (WavesPerEU.second &&
+ Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
+ Requested = 0;
+
+ if (Requested)
+ MaxNumVGPRs = Requested;
+ }
- return 104;
+ return MaxNumVGPRs - getReservedNumVGPRs(MF);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 0e3cb7d..d4b6a5f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -16,12 +16,13 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#include "AMDGPU.h"
-#include "R600InstrInfo.h"
-#include "R600ISelLowering.h"
#include "R600FrameLowering.h"
-#include "SIInstrInfo.h"
-#include "SIISelLowering.h"
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
#include "SIFrameLowering.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@@ -51,19 +52,47 @@ public:
SOUTHERN_ISLANDS,
SEA_ISLANDS,
VOLCANIC_ISLANDS,
+ GFX9,
};
enum {
ISAVersion0_0_0,
+ ISAVersion6_0_0,
+ ISAVersion6_0_1,
ISAVersion7_0_0,
ISAVersion7_0_1,
ISAVersion7_0_2,
+ ISAVersion7_0_3,
ISAVersion8_0_0,
ISAVersion8_0_1,
ISAVersion8_0_2,
ISAVersion8_0_3,
ISAVersion8_0_4,
ISAVersion8_1_0,
+ ISAVersion9_0_0,
+ ISAVersion9_0_1,
+ ISAVersion9_0_2,
+ ISAVersion9_0_3
+ };
+
+ enum TrapHandlerAbi {
+ TrapHandlerAbiNone = 0,
+ TrapHandlerAbiHsa = 1
+ };
+
+ enum TrapID {
+ TrapIDHardwareReserved = 0,
+ TrapIDHSADebugTrap = 1,
+ TrapIDLLVMTrap = 2,
+ TrapIDLLVMDebugTrap = 3,
+ TrapIDDebugBreakpoint = 7,
+ TrapIDDebugReserved8 = 8,
+ TrapIDDebugReservedFE = 0xfe,
+ TrapIDDebugReservedFF = 0xff
+ };
+
+ enum TrapRegValues {
+ LLVMTrapHandlerRegValue = 1
};
protected:
@@ -81,14 +110,17 @@ protected:
bool HalfRate64Ops;
  // Dynamically set bits that enable features.
- bool FP16Denormals;
bool FP32Denormals;
- bool FP64Denormals;
+ bool FP64FP16Denormals;
bool FPExceptions;
+ bool DX10Clamp;
bool FlatForGlobal;
+ bool AutoWaitcntBeforeBarrier;
bool UnalignedScratchAccess;
bool UnalignedBufferAccess;
+ bool HasApertureRegs;
bool EnableXNACK;
+ bool TrapHandler;
bool DebuggerInsertNops;
bool DebuggerReserveRegs;
bool DebuggerEmitPrologue;
@@ -107,14 +139,26 @@ protected:
bool GCN1Encoding;
bool GCN3Encoding;
bool CIInsts;
+ bool GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
bool Has16BitInsts;
+ bool HasVOP3PInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
bool HasInv2PiInlineImm;
+ bool HasSDWA;
+ bool HasSDWAOmod;
+ bool HasSDWAScalar;
+ bool HasSDWASdst;
+ bool HasSDWAMac;
+ bool HasSDWAOutModsVOPC;
+ bool HasDPP;
bool FlatAddressSpace;
+ bool FlatInstOffsets;
+ bool FlatGlobalInsts;
+ bool FlatScratchInsts;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
@@ -127,6 +171,7 @@ protected:
InstrItineraryData InstrItins;
SelectionDAGTargetInfo TSInfo;
+ AMDGPUAS AS;
public:
AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
@@ -161,7 +206,8 @@ public:
}
bool isOpenCLEnv() const {
- return TargetTriple.getEnvironment() == Triple::OpenCL;
+ return TargetTriple.getEnvironment() == Triple::OpenCL ||
+ TargetTriple.getEnvironmentName() == "amdgizcl";
}
Generation getGeneration() const {
@@ -184,10 +230,18 @@ public:
return MaxPrivateElementSize;
}
+ AMDGPUAS getAMDGPUAS() const {
+ return AS;
+ }
+
bool has16BitInsts() const {
return Has16BitInsts;
}
+ bool hasVOP3PInsts() const {
+ return HasVOP3PInsts;
+ }
+
bool hasHWFP64() const {
return FP64;
}
@@ -243,6 +297,14 @@ public:
return (getGeneration() >= EVERGREEN);
}
+ bool hasMed3_16() const {
+ return getGeneration() >= GFX9;
+ }
+
+ bool hasMin3Max3_16() const {
+ return getGeneration() >= GFX9;
+ }
+
bool hasCARRY() const {
return (getGeneration() >= EVERGREEN);
}
@@ -255,6 +317,10 @@ public:
return CaymanISA;
}
+ TrapHandlerAbi getTrapHandlerAbi() const {
+ return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
+ }
+
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
@@ -267,20 +333,22 @@ public:
return DumpCode;
}
- bool enableIEEEBit(const MachineFunction &MF) const {
- return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
- }
-
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
- unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
/// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
/// the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+
+ unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction());
+ }
bool hasFP16Denormals() const {
- return FP16Denormals;
+ return FP64FP16Denormals;
}
bool hasFP32Denormals() const {
@@ -288,17 +356,33 @@ public:
}
bool hasFP64Denormals() const {
- return FP64Denormals;
+ return FP64FP16Denormals;
+ }
+
+ bool supportsMinMaxDenormModes() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
}
bool hasFPExceptions() const {
return FPExceptions;
}
+ bool enableDX10Clamp() const {
+ return DX10Clamp;
+ }
+
+ bool enableIEEEBit(const MachineFunction &MF) const {
+ return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+ }
+
bool useFlatForGlobal() const {
return FlatForGlobal;
}
+ bool hasAutoWaitcntBeforeBarrier() const {
+ return AutoWaitcntBeforeBarrier;
+ }
+
bool hasUnalignedBufferAccess() const {
return UnalignedBufferAccess;
}
@@ -307,10 +391,34 @@ public:
return UnalignedScratchAccess;
}
+ bool hasApertureRegs() const {
+ return HasApertureRegs;
+ }
+
+ bool isTrapHandlerEnabled() const {
+ return TrapHandler;
+ }
+
bool isXNACKEnabled() const {
return EnableXNACK;
}
+ bool hasFlatAddressSpace() const {
+ return FlatAddressSpace;
+ }
+
+ bool hasFlatInstOffsets() const {
+ return FlatInstOffsets;
+ }
+
+ bool hasFlatGlobalInsts() const {
+ return FlatGlobalInsts;
+ }
+
+ bool hasFlatScratchInsts() const {
+ return FlatScratchInsts;
+ }
+
bool isMesaKernel(const MachineFunction &MF) const {
return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
}
@@ -324,6 +432,34 @@ public:
return isAmdHsaOS() || isMesaKernel(MF);
}
+ bool hasFminFmaxLegacy() const {
+ return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ }
+
+ bool hasSDWA() const {
+ return HasSDWA;
+ }
+
+ bool hasSDWAOmod() const {
+ return HasSDWAOmod;
+ }
+
+ bool hasSDWAScalar() const {
+ return HasSDWAScalar;
+ }
+
+ bool hasSDWASdst() const {
+ return HasSDWASdst;
+ }
+
+ bool hasSDWAMac() const {
+ return HasSDWAMac;
+ }
+
+ bool hasSDWAOutModsVOPC() const {
+ return HasSDWAOutModsVOPC;
+ }
+
/// \brief Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -342,9 +478,11 @@ public:
return 0;
}
+ // Scratch is allocated in 256 dword per wave blocks for the entire
+  // wavefront. When viewed from the perspective of an arbitrary workitem,
+  // this is 4-byte aligned.
unsigned getStackAlignment() const {
- // Scratch is allocated in 256 dword per wave blocks.
- return 4 * 256 / getWavefrontSize();
+ return 4;
}
bool enableMachineScheduler() const override {
@@ -355,72 +493,71 @@ public:
return true;
}
+ void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+ bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
/// \returns Number of execution units per compute unit supported by the
/// subtarget.
unsigned getEUsPerCU() const {
- return 4;
+ return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits());
}
/// \returns Maximum number of work groups per compute unit supported by the
- /// subtarget and limited by given flat work group size.
+ /// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
- if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 8;
- return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
+ return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(),
+ FlatWorkGroupSize);
}
/// \returns Maximum number of waves per compute unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerCU() const {
- return getMaxWavesPerEU() * getEUsPerCU();
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits());
}
/// \returns Maximum number of waves per compute unit supported by the
- /// subtarget and limited by given flat work group size.
+ /// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return getWavesPerWorkGroup(FlatWorkGroupSize);
+ return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(),
+ FlatWorkGroupSize);
}
/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
unsigned getMinWavesPerEU() const {
- return 1;
+ return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits());
}
/// \returns Maximum number of waves per execution unit supported by the
/// subtarget without any kind of limitation.
unsigned getMaxWavesPerEU() const {
- if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return 8;
- // FIXME: Need to take scratch memory into account.
- return 10;
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits());
}
/// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given flat work group size.
+ /// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
- return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
- getEUsPerCU();
+ return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(),
+ FlatWorkGroupSize);
}
/// \returns Minimum flat work group size supported by the subtarget.
unsigned getMinFlatWorkGroupSize() const {
- return 1;
+ return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits());
}
/// \returns Maximum flat work group size supported by the subtarget.
unsigned getMaxFlatWorkGroupSize() const {
- return 2048;
+ return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits());
}
- /// \returns Number of waves per work group given the flat work group size.
+ /// \returns Number of waves per work group supported by the subtarget and
+ /// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
+ return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(),
+ FlatWorkGroupSize);
}
- void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
- bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
-
/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
/// for function \p F, or minimum/maximum flat work group sizes explicitly
/// requested using "amdgpu-flat-work-group-size" attribute attached to
@@ -440,6 +577,9 @@ public:
/// compatible with minimum/maximum number of waves limited by flat work group
/// size, register usage, and/or lds usage.
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
  /// Creates value range metadata on a workitemid.* intrinsic call or load.
+ bool makeLIDRangeMetadata(Instruction *I) const;
};
class R600Subtarget final : public AMDGPUSubtarget {
@@ -482,13 +622,6 @@ public:
};
class SISubtarget final : public AMDGPUSubtarget {
-public:
- enum {
- // The closed Vulkan driver sets 96, which limits the wave count to 8 but
- // doesn't spill SGPRs as much as when 80 is set.
- FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
- };
-
private:
SIInstrInfo InstrInfo;
SIFrameLowering FrameLowering;
@@ -516,6 +649,21 @@ public:
return GISel->getCallLowering();
}
+ const InstructionSelector *getInstructionSelector() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getInstructionSelector();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getLegalizerInfo();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ assert(GISel && "Access to GlobalISel APIs not set");
+ return GISel->getRegBankInfo();
+ }
+
const SIRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
@@ -524,6 +672,11 @@ public:
this->GISel.reset(&GISel);
}
+ // XXX - Why is this here if it isn't in the default pass set?
+ bool enableEarlyIfConversion() const override {
+ return true;
+ }
+
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
@@ -533,10 +686,6 @@ public:
return 16;
}
- bool hasFlatAddressSpace() const {
- return FlatAddressSpace;
- }
-
bool hasSMemRealTime() const {
return HasSMemRealTime;
}
@@ -549,6 +698,10 @@ public:
return HasVGPRIndexMode;
}
+ bool useVGPRIndexMode(bool UserEnable) const {
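+    // Fall back to VGPR indexing when movrel is unavailable; otherwise use
+    // it only when the user opted in and the subtarget supports it.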
+ return !hasMovrel() || (UserEnable && hasVGPRIndexMode());
+ }
+
bool hasScalarCompareEq64() const {
return getGeneration() >= VOLCANIC_ISLANDS;
}
@@ -561,6 +714,10 @@ public:
return HasInv2PiInlineImm;
}
+ bool hasDPP() const {
+ return HasDPP;
+ }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
@@ -594,6 +751,14 @@ public:
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}
+ bool hasSMovFedHazard() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
+ bool hasReadM0Hazard() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX9;
+ }
+
  unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplicitArgBytes) const;
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
@@ -602,13 +767,104 @@ public:
/// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
- /// \returns True if waitcnt instruction is needed before barrier instruction,
- /// false otherwise.
- bool needWaitcntBeforeBarrier() const {
- return true;
+ /// \returns true if the flat_scratch register should be initialized with the
+ /// pointer to the wave's scratch memory rather than a size and offset.
+ bool flatScratchIsPointer() const {
+ return getGeneration() >= GFX9;
+ }
+
+ /// \returns SGPR allocation granularity supported by the subtarget.
+ unsigned getSGPRAllocGranule() const {
+ return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits());
+ }
+
+ /// \returns SGPR encoding granularity supported by the subtarget.
+ unsigned getSGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits());
+ }
+
+ /// \returns Total number of SGPRs supported by the subtarget.
+ unsigned getTotalNumSGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits());
+ }
+
+ /// \returns Addressable number of SGPRs supported by the subtarget.
+ unsigned getAddressableNumSGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits());
+ }
+
+ /// \returns Minimum number of SGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU);
+ }
+
+ /// \returns Maximum number of SGPRs that meets the given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
+ return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU,
+ Addressable);
+ }
+
+ /// \returns Reserved number of SGPRs for given function \p MF.
+ unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns Maximum number of SGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of SGPRs explicitly
+ /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+ /// \returns VGPR allocation granularity supported by the subtarget.
+ unsigned getVGPRAllocGranule() const {
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());
}
- unsigned getMaxNumSGPRs() const;
+ /// \returns VGPR encoding granularity supported by the subtarget.
+ unsigned getVGPREncodingGranule() const {
+ return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits());
+ }
+
+ /// \returns Total number of VGPRs supported by the subtarget.
+ unsigned getTotalNumVGPRs() const {
+ return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits());
+ }
+
+ /// \returns Addressable number of VGPRs supported by the subtarget.
+ unsigned getAddressableNumVGPRs() const {
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits());
+ }
+
+ /// \returns Minimum number of VGPRs that meets given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU);
+ }
+
+ /// \returns Maximum number of VGPRs that meets given number of waves per
+ /// execution unit requirement supported by the subtarget.
+ unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU);
+ }
+
+ /// \returns Reserved number of VGPRs for given function \p MF.
+ unsigned getReservedNumVGPRs(const MachineFunction &MF) const {
+ return debuggerReserveRegs() ? 4 : 0;
+ }
+
+ /// \returns Maximum number of VGPRs that meets number of waves per execution
+ /// unit requirement for function \p MF, or number of VGPRs explicitly
+ /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
+ ///
+ /// \returns Value that meets number of waves per execution unit requirement
+ /// if explicitly requested value cannot be converted to integer, violates
+ /// subtarget's specifications, or does not meet number of waves per execution
+ /// unit requirement.
+ unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d8a0c71..dc868f0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,34 +15,37 @@
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPULegalizerInfo.h"
+#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
-#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Vectorize.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
#include <memory>
using namespace llvm;
@@ -58,6 +61,11 @@ static cl::opt<bool> EnableSROA(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<bool>
+EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+ cl::desc("Run early if-conversion"),
+ cl::init(false));
+
static cl::opt<bool> EnableR600IfConvert(
"r600-if-convert",
cl::desc("Use if conversion pass"),
@@ -75,6 +83,43 @@ static cl::opt<bool> EnableLoadStoreVectorizer(
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
+ cl::init(true),
+ cl::Hidden);
+
+// Option to run internalize pass.
+static cl::opt<bool> InternalizeSymbols(
+ "amdgpu-internalize-symbols",
+ cl::desc("Enable elimination of non-kernel functions and unused globals"),
+ cl::init(false),
+ cl::Hidden);
+
+// Option to inline all early.
+static cl::opt<bool> EarlyInlineAll(
+ "amdgpu-early-inline-all",
+ cl::desc("Inline all functions early"),
+ cl::init(false),
+ cl::Hidden);
+
+static cl::opt<bool> EnableSDWAPeephole(
+ "amdgpu-sdwa-peephole",
+ cl::desc("Enable SDWA peepholer"),
+ cl::init(true));
+
+// Enable address space based alias analysis
+static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+ cl::desc("Enable AMDGPU Alias Analysis"),
+ cl::init(true));
+
+// Option to enable new waitcnt insertion pass.
+static cl::opt<bool> EnableSIInsertWaitcntsPass(
+ "enable-si-insert-waitcnts",
+ cl::desc("Use new waitcnt insertion pass"),
+ cl::init(true));
+
+// Option to run late CFG structurizer
+static cl::opt<bool> LateCFGStructurize(
+ "amdgpu-late-structurize",
+ cl::desc("Enable late CFG structurization"),
cl::init(false),
cl::Hidden);
@@ -86,22 +131,29 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
PassRegistry *PR = PassRegistry::getPassRegistry();
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
+ initializeSIFixVGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
+ initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
+ initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);
+ initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
+ initializeAMDGPUAAWrapperPassPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -119,13 +171,27 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
- new ScheduleDAGMILive(C,
- llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+ new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+ DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ return DAG;
+}
+
+static ScheduleDAGInstrs *
+createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ auto DAG = new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
+static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
+ return new GCNIterativeScheduler(C,
+ GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -139,6 +205,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);
+static MachineSchedRegistry
+IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
+ "Run GCN scheduler to maximize occupancy (experimental)",
+ createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry
+GCNMinRegSchedRegistry("gcn-minreg",
+ "Run GCN iterative scheduler for minimal register usage (experimental)",
+ createMinRegScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
@@ -148,9 +224,14 @@ static StringRef computeDataLayout(const Triple &TT) {
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat.
- return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ if (TT.getEnvironmentName() == "amdgiz" ||
+ TT.getEnvironmentName() == "amdgizcl")
+ return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
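Aside: the layout strings above are consumed by llvm::DataLayout. A short
sketch, assuming an LLVM development tree to compile against, that queries an
abbreviated form of the "amdgiz" string; private pointers (address space 5)
stay 32-bit while generic pointers become 64-bit:

    #include "llvm/IR/DataLayout.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      // Abbreviated form of the amdgiz layout string above.
      llvm::DataLayout DL("e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32"
                          "-p5:32:32-i64:64-n32:64-A5");
      llvm::outs() << "AS0 pointer bits: " << DL.getPointerSizeInBits(0) << '\n';
      llvm::outs() << "AS5 pointer bits: " << DL.getPointerSizeInBits(5) << '\n';
    }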
LLVM_READNONE
@@ -180,6 +261,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
TLOF(createTLOF(getTargetTriple())) {
+ AS = AMDGPU::getAMDGPUAS(TT);
initAsmInfo();
}
@@ -199,8 +281,74 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
FSAttr.getValueAsString();
}
-void AMDGPUTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
- PM.add(createAMDGPUUnifyMetadataPass());
+static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
+ return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+ AAR.addAAResult(WrapperPass->getResult());
+ });
+}
+
+void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+ Builder.DivergentTarget = true;
+
+ bool Internalize = InternalizeSymbols &&
+ (getOptLevel() > CodeGenOpt::None) &&
+ (getTargetTriple().getArch() == Triple::amdgcn);
+ bool EarlyInline = EarlyInlineAll &&
+ (getOptLevel() > CodeGenOpt::None);
+ bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_ModuleOptimizerEarly,
+ [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
+ if (AMDGPUAA) {
+ PM.add(createAMDGPUAAWrapperPass());
+ PM.add(createAMDGPUExternalAAWrapperPass());
+ }
+ PM.add(createAMDGPUUnifyMetadataPass());
+ if (Internalize) {
+ PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
+ if (const Function *F = dyn_cast<Function>(&GV)) {
+ if (F->isDeclaration())
+ return true;
+ switch (F->getCallingConv()) {
+ default:
+ return false;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ }
+ }
+ return !GV.use_empty();
+ }));
+ PM.add(createGlobalDCEPass());
+ }
+ if (EarlyInline)
+ PM.add(createAMDGPUAlwaysInlinePass(false));
+ });
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_EarlyAsPossible,
+ [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ if (AMDGPUAA) {
+ PM.add(createAMDGPUAAWrapperPass());
+ PM.add(createAMDGPUExternalAAWrapperPass());
+ }
+ });
+
+ Builder.addExtension(
+ PassManagerBuilder::EP_CGSCCOptimizerLate,
+ [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ // Add infer address spaces pass to the opt pipeline after inlining
+ // but before SROA to increase SROA opportunities.
+ PM.add(createInferAddressSpacesPass());
+ });
}
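Aside: the internalize callback above returns true to keep a symbol externally
visible. A standalone restatement of that decision, with the IR-level checks
reduced to plain flags (types and field names are illustrative, not LLVM's):

    #include <cstdio>

    // Stand-ins for the checks performed in the lambda above.
    struct Sym {
      bool IsFunction;
      bool IsDeclaration; // meaningful for functions only
      bool IsEntryPoint;  // kernel / shader calling conventions
      bool HasUses;       // meaningful for non-function globals
    };

    // true == keep externally visible; false == safe to internalize.
    bool mustPreserve(const Sym &S) {
      if (S.IsFunction) {
        if (S.IsDeclaration)
          return true;
        return S.IsEntryPoint; // AMDGPU_KERNEL, SPIR_KERNEL, shader CCs
      }
      return S.HasUses; // mirrors !GV.use_empty() above
    }

    int main() {
      Sym Kernel{true, false, true, false};
      Sym Helper{true, false, false, true};
      std::printf("kernel kept: %d, helper kept: %d\n",
                  mustPreserve(Kernel), mustPreserve(Helper));
    }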
//===----------------------------------------------------------------------===//
@@ -240,19 +388,6 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-
-struct SIGISelActualAccessor : public GISelAccessor {
- std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
- const AMDGPUCallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-};
-
-} // end anonymous namespace
-#endif
-
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
TargetOptions Options,
@@ -274,16 +409,6 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
-
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
- GISel->CallLoweringInfo.reset(
- new AMDGPUCallLowering(*I->getTargetLowering()));
-#endif
-
- I->setGISelAccessor(*GISel);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
@@ -299,7 +424,7 @@ namespace {
class AMDGPUPassConfig : public TargetPassConfig {
public:
- AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
+ AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
// Exceptions and StackMaps are not supported, so these passes will never do
// anything.
@@ -330,7 +455,7 @@ public:
class R600PassConfig final : public AMDGPUPassConfig {
public:
- R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
+ R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {}
ScheduleDAGInstrs *createMachineScheduler(
@@ -346,7 +471,7 @@ public:
class GCNPassConfig final : public AMDGPUPassConfig {
public:
- GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
+ GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {}
GCNTargetMachine &getGCNTargetMachine() const {
@@ -356,9 +481,9 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override;
- void addIRPasses() override;
bool addPreISel() override;
void addMachineSSAOptimization() override;
+ bool addILPOpts() override;
bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
bool addIRTranslator() override;
@@ -406,11 +531,15 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
}
void AMDGPUPassConfig::addIRPasses() {
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ addPass(createAMDGPULowerIntrinsicsPass());
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
@@ -421,17 +550,32 @@ void AMDGPUPassConfig::addIRPasses() {
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // TODO: May want to move later or split into an early and late one.
+
+ addPass(createAMDGPUCodeGenPreparePass());
+ }
+
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
- const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
if (TM.getOptLevel() > CodeGenOpt::None) {
- addPass(createAMDGPUPromoteAlloca(&TM));
+ addPass(createInferAddressSpacesPass());
+ addPass(createAMDGPUPromoteAlloca());
if (EnableSROA)
addPass(createSROAPass());
addStraightLineScalarOptimizationPasses();
+
+ if (EnableAMDGPUAliasAnalysis) {
+ addPass(createAMDGPUAAWrapperPass());
+ addPass(createExternalAAWrapperPass([](Pass &P, Function &,
+ AAResults &AAR) {
+ if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+ AAR.addAAResult(WrapperPass->getResult());
+ }));
+ }
}
TargetPassConfig::addIRPasses();
@@ -487,26 +631,26 @@ bool R600PassConfig::addPreISel() {
}
void R600PassConfig::addPreRegAlloc() {
- addPass(createR600VectorRegMerger(*TM));
+ addPass(createR600VectorRegMerger());
}
void R600PassConfig::addPreSched2() {
addPass(createR600EmitClauseMarkers(), false);
if (EnableR600IfConvert)
addPass(&IfConverterID, false);
- addPass(createR600ClauseMergePass(*TM), false);
+ addPass(createR600ClauseMergePass(), false);
}
void R600PassConfig::addPreEmitPass() {
addPass(createAMDGPUCFGStructurizerPass(), false);
- addPass(createR600ExpandSpecialInstrsPass(*TM), false);
+ addPass(createR600ExpandSpecialInstrsPass(), false);
addPass(&FinalizeMachineBundlesID, false);
- addPass(createR600Packetizer(*TM), false);
- addPass(createR600ControlFlowFinalizer(*TM), false);
+ addPass(createR600Packetizer(), false);
+ addPass(createR600ControlFlowFinalizer(), false);
}
TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new R600PassConfig(this, PM);
+ return new R600PassConfig(*this, PM);
}
//===----------------------------------------------------------------------===//
@@ -526,12 +670,19 @@ bool GCNPassConfig::addPreISel() {
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
- addPass(&AMDGPUAnnotateKernelFeaturesID);
- addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ addPass(createAMDGPUAnnotateKernelFeaturesPass());
+
+ // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
+ // regions formed by them.
+ addPass(&AMDGPUUnifyDivergentExitNodesID);
+ if (!LateCFGStructurize) {
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ }
addPass(createSinkingPass());
- addPass(createSITypeRewriter());
addPass(createAMDGPUAnnotateUniformValues());
- addPass(createSIAnnotateControlFlowPass());
+ if (!LateCFGStructurize) {
+ addPass(createSIAnnotateControlFlowPass());
+ }
return false;
}
@@ -549,13 +700,22 @@ void GCNPassConfig::addMachineSSAOptimization() {
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
+ if (EnableSDWAPeephole) {
+ addPass(&SIPeepholeSDWAID);
+ addPass(&MachineLICMID);
+ addPass(&MachineCSEID);
+ addPass(&SIFoldOperandsID);
+ addPass(&DeadMachineInstructionElimID);
+ }
+ addPass(createSIShrinkInstructionsPass());
}
-void GCNPassConfig::addIRPasses() {
- // TODO: May want to move later or split into an early and late one.
- addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
+bool GCNPassConfig::addILPOpts() {
+ if (EnableEarlyIfConversion)
+ addPass(&EarlyIfConverterID);
- AMDGPUPassConfig::addIRPasses();
+ TargetPassConfig::addILPOpts();
+ return false;
}
bool GCNPassConfig::addInstSelector() {
@@ -572,20 +732,26 @@ bool GCNPassConfig::addIRTranslator() {
}
bool GCNPassConfig::addLegalizeMachineIR() {
+ addPass(new Legalizer());
return false;
}
bool GCNPassConfig::addRegBankSelect() {
+ addPass(new RegBankSelect());
return false;
}
bool GCNPassConfig::addGlobalInstructionSelect() {
+ addPass(new InstructionSelect());
return false;
}
+
#endif
void GCNPassConfig::addPreRegAlloc() {
- addPass(createSIShrinkInstructionsPass());
+ if (LateCFGStructurize) {
+ addPass(createAMDGPUMachineCFGStructurizerPass());
+ }
addPass(createSIWholeQuadModePass());
}
@@ -615,6 +781,7 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
}
void GCNPassConfig::addPostRegAlloc() {
+ addPass(&SIFixVGPRCopiesID);
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
}
@@ -633,7 +800,10 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);
- addPass(createSIInsertWaitsPass());
+ if (EnableSIInsertWaitcntsPass)
+ addPass(createSIInsertWaitcntsPass());
+ else
+ addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
addPass(&SIInsertSkipsPassID);
addPass(createSIDebuggerInsertNopsPass());
@@ -641,5 +811,6 @@ void GCNPassConfig::addPreEmitPass() {
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new GCNPassConfig(this, PM);
+ return new GCNPassConfig(*this, PM);
}
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 9496773..a3c7c19 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -35,6 +35,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
AMDGPUIntrinsicInfo IntrinsicInfo;
+ AMDGPUAS AS;
StringRef getGPUName(const Function &F) const;
StringRef getFeatureString(const Function &F) const;
@@ -57,7 +58,17 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
- void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+ AMDGPUAS getAMDGPUAS() const {
+ return AS;
+ }
+
+ void adjustPassManager(PassManagerBuilder &) override;
+ /// Get the integer value of a null pointer in the given address space.
+ uint64_t getNullPointerValue(unsigned AddrSpace) const {
+ if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS)
+ return -1;
+ return 0;
+ }
};
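Aside: a sketch of the null-pointer convention encoded by getNullPointerValue
above. The address-space numbers below are assumed for illustration only:

    #include <cstdint>
    #include <cstdio>

    constexpr unsigned LOCAL_ADDRESS = 3;  // assumed numbering
    constexpr unsigned REGION_ADDRESS = 4; // assumed numbering

    uint64_t nullPointerValue(unsigned AddrSpace) {
      // Offset 0 is a valid LDS/GDS location, so null must use a different
      // bit pattern in those spaces; all-ones (-1) serves that purpose.
      if (AddrSpace == LOCAL_ADDRESS || AddrSpace == REGION_ADDRESS)
        return ~UINT64_C(0);
      return 0;
    }

    int main() {
      std::printf("null in local AS = 0x%016llx\n",
                  (unsigned long long)nullPointerValue(LOCAL_ADDRESS));
    }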
//===----------------------------------------------------------------------===//
@@ -77,6 +88,10 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
const R600Subtarget *getSubtargetImpl(const Function &) const override;
+
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index 1fddc88..6c1885e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -9,10 +9,11 @@
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/ELF.h"
-#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;
@@ -22,7 +23,8 @@ using namespace llvm;
MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) &&
+ auto AS = static_cast<const AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
+ if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO, AS) &&
AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple()))
return TextSection;
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index de32778..ca6210f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -16,6 +16,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
+#include "AMDGPU.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index e904870..89a0390 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -20,8 +20,8 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
@@ -29,8 +29,41 @@ using namespace llvm;
#define DEBUG_TYPE "AMDGPUtti"
+static cl::opt<unsigned> UnrollThresholdPrivate(
+ "amdgpu-unroll-threshold-private",
+ cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
+ cl::init(2500), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdLocal(
+ "amdgpu-unroll-threshold-local",
+ cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
+ cl::init(1000), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdIf(
+ "amdgpu-unroll-threshold-if",
+ cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
+ cl::init(150), cl::Hidden);
+
+static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
+ unsigned Depth = 0) {
+ const Instruction *I = dyn_cast<Instruction>(Cond);
+ if (!I)
+ return false;
-void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
+ for (const Value *V : I->operand_values()) {
+ if (!L->contains(I))
+ continue;
+ if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
+ if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
+ return SubLoop->contains(PHI); }))
+ return true;
+ } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
+ return true;
+ }
+ return false;
+}
+
+void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Threshold = 300; // Twice the default.
UP.MaxCount = UINT_MAX;
@@ -38,36 +71,122 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
// TODO: Do we want runtime unrolling?
+ // Maximum alloca size that can fit in registers. Reserve 16 registers.
+ const unsigned MaxAlloca = (256 - 16) * 4;
+ unsigned ThresholdPrivate = UnrollThresholdPrivate;
+ unsigned ThresholdLocal = UnrollThresholdLocal;
+ unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+ AMDGPUAS ASST = ST->getAMDGPUAS();
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
+ unsigned LocalGEPsSeen = 0;
+
+ if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
+ return SubLoop->contains(BB); }))
+ continue; // Block belongs to an inner loop.
+
for (const Instruction &I : *BB) {
+
+ // Unroll a loop which contains an "if" statement whose condition is
+ // defined by a PHI belonging to the loop. This may help to eliminate the
+ // if region and potentially even the PHI itself, saving on both divergence
+ // and registers used for the PHI.
+ // Add a small bonus for each such "if" statement.
+ if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
+ if (UP.Threshold < MaxBoost && Br->isConditional()) {
+ if (L->isLoopExiting(Br->getSuccessor(0)) ||
+ L->isLoopExiting(Br->getSuccessor(1)))
+ continue;
+ if (dependsOnLocalPhi(L, Br->getCondition())) {
+ UP.Threshold += UnrollThresholdIf;
+ DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n" << *L << " due to " << *Br << '\n');
+ if (UP.Threshold >= MaxBoost)
+ return;
+ }
+ }
+ continue;
+ }
+
const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
- if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ if (!GEP)
+ continue;
+
+ unsigned AS = GEP->getAddressSpace();
+ unsigned Threshold = 0;
+ if (AS == ASST.PRIVATE_ADDRESS)
+ Threshold = ThresholdPrivate;
+ else if (AS == ASST.LOCAL_ADDRESS)
+ Threshold = ThresholdLocal;
+ else
+ continue;
+
+ if (UP.Threshold >= Threshold)
continue;
- const Value *Ptr = GEP->getPointerOperand();
- const AllocaInst *Alloca =
- dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
- if (Alloca) {
- // We want to do whatever we can to limit the number of alloca
- // instructions that make it through to the code generator. allocas
- // require us to use indirect addressing, which is slow and prone to
- // compiler bugs. If this loop does an address calculation on an
- // alloca ptr, then we want to use a higher than normal loop unroll
- // threshold. This will give SROA a better chance to eliminate these
- // allocas.
- //
- // Don't use the maximum allowed value here as it will make some
- // programs way too big.
- UP.Threshold = 800;
+ if (AS == ASST.PRIVATE_ADDRESS) {
+ const Value *Ptr = GEP->getPointerOperand();
+ const AllocaInst *Alloca =
+ dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+ if (!Alloca || !Alloca->isStaticAlloca())
+ continue;
+ Type *Ty = Alloca->getAllocatedType();
+ unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
+ if (AllocaSize > MaxAlloca)
+ continue;
+ } else if (AS == ASST.LOCAL_ADDRESS) {
+ LocalGEPsSeen++;
+ // Inhibit unrolling for local memory if we have seen addressing that is
+ // not based on a variable; most likely we will be unable to combine it.
+ // Do not unroll deeply nested inner loops for local memory, to give an
+ // outer loop a chance to be unrolled for a more important reason.
+ if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
+ (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
+ !isa<Argument>(GEP->getPointerOperand())))
+ continue;
}
+
+ // Check if GEP depends on a value defined by this loop itself.
+ bool HasLoopDef = false;
+ for (const Value *Op : GEP->operands()) {
+ const Instruction *Inst = dyn_cast<Instruction>(Op);
+ if (!Inst || L->isLoopInvariant(Op))
+ continue;
+
+ if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+ return SubLoop->contains(Inst); }))
+ continue;
+ HasLoopDef = true;
+ break;
+ }
+ if (!HasLoopDef)
+ continue;
+
+ // We want to do whatever we can to limit the number of alloca
+ // instructions that make it through to the code generator. allocas
+ // require us to use indirect addressing, which is slow and prone to
+ // compiler bugs. If this loop does an address calculation on an
+ // alloca ptr, then we want to use a higher than normal loop unroll
+ // threshold. This will give SROA a better chance to eliminate these
+ // allocas.
+ //
+ // We also want to have more unrolling for local memory to let ds
+ // instructions with different offsets combine.
+ //
+ // Don't use the maximum allowed value here as it will make some
+ // programs way too big.
+ UP.Threshold = Threshold;
+ DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
+ << *L << " due to " << *GEP << '\n');
+ if (UP.Threshold >= MaxBoost)
+ return;
}
}
}
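Aside: a condensed restatement of the threshold policy implemented above. The
constants mirror the cl::opt defaults earlier in this diff; everything else is
an illustrative stand-in for the IR checks:

    #include <algorithm>
    #include <cstdio>

    enum class MemKind { Private, Local, Other };

    // Pick the boosted unroll threshold for one qualifying GEP.
    unsigned boostFor(MemKind Kind, unsigned AllocaSize, unsigned Threshold) {
      const unsigned MaxAlloca = (256 - 16) * 4; // bytes that fit in registers
      const unsigned ThresholdPrivate = 2500; // amdgpu-unroll-threshold-private
      const unsigned ThresholdLocal = 1000;   // amdgpu-unroll-threshold-local

      switch (Kind) {
      case MemKind::Private:
        // Boost only when SROA has a realistic chance: the alloca must be
        // small enough to be promoted into registers.
        if (AllocaSize <= MaxAlloca)
          return std::max(Threshold, ThresholdPrivate);
        break;
      case MemKind::Local:
        return std::max(Threshold, ThresholdLocal);
      case MemKind::Other:
        break;
      }
      return Threshold;
    }

    int main() {
      std::printf("%u\n", boostFor(MemKind::Private, 64, 300)); // prints 2500
    }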
-unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
- if (Vec)
- return 0;
+unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
+ // The concept of vector registers doesn't really exist. Some packed vector
+ // operations operate on the normal 32-bit registers.
// Number of VGPRs on SI.
if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
@@ -76,35 +195,73 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
-unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
- return Vector ? 0 : 32;
+unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const {
+ // This is really the number of registers to fill when vectorizing /
+ // interleaving loops, so we lie to avoid trying to use all registers.
+ return getHardwareNumberOfRegisters(Vec) >> 3;
+}
+
+unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) const {
+ return 32;
+}
+
+unsigned AMDGPUTTIImpl::getMinVectorRegisterBitWidth() const {
+ return 32;
}
unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
- switch (AddrSpace) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::FLAT_ADDRESS:
+ AMDGPUAS AS = ST->getAMDGPUAS();
+ if (AddrSpace == AS.GLOBAL_ADDRESS ||
+ AddrSpace == AS.CONSTANT_ADDRESS ||
+ AddrSpace == AS.FLAT_ADDRESS)
return 128;
- case AMDGPUAS::LOCAL_ADDRESS:
- case AMDGPUAS::REGION_ADDRESS:
+ if (AddrSpace == AS.LOCAL_ADDRESS ||
+ AddrSpace == AS.REGION_ADDRESS)
return 64;
- case AMDGPUAS::PRIVATE_ADDRESS:
+ if (AddrSpace == AS.PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- default:
- if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
- (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
- AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
- (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
- AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
- return 128;
- llvm_unreachable("unhandled address space");
+
+ if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
+ (AddrSpace == AS.PARAM_D_ADDRESS ||
+ AddrSpace == AS.PARAM_I_ADDRESS ||
+ (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+ AddrSpace <= AS.CONSTANT_BUFFER_15)))
+ return 128;
+ llvm_unreachable("unhandled address space");
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ // We allow vectorization of flat stores, even though we may need to decompose
+ // them later if they may access private memory. We don't have enough context
+ // here, and legalization can handle it.
+ if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
+ return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
+ ChainSizeInBytes <= ST->getMaxPrivateElementSize();
}
+ return true;
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const {
+ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
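Aside: the private-memory rule above, as one standalone predicate with the two
subtarget queries reduced to parameters:

    // Legal iff the chain is 4-byte aligned (or unaligned scratch access is
    // supported) and it fits within the maximum private element size.
    bool legalPrivateChain(unsigned ChainSizeInBytes, unsigned Alignment,
                           bool HasUnalignedScratchAccess,
                           unsigned MaxPrivateElementSize) {
      return (Alignment >= 4 || HasUnalignedScratchAccess) &&
             ChainSizeInBytes <= MaxPrivateElementSize;
    }

For example, a 16-byte chain at 4-byte alignment is legal only when the
maximum private element size is at least 16.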
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
- // Semi-arbitrary large amount.
- return 64;
+ // Disable unrolling if the loop is not vectorized.
+ // TODO: Enable this again.
+ if (VF == 1)
+ return 1;
+
+ return 8;
}
int AMDGPUTTIImpl::getArithmeticInstrCost(
@@ -216,28 +373,29 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
unsigned Index) {
switch (Opcode) {
case Instruction::ExtractElement:
- case Instruction::InsertElement:
+ case Instruction::InsertElement: {
+ unsigned EltSize
+ = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
+ if (EltSize < 32) {
+ if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
+ return 0;
+ return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
+ }
+
// Extracts are just reads of a subregister, so are free. Inserts are
// considered free because we don't want to have any cost for scalarizing
// operations, and we don't have to copy into a different register class.
// Dynamic indexing isn't free and is best avoided.
return Index == ~0u ? 2 : 0;
+ }
default:
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
}
-static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
- const IntrinsicInst *I) {
+static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::not_intrinsic:
- // This means we have an intrinsic that isn't defined in
- // IntrinsicsAMDGPU.td
- break;
-
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::amdgcn_workitem_id_z:
@@ -249,6 +407,8 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::r600_read_tidig_x:
case Intrinsic::r600_read_tidig_y:
case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_image_atomic_swap:
case Intrinsic::amdgcn_image_atomic_add:
case Intrinsic::amdgcn_image_atomic_sub:
@@ -274,16 +434,10 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_ps_live:
+ case Intrinsic::amdgcn_ds_swizzle:
return true;
- }
-
- StringRef Name = I->getCalledFunction()->getName();
- switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
default:
return false;
- case AMDGPUIntrinsic::SI_fs_interp:
- case AMDGPUIntrinsic::SI_fs_constant:
- return true;
}
}
@@ -291,16 +445,24 @@ static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
// Arguments to compute shaders are never a source of divergence.
- if (!AMDGPU::isShader(F->getCallingConv()))
- return true;
-
- // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
- if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
- F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
+ CallingConv::ID CC = F->getCallingConv();
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
return true;
-
- // Everything else is in VGPRs.
- return false;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
+ // Everything else is in VGPRs.
+ return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
+ F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
+ default:
+ // TODO: Should calls support inreg for SGPR inputs?
+ return false;
+ }
}
///
@@ -318,7 +480,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
// All other loads are not divergent, because if threads issue loads with the
// same arguments, they will always get the same result.
if (const LoadInst *Load = dyn_cast<LoadInst>(V))
- return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+ return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
// Atomics are divergent because they are executed sequentially: when an
// atomic operation refers to the same address in each thread, then each
@@ -327,10 +489,8 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
return true;
- if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- const TargetMachine &TM = getTLI()->getTargetMachine();
- return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
- }
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
+ return isIntrinsicSourceOfDivergence(Intrinsic);
// Assume all function calls are a source of divergence.
if (isa<CallInst>(V) || isa<InvokeInst>(V))
@@ -338,3 +498,39 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
return false;
}
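Aside: the classification implemented above and in
isIntrinsicSourceOfDivergence, collapsed into one illustrative predicate; the
fields stand in for the real IR checks:

    struct ValueDesc {
      bool IsPrivateLoad;     // load through a private-address-space pointer
      bool IsAtomic;          // atomicrmw / cmpxchg execute sequentially
      bool IsDivergentIntrin; // e.g. workitem ids, image/buffer atomics
      bool IsCall;            // calls are conservatively divergent
    };

    bool isSourceOfDivergence(const ValueDesc &V) {
      return V.IsPrivateLoad || V.IsAtomic || V.IsDivergentIntrin || V.IsCall;
    }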
+
+bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane:
+ return true;
+ }
+ }
+ return false;
+}
+
+unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ if (ST->hasVOP3PInsts()) {
+ VectorType *VT = cast<VectorType>(Tp);
+ if (VT->getNumElements() == 2 &&
+ DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+ // With op_sel VOP3P instructions freely can access the low half or high
+ // half of a register, so any swizzle is free.
+
+ switch (Kind) {
+ case TTI::SK_Broadcast:
+ case TTI::SK_Reverse:
+ case TTI::SK_PermuteSingleSrc:
+ return 0;
+ default:
+ break;
+ }
+ }
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
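Aside: the op_sel rule above, reduced to a freestanding function with the
VOP3P capability and base cost turned into parameters:

    enum class ShuffleKind { Broadcast, Reverse, PermuteSingleSrc, Other };

    unsigned shuffleCost(bool HasVOP3P, unsigned NumElts, unsigned EltBits,
                         ShuffleKind Kind, unsigned BaseCost) {
      // op_sel lets VOP3P instructions read either half of a 32-bit register,
      // so any swizzle of a 2 x 16-bit vector is free.
      if (HasVOP3P && NumElts == 2 && EltBits == 16 &&
          Kind != ShuffleKind::Other)
        return 0;
      return BaseCost;
    }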
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 0d83b2a..9a320bd 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -32,6 +32,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
const AMDGPUSubtarget *ST;
const AMDGPUTargetLowering *TLI;
+ bool IsGraphicsShader;
const AMDGPUSubtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
@@ -62,20 +63,35 @@ public:
explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()),
ST(TM->getSubtargetImpl(F)),
- TLI(ST->getTargetLowering()) {}
+ TLI(ST->getTargetLowering()),
+ IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
bool hasBranchDivergence() { return true; }
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TTI::PSK_FastHardware;
}
- unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getHardwareNumberOfRegisters(bool Vector) const;
+ unsigned getNumberOfRegisters(bool Vector) const;
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
@@ -90,8 +106,21 @@ public:
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
+ bool isAlwaysUniform(const Value *V) const;
+
+ unsigned getFlatAddressSpace() const {
+ // Don't bother running InferAddressSpaces pass on graphics shaders which
+ // don't use flat addressing.
+ if (IsGraphicsShader)
+ return -1;
+ return ST->hasFlatAddressSpace() ?
+ ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE;
+ }
unsigned getVectorSplitCost() { return 0; }
+
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp);
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
new file mode 100644
index 0000000..309913f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -0,0 +1,225 @@
+//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyDivergentExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to a return and another to an unreachable; replace
+// the unreachable with a return in that case.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
+
+namespace {
+
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+ initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
+ }
+
+ // We can preserve non-critical-edgeness when we unify function exit nodes
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+
+}
+
+char AMDGPUUnifyDivergentExitNodes::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+ "Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+ "Unify divergent function exit nodes", false, false)
+
+char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+ // TODO: Preserve dominator tree.
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+
+ AU.addRequired<DivergenceAnalysis>();
+
+ // No divergent values are changed, only blocks and branch edges.
+ AU.addPreserved<DivergenceAnalysis>();
+
+ // We preserve the non-critical-edgeness property
+ AU.addPreservedID(BreakCriticalEdgesID);
+
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(LowerSwitchID);
+ FunctionPass::getAnalysisUsage(AU);
+
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+/// \returns true if \p BB is reachable through only uniform branches.
+/// XXX - Is there a more efficient way to find this?
+static bool isUniformlyReached(const DivergenceAnalysis &DA,
+ BasicBlock &BB) {
+ SmallVector<BasicBlock *, 8> Stack;
+ SmallPtrSet<BasicBlock *, 8> Visited;
+
+ for (BasicBlock *Pred : predecessors(&BB))
+ Stack.push_back(Pred);
+
+ while (!Stack.empty()) {
+ BasicBlock *Top = Stack.pop_back_val();
+ if (!DA.isUniform(Top->getTerminator()))
+ return false;
+
+ for (BasicBlock *Pred : predecessors(Top)) {
+ if (Visited.insert(Pred).second)
+ Stack.push_back(Pred);
+ }
+ }
+
+ return true;
+}
+
+static BasicBlock *unifyReturnBlockSet(Function &F,
+ ArrayRef<BasicBlock *> ReturningBlocks,
+ const TargetTransformInfo &TTI,
+ StringRef Name) {
+ // We need to insert a new basic block into the function, add a PHI node
+ // (if the function returns a value), and convert all of the return
+ // instructions into unconditional branches.
+ //
+ BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+
+ PHINode *PN = nullptr;
+ if (F.getReturnType()->isVoidTy()) {
+ ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ } else {
+ // If the function doesn't return void... add a PHI node to the block...
+ PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ NewRetBlock->getInstList().push_back(PN);
+ ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ }
+
+ // Loop over all of the blocks, replacing the return instruction with an
+ // unconditional branch.
+ //
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Add an incoming element to the PHI node for every return instruction that
+ // is merging into this new block...
+ if (PN)
+ PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+ BB->getInstList().pop_back(); // Remove the return insn
+ BranchInst::Create(NewRetBlock, BB);
+ }
+
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Clean up a possible branch to an unconditional branch to the return.
+ SimplifyCFG(BB, TTI, 2);
+ }
+
+ return NewRetBlock;
+}
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ if (PDT.getRoots().size() <= 1)
+ return false;
+
+ DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+
+ // Loop over all of the blocks in a function, tracking all of the blocks that
+ // return.
+ //
+ SmallVector<BasicBlock *, 4> ReturningBlocks;
+ SmallVector<BasicBlock *, 4> UnreachableBlocks;
+
+ for (BasicBlock *BB : PDT.getRoots()) {
+ if (isa<ReturnInst>(BB->getTerminator())) {
+ if (!isUniformlyReached(DA, *BB))
+ ReturningBlocks.push_back(BB);
+ } else if (isa<UnreachableInst>(BB->getTerminator())) {
+ if (!isUniformlyReached(DA, *BB))
+ UnreachableBlocks.push_back(BB);
+ }
+ }
+
+ if (!UnreachableBlocks.empty()) {
+ BasicBlock *UnreachableBlock = nullptr;
+
+ if (UnreachableBlocks.size() == 1) {
+ UnreachableBlock = UnreachableBlocks.front();
+ } else {
+ UnreachableBlock = BasicBlock::Create(F.getContext(),
+ "UnifiedUnreachableBlock", &F);
+ new UnreachableInst(F.getContext(), UnreachableBlock);
+
+ for (BasicBlock *BB : UnreachableBlocks) {
+ BB->getInstList().pop_back(); // Remove the unreachable inst.
+ BranchInst::Create(UnreachableBlock, BB);
+ }
+ }
+
+ if (!ReturningBlocks.empty()) {
+ // Don't create a new unreachable inst if we have a return. The
+ // structurizer/annotator can't handle the multiple exits
+
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+ UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+
+ Function *UnreachableIntrin =
+ Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
+
+ // Insert a call to an intrinsic tracking that this is an unreachable
+ // point, in case we want to kill the active lanes or something later.
+ CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
+
+ // Don't create a scalar trap. We would only want to trap if this code was
+ // really reached, but a scalar trap would happen even if no lanes
+ // actually reached here.
+ ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
+ ReturningBlocks.push_back(UnreachableBlock);
+ }
+ }
+
+ // Now handle return blocks.
+ if (ReturningBlocks.empty())
+ return false; // No blocks return
+
+ if (ReturningBlocks.size() == 1)
+ return false; // Already has a single return block
+
+ const TargetTransformInfo &TTI
+ = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+ return true;
+}
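Aside: the driver logic of runOnFunction above, restated with simple stand-ins
to show which exits the pass actually rewrites:

    #include <vector>

    struct Block { bool EndsInReturn; bool UniformlyReached; };

    // Only divergently-reached returns participate; the pass bails out when
    // fewer than two remain.
    std::vector<const Block *>
    divergentReturns(const std::vector<Block> &Exits) {
      std::vector<const Block *> Out;
      for (const Block &B : Exits)
        if (B.EndsInReturn && !B.UniformlyReached)
          Out.push_back(&B);
      return Out;
    }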
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
index bf501a1..3a0c3ed 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp
@@ -13,38 +13,39 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include <algorithm>
+#include <cassert>
using namespace llvm;
namespace {
+
namespace kOCLMD {
+
const char SpirVer[] = "opencl.spir.version";
const char OCLVer[] = "opencl.ocl.version";
const char UsedExt[] = "opencl.used.extensions";
const char UsedOptCoreFeat[] = "opencl.used.optional.core.features";
const char CompilerOptions[] = "opencl.compiler.options";
const char LLVMIdent[] = "llvm.ident";
- }
+
+ } // end namespace kOCLMD
/// \brief Unify multiple OpenCL metadata due to linking.
- class AMDGPUUnifyMetadata : public FunctionPass {
+ class AMDGPUUnifyMetadata : public ModulePass {
public:
static char ID;
- explicit AMDGPUUnifyMetadata() : FunctionPass(ID) {};
+ explicit AMDGPUUnifyMetadata() : ModulePass(ID) {};
private:
- // This should really be a module pass but we have to run it as early
- // as possible, so given function passes are executed first and
- // TargetMachine::addEarlyAsPossiblePasses() expects only function passes
- // it has to be a function pass.
virtual bool runOnModule(Module &M);
- // \todo: Convert to a module pass.
- virtual bool runOnFunction(Function &F);
-
/// \brief Unify version metadata.
/// \return true if changes are made.
/// Assume the named metadata has operands each of which is a pair of
@@ -117,7 +118,7 @@ INITIALIZE_PASS(AMDGPUUnifyMetadata, "amdgpu-unify-metadata",
"Unify multiple OpenCL metadata due to linking",
false, false)
-FunctionPass* llvm::createAMDGPUUnifyMetadataPass() {
+ModulePass* llvm::createAMDGPUUnifyMetadataPass() {
return new AMDGPUUnifyMetadata();
}
@@ -143,7 +144,3 @@ bool AMDGPUUnifyMetadata::runOnModule(Module &M) {
return Changed;
}
-
-bool AMDGPUUnifyMetadata::runOnFunction(Function &F) {
- return runOnModule(*F.getParent());
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 7faeccd..1a39384 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -9,27 +9,40 @@
//==-----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
+#include "R600RegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Dominators.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstddef>
#include <deque>
+#include <iterator>
+#include <map>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -53,15 +66,19 @@ STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks");
STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions");
namespace llvm {
+
void initializeAMDGPUCFGStructurizerPass(PassRegistry&);
-}
+
+} // end namespace llvm
+
+namespace {
//===----------------------------------------------------------------------===//
//
// Miscellaneous utility for CFGStructurizer.
//
//===----------------------------------------------------------------------===//
-namespace {
+
#define SHOWNEWINSTR(i) \
DEBUG(dbgs() << "New instr: " << *i << "\n");
@@ -82,35 +99,19 @@ DEBUG( \
#define INVALIDSCCNUM -1
-template<class NodeT>
-void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
- size_t sz = Src.size();
- for (size_t i = 0; i < sz/2; ++i) {
- NodeT *t = Src[i];
- Src[i] = Src[sz - i - 1];
- Src[sz - i - 1] = t;
- }
-}
-
-} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
//
// supporting data structure for CFGStructurizer
//
//===----------------------------------------------------------------------===//
-
-namespace {
-
class BlockInformation {
public:
- bool IsRetired;
- int SccNum;
- BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
-};
+ bool IsRetired = false;
+ int SccNum = INVALIDSCCNUM;
-} // end anonymous namespace
+ BlockInformation() = default;
+};
//===----------------------------------------------------------------------===//
//
@@ -118,7 +119,6 @@ public:
//
//===----------------------------------------------------------------------===//
-namespace {
class AMDGPUCFGStructurizer : public MachineFunctionPass {
public:
typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
@@ -133,8 +133,7 @@ public:
static char ID;
- AMDGPUCFGStructurizer() :
- MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
+ AMDGPUCFGStructurizer() : MachineFunctionPass(ID) {
initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
}
@@ -167,7 +166,7 @@ public:
MLI = &getAnalysis<MachineLoopInfo>();
DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
MDT = &getAnalysis<MachineDominatorTree>();
- DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
+ DEBUG(MDT->print(dbgs(), (const Module*)nullptr););
PDT = &getAnalysis<MachinePostDominatorTree>();
DEBUG(PDT->print(dbgs()););
prepare();
@@ -180,8 +179,8 @@ protected:
MachineDominatorTree *MDT;
MachinePostDominatorTree *PDT;
MachineLoopInfo *MLI;
- const R600InstrInfo *TII;
- const R600RegisterInfo *TRI;
+ const R600InstrInfo *TII = nullptr;
+ const R600RegisterInfo *TRI = nullptr;
// PRINT FUNCTIONS
/// Print the ordered Blocks.
@@ -198,6 +197,7 @@ protected:
}
}
}
+
static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
for (MachineLoop::iterator iter = LoopInfo.begin(),
iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
@@ -263,7 +263,6 @@ protected:
MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
static void wrapup(MachineBasicBlock *MBB);
-
int patternMatch(MachineBasicBlock *MBB);
int patternMatchGroup(MachineBasicBlock *MBB);
int serialPatternMatch(MachineBasicBlock *MBB);
@@ -328,7 +327,6 @@ protected:
void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
void retireBlock(MachineBasicBlock *MBB);
-
private:
MBBInfoMap BlockInfoMap;
LoopLandInfoMap LLInfoMap;
@@ -337,6 +335,10 @@ private:
SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
};
+char AMDGPUCFGStructurizer::ID = 0;
+
+} // end anonymous namespace
+
int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
if (It == BlockInfoMap.end())
@@ -379,6 +381,7 @@ bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
}
return false;
}
+
AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
bool AllowSideEntry) const {
@@ -697,10 +700,8 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
// (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
// there isn't such an interface yet. alternatively, replace all the other
// blocks in the jump table with the entryBlk //}
-
}
-
bool AMDGPUCFGStructurizer::prepare() {
bool Changed = false;
@@ -748,7 +749,6 @@ bool AMDGPUCFGStructurizer::prepare() {
}
bool AMDGPUCFGStructurizer::run() {
-
//Assume reducible CFG...
DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n");
@@ -886,8 +886,6 @@ bool AMDGPUCFGStructurizer::run() {
return true;
}
-
-
void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
int SccNum = 0;
MachineBasicBlock *MBB;
@@ -903,11 +901,8 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
}
}
- //walk through all the block in func to check for unreachable
- typedef GraphTraits<MachineFunction *> GTM;
- auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
- for (; It != E; ++It) {
- MachineBasicBlock *MBB = *It;
+ // Walk through all the blocks in the function to check for unreachable ones.
+ for (auto *MBB : nodes(MF)) {
SccNum = getSCCNum(MBB);
if (SccNum == INVALIDSCCNUM)
dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
@@ -941,7 +936,6 @@ int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
return NumMatch;
}
-
int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
if (MBB->succ_size() != 1)
return 0;
@@ -1039,7 +1033,7 @@ int AMDGPUCFGStructurizer::loopendPatternMatch() {
for (MachineLoop *ML : depth_first(It))
NestedLoops.push_front(ML);
- if (NestedLoops.size() == 0)
+ if (NestedLoops.empty())
return 0;
// Process nested loop outside->inside (we did push_front),
@@ -1074,13 +1068,9 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
MachineBasicBlock *ExitBlk = *ExitBlks.begin();
assert(ExitBlk && "Loop has several exit block");
MBBVector LatchBlks;
- typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
- InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
- PE = InvMBBTraits::child_end(LoopHeader);
- for (; PI != PE; PI++) {
- if (LoopRep->contains(*PI))
- LatchBlks.push_back(*PI);
- }
+ for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader))
+ if (LoopRep->contains(LB))
+ LatchBlks.push_back(LB);
for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
@@ -1217,7 +1207,7 @@ void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
}
}
- dbgs() << "\n";
+ dbgs() << "\n";
}
int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
@@ -1478,7 +1468,6 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
if (LandMBB && TrueMBB && FalseMBB)
MBB->addSuccessor(LandMBB);
-
}
void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
@@ -1491,7 +1480,6 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
DstBlk->replaceSuccessor(DstBlk, LandMBB);
}
-
void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
MachineBasicBlock *LandMBB) {
DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
@@ -1727,11 +1715,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
&& "can't retire block yet");
}
-char AMDGPUCFGStructurizer::ID = 0;
-
-} // end anonymous namespace
-
-
INITIALIZE_PASS_BEGIN(AMDGPUCFGStructurizer, "amdgpustructurizer",
"AMDGPU CFG Structurizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3cf9a1d..b37c2741 100644
--- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -11,17 +11,19 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIDefines.h"
+#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
-#include "Utils/AMDGPUAsmUtils.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -39,15 +41,11 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -56,7 +54,6 @@
#include <map>
#include <memory>
#include <string>
-#include <vector>
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -83,7 +80,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
const AMDGPUAsmParser *AsmParser;
public:
- AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
+ AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
: MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
typedef std::unique_ptr<AMDGPUOperand> Ptr;
@@ -155,12 +152,19 @@ public:
ImmTyExpTgt,
ImmTyExpCompr,
ImmTyExpVM,
+ ImmTyDFMT,
+ ImmTyNFMT,
ImmTyHwreg,
ImmTyOff,
ImmTySendMsg,
ImmTyInterpSlot,
ImmTyInterpAttr,
- ImmTyAttrChan
+ ImmTyAttrChan,
+ ImmTyOpSel,
+ ImmTyOpSelHi,
+ ImmTyNegLo,
+ ImmTyNegHi,
+ ImmTySwizzle
};
struct TokOp {
@@ -258,6 +262,8 @@ public:
return isOff() || isRegClass(AMDGPU::VGPR_32RegClassID);
}
+ bool isSDWARegKind() const;
+
bool isImmTy(ImmTy ImmT) const {
return isImm() && Imm.Type == ImmT;
}
@@ -283,10 +289,15 @@ public:
bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); }
bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
+
+ bool isOffsetU12() const { return isImmTy(ImmTyOffset) && isUInt<12>(getImm()); }
+ bool isOffsetS13() const { return isImmTy(ImmTyOffset) && isInt<13>(getImm()); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
+ bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); }
+ bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); }
bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
@@ -297,6 +308,10 @@ public:
bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); }
bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); }
bool isAttrChan() const { return isImmTy(ImmTyAttrChan); }
+ bool isOpSel() const { return isImmTy(ImmTyOpSel); }
+ bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
+ bool isNegLo() const { return isImmTy(ImmTyNegLo); }
+ bool isNegHi() const { return isImmTy(ImmTyNegHi); }
bool isMod() const {
return isClampSI() || isOModSI();
@@ -316,6 +331,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16);
}
+ bool isSCSrcV2B16() const {
+ return isSCSrcB16();
+ }
+
bool isSCSrcB32() const {
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32);
}
@@ -328,6 +347,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
}
+ bool isSCSrcV2F16() const {
+ return isSCSrcF16();
+ }
+
bool isSCSrcF32() const {
return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32);
}
@@ -344,6 +367,11 @@ public:
return isSCSrcB16() || isLiteralImm(MVT::i16);
}
+ bool isSSrcV2B16() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcB16();
+ }
+
bool isSSrcB64() const {
// TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
// See isVSrc64().
@@ -362,6 +390,11 @@ public:
return isSCSrcB16() || isLiteralImm(MVT::f16);
}
+ bool isSSrcV2F16() const {
+ llvm_unreachable("cannot happen");
+ return isSSrcF16();
+ }
+
bool isVCSrcB32() const {
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
@@ -374,6 +407,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
}
+ bool isVCSrcV2B16() const {
+ return isVCSrcB16();
+ }
+
bool isVCSrcF32() const {
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32);
}
@@ -386,6 +423,10 @@ public:
return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16);
}
+ bool isVCSrcV2F16() const {
+ return isVCSrcF16();
+ }
+
bool isVSrcB32() const {
return isVCSrcF32() || isLiteralImm(MVT::i32);
}
@@ -398,6 +439,11 @@ public:
return isVCSrcF16() || isLiteralImm(MVT::i16);
}
+ bool isVSrcV2B16() const {
+ llvm_unreachable("cannot happen");
+ return isVSrcB16();
+ }
+
bool isVSrcF32() const {
return isVCSrcF32() || isLiteralImm(MVT::f32);
}
@@ -410,6 +456,11 @@ public:
return isVCSrcF16() || isLiteralImm(MVT::f16);
}
+ bool isVSrcV2F16() const {
+ llvm_unreachable("cannot happen");
+ return isVSrcF16();
+ }
+
bool isKImmFP32() const {
return isLiteralImm(MVT::f32);
}
@@ -433,11 +484,14 @@ public:
bool isSWaitCnt() const;
bool isHwreg() const;
bool isSendMsg() const;
+ bool isSwizzle() const;
bool isSMRDOffset8() const;
bool isSMRDOffset20() const;
bool isSMRDLiteralOffset() const;
bool isDPPCtrl() const;
bool isGPRIdxMode() const;
+ bool isS16Imm() const;
+ bool isU16Imm() const;
StringRef getExpressionAsToken() const {
assert(isExpr());
@@ -459,7 +513,7 @@ public:
return Imm.Val;
}
- enum ImmTy getImmTy() const {
+ ImmTy getImmTy() const {
assert(isImm());
return Imm.Type;
}
@@ -501,9 +555,11 @@ public:
return getModifiers().hasIntModifiers();
}
+ uint64_t applyInputFPModifiers(uint64_t Val, unsigned Size) const;
+
void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const;
- void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
+ void addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const;
template <unsigned Bitwidth>
void addKImmFPOperands(MCInst &Inst, unsigned N) const;
@@ -586,6 +642,8 @@ public:
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
case ImmTyTFE: OS << "TFE"; break;
+ case ImmTyDFMT: OS << "DFMT"; break;
+ case ImmTyNFMT: OS << "NFMT"; break;
case ImmTyClampSI: OS << "ClampSI"; break;
case ImmTyOModSI: OS << "OModSI"; break;
case ImmTyDppCtrl: OS << "DppCtrl"; break;
@@ -610,6 +668,11 @@ public:
case ImmTyInterpSlot: OS << "InterpSlot"; break;
case ImmTyInterpAttr: OS << "InterpAttr"; break;
case ImmTyAttrChan: OS << "AttrChan"; break;
+ case ImmTyOpSel: OS << "OpSel"; break;
+ case ImmTyOpSelHi: OS << "OpSelHi"; break;
+ case ImmTyNegLo: OS << "NegLo"; break;
+ case ImmTyNegHi: OS << "NegHi"; break;
+ case ImmTySwizzle: OS << "Swizzle"; break;
}
}
@@ -636,7 +699,7 @@ public:
static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser,
int64_t Val, SMLoc Loc,
- enum ImmTy Type = ImmTyNone,
+ ImmTy Type = ImmTyNone,
bool IsFPImm = false) {
auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
Op->Imm.Val = Val;
@@ -695,9 +758,9 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) {
// Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next
// .amdgpu_hsa_kernel or at EOF.
class KernelScopeInfo {
- int SgprIndexUnusedMin;
- int VgprIndexUnusedMin;
- MCContext *Ctx;
+ int SgprIndexUnusedMin = -1;
+ int VgprIndexUnusedMin = -1;
+ MCContext *Ctx = nullptr;
void usesSgprAt(int i) {
if (i >= SgprIndexUnusedMin) {
@@ -708,6 +771,7 @@ class KernelScopeInfo {
}
}
}
+
void usesVgprAt(int i) {
if (i >= VgprIndexUnusedMin) {
VgprIndexUnusedMin = ++i;
@@ -717,14 +781,16 @@ class KernelScopeInfo {
}
}
}
+
public:
- KernelScopeInfo() : SgprIndexUnusedMin(-1), VgprIndexUnusedMin(-1), Ctx(nullptr)
- {}
+ KernelScopeInfo() = default;
+
void initialize(MCContext &Context) {
Ctx = &Context;
usesSgprAt(SgprIndexUnusedMin = -1);
usesVgprAt(VgprIndexUnusedMin = -1);
}
+
void usesRegister(RegisterKind RegKind, unsigned DwordRegIndex, unsigned RegWidth) {
switch (RegKind) {
case IS_SGPR: usesSgprAt(DwordRegIndex + RegWidth - 1); break;
@@ -738,9 +804,9 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
const MCInstrInfo &MII;
MCAsmParser &Parser;
- unsigned ForcedEncodingSize;
- bool ForcedDPP;
- bool ForcedSDWA;
+ unsigned ForcedEncodingSize = 0;
+ bool ForcedDPP = false;
+ bool ForcedSDWA = false;
KernelScopeInfo KernelScope;
/// @name Auto-generated Match Functions
@@ -756,55 +822,57 @@ private:
bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
bool ParseDirectiveHSACodeObjectVersion();
bool ParseDirectiveHSACodeObjectISA();
- bool ParseDirectiveRuntimeMetadata();
+ bool ParseDirectiveCodeObjectMetadata();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
- bool ParseSectionDirectiveHSAText();
bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
bool ParseDirectiveAMDGPUHsaKernel();
- bool ParseDirectiveAMDGPUHsaModuleGlobal();
- bool ParseDirectiveAMDGPUHsaProgramGlobal();
- bool ParseSectionDirectiveHSADataGlobalAgent();
- bool ParseSectionDirectiveHSADataGlobalProgram();
- bool ParseSectionDirectiveHSARodataReadonlyAgent();
- bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
- bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex);
- void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
+ bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
+ RegisterKind RegKind, unsigned Reg1,
+ unsigned RegNum);
+ bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
+ unsigned& RegNum, unsigned& RegWidth,
+ unsigned *DwordRegIndex);
+ void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
+ bool IsAtomic, bool IsAtomicReturn);
+ void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
+ bool IsGdsHardcoded);
public:
enum AMDGPUMatchResultTy {
Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
};
+ typedef std::map<AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+
AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser),
- ForcedEncodingSize(0),
- ForcedDPP(false),
- ForcedSDWA(false) {
+ : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser) {
MCAsmParserExtension::Initialize(Parser);
- if (getSTI().getFeatureBits().none()) {
+ if (getFeatureBits().none()) {
// Set default features.
copySTI().ToggleFeature("SOUTHERN_ISLANDS");
}
- setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
{
// TODO: make those pre-defined variables read-only.
// Currently there is no suitable machinery in the core llvm-mc for this.
// MCSymbol::isRedefinable is intended for another purpose, and
// AsmParser::parseDirectiveSet() cannot be specialized for a specific target.
- AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
MCContext &Ctx = getContext();
- MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
- Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx));
+ MCSymbol *Sym =
+ Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
- Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
- Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
+ Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
KernelScope.initialize(getContext());
}
@@ -821,8 +889,16 @@ public:
return AMDGPU::isVI(getSTI());
}
+ bool isGFX9() const {
+ return AMDGPU::isGFX9(getSTI());
+ }
+
bool hasInv2PiInlineImm() const {
- return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+ return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+ }
+
+ bool hasFlatOffsets() const {
+ return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
}
bool hasSGPR102_SGPR103() const {
@@ -844,6 +920,10 @@ public:
return &MII;
}
+ const FeatureBitset &getFeatureBits() const {
+ return getSTI().getFeatureBits();
+ }
+
void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
@@ -871,19 +951,28 @@ public:
//bool ProcessInstruction(MCInst &Inst);
OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
+
OperandMatchResultTy
parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
bool (*ConvertResult)(int64_t &) = nullptr);
+
+ OperandMatchResultTy parseOperandArrayWithPrefix(
+ const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t&) = nullptr);
+
OperandMatchResultTy
parseNamedBit(const char *Name, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
StringRef &Value);
- OperandMatchResultTy parseImm(OperandVector &Operands);
+ bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false);
+ OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false);
OperandMatchResultTy parseReg(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
+ OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false);
OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
@@ -891,7 +980,8 @@ public:
OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
- void cvtDS(MCInst &Inst, const OperandVector &Operands);
+ void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
+ void cvtDSGds(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, true); }
void cvtExp(MCInst &Inst, const OperandVector &Operands);
bool parseCnt(int64_t &IntVal);
@@ -911,6 +1001,19 @@ private:
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
+ bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc);
+ bool validateConstantBusLimitations(const MCInst &Inst);
+ bool validateEarlyClobberLimitations(const MCInst &Inst);
+ bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
+ bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
+ unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+
+ bool trySkipId(const StringRef Id);
+ bool trySkipToken(const AsmToken::TokenKind Kind);
+ bool skipToken(const AsmToken::TokenKind Kind, const StringRef ErrMsg);
+ bool parseString(StringRef &Val, const StringRef ErrMsg = "expected a string");
+ bool parseExpr(int64_t &Imm);
+
public:
OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
@@ -920,9 +1023,24 @@ public:
OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
+ bool parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
+ const unsigned MinVal,
+ const unsigned MaxVal,
+ const StringRef ErrMsg);
+ OperandMatchResultTy parseSwizzleOp(OperandVector &Operands);
+ bool parseSwizzleOffset(int64_t &Imm);
+ bool parseSwizzleMacro(int64_t &Imm);
+ bool parseSwizzleQuadPerm(int64_t &Imm);
+ bool parseSwizzleBitmaskPerm(int64_t &Imm);
+ bool parseSwizzleBroadcast(int64_t &Imm);
+ bool parseSwizzleSwap(int64_t &Imm);
+ bool parseSwizzleReverse(int64_t &Imm);
+
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); }
void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); }
+ void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
+
AMDGPUOperand::Ptr defaultGLC() const;
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultTFE() const;
@@ -935,14 +1053,18 @@ public:
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
AMDGPUOperand::Ptr defaultSMRDOffset20() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
+ AMDGPUOperand::Ptr defaultOffsetU12() const;
+ AMDGPUOperand::Ptr defaultOffsetS13() const;
OperandMatchResultTy parseOModOperand(OperandVector &Operands);
- void cvtId(MCInst &Inst, const OperandVector &Operands);
- void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
- void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
+ void cvtMIMG(MCInst &Inst, const OperandVector &Operands,
+ bool IsAtomic = false);
void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
@@ -956,9 +1078,10 @@ public:
OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType);
+ uint64_t BasicInstType, bool skipVcc = false);
};
struct OptionalOperand {
@@ -988,6 +1111,30 @@ static const fltSemantics *getFltSemantics(MVT VT) {
return getFltSemantics(VT.getSizeInBits() / 8);
}
+static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
+ switch (OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ return &APFloat::IEEEsingle();
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ return &APFloat::IEEEdouble();
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ return &APFloat::IEEEhalf();
+ default:
+ llvm_unreachable("unsupported fp type");
+ }
+}
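+// E.g. getOpFltSemantics(AMDGPU::OPERAND_REG_IMM_FP16) returns IEEEhalf(),
+// so fp literals for 16-bit operands are converted to half precision below.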
+
//===----------------------------------------------------------------------===//
// Operand
//===----------------------------------------------------------------------===//
@@ -1031,13 +1178,18 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
if (!canLosslesslyConvertToFPType(FPLiteral, type))
return false;
+ if (type.getScalarSizeInBits() == 16) {
+ return AMDGPU::isInlinableLiteral16(
+ static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
+ AsmParser->hasInv2PiInlineImm());
+ }
+
// Check if single precision literal is inlinable
return AMDGPU::isInlinableLiteral32(
static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
AsmParser->hasInv2PiInlineImm());
}
-
// We got int literal token.
if (type == MVT::f64 || type == MVT::i64) { // Expected 64-bit operand
return AMDGPU::isInlinableLiteral64(Imm.Val,
@@ -1056,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
}
bool AMDGPUOperand::isLiteralImm(MVT type) const {
- // Check that this imediate can be added as literal
+ // Check that this immediate can be added as literal
if (!isImmTy(ImmTyNone)) {
return false;
}
@@ -1064,6 +1216,13 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
if (!Imm.IsFPImm) {
// We got int literal token.
+ if (type == MVT::f64 && hasFPModifiers()) {
+      // FP modifiers cannot be applied to int literals while preserving the
+      // same semantics for VOP1/2/C and VOP3, because of integer truncation.
+      // To avoid ambiguity, disable these cases.
+ return false;
+ }
+
unsigned Size = type.getSizeInBits();
if (Size == 64)
Size = 32;
@@ -1093,40 +1252,66 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
}
-void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
- int64_t Val = Imm.Val;
- if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) {
- // Apply modifiers to immediate value. Only negate can get here
- if (Imm.IsFPImm) {
- APFloat F(BitsToDouble(Val));
- F.changeSign();
- Val = F.bitcastToAPInt().getZExtValue();
- } else {
- Val = -Val;
- }
+bool AMDGPUOperand::isSDWARegKind() const {
+ if (AsmParser->isVI())
+ return isVReg();
+ else if (AsmParser->isGFX9())
+ return isRegKind();
+ else
+ return false;
+}
+
+uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val,
+                                              unsigned Size) const {
+ assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
+ assert(Size == 2 || Size == 4 || Size == 8);
+
+ const uint64_t FpSignMask = (1ULL << (Size * 8 - 1));
+
+ if (Imm.Mods.Abs) {
+ Val &= ~FpSignMask;
+ }
+ if (Imm.Mods.Neg) {
+ Val ^= FpSignMask;
}
+ return Val;
+}
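+// Illustration: for a 32-bit operand FpSignMask is 0x80000000, so
+// abs(-1.0f) clears the sign bit (0xBF800000 -> 0x3F800000) and
+// neg(1.0f) sets it (0x3F800000 -> 0xBF800000).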
+
+void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
+
if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
Inst.getNumOperands())) {
- addLiteralImmOperand(Inst, Val);
+    addLiteralImmOperand(Inst, Imm.Val,
+                         ApplyModifiers &&
+                         isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
} else {
- Inst.addOperand(MCOperand::createImm(Val));
+ assert(!isImmTy(ImmTyNone) || !hasModifiers());
+ Inst.addOperand(MCOperand::createImm(Imm.Val));
}
}
-void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
+void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const {
const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode());
auto OpNum = Inst.getNumOperands();
// Check that this operand accepts literals
assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
- auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
+ if (ApplyModifiers) {
+ assert(AMDGPU::isSISrcFPOperand(InstDesc, OpNum));
+ const unsigned Size = Imm.IsFPImm ? sizeof(double) : getOperandSize(InstDesc, OpNum);
+ Val = applyInputFPModifiers(Val, Size);
+ }
+
+ APInt Literal(64, Val);
+ uint8_t OpTy = InstDesc.OpInfo[OpNum].OperandType;
if (Imm.IsFPImm) { // We got fp literal token
- APInt Literal(64, Val);
-
- switch (OpSize) {
- case 8: {
+ switch (OpTy) {
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
@@ -1151,16 +1336,31 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
// in predicate methods (isLiteralImm())
llvm_unreachable("fp literal in 64-bit integer instruction.");
}
- case 4:
- case 2: {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
- FPLiteral.convert(*getFltSemantics(OpSize),
+ FPLiteral.convert(*getOpFltSemantics(OpTy),
APFloat::rmNearestTiesToEven, &lost);
// We allow precision lost but not overflow or underflow. This should be
// checked earlier in isLiteralImm()
- Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+
+ uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue();
+ if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
+ OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+ ImmVal |= (ImmVal << 16);
+ }
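+      // E.g. the fp16 literal 1.0 (0x3C00) is replicated into both halves
+      // of a packed v2f16 operand, yielding 0x3C003C00.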
+
+ Inst.addOperand(MCOperand::createImm(ImmVal));
return;
}
default:
@@ -1173,8 +1373,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
// We got int literal token.
// Only sign extend inline immediates.
// FIXME: No errors on truncation
- switch (OpSize) {
- case 4: {
+ switch (OpTy) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
if (isInt<32>(Val) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1185,9 +1388,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
return;
}
- case 8: {
- if (AMDGPU::isInlinableLiteral64(Val,
- AsmParser->hasInv2PiInlineImm())) {
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
+ if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
Inst.addOperand(MCOperand::createImm(Val));
return;
}
@@ -1195,7 +1400,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
return;
}
- case 2: {
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Val) &&
AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -1206,6 +1414,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
Inst.addOperand(MCOperand::createImm(Val & 0xffff));
return;
}
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue());
+ assert(AMDGPU::isInlinableLiteral16(LiteralVal,
+ AsmParser->hasInv2PiInlineImm()));
+
+ uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 |
+ static_cast<uint32_t>(LiteralVal);
+ Inst.addOperand(MCOperand::createImm(ImmVal));
+ return;
+ }
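+  // E.g. the integer literal -1 truncates to LiteralVal 0xFFFF, which is
+  // inlinable and packs to 0xFFFFFFFF for a v2i16 operand.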
default:
llvm_unreachable("invalid operand size");
}
@@ -1289,7 +1508,8 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Default(0);
}
-bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
auto R = parseRegister();
if (!R) return true;
assert(R->isReg());
@@ -1299,20 +1519,43 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End
return false;
}
-bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum)
-{
+bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
+ RegisterKind RegKind, unsigned Reg1,
+ unsigned RegNum) {
switch (RegKind) {
case IS_SPECIAL:
- if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; }
- if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; }
- if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; }
- if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; }
- if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; }
+ if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) {
+ Reg = AMDGPU::EXEC;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) {
+ Reg = AMDGPU::FLAT_SCR;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) {
+ Reg = AMDGPU::VCC;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) {
+ Reg = AMDGPU::TBA;
+ RegWidth = 2;
+ return true;
+ }
+ if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) {
+ Reg = AMDGPU::TMA;
+ RegWidth = 2;
+ return true;
+ }
return false;
case IS_VGPR:
case IS_SGPR:
case IS_TTMP:
- if (Reg1 != Reg + RegWidth) { return false; }
+ if (Reg1 != Reg + RegWidth) {
+ return false;
+ }
RegWidth++;
return true;
default:
@@ -1320,8 +1563,9 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, R
}
}
-bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex)
-{
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ unsigned *DwordRegIndex) {
if (DwordRegIndex) { *DwordRegIndex = 0; }
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
if (getLexer().is(AsmToken::Identifier)) {
@@ -1462,8 +1706,33 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
}
+bool
+AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) {
+ if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) &&
+ (getLexer().getKind() == AsmToken::Integer ||
+ getLexer().getKind() == AsmToken::Real)) {
+
+ // This is a workaround for handling operands like these:
+ // |1.0|
+ // |-1|
+    // This syntax is not compatible with the syntax of standard
+ // MC expressions (due to the trailing '|').
+
+ SMLoc EndLoc;
+ const MCExpr *Expr;
+
+ if (getParser().parsePrimaryExpr(Expr, EndLoc)) {
+ return true;
+ }
+
+ return !Expr->evaluateAsAbsolute(Val);
+ }
+
+ return getParser().parseAbsoluteExpression(Val);
+}
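+// Usage sketch: with AbsMod set, an operand like |-1| reaches this point as
+// the literal -1 followed by '|'; parsePrimaryExpr() stops before the
+// trailing '|', which parseAbsoluteExpression() would fail to parse.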
+
OperandMatchResultTy
-AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
// TODO: add syntactic sugar for 1/(2*PI)
bool Minus = false;
if (getLexer().getKind() == AsmToken::Minus) {
@@ -1475,7 +1744,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
switch(getLexer().getKind()) {
case AsmToken::Integer: {
int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
+ if (parseAbsoluteExpr(IntVal, AbsMod))
return MatchOperand_ParseFail;
if (Minus)
IntVal *= -1;
@@ -1484,7 +1753,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
}
case AsmToken::Real: {
int64_t IntVal;
- if (getParser().parseAbsoluteExpression(IntVal))
+ if (parseAbsoluteExpr(IntVal, AbsMod))
return MatchOperand_ParseFail;
APFloat F(BitsToDouble(IntVal));
@@ -1512,8 +1781,8 @@ AMDGPUAsmParser::parseReg(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
- auto res = parseImm(Operands);
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) {
+ auto res = parseImm(Operands, AbsMod);
if (res != MatchOperand_NoMatch) {
return res;
}
@@ -1522,18 +1791,50 @@ AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) {
- // XXX: During parsing we can't determine if minus sign means
- // negate-modifier or negative immediate value.
- // By default we suppose it is modifier.
- bool Negate = false, Abs = false, Abs2 = false;
+AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
+ bool AllowImm) {
+ bool Negate = false, Negate2 = false, Abs = false, Abs2 = false;
if (getLexer().getKind()== AsmToken::Minus) {
+ const AsmToken NextToken = getLexer().peekTok();
+
+ // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
+ if (NextToken.is(AsmToken::Minus)) {
+ Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier");
+ return MatchOperand_ParseFail;
+ }
+
+ // '-' followed by an integer literal N should be interpreted as integer
+ // negation rather than a floating-point NEG modifier applied to N.
+  // Besides being counter-intuitive, such use of the floating-point NEG
+  // modifier results in different meanings of integer literals used with
+  // VOP1/2/C and VOP3, for example:
+ // v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
+ // v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
+  // Negative fp literals should be handled likewise for uniformity.
+ if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) {
+ Parser.Lex();
+ Negate = true;
+ }
+ }
+
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "neg") {
+ if (Negate) {
+ Error(Parser.getTok().getLoc(), "expected register or immediate");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Negate2 = true;
+ if (getLexer().isNot(AsmToken::LParen)) {
+ Error(Parser.getTok().getLoc(), "expected left paren after neg");
+ return MatchOperand_ParseFail;
+ }
Parser.Lex();
- Negate = true;
}
- if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") {
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "abs") {
Parser.Lex();
Abs2 = true;
if (getLexer().isNot(AsmToken::LParen)) {
@@ -1554,7 +1855,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
OperandMatchResultTy Res;
if (AllowImm) {
- Res = parseRegOrImm(Operands);
+ Res = parseRegOrImm(Operands, Abs);
} else {
Res = parseReg(Operands);
}
@@ -1563,9 +1864,6 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
}
AMDGPUOperand::Modifiers Mods;
- if (Negate) {
- Mods.Neg = true;
- }
if (Abs) {
if (getLexer().getKind() != AsmToken::Pipe) {
Error(Parser.getTok().getLoc(), "expected vertical bar");
@@ -1583,6 +1881,17 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
Mods.Abs = true;
}
+ if (Negate) {
+ Mods.Neg = true;
+ } else if (Negate2) {
+ if (getLexer().isNot(AsmToken::RParen)) {
+      Error(Parser.getTok().getLoc(), "expected closing parenthesis");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex();
+ Mods.Neg = true;
+ }
+
if (Mods.hasFPModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
Op.setModifiers(Mods);
@@ -1591,10 +1900,12 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool Allo
}
OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm) {
+AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands,
+ bool AllowImm) {
bool Sext = false;
- if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") {
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == "sext") {
Parser.Lex();
Sext = true;
if (getLexer().isNot(AsmToken::LParen)) {
@@ -1661,7 +1972,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands)
}
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
-
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) ||
@@ -1686,6 +1996,15 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
}
}
+ if ((TSFlags & SIInstrFlags::FLAT) && !hasFlatOffsets()) {
+ // FIXME: Produces error without correct column reported.
+ auto OpNum =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset);
+ const auto &Op = Inst.getOperand(OpNum);
+ if (Op.getImm() != 0)
+ return Match_InvalidOperand;
+ }
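+  // E.g. a nonzero "offset:" on a FLAT instruction is only accepted on
+  // subtargets with FeatureFlatInstOffsets; elsewhere it must be 0.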
+
return Match_Success;
}
@@ -1702,7 +2021,8 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
}
if (isForcedSDWA()) {
- static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA};
+ static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA,
+ AMDGPUAsmVariants::SDWA9};
return makeArrayRef(Variants);
}
@@ -1713,12 +2033,182 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
static const unsigned Variants[] = {
AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
- AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP
+ AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP
};
return makeArrayRef(Variants);
}
+unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+ const unsigned Num = Desc.getNumImplicitUses();
+ for (unsigned i = 0; i < Num; ++i) {
+ unsigned Reg = Desc.ImplicitUses[i];
+ switch (Reg) {
+ case AMDGPU::FLAT_SCR:
+ case AMDGPU::VCC:
+ case AMDGPU::M0:
+ return Reg;
+ default:
+ break;
+ }
+ }
+ return AMDGPU::NoRegister;
+}
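+// E.g. v_addc_u32 implicitly reads VCC as carry-in; such a read counts
+// toward the constant bus limit checked below.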
+
+// NB: This code is correct only when used to check constant
+// bus limitations because GFX7 supports no f16 inline constants.
+// Note that there are no cases when a GFX7 opcode violates
+// constant bus limitations due to the use of an f16 constant.
+bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
+ unsigned OpIdx) const {
+ const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+
+ if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) {
+ return false;
+ }
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+
+ int64_t Val = MO.getImm();
+ auto OpSize = AMDGPU::getOperandSize(Desc, OpIdx);
+
+ switch (OpSize) { // expected operand size
+ case 8:
+ return AMDGPU::isInlinableLiteral64(Val, hasInv2PiInlineImm());
+ case 4:
+ return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm());
+ case 2: {
+ const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
+ if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+ return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
+ } else {
+ return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
+ }
+ }
+ default:
+ llvm_unreachable("invalid operand size");
+ }
+}
+
+bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (MO.isImm()) {
+ return !isInlineConstant(Inst, OpIdx);
+ }
+ return !MO.isReg() ||
+ isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
+}
+
+bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+ unsigned ConstantBusUseCount = 0;
+
+ if (Desc.TSFlags &
+ (SIInstrFlags::VOPC |
+ SIInstrFlags::VOP1 | SIInstrFlags::VOP2 |
+ SIInstrFlags::VOP3 | SIInstrFlags::VOP3P |
+ SIInstrFlags::SDWA)) {
+
+ // Check special imm operands (used by madmk, etc)
+ if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
+ ++ConstantBusUseCount;
+ }
+
+ unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst);
+ if (SGPRUsed != AMDGPU::NoRegister) {
+ ++ConstantBusUseCount;
+ }
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1) break;
+
+ const MCOperand &MO = Inst.getOperand(OpIdx);
+ if (usesConstantBus(Inst, OpIdx)) {
+ if (MO.isReg()) {
+ const unsigned Reg = mc2PseudoReg(MO.getReg());
+          // Pairs of registers with partial intersections like these
+ // s0, s[0:1]
+ // flat_scratch_lo, flat_scratch
+ // flat_scratch_lo, flat_scratch_hi
+ // are theoretically valid but they are disabled anyway.
+ // Note that this code mimics SIInstrInfo::verifyInstruction
+ if (Reg != SGPRUsed) {
+ ++ConstantBusUseCount;
+ }
+ SGPRUsed = Reg;
+ } else { // Expression or a literal
+ ++ConstantBusUseCount;
+ }
+ }
+ }
+ }
+
+ return ConstantBusUseCount <= 1;
+}
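+// Illustration: at most one constant bus value (SGPR, literal, or implicit
+// SGPR read) is allowed per instruction here, so e.g.
+//   v_add_f32_e64 v0, s0, s1   // two distinct SGPRs -> rejected
+//   v_add_f32_e64 v0, s0, s0   // the same SGPR counts once -> accepted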
+
+bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
+
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opcode);
+
+ const int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+ if (DstIdx == -1 ||
+ Desc.getOperandConstraint(DstIdx, MCOI::EARLY_CLOBBER) == -1) {
+ return true;
+ }
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+ assert(DstIdx != -1);
+ const MCOperand &Dst = Inst.getOperand(DstIdx);
+ assert(Dst.isReg());
+ const unsigned DstReg = mc2PseudoReg(Dst.getReg());
+
+ const int SrcIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+ for (int SrcIdx : SrcIndices) {
+ if (SrcIdx == -1) break;
+ const MCOperand &Src = Inst.getOperand(SrcIdx);
+ if (Src.isReg()) {
+ const unsigned SrcReg = mc2PseudoReg(Src.getReg());
+ if (isRegIntersect(DstReg, SrcReg, TRI)) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
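+// Illustration (hypothetical operands): for an early-clobber vdst such as
+// v_qsad_pk_u16_u8, a destination overlapping any source, e.g.
+//   v_qsad_pk_u16_u8 v[0:1], v[1:2], v9, v[4:5]
+// is rejected because v1 appears in both vdst and src0.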
+
+bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
+ const SMLoc &IDLoc) {
+ if (!validateConstantBusLimitations(Inst)) {
+ Error(IDLoc,
+ "invalid operand (violates constant bus restrictions)");
+ return false;
+ }
+ if (!validateEarlyClobberLimitations(Inst)) {
+ Error(IDLoc,
+ "destination must be different than all sources");
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -1751,6 +2241,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (Result) {
default: break;
case Match_Success:
+ if (!validateInstruction(Inst, IDLoc)) {
+ return true;
+ }
Inst.setLoc(IDLoc);
Out.EmitInstruction(Inst, getSTI());
return false;
@@ -1793,7 +2286,6 @@ bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) {
return false;
}
-
bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
uint32_t &Minor) {
if (ParseAsAbsoluteExpression(Major))
@@ -1810,7 +2302,6 @@ bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
}
bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
-
uint32_t Major;
uint32_t Minor;
@@ -1831,9 +2322,10 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// If this directive has no arguments, then use the ISA version for the
// targeted GPU.
if (getLexer().is(AsmToken::EndOfStatement)) {
- AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
- getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor,
- Isa.Stepping,
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
+ ISA.Stepping,
"AMD", "AMDGPU");
return false;
}
@@ -1873,42 +2365,45 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
return false;
}
-bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() {
- std::string Metadata;
- raw_string_ostream MS(Metadata);
+bool AMDGPUAsmParser::ParseDirectiveCodeObjectMetadata() {
+ std::string YamlString;
+ raw_string_ostream YamlStream(YamlString);
getLexer().setSkipSpace(false);
bool FoundEnd = false;
while (!getLexer().is(AsmToken::Eof)) {
while (getLexer().is(AsmToken::Space)) {
- MS << ' ';
+ YamlStream << getLexer().getTok().getString();
Lex();
}
if (getLexer().is(AsmToken::Identifier)) {
StringRef ID = getLexer().getTok().getIdentifier();
- if (ID == ".end_amdgpu_runtime_metadata") {
+ if (ID == AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd) {
Lex();
FoundEnd = true;
break;
}
}
- MS << Parser.parseStringToEndOfStatement()
- << getContext().getAsmInfo()->getSeparatorString();
+ YamlStream << Parser.parseStringToEndOfStatement()
+ << getContext().getAsmInfo()->getSeparatorString();
Parser.eatToEndOfStatement();
}
getLexer().setSkipSpace(true);
- if (getLexer().is(AsmToken::Eof) && !FoundEnd)
- return TokError("expected directive .end_amdgpu_runtime_metadata not found");
+ if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
+ return TokError(
+ "expected directive .end_amdgpu_code_object_metadata not found");
+ }
- MS.flush();
+ YamlStream.flush();
- getTargetStreamer().EmitRuntimeMetadata(Metadata);
+ if (!getTargetStreamer().EmitCodeObjectMetadata(YamlString))
+ return Error(getParser().getTok().getLoc(), "invalid code object metadata");
return false;
}
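+// Usage sketch (assembler input; the directive names are taken from
+// AMDGPU::CodeObject::MetadataAssemblerDirective{Begin,End}):
+//   .amdgpu_code_object_metadata
+//     Version: [ 1, 0 ]
+//   .end_amdgpu_code_object_metadata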
@@ -1926,7 +2421,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
amd_kernel_code_t Header;
- AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits());
+ AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits());
while (true) {
// Lex EndOfStatement. This is in a while loop, because lexing a comment
@@ -1952,12 +2447,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
return false;
}
-bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() {
- getParser().getStreamer().SwitchSection(
- AMDGPU::getHSATextSection(getContext()));
- return false;
-}
-
bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
if (getLexer().isNot(AsmToken::Identifier))
return TokError("expected symbol name");
@@ -1971,46 +2460,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() {
return false;
}
-bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() {
- if (getLexer().isNot(AsmToken::Identifier))
- return TokError("expected symbol name");
-
- StringRef GlobalName = Parser.getTok().getIdentifier();
-
- getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName);
- Lex();
- return false;
-}
-
-bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() {
- if (getLexer().isNot(AsmToken::Identifier))
- return TokError("expected symbol name");
-
- StringRef GlobalName = Parser.getTok().getIdentifier();
-
- getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName);
- Lex();
- return false;
-}
-
-bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() {
- getParser().getStreamer().SwitchSection(
- AMDGPU::getHSADataGlobalAgentSection(getContext()));
- return false;
-}
-
-bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() {
- getParser().getStreamer().SwitchSection(
- AMDGPU::getHSADataGlobalProgramSection(getContext()));
- return false;
-}
-
-bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() {
- getParser().getStreamer().SwitchSection(
- AMDGPU::getHSARodataReadonlyAgentSection(getContext()));
- return false;
-}
-
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
@@ -2020,33 +2469,15 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".hsa_code_object_isa")
return ParseDirectiveHSACodeObjectISA();
- if (IDVal == ".amdgpu_runtime_metadata")
- return ParseDirectiveRuntimeMetadata();
+ if (IDVal == AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin)
+ return ParseDirectiveCodeObjectMetadata();
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
- if (IDVal == ".hsatext")
- return ParseSectionDirectiveHSAText();
-
if (IDVal == ".amdgpu_hsa_kernel")
return ParseDirectiveAMDGPUHsaKernel();
- if (IDVal == ".amdgpu_hsa_module_global")
- return ParseDirectiveAMDGPUHsaModuleGlobal();
-
- if (IDVal == ".amdgpu_hsa_program_global")
- return ParseDirectiveAMDGPUHsaProgramGlobal();
-
- if (IDVal == ".hsadata_global_agent")
- return ParseSectionDirectiveHSADataGlobalAgent();
-
- if (IDVal == ".hsadata_global_program")
- return ParseSectionDirectiveHSADataGlobalProgram();
-
- if (IDVal == ".hsarodata_readonly_agent")
- return ParseSectionDirectiveHSARodataReadonlyAgent();
-
return true;
}
@@ -2080,7 +2511,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
OperandMatchResultTy
AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
-
// Try to parse with a custom parser
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -2195,11 +2625,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
return MatchOperand_ParseFail;
Parser.Lex();
+
+ bool IsMinus = false;
+ if (getLexer().getKind() == AsmToken::Minus) {
+ Parser.Lex();
+ IsMinus = true;
+ }
+
if (getLexer().isNot(AsmToken::Integer))
return MatchOperand_ParseFail;
if (getParser().parseAbsoluteExpression(Int))
return MatchOperand_ParseFail;
+
+ if (IsMinus)
+ Int = -Int;
break;
}
}
@@ -2208,7 +2648,7 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
OperandMatchResultTy
AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy,
+ AMDGPUOperand::ImmTy ImmTy,
bool (*ConvertResult)(int64_t&)) {
SMLoc S = Parser.getTok().getLoc();
int64_t Value = 0;
@@ -2225,9 +2665,59 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
return MatchOperand_Success;
}
+OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix(
+ const char *Prefix,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy,
+ bool (*ConvertResult)(int64_t&)) {
+ StringRef Name = Parser.getTok().getString();
+ if (!Name.equals(Prefix))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::Colon))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex();
+ if (getLexer().isNot(AsmToken::LBrac))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+
+ unsigned Val = 0;
+ SMLoc S = Parser.getTok().getLoc();
+
+ // FIXME: How to verify the number of elements matches the number of src
+ // operands?
+ for (int I = 0; I < 3; ++I) {
+ if (I != 0) {
+ if (getLexer().is(AsmToken::RBrac))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return MatchOperand_ParseFail;
+ Parser.Lex();
+ }
+
+ if (getLexer().isNot(AsmToken::Integer))
+ return MatchOperand_ParseFail;
+
+ int64_t Op;
+ if (getParser().parseAbsoluteExpression(Op))
+ return MatchOperand_ParseFail;
+
+ if (Op != 0 && Op != 1)
+ return MatchOperand_ParseFail;
+ Val |= (Op << I);
+ }
+
+ Parser.Lex();
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy));
+ return MatchOperand_Success;
+}
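+// Usage sketch: an operand such as op_sel:[0,1,1] is parsed here into the
+// bit mask Val = 0b110 (array element I sets bit I when it is 1).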
+
OperandMatchResultTy
AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
- enum AMDGPUOperand::ImmTy ImmTy) {
+ AMDGPUOperand::ImmTy ImmTy) {
int64_t Bit = 0;
SMLoc S = Parser.getTok().getLoc();
@@ -2257,11 +2747,11 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
return MatchOperand_Success;
}
-typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
-
-void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
- OptionalImmIndexMap& OptionalIdx,
- enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) {
+static void addOptionalImmOperand(
+ MCInst& Inst, const OperandVector& Operands,
+ AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
+ AMDGPUOperand::ImmTy ImmT,
+ int64_t Default = 0) {
auto i = OptionalIdx.find(ImmT);
if (i != OptionalIdx.end()) {
unsigned Idx = i->second;
@@ -2323,9 +2813,9 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
}
-void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
- std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
- bool GDSOnly = false;
+void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
+ bool IsGdsHardcoded) {
+ OptionalImmIndexMap OptionalIdx;
for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -2337,7 +2827,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
}
if (Op.isToken() && Op.getToken() == "gds") {
- GDSOnly = true;
+ IsGdsHardcoded = true;
continue;
}
@@ -2345,10 +2835,14 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
OptionalIdx[Op.getImmTy()] = i;
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
+ AMDGPUOperand::ImmTy OffsetType =
+ (Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_si ||
+ Inst.getOpcode() == AMDGPU::DS_SWIZZLE_B32_vi) ? AMDGPUOperand::ImmTySwizzle :
+ AMDGPUOperand::ImmTyOffset;
- if (!GDSOnly) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType);
+
+ if (!IsGdsHardcoded) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
}
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
@@ -2357,6 +2851,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
+ unsigned OperandIdx[4];
unsigned EnMask = 0;
int SrcIdx = 0;
@@ -2365,15 +2860,18 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
// Add the register arguments
if (Op.isReg()) {
- EnMask |= (1 << SrcIdx);
+ assert(SrcIdx < 4);
+ OperandIdx[SrcIdx] = Inst.size();
Op.addRegOperands(Inst, 1);
++SrcIdx;
continue;
}
if (Op.isOff()) {
- ++SrcIdx;
+ assert(SrcIdx < 4);
+ OperandIdx[SrcIdx] = Inst.size();
Inst.addOperand(MCOperand::createReg(AMDGPU::NoRegister));
+ ++SrcIdx;
continue;
}
@@ -2389,6 +2887,22 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
OptionalIdx[Op.getImmTy()] = i;
}
+ assert(SrcIdx == 4);
+
+ bool Compr = false;
+ if (OptionalIdx.find(AMDGPUOperand::ImmTyExpCompr) != OptionalIdx.end()) {
+ Compr = true;
+ Inst.getOperand(OperandIdx[1]) = Inst.getOperand(OperandIdx[2]);
+ Inst.getOperand(OperandIdx[2]).setReg(AMDGPU::NoRegister);
+ Inst.getOperand(OperandIdx[3]).setReg(AMDGPU::NoRegister);
+ }
+
+ for (auto i = 0; i < SrcIdx; ++i) {
+ if (Inst.getOperand(OperandIdx[i]).getReg() != AMDGPU::NoRegister) {
+      EnMask |= Compr ? (0x3 << (i * 2)) : (0x1 << i);
+ }
+ }
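+  // E.g. with compr and both compressed sources present EnMask becomes 0xF;
+  // without compr, enabled sources 0 and 2 give EnMask = 0x5.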
+
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpVM);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyExpCompr);
@@ -2399,6 +2913,28 @@ void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
// s_waitcnt
//===----------------------------------------------------------------------===//
+static bool
+encodeCnt(
+ const AMDGPU::IsaInfo::IsaVersion ISA,
+ int64_t &IntVal,
+ int64_t CntVal,
+ bool Saturate,
+ unsigned (*encode)(const IsaInfo::IsaVersion &Version, unsigned, unsigned),
+ unsigned (*decode)(const IsaInfo::IsaVersion &Version, unsigned))
+{
+ bool Failed = false;
+
+ IntVal = encode(ISA, IntVal, CntVal);
+ if (CntVal != decode(ISA, IntVal)) {
+ if (Saturate) {
+ IntVal = encode(ISA, IntVal, -1);
+ } else {
+ Failed = true;
+ }
+ }
+ return Failed;
+}
+
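encodeCnt reports failure when the requested count does not survive an
encode/decode round trip; with saturation it clamps the field to its maximum
(by encoding -1) instead. A standalone sketch of the same idea, assuming a
hypothetical 4-bit counter field (the real widths come from the IsaVersion):

    // Sketch only; Max stands in for the ISA-dependent field width.
    static bool encodeCntSketch(int64_t &IntVal, int64_t CntVal, bool Saturate) {
      const int64_t Max = 0xf;                    // assumed 4-bit field
      IntVal = (IntVal & ~Max) | (CntVal & Max);  // encode into the low bits
      if ((IntVal & Max) != CntVal) {             // decode does not round-trip
        if (Saturate)
          IntVal |= Max;                          // clamp to the field maximum
        else
          return true;                            // caller reports "too large value"
      }
      return false;
    }
    // With the "_sat" spellings parsed below, e.g. s_waitcnt vmcnt_sat(100),
    // an out-of-range count saturates instead of being rejected.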
bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
StringRef CntName = Parser.getTok().getString();
int64_t CntVal;
@@ -2411,33 +2947,49 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
if (getLexer().isNot(AsmToken::Integer))
return true;
+ SMLoc ValLoc = Parser.getTok().getLoc();
if (getParser().parseAbsoluteExpression(CntVal))
return true;
- if (getLexer().isNot(AsmToken::RParen))
- return true;
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
- Parser.Lex();
- if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
- Parser.Lex();
+ bool Failed = true;
+ bool Sat = CntName.endswith("_sat");
- IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
- if (CntName == "vmcnt")
- IntVal = encodeVmcnt(IV, IntVal, CntVal);
- else if (CntName == "expcnt")
- IntVal = encodeExpcnt(IV, IntVal, CntVal);
- else if (CntName == "lgkmcnt")
- IntVal = encodeLgkmcnt(IV, IntVal, CntVal);
- else
+ if (CntName == "vmcnt" || CntName == "vmcnt_sat") {
+ Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt);
+ } else if (CntName == "expcnt" || CntName == "expcnt_sat") {
+ Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeExpcnt, decodeExpcnt);
+ } else if (CntName == "lgkmcnt" || CntName == "lgkmcnt_sat") {
+ Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeLgkmcnt, decodeLgkmcnt);
+ }
+
+ if (Failed) {
+ Error(ValLoc, "too large value for " + CntName);
return true;
+ }
+
+ if (getLexer().isNot(AsmToken::RParen)) {
+ return true;
+ }
+
+ Parser.Lex();
+ if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma)) {
+ const AsmToken NextToken = getLexer().peekTok();
+ if (NextToken.is(AsmToken::Identifier)) {
+ Parser.Lex();
+ }
+ }
return false;
}
OperandMatchResultTy
AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
- IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
- int64_t Waitcnt = getWaitcntBitMask(IV);
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+ int64_t Waitcnt = getWaitcntBitMask(ISA);
SMLoc S = Parser.getTok().getLoc();
switch(getLexer().getKind()) {
@@ -2459,7 +3011,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
return MatchOperand_Success;
}
-bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) {
+bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
+ int64_t &Width) {
using namespace llvm::AMDGPU::Hwreg;
if (Parser.getTok().getString() != "hwreg")
@@ -2520,8 +3073,7 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
return false;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
using namespace llvm::AMDGPU::Hwreg;
int64_t Imm16Val = 0;
@@ -2892,6 +3444,298 @@ bool AMDGPUOperand::isSendMsg() const {
}
//===----------------------------------------------------------------------===//
+// parser helpers
+//===----------------------------------------------------------------------===//
+
+bool
+AMDGPUAsmParser::trySkipId(const StringRef Id) {
+ if (getLexer().getKind() == AsmToken::Identifier &&
+ Parser.getTok().getString() == Id) {
+ Parser.Lex();
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::trySkipToken(const AsmToken::TokenKind Kind) {
+ if (getLexer().getKind() == Kind) {
+ Parser.Lex();
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::skipToken(const AsmToken::TokenKind Kind,
+ const StringRef ErrMsg) {
+ if (!trySkipToken(Kind)) {
+ Error(Parser.getTok().getLoc(), ErrMsg);
+ return false;
+ }
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseExpr(int64_t &Imm) {
+ return !getParser().parseAbsoluteExpression(Imm);
+}
+
+bool
+AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
+ SMLoc S = Parser.getTok().getLoc();
+ if (getLexer().getKind() == AsmToken::String) {
+ Val = Parser.getTok().getStringContents();
+ Parser.Lex();
+ return true;
+ } else {
+ Error(S, ErrMsg);
+ return false;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// swizzle
+//===----------------------------------------------------------------------===//
+
+LLVM_READNONE
+static unsigned
+encodeBitmaskPerm(const unsigned AndMask,
+ const unsigned OrMask,
+ const unsigned XorMask) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ return BITMASK_PERM_ENC |
+ (AndMask << BITMASK_AND_SHIFT) |
+ (OrMask << BITMASK_OR_SHIFT) |
+ (XorMask << BITMASK_XOR_SHIFT);
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleOperands(const unsigned OpNum, int64_t* Op,
+ const unsigned MinVal,
+ const unsigned MaxVal,
+ const StringRef ErrMsg) {
+ for (unsigned i = 0; i < OpNum; ++i) {
+ if (!skipToken(AsmToken::Comma, "expected a comma")){
+ return false;
+ }
+ SMLoc ExprLoc = Parser.getTok().getLoc();
+ if (!parseExpr(Op[i])) {
+ return false;
+ }
+ if (Op[i] < MinVal || Op[i] > MaxVal) {
+ Error(ExprLoc, ErrMsg);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleQuadPerm(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ int64_t Lane[LANE_NUM];
+ if (parseSwizzleOperands(LANE_NUM, Lane, 0, LANE_MAX,
+ "expected a 2-bit lane id")) {
+ Imm = QUAD_PERM_ENC;
+ for (auto i = 0; i < LANE_NUM; ++i) {
+ Imm |= Lane[i] << (LANE_SHIFT * i);
+ }
+ return true;
+ }
+ return false;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleBroadcast(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t GroupSize;
+ int64_t LaneIdx;
+
+ if (!parseSwizzleOperands(1, &GroupSize,
+ 2, 32,
+ "group size must be in the interval [2,32]")) {
+ return false;
+ }
+ if (!isPowerOf2_64(GroupSize)) {
+ Error(S, "group size must be a power of two");
+ return false;
+ }
+ if (parseSwizzleOperands(1, &LaneIdx,
+ 0, GroupSize - 1,
+ "lane id must be in the interval [0,group size - 1]")) {
+ Imm = encodeBitmaskPerm(BITMASK_MAX - GroupSize + 1, LaneIdx, 0);
+ return true;
+ }
+ return false;
+}
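Why the broadcast encoding works: for a power-of-two GroupSize,
BITMASK_MAX - GroupSize + 1 equals ~(GroupSize - 1) within the 5-bit lane
field, so the AND keeps each lane's group base, the OR injects LaneIdx, and
every lane reads from group_base + LaneIdx. A worked example with illustrative
values:

    // GroupSize = 8, LaneIdx = 3:
    //   AndMask = 31 - 8 + 1 = 0b11000
    //   lane 13 (0b01101) & AndMask = 8 (its group base); | 3 => reads lane 11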
+
+bool
+AMDGPUAsmParser::parseSwizzleReverse(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t GroupSize;
+
+ if (!parseSwizzleOperands(1, &GroupSize,
+ 2, 32, "group size must be in the interval [2,32]")) {
+ return false;
+ }
+ if (!isPowerOf2_64(GroupSize)) {
+ Error(S, "group size must be a power of two");
+ return false;
+ }
+
+ Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize - 1);
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleSwap(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t GroupSize;
+
+ if (!parseSwizzleOperands(1, &GroupSize,
+ 1, 16, "group size must be in the interval [1,16]")) {
+ return false;
+ }
+ if (!isPowerOf2_64(GroupSize)) {
+ Error(S, "group size must be a power of two");
+ return false;
+ }
+
+ Imm = encodeBitmaskPerm(BITMASK_MAX, 0, GroupSize);
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleBitmaskPerm(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ if (!skipToken(AsmToken::Comma, "expected a comma")) {
+ return false;
+ }
+
+ StringRef Ctl;
+ SMLoc StrLoc = Parser.getTok().getLoc();
+ if (!parseString(Ctl)) {
+ return false;
+ }
+ if (Ctl.size() != BITMASK_WIDTH) {
+ Error(StrLoc, "expected a 5-character mask");
+ return false;
+ }
+
+ unsigned AndMask = 0;
+ unsigned OrMask = 0;
+ unsigned XorMask = 0;
+
+ for (size_t i = 0; i < Ctl.size(); ++i) {
+ unsigned Mask = 1 << (BITMASK_WIDTH - 1 - i);
+ switch(Ctl[i]) {
+ default:
+ Error(StrLoc, "invalid mask");
+ return false;
+ case '0':
+ break;
+ case '1':
+ OrMask |= Mask;
+ break;
+ case 'p':
+ AndMask |= Mask;
+ break;
+ case 'i':
+ AndMask |= Mask;
+ XorMask |= Mask;
+ break;
+ }
+ }
+
+ Imm = encodeBitmaskPerm(AndMask, OrMask, XorMask);
+ return true;
+}
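A worked example for the mask scan above, using an illustrative control
string: characters are read MSB-first, so position i contributes bit
BITMASK_WIDTH - 1 - i, and per the bitmask-perm semantics each lane maps to
(lane & AndMask | OrMask) ^ XorMask:

    // "01pi0": '1' at position 1 -> OrMask bit 3; 'p' at position 2 ->
    // AndMask bit 2; 'i' at position 3 -> AndMask and XorMask bit 1.
    //   AndMask = 0b00110, OrMask = 0b01000, XorMask = 0b00010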
+
+bool
+AMDGPUAsmParser::parseSwizzleOffset(int64_t &Imm) {
+
+ SMLoc OffsetLoc = Parser.getTok().getLoc();
+
+ if (!parseExpr(Imm)) {
+ return false;
+ }
+ if (!isUInt<16>(Imm)) {
+ Error(OffsetLoc, "expected a 16-bit offset");
+ return false;
+ }
+ return true;
+}
+
+bool
+AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ if (skipToken(AsmToken::LParen, "expected a left parenthesis")) {
+
+ SMLoc ModeLoc = Parser.getTok().getLoc();
+ bool Ok = false;
+
+ if (trySkipId(IdSymbolic[ID_QUAD_PERM])) {
+ Ok = parseSwizzleQuadPerm(Imm);
+ } else if (trySkipId(IdSymbolic[ID_BITMASK_PERM])) {
+ Ok = parseSwizzleBitmaskPerm(Imm);
+ } else if (trySkipId(IdSymbolic[ID_BROADCAST])) {
+ Ok = parseSwizzleBroadcast(Imm);
+ } else if (trySkipId(IdSymbolic[ID_SWAP])) {
+ Ok = parseSwizzleSwap(Imm);
+ } else if (trySkipId(IdSymbolic[ID_REVERSE])) {
+ Ok = parseSwizzleReverse(Imm);
+ } else {
+ Error(ModeLoc, "expected a swizzle mode");
+ }
+
+ return Ok && skipToken(AsmToken::RParen, "expected a closing parenthesis");
+ }
+
+ return false;
+}
+
+OperandMatchResultTy
+AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ int64_t Imm = 0;
+
+ if (trySkipId("offset")) {
+
+ bool Ok = false;
+ if (skipToken(AsmToken::Colon, "expected a colon")) {
+ if (trySkipId("swizzle")) {
+ Ok = parseSwizzleMacro(Imm);
+ } else {
+ Ok = parseSwizzleOffset(Imm);
+ }
+ }
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle));
+
+ return Ok? MatchOperand_Success : MatchOperand_ParseFail;
+ } else {
+ return MatchOperand_NoMatch;
+ }
+}
+
+bool
+AMDGPUOperand::isSwizzle() const {
+ return isImmTy(ImmTySwizzle);
+}
+
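For reference, the forms accepted by parseSwizzleOp (illustrative operand
values; mode names follow the IdSymbolic table):

    // ds_swizzle_b32 v8, v2 offset:swizzle(QUAD_PERM, 0, 1, 2, 3)
    // ds_swizzle_b32 v8, v2 offset:swizzle(BITMASK_PERM, "01pi0")
    // ds_swizzle_b32 v8, v2 offset:swizzle(BROADCAST, 8, 3)
    // ds_swizzle_b32 v8, v2 offset:swizzle(SWAP, 4)
    // ds_swizzle_b32 v8, v2 offset:swizzle(REVERSE, 8)
    // ds_swizzle_b32 v8, v2 offset:0xffff      // raw 16-bit encoding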
+//===----------------------------------------------------------------------===//
// sopp branch targets
//===----------------------------------------------------------------------===//
@@ -2980,52 +3824,60 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
-//===----------------------------------------------------------------------===//
-// mimg
-//===----------------------------------------------------------------------===//
-
-void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) {
- unsigned I = 1;
- const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
- ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
- }
-
+void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
- for (unsigned E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
// Add the register arguments
- if (Op.isRegOrImm()) {
- Op.addRegOrImmOperands(Inst, 1);
+ if (Op.isReg()) {
+ Op.addRegOperands(Inst, 1);
continue;
- } else if (Op.isImmModifier()) {
- OptionalIdx[Op.getImmTy()] = I;
- } else {
- llvm_unreachable("unexpected operand type");
}
+
+ // Handle the case where soffset is an immediate
+ if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
+ Op.addImmOperands(Inst, 1);
+ continue;
+ }
+
+ // Handle tokens like 'offen' which are sometimes hard-coded into the
+ // asm string. There are no MCInst operands for these.
+ if (Op.isToken()) {
+ continue;
+ }
+ assert(Op.isImm());
+
+ // Handle optional arguments
+ OptionalIdx[Op.getImmTy()] = i;
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOffset);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
}
-void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
+//===----------------------------------------------------------------------===//
+// mimg
+//===----------------------------------------------------------------------===//
+
+void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
+ bool IsAtomic) {
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
- // Add src, same as dst
- ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
+ if (IsAtomic) {
+ // Add src, same as dst
+ ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1);
+ }
OptionalImmIndexMap OptionalIdx;
@@ -3053,6 +3905,10 @@ void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands)
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
}
+void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
+ cvtMIMG(Inst, Operands, true);
+}
+
AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDMask);
}
@@ -3103,6 +3959,14 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetU12() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultOffsetS13() const {
+ return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+}
+
//===----------------------------------------------------------------------===//
// vop3
//===----------------------------------------------------------------------===//
@@ -3152,6 +4016,8 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr},
{"gds", AMDGPUOperand::ImmTyGDS, true, nullptr},
{"offset", AMDGPUOperand::ImmTyOffset, false, nullptr},
+ {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr},
+ {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
@@ -3169,7 +4035,12 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr},
{"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
{"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
+ {"compr", AMDGPUOperand::ImmTyExpCompr, true, nullptr },
{"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
+ {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
+ {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
+ {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr},
+ {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
@@ -3186,6 +4057,12 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan
res = parseSDWASel(Operands, Op.Name, Op.Type);
} else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) {
res = parseSDWADstUnused(Operands);
+ } else if (Op.Type == AMDGPUOperand::ImmTyOpSel ||
+ Op.Type == AMDGPUOperand::ImmTyOpSelHi ||
+ Op.Type == AMDGPUOperand::ImmTyNegLo ||
+ Op.Type == AMDGPUOperand::ImmTyNegHi) {
+ res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
+ Op.ConvertResult);
} else {
res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
}
@@ -3211,25 +4088,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
return MatchOperand_NoMatch;
}
-void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) {
- unsigned I = 1;
- const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
- ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
- }
- for (unsigned E = Operands.size(); I != E; ++I)
- ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1);
-}
-
-void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) {
- uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
- if (TSFlags & SIInstrFlags::VOP3) {
- cvtVOP3(Inst, Operands);
- } else {
- cvtId(Inst, Operands);
- }
-}
-
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
// 1. This operand is input modifiers
return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
@@ -3241,48 +4099,133 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
&& Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
}
-void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
- OptionalImmIndexMap OptionalIdx;
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx) {
+ unsigned Opc = Inst.getOpcode();
+
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
}
- for (unsigned E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
- Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
- } else if (Op.isImm()) {
- OptionalIdx[Op.getImmTy()] = I;
- } else {
- llvm_unreachable("unhandled operand type");
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) {
+ // This instruction has src modifiers
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
+ } else if (Op.isImmModifier()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else if (Op.isRegOrImm()) {
+ Op.addRegOrImmOperands(Inst, 1);
+ } else {
+ llvm_unreachable("unhandled operand type");
+ }
+ }
+ } else {
+ // No src modifiers
+ for (unsigned E = Operands.size(); I != E; ++I) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+ if (Op.isMod()) {
+ OptionalIdx[Op.getImmTy()] = I;
+ } else {
+ Op.addRegOrImmOperands(Inst, 1);
+ }
}
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+ }
+
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+ }
// special case v_mac_{f16, f32}:
// it has src2 register operand that is tied to dst operand
// we don't allow modifiers for this operand in assembler so src2_modifiers
// should be 0
- if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
- Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
- Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) {
+ if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi ||
+ Opc == AMDGPU::V_MAC_F16_e64_vi) {
auto it = Inst.begin();
- std::advance(
- it,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ?
- AMDGPU::V_MAC_F16_e64 :
- AMDGPU::V_MAC_F32_e64,
- AMDGPU::OpName::src2_modifiers));
+ std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers));
it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2
++it;
Inst.insert(it, Inst.getOperand(0)); // src2 = dst
}
}
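A note on the v_mac special case above: v_mac reads its destination as the
tied src2, and the assembler accepts no modifiers on it, so a zero
src2_modifiers immediate and a copy of the dst operand are inserted.
Illustratively (the operand order is assumed from the named-operand layout):

    // v_mac_f32_e64 v0, v1, v2 becomes, as MCInst operands:
    //   v0, src0_mods, v1, src1_mods, v2, 0 /*src2_modifiers*/, v0 /*src2=dst*/, ...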
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptionalIdx;
+ cvtVOP3(Inst, Operands, OptionalIdx);
+}
+
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
+ OptionalImmIndexMap OptIdx;
+
+ cvtVOP3(Inst, Operands, OptIdx);
+
+ // FIXME: This is messy. Parse the modifiers as if it were a normal VOP3
+ // instruction, and then figure out where to actually put the modifiers.
+ int Opc = Inst.getOpcode();
+
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1);
+
+ int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
+ if (NegLoIdx != -1) {
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
+ addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
+ }
+
+ const int Ops[] = { AMDGPU::OpName::src0,
+ AMDGPU::OpName::src1,
+ AMDGPU::OpName::src2 };
+ const int ModOps[] = { AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers };
+
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+ unsigned NegLo = 0;
+ unsigned NegHi = 0;
+
+ if (NegLoIdx != -1) {
+ int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
+ NegLo = Inst.getOperand(NegLoIdx).getImm();
+ NegHi = Inst.getOperand(NegHiIdx).getImm();
+ }
+
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
+ if (OpIdx == -1)
+ break;
+
+ uint32_t ModVal = 0;
+
+ if ((OpSel & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_0;
+
+ if ((OpSelHi & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_1;
+
+ if ((NegLo & (1 << J)) != 0)
+ ModVal |= SISrcMods::NEG;
+
+ if ((NegHi & (1 << J)) != 0)
+ ModVal |= SISrcMods::NEG_HI;
+
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+
+ Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal);
+ }
+}
+
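A worked example for the fold above, with illustrative masks: bit J of each
packed mask lands in the J-th source's modifier operand, so with
op_sel = 0b101, op_sel_hi = 0b000, neg_lo = 0b010, neg_hi = 0b000:

    //   src0_modifiers |= SISrcMods::OP_SEL_0   (op_sel bit 0)
    //   src1_modifiers |= SISrcMods::NEG        (neg_lo bit 1)
    //   src2_modifiers |= SISrcMods::OP_SEL_0   (op_sel bit 2)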
//===----------------------------------------------------------------------===//
// dpp
//===----------------------------------------------------------------------===//
@@ -3311,6 +4254,14 @@ bool AMDGPUOperand::isGPRIdxMode() const {
return isImm() && isUInt<4>(getImm());
}
+bool AMDGPUOperand::isS16Imm() const {
+ return isImm() && (isInt<16>(getImm()) || isUInt<16>(getImm()));
+}
+
+bool AMDGPUOperand::isU16Imm() const {
+ return isImm() && isUInt<16>(getImm());
+}
+
OperandMatchResultTy
AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
SMLoc S = Parser.getTok().getLoc();
@@ -3436,7 +4387,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
if (Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
- // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token.
+ // VOP2b (v_add_u32, v_sub_u32 ...) DPP uses the "vcc" token.
// Skip it.
continue;
} if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
@@ -3541,13 +4492,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
cvtSDWA(Inst, Operands, SIInstrFlags::VOP2);
}
+void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true);
+}
+
void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
- cvtSDWA(Inst, Operands, SIInstrFlags::VOPC);
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI());
}
void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType) {
+ uint64_t BasicInstType, bool skipVcc) {
+ using namespace llvm::AMDGPU::SDWA;
OptionalImmIndexMap OptionalIdx;
+ bool skippedVcc = false;
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
@@ -3557,15 +4514,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- // Add the register arguments
- if ((BasicInstType == SIInstrFlags::VOPC ||
- BasicInstType == SIInstrFlags::VOP2)&&
- Op.isReg() &&
- Op.Reg.RegNo == AMDGPU::VCC) {
- // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst.
- // Skip it.
- continue;
- } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
+ if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) {
+ // VOP2b (v_add_u32, v_sub_u32 ...) SDWA uses the "vcc" token as dst.
+ // Skip it if it is the 2nd operand (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
+ // or the 4th one (v_addc_u32_sdwa v1, vcc, v2, v3, vcc).
+ // Skip VCC only if we did not skip it on the previous iteration.
+ if (BasicInstType == SIInstrFlags::VOP2 &&
+ (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) {
+ skippedVcc = true;
+ continue;
+ } else if (BasicInstType == SIInstrFlags::VOPC &&
+ Inst.getNumOperands() == 0) {
+ skippedVcc = true;
+ continue;
+ }
+ }
+ if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegWithInputModsOperands(Inst, 2);
} else if (Op.isImm()) {
// Handle optional arguments
@@ -3573,29 +4537,38 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
} else {
llvm_unreachable("Invalid operand type");
}
+ skippedVcc = false;
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
-
- if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
- // V_NOP_sdwa_vi has no optional sdwa arguments
+ if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 &&
+ Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) {
+ // V_NOP_sdwa_vi and V_NOP_sdwa_gfx9 have no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+ }
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
break;
case SIInstrFlags::VOP2:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+ }
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
case SIInstrFlags::VOPC:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
break;
default:
@@ -3609,10 +4582,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) {
auto it = Inst.begin();
std::advance(
- it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
+ it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
Inst.insert(it, Inst.getOperand(0)); // src2 = dst
}
-
}
/// Force static initialization.
diff --git a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 45a7fe6..2e96c14 100644
--- a/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -11,7 +11,9 @@ def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
-def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">;
+def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantRoot]>;
+def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantRoot], 20>;
+
def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
@@ -21,8 +23,8 @@ def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset"
class MubufLoad <SDPatternOperator op> : PatFrag <
(ops node:$ptr), (op node:$ptr), [{
auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+ return AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
def mubuf_load : MubufLoad <load>;
@@ -55,6 +57,11 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> {
string OpName = NAME # suffix;
}
+class MTBUFAddr64Table <bit is_addr64, string suffix = ""> {
+ bit IsAddr64 = is_addr64;
+ string OpName = NAME # suffix;
+}
+
//===----------------------------------------------------------------------===//
// MTBUF classes
//===----------------------------------------------------------------------===//
@@ -76,14 +83,31 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
let EXP_CNT = 1;
let MTBUF = 1;
let Uses = [EXEC];
-
let hasSideEffects = 0;
let SchedRW = [WriteVMEM];
+
+ let AsmMatchConverter = "cvtMtbuf";
+
+ bits<1> offen = 0;
+ bits<1> idxen = 0;
+ bits<1> addr64 = 0;
+ bits<1> has_vdata = 1;
+ bits<1> has_vaddr = 1;
+ bits<1> has_glc = 1;
+ bits<1> glc_value = 0; // the value for glc if no such operand
+ bits<4> dfmt_value = 1; // the value for dfmt if no such operand
+ bits<3> nfmt_value = 0; // the value for nfmt if no such operand
+ bits<1> has_srsrc = 1;
+ bits<1> has_soffset = 1;
+ bits<1> has_offset = 1;
+ bits<1> has_slc = 1;
+ bits<1> has_tfe = 1;
+ bits<1> has_dfmt = 1;
+ bits<1> has_nfmt = 1;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
- InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- Enc64 {
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -95,57 +119,168 @@ class MTBUF_Real <MTBUF_Pseudo ps> :
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
- bits<8> vdata;
bits<12> offset;
- bits<1> offen;
- bits<1> idxen;
- bits<1> glc;
- bits<1> addr64;
- bits<4> dfmt;
- bits<3> nfmt;
- bits<8> vaddr;
- bits<7> srsrc;
- bits<1> slc;
- bits<1> tfe;
- bits<8> soffset;
-
- let Inst{11-0} = offset;
- let Inst{12} = offen;
- let Inst{13} = idxen;
- let Inst{14} = glc;
- let Inst{22-19} = dfmt;
- let Inst{25-23} = nfmt;
- let Inst{31-26} = 0x3a; //encoding
- let Inst{39-32} = vaddr;
- let Inst{47-40} = vdata;
- let Inst{52-48} = srsrc{6-2};
- let Inst{54} = slc;
- let Inst{55} = tfe;
- let Inst{63-56} = soffset;
+ bits<1> glc;
+ bits<4> dfmt;
+ bits<3> nfmt;
+ bits<8> vaddr;
+ bits<8> vdata;
+ bits<7> srsrc;
+ bits<1> slc;
+ bits<1> tfe;
+ bits<8> soffset;
+}
+
+class getMTBUFInsDA<list<RegisterClass> vdataList,
+ list<RegisterClass> vaddrList=[]> {
+ RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
+ RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
+ dag InsNoData = !if(!empty(vaddrList),
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe),
+ (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe)
+ );
+ dag InsData = !if(!empty(vaddrList),
+ (ins vdataClass:$vdata, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ slc:$slc, tfe:$tfe),
+ (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
+ SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc,
+ slc:$slc, tfe:$tfe)
+ );
+ dag ret = !if(!empty(vdataList), InsNoData, InsData);
+}
+
+class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+ dag ret =
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+ (ins))))));
+}
+
+class getMTBUFAsmOps<int addrKind> {
+ string Pfx =
+ !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset",
+ !if(!eq(addrKind, BUFAddrKind.OffEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen",
+ !if(!eq(addrKind, BUFAddrKind.IdxEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen",
+ !if(!eq(addrKind, BUFAddrKind.BothEn),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen",
+ !if(!eq(addrKind, BUFAddrKind.Addr64),
+ "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64",
+ "")))));
+ string ret = Pfx # "$offset";
+}
+
+class MTBUF_SetupAddr<int addrKind> {
+ bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0));
+
+ bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0);
+
+ bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1);
}
-class MTBUF_Load_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
- opName, (outs regClass:$dst),
- (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
- i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc,
- i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
- " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
- " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+class MTBUF_Load_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind>
+ : MTBUF_Pseudo<opName,
+ (outs vdataClass:$vdata),
+ getMTBUFIns<addrKindCopy>.ret,
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MTBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 1;
let mayStore = 0;
}
-class MTBUF_Store_Pseudo <string opName, RegisterClass regClass> : MTBUF_Pseudo <
- opName, (outs),
- (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
- i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr,
- SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset),
- " $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"#
- " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> {
+multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+ ValueType load_vt = i32,
+ SDPatternOperator ld = null_frag> {
+
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$dfmt,
+ i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MTBUFAddr64Table<0>;
+
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set load_vt:$vdata,
+ (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
+ i8:$dfmt, i8:$nfmt, i1:$glc, i1:$slc, i1:$tfe)))]>,
+ MTBUFAddr64Table<1>;
+
+ def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+class MTBUF_Store_Pseudo <string opName,
+ int addrKind,
+ RegisterClass vdataClass,
+ list<dag> pattern=[],
+ // Workaround bug bz30254
+ int addrKindCopy = addrKind,
+ RegisterClass vdataClassCopy = vdataClass>
+ : MTBUF_Pseudo<opName,
+ (outs),
+ getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe",
+ pattern>,
+ MTBUF_SetupAddr<addrKindCopy> {
+ let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
}
+multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+ ValueType store_vt = i32,
+ SDPatternOperator st = null_frag> {
+
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+ i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i1:$slc, i1:$tfe))]>,
+ MTBUFAddr64Table<0>;
+
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+ i16:$offset, i8:$dfmt, i8:$nfmt, i1:$glc,
+ i1:$slc, i1:$tfe))]>,
+ MTBUFAddr64Table<1>;
+
+ def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+
+ let DisableWQM = 1 in {
+ def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
+ def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
+}
+
+
//===----------------------------------------------------------------------===//
// MUBUF classes
//===----------------------------------------------------------------------===//
@@ -674,14 +809,14 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
// MTBUF Instructions
//===----------------------------------------------------------------------===//
-//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>;
-//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
-def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>;
-def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
-def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
-def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>;
-def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_128>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
} // End let SubtargetPredicate = isGCN
@@ -705,12 +840,6 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
let Predicates = [isGCN] in {
-// int_SI_vs_load_input
-def : Pat<
- (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
- (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
->;
-
// Offset in an 32-bit VGPR
def : Pat <
(SIload_constant v4i32:$sbase, i32:$voff),
@@ -964,21 +1093,30 @@ defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
} // End Predicates = [Has16BitInsts]
-class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat <
- (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))),
- (Instr $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
->;
+multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
+ MUBUF_Pseudo InstrOffset,
+ ValueType vt, PatFrag ld> {
+ def : Pat <
+ (vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset))),
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+
+ def : Pat <
+ (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+}
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, v2i32, load_private>;
-def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, v4i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
// BUFFER_LOAD_DWORD*, addr64=0
multiclass MUBUF_Load_Dword <ValueType vt,
@@ -1060,40 +1198,126 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
-class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat <
- (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
- u16imm:$offset)),
- (Instr $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
->;
+multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
+ MUBUF_Pseudo InstrOffset,
+ ValueType vt, PatFrag st> {
+ def : Pat <
+ (st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
+ i32:$soffset, u16imm:$offset)),
+ (InstrOffen $value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+
+ def : Pat <
+ (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
+ u16imm:$offset)),
+ (InstrOffset $value, $srsrc, $soffset, $offset, 0, 0, 0)
+ >;
+}
-def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
-def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private>;
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
//===----------------------------------------------------------------------===//
// MTBUF Patterns
//===----------------------------------------------------------------------===//
-// TBUFFER_STORE_FORMAT_*, addr64=0
-class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
- (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
- i32:$soffset, imm:$inst_offset, imm:$dfmt,
- imm:$nfmt, imm:$offen, imm:$idxen,
- imm:$glc, imm:$slc, imm:$tfe),
- (opcode
- $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
- (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
- (as_i1imm $slc), (as_i1imm $tfe), $soffset)
->;
+//===----------------------------------------------------------------------===//
+// tbuffer_load/store_format patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)),
+ (!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
+
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4i32, "TBUFFER_LOAD_FORMAT_XYZW">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, f32, "TBUFFER_LOAD_FORMAT_X">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
+defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
+
+multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
+ imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
+ (as_i16imm $offset), (as_i8imm $dfmt),
+ (as_i8imm $nfmt), (as_i1imm $glc),
+ (as_i1imm $slc), 0)
+ >;
+
+ def : Pat<
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
+ imm:$offset, imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc),
+ (!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
+ $vdata,
+ (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+ $rsrc, $soffset, (as_i16imm $offset),
+ (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0)
+ >;
+}
-def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
-def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
-def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
-def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4i32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4i32, "TBUFFER_STORE_FORMAT_XYZW">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, f32, "TBUFFER_STORE_FORMAT_X">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_x3, v4f32, "TBUFFER_STORE_FORMAT_XYZ">;
+defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
} // End let Predicates = [isGCN]
@@ -1209,21 +1433,44 @@ def BUFFER_WBINVL1_si : MUBUF_Real_si <0x71, BUFFER_WBINVL1>;
class MTBUF_Real_si <bits<3> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
+ Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.SI> {
let AssemblerPredicate=isSICI;
let DecoderNamespace="SICI";
- bits<1> addr64;
- let Inst{15} = addr64;
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
+ let Inst{15} = ps.addr64;
let Inst{18-16} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_si<bits<3> op> {
+ def _OFFSET_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _ADDR64_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_ADDR64")>;
+ def _OFFEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_si : MTBUF_Real_si <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>;
//===----------------------------------------------------------------------===//
// CI
@@ -1335,16 +1582,39 @@ def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
class MTBUF_Real_vi <bits<4> op, MTBUF_Pseudo ps> :
MTBUF_Real<ps>,
+ Enc64,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.VI> {
let AssemblerPredicate=isVI;
let DecoderNamespace="VI";
+ let Inst{11-0} = !if(ps.has_offset, offset, ?);
+ let Inst{12} = ps.offen;
+ let Inst{13} = ps.idxen;
+ let Inst{14} = !if(ps.has_glc, glc, ps.glc_value);
let Inst{18-15} = op;
+ let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value);
+ let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value);
+ let Inst{31-26} = 0x3a; //encoding
+ let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{47-40} = !if(ps.has_vdata, vdata, ?);
+ let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?);
+ let Inst{54} = !if(ps.has_slc, slc, ?);
+ let Inst{55} = !if(ps.has_tfe, tfe, ?);
+ let Inst{63-56} = !if(ps.has_soffset, soffset, ?);
}
-def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
-def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
-def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
-def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>;
-def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>;
+multiclass MTBUF_Real_AllAddr_vi<bits<4> op> {
+ def _OFFSET_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET")>;
+ def _OFFEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN")>;
+ def _IDXEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN")>;
+ def _BOTHEN_vi : MTBUF_Real_vi <op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+}
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>;
+//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
index a077001..fc516c3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -88,18 +88,6 @@ class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
let has_vdst = 0;
}
-class DS_1A_Off8_NORET<string opName> : DS_Pseudo<opName,
- (outs),
- (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, gds:$gds),
- "$addr $offset0$offset1$gds"> {
-
- let has_data0 = 0;
- let has_data1 = 0;
- let has_vdst = 0;
- let has_offset = 0;
- let AsmMatchConverter = "cvtDSOffset01";
-}
-
class DS_1A2D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
@@ -143,10 +131,24 @@ class DS_1A2D_RET<string opName,
let hasPostISelHook = 1;
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
+class DS_1A2D_Off8_RET<string opName,
+ RegisterClass rc = VGPR_32,
+ RegisterClass src = rc>
: DS_Pseudo<opName,
(outs rc:$vdst),
- (ins VGPR_32:$addr, offset:$offset, gds:$gds),
+ (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
+ "$vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
+
+ let has_offset = 0;
+ let AsmMatchConverter = "cvtDSOffset01";
+
+ let hasPostISelHook = 1;
+}
+
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset>
+: DS_Pseudo<opName,
+ (outs rc:$vdst),
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds),
"$vdst, $addr$offset$gds"> {
let has_data0 = 0;
@@ -174,6 +176,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
let has_data1 = 0;
let has_gds = 0;
let gdsValue = 1;
+ let AsmMatchConverter = "cvtDSGds";
}
class DS_0A_RET <string opName> : DS_Pseudo<opName,
@@ -202,20 +205,46 @@ class DS_1A <string opName> : DS_Pseudo<opName,
let has_data1 = 0;
}
-class DS_1A_GDS <string opName> : DS_Pseudo<opName,
- (outs),
- (ins VGPR_32:$addr),
- "$addr gds"> {
+class DS_GWS <string opName, dag ins, string asmOps>
+: DS_Pseudo<opName, (outs), ins, asmOps> {
+
+ let has_vdst = 0;
+ let has_addr = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+
+ let has_gds = 0;
+ let gdsValue = 1;
+ let AsmMatchConverter = "cvtDSGds";
+}
- let has_vdst = 0;
- let has_data0 = 0;
- let has_data1 = 0;
- let has_offset = 0;
+class DS_GWS_0D <string opName>
+: DS_GWS<opName,
+ (ins offset:$offset, gds:$gds), "$offset gds">;
+
+class DS_GWS_1D <string opName>
+: DS_GWS<opName,
+ (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
+
+ let has_data0 = 1;
+}
+
+class DS_VOID <string opName> : DS_Pseudo<opName,
+ (outs), (ins), ""> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 1;
+ let UseNamedOperandTable = 0;
+ let AsmMatchConverter = "";
+
+ let has_vdst = 0;
+ let has_addr = 0;
+ let has_data0 = 0;
+ let has_data1 = 0;
+ let has_offset = 0;
let has_offset0 = 0;
let has_offset1 = 0;
-
- let has_gds = 0;
- let gdsValue = 1;
+ let has_gds = 0;
}
class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
@@ -324,9 +353,9 @@ def DS_MAX_RTN_F32 : DS_1A1D_RET <"ds_max_rtn_f32">,
def DS_WRXCHG_RTN_B32 : DS_1A1D_RET<"ds_wrxchg_rtn_b32">,
AtomicNoRet<"", 1>;
-def DS_WRXCHG2_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
+def DS_WRXCHG2_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
AtomicNoRet<"", 1>;
-def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
+def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
AtomicNoRet<"", 1>;
def DS_ADD_RTN_U64 : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>,
@@ -365,17 +394,17 @@ def DS_MAX_RTN_F64 : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>,
AtomicNoRet<"ds_max_f64", 1>;
def DS_WRXCHG_RTN_B64 : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>,
- AtomicNoRet<"ds_wrxchg_b64", 1>;
-def DS_WRXCHG2_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
- AtomicNoRet<"ds_wrxchg2_b64", 1>;
-def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
- AtomicNoRet<"ds_wrxchg2st64_b64", 1>;
-
-def DS_GWS_INIT : DS_1A_GDS<"ds_gws_init">;
-def DS_GWS_SEMA_V : DS_1A_GDS<"ds_gws_sema_v">;
-def DS_GWS_SEMA_BR : DS_1A_GDS<"ds_gws_sema_br">;
-def DS_GWS_SEMA_P : DS_1A_GDS<"ds_gws_sema_p">;
-def DS_GWS_BARRIER : DS_1A_GDS<"ds_gws_barrier">;
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"", 1>;
+def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
+ AtomicNoRet<"", 1>;
+
+def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
+def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
+def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
+def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
+def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
@@ -386,7 +415,7 @@ def DS_MIN_SRC2_I32 : DS_1A<"ds_min_src2_i32">;
def DS_MAX_SRC2_I32 : DS_1A<"ds_max_src2_i32">;
def DS_MIN_SRC2_U32 : DS_1A<"ds_min_src2_u32">;
def DS_MAX_SRC2_U32 : DS_1A<"ds_max_src2_u32">;
-def DS_AND_SRC2_B32 : DS_1A<"ds_and_src_b32">;
+def DS_AND_SRC2_B32 : DS_1A<"ds_and_src2_b32">;
def DS_OR_SRC2_B32 : DS_1A<"ds_or_src2_b32">;
def DS_XOR_SRC2_B32 : DS_1A<"ds_xor_src2_b32">;
def DS_MIN_SRC2_F32 : DS_1A<"ds_min_src2_f32">;
@@ -407,11 +436,11 @@ def DS_XOR_SRC2_B64 : DS_1A<"ds_xor_src2_b64">;
def DS_MIN_SRC2_F64 : DS_1A<"ds_min_src2_f64">;
def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">;
-def DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET<"ds_write_src2_b32">;
-def DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET<"ds_write_src2_b64">;
+def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
+def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32">;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>;
}
let mayStore = 0 in {
@@ -429,30 +458,34 @@ def DS_READ2_B64 : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>;
def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
}
-let SubtargetPredicate = isSICI in {
def DS_CONSUME : DS_0A_RET<"ds_consume">;
def DS_APPEND : DS_0A_RET<"ds_append">;
def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
-}
//===----------------------------------------------------------------------===//
// Instruction definitions for CI and newer.
//===----------------------------------------------------------------------===//
-// Remaining instructions:
-// DS_NOP
-// DS_GWS_SEMA_RELEASE_ALL
-// DS_WRAP_RTN_B32
-// DS_CNDXCHG32_RTN_B64
-// DS_WRITE_B96
-// DS_WRITE_B128
-// DS_CONDXCHG32_RTN_B128
-// DS_READ_B96
-// DS_READ_B128
let SubtargetPredicate = isCIVI in {
-def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">,
- AtomicNoRet<"ds_wrap_f32", 1>;
+def DS_WRAP_RTN_B32 : DS_1A2D_RET<"ds_wrap_rtn_b32">, AtomicNoRet<"", 1>;
+
+def DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET<"ds_condxchg32_rtn_b64", VReg_64>,
+ AtomicNoRet<"", 1>;
+
+def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
+
+let mayStore = 0 in {
+def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>;
+def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>;
+} // End mayStore = 0
+
+let mayLoad = 0 in {
+def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>;
+def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>;
+} // End mayLoad = 0
+
+def DS_NOP : DS_VOID<"ds_nop">;
} // let SubtargetPredicate = isCIVI
@@ -623,6 +656,7 @@ def DS_CMPST_B32_si : DS_Real_si<0x10, DS_CMPST_B32>;
def DS_CMPST_F32_si : DS_Real_si<0x11, DS_CMPST_F32>;
def DS_MIN_F32_si : DS_Real_si<0x12, DS_MIN_F32>;
def DS_MAX_F32_si : DS_Real_si<0x13, DS_MAX_F32>;
+def DS_NOP_si : DS_Real_si<0x14, DS_NOP>;
def DS_GWS_INIT_si : DS_Real_si<0x19, DS_GWS_INIT>;
def DS_GWS_SEMA_V_si : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
def DS_GWS_SEMA_BR_si : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
@@ -651,8 +685,10 @@ def DS_CMPST_RTN_F32_si : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
def DS_MIN_RTN_F32_si : DS_Real_si<0x32, DS_MIN_RTN_F32>;
def DS_MAX_RTN_F32_si : DS_Real_si<0x33, DS_MAX_RTN_F32>;
-// FIXME: this instruction is actually CI/VI
-def DS_WRAP_RTN_F32_si : DS_Real_si<0x34, DS_WRAP_RTN_F32>;
+// These instructions are CI/VI-only
+def DS_WRAP_RTN_B32_si : DS_Real_si<0x34, DS_WRAP_RTN_B32>;
+def DS_CONDXCHG32_RTN_B64_si : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>;
+def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>;
def DS_SWIZZLE_B32_si : DS_Real_si<0x35, DS_SWIZZLE_B32>;
def DS_READ_B32_si : DS_Real_si<0x36, DS_READ_B32>;
@@ -744,6 +780,10 @@ def DS_WRITE_SRC2_B64_si : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
def DS_MIN_SRC2_F64_si : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
def DS_MAX_SRC2_F64_si : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
+def DS_WRITE_B96_si : DS_Real_si<0xde, DS_WRITE_B96>;
+def DS_WRITE_B128_si : DS_Real_si<0xdf, DS_WRITE_B128>;
+def DS_READ_B96_si : DS_Real_si<0xfe, DS_READ_B96>;
+def DS_READ_B128_si : DS_Real_si<0xff, DS_READ_B128>;
//===----------------------------------------------------------------------===//
// VIInstructions.td
@@ -787,12 +827,13 @@ def DS_CMPST_B32_vi : DS_Real_vi<0x10, DS_CMPST_B32>;
def DS_CMPST_F32_vi : DS_Real_vi<0x11, DS_CMPST_F32>;
def DS_MIN_F32_vi : DS_Real_vi<0x12, DS_MIN_F32>;
def DS_MAX_F32_vi : DS_Real_vi<0x13, DS_MAX_F32>;
+def DS_NOP_vi : DS_Real_vi<0x14, DS_NOP>;
def DS_ADD_F32_vi : DS_Real_vi<0x15, DS_ADD_F32>;
-def DS_GWS_INIT_vi : DS_Real_vi<0x19, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_vi : DS_Real_vi<0x1a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_vi : DS_Real_vi<0x1c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_vi : DS_Real_vi<0x1d, DS_GWS_BARRIER>;
+def DS_GWS_INIT_vi : DS_Real_vi<0x99, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_vi : DS_Real_vi<0x9a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_vi : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_vi : DS_Real_vi<0x9c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_vi : DS_Real_vi<0x9d, DS_GWS_BARRIER>;
def DS_WRITE_B8_vi : DS_Real_vi<0x1e, DS_WRITE_B8>;
def DS_WRITE_B16_vi : DS_Real_vi<0x1f, DS_WRITE_B16>;
def DS_ADD_RTN_U32_vi : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
@@ -815,7 +856,7 @@ def DS_CMPST_RTN_B32_vi : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
def DS_CMPST_RTN_F32_vi : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
def DS_MIN_RTN_F32_vi : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
def DS_MAX_RTN_F32_vi : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
-def DS_WRAP_RTN_F32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_F32>;
+def DS_WRAP_RTN_B32_vi : DS_Real_vi<0x34, DS_WRAP_RTN_B32>;
def DS_ADD_RTN_F32_vi : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
def DS_READ_B32_vi : DS_Real_vi<0x36, DS_READ_B32>;
def DS_READ2_B32_vi : DS_Real_vi<0x37, DS_READ2_B32>;
@@ -824,6 +865,9 @@ def DS_READ_I8_vi : DS_Real_vi<0x39, DS_READ_I8>;
def DS_READ_U8_vi : DS_Real_vi<0x3a, DS_READ_U8>;
def DS_READ_I16_vi : DS_Real_vi<0x3b, DS_READ_I16>;
def DS_READ_U16_vi : DS_Real_vi<0x3c, DS_READ_U16>;
+def DS_CONSUME_vi : DS_Real_vi<0xbd, DS_CONSUME>;
+def DS_APPEND_vi : DS_Real_vi<0xbe, DS_APPEND>;
+def DS_ORDERED_COUNT_vi : DS_Real_vi<0xbf, DS_ORDERED_COUNT>;
def DS_SWIZZLE_B32_vi : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
def DS_PERMUTE_B32_vi : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
def DS_BPERMUTE_B32_vi : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
@@ -865,6 +909,8 @@ def DS_MSKOR_RTN_B64_vi : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
def DS_WRXCHG_RTN_B64_vi : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CONDXCHG32_RTN_B64_vi : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>;
+def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>;
def DS_CMPST_RTN_B64_vi : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
def DS_CMPST_RTN_F64_vi : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
def DS_MIN_RTN_F64_vi : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
@@ -904,3 +950,7 @@ def DS_XOR_SRC2_B64_vi : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
def DS_WRITE_SRC2_B64_vi : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
def DS_MIN_SRC2_F64_vi : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
def DS_MAX_SRC2_F64_vi : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
+def DS_WRITE_B96_vi : DS_Real_vi<0xde, DS_WRITE_B96>;
+def DS_WRITE_B128_vi : DS_Real_vi<0xdf, DS_WRITE_B128>;
+def DS_READ_B96_vi : DS_Real_vi<0xfe, DS_READ_B96>;
+def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2247cad..966c6fe 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -20,20 +20,20 @@
#include "AMDGPUDisassembler.h"
#include "AMDGPU.h"
#include "AMDGPURegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/Endian.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/TargetRegistry.h"
-
using namespace llvm;
#define DEBUG_TYPE "amdgpu-disassembler"
@@ -49,6 +49,17 @@ addOperand(MCInst &Inst, const MCOperand& Opnd) {
MCDisassembler::SoftFail;
}
+static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
+ uint16_t NameIdx) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), NameIdx);
+ if (OpIdx != -1) {
+ auto I = MI.begin();
+ std::advance(I, OpIdx);
+ MI.insert(I, Op);
+ }
+ return OpIdx;
+}
+
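insertNamedMCOperand looks up the operand index the opcode expects for a given name and splices the operand in at that position, so decoding can backfill operands the encoding omits (as the V_MAC src2_modifiers fixup later in getInstruction does). A minimal stand-alone analogue, with a toy operand table in place of the real AMDGPU::getNamedOperandIdx:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using Operand = int;

static int getNamedOperandIdx(const std::map<std::string, int> &Table,
                              const std::string &Name) {
  auto It = Table.find(Name);
  return It == Table.end() ? -1 : It->second;
}

static int insertNamedOperand(std::vector<Operand> &Ops,
                              const std::map<std::string, int> &Table,
                              Operand Op, const std::string &Name) {
  int OpIdx = getNamedOperandIdx(Table, Name);
  if (OpIdx != -1)
    Ops.insert(Ops.begin() + OpIdx, Op); // same advance-and-insert idea
  return OpIdx;
}

int main() {
  // V_MAC-like layout with src2_modifiers expected at index 2.
  std::map<std::string, int> Table = {
      {"vdst", 0}, {"src0", 1}, {"src2_modifiers", 2}, {"src2", 3}};
  std::vector<Operand> Ops = {100, 101, 102}; // vdst, src0, src2
  insertNamedOperand(Ops, Table, 0, "src2_modifiers"); // dummy imm 0
  for (Operand O : Ops)
    printf("%d ", O); // 100 101 0 102
  printf("\n");
}
```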
static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
@@ -61,32 +72,34 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
return addOperand(Inst, MCOperand::createImm(Imm));
}
-#define DECODE_OPERAND2(RegClass, DecName) \
-static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \
- unsigned Imm, \
- uint64_t /*Addr*/, \
- const void *Decoder) { \
+#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
+static DecodeStatus StaticDecoderName(MCInst &Inst, \
+ unsigned Imm, \
+ uint64_t /*Addr*/, \
+ const void *Decoder) { \
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \
- return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \
+ return addOperand(Inst, DAsm->DecoderName(Imm)); \
}
-#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass)
+#define DECODE_OPERAND_REG(RegClass) \
+DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
-DECODE_OPERAND(VGPR_32)
-DECODE_OPERAND(VS_32)
-DECODE_OPERAND(VS_64)
+DECODE_OPERAND_REG(VGPR_32)
+DECODE_OPERAND_REG(VS_32)
+DECODE_OPERAND_REG(VS_64)
+DECODE_OPERAND_REG(VS_128)
-DECODE_OPERAND(VReg_64)
-DECODE_OPERAND(VReg_96)
-DECODE_OPERAND(VReg_128)
+DECODE_OPERAND_REG(VReg_64)
+DECODE_OPERAND_REG(VReg_96)
+DECODE_OPERAND_REG(VReg_128)
-DECODE_OPERAND(SReg_32)
-DECODE_OPERAND(SReg_32_XM0_XEXEC)
-DECODE_OPERAND(SReg_64)
-DECODE_OPERAND(SReg_64_XEXEC)
-DECODE_OPERAND(SReg_128)
-DECODE_OPERAND(SReg_256)
-DECODE_OPERAND(SReg_512)
+DECODE_OPERAND_REG(SReg_32)
+DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
+DECODE_OPERAND_REG(SReg_64)
+DECODE_OPERAND_REG(SReg_64_XEXEC)
+DECODE_OPERAND_REG(SReg_128)
+DECODE_OPERAND_REG(SReg_256)
+DECODE_OPERAND_REG(SReg_512)
static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
@@ -97,9 +110,20 @@ static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
}
-#define GET_SUBTARGETINFO_ENUM
-#include "AMDGPUGenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
+static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
+ unsigned Imm,
+ uint64_t Addr,
+ const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
+}
+
+#define DECODE_SDWA(DecName) \
+DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
+
+DECODE_SDWA(Src32)
+DECODE_SDWA(Src16)
+DECODE_SDWA(VopcDst)
#include "AMDGPUGenDisassemblerTables.inc"
@@ -121,6 +145,7 @@ DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table,
assert(MI.getOpcode() == 0);
assert(MI.getNumOperands() == 0);
MCInst TmpInst;
+ HasLiteral = false;
const auto SavedBytes = Bytes;
if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
MI = TmpInst;
@@ -136,9 +161,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
raw_ostream &WS,
raw_ostream &CS) const {
CommentStream = &CS;
+ bool IsSDWA = false;
// ToDo: AMDGPUDisassembler supports only VI ISA.
- assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA.");
+ if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding])
+ report_fatal_error("Disassembly not yet supported for subtarget");
const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
Bytes = Bytes_.slice(0, MaxInstBytesNum);
@@ -156,7 +183,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res) break;
Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
- if (Res) break;
+ if (Res) { IsSDWA = true; break; }
+
+ Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
+ if (Res) { IsSDWA = true; break; }
}
// Reinitialize Bytes as DPP64 could have eaten too much
@@ -179,10 +209,40 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
} while (false);
+ if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
+ // Insert dummy unused src2_modifiers.
+ insertNamedMCOperand(MI, MCOperand::createImm(0),
+ AMDGPU::OpName::src2_modifiers);
+ }
+
+ if (Res && IsSDWA)
+ Res = convertSDWAInst(MI);
+
Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
return Res;
}
+DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+ if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst) != -1)
+ // VOPC - insert clamp
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
+ } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
+ if (SDst != -1) {
+ // VOPC - insert VCC register as sdst
+ insertNamedMCOperand(MI, MCOperand::createReg(AMDGPU::VCC),
+ AMDGPU::OpName::sdst);
+ } else {
+ // VOP1/2 - insert omod if present in instruction
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::omod);
+ }
+ }
+ return MCDisassembler::Success;
+}
+
const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const {
return getContext().getRegisterInfo()->
getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]);
@@ -259,10 +319,18 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
return decodeSrcOp(OPW64, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const {
+ return decodeSrcOp(OPW128, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
return decodeSrcOp(OPW16, Val);
}
+MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const {
+ return decodeSrcOp(OPWV216, Val);
+}
+
MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
// Some instructions have operand restrictions beyond what the encoding
// allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
@@ -322,10 +390,15 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
// ToDo: deal with float/double constants
- if (Bytes.size() < 4)
- return errOperand(0, "cannot read literal, inst bytes left " +
- Twine(Bytes.size()));
- return MCOperand::createImm(eatBytes<uint32_t>(Bytes));
+ if (!HasLiteral) {
+ if (Bytes.size() < 4) {
+ return errOperand(0, "cannot read literal, inst bytes left " +
+ Twine(Bytes.size()));
+ }
+ HasLiteral = true;
+ Literal = eatBytes<uint32_t>(Bytes);
+ }
+ return MCOperand::createImm(Literal);
}
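The HasLiteral/Literal pair caches the 32-bit literal on first read so that a second operand referring to the same literal does not consume another four bytes from the instruction stream; tryDecodeInst resets the flag per decode attempt. A minimal sketch of the same caching pattern, with stand-ins for the real Decoder/eatBytes machinery:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

struct LiteralCache {
  bool HasLiteral = false; // reset per instruction, as in tryDecodeInst
  uint32_t Literal = 0;

  bool decodeLiteral(std::vector<uint8_t> &Bytes, uint32_t &Out) {
    if (!HasLiteral) {
      if (Bytes.size() < 4)
        return false; // "cannot read literal, inst bytes left N"
      std::memcpy(&Literal, Bytes.data(), 4);
      Bytes.erase(Bytes.begin(), Bytes.begin() + 4); // eatBytes<uint32_t>
      HasLiteral = true;
    }
    Out = Literal;
    return true;
  }
};

int main() {
  std::vector<uint8_t> Bytes = {0x78, 0x56, 0x34, 0x12};
  LiteralCache C;
  uint32_t A = 0, B = 0;
  C.decodeLiteral(Bytes, A); // consumes the 4 literal bytes
  C.decodeLiteral(Bytes, B); // served from the cache, stream untouched
  // 12345678 12345678 left=0 (on a little-endian host)
  printf("%08x %08x left=%zu\n", A, B, Bytes.size());
}
```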
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -423,6 +496,7 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
case OPW64:
return MCOperand::createImm(getInlineImmVal64(Imm));
case OPW16:
+ case OPWV216:
return MCOperand::createImm(getInlineImmVal16(Imm));
default:
llvm_unreachable("implement me");
@@ -436,6 +510,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
default: // fall
case OPW32:
case OPW16:
+ case OPWV216:
return VGPR_32RegClassID;
case OPW64: return VReg_64RegClassID;
case OPW128: return VReg_128RegClassID;
@@ -449,6 +524,7 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
default: // fall
case OPW32:
case OPW16:
+ case OPWV216:
return SGPR_32RegClassID;
case OPW64: return SGPR_64RegClassID;
case OPW128: return SGPR_128RegClassID;
@@ -462,6 +538,7 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
default: // fall
case OPW32:
case OPW16:
+ case OPWV216:
return TTMP_32RegClassID;
case OPW64: return TTMP_64RegClassID;
case OPW128: return TTMP_128RegClassID;
@@ -483,8 +560,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
}
- assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
-
if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
return decodeIntImmed(Val);
@@ -497,6 +572,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
switch (Width) {
case OPW32:
case OPW16:
+ case OPWV216:
return decodeSpecialReg32(Val);
case OPW64:
return decodeSpecialReg64(Val);
@@ -522,6 +598,11 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
case 124: return createRegOperand(M0);
case 126: return createRegOperand(EXEC_LO);
case 127: return createRegOperand(EXEC_HI);
+ case 235: return createRegOperand(SRC_SHARED_BASE);
+ case 236: return createRegOperand(SRC_SHARED_LIMIT);
+ case 237: return createRegOperand(SRC_PRIVATE_BASE);
+ case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
+ // TODO: SRC_POPS_EXITING_WAVE_ID
// ToDo: no support for vccz register
case 251: break;
// ToDo: no support for execz register
@@ -545,6 +626,57 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
return errOperand(Val, "unknown operand encoding " + Twine(Val));
}
+MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
+ unsigned Val) const {
+ using namespace AMDGPU::SDWA;
+
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX9]) {
+    // XXX: static_cast<int> is needed to avoid a spurious warning:
+    // "comparison with unsigned is always true"
+ if (SDWA9EncValues::SRC_VGPR_MIN <= static_cast<int>(Val) &&
+ Val <= SDWA9EncValues::SRC_VGPR_MAX) {
+ return createRegOperand(getVgprClassId(Width),
+ Val - SDWA9EncValues::SRC_VGPR_MIN);
+ }
+ if (SDWA9EncValues::SRC_SGPR_MIN <= Val &&
+ Val <= SDWA9EncValues::SRC_SGPR_MAX) {
+ return createSRegOperand(getSgprClassId(Width),
+ Val - SDWA9EncValues::SRC_SGPR_MIN);
+ }
+
+ return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN);
+ } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ return createRegOperand(getVgprClassId(Width), Val);
+ }
+ llvm_unreachable("unsupported target");
+}
+
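On GFX9 the SDWA src field is classified by value range: the low range maps to VGPRs, the next to SGPRs, and anything above falls through to the special-register decoder. A hypothetical sketch of that classification; the boundary constants are illustrative placeholders, not the real SDWA9EncValues:

```cpp
#include <cstdio>

enum { SRC_VGPR_MIN = 0, SRC_VGPR_MAX = 255,   // placeholder ranges
       SRC_SGPR_MIN = 256, SRC_SGPR_MAX = 361 };

enum class SDWASrcKind { VGPR, SGPR, Special };

static SDWASrcKind classifySDWASrc(unsigned Val, unsigned &Reg) {
  if (SRC_VGPR_MIN <= (int)Val && Val <= SRC_VGPR_MAX) {
    Reg = Val - SRC_VGPR_MIN;
    return SDWASrcKind::VGPR;
  }
  if (SRC_SGPR_MIN <= Val && Val <= SRC_SGPR_MAX) {
    Reg = Val - SRC_SGPR_MIN;
    return SDWASrcKind::SGPR;
  }
  Reg = Val - SRC_SGPR_MIN; // falls through to decodeSpecialReg32
  return SDWASrcKind::Special;
}

int main() {
  unsigned R;
  classifySDWASrc(5, R);   printf("v%u\n", R); // v5
  classifySDWASrc(260, R); printf("s%u\n", R); // s4
}
```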
+MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
+ return decodeSDWASrc(OPW16, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
+ return decodeSDWASrc(OPW32, Val);
+}
+
+MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
+ using namespace AMDGPU::SDWA;
+
+ assert(STI.getFeatureBits()[AMDGPU::FeatureGFX9] &&
+ "SDWAVopcDst should be present only on GFX9");
+ if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
+ Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+ if (Val > AMDGPU::EncValues::SGPR_MAX) {
+ return decodeSpecialReg64(Val);
+ } else {
+ return createSRegOperand(getSgprClassId(OPW64), Val);
+ }
+ } else {
+ return createRegOperand(AMDGPU::VCC);
+ }
+}
+
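decodeSDWAVopcDst keys on a single flag bit: set means an explicit SGPR-pair (or special-register) destination encoded in the low bits, clear means the implicit VCC. A toy decoder with placeholder mask values, not the real SDWA9EncValues:

```cpp
#include <cstdio>

enum { VOPC_DST_VCC_MASK = 0x80,   // placeholder masks for illustration
       VOPC_DST_SGPR_MASK = 0x7f, SGPR_MAX = 101 };

static void decodeVopcDst(unsigned Val) {
  if (Val & VOPC_DST_VCC_MASK) {
    Val &= VOPC_DST_SGPR_MASK;
    if (Val > SGPR_MAX)
      printf("special reg %u\n", Val); // decodeSpecialReg64 path
    else
      printf("s[%u:%u]\n", Val, Val + 1); // explicit 64-bit SGPR dst
  } else {
    printf("vcc\n"); // implicit VCC destination
  }
}

int main() {
  decodeVopcDst(0x00); // vcc
  decodeVopcDst(0x84); // s[4:5]
}
```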
//===----------------------------------------------------------------------===//
// AMDGPUSymbolizer
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index ee5883a..4c755be 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -20,8 +20,8 @@
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
#include "llvm/MC/MCDisassembler/MCSymbolizer.h"
-#include <cstdint>
#include <algorithm>
+#include <cstdint>
#include <memory>
namespace llvm {
@@ -39,6 +39,8 @@ class Twine;
class AMDGPUDisassembler : public MCDisassembler {
private:
mutable ArrayRef<uint8_t> Bytes;
+ mutable uint32_t Literal;
+ mutable bool HasLiteral;
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
@@ -63,10 +65,14 @@ public:
uint64_t Inst,
uint64_t Address) const;
+ DecodeStatus convertSDWAInst(MCInst &MI) const;
+
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
MCOperand decodeOperand_VS_32(unsigned Val) const;
MCOperand decodeOperand_VS_64(unsigned Val) const;
+ MCOperand decodeOperand_VS_128(unsigned Val) const;
MCOperand decodeOperand_VSrc16(unsigned Val) const;
+ MCOperand decodeOperand_VSrcV216(unsigned Val) const;
MCOperand decodeOperand_VReg_64(unsigned Val) const;
MCOperand decodeOperand_VReg_96(unsigned Val) const;
@@ -85,6 +91,7 @@ public:
OPW64,
OPW128,
OPW16,
+ OPWV216,
OPW_LAST_,
OPW_FIRST_ = OPW32
};
@@ -100,6 +107,11 @@ public:
MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
+
+ MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSDWASrc16(unsigned Val) const;
+ MCOperand decodeSDWASrc32(unsigned Val) const;
+ MCOperand decodeSDWAVopcDst(unsigned Val) const;
};
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 48c6592..5480110 100644
--- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -35,28 +35,59 @@ class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag
: EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
"MEM_RAT_CACHELESS "#name, pattern>;
-class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
- list<dag> pattern>
- : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+ dag outs, string name, list<dag> pattern>
+ : EG_CF_RAT <0x56, rat_inst, rat_id, mask, outs, ins,
"MEM_RAT "#name, pattern>;
class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop>
- : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr,
- i32imm:$rat_id, InstFlag:$eop),
+ : CF_MEM_RAT <0x1, ?, 0xf, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr,
+ i32imm:$rat_id, InstFlag:$eop), (outs),
"STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr"
#!if(has_eop, ", $eop", ""),
[(int_r600_rat_store_typed R600_Reg128:$rw_gpr,
R600_Reg128:$index_gpr,
(i32 imm:$rat_id))]>;
-def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
- (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), (outs),
"MSKOR $rw_gpr.XW, $index_gpr",
[(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
> {
let eop = 0;
}
+
+multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> {
+ let Constraints = "$rw_gpr = $out_gpr", eop = 0, mayStore = 1 in {
+ def _RTN: CF_MEM_RAT <op_ret, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+ (outs R600_Reg128:$out_gpr),
+ name ## "_RTN" ## " $rw_gpr, $index_gpr", [] >;
+ def _NORET: CF_MEM_RAT <op_noret, 0, 0xf,
+ (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+ (outs R600_Reg128:$out_gpr),
+ name ## " $rw_gpr, $index_gpr", [] >;
+ }
+}
+
+// Swap no-ret is just store. Raw store to cached target
+// can only store on dword, which exactly matches swap_no_ret.
+defm RAT_ATOMIC_XCHG_INT : RAT_ATOMIC<1, 34, "ATOMIC_XCHG_INT">;
+defm RAT_ATOMIC_CMPXCHG_INT : RAT_ATOMIC<4, 36, "ATOMIC_CMPXCHG_INT">;
+defm RAT_ATOMIC_ADD : RAT_ATOMIC<7, 39, "ATOMIC_ADD">;
+defm RAT_ATOMIC_SUB : RAT_ATOMIC<8, 40, "ATOMIC_SUB">;
+defm RAT_ATOMIC_RSUB : RAT_ATOMIC<9, 41, "ATOMIC_RSUB">;
+defm RAT_ATOMIC_MIN_INT : RAT_ATOMIC<10, 42, "ATOMIC_MIN_INT">;
+defm RAT_ATOMIC_MIN_UINT : RAT_ATOMIC<11, 43, "ATOMIC_MIN_UINT">;
+defm RAT_ATOMIC_MAX_INT : RAT_ATOMIC<12, 44, "ATOMIC_MAX_INT">;
+defm RAT_ATOMIC_MAX_UINT : RAT_ATOMIC<13, 45, "ATOMIC_MAX_UINT">;
+defm RAT_ATOMIC_AND : RAT_ATOMIC<14, 46, "ATOMIC_AND">;
+defm RAT_ATOMIC_OR : RAT_ATOMIC<15, 47, "ATOMIC_OR">;
+defm RAT_ATOMIC_XOR : RAT_ATOMIC<16, 48, "ATOMIC_XOR">;
+defm RAT_ATOMIC_INC_UINT : RAT_ATOMIC<18, 50, "ATOMIC_INC_UINT">;
+defm RAT_ATOMIC_DEC_UINT : RAT_ATOMIC<19, 51, "ATOMIC_DEC_UINT">;
+
} // End let Predicates = [isEGorCayman]
//===----------------------------------------------------------------------===//
@@ -257,6 +288,76 @@ def : Pat<(v4i32:$dst_gpr (vtx_id1_load ADDRVTX_READ:$src_gpr)),
let Predicates = [isEGorCayman] in {
+multiclass AtomicPat<Instruction inst_ret, Instruction inst_noret,
+ SDPatternOperator node_ret, SDPatternOperator node_noret> {
+ // FIXME: Add _RTN version. We need per WI scratch location to store the old value
+ // EXTRACT_SUBREG here is dummy, we know the node has no uses
+ def : Pat<(i32 (node_noret i32:$ptr, i32:$data)),
+ (EXTRACT_SUBREG (inst_noret
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $data, sub0), $ptr), sub1)>;
+}
+multiclass AtomicIncDecPat<Instruction inst_ret, Instruction inst_noret,
+ SDPatternOperator node_ret, SDPatternOperator node_noret, int C> {
+ // FIXME: Add _RTN version. We need per WI scratch location to store the old value
+ // EXTRACT_SUBREG here is dummy, we know the node has no uses
+ def : Pat<(i32 (node_noret i32:$ptr, C)),
+ (EXTRACT_SUBREG (inst_noret
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (MOV_IMM_I32 -1), sub0), $ptr), sub1)>;
+}
+
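AtomicIncDecPat maps atomic add/sub of ±1 onto the unsigned inc/dec RAT ops by passing MOV_IMM_I32 -1 as the wrap bound: with a threshold of 0xffffffff, inc/dec-with-wrap degenerates to plain ±1 modulo 2^32. A toy demonstration, assuming the usual inc/dec-with-threshold semantics:

```cpp
#include <cstdint>
#include <cstdio>

static uint32_t atomicIncUint(uint32_t M, uint32_t Bound) {
  return M >= Bound ? 0 : M + 1; // hardware-style inc-with-wrap
}
static uint32_t atomicDecUint(uint32_t M, uint32_t Bound) {
  return (M == 0 || M > Bound) ? Bound : M - 1; // dec-with-wrap
}

int main() {
  const uint32_t AlwaysWrap = 0xffffffffu; // MOV_IMM_I32 -1
  printf("%u\n", atomicIncUint(41, AlwaysWrap));         // 42 == 41 + 1
  printf("%u\n", atomicDecUint(42, AlwaysWrap));         // 41 == 42 - 1
  printf("%u\n", atomicIncUint(AlwaysWrap, AlwaysWrap)); // 0, i.e. wrap
}
```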
+// The CMPSWAP pattern is special
+// EXTRACT_SUBREG here is dummy, we know the node has no uses
+// FIXME: Add _RTN version. We need per WI scratch location to store the old value
+def : Pat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$data)),
+ (EXTRACT_SUBREG (RAT_ATOMIC_CMPXCHG_INT_NORET
+ (INSERT_SUBREG
+ (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), $cmp, sub3),
+ $data, sub0),
+ $ptr), sub1)>;
+
+defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN,
+ RAT_ATOMIC_XCHG_INT_NORET,
+ atomic_swap_global_ret,
+ atomic_swap_global_noret>;
+defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET,
+ atomic_add_global_ret, atomic_add_global_noret>;
+defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET,
+ atomic_sub_global_ret, atomic_sub_global_noret>;
+defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN,
+ RAT_ATOMIC_MIN_INT_NORET,
+ atomic_min_global_ret, atomic_min_global_noret>;
+defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN,
+ RAT_ATOMIC_MIN_UINT_NORET,
+ atomic_umin_global_ret, atomic_umin_global_noret>;
+defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN,
+ RAT_ATOMIC_MAX_INT_NORET,
+ atomic_max_global_ret, atomic_max_global_noret>;
+defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN,
+ RAT_ATOMIC_MAX_UINT_NORET,
+ atomic_umax_global_ret, atomic_umax_global_noret>;
+defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET,
+ atomic_and_global_ret, atomic_and_global_noret>;
+defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET,
+ atomic_or_global_ret, atomic_or_global_noret>;
+defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET,
+ atomic_xor_global_ret, atomic_xor_global_noret>;
+defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
+ RAT_ATOMIC_INC_UINT_NORET,
+ atomic_add_global_ret,
+ atomic_add_global_noret, 1>;
+defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
+ RAT_ATOMIC_INC_UINT_NORET,
+ atomic_sub_global_ret,
+ atomic_sub_global_noret, -1>;
+defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
+ RAT_ATOMIC_DEC_UINT_NORET,
+ atomic_add_global_ret,
+ atomic_add_global_noret, -1>;
+defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
+ RAT_ATOMIC_DEC_UINT_NORET,
+ atomic_sub_global_ret,
+ atomic_sub_global_noret, 1>;
+
// Should be predicated on FeatureFP64
// def FMA_64 : R600_3OP <
// 0xA, "FMA_64",
@@ -287,7 +388,7 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
VecALU
>;
-def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>;
+defm : BFEPattern <BFE_UINT_eg, BFE_INT_eg, MOV_IMM_I32>;
def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
[(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
@@ -337,7 +438,7 @@ defm CUBE_eg : CUBE_Common<0xC0>;
def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
-def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>;
+def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", AMDGPUfp_to_f16, VecALU>;
def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 849fb8a..edca6fc 100644
--- a/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -7,7 +7,8 @@
//
//===----------------------------------------------------------------------===//
-def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">;
+def FLATAtomic : ComplexPattern<i64, 3, "SelectFlatAtomic", [], [], -10>;
+def FLATOffset : ComplexPattern<i64, 3, "SelectFlat", [], [], -10>;
//===----------------------------------------------------------------------===//
// FLAT classes
@@ -30,8 +31,6 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
let VM_CNT = 1;
let LGKM_CNT = 1;
- let Uses = [EXEC, FLAT_SCR]; // M0
-
let UseNamedOperandTable = 1;
let hasSideEffects = 0;
let SchedRW = [WriteVMEM];
@@ -39,10 +38,16 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,
string Mnemonic = opName;
string AsmOperands = asmOps;
+ bits<1> is_flat_global = 0;
+ bits<1> is_flat_scratch = 0;
+
bits<1> has_vdst = 1;
bits<1> has_data = 1;
bits<1> has_glc = 1;
bits<1> glcValue = 0;
+
+ // TODO: M0 if it could possibly access LDS (before gfx9? only)?
+ let Uses = !if(is_flat_global, [EXEC], [EXEC, FLAT_SCR]);
}
class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
@@ -55,6 +60,8 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
// encoding fields
bits<8> vaddr;
@@ -62,9 +69,27 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
bits<8> vdst;
bits<1> slc;
bits<1> glc;
- bits<1> tfe;
- // 15-0 is reserved.
+ // Only valid on gfx9
+ bits<1> lds = 0; // XXX - What does this actually do?
+
+ // Segment, 00=flat, 01=scratch, 10=global, 11=reserved
+ bits<2> seg = !if(ps.is_flat_global, 0b10,
+ !if(ps.is_flat_scratch, 0b01, 0));
+
+  // Signed offset. The highest bit is ignored and the field is treated as
+  // 12-bit unsigned for flat accesses.
+ bits<13> offset;
+ bits<1> nv = 0; // XXX - What does this actually do?
+
+ // We don't use tfe right now, and it was removed in gfx9.
+ bits<1> tfe = 0;
+
+ // Only valid on GFX9+
+ let Inst{12-0} = offset;
+ let Inst{13} = lds;
+ let Inst{15-14} = seg;
+
let Inst{16} = !if(ps.has_glc, glc, ps.glcValue);
let Inst{17} = slc;
let Inst{24-18} = op;
@@ -72,41 +97,70 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
let Inst{39-32} = vaddr;
let Inst{47-40} = !if(ps.has_data, vdata, ?);
// 54-48 is reserved.
- let Inst{55} = tfe;
+ let Inst{55} = nv; // nv on GFX9+, TFE before.
let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
}
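The new GFX9 fields give FLAT encodings a 2-bit segment selector (00 flat, 01 scratch, 10 global) and a 13-bit offset that is signed for global/scratch but effectively 12-bit unsigned for plain flat accesses. A hypothetical decode sketch of that convention:

```cpp
#include <cstdint>
#include <cstdio>

enum class Seg { Flat = 0, Scratch = 1, Global = 2, Reserved = 3 };

static int decodeFlatOffset(uint32_t Inst, Seg &S) {
  S = Seg((Inst >> 14) & 0x3);     // Inst{15-14}
  uint32_t Off13 = Inst & 0x1fff;  // Inst{12-0}
  if (S == Seg::Flat)
    return int(Off13 & 0xfff);     // high bit ignored, unsigned
  // sign-extend the 13-bit field for global/scratch
  return int(Off13 << 19) >> 19;
}

int main() {
  Seg S;
  printf("%d\n", decodeFlatOffset((2u << 14) | 0x1ff8, S)); // global: -8
  printf("%d\n", decodeFlatOffset((0u << 14) | 0x1ff8, S)); // flat: 4088
}
```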
-class FLAT_Load_Pseudo <string opName, RegisterClass regClass> : FLAT_Pseudo<
+class FLAT_Load_Pseudo <string opName, RegisterClass regClass,
+ bit HasSignedOffset = 0> : FLAT_Pseudo<
opName,
(outs regClass:$vdst),
- (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr$glc$slc$tfe"> {
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, offset_s13:$offset, GLC:$glc, slc:$slc),
+ (ins VReg_64:$vaddr, offset_u12:$offset, GLC:$glc, slc:$slc)),
+ " $vdst, $vaddr$offset$glc$slc"> {
let has_data = 0;
let mayLoad = 1;
}
-class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass> : FLAT_Pseudo<
+class FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Load_Pseudo<opName, regClass, 1> {
+ let is_flat_global = 1;
+}
+
+class FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Load_Pseudo<opName, regClass, 1> {
+ let is_flat_scratch = 1;
+}
+
+class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
+ bit HasSignedOffset = 0> : FLAT_Pseudo<
opName,
(outs),
- (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$glc$slc$tfe"> {
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, vdataClass:$vdata, offset_s13:$offset, GLC:$glc, slc:$slc),
+ (ins VReg_64:$vaddr, vdataClass:$vdata, offset_u12:$offset, GLC:$glc, slc:$slc)),
+ " $vaddr, $vdata$offset$glc$slc"> {
let mayLoad = 0;
let mayStore = 1;
let has_vdst = 0;
}
+class FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Store_Pseudo<opName, regClass, 1> {
+ let is_flat_global = 1;
+}
+
+class FLAT_Scratch_Store_Pseudo<string opName, RegisterClass regClass> :
+ FLAT_Store_Pseudo<opName, regClass, 1> {
+ let is_flat_scratch = 1;
+}
+
multiclass FLAT_Atomic_Pseudo<
string opName,
RegisterClass vdst_rc,
ValueType vt,
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> {
+ RegisterClass data_rc = vdst_rc,
+ bit HasSignedOffset = 0> {
def "" : FLAT_Pseudo <opName,
(outs),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vaddr, $vdata$slc$tfe",
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)),
+ " $vaddr, $vdata$offset$slc",
[]>,
AtomicNoRet <NAME, 0> {
let mayLoad = 1;
@@ -119,10 +173,12 @@ multiclass FLAT_Atomic_Pseudo<
def _RTN : FLAT_Pseudo <opName,
(outs vdst_rc:$vdst),
- (ins VReg_64:$vaddr, data_rc:$vdata, slc:$slc, tfe:$tfe),
- " $vdst, $vaddr, $vdata glc$slc$tfe",
+ !if(HasSignedOffset,
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, slc:$slc),
+ (ins VReg_64:$vaddr, data_rc:$vdata, offset_u12:$offset, slc:$slc)),
+ " $vdst, $vaddr, $vdata$offset glc$slc",
[(set vt:$vdst,
- (atomic (FLATAtomic i64:$vaddr, i1:$slc, i1:$tfe), data_vt:$vdata))]>,
+ (atomic (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
AtomicNoRet <NAME, 1> {
let mayLoad = 1;
let mayStore = 1;
@@ -136,7 +192,7 @@ multiclass FLAT_Atomic_Pseudo<
class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+ [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}]
>;
def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
@@ -277,6 +333,26 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isCI
+let SubtargetPredicate = HasFlatGlobalInsts in {
+def GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
+def GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
+def GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
+def GLOBAL_LOAD_SSHORT : FLAT_Global_Load_Pseudo <"global_load_sshort", VGPR_32>;
+def GLOBAL_LOAD_DWORD : FLAT_Global_Load_Pseudo <"global_load_dword", VGPR_32>;
+def GLOBAL_LOAD_DWORDX2 : FLAT_Global_Load_Pseudo <"global_load_dwordx2", VReg_64>;
+def GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>;
+def GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>;
+
+def GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
+def GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
+def GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR_32>;
+def GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
+def GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
+def GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+
+} // End SubtargetPredicate = HasFlatGlobalInsts
+
//===----------------------------------------------------------------------===//
// Flat Patterns
//===----------------------------------------------------------------------===//
@@ -284,16 +360,16 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
(ld node:$ptr), [{
auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+ return AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS;
}]>;
class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
(st node:$val, node:$ptr), [{
auto const AS = cast<MemSDNode>(N)->getAddressSpace();
- return AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::GLOBAL_ADDRESS;
+ return AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS;
}]>;
def atomic_flat_load : flat_ld <atomic_load>;
@@ -310,31 +386,31 @@ def flat_truncstorei16 : flat_st <truncstorei16>;
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 0, 0, 0)
+ (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 0, $slc)
>;
class FlatLoadAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (vt (node i64:$addr)),
- (inst $addr, 1, 0, 0)
+ (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))),
+ (inst $vaddr, $offset, 1, $slc)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
- (node vt:$data, i64:$addr),
- (inst $addr, $data, 0, 0, 0)
+ (node vt:$data, (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc)),
+ (inst $vaddr, $data, $offset, 0, $slc)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : Pat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node i64:$addr, vt:$data),
- (inst $addr, $data, 1, 0, 0)
+ (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
+ (inst $vaddr, $data, $offset, 1, $slc)
>;
class FlatAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : Pat <
- (vt (node i64:$addr, data_vt:$data)),
- (inst $addr, $data, 0, 0)
+ (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$data)),
+ (inst $vaddr, $data, $offset, $slc)
>;
let Predicates = [isCIVI] in {
@@ -528,3 +604,18 @@ defm FLAT_ATOMIC_XOR_X2 : FLAT_Real_Atomics_vi <0x6a, FLAT_ATOMIC_XOR_X2>;
defm FLAT_ATOMIC_INC_X2 : FLAT_Real_Atomics_vi <0x6b, FLAT_ATOMIC_INC_X2>;
defm FLAT_ATOMIC_DEC_X2 : FLAT_Real_Atomics_vi <0x6c, FLAT_ATOMIC_DEC_X2>;
+def GLOBAL_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, GLOBAL_LOAD_UBYTE>;
+def GLOBAL_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, GLOBAL_LOAD_SBYTE>;
+def GLOBAL_LOAD_USHORT_vi : FLAT_Real_vi <0x12, GLOBAL_LOAD_USHORT>;
+def GLOBAL_LOAD_SSHORT_vi : FLAT_Real_vi <0x13, GLOBAL_LOAD_SSHORT>;
+def GLOBAL_LOAD_DWORD_vi : FLAT_Real_vi <0x14, GLOBAL_LOAD_DWORD>;
+def GLOBAL_LOAD_DWORDX2_vi : FLAT_Real_vi <0x15, GLOBAL_LOAD_DWORDX2>;
+def GLOBAL_LOAD_DWORDX4_vi : FLAT_Real_vi <0x17, GLOBAL_LOAD_DWORDX4>;
+def GLOBAL_LOAD_DWORDX3_vi : FLAT_Real_vi <0x16, GLOBAL_LOAD_DWORDX3>;
+
+def GLOBAL_STORE_BYTE_vi : FLAT_Real_vi <0x18, GLOBAL_STORE_BYTE>;
+def GLOBAL_STORE_SHORT_vi : FLAT_Real_vi <0x1a, GLOBAL_STORE_SHORT>;
+def GLOBAL_STORE_DWORD_vi : FLAT_Real_vi <0x1c, GLOBAL_STORE_DWORD>;
+def GLOBAL_STORE_DWORDX2_vi : FLAT_Real_vi <0x1d, GLOBAL_STORE_DWORDX2>;
+def GLOBAL_STORE_DWORDX4_vi : FLAT_Real_vi <0x1f, GLOBAL_STORE_DWORDX4>;
+def GLOBAL_STORE_DWORDX3_vi : FLAT_Real_vi <0x1e, GLOBAL_STORE_DWORDX3>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index dd3b46f..cd9e7fb 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -13,9 +13,22 @@
#include "GCNHazardRecognizer.h"
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <set>
+#include <vector>
using namespace llvm;
@@ -26,7 +39,8 @@ using namespace llvm;
GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
CurrCycleInstr(nullptr),
MF(MF),
- ST(MF.getSubtarget<SISubtarget>()) {
+ ST(MF.getSubtarget<SISubtarget>()),
+ TII(*ST.getInstrInfo()) {
MaxLookAhead = 5;
}
@@ -58,8 +72,19 @@ static bool isRFE(unsigned Opcode) {
return Opcode == AMDGPU::S_RFE_B64;
}
-static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
+static bool isSMovRel(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::S_MOVRELS_B32:
+ case AMDGPU::S_MOVRELS_B64:
+ case AMDGPU::S_MOVRELD_B32:
+ case AMDGPU::S_MOVRELD_B64:
+ return true;
+ default:
+ return false;
+ }
+}
+static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
@@ -96,6 +121,13 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
return NoopHazard;
+ if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+ checkReadM0Hazards(MI) > 0)
+ return NoopHazard;
+
+ if (checkAnyInstHazards(MI) > 0)
+ return NoopHazard;
+
return NoHazard;
}
@@ -104,11 +136,13 @@ unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
}
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+ int WaitStates = std::max(0, checkAnyInstHazards(MI));
+
if (SIInstrInfo::isSMRD(*MI))
- return std::max(0, checkSMRDHazards(MI));
+ return std::max(WaitStates, checkSMRDHazards(MI));
if (SIInstrInfo::isVALU(*MI)) {
- int WaitStates = std::max(0, checkVALUHazards(MI));
+ WaitStates = std::max(WaitStates, checkVALUHazards(MI));
if (SIInstrInfo::isVMEM(*MI))
WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
@@ -122,19 +156,25 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
if (isRWLane(MI->getOpcode()))
WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
+ if (TII.isVINTRP(*MI))
+ WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));
+
return WaitStates;
}
if (isSGetReg(MI->getOpcode()))
- return std::max(0, checkGetRegHazards(MI));
+ return std::max(WaitStates, checkGetRegHazards(MI));
if (isSSetReg(MI->getOpcode()))
- return std::max(0, checkSetRegHazards(MI));
+ return std::max(WaitStates, checkSetRegHazards(MI));
if (isRFE(MI->getOpcode()))
- return std::max(0, checkRFEHazards(MI));
+ return std::max(WaitStates, checkRFEHazards(MI));
- return 0;
+ if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))
+ return std::max(WaitStates, checkReadM0Hazards(MI));
+
+ return WaitStates;
}
void GCNHazardRecognizer::EmitNoop() {
@@ -142,14 +182,12 @@ void GCNHazardRecognizer::EmitNoop() {
}
void GCNHazardRecognizer::AdvanceCycle() {
-
// When the scheduler detects a stall, it will call AdvanceCycle() without
// emitting any instructions.
if (!CurrCycleInstr)
return;
- const SIInstrInfo *TII = ST.getInstrInfo();
- unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+ unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
// Keep track of emitted instructions
EmittedInstrs.push_front(CurrCycleInstr);
@@ -180,7 +218,6 @@ void GCNHazardRecognizer::RecedeCycle() {
int GCNHazardRecognizer::getWaitStatesSince(
function_ref<bool(MachineInstr *)> IsHazard) {
-
int WaitStates = -1;
for (MachineInstr *MI : EmittedInstrs) {
++WaitStates;
@@ -204,7 +241,6 @@ int GCNHazardRecognizer::getWaitStatesSinceDef(
int GCNHazardRecognizer::getWaitStatesSinceSetReg(
function_ref<bool(MachineInstr *)> IsHazard) {
-
auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
return isSSetReg(MI->getOpcode()) && IsHazard(MI);
};
@@ -281,7 +317,6 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
int WaitStatesNeeded = 0;
WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
@@ -293,7 +328,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
// A read of an SGPR by SMRD instruction requires 4 wait states when the
// SGPR was written by a VALU instruction.
int SmrdSgprWaitStates = 4;
- auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+ auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
for (const MachineOperand &Use : SMRD->uses()) {
if (!Use.isReg())
@@ -486,7 +521,6 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
}
int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
-
if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 0;
@@ -500,3 +534,42 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
return RFEWaitStates - WaitStatesNeeded;
}
+
+int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
+ if (MI->isDebugValue())
+ return 0;
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (!ST.hasSMovFedHazard())
+ return 0;
+
+ // Check for any instruction reading an SGPR after a write from
+ // s_mov_fed_b32.
+ int MovFedWaitStates = 1;
+ int WaitStatesNeeded = 0;
+
+ for (const MachineOperand &Use : MI->uses()) {
+ if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
+ continue;
+ auto IsHazardFn = [] (MachineInstr *MI) {
+ return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
+ };
+ int WaitStatesNeededForUse =
+ MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn);
+ WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
+ }
+
+ return WaitStatesNeeded;
+}
+
+int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
+ if (!ST.hasReadM0Hazard())
+ return 0;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ int SMovRelWaitStates = 1;
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return TII->isSALU(*MI);
+ };
+ return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+}
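checkReadM0Hazards follows the recognizer's usual shape: count the instructions emitted since the hazardous M0 def and pad with noops for whatever remains of the required window. A toy model of that bookkeeping; names and the one-state window are illustrative:

```cpp
#include <algorithm>
#include <cstdio>
#include <list>

struct Inst { bool DefsM0; };

static int waitStatesSinceM0Def(const std::list<Inst> &Emitted) {
  int WaitStates = -1;
  for (const Inst &I : Emitted) { // newest first, like EmittedInstrs
    ++WaitStates;
    if (I.DefsM0)
      return WaitStates;
  }
  return 1 << 30; // far enough away: no hazard
}

int main() {
  // An s_movrel right after an M0 write needs one intervening wait state.
  std::list<Inst> Emitted = {{/*DefsM0=*/true}};
  int SMovRelWaitStates = 1;
  int Noops = std::max(0, SMovRelWaitStates - waitStatesSinceM0Def(Emitted));
  printf("noops needed: %d\n", Noops); // 1
}
```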
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 0ab82ff..5680c3d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -34,6 +34,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
std::list<MachineInstr*> EmittedInstrs;
const MachineFunction &MF;
const SISubtarget &ST;
+ const SIInstrInfo &TII;
int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
int getWaitStatesSinceDef(unsigned Reg,
@@ -52,6 +53,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
int checkVALUHazards(MachineInstr *VALU);
int checkRWLaneHazards(MachineInstr *RWLane);
int checkRFEHazards(MachineInstr *RFE);
+ int checkAnyInstHazards(MachineInstr *MI);
+ int checkReadM0Hazards(MachineInstr *SMovRel);
public:
GCNHazardRecognizer(const MachineFunction &MF);
// We can only issue one instruction per cycle.
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
new file mode 100644
index 0000000..2e7641c
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -0,0 +1,530 @@
+//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNIterativeScheduler.h"
+#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace llvm {
+ std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+}
+
+// shim accessors for different order containers
+static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
+ return MI;
+}
+static inline MachineInstr *getMachineInstr(const SUnit *SU) {
+ return SU->getInstr();
+}
+static inline MachineInstr *getMachineInstr(const SUnit &SU) {
+ return SU.getInstr();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void printRegion(raw_ostream &OS,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ const LiveIntervals *LIS,
+ unsigned MaxInstNum =
+ std::numeric_limits<unsigned>::max()) {
+ auto BB = Begin->getParent();
+ OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
+ << ' ' << BB->getName() << ":\n";
+ auto I = Begin;
+ MaxInstNum = std::max(MaxInstNum, 1u);
+ for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
+ if (!I->isDebugValue() && LIS)
+ OS << LIS->getInstructionIndex(*I);
+ OS << '\t' << *I;
+ }
+ if (I != End) {
+ OS << "\t...\n";
+ I = std::prev(End);
+ if (!I->isDebugValue() && LIS)
+ OS << LIS->getInstructionIndex(*I);
+ OS << '\t' << *I;
+ }
+ if (End != BB->end()) { // print boundary inst if present
+ OS << "----\n";
+ if (LIS) OS << LIS->getInstructionIndex(*End) << '\t';
+ OS << *End;
+ }
+}
+
+LLVM_DUMP_METHOD
+static void printLivenessInfo(raw_ostream &OS,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ const LiveIntervals *LIS) {
+ const auto BB = Begin->getParent();
+ const auto &MRI = BB->getParent()->getRegInfo();
+
+ const auto LiveIns = getLiveRegsBefore(*Begin, *LIS);
+ OS << "LIn RP: ";
+ getRegPressure(MRI, LiveIns).print(OS);
+
+ const auto BottomMI = End == BB->end() ? std::prev(End) : End;
+ const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS);
+ OS << "LOt RP: ";
+ getRegPressure(MRI, LiveOuts).print(OS);
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ for (const auto R : Regions) {
+ OS << "Region to schedule ";
+ printRegion(OS, R->Begin, R->End, LIS, 1);
+ printLivenessInfo(OS, R->Begin, R->End, LIS);
+ OS << "Max RP: ";
+ R->MaxPressure.print(OS, &ST);
+ }
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedResult(raw_ostream &OS,
+ const Region *R,
+ const GCNRegPressure &RP) const {
+ OS << "\nAfter scheduling ";
+ printRegion(OS, R->Begin, R->End, LIS);
+ printSchedRP(OS, R->MaxPressure, RP);
+ OS << '\n';
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
+ const GCNRegPressure &Before,
+ const GCNRegPressure &After) const {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ OS << "RP before: ";
+ Before.print(OS, &ST);
+ OS << "RP after: ";
+ After.print(OS, &ST);
+}
+
+#endif
+
+// DAG builder helper
+class GCNIterativeScheduler::BuildDAG {
+ GCNIterativeScheduler &Sch;
+ SmallVector<SUnit*, 8> TopRoots;
+public:
+ BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
+ : Sch(_Sch) {
+ auto BB = R.Begin->getParent();
+ Sch.BaseClass::startBlock(BB);
+ Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+
+ Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
+ /*TrackLaneMask*/true);
+ Sch.Topo.InitDAGTopologicalSorting();
+
+ SmallVector<SUnit*, 8> BotRoots;
+ Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
+ }
+ ~BuildDAG() {
+ Sch.BaseClass::exitRegion();
+ Sch.BaseClass::finishBlock();
+ }
+ ArrayRef<const SUnit*> getTopRoots() const {
+ return TopRoots;
+ }
+};
+
+class GCNIterativeScheduler::OverrideLegacyStrategy {
+ GCNIterativeScheduler &Sch;
+ Region &Rgn;
+ std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
+ GCNRegPressure SaveMaxRP;
+public:
+ OverrideLegacyStrategy(Region &R,
+ MachineSchedStrategy &OverrideStrategy,
+ GCNIterativeScheduler &_Sch)
+ : Sch(_Sch)
+ , Rgn(R)
+ , SaveSchedImpl(std::move(_Sch.SchedImpl))
+ , SaveMaxRP(R.MaxPressure) {
+ Sch.SchedImpl.reset(&OverrideStrategy);
+ auto BB = R.Begin->getParent();
+ Sch.BaseClass::startBlock(BB);
+ Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+ }
+ ~OverrideLegacyStrategy() {
+ Sch.BaseClass::exitRegion();
+ Sch.BaseClass::finishBlock();
+ Sch.SchedImpl.release();
+ Sch.SchedImpl = std::move(SaveSchedImpl);
+ }
+ void schedule() {
+ assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+ DEBUG(dbgs() << "\nScheduling ";
+ printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+ Sch.BaseClass::schedule();
+
+ // Unfortunately placeDebugValues incorrectly modifies RegionEnd; restore it.
+ Sch.RegionEnd = Rgn.End;
+ //assert(Rgn.End == Sch.RegionEnd);
+ Rgn.Begin = Sch.RegionBegin;
+ Rgn.MaxPressure.clear();
+ }
+ void restoreOrder() {
+ assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+ // DAG SUnits are stored in the original region's order,
+ // so just use SUnits as the restoring schedule
+ Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP);
+ }
+};
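+
+// Illustrative use of OverrideLegacyStrategy (mirroring
+// scheduleLegacyMaxOccupancy() below); it temporarily swaps in another
+// strategy and can roll the region back to its pre-scheduling order:
+//
+//   OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
+//   Ovr.schedule();       // run the overriding strategy on the region
+//   Ovr.restoreOrder();   // or roll back if the result got worse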
+
+namespace {
+// just a stub to make base class happy
+class SchedStrategyStub : public MachineSchedStrategy {
+public:
+ bool shouldTrackPressure() const override { return false; }
+ bool shouldTrackLaneMasks() const override { return false; }
+ void initialize(ScheduleDAGMI *DAG) override {}
+ SUnit *pickNode(bool &IsTopNode) override { return nullptr; }
+ void schedNode(SUnit *SU, bool IsTopNode) override {}
+ void releaseTopNode(SUnit *SU) override {}
+ void releaseBottomNode(SUnit *SU) override {}
+};
+} // namespace
+
+GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
+ StrategyKind S)
+ : BaseClass(C, llvm::make_unique<SchedStrategyStub>())
+ , Context(C)
+ , Strategy(S)
+ , UPTracker(*LIS) {
+}
+
+// returns max pressure for a region
+GCNRegPressure
+GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End)
+ const {
+ // For the purpose of pressure tracking, the bottom instruction of the
+ // region should also be processed. End is either the BB end, the BB
+ // terminator instruction or a scheduling boundary instruction.
+ auto const BBEnd = Begin->getParent()->end();
+ auto const BottomMI = End == BBEnd ? std::prev(End) : End;
+
+ // scheduleRegions walks bottom to top, so it's likely we just get the
+ // next instruction to track
+ auto AfterBottomMI = std::next(BottomMI);
+ if (AfterBottomMI == BBEnd ||
+ &*AfterBottomMI != UPTracker.getLastTrackedMI()) {
+ UPTracker.reset(*BottomMI);
+ } else {
+ assert(UPTracker.isValid());
+ }
+
+ for (auto I = BottomMI; I != Begin; --I)
+ UPTracker.recede(*I);
+
+ UPTracker.recede(*Begin);
+
+ assert(UPTracker.isValid() ||
+ (dbgs() << "Tracked region ",
+ printRegion(dbgs(), Begin, End, LIS), false));
+ return UPTracker.moveMaxPressure();
+}
+
+// returns max pressure for a tentative schedule
+template <typename Range> GCNRegPressure
+GCNIterativeScheduler::getSchedulePressure(const Region &R,
+ Range &&Schedule) const {
+ auto const BBEnd = R.Begin->getParent()->end();
+ GCNUpwardRPTracker RPTracker(*LIS);
+ if (R.End != BBEnd) {
+ // R.End points to the boundary instruction but the
+ // schedule doesn't include it
+ RPTracker.reset(*R.End);
+ RPTracker.recede(*R.End);
+ } else {
+ // R.End doesn't point to the boundary instruction
+ RPTracker.reset(*std::prev(BBEnd));
+ }
+ for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) {
+ RPTracker.recede(*getMachineInstr(*--I));
+ }
+ return RPTracker.moveMaxPressure();
+}
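+
+// Note that getSchedulePressure() walks the tentative schedule from its last
+// instruction up to its first, receding over each one; the getMachineInstr()
+// overloads above make it work for SUnit, SUnit* and MachineInstr* based
+// schedules alike.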
+
+void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs);
+ if (NumRegionInstrs > 2) {
+ Regions.push_back(
+ new (Alloc.Allocate())
+ Region { Begin, End, NumRegionInstrs,
+ getRegionPressure(Begin, End), nullptr });
+ }
+}
+
+void GCNIterativeScheduler::schedule() { // overridden
+ // do nothing
+ DEBUG(
+ printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+ if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
+ dbgs() << "Max RP: ";
+ Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
+ }
+ dbgs() << '\n';
+ );
+}
+
+void GCNIterativeScheduler::finalizeSchedule() { // overridden
+ if (Regions.empty())
+ return;
+ switch (Strategy) {
+ case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
+ case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
+ case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+ }
+}
+
+// Detach schedule from SUnits and interleave it with debug values.
+// Returned schedule becomes independent of DAG state.
+std::vector<MachineInstr*>
+GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
+ std::vector<MachineInstr*> Res;
+ Res.reserve(Schedule.size() * 2);
+
+ if (FirstDbgValue)
+ Res.push_back(FirstDbgValue);
+
+ const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end();
+ for (auto SU : Schedule) {
+ Res.push_back(SU->getInstr());
+ const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) {
+ return P.second == SU->getInstr();
+ });
+ if (D != DbgE)
+ Res.push_back(D->first);
+ }
+ return Res;
+}
+
+void GCNIterativeScheduler::setBestSchedule(Region &R,
+ ScheduleRef Schedule,
+ const GCNRegPressure &MaxRP) {
+ R.BestSchedule.reset(
+ new TentativeSchedule{ detachSchedule(Schedule), MaxRP });
+}
+
+void GCNIterativeScheduler::scheduleBest(Region &R) {
+ assert(R.BestSchedule.get() && "No schedule specified");
+ scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure);
+ R.BestSchedule.reset();
+}
+
+// Minimal required region scheduler; works for ranges of SUnit*,
+// SUnits or MachineInstr*
+template <typename Range>
+void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
+ const GCNRegPressure &MaxRP) {
+ assert(RegionBegin == R.Begin && RegionEnd == R.End);
+ assert(LIS != nullptr);
+#ifndef NDEBUG
+ const auto SchedMaxRP = getSchedulePressure(R, Schedule);
+#endif
+ auto BB = R.Begin->getParent();
+ auto Top = R.Begin;
+ for (const auto &I : Schedule) {
+ auto MI = getMachineInstr(I);
+ if (MI != &*Top) {
+ BB->remove(MI);
+ BB->insert(Top, MI);
+ if (!MI->isDebugValue())
+ LIS->handleMove(*MI, true);
+ }
+ if (!MI->isDebugValue()) {
+ // Reset read-undef flags and update them later.
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef())
+ Op.setIsUndef(false);
+
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
+ /*IgnoreDead*/false);
+ // Adjust liveness and add missing dead+read-undef flags.
+ auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ }
+ Top = std::next(MI->getIterator());
+ }
+ RegionBegin = getMachineInstr(Schedule.front());
+
+ // Schedule consisting of MachineInstr* is considered 'detached'
+ // and already interleaved with debug values
+ if (!std::is_same<typename std::remove_reference<
+ decltype(*Schedule.begin())>::type, MachineInstr*>::value) {
+ placeDebugValues();
+ // Unfortunately placeDebugValues incorrectly modifies RegionEnd; restore it.
+ //assert(R.End == RegionEnd);
+ RegionEnd = R.End;
+ }
+
+ R.Begin = RegionBegin;
+ R.MaxPressure = MaxRP;
+
+#ifndef NDEBUG
+ const auto RegionMaxRP = getRegionPressure(R);
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+#endif
+ assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
+ || (dbgs() << "Max RP mismatch!!!\n"
+ "RP for schedule (calculated): ",
+ SchedMaxRP.print(dbgs(), &ST),
+ dbgs() << "RP for schedule (reported): ",
+ MaxRP.print(dbgs(), &ST),
+ dbgs() << "RP after scheduling: ",
+ RegionMaxRP.print(dbgs(), &ST),
+ false));
+}
+
+// Sort recorded regions by pressure - highest at the front
+void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ std::sort(Regions.begin(), Regions.end(),
+ [&ST, TargetOcc](const Region *R1, const Region *R2) {
+ return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
+ });
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Legacy MaxOccupancy Strategy
+
+// Tries to increase occupancy by applying the minreg scheduler to a
+// sequence of the most demanding regions. Obtained schedules are saved as
+// the BestSchedule for a region.
+// TargetOcc is the best achievable occupancy for a kernel.
+// Returns the better occupancy on success or the current occupancy on
+// failure. BestSchedules aren't deleted on failure.
+unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
+ // TODO: assert Regions are sorted descending by pressure
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+ DEBUG(dbgs() << "Trying to to improve occupancy, target = " << TargetOcc
+ << ", current = " << Occ << '\n');
+
+ auto NewOcc = TargetOcc;
+ for (auto R : Regions) {
+ if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
+ break;
+
+ DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+ printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+
+ BuildDAG DAG(*R, *this);
+ const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+ const auto MaxRP = getSchedulePressure(*R, MinSchedule);
+ DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+ printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+
+ NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
+ if (NewOcc <= Occ)
+ break;
+
+ setBestSchedule(*R, MinSchedule, MaxRP);
+ }
+ DEBUG(dbgs() << "New occupancy = " << NewOcc
+ << ", prev occupancy = " << Occ << '\n');
+ return std::max(NewOcc, Occ);
+}
+
+void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
+ bool TryMaximizeOccupancy) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+
+ sortRegionsByPressure(TgtOcc);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+
+ if (TryMaximizeOccupancy && Occ < TgtOcc)
+ Occ = tryMaximizeOccupancy(TgtOcc);
+
+ // This is really weird, but for some magic reason scheduling regions
+ // twice gives a performance improvement
+ const int NumPasses = Occ < TgtOcc ? 2 : 1;
+
+ TgtOcc = std::min(Occ, TgtOcc);
+ DEBUG(dbgs() << "Scheduling using default scheduler, "
+ "target occupancy = " << TgtOcc << '\n');
+ GCNMaxOccupancySchedStrategy LStrgy(Context);
+
+ for (int I = 0; I < NumPasses; ++I) {
+ // Running the first pass with TargetOccupancy = 0 mimics the previous
+ // scheduling approach and is performance magic
+ LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
+ for (auto R : Regions) {
+ OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
+
+ Ovr.schedule();
+ const auto RP = getRegionPressure(*R);
+ DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+ if (RP.getOccupancy(ST) < TgtOcc) {
+ DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+ if (R->BestSchedule.get() &&
+ R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+ DEBUG(dbgs() << ", scheduling minimal register\n");
+ scheduleBest(*R);
+ } else {
+ DEBUG(dbgs() << ", restoring\n");
+ Ovr.restoreOrder();
+ assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
+ }
+ }
+ }
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Minimal Register Strategy
+
+void GCNIterativeScheduler::scheduleMinReg(bool force) {
+ const auto &ST = MF.getSubtarget<SISubtarget>();
+ const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+ sortRegionsByPressure(TgtOcc);
+
+ auto MaxPressure = Regions.front()->MaxPressure;
+ for (auto R : Regions) {
+ if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+ break;
+
+ BuildDAG DAG(*R, *this);
+ const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+
+ const auto RP = getSchedulePressure(*R, MinSchedule);
+ DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+ dbgs() << "\nWarning: Pressure becomes worse after minreg!";
+ printSchedRP(dbgs(), R->MaxPressure, RP);
+ });
+
+ if (!force && MaxPressure.less(ST, RP, TgtOcc))
+ break;
+
+ scheduleRegion(*R, MinSchedule, RP);
+ DEBUG(printSchedResult(dbgs(), R, RP));
+
+ MaxPressure = RP;
+ }
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
new file mode 100644
index 0000000..df3afce
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -0,0 +1,118 @@
+//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+
+#include "GCNRegPressure.h"
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class GCNIterativeScheduler : public ScheduleDAGMILive {
+ typedef ScheduleDAGMILive BaseClass;
+public:
+ enum StrategyKind {
+ SCHEDULE_MINREGONLY,
+ SCHEDULE_MINREGFORCED,
+ SCHEDULE_LEGACYMAXOCCUPANCY
+ };
+
+ GCNIterativeScheduler(MachineSchedContext *C,
+ StrategyKind S);
+
+ void schedule() override;
+
+ void enterRegion(MachineBasicBlock *BB,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned RegionInstrs) override;
+
+ void finalizeSchedule() override;
+
+protected:
+
+ typedef ArrayRef<const SUnit*> ScheduleRef;
+
+ struct TentativeSchedule {
+ std::vector<MachineInstr*> Schedule;
+ GCNRegPressure MaxPressure;
+ };
+
+ struct Region {
+ // Fields except for BestSchedule are supposed to reflect the current IR
+ // state. `const` fields are to emphasize that they shouldn't change for
+ // any schedule.
+ MachineBasicBlock::iterator Begin;
+ // End is either a boundary instruction or end of basic block
+ const MachineBasicBlock::iterator End;
+ const unsigned NumRegionInstrs;
+ GCNRegPressure MaxPressure;
+
+ // Best schedule found for the region so far (not yet applied).
+ std::unique_ptr<TentativeSchedule> BestSchedule;
+ };
+
+ SpecificBumpPtrAllocator<Region> Alloc;
+ std::vector<Region*> Regions;
+
+ MachineSchedContext *Context;
+ const StrategyKind Strategy;
+ mutable GCNUpwardRPTracker UPTracker;
+
+ class BuildDAG;
+ class OverrideLegacyStrategy;
+
+ template <typename Range>
+ GCNRegPressure getSchedulePressure(const Region &R,
+ Range &&Schedule) const;
+
+ GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) const;
+
+ GCNRegPressure getRegionPressure(const Region &R) const {
+ return getRegionPressure(R.Begin, R.End);
+ }
+
+ void setBestSchedule(Region &R,
+ ScheduleRef Schedule,
+ const GCNRegPressure &MaxRP = GCNRegPressure());
+
+ void scheduleBest(Region &R);
+
+ std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const;
+
+ void sortRegionsByPressure(unsigned TargetOcc);
+
+ template <typename Range>
+ void scheduleRegion(Region &R, Range &&Schedule,
+ const GCNRegPressure &MaxRP = GCNRegPressure());
+
+ unsigned tryMaximizeOccupancy(unsigned TargetOcc =
+ std::numeric_limits<unsigned>::max());
+
+ void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
+ void scheduleMinReg(bool force = false);
+
+ void printRegions(raw_ostream &OS) const;
+ void printSchedResult(raw_ostream &OS,
+ const Region *R,
+ const GCNRegPressure &RP) const;
+ void printSchedRP(raw_ostream &OS,
+ const GCNRegPressure &Before,
+ const GCNRegPressure &After) const;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
new file mode 100644
index 0000000..0657f67
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -0,0 +1,268 @@
+//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+class GCNMinRegScheduler {
+ struct Candidate : ilist_node<Candidate> {
+ const SUnit *SU;
+ int Priority;
+
+ Candidate(const SUnit *SU_, int Priority_ = 0)
+ : SU(SU_), Priority(Priority_) {}
+ };
+
+ SpecificBumpPtrAllocator<Candidate> Alloc;
+ typedef simple_ilist<Candidate> Queue;
+ Queue RQ; // Ready queue
+
+ std::vector<unsigned> NumPreds;
+
+ bool isScheduled(const SUnit *SU) const {
+ assert(!SU->isBoundaryNode());
+ return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max();
+ }
+
+ void setIsScheduled(const SUnit *SU) {
+ assert(!SU->isBoundaryNode());
+ NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max();
+ }
+
+ unsigned getNumPreds(const SUnit *SU) const {
+ assert(!SU->isBoundaryNode());
+ assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+ return NumPreds[SU->NodeNum];
+ }
+
+ unsigned decNumPreds(const SUnit *SU) {
+ assert(!SU->isBoundaryNode());
+ assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+ return --NumPreds[SU->NodeNum];
+ }
+
+ void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits);
+
+ int getReadySuccessors(const SUnit *SU) const;
+ int getNotReadySuccessors(const SUnit *SU) const;
+
+ template <typename Calc>
+ unsigned findMax(unsigned Num, Calc C);
+
+ Candidate* pickCandidate();
+
+ void bumpPredsPriority(const SUnit *SchedSU, int Priority);
+ void releaseSuccessors(const SUnit* SU, int Priority);
+
+public:
+ std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG);
+};
+} // namespace
+
+void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
+ NumPreds.resize(SUnits.size());
+ for (unsigned I = 0; I < SUnits.size(); ++I)
+ NumPreds[I] = SUnits[I].NumPredsLeft;
+}
+
+int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const {
+ unsigned NumSchedSuccs = 0;
+ for (auto SDep : SU->Succs) {
+ bool wouldBeScheduled = true;
+ for (auto PDep : SDep.getSUnit()->Preds) {
+ auto PSU = PDep.getSUnit();
+ assert(!PSU->isBoundaryNode());
+ if (PSU != SU && !isScheduled(PSU)) {
+ wouldBeScheduled = false;
+ break;
+ }
+ }
+ NumSchedSuccs += wouldBeScheduled ? 1 : 0;
+ }
+ return NumSchedSuccs;
+}
+
+int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
+ return SU->Succs.size() - getReadySuccessors(SU);
+}
+
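+// findMax scans the first Num candidates in the ready queue, computes the
+// metric C for each one and rotates every candidate reaching the current
+// maximum to the queue front. When it returns, the NumMax best candidates
+// occupy the front of the queue, ready for the next refinement pass in
+// pickCandidate().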
+template <typename Calc>
+unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
+ assert(!RQ.empty() && Num <= RQ.size());
+ typedef decltype(C(*RQ.begin())) T;
+ T Max = std::numeric_limits<T>::min();
+ unsigned NumMax = 0;
+ for (auto I = RQ.begin(); Num; --Num) {
+ T Cur = C(*I);
+ if (Cur >= Max) {
+ if (Cur > Max) {
+ Max = Cur;
+ NumMax = 1;
+ } else
+ ++NumMax;
+ auto &Cand = *I++;
+ RQ.remove(Cand);
+ RQ.push_front(Cand);
+ continue;
+ }
+ ++I;
+ }
+ return NumMax;
+}
+
+GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
+ do {
+ unsigned Num = RQ.size();
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) {
+ auto SU = C.SU;
+ int Res = getNotReadySuccessors(SU);
+ DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
+ << Res << " successors, metric = " << -Res << '\n');
+ return -Res;
+ });
+ if (Num == 1) break;
+
+ DEBUG(dbgs() << "\nSelecting most producing candidate among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) {
+ auto SU = C.SU;
+ auto Res = getReadySuccessors(SU);
+ DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
+ << Res << " successors, metric = " << Res << '\n');
+ return Res;
+ });
+ if (Num == 1) break;
+
+ Num = Num ? Num : RQ.size();
+ DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
+ << Num << '\n');
+ Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
+ assert(Num == 1);
+ } while (false);
+
+ return &RQ.front();
+}
+
+void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
+ SmallPtrSet<const SUnit*, 32> Set;
+ for (const auto &S : SchedSU->Succs) {
+ if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) ||
+ S.getKind() != SDep::Data)
+ continue;
+ for (const auto &P : S.getSUnit()->Preds) {
+ auto PSU = P.getSUnit();
+ assert(!PSU->isBoundaryNode());
+ if (PSU != SchedSU && !isScheduled(PSU)) {
+ Set.insert(PSU);
+ }
+ }
+ }
+ SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ auto SU = Worklist.pop_back_val();
+ assert(!SU->isBoundaryNode());
+ for (const auto &P : SU->Preds) {
+ if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) &&
+ Set.insert(P.getSUnit()).second)
+ Worklist.push_back(P.getSUnit());
+ }
+ }
+ DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
+ << ")'s non-ready successors of " << Priority
+ << " priority in ready queue: ");
+ const auto SetEnd = Set.end();
+ for (auto &C : RQ) {
+ if (Set.find(C.SU) != SetEnd) {
+ C.Priority = Priority;
+ DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+ }
+ }
+ DEBUG(dbgs() << '\n');
+}
+
+void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
+ for (const auto &S : SU->Succs) {
+ auto SuccSU = S.getSUnit();
+ if (S.isWeak())
+ continue;
+ assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0);
+ if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0)
+ RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority));
+ }
+}
+
+std::vector<const SUnit*>
+GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG) {
+ const auto &SUnits = DAG.SUnits;
+ std::vector<const SUnit*> Schedule;
+ Schedule.reserve(SUnits.size());
+
+ initNumPreds(SUnits);
+
+ int StepNo = 0;
+
+ for (auto SU : TopRoots) {
+ RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo));
+ }
+ releaseSuccessors(&DAG.EntrySU, StepNo);
+
+ while (!RQ.empty()) {
+ DEBUG(
+ dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
+ "Ready queue:";
+ for (auto &C : RQ)
+ dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+ dbgs() << '\n';
+ );
+
+ auto C = pickCandidate();
+ assert(C);
+ RQ.remove(*C);
+ auto SU = C->SU;
+ DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+ releaseSuccessors(SU, StepNo);
+ Schedule.push_back(SU);
+ setIsScheduled(SU);
+
+ if (getReadySuccessors(SU) == 0)
+ bumpPredsPriority(SU, StepNo);
+
+ ++StepNo;
+ }
+ assert(SUnits.size() == Schedule.size());
+
+ return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+ const ScheduleDAG &DAG) {
+ GCNMinRegScheduler S;
+ return S.schedule(TopRoots, DAG);
+}
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
new file mode 100644
index 0000000..1d02c7f
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -0,0 +1,492 @@
+//===------------------------- GCNRegPressure.cpp - -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNRegPressure.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void llvm::printLivesAt(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ dbgs() << "Live regs at " << SI << ": "
+ << *LIS.getInstructionFromIndex(SI);
+ unsigned Num = 0;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (!LIS.hasInterval(Reg))
+ continue;
+ const auto &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ bool firstTime = true;
+ for (const auto &S : LI.subranges()) {
+ if (!S.liveAt(SI)) continue;
+ if (firstTime) {
+ dbgs() << " " << PrintReg(Reg, MRI.getTargetRegisterInfo())
+ << '\n';
+ firstTime = false;
+ }
+ dbgs() << " " << S << '\n';
+ ++Num;
+ }
+ } else if (LI.liveAt(SI)) {
+ dbgs() << " " << LI << '\n';
+ ++Num;
+ }
+ }
+ if (!Num) dbgs() << " <none>\n";
+}
+
+static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
+ const GCNRPTracker::LiveRegSet &S2) {
+ if (S1.size() != S2.size())
+ return false;
+
+ for (const auto &P : S1) {
+ auto I = S2.find(P.first);
+ if (I == S2.end() || I->second != P.second)
+ return false;
+ }
+ return true;
+}
+
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRegPressure
+
+unsigned GCNRegPressure::getRegKind(unsigned Reg,
+ const MachineRegisterInfo &MRI) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ const auto RC = MRI.getRegClass(Reg);
+ auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+ return STI->isSGPRClass(RC) ?
+ (STI->getRegSizeInBits(*RC) == 32 ? SGPR32 : SGPR_TUPLE) :
+ (STI->getRegSizeInBits(*RC) == 32 ? VGPR32 : VGPR_TUPLE);
+}
+
+void GCNRegPressure::inc(unsigned Reg,
+ LaneBitmask PrevMask,
+ LaneBitmask NewMask,
+ const MachineRegisterInfo &MRI) {
+ if (NewMask == PrevMask)
+ return;
+
+ int Sign = 1;
+ if (NewMask < PrevMask) {
+ std::swap(NewMask, PrevMask);
+ Sign = -1;
+ }
+#ifndef NDEBUG
+ const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
+#endif
+ switch (auto Kind = getRegKind(Reg, MRI)) {
+ case SGPR32:
+ case VGPR32:
+ assert(PrevMask.none() && NewMask == MaxMask);
+ Value[Kind] += Sign;
+ break;
+
+ case SGPR_TUPLE:
+ case VGPR_TUPLE:
+ assert(NewMask < MaxMask || NewMask == MaxMask);
+ assert(PrevMask < NewMask);
+
+ Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
+ Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
+
+ if (PrevMask.none()) {
+ assert(NewMask.any());
+ Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
+ }
+ break;
+
+ default: llvm_unreachable("Unknown register kind");
+ }
+}
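+
+// A worked example (illustrative): for a 4-lane VGPR tuple whose live mask
+// grows from 0b0011 to 0b1111, inc() adds 2 (the newly covered lanes) to
+// VGPR32, while the tuple's pressure-set weight is added to VGPR_TUPLE only
+// on a transition from an empty mask, i.e. when the tuple becomes live.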
+
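+// less() returns true when this pressure is preferable to O: primarily when
+// it allows a higher occupancy, then, on occupancy ties, when its tuple
+// register weights and finally its plain SGPR/VGPR counts are smaller.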
+bool GCNRegPressure::less(const SISubtarget &ST,
+ const GCNRegPressure& O,
+ unsigned MaxOccupancy) const {
+ const auto SGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumSGPRs(getSGPRNum()));
+ const auto VGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ const auto OtherSGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
+ const auto OtherVGPROcc = std::min(MaxOccupancy,
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
+
+ const auto Occ = std::min(SGPROcc, VGPROcc);
+ const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+ if (Occ != OtherOcc)
+ return Occ > OtherOcc;
+
+ bool SGPRImportant = SGPROcc < VGPROcc;
+ const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
+
+ // If both pressures disagree on what is more important, compare VGPRs.
+ if (SGPRImportant != OtherSGPRImportant) {
+ SGPRImportant = false;
+ }
+
+ // Compare large (tuple) register pressure.
+ bool SGPRFirst = SGPRImportant;
+ for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
+ if (SGPRFirst) {
+ auto SW = getSGPRTuplesWeight();
+ auto OtherSW = O.getSGPRTuplesWeight();
+ if (SW != OtherSW)
+ return SW < OtherSW;
+ } else {
+ auto VW = getVGPRTuplesWeight();
+ auto OtherVW = O.getVGPRTuplesWeight();
+ if (VW != OtherVW)
+ return VW < OtherVW;
+ }
+ }
+ return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
+ (getVGPRNum() < O.getVGPRNum());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+ OS << "VGPRs: " << getVGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+ OS << ", SGPRs: " << getSGPRNum();
+ if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
+ OS << ", LVGPR WT: " << getVGPRTuplesWeight()
+ << ", LSGPR WT: " << getSGPRTuplesWeight();
+ if (ST) OS << " -> Occ: " << getOccupancy(*ST);
+ OS << '\n';
+}
+#endif
+
+
+static LaneBitmask getDefRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI) {
+ assert(MO.isDef() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ // We don't rely on the read-undef flag because, in the case of tentative
+ // schedule tracking, it isn't set correctly yet. This works correctly,
+ // however, since the use mask has been tracked via LIS beforehand.
+ return MO.getSubReg() == 0 ?
+ MRI.getMaxLaneMaskForVReg(MO.getReg()) :
+ MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
+}
+
+static LaneBitmask getUsedRegMask(const MachineOperand &MO,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals &LIS) {
+ assert(MO.isUse() && MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+ if (auto SubReg = MO.getSubReg())
+ return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+ auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
+ if (MaxMask.getAsInteger() == 1) // cannot have subregs
+ return MaxMask;
+
+ // For a tentative schedule LIS isn't updated yet, but the live mask
+ // should remain the same for any schedule. Subreg defs can be reordered,
+ // but they all must dominate uses anyway.
+ auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+ return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
+}
+
+static SmallVector<RegisterMaskPair, 8>
+collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ SmallVector<RegisterMaskPair, 8> Res;
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ if (!MO.isUse() || !MO.readsReg())
+ continue;
+
+ auto const UsedMask = getUsedRegMask(MO, MRI, LIS);
+
+ auto Reg = MO.getReg();
+ auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) {
+ return RM.RegUnit == Reg;
+ });
+ if (I != Res.end())
+ I->LaneMask |= UsedMask;
+ else
+ Res.push_back(RegisterMaskPair(Reg, UsedMask));
+ }
+ return Res;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
+LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
+ SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ LaneBitmask LiveMask;
+ const auto &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges())
+ if (S.liveAt(SI)) {
+ LiveMask |= S.LaneMask;
+ assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) ||
+ LiveMask == MRI.getMaxLaneMaskForVReg(Reg));
+ }
+ } else if (LI.liveAt(SI)) {
+ LiveMask = MRI.getMaxLaneMaskForVReg(Reg);
+ }
+ return LiveMask;
+}
+
+GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI) {
+ GCNRPTracker::LiveRegSet LiveRegs;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (!LIS.hasInterval(Reg))
+ continue;
+ auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
+ if (LiveMask.any())
+ LiveRegs[Reg] = LiveMask;
+ }
+ return LiveRegs;
+}
+
+void GCNUpwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsAfter(MI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
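+// recede() moves the tracker's state from just below MI to just above it:
+// the pressure at the MI itself (the current live set plus MI's uses) is
+// folded into MaxPressure, then defined lanes are removed from the live set
+// and used lanes are added, since a use keeps its register live above the MI.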
+void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
+ assert(MRI && "call reset first");
+
+ LastTrackedMI = &MI;
+
+ if (MI.isDebugValue())
+ return;
+
+ auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);
+
+ // calc pressure at the MI (defs + uses)
+ auto AtMIPressure = CurPressure;
+ for (const auto &U : RegUses) {
+ auto LiveMask = LiveRegs[U.RegUnit];
+ AtMIPressure.inc(U.RegUnit, LiveMask, LiveMask | U.LaneMask, *MRI);
+ }
+ // update max pressure
+ MaxPressure = max(AtMIPressure, MaxPressure);
+
+ for (const auto &MO : MI.defs()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
+ MO.isDead())
+ continue;
+
+ auto Reg = MO.getReg();
+ auto I = LiveRegs.find(Reg);
+ if (I == LiveRegs.end())
+ continue;
+ auto &LiveMask = I->second;
+ auto PrevMask = LiveMask;
+ LiveMask &= ~getDefRegMask(MO, *MRI);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ if (LiveMask.none())
+ LiveRegs.erase(I);
+ }
+ for (const auto &U : RegUses) {
+ auto &LiveMask = LiveRegs[U.RegUnit];
+ auto PrevMask = LiveMask;
+ LiveMask |= U.LaneMask;
+ CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
+ }
+ assert(CurPressure == getRegPressure(*MRI, LiveRegs));
+}
+
+bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
+ const LiveRegSet *LiveRegsCopy) {
+ MRI = &MI.getParent()->getParent()->getRegInfo();
+ LastTrackedMI = nullptr;
+ MBBEnd = MI.getParent()->end();
+ NextMI = &MI;
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+ if (LiveRegsCopy) {
+ if (&LiveRegs != LiveRegsCopy)
+ LiveRegs = *LiveRegsCopy;
+ } else {
+ LiveRegs = getLiveRegsBefore(*NextMI, LIS);
+ }
+ MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+ return true;
+}
+
+bool GCNDownwardRPTracker::advanceBeforeNext() {
+ assert(MRI && "call reset first");
+
+ NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
+ if (NextMI == MBBEnd)
+ return false;
+
+ SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex();
+ assert(SI.isValid());
+
+ // Remove dead registers or mask bits.
+ for (auto &It : LiveRegs) {
+ const LiveInterval &LI = LIS.getInterval(It.first);
+ if (LI.hasSubRanges()) {
+ for (const auto &S : LI.subranges()) {
+ if (!S.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second &= ~S.LaneMask;
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ }
+ } else if (!LI.liveAt(SI)) {
+ auto PrevMask = It.second;
+ It.second = LaneBitmask::getNone();
+ CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ }
+ if (It.second.none())
+ LiveRegs.erase(It.first);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+
+ return true;
+}
+
+void GCNDownwardRPTracker::advanceToNext() {
+ LastTrackedMI = &*NextMI++;
+
+ // Add new registers or mask bits.
+ for (const auto &MO : LastTrackedMI->defs()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ auto &LiveMask = LiveRegs[Reg];
+ auto PrevMask = LiveMask;
+ LiveMask |= getDefRegMask(MO, *MRI);
+ CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+ }
+
+ MaxPressure = max(MaxPressure, CurPressure);
+}
+
+bool GCNDownwardRPTracker::advance() {
+ // If reset has just been called, the live set is already up to date.
+ if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext()))
+ return false;
+ advanceToNext();
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) {
+ while (NextMI != End)
+ if (!advance()) return false;
+ return true;
+}
+
+bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy) {
+ reset(*Begin, LiveRegsCopy);
+ return advance(End);
+}
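+
+// Illustrative use of the downward tracker (this is how
+// GCNScheduleDAGMILive::getRealRegPressure() in GCNSchedStrategy.cpp
+// employs it):
+//
+//   GCNDownwardRPTracker RPT(*LIS);
+//   RPT.advance(Begin, End, &LiveIns);  // reset at Begin, then walk to End
+//   GCNRegPressure MaxRP = RPT.moveMaxPressure();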
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
+ const GCNRPTracker::LiveRegSet &TrackedLR,
+ const TargetRegisterInfo *TRI) {
+ for (auto const &P : TrackedLR) {
+ auto I = LISLR.find(P.first);
+ if (I == LISLR.end()) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << ":L" << PrintLaneMask(P.second)
+ << " isn't found in LIS reported set\n";
+ }
+ else if (I->second != P.second) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << " masks doesn't match: LIS reported "
+ << PrintLaneMask(I->second)
+ << ", tracked "
+ << PrintLaneMask(P.second)
+ << '\n';
+ }
+ }
+ for (auto const &P : LISLR) {
+ auto I = TrackedLR.find(P.first);
+ if (I == TrackedLR.end()) {
+ dbgs() << " " << PrintReg(P.first, TRI)
+ << ":L" << PrintLaneMask(P.second)
+ << " isn't found in tracked set\n";
+ }
+ }
+}
+
+bool GCNUpwardRPTracker::isValid() const {
+ const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
+ const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
+ const auto &TrackedLR = LiveRegs;
+
+ if (!isEqual(LISLR, TrackedLR)) {
+ dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
+ " LIS reported livesets mismatch:\n";
+ printLivesAt(SI, LIS, *MRI);
+ reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo());
+ return false;
+ }
+
+ auto LISPressure = getRegPressure(*MRI, LISLR);
+ if (LISPressure != CurPressure) {
+ dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: ";
+ CurPressure.print(dbgs());
+ dbgs() << "LIS rpt: ";
+ LISPressure.print(dbgs());
+ return false;
+ }
+ return true;
+}
+
+void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto It = LiveRegs.find(Reg);
+ if (It != LiveRegs.end() && It->second.any())
+ OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':'
+ << PrintLaneMask(It->second);
+ }
+ OS << '\n';
+}
+#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h
new file mode 100644
index 0000000..5dfe440
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -0,0 +1,207 @@
+//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+
+#include "AMDGPUSubtarget.h"
+
+#include <limits>
+
+namespace llvm {
+
+struct GCNRegPressure {
+ enum RegKind {
+ SGPR32,
+ SGPR_TUPLE,
+ VGPR32,
+ VGPR_TUPLE,
+ TOTAL_KINDS
+ };
+
+ GCNRegPressure() {
+ clear();
+ }
+
+ bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
+
+ void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
+
+ unsigned getSGPRNum() const { return Value[SGPR32]; }
+ unsigned getVGPRNum() const { return Value[VGPR32]; }
+
+ unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+ unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
+
+ unsigned getOccupancy(const SISubtarget &ST) const {
+ return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+ ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+ }
+
+ void inc(unsigned Reg,
+ LaneBitmask PrevMask,
+ LaneBitmask NewMask,
+ const MachineRegisterInfo &MRI);
+
+ bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+ return getOccupancy(ST) > O.getOccupancy(ST);
+ }
+
+ bool less(const SISubtarget &ST, const GCNRegPressure& O,
+ unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+
+ bool operator==(const GCNRegPressure &O) const {
+ return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
+ }
+
+ bool operator!=(const GCNRegPressure &O) const {
+ return !(*this == O);
+ }
+
+ void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
+ void dump() const { print(dbgs()); }
+
+private:
+ unsigned Value[TOTAL_KINDS];
+
+ static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);
+
+ friend GCNRegPressure max(const GCNRegPressure &P1,
+ const GCNRegPressure &P2);
+};
+
+inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
+ GCNRegPressure Res;
+ for (unsigned I = 0; I < GCNRegPressure::TOTAL_KINDS; ++I)
+ Res.Value[I] = std::max(P1.Value[I], P2.Value[I]);
+ return Res;
+}
+
+class GCNRPTracker {
+public:
+ typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
+
+protected:
+ const LiveIntervals &LIS;
+ LiveRegSet LiveRegs;
+ GCNRegPressure CurPressure, MaxPressure;
+ const MachineInstr *LastTrackedMI = nullptr;
+ mutable const MachineRegisterInfo *MRI = nullptr;
+ GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+public:
+ // live regs for the current state
+ const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
+ const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+
+ void clearMaxPressure() { MaxPressure.clear(); }
+
+ // returns MaxPressure, resetting it
+ decltype(MaxPressure) moveMaxPressure() {
+ auto Res = MaxPressure;
+ MaxPressure.clear();
+ return Res;
+ }
+ decltype(LiveRegs) moveLiveRegs() {
+ return std::move(LiveRegs);
+ }
+ static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
+ const MachineRegisterInfo &MRI);
+};
+
+class GCNUpwardRPTracker : public GCNRPTracker {
+public:
+ GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+ // reset the tracker to the point just below MI,
+ // filling live regs at this point using LIS
+ void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
+
+ // move to the state just above the MI
+ void recede(const MachineInstr &MI);
+
+ // checks whether the tracker's state after receding MI corresponds
+ // to that reported by LIS
+ bool isValid() const;
+};
+
+class GCNDownwardRPTracker : public GCNRPTracker {
+ // Last position of reset or advanceBeforeNext
+ MachineBasicBlock::const_iterator NextMI;
+
+ MachineBasicBlock::const_iterator MBBEnd;
+
+public:
+ GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
+
+ const MachineBasicBlock::const_iterator getNext() const { return NextMI; }
+
+ // Reset the tracker to the point before MI, filling live regs at this
+ // point using LIS.
+ // Returns false if the block is empty except for debug values.
+ bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
+
+ // Move to the state right before the next MI. Returns false if reached
+ // end of the block.
+ bool advanceBeforeNext();
+
+ // Move to the state at the MI; advanceBeforeNext has to be called first.
+ void advanceToNext();
+
+ // Move to the state at the next MI. Returns false if reached end of block.
+ bool advance();
+
+ // Advance instructions until before End.
+ bool advance(MachineBasicBlock::const_iterator End);
+
+ // Reset to Begin and advance to End.
+ bool advance(MachineBasicBlock::const_iterator Begin,
+ MachineBasicBlock::const_iterator End,
+ const LiveRegSet *LiveRegsCopy = nullptr);
+};
+
+LaneBitmask getLiveLaneMask(unsigned Reg,
+ SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
+ MI.getParent()->getParent()->getRegInfo());
+}
+
+inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,
+ MI.getParent()->getParent()->getRegInfo());
+}
+
+template <typename Range>
+GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
+ Range &&LiveRegs) {
+ GCNRegPressure Res;
+ for (const auto &RM : LiveRegs)
+ Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI);
+ return Res;
+}
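+
+// For example, the pressure just before an instruction can be computed by
+// combining the helpers above:
+//   getRegPressure(MRI, getLiveRegsBefore(MI, LIS))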
+
+void printLivesAt(SlotIndex SI,
+ const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 2f88033..155b400 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -18,14 +18,15 @@
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/MathExtras.h"
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
using namespace llvm;
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C) :
- GenericScheduler(C) { }
+ GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
const MachineFunction &MF) {
@@ -35,18 +36,46 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
ST.getOccupancyWithNumVGPRs(VGPRs));
return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+ ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+ *MF.getFunction()));
+}
+
+void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
+ GenericScheduler::initialize(DAG);
+
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+
+ MF = &DAG->MF;
+
+ const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+
+ // FIXME: This is also necessary, because some passes that run after
+ // scheduling and before regalloc increase register pressure.
+ const int ErrorMargin = 3;
+
+ SGPRExcessLimit = Context->RegClassInfo
+ ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin;
+ VGPRExcessLimit = Context->RegClassInfo
+ ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin;
+ if (TargetOccupancy) {
+ SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true);
+ VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy);
+ } else {
+ SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
+ SRI->getSGPRPressureSet());
+ VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
+ SRI->getVGPRPressureSet());
+ }
+
+ SGPRCriticalLimit -= ErrorMargin;
+ VGPRCriticalLimit -= ErrorMargin;
}
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
- int SGPRPressure,
- int VGPRPressure,
- int SGPRExcessLimit,
- int VGPRExcessLimit,
- int SGPRCriticalLimit,
- int VGPRCriticalLimit) {
+ unsigned SGPRPressure,
+ unsigned VGPRPressure) {
Cand.SU = SU;
Cand.AtTop = AtTop;
@@ -66,8 +95,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}
- int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
- int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+ unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -77,7 +106,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
- const int MaxVGPRPressureInc = 16;
+ const unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
@@ -86,11 +115,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
// register pressure.
- // FIXME: This is also necessary, because some passes that run after
- // scheduling and before regalloc increase register pressure.
- const int ErrorMargin = 3;
- VGPRExcessLimit -= ErrorMargin;
- SGPRExcessLimit -= ErrorMargin;
// We only need to update the RPDelata for instructions that increase
// register pressure. Instructions that decrease or keep reg pressure
@@ -103,7 +127,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
- Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure = SGPRExcessLimit);
+ Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
// Register pressure is considered 'CRITICAL' if it is approaching a value
@@ -111,9 +135,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// register pressure is 'CRITICAL', increading SGPR and VGPR pressure both
// has the same cost, so we don't need to prefer one over the other.
- VGPRCriticalLimit -= ErrorMargin;
- SGPRCriticalLimit -= ErrorMargin;
-
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
@@ -134,27 +155,16 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand) {
- const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
- unsigned SGPRExcessLimit =
- Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
- unsigned VGPRExcessLimit =
- Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
- unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
- unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true);
- unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves);
-
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
- SGPRPressure, VGPRPressure,
- SGPRExcessLimit, VGPRExcessLimit,
- SGPRCriticalLimit, VGPRCriticalLimit);
+ SGPRPressure, VGPRPressure);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
@@ -167,16 +177,6 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
}
}
-static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
- switch (Reason) {
- default:
- return Reason;
- case GenericSchedulerBase::RegCritical:
- case GenericSchedulerBase::RegExcess:
- return -Reason;
- }
-}
-
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
@@ -224,9 +224,9 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
DEBUG(
dbgs() << "Top Cand: ";
- traceCandidate(BotCand);
- dbgs() << "Bot Cand: ";
traceCandidate(TopCand);
+ dbgs() << "Bot Cand: ";
+ traceCandidate(BotCand);
);
SchedCandidate Cand;
if (TopCand.Reason == BotCand.Reason) {
@@ -249,9 +249,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
} else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
Cand = BotCand;
} else {
- int TopRank = getBidirectionalReasonRank(TopCand.Reason);
- int BotRank = getBidirectionalReasonRank(BotCand.Reason);
- if (TopRank > BotRank) {
+ if (BotCand.Reason > TopCand.Reason) {
Cand = TopCand;
} else {
Cand = BotCand;
@@ -310,3 +308,242 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
return SU;
}
+
+GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
+ std::unique_ptr<MachineSchedStrategy> S) :
+ ScheduleDAGMILive(C, std::move(S)),
+ ST(MF.getSubtarget<SISubtarget>()),
+ MFI(*MF.getInfo<SIMachineFunctionInfo>()),
+ StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
+ *MF.getFunction())),
+ MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+
+ DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
+}
+
+void GCNScheduleDAGMILive::schedule() {
+ if (Stage == 0) {
+ // Just record regions at the first pass.
+ Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+ return;
+ }
+
+ std::vector<MachineInstr*> Unsched;
+ Unsched.reserve(NumRegionInstrs);
+ for (auto &I : *this)
+ Unsched.push_back(&I);
+
+ GCNRegPressure PressureBefore;
+ if (LIS) {
+ PressureBefore = Pressure[RegionIdx];
+
+ DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:";
+ GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI);
+ dbgs() << "Region live-in pressure: ";
+ llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs());
+ dbgs() << "Region register pressure: ";
+ PressureBefore.print(dbgs()));
+ }
+
+ ScheduleDAGMILive::schedule();
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+
+ if (!LIS)
+ return;
+
+ // Check the results of scheduling.
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ auto PressureAfter = getRealRegPressure();
+
+ DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs()));
+
+ if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
+ PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) {
+ Pressure[RegionIdx] = PressureAfter;
+ DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+ return;
+ }
+ unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(),
+ PressureAfter.getVGPRNum(), MF);
+ unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(),
+ PressureBefore.getVGPRNum(), MF);
+ DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
+ ", after " << WavesAfter << ".\n");
+
+ // We could not keep the current target occupancy because of the just
+ // scheduled region. Record the new occupancy for the next scheduling cycle.
+ unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+ if (NewOccupancy < MinOccupancy) {
+ MinOccupancy = NewOccupancy;
+ DEBUG(dbgs() << "Occupancy lowered for the function to "
+ << MinOccupancy << ".\n");
+ }
+
+ if (WavesAfter >= WavesBefore) {
+ Pressure[RegionIdx] = PressureAfter;
+ return;
+ }
+
+ DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ RegionEnd = RegionBegin;
+ for (MachineInstr *MI : Unsched) {
+ if (MI->getIterator() != RegionEnd) {
+ BB->remove(MI);
+ BB->insert(RegionEnd, MI);
+ LIS->handleMove(*MI, true);
+ }
+ // Reset read-undef flags and update them later.
+ for (auto &Op : MI->operands())
+ if (Op.isReg() && Op.isDef())
+ Op.setIsUndef(false);
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+ RegionEnd = MI->getIterator();
+ ++RegionEnd;
+ DEBUG(dbgs() << "Scheduling " << *MI);
+ }
+ RegionBegin = Unsched.front()->getIterator();
+ Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+
+ placeDebugValues();
+}
+
+GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const {
+ GCNDownwardRPTracker RPTracker(*LIS);
+ RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
+ return RPTracker.moveMaxPressure();
+}
+
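+// computeBlockPressure() walks the whole block once with a downward tracker,
+// recording the live-in set at each region entry and the max pressure over
+// each region, instead of recomputing liveness per region. If the block has
+// a single successor scheduled after it, the computed live-outs are cached
+// in MBBLiveIns to seed that successor's walk.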
+void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) {
+ GCNDownwardRPTracker RPTracker(*LIS);
+
+ // If the block has only one successor, then the live-ins of that
+ // successor are the live-outs of the current block. We can reuse the
+ // calculated live set if the successor will be scheduled after the
+ // current block.
+ const MachineBasicBlock *OnlySucc = nullptr;
+ if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
+ OnlySucc = *MBB->succ_begin();
+ }
+
+ // Scheduler sends regions from the end of the block upwards.
+ size_t CurRegion = RegionIdx;
+ for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
+ if (Regions[CurRegion].first->getParent() != MBB)
+ break;
+ --CurRegion;
+
+ auto I = MBB->begin();
+ auto LiveInIt = MBBLiveIns.find(MBB);
+ if (LiveInIt != MBBLiveIns.end()) {
+ auto LiveIn = std::move(LiveInIt->second);
+ RPTracker.reset(*MBB->begin(), &LiveIn);
+ MBBLiveIns.erase(LiveInIt);
+ } else {
+ I = Regions[CurRegion].first;
+ RPTracker.reset(*I);
+ }
+
+ for ( ; ; ) {
+ I = RPTracker.getNext();
+
+ if (Regions[CurRegion].first == I) {
+ LiveIns[CurRegion] = RPTracker.getLiveRegs();
+ RPTracker.clearMaxPressure();
+ }
+
+ if (Regions[CurRegion].second == I) {
+ Pressure[CurRegion] = RPTracker.moveMaxPressure();
+ if (CurRegion-- == RegionIdx)
+ break;
+ }
+ RPTracker.advanceToNext();
+ RPTracker.advanceBeforeNext();
+ }
+
+ if (OnlySucc) {
+ if (I != MBB->end()) {
+ RPTracker.advanceToNext();
+ RPTracker.advance(MBB->end());
+ }
+ RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs());
+ RPTracker.advanceBeforeNext();
+ MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
+ }
+}
+
+void GCNScheduleDAGMILive::finalizeSchedule() {
+ GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+ DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+
+ LiveIns.resize(Regions.size());
+ Pressure.resize(Regions.size());
+
+ do {
+ Stage++;
+ RegionIdx = 0;
+ MachineBasicBlock *MBB = nullptr;
+
+ if (Stage > 1) {
+ // Retry function scheduling if the resulting occupancy is lower than the
+ // occupancy used for first-pass scheduling. This gives more freedom to
+ // schedule low-register-pressure blocks.
+ // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+
+ if (!LIS || StartingOccupancy <= MinOccupancy)
+ break;
+
+ DEBUG(dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
+
+ S.setTargetOccupancy(MinOccupancy);
+ }
+
+ for (auto Region : Regions) {
+ RegionBegin = Region.first;
+ RegionEnd = Region.second;
+
+ if (RegionBegin->getParent() != MBB) {
+ if (MBB) finishBlock();
+ MBB = RegionBegin->getParent();
+ startBlock(MBB);
+ if (Stage == 1)
+ computeBlockPressure(MBB);
+ }
+
+ unsigned NumRegionInstrs = std::distance(begin(), end());
+ enterRegion(MBB, begin(), end(), NumRegionInstrs);
+
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (begin() == end() || begin() == std::prev(end())) {
+ exitRegion();
+ continue;
+ }
+
+ DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ DEBUG(dbgs() << MF.getName()
+ << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+ << "\n From: " << *begin() << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+
+ schedule();
+
+ exitRegion();
+ ++RegionIdx;
+ }
+ finishBlock();
+
+ } while (Stage < 2);
+}
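The retry machinery above reduces to one rule per region: keep the new schedule only if it does not lower achievable occupancy, and remember the lowest occupancy observed so the optional second stage can re-run scheduling against that relaxed target. A minimal freestanding sketch of that rule (hypothetical names; the real code queries the subtarget through getMaxWaves and reverts by moving instructions back):

    #include <algorithm>

    // Sketch only: Waves(...) stands in for the subtarget occupancy query.
    unsigned Waves(unsigned SGPRs, unsigned VGPRs); // assumed helper

    // Returns true if the freshly scheduled region should be kept.
    bool keepRegionSchedule(unsigned SGPRsBefore, unsigned VGPRsBefore,
                            unsigned SGPRsAfter, unsigned VGPRsAfter,
                            unsigned &MinOccupancy) {
      unsigned WavesBefore = Waves(SGPRsBefore, VGPRsBefore);
      unsigned WavesAfter = Waves(SGPRsAfter, VGPRsAfter);
      // Track the best occupancy this region can still reach; stage 2
      // reschedules the whole function against this lowered target.
      MinOccupancy = std::min(MinOccupancy, std::max(WavesBefore, WavesAfter));
      return WavesAfter >= WavesBefore; // otherwise revert the new order
    }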
diff --git a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 4cfc0ce..060d2ca 100644
--- a/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/contrib/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,17 +14,21 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#include "GCNRegPressure.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
+class SIMachineFunctionInfo;
class SIRegisterInfo;
+class SISubtarget;
/// This is a minimal scheduler strategy. The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+ friend class GCNScheduleDAGMILive;
SUnit *pickNodeBidirectional(bool &IsTopNode);
@@ -35,18 +39,72 @@ class GCNMaxOccupancySchedStrategy : public GenericScheduler {
void initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
- int SGPRPressure, int VGPRPressure,
- int SGPRExcessLimit, int VGPRExcessLimit,
- int SGPRCriticalLimit, int VGPRCriticalLimit);
+ unsigned SGPRPressure, unsigned VGPRPressure);
- void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary *Zone, const SIRegisterInfo *SRI,
- unsigned SGPRPressure, unsigned VGPRPressure);
+ unsigned SGPRExcessLimit;
+ unsigned VGPRExcessLimit;
+ unsigned SGPRCriticalLimit;
+ unsigned VGPRCriticalLimit;
+
+ unsigned TargetOccupancy;
+
+ MachineFunction *MF;
public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
+
+ void initialize(ScheduleDAGMI *DAG) override;
+
+ void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
+};
+
+class GCNScheduleDAGMILive : public ScheduleDAGMILive {
+
+ const SISubtarget &ST;
+
+ const SIMachineFunctionInfo &MFI;
+
+ // Occupancy target at the beginning of the function scheduling cycle.
+ unsigned StartingOccupancy;
+
+ // Minimal real occupancy recorded for the function.
+ unsigned MinOccupancy;
+
+ // Scheduling stage number.
+ unsigned Stage;
+
+ // Current region index.
+ size_t RegionIdx;
+
+ // Vector of regions recorded for later rescheduling.
+ SmallVector<std::pair<MachineBasicBlock::iterator,
+ MachineBasicBlock::iterator>, 32> Regions;
+
+ // Region live-in cache.
+ SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+ // Region pressure cache.
+ SmallVector<GCNRegPressure, 32> Pressure;
+
+ // Temporary basic block live-in cache.
+ DenseMap<const MachineBasicBlock*, GCNRPTracker::LiveRegSet> MBBLiveIns;
+
+ // Return current region pressure.
+ GCNRegPressure getRealRegPressure() const;
+
+ // Compute and cache live-ins and pressure for all regions in the block.
+ void computeBlockPressure(const MachineBasicBlock *MBB);
+
+
+public:
+ GCNScheduleDAGMILive(MachineSchedContext *C,
+ std::unique_ptr<MachineSchedStrategy> S);
+
+ void schedule() override;
+
+ void finalizeSchedule() override;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index 7172a0a..a844081 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -9,8 +9,8 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstPrinter.h"
-#include "SIDefines.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCExpr.h"
@@ -72,6 +72,11 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff);
}
+void AMDGPUInstPrinter::printS16ImmDecOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ O << formatDec(static_cast<int16_t>(MI->getOperand(OpNo).getImm()));
+}
+
void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -113,11 +118,21 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
uint16_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
- O << " offset:";
+ O << ((OpNo == 0)? "offset:" : " offset:");
printU16ImmDecOperand(MI, OpNo, O);
}
}
+void AMDGPUInstPrinter::printOffsetS13(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm != 0) {
+ O << ((OpNo == 0)? "offset:" : " offset:");
+ printS16ImmDecOperand(MI, OpNo, O);
+ }
+}
+
void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -216,6 +231,24 @@ void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
O << " vm";
}
+void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " dfmt:";
+ printU8ImmDecOperand(MI, OpNo, O);
+ }
+}
+
+void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).getImm()) {
+ O << " nfmt:";
+ printU8ImmDecOperand(MI, OpNo, O);
+ }
+}
+
void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
const MCRegisterInfo &MRI) {
switch (RegNo) {
@@ -264,6 +297,11 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
case AMDGPU::FLAT_SCR_HI:
O << "flat_scratch_hi";
return;
+ case AMDGPU::FP_REG:
+ case AMDGPU::SP_REG:
+ case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
+ case AMDGPU::PRIVATE_RSRC_REG:
+ llvm_unreachable("pseudo-register should not ever be emitted");
default:
break;
}
@@ -375,6 +413,13 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
O << formatHex(static_cast<uint64_t>(Imm));
}
+void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint16_t Lo16 = static_cast<uint16_t>(Imm);
+ printImmediate16(Lo16, STI, O);
+}
+
void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -489,6 +534,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_FP16:
printImmediate16(Op.getImm(), STI, O);
break;
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ printImmediateV216(Op.getImm(), STI, O);
+ break;
case MCOI::OPERAND_UNKNOWN:
case MCOI::OPERAND_PCREL:
O << formatDec(Op.getImm());
@@ -531,13 +580,34 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned InputModifiers = MI->getOperand(OpNo).getImm();
- if (InputModifiers & SISrcMods::NEG)
- O << '-';
+
+ // Use 'neg(...)' instead of '-' to avoid ambiguity.
+ // This is important for integer literals because
+ // -1 is not the same value as neg(1).
+ bool NegMnemo = false;
+
+ if (InputModifiers & SISrcMods::NEG) {
+ if (OpNo + 1 < MI->getNumOperands() &&
+ (InputModifiers & SISrcMods::ABS) == 0) {
+ const MCOperand &Op = MI->getOperand(OpNo + 1);
+ NegMnemo = Op.isImm() || Op.isFPImm();
+ }
+ if (NegMnemo) {
+ O << "neg(";
+ } else {
+ O << '-';
+ }
+ }
+
if (InputModifiers & SISrcMods::ABS)
O << '|';
printOperand(MI, OpNo + 1, STI, O);
if (InputModifiers & SISrcMods::ABS)
O << '|';
+
+ if (NegMnemo) {
+ O << ')';
+ }
}
void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
@@ -672,11 +742,19 @@ template <unsigned N>
void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en);
+ unsigned Opc = MI->getOpcode();
+ int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en);
unsigned En = MI->getOperand(EnIdx).getImm();
- // FIXME: What do we do with compr? The meaning of en changes depending on if
- // compr is set.
+ int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr);
+
+ // If compr is set, print as src0, src0, src1, src1
+ if (MI->getOperand(ComprIdx).getImm()) {
+ if (N == 1 || N == 2)
+ --OpNo;
+ else if (N == 3)
+ OpNo -= 2;
+ }
if (En & (1 << N))
printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
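With compr set the export packs two 16-bit values per register, so the four logical sources map onto just src0 and src1; the OpNo adjustments above implement that collapse. As a table (operand layout assumed from the surrounding code):

    // N (logical source):   0     1     2     3
    // register printed:    src0  src0  src1  src1
    // OpNo adjustment:     none   -1    -1    -2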
@@ -730,6 +808,71 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
}
}
+static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod) {
+ int DefaultValue = (Mod == SISrcMods::OP_SEL_1);
+
+ for (int I = 0; I < NumOps; ++I) {
+ if (!!(Ops[I] & Mod) != DefaultValue)
+ return false;
+ }
+
+ return true;
+}
+
+static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
+ raw_ostream &O) {
+ unsigned Opc = MI->getOpcode();
+ int NumOps = 0;
+ int Ops[3];
+
+ for (int OpName : { AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers }) {
+ int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ if (Idx == -1)
+ break;
+
+ Ops[NumOps++] = MI->getOperand(Idx).getImm();
+ }
+
+ if (allOpsDefaultValue(Ops, NumOps, Mod))
+ return;
+
+ O << Name;
+ for (int I = 0; I < NumOps; ++I) {
+ if (I != 0)
+ O << ',';
+
+ O << !!(Ops[I] & Mod);
+ }
+
+ O << ']';
+}
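printPackedModifier therefore stays silent when every source carries the default bit (op_sel and neg default to 0, op_sel_hi defaults to 1) and otherwise prints one 0/1 flag per source. A few illustrative cases for a hypothetical three-source VOP3P instruction:

    // src0_modifiers has OP_SEL_0, src1/src2 do not ->  " op_sel:[1,0,0]"
    // every source has OP_SEL_1 (the default)       ->  op_sel_hi omitted
    // only src2_modifiers has NEG_HI                ->  " neg_hi:[0,0,1]"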
+
+void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
+}
+
+void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O);
+}
+
+void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O);
+}
+
+void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
+}
+
void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1054,30 +1197,137 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
O << SImm16; // Unknown simm16 code.
}
+static void printSwizzleBitmask(const uint16_t AndMask,
+ const uint16_t OrMask,
+ const uint16_t XorMask,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ uint16_t Probe0 = ((0 & AndMask) | OrMask) ^ XorMask;
+ uint16_t Probe1 = ((BITMASK_MASK & AndMask) | OrMask) ^ XorMask;
+
+ O << "\"";
+
+ for (unsigned Mask = 1 << (BITMASK_WIDTH - 1); Mask > 0; Mask >>= 1) {
+ uint16_t p0 = Probe0 & Mask;
+ uint16_t p1 = Probe1 & Mask;
+
+ if (p0 == p1) {
+ if (p0 == 0) {
+ O << "0";
+ } else {
+ O << "1";
+ }
+ } else {
+ if (p0 == 0) {
+ O << "p";
+ } else {
+ O << "i";
+ }
+ }
+ }
+
+ O << "\"";
+}
+
+void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ using namespace llvm::AMDGPU::Swizzle;
+
+ uint16_t Imm = MI->getOperand(OpNo).getImm();
+ if (Imm == 0) {
+ return;
+ }
+
+ O << " offset:";
+
+ if ((Imm & QUAD_PERM_ENC_MASK) == QUAD_PERM_ENC) {
+
+ O << "swizzle(" << IdSymbolic[ID_QUAD_PERM];
+ for (auto i = 0; i < LANE_NUM; ++i) {
+ O << ",";
+ O << formatDec(Imm & LANE_MASK);
+ Imm >>= LANE_SHIFT;
+ }
+ O << ")";
+
+ } else if ((Imm & BITMASK_PERM_ENC_MASK) == BITMASK_PERM_ENC) {
+
+ uint16_t AndMask = (Imm >> BITMASK_AND_SHIFT) & BITMASK_MASK;
+ uint16_t OrMask = (Imm >> BITMASK_OR_SHIFT) & BITMASK_MASK;
+ uint16_t XorMask = (Imm >> BITMASK_XOR_SHIFT) & BITMASK_MASK;
+
+ if (AndMask == BITMASK_MAX &&
+ OrMask == 0 &&
+ countPopulation(XorMask) == 1) {
+
+ O << "swizzle(" << IdSymbolic[ID_SWAP];
+ O << ",";
+ O << formatDec(XorMask);
+ O << ")";
+
+ } else if (AndMask == BITMASK_MAX &&
+ OrMask == 0 && XorMask > 0 &&
+ isPowerOf2_64(XorMask + 1)) {
+
+ O << "swizzle(" << IdSymbolic[ID_REVERSE];
+ O << ",";
+ O << formatDec(XorMask + 1);
+ O << ")";
+
+ } else {
+
+ uint16_t GroupSize = BITMASK_MAX - AndMask + 1;
+ if (GroupSize > 1 &&
+ isPowerOf2_64(GroupSize) &&
+ OrMask < GroupSize &&
+ XorMask == 0) {
+
+ O << "swizzle(" << IdSymbolic[ID_BROADCAST];
+ O << ",";
+ O << formatDec(GroupSize);
+ O << ",";
+ O << formatDec(OrMask);
+ O << ")";
+
+ } else {
+ O << "swizzle(" << IdSymbolic[ID_BITMASK_PERM];
+ O << ",";
+ printSwizzleBitmask(AndMask, OrMask, XorMask, O);
+ O << ")";
+ }
+ }
+ } else {
+ printU16ImmDecOperand(MI, OpNo, O);
+ }
+}
+
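The probe trick in printSwizzleBitmask can be replayed standalone: feed an all-zeros and an all-ones lane id through ((id & AndMask) | OrMask) ^ XorMask; bits where the two results agree are forced constants, and bits where they disagree either preserve or invert the incoming lane bit. A freestanding sketch (a 5-bit lane id and the 0x1f mask are assumed from the constants used above):

    #include <cstdint>
    #include <string>

    std::string classifyLaneBits(uint16_t AndMask, uint16_t OrMask,
                                 uint16_t XorMask) {
      uint16_t Probe0 = ((0x00 & AndMask) | OrMask) ^ XorMask; // id = 0b00000
      uint16_t Probe1 = ((0x1f & AndMask) | OrMask) ^ XorMask; // id = 0b11111
      std::string S;
      for (unsigned Mask = 1u << 4; Mask != 0; Mask >>= 1) {
        bool B0 = (Probe0 & Mask) != 0;
        bool B1 = (Probe1 & Mask) != 0;
        if (B0 == B1)
          S += B0 ? '1' : '0'; // forced bit
        else
          S += B0 ? 'i' : 'p'; // lane bit inverted / preserved
      }
      return S; // e.g. (0x1f, 0, 1) -> "ppppi": swap with the neighboring lane
    }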
void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- IsaVersion IV = getIsaVersion(STI.getFeatureBits());
+ AMDGPU::IsaInfo::IsaVersion ISA =
+ AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits());
unsigned SImm16 = MI->getOperand(OpNo).getImm();
unsigned Vmcnt, Expcnt, Lgkmcnt;
- decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt);
+ decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt);
bool NeedSpace = false;
- if (Vmcnt != getVmcntBitMask(IV)) {
+ if (Vmcnt != getVmcntBitMask(ISA)) {
O << "vmcnt(" << Vmcnt << ')';
NeedSpace = true;
}
- if (Expcnt != getExpcntBitMask(IV)) {
+ if (Expcnt != getExpcntBitMask(ISA)) {
if (NeedSpace)
O << ' ';
O << "expcnt(" << Expcnt << ')';
NeedSpace = true;
}
- if (Lgkmcnt != getLgkmcntBitMask(IV)) {
+ if (Lgkmcnt != getLgkmcntBitMask(ISA)) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index a6d348f..7bbf99a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -42,6 +42,7 @@ private:
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printS16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU32ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
@@ -52,6 +53,9 @@ private:
void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printOffsetS13(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+
void printOffset0(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printOffset1(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -84,12 +88,18 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpVM(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNFMT(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printRegOperand(unsigned RegNo, raw_ostream &O);
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
@@ -117,6 +127,14 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printOpSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printOpSelHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNegLo(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printNegHi(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpSlot(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printInterpAttr(const MCInst *MI, unsigned OpNo,
@@ -183,6 +201,8 @@ private:
raw_ostream &O);
void printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printWaitFlag(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index ffb92aa..a50e3eb 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -8,8 +8,8 @@
/// \file
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
@@ -30,14 +30,9 @@ public:
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
- void processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
-
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override {
@@ -102,36 +97,11 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
-void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
- MCValue Res;
-
- // When we have complex expressions like: BB0_1 + (BB0_2 - 4), which are
- // used for long branches, this function will be called with
- // IsResolved = false and Value set to some pre-computed value. In
- // the example above, the value would be:
- // (BB0_1 + (BB0_2 - 4)) - CurrentOffsetFromStartOfFunction.
- // This is not what we want. We just want the expression computation
- // only. The reason the MC layer subtracts the current offset from the
- // expression is because the fixup is of kind FK_PCRel_4.
- // For these scenarios, evaluateAsValue gives us the computation that we
- // want.
- if (!IsResolved && Fixup.getValue()->evaluateAsValue(Res, Layout) &&
- Res.isAbsolute()) {
- Value = Res.getConstant();
- IsResolved = true;
-
- }
- if (IsResolved)
- Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
-}
-
-void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
+ Value = adjustFixupValue(Fixup, Value, &Asm.getContext());
if (!Value)
return; // Doesn't change encoding.
@@ -142,7 +112,7 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
uint32_t Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the bits from
// the fixup value.
@@ -164,7 +134,20 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
}
bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
- OW->WriteZeros(Count);
+ // If the count is not 4-byte aligned, we must be writing data into the text
+ // section (otherwise we have unaligned instructions, and thus have far
+ // bigger problems), so just write zeros instead.
+ OW->WriteZeros(Count % 4);
+
+ // We are properly aligned, so write NOPs as requested.
+ Count /= 4;
+
+ // FIXME: R600 support.
+ // s_nop 0
+ const uint32_t Encoded_S_NOP_0 = 0xbf800000;
+
+ for (uint64_t I = 0; I != Count; ++I)
+ OW->write32(Encoded_S_NOP_0);
return true;
}
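writeNopData now decomposes any pad request into a (possibly empty) byte remainder written as zeros, followed by whole s_nop instructions; for example, a 10-byte request becomes two zero bytes and two 0xbf800000 words. A sketch of the layout arithmetic:

    #include <cstdint>
    #include <cstdio>

    // How the padding above decomposes a byte count (illustration only).
    void describePad(uint64_t Count) {
      const uint32_t SNop0 = 0xbf800000; // encoding of "s_nop 0"
      std::printf("%llu zero byte(s), then %llu word(s) of 0x%08x\n",
                  (unsigned long long)(Count % 4),
                  (unsigned long long)(Count / 4), SNop0);
    }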
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
new file mode 100644
index 0000000..4e828a7
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -0,0 +1,432 @@
+//===--- AMDGPUCodeObjectMetadataStreamer.cpp -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata Streamer.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCodeObjectMetadataStreamer.h"
+#include "AMDGPU.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+static cl::opt<bool> DumpCodeObjectMetadata(
+ "amdgpu-dump-comd",
+ cl::desc("Dump AMDGPU Code Object Metadata"));
+static cl::opt<bool> VerifyCodeObjectMetadata(
+ "amdgpu-verify-comd",
+ cl::desc("Verify AMDGPU Code Object Metadata"));
+
+namespace AMDGPU {
+namespace CodeObject {
+
+void MetadataStreamer::dump(StringRef YamlString) const {
+ errs() << "AMDGPU Code Object Metadata:\n" << YamlString << '\n';
+}
+
+void MetadataStreamer::verify(StringRef YamlString) const {
+ errs() << "AMDGPU Code Object Metadata Parser Test: ";
+
+ CodeObject::Metadata FromYamlString;
+ if (Metadata::fromYamlString(YamlString, FromYamlString)) {
+ errs() << "FAIL\n";
+ return;
+ }
+
+ std::string ToYamlString;
+ if (Metadata::toYamlString(FromYamlString, ToYamlString)) {
+ errs() << "FAIL\n";
+ return;
+ }
+
+ errs() << (YamlString == ToYamlString ? "PASS" : "FAIL") << '\n';
+ if (YamlString != ToYamlString) {
+ errs() << "Original input: " << YamlString << '\n'
+ << "Produced output: " << ToYamlString << '\n';
+ }
+}
+
+AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
+ if (AccQual.empty())
+ return AccessQualifier::Unknown;
+
+ return StringSwitch<AccessQualifier>(AccQual)
+ .Case("read_only", AccessQualifier::ReadOnly)
+ .Case("write_only", AccessQualifier::WriteOnly)
+ .Case("read_write", AccessQualifier::ReadWrite)
+ .Default(AccessQualifier::Default);
+}
+
+AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer(
+ unsigned AddressSpace) const {
+ if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS)
+ return AddressSpaceQualifier::Private;
+ if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS)
+ return AddressSpaceQualifier::Global;
+ if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS)
+ return AddressSpaceQualifier::Constant;
+ if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS)
+ return AddressSpaceQualifier::Local;
+ if (AddressSpace == AMDGPUASI.FLAT_ADDRESS)
+ return AddressSpaceQualifier::Generic;
+ if (AddressSpace == AMDGPUASI.REGION_ADDRESS)
+ return AddressSpaceQualifier::Region;
+
+ llvm_unreachable("Unknown address space qualifier");
+}
+
+ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
+ StringRef BaseTypeName) const {
+ if (TypeQual.find("pipe") != StringRef::npos)
+ return ValueKind::Pipe;
+
+ return StringSwitch<ValueKind>(BaseTypeName)
+ .Case("image1d_t", ValueKind::Image)
+ .Case("image1d_array_t", ValueKind::Image)
+ .Case("image1d_buffer_t", ValueKind::Image)
+ .Case("image2d_t", ValueKind::Image)
+ .Case("image2d_array_t", ValueKind::Image)
+ .Case("image2d_array_depth_t", ValueKind::Image)
+ .Case("image2d_array_msaa_t", ValueKind::Image)
+ .Case("image2d_array_msaa_depth_t", ValueKind::Image)
+ .Case("image2d_depth_t", ValueKind::Image)
+ .Case("image2d_msaa_t", ValueKind::Image)
+ .Case("image2d_msaa_depth_t", ValueKind::Image)
+ .Case("image3d_t", ValueKind::Image)
+ .Case("sampler_t", ValueKind::Sampler)
+ .Case("queue_t", ValueKind::Queue)
+ .Default(isa<PointerType>(Ty) ?
+ (Ty->getPointerAddressSpace() ==
+ AMDGPUASI.LOCAL_ADDRESS ?
+ ValueKind::DynamicSharedPointer :
+ ValueKind::GlobalBuffer) :
+ ValueKind::ByValue);
+}
+
+ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID: {
+ auto Signed = !TypeName.startswith("u");
+ switch (Ty->getIntegerBitWidth()) {
+ case 8:
+ return Signed ? ValueType::I8 : ValueType::U8;
+ case 16:
+ return Signed ? ValueType::I16 : ValueType::U16;
+ case 32:
+ return Signed ? ValueType::I32 : ValueType::U32;
+ case 64:
+ return Signed ? ValueType::I64 : ValueType::U64;
+ default:
+ return ValueType::Struct;
+ }
+ }
+ case Type::HalfTyID:
+ return ValueType::F16;
+ case Type::FloatTyID:
+ return ValueType::F32;
+ case Type::DoubleTyID:
+ return ValueType::F64;
+ case Type::PointerTyID:
+ return getValueType(Ty->getPointerElementType(), TypeName);
+ case Type::VectorTyID:
+ return getValueType(Ty->getVectorElementType(), TypeName);
+ default:
+ return ValueType::Struct;
+ }
+}
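For pointer and vector arguments the mapping recurses down to the scalar element type, and the leading 'u' of the OpenCL base type name selects the unsigned variant; e.g. a global uint4* kernel argument resolves like this:

    // getValueType(uint4 addrspace(1)*, "uint4") -- sketch of the recursion:
    //   PointerTyID -> getValueType(<4 x i32>, "uint4")
    //   VectorTyID  -> getValueType(i32, "uint4")
    //   IntegerTyID, width 32, name starts with 'u' -> ValueType::U32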
+
+std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
+ switch (Ty->getTypeID()) {
+ case Type::IntegerTyID: {
+ if (!Signed)
+ return (Twine('u') + getTypeName(Ty, true)).str();
+
+ auto BitWidth = Ty->getIntegerBitWidth();
+ switch (BitWidth) {
+ case 8:
+ return "char";
+ case 16:
+ return "short";
+ case 32:
+ return "int";
+ case 64:
+ return "long";
+ default:
+ return (Twine('i') + Twine(BitWidth)).str();
+ }
+ }
+ case Type::HalfTyID:
+ return "half";
+ case Type::FloatTyID:
+ return "float";
+ case Type::DoubleTyID:
+ return "double";
+ case Type::VectorTyID: {
+ auto VecTy = cast<VectorType>(Ty);
+ auto ElTy = VecTy->getElementType();
+ auto NumElements = VecTy->getVectorNumElements();
+ return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
+ }
+ default:
+ return "unknown";
+ }
+}
+
+std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
+ MDNode *Node) const {
+ std::vector<uint32_t> Dims;
+ if (Node->getNumOperands() != 3)
+ return Dims;
+
+ for (auto &Op : Node->operands())
+ Dims.push_back(mdconst::extract<ConstantInt>(Op)->getZExtValue());
+ return Dims;
+}
+
+void MetadataStreamer::emitVersion() {
+ auto &Version = CodeObjectMetadata.mVersion;
+
+ Version.push_back(MetadataVersionMajor);
+ Version.push_back(MetadataVersionMinor);
+}
+
+void MetadataStreamer::emitPrintf(const Module &Mod) {
+ auto &Printf = CodeObjectMetadata.mPrintf;
+
+ auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
+ if (!Node)
+ return;
+
+ for (auto Op : Node->operands())
+ if (Op->getNumOperands())
+ Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
+}
+
+void MetadataStreamer::emitKernelLanguage(const Function &Func) {
+ auto &Kernel = CodeObjectMetadata.mKernels.back();
+
+ // TODO: What about other languages?
+ auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
+ if (!Node || !Node->getNumOperands())
+ return;
+ auto Op0 = Node->getOperand(0);
+ if (Op0->getNumOperands() <= 1)
+ return;
+
+ Kernel.mLanguage = "OpenCL C";
+ Kernel.mLanguageVersion.push_back(
+ mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue());
+ Kernel.mLanguageVersion.push_back(
+ mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
+}
+
+void MetadataStreamer::emitKernelAttrs(const Function &Func) {
+ auto &Attrs = CodeObjectMetadata.mKernels.back().mAttrs;
+
+ if (auto Node = Func.getMetadata("reqd_work_group_size"))
+ Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node);
+ if (auto Node = Func.getMetadata("work_group_size_hint"))
+ Attrs.mWorkGroupSizeHint = getWorkGroupDimensions(Node);
+ if (auto Node = Func.getMetadata("vec_type_hint")) {
+ Attrs.mVecTypeHint = getTypeName(
+ cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+ mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
+ }
+}
+
+void MetadataStreamer::emitKernelArgs(const Function &Func) {
+ for (auto &Arg : Func.args())
+ emitKernelArg(Arg);
+
+ // TODO: What about other languages?
+ if (!Func.getParent()->getNamedMetadata("opencl.ocl.version"))
+ return;
+
+ auto &DL = Func.getParent()->getDataLayout();
+ auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+ emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+
+ if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+ return;
+
+ auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
+ AMDGPUASI.GLOBAL_ADDRESS);
+ emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+}
+
+void MetadataStreamer::emitKernelArg(const Argument &Arg) {
+ auto Func = Arg.getParent();
+ auto ArgNo = Arg.getArgNo();
+ const MDNode *Node;
+
+ StringRef TypeQual;
+ Node = Func->getMetadata("kernel_arg_type_qual");
+ if (Node && ArgNo < Node->getNumOperands())
+ TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ StringRef BaseTypeName;
+ Node = Func->getMetadata("kernel_arg_base_type");
+ if (Node && ArgNo < Node->getNumOperands())
+ BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ StringRef AccQual;
+ if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
+ Arg.hasNoAliasAttr()) {
+ AccQual = "read_only";
+ } else {
+ Node = Func->getMetadata("kernel_arg_access_qual");
+ if (Node && ArgNo < Node->getNumOperands())
+ AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+ }
+
+ StringRef Name;
+ Node = Func->getMetadata("kernel_arg_name");
+ if (Node && ArgNo < Node->getNumOperands())
+ Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ StringRef TypeName;
+ Node = Func->getMetadata("kernel_arg_type");
+ if (Node && ArgNo < Node->getNumOperands())
+ TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+ emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
+ getValueKind(Arg.getType(), TypeQual, BaseTypeName), TypeQual,
+ BaseTypeName, AccQual, Name, TypeName);
+}
+
+void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
+ ValueKind ValueKind, StringRef TypeQual,
+ StringRef BaseTypeName, StringRef AccQual,
+ StringRef Name, StringRef TypeName) {
+ CodeObjectMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
+ auto &Arg = CodeObjectMetadata.mKernels.back().mArgs.back();
+
+ Arg.mSize = DL.getTypeAllocSize(Ty);
+ Arg.mAlign = DL.getABITypeAlignment(Ty);
+ Arg.mValueKind = ValueKind;
+ Arg.mValueType = getValueType(Ty, BaseTypeName);
+
+ if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+ auto ElTy = PtrTy->getElementType();
+ if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized())
+ Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy);
+ }
+
+ Arg.mAccQual = getAccessQualifier(AccQual);
+
+ if (auto PtrTy = dyn_cast<PointerType>(Ty))
+ Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
+
+ SmallVector<StringRef, 1> SplitTypeQuals;
+ TypeQual.split(SplitTypeQuals, " ", -1, false);
+ for (StringRef Key : SplitTypeQuals) {
+ auto P = StringSwitch<bool*>(Key)
+ .Case("const", &Arg.mIsConst)
+ .Case("pipe", &Arg.mIsPipe)
+ .Case("restrict", &Arg.mIsRestrict)
+ .Case("volatile", &Arg.mIsVolatile)
+ .Default(nullptr);
+ if (P)
+ *P = true;
+ }
+
+ Arg.mName = Name;
+ Arg.mTypeName = TypeName;
+}
+
+void MetadataStreamer::emitKernelCodeProps(
+ const amd_kernel_code_t &KernelCode) {
+ auto &CodeProps = CodeObjectMetadata.mKernels.back().mCodeProps;
+
+ CodeProps.mKernargSegmentSize = KernelCode.kernarg_segment_byte_size;
+ CodeProps.mWorkgroupGroupSegmentSize =
+ KernelCode.workgroup_group_segment_byte_size;
+ CodeProps.mWorkitemPrivateSegmentSize =
+ KernelCode.workitem_private_segment_byte_size;
+ CodeProps.mWavefrontNumSGPRs = KernelCode.wavefront_sgpr_count;
+ CodeProps.mWorkitemNumVGPRs = KernelCode.workitem_vgpr_count;
+ CodeProps.mKernargSegmentAlign = KernelCode.kernarg_segment_alignment;
+ CodeProps.mGroupSegmentAlign = KernelCode.group_segment_alignment;
+ CodeProps.mPrivateSegmentAlign = KernelCode.private_segment_alignment;
+ CodeProps.mWavefrontSize = KernelCode.wavefront_size;
+}
+
+void MetadataStreamer::emitKernelDebugProps(
+ const amd_kernel_code_t &KernelCode) {
+ if (!(KernelCode.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED))
+ return;
+
+ auto &DebugProps = CodeObjectMetadata.mKernels.back().mDebugProps;
+
+ // FIXME: Need to pass down debugger ABI version through features. This is ok
+ // for now because we only have one version.
+ DebugProps.mDebuggerABIVersion.push_back(1);
+ DebugProps.mDebuggerABIVersion.push_back(0);
+ DebugProps.mReservedNumVGPRs = KernelCode.reserved_vgpr_count;
+ DebugProps.mReservedFirstVGPR = KernelCode.reserved_vgpr_first;
+ DebugProps.mPrivateSegmentBufferSGPR =
+ KernelCode.debug_private_segment_buffer_sgpr;
+ DebugProps.mWavefrontPrivateSegmentOffsetSGPR =
+ KernelCode.debug_wavefront_private_segment_offset_sgpr;
+}
+
+void MetadataStreamer::begin(const Module &Mod) {
+ AMDGPUASI = getAMDGPUAS(Mod);
+ emitVersion();
+ emitPrintf(Mod);
+}
+
+void MetadataStreamer::emitKernel(const Function &Func,
+ const amd_kernel_code_t &KernelCode) {
+ if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ return;
+
+ CodeObjectMetadata.mKernels.push_back(Kernel::Metadata());
+ auto &Kernel = CodeObjectMetadata.mKernels.back();
+
+ Kernel.mName = Func.getName();
+ emitKernelLanguage(Func);
+ emitKernelAttrs(Func);
+ emitKernelArgs(Func);
+ emitKernelCodeProps(KernelCode);
+ emitKernelDebugProps(KernelCode);
+}
+
+ErrorOr<std::string> MetadataStreamer::toYamlString() {
+ std::string YamlString;
+ if (auto Error = Metadata::toYamlString(CodeObjectMetadata, YamlString))
+ return Error;
+
+ if (DumpCodeObjectMetadata)
+ dump(YamlString);
+ if (VerifyCodeObjectMetadata)
+ verify(YamlString);
+
+ return YamlString;
+}
+
+ErrorOr<std::string> MetadataStreamer::toYamlString(StringRef YamlString) {
+ if (auto Error = Metadata::fromYamlString(YamlString, CodeObjectMetadata))
+ return Error;
+
+ return toYamlString();
+}
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
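A plausible end-to-end use of the streamer, assuming the caller can supply an amd_kernel_code_t per kernel and holds a raw_ostream OS (error handling elided; this only exercises the begin/emitKernel/toYamlString interface declared in the header below):

    // Hedged usage sketch; M is an llvm::Module, lookupKernelCode is an
    // assumed helper returning the amd_kernel_code_t built for F.
    const amd_kernel_code_t &lookupKernelCode(const Function &F);

    AMDGPU::CodeObject::MetadataStreamer Streamer;
    Streamer.begin(M);                        // emits version + printf metadata
    for (const Function &F : M)
      Streamer.emitKernel(F, lookupKernelCode(F)); // skips non-kernel functions
    Streamer.end();
    if (ErrorOr<std::string> Yaml = Streamer.toYamlString())
      OS << *Yaml;                            // serialized code object metadata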
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h
new file mode 100644
index 0000000..c668143
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h
@@ -0,0 +1,99 @@
+//===--- AMDGPUCodeObjectMetadataStreamer.h ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata Streamer.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+
+#include "AMDGPU.h"
+#include "AMDKernelCodeT.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/AMDGPUCodeObjectMetadata.h"
+#include "llvm/Support/ErrorOr.h"
+
+namespace llvm {
+
+class Argument;
+class DataLayout;
+class Function;
+class MDNode;
+class Module;
+class Type;
+
+namespace AMDGPU {
+namespace CodeObject {
+
+class MetadataStreamer final {
+private:
+ Metadata CodeObjectMetadata;
+ AMDGPUAS AMDGPUASI;
+
+ void dump(StringRef YamlString) const;
+
+ void verify(StringRef YamlString) const;
+
+ AccessQualifier getAccessQualifier(StringRef AccQual) const;
+
+ AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const;
+
+ ValueKind getValueKind(Type *Ty, StringRef TypeQual,
+ StringRef BaseTypeName) const;
+
+ ValueType getValueType(Type *Ty, StringRef TypeName) const;
+
+ std::string getTypeName(Type *Ty, bool Signed) const;
+
+ std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
+
+ void emitVersion();
+
+ void emitPrintf(const Module &Mod);
+
+ void emitKernelLanguage(const Function &Func);
+
+ void emitKernelAttrs(const Function &Func);
+
+ void emitKernelArgs(const Function &Func);
+
+ void emitKernelArg(const Argument &Arg);
+
+ void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
+ StringRef TypeQual = "", StringRef BaseTypeName = "",
+ StringRef AccQual = "", StringRef Name = "",
+ StringRef TypeName = "");
+
+ void emitKernelCodeProps(const amd_kernel_code_t &KernelCode);
+
+ void emitKernelDebugProps(const amd_kernel_code_t &KernelCode);
+
+public:
+ MetadataStreamer() = default;
+ ~MetadataStreamer() = default;
+
+ void begin(const Module &Mod);
+
+ void end() {}
+
+ void emitKernel(const Function &Func, const amd_kernel_code_t &KernelCode);
+
+ ErrorOr<std::string> toYamlString();
+
+ ErrorOr<std::string> toYamlString(StringRef YamlString);
+};
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 1847d7a..6abe7f3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -1,16 +1,20 @@
-//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==//
+//===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-/// \file
//===----------------------------------------------------------------------===//
#include "AMDGPUMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -19,20 +23,21 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
+
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
};
-} // End anonymous namespace
+} // end anonymous namespace
AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
bool HasRelocationAddend)
: MCELFObjectTargetWriter(Is64Bit,
ELF::ELFOSABI_AMDGPU_HSA,
ELF::EM_AMDGPU,
- HasRelocationAddend) { }
+ HasRelocationAddend) {}
unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
@@ -77,7 +82,6 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("unhandled relocation type");
}
-
MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
bool HasRelocationAddend,
raw_pwrite_stream &OS) {
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 1655591..2364e7b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -14,6 +14,8 @@
using namespace llvm;
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() {
+ CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4;
+ StackGrowsUp = true;
HasSingleParameterDotFile = false;
//===------------------------------------------------------------------===//
MinInstAlignment = 4;
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 3d3858a..1b06206 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -52,6 +52,18 @@ public:
return 0;
}
+ virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
+ virtual unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
protected:
uint64_t computeAvailableFeatures(const FeatureBitset &FB) const;
void verifyInstructionPredicates(const MCInst &MI,
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 548bad5..f80b5f3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -54,11 +54,17 @@ MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
#define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
#include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_OPERAND_ENUM
+#undef GET_INSTRINFO_ENUM
+
#define GET_SUBTARGETINFO_ENUM
#include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
#endif
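The added #undef lines apply the usual TableGen hygiene pattern: each GET_* macro selects one section of a generated .inc file, so it must be cleared after the include to keep it from also selecting sections in unrelated .inc files included later. In the abstract:

    // Generic shape of the pattern (not tied to a particular .inc):
    #define GET_REGINFO_ENUM
    #include "AMDGPUGenRegisterInfo.inc" // expands only the enum section
    #undef GET_REGINFO_ENUM              // later .inc includes stay unaffected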
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
deleted file mode 100644
index 95387ad..0000000
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
+++ /dev/null
@@ -1,408 +0,0 @@
-//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Generates AMDGPU runtime metadata for YAML mapping.
-//
-//===----------------------------------------------------------------------===//
-//
-
-#include "AMDGPU.h"
-#include "AMDGPURuntimeMetadata.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/YAMLTraits.h"
-#include <vector>
-#include "AMDGPURuntimeMD.h"
-
-using namespace llvm;
-using namespace ::AMDGPU::RuntimeMD;
-
-static cl::opt<bool>
-DumpRuntimeMD("amdgpu-dump-rtmd",
- cl::desc("Dump AMDGPU runtime metadata"));
-
-static cl::opt<bool>
-CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden,
- cl::desc("Check AMDGPU runtime metadata YAML parser"));
-
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
-LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
-LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata)
-
-namespace llvm {
-namespace yaml {
-
-template <> struct MappingTraits<KernelArg::Metadata> {
- static void mapping(IO &YamlIO, KernelArg::Metadata &A) {
- YamlIO.mapRequired(KeyName::ArgSize, A.Size);
- YamlIO.mapRequired(KeyName::ArgAlign, A.Align);
- YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U);
- YamlIO.mapRequired(KeyName::ArgKind, A.Kind);
- YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType);
- YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string());
- YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string());
- YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL);
- YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL);
- YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0));
- YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0));
- YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0));
- YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0));
- }
- static const bool flow = true;
-};
-
-template <> struct MappingTraits<Kernel::Metadata> {
- static void mapping(IO &YamlIO, Kernel::Metadata &K) {
- YamlIO.mapRequired(KeyName::KernelName, K.Name);
- YamlIO.mapOptional(KeyName::Language, K.Language, std::string());
- YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion);
- YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize);
- YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint);
- YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string());
- YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex,
- INVALID_KERNEL_INDEX);
- YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups,
- uint8_t(0));
- YamlIO.mapRequired(KeyName::Args, K.Args);
- }
- static const bool flow = true;
-};
-
-template <> struct MappingTraits<Program::Metadata> {
- static void mapping(IO &YamlIO, Program::Metadata &Prog) {
- YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq);
- YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo);
- YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels);
- }
- static const bool flow = true;
-};
-
-} // end namespace yaml
-} // end namespace llvm
-
-// Get a vector of three integer values from MDNode \p Node;
-static std::vector<uint32_t> getThreeInt32(MDNode *Node) {
- assert(Node->getNumOperands() == 3);
- std::vector<uint32_t> V;
- for (const MDOperand &Op : Node->operands()) {
- const ConstantInt *CI = mdconst::extract<ConstantInt>(Op);
- V.push_back(CI->getZExtValue());
- }
- return V;
-}
-
-static std::string getOCLTypeName(Type *Ty, bool Signed) {
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- return "half";
- case Type::FloatTyID:
- return "float";
- case Type::DoubleTyID:
- return "double";
- case Type::IntegerTyID: {
- if (!Signed)
- return (Twine('u') + getOCLTypeName(Ty, true)).str();
- unsigned BW = Ty->getIntegerBitWidth();
- switch (BW) {
- case 8:
- return "char";
- case 16:
- return "short";
- case 32:
- return "int";
- case 64:
- return "long";
- default:
- return (Twine('i') + Twine(BW)).str();
- }
- }
- case Type::VectorTyID: {
- VectorType *VecTy = cast<VectorType>(Ty);
- Type *EleTy = VecTy->getElementType();
- unsigned Size = VecTy->getVectorNumElements();
- return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
- }
- default:
- return "unknown";
- }
-}
-
-static KernelArg::ValueType getRuntimeMDValueType(
- Type *Ty, StringRef TypeName) {
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- return KernelArg::F16;
- case Type::FloatTyID:
- return KernelArg::F32;
- case Type::DoubleTyID:
- return KernelArg::F64;
- case Type::IntegerTyID: {
- bool Signed = !TypeName.startswith("u");
- switch (Ty->getIntegerBitWidth()) {
- case 8:
- return Signed ? KernelArg::I8 : KernelArg::U8;
- case 16:
- return Signed ? KernelArg::I16 : KernelArg::U16;
- case 32:
- return Signed ? KernelArg::I32 : KernelArg::U32;
- case 64:
- return Signed ? KernelArg::I64 : KernelArg::U64;
- default:
- // Runtime does not recognize other integer types. Report as struct type.
- return KernelArg::Struct;
- }
- }
- case Type::VectorTyID:
- return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName);
- case Type::PointerTyID:
- return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName);
- default:
- return KernelArg::Struct;
- }
-}
-
-static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace(
- AMDGPUAS::AddressSpaces A) {
- switch (A) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- return KernelArg::Global;
- case AMDGPUAS::CONSTANT_ADDRESS:
- return KernelArg::Constant;
- case AMDGPUAS::LOCAL_ADDRESS:
- return KernelArg::Local;
- case AMDGPUAS::FLAT_ADDRESS:
- return KernelArg::Generic;
- case AMDGPUAS::REGION_ADDRESS:
- return KernelArg::Region;
- default:
- return KernelArg::Private;
- }
-}
-
-static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL,
- Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "",
- StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "",
- StringRef AccQual = "") {
-
- KernelArg::Metadata Arg;
-
- // Set ArgSize and ArgAlign.
- Arg.Size = DL.getTypeAllocSize(T);
- Arg.Align = DL.getABITypeAlignment(T);
- if (auto PT = dyn_cast<PointerType>(T)) {
- auto ET = PT->getElementType();
- if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized())
- Arg.PointeeAlign = DL.getABITypeAlignment(ET);
- }
-
- // Set ArgTypeName.
- Arg.TypeName = TypeName;
-
- // Set ArgName.
- Arg.Name = ArgName;
-
- // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe.
- SmallVector<StringRef, 1> SplitQ;
- TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */);
-
- for (StringRef KeyName : SplitQ) {
- auto *P = StringSwitch<uint8_t *>(KeyName)
- .Case("volatile", &Arg.IsVolatile)
- .Case("restrict", &Arg.IsRestrict)
- .Case("const", &Arg.IsConst)
- .Case("pipe", &Arg.IsPipe)
- .Default(nullptr);
- if (P)
- *P = 1;
- }
-
- // Set ArgKind.
- Arg.Kind = Kind;
-
- // Set ArgValueType.
- Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName);
-
- // Set ArgAccQual.
- if (!AccQual.empty()) {
- Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual)
- .Case("read_only", KernelArg::ReadOnly)
- .Case("write_only", KernelArg::WriteOnly)
- .Case("read_write", KernelArg::ReadWrite)
- .Default(KernelArg::AccNone);
- }
-
- // Set ArgAddrQual.
- if (auto *PT = dyn_cast<PointerType>(T)) {
- Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>(
- PT->getAddressSpace()));
- }
-
- return Arg;
-}
-
-static Kernel::Metadata getRuntimeMDForKernel(const Function &F) {
- Kernel::Metadata Kernel;
- Kernel.Name = F.getName();
- auto &M = *F.getParent();
-
- // Set Language and LanguageVersion.
- if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
- if (MD->getNumOperands() != 0) {
- auto Node = MD->getOperand(0);
- if (Node->getNumOperands() > 1) {
- Kernel.Language = "OpenCL C";
- uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
- ->getZExtValue();
- uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
- ->getZExtValue();
- Kernel.LanguageVersion.push_back(Major);
- Kernel.LanguageVersion.push_back(Minor);
- }
- }
- }
-
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto &Arg : F.args()) {
- unsigned I = Arg.getArgNo();
- Type *T = Arg.getType();
- auto TypeName = dyn_cast<MDString>(F.getMetadata(
- "kernel_arg_type")->getOperand(I))->getString();
- auto BaseTypeName = cast<MDString>(F.getMetadata(
- "kernel_arg_base_type")->getOperand(I))->getString();
- StringRef ArgName;
- if (auto ArgNameMD = F.getMetadata("kernel_arg_name"))
- ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString();
- auto TypeQual = cast<MDString>(F.getMetadata(
- "kernel_arg_type_qual")->getOperand(I))->getString();
- auto AccQual = cast<MDString>(F.getMetadata(
- "kernel_arg_access_qual")->getOperand(I))->getString();
- KernelArg::Kind Kind;
- if (TypeQual.find("pipe") != StringRef::npos)
- Kind = KernelArg::Pipe;
- else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName)
- .Case("sampler_t", KernelArg::Sampler)
- .Case("queue_t", KernelArg::Queue)
- .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
- "image2d_t" , "image2d_array_t", KernelArg::Image)
- .Cases("image2d_depth_t", "image2d_array_depth_t",
- "image2d_msaa_t", "image2d_array_msaa_t",
- "image2d_msaa_depth_t", KernelArg::Image)
- .Cases("image2d_array_msaa_depth_t", "image3d_t",
- KernelArg::Image)
- .Default(isa<PointerType>(T) ?
- (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ?
- KernelArg::DynamicSharedPointer :
- KernelArg::GlobalBuffer) :
- KernelArg::ByValue);
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind,
- BaseTypeName, TypeName, ArgName, TypeQual, AccQual));
- }
-
- // Emit hidden kernel arguments for OpenCL kernels.
- if (F.getParent()->getNamedMetadata("opencl.ocl.version")) {
- auto Int64T = Type::getInt64Ty(F.getContext());
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
- KernelArg::HiddenGlobalOffsetX));
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
- KernelArg::HiddenGlobalOffsetY));
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
- KernelArg::HiddenGlobalOffsetZ));
- if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) {
- auto Int8PtrT = Type::getInt8PtrTy(F.getContext(),
- KernelArg::Global);
- Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT,
- KernelArg::HiddenPrintfBuffer));
- }
- }
-
- // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint.
- if (auto RWGS = F.getMetadata("reqd_work_group_size"))
- Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS);
-
- if (auto WGSH = F.getMetadata("work_group_size_hint"))
- Kernel.WorkGroupSizeHint = getThreeInt32(WGSH);
-
- if (auto VTH = F.getMetadata("vec_type_hint"))
- Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>(
- VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
- VTH->getOperand(1))->getZExtValue());
-
- return Kernel;
-}
-
-Program::Metadata::Metadata(const std::string &YAML) {
- yaml::Input Input(YAML);
- Input >> *this;
-}
-
-std::string Program::Metadata::toYAML(void) {
- std::string Text;
- raw_string_ostream Stream(Text);
- yaml::Output Output(Stream, nullptr, INT_MAX /* do not wrap line */);
- Output << *this;
- return Stream.str();
-}
-
-Program::Metadata Program::Metadata::fromYAML(const std::string &S) {
- return Program::Metadata(S);
-}
-
-// Check if the YAML string can be parsed.
-static void checkRuntimeMDYAMLString(const std::string &YAML) {
- auto P = Program::Metadata::fromYAML(YAML);
- auto S = P.toYAML();
- llvm::errs() << "AMDGPU runtime metadata parser test "
- << (YAML == S ? "passes" : "fails") << ".\n";
- if (YAML != S) {
- llvm::errs() << "First output: " << YAML << '\n'
- << "Second output: " << S << '\n';
- }
-}
-
-std::string llvm::getRuntimeMDYAMLString(Module &M) {
- Program::Metadata Prog;
- Prog.MDVersionSeq.push_back(MDVersion);
- Prog.MDVersionSeq.push_back(MDRevision);
-
- // Set PrintfInfo.
- if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) {
- for (unsigned I = 0; I < MD->getNumOperands(); ++I) {
- auto Node = MD->getOperand(I);
- if (Node->getNumOperands() > 0)
- Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0))
- ->getString());
- }
- }
-
- // Set Kernels.
- for (auto &F: M.functions()) {
- if (!F.getMetadata("kernel_arg_type"))
- continue;
- Prog.Kernels.emplace_back(getRuntimeMDForKernel(F));
- }
-
- auto YAML = Prog.toYAML();
-
- if (DumpRuntimeMD)
- llvm::errs() << "AMDGPU runtime metadata:\n" << YAML << '\n';
-
- if (CheckRuntimeMDParser)
- checkRuntimeMDYAMLString(YAML);
-
- return YAML;
-}
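The checkRuntimeMDYAMLString() self-test removed above hinges on serialization being a fixed point: parsing the emitter's own output and re-serializing it must reproduce the input byte-for-byte. A minimal sketch of that invariant, using only the fromYAML/toYAML pair defined in the deleted code:

static bool roundTripsCleanly(const std::string &YAML) {
  // fromYAML parses into Program::Metadata; toYAML re-serializes with the
  // same writer settings (no line wrapping), so any byte difference points
  // at a parser or printer bug rather than cosmetic reflow.
  auto P = Program::Metadata::fromYAML(YAML);
  return P.toYAML() == YAML;
}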
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
deleted file mode 100644
index a92fdd4..0000000
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares functions for generating runtime metadata.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
-#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
-
-#include <string>
-
-namespace llvm {
-class Module;
-
-// Get runtime metadata as YAML string.
-std::string getRuntimeMDYAMLString(Module &M);
-
-}
-#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 3392183..2a0032f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -11,12 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "AMDGPUTargetStreamer.h"
+#include "AMDGPU.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
@@ -25,9 +26,7 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/FormattedStream.h"
-#include "AMDGPURuntimeMD.h"
namespace llvm {
#include "AMDGPUPTNote.h"
@@ -36,9 +35,27 @@ namespace llvm {
using namespace llvm;
using namespace llvm::AMDGPU;
+//===----------------------------------------------------------------------===//
+// AMDGPUTargetStreamer
+//===----------------------------------------------------------------------===//
+
AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S) {}
+void AMDGPUTargetStreamer::EmitStartOfCodeObjectMetadata(const Module &Mod) {
+ CodeObjectMetadataStreamer.begin(Mod);
+}
+
+void AMDGPUTargetStreamer::EmitKernelCodeObjectMetadata(
+ const Function &Func, const amd_kernel_code_t &KernelCode) {
+ CodeObjectMetadataStreamer.emitKernel(Func, KernelCode);
+}
+
+void AMDGPUTargetStreamer::EmitEndOfCodeObjectMetadata() {
+ CodeObjectMetadataStreamer.end();
+ EmitCodeObjectMetadata(CodeObjectMetadataStreamer.toYamlString().get());
+}
+
//===----------------------------------------------------------------------===//
// AMDGPUTargetAsmStreamer
//===----------------------------------------------------------------------===//
@@ -83,26 +100,16 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
}
-void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal(
- StringRef GlobalName) {
- OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n';
-}
+bool AMDGPUTargetAsmStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
+ auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
+ if (!VerifiedYamlString)
+ return false;
-void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
- StringRef GlobalName) {
- OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
-}
+ OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin << '\n';
+ OS << VerifiedYamlString.get();
+ OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd << '\n';
-void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) {
- OS << "\t.amdgpu_runtime_metadata\n";
- OS << getRuntimeMDYAMLString(M);
- OS << "\n\t.end_amdgpu_runtime_metadata\n";
-}
-
-void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) {
- OS << "\t.amdgpu_runtime_metadata";
- OS << Metadata;
- OS << "\t.end_amdgpu_runtime_metadata\n";
+ return true;
}
//===----------------------------------------------------------------------===//
@@ -116,22 +123,21 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
-void
-AMDGPUTargetELFStreamer::EmitAMDGPUNote(const MCExpr* DescSZ,
- PT_NOTE::NoteType Type,
- std::function<void(MCELFStreamer &)> EmitDesc) {
+void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
+ const MCExpr *DescSZ, ElfNote::NoteType Type,
+ function_ref<void(MCELFStreamer &)> EmitDesc) {
auto &S = getStreamer();
auto &Context = S.getContext();
- auto NameSZ = sizeof(PT_NOTE::NoteName);
+ auto NameSZ = sizeof(ElfNote::NoteName);
S.PushSection();
S.SwitchSection(Context.getELFSection(
- PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
+ ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
S.EmitIntValue(NameSZ, 4); // namesz
S.EmitValue(DescSZ, 4); // descz
- S.EmitIntValue(Type, 4); // type
- S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ)); // name
+ S.EmitIntValue(Type, 4); // type
+ S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ)); // name
S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
EmitDesc(S); // desc
S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
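For reference, the fields the emitter above writes follow the generic ELF note record layout; the struct below is an illustrative sketch only (nothing in this patch defines it), with the padding supplied by the EmitValueToAlignment(4, ...) calls:

struct ElfNoteRecord {    // illustrative only, not part of the patch
  uint32_t namesz;        // sizeof(ElfNote::NoteName), including the NUL
  uint32_t descsz;        // the DescSZ expression, resolved at layout time
  uint32_t type;          // e.g. NT_AMDGPU_HSA_CODE_OBJECT_METADATA
  // name bytes, zero-padded to a 4-byte boundary
  // desc bytes, zero-padded to a 4-byte boundary
};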
@@ -144,7 +150,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
EmitAMDGPUNote(
MCConstantExpr::create(8, getContext()),
- PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
+ ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
[&](MCELFStreamer &OS){
OS.EmitIntValue(Major, 4);
OS.EmitIntValue(Minor, 4);
@@ -160,14 +166,14 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
StringRef ArchName) {
uint16_t VendorNameSize = VendorName.size() + 1;
uint16_t ArchNameSize = ArchName.size() + 1;
-
+
unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
VendorNameSize + ArchNameSize;
EmitAMDGPUNote(
MCConstantExpr::create(DescSZ, getContext()),
- PT_NOTE::NT_AMDGPU_HSA_ISA,
+ ElfNote::NT_AMDGPU_HSA_ISA,
[&](MCELFStreamer &OS) {
OS.EmitIntValue(VendorNameSize, 2);
OS.EmitIntValue(ArchNameSize, 2);
@@ -198,25 +204,11 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL);
}
-void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal(
- StringRef GlobalName) {
-
- MCSymbolELF *Symbol = cast<MCSymbolELF>(
- getStreamer().getContext().getOrCreateSymbol(GlobalName));
- Symbol->setType(ELF::STT_OBJECT);
- Symbol->setBinding(ELF::STB_LOCAL);
-}
-
-void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
- StringRef GlobalName) {
-
- MCSymbolELF *Symbol = cast<MCSymbolELF>(
- getStreamer().getContext().getOrCreateSymbol(GlobalName));
- Symbol->setType(ELF::STT_OBJECT);
- Symbol->setBinding(ELF::STB_GLOBAL);
-}
+bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
+ auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
+ if (!VerifiedYamlString)
+ return false;
-void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
auto &Context = getContext();
@@ -228,15 +220,13 @@ void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
EmitAMDGPUNote(
DescSZ,
- PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA,
+ ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA,
[&](MCELFStreamer &OS) {
OS.EmitLabel(DescBegin);
- OS.EmitBytes(Metadata);
+ OS.EmitBytes(VerifiedYamlString.get());
OS.EmitLabel(DescEnd);
}
);
-}
-void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) {
- EmitRuntimeMetadata(getRuntimeMDYAMLString(M));
+ return true;
}
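A usage sketch of the begin/emitKernel/end interface introduced above; the driver function and its kernel list are hypothetical, since the AsmPrinter call sites live outside this diff:

static void emitAllCodeObjectMetadata(
    AMDGPUTargetStreamer &TS, const Module &M,
    ArrayRef<std::pair<const Function *, amd_kernel_code_t>> Kernels) {
  TS.EmitStartOfCodeObjectMetadata(M);        // MetadataStreamer.begin()
  for (const auto &K : Kernels)
    TS.EmitKernelCodeObjectMetadata(*K.first, K.second);
  TS.EmitEndOfCodeObjectMetadata();           // verify and emit the YAML
}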
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index e2f2058..968128e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
+#include "AMDGPUCodeObjectMetadataStreamer.h"
#include "AMDKernelCodeT.h"
#include "llvm/MC/MCStreamer.h"
@@ -26,6 +27,7 @@ class Type;
class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
+ AMDGPU::CodeObject::MetadataStreamer CodeObjectMetadataStreamer;
MCContext &getContext() const { return Streamer.getContext(); }
public:
@@ -42,16 +44,18 @@ public:
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
- virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0;
+ virtual void EmitStartOfCodeObjectMetadata(const Module &Mod);
- virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
+ virtual void EmitKernelCodeObjectMetadata(
+ const Function &Func, const amd_kernel_code_t &KernelCode);
- virtual void EmitRuntimeMetadata(Module &M) = 0;
+ virtual void EmitEndOfCodeObjectMetadata();
- virtual void EmitRuntimeMetadata(StringRef Metadata) = 0;
+ /// \returns True on success, false on failure.
+ virtual bool EmitCodeObjectMetadata(StringRef YamlString) = 0;
};
-class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
+class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
formatted_raw_ostream &OS;
public:
AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
@@ -66,21 +70,16 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
- void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
-
- void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
-
- void EmitRuntimeMetadata(Module &M) override;
-
- void EmitRuntimeMetadata(StringRef Metadata) override;
+ /// \returns True on success, false on failure.
+ bool EmitCodeObjectMetadata(StringRef YamlString) override;
};
-class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
+class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
MCStreamer &Streamer;
- void EmitAMDGPUNote(const MCExpr* DescSize,
- AMDGPU::PT_NOTE::NoteType Type,
- std::function<void(MCELFStreamer &)> EmitDesc);
+ void EmitAMDGPUNote(const MCExpr *DescSize,
+ AMDGPU::ElfNote::NoteType Type,
+ function_ref<void(MCELFStreamer &)> EmitDesc);
public:
AMDGPUTargetELFStreamer(MCStreamer &S);
@@ -98,13 +97,8 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
- void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override;
-
- void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
-
- void EmitRuntimeMetadata(Module &M) override;
-
- void EmitRuntimeMetadata(StringRef Metadata) override;
+ /// \returns True on success, false on failure.
+ bool EmitCodeObjectMetadata(StringRef YamlString) override;
};
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 6015ec1..eab90e1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -14,10 +14,10 @@
//
//===----------------------------------------------------------------------===//
-#include "R600Defines.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "R600Defines.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCFixup.h"
diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 0c5bb06..376c9bf 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -69,6 +69,14 @@ public:
unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+
+ unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ unsigned getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
};
} // end anonymous namespace
@@ -220,13 +228,33 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
Imm = MO.getImm();
}
- switch (AMDGPU::getOperandSize(OpInfo)) {
- case 4:
+ switch (OpInfo.OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
- case 8:
+
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
- case 2:
+
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ // FIXME: Is this correct? What do inline immediates do on SI for f16 src
+ // which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ uint16_t Lo16 = static_cast<uint16_t>(Imm);
+ uint32_t Encoding = getLit16Encoding(Lo16, STI);
+ return Encoding;
+ }
default:
llvm_unreachable("invalid operand size");
}
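A worked example for the new V2INT16/V2FP16 case, which by construction keys only on the low half of the packed immediate (the values below are illustrative):

//   Imm  = 0x3C003C00   packed (1.0, 1.0) as v2f16
//   Lo16 = 0x3C00       f16 1.0
// getLit16Encoding(0x3C00, STI) then returns the same inline-constant
// code a scalar f16 immediate of 1.0 would get.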
@@ -297,6 +325,63 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
return getMachineOpValue(MI, MO, Fixups, STI);
}
+unsigned
+SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ using namespace AMDGPU::SDWA;
+
+ uint64_t RegEnc = 0;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ unsigned Reg = MO.getReg();
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+ if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+ RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ }
+ return RegEnc;
+}
+
+unsigned
+SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ using namespace AMDGPU::SDWA;
+
+ uint64_t RegEnc = 0;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ unsigned Reg = MO.getReg();
+ if (Reg != AMDGPU::VCC) {
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+ RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
+ }
+ return RegEnc;
+}
+
+static bool needsPCRel(const MCExpr *Expr) {
+ switch (Expr->getKind()) {
+ case MCExpr::SymbolRef:
+ return true;
+ case MCExpr::Binary: {
+ auto *BE = cast<MCBinaryExpr>(Expr);
+ if (BE->getOpcode() == MCBinaryExpr::Sub)
+ return false;
+ return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS());
+ }
+ case MCExpr::Unary:
+ return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr());
+ case MCExpr::Target:
+ case MCExpr::Constant:
+ return false;
+ }
+ llvm_unreachable("invalid kind");
+}
+
uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
@@ -305,12 +390,21 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
return MRI.getEncodingValue(MO.getReg());
if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
- const auto *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr());
+ // FIXME: Whether this expression is PCRel or not should not depend on what
+ // the expression looks like. Given that this is just a general expression,
+ // it should probably be FK_Data_4 and whatever is producing
+ //
+ // s_add_u32 s2, s2, (extern_const_addrspace+16)
+ //
+ // And expecting a PCRel should instead produce
+ //
+ // .Ltmp1:
+ // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1
MCFixupKind Kind;
- if (Expr && Expr->getSymbol().isExternal())
- Kind = FK_Data_4;
- else
+ if (needsPCRel(MO.getExpr()))
Kind = FK_PCRel_4;
+ else
+ Kind = FK_Data_4;
Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc()));
}
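Worked examples of the needsPCRel() rules added above (symbol names are illustrative):

//   foo               -> true    SymbolRef
//   42                -> false   Constant
//   foo + 16          -> true    Add whose LHS is a SymbolRef
//   foo - .Ltmp1      -> false   Sub is treated as absolute
//   (foo - bar) + 8   -> false   neither side of the Add is PC-relative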
diff --git a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 46803e5..06e2c11 100644
--- a/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -26,6 +26,7 @@ class MIMG_Helper <dag outs, dag ins, string asm,
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
let AsmMatchConverter = "cvtMIMG";
let usesCustomInserter = 1;
+ let SchedRW = [WriteVMEM];
}
class MIMG_NoSampler_Helper <bits<7> op, string asm,
@@ -475,106 +476,6 @@ class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
sub0)
>;
-// ======= SI Image Intrinsics ================
-
-// Image load
-defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
-defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
-def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
-
-// Basic sample
-defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">;
-defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_d, "IMAGE_SAMPLE_D">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl, "IMAGE_SAMPLE_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_l, "IMAGE_SAMPLE_L">;
-defm : SampleRawPatterns<int_SI_image_sample_b, "IMAGE_SAMPLE_B">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl, "IMAGE_SAMPLE_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_lz, "IMAGE_SAMPLE_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_cd, "IMAGE_SAMPLE_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl, "IMAGE_SAMPLE_CD_CL">;
-
-// Sample with comparison
-defm : SampleRawPatterns<int_SI_image_sample_c, "IMAGE_SAMPLE_C">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl, "IMAGE_SAMPLE_C_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d, "IMAGE_SAMPLE_C_D">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl, "IMAGE_SAMPLE_C_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l, "IMAGE_SAMPLE_C_L">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b, "IMAGE_SAMPLE_C_B">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl, "IMAGE_SAMPLE_C_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz, "IMAGE_SAMPLE_C_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd, "IMAGE_SAMPLE_C_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl, "IMAGE_SAMPLE_C_CD_CL">;
-
-// Sample with offsets
-defm : SampleRawPatterns<int_SI_image_sample_o, "IMAGE_SAMPLE_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cl_o, "IMAGE_SAMPLE_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_o, "IMAGE_SAMPLE_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl_o, "IMAGE_SAMPLE_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_l_o, "IMAGE_SAMPLE_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_o, "IMAGE_SAMPLE_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl_o, "IMAGE_SAMPLE_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_lz_o, "IMAGE_SAMPLE_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_o, "IMAGE_SAMPLE_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o, "IMAGE_SAMPLE_CD_CL_O">;
-
-// Sample with comparison and offsets
-defm : SampleRawPatterns<int_SI_image_sample_c_o, "IMAGE_SAMPLE_C_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl_o, "IMAGE_SAMPLE_C_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_o, "IMAGE_SAMPLE_C_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o, "IMAGE_SAMPLE_C_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l_o, "IMAGE_SAMPLE_C_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_o, "IMAGE_SAMPLE_C_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o, "IMAGE_SAMPLE_C_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz_o, "IMAGE_SAMPLE_C_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_o, "IMAGE_SAMPLE_C_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
-
-// Gather opcodes
-// Only the variants which make sense are defined.
-def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_gather4, IMAGE_GATHER4_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl, IMAGE_GATHER4_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_l, IMAGE_GATHER4_L_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b, IMAGE_GATHER4_B_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl, IMAGE_GATHER4_B_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_gather4_lz, IMAGE_GATHER4_LZ_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c, IMAGE_GATHER4_C_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl, IMAGE_GATHER4_C_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_l, IMAGE_GATHER4_C_L_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_b, IMAGE_GATHER4_C_B_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl, IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz, IMAGE_GATHER4_C_LZ_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_o, IMAGE_GATHER4_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o, IMAGE_GATHER4_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_l_o, IMAGE_GATHER4_L_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_o, IMAGE_GATHER4_B_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl_o, IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz_o, IMAGE_GATHER4_LZ_O_V4_V4, v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_o, IMAGE_GATHER4_C_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl_o, IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l_o, IMAGE_GATHER4_C_L_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_o, IMAGE_GATHER4_C_B_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl_o, IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o, IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
-
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
-
// ======= amdgcn Image Intrinsics ==============
// Image load
diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td
index 3c07cc7..d30d1d3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Processors.td
+++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td
@@ -80,50 +80,53 @@ def : Proc<"cayman", R600_VLIW4_Itin,
// Southern Islands
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"SI", SIFullSpeedModel,
- [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
+def : ProcessorModel<"gfx600", SIFullSpeedModel,
+ [FeatureISAVersion6_0_0]>;
+
+def : ProcessorModel<"SI", SIFullSpeedModel,
+ [FeatureISAVersion6_0_0]
+>;
+
+def : ProcessorModel<"tahiti", SIFullSpeedModel,
+ [FeatureISAVersion6_0_0]
>;
-def : ProcessorModel<"tahiti", SIFullSpeedModel,
- [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops]
+def : ProcessorModel<"gfx601", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]
>;
-def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+def : ProcessorModel<"pitcairn", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]>;
-def : ProcessorModel<"verde", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+def : ProcessorModel<"verde", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]>;
-def : ProcessorModel<"oland", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+def : ProcessorModel<"oland", SIQuarterSpeedModel,
+ [FeatureISAVersion6_0_1]>;
-def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureSouthernIslands]>;
+def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>;
//===----------------------------------------------------------------------===//
// Sea Islands
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
+def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
[FeatureISAVersion7_0_0]
>;
-def : ProcessorModel<"kabini", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_2]
+def : ProcessorModel<"bonaire", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_0]
>;
def : ProcessorModel<"kaveri", SIQuarterSpeedModel,
[FeatureISAVersion7_0_0]
>;
-def : ProcessorModel<"hawaii", SIFullSpeedModel,
+def : ProcessorModel<"gfx701", SIFullSpeedModel,
[FeatureISAVersion7_0_1]
>;
-def : ProcessorModel<"mullins", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_2]>;
-
-def : ProcessorModel<"gfx700", SIQuarterSpeedModel,
- [FeatureISAVersion7_0_0]
->;
-
-def : ProcessorModel<"gfx701", SIFullSpeedModel,
+def : ProcessorModel<"hawaii", SIFullSpeedModel,
[FeatureISAVersion7_0_1]
>;
@@ -131,6 +134,17 @@ def : ProcessorModel<"gfx702", SIQuarterSpeedModel,
[FeatureISAVersion7_0_2]
>;
+def : ProcessorModel<"gfx703", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_3]
+>;
+
+def : ProcessorModel<"kabini", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_3]
+>;
+
+def : ProcessorModel<"mullins", SIQuarterSpeedModel,
+ [FeatureISAVersion7_0_3]>;
+
//===----------------------------------------------------------------------===//
// Volcanic Islands
//===----------------------------------------------------------------------===//
@@ -187,3 +201,23 @@ def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
[FeatureISAVersion8_1_0]
>;
+//===----------------------------------------------------------------------===//
+// GFX9
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx900", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_0]
+>;
+
+def : ProcessorModel<"gfx901", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_1]
+>;
+
+def : ProcessorModel<"gfx902", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_2]
+>;
+
+def : ProcessorModel<"gfx903", SIQuarterSpeedModel,
+ [FeatureISAVersion9_0_3]
+>;
+
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
index d0aba38..fbe45cb 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ClauseMergePass.cpp
@@ -62,7 +62,7 @@ private:
const MachineInstr &LatrCFAlu) const;
public:
- R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+ R600ClauseMergePass() : MachineFunctionPass(ID) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -208,6 +208,6 @@ StringRef R600ClauseMergePass::getPassName() const {
} // end anonymous namespace
-llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
- return new R600ClauseMergePass(TM);
+llvm::FunctionPass *llvm::createR600ClauseMergePass() {
+ return new R600ClauseMergePass();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 45b36d3..00cbd24 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -12,17 +12,33 @@
/// computing their address on the fly; it also sets STACK_SIZE info.
//===----------------------------------------------------------------------===//
-#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <new>
+#include <set>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -43,13 +59,12 @@ struct CFStack {
std::vector<StackItem> BranchStack;
std::vector<StackItem> LoopStack;
unsigned MaxStackSize;
- unsigned CurrentEntries;
- unsigned CurrentSubEntries;
+ unsigned CurrentEntries = 0;
+ unsigned CurrentSubEntries = 0;
CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
// We need to reserve a stack entry for CALL_FS in vertex shaders.
- MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
- CurrentEntries(0), CurrentSubEntries(0) { }
+ MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}
unsigned getLoopDepth();
bool branchStackContains(CFStack::StackItem);
@@ -198,9 +213,8 @@ void CFStack::popLoop() {
}
class R600ControlFlowFinalizer : public MachineFunctionPass {
-
private:
- typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+ typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;
enum ControlFlowInstruction {
CF_TC,
@@ -217,10 +231,10 @@ private:
};
static char ID;
- const R600InstrInfo *TII;
- const R600RegisterInfo *TRI;
+ const R600InstrInfo *TII = nullptr;
+ const R600RegisterInfo *TRI = nullptr;
unsigned MaxFetchInst;
- const R600Subtarget *ST;
+ const R600Subtarget *ST = nullptr;
bool IsTrivialInst(MachineInstr &MI) const {
switch (MI.getOpcode()) {
@@ -355,7 +369,7 @@ private:
continue;
int64_t Imm = Src.second;
std::vector<MachineOperand *>::iterator It =
- find_if(Lits, [&](MachineOperand *val) {
+ llvm::find_if(Lits, [&](MachineOperand *val) {
return val->isImm() && (val->getImm() == Imm);
});
@@ -485,8 +499,7 @@ private:
}
public:
- R600ControlFlowFinalizer(TargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
+ R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
ST = &MF.getSubtarget<R600Subtarget>();
@@ -501,7 +514,7 @@ public:
++MB) {
MachineBasicBlock &MBB = *MB;
unsigned CfCount = 0;
- std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
+ std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
std::vector<MachineInstr * > IfThenElseStack;
if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
@@ -542,7 +555,7 @@ public:
CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
} else
CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
-
+ LLVM_FALLTHROUGH;
case AMDGPU::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
@@ -554,7 +567,7 @@ public:
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_WHILE_LOOP))
.addImm(1);
- std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
+ std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
std::set<MachineInstr *>());
Pair.second.insert(MIb);
LoopStack.push_back(std::move(Pair));
@@ -564,7 +577,7 @@ public:
}
case AMDGPU::ENDLOOP: {
CFStack.popLoop();
- std::pair<unsigned, std::set<MachineInstr *> > Pair =
+ std::pair<unsigned, std::set<MachineInstr *>> Pair =
std::move(LoopStack.back());
LoopStack.pop_back();
CounterPropagateAddr(Pair.second, CfCount);
@@ -693,7 +706,6 @@ char R600ControlFlowFinalizer::ID = 0;
} // end anonymous namespace
-
-llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
- return new R600ControlFlowFinalizer(TM);
+FunctionPass *llvm::createR600ControlFlowFinalizer() {
+ return new R600ControlFlowFinalizer();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index 9a5db6c..0d8ccd0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -15,28 +15,39 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
-#include "AMDGPUSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
using namespace llvm;
namespace llvm {
+
void initializeR600EmitClauseMarkersPass(PassRegistry&);
-}
+
+} // end namespace llvm
namespace {
class R600EmitClauseMarkers : public MachineFunctionPass {
-
private:
- const R600InstrInfo *TII;
- int Address;
+ const R600InstrInfo *TII = nullptr;
+ int Address = 0;
unsigned OccupiedDwords(MachineInstr &MI) const {
switch (MI.getOpcode()) {
@@ -118,7 +129,7 @@ private:
SubstituteKCacheBank(MachineInstr &MI,
std::vector<std::pair<unsigned, unsigned>> &CachedConsts,
bool UpdateInstr = true) const {
- std::vector<std::pair<unsigned, unsigned> > UsedKCache;
+ std::vector<std::pair<unsigned, unsigned>> UsedKCache;
if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4)
return true;
@@ -181,10 +192,11 @@ private:
bool canClauseLocalKillFitInClause(
unsigned AluInstCount,
- std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+ std::vector<std::pair<unsigned, unsigned>> KCacheBanks,
MachineBasicBlock::iterator Def,
MachineBasicBlock::iterator BBEnd) {
const R600RegisterInfo &TRI = TII->getRegisterInfo();
+ //TODO: change this to defs?
for (MachineInstr::const_mop_iterator
MOI = Def->operands_begin(),
MOE = Def->operands_end(); MOI != MOE; ++MOI) {
@@ -207,15 +219,17 @@ private:
if (AluInstCount >= TII->getMaxAlusPerClause())
return false;
+ // TODO: Is this true? kill flag appears to work OK below
// Register kill flags have been cleared by the time we get to this
// pass, but it is safe to assume that all uses of this register
// occur in the same basic block as its definition, because
// it is illegal for the scheduler to schedule them in
// different blocks.
- if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+ if (UseI->readsRegister(MOI->getReg()))
LastUseCount = AluInstCount;
- if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+ // Exit early if the current use kills the register
+ if (UseI != Def && UseI->killsRegister(MOI->getReg()))
break;
}
if (LastUseCount)
@@ -228,7 +242,7 @@ private:
MachineBasicBlock::iterator
MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator ClauseHead = I;
- std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
+ std::vector<std::pair<unsigned, unsigned>> KCacheBanks;
bool PushBeforeModifier = false;
unsigned AluInstCount = 0;
for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
@@ -294,8 +308,8 @@ private:
public:
static char ID;
- R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
+ R600EmitClauseMarkers() : MachineFunctionPass(ID) {
initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
}
@@ -310,9 +324,11 @@ public:
if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
continue; // BB was already parsed
for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
- if (isALU(*I))
- I = MakeALUClause(MBB, I);
- else
+ if (isALU(*I)) {
+ auto next = MakeALUClause(MBB, I);
+ assert(next != I);
+ I = next;
+ } else
++I;
}
}
@@ -333,7 +349,6 @@ INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
"R600 Emit Clause Markters", false, false)
-llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
+FunctionPass *llvm::createR600EmitClauseMarkers() {
return new R600EmitClauseMarkers();
}
-
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index 3e46e63..66def2d 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -15,11 +15,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
-#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -37,7 +37,7 @@ private:
unsigned Op);
public:
- R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
+ R600ExpandSpecialInstrsPass() : MachineFunctionPass(ID),
TII(nullptr) { }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -51,8 +51,8 @@ public:
char R600ExpandSpecialInstrsPass::ID = 0;
-FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
- return new R600ExpandSpecialInstrsPass(TM);
+FunctionPass *llvm::createR600ExpandSpecialInstrsPass() {
+ return new R600ExpandSpecialInstrsPass();
}
void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI,
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
index 5813786..37787b3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -8,7 +8,43 @@
//==-----------------------------------------------------------------------===//
#include "R600FrameLowering.h"
+#include "AMDGPUSubtarget.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
R600FrameLowering::~R600FrameLowering() = default;
+
+/// \returns The offset of frame index \p FI from \p FrameReg, in units of
+/// getStackWidth(MF) * 4 bytes.
+int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ unsigned &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const R600RegisterInfo *RI
+ = MF.getSubtarget<R600Subtarget>().getRegisterInfo();
+
+ // Fill in FrameReg output argument.
+ FrameReg = RI->getFrameRegister(MF);
+
+ // Start the offset at 2 so we don't overwrite work group information.
+ // FIXME: We should only do this when the shader actually uses this
+ // information.
+ unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
+ int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
+
+ for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
+ OffsetBytes += MFI.getObjectSize(i);
+ // Each register holds 4 bytes, so we must always align the offset to at
+ // least 4 bytes, so that 2 frame objects won't share the same register.
+ OffsetBytes = alignTo(OffsetBytes, 4);
+ }
+
+ if (FI != -1)
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
+
+ return OffsetBytes / (getStackWidth(MF) * 4);
+}
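A worked example of the computation above under assumed inputs (stack width 1, frame objects of 4 and 8 bytes with natural alignment, object indices starting at 0):

//   OffsetBytes = 2 * (1 * 4) = 8        reserved work-group information
//   object 0:  alignTo(8, 4) = 8, +4 -> 12, alignTo(12, 4) = 12
//   FI == 1:   alignTo(12, 8) = 16
//   result:    16 / (1 * 4) = 4, the returned offset in stack-slot units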
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
index 874435f..142f709 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -25,6 +25,8 @@ public:
MachineBasicBlock &MBB) const override {}
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override {}
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 77fee435..69a63b6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -221,6 +221,15 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBE, VT, Expand);
}
+ // LLVM will expand these to atomic_cmp_swap(0)
+ // and atomic_swap, respectively.
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
+ // We need to custom lower some of the intrinsics
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
setSchedulingPreference(Sched::Source);
setTargetDAGCombine(ISD::FP_ROUND);
@@ -266,7 +275,7 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
- NewMI.addOperand(MI.getOperand(i));
+ NewMI.add(MI.getOperand(i));
}
} else {
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
@@ -339,34 +348,34 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
.addImm(isEOP(I)); // Set End of program bit
break;
case AMDGPU::RAT_STORE_TYPED_eg:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
.addImm(isEOP(I)); // Set End of program bit
break;
case AMDGPU::BRANCH:
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
- .addOperand(MI.getOperand(0));
+ .add(MI.getOperand(0));
break;
case AMDGPU::BRANCH_COND_f32: {
MachineInstr *NewMI =
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
AMDGPU::PREDICATE_BIT)
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(AMDGPU::PRED_SETNE)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
@@ -375,12 +384,12 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineInstr *NewMI =
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
AMDGPU::PREDICATE_BIT)
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(AMDGPU::PRED_SETNE_INT)
.addImm(0); // Flags
TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
break;
}
@@ -408,13 +417,13 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
- .addOperand(MI.getOperand(3))
- .addOperand(MI.getOperand(4))
- .addOperand(MI.getOperand(5))
- .addOperand(MI.getOperand(6))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5))
+ .add(MI.getOperand(6))
.addImm(CfInst)
.addImm(EOP);
break;
@@ -490,8 +499,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
- switch(IntrinsicID) {
- default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ switch (IntrinsicID) {
case AMDGPUIntrinsic::r600_tex:
case AMDGPUIntrinsic::r600_texc: {
unsigned TextureOp;
@@ -552,7 +560,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
}
case Intrinsic::r600_implicitarg_ptr: {
- MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
+ MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
return DAG.getConstant(ByteOffset, DL, PtrVT);
}
@@ -576,29 +584,31 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T1_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_X, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Y, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
- return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
- AMDGPU::T0_Z, VT);
+ return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass,
+ AMDGPU::T0_Z, VT);
case Intrinsic::r600_recipsqrt_ieee:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
case Intrinsic::r600_recipsqrt_clamped:
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+ default:
+ return Op;
}
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
@@ -702,12 +712,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
const DataLayout &DL = DAG.getDataLayout();
const GlobalValue *GV = GSD->getGlobal();
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
@@ -864,7 +874,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
unsigned DwordOffset) const {
unsigned ByteOffset = DwordOffset * 4;
PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUAS::CONSTANT_BUFFER_0);
+ AMDGPUASI.CONSTANT_BUFFER_0);
// We shouldn't be using an offset wider than 16-bits for implicit parameters.
assert(isInt<16>(ByteOffset));
@@ -911,7 +921,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
if (VT == MVT::f32) {
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
- SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+ SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
if (MinMax)
return MinMax;
}
@@ -1102,7 +1112,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
//TODO: Who creates the i8 stores?
assert(Store->isTruncatingStore()
|| Store->getValue().getValueType() == MVT::i8);
- assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+ assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);
SDValue Mask;
if (Store->getMemoryVT() == MVT::i8) {
@@ -1110,7 +1120,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
Mask = DAG.getConstant(0xff, DL, MVT::i32);
} else if (Store->getMemoryVT() == MVT::i16) {
assert(Store->getAlignment() >= 2);
- Mask = DAG.getConstant(0xffff, DL, MVT::i32);;
+ Mask = DAG.getConstant(0xffff, DL, MVT::i32);
} else {
llvm_unreachable("Unsupported private trunc store");
}
@@ -1200,9 +1210,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
// Neither LOCAL nor PRIVATE can do vectors at the moment
- if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+ if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
VT.isVector()) {
- if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) {
+ if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
+ StoreNode->isTruncatingStore()) {
// Add an extra level of chain to isolate this vector
SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
// TODO: can the chain be replaced without creating a new store?
@@ -1225,7 +1236,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
DAG.getConstant(2, DL, PtrVT));
- if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of combiner to avoid
// artificial dependencies introduced by RMW
if (StoreNode->isTruncatingStore()) {
@@ -1278,7 +1289,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
// GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
- if (AS != AMDGPUAS::PRIVATE_ADDRESS)
+ if (AS != AMDGPUASI.PRIVATE_ADDRESS)
return SDValue();
if (MemVT.bitsLT(MVT::i32))
@@ -1297,39 +1308,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// return 512 + (kc_bank << 12)
static int
-ConstantAddressBlock(unsigned AddressSpace) {
+ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) {
switch (AddressSpace) {
- case AMDGPUAS::CONSTANT_BUFFER_0:
+ case AMDGPUASI.CONSTANT_BUFFER_0:
return 512;
- case AMDGPUAS::CONSTANT_BUFFER_1:
+ case AMDGPUASI.CONSTANT_BUFFER_1:
return 512 + 4096;
- case AMDGPUAS::CONSTANT_BUFFER_2:
+ case AMDGPUASI.CONSTANT_BUFFER_2:
return 512 + 4096 * 2;
- case AMDGPUAS::CONSTANT_BUFFER_3:
+ case AMDGPUASI.CONSTANT_BUFFER_3:
return 512 + 4096 * 3;
- case AMDGPUAS::CONSTANT_BUFFER_4:
+ case AMDGPUASI.CONSTANT_BUFFER_4:
return 512 + 4096 * 4;
- case AMDGPUAS::CONSTANT_BUFFER_5:
+ case AMDGPUASI.CONSTANT_BUFFER_5:
return 512 + 4096 * 5;
- case AMDGPUAS::CONSTANT_BUFFER_6:
+ case AMDGPUASI.CONSTANT_BUFFER_6:
return 512 + 4096 * 6;
- case AMDGPUAS::CONSTANT_BUFFER_7:
+ case AMDGPUASI.CONSTANT_BUFFER_7:
return 512 + 4096 * 7;
- case AMDGPUAS::CONSTANT_BUFFER_8:
+ case AMDGPUASI.CONSTANT_BUFFER_8:
return 512 + 4096 * 8;
- case AMDGPUAS::CONSTANT_BUFFER_9:
+ case AMDGPUASI.CONSTANT_BUFFER_9:
return 512 + 4096 * 9;
- case AMDGPUAS::CONSTANT_BUFFER_10:
+ case AMDGPUASI.CONSTANT_BUFFER_10:
return 512 + 4096 * 10;
- case AMDGPUAS::CONSTANT_BUFFER_11:
+ case AMDGPUASI.CONSTANT_BUFFER_11:
return 512 + 4096 * 11;
- case AMDGPUAS::CONSTANT_BUFFER_12:
+ case AMDGPUASI.CONSTANT_BUFFER_12:
return 512 + 4096 * 12;
- case AMDGPUAS::CONSTANT_BUFFER_13:
+ case AMDGPUASI.CONSTANT_BUFFER_13:
return 512 + 4096 * 13;
- case AMDGPUAS::CONSTANT_BUFFER_14:
+ case AMDGPUASI.CONSTANT_BUFFER_14:
return 512 + 4096 * 14;
- case AMDGPUAS::CONSTANT_BUFFER_15:
+ case AMDGPUASI.CONSTANT_BUFFER_15:
return 512 + 4096 * 15;
default:
return -1;
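An equivalent closed form of the switch above (a sketch; it assumes the CONSTANT_BUFFER_0..15 members hold consecutive values, which the switch itself does not require):

static int constantAddressBlockClosedForm(unsigned AS, AMDGPUAS ASI) {
  if (AS < ASI.CONSTANT_BUFFER_0 || AS > ASI.CONSTANT_BUFFER_15)
    return -1;
  unsigned KCBank = AS - ASI.CONSTANT_BUFFER_0;
  return 512 + (KCBank << 12);   // 512 + 4096 * bank
}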
@@ -1397,7 +1408,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
EVT MemVT = LoadNode->getMemoryVT();
ISD::LoadExtType ExtType = LoadNode->getExtensionType();
- if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+ if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
return lowerPrivateExtLoad(Op, DAG);
}
@@ -1407,13 +1418,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = LoadNode->getChain();
SDValue Ptr = LoadNode->getBasePtr();
- if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+ if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+ LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
VT.isVector()) {
return scalarizeVectorLoad(LoadNode, DAG);
}
- int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+ int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace(),
+ AMDGPUASI);
if (ConstantBlock > -1 &&
((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
(LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
@@ -1445,7 +1457,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
DAG.getConstant(4, DL, MVT::i32)),
DAG.getConstant(LoadNode->getAddressSpace() -
- AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
+ AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
);
}
@@ -1481,7 +1493,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(MergedValues, DL);
}
- if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+ if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
return SDValue();
}
@@ -1535,7 +1547,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
SmallVector<ISD::InputArg, 8> LocalIns;
if (AMDGPU::isShader(CallConv)) {
- AnalyzeFormalArguments(CCInfo, Ins);
+ CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
} else {
analyzeFormalArgumentsCompute(CCInfo, Ins);
}
@@ -1558,7 +1570,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
}
PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
- AMDGPUAS::CONSTANT_BUFFER_0);
+ AMDGPUASI.CONSTANT_BUFFER_0);
// i64 isn't a legal type, so the register type used ends up as i32, which
// isn't expected here. It attempts to create this sextload, but it ends up
@@ -1606,6 +1618,15 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
return VT.changeVectorElementTypeToInteger();
}
+bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const {
+ // Local and Private addresses do not handle vectors; limit stores to i32.
+ if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) {
+ return (MemVT.getSizeInBits() <= 32);
+ }
+ return true;
+}
+
bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 9700ce1..2a77469 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -44,6 +44,9 @@ public:
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
EVT VT) const override;
+ bool canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const override;
+
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
bool *IsFast) const override;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index e88bd07..c5da5e4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -14,14 +14,32 @@
#include "R600InstrInfo.h"
#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
#include "R600Defines.h"
-#include "R600MachineFunctionInfo.h"
+#include "R600FrameLowering.h"
#include "R600RegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -191,7 +209,7 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) &&
usesVertexCache(MI.getOpcode())) ||
- usesTextureCache(MI.getOpcode());
+ usesTextureCache(MI.getOpcode());
}
bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
@@ -321,7 +339,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
unsigned &ConstCount) const {
ConstCount = 0;
const std::pair<int, unsigned> DummyPair(-1, 0);
- std::vector<std::pair<int, unsigned> > Result;
+ std::vector<std::pair<int, unsigned>> Result;
unsigned i = 0;
for (const auto &Src : getSrcs(MI)) {
++i;
@@ -348,8 +366,8 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
return Result;
}
-static std::vector<std::pair<int, unsigned> >
-Swizzle(std::vector<std::pair<int, unsigned> > Src,
+static std::vector<std::pair<int, unsigned>>
+Swizzle(std::vector<std::pair<int, unsigned>> Src,
R600InstrInfo::BankSwizzle Swz) {
if (Src[0] == Src[1])
Src[1].first = -1;
@@ -404,14 +422,14 @@ static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
/// in the same Instruction Group while meeting read port limitations given a
/// Swz swizzle sequence.
unsigned R600InstrInfo::isLegalUpTo(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs,
const std::vector<R600InstrInfo::BankSwizzle> &Swz,
- const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ const std::vector<std::pair<int, unsigned>> &TransSrcs,
R600InstrInfo::BankSwizzle TransSwz) const {
int Vector[4][3];
memset(Vector, -1, sizeof(Vector));
for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
- const std::vector<std::pair<int, unsigned> > &Srcs =
+ const std::vector<std::pair<int, unsigned>> &Srcs =
Swizzle(IGSrcs[i], Swz[i]);
for (unsigned j = 0; j < 3; j++) {
const std::pair<int, unsigned> &Src = Srcs[j];
@@ -473,9 +491,9 @@ NextPossibleSolution(
/// Enumerate all possible Swizzle sequence to find one that can meet all
/// read port requirements.
bool R600InstrInfo::FindSwizzleForVectorSlot(
- const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+ const std::vector<std::vector<std::pair<int, unsigned>>> &IGSrcs,
std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
- const std::vector<std::pair<int, unsigned> > &TransSrcs,
+ const std::vector<std::pair<int, unsigned>> &TransSrcs,
R600InstrInfo::BankSwizzle TransSwz) const {
unsigned ValidUpTo = 0;
do {
@@ -490,7 +508,7 @@ bool R600InstrInfo::FindSwizzleForVectorSlot(
/// a const, and can't read a gpr at cycle 1 if they read 2 const.
static bool
isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
- const std::vector<std::pair<int, unsigned> > &TransOps,
+ const std::vector<std::pair<int, unsigned>> &TransOps,
unsigned ConstCount) {
// TransALU can't read 3 constants
if (ConstCount > 2)
@@ -516,7 +534,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
const {
// TODO: Support shared src0 - src1 operand
- std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
+ std::vector<std::vector<std::pair<int, unsigned>>> IGSrcs;
ValidSwizzle.clear();
unsigned ConstCount;
BankSwizzle TransBS = ALU_VEC_012_SCL_210;
@@ -527,7 +545,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
IG[i]->getOperand(Op).getImm());
}
- std::vector<std::pair<int, unsigned> > TransOps;
+ std::vector<std::pair<int, unsigned>> TransOps;
if (!isLastAluTrans)
return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
@@ -556,7 +574,6 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
return false;
}
-
bool
R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
const {
@@ -780,7 +797,7 @@ unsigned R600InstrInfo::insertBranch(MachineBasicBlock &MBB,
unsigned R600InstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
- assert(!BytesRemoved && "code size not handled");
+ assert(!BytesRemoved && "code size not handled");
// Note : we leave PRED* instructions there.
// They may be needed when predicating instructions.
@@ -852,7 +869,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
}
}
-bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
+bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
// XXX: KILL* instructions can be predicated, but they must be the last
// instruction in a clause, so this means any instructions after them cannot
// be predicated. Until we have proper support for instruction clauses in the
@@ -863,7 +880,7 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
} else if (MI.getOpcode() == AMDGPU::CF_ALU) {
// If the clause start in the middle of MBB then the MBB has more
// than a single clause, unable to predicate several clauses.
- if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI))
+ if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI))
return false;
// TODO: We don't support KC merging atm
return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0;
@@ -874,10 +891,9 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
}
}
-
bool
R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
- unsigned NumCyles,
+ unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const {
return true;
@@ -896,7 +912,7 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
bool
R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
- unsigned NumCyles,
+ unsigned NumCycles,
BranchProbability Probability)
const {
return true;
@@ -908,7 +924,6 @@ R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
return false;
}
-
bool
R600InstrInfo::reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
MachineOperand &MO = Cond[1];
@@ -948,7 +963,6 @@ bool R600InstrInfo::DefinesPredicate(MachineInstr &MI,
return isPredicateSetter(MI.getOpcode());
}
-
bool R600InstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
int PIdx = MI.findFirstPredOperandIdx();
@@ -1067,7 +1081,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
-void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
const MachineFunction &MF) const {
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600FrameLowering *TFL = ST.getFrameLowering();
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index a280052..3b82800 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -177,12 +177,12 @@ public:
bool isPredicated(const MachineInstr &MI) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
- bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
BranchProbability Probability) const override;
- bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+ bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const override;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
index 9210e66..bac557b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -316,7 +316,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern>
class LoadParamFrag <PatFrag load_type> : PatFrag <
(ops node:$ptr), (load_type node:$ptr),
[{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
- (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
+ (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }]
>;
def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
@@ -326,8 +326,8 @@ def vtx_id3_load : LoadParamFrag<load>;
class LoadVtxId1 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+ (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
!isa<GlobalValue>(GetUnderlyingObject(
LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
}]>;
@@ -339,7 +339,7 @@ def vtx_id1_load : LoadVtxId1 <load>;
class LoadVtxId2 <PatFrag load> : PatFrag <
(ops node:$ptr), (load node:$ptr), [{
const MemSDNode *LD = cast<MemSDNode>(N);
- return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
isa<GlobalValue>(GetUnderlyingObject(
LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
}]>;
@@ -1013,7 +1013,7 @@ multiclass CUBE_Common <bits<11> inst> {
(outs R600_Reg128:$dst),
(ins R600_Reg128:$src0),
"CUBE $dst $src0",
- [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
+ [(set v4f32:$dst, (int_r600_cube v4f32:$src0))],
VecALU
> {
let isPseudo = 1;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
index a5310e9..4c9e1e8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Intrinsics.td
@@ -61,7 +61,7 @@ def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;
def int_r600_dot4 : Intrinsic<[llvm_float_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
+ [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;
} // End TargetPrefix = "r600", isTarget = 1
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
index db18e5b..a7e540f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -13,16 +13,16 @@
//===----------------------------------------------------------------------===//
#include "R600MachineScheduler.h"
-#include "R600InstrInfo.h"
#include "AMDGPUSubtarget.h"
+#include "R600InstrInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Pass.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index d90008a..502dd3b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -124,7 +124,7 @@ private:
public:
static char ID;
- R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+ R600VectorRegMerger() : MachineFunctionPass(ID),
TII(nullptr) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -396,6 +396,6 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
return false;
}
-llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
- return new R600VectorRegMerger(tm);
+llvm::FunctionPass *llvm::createR600VectorRegMerger() {
+ return new R600VectorRegMerger();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index 5b6dd1e..1cb4093 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -14,7 +14,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
@@ -24,6 +23,7 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -36,7 +36,7 @@ class R600Packetizer : public MachineFunctionPass {
public:
static char ID;
- R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
+ R600Packetizer() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -404,6 +404,6 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
} // end anonymous namespace
-llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
- return new R600Packetizer(tm);
+llvm::FunctionPass *llvm::createR600Packetizer() {
+ return new R600Packetizer();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
index dfdc602..7501fac 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -56,6 +56,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+// Dummy to not crash RegisterClassInfo.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+
+const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *) const {
+ return &CalleeSavedReg;
+}
+
+unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ return AMDGPU::NoRegister;
+}
+
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
index 9dfb310..f0d9644 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -27,6 +27,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
R600RegisterInfo();
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;
diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
index cc667d9..3c1e852 100644
--- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
R600_Addr,
R600_KC0, R600_KC1,
ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
- ALU_CONST, ALU_PARAM, OQAP
+ ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR
)>;
def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index d70f52e..8cb35c5 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -34,15 +35,6 @@ namespace {
typedef std::pair<BasicBlock *, Value *> StackEntry;
typedef SmallVector<StackEntry, 16> StackVector;
-// Intrinsic names the control flow is annotated with
-static const char *const IfIntrinsic = "llvm.amdgcn.if";
-static const char *const ElseIntrinsic = "llvm.amdgcn.else";
-static const char *const BreakIntrinsic = "llvm.amdgcn.break";
-static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
-static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
-static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
-static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
-
class SIAnnotateControlFlow : public FunctionPass {
DivergenceAnalysis *DA;
@@ -56,13 +48,13 @@ class SIAnnotateControlFlow : public FunctionPass {
UndefValue *BoolUndef;
Constant *Int64Zero;
- Constant *If;
- Constant *Else;
- Constant *Break;
- Constant *IfBreak;
- Constant *ElseBreak;
- Constant *Loop;
- Constant *EndCf;
+ Function *If;
+ Function *Else;
+ Function *Break;
+ Function *IfBreak;
+ Function *ElseBreak;
+ Function *Loop;
+ Function *EndCf;
DominatorTree *DT;
StackVector Stack;
@@ -85,8 +77,10 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
- Value *handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L, BranchInst *Term);
+ Value *
+ handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,
+ BranchInst *Term,
+ SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions);
void handleLoop(BranchInst *Term);
@@ -118,6 +112,7 @@ public:
INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
@@ -131,37 +126,20 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
Void = Type::getVoidTy(Context);
Boolean = Type::getInt1Ty(Context);
Int64 = Type::getInt64Ty(Context);
- ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
+ ReturnStruct = StructType::get(Boolean, Int64);
BoolTrue = ConstantInt::getTrue(Context);
BoolFalse = ConstantInt::getFalse(Context);
BoolUndef = UndefValue::get(Boolean);
Int64Zero = ConstantInt::get(Int64, 0);
- If = M.getOrInsertFunction(
- IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
-
- Else = M.getOrInsertFunction(
- ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
-
- Break = M.getOrInsertFunction(
- BreakIntrinsic, Int64, Int64, (Type *)nullptr);
- cast<Function>(Break)->setDoesNotAccessMemory();
-
- IfBreak = M.getOrInsertFunction(
- IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
- cast<Function>(IfBreak)->setDoesNotAccessMemory();;
-
- ElseBreak = M.getOrInsertFunction(
- ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
- cast<Function>(ElseBreak)->setDoesNotAccessMemory();
-
- Loop = M.getOrInsertFunction(
- LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
-
- EndCf = M.getOrInsertFunction(
- EndCfIntrinsic, Void, Int64, (Type *)nullptr);
-
+ If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
+ Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
+ Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
+ IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
+ ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
+ Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
+ EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
return false;
}
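The rewritten doInitialization resolves the control-flow annotation intrinsics through Intrinsic::getDeclaration instead of by-name getOrInsertFunction, so a signature mismatch can no longer silently create a bogus declaration. A hedged sketch of the pattern against an LLVM tree of this vintage (the wrapper function is illustrative, not part of the patch):

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"

  using namespace llvm;

  // Look up llvm.amdgcn.if from the intrinsic table and call it before Term,
  // the same shape as openIf() above.
  static Value *emitIf(Module &M, Value *Cond, Instruction *Term) {
    Function *IfFn = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
    return CallInst::Create(IfFn, Cond, "", Term);
  }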
@@ -208,15 +186,16 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
// \brief Erase "Phi" if it is not used any more
void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
- if (!Phi->hasNUsesOrMore(1))
- Phi->eraseFromParent();
+ if (llvm::RecursivelyDeleteDeadPHINode(Phi)) {
+ DEBUG(dbgs() << "Erased unused condition phi\n");
+ }
}
/// \brief Open a new "If" block
void SIAnnotateControlFlow::openIf(BranchInst *Term) {
- if (isUniform(Term)) {
+ if (isUniform(Term))
return;
- }
+
Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -233,8 +212,9 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
}
/// \brief Recursively handle the condition leading to a loop
-Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L, BranchInst *Term) {
+Value *SIAnnotateControlFlow::handleLoopCondition(
+ Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
+ SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
@@ -245,7 +225,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
BasicBlock *Parent = Phi->getParent();
- PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+ PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
Value *Ret = NewPhi;
// Handle all non-constant incoming values first
@@ -258,14 +238,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
}
Phi->setIncomingValue(i, BoolFalse);
- Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
+ Value *PhiArg = handleLoopCondition(Incoming, Broken, L,
+ Term, LoopPhiConditions);
NewPhi->addIncoming(PhiArg, From);
}
BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-
Value *Incoming = Phi->getIncomingValue(i);
if (Incoming != BoolTrue)
continue;
@@ -295,14 +275,17 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
continue;
}
}
+
TerminatorInst *Insert = From->getTerminator();
Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
NewPhi->setIncomingValue(i, PhiArg);
}
- eraseIfUnused(Phi);
+
+ LoopPhiConditions.push_back(WeakTrackingVH(Phi));
return Ret;
+ }
- } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+ if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
BasicBlock *Parent = Inst->getParent();
Instruction *Insert;
if (L->contains(Inst)) {
@@ -310,46 +293,55 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
} else {
Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
}
+
Value *Args[] = { Cond, Broken };
return CallInst::Create(IfBreak, Args, "", Insert);
+ }
- // Insert IfBreak before TERM for constant COND.
- } else if (isa<ConstantInt>(Cond)) {
- Value *Args[] = { Cond, Broken };
- return CallInst::Create(IfBreak, Args, "", Term);
+ // For constant COND, insert IfBreak before TERM if COND is true, else at the loop header's terminator.
+ if (isa<Constant>(Cond)) {
+ Instruction *Insert = Cond == BoolTrue ?
+ Term : L->getHeader()->getTerminator();
- } else {
- llvm_unreachable("Unhandled loop condition!");
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Insert);
}
- return nullptr;
+
+ llvm_unreachable("Unhandled loop condition!");
}
/// \brief Handle a back edge (loop)
void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
- if (isUniform(Term)) {
+ if (isUniform(Term))
return;
- }
BasicBlock *BB = Term->getParent();
llvm::Loop *L = LI->getLoopFor(BB);
if (!L)
return;
+
BasicBlock *Target = Term->getSuccessor(1);
- PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+ PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
+ SmallVector<WeakTrackingVH, 8> LoopPhiConditions;
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
- Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
+ Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
- for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
- PI != PE; ++PI) {
+ for (BasicBlock *Pred : predecessors(Target))
+ Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
+
+ Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
- Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+ for (WeakTrackingVH Val : reverse(LoopPhiConditions)) {
+ if (PHINode *Cond = cast_or_null<PHINode>(Val))
+ eraseIfUnused(Cond);
}
- Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
push(Term->getSuccessor(0), Arg);
-}/// \brief Close the last opened control flow
+}
+
+/// \brief Close the last opened control flow
void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
llvm::Loop *L = LI->getLoopFor(BB);
@@ -359,59 +351,62 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
// We can't insert an EndCF call into a loop header, because it will
// get executed on every iteration of the loop, when it should be
// executed only once before the loop.
- SmallVector <BasicBlock*, 8> Latches;
+ SmallVector <BasicBlock *, 8> Latches;
L->getLoopLatches(Latches);
- std::vector<BasicBlock*> Preds;
- for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
- if (!is_contained(Latches, *PI))
- Preds.push_back(*PI);
+ SmallVector<BasicBlock *, 2> Preds;
+ for (BasicBlock *Pred : predecessors(BB)) {
+ if (!is_contained(Latches, Pred))
+ Preds.push_back(Pred);
}
+
BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
}
Value *Exec = popSaved();
- if (!isa<UndefValue>(Exec))
- CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
+ Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
+ if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt))
+ CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
}
/// \brief Annotate the control flow with intrinsics so the backend can
/// recognize if/then/else and loops.
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
-
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
DA = &getAnalysis<DivergenceAnalysis>();
for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
E = df_end(&F.getEntryBlock()); I != E; ++I) {
-
- BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+ BasicBlock *BB = *I;
+ BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
if (!Term || Term->isUnconditional()) {
- if (isTopOfStack(*I))
- closeControlFlow(*I);
+ if (isTopOfStack(BB))
+ closeControlFlow(BB);
continue;
}
if (I.nodeVisited(Term->getSuccessor(1))) {
- if (isTopOfStack(*I))
- closeControlFlow(*I);
+ if (isTopOfStack(BB))
+ closeControlFlow(BB);
handleLoop(Term);
continue;
}
- if (isTopOfStack(*I)) {
+ if (isTopOfStack(BB)) {
PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
- if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+ if (Phi && Phi->getParent() == BB && isElse(Phi)) {
insertElse(Term);
eraseIfUnused(Phi);
continue;
}
- closeControlFlow(*I);
+
+ closeControlFlow(BB);
}
+
openIf(Term);
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
index 62ebef8..b5c439b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp
@@ -19,8 +19,8 @@
//
//===----------------------------------------------------------------------===//
-#include "SIInstrInfo.h"
#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
index ff4e321..3915c0e 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -36,6 +36,7 @@ enum : uint64_t {
// TODO: Should this be split into VOP3 a and b?
VOP3 = 1 << 10,
+ VOP3P = 1 << 12,
VINTRP = 1 << 13,
SDWA = 1 << 14,
@@ -65,8 +66,8 @@ enum : uint64_t {
SOPK_ZEXT = UINT64_C(1) << 38,
SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
- VOPAsmPrefer32Bit = UINT64_C(1) << 41
-
+ VOPAsmPrefer32Bit = UINT64_C(1) << 41,
+ HasFPClamp = UINT64_C(1) << 42
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -102,12 +103,14 @@ namespace AMDGPU {
OPERAND_REG_INLINE_C_FP16,
OPERAND_REG_INLINE_C_FP32,
OPERAND_REG_INLINE_C_FP64,
+ OPERAND_REG_INLINE_C_V2FP16,
+ OPERAND_REG_INLINE_C_V2INT16,
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
- OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
+ OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16,
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -115,6 +118,10 @@ namespace AMDGPU {
// Operand for source modifiers for VOP instructions
OPERAND_INPUT_MODS,
+ // Operand for SDWA instructions
+ OPERAND_SDWA_SRC,
+ OPERAND_SDWA_VOPC_DST,
+
/// Operand with 32-bit immediate that uses the constant bus.
OPERAND_KIMM32,
OPERAND_KIMM16
@@ -125,9 +132,12 @@ namespace AMDGPU {
// NEG and SEXT share same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
enum {
- NEG = 1 << 0, // Floating-point negate modifier
- ABS = 1 << 1, // Floating-point absolute modifier
- SEXT = 1 << 0 // Integer sign-extend modifier
+ NEG = 1 << 0, // Floating-point negate modifier
+ ABS = 1 << 1, // Floating-point absolute modifier
+ SEXT = 1 << 0, // Integer sign-extend modifier
+ NEG_HI = ABS, // Floating-point negate high packed component modifier.
+ OP_SEL_0 = 1 << 2,
+ OP_SEL_1 = 1 << 3
};
}
@@ -154,7 +164,8 @@ namespace AMDGPUAsmVariants {
DEFAULT = 0,
VOP3 = 1,
SDWA = 2,
- DPP = 3
+ SDWA9 = 3,
+ DPP = 4
};
}
@@ -242,6 +253,7 @@ enum Id { // HwRegCode, (6) [5:0]
ID_LDS_ALLOC = 6,
ID_IB_STS = 7,
ID_SYMBOLIC_LAST_ = 8,
+ ID_MEM_BASES = 15,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -251,18 +263,64 @@ enum Offset { // Offset, (5) [10:6]
OFFSET_DEFAULT_ = 0,
OFFSET_SHIFT_ = 6,
OFFSET_WIDTH_ = 5,
- OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_)
+ OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
+
+ OFFSET_SRC_SHARED_BASE = 16,
+ OFFSET_SRC_PRIVATE_BASE = 0
};
enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
WIDTH_M1_DEFAULT_ = 31,
WIDTH_M1_SHIFT_ = 11,
WIDTH_M1_WIDTH_ = 5,
- WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_)
+ WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
+
+ WIDTH_M1_SRC_SHARED_BASE = 15,
+ WIDTH_M1_SRC_PRIVATE_BASE = 15
};
} // namespace Hwreg
+namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
+
+enum Id { // id of symbolic names
+ ID_QUAD_PERM = 0,
+ ID_BITMASK_PERM,
+ ID_SWAP,
+ ID_REVERSE,
+ ID_BROADCAST
+};
+
+enum EncBits {
+
+ // swizzle mode encodings
+
+ QUAD_PERM_ENC = 0x8000,
+ QUAD_PERM_ENC_MASK = 0xFF00,
+
+ BITMASK_PERM_ENC = 0x0000,
+ BITMASK_PERM_ENC_MASK = 0x8000,
+
+ // QUAD_PERM encodings
+
+ LANE_MASK = 0x3,
+ LANE_MAX = LANE_MASK,
+ LANE_SHIFT = 2,
+ LANE_NUM = 4,
+
+ // BITMASK_PERM encodings
+
+ BITMASK_MASK = 0x1F,
+ BITMASK_MAX = BITMASK_MASK,
+ BITMASK_WIDTH = 5,
+
+ BITMASK_AND_SHIFT = 0,
+ BITMASK_OR_SHIFT = 5,
+ BITMASK_XOR_SHIFT = 10
+};
+
+} // namespace Swizzle
+
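The new Swizzle constants describe the ds_swizzle_b32 immediate. In QUAD_PERM mode the 0x8000 tag occupies the high bits and four 2-bit lane selectors pack into the low byte; assuming lane i sits at bits [2*i+1:2*i], which LANE_SHIFT and LANE_NUM suggest, the packing looks like this (standalone sketch, helper name mine):

  #include <cstdint>

  // Pack a QUAD_PERM swizzle: mode tag 0x8000 plus four 2-bit lane selects.
  // E.g. lanes {3,2,1,0} give 0x801B, reversing each group of four lanes.
  uint16_t encodeQuadPerm(unsigned L0, unsigned L1, unsigned L2, unsigned L3) {
    const uint16_t QUAD_PERM_ENC = 0x8000;
    return QUAD_PERM_ENC | (L0 & 0x3) | ((L1 & 0x3) << 2) |
           ((L2 & 0x3) << 4) | ((L3 & 0x3) << 6);
  }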
namespace SDWA {
enum SdwaSel {
@@ -281,6 +339,18 @@ enum DstUnused {
UNUSED_PRESERVE = 2,
};
+enum SDWA9EncValues {
+ SRC_SGPR_MASK = 0x100,
+ SRC_VGPR_MASK = 0xFF,
+ VOPC_DST_VCC_MASK = 0x80,
+ VOPC_DST_SGPR_MASK = 0x7F,
+
+ SRC_VGPR_MIN = 0,
+ SRC_VGPR_MAX = 255,
+ SRC_SGPR_MIN = 256,
+ SRC_SGPR_MAX = 357,
+};
+
} // namespace SDWA
} // namespace AMDGPU
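One plausible reading of the SDWA9EncValues masks: bit 8 (SRC_SGPR_MASK) flags a scalar source and the low byte (SRC_VGPR_MASK) carries the register number, matching the 0-255 VGPR and 256-357 SGPR operand ranges. A standalone sketch under that assumption (the real decoder lives elsewhere in the backend; the struct and function are mine):

  #include <cstdint>

  struct Sdwa9Src { bool IsSGPR; unsigned RegNo; };

  // Split a 9-bit SDWA9 source encoding into register file and number.
  Sdwa9Src decodeSdwa9Src(uint16_t Enc) {
    const uint16_t SRC_SGPR_MASK = 0x100;
    const uint16_t SRC_VGPR_MASK = 0xFF;
    return {(Enc & SRC_SGPR_MASK) != 0, unsigned(Enc & SRC_VGPR_MASK)};
  }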
@@ -289,6 +359,7 @@ enum DstUnused {
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228
+#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428
#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
@@ -300,6 +371,9 @@ enum DstUnused {
#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1)
#define G_00B84C_USER_SGPR(x) (((x) >> 1) & 0x1F)
#define C_00B84C_USER_SGPR 0xFFFFFFC1
+#define S_00B84C_TRAP_HANDLER(x) (((x) & 0x1) << 6)
+#define G_00B84C_TRAP_HANDLER(x) (((x) >> 6) & 0x1)
+#define C_00B84C_TRAP_HANDLER 0xFFFFFFBF
#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7)
#define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1)
#define C_00B84C_TGID_X_EN 0xFFFFFF7F
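These register macros come in fixed S_/G_/C_ triples: S_ shifts a value into its field, G_ extracts it, and C_ is the AND-mask that clears it. The new TRAP_HANDLER field is bit 6, so its clear mask is ~0x40 = 0xFFFFFFBF; a standalone check of the pattern:

  #include <cassert>
  #include <cstdint>

  #define S_00B84C_TRAP_HANDLER(x) (((x) & 0x1) << 6)
  #define G_00B84C_TRAP_HANDLER(x) (((x) >> 6) & 0x1)
  #define C_00B84C_TRAP_HANDLER 0xFFFFFFBF

  int main() {
    uint32_t Rsrc2 = 0xDEADBEEF;
    Rsrc2 = (Rsrc2 & C_00B84C_TRAP_HANDLER) | S_00B84C_TRAP_HANDLER(1);
    assert(G_00B84C_TRAP_HANDLER(Rsrc2) == 1);
    return 0;
  }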
@@ -387,7 +461,6 @@ enum DstUnused {
#define R_SPILLED_SGPRS 0x4
#define R_SPILLED_VGPRS 0x8
-
} // End namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 6a422e7..0a795c9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -68,6 +68,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -80,6 +81,11 @@ using namespace llvm;
#define DEBUG_TYPE "si-fix-sgpr-copies"
+static cl::opt<bool> EnableM0Merge(
+ "amdgpu-enable-merge-m0",
+ cl::desc("Merge and hoist M0 initializations"),
+ cl::init(false));
+
namespace {
class SIFixSGPRCopies : public MachineFunctionPass {
@@ -107,7 +113,7 @@ public:
INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
"SI Fix SGPR copies", false, false)
@@ -168,6 +174,31 @@ static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);
}
+static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
+ const SIRegisterInfo *TRI,
+ const SIInstrInfo *TII) {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ auto &Src = MI.getOperand(1);
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned SrcReg = Src.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ !TargetRegisterInfo::isVirtualRegister(DstReg))
+ return false;
+
+ for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
+ const auto *UseMI = MO.getParent();
+ if (UseMI == &MI)
+ continue;
+ if (MO.isDef() || UseMI->getParent() != MI.getParent() ||
+ UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END ||
+ !TII->isOperandLegal(*UseMI, UseMI->getOperandNo(&MO), &Src))
+ return false;
+ }
+ // Change VGPR to SGPR destination.
+ MRI.setRegClass(DstReg, TRI->getEquivalentSGPRClass(MRI.getRegClass(DstReg)));
+ return true;
+}
+
// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE.
//
// SGPRx = ...
@@ -198,12 +229,19 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
if (!CopyUse.isCopy())
return false;
+ // It is illegal to have vreg inputs to a physreg defining reg_sequence.
+ if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
+ return false;
+
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
return false;
+ if (tryChangeVGPRtoSGPRinCopy(CopyUse, TRI, TII))
+ return true;
+
// TODO: Could have multiple extracts?
unsigned SubReg = CopyUse.getOperand(1).getSubReg();
if (SubReg != AMDGPU::NoSubRegister)
@@ -234,8 +272,9 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
- BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg)
- .addOperand(MI.getOperand(I));
+ BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
+ TmpReg)
+ .add(MI.getOperand(I));
MI.getOperand(I).setReg(TmpReg);
}
@@ -267,8 +306,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI,
Visited.insert(Reg);
- MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
- assert(DefInstr);
+ MachineInstr *DefInstr = MRI.getVRegDef(Reg);
switch (DefInstr->getOpcode()) {
default:
break;
@@ -326,6 +364,186 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
return true;
}
+template <class UnaryPredicate>
+bool searchPredecessors(const MachineBasicBlock *MBB,
+ const MachineBasicBlock *CutOff,
+ UnaryPredicate Predicate) {
+
+ if (MBB == CutOff)
+ return false;
+
+ DenseSet<const MachineBasicBlock*> Visited;
+ SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
+ MBB->pred_end());
+
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = Worklist.pop_back_val();
+
+ if (!Visited.insert(MBB).second)
+ continue;
+ if (MBB == CutOff)
+ continue;
+ if (Predicate(MBB))
+ return true;
+
+ Worklist.append(MBB->pred_begin(), MBB->pred_end());
+ }
+
+ return false;
+}
+
+static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
+ const TargetRegisterInfo *TRI) {
+ return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
+ return hasTerminatorThatModifiesExec(*MBB, *TRI); });
+}
+
+// Checks whether there is a potential execution path from instruction From to
+// instruction To. If CutOff is specified and lies on that path, the portion of
+// the path above it is ignored and the search reports not reachable.
+static bool isReachable(const MachineInstr *From,
+ const MachineInstr *To,
+ const MachineBasicBlock *CutOff,
+ MachineDominatorTree &MDT) {
+ // If either From block dominates To block or instructions are in the same
+ // block and From is higher.
+ if (MDT.dominates(From, To))
+ return true;
+
+ const MachineBasicBlock *MBBFrom = From->getParent();
+ const MachineBasicBlock *MBBTo = To->getParent();
+ if (MBBFrom == MBBTo)
+ return false;
+
+ // Instructions are in different blocks, do predecessor search.
+ // We should almost never get here since we do not usually produce M0 stores
+ // other than -1.
+ return searchPredecessors(MBBTo, CutOff, [MBBFrom]
+ (const MachineBasicBlock *MBB) { return MBB == MBBFrom; });
+}
+
+// Hoist and merge identical SGPR initializations into a common predecessor.
+// This is intended to combine M0 initializations, but can work with any
+// SGPR. A VGPR cannot be processed since we cannot guarantee vector
+// execution.
+static bool hoistAndMergeSGPRInits(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ MachineDominatorTree &MDT) {
+ // List of inits by immediate value.
+ typedef std::map<unsigned, std::list<MachineInstr*>> InitListMap;
+ InitListMap Inits;
+ // List of clobbering instructions.
+ SmallVector<MachineInstr*, 8> Clobbers;
+ bool Changed = false;
+
+ for (auto &MI : MRI.def_instructions(Reg)) {
+ MachineOperand *Imm = nullptr;
+ for (auto &MO: MI.operands()) {
+ if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
+ (!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
+ Imm = nullptr;
+ break;
+ } else if (MO.isImm())
+ Imm = &MO;
+ }
+ if (Imm)
+ Inits[Imm->getImm()].push_front(&MI);
+ else
+ Clobbers.push_back(&MI);
+ }
+
+ for (auto &Init : Inits) {
+ auto &Defs = Init.second;
+
+ for (auto I1 = Defs.begin(), E = Defs.end(); I1 != E; ) {
+ MachineInstr *MI1 = *I1;
+
+ for (auto I2 = std::next(I1); I2 != E; ) {
+ MachineInstr *MI2 = *I2;
+
+ // Check any possible interference
+ auto interferes = [&](MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To) -> bool {
+
+ assert(MDT.dominates(&*To, &*From));
+
+ auto interferes = [&MDT, From, To](MachineInstr* &Clobber) -> bool {
+ const MachineBasicBlock *MBBFrom = From->getParent();
+ const MachineBasicBlock *MBBTo = To->getParent();
+ bool MayClobberFrom = isReachable(Clobber, &*From, MBBTo, MDT);
+ bool MayClobberTo = isReachable(Clobber, &*To, MBBTo, MDT);
+ if (!MayClobberFrom && !MayClobberTo)
+ return false;
+ if ((MayClobberFrom && !MayClobberTo) ||
+ (!MayClobberFrom && MayClobberTo))
+ return true;
+ // Both may clobber. This is not an interference only if both are
+ // dominated by Clobber and belong to the same block, or if Clobber
+ // properly dominates To; since To >> From, Clobber then dominates
+ // both and sits in a common dominator.
+ return !((MBBFrom == MBBTo &&
+ MDT.dominates(Clobber, &*From) &&
+ MDT.dominates(Clobber, &*To)) ||
+ MDT.properlyDominates(Clobber->getParent(), MBBTo));
+ };
+
+ return (any_of(Clobbers, interferes)) ||
+ (any_of(Inits, [&](InitListMap::value_type &C) {
+ return C.first != Init.first && any_of(C.second, interferes);
+ }));
+ };
+
+ if (MDT.dominates(MI1, MI2)) {
+ if (!interferes(MI2, MI1)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI2->getParent()->getNumber()
+ << " " << *MI2);
+ MI2->eraseFromParent();
+ Defs.erase(I2++);
+ Changed = true;
+ continue;
+ }
+ } else if (MDT.dominates(MI2, MI1)) {
+ if (!interferes(MI1, MI2)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+ << " " << *MI1);
+ MI1->eraseFromParent();
+ Defs.erase(I1++);
+ Changed = true;
+ break;
+ }
+ } else {
+ auto *MBB = MDT.findNearestCommonDominator(MI1->getParent(),
+ MI2->getParent());
+ if (!MBB) {
+ ++I2;
+ continue;
+ }
+
+ MachineBasicBlock::iterator I = MBB->getFirstNonPHI();
+ if (!interferes(MI1, I) && !interferes(MI2, I)) {
+ DEBUG(dbgs() << "Erasing from BB#" << MI1->getParent()->getNumber()
+ << " " << *MI1 << "and moving from BB#"
+ << MI2->getParent()->getNumber() << " to BB#"
+ << I->getParent()->getNumber() << " " << *MI2);
+ I->getParent()->splice(I, MI2->getParent(), MI2);
+ MI1->eraseFromParent();
+ Defs.erase(I1++);
+ Changed = true;
+ break;
+ }
+ }
+ ++I2;
+ }
+ ++I1;
+ }
+ }
+
+ if (Changed)
+ MRI.clearKillFlags(Reg);
+
+ return Changed;
+}
+
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -355,7 +573,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
- MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ TII->moveToVALU(MI);
+ break;
+ }
+
+ MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
@@ -367,6 +591,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
TII->moveToVALU(MI);
+ } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+ tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
break;
@@ -382,8 +608,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
- MachineBasicBlock *NCD = MDT->findNearestCommonDominator(MBB0, MBB1);
- if (NCD && !hasTerminatorThatModifiesExec(*NCD, *TRI)) {
+ if (!predsHasDivergentTerminator(MBB0, TRI) &&
+ !predsHasDivergentTerminator(MBB1, TRI)) {
DEBUG(dbgs() << "Not fixing PHI for uniform branch: " << MI << '\n');
break;
}
@@ -458,5 +684,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
}
+ if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
+ hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT);
+
return true;
}
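searchPredecessors above is a plain worklist walk over predecessor edges with a visited set and an optional cut-off block that prunes the walk without itself being tested. The same shape on a bare adjacency list, standalone (names and graph representation are mine, not the patch's):

  #include <functional>
  #include <set>
  #include <vector>

  // Report whether any transitive predecessor of Start (stopping at CutOff)
  // satisfies Pred. Mirrors searchPredecessors: worklist plus visited set;
  // CutOff blocks the walk but is never tested.
  bool searchPreds(int Start, int CutOff,
                   const std::vector<std::vector<int>> &Preds,
                   const std::function<bool(int)> &Pred) {
    if (Start == CutOff)
      return false;
    std::set<int> Visited;
    std::vector<int> Worklist(Preds[Start].begin(), Preds[Start].end());
    while (!Worklist.empty()) {
      int N = Worklist.back();
      Worklist.pop_back();
      if (!Visited.insert(N).second || N == CutOff)
        continue;
      if (Pred(N))
        return true;
      Worklist.insert(Worklist.end(), Preds[N].begin(), Preds[N].end());
    }
    return false;
  }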
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
new file mode 100644
index 0000000..3d31217
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -0,0 +1,72 @@
+//===-- SIFixVGPRCopies.cpp - Fix VGPR Copies after regalloc --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Add implicit use of exec to vector register copies.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-fix-vgpr-copies"
+
+namespace {
+
+class SIFixVGPRCopies : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIFixVGPRCopies() : MachineFunctionPass(ID) {
+ initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Fix VGPR copies"; }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixVGPRCopies, DEBUG_TYPE, "SI Fix VGPR copies", false, false)
+
+char SIFixVGPRCopies::ID = 0;
+
+char &llvm::SIFixVGPRCopiesID = SIFixVGPRCopies::ID;
+
+bool SIFixVGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) {
+ MI.addOperand(MF,
+ MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ DEBUG(dbgs() << "Add exec use to " << MI);
+ Changed = true;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index a5c0d49..0aad8f0 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,6 +12,8 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -34,9 +36,12 @@ struct FoldCandidate {
};
unsigned char UseOpNo;
MachineOperand::MachineOperandType Kind;
+ bool Commuted;
- FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
- UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()) {
+ FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
+ bool Commuted_ = false) :
+ UseMI(MI), OpToFold(nullptr), UseOpNo(OpNo), Kind(FoldOp->getType()),
+ Commuted(Commuted_) {
if (FoldOp->isImm()) {
ImmToFold = FoldOp->getImm();
} else if (FoldOp->isFI()) {
@@ -58,6 +63,10 @@ struct FoldCandidate {
bool isReg() const {
return Kind == MachineOperand::MO_Register;
}
+
+ bool isCommuted() const {
+ return Commuted;
+ }
};
class SIFoldOperands : public MachineFunctionPass {
@@ -66,6 +75,7 @@ public:
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ const SISubtarget *ST;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
@@ -75,6 +85,12 @@ public:
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+ const MachineOperand *isClamp(const MachineInstr &MI) const;
+ bool tryFoldClamp(MachineInstr &MI);
+
+ std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
+ bool tryFoldOMod(MachineInstr &MI);
+
public:
SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -121,6 +137,7 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
= TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
+ return false;
}
default:
return false;
@@ -131,27 +148,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
-static bool isSafeToFold(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B32_e64:
- case AMDGPU::V_MOV_B64_PSEUDO: {
- // If there are additional implicit register operands, this may be used for
- // register indexing so the source register operand isn't simply copied.
- unsigned NumOps = MI.getDesc().getNumOperands() +
- MI.getDesc().getNumImplicitUses();
-
- return MI.getNumOperands() == NumOps;
- }
- case AMDGPU::S_MOV_B32:
- case AMDGPU::S_MOV_B64:
- case AMDGPU::COPY:
- return true;
- default:
- return false;
- }
-}
-
static bool updateOperand(FoldCandidate &Fold,
const TargetRegisterInfo &TRI) {
MachineInstr *MI = Fold.UseMI;
@@ -172,6 +168,8 @@ static bool updateOperand(FoldCandidate &Fold,
if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
TargetRegisterInfo::isVirtualRegister(New->getReg())) {
Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+
+ Old.setIsUndef(New->isUndef());
return true;
}
@@ -250,8 +248,13 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
!TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
return false;
- if (!TII->isOperandLegal(*MI, OpNo, OpToFold))
+ if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+ TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
return false;
+ }
+
+ FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold, true));
+ return true;
}
FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
@@ -260,9 +263,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// If the use operand doesn't care about the value, this may be an operand only
// used for register indexing, in which case it is unsafe to fold.
-static bool isUseSafeToFold(const MachineInstr &MI,
+static bool isUseSafeToFold(const SIInstrInfo *TII,
+ const MachineInstr &MI,
const MachineOperand &UseMO) {
- return !UseMO.isUndef();
+ return !UseMO.isUndef() && !TII->isSDWA(MI);
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
@@ -274,7 +278,7 @@ void SIFoldOperands::foldOperand(
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
- if (!isUseSafeToFold(*UseMI, UseOp))
+ if (!isUseSafeToFold(TII, *UseMI, UseOp))
return;
// FIXME: Fold operands with subregs.
@@ -359,8 +363,6 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *FoldRC =
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
- APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
- OpToFold.getImm());
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -370,21 +372,25 @@ void SIFoldOperands::foldOperand(
MRI->getRegClass(UseReg) :
TRI->getPhysRegClass(UseReg);
- assert(Imm.getBitWidth() == 64);
-
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
return;
+ APInt Imm(64, OpToFold.getImm());
if (UseOp.getSubReg() == AMDGPU::sub0) {
Imm = Imm.getLoBits(32);
} else {
assert(UseOp.getSubReg() == AMDGPU::sub1);
Imm = Imm.getHiBits(32);
}
+
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+ return;
}
- MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -468,7 +474,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
- if (Def->isMoveImmediate()) {
+ if (Def && Def->isMoveImmediate()) {
MachineOperand &ImmSrc = Def->getOperand(1);
if (ImmSrc.isImm())
return &ImmSrc;
@@ -581,6 +587,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
return false;
}
+// Try to fold an instruction into a simpler one
+static bool tryFoldInst(const SIInstrInfo *TII,
+ MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+
+ if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
+ Opc == AMDGPU::V_CNDMASK_B32_e64 ||
+ Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
+ const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
+ if (Src1->isIdenticalTo(*Src0)) {
+ DEBUG(dbgs() << "Folded " << *MI << " into ");
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1)
+ MI->RemoveOperand(Src2Idx);
+ MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+ mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
+ : getMovOpc(false)));
+ DEBUG(dbgs() << *MI << '\n');
+ return true;
+ }
+ }
+
+ return false;
+}
+
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {
// We need mutate the operands of new mov instructions to add implicit
@@ -621,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// again. The same constant folded instruction could also have a second
// use operand.
NextUse = MRI->use_begin(Dst.getReg());
+ FoldList.clear();
continue;
}
@@ -682,31 +715,230 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ tryFoldInst(TII, Fold.UseMI);
+ } else if (Fold.isCommuted()) {
+ // Restoring instruction's original operand order if fold has failed.
+ TII->commuteInstruction(*Fold.UseMI, false);
}
}
}
+const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ switch (Op) {
+ case AMDGPU::V_MAX_F32_e64:
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F64: {
+ if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+ return nullptr;
+
+ // Make sure sources are identical.
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src0->isReg() || !Src1->isReg() ||
+ Src0->getSubReg() != Src1->getSubReg() ||
+ Src0->getSubReg() != AMDGPU::NoSubRegister)
+ return nullptr;
+
+ // Can't fold up if we have modifiers.
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return nullptr;
+ return Src0;
+ }
+ default:
+ return nullptr;
+ }
+}
+
+// A clamp's source register always has multiple use operands, since it
+// appears twice in the same instruction, so count distinct using instructions
+// (ignoring debug uses) rather than use operands.
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+ int Count = 0;
+ for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+ I != E; ++I) {
+ if (++Count > 1)
+ return false;
+ }
+
+ return true;
+}
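
The helper above walks use instructions and gives up as soon as a second user is seen, rather than materializing the full use count. The same early-exit pattern over any iterator range, as a standalone sketch (hypothetical names; note that, as in the helper, an empty range also passes):

#include <cassert>
#include <vector>

template <typename It>
static bool hasAtMostOneElement(It First, It Last) {
  int Count = 0;
  for (It I = First; I != Last; ++I)
    if (++Count > 1)   // stop at the second element instead of counting all
      return false;
  return true;
}

int main() {
  std::vector<int> None, One{7}, Two{7, 7};
  assert(hasAtMostOneElement(None.begin(), None.end()));
  assert(hasAtMostOneElement(One.begin(), One.end()));
  assert(!hasAtMostOneElement(Two.begin(), Two.end()));
  return 0;
}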
+
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+ const MachineOperand *ClampSrc = isClamp(MI);
+ if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+ if (!TII->hasFPClamp(*Def))
+ return false;
+ MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+ if (!DefClamp)
+ return false;
+
+ DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+
+ // Clamp is applied after omod, so it is OK if omod is set.
+ DefClamp->setImm(1);
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
+}
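
For context, the pattern tryFoldClamp matches is a v_max of a value with itself with the clamp bit set, which computes clamp(x) = min(max(x, 0), 1); since hardware applies clamp after omod, the bit can simply be moved onto the defining instruction. A numeric sketch of the clamp semantics (plain C++, assuming the [0, 1] clamp range):

#include <algorithm>
#include <cassert>

static float clampToUnit(float X) {
  return std::min(std::max(X, 0.0f), 1.0f); // max(x, x) with clamp == this
}

int main() {
  assert(clampToUnit(-2.0f) == 0.0f);
  assert(clampToUnit(0.25f) == 0.25f);
  assert(clampToUnit(3.0f) == 1.0f);
  return 0;
}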
+
+static int getOModValue(unsigned Opc, int64_t Val) {
+ switch (Opc) {
+ case AMDGPU::V_MUL_F32_e64: {
+ switch (static_cast<uint32_t>(Val)) {
+ case 0x3f000000: // 0.5
+ return SIOutMods::DIV2;
+ case 0x40000000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x40800000: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
+ case AMDGPU::V_MUL_F16_e64: {
+ switch (static_cast<uint16_t>(Val)) {
+ case 0x3800: // 0.5
+ return SIOutMods::DIV2;
+ case 0x4000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x4400: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
+ default:
+ llvm_unreachable("invalid mul opcode");
+ }
+}
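
The immediates matched above are just the IEEE-754 bit patterns of 0.5, 2.0, and 4.0. A standalone check that the f32 constants decode as expected (the f16 cases, 0x3800/0x4000/0x4400, are the same three values in binary16):

#include <cassert>
#include <cstdint>
#include <cstring>

// Raw IEEE-754 single-precision bits of F.
static uint32_t bitsOf(float F) {
  uint32_t U;
  std::memcpy(&U, &F, sizeof(U));
  return U;
}

int main() {
  assert(bitsOf(0.5f) == 0x3f000000u); // SIOutMods::DIV2
  assert(bitsOf(2.0f) == 0x40000000u); // SIOutMods::MUL2
  assert(bitsOf(4.0f) == 0x40800000u); // SIOutMods::MUL4
  return 0;
}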
+
+// FIXME: Does this really not support denormals with f16?
+// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
+// handled, so will anything other than that break?
+std::pair<const MachineOperand *, int>
+SIFoldOperands::isOMod(const MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ switch (Op) {
+ case AMDGPU::V_MUL_F32_e64:
+ case AMDGPU::V_MUL_F16_e64: {
+ // If output denormals are enabled, omod is ignored.
+ if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
+ (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ const MachineOperand *RegOp = nullptr;
+ const MachineOperand *ImmOp = nullptr;
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src0->isImm()) {
+ ImmOp = Src0;
+ RegOp = Src1;
+ } else if (Src1->isImm()) {
+ ImmOp = Src1;
+ RegOp = Src0;
+ } else
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ int OMod = getOModValue(Op, ImmOp->getImm());
+ if (OMod == SIOutMods::NONE ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ return std::make_pair(RegOp, OMod);
+ }
+ case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_ADD_F16_e64: {
+ // If output denormals are enabled, omod is ignored.
+ if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
+ (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+ if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
+ Src0->getSubReg() == Src1->getSubReg() &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return std::make_pair(Src0, SIOutMods::MUL2);
+
+ return std::make_pair(nullptr, SIOutMods::NONE);
+ }
+ default:
+ return std::make_pair(nullptr, SIOutMods::NONE);
+ }
+}
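
The V_ADD cases undo the DAGCombiner canonicalization fmul x, 2.0 -> fadd x, x: an add of a value to itself is exactly a multiply by two, so it can carry the MUL2 output modifier. For finite values the two forms are bit-identical, as a quick sketch shows:

#include <cassert>

int main() {
  for (float X : {0.0f, 1.5f, -3.25f, 1024.0f})
    assert(X + X == X * 2.0f); // fadd x, x == fmul x, 2.0 for finite x
  return 0;
}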
+
+// FIXME: Does this need to check IEEE bit on function?
+bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
+ const MachineOperand *RegOp;
+ int OMod;
+ std::tie(RegOp, OMod) = isOMod(MI);
+ if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
+ RegOp->getSubReg() != AMDGPU::NoSubRegister ||
+ !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
+ MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
+ if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
+ return false;
+
+ // Clamp is applied after omod. If the source already has clamp set, don't
+ // fold it.
+ if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
+ return false;
+
+ DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+
+ DefOMod->setImm(OMod);
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
MRI = &MF.getRegInfo();
- TII = ST.getInstrInfo();
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // omod is ignored by hardware if IEEE bit is enabled. omod also does not
+ // correctly handle signed zeros.
+ //
+ // TODO: Check nsz on instructions when fast math flags are preserved to MI
+ // level.
+ bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
- MachineBasicBlock &MBB = *BI;
+ for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineBasicBlock::iterator I, Next;
- for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ for (I = MBB->begin(); I != MBB->end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (!isSafeToFold(MI))
+ tryFoldInst(TII, &MI);
+
+ if (!TII->isFoldableCopy(MI)) {
+ if (IsIEEEMode || !tryFoldOMod(MI))
+ tryFoldClamp(MI);
continue;
+ }
MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0b57155..7334781 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -8,10 +8,10 @@
//==-----------------------------------------------------------------------===//
#include "SIFrameLowering.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -21,22 +21,24 @@
using namespace llvm;
-static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
- const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
+ const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
- TRI->getMaxNumSGPRs(MF) / 4);
+ ST.getMaxNumSGPRs(MF) / 4);
}
-static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
- const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
+ const MachineFunction &MF) {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
- TRI->getMaxNumSGPRs(MF));
+ ST.getMaxNumSGPRs(MF));
}
-void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
- const SIRegisterInfo* TRI,
+void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+
// We don't need this if we only have spills, since there is no user-facing
// scratch.
@@ -59,16 +61,28 @@ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
- // Copy the size in bytes.
- unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
- .addReg(FlatScrInitHi, RegState::Kill);
-
unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
+ // Do a 64-bit pointer add.
+ if (ST.flatScratchIsPointer()) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitLo)
+ .addReg(ScratchWaveOffsetReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
+ .addReg(FlatScrInitHi)
+ .addImm(0);
+
+ return;
+ }
+
+ // Copy the size in bytes.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
+ .addReg(FlatScrInitHi, RegState::Kill);
+
// Add wave offset in bytes to private base offset.
// See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
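
The new flat-scratch initialization above is a 64-bit pointer add split across two scalar instructions: s_add_u32 on the low half produces a carry that s_addc_u32 consumes when adding into the high half. A standalone model of that carry chain (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

static uint64_t add64(uint32_t Lo, uint32_t Hi, uint32_t Offset) {
  uint32_t NewLo = Lo + Offset;          // s_add_u32: may wrap
  uint32_t Carry = NewLo < Lo ? 1u : 0u; // carry iff the low half wrapped
  uint32_t NewHi = Hi + Carry;           // s_addc_u32 hi, hi, 0
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  assert(add64(0xffffffffu, 0x1u, 1) == 0x200000000ull); // carry propagates
  assert(add64(0x10u, 0x0u, 0x20u) == 0x30ull);          // no carry
  return 0;
}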
@@ -87,10 +101,12 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- if (ScratchRsrcReg == AMDGPU::NoRegister)
+ if (ScratchRsrcReg == AMDGPU::NoRegister ||
+ !MRI.isPhysRegUsed(ScratchRsrcReg))
return AMDGPU::NoRegister;
if (ST.hasSGPRInitBug() ||
@@ -108,19 +124,16 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
// We find the resource first because it has an alignment requirement.
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
- ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+ ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
- // Skip the last 2 elements because the last one is reserved for VCC, and
- // this is the 2nd to last element already.
+ // Skip the last N reserved elements because they should have already been
+ // reserved for VCC etc.
for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
- //assert(MRI.isAllocatable(Reg));
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
@@ -130,25 +143,34 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
return ScratchRsrcReg;
}
-unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
+// Shift down registers reserved for the scratch wave offset and stack pointer
+// SGPRs.
+std::pair<unsigned, unsigned>
+SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
- if (ST.hasSGPRInitBug() ||
- ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF))
- return ScratchWaveOffsetReg;
- unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- MachineRegisterInfo &MRI = MF.getRegInfo();
+ // No replacement necessary.
+ if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
+ !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
+ assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
+ return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
+ }
+
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+ if (ST.hasSGPRInitBug())
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
- ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+ ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
if (NumPreloaded > AllSGPRs.size())
- return ScratchWaveOffsetReg;
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
AllSGPRs = AllSGPRs.slice(NumPreloaded);
@@ -163,33 +185,41 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
// register from the list to consider, it means that when this
// register is being used for the scratch wave offset and there
// are no other free SGPRs, then the value will stay in this register.
+ // + 1 if stack pointer is used.
// ----
- // 13
- if (AllSGPRs.size() < 13)
- return ScratchWaveOffsetReg;
+ // 13 (+1)
+ unsigned ReservedRegCount = 13;
+
+ if (AllSGPRs.size() < ReservedRegCount)
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
+
+ bool HandledScratchWaveOffsetReg =
+ ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
- for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
+ for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven't added its uses yet.
- if (!MRI.isPhysRegUsed(Reg)) {
- if (!MRI.isAllocatable(Reg) ||
- TRI->isSubRegisterEq(ScratchRsrcReg, Reg))
- continue;
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+ if (!HandledScratchWaveOffsetReg) {
+ HandledScratchWaveOffsetReg = true;
- MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
- MFI->setScratchWaveOffsetReg(Reg);
- return Reg;
+ MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+ MFI->setScratchWaveOffsetReg(Reg);
+ ScratchWaveOffsetReg = Reg;
+ break;
+ }
}
}
- return ScratchWaveOffsetReg;
+ return std::make_pair(ScratchWaveOffsetReg, SPReg);
}
-void SIFrameLowering::emitPrologue(MachineFunction &MF,
- MachineBasicBlock &MBB) const {
+void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ auto AMDGPUASI = ST.getAMDGPUAS();
if (ST.debuggerEmitPrologue())
emitDebuggerPrologue(MF, MBB);
@@ -207,18 +237,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned ScratchRsrcReg
- = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
- unsigned ScratchWaveOffsetReg
- = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-
- if (ScratchRsrcReg == AMDGPU::NoRegister) {
- assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
- return;
- }
-
- assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
-
// We need to do the replacement of the private segment buffer and wave offset
// register even if there are no stack objects. There could be stores to undef
// or a constant without an associated object.
@@ -228,22 +246,55 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// this point it appears we need the setup. This part of the prolog should be
// emitted after frame indices are eliminated.
- if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
- emitFlatScratchInit(TII, TRI, MF, MBB);
+ if (MFI->hasFlatScratchInit())
+ emitFlatScratchInit(ST, MF, MBB);
+
+ unsigned SPReg = MFI->getStackPtrOffsetReg();
+ if (SPReg != AMDGPU::SP_REG) {
+ assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
+
+ DebugLoc DL;
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ int64_t StackSize = FrameInfo.getStackSize();
+
+ if (StackSize == 0) {
+ BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ } else {
+ BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
+ .addReg(MFI->getScratchWaveOffsetReg())
+ .addImm(StackSize * ST.getWavefrontSize());
+ }
+ }
+
+ unsigned ScratchRsrcReg
+ = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+
+ unsigned ScratchWaveOffsetReg;
+ std::tie(ScratchWaveOffsetReg, SPReg)
+ = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
+
+ // It's possible to have uses of only ScratchWaveOffsetReg without
+ // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
+ // but the inverse is not true.
+ if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
+ assert(ScratchRsrcReg == AMDGPU::NoRegister);
+ return;
+ }
// We need to insert initialization of the scratch resource descriptor.
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
+ if (ST.isAmdCodeObjectV2(MF)) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
- bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
- bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
+ bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
+ bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
+ MRI.isPhysRegUsed(ScratchRsrcReg);
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
@@ -296,7 +347,8 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
if (OffsetRegUsed &&
PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
- .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+ .addReg(PreloadedScratchWaveOffsetReg,
+ MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
}
if (CopyBuffer && !CopyBufferFirst) {
@@ -314,21 +366,21 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
- if (MFI->hasPrivateMemoryInputPtr()) {
+ if (MFI->hasImplicitBufferPtr()) {
unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
BuildMI(MBB, I, DL, Mov64, Rsrc01)
- .addReg(PreloadedPrivateBufferReg)
+ .addReg(MFI->getImplicitBufferPtrUserSGPR())
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
} else {
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
PointerType *PtrTy =
PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
- AMDGPUAS::CONSTANT_ADDRESS);
+ AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
auto MMO = MF.getMachineMemOperand(PtrInfo,
MachineMemOperand::MOLoad |
@@ -336,7 +388,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineMemOperand::MODereferenceable,
0, 0);
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
- .addReg(PreloadedPrivateBufferReg)
+ .addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
.addImm(0) // glc
.addMemOperand(MMO)
@@ -366,9 +418,89 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (FuncInfo->isEntryFunction()) {
+ emitEntryFunctionPrologue(MF, MBB);
+ return;
+ }
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ DebugLoc DL;
+
+ bool NeedFP = hasFP(MF);
+ if (NeedFP) {
+ // If we need a base pointer, set it up here. It's whatever the value of
+ // the stack pointer is at this point. Any variable-sized objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // locals.
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ uint32_t NumBytes = MFI.getStackSize();
+ if (NumBytes != 0 && hasSP(MF)) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(NumBytes * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+}
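
Note the scaling in the S_ADD_U32 above: the frame size is a per-lane byte count, but the stack pointer is a single per-wave scratch offset, so growing each lane's stack by NumBytes advances the wave offset by NumBytes times the wavefront size. A sketch of the arithmetic (assuming a wave64 target):

#include <cassert>
#include <cstdint>

static uint64_t waveStackIncrement(uint32_t NumBytes, uint32_t WavefrontSize) {
  return uint64_t(NumBytes) * WavefrontSize; // bytes/lane -> bytes/wave
}

int main() {
  assert(waveStackIncrement(16, 64) == 1024); // 16 bytes per lane on wave64
  return 0;
}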
+
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ if (FuncInfo->isEntryFunction())
+ return;
+
+ unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ if (StackPtrReg == AMDGPU::NoRegister)
+ return;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ uint32_t NumBytes = MFI.getStackSize();
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+ DebugLoc DL;
+
+ // FIXME: Clarify the distinction between having no SP set and having one.
+ // For callee functions, it's really a question of whether we need the SP to
+ // be accurate or not.
+ if (NumBytes != 0 && hasSP(MF)) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
+ .addReg(StackPtrReg)
+ .addImm(NumBytes * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+}
+
+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+ for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+ I != E; ++I) {
+ if (!MFI.isDeadObjectIndex(I))
+ return false;
+ }
+
+ return true;
+}
+
+int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const {
+ const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+
+ FrameReg = RI->getFrameRegister(MF);
+ return MF.getFrameInfo().getObjectOffset(FI);
}
void SIFrameLowering::processFunctionBeforeFrameFinalized(
@@ -379,15 +511,66 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (!MFI.hasStackObjects())
return;
- bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ bool AllSGPRSpilledToVGPRs = false;
+
+ if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
+ AllSGPRSpilledToVGPRs = true;
+
+ // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+ // are spilled to VGPRs, in which case we can eliminate the stack usage.
+ //
+ // XXX - This operates under the assumption that only other SGPR spills are
+ // users of the frame index. I'm not 100% sure this is correct. The
+ // StackColoring pass has a comment saying a future improvement would be
+ // merging allocas with spill slots, but for now, according to
+ // MachineFrameInfo, a spill slot can't alias any other object.
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator Next;
+ for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+ MachineInstr &MI = *I;
+ Next = std::next(I);
+
+ if (TII->isSGPRSpill(MI)) {
+ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+ if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+ bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+ (void)Spilled;
+ assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+ } else
+ AllSGPRSpilledToVGPRs = false;
+ }
+ }
+ }
- assert((RS || !MayNeedScavengingEmergencySlot) &&
- "RegScavenger required if spilling");
+ FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+ }
- if (MayNeedScavengingEmergencySlot) {
- int ScavengeFI = MFI.CreateStackObject(
- AMDGPU::SGPR_32RegClass.getSize(),
- AMDGPU::SGPR_32RegClass.getAlignment(), false);
+ // FIXME: The other checks should be redundant with allStackObjectsAreDead,
+ // but currently hasNonSpillStackObjects is set only from source
+ // allocas. Stack temps produced from legalization are not counted currently.
+ if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
+ !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+ assert(RS && "RegScavenger required if spilling");
+
+ // We force this to be at offset 0 so no user object ever has 0 as an
+ // address, so we may use 0 as an invalid pointer value. This is because
+ // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
+ // is required to be address space 0, we are forced to accept this for
+ // now. Ideally we could have the stack in another address space with 0 as a
+ // valid pointer, and -1 as the null value.
+ //
+ // This will also waste additional space when user stack objects require
+ // more than 4-byte alignment.
+ //
+ // The main cost here is losing the offset for addressing modes. However
+ // this also ensures we shouldn't need a register for the offset when
+ // emergency scavenging.
+ int ScavengeFI = MFI.CreateFixedObject(
+ TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
RS->addScavengingFrameIndex(ScavengeFI);
}
}
@@ -432,3 +615,19 @@ void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
}
}
+
+bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
+ // All stack operations are relative to the frame offset SGPR.
+ // TODO: We still want to eliminate the frame pointer in some cases.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // XXX - Is this only called after frame is finalized? Should be able to check
+ // frame size.
+ return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
+}
+
+bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
+ // All stack operations are relative to the frame offset SGPR.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MFI.hasCalls() || MFI.hasVarSizedObjects();
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 7657b4e..d4dfa1c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -26,18 +26,21 @@ public:
AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
+ void emitEntryFunctionPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const;
void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
+ int getFrameIndexReference(const MachineFunction &MF, int FI,
+ unsigned &FrameReg) const override;
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
private:
- void emitFlatScratchInit(const SIInstrInfo *TII,
- const SIRegisterInfo* TRI,
+ void emitFlatScratchInit(const SISubtarget &ST,
MachineFunction &MF,
MachineBasicBlock &MBB) const;
@@ -48,7 +51,7 @@ private:
SIMachineFunctionInfo *MFI,
MachineFunction &MF) const;
- unsigned getReservedPrivateSegmentWaveByteOffsetReg(
+ std::pair<unsigned, unsigned> getReservedPrivateSegmentWaveByteOffsetReg(
const SISubtarget &ST,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
@@ -57,6 +60,10 @@ private:
/// \brief Emits debugger prologue.
void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+public:
+ bool hasFP(const MachineFunction &MF) const override;
+ bool hasSP(const MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b98f9f4..2356405f 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,26 +15,71 @@
#ifdef _MSC_VER
// Provide M_PI.
#define _USE_MATH_DEFINES
-#include <cmath>
#endif
+#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "SIDefines.h"
-#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/DAGCombine.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetCallingConv.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -43,7 +88,6 @@ static cl::opt<bool> EnableVGPRIndexMode(
cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
cl::init(false));
-
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -84,6 +128,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
}
+ if (Subtarget->hasVOP3PInsts()) {
+ addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ }
+
computeRegisterProperties(STI.getRegisterInfo());
// We need to custom lower vector stores from local memory
@@ -110,7 +159,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
-
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
@@ -142,10 +190,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
@@ -153,9 +208,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::UADDO, MVT::i32, Legal);
+ setOperationAction(ISD::USUBO, MVT::i32, Legal);
+
+ setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
+ setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
+
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
+ for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+ MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -202,6 +264,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ // Avoid stack access for these.
+ // TODO: Generalize to more vector types.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -223,6 +292,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
setOperationAction(ISD::TRAP, MVT::Other, Custom);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
@@ -303,6 +373,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
+ setOperationAction(ISD::FROUND, MVT::f16, Custom);
// F16 - VOP2 Actions.
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
@@ -317,6 +388,96 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);
}
+ if (Subtarget->hasVOP3PInsts()) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+ switch (Op) {
+ case ISD::LOAD:
+ case ISD::STORE:
+ case ISD::BUILD_VECTOR:
+ case ISD::BITCAST:
+ case ISD::EXTRACT_VECTOR_ELT:
+ case ISD::INSERT_VECTOR_ELT:
+ case ISD::INSERT_SUBVECTOR:
+ case ISD::EXTRACT_SUBVECTOR:
+ case ISD::SCALAR_TO_VECTOR:
+ break;
+ case ISD::CONCAT_VECTORS:
+ setOperationAction(Op, VT, Custom);
+ break;
+ default:
+ setOperationAction(Op, VT, Expand);
+ break;
+ }
+ }
+ }
+
+ // XXX - Do these do anything? Vector constants turn into build_vector.
+ setOperationAction(ISD::Constant, MVT::v2i16, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::STORE, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::STORE, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::AND, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::OR, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::XOR, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+
+ setOperationAction(ISD::ADD, MVT::v2i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SHL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SRL, MVT::v2i16, Legal);
+ setOperationAction(ISD::SRA, MVT::v2i16, Legal);
+ setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
+ setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
+ setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
+ setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
+
+ setOperationAction(ISD::FADD, MVT::v2f16, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+ // This isn't really legal, but this avoids the legalizer unrolling it (and
+ // allows matching fneg (fabs x) patterns)
+ setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+ } else {
+ setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+ }
+
+ for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
+ setOperationAction(ISD::SELECT, VT, Custom);
+ }
+
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::ADDCARRY);
+ setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::SUBCARRY);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
@@ -332,6 +493,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FCANONICALIZE);
+ setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -364,36 +528,63 @@ const SISubtarget *SITargetLowering::getSubtarget() const {
// TargetLowering queries
//===----------------------------------------------------------------------===//
+bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
+ EVT) const {
+ // SI has some legal vector types, but no legal vector operations. Say no
+ // shuffles are legal in order to prefer scalarizing some vector operations.
+ return false;
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
unsigned IntrID) const {
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_atomic_dec: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Info.align = 0;
- Info.vol = false;
+
+ const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
+ Info.vol = !Vol || !Vol->isZero();
Info.readMem = true;
Info.writeMem = true;
return true;
+ }
default:
return false;
}
}
-bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
- EVT) const {
- // SI has some legal vector types, but no legal vector operations. Say no
- // shuffles are legal in order to prefer scalarizing some vector operations.
- return false;
+bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
+ SmallVectorImpl<Value*> &Ops,
+ Type *&AccessTy) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ Value *Ptr = II->getArgOperand(0);
+ AccessTy = II->getType();
+ Ops.push_back(Ptr);
+ return true;
+ }
+ default:
+ return false;
+ }
}
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
- // Flat instructions do not have offsets, and only have the register
- // address.
- return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
+ if (!Subtarget->hasFlatInstOffsets()) {
+ // Flat instructions do not have offsets, and only have the register
+ // address.
+ return AM.BaseOffs == 0 && AM.Scale == 0;
+ }
+
+ // GFX9 added a 13-bit signed offset. When using regular flat instructions,
+ // the sign bit is ignored and is treated as a 12-bit unsigned offset.
+
+ // Just r + i
+ return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
}
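
isUInt<12> accepts exactly the offsets representable in 12 unsigned bits, matching the comment about the ignored sign bit. A freestanding equivalent of the check (the real helper lives in llvm/Support/MathExtras.h):

#include <cassert>
#include <cstdint>

template <unsigned N>
static bool fitsUnsigned(int64_t V) {
  return V >= 0 && V < (int64_t(1) << N); // representable in N unsigned bits
}

int main() {
  assert(fitsUnsigned<12>(0));     // smallest legal flat offset
  assert(fitsUnsigned<12>(4095));  // largest 12-bit unsigned value
  assert(!fitsUnsigned<12>(4096)); // one past the end
  assert(!fitsUnsigned<12>(-1));   // negative offsets are rejected here
  return 0;
}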
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
@@ -438,8 +629,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.BaseGV)
return false;
- switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS: {
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Assume the we will use FLAT for all global memory accesses
// on VI.
@@ -454,8 +644,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
return isLegalMUBUFAddressingMode(AM);
- }
- case AMDGPUAS::CONSTANT_ADDRESS: {
+ } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -478,7 +667,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -492,13 +681,11 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
return false;
- }
- case AMDGPUAS::PRIVATE_ADDRESS:
+ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
return isLegalMUBUFAddressingMode(AM);
-
- case AMDGPUAS::LOCAL_ADDRESS:
- case AMDGPUAS::REGION_ADDRESS: {
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
+ AS == AMDGPUASI.REGION_ADDRESS) {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
// field.
// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -513,21 +700,32 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
return false;
- }
- case AMDGPUAS::FLAT_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
// For an unknown address space, this usually means that this is for some
// reason being used for pure arithmetic, and not based on some addressing
// computation. We don't have instructions that compute pointers with any
// addressing modes, so treat them as having no offset like flat
// instructions.
return isLegalFlatAddressingMode(AM);
-
- default:
+ } else {
llvm_unreachable("unhandled address space");
}
}
+bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const {
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+ return (MemVT.getSizeInBits() <= 4 * 32);
+ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
+ unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
+ return (MemVT.getSizeInBits() <= MaxPrivateBits);
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ return (MemVT.getSizeInBits() <= 2 * 32);
+ }
+ return true;
+}
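
The caps in canMergeStoresTo mirror the widest store each address space can issue: four dwords for global/flat, two dwords for LDS, and the subtarget's maximum private element size (in bytes) for scratch. A table-style sketch of those limits (hypothetical helper; values in bits):

#include <cassert>

static unsigned maxMergedStoreBits(bool GlobalOrFlat, bool Local,
                                   unsigned MaxPrivateElementSizeBytes) {
  if (GlobalOrFlat)
    return 4 * 32;                        // up to a dwordx4 store
  if (Local)
    return 2 * 32;                        // up to ds_write_b64
  return 8 * MaxPrivateElementSizeBytes;  // scratch: subtarget-dependent
}

int main() {
  assert(maxMergedStoreBits(true, false, 4) == 128);
  assert(maxMergedStoreBits(false, true, 4) == 64);
  assert(maxMergedStoreBits(false, false, 16) == 128);
  return 0;
}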
+
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
@@ -544,8 +742,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
return false;
}
- if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS) {
+ if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUASI.REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
@@ -560,8 +758,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// will access scratch. If we had access to the IR function, then we
// could determine if any private memory was used in the function.
if (!Subtarget->hasUnalignedScratchAccess() &&
- (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
- AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+ (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
+ AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
return false;
}
@@ -569,7 +767,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// If we have a uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
- *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+ *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
(Align % 4 == 0) : true;
}
@@ -609,15 +807,16 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
return MVT::Other;
}
-static bool isFlatGlobalAddrSpace(unsigned AS) {
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS;
+static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
+ return AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.FLAT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+ return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
+ isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
}
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
@@ -631,7 +830,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
// Flat -> global is no-op
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
return true;
return isNoopAddrSpaceCast(SrcAS, DestAS);
@@ -639,18 +838,8 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- const Value *Ptr = MemNode->getMemOperand()->getValue();
-
- // UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers.
- // If Ptr is null, then that means this mem operand contains a
- // PseudoSourceValue like GOT.
- if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
- isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
- return true;
- const Instruction *I = dyn_cast<Instruction>(Ptr);
- return I && I->getMetadata("amdgpu.uniform");
+ return AMDGPU::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
@@ -693,40 +882,28 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
return TargetLowering::isTypeDesirableForOp(Op, VT);
}
-SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
- const SDLoc &SL, SDValue Chain,
- unsigned Offset) const {
+SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Chain,
+ uint64_t Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
- unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ unsigned InputPtrReg = TRI->getPreloadedValue(MF,
+ SIRegisterInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Offset, SL, PtrVT));
}
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
- const SDLoc &SL, SDValue Chain,
- unsigned Offset, bool Signed,
+SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Val,
+ bool Signed,
const ISD::InputArg *Arg) const {
- const DataLayout &DL = DAG.getDataLayout();
- Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
- MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
-
- unsigned Align = DL.getABITypeAlignment(Ty);
-
- SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
- SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
- MachineMemOperand::MONonTemporal |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
-
- SDValue Val = Load;
if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
VT.bitsLT(MemVT)) {
unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -740,373 +917,545 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
else
Val = DAG.getZExtOrTrunc(Val, SL, VT);
- return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+ return Val;
}
-SDValue SITargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+SDValue SITargetLowering::lowerKernargMemParameter(
+ SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Chain,
+ uint64_t Offset, bool Signed,
+ const ISD::InputArg *Arg) const {
+ const DataLayout &DL = DAG.getDataLayout();
+ Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ unsigned Align = DL.getABITypeAlignment(Ty);
+
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
+ SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+ MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+
+ SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
+ return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+}
+
+SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+ const SDLoc &SL, SDValue Chain,
+ const ISD::InputArg &Arg) const {
MachineFunction &MF = DAG.getMachineFunction();
- FunctionType *FType = MF.getFunction()->getFunctionType();
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
- if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function *Fn = MF.getFunction();
- DiagnosticInfoUnsupported NoGraphicsHSA(
- *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
- DAG.getContext()->diagnose(NoGraphicsHSA);
- return DAG.getEntryNode();
+ if (Arg.Flags.isByVal()) {
+ unsigned Size = Arg.Flags.getByValSize();
+ int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
+ return DAG.getFrameIndex(FrameIdx, MVT::i32);
}
- // Create stack objects that are used for emitting debugger prologue if
- // "amdgpu-debugger-emit-prologue" attribute was specified.
- if (ST.debuggerEmitPrologue())
- createDebuggerPrologueStackObjects(MF);
+ unsigned ArgOffset = VA.getLocMemOffset();
+ unsigned ArgSize = VA.getValVT().getStoreSize();
- SmallVector<ISD::InputArg, 16> Splits;
- BitVector Skipped(Ins.size());
+ int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
- for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
- const ISD::InputArg &Arg = Ins[i];
+ // Create load nodes to retrieve arguments from the stack.
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ SDValue ArgValue;
+
+ // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ MVT MemVT = VA.getValVT();
- // First check if it's a PS input addr
+ switch (VA.getLocInfo()) {
+ default:
+ break;
+ case CCValAssign::BCvt:
+ MemVT = VA.getLocVT();
+ break;
+ case CCValAssign::SExt:
+ ExtType = ISD::SEXTLOAD;
+ break;
+ case CCValAssign::ZExt:
+ ExtType = ISD::ZEXTLOAD;
+ break;
+ case CCValAssign::AExt:
+ ExtType = ISD::EXTLOAD;
+ break;
+ }
+
+ ArgValue = DAG.getExtLoad(
+ ExtType, SL, VA.getLocVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+ MemVT);
+ return ArgValue;
+}
+
+static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
+ CallingConv::ID CallConv,
+ ArrayRef<ISD::InputArg> Ins,
+ BitVector &Skipped,
+ FunctionType *FType,
+ SIMachineFunctionInfo *Info) {
+ for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
+ const ISD::InputArg &Arg = Ins[I];
+
+ // First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
!Arg.Flags.isByVal() && PSInputNum <= 15) {
if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
- // We can safely skip PS inputs
- Skipped.set(i);
+ // We can safely skip PS inputs.
+ Skipped.set(I);
++PSInputNum;
continue;
}
Info->markPSInputAllocated(PSInputNum);
if (Arg.Used)
- Info->PSInputEna |= 1 << PSInputNum;
+ Info->markPSInputEnabled(PSInputNum);
++PSInputNum;
}
- if (AMDGPU::isShader(CallConv)) {
- // Second split vertices into their elements
- if (Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned j = 0; j != NumElements; ++j) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
- }
- } else {
- Splits.push_back(Arg);
+ // Second split vertices into their elements.
+ if (Arg.VT.isVector()) {
+ ISD::InputArg NewArg = Arg;
+ NewArg.Flags.setSplit();
+ NewArg.VT = Arg.VT.getVectorElementType();
+
+ // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+ // three or five element vertex only needs three or five registers,
+ // NOT four or eight.
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ for (unsigned J = 0; J != NumElements; ++J) {
+ Splits.push_back(NewArg);
+ NewArg.PartOffset += NewArg.VT.getStoreSize();
}
+ } else {
+ Splits.push_back(Arg);
}
}
+}
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext());
+// Allocate special inputs passed in VGPRs.
+static void allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ if (Info.hasWorkItemIDX()) {
+ unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
- // At least one interpolation mode must be enabled or else the GPU will hang.
- //
- // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
- // PSInputAddr, the user wants to enable some bits after the compilation
- // based on run-time states. Since we can't know what the final PSInputEna
- // will look like, so we shouldn't do anything here and the user should take
- // responsibility for the correct programming.
- //
- // Otherwise, the following restrictions apply:
- // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
- // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
- // enabled too.
- if (CallConv == CallingConv::AMDGPU_PS &&
- ((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
- CCInfo.AllocateReg(AMDGPU::VGPR0);
- CCInfo.AllocateReg(AMDGPU::VGPR1);
- Info->markPSInputAllocated(0);
- Info->PSInputEna |= 1;
- }
-
- if (!AMDGPU::isShader(CallConv)) {
- assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
- } else {
- assert(!Info->hasDispatchPtr() &&
- !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
- !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
- !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
+ if (Info.hasWorkItemIDY()) {
+ unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
}
- if (Info->hasPrivateMemoryInputPtr()) {
- unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
- MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
- CCInfo.AllocateReg(PrivateMemoryPtrReg);
+ if (Info.hasWorkItemIDZ()) {
+ unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+ MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+}
+
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
+ if (Info.hasImplicitBufferPtr()) {
+ unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
+ MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
- if (Info->hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
- MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+ if (Info.hasPrivateSegmentBuffer()) {
+ unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
- if (Info->hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+ if (Info.hasDispatchPtr()) {
+ unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info->hasQueuePtr()) {
- unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ if (Info.hasQueuePtr()) {
+ unsigned QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
- if (Info->hasKernargSegmentPtr()) {
- unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+ if (Info.hasKernargSegmentPtr()) {
+ unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(InputPtrReg);
}
- if (Info->hasDispatchID()) {
- unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+ if (Info.hasDispatchID()) {
+ unsigned DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info->hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ if (Info.hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
- if (!AMDGPU::isShader(CallConv))
- analyzeFormalArgumentsCompute(CCInfo, Ins);
- else
- AnalyzeFormalArguments(CCInfo, Splits);
-
- SmallVector<SDValue, 16> Chains;
-
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
-
- const ISD::InputArg &Arg = Ins[i];
- if (Skipped[i]) {
- InVals.push_back(DAG.getUNDEF(Arg.VT));
- continue;
- }
-
- CCValAssign &VA = ArgLocs[ArgIdx++];
- MVT VT = VA.getLocVT();
-
- if (VA.isMemLoc()) {
- VT = Ins[i].VT;
- EVT MemVT = VA.getLocVT();
- const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
- VA.getLocMemOffset();
- // The first 36 bytes of the input buffer contains information about
- // thread group and global sizes.
- SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
- Offset, Ins[i].Flags.isSExt(),
- &Ins[i]);
- Chains.push_back(Arg.getValue(1));
-
- auto *ParamTy =
- dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
- ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- // On SI local pointers are just offsets into LDS, so they are always
- // less than 16-bits. On CI and newer they could potentially be
- // real pointers, so we can't guarantee their size.
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
- DAG.getValueType(MVT::i16));
- }
-
- InVals.push_back(Arg);
- Info->setABIArgOffset(Offset + MemVT.getStoreSize());
- continue;
- }
- assert(VA.isRegLoc() && "Parameter must be in a register!");
-
- unsigned Reg = VA.getLocReg();
-
- if (VT == MVT::i64) {
- // For now assume it is a pointer
- Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
- &AMDGPU::SGPR_64RegClass);
- Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
- SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- InVals.push_back(Copy);
- continue;
- }
-
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
-
- Reg = MF.addLiveIn(Reg, RC);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-
- if (Arg.VT.isVector()) {
-
- // Build a vector from the registers
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- SmallVector<SDValue, 4> Regs;
- Regs.push_back(Val);
- for (unsigned j = 1; j != NumElements; ++j) {
- Reg = ArgLocs[ArgIdx++].getLocReg();
- Reg = MF.addLiveIn(Reg, RC);
-
- SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- Regs.push_back(Copy);
- }
-
- // Fill up the missing vector elements
- NumElements = Arg.VT.getVectorNumElements() - NumElements;
- Regs.append(NumElements, DAG.getUNDEF(VT));
-
- InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
- continue;
- }
-
- InVals.push_back(Val);
- }
-
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
// these from the dispatch pointer.
+}
- // Start adding system SGPRs.
- if (Info->hasWorkGroupIDX()) {
- unsigned Reg = Info->addWorkGroupIDX();
+// Allocate special input registers that are initialized per-wave.
+static void allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ CallingConv::ID CallConv,
+ bool IsShader) {
+ if (Info.hasWorkGroupIDX()) {
+ unsigned Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasWorkGroupIDY()) {
- unsigned Reg = Info->addWorkGroupIDY();
+ if (Info.hasWorkGroupIDY()) {
+ unsigned Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasWorkGroupIDZ()) {
- unsigned Reg = Info->addWorkGroupIDZ();
+ if (Info.hasWorkGroupIDZ()) {
+ unsigned Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasWorkGroupInfo()) {
- unsigned Reg = Info->addWorkGroupInfo();
+ if (Info.hasWorkGroupInfo()) {
+ unsigned Reg = Info.addWorkGroupInfo();
MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
CCInfo.AllocateReg(Reg);
}
- if (Info->hasPrivateSegmentWaveByteOffset()) {
+ if (Info.hasPrivateSegmentWaveByteOffset()) {
// Scratch wave offset passed in system SGPR.
unsigned PrivateSegmentWaveByteOffsetReg;
- if (AMDGPU::isShader(CallConv)) {
- PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
- Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ if (IsShader) {
+ PrivateSegmentWaveByteOffsetReg =
+ Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
+
+ // This is true if the scratch wave byte offset doesn't have a fixed
+ // location.
+ if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
+ PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+ Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ }
} else
- PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
+ PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
+}
+static void reservePrivateMemoryRegs(const TargetMachine &TM,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) {
// Now that we've figured out where the scratch register inputs are, see if
// we should reserve the arguments and use them directly.
- bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool HasStackObjects = MFI.hasStackObjects();
+
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
if (HasStackObjects)
- Info->setHasNonSpillStackObjects(true);
+ Info.setHasNonSpillStackObjects(true);
// Everything live out of a block is spilled with fast regalloc, so it's
// almost certain that spilling will be required.
- if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ if (TM.getOptLevel() == CodeGenOpt::None)
HasStackObjects = true;
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
if (ST.isAmdCodeObjectV2(MF)) {
if (HasStackObjects) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
- unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+ unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
- Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+ Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+ unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+ Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
} else {
unsigned ReservedBufferReg
- = TRI->reservedPrivateSegmentBufferReg(MF);
+ = TRI.reservedPrivateSegmentBufferReg(MF);
unsigned ReservedOffsetReg
- = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
// We tentatively reserve the last registers (skipping the last two
// which may contain VCC). After register allocation, we'll replace
// these with the ones immediately after those which were really
// allocated. In the prologue, copies will be inserted from the argument
// to these reserved registers.
- Info->setScratchRSrcReg(ReservedBufferReg);
- Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ Info.setScratchRSrcReg(ReservedBufferReg);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
} else {
- unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+ unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
- Info->setScratchRSrcReg(ReservedBufferReg);
+ Info.setScratchRSrcReg(ReservedBufferReg);
if (HasStackObjects) {
- unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+ unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
- Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+ Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
} else {
unsigned ReservedOffsetReg
- = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+ Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
}
+}
- if (Info->hasWorkItemIDX()) {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- CCInfo.AllocateReg(Reg);
+SDValue SITargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ FunctionType *FType = MF.getFunction()->getFunctionType();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+ const Function *Fn = MF.getFunction();
+ DiagnosticInfoUnsupported NoGraphicsHSA(
+ *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+ DAG.getContext()->diagnose(NoGraphicsHSA);
+ return DAG.getEntryNode();
}
- if (Info->hasWorkItemIDY()) {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- CCInfo.AllocateReg(Reg);
+ // Create stack objects that are used for emitting debugger prologue if
+ // "amdgpu-debugger-emit-prologue" attribute was specified.
+ if (ST.debuggerEmitPrologue())
+ createDebuggerPrologueStackObjects(MF);
+
+ SmallVector<ISD::InputArg, 16> Splits;
+ SmallVector<CCValAssign, 16> ArgLocs;
+ BitVector Skipped(Ins.size());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+
+ bool IsShader = AMDGPU::isShader(CallConv);
+ bool IsKernel = AMDGPU::isKernel(CallConv);
+ bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
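+ // Kernels and shaders are both entry points; anything else is a callable
+ // function and is lowered with a more conventional calling convention.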
+
+ if (IsShader) {
+ processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
+
+ // At least one interpolation mode must be enabled or else the GPU will
+ // hang.
+ //
+ // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
+ // set PSInputAddr, the user wants to enable some bits after the compilation
+ // based on run-time states. Since we can't know what the final PSInputEna
+ // will look like, we shouldn't do anything here, and the user should take
+ // responsibility for the correct programming.
+ //
+ // Otherwise, the following restrictions apply:
+ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+ // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+ // enabled too.
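+ // When this workaround applies, we force-enable input 0 below; its I/J
+ // coordinate pair is what VGPR0 and VGPR1 are committed to.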
+ if (CallConv == CallingConv::AMDGPU_PS &&
+ ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 &&
+ Info->isPSInputAllocated(11)))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
+ }
+
+ assert(!Info->hasDispatchPtr() &&
+ !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
+ } else if (IsKernel) {
+ assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+ } else {
+ Splits.append(Ins.begin(), Ins.end());
}
- if (Info->hasWorkItemIDZ()) {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
- CCInfo.AllocateReg(Reg);
+ if (IsEntryFunc) {
+ allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
}
- if (Chains.empty())
- return Chain;
+ if (IsKernel) {
+ analyzeFormalArgumentsCompute(CCInfo, Ins);
+ } else {
+ CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+ CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+ }
+
+ SmallVector<SDValue, 16> Chains;
+
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ const ISD::InputArg &Arg = Ins[i];
+ if (Skipped[i]) {
+ InVals.push_back(DAG.getUNDEF(Arg.VT));
+ continue;
+ }
+
+ CCValAssign &VA = ArgLocs[ArgIdx++];
+ MVT VT = VA.getLocVT();
+
+ if (IsEntryFunc && VA.isMemLoc()) {
+ VT = Ins[i].VT;
+ EVT MemVT = VA.getLocVT();
+
+ const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
+ VA.getLocMemOffset();
+ Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+
+ // The first 36 bytes of the input buffer contain information about
+ // thread group and global sizes.
+ SDValue Arg = lowerKernargMemParameter(
+ DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+ Chains.push_back(Arg.getValue(1));
+
+ auto *ParamTy =
+ dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ // On SI local pointers are just offsets into LDS, so they are always
+ // less than 16 bits. On CI and newer they could potentially be
+ // real pointers, so we can't guarantee their size.
+ Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+ DAG.getValueType(MVT::i16));
+ }
+
+ InVals.push_back(Arg);
+ continue;
+ } else if (!IsEntryFunc && VA.isMemLoc()) {
+ SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
+ InVals.push_back(Val);
+ if (!Arg.Flags.isByVal())
+ Chains.push_back(Val.getValue(1));
+ continue;
+ }
+
+ assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+ unsigned Reg = VA.getLocReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+ EVT ValVT = VA.getValVT();
+
+ Reg = MF.addLiveIn(Reg, RC);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ // If this is an 8 or 16-bit value, it is really passed promoted
+ // to 32 bits. Insert an assert[sz]ext to capture this, then
+ // truncate to the right size.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::BCvt:
+ Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
+ DAG.getValueType(ValVT));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
+ DAG.getValueType(ValVT));
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ if (IsShader && Arg.VT.isVector()) {
+ // Build a vector from the registers
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ SmallVector<SDValue, 4> Regs;
+ Regs.push_back(Val);
+ for (unsigned j = 1; j != NumElements; ++j) {
+ Reg = ArgLocs[ArgIdx++].getLocReg();
+ Reg = MF.addLiveIn(Reg, RC);
+
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ Regs.push_back(Copy);
+ }
+
+ // Fill up the missing vector elements
+ NumElements = Arg.VT.getVectorNumElements() - NumElements;
+ Regs.append(NumElements, DAG.getUNDEF(VT));
+
+ InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
+ continue;
+ }
+
+ InVals.push_back(Val);
+ }
+
+ // Start adding system SGPRs.
+ if (IsEntryFunc) {
+ allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
+ } else {
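+ // These registers have fixed roles in callable functions; reserve them so
+ // the calling convention never assigns incoming arguments to them.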
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+ CCInfo.AllocateReg(Info->getFrameOffsetReg());
+ }
+
+ return Chains.empty() ? Chain :
+ DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+}
+
+// TODO: If return values can't fit in registers, we should return as many as
+// possible in registers before passing on stack.
+bool SITargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv,
+ MachineFunction &MF, bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ // Replacing returns with sret/stack usage doesn't make sense for shaders.
+ // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
+ // for shaders. Vector types should be explicitly handled by CC.
+ if (AMDGPU::isEntryFunctionCC(CallConv))
+ return true;
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
}
SDValue
@@ -1118,11 +1467,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (!AMDGPU::isShader(CallConv))
+ if (AMDGPU::isKernel(CallConv)) {
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
OutVals, DL, DAG);
+ }
+
+ bool IsShader = AMDGPU::isShader(CallConv);
Info->setIfReturnsVoid(Outs.size() == 0);
+ bool IsWaveEnd = Info->returnsVoid() && IsShader;
SmallVector<ISD::OutputArg, 48> Splits;
SmallVector<SDValue, 48> SplitVals;
@@ -1131,7 +1484,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
const ISD::OutputArg &Out = Outs[i];
- if (Out.VT.isVector()) {
+ if (IsShader && Out.VT.isVector()) {
MVT VT = Out.VT.getVectorElementType();
ISD::OutputArg NewOut = Out;
NewOut.Flags.setSplit();
@@ -1162,29 +1515,58 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
*DAG.getContext());
// Analyze outgoing return values.
- AnalyzeReturn(CCInfo, Splits);
+ CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Add return address for callable functions.
+ if (!Info->isEntryFunction()) {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+ SDValue ReturnAddrReg = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+ // FIXME: Should be able to use a vreg here, but need a way to prevent it
+ // from being allocated to a CSR.
+
+ SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+ MVT::i64);
+
+ Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+ Flag = Chain.getValue(1);
+
+ RetOps.push_back(PhysReturnAddrReg);
+ }
+
// Copy the result values into the output registers.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
+ // TODO: Partially return in registers if return values don't fit.
SDValue Arg = SplitVals[realRVLocIdx];
// Copied from other backends.
switch (VA.getLocInfo()) {
- default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
}
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
@@ -1192,12 +1574,16 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ // FIXME: Does sret work properly?
+
// Update chain and glue.
RetOps[0] = Chain;
if (Flag.getNode())
RetOps.push_back(Flag);
- unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+ unsigned Opc = AMDGPUISD::ENDPGM;
+ if (!IsWaveEnd)
+ Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
@@ -1436,7 +1822,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
const TargetRegisterClass *SuperRC,
unsigned VecReg,
int Offset) {
- int NumElts = SuperRC->getSize() / 4;
+ int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
// Skip out of bounds offsets, or else we would end up using an undefined
// register.
@@ -1470,16 +1856,16 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
if (Offset == 0) {
MachineInstr *SetOn =
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addOperand(*Idx)
- .addImm(IdxMode);
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .add(*Idx)
+ .addImm(IdxMode);
SetOn->getOperand(3).setIsUndef();
} else {
unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
- .addOperand(*Idx)
- .addImm(Offset);
+ .add(*Idx)
+ .addImm(Offset);
MachineInstr *SetOn =
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(Tmp, RegState::Kill)
@@ -1493,10 +1879,10 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
if (Offset == 0) {
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addOperand(*Idx);
+ .add(*Idx);
} else {
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addOperand(*Idx)
+ .add(*Idx)
.addImm(Offset);
}
@@ -1522,7 +1908,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
std::tie(SubReg, Offset)
= computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
- bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+ bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
MachineBasicBlock::iterator I(&MI);
@@ -1548,7 +1934,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return &MBB;
}
-
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
@@ -1586,17 +1971,18 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return LoopBB;
}
-static unsigned getMOVRELDPseudo(const TargetRegisterClass *VecRC) {
- switch (VecRC->getSize()) {
- case 4:
+static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
+ const TargetRegisterClass *VecRC) {
+ switch (TRI.getRegSizeInBits(*VecRC)) {
+ case 32: // 4 bytes
return AMDGPU::V_MOVRELD_B32_V1;
- case 8:
+ case 64: // 8 bytes
return AMDGPU::V_MOVRELD_B32_V2;
- case 16:
+ case 128: // 16 bytes
return AMDGPU::V_MOVRELD_B32_V4;
- case 32:
+ case 256: // 32 bytes
return AMDGPU::V_MOVRELD_B32_V8;
- case 64:
+ case 512: // 64 bytes
return AMDGPU::V_MOVRELD_B32_V16;
default:
llvm_unreachable("unsupported size for MOVRELD pseudos");
@@ -1625,7 +2011,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
SrcVec->getReg(),
Offset);
- bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+ bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
if (Idx->getReg() == AMDGPU::NoRegister) {
MachineBasicBlock::iterator I(&MI);
@@ -1634,9 +2020,9 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
assert(Offset == 0);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
- .addOperand(*SrcVec)
- .addOperand(*Val)
- .addImm(SubReg);
+ .add(*SrcVec)
+ .add(*Val)
+ .addImm(SubReg);
MI.eraseFromParent();
return &MBB;
@@ -1648,20 +2034,20 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
if (UseGPRIdxMode) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
- .addOperand(*Val)
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(SrcVec->getReg(), RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
+ .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
+ .add(*Val)
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(SrcVec->getReg(), RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
BuildMI(MBB, I, DL, MovRelDesc)
.addReg(Dst, RegState::Define)
.addReg(SrcVec->getReg())
- .addOperand(*Val)
+ .add(*Val)
.addImm(SubReg - AMDGPU::sub0);
}
@@ -1694,18 +2080,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
if (UseGPRIdxMode) {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(PhiReg, RegState::Undef, SubReg) // vdst
- .addOperand(*Val) // src0
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(PhiReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
+ .addReg(PhiReg, RegState::Undef, SubReg) // vdst
+ .add(*Val) // src0
+ .addReg(Dst, RegState::ImplicitDefine)
+ .addReg(PhiReg, RegState::Implicit)
+ .addReg(AMDGPU::M0, RegState::Implicit);
} else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(VecRC));
+ const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
.addReg(Dst, RegState::Define)
.addReg(PhiReg)
- .addOperand(*Val)
+ .add(*Val)
.addImm(SubReg - AMDGPU::sub0);
}
@@ -1741,18 +2127,76 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
switch (MI.getOpcode()) {
- case AMDGPU::SI_INIT_M0: {
+ case AMDGPU::SI_INIT_M0:
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addOperand(MI.getOperand(0));
+ .add(MI.getOperand(0));
+ MI.eraseFromParent();
+ return BB;
+
+ case AMDGPU::SI_INIT_EXEC:
+ // This should be before all vector instructions.
+ BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::EXEC)
+ .addImm(MI.getOperand(0).getImm());
+ MI.eraseFromParent();
+ return BB;
+
+ case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
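+ // A count of 64 wraps the S_BFM_B64 result to zero, which is why the
+ // CMP/CMOV pair afterwards overrides EXEC with all ones.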
+ MachineInstr *FirstMI = &*BB->begin();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned InputReg = MI.getOperand(0).getReg();
+ unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ bool Found = false;
+
+ // Move the COPY of the input reg to the beginning, so that we can use it.
+ for (auto I = BB->begin(); I != &MI; I++) {
+ if (I->getOpcode() != TargetOpcode::COPY ||
+ I->getOperand(0).getReg() != InputReg)
+ continue;
+
+ if (I == FirstMI) {
+ FirstMI = &*++BB->begin();
+ } else {
+ I->removeFromParent();
+ BB->insert(FirstMI, &*I);
+ }
+ Found = true;
+ break;
+ }
+ assert(Found);
+ (void)Found;
+
+ // This should be before all vector instructions.
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
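+ // The BFE source operand packs {offset, width}: the low bits hold the
+ // shift taken from operand 1, and 0x70000 encodes a 7-bit field width
+ // (enough for a count of up to 64) in bits [22:16].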
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
+ AMDGPU::EXEC)
+ .addReg(CountReg)
+ .addImm(0);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(64);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
+ AMDGPU::EXEC)
+ .addImm(-1);
MI.eraseFromParent();
return BB;
}
+
case AMDGPU::GET_GROUPSTATICSIZE: {
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
- .addOperand(MI.getOperand(0))
- .addImm(MFI->getLDSSize());
+ .add(MI.getOperand(0))
+ .addImm(MFI->getLDSSize());
MI.eraseFromParent();
return BB;
}
@@ -1803,7 +2247,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
- .addOperand(MI.getOperand(0));
+ .add(MI.getOperand(0));
Br->getOperand(1).setIsUndef(true); // read undef SCC
MI.eraseFromParent();
return BB;
@@ -1856,9 +2300,6 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
- if (!VT.isSimple())
- return false;
-
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
// This is as fast on some subtargets. However, we always have full rate f32
@@ -1909,13 +2350,74 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
- case ISD::TRAP: return lowerTRAP(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT:
+ return lowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT:
+ return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
+
+ case ISD::TRAP:
+ case ISD::DEBUGTRAP:
+ return lowerTRAP(Op, DAG);
}
return SDValue();
}
+void SITargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ case ISD::INSERT_VECTOR_ELT: {
+ if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
+ Results.push_back(Res);
+ return;
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
+ Results.push_back(Res);
+ return;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
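+ // CVT_PKRTZ_F16_F32 produces its result as an i32; bitcast it to the
+ // v2f16 the intrinsic returns, since v2f16 is not a legal type here.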
+ SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
+ Src0, Src1);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
+ return;
+ }
+ break;
+ }
+ case ISD::SELECT: {
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
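+ // Select on a bitcast-equivalent integer type and cast back, widening
+ // sub-32-bit values to i32 so the integer select is legal.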
+ SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
+ SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
+
+ EVT SelectVT = NewVT;
+ if (NewVT.bitsLT(MVT::i32)) {
+ LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
+ RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
+ SelectVT = MVT::i32;
+ }
+
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
+ N->getOperand(0), LHS, RHS);
+
+ if (NewVT != SelectVT)
+ NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
+ return;
+ }
+ default:
+ break;
+ }
+}
+
/// \brief Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
@@ -1932,31 +2434,25 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
return nullptr;
}
-bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
- case AMDGPUIntrinsic::amdgcn_if:
- case AMDGPUIntrinsic::amdgcn_else:
- case AMDGPUIntrinsic::amdgcn_end_cf:
- case AMDGPUIntrinsic::amdgcn_loop:
- return true;
+ case Intrinsic::amdgcn_if:
+ return AMDGPUISD::IF;
+ case Intrinsic::amdgcn_else:
+ return AMDGPUISD::ELSE;
+ case Intrinsic::amdgcn_loop:
+ return AMDGPUISD::LOOP;
+ case Intrinsic::amdgcn_end_cf:
+ llvm_unreachable("should not occur");
default:
- return false;
+ return 0;
}
}
- if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
- switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
- case AMDGPUIntrinsic::amdgcn_break:
- case AMDGPUIntrinsic::amdgcn_if_break:
- case AMDGPUIntrinsic::amdgcn_else_break:
- return true;
- default:
- return false;
- }
- }
-
- return false;
+ // break, if_break, else_break are all only used as inputs to loop, not
+ // directly as branch conditions.
+ return 0;
}
void SITargetLowering::createDebuggerPrologueStackObjects(
@@ -1987,13 +2483,13 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
- return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+ return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -2006,7 +2502,6 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
/// last parameter, also switches branch target with BR if the need arises
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SelectionDAG &DAG) const {
-
SDLoc DL(BRCOND);
SDNode *Intr = BRCOND.getOperand(1).getNode();
@@ -2032,7 +2527,8 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
// eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
// => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
- if (!isCFIntrinsic(Intr)) {
+ unsigned CFNode = isCFIntrinsic(Intr);
+ if (CFNode == 0) {
// This is a uniform branch so we don't need to legalize.
return BRCOND;
}
@@ -2050,15 +2546,13 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
if (HaveChain)
Ops.push_back(BRCOND.getOperand(0));
- Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
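+ // Skip the chain and the intrinsic ID operand; the new node is built from
+ // the remaining real operands plus the branch target.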
+ Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
Ops.push_back(Target);
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
// build the new intrinsic call
- SDNode *Result = DAG.getNode(
- Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
- DAG.getVTList(Res), Ops).getNode();
+ SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
if (!HaveChain) {
SDValue Ops[] = {
@@ -2127,12 +2621,82 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
- return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);;
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
+}
+
+SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Chain = Op.getOperand(0);
+
+ unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
+ SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
+
+ if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
+ Subtarget->isTrapHandlerEnabled()) {
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
+
+ SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
+ QueuePtr, SDValue());
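+ // The HSA trap handler expects the queue pointer in SGPR0/1, so copy it
+ // there and pass the register pair along as an operand of the trap node.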
+
+ SDValue Ops[] = {
+ ToReg,
+ DAG.getTargetConstant(TrapID, SL, MVT::i16),
+ SGPR01,
+ ToReg.getValue(1)
+ };
+
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+ }
+
+ switch (TrapID) {
+ case SISubtarget::TrapIDLLVMTrap:
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+ case SISubtarget::TrapIDLLVMDebugTrap: {
+ DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ "debugtrap handler not supported",
+ Op.getDebugLoc(),
+ DS_Warning);
+ LLVMContext &Ctx = MF.getFunction()->getContext();
+ Ctx.diagnose(NoTrap);
+ return Chain;
+ }
+ default:
+ llvm_unreachable("unsupported trap handler type!");
+ }
+
+ return Chain;
}
-SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const {
- SDLoc SL;
+ // FIXME: Use inline constants (src_{shared, private}_base) instead.
+ if (Subtarget->hasApertureRegs()) {
+ unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
+ unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
+ AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
+ AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
+ unsigned Encoding =
+ AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
+ Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
+ WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
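+ // Pack the s_getreg immediate: hardware register id plus the offset and
+ // width-1 of the field to read. Shifting the field up by its width below
+ // yields the high 32 bits of the 64-bit aperture base address.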
+
+ SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
+ SDValue ApertureReg = SDValue(
+ DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
+ SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
+ return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
+ }
+
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
unsigned UserSGPR = Info->getQueuePtrUserSGPR();
@@ -2143,19 +2707,19 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS,
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
- uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+ uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
- DAG.getConstant(StructOffset, SL, MVT::i64));
+ SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
+ DAG.getConstant(StructOffset, DL, MVT::i64));
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
// be available and how do we get it?
Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
- AMDGPUAS::CONSTANT_ADDRESS));
+ AMDGPUASI.CONSTANT_ADDRESS));
MachinePointerInfo PtrInfo(V, StructOffset);
- return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+ return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
MinAlign(64, StructOffset),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
@@ -2167,15 +2731,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
SDValue Src = ASC->getOperand(0);
-
- // FIXME: Really support non-0 null pointers.
- SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+ const AMDGPUTargetMachine &TM =
+ static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
+
// flat -> local/private
- if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
- if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+ unsigned DestAS = ASC->getDestAddressSpace();
+
+ if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
+ DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
+ unsigned NullVal = TM.getNullPointerValue(DestAS);
+ SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
@@ -2185,13 +2753,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
// local/private -> flat
- if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
- if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+ unsigned SrcAS = ASC->getSrcAddressSpace();
+
+ if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
+ SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
+ unsigned NullVal = TM.getNullPointerValue(SrcAS);
+ SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
+
SDValue NonNull
= DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
- SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
SDValue CvtPtr
= DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
@@ -2211,17 +2784,97 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
return DAG.getUNDEF(ASC->getValueType(0));
}
+SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Idx = Op.getOperand(2);
+ if (isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Avoid stack access for dynamic indexing.
+ SDLoc SL(Op);
+ SDValue Vec = Op.getOperand(0);
+ SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+
+ // v_bfi_b32 (v_bfm_b32 16, (shl idx, 4)), (shl val, bit-index), vec
+ SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+
+ // Convert vector index to bit-index (16 bits per element, so shift by 4).
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
+ DAG.getConstant(4, SL, MVT::i32));
+
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+
+ SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
+ DAG.getConstant(0xffff, SL, MVT::i32),
+ ScaledIdx);
+
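+ // e.g. for Idx = 1: ScaledIdx = 16 and BFM = 0xffff0000, so the high half
+ // is taken from the shifted value and the low half from the original vec.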
+ // Shift the new value into its element's bit position before masking it in.
+ SDValue ShlVal = DAG.getNode(ISD::SHL, SL, MVT::i32, ExtVal, ScaledIdx);
+ SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ShlVal);
+ SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
+ DAG.getNOT(SL, BFM, MVT::i32), BCVec);
+
+ SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
+ return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
+}
+
+SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+
+ EVT ResultVT = Op.getValueType();
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+
+ DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
+
+ // Make sure we do any optimizations that will make it easier to fold
+ // source modifiers before obscuring the value with bit operations.
+
+ // XXX - Why doesn't this get called when vector_shuffle is expanded?
+ if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
+ return Combined;
+
+ if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+
+ if (CIdx->getZExtValue() == 1) {
+ Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
+ DAG.getConstant(16, SL, MVT::i32));
+ } else {
+ assert(CIdx->getZExtValue() == 0);
+ }
+
+ if (ResultVT.bitsLT(MVT::i32))
+ Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ }
+
+ // Convert vector index to bit-index (16 bits per element, so shift by 4).
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
+ DAG.getConstant(4, SL, MVT::i32));
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
+
+ SDValue Result = Elt;
+ if (ResultVT.bitsLT(MVT::i32))
+ Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+
+ return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+}
+
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
- return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+ return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
-static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
- SDLoc DL, unsigned Offset, EVT PtrVT,
- unsigned GAFlags = SIInstrInfo::MO_NONE) {
+static SDValue
+buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
+ const SDLoc &DL, unsigned Offset, EVT PtrVT,
+ unsigned GAFlags = SIInstrInfo::MO_NONE) {
// In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
// lowered to the following code sequence:
//
@@ -2265,8 +2918,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
- GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
@@ -2283,7 +2936,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SIInstrInfo::MO_GOTPCREL32);
Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
const DataLayout &DataLayout = DAG.getDataLayout();
unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
// FIXME: Use a PseudoSourceValue once those can be assigned an address space.
@@ -2294,23 +2947,6 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
MachineMemOperand::MOInvariant);
}
-SDValue SITargetLowering::lowerTRAP(SDValue Op,
- SelectionDAG &DAG) const {
- const MachineFunction &MF = DAG.getMachineFunction();
- DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
- "trap handler not supported",
- Op.getDebugLoc(),
- DS_Warning);
- DAG.getContext()->diagnose(NoTrap);
-
- // Emit s_endpgm.
-
- // FIXME: This should really be selected to s_trap, but that requires
- // setting up the trap handler for it o do anything.
- return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
- Op.getOperand(0));
-}
-
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
const SDLoc &DL, SDValue V) const {
// We can't use S_MOV_B32 directly, because there is no way to specify m0 as
@@ -2332,14 +2968,15 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
MVT VT,
unsigned Offset) const {
SDLoc SL(Op);
- SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, false);
+ SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
+ DAG.getEntryNode(), Offset, false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
}
-static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT) {
DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
"non-hsa intrinsic with hsa target",
DL.getDebugLoc());
@@ -2347,7 +2984,8 @@ static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
return DAG.getUNDEF(VT);
}
-static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) {
+static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT) {
DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
"intrinsic not supported on subtarget",
DL.getDebugLoc());
@@ -2369,7 +3007,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+ if (getSubtarget()->isAmdCodeObjectV2(MF))
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
+ unsigned Reg = TRI->getPreloadedValue(MF,
+ SIRegisterInfo::IMPLICIT_BUFFER_PTR);
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
}
case Intrinsic::amdgcn_dispatch_ptr:
@@ -2389,7 +3031,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_implicitarg_ptr: {
unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
- return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+ return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
unsigned Reg
@@ -2403,19 +3045,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rcp:
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq:
- case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
- case Intrinsic::amdgcn_rsq_legacy: {
+ case Intrinsic::amdgcn_rsq_legacy:
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
- }
- case Intrinsic::amdgcn_rcp_legacy: {
+ case Intrinsic::amdgcn_rcp_legacy:
if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
- }
case Intrinsic::amdgcn_rsq_clamp: {
if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
@@ -2434,38 +3073,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_X, false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_Y, false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::NGROUPS_Z, false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
- return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+ return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -2522,43 +3161,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
- case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+ case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
- }
- case AMDGPUIntrinsic::SI_vs_load_input:
- return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
-
- case AMDGPUIntrinsic::SI_fs_constant: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
- SDValue Glue = M0.getValue(1);
- return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
- DAG.getConstant(2, DL, MVT::i32), // P0
- Op.getOperand(1), Op.getOperand(2), Glue);
- }
- case AMDGPUIntrinsic::SI_packf16:
- if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
- return DAG.getUNDEF(MVT::i32);
- return Op;
- case AMDGPUIntrinsic::SI_fs_interp: {
- SDValue IJ = Op.getOperand(4);
- SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
- DAG.getConstant(0, DL, MVT::i32));
- SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
- DAG.getConstant(1, DL, MVT::i32));
- I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I);
- J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J);
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
- SDValue Glue = M0.getValue(1);
- SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
- DAG.getVTList(MVT::f32, MVT::Glue),
- I, Op.getOperand(1), Op.getOperand(2), Glue);
- Glue = SDValue(P1.getNode(), 1);
- return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
- Op.getOperand(1), Op.getOperand(2), Glue);
- }
case Intrinsic::amdgcn_interp_mov: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
SDValue Glue = M0.getValue(1);
@@ -2639,10 +3243,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_icmp: {
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- int CondCode = CD->getSExtValue();
+ if (!CD)
+ return DAG.getUNDEF(VT);
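+ // A non-constant or out-of-range condition code is malformed IR; fold
+ // the intrinsic to undef rather than asserting during selection.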
+ int CondCode = CD->getSExtValue();
if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
- CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
+ CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
return DAG.getUNDEF(VT);
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
@@ -2652,10 +3258,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_fcmp: {
const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- int CondCode = CD->getSExtValue();
+ if (!CD)
+ return DAG.getUNDEF(VT);
- if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
- CondCode >= FCmpInst::Predicate::FCMP_TRUE)
+ int CondCode = CD->getSExtValue();
+ if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
+ CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
return DAG.getUNDEF(VT);
FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
@@ -2663,14 +3271,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
Op.getOperand(2), DAG.getCondCode(CCOpcode));
}
+ case Intrinsic::amdgcn_fmed3:
+ return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_sffbh:
- case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_sbfe:
+ return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_ubfe:
+ return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ // FIXME: Stop adding the bitcast once v2f16 is legal.
+ EVT VT = Op.getValueType();
+ SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Node);
+ }
default:
- return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+ return Op;
}
}
@@ -2678,6 +3301,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
case Intrinsic::amdgcn_atomic_dec: {
@@ -2703,7 +3328,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // glc
Op.getOperand(6) // slc
};
- MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
@@ -2718,6 +3342,87 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
}
+ case Intrinsic::amdgcn_tbuffer_load: {
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // voffset
+ Op.getOperand(5), // soffset
+ Op.getOperand(6), // offset
+ Op.getOperand(7), // dfmt
+ Op.getOperand(8), // nfmt
+ Op.getOperand(9), // glc
+ Op.getOperand(10) // slc
+ };
+
+ EVT VT = Op.getValueType(); // type of the loaded value, not the rsrc
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad,
+ VT.getStoreSize(), VT.getStoreSize());
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+ // Basic sample.
+ case Intrinsic::amdgcn_image_sample:
+ case Intrinsic::amdgcn_image_sample_cl:
+ case Intrinsic::amdgcn_image_sample_d:
+ case Intrinsic::amdgcn_image_sample_d_cl:
+ case Intrinsic::amdgcn_image_sample_l:
+ case Intrinsic::amdgcn_image_sample_b:
+ case Intrinsic::amdgcn_image_sample_b_cl:
+ case Intrinsic::amdgcn_image_sample_lz:
+ case Intrinsic::amdgcn_image_sample_cd:
+ case Intrinsic::amdgcn_image_sample_cd_cl:
+
+ // Sample with comparison.
+ case Intrinsic::amdgcn_image_sample_c:
+ case Intrinsic::amdgcn_image_sample_c_cl:
+ case Intrinsic::amdgcn_image_sample_c_d:
+ case Intrinsic::amdgcn_image_sample_c_d_cl:
+ case Intrinsic::amdgcn_image_sample_c_l:
+ case Intrinsic::amdgcn_image_sample_c_b:
+ case Intrinsic::amdgcn_image_sample_c_b_cl:
+ case Intrinsic::amdgcn_image_sample_c_lz:
+ case Intrinsic::amdgcn_image_sample_c_cd:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl:
+
+ // Sample with offsets.
+ case Intrinsic::amdgcn_image_sample_o:
+ case Intrinsic::amdgcn_image_sample_cl_o:
+ case Intrinsic::amdgcn_image_sample_d_o:
+ case Intrinsic::amdgcn_image_sample_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_l_o:
+ case Intrinsic::amdgcn_image_sample_b_o:
+ case Intrinsic::amdgcn_image_sample_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_lz_o:
+ case Intrinsic::amdgcn_image_sample_cd_o:
+ case Intrinsic::amdgcn_image_sample_cd_cl_o:
+
+ // Sample with comparison and offsets.
+ case Intrinsic::amdgcn_image_sample_c_o:
+ case Intrinsic::amdgcn_image_sample_c_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_d_o:
+ case Intrinsic::amdgcn_image_sample_c_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_l_o:
+ case Intrinsic::amdgcn_image_sample_c_b_o:
+ case Intrinsic::amdgcn_image_sample_c_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_lz_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+
+ case Intrinsic::amdgcn_image_getlod: {
+ // If the dmask has every channel disabled, replace the result with undef.
+ const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
+ if (!DMask || DMask->isNullValue()) {
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
+ }
+
+ return SDValue();
+ }
default:
return SDValue();
}
@@ -2725,51 +3430,75 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::SI_sendmsg:
- case Intrinsic::amdgcn_s_sendmsg: {
- Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
- SDValue Glue = Chain.getValue(1);
- return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
- Op.getOperand(2), Glue);
+ case Intrinsic::amdgcn_exp: {
+ const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
+ const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
+ const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
+
+ const SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
+ DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
+ Op.getOperand(4), // src0
+ Op.getOperand(5), // src1
+ Op.getOperand(6), // src2
+ Op.getOperand(7), // src3
+ DAG.getTargetConstant(0, DL, MVT::i1), // compr
+ DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) // vm
+ };
+
+ unsigned Opc = Done->isNullValue() ?
+ AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+ return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+ }
+ case Intrinsic::amdgcn_exp_compr: {
+ const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
+ const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
+ SDValue Src0 = Op.getOperand(4);
+ SDValue Src1 = Op.getOperand(5);
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
+ const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
+
+ SDValue Undef = DAG.getUNDEF(MVT::f32);
+ const SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
+ DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
+ Undef, // src2
+ Undef, // src3
+ DAG.getTargetConstant(1, DL, MVT::i1), // compr
+ DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) // vm
+ };
+
+ unsigned Opc = Done->isNullValue() ?
+ AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
+ return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
}
+ case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
+ unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
+ AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
SDValue Glue = Chain.getValue(1);
- return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain,
+ return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Op.getOperand(2), Glue);
}
- case AMDGPUIntrinsic::SI_tbuffer_store: {
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2),
- Op.getOperand(3),
- Op.getOperand(4),
- Op.getOperand(5),
- Op.getOperand(6),
- Op.getOperand(7),
- Op.getOperand(8),
- Op.getOperand(9),
- Op.getOperand(10),
- Op.getOperand(11),
- Op.getOperand(12),
- Op.getOperand(13),
- Op.getOperand(14)
- };
-
- EVT VT = Op.getOperand(3).getValueType();
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ case Intrinsic::amdgcn_init_exec: {
+ return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
+ Op.getOperand(2));
+ }
+ case Intrinsic::amdgcn_init_exec_from_input: {
+ return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
+ Op.getOperand(2), Op.getOperand(3));
}
case AMDGPUIntrinsic::AMDGPU_kill: {
SDValue Src = Op.getOperand(2);
@@ -2784,31 +3513,87 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
}
- case AMDGPUIntrinsic::SI_export: {
- const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
- const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
- const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
- const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
- const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
+ case Intrinsic::amdgcn_s_barrier: {
+ if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
+ if (WGSize <= ST.getWavefrontSize())
+ return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
+ Op.getOperand(0)), 0);
+ }
+ return SDValue();
+ }
+ case AMDGPUIntrinsic::SI_tbuffer_store: {
- const SDValue Ops[] = {
- Chain,
- DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
- DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1),
- DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
- DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
- Op.getOperand(7), // src0
- Op.getOperand(8), // src1
- Op.getOperand(9), // src2
- Op.getOperand(10) // src3
+ // Extract vindex and voffset from vaddr as appropriate
+ const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
+ const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
+ SDValue VAddr = Op.getOperand(5);
+
+ SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
+
+ assert(!(OffEn->isOne() && IdxEn->isOne()) &&
+ "Legacy intrinsic doesn't support both offset and index - use new version");
+
+ SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
+ SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
+
+ // Deal with the vec-3 case
+ const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
+ auto Opcode = NumChannels->getZExtValue() == 3 ?
+ AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
+
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(3), // vdata
+ Op.getOperand(2), // rsrc
+ VIndex,
+ VOffset,
+ Op.getOperand(6), // soffset
+ Op.getOperand(7), // inst_offset
+ Op.getOperand(8), // dfmt
+ Op.getOperand(9), // nfmt
+ Op.getOperand(12), // glc
+ Op.getOperand(13), // slc
};
- unsigned Opc = Done->isNullValue() ?
- AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
- return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+ assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
+ "Value of tfe other than zero is unsupported");
+
+ EVT VT = Op.getOperand(3).getValueType();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(Opcode, DL,
+ Op->getVTList(), Ops, VT, MMO);
+ }
+
+ case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue Ops[] = {
+ Chain,
+ Op.getOperand(2), // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Op.getOperand(5), // voffset
+ Op.getOperand(6), // soffset
+ Op.getOperand(7), // offset
+ Op.getOperand(8), // dfmt
+ Op.getOperand(9), // nfmt
+ Op.getOperand(10), // glc
+ Op.getOperand(11) // slc
+ };
+ EVT VT = Op.getOperand(2).getValueType(); // vdata
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ VT.getStoreSize(), 4);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+ Op->getVTList(), Ops, VT, MMO);
}
+
default:
- return SDValue();
+ return Op;
}
}
@@ -2857,36 +3642,36 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUASI.FLAT_ADDRESS)
AS = MFI->hasFlatScratchInit() ?
- AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+ AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
- switch (AS) {
- case AMDGPUAS::CONSTANT_ADDRESS:
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
if (isMemOpUniform(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
- LLVM_FALLTHROUGH;
- case AMDGPUAS::GLOBAL_ADDRESS: {
+ }
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
- isMemOpHasNoClobberedMemOperand(Load))
+ !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- LLVM_FALLTHROUGH;
- case AMDGPUAS::FLAT_ADDRESS:
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
// v4 loads are supported for private and global memory.
return SDValue();
- case AMDGPUAS::PRIVATE_ADDRESS: {
+ }
+ if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
// Depending on the setting of the private_element_size field in the
// resource descriptor, we can only make private accesses up to a certain
// size.
@@ -2905,8 +3690,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("unsupported private_element_size");
}
- }
- case AMDGPUAS::LOCAL_ADDRESS: {
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
if (NumElements > 2)
return SplitVectorLoad(Op, DAG);
@@ -2916,9 +3700,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// If properly aligned, if we split we might be able to use ds_read_b64.
return SplitVectorLoad(Op, DAG);
}
- default:
- return SDValue();
- }
+ return SDValue();
}
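
For reference, the vector-load splitting policy implemented above can be summarized by a small predicate (illustrative only; the helper name and parameters are ours, not the patch's):

    // Maximum vector elements a single load may keep, per address space.
    unsigned maxLoadElems(bool IsFlatGlobalOrConst, bool IsLocal,
                          unsigned MaxPrivateElementSize /* 4, 8 or 16 */) {
      if (IsFlatGlobalOrConst)
        return 4;                       // v4 loads are supported
      if (IsLocal)
        return 2;                       // wider LDS loads are split
      return MaxPrivateElementSize / 4; // private: dwords per access
    }
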
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -2956,11 +3738,15 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
+ const SDNodeFlags Flags = Op->getFlags();
+ bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
+ Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
+
+ if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
+ return SDValue();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
- if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
- VT == MVT::f16) {
+ if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
if (CLHS->isExactlyValue(1.0)) {
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
@@ -2989,15 +3775,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- const SDNodeFlags *Flags = Op->getFlags();
-
- if (Unsafe || Flags->hasAllowReciprocal()) {
+ if (Unsafe) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
- SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
return SDValue();
@@ -3287,18 +4069,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUASI.FLAT_ADDRESS)
AS = MFI->hasFlatScratchInit() ?
- AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+ AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = VT.getVectorNumElements();
- switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::FLAT_ADDRESS:
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorStore(Op, DAG);
return SDValue();
- case AMDGPUAS::PRIVATE_ADDRESS: {
+ } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
case 4:
return scalarizeVectorStore(Store, DAG);
@@ -3313,8 +4094,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("unsupported private_element_size");
}
- }
- case AMDGPUAS::LOCAL_ADDRESS: {
+ } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
if (NumElements > 2)
return SplitVectorStore(Op, DAG);
@@ -3323,8 +4103,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// If properly aligned, if we split we might be able to use ds_write_b64.
return SplitVectorStore(Op, DAG);
- }
- default:
+ } else {
llvm_unreachable("unhandled address space");
}
}
@@ -3355,7 +4134,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co
unsigned AS = AtomicNode->getAddressSpace();
// No custom lowering required for local address space
- if (!isFlatGlobalAddrSpace(AS))
+ if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
return Op;
// Non-local address space requires custom lowering for atomic compare
@@ -3412,12 +4191,12 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
/// the immediate offsets of a memory instruction for the given address space.
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
const SISubtarget &STI) {
- switch (AS) {
- case AMDGPUAS::GLOBAL_ADDRESS: {
+ auto AMDGPUASI = STI.getAMDGPUAS();
+ if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
// MUBUF instructions have a 12-bit offset in bytes.
return isUInt<12>(OffsetSize);
}
- case AMDGPUAS::CONSTANT_ADDRESS: {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
// SMRD instructions have an 8-bit offset in dwords on SI and
// a 20-bit offset in bytes on VI.
if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
@@ -3425,16 +4204,13 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
else
return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
}
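
As an aside, the SMRD legality checks above can be read as the following predicate (an illustrative helper, not in the patch):

    #include <cstdint>

    // Can an SMRD immediate encode this byte offset?
    bool smrdImmFits(uint64_t OffsetSize, bool IsVIOrLater) {
      if (IsVIOrLater)
        return OffsetSize < (1ull << 20); // VI: 20-bit offset in bytes
      // SI/CI: 8-bit offset counted in dwords, so it must be dword aligned.
      return (OffsetSize % 4 == 0) && (OffsetSize / 4) < (1ull << 8);
    }
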
- case AMDGPUAS::LOCAL_ADDRESS:
- case AMDGPUAS::REGION_ADDRESS: {
+ if (AS == AMDGPUASI.LOCAL_ADDRESS ||
+ AS == AMDGPUASI.REGION_ADDRESS) {
// The single offset versions have a 16-bit offset in bytes.
return isUInt<16>(OffsetSize);
}
- case AMDGPUAS::PRIVATE_ADDRESS:
// Indirect register addressing does not use any offsets.
- default:
- return 0;
- }
+ return false;
}
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
@@ -3492,7 +4268,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
// TODO: We could also do this for multiplies.
unsigned AS = N->getAddressSpace();
- if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
+ if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
if (NewPtr) {
SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
@@ -3538,6 +4314,23 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
return SDValue();
}
+// Returns true if the argument is a boolean value which is not serialized into
+// memory or an argument and does not require v_cndmask_b32 to be deserialized.
+static bool isBoolSGPR(SDValue V) {
+ if (V.getValueType() != MVT::i1)
+ return false;
+ switch (V.getOpcode()) {
+ default: break;
+ case ISD::SETCC:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case AMDGPUISD::FP_CLASS:
+ return true;
+ }
+ return false;
+}
+
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@@ -3549,12 +4342,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
SDValue RHS = N->getOperand(1);
- if (VT == MVT::i64) {
- const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
- if (CRHS) {
- if (SDValue Split
- = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
- return Split;
+ const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (VT == MVT::i64 && CRHS) {
+ if (SDValue Split
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
+ return Split;
+ }
+
+ if (CRHS && VT == MVT::i32) {
+ // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
+ // where nb = number of trailing zeroes in mask.
+ // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
+ // given that we are selecting 8 or 16 bit fields starting at a byte boundary
+ // (a scalar sketch of the equivalence follows the enclosing block).
+ uint64_t Mask = CRHS->getZExtValue();
+ unsigned Bits = countPopulation(Mask);
+ if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
+ (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
+ if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
+ unsigned Shift = CShift->getZExtValue();
+ unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+ unsigned Offset = NB + Shift;
+ if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
+ SDLoc SL(N);
+ SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
+ LHS->getOperand(0),
+ DAG.getConstant(Offset, SL, MVT::i32),
+ DAG.getConstant(Bits, SL, MVT::i32));
+ EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+ SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
+ DAG.getValueType(NarrowVT));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
+ DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
+ return Shl;
+ }
+ }
}
}
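
A host-side sketch of the bitfield-extract equivalence the combine relies on (illustrative; __builtin_ctz/__builtin_popcount stand in for the APInt mask analysis):

    #include <cstdint>

    uint32_t andSrl(uint32_t x, unsigned c, uint32_t mask) {
      return (x >> c) & mask;                   // and (srl x, c), mask
    }

    uint32_t bfeShl(uint32_t x, unsigned c, uint32_t mask) {
      unsigned nb   = __builtin_ctz(mask);      // trailing zeros in the mask
      unsigned bits = __builtin_popcount(mask); // width of the selected field
      uint32_t bfe  = (x >> (nb + c)) & ((1u << bits) - 1); // bfe x, nb+c, bits
      return bfe << nb;                         // shl ..., nb
    }

    // For any shifted mask, e.g. mask = 0xFF00 and c = 8,
    // andSrl(x, c, mask) == bfeShl(x, c, mask).
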
@@ -3598,6 +4419,16 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
+ if (VT == MVT::i32 &&
+ (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
+ // and x, (sext cc from i1) => select cc, x, 0
+ if (RHS.getOpcode() != ISD::SIGN_EXTEND)
+ std::swap(LHS, RHS);
+ if (isBoolSGPR(RHS.getOperand(0)))
+ return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
+ LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
+ }
+
return SDValue();
}
@@ -3692,6 +4523,88 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
+// Instructions that will be lowered with a final instruction that zeros the
+// high result bits.
+// XXX - probably only need to list legal operations.
+static bool fp16SrcZerosHighBits(unsigned Opc) {
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FMA:
+ case ISD::FMAD:
+ case ISD::FCANONICALIZE:
+ case ISD::FP_ROUND:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // Fabs is lowered to a bit operation, but it's an and which will clear the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::LDEXP:
+ return true;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
+SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!Subtarget->has16BitInsts() ||
+ DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ if (Src.getValueType() != MVT::i16)
+ return SDValue();
+
+ // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
+ // FIXME: It is not universally true that the high bits are zeroed on gfx9.
+ if (Src.getOpcode() == ISD::BITCAST) {
+ SDValue BCSrc = Src.getOperand(0);
+ if (BCSrc.getValueType() == MVT::f16 &&
+ fp16SrcZerosHighBits(BCSrc.getOpcode()))
+ return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
+ }
+
+ return SDValue();
+}
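
The combine leans on a register-level fact: for the opcodes listed in fp16SrcZerosHighBits, an f16 result occupies the low half of a 32-bit VGPR with the high half already zero, so the zero-extension costs nothing. A trivial scalar model:

    #include <cstdint>

    // (i32 zext (i16 bitcast f16:x)) when the producer already cleared the
    // high bits: widening is a plain reinterpretation, i.e. free.
    uint32_t fp16ZextModel(uint16_t f16Bits) {
      return static_cast<uint32_t>(f16Bits);
    }
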
+
SDValue SITargetLowering::performClassCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -3709,27 +4622,123 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return SDValue();
}
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+ if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+ return true;
+
+ return DAG.isKnownNeverNaN(Op);
+}
+
+static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
+ const SISubtarget *ST, unsigned MaxDepth=5) {
+ // If the source is the result of another standard FP operation, it is
+ // already in canonical form.
+
+ switch (Op.getOpcode()) {
+ default:
+ break;
+
+ // These will flush denorms if required.
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FSQRT:
+ case ISD::FCEIL:
+ case ISD::FFLOOR:
+ case ISD::FMA:
+ case ISD::FMAD:
+
+ case ISD::FCANONICALIZE:
+ return true;
+
+ case ISD::FP_ROUND:
+ return Op.getValueType().getScalarType() != MVT::f16 ||
+ ST->hasFP16Denormals();
+
+ case ISD::FP_EXTEND:
+ return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
+ ST->hasFP16Denormals();
+
+ case ISD::FP16_TO_FP:
+ case ISD::FP_TO_FP16:
+ return ST->hasFP16Denormals();
+
+ // These can/will be lowered to or combined as bit operations, so their
+ // inputs need to be checked recursively.
+ case ISD::FNEG:
+ case ISD::FABS:
+ return (MaxDepth > 0) &&
+ isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
+
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FSINCOS:
+ return Op.getValueType().getScalarType() != MVT::f16;
+
+ // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
+ // For such targets we need to check their inputs recursively.
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN:
+
+ if (ST->supportsMinMaxDenormModes() &&
+ DAG.isKnownNeverNaN(Op.getOperand(0)) &&
+ DAG.isKnownNeverNaN(Op.getOperand(1)))
+ return true;
+
+ return (MaxDepth > 0) &&
+ isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
+ isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
+
+ case ISD::ConstantFP: {
+ auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
+ }
+ }
+ return false;
+}
+
// Constant fold canonicalize.
SDValue SITargetLowering::performFCanonicalizeCombine(
SDNode *N,
DAGCombinerInfo &DCI) const {
- ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
- if (!CFP)
+ SelectionDAG &DAG = DCI.DAG;
+ ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
+
+ if (!CFP) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N0.getValueType().getScalarType();
+ auto ST = getSubtarget();
+
+ if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
+ (VT == MVT::f64 && ST->hasFP64Denormals()) ||
+ (VT == MVT::f16 && ST->hasFP16Denormals())) &&
+ DAG.isKnownNeverNaN(N0))
+ return N0;
+
+ bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+
+ if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
+ isCanonicalized(DAG, N0, ST))
+ return N0;
+
return SDValue();
+ }
- SelectionDAG &DAG = DCI.DAG;
const APFloat &C = CFP->getValueAPF();
// Flush denormals to 0 if not enabled.
if (C.isDenormal()) {
EVT VT = N->getValueType(0);
- if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+ EVT SVT = VT.getScalarType();
+ if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
- if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+ if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
- if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+ if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
return DAG.getConstantFP(0.0, SDLoc(N), VT);
}
@@ -3749,7 +4758,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
}
- return SDValue(CFP, 0);
+ return N->getOperand(0);
}
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -3771,8 +4780,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) {
+SDValue SITargetLowering::performIntMed3ImmCombine(
+ SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const {
ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
if (!K1)
return SDValue();
@@ -3790,34 +4800,28 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
}
EVT VT = K0->getValueType(0);
+ unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
+ if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
+ return DAG.getNode(Med3Opc, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+ }
+ // If there isn't a 16-bit med3 operation, convert to 32-bit.
MVT NVT = MVT::i32;
unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- SDValue Tmp1, Tmp2, Tmp3;
- Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
-
- if (VT == MVT::i16) {
- Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
- Tmp1, Tmp2, Tmp3);
+ SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
- } else
- return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
- Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
-}
-
-static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
- if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
- return true;
-
- return DAG.isKnownNeverNaN(Op);
+ SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
-static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1) {
+SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
+ const SDLoc &SL,
+ SDValue Op0,
+ SDValue Op1) const {
ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
if (!K1)
return SDValue();
@@ -3831,6 +4835,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
if (Cmp == APFloat::cmpGreaterThan)
return SDValue();
+ // TODO: Check IEEE bit enabled?
+ EVT VT = K0->getValueType(0);
+ if (Subtarget->enableDX10Clamp()) {
+ // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
+ // hardware fmed3 behavior converting to a min.
+ // FIXME: Should this be allowing -0.0?
+ if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
+ }
+
+ // med3 for f16 is only available on gfx9+.
+ if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
+ return SDValue();
+
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
// signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
// give the other result, which is different from med3 with a NaN input.
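
A scalar model of the fold (illustrative; std::fmax/std::fmin mirror the quiet-NaN semantics of fmaxnum/fminnum, which is why only signaling NaNs are a concern):

    #include <cmath>

    // The matched pattern: fminnum(fmaxnum(x, K0), K1) with K0 = 0.0, K1 = 1.0.
    float minMax01(float x) {
      return std::fmin(std::fmax(x, 0.0f), 1.0f);
    }
    // fmax(NaN, 0.0f) returns 0.0f, and fmin(0.0f, 1.0f) is 0.0f, so a quiet
    // NaN input yields 0.0 -- exactly the dx10_clamp behavior of v_med3_f32,
    // which is what makes emitting AMDGPUISD::CLAMP safe here.
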
@@ -3846,6 +4864,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
unsigned Opc = N->getOpcode();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -3853,7 +4872,10 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+
+ if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
+ VT != MVT::f64 &&
+ ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
@@ -3895,7 +4917,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
- N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+ (VT == MVT::f32 || VT == MVT::f64 ||
+ (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
+ Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return Res;
}
@@ -3903,6 +4927,87 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
return SDValue();
}
+static bool isClampZeroToOne(SDValue A, SDValue B) {
+ if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
+ if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
+ // FIXME: Should this be allowing -0.0?
+ return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
+ (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
+ }
+ }
+
+ return false;
+}
+
+// FIXME: Should only worry about snans for version with chain.
+SDValue SITargetLowering::performFMed3Combine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
+ // NaNs. With a NaN input, the order of the operands may change the result.
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ SDValue Src2 = N->getOperand(2);
+
+ if (isClampZeroToOne(Src0, Src1)) {
+ // const_a, const_b, x -> clamp is safe in all cases including signaling
+ // nans.
+ // FIXME: Should this be allowing -0.0?
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
+ }
+
+ // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
+ // handling no dx10-clamp?
+ if (Subtarget->enableDX10Clamp()) {
+ // If NaN is clamped to 0, we are free to reorder the inputs.
+
+ if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+ std::swap(Src0, Src1);
+
+ if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
+ std::swap(Src1, Src2);
+
+ if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+ std::swap(Src0, Src1);
+
+ if (isClampZeroToOne(Src1, Src2))
+ return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue Src0 = N->getOperand(0);
+ SDValue Src1 = N->getOperand(1);
+ if (Src0.isUndef() && Src1.isUndef())
+ return DCI.DAG.getUNDEF(N->getValueType(0));
+ return SDValue();
+}
+
+SDValue SITargetLowering::performExtractVectorEltCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SDValue Vec = N->getOperand(0);
+
+ SelectionDAG &DAG = DCI.DAG;
+ if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx);
+ return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ }
+
+ return SDValue();
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -3915,10 +5020,9 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
- if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath ||
- (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
- cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&
+ if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N0->getFlags().hasUnsafeAlgebra() &&
+ N1->getFlags().hasUnsafeAlgebra())) &&
isFMAFasterThanFMulAndFAdd(VT)) {
return ISD::FMA;
}
@@ -3926,6 +5030,102 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return 0;
}
+SDValue SITargetLowering::performAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // add x, zext (setcc) => addcarry x, 0, setcc
+ // add x, sext (setcc) => subcarry x, 0, setcc
+ unsigned Opc = LHS.getOpcode();
+ if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
+ Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
+ std::swap(RHS, LHS);
+
+ Opc = RHS.getOpcode();
+ switch (Opc) {
+ default: break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND: {
+ auto Cond = RHS.getOperand(0);
+ if (!isBoolSGPR(Cond))
+ break;
+ SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
+ SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
+ Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
+ return DAG.getNode(Opc, SL, VTList, Args);
+ }
+ case ISD::ADDCARRY: {
+ // add x, (addcarry y, 0, cc) => addcarry x, y, cc
+ auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
+ if (!C || C->getZExtValue() != 0) break;
+ SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
+ return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
+ }
+ }
+ return SDValue();
+}
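
A 32-bit scalar model of the two carry rewrites above (illustrative helpers; cc is the i1 condition):

    #include <cstdint>

    uint32_t addCarry(uint32_t x, uint32_t y, bool cc) { return x + y + cc; }
    uint32_t subCarry(uint32_t x, uint32_t y, bool cc) { return x - y - cc; }

    // add x, zext(cc): zext(cc) is 0 or 1, so it equals addcarry x, 0, cc.
    uint32_t addZext(uint32_t x, bool cc) { return x + (cc ? 1u : 0u); }

    // add x, sext(cc): sext(cc) is 0 or -1, and adding -1 subtracts 1,
    // so it equals subcarry x, 0, cc.
    uint32_t addSext(uint32_t x, bool cc) { return x + (cc ? ~0u : 0u); }
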
+
+SDValue SITargetLowering::performSubCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ SDLoc SL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // sub is not commutative, so only a subcarry appearing as the LHS operand
+ // can be folded.
+ if (LHS.getOpcode() == ISD::SUBCARRY) {
+ // sub (subcarry x, 0, cc), y => subcarry x, y, cc
+ auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
+ if (!C || C->getZExtValue() != 0)
+ return SDValue();
+ SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
+ return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
+ }
+ return SDValue();
+}
+
+SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C || C->getZExtValue() != 0)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+
+ // addcarry (add x, y), 0, cc => addcarry x, y, cc
+ // subcarry (sub x, y), 0, cc => subcarry x, y, cc
+ unsigned LHSOpc = LHS.getOpcode();
+ unsigned Opc = N->getOpcode();
+ if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
+ (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
+ SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
+ return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::performFAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
@@ -3933,7 +5133,6 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
- assert(!VT.isVector());
SDLoc SL(N);
SDValue LHS = N->getOperand(0);
@@ -4024,6 +5223,35 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
EVT VT = LHS.getValueType();
+ ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+ auto CRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (!CRHS) {
+ CRHS = dyn_cast<ConstantSDNode>(LHS);
+ if (CRHS) {
+ std::swap(LHS, RHS);
+ CC = getSetCCSwappedOperands(CC);
+ }
+ }
+
+ if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
+ isBoolSGPR(LHS.getOperand(0))) {
+ // (setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
+ // (setcc (sext from i1 cc), -1, eq|sle|uge) => cc
+ // (setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
+ // (setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
+ // (A scalar model of these folds follows this block.)
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
+ return LHS.getOperand(0);
+ }
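
The folds hold because (sext from i1 cc) only ever produces 0 or -1; in scalar form (illustrative):

    #include <cstdint>

    // s = sext from i1 cc: 0 when cc is false, -1 (all ones) when cc is true.
    bool neAllOnes(bool cc) { int32_t s = cc ? -1 : 0; return s != -1; } // == !cc
    bool eqAllOnes(bool cc) { int32_t s = cc ? -1 : 0; return s == -1; } // ==  cc
    bool eqZero(bool cc)    { int32_t s = cc ? -1 : 0; return s == 0;  } // == !cc
    bool neZero(bool cc)    { int32_t s = cc ? -1 : 0; return s != 0;  } // ==  cc
    // The signed/unsigned orderings (sgt, ult, sle, uge, ...) collapse to these
    // four cases since only two values are possible.
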
if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
VT != MVT::f16))
@@ -4031,7 +5259,6 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
// Match isinf pattern
// (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
if (!CRHS)
@@ -4080,12 +5307,12 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
- TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+ if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
+ TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
}
@@ -4097,6 +5324,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+ case ISD::ADD:
+ return performAddCombine(N, DCI);
+ case ISD::SUB:
+ return performSubCombine(N, DCI);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY:
+ return performAddCarrySubCarryCombine(N, DCI);
case ISD::FADD:
return performFAddCombine(N, DCI);
case ISD::FSUB:
@@ -4112,7 +5346,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
- N->getValueType(0) != MVT::f64 &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
return performMinMaxCombine(N, DCI);
break;
@@ -4135,17 +5368,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
- }
case ISD::AND:
return performAndCombine(N, DCI);
case ISD::OR:
return performOrCombine(N, DCI);
case ISD::XOR:
return performXorCombine(N, DCI);
+ case ISD::ZERO_EXTEND:
+ return performZeroExtendCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
@@ -4170,6 +5404,30 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3:
return performCvtF32UByteNCombine(N, DCI);
+ case AMDGPUISD::FMED3:
+ return performFMed3Combine(N, DCI);
+ case AMDGPUISD::CVT_PKRTZ_F16_F32:
+ return performCvtPkRTZCombine(N, DCI);
+ case ISD::SCALAR_TO_VECTOR: {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
+ if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+ SDLoc SL(N);
+ SDValue Src = N->getOperand(0);
+ EVT EltVT = Src.getValueType();
+ if (EltVT == MVT::f16)
+ Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
+
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
+ }
+
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT:
+ return performExtractVectorEltCombine(N, DCI);
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -4198,6 +5456,10 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
I != E; ++I) {
+ // Don't look at users of the chain.
+ if (I.getUse().getResNo() != 0)
+ continue;
+
// Abort if we can't understand the usage
if (!I->isMachineOpcode() ||
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
@@ -4250,7 +5512,6 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Update the users of the node with the new indices
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
-
SDNode *User = Users[i];
if (!User)
continue;
@@ -4277,8 +5538,33 @@ static bool isFrameIndexOp(SDValue Op) {
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
-void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
- SelectionDAG &DAG) const {
+SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
+ SelectionDAG &DAG) const {
+ if (Node->getOpcode() == ISD::CopyToReg) {
+ RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
+ SDValue SrcVal = Node->getOperand(2);
+
+ // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
+ // to handle copies to physical registers.
+ if (SrcVal.getValueType() == MVT::i1 &&
+ TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
+ SDLoc SL(Node);
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ SDValue VReg = DAG.getRegister(
+ MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
+
+ SDNode *Glued = Node->getGluedNode();
+ SDValue ToVReg
+ = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
+ SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
+ SDValue ToResultReg
+ = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
+ VReg, ToVReg.getValue(1));
+ DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
+ DAG.RemoveDeadNode(Node);
+ return ToResultReg.getNode();
+ }
+ }
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
@@ -4294,6 +5580,7 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
}
DAG.UpdateNodeOperands(Node, Ops);
+ return Node;
}
/// \brief Fold the instructions after selecting them.
@@ -4460,15 +5747,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
- const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
- SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
-
- return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
- cast<RegisterSDNode>(VReg)->getReg(), VT);
-}
-
//===----------------------------------------------------------------------===//
// SI Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -4496,6 +5774,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
case 256:
return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ case 512:
+ return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
}
case 'v':
@@ -4549,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
}
return TargetLowering::getConstraintType(Constraint);
}
+
+// Figure out which registers should be reserved for stack access. Only after
+// the function is legalized do we know all of the non-spill stack objects or if
+// calls are present.
+void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+ if (Info->isEntryFunction()) {
+ // Callable functions have fixed registers used for stack access.
+ reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+ }
+
+ // We have to assume the SP is needed in case there are calls in the function
+ // during lowering. Calls are only detected after the function is
+ // lowered. We're about to reserve registers, so don't reserve the SP if we
+ // aren't really going to use it.
+ bool NeedSP = !Info->isEntryFunction() ||
+ MFI.hasVarSizedObjects() ||
+ MFI.hasCalls();
+
+ if (NeedSP) {
+ unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
+ Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+
+ assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
+ assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
+ Info->getStackPtrOffsetReg()));
+ MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
+ }
+
+ MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
+ MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
+ MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
+ Info->getScratchWaveOffsetReg());
+
+ TargetLoweringBase::finalizeLowering(MF);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 6c04e4f..e6bb3d6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -21,11 +21,17 @@
namespace llvm {
class SITargetLowering final : public AMDGPUTargetLowering {
- SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
- unsigned Offset) const;
- SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
- SDValue Chain, unsigned Offset, bool Signed,
- const ISD::InputArg *Arg = nullptr) const;
+ SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Chain, uint64_t Offset) const;
+ SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Chain,
+ uint64_t Offset, bool Signed,
+ const ISD::InputArg *Arg = nullptr) const;
+
+ SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+ const SDLoc &SL, SDValue Chain,
+ const ISD::InputArg &Arg) const;
+
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -55,11 +61,19 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SDLoc &DL,
EVT VT) const;
+ SDValue convertArgType(
+ SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
+ bool Signed, const ISD::InputArg *Arg = nullptr) const;
+
/// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
- SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+ SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
+ SelectionDAG &DAG) const;
+
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
@@ -79,13 +93,24 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1) const;
+ SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0, const SDNode *N1) const;
+ SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -94,7 +119,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
- bool isCFIntrinsic(const SDNode *Intr) const;
+ unsigned isCFIntrinsic(const SDNode *Intr) const;
void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
@@ -115,15 +140,22 @@ public:
const SISubtarget *getSubtarget() const;
+ bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+ EVT /*VT*/) const override;
+
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
unsigned IntrinsicID) const override;
- bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
- EVT /*VT*/) const override;
+ bool getAddrModeArguments(IntrinsicInst * /*I*/,
+ SmallVectorImpl<Value*> &/*Ops*/,
+ Type *&/*AccessTy*/) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
+ bool canMergeStoresTo(unsigned AS, EVT MemVT,
+ const SelectionDAG &DAG) const override;
+
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
unsigned Align,
bool *IsFast) const override;
@@ -155,7 +187,12 @@ public:
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
+ SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
@@ -175,14 +212,15 @@ public:
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
- SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const override;
- void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
+ SDNode *legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const;
MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL,
SDValue Ptr) const;
@@ -194,6 +232,8 @@ public:
ConstraintType getConstraintType(StringRef Constraint) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
SDValue V) const;
+
+ void finalizeLowering(MachineFunction &MF) const override;
};
} // End namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 91e4bf7..ba346d2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -1,4 +1,4 @@
-//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
+//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,33 +12,46 @@
/// branches when it's expected that jumping over the untaken control flow will
/// be cheaper than having every workitem no-op through it.
//
+//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
#define DEBUG_TYPE "si-insert-skips"
-namespace {
-
static cl::opt<unsigned> SkipThresholdFlag(
"amdgpu-skip-threshold",
cl::desc("Number of instructions before jumping over divergent control flow"),
cl::init(12), cl::Hidden);
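+// Editor's note (illustrative, not part of the original change): this cl::opt
+// can be set when invoking the backend directly, e.g.:
+//   llc -march=amdgcn -amdgpu-skip-threshold=4 foo.ll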
+namespace {
+
class SIInsertSkips : public MachineFunctionPass {
private:
- const SIRegisterInfo *TRI;
- const SIInstrInfo *TII;
- unsigned SkipThreshold;
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ unsigned SkipThreshold = 0;
bool shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
@@ -55,8 +68,7 @@ private:
public:
static char ID;
- SIInsertSkips() :
- MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+ SIInsertSkips() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -69,7 +81,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
char SIInsertSkips::ID = 0;
@@ -195,8 +207,8 @@ void SIInsertSkips::kill(MachineInstr &MI) {
}
} else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
- .addImm(0)
- .addOperand(Op);
+ .addImm(0)
+ .add(Op);
}
}
@@ -251,6 +263,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
+ bool HaveSkipBlock = false;
if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
// Reached convergence point for last divergent branch.
@@ -270,27 +283,33 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
- case AMDGPU::SI_MASK_BRANCH: {
+ case AMDGPU::SI_MASK_BRANCH:
ExecBranchStack.push_back(MI.getOperand(0).getMBB());
MadeChange |= skipMaskBranch(MI, MBB);
break;
- }
- case AMDGPU::S_BRANCH: {
+
+ case AMDGPU::S_BRANCH:
// Optimize out branches to the next block.
// FIXME: Shouldn't this be handled by BranchFolding?
- if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ MI.eraseFromParent();
+ } else if (HaveSkipBlock) {
+ // When a skip block has been inserted after the current one, remove the
+ // unconditional branch so that, when the exec mask is non-zero, execution
+ // can skip the two instructions performing the kill.
MI.eraseFromParent();
+ }
break;
- }
- case AMDGPU::SI_KILL_TERMINATOR: {
+
+ case AMDGPU::SI_KILL_TERMINATOR:
MadeChange = true;
kill(MI);
if (ExecBranchStack.empty()) {
if (skipIfDead(MI, *NextBB)) {
+ HaveSkipBlock = true;
NextBB = std::next(BI);
BE = MF.end();
- Next = MBB.end();
}
} else {
HaveKill = true;
@@ -298,15 +317,15 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
break;
- }
- case AMDGPU::SI_RETURN: {
+
+ case AMDGPU::SI_RETURN_TO_EPILOG:
// FIXME: Should move somewhere else
assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
// Graphics shaders returning non-void shouldn't contain S_ENDPGM,
// because external bytecode will be appended at the end.
if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
- // SI_RETURN is not the last instruction. Add an empty block at
+ // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
// the end and jump there.
if (!EmptyMBBAtEnd) {
EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
@@ -318,7 +337,8 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
.addMBB(EmptyMBBAtEnd);
I->eraseFromParent();
}
- }
+ break;
+
default:
break;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
new file mode 100644
index 0000000..0f009a4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -0,0 +1,1882 @@
+//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "si-insert-waitcnts"
+
+using namespace llvm;
+
+namespace {
+
+// Class of object that encapsulates the latest instruction counter score
+// associated with an operand. Used for determining whether an s_waitcnt
+// instruction needs to be emitted.
+
+#define CNT_MASK(t) (1u << (t))
+
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+
+typedef std::pair<signed, signed> RegInterval;
+
+struct {
+ int32_t VmcntMax;
+ int32_t ExpcntMax;
+ int32_t LgkmcntMax;
+ int32_t NumVGPRsMax;
+ int32_t NumSGPRsMax;
+} HardwareLimits;
+
+struct {
+ unsigned VGPR0;
+ unsigned VGPRL;
+ unsigned SGPR0;
+ unsigned SGPRL;
+} RegisterEncoding;
+
+enum WaitEventType {
+ VMEM_ACCESS, // vector-memory read & write
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ NUM_WAIT_EVENTS,
+};
+
+// The mapping is:
+// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
+// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
+// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// We reserve a fixed number of VGPR slots in the scoring tables for
+// special tokens like SCMEM_LDS (needed for buffer load to LDS).
+enum RegisterMapping {
+ SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
+ SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
+ NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
+ EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
+ NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
+};
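+
+// A minimal worked example of the flattened indexing above (editor's sketch;
+// the helper names are hypothetical, the values follow from the enum):
+//   int vgprSlot(int V) { return V; }                 // v3  -> 3
+//   int ldsSlot()       { return SQ_MAX_PGM_VGPRS; }  // lds -> 256
+//   int sgprSlot(int S) { return NUM_ALL_VGPRS + S; } // s3  -> 260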
+
+#define ForAllWaitEventType(w) \
+ for (enum WaitEventType w = (enum WaitEventType)0; \
+ (w) < (enum WaitEventType)NUM_WAIT_EVENTS; \
+ (w) = (enum WaitEventType)((w) + 1))
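+
+// Usage sketch for the macro above (illustrative only): it walks every
+// WaitEventType value in declaration order, e.g.:
+//   ForAllWaitEventType(W) {
+//     dbgs() << "event " << (int)W << '\n';
+//   }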
+
+// This is a per-basic-block object that maintains the current score bracket
+// of each wait counter, and a per-register scoreboard for each wait counter.
+// We also maintain the latest score for every event type that can change the
+// waitcnt, in order to know whether there are multiple types of events within
+// the brackets. When multiple event types occur within a bracket, the wait
+// count may be decremented out of order, so we need to put in an
+// "s_waitcnt 0" before the use.
+class BlockWaitcntBrackets {
+public:
+ static int32_t getWaitCountMax(InstCounterType T) {
+ switch (T) {
+ case VM_CNT:
+ return HardwareLimits.VmcntMax;
+ case LGKM_CNT:
+ return HardwareLimits.LgkmcntMax;
+ case EXP_CNT:
+ return HardwareLimits.ExpcntMax;
+ default:
+ break;
+ }
+ return 0;
+ }
+
+ void setScoreLB(InstCounterType T, int32_t Val) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return;
+ ScoreLBs[T] = Val;
+ }
+
+ void setScoreUB(InstCounterType T, int32_t Val) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return;
+ ScoreUBs[T] = Val;
+ if (T == EXP_CNT) {
+ int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
+ if (ScoreLBs[T] < UB)
+ ScoreLBs[T] = UB;
+ }
+ }
+
+ int32_t getScoreLB(InstCounterType T) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return 0;
+ return ScoreLBs[T];
+ }
+
+ int32_t getScoreUB(InstCounterType T) {
+ assert(T < NUM_INST_CNTS);
+ if (T >= NUM_INST_CNTS)
+ return 0;
+ return ScoreUBs[T];
+ }
+
+ // Mapping from event to counter.
+ InstCounterType eventCounter(WaitEventType E) {
+ switch (E) {
+ case VMEM_ACCESS:
+ return VM_CNT;
+ case LDS_ACCESS:
+ case GDS_ACCESS:
+ case SQ_MESSAGE:
+ case SMEM_ACCESS:
+ return LGKM_CNT;
+ case EXP_GPR_LOCK:
+ case GDS_GPR_LOCK:
+ case VMW_GPR_LOCK:
+ case EXP_POS_ACCESS:
+ case EXP_PARAM_ACCESS:
+ return EXP_CNT;
+ default:
+ llvm_unreachable("unhandled event type");
+ }
+ return NUM_INST_CNTS;
+ }
+
+ void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
+ if (GprNo < NUM_ALL_VGPRS) {
+ if (GprNo > VgprUB) {
+ VgprUB = GprNo;
+ }
+ VgprScores[T][GprNo] = Val;
+ } else {
+ assert(T == LGKM_CNT);
+ if (GprNo - NUM_ALL_VGPRS > SgprUB) {
+ SgprUB = GprNo - NUM_ALL_VGPRS;
+ }
+ SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+ }
+ }
+
+ int32_t getRegScore(int GprNo, InstCounterType T) {
+ if (GprNo < NUM_ALL_VGPRS) {
+ return VgprScores[T][GprNo];
+ }
+ return SgprScores[GprNo - NUM_ALL_VGPRS];
+ }
+
+ void clear() {
+ memset(ScoreLBs, 0, sizeof(ScoreLBs));
+ memset(ScoreUBs, 0, sizeof(ScoreUBs));
+ memset(EventUBs, 0, sizeof(EventUBs));
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ memset(VgprScores[T], 0, sizeof(VgprScores[T]));
+ }
+ memset(SgprScores, 0, sizeof(SgprScores));
+ }
+
+ RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI, unsigned OpNo,
+ bool Def) const;
+
+ void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+ unsigned OpNo, int32_t Val);
+
+ void setWaitAtBeginning() { WaitAtBeginning = true; }
+ void clearWaitAtBeginning() { WaitAtBeginning = false; }
+ bool getWaitAtBeginning() const { return WaitAtBeginning; }
+ void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
+ int32_t getMaxVGPR() const { return VgprUB; }
+ int32_t getMaxSGPR() const { return SgprUB; }
+ int32_t getEventUB(enum WaitEventType W) const {
+ assert(W < NUM_WAIT_EVENTS);
+ return EventUBs[W];
+ }
+ bool counterOutOfOrder(InstCounterType T);
+ unsigned int updateByWait(InstCounterType T, int ScoreToWait);
+ void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, WaitEventType E,
+ MachineInstr &MI);
+
+ BlockWaitcntBrackets()
+ : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false),
+ MixedExpTypes(false), LoopRegion(nullptr), PostOrder(0),
+ Waitcnt(nullptr), VgprUB(0), SgprUB(0) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ memset(VgprScores[T], 0, sizeof(VgprScores[T]));
+ }
+ }
+ ~BlockWaitcntBrackets() = default;
+
+ bool hasPendingSMEM() const {
+ return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+ }
+
+ bool hasPendingFlat() const {
+ return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
+ LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
+ (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
+ LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+ }
+
+ void setPendingFlat() {
+ LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
+ LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+ }
+
+ int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
+
+ void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
+
+ bool getRevisitLoop() const { return RevisitLoop; }
+ void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
+
+ void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
+ int32_t getPostOrder() const { return PostOrder; }
+
+ void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
+ void clearWaitcnt() { Waitcnt = nullptr; }
+ MachineInstr *getWaitcnt() const { return Waitcnt; }
+
+ bool mixedExpTypes() const { return MixedExpTypes; }
+ void setMixedExpTypes(bool MixedExpTypesIn) {
+ MixedExpTypes = MixedExpTypesIn;
+ }
+
+ void print(raw_ostream &);
+ void dump() { print(dbgs()); }
+
+private:
+ bool WaitAtBeginning;
+ bool RevisitLoop;
+ bool ValidLoop;
+ bool MixedExpTypes;
+ MachineLoop *LoopRegion;
+ int32_t PostOrder;
+ MachineInstr *Waitcnt;
+ int32_t ScoreLBs[NUM_INST_CNTS] = {0};
+ int32_t ScoreUBs[NUM_INST_CNTS] = {0};
+ int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+ // Remember the last flat memory operation.
+ int32_t LastFlat[NUM_INST_CNTS] = {0};
+ // wait_cnt scores for every vgpr.
+ // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+ int32_t VgprUB;
+ int32_t SgprUB;
+ int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+ // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+ int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+};
+
+// This is a per-loop-region object that records waitcnt status at the end of
+// loop footer from the previous iteration. We also maintain an iteration
+// count to track the number of times the loop has been visited. When it
+// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
+// at the end of the loop footer.
+class LoopWaitcntData {
+public:
+ void incIterCnt() { IterCnt++; }
+ void resetIterCnt() { IterCnt = 0; }
+ int32_t getIterCnt() { return IterCnt; }
+
+ LoopWaitcntData() : LfWaitcnt(nullptr), IterCnt(0) {}
+ ~LoopWaitcntData() = default;
+
+ void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
+ MachineInstr *getWaitcnt() const { return LfWaitcnt; }
+
+ void print() {
+ DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
+ }
+
+private:
+ // s_waitcnt added at the end of the loop footer to stabilize the wait
+ // scores at the loop's exit.
+ MachineInstr *LfWaitcnt;
+ // Number of iterations the loop has been visited, not including the initial
+ // walk over.
+ int32_t IterCnt;
+};
+
+class SIInsertWaitcnts : public MachineFunctionPass {
+
+private:
+ const SISubtarget *ST;
+ const SIInstrInfo *TII;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const MachineLoopInfo *MLI;
+ AMDGPU::IsaInfo::IsaVersion IV;
+ AMDGPUAS AMDGPUASI;
+
+ DenseSet<MachineBasicBlock *> BlockVisitedSet;
+ DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+ DenseSet<MachineInstr *> VCCZBugHandledSet;
+
+ DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
+ BlockWaitcntBracketsMap;
+
+ DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+
+ DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
+
+ std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+
+public:
+ static char ID;
+
+ SIInsertWaitcnts()
+ : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
+ MRI(nullptr), MLI(nullptr) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI insert wait instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
+ // The waitcnt information is copied because it changes as the block is
+ // traversed.
+ KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
+ }
+
+ MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
+ BlockWaitcntBrackets *ScoreBrackets);
+ void updateEventWaitCntAfter(MachineInstr &Inst,
+ BlockWaitcntBrackets *ScoreBrackets);
+ void mergeInputScoreBrackets(MachineBasicBlock &Block);
+ MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+ void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
+ void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+};
+
+} // end anonymous namespace
+
+RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
+ const SIInstrInfo *TII,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ unsigned OpNo,
+ bool Def) const {
+ const MachineOperand &Op = MI->getOperand(OpNo);
+ if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
+ (Def && !Op.isDef()))
+ return {-1, -1};
+
+ // A use via a partial-write (PW) operand does not need a waitcnt;
+ // a partial write is not a WAW.
+ assert(!Op.getSubReg() || !Op.isUndef());
+
+ RegInterval Result;
+ const MachineRegisterInfo &MRIA = *MRI;
+
+ unsigned Reg = TRI->getEncodingValue(Op.getReg());
+
+ if (TRI->isVGPR(MRIA, Op.getReg())) {
+ assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
+ Result.first = Reg - RegisterEncoding.VGPR0;
+ assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
+ } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
+ assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
+ Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
+ assert(Result.first >= NUM_ALL_VGPRS &&
+ Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+ }
+ // TODO: Handle TTMP
+ // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
+ else
+ return {-1, -1};
+
+ const MachineInstr &MIA = *MI;
+ const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ Result.second = Result.first + (Size / 32);
+
+ return Result;
+}
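+
+// Worked example (editor's note): for a 64-bit operand starting at v4,
+// getRegSizeInBits returns 64, so Size / 32 == 2 and the half-open interval
+// is {4, 6}; callers then visit RegNo == 4 and RegNo == 5.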
+
+void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
+ const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ unsigned OpNo, int32_t Val) {
+ RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
+ DEBUG({
+ const MachineOperand &Opnd = MI->getOperand(OpNo);
+ assert(TRI->isVGPR(*MRI, Opnd.getReg()));
+ });
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ setRegScore(RegNo, EXP_CNT, Val);
+ }
+}
+
+void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+ const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ WaitEventType E, MachineInstr &Inst) {
+ const MachineRegisterInfo &MRIA = *MRI;
+ InstCounterType T = eventCounter(E);
+ int32_t CurrScore = getScoreUB(T) + 1;
+ // EventUB and ScoreUB need to be updated regardless of whether this event
+ // changes the score of a register. Examples include vm_cnt on a buffer
+ // store, or lgkm_cnt on a send-message.
+ EventUBs[E] = CurrScore;
+ setScoreUB(T, CurrScore);
+
+ if (T == EXP_CNT) {
+ // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
+ // is required.
+ if (!MixedExpTypes) {
+ MixedExpTypes = counterOutOfOrder(EXP_CNT);
+ }
+
+ // Put score on the source vgprs. If this is a store, just use those
+ // specific register(s).
+ if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+ // All GDS operations must protect their address register (same as
+ // export.)
+ if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
+ Inst.getOpcode() != AMDGPU::DS_CONSUME) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
+ CurrScore);
+ }
+ if (Inst.mayStore()) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
+ CurrScore);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data1) != -1) {
+ setExpScore(&Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::data1),
+ CurrScore);
+ }
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
+ Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
+ Inst.getOpcode() != AMDGPU::DS_APPEND &&
+ Inst.getOpcode() != AMDGPU::DS_CONSUME &&
+ Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = Inst.getOperand(I);
+ if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
+ setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+ }
+ }
+ }
+ } else if (TII->isFLAT(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else if (TII->isMIMG(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else if (TII->isMTBUF(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ }
+ } else if (TII->isMUBUF(Inst)) {
+ if (Inst.mayStore()) {
+ setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+ } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+ setExpScore(
+ &Inst, TII, TRI, MRI,
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+ CurrScore);
+ }
+ } else {
+ if (TII->isEXP(Inst)) {
+ // For export the destination registers are really temps that
+ // can be used as the actual source after export patching, so
+ // we need to treat them like sources and set the EXP_CNT
+ // score.
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ MachineOperand &DefMO = Inst.getOperand(I);
+ if (DefMO.isReg() && DefMO.isDef() &&
+ TRI->isVGPR(MRIA, DefMO.getReg())) {
+ setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
+ CurrScore);
+ }
+ }
+ }
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ MachineOperand &MO = Inst.getOperand(I);
+ if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
+ setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+ }
+ }
+ }
+#if 0 // TODO: check if this is handled by MUBUF code above.
+ } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
+ Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
+ MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
+ unsigned OpNo;//TODO: find the OpNo for this operand;
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
+ for (signed RegNo = Interval.first; RegNo < Interval.second;
+ ++RegNo) {
+ setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
+ }
+#endif
+ } else {
+ // Match the score to the destination registers.
+ for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
+ if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
+ continue;
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ setRegScore(RegNo, T, CurrScore);
+ }
+ }
+ if (TII->isDS(Inst) && Inst.mayStore()) {
+ setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+ }
+ }
+}
+
+void BlockWaitcntBrackets::print(raw_ostream &OS) {
+ OS << '\n';
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int LB = getScoreLB(T);
+ int UB = getScoreUB(T);
+
+ switch (T) {
+ case VM_CNT:
+ OS << " VM_CNT(" << UB - LB << "): ";
+ break;
+ case LGKM_CNT:
+ OS << " LGKM_CNT(" << UB - LB << "): ";
+ break;
+ case EXP_CNT:
+ OS << " EXP_CNT(" << UB - LB << "): ";
+ break;
+ default:
+ OS << " UNKNOWN(" << UB - LB << "): ";
+ break;
+ }
+
+ if (LB < UB) {
+ // Print vgpr scores.
+ for (int J = 0; J <= getMaxVGPR(); J++) {
+ int RegScore = getRegScore(J, T);
+ if (RegScore <= LB)
+ continue;
+ int RelScore = RegScore - LB - 1;
+ if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
+ OS << RelScore << ":v" << J << " ";
+ } else {
+ OS << RelScore << ":ds ";
+ }
+ }
+ // Also need to print sgpr scores for lgkm_cnt.
+ if (T == LGKM_CNT) {
+ for (int J = 0; J <= getMaxSGPR(); J++) {
+ int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ if (RegScore <= LB)
+ continue;
+ int RelScore = RegScore - LB - 1;
+ OS << RelScore << ":s" << J << " ";
+ }
+ }
+ }
+ OS << '\n';
+ }
+ OS << '\n';
+}
+
+unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
+ int ScoreToWait) {
+ unsigned int NeedWait = 0;
+ if (ScoreToWait == -1) {
+ // The score to wait on is unknown. This implies that it was not
+ // encountered during the path of the CFG walk done during the current
+ // traversal, but may be seen on a different path. Emit an s_waitcnt with a
+ // conservative value of 0 for the counter.
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, getScoreUB(T));
+ return NeedWait;
+ }
+
+ // If the score of src_operand falls within the bracket, we need an
+ // s_waitcnt instruction.
+ const int32_t LB = getScoreLB(T);
+ const int32_t UB = getScoreUB(T);
+ if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
+ if (T == VM_CNT && hasPendingFlat()) {
+ // If there is a pending FLAT operation, and this is a VM waitcnt,
+ // then we need to force a waitcnt 0 for VM.
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, getScoreUB(T));
+ } else if (counterOutOfOrder(T)) {
+ // The counter can be decremented out of order when there are multiple
+ // types of events within the bracket. In that case, also emit an
+ // s_waitcnt with a conservative value of 0 for the counter.
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, getScoreUB(T));
+ } else {
+ NeedWait = CNT_MASK(T);
+ setScoreLB(T, ScoreToWait);
+ }
+ }
+
+ return NeedWait;
+}
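+
+// Worked example (illustrative): with LB == 1 and UB == 5, a score-to-wait of
+// 3 falls inside the bracket, so a wait is needed. On the in-order path the
+// bracket shrinks to LB == 3, and the caller later emits a count of
+// UB - LB == 2, i.e. "wait until at most two operations of this type are
+// still outstanding".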
+
+// Where there are multiple types of event in the bracket of a counter,
+// the decrement may go out of order.
+bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
+ switch (T) {
+ case VM_CNT:
+ return false;
+ case LGKM_CNT: {
+ if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+ // A scalar memory read can always complete out of order.
+ return true;
+ }
+ int NumEventTypes = 0;
+ if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
+ EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
+ NumEventTypes++;
+ }
+ if (NumEventTypes <= 1) {
+ return false;
+ }
+ break;
+ }
+ case EXP_CNT: {
+ // If there has been a mixture of export types, then a waitcnt exp(0) is
+ // required.
+ if (MixedExpTypes)
+ return true;
+ int NumEventTypes = 0;
+ if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+ EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+ EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+ EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+ if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
+ EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+
+ if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
+ EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
+ NumEventTypes++;
+ }
+
+ if (NumEventTypes <= 1) {
+ return false;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return true;
+}
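+
+// Example (editor's note): if an LDS access and an SMEM access are both
+// pending inside the LGKM bracket, their decrements may interleave, so
+// counterOutOfOrder(LGKM_CNT) returns true and callers conservatively wait
+// with lgkmcnt(0).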
+
+INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+ false)
+INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+ false)
+
+char SIInsertWaitcnts::ID = 0;
+
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
+
+FunctionPass *llvm::createSIInsertWaitcntsPass() {
+ return new SIInsertWaitcnts();
+}
+
+static bool readsVCCZ(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+ !MI.getOperand(1).isUndef();
+}
+
+/// \brief Generate an s_waitcnt instruction to be placed before cur_Inst.
+/// Instructions of a given type are returned in order, but instructions of
+/// different types can complete out of order. We rely on this in-order
+/// completion and simply assign a score to the memory access instructions.
+/// We keep track of the active "score bracket" to determine whether an
+/// access of a memory read requires an s_waitcnt, and if so, what the value
+/// of each counter should be. The "score bracket" is bound by the lower
+/// bound and upper bound scores (*_score_LB and *_score_UB respectively).
+MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+ MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
+ // To emit, or not to emit - that's the question!
+ // Start with an assumption that there is no need to emit.
+ unsigned int EmitSwaitcnt = 0;
+ // s_waitcnt instruction to return; default is NULL.
+ MachineInstr *SWaitInst = nullptr;
+ // No need to wait before a phi. If a phi-move exists, then the wait should
+ // have been inserted before the move. If a phi-move does not exist, then
+ // the wait should be inserted before the real use. The same is true for
+ // sc-merge. It is not a coincidence that all these cases correspond to the
+ // instructions that are skipped in the assembling loop.
+ bool NeedLineMapping = false; // TODO: Check on this.
+ if (MI.isDebugValue() &&
+ // TODO: any other opcode?
+ !NeedLineMapping) {
+ return SWaitInst;
+ }
+
+ // See if an s_waitcnt is forced at block entry, or is needed at
+ // program end.
+ if (ScoreBrackets->getWaitAtBeginning()) {
+ // Note that we have already cleared the state, so we don't need to update
+ // it.
+ ScoreBrackets->clearWaitAtBeginning();
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ EmitSwaitcnt |= CNT_MASK(T);
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ }
+ }
+
+ // See if this instruction has a forced S_WAITCNT VM.
+ // TODO: Handle other cases of NeedsWaitcntVmBefore()
+ else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+ MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+ EmitSwaitcnt |=
+ ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ }
+
+ // All waits must be resolved at call return.
+ // NOTE: this could be improved with knowledge of all call sites or
+ // with knowledge of the called routines.
+ if (MI.getOpcode() == AMDGPU::RETURN ||
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ EmitSwaitcnt |= CNT_MASK(T);
+ }
+ }
+ }
+ // Resolve vm waits before gs-done.
+ else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
+ MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+ ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
+ AMDGPU::SendMsg::ID_GS_DONE)) {
+ if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
+ ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ EmitSwaitcnt |= CNT_MASK(VM_CNT);
+ }
+ }
+#if 0 // TODO: enable the following blocks of logic once we have fence support.
+ else if (MI.getOpcode() == SC_FENCE) {
+ const unsigned int group_size =
+ context->shader_info->GetMaxThreadGroupSize();
+ // group_size == 0 means thread group size is unknown at compile time
+ const bool group_is_multi_wave =
+ (group_size == 0 || group_size > target_info->GetWaveFrontSize());
+ const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
+
+ for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
+ SCRegType src_type = Inst->GetSrcType(i);
+ switch (src_type) {
+ case SCMEM_LDS:
+ if (group_is_multi_wave ||
+ context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ ScoreBrackets->getScoreUB(LGKM_CNT));
+ // LDS may have to wait for VM_CNT after buffer load to LDS
+ if (target_info->HasBufferLoadToLDS()) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ ScoreBrackets->getScoreUB(VM_CNT));
+ }
+ }
+ break;
+
+ case SCMEM_GDS:
+ if (group_is_multi_wave || fence_is_global) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+ ScoreBrackets->getScoreUB(LGKM_CNT));
+ }
+ break;
+
+ case SCMEM_UAV:
+ case SCMEM_TFBUF:
+ case SCMEM_RING:
+ case SCMEM_SCATTER:
+ if (group_is_multi_wave || fence_is_global) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+ ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+ ScoreBrackets->getScoreUB(VM_CNT));
+ }
+ break;
+
+ case SCMEM_SCRATCH:
+ default:
+ break;
+ }
+ }
+ }
+#endif
+
+ // Export & GDS instructions do not read the EXEC mask until after the export
+ // is granted (which can occur well after the instruction is issued).
+ // The shader program must flush all EXP operations on the export-count
+ // before overwriting the EXEC mask.
+ else {
+ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+ // Export and GDS are tracked individually, either may trigger a waitcnt
+ // for EXEC.
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
+ }
+
+#if 0 // TODO: the following code to handle CALL.
+ // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
+ // However, there is a problem with EXP_CNT, because the call cannot
+ // easily tell if a register is used in the function, and if it did, then
+ // the referring instruction would have to have an S_WAITCNT, which is
+ // dependent on all call sites. So instead, force S_WAITCNT for EXP_CNTs
+ // before the call.
+ if (MI.getOpcode() == SC_CALL) {
+ if (ScoreBrackets->getScoreUB(EXP_CNT) >
+ ScoreBrackets->getScoreLB(EXP_CNT)) {
+ ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+ }
+ }
+#endif
+
+ // Look at the source operands of every instruction to see if
+ // any of them results from a previous memory operation that affects
+ // its current usage. If so, an s_waitcnt instruction needs to be
+ // emitted.
+ // If the source operand was defined by a load, add the s_waitcnt
+ // instruction.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS != AMDGPUASI.LOCAL_ADDRESS)
+ continue;
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ // VM_CNT is only relevant to vgpr or LDS.
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ }
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Op.getReg())) {
+ // VM_CNT is only relevant to vgpr or LDS.
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ }
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+ }
+ }
+ // End of the loop over all source operands used to decide the vm_cnt and
+ // lgkm_cnt waits.
+
+ // Two cases are handled for destination operands:
+ // 1) If the destination operand was defined by a load, add the s_waitcnt
+ // instruction to guarantee the right WAW order.
+ // 2) If a destination operand was used by a recent export/store
+ // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
+ if (MI.mayStore()) {
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS != AMDGPUASI.LOCAL_ADDRESS)
+ continue;
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+ }
+ }
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Def = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Def.getReg())) {
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+ }
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+ }
+ } // End of for loop that looks at all dest operands.
+ }
+
+ // TODO: Tie force zero to a compiler triage option.
+ bool ForceZero = false;
+
+ // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
+ // occurs before the instruction. Doing it here prevents any additional
+ // S_WAITCNTs from being emitted if the instruction was marked as
+ // requiring a WAITCNT beforehand.
+ if (MI.getOpcode() == AMDGPU::S_BARRIER &&
+ !ST->hasAutoWaitcntBeforeBarrier()) {
+ EmitSwaitcnt |=
+ ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+ EmitSwaitcnt |= ScoreBrackets->updateByWait(
+ LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
+ }
+
+ // TODO: Remove this work-around, enable the assert for Bug 457939
+ // after fixing the scheduler. Also, the Shader Compiler code is
+ // independent of target.
+ if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+ if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+ ScoreBrackets->getScoreUB(LGKM_CNT) &&
+ ScoreBrackets->hasPendingSMEM()) {
+ // Wait on everything, not just LGKM. vccz reads usually come from
+ // terminators, and we always wait on everything at the end of the
+ // block, so if we only wait on LGKM here, we might end up with
+ // another s_waitcnt inserted right after this if there are non-LGKM
+ // instructions still outstanding.
+ ForceZero = true;
+ EmitSwaitcnt = true;
+ }
+ }
+
+ // Does this operand processing indicate s_wait counter update?
+ if (EmitSwaitcnt) {
+ int CntVal[NUM_INST_CNTS];
+
+ bool UseDefaultWaitcntStrategy = true;
+ if (ForceZero) {
+ // Force all waitcnts to 0.
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ }
+ CntVal[VM_CNT] = 0;
+ CntVal[EXP_CNT] = 0;
+ CntVal[LGKM_CNT] = 0;
+ UseDefaultWaitcntStrategy = false;
+ }
+
+ if (UseDefaultWaitcntStrategy) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ if (EmitSwaitcnt & CNT_MASK(T)) {
+ int Delta =
+ ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
+ int MaxDelta = ScoreBrackets->getWaitCountMax(T);
+ if (Delta >= MaxDelta) {
+ Delta = -1;
+ if (T != EXP_CNT) {
+ ScoreBrackets->setScoreLB(
+ T, ScoreBrackets->getScoreUB(T) - MaxDelta);
+ }
+ EmitSwaitcnt &= ~CNT_MASK(T);
+ }
+ CntVal[T] = Delta;
+ } else {
+ // If we are not waiting for a particular counter then encode
+ // it as -1 which means "don't care."
+ CntVal[T] = -1;
+ }
+ }
+ }
+
+ // If we are not waiting on any counter we can skip the wait altogether.
+ if (EmitSwaitcnt != 0) {
+ MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
+ int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
+ if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+ (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
+ (AMDGPU::decodeExpcnt(IV, Imm) !=
+ (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
+ (AMDGPU::decodeLgkmcnt(IV, Imm) !=
+ (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
+ MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
+ if (ContainingLoop) {
+ MachineBasicBlock *TBB = ContainingLoop->getHeader();
+ BlockWaitcntBrackets *ScoreBracket =
+ BlockWaitcntBracketsMap[TBB].get();
+ if (!ScoreBracket) {
+ assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+ BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
+ ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
+ }
+ ScoreBracket->setRevisitLoop(true);
+ DEBUG(dbgs() << "set-revisit: block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
+ }
+ }
+
+ // Update an existing waitcount, or make a new one.
+ MachineFunction &MF = *MI.getParent()->getParent();
+ if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
+ SWaitInst = OldWaitcnt;
+ } else {
+ SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
+ MI.getDebugLoc());
+ CompilerGeneratedWaitcntSet.insert(SWaitInst);
+ }
+
+ const MachineOperand &Op =
+ MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
+ IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
+ SWaitInst->addOperand(MF, Op);
+
+ if (CntVal[EXP_CNT] == 0) {
+ ScoreBrackets->setMixedExpTypes(false);
+ }
+ }
+ }
+
+ return SWaitInst;
+}
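+
+// A note on the "don't care" encoding used above (editor's sketch): a counter
+// value of -1 is masked down to the all-ones pattern of its field, which the
+// hardware treats as "no wait" for that counter. For example, with a 4-bit
+// vmcnt field:
+//   (-1 & AMDGPU::getVmcntBitMask(IV)) == 0xF  // maximum count => no wait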
+
+void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
+ MachineInstr *Waitcnt) {
+ if (MBB.empty()) {
+ MBB.push_back(Waitcnt);
+ return;
+ }
+
+ MachineBasicBlock::iterator It = MBB.end();
+ MachineInstr *MI = &*(--It);
+ if (MI->isBranch()) {
+ MBB.insert(It, Waitcnt);
+ } else {
+ MBB.push_back(Waitcnt);
+ }
+}
+
+void SIInsertWaitcnts::updateEventWaitCntAfter(
+ MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
+ // Now look at the instruction opcode. If it is a memory access
+ // instruction, update the upper-bound of the appropriate counter's
+ // bracket and the destination operand scores.
+ // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
+ uint64_t TSFlags = Inst.getDesc().TSFlags;
+ if (TII->isDS(Inst) && (TSFlags & SIInstrFlags::LGKM_CNT)) {
+ if (TII->getNamedOperand(Inst, AMDGPU::OpName::gds) &&
+ TII->getNamedOperand(Inst, AMDGPU::OpName::gds)->getImm() != 0) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+ } else {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+ }
+ } else if (TII->isFLAT(Inst)) {
+ assert(Inst.mayLoad() || Inst.mayStore());
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+
+ // This is a flat memory operation. Check to see if it has memory
+ // tokens for both LDS and Memory, and if so mark it as a flat.
+ bool FoundLDSMem = false;
+ for (const MachineMemOperand *Memop : Inst.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
+ FoundLDSMem = true;
+ }
+
+ // This is a flat memory operation, so note it - it will require
+ // that both the VM and LGKM be flushed to zero if it is pending when
+ // a VM or LGKM dependency occurs.
+ if (FoundLDSMem) {
+ ScoreBrackets->setPendingFlat();
+ }
+ } else if (SIInstrInfo::isVMEM(Inst) &&
+ // TODO: get a better carve out.
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
+ Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+ if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+ (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
+ }
+ } else if (TII->isSMRD(Inst)) {
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ } else {
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_SENDMSG:
+ case AMDGPU::S_SENDMSGHALT:
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
+ break;
+ case AMDGPU::EXP:
+ case AMDGPU::EXP_DONE: {
+ int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
+ if (Imm >= 32 && Imm <= 63)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+ else if (Imm >= 12 && Imm <= 15)
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+ else
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
+ break;
+ }
+ case AMDGPU::S_MEMTIME:
+ case AMDGPU::S_MEMREALTIME:
+ ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ break;
+ default:
+ break;
+ }
+ }
+}
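+
+// Classification sketch (summarizing the cases above): a DS instruction with
+// gds == 1 scores GDS_ACCESS (lgkmcnt) plus GDS_GPR_LOCK (expcnt); a plain DS
+// instruction scores LDS_ACCESS; a FLAT instruction scores both VMEM_ACCESS
+// and LDS_ACCESS and, when it may touch LDS, is marked pending-flat so that a
+// later dependency forces the counters to zero.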
+
+void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
+ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+ int32_t MaxPending[NUM_INST_CNTS] = {0};
+ int32_t MaxFlat[NUM_INST_CNTS] = {0};
+ bool MixedExpTypes = false;
+
+ // Clear the score bracket state.
+ ScoreBrackets->clear();
+
+ // Compute the number of pending elements on block entry.
+
+ // IMPORTANT NOTE: If iterative handling of loops is added, the code will
+ // need to handle single BBs with backedges to themselves. This means that
+ // they will need to retain and not clear their initial state.
+
+ // See if there are any uninitialized predecessors. If so, emit an
+ // s_waitcnt 0 at the beginning of the block.
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[Pred].get();
+ bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+ if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+ break;
+ }
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int Span =
+ PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
+ MaxPending[T] = std::max(MaxPending[T], Span);
+ Span =
+ PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
+ MaxFlat[T] = std::max(MaxFlat[T], Span);
+ }
+
+ MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
+ }
+
+ // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+ // Also handle kills for exit block.
+ if (Block.succ_empty() && !KillWaitBrackets.empty()) {
+ for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int Span = KillWaitBrackets[I]->getScoreUB(T) -
+ KillWaitBrackets[I]->getScoreLB(T);
+ MaxPending[T] = std::max(MaxPending[T], Span);
+ Span = KillWaitBrackets[I]->pendingFlat(T) -
+ KillWaitBrackets[I]->getScoreLB(T);
+ MaxFlat[T] = std::max(MaxFlat[T], Span);
+ }
+
+ MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
+ }
+ }
+
+ // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[Pred].get();
+ bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+ if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
+ break;
+ }
+
+ int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
+ PredScoreBrackets->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
+ int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
+ PredScoreBrackets->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
+ }
+
+ // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+ if (Block.succ_empty() && !KillWaitBrackets.empty()) {
+ for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
+ int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
+ KillWaitBrackets[I]->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
+ int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
+ KillWaitBrackets[I]->getScoreLB(EXP_CNT);
+ MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
+ }
+ }
+
+#if 0
+ // LC does not (unlike SC) add a waitcnt at the beginning. Leaving this as a marker.
+ // TODO: how does LC distinguish between function entry and main entry?
+ // If this is the entry to a function, force a wait.
+ MachineBasicBlock &Entry = Block.getParent()->front();
+ if (Entry.getNumber() == Block.getNumber()) {
+ ScoreBrackets->setWaitAtBeginning();
+ return;
+ }
+#endif
+
+ // Now set the current Block's brackets to the largest ending bracket.
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ ScoreBrackets->setScoreUB(T, MaxPending[T]);
+ ScoreBrackets->setScoreLB(T, 0);
+ ScoreBrackets->setLastFlat(T, MaxFlat[T]);
+ }
+
+ ScoreBrackets->setMixedExpTypes(MixedExpTypes);
+
+ // Set the register scoreboard.
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
+ if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ break;
+ }
+
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[Pred].get();
+
+ // Now merge the gpr_reg_score information
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int PredLB = PredScoreBrackets->getScoreLB(T);
+ int PredUB = PredScoreBrackets->getScoreUB(T);
+ if (PredLB < PredUB) {
+ int PredScale = MaxPending[T] - PredUB;
+ // Merge vgpr scores.
+ for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
+ int PredRegScore = PredScoreBrackets->getRegScore(J, T);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
+ }
+ // Also need to merge sgpr scores for lgkm_cnt.
+ if (T == LGKM_CNT) {
+ for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
+ int PredRegScore =
+ PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J + NUM_ALL_VGPRS, LGKM_CNT,
+ std::max(
+ ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
+ NewRegScore));
+ }
+ }
+ }
+ }
+
+ // Also merge the WaitEvent information.
+ ForAllWaitEventType(W) {
+ enum InstCounterType T = PredScoreBrackets->eventCounter(W);
+ int PredEventUB = PredScoreBrackets->getEventUB(W);
+ if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
+ int NewEventUB =
+ MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
+ if (NewEventUB > 0) {
+ ScoreBrackets->setEventUB(
+ W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
+ }
+ }
+ }
+ }
+
+ // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+ // Set the register scoreboard.
+ if (Block.succ_empty() && !KillWaitBrackets.empty()) {
+ for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
+ // Now merge the gpr_reg_score information.
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ int PredLB = KillWaitBrackets[I]->getScoreLB(T);
+ int PredUB = KillWaitBrackets[I]->getScoreUB(T);
+ if (PredLB < PredUB) {
+ int PredScale = MaxPending[T] - PredUB;
+ // Merge vgpr scores.
+ for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
+ int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
+ }
+ // Also need to merge sgpr scores for lgkm_cnt.
+ if (T == LGKM_CNT) {
+ for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
+ int PredRegScore =
+ KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ if (PredRegScore <= PredLB)
+ continue;
+ int NewRegScore = PredScale + PredRegScore;
+ ScoreBrackets->setRegScore(
+ J + NUM_ALL_VGPRS, LGKM_CNT,
+ std::max(
+ ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
+ NewRegScore));
+ }
+ }
+ }
+ }
+
+ // Also merge the WaitEvent information.
+ ForAllWaitEventType(W) {
+ enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
+ int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
+ if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
+ int NewEventUB =
+ MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
+ if (NewEventUB > 0) {
+ ScoreBrackets->setEventUB(
+ W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
+ }
+ }
+ }
+ }
+ }
+
+ // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
+ // sequencing predecessors, because changes to EXEC require waitcnts due to
+ // the delayed nature of these operations.
+ for (MachineBasicBlock *Pred : Block.predecessors()) {
+ if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+ break;
+ }
+
+ BlockWaitcntBrackets *PredScoreBrackets =
+ BlockWaitcntBracketsMap[Pred].get();
+
+ int PredGDSUB = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
+ if (PredGDSUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
+ int NewGDSUB = MaxPending[EXP_CNT] + PredGDSUB -
+ PredScoreBrackets->getScoreUB(EXP_CNT);
+ if (NewGDSUB > 0) {
+ ScoreBrackets->setEventUB(
+ GDS_GPR_LOCK,
+ std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), NewGDSUB));
+ }
+ }
+ int PredEXPUB = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
+ if (PredEXPUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
+ int NewEXPUB = MaxPending[EXP_CNT] + PredEXPUB -
+ PredScoreBrackets->getScoreUB(EXP_CNT);
+ if (NewEXPUB > 0) {
+ ScoreBrackets->setEventUB(
+ EXP_GPR_LOCK,
+ std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), NewEXPUB));
+ }
+ }
+ }
+}
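+
+// Rescaling example (illustrative): if a predecessor ends with LB == 4 and
+// UB == 7 (span 3) and MaxPending[T] == 3, then PredScale == 3 - 7 == -4, so
+// a register scored 6 in the predecessor enters this block with score
+// -4 + 6 == 2, keeping its position relative to the merged bracket [0, 3].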
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
+MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
+ MachineBasicBlock *Bottom = Loop->getHeader();
+ for (MachineBasicBlock *MBB : Loop->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
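+
+// Example (editor's note): for a loop whose blocks are numbered {2, 5, 3},
+// loopBottom returns block 5 even though the layout is discontiguous, whereas
+// MachineLoop::getBottomBlock assumes the loop blocks are laid out
+// contiguously.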
+
+// Generate s_waitcnt instructions where needed.
+void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+ MachineBasicBlock &Block) {
+ // Initialize the state information.
+ mergeInputScoreBrackets(Block);
+
+ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+
+ DEBUG({
+ dbgs() << "Block" << Block.getNumber();
+ ScoreBrackets->dump();
+ });
+
+ bool InsertNOP = false;
+
+ // Walk over the instructions.
+ for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+ Iter != E;) {
+ MachineInstr &Inst = *Iter;
+ // Remove any previously existing waitcnts.
+ if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
+ // TODO: Register the old waitcnt and optimize the following waitcnts.
+ // Leaving the previously existing waitcnts is conservatively correct.
+ if (CompilerGeneratedWaitcntSet.find(&Inst) ==
+ CompilerGeneratedWaitcntSet.end())
+ ++Iter;
+ else {
+ ScoreBrackets->setWaitcnt(&Inst);
+ ++Iter;
+ Inst.removeFromParent();
+ }
+ continue;
+ }
+
+ // Kill instructions generate a conditional branch to the endmain block.
+ // Merge the current waitcnt state into the endmain block information.
+ // TODO: Are there other flavors of KILL instruction?
+ if (Inst.getOpcode() == AMDGPU::KILL) {
+ addKillWaitBracket(ScoreBrackets);
+ }
+
+ bool VCCZBugWorkAround = false;
+ if (readsVCCZ(Inst) &&
+ (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+ if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+ ScoreBrackets->getScoreUB(LGKM_CNT) &&
+ ScoreBrackets->hasPendingSMEM()) {
+ if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+ VCCZBugWorkAround = true;
+ }
+ }
+
+ // Generate an s_waitcnt instruction to be placed before the current
+ // instruction (Inst), if needed.
+ MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
+
+ if (SWaitInst) {
+ Block.insert(Inst, SWaitInst);
+ if (ScoreBrackets->getWaitcnt() != SWaitInst) {
+ DEBUG(dbgs() << "insertWaitcntInBlock\n"
+ << "Old Instr: " << Inst << '\n'
+ << "New Instr: " << *SWaitInst << '\n';);
+ }
+ }
+
+ updateEventWaitCntAfter(Inst, ScoreBrackets);
+
+#if 0 // TODO: implement resource type check controlled by options with ub = LB.
+ // If this instruction generates a S_SETVSKIP because it is an
+ // indexed resource, and we are on Tahiti, then it will also force
+ // an S_WAITCNT vmcnt(0)
+ if (RequireCheckResourceType(Inst, context)) {
+ // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
+ ScoreBrackets->setScoreLB(VM_CNT,
+ ScoreBrackets->getScoreUB(VM_CNT));
+ }
+#endif
+
+ ScoreBrackets->clearWaitcnt();
+
+ if (SWaitInst) {
+ DEBUG({ SWaitInst->print(dbgs() << '\n'); });
+ }
+ DEBUG({
+ Inst.print(dbgs());
+ ScoreBrackets->dump();
+ });
+
+ // Check to see if this is a GWS instruction. If so, and if this is CI or
+ // VI, then the generated code sequence will include an S_WAITCNT 0.
+ // TODO: Are these the only GWS instructions?
+ if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
+ // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
+ ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+ ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+ ScoreBrackets->updateByWait(LGKM_CNT,
+ ScoreBrackets->getScoreUB(LGKM_CNT));
+ }
+
+ // TODO: Remove this work-around after fixing the scheduler and enable the
+ // assert above.
+ if (VCCZBugWorkAround) {
+ // Restore the vccz bit. Any time a value is written to vcc, the vcc
+ // bit is updated, so we can restore the bit by reading the value of
+ // vcc and then writing it back to the register.
+ BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+ VCCZBugHandledSet.insert(&Inst);
+ }
+
+ if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+
+ // This avoids an s_nop after a waitcnt has just been inserted.
+ if (!SWaitInst && InsertNOP) {
+ BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+ }
+ InsertNOP = false;
+
+ // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+ // or SMEM clause, respectively.
+ //
+ // The temporary workaround is to break the clauses with S_NOP.
+ //
+ // The proper solution would be to allocate registers such that all source
+ // and destination registers don't overlap, e.g. this is illegal:
+ // r0 = load r2
+ // r2 = load r0
+ bool IsSMEM = false;
+ bool IsVMEM = false;
+ if (TII->isSMRD(Inst))
+ IsSMEM = true;
+ else if (TII->usesVM_CNT(Inst))
+ IsVMEM = true;
+
+ ++Iter;
+ if (Iter == E)
+ break;
+
+ MachineInstr &Next = *Iter;
+
+ // TODO: How about consecutive SMEM instructions?
+ // The comment above says to break the clause, but the code does not.
+ // if ((TII->isSMRD(next) && isSMEM) ||
+ if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
+ // TODO: Enable this check when hasSoftClause is upstreamed.
+ // ST->hasSoftClauses() &&
+ ST->isXNACKEnabled()) {
+ // Insert a NOP to break the clause.
+ InsertNOP = true;
+ continue;
+ }
+
+ // There must be "S_NOP 0" between an instruction writing M0 and
+ // S_SENDMSG.
+ if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
+ Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+ Inst.definesRegister(AMDGPU::M0))
+ InsertNOP = true;
+
+ continue;
+ }
+
+ ++Iter;
+ }
+
+ // Check if we need to force convergence at the loop footer.
+ MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
+ if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+ LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+ WaitcntData->print();
+ DEBUG(dbgs() << '\n';);
+
+ // The iterative waitcnt insertion algorithm aims for optimal waitcnt
+ // placement, but doesn't always guarantee convergence for a loop. Each
+ // loop should take at most 2 iterations to converge naturally.
+ // When this max is reached and the result hasn't converged, we force
+ // convergence by inserting an s_waitcnt at the end of the loop footer.
+ if (WaitcntData->getIterCnt() > 2) {
+ // To ensure convergence, the wait events at the loop footer must be no
+ // more than those from the previous iteration.
+ // As a simplification, instead of tracking individual scores and
+ // generating the precise wait count, just wait on 0.
+ bool HasPending = false;
+ MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
+ for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+ T = (enum InstCounterType)(T + 1)) {
+ if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+ ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+ HasPending = true;
+ }
+ }
+
+ if (HasPending) {
+ if (!SWaitInst) {
+ SWaitInst = Block.getParent()->CreateMachineInstr(
+ TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+ CompilerGeneratedWaitcntSet.insert(SWaitInst);
+ const MachineOperand &Op = MachineOperand::CreateImm(0);
+ SWaitInst->addOperand(MF, Op);
+#if 0 // TODO: Format the debug output
+ OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
+ OutputTransformAdd(SWaitInst, context);
+#endif
+ }
+#if 0 // TODO: ??
+ _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
+#endif
+ }
+
+ if (SWaitInst) {
+ DEBUG({
+ SWaitInst->print(dbgs());
+ dbgs() << "\nAdjusted score board:";
+ ScoreBrackets->dump();
+ });
+
+ // Add this waitcnt to the block. It is either newly created or was
+ // created in a previous iteration and is added back here, since block
+ // traversal always removes waitcnts.
+ insertWaitcntBeforeCF(Block, SWaitInst);
+ WaitcntData->setWaitcnt(SWaitInst);
+ }
+ }
+ }
+}
+
+bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ AMDGPUASI = ST->getAMDGPUAS();
+
+ HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
+ HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
+ HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+
+ HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
+ HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
+ assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+
+ RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
+ RegisterEncoding.VGPRL =
+ RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
+ RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
+ RegisterEncoding.SGPRL =
+ RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+
+ // Walk over the blocks in reverse post order, inserting
+ // s_waitcnt where needed.
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ bool Modified = false;
+ for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+ I = RPOT.begin(),
+ E = RPOT.end(), J = RPOT.begin();
+ I != E;) {
+ MachineBasicBlock &MBB = **I;
+
+ BlockVisitedSet.insert(&MBB);
+
+ BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+ if (!ScoreBrackets) {
+ BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
+ ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+ }
+ ScoreBrackets->setPostOrder(MBB.getNumber());
+ MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
+ if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
+ LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();
+
+ // If we are walking into the block from before the loop, then guarantee
+ // at least 1 re-walk over the loop to propagate the information, even if
+ // no S_WAITCNT instructions were generated.
+ if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
+ (BlockWaitcntProcessedSet.find(&MBB) ==
+ BlockWaitcntProcessedSet.end())) {
+ BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+ DEBUG(dbgs() << "set-revisit: block"
+ << ContainingLoop->getHeader()->getNumber() << '\n';);
+ }
+
+ // Walk over the instructions.
+ insertWaitcntInBlock(MF, MBB);
+
+ // Flag that waitcnts have been processed at least once.
+ BlockWaitcntProcessedSet.insert(&MBB);
+
+ // See if we want to revisit the loop.
+ if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+ MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
+ BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
+ if (EntrySB && EntrySB->getRevisitLoop()) {
+ EntrySB->setRevisitLoop(false);
+ J = I;
+ int32_t PostOrder = EntrySB->getPostOrder();
+ // TODO: Avoid this loop. Find another way to set I.
+ for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+ X = RPOT.begin(),
+ Y = RPOT.end();
+ X != Y; ++X) {
+ MachineBasicBlock &MBBX = **X;
+ if (MBBX.getNumber() == PostOrder) {
+ I = X;
+ break;
+ }
+ }
+ LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+ WaitcntData->incIterCnt();
+ DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+ continue;
+ } else {
+ LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+ // Loop converged; reset the iteration count. If this loop gets
+ // revisited, it must be from an outer loop; the counter restarts,
+ // which ensures we don't force convergence on such revisits.
+ WaitcntData->resetIterCnt();
+ }
+ }
+
+ J = I;
+ ++I;
+ }
+
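The traversal above is a plain reverse post-order walk with a rewind: when the bottom of a loop is reached and the header's bracket was flagged for revisit, I is reset to the header's position and the body is walked again. A condensed sketch of that control pattern, using simplified stand-in types rather than the pass's real API:

    #include <cstddef>
    #include <vector>

    struct Block;                       // stand-in for MachineBasicBlock
    void process(Block *B);             // stand-in for insertWaitcntInBlock
    Block *revisitTargetFor(Block *B);  // loop header when a re-walk is needed
    std::size_t indexOf(const std::vector<Block *> &RPO, Block *Header);

    // Condensed sketch of the revisit-driven reverse post-order walk;
    // every name here is an illustrative stand-in.
    void walkWithRevisit(std::vector<Block *> &RPO) {
      for (std::size_t I = 0; I < RPO.size();) {
        process(RPO[I]);
        if (Block *Header = revisitTargetFor(RPO[I])) {
          I = indexOf(RPO, Header);  // rewind to the loop header
          continue;
        }
        ++I;
      }
    }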
+ SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+ bool HaveScalarStores = false;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+
+ MachineBasicBlock &MBB = *BI;
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+ ++I) {
+
+ if (!HaveScalarStores && TII->isScalarStore(*I))
+ HaveScalarStores = true;
+
+ if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+ EndPgmBlocks.push_back(&MBB);
+ }
+ }
+
+ if (HaveScalarStores) {
+ // If scalar writes are used, the cache must be flushed or else the next
+ // wave to reuse the same scratch memory can be clobbered.
+ //
+ // Insert s_dcache_wb at wave termination points if there were any scalar
+ // stores, and only if the cache hasn't already been flushed. This could be
+ // improved by looking across blocks for flushes in postdominating blocks
+ // from the stores but an explicitly requested flush is probably very rare.
+ for (MachineBasicBlock *MBB : EndPgmBlocks) {
+ bool SeenDCacheWB = false;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+
+ if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+ SeenDCacheWB = true;
+ else if (TII->isScalarStore(*I))
+ SeenDCacheWB = false;
+
+ // FIXME: It would be better to insert this before a waitcnt if any.
+ if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
+ !SeenDCacheWB) {
+ Modified = true;
+ BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ }
+ }
+ }
+ }
+
+ if (!MFI->isEntryFunction()) {
+ // Wait for any outstanding memory operations that the input registers may
+ // depend on. We can't track them and it's better to do the wait after the
+ // costly call sequence.
+
+ // TODO: Could insert earlier and schedule more liberally with operations
+ // that only use caller preserved registers.
+ MachineBasicBlock &EntryBB = MF.front();
+ BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ Modified = true;
+ }
+
+ return Modified;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index fceabd7..bc86515 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -21,16 +21,32 @@
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <new>
+#include <utility>
#define DEBUG_TYPE "si-insert-waits"
using namespace llvm;
-using namespace llvm::AMDGPU;
namespace {
@@ -42,7 +58,6 @@ typedef union {
unsigned LGKM;
} Named;
unsigned Array[3];
-
} Counters;
typedef enum {
@@ -55,13 +70,12 @@ typedef Counters RegCounters[512];
typedef std::pair<unsigned, unsigned> RegInterval;
class SIInsertWaits : public MachineFunctionPass {
-
private:
- const SISubtarget *ST;
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
+ const SISubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI;
- IsaVersion IV;
+ AMDGPU::IsaInfo::IsaVersion ISA;
/// \brief Constant zero value
static const Counters ZeroCounts;
@@ -86,7 +100,7 @@ private:
RegCounters DefinedRegs;
/// \brief Different export instruction types seen since last wait.
- unsigned ExpInstrTypesSeen;
+ unsigned ExpInstrTypesSeen = 0;
/// \brief Type of the last opcode.
InstType LastOpcodeType;
@@ -100,7 +114,7 @@ private:
bool ReturnsVoid;
/// Whether the VCCZ bit is possibly corrupt
- bool VCCZCorrupt;
+ bool VCCZCorrupt = false;
/// \brief Get increment/decrement amount for this instruction.
Counters getHwCounts(MachineInstr &MI);
@@ -141,13 +155,7 @@ private:
public:
static char ID;
- SIInsertWaits() :
- MachineFunctionPass(ID),
- ST(nullptr),
- TII(nullptr),
- TRI(nullptr),
- ExpInstrTypesSeen(0),
- VCCZCorrupt(false) { }
+ SIInsertWaits() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -161,7 +169,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
"SI Insert Waits", false, false)
@@ -208,8 +216,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
// XXX - What if this is a write into a super register?
const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
- unsigned Size = RC->getSize();
- Result.Named.LGKM = Size > 4 ? 2 : 1;
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ Result.Named.LGKM = Size > 32 ? 2 : 1;
} else {
// s_dcache_inv etc. do not have a destination register. Assume we
// want a wait on these.
@@ -281,12 +289,12 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
const MachineOperand &Reg) const {
- unsigned Size = RC->getSize();
- assert(Size >= 4);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ assert(Size >= 32);
RegInterval Result;
Result.first = TRI->getEncodingValue(Reg.getReg());
- Result.second = Result.first + Size / 4;
+ Result.second = Result.first + Size / 32;
return Result;
}
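This hunk is part of the commit-wide move from TargetRegisterClass::getSize(), measured in bytes, to TargetRegisterInfo::getRegSizeInBits(); the interval math is unchanged, only the divisor scales by 8. A quick compile-time check with a hypothetical 128-bit class:

    // Hypothetical 128-bit register class: both formulations yield 4 lanes.
    constexpr unsigned SizeInBytes = 16;  // old RC->getSize()
    constexpr unsigned SizeInBits = 128;  // new TRI->getRegSizeInBits(*RC)
    static_assert(SizeInBytes / 4 == SizeInBits / 32, "same register count");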
@@ -294,7 +302,6 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Increment) {
-
// Get the hardware counter increments and sum them up
Counters Limit = ZeroCounts;
unsigned Sum = 0;
@@ -366,7 +373,6 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Required) {
-
// End of program? No need to wait on anything
// A function not returning void needs to wait, because other bytecode will
// be appended after it and we don't know what it will be.
@@ -393,7 +399,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
bool NeedWait = false;
for (unsigned i = 0; i < 3; ++i) {
-
if (Required.Array[i] <= WaitedOn.Array[i])
continue;
@@ -421,10 +426,10 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
// Build the wait instruction
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(encodeWaitcnt(IV,
- Counts.Named.VM,
- Counts.Named.EXP,
- Counts.Named.LGKM));
+ .addImm(AMDGPU::encodeWaitcnt(ISA,
+ Counts.Named.VM,
+ Counts.Named.EXP,
+ Counts.Named.LGKM));
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -434,7 +439,6 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
/// \brief helper function for handleOperands
static void increaseCounters(Counters &Dst, const Counters &Src) {
-
for (unsigned i = 0; i < 3; ++i)
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
}
@@ -453,9 +457,9 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
unsigned Imm = I->getOperand(0).getImm();
Counters Counts, WaitOn;
- Counts.Named.VM = decodeVmcnt(IV, Imm);
- Counts.Named.EXP = decodeExpcnt(IV, Imm);
- Counts.Named.LGKM = decodeLgkmcnt(IV, Imm);
+ Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
+ Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
+ Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
for (unsigned i = 0; i < 3; ++i) {
if (Counts.Array[i] <= LastIssued.Array[i])
@@ -468,7 +472,6 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
}
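These decode helpers unpack the same immediate layout that AMDGPU::encodeWaitcnt packs in insertWait, so encode followed by decode should round-trip. A small sketch of that property; the counts are arbitrary in-range values chosen for illustration:

    #include <cassert>

    // Round-trip sketch with illustrative values; ISA is a target
    // AMDGPU::IsaInfo::IsaVersion as in the pass above.
    static void checkWaitcntRoundTrip(const AMDGPU::IsaInfo::IsaVersion &ISA) {
      unsigned Imm = AMDGPU::encodeWaitcnt(ISA, /*Vmcnt=*/0, /*Expcnt=*/3,
                                           /*Lgkmcnt=*/1);
      assert(AMDGPU::decodeVmcnt(ISA, Imm) == 0);
      assert(AMDGPU::decodeExpcnt(ISA, Imm) == 3);
      assert(AMDGPU::decodeLgkmcnt(ISA, Imm) == 1);
      (void)Imm;
    }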
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
-
Counters Result = ZeroCounts;
// For each register affected by this instruction increase the result
@@ -484,7 +487,6 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
RegInterval Interval = getRegInterval(RC, Op);
for (unsigned j = Interval.first; j < Interval.second; ++j) {
-
if (Op.isDef()) {
increaseCounters(Result, UsedRegs[j]);
increaseCounters(Result, DefinedRegs[j]);
@@ -522,6 +524,16 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
}
}
+/// Return true if \p MBB has one successor immediately following it, and
+/// \p MBB is that successor's only predecessor.
+static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
+ if (MBB.succ_size() != 1)
+ return false;
+
+ const MachineBasicBlock *Succ = *MBB.succ_begin();
+ return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
+}
+
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -531,12 +543,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
- IV = getIsaVersion(ST->getFeatureBits());
+ ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- HardwareLimits.Named.VM = getVmcntBitMask(IV);
- HardwareLimits.Named.EXP = getExpcntBitMask(IV);
- HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV);
+ HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
+ HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
+ HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
WaitedOn = ZeroCounts;
DelayedWaitOn = ZeroCounts;
@@ -618,7 +630,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// but we also want to wait for any other outstanding transfers before
// signalling other hardware blocks
if ((I->getOpcode() == AMDGPU::S_BARRIER &&
- ST->needWaitcntBeforeBarrier()) ||
+ !ST->hasAutoWaitcntBeforeBarrier()) ||
I->getOpcode() == AMDGPU::S_SENDMSG ||
I->getOpcode() == AMDGPU::S_SENDMSGHALT)
Required = LastIssued;
@@ -636,12 +648,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
handleSendMsg(MBB, I);
if (I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN)
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
EndPgmBlocks.push_back(&MBB);
}
- // Wait for everything at the end of the MBB
- Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+ // Wait for everything at the end of the MBB. If there is only one
+ // successor, we can defer the wait until the uses in that block.
+ if (!hasTrivialSuccessor(MBB))
+ Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
if (HaveScalarStores) {
@@ -665,7 +679,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// FIXME: It would be better to insert this before a waitcnt if any.
if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
- I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+ I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
Changes = true;
BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
}
@@ -676,5 +690,19 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
for (MachineInstr *I : RemoveMI)
I->eraseFromParent();
+ if (!MFI->isEntryFunction()) {
+ // Wait for any outstanding memory operations that the input registers may
+ // depend on. We can't track them and it's better to do the wait after the
+ // costly call sequence.
+
+ // TODO: Could insert earlier and schedule more liberally with operations
+ // that only use caller preserved registers.
+ MachineBasicBlock &EntryBB = MF.front();
+ BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ Changes = true;
+ }
+
return Changes;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 5523ec1..02c9b4b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -31,6 +31,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit VOP2 = 0;
field bit VOPC = 0;
field bit VOP3 = 0;
+ field bit VOP3P = 0;
field bit VINTRP = 0;
field bit SDWA = 0;
field bit DPP = 0;
@@ -78,6 +79,10 @@ class InstSI <dag outs, dag ins, string asm = "",
// is unable to infer the encoding from the operands.
field bit VOPAsmPrefer32Bit = 0;
+ // This bit indicates that the instruction has a floating point result
+ // type, so the clamp modifier has floating point semantics.
+ field bit FPClamp = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -92,6 +97,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{8} = VOP2;
let TSFlags{9} = VOPC;
let TSFlags{10} = VOP3;
+ let TSFlags{12} = VOP3P;
let TSFlags{13} = VINTRP;
let TSFlags{14} = SDWA;
@@ -120,6 +126,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
let TSFlags{41} = VOPAsmPrefer32Bit;
+ let TSFlags{42} = FPClamp;
let SchedRW = [Write32Bit];
@@ -131,19 +138,19 @@ class InstSI <dag outs, dag ins, string asm = "",
let AsmVariantName = AMDGPUAsmVariants.Default;
}
-class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : InstSI<outs, ins, "", pattern> {
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : InstSI<outs, ins, asm, pattern> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : PseudoInstSI<outs, ins, pattern> {
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : PseudoInstSI<outs, ins, pattern, asm> {
let SALU = 1;
}
-class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : PseudoInstSI<outs, ins, pattern> {
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : PseudoInstSI<outs, ins, pattern, asm> {
let VALU = 1;
let Uses = [EXEC];
}
@@ -221,10 +228,10 @@ class EXPe : Enc64 {
bits<1> compr;
bits<1> done;
bits<1> vm;
- bits<8> vsrc0;
- bits<8> vsrc1;
- bits<8> vsrc2;
- bits<8> vsrc3;
+ bits<8> src0;
+ bits<8> src1;
+ bits<8> src2;
+ bits<8> src3;
let Inst{3-0} = en;
let Inst{9-4} = tgt;
@@ -232,10 +239,10 @@ class EXPe : Enc64 {
let Inst{11} = done;
let Inst{12} = vm;
let Inst{31-26} = 0x3e;
- let Inst{39-32} = vsrc0;
- let Inst{47-40} = vsrc1;
- let Inst{55-48} = vsrc2;
- let Inst{63-56} = vsrc3;
+ let Inst{39-32} = src0;
+ let Inst{47-40} = src1;
+ let Inst{55-48} = src2;
+ let Inst{63-56} = src3;
}
let Uses = [EXEC] in {
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 26a8d22..a7e0feb 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -20,9 +20,10 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"
@@ -36,7 +37,7 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
- : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+ : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
@@ -137,6 +138,11 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
}
if (isSMRD(Opc0) && isSMRD(Opc1)) {
+ // Skip time and cache invalidation instructions.
+ if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
+ AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
+ return false;
+
assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
// Check base reg.
@@ -244,11 +250,11 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
unsigned EltSize;
if (LdSt.mayLoad())
- EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
+ EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
else {
assert(LdSt.mayStore());
int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
- EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
+ EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
}
if (isStride64(Opc))
@@ -315,7 +321,8 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
const MachineOperand *SecondDst = nullptr;
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
- (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+ (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
+ (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
} else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
@@ -343,7 +350,22 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
FirstLdSt.getParent()->getParent()->getRegInfo();
const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
- return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
+ return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
+}
+
+static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, bool KillSrc) {
+ MachineFunction *MF = MBB.getParent();
+ DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
+ "illegal SGPR to VGPR copy",
+ DL, DS_Error);
+ LLVMContext &C = MF->getFunction()->getContext();
+ C.diagnose(IllegalCopy);
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
}
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -369,7 +391,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+ if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
+
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
@@ -391,7 +417,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+ if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
+
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
@@ -408,15 +438,21 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
- if (RC->getSize() > 4) {
+ if (RI.getRegSizeInBits(*RC) > 32) {
Opcode = AMDGPU::S_MOV_B64;
EltSize = 8;
} else {
Opcode = AMDGPU::S_MOV_B32;
EltSize = 4;
}
+
+ if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
}
+
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
@@ -432,13 +468,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
- if (Idx == SubIndices.size() - 1)
- Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
-
if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
- Builder.addReg(SrcReg, RegState::Implicit);
+ bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
+ Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
}
}
@@ -460,13 +494,195 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
return Opcode;
}
+void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL, unsigned DestReg,
+ int64_t Value) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
+ if (RegClass == &AMDGPU::SReg_32RegClass ||
+ RegClass == &AMDGPU::SGPR_32RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0RegClass ||
+ RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::SReg_64RegClass ||
+ RegClass == &AMDGPU::SGPR_64RegClass ||
+ RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ if (RegClass == &AMDGPU::VGPR_32RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+ .addImm(Value);
+ return;
+ }
+ if (RegClass == &AMDGPU::VReg_64RegClass) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
+ .addImm(Value);
+ return;
+ }
+
+ unsigned EltSize = 4;
+ unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ if (RI.isSGPRClass(RegClass)) {
+ if (RI.getRegSizeInBits(*RegClass) > 32) {
+ Opcode = AMDGPU::S_MOV_B64;
+ EltSize = 8;
+ } else {
+ Opcode = AMDGPU::S_MOV_B32;
+ EltSize = 4;
+ }
+ }
+
+ ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
+ for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+ int64_t IdxValue = Idx == 0 ? Value : 0;
+
+ MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+ get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
+ Builder.addImm(IdxValue);
+ }
+}
+
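materializeImmediate dispatches directly for the common 32- and 64-bit classes and falls back to a per-subregister split for wider tuples, with the immediate landing in element 0 and the remaining elements zeroed. A hypothetical call site, assuming the usual pass members are in scope:

    // Hypothetical helper: materialize zero into a fresh 32-bit VGPR
    // before MI. The names TII, MBB, and MI mirror the surrounding code.
    static unsigned materializeZeroBefore(const SIInstrInfo *TII,
                                          MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MI) {
      MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
      unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      TII->materializeImmediate(MBB, MI, MI->getDebugLoc(), Tmp, /*Value=*/0);
      return Tmp;
    }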
+const TargetRegisterClass *
+SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
+ return &AMDGPU::VGPR_32RegClass;
+}
+
+void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, unsigned DstReg,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg,
+ unsigned FalseReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
+ "Not a VGPR32 reg");
+
+ if (Cond.size() == 1) {
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(Cond[0]);
+ } else if (Cond.size() == 2) {
+ assert(Cond[0].isImm() && "Cond[0] is not an immediate");
+ switch (Cond[0].getImm()) {
+ case SIInstrInfo::SCC_TRUE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::SCC_FALSE: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::VCCNZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::VCCZ: {
+ MachineOperand RegOp = Cond[1];
+ RegOp.setImplicit(false);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg)
+ .add(RegOp);
+ break;
+ }
+ case SIInstrInfo::EXECNZ: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(-1)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ break;
+ }
+ case SIInstrInfo::EXECZ: {
+ unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+ .addImm(0);
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+ .addImm(0)
+ .addImm(-1);
+ BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg)
+ .addReg(SReg);
+ llvm_unreachable("Unhandled branch predicate EXECZ");
+ break;
+ }
+ default:
+ llvm_unreachable("invalid branch predicate");
+ }
+ } else {
+ llvm_unreachable("Can only handle Cond size 1 or 2");
+ }
+}
+
+unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
+unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned SrcReg, int Value) const {
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
+ .addImm(Value)
+ .addReg(SrcReg);
+
+ return Reg;
+}
+
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
- if (DstRC->getSize() == 4) {
+ if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
- } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
+ } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
return AMDGPU::S_MOV_B64;
- } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
+ } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
return AMDGPU::V_MOV_B64_PSEUDO;
}
return AMDGPU::COPY;
@@ -526,17 +742,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
Size, Align);
+ unsigned SpillSize = TRI->getSpillSize(*RC);
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling SGPRs.
- const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
+ const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
// The SGPR spill/restore instructions only work on number sgprs, so we need
// to make sure we are using the correct register class.
- if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
@@ -546,14 +763,14 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
// Add the scratch resource registers as implicit uses because we may end up
// needing them, and need to ensure that the reserved registers are
// correctly handled.
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
}
return;
@@ -571,13 +788,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
- unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
+ unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+ .addReg(MFI->getFrameOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
}
@@ -629,6 +846,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
DebugLoc DL = MBB.findDebugLoc(MI);
unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
unsigned Size = FrameInfo.getObjectSize(FrameIndex);
+ unsigned SpillSize = TRI->getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
@@ -639,8 +857,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
if (RI.isSGPRClass(RC)) {
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
- const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
- if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
+ const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
+ if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
@@ -649,11 +867,11 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
- .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+ .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
if (ST.hasScalarStores()) {
// m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
+ Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
}
return;
@@ -670,12 +888,12 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
- unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
+ unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
- .addFrameIndex(FrameIndex) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
- .addImm(0) // offset
+ .addFrameIndex(FrameIndex) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(MFI->getFrameOffsetReg()) // scratch_offset
+ .addImm(0) // offset
.addMemOperand(MMO);
}
@@ -796,6 +1014,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
insertWaitStates(MBB, MI, 1);
}
+void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
+ auto MF = MBB.getParent();
+ SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ assert(Info->isEntryFunction());
+
+ if (MBB.succ_empty()) {
+ bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
+ if (HasNoTerminator)
+ BuildMI(MBB, MBB.end(), DebugLoc(),
+ get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
+ }
+}
+
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return 1; // FIXME: Do wait states equal cycles?
@@ -870,9 +1102,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineInstr *MovRel =
BuildMI(MBB, MI, DL, MovRelDesc)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(2))
.addReg(VecReg, RegState::ImplicitDefine)
- .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ .addReg(VecReg,
+ RegState::Implicit | (IsUndef ? RegState::Undef : 0));
const int ImpDefIdx =
MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
@@ -897,14 +1130,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// constant data.
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
.addReg(RegLo)
- .addOperand(MI.getOperand(1)));
+ .add(MI.getOperand(1)));
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi);
if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
MIB.addImm(0);
else
- MIB.addOperand(MI.getOperand(2));
+ MIB.add(MI.getOperand(2));
Bundler.append(MIB);
llvm::finalizeBundle(MBB, Bundler.begin());
@@ -1202,14 +1435,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
return false;
}
- BranchPredicate Pred = getBranchPredicate(I->getOpcode());
- if (Pred == INVALID_BR)
- return true;
+ MachineBasicBlock *CondBB = nullptr;
- MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
- Cond.push_back(MachineOperand::CreateImm(Pred));
- Cond.push_back(I->getOperand(1)); // Save the branch register.
+ if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ CondBB = I->getOperand(1).getMBB();
+ Cond.push_back(I->getOperand(0));
+ } else {
+ BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+ if (Pred == INVALID_BR)
+ return true;
+ CondBB = I->getOperand(0).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(Pred));
+ Cond.push_back(I->getOperand(1)); // Save the branch register.
+ }
++I;
if (I == MBB.end()) {
@@ -1290,6 +1529,13 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
return Count;
}
+// Copy the flags onto the implicit condition register operand.
+static void preserveCondRegFlags(MachineOperand &CondReg,
+ const MachineOperand &OrigCond) {
+ CondReg.setIsUndef(OrigCond.isUndef());
+ CondReg.setIsKill(OrigCond.isKill());
+}
+
unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
@@ -1305,6 +1551,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
return 1;
}
+ if (Cond.size() == 1 && Cond[0].isReg()) {
+ BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
+ .add(Cond[0])
+ .addMBB(TBB);
+ return 1;
+ }
+
assert(TBB && Cond[0].isImm());
unsigned Opcode
@@ -1317,9 +1570,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
.addMBB(TBB);
// Copy the flags onto the implicit condition register operand.
- MachineOperand &CondReg = CondBr->getOperand(1);
- CondReg.setIsUndef(Cond[1].isUndef());
- CondReg.setIsKill(Cond[1].isKill());
+ preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
if (BytesAdded)
*BytesAdded = 4;
@@ -1346,9 +1597,167 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
bool SIInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(Cond.size() == 2);
- Cond[0].setImm(-Cond[0].getImm());
- return false;
+ if (Cond.size() != 2) {
+ return true;
+ }
+
+ if (Cond[0].isImm()) {
+ Cond[0].setImm(-Cond[0].getImm());
+ return false;
+ }
+
+ return true;
+}
+
+bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
+ switch (Cond[0].getImm()) {
+ case VCCNZ:
+ case VCCZ: {
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+ assert(MRI.getRegClass(FalseReg) == RC);
+
+ int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+ CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+
+ // Limit to equal cost for branch vs. N v_cndmask_b32s.
+ return !RI.isSGPRClass(RC) && NumInsts <= 6;
+ }
+ case SCC_TRUE:
+ case SCC_FALSE: {
+ // FIXME: We could insert for VGPRs if we could replace the original compare
+ // with a vector one.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+ assert(MRI.getRegClass(FalseReg) == RC);
+
+ int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+
+ // Multiples of 8 bytes can use s_cselect_b64.
+ if (NumInsts % 2 == 0)
+ NumInsts /= 2;
+
+ CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+ return RI.isSGPRClass(RC);
+ }
+ default:
+ return false;
+ }
+}
+
+void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const {
+ BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
+ if (Pred == VCCZ || Pred == SCC_FALSE) {
+ Pred = static_cast<BranchPredicate>(-Pred);
+ std::swap(TrueReg, FalseReg);
+ }
+
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+ unsigned DstSize = RI.getRegSizeInBits(*DstRC);
+
+ if (DstSize == 32) {
+ unsigned SelOp = Pred == SCC_TRUE ?
+ AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
+
+ // Instruction's operands are backwards from what is expected.
+ MachineInstr *Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg);
+
+ preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+ return;
+ }
+
+ if (DstSize == 64 && Pred == SCC_TRUE) {
+ MachineInstr *Select =
+ BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg);
+
+ preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+ return;
+ }
+
+ static const int16_t Sub0_15[] = {
+ AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ };
+
+ static const int16_t Sub0_15_64[] = {
+ AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+ AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+ AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+ AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+ };
+
+ unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
+ const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
+ const int16_t *SubIndices = Sub0_15;
+ int NElts = DstSize / 32;
+
+ // 64-bit select is only available for SALU.
+ if (Pred == SCC_TRUE) {
+ SelOp = AMDGPU::S_CSELECT_B64;
+ EltRC = &AMDGPU::SGPR_64RegClass;
+ SubIndices = Sub0_15_64;
+
+ assert(NElts % 2 == 0);
+ NElts /= 2;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(
+ MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
+
+ I = MIB->getIterator();
+
+ SmallVector<unsigned, 8> Regs;
+ for (int Idx = 0; Idx != NElts; ++Idx) {
+ unsigned DstElt = MRI.createVirtualRegister(EltRC);
+ Regs.push_back(DstElt);
+
+ unsigned SubIdx = SubIndices[Idx];
+
+ MachineInstr *Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(FalseReg, 0, SubIdx)
+ .addReg(TrueReg, 0, SubIdx);
+ preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+
+ MIB.addReg(DstElt)
+ .addImm(SubIdx);
+ }
+}
+
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_MOV_B32_e32:
+ case AMDGPU::V_MOV_B32_e64:
+ case AMDGPU::V_MOV_B64_PSEUDO: {
+ // If there are additional implicit register operands, this may be used for
+ // register indexing so the source register operand isn't simply copied.
+ unsigned NumOps = MI.getDesc().getNumOperands() +
+ MI.getDesc().getNumImplicitUses();
+
+ return MI.getNumOperands() == NumOps;
+ }
+ case AMDGPU::S_MOV_B32:
+ case AMDGPU::S_MOV_B64:
+ case AMDGPU::COPY:
+ return true;
+ default:
+ return false;
+ }
}
static void removeModOperands(MachineInstr &MI) {
@@ -1400,15 +1809,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
- bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
-
- // Don't fold if we are using source modifiers. The new VOP2 instructions
- // don't have them.
- if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
- hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
- hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
+ // Don't fold if we are using source or output modifiers. The new VOP2
+ // instructions don't have them.
+ if (hasAnyModifiersSet(UseMI))
return false;
- }
const MachineOperand &ImmOp = DefMI.getOperand(1);
@@ -1421,6 +1825,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (isInlineConstant(UseMI, *Src0, ImmOp))
return false;
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -1617,10 +2022,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
IsF16 = true;
+ LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
IsF16 = true;
+ LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32: {
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::src0);
@@ -1633,20 +2040,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src0Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+ const MachineOperand *Src1Mods =
+ getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+ const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+ const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
return BuildMI(*MBB, MI, MI.getDebugLoc(),
get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
- .addOperand(*Dst)
- .addImm(0) // Src0 mods
- .addOperand(*Src0)
- .addImm(0) // Src1 mods
- .addOperand(*Src1)
+ .add(*Dst)
+ .addImm(Src0Mods ? Src0Mods->getImm() : 0)
+ .add(*Src0)
+ .addImm(Src1Mods ? Src1Mods->getImm() : 0)
+ .add(*Src1)
.addImm(0) // Src mods
- .addOperand(*Src2)
- .addImm(0) // clamp
- .addImm(0); // omod
+ .add(*Src2)
+ .addImm(Clamp ? Clamp->getImm() : 0)
+ .addImm(Omod ? Omod->getImm() : 0);
}
// It's not generally safe to move VALU instructions across these since it will
@@ -1687,7 +2100,8 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
case 16:
- return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
ST.hasInv2PiInlineImm());
default:
llvm_unreachable("invalid bitwidth");
@@ -1696,7 +2110,9 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
uint8_t OperandType) const {
- if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
+ if (!MO.isImm() ||
+ OperandType < AMDGPU::OPERAND_SRC_FIRST ||
+ OperandType > AMDGPU::OPERAND_SRC_LAST)
return false;
// MachineOperand provides no way to tell the true operand size, since it only
@@ -1705,24 +2121,43 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
// would be for any 32-bit integer operand, but would not be for a 64-bit one.
int64_t Imm = MO.getImm();
- switch (operandBitWidth(OperandType)) {
- case 32: {
+ switch (OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
int32_t Trunc = static_cast<int32_t>(Imm);
return Trunc == Imm &&
AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
- case 64: {
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
}
- case 16: {
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+ // A few special case instructions have 16-bit operands on subtargets
+ // where 16-bit instructions are not legal.
+ // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
+ // constants in these cases.
int16_t Trunc = static_cast<int16_t>(Imm);
- return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+ return ST.has16BitInsts() &&
+ AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
}
return false;
}
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+ uint32_t Trunc = static_cast<uint32_t>(Imm);
+ return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+ }
default:
llvm_unreachable("invalid bitwidth");
}
@@ -1801,6 +2236,14 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
return Mods && Mods->getImm();
}
+bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
+ return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
+ hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
+ hasModifiersSet(MI, AMDGPU::OpName::omod);
+}
+
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
const MachineOperand &MO,
const MCOperandInfo &OpInfo) const {
@@ -1890,7 +2333,12 @@ static bool isSubRegOf(const SIRegisterInfo &TRI,
bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
uint16_t Opcode = MI.getOpcode();
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
+ return true;
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+
int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
@@ -1989,8 +2437,77 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // Verify SDWA
+ if (isSDWA(MI)) {
+
+ if (!ST.hasSDWA()) {
+ ErrInfo = "SDWA is not supported on this target";
+ return false;
+ }
+
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
+
+ const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
+
+ for (int OpIdx : OpIndices) {
+ if (OpIdx == -1)
+ continue;
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+
+ if (!ST.hasSDWAScalar()) {
+ // Only VGPRs on VI
+ if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
+ ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
+ return false;
+ }
+ } else {
+ // No immediates on GFX9
+ if (!MO.isReg()) {
+ ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
+ return false;
+ }
+ }
+ }
+
+ if (!ST.hasSDWAOmod()) {
+ // No omod allowed on VI
+ const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod != nullptr &&
+ (!OMod->isImm() || OMod->getImm() != 0)) {
+ ErrInfo = "OMod not allowed in SDWA instructions on VI";
+ return false;
+ }
+ }
+
+ uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
+ if (isVOPC(BasicOpcode)) {
+ if (!ST.hasSDWASdst() && DstIdx != -1) {
+ // Only vcc allowed as dst on VI for VOPC
+ const MachineOperand &Dst = MI.getOperand(DstIdx);
+ if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
+ ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
+ return false;
+ }
+ } else if (!ST.hasSDWAOutModsVOPC()) {
+ // No clamp allowed on GFX9 for VOPC
+ const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+ if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
+ ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
+ return false;
+ }
+
+ // No omod allowed on GFX9 for VOPC
+ const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
+ ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
+ return false;
+ }
+ }
+ }
+ }
+
// Verify VOP*
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
+ if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
// Only look at the true operands. Only a real operand can use the constant
// bus, and we don't want to check pseudo-operands like the source modifier
// flags.
@@ -2120,6 +2637,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
+ const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
+ if (Offset->getImm() != 0) {
+ ErrInfo = "subtarget does not support offsets in flat instructions";
+ return false;
+ }
+ }
+
return true;
}
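How the hook is consumed, sketched after the MachineVerifier's usual shape
(the exact caller is assumed, not shown in this patch):

    StringRef ErrInfo;
    if (!TII->verifyInstruction(MI, ErrInfo))
      report(ErrInfo.data(), &MI); // verifier emits the target's message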
@@ -2238,7 +2763,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
unsigned Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
- BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
+ BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
MO.ChangeToRegister(Reg, false);
}
@@ -2417,6 +2942,19 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
return;
+ // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
+ // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
+ // select is uniform.
+ if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
+ RI.isVGPR(MRI, Src1.getReg())) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ return;
+ }
+
// We do not use commuteInstruction here because it is too aggressive and will
// commute if it is possible. We only want to commute here if it improves
// legality. This can be called a fairly large number of times so don't waste
@@ -2511,7 +3049,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
unsigned DstReg = MRI.createVirtualRegister(SRC);
- unsigned SubRegs = VRC->getSize() / 4;
+ unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
SmallVector<unsigned, 8> SRegs;
for (unsigned i = 0; i < SubRegs; ++i) {
@@ -2564,8 +3102,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
return;
unsigned DstReg = MRI.createVirtualRegister(DstRC);
- MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg)
- .addOperand(Op);
+ MachineInstr *Copy =
+ BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
Op.setReg(DstReg);
Op.setSubReg(0);
@@ -2810,13 +3348,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
// Regular buffer load / store.
MachineInstrBuilder MIB =
BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
+ .add(*VData)
.addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
// This will be replaced later
// with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset);
+ .add(*SRsrc)
+ .add(*SOffset)
+ .add(*Offset);
// Atomics do not have this operand.
if (const MachineOperand *GLC =
@@ -2836,14 +3374,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
} else {
// Atomics with return.
Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
- .addOperand(*VData)
- .addOperand(*VDataIn)
+ .add(*VData)
+ .add(*VDataIn)
.addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
// This will be replaced later
// with the new value of vaddr.
- .addOperand(*SRsrc)
- .addOperand(*SOffset)
- .addOperand(*Offset)
+ .add(*SRsrc)
+ .add(*SOffset)
+ .add(*Offset)
.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
@@ -2870,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
}
void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
- SmallVector<MachineInstr *, 128> Worklist;
- Worklist.push_back(&TopInst);
+ SetVectorType Worklist;
+ Worklist.insert(&TopInst);
while (!Worklist.empty()) {
MachineInstr &Inst = *Worklist.pop_back_val();
@@ -2970,6 +3508,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
case AMDGPU::S_BFE_U64:
case AMDGPU::S_BFM_B64:
llvm_unreachable("Moving this op to VALU not implemented");
+
+ case AMDGPU::S_PACK_LL_B32_B16:
+ case AMDGPU::S_PACK_LH_B32_B16:
+ case AMDGPU::S_PACK_HH_B32_B16: {
+ movePackToVALU(Worklist, MRI, Inst);
+ Inst.eraseFromParent();
+ continue;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -3027,12 +3573,15 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
unsigned NewDstReg = AMDGPU::NoRegister;
if (HasDst) {
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ continue;
+
// Update the destination register class.
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
if (!NewDstRC)
continue;
- unsigned DstReg = Inst.getOperand(0).getReg();
if (Inst.isCopy() &&
TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
@@ -3061,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
}
}
-void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3086,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
}
void SIInstrInfo::splitScalar64BitUnaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3112,15 +3661,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub0)
- .addOperand(SrcReg0Sub0);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
- BuildMI(MBB, MII, DL, InstDesc, DestSub1)
- .addOperand(SrcReg0Sub1);
+ BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -3139,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
}
void SIInstrInfo::splitScalar64BitBinaryOp(
- SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
+ SetVectorType &Worklist, MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3174,8 +3721,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
- .addOperand(SrcReg0Sub0)
- .addOperand(SrcReg1Sub0);
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
@@ -3184,8 +3731,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
- .addOperand(SrcReg0Sub1)
- .addOperand(SrcReg1Sub1);
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1);
unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -3206,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
}
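In scalar terms, the per-half split above computes, for a 64-bit OR (sketch):

    DestLo = Src0Lo | Src1Lo;   // low half, fully independent
    DestHi = Src0Hi | Src1Hi;   // high half, fully independent

which is only correct for bitwise opcodes; operations whose halves interact
(adds with carry, shifts) go through dedicated expansion paths instead.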
void SIInstrInfo::splitScalar64BitBCNT(
- SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
+ SetVectorType &Worklist, MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3231,13 +3778,9 @@ void SIInstrInfo::splitScalar64BitBCNT(
MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
AMDGPU::sub1, SrcSubRC);
- BuildMI(MBB, MII, DL, InstDesc, MidReg)
- .addOperand(SrcRegSub0)
- .addImm(0);
+ BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
- BuildMI(MBB, MII, DL, InstDesc, ResultReg)
- .addOperand(SrcRegSub1)
- .addReg(MidReg);
+ BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
MRI.replaceRegWith(Dest.getReg(), ResultReg);
@@ -3246,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT(
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -3310,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
void SIInstrInfo::addUsersToMoveToVALUWorklist(
unsigned DstReg,
MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const {
+ SetVectorType &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
E = MRI.use_end(); I != E;) {
MachineInstr &UseMI = *I->getParent();
if (!canReadVGPR(UseMI, I.getOperandNo())) {
- Worklist.push_back(&UseMI);
+ Worklist.insert(&UseMI);
do {
++I;
@@ -3326,8 +3869,70 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
+void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
+ MachineRegisterInfo &MRI,
+ MachineInstr &Inst) const {
+ unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineBasicBlock *MBB = Inst.getParent();
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ switch (Inst.getOpcode()) {
+ case AMDGPU::S_PACK_LL_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
+ // 0.
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff);
+
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
+ .addReg(ImmReg, RegState::Kill)
+ .add(Src0);
+
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
+ .add(Src1)
+ .addImm(16)
+ .addReg(TmpReg, RegState::Kill);
+ break;
+ }
+ case AMDGPU::S_PACK_LH_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
+ .addReg(ImmReg, RegState::Kill)
+ .add(Src0)
+ .add(Src1);
+ break;
+ }
+ case AMDGPU::S_PACK_HH_B32_B16: {
+ unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Src0);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+ .addImm(0xffff0000);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
+ .add(Src1)
+ .addReg(ImmReg, RegState::Kill)
+ .addReg(TmpReg, RegState::Kill);
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled s_pack_* instruction");
+ }
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
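What the three expansions above compute, as reference C (a sketch; the helper
names are illustrative):

    #include <cstdint>
    static uint32_t packLL(uint32_t S0, uint32_t S1) { return (S0 & 0xffffu) | (S1 << 16); }
    static uint32_t packLH(uint32_t S0, uint32_t S1) { return (S0 & 0xffffu) | (S1 & 0xffff0000u); }
    static uint32_t packHH(uint32_t S0, uint32_t S1) { return (S0 >> 16)     | (S1 & 0xffff0000u); }

V_BFI_B32 with mask 0xffff implements the LH case as a single bitfield select;
the other two cases use the and/shift/or combinations shown.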
+
void SIInstrInfo::addSCCDefUsersToVALUWorklist(
- MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
+ MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
// This assumes that all the users of SCC are in the same block
// as the SCC def.
for (MachineInstr &MI :
@@ -3338,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(
return;
if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
- Worklist.push_back(&MI);
+ Worklist.insert(&MI);
}
}
@@ -3448,10 +4053,13 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
- RsrcDataFormat |= (1ULL << 56);
+ // Set ATC = 1. GFX9 doesn't have this bit.
+ if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
+ RsrcDataFormat |= (1ULL << 56);
- if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
- // Set MTYPE = 2
+ // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
+ // BTW, it disables TC L2 and therefore decreases performance.
+ if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
RsrcDataFormat |= (2ULL << 59);
}
@@ -3463,11 +4071,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
AMDGPU::RSRC_TID_ENABLE |
0xffffffff; // Size;
- uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+ // GFX9 doesn't have ELEMENT_SIZE.
+ if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
+ uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+ Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
+ }
- Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
- // IndexStride = 64
- (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+ // IndexStride = 64.
+ Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
@@ -3496,7 +4107,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
return AMDGPU::NoRegister;
assert(!MI.memoperands_empty() &&
- (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+ (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
FrameIndex = Addr->getIndex();
return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -3552,16 +4163,11 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
if (DescSize != 0 && DescSize != 4)
return DescSize;
- if (Opc == AMDGPU::WAVE_BARRIER)
- return 0;
-
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that could ever be literals.
if (isVALU(MI) || isSALU(MI)) {
- if (isFixedSize(MI)) {
- assert(DescSize == 4);
+ if (isFixedSize(MI))
return DescSize;
- }
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
@@ -3584,7 +4190,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
return 4;
switch (Opc) {
- case AMDGPU::SI_MASK_BRANCH:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
@@ -3609,12 +4214,88 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
return true;
for (const MachineMemOperand *MMO : MI.memoperands()) {
- if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+ if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
return true;
}
return false;
}
+bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
+ return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
+}
+
+void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const {
+ MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
+ assert(TI != IfEntry->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = IfEntry->getParent();
+ MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstr *SIIF =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
+ .add(Branch->getOperand(0))
+ .add(Branch->getOperand(1));
+ MachineInstr *SIEND =
+ BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
+ .addReg(DstReg);
+
+ IfEntry->erase(TI);
+ IfEntry->insert(IfEntry->end(), SIIF);
+ IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
+ }
+}
+
+void SIInstrInfo::convertNonUniformLoopRegion(
+ MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
+ MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
+ // We expect 2 terminators, one conditional and one unconditional.
+ assert(TI != LoopEnd->end());
+
+ MachineInstr *Branch = &(*TI);
+ MachineFunction *MF = LoopEnd->getParent();
+ MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
+
+ if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+
+ unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ MachineInstrBuilder HeaderPHIBuilder =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
+ for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
+ E = LoopEntry->pred_end();
+ PI != E; ++PI) {
+ if (*PI == LoopEnd) {
+ HeaderPHIBuilder.addReg(BackEdgeReg);
+ } else {
+ MachineBasicBlock *PMBB = *PI;
+ unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
+ ZeroReg, 0);
+ HeaderPHIBuilder.addReg(ZeroReg);
+ }
+ HeaderPHIBuilder.addMBB(*PI);
+ }
+ MachineInstr *HeaderPhi = HeaderPHIBuilder;
+ MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
+ get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
+ .addReg(DstReg)
+ .add(Branch->getOperand(0));
+ MachineInstr *SILOOP =
+ BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
+ .addReg(BackEdgeReg)
+ .addMBB(LoopEntry);
+
+ LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
+ LoopEnd->erase(TI);
+ LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
+ LoopEnd->insert(LoopEnd->end(), SILOOP);
+ }
+}
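The rewrite above as a control-flow sketch (block and value names are
illustrative):

    header: %mask = PHI [ 0, %preheader ], [ %be, %latch ]
    ...
    latch:  %be = SI_IF_BREAK %mask, %cond
            SI_LOOP %be, %header

replacing the original SI_NON_UNIFORM_BRCOND_PSEUDO terminator, so the loop
keeps iterating until every lane's exit condition has accumulated into %be.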
+
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
static const std::pair<int, const char *> TargetIndices[] = {
@@ -3640,3 +4321,39 @@ ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
return new GCNHazardRecognizer(MF);
}
+
+std::pair<unsigned, unsigned>
+SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
+}
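A note on the decomposition (inferred from the code): the low MO_MASK bits
carry the direct target flag (one of the MO_* values in the header below),
and the remaining bits would be a bit-mask component; SIInstrInfo defines no
mask-style flags, so the second pair element is expected to be 0 in practice.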
+
+ArrayRef<std::pair<unsigned, const char *>>
+SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ static const std::pair<unsigned, const char *> TargetFlags[] = {
+ { MO_GOTPCREL, "amdgpu-gotprel" },
+ { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
+ { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
+ { MO_REL32_LO, "amdgpu-rel32-lo" },
+ { MO_REL32_HI, "amdgpu-rel32-hi" }
+ };
+
+ return makeArrayRef(TargetFlags);
+}
+
+bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
+ return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
+ MI.modifiesRegister(AMDGPU::EXEC, &RI);
+}
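Intent, inferred from the predicate: non-terminator, non-copy instructions at
the top of a block that write EXEC are treated as part of the block prologue,
so generic code (e.g. spill insertion) places new instructions after the
mask setup rather than between it and the code it guards.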
+
+MachineInstrBuilder
+SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned DestReg) const {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+ unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead);
+}
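Caller shape, sketched (the source operands are assumed): the returned builder
already carries the dead 64-bit carry def, so callers append only the sources:

    TII->getAddNoCarry(MBB, I, DL, DestReg)
        .addReg(SrcReg)
        .addImm(Offset);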
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e68f6f9..3dd5bc8 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -19,6 +19,7 @@
#include "AMDGPUInstrInfo.h"
#include "SIDefines.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/SetVector.h"
namespace llvm {
@@ -38,6 +39,8 @@ private:
EXECZ = 3
};
+ typedef SmallSetVector<MachineInstr *, 32> SetVectorType;
+
static unsigned getBranchOpcode(BranchPredicate Cond);
static BranchPredicate getBranchPredicate(unsigned Opcode);
@@ -56,27 +59,30 @@ private:
void swapOperands(MachineInstr &Inst) const;
- void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
+ void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
- void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitUnaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
- void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitBinaryOp(SetVectorType &Worklist,
MachineInstr &Inst, unsigned Opcode) const;
- void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitBCNT(SetVectorType &Worklist,
MachineInstr &Inst) const;
- void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
+ void splitScalar64BitBFE(SetVectorType &Worklist,
MachineInstr &Inst) const;
+ void movePackToVALU(SetVectorType &Worklist,
+ MachineRegisterInfo &MRI,
+ MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(
unsigned Reg, MachineRegisterInfo &MRI,
- SmallVectorImpl<MachineInstr *> &Worklist) const;
+ SetVectorType &Worklist) const;
void
addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst,
- SmallVectorImpl<MachineInstr *> &Worklist) const;
+ SetVectorType &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
@@ -97,6 +103,8 @@ protected:
public:
enum TargetOperandFlags {
+ MO_MASK = 0x7,
+
MO_NONE = 0,
// MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL.
MO_GOTPCREL = 1,
@@ -140,6 +148,23 @@ public:
RegScavenger *RS, unsigned TmpReg,
unsigned Offset, unsigned Size) const;
+ void materializeImmediate(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL,
+ unsigned DestReg,
+ int64_t Value) const;
+
+ const TargetRegisterClass *getPreferredSelectRegClass(
+ unsigned Size) const;
+
+ unsigned insertNE(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
+ unsigned insertEQ(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned SrcReg, int Value) const;
+
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned SrcReg,
bool isKill, int FrameIndex,
@@ -190,7 +215,7 @@ public:
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
- bool AllowModify) const override;
+ bool AllowModify = false) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
@@ -203,10 +228,29 @@ public:
bool reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ bool canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg,
+ int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const override;
+
+ void insertSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const override;
+
+ void insertVectorSelect(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ unsigned DstReg, ArrayRef<MachineOperand> Cond,
+ unsigned TrueReg, unsigned FalseReg) const;
+
bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
+ bool isFoldableCopy(const MachineInstr &MI) const;
+
bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
MachineRegisterInfo *MRI) const final;
@@ -308,6 +352,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VOP3;
}
+ static bool isSDWA(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
+ }
+
+ bool isSDWA(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::SDWA;
+ }
+
static bool isVOPC(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VOPC;
}
@@ -420,6 +472,22 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::DPP;
}
+ static bool isVOP3P(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VOP3P;
+ }
+
+ bool isVOP3P(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
+ }
+
+ static bool isVINTRP(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
+ }
+
+ bool isVINTRP(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
+ }
+
static bool isScalarUnit(const MachineInstr &MI) {
return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
}
@@ -454,6 +522,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
}
+ static bool hasFPClamp(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp;
+ }
+
+ bool hasFPClamp(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
@@ -462,28 +538,6 @@ public:
return !RI.isSGPRReg(MRI, Dest);
}
- static int operandBitWidth(uint8_t OperandType) {
- switch (OperandType) {
- case AMDGPU::OPERAND_REG_IMM_INT32:
- case AMDGPU::OPERAND_REG_IMM_FP32:
- case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32:
- return 32;
- case AMDGPU::OPERAND_REG_IMM_INT64:
- case AMDGPU::OPERAND_REG_IMM_FP64:
- case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- case AMDGPU::OPERAND_REG_INLINE_C_FP64:
- return 64;
- case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
- return 16;
- default:
- llvm_unreachable("unexpected operand type");
- }
- }
-
bool isInlineConstant(const APInt &Imm) const;
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
@@ -571,6 +625,7 @@ public:
bool hasModifiersSet(const MachineInstr &MI,
unsigned OpName) const;
+ bool hasAnyModifiersSet(const MachineInstr &MI) const;
bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const override;
@@ -598,13 +653,13 @@ public:
return 4;
}
- return RI.getRegClass(OpInfo.RegClass)->getSize();
+ return RI.getRegSizeInBits(*RI.getRegClass(OpInfo.RegClass)) / 8;
}
/// \brief This form should usually be preferred since it handles operands
/// with unknown register classes.
unsigned getOpSize(const MachineInstr &MI, unsigned OpNo) const {
- return getOpRegClass(MI, OpNo)->getSize();
+ return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
}
/// \returns true if it is legal for the operand at index \p OpNo
@@ -677,6 +732,7 @@ public:
void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ void insertReturn(MachineBasicBlock &MBB) const;
/// \brief Return the number of wait states that result from executing this
/// instruction.
unsigned getNumWaitStates(const MachineInstr &MI) const;
@@ -722,15 +778,40 @@ public:
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
+ bool isNonUniformBranchInstr(MachineInstr &Instr) const;
+
+ void convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+ MachineBasicBlock *IfEnd) const;
+
+ void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry,
+ MachineBasicBlock *LoopEnd) const;
+
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const override;
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAG *DAG) const override;
ScheduleHazardRecognizer *
CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
+
+ bool isBasicBlockPrologue(const MachineInstr &MI) const override;
+
+ /// \brief Return a partially built integer add instruction without carry.
+ /// Caller must add source operands.
+ /// For pre-GFX9 it will generate an unused carry destination operand.
+ /// TODO: After GFX9 it should return a no-carry operation.
+ MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ unsigned DestReg) const;
};
namespace AMDGPU {
@@ -741,6 +822,12 @@ namespace AMDGPU {
int getVOPe32(uint16_t Opcode);
LLVM_READONLY
+ int getSDWAOp(uint16_t Opcode);
+
+ LLVM_READONLY
+ int getBasicFromSDWAOp(uint16_t Opcode);
+
+ LLVM_READONLY
int getCommuteRev(uint16_t Opcode);
LLVM_READONLY
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index ebaefae..0881736 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -20,6 +20,8 @@ def SIEncodingFamily {
int NONE = -1;
int SI = 0;
int VI = 1;
+ int SDWA = 2;
+ int SDWA9 = 3;
}
//===----------------------------------------------------------------------===//
@@ -39,25 +41,41 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
-def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
- SDTypeProfile<0, 13,
- [SDTCisVT<0, v4i32>, // rsrc(SGPR)
- SDTCisVT<1, iAny>, // vdata(VGPR)
- SDTCisVT<2, i32>, // num_channels(imm)
- SDTCisVT<3, i32>, // vaddr(VGPR)
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
+ SDTypeProfile<1, 9,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
- SDTCisVT<5, i32>, // inst_offset(imm)
+ SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // dfmt(imm)
SDTCisVT<7, i32>, // nfmt(imm)
- SDTCisVT<8, i32>, // offen(imm)
- SDTCisVT<9, i32>, // idxen(imm)
- SDTCisVT<10, i32>, // glc(imm)
- SDTCisVT<11, i32>, // slc(imm)
- SDTCisVT<12, i32> // tfe(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
]>,
- [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+ [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
+def SDTtbuffer_store : SDTypeProfile<0, 10,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // vindex(VGPR)
+ SDTCisVT<3, i32>, // voffset(VGPR)
+ SDTCisVT<4, i32>, // soffset(SGPR)
+ SDTCisVT<5, i32>, // offset(imm)
+ SDTCisVT<6, i32>, // dfmt(imm)
+ SDTCisVT<7, i32>, // nfmt(imm)
+ SDTCisVT<8, i32>, // glc(imm)
+ SDTCisVT<9, i32> // slc(imm)
+ ]>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+def SItbuffer_store_x3 : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT_X3",
+ SDTtbuffer_store,
+ [SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
+
def SDTBufferLoad : SDTypeProfile<1, 5,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
@@ -71,11 +89,6 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
-def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
- SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
- SDTCisVT<3, i32>]>
->;
-
class SDSample<string opcode> : SDNode <opcode,
SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
@@ -107,7 +120,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
>;
def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
- return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
@@ -144,7 +157,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore,
def si_st_local : PatFrag <
(ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+ return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
}]>;
def si_store_local : PatFrag <
@@ -196,6 +209,21 @@ def si_uniform_br_scc : PatFrag <
return isCBranchSCC(N);
}]>;
+def lshr_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (srl $src0, $src1)
+>;
+
+def ashr_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (sra $src0, $src1)
+>;
+
+def lshl_rev : PatFrag <
+ (ops node:$src1, node:$src0),
+ (shl $src0, $src1)
+>;
+
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
def _glue : SDNode <
@@ -266,10 +294,6 @@ def SIMM16bit : PatLeaf <(imm),
[{return isInt<16>(N->getSExtValue());}]
>;
-def IMM20bit : PatLeaf <(imm),
- [{return isUInt<20>(N->getZExtValue());}]
->;
-
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return isInlineImmediate(N);
}]>;
@@ -299,6 +323,23 @@ class VGPRImm <dag frag> : PatLeaf<frag, [{
return Limit < 10;
}]>;
+def NegateImm : SDNodeXForm<imm, [{
+ return CurDAG->getConstant(-N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+// TODO: Extend to FP types once FP inline imm values work.
+def NegSubInlineConst32 : ImmLeaf<i32, [{
+ return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
+def NegSubInlineConst16 : ImmLeaf<i16, [{
+ return Imm < -16 && Imm >= -64;
+}], NegateImm>;
+
+def ShiftAmt32Imm : PatLeaf <(imm), [{
+ return N->getZExtValue() < 32;
+}]>;
+
//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -364,6 +405,14 @@ def SendMsgMatchClass : AsmOperandClass {
let RenderMethod = "addImmOperands";
}
+def SwizzleMatchClass : AsmOperandClass {
+ let Name = "Swizzle";
+ let PredicateMethod = "isSwizzle";
+ let ParserMethod = "parseSwizzleOp";
+ let RenderMethod = "addImmOperands";
+ let IsOptional = 1;
+}
+
def ExpTgtMatchClass : AsmOperandClass {
let Name = "ExpTgt";
let PredicateMethod = "isExpTgt";
@@ -376,6 +425,11 @@ def SendMsgImm : Operand<i32> {
let ParserMatchClass = SendMsgMatchClass;
}
+def SwizzleImm : Operand<i16> {
+ let PrintMethod = "printSwizzle";
+ let ParserMatchClass = SwizzleMatchClass;
+}
+
def SWaitMatchClass : AsmOperandClass {
let Name = "SWaitCnt";
let RenderMethod = "addImmOperands";
@@ -420,6 +474,27 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
let ParserMatchClass = VReg32OrOffClass;
}
+class SDWASrc : RegisterOperand<VS_32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_SDWA_SRC";
+ let EncoderMethod = "getSDWASrcEncoding";
+}
+
+def SDWASrc32 : SDWASrc {
+ let DecoderMethod = "decodeSDWASrc32";
+}
+
+def SDWASrc16 : SDWASrc {
+ let DecoderMethod = "decodeSDWASrc16";
+}
+
+def SDWAVopcDst : VOPDstOperand<SReg_64> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_SDWA_VOPC_DST";
+ let EncoderMethod = "getSDWAVopcDstEncoding";
+ let DecoderMethod = "decodeSDWAVopcDst";
+}
+
class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
let Name = "Imm"#CName;
let PredicateMethod = "is"#CName;
@@ -439,22 +514,40 @@ class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> {
let ParserMatchClass = MatchClass;
}
+class NamedOperandU12<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
}
+class NamedOperandS13<string Name, AsmOperandClass MatchClass> : Operand<i16> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
let PrintMethod = "print"#Name;
let ParserMatchClass = MatchClass;
}
+class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> :
+ OperandWithDefaultOps<i32, (ops (i32 0))> {
+ let PrintMethod = "print"#Name;
+ let ParserMatchClass = MatchClass;
+}
+
let OperandType = "OPERAND_IMMEDIATE" in {
def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>;
def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>;
+def offset_u12 : NamedOperandU12<"Offset", NamedMatchClass<"OffsetU12">>;
+def offset_s13 : NamedOperandS13<"OffsetS13", NamedMatchClass<"OffsetS13">>;
def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>;
def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>;
def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>;
@@ -474,6 +567,9 @@ def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>;
+def DFMT : NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>;
+def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>;
+
def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>;
def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
@@ -486,6 +582,11 @@ def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
+def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>;
+def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
+def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
+def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+
def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
@@ -525,6 +626,7 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass {
let ParserMethod = "parseRegOrImmWithFPInputMods";
let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
}
+
def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
@@ -557,6 +659,16 @@ class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass>
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
+def FPRegSDWAInputModsMatchClass : AsmOperandClass {
+ let Name = "SDWARegWithFPInputMods";
+ let ParserMethod = "parseRegWithFPInputMods";
+ let PredicateMethod = "isSDWARegKind";
+}
+
+def FPRegSDWAInputMods : InputMods <FPRegSDWAInputModsMatchClass> {
+ let PrintMethod = "printOperandAndFPInputMods";
+}
+
def FPVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithFPInputMods";
let ParserMethod = "parseRegWithFPInputMods";
@@ -567,6 +679,17 @@ def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
+
+def IntRegSDWAInputModsMatchClass : AsmOperandClass {
+ let Name = "SDWARegWithIntInputMods";
+ let ParserMethod = "parseRegWithIntInputMods";
+ let PredicateMethod = "isSDWARegKind";
+}
+
+def IntRegSDWAInputMods : InputMods <IntRegSDWAInputModsMatchClass> {
+ let PrintMethod = "printOperandAndIntInputMods";
+}
+
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
let ParserMethod = "parseRegWithIntInputMods";
@@ -577,6 +700,33 @@ def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
+class PackedFPInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "PackedFP"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImm";
+ let PredicateMethod = "isRegOrImm";
+// let PredicateMethod = "isPackedFP"#opSize#"InputMods";
+}
+
+class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass {
+ let Name = "PackedInt"#opSize#"InputMods";
+ let ParserMethod = "parseRegOrImm";
+ let PredicateMethod = "isRegOrImm";
+// let PredicateMethod = "isPackedInt"#opSize#"InputMods";
+}
+
+def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>;
+def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>;
+
+class PackedFPInputMods <PackedFPInputModsMatchClass matchClass> : InputMods <matchClass> {
+// let PrintMethod = "printPackedFPInputMods";
+}
+
+class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <matchClass> {
+ // let PrintMethod = "printPackedIntInputMods";
+}
+
+def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>;
+def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
//===----------------------------------------------------------------------===//
// Complex patterns
@@ -588,11 +738,18 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">;
def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
-def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
-def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
+def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
+// VOP3Mods, but the input source is known to never be NaN.
+def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
+
+def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
+
+def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
+def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">;
+
//===----------------------------------------------------------------------===//
// SI assembler operands
@@ -604,19 +761,32 @@ def SIOperand {
int FLAT_SCR = 0x68;
}
+// This should be kept in sync with SISrcMods enum
def SRCMODS {
int NONE = 0;
int NEG = 1;
+ int ABS = 2;
+ int NEG_ABS = 3;
+
+ int NEG_HI = ABS;
+ int OP_SEL_0 = 4;
+ int OP_SEL_1 = 8;
}
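The C++ side these values mirror (assumed shape of the SISrcMods enum in
SIDefines.h, reproduced here for reference only):

    namespace SISrcMods {
      enum {
        NEG = 1 << 0,   // floating-point negate
        ABS = 1 << 1,   // floating-point absolute value
        NEG_ABS = NEG | ABS,
        NEG_HI = ABS,   // the ABS bit doubles as neg_hi on packed ops
        OP_SEL_0 = 1 << 2,
        OP_SEL_1 = 1 << 3
      };
    } // namespace SISrcMods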
def DSTCLAMP {
int NONE = 0;
+ int ENABLE = 1;
}
def DSTOMOD {
int NONE = 0;
}
def TRAPID {
+ int LLVM_TRAP = 2;
+ int LLVM_DEBUG_TRAP = 3;
+}
+
//===----------------------------------------------------------------------===//
//
// SI Instruction multiclass helpers.
@@ -648,8 +818,9 @@ class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
exp_vm:$vm, exp_compr:$compr, i8imm:$en),
"exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
- [(node (i8 timm:$en), (i1 timm:$vm), (i8 timm:$tgt), (i1 timm:$compr),
- f32:$src0, f32:$src1, f32:$src2, f32:$src3)]> {
+ [(node (i8 timm:$tgt), (i8 timm:$en),
+ f32:$src0, f32:$src1, f32:$src2, f32:$src3,
+ (i1 timm:$compr), (i1 timm:$vm))]> {
let AsmMatchConverter = "cvtExp";
}
@@ -666,6 +837,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> {
def _si : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.SI>,
EXPe {
+ let AssemblerPredicates = [isSICI];
let DecoderNamespace = "SICI";
let DisableDecoder = DisableSIDecoder;
}
@@ -673,6 +845,7 @@ multiclass EXP_m<bit done, SDPatternOperator node> {
def _vi : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.VI>,
EXPe_vi {
+ let AssemblerPredicates = [isVI];
let DecoderNamespace = "VI";
let DisableDecoder = DisableVIDecoder;
}
@@ -702,16 +875,46 @@ class getVALUDstForVT<ValueType VT> {
VOPDstOperand<SReg_64>)))); // else VT == i1
}
+// Returns the register class to use for the destination of VOP[12C]
+// instructions with SDWA extension
+class getSDWADstForVT<ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 1),
+ SDWAVopcDst, // VOPC
+ VOPDstOperand<VGPR_32>); // VOP1/2 32-bit dst
+}
+
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, v2f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
- 0)));
- RegisterOperand ret = !if(isFP,
- !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
- !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
+ 0))));
+
+ RegisterOperand ret =
+ !if(isFP,
+ !if(!eq(VT.Size, 64),
+ VSrc_f64,
+ !if(!eq(VT.Value, f16.Value),
+ VSrc_f16,
+ !if(!eq(VT.Value, v2f16.Value),
+ VCSrc_v2f16,
+ VSrc_f32
+ )
+ )
+ ),
+ !if(!eq(VT.Size, 64),
+ VSrc_b64,
+ !if(!eq(VT.Value, i16.Value),
+ VSrc_b16,
+ !if(!eq(VT.Value, v2i16.Value),
+ VCSrc_v2b16,
+ VSrc_b32
+ )
+ )
+ )
+ );
}
// Returns the vreg register class to use for source operand given VT
@@ -720,30 +923,46 @@ class getVregSrcForVT<ValueType VT> {
!if(!eq(VT.Size, 64), VReg_64, VGPR_32));
}
+class getSDWASrcForVT <ValueType VT> {
+ RegisterOperand ret = !if(!eq(VT.Size, 16), SDWASrc16, SDWASrc32);
+}
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, v2f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
- 0)));
+ 0))));
RegisterOperand ret =
!if(!eq(VT.Size, 128),
- VSrc_128,
- !if(!eq(VT.Size, 64),
+ VSrc_128,
+ !if(!eq(VT.Size, 64),
!if(isFP,
- VCSrc_f64,
- VCSrc_b64),
+ VCSrc_f64,
+ VCSrc_b64),
!if(!eq(VT.Value, i1.Value),
- SCSrc_b64,
- !if(isFP,
- !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
- !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
- )
- )
- )
- );
+ SCSrc_b64,
+ !if(isFP,
+ !if(!eq(VT.Value, f16.Value),
+ VCSrc_f16,
+ !if(!eq(VT.Value, v2f16.Value),
+ VCSrc_v2f16,
+ VCSrc_f32
+ )
+ ),
+ !if(!eq(VT.Value, i16.Value),
+ VCSrc_b16,
+ !if(!eq(VT.Value, v2i16.Value),
+ VCSrc_v2b16,
+ VCSrc_b32
+ )
+ )
+ )
+ )
+ )
+ );
}
// Returns 1 if the source arguments have modifiers, 0 if they do not.
@@ -753,7 +972,8 @@ class isFloatType<ValueType SrcVT> {
!if(!eq(SrcVT.Value, f16.Value), 1,
!if(!eq(SrcVT.Value, f32.Value), 1,
!if(!eq(SrcVT.Value, f64.Value), 1,
- 0)));
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ 0))));
}
class isIntType<ValueType SrcVT> {
@@ -764,6 +984,23 @@ class isIntType<ValueType SrcVT> {
0)));
}
+class isPackedType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, v2i16.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
+ );
+}
+
+// Float or packed int
+class isModifierType<ValueType SrcVT> {
+ bit ret =
+ !if(!eq(SrcVT.Value, f16.Value), 1,
+ !if(!eq(SrcVT.Value, f32.Value), 1,
+ !if(!eq(SrcVT.Value, f64.Value), 1,
+ !if(!eq(SrcVT.Value, v2f16.Value), 1,
+ !if(!eq(SrcVT.Value, v2i16.Value), 1,
+ 0)))));
+}
// Return type of input modifiers operand for specified input operand
class getSrcMod <ValueType VT> {
@@ -771,6 +1008,7 @@ class getSrcMod <ValueType VT> {
!if(!eq(VT.Value, f32.Value), 1,
!if(!eq(VT.Value, f64.Value), 1,
0)));
+ bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),
!if(isFP,
@@ -782,7 +1020,7 @@ class getSrcMod <ValueType VT> {
);
}
-// Return type of input modifiers operand specified input operand for SDWA/DPP
+// Return type of input modifiers operand for specified input operand for DPP
class getSrcModExt <ValueType VT> {
bit isFP = !if(!eq(VT.Value, f16.Value), 1,
!if(!eq(VT.Value, f32.Value), 1,
@@ -791,6 +1029,15 @@ class getSrcModExt <ValueType VT> {
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
+// Return type of input modifiers operand for specified input operand for SDWA
+class getSrcModSDWA <ValueType VT> {
+ bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+ !if(!eq(VT.Value, f32.Value), 1,
+ !if(!eq(VT.Value, f64.Value), 1,
+ 0)));
+ Operand ret = !if(isFP, FPRegSDWAInputMods, IntRegSDWAInputMods);
+}
+
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1
@@ -801,8 +1048,8 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
// Returns the input arguments for VOP3 instructions for the given SrcVT.
class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
RegisterOperand Src2RC, int NumSrcArgs,
- bit HasModifiers, Operand Src0Mod, Operand Src1Mod,
- Operand Src2Mod> {
+ bit HasModifiers, bit HasOMod,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
dag ret =
!if (!eq(NumSrcArgs, 0),
@@ -821,9 +1068,13 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (!eq(NumSrcArgs, 2),
!if (!eq(HasModifiers, 1),
// VOP 2 with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp, omod:$omod)
+ !if( !eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, omod:$omod),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp))
/* else */,
// VOP2 without modifiers
(ins Src0RC:$src0, Src1RC:$src1)
@@ -831,16 +1082,57 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
/* NumSrcArgs == 3 */,
!if (!eq(HasModifiers, 1),
// VOP3 with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp, omod:$omod)
+ !if (!eq(HasOMod, 1),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp, omod:$omod),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp))
/* else */,
// VOP3 without modifiers
(ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
/* endif */ ))));
}
+/// XXX - src1 may only allow VGPRs?
+
+// The modifiers (except clamp) are dummy operands for the benefit of
+// printing and parsing. They defer their values to looking at the
+// srcN_modifiers for what to print.
+class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs,
+ bit HasClamp,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ dag ret = !if (!eq(NumSrcArgs, 2),
+ !if (HasClamp,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi)),
+ // else NumSrcArgs == 3
+ !if (HasClamp,
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ clampmod:$clamp,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi),
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2,
+ op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+ neg_lo:$neg_lo, neg_hi:$neg_hi))
+ );
+}
+
class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
@@ -874,37 +1166,67 @@ class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
/* endif */)));
}
-class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
- bit HasFloatModifiers, Operand Src0Mod, Operand Src1Mod,
+
+
+// Ins for SDWA
+class getInsSDWA <RegisterOperand Src0RC, RegisterOperand Src1RC, int NumSrcArgs,
+ bit HasSDWAOMod, Operand Src0Mod, Operand Src1Mod,
ValueType DstVT> {
dag ret = !if(!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
(ins),
!if(!eq(NumSrcArgs, 1),
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel),
+ // VOP1
+ !if(!eq(HasSDWAOMod, 0),
+ // VOP1_SDWA without omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel),
+ // VOP1_SDWA with omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel)),
!if(!eq(NumSrcArgs, 2),
!if(!eq(DstVT.Size, 1),
- // VOPC_SDWA with modifiers
+ // VOPC_SDWA
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel),
- // VOP2_SDWA or VOPC_SDWA with modifiers
- (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
- src0_sel:$src0_sel, src1_sel:$src1_sel)),
+ // VOP2_SDWA
+ !if(!eq(HasSDWAOMod, 0),
+ // VOP2_SDWA without omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel),
+ // VOP2_SDWA with omod
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
+ src0_sel:$src0_sel, src1_sel:$src1_sel))),
(ins)/* endif */)));
}
// Outs for DPP and SDWA
-class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> {
+class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCExt> {
dag ret = !if(HasDst,
!if(!eq(DstVT.Size, 1),
(outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions
- (outs DstRCDPP:$vdst)),
+ (outs DstRCExt:$vdst)),
+ (outs)); // V_NOP
+}
+
+// Outs for SDWA
+class getOutsSDWA <bit HasDst, ValueType DstVT, RegisterOperand DstRCSDWA> {
+ dag ret = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ (outs DstRCSDWA:$sdst),
+ (outs DstRCSDWA:$vdst)),
(outs)); // V_NOP
}
@@ -924,7 +1246,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
// Returns the assembly string for the inputs and outputs of a VOP3
// instruction.
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers,
+ bit HasOMod, ValueType DstVT = i32> {
string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
string src1 = !if(!eq(NumSrcArgs, 1), "",
@@ -934,7 +1257,26 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
string ret =
!if(!eq(HasModifiers, 0),
getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
- dst#", "#src0#src1#src2#"$clamp"#"$omod");
+ dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", ""));
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP3P
+// instruction.
+class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
+ bit HasClamp, ValueType DstVT = i32> {
+ string dst = " $vdst";
+ string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+ string src1 = !if(!eq(NumSrcArgs, 1), "",
+ !if(!eq(NumSrcArgs, 2), " $src1",
+ " $src1,"));
+ string src2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
+
+ string mods = !if(HasModifiers, "$neg_lo$neg_hi", "");
+ string clamp = !if(HasClamp, "$clamp", "");
+
+ // Each modifier is printed as an array of bits for each operand, so
+ // all operands are printed as part of src0_modifiers.
+ string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp;
}
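
For reference, a sketch of the operand string getAsmVOP3P concatenates
(a standalone restatement, not LLVM code):

    #include <string>

    // Rough equivalent of getAsmVOP3P for up to three sources.
    std::string vop3pAsm(int NumSrcArgs, bool HasModifiers, bool HasClamp) {
      std::string S = " $vdst, $src0";
      if (NumSrcArgs >= 2) S += ", $src1";
      if (NumSrcArgs == 3) S += ", $src2";
      S += "$op_sel$op_sel_hi";                // op_sel is always printed
      if (HasModifiers) S += "$neg_lo$neg_hi";
      if (HasClamp)     S += "$clamp";
      return S;
    }
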
class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
@@ -953,8 +1295,7 @@ class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
}
-class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
- ValueType DstVT = i32> {
+class getAsmSDWA <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
" vcc", // use vcc token as dst for VOPC instructioins
@@ -982,6 +1323,35 @@ class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers,
string ret = dst#args#sdwa;
}
+class getAsmSDWA9 <bit HasDst, bit HasOMod, int NumSrcArgs,
+ ValueType DstVT = i32> {
+ string dst = !if(HasDst,
+ !if(!eq(DstVT.Size, 1),
+ "$sdst", // VOPC
+ "$vdst"), // VOP1/2
+ "");
+ string src0 = "$src0_modifiers";
+ string src1 = "$src1_modifiers";
+ string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod");
+ string args = !if(!eq(NumSrcArgs, 0), "",
+ !if(!eq(NumSrcArgs, 1),
+ ", "#src0,
+ ", "#src0#", "#src1
+ )
+ );
+ string sdwa = !if(!eq(NumSrcArgs, 0), "",
+ !if(!eq(NumSrcArgs, 1),
+ out_mods#" $dst_sel $dst_unused $src0_sel",
+ !if(!eq(DstVT.Size, 1),
+ " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC
+ out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel"
+ )
+ )
+ );
+ string ret = dst#args#sdwa;
+}
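
Concretely, the class above yields strings along these lines for GFX9 SDWA
(illustrative constants only):

    // VOP2 form, FP destination (omod present):
    const char *Vop2Sdwa9 =
        "$vdst, $src0_modifiers, $src1_modifiers"
        "$clamp$omod $dst_sel $dst_unused $src0_sel $src1_sel";
    // VOPC form: the compare writes $sdst, so dst_sel/dst_unused/omod drop out.
    const char *VopcSdwa9 =
        "$sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
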
+
+
// Function that checks if instruction supports DPP and SDWA
class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32> {
@@ -1018,7 +1388,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field ValueType Src2VT = ArgVT[3];
field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret;
field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret;
- field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret;
+ field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret;
field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
@@ -1026,16 +1396,16 @@ class VOPProfile <list<ValueType> _ArgVT> {
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
- field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret;
- field RegisterClass Src1SDWA = getVregSrcForVT<Src1VT>.ret;
+ field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
+ field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
field Operand Src0Mod = getSrcMod<Src0VT>.ret;
field Operand Src1Mod = getSrcMod<Src1VT>.ret;
field Operand Src2Mod = getSrcMod<Src2VT>.ret;
field Operand Src0ModDPP = getSrcModExt<Src0VT>.ret;
field Operand Src1ModDPP = getSrcModExt<Src1VT>.ret;
- field Operand Src0ModSDWA = getSrcModExt<Src0VT>.ret;
- field Operand Src1ModSDWA = getSrcModExt<Src1VT>.ret;
-
+ field Operand Src0ModSDWA = getSrcModSDWA<Src0VT>.ret;
+ field Operand Src1ModSDWA = getSrcModSDWA<Src1VT>.ret;
+
field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1);
field bit HasDst32 = HasDst;
@@ -1046,7 +1416,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
  // TODO: Modifiers logic is somewhat ad hoc here, to be refined later
- field bit HasModifiers = isFloatType<Src0VT>.ret;
+ field bit HasModifiers = isModifierType<Src0VT>.ret;
field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
@@ -1060,11 +1430,21 @@ class VOPProfile <list<ValueType> _ArgVT> {
field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
- field bit HasOMod = HasModifiers;
field bit HasClamp = HasModifiers;
- field bit HasSDWAClamp = HasSrc0;
+ field bit HasSDWAClamp = EmitDst;
+ field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
+
+ field bit IsPacked = isPackedType<Src0VT>.ret;
+ field bit HasOpSel = IsPacked;
+ field bit HasOMod = !if(HasOpSel, 0, isFloatType<DstVT>.ret);
+ field bit HasSDWAOMod = isFloatType<DstVT>.ret;
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
+ field bit HasSDWA9 = HasExt;
+
+ field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
+ field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
+ field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods);
field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));
@@ -1073,25 +1453,34 @@ class VOPProfile <list<ValueType> _ArgVT> {
field dag Outs32 = Outs;
field dag Outs64 = Outs;
field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret;
- field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCSDWA>.ret;
+ field dag OutsSDWA = getOutsSDWA<HasDst, DstVT, DstRCSDWA>.ret;
field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+ HasModifiers, HasOMod, Src0Mod, Src1Mod,
+ Src2Mod>.ret;
+ field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
+ NumSrcArgs, HasClamp,
+ Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
+
field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
- HasModifiers, Src0ModSDWA, Src1ModSDWA,
+ HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,
DstVT>.ret;
+
field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
- field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;
+ field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;
field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
- field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+ field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, DstVT>.ret;
+ field string AsmSDWA9 = getAsmSDWA9<HasDst, HasSDWAOMod, NumSrcArgs, DstVT>.ret;
}
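
The flag wiring above encodes one rule worth calling out: packed (v2f16/v2i16)
profiles get op_sel and therefore lose omod, while SDWA keeps omod for any FP
result. A boolean restatement of that dependency (assumed standalone sketch):

    // Restates the VOPProfile flag logic, nothing more.
    struct ProfileFlags {
      bool IsPacked, HasOpSel, HasOMod, HasSDWAOMod;
    };

    ProfileFlags computeFlags(bool Src0IsPacked, bool DstIsFloat) {
      ProfileFlags F;
      F.IsPacked    = Src0IsPacked;
      F.HasOpSel    = F.IsPacked;                 // packed ops select halves
      F.HasOMod     = !F.HasOpSel && DstIsFloat;  // op_sel excludes omod
      F.HasSDWAOMod = DstIsFloat;                 // SDWA omod for FP results
      return F;
    }
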
class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> {
let HasExt = 0;
+ let HasSDWA9 = 0;
}
def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>;
@@ -1101,11 +1490,20 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
-def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
-def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
+def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
+def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
+
+def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
+def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+
+def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
+
def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
@@ -1117,6 +1515,8 @@ def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
+def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -1126,6 +1526,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
@@ -1213,6 +1614,24 @@ def getVOPe32 : InstrMapping {
let ValueCols = [["4", "0"]];
}
+// Maps ordinary instructions to their SDWA counterparts
+def getSDWAOp : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"];
+ let ColFields = ["AsmVariantName"];
+ let KeyCol = ["Default"];
+ let ValueCols = [["SDWA"]];
+}
+
+// Maps SDWA instructions to their ordinary counterparts
+def getBasicFromSDWAOp : InstrMapping {
+ let FilterClass = "VOP";
+ let RowFields = ["OpName"];
+ let ColFields = ["AsmVariantName"];
+ let KeyCol = ["SDWA"];
+ let ValueCols = [["Default"]];
+}
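
TableGen turns each InstrMapping into a generated lookup table plus accessor in
AMDGPUGenInstrInfo.inc. A hedged usage sketch; the generated names are assumed
to follow the def names above, and -1 conventionally means no counterpart:

    // Illustrative only; relies on the TableGen-generated mapping tables.
    int toSDWA(unsigned Opc)   { return AMDGPU::getSDWAOp(Opc); }
    int fromSDWA(unsigned Opc) { return AMDGPU::getBasicFromSDWAOp(Opc); }
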
+
def getMaskedMIMGOp : InstrMapping {
let FilterClass = "MIMG_Mask";
let RowFields = ["Op"];
@@ -1245,7 +1664,9 @@ def getMCOpcodeGen : InstrMapping {
let ColFields = ["Subtarget"];
let KeyCol = [!cast<string>(SIEncodingFamily.NONE)];
let ValueCols = [[!cast<string>(SIEncodingFamily.SI)],
- [!cast<string>(SIEncodingFamily.VI)]];
+ [!cast<string>(SIEncodingFamily.VI)],
+ [!cast<string>(SIEncodingFamily.SDWA)],
+ [!cast<string>(SIEncodingFamily.SDWA9)]];
}
// Get equivalent SOPK instruction.
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
index 38e31e7..ba69e42 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -94,6 +94,12 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
+def ATOMIC_FENCE : SPseudoInstSI<
+ (outs), (ins i32imm:$ordering, i32imm:$scope),
+ [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))],
+ "ATOMIC_FENCE $ordering, $scope"> {
+ let hasSideEffects = 1;
+}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
@@ -146,6 +152,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
let mayStore = 1;
let isBarrier = 1;
let isConvergent = 1;
+ let FixedSize = 1;
+ let Size = 0;
}
// SI pseudo instructions. These are used by the CFG structurizer pass
@@ -153,48 +161,51 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
// Dummy terminator instruction to use after control flow instructions
// replaced with exec mask operations.
-def SI_MASK_BRANCH : PseudoInstSI <
+def SI_MASK_BRANCH : VPseudoInstSI <
(outs), (ins brtarget:$target)> {
let isBranch = 0;
let isTerminator = 1;
let isBarrier = 0;
- let Uses = [EXEC];
let SchedRW = [];
let hasNoSchedulingInfo = 1;
+ let FixedSize = 1;
+ let Size = 0;
}
let isTerminator = 1 in {
+ def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
+ (outs),
+ (ins SReg_64:$vcc, brtarget:$target),
+ [(brcond i1:$vcc, bb:$target)]> {
+ let Size = 12;
+}
+
def SI_IF: CFPseudoInstSI <
(outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
- [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
+ [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
let Constraints = "";
let Size = 12;
- let mayLoad = 1;
- let mayStore = 1;
let hasSideEffects = 1;
}
def SI_ELSE : CFPseudoInstSI <
- (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+ (outs SReg_64:$dst),
+ (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
let Constraints = "$src = $dst";
let Size = 12;
- let mayStore = 1;
- let mayLoad = 1;
let hasSideEffects = 1;
}
def SI_LOOP : CFPseudoInstSI <
(outs), (ins SReg_64:$saved, brtarget:$target),
- [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
+ [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
let Size = 8;
- let isBranch = 1;
+ let isBranch = 0;
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
}
-} // End isBranch = 1, isTerminator = 1
+} // End isTerminator = 1
def SI_END_CF : CFPseudoInstSI <
(outs), (ins SReg_64:$saved),
@@ -202,9 +213,9 @@ def SI_END_CF : CFPseudoInstSI <
let Size = 4;
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
- let mayLoad = 1;
- let mayStore = 1;
let hasSideEffects = 1;
+ let mayLoad = 1; // FIXME: Should not need memory flags
+ let mayStore = 1;
}
def SI_BREAK : CFPseudoInstSI <
@@ -244,6 +255,10 @@ def SI_KILL_TERMINATOR : SPseudoInstSI <
let isTerminator = 1;
}
+def SI_ILLEGAL_COPY : SPseudoInstSI <
+ (outs unknown:$dst), (ins unknown:$src),
+ [], " ; illegal copy $src to $dst">;
+
} // End Uses = [EXEC], Defs = [EXEC,VCC]
// Branch on undef scc. Used to avoid intermediate copy from
@@ -259,6 +274,14 @@ def SI_PS_LIVE : PseudoInstSI <
let SALU = 1;
}
+def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
+ [(int_amdgcn_unreachable)],
+ "; divergent unreachable"> {
+ let Size = 0;
+ let hasNoSchedulingInfo = 1;
+ let FixedSize = 1;
+}
+
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
@@ -270,12 +293,25 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
let isReMaterializable = 1;
}
-def SI_RETURN : SPseudoInstSI <
- (outs), (ins variable_ops), [(AMDGPUreturn)]> {
+def SI_INIT_EXEC : SPseudoInstSI <
+ (outs), (ins i64imm:$src), []> {
+ let Defs = [EXEC];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+}
+
+def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
+ (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
+ let Defs = [EXEC];
+ let usesCustomInserter = 1;
+}
+
+// Return for returning shaders to a shader variant epilog.
+def SI_RETURN_TO_EPILOG : SPseudoInstSI <
+ (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
let isTerminator = 1;
let isBarrier = 1;
let isReturn = 1;
- let hasSideEffects = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
}
@@ -383,9 +419,23 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
+def : Pat <
+ (AMDGPUinit_exec i64:$src),
+ (SI_INIT_EXEC (as_i64imm $src))
+>;
+
+def : Pat <
+ (AMDGPUinit_exec_from_input i32:$input, i32:$shift),
+ (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
+>;
+
+def : Pat<
+ (AMDGPUtrap timm:$trapid),
+ (S_TRAP $trapid)
+>;
def : Pat<
- (int_amdgcn_else i64:$src, bb:$target),
+ (AMDGPUelse i64:$src, bb:$target),
(SI_ELSE $src, $target, 0)
>;
@@ -423,24 +473,37 @@ def : Pat <
} // End Predicates = [UnsafeFPMath]
+
+// f16_to_fp patterns
def : Pat <
- (f32 (fpextend f16:$src)),
- (V_CVT_F32_F16_e32 $src)
+ (f32 (f16_to_fp i32:$src0)),
+ (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
- (f64 (fpextend f16:$src)),
- (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
+ (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
- (f16 (fpround f32:$src)),
- (V_CVT_F16_F32_e32 $src)
+ (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+ (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
- (f16 (fpround f64:$src)),
- (V_CVT_F16_F32_e32 (V_CVT_F32_F64_e32 $src))
+ (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+ (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
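
These patterns work because IEEE half keeps its sign in bit 15: masking with
0x7fff is fabs, or'ing in 0x8000 forces the sign on (neg-abs), and xor'ing it
toggles the sign (fneg), so the integer op folds into a source modifier on the
conversion. The raw bit identities in C++:

    #include <cstdint>

    uint16_t habs(uint16_t h)    { return h & 0x7fff; }  // |h|
    uint16_t hnegabs(uint16_t h) { return h | 0x8000; }  // -|h|
    uint16_t hneg(uint16_t h)    { return h ^ 0x8000; }  // -h
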
+
+def : Pat <
+ (f64 (fpextend f16:$src)),
+ (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
+>;
+
+// fp_to_fp16 patterns
+def : Pat <
+ (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
def : Pat <
@@ -469,17 +532,27 @@ def : Pat <
multiclass FMADPat <ValueType vt, Instruction inst> {
def : Pat <
- (vt (fmad (VOP3NoMods0 vt:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
- (VOP3NoMods vt:$src1, i32:$src1_modifiers),
- (VOP3NoMods vt:$src2, i32:$src2_modifiers))),
- (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- $src2_modifiers, $src2, $clamp, $omod)
+ (vt (fmad (VOP3NoMods vt:$src0),
+ (VOP3NoMods vt:$src1),
+ (VOP3NoMods vt:$src2))),
+ (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
}
defm : FMADPat <f16, V_MAC_F16_e64>;
defm : FMADPat <f32, V_MAC_F32_e64>;
+class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat<
+ (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
+ (VOP3Mods f32:$src1, i32:$src1_mod),
+ (VOP3Mods f32:$src2, i32:$src2_mod))),
+ (inst $src0_mod, $src0, $src1_mod, $src1,
+ $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
+
multiclass SelectPat <ValueType vt, Instruction inst> {
def : Pat <
(vt (select i1:$src0, vt:$src1, vt:$src2)),
@@ -578,6 +651,16 @@ def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
+def : BitConvert <v2i16, i32, SReg_32>;
+def : BitConvert <i32, v2i16, SReg_32>;
+def : BitConvert <v2f16, i32, SReg_32>;
+def : BitConvert <i32, v2f16, SReg_32>;
+def : BitConvert <v2i16, v2f16, SReg_32>;
+def : BitConvert <v2f16, v2i16, SReg_32>;
+def : BitConvert <v2f16, f32, SReg_32>;
+def : BitConvert <f32, v2f16, SReg_32>;
+def : BitConvert <v2i16, f32, SReg_32>;
+def : BitConvert <f32, v2i16, SReg_32>;
// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
@@ -619,12 +702,19 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
/********** Src & Dst modifiers **********/
/********** =================== **********/
-def : Pat <
- (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
- (f32 FP_ZERO), (f32 FP_ONE)),
- (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
+
+// If denormals are not enabled, it only impacts the compare of the
+// inputs. The output result is not flushed.
+class ClampPat<Instruction inst, ValueType vt> : Pat <
+ (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
+ (inst i32:$src0_modifiers, vt:$src0,
+ i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
>;
+def : ClampPat<V_MAX_F32_e64, f32>;
+def : ClampPat<V_MAX_F64, f64>;
+def : ClampPat<V_MAX_F16_e64, f16>;
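
DSTCLAMP.ENABLE on v_max x, x passes the value through the max and then clamps
the result to [0, 1]. Reference semantics as a sketch (ignoring the NaN and
denormal corner cases the comment above is about):

    #include <algorithm>

    // What the ClampPat instances compute, scalar reference version.
    float clamp01(float x) {
      return std::min(std::max(x, 0.0f), 1.0f);  // hw: v_max + clamp bit
    }
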
+
/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/
@@ -678,6 +768,37 @@ def : Pat <
>;
def : Pat <
+ (fcopysign f16:$src0, f16:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
+>;
+
+def : Pat <
+ (fcopysign f32:$src0, f16:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+ (V_LSHLREV_B32_e64 (i32 16), $src1))
+>;
+
+def : Pat <
+ (fcopysign f64:$src0, f16:$src1),
+ (REG_SEQUENCE SReg_64,
+ (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
+>;
+
+def : Pat <
+ (fcopysign f16:$src0, f32:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_LSHRREV_B32_e64 (i32 16), $src1))
+>;
+
+def : Pat <
+ (fcopysign f16:$src0, f64:$src1),
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0,
+ (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
+>;
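
V_BFI_B32 mask, a, b computes (mask & a) | (~mask & b), so with a magnitude
mask it splices the magnitude of src0 with the sign of src1; the lshl/lshr by
16 in the mixed-width patterns move the f16 sign (bit 15) into or out of the
f32/f64 sign position (bit 31 of the high word). Bit-level sketch of the
f16/f16 case:

    #include <cstdint>

    // copysign on raw f16 bits: magnitude of a, sign of b.
    uint16_t copysign_f16(uint16_t a, uint16_t b) {
      // V_BFI_B32 with mask 0x7fff: (mask & a) | (~mask & b).
      return (a & 0x7fff) | (b & 0x8000);
    }
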
+
+def : Pat <
(fneg f16:$src),
(V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x00008000)))
>;
@@ -692,6 +813,25 @@ def : Pat <
(S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
+def : Pat <
+ (fneg v2f16:$src),
+ (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+>;
+
+def : Pat <
+ (fabs v2f16:$src),
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+>;
+
+// This is really (fneg (fabs v2f16:$src))
+//
+// fabs is not reported as free because there is a modifier for it in
+// VOP3P instructions, so it is turned into the bit op.
+def : Pat <
+ (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
+ (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+>;
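
For packed v2f16 the same sign games run lane-wise on one 32-bit register:
0x80008000 covers both sign bits and 0x7fff7fff both magnitudes. Sketch:

    #include <cstdint>

    uint32_t v2f16_neg(uint32_t v)    { return v ^ 0x80008000u; }  // flip signs
    uint32_t v2f16_abs(uint32_t v)    { return v & 0x7fff7fffu; }  // clear signs
    uint32_t v2f16_negabs(uint32_t v) { return v | 0x80008000u; }  // set signs
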
+
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
@@ -759,27 +899,6 @@ def : Pat <
def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
def : Pat <
- (int_AMDGPU_cube v4f32:$src),
- (REG_SEQUENCE VReg_128,
- (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub0,
- (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub1,
- (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub2,
- (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
- 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
- 0 /* clamp */, 0 /* omod */), sub3)
->;
-
-def : Pat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
>;
@@ -810,6 +929,14 @@ def : UMad24Pat<V_MAD_U32_U24>;
defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>;
def : ROTRPattern <V_ALIGNBIT_B32>;
+def : Pat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
+
+def : Pat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
+ (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
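
V_ALIGNBIT_B32 hi, lo, s returns bits [s+31:s] of the 64-bit concatenation
hi:lo, which for s in 0..31 is exactly trunc(srl i64, s); that is what the two
patterns above exploit. C++ reference:

    #include <cstdint>

    // Bits [s+31 : s] of hi:lo, for s in [0, 31].
    uint32_t alignbit(uint32_t hi, uint32_t lo, unsigned s) {
      uint64_t v = ((uint64_t)hi << 32) | lo;
      return (uint32_t)(v >> (s & 31));
    }
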
+
/********** ====================== **********/
/********** Indirect addressing **********/
/********** ====================== **********/
@@ -933,7 +1060,7 @@ def : Pat <
class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
(i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
- (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
+ (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
@@ -985,6 +1112,11 @@ def : Pat <
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
+def : Pat <
+ (i32 (AMDGPUfp16_zext f16:$src)),
+ (COPY $src)
+>;
+
def : Pat <
(i32 (trunc i64:$a)),
@@ -1028,24 +1160,72 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
-def : BFEPattern <V_BFE_U32, S_MOV_B32>;
+def : Pat<
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+>;
def : Pat<
- (fcanonicalize f16:$src),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
+ (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
>;
def : Pat<
- (fcanonicalize f32:$src),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
>;
def : Pat<
- (fcanonicalize f64:$src),
- (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+>;
+
+
+// Allow integer inputs
+class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
+ (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
+ (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
>;
+def : ExpPattern<AMDGPUexport, i32, EXP>;
+def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
+
+def : Pat <
+ (v2i16 (build_vector i16:$src0, i16:$src1)),
+ (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// With multiple uses of the shift, this will duplicate the shift and
+// increase register pressure.
+def : Pat <
+ (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+ (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
+>;
+
+def : Pat <
+ (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
+ (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+ (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
+>;
+
+// TODO: Should source modifiers be matched to v_pack_b32_f16?
+def : Pat <
+ (v2f16 (build_vector f16:$src0, f16:$src1)),
+ (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
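
The S_PACK_* family packs two 16-bit halves into one SGPR; the LH/HH variants
take their halves from the top of a 32-bit source, which is why the patterns
match a one-use srl by 16. Reference behavior:

    #include <cstdint>

    uint32_t s_pack_ll(uint16_t lo, uint16_t hi) {
      return (uint32_t)lo | ((uint32_t)hi << 16);
    }
    uint32_t s_pack_lh(uint16_t lo, uint32_t hiSrc) {
      return (uint32_t)lo | (hiSrc & 0xffff0000u);    // high half of hiSrc
    }
    uint32_t s_pack_hh(uint32_t loSrc, uint32_t hiSrc) {
      return (loSrc >> 16) | (hiSrc & 0xffff0000u);   // both high halves
    }
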
+
+// def : Pat <
+// (v2f16 (scalar_to_vector f16:$src0)),
+// (COPY $src0)
+// >;
+
+// def : Pat <
+// (v2i16 (scalar_to_vector i16:$src0)),
+// (COPY $src0)
+// >;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -1083,11 +1263,39 @@ def : Pat <
// Miscellaneous Optimization Patterns
//============================================================================//
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+ (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+ (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
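
The motivation: SI inline immediates only span -16..64, so for c in 17..64 the
canonical add x, -c would need a 32-bit literal while s_sub_i32 x, c encodes c
inline. An illustrative version of the check (the real predicate and operand
negation live in the NegSubInlineConst32 TableGen leaf):

    #include <cstdint>

    // Would 'add x, n' (n negative) be better emitted as 's_sub_i32 x, -n'?
    bool betterAsSub(int32_t n) {
      auto isInline = [](int32_t v) { return v >= -16 && v <= 64; };
      return !isInline(n) && isInline(-n);  // e.g. n = -64: sub x, 64 is inline
    }
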
+
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+// This matches 16 permutations of
+// max(min(x, y), min(max(x, y), z))
+class FPMed3Pat<ValueType vt,
+ Instruction med3Inst> : Pat<
+ (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+ (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+ (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
+ (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FPMed3Pat<f32, V_MED3_F32>;
+
+let Predicates = [isGFX9] in {
+def : FPMed3Pat<f16, V_MED3_F16>;
+def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
+def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+} // End Predicates = [isGFX9]
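
max(min(x, y), min(max(x, y), z)) is the classic branch-free median of three;
the 16 permutations come from commuting each min/max pair and the operand
modifiers. Reference implementation:

    #include <algorithm>

    // Median of three, the shape FPMed3Pat/IntMed3Pat match.
    template <typename T> T med3(T x, T y, T z) {
      return std::max(std::min(x, y), std::min(std::max(x, y), z));
    }
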
+
//============================================================================//
// Assembler aliases
//============================================================================//
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
index 5da3754..7b7cf16 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIIntrinsics.td
@@ -14,23 +14,7 @@
let TargetPrefix = "SI", isTarget = 1 in {
- def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
-
- def int_SI_export : Intrinsic <[],
- [llvm_i32_ty, // en
- llvm_i32_ty, // vm (FIXME: should be i1)
- llvm_i32_ty, // done (FIXME: should be i1)
- llvm_i32_ty, // tgt
- llvm_i32_ty, // compr (FIXME: should be i1)
- llvm_float_ty, // src0
- llvm_float_ty, // src1
- llvm_float_ty, // src2
- llvm_float_ty], // src3
- []
- >;
-
def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
// Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
def int_SI_tbuffer_store : Intrinsic <
@@ -64,146 +48,4 @@ let TargetPrefix = "SI", isTarget = 1 in {
llvm_i32_ty], // tfe(imm)
[IntrReadMem, IntrArgMemOnly]>;
- def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
-
- // Fully-flexible SAMPLE instruction.
- class SampleRaw : Intrinsic <
- [llvm_v4f32_ty], // vdata(VGPR)
- [llvm_anyint_ty, // vaddr(VGPR)
- llvm_v8i32_ty, // rsrc(SGPR)
- llvm_v4i32_ty, // sampler(SGPR)
- llvm_i32_ty, // dmask(imm)
- llvm_i32_ty, // unorm(imm)
- llvm_i32_ty, // r128(imm)
- llvm_i32_ty, // da(imm)
- llvm_i32_ty, // glc(imm)
- llvm_i32_ty, // slc(imm)
- llvm_i32_ty, // tfe(imm)
- llvm_i32_ty], // lwe(imm)
- [IntrNoMem]>;
-
- // Image instruction without a sampler.
- class Image : Intrinsic <
- [llvm_v4f32_ty], // vdata(VGPR)
- [llvm_anyint_ty, // vaddr(VGPR)
- llvm_v8i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // dmask(imm)
- llvm_i32_ty, // unorm(imm)
- llvm_i32_ty, // r128(imm)
- llvm_i32_ty, // da(imm)
- llvm_i32_ty, // glc(imm)
- llvm_i32_ty, // slc(imm)
- llvm_i32_ty, // tfe(imm)
- llvm_i32_ty], // lwe(imm)
- [IntrNoMem]>;
-
- // Basic sample
- def int_SI_image_sample : SampleRaw;
- def int_SI_image_sample_cl : SampleRaw;
- def int_SI_image_sample_d : SampleRaw;
- def int_SI_image_sample_d_cl : SampleRaw;
- def int_SI_image_sample_l : SampleRaw;
- def int_SI_image_sample_b : SampleRaw;
- def int_SI_image_sample_b_cl : SampleRaw;
- def int_SI_image_sample_lz : SampleRaw;
- def int_SI_image_sample_cd : SampleRaw;
- def int_SI_image_sample_cd_cl : SampleRaw;
-
- // Sample with comparison
- def int_SI_image_sample_c : SampleRaw;
- def int_SI_image_sample_c_cl : SampleRaw;
- def int_SI_image_sample_c_d : SampleRaw;
- def int_SI_image_sample_c_d_cl : SampleRaw;
- def int_SI_image_sample_c_l : SampleRaw;
- def int_SI_image_sample_c_b : SampleRaw;
- def int_SI_image_sample_c_b_cl : SampleRaw;
- def int_SI_image_sample_c_lz : SampleRaw;
- def int_SI_image_sample_c_cd : SampleRaw;
- def int_SI_image_sample_c_cd_cl : SampleRaw;
-
- // Sample with offsets
- def int_SI_image_sample_o : SampleRaw;
- def int_SI_image_sample_cl_o : SampleRaw;
- def int_SI_image_sample_d_o : SampleRaw;
- def int_SI_image_sample_d_cl_o : SampleRaw;
- def int_SI_image_sample_l_o : SampleRaw;
- def int_SI_image_sample_b_o : SampleRaw;
- def int_SI_image_sample_b_cl_o : SampleRaw;
- def int_SI_image_sample_lz_o : SampleRaw;
- def int_SI_image_sample_cd_o : SampleRaw;
- def int_SI_image_sample_cd_cl_o : SampleRaw;
-
- // Sample with comparison and offsets
- def int_SI_image_sample_c_o : SampleRaw;
- def int_SI_image_sample_c_cl_o : SampleRaw;
- def int_SI_image_sample_c_d_o : SampleRaw;
- def int_SI_image_sample_c_d_cl_o : SampleRaw;
- def int_SI_image_sample_c_l_o : SampleRaw;
- def int_SI_image_sample_c_b_o : SampleRaw;
- def int_SI_image_sample_c_b_cl_o : SampleRaw;
- def int_SI_image_sample_c_lz_o : SampleRaw;
- def int_SI_image_sample_c_cd_o : SampleRaw;
- def int_SI_image_sample_c_cd_cl_o : SampleRaw;
-
- // Basic gather4
- def int_SI_gather4 : SampleRaw;
- def int_SI_gather4_cl : SampleRaw;
- def int_SI_gather4_l : SampleRaw;
- def int_SI_gather4_b : SampleRaw;
- def int_SI_gather4_b_cl : SampleRaw;
- def int_SI_gather4_lz : SampleRaw;
-
- // Gather4 with comparison
- def int_SI_gather4_c : SampleRaw;
- def int_SI_gather4_c_cl : SampleRaw;
- def int_SI_gather4_c_l : SampleRaw;
- def int_SI_gather4_c_b : SampleRaw;
- def int_SI_gather4_c_b_cl : SampleRaw;
- def int_SI_gather4_c_lz : SampleRaw;
-
- // Gather4 with offsets
- def int_SI_gather4_o : SampleRaw;
- def int_SI_gather4_cl_o : SampleRaw;
- def int_SI_gather4_l_o : SampleRaw;
- def int_SI_gather4_b_o : SampleRaw;
- def int_SI_gather4_b_cl_o : SampleRaw;
- def int_SI_gather4_lz_o : SampleRaw;
-
- // Gather4 with comparison and offsets
- def int_SI_gather4_c_o : SampleRaw;
- def int_SI_gather4_c_cl_o : SampleRaw;
- def int_SI_gather4_c_l_o : SampleRaw;
- def int_SI_gather4_c_b_o : SampleRaw;
- def int_SI_gather4_c_b_cl_o : SampleRaw;
- def int_SI_gather4_c_lz_o : SampleRaw;
-
- def int_SI_getlod : SampleRaw;
-
- // Image instrinsics.
- def int_SI_image_load : Image;
- def int_SI_image_load_mip : Image;
- def int_SI_getresinfo : Image;
-
- /* Interpolation Intrinsics */
-
- def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
} // End TargetPrefix = "SI", isTarget = 1
-
-let TargetPrefix = "amdgcn", isTarget = 1 in {
- // Emit 2.5 ulp, no denormal division. Should only be inserted by
- // pass based on !fpmath metadata.
- def int_amdgcn_fdiv_fast : Intrinsic<
- [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
- >;
-
- /* Control flow Intrinsics */
-
- def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
- def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
- def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
- def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
- def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
- def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
- def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 99fe96c..c6ad61a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -39,15 +39,27 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
using namespace llvm;
@@ -56,41 +68,36 @@ using namespace llvm;
namespace {
class SILoadStoreOptimizer : public MachineFunctionPass {
+
+ typedef struct {
+ MachineBasicBlock::iterator I;
+ MachineBasicBlock::iterator Paired;
+ unsigned EltSize;
+ unsigned Offset0;
+ unsigned Offset1;
+ unsigned BaseOff;
+ bool UseST64;
+ SmallVector<MachineInstr*, 8> InstsToMove;
+ } CombineInfo;
+
private:
- const SIInstrInfo *TII;
- const SIRegisterInfo *TRI;
- MachineRegisterInfo *MRI;
- AliasAnalysis *AA;
-
- static bool offsetsCanBeCombined(unsigned Offset0,
- unsigned Offset1,
- unsigned EltSize);
-
- MachineBasicBlock::iterator findMatchingDSInst(
- MachineBasicBlock::iterator I,
- unsigned EltSize,
- SmallVectorImpl<MachineInstr*> &InstsToMove);
-
- MachineBasicBlock::iterator mergeRead2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove);
-
- MachineBasicBlock::iterator mergeWrite2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove);
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ AliasAnalysis *AA = nullptr;
+
+ static bool offsetsCanBeCombined(CombineInfo &CI);
+
+ bool findMatchingDSInst(CombineInfo &CI);
+
+ MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+
+ MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
public:
static char ID;
- SILoadStoreOptimizer()
- : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
- AA(nullptr) {}
-
- SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
+ SILoadStoreOptimizer() : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
@@ -108,7 +115,7 @@ public:
}
};
-} // End anonymous namespace.
+} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
@@ -120,8 +127,8 @@ char SILoadStoreOptimizer::ID = 0;
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
-FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
- return new SILoadStoreOptimizer(TM);
+FunctionPass *llvm::createSILoadStoreOptimizerPass() {
+ return new SILoadStoreOptimizer();
}
static void moveInstsAfter(MachineBasicBlock::iterator I,
@@ -141,11 +148,10 @@ static void addDefsToList(const MachineInstr &MI,
}
}
-static bool memAccessesCanBeReordered(
- MachineBasicBlock::iterator A,
- MachineBasicBlock::iterator B,
- const SIInstrInfo *TII,
- llvm::AliasAnalysis * AA) {
+static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
+ MachineBasicBlock::iterator B,
+ const SIInstrInfo *TII,
+ AliasAnalysis * AA) {
return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
// RAW or WAR - cannot reorder
// WAW - cannot reorder
@@ -179,7 +185,6 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
ArrayRef<MachineInstr*> InstsToMove,
const SIInstrInfo *TII,
AliasAnalysis *AA) {
-
assert(MemOp.mayLoadOrStore());
for (MachineInstr *InstToMove : InstsToMove) {
@@ -191,47 +196,68 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
return true;
}
-bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
- unsigned Offset1,
- unsigned Size) {
+bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
// XXX - Would the same offset be OK? Is there any reason this would happen or
// be useful?
- if (Offset0 == Offset1)
+ if (CI.Offset0 == CI.Offset1)
return false;
// This won't be valid if the offset isn't aligned.
- if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
+ if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
return false;
- unsigned EltOffset0 = Offset0 / Size;
- unsigned EltOffset1 = Offset1 / Size;
+ unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
+ unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
+ CI.UseST64 = false;
+ CI.BaseOff = 0;
+
+ // If the offset in elements doesn't fit in 8-bits, we might be able to use
+ // the stride 64 versions.
+ if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
+ isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
+ CI.Offset0 = EltOffset0 / 64;
+ CI.Offset1 = EltOffset1 / 64;
+ CI.UseST64 = true;
+ return true;
+ }
// Check if the new offsets fit in the reduced 8-bit range.
- if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
+ if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
+ CI.Offset0 = EltOffset0;
+ CI.Offset1 = EltOffset1;
return true;
+ }
- // If the offset in elements doesn't fit in 8-bits, we might be able to use
- // the stride 64 versions.
- if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
- return false;
+ // Try to shift base address to decrease offsets.
+ unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
+ CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
- return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
+ if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
+ CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+ CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ CI.UseST64 = true;
+ return true;
+ }
+
+ if (isUInt<8>(OffsetDiff)) {
+ CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
+ CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
+ return true;
+ }
+
+ return false;
}
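
The new base-shift fallback handles pairs whose absolute offsets are large but
close together. For EltSize = 4 and byte offsets 4096 and 4100: the element
offsets 1024 and 1025 overflow the 8-bit field, but their difference is 1, so
BaseOff = 4096 is folded into a v_add_i32 on the address and the pair becomes a
ds_read2 with offsets 0 and 1. A condensed restatement of the non-ST64 paths
(assumed sketch, not the LLVM code):

    #include <algorithm>

    bool canCombine(unsigned Off0, unsigned Off1, unsigned EltSize,
                    unsigned &BaseOff) {
      if (Off0 == Off1 || Off0 % EltSize || Off1 % EltSize)
        return false;
      unsigned E0 = Off0 / EltSize, E1 = Off1 / EltSize;
      if (E0 < 256 && E1 < 256) { BaseOff = 0; return true; }  // direct 8-bit
      BaseOff = std::min(Off0, Off1);       // materialized with a v_add_i32
      unsigned Diff = E0 < E1 ? E1 - E0 : E0 - E1;
      return Diff < 256;                    // shifted offsets fit in 8 bits
    }
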
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize,
- SmallVectorImpl<MachineInstr*> &InstsToMove) {
- MachineBasicBlock::iterator E = I->getParent()->end();
- MachineBasicBlock::iterator MBBI = I;
+bool SILoadStoreOptimizer::findMatchingDSInst(CombineInfo &CI) {
+ MachineBasicBlock::iterator E = CI.I->getParent()->end();
+ MachineBasicBlock::iterator MBBI = CI.I;
++MBBI;
SmallVector<const MachineOperand *, 8> DefsToMove;
- addDefsToList(*I, DefsToMove);
+ addDefsToList(*CI.I, DefsToMove);
for ( ; MBBI != E; ++MBBI) {
-
- if (MBBI->getOpcode() != I->getOpcode()) {
+ if (MBBI->getOpcode() != CI.I->getOpcode()) {
// This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
@@ -242,14 +268,14 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
if (MBBI->hasUnmodeledSideEffects())
// We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
- return E;
+ return false;
if (MBBI->mayLoadOrStore() &&
- !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
+ !memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA)) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
- InstsToMove.push_back(&*MBBI);
+ CI.InstsToMove.push_back(&*MBBI);
addDefsToList(*MBBI, DefsToMove);
continue;
}
@@ -257,13 +283,13 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
// When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, and any uses of I will need to
// be moved down as well.
- addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
+ addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove);
continue;
}
// Don't merge volatiles.
if (MBBI->hasOrderedMemoryRef())
- return E;
+ return false;
// Handle a case like
// DS_WRITE_B32 addr, v, idx0
@@ -271,77 +297,67 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
// DS_WRITE_B32 addr, f(w), idx1
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
// merging of the two writes.
- if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
+ if (addToListsIfDependent(*MBBI, DefsToMove, CI.InstsToMove))
continue;
- int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ int AddrIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
+ AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = CI.I->getOperand(AddrIdx);
const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
// Check same base pointer. Be careful of subregisters, which can occur with
// vectors of pointers.
if (AddrReg0.getReg() == AddrReg1.getReg() &&
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
AMDGPU::OpName::offset);
- unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
- unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+ CI.Paired = MBBI;
// Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
- canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
- return MBBI;
+ if (offsetsCanBeCombined(CI))
+ if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+ return true;
}
// We've found a load/store that we couldn't merge for some reason.
// We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
// down past this instruction.
- if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) || // check if we can move I across MBBI
- !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
- )
+ // check if we can move I across MBBI and if we can move all I's users
+ if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
+ !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
break;
}
- return E;
+ return false;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove) {
- MachineBasicBlock *MBB = I->getParent();
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
// cases, like vectors of pointers.
- const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
-
- const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
- const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
-
- unsigned Offset0
- = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
- unsigned Offset1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
-
- unsigned NewOffset0 = Offset0 / EltSize;
- unsigned NewOffset1 = Offset1 / EltSize;
- unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
-
- // Prefer the st64 form if we can use it, even if we can fit the offset in the
- // non st64 version. I'm not sure if there's any real reason to do this.
- bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
- if (UseST64) {
- NewOffset0 /= 64;
- NewOffset1 /= 64;
- Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
- }
+ const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
+
+ unsigned NewOffset0 = CI.Offset0;
+ unsigned NewOffset1 = CI.Offset1;
+ unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2_B32
+ : AMDGPU::DS_READ2_B64;
+
+ if (CI.UseST64)
+ Opc = (CI.EltSize == 4) ? AMDGPU::DS_READ2ST64_B32
+ : AMDGPU::DS_READ2ST64_B64;
- unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
- unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
+ unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
+ unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -356,72 +372,70 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
const MCInstrDesc &Read2Desc = TII->get(Opc);
const TargetRegisterClass *SuperRC
- = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
unsigned DestReg = MRI->createVirtualRegister(SuperRC);
- DebugLoc DL = I->getDebugLoc();
- MachineInstrBuilder Read2
- = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
- .addOperand(*AddrReg) // addr
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .addMemOperand(*I->memoperands_begin())
- .addMemOperand(*Paired->memoperands_begin());
+ DebugLoc DL = CI.I->getDebugLoc();
+
+ unsigned BaseReg = AddrReg->getReg();
+ unsigned BaseRegFlags = 0;
+ if (CI.BaseOff) {
+ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BaseRegFlags = RegState::Kill;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(AddrReg->getReg());
+ }
+
+ MachineInstrBuilder Read2 =
+ BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
+ .addReg(BaseReg, BaseRegFlags) // addr
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
+
(void)Read2;
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- BuildMI(*MBB, Paired, DL, CopyDesc)
- .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
- .addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
- .addOperand(*Dest1)
- .addReg(DestReg, RegState::Kill, SubRegIdx1);
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, InstsToMove);
+ moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(I);
- I->eraseFromParent();
- Paired->eraseFromParent();
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator Paired,
- unsigned EltSize,
- ArrayRef<MachineInstr*> InstsToMove) {
- MachineBasicBlock *MBB = I->getParent();
+ CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
// sure we preserve the subregister index and any register flags set on them.
- const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
- const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
+ const MachineOperand *Addr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
+ const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
const MachineOperand *Data1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
+ = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
+ unsigned NewOffset0 = CI.Offset0;
+ unsigned NewOffset1 = CI.Offset1;
+ unsigned Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2_B32
+ : AMDGPU::DS_WRITE2_B64;
- unsigned Offset0
- = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
- unsigned Offset1
- = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
-
- unsigned NewOffset0 = Offset0 / EltSize;
- unsigned NewOffset1 = Offset1 / EltSize;
- unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
-
- // Prefer the st64 form if we can use it, even if we can fit the offset in the
- // non st64 version. I'm not sure if there's any real reason to do this.
- bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
- if (UseST64) {
- NewOffset0 /= 64;
- NewOffset1 /= 64;
- Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
- }
+ if (CI.UseST64)
+ Opc = (CI.EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
+ : AMDGPU::DS_WRITE2ST64_B64;
if (NewOffset0 > NewOffset1) {
// Canonicalize the merged instruction so the smaller offset comes first.
@@ -434,24 +448,33 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
"Computed offset doesn't fit");
const MCInstrDesc &Write2Desc = TII->get(Opc);
- DebugLoc DL = I->getDebugLoc();
+ DebugLoc DL = CI.I->getDebugLoc();
+
+ unsigned BaseReg = Addr->getReg();
+ unsigned BaseRegFlags = 0;
+ if (CI.BaseOff) {
+ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BaseRegFlags = RegState::Kill;
+ BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::V_ADD_I32_e32), BaseReg)
+ .addImm(CI.BaseOff)
+ .addReg(Addr->getReg());
+ }
- MachineInstrBuilder Write2
- = BuildMI(*MBB, Paired, DL, Write2Desc)
- .addOperand(*Addr) // addr
- .addOperand(*Data0) // data0
- .addOperand(*Data1) // data1
- .addImm(NewOffset0) // offset0
- .addImm(NewOffset1) // offset1
- .addImm(0) // gds
- .addMemOperand(*I->memoperands_begin())
- .addMemOperand(*Paired->memoperands_begin());
+ MachineInstrBuilder Write2 =
+ BuildMI(*MBB, CI.Paired, DL, Write2Desc)
+ .addReg(BaseReg, BaseRegFlags) // addr
+ .add(*Data0) // data0
+ .add(*Data1) // data1
+ .addImm(NewOffset0) // offset0
+ .addImm(NewOffset1) // offset1
+ .addImm(0) // gds
+ .setMemRefs(CI.I->mergeMemRefsWith(*CI.Paired));
- moveInstsAfter(Write2, InstsToMove);
+ moveInstsAfter(Write2, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(I);
- I->eraseFromParent();
- Paired->eraseFromParent();
+ MachineBasicBlock::iterator Next = std::next(CI.I);
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Next;
@@ -472,27 +495,24 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
- SmallVector<MachineInstr*, 8> InstsToMove;
+ CombineInfo CI;
+ CI.I = I;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
- unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
- InstsToMove);
- if (Match != E) {
+ CI.EltSize = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
+ if (findMatchingDSInst(CI)) {
Modified = true;
- I = mergeRead2Pair(I, Match, Size, InstsToMove);
+ I = mergeRead2Pair(CI);
} else {
++I;
}
continue;
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
- unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
- InstsToMove);
- if (Match != E) {
+ CI.EltSize = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
+ if (findMatchingDSInst(CI)) {
Modified = true;
- I = mergeWrite2Pair(I, Match, Size, InstsToMove);
+ I = mergeWrite2Pair(CI);
} else {
++I;
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 7ed18f2..5f1c7f1 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -51,13 +51,23 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <iterator>
using namespace llvm;
@@ -67,10 +77,10 @@ namespace {
class SILowerControlFlow : public MachineFunctionPass {
private:
- const SIRegisterInfo *TRI;
- const SIInstrInfo *TII;
- LiveIntervals *LIS;
- MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ LiveIntervals *LIS = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
void emitIf(MachineInstr &MI);
void emitElse(MachineInstr &MI);
@@ -88,12 +98,7 @@ private:
public:
static char ID;
- SILowerControlFlow() :
- MachineFunctionPass(ID),
- TRI(nullptr),
- TII(nullptr),
- LIS(nullptr),
- MRI(nullptr) {}
+ SILowerControlFlow() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -113,7 +118,7 @@ public:
}
};
-} // End anonymous namespace
+} // end anonymous namespace
char SILowerControlFlow::ID = 0;
@@ -175,9 +180,8 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// Insert a pseudo terminator to help keep the verifier happy. This will also
// be used later when inserting skips.
- MachineInstr *NewBr =
- BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2));
+ MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .add(MI.getOperand(2));
if (!LIS) {
MI.eraseFromParent();
@@ -220,8 +224,9 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
// tied. In order to correctly tie the registers, split this into a copy of
// the src like it does.
unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
- BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
- .addOperand(MI.getOperand(1)); // Saved EXEC
+ MachineInstr *CopyExec =
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg)
+ .add(MI.getOperand(1)); // Saved EXEC
// This must be inserted before phis and any spill code inserted before the
// else.
@@ -262,6 +267,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
+ LIS->InsertMachineInstrInMaps(*CopyExec);
LIS->InsertMachineInstrInMaps(*OrSaveExec);
LIS->InsertMachineInstrInMaps(*Xor);
@@ -283,10 +289,9 @@ void SILowerControlFlow::emitBreak(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- MachineInstr *Or =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addOperand(MI.getOperand(1));
+ MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(1));
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *Or);
@@ -306,13 +311,13 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *AndN2 =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addOperand(MI.getOperand(0));
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64_term), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(0));
MachineInstr *Branch =
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addOperand(MI.getOperand(1));
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .add(MI.getOperand(1));
if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
@@ -328,9 +333,9 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock::iterator InsPt = MBB.begin();
MachineInstr *NewMI =
- BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addOperand(MI.getOperand(0));
+ BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .add(MI.getOperand(0));
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
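
The mechanical change running through this file is the MachineInstrBuilder rename from addOperand(MO) to add(MO). A minimal sketch of the new fluent spelling, assuming the usual in-pass variables (MBB, MI, DL, TII, Dst) are in scope; this mirrors the builder calls above and is illustration, not standalone code:

#include "llvm/CodeGen/MachineInstrBuilder.h"

// Old spelling, removed by this patch:
//   BuildMI(...).addOperand(MI.getOperand(1));
// New spelling used throughout:
MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
                       .addReg(AMDGPU::EXEC)    // explicit physical register
                       .add(MI.getOperand(1));  // forward an existing operand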
diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index be2e14f..ba616ad 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -21,8 +21,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
@@ -114,18 +114,18 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
assert(Val == 0 || Val == -1);
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))
- .addOperand(Dst)
- .addImm(Val);
+ .add(Dst)
+ .addImm(Val);
MI.eraseFromParent();
continue;
}
}
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
- .addOperand(Dst)
- .addImm(0)
- .addImm(-1)
- .addOperand(Src);
+ .add(Dst)
+ .addImm(0)
+ .addImm(-1)
+ .add(Src);
MI.eraseFromParent();
} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
SrcRC == &AMDGPU::VReg_1RegClass) {
@@ -140,14 +140,14 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
MRI.getRegClass(DefInst->getOperand(3).getReg()),
&AMDGPU::SGPR_64RegClass)) {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
- .addOperand(Dst)
- .addReg(AMDGPU::EXEC)
- .addOperand(DefInst->getOperand(3));
+ .add(Dst)
+ .addReg(AMDGPU::EXEC)
+ .add(DefInst->getOperand(3));
} else {
BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
- .addOperand(Dst)
- .addOperand(Src)
- .addImm(0);
+ .add(Dst)
+ .add(Src)
+ .addImm(0);
}
MI.eraseFromParent();
}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ecd46b9..a7c8166 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -20,17 +20,13 @@
using namespace llvm;
-static cl::opt<bool> EnableSpillSGPRToVGPR(
- "amdgpu-spill-sgpr-to-vgpr",
- cl::desc("Enable spilling VGPRs to SGPRs"),
- cl::ReallyHidden,
- cl::init(true));
-
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
- ScratchRSrcReg(AMDGPU::NoRegister),
- ScratchWaveOffsetReg(AMDGPU::NoRegister),
+ ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG),
+ ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG),
+ FrameOffsetReg(AMDGPU::FP_REG),
+ StackPtrOffsetReg(AMDGPU::SP_REG),
PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
DispatchPtrUserSGPR(AMDGPU::NoRegister),
QueuePtrUserSGPR(AMDGPU::NoRegister),
@@ -46,14 +42,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
+ WorkItemIDXVGPR(AMDGPU::NoRegister),
+ WorkItemIDYVGPR(AMDGPU::NoRegister),
+ WorkItemIDZVGPR(AMDGPU::NoRegister),
PSInputAddr(0),
+ PSInputEnable(0),
ReturnsVoid(true),
FlatWorkGroupSizes(0, 0),
WavesPerEU(0, 0),
DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
LDSWaveSpillSize(0),
- PSInputEna(0),
NumUserSGPRs(0),
NumSystemSGPRs(0),
HasSpilledSGPRs(false),
@@ -78,44 +77,83 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDX(false),
WorkItemIDY(false),
WorkItemIDZ(false),
- PrivateMemoryInputPtr(false) {
+ ImplicitBufferPtr(false) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
const Function *F = MF.getFunction();
+ FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
+ WavesPerEU = ST.getWavesPerEU(*F);
- PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+ if (!isEntryFunction()) {
+ // Non-entry functions have no special inputs for now, other than the
+ // registers required for scratch access.
+ ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+ ScratchWaveOffsetReg = AMDGPU::SGPR4;
+ FrameOffsetReg = AMDGPU::SGPR5;
+ StackPtrOffsetReg = AMDGPU::SGPR32;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ // FIXME: Not really a system SGPR.
+ PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg;
+ }
- if (!AMDGPU::isShader(F->getCallingConv())) {
- KernargSegmentPtr = true;
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+ KernargSegmentPtr = !F->arg_empty();
WorkGroupIDX = true;
WorkItemIDX = true;
+ } else if (CC == CallingConv::AMDGPU_PS) {
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
}
- if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
+ if (ST.debuggerEmitPrologue()) {
+ // Enable everything.
+ WorkGroupIDX = true;
WorkGroupIDY = true;
-
- if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
WorkGroupIDZ = true;
-
- if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
+ WorkItemIDX = true;
WorkItemIDY = true;
-
- if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
WorkItemIDZ = true;
+ } else {
+ if (F->hasFnAttribute("amdgpu-work-group-id-x"))
+ WorkGroupIDX = true;
- // X, XY, and XYZ are the only supported combinations, so make sure Y is
- // enabled if Z is.
- if (WorkItemIDZ)
- WorkItemIDY = true;
+ if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
+
+ if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-x"))
+ WorkItemIDX = true;
+
+ if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
+ if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
+ }
+
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool MaySpill = ST.isVGPRSpillingEnabled(*F);
bool HasStackObjects = FrameInfo.hasStackObjects();
- if (HasStackObjects || MaySpill)
- PrivateSegmentWaveByteOffset = true;
+ if (isEntryFunction()) {
+ // X, XY, and XYZ are the only supported combinations, so make sure Y is
+ // enabled if Z is.
+ if (WorkItemIDZ)
+ WorkItemIDY = true;
+
+ if (HasStackObjects || MaySpill) {
+ PrivateSegmentWaveByteOffset = true;
- if (ST.isAmdCodeObjectV2(MF)) {
+ // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
+ (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
+ PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5;
+ }
+ }
+
+ bool IsCOV2 = ST.isAmdCodeObjectV2(MF);
+ if (IsCOV2) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
@@ -129,18 +167,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
DispatchID = true;
} else if (ST.isMesaGfxShader(MF)) {
if (HasStackObjects || MaySpill)
- PrivateMemoryInputPtr = true;
+ ImplicitBufferPtr = true;
}
- // We don't need to worry about accessing spills with flat instructions.
- // TODO: On VI where we must use flat for global, we should be able to omit
- // this if it is never used for generic access.
- if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
- ST.isAmdHsaOS())
- FlatScratchInit = true;
+ if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+ KernargSegmentPtr = true;
- FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
- WavesPerEU = ST.getWavesPerEU(*F);
+ if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
+ // TODO: This could be refined a lot. The attribute is a poor way of
+ // detecting calls that may require it before argument lowering.
+ if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch"))
+ FlatScratchInit = true;
+ }
}
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -186,52 +224,67 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
return FlatScratchInitUserSGPR;
}
-unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
- PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
+ ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
NumUserSGPRs += 2;
- return PrivateMemoryPtrUserSGPR;
+ return ImplicitBufferPtrUserSGPR;
}
-SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
- MachineFunction *MF,
- unsigned FrameIndex,
- unsigned SubIdx) {
- if (!EnableSpillSGPRToVGPR)
- return SpilledReg();
-
- const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
- Offset += SubIdx * 4;
+/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
+ int FI) {
+ std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
- unsigned LaneVGPRIdx = Offset / (64 * 4);
- unsigned Lane = (Offset / 4) % 64;
+ // This has already been allocated.
+ if (!SpillLanes.empty())
+ return true;
- struct SpilledReg Spill;
- Spill.Lane = Lane;
-
- if (!LaneVGPRs.count(LaneVGPRIdx)) {
- unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
- *MF);
-
- if (LaneVGPR == AMDGPU::NoRegister)
- // We have no VGPRs left for spilling SGPRs.
- return Spill;
-
- LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
-
- // Add this register as live-in to all blocks to avoid machine verifer
- // complaining about use of an undefined physical register.
- for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
- BI != BE; ++BI) {
- BI->addLiveIn(LaneVGPR);
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned WaveSize = ST.getWavefrontSize();
+
+ unsigned Size = FrameInfo.getObjectSize(FI);
+ assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
+ assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+
+ int NumLanes = Size / 4;
+
+ // Make sure to handle the case where a wide SGPR spill may span between two
+ // VGPRs.
+ for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+ unsigned LaneVGPR;
+ unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
+
+ if (VGPRIndex == 0) {
+ LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+ if (LaneVGPR == AMDGPU::NoRegister) {
+ // We have no VGPRs left for spilling SGPRs. Reset because we won't
+ // partially spill the SGPR to VGPRs.
+ SGPRToVGPRSpills.erase(FI);
+ NumVGPRSpillLanes -= I;
+ return false;
+ }
+
+ SpillVGPRs.push_back(LaneVGPR);
+
+ // Add this register as live-in to all blocks to avoid machine verifier
+ // complaining about use of an undefined physical register.
+ for (MachineBasicBlock &BB : MF)
+ BB.addLiveIn(LaneVGPR);
+ } else {
+ LaneVGPR = SpillVGPRs.back();
}
+
+ SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
}
- Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
- return Spill;
+ return true;
+}
+
+void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
+ for (auto &R : SGPRToVGPRSpills)
+ MFI.RemoveStackObject(R.first);
}
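
The lane bookkeeping in allocateSGPRSpillToVGPR above is easy to check in isolation. A standalone toy (all names local to this sketch; the real wavefront size is 64, 4 here so a wide spill visibly spans two VGPRs):

// Toy of the spill-lane arithmetic: consecutive 4-byte SGPR spill slots
// fill successive lanes of a VGPR, and a fresh VGPR is taken whenever
// lane 0 comes around again.
#include <cstdio>

int main() {
  const unsigned WaveSize = 4; // stand-in for ST.getWavefrontSize()
  unsigned NumVGPRSpillLanes = 0, NumVGPRs = 0;

  auto spill = [&](unsigned SizeBytes) {
    for (unsigned I = 0; I < SizeBytes / 4; ++I, ++NumVGPRSpillLanes) {
      unsigned Lane = NumVGPRSpillLanes % WaveSize;
      if (Lane == 0)
        ++NumVGPRs; // first lane of a newly allocated VGPR
      std::printf("dword %u -> VGPR %u, lane %u\n", I, NumVGPRs - 1, Lane);
    }
  };

  spill(8);  // an SGPR pair: lanes 0-1 of VGPR 0
  spill(16); // four dwords: lanes 2-3 of VGPR 0, then lanes 0-1 of VGPR 1
  return 0;
}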
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 6fc8d18..4c7f38a 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -15,14 +15,18 @@
#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
#include "AMDGPUMachineFunction.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include <array>
+#include <cassert>
#include <map>
+#include <utility>
namespace llvm {
-class MachineRegisterInfo;
-
class AMDGPUImagePseudoSourceValue : public PseudoSourceValue {
public:
explicit AMDGPUImagePseudoSourceValue() :
@@ -84,8 +88,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned ScratchRSrcReg;
unsigned ScratchWaveOffsetReg;
+ // The current function's frame offset from the kernel's scratch wave
+ // offset register. For an entry function, this is exactly the same as
+ // the ScratchWaveOffsetReg.
+ unsigned FrameOffsetReg;
+
+ // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
+ unsigned StackPtrOffsetReg;
+
// Input registers for non-HSA ABI
- unsigned PrivateMemoryPtrUserSGPR;
+ unsigned ImplicitBufferPtrUserSGPR;
// Input registers setup for the HSA ABI.
// User SGPRs in allocation order.
@@ -107,8 +119,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
unsigned WorkGroupInfoSystemSGPR;
unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
+ // VGPR inputs. These are always v0, v1 and v2 for entry functions.
+ unsigned WorkItemIDXVGPR;
+ unsigned WorkItemIDYVGPR;
+ unsigned WorkItemIDZVGPR;
+
// Graphics info.
unsigned PSInputAddr;
+ unsigned PSInputEnable;
+
bool ReturnsVoid;
// A pair of default/requested minimum/maximum flat work group sizes.
@@ -127,16 +146,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
AMDGPUBufferPseudoSourceValue BufferPSV;
AMDGPUImagePseudoSourceValue ImagePSV;
-public:
- // FIXME: Make private
+private:
unsigned LDSWaveSpillSize;
- unsigned PSInputEna;
- std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;
unsigned NumUserSGPRs;
unsigned NumSystemSGPRs;
-private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
bool HasNonSpillStackObjects;
@@ -169,7 +184,7 @@ private:
// Private memory buffer
// Compute directly in sgpr[0:1]
// Other shaders indirect 64-bits at sgpr[0:1]
- bool PrivateMemoryInputPtr : 1;
+ bool ImplicitBufferPtr : 1;
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
@@ -182,19 +197,39 @@ private:
public:
struct SpilledReg {
- unsigned VGPR;
- int Lane;
+ unsigned VGPR = AMDGPU::NoRegister;
+ int Lane = -1;
+
+ SpilledReg() = default;
SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { }
- SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { }
+
bool hasLane() { return Lane != -1;}
bool hasReg() { return VGPR != AMDGPU::NoRegister;}
};
- // SIMachineFunctionInfo definition
+private:
+ // SGPR->VGPR spilling support.
+ typedef std::pair<unsigned, unsigned> SpillRegMask;
+
+ // Track VGPR + wave index for each subregister of the SGPR spilled to
+ // frameindex key.
+ DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
+ unsigned NumVGPRSpillLanes = 0;
+ SmallVector<unsigned, 2> SpillVGPRs;
+
+public:
SIMachineFunctionInfo(const MachineFunction &MF);
- SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
- unsigned SubIdx);
+
+ ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
+ auto I = SGPRToVGPRSpills.find(FrameIndex);
+ return (I == SGPRToVGPRSpills.end()) ?
+ ArrayRef<SpilledReg>() : makeArrayRef(I->second);
+ }
+
+ bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
+ void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
+
bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
@@ -206,7 +241,7 @@ public:
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
unsigned addDispatchID(const SIRegisterInfo &TRI);
unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
- unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
+ unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
unsigned addWorkGroupIDX() {
@@ -311,8 +346,8 @@ public:
return WorkItemIDZ;
}
- bool hasPrivateMemoryInputPtr() const {
- return PrivateMemoryInputPtr;
+ bool hasImplicitBufferPtr() const {
+ return ImplicitBufferPtr;
}
unsigned getNumUserSGPRs() const {
@@ -342,17 +377,35 @@ public:
return ScratchWaveOffsetReg;
}
+ unsigned getFrameOffsetReg() const {
+ return FrameOffsetReg;
+ }
+
+ void setStackPtrOffsetReg(unsigned Reg) {
+ StackPtrOffsetReg = Reg;
+ }
+
+ // Note the unset value for this is AMDGPU::SP_REG rather than
+ // NoRegister. This is mostly a workaround for MIR tests where state that
+ // can't be directly computed from the function is not preserved in serialized
+ // MIR.
+ unsigned getStackPtrOffsetReg() const {
+ return StackPtrOffsetReg;
+ }
+
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != AMDGPU::NoRegister && "Should never be unset");
ScratchWaveOffsetReg = Reg;
+ if (isEntryFunction())
+ FrameOffsetReg = ScratchWaveOffsetReg;
}
unsigned getQueuePtrUserSGPR() const {
return QueuePtrUserSGPR;
}
- unsigned getPrivateMemoryPtrUserSGPR() const {
- return PrivateMemoryPtrUserSGPR;
+ unsigned getImplicitBufferPtrUserSGPR() const {
+ return ImplicitBufferPtrUserSGPR;
}
bool hasSpilledSGPRs() const {
@@ -399,6 +452,10 @@ public:
return PSInputAddr;
}
+ unsigned getPSInputEnable() const {
+ return PSInputEnable;
+ }
+
bool isPSInputAllocated(unsigned Index) const {
return PSInputAddr & (1 << Index);
}
@@ -407,6 +464,10 @@ public:
PSInputAddr |= 1 << Index;
}
+ void markPSInputEnabled(unsigned Index) {
+ PSInputEnable |= 1 << Index;
+ }
+
bool returnsVoid() const {
return ReturnsVoid;
}
@@ -503,6 +564,10 @@ public:
llvm_unreachable("unexpected dimension");
}
+ unsigned getLDSWaveSpillSize() const {
+ return LDSWaveSpillSize;
+ }
+
const AMDGPUBufferPseudoSourceValue *getBufferPSV() const {
return &BufferPSV;
}
@@ -512,6 +577,6 @@ public:
}
};
-} // End namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H
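
The getSGPRToVGPRSpills accessor above returns an empty range for a frame index that was never allocated. A hedged standalone rendering of that lookup-or-empty idiom, with std::map and a const reference standing in for DenseMap and ArrayRef:

#include <cstdio>
#include <map>
#include <vector>

struct SpilledReg {
  unsigned VGPR = 0; // stand-in for the AMDGPU::NoRegister default
  int Lane = -1;
};

const std::vector<SpilledReg> &
getSpills(const std::map<int, std::vector<SpilledReg>> &Spills, int FI) {
  static const std::vector<SpilledReg> Empty;
  auto I = Spills.find(FI);
  return I == Spills.end() ? Empty : I->second;
}

int main() {
  std::map<int, std::vector<SpilledReg>> Spills;
  Spills[3] = {{7, 0}, {7, 1}}; // frame index 3: VGPR 7, lanes 0-1
  std::printf("FI 3: %zu lanes, FI 9: %zu lanes\n",
              getSpills(Spills, 3).size(), getSpills(Spills, 9).size());
  return 0;
}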
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index da86bbf..34886c4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -12,9 +12,9 @@
//
//===----------------------------------------------------------------------===//
+#include "SIMachineScheduler.h"
#include "AMDGPU.h"
#include "SIInstrInfo.h"
-#include "SIMachineScheduler.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -38,7 +38,7 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
// This scheduler implements a different scheduling algorithm than
// GenericScheduler.
@@ -539,21 +539,30 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
Preds.push_back(Pred);
assert(none_of(Succs,
- [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+ [=](std::pair<SIScheduleBlock*,
+ SIScheduleBlockLinkKind> S) {
+ return PredID == S.first->getID();
+ }) &&
"Loop in the Block Graph!");
}
-void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
+ SIScheduleBlockLinkKind Kind) {
unsigned SuccID = Succ->getID();
// Check if not already predecessor.
- for (SIScheduleBlock* S : Succs) {
- if (SuccID == S->getID())
+ for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
+ if (SuccID == S.first->getID()) {
+ if (S.second == SIScheduleBlockLinkKind::NoData &&
+ Kind == SIScheduleBlockLinkKind::Data)
+ S.second = Kind;
return;
+ }
}
if (Succ->isHighLatencyBlock())
++NumHighLatencySuccessors;
- Succs.push_back(Succ);
+ Succs.push_back(std::make_pair(Succ, Kind));
+
assert(none_of(Preds,
[=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
"Loop in the Block Graph!");
@@ -573,8 +582,10 @@ void SIScheduleBlock::printDebug(bool full) {
}
dbgs() << "\nSuccessors:\n";
- for (SIScheduleBlock* S : Succs) {
- S->printDebug(false);
+ for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
+ if (S.second == SIScheduleBlockLinkKind::Data)
+ dbgs() << "(Data Dep) ";
+ S.first->printDebug(false);
}
if (Scheduled) {
@@ -651,11 +662,21 @@ void SIScheduleBlockCreator::colorHighLatenciesAlone() {
}
}
+static bool
+hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
+ for (const auto &PredDep : SU.Preds) {
+ if (PredDep.getSUnit() == &FromSU &&
+ PredDep.getKind() == llvm::SDep::Data)
+ return true;
+ }
+ return false;
+}
+
void SIScheduleBlockCreator::colorHighLatenciesGroups() {
unsigned DAGSize = DAG->SUnits.size();
unsigned NumHighLatencies = 0;
unsigned GroupSize;
- unsigned Color = NextReservedID;
+ int Color = NextReservedID;
unsigned Count = 0;
std::set<unsigned> FormingGroup;
@@ -675,35 +696,102 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() {
else
GroupSize = 4;
- for (unsigned i = 0, e = DAGSize; i != e; ++i) {
- SUnit *SU = &DAG->SUnits[i];
- if (DAG->IsHighLatencySU[SU->NodeNum]) {
+ for (unsigned SUNum : DAG->TopDownIndex2SU) {
+ const SUnit &SU = DAG->SUnits[SUNum];
+ if (DAG->IsHighLatencySU[SU.NodeNum]) {
unsigned CompatibleGroup = true;
- unsigned ProposedColor = Color;
+ int ProposedColor = Color;
+ std::vector<int> AdditionalElements;
+
+ // We don't want to put two high latency instructions that depend on
+ // each other in the same block. One way would be to check canAddEdge
+ // in both directions, but that is currently not enough, because the
+ // high latency order is enforced (via links). Instead, look at the
+ // dependencies between the high latency instructions and deduce
+ // whether it is a data dependency or not.
for (unsigned j : FormingGroup) {
- // TODO: Currently CompatibleGroup will always be false,
- // because the graph enforces the load order. This
- // can be fixed, but as keeping the load order is often
- // good for performance that causes a performance hit (both
- // the default scheduler and this scheduler).
- // When this scheduler determines a good load order,
- // this can be fixed.
- if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
- !DAG->canAddEdge(&DAG->SUnits[j], SU))
+ bool HasSubGraph;
+ std::vector<int> SubGraph;
+ // By construction (topological order), if SU and
+ // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
+ // in the parent graph of SU.
+#ifndef NDEBUG
+ SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
+ HasSubGraph);
+ assert(!HasSubGraph);
+#endif
+ SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
+ HasSubGraph);
+ if (!HasSubGraph)
+ continue; // No dependencies between each other
+ else if (SubGraph.size() > 5) {
+ // Too many elements would be required to be added to the block.
CompatibleGroup = false;
+ break;
+ }
+ else {
+ // Check the type of dependency
+ for (unsigned k : SubGraph) {
+ // If in the path to join the two instructions,
+ // there is another high latency instruction,
+ // or instructions colored for another block
+ // abort the merge.
+ if (DAG->IsHighLatencySU[k] ||
+ (CurrentColoring[k] != ProposedColor &&
+ CurrentColoring[k] != 0)) {
+ CompatibleGroup = false;
+ break;
+ }
+ // If one of the SU in the subgraph depends on the result of SU j,
+ // there'll be a data dependency.
+ if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
+ CompatibleGroup = false;
+ break;
+ }
+ }
+ if (!CompatibleGroup)
+ break;
+ // Same check for the SU
+ if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
+ CompatibleGroup = false;
+ break;
+ }
+ // Add all the required instructions to the block
+ // These cannot live in another block (because they
+ // depend (order dependency) on one of the
+ // instruction in the block, and are required for the
+ // high latency instruction we add.
+ AdditionalElements.insert(AdditionalElements.end(),
+ SubGraph.begin(), SubGraph.end());
+ }
+ }
+ if (CompatibleGroup) {
+ FormingGroup.insert(SU.NodeNum);
+ for (unsigned j : AdditionalElements)
+ CurrentColoring[j] = ProposedColor;
+ CurrentColoring[SU.NodeNum] = ProposedColor;
+ ++Count;
}
- if (!CompatibleGroup || ++Count == GroupSize) {
+ // Found one incompatible instruction,
+ // or has filled a big enough group.
+ // -> start a new one.
+ if (!CompatibleGroup) {
FormingGroup.clear();
Color = ++NextReservedID;
- if (!CompatibleGroup) {
- ProposedColor = Color;
- FormingGroup.insert(SU->NodeNum);
- }
+ ProposedColor = Color;
+ FormingGroup.insert(SU.NodeNum);
+ CurrentColoring[SU.NodeNum] = ProposedColor;
+ Count = 0;
+ } else if (Count == GroupSize) {
+ FormingGroup.clear();
+ Color = ++NextReservedID;
+ ProposedColor = Color;
Count = 0;
- } else {
- FormingGroup.insert(SU->NodeNum);
}
- CurrentColoring[SU->NodeNum] = ProposedColor;
}
}
}
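
A rough standalone rendering of the regrouping control flow above, with the subgraph compatibility test reduced to a precomputed flag (all names local to this toy): an incompatible instruction opens a new color and seeds the next group, while a full group closes and the next instruction starts fresh.

// Toy of colorHighLatenciesGroups' bookkeeping: pack compatible high
// latency instructions into colors of at most GroupSize.
#include <cstdio>
#include <vector>

int main() {
  const unsigned GroupSize = 2;
  std::vector<bool> Compatible = {true, true, true, false, true};
  int Color = 1;
  unsigned Count = 0;

  for (unsigned SU = 0; SU < Compatible.size(); ++SU) {
    if (!Compatible[SU]) {
      ++Color;   // abandon the forming group, seed a new one with SU
      Count = 1;
    } else if (++Count > GroupSize) {
      ++Color;   // previous group was full, SU starts the next one
      Count = 1;
    }
    std::printf("SU%u -> color %d\n", SU, Color);
  }
  return 0;
}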
@@ -835,6 +923,17 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
unsigned DAGSize = DAG->SUnits.size();
std::vector<int> PendingColoring = CurrentColoring;
+ assert(DAGSize >= 1 &&
+ CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
+ CurrentTopDownReservedDependencyColoring.size() == DAGSize);
+ // If there is no reserved block at all, do nothing. We don't want
+ // everything in one block.
+ if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
+ CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
+ *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
+ CurrentTopDownReservedDependencyColoring.end()) == 0)
+ return;
+
for (unsigned SUNum : DAG->BottomUpIndex2SU) {
SUnit *SU = &DAG->SUnits[SUNum];
std::set<unsigned> SUColors;
@@ -856,6 +955,9 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
SUColors.insert(CurrentColoring[Succ->NodeNum]);
SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
}
+ // If there is only one child/parent block, and that block
+ // is not among the ones we are removing in this path, then
+ // merge the instruction into that block
if (SUColors.size() == 1 && SUColorsPending.size() == 1)
PendingColoring[SU->NodeNum] = *SUColors.begin();
else // TODO: Attribute new colors depending on color
@@ -974,12 +1076,7 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
for (unsigned SUNum : DAG->BottomUpIndex2SU) {
SUnit *SU = &DAG->SUnits[SUNum];
unsigned color = CurrentColoring[SU->NodeNum];
- std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
- if (Pos != ColorCount.end()) {
- ++ColorCount[color];
- } else {
- ColorCount[color] = 1;
- }
+ ++ColorCount[color];
}
for (unsigned SUNum : DAG->BottomUpIndex2SU) {
@@ -1087,7 +1184,8 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
continue;
if (Node2CurrentBlock[Succ->NodeNum] != SUID)
- CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+ CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
+ SuccDep.isCtrl() ? NoData : Data);
}
for (SDep& PredDep : SU->Preds) {
SUnit *Pred = PredDep.getSUnit();
@@ -1281,10 +1379,8 @@ void SIScheduleBlockCreator::fillStats() {
Block->Height = 0;
else {
unsigned Height = 0;
- for (SIScheduleBlock *Succ : Block->getSuccs()) {
- if (Height < Succ->Height + 1)
- Height = Succ->Height + 1;
- }
+ for (const auto &Succ : Block->getSuccs())
+ Height = std::max(Height, Succ.first->Height + 1);
Block->Height = Height;
}
}
@@ -1331,13 +1427,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
continue;
int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
- std::map<unsigned, unsigned>::iterator RegPos =
- LiveOutRegsNumUsages[PredID].find(Reg);
- if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
- ++LiveOutRegsNumUsages[PredID][Reg];
- } else {
- LiveOutRegsNumUsages[PredID][Reg] = 1;
- }
+ ++LiveOutRegsNumUsages[PredID][Reg];
}
}
@@ -1361,6 +1451,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
std::set<unsigned> InRegs = DAG->getInRegs();
addLiveRegs(InRegs);
+ // Increase LiveOutRegsNumUsages for blocks
+ // producing registers consumed in another
+ // scheduling region.
+ for (unsigned Reg : DAG->getOutRegs()) {
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ // Do reverse traversal
+ int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
+ SIScheduleBlock *Block = Blocks[ID];
+ const std::set<unsigned> &OutRegs = Block->getOutRegs();
+
+ if (OutRegs.find(Reg) == OutRegs.end())
+ continue;
+
+ ++LiveOutRegsNumUsages[ID][Reg];
+ break;
+ }
+ }
+
// Fill LiveRegsConsumers for regs that were already
// defined before scheduling.
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
@@ -1377,12 +1485,8 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
}
}
- if (!Found) {
- if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
- LiveRegsConsumers[Reg] = 1;
- else
- ++LiveRegsConsumers[Reg];
- }
+ if (!Found)
+ ++LiveRegsConsumers[Reg];
}
}
@@ -1403,6 +1507,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
for (SIScheduleBlock* Block : BlocksScheduled) {
dbgs() << ' ' << Block->getID();
}
+ dbgs() << '\n';
);
}
@@ -1464,8 +1569,8 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
VregCurrentUsage, SregCurrentUsage);
if (VregCurrentUsage > maxVregUsage)
maxVregUsage = VregCurrentUsage;
- if (VregCurrentUsage > maxSregUsage)
- maxSregUsage = VregCurrentUsage;
+ if (SregCurrentUsage > maxSregUsage)
+ maxSregUsage = SregCurrentUsage;
DEBUG(
dbgs() << "Picking New Blocks\n";
dbgs() << "Available: ";
@@ -1556,17 +1661,13 @@ void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
}
void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
- for (SIScheduleBlock* Block : Parent->getSuccs()) {
- --BlockNumPredsLeft[Block->getID()];
- if (BlockNumPredsLeft[Block->getID()] == 0) {
- ReadyBlocks.push_back(Block);
- }
- // TODO: Improve check. When the dependency between the high latency
- // instructions and the instructions of the other blocks are WAR or WAW
- // there will be no wait triggered. We would like these cases to not
- // update LastPosHighLatencyParentScheduled.
- if (Parent->isHighLatencyBlock())
- LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
+ for (const auto &Block : Parent->getSuccs()) {
+ if (--BlockNumPredsLeft[Block.first->getID()] == 0)
+ ReadyBlocks.push_back(Block.first);
+
+ if (Parent->isHighLatencyBlock() &&
+ Block.second == SIScheduleBlockLinkKind::Data)
+ LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
}
}
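
A minimal standalone sketch of the link-kind-aware release above (types simplified; names local to the sketch): predecessor counts are decremented for every successor, but only Data edges from a high latency parent update the wait bookkeeping, so WAR/WAW links no longer force a wait.

#include <cstdio>
#include <utility>
#include <vector>

enum LinkKind { NoData, Data };
struct Block {
  int ID;
  std::vector<std::pair<Block *, LinkKind>> Succs;
};

int main() {
  Block B1{1, {}}, B2{2, {}};
  Block Parent{0, {{&B1, Data}, {&B2, NoData}}};
  std::vector<int> PredsLeft = {0, 1, 1}, LastHighLatencyPos = {-1, -1, -1};
  const int NumBlockScheduled = 5;
  const bool ParentIsHighLatency = true;

  for (const auto &S : Parent.Succs) {
    if (--PredsLeft[S.first->ID] == 0)
      std::printf("block %d is ready\n", S.first->ID);
    if (ParentIsHighLatency && S.second == Data)
      LastHighLatencyPos[S.first->ID] = NumBlockScheduled;
  }
  std::printf("last high-latency pos: B1=%d B2=%d\n",
              LastHighLatencyPos[1], LastHighLatencyPos[2]);
  return 0;
}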
@@ -1578,12 +1679,10 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
LiveOutRegsNumUsages[Block->getID()].begin(),
E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
std::pair<unsigned, unsigned> RegP = *RegI;
- if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
- LiveRegsConsumers[RegP.first] = RegP.second;
- else {
- assert(LiveRegsConsumers[RegP.first] == 0);
- LiveRegsConsumers[RegP.first] += RegP.second;
- }
+ // We produce this register, thus it must not be previously alive.
+ assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
+ LiveRegsConsumers[RegP.first] == 0);
+ LiveRegsConsumers[RegP.first] += RegP.second;
}
if (LastPosHighLatencyParentScheduled[Block->getID()] >
(unsigned)LastPosWaitedHighLatency)
@@ -1825,7 +1924,9 @@ void SIScheduleDAGMI::schedule()
// if VGPR usage is extremely high, try other good performing variants
// which could lead to lower VGPR usage
if (Best.MaxVGPRUsage > 180) {
- std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+ static const std::pair<SISchedulerBlockCreatorVariant,
+ SISchedulerBlockSchedulerVariant>
+ Variants[] = {
{ LatenciesAlone, BlockRegUsageLatency },
// { LatenciesAlone, BlockRegUsage },
{ LatenciesGrouped, BlockLatencyRegUsage },
@@ -1844,7 +1945,9 @@ void SIScheduleDAGMI::schedule()
// if VGPR usage is still extremely high, we may spill. Try other variants
// which are less performing, but that could lead to lower VGPR usage.
if (Best.MaxVGPRUsage > 200) {
- std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+ static const std::pair<SISchedulerBlockCreatorVariant,
+ SISchedulerBlockSchedulerVariant>
+ Variants[] = {
// { LatenciesAlone, BlockRegUsageLatency },
{ LatenciesAlone, BlockRegUsage },
// { LatenciesGrouped, BlockLatencyRegUsage },
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index 77c0735..122d0f6 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -40,13 +40,12 @@ enum SIScheduleCandReason {
struct SISchedulerCandidate {
// The reason for this candidate.
- SIScheduleCandReason Reason;
+ SIScheduleCandReason Reason = NoCand;
// Set of reasons that apply to multiple candidates.
- uint32_t RepeatReasonSet;
+ uint32_t RepeatReasonSet = 0;
- SISchedulerCandidate()
- : Reason(NoCand), RepeatReasonSet(0) {}
+ SISchedulerCandidate() = default;
bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
@@ -55,6 +54,11 @@ struct SISchedulerCandidate {
class SIScheduleDAGMI;
class SIScheduleBlockCreator;
+enum SIScheduleBlockLinkKind {
+ NoData,
+ Data
+};
+
class SIScheduleBlock {
SIScheduleDAGMI *DAG;
SIScheduleBlockCreator *BC;
@@ -84,8 +88,8 @@ class SIScheduleBlock {
std::set<unsigned> LiveInRegs;
std::set<unsigned> LiveOutRegs;
- bool Scheduled;
- bool HighLatencyBlock;
+ bool Scheduled = false;
+ bool HighLatencyBlock = false;
std::vector<unsigned> HasLowLatencyNonWaitedParent;
@@ -93,14 +97,14 @@ class SIScheduleBlock {
unsigned ID;
std::vector<SIScheduleBlock*> Preds; // All blocks predecessors.
- std::vector<SIScheduleBlock*> Succs; // All blocks successors.
- unsigned NumHighLatencySuccessors;
+ // All blocks successors, and the kind of link
+ std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
+ unsigned NumHighLatencySuccessors = 0;
public:
SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
unsigned ID):
- DAG(DAG), BC(BC), TopRPTracker(TopPressure), Scheduled(false),
- HighLatencyBlock(false), ID(ID), NumHighLatencySuccessors(0) {}
+ DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}
~SIScheduleBlock() = default;
@@ -114,10 +118,11 @@ public:
// Add block pred, which has instruction predecessor of SU.
void addPred(SIScheduleBlock *Pred);
- void addSucc(SIScheduleBlock *Succ);
+ void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
- const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+ ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
+ getSuccs() const { return Succs; }
unsigned Height; // Maximum topdown path length to block without outputs
unsigned Depth; // Maximum bottomup path length to block without inputs
@@ -213,9 +218,9 @@ struct SIScheduleBlocks {
};
enum SISchedulerBlockCreatorVariant {
- LatenciesAlone,
- LatenciesGrouped,
- LatenciesAlonePlusConsecutive
+ LatenciesAlone,
+ LatenciesGrouped,
+ LatenciesAlonePlusConsecutive
};
class SIScheduleBlockCreator {
@@ -451,6 +456,7 @@ public:
LiveIntervals *getLIS() { return LIS; }
MachineRegisterInfo *getMRI() { return &MRI; }
const TargetRegisterInfo *getTRI() { return TRI; }
+ ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
SUnit& getEntrySU() { return EntrySU; }
SUnit& getExitSU() { return ExitSU; }
@@ -469,6 +475,14 @@ public:
return InRegs;
}
+ std::set<unsigned> getOutRegs() {
+ std::set<unsigned> OutRegs;
+ for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+ OutRegs.insert(RegMaskPair.RegUnit);
+ }
+ return OutRegs;
+ };
+
unsigned getVGPRSetID() const { return VGPRSetID; }
unsigned getSGPRSetID() const { return SGPRSetID; }
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
new file mode 100644
index 0000000..e2ac663
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -0,0 +1,846 @@
+//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass tries to apply several peephole SDWA patterns.
+///
+/// E.g. original:
+/// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
+/// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+/// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+///
+/// Replace:
+/// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include <unordered_map>
+#include <unordered_set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-peephole-sdwa"
+
+STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
+STATISTIC(NumSDWAInstructionsPeepholed,
+ "Number of instruction converted to SDWA.");
+
+namespace {
+
+class SDWAOperand;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
+public:
+ typedef SmallVector<SDWAOperand *, 4> SDWAOperandsVector;
+
+private:
+ MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+
+ std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+ std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+ SmallVector<MachineInstr *, 8> ConvertedInstructions;
+
+ Optional<int64_t> foldToImm(const MachineOperand &Op) const;
+
+public:
+ static char ID;
+
+ SIPeepholeSDWA() : MachineFunctionPass(ID) {
+ initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void matchSDWAOperands(MachineFunction &MF);
+ bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
+ bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+ void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
+
+ StringRef getPassName() const override { return "SI Peephole SDWA"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+class SDWAOperand {
+private:
+ MachineOperand *Target; // Operand that would be used in the converted instruction
+ MachineOperand *Replaced; // Operand that would be replaced by Target
+
+public:
+ SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
+ : Target(TargetOp), Replaced(ReplacedOp) {
+ assert(Target->isReg());
+ assert(Replaced->isReg());
+ }
+
+ virtual ~SDWAOperand() {}
+
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+ virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+
+ MachineOperand *getTargetOperand() const { return Target; }
+ MachineOperand *getReplacedOperand() const { return Replaced; }
+ MachineInstr *getParentInst() const { return Target->getParent(); }
+ MachineRegisterInfo *getMRI() const {
+ return &getParentInst()->getParent()->getParent()->getRegInfo();
+ }
+};
+
+using namespace AMDGPU::SDWA;
+
+class SDWASrcOperand : public SDWAOperand {
+private:
+ SdwaSel SrcSel;
+ bool Abs;
+ bool Neg;
+ bool Sext;
+
+public:
+ SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+ SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
+ bool Sext_ = false)
+ : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
+ Neg(Neg_), Sext(Sext_) {}
+
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+ SdwaSel getSrcSel() const { return SrcSel; }
+ bool getAbs() const { return Abs; }
+ bool getNeg() const { return Neg; }
+ bool getSext() const { return Sext; }
+
+ uint64_t getSrcMods(const SIInstrInfo *TII,
+ const MachineOperand *SrcOp) const;
+};
+
+class SDWADstOperand : public SDWAOperand {
+private:
+ SdwaSel DstSel;
+ DstUnused DstUn;
+
+public:
+ SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+ SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
+ : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+
+ virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+ virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+ SdwaSel getDstSel() const { return DstSel; }
+ DstUnused getDstUnused() const { return DstUn; }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+
+char SIPeepholeSDWA::ID = 0;
+
+char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+
+FunctionPass *llvm::createSIPeepholeSDWAPass() {
+ return new SIPeepholeSDWA();
+}
+
+#ifndef NDEBUG
+
+static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+ switch(Sel) {
+ case BYTE_0: OS << "BYTE_0"; break;
+ case BYTE_1: OS << "BYTE_1"; break;
+ case BYTE_2: OS << "BYTE_2"; break;
+ case BYTE_3: OS << "BYTE_3"; break;
+ case WORD_0: OS << "WORD_0"; break;
+ case WORD_1: OS << "WORD_1"; break;
+ case DWORD: OS << "DWORD"; break;
+ }
+ return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
+ switch(Un) {
+ case UNUSED_PAD: OS << "UNUSED_PAD"; break;
+ case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
+ case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
+ }
+ return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
+ OS << "SDWA src: " << *Src.getTargetOperand()
+ << " src_sel:" << Src.getSrcSel()
+ << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
+ << " sext:" << Src.getSext() << '\n';
+ return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
+ OS << "SDWA dst: " << *Dst.getTargetOperand()
+ << " dst_sel:" << Dst.getDstSel()
+ << " dst_unused:" << Dst.getDstUnused() << '\n';
+ return OS;
+}
+
+#endif
+
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+ assert(To.isReg() && From.isReg());
+ To.setReg(From.getReg());
+ To.setSubReg(From.getSubReg());
+ To.setIsUndef(From.isUndef());
+ if (To.isUse()) {
+ To.setIsKill(From.isKill());
+ } else {
+ To.setIsDead(From.isDead());
+ }
+}
+
+static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
+ return LHS.isReg() &&
+ RHS.isReg() &&
+ LHS.getReg() == RHS.getReg() &&
+ LHS.getSubReg() == RHS.getSubReg();
+}
+
+static bool isSubregOf(const MachineOperand &SubReg,
+ const MachineOperand &SuperReg,
+ const TargetRegisterInfo *TRI) {
+
+ if (!SuperReg.isReg() || !SubReg.isReg())
+ return false;
+
+ if (isSameReg(SuperReg, SubReg))
+ return true;
+
+ if (SuperReg.getReg() != SubReg.getReg())
+ return false;
+
+ LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
+ LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
+ SuperMask |= ~SubMask;
+ return SuperMask.all();
+}
+
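
The mask identity in isSubregOf is easy to sanity-check on plain integers; a standalone toy with uint64_t standing in for LaneBitmask:

// Sub's lanes are covered by Super iff (SuperMask | ~SubMask) has
// every bit set, which is exactly the test above.
#include <cstdint>
#include <cstdio>

bool covers(uint64_t SuperMask, uint64_t SubMask) {
  return (SuperMask | ~SubMask) == ~UINT64_C(0);
}

int main() {
  const uint64_t Super = 0x0F;                // lanes 0-3
  std::printf("%d %d\n", covers(Super, 0x03), // lanes 0-1: covered -> 1
              covers(Super, 0x30));           // lanes 4-5: not covered -> 0
  return 0;
}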
+uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
+ const MachineOperand *SrcOp) const {
+ uint64_t Mods = 0;
+ const auto *MI = SrcOp->getParent();
+ if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
+ if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
+ Mods = Mod->getImm();
+ }
+ } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
+ if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
+ Mods = Mod->getImm();
+ }
+ }
+ if (Abs || Neg) {
+ assert(!Sext &&
+ "Float and integer src modifiers can't be set simulteniously");
+ Mods |= Abs ? SISrcMods::ABS : 0;
+ Mods ^= Neg ? SISrcMods::NEG : 0;
+ } else if (Sext) {
+ Mods |= SISrcMods::SEXT;
+ }
+
+ return Mods;
+}
+
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+ // For an SDWA src operand, the potential instruction is the one that
+ // uses the register defined by the parent instruction.
+ MachineRegisterInfo *MRI = getMRI();
+ MachineOperand *Replaced = getReplacedOperand();
+ assert(Replaced->isReg());
+
+ MachineInstr *PotentialMI = nullptr;
+ for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
+ // If this is a use of another subreg of the dst reg then do nothing
+ if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+ continue;
+
+ // If there exists a use of a superreg of dst then we should not
+ // combine this operand
+ if (!isSameReg(PotentialMO, *Replaced))
+ return nullptr;
+
+ // Check that PotentialMI is the only instruction that uses the dst reg
+ if (PotentialMI == nullptr) {
+ PotentialMI = PotentialMO.getParent();
+ } else if (PotentialMI != PotentialMO.getParent()) {
+ return nullptr;
+ }
+ }
+
+ return PotentialMI;
+}
+
+bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+ // Find the operand in the instruction that matches the source operand and
+ // replace it with the target operand. Set the corresponding src_sel.
+
+ MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+ MachineOperand *SrcMods =
+ TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+ assert(Src && (Src->isReg() || Src->isImm()));
+ if (!isSameReg(*Src, *getReplacedOperand())) {
+ // If this is not src0 then it should be src1
+ Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+ SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+ assert(Src && Src->isReg());
+
+ if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+ !isSameReg(*Src, *getReplacedOperand())) {
+ // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
+ // src2. This is not allowed.
+ return false;
+ }
+
+ assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+ }
+ copyRegOperand(*Src, *getTargetOperand());
+ SrcSel->setImm(getSrcSel());
+ SrcMods->setImm(getSrcMods(TII, Src));
+ getTargetOperand()->setIsKill(false);
+ return true;
+}
+
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+ // For an SDWA dst operand, the potential instruction is the one that
+ // defines the register that this operand uses.
+ MachineRegisterInfo *MRI = getMRI();
+ MachineInstr *ParentMI = getParentInst();
+ MachineOperand *Replaced = getReplacedOperand();
+ assert(Replaced->isReg());
+
+ for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
+ if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+ continue;
+
+ if (!isSameReg(*Replaced, PotentialMO))
+ return nullptr;
+
+ // Check that ParentMI is the only instruction that uses replaced register
+ for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
+ if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
+ UseMO.getParent() != ParentMI) {
+ return nullptr;
+ }
+ }
+
+ // Due to SSA this should be the only def of the replaced register, so return it
+ return PotentialMO.getParent();
+ }
+
+ return nullptr;
+}
+
+bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+ // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
+
+ if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+ getDstSel() != AMDGPU::SDWA::DWORD) {
+ // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
+ return false;
+ }
+
+ MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ assert(Operand &&
+ Operand->isReg() &&
+ isSameReg(*Operand, *getReplacedOperand()));
+ copyRegOperand(*Operand, *getTargetOperand());
+ MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+ assert(DstSel);
+ DstSel->setImm(getDstSel());
+ MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+ assert(DstUnused);
+ DstUnused->setImm(getDstUnused());
+
+ // Remove original instruction because it would conflict with our new
+ // instruction by register definition
+ getParentInst()->eraseFromParent();
+ return true;
+}
+
+Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
+ if (Op.isImm()) {
+ return Op.getImm();
+ }
+
+ // If this is not an immediate then it can be a copy of an immediate value, e.g.:
+ // %vreg1<def> = S_MOV_B32 255;
+ if (Op.isReg()) {
+ for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
+ if (!isSameReg(Op, Def))
+ continue;
+
+ const MachineInstr *DefInst = Def.getParent();
+ if (!TII->isFoldableCopy(*DefInst))
+ return None;
+
+ const MachineOperand &Copied = DefInst->getOperand(1);
+ if (!Copied.isImm())
+ return None;
+
+ return Copied.getImm();
+ }
+ }
+
+ return None;
+}
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::V_LSHRREV_B32_e32:
+ case AMDGPU::V_ASHRREV_I32_e32:
+ case AMDGPU::V_LSHLREV_B32_e32:
+ case AMDGPU::V_LSHRREV_B32_e64:
+ case AMDGPU::V_ASHRREV_I32_e64:
+ case AMDGPU::V_LSHLREV_B32_e64: {
+ // from: v_lshrrev_b32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+ // from: v_ashrrev_i32_e32 v1, 16/24, v0
+ // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+ // from: v_lshlrev_b32_e32 v1, 16/24, v0
+ // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm)
+ break;
+
+ if (*Imm != 16 && *Imm != 24)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B32_e64) {
+ auto SDWADst = make_unique<SDWADstOperand>(
+ Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+ SDWAOperands[&MI] = std::move(SDWADst);
+ ++NumSDWAPatternsFound;
+ } else {
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B32_e64);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ }
+ break;
+ }
+
+ case AMDGPU::V_LSHRREV_B16_e32:
+ case AMDGPU::V_ASHRREV_I16_e32:
+ case AMDGPU::V_LSHLREV_B16_e32:
+ case AMDGPU::V_LSHRREV_B16_e64:
+ case AMDGPU::V_ASHRREV_I16_e64:
+ case AMDGPU::V_LSHLREV_B16_e64: {
+ // from: v_lshrrev_b16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // from: v_ashrrev_i16_e32 v1, 8, v0
+ // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+ // from: v_lshlrev_b16_e32 v1, 8, v0
+ // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ auto Imm = foldToImm(*Src0);
+ if (!Imm || *Imm != 8)
+ break;
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
+ Opcode == AMDGPU::V_LSHLREV_B16_e64) {
+ auto SDWADst =
+ make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+ SDWAOperands[&MI] = std::move(SDWADst);
+ ++NumSDWAPatternsFound;
+ } else {
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ Src1, Dst, BYTE_1, false, false,
+ Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
+ Opcode != AMDGPU::V_LSHRREV_B16_e64);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ }
+ break;
+ }
+
+ case AMDGPU::V_BFE_I32:
+ case AMDGPU::V_BFE_U32: {
+ // e.g.:
+ // from: v_bfe_u32 v1, v0, 8, 8
+ // to SDWA src:v0 src_sel:BYTE_1
+
+ // offset | width | src_sel
+ // ------------------------
+ // 0 | 8 | BYTE_0
+ // 0 | 16 | WORD_0
+ // 0 | 32 | DWORD ?
+ // 8 | 8 | BYTE_1
+ // 16 | 8 | BYTE_2
+ // 16 | 16 | WORD_1
+ // 24 | 8 | BYTE_3
+
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto Offset = foldToImm(*Src1);
+ if (!Offset)
+ break;
+
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ auto Width = foldToImm(*Src2);
+ if (!Width)
+ break;
+
+ SdwaSel SrcSel = DWORD;
+
+ if (*Offset == 0 && *Width == 8)
+ SrcSel = BYTE_0;
+ else if (*Offset == 0 && *Width == 16)
+ SrcSel = WORD_0;
+ else if (*Offset == 0 && *Width == 32)
+ SrcSel = DWORD;
+ else if (*Offset == 8 && *Width == 8)
+ SrcSel = BYTE_1;
+ else if (*Offset == 16 && *Width == 8)
+ SrcSel = BYTE_2;
+ else if (*Offset == 16 && *Width == 16)
+ SrcSel = WORD_1;
+ else if (*Offset == 24 && *Width == 8)
+ SrcSel = BYTE_3;
+ else
+ break;
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src0->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ Src0, Dst, SrcSel, false, false,
+ Opcode == AMDGPU::V_BFE_U32 ? false : true);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ break;
+ }
+ case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64: {
+ // e.g.:
+ // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+ // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ auto ValSrc = Src1;
+ auto Imm = foldToImm(*Src0);
+
+ if (!Imm) {
+ Imm = foldToImm(*Src1);
+ ValSrc = Src0;
+ }
+
+ if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
+ break;
+
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (TRI->isPhysicalRegister(Src1->getReg()) ||
+ TRI->isPhysicalRegister(Dst->getReg()))
+ break;
+
+ auto SDWASrc = make_unique<SDWASrcOperand>(
+ ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+ DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+ SDWAOperands[&MI] = std::move(SDWASrc);
+ ++NumSDWAPatternsFound;
+ break;
+ }
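+
+ // Hedged aside: as a standalone predicate (illustrative only), the sole
+ // masks that map to a source select are the low-byte and low-word ones:
+ //   std::optional<SdwaSel> selFromMask(uint32_t M) {
+ //     if (M == 0x000000ff) return BYTE_0;
+ //     if (M == 0x0000ffff) return WORD_0;
+ //     return std::nullopt; // any other mask is not convertible
+ //   }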
+ }
+ }
+ }
+}
+
+bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
+ const SISubtarget &ST) const {
+ // Check if this instruction has an opcode that supports SDWA
+ int Opc = MI.getOpcode();
+ if (AMDGPU::getSDWAOp(Opc) == -1)
+ Opc = AMDGPU::getVOPe32(Opc);
+
+ if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1)
+ return false;
+
+ if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return false;
+
+ if (TII->isVOPC(Opc)) {
+ if (!ST.hasSDWASdst()) {
+ const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
+ if (SDst && SDst->getReg() != AMDGPU::VCC)
+ return false;
+ }
+
+ if (!ST.hasSDWAOutModsVOPC() &&
+ (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
+ return false;
+
+ } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
+ !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+ return false;
+ }
+
+ if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+ Opc == AMDGPU::V_MAC_F32_e32))
+ return false;
+
+ return true;
+}
+
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+ const SDWAOperandsVector &SDWAOperands) {
+ // Convert to sdwa
+ int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+ if (SDWAOpcode == -1)
+ SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(MI.getOpcode()));
+ assert(SDWAOpcode != -1);
+
+ const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
+
+ // Create SDWA version of instruction MI and initialize its operands
+ MachineInstrBuilder SDWAInst =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+
+ // Copy dst; if it is present in the original it should also be present in the SDWA instruction
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (Dst) {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
+ SDWAInst.add(*Dst);
+ } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
+ assert(Dst &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
+ SDWAInst.add(*Dst);
+ } else {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
+ SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
+ }
+
+ // Copy src0 and initialize src0_modifiers. All SDWA instructions have src0
+ // and src0_modifiers (except for v_nop_sdwa, but it can't get here).
+ MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ assert(
+ Src0 &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
+ if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
+ SDWAInst.addImm(Mod->getImm());
+ else
+ SDWAInst.addImm(0);
+ SDWAInst.add(*Src0);
+
+ // Copy src1 if present, initialize src1_modifiers.
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src1) {
+ assert(
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
+ AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
+ if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
+ SDWAInst.addImm(Mod->getImm());
+ else
+ SDWAInst.addImm(0);
+ SDWAInst.add(*Src1);
+ }
+
+ if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+ SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
+ // v_mac_f16/32 has an additional src2 operand tied to vdst
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ assert(Src2);
+ SDWAInst.add(*Src2);
+ }
+
+ // Copy clamp if present, initialize otherwise
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
+ MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
+ if (Clamp) {
+ SDWAInst.add(*Clamp);
+ } else {
+ SDWAInst.addImm(0);
+ }
+
+ // Copy omod if present, initialize otherwise if needed
+ if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
+ MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
+ if (OMod) {
+ SDWAInst.add(*OMod);
+ } else {
+ SDWAInst.addImm(0);
+ }
+ }
+
+ // Initialize dst_sel if present
+ if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
+
+ // Initialize dst_unused if present
+ if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
+ SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+ }
+
+ // Initialize src0_sel
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+
+
+ // Initialize src1_sel if present
+ if (Src1) {
+ assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
+ SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+ }
+
+ // Apply all SDWA operand patterns
+ bool Converted = false;
+ for (auto &Operand : SDWAOperands) {
+ // There should be no intersection between SDWA operands and potential MIs
+ // e.g.:
+ // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
+ // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
+ // v_add_u32 v3, v4, v2
+ //
+ // In this example we could fold the 2nd instruction into the 3rd
+ // (v_add_u32_sdwa) and then try to fold the 1st instruction into the 2nd,
+ // which was already destroyed. So if an SDWAOperand's parent instruction is
+ // also a potential MI, do not apply it.
+ if (PotentialMatches.count(Operand->getParentInst()) == 0)
+ Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+ }
+ if (Converted) {
+ ConvertedInstructions.push_back(SDWAInst);
+ } else {
+ SDWAInst->eraseFromParent();
+ return false;
+ }
+
+ DEBUG(dbgs() << "Convert instruction:" << MI
+ << "Into:" << *SDWAInst << '\n');
+ ++NumSDWAInstructionsPeepholed;
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// If an instruction was converted to SDWA it should not have immediates or SGPR
+// operands (one SGPR operand is allowed on GFX9). Copy its scalar operands into VGPRs.
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
+ const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+ unsigned ConstantBusCount = 0;
+ for (MachineOperand &Op: MI.explicit_uses()) {
+ if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
+ continue;
+
+ unsigned I = MI.getOperandNo(&Op);
+ if (Desc.OpInfo[I].RegClass == -1 ||
+ !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
+ continue;
+
+ if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
+ TRI->isSGPRReg(*MRI, Op.getReg())) {
+ ++ConstantBusCount;
+ continue;
+ }
+
+ unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
+ if (Op.isImm())
+ Copy.addImm(Op.getImm());
+ else if (Op.isReg())
+ Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
+ Op.getSubReg());
+ Op.ChangeToRegister(VGPR, false);
+ }
+}
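+
+// Hedged aside: the constant-bus rule above as a standalone helper. The
+// OpKind enum and names are illustrative assumptions, not the pass's types:
+//   enum class OpKind { VGPR, SGPR, Imm };
+//   // True if the operand must be rewritten through a V_MOV_B32 copy.
+//   bool needsVGPRCopy(OpKind K, bool HasSDWAScalar, unsigned &BusUses) {
+//     if (K == OpKind::VGPR)
+//       return false;
+//     if (HasSDWAScalar && K == OpKind::SGPR && BusUses == 0) {
+//       ++BusUses; // the single allowed SGPR rides the constant bus
+//       return false;
+//     }
+//     return true; // immediates and further scalars must be copied
+//   }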
+
+bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ if (!ST.hasSDWA())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
+
+ // Find all SDWA operands in MF.
+ matchSDWAOperands(MF);
+
+ for (const auto &OperandPair : SDWAOperands) {
+ const auto &Operand = OperandPair.second;
+ MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+ if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
+ PotentialMatches[PotentialMI].push_back(Operand.get());
+ }
+ }
+
+ for (auto &PotentialPair : PotentialMatches) {
+ MachineInstr &PotentialMI = *PotentialPair.first;
+ convertToSDWA(PotentialMI, PotentialPair.second);
+ }
+
+ PotentialMatches.clear();
+ SDWAOperands.clear();
+
+ bool Ret = !ConvertedInstructions.empty();
+ while (!ConvertedInstructions.empty())
+ legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
+
+ return Ret;
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a1ed5e8..4a3fbb4 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -13,9 +13,9 @@
//===----------------------------------------------------------------------===//
#include "SIRegisterInfo.h"
+#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -24,12 +24,6 @@
using namespace llvm;
-static cl::opt<bool> EnableSpillSGPRToSMEM(
- "amdgpu-spill-sgpr-to-smem",
- cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
- cl::init(false));
-
-
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
for (unsigned i = 0; PSets[i] != -1; ++i) {
if (PSets[i] == (int)PSetID)
@@ -49,9 +43,28 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
}
}
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
- SGPRPressureSets(getNumRegPressureSets()),
- VGPRPressureSets(getNumRegPressureSets()) {
+static cl::opt<bool> EnableSpillSGPRToSMEM(
+ "amdgpu-spill-sgpr-to-smem",
+ cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+ cl::init(false));
+
+static cl::opt<bool> EnableSpillSGPRToVGPR(
+ "amdgpu-spill-sgpr-to-vgpr",
+ cl::desc("Enable spilling VGPRs to SGPRs"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
+ AMDGPURegisterInfo(),
+ SGPRPressureSets(getNumRegPressureSets()),
+ VGPRPressureSets(getNumRegPressureSets()),
+ SpillSGPRToVGPR(false),
+ SpillSGPRToSMEM(false) {
+ if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
+ SpillSGPRToSMEM = true;
+ else if (EnableSpillSGPRToVGPR)
+ SpillSGPRToVGPR = true;
+
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPRSetID = NumRegPressureSets;
@@ -97,17 +110,17 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
- const MachineFunction &MF) const {
- unsigned RegCount = getMaxNumSGPRs(MF);
+static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned Reg;
- // Try to place it in a hole after PrivateSegmentbufferReg.
+ // Try to place it in a hole after PrivateSegmentBufferReg.
if (RegCount & 3) {
// We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
// alignment constraints, so we have a hole where we can put the wave offset.
@@ -117,9 +130,22 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
// wave offset before it.
Reg = RegCount - 5;
}
+
+ return Reg;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+ const MachineFunction &MF) const {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
+unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
+ const MachineFunction &MF) const {
+ return AMDGPU::SGPR32;
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@@ -129,6 +155,15 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::EXEC);
reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
+ // M0 has to be reserved so that llvm accepts it as a live-in into a block.
+ reserveRegisterTuples(Reserved, AMDGPU::M0);
+
+ // Reserve the memory aperture registers.
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
+ reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+
// Reserve Trap Handler registers - support is not implemented in Codegen.
reserveRegisterTuples(Reserved, AMDGPU::TBA);
reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -139,14 +174,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
- unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+ unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
reserveRegisterTuples(Reserved, Reg);
}
- unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
+ unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
@@ -170,15 +207,37 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
+ // We have to assume the SP is needed in case there are calls in the function,
+ // which is detected after the function is lowered. If we aren't really going
+ // to need SP, don't bother reserving it.
+ unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+
+ if (StackPtrReg != AMDGPU::NoRegister) {
+ reserveRegisterTuples(Reserved, StackPtrReg);
+ assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
+ }
+
+ unsigned FrameReg = MFI->getFrameOffsetReg();
+ if (FrameReg != AMDGPU::NoRegister) {
+ reserveRegisterTuples(Reserved, FrameReg);
+ assert(!isSubRegister(ScratchRSrcReg, FrameReg));
+ }
+
return Reserved;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
- return Fn.getFrameInfo().hasStackObjects();
+ const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
+ if (Info->isEntryFunction()) {
+ const MachineFrameInfo &MFI = Fn.getFrameInfo();
+ return MFI.hasStackObjects() || MFI.hasCalls();
+ }
+
+ // May need scavenger for dealing with callee-saved registers.
+ return true;
}
-bool
-SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
return MF.getFrameInfo().hasStackObjects();
}
@@ -253,7 +312,6 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
}
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -263,8 +321,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
.addFrameIndex(FrameIdx);
- BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
- .addReg(UnusedCarry, RegState::Define | RegState::Dead)
+ TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
.addReg(OffsetReg, RegState::Kill)
.addReg(FIReg);
}
@@ -292,8 +349,11 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-
assert(TII->isMUBUF(MI));
+ assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
+ MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
+ "should only be seeing frame offset relative FrameIndex");
+
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -415,14 +475,14 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
unsigned Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata)->getReg();
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
- .addReg(Reg, getDefRegState(!IsStore))
- .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
- .addOperand(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ .addReg(Reg, getDefRegState(!IsStore))
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
+ .addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
return true;
}
@@ -545,11 +605,20 @@ static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
}
-void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
+bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
- RegScavenger *RS) const {
+ RegScavenger *RS,
+ bool OnlyToVGPR) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
+ = MFI->getSGPRToVGPRSpills(Index);
+ bool SpillToVGPR = !VGPRSpills.empty();
+ if (OnlyToVGPR && !SpillToVGPR)
+ return false;
+
MachineRegisterInfo &MRI = MF->getRegInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -558,10 +627,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
- SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+ bool SpillToSMEM = spillSGPRToSMEM();
+ if (SpillToSMEM && OnlyToVGPR)
+ return false;
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
@@ -582,7 +652,8 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && isSGPRClass(RC)) {
// XXX - if private_element_size is larger than 4 it might be useful to be
// able to spill wider vmem spills.
- std::tie(EltSize, ScalarStoreOp) = getSpillEltSize(RC->getSize(), true);
+ std::tie(EltSize, ScalarStoreOp) =
+ getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
}
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
@@ -617,11 +688,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg())
+ .addReg(MFI->getFrameOffsetReg())
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg());
+ .addReg(MFI->getFrameOffsetReg());
}
BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
@@ -634,9 +705,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
continue;
}
- struct SIMachineFunctionInfo::SpilledReg Spill =
- MFI->getSpilledReg(MF, Index, i);
- if (Spill.hasReg()) {
+ if (SpillToVGPR) {
+ SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+
BuildMI(*MBB, MI, DL,
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill.VGPR)
@@ -647,6 +718,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// frame index, we should delete the frame index when all references to
// it are fixed.
} else {
+ // XXX - Can the spill to VGPR fail for some subregisters but not others?
+ if (OnlyToVGPR)
+ return false;
+
// Spill SGPR to a frame index.
// TODO: Should VI try to spill to VGPR and then spill to SMEM?
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -674,11 +749,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srrsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addReg(TmpReg, RegState::Kill) // src
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getFrameOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
}
}
@@ -690,22 +765,33 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MI->eraseFromParent();
MFI->addToSpilledSGPRs(NumSubRegs);
+ return true;
}
-void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
+bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int Index,
- RegScavenger *RS) const {
+ RegScavenger *RS,
+ bool OnlyToVGPR) const {
MachineFunction *MF = MI->getParent()->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
+ = MFI->getSGPRToVGPRSpills(Index);
+ bool SpillToVGPR = !VGPRSpills.empty();
+ if (OnlyToVGPR && !SpillToVGPR)
+ return false;
+
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
unsigned SuperReg = MI->getOperand(0).getReg();
- bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+ bool SpillToSMEM = spillSGPRToSMEM();
+ if (SpillToSMEM && OnlyToVGPR)
+ return false;
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
@@ -727,7 +813,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (SpillToSMEM && isSGPRClass(RC)) {
// XXX - if private_element_size is larger than 4 it might be useful to be
// able to spill wider vmem spills.
- std::tie(EltSize, ScalarLoadOp) = getSpillEltSize(RC->getSize(), false);
+ std::tie(EltSize, ScalarLoadOp) =
+ getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
}
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
@@ -753,11 +840,11 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
if (Offset != 0) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg())
+ .addReg(MFI->getFrameOffsetReg())
.addImm(Offset);
} else {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(MFI->getScratchWaveOffsetReg());
+ .addReg(MFI->getFrameOffsetReg());
}
auto MIB =
@@ -773,10 +860,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
continue;
}
- SIMachineFunctionInfo::SpilledReg Spill
- = MFI->getSpilledReg(MF, Index, i);
-
- if (Spill.hasReg()) {
+ if (SpillToVGPR) {
+ SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
auto MIB =
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
SubReg)
@@ -786,6 +871,9 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (NumSubRegs > 1)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
} else {
+ if (OnlyToVGPR)
+ return false;
+
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -799,10 +887,10 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srsrc
- .addReg(MFI->getScratchWaveOffsetReg()) // soffset
- .addImm(i * 4) // offset
+ .addFrameIndex(Index) // vaddr
+ .addReg(MFI->getScratchRSrcReg()) // srsrc
+ .addReg(MFI->getFrameOffsetReg()) // soffset
+ .addImm(i * 4) // offset
.addMemOperand(MMO);
auto MIB =
@@ -820,6 +908,32 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
}
MI->eraseFromParent();
+ return true;
+}
+
+/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
+/// a VGPR and the stack slot can be safely eliminated when all other users are
+/// handled.
+bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
+ MachineBasicBlock::iterator MI,
+ int FI,
+ RegScavenger *RS) const {
+ switch (MI->getOpcode()) {
+ case AMDGPU::SI_SPILL_S512_SAVE:
+ case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S128_SAVE:
+ case AMDGPU::SI_SPILL_S64_SAVE:
+ case AMDGPU::SI_SPILL_S32_SAVE:
+ return spillSGPR(MI, FI, RS, true);
+ case AMDGPU::SI_SPILL_S512_RESTORE:
+ case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S128_RESTORE:
+ case AMDGPU::SI_SPILL_S64_RESTORE:
+ case AMDGPU::SI_SPILL_S32_RESTORE:
+ return restoreSGPR(MI, FI, RS, true);
+ default:
+ llvm_unreachable("not an SGPR spill instruction");
+ }
}
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
@@ -901,12 +1015,83 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
default: {
- if (TII->isMUBUF(*MI)) {
+ const DebugLoc &DL = MI->getDebugLoc();
+ bool IsMUBUF = TII->isMUBUF(*MI);
+
+ if (!IsMUBUF &&
+ MFI->getFrameOffsetReg() != MFI->getScratchWaveOffsetReg()) {
+ // Convert to an absolute stack address by finding the offset from the
+ // scratch wave base and scaling by the wave size.
+ //
+ // In an entry function/kernel the stack address is already the absolute
+ // address relative to the scratch wave offset.
+
+ unsigned DiffReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
+ unsigned ResultReg = IsCopy ?
+ MI->getOperand(0).getReg() :
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
+ .addReg(MFI->getFrameOffsetReg())
+ .addReg(MFI->getScratchWaveOffsetReg());
+
+ int64_t Offset = FrameInfo.getObjectOffset(Index);
+ if (Offset == 0) {
+ // XXX - This never happens because of emergency scavenging slot at 0?
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
+ .addImm(Log2_32(ST.getWavefrontSize()))
+ .addReg(DiffReg);
+ } else {
+ unsigned CarryOut
+ = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ unsigned ScaledReg
+ = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
+ .addImm(Log2_32(ST.getWavefrontSize()))
+ .addReg(DiffReg, RegState::Kill);
+
+ // TODO: Fold if use instruction is another add of a constant.
+ if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addImm(Offset)
+ .addReg(ScaledReg, RegState::Kill);
+ } else {
+ unsigned ConstOffsetReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), ResultReg)
+ .addReg(CarryOut, RegState::Define | RegState::Dead)
+ .addReg(ConstOffsetReg, RegState::Kill)
+ .addReg(ScaledReg, RegState::Kill);
+ }
+
+ MRI.setRegAllocationHint(CarryOut, 0, AMDGPU::VCC);
+ }
+
+ // Don't introduce an extra copy if we're just materializing in a mov.
+ if (IsCopy)
+ MI->eraseFromParent();
+ else
+ FIOp.ChangeToRegister(ResultReg, false, false, true);
+ return;
+ }
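+
+ // Hedged aside: the net effect of the sequence above, in scalar form
+ // (names stand for the runtime values of the corresponding registers):
+ //   Diff   = FrameOffsetReg - ScratchWaveOffsetReg; // S_SUB_U32
+ //   Scaled = Diff >> Log2(WavefrontSize);           // V_LSHRREV_B32
+ //   Result = Scaled + ObjectOffset;                 // V_ADD_I32 (elided if 0)
+ // i.e. the frame's byte distance from the scratch wave base, scaled down
+ // by the wave size, plus the object's frame offset.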
+
+ if (IsMUBUF) {
// Disable offen so we don't need a 0 vgpr base.
assert(static_cast<int>(FIOperandNum) ==
AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::vaddr));
+ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg()
+ == MFI->getFrameOffsetReg());
+
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
= TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
@@ -915,23 +1100,85 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (isUInt<12>(NewOffset) &&
buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
- break;
+ return;
}
}
+ // If the offset is simply too big, don't convert to a scratch wave offset
+ // relative index.
+
int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
- .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ .addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
}
}
}
}
+StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
+ #define AMDGPU_REG_ASM_NAMES
+ #include "AMDGPURegAsmNames.inc.cpp"
+
+ #define REG_RANGE(BeginReg, EndReg, RegTable) \
+ if (Reg >= BeginReg && Reg <= EndReg) { \
+ unsigned Index = Reg - BeginReg; \
+ assert(Index < array_lengthof(RegTable)); \
+ return RegTable[Index]; \
+ }
+
+ REG_RANGE(AMDGPU::VGPR0, AMDGPU::VGPR255, VGPR32RegNames);
+ REG_RANGE(AMDGPU::SGPR0, AMDGPU::SGPR103, SGPR32RegNames);
+ REG_RANGE(AMDGPU::VGPR0_VGPR1, AMDGPU::VGPR254_VGPR255, VGPR64RegNames);
+ REG_RANGE(AMDGPU::SGPR0_SGPR1, AMDGPU::SGPR102_SGPR103, SGPR64RegNames);
+ REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2, AMDGPU::VGPR253_VGPR254_VGPR255,
+ VGPR96RegNames);
+
+ REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3,
+ AMDGPU::VGPR252_VGPR253_VGPR254_VGPR255,
+ VGPR128RegNames);
+ REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
+ AMDGPU::SGPR100_SGPR101_SGPR102_SGPR103,
+ SGPR128RegNames);
+
+ REG_RANGE(AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7,
+ AMDGPU::VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
+ VGPR256RegNames);
+
+ REG_RANGE(
+ AMDGPU::VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15,
+ AMDGPU::VGPR240_VGPR241_VGPR242_VGPR243_VGPR244_VGPR245_VGPR246_VGPR247_VGPR248_VGPR249_VGPR250_VGPR251_VGPR252_VGPR253_VGPR254_VGPR255,
+ VGPR512RegNames);
+
+ REG_RANGE(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7,
+ AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
+ SGPR256RegNames);
+
+ REG_RANGE(
+ AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7_SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15,
+ AMDGPU::SGPR88_SGPR89_SGPR90_SGPR91_SGPR92_SGPR93_SGPR94_SGPR95_SGPR96_SGPR97_SGPR98_SGPR99_SGPR100_SGPR101_SGPR102_SGPR103,
+ SGPR512RegNames
+ );
+
+#undef REG_RANGE
+
+ // FIXME: Rename flat_scr so we don't need to special case this.
+ switch (Reg) {
+ case AMDGPU::FLAT_SCR:
+ return "flat_scratch";
+ case AMDGPU::FLAT_SCR_LO:
+ return "flat_scratch_lo";
+ case AMDGPU::FLAT_SCR_HI:
+ return "flat_scratch_hi";
+ default:
+ // For the special named registers the default is fine.
+ return TargetRegisterInfo::getRegAsmName(Reg);
+ }
+}
+
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
@@ -963,20 +1210,21 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TODO: It might be helpful to have some target specific flags in
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
- switch (RC->getSize()) {
- case 0: return false;
- case 1: return false;
- case 4:
+ unsigned Size = getRegSizeInBits(*RC);
+ if (Size < 32)
+ return false;
+ switch (Size) {
+ case 32:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
- case 8:
+ case 64:
return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
- case 12:
+ case 96:
return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
- case 16:
+ case 128:
return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
- case 32:
+ case 256:
return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
- case 64:
+ case 512:
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
default:
llvm_unreachable("Invalid register class size");
@@ -985,18 +1233,18 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
const TargetRegisterClass *SRC) const {
- switch (SRC->getSize()) {
- case 4:
+ switch (getRegSizeInBits(*SRC)) {
+ case 32:
return &AMDGPU::VGPR_32RegClass;
- case 8:
+ case 64:
return &AMDGPU::VReg_64RegClass;
- case 12:
+ case 96:
return &AMDGPU::VReg_96RegClass;
- case 16:
+ case 128:
return &AMDGPU::VReg_128RegClass;
- case 32:
+ case 256:
return &AMDGPU::VReg_256RegClass;
- case 64:
+ case 512:
return &AMDGPU::VReg_512RegClass;
default:
llvm_unreachable("Invalid register class size");
@@ -1005,16 +1253,16 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
const TargetRegisterClass *VRC) const {
- switch (VRC->getSize()) {
- case 4:
+ switch (getRegSizeInBits(*VRC)) {
+ case 32:
return &AMDGPU::SGPR_32RegClass;
- case 8:
+ case 64:
return &AMDGPU::SReg_64RegClass;
- case 16:
+ case 128:
return &AMDGPU::SReg_128RegClass;
- case 32:
+ case 256:
return &AMDGPU::SReg_256RegClass;
- case 64:
+ case 512:
return &AMDGPU::SReg_512RegClass;
default:
llvm_unreachable("Invalid register class size");
@@ -1108,12 +1356,11 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
- if (ST.isAmdCodeObjectV2(MF)) {
- assert(MFI->hasPrivateSegmentBuffer());
- return MFI->PrivateSegmentBufferUserSGPR;
- }
- assert(MFI->hasPrivateMemoryInputPtr());
- return MFI->PrivateMemoryPtrUserSGPR;
+ assert(MFI->hasPrivateSegmentBuffer());
+ return MFI->PrivateSegmentBufferUserSGPR;
+ case SIRegisterInfo::IMPLICIT_BUFFER_PTR:
+ assert(MFI->hasImplicitBufferPtr());
+ return MFI->ImplicitBufferPtrUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
@@ -1156,210 +1403,6 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
return AMDGPU::NoRegister;
}
-unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 800;
- return 512;
-}
-
-unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 102;
- return 104;
-}
-
-unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
- const SIMachineFunctionInfo &MFI) const {
- if (MFI.hasFlatScratchInit()) {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
- return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
-
- if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
- return 4; // FLAT_SCRATCH, VCC (in that order)
- }
-
- if (ST.isXNACKEnabled())
- return 4; // XNACK, VCC (in that order)
-
- return 2; // VCC.
-}
-
-unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
- unsigned WavesPerEU) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- switch (WavesPerEU) {
- case 0: return 0;
- case 10: return 0;
- case 9: return 0;
- case 8: return 81;
- default: return 97;
- }
- } else {
- switch (WavesPerEU) {
- case 0: return 0;
- case 10: return 0;
- case 9: return 49;
- case 8: return 57;
- case 7: return 65;
- case 6: return 73;
- case 5: return 81;
- default: return 97;
- }
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
- unsigned WavesPerEU,
- bool Addressable) const {
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- switch (WavesPerEU) {
- case 0: return 80;
- case 10: return 80;
- case 9: return 80;
- case 8: return 96;
- default: return Addressable ? getNumAddressableSGPRs(ST) : 112;
- }
- } else {
- switch (WavesPerEU) {
- case 0: return 48;
- case 10: return 48;
- case 9: return 56;
- case 8: return 64;
- case 7: return 72;
- case 6: return 80;
- case 5: return 96;
- default: return getNumAddressableSGPRs(ST);
- }
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
- const Function &F = *MF.getFunction();
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
- // Compute maximum number of SGPRs function can use using default/requested
- // minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
- unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false);
- unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true);
-
- // Check if maximum number of SGPRs was explicitly requested using
- // "amdgpu-num-sgpr" attribute.
- if (F.hasFnAttribute("amdgpu-num-sgpr")) {
- unsigned Requested = AMDGPU::getIntegerAttribute(
- F, "amdgpu-num-sgpr", MaxNumSGPRs);
-
- // Make sure requested value does not violate subtarget's specifications.
- if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
- Requested = 0;
-
- // If more SGPRs are required to support the input user/system SGPRs,
- // increase to accommodate them.
- //
- // FIXME: This really ends up using the requested number of SGPRs + number
- // of reserved special registers in total. Theoretically you could re-use
- // the last input registers for these special registers, but this would
- // require a lot of complexity to deal with the weird aliasing.
- unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
- if (Requested && Requested < NumInputSGPRs)
- Requested = NumInputSGPRs;
-
- // Make sure requested value is compatible with values implied by
- // default/requested minimum/maximum number of waves per execution unit.
- if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false))
- Requested = 0;
- if (WavesPerEU.second &&
- Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
- Requested = 0;
-
- if (Requested)
- MaxNumSGPRs = Requested;
- }
-
- if (ST.hasSGPRInitBug())
- MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
-
- return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
- MaxNumAddressableSGPRs);
-}
-
-unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
- const SISubtarget &ST) const {
- if (ST.debuggerReserveRegs())
- return 4;
- return 0;
-}
-
-unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
- switch (WavesPerEU) {
- case 0: return 0;
- case 10: return 0;
- case 9: return 25;
- case 8: return 29;
- case 7: return 33;
- case 6: return 37;
- case 5: return 41;
- case 4: return 49;
- case 3: return 65;
- case 2: return 85;
- default: return 129;
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
- switch (WavesPerEU) {
- case 0: return 24;
- case 10: return 24;
- case 9: return 28;
- case 8: return 32;
- case 7: return 36;
- case 6: return 40;
- case 5: return 48;
- case 4: return 64;
- case 3: return 84;
- case 2: return 128;
- default: return getTotalNumVGPRs();
- }
-}
-
-unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
- const Function &F = *MF.getFunction();
-
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
- // Compute maximum number of VGPRs function can use using default/requested
- // minimum number of waves per execution unit.
- std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
- unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
-
- // Check if maximum number of VGPRs was explicitly requested using
- // "amdgpu-num-vgpr" attribute.
- if (F.hasFnAttribute("amdgpu-num-vgpr")) {
- unsigned Requested = AMDGPU::getIntegerAttribute(
- F, "amdgpu-num-vgpr", MaxNumVGPRs);
-
- // Make sure requested value does not violate subtarget's specifications.
- if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
- Requested = 0;
-
- // Make sure requested value is compatible with values implied by
- // default/requested minimum/maximum number of waves per execution unit.
- if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
- Requested = 0;
- if (WavesPerEU.second &&
- Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
- Requested = 0;
-
- if (Requested)
- MaxNumVGPRs = Requested;
- }
-
- return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
-}
-
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
if (EltSize == 4) {
@@ -1476,3 +1519,62 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
unsigned Reg) const {
return hasVGPRs(getRegClassForReg(MRI, Reg));
}
+
+bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC,
+ unsigned SubReg,
+ const TargetRegisterClass *DstRC,
+ unsigned DstSubReg,
+ const TargetRegisterClass *NewRC) const {
+ unsigned SrcSize = getRegSizeInBits(*SrcRC);
+ unsigned DstSize = getRegSizeInBits(*DstRC);
+ unsigned NewSize = getRegSizeInBits(*NewRC);
+
+ // Do not increase the size of registers beyond a dword; we would need to
+ // allocate adjacent registers and constrain regalloc more than needed.
+
+ // Always allow dword coalescing.
+ if (SrcSize <= 32 || DstSize <= 32)
+ return true;
+
+ return NewSize <= DstSize || NewSize <= SrcSize;
+}
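+
+// Hedged aside: the decision above as a pure predicate, with one worked case
+// (illustrative only):
+//   bool ok(unsigned Src, unsigned Dst, unsigned New) {
+//     if (Src <= 32 || Dst <= 32) return true; // dword merges are always fine
+//     return New <= Dst || New <= Src;         // never outgrow both inputs
+//   }
+// e.g. ok(64, 64, 128) is false: merging two 64-bit classes into a 128-bit
+// tuple would force adjacent allocation with no upside.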
+
+unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const {
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+ *MF.getFunction());
+ switch (RC->getID()) {
+ default:
+ return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
+ case AMDGPU::VGPR_32RegClassID:
+ return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
+ case AMDGPU::SGPR_32RegClassID:
+ return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
+ }
+}
+
+unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const {
+ if (Idx == getVGPRPressureSet())
+ return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
+ const_cast<MachineFunction &>(MF));
+
+ if (Idx == getSGPRPressureSet())
+ return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
+ const_cast<MachineFunction &>(MF));
+
+ return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
+}
+
+const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
+ static const int Empty[] = { -1 };
+
+ if (hasRegUnit(AMDGPU::M0, RegUnit))
+ return Empty;
+ return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
+}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 0bcae7d..600cc88 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -16,13 +16,14 @@
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#include "AMDGPURegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
-class SISubtarget;
class MachineRegisterInfo;
+class SISubtarget;
class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
@@ -31,13 +32,22 @@ private:
unsigned VGPRSetID;
BitVector SGPRPressureSets;
BitVector VGPRPressureSets;
+ bool SpillSGPRToVGPR;
+ bool SpillSGPRToSMEM;
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
BitVector &PressureSets) const;
-
public:
- SIRegisterInfo();
+ SIRegisterInfo(const SISubtarget &ST);
+
+ bool spillSGPRToVGPR() const {
+ return SpillSGPRToVGPR;
+ }
+
+ bool spillSGPRToSMEM() const {
+ return SpillSGPRToSMEM;
+ }
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
@@ -48,8 +58,22 @@ public:
unsigned reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const;
+ unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
+
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
+
+ // Stack access is very expensive. CSRs are also the high registers, and we
+ // want to minimize the number of used registers.
+ unsigned getCSRFirstUseCost() const override {
+ return 100;
+ }
+
+ unsigned getFrameRegister(const MachineFunction &MF) const override;
+
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -78,16 +102,24 @@ public:
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
- void spillSGPR(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS) const;
+ /// If \p OnlyToVGPR is true, this will only succeed if the spill can be done
+ /// to a VGPR, and returns false otherwise.
+ bool spillSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS,
+ bool OnlyToVGPR = false) const;
- void restoreSGPR(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS) const;
+ bool restoreSGPR(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS,
+ bool OnlyToVGPR = false) const;
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
+ bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
+ int FI, RegScavenger *RS) const;
+
+ StringRef getRegAsmName(unsigned Reg) const override;
+
unsigned getHWRegIndex(unsigned Reg) const {
return getEncodingValue(Reg) & 0xff;
}
@@ -165,12 +197,13 @@ public:
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14,
+ IMPLICIT_BUFFER_PTR = 15,
// VGPRS:
- FIRST_VGPR_VALUE = 15,
+ FIRST_VGPR_VALUE = 16,
WORKITEM_ID_X = FIRST_VGPR_VALUE,
- WORKITEM_ID_Y = 16,
- WORKITEM_ID_Z = 17
+ WORKITEM_ID_Y = 17,
+ WORKITEM_ID_Z = 18
};
/// \brief Returns the physical register that \p Value is stored in.
@@ -195,74 +228,28 @@ public:
return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
}
- /// \returns SGPR allocation granularity supported by the subtarget.
- unsigned getSGPRAllocGranule() const {
- return 8;
- }
-
- /// \returns Total number of SGPRs supported by the subtarget.
- unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
-
- /// \returns Number of addressable SGPRs supported by the subtarget.
- unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
-
- /// \returns Number of reserved SGPRs supported by the subtarget.
- unsigned getNumReservedSGPRs(const SISubtarget &ST,
- const SIMachineFunctionInfo &MFI) const;
-
- /// \returns Minimum number of SGPRs that meets given number of waves per
- /// execution unit requirement for given subtarget.
- unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
-
- /// \returns Maximum number of SGPRs that meets given number of waves per
- /// execution unit requirement for given subtarget.
- unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU,
- bool Addressable) const;
-
- /// \returns Maximum number of SGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of SGPRs explicitly
- /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
-
- /// \returns VGPR allocation granularity supported by the subtarget.
- unsigned getVGPRAllocGranule() const {
- return 4;
- }
-
- /// \returns Total number of VGPRs supported by the subtarget.
- unsigned getTotalNumVGPRs() const {
- return 256;
- }
+ ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
+ unsigned EltSize) const;
- /// \returns Number of reserved VGPRs for debugger use supported by the
- /// subtarget.
- unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
+ bool shouldCoalesce(MachineInstr *MI,
+ const TargetRegisterClass *SrcRC,
+ unsigned SubReg,
+ const TargetRegisterClass *DstRC,
+ unsigned DstSubReg,
+ const TargetRegisterClass *NewRC) const override;
- /// \returns Minimum number of SGPRs that meets given number of waves per
- /// execution unit requirement.
- unsigned getMinNumVGPRs(unsigned WavesPerEU) const;
+ unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+ MachineFunction &MF) const override;
- /// \returns Maximum number of VGPRs that meets given number of waves per
- /// execution unit requirement.
- unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;
+ unsigned getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const override;
- /// \returns Maximum number of VGPRs that meets number of waves per execution
- /// unit requirement for function \p MF, or number of VGPRs explicitly
- /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
- ///
- /// \returns Value that meets number of waves per execution unit requirement
- /// if explicitly requested value cannot be converted to integer, violates
- /// subtarget's specifications, or does not meet number of waves per execution
- /// unit requirement.
- unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+ const int *getRegUnitPressureSets(unsigned RegUnit) const override;
- ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
- unsigned EltSize) const;
+ unsigned getReturnAddressReg(const MachineFunction &MF) const {
+ // Not a callee saved register.
+ return AMDGPU::SGPR30_SGPR31;
+ }
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 31e714b..d097b78 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -23,6 +23,13 @@ class SIReg <string n, bits<16> regIdx = 0> : Register<n>,
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
+// Pseudo-registers: Used as placeholders during isel and immediately
+// replaced, never seeing the verifier.
+def PRIVATE_RSRC_REG : SIReg<"", 0>;
+def FP_REG : SIReg<"", 0>;
+def SP_REG : SIReg<"", 0>;
+def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>;
+
// VCC for 64-bit instructions
def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
DwarfRegAlias<VCC_LO> {
@@ -44,6 +51,11 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
def SCC : SIReg<"scc", 253>;
def M0 : SIReg <"m0", 124>;
+def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
+def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
+def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
+def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+
// Trap handler registers
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
@@ -128,7 +140,7 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
// TODO: Do we need to set DwarfRegAlias on register tuples?
// SGPR 32-bit registers
-def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "SGPR%u", 0, 103))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
@@ -179,7 +191,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
(add (decimate (shl SGPR_32, 15), 4))]>;
// Trap handler TMP 32-bit registers
-def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
(add (sequence "TTMP%u", 0, 11))> {
let isAllocatable = 0;
}
@@ -197,7 +209,8 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
(add (decimate (shl TTMP_32, 3), 4))]>;
// VGPR 32-bit registers
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+// i16/f16 only on VI+
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
@@ -258,19 +271,20 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
- TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+ TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
+ SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
let AllocationPriority = 7;
}
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 7;
}
// Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 7;
}
@@ -307,7 +321,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R
let isAllocatable = 0;
}
-def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> {
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32,
+ (add SGPR_128, TTMP_128)> {
let AllocationPriority = 10;
}
@@ -319,7 +334,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
let AllocationPriority = 11;
}
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
let AllocationPriority = 12;
@@ -366,7 +381,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
let Size = 32;
}
-def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, SReg_32)> {
let isAllocatable = 0;
}
@@ -417,6 +432,18 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> {
let OperandType = opType#"_FP64";
let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
}
+
+ def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_V2INT16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">;
+ let DecoderMethod = "decodeOperand_VSrcV216";
+ }
+
+ def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+ let OperandType = opType#"_V2FP16";
+ let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">;
+ let DecoderMethod = "decodeOperand_VSrcV216";
+ }
}
}
@@ -445,7 +472,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
defm VSrc : RegImmOperand<"VS", "VSrc">;
-def VSrc_128 : RegisterOperand<VReg_128>;
+def VSrc_128 : RegisterOperand<VReg_128> {
+ let DecoderMethod = "DecodeVS_128RegisterClass";
+}
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an VGPR
diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
index be27966..0f02f58 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -53,6 +53,11 @@ class SISchedMachineModel : SchedMachineModel {
let MicroOpBufferSize = 1;
let IssueWidth = 1;
let PostRAScheduler = 1;
+
+ // FIXME: Approximate 2 * branch cost. Try to hack around bad
+ // early-ifcvt heuristics. These need improvement to avoid the OOE
+ // heuristics.
+ int MispredictPenalty = 20;
}
def SIFullSpeedModel : SISchedMachineModel;
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index dd31dc6..874fbad 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -92,6 +92,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isImm())
+ return false;
// Additional verification is needed for sdst/src2.
return true;
@@ -108,10 +110,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
}
const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
- const MachineOperand *Src1Mod =
- TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
-
- if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0)))
+ if (Src1 && (!isVGPR(Src1, TRI, MRI) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
return false;
// We don't need to check src0, all input types are legal, so just make sure
@@ -120,58 +120,64 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII,
return false;
// Check output modifiers
- if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
- return false;
-
- return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
+ return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
/// \brief This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
-/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction
-/// and will only fold literal constants if we are still in SSA.
-static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
+/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
+static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
MachineRegisterInfo &MRI, bool TryToCommute = true) {
-
- if (!MRI.isSSA())
- return;
-
assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- // Only one literal constant is allowed per instruction, so if src0 is a
- // literal constant then we can't do any folding.
- if (TII->isLiteralConstant(MI, Src0Idx))
- return;
-
// Try to fold Src0
MachineOperand &Src0 = MI.getOperand(Src0Idx);
- if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
+ if (Src0.isReg()) {
unsigned Reg = Src0.getReg();
- MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
- if (Def && Def->isMoveImmediate()) {
- MachineOperand &MovSrc = Def->getOperand(1);
- bool ConstantFolded = false;
-
- if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
- isUInt<32>(MovSrc.getImm()))) {
- Src0.ChangeToImmediate(MovSrc.getImm());
- ConstantFolded = true;
- }
- if (ConstantFolded) {
- if (MRI.use_empty(Reg))
+ if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
+ MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
+ if (Def && Def->isMoveImmediate()) {
+ MachineOperand &MovSrc = Def->getOperand(1);
+ bool ConstantFolded = false;
+
+ if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) ||
+ isUInt<32>(MovSrc.getImm()))) {
+ // It's possible to have only one component of a super-reg defined by
+ // a single mov, so we need to clear any subregister flag.
+ Src0.setSubReg(0);
+ Src0.ChangeToImmediate(MovSrc.getImm());
+ ConstantFolded = true;
+ } else if (MovSrc.isFI()) {
+ Src0.setSubReg(0);
+ Src0.ChangeToFrameIndex(MovSrc.getIndex());
+ ConstantFolded = true;
+ }
+
+ if (ConstantFolded) {
+ assert(MRI.use_empty(Reg));
Def->eraseFromParent();
- ++NumLiteralConstantsFolded;
- return;
+ ++NumLiteralConstantsFolded;
+ return true;
+ }
}
}
}
// We have failed to fold src0, so commute the instruction and try again.
- if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI))
- foldImmediates(MI, TII, MRI, false);
+ if (TryToCommute && MI.isCommutable()) {
+ if (TII->commuteInstruction(MI)) {
+ if (foldImmediates(MI, TII, MRI, false))
+ return true;
+ // Commute back.
+ TII->commuteInstruction(MI);
+ }
+ }
+
+ return false;
}
// Copy MachineOperand with all flags except setting it as implicit.
@@ -497,24 +503,24 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
if (Op32DstIdx != -1) {
// dst
- Inst32.addOperand(MI.getOperand(0));
+ Inst32.add(MI.getOperand(0));
} else {
assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
"Unexpected case");
}
- Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
+ Inst32.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
const MachineOperand *Src1 =
TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (Src1)
- Inst32.addOperand(*Src1);
+ Inst32.add(*Src1);
if (Src2) {
int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
if (Op32Src2Idx != -1) {
- Inst32.addOperand(*Src2);
+ Inst32.add(*Src2);
} else {
// In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
// replaced with an implicit read of vcc. This was already added
diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
deleted file mode 100644
index aad6853..0000000
--- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass removes performs the following type substitution on all
-/// non-compute shaders:
-///
-/// v16i8 => i128
-/// - v16i8 is used for constant memory resource descriptors. This type is
-/// legal for some compute APIs, and we don't want to declare it as legal
-/// in the backend, because we want the legalizer to expand all v16i8
-/// operations.
-/// v1* => *
-/// - Having v1* types complicates the legalizer and we can easily replace
-/// - them with the element type.
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-
-using namespace llvm;
-
-namespace {
-
-class SITypeRewriter : public FunctionPass,
- public InstVisitor<SITypeRewriter> {
-
- static char ID;
- Module *Mod;
- Type *v16i8;
- Type *v4i32;
-
-public:
- SITypeRewriter() : FunctionPass(ID) { }
- bool doInitialization(Module &M) override;
- bool runOnFunction(Function &F) override;
- StringRef getPassName() const override { return "SI Type Rewriter"; }
- void visitLoadInst(LoadInst &I);
- void visitCallInst(CallInst &I);
- void visitBitCast(BitCastInst &I);
-};
-
-} // End anonymous namespace
-
-char SITypeRewriter::ID = 0;
-
-bool SITypeRewriter::doInitialization(Module &M) {
- Mod = &M;
- v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
- v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
- return false;
-}
-
-bool SITypeRewriter::runOnFunction(Function &F) {
- if (!AMDGPU::isShader(F.getCallingConv()))
- return false;
-
- visit(F);
- visit(F);
-
- return false;
-}
-
-void SITypeRewriter::visitLoadInst(LoadInst &I) {
- Value *Ptr = I.getPointerOperand();
- Type *PtrTy = Ptr->getType();
- Type *ElemTy = PtrTy->getPointerElementType();
- IRBuilder<> Builder(&I);
- if (ElemTy == v16i8) {
- Value *BitCast = Builder.CreateBitCast(Ptr,
- PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
- LoadInst *Load = Builder.CreateLoad(BitCast);
- SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- I.getAllMetadataOtherThanDebugLoc(MD);
- for (unsigned i = 0, e = MD.size(); i != e; ++i) {
- Load->setMetadata(MD[i].first, MD[i].second);
- }
- Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
- I.replaceAllUsesWith(BitCastLoad);
- I.eraseFromParent();
- }
-}
-
-void SITypeRewriter::visitCallInst(CallInst &I) {
- IRBuilder<> Builder(&I);
-
- SmallVector <Value*, 8> Args;
- SmallVector <Type*, 8> Types;
- bool NeedToReplace = false;
- Function *F = I.getCalledFunction();
- if (!F)
- return;
-
- std::string Name = F->getName();
- for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
- Value *Arg = I.getArgOperand(i);
- if (Arg->getType() == v16i8) {
- Args.push_back(Builder.CreateBitCast(Arg, v4i32));
- Types.push_back(v4i32);
- NeedToReplace = true;
- Name = Name + ".v4i32";
- } else if (Arg->getType()->isVectorTy() &&
- Arg->getType()->getVectorNumElements() == 1 &&
- Arg->getType()->getVectorElementType() ==
- Type::getInt32Ty(I.getContext())){
- Type *ElementTy = Arg->getType()->getVectorElementType();
- std::string TypeName = "i32";
- InsertElementInst *Def = cast<InsertElementInst>(Arg);
- Args.push_back(Def->getOperand(1));
- Types.push_back(ElementTy);
- std::string VecTypeName = "v1" + TypeName;
- Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
- NeedToReplace = true;
- } else {
- Args.push_back(Arg);
- Types.push_back(Arg->getType());
- }
- }
-
- if (!NeedToReplace) {
- return;
- }
- Function *NewF = Mod->getFunction(Name);
- if (!NewF) {
- NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
- NewF->setAttributes(F->getAttributes());
- }
- I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
- I.eraseFromParent();
-}
-
-void SITypeRewriter::visitBitCast(BitCastInst &I) {
- IRBuilder<> Builder(&I);
- if (I.getDestTy() != v4i32) {
- return;
- }
-
- if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
- if (Op->getSrcTy() == v4i32) {
- I.replaceAllUsesWith(Op->getOperand(0));
- I.eraseFromParent();
- }
- }
-}
-
-FunctionPass *llvm::createSITypeRewriter() {
- return new SITypeRewriter();
-}
diff --git a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
index 0265648..73dd8b7 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -226,9 +226,10 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
auto Ld = cast<LoadSDNode>(N);
return Ld->getAlignment() >= 4 &&
- ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+ ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
- (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
+ !Ld->isVolatile() &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
}]>;
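
In effect, a load qualifies for the scalar (SMRD) path only when it is at least 4-byte aligned and provably uniform: either it reads the constant address space, or, on subtargets that scalarize global loads, it is a non-volatile global load whose memory operand is not clobbered. The newly added !Ld->isVolatile() guard keeps volatile global loads on the vector memory path.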
@@ -293,12 +294,6 @@ def : Pat <
let Predicates = [isVI] in {
-// 1. Offset as 20bit DWORD immediate
-def : Pat <
- (SIload_constant v4i32:$sbase, IMM20bit:$offset),
- (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset), 0)
->;
-
def : Pat <
(i64 (readcyclecounter)),
(S_MEMREALTIME)
diff --git a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 73cd577..ec29a66 100644
--- a/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -82,6 +82,12 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
let has_sdst = 0;
}
+class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_32:$src0),
+ "$src0", pattern> {
+ let has_sdst = 0;
+}
+
class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0),
"$sdst, $src0", pattern
@@ -178,13 +184,27 @@ def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">;
def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">;
def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
-def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
+def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
+ [(set i64:$sdst, (int_amdgcn_s_getpc))]
+>;
-let isTerminator = 1, isBarrier = 1,
- isBranch = 1, isIndirectBranch = 1 in {
+let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
+
+let isBranch = 1, isIndirectBranch = 1 in {
def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+} // End isBranch = 1, isIndirectBranch = 1
+
+let isReturn = 1 in {
+// Define a variant marked as a return rather than a branch.
+def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
+}
+} // End isTerminator = 1, isBarrier = 1
+
+let isCall = 1 in {
+def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64"
+>;
}
-def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
+
def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
@@ -210,7 +230,7 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
} // End Uses = [M0]
-def S_CBRANCH_JOIN : SOP1_1 <"s_cbranch_join">;
+def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
let Defs = [SCC] in {
def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
@@ -428,7 +448,7 @@ def S_BFE_I64 : SOP2_64_32 <"s_bfe_i64">;
def S_CBRANCH_G_FORK : SOP2_Pseudo <
"s_cbranch_g_fork", (outs),
- (ins SReg_64:$src0, SReg_64:$src1),
+ (ins SCSrc_b64:$src0, SCSrc_b64:$src1),
"$src0, $src1"
> {
let has_sdst = 0;
@@ -438,6 +458,22 @@ let Defs = [SCC] in {
def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
} // End Defs = [SCC]
+let SubtargetPredicate = isVI in {
+ def S_RFE_RESTORE_B64 : SOP2_Pseudo <
+ "s_rfe_restore_b64", (outs),
+ (ins SSrc_b64:$src0, SSrc_b32:$src1),
+ "$src0, $src1"
+ > {
+ let hasSideEffects = 1;
+ let has_sdst = 0;
+ }
+}
+
+let SubtargetPredicate = isGFX9 in {
+ def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
+ def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
+ def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+}
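
Per the gfx9 ISA, these scalar pack instructions build one 32-bit result from two 16-bit halves, with src1 supplying the upper half of the result: s_pack_ll_b32_b16 takes the low halves of both sources, s_pack_lh_b32_b16 combines src0's low half with src1's high half, and s_pack_hh_b32_b16 uses both sources' high halves.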
//===----------------------------------------------------------------------===//
// SOPK Instructions
@@ -508,14 +544,16 @@ class SOPKInstTable <bit is_sopk, string cmpOp = ""> {
class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
opName,
(outs SReg_32:$sdst),
- (ins u16imm:$simm16),
+ (ins s16imm:$simm16),
"$sdst, $simm16",
pattern>;
-class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo <
+class SOPK_SCC <string opName, string base_op, bit isSignExt> : SOPK_Pseudo <
opName,
(outs),
- (ins SReg_32:$sdst, u16imm:$simm16),
+ !if(isSignExt,
+ (ins SReg_32:$sdst, s16imm:$simm16),
+ (ins SReg_32:$sdst, u16imm:$simm16)),
"$sdst, $simm16", []>,
SOPKInstTable<1, base_op>{
let Defs = [SCC];
@@ -524,7 +562,7 @@ class SOPK_SCC <string opName, string base_op = ""> : SOPK_Pseudo <
class SOPK_32TIE <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
opName,
(outs SReg_32:$sdst),
- (ins SReg_32:$src0, u16imm:$simm16),
+ (ins SReg_32:$src0, s16imm:$simm16),
"$sdst, $simm16",
pattern
>;
@@ -553,20 +591,20 @@ let isCompare = 1 in {
// [(set i1:$dst, (setcc i32:$src0, imm:$src1, SETEQ))]
// >;
-def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32">;
-def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32">;
-def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32">;
-def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32">;
-def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32">;
-def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32">;
+def S_CMPK_EQ_I32 : SOPK_SCC <"s_cmpk_eq_i32", "s_cmp_eq_i32", 1>;
+def S_CMPK_LG_I32 : SOPK_SCC <"s_cmpk_lg_i32", "s_cmp_lg_i32", 1>;
+def S_CMPK_GT_I32 : SOPK_SCC <"s_cmpk_gt_i32", "s_cmp_gt_i32", 1>;
+def S_CMPK_GE_I32 : SOPK_SCC <"s_cmpk_ge_i32", "s_cmp_ge_i32", 1>;
+def S_CMPK_LT_I32 : SOPK_SCC <"s_cmpk_lt_i32", "s_cmp_lt_i32", 1>;
+def S_CMPK_LE_I32 : SOPK_SCC <"s_cmpk_le_i32", "s_cmp_le_i32", 1>;
let SOPKZext = 1 in {
-def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32">;
-def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32">;
-def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32">;
-def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32">;
-def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32">;
-def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32">;
+def S_CMPK_EQ_U32 : SOPK_SCC <"s_cmpk_eq_u32", "s_cmp_eq_u32", 0>;
+def S_CMPK_LG_U32 : SOPK_SCC <"s_cmpk_lg_u32", "s_cmp_lg_u32", 0>;
+def S_CMPK_GT_U32 : SOPK_SCC <"s_cmpk_gt_u32", "s_cmp_gt_u32", 0>;
+def S_CMPK_GE_U32 : SOPK_SCC <"s_cmpk_ge_u32", "s_cmp_ge_u32", 0>;
+def S_CMPK_LT_U32 : SOPK_SCC <"s_cmpk_lt_u32", "s_cmp_lt_u32", 0>;
+def S_CMPK_LE_U32 : SOPK_SCC <"s_cmpk_le_u32", "s_cmp_le_u32", 0>;
} // End SOPKZext = 1
} // End isCompare = 1
@@ -578,7 +616,7 @@ let Defs = [SCC], isCommutable = 1, DisableEncoding = "$src0",
def S_CBRANCH_I_FORK : SOPK_Pseudo <
"s_cbranch_i_fork",
- (outs), (ins SReg_64:$sdst, u16imm:$simm16),
+ (outs), (ins SReg_64:$sdst, s16imm:$simm16),
"$sdst, $simm16"
>;
@@ -751,6 +789,14 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
let isReturn = 1;
}
+let SubtargetPredicate = isVI in {
+def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
+ let simm16 = 0;
+ let isBarrier = 1;
+ let isReturn = 1;
+}
+}
+
let isBranch = 1, SchedRW = [WriteBranch] in {
def S_BRANCH : SOPP <
0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
@@ -792,6 +838,25 @@ def S_CBRANCH_EXECNZ : SOPP <
>;
} // End Uses = [EXEC]
+def S_CBRANCH_CDBGSYS : SOPP <
+ 0x00000017, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys $simm16"
+>;
+
+def S_CBRANCH_CDBGSYS_AND_USER : SOPP <
+ 0x0000001A, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys_and_user $simm16"
+>;
+
+def S_CBRANCH_CDBGSYS_OR_USER : SOPP <
+ 0x00000019, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys_or_user $simm16"
+>;
+
+def S_CBRANCH_CDBGUSER : SOPP <
+ 0x00000018, (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbguser $simm16"
+>;
} // End isBranch = 1
} // End isTerminator = 1
@@ -806,9 +871,18 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
let isConvergent = 1;
}
+let SubtargetPredicate = isVI in {
+def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
+ let simm16 = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+}
+
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
// On SI the documentation says sleep for approximately 64 * low 2
// bits, consistent with the reported maximum of 448. On VI the
@@ -1207,6 +1281,10 @@ def S_BFE_U64_vi : SOP2_Real_vi <0x27, S_BFE_U64>;
def S_BFE_I64_vi : SOP2_Real_vi <0x28, S_BFE_I64>;
def S_CBRANCH_G_FORK_vi : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>;
def S_ABSDIFF_I32_vi : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>;
+def S_PACK_LL_B32_B16_vi : SOP2_Real_vi <0x32, S_PACK_LL_B32_B16>;
+def S_PACK_LH_B32_B16_vi : SOP2_Real_vi <0x33, S_PACK_LH_B32_B16>;
+def S_PACK_HH_B32_B16_vi : SOP2_Real_vi <0x34, S_PACK_HH_B32_B16>;
+def S_RFE_RESTORE_B64_vi : SOP2_Real_vi <0x2b, S_RFE_RESTORE_B64>;
def S_MOVK_I32_vi : SOPK_Real_vi <0x00, S_MOVK_I32>;
def S_CMOVK_I32_vi : SOPK_Real_vi <0x01, S_CMOVK_I32>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 9908fc0..92fb762 100644
--- a/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -16,7 +16,7 @@
using namespace llvm;
-/// \brief The target which suports all AMD GPUs. This will eventually
+/// \brief The target which supports all AMD GPUs. This will eventually
/// be deprecated and there will be a R600 target and a GCN target.
Target &llvm::getTheAMDGPUTarget() {
static Target TheAMDGPUTarget;
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index b6868de..03b11ae 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -65,5 +65,18 @@ const char* const IdSymbolic[] = {
};
} // namespace Hwreg
+
+namespace Swizzle {
+
+// This must be in sync with llvm::AMDGPU::Swizzle::Id enum members, see SIDefines.h.
+const char* const IdSymbolic[] = {
+ "QUAD_PERM",
+ "BITMASK_PERM",
+ "SWAP",
+ "REVERSE",
+ "BROADCAST",
+};
+
+} // namespace Swizzle
} // namespace AMDGPU
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
index b2dc2c0..ebb2be2 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -25,6 +25,12 @@ namespace Hwreg { // Symbolic names for the hwreg(...) syntax.
extern const char* const IdSymbolic[];
} // namespace Hwreg
+
+namespace Swizzle { // Symbolic names for the swizzle(...) syntax.
+
+extern const char* const IdSymbolic[];
+
+} // namespace Swizzle
} // namespace AMDGPU
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5f651d4..67ad904 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===//
+//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,32 +6,41 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+
#include "AMDGPUBaseInfo.h"
#include "AMDGPU.h"
#include "SIDefines.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <utility>
-#define GET_SUBTARGETINFO_ENUM
-#include "AMDGPUGenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
-
-#define GET_REGINFO_ENUM
-#include "AMDGPUGenRegisterInfo.inc"
-#undef GET_REGINFO_ENUM
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#define GET_INSTRINFO_NAMED_OPS
-#define GET_INSTRINFO_ENUM
#include "AMDGPUGenInstrInfo.inc"
#undef GET_INSTRINFO_NAMED_OPS
-#undef GET_INSTRINFO_ENUM
namespace {
@@ -56,11 +65,11 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
return (Src & getBitMask(Shift, Width)) >> Shift;
}
-/// \returns Vmcnt bit shift.
-unsigned getVmcntBitShift() { return 0; }
+/// \returns Vmcnt bit shift (lower bits).
+unsigned getVmcntBitShiftLo() { return 0; }
-/// \returns Vmcnt bit width.
-unsigned getVmcntBitWidth() { return 4; }
+/// \returns Vmcnt bit width (lower bits).
+unsigned getVmcntBitWidthLo() { return 4; }
/// \returns Expcnt bit shift.
unsigned getExpcntBitShift() { return 4; }
@@ -74,52 +83,241 @@ unsigned getLgkmcntBitShift() { return 8; }
/// \returns Lgkmcnt bit width.
unsigned getLgkmcntBitWidth() { return 4; }
-} // anonymous namespace
+/// \returns Vmcnt bit shift (higher bits).
+unsigned getVmcntBitShiftHi() { return 14; }
+
+/// \returns Vmcnt bit width (higher bits).
+unsigned getVmcntBitWidthHi() { return 2; }
+
+} // end anonymous namespace
namespace llvm {
+
+static cl::opt<bool> EnablePackedInlinableLiterals(
+ "enable-packed-inlinable-literals",
+ cl::desc("Enable packed inlinable literals (v2f16, v2i16)"),
+ cl::init(false));
+
namespace AMDGPU {
-IsaVersion getIsaVersion(const FeatureBitset &Features) {
+namespace IsaInfo {
+IsaVersion getIsaVersion(const FeatureBitset &Features) {
+ // SI.
+ if (Features.test(FeatureISAVersion6_0_0))
+ return {6, 0, 0};
+ if (Features.test(FeatureISAVersion6_0_1))
+ return {6, 0, 1};
+ // CI.
if (Features.test(FeatureISAVersion7_0_0))
return {7, 0, 0};
-
if (Features.test(FeatureISAVersion7_0_1))
return {7, 0, 1};
-
if (Features.test(FeatureISAVersion7_0_2))
return {7, 0, 2};
+ if (Features.test(FeatureISAVersion7_0_3))
+ return {7, 0, 3};
+ // VI.
if (Features.test(FeatureISAVersion8_0_0))
return {8, 0, 0};
-
if (Features.test(FeatureISAVersion8_0_1))
return {8, 0, 1};
-
if (Features.test(FeatureISAVersion8_0_2))
return {8, 0, 2};
-
if (Features.test(FeatureISAVersion8_0_3))
return {8, 0, 3};
-
if (Features.test(FeatureISAVersion8_0_4))
return {8, 0, 4};
-
if (Features.test(FeatureISAVersion8_1_0))
return {8, 1, 0};
- return {0, 0, 0};
+ // GFX9.
+ if (Features.test(FeatureISAVersion9_0_0))
+ return {9, 0, 0};
+ if (Features.test(FeatureISAVersion9_0_1))
+ return {9, 0, 1};
+ if (Features.test(FeatureISAVersion9_0_2))
+ return {9, 0, 2};
+ if (Features.test(FeatureISAVersion9_0_3))
+ return {9, 0, 3};
+
+ if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
+ return {0, 0, 0};
+ return {7, 0, 0};
+}
+
+unsigned getWavefrontSize(const FeatureBitset &Features) {
+ if (Features.test(FeatureWavefrontSize16))
+ return 16;
+ if (Features.test(FeatureWavefrontSize32))
+ return 32;
+
+ return 64;
+}
+
+unsigned getLocalMemorySize(const FeatureBitset &Features) {
+ if (Features.test(FeatureLocalMemorySize32768))
+ return 32768;
+ if (Features.test(FeatureLocalMemorySize65536))
+ return 65536;
+
+ return 0;
+}
+
+unsigned getEUsPerCU(const FeatureBitset &Features) {
+ return 4;
+}
+
+unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ if (!Features.test(FeatureGCN))
+ return 8;
+ unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+ if (N == 1)
+ return 40;
+ N = 40 / N;
+ return std::min(N, 16u);
+}
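
Worked example from the code above (assuming GCN with 64-wide wavefronts): a flat work group size of 256 gives getWavesPerWorkGroup = alignTo(256, 64) / 64 = 4 waves, so N = 40 / 4 = 10 and the result is min(10, 16) = 10 work groups per CU; a single-wave group (N == 1) instead takes the early return of 40.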
+
+unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
+ return getMaxWavesPerEU(Features) * getEUsPerCU(Features);
+}
+
+unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ return getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+}
+
+unsigned getMinWavesPerEU(const FeatureBitset &Features) {
+ return 1;
+}
+
+unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
+ if (!Features.test(FeatureGCN))
+ return 8;
+ // FIXME: Need to take scratch memory into account.
+ return 10;
}
+unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize),
+ getEUsPerCU(Features)) / getEUsPerCU(Features);
+}
+
+unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) {
+ return 1;
+}
+
+unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) {
+ return 2048;
+}
+
+unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize) {
+ return alignTo(FlatWorkGroupSize, getWavefrontSize(Features)) /
+ getWavefrontSize(Features);
+}
+
+unsigned getSGPRAllocGranule(const FeatureBitset &Features) {
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major >= 8)
+ return 16;
+ return 8;
+}
+
+unsigned getSGPREncodingGranule(const FeatureBitset &Features) {
+ return 8;
+}
+
+unsigned getTotalNumSGPRs(const FeatureBitset &Features) {
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major >= 8)
+ return 800;
+ return 512;
+}
+
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
+ if (Features.test(FeatureSGPRInitBug))
+ return FIXED_NUM_SGPRS_FOR_INIT_BUG;
+
+ IsaVersion Version = getIsaVersion(Features);
+ if (Version.Major >= 8)
+ return 102;
+ return 104;
+}
+
+unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+ assert(WavesPerEU != 0);
+
+ if (WavesPerEU >= getMaxWavesPerEU(Features))
+ return 0;
+ unsigned MinNumSGPRs =
+ alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1),
+ getSGPRAllocGranule(Features)) + 1;
+ return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
+}
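
Worked example: on VI (800 total SGPRs, allocation granule 16, 102 addressable), WavesPerEU = 8 gives alignDown(800 / 9, 16) + 1 = alignDown(88, 16) + 1 = 81, and min(81, 102) = 81 — that is, 81 is the smallest SGPR count whose granule-rounded allocation (96) limits an EU to 8 waves, since 800 / 96 = 8.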
+
+unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+ bool Addressable) {
+ assert(WavesPerEU != 0);
+
+ IsaVersion Version = getIsaVersion(Features);
+ unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU,
+ getSGPRAllocGranule(Features));
+ unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
+ if (Version.Major >= 8 && !Addressable)
+ AddressableNumSGPRs = 112;
+ return std::min(MaxNumSGPRs, AddressableNumSGPRs);
+}
+
+unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
+ return 4;
+}
+
+unsigned getVGPREncodingGranule(const FeatureBitset &Features) {
+ return getVGPRAllocGranule(Features);
+}
+
+unsigned getTotalNumVGPRs(const FeatureBitset &Features) {
+ return 256;
+}
+
+unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
+ return getTotalNumVGPRs(Features);
+}
+
+unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+ assert(WavesPerEU != 0);
+
+ if (WavesPerEU >= getMaxWavesPerEU(Features))
+ return 0;
+ unsigned MinNumVGPRs =
+ alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
+ getVGPRAllocGranule(Features)) + 1;
+ return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features));
+}
+
+unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+ assert(WavesPerEU != 0);
+
+ unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU,
+ getVGPRAllocGranule(Features));
+ unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features);
+ return std::min(MaxNumVGPRs, AddressableNumVGPRs);
+}
+
+} // end namespace IsaInfo
+
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
const FeatureBitset &Features) {
-
- IsaVersion ISA = getIsaVersion(Features);
+ IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features);
memset(&Header, 0, sizeof(Header));
Header.amd_kernel_code_version_major = 1;
- Header.amd_kernel_code_version_minor = 0;
+ Header.amd_kernel_code_version_minor = 1;
Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
Header.amd_machine_version_major = ISA.Major;
Header.amd_machine_version_minor = ISA.Minor;
@@ -127,6 +325,11 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.kernel_code_entry_byte_offset = sizeof(Header);
// wavefront_size is specified as a power of 2: 2^6 = 64 threads.
Header.wavefront_size = 6;
+
+ // If the code object does not support indirect functions, then the value must
+ // be 0xffffffff.
+ Header.call_convention = -1;
+
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
Header.kernarg_segment_alignment = 4;
@@ -134,43 +337,16 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
Header.private_segment_alignment = 4;
}
-MCSection *getHSATextSection(MCContext &Ctx) {
- return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_WRITE |
- ELF::SHF_EXECINSTR |
- ELF::SHF_AMDGPU_HSA_AGENT |
- ELF::SHF_AMDGPU_HSA_CODE);
+bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS) {
+ return GV->getType()->getAddressSpace() == AS.LOCAL_ADDRESS;
}
-MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) {
- return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_WRITE |
- ELF::SHF_AMDGPU_HSA_GLOBAL |
- ELF::SHF_AMDGPU_HSA_AGENT);
+bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS) {
+ return GV->getType()->getAddressSpace() == AS.GLOBAL_ADDRESS;
}
-MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) {
- return Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_WRITE |
- ELF::SHF_AMDGPU_HSA_GLOBAL);
-}
-
-MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
- return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY |
- ELF::SHF_AMDGPU_HSA_AGENT);
-}
-
-bool isGroupSegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}
-
-bool isGlobalSegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
-}
-
-bool isReadOnlySegment(const GlobalValue *GV) {
- return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS) {
+ return GV->getType()->getAddressSpace() == AS.CONSTANT_ADDRESS;
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -208,7 +384,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
return Default;
}
if (Strs.second.trim().getAsInteger(0, Ints.second)) {
- if (!OnlyFirstRequired || Strs.second.trim().size()) {
+ if (!OnlyFirstRequired || !Strs.second.trim().empty()) {
Ctx.emitError("can't parse second integer attribute " + Name);
return Default;
}
@@ -217,57 +393,84 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
return Ints;
}
-unsigned getWaitcntBitMask(IsaVersion Version) {
- unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth());
- unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
- unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
- return Vmcnt | Expcnt | Lgkmcnt;
-}
+unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
+ unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
+ if (Version.Major < 9)
+ return VmcntLo;
-unsigned getVmcntBitMask(IsaVersion Version) {
- return (1 << getVmcntBitWidth()) - 1;
+ unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo();
+ return VmcntLo | VmcntHi;
}
-unsigned getExpcntBitMask(IsaVersion Version) {
+unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) {
return (1 << getExpcntBitWidth()) - 1;
}
-unsigned getLgkmcntBitMask(IsaVersion Version) {
+unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) {
return (1 << getLgkmcntBitWidth()) - 1;
}
-unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) {
- return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
+ unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
+ unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
+ unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+ unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
+ if (Version.Major < 9)
+ return Waitcnt;
+
+ unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi());
+ return Waitcnt | VmcntHi;
+}
+
+unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+ unsigned VmcntLo =
+ unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
+ if (Version.Major < 9)
+ return VmcntLo;
+
+ unsigned VmcntHi =
+ unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
+ VmcntHi <<= getVmcntBitWidthLo();
+ return VmcntLo | VmcntHi;
}
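
A quick sanity check of the gfx9 split-field decode above, as a stand-alone sketch mirroring decodeVmcnt (0x4005 is just an example encoding):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Waitcnt = 0x4005;           // example gfx9 s_waitcnt encoding
      uint32_t Lo = Waitcnt & 0xF;         // Waitcnt[3:0]   -> 5
      uint32_t Hi = (Waitcnt >> 14) & 0x3; // Waitcnt[15:14] -> 1
      assert((Lo | (Hi << 4)) == 21);      // Vmcnt = 0b10101 = 21
      return 0;
    }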
-unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) {
+unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}
-unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) {
+unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}
-void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
Vmcnt = decodeVmcnt(Version, Waitcnt);
Expcnt = decodeExpcnt(Version, Waitcnt);
Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
}
-unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) {
- return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Vmcnt) {
+ Waitcnt =
+ packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
+ if (Version.Major < 9)
+ return Waitcnt;
+
+ Vmcnt >>= getVmcntBitWidthLo();
+ return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
}
-unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) {
+unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Expcnt) {
return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
}
-unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) {
+unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Lgkmcnt) {
return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
}
-unsigned encodeWaitcnt(IsaVersion Version,
+unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
unsigned Waitcnt = getWaitcntBitMask(Version);
Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
@@ -283,6 +486,7 @@ unsigned getInitialPSInputAddr(const Function &F) {
bool isShader(CallingConv::ID cc) {
switch(cc) {
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
@@ -296,6 +500,21 @@ bool isCompute(CallingConv::ID cc) {
return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
}
+bool isEntryFunctionCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_HS:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
}
@@ -308,6 +527,24 @@ bool isVI(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
}
+bool isGFX9(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+}
+
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
+ const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
+ const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+ return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
+ Reg == AMDGPU::SCC;
+}
+
+bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
+ for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) {
+ if (*R == Reg1) return true;
+ }
+ return false;
+}
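
For instance, VCC and VCC_LO intersect because VCC_LO is a sub-register of VCC, whereas VCC_LO and VCC_HI do not overlap each other.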
+
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
switch(Reg) {
@@ -327,13 +564,34 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
return Reg;
}
+unsigned mc2PseudoReg(unsigned Reg) {
+ switch (Reg) {
+ case AMDGPU::FLAT_SCR_ci:
+ case AMDGPU::FLAT_SCR_vi:
+ return FLAT_SCR;
+
+ case AMDGPU::FLAT_SCR_LO_ci:
+ case AMDGPU::FLAT_SCR_LO_vi:
+ return AMDGPU::FLAT_SCR_LO;
+
+ case AMDGPU::FLAT_SCR_HI_ci:
+ case AMDGPU::FLAT_SCR_HI_vi:
+ return AMDGPU::FLAT_SCR_HI;
+
+ default:
+ return Reg;
+ }
+}
+
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
OpType <= AMDGPU::OPERAND_SRC_LAST;
}
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
switch (OpType) {
case AMDGPU::OPERAND_REG_IMM_FP32:
@@ -342,6 +600,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
return true;
default:
return false;
@@ -349,6 +608,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
}
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
@@ -392,6 +652,7 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo) {
+ assert(OpNo < Desc.NumOperands);
unsigned RCID = Desc.OpInfo[OpNo].RegClass;
return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
}
@@ -440,7 +701,8 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
}
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
- assert(HasInv2Pi);
+ if (!HasInv2Pi)
+ return false;
if (Literal >= -16 && Literal <= 64)
return true;
@@ -457,5 +719,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
Val == 0x3118; // 1/2pi
}
-} // End namespace AMDGPU
-} // End namespace llvm
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
+ assert(HasInv2Pi);
+
+ if (!EnablePackedInlinableLiterals)
+ return false;
+
+ int16_t Lo16 = static_cast<int16_t>(Literal);
+ int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
+ return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
+}
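
For example, with packed literals enabled via the new EnablePackedInlinableLiterals flag, 0xC400C400 is inlinable: both halves are 0xC400 (-4.0 in IEEE half precision), which is in the 16-bit inline-constant set. By contrast, 0x00010002 is rejected because its halves differ, and 0x00410041 is rejected because 65 falls outside the inlinable integer range of -16..64.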
+
+bool isUniformMMO(const MachineMemOperand *MMO) {
+ const Value *Ptr = MMO->getValue();
+ // UndefValue means this is a load of a kernel input. These are uniform.
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, this mem operand holds a PseudoSourceValue such as
+ // the GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+ return true;
+
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
+ return I && I->getMetadata("amdgpu.uniform");
+}
+
+int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
+ if (isSI(ST) || isCI(ST))
+ return ByteOffset >> 2;
+
+ return ByteOffset;
+}
+
+bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
+ int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
+ return isSI(ST) || isCI(ST) ? isUInt<8>(EncodedOffset) :
+ isUInt<20>(EncodedOffset);
+}
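
Worked example: a byte offset of 1020 encodes as 1020 >> 2 = 255 on SI/CI, which fits the 8-bit field and is legal there; 1024 encodes as 256 and is rejected. On VI and later the offset stays in bytes, so 1024 easily fits the 20-bit field.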
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+const unsigned AMDGPUAS::MAX_COMMON_ADDRESS;
+const unsigned AMDGPUAS::GLOBAL_ADDRESS;
+const unsigned AMDGPUAS::LOCAL_ADDRESS;
+const unsigned AMDGPUAS::PARAM_D_ADDRESS;
+const unsigned AMDGPUAS::PARAM_I_ADDRESS;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_0;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_1;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_2;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_3;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_4;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_5;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_6;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_7;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_8;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_9;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_10;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_11;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_12;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_13;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_14;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_15;
+const unsigned AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
+namespace llvm {
+namespace AMDGPU {
+
+AMDGPUAS getAMDGPUAS(Triple T) {
+ auto Env = T.getEnvironmentName();
+ AMDGPUAS AS;
+ if (Env == "amdgiz" || Env == "amdgizcl") {
+ AS.FLAT_ADDRESS = 0;
+ AS.PRIVATE_ADDRESS = 5;
+ AS.REGION_ADDRESS = 4;
+ } else {
+ AS.FLAT_ADDRESS = 4;
+ AS.PRIVATE_ADDRESS = 0;
+ AS.REGION_ADDRESS = 5;
+ }
+ return AS;
+}
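
For example, a triple with the "amdgiz" environment (such as amdgcn--amdhsa-amdgiz) selects the generic-is-zero numbering, FLAT_ADDRESS = 0 and PRIVATE_ADDRESS = 5, while any other environment keeps the legacy layout with FLAT_ADDRESS = 4 and PRIVATE_ADDRESS = 0; callers should therefore query this mapping rather than hard-code address-space numbers.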
+
+AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
+ return getAMDGPUAS(M.getTargetTriple());
+}
+
+AMDGPUAS getAMDGPUAS(const Module &M) {
+ return getAMDGPUAS(Triple(M.getTargetTriple()));
+}
+} // namespace AMDGPU
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ea5fc36..936e492 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===//
+//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,52 +10,149 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
-#include "llvm/IR/CallingConv.h"
-
#include "SIDefines.h"
-
-#define GET_INSTRINFO_OPERAND_ENUM
-#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRINFO_OPERAND_ENUM
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
+#include <utility>
namespace llvm {
class FeatureBitset;
class Function;
class GlobalValue;
+class MachineMemOperand;
class MCContext;
-class MCInstrDesc;
class MCRegisterClass;
class MCRegisterInfo;
class MCSection;
class MCSubtargetInfo;
+class Triple;
namespace AMDGPU {
+namespace IsaInfo {
-LLVM_READONLY
-int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+enum {
+ // The closed Vulkan driver sets 96, which limits the wave count to 8 but
+ // doesn't spill SGPRs as much as when 80 is set.
+ FIXED_NUM_SGPRS_FOR_INIT_BUG = 96
+};
+/// \brief Instruction set architecture version.
struct IsaVersion {
unsigned Major;
unsigned Minor;
unsigned Stepping;
};
+/// \returns Isa version for given subtarget \p Features.
IsaVersion getIsaVersion(const FeatureBitset &Features);
-void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
- const FeatureBitset &Features);
-MCSection *getHSATextSection(MCContext &Ctx);
-MCSection *getHSADataGlobalAgentSection(MCContext &Ctx);
+/// \returns Wavefront size for given subtarget \p Features.
+unsigned getWavefrontSize(const FeatureBitset &Features);
+
+/// \returns Local memory size in bytes for given subtarget \p Features.
+unsigned getLocalMemorySize(const FeatureBitset &Features);
+
+/// \returns Number of execution units per compute unit for given subtarget \p
+/// Features.
+unsigned getEUsPerCU(const FeatureBitset &Features);
+
+/// \returns Maximum number of work groups per compute unit for given subtarget
+/// \p Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns Maximum number of waves per compute unit for given subtarget \p
+/// Features without any kind of limitation.
+unsigned getMaxWavesPerCU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per compute unit for given subtarget \p
+/// Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns Minimum number of waves per execution unit for given subtarget \p
+/// Features.
+unsigned getMinWavesPerEU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per execution unit for given subtarget \p
+/// Features without any kind of limitation.
+unsigned getMaxWavesPerEU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per execution unit for given subtarget \p
+/// Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns Minimum flat work group size for given subtarget \p Features.
+unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features);
+
+/// \returns Maximum flat work group size for given subtarget \p Features.
+unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features);
+
+/// \returns Number of waves per work group for given subtarget \p Features and
+/// limited by given \p FlatWorkGroupSize.
+unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+ unsigned FlatWorkGroupSize);
+
+/// \returns SGPR allocation granularity for given subtarget \p Features.
+unsigned getSGPRAllocGranule(const FeatureBitset &Features);
+
+/// \returns SGPR encoding granularity for given subtarget \p Features.
+unsigned getSGPREncodingGranule(const FeatureBitset &Features);
+
+/// \returns Total number of SGPRs for given subtarget \p Features.
+unsigned getTotalNumSGPRs(const FeatureBitset &Features);
+
+/// \returns Addressable number of SGPRs for given subtarget \p Features.
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features);
+
+/// \returns Minimum number of SGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+
+/// \returns Maximum number of SGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+ bool Addressable);
+
+/// \returns VGPR allocation granularity for given subtarget \p Features.
+unsigned getVGPRAllocGranule(const FeatureBitset &Features);
+
+/// \returns VGPR encoding granularity for given subtarget \p Features.
+unsigned getVGPREncodingGranule(const FeatureBitset &Features);
+
+/// \returns Total number of VGPRs for given subtarget \p Features.
+unsigned getTotalNumVGPRs(const FeatureBitset &Features);
+
+/// \returns Addressable number of VGPRs for given subtarget \p Features.
+unsigned getAddressableNumVGPRs(const FeatureBitset &Features);
-MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
+/// \returns Minimum number of VGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
-MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
+/// \returns Maximum number of VGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
-bool isGroupSegment(const GlobalValue *GV);
-bool isGlobalSegment(const GlobalValue *GV);
-bool isReadOnlySegment(const GlobalValue *GV);
+} // end namespace IsaInfo
+
+LLVM_READONLY
+int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+
+void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+ const FeatureBitset &Features);
+
+bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS);
+bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS);
+bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS);
/// \returns True if constants should be emitted to .text section for given
/// target triple \p TT, false otherwise.
@@ -83,73 +180,108 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
std::pair<int, int> Default,
bool OnlyFirstRequired = false);
-/// \returns Waitcnt bit mask for given isa \p Version.
-unsigned getWaitcntBitMask(IsaVersion Version);
-
/// \returns Vmcnt bit mask for given isa \p Version.
-unsigned getVmcntBitMask(IsaVersion Version);
+unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version);
/// \returns Expcnt bit mask for given isa \p Version.
-unsigned getExpcntBitMask(IsaVersion Version);
+unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version);
/// \returns Lgkmcnt bit mask for given isa \p Version.
-unsigned getLgkmcntBitMask(IsaVersion Version);
+unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version);
+
+/// \returns Waitcnt bit mask for given isa \p Version.
+unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version);
/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
/// \p Lgkmcnt respectively.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
-/// \p Vmcnt = \p Waitcnt[3:0]
+/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only)
+/// \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14] (gfx9+ only)
/// \p Expcnt = \p Waitcnt[6:4]
/// \p Lgkmcnt = \p Waitcnt[11:8]
-void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
-unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt);
+unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Vmcnt);
/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
-unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt);
+unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Expcnt);
/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
-unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt);
+unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+ unsigned Lgkmcnt);
/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
/// \p Version.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
-/// Waitcnt[3:0] = \p Vmcnt
-/// Waitcnt[6:4] = \p Expcnt
-/// Waitcnt[11:8] = \p Lgkmcnt
+/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only)
+/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only)
+/// Waitcnt[6:4] = \p Expcnt
+/// Waitcnt[11:8] = \p Lgkmcnt
+/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only)
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
/// isa \p Version.
-unsigned encodeWaitcnt(IsaVersion Version,
+unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
unsigned getInitialPSInputAddr(const Function &F);
-bool isShader(CallingConv::ID cc);
-bool isCompute(CallingConv::ID cc);
+LLVM_READNONE
+bool isShader(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isCompute(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isEntryFunctionCC(CallingConv::ID CC);
+
+// FIXME: Remove this when calling conventions cleaned up
+LLVM_READNONE
+inline bool isKernel(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ default:
+ return false;
+ }
+}
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
+bool isGFX9(const MCSubtargetInfo &STI);
+
+/// \brief Is \p Reg a scalar register?
+bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
+
+/// \brief Do registers \p Reg0 and \p Reg1 overlap (alias)?
+bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
+/// \brief Convert hardware register \p Reg to a pseudo register
+LLVM_READNONE
+unsigned mc2PseudoReg(unsigned Reg);
+
/// \brief Can this operand also contain immediate values?
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
@@ -188,6 +320,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
return 2;
default:
@@ -210,7 +344,21 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
LLVM_READNONE
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
+LLVM_READNONE
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+
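
// Worked illustration of why these predicates matter for operand
// legalization: a value in the ISA's inline-constant table costs no literal
// dword. The constant set here is an assumption taken from the ISA docs, not
// from this header; FloatToBits is from llvm/Support/MathExtras.h.
static inline void inlineLiteralExample() {
  // 1.0f is an inline constant; an arbitrary value like 3.25f is not and
  // would need a literal dword.
  assert(isInlinableLiteral32((int32_t)llvm::FloatToBits(1.0f),
                              /*HasInv2Pi=*/true));
  assert(!isInlinableLiteral32((int32_t)llvm::FloatToBits(3.25f),
                               /*HasInv2Pi=*/true));
}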
+bool isUniformMMO(const MachineMemOperand *MMO);
+
+/// \returns The encoding that will be used for \p ByteOffset in the SMRD
+/// offset field.
+int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+
+/// \returns true if this offset is small enough to fit in the SMRD
+/// offset field. \p ByteOffset should be the offset in bytes and
+/// not the encoded offset.
+bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+
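// Sketch of the usual rule, stated as an assumption rather than as this
// file's implementation: SI/CI encode the SMRD offset in dwords in an 8-bit
// field, while the GCN3 (VI+) encoding takes a byte offset in a 20-bit field.
static inline bool legalSMRDImmOffsetSketch(bool IsGCN3Encoding,
                                            int64_t ByteOffset) {
  int64_t Encoded = IsGCN3Encoding ? ByteOffset : ByteOffset >> 2;
  return IsGCN3Encoding ? llvm::isUInt<20>(Encoded)
                        : llvm::isUInt<8>(Encoded);
}
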
} // end namespace AMDGPU
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index c55eaab..991408c 100644
--- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -87,7 +87,7 @@ COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE
// TODO: cdbg_user
COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
COMPPGM2(user_sgpr_count, compute_pgm_rsrc2_user_sgpr, USER_SGPR),
-// TODO: enable_trap_handler
+COMPPGM2(enable_trap_handler, compute_pgm_rsrc2_trap_handler, TRAP_HANDLER),
COMPPGM2(enable_sgpr_workgroup_id_x, compute_pgm_rsrc2_tgid_x_en, TGID_X_EN),
COMPPGM2(enable_sgpr_workgroup_id_y, compute_pgm_rsrc2_tgid_y_en, TGID_Y_EN),
COMPPGM2(enable_sgpr_workgroup_id_z, compute_pgm_rsrc2_tgid_z_en, TGID_Z_EN),
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 8cae83c..96b33c3 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -23,18 +23,27 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> vdst;
-
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = op;
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{31-25} = 0x3f; // encoding
+}
+
+class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+ bits<8> vdst;
+
let Inst{8-0} = 0xf9; // sdwa
let Inst{16-9} = op;
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f; // encoding
}
-class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
InstSI <P.Outs32, P.Ins32, "", pattern>,
VOP <opName>,
- SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
- MnemonicAlias<opName#"_e32", opName> {
+ SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>,
+ MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -75,6 +84,8 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
}
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -83,10 +94,17 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
}
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
- list<dag> ret = !if(P.HasModifiers,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
- i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
- [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
+ list<dag> ret =
+ !if(P.HasModifiers,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+ i32:$src0_modifiers,
+ i1:$clamp, i32:$omod))))],
+ !if(P.HasOMod,
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
+ i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]
+ )
+ );
}
multiclass VOP1Inst <string opName, VOPProfile P,
@@ -96,6 +114,23 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
}
+// Special profile for instructions which have clamp
+// and output modifiers (but have no input modifiers)
+class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
+ VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+
+ let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod);
+ let Asm64 = "$vdst, $src0$clamp$omod";
+
+ let HasModifiers = 0;
+ let HasClamp = 1;
+ let HasOMod = 1;
+}
+
+def VOP1_F64_I32 : VOPProfileI2F <f64, i32>;
+def VOP1_F32_I32 : VOPProfileI2F <f32, i32>;
+def VOP1_F16_I16 : VOPProfileI2F <f16, i16>;
+
//===----------------------------------------------------------------------===//
// VOP1 Instructions
//===----------------------------------------------------------------------===//
@@ -142,24 +177,24 @@ def V_READFIRSTLANE_B32 :
let SchedRW = [WriteQuarterRate32] in {
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
-defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
-defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
-defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
+defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
-defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
-defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
-defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP_F32_I32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>;
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
-defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
-defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
-defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
-defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
+defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
+defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
+defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
+defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
-defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
} // End SchedRW = [WriteQuarterRate32]
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
@@ -217,6 +252,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> {
let Src0RC64 = VRegSrc_32;
let HasExt = 0;
+ let HasSDWA9 = 0;
}
// Special case because there are no true output operands. Hack vdst
@@ -232,16 +268,19 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
- let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+
+ let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel);
let Asm32 = getAsm32<1, 1>.ret;
- let Asm64 = getAsm64<1, 1, 0>.ret;
+ let Asm64 = getAsm64<1, 1, 0, 1>.ret;
let AsmDPP = getAsmDPP<1, 1, 0>.ret;
- let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
+ let AsmSDWA = getAsmSDWA<1, 1>.ret;
+ let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret;
let HasExt = 0;
+ let HasSDWA9 = 0;
let HasDst = 0;
let EmitDst = 1; // force vdst emission
}
@@ -258,11 +297,14 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
} // End Uses = [M0, EXEC]
+let SchedRW = [WriteQuarterRate32] in {
+defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
+}
+
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
let SchedRW = [WriteQuarterRate32] in {
-defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
@@ -295,10 +337,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
} // End SubtargetPredicate = isCIVI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
-defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
-defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
+defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
@@ -318,7 +360,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
}
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
def : Pat<
(f32 (f16_to_fp i16:$src)),
@@ -326,12 +368,31 @@ def : Pat<
>;
def : Pat<
- (i16 (fp_to_f16 f32:$src)),
+ (i16 (AMDGPUfp_to_f16 f32:$src)),
(V_CVT_F16_F32_e32 $src)
>;
}
+def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+ let Outs32 = (outs VGPR_32:$vdst, VGPR_32:$vdst1);
+ let Ins32 = (ins VGPR_32:$src0, VGPR_32:$src1);
+ let Outs64 = Outs32;
+ let Asm32 = " $vdst, $src0";
+ let Asm64 = "";
+ let Ins64 = (ins);
+}
+
+let SubtargetPredicate = isGFX9 in {
+ let Constraints = "$vdst = $src1, $vdst1 = $src0",
+ DisableEncoding="$vdst1,$src1",
+ SchedRW = [Write64Bit, Write64Bit] in {
+// Never VOP3. Takes as long as 2 v_mov_b32s
+def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
+}
+
+} // End SubtargetPredicate = isGFX9
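
// Illustration (assembly sketch; treat the exact syntax as an assumption):
// without v_swap_b32 an exchange needs a scratch VGPR and three moves,
//   v_mov_b32 v3, v1
//   v_mov_b32 v1, v2
//   v_mov_b32 v2, v3
// while gfx9 does it in one instruction that costs two moves' worth of time,
//   v_swap_b32 v1, v2
// which is why SchedRW above is the pair [Write64Bit, Write64Bit].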
+
//===----------------------------------------------------------------------===//
// Target
//===----------------------------------------------------------------------===//
@@ -453,6 +514,14 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
let Inst{31-25} = 0x3f; //encoding
}
+multiclass VOP1Only_Real_vi <bits<10> op> {
+ let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+ def _vi :
+ VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
+ }
+}
+
multiclass VOP1_Real_vi <bits<10> op> {
let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
def _e32_vi :
@@ -467,6 +536,10 @@ multiclass VOP1_Real_vi <bits<10> op> {
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
// For now, DPP is left for asm/dasm only
// TODO: add corresponding pseudo
def _dpp : VOP1_DPP<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")>;
@@ -480,6 +553,7 @@ defm V_CVT_F32_I32 : VOP1_Real_vi <0x5>;
defm V_CVT_F32_U32 : VOP1_Real_vi <0x6>;
defm V_CVT_U32_F32 : VOP1_Real_vi <0x7>;
defm V_CVT_I32_F32 : VOP1_Real_vi <0x8>;
+defm V_MOV_FED_B32 : VOP1_Real_vi <0x9>;
defm V_CVT_F16_F32 : VOP1_Real_vi <0xa>;
defm V_CVT_F32_F16 : VOP1_Real_vi <0xb>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_vi <0xc>;
@@ -547,7 +621,7 @@ defm V_RNDNE_F16 : VOP1_Real_vi <0x47>;
defm V_FRACT_F16 : VOP1_Real_vi <0x48>;
defm V_SIN_F16 : VOP1_Real_vi <0x49>;
defm V_COS_F16 : VOP1_Real_vi <0x4a>;
-
+defm V_SWAP_B32 : VOP1Only_Real_vi <0x51>;
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 00e5ab3..d5acb49 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -40,12 +40,24 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 {
class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
bits<8> vdst;
bits<8> src1;
-
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
+ let Inst{30-25} = op;
+ let Inst{31} = 0x0; // encoding
+}
+
+class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
+ bits<8> vdst;
+ bits<9> src1;
+
let Inst{8-0} = 0xf9; // sdwa
let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
let Inst{30-25} = op;
let Inst{31} = 0x0; // encoding
+ let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
}
class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
@@ -93,6 +105,8 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
}
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -103,7 +117,10 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (node (P.Src0VT
+ !if(P.HasOMod,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]);
}
@@ -119,11 +136,9 @@ multiclass VOP2Inst <string opName,
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
- Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
}
-// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst
multiclass VOP2bInst <string opName,
VOPProfile P,
SDPatternOperator node = null_frag,
@@ -134,10 +149,12 @@ multiclass VOP2bInst <string opName,
let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
-
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>,
- Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)>;
+
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
}
+
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
}
@@ -154,6 +171,7 @@ multiclass VOP2eInst <string opName,
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
}
+
def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
}
@@ -162,8 +180,11 @@ multiclass VOP2eInst <string opName,
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
- field string Asm32 = "$vdst, $src0, $src1, $imm";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $src1, $imm";
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
@@ -172,45 +193,55 @@ def VOP_MADAK_F32 : VOP_MADAK <f32>;
class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
- field string Asm32 = "$vdst, $src0, $imm, $src1";
field bit HasExt = 0;
+
+ // Hack to stop printing _e64
+ let DstRC = RegisterOperand<VGPR_32>;
+ field string Asm32 = " $vdst, $src0, $imm, $src1";
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
def VOP_MADMK_F32 : VOP_MADMK <f32>;
+// FIXME: Remove src2_modifiers. It isn't used, so it wastes memory and
+// processing time, but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+ HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
VGPR_32:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
VGPR_32:$src2, // stub argument
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
let Asm32 = getAsm32<1, 2, vt>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
- let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
+ let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
+ let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
let HasSrc2 = 0;
let HasSrc2Mods = 0;
let HasExt = 1;
+ let HasSDWA9 = 0;
}
def VOP_MAC_F16 : VOP_MAC <f16> {
// FIXME: Move the 'Asm64' definition to VOP_MAC and use 'vt'. Currently that
// gives a 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret;
}
def VOP_MAC_F32 : VOP_MAC <f32> {
// FIXME: Move the 'Asm64' definition to VOP_MAC and use 'vt'. Currently that
// gives a 'not a string initializer' error.
- let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+ let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret;
}
// Write out to vcc or arbitrary SGPR.
@@ -218,6 +249,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
let Asm32 = "$vdst, vcc, $src0, $src1";
let Asm64 = "$vdst, $sdst, $src0, $src1";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -235,6 +267,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
+ let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let Outs32 = (outs DstRC:$vdst);
let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
@@ -243,9 +276,10 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
// implicit VCC use.
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
- let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0,
- Src1Mod:$src1_modifiers, Src1SDWA:$src1,
- clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
+ let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
+ Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
+ clampmod:$clamp, omod:$omod,
+ dst_sel:$dst_sel, dst_unused:$dst_unused,
src0_sel:$src0_sel, src1_sel:$src1_sel);
let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
@@ -253,6 +287,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let HasExt = 1;
+ let HasSDWA9 = 1;
}
// Read in from vcc or arbitrary SGPR
@@ -275,15 +310,19 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
+ let HasExt = 0;
+ let HasSDWA9 = 0;
}
def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let Outs32 = (outs VGPR_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
+ let HasExt = 0;
+ let HasSDWA9 = 0;
}
//===----------------------------------------------------------------------===//
@@ -293,7 +332,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
let SubtargetPredicate = isGCN in {
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
-def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>;
+def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;
let isCommutable = 1 in {
defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
@@ -323,7 +362,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2",
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
-def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>;
+def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
@@ -346,20 +385,29 @@ def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
} // End isConvergent = 1
-defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
-defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32>;
-defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
-defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
-defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
+defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
+defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
} // End SubtargetPredicate = isGCN
+def : Pat<
+ (AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
+ (V_ADDC_U32_e64 $src0, $src1, $src2)
+>;
+
+def : Pat<
+ (AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
+ (V_SUBB_U32_e64 $src0, $src1, $src2)
+>;
// These instructions only exist on SI and CI
let SubtargetPredicate = isSICI in {
@@ -376,9 +424,9 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
} // End let SubtargetPredicate = SICI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
-def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>;
+def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
@@ -389,7 +437,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
-def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>;
+def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
@@ -407,7 +455,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
}
} // End isCommutable = 1
-} // End SubtargetPredicate = isVI
+} // End SubtargetPredicate = Has16BitInsts
// Note: 16-bit instructions produce a 0 result in the high 16-bits.
multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
@@ -457,7 +505,7 @@ class ZExt_i16_i1_Pat <SDNode ext> : Pat <
(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
>;
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
@@ -494,7 +542,15 @@ def : Pat <
(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
>;
-} // End Predicates = [isVI]
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+ (add i16:$src0, (i16 NegSubInlineConst16:$src1)),
+ (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
+>;
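+
+// Worked case for the pattern above, assuming the usual GCN integer inline
+// range of -16..64: the DAG turns (sub x, 64) into (add x, -64); 64 encodes
+// inline but -64 would need a literal dword, so matching the add back to
+// v_sub_u16 with the original constant keeps the operand inline.
+// NegSubInlineConst16 is what restricts the match to such constants.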
+
+} // End Predicates = [Has16BitInsts]
//===----------------------------------------------------------------------===//
// SI
@@ -566,7 +622,10 @@ defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>;
defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;
defm V_READLANE_B32 : VOP2_Real_si <0x01>;
+
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
+}
defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>;
defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>;
@@ -635,6 +694,17 @@ multiclass VOP2_Real_e64_vi <bits<10> op> {
VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
+multiclass VOP2_Real_e64only_vi <bits<10> op> {
+ def _e64_vi :
+ VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
+ // Hack to stop printing _e64
+ VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
+ let OutOperandList = (outs VGPR_32:$vdst);
+ let AsmString = ps.Mnemonic # " " # ps.AsmOperands;
+ }
+}
+
multiclass Base_VOP2be_Real_e32e64_vi <bits<6> op> : VOP2_Real_e32_vi<op> {
def _e64_vi :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
@@ -646,22 +716,28 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
-
+
multiclass VOP2_SDWA_Real <bits<6> op> {
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
+multiclass VOP2_SDWA9_Real <bits<6> op> {
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+}
+
multiclass VOP2be_Real_e32e64_vi <bits<6> op> :
- Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ Base_VOP2be_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
// For now, DPP is left for asm/dasm only
// TODO: add corresponding pseudo
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
}
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
- Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op> {
+ Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
// For now, DPP is left for asm/dasm only
// TODO: add corresponding pseudo
def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
@@ -702,17 +778,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>;
defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;
-defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>;
-defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>;
-defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>;
-defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>;
-defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>;
-defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>;
-defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>;
-defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>;
-defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>;
-defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>;
+defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
+defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>;
+defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>;
+defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>;
+defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>;
+defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>;
+defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>;
+defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>;
+defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>;
+defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>;
defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>;
defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>;
@@ -740,9 +816,11 @@ let SubtargetPredicate = isVI in {
// Aliases to simplify matching of floating-point instructions that
// are VOP2 on SI and VOP3 on VI.
-class SI2_VI3Alias <string name, Instruction inst> : InstAlias <
+class SI2_VI3Alias <string name, VOP3_Real inst> : InstAlias <
name#" $dst, $src0, $src1",
- (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0)
+ !if(inst.Pfl.HasOMod,
+ (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0),
+ (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0))
>, PredicateControl {
let UseInstAsmMatchConverter = 0;
let AsmVariantName = AMDGPUAsmVariants.VOP3;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c2a4d4b..92ed070 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -12,17 +12,41 @@
//===----------------------------------------------------------------------===//
class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
+ dag src0 = !if(P.HasOMod,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp));
+
list<dag> ret3 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (node (P.Src0VT src0),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (node (P.Src0VT src0),
(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))];
+ (node (P.Src0VT src0)))];
+
+ list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+ !if(!eq(P.NumSrcArgs, 2), ret2,
+ ret1));
+}
+
+class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
+ list<dag> ret3 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
+ (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+ list<dag> ret2 = [(set P.DstVT:$vdst,
+ (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
+ (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+ list<dag> ret1 = [(set P.DstVT:$vdst,
+ (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -72,6 +96,7 @@ class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
// v_div_scale_{f32|f64} do not support input modifiers.
let HasModifiers = 0;
+ let HasOMod = 0;
let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
}
@@ -86,6 +111,14 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
let DstRC = RegisterOperand<VReg_64>;
}
+def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
+ // FIXME: Hack to stop printing _e64
+ let DstRC = RegisterOperand<VReg_64>;
+
+ let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+ let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
+}
+
//===----------------------------------------------------------------------===//
// VOP3 Instructions
//===----------------------------------------------------------------------===//
@@ -144,8 +177,8 @@ def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>,
def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
-def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbit>;
+def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
@@ -181,7 +214,10 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
}
def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
+
+let Constraints = "@earlyclobber $vdst" in {
def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_mqsad_pk_u16_u8>;
+} // End Constraints = "@earlyclobber $vdst"
def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
let SchedRW = [WriteDouble];
@@ -204,25 +240,25 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>>;
let SubtargetPredicate = isCIVI in {
-def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile<VOP_I32_I32_I32>>;
+let Constraints = "@earlyclobber $vdst" in {
def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64>, int_amdgcn_qsad_pk_u16_u8>;
def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
+} // End Constraints = "@earlyclobber $vdst"
let isCommutable = 1 in {
-def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
-
-// XXX - Does this set VCC?
-def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
+def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
} // End isCommutable = 1
} // End SubtargetPredicate = isCIVI
-let SubtargetPredicate = isVI in {
+let SubtargetPredicate = Has16BitInsts in {
+
+def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
let isCommutable = 1 in {
-def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup>;
def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma>;
def V_INTERP_P1LL_F16 : VOP3Inst <"v_interp_p1ll_f16", VOP3_Profile<VOP_F32_F32_F16>>;
def V_INTERP_P1LV_F16 : VOP3Inst <"v_interp_p1lv_f16", VOP3_Profile<VOP_F32_F32_F16_F16>>;
@@ -233,13 +269,16 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
} // End isCommutable = 1
+} // End SubtargetPredicate = Has16BitInsts
+let SubtargetPredicate = isVI in {
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
} // End SubtargetPredicate = isVI
-let Predicates = [isVI] in {
+let Predicates = [Has16BitInsts] in {
-multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst, SDPatternOperator op3> {
+multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst, SDPatternOperator op3> {
def : Pat<
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst i16:$src0, i16:$src1, i16:$src2)
@@ -258,10 +297,34 @@ def : Pat<
>;
}
-defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
-defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
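+
+// Informally, the first pattern in Ternary_i16_Pats<mul, add, V_MAD_U16, zext>
+// folds a 16-bit multiply-add:
+//   (add (mul i16:$a, i16:$b), i16:$c)  -->  V_MAD_U16 $a, $b, $c
+// and (an assumption from the multiclass shape, since the hunk truncates it)
+// the op3 parameter matches the zext/sext-widened uses of the same expression.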
+
+} // End Predicates = [Has16BitInsts]
+
+let SubtargetPredicate = isGFX9 in {
+def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
+def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
-} // End Predicates = [isVI]
+def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
+def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
+def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
+def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
+
+def V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmin3>;
+def V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmin3>;
+def V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumin3>;
+
+def V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmax3>;
+def V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmax3>;
+def V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumax3>;
+} // End SubtargetPredicate = isGFX9
//===----------------------------------------------------------------------===//
@@ -351,11 +414,18 @@ multiclass VOP3_Real_ci<bits<9> op> {
}
}
-defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>;
+multiclass VOP3be_Real_ci<bits<9> op> {
+ def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+ VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [isCIOnly];
+ let DecoderNamespace = "CI";
+ }
+}
+
defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>;
-defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x174>;
-defm V_MAD_U64_U32 : VOP3_Real_ci <0x176>;
-defm V_MAD_I64_I32 : VOP3_Real_ci <0x177>;
+defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>;
+defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>;
+defm V_MAD_I64_I32 : VOP3be_Real_ci <0x177>;
//===----------------------------------------------------------------------===//
// VI
@@ -375,9 +445,8 @@ multiclass VOP3be_Real_vi<bits<10> op> {
} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
-defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>;
-defm V_MAD_U64_U32 : VOP3_Real_vi <0x176>;
-defm V_MAD_I64_I32 : VOP3_Real_vi <0x177>;
+defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;
+defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>;
defm V_MAD_LEGACY_F32 : VOP3_Real_vi <0x1c0>;
defm V_MAD_F32 : VOP3_Real_vi <0x1c1>;
@@ -424,6 +493,8 @@ defm V_MAD_F16 : VOP3_Real_vi <0x1ea>;
defm V_MAD_U16 : VOP3_Real_vi <0x1eb>;
defm V_MAD_I16 : VOP3_Real_vi <0x1ec>;
+defm V_PERM_B32 : VOP3_Real_vi <0x1ed>;
+
defm V_FMA_F16 : VOP3_Real_vi <0x1ee>;
defm V_DIV_FIXUP_F16 : VOP3_Real_vi <0x1ef>;
@@ -449,3 +520,25 @@ defm V_LSHLREV_B64 : VOP3_Real_vi <0x28f>;
defm V_LSHRREV_B64 : VOP3_Real_vi <0x290>;
defm V_ASHRREV_I64 : VOP3_Real_vi <0x291>;
defm V_TRIG_PREOP_F64 : VOP3_Real_vi <0x292>;
+
+defm V_LSHL_ADD_U32 : VOP3_Real_vi <0x1fd>;
+defm V_ADD_LSHL_U32 : VOP3_Real_vi <0x1fe>;
+defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>;
+defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>;
+defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
+defm V_OR3_B32 : VOP3_Real_vi <0x202>;
+defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
+
+defm V_XAD_U32 : VOP3_Real_vi <0x1f3>;
+
+defm V_MIN3_F16 : VOP3_Real_vi <0x1f4>;
+defm V_MIN3_I16 : VOP3_Real_vi <0x1f5>;
+defm V_MIN3_U16 : VOP3_Real_vi <0x1f6>;
+
+defm V_MAX3_F16 : VOP3_Real_vi <0x1f7>;
+defm V_MAX3_I16 : VOP3_Real_vi <0x1f8>;
+defm V_MAX3_U16 : VOP3_Real_vi <0x1f9>;
+
+defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
+defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
+defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
new file mode 100644
index 0000000..3becf75
--- /dev/null
+++ b/contrib/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -0,0 +1,102 @@
+//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP3P Classes
+//===----------------------------------------------------------------------===//
+
+class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+ VOP3P_Pseudo<OpName, P,
+ !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
+>;
+
+// Non-packed instructions that use the VOP3P encoding.
+// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
+class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+ VOP3P_Pseudo<OpName, P> {
+ let InOperandList =
+ (ins
+ FP32InputMods:$src0_modifiers, VCSrc_f32:$src0,
+ FP32InputMods:$src1_modifiers, VCSrc_f32:$src1,
+ FP32InputMods:$src2_modifiers, VCSrc_f32:$src2,
+ clampmod:$clamp,
+ op_sel:$op_sel,
+ op_sel_hi:$op_sel_hi);
+ let AsmOperands =
+ " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
+}
+
+let isCommutable = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
+def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+
+def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
+def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+
+def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+
+def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+}
+
+def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+
+def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
+def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
+def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// XXX - Commutable?
+// These are VOP3a-like opcodes which accept no omod.
+// Size of src arguments (16/32) is controlled by op_sel.
+// For 16-bit src arguments, their location (hi/lo) is controlled by op_sel_hi.
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_V2F16_V2F16_V2F16>>;
+def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
+def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>;
+
+
+multiclass VOP3P_Real_vi<bits<10> op> {
+ def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
+ VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+ let AssemblerPredicates = [HasVOP3PInsts];
+ let DecoderNamespace = "VI";
+ }
+}
+
+defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;
+defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>;
+defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
+defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>;
+defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>;
+defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>;
+
+defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>;
+defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>;
+defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>;
+defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>;
+defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>;
+defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>;
+defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
+defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
+defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+
+defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 16a456d..b636fc9 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -34,6 +34,17 @@ class VOPC_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
let Inst{44-43} = SDWA.UNUSED_PRESERVE;
}
+class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P> {
+ bits<9> src1;
+
+ let Inst{8-0} = 0xf9; // sdwa
+ let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
+ let Inst{24-17} = op;
+ let Inst{31-25} = 0x3e; // encoding
+ let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
+}
+
+
//===----------------------------------------------------------------------===//
// VOPC classes
//===----------------------------------------------------------------------===//
@@ -93,6 +104,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
}
class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -135,6 +148,19 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> :
let SubtargetPredicate = AssemblerPredicate;
}
+class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
+ list<dag> ret = !if(P.HasModifiers,
+ [(set i1:$sdst,
+ (setcc (P.Src0VT
+ !if(P.HasOMod,
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
+ (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))),
+ (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
+ cond))],
+ [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]);
+}
+
+
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
PatLeaf cond = COND_NULL,
@@ -150,14 +176,7 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
- def _e64 : VOP3_Pseudo<opName, P,
- !if(P.HasModifiers,
- [(set i1:$sdst,
- (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers,
- i1:$clamp, i32:$omod)),
- (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)),
- cond))],
- [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>,
+ def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
@@ -165,13 +184,11 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
- def _sdwa : VOPC_SDWA_Pseudo <opName, P>,
- Commutable_REV<revOp#"_sdwa", !eq(revOp, opName)> {
+ def _sdwa : VOPC_SDWA_Pseudo <opName, P> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
let isConvergent = DefExec;
let isCompare = 1;
- let isCommutable = 1;
}
}
@@ -517,9 +534,11 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
let Asm64 = "$sdst, $src0_modifiers, $src1";
+
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel);
+
let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel";
let HasSrc1Mods = 0;
let HasClamp = 0;
@@ -563,7 +582,7 @@ multiclass VOPC_CLASS_F16 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
multiclass VOPCX_CLASS_F16 <string opName> :
- VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+ VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
multiclass VOPC_CLASS_F32 <string opName> :
VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
@@ -621,7 +640,7 @@ class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat <
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
(inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- DSTCLAMP.NONE, DSTOMOD.NONE)
+ DSTCLAMP.NONE)
>;
def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
@@ -920,6 +939,10 @@ multiclass VOPC_Real_vi <bits<10> op> {
VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+
def : VOPCInstAlias <!cast<VOP3_Pseudo>(NAME#"_e64"),
!cast<Instruction>(NAME#"_e32_vi")> {
let AssemblerPredicate = isVI;
diff --git a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 5f72f97..b47538b 100644
--- a/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/contrib/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -51,12 +51,8 @@ class VOP3Common <dag outs, dag ins, string asm = "",
let VOP3 = 1;
- let AsmMatchConverter =
- !if(!eq(VOP3Only,1),
- "cvtVOP3",
- !if(!eq(HasMods,1), "cvtVOP3_2_mod", ""));
-
let AsmVariantName = AMDGPUAsmVariants.VOP3;
+ let AsmMatchConverter = !if(!eq(HasMods,1), "cvtVOP3", "");
let isCodeGenOnly = 0;
@@ -68,8 +64,9 @@ class VOP3Common <dag outs, dag ins, string asm = "",
let hasPostISelHook = 1;
}
-class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> :
- InstSI <P.Outs64, P.Ins64, "", pattern>,
+class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
+ bit VOP3Only = 0, bit isVOP3P = 0> :
+ InstSI <P.Outs64, !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64), "", pattern>,
VOP <opName>,
SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
MnemonicAlias<opName#"_e64", opName> {
@@ -79,7 +76,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On
let UseNamedOperandTable = 1;
string Mnemonic = opName;
- string AsmOperands = P.Asm64;
+ string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64);
let Size = 8;
let mayLoad = 0;
@@ -100,23 +97,32 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On
let VOP3 = 1;
let VALU = 1;
+ let FPClamp = P.HasFPClamp;
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.VOP3;
let AsmMatchConverter =
- !if(!eq(VOP3Only,1),
- "cvtVOP3",
- !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));
+ !if(!and(P.IsPacked, isVOP3P),
+ "cvtVOP3P",
+ !if(!or(P.HasModifiers, P.HasOMod),
+ "cvtVOP3",
+ ""));
VOPProfile Pfl = P;
}
+class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
+ VOP3_Pseudo<opName, P, pattern, 1, 1> {
+ let VOP3P = 1;
+}
+
class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
SIMCInstr <ps.PseudoInstr, EncodingFamily> {
let isPseudo = 0;
let isCodeGenOnly = 0;
+ let UseNamedOperandTable = 1;
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
@@ -128,8 +134,17 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Uses = ps.Uses;
+
+ VOPProfile Pfl = ps.Pfl;
}
+// XXX - Is there any reason to distinguish this from regular VOP3
+// here?
+class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+ VOP3_Real<ps, EncodingFamily>;
+
class VOP3a<VOPProfile P> : Enc64 {
bits<2> src0_modifiers;
bits<9> src0;
@@ -197,6 +212,42 @@ class VOP3be <VOPProfile P> : Enc64 {
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
}
+class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
+ bits<8> vdst;
+  // neg, neg_hi, op_sel and op_sel_hi are packed into srcN_modifiers
+ bits<4> src0_modifiers;
+ bits<9> src0;
+ bits<4> src1_modifiers;
+ bits<9> src1;
+ bits<4> src2_modifiers;
+ bits<9> src2;
+ bits<1> clamp;
+
+ let Inst{7-0} = vdst;
+ let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
+ let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+ let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+
+ let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
+ let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
+ let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
+
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, 0); // op_sel_hi(2)
+
+ let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+ let Inst{25-16} = op;
+ let Inst{31-26} = 0x34; //encoding
+ let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+ let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+ let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, 0); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, 0); // op_sel_hi(1)
+ let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
+ let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
+ let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
+}
+
class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
let Inst{25-17} = op;
}
@@ -238,11 +289,65 @@ class VOP_SDWAe<VOPProfile P> : Enc64 {
let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
- let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+ let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
+ let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+}
+
+// GFX9 adds two features to SDWA:
+// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD.
+// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather
+// than VGPRs (at most 1 can be an SGPR);
+// b. OMOD is the standard output modifier (result *2, *4, /2)
+// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This
+// replaces OMOD and the dest fields with SD and SDST (SGPR destination)
+// fields.
+// a. When SD=1, the SDST is used as the destination for the compare result;
+// b. When SD=0, VCC is used.
+//
+// In GFX9, the V_MAC_F16 and V_MAC_F32 opcodes cannot be used with SDWA.
+
+// gfx9 SDWA basic encoding
+class VOP_SDWA9e<VOPProfile P> : Enc64 {
+ bits<9> src0; // {src0_sgpr{0}, src0{7-0}}
+ bits<3> src0_sel;
+ bits<2> src0_modifiers; // float: {abs,neg}, int {sext}
+ bits<3> src1_sel;
+ bits<2> src1_modifiers;
+ bits<1> src1_sgpr;
+
+ let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0);
+ let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD);
+ let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0);
+ let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0);
+ let Inst{55} = !if(P.HasSrc0, src0{8}, 0);
+ let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD);
let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0);
+ let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0);
+ let Inst{63} = 0; // src1_sgpr - should be specified in subclass
+}
+
+// gfx9 SDWA-A
+class VOP_SDWA9Ae<VOPProfile P> : VOP_SDWA9e<P> {
+ bits<3> dst_sel;
+ bits<2> dst_unused;
+ bits<1> clamp;
+ bits<2> omod;
+
+ let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD);
+ let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE);
+ let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0);
+ let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0);
+}
+
+// gfx9 SDWA-B
+class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
+ bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}}
+
+ let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0);
+ let Inst{47} = !if(P.EmitDst, sdst{7}, 0);
}
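+
+// Note on the gfx9 encodings above: src0 is 9 bits wide, with bit 55
+// (src0{8}) flagging an SGPR source, and subclasses set bit 63 when src1 is
+// an SGPR. SDWA-B trades the dst/omod fields for a 7-bit sdst plus sdst{7}
+// at bit 47 (the SD flag), selecting an SGPR destination (SD=1) or VCC
+// (SD=0).
+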
class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
@@ -250,25 +355,26 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
VOP <opName>,
SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
MnemonicAlias <opName#"_sdwa", opName> {
-
+
let isPseudo = 1;
let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
string Mnemonic = opName;
string AsmOperands = P.AsmSDWA;
+ string AsmOperands9 = P.AsmSDWA9;
let Size = 8;
let mayLoad = 0;
let mayStore = 0;
- let hasSideEffects = 0;
+ let hasSideEffects = 0;
let VALU = 1;
let SDWA = 1;
let Uses = [EXEC];
-
- let SubtargetPredicate = isVI;
- let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+
+ let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
+ let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA";
@@ -278,7 +384,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
- SIMCInstr <ps.PseudoInstr, SIEncodingFamily.VI> {
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA> {
let isPseudo = 0;
let isCodeGenOnly = 0;
@@ -303,6 +409,35 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
let TSFlags = ps.TSFlags;
}
+class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
+ InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands9, []>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9> {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let Defs = ps.Defs;
+ let Uses = ps.Uses;
+ let SchedRW = ps.SchedRW;
+ let hasSideEffects = ps.hasSideEffects;
+
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+
+ let SubtargetPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
+ let AssemblerPredicate = !if(ps.Pfl.HasSDWA9, HasSDWA9, DisableInst);
+ let AsmVariantName = !if(ps.Pfl.HasSDWA9, AMDGPUAsmVariants.SDWA9,
+ AMDGPUAsmVariants.Disable);
+ let DecoderNamespace = "SDWA9";
+
+ // Copy relevant pseudo op flags
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+}
+
class VOP_DPPe<VOPProfile P> : Enc64 {
bits<2> src0_modifiers;
bits<8> src0;
@@ -337,8 +472,8 @@ class VOP_DPP <string OpName, VOPProfile P> :
let Size = 8;
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
- let SubtargetPredicate = isVI;
- let AssemblerPredicate = !if(P.HasExt, isVI, DisableInst);
+ let SubtargetPredicate = HasDPP;
+ let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "DPP";
@@ -348,3 +483,4 @@ include "VOPCInstructions.td"
include "VOP1Instructions.td"
include "VOP2Instructions.td"
include "VOP3Instructions.td"
+include "VOP3PInstructions.td"
diff --git a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
index 89859ba..8640c87 100644
--- a/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/contrib/llvm/lib/Target/ARM/A15SDOptimizer.cpp
@@ -427,13 +427,11 @@ unsigned A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
unsigned Lane, bool QPR) {
unsigned Out = MRI->createVirtualRegister(QPR ? &ARM::QPRRegClass :
&ARM::DPRRegClass);
- AddDefaultPred(BuildMI(MBB,
- InsertBefore,
- DL,
- TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d),
- Out)
- .addReg(Reg)
- .addImm(Lane));
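+  // predOps(ARMCC::AL) appends the default "always execute" predicate
+  // operands (condition code plus a null predicate register) that the
+  // removed AddDefaultPred() helper used to supply.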
+ BuildMI(MBB, InsertBefore, DL,
+ TII->get(QPR ? ARM::VDUPLN32q : ARM::VDUPLN32d), Out)
+ .addReg(Reg)
+ .addImm(Lane)
+ .add(predOps(ARMCC::AL));
return Out;
}
@@ -476,13 +474,11 @@ unsigned A15SDOptimizer::createVExt(MachineBasicBlock &MBB,
const DebugLoc &DL, unsigned Ssub0,
unsigned Ssub1) {
unsigned Out = MRI->createVirtualRegister(&ARM::DPRRegClass);
- AddDefaultPred(BuildMI(MBB,
- InsertBefore,
- DL,
- TII->get(ARM::VEXTd32), Out)
- .addReg(Ssub0)
- .addReg(Ssub1)
- .addImm(1));
+ BuildMI(MBB, InsertBefore, DL, TII->get(ARM::VEXTd32), Out)
+ .addReg(Ssub0)
+ .addReg(Ssub1)
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
return Out;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h
index be30482..4676226 100644
--- a/contrib/llvm/lib/Target/ARM/ARM.h
+++ b/contrib/llvm/lib/Target/ARM/ARM.h
@@ -16,21 +16,24 @@
#define LLVM_LIB_TARGET_ARM_ARM_H
#include "llvm/Support/CodeGen.h"
-#include "ARMBasicBlockInfo.h"
#include <functional>
+#include <vector>
namespace llvm {
class ARMAsmPrinter;
class ARMBaseTargetMachine;
+class ARMRegisterBankInfo;
+class ARMSubtarget;
+struct BasicBlockInfo;
class Function;
class FunctionPass;
-class ImmutablePass;
+class InstructionSelector;
+class MachineBasicBlock;
+class MachineFunction;
class MachineInstr;
class MCInst;
class PassRegistry;
-class TargetLowering;
-class TargetMachine;
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
@@ -43,6 +46,9 @@ FunctionPass *createThumb2ITBlockPass();
FunctionPass *createARMOptimizeBarriersPass();
FunctionPass *createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor = nullptr);
+InstructionSelector *
+createARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI,
+ const ARMRegisterBankInfo &RBI);
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
@@ -53,7 +59,8 @@ std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
void initializeARMLoadStoreOptPass(PassRegistry &);
void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+void initializeARMConstantIslandsPass(PassRegistry &);
-} // end namespace llvm;
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARM_H
diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td
index 2a090fa..e49c1ba 100644
--- a/contrib/llvm/lib/Target/ARM/ARM.td
+++ b/contrib/llvm/lib/Target/ARM/ARM.td
@@ -17,144 +17,172 @@
include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
-// ARM Helper classes.
+// ARM Subtarget state.
//
-class ProcNoItin<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
+def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode",
+ "true", "Thumb mode">;
+
+def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat",
+ "true", "Use software floating "
+ "point features.">;
-class Architecture<string fname, string aname, list<SubtargetFeature> features >
- : SubtargetFeature<fname, "ARMArch", aname,
- !strconcat(aname, " architecture"), features>;
//===----------------------------------------------------------------------===//
-// ARM Subtarget state.
+// ARM Subtarget features.
//
-def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
- "Thumb mode">;
+// Floating Point, HW Division and Neon Support
+def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
+ "Enable VFP2 instructions">;
-def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
- "Use software floating point features.">;
+def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
+ "Enable VFP3 instructions",
+ [FeatureVFP2]>;
-//===----------------------------------------------------------------------===//
-// ARM Subtarget features.
-//
+def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
+ "Enable NEON instructions",
+ [FeatureVFP3]>;
+
+def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
+ "Enable half-precision "
+ "floating point">;
+
+def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+ "Enable VFP4 instructions",
+ [FeatureVFP3, FeatureFP16]>;
+
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
+ "true", "Enable ARMv8 FP",
+ [FeatureVFP4]>;
+
+def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
+ "Enable full half-precision "
+ "floating point",
+ [FeatureFPARMv8]>;
+
+def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
+ "Floating point unit supports "
+ "single precision only">;
+
+def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
+ "Restrict FP to 16 double registers">;
+
+def FeatureHWDivThumb : SubtargetFeature<"hwdiv",
+ "HasHardwareDivideInThumb", "true",
+ "Enable divide instructions in Thumb">;
+
+def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
+ "HasHardwareDivideInARM", "true",
+ "Enable divide instructions in ARM mode">;
+
+// Atomic Support
+def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
+ "Has data barrier (dmb/dsb) instructions">;
+
+def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
+ "Has v7 clrex instruction">;
-def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
- "Enable VFP2 instructions">;
-def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
- "Enable VFP3 instructions",
- [FeatureVFP2]>;
-def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
- "Enable NEON instructions",
- [FeatureVFP3]>;
-def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
- "Enable Thumb2 instructions">;
-def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
- "Does not support ARM mode execution",
- [ModeThumb]>;
-def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
- "Enable half-precision floating point">;
-def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
- "Enable VFP4 instructions",
- [FeatureVFP3, FeatureFP16]>;
-def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
- "true", "Enable ARMv8 FP",
- [FeatureVFP4]>;
-def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
- "Enable full half-precision floating point",
- [FeatureFPARMv8]>;
-def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
- "Restrict FP to 16 double registers">;
-def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
- "Enable divide instructions">;
-def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
- "HasHardwareDivideInARM", "true",
- "Enable divide instructions in ARM mode">;
-def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
- "Enable Thumb2 extract and pack instructions">;
-def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
- "Has data barrier (dmb / dsb) instructions">;
-def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
- "Has v7 clrex instruction">;
def FeatureAcquireRelease : SubtargetFeature<"acquire-release",
"HasAcquireRelease", "true",
- "Has v8 acquire/release (lda/ldaex etc) instructions">;
-def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
- "FP compare + branch is slow">;
-def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
- "Floating point unit supports single precision only">;
-def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
- "Enable support for Performance Monitor extensions">;
-def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
- "Enable support for TrustZone security extensions">;
-def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
- "Enable support for ARMv8-M Security Extensions">;
-def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
- "Enable support for Cryptography extensions",
- [FeatureNEON]>;
-def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
- "Enable support for CRC instructions">;
+ "Has v8 acquire/release (lda/ldaex "
+ " etc) instructions">;
+
+
+def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
+ "FP compare + branch is slow">;
+
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+ "Enable support for Performance "
+ "Monitor extensions">;
+
+
+// TrustZone Security Extensions
+def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
+ "Enable support for TrustZone "
+ "security extensions">;
+
+def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true",
+ "Enable support for ARMv8-M "
+ "Security Extensions">;
+
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+ "Enable support for "
+ "Cryptography extensions",
+ [FeatureNEON]>;
+
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+ "Enable support for CRC instructions">;
+
+
// Not to be confused with FeatureHasRetAddrStack (return address stack)
-def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
- "Enable Reliability, Availability and Serviceability extensions">;
-def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
- "Enable fast computation of positive address offsets">;
+def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
+ "Enable Reliability, Availability "
+ "and Serviceability extensions">;
+// Fast computation of non-negative address offsets
+def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true",
+ "Enable fast computation of "
+ "positive address offsets">;
-// Cyclone has preferred instructions for zeroing VFP registers, which can
-// execute in 0 cycles.
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
- "Has zero-cycle zeroing instructions">;
+// Fast execution of AES crypto operations
+def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true",
+ "CPU fuses AES crypto operations">;
-// Whether or not it may be profitable to unpredicate certain instructions
-// during if conversion.
+// Cyclone can zero VFP registers in 0 cycles.
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+ "Has zero-cycle zeroing instructions">;
+
+// Whether it is profitable to unpredicate certain instructions during if-conversion
def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr",
- "IsProfitableToUnpredicate",
- "true",
+ "IsProfitableToUnpredicate", "true",
"Is profitable to unpredicate">;
// Some targets (e.g. Swift) have microcoded VGETLNi32.
-def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
- "HasSlowVGETLNi32", "true",
- "Has slow VGETLNi32 - prefer VMOV">;
+def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32",
+ "HasSlowVGETLNi32", "true",
+ "Has slow VGETLNi32 - prefer VMOV">;
// Some targets (e.g. Swift) have microcoded VDUP32.
-def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true",
- "Has slow VDUP32 - prefer VMOV">;
+def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32",
+ "true",
+ "Has slow VDUP32 - prefer VMOV">;
// Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON
// for scalar FP, as this allows more effective execution domain optimization.
-def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
- "true", "Prefer VMOVSR">;
+def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR",
+ "true", "Prefer VMOVSR">;
// Swift has ISHST barriers compatible with Atomic Release semantics but weaker
// than ISH
def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST",
- "true", "Prefer ISHST barriers">;
+ "true", "Prefer ISHST barriers">;
// Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU.
-def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true",
- "Has muxed AGU and NEON/FPU">;
+def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits",
+ "true",
+ "Has muxed AGU and NEON/FPU">;
-// On some targets, a VLDM/VSTM starting with an odd register number needs more
-// microops than single VLDRS.
+// Whether VLDM/VSTM starting with an odd register number needs more
+// microops than a single VLDRS
def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister",
- "true", "VLDM/VSTM starting with an odd register is slow">;
+ "true", "VLDM/VSTM starting "
+ "with an odd register is slow">;
// Some targets have a renaming dependency when loading into D subregisters.
def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg",
"SlowLoadDSubregister", "true",
"Loading into D subregs is slow">;
+
// Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD.
def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs",
"DontWidenVMOVS", "true",
"Don't widen VMOVS to VMOVD">;
// Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions.
-def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true",
- "Expand VFP/NEON MLA/MLS instructions">;
+def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx",
+ "ExpandMLx", "true",
+ "Expand VFP/NEON MLA/MLS instructions">;
// Some targets have special RAW hazards for VFP/NEON VMLA/VMLS.
def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
@@ -162,15 +190,18 @@ def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards",
// Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from
// VFP to NEON, as an execution domain optimization.
-def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs",
- "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">;
+def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs",
+ "UseNEONForFPMovs", "true",
+ "Convert VMOVSR, VMOVRS, "
+ "VMOVS to NEON">;
// Some processors benefit from using NEON instructions for scalar
// single-precision FP operations. This affects instruction selection and should
// only be enabled if the handling of denormals is not important.
-def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
- "true",
- "Use NEON for single precision FP">;
+def FeatureNEONForFP : SubtargetFeature<"neonfp",
+ "UseNEONForSinglePrecisionFP",
+ "true",
+ "Use NEON for single precision FP">;
// On some processors, VLDn instructions that access unaligned data take one
// extra cycle. Take that into account when computing operand latencies.
@@ -181,18 +212,18 @@ def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
// Some processors have a nonpipelined VFP coprocessor.
def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
"NonpipelinedVFP", "true",
- "VFP instructions are not pipelined">;
+ "VFP instructions are not pipelined">;
// Some processors have FP multiply-accumulate instructions that don't
// play nicely with other VFP / NEON instructions, and it's generally better
// to just not use them.
-def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
- "Disable VFP / NEON MAC instructions">;
+def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
+ "Disable VFP / NEON MAC instructions">;
// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
- "HasVMLxForwarding", "true",
- "Has multiplier accumulator forwarding">;
+ "HasVMLxForwarding", "true",
+ "Has multiplier accumulator forwarding">;
// Disable 32-bit to 16-bit narrowing for experimentation.
def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
@@ -206,62 +237,105 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
"AvoidCPSRPartialUpdate", "true",
"Avoid CPSR partial update for OOO execution">;
-def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
- "AvoidMOVsShifterOperand", "true",
- "Avoid movs instructions with shifter operand">;
+/// Disable +1 predication cost for instructions updating CPSR.
+/// Enabled for Cortex-A57.
+def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr",
+ "CheapPredicableCPSRDef",
+ "true",
+ "Disable +1 predication cost for instructions updating CPSR">;
+
+def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
+ "AvoidMOVsShifterOperand", "true",
+ "Avoid movs instructions with "
+ "shifter operand">;
// Some processors perform return stack prediction. CodeGen should avoid
// issuing "normal" call instructions to callees which do not return.
-def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
- "Has return address stack">;
+def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack",
+ "HasRetAddrStack", "true",
+ "Has return address stack">;
+
+// Some processors have no branch predictor, which changes the expected cost
+// of taking a branch and thus affects the choice of whether to use
+// predicated instructions.
+def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
+ "HasBranchPredictor", "false",
+ "Has no branch predictor">;
/// DSP extension.
-def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
- "Supports DSP instructions in ARM and/or Thumb2">;
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
+ "Supports DSP instructions in "
+ "ARM and/or Thumb2">;
// Multiprocessing extension.
-def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
- "Supports Multiprocessing extension">;
+def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
+ "Supports Multiprocessing extension">;
// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
def FeatureVirtualization : SubtargetFeature<"virtualization",
- "HasVirtualization", "true",
- "Supports Virtualization extension",
- [FeatureHWDiv, FeatureHWDivARM]>;
+ "HasVirtualization", "true",
+ "Supports Virtualization extension",
+ [FeatureHWDivThumb, FeatureHWDivARM]>;
-// M-series ISA
-def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
- "Is microcontroller profile ('M' series)">;
+// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
+// See ARMInstrInfo.td for details.
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
+ "NaCl trap">;
-// R-series ISA
-def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
- "Is realtime profile ('R' series)">;
+def FeatureStrictAlign : SubtargetFeature<"strict-align",
+ "StrictAlign", "true",
+ "Disallow all unaligned memory "
+ "access">;
+
+def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
+ "Generate calls via indirect call "
+ "instructions">;
+
+def FeatureExecuteOnly : SubtargetFeature<"execute-only",
+ "GenExecuteOnly", "true",
+ "Enable the generation of "
+ "execute only code.">;
+
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
+ "Reserve R9, making it unavailable"
+ " as GPR">;
+
+def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
+ "Don't use movt/movw pairs for "
+ "32-bit imms">;
+
+def FeatureNoNegativeImmediates
+ : SubtargetFeature<"no-neg-immediates",
+ "NegativeImmediates", "false",
+ "Convert immediates and instructions "
+ "to their negated or complemented "
+ "equivalent when the immediate does "
+ "not fit in the encoding.">;
+
+
+//===----------------------------------------------------------------------===//
+// ARM architecture class
+//
// A-series ISA
def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
"Is application profile ('A' series)">;
-// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
-// See ARMInstrInfo.td for details.
-def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
- "NaCl trap">;
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+ "Is realtime profile ('R' series)">;
-def FeatureStrictAlign : SubtargetFeature<"strict-align",
- "StrictAlign", "true",
- "Disallow all unaligned memory "
- "access">;
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
+ "Is microcontroller profile ('M' series)">;
-def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true",
- "Generate calls via indirect call "
- "instructions">;
-def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
- "Reserve R9, making it unavailable as "
- "GPR">;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
+ "Enable Thumb2 instructions">;
-def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
- "Don't use movt/movw pairs for 32-bit "
- "imms">;
+def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
+ "Does not support ARM mode execution",
+ [ModeThumb]>;
//===----------------------------------------------------------------------===//
@@ -270,44 +344,57 @@ def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true",
"Support ARM v4T instructions">;
+
def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true",
"Support ARM v5T instructions",
[HasV4TOps]>;
+
def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true",
- "Support ARM v5TE, v5TEj, and v5TExp instructions",
+ "Support ARM v5TE, v5TEj, and "
+ "v5TExp instructions",
[HasV5TOps]>;
+
def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true",
"Support ARM v6 instructions",
[HasV5TEOps]>;
+
def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true",
"Support ARM v6M instructions",
[HasV6Ops]>;
+
def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true",
"Support ARM v8M Baseline instructions",
[HasV6MOps]>;
+
def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true",
"Support ARM v6k instructions",
[HasV6Ops]>;
+
def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
"Support ARM v6t2 instructions",
[HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>;
+
def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
"Support ARM v7 instructions",
[HasV6T2Ops, FeaturePerfMon,
FeatureV7Clrex]>;
+
+def HasV8MMainlineOps :
+ SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
+ "Support ARM v8M Mainline instructions",
+ [HasV7Ops]>;
+
def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
"Support ARM v8 instructions",
- [HasV7Ops, FeatureAcquireRelease,
- FeatureT2XtPk]>;
+ [HasV7Ops, FeatureAcquireRelease]>;
+
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions",
[HasV8Ops]>;
-def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+
+def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions",
[HasV8_1aOps]>;
-def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
- "Support ARM v8M Mainline instructions",
- [HasV7Ops]>;
//===----------------------------------------------------------------------===//
@@ -342,7 +429,9 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", []>;
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
- "Qualcomm ARM processors", []>;
+ "Qualcomm Krait processors", []>;
+def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+ "Qualcomm Kryo processors", []>;
def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
"Swift ARM processors", []>;
@@ -361,11 +450,17 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52",
def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3",
"Cortex-M3 ARM processors", []>;
+
//===----------------------------------------------------------------------===//
-// ARM schedules.
+// ARM Helper classes.
//
-include "ARMSchedule.td"
+class Architecture<string fname, string aname, list<SubtargetFeature> features>
+ : SubtargetFeature<fname, "ARMArch", aname,
+ !strconcat(aname, " architecture"), features>;
+
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
//===----------------------------------------------------------------------===//
@@ -393,8 +488,7 @@ def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>;
def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>;
def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops,
- FeatureDSP,
- FeatureT2XtPk]>;
+ FeatureDSP]>;
def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>;
@@ -415,31 +509,37 @@ def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
FeatureNEON,
FeatureDB,
FeatureDSP,
- FeatureAClass,
- FeatureT2XtPk]>;
+ FeatureAClass]>;
+
+def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops,
+ FeatureNEON,
+ FeatureDB,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureAClass]>;
def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops,
FeatureDB,
FeatureDSP,
- FeatureHWDiv,
- FeatureRClass,
- FeatureT2XtPk]>;
+ FeatureHWDivThumb,
+ FeatureRClass]>;
def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops,
FeatureThumb2,
FeatureNoARM,
FeatureDB,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureMClass]>;
def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops,
FeatureThumb2,
FeatureNoARM,
FeatureDB,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureMClass,
- FeatureDSP,
- FeatureT2XtPk]>;
+ FeatureDSP]>;
def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops,
FeatureAClass,
@@ -481,9 +581,6 @@ def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps,
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
FeatureDB,
- FeatureHWDiv,
- FeatureHWDivARM,
- FeatureT2XtPk,
FeatureDSP,
FeatureCRC,
FeatureMP,
@@ -495,7 +592,7 @@ def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
[HasV8MBaselineOps,
FeatureNoARM,
FeatureDB,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureV7Clrex,
Feature8MSecExt,
FeatureAcquireRelease,
@@ -505,7 +602,7 @@ def ARMv8mMainline : Architecture<"armv8-m.main", "ARMv8mMainline",
[HasV8MMainlineOps,
FeatureNoARM,
FeatureDB,
- FeatureHWDiv,
+ FeatureHWDivThumb,
Feature8MSecExt,
FeatureAcquireRelease,
FeatureMClass]>;
@@ -520,11 +617,20 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>;
//===----------------------------------------------------------------------===//
+// ARM schedules.
+//===----------------------------------------------------------------------===//
+//
+include "ARMSchedule.td"
+
+//===----------------------------------------------------------------------===//
// ARM processors
//
// Dummy CPU, used to target architectures
-def : ProcNoItin<"generic", []>;
+def : ProcessorModel<"generic", CortexA8Model, []>;
+
+// FIXME: Several processors below are not using their own scheduler
+// model, but that of a similar or previous processor. These should be fixed.
def : ProcNoItin<"arm8", [ARMv4]>;
def : ProcNoItin<"arm810", [ARMv4]>;
@@ -569,6 +675,7 @@ def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>;
def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>;
def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>;
+def : Processor<"arm1176j-s", ARMV6Itineraries, [ARMv6kz]>;
def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>;
def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz,
FeatureVFP2,
@@ -584,7 +691,6 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2,
FeatureVFP2,
FeatureHasSlowFPVMLx]>;
-// FIXME: A5 has currently the same Schedule model as A8
def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5,
FeatureHasRetAddrStack,
FeatureTrustZone,
@@ -603,8 +709,6 @@ def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7,
FeatureVMLxForwarding,
FeatureMP,
FeatureVFP4,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureVirtualization]>;
def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8,
@@ -630,19 +734,15 @@ def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9,
FeatureCheckVLDnAlign,
FeatureMP]>;
-// FIXME: A12 has currently the same Schedule model as A9
def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12,
FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureVMLxForwarding,
FeatureVFP4,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureVirtualization,
FeatureMP]>;
-// FIXME: A15 has currently the same Schedule model as A9.
def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
FeatureDontWidenVMOVS,
FeatureHasRetAddrStack,
@@ -651,26 +751,19 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15,
FeatureVFP4,
FeatureMP,
FeatureCheckVLDnAlign,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureVirtualization]>;
-// FIXME: A17 has currently the same Schedule model as A9
def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17,
FeatureHasRetAddrStack,
FeatureTrustZone,
FeatureMP,
FeatureVMLxForwarding,
FeatureVFP4,
- FeatureHWDiv,
- FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureVirtualization]>;
-// FIXME: krait has currently the same Schedule model as A9
-// FIXME: krait has currently the same features as A9 plus VFP4 and hardware
-// division features.
+// FIXME: krait currently has the same features as A9 plus VFP4 and HWDiv
def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
FeatureHasRetAddrStack,
FeatureMuxedUnits,
@@ -679,7 +772,7 @@ def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait,
FeatureFP16,
FeatureAvoidPartialCPSR,
FeatureVFP4,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM]>;
def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
@@ -687,7 +780,7 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
FeatureNEONForFP,
FeatureVFP4,
FeatureMP,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
@@ -700,12 +793,10 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift,
FeatureSlowVGETLNi32,
FeatureSlowVDUP32]>;
-// FIXME: R4 has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4,
FeatureHasRetAddrStack,
FeatureAvoidPartialCPSR]>;
-// FIXME: R4F has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
FeatureHasRetAddrStack,
FeatureSlowFPBrcc,
@@ -714,7 +805,6 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4,
FeatureD16,
FeatureAvoidPartialCPSR]>;
-// FIXME: R5 has currently the same ProcessorModel as A8.
def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
FeatureHasRetAddrStack,
FeatureVFP3,
@@ -724,7 +814,6 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR]>;
-// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5.
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7,
FeatureHasRetAddrStack,
FeatureVFP3,
@@ -747,63 +836,80 @@ def : ProcessorModel<"cortex-r8", CortexA8Model, [ARMv7r,
FeatureHasSlowFPVMLx,
FeatureAvoidPartialCPSR]>;
-def : ProcNoItin<"cortex-m3", [ARMv7m, ProcM3]>;
-def : ProcNoItin<"sc300", [ARMv7m, ProcM3]>;
+def : ProcessorModel<"cortex-m3", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
-def : ProcNoItin<"cortex-m4", [ARMv7em,
+def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
+ ProcM3,
+ FeatureHasNoBranchPredictor]>;
+
+def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
FeatureVFP4,
FeatureVFPOnlySP,
- FeatureD16]>;
+ FeatureD16,
+ FeatureHasNoBranchPredictor]>;
def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,
FeatureD16]>;
+def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
+ FeatureNoMovt]>;
+
+def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
+ FeatureDSP,
+ FeatureFPARMv8,
+ FeatureD16,
+ FeatureVFPOnlySP,
+ FeatureHasNoBranchPredictor]>;
+
def : ProcNoItin<"cortex-a32", [ARMv8a,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC]>;
def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC]>;
def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC,
FeatureFPAO]>;
-def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57,
- FeatureHWDiv,
+def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC,
- FeatureFPAO]>;
+ FeatureFPAO,
+ FeatureAvoidPartialCPSR,
+ FeatureCheapPredicableCPSR]>;
def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC]>;
def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC]>;
-// Cyclone is very similar to swift
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureHasRetAddrStack,
FeatureNEONForFP,
FeatureVFP4,
FeatureMP,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureAvoidPartialCPSR,
FeatureAvoidMOVsShOp,
@@ -812,19 +918,25 @@ def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureZCZeroing]>;
def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC]>;
def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynosM1,
- FeatureHWDiv,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC]>;
def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1,
- FeatureHWDiv,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC]>;
+
+def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
+ FeatureHWDivThumb,
FeatureHWDivARM,
FeatureCrypto,
FeatureCRC]>;
@@ -837,7 +949,7 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
//===----------------------------------------------------------------------===//
include "ARMRegisterInfo.td"
-
+include "ARMRegisterBanks.td"
include "ARMCallingConv.td"
//===----------------------------------------------------------------------===//
@@ -845,7 +957,6 @@ include "ARMCallingConv.td"
//===----------------------------------------------------------------------===//
include "ARMInstrInfo.td"
-
def ARMInstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
@@ -866,7 +977,7 @@ def ARMAsmParserVariant : AsmParserVariant {
}
def ARM : Target {
- // Pull in Instruction Info:
+ // Pull in Instruction Info.
let InstructionSet = ARMInstrInfo;
let AssemblyWriters = [ARMAsmWriter];
let AssemblyParserVariants = [ARMAsmParserVariant];
diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 95db35c..582153d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -23,6 +23,8 @@
#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
@@ -43,9 +45,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ARMBuildAttributes.h"
-#include "llvm/Support/COFF.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
@@ -589,12 +589,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
ATS.finishAttributeSection();
}
-static bool isV8M(const ARMSubtarget *Subtarget) {
- // Note that v8M Baseline is a subset of v6T2!
- return (Subtarget->hasV8MBaselineOps() && !Subtarget->hasV6T2Ops()) ||
- Subtarget->hasV8MMainlineOps();
-}
-
//===----------------------------------------------------------------------===//
// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
// FIXME:
@@ -602,39 +596,6 @@ static bool isV8M(const ARMSubtarget *Subtarget) {
// to appear in the .ARM.attributes section in ELF.
// Instead of subclassing the MCELFStreamer, we do the work here.
-static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
- const ARMSubtarget *Subtarget) {
- if (CPU == "xscale")
- return ARMBuildAttrs::v5TEJ;
-
- if (Subtarget->hasV8Ops()) {
- if (Subtarget->isRClass())
- return ARMBuildAttrs::v8_R;
- return ARMBuildAttrs::v8_A;
- } else if (Subtarget->hasV8MMainlineOps())
- return ARMBuildAttrs::v8_M_Main;
- else if (Subtarget->hasV7Ops()) {
- if (Subtarget->isMClass() && Subtarget->hasDSP())
- return ARMBuildAttrs::v7E_M;
- return ARMBuildAttrs::v7;
- } else if (Subtarget->hasV6T2Ops())
- return ARMBuildAttrs::v6T2;
- else if (Subtarget->hasV8MBaselineOps())
- return ARMBuildAttrs::v8_M_Base;
- else if (Subtarget->hasV6MOps())
- return ARMBuildAttrs::v6S_M;
- else if (Subtarget->hasV6Ops())
- return ARMBuildAttrs::v6;
- else if (Subtarget->hasV5TEOps())
- return ARMBuildAttrs::v5TE;
- else if (Subtarget->hasV5TOps())
- return ARMBuildAttrs::v5T;
- else if (Subtarget->hasV4TOps())
- return ARMBuildAttrs::v4T;
- else
- return ARMBuildAttrs::v4;
-}
-
// Returns true if all functions have the same function attribute value.
// It also returns true when the module has no functions.
static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr,
@@ -671,89 +632,8 @@ void ARMAsmPrinter::emitAttributes() {
static_cast<const ARMBaseTargetMachine &>(TM);
const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian());
- const std::string &CPUString = STI.getCPUString();
-
- if (!StringRef(CPUString).startswith("generic")) {
- // FIXME: remove krait check when GNU tools support krait cpu
- if (STI.isKrait()) {
- ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9");
- // We consider krait as a "cortex-a9" + hwdiv CPU
- // Enable hwdiv through ".arch_extension idiv"
- if (STI.hasDivide() || STI.hasDivideInARMMode())
- ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM);
- } else
- ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
- }
-
- ATS.emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(CPUString, &STI));
-
- // Tag_CPU_arch_profile must have the default value of 0 when "Architecture
- // profile is not applicable (e.g. pre v7, or cross-profile code)".
- if (STI.hasV7Ops() || isV8M(&STI)) {
- if (STI.isAClass()) {
- ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
- ARMBuildAttrs::ApplicationProfile);
- } else if (STI.isRClass()) {
- ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
- ARMBuildAttrs::RealTimeProfile);
- } else if (STI.isMClass()) {
- ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
- ARMBuildAttrs::MicroControllerProfile);
- }
- }
-
- ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use,
- STI.hasARMOps() ? ARMBuildAttrs::Allowed
- : ARMBuildAttrs::Not_Allowed);
- if (isV8M(&STI)) {
- ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::AllowThumbDerived);
- } else if (STI.isThumb1Only()) {
- ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed);
- } else if (STI.hasThumb2()) {
- ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::AllowThumb32);
- }
-
- if (STI.hasNEON()) {
- /* NEON is not exactly a VFP architecture, but GAS emit one of
- * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
- if (STI.hasFPARMv8()) {
- if (STI.hasCrypto())
- ATS.emitFPU(ARM::FK_CRYPTO_NEON_FP_ARMV8);
- else
- ATS.emitFPU(ARM::FK_NEON_FP_ARMV8);
- } else if (STI.hasVFP4())
- ATS.emitFPU(ARM::FK_NEON_VFPV4);
- else
- ATS.emitFPU(STI.hasFP16() ? ARM::FK_NEON_FP16 : ARM::FK_NEON);
- // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
- if (STI.hasV8Ops())
- ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
- STI.hasV8_1aOps() ? ARMBuildAttrs::AllowNeonARMv8_1a:
- ARMBuildAttrs::AllowNeonARMv8);
- } else {
- if (STI.hasFPARMv8())
- // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
- // FPU, but there are two different names for it depending on the CPU.
- ATS.emitFPU(STI.hasD16()
- ? (STI.isFPOnlySP() ? ARM::FK_FPV5_SP_D16 : ARM::FK_FPV5_D16)
- : ARM::FK_FP_ARMV8);
- else if (STI.hasVFP4())
- ATS.emitFPU(STI.hasD16()
- ? (STI.isFPOnlySP() ? ARM::FK_FPV4_SP_D16 : ARM::FK_VFPV4_D16)
- : ARM::FK_VFPV4);
- else if (STI.hasVFP3())
- ATS.emitFPU(STI.hasD16()
- // +d16
- ? (STI.isFPOnlySP()
- ? (STI.hasFP16() ? ARM::FK_VFPV3XD_FP16 : ARM::FK_VFPV3XD)
- : (STI.hasFP16() ? ARM::FK_VFPV3_D16_FP16 : ARM::FK_VFPV3_D16))
- // -d16
- : (STI.hasFP16() ? ARM::FK_VFPV3_FP16 : ARM::FK_VFPV3));
- else if (STI.hasVFP2())
- ATS.emitFPU(ARM::FK_VFPV2);
- }
+ // Emit build attributes for the available hardware.
+ ATS.emitTargetAttributes(STI);
// RW data addressing.
if (isPositionIndependent()) {
@@ -844,34 +724,17 @@ void ARMAsmPrinter::emitAttributes() {
ARMBuildAttrs::Allowed);
else
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
- ARMBuildAttrs::AllowIEE754);
-
- if (STI.allowsUnalignedMem())
- ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
- ARMBuildAttrs::Allowed);
- else
- ATS.emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
- ARMBuildAttrs::Not_Allowed);
+ ARMBuildAttrs::AllowIEEE754);
// FIXME: add more flags to ARMBuildAttributes.h
// 8-bytes alignment stuff.
ATS.emitAttribute(ARMBuildAttrs::ABI_align_needed, 1);
ATS.emitAttribute(ARMBuildAttrs::ABI_align_preserved, 1);
- // ABI_HardFP_use attribute to indicate single precision FP.
- if (STI.isFPOnlySP())
- ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
- ARMBuildAttrs::HardFPSinglePrecision);
-
// Hard float. Use both S and D registers and conform to AAPCS-VFP.
if (STI.isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
- // FIXME: Should we signal R9 usage?
-
- if (STI.hasFP16())
- ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
-
// FIXME: To support emitting this build attribute as GCC does, the
// -mfp16-format option and associated plumbing must be
// supported. For now the __fp16 type is exposed by default, so this
@@ -879,21 +742,6 @@ void ARMAsmPrinter::emitAttributes() {
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_16bit_format,
ARMBuildAttrs::FP16FormatIEEE);
- if (STI.hasMPExtension())
- ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
-
- // Hardware divide in ARM mode is part of base arch, starting from ARMv8.
- // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
- // It is not possible to produce DisallowDIV: if hwdiv is present in the base
- // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits.
- // AllowDIVExt is only emitted if hwdiv isn't available in the base arch;
- // otherwise, the default value (AllowDIVIfExists) applies.
- if (STI.hasDivideInARMMode() && !STI.hasV8Ops())
- ATS.emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
-
- if (STI.hasDSP() && isV8M(&STI))
- ATS.emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed);
-
if (MMI) {
if (const Module *SourceModule = MMI->getModule()) {
// ABI_PCS_wchar_t to indicate wchar_t width
@@ -930,16 +778,6 @@ void ARMAsmPrinter::emitAttributes() {
else
ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use,
ARMBuildAttrs::R9IsGPR);
-
- if (STI.hasTrustZone() && STI.hasVirtualization())
- ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
- ARMBuildAttrs::AllowTZVirtualization);
- else if (STI.hasTrustZone())
- ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
- ARMBuildAttrs::AllowTZ);
- else if (STI.hasVirtualization())
- ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
- ARMBuildAttrs::AllowVirtualization);
}
//===----------------------------------------------------------------------===//
@@ -1142,6 +980,11 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
+ // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
+ // ARM mode tables.
+ EmitAlignment(2);
+
+ // Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
OutStreamer->EmitLabel(JTISymbol);
@@ -1255,11 +1098,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
switch (Opc) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
case ARM::tPUSH:
// Special case here: no src & dst reg, but two extra imp ops.
StartOp = 2; NumOffset = 2;
+ LLVM_FALLTHROUGH;
case ARM::STMDB_UPD:
case ARM::t2STMDB_UPD:
case ARM::VSTMDDB_UPD:
@@ -1291,7 +1135,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
int64_t Offset = 0;
switch (Opc) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
case ARM::MOVr:
case ARM::tMOVr:
@@ -1346,11 +1190,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
}
}
} else if (DstReg == ARM::SP) {
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
}
else {
- MI->dump();
+ MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
}
}
@@ -1661,6 +1505,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case ARM::CONSTPOOL_ENTRY: {
+ if (Subtarget->genExecuteOnly())
+ llvm_unreachable("execute-only should not generate constant pools");
+
/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool
/// in the function. The first operand is the ID# for this instruction, the
/// second is the index into the MachineConstantPool that this is, the third
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 70a3246..3cf5950 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -11,34 +11,58 @@
//
//===----------------------------------------------------------------------===//
-#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMFeatures.h"
#include "ARMHazardRecognizer.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <new>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -168,9 +192,8 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
.addReg(BaseReg)
.addImm(Amt)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
} else if (Amt != 0) {
ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
@@ -180,17 +203,15 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
.addReg(OffReg)
.addReg(0)
.addImm(SOOpc)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
} else
UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
.addReg(BaseReg)
.addReg(OffReg)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
break;
}
case ARMII::AddrMode3 : {
@@ -202,17 +223,15 @@ MachineInstr *ARMBaseInstrInfo::convertToThreeAddress(
get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
.addReg(BaseReg)
.addImm(Amt)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
else
UpdateMI = BuildMI(MF, MI.getDebugLoc(),
get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
.addReg(BaseReg)
.addReg(OffReg)
- .addImm(Pred)
- .addReg(0)
- .addReg(0);
+ .add(predOps(Pred))
+ .add(condCodeOp());
break;
}
}
@@ -306,7 +325,6 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// Walk backwards from the end of the basic block until the branch is
// analyzed or we give up.
while (isPredicated(*I) || I->isTerminator() || I->isDebugValue()) {
-
// Flag to be raised on unanalyzeable instructions. This is useful in cases
// where we want to clean up on the end of the basic block before we bail
// out.
@@ -381,7 +399,6 @@ bool ARMBaseInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
return false;
}
-
unsigned ARMBaseInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
assert(!BytesRemoved && "code size not handled");
@@ -433,20 +450,24 @@ unsigned ARMBaseInstrInfo::insertBranch(MachineBasicBlock &MBB,
if (!FBB) {
if (Cond.empty()) { // Unconditional branch?
if (isThumb)
- BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).add(predOps(ARMCC::AL));
else
BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
} else
- BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(BccOpc))
+ .addMBB(TBB)
+ .addImm(Cond[0].getImm())
+ .add(Cond[1]);
return 1;
}
// Two-way conditional branch.
- BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
- .addImm(Cond[0].getImm()).addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(BccOpc))
+ .addMBB(TBB)
+ .addImm(Cond[0].getImm())
+ .add(Cond[1]);
if (isThumb)
- BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0);
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).add(predOps(ARMCC::AL));
else
BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
return 2;
@@ -537,13 +558,68 @@ bool ARMBaseInstrInfo::DefinesPredicate(
return Found;
}
-static bool isCPSRDefined(const MachineInstr *MI) {
- for (const auto &MO : MI->operands())
+bool ARMBaseInstrInfo::isCPSRDefined(const MachineInstr &MI) {
+ for (const auto &MO : MI.operands())
if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead())
return true;
return false;
}
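+
+// Note: despite its name, this returns true when the addrmode3 offset
+// operand (at index Op + 1) carries a register rather than an immediate.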
+bool ARMBaseInstrInfo::isAddrMode3OpImm(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Offset = MI.getOperand(Op + 1);
+ return Offset.getReg() != 0;
+}
+
+// A load with a negative register offset requires an additional cycle and an
+// extra I unit on Cortex-A57.
+bool ARMBaseInstrInfo::isAddrMode3OpMinusReg(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Offset = MI.getOperand(Op + 1);
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ assert(Opc.isImm());
+ assert(Offset.isReg());
+ int64_t OpcImm = Opc.getImm();
+
+ bool isSub = ARM_AM::getAM3Op(OpcImm) == ARM_AM::sub;
+ return (isSub && Offset.getReg() != 0);
+}
+
+bool ARMBaseInstrInfo::isLdstScaledReg(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ unsigned OffImm = Opc.getImm();
+ return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
+}
+
+// Load, scaled register offset, not plus LSL2
+bool ARMBaseInstrInfo::isLdstScaledRegNotPlusLsl2(const MachineInstr &MI,
+ unsigned Op) const {
+ const MachineOperand &Opc = MI.getOperand(Op + 2);
+ unsigned OffImm = Opc.getImm();
+
+ bool isAdd = ARM_AM::getAM2Op(OffImm) == ARM_AM::add;
+ unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+ ARM_AM::ShiftOpc ShiftOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+  if (ShiftOpc == ARM_AM::no_shift) // not scaled
+    return false;
+ bool SimpleScaled = (isAdd && ShiftOpc == ARM_AM::lsl && Amt == 2);
+ return !SimpleScaled;
+}
+
+// Minus reg for ldstso addr mode
+bool ARMBaseInstrInfo::isLdstSoMinusReg(const MachineInstr &MI,
+ unsigned Op) const {
+ unsigned OffImm = MI.getOperand(Op + 2).getImm();
+ return ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+}
+
+// Load, scaled register offset
+bool ARMBaseInstrInfo::isAm2ScaledReg(const MachineInstr &MI,
+ unsigned Op) const {
+ unsigned OffImm = MI.getOperand(Op + 2).getImm();
+ return ARM_AM::getAM2ShiftOpc(OffImm) != ARM_AM::no_shift;
+}
+
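[Editor's note] All of these predicates decode the packed addrmode2/addrmode3 offset immediate through the ARM_AM helpers. A hedged round-trip sketch for addrmode2 (values are illustrative):

    // Encode: subtract, 12-bit offset 4, LSL shift. getAM2Opc is the inverse
    // of the getAM2Op/getAM2Offset/getAM2ShiftOpc accessors used above.
    unsigned OffImm = ARM_AM::getAM2Opc(ARM_AM::sub, /*Imm12=*/4, ARM_AM::lsl);

    assert(ARM_AM::getAM2Op(OffImm) == ARM_AM::sub);       // isLdstSoMinusReg
    assert(ARM_AM::getAM2Offset(OffImm) == 4);
    assert(ARM_AM::getAM2ShiftOpc(OffImm) == ARM_AM::lsl); // isAm2ScaledReg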
static bool isEligibleForITBlock(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default: return true;
@@ -569,14 +645,14 @@ static bool isEligibleForITBlock(const MachineInstr *MI) {
case ARM::tSUBi3: // SUB (immediate) T1
case ARM::tSUBi8: // SUB (immediate) T2
case ARM::tSUBrr: // SUB (register) T1
- return !isCPSRDefined(MI);
+ return !ARMBaseInstrInfo::isCPSRDefined(*MI);
}
}
/// isPredicable - Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
-bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
+bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const {
if (!MI.isPredicable())
return false;
@@ -586,22 +662,25 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
if (!isEligibleForITBlock(&MI))
return false;
- ARMFunctionInfo *AFI =
+ const ARMFunctionInfo *AFI =
MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
+  // NEON instructions in Thumb2 IT blocks are deprecated, see ARMARM.
+ // In their ARM encoding, they can't be encoded in a conditional form.
+ if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+ return false;
+
if (AFI->isThumb2Function()) {
if (getSubtarget().restrictIT())
return isV8EligibleForIT(&MI);
- } else { // non-Thumb
- if ((MI.getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
- return false;
}
return true;
}
namespace llvm {
-template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
+
+template <> bool IsCPSRDead<MachineInstr>(const MachineInstr *MI) {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || MO.isUndef() || MO.isUse())
@@ -614,7 +693,8 @@ template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
// all definitions of CPSR are dead
return true;
}
-}
+
+} // end namespace llvm
/// GetInstSize - Return the size of the specified MachineInstr.
///
@@ -698,9 +778,8 @@ void ARMBaseInstrInfo::copyFromCPSR(MachineBasicBlock &MBB,
if (Subtarget.isMClass())
MIB.addImm(0x800);
- AddDefaultPred(MIB);
-
- MIB.addReg(ARM::CPSR, RegState::Implicit | getKillRegState(KillSrc));
+ MIB.add(predOps(ARMCC::AL))
+ .addReg(ARM::CPSR, RegState::Implicit | getKillRegState(KillSrc));
}
void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
@@ -718,11 +797,9 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB,
else
MIB.addImm(8);
- MIB.addReg(SrcReg, getKillRegState(KillSrc));
-
- AddDefaultPred(MIB);
-
- MIB.addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
+ MIB.addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::CPSR, RegState::Implicit | RegState::Define);
}
void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -733,8 +810,10 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
if (GPRDest && GPRSrc) {
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc))));
+ BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
return;
}
@@ -758,7 +837,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MIB.addReg(SrcReg, getKillRegState(KillSrc));
if (Opc == ARM::VORRq)
MIB.addReg(SrcReg, getKillRegState(KillSrc));
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
return;
}
@@ -845,10 +924,10 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// VORR takes two source operands.
if (Opc == ARM::VORRq)
Mov.addReg(Src);
- Mov = AddDefaultPred(Mov);
+ Mov = Mov.add(predOps(ARMCC::AL));
// MOVr can set CC.
if (Opc == ARM::MOVr)
- Mov = AddDefaultCC(Mov);
+ Mov = Mov.add(condCodeOp());
}
// Add implicit super-register defs and kills to the last instruction.
Mov->addRegisterDefined(DestReg, TRI);
@@ -883,38 +962,47 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
MFI.getObjectSize(FI), Align);
- switch (RC->getSize()) {
+ switch (TRI->getSpillSize(*RC)) {
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::STRi12))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VSTRS))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
case 8:
if (ARM::DPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VSTRD))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
if (Subtarget.hasV5TEOps()) {
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STRD));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
- MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
-
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
// Fallback to STM instruction, which has existed since the dawn of
// time.
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA))
- .addFrameIndex(FI).addMemOperand(MMO));
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::STMIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
}
@@ -925,15 +1013,18 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (ARM::DPairRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64))
- .addFrameIndex(FI).addImm(16)
- .addReg(SrcReg, getKillRegState(isKill))
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VST1q64))
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
} else
llvm_unreachable("Unknown reg class!");
@@ -942,15 +1033,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
- .addFrameIndex(FI).addImm(16)
- .addReg(SrcReg, getKillRegState(isKill))
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VST1d64TPseudo))
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
@@ -963,15 +1056,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
- .addFrameIndex(FI).addImm(16)
- .addReg(SrcReg, getKillRegState(isKill))
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
@@ -982,10 +1077,10 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 64:
if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
@@ -1065,22 +1160,31 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), Align);
- switch (RC->getSize()) {
+ switch (TRI->getSpillSize(*RC)) {
case 4:
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else
llvm_unreachable("Unknown reg class!");
break;
case 8:
if (ARM::DPRRegClass.hasSubClassEq(RC)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
MachineInstrBuilder MIB;
@@ -1088,14 +1192,15 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MIB = BuildMI(MBB, I, DL, get(ARM::LDRD));
AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
- MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO);
-
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
// Fallback to LDM instruction, which has existed since the dawn of
// time.
- MIB = AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDMIA))
- .addFrameIndex(FI).addMemOperand(MMO));
+ MIB = BuildMI(MBB, I, DL, get(ARM::LDMIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
}
@@ -1108,13 +1213,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 16:
if (ARM::DPairRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
- .addFrameIndex(FI).addImm(16)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
- .addFrameIndex(FI)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
} else
llvm_unreachable("Unknown reg class!");
@@ -1122,14 +1230,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
- .addFrameIndex(FI).addImm(16)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
- .addFrameIndex(FI)
- .addMemOperand(MMO));
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
@@ -1142,14 +1252,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
- .addFrameIndex(FI).addImm(16)
- .addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+ .addFrameIndex(FI)
+ .addImm(16)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
} else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
@@ -1162,10 +1274,10 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 64:
if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
+ .addFrameIndex(FI)
+ .add(predOps(ARMCC::AL))
+ .addMemOperand(MMO);
MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::DefineNoRead, TRI);
MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::DefineNoRead, TRI);
@@ -1248,7 +1360,7 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
: isThumb1 ? ARM::tLDMIA_UPD
: ARM::LDMIA_UPD))
- .addOperand(MI->getOperand(1));
+ .add(MI->getOperand(1));
} else {
LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
}
@@ -1257,17 +1369,17 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
: isThumb1 ? ARM::tSTMIA_UPD
: ARM::STMIA_UPD))
- .addOperand(MI->getOperand(0));
+ .add(MI->getOperand(0));
} else {
STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
}
- AddDefaultPred(LDM.addOperand(MI->getOperand(3)));
- AddDefaultPred(STM.addOperand(MI->getOperand(2)));
+ LDM.add(MI->getOperand(3)).add(predOps(ARMCC::AL));
+ STM.add(MI->getOperand(2)).add(predOps(ARMCC::AL));
// Sort the scratch registers into ascending order.
const TargetRegisterInfo &TRI = getRegisterInfo();
- llvm::SmallVector<unsigned, 6> ScratchRegs;
+ SmallVector<unsigned, 6> ScratchRegs;
for(unsigned I = 5; I < MI->getNumOperands(); ++I)
ScratchRegs.push_back(MI->getOperand(I).getReg());
std::sort(ScratchRegs.begin(), ScratchRegs.end(),
@@ -1285,7 +1397,6 @@ void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const {
BB->erase(MI);
}
-
bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
if (MI.getOpcode() == TargetOpcode::LOAD_STACK_GUARD) {
assert(getSubtarget().getTargetTriple().isOSBinFormatMachO() &&
@@ -1346,7 +1457,7 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ARM::VMOVD));
MI.getOperand(0).setReg(DstRegD);
MI.getOperand(1).setReg(SrcRegD);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// We are now reading SrcRegD instead of SrcRegS. This may upset the
// register scavenger and machine verifier, so we need to indicate that we
@@ -1735,39 +1846,63 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
}
}
}
-
- // Attempt to estimate the relative costs of predication versus branching.
- // Here we scale up each component of UnpredCost to avoid precision issue when
- // scaling NumCycles by Probability.
- const unsigned ScalingUpFactor = 1024;
- unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor);
- UnpredCost += ScalingUpFactor; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
-
- return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost;
+ return isProfitableToIfCvt(MBB, NumCycles, ExtraPredCycles,
+ MBB, 0, 0, Probability);
}
bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB,
+isProfitableToIfCvt(MachineBasicBlock &TBB,
unsigned TCycles, unsigned TExtra,
- MachineBasicBlock &FMBB,
+ MachineBasicBlock &FBB,
unsigned FCycles, unsigned FExtra,
BranchProbability Probability) const {
- if (!TCycles || !FCycles)
+ if (!TCycles)
return false;
// Attempt to estimate the relative costs of predication versus branching.
// Here we scale up each component of UnpredCost to avoid precision issue when
// scaling TCycles/FCycles by Probability.
const unsigned ScalingUpFactor = 1024;
- unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
- unsigned FUnpredCost =
+
+ unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
+ unsigned UnpredCost;
+ if (!Subtarget.hasBranchPredictor()) {
+    // When we don't have a branch predictor, it's always cheaper not to take
+    // a branch than to take it, so we have to take that into account.
+ unsigned NotTakenBranchCost = 1;
+ unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
+ unsigned TUnpredCycles, FUnpredCycles;
+ if (!FCycles) {
+ // Triangle: TBB is the fallthrough
+ TUnpredCycles = TCycles + NotTakenBranchCost;
+ FUnpredCycles = TakenBranchCost;
+ } else {
+ // Diamond: TBB is the block that is branched to, FBB is the fallthrough
+ TUnpredCycles = TCycles + TakenBranchCost;
+ FUnpredCycles = FCycles + NotTakenBranchCost;
+ // The branch at the end of FBB will disappear when it's predicated, so
+ // discount it from PredCost.
+ PredCost -= 1 * ScalingUpFactor;
+ }
+    // The total cost is the cost of each path scaled by its probability.
+    unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
+    unsigned FUnpredCost =
+        Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
+ UnpredCost = TUnpredCost + FUnpredCost;
+    // When predicating, assume that the first IT can be folded away but that
+    // later ones cost one cycle each.
+ if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
+ PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
+ }
+ } else {
+ unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+ unsigned FUnpredCost =
Probability.getCompl().scale(FCycles * ScalingUpFactor);
- unsigned UnpredCost = TUnpredCost + FUnpredCost;
- UnpredCost += 1 * ScalingUpFactor; // The branch itself
- UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ UnpredCost = TUnpredCost + FUnpredCost;
+ UnpredCost += 1 * ScalingUpFactor; // The branch itself
+ UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+ }
- return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+ return PredCost <= UnpredCost;
}
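[Editor's note] As a worked instance of the predictor-less model (all numbers illustrative; the real taken-branch cost comes from Subtarget.getMispredictionPenalty()): a triangle with TCycles = 2, TExtra = 1, FCycles = 0, a 50% branch probability, NotTakenBranchCost = 1, and an assumed TakenBranchCost = 4 gives

    PredCost      = (2 + 0 + 1 + 0) * 1024             = 3072
    TUnpredCycles = 2 + 1 = 3,  FUnpredCycles = 4
    UnpredCost    = (3 * 1024) / 2 + (4 * 1024) / 2    = 3584

so PredCost <= UnpredCost and if-conversion is judged profitable here.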
bool
@@ -1793,7 +1928,6 @@ ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
return (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
}
-
unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) {
if (Opc == ARM::B)
return ARM::Bcc;
@@ -1920,25 +2054,25 @@ ARMBaseInstrInfo::optimizeSelect(MachineInstr &MI,
const MCInstrDesc &DefDesc = DefMI->getDesc();
for (unsigned i = 1, e = DefDesc.getNumOperands();
i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
- NewMI.addOperand(DefMI->getOperand(i));
+ NewMI.add(DefMI->getOperand(i));
unsigned CondCode = MI.getOperand(3).getImm();
if (Invert)
NewMI.addImm(ARMCC::getOppositeCondition(ARMCC::CondCodes(CondCode)));
else
NewMI.addImm(CondCode);
- NewMI.addOperand(MI.getOperand(4));
+ NewMI.add(MI.getOperand(4));
// DefMI is not the -S version that sets CPSR, so add an optional %noreg.
if (NewMI->hasOptionalDef())
- AddDefaultCC(NewMI);
+ NewMI.add(condCodeOp());
// The output register value when the predicate is false is an implicit
// register operand tied to the first def.
// The tie makes the register allocator ensure the FalseReg is allocated the
// same register as operand 0.
FalseReg.setImplicit();
- NewMI.addOperand(FalseReg);
+ NewMI.add(FalseReg);
NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
// Update SeenMIs set: register newly created MI and erase removed DefMI.
@@ -1983,6 +2117,16 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
{ARM::RSBSrsi, ARM::RSBrsi},
{ARM::RSBSrsr, ARM::RSBrsr},
+ {ARM::tADDSi3, ARM::tADDi3},
+ {ARM::tADDSi8, ARM::tADDi8},
+ {ARM::tADDSrr, ARM::tADDrr},
+ {ARM::tADCS, ARM::tADC},
+
+ {ARM::tSUBSi3, ARM::tSUBi3},
+ {ARM::tSUBSi8, ARM::tSUBi8},
+ {ARM::tSUBSrr, ARM::tSUBrr},
+ {ARM::tSBCS, ARM::tSBC},
+
{ARM::t2ADDSri, ARM::t2ADDri},
{ARM::t2ADDSrr, ARM::t2ADDrr},
{ARM::t2ADDSrs, ARM::t2ADDrs},
@@ -2011,9 +2155,10 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
- .addReg(BaseReg, RegState::Kill)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg, RegState::Kill)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
return;
}
@@ -2033,9 +2178,11 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
// Build the new ADD / SUB.
unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg, RegState::Kill)
+ .addImm(ThisVal)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
BaseReg = DestReg;
}
}
@@ -2154,7 +2301,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
// Add the complete list back in.
MachineInstrBuilder MIB(MF, &*MI);
for (int i = RegList.size() - 1; i >= 0; --i)
- MIB.addOperand(RegList[i]);
+ MIB.add(RegList[i]);
return true;
}
@@ -2213,33 +2360,30 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned NumBits = 0;
unsigned Scale = 1;
switch (AddrMode) {
- case ARMII::AddrMode_i12: {
+ case ARMII::AddrMode_i12:
ImmIdx = FrameRegIdx + 1;
InstrOffs = MI.getOperand(ImmIdx).getImm();
NumBits = 12;
break;
- }
- case ARMII::AddrMode2: {
+ case ARMII::AddrMode2:
ImmIdx = FrameRegIdx+2;
InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs *= -1;
NumBits = 12;
break;
- }
- case ARMII::AddrMode3: {
+ case ARMII::AddrMode3:
ImmIdx = FrameRegIdx+2;
InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs *= -1;
NumBits = 8;
break;
- }
case ARMII::AddrMode4:
case ARMII::AddrMode6:
// Can't fold any offset even if it's zero.
return false;
- case ARMII::AddrMode5: {
+ case ARMII::AddrMode5:
ImmIdx = FrameRegIdx+1;
InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
@@ -2247,7 +2391,6 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
NumBits = 8;
Scale = 4;
break;
- }
default:
llvm_unreachable("Unsupported addressing mode!");
}
@@ -2401,6 +2544,63 @@ inline static bool isRedundantFlagInstr(MachineInstr *CmpI, unsigned SrcReg,
return false;
}
+static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) {
+ switch (MI->getOpcode()) {
+ default: return false;
+ case ARM::tLSLri:
+ case ARM::tLSRri:
+ case ARM::tLSLrr:
+ case ARM::tLSRrr:
+ case ARM::tSUBrr:
+ case ARM::tADDrr:
+ case ARM::tADDi3:
+ case ARM::tADDi8:
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ case ARM::tMUL:
+ IsThumb1 = true;
+ LLVM_FALLTHROUGH;
+ case ARM::RSBrr:
+ case ARM::RSBri:
+ case ARM::RSCrr:
+ case ARM::RSCri:
+ case ARM::ADDrr:
+ case ARM::ADDri:
+ case ARM::ADCrr:
+ case ARM::ADCri:
+ case ARM::SUBrr:
+ case ARM::SUBri:
+ case ARM::SBCrr:
+ case ARM::SBCri:
+ case ARM::t2RSBri:
+ case ARM::t2ADDrr:
+ case ARM::t2ADDri:
+ case ARM::t2ADCrr:
+ case ARM::t2ADCri:
+ case ARM::t2SUBrr:
+ case ARM::t2SUBri:
+ case ARM::t2SBCrr:
+ case ARM::t2SBCri:
+ case ARM::ANDrr:
+ case ARM::ANDri:
+ case ARM::t2ANDrr:
+ case ARM::t2ANDri:
+ case ARM::ORRrr:
+ case ARM::ORRri:
+ case ARM::t2ORRrr:
+ case ARM::t2ORRri:
+ case ARM::EORrr:
+ case ARM::EORri:
+ case ARM::t2EORrr:
+ case ARM::t2EORri:
+ case ARM::t2LSRri:
+ case ARM::t2LSRrr:
+ case ARM::t2LSLri:
+ case ARM::t2LSLrr:
+ return true;
+ }
+}
+
/// optimizeCompareInstr - Convert the instruction supplying the argument to the
/// comparison into one that sets the zero bit in the flags register.
/// Remove a redundant Compare instruction if an earlier instruction can set the
@@ -2462,6 +2662,41 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
return false;
}
+ bool IsThumb1 = false;
+ if (MI && !isOptimizeCompareCandidate(MI, IsThumb1))
+ return false;
+
+  // We also want to do this peephole for cases like this: if (a*b == 0),
+  // and optimize away the CMP instruction from the generated code sequence:
+  // MULS, MOVS, MOVS, CMP. Here the MOVS instructions load the boolean values
+  // resulting from the select instruction, but these flag-setting MOVS
+  // instructions would otherwise prevent the optimization on Thumb1 (V6M).
+  // However, if we only have MOVS instructions in between the CMP and the
+  // other instruction (the MULS in this example), then CPSR is dead, so we
+  // can safely reorder the sequence into: MOVS, MOVS, MULS, CMP. We do this
+  // reordering and then continue the analysis hoping we can eliminate the
+  // CMP. This peephole runs on virtual registers, i.e. while still in SSA
+  // form; as a consequence, the MOVs won't redefine or kill the MUL operands,
+  // which would otherwise make this reordering illegal.
+ if (MI && IsThumb1) {
+ --I;
+ bool CanReorder = true;
+ const bool HasStmts = I != E;
+ for (; I != E; --I) {
+ if (I->getOpcode() != ARM::tMOVi8) {
+ CanReorder = false;
+ break;
+ }
+ }
+ if (HasStmts && CanReorder) {
+ MI = MI->removeFromParent();
+ E = CmpInstr;
+ CmpInstr.getParent()->insert(E, MI);
+ }
+ I = CmpInstr;
+ E = MI;
+ }
+
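[Editor's note] Schematically (operand lists abbreviated and virtual register numbers invented for illustration), the reordering turns

    %1 = tMUL %a, %b, ...   ; the flag-setting candidate MI
    %2 = tMOVi8 1, ...      ; flag-setting MOVS materializing the select values
    %3 = tMOVi8 0, ...
    tCMPi8 %1, 0            ; CmpInstr

into

    %2 = tMOVi8 1, ...
    %3 = tMOVi8 0, ...
    %1 = tMUL %a, %b, ...   ; now adjacent to the CMP it may make redundant
    tCMPi8 %1, 0

after which the CPSR scan below can eliminate the CMP in favour of the MULS flags.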
// Check that CPSR isn't set between the comparison instruction and the one we
// want to change. At the same time, search for Sub.
const TargetRegisterInfo *TRI = &getRegisterInfo();
@@ -2497,183 +2732,128 @@ bool ARMBaseInstrInfo::optimizeCompareInstr(
if (isPredicated(*MI))
return false;
- bool IsThumb1 = false;
- switch (MI->getOpcode()) {
- default: break;
- case ARM::tLSLri:
- case ARM::tLSRri:
- case ARM::tLSLrr:
- case ARM::tLSRrr:
- case ARM::tSUBrr:
- case ARM::tADDrr:
- case ARM::tADDi3:
- case ARM::tADDi8:
- case ARM::tSUBi3:
- case ARM::tSUBi8:
- IsThumb1 = true;
- LLVM_FALLTHROUGH;
- case ARM::RSBrr:
- case ARM::RSBri:
- case ARM::RSCrr:
- case ARM::RSCri:
- case ARM::ADDrr:
- case ARM::ADDri:
- case ARM::ADCrr:
- case ARM::ADCri:
- case ARM::SUBrr:
- case ARM::SUBri:
- case ARM::SBCrr:
- case ARM::SBCri:
- case ARM::t2RSBri:
- case ARM::t2ADDrr:
- case ARM::t2ADDri:
- case ARM::t2ADCrr:
- case ARM::t2ADCri:
- case ARM::t2SUBrr:
- case ARM::t2SUBri:
- case ARM::t2SBCrr:
- case ARM::t2SBCri:
- case ARM::ANDrr:
- case ARM::ANDri:
- case ARM::t2ANDrr:
- case ARM::t2ANDri:
- case ARM::ORRrr:
- case ARM::ORRri:
- case ARM::t2ORRrr:
- case ARM::t2ORRri:
- case ARM::EORrr:
- case ARM::EORri:
- case ARM::t2EORrr:
- case ARM::t2EORri:
- case ARM::t2LSRri:
- case ARM::t2LSRrr:
- case ARM::t2LSLri:
- case ARM::t2LSLrr: {
- // Scan forward for the use of CPSR
- // When checking against MI: if it's a conditional code that requires
- // checking of the V bit or C bit, then this is not safe to do.
- // It is safe to remove CmpInstr if CPSR is redefined or killed.
- // If we are done with the basic block, we need to check whether CPSR is
- // live-out.
- SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
- OperandsToUpdate;
- bool isSafe = false;
- I = CmpInstr;
- E = CmpInstr.getParent()->end();
- while (!isSafe && ++I != E) {
- const MachineInstr &Instr = *I;
- for (unsigned IO = 0, EO = Instr.getNumOperands();
- !isSafe && IO != EO; ++IO) {
- const MachineOperand &MO = Instr.getOperand(IO);
- if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
- isSafe = true;
- break;
- }
- if (!MO.isReg() || MO.getReg() != ARM::CPSR)
- continue;
- if (MO.isDef()) {
- isSafe = true;
- break;
- }
- // Condition code is after the operand before CPSR except for VSELs.
- ARMCC::CondCodes CC;
- bool IsInstrVSel = true;
- switch (Instr.getOpcode()) {
- default:
- IsInstrVSel = false;
- CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
- break;
- case ARM::VSELEQD:
- case ARM::VSELEQS:
- CC = ARMCC::EQ;
- break;
- case ARM::VSELGTD:
- case ARM::VSELGTS:
- CC = ARMCC::GT;
- break;
- case ARM::VSELGED:
- case ARM::VSELGES:
- CC = ARMCC::GE;
- break;
- case ARM::VSELVSS:
- case ARM::VSELVSD:
- CC = ARMCC::VS;
- break;
- }
+ // Scan forward for the use of CPSR
+ // When checking against MI: if it's a conditional code that requires
+ // checking of the V bit or C bit, then this is not safe to do.
+ // It is safe to remove CmpInstr if CPSR is redefined or killed.
+ // If we are done with the basic block, we need to check whether CPSR is
+ // live-out.
+ SmallVector<std::pair<MachineOperand*, ARMCC::CondCodes>, 4>
+ OperandsToUpdate;
+ bool isSafe = false;
+ I = CmpInstr;
+ E = CmpInstr.getParent()->end();
+ while (!isSafe && ++I != E) {
+ const MachineInstr &Instr = *I;
+ for (unsigned IO = 0, EO = Instr.getNumOperands();
+ !isSafe && IO != EO; ++IO) {
+ const MachineOperand &MO = Instr.getOperand(IO);
+ if (MO.isRegMask() && MO.clobbersPhysReg(ARM::CPSR)) {
+ isSafe = true;
+ break;
+ }
+ if (!MO.isReg() || MO.getReg() != ARM::CPSR)
+ continue;
+ if (MO.isDef()) {
+ isSafe = true;
+ break;
+ }
+ // Condition code is after the operand before CPSR except for VSELs.
+ ARMCC::CondCodes CC;
+ bool IsInstrVSel = true;
+ switch (Instr.getOpcode()) {
+ default:
+ IsInstrVSel = false;
+ CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+ break;
+ case ARM::VSELEQD:
+ case ARM::VSELEQS:
+ CC = ARMCC::EQ;
+ break;
+ case ARM::VSELGTD:
+ case ARM::VSELGTS:
+ CC = ARMCC::GT;
+ break;
+ case ARM::VSELGED:
+ case ARM::VSELGES:
+ CC = ARMCC::GE;
+ break;
+ case ARM::VSELVSS:
+ case ARM::VSELVSD:
+ CC = ARMCC::VS;
+ break;
+ }
- if (Sub) {
- ARMCC::CondCodes NewCC = getSwappedCondition(CC);
- if (NewCC == ARMCC::AL)
- return false;
- // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
- // on CMP needs to be updated to be based on SUB.
- // Push the condition code operands to OperandsToUpdate.
- // If it is safe to remove CmpInstr, the condition code of these
- // operands will be modified.
- if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
- Sub->getOperand(2).getReg() == SrcReg) {
- // VSel doesn't support condition code update.
- if (IsInstrVSel)
- return false;
- OperandsToUpdate.push_back(
- std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
- }
- } else {
- // No Sub, so this is x = <op> y, z; cmp x, 0.
- switch (CC) {
- case ARMCC::EQ: // Z
- case ARMCC::NE: // Z
- case ARMCC::MI: // N
- case ARMCC::PL: // N
- case ARMCC::AL: // none
- // CPSR can be used multiple times, we should continue.
- break;
- case ARMCC::HS: // C
- case ARMCC::LO: // C
- case ARMCC::VS: // V
- case ARMCC::VC: // V
- case ARMCC::HI: // C Z
- case ARMCC::LS: // C Z
- case ARMCC::GE: // N V
- case ARMCC::LT: // N V
- case ARMCC::GT: // Z N V
- case ARMCC::LE: // Z N V
- // The instruction uses the V bit or C bit which is not safe.
+ if (Sub) {
+ ARMCC::CondCodes NewCC = getSwappedCondition(CC);
+ if (NewCC == ARMCC::AL)
+ return false;
+ // If we have SUB(r1, r2) and CMP(r2, r1), the condition code based
+ // on CMP needs to be updated to be based on SUB.
+ // Push the condition code operands to OperandsToUpdate.
+ // If it is safe to remove CmpInstr, the condition code of these
+ // operands will be modified.
+ if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
+ Sub->getOperand(2).getReg() == SrcReg) {
+ // VSel doesn't support condition code update.
+ if (IsInstrVSel)
return false;
- }
+ OperandsToUpdate.push_back(
+ std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
}
- }
- }
-
- // If CPSR is not killed nor re-defined, we should check whether it is
- // live-out. If it is live-out, do not optimize.
- if (!isSafe) {
- MachineBasicBlock *MBB = CmpInstr.getParent();
- for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
- SE = MBB->succ_end(); SI != SE; ++SI)
- if ((*SI)->isLiveIn(ARM::CPSR))
+ } else {
+ // No Sub, so this is x = <op> y, z; cmp x, 0.
+ switch (CC) {
+ case ARMCC::EQ: // Z
+ case ARMCC::NE: // Z
+ case ARMCC::MI: // N
+ case ARMCC::PL: // N
+ case ARMCC::AL: // none
+ // CPSR can be used multiple times, we should continue.
+ break;
+ case ARMCC::HS: // C
+ case ARMCC::LO: // C
+ case ARMCC::VS: // V
+ case ARMCC::VC: // V
+ case ARMCC::HI: // C Z
+ case ARMCC::LS: // C Z
+ case ARMCC::GE: // N V
+ case ARMCC::LT: // N V
+ case ARMCC::GT: // Z N V
+ case ARMCC::LE: // Z N V
+ // The instruction uses the V bit or C bit which is not safe.
return false;
+ }
+ }
}
+ }
- // Toggle the optional operand to CPSR (if it exists - in Thumb1 we always
- // set CPSR so this is represented as an explicit output)
- if (!IsThumb1) {
- MI->getOperand(5).setReg(ARM::CPSR);
- MI->getOperand(5).setIsDef(true);
- }
- assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
- CmpInstr.eraseFromParent();
-
- // Modify the condition code of operands in OperandsToUpdate.
- // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
- // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
- for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
- OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
- return true;
+ // If CPSR is not killed nor re-defined, we should check whether it is
+ // live-out. If it is live-out, do not optimize.
+ if (!isSafe) {
+ MachineBasicBlock *MBB = CmpInstr.getParent();
+ for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(ARM::CPSR))
+ return false;
}
+
+ // Toggle the optional operand to CPSR (if it exists - in Thumb1 we always
+ // set CPSR so this is represented as an explicit output)
+ if (!IsThumb1) {
+ MI->getOperand(5).setReg(ARM::CPSR);
+ MI->getOperand(5).setIsDef(true);
}
-
- return false;
+ assert(!isPredicated(*MI) && "Can't use flags from predicated instruction");
+ CmpInstr.eraseFromParent();
+
+ // Modify the condition code of operands in OperandsToUpdate.
+ // Since we have SUB(r1, r2) and CMP(r2, r1), the condition code needs to
+ // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
+ for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++)
+ OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second);
+
+ return true;
}
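[Editor's note] The condition-code filter above is what keeps this transformation sound: a flag-setting arithmetic instruction produces the same N and Z bits as a compare of its result against zero, but its C and V bits follow add/subtract semantics rather than CMP-against-zero semantics. A small assumed example:

    adds r0, r1, r2   ; sets N, Z, C, V from the addition
    cmp  r0, #0       ; candidate for removal
    beq  ...          ; reads only Z  -> safe to drop the CMP
    bge  ...          ; reads N and V -> unsafe, the CMP must stay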
bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
@@ -2728,7 +2908,7 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
switch (UseOpc) {
default: break;
case ARM::ADDrr:
- case ARM::SUBrr: {
+ case ARM::SUBrr:
if (UseOpc == ARM::SUBrr && Commute)
return false;
@@ -2744,9 +2924,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
SOImmValV2 = (uint32_t)ARM_AM::getSOImmTwoPartSecond(ImmVal);
break;
- }
case ARM::ORRrr:
- case ARM::EORrr: {
+ case ARM::EORrr:
if (!ARM_AM::isSOImmTwoPartVal(ImmVal))
return false;
SOImmValV1 = (uint32_t)ARM_AM::getSOImmTwoPartFirst(ImmVal);
@@ -2757,9 +2936,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
case ARM::EORrr: NewUseOpc = ARM::EORri; break;
}
break;
- }
case ARM::t2ADDrr:
- case ARM::t2SUBrr: {
+ case ARM::t2SUBrr:
if (UseOpc == ARM::t2SUBrr && Commute)
return false;
@@ -2775,9 +2953,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
SOImmValV2 = (uint32_t)ARM_AM::getT2SOImmTwoPartSecond(ImmVal);
break;
- }
case ARM::t2ORRrr:
- case ARM::t2EORrr: {
+ case ARM::t2EORrr:
if (!ARM_AM::isT2SOImmTwoPartVal(ImmVal))
return false;
SOImmValV1 = (uint32_t)ARM_AM::getT2SOImmTwoPartFirst(ImmVal);
@@ -2789,7 +2966,6 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
}
break;
}
- }
}
}
@@ -2797,11 +2973,12 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg1 = UseMI.getOperand(OpIdx).getReg();
bool isKill = UseMI.getOperand(OpIdx).isKill();
unsigned NewReg = MRI->createVirtualRegister(MRI->getRegClass(Reg));
- AddDefaultCC(
- AddDefaultPred(BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
- get(NewUseOpc), NewReg)
- .addReg(Reg1, getKillRegState(isKill))
- .addImm(SOImmValV1)));
+ BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), get(NewUseOpc),
+ NewReg)
+ .addReg(Reg1, getKillRegState(isKill))
+ .addImm(SOImmValV1)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
UseMI.setDesc(get(NewUseOpc));
UseMI.getOperand(1).setReg(NewReg);
UseMI.getOperand(1).setIsKill();
@@ -3261,6 +3438,22 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
return DefCycle;
}
+bool ARMBaseInstrInfo::isLDMBaseRegInList(const MachineInstr &MI) const {
+ unsigned BaseReg = MI.getOperand(0).getReg();
+ for (unsigned i = 1, sz = MI.getNumOperands(); i < sz; ++i) {
+ const auto &Op = MI.getOperand(i);
+ if (Op.isReg() && Op.getReg() == BaseReg)
+ return true;
+ }
+ return false;
+}
+
+unsigned
+ARMBaseInstrInfo::getLDMVariableDefsSize(const MachineInstr &MI) const {
+  // LDM:     (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops)
+  // LDM_UPD: (outs GPR:$wb),
+  //          (ins GPR:$Rn, pred:$p (2xOp), reglist:$regs, variable_ops)
+ return MI.getNumOperands() + 1 - MI.getDesc().getNumOperands();
+}
+
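[Editor's note] To make the operand arithmetic concrete, take an assumed t2LDMIA_UPD whose register list holds three registers. Its descriptor declares four fixed operands (the writeback def, the base register, and the two predicate operands added by predOps), while the instruction itself carries seven, so the function returns 7 + 1 - 4 = 4: the three loaded registers plus the writeback def.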
int
ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
const MCInstrDesc &DefMCID,
@@ -3413,7 +3606,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::t2LDMDB:
case ARM::t2LDMIA_UPD:
case ARM::t2LDMDB_UPD:
- LdmBypass = 1;
+ LdmBypass = true;
DefCycle = getLDMDefCycle(ItinData, DefMCID, DefClass, DefIdx, DefAlign);
break;
}
@@ -3888,12 +4081,11 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::t2LDRs:
case ARM::t2LDRBs:
case ARM::t2LDRHs:
- case ARM::t2LDRSHs: {
+ case ARM::t2LDRSHs:
// Thumb2 mode: lsl 0-3 only.
Latency -= 2;
break;
}
- }
}
if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
@@ -4032,7 +4224,8 @@ unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr &MI) const {
const MCInstrDesc &MCID = MI.getDesc();
- if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+ if (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) &&
+ !Subtarget.cheapPredicableCPSRDef())) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
return 1;
@@ -4061,7 +4254,8 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
}
const MCInstrDesc &MCID = MI.getDesc();
- if (PredCost && (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR))) {
+ if (PredCost && (MCID.isCall() || (MCID.hasImplicitDefOfPhysReg(ARM::CPSR) &&
+ !Subtarget.cheapPredicableCPSRDef()))) {
// When predicated, CPSR is an additional source operand for CPSR updating
// instructions, this apparently increases their latencies.
*PredCost = 1;
@@ -4180,14 +4374,14 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
- MIB.addMemOperand(MMO);
- AddDefaultPred(MIB);
+ MIB.addMemOperand(MMO).add(predOps(ARMCC::AL));
}
MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg);
- MIB.addReg(Reg, RegState::Kill).addImm(0);
- MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- AddDefaultPred(MIB);
+ MIB.addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
+ .add(predOps(ARMCC::AL));
}
bool
@@ -4222,6 +4416,7 @@ enum ARMExeDomain {
ExeVFP = 1,
ExeNEON = 2
};
+
//
// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
//
@@ -4345,8 +4540,10 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
// Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
MI.setDesc(get(ARM::VORRd));
- AddDefaultPred(
- MIB.addReg(DstReg, RegState::Define).addReg(SrcReg).addReg(SrcReg));
+ MIB.addReg(DstReg, RegState::Define)
+ .addReg(SrcReg)
+ .addReg(SrcReg)
+ .add(predOps(ARMCC::AL));
break;
case ARM::VMOVRS:
if (Domain != ExeNEON)
@@ -4366,9 +4563,10 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
// Note that DSrc has been widened and the other lane may be undef, which
// contaminates the entire register.
MI.setDesc(get(ARM::VGETLNi32));
- AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
- .addReg(DReg, RegState::Undef)
- .addImm(Lane));
+ MIB.addReg(DstReg, RegState::Define)
+ .addReg(DReg, RegState::Undef)
+ .addImm(Lane)
+ .add(predOps(ARMCC::AL));
// The old source should be an implicit use, otherwise we might think it
// was dead before here.
@@ -4398,8 +4596,8 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
MIB.addReg(DReg, RegState::Define)
.addReg(DReg, getUndefRegState(!MI.readsRegister(DReg, TRI)))
.addReg(SrcReg)
- .addImm(Lane);
- AddDefaultPred(MIB);
+ .addImm(Lane)
+ .add(predOps(ARMCC::AL));
// The narrower destination must be marked as set to keep previous chains
// in place.
@@ -4433,8 +4631,8 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
MI.setDesc(get(ARM::VDUPLN32d));
MIB.addReg(DDst, RegState::Define)
.addReg(DDst, getUndefRegState(!MI.readsRegister(DDst, TRI)))
- .addImm(SrcLane);
- AddDefaultPred(MIB);
+ .addImm(SrcLane)
+ .add(predOps(ARMCC::AL));
// Neither the source or the destination are naturally represented any
// more, so add them in manually.
@@ -4470,10 +4668,9 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
CurUndef = !MI.readsRegister(CurReg, TRI);
- NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
-
- NewMIB.addImm(1);
- AddDefaultPred(NewMIB);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef))
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
if (SrcLane == DstLane)
NewMIB.addReg(SrcReg, RegState::Implicit);
@@ -4489,10 +4686,9 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
CurUndef = CurReg == DSrc && !MI.readsRegister(CurReg, TRI);
- MIB.addReg(CurReg, getUndefRegState(CurUndef));
-
- MIB.addImm(1);
- AddDefaultPred(MIB);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef))
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
if (SrcLane != DstLane)
MIB.addReg(SrcReg, RegState::Implicit);
@@ -4505,7 +4701,6 @@ void ARMBaseInstrInfo::setExecutionDomain(MachineInstr &MI,
break;
}
}
-
}
//===----------------------------------------------------------------------===//
@@ -4613,9 +4808,9 @@ void ARMBaseInstrInfo::breakPartialRegDependency(
// Insert the dependency-breaking FCONSTD before MI.
// 96 is the encoding of 0.5, but the actual value doesn't matter here.
- AddDefaultPred(
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
- .addImm(96));
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(ARM::FCONSTD), DReg)
+ .addImm(96)
+ .add(predOps(ARMCC::AL));
MI.addRegisterKilled(DReg, TRI, true);
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index b01d5c8..c52e572 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -17,16 +17,21 @@
#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/CodeGen.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <array>
+#include <cstdint>
#define GET_INSTRINFO_HEADER
#include "ARMGenInstrInfo.inc"
namespace llvm {
- class ARMSubtarget;
- class ARMBaseRegisterInfo;
+
+class ARMBaseRegisterInfo;
+class ARMSubtarget;
class ARMBaseInstrInfo : public ARMGenInstrInfo {
const ARMSubtarget &Subtarget;
@@ -100,13 +105,9 @@ public:
// Return whether the target has an explicit NOP encoding.
bool hasNOP() const;
- virtual void getNoopForElfTarget(MCInst &NopInst) const {
- getNoopForMachoTarget(NopInst);
- }
-
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is not such an opcode.
- virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+ virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0;
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr &MI,
@@ -156,7 +157,25 @@ public:
bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
+
+ // CPSR defined in instruction
+ static bool isCPSRDefined(const MachineInstr &MI);
+ bool isAddrMode3OpImm(const MachineInstr &MI, unsigned Op) const;
+ bool isAddrMode3OpMinusReg(const MachineInstr &MI, unsigned Op) const;
+
+ // Load, scaled register offset
+ bool isLdstScaledReg(const MachineInstr &MI, unsigned Op) const;
+ // Load, scaled register offset, not plus LSL2
+ bool isLdstScaledRegNotPlusLsl2(const MachineInstr &MI, unsigned Op) const;
+ // Minus reg for ldstso addr mode
+ bool isLdstSoMinusReg(const MachineInstr &MI, unsigned Op) const;
+ // Scaled register offset in address mode 2
+ bool isAm2ScaledReg(const MachineInstr &MI, unsigned Op) const;
+ // Load multiple, base reg in list
+ bool isLDMBaseRegInList(const MachineInstr &MI) const;
+  // Get the number of variable operand defs of an LDM.
+ unsigned getLDMVariableDefsSize(const MachineInstr &MI) const;
/// GetInstSize - Returns the size of the specified MachineInstr.
///
@@ -399,27 +418,43 @@ public:
/// Returns true if the instruction has a shift by immediate that can be
/// executed in one cycle less.
bool isSwiftFastImmShift(const MachineInstr *MI) const;
-};
-static inline
-const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
- return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
-}
+ /// Returns predicate register associated with the given frame instruction.
+ unsigned getFramePred(const MachineInstr &MI) const {
+ assert(isFrameInstr(MI));
+ // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP:
+ // - argument declared in the pattern:
+ // 0 - frame size
+ // 1 - arg of CALLSEQ_START/CALLSEQ_END
+ // 2 - predicate code (like ARMCC::AL)
+ // - added by predOps:
+ // 3 - predicate reg
+ return MI.getOperand(3).getReg();
+ }
+};
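[Editor's note] For instance (immediates illustrative), an ADJCALLSTACKDOWN built with predOps(ARMCC::AL) carries the operands

    ADJCALLSTACKDOWN 8, 0, 14, %noreg
    ;                |  |  |   '-- operand 3: predicate register
    ;                |  |  |       (what getFramePred returns)
    ;                |  |  '------ operand 2: predicate code, 14 = ARMCC::AL
    ;                |  '--------- operand 1: CALLSEQ_START/END argument
    ;                '------------ operand 0: frame size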
-static inline
-const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
+/// Get the operands corresponding to the given \p Pred value. By default, the
+/// predicate register is assumed to be 0 (no register), but you can pass in a
+/// \p PredReg if that is not the case.
+static inline std::array<MachineOperand, 2> predOps(ARMCC::CondCodes Pred,
+ unsigned PredReg = 0) {
+ return {{MachineOperand::CreateImm(static_cast<int64_t>(Pred)),
+ MachineOperand::CreateReg(PredReg, false)}};
}
-static inline
-const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
- bool isDead = false) {
- return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+/// Get the operand corresponding to the conditional code result. By default,
+/// this is 0 (no register).
+static inline MachineOperand condCodeOp(unsigned CCReg = 0) {
+ return MachineOperand::CreateReg(CCReg, false);
}
-static inline
-const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
- return MIB.addReg(0);
+/// Get the operand corresponding to the conditional code result for Thumb1.
+/// This operand will always refer to CPSR and it will have the Define flag set.
+/// You can optionally set the Dead flag by means of \p isDead.
+static inline MachineOperand t1CondCodeOp(bool isDead = false) {
+ return MachineOperand::CreateReg(ARM::CPSR,
+ /*Define*/ true, /*Implicit*/ false,
+ /*Kill*/ false, isDead);
}
static inline
@@ -517,6 +552,6 @@ bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
unsigned FrameReg, int &Offset,
const ARMBaseInstrInfo &TII);
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index d995c63..370c0a7 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -18,25 +18,35 @@
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <utility>
#define DEBUG_TYPE "arm-register-info"
@@ -46,7 +56,7 @@
using namespace llvm;
ARMBaseRegisterInfo::ARMBaseRegisterInfo()
- : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), BasePtr(ARM::R6) {}
+ : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC) {}
static unsigned getFramePointerReg(const ARMSubtarget &STI) {
return STI.useR7AsFramePointer() ? ARM::R7 : ARM::R11;
@@ -107,7 +117,7 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (CC == CallingConv::GHC)
- // This is academic becase all GHC calls are (supposed to be) tail calls
+ // This is academic because all GHC calls are (supposed to be) tail calls
return CSR_NoRegs_RegMask;
if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() &&
@@ -140,7 +150,6 @@ ARMBaseRegisterInfo::getSjLjDispatchPreservedMask(const MachineFunction &MF) con
return CSR_FPRegs_RegMask;
}
-
const uint32_t *
ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
@@ -154,7 +163,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
// both or otherwise does not want to enable this optimization, the function
// should return NULL
if (CC == CallingConv::GHC)
- // This is academic becase all GHC calls are (supposed to be) tail calls
+ // This is academic because all GHC calls are (supposed to be) tail calls
return nullptr;
return STI.isTargetDarwin() ? CSR_iOS_ThisReturn_RegMask
: CSR_AAPCS_ThisReturn_RegMask;
@@ -184,10 +193,11 @@ getReservedRegs(const MachineFunction &MF) const {
for (unsigned R = 0; R < 16; ++R)
markSuperRegs(Reserved, ARM::D16 + R);
}
- const TargetRegisterClass *RC = &ARM::GPRPairRegClass;
- for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I)
- for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI)
- if (Reserved.test(*SI)) markSuperRegs(Reserved, *I);
+ const TargetRegisterClass &RC = ARM::GPRPairRegClass;
+ for (unsigned Reg : RC)
+ for (MCSubRegIterator SI(Reg, this); SI.isValid(); ++SI)
+ if (Reserved.test(*SI))
+ markSuperRegs(Reserved, Reg);
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
@@ -236,11 +246,18 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
switch (RC->getID()) {
default:
return 0;
- case ARM::tGPRRegClassID:
- return TFI->hasFP(MF) ? 4 : 5;
+ case ARM::tGPRRegClassID: {
+ // hasFP ends up calling getMaxCallFrameSize(), which may not be
+ // available yet when getRegPressureLimit() is called as part of
+ // ScheduleDAGRRList.
+ bool HasFP = MF.getFrameInfo().isMaxCallFrameSizeComputed()
+ ? TFI->hasFP(MF) : true;
+ return 5 - HasFP;
+ }
case ARM::GPRRegClassID: {
- unsigned FP = TFI->hasFP(MF) ? 1 : 0;
- return 10 - FP - (STI.isR9Reserved() ? 1 : 0);
+ bool HasFP = MF.getFrameInfo().isMaxCallFrameSizeComputed()
+ ? TFI->hasFP(MF) : true;
+ return 10 - HasFP - (STI.isR9Reserved() ? 1 : 0);
}
case ARM::SPRRegClassID: // Currently not used as 'rep' register class.
case ARM::DPRRegClassID:
@@ -299,8 +316,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
Hints.push_back(PairedPhys);
// Then prefer even or odd registers.
- for (unsigned I = 0, E = Order.size(); I != E; ++I) {
- unsigned Reg = Order[I];
+ for (unsigned Reg : Order) {
if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd)
continue;
// Don't provide hints that are paired to a reserved register.
@@ -425,10 +441,11 @@ void ARMBaseRegisterInfo::emitLoadConstPool(
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
- .addReg(DestReg, getDefRegState(true), SubIdx)
- .addConstantPoolIndex(Idx)
- .addImm(0).addImm(Pred).addReg(PredReg)
- .setMIFlags(MIFlags);
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx)
+ .addImm(0)
+ .add(predOps(Pred, PredReg))
+ .setMIFlags(MIFlags);
}
bool ARMBaseRegisterInfo::
@@ -474,26 +491,23 @@ getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const {
Scale = 4;
break;
}
- case ARMII::AddrMode2: {
+ case ARMII::AddrMode2:
ImmIdx = Idx+2;
InstrOffs = ARM_AM::getAM2Offset(MI->getOperand(ImmIdx).getImm());
if (ARM_AM::getAM2Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs = -InstrOffs;
break;
- }
- case ARMII::AddrMode3: {
+ case ARMII::AddrMode3:
ImmIdx = Idx+2;
InstrOffs = ARM_AM::getAM3Offset(MI->getOperand(ImmIdx).getImm());
if (ARM_AM::getAM3Op(MI->getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs = -InstrOffs;
break;
- }
- case ARMII::AddrModeT1_s: {
+ case ARMII::AddrModeT1_s:
ImmIdx = Idx+1;
InstrOffs = MI->getOperand(ImmIdx).getImm();
Scale = 4;
break;
- }
default:
llvm_unreachable("Unsupported addressing mode!");
}
@@ -609,7 +623,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addFrameIndex(FrameIdx).addImm(Offset);
if (!AFI->isThumb1OnlyFunction())
- AddDefaultCC(AddDefaultPred(MIB));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
}
void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
@@ -636,7 +650,7 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
assert(AFI->isThumb2Function());
Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
}
- assert (Done && "Unable to resolve frame index!");
+ assert(Done && "Unable to resolve frame index!");
(void)Done;
}
@@ -645,11 +659,8 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned Ba
const MCInstrDesc &Desc = MI->getDesc();
unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
unsigned i = 0;
-
- while (!MI->getOperand(i).isFI()) {
- ++i;
- assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!");
- }
+ for (; !MI->getOperand(i).isFI(); ++i)
+ assert(i+1 < MI->getNumOperands() && "Instr doesn't have FrameIndex operand!");
// AddrMode4 and AddrMode6 cannot handle any offset.
if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
@@ -799,7 +810,8 @@ bool ARMBaseRegisterInfo::shouldCoalesce(MachineInstr *MI,
if (!DstSubReg)
return true;
// Small registers don't frequently cause a problem, so we can coalesce them.
- if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+ if (getRegSizeInBits(*NewRC) < 256 && getRegSizeInBits(*DstRC) < 256 &&
+ getRegSizeInBits(*SrcRC) < 256)
return true;
auto NewRCWeight =
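The getRegPressureLimit() hunk above must not call TFI->hasFP() before the max
call frame size has been computed, because that query can assert when the pass
runs early (e.g. from ScheduleDAGRRList), so it falls back to the conservative
answer instead. A minimal standalone sketch of that guard pattern, using
hypothetical stand-in types rather than the LLVM ones:

    #include <cassert>

    // Stand-in for MachineFrameInfo/TargetFrameLowering: hasFP() asserts
    // unless its precondition has been established first.
    struct FrameInfo {
      bool Computed = false;
      bool UsesFP = false;
      bool hasFP() const {
        assert(Computed && "max call frame size not computed yet");
        return UsesFP;
      }
    };

    // Same shape as the patched code: consult the asserting query only when
    // it is safe, otherwise assume a frame pointer is in use (conservative).
    unsigned pressureLimit(const FrameInfo &FI, bool R9Reserved) {
      bool HasFP = FI.Computed ? FI.hasFP() : true;
      return 10 - HasFP - (R9Reserved ? 1 : 0);
    }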
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index 330e153..2e91d9d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -15,24 +15,33 @@
#define LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <cstdint>
#define GET_REGINFO_HEADER
#include "ARMGenRegisterInfo.inc"
namespace llvm {
+
/// Register allocation hints.
namespace ARMRI {
+
enum {
RegPairOdd = 1,
RegPairEven = 2
};
-}
+
+} // end namespace ARMRI
/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
/// or a stack/pc register that we should push/pop.
static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
using namespace ARM;
+
switch (Reg) {
case R0: case R1: case R2: case R3:
case R4: case R5: case R6: case R7:
@@ -48,6 +57,7 @@ static inline bool isARMArea1Register(unsigned Reg, bool isIOS) {
static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
using namespace ARM;
+
switch (Reg) {
case R8: case R9: case R10: case R11: case R12:
// iOS has this second area.
@@ -59,6 +69,7 @@ static inline bool isARMArea2Register(unsigned Reg, bool isIOS) {
static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
using namespace ARM;
+
switch (Reg) {
case D15: case D14: case D13: case D12:
case D11: case D10: case D9: case D8:
@@ -87,7 +98,7 @@ protected:
/// BasePtr - ARM physical register used as a base ptr in complex stack
/// frames. I.e., when we need a 3rd base, not just SP and FP, due to
/// variable size stack objects.
- unsigned BasePtr;
+ unsigned BasePtr = ARM::R6;
// Can be only subclassed.
explicit ARMBaseRegisterInfo();
@@ -198,4 +209,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMBASEREGISTERINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h b/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
index 780544f..e0cb0aa 100644
--- a/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
#define LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
-#include "ARM.h"
-#include "ARMMachineFunctionInfo.h"
-using namespace llvm;
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cstdint>
namespace llvm {
@@ -44,31 +44,30 @@ struct BasicBlockInfo {
///
/// Because worst case padding is used, the computed offset of an aligned
/// block may not actually be aligned.
- unsigned Offset;
+ unsigned Offset = 0;
/// Size - Size of the basic block in bytes. If the block contains
/// inline assembly, this is a worst case estimate.
///
/// The size does not include any alignment padding whether from the
/// beginning of the block, or from an aligned jump table at the end.
- unsigned Size;
+ unsigned Size = 0;
/// KnownBits - The number of low bits in Offset that are known to be
/// exact. The remaining bits of Offset are an upper bound.
- uint8_t KnownBits;
+ uint8_t KnownBits = 0;
/// Unalign - When non-zero, the block contains instructions (inline asm)
/// of unknown size. The real size may be smaller than Size bytes by a
/// multiple of 1 << Unalign.
- uint8_t Unalign;
+ uint8_t Unalign = 0;
/// PostAlign - When non-zero, the block terminator contains a .align
/// directive, so the end of the block is aligned to 1 << PostAlign
/// bytes.
- uint8_t PostAlign;
+ uint8_t PostAlign = 0;
- BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0),
- PostAlign(0) {}
+ BasicBlockInfo() = default;
/// Compute the number of known offset bits internally to this block.
/// This number should be used to predict worst case padding when
@@ -107,4 +106,4 @@ struct BasicBlockInfo {
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMBASICBLOCKINFO_H
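The BasicBlockInfo hunks above are the usual C++11 modernization: defaults move
from a hand-written constructor into in-class member initializers, and the
constructor becomes `= default`. A self-contained sketch of the before/after
shape:

    #include <cstdint>

    // Before: every default is repeated in the constructor's init list.
    struct BlockInfoOld {
      unsigned Offset;
      unsigned Size;
      uint8_t KnownBits;
      BlockInfoOld() : Offset(0), Size(0), KnownBits(0) {}
    };

    // After: defaults sit next to the declarations; adding a field can no
    // longer leave it uninitialized by forgetting the init list.
    struct BlockInfoNew {
      unsigned Offset = 0;
      unsigned Size = 0;
      uint8_t KnownBits = 0;
      BlockInfoNew() = default;
    };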
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
index 52c95b6..051827a 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -17,8 +17,11 @@
#include "ARMBaseInstrInfo.h"
#include "ARMISelLowering.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;
@@ -30,25 +33,61 @@ using namespace llvm;
ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
: CallLowering(&TLI) {}
-static bool isSupportedType(const DataLayout DL, const ARMTargetLowering &TLI,
+static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
Type *T) {
- EVT VT = TLI.getValueType(DL, T);
- if (!VT.isSimple() || !VT.isInteger() || VT.isVector())
+ if (T->isArrayTy())
+ return true;
+
+ if (T->isStructTy()) {
+ // For now we only allow homogeneous structs that we can manipulate with
+ // G_MERGE_VALUES and G_UNMERGE_VALUES
+ auto StructT = cast<StructType>(T);
+ for (unsigned i = 1, e = StructT->getNumElements(); i != e; ++i)
+ if (StructT->getElementType(i) != StructT->getElementType(0))
+ return false;
+ return true;
+ }
+
+ EVT VT = TLI.getValueType(DL, T, true);
+ if (!VT.isSimple() || VT.isVector() ||
+ !(VT.isInteger() || VT.isFloatingPoint()))
return false;
unsigned VTSize = VT.getSimpleVT().getSizeInBits();
- return VTSize == 8 || VTSize == 16 || VTSize == 32;
+
+ if (VTSize == 64)
+ // FIXME: Support i64 too
+ return VT.isFloatingPoint();
+
+ return VTSize == 1 || VTSize == 8 || VTSize == 16 || VTSize == 32;
}
namespace {
-struct FuncReturnHandler : public CallLowering::ValueHandler {
- FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder &MIB)
- : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+/// Helper class for values going out through an ABI boundary (used for handling
+/// function return values and call parameters).
+struct OutgoingValueHandler : public CallLowering::ValueHandler {
+ OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), StackSize(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
- llvm_unreachable("Don't know how to get a stack address yet");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
+
+ LLT p0 = LLT::pointer(0, 32);
+ LLT s32 = LLT::scalar(32);
+ unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, ARM::SP);
+
+ unsigned OffsetReg = MRI.createGenericVirtualRegister(s32);
+ MIRBuilder.buildConstant(OffsetReg, Offset);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+ MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ return AddrReg;
}
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
@@ -56,26 +95,123 @@ struct FuncReturnHandler : public CallLowering::ValueHandler {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
- assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
- assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
-
- assert(VA.getLocInfo() != CCValAssign::SExt &&
- VA.getLocInfo() != CCValAssign::ZExt &&
- "ABI extensions not supported yet");
+ assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size");
+ assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size");
- MIRBuilder.buildCopy(PhysReg, ValVReg);
+ unsigned ExtReg = extendRegister(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- llvm_unreachable("Don't know how to assign a value to an address yet");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
+
+ unsigned ExtReg = extendRegister(ValVReg, VA);
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
+ /* Alignment */ 0);
+ MIRBuilder.buildStore(ExtReg, Addr, *MMO);
+ }
+
+ unsigned assignCustomValue(const CallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) override {
+ CCValAssign VA = VAs[0];
+ assert(VA.needsCustom() && "Value doesn't need custom handling");
+ assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ CCValAssign NextVA = VAs[1];
+ assert(NextVA.needsCustom() && "Value doesn't need custom handling");
+ assert(NextVA.getValVT() == MVT::f64 && "Unsupported type");
+
+ assert(VA.getValNo() == NextVA.getValNo() &&
+ "Values belong to different arguments");
+
+ assert(VA.isRegLoc() && "Value should be in reg");
+ assert(NextVA.isRegLoc() && "Value should be in reg");
+
+ unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+ MRI.createGenericVirtualRegister(LLT::scalar(32))};
+ MIRBuilder.buildUnmerge(NewRegs, Arg.Reg);
+
+ bool IsLittle = MIRBuilder.getMF().getSubtarget<ARMSubtarget>().isLittle();
+ if (!IsLittle)
+ std::swap(NewRegs[0], NewRegs[1]);
+
+ assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
+ assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
+
+ return 1;
+ }
+
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, CCState &State) override {
+ if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State))
+ return true;
+
+ StackSize =
+ std::max(StackSize, static_cast<uint64_t>(State.getNextStackOffset()));
+ return false;
}
MachineInstrBuilder &MIB;
+ uint64_t StackSize;
};
} // End anonymous namespace.
+void ARMCallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+ MachineFunction &MF, const SplitArgTy &PerformArgSplit) const {
+ const ARMTargetLowering &TLI = *getTLI<ARMTargetLowering>();
+ LLVMContext &Ctx = OrigArg.Ty->getContext();
+ const DataLayout &DL = MF.getDataLayout();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function *F = MF.getFunction();
+
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+
+ if (SplitVTs.size() == 1) {
+ // Even if there is no splitting to do, we still want to replace the
+ // original type (e.g. pointer type -> integer).
+ auto Flags = OrigArg.Flags;
+ unsigned OriginalAlignment = DL.getABITypeAlignment(OrigArg.Ty);
+ Flags.setOrigAlign(OriginalAlignment);
+ SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), Flags,
+ OrigArg.IsFixed);
+ return;
+ }
+
+ unsigned FirstRegIdx = SplitArgs.size();
+ for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) {
+ EVT SplitVT = SplitVTs[i];
+ Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
+ auto Flags = OrigArg.Flags;
+
+ unsigned OriginalAlignment = DL.getABITypeAlignment(SplitTy);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ bool NeedsConsecutiveRegisters =
+ TLI.functionArgumentNeedsConsecutiveRegisters(
+ SplitTy, F->getCallingConv(), F->isVarArg());
+ if (NeedsConsecutiveRegisters) {
+ Flags.setInConsecutiveRegs();
+ if (i == e - 1)
+ Flags.setInConsecutiveRegsLast();
+ }
+
+ SplitArgs.push_back(
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
+ SplitTy, Flags, OrigArg.IsFixed});
+ }
+
+ for (unsigned i = 0; i < Offsets.size(); ++i)
+ PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
+}
+
/// Lower the return value for the already existing \p Ret. This assumes that
/// \p MIRBuilder's insertion point is correct.
bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
@@ -93,21 +229,29 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
if (!isSupportedType(DL, TLI, Val->getType()))
return false;
+ SmallVector<ArgInfo, 4> SplitVTs;
+ SmallVector<unsigned, 4> Regs;
+ ArgInfo RetInfo(VReg, Val->getType());
+ setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
+ splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) {
+ Regs.push_back(Reg);
+ });
+
+ if (Regs.size() > 1)
+ MIRBuilder.buildUnmerge(Regs, VReg);
+
CCAssignFn *AssignFn =
TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
- ArgInfo RetInfo(VReg, Val->getType());
- setArgFlags(RetInfo, AttributeSet::ReturnIndex, DL, F);
-
- FuncReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret);
- return handleAssignments(MIRBuilder, AssignFn, RetInfo, RetHandler);
+ OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn);
+ return handleAssignments(MIRBuilder, SplitVTs, RetHandler);
}
bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
assert(!Val == !VReg && "Return value without a vreg");
- auto Ret = AddDefaultPred(MIRBuilder.buildInstrNoInsert(ARM::BX_RET));
+ auto Ret = MIRBuilder.buildInstrNoInsert(ARM::BX_RET).add(predOps(ARMCC::AL));
if (!lowerReturnVal(MIRBuilder, Val, VReg, Ret))
return false;
@@ -117,13 +261,17 @@ bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
}
namespace {
-struct FormalArgHandler : public CallLowering::ValueHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : ValueHandler(MIRBuilder, MRI) {}
+/// Helper class for values coming in through an ABI boundary (used for handling
+/// formal arguments and call return values).
+struct IncomingValueHandler : public CallLowering::ValueHandler {
+ IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
- assert(Size == 4 && "Unsupported size");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
auto &MFI = MIRBuilder.getMF().getFrameInfo();
@@ -139,11 +287,30 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- assert(Size == 4 && "Unsupported size");
+ assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+ "Unsupported size");
+
+ if (VA.getLocInfo() == CCValAssign::SExt ||
+ VA.getLocInfo() == CCValAssign::ZExt) {
+ // If the value is zero- or sign-extended, its size becomes 4 bytes, so
+ // that's what we should load.
+ Size = 4;
+ assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm");
+
+ auto LoadVReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ buildLoad(LoadVReg, Addr, Size, /* Alignment */ 0, MPO);
+ MIRBuilder.buildTrunc(ValVReg, LoadVReg);
+ } else {
+ // If the value is not extended, a simple load will suffice.
+ buildLoad(ValVReg, Addr, Size, /* Alignment */ 0, MPO);
+ }
+ }
+ void buildLoad(unsigned Val, unsigned Addr, uint64_t Size, unsigned Alignment,
+ MachinePointerInfo &MPO) {
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad, Size, /* Alignment */ 0);
- MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ MPO, MachineMemOperand::MOLoad, Size, Alignment);
+ MIRBuilder.buildLoad(Val, Addr, *MMO);
}
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
@@ -151,12 +318,60 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
- assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
- assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+ assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size");
+ assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size");
- MIRBuilder.getMBB().addLiveIn(PhysReg);
+ // The necessary extensions are handled on the other side of the ABI
+ // boundary.
+ markPhysRegUsed(PhysReg);
MIRBuilder.buildCopy(ValVReg, PhysReg);
}
+
+ unsigned assignCustomValue(const ARMCallLowering::ArgInfo &Arg,
+ ArrayRef<CCValAssign> VAs) override {
+ CCValAssign VA = VAs[0];
+ assert(VA.needsCustom() && "Value doesn't need custom handling");
+ assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ CCValAssign NextVA = VAs[1];
+ assert(NextVA.needsCustom() && "Value doesn't need custom handling");
+ assert(NextVA.getValVT() == MVT::f64 && "Unsupported type");
+
+ assert(VA.getValNo() == NextVA.getValNo() &&
+ "Values belong to different arguments");
+
+ assert(VA.isRegLoc() && "Value should be in reg");
+ assert(NextVA.isRegLoc() && "Value should be in reg");
+
+ unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+ MRI.createGenericVirtualRegister(LLT::scalar(32))};
+
+ assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
+ assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
+
+ bool IsLittle = MIRBuilder.getMF().getSubtarget<ARMSubtarget>().isLittle();
+ if (!IsLittle)
+ std::swap(NewRegs[0], NewRegs[1]);
+
+ MIRBuilder.buildMerge(Arg.Reg, NewRegs);
+
+ return 1;
+ }
+
+ /// Marking a physical register as used is different between formal
+ /// parameters, where it's a basic block live-in, and call returns, where it's
+ /// an implicit-def of the call instruction.
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+};
+
+struct FormalArgHandler : public IncomingValueHandler {
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ }
};
} // End anonymous namespace
@@ -170,34 +385,150 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (F.isVarArg())
return false;
- auto DL = MIRBuilder.getMF().getDataLayout();
+ auto &MF = MIRBuilder.getMF();
+ auto &MBB = MIRBuilder.getMBB();
+ auto DL = MF.getDataLayout();
auto &TLI = *getTLI<ARMTargetLowering>();
- auto &Args = F.getArgumentList();
- unsigned ArgIdx = 0;
- for (auto &Arg : Args) {
- ArgIdx++;
- if (!isSupportedType(DL, TLI, Arg.getType()))
- return false;
+ auto Subtarget = TLI.getSubtarget();
+
+ if (Subtarget->isThumb())
+ return false;
- // FIXME: This check as well as ArgIdx are going away as soon as we support
- // loading values < 32 bits.
- if (ArgIdx > 4 && Arg.getType()->getIntegerBitWidth() != 32)
+ for (auto &Arg : F.args())
+ if (!isSupportedType(DL, TLI, Arg.getType()))
return false;
- }
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg());
+ FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(),
+ AssignFn);
+
SmallVector<ArgInfo, 8> ArgInfos;
+ SmallVector<unsigned, 4> SplitRegs;
unsigned Idx = 0;
- for (auto &Arg : Args) {
+ for (auto &Arg : F.args()) {
ArgInfo AInfo(VRegs[Idx], Arg.getType());
- setArgFlags(AInfo, Idx + 1, DL, F);
- ArgInfos.push_back(AInfo);
+ setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F);
+
+ SplitRegs.clear();
+
+ splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
+ SplitRegs.push_back(Reg);
+ });
+
+ if (!SplitRegs.empty())
+ MIRBuilder.buildMerge(VRegs[Idx], SplitRegs);
+
Idx++;
}
- FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo());
- return handleAssignments(MIRBuilder, AssignFn, ArgInfos, ArgHandler);
+ if (!MBB.empty())
+ MIRBuilder.setInstr(*MBB.begin());
+
+ return handleAssignments(MIRBuilder, ArgInfos, ArgHandler);
+}
+
+namespace {
+struct CallReturnHandler : public IncomingValueHandler {
+ CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIB.addDef(PhysReg, RegState::Implicit);
+ }
+
+ MachineInstrBuilder MIB;
+};
+} // End anonymous namespace.
+
+bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
+ const MachineOperand &Callee,
+ const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const auto &TLI = *getTLI<ARMTargetLowering>();
+ const auto &DL = MF.getDataLayout();
+ const auto &STI = MF.getSubtarget();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ if (MF.getSubtarget<ARMSubtarget>().genLongCalls())
+ return false;
+
+ auto CallSeqStart = MIRBuilder.buildInstr(ARM::ADJCALLSTACKDOWN);
+
+ // Create the call instruction so we can add the implicit uses of arg
+ // registers, but don't insert it yet.
+ auto MIB = MIRBuilder.buildInstrNoInsert(ARM::BLX).add(Callee).addRegMask(
+ TRI->getCallPreservedMask(MF, CallConv));
+ if (Callee.isReg()) {
+ auto CalleeReg = Callee.getReg();
+ if (CalleeReg && !TRI->isPhysicalRegister(CalleeReg))
+ MIB->getOperand(0).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *STI.getInstrInfo(), *STI.getRegBankInfo(),
+ *MIB.getInstr(), MIB->getDesc(), CalleeReg, 0));
+ }
+
+ SmallVector<ArgInfo, 8> ArgInfos;
+ for (auto Arg : OrigArgs) {
+ if (!isSupportedType(DL, TLI, Arg.Ty))
+ return false;
+
+ if (!Arg.IsFixed)
+ return false;
+
+ SmallVector<unsigned, 8> Regs;
+ splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) {
+ Regs.push_back(Reg);
+ });
+
+ if (Regs.size() > 1)
+ MIRBuilder.buildUnmerge(Regs, Arg.Reg);
+ }
+
+ auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
+ if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
+ return false;
+
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ if (!OrigRet.Ty->isVoidTy()) {
+ if (!isSupportedType(DL, TLI, OrigRet.Ty))
+ return false;
+
+ ArgInfos.clear();
+ SmallVector<unsigned, 8> SplitRegs;
+ splitToValueTypes(OrigRet, ArgInfos, MF,
+ [&](unsigned Reg, uint64_t Offset) {
+ SplitRegs.push_back(Reg);
+ });
+
+ auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false);
+ CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
+ if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
+ return false;
+
+ if (!SplitRegs.empty()) {
+ // We have split the value and allocated each individual piece, now build
+ // it up again.
+ MIRBuilder.buildMerge(OrigRet.Reg, SplitRegs);
+ }
+ }
+
+ // We now know the size of the stack - update the ADJCALLSTACKDOWN
+ // accordingly.
+ CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL));
+
+ MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
+ .addImm(ArgHandler.StackSize)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+
+ return true;
}
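splitToValueTypes() above implements a split-then-recombine protocol: the
splitter creates one ArgInfo per legal piece and reports each piece's register
and bit offset through the PerformArgSplit callback, while the caller gathers
those registers and stitches the original value back together with
G_MERGE_VALUES (or tears it apart with G_UNMERGE_VALUES on the outgoing side).
A hedged, LLVM-free sketch of that control flow, with all names hypothetical:

    #include <cstdint>
    #include <functional>
    #include <vector>

    using Reg = unsigned;
    using SplitArgTy = std::function<void(Reg, uint64_t)>;

    // Splitter side: one piece per offset, reported in bits as in the patch.
    void splitToValueTypes(const std::vector<uint64_t> &ByteOffsets,
                           const SplitArgTy &PerformArgSplit) {
      Reg NextVReg = 1; // stand-in for createGenericVirtualRegister()
      for (uint64_t Off : ByteOffsets)
        PerformArgSplit(NextVReg++, Off * 8);
    }

    // Caller side: collect the pieces, then (in the real code) rebuild the
    // value with MIRBuilder.buildMerge(OrigReg, SplitRegs).
    std::vector<Reg> collectPieces(const std::vector<uint64_t> &Offsets) {
      std::vector<Reg> SplitRegs;
      splitToValueTypes(Offsets,
                        [&](Reg R, uint64_t) { SplitRegs.push_back(R); });
      return SplitRegs;
    }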
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
index 6a1b886..f5a6872 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMCallLowering.h
@@ -34,9 +34,22 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const override;
+
private:
bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
unsigned VReg, MachineInstrBuilder &Ret) const;
+
+ typedef std::function<void(unsigned Reg, uint64_t Offset)> SplitArgTy;
+
+ /// Split an argument into one or more arguments that the CC lowering can cope
+ /// with (e.g. replace pointers with integers).
+ void splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ MachineFunction &MF,
+ const SplitArgTy &PerformArgSplit) const;
};
} // End of namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
index 7a7b7fe..bc7afdb 100644
--- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -273,9 +273,9 @@ def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>;
def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
(sub CSR_AAPCS_ThisReturn, R9))>;
-def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP,
- (sequence "R%u", 12, 1),
- (sequence "D%u", 31, 0))>;
+def CSR_iOS_TLSCall
+ : CalleeSavedRegs<(add LR, SP, (sub (sequence "R%u", 12, 1), R9, R12),
+ (sequence "D%u", 31, 0))>;
// C++ TLS access function saves all registers except SP. Try to match
// the order of CSRs in CSR_iOS.
diff --git a/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp b/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp
index 64f187d..e145d0a 100644
--- a/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMComputeBlockSize.cpp
@@ -8,7 +8,15 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMBasicBlockInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <vector>
+
using namespace llvm;
namespace llvm {
@@ -69,4 +77,4 @@ std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF) {
return BBInfo;
}
-} // end namespace
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index be1a37e..667337d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -14,30 +14,50 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMBasicBlockInfo.h"
#include "ARMMachineFunctionInfo.h"
-#include "MCTargetDesc/ARMAddressingModes.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <new>
+#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "arm-cp-islands"
+#define ARM_CP_ISLANDS_OPT_NAME \
+ "ARM constant island placement and branch shortening pass"
STATISTIC(NumCPEs, "Number of constpool entries");
STATISTIC(NumSplit, "Number of uncond branches inserted");
STATISTIC(NumCBrFixed, "Number of cond branches fixed");
@@ -49,7 +69,6 @@ STATISTIC(NumCBZ, "Number of CBZ / CBNZ formed");
STATISTIC(NumJTMoved, "Number of jump table destination blocks moved");
STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
-
static cl::opt<bool>
AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
cl::desc("Adjust basic block layout to better use TB[BH]"));
@@ -64,6 +83,7 @@ static cl::opt<bool> SynthesizeThumb1TBB(
"equivalent to the TBB/TBH instructions"));
namespace {
+
/// ARMConstantIslands - Due to limited PC-relative displacements, ARM
/// requires constant pool entries to be scattered among the instructions
/// inside a function. To do this, it completely ignores the normal LLVM
@@ -76,7 +96,6 @@ namespace {
/// CPE - A constant pool entry that has been placed somewhere, which
/// tracks a list of users.
class ARMConstantIslands : public MachineFunctionPass {
-
std::vector<BasicBlockInfo> BBInfo;
/// WaterList - A sorted list of basic blocks where islands could be placed
@@ -110,12 +129,14 @@ namespace {
bool NegOk;
bool IsSoImm;
bool KnownAlignment;
+
CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
bool neg, bool soimm)
: MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm),
KnownAlignment(false) {
HighWaterMark = CPEMI->getParent();
}
+
/// getMaxDisp - Returns the maximum displacement supported by MI.
/// Correct for unknown alignment.
/// Conservatively subtract 2 bytes to handle weird alignment effects.
@@ -135,6 +156,7 @@ namespace {
MachineInstr *CPEMI;
unsigned CPI;
unsigned RefCount;
+
CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
: CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
};
@@ -148,7 +170,7 @@ namespace {
/// The first half of CPEntries contains generic constants, the second half
/// contains jump tables. Use getCombinedIndex on a generic CPEMI to look up
/// which vector it will be in here.
- std::vector<std::vector<CPEntry> > CPEntries;
+ std::vector<std::vector<CPEntry>> CPEntries;
/// Maps a JT index to the offset in CPEntries containing copies of that
/// table. The equivalent map for a CONSTPOOL_ENTRY is the identity.
@@ -167,6 +189,7 @@ namespace {
unsigned MaxDisp : 31;
bool isCond : 1;
unsigned UncondBr;
+
ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, unsigned ubr)
: MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
};
@@ -195,8 +218,10 @@ namespace {
bool isThumb1;
bool isThumb2;
bool isPositionIndependentOrROPI;
+
public:
static char ID;
+
ARMConstantIslands() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -207,7 +232,7 @@ namespace {
}
StringRef getPassName() const override {
- return "ARM constant island placement and branch shortening pass";
+ return ARM_CP_ISLANDS_OPT_NAME;
}
private:
@@ -264,8 +289,10 @@ namespace {
U.getMaxDisp(), U.NegOk, U.IsSoImm);
}
};
+
char ARMConstantIslands::ID = 0;
-}
+
+} // end anonymous namespace
/// verify - check BBOffsets, BBSizes, alignment of islands
void ARMConstantIslands::verify() {
@@ -295,8 +322,9 @@ void ARMConstantIslands::verify() {
#endif
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
-void ARMConstantIslands::dumpBBs() {
+LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
DEBUG({
for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
const BasicBlockInfo &BBI = BBInfo[J];
@@ -308,12 +336,7 @@ void ARMConstantIslands::dumpBBs() {
}
});
}
-
-/// createARMConstantIslandPass - returns an instance of the constpool
-/// island pass.
-FunctionPass *llvm::createARMConstantIslandPass() {
- return new ARMConstantIslands();
-}
+#endif
bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
@@ -782,6 +805,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
case ARM::LDRcp:
case ARM::t2LDRpci:
case ARM::t2LDRHpci:
+ case ARM::t2LDRBpci:
Bits = 12; // +-offset_12
NegOk = true;
break;
@@ -873,7 +897,6 @@ void ARMConstantIslands::updateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
WaterList.insert(IP, NewBB);
}
-
/// Split the basic block containing MI into two blocks, which are joined by
/// an unconditional branch. Update data structures and renumber blocks to
/// account for this change and returns the newly created block.
@@ -897,8 +920,9 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) {
if (!isThumb)
BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
else
- BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
- .addImm(ARMCC::AL).addReg(0);
+ BuildMI(OrigBB, DebugLoc(), TII->get(Opc))
+ .addMBB(NewBB)
+ .add(predOps(ARMCC::AL));
++NumSplit;
// Update the CFG. All succs of OrigBB are now succs of NewBB.
@@ -1296,8 +1320,9 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
if (!isThumb)
BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
else
- BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
- .addImm(ARMCC::AL).addReg(0);
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr))
+ .addMBB(NewMBB)
+ .add(predOps(ARMCC::AL));
unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
ImmBranches.push_back(ImmBranch(&UserMBB->back(),
MaxDisp, false, UncondBr));
@@ -1477,7 +1502,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex,
// add it to the island.
U.HighWaterMark = NewIsland;
U.CPEMI = BuildMI(NewIsland, DebugLoc(), CPEMI->getDesc())
- .addImm(ID).addOperand(CPEMI->getOperand(1)).addImm(Size);
+ .addImm(ID)
+ .add(CPEMI->getOperand(1))
+ .addImm(Size);
CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
++NumCPEs;
@@ -1681,8 +1708,9 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
Br.MI = &MBB->back();
BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
if (isThumb)
- BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
- .addImm(ARMCC::AL).addReg(0);
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr))
+ .addMBB(DestBB)
+ .add(predOps(ARMCC::AL));
else
BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
@@ -1709,8 +1737,14 @@ bool ARMConstantIslands::undoLRSpillRestore() {
MI->getNumExplicitOperands() == 3) {
// Create the new insn and copy the predicate from the old.
BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1));
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1));
+ MI->eraseFromParent();
+ MadeChange = true;
+ } else if (MI->getOpcode() == ARM::tPUSH &&
+ MI->getOperand(2).getReg() == ARM::LR &&
+ MI->getNumExplicitOperands() == 3) {
+ // Just remove the push.
MI->eraseFromParent();
MadeChange = true;
}
@@ -1792,13 +1826,12 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
Bits = 11;
Scale = 2;
break;
- case ARM::t2Bcc: {
+ case ARM::t2Bcc:
NewOpc = ARM::tBcc;
Bits = 8;
Scale = 2;
break;
}
- }
if (NewOpc) {
unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
@@ -1983,6 +2016,54 @@ static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
&*MBB->begin() == CPEMI;
}
+static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
+ MachineInstr *JumpMI,
+ unsigned &DeadSize) {
+ // Remove a dead add between the LEA and JT, which used to compute EntryReg,
+ // but the JT now uses PC. Finds the last ADD (if any) that def's EntryReg
+ // and is not clobbered / used.
+ MachineInstr *RemovableAdd = nullptr;
+ unsigned EntryReg = JumpMI->getOperand(0).getReg();
+
+ // Find the last ADD to set EntryReg
+ MachineBasicBlock::iterator I(LEAMI);
+ for (++I; &*I != JumpMI; ++I) {
+ if (I->getOpcode() == ARM::t2ADDrs && I->getOperand(0).getReg() == EntryReg)
+ RemovableAdd = &*I;
+ }
+
+ if (!RemovableAdd)
+ return;
+
+ // Ensure EntryReg is not clobbered or used.
+ MachineBasicBlock::iterator J(RemovableAdd);
+ for (++J; &*J != JumpMI; ++J) {
+ for (unsigned K = 0, E = J->getNumOperands(); K != E; ++K) {
+ const MachineOperand &MO = J->getOperand(K);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (MO.isDef() && MO.getReg() == EntryReg)
+ return;
+ if (MO.isUse() && MO.getReg() == EntryReg)
+ return;
+ }
+ }
+
+ DEBUG(dbgs() << "Removing Dead Add: " << *RemovableAdd);
+ RemovableAdd->eraseFromParent();
+ DeadSize += 4;
+}
+
+static bool registerDefinedBetween(unsigned Reg,
+ MachineBasicBlock::iterator From,
+ MachineBasicBlock::iterator To,
+ const TargetRegisterInfo *TRI) {
+ for (auto I = From; I != To; ++I)
+ if (I->modifiesRegister(Reg, TRI))
+ return true;
+ return false;
+}
+
/// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
/// jumptables when it's possible.
bool ARMConstantIslands::optimizeThumb2JumpTables() {
@@ -2060,6 +2141,12 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
IdxReg = Shift->getOperand(2).getReg();
unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
+ // It's important that IdxReg is live until the actual TBB/TBH. Most of
+ // the range is checked later, but the LEA might still clobber it and not
+ // actually get removed.
+ if (BaseReg == IdxReg && !jumpTableFollowsTB(MI, User.CPEMI))
+ continue;
+
MachineInstr *Load = User.MI->getNextNode();
if (Load->getOpcode() != ARM::tLDRr)
continue;
@@ -2069,6 +2156,16 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
continue;
// If we're in PIC mode, there should be another ADD following.
+ auto *TRI = STI->getRegisterInfo();
+
+ // %base cannot be redefined after the load as it will appear before
+ // TBB/TBH like:
+ // %base =
+ // %base =
+ // tBB %base, %idx
+ if (registerDefinedBetween(BaseReg, Load->getNextNode(), MBB->end(), TRI))
+ continue;
+
if (isPositionIndependentOrROPI) {
MachineInstr *Add = Load->getNextNode();
if (Add->getOpcode() != ARM::tADDrr ||
@@ -2078,22 +2175,26 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
continue;
if (Add->getOperand(0).getReg() != MI->getOperand(0).getReg())
continue;
-
+ if (registerDefinedBetween(IdxReg, Add->getNextNode(), MI, TRI))
+ // IdxReg gets redefined in the middle of the sequence.
+ continue;
Add->eraseFromParent();
DeadSize += 2;
} else {
if (Load->getOperand(0).getReg() != MI->getOperand(0).getReg())
continue;
+ if (registerDefinedBetween(IdxReg, Load->getNextNode(), MI, TRI))
+ // IdxReg gets redefined in the middle of the sequence.
+ continue;
}
-
-
+
// Now safe to delete the load and lsl. The LEA will be removed later.
CanDeleteLEA = true;
Shift->eraseFromParent();
Load->eraseFromParent();
DeadSize += 4;
}
-
+
DEBUG(dbgs() << "Shrink JT: " << *MI);
MachineInstr *CPEMI = User.CPEMI;
unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
@@ -2117,7 +2218,10 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
NewJTMI->getOperand(0).setReg(ARM::PC);
NewJTMI->getOperand(0).setIsKill(false);
- if (CanDeleteLEA) {
+ if (CanDeleteLEA) {
+ if (isThumb2)
+ RemoveDeadAddBetweenLEAAndJT(User.MI, MI, DeadSize);
+
User.MI->eraseFromParent();
DeadSize += isThumb2 ? 4 : 2;
@@ -2238,13 +2342,11 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
if (isThumb2)
BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B))
.addMBB(BB)
- .addImm(ARMCC::AL)
- .addReg(0);
+ .add(predOps(ARMCC::AL));
else
BuildMI(NewBB, DebugLoc(), TII->get(ARM::tB))
.addMBB(BB)
- .addImm(ARMCC::AL)
- .addReg(0);
+ .add(predOps(ARMCC::AL));
// Update internal data structures to account for the newly inserted MBB.
MF->RenumberBlocks(NewBB);
@@ -2256,3 +2358,12 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
++NumJTInserted;
return NewBB;
}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+ return new ARMConstantIslands();
+}
+
+INITIALIZE_PASS(ARMConstantIslands, "arm-cp-islands", ARM_CP_ISLANDS_OPT_NAME,
+ false, false)
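A mechanical change repeated throughout this patch (several hunks above)
retires AddDefaultPred/AddDefaultCC in favour of the predOps()/condCodeOp()
operand bundles, so the predicate immediate and its register are always added
together. A toy illustration of why bundling the pair is safer, using a
hypothetical mini-builder rather than the real MachineInstrBuilder:

    #include <utility>
    #include <vector>

    struct MiniBuilder {
      std::vector<int> Ops;
      MiniBuilder &addImm(int I) { Ops.push_back(I); return *this; }
      MiniBuilder &addReg(int R) { Ops.push_back(R); return *this; }
      MiniBuilder &add(std::pair<int, int> P) {
        return addImm(P.first).addReg(P.second);
      }
    };

    // Bundles the condition code with its (absent) predicate register, so a
    // call site cannot add one operand and forget the other.
    std::pair<int, int> predOps(int CC, int PredReg = 0) {
      return {CC, PredReg};
    }

    void example() {
      MiniBuilder Old, New;
      Old.addImm(14).addReg(0); // old style: two calls kept in sync by hand
      New.add(predOps(14));     // new style: one atomic helper (14 ~ AL)
    }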
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
index 2d16028..9705c8b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -13,13 +13,17 @@
#include "ARMConstantPoolValue.h"
#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include <cstdlib>
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -44,7 +48,7 @@ ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier),
AddCurrentAddress(addCurrentAddress) {}
-ARMConstantPoolValue::~ARMConstantPoolValue() {}
+ARMConstantPoolValue::~ARMConstantPoolValue() = default;
StringRef ARMConstantPoolValue::getModifierText() const {
switch (Modifier) {
@@ -94,9 +98,11 @@ ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
return false;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void ARMConstantPoolValue::dump() const {
errs() << " " << *this;
}
+#endif
void ARMConstantPoolValue::print(raw_ostream &O) const {
if (Modifier) O << "(" << getModifierText() << ")";
diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
index 5f61832..61c5215 100644
--- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -14,10 +14,11 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
#define LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cstddef>
+#include <string>
+#include <vector>
namespace llvm {
@@ -29,6 +30,7 @@ class LLVMContext;
class MachineBasicBlock;
namespace ARMCP {
+
enum ARMCPKind {
CPValue,
CPExtSymbol,
@@ -47,7 +49,8 @@ namespace ARMCP {
SECREL, /// Section Relative (Windows TLS)
SBREL, /// Static Base Relative (RWPI)
};
-}
+
+} // end namespace ARMCP
/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
/// represent PC-relative displacement between the address of the load
@@ -169,9 +172,11 @@ public:
const GlobalValue *getGV() const;
const BlockAddress *getBlockAddress() const;
+
const GlobalVariable *getPromotedGlobal() const {
return dyn_cast_or_null<GlobalVariable>(GVar);
}
+
const Constant *getPromotedGlobalInit() const {
return CVal;
}
@@ -186,6 +191,7 @@ public:
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
void print(raw_ostream &O) const override;
+
static bool classof(const ARMConstantPoolValue *APV) {
return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA() ||
APV->isPromotedGlobal();
@@ -267,6 +273,6 @@ public:
}
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMCONSTANTPOOLVALUE_H
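The guards added around dump() above follow LLVM's convention that dump
methods exist only in assert-enabled builds (or when LLVM_ENABLE_DUMP is
defined), keeping them out of release binaries. A generic sketch of the idiom,
with the macro name hypothetical:

    #include <iostream>

    struct Node {
      int Value = 0;
    #if !defined(NDEBUG) || defined(MYLIB_ENABLE_DUMP)
      // Debug-only helper: compiled out of release builds entirely, so no
      // shipped binary pays for it or exports the symbol.
      void dump() const { std::cerr << "Node(" << Value << ")\n"; }
    #endif
    };

    void debugPrint(const Node &N) {
    #if !defined(NDEBUG) || defined(MYLIB_ENABLE_DUMP)
      N.dump(); // guarded the same way, so callers stay buildable in release
    #endif
    }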
diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index baa4e03..46d8f0d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -19,6 +19,7 @@
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -30,6 +31,7 @@
#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
+
using namespace llvm;
#define DEBUG_TYPE "arm-pseudo"
@@ -97,9 +99,9 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
const MachineOperand &MO = OldMI.getOperand(i);
assert(MO.isReg() && MO.getReg());
if (MO.isUse())
- UseMI.addOperand(MO);
+ UseMI.add(MO);
else
- DefMI.addOperand(MO);
+ DefMI.add(MO);
}
}
@@ -415,14 +417,14 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
MIB.addReg(D3, RegState::Define | getDeadRegState(DstIsDead));
if (TableEntry->isUpdating)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// For an instruction writing double-spaced subregs, the pseudo instruction
// has an extra operand that is a use of the super-register. Record the
@@ -432,15 +434,15 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
SrcOpIdx = OpIdx++;
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the super-register source operand used for double-spaced subregs over
// to the new instruction as an implicit operand.
if (SrcOpIdx != 0) {
MachineOperand MO = MI.getOperand(SrcOpIdx);
MO.setImplicit(true);
- MIB.addOperand(MO);
+ MIB.add(MO);
}
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
@@ -467,14 +469,14 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
TII->get(TableEntry->RealOpc));
unsigned OpIdx = 0;
if (TableEntry->isUpdating)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
bool SrcIsUndef = MI.getOperand(OpIdx).isUndef();
@@ -490,8 +492,8 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
MIB.addReg(D3, getUndefRegState(SrcIsUndef));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg.
MIB->addRegisterKilled(SrcReg, TRI, true);
@@ -549,14 +551,14 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
}
if (TableEntry->isUpdating)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the addrmode6 operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the am6offset operand.
if (TableEntry->hasWritebackOperand)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Grab the super-register source.
MachineOperand MO = MI.getOperand(OpIdx++);
@@ -579,12 +581,12 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
OpIdx += 1;
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the super-register source to be an implicit source.
MO.setImplicit(true);
- MIB.addOperand(MO);
+ MIB.add(MO);
if (TableEntry->IsLoad)
// Add an implicit def for the super-register.
MIB.addReg(DstReg, RegState::ImplicitDefine | getDeadRegState(DstIsDead));
@@ -605,9 +607,9 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
unsigned OpIdx = 0;
// Transfer the destination register operand.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
if (IsExt)
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
bool SrcIsKill = MI.getOperand(OpIdx).isKill();
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
@@ -616,11 +618,11 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
MIB.addReg(D0);
// Copy the other source register operand.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Add an implicit kill and use for the super-reg.
MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill));
@@ -696,8 +698,8 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
HI16 = HI16.addImm(SOImmValV2);
LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- LO16.addImm(Pred).addReg(PredReg).addReg(0);
- HI16.addImm(Pred).addReg(PredReg).addReg(0);
+ LO16.addImm(Pred).addReg(PredReg).add(condCodeOp());
+ HI16.addImm(Pred).addReg(PredReg).add(condCodeOp());
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
return;
@@ -755,14 +757,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MI.eraseFromParent();
}
-static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- MBB->addLiveIn(*I);
-}
-
/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
-/// possible. This only gets used at -O0 so we don't care about efficiency of the
-/// generated code.
+/// possible. This only gets used at -O0 so we don't care about efficiency of
+/// the generated code.
bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
unsigned LdrexOp, unsigned StrexOp,
@@ -771,16 +768,14 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
bool IsThumb = STI->isThumb();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- MachineOperand &Dest = MI.getOperand(0);
- unsigned StatusReg = MI.getOperand(1).getReg();
- MachineOperand &Addr = MI.getOperand(2);
- MachineOperand &Desired = MI.getOperand(3);
- MachineOperand &New = MI.getOperand(4);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ const MachineOperand &Dest = MI.getOperand(0);
+ unsigned TempReg = MI.getOperand(1).getReg();
+ // Duplicating undef operands into 2 instructions does not guarantee the same
+ // value on both; however, undef should be replaced by xzr anyway.
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(2).getReg();
+ unsigned DesiredReg = MI.getOperand(3).getReg();
+ unsigned NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -793,33 +788,30 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
if (UxtOp) {
MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, DL, TII->get(UxtOp), Desired.getReg())
- .addReg(Desired.getReg(), RegState::Kill);
+ BuildMI(MBB, MBBI, DL, TII->get(UxtOp), DesiredReg)
+ .addReg(DesiredReg, RegState::Kill);
if (!IsThumb)
MIB.addImm(0);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
// .Lloadcmp:
// ldrex rDest, [rAddr]
// cmp rDest, rDesired
// bne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(Dest.getReg());
- LoadCmpBB->addLiveIn(Desired.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
MachineInstrBuilder MIB;
MIB = BuildMI(LoadCmpBB, DL, TII->get(LdrexOp), Dest.getReg());
- MIB.addReg(Addr.getReg());
+ MIB.addReg(AddrReg);
if (LdrexOp == ARM::t2LDREX)
MIB.addImm(0); // a 32-bit Thumb ldrex (only) allows an offset.
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
- AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
- .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
- .addOperand(Desired));
+ BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
+ .addReg(DesiredReg)
+ .add(predOps(ARMCC::AL));
unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
BuildMI(LoadCmpBB, DL, TII->get(Bcc))
.addMBB(DoneBB)
@@ -829,25 +821,21 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
LoadCmpBB->addSuccessor(StoreBB);
// .Lstore:
- // strex rStatus, rNew, [rAddr]
- // cmp rStatus, #0
+ // strex rTempReg, rNew, [rAddr]
+ // cmp rTempReg, #0
// bne .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(New.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
-
-
- MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), StatusReg);
- MIB.addOperand(New);
- MIB.addOperand(Addr);
+ MIB = BuildMI(StoreBB, DL, TII->get(StrexOp), TempReg)
+ .addReg(NewReg)
+ .addReg(AddrReg);
if (StrexOp == ARM::t2STREX)
MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
- AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
- .addReg(StatusReg, RegState::Kill)
- .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(TempReg, RegState::Kill)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(StoreBB, DL, TII->get(Bcc))
.addMBB(LoadCmpBB)
.addImm(ARMCC::NE)
@@ -857,12 +845,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute livein lists.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+ // Do an extra pass around the loop to get loop carried registers right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
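
For orientation, a minimal C++ sketch of the loop this pseudo-expansion emits, written with Clang's ARM exclusive-access builtins. This is illustrative only and not part of the patch; it assumes Clang targeting ARMv7, and the function name is hypothetical.

    #include <stdint.h>

    // Semantics of the expanded CMP_SWAP: returns the previous value.
    uint32_t cmp_swap_32(volatile uint32_t *Addr, uint32_t Desired, uint32_t New) {
      uint32_t Old;
      do {
        Old = __builtin_arm_ldrex(Addr);          // .Lloadcmp: ldrex rDest, [rAddr]
        if (Old != Desired) {                     //            cmp rDest, rDesired
          __builtin_arm_clrex();                  //            bne .Ldone
          return Old;
        }
      } while (__builtin_arm_strex(New, Addr) != 0); // .Lstore: strex rTempReg, rNew, [rAddr]
      return Old;                                    //          cmp rTempReg, #0; bne .Lloadcmp
    }
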
@@ -889,20 +889,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
- unsigned StatusReg = MI.getOperand(1).getReg();
- MachineOperand &Addr = MI.getOperand(2);
- MachineOperand &Desired = MI.getOperand(3);
- MachineOperand &New = MI.getOperand(4);
+ unsigned TempReg = MI.getOperand(1).getReg();
+ // Duplicating undef operands into 2 instructions does not guarantee the same
+ // value on both; however, undef should be replaced by xzr anyway.
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(2).getReg();
+ unsigned DesiredReg = MI.getOperand(3).getReg();
+ MachineOperand New = MI.getOperand(4);
+ New.setIsKill(false);
unsigned DestLo = TRI->getSubReg(Dest.getReg(), ARM::gsub_0);
unsigned DestHi = TRI->getSubReg(Dest.getReg(), ARM::gsub_1);
- unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0);
- unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ unsigned DesiredLo = TRI->getSubReg(DesiredReg, ARM::gsub_0);
+ unsigned DesiredHi = TRI->getSubReg(DesiredReg, ARM::gsub_1);
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -916,28 +915,23 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
// .Lloadcmp:
// ldrexd rDestLo, rDestHi, [rAddr]
// cmp rDestLo, rDesiredLo
- // sbcs rStatus<dead>, rDestHi, rDesiredHi
+ // sbcs rTempReg<dead>, rDestHi, rDesiredHi
// bne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(Dest.getReg());
- LoadCmpBB->addLiveIn(Desired.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
unsigned LDREXD = IsThumb ? ARM::t2LDREXD : ARM::LDREXD;
MachineInstrBuilder MIB;
MIB = BuildMI(LoadCmpBB, DL, TII->get(LDREXD));
addExclusiveRegPair(MIB, Dest, RegState::Define, IsThumb, TRI);
- MIB.addReg(Addr.getReg());
- AddDefaultPred(MIB);
+ MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
unsigned CMPrr = IsThumb ? ARM::tCMPhir : ARM::CMPrr;
- AddDefaultPred(BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
- .addReg(DestLo, getKillRegState(Dest.isDead()))
- .addReg(DesiredLo, getKillRegState(Desired.isDead())));
+ BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
+ .addReg(DestLo, getKillRegState(Dest.isDead()))
+ .addReg(DesiredLo)
+ .add(predOps(ARMCC::AL));
BuildMI(LoadCmpBB, DL, TII->get(CMPrr))
.addReg(DestHi, getKillRegState(Dest.isDead()))
- .addReg(DesiredHi, getKillRegState(Desired.isDead()))
+ .addReg(DesiredHi)
.addImm(ARMCC::EQ).addReg(ARM::CPSR, RegState::Kill);
unsigned Bcc = IsThumb ? ARM::tBcc : ARM::Bcc;
@@ -949,23 +943,19 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
LoadCmpBB->addSuccessor(StoreBB);
// .Lstore:
- // strexd rStatus, rNewLo, rNewHi, [rAddr]
- // cmp rStatus, #0
+ // strexd rTempReg, rNewLo, rNewHi, [rAddr]
+ // cmp rTempReg, #0
// bne .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(New.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
-
unsigned STREXD = IsThumb ? ARM::t2STREXD : ARM::STREXD;
- MIB = BuildMI(StoreBB, DL, TII->get(STREXD), StatusReg);
+ MIB = BuildMI(StoreBB, DL, TII->get(STREXD), TempReg);
addExclusiveRegPair(MIB, New, 0, IsThumb, TRI);
- MIB.addOperand(Addr);
- AddDefaultPred(MIB);
+ MIB.addReg(AddrReg).add(predOps(ARMCC::AL));
unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
- AddDefaultPred(BuildMI(StoreBB, DL, TII->get(CMPri))
- .addReg(StatusReg, RegState::Kill)
- .addImm(0));
+ BuildMI(StoreBB, DL, TII->get(CMPri))
+ .addReg(TempReg, RegState::Kill)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(StoreBB, DL, TII->get(Bcc))
.addMBB(LoadCmpBB)
.addImm(ARMCC::NE)
@@ -975,12 +965,24 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute livein lists.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+ // Do an extra pass around the loop to get loop carried registers right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
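
The 64-bit variant differs mainly in needing a register pair. A one-line sketch under the same assumptions (Clang targeting ARMv7; the helper name is mine):

    // A 64-bit exclusive load lowers to ldrexd, which requires the even/odd
    // GPR pair that the gsub_0/gsub_1 subregister logic above extracts.
    uint64_t load_exclusive_64(volatile uint64_t *Addr) {
      return __builtin_arm_ldrex(Addr); // ldrexd rDestLo, rDestHi, [rAddr]
    }
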
@@ -1026,7 +1028,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Add the default predicate in Thumb mode.
if (STI->isThumb())
- MIB.addImm(ARMCC::AL).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
} else if (RetOpcode == ARM::TCRETURNri) {
BuildMI(MBB, MBBI, dl,
TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr))
@@ -1047,9 +1049,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
MI.getOperand(1).getReg())
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4));
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4));
MI.eraseFromParent();
return true;
@@ -1059,10 +1061,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1070,11 +1072,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVCCsi: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
(MI.getOperand(1).getReg()))
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm())
- .addImm(MI.getOperand(4).getImm()) // 'pred'
- .addOperand(MI.getOperand(5))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .add(MI.getOperand(5))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1082,12 +1084,12 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVCCsr: {
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
(MI.getOperand(1).getReg()))
- .addOperand(MI.getOperand(2))
- .addOperand(MI.getOperand(3))
- .addImm(MI.getOperand(4).getImm())
- .addImm(MI.getOperand(5).getImm()) // 'pred'
- .addOperand(MI.getOperand(6))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .addImm(MI.getOperand(4).getImm())
+ .addImm(MI.getOperand(5).getImm()) // 'pred'
+ .add(MI.getOperand(6))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1097,9 +1099,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
MI.getOperand(1).getReg())
- .addImm(MI.getOperand(2).getImm())
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4));
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4));
MI.eraseFromParent();
return true;
}
@@ -1108,10 +1110,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVi : ARM::MOVi;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
- .addImm(MI.getOperand(2).getImm())
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4))
- .addReg(0); // 's' bit
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1121,10 +1123,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(1).getReg())
- .addImm(MI.getOperand(2).getImm())
- .addImm(MI.getOperand(3).getImm()) // 'pred'
- .addOperand(MI.getOperand(4))
- .addReg(0); // 's' bit
+ .addImm(MI.getOperand(2).getImm())
+ .addImm(MI.getOperand(3).getImm()) // 'pred'
+ .add(MI.getOperand(4))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
@@ -1143,11 +1145,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
}
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
MI.getOperand(1).getReg())
- .addOperand(MI.getOperand(2))
- .addImm(MI.getOperand(3).getImm())
- .addImm(MI.getOperand(4).getImm()) // 'pred'
- .addOperand(MI.getOperand(5))
- .addReg(0); // 's' bit
+ .add(MI.getOperand(2))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .add(MI.getOperand(5))
+ .add(condCodeOp()); // 's' bit
MI.eraseFromParent();
return true;
}
@@ -1187,10 +1189,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
"bits set.");
unsigned bicOpc = AFI->isThumbFunction() ?
ARM::t2BICri : ARM::BICri;
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(bicOpc), ARM::R6)
- .addReg(ARM::R6, RegState::Kill)
- .addImm(MaxAlign-1)));
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6)
+ .addReg(ARM::R6, RegState::Kill)
+ .addImm(MaxAlign - 1)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
}
@@ -1201,24 +1204,25 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVsrl_flag:
case ARM::MOVsra_flag: {
// These are just fancy flag-setting MOV (MOVs) instructions.
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
- MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1))
- .addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
- ARM_AM::lsr : ARM_AM::asr),
- 1)))
- .addReg(ARM::CPSR, RegState::Define);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1))
+ .addImm(ARM_AM::getSORegOpc(
+ (Opcode == ARM::MOVsrl_flag ? ARM_AM::lsr : ARM_AM::asr), 1))
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::CPSR, RegState::Define);
MI.eraseFromParent();
return true;
}
case ARM::RRX: {
// This encodes as "MOVs Rd, Rm, rrx".
MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
- MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1))
- .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
- .addReg(0);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1))
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
return true;
@@ -1241,18 +1245,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
if (!Thumb)
MIB.addImm(0);
- MIB.addImm(static_cast<unsigned>(ARMCC::AL)).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tBLXr : ARM::BLX));
if (Thumb)
- MIB.addImm(static_cast<unsigned>(ARMCC::AL)).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
MIB.addReg(Reg, RegState::Kill);
} else {
MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
TII->get(Thumb ? ARM::tBL : ARM::BL));
if (Thumb)
- MIB.addImm(static_cast<unsigned>(ARMCC::AL)).addReg(0);
+ MIB.add(predOps(ARMCC::AL));
MIB.addExternalSymbol("__aeabi_read_tp", 0);
}
@@ -1268,15 +1272,15 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(NewLdOpc), DstReg)
- .addOperand(MI.getOperand(1)));
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewLdOpc), DstReg)
+ .add(MI.getOperand(1))
+ .add(predOps(ARMCC::AL));
MIB1->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
- MachineInstrBuilder MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(ARM::tPICADD))
- .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(DstReg)
- .addOperand(MI.getOperand(2));
+ MachineInstrBuilder MIB2 =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg)
+ .add(MI.getOperand(2));
TransferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
return true;
@@ -1319,7 +1323,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
if (IsARM)
MIB.addImm(0);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (IsPIC) {
MachineInstrBuilder MIB =
@@ -1329,7 +1333,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addImm(ARMPCLabelIndex);
if (IsARM)
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
MI.eraseFromParent();
@@ -1368,7 +1372,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg).addImm(LabelId);
if (isARM) {
- AddDefaultPred(MIB3);
+ MIB3.add(predOps(ARMCC::AL));
if (Opcode == ARM::MOV_ga_pcrel_ldr)
MIB3->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
}
@@ -1388,9 +1392,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
.addReg(ARM::LR)
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
.addReg(ARM::CPSR, RegState::Undef);
TransferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
@@ -1407,11 +1411,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned DstReg = MI.getOperand(OpIdx++).getReg();
// Copy the source register.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Add the destination operands (D subregs).
unsigned D0 = TRI->getSubReg(DstReg, ARM::dsub_0);
@@ -1438,11 +1442,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
unsigned SrcReg = MI.getOperand(OpIdx++).getReg();
// Copy the destination register.
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Copy the predicate operands.
- MIB.addOperand(MI.getOperand(OpIdx++));
- MIB.addOperand(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
+ MIB.add(MI.getOperand(OpIdx++));
// Add the source operands (D subregs).
unsigned D0 = TRI->getSubReg(SrcReg, ARM::dsub_0);
diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
index df4dcb3..bf00ef6 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
@@ -21,30 +22,61 @@
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -54,24 +86,22 @@ namespace {
enum {
RegBase,
FrameIndexBase
- } BaseType;
+ } BaseType = RegBase;
union {
unsigned Reg;
int FI;
} Base;
- int Offset;
+ int Offset = 0;
// Innocuous defaults for our address.
- Address()
- : BaseType(RegBase), Offset(0) {
- Base.Reg = 0;
- }
+ Address() {
+ Base.Reg = 0;
+ }
} Address;
class ARMFastISel final : public FastISel {
-
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
const ARMSubtarget *Subtarget;
@@ -99,8 +129,9 @@ class ARMFastISel final : public FastISel {
Context = &funcInfo.Fn->getContext();
}
- // Code from FastISel.cpp.
private:
+ // Code from FastISel.cpp.
+
unsigned fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, bool Op0IsKill);
@@ -117,18 +148,18 @@ class ARMFastISel final : public FastISel {
uint64_t Imm);
// Backend specific FastISel code.
- private:
+
bool fastSelectInstruction(const Instruction *I) override;
unsigned fastMaterializeConstant(const Constant *C) override;
unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI) override;
bool fastLowerArguments() override;
- private:
+
#include "ARMGenFastISel.inc"
// Instruction selection routines.
- private:
+
bool SelectLoad(const Instruction *I);
bool SelectStore(const Instruction *I);
bool SelectBranch(const Instruction *I);
@@ -151,12 +182,12 @@ class ARMFastISel final : public FastISel {
bool SelectShift(const Instruction *I, ARM_AM::ShiftOpc ShiftTy);
// Utility routines.
- private:
+
bool isPositionIndependent() const;
bool isTypeLegal(Type *Ty, MVT &VT);
bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt);
+ bool isZExt, bool isEquality);
bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
unsigned Alignment = 0, bool isZExt = true,
bool allocReg = true);
@@ -179,7 +210,7 @@ class ARMFastISel final : public FastISel {
const TargetLowering *getTargetLowering() { return &TLI; }
// Call handling routines.
- private:
+
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
bool Return,
bool isVarArg);
@@ -198,7 +229,7 @@ class ARMFastISel final : public FastISel {
bool ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call);
// OptionalDef handling routines.
- private:
+
bool isARMNEONPred(const MachineInstr *MI);
bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR);
const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB);
@@ -219,8 +250,7 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
return false;
// Look to see if our OptionalDef is defining CPSR or CCR.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isDef()) continue;
if (MO.getReg() == ARM::CPSR)
*CPSR = true;
@@ -236,8 +266,8 @@ bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
AFI->isThumb2Function())
return MI->isPredicable();
- for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
- if (MCID.OpInfo[i].isPredicate())
+ for (const MCOperandInfo &opInfo : MCID.operands())
+ if (opInfo.isPredicate())
return true;
return false;
@@ -256,17 +286,13 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
// Are we NEON in ARM mode and have a predicate operand? If so, we know
// we're not predicable, but add it anyway.
if (isARMNEONPred(MI))
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// Do we optionally set a predicate? Preds is size > 0 iff the predicate
// defines CPSR. All other OptionalDefines in ARM are the CCR register.
bool CPSR = false;
- if (DefinesOptionalPredicate(MI, &CPSR)) {
- if (CPSR)
- AddDefaultT1CC(MIB);
- else
- AddDefaultCC(MIB);
- }
+ if (DefinesOptionalPredicate(MI, &CPSR))
+ MIB.add(CPSR ? t1CondCodeOp() : condCodeOp());
return MIB;
}
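
This is the mechanical change recurring throughout the patch: the predicate and condition-code operands move out of the AddDefaultPred/AddDefaultCC wrappers and into the fluent builder chain. A schematic fragment, assuming the predOps/condCodeOp helpers from ARMBaseInstrInfo.h at this revision (Rd/Rm are placeholders):

    // Old:  AddDefaultCC(AddDefaultPred(BuildMI(...).addReg(Rm)));
    // New:  predOps(ARMCC::AL) appends the {cond, pred-register} operand pair,
    //       condCodeOp() appends the optional 's'-bit operand (a dead reg0).
    BuildMI(MBB, MBBI, DL, TII->get(ARM::MOVr), Rd)
        .addReg(Rm)
        .add(predOps(ARMCC::AL))
        .add(condCodeOp());
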
@@ -434,7 +460,6 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
}
unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
-
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
return 0;
@@ -739,7 +764,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
+ while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
@@ -971,7 +996,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
// Create the base instruction, then add the operands.
if (allocReg)
ResultReg = createResultReg(RC);
- assert (ResultReg > 255 && "Expected an allocated virtual register.");
+ assert(ResultReg > 255 && "Expected an allocated virtual register.");
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOLoad, useAM3);
@@ -1216,7 +1241,6 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
// behavior.
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
-
// Get the compare predicate.
// Try to take advantage of fallthrough opportunities.
CmpInst::Predicate Predicate = CI->getPredicate();
@@ -1231,7 +1255,8 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+ CI->isEquality()))
return false;
unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
@@ -1318,14 +1343,16 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
}
bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
- bool isZExt) {
+ bool isZExt, bool isEquality) {
Type *Ty = Src1Value->getType();
EVT SrcEVT = TLI.getValueType(DL, Ty, true);
if (!SrcEVT.isSimple()) return false;
MVT SrcVT = SrcEVT.getSimpleVT();
- bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
- if (isFloat && !Subtarget->hasVFP2())
+ if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+ return false;
+
+ if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
return false;
// Check to see if the 2nd operand is a constant that we can encode directly
@@ -1364,10 +1391,18 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
// TODO: Verify compares.
case MVT::f32:
isICmp = false;
- CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+ // Equality comparisons shouldn't raise Invalid on unordered inputs.
+ if (isEquality)
+ CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS;
+ else
+ CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
break;
case MVT::f64:
isICmp = false;
+ // Equality comparisons shouldn't raise Invalid on unordered inputs.
+ if (isEquality)
+ CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD;
+ else
CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
break;
case MVT::i1:
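
Background for the VCMP/VCMPE split, from my reading of IEEE 754-2008 section 5.11 rather than the patch itself: equality is a quiet predicate and ordering a signaling one, so only the latter may raise Invalid on quiet NaNs. In C++ terms:

    #include <cmath>
    double a = std::nan(""), b = 1.0;
    bool eq = (a == b); // quiet compare: false, raises no Invalid  -> VCMP
    bool lt = (a < b);  // signaling compare: false, raises Invalid -> VCMPE
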
@@ -1444,7 +1479,8 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
if (ARMPred == ARMCC::AL) return false;
// Emit the compare.
- if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+ CI->isEquality()))
return false;
// Now set a register based on the comparison. Explicitly set the predicates
@@ -1466,7 +1502,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
bool ARMFastISel::SelectFPExt(const Instruction *I) {
// Make sure we have VFP and that we're extending float to double.
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
Value *V = I->getOperand(0);
if (!I->getType()->isDoubleTy() ||
@@ -1485,7 +1521,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
// Make sure we have VFP and that we're truncating double to float.
- if (!Subtarget->hasVFP2()) return false;
+ if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
Value *V = I->getOperand(0);
if (!(I->getType()->isFloatTy() &&
@@ -1536,7 +1572,8 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
unsigned Opc;
if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
- else if (Ty->isDoubleTy()) Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
+ else if (Ty->isDoubleTy() && !Subtarget->isFPOnlySP())
+ Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
else return false;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
@@ -1561,7 +1598,8 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
unsigned Opc;
Type *OpTy = I->getOperand(0)->getType();
if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
- else if (OpTy->isDoubleTy()) Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
+ else if (OpTy->isDoubleTy() && !Subtarget->isFPOnlySP())
+ Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
else return false;
// f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
@@ -1596,7 +1634,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
bool UseImm = false;
bool isNegativeImm = false;
if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(2))) {
- assert (VT == MVT::i32 && "Expecting an i32.");
+ assert(VT == MVT::i32 && "Expecting an i32.");
Imm = (int)ConstInt->getValue().getZExtValue();
if (Imm < 0) {
isNegativeImm = true;
@@ -1663,7 +1701,8 @@ bool ARMFastISel::SelectDiv(const Instruction *I, bool isSigned) {
// If we have integer div support we should have selected this automagically.
// In case we have a real miss go ahead and return false and we'll pick
// it up later.
- if (Subtarget->hasDivide()) return false;
+ if (Subtarget->hasDivideInThumbMode())
+ return false;
// Otherwise emit a libcall.
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
@@ -1765,8 +1804,9 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
// if we have them.
// FIXME: It'd be nice to use NEON instructions.
Type *Ty = I->getType();
- bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
- if (isFloat && !Subtarget->hasVFP2())
+ if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+ return false;
+ if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
return false;
unsigned Opc;
@@ -1908,7 +1948,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AdjStackDown))
- .addImm(NumBytes));
+ .addImm(NumBytes).addImm(0));
// Process the args.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -1926,16 +1966,16 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
case CCValAssign::SExt: {
MVT DestVT = VA.getLocVT();
Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/false);
- assert (Arg != 0 && "Failed to emit a sext");
+ assert(Arg != 0 && "Failed to emit a sext");
ArgVT = DestVT;
break;
}
case CCValAssign::AExt:
- // Intentional fall-through. Handle AExt and ZExt.
+ // Intentional fall-through. Handle AExt and ZExt.
case CCValAssign::ZExt: {
MVT DestVT = VA.getLocVT();
Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
- assert (Arg != 0 && "Failed to emit a zext");
+ assert(Arg != 0 && "Failed to emit a zext");
ArgVT = DestVT;
break;
}
@@ -1960,6 +2000,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
assert(VA.getLocVT() == MVT::f64 &&
"Custom lowering for v2f64 args not available");
+ // FIXME: ArgLocs[++i] may extend beyond ArgLocs.size()
CCValAssign &NextVA = ArgLocs[++i];
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
@@ -2131,8 +2172,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(RetOpc));
AddOptionalDefs(MIB);
- for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
- MIB.addReg(RetRegs[i], RegState::Implicit);
+ for (unsigned R : RetRegs)
+ MIB.addReg(R, RegState::Implicit);
return true;
}
@@ -2192,8 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
ArgRegs.reserve(I->getNumOperands());
ArgVTs.reserve(I->getNumOperands());
ArgFlags.reserve(I->getNumOperands());
- for (unsigned i = 0; i < I->getNumOperands(); ++i) {
- Value *Op = I->getOperand(i);
+ for (Value *Op : I->operands()) {
unsigned Arg = getRegForValue(Op);
if (Arg == 0) return false;
@@ -2230,15 +2270,15 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
DbgLoc, TII.get(CallOpc));
// BL / BLX don't take a predicate, but tBL / tBLX do.
if (isThumb2)
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (Subtarget->genLongCalls())
MIB.addReg(CalleeReg);
else
MIB.addExternalSymbol(TLI.getLibcallName(Call));
// Add implicit physical register uses to the call.
- for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i], RegState::Implicit);
+ for (unsigned R : RegArgs)
+ MIB.addReg(R, RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2311,19 +2351,19 @@ bool ARMFastISel::SelectCall(const Instruction *I,
break;
ISD::ArgFlagsTy Flags;
- unsigned AttrInd = i - CS.arg_begin() + 1;
- if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ unsigned ArgIdx = i - CS.arg_begin();
+ if (CS.paramHasAttr(ArgIdx, Attribute::SExt))
Flags.setSExt();
- if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ if (CS.paramHasAttr(ArgIdx, Attribute::ZExt))
Flags.setZExt();
// FIXME: Only handle *easy* calls for now.
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
- CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
- CS.paramHasAttr(AttrInd, Attribute::SwiftSelf) ||
- CS.paramHasAttr(AttrInd, Attribute::SwiftError) ||
- CS.paramHasAttr(AttrInd, Attribute::Nest) ||
- CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ if (CS.paramHasAttr(ArgIdx, Attribute::InReg) ||
+ CS.paramHasAttr(ArgIdx, Attribute::StructRet) ||
+ CS.paramHasAttr(ArgIdx, Attribute::SwiftSelf) ||
+ CS.paramHasAttr(ArgIdx, Attribute::SwiftError) ||
+ CS.paramHasAttr(ArgIdx, Attribute::Nest) ||
+ CS.paramHasAttr(ArgIdx, Attribute::ByVal))
return false;
Type *ArgTy = (*i)->getType();
@@ -2373,7 +2413,7 @@ bool ARMFastISel::SelectCall(const Instruction *I,
// ARM calls don't take a predicate, but tBL / tBLX do.
if (isThumb2)
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (UseReg)
MIB.addReg(CalleeReg);
else if (!IntrMemName)
@@ -2382,8 +2422,8 @@ bool ARMFastISel::SelectCall(const Instruction *I,
MIB.addExternalSymbol(IntrMemName, 0);
// Add implicit physical register uses to the call.
- for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i], RegState::Implicit);
+ for (unsigned R : RegArgs)
+ MIB.addReg(R, RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2418,7 +2458,7 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
else if (Len >= 2)
VT = MVT::i16;
else {
- assert (Len == 1 && "Expected a length of 1!");
+ assert(Len == 1 && "Expected a length of 1!");
VT = MVT::i8;
}
} else {
@@ -2433,9 +2473,9 @@ bool ARMFastISel::ARMTryEmitSmallMemCpy(Address Dest, Address Src,
bool RV;
unsigned ResultReg;
RV = ARMEmitLoad(VT, ResultReg, Src);
- assert (RV == true && "Should be able to handle this load.");
+ assert(RV && "Should be able to handle this load.");
RV = ARMEmitStore(VT, ResultReg, Dest);
- assert (RV == true && "Should be able to handle this store.");
+ assert(RV && "Should be able to handle this store.");
(void)RV;
unsigned Size = VT.getSizeInBits()/8;
@@ -2687,9 +2727,11 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
if (setsCPSR)
MIB.addReg(ARM::CPSR, RegState::Define);
SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
- AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
+ MIB.addReg(SrcReg, isKill * RegState::Kill)
+ .addImm(ImmEnc)
+ .add(predOps(ARMCC::AL));
if (hasS)
- AddDefaultCC(MIB);
+ MIB.add(condCodeOp());
// Second instruction consumes the first's result.
SrcReg = ResultReg;
}
@@ -2779,7 +2821,6 @@ bool ARMFastISel::SelectShift(const Instruction *I,
// TODO: SoftFP support.
bool ARMFastISel::fastSelectInstruction(const Instruction *I) {
-
switch (I->getOpcode()) {
case Instruction::Load:
return SelectLoad(I);
@@ -2849,6 +2890,7 @@ bool ARMFastISel::fastSelectInstruction(const Instruction *I) {
}
namespace {
+
// This table describes sign- and zero-extend instructions which can be
// folded into a preceding load. All of these extends have an immediate
// (sometimes a mask and sometimes a shift) that's applied after
@@ -2865,7 +2907,8 @@ const struct FoldableLoadExtendsStruct {
{ { ARM::SXTB, ARM::t2SXTB }, 0, 0, MVT::i8 },
{ { ARM::UXTB, ARM::t2UXTB }, 0, 1, MVT::i8 }
};
-}
+
+} // end anonymous namespace
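
What the table drives, sketched in assembly (register choices are illustrative):

    // Sign case: fold by switching to the sign-extending load:
    //   ldrb r0, [r1] ; sxtb r0, r0   ==>   ldrsb r0, [r1]
    // Zero case: ldrb already zero-extends, so the uxtb folds away:
    //   ldrb r0, [r1] ; uxtb r0, r0   ==>   ldrb  r0, [r1]
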
/// \brief The specified machine instr operand is a vreg, and that
/// vreg is being provided by the specified load instruction. If possible,
@@ -2888,13 +2931,12 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
bool Found = false;
bool isZExt;
- for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends);
- i != e; ++i) {
- if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() &&
- (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm &&
- MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) {
+ for (const FoldableLoadExtendsStruct &FLE : FoldableLoadExtends) {
+ if (FLE.Opc[isThumb2] == MI->getOpcode() &&
+ (uint64_t)FLE.ExpectedImm == Imm &&
+ MVT((MVT::SimpleValueType)FLE.ExpectedVT) == VT) {
Found = true;
- isZExt = FoldableLoadExtends[i].isZExt;
+ isZExt = FLE.isZExt;
}
}
if (!Found) return false;
@@ -2933,7 +2975,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
.addConstantPoolIndex(Idx);
if (Opc == ARM::LDRcp)
MIB.addImm(0);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// Fix the address by adding pc.
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
@@ -2944,7 +2986,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
.addReg(TempReg)
.addImm(ARMPCLabelIndex);
if (!Subtarget->isThumb())
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
if (UseGOT_PREL && Subtarget->isThumb()) {
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
@@ -2981,20 +3023,18 @@ bool ARMFastISel::fastLowerArguments() {
// Only handle simple cases. i.e. Up to 4 i8/i16/i32 scalar arguments
// which are passed in r0 - r3.
- unsigned Idx = 1;
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++Idx) {
- if (Idx > 4)
+ for (const Argument &Arg : F->args()) {
+ if (Arg.getArgNo() >= 4)
return false;
- if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
- F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
- F->getAttributes().hasAttribute(Idx, Attribute::ByVal))
+ if (Arg.hasAttribute(Attribute::InReg) ||
+ Arg.hasAttribute(Attribute::StructRet) ||
+ Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::ByVal))
return false;
- Type *ArgTy = I->getType();
+ Type *ArgTy = Arg.getType();
if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
return false;
@@ -3010,16 +3050,14 @@ bool ARMFastISel::fastLowerArguments() {
}
}
-
static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
};
const TargetRegisterClass *RC = &ARM::rGPRRegClass;
- Idx = 0;
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++Idx) {
- unsigned SrcReg = GPRArgRegs[Idx];
+ for (const Argument &Arg : F->args()) {
+ unsigned ArgNo = Arg.getArgNo();
+ unsigned SrcReg = GPRArgRegs[ArgNo];
unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
@@ -3028,13 +3066,14 @@ bool ARMFastISel::fastLowerArguments() {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY),
ResultReg).addReg(DstReg, getKillRegState(true));
- updateValueMap(&*I, ResultReg);
+ updateValueMap(&Arg, ResultReg);
}
return true;
}
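
Concretely (my example, not from the patch), the fast path above applies to signatures like:

    int f(int a, short b, char c, int d); // a..d arrive in r0..r3
    // A fifth argument, or any sret/byval/inreg/swiftself/swifterror
    // attribute, falls back to normal argument lowering.
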
namespace llvm {
+
FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) {
if (funcInfo.MF->getSubtarget<ARMSubtarget>().useFastISel())
@@ -3042,4 +3081,5 @@ namespace llvm {
return nullptr;
}
-}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/ARMFeatures.h b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
index 0c910ab..8c0df4c 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFeatures.h
+++ b/contrib/llvm/lib/Target/ARM/ARMFeatures.h
@@ -19,10 +19,10 @@
namespace llvm {
template<typename InstrType> // could be MachineInstr or MCInst
-bool IsCPSRDead(InstrType *Instr);
+bool IsCPSRDead(const InstrType *Instr);
template<typename InstrType> // could be MachineInstr or MCInst
-inline bool isV8EligibleForIT(InstrType *Instr) {
+inline bool isV8EligibleForIT(const InstrType *Instr) {
switch (Instr->getOpcode()) {
default:
return false;
diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index c72db8a..16b54e8 100644
--- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -16,19 +16,49 @@
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+#include <vector>
#define DEBUG_TYPE "arm-frame-lowering"
@@ -180,6 +210,7 @@ static bool WindowsRequiresStackProbe(const MachineFunction &MF,
}
namespace {
+
struct StackAdjustingInsts {
struct InstInfo {
MachineBasicBlock::iterator I;
@@ -196,7 +227,8 @@ struct StackAdjustingInsts {
}
void addExtraBytes(const MachineBasicBlock::iterator I, unsigned ExtraBytes) {
- auto Info = find_if(Insts, [&](InstInfo &Info) { return Info.I == I; });
+ auto Info =
+ llvm::find_if(Insts, [&](InstInfo &Info) { return Info.I == I; });
assert(Info != Insts.end() && "invalid sp adjusting instruction");
Info->SPAdjust += ExtraBytes;
}
@@ -219,7 +251,8 @@ struct StackAdjustingInsts {
}
}
};
-}
+
+} // end anonymous namespace
/// Emit an instruction sequence that will align the address in
/// register Reg by zero-ing out the lower bits. For versions of the
@@ -252,38 +285,55 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
// lsr Reg, Reg, log2(Alignment)
// lsl Reg, Reg, log2(Alignment)
if (CanUseBFC) {
- AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(~AlignMask));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::BFC), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(~AlignMask)
+ .add(predOps(ARMCC::AL));
} else if (AlignMask <= 255) {
- AddDefaultCC(
- AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(AlignMask)));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::BICri), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(AlignMask)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
} else {
assert(!MustBeSingleInstruction &&
"Shouldn't call emitAligningInstructions demanding a single "
"instruction to be emitted for large stack alignment for a target "
"without BFC.");
- AddDefaultCC(AddDefaultPred(
- BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))));
- AddDefaultCC(AddDefaultPred(
- BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsr, NrBitsToZero))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::MOVsi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, NrBitsToZero))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
} else {
// Since this is only reached for Thumb-2 targets, the BFC instruction
// should always be available.
assert(CanUseBFC);
- AddDefaultPred(BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
- .addReg(Reg, RegState::Kill)
- .addImm(~AlignMask));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::t2BFC), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(~AlignMask)
+ .add(predOps(ARMCC::AL));
}
}
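
A standalone sketch of the computation all three emitted forms perform, for Alignment = 1 << K (the helper name is mine):

    #include <stdint.h>

    uint32_t alignDown(uint32_t Addr, unsigned K) {
      // bfc Addr, #0, #K -- or bic with an immediate mask, or lsr+lsl by K
      return Addr & ~((uint32_t(1) << K) - 1);
    }
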
+/// We need the offset of the frame pointer relative to other MachineFrameInfo
+/// offsets, which are encoded relative to SP at function entry.
+/// See also emitPrologue() for how the FP is set up.
+/// Unfortunately we cannot determine this value in determineCalleeSaves() yet
+/// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
+/// this to produce a conservative estimate that we check in an assert() later.
+static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
+ // This is a conservative estimate: assume the frame pointer is r7 and that
+ // registers up to r8, plus pc ("r15"), are spilled before it (= 8 registers).
+ return -AFI.getArgRegsSaveSize() - (8 * 4);
+}
+
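
A worked instance with hypothetical numbers: for getArgRegsSaveSize() == 8 the estimate is -8 - 8*4 = -40, so the assert added in emitPrologue requires the real spill offset to satisfy MFI.getObjectOffset(FramePtrSpillFI) >= -40.

    // Hypothetical: ArgRegsSaveSize = 8 bytes.
    int Estimate = -8 - 8 * 4; // = -40; the actual FP spill offset must be >= this
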
void ARMFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -394,8 +444,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
int FramePtrOffsetInPush = 0;
if (HasFP) {
- FramePtrOffsetInPush =
- MFI.getObjectOffset(FramePtrSpillFI) + ArgRegsSaveSize;
+ int FPOffset = MFI.getObjectOffset(FramePtrSpillFI);
+ assert(getMaxFPOffset(*MF.getFunction(), *AFI) <= FPOffset &&
+ "Max FP estimation is wrong");
+ FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize;
AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
NumBytes);
}
@@ -448,9 +500,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
uint32_t NumWords = NumBytes >> 2;
if (NumWords < 65536)
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
- .addImm(NumWords)
- .setMIFlags(MachineInstr::FrameSetup));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
+ .addImm(NumWords)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL));
else
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4)
.addImm(NumWords)
@@ -462,10 +515,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case CodeModel::Default:
case CodeModel::Kernel:
BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addExternalSymbol("__chkstk")
- .addReg(ARM::R4, RegState::Implicit)
- .setMIFlags(MachineInstr::FrameSetup);
+ .add(predOps(ARMCC::AL))
+ .addExternalSymbol("__chkstk")
+ .addReg(ARM::R4, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
break;
case CodeModel::Large:
case CodeModel::JITDefault:
@@ -474,18 +527,19 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addReg(ARM::R12, RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit)
- .setMIFlags(MachineInstr::FrameSetup);
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::R12, RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit)
+ .setMIFlags(MachineInstr::FrameSetup);
break;
}
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
- ARM::SP)
- .addReg(ARM::SP, RegState::Kill)
- .addReg(ARM::R4, RegState::Kill)
- .setMIFlags(MachineInstr::FrameSetup)));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), ARM::SP)
+ .addReg(ARM::SP, RegState::Kill)
+ .addReg(ARM::R4, RegState::Kill)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
NumBytes = 0;
}
@@ -657,12 +711,14 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// -- out lower bits in r4
// mov sp, r4
// FIXME: It will be better just to find spare register here.
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
- .addReg(ARM::SP, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+ .addReg(ARM::SP, RegState::Kill)
+ .add(predOps(ARMCC::AL));
emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::R4, MaxAlign,
false);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
- .addReg(ARM::R4, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(ARM::R4, RegState::Kill)
+ .add(predOps(ARMCC::AL));
}
AFI->setShouldRestoreSPFromFP(true);
@@ -675,14 +731,14 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// FIXME: Clarify FrameSetup flags here.
if (RegInfo->hasBasePointer(MF)) {
if (isARM)
- BuildMI(MBB, MBBI, dl,
- TII.get(ARM::MOVr), RegInfo->getBaseRegister())
- .addReg(ARM::SP)
- .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), RegInfo->getBaseRegister())
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
else
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- RegInfo->getBaseRegister())
- .addReg(ARM::SP));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), RegInfo->getBaseRegister())
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
}
// If the frame has variable sized objects then the epilogue must restore
@@ -757,19 +813,21 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
"No scratch register to restore SP from FP!");
emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
ARMCC::AL, 0, TII);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(ARM::R4));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(ARM::R4)
+ .add(predOps(ARMCC::AL));
}
} else {
// Thumb2 or ARM.
if (isARM)
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
- .addReg(FramePtr).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ .addReg(FramePtr)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
else
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(FramePtr));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(FramePtr)
+ .add(predOps(ARMCC::AL));
}
} else if (NumBytes &&
!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
@@ -829,7 +887,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
// When dynamically realigning the stack, use the frame pointer for
// parameters, and the stack/base pointer for locals.
if (RegInfo->needsStackRealignment(MF)) {
- assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+ assert(hasFP(MF) && "dynamic stack realignment without a FP!");
if (isFixed) {
FrameReg = RegInfo->getFrameRegister(MF);
Offset = FPOffset;
@@ -910,8 +968,9 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
continue;
- bool isLiveIn = MF.getRegInfo().isLiveIn(Reg);
- if (!isLiveIn)
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ bool isLiveIn = MRI.isLiveIn(Reg);
+ if (!isLiveIn && !MRI.isReserved(Reg))
MBB.addLiveIn(Reg);
// If NoGap is true, push consecutive registers and then leave the rest
// for other instructions. e.g.
@@ -936,18 +995,19 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
});
if (Regs.size() > 1 || StrOpc == 0) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
- .addReg(ARM::SP).setMIFlags(MIFlags));
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StmOpc), ARM::SP)
+ .addReg(ARM::SP)
+ .setMIFlags(MIFlags)
+ .add(predOps(ARMCC::AL));
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i].first, getKillRegState(Regs[i].second));
} else if (Regs.size() == 1) {
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
- ARM::SP)
- .addReg(Regs[0].first, getKillRegState(Regs[0].second))
- .addReg(ARM::SP).setMIFlags(MIFlags)
- .addImm(-4);
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, TII.get(StrOpc), ARM::SP)
+ .addReg(Regs[0].first, getKillRegState(Regs[0].second))
+ .addReg(ARM::SP)
+ .setMIFlags(MIFlags)
+ .addImm(-4)
+ .add(predOps(ARMCC::AL));
}
Regs.clear();
@@ -1027,9 +1087,9 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
});
if (Regs.size() > 1 || LdrOpc == 0) {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
- .addReg(ARM::SP));
+ MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
if (DeleteRet && MI != MBB.end()) {
@@ -1053,7 +1113,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
} else
MIB.addImm(4);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
Regs.clear();
@@ -1114,9 +1174,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// sub r4, sp, #numregs * 8
// The immediate is <= 64, so it doesn't need any special encoding.
unsigned Opc = isThumb ? ARM::t2SUBri : ARM::SUBri;
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
- .addReg(ARM::SP)
- .addImm(8 * NumAlignedDPRCS2Regs)));
+ BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+ .addReg(ARM::SP)
+ .addImm(8 * NumAlignedDPRCS2Regs)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
unsigned MaxAlign = MF.getFrameInfo().getMaxAlignment();
// We must set parameter MustBeSingleInstruction to true, since
@@ -1132,10 +1194,10 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// Leave r4 live, it is used below.
Opc = isThumb ? ARM::tMOVr : ARM::MOVr;
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(Opc), ARM::SP)
- .addReg(ARM::R4);
- MIB = AddDefaultPred(MIB);
+ .addReg(ARM::R4)
+ .add(predOps(ARMCC::AL));
if (!isThumb)
- AddDefaultCC(MIB);
+ MIB.add(condCodeOp());
// Now spill NumAlignedDPRCS2Regs registers starting from d8.
// r4 holds the stack slot address.
@@ -1147,11 +1209,12 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
MBB.addLiveIn(SupReg);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed),
- ARM::R4)
- .addReg(ARM::R4, RegState::Kill).addImm(16)
- .addReg(NextReg)
- .addReg(SupReg, RegState::ImplicitKill));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Qwb_fixed), ARM::R4)
+ .addReg(ARM::R4, RegState::Kill)
+ .addImm(16)
+ .addReg(NextReg)
+ .addReg(SupReg, RegState::ImplicitKill)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1165,9 +1228,12 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
MBB.addLiveIn(SupReg);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
- .addReg(ARM::R4).addImm(16).addReg(NextReg)
- .addReg(SupReg, RegState::ImplicitKill));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VST1d64Q))
+ .addReg(ARM::R4)
+ .addImm(16)
+ .addReg(NextReg)
+ .addReg(SupReg, RegState::ImplicitKill)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1177,8 +1243,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QPRRegClass);
MBB.addLiveIn(SupReg);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
- .addReg(ARM::R4).addImm(16).addReg(SupReg));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VST1q64))
+ .addReg(ARM::R4)
+ .addImm(16)
+ .addReg(SupReg)
+ .add(predOps(ARMCC::AL));
NextReg += 2;
NumAlignedDPRCS2Regs -= 2;
}
@@ -1187,9 +1256,11 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs) {
MBB.addLiveIn(NextReg);
// vstr.64 uses addrmode5 which has an offset scale of 4.
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
- .addReg(NextReg)
- .addReg(ARM::R4).addImm((NextReg-R4BaseReg)*2));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VSTRD))
+ .addReg(NextReg)
+ .addReg(ARM::R4)
+ .addImm((NextReg - R4BaseReg) * 2)
+ .add(predOps(ARMCC::AL));
}
// The last spill instruction inserted should kill the scratch register r4.
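
The spill sequence above drains the aligned DPRCS2 registers in decreasing chunk sizes: quad-D stores (VST1d64Qwb_fixed with post-increment, then a plain VST1d64Q) for groups of four, VST1q64 for a remaining pair, and a scalar VSTRD for a final odd register. A hypothetical helper restating that chunking policy, for illustration only:

// Not part of the patch: mirrors the cascade of if-blocks above.
static unsigned pickSpillChunk(unsigned RegsLeft) {
  if (RegsLeft >= 4)
    return 4; // one D-quad: VST1d64Qwb_fixed / VST1d64Q
  if (RegsLeft >= 2)
    return 2; // one Q register: VST1q64
  return 1;   // final odd D register: VSTRD (addrmode5, offset scaled by 4)
}
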
@@ -1254,8 +1325,11 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
assert(!AFI->isThumb1OnlyFunction() && "Can't realign stack for thumb1");
unsigned Opc = isThumb ? ARM::t2ADDri : ARM::ADDri;
- AddDefaultCC(AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
- .addFrameIndex(D8SpillFI).addImm(0)));
+ BuildMI(MBB, MI, DL, TII.get(Opc), ARM::R4)
+ .addFrameIndex(D8SpillFI)
+ .addImm(0)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
// Now restore NumAlignedDPRCS2Regs registers starting from d8.
unsigned NextReg = ARM::D8;
@@ -1264,10 +1338,12 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs >= 6) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
- .addReg(ARM::R4, RegState::Define)
- .addReg(ARM::R4, RegState::Kill).addImm(16)
- .addReg(SupReg, RegState::ImplicitDefine));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Qwb_fixed), NextReg)
+ .addReg(ARM::R4, RegState::Define)
+ .addReg(ARM::R4, RegState::Kill)
+ .addImm(16)
+ .addReg(SupReg, RegState::ImplicitDefine)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1280,9 +1356,11 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs >= 4) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QQPRRegClass);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
- .addReg(ARM::R4).addImm(16)
- .addReg(SupReg, RegState::ImplicitDefine));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLD1d64Q), NextReg)
+ .addReg(ARM::R4)
+ .addImm(16)
+ .addReg(SupReg, RegState::ImplicitDefine)
+ .add(predOps(ARMCC::AL));
NextReg += 4;
NumAlignedDPRCS2Regs -= 4;
}
@@ -1291,16 +1369,20 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
if (NumAlignedDPRCS2Regs >= 2) {
unsigned SupReg = TRI->getMatchingSuperReg(NextReg, ARM::dsub_0,
&ARM::QPRRegClass);
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
- .addReg(ARM::R4).addImm(16));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLD1q64), SupReg)
+ .addReg(ARM::R4)
+ .addImm(16)
+ .add(predOps(ARMCC::AL));
NextReg += 2;
NumAlignedDPRCS2Regs -= 2;
}
// Finally, use a vanilla vldr.64 for the remaining odd register.
if (NumAlignedDPRCS2Regs)
- AddDefaultPred(BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
- .addReg(ARM::R4).addImm(2*(NextReg-R4BaseReg)));
+ BuildMI(MBB, MI, DL, TII.get(ARM::VLDRD), NextReg)
+ .addReg(ARM::R4)
+ .addImm(2 * (NextReg - R4BaseReg))
+ .add(predOps(ARMCC::AL));
// Last store kills r4.
std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
@@ -1633,24 +1715,38 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// worth the effort and added fragility?
unsigned EstimatedStackSize =
MFI.estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);
- if (hasFP(MF)) {
+
+ // Determine biggest (positive) SP offset in MachineFrameInfo.
+ int MaxFixedOffset = 0;
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+ int MaxObjectOffset = MFI.getObjectOffset(I) + MFI.getObjectSize(I);
+ MaxFixedOffset = std::max(MaxFixedOffset, MaxObjectOffset);
+ }
+
+ bool HasFP = hasFP(MF);
+ if (HasFP) {
if (AFI->hasStackFrame())
EstimatedStackSize += 4;
} else {
// If FP is not used, SP will be used to access arguments, so count the
// size of arguments into the estimation.
- EstimatedStackSize += MF.getInfo<ARMFunctionInfo>()->getArgumentStackSize();
+ EstimatedStackSize += MaxFixedOffset;
}
EstimatedStackSize += 16; // For possible paddings.
- bool BigStack = EstimatedStackSize >= estimateRSStackSizeLimit(MF, this) ||
- MFI.hasVarSizedObjects() ||
- (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF));
+ unsigned EstimatedRSStackSizeLimit = estimateRSStackSizeLimit(MF, this);
+ int MaxFPOffset = getMaxFPOffset(*MF.getFunction(), *AFI);
+ bool BigFrameOffsets = EstimatedStackSize >= EstimatedRSStackSizeLimit ||
+ MFI.hasVarSizedObjects() ||
+ (MFI.adjustsStack() && !canSimplifyCallFramePseudos(MF)) ||
+ // For large argument stacks fp relative addressing may overflow.
+ (HasFP && (MaxFixedOffset - MaxFPOffset) >= (int)EstimatedRSStackSizeLimit);
bool ExtraCSSpill = false;
- if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
+ if (BigFrameOffsets ||
+ !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
AFI->setHasStackFrame(true);
- if (hasFP(MF)) {
+ if (HasFP) {
SavedRegs.set(FramePtr);
// If the frame pointer is required by the ABI, also spill LR so that we
// emit a complete frame record.
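
Two related changes land in this hunk. First, instead of counting only the formal argument area, the code now scans every fixed frame object (they occupy the negative frame-index range) and records the byte just past the furthest one as MaxFixedOffset. Second, BigStack is generalized to BigFrameOffsets: even with a frame pointer, an fp-relative access to a distant fixed object can exceed the load/store immediate range, so (MaxFixedOffset - MaxFPOffset) >= EstimatedRSStackSizeLimit also forces the scavenger/extra-spill path. A sketch under those assumptions (the numbers in the static_assert are illustrative only):

#include "llvm/CodeGen/MachineFrameInfo.h"
#include <algorithm>

// Restates the fixed-object scan: fixed objects (incoming arguments, etc.)
// live at negative frame indices [getObjectIndexBegin(), 0).
static int computeMaxFixedOffset(const llvm::MachineFrameInfo &MFI) {
  int MaxFixedOffset = 0;
  for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
    int End = MFI.getObjectOffset(I) + MFI.getObjectSize(I); // one past end
    MaxFixedOffset = std::max(MaxFixedOffset, End);
  }
  return MaxFixedOffset;
}

// Illustrative overflow check: with the furthest fixed object ending 2048
// bytes above the entry SP, fp established 8 bytes below entry SP
// (MaxFPOffset = -8), and a reachable immediate range of 1020 bytes, the
// fp-relative distance of 2056 exceeds the range, so the frame counts as big.
static_assert(2048 - (-8) >= 1020, "treated as a big frame");
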
@@ -1658,11 +1754,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(ARM::LR);
LRSpilled = true;
NumGPRSpills++;
- auto LRPos = find(UnspilledCS1GPRs, ARM::LR);
+ auto LRPos = llvm::find(UnspilledCS1GPRs, ARM::LR);
if (LRPos != UnspilledCS1GPRs.end())
UnspilledCS1GPRs.erase(LRPos);
}
- auto FPPos = find(UnspilledCS1GPRs, FramePtr);
+ auto FPPos = llvm::find(UnspilledCS1GPRs, FramePtr);
if (FPPos != UnspilledCS1GPRs.end())
UnspilledCS1GPRs.erase(FPPos);
NumGPRSpills++;
@@ -1721,7 +1817,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
}
// r7 can be used if it is not being used as the frame pointer.
- if (!hasFP(MF)) {
+ if (!HasFP) {
if (SavedRegs.test(ARM::R7)) {
--RegDeficit;
DEBUG(dbgs() << "%R7 is saved low register, RegDeficit = "
@@ -1773,7 +1869,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
NumGPRSpills++;
CS1Spilled = true;
ExtraCSSpill = true;
- UnspilledCS1GPRs.erase(find(UnspilledCS1GPRs, Reg));
+ UnspilledCS1GPRs.erase(llvm::find(UnspilledCS1GPRs, Reg));
if (Reg == ARM::LR)
LRSpilled = true;
}
@@ -1786,7 +1882,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(ARM::LR);
NumGPRSpills++;
SmallVectorImpl<unsigned>::iterator LRPos;
- LRPos = find(UnspilledCS1GPRs, (unsigned)ARM::LR);
+ LRPos = llvm::find(UnspilledCS1GPRs, (unsigned)ARM::LR);
if (LRPos != UnspilledCS1GPRs.end())
UnspilledCS1GPRs.erase(LRPos);
@@ -1831,7 +1927,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// callee-saved register or reserve a special spill slot to facilitate
// register scavenging. Thumb1 needs a spill slot for stack pointer
// adjustments also, even when the frame itself is small.
- if (BigStack && !ExtraCSSpill) {
+ if (BigFrameOffsets && !ExtraCSSpill) {
// If any non-reserved CS register isn't spilled, just spill one or two
// extra. That should take care of it!
unsigned NumExtras = TargetAlign / 4;
@@ -1865,10 +1961,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// note: Thumb1 functions spill to R12, not the stack. Reserve a slot
// closest to SP or frame pointer.
assert(RS && "Register scavenging not provided");
- const TargetRegisterClass *RC = &ARM::GPRRegClass;
- RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
- RC->getAlignment(),
- false));
+ const TargetRegisterClass &RC = ARM::GPRRegClass;
+ unsigned Size = TRI->getSpillSize(RC);
+ unsigned Align = TRI->getSpillAlignment(RC);
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
}
}
}
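
The scavenging-slot hunk also tracks an LLVM API change: TargetRegisterClass::getSize()/getAlignment() were retired in favor of querying the TargetRegisterInfo, which lets spill size and alignment vary by subtarget. A minimal sketch of the new call shape (the register class and the final false for isSpillSlot mirror the hunk above):

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"

// Creates the emergency scavenging slot with the TRI-based spill queries.
static int createScavengingSlot(llvm::MachineFrameInfo &MFI,
                                const llvm::TargetRegisterInfo &TRI,
                                const llvm::TargetRegisterClass &RC) {
  unsigned Size = TRI.getSpillSize(RC);       // bytes to spill one register
  unsigned Align = TRI.getSpillAlignment(RC); // required slot alignment
  return MFI.CreateStackObject(Size, Align, /*isSpillSlot=*/false);
}
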
@@ -1890,7 +1986,7 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
// ADJCALLSTACKUP -> add, sp, sp, amount
MachineInstr &Old = *I;
DebugLoc dl = Old.getDebugLoc();
- unsigned Amount = Old.getOperand(0).getImm();
+ unsigned Amount = TII.getFrameSize(Old);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -1908,14 +2004,11 @@ MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
ARMCC::CondCodes Pred =
(PIdx == -1) ? ARMCC::AL
: (ARMCC::CondCodes)Old.getOperand(PIdx).getImm();
+ unsigned PredReg = TII.getFramePred(Old);
if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
- // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
- unsigned PredReg = Old.getOperand(2).getReg();
emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, MachineInstr::NoFlags,
Pred, PredReg);
} else {
- // Note: PredReg is operand 3 for ADJCALLSTACKUP.
- unsigned PredReg = Old.getOperand(3).getReg();
assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
emitSPUpdate(isARM, MBB, I, dl, TII, Amount, MachineInstr::NoFlags,
Pred, PredReg);
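
Here eliminateCallFramePseudoInstr stops hard-coding operand positions on the ADJCALLSTACKDOWN/UP pseudos: TII.getFrameSize(Old) fetches the adjustment amount and TII.getFramePred(Old) the predicate register, so both pseudo forms share one emitSPUpdate path (the predicate register previously sat at operand 2 for DOWN and operand 3 for UP). A sketch of the accessor pair as used above; the exact declaring classes follow this tree's TargetInstrInfo/ARMBaseInstrInfo headers:

#include "ARMBaseInstrInfo.h"
#include <utility>

static std::pair<unsigned, unsigned>
getAdjCallStackInfo(const llvm::MachineInstr &Old,
                    const llvm::ARMBaseInstrInfo &TII) {
  unsigned Amount = TII.getFrameSize(Old);  // was Old.getOperand(0).getImm()
  unsigned PredReg = TII.getFramePred(Old); // was operand 2 (DOWN) / 3 (UP)
  return {Amount, PredReg};
}
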
@@ -2081,12 +2174,17 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// SR1: Scratch Register #1
// push {SR0, SR1}
if (Thumb) {
- AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH)))
- .addReg(ScratchReg0).addReg(ScratchReg1);
+ BuildMI(PrevStackMBB, DL, TII.get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
} else {
- AddDefaultPred(BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
- .addReg(ARM::SP, RegState::Define).addReg(ARM::SP))
- .addReg(ScratchReg0).addReg(ScratchReg1);
+ BuildMI(PrevStackMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
}
// Emit the relevant DWARF information about the change in stack pointer as
@@ -2106,21 +2204,29 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// mov SR1, sp
if (Thumb) {
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
- .addReg(ARM::SP));
+ BuildMI(McrMBB, DL, TII.get(ARM::tMOVr), ScratchReg1)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
} else if (CompareStackPointer) {
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
- .addReg(ARM::SP)).addReg(0);
+ BuildMI(McrMBB, DL, TII.get(ARM::MOVr), ScratchReg1)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
// sub SR1, sp, #StackSize
if (!CompareStackPointer && Thumb) {
- AddDefaultPred(
- AddDefaultCC(BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1))
- .addReg(ScratchReg1).addImm(AlignedStackSize));
+ BuildMI(McrMBB, DL, TII.get(ARM::tSUBi8), ScratchReg1)
+ .add(condCodeOp())
+ .addReg(ScratchReg1)
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL));
} else if (!CompareStackPointer) {
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
- .addReg(ARM::SP).addImm(AlignedStackSize)).addReg(0);
+ BuildMI(McrMBB, DL, TII.get(ARM::SUBri), ScratchReg1)
+ .addReg(ARM::SP)
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
if (Thumb && ST->isThumb1Only()) {
@@ -2131,21 +2237,25 @@ void ARMFrameLowering::adjustForSegmentedStacks(
unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
// ldr SR0, [pc, offset(STACK_LIMIT)]
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
- .addConstantPoolIndex(CPI));
+ BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
+ .addConstantPoolIndex(CPI)
+ .add(predOps(ARMCC::AL));
// ldr SR0, [SR0]
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
- .addReg(ScratchReg0).addImm(0));
+ BuildMI(GetMBB, DL, TII.get(ARM::tLDRi), ScratchReg0)
+ .addReg(ScratchReg0)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
} else {
// Get TLS base address from the coprocessor
// mrc p15, #0, SR0, c13, c0, #3
- AddDefaultPred(BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
- .addImm(15)
- .addImm(0)
- .addImm(13)
- .addImm(0)
- .addImm(3));
+ BuildMI(McrMBB, DL, TII.get(ARM::MRC), ScratchReg0)
+ .addImm(15)
+ .addImm(0)
+ .addImm(13)
+ .addImm(0)
+ .addImm(3)
+ .add(predOps(ARMCC::AL));
// Use the last TLS slot on Android and a private field of the TCB on Linux.
assert(ST->isTargetAndroid() || ST->isTargetLinux());
@@ -2153,16 +2263,19 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Get the stack limit from the right offset
// ldr SR0, [sr0, #4 * TlsOffset]
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
- .addReg(ScratchReg0).addImm(4 * TlsOffset));
+ BuildMI(GetMBB, DL, TII.get(ARM::LDRi12), ScratchReg0)
+ .addReg(ScratchReg0)
+ .addImm(4 * TlsOffset)
+ .add(predOps(ARMCC::AL));
}
// Compare stack limit with stack size requested.
// cmp SR0, SR1
Opcode = Thumb ? ARM::tCMPr : ARM::CMPrr;
- AddDefaultPred(BuildMI(GetMBB, DL, TII.get(Opcode))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1));
+ BuildMI(GetMBB, DL, TII.get(Opcode))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1)
+ .add(predOps(ARMCC::AL));
// This jump is taken if StackLimit < SP - stack required.
Opcode = Thumb ? ARM::tBcc : ARM::Bcc;
@@ -2178,32 +2291,40 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Pass the first argument to __morestack in scratch register #0:
// the amount of stack required.
if (Thumb) {
- AddDefaultPred(AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8),
- ScratchReg0)).addImm(AlignedStackSize));
+ BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg0)
+ .add(condCodeOp())
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
- .addImm(AlignedStackSize)).addReg(0);
+ BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg0)
+ .addImm(AlignedStackSize)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
// Pass the second argument to __morestack in scratch register #1:
// the amount of stack consumed to save the function arguments.
if (Thumb) {
- AddDefaultPred(
- AddDefaultCC(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1))
- .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())));
+ BuildMI(AllocMBB, DL, TII.get(ARM::tMOVi8), ScratchReg1)
+ .add(condCodeOp())
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
- .addImm(alignToARMConstant(ARMFI->getArgumentStackSize())))
- .addReg(0);
+ BuildMI(AllocMBB, DL, TII.get(ARM::MOVi), ScratchReg1)
+ .addImm(alignToARMConstant(ARMFI->getArgumentStackSize()))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
}
// push {lr} - Save return address of this function.
if (Thumb) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH)))
+ BuildMI(AllocMBB, DL, TII.get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
.addReg(ARM::LR);
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
+ BuildMI(AllocMBB, DL, TII.get(ARM::STMDB_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
.addReg(ARM::LR);
}
@@ -2220,7 +2341,8 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Call __morestack().
if (Thumb) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tBL)))
+ BuildMI(AllocMBB, DL, TII.get(ARM::tBL))
+ .add(predOps(ARMCC::AL))
.addExternalSymbol("__morestack");
} else {
BuildMI(AllocMBB, DL, TII.get(ARM::BL))
@@ -2230,22 +2352,26 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// pop {lr} - Restore return address of this original function.
if (Thumb) {
if (ST->isThumb1Only()) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
- .addReg(ScratchReg0);
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
- .addReg(ScratchReg0));
+ BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0);
+ BuildMI(AllocMBB, DL, TII.get(ARM::tMOVr), ARM::LR)
+ .addReg(ScratchReg0)
+ .add(predOps(ARMCC::AL));
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
- .addReg(ARM::LR, RegState::Define)
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP)
- .addImm(4));
+ BuildMI(AllocMBB, DL, TII.get(ARM::t2LDR_POST))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .addImm(4)
+ .add(predOps(ARMCC::AL));
}
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
- .addReg(ARM::LR);
+ BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ARM::LR);
}
// Restore SR0 and SR1 in case __morestack() was called.
@@ -2253,15 +2379,17 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// scratch registers from here.
// pop {SR0, SR1}
if (Thumb) {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::tPOP)))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(AllocMBB, DL, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
} else {
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(AllocMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
}
// Update the CFA offset now that we've popped
@@ -2271,20 +2399,22 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// bx lr - Return from this function.
Opcode = Thumb ? ARM::tBX_RET : ARM::BX_RET;
- AddDefaultPred(BuildMI(AllocMBB, DL, TII.get(Opcode)));
+ BuildMI(AllocMBB, DL, TII.get(Opcode)).add(predOps(ARMCC::AL));
// Restore SR0 and SR1 in case __morestack() was not called.
// pop {SR0, SR1}
if (Thumb) {
- AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP)))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(PostStackMBB, DL, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
} else {
- AddDefaultPred(BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
- .addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP))
- .addReg(ScratchReg0)
- .addReg(ScratchReg1);
+ BuildMI(PostStackMBB, DL, TII.get(ARM::LDMIA_UPD))
+ .addReg(ARM::SP, RegState::Define)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL))
+ .addReg(ScratchReg0)
+ .addReg(ScratchReg1);
}
// Update the CFA offset now that we've popped
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 7572955..f75dd4d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -228,11 +228,6 @@ private:
const uint16_t *DOpcodes,
const uint16_t *QOpcodes = nullptr);
- /// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
- /// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
- /// generated to force the table registers to be consecutive.
- void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
-
/// Try to select SBFX/UBFX instructions for ARM.
bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
@@ -244,11 +239,8 @@ private:
bool tryInlineAsm(SDNode *N);
- void SelectConcatVector(SDNode *N);
void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);
- bool trySMLAWSMULW(SDNode *N);
-
void SelectCMP_SWAP(SDNode *N);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
@@ -547,11 +539,11 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
SDValue NewMulConst;
if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
HandleSDNode Handle(N);
+ SDLoc Loc(N);
replaceDAGValue(N.getOperand(1), NewMulConst);
BaseReg = Handle.getValue();
- Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
- PowerOfTwo),
- SDLoc(N), MVT::i32);
+ Opc = CurDAG->getTargetConstant(
+ ARM_AM::getSORegOpc(ARM_AM::lsl, PowerOfTwo), Loc, MVT::i32);
return true;
}
}
@@ -1866,6 +1858,14 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
return Opc; // If not one we handle, return it unchanged.
}
+/// Returns true if the given increment is a Constant known to be equal to the
+/// access size performed by a NEON load/store. This means the "[rN]!" form can
+/// be used.
+static bool isPerfectIncrement(SDValue Inc, EVT VecTy, unsigned NumVecs) {
+ auto C = dyn_cast<ConstantSDNode>(Inc);
+ return C && C->getZExtValue() == VecTy.getSizeInBits() / 8 * NumVecs;
+}
+
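
isPerfectIncrement gives the VLD/VST/lane/dup selectors one shared definition of "the post-increment equals the access size", the only case where the fixed-stride "[rN]!" encodings apply; any other increment must use the register-update form. A standalone restatement with the SelectionDAG types elided, plus a worked example:

#include <cstdint>

// Same arithmetic as isPerfectIncrement, on plain integers: the constant
// increment must equal NumVecs vectors of VecBits bits each, in bytes.
static bool isPerfectIncrementBytes(uint64_t IncBytes, unsigned VecBits,
                                    unsigned NumVecs) {
  return IncBytes == uint64_t(VecBits) / 8 * NumVecs;
}

// e.g. a VLD2 of two 128-bit vectors accesses 32 bytes, so only an
// increment of exactly 32 selects the fixed "[rN]!" form:
//   isPerfectIncrementBytes(32, 128, 2) == true
//   isPerfectIncrementBytes(16, 128, 2) == false -> register-update opcode
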
void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes0,
@@ -1933,13 +1933,13 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
SDValue Inc = N->getOperand(AddrOpIdx + 1);
// FIXME: VLD1/VLD2 fixed increment doesn't need Reg0. Remove the reg0
// case entirely when the rest are updated to that form, too.
- if ((NumVecs <= 2) && !isa<ConstantSDNode>(Inc.getNode()))
+ bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
+ if ((NumVecs <= 2) && !IsImmUpdate)
Opc = getVLDSTRegisterUpdateOpcode(Opc);
// FIXME: We use a VLD1 for v1i64 even if the pseudo says vld2/3/4, so
// check for that explicitly too. Horribly hacky, but temporary.
- if ((NumVecs > 2 && !isVLDfixed(Opc)) ||
- !isa<ConstantSDNode>(Inc.getNode()))
- Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+ if ((NumVecs > 2 && !isVLDfixed(Opc)) || !IsImmUpdate)
+ Ops.push_back(IsImmUpdate ? Reg0 : Inc);
}
Ops.push_back(Pred);
Ops.push_back(Reg0);
@@ -2087,11 +2087,12 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
SDValue Inc = N->getOperand(AddrOpIdx + 1);
// FIXME: VST1/VST2 fixed increment doesn't need Reg0. Remove the reg0
// case entirely when the rest are updated to that form, too.
- if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+ bool IsImmUpdate = isPerfectIncrement(Inc, VT, NumVecs);
+ if (NumVecs <= 2 && !IsImmUpdate)
Opc = getVLDSTRegisterUpdateOpcode(Opc);
// FIXME: We use a VST1 for v1i64 even if the pseudo says vld2/3/4, so
// check for that explicitly too. Horribly hacky, but temporary.
- if (!isa<ConstantSDNode>(Inc.getNode()))
+ if (!IsImmUpdate)
Ops.push_back(Inc);
else if (NumVecs > 2 && !isVSTfixed(Opc))
Ops.push_back(Reg0);
@@ -2221,7 +2222,9 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
Ops.push_back(Align);
if (isUpdating) {
SDValue Inc = N->getOperand(AddrOpIdx + 1);
- Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+ bool IsImmUpdate =
+ isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+ Ops.push_back(IsImmUpdate ? Reg0 : Inc);
}
SDValue SuperReg;
@@ -2325,9 +2328,11 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
// fixed-stride update instructions don't have an explicit writeback
// operand. It's implicit in the opcode itself.
SDValue Inc = N->getOperand(2);
- if (NumVecs <= 2 && !isa<ConstantSDNode>(Inc.getNode()))
+ bool IsImmUpdate =
+ isPerfectIncrement(Inc, VT.getVectorElementType(), NumVecs);
+ if (NumVecs <= 2 && !IsImmUpdate)
Opc = getVLDSTRegisterUpdateOpcode(Opc);
- if (!isa<ConstantSDNode>(Inc.getNode()))
+ if (!IsImmUpdate)
Ops.push_back(Inc);
// FIXME: VLD3 and VLD4 haven't been updated to that form yet.
else if (NumVecs > 2)
@@ -2363,39 +2368,6 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
CurDAG->RemoveDeadNode(N);
}
-void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
- unsigned Opc) {
- assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
- SDLoc dl(N);
- EVT VT = N->getValueType(0);
- unsigned FirstTblReg = IsExt ? 2 : 1;
-
- // Form a REG_SEQUENCE to force register allocation.
- SDValue RegSeq;
- SDValue V0 = N->getOperand(FirstTblReg + 0);
- SDValue V1 = N->getOperand(FirstTblReg + 1);
- if (NumVecs == 2)
- RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
- else {
- SDValue V2 = N->getOperand(FirstTblReg + 2);
- // If it's a vtbl3, form a quad D-register and leave the last part as
- // an undef.
- SDValue V3 = (NumVecs == 3)
- ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
- : N->getOperand(FirstTblReg + 3);
- RegSeq = SDValue(createQuadDRegsNode(MVT::v4i64, V0, V1, V2, V3), 0);
- }
-
- SmallVector<SDValue, 6> Ops;
- if (IsExt)
- Ops.push_back(N->getOperand(1));
- Ops.push_back(RegSeq);
- Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
- Ops.push_back(getAL(CurDAG, dl)); // predicate
- Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
- ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
-}
-
bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
if (!Subtarget->hasV6T2Ops())
return false;
@@ -2563,141 +2535,6 @@ bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
return false;
}
-static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
- bool Accumulate) {
- // For SM*WB, we need to some form of sext.
- // For SM*WT, we need to search for (sra X, 16)
- // Src1 then gets set to X.
- if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
- SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
- SignExt.getOpcode() == ISD::AssertSext) &&
- SignExt.getValueType() == MVT::i32) {
-
- *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
- Src1 = SignExt.getOperand(0);
- return true;
- }
-
- if (SignExt.getOpcode() != ISD::SRA)
- return false;
-
- ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
- if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
- return false;
-
- SDValue Op0 = SignExt.getOperand(0);
-
- // The sign extend operand for SM*WB could be generated by a shl and ashr.
- if (Op0.getOpcode() == ISD::SHL) {
- SDValue SHL = Op0;
- ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
- if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
- return false;
-
- *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
- Src1 = Op0.getOperand(0);
- return true;
- }
- *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
- Src1 = SignExt.getOperand(0);
- return true;
-}
-
-static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
- SDValue &Src1, bool Accumulate) {
- // First we look for:
- // (add (or (srl ?, 16), (shl ?, 16)))
- if (OR.getOpcode() != ISD::OR)
- return false;
-
- SDValue SRL = OR.getOperand(0);
- SDValue SHL = OR.getOperand(1);
-
- if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
- SRL = OR.getOperand(1);
- SHL = OR.getOperand(0);
- if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
- return false;
- }
-
- ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
- ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
- if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
- SHLSrc1->getZExtValue() != 16)
- return false;
-
- // The first operands to the shifts need to be the two results from the
- // same smul_lohi node.
- if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
- SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
- return false;
-
- SDNode *SMULLOHI = SRL.getOperand(0).getNode();
- if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
- SHL.getOperand(0) != SDValue(SMULLOHI, 1))
- return false;
-
- // Now we have:
- // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
- // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
- // For SMLAWB the 16-bit value will signed extended somehow.
- // For SMLAWT only the SRA is required.
-
- // Check both sides of SMUL_LOHI
- if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
- Src0 = SMULLOHI->getOperand(1);
- } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
- Accumulate)) {
- Src0 = SMULLOHI->getOperand(0);
- } else {
- return false;
- }
- return true;
-}
-
-bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
- if (!Subtarget->hasV6Ops() ||
- (Subtarget->isThumb() && !Subtarget->hasThumb2()))
- return false;
-
- SDLoc dl(N);
- SDValue Src0 = N->getOperand(0);
- SDValue Src1 = N->getOperand(1);
- SDValue A, B;
- unsigned Opc = 0;
-
- if (N->getOpcode() == ISD::ADD) {
- if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
- return false;
-
- SDValue Acc;
- if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
- Acc = Src1;
- } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
- Acc = Src0;
- } else {
- return false;
- }
- if (Opc == 0)
- return false;
-
- SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32) };
- CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
- return true;
- } else if (N->getOpcode() == ISD::OR &&
- SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
- if (Opc == 0)
- return false;
-
- SDValue Ops[] = { A, B, getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32)};
- CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
- return true;
- }
- return false;
-}
-
/// We've got special pseudo-instructions for these
void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
@@ -2726,15 +2563,6 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
CurDAG->RemoveDeadNode(N);
}
-void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
- // The only time a CONCAT_VECTORS operation can have legal types is when
- // two 64-bit vectors are concatenated to a 128-bit vector.
- EVT VT = N->getValueType(0);
- if (!VT.is128BitVector() || N->getNumOperands() != 2)
- llvm_unreachable("unexpected CONCAT_VECTORS");
- ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)));
-}
-
static Optional<std::pair<unsigned, unsigned>>
getContiguousRangeOfSetBits(const APInt &A) {
unsigned FirstOne = A.getBitWidth() - A.countLeadingZeros() - 1;
@@ -2826,11 +2654,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
- case ISD::ADD:
- case ISD::OR:
- if (trySMLAWSMULW(N))
- return;
- break;
case ISD::WRITE_REGISTER:
if (tryWriteRegister(N))
return;
@@ -2859,9 +2682,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
SDNode *ResNode;
if (Subtarget->isThumb()) {
- SDValue Pred = getAL(CurDAG, dl);
- SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
- SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+ SDValue Ops[] = {
+ CPIdx,
+ getAL(CurDAG, dl),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getEntryNode()
+ };
ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other,
Ops);
} else {
@@ -2875,6 +2701,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
Ops);
}
+ // Annotate the Node with memory operand information so that MachineInstr
+ // queries work properly. This e.g. gives the register allocator the
+ // information required for rematerialization.
+ MachineFunction& MF = CurDAG->getMachineFunction();
+ MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
+ MemOp[0] = MF.getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(MF),
+ MachineMemOperand::MOLoad, 4, 4);
+
+ cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp+1);
+
ReplaceNode(N, ResNode);
return;
}
@@ -3046,49 +2883,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
break;
}
- case ARMISD::VMOVRRD:
- ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
- N->getOperand(0), getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32)));
- return;
- case ISD::UMUL_LOHI: {
- if (Subtarget->isThumb1Only())
- break;
- if (Subtarget->isThumb()) {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(
- N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
- return;
- } else {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(N, CurDAG->getMachineNode(
- Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
- MVT::i32, MVT::i32, Ops));
- return;
- }
- }
- case ISD::SMUL_LOHI: {
- if (Subtarget->isThumb1Only())
- break;
- if (Subtarget->isThumb()) {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(
- N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
- return;
- } else {
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- ReplaceNode(N, CurDAG->getMachineNode(
- Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
- MVT::i32, MVT::i32, Ops));
- return;
- }
- }
case ARMISD::UMAAL: {
unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
@@ -3099,38 +2893,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
}
case ARMISD::UMLAL:{
- // UMAAL is similar to UMLAL but it adds two 32-bit values to the
- // 64-bit multiplication result.
- if (Subtarget->hasV6Ops() && Subtarget->hasDSP() &&
- N->getOperand(2).getOpcode() == ARMISD::ADDC &&
- N->getOperand(3).getOpcode() == ARMISD::ADDE) {
-
- SDValue Addc = N->getOperand(2);
- SDValue Adde = N->getOperand(3);
-
- if (Adde.getOperand(2).getNode() == Addc.getNode()) {
-
- ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
- ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
-
- if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0)
- {
- // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
- // RdLo = one operand to be added, lower 32-bits of res
- // RdHi = other operand to be added, upper 32-bits of res
- // Rn = first multiply operand
- // Rm = second multiply operand
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
- Addc.getOperand(0), Addc.getOperand(1),
- getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32) };
- unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
- CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
- return;
- }
- }
- }
-
if (Subtarget->isThumb()) {
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
N->getOperand(3), getAL(CurDAG, dl),
@@ -3281,26 +3043,23 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
int64_t Addend = -C->getSExtValue();
SDNode *Add = nullptr;
- // In T2 mode, ADDS can be better than CMN if the immediate fits in a
+ // ADDS can be better than CMN if the immediate fits in a
// 16-bit ADDS, which means either [0,256) for tADDi8 or [0,8) for tADDi3.
// Outside that range we can just use a CMN which is 32-bit but has a
// 12-bit immediate range.
- if (Subtarget->isThumb2() && Addend < 1<<8) {
- SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
- CurDAG->getRegister(0, MVT::i32) };
- Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
- } else if (!Subtarget->isThumb2() && Addend < 1<<8) {
- // FIXME: Add T1 tADDi8 code.
- SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
- CurDAG->getTargetConstant(Addend, dl, MVT::i32),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
- Add = CurDAG->getMachineNode(ARM::tADDi8, dl, MVT::i32, Ops);
- } else if (!Subtarget->isThumb2() && Addend < 1<<3) {
- SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
- CurDAG->getTargetConstant(Addend, dl, MVT::i32),
- getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
- Add = CurDAG->getMachineNode(ARM::tADDi3, dl, MVT::i32, Ops);
+ if (Addend < 1<<8) {
+ if (Subtarget->isThumb2()) {
+ SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
+ } else {
+ unsigned Opc = (Addend < 1<<3) ? ARM::tADDi3 : ARM::tADDi8;
+ SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
+ CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+ getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+ Add = CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops);
+ }
}
if (Add) {
SDValue Ops2[] = {SDValue(Add, 0), CurDAG->getConstant(0, dl, MVT::i32)};
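
This CMPZ rewrite hoists the single Addend < 1<<8 bound so the ADDS-for-CMN trick now fires for Thumb1 as well (resolving the old FIXME), and the Thumb1 opcode is then chosen purely by immediate range. The selection policy restated as a plain function, illustrative only and assuming a non-negative Addend as in the surrounding code:

#include <cstdint>

static const char *pickAddsOpcode(int64_t Addend, bool IsThumb2) {
  if (Addend >= (1 << 8))
    return "CMN";     // out of ADDS range; CMN has a 12-bit immediate
  if (IsThumb2)
    return "t2ADDri"; // Thumb2 ADDS, the immediate fits
  return Addend < (1 << 3) ? "tADDi3" : "tADDi8"; // 16-bit Thumb1 encodings
}
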
@@ -3964,63 +3723,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
break;
}
- case ISD::INTRINSIC_WO_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- switch (IntNo) {
- default:
- break;
-
- case Intrinsic::arm_neon_vtbl2:
- SelectVTBL(N, false, 2, ARM::VTBL2);
- return;
- case Intrinsic::arm_neon_vtbl3:
- SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
- return;
- case Intrinsic::arm_neon_vtbl4:
- SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
- return;
-
- case Intrinsic::arm_neon_vtbx2:
- SelectVTBL(N, true, 2, ARM::VTBX2);
- return;
- case Intrinsic::arm_neon_vtbx3:
- SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
- return;
- case Intrinsic::arm_neon_vtbx4:
- SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
- return;
- }
- break;
- }
-
- case ARMISD::VTBL1: {
- SDLoc dl(N);
- EVT VT = N->getValueType(0);
- SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
- getAL(CurDAG, dl), // Predicate
- CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
- ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops));
- return;
- }
- case ARMISD::VTBL2: {
- SDLoc dl(N);
- EVT VT = N->getValueType(0);
-
- // Form a REG_SEQUENCE to force register allocation.
- SDValue V0 = N->getOperand(0);
- SDValue V1 = N->getOperand(1);
- SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
-
- SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate
- CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
- ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops));
- return;
- }
-
- case ISD::CONCAT_VECTORS:
- SelectConcatVector(N);
- return;
-
case ISD::ATOMIC_CMP_SWAP:
SelectCMP_SWAP(N);
return;
@@ -4127,11 +3829,10 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
// The flags here are common to those allowed for apsr in the A class cores and
// those allowed for the special registers in the M class cores. Returns a
// value representing which flags were present, -1 if invalid.
-static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) {
- if (Flags.empty())
- return 0x2 | (int)hasDSP;
-
+static inline int getMClassFlagsMask(StringRef Flags) {
return StringSwitch<int>(Flags)
+ .Case("", 0x2) // no flags means nzcvq for psr registers, and 0x2 is
+ // correct when flags are not permitted
.Case("g", 0x1)
.Case("nzcvq", 0x2)
.Case("nzcvqg", 0x3)
@@ -4174,7 +3875,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
}
// We know we are now handling a write so need to get the mask for the flags.
- int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP());
+ int Mask = getMClassFlagsMask(Flags);
// Only apsr, iapsr, eapsr, xpsr can have flags. The other register values
// shouldn't have flags present.
@@ -4189,10 +3890,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
// The register was valid so need to put the mask in the correct place
// (the flags need to be in bits 11-10) and combine with the SYSmvalue to
// construct the operand for the instruction node.
- if (SYSmvalue < 0x4)
- return SYSmvalue | Mask << 10;
-
- return SYSmvalue;
+ return SYSmvalue | Mask << 10;
}
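
With the empty-flags case folded into the StringSwitch, flag mask and SYSm value now compose uniformly: the mask ("g" = 0x1, "nzcvq" = 0x2, "nzcvqg" = 0x3) always lands in bits 11:10 above the register's SYSm number, where the old code skipped the shift once SYSmvalue reached 0x4; earlier validation already rejects flags on registers that don't permit them. A small worked encoding (values illustrative):

// Compose the M-class special-register operand: flags at bits [11:10].
static int encodeMClassOperand(int SYSmvalue, int FlagsMask) {
  return SYSmvalue | (FlagsMask << 10);
}
// e.g. apsr with "nzcvq": SYSm 0, mask 0x2 -> 0x800
//      eapsr with "g":    SYSm 2, mask 0x1 -> 0x402
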
static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
@@ -4205,7 +3903,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
// The flags permitted for apsr are the same flags that are allowed in
// M class registers. We get the flag value and then shift the flags into
// the correct place to combine with the mask.
- Mask = getMClassFlagsMask(Flags, true);
+ Mask = getMClassFlagsMask(Flags);
if (Mask == -1)
return -1;
return Mask << 2;
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 0f84a23..27dda93 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13,46 +13,100 @@
//===----------------------------------------------------------------------===//
#include "ARMISelLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMPerfectShuffle.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
-#include "ARMTargetMachine.h"
-#include "ARMTargetObjectFile.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
-#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <string>
+#include <tuple>
#include <utility>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "arm-isel"
@@ -72,7 +126,7 @@ static cl::opt<bool> EnableConstpoolPromotion(
"arm-promote-constant", cl::Hidden,
cl::desc("Enable / disable promotion of unnamed_addr constants into "
"constant pools"),
- cl::init(true));
+ cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
static cl::opt<unsigned> ConstpoolPromotionMaxSize(
"arm-promote-constant-max-size", cl::Hidden,
cl::desc("Maximum size of constant to promote into a constant pool"),
@@ -82,21 +136,6 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
cl::desc("Maximum size of ALL constants to promote into a constant pool"),
cl::init(128));
-namespace {
- class ARMCCState : public CCState {
- public:
- ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
- SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
- ParmContext PC)
- : CCState(CC, isVarArg, MF, locs, C) {
- assert(((PC == Call) || (PC == Prologue)) &&
- "ARMCCState users must specify whether their context is call"
- "or prologue generation.");
- CallOrPrologue = PC;
- }
- };
-}
-
// The APCS parameter registers.
static const MCPhysReg GPRArgRegs[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -162,7 +201,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
if (!VT.isFloatingPoint() &&
VT != MVT::v2i64 && VT != MVT::v1i64)
- for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+ for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
}
@@ -433,9 +472,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
// Use divmod compiler-rt calls for iOS 5.0 and later.
- if (Subtarget->isTargetWatchOS() ||
- (Subtarget->isTargetIOS() &&
- !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
+ if (Subtarget->isTargetMachO() &&
+ !(Subtarget->isTargetIOS() &&
+ Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
}
@@ -545,7 +584,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
- setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
@@ -563,7 +601,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
- setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
@@ -580,7 +617,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
- setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
@@ -685,10 +721,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
}
- // ARM and Thumb2 support UMLAL/SMLAL.
- if (!Subtarget->isThumb1Only())
- setTargetDAGCombine(ISD::ADDC);
-
if (Subtarget->isFPOnlySP()) {
// When targeting a floating-point unit with only single-precision
// operations, f64 is legal for the few double-precision instructions which
@@ -707,7 +739,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSQRT, MVT::f64, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
- setOperationAction(ISD::FPOWI, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FLOG, MVT::f64, Expand);
setOperationAction(ISD::FLOG2, MVT::f64, Expand);
@@ -786,14 +817,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
setOperationAction(ISD::SRL, MVT::i64, Custom);
setOperationAction(ISD::SRA, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
- if (!Subtarget->isThumb1Only()) {
- // FIXME: We should do this for Thumb1 as well.
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
- }
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -820,7 +849,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivide()
+ bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
: Subtarget->hasDivideInARMMode();
if (!hasDivide) {
// These are expanded into libcalls if the cpu doesn't have HW divider.
@@ -828,7 +857,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i32, LibCall);
}
- if (Subtarget->isTargetWindows() && !Subtarget->hasDivide()) {
+ if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
setOperationAction(ISD::SDIV, MVT::i32, Custom);
setOperationAction(ISD::UDIV, MVT::i32, Custom);
@@ -1305,6 +1334,16 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
+ case ARMISD::SMLALBB: return "ARMISD::SMLALBB";
+ case ARMISD::SMLALBT: return "ARMISD::SMLALBT";
+ case ARMISD::SMLALTB: return "ARMISD::SMLALTB";
+ case ARMISD::SMLALTT: return "ARMISD::SMLALTT";
+ case ARMISD::SMULWB: return "ARMISD::SMULWB";
+ case ARMISD::SMULWT: return "ARMISD::SMULWT";
+ case ARMISD::SMLALD: return "ARMISD::SMLALD";
+ case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
+ case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
+ case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
@@ -1414,6 +1453,40 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
// Lowering Code
//===----------------------------------------------------------------------===//
+static bool isSRL16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SRL)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+static bool isSRA16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SRA)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+static bool isSHL16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SHL)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+// Check for a signed 16-bit value. We special-case SRA because it keeps the
+// matching simple when we also look for SRAs that aren't sign-extending a
+// smaller value; without the check we'd need to take extra care with the
+// order in which operations are checked.
+static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
+ if (isSRA16(Op))
+ return isSHL16(Op.getOperand(0));
+ return DAG.ComputeNumSignBits(Op) == 17;
+}
+
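
These predicates feed the SMLAW/SMULW/SMLAL* selection added elsewhere in this import. isS16 accepts either the canonical (sra (shl x, 16), 16) sign-extension pair or any value for which the DAG proves 17 sign bits, which is precisely the condition for fitting in a signed 16-bit half. A scalar illustration of that equivalence:

#include <cstdint>

// An i32 value has at least 17 sign bits iff its top 17 bits all replicate
// the sign bit, i.e. iff it lies in the signed 16-bit range. This is the
// scalar fact DAG.ComputeNumSignBits(Op) == 17 encodes on the DAG.
static bool fitsSigned16(int32_t V) {
  return V >= -32768 && V <= 32767;
}
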
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
switch (CC) {
@@ -1433,22 +1506,34 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
- ARMCC::CondCodes &CondCode2) {
+ ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
CondCode2 = ARMCC::AL;
+ InvalidOnQNaN = true;
switch (CC) {
default: llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
- case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+ case ISD::SETOEQ:
+ CondCode = ARMCC::EQ;
+ InvalidOnQNaN = false;
+ break;
case ISD::SETGT:
case ISD::SETOGT: CondCode = ARMCC::GT; break;
case ISD::SETGE:
case ISD::SETOGE: CondCode = ARMCC::GE; break;
case ISD::SETOLT: CondCode = ARMCC::MI; break;
case ISD::SETOLE: CondCode = ARMCC::LS; break;
- case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+ case ISD::SETONE:
+ CondCode = ARMCC::MI;
+ CondCode2 = ARMCC::GT;
+ InvalidOnQNaN = false;
+ break;
case ISD::SETO: CondCode = ARMCC::VC; break;
case ISD::SETUO: CondCode = ARMCC::VS; break;
- case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+ case ISD::SETUEQ:
+ CondCode = ARMCC::EQ;
+ CondCode2 = ARMCC::VS;
+ InvalidOnQNaN = false;
+ break;
case ISD::SETUGT: CondCode = ARMCC::HI; break;
case ISD::SETUGE: CondCode = ARMCC::PL; break;
case ISD::SETLT:
@@ -1456,7 +1541,10 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
case ISD::SETLE:
case ISD::SETULE: CondCode = ARMCC::LE; break;
case ISD::SETNE:
- case ISD::SETUNE: CondCode = ARMCC::NE; break;
+ case ISD::SETUNE:
+ CondCode = ARMCC::NE;
+ InvalidOnQNaN = false;
+ break;
}
}
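
The new InvalidOnQNaN out-parameter records whether the VFP compare eventually emitted may raise the Invalid Operation exception on a quiet NaN: equality-style conditions can use the quiet compare, while ordered relational conditions need the signaling form. A sketch of the classification, matching the cases that clear the flag above:

#include "llvm/CodeGen/ISDOpcodes.h"

// True when the lowered compare signals Invalid on quiet-NaN operands.
static bool invalidOnQNaN(llvm::ISD::CondCode CC) {
  switch (CC) {
  case llvm::ISD::SETEQ:  case llvm::ISD::SETOEQ: // equality
  case llvm::ISD::SETNE:  case llvm::ISD::SETUNE: // inequality
  case llvm::ISD::SETONE: case llvm::ISD::SETUEQ:
    return false; // a quiet compare suffices
  default:
    return true;  // relational compares signal Invalid on any NaN
  }
}
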
@@ -1549,8 +1637,8 @@ SDValue ARMTargetLowering::LowerCallResult(
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext(), Call);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
// Copy all of the result registers out of their specified physreg.
@@ -1710,8 +1798,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), Call);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
// Get a count of how many bytes are to be pushed on the stack.
@@ -1724,8 +1812,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!isSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
@@ -2088,10 +2175,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
unsigned Align) const {
- assert((State->getCallOrPrologue() == Prologue ||
- State->getCallOrPrologue() == Call) &&
- "unhandled ParmContext");
-
// Byval (as with any stack) slots are always at least 4 byte aligned.
Align = std::max(Align, 4U);
@@ -2148,7 +2231,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
const TargetInstrInfo *TII) {
unsigned Bytes = Arg.getValueSizeInBits() / 8;
- int FI = INT_MAX;
+ int FI = std::numeric_limits<int>::max();
if (Arg.getOpcode() == ISD::CopyFromReg) {
unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!TargetRegisterInfo::isVirtualRegister(VR))
@@ -2178,7 +2261,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
} else
return false;
- assert(FI != INT_MAX);
+ assert(FI != std::numeric_limits<int>::max());
if (!MFI.isFixedObjectIndex(FI))
return false;
return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
@@ -2260,7 +2343,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// Check if stack adjustment is needed. For now, do not do this if any
// argument is passed on the stack.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
+ CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right way as
@@ -2362,8 +2445,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SmallVector<CCValAssign, 16> RVLocs;
// CCState - Info about the registers and stack slots.
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
- *DAG.getContext(), Call);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
// Analyze outgoing return values.
CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
@@ -2550,7 +2633,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
return true;
}
-bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!Subtarget->supportsTailCall())
return false;
@@ -2586,12 +2669,35 @@ static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOVi.
-static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
+ SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
// FIXME there is no actual debug info here
SDLoc dl(Op);
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
SDValue Res;
+
+ // When generating execute-only code, constant pools must be promoted to the
+ // global data section. It's a bit ugly that we can't share them across basic
+ // blocks, but this way we guarantee that execute-only behaves correctly with
+ // position-independent addressing modes.
+ if (Subtarget->genExecuteOnly()) {
+ auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+ auto T = const_cast<Type*>(CP->getType());
+ auto C = const_cast<Constant*>(CP->getConstVal());
+ auto M = const_cast<Module*>(DAG.getMachineFunction().
+ getFunction()->getParent());
+ auto GV = new GlobalVariable(
+ *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C,
+ Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
+ Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
+ Twine(AFI->createPICLabelUId())
+ );
+ SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
+ dl, PtrVT);
+ return LowerGlobalAddress(GA, DAG);
+ }
+
if (CP->isMachineConstantPoolEntry())
Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
CP->getAlignment());
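(Note on this hunk: with execute-only code the text section cannot be read,
so a PC-relative literal load from an in-function constant pool would fault;
the constant is instead promoted to an internal global and addressed like any
other global. A minimal standalone sketch of the unique-name scheme the Twine
concatenation above produces -- the helper name here is hypothetical:)

    #include <iostream>
    #include <string>

    // Hypothetical mirror of the Twine concatenation above:
    // <private prefix> + "CP" + <function number> + "_" + <PIC label id>.
    std::string promotedCPName(const std::string &PrivatePrefix,
                               unsigned FunctionNumber, unsigned LabelUId) {
      return PrivatePrefix + "CP" + std::to_string(FunctionNumber) + "_" +
             std::to_string(LabelUId);
    }

    int main() {
      // e.g. ".LCP3_0" for the first promoted constant of function #3,
      // assuming the ELF private global prefix ".L".
      std::cout << promotedCPName(".L", 3, 0) << "\n";
    }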
@@ -2790,9 +2896,9 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
// FIXME: is there useful debug info available here?
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
- DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
+ CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+ CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
+ DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2935,7 +3041,7 @@ static bool isSimpleType(Type *T) {
}
static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
- EVT PtrVT, SDLoc dl) {
+ EVT PtrVT, const SDLoc &dl) {
// If we're creating a pool entry for a constant global with unnamed address,
// and the global is small enough, we can emit it inline into the constant pool
// to save ourselves an indirection.
@@ -2980,7 +3086,8 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
unsigned RequiredPadding = 4 - (Size % 4);
bool PaddingPossible =
RequiredPadding == 4 || (CDAInit && CDAInit->isString());
- if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize)
+ if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
+ Size == 0)
return SDValue();
unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
@@ -3034,6 +3141,19 @@ static bool isReadOnly(const GlobalValue *GV) {
isa<Function>(GV);
}
+SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Subtarget->getTargetTriple().getObjectFormat()) {
+ default: llvm_unreachable("unknown object format");
+ case Triple::COFF:
+ return LowerGlobalAddressWindows(Op, DAG);
+ case Triple::ELF:
+ return LowerGlobalAddressELF(Op, DAG);
+ case Triple::MachO:
+ return LowerGlobalAddressDarwin(Op, DAG);
+ }
+}
+
SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -3080,15 +3200,22 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
return Result;
} else if (Subtarget->isRWPI() && !IsRO) {
// SB-relative.
- ARMConstantPoolValue *CPV =
- ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
- CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
- SDValue G = DAG.getLoad(
- PtrVT, dl, DAG.getEntryNode(), CPAddr,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ SDValue RelAddr;
+ if (Subtarget->useMovt(DAG.getMachineFunction())) {
+ ++NumMovwMovt;
+ SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
+ RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
+ } else { // use literal pool for address constant
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+ RelAddr = DAG.getLoad(
+ PtrVT, dl, DAG.getEntryNode(), CPAddr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+ }
SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
- SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
+ SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
return Result;
}
@@ -3219,6 +3346,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
}
return Result;
}
+ case Intrinsic::arm_neon_vabs:
+ return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
case Intrinsic::arm_neon_vmulls:
case Intrinsic::arm_neon_vmullu: {
unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
@@ -3256,13 +3386,23 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
}
+ case Intrinsic::arm_neon_vtbl1:
+ return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::arm_neon_vtbl2:
+ return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
- // FIXME: handle "fence singlethread" more efficiently.
SDLoc dl(Op);
+ ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2));
+ auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue());
+ if (SSID == SyncScope::SingleThread)
+ return Op;
+
if (!Subtarget->hasDataBarrier()) {
// Some ARMv6 cpus can support data barriers with an mcr instruction.
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
@@ -3462,8 +3602,8 @@ SDValue ARMTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
- *DAG.getContext(), Prologue);
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
SmallVector<SDValue, 16> ArgValues;
@@ -3595,7 +3735,6 @@ SDValue ARMTargetLowering::LowerFormalArguments(
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
-
// sanity check
assert(VA.isMemLoc());
assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
@@ -3734,13 +3873,15 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const SDLoc &dl) const {
+ SelectionDAG &DAG, const SDLoc &dl,
+ bool InvalidOnQNaN) const {
assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
+ SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
if (!isFloatingPointZero(RHS))
- Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+ Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
else
- Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+ Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}
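(The extra i32 operand threaded into CMPFP/CMPFPw0 here records whether the
compare must raise the FP Invalid exception on a quiet NaN, which is what
later selects VCMPE over VCMP. An illustrative standalone sketch of the IEEE
754 rule involved -- a simplified condition-code enum, not the in-tree
FPCCToARMCC:)

    #include <cassert>

    // IEEE 754 requires "signaling" relational compares (<, <=, >, >=) to
    // raise Invalid on a quiet NaN, while quiet equality and (un)ordered
    // tests must not.
    enum CondCode { SETEQ, SETNE, SETO, SETUO, SETLT, SETLE, SETGT, SETGE };

    bool invalidOnQNaN(CondCode CC) {
      switch (CC) {
      case SETEQ: case SETNE: case SETO: case SETUO:
        return false; // quiet compare -> VCMP
      default:
        return true;  // signaling compare -> VCMPE
      }
    }

    int main() {
      assert(!invalidOnQNaN(SETEQ) && invalidOnQNaN(SETLT));
    }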
@@ -3757,10 +3898,12 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
Cmp = Cmp.getOperand(0);
Opc = Cmp.getOpcode();
if (Opc == ARMISD::CMPFP)
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
+ Cmp.getOperand(1), Cmp.getOperand(2));
else {
assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
- Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
+ Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
+ Cmp.getOperand(1));
}
return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
}
@@ -3808,7 +3951,6 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
return std::make_pair(Value, OverflowCmp);
}
-
SDValue
ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
// Let legalize expand this if it isn't a legal type yet.
@@ -3832,7 +3974,6 @@ ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
-
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond = Op.getOperand(0);
SDValue SelectTrue = Op.getOperand(1);
@@ -4025,7 +4166,6 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
// Additionally, the variable is returned in parameter V and the constant in K.
static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
uint64_t &K) {
-
SDValue LHS1 = Op.getOperand(0);
SDValue RHS1 = Op.getOperand(1);
SDValue TrueVal1 = Op.getOperand(2);
@@ -4046,10 +4186,10 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
// in each conditional
SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
? &RHS1
- : NULL;
+ : nullptr;
SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
? &RHS2
- : NULL;
+ : nullptr;
SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
@@ -4073,13 +4213,15 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
const SDValue *LowerCheckOp =
isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
? &Op
- : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
- : NULL;
+ : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
+ ? &Op2
+ : nullptr;
const SDValue *UpperCheckOp =
isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
? &Op
- : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2
- : NULL;
+ : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
+ ? &Op2
+ : nullptr;
if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
return false;
@@ -4104,7 +4246,6 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
}
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4162,7 +4303,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- FPCCToARMCC(CC, CondCode, CondCode2);
+ bool InvalidOnQNaN;
+ FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
// Try to generate VMAXNM/VMINNM on ARMv8.
if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
@@ -4181,13 +4323,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
if (CondCode2 != ARMCC::AL) {
SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
// FIXME: Needs another CMP because flag can have but one use.
- SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
}
return Result;
@@ -4348,10 +4490,11 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
ARMCC::CondCodes CondCode, CondCode2;
- FPCCToARMCC(CC, CondCode, CondCode2);
+ bool InvalidOnQNaN;
+ FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
- SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
@@ -4853,9 +4996,10 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
// so that the shift + and get folded into a bitfield extract.
SDLoc dl(Op);
- SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
- DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
- MVT::i32));
+ SDValue Ops[] = { DAG.getEntryNode(),
+ DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };
+
+ SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
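(A standalone check of the formula above: FPSCR keeps the rounding mode in
bits [23:22] as 0=nearest, 1=+inf, 2=-inf, 3=zero, while C's FLT_ROUNDS wants
1=nearest, 2=+inf, 3=-inf, 0=zero, i.e. RMode+1 modulo 4. Adding 1<<22
increments that 2-bit field in place, so no separate extract is needed:)

    #include <cassert>
    #include <cstdint>

    uint32_t fltRounds(uint32_t FPSCR) {
      return ((FPSCR + (1u << 22)) >> 22) & 3;
    }

    int main() {
      assert(fltRounds(0u << 22) == 1); // round to nearest
      assert(fltRounds(1u << 22) == 2); // round toward +infinity
      assert(fltRounds(2u << 22) == 3); // round toward -infinity
      assert(fltRounds(3u << 22) == 0); // round toward zero
    }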
@@ -5212,15 +5356,15 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
// Integer comparisons.
switch (SetCCOpcode) {
default: llvm_unreachable("Illegal integer comparison");
- case ISD::SETNE: Invert = true;
+ case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
case ISD::SETEQ: Opc = ARMISD::VCEQ; break;
- case ISD::SETLT: Swap = true;
+ case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETGT: Opc = ARMISD::VCGT; break;
- case ISD::SETLE: Swap = true;
+ case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETGE: Opc = ARMISD::VCGE; break;
- case ISD::SETULT: Swap = true;
+ case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
- case ISD::SETULE: Swap = true;
+ case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
}
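(The LLVM_FALLTHROUGH annotations make an existing idiom explicit rather than
changing behavior: NEON only provides VCEQ/VCGT/VCGE (plus unsigned forms),
so each "negative" condition sets an Invert or Swap flag and deliberately
falls through to its "positive" twin. A minimal sketch of the idiom, not the
in-tree lowering:)

    enum SetCC { SETEQ, SETNE, SETGT, SETLT, SETGE, SETLE };
    enum NeonCmp { VCEQ, VCGT, VCGE };

    NeonCmp lowerSetCC(SetCC CC, bool &Invert, bool &Swap) {
      Invert = Swap = false;
      switch (CC) {
      case SETNE: Invert = true; [[fallthrough]]; // NOT(VCEQ a, b)
      case SETEQ: return VCEQ;
      case SETLT: Swap = true;   [[fallthrough]]; // VCGT b, a
      case SETGT: return VCGT;
      case SETLE: Swap = true;   [[fallthrough]]; // VCGE b, a
      case SETGE: return VCGE;
      }
      return VCEQ; // unreachable; silences -Wreturn-type
    }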
@@ -5584,7 +5728,6 @@ static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
return true;
}
-
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
bool &ReverseVEXT, unsigned &Imm) {
unsigned NumElts = VT.getVectorNumElements();
@@ -5758,7 +5901,10 @@ static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
- WhichResult = M[i] == 0 ? 0 : 1;
+ if (M.size() == NumElts * 2)
+ WhichResult = i / NumElts;
+ else
+ WhichResult = M[i] == 0 ? 0 : 1;
for (unsigned j = 0; j < NumElts; ++j) {
if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
return false;
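(Worked model of the WhichResult change in this and the following hunks: a
VUZP on v4i16 yields result 0 = elements <0,2,4,6> and result 1 = <1,3,5,7>.
When the mask covers both results (size 2*NumElts), the block index i/NumElts
identifies the result even if M[i] is undef (-1), which the old M[i]==0 test
got wrong. A standalone sketch, not the in-tree routine:)

    #include <cassert>
    #include <vector>

    bool isVUZPMaskModel(const std::vector<int> &M, unsigned NumElts,
                         unsigned &WhichResult) {
      for (unsigned i = 0; i < M.size(); i += NumElts) {
        WhichResult = (M.size() == NumElts * 2) ? i / NumElts
                                                : (M[i] == 0 ? 0 : 1);
        for (unsigned j = 0; j < NumElts; ++j)
          if (M[i + j] >= 0 && (unsigned)M[i + j] != 2 * j + WhichResult)
            return false;
      }
      return true;
    }

    int main() {
      unsigned Which;
      // Both results of a v4i16 VUZP, with an undef (-1) leading element:
      // the old test would misclassify the first block as result 1.
      assert(isVUZPMaskModel({-1, 2, 4, 6, 1, 3, 5, 7}, 4, Which));
    }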
@@ -5789,7 +5935,10 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
unsigned Half = NumElts / 2;
for (unsigned i = 0; i < M.size(); i += NumElts) {
- WhichResult = M[i] == 0 ? 0 : 1;
+ if (M.size() == NumElts * 2)
+ WhichResult = i / NumElts;
+ else
+ WhichResult = M[i] == 0 ? 0 : 1;
for (unsigned j = 0; j < NumElts; j += Half) {
unsigned Idx = WhichResult;
for (unsigned k = 0; k < Half; ++k) {
@@ -5829,7 +5978,10 @@ static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
- WhichResult = M[i] == 0 ? 0 : 1;
+ if (M.size() == NumElts * 2)
+ WhichResult = i / NumElts;
+ else
+ WhichResult = M[i] == 0 ? 0 : 1;
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned j = 0; j < NumElts; j += 2) {
if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
@@ -5862,7 +6014,10 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
return false;
for (unsigned i = 0; i < M.size(); i += NumElts) {
- WhichResult = M[i] == 0 ? 0 : 1;
+ if (M.size() == NumElts * 2)
+ WhichResult = i / NumElts;
+ else
+ WhichResult = M[i] == 0 ? 0 : 1;
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned j = 0; j < NumElts; j += 2) {
if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
@@ -6027,10 +6182,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
if (ValueCounts.size() != 1)
usesOnlyOneValue = false;
- if (!Value.getNode() && ValueCounts.size() > 0)
+ if (!Value.getNode() && !ValueCounts.empty())
Value = ValueCounts.begin()->first;
- if (ValueCounts.size() == 0)
+ if (ValueCounts.empty())
return DAG.getUNDEF(VT);
// Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
@@ -6182,8 +6337,8 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
struct ShuffleSourceInfo {
SDValue Vec;
- unsigned MinElt;
- unsigned MaxElt;
+ unsigned MinElt = std::numeric_limits<unsigned>::max();
+ unsigned MaxElt = 0;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
@@ -6192,13 +6347,12 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale in ShuffleVec".
- int WindowBase;
- int WindowScale;
+ int WindowBase = 0;
+ int WindowScale = 1;
+
+ ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
- ShuffleSourceInfo(SDValue Vec)
- : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
- WindowScale(1) {}
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
@@ -6220,7 +6374,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
- auto Source = find(Sources, SourceVec);
+ auto Source = llvm::find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
@@ -6336,7 +6490,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
if (Entry.isUndef())
continue;
- auto Src = find(Sources, Entry.getOperand(0));
+ auto Src = llvm::find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
@@ -6633,7 +6787,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
EVT SubVT = SubV1.getValueType();
// We expect these to have been canonicalized to -1.
- assert(all_of(ShuffleMask, [&](int i) {
+ assert(llvm::all_of(ShuffleMask, [&](int i) {
return i < (int)VT.getVectorNumElements();
}) && "Unexpected shuffle index into UNDEF operand!");
@@ -6896,8 +7050,19 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
N->getValueType(0),
N->getOpcode());
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
- return SkipLoadExtensionForVMULL(LD, DAG);
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
+ "Expected extending load");
+
+ SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
+ unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue extLoad =
+ DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
+
+ return newLoad;
+ }
// Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
// have been legalized as a BITCAST from v4i32.
@@ -7242,7 +7407,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Pair of floats / doubles used to pass the result.
- Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ Type *RetTy = StructType::get(ArgTy, ArgTy);
auto &DL = DAG.getDataLayout();
ArgListTy Args;
@@ -7258,9 +7423,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
ArgListEntry Entry;
Entry.Node = SRet;
Entry.Ty = RetTy->getPointerTo();
- Entry.isSExt = false;
- Entry.isZExt = false;
- Entry.isSRet = true;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
+ Entry.IsSRet = true;
Args.push_back(Entry);
RetTy = Type::getVoidTy(*DAG.getContext());
}
@@ -7268,8 +7433,8 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
const char *LibcallName =
@@ -7427,6 +7592,9 @@ static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDValue VHi = DAG.getAnyExtOrTrunc(
DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)),
dl, MVT::i32);
+ bool isBigEndian = DAG.getDataLayout().isBigEndian();
+ if (isBigEndian)
+ std::swap(VLo, VHi);
SDValue RegClass =
DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
@@ -7454,10 +7622,14 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N,
MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
- Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_0, SDLoc(N), MVT::i32,
- SDValue(CmpSwap, 0)));
- Results.push_back(DAG.getTargetExtractSubreg(ARM::gsub_1, SDLoc(N), MVT::i32,
- SDValue(CmpSwap, 0)));
+ bool isBigEndian = DAG.getDataLayout().isBigEndian();
+
+ Results.push_back(
+ DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
+ Results.push_back(
+ DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
Results.push_back(SDValue(CmpSwap, 2));
}
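(Both this hunk and the createGPRPairNode change above fix the same byte-order
assumption: in a GPRPair, gsub_0 holds the i64 low word on little-endian but
the high word on big-endian, so construction and extraction must both swap.
A standalone round-trip model under that assumption:)

    #include <cassert>
    #include <cstdint>

    void splitForGPRPair(uint64_t V, bool BigEndian,
                         uint32_t &Sub0, uint32_t &Sub1) {
      uint32_t Lo = (uint32_t)V, Hi = (uint32_t)(V >> 32);
      Sub0 = BigEndian ? Hi : Lo; // gsub_0
      Sub1 = BigEndian ? Lo : Hi; // gsub_1
    }

    uint64_t joinFromGPRPair(uint32_t Sub0, uint32_t Sub1, bool BigEndian) {
      uint32_t Lo = BigEndian ? Sub1 : Sub0, Hi = BigEndian ? Sub0 : Sub1;
      return ((uint64_t)Hi << 32) | Lo;
    }

    int main() {
      uint32_t S0, S1;
      for (bool BE : {false, true}) {
        splitForGPRPair(0x1122334455667788ULL, BE, S0, S1);
        assert(joinFromGPRPair(S0, S1, BE) == 0x1122334455667788ULL);
      }
    }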
@@ -7480,12 +7652,12 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
Entry.Node = Val;
Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isZExt = true;
+ Entry.IsZExt = true;
Args.push_back(Entry);
Entry.Node = Exponent;
Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isZExt = true;
+ Entry.IsZExt = true;
Args.push_back(Entry);
Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
@@ -7517,21 +7689,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
- case ISD::ConstantPool:
- if (Subtarget->genExecuteOnly())
- llvm_unreachable("execute-only should not generate constant pools");
- return LowerConstantPool(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
- case ISD::GlobalAddress:
- switch (Subtarget->getTargetTriple().getObjectFormat()) {
- default: llvm_unreachable("unknown object format");
- case Triple::COFF:
- return LowerGlobalAddressWindows(Op, DAG);
- case Triple::ELF:
- return LowerGlobalAddressELF(Op, DAG);
- case Triple::MachO:
- return LowerGlobalAddressDarwin(Op, DAG);
- }
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
@@ -7607,6 +7767,37 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
}
}
+static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned Opc = 0;
+ if (IntNo == Intrinsic::arm_smlald)
+ Opc = ARMISD::SMLALD;
+ else if (IntNo == Intrinsic::arm_smlaldx)
+ Opc = ARMISD::SMLALDX;
+ else if (IntNo == Intrinsic::arm_smlsld)
+ Opc = ARMISD::SMLSLD;
+ else if (IntNo == Intrinsic::arm_smlsldx)
+ Opc = ARMISD::SMLSLDX;
+ else
+ return;
+
+ SDLoc dl(N);
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ N->getOperand(3),
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ N->getOperand(3),
+ DAG.getConstant(1, dl, MVT::i32));
+
+ SDValue LongMul = DAG.getNode(Opc, dl,
+ DAG.getVTList(MVT::i32, MVT::i32),
+ N->getOperand(1), N->getOperand(2),
+ Lo, Hi);
+ Results.push_back(LongMul.getValue(0));
+ Results.push_back(LongMul.getValue(1));
+}
+
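(The EXTRACT_ELEMENT pair above hands the i64 accumulator of smlald/smlsld to
the new two-result ISD nodes as separate i32 halves; element 0 is the low
word, element 1 the high word, and the caller reassembles the two results.
A toy model of that split and rejoin:)

    #include <cassert>
    #include <cstdint>

    void splitI64(uint64_t Acc, uint32_t &Lo, uint32_t &Hi) {
      Lo = (uint32_t)Acc;         // EXTRACT_ELEMENT ..., 0
      Hi = (uint32_t)(Acc >> 32); // EXTRACT_ELEMENT ..., 1
    }

    uint64_t buildPair(uint32_t Lo, uint32_t Hi) {
      return ((uint64_t)Hi << 32) | Lo; // the ISD::BUILD_PAIR equivalent
    }

    int main() {
      uint32_t Lo, Hi;
      splitI64(0x0123456789abcdefULL, Lo, Hi);
      assert(buildPair(Lo, Hi) == 0x0123456789abcdefULL);
    }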
/// ReplaceNodeResults - Replace the results of node with an illegal result
/// type with new values built out of custom code.
void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
@@ -7648,6 +7839,8 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_64Results(N, Results, DAG);
return;
+ case ISD::INTRINSIC_WO_CHAIN:
+ return ReplaceLongIntrinsic(N, Results, DAG);
}
if (Res.getNode())
Results.push_back(Res);
@@ -7702,24 +7895,27 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// add r5, pc
// str r5, [$jbuf, #+4] ; &jbuf[1]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
- .addConstantPoolIndex(CPI)
- .addMemOperand(CPMMO));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO)
+ .add(predOps(ARMCC::AL));
// Set the low bit because of thumb mode.
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultCC(
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
- .addReg(NewVReg1, RegState::Kill)
- .addImm(0x01)));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(0x01)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
.addReg(NewVReg2, RegState::Kill)
.addImm(PCLabelId);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
- .addReg(NewVReg3, RegState::Kill)
- .addFrameIndex(FI)
- .addImm(36) // &jbuf[1] :: pc
- .addMemOperand(FIMMOSt));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+ .addReg(NewVReg3, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt)
+ .add(predOps(ARMCC::AL));
} else if (isThumb) {
// Incoming value: jbuf
// ldr.n r1, LCPI1_4
@@ -7729,51 +7925,58 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// add r2, $jbuf, #+4 ; &jbuf[1]
// str r1, [r2]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
- .addConstantPoolIndex(CPI)
- .addMemOperand(CPMMO));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
.addReg(NewVReg1, RegState::Kill)
.addImm(PCLabelId);
// Set the low bit because of thumb mode.
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
- .addReg(ARM::CPSR, RegState::Define)
- .addImm(1));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg2, RegState::Kill)
- .addReg(NewVReg3, RegState::Kill));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3, RegState::Kill)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
.addFrameIndex(FI)
.addImm(36); // &jbuf[1] :: pc
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
- .addReg(NewVReg4, RegState::Kill)
- .addReg(NewVReg5, RegState::Kill)
- .addImm(0)
- .addMemOperand(FIMMOSt));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+ .addReg(NewVReg4, RegState::Kill)
+ .addReg(NewVReg5, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(FIMMOSt)
+ .add(predOps(ARMCC::AL));
} else {
// Incoming value: jbuf
// ldr r1, LCPI1_1
// add r1, pc, r1
// str r1, [$jbuf, #+4] ; &jbuf[1]
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
- .addConstantPoolIndex(CPI)
- .addImm(0)
- .addMemOperand(CPMMO));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addImm(0)
+ .addMemOperand(CPMMO)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
- .addReg(NewVReg1, RegState::Kill)
- .addImm(PCLabelId));
- AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
- .addReg(NewVReg2, RegState::Kill)
- .addFrameIndex(FI)
- .addImm(36) // &jbuf[1] :: pc
- .addMemOperand(FIMMOSt));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(PCLabelId)
+ .add(predOps(ARMCC::AL));
+ BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+ .addReg(NewVReg2, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt)
+ .add(predOps(ARMCC::AL));
}
}
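(This and the following hunks mechanically replace the AddDefaultPred and
AddDefaultCC wrappers with .add(predOps(...)) and .add(condCodeOp()), which
keeps the operand order readable at the call site. A hypothetical miniature
of the builder pattern, assuming predOps yields {pred-imm, pred-reg} and
condCodeOp a single optional cc_out register, as in ARMBaseInstrInfo.h:)

    #include <array>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Operand { bool IsImm; uint64_t Val; };

    struct Builder {
      std::vector<Operand> Ops;
      Builder &addReg(unsigned R) { Ops.push_back({false, R}); return *this; }
      Builder &addImm(uint64_t I) { Ops.push_back({true, I}); return *this; }
      template <std::size_t N>
      Builder &add(const std::array<Operand, N> &Tail) {
        Ops.insert(Ops.end(), Tail.begin(), Tail.end());
        return *this;
      }
    };

    // ARMCC::AL encodes as 14; register 0 stands for "no register".
    std::array<Operand, 2> predOpsAL() { return {{{true, 14}, {false, 0}}}; }
    std::array<Operand, 1> condCodeOp() { return {{{false, 0}}}; }

    int main() {
      Builder B;
      B.addReg(1).addImm(0).add(predOpsAL()).add(condCodeOp());
      assert(B.Ops.size() == 5); // 2 explicit + 2 predicate + 1 cc_out
    }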
@@ -7791,7 +7994,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// Get a mapping of the call site numbers to all of the landing pads they're
// associated with.
- DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+ DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
unsigned MaxCSNum = 0;
for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
++BB) {
@@ -7886,31 +8089,36 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
- .addFrameIndex(FI)
- .addImm(4)
- .addMemOperand(FIMMOLd));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(FIMMOLd)
+ .add(predOps(ARMCC::AL));
if (NumLPads < 256) {
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
- .addReg(NewVReg1)
- .addImm(LPadList.size()));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+ .addReg(NewVReg1)
+ .addImm(LPadList.size())
+ .add(predOps(ARMCC::AL));
} else {
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
- .addImm(NumLPads & 0xFFFF));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
+ .addImm(NumLPads & 0xFFFF)
+ .add(predOps(ARMCC::AL));
unsigned VReg2 = VReg1;
if ((NumLPads & 0xFFFF0000) != 0) {
VReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
- .addReg(VReg1)
- .addImm(NumLPads >> 16));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
+ .addReg(VReg1)
+ .addImm(NumLPads >> 16)
+ .add(predOps(ARMCC::AL));
}
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
- .addReg(NewVReg1)
- .addReg(VReg2));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
+ .addReg(NewVReg1)
+ .addReg(VReg2)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
@@ -7919,16 +8127,17 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(ARM::CPSR);
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg3)
- .addJumpTableIndex(MJTI));
+ BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
+ .addJumpTableIndex(MJTI)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultCC(
- AddDefaultPred(
- BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
+ BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
.addReg(NewVReg3, RegState::Kill)
.addReg(NewVReg1)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
.addReg(NewVReg4, RegState::Kill)
@@ -7936,15 +8145,17 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addJumpTableIndex(MJTI);
} else if (Subtarget->isThumb()) {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
- .addFrameIndex(FI)
- .addImm(1)
- .addMemOperand(FIMMOLd));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(1)
+ .addMemOperand(FIMMOLd)
+ .add(predOps(ARMCC::AL));
if (NumLPads < 256) {
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
- .addReg(NewVReg1)
- .addImm(NumLPads));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+ .addReg(NewVReg1)
+ .addImm(NumLPads)
+ .add(predOps(ARMCC::AL));
} else {
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -7957,12 +8168,14 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
- .addReg(VReg1, RegState::Define)
- .addConstantPoolIndex(Idx));
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
- .addReg(NewVReg1)
- .addReg(VReg1));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
+ .addReg(VReg1, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .add(predOps(ARMCC::AL));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
+ .addReg(NewVReg1)
+ .addReg(VReg1)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
@@ -7971,37 +8184,42 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(ARM::CPSR);
unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg1)
- .addImm(2));
+ BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg1)
+ .addImm(2)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
- .addJumpTableIndex(MJTI));
+ BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+ .addJumpTableIndex(MJTI)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg2, RegState::Kill)
- .addReg(NewVReg3));
+ BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3)
+ .add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
- .addReg(NewVReg4, RegState::Kill)
- .addImm(0)
- .addMemOperand(JTMMOLd));
+ BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+ .addReg(NewVReg4, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(JTMMOLd)
+ .add(predOps(ARMCC::AL));
unsigned NewVReg6 = NewVReg5;
if (IsPositionIndependent) {
NewVReg6 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
- .addReg(ARM::CPSR, RegState::Define)
- .addReg(NewVReg5, RegState::Kill)
- .addReg(NewVReg3));
+ BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg5, RegState::Kill)
+ .addReg(NewVReg3)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
@@ -8009,31 +8227,36 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addJumpTableIndex(MJTI);
} else {
unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
- .addFrameIndex(FI)
- .addImm(4)
- .addMemOperand(FIMMOLd));
+ BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(FIMMOLd)
+ .add(predOps(ARMCC::AL));
if (NumLPads < 256) {
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
- .addReg(NewVReg1)
- .addImm(NumLPads));
+ BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+ .addReg(NewVReg1)
+ .addImm(NumLPads)
+ .add(predOps(ARMCC::AL));
} else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
- .addImm(NumLPads & 0xFFFF));
+ BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
+ .addImm(NumLPads & 0xFFFF)
+ .add(predOps(ARMCC::AL));
unsigned VReg2 = VReg1;
if ((NumLPads & 0xFFFF0000) != 0) {
VReg2 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
- .addReg(VReg1)
- .addImm(NumLPads >> 16));
+ BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
+ .addReg(VReg1)
+ .addImm(NumLPads >> 16)
+ .add(predOps(ARMCC::AL));
}
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
- .addReg(NewVReg1)
- .addReg(VReg2));
+ BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+ .addReg(NewVReg1)
+ .addReg(VReg2)
+ .add(predOps(ARMCC::AL));
} else {
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -8046,13 +8269,15 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
- .addReg(VReg1, RegState::Define)
- .addConstantPoolIndex(Idx)
- .addImm(0));
- AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
- .addReg(NewVReg1)
- .addReg(VReg1, RegState::Kill));
+ BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
+ .addReg(VReg1, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
+ .addReg(NewVReg1)
+ .addReg(VReg1, RegState::Kill)
+ .add(predOps(ARMCC::AL));
}
BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
@@ -8061,23 +8286,25 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(ARM::CPSR);
unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
- AddDefaultCC(
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
- .addReg(NewVReg1)
- .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+ BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
+ .addReg(NewVReg1)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
- .addJumpTableIndex(MJTI));
+ BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
+ .addJumpTableIndex(MJTI)
+ .add(predOps(ARMCC::AL));
MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
- AddDefaultPred(
- BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
- .addReg(NewVReg3, RegState::Kill)
- .addReg(NewVReg4)
- .addImm(0)
- .addMemOperand(JTMMOLd));
+ BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
+ .addReg(NewVReg3, RegState::Kill)
+ .addReg(NewVReg4)
+ .addImm(0)
+ .addMemOperand(JTMMOLd)
+ .add(predOps(ARMCC::AL));
if (IsPositionIndependent) {
BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
@@ -8222,26 +8449,35 @@ static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
assert(LdOpc != 0 && "Should have a load opcode");
if (LdSize >= 8) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrOut, RegState::Define).addReg(AddrIn)
- .addImm(0));
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define)
+ .addReg(AddrIn)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb1) {
// load + update AddrIn
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrIn).addImm(0));
- MachineInstrBuilder MIB =
- BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(AddrIn).addImm(LdSize);
- AddDefaultPred(MIB);
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrIn)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
+ .add(t1CondCodeOp())
+ .addReg(AddrIn)
+ .addImm(LdSize)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb2) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrOut, RegState::Define).addReg(AddrIn)
- .addImm(LdSize));
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define)
+ .addReg(AddrIn)
+ .addImm(LdSize)
+ .add(predOps(ARMCC::AL));
} else { // arm
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
- .addReg(AddrOut, RegState::Define).addReg(AddrIn)
- .addReg(0).addImm(LdSize));
+ BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+ .addReg(AddrOut, RegState::Define)
+ .addReg(AddrIn)
+ .addReg(0)
+ .addImm(LdSize)
+ .add(predOps(ARMCC::AL));
}
}
@@ -8254,24 +8490,36 @@ static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
assert(StOpc != 0 && "Should have a store opcode");
if (StSize >= 8) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
- .addReg(AddrIn).addImm(0).addReg(Data));
+ BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(AddrIn)
+ .addImm(0)
+ .addReg(Data)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb1) {
// store + update AddrIn
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
- .addReg(AddrIn).addImm(0));
- MachineInstrBuilder MIB =
- BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(AddrIn).addImm(StSize);
- AddDefaultPred(MIB);
+ BuildMI(*BB, Pos, dl, TII->get(StOpc))
+ .addReg(Data)
+ .addReg(AddrIn)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
+ .add(t1CondCodeOp())
+ .addReg(AddrIn)
+ .addImm(StSize)
+ .add(predOps(ARMCC::AL));
} else if (IsThumb2) {
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
- .addReg(Data).addReg(AddrIn).addImm(StSize));
+ BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(Data)
+ .addReg(AddrIn)
+ .addImm(StSize)
+ .add(predOps(ARMCC::AL));
} else { // arm
- AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
- .addReg(Data).addReg(AddrIn).addReg(0)
- .addImm(StSize));
+ BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+ .addReg(Data)
+ .addReg(AddrIn)
+ .addReg(0)
+ .addImm(StSize)
+ .add(predOps(ARMCC::AL));
}
}
@@ -8402,16 +8650,15 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned Vtmp = varEnd;
if ((LoopSize & 0xFFFF0000) != 0)
Vtmp = MRI.createVirtualRegister(TRC);
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16),
- Vtmp).addImm(LoopSize & 0xFFFF));
+ BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
+ .addImm(LoopSize & 0xFFFF)
+ .add(predOps(ARMCC::AL));
if ((LoopSize & 0xFFFF0000) != 0)
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16),
- varEnd)
- .addReg(Vtmp)
- .addImm(LoopSize >> 16));
+ BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
+ .addReg(Vtmp)
+ .addImm(LoopSize >> 16)
+ .add(predOps(ARMCC::AL));
} else {
MachineConstantPool *ConstantPool = MF->getConstantPool();
Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -8424,11 +8671,16 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
if (IsThumb)
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
- varEnd, RegState::Define).addConstantPoolIndex(Idx));
+ BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
+ .addReg(varEnd, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .add(predOps(ARMCC::AL));
else
- AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
- varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
+ BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
+ .addReg(varEnd, RegState::Define)
+ .addConstantPoolIndex(Idx)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
}
BB->addSuccessor(loopMBB);
@@ -8465,16 +8717,19 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
// Decrement loop variable by UnitSize.
if (IsThumb1) {
- MachineInstrBuilder MIB =
- BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(varPhi).addImm(UnitSize);
- AddDefaultPred(MIB);
+ BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
+ .add(t1CondCodeOp())
+ .addReg(varPhi)
+ .addImm(UnitSize)
+ .add(predOps(ARMCC::AL));
} else {
MachineInstrBuilder MIB =
BuildMI(*BB, BB->end(), dl,
TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
- AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+ MIB.addReg(varPhi)
+ .addImm(UnitSize)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
MIB->getOperand(5).setReg(ARM::CPSR);
MIB->getOperand(5).setIsDef(true);
}
@@ -8545,11 +8800,14 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
case CodeModel::Default:
case CodeModel::Kernel:
BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addExternalSymbol("__chkstk")
- .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit | RegState::Define)
- .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+ .add(predOps(ARMCC::AL))
+ .addExternalSymbol("__chkstk")
+ .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+ .addReg(ARM::R12,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .addReg(ARM::CPSR,
+ RegState::Implicit | RegState::Define | RegState::Dead);
break;
case CodeModel::Large:
case CodeModel::JITDefault: {
@@ -8559,20 +8817,24 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
.addExternalSymbol("__chkstk");
BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
- .addImm((unsigned)ARMCC::AL).addReg(0)
- .addReg(Reg, RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
- .addReg(ARM::R4, RegState::Implicit | RegState::Define)
- .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead);
+ .add(predOps(ARMCC::AL))
+ .addReg(Reg, RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Kill)
+ .addReg(ARM::R4, RegState::Implicit | RegState::Define)
+ .addReg(ARM::R12,
+ RegState::Implicit | RegState::Define | RegState::Dead)
+ .addReg(ARM::CPSR,
+ RegState::Implicit | RegState::Define | RegState::Dead);
break;
}
}
- AddDefaultCC(AddDefaultPred(BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr),
- ARM::SP)
- .addReg(ARM::SP, RegState::Kill)
- .addReg(ARM::R4, RegState::Kill)
- .setMIFlags(MachineInstr::FrameSetup)));
+ BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
+ .addReg(ARM::SP, RegState::Kill)
+ .addReg(ARM::R4, RegState::Kill)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
MI.eraseFromParent();
return MBB;
@@ -8597,9 +8859,10 @@ ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
MF->push_back(TrapBB);
MBB->addSuccessor(TrapBB);
- AddDefaultPred(BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
- .addReg(MI.getOperand(0).getReg())
- .addImm(0));
+ BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
+ .addReg(MI.getOperand(0).getReg())
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
.addMBB(TrapBB)
.addImm(ARMCC::EQ)
@@ -8617,18 +8880,18 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool isThumb2 = Subtarget->isThumb2();
switch (MI.getOpcode()) {
default: {
- MI.dump();
+ MI.print(errs());
llvm_unreachable("Unexpected instr type to insert");
}
// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
- .addOperand(MI.getOperand(1)) // Rn_wb
- .addOperand(MI.getOperand(2)) // Rn
- .addOperand(MI.getOperand(3)) // PredImm
- .addOperand(MI.getOperand(4)) // PredReg
- .addOperand(MI.getOperand(0)); // Rt
+ .add(MI.getOperand(1)) // Rn_wb
+ .add(MI.getOperand(2)) // Rn
+ .add(MI.getOperand(3)) // PredImm
+ .add(MI.getOperand(4)) // PredReg
+ .add(MI.getOperand(0)); // Rt
MI.eraseFromParent();
return BB;
}
@@ -8659,12 +8922,12 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineMemOperand *MMO = *MI.memoperands_begin();
BuildMI(*BB, MI, dl, TII->get(NewOpc))
- .addOperand(MI.getOperand(0)) // Rn_wb
- .addOperand(MI.getOperand(1)) // Rt
- .addOperand(MI.getOperand(2)) // Rn
- .addImm(Offset) // offset (skip GPR==zero_reg)
- .addOperand(MI.getOperand(5)) // pred
- .addOperand(MI.getOperand(6))
+ .add(MI.getOperand(0)) // Rn_wb
+ .add(MI.getOperand(1)) // Rt
+ .add(MI.getOperand(2)) // Rn
+ .addImm(Offset) // offset (skip GPR==zero_reg)
+ .add(MI.getOperand(5)) // pred
+ .add(MI.getOperand(6))
.addMemOperand(MMO);
MI.eraseFromParent();
return BB;
@@ -8681,7 +8944,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
for (unsigned i = 0; i < MI.getNumOperands(); ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MI.eraseFromParent();
return BB;
}
@@ -8754,18 +9017,20 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
unsigned LHS1 = MI.getOperand(1).getReg();
unsigned LHS2 = MI.getOperand(2).getReg();
if (RHSisZero) {
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(LHS1).addImm(0));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(LHS1)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(LHS2).addImm(0)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
} else {
unsigned RHS1 = MI.getOperand(3).getReg();
unsigned RHS2 = MI.getOperand(4).getReg();
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
- .addReg(LHS1).addReg(RHS1));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+ .addReg(LHS1)
+ .addReg(RHS1)
+ .add(predOps(ARMCC::AL));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(LHS2).addReg(RHS2)
.addImm(ARMCC::EQ).addReg(ARM::CPSR);
@@ -8779,7 +9044,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
.addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
if (isThumb2)
- AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+ BuildMI(BB, dl, TII->get(ARM::t2B))
+ .addMBB(exitMBB)
+ .add(predOps(ARMCC::AL));
else
BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
@@ -8842,9 +9109,10 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
RSBBB->addSuccessor(SinkBB);
// insert a cmp at the end of BB
- AddDefaultPred(BuildMI(BB, dl,
- TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
- .addReg(ABSSrcReg).addImm(0));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(ABSSrcReg)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
// insert a bcc with opposite CC to ARMCC::MI at the end of BB
BuildMI(BB, dl,
@@ -8855,9 +9123,11 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Note: BCC and rsbri will be converted into predicated rsbmi
// by if-conversion pass
BuildMI(*RSBBB, RSBBB->begin(), dl,
- TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
- .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
- .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+ TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+ .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
+ .addImm(0)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
// insert PHI in SinkBB,
// reuse ABSDstReg to not change uses of ABS instruction
@@ -8927,19 +9197,45 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// Rename pseudo opcodes.
unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
+ unsigned ccOutIdx;
if (NewOpc) {
const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
MCID = &TII->get(NewOpc);
- assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
- "converted opcode should be the same except for cc_out");
+ assert(MCID->getNumOperands() ==
+ MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
+ && "converted opcode should be the same except for cc_out"
+ " (and, on Thumb1, pred)");
MI.setDesc(*MCID);
// Add the optional cc_out operand
MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
- }
- unsigned ccOutIdx = MCID->getNumOperands() - 1;
+
+ // On Thumb1, move all input operands to the end, then add the predicate
+ if (Subtarget->isThumb1Only()) {
+ for (unsigned c = MCID->getNumOperands() - 4; c--;) {
+ MI.addOperand(MI.getOperand(1));
+ MI.RemoveOperand(1);
+ }
+
+ // Restore the ties
+ for (unsigned i = MI.getNumOperands(); i--;) {
+ const MachineOperand& op = MI.getOperand(i);
+ if (op.isReg() && op.isUse()) {
+ int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
+ if (DefIdx != -1)
+ MI.tieOperands(DefIdx, i);
+ }
+ }
+
+ MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
+ MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
+ ccOutIdx = 1;
+ } else
+ ccOutIdx = MCID->getNumOperands() - 1;
+ } else
+ ccOutIdx = MCID->getNumOperands() - 1;
// Any ARM instruction that sets the 's' bit should specify an optional
// "cc_out" operand in the last operand position.
@@ -8970,7 +9266,9 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
if (deadCPSR) {
assert(!MI.getOperand(ccOutIdx).getReg() &&
"expect uninitialized optional cc_out operand");
- return;
+ // Thumb1 instructions must have the S bit even if the CPSR is dead.
+ if (!Subtarget->isThumb1Only())
+ return;
}
// If this instruction was defined with an optional CPSR def and its dag node
@@ -9032,7 +9330,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
SDLoc dl(N);
EVT VT = N->getValueType(0);
CC = N->getOperand(0);
- if (CC.getValueType() != MVT::i1)
+ if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
return false;
Invert = !AllOnes;
if (AllOnes)
@@ -9265,8 +9563,11 @@ AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
- // Don't generate vpaddl+vmovn; we'll match it to vpadd later.
- if (Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
+ // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
+ // we're using the entire input vector, otherwise there's a size/legality
+ // mismatch somewhere.
+ if (nextIndex != Vec.getValueType().getVectorNumElements() ||
+ Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
return SDValue();
// Create VPADDL node.
@@ -9308,10 +9609,90 @@ static SDValue findMUL_LOHI(SDValue V) {
return SDValue();
}
-static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+
+ if (Subtarget->isThumb()) {
+ if (!Subtarget->hasDSP())
+ return SDValue();
+ } else if (!Subtarget->hasV5TEOps())
+ return SDValue();
+
+ // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
+ // accumulates the product into a 64-bit value. The 16-bit values will
+ // be sign-extended or SRA'd into 32-bit values, giving the pattern
+ // (addc (mul 16bit, 16bit), lo) glued to (adde (sra mul, 31), hi)
+ SDValue Mul = AddcNode->getOperand(0);
+ SDValue Lo = AddcNode->getOperand(1);
+ if (Mul.getOpcode() != ISD::MUL) {
+ Lo = AddcNode->getOperand(0);
+ Mul = AddcNode->getOperand(1);
+ if (Mul.getOpcode() != ISD::MUL)
+ return SDValue();
+ }
+
+ SDValue SRA = AddeNode->getOperand(0);
+ SDValue Hi = AddeNode->getOperand(1);
+ if (SRA.getOpcode() != ISD::SRA) {
+ SRA = AddeNode->getOperand(1);
+ Hi = AddeNode->getOperand(0);
+ if (SRA.getOpcode() != ISD::SRA)
+ return SDValue();
+ }
+ if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
+ if (Const->getZExtValue() != 31)
+ return SDValue();
+ } else
+ return SDValue();
+
+ if (SRA.getOperand(0) != Mul)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc dl(AddcNode);
+ unsigned Opcode = 0;
+ SDValue Op0;
+ SDValue Op1;
+
+ if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
+ Opcode = ARMISD::SMLALBB;
+ Op0 = Mul.getOperand(0);
+ Op1 = Mul.getOperand(1);
+ } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
+ Opcode = ARMISD::SMLALBT;
+ Op0 = Mul.getOperand(0);
+ Op1 = Mul.getOperand(1).getOperand(0);
+ } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
+ Opcode = ARMISD::SMLALTB;
+ Op0 = Mul.getOperand(0).getOperand(0);
+ Op1 = Mul.getOperand(1);
+ } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
+ Opcode = ARMISD::SMLALTT;
+ Op0 = Mul->getOperand(0).getOperand(0);
+ Op1 = Mul->getOperand(1).getOperand(0);
+ }
+
+ if (!Op0 || !Op1)
+ return SDValue();
+
+ SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+ Op0, Op1, Lo, Hi);
+  // Replace the ADD nodes' uses with the SMLAL node's values.
+ SDValue HiMLALResult(SMLAL.getNode(), 1);
+ SDValue LoMLALResult(SMLAL.getNode(), 0);
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+ // Return original node to notify the driver to stop replacing.
+ SDValue resNode(AddcNode, 0);
+ return resNode;
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
-
// Look for multiply add opportunities.
  // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
  // each add node consumes a value from ISD::UMUL_LOHI and there is
@@ -9326,7 +9707,17 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
// \ /
// ADDC <- hiAdd
//
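+  // For example (illustrative), with a, b : uint32_t and acc : uint64_t,
+  // "uint64_t r = (uint64_t)a * b + acc" lowers to this pattern and becomes
+  // a single UMLAL.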
- assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+ assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
+
+ assert(AddeNode->getNumOperands() == 3 &&
+ AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+ "ADDE node has the wrong inputs");
+
+ // Check that we have a glued ADDC node.
+ SDNode* AddcNode = AddeNode->getOperand(2).getNode();
+ if (AddcNode->getOpcode() != ARMISD::ADDC)
+ return SDValue();
+
SDValue AddcOp0 = AddcNode->getOperand(0);
SDValue AddcOp1 = AddcNode->getOperand(1);
@@ -9338,29 +9729,13 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
AddcNode->getValueType(0) == MVT::i32 &&
"Expect ADDC with two result values. First: i32");
- // Check that we have a glued ADDC node.
- if (AddcNode->getValueType(1) != MVT::Glue)
- return SDValue();
-
- // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
+  // may be an SMLAL which multiplies two 16-bit values.
if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
AddcOp1->getOpcode() != ISD::SMUL_LOHI)
- return SDValue();
-
- // Look for the glued ADDE.
- SDNode* AddeNode = AddcNode->getGluedUser();
- if (!AddeNode)
- return SDValue();
-
- // Make sure it is really an ADDE.
- if (AddeNode->getOpcode() != ISD::ADDE)
- return SDValue();
-
- assert(AddeNode->getNumOperands() == 3 &&
- AddeNode->getOperand(2).getValueType() == MVT::Glue &&
- "ADDE node has the wrong inputs");
+ return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
// Check for the triangle shape.
SDValue AddeOp0 = AddeNode->getOperand(0);
@@ -9435,38 +9810,25 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
// Return original node to notify the driver to stop replacing.
- SDValue resNode(AddcNode, 0);
- return resNode;
+ return SDValue(AddeNode, 0);
}
-static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
+static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// UMAAL is similar to UMLAL except that it adds two unsigned values.
// While trying to combine for the other MLAL nodes, first search for the
- // chance to use UMAAL. Check if Addc uses another addc node which can first
- // be combined into a UMLAL. The other pattern is AddcNode being combined
- // into an UMLAL and then using another addc is handled in ISelDAGToDAG.
-
- if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() ||
- (Subtarget->isThumb() && !Subtarget->hasThumb2()))
- return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
-
- SDNode *PrevAddc = nullptr;
- if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
- PrevAddc = AddcNode->getOperand(0).getNode();
- else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
- PrevAddc = AddcNode->getOperand(1).getNode();
-
- // If there's no addc chains, just return a search for any MLAL.
- if (PrevAddc == nullptr)
- return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
-
- // Try to convert the addc operand to an MLAL and if that fails try to
- // combine AddcNode.
- SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
- if (MLAL != SDValue(PrevAddc, 0))
- return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+ // chance to use UMAAL. Check if Addc uses a node which has already
+ // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
+ // as the addend, and it's handled in PerformUMLALCombine.
+
+ if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
+ return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
+
+ // Check that we have a glued ADDC node.
+ SDNode* AddcNode = AddeNode->getOperand(2).getNode();
+ if (AddcNode->getOpcode() != ARMISD::ADDC)
+ return SDValue();
// Find the converted UMAAL or quit if it doesn't exist.
SDNode *UmlalNode = nullptr;
@@ -9478,29 +9840,18 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
UmlalNode = AddcNode->getOperand(1).getNode();
AddHi = AddcNode->getOperand(0);
} else {
- return SDValue();
+ return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
}
// The ADDC should be glued to an ADDE node, which uses the same UMLAL as
// the ADDC as well as Zero.
- auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
-
- if (!Zero || Zero->getZExtValue() != 0)
- return SDValue();
-
- // Check that we have a glued ADDC node.
- if (AddcNode->getValueType(1) != MVT::Glue)
- return SDValue();
-
- // Look for the glued ADDE.
- SDNode* AddeNode = AddcNode->getGluedUser();
- if (!AddeNode)
+ if (!isNullConstant(UmlalNode->getOperand(3)))
return SDValue();
- if ((AddeNode->getOperand(0).getNode() == Zero &&
+ if ((isNullConstant(AddeNode->getOperand(0)) &&
AddeNode->getOperand(1).getNode() == UmlalNode) ||
(AddeNode->getOperand(0).getNode() == UmlalNode &&
- AddeNode->getOperand(1).getNode() == Zero)) {
+ isNullConstant(AddeNode->getOperand(1)))) {
SelectionDAG &DAG = DCI.DAG;
SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
@@ -9513,19 +9864,84 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
// Return original node to notify the driver to stop replacing.
- return SDValue(AddcNode, 0);
+ return SDValue(AddeNode, 0);
}
return SDValue();
}
-/// PerformADDCCombine - Target-specific dag combine transform from
-/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
-/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
-static SDValue PerformADDCCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const ARMSubtarget *Subtarget) {
+static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
+ return SDValue();
+
+ // Check that we have a pair of ADDC and ADDE as operands.
+ // Both addends of the ADDE must be zero.
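+  // That is (illustrative): for UMLAL(a, b, ADDC(x, y), ADDE(0, 0, carry)),
+  // the 64-bit result is a*b + x + y, which is exactly UMAAL(a, b, x, y).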
+ SDNode* AddcNode = N->getOperand(2).getNode();
+ SDNode* AddeNode = N->getOperand(3).getNode();
+ if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
+ (AddeNode->getOpcode() == ARMISD::ADDE) &&
+ isNullConstant(AddeNode->getOperand(0)) &&
+ isNullConstant(AddeNode->getOperand(1)) &&
+ (AddeNode->getOperand(2).getNode() == AddcNode))
+ return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
+ DAG.getVTList(MVT::i32, MVT::i32),
+ {N->getOperand(0), N->getOperand(1),
+ AddcNode->getOperand(0), AddcNode->getOperand(1)});
+ else
+ return SDValue();
+}
+
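+// Thumb1 ADDS/SUBS can only encode small positive immediates, so an
+// ADDC/SUBC with a negative immediate is rewritten as the opposite opcode
+// with the negated immediate, e.g. (ADDC x, -4) -> (SUBC x, 4). INT_MIN is
+// excluded because it has no positive 32-bit negation.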
+static SDValue PerformAddcSubcCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (Subtarget->isThumb1Only()) {
+ SDValue RHS = N->getOperand(1);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ int32_t imm = C->getSExtValue();
+ if (imm < 0 && imm > INT_MIN) {
+ SDLoc DL(N);
+ RHS = DAG.getConstant(-imm, DL, MVT::i32);
+ unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
+ : ARMISD::ADDC;
+ return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
+ }
+ }
+ }
+ return SDValue();
+}
- if (Subtarget->isThumb1Only()) return SDValue();
+static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ if (Subtarget->isThumb1Only()) {
+ SDValue RHS = N->getOperand(1);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+ int64_t imm = C->getSExtValue();
+ if (imm < 0) {
+ SDLoc DL(N);
+
+ // The with-carry-in form matches bitwise not instead of the negation.
+ // Effectively, the inverse interpretation of the carry flag already
+ // accounts for part of the negation.
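+      // For example (illustrative): ADDE(x, -4, carry) = x - 4 + carry, and
+      // SUBE(x, 3, carry) = x - 3 - (1 - carry) = x - 4 + carry, where
+      // ~(-4) == 3.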
+ RHS = DAG.getConstant(~imm, DL, MVT::i32);
+
+ unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
+ : ARMISD::ADDE;
+ return DAG.getNode(Opcode, DL, N->getVTList(),
+ N->getOperand(0), RHS, N->getOperand(2));
+ }
+ }
+ }
+ return SDValue();
+}
+
+/// PerformADDECombine - Target-specific dag combine transform from
+/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
+/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
+static SDValue PerformADDECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Only ARM and Thumb2 support UMLAL/SMLAL.
+ if (Subtarget->isThumb1Only())
+ return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
// Only perform the checks after legalize when the pattern is available.
if (DCI.isBeforeLegalize()) return SDValue();
@@ -9722,7 +10138,6 @@ static SDValue PerformMULCombine(SDNode *N,
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
-
// Attempt to use immediate-form VBIC
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
SDLoc dl(N);
@@ -9761,6 +10176,67 @@ static SDValue PerformANDCombine(SDNode *N,
return SDValue();
}
+// Try combining OR nodes to SMULWB, SMULWT.
+static SDValue PerformORCombineToSMULWBT(SDNode *OR,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasV6Ops() ||
+ (Subtarget->isThumb() &&
+ (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
+ return SDValue();
+
+ SDValue SRL = OR->getOperand(0);
+ SDValue SHL = OR->getOperand(1);
+
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+ SRL = OR->getOperand(1);
+ SHL = OR->getOperand(0);
+ }
+ if (!isSRL16(SRL) || !isSHL16(SHL))
+ return SDValue();
+
+ // The first operands to the shifts need to be the two results from the
+ // same smul_lohi node.
+ if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+ SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+ return SDValue();
+
+ SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+ if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+ SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+ return SDValue();
+
+ // Now we have:
+  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
+  // For SMULW[B|T], smul_lohi takes a 32-bit and a 16-bit argument.
+  // For SMULWB, the 16-bit value will be sign-extended somehow.
+  // For SMULWT, only the SRA is required.
+  // Check both sides of SMUL_LOHI.
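+  // For example (illustrative), with x : int32_t and y : int16_t,
+  // "(int32_t)(((int64_t)x * y) >> 16)" matches this pattern as SMULWB.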
+ SDValue OpS16 = SMULLOHI->getOperand(0);
+ SDValue OpS32 = SMULLOHI->getOperand(1);
+
+ SelectionDAG &DAG = DCI.DAG;
+ if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
+ OpS16 = OpS32;
+ OpS32 = SMULLOHI->getOperand(0);
+ }
+
+ SDLoc dl(OR);
+ unsigned Opcode = 0;
+ if (isS16(OpS16, DAG))
+ Opcode = ARMISD::SMULWB;
+ else if (isSRA16(OpS16)) {
+ Opcode = ARMISD::SMULWT;
+ OpS16 = OpS16->getOperand(0);
+ }
+ else
+ return SDValue();
+
+ SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
+ return SDValue(OR, 0);
+}
+
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
@@ -9798,6 +10274,8 @@ static SDValue PerformORCombine(SDNode *N,
// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
+ if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
+ return Result;
}
// The code below optimizes (or (and X, Y), Z).
@@ -9906,7 +10384,7 @@ static SDValue PerformORCombine(SDNode *N,
(Mask == ~Mask2)) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
- if (Subtarget->hasT2ExtractPack() &&
+ if (Subtarget->hasDSP() &&
(Mask == 0xffff || Mask == 0xffff0000))
return SDValue();
// 2a
@@ -9922,7 +10400,7 @@ static SDValue PerformORCombine(SDNode *N,
(~Mask == Mask2)) {
// The pack halfword instruction works better for masks that fit it,
// so use that when it's available.
- if (Subtarget->hasT2ExtractPack() &&
+ if (Subtarget->hasDSP() &&
(Mask2 == 0xffff || Mask2 == 0xffff0000))
return SDValue();
// 2b
@@ -10485,11 +10963,8 @@ static SDValue CombineBaseUpdate(SDNode *N,
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
- if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
- uint64_t IncVal = CInc->getZExtValue();
- if (IncVal != NumBytes)
- continue;
- } else if (NumBytes >= 3 * 16) {
+ ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+ if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
// VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
// separate instructions that make it harder to use a non-constant update.
continue;
@@ -11306,34 +11781,6 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
- APInt &KnownOne) {
- if (Op.getOpcode() == ARMISD::BFI) {
- // Conservatively, we can recurse down the first operand
- // and just mask out all affected bits.
- computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
-
- // The operand to BFI is already a mask suitable for removing the bits it
- // sets.
- ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
- const APInt &Mask = CI->getAPIntValue();
- KnownZero &= Mask;
- KnownOne &= Mask;
- return;
- }
- if (Op.getOpcode() == ARMISD::CMOV) {
- APInt KZ2(KnownZero.getBitWidth(), 0);
- APInt KO2(KnownOne.getBitWidth(), 0);
- computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
- computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
-
- KnownZero &= KZ2;
- KnownOne &= KO2;
- return;
- }
- return DAG.computeKnownBits(Op, KnownZero, KnownOne);
-}
-
SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
// If we have a CMOV, OR and AND combination such as:
// if (x & CN)
@@ -11394,9 +11841,9 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D
// Lastly, can we determine that the bits defined by OrCI
// are zero in Y?
- APInt KnownZero, KnownOne;
- computeKnownBits(DAG, Y, KnownZero, KnownOne);
- if ((OrCI & KnownZero) != OrCI)
+ KnownBits Known;
+ DAG.computeKnownBits(Y, Known);
+ if ((OrCI & Known.Zero) != OrCI)
return SDValue();
// OK, we can do the combine.
@@ -11534,16 +11981,16 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
}
if (Res.getNode()) {
- APInt KnownZero, KnownOne;
- DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
+ KnownBits Known;
+ DAG.computeKnownBits(SDValue(N,0), Known);
// Capture demanded bits information that would be otherwise lost.
- if (KnownZero == 0xfffffffe)
+ if (Known.Zero == 0xfffffffe)
Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
DAG.getValueType(MVT::i1));
- else if (KnownZero == 0xffffff00)
+ else if (Known.Zero == 0xffffff00)
Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
DAG.getValueType(MVT::i8));
- else if (KnownZero == 0xffff0000)
+ else if (Known.Zero == 0xffff0000)
Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
DAG.getValueType(MVT::i16));
}
@@ -11555,13 +12002,17 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
- case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget);
+ case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
+ case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
case ISD::SUB: return PerformSUBCombine(N, DCI);
case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
+ case ARMISD::ADDC:
+ case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
@@ -11593,6 +12044,56 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
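+  // The SMULW/SMLAL nodes below read only 16 bits of one or both multiply
+  // operands, so only those bits are demanded here; this lets
+  // SimplifyDemandedBits strip extends and masks feeding the operands.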
+ case ARMISD::SMULWB: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMULWT: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALBB: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALBT: {
+ unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
+ APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
+ unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
+ APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALTB: {
+ unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
+ APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
+ unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
+ APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMLALTT: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
+ if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+ (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+ return SDValue();
+ break;
+ }
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -11688,12 +12189,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
}
}
- // Lowering to i32/i16 if the size permits.
- if (Size >= 4)
- return MVT::i32;
- else if (Size >= 2)
- return MVT::i16;
-
// Let the target-independent logic figure it out.
return MVT::Other;
}
@@ -12178,12 +12673,12 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
}
void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned BitWidth = KnownOne.getBitWidth();
- KnownZero = KnownOne = APInt(BitWidth, 0);
+ unsigned BitWidth = Known.getBitWidth();
+ Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case ARMISD::ADDC:
@@ -12193,17 +12688,18 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
// These nodes' second result is a boolean
if (Op.getResNo() == 0)
break;
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
break;
case ARMISD::CMOV: {
// Bits are known zero/one if known on the LHS and RHS.
- DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
- if (KnownZero == 0 && KnownOne == 0) return;
+ DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1);
+ if (Known.isUnknown())
+ return;
- APInt KnownZeroRHS, KnownOneRHS;
- DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
- KnownZero &= KnownZeroRHS;
- KnownOne &= KnownOneRHS;
+ KnownBits KnownRHS;
+ DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1);
+ Known.Zero &= KnownRHS.Zero;
+ Known.One &= KnownRHS.One;
return;
}
case ISD::INTRINSIC_W_CHAIN: {
@@ -12215,11 +12711,24 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case Intrinsic::arm_ldrex: {
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
}
+ case ARMISD::BFI: {
+ // Conservatively, we can recurse down the first operand
+ // and just mask out all affected bits.
+ DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+
+ // The operand to BFI is already a mask suitable for removing the bits it
+ // sets.
+ ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
+ const APInt &Mask = CI->getAPIntValue();
+ Known.Zero &= Mask;
+ Known.One &= Mask;
+ return;
+ }
}
}
@@ -12588,8 +13097,8 @@ static TargetLowering::ArgListTy getDivRemArgList(
Type *ArgTy = ArgVT.getTypeForEVT(*Context);
Entry.Node = N->getOperand(i);
Entry.Ty = ArgTy;
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = isSigned;
+ Entry.IsZExt = !isSigned;
Args.push_back(Entry);
}
if (Subtarget->isTargetWindows() && Args.size() >= 2)
@@ -12615,7 +13124,9 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
// rem = a - b * div
// return {div, rem}
// This should be lowered into UDIV/SDIV + MLS later on.
- if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() &&
+ bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
+ : Subtarget->hasDivideInARMMode();
+ if (hasDivide && Op->getValueType(0).isSimple() &&
Op->getSimpleValueType(0) == MVT::i32) {
unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
const SDValue Dividend = Op->getOperand(0);
@@ -12639,7 +13150,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = StructType::get(Ty, Ty);
if (Subtarget->isTargetWindows())
InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
@@ -12861,7 +13372,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::arm_stlexd:
- case Intrinsic::arm_strexd: {
+ case Intrinsic::arm_strexd:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(2);
@@ -12871,9 +13382,9 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.readMem = false;
Info.writeMem = true;
return true;
- }
+
case Intrinsic::arm_ldaexd:
- case Intrinsic::arm_ldrexd: {
+ case Intrinsic::arm_ldrexd:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(0);
@@ -12883,7 +13394,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.readMem = true;
Info.writeMem = false;
return true;
- }
+
default:
break;
}
@@ -12921,7 +13432,7 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
// Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
// here.
if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
- Function *MCR = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
+ Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
Builder.getInt32(0), Builder.getInt32(7),
Builder.getInt32(10), Builder.getInt32(5)};
@@ -12932,7 +13443,7 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
llvm_unreachable("makeDMB on a target so old that it has no barriers");
}
} else {
- Function *DMB = llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
+ Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
// Only a full system barrier exists in the M-class architectures.
Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
Constant *CDomain = Builder.getInt32(Domain);
@@ -12941,9 +13452,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
}
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -12952,7 +13463,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
case AtomicOrdering::Acquire:
return nullptr; // Nothing to do
case AtomicOrdering::SequentiallyConsistent:
- if (!IsStore)
+ if (!Inst->hasAtomicStore())
return nullptr; // Nothing to do
/*FALLTHROUGH*/
case AtomicOrdering::Release:
@@ -12966,9 +13477,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
llvm_unreachable("Unknown fence ordering in emitLeadingFence");
}
-Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -13089,7 +13600,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
if (ValTy->getPrimitiveSizeInBits() == 64) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
- Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
+ Function *Ldrex = Intrinsic::getDeclaration(M, Int);
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
@@ -13106,7 +13617,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
- Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+ Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys);
return Builder.CreateTruncOrBitCast(
Builder.CreateCall(Ldrex, Addr),
@@ -13118,7 +13629,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
if (!Subtarget->hasV7Ops())
return;
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex));
}
Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
@@ -13154,6 +13665,39 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Addr});
}
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+unsigned
+ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const {
+ return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
+bool ARMTargetLowering::isLegalInterleavedAccessType(
+ VectorType *VecTy, const DataLayout &DL) const {
+
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+ unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+  // Ensure the vector doesn't have f16 elements. Even though we could do an
+  // i16 vldN, we have no way to hold the f16 vectors and would end up
+  // converting via f32.
+ if (VecTy->getElementType()->isHalfTy())
+ return false;
+
+ // Ensure the number of vector elements is greater than 1.
+ if (VecTy->getNumElements() < 2)
+ return false;
+
+ // Ensure the element type is legal.
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32)
+ return false;
+
+ // Ensure the total vector size is 64 or a multiple of 128. Types larger than
+ // 128 will be split into multiple interleaved accesses.
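+  // For example, <8 x i8> (64 bits) and <16 x i16> (256 bits) are accepted,
+  // while <3 x i32> (96 bits) and <2 x i64> (64-bit elements) are rejected.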
+ return VecSize == 64 || VecSize % 128 == 0;
+}
+
/// \brief Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -13178,64 +13722,99 @@ bool ARMTargetLowering::lowerInterleavedLoad(
Type *EltTy = VecTy->getVectorElementType();
const DataLayout &DL = LI->getModule()->getDataLayout();
- unsigned VecSize = DL.getTypeSizeInBits(VecTy);
- bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
- // Skip if we do not have NEON and skip illegal vector types and vector types
- // with i64/f64 elements (vldN doesn't support i64/f64 elements).
- if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long
+  // as their total size is a multiple of 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
+ unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
if (EltTy->isPointerTy())
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ IRBuilder<> Builder(LI);
+
+ // The base address of the load.
+ Value *BaseAddr = LI->getPointerOperand();
+
+ if (NumLoads > 1) {
+ // If we're going to generate more than one load, reset the sub-vector type
+ // to something legal.
+ VecTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / NumLoads);
+
+ // We will compute the pointer operand of each load from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace()));
+ }
+
+ assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+ Type *Tys[] = {VecTy, Int8Ptr};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
Intrinsic::arm_neon_vld3,
Intrinsic::arm_neon_vld4};
+ Function *VldnFunc =
+ Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- IRBuilder<> Builder(LI);
- SmallVector<Value *, 2> Ops;
+ // Holds sub-vectors extracted from the load intrinsic return values. The
+ // sub-vectors are associated with the shufflevector instructions they will
+ // replace.
+ DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
- Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
- Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
- Ops.push_back(Builder.getInt32(LI->getAlignment()));
+ for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
- Type *Tys[] = { VecTy, Int8Ptr };
- Function *VldnFunc =
- Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+ // If we're generating more than one load, compute the base address of
+ // subsequent loads as an offset from the previous.
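+    // Each vldN consumes Factor sub-vectors, i.e. NumElts * Factor scalar
+    // elements, which is the element stride between consecutive loads.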
+ if (LoadCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(
+ BaseAddr, VecTy->getVectorNumElements() * Factor);
- // Replace uses of each shufflevector with the corresponding vector loaded
- // by ldN.
- for (unsigned i = 0; i < Shuffles.size(); i++) {
- ShuffleVectorInst *SV = Shuffles[i];
- unsigned Index = Indices[i];
+ SmallVector<Value *, 2> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+ Ops.push_back(Builder.getInt32(LI->getAlignment()));
- Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+ CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
- // Convert the integer vector to pointer vector if the element is pointer.
- if (EltTy->isPointerTy())
- SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+ // Replace uses of each shufflevector with the corresponding vector loaded
+ // by ldN.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SV = Shuffles[i];
+ unsigned Index = Indices[i];
- SV->replaceAllUsesWith(SubVec);
- }
+ Value *SubVec = Builder.CreateExtractValue(VldN, Index);
- return true;
-}
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(
+ SubVec, VectorType::get(SV->getType()->getVectorElementType(),
+ VecTy->getVectorNumElements()));
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
- unsigned NumElts) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumElts; i++)
- Mask.push_back(Builder.getInt32(Start + i));
+ SubVecs[SV].push_back(SubVec);
+ }
+ }
- return ConstantVector::get(Mask);
+ // Replace uses of the shufflevector instructions with the sub-vectors
+ // returned by the load intrinsic. If a shufflevector instruction is
+ // associated with more than one sub-vector, those sub-vectors will be
+ // concatenated into a single wide vector.
+ for (ShuffleVectorInst *SVI : Shuffles) {
+ auto &SubVec = SubVecs[SVI];
+ auto *WideVec =
+ SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+ SVI->replaceAllUsesWith(WideVec);
+ }
+
+ return true;
}
/// \brief Lower an interleaved store into a vstN intrinsic.
@@ -13279,15 +13858,15 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
- bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
- // Skip if we do not have NEON and skip illegal vector types and vector types
- // with i64/f64 elements (vstN doesn't support i64/f64 elements).
- if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
- EltIs64Bits)
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long
+  // as their total size is a multiple of 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
return false;
+ unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
@@ -13306,44 +13885,75 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = VectorType::get(IntTy, LaneLen);
}
+ // The base address of the store.
+ Value *BaseAddr = SI->getPointerOperand();
+
+ if (NumStores > 1) {
+ // If we're going to generate more than one store, reset the lane length
+ // and sub-vector type to something legal.
+ LaneLen /= NumStores;
+ SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+ // We will compute the pointer operand of each store from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace()));
+ }
+
+ assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
+
+ auto Mask = SVI->getShuffleMask();
+
+ Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+ Type *Tys[] = {Int8Ptr, SubVecTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
Intrinsic::arm_neon_vst3,
Intrinsic::arm_neon_vst4};
- SmallVector<Value *, 6> Ops;
- Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
- Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+ for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
- Type *Tys[] = { Int8Ptr, SubVecTy };
- Function *VstNFunc = Intrinsic::getDeclaration(
- SI->getModule(), StoreInts[Factor - 2], Tys);
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous one.
+ if (StoreCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
- // Split the shufflevector operands into sub vectors for the new vstN call.
- auto Mask = SVI->getShuffleMask();
- for (unsigned i = 0; i < Factor; i++) {
- if (Mask[i] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
- } else {
- unsigned StartMask = 0;
- for (unsigned j = 1; j < LaneLen; j++) {
- if (Mask[j*Factor + i] >= 0) {
- StartMask = Mask[j*Factor + i] - j;
- break;
+ SmallVector<Value *, 6> Ops;
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+
+ Function *VstNFunc =
+ Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+ // Split the shufflevector operands into sub vectors for the new vstN call.
+ for (unsigned i = 0; i < Factor; i++) {
+ unsigned IdxI = StoreCount * LaneLen * Factor + i;
+ if (Mask[IdxI] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+ if (Mask[IdxJ * Factor + IdxI] >= 0) {
+ StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+ break;
+ }
}
+        // Note: If all elements in a chunk are undefs, StartMask=0!
+        // Note: Filling undef gaps with random elements is ok, since
+        // those elements were being written anyway (with undefs).
+        // In the case of all undefs we default to using elements from 0.
+        // Note: StartMask cannot be negative; it's checked in
+        // isReInterleaveMask.
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
- // Note: If all elements in a chunk are undefs, StartMask=0!
- // Note: Filling undef gaps with random elements is ok, since
- // those elements were being written anyway (with undefs).
- // In the case of all undefs we're defaulting to using elems from 0
- // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
}
- }
- Ops.push_back(Builder.getInt32(SI->getAlignment()));
- Builder.CreateCall(VstNFunc, Ops);
+ Ops.push_back(Builder.getInt32(SI->getAlignment()));
+ Builder.CreateCall(VstNFunc, Ops);
+ }
return true;
}
@@ -13484,3 +14094,8 @@ void ARMTargetLowering::insertCopiesSplitCSR(
.addReg(NewVR);
}
}
+
+void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
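+  // All call frame setup/destroy pseudos are in place after instruction
+  // selection, so the maximum call frame size can be computed and cached
+  // for later frame lowering.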
+ MF.getFrameInfo().computeMaxCallFrameSize(MF);
+ TargetLoweringBase::finalizeLowering(MF);
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
index 84c6eb8..f05b142 100644
--- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -175,9 +175,19 @@ class InstrItineraryData;
VMULLs, // ...signed
VMULLu, // ...unsigned
+ SMULWB, // Signed multiply word by half word, bottom
+ SMULWT, // Signed multiply word by half word, top
UMLAL, // 64bit Unsigned Accumulate Multiply
SMLAL, // 64bit Signed Accumulate Multiply
UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
+ SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16
+ SMLALBT, // 64-bit signed accumulate multiply bottom, top 16
+ SMLALTB, // 64-bit signed accumulate multiply top, bottom 16
+ SMLALTT, // 64-bit signed accumulate multiply top, top 16
+ SMLALD, // Signed multiply accumulate long dual
+ SMLALDX, // Signed multiply accumulate long dual exchange
+ SMLSLD, // Signed multiply subtract long dual
+ SMLSLDX, // Signed multiply subtract long dual exchange
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
@@ -344,8 +354,8 @@ class InstrItineraryData;
SDValue &Offset, ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
- void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
- APInt &KnownOne,
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
@@ -473,10 +483,10 @@ class InstrItineraryData;
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
@@ -500,9 +510,19 @@ class InstrItineraryData;
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
unsigned &Cost) const override;
+ bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const SelectionDAG &DAG) const override {
+ // Do not merge to larger than i32.
+ return (MemVT.getSizeInBits() <= 32);
+ }
+
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
bool supportSwiftError() const override {
return true;
}
@@ -514,6 +534,19 @@ class InstrItineraryData;
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
+ /// Returns true if \p VecTy is a legal interleaved access type. This
+ /// function checks the vector element type and the overall width of the
+ /// vector.
+ bool isLegalInterleavedAccessType(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ /// Returns the number of interleaved accesses that will be generated when
+ /// lowering accesses of the given type.
+ unsigned getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ void finalizeLowering(MachineFunction &MF) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -569,6 +602,8 @@ class InstrItineraryData;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
@@ -690,7 +725,7 @@ class InstrItineraryData;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
- bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
SDValue ARMcc, SDValue CCR, SDValue Cmp,
@@ -698,7 +733,7 @@ class InstrItineraryData;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- const SDLoc &dl) const;
+ const SDLoc &dl, bool InvalidOnQNaN) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
index 488439f..1bbe7f0 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -184,7 +184,7 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
// ARM special operands for disassembly only.
//
-def SetEndAsmOperand : ImmAsmOperand {
+def SetEndAsmOperand : ImmAsmOperand<0,1> {
let Name = "SetEndImm";
let ParserMethod = "parseSetEndImm";
}
@@ -221,25 +221,25 @@ def banked_reg : Operand<i32> {
// 16 imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0>
// 32 imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
// 64 64 - <imm> is encoded in imm6<5:0>
-def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
+def shr_imm8_asm_operand : ImmAsmOperand<1,8> { let Name = "ShrImm8"; }
def shr_imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> {
let EncoderMethod = "getShiftRight8Imm";
let DecoderMethod = "DecodeShiftRight8Imm";
let ParserMatchClass = shr_imm8_asm_operand;
}
-def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
+def shr_imm16_asm_operand : ImmAsmOperand<1,16> { let Name = "ShrImm16"; }
def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> {
let EncoderMethod = "getShiftRight16Imm";
let DecoderMethod = "DecodeShiftRight16Imm";
let ParserMatchClass = shr_imm16_asm_operand;
}
-def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
+def shr_imm32_asm_operand : ImmAsmOperand<1,32> { let Name = "ShrImm32"; }
def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
let EncoderMethod = "getShiftRight32Imm";
let DecoderMethod = "DecodeShiftRight32Imm";
let ParserMatchClass = shr_imm32_asm_operand;
}
-def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
+def shr_imm64_asm_operand : ImmAsmOperand<1,64> { let Name = "ShrImm64"; }
def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
let EncoderMethod = "getShiftRight64Imm";
let DecoderMethod = "DecodeShiftRight64Imm";
@@ -261,10 +261,19 @@ def const_pool_asm_imm : Operand<i32> {
// Note: When EmitPriority == 1, the alias will be used for printing
class ARMInstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[IsARM]>;
+class ARMInstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>,
+ Requires<[IsARM,UseNegativeImmediates]>;
class tInstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb]>;
+class tInstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>,
+ Requires<[IsThumb,UseNegativeImmediates]>;
class t2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb2]>;
+class t2InstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>,
+ Requires<[IsThumb2,UseNegativeImmediates]>;
class VFP2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
: InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2]>;
class VFP2DPInstAlias<string Asm, dag Result, bit EmitPriority = 0>
@@ -948,7 +957,7 @@ class ADivA1I<bits<3> opcod, dag oops, dag iops,
}
// PKH instructions
-def PKHLSLAsmOperand : ImmAsmOperand {
+def PKHLSLAsmOperand : ImmAsmOperand<0,31> {
let Name = "PKHLSLImm";
let ParserMethod = "parsePKHLSLImm";
}
@@ -1013,9 +1022,6 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
}
-class Thumb2ExtractPat<dag pattern, dag result> : Pat<pattern, result> {
- list<Predicate> Predicates = [IsThumb2, HasT2ExtractPack];
-}
//===----------------------------------------------------------------------===//
// Thumb Instruction Format Definitions.
//
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index 27b6432..a0e2ac4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -32,8 +32,8 @@ using namespace llvm;
ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
: ARMBaseInstrInfo(STI), RI() {}
-/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
-void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+/// Return the noop instruction to use for a noop.
+void ARMInstrInfo::getNoop(MCInst &NopInst) const {
if (hasNOP()) {
NopInst.setOpcode(ARM::HINT);
NopInst.addOperand(MCOperand::createImm(0));
@@ -129,8 +129,9 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
MIB.addMemOperand(MMO);
- MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg);
- MIB.addReg(Reg, RegState::Kill).addImm(0);
- MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addImm(0)
+ .setMemRefs(MI->memoperands_begin(), MI->memoperands_end())
+ .add(predOps(ARMCC::AL));
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
index 4b1b709..c87fb97 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.h
@@ -25,8 +25,8 @@ class ARMInstrInfo : public ARMBaseInstrInfo {
public:
explicit ARMInstrInfo(const ARMSubtarget &STI);
- /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
- void getNoopForMachoTarget(MCInst &NopInst) const override;
+ /// Return the noop instruction to use for a noop.
+ void getNoop(MCInst &NopInst) const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
// if there is not such an opcode.
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
index c473939..7206083 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -16,7 +16,8 @@
//
// Type profiles.
-def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
def SDT_ARMStructByVal : SDTypeProfile<0, 4,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>,
@@ -51,6 +52,8 @@ def SDT_ARMAnd : SDTypeProfile<1, 2,
SDTCisVT<2, i32>]>;
def SDT_ARMCmp : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ARMFCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
def SDT_ARMPICAdd : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
@@ -90,6 +93,18 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
+def SDT_LongMac : SDTypeProfile<2, 4, [SDTCisVT<0, i32>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisSameAs<0, 4>,
+ SDTCisSameAs<0, 5>]>;
+
+def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>;
+def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>;
+def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>;
+def ARMSmlsldx : SDNode<"ARMISD::SMLSLDX", SDT_LongMac>;
+
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
@@ -181,6 +196,13 @@ def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
SDNPMayStore, SDNPMayLoad]>;
+def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>;
+def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>;
+def ARMsmlalbb : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>;
+def ARMsmlalbt : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
+def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
+def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
+
//===----------------------------------------------------------------------===//
// ARM Instruction Predicate Definitions.
//
@@ -243,13 +265,10 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float conversions">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16","full half-float">;
-def HasDivide : Predicate<"Subtarget->hasDivide()">,
- AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
+def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
+ AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
-def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
- AssemblerPredicate<"FeatureT2XtPk",
- "pack/extract">;
def HasDSP : Predicate<"Subtarget->hasDSP()">,
AssemblerPredicate<"FeatureDSP", "dsp">;
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
@@ -298,9 +317,16 @@ def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
+def UseNegativeImmediates :
+ Predicate<"false">,
+ AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ "NegativeImmediates">;
+
// FIXME: Eventually this will be just "hasV6T2Ops".
-def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
-def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
+let RecomputePerFunction = 1 in {
+ def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
+ def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
+}
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
def UseMulOps : Predicate<"Subtarget->useMulOps()">;
@@ -327,8 +353,10 @@ def UseVMOVSR : Predicate<"Subtarget->preferVMOVSR() ||"
def DontUseVMOVSR : Predicate<"!Subtarget->preferVMOVSR() &&"
"Subtarget->useNEONForSinglePrecisionFP()">;
-def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
-def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
+let RecomputePerFunction = 1 in {
+ def IsLE : Predicate<"MF->getDataLayout().isLittleEndian()">;
+ def IsBE : Predicate<"MF->getDataLayout().isBigEndian()">;
+}
def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
@@ -423,7 +451,16 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
//
// Immediate operands with a shared generic asm render method.
-class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; }
+class ImmAsmOperand<int Low, int High> : AsmOperandClass {
+ let RenderMethod = "addImmOperands";
+ let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
+ let DiagnosticType = "ImmRange" # Low # "_" # High;
+}
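+// For example, ImmAsmOperand<0,31> yields PredicateMethod "isImmediate<0,31>"
+// and DiagnosticType "ImmRange0_31".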
+
+class ImmAsmOperandMinusOne<int Low, int High> : AsmOperandClass {
+ let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
+ let DiagnosticType = "ImmRange" # Low # "_" # High;
+}
// Operands that are part of a memory addressing mode.
class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; }
@@ -645,35 +682,45 @@ def arm_i32imm : PatLeaf<(imm), [{
}]>;
/// imm0_1 predicate - Immediate in the range [0,1].
-def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; }
def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
/// imm0_3 predicate - Immediate in the range [0,3].
-def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def Imm0_3AsmOperand: ImmAsmOperand<0,3> { let Name = "Imm0_3"; }
def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
/// imm0_7 predicate - Immediate in the range [0,7].
-def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def Imm0_7AsmOperand: ImmAsmOperand<0,7> {
+ let Name = "Imm0_7";
+}
def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 8;
}]> {
let ParserMatchClass = Imm0_7AsmOperand;
}
+/// imm8_255 predicate - Immediate in the range [8,255].
+def Imm8_255AsmOperand: ImmAsmOperand<8,255> { let Name = "Imm8_255"; }
+def imm8_255 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 8 && Imm < 256;
+}]> {
+ let ParserMatchClass = Imm8_255AsmOperand;
+}
+
/// imm8 predicate - Immediate is exactly 8.
-def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def Imm8AsmOperand: ImmAsmOperand<8,8> { let Name = "Imm8"; }
def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
let ParserMatchClass = Imm8AsmOperand;
}
/// imm16 predicate - Immediate is exactly 16.
-def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def Imm16AsmOperand: ImmAsmOperand<16,16> { let Name = "Imm16"; }
def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
let ParserMatchClass = Imm16AsmOperand;
}
/// imm32 predicate - Immediate is exactly 32.
-def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def Imm32AsmOperand: ImmAsmOperand<32,32> { let Name = "Imm32"; }
def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
let ParserMatchClass = Imm32AsmOperand;
}
@@ -681,25 +728,25 @@ def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
/// imm1_7 predicate - Immediate in the range [1,7].
-def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def Imm1_7AsmOperand: ImmAsmOperand<1,7> { let Name = "Imm1_7"; }
def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
let ParserMatchClass = Imm1_7AsmOperand;
}
/// imm1_15 predicate - Immediate in the range [1,15].
-def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def Imm1_15AsmOperand: ImmAsmOperand<1,15> { let Name = "Imm1_15"; }
def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
let ParserMatchClass = Imm1_15AsmOperand;
}
/// imm1_31 predicate - Immediate in the range [1,31].
-def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def Imm1_31AsmOperand: ImmAsmOperand<1,31> { let Name = "Imm1_31"; }
def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
let ParserMatchClass = Imm1_31AsmOperand;
}
/// imm0_15 predicate - Immediate in the range [0,15].
-def Imm0_15AsmOperand: ImmAsmOperand {
+def Imm0_15AsmOperand: ImmAsmOperand<0,15> {
let Name = "Imm0_15";
let DiagnosticType = "ImmRange0_15";
}
@@ -710,7 +757,7 @@ def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
}
/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
-def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def Imm0_31AsmOperand: ImmAsmOperand<0,31> { let Name = "Imm0_31"; }
def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 32;
}]> {
@@ -718,15 +765,15 @@ def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
}
/// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
-def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def Imm0_32AsmOperand: ImmAsmOperand<0,32> { let Name = "Imm0_32"; }
def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
- return Imm >= 0 && Imm < 32;
+ return Imm >= 0 && Imm < 33;
}]> {
let ParserMatchClass = Imm0_32AsmOperand;
}
/// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
-def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def Imm0_63AsmOperand: ImmAsmOperand<0,63> { let Name = "Imm0_63"; }
def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 64;
}]> {
@@ -734,7 +781,7 @@ def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
}
/// imm0_239 predicate - Immediate in the range [0,239].
-def Imm0_239AsmOperand : ImmAsmOperand {
+def Imm0_239AsmOperand : ImmAsmOperand<0,239> {
let Name = "Imm0_239";
let DiagnosticType = "ImmRange0_239";
}
@@ -743,13 +790,13 @@ def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
}
/// imm0_255 predicate - Immediate in the range [0,255].
-def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def Imm0_255AsmOperand : ImmAsmOperand<0,255> { let Name = "Imm0_255"; }
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
let ParserMatchClass = Imm0_255AsmOperand;
}
-/// imm0_65535 - An immediate is in the range [0.65535].
-def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+/// imm0_65535 - An immediate is in the range [0,65535].
+def Imm0_65535AsmOperand: ImmAsmOperand<0,65535> { let Name = "Imm0_65535"; }
def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 65536;
}]> {
@@ -767,19 +814,23 @@ def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
// FIXME: This really needs a Thumb version separate from the ARM version.
// While the range is the same (so it can share the match class),
// the encoding is different so it should have a different encoder method.
-def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def Imm0_65535ExprAsmOperand: AsmOperandClass {
+ let Name = "Imm0_65535Expr";
+ let RenderMethod = "addImmOperands";
+}
+
def imm0_65535_expr : Operand<i32> {
let EncoderMethod = "getHiLo16ImmOpValue";
let ParserMatchClass = Imm0_65535ExprAsmOperand;
}
-def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def Imm256_65535ExprAsmOperand: ImmAsmOperand<256,65535> { let Name = "Imm256_65535Expr"; }
def imm256_65535_expr : Operand<i32> {
let ParserMatchClass = Imm256_65535ExprAsmOperand;
}
/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
-def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def Imm24bitAsmOperand: ImmAsmOperand<0,0xffffff> { let Name = "Imm24bit"; }
def imm24b : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm <= 0xffffff;
}]> {
@@ -808,7 +859,9 @@ def imm1_32_XFORM: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
MVT::i32);
}]>;
-def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def Imm1_32AsmOperand: ImmAsmOperandMinusOne<1,32> {
+ let Name = "Imm1_32";
+}
def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
uint64_t Imm = N->getZExtValue();
return Imm > 0 && Imm <= 32;
@@ -822,8 +875,10 @@ def imm1_16_XFORM: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
MVT::i32);
}]>;
-def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
-def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
+def Imm1_16AsmOperand: ImmAsmOperandMinusOne<1,16> { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, ImmLeaf<i32, [{
+ return Imm > 0 && Imm <= 16;
+ }],
imm1_16_XFORM> {
let PrintMethod = "printImmPlusOneOperand";
let ParserMatchClass = Imm1_16AsmOperand;
@@ -1914,8 +1969,8 @@ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
[(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKDOWN :
-PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
- [(ARMcallseq_start timm:$amt)]>;
+PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary,
+ [(ARMcallseq_start timm:$amt, timm:$amt2)]>;
}
def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
@@ -1936,7 +1991,9 @@ def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
def : InstAlias<"esb$p", (HINT 16, pred:$p)>, Requires<[IsARM, HasRAS]>;
def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
- "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
+ "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (int_arm_sel GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV6]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
@@ -3425,8 +3482,12 @@ def : ARMV6Pat<(add rGPR:$Rn, (sext_inreg (srl rGPR:$Rm, imm8_or_16:$rot),
(SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">;
+def : ARMV6Pat<(int_arm_sxtb16 GPR:$Src),
+ (SXTB16 GPR:$Src, 0)>;
def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
+def : ARMV6Pat<(int_arm_sxtab16 GPR:$LHS, GPR:$RHS),
+ (SXTAB16 GPR:$LHS, GPR:$RHS, 0)>;
// Zero extenders
@@ -3446,6 +3507,8 @@ def UXTB16 : AI_ext_rrot<0b01101100,
// (UXTB16r_rot GPR:$Src, 3)>;
def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
(UXTB16 GPR:$Src, 1)>;
+def : ARMV6Pat<(int_arm_uxtb16 GPR:$Src),
+ (UXTB16 GPR:$Src, 0)>;
def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
@@ -3460,6 +3523,8 @@ def : ARMV6Pat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot), 0xFFFF)),
// This isn't safe in general: the add is two 16-bit units, not a 32-bit add.
def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
+def : ARMV6Pat<(int_arm_uxtab16 GPR:$LHS, GPR:$RHS),
+ (UXTAB16 GPR:$LHS, GPR:$RHS, 0)>;
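// [Editor's note -- illustration, not part of the diff.] Per the comment
// above, a generic (add ...) pattern would be unsafe: uxtab16 performs
// two independent 16-bit adds, so a carry must not propagate across
// bit 15 the way it would in a 32-bit add. Matching only the
// int_arm_uxtab16 intrinsic, whose semantics are exactly the packed
// add, avoids miscompiling a genuine 32-bit add.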
def SBFX : I<(outs GPRnopc:$Rd),
@@ -3586,71 +3651,85 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
let Unpredictable{11-8} = 0b1111;
}
-// Saturating add/subtract
+// Wrappers around the AAI class
+class AAIRevOpr<bits<8> op27_20, bits<8> op11_4, string opc,
+ list<dag> pattern = []>
+ : AAI<op27_20, op11_4, opc,
+ pattern,
+ (ins GPRnopc:$Rm, GPRnopc:$Rn),
+ "\t$Rd, $Rm, $Rn">;
+
+class AAIIntrinsic<bits<8> op27_20, bits<8> op11_4, string opc,
+ Intrinsic intrinsic>
+ : AAI<op27_20, op11_4, opc,
+ [(set GPRnopc:$Rd, (intrinsic GPRnopc:$Rn, GPRnopc:$Rm))]>;
+// Saturating add/subtract
+let hasSideEffects = 1 in {
+def QADD8 : AAIIntrinsic<0b01100010, 0b11111001, "qadd8", int_arm_qadd8>;
+def QADD16 : AAIIntrinsic<0b01100010, 0b11110001, "qadd16", int_arm_qadd16>;
+def QSUB16 : AAIIntrinsic<0b01100010, 0b11110111, "qsub16", int_arm_qsub16>;
+def QSUB8 : AAIIntrinsic<0b01100010, 0b11111111, "qsub8", int_arm_qsub8>;
+
+def QDADD : AAIRevOpr<0b00010100, 0b00000101, "qdadd",
+ [(set GPRnopc:$Rd, (int_arm_qadd (int_arm_qadd GPRnopc:$Rm,
+ GPRnopc:$Rm),
+ GPRnopc:$Rn))]>;
+def QDSUB : AAIRevOpr<0b00010110, 0b00000101, "qdsub",
+ [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm,
+ (int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>;
+def QSUB : AAIRevOpr<0b00010010, 0b00000101, "qsub",
+ [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))]>;
let DecoderMethod = "DecodeQADDInstruction" in
-def QADD : AAI<0b00010000, 0b00000101, "qadd",
- [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
- (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
-
-def QSUB : AAI<0b00010010, 0b00000101, "qsub",
- [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
- (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
-def QDADD : AAI<0b00010100, 0b00000101, "qdadd", [],
- (ins GPRnopc:$Rm, GPRnopc:$Rn),
- "\t$Rd, $Rm, $Rn">;
-def QDSUB : AAI<0b00010110, 0b00000101, "qdsub", [],
- (ins GPRnopc:$Rm, GPRnopc:$Rn),
- "\t$Rd, $Rm, $Rn">;
-
-def QADD16 : AAI<0b01100010, 0b11110001, "qadd16">;
-def QADD8 : AAI<0b01100010, 0b11111001, "qadd8">;
-def QASX : AAI<0b01100010, 0b11110011, "qasx">;
-def QSAX : AAI<0b01100010, 0b11110101, "qsax">;
-def QSUB16 : AAI<0b01100010, 0b11110111, "qsub16">;
-def QSUB8 : AAI<0b01100010, 0b11111111, "qsub8">;
-def UQADD16 : AAI<0b01100110, 0b11110001, "uqadd16">;
-def UQADD8 : AAI<0b01100110, 0b11111001, "uqadd8">;
-def UQASX : AAI<0b01100110, 0b11110011, "uqasx">;
-def UQSAX : AAI<0b01100110, 0b11110101, "uqsax">;
-def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
-def UQSUB8 : AAI<0b01100110, 0b11111111, "uqsub8">;
+ def QADD : AAIRevOpr<0b00010000, 0b00000101, "qadd",
+ [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))]>;
+}
+
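// [Editor's note -- illustration, not part of the diff.] QDADD/QDSUB
// have no dedicated intrinsic; the patterns above synthesise the
// doubling by nesting the saturating intrinsics, mirroring the dags
// exactly:
//
//   QDADD: $Rd = int_arm_qadd(int_arm_qadd($Rm, $Rm), $Rn)
//   QDSUB: $Rd = int_arm_qsub($Rm, int_arm_qadd($Rn, $Rn))
//
// hasSideEffects = 1 on this group presumably models the sticky Q
// (saturation) flag these instructions may set.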
+def UQADD16 : AAIIntrinsic<0b01100110, 0b11110001, "uqadd16", int_arm_uqadd16>;
+def UQADD8 : AAIIntrinsic<0b01100110, 0b11111001, "uqadd8", int_arm_uqadd8>;
+def UQSUB16 : AAIIntrinsic<0b01100110, 0b11110111, "uqsub16", int_arm_uqsub16>;
+def UQSUB8 : AAIIntrinsic<0b01100110, 0b11111111, "uqsub8", int_arm_uqsub8>;
+def QASX : AAIIntrinsic<0b01100010, 0b11110011, "qasx", int_arm_qasx>;
+def QSAX : AAIIntrinsic<0b01100010, 0b11110101, "qsax", int_arm_qsax>;
+def UQASX : AAIIntrinsic<0b01100110, 0b11110011, "uqasx", int_arm_uqasx>;
+def UQSAX : AAIIntrinsic<0b01100110, 0b11110101, "uqsax", int_arm_uqsax>;
// Signed/Unsigned add/subtract
-def SASX : AAI<0b01100001, 0b11110011, "sasx">;
-def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
-def SADD8 : AAI<0b01100001, 0b11111001, "sadd8">;
-def SSAX : AAI<0b01100001, 0b11110101, "ssax">;
-def SSUB16 : AAI<0b01100001, 0b11110111, "ssub16">;
-def SSUB8 : AAI<0b01100001, 0b11111111, "ssub8">;
-def UASX : AAI<0b01100101, 0b11110011, "uasx">;
-def UADD16 : AAI<0b01100101, 0b11110001, "uadd16">;
-def UADD8 : AAI<0b01100101, 0b11111001, "uadd8">;
-def USAX : AAI<0b01100101, 0b11110101, "usax">;
-def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
-def USUB8 : AAI<0b01100101, 0b11111111, "usub8">;
+def SASX : AAIIntrinsic<0b01100001, 0b11110011, "sasx", int_arm_sasx>;
+def SADD16 : AAIIntrinsic<0b01100001, 0b11110001, "sadd16", int_arm_sadd16>;
+def SADD8 : AAIIntrinsic<0b01100001, 0b11111001, "sadd8", int_arm_sadd8>;
+def SSAX : AAIIntrinsic<0b01100001, 0b11110101, "ssax", int_arm_ssax>;
+def SSUB16 : AAIIntrinsic<0b01100001, 0b11110111, "ssub16", int_arm_ssub16>;
+def SSUB8 : AAIIntrinsic<0b01100001, 0b11111111, "ssub8", int_arm_ssub8>;
+def UASX : AAIIntrinsic<0b01100101, 0b11110011, "uasx", int_arm_uasx>;
+def UADD16 : AAIIntrinsic<0b01100101, 0b11110001, "uadd16", int_arm_uadd16>;
+def UADD8 : AAIIntrinsic<0b01100101, 0b11111001, "uadd8", int_arm_uadd8>;
+def USAX : AAIIntrinsic<0b01100101, 0b11110101, "usax", int_arm_usax>;
+def USUB16 : AAIIntrinsic<0b01100101, 0b11110111, "usub16", int_arm_usub16>;
+def USUB8 : AAIIntrinsic<0b01100101, 0b11111111, "usub8", int_arm_usub8>;
// Signed/Unsigned halving add/subtract
-def SHASX : AAI<0b01100011, 0b11110011, "shasx">;
-def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
-def SHADD8 : AAI<0b01100011, 0b11111001, "shadd8">;
-def SHSAX : AAI<0b01100011, 0b11110101, "shsax">;
-def SHSUB16 : AAI<0b01100011, 0b11110111, "shsub16">;
-def SHSUB8 : AAI<0b01100011, 0b11111111, "shsub8">;
-def UHASX : AAI<0b01100111, 0b11110011, "uhasx">;
-def UHADD16 : AAI<0b01100111, 0b11110001, "uhadd16">;
-def UHADD8 : AAI<0b01100111, 0b11111001, "uhadd8">;
-def UHSAX : AAI<0b01100111, 0b11110101, "uhsax">;
-def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
-def UHSUB8 : AAI<0b01100111, 0b11111111, "uhsub8">;
+def SHASX : AAIIntrinsic<0b01100011, 0b11110011, "shasx", int_arm_shasx>;
+def SHADD16 : AAIIntrinsic<0b01100011, 0b11110001, "shadd16", int_arm_shadd16>;
+def SHADD8 : AAIIntrinsic<0b01100011, 0b11111001, "shadd8", int_arm_shadd8>;
+def SHSAX : AAIIntrinsic<0b01100011, 0b11110101, "shsax", int_arm_shsax>;
+def SHSUB16 : AAIIntrinsic<0b01100011, 0b11110111, "shsub16", int_arm_shsub16>;
+def SHSUB8 : AAIIntrinsic<0b01100011, 0b11111111, "shsub8", int_arm_shsub8>;
+def UHASX : AAIIntrinsic<0b01100111, 0b11110011, "uhasx", int_arm_uhasx>;
+def UHADD16 : AAIIntrinsic<0b01100111, 0b11110001, "uhadd16", int_arm_uhadd16>;
+def UHADD8 : AAIIntrinsic<0b01100111, 0b11111001, "uhadd8", int_arm_uhadd8>;
+def UHSAX : AAIIntrinsic<0b01100111, 0b11110101, "uhsax", int_arm_uhsax>;
+def UHSUB16 : AAIIntrinsic<0b01100111, 0b11110111, "uhsub16", int_arm_uhsub16>;
+def UHSUB8 : AAIIntrinsic<0b01100111, 0b11111111, "uhsub8", int_arm_uhsub8>;
// Unsigned Sum of Absolute Differences [and Accumulate].
def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
MulFrm /* for convenience */, NoItinerary, "usad8",
- "\t$Rd, $Rn, $Rm", []>,
+ "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (int_arm_usad8 GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> {
bits<4> Rd;
bits<4> Rn;
@@ -3664,7 +3743,8 @@ def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
}
def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
MulFrm /* for convenience */, NoItinerary, "usada8",
- "\t$Rd, $Rn, $Rm, $Ra", []>,
+ "\t$Rd, $Rn, $Rm, $Ra",
+ [(set GPR:$Rd, (int_arm_usada8 GPR:$Rn, GPR:$Rm, GPR:$Ra))]>,
Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{
bits<4> Rd;
bits<4> Rn;
@@ -3679,7 +3759,6 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
}
// Signed/Unsigned saturate
-
def SSAT : AI<(outs GPRnopc:$Rd),
(ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
@@ -3748,6 +3827,10 @@ def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos),
(USAT imm0_31:$pos, GPRnopc:$a, 0)>;
def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
(SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos),
+ (SSAT16 imm1_16:$pos, GPRnopc:$a)>;
+def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos),
+ (USAT16 imm0_15:$pos, GPRnopc:$a)>;
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
@@ -3850,6 +3933,7 @@ def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
let Inst{11-0} = imm;
}
+let AddedComplexity = 1 in
def : ARMPat<(and GPR:$src, mod_imm_not:$imm),
(BICri GPR:$src, mod_imm_not:$imm)>;
@@ -3899,7 +3983,8 @@ def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm),
IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b0000;
let Unpredictable{15-12} = 0b1111;
}
@@ -3910,14 +3995,16 @@ def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
4, IIC_iMUL32,
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
(MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6, UseMulOps]>;
+ Requires<[IsARM, NoV6, UseMulOps]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
}
def MLA : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
- Requires<[IsARM, HasV6, UseMulOps]> {
+ Requires<[IsARM, HasV6, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
bits<4> Ra;
let Inst{15-12} = Ra;
}
@@ -3928,12 +4015,14 @@ def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
pred:$p, cc_out:$s), 4, IIC_iMAC32,
[(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
(MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
- Requires<[IsARM, HasV6T2, UseMulOps]> {
+ Requires<[IsARM, HasV6T2, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
@@ -3949,26 +4038,38 @@ let hasSideEffects = 0 in {
let isCommutable = 1 in {
def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
- "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
+ "smull", "\t$RdLo, $RdHi, $Rn, $Rm",
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (smullohi GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
- "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
+ "umull", "\t$RdLo, $RdHi, $Rn, $Rm",
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (umullohi GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL]>;
let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
- 4, IIC_iMUL64, [],
+ 4, IIC_iMUL64,
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (smullohi GPR:$Rn, GPR:$Rm))],
(SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
- 4, IIC_iMUL64, [],
+ 4, IIC_iMUL64,
+ [(set GPR:$RdLo, GPR:$RdHi,
+ (umullohi GPR:$Rn, GPR:$Rm))],
(UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
}
}
@@ -3976,17 +4077,20 @@ def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
IIC_iMAC64,
"umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> {
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rm;
@@ -4004,13 +4108,15 @@ def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
4, IIC_iMAC64, [],
(SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
(UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
}
} // hasSideEffects
@@ -4019,13 +4125,15 @@ def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
}
def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{15-12} = 0b1111;
}
@@ -4033,57 +4141,67 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
- Requires<[IsARM, HasV6, UseMulOps]>;
+ Requires<[IsARM, HasV6, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6, UseMulOps]>;
+ Requires<[IsARM, HasV6, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
multiclass AI_smul<string opc> {
def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sext_inreg GPR:$Rm, i16)))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
(sra GPR:$Rm, (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sext_inreg GPR:$Rm, i16)))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
(sra GPR:$Rm, (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
- []>,
- Requires<[IsARM, HasV5TE]>;
+ [(set GPR:$Rd, (ARMsmulwb GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
- []>,
- Requires<[IsARM, HasV5TE]>;
+ [(set GPR:$Rd, (ARMsmulwt GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
}
@@ -4095,7 +4213,8 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd, (add GPR:$Ra,
(mul (sext_inreg GPRnopc:$Rn, i16),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4103,7 +4222,8 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4111,7 +4231,8 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4119,19 +4240,24 @@ multiclass AI_smla<string opc> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
- []>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (ARMsmulwb GPRnopc:$Rn, GPRnopc:$Rm)))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
- []>,
- Requires<[IsARM, HasV5TE, UseMulOps]>;
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (ARMsmulwt GPRnopc:$Rn, GPRnopc:$Rm)))]>,
+ Requires<[IsARM, HasV5TE, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
}
}
@@ -4139,30 +4265,34 @@ defm SMUL : AI_smul<"smul">;
defm SMLA : AI_smla<"smla">;
// Halfword multiply accumulate long: SMLAL<x><y>.
-def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
-
-def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
-
-def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
-
-def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm),
- IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV5TE]>;
+class SMLAL<bits<2> opc1, string asm>
+ : AMulxyI64<0b0001010, opc1,
+ (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+ Requires<[IsARM, HasV5TE]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
+
+def SMLALBB : SMLAL<0b00, "smlalbb">;
+def SMLALBT : SMLAL<0b10, "smlalbt">;
+def SMLALTB : SMLAL<0b01, "smlaltb">;
+def SMLALTT : SMLAL<0b11, "smlaltt">;
+
+def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALBB $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALBT $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALTB $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (SMLALTT $Rn, $Rm, $RLo, $RHi)>;
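// [Editor's note -- illustration, not part of the diff.] SMLAL<x><y>
// accumulates into the 64-bit RdHi:RdLo pair, so the refactored class
// now lists $RLo/$RHi as inputs and ties them to the outputs via
// RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, telling the register
// allocator the accumulator is read and written in place. The
// ARMV5TEPat lines then map the ARMsmlalbb..ARMsmlaltt target nodes
// (declared near the top of this patch) onto the instructions.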
// Helper class for AI_smld.
class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
InstrItinClass itin, string opc, string asm>
- : AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
+ : AI<oops, iops, MulFrm, itin, opc, asm, []>,
+ Requires<[IsARM, HasV6]> {
bits<4> Rn;
bits<4> Rm;
let Inst{27-23} = 0b01110;
@@ -4203,48 +4333,85 @@ multiclass AI_smld<bit sub, string opc> {
def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
- NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
+ NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
- NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
+ NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
- !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ NoItinerary,
+ !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
- (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
- !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
-
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ NoItinerary,
+ !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
}
defm SMLA : AI_smld<0, "smla">;
defm SMLS : AI_smld<1, "smls">;
+def : ARMV6Pat<(int_arm_smlad GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ (SMLAD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
+def : ARMV6Pat<(int_arm_smladx GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ (SMLADX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
+def : ARMV6Pat<(int_arm_smlsd GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ (SMLSD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
+def : ARMV6Pat<(int_arm_smlsdx GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
+ (SMLSDX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra)>;
+def : ARMV6Pat<(ARMSmlald GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ (SMLALD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
+def : ARMV6Pat<(ARMSmlaldx GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ (SMLALDX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
+def : ARMV6Pat<(ARMSmlsld GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ (SMLSLD GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
+def : ARMV6Pat<(ARMSmlsldx GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+ (SMLSLDX GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi)>;
+
multiclass AI_sdml<bit sub, string opc> {
def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
- NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+ NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
- NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+ NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
}
defm SMUA : AI_sdml<0, "smua">;
defm SMUS : AI_sdml<1, "smus">;
+def : ARMV6Pat<(int_arm_smuad GPRnopc:$Rn, GPRnopc:$Rm),
+ (SMUAD GPRnopc:$Rn, GPRnopc:$Rm)>;
+def : ARMV6Pat<(int_arm_smuadx GPRnopc:$Rn, GPRnopc:$Rm),
+ (SMUADX GPRnopc:$Rn, GPRnopc:$Rm)>;
+def : ARMV6Pat<(int_arm_smusd GPRnopc:$Rn, GPRnopc:$Rm),
+ (SMUSD GPRnopc:$Rn, GPRnopc:$Rm)>;
+def : ARMV6Pat<(int_arm_smusdx GPRnopc:$Rn, GPRnopc:$Rm),
+ (SMUSDX GPRnopc:$Rn, GPRnopc:$Rm)>;
+
//===----------------------------------------------------------------------===//
// Division Instructions (ARMv7-A with virtualization extension)
//
def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM, HasDivideInARM]>;
+ Requires<[IsARM, HasDivideInARM]>,
+ Sched<[WriteDIV]>;
def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM, HasDivideInARM]>;
+ Requires<[IsARM, HasDivideInARM]>,
+ Sched<[WriteDIV]>;
//===----------------------------------------------------------------------===//
// Misc. Arithmetic Instructions.
@@ -4831,14 +4998,15 @@ let AddedComplexity = 8 in {
def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL GPR:$val, addr_offset_none:$addr)>;
}
-// SWP/SWPB are deprecated in V6/V7.
+// SWP/SWPB are deprecated in V6/V7 and optional in v7VE.
+// FIXME: Use InstAlias to generate LDREX/STREX pairs instead.
let mayLoad = 1, mayStore = 1 in {
def SWP : AIswp<0, (outs GPRnopc:$Rt),
(ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
def SWPB: AIswp<1, (outs GPRnopc:$Rt),
(ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
}
//===----------------------------------------------------------------------===//
@@ -4850,7 +5018,7 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
- Requires<[PreV8]> {
+ Requires<[IsARM,PreV8]> {
bits<4> opc1;
bits<4> CRn;
bits<4> CRd;
@@ -4872,7 +5040,7 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
[(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
- Requires<[PreV8]> {
+ Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
bits<4> opc1;
bits<4> CRn;
@@ -5048,13 +5216,13 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
defm LDC : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm LDCL : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
defm STC : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
defm STCL : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
-defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm STC2 : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
//===----------------------------------------------------------------------===//
// Move between coprocessor and ARM core register.
@@ -5132,7 +5300,7 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
c_imm:$CRm, imm0_7:$opc2),
[(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
imm:$CRm, imm:$opc2)]>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
(MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
c_imm:$CRm, 0)>;
@@ -5140,7 +5308,7 @@ def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
(outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
imm0_7:$opc2), []>,
- Requires<[PreV8]>;
+ Requires<[IsARM,PreV8]>;
def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
(MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0)>;
@@ -5183,7 +5351,7 @@ class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
list<dag> pattern = []>
: ABXI<0b1100, oops, iops, NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
- Requires<[PreV8]> {
+ Requires<[IsARM,PreV8]> {
let Inst{31-28} = 0b1111;
let Inst{23-21} = 0b010;
let Inst{20} = direction;
@@ -5525,20 +5693,52 @@ def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
// smul* and smla*
def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
- (SMULBB GPR:$a, GPR:$b)>;
+ (SMULBB GPR:$a, GPR:$b)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
- (SMULBT GPR:$a, GPR:$b)>;
+ (SMULBT GPR:$a, GPR:$b)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
- (SMULTB GPR:$a, GPR:$b)>;
+ (SMULTB GPR:$a, GPR:$b)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, sext_16_node:$b)),
- (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
- (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+ (SMLABT GPR:$a, GPR:$b, GPR:$acc)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
def : ARMV5MOPat<(add GPR:$acc,
(mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
+ (SMLATB GPR:$a, GPR:$b, GPR:$acc)>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
+
+def : ARMV5TEPat<(int_arm_smulbb GPR:$a, GPR:$b),
+ (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(int_arm_smulbt GPR:$a, GPR:$b),
+ (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(int_arm_smultb GPR:$a, GPR:$b),
+ (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(int_arm_smultt GPR:$a, GPR:$b),
+ (SMULTT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(int_arm_smulwb GPR:$a, GPR:$b),
+ (SMULWB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(int_arm_smulwt GPR:$a, GPR:$b),
+ (SMULWT GPR:$a, GPR:$b)>;
+
+def : ARMV5TEPat<(int_arm_smlabb GPR:$a, GPR:$b, GPR:$acc),
+ (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(int_arm_smlabt GPR:$a, GPR:$b, GPR:$acc),
+ (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(int_arm_smlatb GPR:$a, GPR:$b, GPR:$acc),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(int_arm_smlatt GPR:$a, GPR:$b, GPR:$acc),
+ (SMLATT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(int_arm_smlawb GPR:$a, GPR:$b, GPR:$acc),
+ (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(int_arm_smlawt GPR:$a, GPR:$b, GPR:$acc),
+ (SMLAWT GPR:$a, GPR:$b, GPR:$acc)>;
// Pre-v7 uses MCR for synchronization barriers.
def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
@@ -5717,33 +5917,49 @@ def : MnemonicAlias<"usubaddx", "usax">;
// "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like
// for isel.
-def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
+def : ARMInstSubst<"mov${s}${p} $Rd, $imm",
(MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
+def : ARMInstSubst<"mvn${s}${p} $Rd, $imm",
(MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
// Same for AND <--> BIC
-def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"bic${s}${p} $Rd, $Rn, $imm",
(ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
+def : ARMInstSubst<"bic${s}${p} $Rdn, $imm",
(ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"and${s}${p} $Rd, $Rn, $imm",
(BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
+def : ARMInstSubst<"and${s}${p} $Rdn, $imm",
(BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
pred:$p, cc_out:$s)>;
// Likewise, "add Rd, mod_imm_neg" -> sub
-def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"add${s}${p} $Rd, $Rn, $imm",
(SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"add${s}${p} $Rd, $imm",
+def : ARMInstSubst<"add${s}${p} $Rd, $imm",
(SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Likewise, "sub Rd, mod_imm_neg" -> add
+def : ARMInstSubst<"sub${s}${p} $Rd, $Rn, $imm",
+ (ADDri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sub${s}${p} $Rd, $imm",
+ (ADDri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+
+
+def : ARMInstSubst<"adc${s}${p} $Rd, $Rn, $imm",
+ (SBCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"adc${s}${p} $Rdn, $imm",
+ (SBCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sbc${s}${p} $Rd, $Rn, $imm",
+ (ADCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sbc${s}${p} $Rdn, $imm",
+ (ADCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+
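// [Editor's note -- illustration, not part of the diff.] The adc <-> sbc
// substitution is the usual two's-complement identity. On ARM, SBC
// computes Rn - Op2 - (1 - C); substituting Op2 = ~imm = -imm - 1 gives
//
//   Rn - (~imm) - (1 - C) = Rn + imm + C
//
// which is exactly ADC Rn, #imm. So "adc Rd, Rn, #imm" can be encoded
// as SBC with the bitwise-complemented modified immediate (and vice
// versa) whenever only one of the two forms can encode the constant.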
// Same for CMP <--> CMN via mod_imm_neg
-def : ARMInstAlias<"cmp${p} $Rd, $imm",
+def : ARMInstSubst<"cmp${p} $Rd, $imm",
(CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
-def : ARMInstAlias<"cmn${p} $Rd, $imm",
+def : ARMInstSubst<"cmn${p} $Rd, $imm",
(CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
// The shifter forms of the MOV instruction are aliased to the ASR, LSL,
@@ -5837,21 +6053,28 @@ def SPACE : PseudoInst<(outs GPR:$Rd), (ins i32imm:$size, GPR:$Rn),
// significantly more naive than the standard expansion: we conservatively
// assume seq_cst, strong cmpxchg and omit clrex on failure.
-let Constraints = "@earlyclobber $Rd,@earlyclobber $status",
+let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
mayLoad = 1, mayStore = 1 in {
-def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
(ins GPR:$addr, GPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
(ins GPR:$addr, GPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$status),
+def CMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
(ins GPR:$addr, GPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$status),
+def CMP_SWAP_64 : PseudoInst<(outs GPRPair:$Rd, GPR:$temp),
(ins GPR:$addr, GPRPair:$desired, GPRPair:$new),
NoItinerary, []>, Sched<[]>;
}
+
+def CompilerBarrier : PseudoInst<(outs), (ins i32imm:$ordering), NoItinerary,
+ [(atomic_fence imm:$ordering, 0)]> {
+ let hasSideEffects = 1;
+ let Size = 0;
+ let AsmString = "@ COMPILER BARRIER";
+}
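// [Editor's note -- illustration, not part of the diff.] The second
// atomic_fence operand (0) restricts the pattern to single-thread
// fences, e.g. IR of roughly the form
//   fence syncscope("singlethread") seq_cst
// Size = 0 plus the "@ COMPILER BARRIER" AsmString make this a pure
// scheduling barrier: hasSideEffects blocks compiler reordering, but
// no machine instruction is emitted, only an assembly comment.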
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
index b5fa8e9..858136a 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -587,6 +587,14 @@ def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
+ SDTCisVT<2, v8i8>]>;
+def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
+ SDTCisVT<2, v8i8>, SDTCisVT<3, v8i8>]>;
+def NEONvtbl1 : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>;
+def NEONvtbl2 : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>;
+
+
def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{
ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
unsigned EltBits = 0;
@@ -666,7 +674,7 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd),
(ins AddrMode:$Rn), IIC_VLD1,
- "vld1", Dt, "$Vd, $Rn", "", []> {
+ "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -674,7 +682,7 @@ class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
(ins AddrMode:$Rn), IIC_VLD1x2,
- "vld1", Dt, "$Vd, $Rn", "", []> {
+ "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -695,7 +703,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -703,7 +711,7 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -712,7 +720,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -720,7 +728,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -739,7 +747,7 @@ defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd),
(ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt,
- "$Vd, $Rn", "", []> {
+ "$Vd, $Rn", "", []>, Sched<[WriteVLD3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -748,7 +756,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -756,7 +764,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -772,15 +780,15 @@ defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
-def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>;
-def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>;
-def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>;
+def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
+def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>, Sched<[WriteVLD3]>;
// ...with 4 registers
class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
(ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt,
- "$Vd, $Rn", "", []> {
+ "$Vd, $Rn", "", []>, Sched<[WriteVLD4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -789,7 +797,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -797,7 +805,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
"vld1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -813,9 +821,9 @@ defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
-def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>;
-def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>;
-def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>;
+def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
+def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>, Sched<[WriteVLD4]>;
// VLD2 : Vector Load (multiple 2-element structures)
class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -829,22 +837,22 @@ class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
}
def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
-def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>;
-def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
-def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
+def VLD2q8Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>;
+def VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>;
+def VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>, Sched<[WriteVLD4]>;
// ...with address register writeback:
multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
@@ -867,45 +875,45 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
}
defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVLD4]>;
-def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
-def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
-def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
-def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
-def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
-def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
+def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
+def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>, Sched<[WriteVLD4]>;
// ...with double-spaced registers
def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVLD2]>;
// VLD3 : Vector Load (multiple 3-element structures)
class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
(ins addrmode6:$Rn), IIC_VLD3,
- "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
+ "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []>, Sched<[WriteVLD3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
@@ -915,9 +923,9 @@ def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">;
def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">;
def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">;
-def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>;
-def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>;
-def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>;
+def VLD3d8Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3d16Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3d32Pseudo : VLDQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
// ...with address register writeback:
class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -925,7 +933,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u,
"vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
}
@@ -934,9 +942,9 @@ def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">;
def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">;
def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">;
-def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
-def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
-def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>;
+def VLD3d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
// ...with double-spaced registers:
def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">;
@@ -946,25 +954,26 @@ def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">;
def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">;
def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">;
-def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
// ...alternate versions to be allocated odd register numbers:
-def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
-def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
-def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>;
+def VLD3q8oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3q16oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
+def VLD3q32oddPseudo : VLDQQQQPseudo<IIC_VLD3>, Sched<[WriteVLD3]>;
-def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
-def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>;
+def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
+def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD3u>, Sched<[WriteVLD3]>;
// VLD4 : Vector Load (multiple 4-element structures)
class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b10, op11_8, op7_4,
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4),
(ins addrmode6:$Rn), IIC_VLD4,
- "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
+ "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []>,
+ Sched<[WriteVLD4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
@@ -974,9 +983,9 @@ def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">;
def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">;
def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">;
-def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>;
-def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>;
-def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>;
+def VLD4d8Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4d16Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4d32Pseudo : VLDQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
// ...with address register writeback:
class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -984,7 +993,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u,
"vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
}
@@ -993,9 +1002,9 @@ def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">;
def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">;
def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">;
-def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
-def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
-def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>;
+def VLD4d8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4d16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4d32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
// ...with double-spaced registers:
def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">;
@@ -1005,18 +1014,18 @@ def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">;
def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">;
def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">;
-def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
// ...alternate versions to be allocated odd register numbers:
-def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
-def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
-def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>;
+def VLD4q8oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4q16oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
+def VLD4q32oddPseudo : VLDQQQQPseudo<IIC_VLD4>, Sched<[WriteVLD4]>;
-def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
-def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>;
+def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
+def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo<IIC_VLD4u>, Sched<[WriteVLD4]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1068,11 +1077,12 @@ class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
"$src = $Vd",
[(set DPR:$Vd, (vector_insert (Ty DPR:$src),
(i32 (LoadOp addrmode6oneL32:$Rn)),
- imm:$lane))]> {
+ imm:$lane))]>, Sched<[WriteVLD1]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVLD1LN";
}
-class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
+class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln>,
+ Sched<[WriteVLD1]> {
let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
(i32 (LoadOp addrmode6:$addr)),
imm:$lane))];
@@ -1109,7 +1119,7 @@ class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm,
DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
"\\{$Vd[$lane]\\}, $Rn$Rm",
- "$src = $Vd, $Rn.addr = $wb", []> {
+ "$src = $Vd, $Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let DecoderMethod = "DecodeVLD1LN";
}
@@ -1126,16 +1136,16 @@ def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> {
let Inst{4} = Rn{4};
}
-def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
-def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
-def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>;
+def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>;
+def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>;
+def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD1lnu>, Sched<[WriteVLD1]>;
// VLD2LN : Vector Load (single 2-element structure to one lane)
class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2),
(ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane),
IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn",
- "$src1 = $Vd, $src2 = $dst2", []> {
+ "$src1 = $Vd, $src2 = $dst2", []>, Sched<[WriteVLD1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD2LN";
@@ -1151,9 +1161,9 @@ def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
-def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
-def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>;
+def VLD2LNd8Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
+def VLD2LNd16Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
+def VLD2LNd32Pseudo : VLDQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
// ...with double-spaced registers:
def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> {
@@ -1163,8 +1173,8 @@ def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
-def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>;
+def VLD2LNq16Pseudo : VLDQQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
+def VLD2LNq32Pseudo : VLDQQLNPseudo<IIC_VLD2ln>, Sched<[WriteVLD1]>;
// ...with address register writeback:
class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1187,9 +1197,9 @@ def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
-def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
-def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
+def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
+def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -1198,8 +1208,8 @@ def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
-def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>;
+def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
+def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD2lnu>, Sched<[WriteVLD1]>;
// VLD3LN : Vector Load (single 3-element structure to one lane)
class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1207,7 +1217,7 @@ class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3,
nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt,
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
- "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
+ "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []>, Sched<[WriteVLD2]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVLD3LN";
}
@@ -1222,9 +1232,9 @@ def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
-def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
-def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNd8Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
+def VLD3LNd16Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
+def VLD3LNd32Pseudo : VLDQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
// ...with double-spaced registers:
def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> {
@@ -1234,8 +1244,8 @@ def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
-def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>;
+def VLD3LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
+def VLD3LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD3ln>, Sched<[WriteVLD2]>;
// ...with address register writeback:
class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1246,7 +1256,7 @@ class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
IIC_VLD3lnu, "vld3", Dt,
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
- []> {
+ []>, Sched<[WriteVLD2]> {
let DecoderMethod = "DecodeVLD3LN";
}
@@ -1260,9 +1270,9 @@ def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
-def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
-def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
+def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
+def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -1271,8 +1281,8 @@ def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
-def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>;
+def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
+def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD3lnu>, Sched<[WriteVLD2]>;
// VLD4LN : Vector Load (single 4-element structure to one lane)
class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1281,7 +1291,8 @@ class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt,
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn",
- "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
+ "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>,
+ Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD4LN";
@@ -1298,9 +1309,9 @@ def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
-def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
-def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNd8Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
+def VLD4LNd16Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
+def VLD4LNd32Pseudo : VLDQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
// ...with double-spaced registers:
def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> {
@@ -1311,8 +1322,8 @@ def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
-def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>;
+def VLD4LNq16Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
+def VLD4LNq32Pseudo : VLDQQQQLNPseudo<IIC_VLD4ln>, Sched<[WriteVLD2]>;
// ...with address register writeback:
class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1339,9 +1350,9 @@ def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
-def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
-def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
+def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
+def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -1351,8 +1362,8 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
-def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
+def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
+def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>, Sched<[WriteVLD2]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1363,7 +1374,8 @@ class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
(ins AddrMode:$Rn),
IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
[(set VecListOneDAllLanes:$Vd,
- (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+ (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>,
+ Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD1DupInstruction";
@@ -1426,7 +1438,7 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(outs VecListDPairAllLanes:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD1dupu,
"vld1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD1DupInstruction";
@@ -1483,7 +1495,7 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
(outs VdTy:$Vd, GPR:$wb),
(ins AddrMode:$Rn), IIC_VLD2dupu,
"vld2", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD2DupInstruction";
@@ -1492,7 +1504,7 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
(outs VdTy:$Vd, GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu,
"vld2", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD2DupInstruction";
}
@@ -1516,7 +1528,8 @@ defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
class VLD3DUP<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3),
(ins addrmode6dup:$Rn), IIC_VLD3dup,
- "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
+ "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []>,
+ Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = 0;
let DecoderMethod = "DecodeVLD3DupInstruction";
@@ -1526,9 +1539,9 @@ def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">;
def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">;
def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">;
-def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>;
-def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
-def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
+def VLD3DUPd8Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
+def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>, Sched<[WriteVLD2]>;
// ...with double-spaced registers (not used for codegen):
def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">;
@@ -1540,7 +1553,7 @@ class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
(ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu,
"vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Inst{4} = 0;
let DecoderMethod = "DecodeVLD3DupInstruction";
}
@@ -1553,9 +1566,9 @@ def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>;
def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>;
def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>;
-def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
-def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
-def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
+def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
+def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>, Sched<[WriteVLD2]>;
// VLD4DUP : Vector Load (single 4-element structure to all lanes)
class VLD4DUP<bits<4> op7_4, string Dt>
@@ -1572,9 +1585,9 @@ def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">;
def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">;
def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; }
-def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>;
-def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>;
-def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>;
+def VLD4DUPd8Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPd16Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
+def VLD4DUPd32Pseudo : VLDQQPseudo<IIC_VLD4dup>, Sched<[WriteVLD2]>;
// ...with double-spaced registers (not used for codegen):
def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">;
@@ -1587,7 +1600,7 @@ class VLD4DUPWB<bits<4> op7_4, string Dt>
(outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
(ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu,
"vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD4DupInstruction";
}
@@ -1600,9 +1613,9 @@ def VLD4DUPq8_UPD : VLD4DUPWB<{0,0,1,0}, "8">;
def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">;
def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; }
-def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
-def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
-def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>;
+def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
+def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo<IIC_VLD4dupu>, Sched<[WriteVLD2]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1649,14 +1662,14 @@ class VSTQQQQWBPseudo<InstrItinClass itin>
// VST1 : Vector Store (multiple single elements)
class VST1D<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd),
- IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
+ IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd),
- IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
+ IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST2]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1677,7 +1690,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST1]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1686,7 +1699,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd),
IIC_VLD1u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST1]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1695,7 +1708,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1704,7 +1717,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd),
IIC_VLD1x2u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1724,7 +1737,7 @@ defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b00, 0b0110, op7_4, (outs),
(ins AddrMode:$Rn, VecListThreeD:$Vd),
- IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
+ IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1733,7 +1746,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST3]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1742,7 +1755,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
IIC_VLD1x3u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST3]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1758,16 +1771,16 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
-def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>;
-def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>;
-def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
+def VST1d64TPseudo : VSTQQPseudo<IIC_VST1x3>, Sched<[WriteVST3]>;
+def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
+def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>, Sched<[WriteVST3]>;
// ...with 4 registers
class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
: NLdSt<0, 0b00, 0b0010, op7_4, (outs),
(ins AddrMode:$Rn, VecListFourD:$Vd),
IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "",
- []> {
+ []>, Sched<[WriteVST4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1776,7 +1789,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
"vst1", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
@@ -1785,7 +1798,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
IIC_VLD1x4u,
"vst1", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST1Instruction";
}
@@ -1801,9 +1814,9 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
-def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>;
-def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>;
-def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
+def VST1d64QPseudo : VSTQQPseudo<IIC_VST1x4>, Sched<[WriteVST4]>;
+def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
+def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>, Sched<[WriteVST4]>;
// VST2 : Vector Store (multiple 2-element structures)
class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
@@ -1816,22 +1829,22 @@ class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
}
def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVST2]>;
def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVST2]>;
def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2,
- addrmode6align64or128>;
+ addrmode6align64or128>, Sched<[WriteVST2]>;
def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVST4]>;
def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVST4]>;
def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2,
- addrmode6align64or128or256>;
+ addrmode6align64or128or256>, Sched<[WriteVST4]>;
-def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>;
-def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
-def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
+def VST2q8Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>;
+def VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>;
+def VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>, Sched<[WriteVST4]>;
// ...with address register writeback:
multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
@@ -1839,7 +1852,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
@@ -1847,7 +1860,7 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST2]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
}
@@ -1856,7 +1869,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
(ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn!",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
@@ -1865,7 +1878,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
(ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
IIC_VLD1u,
"vst2", Dt, "$Vd, $Rn, $Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST2Instruction";
}
@@ -1882,12 +1895,12 @@ defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>;
defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>;
defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>;
-def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
-def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
-def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>;
-def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
-def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
-def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
+def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q8PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
+def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>, Sched<[WriteVST4]>;
// ...with double-spaced registers
def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2,
@@ -1907,7 +1920,7 @@ defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced,
class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3,
- "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
+ "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []>, Sched<[WriteVST3]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
@@ -1917,9 +1930,9 @@ def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">;
def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">;
def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">;
-def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>;
-def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>;
-def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>;
+def VST3d8Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3d16Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3d32Pseudo : VSTQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
// ...with address register writeback:
class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1927,7 +1940,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm,
DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u,
"vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST3]> {
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLDST3Instruction";
}
@@ -1936,9 +1949,9 @@ def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">;
def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">;
def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">;
-def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
-def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
-def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>;
+def VST3d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
// ...with double-spaced registers:
def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">;
@@ -1948,25 +1961,25 @@ def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">;
def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">;
def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">;
-def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
// ...alternate versions to be allocated odd register numbers:
-def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>;
-def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>;
-def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>;
+def VST3q8oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3q16oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
+def VST3q32oddPseudo : VSTQQQQPseudo<IIC_VST3>, Sched<[WriteVST3]>;
-def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
-def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>;
+def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
+def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST3u>, Sched<[WriteVST3]>;
// VST4 : Vector Store (multiple 4-element structures)
class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4),
IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn",
- "", []> {
+ "", []>, Sched<[WriteVST4]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
@@ -1976,9 +1989,9 @@ def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">;
def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">;
def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">;
-def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>;
-def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>;
-def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>;
+def VST4d8Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4d16Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4d32Pseudo : VSTQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
// ...with address register writeback:
class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1986,7 +1999,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm,
DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u,
"vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
- "$Rn.addr = $wb", []> {
+ "$Rn.addr = $wb", []>, Sched<[WriteVST4]> {
let Inst{5-4} = Rn{5-4};
let DecoderMethod = "DecodeVLDST4Instruction";
}
@@ -1995,9 +2008,9 @@ def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">;
def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">;
def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">;
-def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
-def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
-def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>;
+def VST4d8Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
// ...with double-spaced registers:
def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">;
@@ -2007,18 +2020,18 @@ def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">;
def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">;
def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">;
-def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q8Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
// ...alternate versions to be allocated odd register numbers:
-def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>;
-def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>;
-def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>;
+def VST4q8oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4q16oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
+def VST4q32oddPseudo : VSTQQQQPseudo<IIC_VST4>, Sched<[WriteVST4]>;
-def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
-def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>;
+def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo<IIC_VST4u>, Sched<[WriteVST4]>;
} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
@@ -2052,12 +2065,13 @@ class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
: NLdStLn<1, 0b00, op11_8, op7_4, (outs),
(ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane),
IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
- [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> {
+ [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]>,
+ Sched<[WriteVST1]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVST1LN";
}
class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
- : VSTQLNPseudo<IIC_VST1ln> {
+ : VSTQLNPseudo<IIC_VST1ln>, Sched<[WriteVST1]> {
let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
addrmode6:$addr)];
}
@@ -2096,11 +2110,12 @@ class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
"\\{$Vd[$lane]\\}, $Rn$Rm",
"$Rn.addr = $wb",
[(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
- AdrMode:$Rn, am6offset:$Rm))]> {
+ AdrMode:$Rn, am6offset:$Rm))]>,
+ Sched<[WriteVST1]> {
let DecoderMethod = "DecodeVST1LN";
}
class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
- : VSTQLNWBPseudo<IIC_VST1lnu> {
+ : VSTQLNWBPseudo<IIC_VST1lnu>, Sched<[WriteVST1]> {
let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
addrmode6:$addr, am6offset:$offset))];
}
@@ -2131,7 +2146,7 @@ class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdStLn<1, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane),
IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn",
- "", []> {
+ "", []>, Sched<[WriteVST1]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVST2LN";
@@ -2147,9 +2162,9 @@ def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>;
-def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>;
-def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>;
+def VST2LNd8Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
+def VST2LNd16Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
+def VST2LNd32Pseudo : VSTQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
// ...with double-spaced registers:
def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> {
@@ -2161,8 +2176,8 @@ def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> {
let Inst{4} = Rn{4};
}
-def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
-def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>;
+def VST2LNq16Pseudo : VSTQQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
+def VST2LNq32Pseudo : VSTQQLNPseudo<IIC_VST2ln>, Sched<[WriteVST1]>;
// ...with address register writeback:
class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2185,9 +2200,9 @@ def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
-def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
-def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
+def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
+def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -2196,15 +2211,16 @@ def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> {
let Inst{7} = lane{0};
}
-def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
-def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>;
+def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
+def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST2lnu>, Sched<[WriteVST1]>;
// VST3LN : Vector Store (single 3-element structure from one lane)
class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdStLn<1, 0b00, op11_8, op7_4, (outs),
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3,
nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
- "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
+ "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []>,
+ Sched<[WriteVST2]> {
let Rm = 0b1111;
let DecoderMethod = "DecodeVST3LN";
}
@@ -2219,9 +2235,9 @@ def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
-def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
-def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>;
+def VST3LNd8Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>;
+def VST3LNd16Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>;
+def VST3LNd32Pseudo : VSTQQLNPseudo<IIC_VST3ln>, Sched<[WriteVST2]>;
// ...with double-spaced registers:
def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> {
@@ -2255,9 +2271,9 @@ def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
-def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
-def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
+def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
+def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -2266,8 +2282,8 @@ def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> {
let Inst{7} = lane{0};
}
-def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
-def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>;
+def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
+def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST3lnu>, Sched<[WriteVST2]>;
// VST4LN : Vector Store (single 4-element structure from one lane)
class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2275,7 +2291,7 @@ class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4,
nohash_imm:$lane), IIC_VST4ln, "vst4", Dt,
"\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn",
- "", []> {
+ "", []>, Sched<[WriteVST2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVST4LN";
@@ -2292,9 +2308,9 @@ def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
-def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
-def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>;
+def VST4LNd8Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
+def VST4LNd16Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
+def VST4LNd32Pseudo : VSTQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
// ...with double-spaced registers:
def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> {
@@ -2305,8 +2321,8 @@ def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
-def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>;
+def VST4LNq16Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
+def VST4LNq32Pseudo : VSTQQQQLNPseudo<IIC_VST4ln>, Sched<[WriteVST2]>;
// ...with address register writeback:
class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2331,9 +2347,9 @@ def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
-def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
-def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
+def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
+def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
def VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
let Inst{7-6} = lane{1-0};
@@ -2343,8 +2359,8 @@ def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
let Inst{5} = Rn{5};
}
-def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
-def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>;
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo<IIC_VST4lnu>, Sched<[WriteVST2]>;
} // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1
@@ -5550,8 +5566,7 @@ defm VSRI : N2VShInsR_QHSD<1, 1, 0b0100, 1, "vsri">;
// VABS : Vector Absolute Value
defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0,
- IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
- int_arm_neon_vabs>;
+ IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s", abs>;
def VABSfd : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
"vabs", "f32",
v2f32, v2f32, fabs>;
@@ -5567,29 +5582,6 @@ def VABShq : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0,
v8f16, v8f16, fabs>,
Requires<[HasNEON, HasFullFP16]>;
-def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))),
- (v2i32 (bitconvert (v8i8 (add DPR:$src,
- (NEONvshrs DPR:$src, (i32 7))))))),
- (VABSv8i8 DPR:$src)>;
-def : Pat<(xor (v2i32 (bitconvert (v4i16 (NEONvshrs DPR:$src, (i32 15))))),
- (v2i32 (bitconvert (v4i16 (add DPR:$src,
- (NEONvshrs DPR:$src, (i32 15))))))),
- (VABSv4i16 DPR:$src)>;
-def : Pat<(xor (v2i32 (NEONvshrs DPR:$src, (i32 31))),
- (v2i32 (add DPR:$src, (NEONvshrs DPR:$src, (i32 31))))),
- (VABSv2i32 DPR:$src)>;
-def : Pat<(xor (v4i32 (bitconvert (v16i8 (NEONvshrs QPR:$src, (i32 7))))),
- (v4i32 (bitconvert (v16i8 (add QPR:$src,
- (NEONvshrs QPR:$src, (i32 7))))))),
- (VABSv16i8 QPR:$src)>;
-def : Pat<(xor (v4i32 (bitconvert (v8i16 (NEONvshrs QPR:$src, (i32 15))))),
- (v4i32 (bitconvert (v8i16 (add QPR:$src,
- (NEONvshrs QPR:$src, (i32 15))))))),
- (VABSv8i16 QPR:$src)>;
-def : Pat<(xor (v4i32 (NEONvshrs QPR:$src, (i32 31))),
- (v4i32 (add QPR:$src, (NEONvshrs QPR:$src, (i32 31))))),
- (VABSv4i32 QPR:$src)>;
-
// VQABS : Vector Saturating Absolute Value
defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
@@ -6443,7 +6435,8 @@ def VTBL1
: N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
(ins VecListOneD:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
"vtbl", "8", "$Vd, $Vn, $Vm", "",
- [(set DPR:$Vd, (v8i8 (int_arm_neon_vtbl1 VecListOneD:$Vn, DPR:$Vm)))]>;
+ [(set DPR:$Vd, (v8i8 (NEONvtbl1 VecListOneD:$Vn, DPR:$Vm)))]>;
+
let hasExtraSrcRegAllocReq = 1 in {
def VTBL2
: N3V<1,1,0b11,0b1001,0,0, (outs DPR:$Vd),
@@ -6498,6 +6491,49 @@ def VTBX4Pseudo
IIC_VTBX4, "$orig = $dst", []>;
} // DecoderMethod = "DecodeTBLInstruction"
+def : Pat<(v8i8 (NEONvtbl2 v8i8:$Vn0, v8i8:$Vn1, v8i8:$Vm)),
+ (v8i8 (VTBL2 (REG_SEQUENCE DPair, v8i8:$Vn0, dsub_0,
+ v8i8:$Vn1, dsub_1),
+ v8i8:$Vm))>;
+def : Pat<(v8i8 (int_arm_neon_vtbx2 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1,
+ v8i8:$Vm)),
+ (v8i8 (VTBX2 v8i8:$orig,
+ (REG_SEQUENCE DPair, v8i8:$Vn0, dsub_0,
+ v8i8:$Vn1, dsub_1),
+ v8i8:$Vm))>;
+
+def : Pat<(v8i8 (int_arm_neon_vtbl3 v8i8:$Vn0, v8i8:$Vn1,
+ v8i8:$Vn2, v8i8:$Vm)),
+ (v8i8 (VTBL3Pseudo (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0,
+ v8i8:$Vn1, dsub_1,
+ v8i8:$Vn2, dsub_2,
+ (v8i8 (IMPLICIT_DEF)), dsub_3),
+ v8i8:$Vm))>;
+def : Pat<(v8i8 (int_arm_neon_vtbx3 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1,
+ v8i8:$Vn2, v8i8:$Vm)),
+ (v8i8 (VTBX3Pseudo v8i8:$orig,
+ (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0,
+ v8i8:$Vn1, dsub_1,
+ v8i8:$Vn2, dsub_2,
+ (v8i8 (IMPLICIT_DEF)), dsub_3),
+ v8i8:$Vm))>;
+
+def : Pat<(v8i8 (int_arm_neon_vtbl4 v8i8:$Vn0, v8i8:$Vn1,
+ v8i8:$Vn2, v8i8:$Vn3, v8i8:$Vm)),
+ (v8i8 (VTBL4Pseudo (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0,
+ v8i8:$Vn1, dsub_1,
+ v8i8:$Vn2, dsub_2,
+ v8i8:$Vn3, dsub_3),
+ v8i8:$Vm))>;
+def : Pat<(v8i8 (int_arm_neon_vtbx4 v8i8:$orig, v8i8:$Vn0, v8i8:$Vn1,
+ v8i8:$Vn2, v8i8:$Vn3, v8i8:$Vm)),
+ (v8i8 (VTBX4Pseudo v8i8:$orig,
+ (REG_SEQUENCE QQPR, v8i8:$Vn0, dsub_0,
+ v8i8:$Vn1, dsub_1,
+ v8i8:$Vn2, dsub_2,
+ v8i8:$Vn3, dsub_3),
+ v8i8:$Vm))>;
+
// VRINT : Vector Rounding
multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
@@ -7139,6 +7175,17 @@ let Predicates = [IsBE] in {
(f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
}
+def : Pat<(v2i64 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v4i32 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v8i16 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v16i8 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+
//===----------------------------------------------------------------------===//
// Assembler aliases
//
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
index a681f64..891a8f4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -19,7 +19,7 @@ def imm_sr_XFORM: SDNodeXForm<imm, [{
unsigned Imm = N->getZExtValue();
return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
}]>;
-def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def ThumbSRImmAsmOperand: ImmAsmOperand<1,32> { let Name = "ImmThumbSR"; }
def imm_sr : Operand<i32>, PatLeaf<(imm), [{
uint64_t Imm = N->getZExtValue();
return Imm > 0 && Imm <= 32;
@@ -28,22 +28,31 @@ def imm_sr : Operand<i32>, PatLeaf<(imm), [{
let ParserMatchClass = ThumbSRImmAsmOperand;
}
-def imm_comp_XFORM : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
- MVT::i32);
-}]>;
-
def imm0_7_neg : PatLeaf<(i32 imm), [{
return (uint32_t)-N->getZExtValue() < 8;
}], imm_neg_XFORM>;
+def ThumbModImmNeg1_7AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg1_7"; }
+def mod_imm1_7_neg : Operand<i32>, PatLeaf<(imm), [{
+ unsigned Value = -(unsigned)N->getZExtValue();
+ return 0 < Value && Value < 8;
+ }], imm_neg_XFORM> {
+ let ParserMatchClass = ThumbModImmNeg1_7AsmOperand;
+}
+
+def ThumbModImmNeg8_255AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg8_255"; }
+def mod_imm8_255_neg : Operand<i32>, PatLeaf<(imm), [{
+ unsigned Value = -(unsigned)N->getZExtValue();
+ return 7 < Value && Value < 256;
+ }], imm_neg_XFORM> {
+ let ParserMatchClass = ThumbModImmNeg8_255AsmOperand;
+}
+
+
def imm0_255_comp : PatLeaf<(i32 imm), [{
return ~((uint32_t)N->getZExtValue()) < 256;
}]>;
-def imm8_255 : ImmLeaf<i32, [{
- return Imm >= 8 && Imm < 256;
-}]>;
def imm8_255_neg : PatLeaf<(i32 imm), [{
unsigned Val = -N->getZExtValue();
return Val >= 8 && Val < 256;
@@ -275,8 +284,8 @@ def tADJCALLSTACKUP :
Requires<[IsThumb, IsThumb1Only]>;
def tADJCALLSTACKDOWN :
- PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
- [(ARMcallseq_start imm:$amt)]>,
+ PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary,
+ [(ARMcallseq_start imm:$amt, imm:$amt2)]>,
Requires<[IsThumb, IsThumb1Only]>;
}
@@ -407,9 +416,9 @@ def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
let DecoderMethod = "DecodeThumbAddSPImm";
}
-def : tInstAlias<"add${p} sp, $imm",
+def : tInstSubst<"add${p} sp, $imm",
(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
-def : tInstAlias<"add${p} sp, sp, $imm",
+def : tInstSubst<"add${p} sp, sp, $imm",
(tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
// Can optionally specify SP as a three-operand instruction.
@@ -910,7 +919,7 @@ let isAdd = 1 in {
def tADC : // A8.6.2
T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
"adc", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+ []>, Sched<[WriteALU]>;
// Add immediate
def tADDi3 : // A8.6.4 T1
@@ -938,6 +947,43 @@ let isAdd = 1 in {
"add", "\t$Rd, $Rn, $Rm",
[(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+ /// Similar to the above except these set the 's' bit so the
+ /// instruction modifies the CPSR register.
+ ///
+ /// These opcodes will be converted to the real non-S opcodes by
+  /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+ let hasPostISelHook = 1, Defs = [CPSR] in {
+ let isCommutable = 1, Uses = [CPSR] in
+ def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm,
+ CPSR))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tADDSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rm,
+ imm0_7:$imm3))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tADDSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rdn, CPSR, (ARMaddc tGPR:$Rn,
+ imm8_255:$imm8))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ let isCommutable = 1 in
+ def tADDSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rn,
+ tGPR:$Rm))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+ }
+
let hasSideEffects = 0 in
def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
"add", "\t$Rdn, $Rm", []>,
@@ -951,6 +997,12 @@ let isAdd = 1 in {
}
}
+def : tInstSubst<"sub${s}${p} $rd, $rn, $imm",
+ (tADDi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
+def : tInstSubst<"sub${s}${p} $rdn, $imm",
+ (tADDi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
+
+
// AND register
let isCommutable = 1 in
def tAND : // A8.6.12
@@ -1197,7 +1249,7 @@ def tSBC : // A8.6.151
T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
IIC_iALUr,
"sbc", "\t$Rdn, $Rm",
- [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+ []>,
Sched<[WriteALU]>;
// Subtract immediate
@@ -1218,6 +1270,14 @@ def tSUBi8 : // A8.6.210 T2
[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
Sched<[WriteALU]>;
+def : tInstSubst<"add${s}${p} $rd, $rn, $imm",
+ (tSUBi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
+
+
+def : tInstSubst<"add${s}${p} $rdn, $imm",
+ (tSUBi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
+
+
// Subtract register
def tSUBrr : // A8.6.212
T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
@@ -1226,6 +1286,42 @@ def tSUBrr : // A8.6.212
[(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
Sched<[WriteALU]>;
+/// Similar to the above except these set the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+ let Uses = [CPSR] in
+ def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm,
+ CPSR))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tSUBSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rm,
+ imm0_7:$imm3))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tSUBSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
+ 2, IIC_iALUi,
+ [(set tGPR:$Rdn, CPSR, (ARMsubc tGPR:$Rn,
+ imm8_255:$imm8))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+
+ def tSUBSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+ 2, IIC_iALUr,
+ [(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rn,
+ tGPR:$Rm))]>,
+ Requires<[IsThumb1Only]>,
+ Sched<[WriteALU]>;
+}
+
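The subtract side mirrors the add sketch above, with one wrinkle worth spelling out: on ARM the C flag after a SUBS is the inverse of the borrow, which is what ARMsubc/ARMsube model. A hedged sketch in the same style (illustrative only):

    #include <cstdint>

    // Illustrative: a 64-bit subtract from 32-bit halves, as a
    // tSUBSrr/tSBCS pair would compute it.
    uint64_t sub64(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
      uint32_t Lo = ALo - BLo;            // tSUBSrr: sets C = !borrow
      uint32_t Borrow = ALo < BLo;        // borrow out of the low word
      uint32_t Hi = AHi - BHi - Borrow;   // tSBCS: consumes the borrow
      return ((uint64_t)Hi << 32) | Lo;
    }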
// Sign-extend byte
def tSXTB : // A8.6.222
T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
@@ -1317,14 +1413,15 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
// and make use of the same compressed jump table format as Thumb-2.
-let Size = 2 in {
+let Size = 2, isBranch = 1, isTerminator = 1, isBarrier = 1,
+ isIndirectBranch = 1 in {
def tTBB_JT : tPseudoInst<(outs),
- (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
- Sched<[WriteBr]>;
+ (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
+ IIC_Br, []>, Sched<[WriteBr]>;
def tTBH_JT : tPseudoInst<(outs),
- (ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
- Sched<[WriteBr]>;
+ (ins tGPRwithpc:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0,
+ IIC_Br, []>, Sched<[WriteBr]>;
}
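For reference, the compressed jump-table dispatch these pseudos synthesize reads a byte-sized (TBB) or halfword-sized (TBH) forward offset and branches PC-relative, which is why they now carry the full set of branch/terminator/barrier flags. A hedged sketch of the addressing math (illustrative only; tbbTarget is a hypothetical helper, not backend code):

    #include <cstdint>

    // TBB [Rn, Rm]: table of byte offsets, doubled and added to the PC.
    // TBH is identical with a uint16_t table and a scaled index.
    uint32_t tbbTarget(uint32_t PC, const uint8_t *Table, uint32_t Index) {
      return PC + 2 * Table[Index];
    }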
//===----------------------------------------------------------------------===//
@@ -1386,22 +1483,6 @@ def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
(tCMPr tGPR:$Rn, tGPR:$Rm)>;
-// Add with carry
-def : T1Pat<(addc tGPR:$lhs, imm0_7:$rhs),
- (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
-def : T1Pat<(addc tGPR:$lhs, imm8_255:$rhs),
- (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
-def : T1Pat<(addc tGPR:$lhs, tGPR:$rhs),
- (tADDrr tGPR:$lhs, tGPR:$rhs)>;
-
-// Subtract with carry
-def : T1Pat<(addc tGPR:$lhs, imm0_7_neg:$rhs),
- (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
-def : T1Pat<(addc tGPR:$lhs, imm8_255_neg:$rhs),
- (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
-def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs),
- (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
-
// Bswap 16 with load/store
def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
(tREV16 (tLDRHi t_addrmode_is2:$addr))>;
@@ -1477,7 +1558,7 @@ def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>;
// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
// different to how ISel expects them for a post-inc load, so use a pseudo
// and expand it just after ISel.
-let usesCustomInserter = 1,
+let usesCustomInserter = 1, mayLoad = 1,
Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
def tLDR_postidx: tPseudoInst<(outs rGPR:$Rt, rGPR:$Rn_wb),
(ins rGPR:$Rn, pred:$p),
@@ -1547,7 +1628,7 @@ def : T1Pat<(i32 thumb_immshifted:$src),
(thumb_immshifted_shamt imm:$src))>;
def : T1Pat<(i32 imm0_255_comp:$src),
- (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+ (tMVN (tMOVi8 (imm_not_XFORM imm:$src)))>;
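The renamed imm_not_XFORM transform underpins the pattern just above: a constant whose bitwise complement fits in eight bits is materialized as a MOVS of the complement followed by an MVNS. A hedged sketch (illustrative only; the helper name is hypothetical):

    #include <cstdint>

    // Illustrative: build e.g. 0xFFFFFF00 from its 8-bit complement 0xFF.
    uint32_t materializeComplemented(uint8_t CompImm) {
      uint32_t Tmp = CompImm; // tMOVi8: move the complemented immediate
      return ~Tmp;            // tMVN:   invert to recover the constant
    }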
def : T1Pat<(i32 imm256_510:$src),
(tADDi8 (tMOVi8 255),
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 603d664..42eac12 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -76,7 +76,11 @@ def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
// t2_so_imm - Match a 32-bit immediate operand, which is an
// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
// immediate splatted into multiple bytes of the word.
-def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; }
+def t2_so_imm_asmoperand : AsmOperandClass {
+ let Name = "T2SOImm";
+ let RenderMethod = "addImmOperands";
+
+}
def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
return ARM_AM::getT2SOImmVal(Imm) != -1;
}]> {
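Expanding on the comment above, the encodable forms are: a plain 8-bit value, one of the byte-splat patterns 0x00XY00XY / 0xXY00XY00 / 0xXYXYXYXY, or an 8-bit value with its top bit set rotated right by 8-31 bits. A hedged, self-contained re-implementation of the check that ARM_AM::getT2SOImmVal performs (a sketch for illustration, not the LLVM source):

    #include <cstdint>

    // Illustrative re-implementation of the Thumb-2 modified-immediate test.
    bool isT2SOImm(uint32_t V) {
      uint32_t B0 = V & 0xFF, B1 = (V >> 8) & 0xFF;
      if (V == B0)                                         return true; // 0x000000XY
      if (V == ((B0 << 16) | B0))                          return true; // 0x00XY00XY
      if (V == ((B1 << 24) | (B1 << 8)))                   return true; // 0xXY00XY00
      if (V == ((B0 << 24) | (B0 << 16) | (B0 << 8) | B0)) return true; // 0xXYXYXYXY
      // Rotated form: rotating V left by 8..31 must leave an 8-bit value
      // whose top bit is set (the encoding keeps bit 7 at 1 for uniqueness).
      for (unsigned Rot = 8; Rot < 32; ++Rot) {
        uint32_t U = (V << Rot) | (V >> (32 - Rot));
        if ((U & ~0xFFu) == 0 && (U & 0x80))
          return true;
      }
      return false;
    }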
@@ -110,15 +114,14 @@ def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
-def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
- int64_t Value = -(int)N->getZExtValue();
- return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+def t2_so_imm_neg : Operand<i32>, ImmLeaf<i32, [{
+ return Imm && ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
}], t2_so_imm_neg_XFORM> {
let ParserMatchClass = t2_so_imm_neg_asmoperand;
}
-/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
-def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
+def imm0_4095_asmoperand: ImmAsmOperand<0,4095> { let Name = "Imm0_4095"; }
def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 4096;
}]> {
@@ -139,7 +142,7 @@ def imm1_255_neg : PatLeaf<(i32 imm), [{
def imm0_255_not : PatLeaf<(i32 imm), [{
return (uint32_t)(~N->getZExtValue()) < 255;
-}], imm_comp_XFORM>;
+}], imm_not_XFORM>;
def lo5AllOne : PatLeaf<(i32 imm), [{
// Returns true if all low 5-bits are 1.
@@ -538,7 +541,8 @@ class T2FourReg<dag oops, dag iops, InstrItinClass itin,
class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
string opc, list<dag> pattern>
: T2I<(outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
- opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern> {
+ opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern>,
+ Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rn;
@@ -556,7 +560,8 @@ class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4, string opc>
: T2I<(outs rGPR:$RdLo, rGPR:$RdHi),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
opc, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- RegConstraint<"$RLo = $RdLo, $RHi = $RdHi"> {
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> {
bits<4> RdLo;
bits<4> RdHi;
bits<4> Rn;
@@ -977,7 +982,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
PatFrag opnode> {
def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]>,
+ Sched<[WriteLd]> {
bits<4> Rt;
bits<17> addr;
let Inst{31-25} = 0b1111100;
@@ -993,7 +999,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
}
def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
opc, "\t$Rt, $addr",
- [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]>,
+ Sched<[WriteLd]> {
bits<4> Rt;
bits<13> addr;
let Inst{31-27} = 0b11111;
@@ -1015,7 +1022,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
}
def s : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
opc, ".w\t$Rt, $addr",
- [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]>,
+ Sched<[WriteLd]> {
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = signed;
@@ -1039,7 +1047,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
// from the PC.
def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+ [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]>,
+ Sched<[WriteLd]> {
let isReMaterializable = 1;
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
@@ -1065,7 +1074,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
PatFrag opnode> {
def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_imm12:$addr)]>,
+ Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0001;
let Inst{22-21} = opcod;
@@ -1082,7 +1092,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
}
def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
opc, "\t$Rt, $addr",
- [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_negimm8:$addr)]>,
+ Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0000;
let Inst{22-21} = opcod;
@@ -1102,7 +1113,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
}
def s : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
opc, ".w\t$Rt, $addr",
- [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_so_reg:$addr)]>,
+ Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0000;
let Inst{22-21} = opcod;
@@ -1121,28 +1133,10 @@ multiclass T2I_st<bits<2> opcod, string opc,
/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
-class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
- : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
- opc, ".w\t$Rd, $Rm$rot",
- [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
- Requires<[IsThumb2]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
-
- bits<2> rot;
- let Inst{5-4} = rot{1-0}; // rotate
-}
-
-// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
-class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
- : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
- IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
- [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
+class T2I_ext_rrot_base<bits<3> opcod, dag oops, dag iops,
+ string opc, string oprs,
+ list<dag> pattern>
+ : T2TwoReg<oops, iops, IIC_iEXTr, opc, oprs, pattern> {
bits<2> rot;
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0100;
@@ -1150,46 +1144,34 @@ class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
let Inst{19-16} = 0b1111; // Rn
let Inst{15-12} = 0b1111;
let Inst{7} = 1;
- let Inst{5-4} = rot;
-}
-
-// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
-// supported yet.
-class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
- : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
- opc, "\t$Rd, $Rm$rot", []>,
- Requires<[IsThumb2, HasT2ExtractPack]> {
- bits<2> rot;
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = rot;
-}
+ let Inst{5-4} = rot; // rotate
+}
+
+class T2I_ext_rrot<bits<3> opcod, string opc>
+ : T2I_ext_rrot_base<opcod,
+ (outs rGPR:$Rd),
+ (ins rGPR:$Rm, rot_imm:$rot),
+ opc, ".w\t$Rd, $Rm$rot", []>,
+ Requires<[IsThumb2]>,
+ Sched<[WriteALU, ReadALU]>;
+
+// UXTB16, SXTB16 - Requires HasDSP, does not need the .w qualifier.
+class T2I_ext_rrot_xtb16<bits<3> opcod, string opc>
+ : T2I_ext_rrot_base<opcod,
+ (outs rGPR:$Rd),
+ (ins rGPR:$Rm, rot_imm:$rot),
+ opc, "\t$Rd, $Rm$rot", []>,
+ Requires<[HasDSP, IsThumb2]>,
+ Sched<[WriteALU, ReadALU]>;
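The shared semantics behind these classes: the source register is first rotated right by 0, 8, 16, or 24 bits (rot_imm encodes the amount divided by 8), then the low byte or halfword is sign- or zero-extended; the accumulate forms defined next add a second register on top. A hedged sketch (illustrative only):

    #include <cstdint>

    uint32_t ror32(uint32_t V, unsigned R) {
      return R ? (V >> R) | (V << (32 - R)) : V;
    }

    // sxtb/uxth with a rot_imm of 0..3, i.e. a rotation of 0/8/16/24.
    int32_t  sxtb(uint32_t Rm, unsigned Rot) { return (int8_t)ror32(Rm, Rot * 8); }
    uint32_t uxth(uint32_t Rm, unsigned Rot) { return (uint16_t)ror32(Rm, Rot * 8); }

    // The T2I_exta_rrot accumulate form adds Rn to the extended value.
    int32_t sxtab(uint32_t Rn, uint32_t Rm, unsigned Rot) {
      return (int32_t)Rn + sxtb(Rm, Rot);
    }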
/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
-class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+class T2I_exta_rrot<bits<3> opcod, string opc>
: T2ThreeReg<(outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
- IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
- bits<2> rot;
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = rot;
-}
-
-class T2I_exta_rrot_np<bits<3> opcod, string opc>
- : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
+ Requires<[HasDSP, IsThumb2]>,
+ Sched<[WriteALU, ReadALU]> {
bits<2> rot;
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0100;
@@ -1279,7 +1261,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
(ins t2addrmode_imm8s4:$addr),
- IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
+ IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>,
+ Sched<[WriteLd]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
// zextload i1 -> zextload i8
@@ -1333,17 +1316,20 @@ let mayLoad = 1, hasSideEffects = 0 in {
def t2LDR_PRE : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
- "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+ "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
- "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+ "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
@@ -1353,41 +1339,45 @@ def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+ "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
"ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
- []>;
+ []>, Sched<[WriteLd]>;
def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
"ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
- []>;
+ []>, Sched<[WriteLd]>;
def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+ "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+ Sched<[WriteLd]>;
} // mayLoad = 1, hasSideEffects = 0
// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
: T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
- "\t$Rt, $addr", []> {
+ "\t$Rt, $addr", []>, Sched<[WriteLd]> {
bits<4> Rt;
bits<13> addr;
let Inst{31-27} = 0b11111;
@@ -1431,11 +1421,14 @@ class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
}
def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
- (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+ (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>,
+ Sched<[WriteLd]>;
def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
- (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+ (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>,
+ Sched<[WriteLd]>;
def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
- (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+ (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>,
+ Sched<[WriteLd]>;
// Store
defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, store>;
@@ -1448,7 +1441,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
- IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
+ IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>,
+ Sched<[WriteST]>;
// Indexed stores
@@ -1457,19 +1451,22 @@ def t2STR_PRE : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
(ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
"str", "\t$Rt, $addr!",
- "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+ Sched<[WriteST]>;
def t2STRH_PRE : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
"strh", "\t$Rt, $addr!",
- "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+ Sched<[WriteST]>;
def t2STRB_PRE : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
"strb", "\t$Rt, $addr!",
- "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+ Sched<[WriteST]>;
} // mayStore = 1, hasSideEffects = 0
def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
@@ -1480,7 +1477,8 @@ def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
"$Rn = $Rn_wb,@earlyclobber $Rn_wb",
[(set GPRnopc:$Rn_wb,
(post_store GPRnopc:$Rt, addr_offset_none:$Rn,
- t2am_imm8_offset:$offset))]>;
+ t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, addr_offset_none:$Rn,
@@ -1490,7 +1488,8 @@ def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
"$Rn = $Rn_wb,@earlyclobber $Rn_wb",
[(set GPRnopc:$Rn_wb,
(post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
- t2am_imm8_offset:$offset))]>;
+ t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, addr_offset_none:$Rn,
@@ -1500,7 +1499,8 @@ def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
"$Rn = $Rn_wb,@earlyclobber $Rn_wb",
[(set GPRnopc:$Rn_wb,
(post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
- t2am_imm8_offset:$offset))]>;
+ t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
// put the patterns on the instruction definitions directly as ISel wants
@@ -1513,17 +1513,20 @@ def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPRnopc:$Rn_wb,
- (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+ (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPRnopc:$Rn_wb,
- (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+ (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
(ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
4, IIC_iStore_ru,
[(set GPRnopc:$Rn_wb,
- (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+ (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+ Sched<[WriteST]>;
}
// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
@@ -1531,7 +1534,7 @@ def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
// Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
class T2IstT<bits<2> type, string opc, InstrItinClass ii>
: T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
- "\t$Rt, $addr", []> {
+ "\t$Rt, $addr", []>, Sched<[WriteST]> {
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = 0; // not signed
@@ -1557,7 +1560,8 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
let mayLoad = 1 in
def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
- "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+ "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []>,
+ Sched<[WriteLd]> {
let DecoderMethod = "DecodeT2LDRDPreInstruction";
}
@@ -1565,13 +1569,13 @@ let mayLoad = 1 in
def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
- "$addr.base = $wb", []>;
+ "$addr.base = $wb", []>, Sched<[WriteLd]>;
let mayStore = 1 in
def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
- "$addr.base = $wb", []> {
+ "$addr.base = $wb", []>, Sched<[WriteST]> {
let DecoderMethod = "DecodeT2STRDPreInstruction";
}
@@ -1580,12 +1584,13 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
t2am_imm8s4_offset:$imm),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
- "$addr.base = $wb", []>;
+ "$addr.base = $wb", []>, Sched<[WriteST]>;
class T2Istrrel<bits<2> bit54, dag oops, dag iops,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
- asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
+ asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]>,
+ Sched<[WriteST]> {
bits<4> Rt;
bits<4> addr;
@@ -1861,7 +1866,7 @@ defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
//
let hasSideEffects = 0 in
-def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr,
"mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -1870,11 +1875,11 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
let Inst{14-12} = 0b000;
let Inst{7-4} = 0b0000;
}
-def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
pred:$p, zero_reg)>;
-def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
pred:$p, CPSR)>;
-def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
pred:$p, CPSR)>;
// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
@@ -1926,10 +1931,11 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
def : InstAlias<"mov${p} $Rd, $imm",
(t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
- Requires<[IsThumb, HasV8MBaseline]>;
+ Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteALU]>;
def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
- (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+ (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+ Sched<[WriteALU]>;
let Constraints = "$src = $Rd" in {
def t2MOVTi16 : T2I<(outs rGPR:$Rd),
@@ -1969,31 +1975,43 @@ def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
// Sign extenders
-def t2SXTB : T2I_ext_rrot<0b100, "sxtb",
- UnOpFrag<(sext_inreg node:$Src, i8)>>;
-def t2SXTH : T2I_ext_rrot<0b000, "sxth",
- UnOpFrag<(sext_inreg node:$Src, i16)>>;
-def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+def t2SXTB : T2I_ext_rrot<0b100, "sxtb">;
+def t2SXTH : T2I_ext_rrot<0b000, "sxth">;
+def t2SXTB16 : T2I_ext_rrot_xtb16<0b010, "sxtb16">;
+
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab">;
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah">;
+def t2SXTAB16 : T2I_exta_rrot<0b010, "sxtab16">;
+
+def : T2Pat<(sext_inreg (rotr rGPR:$Rn, rot_imm:$rot), i8),
+ (t2SXTB rGPR:$Rn, rot_imm:$rot)>;
+def : T2Pat<(sext_inreg (rotr rGPR:$Rn, rot_imm:$rot), i16),
+ (t2SXTH rGPR:$Rn, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn,
+ (sext_inreg (rotr rGPR:$Rm, rot_imm:$rot), i8)),
+ (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn,
+ (sext_inreg (rotr rGPR:$Rm, rot_imm:$rot), i16)),
+ (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(int_arm_sxtb16 rGPR:$Rn),
+ (t2SXTB16 rGPR:$Rn, 0)>;
+def : Thumb2DSPPat<(int_arm_sxtab16 rGPR:$Rn, rGPR:$Rm),
+ (t2SXTAB16 rGPR:$Rn, rGPR:$Rm, 0)>;
-def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
- BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
- BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
-def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
// A simple right-shift can also be used in most cases (the exception is the
// SXTH operations with a rotate of 24: there the non-contiguous bits are
// relevant).
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(srl rGPR:$Rm, rot_imm:$rot), i8)),
(t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(srl rGPR:$Rm, imm8_or_16:$rot), i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(rotr rGPR:$Rm, (i32 24)), i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
(or (srl rGPR:$Rm, (i32 24)),
(shl rGPR:$Rm, (i32 8))), i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
@@ -2001,12 +2019,19 @@ def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
// Zero extenders
let AddedComplexity = 16 in {
-def t2UXTB : T2I_ext_rrot<0b101, "uxtb",
- UnOpFrag<(and node:$Src, 0x000000FF)>>;
-def t2UXTH : T2I_ext_rrot<0b001, "uxth",
- UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
- UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+def t2UXTB : T2I_ext_rrot<0b101, "uxtb">;
+def t2UXTH : T2I_ext_rrot<0b001, "uxth">;
+def t2UXTB16 : T2I_ext_rrot_xtb16<0b011, "uxtb16">;
+
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x000000FF),
+ (t2UXTB rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x0000FFFF),
+ (t2UXTH rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x00FF00FF),
+ (t2UXTB16 rGPR:$Rm, rot_imm:$rot)>;
+
+def : Thumb2DSPPat<(int_arm_uxtb16 rGPR:$Rm),
+ (t2UXTB16 rGPR:$Rm, 0)>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
// The transformation should probably be done as a combiner action
@@ -2014,23 +2039,29 @@ def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
// eight bits of the source into the lower eight bits of the result.
//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
// (t2UXTB16 rGPR:$Src, 3)>,
-// Requires<[HasT2ExtractPack, IsThumb2]>;
+// Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
(t2UXTB16 rGPR:$Src, 1)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
-def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
- BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
- BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
-def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab">;
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah">;
+def t2UXTAB16 : T2I_exta_rrot<0b011, "uxtab16">;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (rotr rGPR:$Rm, rot_imm:$rot),
+ 0x00FF)),
+ (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (rotr rGPR:$Rm, rot_imm:$rot),
+ 0xFFFF)),
+ (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
0xFF)),
(t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
0xFFFF)),
(t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(int_arm_uxtab16 rGPR:$Rn, rGPR:$Rm),
+ (t2UXTAB16 rGPR:$Rn, rGPR:$Rm, 0)>;
}
@@ -2060,6 +2091,19 @@ defm t2ADC : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
}
+def : t2InstSubst<"adc${s}${p} $rd, $rn, $imm",
+ (t2SBCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"sbc${s}${p} $rd, $rn, $imm",
+ (t2ADCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
+
+def : t2InstSubst<"add${s}${p}.w $rd, $rn, $imm",
+ (t2SUBri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"addw${p} $rd, $rn, $imm",
+ (t2SUBri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
+def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm",
+ (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"subw${p} $rd, $rn, $imm",
+ (t2ADDri12 GPRnopc:$rd, GPR:$rn, t2_so_imm_neg:$imm, pred:$p)>;
// RSB
defm t2RSB : T2I_rbin_irs <0b1110, "rsb", sub>;
@@ -2102,10 +2146,9 @@ def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR),
def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR),
(t2SBCrr rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>;
-// Select Bytes -- for disassembly only
-
def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>,
+ NoItinerary, "sel", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (int_arm_sel GPR:$Rn, GPR:$Rm))]>,
Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
let Inst{26-24} = 0b010;
@@ -2119,9 +2162,7 @@ def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
// A6.3.13, A6.3.14, A6.3.15 Parallel addition and subtraction (signed/unsigned)
// And Miscellaneous operations -- for disassembly only
class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
- list<dag> pat = [/* For disassembly only; pattern left blank */],
- dag iops = (ins rGPR:$Rn, rGPR:$Rm),
- string asm = "\t$Rd, $Rn, $Rm">
+ list<dag> pat, dag iops, string asm>
: T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>,
Requires<[IsThumb2, HasDSP]> {
let Inst{31-27} = 0b11111;
@@ -2139,60 +2180,72 @@ class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc,
let Inst{3-0} = Rm;
}
-// Saturating add/subtract -- for disassembly only
-
-def t2QADD : T2I_pam<0b000, 0b1000, "qadd",
- [(set rGPR:$Rd, (int_arm_qadd rGPR:$Rn, rGPR:$Rm))],
- (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
-def t2QADD16 : T2I_pam<0b001, 0b0001, "qadd16">;
-def t2QADD8 : T2I_pam<0b000, 0b0001, "qadd8">;
-def t2QASX : T2I_pam<0b010, 0b0001, "qasx">;
-def t2QDADD : T2I_pam<0b000, 0b1001, "qdadd", [],
- (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
-def t2QDSUB : T2I_pam<0b000, 0b1011, "qdsub", [],
- (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
-def t2QSAX : T2I_pam<0b110, 0b0001, "qsax">;
-def t2QSUB : T2I_pam<0b000, 0b1010, "qsub",
- [(set rGPR:$Rd, (int_arm_qsub rGPR:$Rn, rGPR:$Rm))],
- (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
-def t2QSUB16 : T2I_pam<0b101, 0b0001, "qsub16">;
-def t2QSUB8 : T2I_pam<0b100, 0b0001, "qsub8">;
-def t2UQADD16 : T2I_pam<0b001, 0b0101, "uqadd16">;
-def t2UQADD8 : T2I_pam<0b000, 0b0101, "uqadd8">;
-def t2UQASX : T2I_pam<0b010, 0b0101, "uqasx">;
-def t2UQSAX : T2I_pam<0b110, 0b0101, "uqsax">;
-def t2UQSUB16 : T2I_pam<0b101, 0b0101, "uqsub16">;
-def t2UQSUB8 : T2I_pam<0b100, 0b0101, "uqsub8">;
-
-// Signed/Unsigned add/subtract -- for disassembly only
-
-def t2SASX : T2I_pam<0b010, 0b0000, "sasx">;
-def t2SADD16 : T2I_pam<0b001, 0b0000, "sadd16">;
-def t2SADD8 : T2I_pam<0b000, 0b0000, "sadd8">;
-def t2SSAX : T2I_pam<0b110, 0b0000, "ssax">;
-def t2SSUB16 : T2I_pam<0b101, 0b0000, "ssub16">;
-def t2SSUB8 : T2I_pam<0b100, 0b0000, "ssub8">;
-def t2UASX : T2I_pam<0b010, 0b0100, "uasx">;
-def t2UADD16 : T2I_pam<0b001, 0b0100, "uadd16">;
-def t2UADD8 : T2I_pam<0b000, 0b0100, "uadd8">;
-def t2USAX : T2I_pam<0b110, 0b0100, "usax">;
-def t2USUB16 : T2I_pam<0b101, 0b0100, "usub16">;
-def t2USUB8 : T2I_pam<0b100, 0b0100, "usub8">;
-
-// Signed/Unsigned halving add/subtract -- for disassembly only
-
-def t2SHASX : T2I_pam<0b010, 0b0010, "shasx">;
-def t2SHADD16 : T2I_pam<0b001, 0b0010, "shadd16">;
-def t2SHADD8 : T2I_pam<0b000, 0b0010, "shadd8">;
-def t2SHSAX : T2I_pam<0b110, 0b0010, "shsax">;
-def t2SHSUB16 : T2I_pam<0b101, 0b0010, "shsub16">;
-def t2SHSUB8 : T2I_pam<0b100, 0b0010, "shsub8">;
-def t2UHASX : T2I_pam<0b010, 0b0110, "uhasx">;
-def t2UHADD16 : T2I_pam<0b001, 0b0110, "uhadd16">;
-def t2UHADD8 : T2I_pam<0b000, 0b0110, "uhadd8">;
-def t2UHSAX : T2I_pam<0b110, 0b0110, "uhsax">;
-def t2UHSUB16 : T2I_pam<0b101, 0b0110, "uhsub16">;
-def t2UHSUB8 : T2I_pam<0b100, 0b0110, "uhsub8">;
+class T2I_pam_intrinsics<bits<3> op22_20, bits<4> op7_4, string opc,
+ Intrinsic intrinsic>
+ : T2I_pam<op22_20, op7_4, opc,
+ [(set rGPR:$Rd, (intrinsic rGPR:$Rn, rGPR:$Rm))],
+ (ins rGPR:$Rn, rGPR:$Rm), "\t$Rd, $Rn, $Rm">;
+
+class T2I_pam_intrinsics_rev<bits<3> op22_20, bits<4> op7_4, string opc>
+ : T2I_pam<op22_20, op7_4, opc, [],
+ (ins rGPR:$Rm, rGPR:$Rn), "\t$Rd, $Rm, $Rn">;
+
+// Saturating add/subtract
+def t2QADD16 : T2I_pam_intrinsics<0b001, 0b0001, "qadd16", int_arm_qadd16>;
+def t2QADD8 : T2I_pam_intrinsics<0b000, 0b0001, "qadd8", int_arm_qadd8>;
+def t2QASX : T2I_pam_intrinsics<0b010, 0b0001, "qasx", int_arm_qasx>;
+def t2UQSUB8 : T2I_pam_intrinsics<0b100, 0b0101, "uqsub8", int_arm_uqsub8>;
+def t2QSAX : T2I_pam_intrinsics<0b110, 0b0001, "qsax", int_arm_qsax>;
+def t2QSUB16 : T2I_pam_intrinsics<0b101, 0b0001, "qsub16", int_arm_qsub16>;
+def t2QSUB8 : T2I_pam_intrinsics<0b100, 0b0001, "qsub8", int_arm_qsub8>;
+def t2UQADD16 : T2I_pam_intrinsics<0b001, 0b0101, "uqadd16", int_arm_uqadd16>;
+def t2UQADD8 : T2I_pam_intrinsics<0b000, 0b0101, "uqadd8", int_arm_uqadd8>;
+def t2UQASX : T2I_pam_intrinsics<0b010, 0b0101, "uqasx", int_arm_uqasx>;
+def t2UQSAX : T2I_pam_intrinsics<0b110, 0b0101, "uqsax", int_arm_uqsax>;
+def t2UQSUB16 : T2I_pam_intrinsics<0b101, 0b0101, "uqsub16", int_arm_uqsub16>;
+def t2QADD : T2I_pam_intrinsics_rev<0b000, 0b1000, "qadd">;
+def t2QSUB : T2I_pam_intrinsics_rev<0b000, 0b1010, "qsub">;
+def t2QDADD : T2I_pam_intrinsics_rev<0b000, 0b1001, "qdadd">;
+def t2QDSUB : T2I_pam_intrinsics_rev<0b000, 0b1011, "qdsub">;
+
+def : Thumb2DSPPat<(int_arm_qadd rGPR:$Rm, rGPR:$Rn),
+ (t2QADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, rGPR:$Rn),
+ (t2QSUB rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(int_arm_qadd (int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+ (t2QDADD rGPR:$Rm, rGPR:$Rn)>;
+def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
+ (t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
+
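The QDADD/QDSUB selection patterns above are worth unpacking: there is no generic "saturating double" node, so the doubling is matched as a saturating add of a value to itself. A hedged sketch of the arithmetic (illustrative only):

    #include <cstdint>
    #include <limits>

    int32_t qadd(int32_t A, int32_t B) {       // int_arm_qadd
      int64_t S = (int64_t)A + B;
      if (S > std::numeric_limits<int32_t>::max()) S = std::numeric_limits<int32_t>::max();
      if (S < std::numeric_limits<int32_t>::min()) S = std::numeric_limits<int32_t>::min();
      return (int32_t)S;
    }

    // qadd(qadd(x, x), y) is exactly what the t2QDADD pattern matches.
    int32_t qdadd(int32_t X, int32_t Y) { return qadd(qadd(X, X), Y); }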
+// Signed/Unsigned add/subtract
+
+def t2SASX : T2I_pam_intrinsics<0b010, 0b0000, "sasx", int_arm_sasx>;
+def t2SADD16 : T2I_pam_intrinsics<0b001, 0b0000, "sadd16", int_arm_sadd16>;
+def t2SADD8 : T2I_pam_intrinsics<0b000, 0b0000, "sadd8", int_arm_sadd8>;
+def t2SSAX : T2I_pam_intrinsics<0b110, 0b0000, "ssax", int_arm_ssax>;
+def t2SSUB16 : T2I_pam_intrinsics<0b101, 0b0000, "ssub16", int_arm_ssub16>;
+def t2SSUB8 : T2I_pam_intrinsics<0b100, 0b0000, "ssub8", int_arm_ssub8>;
+def t2UASX : T2I_pam_intrinsics<0b010, 0b0100, "uasx", int_arm_uasx>;
+def t2UADD16 : T2I_pam_intrinsics<0b001, 0b0100, "uadd16", int_arm_uadd16>;
+def t2UADD8 : T2I_pam_intrinsics<0b000, 0b0100, "uadd8", int_arm_uadd8>;
+def t2USAX : T2I_pam_intrinsics<0b110, 0b0100, "usax", int_arm_usax>;
+def t2USUB16 : T2I_pam_intrinsics<0b101, 0b0100, "usub16", int_arm_usub16>;
+def t2USUB8 : T2I_pam_intrinsics<0b100, 0b0100, "usub8", int_arm_usub8>;
+
+// Signed/Unsigned halving add/subtract
+
+def t2SHASX : T2I_pam_intrinsics<0b010, 0b0010, "shasx", int_arm_shasx>;
+def t2SHADD16 : T2I_pam_intrinsics<0b001, 0b0010, "shadd16", int_arm_shadd16>;
+def t2SHADD8 : T2I_pam_intrinsics<0b000, 0b0010, "shadd8", int_arm_shadd8>;
+def t2SHSAX : T2I_pam_intrinsics<0b110, 0b0010, "shsax", int_arm_shsax>;
+def t2SHSUB16 : T2I_pam_intrinsics<0b101, 0b0010, "shsub16", int_arm_shsub16>;
+def t2SHSUB8 : T2I_pam_intrinsics<0b100, 0b0010, "shsub8", int_arm_shsub8>;
+def t2UHASX : T2I_pam_intrinsics<0b010, 0b0110, "uhasx", int_arm_uhasx>;
+def t2UHADD16 : T2I_pam_intrinsics<0b001, 0b0110, "uhadd16", int_arm_uhadd16>;
+def t2UHADD8 : T2I_pam_intrinsics<0b000, 0b0110, "uhadd8", int_arm_uhadd8>;
+def t2UHSAX : T2I_pam_intrinsics<0b110, 0b0110, "uhsax", int_arm_uhsax>;
+def t2UHSUB16 : T2I_pam_intrinsics<0b101, 0b0110, "uhsub16", int_arm_uhsub16>;
+def t2UHSUB8 : T2I_pam_intrinsics<0b100, 0b0110, "uhsub8", int_arm_uhsub8>;
// Helper class for disassembly only
// A6.3.16 & A6.3.17
@@ -2220,96 +2273,94 @@ class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
// Unsigned Sum of Absolute Differences [and Accumulate].
def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm),
- NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
+ NoItinerary, "usad8", "\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, (int_arm_usad8 rGPR:$Rn, rGPR:$Rm))]>,
Requires<[IsThumb2, HasDSP]> {
let Inst{15-12} = 0b1111;
}
def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary,
- "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
+ "usada8", "\t$Rd, $Rn, $Rm, $Ra",
+ [(set rGPR:$Rd, (int_arm_usada8 rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>,
Requires<[IsThumb2, HasDSP]>;
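Since USAD8/USADA8 now carry patterns, their semantics in brief: a byte-wise sum of absolute differences, optionally accumulated. A hedged sketch (illustrative only):

    #include <cstdint>
    #include <cstdlib>

    uint32_t usad8(uint32_t Rn, uint32_t Rm) {     // int_arm_usad8
      uint32_t Sum = 0;
      for (int I = 0; I < 4; ++I)
        Sum += std::abs((int)((Rn >> (8 * I)) & 0xFF) -
                        (int)((Rm >> (8 * I)) & 0xFF));
      return Sum;
    }

    uint32_t usada8(uint32_t Rn, uint32_t Rm, uint32_t Ra) { // int_arm_usada8
      return Ra + usad8(Rn, Rm);
    }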
// Signed/Unsigned saturate.
-class T2SatI<dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
- : T2I<oops, iops, itin, opc, asm, pattern> {
+let hasSideEffects = 1 in
+class T2SatI<dag iops, string opc, string asm>
+ : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, []> {
bits<4> Rd;
bits<4> Rn;
bits<5> sat_imm;
- bits<7> sh;
+ bits<6> sh;
- let Inst{11-8} = Rd;
+ let Inst{31-24} = 0b11110011;
+ let Inst{21} = sh{5};
+ let Inst{20} = 0;
let Inst{19-16} = Rn;
- let Inst{4-0} = sat_imm;
- let Inst{21} = sh{5};
+ let Inst{15} = 0;
let Inst{14-12} = sh{4-2};
- let Inst{7-6} = sh{1-0};
+ let Inst{11-8} = Rd;
+ let Inst{7-6} = sh{1-0};
+ let Inst{5} = 0;
+ let Inst{4-0} = sat_imm;
}
-def t2SSAT: T2SatI<
- (outs rGPR:$Rd),
- (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
- NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
- Requires<[IsThumb2]> {
- let Inst{31-27} = 0b11110;
- let Inst{25-22} = 0b1100;
- let Inst{20} = 0;
- let Inst{15} = 0;
+def t2SSAT: T2SatI<(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ "ssat", "\t$Rd, $sat_imm, $Rn$sh">,
+ Requires<[IsThumb2]> {
+ let Inst{23-22} = 0b00;
let Inst{5} = 0;
}
-def t2SSAT16: T2SatI<
- (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
- "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
- Requires<[IsThumb2, HasDSP]> {
- let Inst{31-27} = 0b11110;
- let Inst{25-22} = 0b1100;
- let Inst{20} = 0;
- let Inst{15} = 0;
- let Inst{21} = 1; // sh = '1'
- let Inst{14-12} = 0b000; // imm3 = '000'
- let Inst{7-6} = 0b00; // imm2 = '00'
- let Inst{5-4} = 0b00;
+def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn),
+ "ssat16", "\t$Rd, $sat_imm, $Rn">,
+ Requires<[IsThumb2, HasDSP]> {
+ let Inst{23-22} = 0b00;
+ let sh = 0b100000;
+ let Inst{4} = 0;
}
-def t2USAT: T2SatI<
- (outs rGPR:$Rd),
- (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
- NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
- Requires<[IsThumb2]> {
- let Inst{31-27} = 0b11110;
- let Inst{25-22} = 0b1110;
- let Inst{20} = 0;
- let Inst{15} = 0;
+def t2USAT: T2SatI<(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ "usat", "\t$Rd, $sat_imm, $Rn$sh">,
+ Requires<[IsThumb2]> {
+ let Inst{23-22} = 0b10;
}
-def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
- NoItinerary,
- "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),
+ "usat16", "\t$Rd, $sat_imm, $Rn">,
Requires<[IsThumb2, HasDSP]> {
- let Inst{31-22} = 0b1111001110;
- let Inst{20} = 0;
- let Inst{15} = 0;
- let Inst{21} = 1; // sh = '1'
- let Inst{14-12} = 0b000; // imm3 = '000'
- let Inst{7-6} = 0b00; // imm2 = '00'
- let Inst{5-4} = 0b00;
+ let Inst{23-22} = 0b10;
+ let sh = 0b100000;
+ let Inst{4} = 0;
}
-def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
-def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>;
def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
(t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>;
+def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos),
+ (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos),
+ (t2USAT imm0_31:$pos, GPR:$a, 0)>;
+def : T2Pat<(int_arm_ssat16 GPR:$a, imm1_16:$pos),
+ (t2SSAT16 imm1_16:$pos, GPR:$a)>;
+def : T2Pat<(int_arm_usat16 GPR:$a, imm0_15:$pos),
+ (t2USAT16 imm0_15:$pos, GPR:$a)>;
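The saturation semantics targeted by these patterns, and the reason the rewritten T2SatI class is marked hasSideEffects = 1 (a saturating result sets the sticky Q flag): SSAT clamps to a signed n-bit range, USAT to an unsigned one. A hedged sketch (illustrative only; Q-flag updates omitted):

    #include <algorithm>
    #include <cstdint>

    int32_t ssat(int32_t X, unsigned N) {      // N in [1,32], as imm1_32
      int64_t Lo = -(INT64_C(1) << (N - 1));
      int64_t Hi =  (INT64_C(1) << (N - 1)) - 1;
      return (int32_t)std::min(std::max((int64_t)X, Lo), Hi);
    }

    uint32_t usat(int32_t X, unsigned N) {     // N in [0,31], as imm0_31
      int64_t Hi = (INT64_C(1) << N) - 1;
      return (uint32_t)std::min(std::max((int64_t)X, INT64_C(0)), Hi);
    }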
//===----------------------------------------------------------------------===//
// Shift and rotate Instructions.
//
-defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31, shl>;
+defm t2LSL : T2I_sh_ir<0b00, "lsl", imm1_31, shl>;
defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr, srl>;
defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr, sra>;
defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
+// LSL #0 is actually MOV, and has slightly different permitted registers to
+// LSL with non-zero shift
+def : t2InstAlias<"lsl${s}${p} $Rd, $Rm, #0",
+ (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"lsl${s}${p}.w $Rd, $Rm, #0",
+ (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+
// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
(t2RORrr rGPR:$lhs, rGPR:$rhs)>;
@@ -2547,7 +2598,8 @@ def : T2Pat<(t2_so_imm_not:$src),
let isCommutable = 1 in
def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
"mul", "\t$Rd, $Rn, $Rm",
- [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
+ [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2558,7 +2610,8 @@ def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
class T2FourRegMLA<bits<4> op7_4, string opc, list<dag> pattern>
: T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
- Requires<[IsThumb2, UseMulOps]> {
+ Requires<[IsThumb2, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2575,8 +2628,12 @@ def t2MLS: T2FourRegMLA<0b0001, "mls",
// Extra precision multiplies with low / high results
let hasSideEffects = 0 in {
let isCommutable = 1 in {
-def t2SMULL : T2MulLong<0b000, 0b0000, "smull", []>;
-def t2UMULL : T2MulLong<0b010, 0b0000, "umull", []>;
+def t2SMULL : T2MulLong<0b000, 0b0000, "smull",
+ [(set rGPR:$RdLo, rGPR:$RdHi,
+ (smullohi rGPR:$Rn, rGPR:$Rm))]>;
+def t2UMULL : T2MulLong<0b010, 0b0000, "umull",
+ [(set rGPR:$RdLo, rGPR:$RdHi,
+ (umullohi rGPR:$Rn, rGPR:$Rm))]>;
} // isCommutable
// Multiply + accumulate
@@ -2592,7 +2649,8 @@ class T2SMMUL<bits<4> op7_4, string opc, list<dag> pattern>
: T2ThreeReg<(outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
opc, "\t$Rd, $Rn, $Rm", pattern>,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2607,7 +2665,8 @@ class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
list<dag> pattern>
: T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
- Requires<[IsThumb2, HasDSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = op22_20;
@@ -2624,7 +2683,8 @@ class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
list<dag> pattern>
: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, opc,
"\t$Rd, $Rn, $Rm", pattern>,
- Requires<[IsThumb2, HasDSP]> {
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMUL16, ReadMUL, ReadMUL]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = op22_20;
@@ -2645,8 +2705,10 @@ def t2SMULTB : T2ThreeRegSMUL<0b001, 0b10, "smultb",
def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt",
[(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16))))]>;
-def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", []>;
-def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", []>;
+def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb",
+ [(set rGPR:$Rd, (ARMsmulwb rGPR:$Rn, rGPR:$Rm))]>;
+def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt",
+ [(set rGPR:$Rd, (ARMsmulwt rGPR:$Rn, rGPR:$Rm))]>;
def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn),
(t2SMULBB rGPR:$Rm, rGPR:$Rn)>;
@@ -2654,12 +2716,25 @@ def : Thumb2DSPPat<(mul sext_16_node:$Rn, (sra rGPR:$Rm, (i32 16))),
(t2SMULBT rGPR:$Rn, rGPR:$Rm)>;
def : Thumb2DSPPat<(mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm),
(t2SMULTB rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(int_arm_smulbb rGPR:$Rn, rGPR:$Rm),
+ (t2SMULBB rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(int_arm_smulbt rGPR:$Rn, rGPR:$Rm),
+ (t2SMULBT rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(int_arm_smultb rGPR:$Rn, rGPR:$Rm),
+ (t2SMULTB rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(int_arm_smultt rGPR:$Rn, rGPR:$Rm),
+ (t2SMULTT rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(int_arm_smulwb rGPR:$Rn, rGPR:$Rm),
+ (t2SMULWB rGPR:$Rn, rGPR:$Rm)>;
+def : Thumb2DSPPat<(int_arm_smulwt rGPR:$Rn, rGPR:$Rm),
+ (t2SMULWT rGPR:$Rn, rGPR:$Rm)>;
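SMULWB/SMULWT gained ARMsmulwb/ARMsmulwt patterns above; they compute a signed 32x16 product and keep the top 32 bits of the 48-bit result. A hedged sketch (illustrative only):

    #include <cstdint>

    int32_t smulwb(int32_t Rn, int32_t Rm) {   // bottom halfword of Rm
      return (int32_t)(((int64_t)Rn * (int16_t)Rm) >> 16);
    }
    int32_t smulwt(int32_t Rn, int32_t Rm) {   // top halfword of Rm
      return (int32_t)(((int64_t)Rn * (int16_t)(Rm >> 16)) >> 16);
    }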
class T2FourRegSMLA<bits<3> op22_20, bits<2> op5_4, string opc,
list<dag> pattern>
: T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMUL16,
opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
- Requires<[IsThumb2, HasDSP, UseMulOps]> {
+ Requires<[IsThumb2, HasDSP, UseMulOps]>,
+ Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = op22_20;
@@ -2680,8 +2755,10 @@ def t2SMLATB : T2FourRegSMLA<0b001, 0b10, "smlatb",
def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt",
[(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16)))))]>;
-def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", []>;
-def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", []>;
+def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwb rGPR:$Rn, rGPR:$Rm)))]>;
+def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwt rGPR:$Rn, rGPR:$Rm)))]>;
def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)),
(t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
@@ -2692,58 +2769,93 @@ def : Thumb2DSPMulPat<(add rGPR:$Ra,
(mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)),
(t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
-class T2SMLAL<bits<3> op22_20, bits<4> op7_4, string opc, list<dag> pattern>
- : T2FourReg_mac<1, op22_20, op7_4,
- (outs rGPR:$Ra, rGPR:$Rd),
- (ins rGPR:$Rn, rGPR:$Rm),
- IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasDSP]>;
+def : Thumb2DSPPat<(int_arm_smlabb GPR:$a, GPR:$b, GPR:$acc),
+ (t2SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : Thumb2DSPPat<(int_arm_smlabt GPR:$a, GPR:$b, GPR:$acc),
+ (t2SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : Thumb2DSPPat<(int_arm_smlatb GPR:$a, GPR:$b, GPR:$acc),
+ (t2SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : Thumb2DSPPat<(int_arm_smlatt GPR:$a, GPR:$b, GPR:$acc),
+ (t2SMLATT GPR:$a, GPR:$b, GPR:$acc)>;
+def : Thumb2DSPPat<(int_arm_smlawb GPR:$a, GPR:$b, GPR:$acc),
+ (t2SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+def : Thumb2DSPPat<(int_arm_smlawt GPR:$a, GPR:$b, GPR:$acc),
+ (t2SMLAWT GPR:$a, GPR:$b, GPR:$acc)>;
// Halfword multiple accumulate long: SMLAL<x><y>
-def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>;
-def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>;
-def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>;
-def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>;
-
-class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc>
+def t2SMLALBB : T2MlaLong<0b100, 0b1000, "smlalbb">,
+ Requires<[IsThumb2, HasDSP]>;
+def t2SMLALBT : T2MlaLong<0b100, 0b1001, "smlalbt">,
+ Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTB : T2MlaLong<0b100, 0b1010, "smlaltb">,
+ Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTT : T2MlaLong<0b100, 0b1011, "smlaltt">,
+ Requires<[IsThumb2, HasDSP]>;
+
+def : Thumb2DSPPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALBB $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALBT $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALTB $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+ (t2SMLALTT $Rn, $Rm, $RLo, $RHi)>;
+
+class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc,
+ Intrinsic intrinsic>
: T2ThreeReg_mac<0, op22_20, op7_4,
(outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm),
- IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasDSP]> {
+ IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, (intrinsic rGPR:$Rn, rGPR:$Rm))]>,
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
let Inst{15-12} = 0b1111;
}
// Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
-def t2SMUAD: T2DualHalfMul<0b010, 0b0000, "smuad">;
-def t2SMUADX: T2DualHalfMul<0b010, 0b0001, "smuadx">;
-def t2SMUSD: T2DualHalfMul<0b100, 0b0000, "smusd">;
-def t2SMUSDX: T2DualHalfMul<0b100, 0b0001, "smusdx">;
+def t2SMUAD: T2DualHalfMul<0b010, 0b0000, "smuad", int_arm_smuad>;
+def t2SMUADX: T2DualHalfMul<0b010, 0b0001, "smuadx", int_arm_smuadx>;
+def t2SMUSD: T2DualHalfMul<0b100, 0b0000, "smusd", int_arm_smusd>;
+def t2SMUSDX: T2DualHalfMul<0b100, 0b0001, "smusdx", int_arm_smusdx>;
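The dual-halfword multiplies now map directly to intrinsics; the arithmetic being selected is a pair of 16x16 products that are added (SMUAD) or subtracted (SMUSD), with the X forms swapping the halfwords of the second operand. A hedged sketch (illustrative only; SMUAD can also set Q on overflow, which the sketch ignores):

    #include <cstdint>

    int32_t smuad(uint32_t Rn, uint32_t Rm) {
      int64_t P = (int64_t)(int16_t)Rn * (int16_t)Rm +
                  (int64_t)(int16_t)(Rn >> 16) * (int16_t)(Rm >> 16);
      return (int32_t)P;  // wraps where hardware would set Q
    }
    int32_t smusd(uint32_t Rn, uint32_t Rm) {
      int64_t P = (int64_t)(int16_t)Rn * (int16_t)Rm -
                  (int64_t)(int16_t)(Rn >> 16) * (int16_t)(Rm >> 16);
      return (int32_t)P;
    }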
-class T2DualHalfMulAdd<bits<3> op22_20, bits<4> op7_4, string opc>
+class T2DualHalfMulAdd<bits<3> op22_20, bits<4> op7_4, string opc,
+ Intrinsic intrinsic>
: T2FourReg_mac<0, op22_20, op7_4,
(outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra),
- IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm, $Ra", []>,
+ IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm, $Ra",
+ [(set rGPR:$Rd, (intrinsic rGPR:$Rn, rGPR:$Rm, rGPR:$Ra))]>,
Requires<[IsThumb2, HasDSP]>;
-def t2SMLAD : T2DualHalfMulAdd<0b010, 0b0000, "smlad">;
-def t2SMLADX : T2DualHalfMulAdd<0b010, 0b0001, "smladx">;
-def t2SMLSD : T2DualHalfMulAdd<0b100, 0b0000, "smlsd">;
-def t2SMLSDX : T2DualHalfMulAdd<0b100, 0b0001, "smlsdx">;
+def t2SMLAD : T2DualHalfMulAdd<0b010, 0b0000, "smlad", int_arm_smlad>;
+def t2SMLADX : T2DualHalfMulAdd<0b010, 0b0001, "smladx", int_arm_smladx>;
+def t2SMLSD : T2DualHalfMulAdd<0b100, 0b0000, "smlsd", int_arm_smlsd>;
+def t2SMLSDX : T2DualHalfMulAdd<0b100, 0b0001, "smlsdx", int_arm_smlsdx>;
class T2DualHalfMulAddLong<bits<3> op22_20, bits<4> op7_4, string opc>
: T2FourReg_mac<1, op22_20, op7_4,
(outs rGPR:$Ra, rGPR:$Rd),
- (ins rGPR:$Rn, rGPR:$Rm),
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi),
IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
- Requires<[IsThumb2, HasDSP]>;
+ RegConstraint<"$Ra = $RLo, $Rd = $RHi">,
+ Requires<[IsThumb2, HasDSP]>,
+ Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
def t2SMLALD : T2DualHalfMulAddLong<0b100, 0b1100, "smlald">;
def t2SMLALDX : T2DualHalfMulAddLong<0b100, 0b1101, "smlaldx">;
def t2SMLSLD : T2DualHalfMulAddLong<0b101, 0b1100, "smlsld">;
def t2SMLSLDX : T2DualHalfMulAddLong<0b101, 0b1101, "smlsldx">;
+def : Thumb2DSPPat<(ARMSmlald rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi),
+ (t2SMLALD rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi)>;
+def : Thumb2DSPPat<(ARMSmlaldx rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi),
+ (t2SMLALDX rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi)>;
+def : Thumb2DSPPat<(ARMSmlsld rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi),
+ (t2SMLSLD rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi)>;
+def : Thumb2DSPPat<(ARMSmlsldx rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi),
+ (t2SMLSLDX rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi)>;
+
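The RegConstraint added to T2DualHalfMulAddLong ties the 64-bit accumulator through the instruction, matching the ARMSmlald-family patterns above. The underlying arithmetic, as a hedged sketch (illustrative only):

    #include <cstdint>

    // SMLALD: dual 16x16 products accumulated into a 64-bit RdHi:RdLo pair.
    uint64_t smlald(uint32_t Rn, uint32_t Rm, uint32_t RLo, uint32_t RHi) {
      int64_t Acc = (int64_t)(((uint64_t)RHi << 32) | RLo);
      Acc += (int16_t)Rn * (int16_t)Rm;
      Acc += (int16_t)(Rn >> 16) * (int16_t)(Rm >> 16);
      return (uint64_t)Acc;
    }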
//===----------------------------------------------------------------------===//
// Division Instructions.
// Signed and unsigned division on v7-M
@@ -2751,7 +2863,8 @@ def t2SMLSLDX : T2DualHalfMulAddLong<0b101, 0b1101, "smlsldx">;
def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+ Requires<[HasDivideInThumb, IsThumb, HasV8MBaseline]>,
+ Sched<[WriteDIV]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011100;
let Inst{20} = 0b1;
@@ -2762,7 +2875,8 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+ Requires<[HasDivideInThumb, IsThumb, HasV8MBaseline]>,
+ Sched<[WriteDIV]> {
let Inst{31-27} = 0b11111;
let Inst{26-21} = 0b011101;
let Inst{20} = 0b1;
@@ -2819,7 +2933,7 @@ def t2PKHBT : T2ThreeReg<
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
(and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
0xFFFF0000)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]>,
+ Requires<[HasDSP, IsThumb2]>,
Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -2835,10 +2949,10 @@ def t2PKHBT : T2ThreeReg<
// Alternate cases for PKHBT where identities eliminate some nodes.
def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
(t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
(t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
// will match the pattern below.
@@ -2848,7 +2962,7 @@ def t2PKHTB : T2ThreeReg<
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
(and (sra rGPR:$Rm, pkh_asr_amt:$sh),
0xFFFF)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]>,
+ Requires<[HasDSP, IsThumb2]>,
Sched<[WriteALUsi, ReadALU]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -2867,14 +2981,14 @@ def t2PKHTB : T2ThreeReg<
// pkhtb src1, src2, asr (17..31).
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
(t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
(t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
(and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
(t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
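For the PKH patterns above (now gated on HasDSP, which replaces HasT2ExtractPack throughout this diff), the packed-halfword semantics being matched are, as a hedged sketch (illustrative only):

    #include <cstdint>

    // PKHBT: bottom halfword of Rn, top halfword of (Rm << Lsl).
    uint32_t pkhbt(uint32_t Rn, uint32_t Rm, unsigned Lsl) {
      return (Rn & 0x0000FFFF) | ((Rm << Lsl) & 0xFFFF0000);
    }
    // PKHTB: top halfword of Rn, bottom halfword of (Rm asr Asr).
    uint32_t pkhtb(uint32_t Rn, uint32_t Rm, unsigned Asr) {
      return (Rn & 0xFFFF0000) | (((int32_t)Rm >> Asr) & 0x0000FFFF);
    }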
//===----------------------------------------------------------------------===//
// CRC32 Instructions
@@ -3380,7 +3494,8 @@ def t2B : T2I<(outs), (ins thumb_br_target:$target), IIC_Br,
let AsmMatchConverter = "cvtThumbBranches";
}
-let Size = 4, isNotDuplicable = 1, isIndirectBranch = 1 in {
+let Size = 4, isNotDuplicable = 1, isBranch = 1, isTerminator = 1,
+ isBarrier = 1, isIndirectBranch = 1 in {
// available in both v8-M.Baseline and Thumb2 targets
def t2BR_JT : t2basePseudoInst<(outs),
@@ -4216,13 +4331,13 @@ def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
Requires<[IsThumb2]>;
def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
(t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
(t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
}
def : T2Pat<(sext_inreg rGPR:$Src, i8), (t2SXTB rGPR:$Src, 0)>,
@@ -4231,10 +4346,10 @@ def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
Requires<[IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
(t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
(t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
// Atomic load/store patterns
def : T2Pat<(atomic_load_8 t2addrmode_imm12:$addr),
@@ -4325,26 +4440,26 @@ def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
pred:$p, cc_out:$s)>;
// add w/ negative immediates is just a sub.
-def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${p} $Rd, $Rn, $imm",
(t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+def : t2InstSubst<"add${s}${p} $Rdn, $imm",
(t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"add${p} $Rdn, $imm",
+def : t2InstSubst<"add${p} $Rdn, $imm",
(t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p}.w $Rd, $Rn, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"addw${p} $Rd, $Rn, $imm",
(t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+def : t2InstSubst<"add${s}${p}.w $Rdn, $imm",
(t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
cc_out:$s)>;
-def : t2InstAlias<"addw${p} $Rdn, $imm",
+def : t2InstSubst<"addw${p} $Rdn, $imm",
(t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
@@ -4431,10 +4546,10 @@ def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
// input operands swapped when the shift amount is zero (i.e., unspecified).
def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
(t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
(t2PKHBT rGPR:$Rd, rGPR:$Rm, rGPR:$Rn, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
// PUSH/POP aliases for STM/LDM
def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
@@ -4513,16 +4628,16 @@ def : t2InstAlias<"strh${p} $Rt, $addr",
// Extend instruction optional rotate operand.
def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
(t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
(t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
(t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"sxtb16${p} $Rd, $Rm",
(t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"sxtb${p} $Rd, $Rm",
(t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
@@ -4535,16 +4650,16 @@ def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
(t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
(t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
(t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : InstAlias<"uxtb16${p} $Rd, $Rm",
(t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"uxtb${p} $Rd, $Rm",
(t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
@@ -4560,7 +4675,7 @@ def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
(t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
(t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
(t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
@@ -4568,41 +4683,54 @@ def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
(t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
(t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
- Requires<[HasT2ExtractPack, IsThumb2]>;
+ Requires<[HasDSP, IsThumb2]>;
def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
(t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
// "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
// for isel.
-def : t2InstAlias<"mov${p} $Rd, $imm",
+def : t2InstSubst<"mov${p} $Rd, $imm",
(t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
-def : t2InstAlias<"mvn${p} $Rd, $imm",
- (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+def : t2InstSubst<"mvn${s}${p} $Rd, $imm",
+ (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
// Same for AND <--> BIC
-def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"bic${s}${p} $Rd, $Rn, $imm",
(t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
+def : t2InstSubst<"bic${s}${p} $Rdn, $imm",
(t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"and${s}${p} $Rd, $Rn, $imm",
(t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"and${s}${p} $Rdn, $imm",
+def : t2InstSubst<"and${s}${p} $Rdn, $imm",
(t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
pred:$p, cc_out:$s)>;
+// And ORR <--> ORN
+def : t2InstSubst<"orn${s}${p} $Rd, $Rn, $imm",
+ (t2ORRri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"orn${s}${p} $Rdn, $imm",
+ (t2ORRri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"orr${s}${p} $Rd, $Rn, $imm",
+ (t2ORNri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
+def : t2InstSubst<"orr${s}${p} $Rdn, $imm",
+ (t2ORNri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
+ pred:$p, cc_out:$s)>;
// Likewise, "add Rd, t2_so_imm_neg" -> sub
-def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
pred:$p, cc_out:$s)>;
-def : t2InstAlias<"add${s}${p} $Rd, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $imm",
(t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
pred:$p, cc_out:$s)>;
// Same for CMP <--> CMN via t2_so_imm_neg
-def : t2InstAlias<"cmp${p} $Rd, $imm",
+def : t2InstSubst<"cmp${p} $Rd, $imm",
(t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
-def : t2InstAlias<"cmn${p} $Rd, $imm",
+def : t2InstSubst<"cmn${p} $Rd, $imm",
(t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
@@ -4616,6 +4744,8 @@ def : t2InstAlias<"neg${s}${p} $Rd, $Rm",
// MOV so_reg assembler pseudos. InstAlias isn't expressive enough for
// these, unfortunately.
+// FIXME: LSL #0 in the shift should allow SP to be used as either the
+// source or destination (but not both).
def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift",
(ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift",
@@ -4626,6 +4756,16 @@ def t2MOVsr: t2AsmPseudo<"mov${p} $Rd, $shift",
def t2MOVSsr: t2AsmPseudo<"movs${p} $Rd, $shift",
(ins rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+// Aliases for the above with the .w qualifier
+def : t2InstAlias<"mov${p}.w $Rd, $shift",
+ (t2MOVsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def : t2InstAlias<"movs${p}.w $Rd, $shift",
+ (t2MOVSsi rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
+def : t2InstAlias<"mov${p}.w $Rd, $shift",
+ (t2MOVsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+def : t2InstAlias<"movs${p}.w $Rd, $shift",
+ (t2MOVSsr rGPR:$Rd, so_reg_reg:$shift, pred:$p)>;
+
// ADR w/o the .w suffix
def : t2InstAlias<"adr${p} $Rd, $addr",
(t2ADR rGPR:$Rd, t2adrlabel:$addr, pred:$p)>;
@@ -4659,7 +4799,7 @@ def : t2InstAlias<"add${p} $Rd, pc, $imm",
// Pseudo instruction ldr Rt, =immediate
def t2LDRConstPool
: t2AsmPseudo<"ldr${p} $Rt, $immediate",
- (ins GPRnopc:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
+ (ins GPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
// Version w/ the .w suffix.
def : t2InstAlias<"ldr${p}.w $Rt, $immediate",
(t2LDRConstPool GPRnopc:$Rt,
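
The "ldr Rt, =immediate" pseudo leaves the assembler free to choose how the constant is materialized; widening the operand class from GPRnopc to GPR simply admits more destination registers. A hedged sketch of the usual expansion strategy (editorial; encodable() and the printf stand-ins are hypothetical, not LLVM API):

    #include <cstdint>
    #include <cstdio>

    static bool encodable(uint32_t v) { return v <= 0xFF; } // deliberately crude

    static void expandLdrConst(const char *rt, uint32_t imm) {
      if (encodable(imm))
        printf("mov  %s, #%u\n", rt, imm);              // single move
      else if (encodable(~imm))
        printf("mvn  %s, #%u\n", rt, ~imm);             // complemented move
      else
        printf("ldr  %s, [pc, #<pool-offset>]\n", rt);  // literal-pool load
    }

    int main() { expandLdrConst("r0", 0xDEADBEEFu); }
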
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
index e990486..5d887c4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -11,14 +11,17 @@
//
//===----------------------------------------------------------------------===//
-def SDT_CMPFP0 : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_CMPFP0 : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>;
def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
SDTCisSameAs<1, 2>]>;
+def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, f64>]>;
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
-def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMFCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
//===----------------------------------------------------------------------===//
// Operand Definitions.
@@ -336,13 +339,15 @@ let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VADDD : ADbI<0b11100, 0b11, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vadd", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fadd DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPALU64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDS : ASbIn<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vadd", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]> {
+ [(set SPR:$Sd, (fadd SPR:$Sn, SPR:$Sm))]>,
+ Sched<[WriteFPALU32]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -352,19 +357,22 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VSUBD : ADbI<0b11100, 0b11, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpALU64, "vsub", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fsub DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPALU64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU32, "vsub", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]> {
+ [(set SPR:$Sd, (fsub SPR:$Sn, SPR:$Sm))]>,
+                 Sched<[WriteFPALU32]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -374,37 +382,43 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VDIVD : ADbI<0b11101, 0b00, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpDIV64, "vdiv", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fdiv DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPDIV64]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVS : ASbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpDIV32, "vdiv", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>;
+ [(set SPR:$Sd, (fdiv SPR:$Sn, SPR:$Sm))]>,
+ Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
def VMULD : ADbI<0b11100, 0b10, 0, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vmul", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fmul DPR:$Dn, (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULS : ASbIn<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vmul", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]> {
+ [(set SPR:$Sd, (fmul SPR:$Sn, SPR:$Sm))]>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -414,17 +428,20 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
IIC_fpMUL64, "vnmul", ".f64\t$Dd, $Dn, $Dm",
- [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>;
+ [(set DPR:$Dd, (fneg (fmul DPR:$Dn, (f64 DPR:$Dm))))]>,
+ Sched<[WriteFPMUL64, ReadFPMUL, ReadFPMUL]>;
def VNMULS : ASbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL32, "vnmul", ".f32\t$Sd, $Sn, $Sm",
- [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]> {
+ [(set SPR:$Sd, (fneg (fmul SPR:$Sn, SPR:$Sm)))]>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -433,7 +450,8 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
- []>;
+ []>,
+ Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
@@ -501,12 +519,12 @@ let Defs = [FPSCR_NZCV] in {
def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
- [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+ [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>;
def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
- [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+ [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -517,17 +535,15 @@ def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
[]>;
-
-// FIXME: Verify encoding after integrated assembler is working.
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
- [/* For disassembly only; pattern left blank */]>;
+ [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>;
def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
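
The new i32 operand on arm_cmpfp/arm_cmpfp0 selects the compare flavour: (i32 1) produces VCMPE, which raises Invalid Operation for any NaN input, while (i32 0) produces the quiet VCMP, which signals only for signaling NaNs. Both deliver the same ordering result, as this small model shows (editorial sketch):

    #include <cassert>
    #include <cmath>

    int main() {
      double q = std::nan("");   // quiet NaN
      // Either compare flavour reports "unordered" for a NaN operand; they
      // differ only in whether the Invalid Operation exception is raised.
      assert(!(q < 1.0) && !(q > 1.0) && !(q == 1.0));
    }
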
@@ -566,7 +582,7 @@ let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
- [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+ [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -574,7 +590,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
- [(arm_cmpfp0 SPR:$Sd)]> {
+ [(arm_cmpfp0 SPR:$Sd, (i32 1))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -591,11 +607,10 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
let Inst{5} = 0;
}
-// FIXME: Verify encoding after integrated assembler is working.
def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -603,7 +618,7 @@ def VCMPZD : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfp0 SPR:$Sd, (i32 0))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -624,7 +639,8 @@ def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTDS, "vcvt", ".f64.f32\t$Dd, $Sm",
- [(set DPR:$Dd, (fpextend SPR:$Sm))]> {
+ [(set DPR:$Dd, (fpextend SPR:$Sm))]>,
+ Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Dd;
bits<5> Sm;
@@ -641,7 +657,8 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
// Special case encoding: bits 11-8 is 0b1011.
def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
IIC_fpCVTSD, "vcvt", ".f32.f64\t$Sd, $Dm",
- [(set SPR:$Sd, (fpround DPR:$Dm))]> {
+ [(set SPR:$Sd, (fpround DPR:$Dm))]>,
+ Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Sd;
bits<5> Dm;
@@ -663,31 +680,35 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
// Between half, single and double-precision. For disassembly only.
-// FIXME: Verify encoding after integrated assembler is working.
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
- Requires<[HasFP16]>;
+ Requires<[HasFP16]>,
+ Sched<[WriteFPCVT]>;
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
- []>, Requires<[HasFPARMv8, HasDPVFP]> {
+ []>, Requires<[HasFPARMv8, HasDPVFP]>,
+ Sched<[WriteFPCVT]> {
// Instruction operands.
bits<5> Sm;
@@ -946,12 +967,14 @@ defm VRINTM : vrint_inst_anpm<"m", 0b11, ffloor>;
def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs DPR:$Dd), (ins DPR:$Dm),
IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
- [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>;
+ [(set DPR:$Dd, (fsqrt (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPSQRT64]>;
def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpSQRT32, "vsqrt", ".f32\t$Sd, $Sm",
- [(set SPR:$Sd, (fsqrt SPR:$Sm))]>;
+ [(set SPR:$Sd, (fsqrt SPR:$Sm))]>,
+ Sched<[WriteFPSQRT32]>;
def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
@@ -987,7 +1010,8 @@ def VINSH : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
def VMOVRS : AVConv2I<0b11100001, 0b1010,
(outs GPR:$Rt), (ins SPR:$Sn),
IIC_fpMOVSI, "vmov", "\t$Rt, $Sn",
- [(set GPR:$Rt, (bitconvert SPR:$Sn))]> {
+ [(set GPR:$Rt, (bitconvert SPR:$Sn))]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<4> Rt;
bits<5> Sn;
@@ -1010,7 +1034,8 @@ def VMOVSR : AVConv4I<0b11100000, 0b1010,
(outs SPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
[(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
- Requires<[HasVFP2, UseVMOVSR]> {
+ Requires<[HasVFP2, UseVMOVSR]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Sn;
bits<4> Rt;
@@ -1032,7 +1057,8 @@ let hasSideEffects = 0 in {
def VMOVRRD : AVConv3I<0b11000101, 0b1011,
(outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
- [/* FIXME: Can't write pattern for multiple result instr*/]> {
+ [(set GPR:$Rt, GPR:$Rt2, (arm_fmrrd DPR:$Dm))]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Dm;
bits<4> Rt;
@@ -1059,7 +1085,8 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011,
def VMOVRRS : AVConv3I<0b11000101, 0b1010,
(outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
- [/* For disassembly only; pattern left blank */]> {
+ [/* For disassembly only; pattern left blank */]>,
+ Sched<[WriteFPMOV]> {
bits<5> src1;
bits<4> Rt;
bits<4> Rt2;
@@ -1085,7 +1112,8 @@ def VMOVRRS : AVConv3I<0b11000101, 0b1010,
def VMOVDRR : AVConv5I<0b11000100, 0b1011,
(outs DPR:$Dm), (ins GPR:$Rt, GPR:$Rt2),
IIC_fpMOVID, "vmov", "\t$Dm, $Rt, $Rt2",
- [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]> {
+ [(set DPR:$Dm, (arm_fmdrr GPR:$Rt, GPR:$Rt2))]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Dm;
bits<4> Rt;
@@ -1128,7 +1156,8 @@ let hasSideEffects = 0 in
def VMOVSRR : AVConv5I<0b11000100, 0b1010,
(outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
- [/* For disassembly only; pattern left blank */]> {
+ [/* For disassembly only; pattern left blank */]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> dst1;
bits<4> src1;
@@ -1154,7 +1183,8 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001,
(outs GPR:$Rt), (ins SPR:$Sn),
IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
[]>,
- Requires<[HasFullFP16]> {
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<4> Rt;
bits<5> Sn;
@@ -1173,7 +1203,8 @@ def VMOVHR : AVConv4I<0b11100000, 0b1001,
(outs SPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
[]>,
- Requires<[HasFullFP16]> {
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPMOV]> {
// Instruction operands.
bits<5> Sn;
bits<4> Rt;
@@ -1254,7 +1285,8 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.s32\t$Dd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}
@@ -1269,7 +1301,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
(outs SPR:$Sd),(ins SPR:$Sm),
IIC_fpCVTIS, "vcvt", ".f32.s32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
// Some single precision VFP instructions may be executed on both NEON and
@@ -1286,14 +1319,16 @@ def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}
@@ -1308,7 +1343,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIS, "vcvt", ".f32.u32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
// Some single precision VFP instructions may be executed on both NEON and
@@ -1325,7 +1361,8 @@ def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}
@@ -1390,7 +1427,8 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
def VTOSIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".s32.f64\t$Sd, $Dm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
@@ -1405,7 +1443,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvt", ".s32.f32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
// Some single precision VFP instructions may be executed on both NEON and
@@ -1423,14 +1462,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
@@ -1445,7 +1486,8 @@ let Predicates=[HasVFP2, HasDPVFP] in {
def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvt", ".u32.f32\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
// Some single precision VFP instructions may be executed on both NEON and
@@ -1463,52 +1505,58 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
-// FIXME: Verify encoding after integrated assembler is working.
def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
- [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>{
+ [(set SPR:$Sd, (int_arm_vcvtr (f64 DPR:$Dm)))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOSIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvtr", ".s32.f32\t$Sd, $Sm",
- [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]> {
+ [(set SPR:$Sd, (int_arm_vcvtr SPR:$Sm))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOSIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvtr", ".s32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOUIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvtr", ".u32.f64\t$Sd, $Dm",
- [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>{
+ [(set SPR:$Sd, (int_arm_vcvtru(f64 DPR:$Dm)))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOUIRS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTSI, "vcvtr", ".u32.f32\t$Sd, $Sm",
- [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]> {
+ [(set SPR:$Sd, (int_arm_vcvtru SPR:$Sm))]>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
def VTOUIRH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTHI, "vcvtr", ".u32.f16\t$Sd, $Sm",
- []> {
+ []>,
+ Sched<[WriteFPCVT]> {
let Inst{7} = 0; // Z bit
}
}
@@ -1528,8 +1576,7 @@ let Constraints = "$a = $dst" in {
class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
bit op5, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
- Sched<[WriteCvtFP]> {
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{0};
@@ -1540,8 +1587,7 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
bit op5, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
- : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
- Sched<[WriteCvtFP]> {
+ : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
bits<5> dst;
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{4};
@@ -1553,26 +1599,31 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
def VTOSHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTHI, "vcvt", ".s16.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOUHH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTHI, "vcvt", ".u16.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOSLH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOULH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTSI, "vcvt", ".s16.f32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1604,45 +1655,54 @@ def VTOULS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1111, 0b1010, 1,
def VTOSHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".s16.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VTOUHD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".u16.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VTOSLD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1110, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VTOULD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1111, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>;
+ IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
// Fixed-Point to FP:
def VSHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.s16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VUHTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.u16\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VSLTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VULTOH : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1001, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$dst, $a, $fbits", []>,
- Requires<[HasFullFP16]>;
+ Requires<[HasFullFP16]>,
+ Sched<[WriteFPCVT]>;
def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.s16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1650,7 +1710,8 @@ def VSHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 0,
def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
(outs SPR:$dst), (ins SPR:$a, fbits16:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.u16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1658,7 +1719,8 @@ def VUHTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 0,
def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1666,7 +1728,8 @@ def VSLTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1010, 0b1010, 1,
def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
(outs SPR:$dst), (ins SPR:$a, fbits32:$fbits),
- IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []> {
+ IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1674,19 +1737,23 @@ def VULTOS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1011, 0b1010, 1,
def VSHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.s16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VUHTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 0,
(outs DPR:$dst), (ins DPR:$a, fbits16:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.u16\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VSLTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1010, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
(outs DPR:$dst), (ins DPR:$a, fbits32:$fbits),
- IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>;
+ IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a, $fbits", []>,
+ Sched<[WriteFPCVT]>;
} // End of 'let Constraints = "$a = $dst" in'
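
All of these fixed-point forms share one semantic: with $fbits fraction bits the integer register holds value * 2^fbits, and the float-to-fixed direction truncates toward zero. A reference model (editorial sketch, not the backend code):

    #include <cassert>
    #include <cstdint>

    static float fixedToFloat(int32_t raw, unsigned fbits) {
      return (float)raw / (float)(1u << fbits);    // e.g. vcvt.f32.s16
    }
    static int32_t floatToFixed(float v, unsigned fbits) {
      return (int32_t)(v * (float)(1u << fbits));  // e.g. vcvt.s16.f32
    }

    int main() {
      assert(fixedToFloat(0x28, 4) == 2.5f);       // 40 / 16
      assert(floatToFixed(2.5f, 4) == 0x28);
    }
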
@@ -1700,7 +1767,8 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1708,7 +1776,8 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
[(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1734,7 +1803,8 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1742,7 +1812,8 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
[(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1768,7 +1839,8 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1776,7 +1848,8 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
[(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1802,14 +1875,16 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
+ Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]> {
+ Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -1838,7 +1913,8 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0,
[(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1846,7 +1922,8 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
[(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
@@ -1856,7 +1933,8 @@ def VFMAH : AHbI<0b11101, 0b10, 0, 0,
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFusedMAC]>;
+ Requires<[HasFullFP16,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
@@ -1880,7 +1958,8 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0,
[(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1888,7 +1967,8 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
[(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
@@ -1898,7 +1978,8 @@ def VFMSH : AHbI<0b11101, 0b10, 1, 0,
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFusedMAC]>;
+ Requires<[HasFullFP16,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
(VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
@@ -1929,7 +2010,8 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
[(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1937,7 +2019,8 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
[(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
@@ -1947,7 +2030,8 @@ def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFusedMAC]>;
+ Requires<[HasFullFP16,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
(VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
@@ -1978,14 +2062,16 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
[(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
(f64 DPR:$Ddin)))]>,
RegConstraint<"$Ddin = $Dd">,
- Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
+ Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC64, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm",
[(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]> {
+ Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines.
}
@@ -1995,7 +2081,8 @@ def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
[]>,
RegConstraint<"$Sdin = $Sd">,
- Requires<[HasFullFP16,UseFusedMAC]>;
+ Requires<[HasFullFP16,UseFusedMAC]>,
+ Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
(VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index 2bdbe4f..faed6b8 100644
--- a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -11,25 +11,112 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#include "ARMInstructionSelector.h"
#include "ARMRegisterBankInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "arm-isel"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
#error "You shouldn't build this"
#endif
-ARMInstructionSelector::ARMInstructionSelector(const ARMSubtarget &STI,
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "ARMGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class ARMInstructionSelector : public InstructionSelector {
+public:
+ ARMInstructionSelector(const ARMBaseTargetMachine &TM, const ARMSubtarget &STI,
+ const ARMRegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+
+private:
+ bool selectImpl(MachineInstr &I) const;
+
+ struct CmpConstants;
+ struct InsertInfo;
+
+ bool selectCmp(CmpConstants Helper, MachineInstrBuilder &MIB,
+ MachineRegisterInfo &MRI) const;
+
+  // Helper for inserting a comparison sequence that sets \p ResReg to 1 if
+  // \p LHSReg and \p RHSReg are in the relationship defined by \p Cond, or to
+  // \p PrevRes otherwise. In essence, it computes PrevRes OR (LHS Cond RHS).
+ bool insertComparison(CmpConstants Helper, InsertInfo I, unsigned ResReg,
+ ARMCC::CondCodes Cond, unsigned LHSReg, unsigned RHSReg,
+ unsigned PrevRes) const;
+
+ // Set \p DestReg to \p Constant.
+ void putConstant(InsertInfo I, unsigned DestReg, unsigned Constant) const;
+
+ bool selectSelect(MachineInstrBuilder &MIB, MachineRegisterInfo &MRI) const;
+
+ // Check if the types match and both operands have the expected size and
+ // register bank.
+ bool validOpRegPair(MachineRegisterInfo &MRI, unsigned LHS, unsigned RHS,
+ unsigned ExpectedSize, unsigned ExpectedRegBankID) const;
+
+ // Check if the register has the expected size and register bank.
+ bool validReg(MachineRegisterInfo &MRI, unsigned Reg, unsigned ExpectedSize,
+ unsigned ExpectedRegBankID) const;
+
+ const ARMBaseInstrInfo &TII;
+ const ARMBaseRegisterInfo &TRI;
+ const ARMBaseTargetMachine &TM;
+ const ARMRegisterBankInfo &RBI;
+ const ARMSubtarget &STI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "ARMGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+// We declare the temporaries used by selectImpl() in the class to minimize the
+// cost of constructing placeholder values.
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "ARMGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+} // end anonymous namespace
+
+namespace llvm {
+InstructionSelector *
+createARMInstructionSelector(const ARMBaseTargetMachine &TM,
+ const ARMSubtarget &STI,
+ const ARMRegisterBankInfo &RBI) {
+ return new ARMInstructionSelector(TM, STI, RBI);
+}
+}
+
+unsigned zero_reg = 0;
+
+#define GET_GLOBALISEL_IMPL
+#include "ARMGenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM,
+ const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI)
: InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+ TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "ARMGenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "ARMGenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
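
The constructor now splices in TableGen-produced predicate and temporary initializers, and selectImpl() is the generated matcher. A common shape for such a selector, paraphrased with stand-in types so the sketch compiles (editorial; not the exact body of ARMInstructionSelector::select):

    #include <cstdio>

    struct MachineInstr { int Opcode; };             // stand-in type

    static bool selectImpl(MachineInstr &I) {        // stub for the
      return I.Opcode == 1;                          // generated matcher
    }

    static bool select(MachineInstr &I) {
      if (selectImpl(I))      // try the TableGen-emitted patterns first
        return true;
      switch (I.Opcode) {     // hand-written fallbacks (merge/unmerge, cmp...)
      case 2: return true;
      default: return false;
      }
    }

    int main() {
      MachineInstr a{1}, b{3};
      printf("%d %d\n", select(a), select(b));       // 1 0
    }
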
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
@@ -43,20 +130,21 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
assert(RegBank && "Can't get reg bank for virtual register");
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- (void)DstSize;
- unsigned SrcReg = I.getOperand(1).getReg();
- const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
- (void)SrcSize;
- assert((DstSize == SrcSize ||
- // Copies are a means to setup initial types, the number of
- // bits may not exactly match.
- (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
- DstSize <= SrcSize)) &&
- "Copy with different width?!");
-
- assert(RegBank->getID() == ARM::GPRRegBankID && "Unsupported reg bank");
+ assert((RegBank->getID() == ARM::GPRRegBankID ||
+ RegBank->getID() == ARM::FPRRegBankID) &&
+ "Unsupported reg bank");
+
const TargetRegisterClass *RC = &ARM::GPRRegClass;
+ if (RegBank->getID() == ARM::FPRRegBankID) {
+ if (DstSize == 32)
+ RC = &ARM::SPRRegClass;
+ else if (DstSize == 64)
+ RC = &ARM::DPRRegClass;
+ else
+ llvm_unreachable("Unsupported destination size");
+ }
+
// No need to constrain SrcReg. It will get constrained when
// we hit another of its uses or its defs.
// Copies do not have constraints.
@@ -68,6 +156,375 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
return true;
}
+static bool selectMergeValues(MachineInstrBuilder &MIB,
+ const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ assert(TII.getSubtarget().hasVFP2() && "Can't select merge without VFP");
+
+ // We only support G_MERGE_VALUES as a way to stick together two scalar GPRs
+ // into one DPR.
+ unsigned VReg0 = MIB->getOperand(0).getReg();
+ (void)VReg0;
+ assert(MRI.getType(VReg0).getSizeInBits() == 64 &&
+ RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID &&
+ "Unsupported operand for G_MERGE_VALUES");
+ unsigned VReg1 = MIB->getOperand(1).getReg();
+ (void)VReg1;
+ assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
+ RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported operand for G_MERGE_VALUES");
+ unsigned VReg2 = MIB->getOperand(2).getReg();
+ (void)VReg2;
+ assert(MRI.getType(VReg2).getSizeInBits() == 32 &&
+ RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported operand for G_MERGE_VALUES");
+
+ MIB->setDesc(TII.get(ARM::VMOVDRR));
+ MIB.add(predOps(ARMCC::AL));
+
+ return true;
+}
+
+static bool selectUnmergeValues(MachineInstrBuilder &MIB,
+ const ARMBaseInstrInfo &TII,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const RegisterBankInfo &RBI) {
+ assert(TII.getSubtarget().hasVFP2() && "Can't select unmerge without VFP");
+
+ // We only support G_UNMERGE_VALUES as a way to break up one DPR into two
+ // GPRs.
+ unsigned VReg0 = MIB->getOperand(0).getReg();
+ (void)VReg0;
+ assert(MRI.getType(VReg0).getSizeInBits() == 32 &&
+ RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported operand for G_UNMERGE_VALUES");
+ unsigned VReg1 = MIB->getOperand(1).getReg();
+ (void)VReg1;
+ assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
+ RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+ "Unsupported operand for G_UNMERGE_VALUES");
+ unsigned VReg2 = MIB->getOperand(2).getReg();
+ (void)VReg2;
+ assert(MRI.getType(VReg2).getSizeInBits() == 64 &&
+ RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID &&
+ "Unsupported operand for G_UNMERGE_VALUES");
+
+ MIB->setDesc(TII.get(ARM::VMOVRRD));
+ MIB.add(predOps(ARMCC::AL));
+
+ return true;
+}
+
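
selectMergeValues and selectUnmergeValues bottom out in VMOVDRR and VMOVRRD, which shuttle a 64-bit pattern between one D register and a GPR pair. A reference model of that round trip (editorial sketch; the word order assumes the little-endian layout):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t lo = 0x00000000, hi = 0x3FF00000;  // IEEE-754 bits of 1.0
      uint64_t bits = ((uint64_t)hi << 32) | lo;  // G_MERGE_VALUES -> VMOVDRR
      double d;
      std::memcpy(&d, &bits, sizeof d);
      assert(d == 1.0);
      uint64_t back;                              // G_UNMERGE_VALUES -> VMOVRRD
      std::memcpy(&back, &d, sizeof back);
      assert(back == bits);
    }
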
+/// Select the opcode for simple extensions (that translate to a single SXT/UXT
+/// instruction). Extension operations more complicated than that should not
+/// invoke this. Returns the original opcode if it doesn't know how to select a
+/// better one.
+static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) {
+ using namespace TargetOpcode;
+
+ if (Size != 8 && Size != 16)
+ return Opc;
+
+ if (Opc == G_SEXT)
+ return Size == 8 ? ARM::SXTB : ARM::SXTH;
+
+ if (Opc == G_ZEXT)
+ return Size == 8 ? ARM::UXTB : ARM::UXTH;
+
+ return Opc;
+}
+
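
What the selected SXT/UXT instructions compute, as a reference model (editorial sketch):

    #include <cassert>
    #include <cstdint>

    // SXTB/SXTH sign-extend the low 8/16 bits; UXTB/UXTH zero-extend them.
    int main() {
      uint32_t x = 0x1234ABCD;
      assert((uint32_t)(int32_t)(int8_t)x  == 0xFFFFFFCD); // G_SEXT 8  -> SXTB
      assert((uint32_t)(int32_t)(int16_t)x == 0xFFFFABCD); // G_SEXT 16 -> SXTH
      assert((x & 0xFF)   == 0xCD);                        // G_ZEXT 8  -> UXTB
      assert((x & 0xFFFF) == 0xABCD);                      // G_ZEXT 16 -> UXTH
    }
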
+/// Select the opcode for simple loads and stores. For types smaller than 32
+/// bits, the value will be zero extended. Returns the original opcode if it
+/// doesn't know how to select a better one.
+static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
+ unsigned Size) {
+ bool isStore = Opc == TargetOpcode::G_STORE;
+
+ if (RegBank == ARM::GPRRegBankID) {
+ switch (Size) {
+ case 1:
+ case 8:
+ return isStore ? ARM::STRBi12 : ARM::LDRBi12;
+ case 16:
+ return isStore ? ARM::STRH : ARM::LDRH;
+ case 32:
+ return isStore ? ARM::STRi12 : ARM::LDRi12;
+ default:
+ return Opc;
+ }
+ }
+
+ if (RegBank == ARM::FPRRegBankID) {
+ switch (Size) {
+ case 32:
+ return isStore ? ARM::VSTRS : ARM::VLDRS;
+ case 64:
+ return isStore ? ARM::VSTRD : ARM::VLDRD;
+ default:
+ return Opc;
+ }
+ }
+
+ return Opc;
+}
+
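
The comment above notes that values narrower than 32 bits are loaded with zero extension; LDRB/LDRH behave like the unsigned widening below (editorial reference model):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t mem8 = 0xCD;
      uint32_t r = mem8;              // LDRBi12: high 24 bits cleared
      assert(r == 0x000000CD);
      uint16_t mem16 = 0xABCD;
      r = mem16;                      // LDRH: high 16 bits cleared
      assert(r == 0x0000ABCD);
    }
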
+// When lowering comparisons, we sometimes need to perform two compares instead
+// of just one. Get the condition codes for both comparisons. If only one is
+// needed, the second member of the pair is ARMCC::AL.
+static std::pair<ARMCC::CondCodes, ARMCC::CondCodes>
+getComparePreds(CmpInst::Predicate Pred) {
+ std::pair<ARMCC::CondCodes, ARMCC::CondCodes> Preds = {ARMCC::AL, ARMCC::AL};
+ switch (Pred) {
+ case CmpInst::FCMP_ONE:
+ Preds = {ARMCC::GT, ARMCC::MI};
+ break;
+ case CmpInst::FCMP_UEQ:
+ Preds = {ARMCC::EQ, ARMCC::VS};
+ break;
+ case CmpInst::ICMP_EQ:
+ case CmpInst::FCMP_OEQ:
+ Preds.first = ARMCC::EQ;
+ break;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::FCMP_OGT:
+ Preds.first = ARMCC::GT;
+ break;
+ case CmpInst::ICMP_SGE:
+ case CmpInst::FCMP_OGE:
+ Preds.first = ARMCC::GE;
+ break;
+ case CmpInst::ICMP_UGT:
+ case CmpInst::FCMP_UGT:
+ Preds.first = ARMCC::HI;
+ break;
+ case CmpInst::FCMP_OLT:
+ Preds.first = ARMCC::MI;
+ break;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::FCMP_OLE:
+ Preds.first = ARMCC::LS;
+ break;
+ case CmpInst::FCMP_ORD:
+ Preds.first = ARMCC::VC;
+ break;
+ case CmpInst::FCMP_UNO:
+ Preds.first = ARMCC::VS;
+ break;
+ case CmpInst::FCMP_UGE:
+ Preds.first = ARMCC::PL;
+ break;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::FCMP_ULT:
+ Preds.first = ARMCC::LT;
+ break;
+ case CmpInst::ICMP_SLE:
+ case CmpInst::FCMP_ULE:
+ Preds.first = ARMCC::LE;
+ break;
+ case CmpInst::FCMP_UNE:
+ case CmpInst::ICMP_NE:
+ Preds.first = ARMCC::NE;
+ break;
+ case CmpInst::ICMP_UGE:
+ Preds.first = ARMCC::HS;
+ break;
+ case CmpInst::ICMP_ULT:
+ Preds.first = ARMCC::LO;
+ break;
+ default:
+ break;
+ }
+ assert(Preds.first != ARMCC::AL && "No comparisons needed?");
+ return Preds;
+}
+
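
A reference model for the two-predicate cases above (editorial sketch). After a VCMP/FMSTAT, "unordered" shows up in the V flag, so FCMP_UEQ (equal-or-unordered) is tested as EQ-or-VS, and FCMP_ONE (ordered-and-not-equal) as GT-or-MI:

    #include <cassert>
    #include <cmath>

    static bool fcmpUEQ(double a, double b) {
      return a == b || std::isnan(a) || std::isnan(b);  // EQ || VS
    }
    static bool fcmpONE(double a, double b) {
      return a > b || a < b;                            // GT || MI
    }

    int main() {
      double q = std::nan("");
      assert(fcmpUEQ(q, 1.0) && !fcmpONE(q, 1.0));      // NaN: UEQ only
      assert(!fcmpUEQ(1.0, 2.0) && fcmpONE(1.0, 2.0));  // ordered, unequal
    }
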
+struct ARMInstructionSelector::CmpConstants {
+ CmpConstants(unsigned CmpOpcode, unsigned FlagsOpcode, unsigned OpRegBank,
+ unsigned OpSize)
+ : ComparisonOpcode(CmpOpcode), ReadFlagsOpcode(FlagsOpcode),
+ OperandRegBankID(OpRegBank), OperandSize(OpSize) {}
+
+ // The opcode used for performing the comparison.
+ const unsigned ComparisonOpcode;
+
+ // The opcode used for reading the flags set by the comparison. May be
+ // ARM::INSTRUCTION_LIST_END if we don't need to read the flags.
+ const unsigned ReadFlagsOpcode;
+
+ // The assumed register bank ID for the operands.
+ const unsigned OperandRegBankID;
+
+ // The assumed size in bits for the operands.
+ const unsigned OperandSize;
+};
+
+struct ARMInstructionSelector::InsertInfo {
+ InsertInfo(MachineInstrBuilder &MIB)
+ : MBB(*MIB->getParent()), InsertBefore(std::next(MIB->getIterator())),
+ DbgLoc(MIB->getDebugLoc()) {}
+
+ MachineBasicBlock &MBB;
+ const MachineBasicBlock::instr_iterator InsertBefore;
+ const DebugLoc &DbgLoc;
+};
+
+void ARMInstructionSelector::putConstant(InsertInfo I, unsigned DestReg,
+ unsigned Constant) const {
+ (void)BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVi))
+ .addDef(DestReg)
+ .addImm(Constant)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+}
+
+bool ARMInstructionSelector::validOpRegPair(MachineRegisterInfo &MRI,
+ unsigned LHSReg, unsigned RHSReg,
+ unsigned ExpectedSize,
+ unsigned ExpectedRegBankID) const {
+ return MRI.getType(LHSReg) == MRI.getType(RHSReg) &&
+ validReg(MRI, LHSReg, ExpectedSize, ExpectedRegBankID) &&
+ validReg(MRI, RHSReg, ExpectedSize, ExpectedRegBankID);
+}
+
+bool ARMInstructionSelector::validReg(MachineRegisterInfo &MRI, unsigned Reg,
+ unsigned ExpectedSize,
+ unsigned ExpectedRegBankID) const {
+ if (MRI.getType(Reg).getSizeInBits() != ExpectedSize) {
+ DEBUG(dbgs() << "Unexpected size for register");
+ return false;
+ }
+
+ if (RBI.getRegBank(Reg, MRI, TRI)->getID() != ExpectedRegBankID) {
+ DEBUG(dbgs() << "Unexpected register bank for register");
+ return false;
+ }
+
+ return true;
+}
+
+bool ARMInstructionSelector::selectCmp(CmpConstants Helper,
+ MachineInstrBuilder &MIB,
+ MachineRegisterInfo &MRI) const {
+ const InsertInfo I(MIB);
+
+ auto ResReg = MIB->getOperand(0).getReg();
+ if (!validReg(MRI, ResReg, 1, ARM::GPRRegBankID))
+ return false;
+
+ auto Cond =
+ static_cast<CmpInst::Predicate>(MIB->getOperand(1).getPredicate());
+ if (Cond == CmpInst::FCMP_TRUE || Cond == CmpInst::FCMP_FALSE) {
+ putConstant(I, ResReg, Cond == CmpInst::FCMP_TRUE ? 1 : 0);
+ MIB->eraseFromParent();
+ return true;
+ }
+
+ auto LHSReg = MIB->getOperand(2).getReg();
+ auto RHSReg = MIB->getOperand(3).getReg();
+ if (!validOpRegPair(MRI, LHSReg, RHSReg, Helper.OperandSize,
+ Helper.OperandRegBankID))
+ return false;
+
+ auto ARMConds = getComparePreds(Cond);
+ auto ZeroReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ putConstant(I, ZeroReg, 0);
+
+ if (ARMConds.second == ARMCC::AL) {
+ // Simple case, we only need one comparison and we're done.
+ if (!insertComparison(Helper, I, ResReg, ARMConds.first, LHSReg, RHSReg,
+ ZeroReg))
+ return false;
+ } else {
+ // Not so simple, we need two successive comparisons.
+ auto IntermediateRes = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ if (!insertComparison(Helper, I, IntermediateRes, ARMConds.first, LHSReg,
+ RHSReg, ZeroReg))
+ return false;
+ if (!insertComparison(Helper, I, ResReg, ARMConds.second, LHSReg, RHSReg,
+ IntermediateRes))
+ return false;
+ }
+
+ MIB->eraseFromParent();
+ return true;
+}
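+
+// A sketch of the two-predicate chain built above (assuming FCMP_UEQ, which
+// maps to {ARMCC::EQ, ARMCC::VS}); the OR is computed without a branch:
+//
+//   %zero = 0
+//   %tmp  = (first compare)  EQ ? 1 : %zero   ; equal
+//   %res  = (second compare) VS ? 1 : %tmp    ; ...or unordered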
+
+bool ARMInstructionSelector::insertComparison(CmpConstants Helper, InsertInfo I,
+ unsigned ResReg,
+ ARMCC::CondCodes Cond,
+ unsigned LHSReg, unsigned RHSReg,
+ unsigned PrevRes) const {
+ // Perform the comparison.
+ auto CmpI =
+ BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(Helper.ComparisonOpcode))
+ .addUse(LHSReg)
+ .addUse(RHSReg)
+ .add(predOps(ARMCC::AL));
+ if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI))
+ return false;
+
+ // Read the comparison flags (if necessary).
+ if (Helper.ReadFlagsOpcode != ARM::INSTRUCTION_LIST_END) {
+ auto ReadI = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc,
+ TII.get(Helper.ReadFlagsOpcode))
+ .add(predOps(ARMCC::AL));
+ if (!constrainSelectedInstRegOperands(*ReadI, TII, TRI, RBI))
+ return false;
+ }
+
+ // Select either 1 or the previous result based on the value of the flags.
+ auto Mov1I = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVCCi))
+ .addDef(ResReg)
+ .addUse(PrevRes)
+ .addImm(1)
+ .add(predOps(Cond, ARM::CPSR));
+ if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI))
+ return false;
+
+ return true;
+}
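+
+// For illustration, the pattern the builders above produce for one condition
+// code (schematic, not verbatim machine output):
+//
+//   VCMPS  %lhs, %rhs            ; Helper.ComparisonOpcode
+//   FMSTAT                       ; Helper.ReadFlagsOpcode: FPSCR -> CPSR
+//   MOVCCi %res, %prev, 1, Cond  ; %res = Cond ? 1 : %prev (tied to %prev)
+//
+// For G_ICMP, ReadFlagsOpcode is INSTRUCTION_LIST_END and the FMSTAT step is
+// skipped, since CMPrr already sets CPSR directly.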
+
+bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
+ MachineRegisterInfo &MRI) const {
+ auto &MBB = *MIB->getParent();
+ auto InsertBefore = std::next(MIB->getIterator());
+ auto &DbgLoc = MIB->getDebugLoc();
+
+ // Compare the condition to 0.
+ auto CondReg = MIB->getOperand(1).getReg();
+ assert(validReg(MRI, CondReg, 1, ARM::GPRRegBankID) &&
+ "Unsupported types for select operation");
+ auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::CMPri))
+ .addUse(CondReg)
+ .addImm(0)
+ .add(predOps(ARMCC::AL));
+ if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI))
+ return false;
+
+ // Move a value into the result register based on the result of the
+ // comparison.
+ auto ResReg = MIB->getOperand(0).getReg();
+ auto TrueReg = MIB->getOperand(2).getReg();
+ auto FalseReg = MIB->getOperand(3).getReg();
+ assert(validOpRegPair(MRI, ResReg, TrueReg, 32, ARM::GPRRegBankID) &&
+ validOpRegPair(MRI, TrueReg, FalseReg, 32, ARM::GPRRegBankID) &&
+ "Unsupported types for select operation");
+ auto Mov1I = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::MOVCCr))
+ .addDef(ResReg)
+ .addUse(TrueReg)
+ .addUse(FalseReg)
+ .add(predOps(ARMCC::EQ, ARM::CPSR));
+ if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI))
+ return false;
+
+ MIB->eraseFromParent();
+ return true;
+}
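+
+// A sketch of the sequence selectSelect emits:
+//
+//   CMPri  %cond, 0                        ; Z set when the i1 is 0
+//   MOVCCr %res, %true, %false, EQ, %cpsr  ; tied to %true; EQ moves %false
+//
+// i.e. %res = %cond ? %true : %false, with the false value moved only when
+// the comparison against zero succeeds.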
+
bool ARMInstructionSelector::select(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -83,24 +540,211 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
return true;
}
+ if (selectImpl(I))
+ return true;
+
MachineInstrBuilder MIB{MF, I};
+ bool isSExt = false;
using namespace TargetOpcode;
switch (I.getOpcode()) {
- case G_ADD:
+ case G_SEXT:
+ isSExt = true;
+ LLVM_FALLTHROUGH;
+ case G_ZEXT: {
+ LLT DstTy = MRI.getType(I.getOperand(0).getReg());
+ // FIXME: Smaller destination sizes coming soon!
+ if (DstTy.getSizeInBits() != 32) {
+ DEBUG(dbgs() << "Unsupported destination size for extension");
+ return false;
+ }
+
+ LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
+ unsigned SrcSize = SrcTy.getSizeInBits();
+ switch (SrcSize) {
+ case 1: {
+ // ZExt boils down to & 0x1; for SExt we also subtract that from 0
+ I.setDesc(TII.get(ARM::ANDri));
+ MIB.addImm(1).add(predOps(ARMCC::AL)).add(condCodeOp());
+
+ if (isSExt) {
+ unsigned SExtResult = I.getOperand(0).getReg();
+
+ // Use a new virtual register for the result of the AND
+ unsigned AndResult = MRI.createVirtualRegister(&ARM::GPRRegClass);
+ I.getOperand(0).setReg(AndResult);
+
+ auto InsertBefore = std::next(I.getIterator());
+ auto SubI =
+ BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::RSBri))
+ .addDef(SExtResult)
+ .addUse(AndResult)
+ .addImm(0)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ if (!constrainSelectedInstRegOperands(*SubI, TII, TRI, RBI))
+ return false;
+ }
+ break;
+ }
+ case 8:
+ case 16: {
+ unsigned NewOpc = selectSimpleExtOpc(I.getOpcode(), SrcSize);
+ if (NewOpc == I.getOpcode())
+ return false;
+ I.setDesc(TII.get(NewOpc));
+ MIB.addImm(0).add(predOps(ARMCC::AL));
+ break;
+ }
+ default:
+ DEBUG(dbgs() << "Unsupported source size for extension");
+ return false;
+ }
+ break;
+ }
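+  // For illustration, the s1 case above boils down to (sketch):
+  //   and r0, r0, #1      ; G_ZEXT: keep only bit 0
+  //   rsb r1, r0, #0      ; G_SEXT additionally: 0 - bit gives 0 or -1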
+ case G_ANYEXT:
+ case G_TRUNC: {
+ // The high bits are undefined, so there's nothing special to do, just
+ // treat it as a copy.
+ auto SrcReg = I.getOperand(1).getReg();
+ auto DstReg = I.getOperand(0).getReg();
+
+ const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
+ const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ if (SrcRegBank.getID() != DstRegBank.getID()) {
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n");
+ return false;
+ }
+
+ if (SrcRegBank.getID() != ARM::GPRRegBankID) {
+ DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n");
+ return false;
+ }
+
+ I.setDesc(TII.get(COPY));
+ return selectCopy(I, TII, MRI, TRI, RBI);
+ }
+ case G_SELECT:
+ return selectSelect(MIB, MRI);
+ case G_ICMP: {
+ CmpConstants Helper(ARM::CMPrr, ARM::INSTRUCTION_LIST_END,
+ ARM::GPRRegBankID, 32);
+ return selectCmp(Helper, MIB, MRI);
+ }
+ case G_FCMP: {
+ assert(TII.getSubtarget().hasVFP2() && "Can't select fcmp without VFP");
+
+ unsigned OpReg = I.getOperand(2).getReg();
+ unsigned Size = MRI.getType(OpReg).getSizeInBits();
+
+ if (Size == 64 && TII.getSubtarget().isFPOnlySP()) {
+ DEBUG(dbgs() << "Subtarget only supports single precision");
+ return false;
+ }
+ if (Size != 32 && Size != 64) {
+ DEBUG(dbgs() << "Unsupported size for G_FCMP operand");
+ return false;
+ }
+
+ CmpConstants Helper(Size == 32 ? ARM::VCMPS : ARM::VCMPD, ARM::FMSTAT,
+ ARM::FPRRegBankID, Size);
+ return selectCmp(Helper, MIB, MRI);
+ }
+ case G_GEP:
I.setDesc(TII.get(ARM::ADDrr));
- AddDefaultCC(AddDefaultPred(MIB));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
break;
case G_FRAME_INDEX:
// Add 0 to the given frame index and hope it will eventually be folded into
// the user(s).
I.setDesc(TII.get(ARM::ADDri));
- AddDefaultCC(AddDefaultPred(MIB.addImm(0)));
+ MIB.addImm(0).add(predOps(ARMCC::AL)).add(condCodeOp());
+ break;
+ case G_CONSTANT: {
+ unsigned Reg = I.getOperand(0).getReg();
+
+ if (!validReg(MRI, Reg, 32, ARM::GPRRegBankID))
+ return false;
+
+ I.setDesc(TII.get(ARM::MOVi));
+ MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
+
+ auto &Val = I.getOperand(1);
+ if (Val.isCImm()) {
+ if (Val.getCImm()->getBitWidth() > 32)
+ return false;
+ Val.ChangeToImmediate(Val.getCImm()->getZExtValue());
+ }
+
+ if (!Val.isImm()) {
+ return false;
+ }
+
+ break;
+ }
+ case G_STORE:
+ case G_LOAD: {
+ const auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
+ unsigned Reg = I.getOperand(0).getReg();
+ unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID();
+
+ LLT ValTy = MRI.getType(Reg);
+ const auto ValSize = ValTy.getSizeInBits();
+
+ assert((ValSize != 64 || TII.getSubtarget().hasVFP2()) &&
+ "Don't know how to load/store 64-bit value without VFP");
+
+ const auto NewOpc = selectLoadStoreOpCode(I.getOpcode(), RegBank, ValSize);
+ if (NewOpc == G_LOAD || NewOpc == G_STORE)
+ return false;
+
+ I.setDesc(TII.get(NewOpc));
+
+ if (NewOpc == ARM::LDRH || NewOpc == ARM::STRH)
+ // LDRH has a funny addressing mode (there's already a FIXME for it).
+ MIB.addReg(0);
+ MIB.addImm(0).add(predOps(ARMCC::AL));
break;
- case G_LOAD:
- I.setDesc(TII.get(ARM::LDRi12));
- AddDefaultPred(MIB.addImm(0));
+ }
+ case G_MERGE_VALUES: {
+ if (!selectMergeValues(MIB, TII, MRI, TRI, RBI))
+ return false;
+ break;
+ }
+ case G_UNMERGE_VALUES: {
+ if (!selectUnmergeValues(MIB, TII, MRI, TRI, RBI))
+ return false;
break;
+ }
+ case G_BRCOND: {
+ if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) {
+ DEBUG(dbgs() << "Unsupported condition register for G_BRCOND");
+ return false;
+ }
+
+ // Set the flags.
+ auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri))
+ .addReg(I.getOperand(0).getReg())
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
+ if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI))
+ return false;
+
+ // Branch conditionally.
+ auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc))
+ .add(I.getOperand(1))
+ .add(predOps(ARMCC::EQ, ARM::CPSR));
+ if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
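+  // For illustration, the pair emitted above is, schematically:
+  //   TSTri %cond, 1      ; sets Z from (%cond & 1)
+  //   Bcc   %target       ; conditional branch on the CPSR flags just set
+  // The other successor is reached by fall-through or a separate G_BR.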
default:
return false;
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h b/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h
deleted file mode 100644
index 5072cdd..0000000
--- a/contrib/llvm/lib/Target/ARM/ARMInstructionSelector.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===- ARMInstructionSelector ------------------------------------*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file declares the targeting of the InstructionSelector class for ARM.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
-#define LLVM_LIB_TARGET_ARM_ARMINSTRUCTIONSELECTOR_H
-
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-
-namespace llvm {
-class ARMBaseInstrInfo;
-class ARMBaseRegisterInfo;
-class ARMBaseTargetMachine;
-class ARMRegisterBankInfo;
-class ARMSubtarget;
-
-class ARMInstructionSelector : public InstructionSelector {
-public:
- ARMInstructionSelector(const ARMSubtarget &STI,
- const ARMRegisterBankInfo &RBI);
-
- virtual bool select(MachineInstr &I) const override;
-
-private:
- const ARMBaseInstrInfo &TII;
- const ARMBaseRegisterInfo &TRI;
- const ARMRegisterBankInfo &RBI;
-};
-
-} // End llvm namespace.
-#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index 255ea4b..1c17c07 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -12,6 +12,11 @@
//===----------------------------------------------------------------------===//
#include "ARMLegalizerInfo.h"
+#include "ARMCallLowering.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
@@ -23,22 +28,328 @@ using namespace llvm;
#error "You shouldn't build this"
#endif
-ARMLegalizerInfo::ARMLegalizerInfo() {
+static bool AEABI(const ARMSubtarget &ST) {
+ return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI();
+}
+
+ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
using namespace TargetOpcode;
const LLT p0 = LLT::pointer(0, 32);
+ const LLT s1 = LLT::scalar(1);
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
setAction({G_FRAME_INDEX, p0}, Legal);
- setAction({G_LOAD, s32}, Legal);
- setAction({G_LOAD, 1, p0}, Legal);
+ for (unsigned Op : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s1, s8, s16, s32, p0})
+ setAction({Op, Ty}, Legal);
+ setAction({Op, 1, p0}, Legal);
+ }
+
+ for (unsigned Op : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) {
+ for (auto Ty : {s1, s8, s16})
+ setAction({Op, Ty}, WidenScalar);
+ setAction({Op, s32}, Legal);
+ }
+
+ for (unsigned Op : {G_SDIV, G_UDIV}) {
+ for (auto Ty : {s8, s16})
+ setAction({Op, Ty}, WidenScalar);
+ if (ST.hasDivideInARMMode())
+ setAction({Op, s32}, Legal);
+ else
+ setAction({Op, s32}, Libcall);
+ }
+
+ for (unsigned Op : {G_SREM, G_UREM}) {
+ for (auto Ty : {s8, s16})
+ setAction({Op, Ty}, WidenScalar);
+ if (ST.hasDivideInARMMode())
+ setAction({Op, s32}, Lower);
+ else if (AEABI(ST))
+ setAction({Op, s32}, Custom);
+ else
+ setAction({Op, s32}, Libcall);
+ }
+
+ for (unsigned Op : {G_SEXT, G_ZEXT}) {
+ setAction({Op, s32}, Legal);
+ for (auto Ty : {s1, s8, s16})
+ setAction({Op, 1, Ty}, Legal);
+ }
+
+ setAction({G_GEP, p0}, Legal);
+ setAction({G_GEP, 1, s32}, Legal);
+
+ setAction({G_SELECT, s32}, Legal);
+ setAction({G_SELECT, p0}, Legal);
+ setAction({G_SELECT, 1, s1}, Legal);
+
+ setAction({G_BRCOND, s1}, Legal);
+
+ setAction({G_CONSTANT, s32}, Legal);
+ for (auto Ty : {s1, s8, s16})
+ setAction({G_CONSTANT, Ty}, WidenScalar);
+
+ setAction({G_ICMP, s1}, Legal);
+ for (auto Ty : {s8, s16})
+ setAction({G_ICMP, 1, Ty}, WidenScalar);
+ for (auto Ty : {s32, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
- for (auto Ty : {s8, s16, s32})
- setAction({G_ADD, Ty}, Legal);
+ if (!ST.useSoftFloat() && ST.hasVFP2()) {
+ setAction({G_FADD, s32}, Legal);
+ setAction({G_FADD, s64}, Legal);
+
+ setAction({G_LOAD, s64}, Legal);
+ setAction({G_STORE, s64}, Legal);
+
+ setAction({G_FCMP, s1}, Legal);
+ setAction({G_FCMP, 1, s32}, Legal);
+ setAction({G_FCMP, 1, s64}, Legal);
+ } else {
+ for (auto Ty : {s32, s64})
+ setAction({G_FADD, Ty}, Libcall);
+
+ setAction({G_FCMP, s1}, Legal);
+ setAction({G_FCMP, 1, s32}, Custom);
+ setAction({G_FCMP, 1, s64}, Custom);
+
+ if (AEABI(ST))
+ setFCmpLibcallsAEABI();
+ else
+ setFCmpLibcallsGNU();
+ }
+
+ for (unsigned Op : {G_FREM, G_FPOW})
+ for (auto Ty : {s32, s64})
+ setAction({Op, Ty}, Libcall);
computeTables();
}
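+
+// For illustration (not from the patch): each action above tells the generic
+// LegalizerHelper how to rewrite an illegal instruction. E.g. with
+// {G_ADD, s8} -> WidenScalar, an 8-bit add is rewritten roughly as:
+//
+//   %a32 = G_ANYEXT %a(s8)        ; widen inputs to the legal s32
+//   %b32 = G_ANYEXT %b(s8)
+//   %s32 = G_ADD %a32, %b32       ; now {G_ADD, s32} is Legal
+//   %res = G_TRUNC %s32(s32)      ; narrow back to s8
+//
+// Libcall, Custom and Lower route the instruction to a runtime call, to
+// legalizeCustom() below, or to a generic expansion, respectively.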
+
+void ARMLegalizerInfo::setFCmpLibcallsAEABI() {
+ // FCMP_TRUE and FCMP_FALSE don't need libcalls, they should be
+ // default-initialized.
+ FCmp32Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1);
+ FCmp32Libcalls[CmpInst::FCMP_OEQ] = {
+ {RTLIB::OEQ_F32, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp32Libcalls[CmpInst::FCMP_OGE] = {
+ {RTLIB::OGE_F32, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp32Libcalls[CmpInst::FCMP_OGT] = {
+ {RTLIB::OGT_F32, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp32Libcalls[CmpInst::FCMP_OLE] = {
+ {RTLIB::OLE_F32, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp32Libcalls[CmpInst::FCMP_OLT] = {
+ {RTLIB::OLT_F32, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_UNO] = {
+ {RTLIB::UO_F32, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp32Libcalls[CmpInst::FCMP_ONE] = {
+ {RTLIB::OGT_F32, CmpInst::BAD_ICMP_PREDICATE},
+ {RTLIB::OLT_F32, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp32Libcalls[CmpInst::FCMP_UEQ] = {
+ {RTLIB::OEQ_F32, CmpInst::BAD_ICMP_PREDICATE},
+ {RTLIB::UO_F32, CmpInst::BAD_ICMP_PREDICATE}};
+
+ FCmp64Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1);
+ FCmp64Libcalls[CmpInst::FCMP_OEQ] = {
+ {RTLIB::OEQ_F64, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp64Libcalls[CmpInst::FCMP_OGE] = {
+ {RTLIB::OGE_F64, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp64Libcalls[CmpInst::FCMP_OGT] = {
+ {RTLIB::OGT_F64, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp64Libcalls[CmpInst::FCMP_OLE] = {
+ {RTLIB::OLE_F64, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp64Libcalls[CmpInst::FCMP_OLT] = {
+ {RTLIB::OLT_F64, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_UNO] = {
+ {RTLIB::UO_F64, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp64Libcalls[CmpInst::FCMP_ONE] = {
+ {RTLIB::OGT_F64, CmpInst::BAD_ICMP_PREDICATE},
+ {RTLIB::OLT_F64, CmpInst::BAD_ICMP_PREDICATE}};
+ FCmp64Libcalls[CmpInst::FCMP_UEQ] = {
+ {RTLIB::OEQ_F64, CmpInst::BAD_ICMP_PREDICATE},
+ {RTLIB::UO_F64, CmpInst::BAD_ICMP_PREDICATE}};
+}
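+
+// Reading the tables above: the AEABI comparison helpers return 0 or 1, so
+// most unordered predicates are built by inverting the complementary
+// *ordered* call. E.g. FCMP_UGE maps to {OLT, ICMP_EQ} (sketch; libcall
+// name per the AEABI runtime):
+//
+//   int lt = __aeabi_fcmplt(a, b);   // RTLIB::OLT_F32
+//   bool uge = (lt == 0);            // "not less than" == unordered-or-GE
+//
+// Entries with BAD_ICMP_PREDICATE need no compare; their 0/1 result is just
+// truncated to the i1 destination.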
+
+void ARMLegalizerInfo::setFCmpLibcallsGNU() {
+ // FCMP_TRUE and FCMP_FALSE don't need libcalls, they should be
+ // default-initialized.
+ FCmp32Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1);
+ FCmp32Libcalls[CmpInst::FCMP_OEQ] = {{RTLIB::OEQ_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_OGE] = {{RTLIB::OGE_F32, CmpInst::ICMP_SGE}};
+ FCmp32Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F32, CmpInst::ICMP_SGT}};
+ FCmp32Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F32, CmpInst::ICMP_SLE}};
+ FCmp32Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F32, CmpInst::ICMP_SLT}};
+ FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}};
+ FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_SGE}};
+ FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_SGT}};
+ FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_SLE}};
+ FCmp32Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F32, CmpInst::ICMP_SLT}};
+ FCmp32Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F32, CmpInst::ICMP_NE}};
+ FCmp32Libcalls[CmpInst::FCMP_UNO] = {{RTLIB::UO_F32, CmpInst::ICMP_NE}};
+ FCmp32Libcalls[CmpInst::FCMP_ONE] = {{RTLIB::OGT_F32, CmpInst::ICMP_SGT},
+ {RTLIB::OLT_F32, CmpInst::ICMP_SLT}};
+ FCmp32Libcalls[CmpInst::FCMP_UEQ] = {{RTLIB::OEQ_F32, CmpInst::ICMP_EQ},
+ {RTLIB::UO_F32, CmpInst::ICMP_NE}};
+
+ FCmp64Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1);
+ FCmp64Libcalls[CmpInst::FCMP_OEQ] = {{RTLIB::OEQ_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_OGE] = {{RTLIB::OGE_F64, CmpInst::ICMP_SGE}};
+ FCmp64Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F64, CmpInst::ICMP_SGT}};
+ FCmp64Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F64, CmpInst::ICMP_SLE}};
+ FCmp64Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F64, CmpInst::ICMP_SLT}};
+ FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}};
+ FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_SGE}};
+ FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_SGT}};
+ FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_SLE}};
+ FCmp64Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F64, CmpInst::ICMP_SLT}};
+ FCmp64Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F64, CmpInst::ICMP_NE}};
+ FCmp64Libcalls[CmpInst::FCMP_UNO] = {{RTLIB::UO_F64, CmpInst::ICMP_NE}};
+ FCmp64Libcalls[CmpInst::FCMP_ONE] = {{RTLIB::OGT_F64, CmpInst::ICMP_SGT},
+ {RTLIB::OLT_F64, CmpInst::ICMP_SLT}};
+ FCmp64Libcalls[CmpInst::FCMP_UEQ] = {{RTLIB::OEQ_F64, CmpInst::ICMP_EQ},
+ {RTLIB::UO_F64, CmpInst::ICMP_NE}};
+}
+
+ARMLegalizerInfo::FCmpLibcallsList
+ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate,
+ unsigned Size) const {
+ assert(CmpInst::isFPPredicate(Predicate) && "Unsupported FCmp predicate");
+ if (Size == 32)
+ return FCmp32Libcalls[Predicate];
+ if (Size == 64)
+ return FCmp64Libcalls[Predicate];
+ llvm_unreachable("Unsupported size for FCmp predicate");
+}
+
+bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ using namespace TargetOpcode;
+
+ MIRBuilder.setInstr(MI);
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case G_SREM:
+ case G_UREM: {
+ unsigned OriginalResult = MI.getOperand(0).getReg();
+ auto Size = MRI.getType(OriginalResult).getSizeInBits();
+ if (Size != 32)
+ return false;
+
+ auto Libcall =
+ MI.getOpcode() == G_SREM ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+
+ // Our divmod libcalls return a struct containing the quotient and the
+ // remainder. We need to create a virtual register for it.
+ auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+ Type *ArgTy = Type::getInt32Ty(Ctx);
+ StructType *RetTy = StructType::get(Ctx, {ArgTy, ArgTy}, /* Packed */ true);
+ auto RetVal = MRI.createGenericVirtualRegister(
+ getLLTForType(*RetTy, MIRBuilder.getMF().getDataLayout()));
+
+ auto Status = createLibcall(MIRBuilder, Libcall, {RetVal, RetTy},
+ {{MI.getOperand(1).getReg(), ArgTy},
+ {MI.getOperand(2).getReg(), ArgTy}});
+ if (Status != LegalizerHelper::Legalized)
+ return false;
+
+ // The remainder is the second result of divmod. Split the return value into
+ // a new, unused register for the quotient and the destination of the
+ // original instruction for the remainder.
+ MIRBuilder.buildUnmerge(
+ {MRI.createGenericVirtualRegister(LLT::scalar(32)), OriginalResult},
+ RetVal);
+ break;
+ }
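+  // A sketch of the result for `%r = G_SREM %a, %b` (assumption):
+  // __aeabi_idivmod returns {quotient, remainder} packed in r0/r1, so the
+  // G_UNMERGE_VALUES above splits the struct, sending the quotient to a
+  // dead scratch vreg and the remainder to the original destination.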
+ case G_FCMP: {
+ assert(MRI.getType(MI.getOperand(2).getReg()) ==
+ MRI.getType(MI.getOperand(3).getReg()) &&
+ "Mismatched operands for G_FCMP");
+ auto OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+
+ auto OriginalResult = MI.getOperand(0).getReg();
+ auto Predicate =
+ static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ auto Libcalls = getFCmpLibcalls(Predicate, OpSize);
+
+ if (Libcalls.empty()) {
+ assert((Predicate == CmpInst::FCMP_TRUE ||
+ Predicate == CmpInst::FCMP_FALSE) &&
+ "Predicate needs libcalls, but none specified");
+ MIRBuilder.buildConstant(OriginalResult,
+ Predicate == CmpInst::FCMP_TRUE ? 1 : 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+ assert((OpSize == 32 || OpSize == 64) && "Unsupported operand size");
+ auto *ArgTy = OpSize == 32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx);
+ auto *RetTy = Type::getInt32Ty(Ctx);
+
+ SmallVector<unsigned, 2> Results;
+ for (auto Libcall : Libcalls) {
+ auto LibcallResult = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ auto Status =
+ createLibcall(MIRBuilder, Libcall.LibcallID, {LibcallResult, RetTy},
+ {{MI.getOperand(2).getReg(), ArgTy},
+ {MI.getOperand(3).getReg(), ArgTy}});
+
+ if (Status != LegalizerHelper::Legalized)
+ return false;
+
+ auto ProcessedResult =
+ Libcalls.size() == 1
+ ? OriginalResult
+ : MRI.createGenericVirtualRegister(MRI.getType(OriginalResult));
+
+ // We have a result, but we need to transform it into a proper 1-bit 0 or
+ // 1, taking into account the different peculiarities of the values
+ // returned by the comparison functions.
+ CmpInst::Predicate ResultPred = Libcall.Predicate;
+ if (ResultPred == CmpInst::BAD_ICMP_PREDICATE) {
+ // We have a nice 0 or 1, and we just need to truncate it back to 1 bit
+ // to keep the types consistent.
+ MIRBuilder.buildTrunc(ProcessedResult, LibcallResult);
+ } else {
+ // We need to compare against 0.
+ assert(CmpInst::isIntPredicate(ResultPred) && "Unsupported predicate");
+ auto Zero = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MIRBuilder.buildConstant(Zero, 0);
+ MIRBuilder.buildICmp(ResultPred, ProcessedResult, LibcallResult, Zero);
+ }
+ Results.push_back(ProcessedResult);
+ }
+
+ if (Results.size() != 1) {
+ assert(Results.size() == 2 && "Unexpected number of results");
+ MIRBuilder.buildOr(OriginalResult, Results[0], Results[1]);
+ }
+ break;
+ }
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
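+
+// An end-to-end sketch for a soft-float FCMP_ONE under the AEABI tables
+// above: two libcalls, each truncated to i1, then OR'ed into the result:
+//
+//   %gt = __aeabi_fcmpgt(a, b)            ; RTLIB::OGT_F32
+//   %lt = __aeabi_fcmplt(a, b)            ; RTLIB::OLT_F32
+//   %r1 = G_TRUNC %gt ; %r2 = G_TRUNC %lt
+//   %res = G_OR %r1, %r2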
diff --git a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
index ca3eea8..78ab941 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -14,16 +14,52 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
#define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H
+#include "llvm/ADT/IndexedMap.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/IR/Instructions.h"
namespace llvm {
-class LLVMContext;
+class ARMSubtarget;
/// This class provides the information for the target register banks.
class ARMLegalizerInfo : public LegalizerInfo {
public:
- ARMLegalizerInfo();
+ ARMLegalizerInfo(const ARMSubtarget &ST);
+
+ bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
+private:
+ void setFCmpLibcallsGNU();
+ void setFCmpLibcallsAEABI();
+
+ struct FCmpLibcallInfo {
+ // Which libcall this is.
+ RTLIB::Libcall LibcallID;
+
+ // The predicate to be used when comparing the value returned by the
+ // function with a relevant constant (currently hard-coded to zero). This is
+ // necessary because often the libcall will return e.g. a value greater than
+ // 0 to represent 'true' and anything negative to represent 'false', or
+ // maybe 0 to represent 'true' and non-zero for 'false'. If no comparison is
+ // needed, this should be CmpInst::BAD_ICMP_PREDICATE.
+ CmpInst::Predicate Predicate;
+ };
+ using FCmpLibcallsList = SmallVector<FCmpLibcallInfo, 2>;
+
+  // Map from each FCmp predicate to the corresponding libcall infos. An FCmp
+ // instruction may be lowered to one or two libcalls, which is why we need a
+ // list. If two libcalls are needed, their results will be OR'ed.
+ using FCmpLibcallsMapTy = IndexedMap<FCmpLibcallsList>;
+
+ FCmpLibcallsMapTy FCmp32Libcalls;
+ FCmpLibcallsMapTy FCmp64Libcalls;
+
+ // Get the libcall(s) corresponding to \p Predicate for operands of \p Size
+ // bits.
+ FCmpLibcallsList getFCmpLibcalls(CmpInst::Predicate, unsigned Size) const;
};
} // End llvm namespace.
#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 48ab491..7a452d4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -33,7 +34,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -517,8 +517,12 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
if (InsertSub) {
// An instruction above couldn't be updated, so insert a sub.
- AddDefaultT1CC(BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
- .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base)
+ .add(t1CondCodeOp(true))
+ .addReg(Base)
+ .addImm(WordOffset * 4)
+ .addImm(Pred)
+ .addReg(PredReg);
return;
}
@@ -534,9 +538,12 @@ void ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
// information and *always* have to reset at the end of a block.
// See PR21029.
if (MBBI != MBB.end()) --MBBI;
- AddDefaultT1CC(
- BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base), true)
- .addReg(Base).addImm(WordOffset * 4).addImm(Pred).addReg(PredReg);
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBi8), Base)
+ .add(t1CondCodeOp(true))
+ .addReg(Base)
+ .addImm(WordOffset * 4)
+ .addImm(Pred)
+ .addReg(PredReg);
}
}
@@ -602,13 +609,12 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
// Exception: If the base register is in the input reglist, Thumb1 LDM is
// non-writeback.
// It's also not possible to merge an STR of the base register in Thumb1.
- if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+ if (isThumb1 && ContainsReg(Regs, Base)) {
assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
- if (Opcode == ARM::tLDRi) {
+ if (Opcode == ARM::tLDRi)
Writeback = false;
- } else if (Opcode == ARM::tSTRi) {
+ else if (Opcode == ARM::tSTRi)
return nullptr;
- }
}
ARM_AM::AMSubMode Mode = ARM_AM::ia;
@@ -700,8 +706,8 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
.addReg(Base, getKillRegState(KillOldBase));
} else
BuildMI(MBB, InsertBefore, DL, TII->get(ARM::tMOVr), NewBase)
- .addReg(Base, getKillRegState(KillOldBase))
- .addImm(Pred).addReg(PredReg);
+ .addReg(Base, getKillRegState(KillOldBase))
+ .add(predOps(Pred, PredReg));
// The following ADDS/SUBS becomes an update.
Base = NewBase;
@@ -710,17 +716,21 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
if (BaseOpc == ARM::tADDrSPi) {
assert(Offset % 4 == 0 && "tADDrSPi offset is scaled by 4");
BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset/4)
- .addImm(Pred).addReg(PredReg);
+ .addReg(Base, getKillRegState(KillOldBase))
+ .addImm(Offset / 4)
+ .add(predOps(Pred, PredReg));
} else
- AddDefaultT1CC(
- BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase), true)
- .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
- .addImm(Pred).addReg(PredReg);
+ BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
+ .add(t1CondCodeOp(true))
+ .addReg(Base, getKillRegState(KillOldBase))
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg));
} else {
BuildMI(MBB, InsertBefore, DL, TII->get(BaseOpc), NewBase)
- .addReg(Base, getKillRegState(KillOldBase)).addImm(Offset)
- .addImm(Pred).addReg(PredReg).addReg(0);
+ .addReg(Base, getKillRegState(KillOldBase))
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp());
}
Base = NewBase;
BaseKill = true; // New base is always killed straight away.
@@ -1259,7 +1269,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
// Transfer the rest of operands.
for (unsigned OpNum = 3, e = MI->getNumOperands(); OpNum != e; ++OpNum)
- MIB.addOperand(MI->getOperand(OpNum));
+ MIB.add(MI->getOperand(OpNum));
// Transfer memoperands.
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -1392,14 +1402,19 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
} else {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ .addReg(Base, RegState::Define)
+ .addReg(Base)
+ .addReg(0)
+ .addImm(Imm)
+ .add(predOps(Pred, PredReg));
}
} else {
// t2LDR_PRE, t2LDR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(Base, RegState::Define)
+ .addReg(Base)
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg));
}
} else {
MachineOperand &MO = MI->getOperand(0);
@@ -1410,13 +1425,18 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg);
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base)
+ .addReg(0)
+ .addImm(Imm)
+ .add(predOps(Pred, PredReg));
} else {
// t2STR_PRE, t2STR_POST
BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base)
- .addReg(MO.getReg(), getKillRegState(MO.isKill()))
- .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ .addReg(MO.getReg(), getKillRegState(MO.isKill()))
+ .addReg(Base)
+ .addImm(Offset)
+ .add(predOps(Pred, PredReg));
}
}
MBB.erase(MBBI);
@@ -1462,12 +1482,10 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
DebugLoc DL = MI.getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) {
- MIB.addOperand(Reg0Op).addOperand(Reg1Op)
- .addReg(BaseOp.getReg(), RegState::Define);
+ MIB.add(Reg0Op).add(Reg1Op).addReg(BaseOp.getReg(), RegState::Define);
} else {
assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST);
- MIB.addReg(BaseOp.getReg(), RegState::Define)
- .addOperand(Reg0Op).addOperand(Reg1Op);
+ MIB.addReg(BaseOp.getReg(), RegState::Define).add(Reg0Op).add(Reg1Op);
}
MIB.addReg(BaseOp.getReg(), RegState::Kill)
.addImm(Offset).addImm(Pred).addReg(PredReg);
@@ -1477,7 +1495,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
// Transfer implicit operands.
for (const MachineOperand &MO : MI.implicit_operands())
- MIB.addOperand(MO);
+ MIB.add(MO);
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MBB.erase(MBBI);
@@ -1891,8 +1909,9 @@ bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) {
for (auto Use : Prev->uses())
if (Use.isKill()) {
- AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
- .addReg(Use.getReg(), RegState::Kill))
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX))
+ .addReg(Use.getReg(), RegState::Kill)
+ .add(predOps(ARMCC::AL))
.copyImplicitOps(*MBBI);
MBB.erase(MBBI);
MBB.erase(Prev);
@@ -1942,6 +1961,7 @@ namespace {
static char ID;
ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
+ AliasAnalysis *AA;
const DataLayout *TD;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
@@ -1955,6 +1975,11 @@ namespace {
return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
}
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
private:
bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
unsigned &NewOpc, unsigned &EvenReg,
@@ -1984,6 +2009,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TRI = STI->getRegisterInfo();
MRI = &Fn.getRegInfo();
MF = &Fn;
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
bool Modified = false;
for (MachineBasicBlock &MFI : Fn)
@@ -1997,28 +2023,19 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
MachineBasicBlock::iterator E,
SmallPtrSetImpl<MachineInstr*> &MemOps,
SmallSet<unsigned, 4> &MemRegs,
- const TargetRegisterInfo *TRI) {
+ const TargetRegisterInfo *TRI,
+ AliasAnalysis *AA) {
// Are there stores / loads / calls between them?
- // FIXME: This is overly conservative. We should make use of alias information
- // some day.
SmallSet<unsigned, 4> AddedRegPressure;
while (++I != E) {
if (I->isDebugValue() || MemOps.count(&*I))
continue;
if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
return false;
- if (isLd && I->mayStore())
- return false;
- if (!isLd) {
- if (I->mayLoad())
- return false;
- // It's not safe to move the first 'str' down.
- // str r1, [r0]
- // strh r5, [r0]
- // str r4, [r0, #+4]
- if (I->mayStore())
- return false;
- }
+ if (I->mayStore() || (!isLd && I->mayLoad()))
+ for (MachineInstr *MemOp : MemOps)
+ if (I->mayAlias(AA, *MemOp, /*UseTBAA*/ false))
+ return false;
for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
MachineOperand &MO = I->getOperand(j);
if (!MO.isReg())
@@ -2142,33 +2159,40 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
unsigned LastBytes = 0;
unsigned NumMove = 0;
for (int i = Ops.size() - 1; i >= 0; --i) {
+ // Make sure each operation has the same kind.
MachineInstr *Op = Ops[i];
- unsigned Loc = MI2LocMap[Op];
- if (Loc <= FirstLoc) {
- FirstLoc = Loc;
- FirstOp = Op;
- }
- if (Loc >= LastLoc) {
- LastLoc = Loc;
- LastOp = Op;
- }
-
unsigned LSMOpcode
= getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
if (LastOpcode && LSMOpcode != LastOpcode)
break;
+ // Check that we have a continuous set of offsets.
int Offset = getMemoryOpOffset(*Op);
unsigned Bytes = getLSMultipleTransferSize(Op);
if (LastBytes) {
if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
break;
}
+
+ // Don't try to reschedule too many instructions.
+ if (NumMove == 8) // FIXME: Tune this limit.
+ break;
+
+ // Found a mergable instruction; save information about it.
+ ++NumMove;
LastOffset = Offset;
LastBytes = Bytes;
LastOpcode = LSMOpcode;
- if (++NumMove == 8) // FIXME: Tune this limit.
- break;
+
+ unsigned Loc = MI2LocMap[Op];
+ if (Loc <= FirstLoc) {
+ FirstLoc = Loc;
+ FirstOp = Op;
+ }
+ if (Loc >= LastLoc) {
+ LastLoc = Loc;
+ LastOp = Op;
+ }
}
if (NumMove <= 1)
@@ -2176,7 +2200,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
else {
SmallPtrSet<MachineInstr*, 4> MemOps;
SmallSet<unsigned, 4> MemRegs;
- for (int i = NumMove-1; i >= 0; --i) {
+ for (size_t i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) {
MemOps.insert(Ops[i]);
MemRegs.insert(Ops[i]->getOperand(0).getReg());
}
@@ -2186,7 +2210,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
if (DoMove)
DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
- MemOps, MemRegs, TRI);
+ MemOps, MemRegs, TRI, AA);
if (!DoMove) {
for (unsigned i = 0; i != NumMove; ++i)
Ops.pop_back();
diff --git a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index 07044b9..48b02d4 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -14,23 +14,36 @@
#include "ARM.h"
#include "ARMAsmPrinter.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
-using namespace llvm;
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+using namespace llvm;
MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
const MCSymbol *Symbol) {
+ MCSymbolRefExpr::VariantKind SymbolVariant = MCSymbolRefExpr::VK_None;
+ if (MO.getTargetFlags() & ARMII::MO_SBREL)
+ SymbolVariant = MCSymbolRefExpr::VK_ARM_SBREL;
+
const MCExpr *Expr =
- MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
default:
llvm_unreachable("Unknown target flag on symbol operand");
@@ -38,12 +51,12 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
break;
case ARMII::MO_LO16:
Expr =
- MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createLower16(Expr, OutContext);
break;
case ARMII::MO_HI16:
Expr =
- MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createUpper16(Expr, OutContext);
break;
}
@@ -75,11 +88,10 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(
MO.getMBB()->getSymbol(), OutContext));
break;
- case MachineOperand::MO_GlobalAddress: {
+ case MachineOperand::MO_GlobalAddress:
MCOp = GetSymbolRef(MO,
GetARMGVSymbol(MO.getGlobal(), MO.getTargetFlags()));
break;
- }
case MachineOperand::MO_ExternalSymbol:
MCOp = GetSymbolRef(MO,
GetExternalSymbolSymbol(MO.getSymbolName()));
@@ -141,9 +153,7 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
break;
}
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
-
+ for (const MachineOperand &MO : MI->operands()) {
MCOperand MCOp;
if (AP.lowerOperand(MO, MCOp)) {
if (MCOp.isImm() && EncodeImms) {
@@ -199,11 +209,9 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
.addImm(ARMCC::AL).addReg(0));
MCInst Noop;
- Subtarget->getInstrInfo()->getNoopForElfTarget(Noop);
+ Subtarget->getInstrInfo()->getNoop(Noop);
for (int8_t I = 0; I < NoopsInSledCount; I++)
- {
OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
- }
OutStreamer->EmitLabel(Target);
recordSled(CurSled, MI, Kind);
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index 50d8f09..e25d36b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
using namespace llvm;
@@ -15,10 +16,4 @@ void ARMFunctionInfo::anchor() {}
ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
: isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
- hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
- StByValParamsPadding(0), ArgRegsSaveSize(0), ReturnRegsCount(0),
- HasStackFrame(false), RestoreSPFromFP(false), LRSpilledForFarJump(false),
- FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
- GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false), ArgumentStackSize(0),
- IsSplitCSR(false), PromotedGlobalsIncrease(0) {}
+ hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()) {}
diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index 8c485e8..8161167 100644
--- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -14,11 +14,11 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
-#include "ARMSubtarget.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <utility>
namespace llvm {
@@ -29,42 +29,42 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// isThumb - True if this function is compiled under Thumb mode.
 /// Used to initialize Align, so it must precede it.
- bool isThumb;
+ bool isThumb = false;
/// hasThumb2 - True if the target architecture supports Thumb2. Do not use
/// to determine if function is compiled under Thumb mode, for that use
/// 'isThumb'.
- bool hasThumb2;
+ bool hasThumb2 = false;
 /// StByValParamsPadding - For a parameter that is split between
 /// GPRs and memory: while recovering the GPRs part, when
 /// StackAlignment > 4 and GPRs-part-size mod StackAlignment != 0,
 /// we need to insert a gap before the parameter's start address. This
 /// allows us to "attach" the GPR part to the part that was passed via
 /// the stack.
- unsigned StByValParamsPadding;
+ unsigned StByValParamsPadding = 0;
/// VarArgsRegSaveSize - Size of the register save area for vararg functions.
///
- unsigned ArgRegsSaveSize;
+ unsigned ArgRegsSaveSize = 0;
/// ReturnRegsCount - Number of registers used up in the return.
- unsigned ReturnRegsCount;
+ unsigned ReturnRegsCount = 0;
/// HasStackFrame - True if this function has a stack frame. Set by
/// determineCalleeSaves().
- bool HasStackFrame;
+ bool HasStackFrame = false;
/// RestoreSPFromFP - True if epilogue should restore SP from FP. Set by
/// emitPrologue.
- bool RestoreSPFromFP;
+ bool RestoreSPFromFP = false;
 /// LRSpilledForFarJump - True if the LR register has been spilled to
 /// enable a far jump.
- bool LRSpilledForFarJump;
+ bool LRSpilledForFarJump = false;
/// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
/// spill stack offset.
- unsigned FramePtrSpillOffset;
+ unsigned FramePtrSpillOffset = 0;
/// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
/// register spills areas. For Mac OS X:
@@ -77,16 +77,16 @@ class ARMFunctionInfo : public MachineFunctionInfo {
///
/// Also see AlignedDPRCSRegs below. Not all D-regs need to go in area 3.
/// Some may be spilled after the stack has been realigned.
- unsigned GPRCS1Offset;
- unsigned GPRCS2Offset;
- unsigned DPRCSOffset;
+ unsigned GPRCS1Offset = 0;
+ unsigned GPRCS2Offset = 0;
+ unsigned DPRCSOffset = 0;
/// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
/// areas.
- unsigned GPRCS1Size;
- unsigned GPRCS2Size;
- unsigned DPRCSAlignGapSize;
- unsigned DPRCSSize;
+ unsigned GPRCS1Size = 0;
+ unsigned GPRCS2Size = 0;
+ unsigned DPRCSAlignGapSize = 0;
+ unsigned DPRCSSize = 0;
/// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
/// the aligned portion of the stack frame. This is always a contiguous
@@ -95,15 +95,15 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// We do not keep track of the frame indices used for these registers - they
/// behave like any other frame index in the aligned stack frame. These
/// registers also aren't included in DPRCSSize above.
- unsigned NumAlignedDPRCS2Regs;
+ unsigned NumAlignedDPRCS2Regs = 0;
- unsigned PICLabelUId;
+ unsigned PICLabelUId = 0;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
+ int VarArgsFrameIndex = 0;
/// HasITBlocks - True if IT blocks have been inserted.
- bool HasITBlocks;
+ bool HasITBlocks = false;
/// CPEClones - Track constant pool entries clones created by Constant Island
/// pass.
@@ -111,7 +111,7 @@ class ARMFunctionInfo : public MachineFunctionInfo {
 /// ArgumentStackSize - the number of bytes of stack consumed by the
 /// arguments being passed on the stack.
- unsigned ArgumentStackSize;
+ unsigned ArgumentStackSize = 0;
/// CoalescedWeights - mapping of basic blocks to the rolling counter of
/// coalesced weights.
@@ -119,26 +119,16 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// True if this function has a subset of CSRs that is handled explicitly via
/// copies.
- bool IsSplitCSR;
+ bool IsSplitCSR = false;
/// Globals that have had their storage promoted into the constant pool.
SmallPtrSet<const GlobalVariable*,2> PromotedGlobals;
 /// The amount by which the literal pool has been increased due to promoted
 /// globals.
- int PromotedGlobalsIncrease;
+ int PromotedGlobalsIncrease = 0;
public:
- ARMFunctionInfo() :
- isThumb(false),
- hasThumb2(false),
- ArgRegsSaveSize(0), ReturnRegsCount(0), HasStackFrame(false),
- RestoreSPFromFP(false),
- LRSpilledForFarJump(false),
- FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
- GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0),
- NumAlignedDPRCS2Regs(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false),
- PromotedGlobalsIncrease(0) {}
+ ARMFunctionInfo() = default;
explicit ARMFunctionInfo(MachineFunction &MF);
@@ -250,6 +240,7 @@ public:
PromotedGlobalsIncrease = Sz;
}
};
-} // End llvm namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_ARMMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
new file mode 100644
index 0000000..1b6e97c
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.cpp
@@ -0,0 +1,57 @@
+//===- ARMMacroFusion.cpp - ARM Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the ARM implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMacroFusion.h"
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(TSI);
+
+ // Assume wildcards for unspecified instrs.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(ARM::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ if (ST.hasFuseAES())
+ // Fuse AES crypto operations.
+    switch (SecondOpcode) {
+    // AES encode.
+    case ARM::AESMC:
+ return FirstOpcode == ARM::AESE ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ // AES decode.
+ case ARM::AESIMC:
+ return FirstOpcode == ARM::AESD ||
+ FirstOpcode == ARM::INSTRUCTION_LIST_END;
+ }
+
+ return false;
+}
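+
+// For illustration: on subtargets where hasFuseAES() is true, the mutation
+// keeps a dependent AES pair back to back so the hardware can fuse it, e.g.
+//   AESE  q0, q1
+//   AESMC q0, q0   ; scheduled adjacent to its AESE
+// The INSTRUCTION_LIST_END wildcard lets SecondMI match before its
+// predecessor is known.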
+
+std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation() {
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h
new file mode 100644
index 0000000..1e4fc66
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMMacroFusion.h
@@ -0,0 +1,24 @@
+//===- ARMMacroFusion.h - ARM Macro Fusion ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the ARM definition of the DAG scheduling mutation
+/// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createARMMacroFusionDAGMutation());
+/// to ARMPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createARMMacroFusionDAGMutation();
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 581d5fe..7e4d598 100644
--- a/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -88,13 +88,15 @@ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) {
}
}
}
+ bool Changed = false;
// Remove the tagged DMB
for (auto MI : ToRemove) {
MI->eraseFromParent();
++NumDMBsRemoved;
+ Changed = true;
}
- return NumDMBsRemoved > 0;
+ return Changed;
}
/// createARMOptimizeBarriersPass - Returns an instance of the remove double
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 324087d..8449302 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -13,11 +13,15 @@
#include "ARMRegisterBankInfo.h"
#include "ARMInstrInfo.h" // For the register classes
+#include "ARMSubtarget.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#define GET_TARGET_REGBANK_IMPL
+#include "ARMGenRegisterBank.inc"
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -29,44 +33,109 @@ using namespace llvm;
// into an ARMGenRegisterBankInfo.def (similar to AArch64).
namespace llvm {
namespace ARM {
-const uint32_t GPRCoverageData[] = {
- // Classes 0-31
- (1u << ARM::GPRRegClassID) | (1u << ARM::GPRwithAPSRRegClassID) |
- (1u << ARM::GPRnopcRegClassID) | (1u << ARM::rGPRRegClassID) |
- (1u << ARM::hGPRRegClassID) | (1u << ARM::tGPRRegClassID) |
- (1u << ARM::GPRnopc_and_hGPRRegClassID) |
- (1u << ARM::hGPR_and_rGPRRegClassID) | (1u << ARM::tcGPRRegClassID) |
- (1u << ARM::tGPR_and_tcGPRRegClassID) | (1u << ARM::GPRspRegClassID) |
- (1u << ARM::hGPR_and_tcGPRRegClassID),
- // Classes 32-63
- 0,
- // Classes 64-96
- 0,
- // FIXME: Some of the entries below this point can be safely removed once
- // this is tablegenerated. It's only needed because of the hardcoded
- // register class limit.
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
+enum PartialMappingIdx {
+ PMI_GPR,
+ PMI_SPR,
+ PMI_DPR,
+ PMI_Min = PMI_GPR,
+};
+
+RegisterBankInfo::PartialMapping PartMappings[]{
+ // GPR Partial Mapping
+ {0, 32, GPRRegBank},
+ // SPR Partial Mapping
+ {0, 32, FPRRegBank},
+ // DPR Partial Mapping
+ {0, 64, FPRRegBank},
};
-RegisterBank GPRRegBank(ARM::GPRRegBankID, "GPRB", 32, ARM::GPRCoverageData);
-RegisterBank *RegBanks[] = {&GPRRegBank};
+#ifndef NDEBUG
+static bool checkPartMapping(const RegisterBankInfo::PartialMapping &PM,
+ unsigned Start, unsigned Length,
+ unsigned RegBankID) {
+ return PM.StartIdx == Start && PM.Length == Length &&
+ PM.RegBank->getID() == RegBankID;
+}
+
+static void checkPartialMappings() {
+ assert(
+ checkPartMapping(PartMappings[PMI_GPR - PMI_Min], 0, 32, GPRRegBankID) &&
+ "Wrong mapping for GPR");
+ assert(
+ checkPartMapping(PartMappings[PMI_SPR - PMI_Min], 0, 32, FPRRegBankID) &&
+ "Wrong mapping for SPR");
+ assert(
+ checkPartMapping(PartMappings[PMI_DPR - PMI_Min], 0, 64, FPRRegBankID) &&
+ "Wrong mapping for DPR");
+}
+#endif
-RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank};
+enum ValueMappingIdx {
+ InvalidIdx = 0,
+ GPR3OpsIdx = 1,
+ SPR3OpsIdx = 4,
+ DPR3OpsIdx = 7,
+};
RegisterBankInfo::ValueMapping ValueMappings[] = {
- {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}};
+ // invalid
+ {nullptr, 0},
+ // 3 ops in GPRs
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ {&PartMappings[PMI_GPR - PMI_Min], 1},
+ // 3 ops in SPRs
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ {&PartMappings[PMI_SPR - PMI_Min], 1},
+ // 3 ops in DPRs
+ {&PartMappings[PMI_DPR - PMI_Min], 1},
+ {&PartMappings[PMI_DPR - PMI_Min], 1},
+ {&PartMappings[PMI_DPR - PMI_Min], 1}};
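+
+// Note on the indexing convention above: a three-operand instruction whose
+// values all live in one bank uses three consecutive entries, one per
+// operand; hence the *3OpsIdx values 1, 4 and 7 (stride 3), with slot 0
+// reserved for the invalid mapping.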
+
+#ifndef NDEBUG
+static bool checkValueMapping(const RegisterBankInfo::ValueMapping &VM,
+ RegisterBankInfo::PartialMapping *BreakDown) {
+ return VM.NumBreakDowns == 1 && VM.BreakDown == BreakDown;
+}
+
+static void checkValueMappings() {
+ assert(checkValueMapping(ValueMappings[GPR3OpsIdx],
+ &PartMappings[PMI_GPR - PMI_Min]) &&
+ "Wrong value mapping for 3 GPR ops instruction");
+ assert(checkValueMapping(ValueMappings[GPR3OpsIdx + 1],
+ &PartMappings[PMI_GPR - PMI_Min]) &&
+ "Wrong value mapping for 3 GPR ops instruction");
+ assert(checkValueMapping(ValueMappings[GPR3OpsIdx + 2],
+ &PartMappings[PMI_GPR - PMI_Min]) &&
+ "Wrong value mapping for 3 GPR ops instruction");
+
+ assert(checkValueMapping(ValueMappings[SPR3OpsIdx],
+ &PartMappings[PMI_SPR - PMI_Min]) &&
+ "Wrong value mapping for 3 SPR ops instruction");
+ assert(checkValueMapping(ValueMappings[SPR3OpsIdx + 1],
+ &PartMappings[PMI_SPR - PMI_Min]) &&
+ "Wrong value mapping for 3 SPR ops instruction");
+ assert(checkValueMapping(ValueMappings[SPR3OpsIdx + 2],
+ &PartMappings[PMI_SPR - PMI_Min]) &&
+ "Wrong value mapping for 3 SPR ops instruction");
+
+ assert(checkValueMapping(ValueMappings[DPR3OpsIdx],
+ &PartMappings[PMI_DPR - PMI_Min]) &&
+ "Wrong value mapping for 3 DPR ops instruction");
+ assert(checkValueMapping(ValueMappings[DPR3OpsIdx + 1],
+ &PartMappings[PMI_DPR - PMI_Min]) &&
+ "Wrong value mapping for 3 DPR ops instruction");
+ assert(checkValueMapping(ValueMappings[DPR3OpsIdx + 2],
+ &PartMappings[PMI_DPR - PMI_Min]) &&
+ "Wrong value mapping for 3 DPR ops instruction");
+}
+#endif
} // end namespace arm
} // end namespace llvm
ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
- : RegisterBankInfo(ARM::RegBanks, ARM::NumRegisterBanks) {
+ : ARMGenRegisterBankInfo() {
static bool AlreadyInit = false;
// We have only one set of register banks, whatever the subtarget
// is. Therefore, the initialization of the RegBanks table should be
@@ -97,6 +166,11 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
"Subclass not added?");
assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
+
+#ifndef NDEBUG
+ ARM::checkPartialMappings();
+ ARM::checkValueMappings();
+#endif
}
const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
@@ -105,8 +179,16 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
switch (RC.getID()) {
case GPRRegClassID:
+ case GPRnopcRegClassID:
+ case GPRspRegClassID:
case tGPR_and_tcGPRRegClassID:
+ case tGPRRegClassID:
return getRegBank(ARM::GPRRegBankID);
+ case SPR_8RegClassID:
+ case SPRRegClassID:
+ case DPR_8RegClassID:
+ case DPRRegClassID:
+ return getRegBank(ARM::FPRRegBankID);
default:
llvm_unreachable("Unsupported register kind");
}
@@ -114,37 +196,163 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
llvm_unreachable("Switch should handle all register classes");
}
-RegisterBankInfo::InstructionMapping
+const RegisterBankInfo::InstructionMapping &
ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
auto Opc = MI.getOpcode();
// Try the default logic for non-generic instructions that are either copies
// or already have some operands assigned to banks.
if (!isPreISelGenericOpcode(Opc)) {
- InstructionMapping Mapping = getInstrMappingImpl(MI);
+ const InstructionMapping &Mapping = getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
}
using namespace TargetOpcode;
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumOperands = MI.getNumOperands();
- const ValueMapping *OperandsMapping = &ARM::ValueMappings[0];
+ const ValueMapping *OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
switch (Opc) {
case G_ADD:
- case G_LOAD:
+ case G_SUB:
+ case G_MUL:
+ case G_AND:
+ case G_OR:
+ case G_XOR:
+ case G_SDIV:
+ case G_UDIV:
+ case G_SEXT:
+ case G_ZEXT:
+ case G_ANYEXT:
+ case G_TRUNC:
+ case G_GEP:
// FIXME: We're abusing the fact that everything lives in a GPR for now; in
// the real world we would use different mappings.
- OperandsMapping = &ARM::ValueMappings[0];
+ OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
+ break;
+ case G_LOAD:
+ case G_STORE: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ OperandsMapping =
+ Ty.getSizeInBits() == 64
+ ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]})
+ : &ARM::ValueMappings[ARM::GPR3OpsIdx];
break;
+ }
+ case G_FADD: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ assert((Ty.getSizeInBits() == 32 || Ty.getSizeInBits() == 64) &&
+ "Unsupported size for G_FADD");
+ OperandsMapping = Ty.getSizeInBits() == 64
+ ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+ : &ARM::ValueMappings[ARM::SPR3OpsIdx];
+ break;
+ }
+ case G_CONSTANT:
case G_FRAME_INDEX:
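+    // The second operand (the constant / frame index) is not a register, so
+    // it gets a nullptr mapping.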
- OperandsMapping = getOperandsMapping({&ARM::ValueMappings[0], nullptr});
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
+ break;
+ case G_SELECT: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ (void)Ty;
+ LLT Ty2 = MRI.getType(MI.getOperand(1).getReg());
+ (void)Ty2;
+ assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT");
+ assert(Ty2.getSizeInBits() == 1 && "Unsupported size for G_SELECT");
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]});
+ break;
+ }
+ case G_ICMP: {
+ LLT Ty2 = MRI.getType(MI.getOperand(2).getReg());
+ (void)Ty2;
+ assert(Ty2.getSizeInBits() == 32 && "Unsupported size for G_ICMP");
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr,
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]});
+ break;
+ }
+ case G_FCMP: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ (void)Ty;
+ LLT Ty1 = MRI.getType(MI.getOperand(2).getReg());
+ LLT Ty2 = MRI.getType(MI.getOperand(3).getReg());
+ (void)Ty2;
+ assert(Ty.getSizeInBits() == 1 && "Unsupported size for G_FCMP");
+ assert(Ty1.getSizeInBits() == Ty2.getSizeInBits() &&
+ "Mismatched operand sizes for G_FCMP");
+
+ unsigned Size = Ty1.getSizeInBits();
+ assert((Size == 32 || Size == 64) && "Unsupported size for G_FCMP");
+
+ auto FPRValueMapping = Size == 32 ? &ARM::ValueMappings[ARM::SPR3OpsIdx]
+ : &ARM::ValueMappings[ARM::DPR3OpsIdx];
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr,
+ FPRValueMapping, FPRValueMapping});
+ break;
+ }
+ case G_MERGE_VALUES: {
+ // We only support G_MERGE_VALUES for creating a double precision floating
+ // point value out of two GPRs.
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ LLT Ty1 = MRI.getType(MI.getOperand(1).getReg());
+ LLT Ty2 = MRI.getType(MI.getOperand(2).getReg());
+ if (Ty.getSizeInBits() != 64 || Ty1.getSizeInBits() != 32 ||
+ Ty2.getSizeInBits() != 32)
+ return getInvalidInstructionMapping();
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx]});
+ break;
+ }
+ case G_UNMERGE_VALUES: {
+ // We only support G_UNMERGE_VALUES for splitting a double precision
+ // floating point value into two GPRs.
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ LLT Ty1 = MRI.getType(MI.getOperand(1).getReg());
+ LLT Ty2 = MRI.getType(MI.getOperand(2).getReg());
+ if (Ty.getSizeInBits() != 32 || Ty1.getSizeInBits() != 32 ||
+ Ty2.getSizeInBits() != 64)
+ return getInvalidInstructionMapping();
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::GPR3OpsIdx],
+ &ARM::ValueMappings[ARM::DPR3OpsIdx]});
+ break;
+ }
+ case G_BR:
+ OperandsMapping = getOperandsMapping({nullptr});
+ break;
+ case G_BRCOND:
+ OperandsMapping =
+ getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
break;
default:
- return InstructionMapping{};
+ return getInvalidInstructionMapping();
}
- return InstructionMapping{DefaultMappingID, /*Cost=*/1, OperandsMapping,
- NumOperands};
+#ifndef NDEBUG
+ for (unsigned i = 0; i < NumOperands; i++) {
+ for (const auto &Mapping : OperandsMapping[i]) {
+ assert(
+ (Mapping.RegBank->getID() != ARM::FPRRegBankID ||
+ MF.getSubtarget<ARMSubtarget>().hasVFP2()) &&
+ "Trying to use floating point register bank on target without vfp");
+ }
+ }
+#endif
+
+ return getInstructionMapping(DefaultMappingID, /*Cost=*/1, OperandsMapping,
+ NumOperands);
}
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
index 773920e..9650b35 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -16,26 +16,28 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "ARMGenRegisterBank.inc"
+
namespace llvm {
class TargetRegisterInfo;
-namespace ARM {
-enum {
- GPRRegBankID = 0, // General purpose registers
- NumRegisterBanks,
+class ARMGenRegisterBankInfo : public RegisterBankInfo {
+#define GET_TARGET_REGBANK_CLASS
+#include "ARMGenRegisterBank.inc"
};
-} // end namespace ARM
/// This class provides the information for the target register banks.
-class ARMRegisterBankInfo final : public RegisterBankInfo {
+class ARMRegisterBankInfo final : public ARMGenRegisterBankInfo {
public:
ARMRegisterBankInfo(const TargetRegisterInfo &TRI);
const RegisterBank &
getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
- InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
};
} // End llvm namespace.
#endif
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td b/contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td
new file mode 100644
index 0000000..7cd2d60
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterBanks.td
@@ -0,0 +1,14 @@
+//=- ARMRegisterBanks.td - Describe the ARM Register Banks --*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the register banks available on ARM.
+//
+//===----------------------------------------------------------------------===//
+
+def GPRRegBank : RegisterBank<"GPRB", [GPR, GPRwithAPSR]>;
+def FPRRegBank : RegisterBank<"FPRB", [SPR, DPR]>;
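+
+// Only the widest classes need to be listed; coverage of their subclasses
+// (tGPR, SPR_8, DPR_8, ...) is derived by TableGen.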
diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 02cbfb1..b10583b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -245,6 +245,10 @@ def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
// the general GPR register class above (MOV, e.g.)
def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)>;
+// Thumb registers R0-R7 and the PC. Some instructions like TBB or TBH allow
+// the PC to be used as a destination operand as well.
+def tGPRwithpc : RegisterClass<"ARM", [i32], 32, (add tGPR, PC)>;
+
// The high registers in thumb mode, R8-R15.
def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
diff --git a/contrib/llvm/lib/Target/ARM/ARMSchedule.td b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
index b7d2d34..53e012f 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSchedule.td
+++ b/contrib/llvm/lib/Target/ARM/ARMSchedule.td
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Instruction scheduling annotations for out-of-order CPUs.
+// Instruction scheduling annotations for in-order and out-of-order CPUs.
// These annotations are independent of the itinerary class defined below.
// Here we define the subtarget independent read/write per-operand resources.
// The subtarget schedule definitions will then map these to the subtarget's
@@ -54,6 +54,9 @@
// }
// def : ReadAdvance<ReadAdvanceALUsr, 3>;
+//===----------------------------------------------------------------------===//
+// Sched definitions for integer pipeline instructions
+//
// Basic ALU operation.
def WriteALU : SchedWrite;
def ReadALU : SchedRead;
@@ -69,29 +72,84 @@ def WriteCMP : SchedWrite;
def WriteCMPsi : SchedWrite;
def WriteCMPsr : SchedWrite;
-// Division.
-def WriteDiv : SchedWrite;
+// Multiplies.
+def WriteMUL16 : SchedWrite; // 16-bit multiply.
+def WriteMUL32 : SchedWrite; // 32-bit multiply.
+def WriteMUL64Lo : SchedWrite; // 64-bit result. Low reg.
+def WriteMUL64Hi : SchedWrite; // 64-bit result. High reg.
+def ReadMUL : SchedRead;
+
+// Multiply-accumulates.
+def WriteMAC16 : SchedWrite; // 16-bit mac.
+def WriteMAC32 : SchedWrite; // 32-bit mac.
+def WriteMAC64Lo : SchedWrite; // 64-bit mac. Low reg.
+def WriteMAC64Hi : SchedWrite; // 64-bit mac. High reg.
+def ReadMAC : SchedRead;
+
+// Divisions.
+def WriteDIV : SchedWrite;
-// Loads.
+// Loads/Stores.
def WriteLd : SchedWrite;
def WritePreLd : SchedWrite;
+def WriteST : SchedWrite;
// Branches.
def WriteBr : SchedWrite;
def WriteBrL : SchedWrite;
def WriteBrTbl : SchedWrite;
-// Fixpoint conversions.
-def WriteCvtFP : SchedWrite;
-
// Noop.
def WriteNoop : SchedWrite;
+//===----------------------------------------------------------------------===//
+// Sched definitions for floating-point and neon instructions
+//
+// Floating point conversions
+def WriteFPCVT : SchedWrite;
+def WriteFPMOV : SchedWrite; // FP -> GPR and vice-versa
+
+// ALU operations (32/64-bit)
+def WriteFPALU32 : SchedWrite;
+def WriteFPALU64 : SchedWrite;
+
+// Multiplication
+def WriteFPMUL32 : SchedWrite;
+def WriteFPMUL64 : SchedWrite;
+def ReadFPMUL : SchedRead; // multiplier read
+def ReadFPMAC : SchedRead; // accumulator read
+
+// Multiply-accumulate
+def WriteFPMAC32 : SchedWrite;
+def WriteFPMAC64 : SchedWrite;
+
+// Division
+def WriteFPDIV32 : SchedWrite;
+def WriteFPDIV64 : SchedWrite;
+
+// Square-root
+def WriteFPSQRT32 : SchedWrite;
+def WriteFPSQRT64 : SchedWrite;
+
+// Vector load and stores
+def WriteVLD1 : SchedWrite;
+def WriteVLD2 : SchedWrite;
+def WriteVLD3 : SchedWrite;
+def WriteVLD4 : SchedWrite;
+def WriteVST1 : SchedWrite;
+def WriteVST2 : SchedWrite;
+def WriteVST3 : SchedWrite;
+def WriteVST4 : SchedWrite;
+
+
// Define TII for use in SchedVariant Predicates.
def : PredicateProlog<[{
const ARMBaseInstrInfo *TII =
static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
(void)TII;
+ const ARMSubtarget *STI =
+ static_cast<const ARMSubtarget*>(SchedModel->getSubtargetInfo());
+ (void)STI;
}]>;
def IsPredicatedPred : SchedPredicate<[{TII->isPredicated(*MI)}]>;
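+
+// STI makes subtarget checks available in predicates as well; for example, a
+// (hypothetical) predicate keyed on the Swift subtarget could be written as:
+// def IsSwiftPred : SchedPredicate<[{STI->isSwift()}]>;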
@@ -365,3 +423,5 @@ include "ARMScheduleA8.td"
include "ARMScheduleA9.td"
include "ARMScheduleSwift.td"
include "ARMScheduleR52.td"
+include "ARMScheduleA57.td"
+include "ARMScheduleM3.td"
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA57.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA57.td
new file mode 100644
index 0000000..525079d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA57.td
@@ -0,0 +1,1471 @@
+//=- ARMScheduleA57.td - ARM Cortex-A57 Scheduling Defs -----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for ARM Cortex-A57 to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// *** Common description and scheduling model parameters taken from AArch64 ***
+// The Cortex-A57 is a traditional superscalar microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with the
+// much wider out-of-order issue stage, this produced a need to carefully
+// schedule micro-ops so that all three decoded each cycle are successfully
+// issued as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to the narrower of the two at three, while still
+// modeling the machine as out-of-order.
+
+def IsCPSRDefinedPred : SchedPredicate<[{TII->isCPSRDefined(*MI)}]>;
+def IsCPSRDefinedAndPredicatedPred :
+ SchedPredicate<[{TII->isCPSRDefined(*MI) && TII->isPredicated(*MI)}]>;
+
+// Cortex-A57 rev. r1p0 or later (false = r0px)
+def IsR1P0AndLaterPred : SchedPredicate<[{false}]>;
+
+// If Addrmode3 contains register offset (not immediate)
+def IsLdrAm3RegOffPred :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 1)}]>;
+// The same predicate with operand offset 2 and 3:
+def IsLdrAm3RegOffPredX2 :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 2)}]>;
+def IsLdrAm3RegOffPredX3 :
+ SchedPredicate<[{!TII->isAddrMode3OpImm(*MI, 3)}]>;
+
+// If Addrmode3 contains "minus register"
+def IsLdrAm3NegRegOffPred :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 1)}]>;
+// The same predicate with operand offset 2 and 3:
+def IsLdrAm3NegRegOffPredX2 :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 2)}]>;
+def IsLdrAm3NegRegOffPredX3 :
+ SchedPredicate<[{TII->isAddrMode3OpMinusReg(*MI, 3)}]>;
+
+// Load, scaled register offset, not plus LSL2
+def IsLdstsoScaledNotOptimalPredX0 :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 0)}]>;
+def IsLdstsoScaledNotOptimalPred :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 1)}]>;
+def IsLdstsoScaledNotOptimalPredX2 :
+ SchedPredicate<[{TII->isLdstScaledRegNotPlusLsl2(*MI, 2)}]>;
+
+// Load, scaled register offset
+def IsLdstsoScaledPred :
+ SchedPredicate<[{TII->isLdstScaledReg(*MI, 1)}]>;
+def IsLdstsoScaledPredX2 :
+ SchedPredicate<[{TII->isLdstScaledReg(*MI, 2)}]>;
+
+def IsLdstsoMinusRegPredX0 :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 0)}]>;
+def IsLdstsoMinusRegPred :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 1)}]>;
+def IsLdstsoMinusRegPredX2 :
+ SchedPredicate<[{TII->isLdstSoMinusReg(*MI, 2)}]>;
+
+// Load, scaled register offset
+def IsLdrAm2ScaledPred :
+ SchedPredicate<[{TII->isAm2ScaledReg(*MI, 1)}]>;
+
+// LDM, base reg in list
+def IsLdmBaseRegInList :
+ SchedPredicate<[{TII->isLDMBaseRegInList(*MI)}]>;
+
+class A57WriteLMOpsListType<list<SchedWriteRes> writes> {
+  list<SchedWriteRes> Writes = writes;
+ SchedMachineModel SchedModel = ?;
+}
+
+// *** Common description and scheduling model parameters taken from AArch64 ***
+// (AArch64SchedA57.td)
+def CortexA57Model : SchedMachineModel {
+ let IssueWidth = 3; // 3-way decode and dispatch
+ let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
+ let LoadLatency = 4; // Optimistic load latency
+ let MispredictPenalty = 16; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling.
+ let LoopMicroOpBufferSize = 16;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Cortex-A57.
+// Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
+// micro-ops wait for their operands and then issue out-of-order.
+
+def A57UnitB : ProcResource<1>; // Type B micro-ops
+def A57UnitI : ProcResource<2>; // Type I micro-ops
+def A57UnitM : ProcResource<1>; // Type M micro-ops
+def A57UnitL : ProcResource<1>; // Type L micro-ops
+def A57UnitS : ProcResource<1>; // Type S micro-ops
+
+def A57UnitX : ProcResource<1>; // Type X micro-ops (F1)
+def A57UnitW : ProcResource<1>; // Type W micro-ops (F0)
+
+let SchedModel = CortexA57Model in {
+ def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
+}
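+
+// Writes tagged "1V" may issue on either FP/ASIMD pipeline (X or W), whereas
+// "1X" and "1W" writes are pinned to a specific one.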
+
+let SchedModel = CortexA57Model in {
+
+//===----------------------------------------------------------------------===//
+// Define customized scheduler read/write types specific to the Cortex-A57.
+
+include "ARMScheduleA57WriteRes.td"
+
+// To have "CompleteModel = 1", support of pseudos and special instructions
+def : InstRW<[WriteNoop], (instregex "(t)?BKPT$", "(t2)?CDP(2)?$",
+ "(t2)?CLREX$", "CONSTPOOL_ENTRY$", "COPY_STRUCT_BYVAL_I32$",
+ "(t2)?CPS[123]p$", "(t2)?DBG$", "(t2)?DMB$", "(t2)?DSB$", "ERET$",
+ "(t2|t)?HINT$", "(t)?HLT$", "(t2)?HVC$", "(t2)?ISB$", "ITasm$",
+ "(t2)?RFE(DA|DB|IA|IB)", "(t)?SETEND", "(t2)?SETPAN", "(t2)?SMC", "SPACE",
+ "(t2)?SRS(DA|DB|IA|IB)", "SWP(B)?", "t?TRAP", "UDF$", "t2DCPS", "t2SG",
+ "t2TT", "tCPS", "CMP_SWAP", "t?SVC", "t2IT", "CompilerBarrier")>;
+
+def : InstRW<[WriteNoop], (instregex "VMRS", "VMSR", "FMSTAT")>;
+
+// Specific memory instrs
+def : InstRW<[WriteNoop, WriteNoop], (instregex "(t2)?LDA", "(t2)?LDC", "(t2)?STC",
+ "(t2)?STL", "(t2)?LDREX", "(t2)?STREX", "MEMCPY")>;
+
+// coprocessor moves
+def : InstRW<[WriteNoop, WriteNoop], (instregex
+ "(t2)?MCR(2|R|R2)?$", "(t2)?MRC(2)?$",
+ "(t2)?MRRC(2)?$", "(t2)?MRS(banked|sys|_AR|_M|sys_AR)?$",
+ "(t2)?MSR(banked|i|_AR|_M)?$")>;
+
+// Deprecated instructions
+def : InstRW<[WriteNoop], (instregex "FLDM", "FSTM")>;
+
+// Pseudos
+def : InstRW<[WriteNoop], (instregex "(t2)?ABS$",
+ "(t)?ADJCALLSTACKDOWN$", "(t)?ADJCALLSTACKUP$", "(t2|t)?Int_eh_sjlj",
+ "tLDRpci_pic", "t2SUBS_PC_LR",
+ "JUMPTABLE", "tInt_WIN_eh_sjlj_longjmp",
+ "VLD(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VLD(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(1|2)LN(d|q)(WB_fixed_|WB_register_)?Asm",
+ "VST(3|4)(DUP|LN)?(d|q)(WB_fixed_|WB_register_)?Asm",
+ "WIN__CHKSTK", "WIN__DBZCHK")>;
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_1cyc_1I], (instrs COPY)>;
+
+// --- 3.2 Branch Instructions ---
+// B, BX, BL, BLX (imm, reg != LR, reg == LR), CBZ, CBNZ
+
+def : InstRW<[A57Write_1cyc_1B], (instregex "(t2|t)?B$", "t?BX", "(t2|t)?Bcc$",
+ "t?TAILJMP(d|r)", "TCRETURN(d|r)i", "tBfar", "tCBN?Z")>;
+def : InstRW<[A57Write_1cyc_1B_1I],
+ (instregex "t?BL$", "BL_pred$", "t?BLXi", "t?TPsoft")>;
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BLX", "tBLX(NS)?r")>;
+// Pseudos
+def : InstRW<[A57Write_2cyc_1B_1I], (instregex "BCCi64", "BCCZi64")>;
+def : InstRW<[A57Write_3cyc_1B_1I], (instregex "BR_JTadd", "t?BR_JTr",
+ "t2BR_JT", "t2BXJ", "(t2)?TB(B|H)(_JT)?$", "tBRIND")>;
+def : InstRW<[A57Write_6cyc_1B_1L], (instregex "BR_JTm")>;
+
+// --- 3.3 Arithmetic and Logical Instructions ---
+// ADD{S}, ADC{S}, ADR, AND{S}, BIC{S}, CMN, CMP, EOR{S}, ORN{S}, ORR{S},
+// RSB{S}, RSC{S}, SUB{S}, SBC{S}, TEQ, TST
+
+def : InstRW<[A57Write_1cyc_1I], (instregex "tADDframe")>;
+
+// shift by register, conditional or unconditional
+// TODO: according to the doc, conditional uses I0/I1, unconditional uses M
+// Why would the more complex instruction use the simpler pipeline?
+// This may be an error in the doc.
+def A57WriteALUsi : SchedWriteVariant<[
+ // lsl #2, lsl #1, or lsr #1.
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57WriteALUSsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def A57ReadALUsr : SchedReadVariant<[
+ SchedVar<IsPredicatedPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>
+]>;
+def : SchedAlias<WriteALUsi, A57WriteALUsi>;
+def : SchedAlias<WriteALUsr, A57WriteALUsr>;
+def : SchedAlias<WriteALUSsr, A57WriteALUSsr>;
+def : SchedAlias<ReadALUsr, A57ReadALUsr>;
+
+def A57WriteCMPsr : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : SchedAlias<WriteCMP, A57Write_1cyc_1I>;
+def : SchedAlias<WriteCMPsi, A57Write_2cyc_1M>;
+def : SchedAlias<WriteCMPsr, A57WriteCMPsr>;
+
+// --- 3.4 Move and Shift Instructions ---
+// Move, basic
+// MOV{S}, MOVW, MVN{S}
+def : InstRW<[A57Write_1cyc_1I], (instregex "MOV(r|i|i16|r_TC)",
+ "(t2)?MVN(CC)?(r|i)", "BMOVPCB_CALL", "BMOVPCRX_CALL",
+ "MOVCC(r|i|i16|i32imm)", "tMOV", "tMVN")>;
+
+// Move, shift by immed, setflags/no setflags
+// (ASR, LSL, LSR, ROR, RRX)=MOVsi, MVN
+// setflags = isCPSRDefined
+def A57WriteMOVsi : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsi], (instregex "MOV(CC)?si", "MVNsi",
+ "ASRi", "(t2|t)ASRri", "LSRi", "(t2|t)LSRri", "LSLi", "(t2|t)LSLri", "RORi",
+ "(t2|t)RORri", "(t2)?RRX", "t2MOV", "tROR")>;
+
+// shift by register, conditional or unconditional, setflags/no setflags
+def A57WriteMOVsr : SchedWriteVariant<[
+ SchedVar<IsCPSRDefinedAndPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<IsCPSRDefinedPred, [A57Write_2cyc_1M]>,
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteMOVsr], (instregex "MOV(CC)?sr", "MVNsr", "t2MVNs",
+ "ASRr", "(t2|t)ASRrr", "LSRr", "(t2|t)LSRrr", "LSLr", "(t2|t)?LSLrr", "RORr",
+ "(t2|t)RORrr")>;
+
+// Move, top
+// MOVT - A57Write_2cyc_1M for r0px, A57Write_1cyc_1I for r1p0 and later
+def A57WriteMOVT : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_1cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1M]>
+]>;
+def : InstRW<[A57WriteMOVT], (instregex "MOVTi16")>;
+
+def A57WriteI2pc :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_1cyc_1I]>;
+def A57WriteI2ld :
+ WriteSequence<[A57Write_1cyc_1I, A57Write_1cyc_1I, A57Write_4cyc_1L]>;
+def : InstRW< [A57WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A57WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
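+
+// A WriteSequence chains its writes, so the modeled latency is the sum of the
+// parts: 1 + 1 + 1 = 3 cycles for A57WriteI2pc and 1 + 1 + 4 = 6 cycles for
+// A57WriteI2ld.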
+
+// +2cyc for branch forms
+def : InstRW<[A57Write_3cyc_1I], (instregex "MOVPC(LR|RX)")>;
+
+// --- 3.5 Divide and Multiply Instructions ---
+// Divide: SDIV, UDIV
+// latency from documentation: 4-20; the maximum is taken
+def : SchedAlias<WriteDIV, A57Write_20cyc_1M>;
+// Multiply: tMul not bound to common WriteRes types
+def : InstRW<[A57Write_3cyc_1M], (instregex "tMUL")>;
+def : SchedAlias<WriteMUL16, A57Write_3cyc_1M>;
+def : SchedAlias<WriteMUL32, A57Write_3cyc_1M>;
+def : ReadAdvance<ReadMUL, 0>;
+
+// Multiply accumulate: MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB,
+// SMLAWT, SMLAD{X}, SMLSD{X}, SMMLA{R}, SMMLS{R}
+// Multiply-accumulate pipelines support late-forwarding of accumulate operands
+// from similar μops, allowing a typical sequence of multiply-accumulate μops
+// to issue one every cycle (sched advance = 2).
+def A57WriteMLA : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57WriteMLAL : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57ReadMLA : SchedReadAdvance<2, [A57WriteMLA, A57WriteMLAL]>;
+
+def : SchedAlias<WriteMAC16, A57WriteMLA>;
+def : SchedAlias<WriteMAC32, A57WriteMLA>;
+def : SchedAlias<ReadMAC, A57ReadMLA>;
+
+def : SchedAlias<WriteMAC64Lo, A57WriteMLAL>;
+def : SchedAlias<WriteMAC64Hi, A57WriteMLAL>;
+
+// Multiply long: SMULL, UMULL
+def : SchedAlias<WriteMUL64Lo, A57Write_4cyc_1M>;
+def : SchedAlias<WriteMUL64Hi, A57Write_4cyc_1M>;
+
+// --- 3.6 Saturating and Parallel Arithmetic Instructions ---
+// Parallel arith
+// SADD16, SADD8, SSUB16, SSUB8, UADD16, UADD8, USUB16, USUB8
+// Conditional GE-setting instructions require three extra μops
+// and two additional cycles to conditionally update the GE field.
+def A57WriteParArith : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_4cyc_1I_1M]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1I_1M]>
+]>;
+def : InstRW< [A57WriteParArith], (instregex
+ "(t2)?SADD(16|8)", "(t2)?SSUB(16|8)",
+ "(t2)?UADD(16|8)", "(t2)?USUB(16|8)")>;
+
+// Parallel arith with exchange: SASX, SSAX, UASX, USAX
+def A57WriteParArithExch : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_5cyc_1I_1M]>,
+ SchedVar<NoSchedPred, [A57Write_3cyc_1I_1M]>
+]>;
+def : InstRW<[A57WriteParArithExch],
+ (instregex "(t2)?SASX", "(t2)?SSAX", "(t2)?UASX", "(t2)?USAX")>;
+
+// Parallel halving arith
+// SHADD16, SHADD8, SHSUB16, SHSUB8, UHADD16, UHADD8, UHSUB16, UHSUB8
+def : InstRW<[A57Write_2cyc_1M], (instregex
+ "(t2)?SHADD(16|8)", "(t2)?SHSUB(16|8)",
+ "(t2)?UHADD(16|8)", "(t2)?UHSUB(16|8)")>;
+
+// Parallel halving arith with exchange
+// SHASX, SHSAX, UHASX, UHSAX
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?SHASX", "(t2)?SHSAX",
+ "(t2)?UHASX", "(t2)?UHSAX")>;
+
+// Parallel saturating arith
+// QADD16, QADD8, QSUB16, QSUB8, UQADD16, UQADD8, UQSUB16, UQSUB8
+def : InstRW<[A57Write_2cyc_1M], (instregex "QADD(16|8)", "QSUB(16|8)",
+ "UQADD(16|8)", "UQSUB(16|8)", "t2(U?)QADD", "t2(U?)QSUB")>;
+
+// Parallel saturating arith with exchange
+// QASX, QSAX, UQASX, UQSAX
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QASX", "(t2)?QSAX",
+ "(t2)?UQASX", "(t2)?UQSAX")>;
+
+// Saturate: SSAT, SSAT16, USAT, USAT16
+def : InstRW<[A57Write_2cyc_1M],
+ (instregex "(t2)?SSAT(16)?", "(t2)?USAT(16)?")>;
+
+// Saturating arith: QADD, QSUB
+def : InstRW<[A57Write_2cyc_1M], (instregex "QADD$", "QSUB$")>;
+
+// Saturating doubling arith: QDADD, QDSUB
+def : InstRW<[A57Write_3cyc_1I_1M], (instregex "(t2)?QDADD", "(t2)?QDSUB")>;
+
+// --- 3.7 Miscellaneous Data-Processing Instructions ---
+// Bit field extract: SBFX, UBFX
+def : InstRW<[A57Write_1cyc_1I], (instregex "(t2)?SBFX", "(t2)?UBFX")>;
+
+// Bit field insert/clear: BFI, BFC
+def : InstRW<[A57Write_2cyc_1M], (instregex "(t2)?BFI", "(t2)?BFC")>;
+
+// Select bytes, conditional/unconditional
+def A57WriteSEL : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_2cyc_1I]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1I]>
+]>;
+def : InstRW<[A57WriteSEL], (instregex "(t2)?SEL")>;
+
+// Sign/zero extend, normal: SXTB, SXTH, UXTB, UXTH
+def : InstRW<[A57Write_1cyc_1I],
+ (instregex "(t2|t)?SXT(B|H)$", "(t2|t)?UXT(B|H)$")>;
+
+// Sign/zero extend and add, normal: SXTAB, SXTAH, UXTAB, UXTAH
+def : InstRW<[A57Write_2cyc_1M],
+ (instregex "(t2)?SXTA(B|H)$", "(t2)?UXTA(B|H)$")>;
+
+// Sign/zero extend and add, parallel: SXTAB16, UXTAB16
+def : InstRW<[A57Write_4cyc_1M], (instregex "(t2)?SXTAB16", "(t2)?UXTAB16")>;
+
+// Sum of absolute differences: USAD8, USADA8
+def : InstRW<[A57Write_3cyc_1M], (instregex "(t2)?USAD8", "(t2)?USADA8")>;
+
+// --- 3.8 Load Instructions ---
+
+// Load, immed offset
+// LDR and LDRB have LDRi12 and LDRBi12 forms for immediate
+def : InstRW<[A57Write_4cyc_1L], (instregex "LDRi12", "LDRBi12",
+ "LDRcp", "(t2|t)?LDRConstPool", "LDRLIT_ga_(pcrel|abs)",
+ "PICLDR", "tLDR")>;
+
+def : InstRW<[A57Write_4cyc_1L],
+ (instregex "t2LDRS?(B|H)?(pcrel|T|i8|i12|pci|pci_pic|s)?$")>;
+
+// For "Load, register offset, minus" we need +1cyc, +1I
+def A57WriteLdrAm3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAm3], (instregex "LDR(H|SH|SB)$")>;
+def A57WriteLdrAm3X2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAm3X2, A57WriteLdrAm3X2], (instregex "LDRD$")>;
+def : InstRW<[A57Write_4cyc_1L, A57Write_4cyc_1L], (instregex "t2LDRDi8")>;
+
+def A57WriteLdrAmLDSTSO : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<IsLdstsoMinusRegPred, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WriteLdrAmLDSTSO], (instregex "LDRrs", "LDRBrs")>;
+
+def A57WrBackOne : SchedWriteRes<[]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+def A57WrBackTwo : SchedWriteRes<[]> {
+ let Latency = 2;
+ let NumMicroOps = 0;
+}
+def A57WrBackThree : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
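+
+// These writeback "writes" occupy no pipeline resources (NumMicroOps = 0);
+// they only model the latency at which the updated base register becomes
+// available.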
+
+// --- LDR pre-indexed ---
+// Load, immed pre-indexed (4 cyc for load result, 1 cyc for Base update)
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR_PRE_IMM",
+ "LDRB_PRE_IMM", "t2LDRB_PRE")>;
+
+// Load, register pre-indexed (4 cyc for load result, 2 cyc for Base update)
+// (5 cyc load result for not-lsl2 scaled)
+def A57WriteLdrAmLDSTSOPre : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def : InstRW<[A57WriteLdrAmLDSTSOPre, A57WrBackTwo],
+ (instregex "LDR_PRE_REG", "LDRB_PRE_REG")>;
+
+def A57WriteLdrAm3PreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57Write_4cyc_1L, A57WriteLdrAm3PreWrBack],
+ (instregex "LDR(H|SH|SB)_PRE")>;
+def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
+ (instregex "t2LDR(H|SH|SB)?_PRE")>;
+
+// LDRD pre-indexed: 5(2) cyc for reg, 4(1) cyc for imm.
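+// Here "N(M)" reads as: N cycles for the loaded result, M cycles for the
+// base-register writeback.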
+def A57WriteLdrDAm3Pre : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def A57WriteLdrDAm3PreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteLdrDAm3Pre, A57WriteLdrDAm3Pre, A57WriteLdrDAm3PreWrBack],
+ (instregex "LDRD_PRE")>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
+ (instregex "t2LDRD_PRE")>;
+
+// --- LDR post-indexed ---
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackOne], (instregex "LDR(T?)_POST_IMM",
+ "LDRB(T?)_POST_IMM", "LDR(SB|H|SH)Ti", "t2LDRB_POST")>;
+
+def A57WriteLdrAm3PostWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPred, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57WriteLdrAm3PostWrBack],
+ (instregex "LDR(H|SH|SB)_POST")>;
+def : InstRW<[A57Write_4cyc_1L, A57WrBackOne],
+ (instregex "t2LDR(H|SH|SB)?_POST")>;
+
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR_POST_REG",
+ "LDRB_POST_REG", "LDR(B?)T_POST$")>;
+
+def A57WriteLdrTRegPost : SchedWriteVariant<[
+ SchedVar<IsLdrAm2ScaledPred, [A57Write_4cyc_1I_1L_1M]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L_1I]>
+]>;
+def A57WriteLdrTRegPostWrBack : SchedWriteVariant<[
+ SchedVar<IsLdrAm2ScaledPred, [A57WrBackThree]>,
+ SchedVar<NoSchedPred, [A57WrBackTwo]>
+]>;
+// 4(3) "I0/I1,L,M" for scaled register, otherwise 4(2) "I0/I1,L"
+def : InstRW<[A57WriteLdrTRegPost, A57WriteLdrTRegPostWrBack],
+ (instregex "LDRT_POST_REG", "LDRBT_POST_REG")>;
+
+def : InstRW<[A57Write_4cyc_1L_1I, A57WrBackTwo], (instregex "LDR(SB|H|SH)Tr")>;
+
+def A57WriteLdrAm3PostWrBackX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3RegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+// LDRD post-indexed: 4(2) cyc for reg, 4(1) cyc for imm.
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57WriteLdrAm3PostWrBackX3], (instregex "LDRD_POST")>;
+def : InstRW<[A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I, A57WrBackOne],
+ (instregex "t2LDRD_POST")>;
+
+// --- Preload instructions ---
+// Preload, immed offset
+def : InstRW<[A57Write_4cyc_1L], (instregex "(t2)?PLDi12", "(t2)?PLDWi12",
+ "t2PLDW?(i8|pci|s)", "(t2)?PLI")>;
+
+// Preload, register offset,
+// 5cyc "I0/I1,L" for minus reg or scaled not plus lsl2
+// otherwise 4cyc "L"
+def A57WritePLD : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX0, [A57Write_5cyc_1I_1L]>,
+ SchedVar<IsLdstsoMinusRegPredX0, [A57Write_5cyc_1I_1L]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1L]>
+]>;
+def : InstRW<[A57WritePLD], (instregex "PLDrs", "PLDWrs")>;
+
+// --- Load multiple instructions ---
+foreach NumAddr = 1-8 in {
+ def A57LMAddrPred#NumAddr :
+ SchedPredicate<"(TII->getLDMVariableDefsSize(*MI)+1)/2 == "#NumAddr>;
+}
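+
+// The predicate pairs up the loaded registers, presumably because the load
+// unit returns two 32-bit registers per cycle: an LDM of 4 registers gives
+// (4 + 1) / 2 = 2 and selects A57LMAddrPred2.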
+
+def A57LDMOpsListNoregin : A57WriteLMOpsListType<
+ [A57Write_3cyc_1L, A57Write_3cyc_1L,
+ A57Write_4cyc_1L, A57Write_4cyc_1L,
+ A57Write_5cyc_1L, A57Write_5cyc_1L,
+ A57Write_6cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_7cyc_1L,
+ A57Write_8cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_9cyc_1L,
+ A57Write_10cyc_1L, A57Write_10cyc_1L]>;
+def A57WriteLDMnoreginlist : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsListNoregin.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsListNoregin.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsListNoregin.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsListNoregin.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsListNoregin.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsListNoregin.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsListNoregin.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsListNoregin.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57LDMOpsListNoregin.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57LDMOpsListRegin : A57WriteLMOpsListType<
+ [A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I]>;
+def A57WriteLDMreginlist : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsListRegin.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsListRegin.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsListRegin.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsListRegin.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsListRegin.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsListRegin.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsListRegin.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsListRegin.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57LDMOpsListRegin.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57LDMOpsList_Upd : A57WriteLMOpsListType<
+ [A57WrBackOne,
+ A57Write_3cyc_1L_1I, A57Write_3cyc_1L_1I,
+ A57Write_4cyc_1L_1I, A57Write_4cyc_1L_1I,
+ A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I]>;
+def A57WriteLDM_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57LDMOpsList_Upd.Writes[0-2]>,
+ SchedVar<A57LMAddrPred2, A57LDMOpsList_Upd.Writes[0-4]>,
+ SchedVar<A57LMAddrPred3, A57LDMOpsList_Upd.Writes[0-6]>,
+ SchedVar<A57LMAddrPred4, A57LDMOpsList_Upd.Writes[0-8]>,
+ SchedVar<A57LMAddrPred5, A57LDMOpsList_Upd.Writes[0-10]>,
+ SchedVar<A57LMAddrPred6, A57LDMOpsList_Upd.Writes[0-12]>,
+ SchedVar<A57LMAddrPred7, A57LDMOpsList_Upd.Writes[0-14]>,
+ SchedVar<A57LMAddrPred8, A57LDMOpsList_Upd.Writes[0-16]>,
+ SchedVar<NoSchedPred, A57LDMOpsList_Upd.Writes[0-16]>
+]> { let Variadic=1; }
+
+def A57WriteLDM : SchedWriteVariant<[
+ SchedVar<IsLdmBaseRegInList, [A57WriteLDMreginlist]>,
+ SchedVar<NoSchedPred, [A57WriteLDMnoreginlist]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WriteLDM], (instregex "(t|t2|sys)?LDM(IA|DA|DB|IB)$")>;
+
+// TODO: no writeback latency defined in documentation (implemented as 1 cyc)
+def : InstRW<[A57WriteLDM_Upd],
+ (instregex "(t|t2|sys)?LDM(IA_UPD|DA_UPD|DB_UPD|IB_UPD|IA_RET)", "tPOP")>;
+
+// --- 3.9 Store Instructions ---
+
+// Store, immed offset
+def : InstRW<[A57Write_1cyc_1S], (instregex "STRi12", "STRBi12", "PICSTR",
+ "t2STR(B?)(T|i12|i8|s)", "t2STRDi8", "t2STRH(i12|i8|s)", "tSTR")>;
+
+// Store, register offset
+// For minus or for not plus lsl2 scaled we need 3cyc "I0/I1, S",
+// otherwise 1cyc S.
+def A57WriteStrAmLDSTSO : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoMinusRegPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAmLDSTSO], (instregex "STRrs", "STRBrs")>;
+
+// STRH, STRD: 3cyc "I0/I1, S" for minus reg, 1cyc S for imm or for plus reg.
+def A57WriteStrAm3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPred, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAm3], (instregex "STRH$")>;
+def A57WriteStrAm3X2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S]>
+]>;
+def : InstRW<[A57WriteStrAm3X2], (instregex "STRD$")>;
+
+// Store, immed pre-indexed (1cyc "S, I0/I1", 1cyc writeback)
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR_PRE_IMM",
+ "STRB_PRE_IMM", "STR(B)?(r|i)_preidx", "(t2)?STRH_(preidx|PRE)",
+ "t2STR(B?)_(PRE|preidx)", "t2STRD_PRE")>;
+
+// Store, register pre-indexed:
+// 1(1) "S, I0/I1" for plus reg
+// 3(2) "I0/I1, S" for minus reg
+// 1(2) "S, M" for scaled plus lsl2
+// 3(2) "I0/I1, S" for other scaled
+def A57WriteStrAmLDSTSOPre : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledNotOptimalPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoMinusRegPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<IsLdstsoScaledPredX2, [A57Write_1cyc_1S_1M]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAmLDSTSOPreWrBack : SchedWriteVariant<[
+ SchedVar<IsLdstsoScaledPredX2, [A57WrBackTwo]>,
+ SchedVar<IsLdstsoMinusRegPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAmLDSTSOPreWrBack, A57WriteStrAmLDSTSOPre],
+ (instregex "STR_PRE_REG", "STRB_PRE_REG")>;
+
+// pre-indexed STRH/STRD (STRH_PRE, STRD_PRE)
+// 1(1) "S, I0/I1" for imm or reg plus
+// 3(2) "I0/I1, S" for reg minus
+def A57WriteStrAm3PreX2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAm3PreWrBackX2 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX2, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAm3PreWrBackX2, A57WriteStrAm3PreX2],
+ (instregex "STRH_PRE")>;
+
+def A57WriteStrAm3PreX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX3, [A57Write_3cyc_1I_1S]>,
+ SchedVar<NoSchedPred, [A57Write_1cyc_1S_1I]>
+]>;
+def A57WriteStrAm3PreWrBackX3 : SchedWriteVariant<[
+ SchedVar<IsLdrAm3NegRegOffPredX3, [A57WrBackTwo]>,
+ SchedVar<NoSchedPred, [A57WrBackOne]>
+]>;
+def : InstRW<[A57WriteStrAm3PreWrBackX3, A57WriteStrAm3PreX3],
+ (instregex "STRD_PRE")>;
+
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I], (instregex "STR(T?)_POST_IMM",
+ "STRB(T?)_POST_IMM", "t2STR(B?)_POST")>;
+
+// 1(2) "S, M" for STR/STRB register post-indexed (both scaled or not)
+def : InstRW<[A57WrBackTwo, A57Write_1cyc_1S_1M], (instregex "STR(T?)_POST_REG",
+ "STRB(T?)_POST_REG", "STR(B?)T_POST$")>;
+
+// post-indexed STRH/STRD (STRH_POST, STRD_POST), STRHTi, STRHTr
+// 1(1) "S, I0/I1" both for reg or imm
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
+ (instregex "(t2)?STR(H|D)_POST", "STRHT(i|r)", "t2STRHT")>;
+
+// --- Store multiple instructions ---
+// TODO: no writeback latency defined in documentation
+def A57WriteSTM : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S]>
+]>;
+def A57WriteSTM_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+
+def : InstRW<[A57WriteSTM], (instregex "(t2|sys|t)?STM(IA|DA|DB|IB)$")>;
+def : InstRW<[A57WrBackOne, A57WriteSTM_Upd],
+ (instregex "(t2|sys|t)?STM(IA_UPD|DA_UPD|DB_UPD|IB_UPD)", "tPUSH")>;
+
+// --- 3.10 FP Data Processing Instructions ---
+def : SchedAlias<WriteFPALU32, A57Write_5cyc_1V>;
+def : SchedAlias<WriteFPALU64, A57Write_5cyc_1V>;
+
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(S|D|H)")>;
+
+// fp compare - 3cyc F1 for unconditional, 6cyc "F0/F1, F1" for conditional
+def A57WriteVcmp : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57Write_6cyc_1V_1X]>,
+ SchedVar<NoSchedPred, [A57Write_3cyc_1X]>
+]>;
+def : InstRW<[A57WriteVcmp],
+ (instregex "VCMP(D|S|H|ZD|ZS|ZH)$", "VCMPE(D|S|H|ZD|ZS|ZH)")>;
+
+// fp convert
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VCVT(A|N|P|M)(SH|UH|SS|US|SD|UD)", "VCVT(BDH|THD|TDH)")>;
+
+def : SchedAlias<WriteFPCVT, A57Write_5cyc_1V>;
+
+// FP round to integral
+def : InstRW<[A57Write_5cyc_1V], (instregex "VRINT(A|N|P|M|Z|R|X)(H|S|D)$")>;
+
+// FP divide, FP square root
+def : SchedAlias<WriteFPDIV32, A57Write_17cyc_1W>;
+def : SchedAlias<WriteFPDIV64, A57Write_32cyc_1W>;
+def : SchedAlias<WriteFPSQRT32, A57Write_17cyc_1W>;
+def : SchedAlias<WriteFPSQRT64, A57Write_32cyc_1W>;
+
+// FP max/min
+def : InstRW<[A57Write_5cyc_1V], (instregex "VMAX", "VMIN")>;
+
+// FP multiply-accumulate pipelines support late forwarding of the result
+// from FP multiply μops to the accumulate operands of an
+// FP multiply-accumulate μop. The latter can potentially be issued 1 cycle
+// after the FP multiply μop has been issued.
+// FP multiply, FZ
+def A57WriteVMUL : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+
+def : SchedAlias<WriteFPMUL32, A57WriteVMUL>;
+def : SchedAlias<WriteFPMUL64, A57WriteVMUL>;
+def : ReadAdvance<ReadFPMUL, 0>;
+
+// FP multiply accumulate, FZ: 9cyc "F0/F1" or 4 cyc for sequenced accumulate
+// VFMA, VFMS, VFNMA, VFNMS, VMLA, VMLS, VNMLA, VNMLS
+def A57WriteVFMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+
+// VFMA takes 9 cyc in the common case and 4 cyc in a VFMA->VFMA chain (5 read adv.)
+// VMUL takes 5 cyc in the common case and 1 cyc in a VMUL->VFMA chain (4 read adv.)
+// Currently, there is no way to define a different read advance for a VFMA
+// operand coming from a VFMA versus from a VMUL, so a 5-cycle read advance is
+// used for both. Zero latency (instead of one) for the VMUL->VFMA chain
+// shouldn't break anything. The same situation applies to the ASIMD VMUL/VFMA
+// instructions.
+// def A57ReadVFMA : SchedRead;
+// def : ReadAdvance<A57ReadVFMA, 5, [A57WriteVFMA]>;
+// def : ReadAdvance<A57ReadVFMA, 4, [A57WriteVMUL]>;
+def A57ReadVFMA5 : SchedReadAdvance<5, [A57WriteVFMA, A57WriteVMUL]>;
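+
+// Effective accumulator latency with this advance: 9 - 5 = 4 cycles for a
+// VFMA->VFMA chain and 5 - 5 = 0 cycles for VMUL->VFMA (instead of the
+// intended 1).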
+
+def : SchedAlias<WriteFPMAC32, A57WriteVFMA>;
+def : SchedAlias<WriteFPMAC64, A57WriteVFMA>;
+def : SchedAlias<ReadFPMAC, A57ReadVFMA5>;
+
+def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VSEL")>;
+
+// --- 3.11 FP Miscellaneous Instructions ---
+// VMOV: 3cyc "F0/F1" for imm/reg
+def : InstRW<[A57Write_3cyc_1V], (instregex "FCONST(D|S|H)")>;
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOV(D|S|H)(cc)?$")>;
+
+// 5cyc L for FP transfer, vfp to core reg,
+// 5cyc L for FP transfer, core reg to vfp
+def : SchedAlias<WriteFPMOV, A57Write_5cyc_1L>;
+// VMOVRRS/VMOVRRD are declared in common code with one WriteFPMOV (instead of 2).
+def : InstRW<[A57Write_5cyc_1L, A57Write_5cyc_1L], (instregex "VMOV(RRS|RRD)")>;
+
+// 8cyc "L,F0/F1" for FP transfer, core reg to upper or lower half of vfp D-reg
+def : InstRW<[A57Write_8cyc_1L_1I], (instregex "VMOVDRR")>;
+
+// --- 3.12 FP Load Instructions ---
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLDR(D|S|H)")>;
+
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLDMQIA$")>;
+
+// FP load multiple (VLDM)
+
+def A57VLDMOpsListUncond : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L, A57Write_5cyc_1L,
+ A57Write_6cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_7cyc_1L,
+ A57Write_8cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_9cyc_1L,
+ A57Write_10cyc_1L, A57Write_10cyc_1L,
+ A57Write_11cyc_1L, A57Write_11cyc_1L,
+ A57Write_12cyc_1L, A57Write_12cyc_1L]>;
+def A57WriteVLDMuncond : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListUncond.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57VLDMOpsListCond : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L, A57Write_6cyc_1L,
+ A57Write_7cyc_1L, A57Write_8cyc_1L,
+ A57Write_9cyc_1L, A57Write_10cyc_1L,
+ A57Write_11cyc_1L, A57Write_12cyc_1L,
+ A57Write_13cyc_1L, A57Write_14cyc_1L,
+ A57Write_15cyc_1L, A57Write_16cyc_1L,
+ A57Write_17cyc_1L, A57Write_18cyc_1L,
+ A57Write_19cyc_1L, A57Write_20cyc_1L]>;
+def A57WriteVLDMcond : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListCond.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListCond.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListCond.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListCond.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListCond.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListCond.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListCond.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListCond.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListCond.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57WriteVLDM : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57WriteVLDMcond]>,
+ SchedVar<NoSchedPred, [A57WriteVLDMuncond]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WriteVLDM], (instregex "VLDM(DIA|SIA)$")>;
+
+def A57VLDMOpsListUncond_Upd : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L_1I, A57Write_5cyc_1L_1I,
+ A57Write_6cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_7cyc_1L_1I,
+ A57Write_8cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_9cyc_1L_1I,
+ A57Write_10cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_11cyc_1L_1I,
+ A57Write_12cyc_1L_1I, A57Write_12cyc_1L_1I]>;
+def A57WriteVLDMuncond_UPD : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListUncond_Upd.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListUncond_Upd.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListUncond_Upd.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListUncond_Upd.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListUncond_Upd.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListUncond_Upd.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListUncond_Upd.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListUncond_Upd.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListUncond_Upd.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57VLDMOpsListCond_Upd : A57WriteLMOpsListType<
+ [A57Write_5cyc_1L_1I, A57Write_6cyc_1L_1I,
+ A57Write_7cyc_1L_1I, A57Write_8cyc_1L_1I,
+ A57Write_9cyc_1L_1I, A57Write_10cyc_1L_1I,
+ A57Write_11cyc_1L_1I, A57Write_12cyc_1L_1I,
+ A57Write_13cyc_1L_1I, A57Write_14cyc_1L_1I,
+ A57Write_15cyc_1L_1I, A57Write_16cyc_1L_1I,
+ A57Write_17cyc_1L_1I, A57Write_18cyc_1L_1I,
+ A57Write_19cyc_1L_1I, A57Write_20cyc_1L_1I]>;
+def A57WriteVLDMcond_UPD : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, A57VLDMOpsListCond_Upd.Writes[0-1]>,
+ SchedVar<A57LMAddrPred2, A57VLDMOpsListCond_Upd.Writes[0-3]>,
+ SchedVar<A57LMAddrPred3, A57VLDMOpsListCond_Upd.Writes[0-5]>,
+ SchedVar<A57LMAddrPred4, A57VLDMOpsListCond_Upd.Writes[0-7]>,
+ SchedVar<A57LMAddrPred5, A57VLDMOpsListCond_Upd.Writes[0-9]>,
+ SchedVar<A57LMAddrPred6, A57VLDMOpsListCond_Upd.Writes[0-11]>,
+ SchedVar<A57LMAddrPred7, A57VLDMOpsListCond_Upd.Writes[0-13]>,
+ SchedVar<A57LMAddrPred8, A57VLDMOpsListCond_Upd.Writes[0-15]>,
+ SchedVar<NoSchedPred, A57VLDMOpsListCond_Upd.Writes[0-15]>
+]> { let Variadic=1; }
+
+def A57WriteVLDM_UPD : SchedWriteVariant<[
+ SchedVar<IsPredicatedPred, [A57WriteVLDMcond_UPD]>,
+ SchedVar<NoSchedPred, [A57WriteVLDMuncond_UPD]>
+]> { let Variadic=1; }
+
+def : InstRW<[A57WrBackOne, A57WriteVLDM_UPD],
+ (instregex "VLDM(DIA_UPD|DDB_UPD|SIA_UPD|SDB_UPD)")>;
+
+// --- 3.13 FP Store Instructions ---
+def : InstRW<[A57Write_1cyc_1S], (instregex "VSTR(D|S|H)")>;
+
+def : InstRW<[A57Write_2cyc_1S], (instregex "VSTMQIA$")>;
+
+def A57WriteVSTMs : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S]>
+]>;
+def A57WriteVSTMd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S]>,
+ SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S]>,
+ SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S]>,
+ SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S]>,
+ SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S]>,
+ SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S]>,
+ SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S]>,
+ SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S]>,
+ SchedVar<NoSchedPred, [A57Write_4cyc_1S]>
+]>;
+def A57WriteVSTMs_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_1cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_3cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_5cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_7cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_8cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+def A57WriteVSTMd_Upd : SchedWriteVariant<[
+ SchedVar<A57LMAddrPred1, [A57Write_2cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred2, [A57Write_4cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred3, [A57Write_6cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred4, [A57Write_8cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred5, [A57Write_10cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred6, [A57Write_12cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred7, [A57Write_14cyc_1S_1I]>,
+ SchedVar<A57LMAddrPred8, [A57Write_16cyc_1S_1I]>,
+ SchedVar<NoSchedPred, [A57Write_2cyc_1S_1I]>
+]>;
+
+def : InstRW<[A57WriteVSTMs], (instregex "VSTMSIA$")>;
+def : InstRW<[A57WriteVSTMd], (instregex "VSTMDIA$")>;
+def : InstRW<[A57WrBackOne, A57WriteVSTMs_Upd],
+ (instregex "VSTM(SIA_UPD|SDB_UPD)")>;
+def : InstRW<[A57WrBackOne, A57WriteVSTMd_Upd],
+ (instregex "VSTM(DIA_UPD|DDB_UPD)")>;
+
+// --- 3.14 ASIMD Integer Instructions ---
+
+// ASIMD absolute diff, 3cyc F0/F1 for integer VABD
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABD(s|u)")>;
+
+// ASIMD absolute diff accum: 4(1) F1 for D-form, 5(2) F1 for Q-form
+def A57WriteVABAD : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVABAD : SchedReadAdvance<3, [A57WriteVABAD]>;
+def : InstRW<[A57WriteVABAD, A57ReadVABAD],
+ (instregex "VABA(s|u)(v8i8|v4i16|v2i32)")>;
+def A57WriteVABAQ : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
+def A57ReadVABAQ : SchedReadAdvance<3, [A57WriteVABAQ]>;
+def : InstRW<[A57WriteVABAQ, A57ReadVABAQ],
+ (instregex "VABA(s|u)(v16i8|v8i16|v4i32)")>;
+
+// ASIMD absolute diff accum long: 4(1) F1 for VABAL
+def A57WriteVABAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVABAL : SchedReadAdvance<3, [A57WriteVABAL]>;
+def : InstRW<[A57WriteVABAL, A57ReadVABAL], (instregex "VABAL(s|u)")>;
+
+// ASIMD absolute diff long: 3cyc F0/F1 for VABDL
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABDL(s|u)")>;
+
+// ASIMD arith, basic
+def : InstRW<[A57Write_3cyc_1V], (instregex "VADD", "VADDL", "VADDW",
+ "VNEG(s8d|s16d|s32d|s8q|s16q|s32q|d|q)",
+ "VPADDi", "VPADDL", "VSUB", "VSUBL", "VSUBW")>;
+
+// ASIMD arith, complex
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS", "VADDHN", "VHADD", "VHSUB",
+ "VQABS", "VQADD", "VQNEG", "VQSUB",
+ "VRADDHN", "VRHADD", "VRSUBHN", "VSUBHN")>;
+
+// ASIMD compare
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "VCEQ", "VCGE", "VCGT", "VCLE", "VTST", "VCLT")>;
+
+// ASIMD logical
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "VAND", "VBIC", "VMVN", "VORR", "VORN", "VEOR")>;
+
+// ASIMD max/min
+def : InstRW<[A57Write_3cyc_1V],
+ (instregex "(VMAX|VMIN)(s|u)", "(VPMAX|VPMIN)(s8|s16|s32|u8|u16|u32)")>;
+
+// ASIMD multiply, D-form: 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
+// Cortex-A57 r1p0 and later reduce the latency of ASIMD multiply
+// and multiply-with-accumulate instructions relative to r0pX.
+def A57WriteVMULD_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def : InstRW<[A57WriteVMULD_VecInt], (instregex
+ "VMUL(v8i8|v4i16|v2i32|pd)", "VMULsl(v4i16|v2i32)",
+ "VQDMULH(sl)?(v4i16|v2i32)", "VQRDMULH(sl)?(v4i16|v2i32)")>;
+
+// ASIMD multiply, Q-form: 6cyc F0 for r0px, 5cyc F0 for r1p0 and later
+def A57WriteVMULQ_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
+def : InstRW<[A57WriteVMULQ_VecInt], (instregex
+ "VMUL(v16i8|v8i16|v4i32|pq)", "VMULsl(v8i16|v4i32)",
+ "VQDMULH(sl)?(v8i16|v4i32)", "VQRDMULH(sl)?(v8i16|v4i32)")>;
+
+// ASIMD multiply accumulate, D-form
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAD_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVMLAD_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAD_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAD_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAD_VecInt, A57ReadVMLAD_VecInt],
+ (instregex "VMLA(sl)?(v8i8|v4i16|v2i32)", "VMLS(sl)?(v8i8|v4i16|v2i32)")>;
+
+// ASIMD multiply accumulate, Q-form
+// 6cyc F0 for r0px, 5cyc F0 for r1p0 and later, 2cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAQ_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_5cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_6cyc_1W]>]>;
+def A57ReadVMLAQ_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAQ_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAQ_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAQ_VecInt, A57ReadVMLAQ_VecInt],
+ (instregex "VMLA(sl)?(v16i8|v8i16|v4i32)", "VMLS(sl)?(v16i8|v8i16|v4i32)")>;
+
+// ASIMD multiply accumulate long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 1cyc for accumulate sequence
+// (4 or 3 ReadAdvance)
+def A57WriteVMLAL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVMLAL_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<3, [A57WriteVMLAL_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<4, [A57WriteVMLAL_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVMLAL_VecInt, A57ReadVMLAL_VecInt],
+ (instregex "VMLAL(s|u)", "VMLSL(s|u)")>;
+
+// ASIMD multiply accumulate saturating long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later, 2cyc for accumulate sequence
+// (3 or 2 ReadAdvance)
+def A57WriteVQDMLAL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def A57ReadVQDMLAL_VecInt : SchedReadVariant<[
+ SchedVar<IsR1P0AndLaterPred, [SchedReadAdvance<2, [A57WriteVQDMLAL_VecInt]>]>,
+ SchedVar<NoSchedPred, [SchedReadAdvance<3, [A57WriteVQDMLAL_VecInt]>]>
+]>;
+def : InstRW<[A57WriteVQDMLAL_VecInt, A57ReadVQDMLAL_VecInt],
+ (instregex "VQDMLAL", "VQDMLSL")>;
+
+// ASIMD multiply long
+// 5cyc F0 for r0px, 4cyc F0 for r1p0 and later
+def A57WriteVMULL_VecInt : SchedWriteVariant<[
+ SchedVar<IsR1P0AndLaterPred, [A57Write_4cyc_1W]>,
+ SchedVar<NoSchedPred, [A57Write_5cyc_1W]>]>;
+def : InstRW<[A57WriteVMULL_VecInt],
+ (instregex "VMULL(s|u|p8|sls|slu)", "VQDMULL")>;
+
+// ASIMD pairwise add and accumulate
+// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
+def A57WriteVPADAL : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVPADAL : SchedReadAdvance<3, [A57WriteVPADAL]>;
+def : InstRW<[A57WriteVPADAL, A57ReadVPADAL], (instregex "VPADAL(s|u)")>;
+
+// ASIMD shift accumulate
+// 4cyc F1, 1cyc for accumulate sequence (3cyc ReadAdvance)
+def A57WriteVSRA : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadVSRA : SchedReadAdvance<3, [A57WriteVSRA]>;
+def : InstRW<[A57WriteVSRA, A57ReadVSRA], (instregex "VSRA", "VRSRA")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[A57Write_3cyc_1X],
+ (instregex "VMOVL", "VSHLi", "VSHLL", "VSHR(s|u)", "VSHRN")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VQRSHRN", "VQRSHRUN", "VQSHL(si|ui|su)", "VQSHRN", "VQSHRUN", "VRSHR(s|u)",
+ "VRSHRN")>;
+
+// ASIMD shift by immed and insert, basic, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VSLI(v8i8|v4i16|v2i32|v1i64)", "VSRI(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by immed and insert, basic, Q-form
+def : InstRW<[A57Write_5cyc_1X], (instregex
+ "VSLI(v16i8|v8i16|v4i32|v2i64)", "VSRI(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, basic, D-form
+def : InstRW<[A57Write_3cyc_1X], (instregex
+ "VSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form
+// VQRSHL, VQSHL, VRSHL
+def : InstRW<[A57Write_4cyc_1X], (instregex
+ "VQRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)", "VQSHL(s|u)(v8i8|v4i16|v2i32|v1i64)",
+ "VRSHL(s|u)(v8i8|v4i16|v2i32|v1i64)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_1X], (instregex
+ "VQRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)", "VQSHL(s|u)(v16i8|v8i16|v4i32|v2i64)",
+ "VRSHL(s|u)(v16i8|v8i16|v4i32|v2i64)")>;
+
+// --- 3.15 ASIMD Floating-Point Instructions ---
+// ASIMD FP absolute value
+def : InstRW<[A57Write_3cyc_1V], (instregex "VABS(fd|fq|hd|hq)")>;
+
+// ASIMD FP arith
+def : InstRW<[A57Write_5cyc_1V], (instregex "VABD(fd|fq|hd|hq)",
+ "VADD(fd|fq|hd|hq)", "VPADD(f|h)", "VSUB(fd|fq|hd|hq)")>;
+
+// ASIMD FP compare
+def : InstRW<[A57Write_5cyc_1V], (instregex "VAC(GE|GT|LE|LT)",
+ "VC(EQ|GE|GT|LE)(fd|fq|hd|hq)")>;
+
+// ASIMD FP convert, integer
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VCVT(f2sd|f2ud|s2fd|u2fd|f2sq|f2uq|s2fq|u2fq|f2xsd|f2xud|xs2fd|xu2fd)",
+ "VCVT(f2xsq|f2xuq|xs2fq|xu2fq)",
+ "VCVT(AN|MN|NN|PN)(SDf|SQf|UDf|UQf|SDh|SQh|UDh|UQh)")>;
+
+// ASIMD FP convert, half-precision: 8cyc F0/F1
+def : InstRW<[A57Write_8cyc_1V], (instregex
+ "VCVT(h2sd|h2ud|s2hd|u2hd|h2sq|h2uq|s2hq|u2hq|h2xsd|h2xud|xs2hd|xu2hd)",
+ "VCVT(h2xsq|h2xuq|xs2hq|xu2hq)",
+ "VCVT(f2h|h2f)")>;
+
+// ASIMD FP max/min
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "(VMAX|VMIN)(fd|fq|hd|hq)", "(VPMAX|VPMIN)(f|h)", "VMAXNM", "VMINNM")>;
+
+// ASIMD FP multiply
+def A57WriteVMUL_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def : InstRW<[A57WriteVMUL_VecFP], (instregex "VMUL(sl)?(fd|fq|hd|hq)")>;
+
+// ASIMD FP multiply accumulate: 9cyc F0/F1, 4cyc for accumulate sequence
+def A57WriteVMLA_VecFP : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57ReadVMLA_VecFP :
+ SchedReadAdvance<5, [A57WriteVMLA_VecFP, A57WriteVMUL_VecFP]>;
+def : InstRW<[A57WriteVMLA_VecFP, A57ReadVMLA_VecFP],
+ (instregex "(VMLA|VMLS)(sl)?(fd|fq|hd|hq)", "(VFMA|VFMS)(fd|fq|hd|hq)")>;
+
+// ASIMD FP negate
+def : InstRW<[A57Write_3cyc_1V], (instregex "VNEG(fd|f32q|hd|hq)")>;
+
+// ASIMD FP round to integral
+def : InstRW<[A57Write_5cyc_1V], (instregex
+ "VRINT(AN|MN|NN|PN|XN|ZN)(Df|Qf|Dh|Qh)")>;
+
+// --- 3.16 ASIMD Miscellaneous Instructions ---
+
+// ASIMD bitwise insert
+def : InstRW<[A57Write_3cyc_1V], (instregex "VBIF", "VBIT", "VBSL")>;
+
+// ASIMD count
+def : InstRW<[A57Write_3cyc_1V], (instregex "VCLS", "VCLZ", "VCNT")>;
+
+// ASIMD duplicate, core reg: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VDUP(8|16|32)(d|q)")>;
+
+// ASIMD duplicate, scalar: 3cyc "F0/F1"
+def : InstRW<[A57Write_3cyc_1V], (instregex "VDUPLN(8|16|32)(d|q)")>;
+
+// ASIMD extract
+def : InstRW<[A57Write_3cyc_1V], (instregex "VEXT(d|q)(8|16|32|64)")>;
+
+// ASIMD move, immed
+def : InstRW<[A57Write_3cyc_1V], (instregex
+ "VMOV(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v1i64|v2i64|v2f32|v4f32)",
+ "VMOVQ0")>;
+
+// ASIMD move, narrowing
+def : InstRW<[A57Write_3cyc_1V], (instregex "VMOVN")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "VQMOVN")>;
+
+// ASIMD reciprocal estimate
+def : InstRW<[A57Write_5cyc_1V], (instregex "VRECPE", "VRSQRTE")>;
+
+// ASIMD reciprocal step, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "VRECPS", "VRSQRTS")>;
+
+// ASIMD reverse, swap, table lookup (1-2 reg)
+def : InstRW<[A57Write_3cyc_1V], (instregex "VREV", "VSWP", "VTB(L|X)(1|2)")>;
+
+// ASIMD table lookup (3-4 reg)
+def : InstRW<[A57Write_6cyc_1V], (instregex "VTBL(3|4)", "VTBX(3|4)")>;
+
+// ASIMD transfer, scalar to core reg: 6cyc "L, I0/I1"
+def : InstRW<[A57Write_6cyc_1L_1I], (instregex "VGETLN")>;
+
+// ASIMD transfer, core reg to scalar: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "VSETLN")>;
+
+// ASIMD transpose
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V], (instregex "VTRN")>;
+
+// ASIMD unzip/zip, D-form
+def : InstRW<[A57Write_3cyc_1V, A57Write_3cyc_1V],
+ (instregex "VUZPd", "VZIPd")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_1V, A57Write_6cyc_1V],
+ (instregex "VUZPq", "VZIPq")>;
+
+// --- 3.17 ASIMD Load Instructions ---
+
+// Overridden via InstRW for this processor.
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
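+// The generic classes carry no resources here because every VLDx/VSTx opcode
+// is remapped below with per-opcode InstRW entries.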
+
+// 1-2 reg: 5cyc L, +I for writeback, 1 cyc wb latency
+def : InstRW<[A57Write_5cyc_1L], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+def : InstRW<[A57Write_5cyc_1L_1I, A57WrBackOne],
+ (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+
+// 3-4 reg: 6cyc L, +I for writeback, 1 cyc wb latency
+def : InstRW<[A57Write_6cyc_1L],
+ (instregex "VLD1(d|q)(8|16|32|64)(T|Q)$", "VLD1d64(T|Q)Pseudo")>;
+
+def : InstRW<[A57Write_6cyc_1L_1I, A57WrBackOne],
+ (instregex "VLD1(d|q)(8|16|32|64)(T|Q)wb")>;
+
+// ASIMD load, 1 element, one lane and all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex
+ "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne], (instregex
+ "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)", "VLD1LNq(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 2 element, multiple, 2 reg: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V],
+ (instregex "VLD2(d|q)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2(d|q)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+
+// ASIMD load, 2 element, multiple, 4 reg: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V], (instregex "VLD2b(8|16|32)$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2b(8|16|32)wb")>;
+
+// ASIMD load, 2 element, one lane and all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+ "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+// 2 results + wb result
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V, A57WrBackOne],
+ (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+// 1 result + wb result
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb",
+ "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 3 element, multiple, 3 reg: 9cyc "L, F0/F1"
+// 3 results
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
+ (instregex "VLD3(d|q)(8|16|32)$")>;
+// 1 result
+def : InstRW<[A57Write_9cyc_1L_1V],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+// 3 results + wb
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+// 1 result + wb
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+// ASIMD load, 3 element, one lane, size 32: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD3LN(d|q)32$",
+ "VLD3LN(d|q)32Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)32_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)32Pseudo_UPD")>;
+
+// ASIMD load, 3 element, one lane, size 8/16: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V],
+ (instregex "VLD3LN(d|q)(8|16)$",
+ "VLD3LN(d|q)(8|16)Pseudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)(8|16)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3LN(d|q)(8|16)Pseudo_UPD")>;
+
+// ASIMD load, 3 element, all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V],
+ (instregex "VLD3DUP(d|q)(8|16|32)$",
+ "VLD3DUP(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3DUP(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD3DUP(d|q)(8|16|32)Pseudo_UPD")>;
+
+// ASIMD load, 4 element, multiple, 4 reg: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
+ A57Write_9cyc_1L_1V],
+ (instregex "VLD4(d|q)(8|16|32)$")>;
+def : InstRW<[A57Write_9cyc_1L_1V],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+// ASIMD load, 4 element, one lane, size 32: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
+ A57Write_8cyc_1L_1V],
+ (instregex "VLD4LN(d|q)32$",
+ "VLD4LN(d|q)32Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4LN(d|q)32_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4LN(d|q)32Pseudo_UPD")>;
+
+// ASIMD load, 4 element, one lane, size 8/16: 9cyc "L, F0/F1"
+def : InstRW<[A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V, A57Write_9cyc_1L_1V,
+ A57Write_9cyc_1L_1V],
+ (instregex "VLD4LN(d|q)(8|16)$",
+ "VLD4LN(d|q)(8|16)Pseudo$")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57Write_9cyc_1L_1V_1I, A57Write_9cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4LN(d|q)(8|16)_UPD")>;
+def : InstRW<[A57Write_9cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4LN(d|q)(8|16)Pseudo_UPD")>;
+
+// ASIMD load, 4 element, all lanes: 8cyc "L, F0/F1"
+def : InstRW<[A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V, A57Write_8cyc_1L_1V,
+ A57Write_8cyc_1L_1V],
+ (instregex "VLD4DUP(d|q)(8|16|32)$",
+ "VLD4DUP(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57Write_8cyc_1L_1V_1I, A57Write_8cyc_1L_1V_1I,
+ A57WrBackOne],
+ (instregex "VLD4DUP(d|q)(8|16|32)_UPD")>;
+def : InstRW<[A57Write_8cyc_1L_1V_1I, A57WrBackOne],
+ (instregex "VLD4DUP(d|q)(8|16|32)Pseudo_UPD")>;
+
+// --- 3.18 ASIMD Store Instructions ---
+
+// ASIMD store, 1 element, multiple, 1 reg: 1cyc S
+def : InstRW<[A57Write_1cyc_1S], (instregex "VST1d(8|16|32|64)$")>;
+def : InstRW<[A57WrBackOne, A57Write_1cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)wb")>;
+// ASIMD store, 1 element, multiple, 2 reg: 2cyc S
+def : InstRW<[A57Write_2cyc_1S], (instregex "VST1q(8|16|32|64)$")>;
+def : InstRW<[A57WrBackOne, A57Write_2cyc_1S_1I],
+ (instregex "VST1q(8|16|32|64)wb")>;
+// ASIMD store, 1 element, multiple, 3 reg: 3cyc S
+def : InstRW<[A57Write_3cyc_1S],
+ (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+// ASIMD store, 1 element, multiple, 4 reg: 4cyc S
+def : InstRW<[A57Write_4cyc_1S],
+ (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1I],
+ (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+// ASIMD store, 1 element, one lane: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 2 element, multiple, 2 reg: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST2(d|b)(8|16|32)$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST2(b|d)(8|16|32)wb")>;
+// ASIMD store, 2 element, multiple, 4 reg: 4cyc "F0/F1, S"
+def : InstRW<[A57Write_4cyc_1S_1V],
+ (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
+ (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+// ASIMD store, 2 element, one lane: 3cyc "F0/F1, S"
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST2LN(d|q)(8|16|32)_UPD",
+ "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 3 element, multiple, 3 reg
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST3(d|q)(8|16|32)_UPD",
+ "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+// ASIMD store, 3 element, one lane
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST3LN(d|q)(8|16|32)_UPD",
+ "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+// ASIMD store, 4 element, multiple, 4 reg
+def : InstRW<[A57Write_4cyc_1S_1V],
+ (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_4cyc_1S_1V_1I],
+ (instregex "VST4(d|q)(8|16|32)_UPD",
+ "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+// ASIMD store, 4 element, one lane
+def : InstRW<[A57Write_3cyc_1S_1V],
+ (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+def : InstRW<[A57WrBackOne, A57Write_3cyc_1S_1V_1I],
+ (instregex "VST4LN(d|q)(8|16|32)_UPD",
+ "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+// --- 3.19 Cryptography Extensions ---
+// Crypto AES ops
+// AESD, AESE, AESIMC, AESMC: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+// Crypto polynomial (64x64) multiply long (VMULL.P64): 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^VMULLp64")>;
+// Crypto SHA1 xor ops: 6cyc F0/F1
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+// Crypto SHA1 fast ops: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+// Crypto SHA1 slow ops: 6cyc F0
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+// Crypto SHA256 fast ops: 3cyc F0
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+// Crypto SHA256 slow ops: 6cyc F0
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+
+// --- 3.20 CRC ---
+def : InstRW<[A57Write_3cyc_1W], (instregex "^(t2)?CRC32")>;
+
+// -----------------------------------------------------------------------------
+// Common definitions
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+def : SchedAlias<WriteALU, A57Write_1cyc_1I>;
+
+def : SchedAlias<WriteBr, A57Write_1cyc_1B>;
+def : SchedAlias<WriteBrL, A57Write_1cyc_1B_1I>;
+def : SchedAlias<WriteBrTbl, A57Write_1cyc_1B_1I>;
+def : SchedAlias<WritePreLd, A57Write_4cyc_1L>;
+
+def : SchedAlias<WriteLd, A57Write_4cyc_1L>;
+def : SchedAlias<WriteST, A57Write_1cyc_1S>;
+def : ReadAdvance<ReadALU, 0>;
+
+} // SchedModel = CortexA57Model
+
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td
new file mode 100644
index 0000000..670717d
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA57WriteRes.td
@@ -0,0 +1,323 @@
+//=- ARMScheduleA57WriteRes.td - ARM Cortex-A57 Write Res ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Cortex-A57 specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: A57Write
+// Latency: #cyc
+// MicroOp Count/Types: #(B|I|M|L|S|X|W|V)
+//
+// e.g. A57Write_6cyc_1I_6S_4V means the total latency is 6 and there are
+// 11 micro-ops to be issued as follows: one to I pipe, six to S pipes and
+// four to V pipes.
+//
+//===----------------------------------------------------------------------===//
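+// For illustration, the 11 micro-op example above expands to a definition of
+// this shape (units listed once per micro-op):
+//   def A57Write_6cyc_1I_6S_4V : SchedWriteRes<[A57UnitI,
+//                                A57UnitS, A57UnitS, A57UnitS, A57UnitS,
+//                                A57UnitS, A57UnitS, A57UnitV, A57UnitV,
+//                                A57UnitV, A57UnitV]> {
+//     let Latency = 6; let NumMicroOps = 11;
+//   }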
+
+//===----------------------------------------------------------------------===//
+// Define Generic 1 micro-op types
+
+def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
+def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
+def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
+def A57Write_17cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 17;
+ let ResourceCycles = [17]; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18;
+ let ResourceCycles = [18]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+ let ResourceCycles = [19]; }
+def A57Write_20cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 20;
+ let ResourceCycles = [20]; }
+def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
+def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
+def A57Write_2cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 2; }
+def A57Write_3cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 3; }
+def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
+def A57Write_2cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 2; }
+def A57Write_3cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 3; }
+def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
+def A57Write_32cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+ let ResourceCycles = [35]; }
+def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
+def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
+def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
+def A57Write_3cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 3; }
+
+// A57Write_3cyc_1L - A57Write_20cyc_1L
+foreach Lat = 3-20 in {
+ def A57Write_#Lat#cyc_1L : SchedWriteRes<[A57UnitL]> {
+ let Latency = Lat;
+ }
+}
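+// (The foreach stamps out one def per latency value, so these single-unit
+// L-pipe writes need not be spelled out by hand.)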
+
+// A57Write_4cyc_1S - A57Write_16cyc_1S
+foreach Lat = 4-16 in {
+ def A57Write_#Lat#cyc_1S : SchedWriteRes<[A57UnitS]> {
+ let Latency = Lat;
+ }
+}
+
+def A57Write_4cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 4; }
+def A57Write_4cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57Write_4cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 4; }
+def A57Write_5cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 5; }
+def A57Write_6cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 6; }
+def A57Write_6cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 6; }
+def A57Write_8cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 8; }
+def A57Write_9cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57Write_6cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 6; }
+def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 2 micro-op types
+
+def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 64;
+ let NumMicroOps = 2;
+ let ResourceCycles = [32, 32];
+}
+def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_7cyc_1V_1X : SchedWriteRes<[A57UnitV,
+ A57UnitX]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_9cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def A57Write_8cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2L : SchedWriteRes<[A57UnitL, A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_2W : SchedWriteRes<[A57UnitW, A57UnitW]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1L : SchedWriteRes<[A57UnitI,
+ A57UnitL]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_5cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_1L_1V : SchedWriteRes<[A57UnitL,
+ A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_10cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1I : SchedWriteRes<[A57UnitS,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_1cyc_1S_1M : SchedWriteRes<[A57UnitS,
+ A57UnitM]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1B_1I : SchedWriteRes<[A57UnitB,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_6cyc_1B_1L : SchedWriteRes<[A57UnitB,
+ A57UnitL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2S : SchedWriteRes<[A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 36;
+ let NumMicroOps = 2;
+ let ResourceCycles = [18, 18];
+}
+def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1I_1M : SchedWriteRes<[A57UnitI,
+ A57UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// A57Write_3cyc_1L_1I - A57Write_20cyc_1L_1I
+foreach Lat = 3-20 in {
+ def A57Write_#Lat#cyc_1L_1I : SchedWriteRes<[A57UnitL, A57UnitI]> {
+ let Latency = Lat; let NumMicroOps = 2;
+ }
+}
+
+def A57Write_3cyc_1I_1S : SchedWriteRes<[A57UnitI,
+ A57UnitS]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def A57Write_4cyc_1S_1V : SchedWriteRes<[A57UnitS,
+ A57UnitV]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// A57Write_4cyc_1S_1I - A57Write_16cyc_1S_1I
+foreach Lat = 4-16 in {
+ def A57Write_#Lat#cyc_1S_1I : SchedWriteRes<[A57UnitS, A57UnitI]> {
+ let Latency = Lat; let NumMicroOps = 2;
+ }
+}
+
+def A57Write_4cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define Generic 3 micro-op types
+
+def A57Write_10cyc_3V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+def A57Write_2cyc_1I_2S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1I_1S_1V : SchedWriteRes<[A57UnitI,
+ A57UnitS,
+ A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_3cyc_1S_1V_1I : SchedWriteRes<[A57UnitS,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def A57Write_4cyc_1S_1V_1I : SchedWriteRes<[A57UnitS,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def A57Write_4cyc_1I_1L_1M : SchedWriteRes<[A57UnitI, A57UnitL, A57UnitM]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def A57Write_8cyc_1L_1V_1I : SchedWriteRes<[A57UnitL,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+def A57Write_9cyc_1L_1V_1I : SchedWriteRes<[A57UnitL,
+ A57UnitV,
+ A57UnitI]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
index 519e595..4e72b13 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleA9.td
@@ -1944,6 +1944,16 @@ def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
let NumMicroOps = 0; }
+def : SchedAlias<WriteMUL16, A9WriteM16>;
+def : SchedAlias<WriteMUL32, A9WriteM>;
+def : SchedAlias<WriteMUL64Lo, A9WriteM>;
+def : SchedAlias<WriteMUL64Hi, A9WriteMHi>;
+def : SchedAlias<WriteMAC16, A9WriteM16>;
+def : SchedAlias<WriteMAC32, A9WriteM>;
+def : SchedAlias<WriteMAC64Lo, A9WriteM>;
+def : SchedAlias<WriteMAC64Hi, A9WriteMHi>;
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 0>;
// Floating-point
// Only one FP or AGU instruction may issue per cycle. We model this
@@ -1953,6 +1963,7 @@ def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+
def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
@@ -1970,6 +1981,15 @@ def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+def : WriteRes<WriteVLD1, []>;
+def : WriteRes<WriteVLD2, []>;
+def : WriteRes<WriteVLD3, []>;
+def : WriteRes<WriteVLD4, []>;
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
// Reserve A9UnitFP for 2 consecutive cycles.
def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
let Latency = 4;
@@ -1992,6 +2012,7 @@ def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
// Load Integer.
def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+def : SchedAlias<WriteLd, A9WriteL>;
// Load the upper 32-bits using the same micro-op.
def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
let NumMicroOps = 0; }
@@ -2471,6 +2492,34 @@ def : SchedAlias<WriteALUsr, A9WriteALUsr>;
def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
def : SchedAlias<ReadALU, A9ReadALU>;
def : SchedAlias<ReadALUsr, A9ReadALU>;
+def : SchedAlias<WriteST, A9WriteS>;
+
+// ===---------------------------------------------------------------------===//
+// Floating-point. Map target defined SchedReadWrite to processor specific ones
+//
+def : WriteRes<WriteFPCVT, [A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def : SchedAlias<WriteFPMOV, A9WriteFMov>;
+
+def : SchedAlias<WriteFPALU32, A9WriteF>;
+def : SchedAlias<WriteFPALU64, A9WriteF>;
+
+def : SchedAlias<WriteFPMUL32, A9WriteFMulS>;
+def : SchedAlias<WriteFPMUL64, A9WriteFMulD>;
+
+def : SchedAlias<WriteFPMAC32, A9WriteFMAS>;
+def : SchedAlias<WriteFPMAC64, A9WriteFMAD>;
+
+def : SchedAlias<WriteFPDIV32, A9WriteFDivS>;
+def : SchedAlias<WriteFPDIV64, A9WriteFDivD>;
+def : SchedAlias<WriteFPSQRT32, A9WriteFSqrtS>;
+def : SchedAlias<WriteFPSQRT64, A9WriteFSqrtD>;
+
+def : ReadAdvance<ReadFPMUL, 0>;
+def : ReadAdvance<ReadFPMAC, 0>;
+
+// ===---------------------------------------------------------------------===//
+// Subtarget-specific overrides. Map opcodes to lists of SchedReadWrite types.
+//
def : InstRW< [WriteALU],
(instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
"BICrr")>;
@@ -2518,12 +2567,11 @@ def : InstRW<[A9WriteLb],
"LDRH", "LDRSH", "LDRSB")>;
def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>;
-def : WriteRes<WriteDiv, []> { let Latency = 0; }
+def : WriteRes<WriteDIV, []> { let Latency = 0; }
def : WriteRes<WriteBr, [A9UnitB]>;
def : WriteRes<WriteBrL, [A9UnitB]>;
def : WriteRes<WriteBrTbl, [A9UnitB]>;
def : WriteRes<WritePreLd, []>;
-def : SchedAlias<WriteCvtFP, A9WriteF>;
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
} // SchedModel = CortexA9Model
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleM3.td b/contrib/llvm/lib/Target/ARM/ARMScheduleM3.td
new file mode 100644
index 0000000..93f8299
--- /dev/null
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleM3.td
@@ -0,0 +1,21 @@
+//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-M3 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM3Model : SchedMachineModel {
+ let IssueWidth = 1; // Only IT can be dual-issued, so assume single-issue
+ let MicroOpBufferSize = 0; // In-order
+ let LoadLatency = 2; // Latency when not pipelined, not pc-relative
+ let MispredictPenalty = 2; // Best case branch taken cost
+
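+ // Incomplete model: unmodelled opcodes get default latencies instead of
+ // being treated as a scheduler consistency error.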
+ let CompleteModel = 0;
+}
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
index 1b40742..782be9b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleR52.td
@@ -70,15 +70,13 @@ def : WriteRes<WriteCMP, [R52UnitALU]> { let Latency = 0; }
def : WriteRes<WriteCMPsi, [R52UnitALU]> { let Latency = 0; }
def : WriteRes<WriteCMPsr, [R52UnitALU]> { let Latency = 0; }
+// Multiply - aliased to sub-target specific later
+
// Div - may stall 0-9 cycles depending on input (i.e. WRI+(0-9)/2)
-def : WriteRes<WriteDiv, [R52UnitDiv]> {
- let Latency = 8; let ResourceCycles = [8]; // not pipelined
+def : WriteRes<WriteDIV, [R52UnitDiv]> {
+ let Latency = 8; let ResourceCycles = [8]; // non-pipelined
}
-// Loads
-def : WriteRes<WriteLd, [R52UnitLd]> { let Latency = 4; }
-def : WriteRes<WritePreLd, [R52UnitLd]> { let Latency = 4; }
-
// Branches - LR written in Late EX2
def : WriteRes<WriteBr, [R52UnitB]> { let Latency = 0; }
def : WriteRes<WriteBrL, [R52UnitB]> { let Latency = 0; }
@@ -86,11 +84,50 @@ def : WriteRes<WriteBrTbl, [R52UnitALU]> { let Latency = 0; }
// Misc
def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
-def : WriteRes<WriteCvtFP, [R52UnitALU]> { let Latency = 3; }
+// Integer pipeline by-passes
def : ReadAdvance<ReadALU, 1>; // Operand needed in EX1 stage
def : ReadAdvance<ReadALUsr, 0>; // Shift operands needed in ISS
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 0>;
+
+// Floating-point. Map target-defined SchedReadWrites to subtarget-specific ones.
+def : WriteRes<WriteFPMUL32, [R52UnitFPMUL]> { let Latency = 6; }
+
+def : WriteRes<WriteFPMUL64, [R52UnitFPMUL, R52UnitFPMUL]> {
+ let Latency = 6;
+}
+
+def : WriteRes<WriteFPMAC32, [R52UnitFPMUL, R52UnitFPALU]> {
+ let Latency = 11; // as it is internally two insns (MUL then ADD)
+}
+
+def : WriteRes<WriteFPMAC64, [R52UnitFPMUL, R52UnitFPMUL,
+ R52UnitFPALU, R52UnitFPALU]> {
+ let Latency = 11;
+}
+
+def : WriteRes<WriteFPDIV32, [R52UnitDiv]> {
+ let Latency = 7; // FP div takes fixed #cycles
+ let ResourceCycles = [7]; // is not pipelined
+}
+
+def : WriteRes<WriteFPDIV64, [R52UnitDiv]> {
+ let Latency = 17;
+ let ResourceCycles = [17];
+}
+
+def : WriteRes<WriteFPSQRT32, [R52UnitDiv]> { let Latency = 7; }
+def : WriteRes<WriteFPSQRT64, [R52UnitDiv]> { let Latency = 17; }
+// Overridden via InstRW for this processor.
+def : WriteRes<WriteVST1, []>;
+def : WriteRes<WriteVST2, []>;
+def : WriteRes<WriteVST3, []>;
+def : WriteRes<WriteVST4, []>;
+
+def : ReadAdvance<ReadFPMUL, 1>; // mul operand read in F1
+def : ReadAdvance<ReadFPMAC, 1>; // fp-mac operand read in F1
//===----------------------------------------------------------------------===//
// Subtarget-specific SchedReadWrites.
@@ -106,6 +143,9 @@ def : ReadAdvance<R52Read_F2, 2>;
// Cortex-R52 specific SchedWrites for use with InstRW
def R52WriteMAC : SchedWriteRes<[R52UnitMAC]> { let Latency = 4; }
+def R52WriteMACHi : SchedWriteRes<[R52UnitMAC]> {
+ let Latency = 4; let NumMicroOps = 0;
+}
def R52WriteDIV : SchedWriteRes<[R52UnitDiv]> {
let Latency = 8; let ResourceCycles = [8]; // not pipelined
}
@@ -120,6 +160,19 @@ def R52WriteALU_WRI : SchedWriteRes<[R52UnitALU]> { let Latency = 4; }
def R52WriteNoRSRC_EX2 : SchedWriteRes<[]> { let Latency = 3; }
def R52WriteNoRSRC_WRI : SchedWriteRes<[]> { let Latency = 4; }
+// Alias generics to sub-target specific
+def : SchedAlias<WriteMUL16, R52WriteMAC>;
+def : SchedAlias<WriteMUL32, R52WriteMAC>;
+def : SchedAlias<WriteMUL64Lo, R52WriteMAC>;
+def : SchedAlias<WriteMUL64Hi, R52WriteMACHi>;
+def : SchedAlias<WriteMAC16, R52WriteMAC>;
+def : SchedAlias<WriteMAC32, R52WriteMAC>;
+def : SchedAlias<WriteMAC64Lo, R52WriteMAC>;
+def : SchedAlias<WriteMAC64Hi, R52WriteMACHi>;
+def : SchedAlias<WritePreLd, R52WriteLd>;
+def : SchedAlias<WriteLd, R52WriteLd>;
+def : SchedAlias<WriteST, R52WriteST>;
+
def R52WriteFPALU_F3 : SchedWriteRes<[R52UnitFPALU]> { let Latency = 4; }
def R52Write2FPALU_F3 : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
let Latency = 4;
@@ -147,19 +200,17 @@ def R52Write2FPMAC_F5 : SchedWriteRes<[R52UnitFPMUL, R52UnitFPMUL,
def R52WriteFPLd_F4 : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
def R52WriteFPST_F4 : SchedWriteRes<[R52UnitLd]> { let Latency = 5; }
-def R52WriteFPDIV_SP : SchedWriteRes<[R52UnitFPDIV]> {
- let Latency = 7; // FP div takes fixed #cycles
- let ResourceCycles = [7]; // is not pipelined
- }
-def R52WriteFPDIV_DP : SchedWriteRes<[R52UnitFPDIV]> {
- let Latency = 17;
- let ResourceCycles = [17];
-}
-
-
//===----------------------------------------------------------------------===//
-// Subtarget-specific - map operands to SchedReadWrites
+// Floating-point. Map target defined SchedReadWrites to processor specific ones
+//
+def : SchedAlias<WriteFPCVT, R52WriteFPALU_F5>;
+def : SchedAlias<WriteFPMOV, R52WriteFPALU_F3>;
+def : SchedAlias<WriteFPALU32, R52WriteFPALU_F5>;
+def : SchedAlias<WriteFPALU64, R52WriteFPALU_F5>;
+//===----------------------------------------------------------------------===//
+// Subtarget-specific overrides. Map opcodes to lists of SchedReadWrite types.
+//
def : InstRW<[WriteALU], (instrs COPY)>;
def : InstRW<[R52WriteALU_EX2, R52Read_EX1, R52Read_ISS],
@@ -235,7 +286,7 @@ def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS, R52Read_ISS],
"t2SMLSLD", "t2SMLSLDX", "t2UMAAL")>;
def : InstRW <[R52WriteDIV, R52Read_ISS, R52Read_ISS],
- (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+ (instregex "t2SDIV", "t2UDIV")>;
// Loads (except POST) with SHL > 2, or ror, require 2 extra cycles.
// However, that's non-trivial to specify, so we keep it uniform
@@ -294,15 +345,6 @@ def : InstRW<[R52WriteCC, R52Read_ISS], (instregex "TST")>;
def : InstRW<[R52WriteLd], (instregex "MRS", "MRSbanked")>;
def : InstRW<[R52WriteLd, R52Read_EX1], (instregex "MSR", "MSRbanked")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS], (instregex "^LDRB?(_PRE_IMM|_POST_IMM)", "LDRrs")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_PRE_REG", "LDRB?rr")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_POST_REG")>;
-
-//def : InstRW<[R52WriteST, R52Read_ISS], (instregex "STRi12", "PICSTR")>;
-//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_PRE_REG", "STRB?_PRE_REG")>;
-//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_POST_REG", "STRB?_POST_REG")>;
-
-
// Integer Load, Multiple.
foreach Lat = 3-25 in {
def R52WriteILDM#Lat#Cy : SchedWriteRes<[R52UnitLd]> {
@@ -492,12 +534,6 @@ def : InstRW<[R52Write2FPALU_F3, R52Read_F1, R52Read_F1], (instregex "(VACGE|VAC
def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(D|S|H|fd|hd)")>;
def : InstRW<[R52Write2FPALU_F5, R52Read_F1, R52Read_F1], (instregex "(VADD|VSUB)(fq|hq)")>;
-def : InstRW<[R52WriteFPDIV_SP, R52Read_F0, R52Read_F0], (instregex "VDIV(S|H)")>;
-def : InstRW<[R52WriteFPDIV_DP, R52Read_F0, R52Read_F0], (instregex "VDIVD")>;
-
-def : InstRW<[R52WriteFPMAC_F5, R52Read_F1, R52Read_F1, R52Read_F1],
- (instregex "(VFMA|VFMS|VFNMA|VFNMS)(D|H|S)")>;
-
def : InstRW<[R52WriteFPLd_F4, R52Read_ISS, R52Read_F1], (instregex "VLDR")>;
def : InstRW<[R52WriteFPST_F4, R52Read_ISS, R52Read_F1], (instregex "VSTR")>;
@@ -682,21 +718,24 @@ def R52WriteSTM : SchedWriteVariant<[
// Vector Load/Stores. Can issue only in slot-0. Can dual-issue with
// another instruction in slot-1, but only in the last issue.
-def R52WriteVLD1Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 5;}
-def R52WriteVLD2Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes<WriteVLD1, [R52UnitLd]> { let Latency = 5;}
+def : WriteRes<WriteVLD2, [R52UnitLd]> {
let Latency = 6;
let NumMicroOps = 3;
let ResourceCycles = [2];
+ let SingleIssue = 1;
}
-def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes<WriteVLD3, [R52UnitLd]> {
let Latency = 7;
let NumMicroOps = 5;
let ResourceCycles = [3];
+ let SingleIssue = 1;
}
-def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes<WriteVLD4, [R52UnitLd]> {
let Latency = 8;
let NumMicroOps = 7;
let ResourceCycles = [4];
+ let SingleIssue = 1;
}
def R52WriteVST1Mem : SchedWriteRes<[R52UnitLd]> {
let Latency = 5;
@@ -777,9 +816,8 @@ def : InstRW<[R52Write2FPALU_F4, R52Read_F2, R52Read_F2], (instregex "(VHADD|VHS
def : InstRW<[R52WriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VMAX", "VMIN", "VPMAX", "VPMIN")>;
-def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VMOV", "VORR", "VORN", "VREV")>;
+def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VORR", "VORN", "VREV")>;
def : InstRW<[R52WriteNoRSRC_WRI], (instregex "VMRS")>;
-def : InstRW<[R52WriteFPMUL_F5, R52Read_F1, R52Read_F1, R52Read_F1], (instregex "VMUL", "VNMUL", "VMLA")>;
def : InstRW<[R52WriteFPALU_F5, R52Read_F1], (instregex "VNEG")>;
def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADDi")>;
def : InstRW<[R52Write2FPALU_F4, R52Read_F1, R52Read_F1], (instregex "VPADAL", "VPADDL")>;
@@ -797,95 +835,6 @@ def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1], (instregex "VRSHL", "VR
def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1], (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
//---
-// VLDx. Vector Loads
-//---
-// 1-element structure load
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD1q(8|16|32|64)$")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)T$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Q$")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d64TPseudo$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d64QPseudo$")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)d(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1LNdAsm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo$")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)wb")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1q(8|16|32|64)wb")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Twb")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Qwb")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64TPseudoWB")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64QPseudoWB")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNd(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNdWB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1DUP(d|q)(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo_UPD")>;
-
-// 2-element structure load
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)Pseudo$")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)PseudoWB")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNdAsm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNqAsm_(16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2$")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo")>;
-def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNdWB_(fixed|register)_Asm_(8|16|32)")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNqWB_(fixed|register)_Asm_(16|32)")>;
-
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)wb")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2wb")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo_UPD")>;
-def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo_UPD")>;
-
-// 3-element structure load
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo")>;
-def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
-
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
-
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
-
-// 4-element structure load
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
-
-
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)Asm_(8|16|32)$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4LN(d|q)(8|16|32)Pseudo$")>;
-def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4DUPd(8|16|32)Pseudo$")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)_UPD")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>;
-def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
-
-//---
// VSTx. Vector Stores
//---
// 1-element structure store
diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
index ea2bf4b..b838688 100644
--- a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
+++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -133,6 +133,8 @@ let SchedModel = SwiftModel in {
def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>;
def : ReadAdvance<ReadALU, 0>;
def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+ def : SchedAlias<WriteLd, SwiftWriteP2ThreeCycle>;
+ def : SchedAlias<WriteST, SwiftWriteP2>;
def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[
@@ -166,10 +168,10 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftWriteP01OneCycle2x_load],
(instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
- def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
+ def SwiftWriteP0TwoCycleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
- SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+ SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCycleTwoUops ]>,
SchedVar<NoSchedPred, [ SwiftWriteP0OneCycle ]>
]>;
@@ -282,6 +284,18 @@ let SchedModel = SwiftModel in {
let ResourceCycles = [2, 3];
}
+ // Alias the generic WriteRes types to sub-target specific ones
+ def : SchedAlias<WriteMUL16, SwiftWriteP0FourCycle>;
+ def : SchedAlias<WriteMUL32, SwiftWriteP0FourCycle>;
+ def : SchedAlias<WriteMUL64Lo, SwiftP0P0P01FiveCycle>;
+ def : SchedAlias<WriteMUL64Hi, SwiftWrite5Cycle>;
+ def : SchedAlias<WriteMAC16, SwiftPredP0P01FourFiveCycle>;
+ def : SchedAlias<WriteMAC32, SwiftPredP0P01FourFiveCycle>;
+ def : SchedAlias<WriteMAC64Lo, SwiftWrite5Cycle>;
+ def : SchedAlias<WriteMAC64Hi, Swift2P03P01FiveCycle>;
+ def : ReadAdvance<ReadMUL, 0>;
+ def : SchedAlias<ReadMAC, SwiftReadAdvanceFourCyclesPred>;
+
// 4.2.15 Integer Multiply Accumulate, Long
// 4.2.16 Integer Multiply Accumulate, Dual
// 4.2.17 Integer Multiply Accumulate Accumulate, Long
@@ -300,7 +314,7 @@ let SchedModel = SwiftModel in {
let ResourceCycles = [1, 14];
}
// 4.2.18 Integer Divide
- def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+ def : WriteRes<WriteDIV, [SwiftUnitDiv]>; // Workaround.
def : InstRW <[SwiftDiv],
(instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
@@ -310,7 +324,7 @@ let SchedModel = SwiftModel in {
let Latency = 3;
let NumMicroOps = 2;
}
- def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+ def SwiftWriteP2P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
let Latency = 4;
let NumMicroOps = 2;
}
@@ -343,7 +357,7 @@ let SchedModel = SwiftModel in {
"tLDR(r|i|spi|pci|pciASM)")>;
def : InstRW<[SwiftWriteP2ThreeCycle],
(instregex "LDRH$", "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
- def : InstRW<[SwiftWriteP2P01FourCyle],
+ def : InstRW<[SwiftWriteP2P01FourCycle],
(instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
"t2LDRpci_pic", "tLDRS(B|H)")>;
def : InstRW<[SwiftWriteP2P01ThreeCycle, SwiftWrBackOne],
@@ -597,8 +611,6 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftWriteP1FourCycle],
(instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH",
"VMULL", "VQDMULL")>;
- def : InstRW<[SwiftWriteP1SixCycle],
- (instregex "VMULD", "VNMULD")>;
def : InstRW<[SwiftWriteP1FourCycle],
(instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)",
"VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>;
@@ -607,8 +619,6 @@ let SchedModel = SwiftModel in {
// 4.2.36 Advanced SIMD and VFP, Convert
def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>;
- // Fixpoint conversions.
- def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; }
// 4.2.37 Advanced SIMD and VFP, Move
def : InstRW<[SwiftWriteP0TwoCycle],
@@ -1036,6 +1046,40 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>;
def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>;
+ // ===---------------------------------------------------------------------===//
+ // Floating-point. Map target defined SchedReadWrite to processor specific ones
+ //
+ def : SchedAlias<WriteFPCVT, SwiftWriteP1FourCycle>;
+ def : SchedAlias<WriteFPMOV, SwiftWriteP2ThreeCycle>;
+
+ def : SchedAlias<WriteFPALU32, SwiftWriteP0FourCycle>;
+ def : SchedAlias<WriteFPALU64, SwiftWriteP0SixCycle>;
+
+ def : SchedAlias<WriteFPMUL32, SwiftWriteP1FourCycle>;
+ def : SchedAlias<WriteFPMUL64, SwiftWriteP1SixCycle>;
+
+ def : SchedAlias<WriteFPMAC32, SwiftWriteP1FourCycle>;
+ def : SchedAlias<WriteFPMAC64, SwiftWriteP1FourCycle>;
+
+ def : SchedAlias<WriteFPDIV32, SwiftDiv17>;
+ def : SchedAlias<WriteFPSQRT32, SwiftDiv17>;
+
+ def : SchedAlias<WriteFPDIV64, SwiftDiv32>;
+ def : SchedAlias<WriteFPSQRT64, SwiftDiv32>;
+
+ def : ReadAdvance<ReadFPMUL, 0>;
+ def : ReadAdvance<ReadFPMAC, 0>;
+
+ // Overridden via InstRW for this processor.
+ def : WriteRes<WriteVLD1, []>;
+ def : WriteRes<WriteVLD2, []>;
+ def : WriteRes<WriteVLD3, []>;
+ def : WriteRes<WriteVLD4, []>;
+ def : WriteRes<WriteVST1, []>;
+ def : WriteRes<WriteVST2, []>;
+ def : WriteRes<WriteVST3, []>;
+ def : WriteRes<WriteVST4, []>;
+
// Not specified.
def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
// Preload.
diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 3b99762..33dcf9b 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -95,7 +95,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
Entry.Node = Src;
Entry.Ty = Type::getInt32Ty(*DAG.getContext());
- Entry.isSExt = false;
+ Entry.IsSExt = false;
Args.push_back(Entry);
} else {
Entry.Node = Src;
@@ -114,11 +114,11 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(
- TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
- TLI->getPointerTy(DAG.getDataLayout())),
- std::move(Args))
+ .setLibCallee(
+ TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
+ TLI->getPointerTy(DAG.getDataLayout())),
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
@@ -198,17 +198,18 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
return Chain;
// Issue loads / stores for the trailing (1 - 3) bytes.
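+ // Factor the remaining-bytes type/size choice into lambdas so the load
+ // loop and the store loop below cannot drift apart.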
+ auto getRemainingValueType = [](unsigned BytesLeft) {
+ return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
+ };
+ auto getRemainingSize = [](unsigned BytesLeft) {
+ return (BytesLeft >= 2) ? 2 : 1;
+ };
+
unsigned BytesLeftSave = BytesLeft;
i = 0;
while (BytesLeft) {
- if (BytesLeft >= 2) {
- VT = MVT::i16;
- VTSize = 2;
- } else {
- VT = MVT::i8;
- VTSize = 1;
- }
-
+ VT = getRemainingValueType(BytesLeft);
+ VTSize = getRemainingSize(BytesLeft);
Loads[i] = DAG.getLoad(VT, dl, Chain,
DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
DAG.getConstant(SrcOff, dl, MVT::i32)),
@@ -224,14 +225,8 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
i = 0;
BytesLeft = BytesLeftSave;
while (BytesLeft) {
- if (BytesLeft >= 2) {
- VT = MVT::i16;
- VTSize = 2;
- } else {
- VT = MVT::i8;
- VTSize = 1;
- }
-
+ VT = getRemainingValueType(BytesLeft);
+ VTSize = getRemainingSize(BytesLeft);
TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
DAG.getConstant(DstOff, dl, MVT::i32)),
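
The two lambdas introduced in this hunk make the i16/i8 selection for the trailing bytes explicit and share it between the load and store loops. A minimal standalone sketch of the same walk (hypothetical helper, not part of the patch):

    // Sketch: cover the 1-3 trailing bytes of a memcpy with at most one
    // 2-byte and one 1-byte access, mirroring getRemainingValueType and
    // getRemainingSize above.
    #include <cstdio>

    static void coverTrailingBytes(unsigned BytesLeft) {
      unsigned Off = 0;
      while (BytesLeft) {
        unsigned VTSize = (BytesLeft >= 2) ? 2 : 1; // MVT::i16 vs. MVT::i8
        std::printf("emit %u-byte load/store at offset %u\n", VTSize, Off);
        Off += VTSize;
        BytesLeft -= VTSize;
      }
    }

    int main() { coverTrailingBytes(3); } // one 2-byte access, then 1-byte
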
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
index e2df0bd..2c42a13 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -11,27 +11,43 @@
//
//===----------------------------------------------------------------------===//
+#include "ARM.h"
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "ARMCallLowering.h"
+#include "ARMLegalizerInfo.h"
+#include "ARMRegisterBankInfo.h"
+#endif
#include "ARMSubtarget.h"
#include "ARMFrameLowering.h"
-#include "ARMISelLowering.h"
#include "ARMInstrInfo.h"
-#include "ARMMachineFunctionInfo.h"
-#include "ARMSelectionDAGInfo.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "Thumb1FrameLowering.h"
#include "Thumb1InstrInfo.h"
#include "Thumb2InstrInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Attributes.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#endif
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/TargetParser.h"
+#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <string>
using namespace llvm;
@@ -76,11 +92,6 @@ ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU,
return *this;
}
-/// EnableExecuteOnly - Enables the generation of execute-only code on supported
-/// targets
-static cl::opt<bool>
-EnableExecuteOnly("arm-execute-only");
-
ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
StringRef FS) {
ARMSubtarget &STI = initializeSubtargetDependencies(CPU, FS);
@@ -90,13 +101,41 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU,
return new ARMFrameLowering(STI);
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct ARMGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const ARMBaseTargetMachine &TM, bool IsLittle)
: ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps),
- GenExecuteOnly(EnableExecuteOnly), CPUString(CPU), IsLittle(IsLittle),
- TargetTriple(TT), Options(TM.Options), TM(TM),
- FrameLowering(initializeFrameLowering(CPU, FS)),
+ CPUString(CPU), IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options),
+ TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)),
// At this point initializeSubtargetDependencies has been called so
// we can query directly.
InstrInfo(isThumb1Only()
@@ -104,7 +143,29 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU,
: !isThumb()
? (ARMBaseInstrInfo *)new ARMInstrInfo(*this)
: (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)),
- TLInfo(TM, *this), GISel() {}
+ TLInfo(TM, *this) {
+ assert((isThumb() || hasARMOps()) &&
+ "Target must either be thumb or support ARM operations!");
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(new ARMCallLowering(*getTargetLowering()));
+ GISel->Legalizer.reset(new ARMLegalizerInfo(*this));
+
+ auto *RBI = new ARMRegisterBankInfo(*getRegisterInfo());
+
+ // FIXME: At this point, we can't rely on Subtarget having RBI.
+ // It's awkward to mix passing RBI and the Subtarget; should we pass
+ // TII/TRI as well?
+ GISel->InstSelector.reset(createARMInstructionSelector(
+ *static_cast<const ARMBaseTargetMachine *>(&TM), *this, *RBI));
+
+ GISel->RegBankInfo.reset(RBI);
+#endif
+ setGISelAccessor(*GISel);
+}
const CallLowering *ARMSubtarget::getCallLowering() const {
assert(GISel && "Access to GlobalISel APIs not set");
@@ -148,11 +209,11 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (isTargetDarwin()) {
StringRef ArchName = TargetTriple.getArchName();
- unsigned ArchKind = llvm::ARM::parseArch(ArchName);
- if (ArchKind == llvm::ARM::AK_ARMV7S)
+ unsigned ArchKind = ARM::parseArch(ArchName);
+ if (ArchKind == ARM::AK_ARMV7S)
// Default to the Swift CPU when targeting armv7s/thumbv7s.
CPUString = "swift";
- else if (ArchKind == llvm::ARM::AK_ARMV7K)
+ else if (ArchKind == ARM::AK_ARMV7K)
// Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k.
// ARMv7k does not use SjLj exception handling.
CPUString = "cortex-a7";
@@ -200,12 +261,12 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// support in the assembler and linker to be used. This would need to be
// fixed to fully support tail calls in Thumb1.
//
- // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
- // LR. This means if we need to reload LR, it takes an extra instructions,
- // which outweighs the value of the tail call; but here we don't know yet
- // whether LR is going to be used. Probably the right approach is to
- // generate the tail call here and turn it back into CALL/RET in
- // emitEpilogue if LR is used.
+ // For ARMv8-M, we /do/ implement tail calls. Doing this is tricky for v8-M
+ // baseline, since the LDM/POP instruction on Thumb doesn't take LR. This
+ // means if we need to reload LR, it takes extra instructions, which outweighs
+ // the value of the tail call; but here we don't know yet whether LR is going
+ // to be used. We generate the tail call here and turn it back into CALL/RET
+ // in emitEpilogue if LR is used.
// Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
// but we need to make sure there are enough registers; the only valid
@@ -274,6 +335,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexM3:
case ExynosM1:
case CortexR52:
+ case Kryo:
break;
case Krait:
PreISelOperandLatencyAdjustment = 1;
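
The hunks above move GlobalISel accessor construction out of the target machine and into the ARMSubtarget constructor. A minimal sketch of the accessor pattern itself, with stand-in types rather than the LLVM classes:

    // Sketch: a base class whose getters return null (used when GlobalISel is
    // not built in) and a concrete subclass that owns the per-subtarget
    // objects, as ARMGISelActualAccessor does above.
    #include <memory>

    struct CallLowering {};   // stand-ins for the GlobalISel classes
    struct LegalizerInfo {};

    struct GISelAccessor {
      virtual ~GISelAccessor() = default;
      virtual const CallLowering *getCallLowering() const { return nullptr; }
      virtual const LegalizerInfo *getLegalizerInfo() const { return nullptr; }
    };

    struct ActualAccessor : GISelAccessor {
      std::unique_ptr<CallLowering> CL;
      std::unique_ptr<LegalizerInfo> LI;
      const CallLowering *getCallLowering() const override { return CL.get(); }
      const LegalizerInfo *getLegalizerInfo() const override { return LI.get(); }
    };

    int main() {
      ActualAccessor A;
      A.CL = std::make_unique<CallLowering>();
      return A.getCallLowering() != nullptr ? 0 : 1;
    }
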
diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
index 8c8218d..e15b175 100644
--- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -14,47 +14,93 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
#define LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
-
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMFrameLowering.h"
#include "ARMISelLowering.h"
-#include "ARMInstrInfo.h"
#include "ARMSelectionDAGInfo.h"
-#include "ARMSubtarget.h"
-#include "MCTargetDesc/ARMMCTargetDesc.h"
-#include "Thumb1FrameLowering.h"
-#include "Thumb1InstrInfo.h"
-#include "Thumb2InstrInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <memory>
#include <string>
#define GET_SUBTARGETINFO_HEADER
#include "ARMGenSubtargetInfo.inc"
namespace llvm {
+
+class ARMBaseTargetMachine;
class GlobalValue;
class StringRef;
-class TargetOptions;
-class ARMBaseTargetMachine;
class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
- Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
- CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexR52, CortexM3,
- CortexA32, CortexA35, CortexA53, CortexA57, CortexA72, CortexA73,
- Krait, Swift, ExynosM1
+ Others,
+
+ CortexA12,
+ CortexA15,
+ CortexA17,
+ CortexA32,
+ CortexA35,
+ CortexA5,
+ CortexA53,
+ CortexA57,
+ CortexA7,
+ CortexA72,
+ CortexA73,
+ CortexA8,
+ CortexA9,
+ CortexM3,
+ CortexR4,
+ CortexR4F,
+ CortexR5,
+ CortexR52,
+ CortexR7,
+ ExynosM1,
+ Krait,
+ Kryo,
+ Swift
};
enum ARMProcClassEnum {
- None, AClass, RClass, MClass
+ None,
+
+ AClass,
+ MClass,
+ RClass
};
enum ARMArchEnum {
- ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
- ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
- ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline,
+ ARMv2,
+ ARMv2a,
+ ARMv3,
+ ARMv3m,
+ ARMv4,
+ ARMv4t,
+ ARMv5,
+ ARMv5t,
+ ARMv5te,
+ ARMv5tej,
+ ARMv6,
+ ARMv6k,
+ ARMv6kz,
+ ARMv6m,
+ ARMv6sm,
+ ARMv6t2,
+ ARMv7a,
+ ARMv7em,
+ ARMv7m,
+ ARMv7r,
+ ARMv7ve,
+ ARMv81a,
+ ARMv82a,
+ ARMv8a,
+ ARMv8mBaseline,
+ ARMv8mMainline,
ARMv8r
};
@@ -162,16 +208,12 @@ protected:
/// FP registers for VFPv3.
bool HasD16 = false;
- /// HasHardwareDivide - True if subtarget supports [su]div
- bool HasHardwareDivide = false;
+ /// HasHardwareDivide - True if subtarget supports [su]div in Thumb mode
+ bool HasHardwareDivideInThumb = false;
/// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
bool HasHardwareDivideInARM = false;
- /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
- /// instructions.
- bool HasT2ExtractPack = false;
-
/// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
/// instructions.
bool HasDataBarrier = false;
@@ -192,6 +234,10 @@ protected:
/// CPSR setting instruction.
bool AvoidCPSRPartialUpdate = false;
+ /// CheapPredicableCPSRDef - If true, disable +1 predication cost
+ /// for instructions updating CPSR. Enabled for Cortex-A57.
+ bool CheapPredicableCPSRDef = false;
+
/// AvoidMOVsShifterOperand - If true, codegen should avoid using flag setting
/// movs with shifter operand (i.e. asr, lsl, lsr).
bool AvoidMOVsShifterOperand = false;
@@ -200,6 +246,11 @@ protected:
/// avoid issue "normal" call instructions to callees which do not return.
bool HasRetAddrStack = false;
+ /// HasBranchPredictor - True if the subtarget has a branch predictor. Having
+ /// a branch predictor or not changes the expected cost of taking a branch
+ /// which affects the choice of whether to use predicated instructions.
+ bool HasBranchPredictor = true;
+
/// HasMPExtension - True if the subtarget supports Multiprocessing
/// extension (ARMv7 only).
bool HasMPExtension = false;
@@ -239,6 +290,10 @@ protected:
/// HasFPAO - if true, the processor does positive address offset computation faster
bool HasFPAO = false;
+ /// HasFuseAES - if true, the processor executes back-to-back AES instruction
+ /// pairs faster.
+ bool HasFuseAES = false;
+
/// If true, if conversion may decide to leave some instructions unpredicated.
bool IsProfitableToUnpredicate = false;
@@ -310,6 +365,10 @@ protected:
/// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
bool UseSjLjEH = false;
+ /// Implicitly convert an instruction to a different one if its immediates
+ /// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1.
+ bool NegativeImmediates = true;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment = 4;
@@ -362,6 +421,7 @@ public:
unsigned getMaxInlineSizeThreshold() const {
return 64;
}
+
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -373,15 +433,19 @@ public:
const ARMSelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
+
const ARMBaseInstrInfo *getInstrInfo() const override {
return InstrInfo.get();
}
+
const ARMTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
+
const ARMFrameLowering *getFrameLowering() const override {
return FrameLowering.get();
}
+
const ARMBaseRegisterInfo *getRegisterInfo() const override {
return &InstrInfo->getRegisterInfo();
}
@@ -451,19 +515,21 @@ public:
bool hasCRC() const { return HasCRC; }
bool hasRAS() const { return HasRAS; }
bool hasVirtualization() const { return HasVirtualization; }
+
bool useNEONForSinglePrecisionFP() const {
return hasNEON() && UseNEONForSinglePrecisionFP;
}
- bool hasDivide() const { return HasHardwareDivide; }
+ bool hasDivideInThumbMode() const { return HasHardwareDivideInThumb; }
bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
- bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
bool hasV7Clrex() const { return HasV7Clrex; }
bool hasAcquireRelease() const { return HasAcquireRelease; }
+
bool hasAnyDataBarrier() const {
return HasDataBarrier || (hasV6Ops() && !isThumb());
}
+
bool useMulOps() const { return UseMulOps; }
bool useFPVMLx() const { return !SlowFPVMLx; }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
@@ -490,8 +556,10 @@ public:
bool nonpipelinedVFP() const { return NonpipelinedVFP; }
bool prefers32BitThumb() const { return Pref32BitThumb; }
bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
+ bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
bool hasRetAddrStack() const { return HasRetAddrStack; }
+ bool hasBranchPredictor() const { return HasBranchPredictor; }
bool hasMPExtension() const { return HasMPExtension; }
bool hasDSP() const { return HasDSP; }
bool useNaClTrap() const { return UseNaClTrap; }
@@ -503,6 +571,10 @@ public:
bool hasD16() const { return HasD16; }
bool hasFullFP16() const { return HasFullFP16; }
+ bool hasFuseAES() const { return HasFuseAES; }
+ /// \brief Return true if the CPU supports any kind of instruction fusion.
+ bool hasFusion() const { return hasFuseAES(); }
+
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
@@ -561,9 +633,10 @@ public:
TargetTriple.getEnvironment() == Triple::EABIHF ||
isTargetWindows() || isAAPCS16_ABI();
}
+
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
- virtual bool isXRaySupported() const override;
+ bool isXRaySupported() const override;
bool isAPCS_ABI() const;
bool isAAPCS_ABI() const;
@@ -588,6 +661,7 @@ public:
bool useR7AsFramePointer() const {
return isTargetDarwin() || (!isTargetWindows() && isThumb());
}
+
/// Returns true if the frame setup is split into two separate pushes (first
/// r0-r7,lr then r8-r11), principally so that the frame pointer is adjacent
/// to lr. This is always required on Thumb1-only targets, as the push and
@@ -656,6 +730,7 @@ public:
/// True if fast-isel is used.
bool useFastISel() const;
};
-} // End llvm namespace
-#endif // ARMSUBTARGET_H
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_ARM_ARMSUBTARGET_H
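
hasFusion() above is introduced as an umbrella over hasFuseAES(), and the scheduler hooks in the ARMTargetMachine.cpp diff further down gate their macro-fusion DAG mutation on it. A minimal sketch of the aggregation (stand-in names, not the LLVM API):

    // Sketch: one umbrella query over the individual fusion features; new
    // fusion kinds would be OR'ed into hasFusion() as they are added.
    struct SubtargetSketch {
      bool HasFuseAES = false;
      bool hasFuseAES() const { return HasFuseAES; }
      bool hasFusion() const { return hasFuseAES(); }
    };

    int main() {
      SubtargetSketch ST;
      ST.HasFuseAES = true;
      // A scheduler would add the fusion DAG mutation only when this is true.
      return ST.hasFusion() ? 0 : 1;
    }
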
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 70c9567..c323a1d 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -10,30 +10,47 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMTargetMachine.h"
#include "ARM.h"
-#include "ARMCallLowering.h"
-#include "ARMFrameLowering.h"
-#include "ARMInstructionSelector.h"
-#include "ARMLegalizerInfo.h"
-#include "ARMRegisterBankInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMMacroFusion.h"
+#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "ARMTargetTransformInfo.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
using namespace llvm;
static cl::opt<bool>
@@ -57,91 +74,57 @@ static cl::opt<cl::boolOrDefault>
EnableGlobalMerge("arm-global-merge", cl::Hidden,
cl::desc("Enable the global merge pass"));
+namespace llvm {
+ void initializeARMExecutionDepsFixPass(PassRegistry&);
+}
+
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
+ RegisterTargetMachine<ARMLETargetMachine> A(getTheThumbLETarget());
RegisterTargetMachine<ARMBETargetMachine> Y(getTheARMBETarget());
- RegisterTargetMachine<ThumbLETargetMachine> A(getTheThumbLETarget());
- RegisterTargetMachine<ThumbBETargetMachine> B(getTheThumbBETarget());
+ RegisterTargetMachine<ARMBETargetMachine> B(getTheThumbBETarget());
PassRegistry &Registry = *PassRegistry::getPassRegistry();
initializeGlobalISel(Registry);
initializeARMLoadStoreOptPass(Registry);
initializeARMPreAllocLoadStoreOptPass(Registry);
+ initializeARMConstantIslandsPass(Registry);
+ initializeARMExecutionDepsFixPass(Registry);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
- return make_unique<TargetLoweringObjectFileMachO>();
+ return llvm::make_unique<TargetLoweringObjectFileMachO>();
if (TT.isOSWindows())
- return make_unique<TargetLoweringObjectFileCOFF>();
- return make_unique<ARMElfTargetObjectFile>();
+ return llvm::make_unique<TargetLoweringObjectFileCOFF>();
+ return llvm::make_unique<ARMElfTargetObjectFile>();
}
static ARMBaseTargetMachine::ARMABI
computeTargetABI(const Triple &TT, StringRef CPU,
const TargetOptions &Options) {
- if (Options.MCOptions.getABIName() == "aapcs16")
+ StringRef ABIName = Options.MCOptions.getABIName();
+
+ if (ABIName.empty())
+ ABIName = ARM::computeDefaultTargetABI(TT, CPU);
+
+ if (ABIName == "aapcs16")
return ARMBaseTargetMachine::ARM_ABI_AAPCS16;
- else if (Options.MCOptions.getABIName().startswith("aapcs"))
+ else if (ABIName.startswith("aapcs"))
return ARMBaseTargetMachine::ARM_ABI_AAPCS;
- else if (Options.MCOptions.getABIName().startswith("apcs"))
+ else if (ABIName.startswith("apcs"))
return ARMBaseTargetMachine::ARM_ABI_APCS;
- assert(Options.MCOptions.getABIName().empty() &&
- "Unknown target-abi option!");
-
- ARMBaseTargetMachine::ARMABI TargetABI =
- ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
-
- unsigned ArchKind = llvm::ARM::parseCPUArch(CPU);
- StringRef ArchName = llvm::ARM::getArchName(ArchKind);
- // FIXME: This is duplicated code from the front end and should be unified.
- if (TT.isOSBinFormatMachO()) {
- if (TT.getEnvironment() == llvm::Triple::EABI ||
- (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) ||
- llvm::ARM::parseArchProfile(ArchName) == llvm::ARM::PK_M) {
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- } else if (TT.isWatchABI()) {
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16;
- } else {
- TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
- }
- } else if (TT.isOSWindows()) {
- // FIXME: this is invalid for WindowsCE
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- } else {
- // Select the default based on the platform.
- switch (TT.getEnvironment()) {
- case llvm::Triple::Android:
- case llvm::Triple::GNUEABI:
- case llvm::Triple::GNUEABIHF:
- case llvm::Triple::MuslEABI:
- case llvm::Triple::MuslEABIHF:
- case llvm::Triple::EABIHF:
- case llvm::Triple::EABI:
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- break;
- case llvm::Triple::GNU:
- TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
- break;
- default:
- if (TT.isOSNetBSD())
- TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS;
- else
- TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS;
- break;
- }
- }
-
- return TargetABI;
+ llvm_unreachable("Unhandled/unknown ABI Name!");
+ return ARMBaseTargetMachine::ARM_ABI_UNKNOWN;
}
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
const TargetOptions &Options,
bool isLittle) {
auto ABI = computeTargetABI(TT, CPU, Options);
- std::string Ret = "";
+ std::string Ret;
if (isLittle)
// Little endian.
@@ -219,49 +202,38 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
CPU, FS, Options, getEffectiveRelocModel(TT, RM), CM,
OL),
TargetABI(computeTargetABI(TT, CPU, Options)),
- TLOF(createTLOF(getTargetTriple())),
- Subtarget(TT, CPU, FS, *this, isLittle), isLittle(isLittle) {
+ TLOF(createTLOF(getTargetTriple())), isLittle(isLittle) {
// Default to triple-appropriate float ABI
- if (Options.FloatABIType == FloatABI::Default)
- this->Options.FloatABIType =
- Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft;
+ if (Options.FloatABIType == FloatABI::Default) {
+ if (TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF ||
+ TargetTriple.getEnvironment() == Triple::EABIHF ||
+ TargetTriple.isOSWindows() ||
+ TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16)
+ this->Options.FloatABIType = FloatABI::Hard;
+ else
+ this->Options.FloatABIType = FloatABI::Soft;
+ }
// Default to triple-appropriate EABI
if (Options.EABIVersion == EABI::Default ||
Options.EABIVersion == EABI::Unknown) {
// musl is compatible with glibc with regard to EABI version
- if (Subtarget.isTargetGNUAEABI() || Subtarget.isTargetMuslAEABI())
+ if ((TargetTriple.getEnvironment() == Triple::GNUEABI ||
+ TargetTriple.getEnvironment() == Triple::GNUEABIHF ||
+ TargetTriple.getEnvironment() == Triple::MuslEABI ||
+ TargetTriple.getEnvironment() == Triple::MuslEABIHF) &&
+ !(TargetTriple.isOSWindows() || TargetTriple.isOSDarwin()))
this->Options.EABIVersion = EABI::GNU;
else
this->Options.EABIVersion = EABI::EABI5;
}
-}
-ARMBaseTargetMachine::~ARMBaseTargetMachine() {}
+ initAsmInfo();
+}
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-struct ARMGISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-} // End anonymous namespace.
-#endif
+ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
const ARMSubtarget *
ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
@@ -294,24 +266,6 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
// function that reside in TargetOptions.
resetTargetOptions(F);
I = llvm::make_unique<ARMSubtarget>(TargetTriple, CPU, FS, *this, isLittle);
-
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
- GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering()));
- GISel->Legalizer.reset(new ARMLegalizerInfo());
-
- auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo());
-
- // FIXME: At this point, we can't rely on Subtarget having RBI.
- // It's awkward to mix passing RBI and the Subtarget; should we pass
- // TII/TRI as well?
- GISel->InstSelector.reset(new ARMInstructionSelector(*I, *RBI));
-
- GISel->RegBankInfo.reset(RBI);
-#endif
- I->setGISelAccessor(*GISel);
}
return I.get();
}
@@ -322,22 +276,6 @@ TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
});
}
-void ARMTargetMachine::anchor() {}
-
-ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL,
- bool isLittle)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
- initAsmInfo();
- if (!Subtarget.hasARMOps())
- report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
- "support ARM mode execution!");
-}
-
-void ARMLETargetMachine::anchor() {}
ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -345,9 +283,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OL)
- : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-
-void ARMBETargetMachine::anchor() {}
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -355,51 +291,40 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT,
Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OL)
- : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-
-void ThumbTargetMachine::anchor() {}
-
-ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle)
- : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) {
- initAsmInfo();
-}
-
-void ThumbLETargetMachine::anchor() {}
-
-ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
-
-void ThumbBETargetMachine::anchor() {}
-
-ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
+ : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
namespace {
+
/// ARM Code Generator Pass Configuration Options.
class ARMPassConfig : public TargetPassConfig {
public:
- ARMPassConfig(ARMBaseTargetMachine *TM, PassManagerBase &PM)
+ ARMPassConfig(ARMBaseTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
ARMBaseTargetMachine &getARMTargetMachine() const {
return getTM<ARMBaseTargetMachine>();
}
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+ // Add DAG mutations here.
+ const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
+ if (ST.hasFusion())
+ DAG->addMutation(createARMMacroFusionDAGMutation());
+ return DAG;
+ }
+
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ // Add DAG mutations here.
+ const ARMSubtarget &ST = C->MF->getSubtarget<ARMSubtarget>();
+ if (ST.hasFusion())
+ DAG->addMutation(createARMMacroFusionDAGMutation());
+ return DAG;
+ }
+
void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
@@ -413,17 +338,31 @@ public:
void addPreSched2() override;
void addPreEmitPass() override;
};
-} // namespace
+
+class ARMExecutionDepsFix : public ExecutionDepsFix {
+public:
+ static char ID;
+ ARMExecutionDepsFix() : ExecutionDepsFix(ID, ARM::DPRRegClass) {}
+ StringRef getPassName() const override {
+ return "ARM Execution Dependency Fix";
+ }
+};
+char ARMExecutionDepsFix::ID;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix",
+ "ARM Execution Dependency Fix", false, false)
TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new ARMPassConfig(this, PM);
+ return new ARMPassConfig(*this, PM);
}
void ARMPassConfig::addIRPasses() {
if (TM->Options.ThreadModel == ThreadModel::Single)
addPass(createLowerAtomicPass());
else
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
@@ -438,7 +377,7 @@ void ARMPassConfig::addIRPasses() {
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
}
bool ARMPassConfig::addPreISel() {
@@ -508,7 +447,7 @@ void ARMPassConfig::addPreSched2() {
if (EnableARMLoadStoreOpt)
addPass(createARMLoadStoreOptimizationPass());
- addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+ addPass(new ARMExecutionDepsFix());
}
// Expand some pseudo instructions into multiple instructions to allow
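
With the default ABI name resolved up front via ARM::computeDefaultTargetABI, computeTargetABI above reduces to string dispatch. A sketch of that dispatch under the assumption that the name is already non-empty (stand-in names, not the LLVM API):

    // Sketch of the reduced dispatch: once ABIName has been defaulted, only
    // the string prefix decides the enum. "aapcs16" must be tested before the
    // "aapcs" prefix, exactly as in the patch.
    #include <cassert>
    #include <string>

    enum ARMABI { ARM_ABI_APCS, ARM_ABI_AAPCS, ARM_ABI_AAPCS16 };

    static ARMABI dispatchABI(const std::string &ABIName) {
      assert(!ABIName.empty() && "caller must resolve the default first");
      if (ABIName == "aapcs16")
        return ARM_ABI_AAPCS16;
      if (ABIName.compare(0, 5, "aapcs") == 0)
        return ARM_ABI_AAPCS;
      if (ABIName.compare(0, 4, "apcs") == 0)
        return ARM_ABI_APCS;
      assert(false && "unhandled ABI name"); // llvm_unreachable in the patch
      return ARM_ABI_AAPCS;
    }

    int main() { return dispatchABI("aapcs-linux") == ARM_ABI_AAPCS ? 0 : 1; }
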
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
index c6b70b9..22ce949 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h
@@ -14,10 +14,14 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
#define LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
-#include "ARMInstrInfo.h"
#include "ARMSubtarget.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
@@ -32,7 +36,6 @@ public:
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- ARMSubtarget Subtarget;
bool isLittle;
mutable StringMap<std::unique_ptr<ARMSubtarget>> SubtargetMap;
@@ -43,8 +46,10 @@ public:
CodeGenOpt::Level OL, bool isLittle);
~ARMBaseTargetMachine() override;
- const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
const ARMSubtarget *getSubtargetImpl(const Function &F) const override;
+ // The no-argument getSubtargetImpl, while it exists on some targets, is
+ // deprecated and should not be used.
+ const ARMSubtarget *getSubtargetImpl() const = delete;
bool isLittleEndian() const { return isLittle; }
/// \brief Get the TargetIRAnalysis for this target.
@@ -56,23 +61,15 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
-};
-/// ARM target machine.
-///
-class ARMTargetMachine : public ARMBaseTargetMachine {
- virtual void anchor();
- public:
- ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle);
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
-/// ARM little endian target machine.
+/// ARM/Thumb little endian target machine.
///
-class ARMLETargetMachine : public ARMTargetMachine {
- void anchor() override;
+class ARMLETargetMachine : public ARMBaseTargetMachine {
public:
ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -80,10 +77,9 @@ public:
CodeGenOpt::Level OL);
};
-/// ARM big endian target machine.
+/// ARM/Thumb big endian target machine.
///
-class ARMBETargetMachine : public ARMTargetMachine {
- void anchor() override;
+class ARMBETargetMachine : public ARMBaseTargetMachine {
public:
ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -91,41 +87,6 @@ public:
CodeGenOpt::Level OL);
};
-/// Thumb target machine.
-/// Due to the way architectures are handled, this represents both
-/// Thumb-1 and Thumb-2.
-///
-class ThumbTargetMachine : public ARMBaseTargetMachine {
- virtual void anchor();
-public:
- ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL, bool isLittle);
-};
-
-/// Thumb little endian target machine.
-///
-class ThumbLETargetMachine : public ThumbTargetMachine {
- void anchor() override;
-public:
- ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-
-/// Thumb big endian target machine.
-///
-class ThumbBETargetMachine : public ThumbTargetMachine {
- void anchor() override;
-public:
- ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H
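
Deleting the no-argument getSubtargetImpl overload, as the header above does, turns any remaining module-wide subtarget query into a compile error rather than a silent fallback. A minimal sketch of the idiom (stand-in types):

    // Sketch: keep the per-function overload, delete the deprecated one.
    struct Function {};
    struct Subtarget {};

    struct TargetMachineSketch {
      const Subtarget *getSubtargetImpl(const Function &) const { return &ST; }
      const Subtarget *getSubtargetImpl() const = delete; // no module-wide query
      Subtarget ST;
    };

    int main() {
      TargetMachineSketch TM;
      Function F;
      (void)TM.getSubtargetImpl(F);   // OK
      // (void)TM.getSubtargetImpl(); // would fail to compile
    }
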
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 625c428..88bab64 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -8,16 +8,19 @@
//===----------------------------------------------------------------------===//
#include "ARMTargetObjectFile.h"
+#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/Mangler.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
using namespace llvm;
using namespace dwarf;
@@ -27,9 +30,9 @@ using namespace dwarf;
void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
- const ARMTargetMachine &ARM_TM = static_cast<const ARMTargetMachine &>(TM);
- bool isAAPCS_ABI = ARM_TM.TargetABI == ARMTargetMachine::ARMABI::ARM_ABI_AAPCS;
- genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
+ const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM);
+ bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS;
+ // genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly();
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(isAAPCS_ABI);
@@ -40,16 +43,6 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
AttributesSection =
getContext().getELFSection(".ARM.attributes", ELF::SHT_ARM_ATTRIBUTES, 0);
-
- // Make code section unreadable when in execute-only mode
- if (genExecuteOnly) {
- unsigned Type = ELF::SHT_PROGBITS;
- unsigned Flags = ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_ARM_PURECODE;
- // Since we cannot modify flags for an existing section, we create a new
- // section with the right flags, and use 0 as the unique ID for
- // execute-only text
- TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U);
- }
}
const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
@@ -71,21 +64,27 @@ getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
getContext());
}
-MCSection *
-ARMElfTargetObjectFile::getExplicitSectionGlobal(const GlobalObject *GO,
- SectionKind SK, const TargetMachine &TM) const {
+static bool isExecuteOnlyFunction(const GlobalObject *GO, SectionKind SK,
+ const TargetMachine &TM) {
+ if (const Function *F = dyn_cast<Function>(GO))
+ if (TM.getSubtarget<ARMSubtarget>(*F).genExecuteOnly() && SK.isText())
+ return true;
+ return false;
+}
+
+MCSection *ARMElfTargetObjectFile::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
// Set execute-only access for the explicit section
- if (genExecuteOnly && SK.isText())
+ if (isExecuteOnlyFunction(GO, SK, TM))
SK = SectionKind::getExecuteOnly();
return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
}
-MCSection *
-ARMElfTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
- SectionKind SK, const TargetMachine &TM) const {
+MCSection *ARMElfTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
// Place the global in the execute-only text section
- if (genExecuteOnly && SK.isText())
+ if (isExecuteOnlyFunction(GO, SK, TM))
SK = SectionKind::getExecuteOnly();
return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, SK, TM);
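
The rewrite above drops the cached, module-wide genExecuteOnly flag in favor of querying the per-function subtarget when a section is chosen. A simplified sketch of that decision (stand-in types, not the LLVM API):

    // Sketch: only text sections of functions whose subtarget enables
    // execute-only get retagged (an SHF_ARM_PURECODE section in the patch).
    struct Function { bool GenExecuteOnly; };

    enum class SectionKind { Text, Data, ExecuteOnly };

    static SectionKind classify(const Function &F, SectionKind SK) {
      if (SK == SectionKind::Text && F.GenExecuteOnly)
        return SectionKind::ExecuteOnly;
      return SK;
    }

    int main() {
      Function F{true};
      return classify(F, SectionKind::Text) == SectionKind::ExecuteOnly ? 0 : 1;
    }
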
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
index 24e755d..bd7aa1c 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -11,19 +11,17 @@
#define LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/MC/MCExpr.h"
namespace llvm {
-class MCContext;
-class TargetMachine;
-
class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
- mutable bool genExecuteOnly = false;
protected:
- const MCSection *AttributesSection;
+ const MCSection *AttributesSection = nullptr;
+
public:
ARMElfTargetObjectFile()
- : TargetLoweringObjectFileELF(), AttributesSection(nullptr) {
+ : TargetLoweringObjectFileELF() {
PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31;
}
@@ -47,4 +45,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_ARMTARGETOBJECTFILE_H
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2b6b36b..51b0fed 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -15,6 +15,24 @@ using namespace llvm;
#define DEBUG_TYPE "armtti"
+bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // To inline a callee, all features not in the whitelist must match exactly.
+ bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
+ (CalleeBits & ~InlineFeatureWhitelist);
+ // For features in the whitelist, the callee's features must be a subset of
+ // the caller's.
+ bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
+ (CalleeBits & InlineFeatureWhitelist);
+ return MatchExact && MatchSubset;
+}
+
int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
@@ -92,7 +110,8 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
}
-int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -310,7 +329,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
-int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// On NEON, a vector select gets lowered to vbsl.
@@ -335,7 +355,7 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
return LT.first;
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -504,7 +524,7 @@ int ARMTTIImpl::getArithmeticInstrCost(
}
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
if (Src->isVectorTy() && Alignment != 16 &&
@@ -529,12 +549,14 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
unsigned NumElts = VecTy->getVectorNumElements();
- Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
- return Factor;
+ // Accesses having vector types that are a multiple of 128 bits can be
+ // matched to more than one vldN/vstN instruction.
+ if (NumElts % Factor == 0 &&
+ TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
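
The MatchExact/MatchSubset test added at the top of this file can be read as two masked comparisons. A self-contained worked example with plain std::bitset values (illustration only; LLVM uses FeatureBitset):

    // Features outside the whitelist must match exactly; whitelisted features
    // of the callee must be a subset of the caller's.
    #include <bitset>

    static bool areInlineCompatible(std::bitset<8> Caller, std::bitset<8> Callee,
                                    std::bitset<8> Whitelist) {
      bool MatchExact = (Caller & ~Whitelist) == (Callee & ~Whitelist);
      bool MatchSubset = ((Caller & Callee) & Whitelist) == (Callee & Whitelist);
      return MatchExact && MatchSubset;
    }

    int main() {
      std::bitset<8> Whitelist("00001111");
      // Caller has an extra whitelisted feature: still compatible.
      bool A = areInlineCompatible(std::bitset<8>("00000111"),
                                   std::bitset<8>("00000011"), Whitelist);
      // Callee uses a whitelisted feature the caller lacks: not compatible.
      bool B = areInlineCompatible(std::bitset<8>("00000011"),
                                   std::bitset<8>("00000111"), Whitelist);
      return (A && !B) ? 0 : 1;
    }
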
diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 3c83cd9..0695a4e 100644
--- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -33,9 +33,38 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
const ARMSubtarget *ST;
const ARMTargetLowering *TLI;
- /// Estimate the overhead of scalarizing an instruction. Insert and Extract
- /// are set if the result needs to be inserted and/or extracted from vectors.
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
+ // Currently the following features are excluded from InlineFeatureWhitelist:
+ // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureVFPOnlySP, FeatureD16.
+ // Depending on whether they are set or unset, different
+ // instructions/registers are available. For example, inlining a callee with
+ // -thumb-mode in a caller with +thumb-mode may cause the assembler to
+ // fail if the callee uses ARM-only instructions, e.g. in inline asm.
+ const FeatureBitset InlineFeatureWhitelist = {
+ ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2,
+ ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8,
+ ARM::FeatureFullFP16, ARM::FeatureHWDivThumb,
+ ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex,
+ ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc,
+ ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt,
+ ARM::FeatureCrypto, ARM::FeatureCRC, ARM::FeatureRAS,
+ ARM::FeatureFPAO, ARM::FeatureFuseAES, ARM::FeatureZCZeroing,
+ ARM::FeatureProfUnpredicate, ARM::FeatureSlowVGETLNi32,
+ ARM::FeatureSlowVDUP32, ARM::FeaturePreferVMOVSR,
+ ARM::FeaturePrefISHSTBarrier, ARM::FeatureMuxedUnits,
+ ARM::FeatureSlowOddRegister, ARM::FeatureSlowLoadDSubreg,
+ ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
+ ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
+ ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
+ ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding,
+ ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR,
+ ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp,
+ ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor,
+ ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization,
+ ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass,
+ ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls,
+ ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt,
+ ARM::FeatureNoNegativeImmediates
+ };
const ARMSubtarget *getST() const { return ST; }
const ARMTargetLowering *getTLI() const { return TLI; }
@@ -45,6 +74,9 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+
bool enableInterleavedAccessVectorization() { return true; }
/// Floating-point computation using ARMv8 AArch32 Advanced
@@ -82,7 +114,7 @@ public:
return 13;
}
- unsigned getRegisterBitWidth(bool Vector) {
+ unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasNEON())
return 128;
@@ -98,9 +130,11 @@ public:
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
@@ -118,7 +152,7 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c243a2d..1129826 100644
--- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -17,6 +17,8 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -39,10 +41,8 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/COFF.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetParser.h"
@@ -67,6 +67,9 @@ static cl::opt<ImplicitItModeTy> ImplicitItMode(
clEnumValN(ImplicitItModeTy::ThumbOnly, "thumb",
"Warn in ARM, emit implicit ITs in Thumb")));
+static cl::opt<bool> AddBuildAttributes("arm-add-build-attributes",
+ cl::init(false));
+
class ARMOperand;
enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
@@ -540,6 +543,10 @@ public:
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ // Add build attributes based on the selected target.
+ if (AddBuildAttributes)
+ getTargetStreamer().emitTargetAttributes(STI);
+
// Not in an ITBlock to start with.
ITState.CurPosition = ~0U;
@@ -915,40 +922,37 @@ public:
int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
return Val != -1;
}
- bool isFBits16() const {
+
+ template<int64_t N, int64_t M>
+ bool isImmediate() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
- return Value >= 0 && Value <= 16;
+ return Value >= N && Value <= M;
}
- bool isFBits32() const {
+ template<int64_t N, int64_t M>
+ bool isImmediateS4() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
- return Value >= 1 && Value <= 32;
+ return ((Value & 3) == 0) && Value >= N && Value <= M;
+ }
+ bool isFBits16() const {
+ return isImmediate<0, 17>();
+ }
+ bool isFBits32() const {
+ return isImmediate<1, 33>();
}
bool isImm8s4() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
+ return isImmediateS4<-1020, 1020>();
}
bool isImm0_1020s4() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
+ return isImmediateS4<0, 1020>();
}
bool isImm0_508s4() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
+ return isImmediateS4<0, 508>();
}
bool isImm0_508s4Neg() const {
if (!isImm()) return false;
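
The isImmediate<N, M> template above collapses a dozen near-identical predicates into one inclusive range check. A standalone sketch that takes the already-extracted constant as a plain integer (the real code first unwraps an MCConstantExpr):

    // Sketch of the range-check template introduced by this hunk.
    #include <cstdint>

    template <int64_t N, int64_t M>
    static bool isImmediate(int64_t Value) {
      return Value >= N && Value <= M;
    }

    int main() {
      bool InRange  = isImmediate<1, 16>(16); // mirrors isImm1_16 above
      bool OutRange = isImmediate<1, 16>(0);
      return (InRange && !OutRange) ? 0 : 1;
    }
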
@@ -958,27 +962,6 @@ public:
// explicitly exclude zero. We want that to use the normal 0_508 version.
return ((Value & 3) == 0) && Value > 0 && Value <= 508;
}
- bool isImm0_239() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 240;
- }
- bool isImm0_255() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 256;
- }
- bool isImm0_4095() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 4096;
- }
bool isImm0_4095Neg() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -986,145 +969,17 @@ public:
int64_t Value = -CE->getValue();
return Value > 0 && Value < 4096;
}
- bool isImm0_1() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 2;
- }
- bool isImm0_3() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 4;
- }
bool isImm0_7() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 8;
- }
- bool isImm0_15() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 16;
- }
- bool isImm0_31() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 32;
- }
- bool isImm0_63() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 64;
- }
- bool isImm8() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value == 8;
- }
- bool isImm16() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value == 16;
- }
- bool isImm32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value == 32;
- }
- bool isShrImm8() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 8;
- }
- bool isShrImm16() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 16;
- }
- bool isShrImm32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 32;
- }
- bool isShrImm64() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 64;
- }
- bool isImm1_7() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 8;
- }
- bool isImm1_15() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 16;
- }
- bool isImm1_31() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 32;
+ return isImmediate<0, 7>();
}
bool isImm1_16() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 17;
+ return isImmediate<1, 16>();
}
bool isImm1_32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 33;
+ return isImmediate<1, 32>();
}
- bool isImm0_32() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 33;
- }
- bool isImm0_65535() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 65536;
+ bool isImm8_255() const {
+ return isImmediate<8, 255>();
}
bool isImm256_65535Expr() const {
if (!isImm()) return false;
@@ -1145,32 +1000,16 @@ public:
return Value >= 0 && Value < 65536;
}
bool isImm24bit() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value <= 0xffffff;
+ return isImmediate<0, 0xffffff + 1>();
}
bool isImmThumbSR() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value < 33;
+ return isImmediate<1, 33>();
}
bool isPKHLSLImm() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value >= 0 && Value < 32;
+ return isImmediate<0, 32>();
}
bool isPKHASRImm() const {
- if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- int64_t Value = CE->getValue();
- return Value > 0 && Value <= 32;
+ return isImmediate<0, 33>();
}
bool isAdrLabel() const {
// If we have an immediate that's not a constant, treat it as a label
@@ -1187,6 +1026,15 @@ public:
ARM_AM::getSOImmVal(-Value) != -1);
}
bool isT2SOImm() const {
+ // If we have an immediate that's not a constant, treat it as an expression
+ // needing a fixup.
+ if (isImm() && !isa<MCConstantExpr>(getImm())) {
+ // We want to avoid matching :upper16: and :lower16: as we want these
+ // expressions to match in isImm0_65535Expr()
+ const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(getImm());
+ return (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 &&
+ ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16));
+ }
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
@@ -1245,6 +1093,20 @@ public:
return ARM_AM::getSOImmVal(Value) == -1 &&
ARM_AM::getSOImmVal(-Value) != -1;
}
+ bool isThumbModImmNeg1_7() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int32_t Value = -(int32_t)CE->getValue();
+ return 0 < Value && Value < 8;
+ }
+ bool isThumbModImmNeg8_255() const {
+ if (!isImm()) return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int32_t Value = -(int32_t)CE->getValue();
+ return 7 < Value && Value < 256;
+ }
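The two negated predicates accept a constant whose arithmetic negation lands in an encodable range, letting the matcher flip, say, an ADD of a negative immediate into the corresponding SUB encoding. A hypothetical walk-through with an invented value:

// Operand written as #-5: the parser stores -5, and the predicate
// negates and range-checks it just as isThumbModImmNeg1_7 above does.
int32_t Written = -5;
int32_t Negated = -Written;                     // 5
bool InNeg1_7 = Negated > 0 && Negated < 8;     // true  -> matches
bool InNeg8_255 = Negated > 7 && Negated < 256; // false
// The paired addThumbModImmNeg1_7Operands method (further down) then
// emits the positive value so the selected encoding carries it directly.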
bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
@@ -2035,6 +1897,20 @@ public:
Inst.addOperand(MCOperand::createImm(Enc));
}
+ void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ uint32_t Val = -CE->getValue();
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+
+ void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ uint32_t Val = -CE->getValue();
+ Inst.addOperand(MCOperand::createImm(Val));
+ }
+
void addBitfieldOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// Munge the lsb/width into a bitfield mask.
@@ -2141,7 +2017,7 @@ public:
// The operand is actually a t2_so_imm, but we have its bitwise
// negation in the assembly source, so twiddle it here.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(~CE->getValue()));
+ Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue()));
}
void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const {
@@ -2149,7 +2025,7 @@ public:
// The operand is actually a t2_so_imm, but we have its
// negation in the assembly source, so twiddle it here.
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+ Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
}
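Both casts guard the same pitfall: CE->getValue() returns an int64_t, so complementing or negating it directly yields a 64-bit value with the high bits set. A minimal illustration:

int64_t V = 0x0000000F;
uint64_t Wide = ~V;              // 0xFFFFFFFFFFFFFFF0: high bits set
uint32_t Narrow = ~(uint32_t)V;  // 0xFFFFFFF0: the intended 32-bit immediate
// Without the cast, the MCOperand would carry the sign-extended 64-bit
// value rather than the 32-bit t2_so_imm the encoder expects.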
void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const {
@@ -4330,7 +4206,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
// If some specific flag is already set, it means that some letter is
// present more than once; this is not acceptable.
- if (FlagsVal == ~0U || (FlagsVal & Flag))
+ if (Flag == ~0U || (FlagsVal & Flag))
return MatchOperand_NoMatch;
FlagsVal |= Flag;
}
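The corrected guard tests the letter that was just translated (Flag) rather than the accumulator (FlagsVal), which can never equal ~0U once any valid letter has been folded in. A condensed sketch of the loop, with invented flag bit assignments:

static unsigned flagForLetter(char C) {
  switch (C) {
  case 'n': return 0x8; // bit values are placeholders for illustration
  case 'z': return 0x4;
  case 'c': return 0x2;
  case 'v': return 0x1;
  case 'q': return 0x10;
  default: return ~0U;  // unknown letter
  }
}

static bool parseMaskLetters(llvm::StringRef Spec, unsigned &FlagsVal) {
  FlagsVal = 0;
  for (char C : Spec) {
    unsigned Flag = flagForLetter(C);
    if (Flag == ~0U || (FlagsVal & Flag)) // unknown or repeated letter
      return false;                       // NoMatch in the real parser
    FlagsVal |= Flag;
  }
  return true;
}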
@@ -5373,6 +5249,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
// Fall though for the Identifier case that is not a register or a
// special name.
+ LLVM_FALLTHROUGH;
}
case AsmToken::LParen: // parenthesized expressions like (_strcmp-4)
case AsmToken::Integer: // things like 1f and 2b as a branch targets
@@ -5484,7 +5361,8 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
enum {
COFF = (1 << MCObjectFileInfo::IsCOFF),
ELF = (1 << MCObjectFileInfo::IsELF),
- MACHO = (1 << MCObjectFileInfo::IsMachO)
+ MACHO = (1 << MCObjectFileInfo::IsMachO),
+ WASM = (1 << MCObjectFileInfo::IsWasm),
};
static const struct PrefixEntry {
const char *Spelling;
@@ -5518,6 +5396,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
case MCObjectFileInfo::IsCOFF:
CurrentFormat = COFF;
break;
+ case MCObjectFileInfo::IsWasm:
+ CurrentFormat = WASM;
+ break;
}
if (~Prefix->SupportedFormats & CurrentFormat) {
@@ -6301,10 +6182,6 @@ bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
else if (ListContainsPC && ListContainsLR)
return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
"PC and LR may not be in the register list simultaneously");
- else if (inITBlock() && !lastInITBlock() && ListContainsPC)
- return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
- "instruction must be outside of IT block or the last "
- "instruction in an IT block");
return false;
}
@@ -6366,6 +6243,12 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
return Warning(Loc, "predicated instructions should be in IT block");
}
+ // PC-setting instructions in an IT block, but not the last instruction of
+ // the block, are UNPREDICTABLE.
+ if (inExplicitITBlock() && !lastInITBlock() && isITBlockTerminator(Inst)) {
+ return Error(Loc, "instruction must be outside of IT block or the last instruction in an IT block");
+ }
+
const unsigned Opcode = Inst.getOpcode();
switch (Opcode) {
case ARM::LDRD:
@@ -6676,6 +6559,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
break;
}
case ARM::MOVi16:
+ case ARM::MOVTi16:
case ARM::t2MOVi16:
case ARM::t2MOVTi16:
{
@@ -6977,6 +6861,17 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
bool ARMAsmParser::processInstruction(MCInst &Inst,
const OperandVector &Operands,
MCStreamer &Out) {
+ // Check if we have the wide qualifier, because if it's present we
+ // must avoid selecting a 16-bit thumb instruction.
+ bool HasWideQualifier = false;
+ for (auto &Op : Operands) {
+ ARMOperand &ARMOp = static_cast<ARMOperand&>(*Op);
+ if (ARMOp.isToken() && ARMOp.getToken() == ".w") {
+ HasWideQualifier = true;
+ break;
+ }
+ }
+
switch (Inst.getOpcode()) {
// Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
case ARM::LDRT_POST:
@@ -7056,8 +6951,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
// Select the narrow version if the immediate will fit.
if (Inst.getOperand(1).getImm() > 0 &&
Inst.getOperand(1).getImm() <= 0xff &&
- !(static_cast<ARMOperand &>(*Operands[2]).isToken() &&
- static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w"))
+ !HasWideQualifier)
Inst.setOpcode(ARM::tLDRpci);
else
Inst.setOpcode(ARM::t2LDRpci);
@@ -7088,10 +6982,9 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
else if (Inst.getOpcode() == ARM::t2LDRConstPool)
TmpInst.setOpcode(ARM::t2LDRpci);
const ARMOperand &PoolOperand =
- (static_cast<ARMOperand &>(*Operands[2]).isToken() &&
- static_cast<ARMOperand &>(*Operands[2]).getToken() == ".w") ?
- static_cast<ARMOperand &>(*Operands[4]) :
- static_cast<ARMOperand &>(*Operands[3]);
+ (HasWideQualifier ?
+ static_cast<ARMOperand &>(*Operands[4]) :
+ static_cast<ARMOperand &>(*Operands[3]));
const MCExpr *SubExprVal = PoolOperand.getConstantPoolImm();
// If SubExprVal is a constant we may be able to use a MOV
if (isa<MCConstantExpr>(SubExprVal) &&
@@ -8232,10 +8125,9 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
case ARM::t2LSRri:
case ARM::t2ASRri: {
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
- Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
- !(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
- static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -8269,7 +8161,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(1).getReg()) &&
isARMLowRegister(Inst.getOperand(2).getReg()) &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
- inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr))
+ inITBlock() == (Inst.getOpcode() == ARM::t2MOVsr) &&
+ !HasWideQualifier)
isNarrow = true;
MCInst TmpInst;
unsigned newOpc;
@@ -8303,27 +8196,43 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
bool isNarrow = false;
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
isARMLowRegister(Inst.getOperand(1).getReg()) &&
- inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi))
+ inITBlock() == (Inst.getOpcode() == ARM::t2MOVsi) &&
+ !HasWideQualifier)
isNarrow = true;
MCInst TmpInst;
unsigned newOpc;
- switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) {
- default: llvm_unreachable("unexpected opcode!");
- case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
- case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
- case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
- case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
- case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
- }
+ unsigned Shift = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+ bool isMov = false;
+ // MOV rd, rm, LSL #0 is actually a MOV instruction
+ if (Shift == ARM_AM::lsl && Amount == 0) {
+ isMov = true;
+ // The 16-bit encoding of MOV rd, rm, LSL #N is defined as encoding T2 of
+ // MOV (register) in the ARMv8-A and ARMv8-M manuals, and immediate 0 is
+ // unpredictable in an IT block so the 32-bit encoding T3 has to be used
+ // instead.
+ if (inITBlock()) {
+ isNarrow = false;
+ }
+ newOpc = isNarrow ? ARM::tMOVSr : ARM::t2MOVr;
+ } else {
+ switch(Shift) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
+ case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
+ case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
+ case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
+ case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
+ }
+ }
if (Amount == 32) Amount = 0;
TmpInst.setOpcode(newOpc);
TmpInst.addOperand(Inst.getOperand(0)); // Rd
- if (isNarrow)
+ if (isNarrow && !isMov)
TmpInst.addOperand(MCOperand::createReg(
Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
TmpInst.addOperand(Inst.getOperand(1)); // Rn
- if (newOpc != ARM::t2RRX)
+ if (newOpc != ARM::t2RRX && !isMov)
TmpInst.addOperand(MCOperand::createImm(Amount));
TmpInst.addOperand(Inst.getOperand(3)); // CondCode
TmpInst.addOperand(Inst.getOperand(4));
@@ -8515,11 +8424,10 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
// wide encoding wasn't explicit.
if (Inst.getOperand(0).getReg() != Inst.getOperand(1).getReg() ||
!isARMLowRegister(Inst.getOperand(0).getReg()) ||
- (unsigned)Inst.getOperand(2).getImm() > 255 ||
- ((!inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != 0)) ||
- (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
- static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+ (Inst.getOperand(2).isImm() &&
+ (unsigned)Inst.getOperand(2).getImm() > 255) ||
+ Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) ||
+ HasWideQualifier)
break;
MCInst TmpInst;
TmpInst.setOpcode(Inst.getOpcode() == ARM::t2ADDri ?
@@ -8548,8 +8456,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
if (!Transform ||
Inst.getOperand(5).getReg() != 0 ||
- (static_cast<ARMOperand &>(*Operands[3]).isToken() &&
- static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w"))
+ HasWideQualifier)
break;
MCInst TmpInst;
TmpInst.setOpcode(ARM::tADDhirr);
@@ -8667,12 +8574,10 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
// If we can use the 16-bit encoding and the user didn't explicitly
// request the 32-bit variant, transform it here.
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
- (unsigned)Inst.getOperand(1).getImm() <= 255 &&
- ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
- Inst.getOperand(4).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
- (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
- static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ (Inst.getOperand(1).isImm() &&
+ (unsigned)Inst.getOperand(1).getImm() <= 255) &&
+ Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ !HasWideQualifier) {
// The operands aren't in the same order for tMOVi8...
MCInst TmpInst;
TmpInst.setOpcode(ARM::tMOVi8);
@@ -8693,8 +8598,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(2).getImm() == ARMCC::AL &&
Inst.getOperand(4).getReg() == ARM::CPSR &&
- (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
- static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ !HasWideQualifier) {
// The operands aren't the same for tMOV[S]r... (no cc_out)
MCInst TmpInst;
TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
@@ -8716,8 +8620,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
isARMLowRegister(Inst.getOperand(1).getReg()) &&
Inst.getOperand(2).getImm() == 0 &&
- (!static_cast<ARMOperand &>(*Operands[2]).isToken() ||
- static_cast<ARMOperand &>(*Operands[2]).getToken() != ".w")) {
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("Illegal opcode!");
@@ -8816,11 +8719,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
if ((isARMLowRegister(Inst.getOperand(1).getReg()) &&
isARMLowRegister(Inst.getOperand(2).getReg())) &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
- ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
- (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
- !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
- ".w"))) {
+ Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -8856,11 +8756,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
isARMLowRegister(Inst.getOperand(2).getReg())) &&
(Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() ||
Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) &&
- ((!inITBlock() && Inst.getOperand(5).getReg() == ARM::CPSR) ||
- (inITBlock() && Inst.getOperand(5).getReg() != ARM::CPSR)) &&
- (!static_cast<ARMOperand &>(*Operands[3]).isToken() ||
- !static_cast<ARMOperand &>(*Operands[3]).getToken().equals_lower(
- ".w"))) {
+ Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
+ !HasWideQualifier) {
unsigned NewOpc;
switch (Inst.getOpcode()) {
default: llvm_unreachable("unexpected opcode");
@@ -8918,6 +8815,9 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
inITBlock())
return Match_RequiresNotITBlock;
+ // LSL with zero immediate is not allowed in an IT block
+ if (Opc == ARM::tLSLri && Inst.getOperand(3).getImm() == 0 && inITBlock())
+ return Match_RequiresNotITBlock;
} else if (isThumbOne()) {
// Some high-register supporting Thumb1 encodings only allow both registers
// to be from r0-r7 when in Thumb2.
@@ -8932,6 +8832,22 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_RequiresV6;
}
+ // Before ARMv8 the rules for when SP is allowed in t2MOVr are more complex
+ // than the loop below can handle, so it uses the GPRnopc register class and
+ // we do SP handling here.
+ if (Opc == ARM::t2MOVr && !hasV8Ops()) {
+ // SP as both source and destination is not allowed
+ if (Inst.getOperand(0).getReg() == ARM::SP &&
+ Inst.getOperand(1).getReg() == ARM::SP)
+ return Match_RequiresV8;
+ // When flag-setting, SP as either source or destination is not allowed
+ if (Inst.getOperand(4).getReg() == ARM::CPSR &&
+ (Inst.getOperand(0).getReg() == ARM::SP ||
+ Inst.getOperand(1).getReg() == ARM::SP))
+ return Match_RequiresV8;
+ }
+
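A hypothetical condensation of the two SP rules, with the register number hard-coded purely for illustration (the real code compares against ARM::SP):

// Pre-v8 t2MOVr: reject SP-to-SP moves, and any flag-setting move where
// SP is either the source or the destination.
bool rejectPreV8MovSP(unsigned Rd, unsigned Rn, bool SetsFlags) {
  const unsigned SP = 13; // placeholder for ARM::SP
  bool RdIsSP = Rd == SP, RnIsSP = Rn == SP;
  return (RdIsSP && RnIsSP) || (SetsFlags && (RdIsSP || RnIsSP));
}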
for (unsigned I = 0; I < MCID.NumOperands; ++I)
if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
// rGPRRegClass excludes PC, and also excluded SP before ARMv8
@@ -8945,7 +8861,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
}
namespace llvm {
-template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) {
+template <> inline bool IsCPSRDead<MCInst>(const MCInst *Instr) {
return true; // In an assembly source, no need to second-guess
}
}
@@ -8975,6 +8891,7 @@ bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const {
// operands. We only care about Thumb instructions here, as ARM instructions
// obviously can't be in an IT block.
switch (Inst.getOpcode()) {
+ case ARM::tLDMIA:
case ARM::t2LDMIA:
case ARM::t2LDMIA_UPD:
case ARM::t2LDMDB:
@@ -9076,6 +8993,8 @@ unsigned ARMAsmParser::MatchInstruction(OperandVector &Operands, MCInst &Inst,
return PlainMatchResult;
}
+std::string ARMMnemonicSpellCheck(StringRef S, uint64_t FBS);
+
static const char *getSubtargetFeatureName(uint64_t Val);
bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
@@ -9088,6 +9007,13 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
MatchResult = MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
PendConditionalInstruction, Out);
+ SMLoc ErrorLoc;
+ if (ErrorInfo < Operands.size()) {
+ ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+
switch (MatchResult) {
case Match_Success:
// Context sensitive operand constraints aren't handled by the matcher,
@@ -9162,9 +9088,13 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(ErrorLoc, "invalid operand for instruction");
}
- case Match_MnemonicFail:
- return Error(IDLoc, "invalid instruction",
+ case Match_MnemonicFail: {
+ uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ std::string Suggestion = ARMMnemonicSpellCheck(
+ ((ARMOperand &)*Operands[0]).getToken(), FBS);
+ return Error(IDLoc, "invalid instruction" + Suggestion,
((ARMOperand &)*Operands[0]).getLocRange());
+ }
case Match_RequiresNotITBlock:
return Error(IDLoc, "flag setting instruction only valid outside IT block");
case Match_RequiresITBlock:
@@ -9177,16 +9107,52 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, "instruction variant requires ARMv8 or later");
case Match_RequiresFlagSetting:
return Error(IDLoc, "no flag-preserving variant of this instruction available");
- case Match_ImmRange0_15: {
- SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
- if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ case Match_ImmRange0_1:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,1]");
+ case Match_ImmRange0_3:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,3]");
+ case Match_ImmRange0_7:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,7]");
+ case Match_ImmRange0_15:
return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
- }
- case Match_ImmRange0_239: {
- SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
- if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ case Match_ImmRange0_31:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,31]");
+ case Match_ImmRange0_32:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,32]");
+ case Match_ImmRange0_63:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,63]");
+ case Match_ImmRange0_239:
return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
- }
+ case Match_ImmRange0_255:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,255]");
+ case Match_ImmRange0_4095:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,4095]");
+ case Match_ImmRange0_65535:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,65535]");
+ case Match_ImmRange1_7:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,7]");
+ case Match_ImmRange1_8:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,8]");
+ case Match_ImmRange1_15:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,15]");
+ case Match_ImmRange1_16:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,16]");
+ case Match_ImmRange1_31:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,31]");
+ case Match_ImmRange1_32:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,32]");
+ case Match_ImmRange1_64:
+ return Error(ErrorLoc, "immediate operand must be in the range [1,64]");
+ case Match_ImmRange8_8:
+ return Error(ErrorLoc, "immediate operand must be 8");
+ case Match_ImmRange16_16:
+ return Error(ErrorLoc, "immediate operand must be 16");
+ case Match_ImmRange32_32:
+ return Error(ErrorLoc, "immediate operand must be 32");
+ case Match_ImmRange256_65535:
+ return Error(ErrorLoc, "immediate operand must be in the range [256,65535]");
+ case Match_ImmRange0_16777215:
+ return Error(ErrorLoc, "immediate operand must be in the range [0,0xffffff]");
case Match_AlignedMemoryRequiresNone:
case Match_DupAlignedMemoryRequiresNone:
case Match_AlignedMemoryRequires16:
@@ -10244,8 +10210,8 @@ static const struct {
{ ARM::AEK_CRYPTO, Feature_HasV8,
{ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} },
{ ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} },
- { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass,
- {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} },
+ { (ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass,
+ {ARM::FeatureHWDivThumb, ARM::FeatureHWDivARM} },
{ ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} },
{ ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} },
{ ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} },
diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index ac3d8c7..5ab236b 100644
--- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -7,21 +7,24 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
-#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
#include <vector>
using namespace llvm;
@@ -31,6 +34,7 @@ using namespace llvm;
typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
+
// Handles the condition code status of instructions in IT blocks
class ITStatus
{
@@ -81,9 +85,7 @@ namespace {
private:
std::vector<unsigned char> ITStates;
};
-}
-namespace {
/// ARM disassembler for all ARM platforms.
class ARMDisassembler : public MCDisassembler {
public:
@@ -91,7 +93,7 @@ public:
MCDisassembler(STI, Ctx) {
}
- ~ARMDisassembler() override {}
+ ~ARMDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -106,7 +108,7 @@ public:
MCDisassembler(STI, Ctx) {
}
- ~ThumbDisassembler() override {}
+ ~ThumbDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -118,7 +120,8 @@ private:
DecodeStatus AddThumbPredicate(MCInst&) const;
void UpdateThumbVFPPredicate(MCInst&) const;
};
-}
+
+} // end anonymous namespace
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -135,7 +138,6 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) {
llvm_unreachable("Invalid DecodeStatus!");
}
-
// Forward declare these because the autogenerated code will reference them.
// Definitions are further down.
static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -319,7 +321,6 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder);
-
static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeThumbBROperand(MCInst &Inst, unsigned Val,
@@ -395,8 +396,9 @@ static DecodeStatus DecodeT2ShifterImmOperand(MCInst &Inst, unsigned Val,
static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
-static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder);
+
#include "ARMGenDisassemblerTables.inc"
static MCDisassembler *createARMDisassembler(const Target &T,
@@ -416,8 +418,7 @@ static DecodeStatus checkDecodedInstruction(MCInst &MI, uint64_t &Size,
uint64_t Address, raw_ostream &OS,
raw_ostream &CS,
uint32_t Insn,
- DecodeStatus Result)
-{
+ DecodeStatus Result) {
switch (MI.getOpcode()) {
case ARM::HVC: {
// HVC is undefined if condition = 0xf, otherwise unpredictable
@@ -461,74 +462,39 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
}
- // VFP and NEON instructions, similarly, are shared between ARM
- // and Thumb modes.
- Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
-
- Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
-
- Result =
- decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- // Add a fake predicate operand, because we share these instruction
- // definitions with Thumb2 where these instructions are predicable.
- if (!DecodePredicateOperand(MI, 0xE, Address, this))
- return MCDisassembler::Fail;
- return Result;
- }
-
- Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address,
- this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- // Add a fake predicate operand, because we share these instruction
- // definitions with Thumb2 where these instructions are predicable.
- if (!DecodePredicateOperand(MI, 0xE, Address, this))
- return MCDisassembler::Fail;
- return Result;
- }
-
- Result =
- decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- // Add a fake predicate operand, because we share these instruction
- // definitions with Thumb2 where these instructions are predicable.
- if (!DecodePredicateOperand(MI, 0xE, Address, this))
- return MCDisassembler::Fail;
- return Result;
- }
+ struct DecodeTable {
+ const uint8_t *P;
+ bool DecodePred;
+ };
- Result =
- decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
- }
+ const DecodeTable Tables[] = {
+ {DecoderTableVFP32, false}, {DecoderTableVFPV832, false},
+ {DecoderTableNEONData32, true}, {DecoderTableNEONLoadStore32, true},
+ {DecoderTableNEONDup32, true}, {DecoderTablev8NEON32, false},
+ {DecoderTablev8Crypto32, false},
+ };
- Result =
- decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
- return Result;
+ for (auto Table : Tables) {
+ Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 4;
+ // Add a fake predicate operand, because we share these instruction
+ // definitions with Thumb2 where these instructions are predicable.
+ if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this))
+ return MCDisassembler::Fail;
+ return Result;
+ }
}
- Size = 0;
+ Size = 4;
return MCDisassembler::Fail;
}
namespace llvm {
+
extern const MCInstrDesc ARMInsts[];
-}
+
+} // end namespace llvm
/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
/// immediate Value in the MCInst. The immediate Value has had any PC
@@ -859,7 +825,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return MCDisassembler::Fail;
}
-
extern "C" void LLVMInitializeARMDisassembler() {
TargetRegistry::RegisterMCDisassembler(getTheARMLETarget(),
createARMDisassembler);
@@ -1056,7 +1021,6 @@ static const uint16_t QPRDecoderTable[] = {
ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
};
-
static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
if (RegNo > 31 || (RegNo & 1) != 0)
@@ -1676,7 +1640,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
case ARM::LDRD:
case ARM::LDRD_PRE:
case ARM::LDRD_POST:
- if (type && Rn == 15){
+ if (type && Rn == 15) {
if (Rt2 == 15)
S = MCDisassembler::SoftFail;
break;
@@ -1693,7 +1657,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
case ARM::LDRH:
case ARM::LDRH_PRE:
case ARM::LDRH_POST:
- if (type && Rn == 15){
+ if (type && Rn == 15) {
if (Rt == 15)
S = MCDisassembler::SoftFail;
break;
@@ -1711,7 +1675,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn,
case ARM::LDRSB:
case ARM::LDRSB_PRE:
case ARM::LDRSB_POST:
- if (type && Rn == 15){
+ if (type && Rn == 15) {
if (Rt == 15)
S = MCDisassembler::SoftFail;
break;
@@ -2309,7 +2273,6 @@ DecodeBranchImmInstruction(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -3748,7 +3711,6 @@ static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
return MCDisassembler::Success;
}
-
static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4073,7 +4035,7 @@ static DecodeStatus DecodeT2SOImm(MCInst &Inst, unsigned Val,
static DecodeStatus
DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder){
+ uint64_t Address, const void *Decoder) {
if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<9>(Val<<1) + 4,
true, 2, Inst, Decoder))
Inst.addOperand(MCOperand::createImm(SignExtend32<9>(Val << 1)));
@@ -4081,7 +4043,8 @@ DecodeThumbBCCTargetOperand(MCInst &Inst, unsigned Val,
}
static DecodeStatus DecodeThumbBLTargetOperand(MCInst &Inst, unsigned Val,
- uint64_t Address, const void *Decoder){
+ uint64_t Address,
+ const void *Decoder) {
// Val is passed in as S:J1:J2:imm10:imm11
// Note no trailing zero after imm11. Also the J1 and J2 values are from
// the encoded instruction. So here change to I1 and I2 values via:
@@ -4247,7 +4210,8 @@ static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
}
static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
- uint64_t Address, const void *Decoder){
+ uint64_t Address,
+ const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
unsigned Rd = fieldFromInstruction(Insn, 12, 4);
@@ -4323,7 +4287,6 @@ static DecodeStatus DecodeLDRPreReg(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeSTRPreImm(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4506,7 +4469,6 @@ static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeVLD2LN(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4637,7 +4599,6 @@ static DecodeStatus DecodeVST2LN(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeVLD3LN(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -4771,7 +4732,6 @@ static DecodeStatus DecodeVST3LN(MCInst &Inst, unsigned Insn,
return S;
}
-
static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -5266,9 +5226,8 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
return S;
}
-static DecodeStatus DecoderForMRRC2AndMCRR2(llvm::MCInst &Inst, unsigned Val,
+static DecodeStatus DecoderForMRRC2AndMCRR2(MCInst &Inst, unsigned Val,
uint64_t Address, const void *Decoder) {
-
DecodeStatus S = MCDisassembler::Success;
unsigned CRm = fieldFromInstruction(Val, 0, 4);
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 3667952..57b9136 100644
--- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -20,7 +20,15 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -73,7 +81,6 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
unsigned Opcode = MI->getOpcode();
switch (Opcode) {
-
// Check for MOVs and print canonical forms, instead.
case ARM::MOVsr: {
// FIXME: Thumb variants?
diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 9d80eed..86873a3 100644
--- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -235,4 +235,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index a58d5b3..a77df7a 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -7,15 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
-#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMAsmBackend.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMAsmBackendDarwin.h"
#include "MCTargetDesc/ARMAsmBackendELF.h"
#include "MCTargetDesc/ARMAsmBackendWinCOFF.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -31,10 +33,8 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/MachO.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -98,6 +98,7 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_movt_hi16", 0, 20, 0},
{"fixup_t2_movw_lo16", 0, 20, 0},
{"fixup_arm_mod_imm", 0, 12, 0},
+ {"fixup_t2_so_imm", 0, 26, 0},
};
const static MCFixupKindInfo InfosBE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
@@ -148,6 +149,7 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_t2_movt_hi16", 12, 20, 0},
{"fixup_t2_movw_lo16", 12, 20, 0},
{"fixup_arm_mod_imm", 20, 12, 0},
+ {"fixup_t2_so_imm", 26, 6, 0},
};
if (Kind < FirstTargetFixupKind)
@@ -356,14 +358,30 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
return Value;
}
-unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- bool IsPCRel, MCContext *Ctx,
- bool IsLittleEndian,
- bool IsResolved) const {
+unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target, uint64_t Value,
+ bool IsResolved, MCContext &Ctx,
+ bool IsLittleEndian) const {
unsigned Kind = Fixup.getKind();
+
+ // MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT
+ // and .word relocations they put the Thumb bit into the addend if possible.
+ // Other relocation types don't want this bit though (branches couldn't encode
+ // it if it *was* present, and no other relocations exist) and it can
+ // interfere with checking valid expressions.
+ if (const MCSymbolRefExpr *A = Target.getSymA()) {
+ if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) &&
+ (Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 ||
+ Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 ||
+ Kind == ARM::fixup_t2_movt_hi16))
+ Value |= 1;
+ }
+
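Concretely, the addend of a data or MOVW/MOVT fixup against a Thumb function carries the interworking bit. A worked example under the conditions checked above, with an invented address:

uint64_t Value = 0x1000;       // Thumb function resolved to 0x00001000
bool IsThumbFunc = true;       // Asm.isThumbFunc(...) on a MachO target
bool KindTakesThumbBit = true; // FK_Data_4 or a MOVW/MOVT fixup kind
if (IsThumbFunc && KindTakesThumbBit)
  Value |= 1;                  // 0x00001001: bit 0 selects Thumb state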
switch (Kind) {
default:
- llvm_unreachable("Unknown fixup kind!");
+ Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
+ return 0;
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -373,7 +391,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case FK_SecRel_4:
return Value;
case ARM::fixup_arm_movt_hi16:
- if (!IsPCRel)
+ if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF())
Value >>= 16;
LLVM_FALLTHROUGH;
case ARM::fixup_arm_movw_lo16: {
@@ -385,7 +403,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return Value;
}
case ARM::fixup_t2_movt_hi16:
- if (!IsPCRel)
+ if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF())
Value >>= 16;
LLVM_FALLTHROUGH;
case ARM::fixup_t2_movw_lo16: {
@@ -412,8 +430,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = -Value;
isAdd = false;
}
- if (Ctx && Value >= 4096) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 4096) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value |= isAdd << 23;
@@ -433,8 +451,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = -Value;
opc = 2; // 0b0010
}
- if (Ctx && ARM_AM::getSOImmVal(Value) == -1) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (ARM_AM::getSOImmVal(Value) == -1) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
// Encode the immediate and shift the opcode into place.
@@ -502,6 +520,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
return swapHalfWords(out, IsLittleEndian);
}
case ARM::fixup_arm_thumb_bl: {
+ // FIXME: We get both thumb1 and thumb2 in here, so we can only check for
+ // the less strict thumb2 value.
+ if (!isInt<26>(Value - 4)) {
+ Ctx.reportError(Fixup.getLoc(), "Relocation out of range");
+ return 0;
+ }
+
// The value doesn't encode the low bit (always zero) and is offset by
// four. The 32-bit immediate value is encoded as
// imm32 = SignExtend(S:I1:I2:imm10:imm11:0)
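The bound is deliberately loose: isInt<26> accepts offsets in [-2^25, 2^25), wider than the roughly +/-16 MiB a Thumb2 BL can actually reach, so only values no encoding could represent are rejected here. The guard is equivalent to:

int64_t Offset = (int64_t)Value - 4; // PC-relative, offset by four
bool MayEncode = Offset >= -(1LL << 25) && Offset < (1LL << 25);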
@@ -541,8 +566,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
//
// Note that the halfwords are stored high first, low second; so we need
// to transpose the fixup value here to map properly.
- if (Ctx && Value % 4 != 0) {
- Ctx->reportError(Fixup.getLoc(), "misaligned ARM call destination");
+ if (Value % 4 != 0) {
+ Ctx.reportError(Fixup.getLoc(), "misaligned ARM call destination");
return 0;
}
@@ -568,10 +593,10 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case ARM::fixup_arm_thumb_cp:
// On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
// could have an error on our hands.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
- Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
return 0;
}
}
@@ -581,8 +606,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// CB instructions can only branch to offsets in [4, 126] in multiples of 2,
// so ensure that the raw value LSB is zero and it lies in [2, 130].
// An offset of 2 will be relaxed to a NOP.
- if (Ctx && ((int64_t)Value < 2 || Value > 0x82 || Value & 1)) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if ((int64_t)Value < 2 || Value > 0x82 || Value & 1) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
// Offset by 4 and don't encode the lower bit, which is always 0.
@@ -592,21 +617,21 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
case ARM::fixup_arm_thumb_br:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] &&
- !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2] &&
+ !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
- Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
return 0;
}
}
return ((Value - 4) >> 1) & 0x7ff;
case ARM::fixup_arm_thumb_bcc:
// Offset by 4 and don't encode the lower bit, which is always 0.
- if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+ if (!STI->getFeatureBits()[ARM::FeatureThumb2]) {
const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
if (FixupDiagnostic) {
- Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+ Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
return 0;
}
}
@@ -620,8 +645,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
isAdd = false;
}
// The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
- if (Ctx && Value >= 256) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 256) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value = (Value & 0xf) | ((Value & 0xf0) << 4);
@@ -641,8 +666,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
// These values don't encode the low two bits since they're always zero.
Value >>= 2;
- if (Ctx && Value >= 256) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 256) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value |= isAdd << 23;
@@ -667,13 +692,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
isAdd = false;
}
// These values don't encode the low bit since it's always zero.
- if (Ctx && (Value & 1)) {
- Ctx->reportError(Fixup.getLoc(), "invalid value for this fixup");
+ if (Value & 1) {
+ Ctx.reportError(Fixup.getLoc(), "invalid value for this fixup");
return 0;
}
Value >>= 1;
- if (Ctx && Value >= 256) {
- Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+ if (Value >= 256) {
+ Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
return 0;
}
Value |= isAdd << 23;
@@ -687,38 +712,38 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
case ARM::fixup_arm_mod_imm:
Value = ARM_AM::getSOImmVal(Value);
- if (Ctx && Value >> 12) {
- Ctx->reportError(Fixup.getLoc(), "out of range immediate fixup value");
+ if (Value >> 12) {
+ Ctx.reportError(Fixup.getLoc(), "out of range immediate fixup value");
return 0;
}
return Value;
+ case ARM::fixup_t2_so_imm: {
+ Value = ARM_AM::getT2SOImmVal(Value);
+ if ((int64_t)Value < 0) {
+ Ctx.reportError(Fixup.getLoc(), "out of range immediate fixup value");
+ return 0;
+ }
+ // Value will contain a 12-bit value broken up into a 4-bit shift in bits
+ // 11:8 and the 8-bit immediate in 0:7. The instruction has the immediate
+ // in 0:7. The 4-bit shift is split up into i:imm3 where i is placed at bit
+ // 10 of the upper half-word and imm3 is placed at 14:12 of the lower
+ // half-word.
+ uint64_t EncValue = 0;
+ EncValue |= (Value & 0x800) << 15;
+ EncValue |= (Value & 0x700) << 4;
+ EncValue |= (Value & 0xff);
+ return swapHalfWords(EncValue, IsLittleEndian);
+ }
}
}
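A worked example of the bit scatter above, assuming ARM_AM::getT2SOImmVal returned 0xABC (so i:imm3 is 0b1010 and imm8 is 0xBC):

uint64_t Value = 0xABC;
uint64_t Enc = 0;
Enc |= (Value & 0x800) << 15; // i    -> bit 26 (bit 10 of the first halfword)
Enc |= (Value & 0x700) << 4;  // imm3 -> bits 14:12
Enc |= (Value & 0xff);        // imm8 -> bits 7:0
// Enc == 0x040020BC; swapHalfWords() then emits the halfwords in the
// high-first order Thumb2 uses within a little-endian stream.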
-void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup,
- const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
- // MachO (the only user of "Value") tries to make .o files that look vaguely
- // pre-linked, so for MOVW/MOVT and .word relocations they put the Thumb bit
- // into the addend if possible. Other relocation types don't want this bit
- // though (branches couldn't encode it if it *was* present, and no other
- // relocations exist) and it can interfere with checking valid expressions.
- if ((unsigned)Fixup.getKind() == FK_Data_4 ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_movw_lo16 ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_movt_hi16 ||
- (unsigned)Fixup.getKind() == ARM::fixup_t2_movw_lo16 ||
- (unsigned)Fixup.getKind() == ARM::fixup_t2_movt_hi16) {
- if (Sym) {
- if (Asm.isThumbFunc(Sym))
- Value |= 1;
- }
- }
- if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
+ const unsigned FixupKind = Fixup.getKind();
+ if (FixupKind == ARM::fixup_arm_thumb_bl) {
assert(Sym && "How did we resolve this?");
// If the symbol is external the linker will handle it.
@@ -726,23 +751,32 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
// If the symbol is out of range, produce a relocation and hope the
// linker can handle it. GNU AS produces an error in this case.
- if (Sym->isExternal() || Value >= 0x400004)
- IsResolved = false;
+ if (Sym->isExternal())
+ return true;
+ }
+ // Create relocations for unconditional branches to function symbols with
+ // different execution mode in ELF binaries.
+ if (Sym && Sym->isELF()) {
+ unsigned Type = dyn_cast<MCSymbolELF>(Sym)->getType();
+ if ((Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)) {
+ if (Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_uncondbranch))
+ return true;
+ if (!Asm.isThumbFunc(Sym) && (FixupKind == ARM::fixup_arm_thumb_br ||
+ FixupKind == ARM::fixup_arm_thumb_bl ||
+ FixupKind == ARM::fixup_t2_condbranch ||
+ FixupKind == ARM::fixup_t2_uncondbranch))
+ return true;
+ }
}
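A hypothetical condensation of the mode-mismatch rule above (fixup kinds as named in the patch):

bool needsInterworkReloc(bool SymIsThumb, unsigned FixupKind) {
  // A branch fixup aimed at a function in the other execution state must
  // stay a relocation so the linker can handle interworking.
  if (SymIsThumb)
    return FixupKind == ARM::fixup_arm_uncondbranch;
  return FixupKind == ARM::fixup_arm_thumb_br ||
         FixupKind == ARM::fixup_arm_thumb_bl ||
         FixupKind == ARM::fixup_t2_condbranch ||
         FixupKind == ARM::fixup_t2_uncondbranch;
}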
// We must always generate a relocation for BL/BLX instructions if we have
// a symbol to reference, as the linker relies on knowing the destination
// symbol's thumb-ness to get interworking right.
- if (A && ((unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_blx ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_blx ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
- (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
- IsResolved = false;
-
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
- // if the value aren't invalid.
- (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
- IsLittleEndian, IsResolved);
+ if (A && (FixupKind == ARM::fixup_arm_thumb_blx ||
+ FixupKind == ARM::fixup_arm_blx ||
+ FixupKind == ARM::fixup_arm_uncondbl ||
+ FixupKind == ARM::fixup_arm_condbl))
+ return true;
+ return false;
}
/// getFixupKindNumBytes - The number of bytes the fixup may change.
@@ -788,6 +822,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case ARM::fixup_arm_movw_lo16:
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movw_lo16:
+ case ARM::fixup_t2_so_imm:
return 4;
case FK_SecRel_2:
@@ -840,28 +875,31 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
case ARM::fixup_t2_movt_hi16:
case ARM::fixup_t2_movw_lo16:
case ARM::fixup_arm_mod_imm:
+ case ARM::fixup_t2_so_imm:
// Instruction size is 4 bytes.
return 4;
}
}
-void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
- Value =
- adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true);
+ MCContext &Ctx = Asm.getContext();
+ Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx,
+ IsLittleEndian);
if (!Value)
return; // Doesn't change encoding.
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FullSizeBytes;
if (!IsLittleEndian) {
FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
- assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!");
+ assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
}
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 84caaac..0237496 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -38,19 +38,17 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
- /// processFixupValue - Target hook to process the literal value of a fixup
- /// if necessary.
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
-
- unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel,
- MCContext *Ctx, bool IsLittleEndian,
- bool IsResolved) const;
-
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
+
+ unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, uint64_t Value,
+ bool IsResolved, MCContext &Ctx,
+ bool IsLittleEndian) const;
+
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
unsigned getRelaxedOpcode(unsigned Op) const;
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index 09dc017..bd729fa 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -11,7 +11,7 @@
#define LLVM_LIB_TARGET_ARM_ARMASMBACKENDDARWIN_H
#include "ARMAsmBackend.h"
-#include "llvm/Support/MachO.h"
+#include "llvm/BinaryFormat/MachO.h"
namespace llvm {
class ARMAsmBackendDarwin : public ARMAsmBackend {
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 088b420..92e553f 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -291,7 +291,11 @@ namespace ARMII {
/// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
/// just that part of the flag set.
- MO_OPTION_MASK = 0x1f,
+ MO_OPTION_MASK = 0x0f,
+
+ /// MO_SBREL - On a symbol operand, this represents a static base relative
+ /// relocation. Used in movw and movt instructions.
+ MO_SBREL = 0x10,
/// MO_DLLIMPORT - On a symbol operand, this represents that the reference
/// to the symbol is for an import stub. This is used for DLL import
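Shrinking the mask to the low four bits frees bit 4, so the new static-base-relative marker can be combined with any of the mutually exclusive option values. An illustration of the packing (the option value is invented):

unsigned Flags = 0x10 /*MO_SBREL*/ | 0x03 /*some option value*/;
unsigned Option = Flags & 0x0f;     // MO_OPTION_MASK isolates the option
bool IsSBREL = (Flags & 0x10) != 0; // independent of the option bits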
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 6f19754..59f31be 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -7,32 +7,32 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "MCTargetDesc/ARMFixupKinds.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
namespace {
+
class ARMELFObjectWriter : public MCELFObjectTargetWriter {
enum { DefaultEABIVersion = 0x05000000U };
- unsigned GetRelocTypeInner(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel) const;
+ unsigned GetRelocTypeInner(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, MCContext &Ctx) const;
public:
ARMELFObjectWriter(uint8_t OSABI);
- ~ARMELFObjectWriter() override;
+ ~ARMELFObjectWriter() override = default;
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
@@ -40,15 +40,14 @@ namespace {
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
};
-}
+
+} // end anonymous namespace
ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI,
ELF::EM_ARM,
/*HasRelocationAddend*/ false) {}
-ARMELFObjectWriter::~ARMELFObjectWriter() {}
-
bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const {
// FIXME: This is extremely conservative. This really needs to use a
@@ -70,19 +69,20 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
- return GetRelocTypeInner(Target, Fixup, IsPCRel);
+ return GetRelocTypeInner(Target, Fixup, IsPCRel, Ctx);
}
unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
const MCFixup &Fixup,
- bool IsPCRel) const {
+ bool IsPCRel,
+ MCContext &Ctx) const {
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
unsigned Type = 0;
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
default:
- report_fatal_error("unsupported relocation on symbol");
+ Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
case FK_Data_4:
switch (Modifier) {
@@ -161,7 +161,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
} else {
switch ((unsigned)Fixup.getKind()) {
default:
- report_fatal_error("unsupported relocation on symbol");
+ Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
case FK_Data_1:
switch (Modifier) {
@@ -270,10 +270,26 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
break;
case ARM::fixup_t2_movt_hi16:
- Type = ELF::R_ARM_THM_MOVT_ABS;
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_THM_MOVT_ABS;
+ break;
+ case MCSymbolRefExpr::VK_ARM_SBREL:
+ Type = ELF::R_ARM_THM_MOVT_BREL;
+ break;
+ }
break;
case ARM::fixup_t2_movw_lo16:
- Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+ break;
+ case MCSymbolRefExpr::VK_ARM_SBREL:
+ Type = ELF::R_ARM_THM_MOVW_BREL_NC;
+ break;
+ }
break;
}
}
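To make the new dispatch concrete: a hedged standalone sketch of the modifier-to-relocation mapping the two hunks above introduce, with strings standing in for the ELF::R_ARM_* enumerators so it compiles outside LLVM. VK_None keeps the absolute MOVW/MOVT relocations; VK_ARM_SBREL selects the static-base-relative *_BREL forms.

    #include <cstdio>

    enum class Fixup { T2MovtHi16, T2MovwLo16 };
    enum class Modifier { None, ARMSBRel };

    static const char *relocFor(Fixup F, Modifier M) {
      switch (F) {
      case Fixup::T2MovtHi16:
        return M == Modifier::ARMSBRel ? "R_ARM_THM_MOVT_BREL"
                                       : "R_ARM_THM_MOVT_ABS";
      case Fixup::T2MovwLo16:
        return M == Modifier::ARMSBRel ? "R_ARM_THM_MOVW_BREL_NC"
                                       : "R_ARM_THM_MOVW_ABS_NC";
      }
      return "R_ARM_NONE"; // not reached for the two kinds modeled here
    }

    int main() {
      std::printf("%s\n", relocFor(Fixup::T2MovtHi16, Modifier::ARMSBRel));
    }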
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index f6bb35d..93f4006 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -15,8 +15,13 @@
#include "ARMRegisterInfo.h"
#include "ARMUnwindOpAsm.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
@@ -24,25 +29,32 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/ARMBuildAttributes.h"
#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/TargetParser.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <string>
using namespace llvm;
@@ -101,16 +113,21 @@ ARMTargetAsmStreamer::ARMTargetAsmStreamer(MCStreamer &S,
bool VerboseAsm)
: ARMTargetStreamer(S), OS(OS), InstPrinter(InstPrinter),
IsVerboseAsm(VerboseAsm) {}
+
void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+
void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
OS << "\t.personality " << Personality->getName() << '\n';
}
+
void ARMTargetAsmStreamer::emitPersonalityIndex(unsigned Index) {
OS << "\t.personalityindex " << Index << '\n';
}
+
void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+
void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
int64_t Offset) {
OS << "\t.setfp\t";
@@ -121,6 +138,7 @@ void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
OS << ", #" << Offset;
OS << '\n';
}
+
void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
assert((Reg != ARM::SP && Reg != ARM::PC) &&
"the operand of .movsp cannot be either sp or pc");
@@ -131,9 +149,11 @@ void ARMTargetAsmStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
OS << ", #" << Offset;
OS << '\n';
}
+
void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
OS << "\t.pad\t#" << Offset << '\n';
}
+
void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool isVector) {
assert(RegList.size() && "RegList should not be empty");
@@ -151,8 +171,9 @@ void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
OS << "}\n";
}
-void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
-}
+
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {}
+
void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
if (IsVerboseAsm) {
@@ -162,6 +183,7 @@ void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
}
OS << "\n";
}
+
void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
StringRef String) {
switch (Attribute) {
@@ -179,6 +201,7 @@ void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
}
OS << "\n";
}
+
void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
unsigned IntValue,
StringRef StringValue) {
@@ -194,20 +217,25 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
}
OS << "\n";
}
+
void ARMTargetAsmStreamer::emitArch(unsigned Arch) {
OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n";
}
+
void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n";
}
+
void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) {
OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n';
}
+
void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n";
}
-void ARMTargetAsmStreamer::finishAttributeSection() {
-}
+
+void ARMTargetAsmStreamer::finishAttributeSection() {}
+
void
ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
@@ -274,12 +302,12 @@ private:
};
StringRef CurrentVendor;
- unsigned FPU;
- unsigned Arch;
- unsigned EmittedArch;
+ unsigned FPU = ARM::FK_INVALID;
+ unsigned Arch = ARM::AK_INVALID;
+ unsigned EmittedArch = ARM::AK_INVALID;
SmallVector<AttributeItem, 64> Contents;
- MCSection *AttributeSection;
+ MCSection *AttributeSection = nullptr;
AttributeItem *getAttributeItem(unsigned Attribute) {
for (size_t i = 0; i < Contents.size(); ++i)
@@ -393,9 +421,7 @@ private:
public:
ARMTargetELFStreamer(MCStreamer &S)
- : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::FK_INVALID),
- Arch(ARM::AK_INVALID), EmittedArch(ARM::AK_INVALID),
- AttributeSection(nullptr) {}
+ : ARMTargetStreamer(S), CurrentVendor("aeabi") {}
};
/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
@@ -416,12 +442,11 @@ public:
ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS,
MCCodeEmitter *Emitter, bool IsThumb)
- : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb),
- MappingSymbolCounter(0), LastEMS(EMS_None) {
+ : MCELFStreamer(Context, TAB, OS, Emitter), IsThumb(IsThumb) {
EHReset();
}
- ~ARMELFStreamer() {}
+ ~ARMELFStreamer() override = default;
void FinishImpl() override;
@@ -439,20 +464,21 @@ public:
void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
- // We have to keep track of the mapping symbol state of any sections we
- // use. Each one should start off as EMS_None, which is provided as the
- // default constructor by DenseMap::lookup.
- LastMappingSymbols[getPreviousSection().first] = LastEMS;
- LastEMS = LastMappingSymbols.lookup(Section);
-
+ LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo);
MCELFStreamer::ChangeSection(Section, Subsection);
+ auto LastMappingSymbol = LastMappingSymbols.find(Section);
+ if (LastMappingSymbol != LastMappingSymbols.end()) {
+ LastEMSInfo = std::move(LastMappingSymbol->second);
+ return;
+ }
+ LastEMSInfo.reset(new ElfMappingSymbolInfo(SMLoc(), nullptr, 0));
}
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst& Inst,
- const MCSubtargetInfo &STI) override {
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override {
if (IsThumb)
EmitThumbMappingSymbol();
else
@@ -507,15 +533,25 @@ public:
MCELFStreamer::EmitBytes(Data);
}
+ void FlushPendingMappingSymbol() {
+ if (!LastEMSInfo->hasInfo())
+ return;
+ ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
+ EmitMappingSymbol("$d", EMS->Loc, EMS->F, EMS->Offset);
+ EMS->resetInfo();
+ }
+
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
- if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value))
+ if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) {
if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
getContext().reportError(Loc, "relocated expression must be 32-bit");
return;
}
+ getOrCreateDataFragment();
+ }
EmitDataMappingSymbol();
MCELFStreamer::EmitValueImpl(Value, Size, Loc);
@@ -548,22 +584,54 @@ private:
EMS_Data
};
+ struct ElfMappingSymbolInfo {
+ explicit ElfMappingSymbolInfo(SMLoc Loc, MCFragment *F, uint64_t O)
+ : Loc(Loc), F(F), Offset(O), State(EMS_None) {}
+ void resetInfo() {
+ F = nullptr;
+ Offset = 0;
+ }
+ bool hasInfo() { return F != nullptr; }
+ SMLoc Loc;
+ MCFragment *F;
+ uint64_t Offset;
+ ElfMappingSymbol State;
+ };
+
void EmitDataMappingSymbol() {
- if (LastEMS == EMS_Data) return;
+ if (LastEMSInfo->State == EMS_Data)
+ return;
+ else if (LastEMSInfo->State == EMS_None) {
+ // This is a tentative symbol; it won't really be emitted until it's
+ // actually needed.
+ ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
+ auto *DF = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
+ if (!DF)
+ return;
+ EMS->Loc = SMLoc();
+ EMS->F = getCurrentFragment();
+ EMS->Offset = DF->getContents().size();
+ LastEMSInfo->State = EMS_Data;
+ return;
+ }
EmitMappingSymbol("$d");
- LastEMS = EMS_Data;
+ LastEMSInfo->State = EMS_Data;
}
void EmitThumbMappingSymbol() {
- if (LastEMS == EMS_Thumb) return;
+ if (LastEMSInfo->State == EMS_Thumb)
+ return;
+ FlushPendingMappingSymbol();
EmitMappingSymbol("$t");
- LastEMS = EMS_Thumb;
+ LastEMSInfo->State = EMS_Thumb;
}
void EmitARMMappingSymbol() {
- if (LastEMS == EMS_ARM) return;
+ if (LastEMSInfo->State == EMS_ARM)
+ return;
+ FlushPendingMappingSymbol();
EmitMappingSymbol("$a");
- LastEMS = EMS_ARM;
+ LastEMSInfo->State = EMS_ARM;
}
void EmitMappingSymbol(StringRef Name) {
@@ -576,6 +644,17 @@ private:
Symbol->setExternal(false);
}
+ void EmitMappingSymbol(StringRef Name, SMLoc Loc, MCFragment *F,
+ uint64_t Offset) {
+ auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+ Name + "." + Twine(MappingSymbolCounter++)));
+ EmitLabel(Symbol, Loc, F);
+ Symbol->setType(ELF::STT_NOTYPE);
+ Symbol->setBinding(ELF::STB_LOCAL);
+ Symbol->setExternal(false);
+ Symbol->setOffset(Offset);
+ }
+
void EmitThumbFunc(MCSymbol *Func) override {
getAssembler().setIsThumbFunc(Func);
EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
@@ -599,10 +678,12 @@ private:
void EmitFixup(const MCExpr *Expr, MCFixupKind Kind);
bool IsThumb;
- int64_t MappingSymbolCounter;
+ int64_t MappingSymbolCounter = 0;
+
+ DenseMap<const MCSection *, std::unique_ptr<ElfMappingSymbolInfo>>
+ LastMappingSymbols;
- DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
- ElfMappingSymbol LastEMS;
+ std::unique_ptr<ElfMappingSymbolInfo> LastEMSInfo;
// ARM Exception Handling Frame Information
MCSymbol *ExTab;
@@ -618,6 +699,7 @@ private:
SmallVector<uint8_t, 64> Opcodes;
UnwindOpcodeAssembler UnwindOpAsm;
};
+
} // end anonymous namespace
ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
@@ -627,33 +709,42 @@ ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+
void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
getStreamer().emitPersonality(Personality);
}
+
void ARMTargetELFStreamer::emitPersonalityIndex(unsigned Index) {
getStreamer().emitPersonalityIndex(Index);
}
+
void ARMTargetELFStreamer::emitHandlerData() {
getStreamer().emitHandlerData();
}
+
void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
int64_t Offset) {
getStreamer().emitSetFP(FpReg, SpReg, Offset);
}
+
void ARMTargetELFStreamer::emitMovSP(unsigned Reg, int64_t Offset) {
getStreamer().emitMovSP(Reg, Offset);
}
+
void ARMTargetELFStreamer::emitPad(int64_t Offset) {
getStreamer().emitPad(Offset);
}
+
void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool isVector) {
getStreamer().emitRegSave(RegList, isVector);
}
+
void ARMTargetELFStreamer::emitUnwindRaw(int64_t Offset,
const SmallVectorImpl<uint8_t> &Opcodes) {
getStreamer().emitUnwindRaw(Offset, Opcodes);
}
+
void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
assert(!Vendor.empty() && "Vendor cannot be empty.");
@@ -668,25 +759,31 @@ void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
CurrentVendor = Vendor;
}
+
void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
}
+
void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
StringRef Value) {
setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
}
+
void ARMTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
unsigned IntValue,
StringRef StringValue) {
setAttributeItems(Attribute, IntValue, StringValue,
/* OverwriteExisting= */ true);
}
+
void ARMTargetELFStreamer::emitArch(unsigned Value) {
Arch = Value;
}
+
void ARMTargetELFStreamer::emitObjectArch(unsigned Value) {
EmittedArch = Value;
}
+
void ARMTargetELFStreamer::emitArchDefaultAttributes() {
using namespace ARMBuildAttrs;
@@ -786,9 +883,11 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
break;
}
}
+
void ARMTargetELFStreamer::emitFPU(unsigned Value) {
FPU = Value;
}
+
void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
switch (FPU) {
case ARM::FK_VFP:
@@ -920,6 +1019,7 @@ void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
break;
}
}
+
size_t ARMTargetELFStreamer::calculateContentSize() const {
size_t Result = 0;
for (size_t i = 0; i < Contents.size(); ++i) {
@@ -944,6 +1044,7 @@ size_t ARMTargetELFStreamer::calculateContentSize() const {
}
return Result;
}
+
void ARMTargetELFStreamer::finishAttributeSection() {
// <format-version>
// [ <section-length> "vendor-name"
@@ -1093,9 +1194,9 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
const MCSymbolELF *Group = FnSection.getGroup();
if (Group)
Flags |= ELF::SHF_GROUP;
- MCSectionELF *EHSection =
- getContext().getELFSection(EHSecName, Type, Flags, 0, Group,
- FnSection.getUniqueID(), nullptr, &FnSection);
+ MCSectionELF *EHSection = getContext().getELFSection(
+ EHSecName, Type, Flags, 0, Group, FnSection.getUniqueID(),
+ static_cast<const MCSymbolELF *>(&Fn));
assert(EHSection && "Failed to get the required EH section");
@@ -1114,6 +1215,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER,
SectionKind::getData(), FnStart);
}
+
void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
MCDataFragment *Frag = getOrCreateDataFragment();
Frag->getFixups().push_back(MCFixup::create(Frag->getContents().size(), Expr,
@@ -1396,8 +1498,6 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
- }
-
}
-
+} // end namespace llvm
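The mapping-symbol rework above replaces a single LastEMS enum with per-section state objects. An illustrative reimplementation of just that bookkeeping, assuming std::map and std::string stand in for the MCSection keys (the tentative-$d optimization in the real patch is omitted):

    #include <cstdio>
    #include <map>
    #include <memory>
    #include <string>

    enum ElfMappingSymbol { EMS_None, EMS_ARM, EMS_Thumb, EMS_Data };
    struct SectionState { ElfMappingSymbol State = EMS_None; };

    class MappingTracker {
      std::map<std::string, std::unique_ptr<SectionState>> LastMappingSymbols;
      std::unique_ptr<SectionState> Cur = std::make_unique<SectionState>();
      std::string CurName = ".text";

    public:
      void changeSection(const std::string &Name) {
        // Stash the outgoing section's state; restore the incoming section's
        // state, or start fresh, as ChangeSection does with LastEMSInfo.
        LastMappingSymbols[CurName] = std::move(Cur);
        auto It = LastMappingSymbols.find(Name);
        Cur = It != LastMappingSymbols.end() ? std::move(It->second)
                                             : std::make_unique<SectionState>();
        CurName = Name;
      }

      void emitData() {
        if (Cur->State == EMS_Data)
          return; // already inside a data run; no duplicate $d
        std::printf("$d in %s\n", CurName.c_str());
        Cur->State = EMS_Data;
      }
    };

    int main() {
      MappingTracker T;
      T.emitData();               // prints "$d in .text"
      T.changeSection(".rodata");
      T.emitData();               // prints "$d in .rodata"
      T.changeSection(".text");   // .text remembers it ended in a data run,
      T.emitData();               // so this prints nothing
    }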
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index 3fe2302..831589b 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
@@ -15,55 +15,47 @@
namespace llvm {
namespace ARM {
enum Fixups {
- // fixup_arm_ldst_pcrel_12 - 12-bit PC relative relocation for symbol
- // addresses
+ // 12-bit PC relative relocation for symbol addresses
fixup_arm_ldst_pcrel_12 = FirstTargetFixupKind,
- // fixup_t2_ldst_pcrel_12 - Equivalent to fixup_arm_ldst_pcrel_12, with
- // the 16-bit halfwords reordered.
+ // Equivalent to fixup_arm_ldst_pcrel_12, with the 16-bit halfwords reordered.
fixup_t2_ldst_pcrel_12,
- // fixup_arm_pcrel_10_unscaled - 10-bit PC relative relocation for symbol
- // addresses used in LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
+ // 10-bit PC relative relocation for symbol addresses used in
+ // LDRD/LDRH/LDRB/etc. instructions. All bits are encoded.
fixup_arm_pcrel_10_unscaled,
- // fixup_arm_pcrel_10 - 10-bit PC relative relocation for symbol addresses
- // used in VFP instructions where the lower 2 bits are not encoded
- // (so it's encoded as an 8-bit immediate).
+ // 10-bit PC relative relocation for symbol addresses used in VFP instructions
+ // where the lower 2 bits are not encoded (so it's encoded as an 8-bit
+ // immediate).
fixup_arm_pcrel_10,
- // fixup_t2_pcrel_10 - Equivalent to fixup_arm_pcrel_10, accounting for
- // the short-swapped encoding of Thumb2 instructions.
+ // Equivalent to fixup_arm_pcrel_10, accounting for the short-swapped encoding
+ // of Thumb2 instructions.
fixup_t2_pcrel_10,
- // fixup_arm_pcrel_9 - 9-bit PC relative relocation for symbol addresses
- // used in VFP instructions where bit 0 not encoded (so it's encoded as an
- // 8-bit immediate).
+ // 9-bit PC relative relocation for symbol addresses used in VFP instructions
+ // where bit 0 is not encoded (so it's encoded as an 8-bit immediate).
fixup_arm_pcrel_9,
- // fixup_t2_pcrel_9 - Equivalent to fixup_arm_pcrel_9, accounting for
- // the short-swapped encoding of Thumb2 instructions.
+ // Equivalent to fixup_arm_pcrel_9, accounting for the short-swapped encoding
+ // of Thumb2 instructions.
fixup_t2_pcrel_9,
- // fixup_thumb_adr_pcrel_10 - 10-bit PC relative relocation for symbol
- // addresses where the lower 2 bits are not encoded (so it's encoded as an
- // 8-bit immediate).
+ // 10-bit PC relative relocation for symbol addresses where the lower 2 bits
+ // are not encoded (so it's encoded as an 8-bit immediate).
fixup_thumb_adr_pcrel_10,
- // fixup_arm_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
- // instruction.
+ // 12-bit PC relative relocation for the ADR instruction.
fixup_arm_adr_pcrel_12,
- // fixup_t2_adr_pcrel_12 - 12-bit PC relative relocation for the ADR
- // instruction.
+ // 12-bit PC relative relocation for the ADR instruction.
fixup_t2_adr_pcrel_12,
- // fixup_arm_condbranch - 24-bit PC relative relocation for conditional branch
- // instructions.
+ // 24-bit PC relative relocation for conditional branch instructions.
fixup_arm_condbranch,
- // fixup_arm_uncondbranch - 24-bit PC relative relocation for
- // branch instructions. (unconditional)
+ // 24-bit PC relative relocation for branch instructions. (unconditional)
fixup_arm_uncondbranch,
- // fixup_t2_condbranch - 20-bit PC relative relocation for Thumb2 direct
- // uconditional branch instructions.
+ // 20-bit PC relative relocation for Thumb2 direct conditional branch
+ // instructions.
fixup_t2_condbranch,
- // fixup_t2_uncondbranch - 20-bit PC relative relocation for Thumb2 direct
- // branch unconditional branch instructions.
+ // 20-bit PC relative relocation for Thumb2 direct unconditional branch
+ // instructions.
fixup_t2_uncondbranch,
- // fixup_arm_thumb_br - 12-bit fixup for Thumb B instructions.
+ // 12-bit fixup for Thumb B instructions.
fixup_arm_thumb_br,
// The following fixups handle the ARM BL instructions. These can be
@@ -75,46 +67,48 @@ enum Fixups {
// MachO does not draw a distinction between the two cases, so it will treat
// fixup_arm_uncondbl and fixup_arm_condbl as identical fixups.
- // fixup_arm_uncondbl - Fixup for unconditional ARM BL instructions.
+ // Fixup for unconditional ARM BL instructions.
fixup_arm_uncondbl,
- // fixup_arm_condbl - Fixup for ARM BL instructions with nontrivial
- // conditionalisation.
+ // Fixup for ARM BL instructions with nontrivial conditionalisation.
fixup_arm_condbl,
- // fixup_arm_blx - Fixup for ARM BLX instructions.
+ // Fixup for ARM BLX instructions.
fixup_arm_blx,
- // fixup_arm_thumb_bl - Fixup for Thumb BL instructions.
+ // Fixup for Thumb BL instructions.
fixup_arm_thumb_bl,
- // fixup_arm_thumb_blx - Fixup for Thumb BLX instructions.
+ // Fixup for Thumb BLX instructions.
fixup_arm_thumb_blx,
- // fixup_arm_thumb_cb - Fixup for Thumb branch instructions.
+ // Fixup for Thumb branch instructions.
fixup_arm_thumb_cb,
- // fixup_arm_thumb_cp - Fixup for Thumb load/store from constant pool instrs.
+ // Fixup for Thumb load/store from constant pool instrs.
fixup_arm_thumb_cp,
- // fixup_arm_thumb_bcc - Fixup for Thumb conditional branching instructions.
+ // Fixup for Thumb conditional branching instructions.
fixup_arm_thumb_bcc,
// The next two are for the movt/movw pair
// the 16-bit imm field is split into imm{15-12} and imm{11-0}
fixup_arm_movt_hi16, // :upper16:
fixup_arm_movw_lo16, // :lower16:
- fixup_t2_movt_hi16, // :upper16:
- fixup_t2_movw_lo16, // :lower16:
+ fixup_t2_movt_hi16, // :upper16:
+ fixup_t2_movw_lo16, // :lower16:
- // fixup_arm_mod_imm - Fixup for mod_imm
+ // Fixup for mod_imm
fixup_arm_mod_imm,
+ // Fixup for Thumb2 8-bit rotated operand
+ fixup_t2_so_imm,
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
};
}
-}
+} // end namespace llvm
#endif
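The one functional addition in this header is fixup_t2_so_imm, covering Thumb2 modified-immediate ("8-bit rotated") operands. A sketch of that immediate form, assuming the four byte-splat patterns plus the rotated-byte case from the architecture manual; the in-tree encoder is ARM_AM::getT2SOImmVal, whose packed result format is not reproduced here:

    #include <cassert>
    #include <cstdint>

    // True if V is representable as a Thumb2 modified immediate: 0x000000ab,
    // 0x00ab00ab, 0xab00ab00, 0xabababab, or an 8-bit value with its top bit
    // set, rotated right by 8..31 bits.
    static bool isT2SOImm(uint32_t V) {
      uint32_t B = V & 0xffu;
      if ((V & 0xffffff00u) == 0 || V == B * 0x00010001u ||
          V == B * 0x01000100u || V == B * 0x01010101u)
        return true;
      for (unsigned Rot = 8; Rot <= 31; ++Rot) {
        uint32_t C = (V << Rot) | (V >> (32 - Rot)); // undo the rotate-right
        if ((C & 0xffffff80u) == 0x80u)              // 8 bits wide, bit 7 set
          return true;
      }
      return false;
    }

    int main() {
      assert(isT2SOImm(0x000000ffu)); // plain byte
      assert(isT2SOImm(0x00ab00abu)); // halfword splat
      assert(isT2SOImm(0xab000000u)); // 0xab rotated right by 8
      assert(!isT2SOImm(0x00012345u));
      return 0;
    }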
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 559a4f8..f1f35f4 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -11,22 +11,33 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMFixupKinds.h"
#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
using namespace llvm;
@@ -36,9 +47,8 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
namespace {
+
class ARMMCCodeEmitter : public MCCodeEmitter {
- ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete;
- void operator=(const ARMMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCContext &CTX;
bool IsLittleEndian;
@@ -47,15 +57,18 @@ public:
ARMMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx, bool IsLittle)
: MCII(mcii), CTX(ctx), IsLittleEndian(IsLittle) {
}
-
- ~ARMMCCodeEmitter() override {}
+ ARMMCCodeEmitter(const ARMMCCodeEmitter &) = delete;
+ ARMMCCodeEmitter &operator=(const ARMMCCodeEmitter &) = delete;
+ ~ARMMCCodeEmitter() override = default;
bool isThumb(const MCSubtargetInfo &STI) const {
return STI.getFeatureBits()[ARM::ModeThumb];
}
+
bool isThumb2(const MCSubtargetInfo &STI) const {
return isThumb(STI) && STI.getFeatureBits()[ARM::FeatureThumb2];
}
+
bool isTargetMachO(const MCSubtargetInfo &STI) const {
const Triple &TT = STI.getTargetTriple();
return TT.isOSBinFormatMachO();
@@ -200,6 +213,7 @@ public:
case ARM_AM::ib: return 3;
}
}
+
/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
///
unsigned getShiftOp(ARM_AM::ShiftOpc ShOpc) const {
@@ -273,7 +287,6 @@ public:
unsigned getSOImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(Op);
// We expect MO to be an immediate or an expression,
@@ -326,7 +339,17 @@ public:
unsigned getT2SOImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- unsigned SoImm = MI.getOperand(Op).getImm();
+ const MCOperand &MO = MI.getOperand(Op);
+
+ // Support for fixups (MCFixup)
+ if (MO.isExpr()) {
+ const MCExpr *Expr = MO.getExpr();
+ // Fixups resolve to plain values that need to be encoded.
+ MCFixupKind Kind = MCFixupKind(ARM::fixup_t2_so_imm);
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+ return 0;
+ }
+ unsigned SoImm = MO.getImm();
unsigned Encoded = ARM_AM::getT2SOImmVal(SoImm);
assert(Encoded != ~0U && "Not a Thumb2 so_imm value?");
return Encoded;
@@ -432,18 +455,6 @@ public:
} // end anonymous namespace
-MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new ARMMCCodeEmitter(MCII, Ctx, true);
-}
-
-MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new ARMMCCodeEmitter(MCII, Ctx, false);
-}
-
/// NEONThumb2DataIPostEncoder - Post-process encoded NEON data-processing
/// instructions, and rewrite them to their Thumb2 form if we are currently in
/// Thumb2 mode.
@@ -550,7 +561,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
bool ARMMCCodeEmitter::
EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
unsigned &Imm, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ const MCSubtargetInfo &STI) const {
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
@@ -1515,7 +1526,7 @@ getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
uint32_t v = ~MO.getImm();
uint32_t lsb = countTrailingZeros(v);
uint32_t msb = (32 - countLeadingZeros (v)) - 1;
- assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
+ assert(v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
return lsb | (msb << 5);
}
@@ -1700,3 +1711,15 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
#include "ARMGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createARMLEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new ARMMCCodeEmitter(MCII, Ctx, true);
+}
+
+MCCodeEmitter *llvm::createARMBEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new ARMMCCodeEmitter(MCII, Ctx, false);
+}
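The getT2SOImmOpValue change above is the consumer of the new fixup kind: when the operand is still a symbolic expression at encoding time, the emitter records a fixup and encodes zero, leaving the backend to patch the bits later. A toy model of that defer-to-fixup pattern, with stand-in types in place of MCExpr/MCFixup:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Expr { const char *Name; };       // stand-in for MCExpr
    struct Fixup { unsigned Offset; const Expr *E; unsigned Kind; };
    constexpr unsigned fixup_t2_so_imm = 42; // stand-in fixup kind

    static uint32_t encodeT2SOImm(const Expr *E, uint32_t Imm,
                                  std::vector<Fixup> &Fixups) {
      if (E) {
        Fixups.push_back({0, E, fixup_t2_so_imm}); // resolved at layout time
        return 0;                                  // placeholder bits for now
      }
      return Imm; // a real emitter would run the modified-immediate encoder here
    }

    int main() {
      std::vector<Fixup> Fixups;
      Expr Sym{"external_const"};
      uint32_t Bits = encodeT2SOImm(&Sym, 0, Fixups);
      std::printf("bits=%u pending fixups=%zu\n", Bits, Fixups.size());
    }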
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 9e4d202..b8a8b1f 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "ARMMCTargetDesc.h"
#include "ARMBaseInfo.h"
#include "ARMMCAsmInfo.h"
-#include "ARMMCTargetDesc.h"
#include "InstPrinter/ARMInstPrinter.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCELFStreamer.h"
@@ -260,18 +260,37 @@ public:
return false;
int64_t Imm = Inst.getOperand(0).getImm();
- // FIXME: This is not right for thumb.
Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
return true;
}
};
+class ThumbMCInstrAnalysis : public ARMMCInstrAnalysis {
+public:
+ ThumbMCInstrAnalysis(const MCInstrInfo *Info) : ARMMCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const override {
+ // We only handle PCRel branches for now.
+ if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
+ return false;
+
+ int64_t Imm = Inst.getOperand(0).getImm();
+ Target = Addr+Imm+4; // In Thumb mode the PC is always off by 4 bytes.
+ return true;
+ }
+};
+
}
static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
return new ARMMCInstrAnalysis(Info);
}
+static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new ThumbMCInstrAnalysis(Info);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeARMTargetMC() {
for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
@@ -289,9 +308,6 @@ extern "C" void LLVMInitializeARMTargetMC() {
TargetRegistry::RegisterMCSubtargetInfo(*T,
ARM_MC::createARMMCSubtargetInfo);
- // Register the MC instruction analyzer.
- TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
-
TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
TargetRegistry::RegisterCOFFStreamer(*T, createARMWinCOFFStreamer);
TargetRegistry::RegisterMachOStreamer(*T, createARMMachOStreamer);
@@ -313,6 +329,12 @@ extern "C" void LLVMInitializeARMTargetMC() {
TargetRegistry::RegisterMCRelocationInfo(*T, createARMMCRelocationInfo);
}
+ // Register the MC instruction analyzer.
+ for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget()})
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
+ for (Target *T : {&getTheThumbLETarget(), &getTheThumbBETarget()})
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createThumbMCInstrAnalysis);
+
// Register the MC Code Emitter
for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()})
TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
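The reason a separate Thumb analyzer is registered above is the per-mode PC bias: for PC-relative branches the architectural PC reads ahead of the instruction by 8 bytes in ARM state and 4 in Thumb state. A tiny sketch of the target computation both evaluateBranch overrides perform:

    #include <cstdint>
    #include <cstdio>

    static uint64_t branchTarget(uint64_t Addr, int64_t Imm, bool IsThumb) {
      return Addr + Imm + (IsThumb ? 4 : 8); // per-mode PC read-ahead
    }

    int main() {
      std::printf("ARM:   0x%llx\n",
                  (unsigned long long)branchTarget(0x1000, 0x20, false)); // 0x1028
      std::printf("Thumb: 0x%llx\n",
                  (unsigned long long)branchTarget(0x1000, 0x20, true));  // 0x1024
    }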
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 482bcf9..5516a1b 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -1,4 +1,4 @@
-//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//===- ARMMachORelocationInfo.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,17 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm-c/Disassembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+#include "llvm/MC/MCExpr.h"
using namespace llvm;
-using namespace object;
namespace {
+
class ARMMachORelocationInfo : public MCRelocationInfo {
public:
ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
@@ -35,7 +35,8 @@ public:
}
}
};
-} // End unnamed namespace
+
+} // end anonymous namespace
/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index b77181f..4a8139d 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -7,10 +7,11 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -21,7 +22,6 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index e49f1a3..4a94318 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -10,20 +10,25 @@
// This file implements the ARMTargetStreamer class.
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/MapVector.h"
+
+#include "ARMTargetMachine.h"
#include "llvm/MC/ConstantPools.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/TargetParser.h"
using namespace llvm;
+
//
// ARMTargetStreamer Implementation
//
+
ARMTargetStreamer::ARMTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S), ConstantPools(new AssemblerConstantPools()) {}
-ARMTargetStreamer::~ARMTargetStreamer() {}
+ARMTargetStreamer::~ARMTargetStreamer() = default;
// The constant pool handling is shared by all ARMTargetStreamer
// implementations.
@@ -74,5 +79,180 @@ void ARMTargetStreamer::finishAttributeSection() {}
void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {}
void
ARMTargetStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) {}
-
void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {}
+
+static ARMBuildAttrs::CPUArch getArchForCPU(const MCSubtargetInfo &STI) {
+ if (STI.getCPU() == "xscale")
+ return ARMBuildAttrs::v5TEJ;
+
+ if (STI.hasFeature(ARM::HasV8Ops)) {
+ if (STI.hasFeature(ARM::FeatureRClass))
+ return ARMBuildAttrs::v8_R;
+ return ARMBuildAttrs::v8_A;
+ } else if (STI.hasFeature(ARM::HasV8MMainlineOps))
+ return ARMBuildAttrs::v8_M_Main;
+ else if (STI.hasFeature(ARM::HasV7Ops)) {
+ if (STI.hasFeature(ARM::FeatureMClass) && STI.hasFeature(ARM::FeatureDSP))
+ return ARMBuildAttrs::v7E_M;
+ return ARMBuildAttrs::v7;
+ } else if (STI.hasFeature(ARM::HasV6T2Ops))
+ return ARMBuildAttrs::v6T2;
+ else if (STI.hasFeature(ARM::HasV8MBaselineOps))
+ return ARMBuildAttrs::v8_M_Base;
+ else if (STI.hasFeature(ARM::HasV6MOps))
+ return ARMBuildAttrs::v6S_M;
+ else if (STI.hasFeature(ARM::HasV6Ops))
+ return ARMBuildAttrs::v6;
+ else if (STI.hasFeature(ARM::HasV5TEOps))
+ return ARMBuildAttrs::v5TE;
+ else if (STI.hasFeature(ARM::HasV5TOps))
+ return ARMBuildAttrs::v5T;
+ else if (STI.hasFeature(ARM::HasV4TOps))
+ return ARMBuildAttrs::v4T;
+ else
+ return ARMBuildAttrs::v4;
+}
+
+static bool isV8M(const MCSubtargetInfo &STI) {
+ // Note that v8M Baseline is a subset of v6T2!
+ return (STI.hasFeature(ARM::HasV8MBaselineOps) &&
+ !STI.hasFeature(ARM::HasV6T2Ops)) ||
+ STI.hasFeature(ARM::HasV8MMainlineOps);
+}
+
+/// Emit the build attributes that only depend on the hardware that we expect
+/// to be available, and not on the ABI, or any source-language choices.
+void ARMTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
+ switchVendor("aeabi");
+
+ const StringRef CPUString = STI.getCPU();
+ if (!CPUString.empty() && !CPUString.startswith("generic")) {
+ // FIXME: remove krait check when GNU tools support krait cpu
+ if (STI.hasFeature(ARM::ProcKrait)) {
+ emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9");
+ // We consider krait as a "cortex-a9" + hwdiv CPU
+ // Enable hwdiv through ".arch_extension idiv"
+ if (STI.hasFeature(ARM::FeatureHWDivThumb) ||
+ STI.hasFeature(ARM::FeatureHWDivARM))
+ emitArchExtension(ARM::AEK_HWDIVTHUMB | ARM::AEK_HWDIVARM);
+ } else {
+ emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+ }
+ }
+
+ emitAttribute(ARMBuildAttrs::CPU_arch, getArchForCPU(STI));
+
+ if (STI.hasFeature(ARM::FeatureAClass)) {
+ emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::ApplicationProfile);
+ } else if (STI.hasFeature(ARM::FeatureRClass)) {
+ emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::RealTimeProfile);
+ } else if (STI.hasFeature(ARM::FeatureMClass)) {
+ emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::MicroControllerProfile);
+ }
+
+ emitAttribute(ARMBuildAttrs::ARM_ISA_use, STI.hasFeature(ARM::FeatureNoARM)
+ ? ARMBuildAttrs::Not_Allowed
+ : ARMBuildAttrs::Allowed);
+
+ if (isV8M(STI)) {
+ emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumbDerived);
+ } else if (STI.hasFeature(ARM::FeatureThumb2)) {
+ emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumb32);
+ } else if (STI.hasFeature(ARM::HasV4TOps)) {
+ emitAttribute(ARMBuildAttrs::THUMB_ISA_use, ARMBuildAttrs::Allowed);
+ }
+
+ if (STI.hasFeature(ARM::FeatureNEON)) {
+ /* NEON is not exactly a VFP architecture, but GAS emits one of
+ * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+ if (STI.hasFeature(ARM::FeatureFPARMv8)) {
+ if (STI.hasFeature(ARM::FeatureCrypto))
+ emitFPU(ARM::FK_CRYPTO_NEON_FP_ARMV8);
+ else
+ emitFPU(ARM::FK_NEON_FP_ARMV8);
+ } else if (STI.hasFeature(ARM::FeatureVFP4))
+ emitFPU(ARM::FK_NEON_VFPV4);
+ else
+ emitFPU(STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_NEON_FP16
+ : ARM::FK_NEON);
+ // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
+ if (STI.hasFeature(ARM::HasV8Ops))
+ emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+ STI.hasFeature(ARM::HasV8_1aOps)
+ ? ARMBuildAttrs::AllowNeonARMv8_1a
+ : ARMBuildAttrs::AllowNeonARMv8);
+ } else {
+ if (STI.hasFeature(ARM::FeatureFPARMv8))
+ // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one
+ // FPU, but there are two different names for it depending on the CPU.
+ emitFPU(STI.hasFeature(ARM::FeatureD16)
+ ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV5_SP_D16
+ : ARM::FK_FPV5_D16)
+ : ARM::FK_FP_ARMV8);
+ else if (STI.hasFeature(ARM::FeatureVFP4))
+ emitFPU(STI.hasFeature(ARM::FeatureD16)
+ ? (STI.hasFeature(ARM::FeatureVFPOnlySP) ? ARM::FK_FPV4_SP_D16
+ : ARM::FK_VFPV4_D16)
+ : ARM::FK_VFPV4);
+ else if (STI.hasFeature(ARM::FeatureVFP3))
+ emitFPU(
+ STI.hasFeature(ARM::FeatureD16)
+ // +d16
+ ? (STI.hasFeature(ARM::FeatureVFPOnlySP)
+ ? (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3XD_FP16
+ : ARM::FK_VFPV3XD)
+ : (STI.hasFeature(ARM::FeatureFP16)
+ ? ARM::FK_VFPV3_D16_FP16
+ : ARM::FK_VFPV3_D16))
+ // -d16
+ : (STI.hasFeature(ARM::FeatureFP16) ? ARM::FK_VFPV3_FP16
+ : ARM::FK_VFPV3));
+ else if (STI.hasFeature(ARM::FeatureVFP2))
+ emitFPU(ARM::FK_VFPV2);
+ }
+
+ // ABI_HardFP_use attribute to indicate single precision FP.
+ if (STI.hasFeature(ARM::FeatureVFPOnlySP))
+ emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
+ ARMBuildAttrs::HardFPSinglePrecision);
+
+ if (STI.hasFeature(ARM::FeatureFP16))
+ emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+ if (STI.hasFeature(ARM::FeatureMP))
+ emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+
+ // Hardware divide in ARM mode is part of base arch, starting from ARMv8.
+ // If only Thumb hwdiv is present, it must also be in base arch (ARMv7-R/M).
+ // It is not possible to produce DisallowDIV: if hwdiv is present in the base
+ // arch, supplying -hwdiv downgrades the effective arch, via ClearImpliedBits.
+ // AllowDIVExt is only emitted if hwdiv isn't available in the base arch;
+ // otherwise, the default value (AllowDIVIfExists) applies.
+ if (STI.hasFeature(ARM::FeatureHWDivARM) && !STI.hasFeature(ARM::HasV8Ops))
+ emitAttribute(ARMBuildAttrs::DIV_use, ARMBuildAttrs::AllowDIVExt);
+
+ if (STI.hasFeature(ARM::FeatureDSP) && isV8M(STI))
+ emitAttribute(ARMBuildAttrs::DSP_extension, ARMBuildAttrs::Allowed);
+
+ if (STI.hasFeature(ARM::FeatureStrictAlign))
+ emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+ ARMBuildAttrs::Not_Allowed);
+ else
+ emitAttribute(ARMBuildAttrs::CPU_unaligned_access,
+ ARMBuildAttrs::Allowed);
+
+ if (STI.hasFeature(ARM::FeatureTrustZone) &&
+ STI.hasFeature(ARM::FeatureVirtualization))
+ emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowTZVirtualization);
+ else if (STI.hasFeature(ARM::FeatureTrustZone))
+ emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowTZ);
+ else if (STI.hasFeature(ARM::FeatureVirtualization))
+ emitAttribute(ARMBuildAttrs::Virtualization_use,
+ ARMBuildAttrs::AllowVirtualization);
+}
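emitTargetAttributes drives everything off subtarget feature bits. As one digestible slice, a reduced model of the CPU_arch_profile selection near the top of the function, with booleans standing in for the FeatureAClass/RClass/MClass bits and plain enumerators in place of ARMBuildAttrs:

    #include <cstdio>

    enum Profile { NoProfile, ApplicationProfile, RealTimeProfile,
                   MicroControllerProfile };

    static Profile pickProfile(bool AClass, bool RClass, bool MClass) {
      if (AClass) return ApplicationProfile;     // the A profile wins first
      if (RClass) return RealTimeProfile;
      if (MClass) return MicroControllerProfile;
      return NoProfile; // no CPU_arch_profile attribute is emitted at all
    }

    int main() {
      std::printf("%d\n", pickProfile(false, true, false)); // RealTimeProfile
    }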
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index 173cc93..d3ab83b 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -14,12 +14,14 @@
#include "ARMUnwindOpAsm.h"
#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
using namespace llvm;
namespace {
+
/// UnwindOpcodeStreamer - The simple wrapper over SmallVector to emit bytes
/// with MSB to LSB per uint32_t ordering. For example, the first byte will
/// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
@@ -27,20 +29,19 @@ namespace {
class UnwindOpcodeStreamer {
private:
SmallVectorImpl<uint8_t> &Vec;
- size_t Pos;
+ size_t Pos = 3;
public:
- UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) {
- }
+ UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V) {}
/// Emit the byte in MSB to LSB per uint32_t order.
- inline void EmitByte(uint8_t elem) {
+ void EmitByte(uint8_t elem) {
Vec[Pos] = elem;
Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u);
}
/// Emit the size prefix.
- inline void EmitSize(size_t Size) {
+ void EmitSize(size_t Size) {
size_t SizeInWords = (Size + 3) / 4;
assert(SizeInWords <= 0x100u &&
"Only 256 additional words are allowed for unwind opcodes");
@@ -48,19 +49,20 @@ namespace {
}
/// Emit the personality index prefix.
- inline void EmitPersonalityIndex(unsigned PI) {
+ void EmitPersonalityIndex(unsigned PI) {
assert(PI < ARM::EHABI::NUM_PERSONALITY_INDEX &&
"Invalid personality prefix");
EmitByte(ARM::EHABI::EHT_COMPACT | PI);
}
/// Fill the rest of bytes with FINISH opcode.
- inline void FillFinishOpcode() {
+ void FillFinishOpcode() {
while (Pos < Vec.size())
EmitByte(ARM::EHABI::UNWIND_OPCODE_FINISH);
}
};
-}
+
+} // end anonymous namespace
void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
if (RegSave == 0u)
@@ -153,7 +155,6 @@ void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
SmallVectorImpl<uint8_t> &Result) {
-
UnwindOpcodeStreamer OpStreamer(Result);
if (HasPersonality) {
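The Pos update kept in UnwindOpcodeStreamer::EmitByte above, Pos = ((Pos ^ 0x3u) + 1) ^ 0x3u, is the whole MSB-to-LSB trick: xor-ing with 3 maps the walk 3,2,1,0, 7,6,5,4, ... onto a plain increment, so bytes fill each 32-bit word from its most significant end. A quick standalone check:

    #include <cstdio>

    int main() {
      unsigned Pos = 3;
      for (int i = 0; i < 8; ++i) {
        std::printf("%u ", Pos); // prints: 3 2 1 0 7 6 5 4
        Pos = ((Pos ^ 0x3u) + 1) ^ 0x3u;
      }
      std::printf("\n");
      return 0;
    }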
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index e0c113e..a7bfbdf 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -16,8 +16,8 @@
#define LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ARMEHABI.h"
-#include "llvm/Support/DataTypes.h"
+#include <cstddef>
+#include <cstdint>
namespace llvm {
@@ -25,13 +25,12 @@ class MCSymbol;
class UnwindOpcodeAssembler {
private:
- llvm::SmallVector<uint8_t, 32> Ops;
- llvm::SmallVector<unsigned, 8> OpBegins;
- bool HasPersonality;
+ SmallVector<uint8_t, 32> Ops;
+ SmallVector<unsigned, 8> OpBegins;
+ bool HasPersonality = false;
public:
- UnwindOpcodeAssembler()
- : HasPersonality(0) {
+ UnwindOpcodeAssembler() {
OpBegins.push_back(0);
}
@@ -40,12 +39,12 @@ public:
Ops.clear();
OpBegins.clear();
OpBegins.push_back(0);
- HasPersonality = 0;
+ HasPersonality = false;
}
/// Set the personality
void setPersonality(const MCSymbol *Per) {
- HasPersonality = 1;
+ HasPersonality = true;
}
/// Emit unwind opcodes for .save directives
@@ -88,6 +87,6 @@ private:
}
};
-} // namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_ARM_MCTARGETDESC_ARMUNWINDOPASM_H
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
index 166c04b..f74fb2e 100644
--- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -9,33 +9,41 @@
#include "MCTargetDesc/ARMFixupKinds.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
namespace {
+
class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
public:
ARMWinCOFFObjectWriter(bool Is64Bit)
: MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) {
assert(!Is64Bit && "AArch64 support not yet implemented");
}
- ~ARMWinCOFFObjectWriter() override {}
- unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsCrossSection,
+ ~ARMWinCOFFObjectWriter() override = default;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
const MCAsmBackend &MAB) const override;
bool recordRelocation(const MCFixup &) const override;
};
-unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
+} // end anonymous namespace
+
+unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsCrossSection,
const MCAsmBackend &MAB) const {
@@ -79,13 +87,13 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16;
}
-}
namespace llvm {
+
MCObjectWriter *createARMWinCOFFObjectWriter(raw_pwrite_stream &OS,
bool Is64Bit) {
MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit);
return createWinCOFFObjectWriter(MOTW, OS);
}
-}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 9953c61..5709b4e 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -12,13 +12,35 @@
//===----------------------------------------------------------------------===//
#include "Thumb1FrameLowering.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb1InstrInfo.h"
+#include "ThumbRegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <iterator>
+#include <vector>
using namespace llvm;
@@ -61,13 +83,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// ADJCALLSTACKUP -> add, sp, sp, amount
MachineInstr &Old = *I;
DebugLoc dl = Old.getDebugLoc();
- unsigned Amount = Old.getOperand(0).getImm();
+ unsigned Amount = TII.getFrameSize(Old);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- unsigned Align = getStackAlignment();
- Amount = (Amount+Align-1)/Align*Align;
+ Amount = alignTo(Amount, getStackAlignment());
// Replace the pseudo instruction with a new instruction...
unsigned Opc = Old.getOpcode();
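The alignTo call above is a drop-in for the old round-up arithmetic. A minimal check that the two agree, assuming (as stack alignments guarantee) that Align is a power of two:

    #include <cassert>
    #include <cstdint>

    static uint64_t roundUpOld(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align; // the removed formulation
    }

    static uint64_t alignToNew(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) & ~(Align - 1);  // power-of-two fast path
    }

    int main() {
      for (uint64_t Align : {4u, 8u, 16u})
        for (uint64_t V = 0; V < 64; ++V)
          assert(roundUpOld(V, Align) == alignToNew(V, Align));
      return 0;
    }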
@@ -215,7 +236,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R12:
if (STI.splitFramePushPop(MF))
break;
- // fallthough
+ LLVM_FALLTHROUGH;
case ARM::R0:
case ARM::R1:
case ARM::R2:
@@ -238,9 +259,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
FramePtrOffsetInBlock +=
MFI.getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize;
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
- .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
- .setMIFlags(MachineInstr::FrameSetup));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+ .addReg(ARM::SP)
+ .addImm(FramePtrOffsetInBlock / 4)
+ .setMIFlags(MachineInstr::FrameSetup)
+ .add(predOps(ARMCC::AL));
if(FramePtrOffsetInBlock) {
CFAOffset += FramePtrOffsetInBlock;
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
@@ -336,14 +359,19 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
// will be allocated after this, so we can still use the base pointer
// to reference locals.
if (RegInfo->hasBasePointer(MF))
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
- .addReg(ARM::SP));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), BasePtr)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
// If the frame has variable sized objects then the epilogue must restore
// the sp from fp. We can assume there's an FP here since hasFP already
// checks for hasVarSizedObjects.
if (MFI.hasVarSizedObjects())
AFI->setShouldRestoreSPFromFP(true);
+
+ // In some cases, virtual registers have been introduced, e.g. by uses of
+ // emitThumbRegPlusImmInReg.
+ MF.getProperties().reset(MachineFunctionProperties::Property::NoVRegs);
}
static bool isCSRestore(MachineInstr &MI, const MCPhysReg *CSRegs) {
@@ -408,13 +436,13 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
"No scratch register to restore SP from FP!");
emitThumbRegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
TII, *RegInfo);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(ARM::R4));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(ARM::R4)
+ .add(predOps(ARMCC::AL));
} else
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),
- ARM::SP)
- .addReg(FramePtr));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+ .addReg(FramePtr)
+ .add(predOps(ARMCC::AL));
} else {
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET &&
&MBB.front() != &*MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) {
@@ -493,12 +521,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET)
return true;
MachineInstrBuilder MIB =
- AddDefaultPred(
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET)));
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))
+ .add(predOps(ARMCC::AL));
// Copy implicit ops and popped registers, if any.
for (auto MO: MBBI->operands())
if (MO.isReg() && (MO.isImplicit() || MO.isDef()))
- MIB.addOperand(MO);
+ MIB.add(MO);
MIB.addReg(ARM::PC, RegState::Define);
// Erase the old instruction (tBX_RET or tPOP).
MBB.erase(MBBI);
@@ -507,14 +535,14 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
// Look for a temporary register to use.
// First, compute the liveness information.
- LivePhysRegs UsedRegs(STI.getRegisterInfo());
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ LivePhysRegs UsedRegs(TRI);
UsedRegs.addLiveOuts(MBB);
// The semantic of pristines changed recently and now,
// the callee-saved registers that are touched in the function
// are not part of the pristines set anymore.
// Add those callee-saved now.
- const TargetRegisterInfo *TRI = STI.getRegisterInfo();
- const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i)
UsedRegs.addReg(CSRegs[i]);
@@ -533,18 +561,17 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
// And some temporary register, just in case.
unsigned TemporaryReg = 0;
BitVector PopFriendly =
- TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID));
+ TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID));
assert(PopFriendly.any() && "No allocatable pop-friendly register?!");
// Rebuild the GPRs from the high registers because they are removed
// from the GPR reg class for thumb1.
BitVector GPRsNoLRSP =
- TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID));
+ TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::hGPRRegClassID));
GPRsNoLRSP |= PopFriendly;
GPRsNoLRSP.reset(ARM::LR);
GPRsNoLRSP.reset(ARM::SP);
GPRsNoLRSP.reset(ARM::PC);
- for (int Register = GPRsNoLRSP.find_first(); Register != -1;
- Register = GPRsNoLRSP.find_next(Register)) {
+ for (unsigned Register : GPRsNoLRSP.set_bits()) {
if (!UsedRegs.contains(Register)) {
// Remember the first pop-friendly register and exit.
if (PopFriendly.test(Register)) {
@@ -566,22 +593,23 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
if (TemporaryReg) {
assert(!PopReg && "Unnecessary MOV is about to be inserted");
PopReg = PopFriendly.find_first();
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(TemporaryReg, RegState::Define)
- .addReg(PopReg, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(TemporaryReg, RegState::Define)
+ .addReg(PopReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
}
if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) {
// We couldn't use the direct restoration above, so
// perform the opposite conversion: tPOP_RET to tPOP.
MachineInstrBuilder MIB =
- AddDefaultPred(
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP)));
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL));
bool Popped = false;
for (auto MO: MBBI->operands())
if (MO.isReg() && (MO.isImplicit() || MO.isDef()) &&
MO.getReg() != ARM::PC) {
- MIB.addOperand(MO);
+ MIB.add(MO);
if (!MO.isImplicit())
Popped = true;
}
@@ -590,23 +618,27 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB,
MBB.erase(MIB.getInstr());
// Erase the old instruction.
MBB.erase(MBBI);
- MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET)));
+ MBBI = BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET))
+ .add(predOps(ARMCC::AL));
}
assert(PopReg && "Do not know how to get LR");
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
.addReg(PopReg, RegState::Define);
emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize);
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(ARM::LR, RegState::Define)
- .addReg(PopReg, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(ARM::LR, RegState::Define)
+ .addReg(PopReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
if (TemporaryReg)
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
- .addReg(PopReg, RegState::Define)
- .addReg(TemporaryReg, RegState::Kill));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr))
+ .addReg(PopReg, RegState::Define)
+ .addReg(TemporaryReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
return true;
}
@@ -666,13 +698,14 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
CopyRegs.insert(ArgReg);
// Push the low registers and lr
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!LoRegsToSave.empty()) {
- MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH));
- AddDefaultPred(MIB);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) {
if (LoRegsToSave.count(Reg)) {
- bool isKill = !MF.getRegInfo().isLiveIn(Reg);
- if (isKill)
+ bool isKill = !MRI.isLiveIn(Reg);
+ if (isKill && !MRI.isReserved(Reg))
MBB.addLiveIn(Reg);
MIB.addReg(Reg, getKillRegState(isKill));
@@ -708,22 +741,21 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
// Create the PUSH, but don't insert it yet (the MOVs need to come first).
- MachineInstrBuilder PushMIB = BuildMI(MF, DL, TII.get(ARM::tPUSH));
- AddDefaultPred(PushMIB);
+ MachineInstrBuilder PushMIB =
+ BuildMI(MF, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
SmallVector<unsigned, 4> RegsToPush;
while (HiRegToSave != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
if (HiRegsToSave.count(*HiRegToSave)) {
- bool isKill = !MF.getRegInfo().isLiveIn(*HiRegToSave);
- if (isKill)
+ bool isKill = !MRI.isLiveIn(*HiRegToSave);
+ if (isKill && !MRI.isReserved(*HiRegToSave))
MBB.addLiveIn(*HiRegToSave);
// Emit a MOV from the high reg to the low reg.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
- MIB.addReg(*CopyReg, RegState::Define);
- MIB.addReg(*HiRegToSave, getKillRegState(isKill));
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr))
+ .addReg(*CopyReg, RegState::Define)
+ .addReg(*HiRegToSave, getKillRegState(isKill))
+ .add(predOps(ARMCC::AL));
// Record the register that must be added to the PUSH.
RegsToPush.push_back(*CopyReg);
@@ -735,7 +767,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
}
// Add the low registers to the PUSH, in ascending order.
- for (unsigned Reg : reverse(RegsToPush))
+ for (unsigned Reg : llvm::reverse(RegsToPush))
PushMIB.addReg(Reg, RegState::Kill);
// Insert the PUSH instruction after the MOVs.
@@ -817,19 +849,18 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
findNextOrderedReg(std::begin(AllCopyRegs), CopyRegs, AllCopyRegsEnd);
// Create the POP instruction.
- MachineInstrBuilder PopMIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPOP));
- AddDefaultPred(PopMIB);
+ MachineInstrBuilder PopMIB =
+ BuildMI(MBB, MI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
while (HiRegToRestore != AllHighRegsEnd && CopyReg != AllCopyRegsEnd) {
// Add the low register to the POP.
PopMIB.addReg(*CopyReg, RegState::Define);
// Create the MOV from low to high register.
- MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr));
- MIB.addReg(*HiRegToRestore, RegState::Define);
- MIB.addReg(*CopyReg, RegState::Kill);
- AddDefaultPred(MIB);
+ BuildMI(MBB, MI, DL, TII.get(ARM::tMOVr))
+ .addReg(*HiRegToRestore, RegState::Define)
+ .addReg(*CopyReg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
CopyReg = findNextOrderedReg(++CopyReg, CopyRegs, AllCopyRegsEnd);
HiRegToRestore =
@@ -837,11 +868,8 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
}
}
-
-
-
- MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP));
- AddDefaultPred(MIB);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
bool NeedsPop = false;
for (unsigned i = CSI.size(); i != 0; --i) {
@@ -859,6 +887,16 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
// ARMv4T requires BX, see emitEpilogue
if (!STI.hasV5TOps())
continue;
+ // Tailcall optimization failed; change TCRETURN to a tBL
+ if (MI->getOpcode() == ARM::TCRETURNdi ||
+ MI->getOpcode() == ARM::TCRETURNri) {
+ unsigned Opcode = MI->getOpcode() == ARM::TCRETURNdi
+ ? ARM::tBL : ARM::tBLXr;
+ MachineInstrBuilder BL = BuildMI(MF, DL, TII.get(Opcode));
+ BL.add(predOps(ARMCC::AL));
+ BL.add(MI->getOperand(0));
+ MBB.insert(MI, &*BL);
+ }
Reg = ARM::PC;
(*MIB).setDesc(TII.get(ARM::tPOP_RET));
if (MI != MBB.end())
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index 4b4fbaa..3a3920a 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "ARMSubtarget.h"
#include "Thumb1InstrInfo.h"
+#include "ARMSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -24,8 +24,8 @@ using namespace llvm;
Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
: ARMBaseInstrInfo(STI), RI() {}
-/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
-void Thumb1InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+/// Return the noop instruction to use for a noop.
+void Thumb1InstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(ARM::tMOVr);
NopInst.addOperand(MCOperand::createReg(ARM::R8));
NopInst.addOperand(MCOperand::createReg(ARM::R8));
@@ -50,20 +50,29 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (st.hasV6Ops() || ARM::hGPRRegClass.contains(SrcReg)
|| !ARM::tGPRRegClass.contains(DestReg))
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc)));
+ BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
else {
- // FIXME: The performance consequences of this are going to be atrocious.
- // Some things to try that should be better:
- // * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
- // * 'movs $dst, $src' if cpsr isn't live
- // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html
+ // FIXME: Can also use 'mov hi, $src; mov $dst, hi',
+ // with hi as either r10 or r11.
+
+ const TargetRegisterInfo *RegInfo = st.getRegisterInfo();
+ if (MBB.computeRegisterLiveness(RegInfo, ARM::CPSR, I)
+ == MachineBasicBlock::LQR_Dead) {
+ BuildMI(MBB, I, DL, get(ARM::tMOVSr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ ->addRegisterDead(ARM::CPSR, RegInfo);
+ return;
+ }
// 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPUSH)))
- .addReg(SrcReg, getKillRegState(KillSrc));
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tPOP)))
- .addReg(DestReg, getDefRegState(true));
+ BuildMI(MBB, I, DL, get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ BuildMI(MBB, I, DL, get(ARM::tPOP))
+ .add(predOps(ARMCC::AL))
+ .addReg(DestReg, getDefRegState(true));
}
}
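The net effect of the copyPhysReg change above, for lo-to-lo copies on pre-v6 Thumb1 cores where a plain `mov lo, lo` is unpredictable, is one of two sequences; a sketch:

// When CPSR is provably dead at the copy point (computeRegisterLiveness
// returns LQR_Dead), a flag-setting move is safe and cheapest:
//   movs dst, src     ; CPSR is marked as a dead def
//
// Otherwise the value is bounced through the stack so no flags change:
//   push {src}
//   pop  {dst}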
@@ -87,9 +96,12 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::tSTRspi))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
}
@@ -113,8 +125,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
}
}
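A note on the transformation that recurs throughout these Thumb hunks: the AddDefaultPred/AddDefaultCC/AddDefaultT1CC wrappers are replaced by operands appended explicitly through MachineInstrBuilder::add(). A minimal sketch of the new idiom, written as a fragment in the same context as the builders above and assuming the predOps()/condCodeOp()/t1CondCodeOp() helpers declared in ARMBaseInstrInfo.h:

// Old style: a free function mutates the builder after construction.
//   AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
//                      .addReg(SrcReg, getKillRegState(KillSrc)));

// New style: predOps(ARMCC::AL) yields the predicate operand pair
// (condition-code immediate plus predicate register) for an
// unconditional instruction, keeping the builder chain linear.
BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc))
    .add(predOps(ARMCC::AL));

// condCodeOp() likewise appends the optional cc_out operand (no register
// by default), and t1CondCodeOp() is its Thumb1 variant defining CPSR.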
diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index 931914a..e8d9a9c 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -25,8 +25,8 @@ class Thumb1InstrInfo : public ARMBaseInstrInfo {
public:
explicit Thumb1InstrInfo(const ARMSubtarget &STI);
- /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
- void getNoopForMachoTarget(MCInst &NopInst) const override;
+ /// Return the noop instruction to use for a noop.
+ void getNoop(MCInst &NopInst) const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
  // if there is no such opcode.
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
index d01fc8c..04bdd91 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -9,13 +9,26 @@
#include "ARM.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include <cassert>
+#include <new>
+
using namespace llvm;
#define DEBUG_TYPE "thumb2-it"
@@ -24,16 +37,18 @@ STATISTIC(NumITs, "Number of IT blocks inserted");
STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
namespace {
+
class Thumb2ITBlockPass : public MachineFunctionPass {
public:
static char ID;
- Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
bool restrictIT;
const Thumb2InstrInfo *TII;
const TargetRegisterInfo *TRI;
ARMFunctionInfo *AFI;
+ Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
+
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -52,8 +67,10 @@ namespace {
SmallSet<unsigned, 4> &Uses);
bool InsertITInstructions(MachineBasicBlock &MBB);
};
+
char Thumb2ITBlockPass::ID = 0;
-}
+
+} // end anonymous namespace
/// TrackDefUses - Tracking what registers are being defined and used by
/// instructions in the IT block. This also tracks "dependencies", i.e. uses
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 1c731d6..9125be9 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -1,4 +1,4 @@
-//===-- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information -------------===//
+//===- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,16 +11,26 @@
//
//===----------------------------------------------------------------------===//
-#include "Thumb2InstrInfo.h"
-#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
#include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
using namespace llvm;
@@ -30,10 +40,10 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
cl::init(false));
Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI() {}
+ : ARMBaseInstrInfo(STI) {}
-/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
-void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+/// Return the noop instruction to use for a noop.
+void Thumb2InstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(ARM::tHINT);
NopInst.addOperand(MCOperand::createImm(0));
NopInst.addOperand(MCOperand::createImm(ARMCC::AL));
@@ -117,8 +127,9 @@ void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (!ARM::GPRRegClass.contains(DestReg, SrcReg))
return ARMBaseInstrInfo::copyPhysReg(MBB, I, DL, DestReg, SrcReg, KillSrc);
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc)));
+ BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .add(predOps(ARMCC::AL));
}
void Thumb2InstrInfo::
@@ -138,9 +149,12 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
RC == &ARM::GPRnopcRegClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
return;
}
@@ -156,8 +170,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2STRDi8));
AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
- MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
return;
}
@@ -180,8 +193,11 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass ||
RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass ||
RC == &ARM::GPRnopcRegClass) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
- .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addMemOperand(MMO)
+ .add(predOps(ARMCC::AL));
return;
}
@@ -198,8 +214,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(ARM::t2LDRDi8));
AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
- MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
- AddDefaultPred(MIB);
+ MIB.addFrameIndex(FI).addImm(0).addMemOperand(MMO).add(predOps(ARMCC::AL));
if (TargetRegisterInfo::isPhysicalRegister(DestReg))
MIB.addReg(DestReg, RegState::ImplicitDefine);
@@ -259,10 +274,11 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
if (Fits) {
if (isSub) {
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
- .addReg(BaseReg)
- .addReg(DestReg, RegState::Kill)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg)
+ .addReg(DestReg, RegState::Kill)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
} else {
// Here we know that DestReg is not SP but we do not
// know anything about BaseReg. t2ADDrr is an invalid
@@ -270,10 +286,11 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
// is fine if SP is the first argument. To be sure we
// do not generate invalid encoding, put BaseReg first.
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
- .addReg(BaseReg)
- .addReg(DestReg, RegState::Kill)
- .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(BaseReg)
+ .addReg(DestReg, RegState::Kill)
+ .add(predOps(Pred, PredReg))
+ .add(condCodeOp())
+ .setMIFlags(MIFlags);
}
return;
}
@@ -284,8 +301,10 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
unsigned Opc = 0;
if (DestReg == ARM::SP && BaseReg != ARM::SP) {
// mov sp, rn. Note t2MOVr cannot be used.
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr),DestReg)
- .addReg(BaseReg).setMIFlags(MIFlags));
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+ .addReg(BaseReg)
+ .setMIFlags(MIFlags)
+ .add(predOps(ARMCC::AL));
BaseReg = ARM::SP;
continue;
}
@@ -296,8 +315,11 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
+ BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg)
+ .addImm(ThisVal / 4)
+ .setMIFlags(MIFlags)
+ .add(predOps(ARMCC::AL));
NumBytes = 0;
continue;
}
@@ -334,12 +356,13 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
}
// Build the new ADD / SUB.
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg, RegState::Kill)
- .addImm(ThisVal)).setMIFlags(MIFlags);
+ MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg, RegState::Kill)
+ .addImm(ThisVal)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
if (HasCCOut)
- AddDefaultCC(MIB);
+ MIB.add(condCodeOp());
BaseReg = DestReg;
}
@@ -474,7 +497,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
do MI.RemoveOperand(FrameRegIdx+1);
while (MI.getNumOperands() > FrameRegIdx+1);
MachineInstrBuilder MIB(*MI.getParent()->getParent(), &MI);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
return true;
}
@@ -526,9 +549,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
// Add cc_out operand if the original instruction did not have one.
if (!HasCCOut)
MI.addOperand(MachineOperand::CreateReg(0, false));
-
} else {
-
// AddrMode4 and AddrMode6 cannot handle any offset.
if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
return false;
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 15d6330..c834ba7 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -26,8 +26,8 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo {
public:
explicit Thumb2InstrInfo(const ARMSubtarget &STI);
- /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
- void getNoopForMachoTarget(MCInst &NopInst) const override;
+ /// Return the noop instruction to use for a noop.
+ void getNoop(MCInst &NopInst) const override;
// Return the non-pre/post incrementing version of 'Opc'. Return 0
  // if there is no such opcode.
diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index 8208e7e..d911dd9 100644
--- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -10,20 +10,38 @@
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
-#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h" // To access Function attributes
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "t2-reduce-size"
@@ -40,6 +58,7 @@ static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
cl::init(-1), cl::Hidden);
namespace {
+
/// ReduceTable - A static table with information on mapping from wide
  /// opcodes to narrow ones.
struct ReduceEntry {
@@ -139,11 +158,12 @@ namespace {
class Thumb2SizeReduce : public MachineFunctionPass {
public:
static char ID;
- Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
const Thumb2InstrInfo *TII;
const ARMSubtarget *STI;
+ Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
+
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -201,19 +221,21 @@ namespace {
struct MBBInfo {
// The flags leaving this block have high latency.
- bool HighLatencyCPSR;
+ bool HighLatencyCPSR = false;
// Has this block been visited yet?
- bool Visited;
+ bool Visited = false;
- MBBInfo() : HighLatencyCPSR(false), Visited(false) {}
+ MBBInfo() = default;
};
SmallVector<MBBInfo, 8> BlockInfo;
std::function<bool(const Function &)> PredicateFtor;
};
+
char Thumb2SizeReduce::ID = 0;
-}
+
+} // end anonymous namespace
Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
: MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
@@ -490,14 +512,13 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
isLdStMul = true;
break;
}
- case ARM::t2STMIA: {
+ case ARM::t2STMIA:
// If the base register is killed, we don't care what its value is after the
// instruction, so we can use an updating STMIA.
if (!MI->getOperand(0).isKill())
return false;
break;
- }
case ARM::t2LDMIA_RET: {
unsigned BaseReg = MI->getOperand(1).getReg();
if (BaseReg != ARM::SP)
@@ -562,8 +583,8 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead);
if (!isLdStMul) {
- MIB.addOperand(MI->getOperand(0));
- MIB.addOperand(MI->getOperand(1));
+ MIB.add(MI->getOperand(0));
+ MIB.add(MI->getOperand(1));
if (HasImmOffset)
MIB.addImm(OffsetImm / Scale);
@@ -577,7 +598,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
// Transfer the rest of operands.
for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
- MIB.addOperand(MI->getOperand(OpNum));
+ MIB.add(MI->getOperand(OpNum));
// Transfer memoperands.
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -621,12 +642,13 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
MI->getOperand(MCID.getNumOperands()-1).getReg() == ARM::CPSR)
return false;
- MachineInstrBuilder MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
- TII->get(ARM::tADDrSPi))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
- AddDefaultPred(MIB);
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, MI->getDebugLoc(),
+ TII->get(ARM::tADDrSPi))
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(Imm / 4) // The tADDrSPi has an implied scale by four.
+ .add(predOps(ARMCC::AL));
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
@@ -652,11 +674,10 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
switch (Opc) {
default: break;
- case ARM::t2ADDSri: {
+ case ARM::t2ADDSri:
if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, IsSelfLoop))
return true;
LLVM_FALLTHROUGH;
- }
case ARM::t2ADDSrr:
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
@@ -698,7 +719,6 @@ bool
Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
const ReduceEntry &Entry,
bool LiveCPSR, bool IsSelfLoop) {
-
if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
return false;
@@ -785,13 +805,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Add the 16-bit instruction.
DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
- MIB.addOperand(MI->getOperand(0));
- if (NewMCID.hasOptionalDef()) {
- if (HasCC)
- AddDefaultT1CC(MIB, CCDead);
- else
- AddNoT1CC(MIB);
- }
+ MIB.add(MI->getOperand(0));
+ if (NewMCID.hasOptionalDef())
+ MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
// Transfer the rest of operands.
unsigned NumOps = MCID.getNumOperands();
@@ -800,7 +816,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
continue;
if (SkipPred && MCID.OpInfo[i].isPredicate())
continue;
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
}
// Transfer MI flags.
@@ -880,13 +896,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Add the 16-bit instruction.
DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, NewMCID);
- MIB.addOperand(MI->getOperand(0));
- if (NewMCID.hasOptionalDef()) {
- if (HasCC)
- AddDefaultT1CC(MIB, CCDead);
- else
- AddNoT1CC(MIB);
- }
+ MIB.add(MI->getOperand(0));
+ if (NewMCID.hasOptionalDef())
+ MIB.add(HasCC ? t1CondCodeOp(CCDead) : condCodeOp());
// Transfer the rest of operands.
unsigned NumOps = MCID.getNumOperands();
@@ -909,10 +921,10 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Skip implicit def of CPSR. Either it's modeled as an optional
// def now or it's already an implicit def on the new instruction.
continue;
- MIB.addOperand(MO);
+ MIB.add(MO);
}
if (!MCID.isPredicable() && NewMCID.isPredicable())
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
diff --git a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index 2efd63b..15a5675 100644
--- a/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -93,9 +93,10 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
- .addReg(DestReg, getDefRegState(true), SubIdx)
- .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0)
- .setMIFlags(MIFlags);
+ .addReg(DestReg, getDefRegState(true), SubIdx)
+ .addConstantPoolIndex(Idx)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
}
/// emitLoadConstPool - Emits a load from constpool to materialize the
@@ -145,14 +146,17 @@ static void emitThumbRegPlusImmInReg(
LdReg = MF.getRegInfo().createVirtualRegister(&ARM::tGPRRegClass);
if (NumBytes <= 255 && NumBytes >= 0 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)
+ .add(t1CondCodeOp())
.addImm(NumBytes)
.setMIFlags(MIFlags);
} else if (NumBytes < 0 && NumBytes >= -255 && CanChangeCC) {
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg)
+ .add(t1CondCodeOp())
.addImm(NumBytes)
.setMIFlags(MIFlags);
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+ BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg)
+ .add(t1CondCodeOp())
.addReg(LdReg, RegState::Kill)
.setMIFlags(MIFlags);
} else if (ST.genExecuteOnly()) {
@@ -167,12 +171,12 @@ static void emitThumbRegPlusImmInReg(
: ((isHigh || !CanChangeCC) ? ARM::tADDhirr : ARM::tADDrr);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
if (Opc != ARM::tADDhirr)
- MIB = AddDefaultT1CC(MIB);
+ MIB = MIB.add(t1CondCodeOp());
if (DestReg == ARM::SP || isSub)
MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
else
MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
@@ -307,12 +311,12 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(CopyOpc), DestReg);
if (CopyNeedsCC)
- MIB = AddDefaultT1CC(MIB);
+ MIB = MIB.add(t1CondCodeOp());
MIB.addReg(BaseReg, RegState::Kill);
if (CopyOpc != ARM::tMOVr) {
MIB.addImm(CopyImm);
}
- AddDefaultPred(MIB.setMIFlags(MIFlags));
+ MIB.setMIFlags(MIFlags).add(predOps(ARMCC::AL));
BaseReg = DestReg;
}
@@ -324,10 +328,11 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(ExtraOpc), DestReg);
if (ExtraNeedsCC)
- MIB = AddDefaultT1CC(MIB);
- MIB.addReg(BaseReg).addImm(ExtraImm);
- MIB = AddDefaultPred(MIB);
- MIB.setMIFlags(MIFlags);
+ MIB = MIB.add(t1CondCodeOp());
+ MIB.addReg(BaseReg)
+ .addImm(ExtraImm)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
}
}
@@ -460,9 +465,10 @@ bool ThumbRegisterInfo::saveScavengerRegister(
// a call clobbered register that we know won't be used in Thumb1 mode.
const TargetInstrInfo &TII = *STI.getInstrInfo();
DebugLoc DL;
- AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
- .addReg(ARM::R12, RegState::Define)
- .addReg(Reg, RegState::Kill));
+ BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
+ .addReg(ARM::R12, RegState::Define)
+ .addReg(Reg, RegState::Kill)
+ .add(predOps(ARMCC::AL));
// The UseMI is where we would like to restore the register. If there's
// interference with R12 before then, however, we'll need to restore it
@@ -490,8 +496,10 @@ bool ThumbRegisterInfo::saveScavengerRegister(
}
}
// Restore the register from R12
- AddDefaultPred(BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr)).
- addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill));
+ BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVr))
+ .addReg(Reg, RegState::Define)
+ .addReg(ARM::R12, RegState::Kill)
+ .add(predOps(ARMCC::AL));
return true;
}
@@ -621,5 +629,5 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Add predicate back if it's needed.
if (MI.isPredicable())
- AddDefaultPred(MIB);
+ MIB.add(predOps(ARMCC::AL));
}
diff --git a/contrib/llvm/lib/Target/AVR/AVR.h b/contrib/llvm/lib/Target/AVR/AVR.h
index 8e5cc53..5eadf7b 100644
--- a/contrib/llvm/lib/Target/AVR/AVR.h
+++ b/contrib/llvm/lib/Target/AVR/AVR.h
@@ -15,8 +15,8 @@
#ifndef LLVM_AVR_H
#define LLVM_AVR_H
-#include "llvm/Target/TargetMachine.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 4afdd3a..c058c9e 100644
--- a/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -18,8 +18,8 @@
#include "InstPrinter/AVRInstPrinter.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
@@ -112,7 +112,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
const AVRSubtarget &STI = MF->getSubtarget<AVRSubtarget>();
const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
- unsigned BytesPerReg = TRI.getMinimalPhysRegClass(Reg)->getSize();
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+ unsigned BytesPerReg = TRI.getRegSizeInBits(*RC) / 8;
assert(BytesPerReg <= 2 && "Only 8 and 16 bit regs are supported.");
unsigned RegIdx = ByteNumber / BytesPerReg;
@@ -130,7 +131,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
}
}
- printOperand(MI, OpNum, O);
+ if (Error)
+ printOperand(MI, OpNum, O);
return false;
}
@@ -147,7 +149,10 @@ bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
(void)MO;
assert(MO.isReg() && "Unexpected inline asm memory operand");
- // TODO: We can look up the alternative name for the register if it's given.
+ // TODO: We should be able to look up the alternative name for
+ // the register if it's given.
+ // TableGen doesn't expose a way of retrieving names
+ // for registers.
if (MI->getOperand(OpNum).getReg() == AVR::R31R30) {
O << "Z";
} else {
diff --git a/contrib/llvm/lib/Target/AVR/AVRDevices.td b/contrib/llvm/lib/Target/AVR/AVRDevices.td
index 9224af6..62def45 100644
--- a/contrib/llvm/lib/Target/AVR/AVRDevices.td
+++ b/contrib/llvm/lib/Target/AVR/AVRDevices.td
@@ -6,7 +6,6 @@
// :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD.
// In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`.
// avr2 (with SRAM) adds the rest of the variants.
-// :TODO: s/AVRTiny/Tiny
// A feature set aggregates features, grouping them. We don't want to create a
@@ -136,7 +135,7 @@ def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">;
def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">;
def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">;
def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">;
-def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">;
+def ELFArchTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">;
def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">;
def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">;
def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">;
@@ -189,7 +188,7 @@ def FamilyAVR51 : Family<"avr51",
def FamilyAVR6 : Family<"avr6",
[FamilyAVR51]>;
-def FamilyAVRTiny : Family<"avrtiny",
+def FamilyTiny : Family<"avrtiny",
[FamilyAVR0, FeatureBREAK, FeatureSRAM,
FeatureTinyEncoding]>;
@@ -240,7 +239,7 @@ def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>;
def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>;
def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>;
def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>;
-def : Device<"avrtiny", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"avrtiny", FamilyTiny, ELFArchTiny>;
// Specific MCUs
def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>;
@@ -480,12 +479,12 @@ def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>;
def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>;
def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>;
def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>;
-def : Device<"attiny4", FamilyAVRTiny, ELFArchAVRTiny>;
-def : Device<"attiny5", FamilyAVRTiny, ELFArchAVRTiny>;
-def : Device<"attiny9", FamilyAVRTiny, ELFArchAVRTiny>;
-def : Device<"attiny10", FamilyAVRTiny, ELFArchAVRTiny>;
-def : Device<"attiny20", FamilyAVRTiny, ELFArchAVRTiny>;
-def : Device<"attiny40", FamilyAVRTiny, ELFArchAVRTiny>;
-def : Device<"attiny102", FamilyAVRTiny, ELFArchAVRTiny>;
-def : Device<"attiny104", FamilyAVRTiny, ELFArchAVRTiny>;
+def : Device<"attiny4", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny5", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny9", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny10", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny20", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny40", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny102", FamilyTiny, ELFArchTiny>;
+def : Device<"attiny104", FamilyTiny, ELFArchTiny>;
diff --git a/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index 1b2f2ce..540e05a 100644
--- a/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -88,6 +88,9 @@ private:
unsigned ArithOpcode,
Block &MBB,
BlockIt MBBI);
+
+ /// Scavenges a free GPR8 register for use.
+ unsigned scavengeGPR8(MachineInstr &MI);
};
char AVRExpandPseudo::ID = 0;
@@ -509,8 +512,8 @@ bool AVRExpandPseudo::expand<AVR::LDIWRdK>(Block &MBB, BlockIt MBBI) {
const BlockAddress *BA = MI.getOperand(1).getBlockAddress();
unsigned TF = MI.getOperand(1).getTargetFlags();
- MIBLO.addOperand(MachineOperand::CreateBA(BA, TF | AVRII::MO_LO));
- MIBHI.addOperand(MachineOperand::CreateBA(BA, TF | AVRII::MO_HI));
+ MIBLO.add(MachineOperand::CreateBA(BA, TF | AVRII::MO_LO));
+ MIBHI.add(MachineOperand::CreateBA(BA, TF | AVRII::MO_HI));
break;
}
case MachineOperand::MO_Immediate: {
@@ -577,24 +580,43 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned OpLo, OpHi, DstLoReg, DstHiReg;
unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned TmpReg = 0; // 0 for no temporary register
unsigned SrcReg = MI.getOperand(1).getReg();
- bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
OpLo = AVR::LDRdPtr;
OpHi = AVR::LDDRdPtrQ;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
- assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
+ // Use a temporary register if src and dst registers are the same.
+ if (DstReg == SrcReg)
+ TmpReg = scavengeGPR8(MI);
+
+ unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
+ unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
+ // Load low byte.
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(CurDstLoReg, RegState::Define)
.addReg(SrcReg);
+ // Push low byte onto stack if necessary.
+ if (TmpReg)
+ buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg);
+
+ // Load high byte.
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(CurDstHiReg, RegState::Define)
.addReg(SrcReg, getKillRegState(SrcIsKill))
.addImm(1);
+ if (TmpReg) {
+ // Move the high byte into the final destination.
+ buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg);
+
+ // Move the low byte from the scratch space into the final destination.
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg);
+ }
+
MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MIBHI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -669,9 +691,9 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned OpLo, OpHi, DstLoReg, DstHiReg;
unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned TmpReg = 0; // 0 for no temporary register
unsigned SrcReg = MI.getOperand(1).getReg();
unsigned Imm = MI.getOperand(2).getImm();
- bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
OpLo = AVR::LDDRdPtrQ;
OpHi = AVR::LDDRdPtrQ;
@@ -679,60 +701,35 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
assert(Imm <= 63 && "Offset is out of range");
- MachineInstr *MIBLO, *MIBHI;
-
- // HACK: We shouldn't have instances of this instruction
- // where src==dest because the instruction itself is
- // marked earlyclobber. We do however get this instruction when
- // loading from stack slots where the earlyclobber isn't useful.
- //
- // In this case, just use a temporary register.
- if (DstReg == SrcReg) {
- RegScavenger RS;
-
- RS.enterBasicBlock(MBB);
- RS.forward(MBBI);
-
- BitVector Candidates =
- TRI->getAllocatableSet
- (*MBB.getParent(), &AVR::GPR8RegClass);
-
- // Exclude all the registers being used by the instruction.
- for (MachineOperand &MO : MI.operands()) {
- if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- Candidates.reset(MO.getReg());
- }
-
- BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass);
- Available &= Candidates;
+ // Use a temporary register if src and dst registers are the same.
+ if (DstReg == SrcReg)
+ TmpReg = scavengeGPR8(MI);
- signed TmpReg = Available.find_first();
- assert(TmpReg != -1 && "ran out of registers");
+ unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
+ unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
- MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(TmpReg, RegState::Define)
- .addReg(SrcReg)
- .addImm(Imm);
+ // Load low byte.
+ auto MIBLO = buildMI(MBB, MBBI, OpLo)
+ .addReg(CurDstLoReg, RegState::Define)
+ .addReg(SrcReg)
+ .addImm(Imm);
- buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstLoReg).addReg(TmpReg);
+ // Push low byte onto stack if necessary.
+ if (TmpReg)
+ buildMI(MBB, MBBI, AVR::PUSHRr).addReg(TmpReg);
- MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(TmpReg, RegState::Define)
- .addReg(SrcReg, getKillRegState(SrcIsKill))
- .addImm(Imm + 1);
+ // Load high byte.
+ auto MIBHI = buildMI(MBB, MBBI, OpHi)
+ .addReg(CurDstHiReg, RegState::Define)
+ .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addImm(Imm + 1);
+ if (TmpReg) {
+ // Move the high byte into the final destination.
buildMI(MBB, MBBI, AVR::MOVRdRr).addReg(DstHiReg).addReg(TmpReg);
- } else {
- MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg)
- .addImm(Imm);
- MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addReg(SrcReg, getKillRegState(SrcIsKill))
- .addImm(Imm + 1);
+ // Move the low byte from the scratch space into the final destination.
+ buildMI(MBB, MBBI, AVR::POPRd).addReg(DstLoReg);
}
MIBLO->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -785,9 +782,8 @@ bool AVRExpandPseudo::expandAtomicBinaryOp(unsigned Opcode,
auto Op1 = MI.getOperand(0);
auto Op2 = MI.getOperand(1);
- MachineInstr &NewInst = *buildMI(MBB, MBBI, Opcode)
- .addOperand(Op1).addOperand(Op2)
- .getInstr();
+ MachineInstr &NewInst =
+ *buildMI(MBB, MBBI, Opcode).add(Op1).add(Op2).getInstr();
f(NewInst);
});
}
@@ -810,18 +806,42 @@ bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width,
unsigned StoreOpcode = (Width == 8) ? AVR::STPtrRr : AVR::STWPtrRr;
// Create the load
- buildMI(MBB, MBBI, LoadOpcode).addOperand(Op1).addOperand(Op2);
+ buildMI(MBB, MBBI, LoadOpcode).add(Op1).add(Op2);
// Create the arithmetic op
- buildMI(MBB, MBBI, ArithOpcode)
- .addOperand(Op1).addOperand(Op1)
- .addOperand(Op2);
+ buildMI(MBB, MBBI, ArithOpcode).add(Op1).add(Op1).add(Op2);
// Create the store
- buildMI(MBB, MBBI, StoreOpcode).addOperand(Op2).addOperand(Op1);
+ buildMI(MBB, MBBI, StoreOpcode).add(Op2).add(Op1);
});
}
+unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ RegScavenger RS;
+
+ RS.enterBasicBlock(MBB);
+ RS.forward(MI);
+
+ BitVector Candidates =
+ TRI->getAllocatableSet
+ (*MBB.getParent(), &AVR::GPR8RegClass);
+
+ // Exclude all the registers being used by the instruction.
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ Candidates.reset(MO.getReg());
+ }
+
+ BitVector Available = RS.getRegsAvailable(&AVR::GPR8RegClass);
+ Available &= Candidates;
+
+ signed Reg = Available.find_first();
+ assert(Reg != -1 && "ran out of registers");
+ return Reg;
+}
+
template<>
bool AVRExpandPseudo::expand<AVR::AtomicLoad8>(Block &MBB, BlockIt MBBI) {
return expandAtomicBinaryOp(AVR::LDRdPtr, MBB, MBBI);
@@ -951,7 +971,6 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) {
unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
unsigned DstReg = MI.getOperand(0).getReg();
unsigned SrcReg = MI.getOperand(1).getReg();
- bool DstIsKill = MI.getOperand(0).isKill();
bool SrcIsKill = MI.getOperand(1).isKill();
OpLo = AVR::STPtrRr;
OpHi = AVR::STDPtrQRr;
@@ -963,7 +982,7 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) {
.addReg(SrcLoReg, getKillRegState(SrcIsKill));
auto MIBHI = buildMI(MBB, MBBI, OpHi)
- .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg)
.addImm(1)
.addReg(SrcHiReg, getKillRegState(SrcIsKill));
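To make the LDWRdPtr/LDDWRdPtrQ rewrite above concrete: when the destination pair aliases the pointer register, the expansion now parks the low byte on the stack instead of asserting. An illustrative expansion for the 16-bit load pseudo through Z, with the scavenged scratch register written as Rt (scavengeGPR8 picks the concrete register):

// LDWRdPtr (pseudo 16-bit load) with DstReg == SrcReg == Z (r31:r30):
//
//   ld   Rt, Z        ; low byte into the scratch register
//   push Rt           ; park it, since Rt is reused for the high byte
//   ldd  Rt, Z+1      ; high byte into the scratch register
//   mov  r31, Rt      ; move the high byte into the destination
//   pop  r30          ; restore the low byte from the stack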
diff --git a/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp
index b8cb221..0ec8e8b 100644
--- a/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -57,6 +57,7 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
DebugLoc DL = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
const AVRInstrInfo &TII = *STI.getInstrInfo();
+ bool HasFP = hasFP(MF);
// Interrupt handlers re-enable interrupts in function entry.
if (CallConv == CallingConv::AVR_INTR) {
@@ -65,6 +66,13 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ // Save the frame pointer if we have one.
+ if (HasFP) {
+ BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHWRr))
+ .addReg(AVR::R29R28, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
// Emit special prologue code to save R1, R0 and SREG in interrupt/signal
// handlers before saving any other registers.
if (CallConv == CallingConv::AVR_INTR ||
@@ -72,6 +80,7 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHWRr))
.addReg(AVR::R1R0, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
+
BuildMI(MBB, MBBI, DL, TII.get(AVR::INRdA), AVR::R0)
.addImm(0x3f)
.setMIFlag(MachineInstr::FrameSetup);
@@ -86,7 +95,7 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
}
// Early exit if the frame pointer is not needed in this function.
- if (!hasFP(MF)) {
+ if (!HasFP) {
return;
}
@@ -165,6 +174,9 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0);
}
+ if (hasFP(MF))
+ BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R29R28);
+
// Early exit if there is no need to restore the frame pointer.
if (!FrameSize) {
return;
@@ -239,7 +251,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters(
unsigned Reg = CSI[i - 1].getReg();
bool IsNotLiveIn = !MBB.isLiveIn(Reg);
- assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 &&
+ assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 &&
"Invalid register size");
// Add the callee-saved register as live-in only if it is not already a
@@ -277,7 +289,7 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters(
for (const CalleeSavedInfo &CCSI : CSI) {
unsigned Reg = CCSI.getReg();
- assert(TRI->getMinimalPhysRegClass(Reg)->getSize() == 1 &&
+ assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 &&
"Invalid register size");
BuildMI(MBB, MI, DL, TII.get(AVR::POPRd), Reg);
@@ -363,7 +375,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
DebugLoc DL = MI->getDebugLoc();
unsigned int Opcode = MI->getOpcode();
- int Amount = MI->getOperand(0).getImm();
+ int Amount = TII.getFrameSize(*MI);
// Adjcallstackup does not need to allocate stack space for the call, instead
// we insert push instructions that will allocate the necessary stack.
@@ -407,12 +419,9 @@ void AVRFrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- // Spill register Y when it is used as the frame pointer.
- if (hasFP(MF)) {
- SavedRegs.set(AVR::R29R28);
- SavedRegs.set(AVR::R29);
- SavedRegs.set(AVR::R28);
- }
+ // If we have a frame pointer, the Y register needs to be saved as well.
+ // We don't do that here, however; the prologue and epilogue generation
+ // code will handle it specially.
}
/// The frame analyzer pass.
///
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 07fc3f6..7d3faac 100644
--- a/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -48,6 +48,8 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
setOperationAction(ISD::BlockAddress, MVT::i16, Custom);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
@@ -77,6 +79,11 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
setOperationAction(ISD::SRA_PARTS, MVT::i16, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::i8, Custom);
+ setOperationAction(ISD::ROTL, MVT::i16, Custom);
+ setOperationAction(ISD::ROTR, MVT::i8, Custom);
+ setOperationAction(ISD::ROTR, MVT::i16, Custom);
+
setOperationAction(ISD::BR_CC, MVT::i8, Custom);
setOperationAction(ISD::BR_CC, MVT::i16, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
@@ -271,6 +278,12 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL:
return DAG.getNode(AVRISD::LSRLOOP, dl, VT, N->getOperand(0),
N->getOperand(1));
+ case ISD::ROTL:
+ return DAG.getNode(AVRISD::ROLLOOP, dl, VT, N->getOperand(0),
+ N->getOperand(1));
+ case ISD::ROTR:
+ return DAG.getNode(AVRISD::RORLOOP, dl, VT, N->getOperand(0),
+ N->getOperand(1));
case ISD::SRA:
return DAG.getNode(AVRISD::ASRLOOP, dl, VT, N->getOperand(0),
N->getOperand(1));
@@ -311,7 +324,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
unsigned Opcode = Op->getOpcode();
assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
"Invalid opcode for Div/Rem lowering");
- bool isSigned = (Opcode == ISD::SDIVREM);
+ bool IsSigned = (Opcode == ISD::SDIVREM);
EVT VT = Op->getValueType(0);
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
@@ -320,16 +333,16 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
default:
llvm_unreachable("Unexpected request for libcall!");
case MVT::i8:
- LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
+ LC = IsSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
break;
case MVT::i16:
- LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
+ LC = IsSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
break;
case MVT::i32:
- LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+ LC = IsSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
break;
case MVT::i64:
- LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
+ LC = IsSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
break;
}
@@ -340,24 +353,24 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
for (SDValue const &Value : Op->op_values()) {
Entry.Node = Value;
Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.isSExt = isSigned;
- Entry.isZExt = !isSigned;
+ Entry.IsSExt = IsSigned;
+ Entry.IsZExt = !IsSigned;
Args.push_back(Entry);
}
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
getPointerTy(DAG.getDataLayout()));
- Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr);
+ Type *RetTy = (Type *)StructType::get(Ty, Ty);
SDLoc dl(Op);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(InChain)
- .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
.setInRegister()
- .setSExtResult(isSigned)
- .setZExtResult(!isSigned);
+ .setSExtResult(IsSigned)
+ .setZExtResult(!IsSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return CallInfo.first;
@@ -932,6 +945,12 @@ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
bool UsesStack = false;
for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
unsigned Size = Args[i];
+
+ // If we have a zero-sized argument, don't attempt to lower it.
+ // AVR-GCC does not support zero-sized arguments and so we need not
+ // worry about ABI compatibility.
+ if (Size == 0) continue;
+
MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
// If we have plenty of regs to pass the whole argument do it.
@@ -1147,8 +1166,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
@@ -1373,7 +1391,7 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Don't emit the ret/reti instruction when the naked attribute is present in
// the function being compiled.
if (MF.getFunction()->getAttributes().hasAttribute(
- AttributeSet::FunctionIndex, Attribute::Naked)) {
+ AttributeList::FunctionIndex, Attribute::Naked)) {
return Chain;
}
@@ -1432,6 +1450,22 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
Opc = AVR::LSRWRd;
RC = &AVR::DREGSRegClass;
break;
+ case AVR::Rol8:
+ Opc = AVR::ROLRd;
+ RC = &AVR::GPR8RegClass;
+ break;
+ case AVR::Rol16:
+ Opc = AVR::ROLWRd;
+ RC = &AVR::DREGSRegClass;
+ break;
+ case AVR::Ror8:
+ Opc = AVR::RORRd;
+ RC = &AVR::GPR8RegClass;
+ break;
+ case AVR::Ror16:
+ Opc = AVR::RORWRd;
+ RC = &AVR::DREGSRegClass;
+ break;
}
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -1466,9 +1500,9 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
unsigned DstReg = MI.getOperand(0).getReg();
// BB:
- // cp 0, N
+ // cpi N, 0
// breq RemBB
- BuildMI(BB, dl, TII.get(AVR::CPRdRr)).addReg(ShiftAmtSrcReg).addReg(AVR::R0);
+ BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0);
BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);
// LoopBB:
@@ -1544,6 +1578,10 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AVR::Lsl16:
case AVR::Lsr8:
case AVR::Lsr16:
+ case AVR::Rol8:
+ case AVR::Rol16:
+ case AVR::Ror8:
+ case AVR::Ror16:
case AVR::Asr8:
case AVR::Asr16:
return insertShift(MI, MBB);
@@ -1572,8 +1610,9 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineFunction::iterator I = MBB->getParent()->begin();
- ++I;
+ MachineFunction::iterator I;
+ for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I);
+ if (I != MF->end()) ++I;
MF->insert(I, trueMBB);
MF->insert(I, falseMBB);
@@ -1975,4 +2014,3 @@ unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
}
} // end of namespace llvm
-
diff --git a/contrib/llvm/lib/Target/AVR/AVRISelLowering.h b/contrib/llvm/lib/Target/AVR/AVRISelLowering.h
index a8cdc4e..b44c62a 100644
--- a/contrib/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/contrib/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -43,6 +43,8 @@ enum NodeType {
ROL, ///< Bit rotate left.
LSLLOOP, ///< A loop of single logical shift left instructions.
LSRLOOP, ///< A loop of single logical shift right instructions.
+ ROLLOOP, ///< A loop of single left bit rotate instructions.
+ RORLOOP, ///< A loop of single right bit rotate instructions.
ASRLOOP, ///< A loop of single arithmetic shift right instructions.
/// AVR conditional branches. Operand 0 is the chain operand, operand 1
/// is the block to branch if condition is true, operand 2 is the
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index 88f8892..744aa72 100644
--- a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -142,9 +142,9 @@ void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MFI.getObjectAlignment(FrameIndex));
unsigned Opcode = 0;
- if (RC->hasType(MVT::i8)) {
+ if (TRI->isTypeLegalForClass(*RC, MVT::i8)) {
Opcode = AVR::STDPtrQRr;
- } else if (RC->hasType(MVT::i16)) {
+ } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) {
Opcode = AVR::STDWPtrQRr;
} else {
llvm_unreachable("Cannot store this register into a stack slot!");
@@ -176,9 +176,9 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MFI.getObjectAlignment(FrameIndex));
unsigned Opcode = 0;
- if (RC->hasType(MVT::i8)) {
+ if (TRI->isTypeLegalForClass(*RC, MVT::i8)) {
Opcode = AVR::LDDRdPtrQ;
- } else if (RC->hasType(MVT::i16)) {
+ } else if (TRI->isTypeLegalForClass(*RC, MVT::i16)) {
// Opcode = AVR::LDDWRdPtrQ;
//:FIXME: remove this once PR13375 gets fixed
Opcode = AVR::LDDWRdYQ;
@@ -402,7 +402,7 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded) const {
- assert(!BytesAdded && "code size not handled");
+ if (BytesAdded) *BytesAdded = 0;
// Shouldn't be a fall through.
assert(TBB && "insertBranch must not be told to insert a fallthrough");
@@ -411,19 +411,24 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
if (Cond.empty()) {
assert(!FBB && "Unconditional branch with multiple successors!");
- BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(TBB);
+ auto &MI = *BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
return 1;
}
// Conditional branch.
unsigned Count = 0;
AVRCC::CondCodes CC = (AVRCC::CondCodes)Cond[0].getImm();
- BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB);
+ auto &CondMI = *BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB);
+
+ if (BytesAdded) *BytesAdded += getInstSizeInBytes(CondMI);
++Count;
if (FBB) {
// Two-way Conditional branch. Insert the second branch.
- BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB);
+ auto &MI = *BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB);
+ if (BytesAdded) *BytesAdded += getInstSizeInBytes(MI);
++Count;
}
@@ -432,7 +437,7 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB,
unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
- assert(!BytesRemoved && "code size not handled");
+ if (BytesRemoved) *BytesRemoved = 0;
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
@@ -450,6 +455,7 @@ unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB,
}
// Remove the branch.
+ if (BytesRemoved) *BytesRemoved += getInstSizeInBytes(*I);
I->eraseFromParent();
I = MBB.end();
++Count;
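These insertBranch/removeBranch hunks replace the old assert(!BytesAdded && ...) escape hatch with real bookkeeping: zero the optional out-parameter first, then accumulate getInstSizeInBytes() for every branch emitted or erased. That is the contract the branch relaxation pass relies on to track block offsets. A toy model of the out-parameter protocol, with made-up sizes standing in for getInstSizeInBytes():

    #include <cassert>
    #include <vector>

    // Returns the number of "instructions" emitted; if the caller
    // passed a counter, report the total byte size through it.
    unsigned emitBranches(const std::vector<unsigned> &InstSizes,
                          int *BytesAdded) {
      if (BytesAdded)
        *BytesAdded = 0; // zero first, as insertBranch now does
      unsigned Count = 0;
      for (unsigned Size : InstSizes) {
        if (BytesAdded)
          *BytesAdded += Size; // accumulate per instruction
        ++Count;
      }
      return Count;
    }

    int main() {
      int Bytes = -1;
      assert(emitBranches({2, 4}, &Bytes) == 2 && Bytes == 6);
      assert(emitBranches({2}, nullptr) == 1); // callers may pass null
    }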
@@ -494,5 +500,61 @@ unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
}
}
+MachineBasicBlock *
+AVRInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("unexpected opcode!");
+ case AVR::JMPk:
+ case AVR::CALLk:
+ case AVR::RCALLk:
+ case AVR::RJMPk:
+ case AVR::BREQk:
+ case AVR::BRNEk:
+ case AVR::BRSHk:
+ case AVR::BRLOk:
+ case AVR::BRMIk:
+ case AVR::BRPLk:
+ case AVR::BRGEk:
+ case AVR::BRLTk:
+ return MI.getOperand(0).getMBB();
+ case AVR::BRBSsk:
+ case AVR::BRBCsk:
+ return MI.getOperand(1).getMBB();
+ case AVR::SBRCRrB:
+ case AVR::SBRSRrB:
+ case AVR::SBICAb:
+ case AVR::SBISAb:
+ llvm_unreachable("unimplemented branch instructions");
+ }
+}
+
+bool AVRInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
+ int64_t BrOffset) const {
+
+ switch (BranchOp) {
+ default:
+ llvm_unreachable("unexpected opcode!");
+ case AVR::JMPk:
+ case AVR::CALLk:
+ assert(BrOffset >= 0 && "offset must be absolute address");
+ return isUIntN(16, BrOffset);
+ case AVR::RCALLk:
+ case AVR::RJMPk:
+ return isIntN(13, BrOffset);
+ case AVR::BRBSsk:
+ case AVR::BRBCsk:
+ case AVR::BREQk:
+ case AVR::BRNEk:
+ case AVR::BRSHk:
+ case AVR::BRLOk:
+ case AVR::BRMIk:
+ case AVR::BRPLk:
+ case AVR::BRGEk:
+ case AVR::BRLTk:
+ return isIntN(7, BrOffset);
+ }
+}
+
} // end of namespace llvm
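The intervals accepted by isBranchOffsetInRange follow directly from the bit widths handed to isIntN/isUIntN. A self-contained restatement with the resulting ranges spelled out; whether the hardware counts those offsets in bytes or words is outside this sketch:

    #include <cassert>
    #include <cstdint>

    // Local re-statements of llvm::isIntN / llvm::isUIntN.
    bool fitsIntN(unsigned N, int64_t V) {
      if (N >= 64) return true;
      return V >= -(INT64_C(1) << (N - 1)) && V < (INT64_C(1) << (N - 1));
    }
    bool fitsUIntN(unsigned N, uint64_t V) {
      return N >= 64 || V < (UINT64_C(1) << N);
    }

    int main() {
      // 7-bit signed field (BRcc conditional branches): [-64, 63].
      assert(fitsIntN(7, 63) && !fitsIntN(7, 64) && fitsIntN(7, -64));
      // 13-bit signed field (RJMP/RCALL): [-4096, 4095].
      assert(fitsIntN(13, 4095) && !fitsIntN(13, 4096));
      // 16-bit absolute address (JMP/CALL): [0, 65535].
      assert(fitsUIntN(16, 65535) && !fitsUIntN(16, 65536));
    }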
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h
index c5105da..f42d34f 100644
--- a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -103,6 +103,10 @@ public:
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+ bool isBranchOffsetInRange(unsigned BranchOpc,
+ int64_t BrOffset) const override;
private:
const AVRRegisterInfo RI;
};
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
index bc66379..184e4d5 100644
--- a/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -17,7 +17,7 @@ include "AVRInstrFormats.td"
// AVR Type Profiles
//===----------------------------------------------------------------------===//
-def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
@@ -64,6 +64,8 @@ def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>;
// Pseudo shift nodes for non-constant shift amounts.
def AVRlslLoop : SDNode<"AVRISD::LSLLOOP", SDTIntShiftOp>;
def AVRlsrLoop : SDNode<"AVRISD::LSRLOOP", SDTIntShiftOp>;
+def AVRrolLoop : SDNode<"AVRISD::ROLLOOP", SDTIntShiftOp>;
+def AVRrorLoop : SDNode<"AVRISD::RORLOOP", SDTIntShiftOp>;
def AVRasrLoop : SDNode<"AVRISD::ASRLOOP", SDTIntShiftOp>;
//===----------------------------------------------------------------------===//
@@ -183,33 +185,33 @@ def call_target : Operand<iPTR>
// A 16-bit address (which can lead to an R_AVR_16 relocation).
def imm16 : Operand<i16>
{
- let EncoderMethod = "encodeImm<AVR::fixup_16>";
+ let EncoderMethod = "encodeImm<AVR::fixup_16, 2>";
}
/// A 6-bit immediate used in the ADIW/SBIW instructions.
def imm_arith6 : Operand<i16>
{
- let EncoderMethod = "encodeImm<AVR::fixup_6_adiw>";
+ let EncoderMethod = "encodeImm<AVR::fixup_6_adiw, 0>";
}
/// An 8-bit immediate inside an instruction with the same format
/// as the `LDI` instruction (the `FRdK` format).
def imm_ldi8 : Operand<i8>
{
- let EncoderMethod = "encodeImm<AVR::fixup_ldi>";
+ let EncoderMethod = "encodeImm<AVR::fixup_ldi, 0>";
}
/// A 5-bit port number used in SBIC and friends (the `FIOBIT` format).
def imm_port5 : Operand<i8>
{
- let EncoderMethod = "encodeImm<AVR::fixup_port5>";
+ let EncoderMethod = "encodeImm<AVR::fixup_port5, 0>";
}
/// A 6-bit port number used in the `IN` instruction and friends (the
/// `FIORdA` format.
def imm_port6 : Operand<i8>
{
- let EncoderMethod = "encodeImm<AVR::fixup_port6>";
+ let EncoderMethod = "encodeImm<AVR::fixup_port6, 0>";
}
// Addressing mode pattern reg+imm6
@@ -331,9 +333,9 @@ let Defs = [SP, SREG],
Uses = [SP] in
{
def ADJCALLSTACKDOWN : Pseudo<(outs),
- (ins i16imm:$amt),
+ (ins i16imm:$amt, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(AVRcallseq_start timm:$amt)]>;
+ [(AVRcallseq_start timm:$amt, timm:$amt2)]>;
// R31R30 is used to update SP, since it is a scratch reg and this instruction
// is placed after the function call then R31R30 should be always free.
@@ -694,7 +696,7 @@ Defs = [SREG] in
}
//===----------------------------------------------------------------------===//
-// One's/Two's Compliment
+// One's/Two's Complement
//===----------------------------------------------------------------------===//
let Constraints = "$src = $rd",
Defs = [SREG] in
@@ -900,10 +902,9 @@ let Defs = [SREG] in
// CPI Rd, K
// Compares a register with an 8 bit immediate.
- let Uses = [SREG] in
def CPIRdK : FRdK<0b0011,
(outs),
- (ins GPR8:$rd, imm_ldi8:$k),
+ (ins LD8:$rd, imm_ldi8:$k),
"cpi\t$rd, $k",
[(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>;
}
@@ -1410,17 +1411,11 @@ hasSideEffects = 0 in
def LPMRdZ : FLPMX<0,
0,
(outs GPR8:$dst),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"lpm\t$dst, $z",
[]>,
Requires<[HasLPMX]>;
- def LPMWRdZ : Pseudo<(outs DREGS:$dst),
- (ins ZREGS:$z),
- "lpmw\t$dst, $z",
- []>,
- Requires<[HasLPMX]>;
-
// Load program memory, while postincrementing the Z register.
let mayLoad = 1,
Defs = [R31R30] in
@@ -1428,13 +1423,19 @@ hasSideEffects = 0 in
def LPMRdZPi : FLPMX<0,
1,
(outs GPR8:$dst),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"lpm\t$dst, $z+",
[]>,
Requires<[HasLPMX]>;
+ def LPMWRdZ : Pseudo<(outs DREGS:$dst),
+ (ins ZREG:$z),
+ "lpmw\t$dst, $z",
+ []>,
+ Requires<[HasLPMX]>;
+
def LPMWRdZPi : Pseudo<(outs DREGS:$dst),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"lpmw\t$dst, $z+",
[]>,
Requires<[HasLPMX]>;
@@ -1457,7 +1458,7 @@ hasSideEffects = 0 in
def ELPMRdZ : FLPMX<1,
0,
(outs GPR8:$dst),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"elpm\t$dst, $z",
[]>,
Requires<[HasELPMX]>;
@@ -1466,7 +1467,7 @@ hasSideEffects = 0 in
def ELPMRdZPi : FLPMX<1,
1,
(outs GPR8:$dst),
- (ins ZREGS: $z),
+ (ins ZREG: $z),
"elpm\t$dst, $z+",
[]>,
Requires<[HasELPMX]>;
@@ -1486,7 +1487,7 @@ let Uses = [R1, R0] in
let Defs = [R31R30] in
def SPMZPi : F16<0b1001010111111000,
(outs),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"spm $z+",
[]>,
Requires<[HasSPMX]>;
@@ -1563,28 +1564,28 @@ hasSideEffects = 0 in
// Read-Write-Modify (RMW) instructions.
def XCHZRd : FZRd<0b100,
(outs GPR8:$rd),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"xch\t$z, $rd",
[]>,
Requires<[SupportsRMW]>;
def LASZRd : FZRd<0b101,
(outs GPR8:$rd),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"las\t$z, $rd",
[]>,
Requires<[SupportsRMW]>;
def LACZRd : FZRd<0b110,
(outs GPR8:$rd),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"lac\t$z, $rd",
[]>,
Requires<[SupportsRMW]>;
def LATZRd : FZRd<0b111,
(outs GPR8:$rd),
- (ins ZREGS:$z),
+ (ins ZREG:$z),
"lat\t$z, $rd",
[]>,
Requires<[SupportsRMW]>;
@@ -1718,7 +1719,7 @@ Defs = [SREG] in
(implicit SREG)]>;
// CBR Rd, K
- // Alias for `ANDI Rd, COM(K)` where COM(K) is the compliment of K.
+ // Alias for `ANDI Rd, COM(K)` where COM(K) is the complement of K.
// FIXME: This uses the 'complement' encoder. We need it to also use the
// imm_ldi8 encoder. This will cause no fixups to be created on this instruction.
def CBRRdK : FRdK<0b0111,
@@ -1932,7 +1933,6 @@ def Lsr8 : ShiftPseudo<
[(set i8:$dst, (AVRlsrLoop i8:$src, i8:$cnt))]
>;
-
def Lsr16 : ShiftPseudo<
(outs DREGS:$dst),
(ins DREGS:$src, GPR8:$cnt),
@@ -1940,6 +1940,34 @@ def Lsr16 : ShiftPseudo<
[(set i16:$dst, (AVRlsrLoop i16:$src, i8:$cnt))]
>;
+def Rol8 : ShiftPseudo<
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt),
+ "# Rol8 PSEUDO",
+ [(set i8:$dst, (AVRrolLoop i8:$src, i8:$cnt))]
+>;
+
+def Rol16 : ShiftPseudo<
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt),
+ "# Rol16 PSEUDO",
+ [(set i16:$dst, (AVRrolLoop i16:$src, i8:$cnt))]
+>;
+
+def Ror8 : ShiftPseudo<
+ (outs GPR8:$dst),
+ (ins GPR8:$src, GPR8:$cnt),
+ "# Ror8 PSEUDO",
+ [(set i8:$dst, (AVRrorLoop i8:$src, i8:$cnt))]
+>;
+
+def Ror16 : ShiftPseudo<
+ (outs DREGS:$dst),
+ (ins DREGS:$src, GPR8:$cnt),
+ "# Ror16 PSEUDO",
+ [(set i16:$dst, (AVRrorLoop i16:$src, i8:$cnt))]
+>;
+
def Asr8 : ShiftPseudo<
(outs GPR8:$dst),
(ins GPR8:$src, GPR8:$cnt),
diff --git a/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp b/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp
index 5553dc2..e7fca74 100644
--- a/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRInstrumentFunctions.cpp
@@ -96,7 +96,7 @@ static void BuildSignatureCall(StringRef SymName, BasicBlock &BB, Function &F) {
Value *FunctionName = CreateStringPtr(BB, F.getName());
Value *Args[] = {FunctionName,
- ConstantInt::get(I16, F.getArgumentList().size())};
+ ConstantInt::get(I16, F.arg_size())};
CallInst::Create(Fn, Args, "", &BB);
}
diff --git a/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp
index 342fe55..dfefd09 100644
--- a/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRMCInstLower.cpp
@@ -37,10 +37,22 @@ MCOperand AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
}
+ bool IsFunction = MO.isGlobal() && isa<Function>(MO.getGlobal());
+
if (TF & AVRII::MO_LO) {
- Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_LO8, Expr, IsNegated, Ctx);
+ if (IsFunction) {
+ // N.B. Should we use _GS fixups here to cope with >128k progmem?
+ Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_PM_LO8, Expr, IsNegated, Ctx);
+ } else {
+ Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_LO8, Expr, IsNegated, Ctx);
+ }
} else if (TF & AVRII::MO_HI) {
- Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_HI8, Expr, IsNegated, Ctx);
+ if (IsFunction) {
+ // N.B. Should we use _GS fixups here to cope with >128k progmem?
+ Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_PM_HI8, Expr, IsNegated, Ctx);
+ } else {
+ Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_HI8, Expr, IsNegated, Ctx);
+ }
} else if (TF != 0) {
llvm_unreachable("Unknown target flag on symbol operand");
}
@@ -56,7 +68,7 @@ void AVRMCInstLower::lowerInstruction(const MachineInstr &MI, MCInst &OutMI) con
switch (MO.getType()) {
default:
- MI.dump();
+ MI.print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
index 48798bd..249dc55 100644
--- a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -51,8 +51,6 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- const AVRTargetMachine &TM = static_cast<const AVRTargetMachine&>(MF.getTarget());
- const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering();
// Reserve the intermediate result registers r1 and r2
// The result of instructions like 'mul' is always stored here.
@@ -65,12 +63,18 @@ BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AVR::SPH);
Reserved.set(AVR::SP);
- // Reserve the frame pointer registers r28 and r29 if the function requires one.
- if (TFI->hasFP(MF)) {
- Reserved.set(AVR::R28);
- Reserved.set(AVR::R29);
- Reserved.set(AVR::R29R28);
- }
+ // We tentatively reserve the frame pointer register r29:r28 because the
+ // function may require one, but we cannot tell until register allocation
+ // is complete, which can be too late.
+ //
+ // Instead we just unconditionally reserve the Y register.
+ //
+ // TODO: Write a pass to enumerate functions which reserved the Y register
+ // but didn't end up needing a frame pointer. In these, we can
+ // convert one or two of the spills inside to use the Y register.
+ Reserved.set(AVR::R28);
+ Reserved.set(AVR::R29);
+ Reserved.set(AVR::R29R28);
return Reserved;
}
@@ -78,11 +82,12 @@ BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const TargetRegisterClass *
AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const {
- if (RC->hasType(MVT::i16)) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (TRI->isTypeLegalForClass(*RC, MVT::i16)) {
return &AVR::DREGSRegClass;
}
- if (RC->hasType(MVT::i8)) {
+ if (TRI->isTypeLegalForClass(*RC, MVT::i8)) {
return &AVR::GPR8RegClass;
}
@@ -90,7 +95,8 @@ AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
}
/// Fold a frame offset shared between two add instructions into a single one.
-static void foldFrameOffset(MachineInstr &MI, int &Offset, unsigned DstReg) {
+static void foldFrameOffset(MachineBasicBlock::iterator &II, int &Offset, unsigned DstReg) {
+ MachineInstr &MI = *II;
int Opcode = MI.getOpcode();
// Don't bother trying if the next instruction is not an add or a sub.
@@ -115,6 +121,7 @@ static void foldFrameOffset(MachineInstr &MI, int &Offset, unsigned DstReg) {
}
// Finally remove the instruction.
+ II++;
MI.eraseFromParent();
}
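The signature change here is about iterator invalidation: foldFrameOffset may erase the very instruction the caller's iterator points at, so it now receives the iterator by reference and advances it (II++) before eraseFromParent(), leaving the caller with a valid position. The same advance-then-erase idiom on a plain std::list:

    #include <cassert>
    #include <iterator>
    #include <list>

    // Erase the element II points at without invalidating II for the
    // caller: advance first, then erase the saved position.
    void eraseCurrent(std::list<int> &L, std::list<int>::iterator &II) {
      auto Doomed = II;
      ++II;            // advance first, as the patch does with II++
      L.erase(Doomed); // II still refers to a live node
    }

    int main() {
      std::list<int> L{1, 2, 3};
      auto I = std::next(L.begin()); // points at 2
      eraseCurrent(L, I);
      assert(*I == 3 && L.size() == 2);
    }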
@@ -153,6 +160,8 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned DstReg = MI.getOperand(0).getReg();
assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer");
+ II++; // Skip over the FRMIDX (and now MOVW) instruction.
+
// Generally, to load a frame address two add instructions are emitted that
// could get folded into a single one:
// movw r31:r30, r29:r28
@@ -161,7 +170,8 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// to:
// movw r31:r30, r29:r28
// adiw r31:r30, 45
- foldFrameOffset(*std::next(II), Offset, DstReg);
+ if (II != MBB.end())
+ foldFrameOffset(II, Offset, DstReg);
// Select the best opcode based on DstReg and the offset size.
switch (DstReg) {
@@ -182,7 +192,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
- MachineInstr *New = BuildMI(MBB, std::next(II), dl, TII.get(Opcode), DstReg)
+ MachineInstr *New = BuildMI(MBB, II, dl, TII.get(Opcode), DstReg)
.addReg(DstReg, RegState::Kill)
.addImm(Offset);
New->getOperand(3).setIsDead();
@@ -263,4 +273,3 @@ void AVRRegisterInfo::splitReg(unsigned Reg,
}
} // end of namespace llvm
-
diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td
index 32650fc..8162f12 100644
--- a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -110,8 +110,6 @@ CoveredBySubRegs = 1 in
// Register Classes
//===----------------------------------------------------------------------===//
-//:TODO: use proper set instructions instead of using always "add"
-
// Main 8-bit register class.
def GPR8 : RegisterClass<"AVR", [i8], 8,
(
@@ -199,14 +197,11 @@ def PTRDISPREGS : RegisterClass<"AVR", [i16], 8,
// We have a bunch of instructions with an explicit Z register argument. We
// model this using a register class containing only the Z register.
-// :TODO: Rename to 'ZREG'.
-def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>;
+def ZREG : RegisterClass<"AVR", [i16], 8, (add R31R30)>;
// Register class used for the stack read pseudo instruction.
def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>;
-//:TODO: if we remove this we get an error in tablegen
-//:TODO: this is just a hack, remove it once add16 works!
// Status register.
def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>;
def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)>
diff --git a/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp b/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp
index c228d05..556d69e 100644
--- a/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRSubtarget.cpp
@@ -13,7 +13,7 @@
#include "AVRSubtarget.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/TargetRegistry.h"
#include "AVR.h"
diff --git a/contrib/llvm/lib/Target/AVR/AVRSubtarget.h b/contrib/llvm/lib/Target/AVR/AVRSubtarget.h
index a37849c..b0e634f 100644
--- a/contrib/llvm/lib/Target/AVR/AVRSubtarget.h
+++ b/contrib/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -14,10 +14,9 @@
#ifndef LLVM_AVR_SUBTARGET_H
#define LLVM_AVR_SUBTARGET_H
-#include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include "AVRFrameLowering.h"
#include "AVRISelLowering.h"
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index fb32629..a9d61ff 100644
--- a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -15,12 +15,12 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/TargetRegistry.h"
-#include "AVRTargetObjectFile.h"
#include "AVR.h"
+#include "AVRTargetObjectFile.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
namespace llvm {
@@ -57,7 +57,7 @@ namespace {
/// AVR Code Generator Pass Configuration Options.
class AVRPassConfig : public TargetPassConfig {
public:
- AVRPassConfig(AVRTargetMachine *TM, PassManagerBase &PM)
+ AVRPassConfig(AVRTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
AVRTargetMachine &getAVRTargetMachine() const {
@@ -66,12 +66,13 @@ public:
bool addInstSelector() override;
void addPreSched2() override;
+ void addPreEmitPass() override;
void addPreRegAlloc() override;
};
} // namespace
TargetPassConfig *AVRTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new AVRPassConfig(this, PM);
+ return new AVRPassConfig(*this, PM);
}
extern "C" void LLVMInitializeAVRTarget() {
@@ -115,4 +116,9 @@ void AVRPassConfig::addPreSched2() {
addPass(createAVRExpandPseudoPass());
}
+void AVRPassConfig::addPreEmitPass() {
+ // Must run branch selection immediately preceding the asm printer.
+ addPass(&BranchRelaxationPassID);
+}
+
} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h
index 1034519..795e94e 100644
--- a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.h
@@ -41,6 +41,10 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
+
private:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
AVRSubtarget SubTarget;
diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
index af14d92..0cebb0f 100644
--- a/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -9,12 +9,12 @@
#include "AVRTargetObjectFile.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/ELF.h"
#include "AVR.h"
diff --git a/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index 5b0398c..5004736 100644
--- a/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/contrib/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -18,12 +18,12 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
@@ -466,6 +466,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands) {
if (!tryParseRegisterOperand(Operands)) {
return false;
}
+ LLVM_FALLTHROUGH;
case AsmToken::LParen:
case AsmToken::Integer:
case AsmToken::Dot:
diff --git a/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index d2a21fb..e69accf 100644
--- a/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/contrib/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -16,11 +16,11 @@
#include "AVRSubtarget.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
index 316b783..0f34b8e 100644
--- a/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp
@@ -106,7 +106,7 @@ void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
bool isPtrReg = (MOI.RegClass == AVR::PTRREGSRegClassID) ||
(MOI.RegClass == AVR::PTRDISPREGSRegClassID) ||
- (MOI.RegClass == AVR::ZREGSRegClassID);
+ (MOI.RegClass == AVR::ZREGRegClassID);
if (isPtrReg) {
O << getRegisterName(Op.getReg(), AVR::ptr);
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 081d8b5..d182983 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -230,13 +230,25 @@ void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
namespace llvm {
// Prepare value for the target space for it
-void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
+void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
+ const MCValue &Target,
+ uint64_t &Value,
MCContext *Ctx) const {
// The size of the fixup in bits.
uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
unsigned Kind = Fixup.getKind();
+ // Parsed LLVM-generated temporary labels are already
+ // adjusted for instruction size, but normal labels aren't.
+ //
+ // To handle both cases, we simply un-adjust the temporary label
+ // case so it acts like all other labels.
+ if (const MCSymbolRefExpr *A = Target.getSymA()) {
+ if (A->getSymbol().isTemporary())
+ Value += 2;
+ }
+
switch (Kind) {
default:
llvm_unreachable("unhandled fixup");
@@ -333,9 +345,10 @@ MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
MCELFObjectTargetWriter::getOSABI(OSType));
}
-void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void AVRAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const {
+ adjustFixupValue(Fixup, Target, Value, &Asm.getContext());
if (Value == 0)
return; // Doesn't change encoding.
@@ -349,7 +362,7 @@ void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
@@ -436,30 +449,16 @@ bool AVRAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void AVRAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup,
- const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+bool AVRAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
switch ((unsigned) Fixup.getKind()) {
+ default: return false;
// Fixups which should always be recorded as relocations.
case AVR::fixup_7_pcrel:
case AVR::fixup_13_pcrel:
case AVR::fixup_call:
- IsResolved = false;
- break;
- default:
- // Parsed LLVM-generated temporary labels are already
- // adjusted for instruction size, but normal labels aren't.
- //
- // To handle both cases, we simply un-adjust the temporary label
- // case so it acts like all other labels.
- if (Target.getSymA()->getSymbol().isTemporary())
- Value += 2;
-
- adjustFixupValue(Fixup, Value, &Asm.getContext());
- break;
+ return true;
}
}
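After this restructuring the responsibilities are cleanly split: applyFixup calls adjustFixupValue itself, and shouldForceRelocation only answers which fixup kinds must always be emitted as relocations (the PC-relative and call fixups). The shift-and-mask step that applyFixup performs afterwards can be sketched standalone; the opcode bytes and values below are invented, and the sketch assumes little-endian byte order:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // OR the resolved fixup value into the bytes it touches,
    // mirroring the "mask in the bits" loop described in the patch.
    void applyFixupBits(std::vector<uint8_t> &Data, unsigned Offset,
                        unsigned NumBytes, uint64_t Value) {
      assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
      for (unsigned i = 0; i != NumBytes; ++i)
        Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
    }

    int main() {
      std::vector<uint8_t> Insn{0x0c, 0x00}; // opcode skeleton
      applyFixupBits(Insn, 0, 2, 0x0123);    // resolved value
      assert(Insn[0] == 0x2f && Insn[1] == 0x01);
    }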
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 7ff4b8f..4a75e3b 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -35,12 +35,13 @@ public:
AVRAsmBackend(Triple::OSType OSType)
: MCAsmBackend(), OSType(OSType) {}
- void adjustFixupValue(const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) const;
+ void adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
+ uint64_t &Value, MCContext *Ctx = nullptr) const;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsPCRel) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -63,10 +64,8 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
private:
Triple::OSType OSType;
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
index 481de32..6d126ed 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -1,6 +1,8 @@
#include "AVRELFStreamer.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/FormattedStream.h"
#include "AVRMCTargetDesc.h"
@@ -31,7 +33,7 @@ static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) {
EFlags |= ELF::EF_AVR_ARCH_AVR51;
else if (Features[AVR::ELFArchAVR6])
EFlags |= ELF::EF_AVR_ARCH_AVR6;
- else if (Features[AVR::ELFArchAVRTiny])
+ else if (Features[AVR::ELFArchTiny])
EFlags |= ELF::EF_AVR_ARCH_AVRTINY;
else if (Features[AVR::ELFArchXMEGA1])
EFlags |= ELF::EF_AVR_ARCH_XMEGA1;
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index cca3bcc..535bb01 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -18,11 +18,12 @@
namespace llvm {
AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) {
- PointerSize = 2;
+ CodePointerSize = 2;
CalleeSaveStackSlotSize = 2;
CommentString = ";";
PrivateGlobalPrefix = ".L";
UsesELFSectionDirectiveForBSS = true;
+ UseIntegratedAssembler = true;
}
} // end of namespace llvm
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index e6dc886..4dbbce8 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -25,6 +25,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "mccodeemitter"
@@ -176,7 +177,7 @@ unsigned AVRMCCodeEmitter::encodeComplement(const MCInst &MI, unsigned OpNo,
return (~0) - Imm;
}
-template <AVR::Fixups Fixup>
+template <AVR::Fixups Fixup, unsigned Offset>
unsigned AVRMCCodeEmitter::encodeImm(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -192,7 +193,7 @@ unsigned AVRMCCodeEmitter::encodeImm(const MCInst &MI, unsigned OpNo,
}
MCFixupKind FixupKind = static_cast<MCFixupKind>(Fixup);
- Fixups.push_back(MCFixup::create(0, MO.getExpr(), FixupKind, MI.getLoc()));
+ Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), FixupKind, MI.getLoc()));
return 0;
}
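The new template parameter threads the fixup's byte position within the instruction through to MCFixup::create, so imm16 can record offset 2 while the 5/6/8-bit operands stay at 0 (presumably because the 16-bit immediate occupies the second word of its instruction; that reading is an inference, not something the patch states). A toy version of a compile-time-offset encoder, with invented types:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Fixup { unsigned Offset; int Kind; }; // invented stand-in

    // Constants encode directly; symbolic operands record a fixup
    // whose in-instruction offset is fixed at compile time.
    template <int Kind, unsigned Offset>
    uint64_t encodeImm(bool IsConstant, uint64_t ImmValue,
                       std::vector<Fixup> &Fixups) {
      if (IsConstant)
        return ImmValue;
      Fixups.push_back({Offset, Kind});
      return 0;
    }

    int main() {
      std::vector<Fixup> Fixups;
      encodeImm</*fixup_16=*/1, /*Offset=*/2>(false, 0, Fixups);
      assert(Fixups.size() == 1 && Fixups[0].Offset == 2);
    }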
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
index 5fa425c..883abf8 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
@@ -63,13 +63,14 @@ private:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- /// Takes the compliment of a number (~0 - val).
+ /// Takes the complement of a number (~0 - val).
unsigned encodeComplement(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
/// Encodes an immediate value with a given fixup.
- template <AVR::Fixups Fixup>
+ /// \tparam Offset The offset into the instruction for the fixup.
+ template <AVR::Fixups Fixup, unsigned Offset>
unsigned encodeImm(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
index 400296b..085afd2 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp
@@ -9,11 +9,11 @@
#include "AVRMCExpr.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/MC/MCAsmLayout.h"
namespace llvm {
diff --git a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
index a4fa5c0..826430e 100644
--- a/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "AVRMCTargetDesc.h"
#include "AVRELFStreamer.h"
#include "AVRMCAsmInfo.h"
-#include "AVRMCTargetDesc.h"
#include "AVRTargetStreamer.h"
#include "InstPrinter/AVRInstPrinter.h"
diff --git a/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index c520146..9397c78 100644
--- a/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -18,10 +18,10 @@
#include "BPFTargetMachine.h"
#include "InstPrinter/BPFInstPrinter.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
@@ -35,14 +35,15 @@ using namespace llvm;
namespace {
class BPFAsmPrinter : public AsmPrinter {
public:
- explicit BPFAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
+ explicit BPFAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {}
StringRef getPassName() const override { return "BPF Assembly Printer"; }
void EmitInstruction(const MachineInstr *MI) override;
};
-}
+} // namespace
void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 1209144..f48429e 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -16,17 +16,21 @@
#include "BPFRegisterInfo.h"
#include "BPFSubtarget.h"
#include "BPFTargetMachine.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
+
using namespace llvm;
#define DEBUG_TYPE "bpf-isel"
@@ -42,6 +46,8 @@ public:
return "BPF DAG->DAG Pattern Instruction Selection";
}
+ void PreprocessISelDAG() override;
+
private:
// Include the pieces autogenerated from the target description.
#include "BPFGenDAGISel.inc"
@@ -51,15 +57,39 @@ private:
// Complex Pattern for address selection.
bool SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
bool SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+ // Node preprocessing cases
+ void PreprocessLoad(SDNode *Node, SelectionDAG::allnodes_iterator I);
+ void PreprocessCopyToReg(SDNode *Node);
+ void PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator I);
+
+ // Find constants from a constant structure
+ typedef std::vector<unsigned char> val_vec_type;
+ bool fillGenericConstant(const DataLayout &DL, const Constant *CV,
+ val_vec_type &Vals, uint64_t Offset);
+ bool fillConstantDataArray(const DataLayout &DL, const ConstantDataArray *CDA,
+ val_vec_type &Vals, int Offset);
+ bool fillConstantArray(const DataLayout &DL, const ConstantArray *CA,
+ val_vec_type &Vals, int Offset);
+ bool fillConstantStruct(const DataLayout &DL, const ConstantStruct *CS,
+ val_vec_type &Vals, int Offset);
+ bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset,
+ uint64_t Size, unsigned char *ByteSeq);
+ bool checkLoadDef(unsigned DefReg, unsigned match_load_op);
+
+ // Mapping from ConstantStruct global value to corresponding byte-list values
+ std::map<const void *, val_vec_type> cs_vals_;
+ // Mapping from vreg to load memory opcode
+ std::map<unsigned, unsigned> load_to_vreg_;
};
-}
+} // namespace
// ComplexPattern used on BPF Load/Store instructions
bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
// if Address is FI, get the TargetFrameIndex.
SDLoc DL(Addr);
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
return true;
}
@@ -71,7 +101,7 @@ bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
// Addresses of the form Addr+const or Addr|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (isInt<32>(CN->getSExtValue())) {
+ if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN =
@@ -85,13 +115,14 @@ bool BPFDAGToDAGISel::SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
}
}
- Base = Addr;
+ Base = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
return true;
}
// ComplexPattern used on BPF FI instruction
-bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
SDLoc DL(Addr);
if (!CurDAG->isBaseWithConstantOffset(Addr))
@@ -99,11 +130,10 @@ bool BPFDAGToDAGISel::SelectFIAddr(SDValue Addr, SDValue &Base, SDValue &Offset)
// Addresses of the form Addr+const or Addr|const
ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (isInt<32>(CN->getSExtValue())) {
+ if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
- if (FrameIndexSDNode *FIN =
- dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
else
return false;
@@ -129,7 +159,8 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
// tablegen selection should be handled here.
switch (Opcode) {
- default: break;
+ default:
+ break;
case ISD::SDIV: {
DebugLoc Empty;
const DebugLoc &DL = Node->getDebugLoc();
@@ -138,7 +169,7 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
else
errs() << "Error: ";
errs() << "Unsupport signed division for DAG: ";
- Node->dump(CurDAG);
+ Node->print(errs(), CurDAG);
errs() << "Please convert to unsigned div/mod.\n";
break;
}
@@ -181,6 +212,367 @@ void BPFDAGToDAGISel::Select(SDNode *Node) {
SelectCode(Node);
}
+void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
+ SelectionDAG::allnodes_iterator I) {
+ union {
+ uint8_t c[8];
+ uint16_t s;
+ uint32_t i;
+ uint64_t d;
+ } new_val; // holds the constant value that replaces the load.
+ bool to_replace = false;
+ SDLoc DL(Node);
+ const LoadSDNode *LD = cast<LoadSDNode>(Node);
+ uint64_t size = LD->getMemOperand()->getSize();
+
+ if (!size || size > 8 || (size & (size - 1)))
+ return;
+
+ SDNode *LDAddrNode = LD->getOperand(1).getNode();
+ // Match LDAddr against either global_addr or (global_addr + offset)
+ unsigned opcode = LDAddrNode->getOpcode();
+ if (opcode == ISD::ADD) {
+ SDValue OP1 = LDAddrNode->getOperand(0);
+ SDValue OP2 = LDAddrNode->getOperand(1);
+
+ // We want to find the pattern global_addr + offset
+ SDNode *OP1N = OP1.getNode();
+ if (OP1N->getOpcode() <= ISD::BUILTIN_OP_END || OP1N->getNumOperands() == 0)
+ return;
+
+ DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+
+ const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(OP1N->getOperand(0).getNode());
+ const ConstantSDNode *CDN = dyn_cast<ConstantSDNode>(OP2.getNode());
+ if (GADN && CDN)
+ to_replace =
+ getConstantFieldValue(GADN, CDN->getZExtValue(), size, new_val.c);
+ } else if (LDAddrNode->getOpcode() > ISD::BUILTIN_OP_END &&
+ LDAddrNode->getNumOperands() > 0) {
+ DEBUG(dbgs() << "Check candidate load: "; LD->dump(); dbgs() << '\n');
+
+ SDValue OP1 = LDAddrNode->getOperand(0);
+ if (const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(OP1.getNode()))
+ to_replace = getConstantFieldValue(GADN, 0, size, new_val.c);
+ }
+
+ if (!to_replace)
+ return;
+
+ // Replace the old load result with the computed constant.
+ uint64_t val;
+ if (size == 1)
+ val = new_val.c[0];
+ else if (size == 2)
+ val = new_val.s;
+ else if (size == 4)
+ val = new_val.i;
+ else
+ val = new_val.d;
+
+ DEBUG(dbgs() << "Replacing load of size " << size << " with constant " << val
+ << '\n');
+ SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
+
+ // After the replacement the current node is dead; step the iterator
+ // back one node first so it remains valid.
+ I--;
+ SDValue From[] = {SDValue(Node, 0), SDValue(Node, 1)};
+ SDValue To[] = {NVal, NVal};
+ CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2);
+ I++;
+ // It is now safe to delete the node.
+ CurDAG->DeleteNode(Node);
+}
+
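The size guard at the top of PreprocessLoad — !size || size > 8 || (size & (size - 1)) — accepts exactly the load widths 1, 2, 4 and 8, matching the members of the union above. The bit trick: a power of two has a single set bit, so v & (v - 1) is zero only for powers of two (and for zero itself, which the first clause excludes):

    #include <cassert>
    #include <cstdint>

    bool isSupportedLoadSize(uint64_t Size) {
      return Size != 0 && Size <= 8 && (Size & (Size - 1)) == 0;
    }

    int main() {
      for (uint64_t S : {1, 2, 4, 8})
        assert(isSupportedLoadSize(S));
      for (uint64_t S : {0, 3, 5, 6, 7, 16})
        assert(!isSupportedLoadSize(S));
    }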
+void BPFDAGToDAGISel::PreprocessISelDAG() {
+ // Iterate through all nodes, interested in the following cases:
+ //
+ // . loads from ConstantStruct or ConstantArray of constants,
+ // which can be turned into a constant itself; with this we can
+ // avoid reading from the read-only section at runtime.
+ //
+ // . register truncation is often the result of an 8/16/32-bit to
+ // 64-bit or an 8/16-bit to 32-bit conversion. If the value is
+ // loaded with a masked byte width, the AND operation can be
+ // removed, since a BPF LOAD already zero-extends.
+ //
+ // This also solves a correctness issue. In BPF socket-related
+ // programs, e.g., __sk_buff->{data, data_end} are 32-bit registers,
+ // but later on the kernel verifier rewrites them with 64-bit
+ // values. Truncating the value after the load would therefore
+ // produce incorrect code.
+ for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+ E = CurDAG->allnodes_end();
+ I != E;) {
+ SDNode *Node = &*I++;
+ unsigned Opcode = Node->getOpcode();
+ if (Opcode == ISD::LOAD)
+ PreprocessLoad(Node, I);
+ else if (Opcode == ISD::CopyToReg)
+ PreprocessCopyToReg(Node);
+ else if (Opcode == ISD::AND)
+ PreprocessTrunc(Node, I);
+ }
+}
+
+bool BPFDAGToDAGISel::getConstantFieldValue(const GlobalAddressSDNode *Node,
+ uint64_t Offset, uint64_t Size,
+ unsigned char *ByteSeq) {
+ const GlobalVariable *V = dyn_cast<GlobalVariable>(Node->getGlobal());
+
+ if (!V || !V->hasInitializer())
+ return false;
+
+ const Constant *Init = V->getInitializer();
+ const DataLayout &DL = CurDAG->getDataLayout();
+ val_vec_type TmpVal;
+
+ auto it = cs_vals_.find(static_cast<const void *>(Init));
+ if (it != cs_vals_.end()) {
+ TmpVal = it->second;
+ } else {
+ uint64_t total_size = 0;
+ if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(Init))
+ total_size =
+ DL.getStructLayout(cast<StructType>(CS->getType()))->getSizeInBytes();
+ else if (const ConstantArray *CA = dyn_cast<ConstantArray>(Init))
+ total_size = DL.getTypeAllocSize(CA->getType()->getElementType()) *
+ CA->getNumOperands();
+ else
+ return false;
+
+ val_vec_type Vals(total_size, 0);
+ if (fillGenericConstant(DL, Init, Vals, 0) == false)
+ return false;
+ cs_vals_[static_cast<const void *>(Init)] = Vals;
+ TmpVal = std::move(Vals);
+ }
+
+ // test whether host endianness matches target
+ union {
+ uint8_t c[2];
+ uint16_t s;
+ } test_buf;
+ uint16_t test_val = 0x2345;
+ if (DL.isLittleEndian())
+ support::endian::write16le(test_buf.c, test_val);
+ else
+ support::endian::write16be(test_buf.c, test_val);
+
+ bool endian_match = test_buf.s == test_val;
+ for (uint64_t i = Offset, j = 0; i < Offset + Size; i++, j++)
+ ByteSeq[j] = endian_match ? TmpVal[i] : TmpVal[Offset + Size - 1 - j];
+
+ return true;
+}
+
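The union-based probe above answers one question: does the host's byte order match the target's? It writes 0x2345 in the target's order and reads it back with a plain host-order load; the constant-initializer bytes are byte-swapped only when the two disagree. A self-contained equivalent without the LLVM support headers:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    bool hostMatchesTarget(bool TargetIsLittleEndian) {
      const uint16_t TestVal = 0x2345;
      uint8_t Buf[2];
      if (TargetIsLittleEndian) {       // support::endian::write16le
        Buf[0] = TestVal & 0xff;
        Buf[1] = TestVal >> 8;
      } else {                          // support::endian::write16be
        Buf[0] = TestVal >> 8;
        Buf[1] = TestVal & 0xff;
      }
      uint16_t HostVal;
      std::memcpy(&HostVal, Buf, 2);    // host-order read
      return HostVal == TestVal;
    }

    int main() {
      // Exactly one of the two holds on any host; the probe exists
      // precisely because the host's order is unknown at build time.
      assert(hostMatchesTarget(true) != hostMatchesTarget(false));
    }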
+bool BPFDAGToDAGISel::fillGenericConstant(const DataLayout &DL,
+ const Constant *CV,
+ val_vec_type &Vals, uint64_t Offset) {
+ uint64_t Size = DL.getTypeAllocSize(CV->getType());
+
+ if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
+ return true; // already done
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ uint64_t val = CI->getZExtValue();
+ DEBUG(dbgs() << "Byte array at offset " << Offset << " with value " << val
+ << '\n');
+
+ if (Size > 8 || (Size & (Size - 1)))
+ return false;
+
+ // Store based on target endian
+ for (uint64_t i = 0; i < Size; ++i) {
+ Vals[Offset + i] = DL.isLittleEndian()
+ ? ((val >> (i * 8)) & 0xFF)
+ : ((val >> ((Size - i - 1) * 8)) & 0xFF);
+ }
+ return true;
+ }
+
+ if (const ConstantDataArray *CDA = dyn_cast<ConstantDataArray>(CV))
+ return fillConstantDataArray(DL, CDA, Vals, Offset);
+
+ if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV))
+ return fillConstantArray(DL, CA, Vals, Offset);
+
+ if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
+ return fillConstantStruct(DL, CVS, Vals, Offset);
+
+ return false;
+}
+
+bool BPFDAGToDAGISel::fillConstantDataArray(const DataLayout &DL,
+ const ConstantDataArray *CDA,
+ val_vec_type &Vals, int Offset) {
+ for (unsigned i = 0, e = CDA->getNumElements(); i != e; ++i) {
+ if (fillGenericConstant(DL, CDA->getElementAsConstant(i), Vals, Offset) ==
+ false)
+ return false;
+ Offset += DL.getTypeAllocSize(CDA->getElementAsConstant(i)->getType());
+ }
+
+ return true;
+}
+
+bool BPFDAGToDAGISel::fillConstantArray(const DataLayout &DL,
+ const ConstantArray *CA,
+ val_vec_type &Vals, int Offset) {
+ for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
+ if (fillGenericConstant(DL, CA->getOperand(i), Vals, Offset) == false)
+ return false;
+ Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType());
+ }
+
+ return true;
+}
+
+bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL,
+ const ConstantStruct *CS,
+ val_vec_type &Vals, int Offset) {
+ const StructLayout *Layout = DL.getStructLayout(CS->getType());
+ for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
+ const Constant *Field = CS->getOperand(i);
+ uint64_t SizeSoFar = Layout->getElementOffset(i);
+ if (fillGenericConstant(DL, Field, Vals, Offset + SizeSoFar) == false)
+ return false;
+ }
+ return true;
+}
+
+void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) {
+ const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1));
+ if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
+ return;
+
+ const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2));
+ if (!LD)
+ return;
+
+ // This assigns a load value to a virtual register; record its load width.
+ unsigned mem_load_op = 0;
+ switch (LD->getMemOperand()->getSize()) {
+ default:
+ return;
+ case 4:
+ mem_load_op = BPF::LDW;
+ break;
+ case 2:
+ mem_load_op = BPF::LDH;
+ break;
+ case 1:
+ mem_load_op = BPF::LDB;
+ break;
+ }
+
+ DEBUG(dbgs() << "Find Load Value to VReg "
+ << TargetRegisterInfo::virtReg2Index(RegN->getReg()) << '\n');
+ load_to_vreg_[RegN->getReg()] = mem_load_op;
+}
+
+void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node,
+ SelectionDAG::allnodes_iterator I) {
+ ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (!MaskN)
+ return;
+
+ unsigned match_load_op = 0;
+ switch (MaskN->getZExtValue()) {
+ default:
+ return;
+ case 0xFFFFFFFF:
+ match_load_op = BPF::LDW;
+ break;
+ case 0xFFFF:
+ match_load_op = BPF::LDH;
+ break;
+ case 0xFF:
+ match_load_op = BPF::LDB;
+ break;
+ }
+
+ // The Reg operand should be a virtual register, which is defined
+ // outside the current basic block. The DAG combiner has already done
+ // a good job of removing truncations within a single basic block.
+ SDValue BaseV = Node->getOperand(0);
+ if (BaseV.getOpcode() != ISD::CopyFromReg)
+ return;
+
+ const RegisterSDNode *RegN =
+ dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1));
+ if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
+ return;
+ unsigned AndOpReg = RegN->getReg();
+ DEBUG(dbgs() << "Examine %vreg" << TargetRegisterInfo::virtReg2Index(AndOpReg)
+ << '\n');
+
+ // Examine the PHI insns in the MachineBasicBlock to find out the
+ // definitions of this virtual register. At this stage (DAG2DAG
+ // transformation), only PHI machine insns are available in the machine basic
+ // block.
+ MachineBasicBlock *MBB = FuncInfo->MBB;
+ MachineInstr *MII = nullptr;
+ for (auto &MI : *MBB) {
+ for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+ const MachineOperand &MOP = MI.getOperand(i);
+ if (!MOP.isReg() || !MOP.isDef())
+ continue;
+ unsigned Reg = MOP.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg) && Reg == AndOpReg) {
+ MII = &MI;
+ break;
+ }
+ }
+ }
+
+ if (MII == nullptr) {
+ // No phi definition in this block.
+ if (!checkLoadDef(AndOpReg, match_load_op))
+ return;
+ } else {
+ // The PHI node looks like:
+ // %vreg2<def> = PHI %vreg0, <BB#1>, %vreg1, <BB#3>
+ // Trace each incoming definition, e.g., (%vreg0, BB#1) and (%vreg1, BB#3)
+ // The AND operation can be removed if both %vreg0 in BB#1 and %vreg1 in
+ // BB#3 are defined with a load matching the MaskN.
+ DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
+ unsigned PrevReg = -1;
+ for (unsigned i = 0; i < MII->getNumOperands(); ++i) {
+ const MachineOperand &MOP = MII->getOperand(i);
+ if (MOP.isReg()) {
+ if (MOP.isDef())
+ continue;
+ PrevReg = MOP.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(PrevReg))
+ return;
+ if (!checkLoadDef(PrevReg, match_load_op))
+ return;
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump();
+ dbgs() << '\n');
+
+ I--;
+ CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV);
+ I++;
+ CurDAG->DeleteNode(Node);
+}
+
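Everything PreprocessTrunc does rests on one identity: a load that zero-extends an N-byte value into a 64-bit register already leaves the upper bits clear, so a following AND with the matching mask (0xFF, 0xFFFF or 0xFFFFFFFF) changes nothing. The 4-byte case, stated as runnable C++:

    #include <cassert>
    #include <cstdint>

    // BPF LDW semantics: a 32-bit load zero-extends into 64 bits.
    uint64_t zextLoad32(uint32_t MemVal) {
      return uint64_t(MemVal);
    }

    int main() {
      uint64_t V = zextLoad32(0xdeadbeef);
      assert((V & 0xFFFFFFFFu) == V); // the AND is a no-op
    }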
+bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) {
+ auto it = load_to_vreg_.find(DefReg);
+ if (it == load_to_vreg_.end())
+ return false; // The definition of register is not exported yet.
+
+ return it->second == match_load_op;
+}
+
FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) {
return new BPFDAGToDAGISel(TM);
}
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
index cca3492..81b0aa7 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
#define DEBUG_TYPE "bpf-lower"
-static void fail(const SDLoc &DL, SelectionDAG &DAG, const char *Msg) {
+static void fail(const SDLoc &DL, SelectionDAG &DAG, const Twine &Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
DiagnosticInfoUnsupported(*MF.getFunction(), Msg, DL.getDebugLoc()));
@@ -132,6 +132,10 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128;
}
+bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ return false;
+}
+
SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
case ISD::BR_CC:
@@ -257,8 +261,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
auto PtrVT = getPointerTy(MF.getDataLayout());
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
SmallVector<std::pair<unsigned, SDValue>, MaxArgs> RegsToPass;
@@ -306,11 +309,23 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ auto GV = G->getGlobal();
+ fail(CLI.DL, DAG,
+ "A call to global function '" + StringRef(GV->getName())
+ + "' is not supported. "
+ + (GV->isDeclaration() ?
+ "Only calls to predefined BPF helpers are allowed." :
+ "Please use __attribute__((always_inline) to make sure"
+ " this function is inlined."));
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), CLI.DL, PtrVT,
G->getOffset(), 0);
- else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+ } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
+ fail(CLI.DL, DAG, Twine("A call to built-in function '"
+ + StringRef(E->getSymbol())
+ + "' is not supported."));
+ }
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -485,8 +500,11 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const {
SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
+ auto N = cast<GlobalAddressSDNode>(Op);
+ assert(N->getOffset() == 0 && "Invalid offset for global address");
+
SDLoc DL(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ const GlobalValue *GV = N->getGlobal();
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i64);
return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA);
@@ -497,8 +515,9 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
+ bool isSelectOp = MI.getOpcode() == BPF::Select;
- assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert");
+ assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert");
// To "insert" a SELECT instruction, we actually have to insert the diamond
// control-flow pattern. The incoming instruction knows the destination vreg
@@ -530,48 +549,40 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Insert Branch if Flag
unsigned LHS = MI.getOperand(1).getReg();
- unsigned RHS = MI.getOperand(2).getReg();
int CC = MI.getOperand(3).getImm();
+ int NewCC;
switch (CC) {
case ISD::SETGT:
- BuildMI(BB, DL, TII.get(BPF::JSGT_rr))
- .addReg(LHS)
- .addReg(RHS)
- .addMBB(Copy1MBB);
+ NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri;
break;
case ISD::SETUGT:
- BuildMI(BB, DL, TII.get(BPF::JUGT_rr))
- .addReg(LHS)
- .addReg(RHS)
- .addMBB(Copy1MBB);
+ NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri;
break;
case ISD::SETGE:
- BuildMI(BB, DL, TII.get(BPF::JSGE_rr))
- .addReg(LHS)
- .addReg(RHS)
- .addMBB(Copy1MBB);
+ NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri;
break;
case ISD::SETUGE:
- BuildMI(BB, DL, TII.get(BPF::JUGE_rr))
- .addReg(LHS)
- .addReg(RHS)
- .addMBB(Copy1MBB);
+ NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri;
break;
case ISD::SETEQ:
- BuildMI(BB, DL, TII.get(BPF::JEQ_rr))
- .addReg(LHS)
- .addReg(RHS)
- .addMBB(Copy1MBB);
+ NewCC = isSelectOp ? BPF::JEQ_rr : BPF::JEQ_ri;
break;
case ISD::SETNE:
- BuildMI(BB, DL, TII.get(BPF::JNE_rr))
- .addReg(LHS)
- .addReg(RHS)
- .addMBB(Copy1MBB);
+ NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri;
break;
default:
report_fatal_error("unimplemented select CondCode " + Twine(CC));
}
+ if (isSelectOp)
+ BuildMI(BB, DL, TII.get(NewCC))
+ .addReg(LHS)
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(Copy1MBB);
+ else
+ BuildMI(BB, DL, TII.get(NewCC))
+ .addReg(LHS)
+ .addImm(MI.getOperand(2).getImm())
+ .addMBB(Copy1MBB);
// Copy0MBB:
// %FalseValue = ...
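The rewrite above folds six near-identical BuildMI calls into a condition-code switch that only picks an opcode, followed by a single register-versus-immediate dispatch. A toy mirror of that table-driven selection; the opcode names are illustrative strings, not real BPF encodings:

    #include <cassert>
    #include <string>

    enum Cond { GT, UGT, GE, UGE, EQ, NE };

    // Pick the jump mnemonic from (condition, operand kind) instead
    // of duplicating the emission code per condition.
    std::string pickJump(Cond C, bool RhsIsReg) {
      static const char *RR[] = {"JSGT_rr", "JUGT_rr", "JSGE_rr",
                                 "JUGE_rr", "JEQ_rr",  "JNE_rr"};
      static const char *RI[] = {"JSGT_ri", "JUGT_ri", "JSGE_ri",
                                 "JUGE_ri", "JEQ_ri",  "JNE_ri"};
      return (RhsIsReg ? RR : RI)[C];
    }

    int main() {
      assert(pickJump(EQ, true) == "JEQ_rr");
      assert(pickJump(EQ, false) == "JEQ_ri");
    }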
diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
index 3d1726b..0b8a8ca 100644
--- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -42,6 +42,10 @@ public:
// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ // This method decides whether folding a constant offset
+ // with the given GlobalAddress is legal.
+ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index e38face..5351cfa 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "BPF.h"
#include "BPFInstrInfo.h"
+#include "BPF.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
diff --git a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
index a7910de..f683578 100644
--- a/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/contrib/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -16,7 +16,8 @@ include "BPFInstrFormats.td"
// Instruction Operands and Patterns
// These are target-independent nodes, but have target-specific formats.
-def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>;
@@ -50,7 +51,7 @@ def u64imm : Operand<i64> {
let PrintMethod = "printImm64Operand";
}
-def i64immSExt32 : PatLeaf<(imm),
+def i64immSExt32 : PatLeaf<(i64 imm),
[{return isInt<32>(N->getSExtValue()); }]>;
// Addressing modes.
@@ -66,17 +67,17 @@ def MEMri : Operand<i64> {
}
// Conditional code predicates - used for pattern matching for jump instructions
-def BPF_CC_EQ : PatLeaf<(imm),
+def BPF_CC_EQ : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETEQ);}]>;
-def BPF_CC_NE : PatLeaf<(imm),
+def BPF_CC_NE : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETNE);}]>;
-def BPF_CC_GE : PatLeaf<(imm),
+def BPF_CC_GE : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETGE);}]>;
-def BPF_CC_GT : PatLeaf<(imm),
+def BPF_CC_GT : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETGT);}]>;
-def BPF_CC_GTU : PatLeaf<(imm),
+def BPF_CC_GTU : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETUGT);}]>;
-def BPF_CC_GEU : PatLeaf<(imm),
+def BPF_CC_GEU : PatLeaf<(i64 imm),
[{return (N->getZExtValue() == ISD::SETUGE);}]>;
// jump instructions
@@ -445,9 +446,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1,
// ADJCALLSTACKDOWN/UP pseudo insns
let Defs = [R11], Uses = [R11] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(BPFcallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(BPFcallseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(BPFcallseq_end timm:$amt1, timm:$amt2)]>;
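This follows the generic SelectionDAG change that gave callseq_start a second operand, so ADJCALLSTACKDOWN now mirrors ADJCALLSTACKUP's $amt1/$amt2 pair. On the C++ side, frame lowering would emit the pseudo with both immediates; a sketch, with the second adjustment left at its usual zero:

    BuildMI(MBB, MBBI, DL, TII.get(BPF::ADJCALLSTACKDOWN))
        .addImm(NumBytes) // $amt1: bytes reserved for the outgoing call
        .addImm(0);       // $amt2: extra adjustment, normally zero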
@@ -459,6 +460,11 @@ let usesCustomInserter = 1 in {
"# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
[(set i64:$dst,
(BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>;
+ def Select_Ri : Pseudo<(outs GPR:$dst),
+ (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR:$src, GPR:$src2),
+ "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2",
+ [(set i64:$dst,
+ (BPFselectcc i64:$lhs, (i64 imm:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>;
}
// load 64-bit global addr into register
@@ -470,6 +476,7 @@ def : Pat<(i64 (and (i64 GPR:$src), 0xffffFFFF)),
// Calls
def : Pat<(BPFcall tglobaladdr:$dst), (JAL tglobaladdr:$dst)>;
+def : Pat<(BPFcall texternalsym:$dst), (JAL texternalsym:$dst)>;
def : Pat<(BPFcall imm:$dst), (JAL imm:$dst)>;
// Loads
diff --git a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp
index f64defe..c8528e8 100644
--- a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.cpp
@@ -29,6 +29,11 @@ BPFMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
return Printer.getSymbol(MO.getGlobal());
}
+MCSymbol *
+BPFMCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+ return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
MCOperand BPFMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
@@ -49,7 +54,7 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
@@ -66,6 +71,9 @@ void BPFMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
break;
case MachineOperand::MO_RegisterMask:
continue;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+ break;
case MachineOperand::MO_GlobalAddress:
MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
break;
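External symbols (calls to runtime helpers that have no IR Function) now lower through the same path as globals; only the MCSymbol lookup differs. The shared shape, sketched with the Printer member used elsewhere in this file:

    MCSymbol *Sym = MO.isGlobal()
                        ? Printer.getSymbol(MO.getGlobal())
                        : Printer.GetExternalSymbolSymbol(MO.getSymbolName());
    MCOp = LowerSymbolOperand(MO, Sym);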
diff --git a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h
index 054e894..eac811f 100644
--- a/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h
+++ b/contrib/llvm/lib/Target/BPF/BPFMCInstLower.h
@@ -37,6 +37,7 @@ public:
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+ MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
};
}
diff --git a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
index 71846e3..273843e 100644
--- a/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFRegisterInfo.cpp
@@ -11,13 +11,14 @@
//
//===----------------------------------------------------------------------===//
-#include "BPF.h"
#include "BPFRegisterInfo.h"
+#include "BPF.h"
#include "BPFSubtarget.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -41,6 +42,18 @@ BitVector BPFRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+static void WarnSize(int Offset, MachineFunction &MF, DebugLoc &DL) {
+ if (Offset <= -512) {
+ auto F = MF.getFunction();
+ DiagnosticInfoUnsupported DiagStackSize(*F,
+ "Looks like the BPF stack limit of 512 bytes is exceeded. "
+ "Please move large on-stack variables into a BPF per-cpu array map.\n",
+ DL);
+ F->getContext().diagnose(DiagStackSize);
+ }
+}
+
void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
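Frame objects live at negative offsets from the frame register, and the Linux kernel's eBPF verifier rejects programs that use more than 512 bytes of stack, hence the Offset <= -512 test above. A small illustration of when the diagnostic fires (the buffer size is made up):

    // A local such as `long buf[70];` needs 560 bytes, so its frame
    // object lands at an offset near -560, past the 512-byte limit.
    int Offset = -560;
    bool ExceedsBPFStack = Offset <= -512; // true, diagnostic is emitted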
@@ -48,9 +61,18 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned i = 0;
MachineInstr &MI = *II;
- MachineFunction &MF = *MI.getParent()->getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
DebugLoc DL = MI.getDebugLoc();
+ if (!DL)
+ // Try harder to find some debug location.
+ for (auto &I : MBB)
+ if (I.getDebugLoc()) {
+ DL = I.getDebugLoc();
+ break;
+ }
+
while (!MI.getOperand(i).isFI()) {
++i;
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
@@ -59,11 +81,11 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned FrameReg = getFrameRegister(MF);
int FrameIndex = MI.getOperand(i).getIndex();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- MachineBasicBlock &MBB = *MI.getParent();
if (MI.getOpcode() == BPF::MOV_rr) {
int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+ WarnSize(Offset, MF, DL);
MI.getOperand(i).ChangeToRegister(FrameReg, false);
unsigned reg = MI.getOperand(i - 1).getReg();
BuildMI(MBB, ++II, DL, TII.get(BPF::ADD_ri), reg)
@@ -78,6 +100,8 @@ void BPFRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if (!isInt<32>(Offset))
llvm_unreachable("bug in frame offset");
+ WarnSize(Offset, MF, DL);
+
if (MI.getOpcode() == BPF::FI_ri) {
// architecture does not really support FI_ri, replace it with
// MOV_rr <target_reg>, frame_reg
diff --git a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 8976956..d84b0a8 100644
--- a/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#include "BPF.h"
#include "BPFTargetMachine.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "BPF.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
@@ -58,7 +58,7 @@ namespace {
// BPF Code Generator Pass Configuration Options.
class BPFPassConfig : public TargetPassConfig {
public:
- BPFPassConfig(BPFTargetMachine *TM, PassManagerBase &PM)
+ BPFPassConfig(BPFTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
BPFTargetMachine &getBPFTargetMachine() const {
@@ -70,7 +70,7 @@ public:
}
TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new BPFPassConfig(this, PM);
+ return new BPFPassConfig(*this, PM);
}
// Install an instruction selector pass using
diff --git a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 9beefcd..a1d732c 100644
--- a/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/contrib/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -15,6 +15,8 @@
#include "BPFSubtarget.h"
#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
@@ -88,9 +90,9 @@ static DecodeStatus decodeMemoryOpValue(MCInst &Inst, unsigned Insn,
}
#include "BPFGenDisassemblerTables.inc"
-
static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
- uint64_t &Size, uint64_t &Insn) {
+ uint64_t &Size, uint64_t &Insn,
+ bool IsLittleEndian) {
uint64_t Lo, Hi;
if (Bytes.size() < 8) {
@@ -99,8 +101,14 @@ static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
}
Size = 8;
- Hi = (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 0) | (Bytes[3] << 8);
- Lo = (Bytes[4] << 0) | (Bytes[5] << 8) | (Bytes[6] << 16) | (Bytes[7] << 24);
+ if (IsLittleEndian) {
+ Hi = (Bytes[0] << 24) | (Bytes[1] << 16) | (Bytes[2] << 0) | (Bytes[3] << 8);
+ Lo = (Bytes[4] << 0) | (Bytes[5] << 8) | (Bytes[6] << 16) | (Bytes[7] << 24);
+ } else {
+ Hi = (Bytes[0] << 24) | ((Bytes[1] & 0x0F) << 20) | ((Bytes[1] & 0xF0) << 12) |
+ (Bytes[2] << 8) | (Bytes[3] << 0);
+ Lo = (Bytes[4] << 24) | (Bytes[5] << 16) | (Bytes[6] << 8) | (Bytes[7] << 0);
+ }
Insn = Make_64(Hi, Lo);
return MCDisassembler::Success;
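A BPF instruction packs an opcode byte, one byte of dst/src register nibbles, a 16-bit offset and a 32-bit immediate. The decoder tables consume a single host-neutral 64-bit value, so each wire endianness needs its own repack; the 0x0F/0xF0 masks above swap the register nibbles, whose order differs between the two encodings. An equivalent helper, as a sketch:

    static uint8_t swapRegNibbles(uint8_t Regs) {
      // dst and src trade places between the two encodings.
      return uint8_t(((Regs & 0x0F) << 4) | ((Regs & 0xF0) >> 4));
    }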
@@ -111,10 +119,11 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const {
- uint64_t Insn;
+ bool IsLittleEndian = getContext().getAsmInfo()->isLittleEndian();
+ uint64_t Insn, Hi;
DecodeStatus Result;
- Result = readInstruction64(Bytes, Address, Size, Insn);
+ Result = readInstruction64(Bytes, Address, Size, Insn, IsLittleEndian);
if (Result == MCDisassembler::Fail) return MCDisassembler::Fail;
Result = decodeInstruction(DecoderTableBPF64, Instr, Insn,
@@ -128,7 +137,10 @@ DecodeStatus BPFDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
return MCDisassembler::Fail;
}
Size = 16;
- uint64_t Hi = (Bytes[12] << 0) | (Bytes[13] << 8) | (Bytes[14] << 16) | (Bytes[15] << 24);
+ if (IsLittleEndian)
+ Hi = (Bytes[12] << 0) | (Bytes[13] << 8) | (Bytes[14] << 16) | (Bytes[15] << 24);
+ else
+ Hi = (Bytes[12] << 24) | (Bytes[13] << 16) | (Bytes[14] << 8) | (Bytes[15] << 0);
auto& Op = Instr.getOperand(1);
Op.setImm(Make_64(Hi, Op.getImm()));
break;
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
index ffd29f3..64e986f 100644
--- a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "BPF.h"
#include "BPFInstPrinter.h"
+#include "BPF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index afc321ea..9fc812c 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -27,8 +27,9 @@ public:
: MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
~BPFAsmBackend() override = default;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -61,16 +62,17 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
} else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8;
for (unsigned i = 0; i != Size; ++i) {
- unsigned Idx = IsLittleEndian ? i : Size - i;
+ unsigned Idx = IsLittleEndian ? i : Size - i - 1;
Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8));
}
} else {
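The Idx change fixes an off-by-one for big-endian fixups: with `Size - i` the first iteration wrote Data[Offset + Size], one byte past the field, and byte 0 was never written. Worked through for a 4-byte fixup:

    // Value = 0x11223344, big-endian, corrected indexing:
    //   i = 0 -> Idx = 3, Data[Off + 3] = 0x44 (least significant byte)
    //   i = 3 -> Idx = 0, Data[Off + 0] = 0x11 (most significant byte)
    for (unsigned i = 0; i != 4; ++i)
      Data[Fixup.getOffset() + (4 - i - 1)] = uint8_t(Value >> (i * 8));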
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index ebe9abd..d5e1d77 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixup.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include <cstdint>
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index 559ac29..fd7c97b 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -42,7 +42,7 @@ public:
// messed up in random places by 4 bytes. .debug_line
// section will be parsable, but with odd offsets and
// line numbers, etc.
- PointerSize = 8;
+ CodePointerSize = 8;
}
};
}
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index b584097..797904e 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "BPF.h"
#include "InstPrinter/BPFInstPrinter.h"
-#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "MCTargetDesc/BPFMCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index 3df673e..d1c97c9 100644
--- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -14,8 +14,8 @@
#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCTARGETDESC_H
-#include "llvm/Support/DataTypes.h"
#include "llvm/Config/config.h"
+#include "llvm/Support/DataTypes.h"
namespace llvm {
class MCAsmBackend;
diff --git a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index becc086..d901abb 100644
--- a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -17,11 +17,12 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonShuffler.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
@@ -42,13 +43,12 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cctype>
@@ -63,21 +63,25 @@ using namespace llvm;
static cl::opt<bool> EnableFutureRegs("mfuture-regs",
cl::desc("Enable future registers"));
-static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis",
-cl::desc("Warn for missing parenthesis around predicate registers"),
-cl::init(true));
-static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis",
-cl::desc("Error for missing parenthesis around predicate registers"),
-cl::init(false));
-static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch",
-cl::desc("Warn for mismatching a signed and unsigned value"),
-cl::init(true));
-static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register",
-cl::desc("Warn for register names that arent contigious"),
-cl::init(true));
-static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register",
-cl::desc("Error for register names that aren't contigious"),
-cl::init(false));
+static cl::opt<bool> WarnMissingParenthesis(
+ "mwarn-missing-parenthesis",
+ cl::desc("Warn for missing parenthesis around predicate registers"),
+ cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis(
+ "merror-missing-parenthesis",
+ cl::desc("Error for missing parenthesis around predicate registers"),
+ cl::init(false));
+static cl::opt<bool> WarnSignedMismatch(
+ "mwarn-sign-mismatch",
+ cl::desc("Warn for mismatching a signed and unsigned value"),
+ cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister(
+ "mwarn-noncontigious-register",
+ cl::desc("Warn for register names that arent contigious"), cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister(
+ "merror-noncontigious-register",
+ cl::desc("Error for register names that aren't contigious"),
+ cl::init(false));
namespace {
@@ -123,9 +127,11 @@ class HexagonAsmParser : public MCTargetAsmParser {
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
- uint64_t &ErrorInfo, bool MatchingInlineAsm) override;
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
- unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override;
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
bool OutOfRange(SMLoc IDLoc, long long Val, long long Max);
int processInstruction(MCInst &Inst, OperandVector const &Operands,
SMLoc IDLoc);
@@ -168,11 +174,10 @@ public:
bool parseInstruction(OperandVector &Operands);
bool implicitExpressionLocation(OperandVector &Operands);
bool parseExpressionOrOperand(OperandVector &Operands);
- bool parseExpression(MCExpr const *& Expr);
+ bool parseExpression(MCExpr const *&Expr);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
- SMLoc NameLoc, OperandVector &Operands) override
- {
+ SMLoc NameLoc, OperandVector &Operands) override {
llvm_unreachable("Unimplemented");
}
@@ -289,45 +294,63 @@ public:
return false;
}
- bool isf32Ext() const { return false; }
- bool iss32_0Imm() const { return CheckImmRange(32, 0, true, true, false); }
- bool iss23_2Imm() const { return CheckImmRange(23, 2, true, true, false); }
+ bool isa30_2Imm() const { return CheckImmRange(30, 2, true, true, true); }
+ bool isb30_2Imm() const { return CheckImmRange(30, 2, true, true, true); }
+ bool isb15_2Imm() const { return CheckImmRange(15, 2, true, true, false); }
+ bool isb13_2Imm() const { return CheckImmRange(13, 2, true, true, false); }
+
+ bool ism32_0Imm() const { return true; }
+
+ bool isf32Imm() const { return false; }
+ bool isf64Imm() const { return false; }
+ bool iss32_0Imm() const { return true; }
+ bool iss31_1Imm() const { return true; }
+ bool iss30_2Imm() const { return true; }
+ bool iss29_3Imm() const { return true; }
+ bool iss27_2Imm() const { return CheckImmRange(27, 2, true, true, false); }
+ bool iss10_0Imm() const { return CheckImmRange(10, 0, true, false, false); }
+ bool iss10_6Imm() const { return CheckImmRange(10, 6, true, false, false); }
+ bool iss9_0Imm() const { return CheckImmRange(9, 0, true, false, false); }
bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); }
bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); }
bool iss7_0Imm() const { return CheckImmRange(7, 0, true, false, false); }
bool iss6_0Imm() const { return CheckImmRange(6, 0, true, false, false); }
+ bool iss6_3Imm() const { return CheckImmRange(6, 3, true, false, false); }
bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
- bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); }
- bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); }
bool iss3_0Imm() const { return CheckImmRange(3, 0, true, false, false); }
bool isu64_0Imm() const { return CheckImmRange(64, 0, false, true, true); }
- bool isu32_0Imm() const { return CheckImmRange(32, 0, false, true, false); }
+ bool isu32_0Imm() const { return true; }
+ bool isu31_1Imm() const { return true; }
+ bool isu30_2Imm() const { return true; }
+ bool isu29_3Imm() const { return true; }
bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
- bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
- bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
- bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
bool isu10_0Imm() const { return CheckImmRange(10, 0, false, false, false); }
bool isu9_0Imm() const { return CheckImmRange(9, 0, false, false, false); }
bool isu8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
bool isu7_0Imm() const { return CheckImmRange(7, 0, false, false, false); }
bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+ bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+ bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+ bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
bool isu5_0Imm() const { return CheckImmRange(5, 0, false, false, false); }
+ bool isu5_2Imm() const { return CheckImmRange(5, 2, false, false, false); }
+ bool isu5_3Imm() const { return CheckImmRange(5, 3, false, false, false); }
bool isu4_0Imm() const { return CheckImmRange(4, 0, false, false, false); }
+ bool isu4_2Imm() const { return CheckImmRange(4, 2, false, false, false); }
bool isu3_0Imm() const { return CheckImmRange(3, 0, false, false, false); }
+ bool isu3_1Imm() const { return CheckImmRange(3, 1, false, false, false); }
bool isu2_0Imm() const { return CheckImmRange(2, 0, false, false, false); }
bool isu1_0Imm() const { return CheckImmRange(1, 0, false, false, false); }
- bool ism6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
- bool isn8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
bool isn1Const() const {
if (!isImm())
return false;
@@ -336,35 +359,18 @@ public:
return false;
return Value == -1;
}
-
- bool iss16_0Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); }
- bool iss12_0Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); }
- bool iss10_0Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); }
- bool iss9_0Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); }
- bool iss8_0Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); }
- bool iss7_0Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); }
- bool iss6_0Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); }
- bool iss11_0Ext() const {
+ bool iss11_0Imm() const {
return CheckImmRange(11 + 26, 0, true, true, true);
}
- bool iss11_1Ext() const {
+ bool iss11_1Imm() const {
return CheckImmRange(11 + 26, 1, true, true, true);
}
- bool iss11_2Ext() const {
+ bool iss11_2Imm() const {
return CheckImmRange(11 + 26, 2, true, true, true);
}
- bool iss11_3Ext() const {
+ bool iss11_3Imm() const {
return CheckImmRange(11 + 26, 3, true, true, true);
}
-
- bool isu7_0Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); }
- bool isu8_0Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); }
- bool isu9_0Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); }
- bool isu10_0Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); }
- bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); }
- bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); }
- bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); }
- bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); }
bool isu32_0MustExt() const { return isImm(); }
void addRegOperands(MCInst &Inst, unsigned N) const {
@@ -392,188 +398,10 @@ public:
Inst.addOperand(MCOperand::createExpr(Expr));
}
- void addf32ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void adds32_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds23_2ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds8_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds8_0Imm64Operands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds6_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_1ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_2ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds4_3ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds3_0ImmOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
-
- void addu64_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu32_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu26_6ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_1ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_2ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu16_3ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu11_3ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu10_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu9_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu8_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu7_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_1ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_2ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_3ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu5_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu4_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu3_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu2_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu1_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void addm6_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addn8_0ImmOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void adds16_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds12_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds10_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds9_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds8_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds6_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_0ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_1ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_2ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
- void adds11_3ExtOperands(MCInst &Inst, unsigned N) const {
- addSignedImmOperands(Inst, N);
- }
void addn1ConstOperands(MCInst &Inst, unsigned N) const {
addImmOperands(Inst, N);
}
- void addu7_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu8_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu9_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu10_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_0ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_1ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_2ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu6_3ExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
- void addu32_0MustExtOperands(MCInst &Inst, unsigned N) const {
- addImmOperands(Inst, N);
- }
-
- void adds4_6ImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE =
- dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
- Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
- }
-
- void adds3_6ImmOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE =
- dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
- Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
- }
-
StringRef getToken() const {
assert(Kind == Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
@@ -631,94 +459,16 @@ bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
DEBUG(MCB.dump_pretty(dbgs()));
DEBUG(dbgs() << "--\n");
+ MCB.setLoc(IDLoc);
// Check the bundle for errors.
const MCRegisterInfo *RI = getContext().getRegisterInfo();
- HexagonMCChecker Check(MCII, getSTI(), MCB, MCB, *RI);
+ HexagonMCChecker Check(getContext(), MCII, getSTI(), MCB, *RI);
bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MCII, getSTI(),
getContext(), MCB,
&Check);
- while (Check.getNextErrInfo()) {
- unsigned Reg = Check.getErrRegister();
- Twine R(RI->getName(Reg));
-
- uint64_t Err = Check.getError();
- if (Err != HexagonMCErrInfo::CHECK_SUCCESS) {
- if (HexagonMCErrInfo::CHECK_ERROR_BRANCHES & Err)
- return Error(
- IDLoc,
- "unconditional branch cannot precede another branch in packet");
-
- if (HexagonMCErrInfo::CHECK_ERROR_NEWP & Err ||
- HexagonMCErrInfo::CHECK_ERROR_NEWV & Err)
- return Error(IDLoc, "register `" + R +
- "' used with `.new' "
- "but not validly modified in the same packet");
-
- if (HexagonMCErrInfo::CHECK_ERROR_REGISTERS & Err)
- return Error(IDLoc, "register `" + R + "' modified more than once");
-
- if (HexagonMCErrInfo::CHECK_ERROR_READONLY & Err)
- return Error(IDLoc, "cannot write to read-only register `" + R + "'");
-
- if (HexagonMCErrInfo::CHECK_ERROR_LOOP & Err)
- return Error(IDLoc, "loop-setup and some branch instructions "
- "cannot be in the same packet");
-
- if (HexagonMCErrInfo::CHECK_ERROR_ENDLOOP & Err) {
- Twine N(HexagonMCInstrInfo::isInnerLoop(MCB) ? '0' : '1');
- return Error(IDLoc,
- "packet marked with `:endloop" + N + "' " +
- "cannot contain instructions that modify register " +
- "`" + R + "'");
- }
-
- if (HexagonMCErrInfo::CHECK_ERROR_SOLO & Err)
- return Error(
- IDLoc,
- "instruction cannot appear in packet with other instructions");
-
- if (HexagonMCErrInfo::CHECK_ERROR_NOSLOTS & Err)
- return Error(IDLoc, "too many slots used in packet");
-
- if (Err & HexagonMCErrInfo::CHECK_ERROR_SHUFFLE) {
- uint64_t Erm = Check.getShuffleError();
-
- if (HexagonShuffler::SHUFFLE_ERROR_INVALID == Erm)
- return Error(IDLoc, "invalid instruction packet");
- else if (HexagonShuffler::SHUFFLE_ERROR_STORES == Erm)
- return Error(IDLoc, "invalid instruction packet: too many stores");
- else if (HexagonShuffler::SHUFFLE_ERROR_LOADS == Erm)
- return Error(IDLoc, "invalid instruction packet: too many loads");
- else if (HexagonShuffler::SHUFFLE_ERROR_BRANCHES == Erm)
- return Error(IDLoc, "too many branches in packet");
- else if (HexagonShuffler::SHUFFLE_ERROR_NOSLOTS == Erm)
- return Error(IDLoc, "invalid instruction packet: out of slots");
- else if (HexagonShuffler::SHUFFLE_ERROR_SLOTS == Erm)
- return Error(IDLoc, "invalid instruction packet: slot error");
- else if (HexagonShuffler::SHUFFLE_ERROR_ERRATA2 == Erm)
- return Error(IDLoc, "v60 packet violation");
- else if (HexagonShuffler::SHUFFLE_ERROR_STORE_LOAD_CONFLICT == Erm)
- return Error(IDLoc, "slot 0 instruction does not allow slot 1 store");
- else
- return Error(IDLoc, "unknown error in instruction packet");
- }
- }
-
- unsigned Warn = Check.getWarning();
- if (Warn != HexagonMCErrInfo::CHECK_SUCCESS) {
- if (HexagonMCErrInfo::CHECK_WARN_CURRENT & Warn)
- Warning(IDLoc, "register `" + R + "' used with `.cur' "
- "but not used in the same packet");
- else if (HexagonMCErrInfo::CHECK_WARN_TEMPORARY & Warn)
- Warning(IDLoc, "register `" + R + "' used with `.tmp' "
- "but not used in the same packet");
- }
- }
-
if (CheckOk) {
- MCB.setLoc(IDLoc);
if (HexagonMCInstrInfo::bundleSize(MCB) == 0) {
assert(!HexagonMCInstrInfo::isInnerLoop(MCB));
assert(!HexagonMCInstrInfo::isOuterLoop(MCB));
@@ -749,10 +499,6 @@ bool HexagonAsmParser::matchBundleOptions() {
HexagonMCInstrInfo::setInnerLoop(MCB);
else if (Option.compare_lower("endloop1") == 0)
HexagonMCInstrInfo::setOuterLoop(MCB);
- else if (Option.compare_lower("mem_noshuf") == 0)
- HexagonMCInstrInfo::setMemReorderDisabled(MCB);
- else if (Option.compare_lower("mem_shuf") == 0)
- HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB);
else
return true;
Lex();
@@ -770,8 +516,7 @@ void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) {
int64_t Value (I.getImm());
NewInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
MCConstantExpr::create(Value, getContext()), getContext())));
- }
- else {
+ } else {
if (I.isExpr() && cast<HexagonMCExpr>(I.getExpr())->signMismatch() &&
WarnSignedMismatch)
Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
@@ -1066,6 +811,9 @@ bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) {
// validate register against architecture
bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const {
+ if (HexagonMCRegisterClasses[Hexagon::V62RegsRegClassID].contains(MatchNum))
+ if (!getSTI().getFeatureBits()[Hexagon::ArchV62])
+ return false;
return true;
}
@@ -1171,11 +919,15 @@ bool HexagonAsmParser::parseOperand(OperandVector &Operands) {
bool HexagonAsmParser::isLabel(AsmToken &Token) {
MCAsmLexer &Lexer = getLexer();
AsmToken const &Second = Lexer.getTok();
- AsmToken Third = Lexer.peekTok();
+ AsmToken Third = Lexer.peekTok();
StringRef String = Token.getString();
if (Token.is(AsmToken::TokenKind::LCurly) ||
Token.is(AsmToken::TokenKind::RCurly))
return false;
+ // special case for parsing vwhist256:sat
+ if (String.lower() == "vwhist256" && Second.is(AsmToken::Colon) &&
+ Third.getString().lower() == "sat")
+ return false;
if (!Token.is(AsmToken::TokenKind::Identifier))
return true;
if (!matchRegister(String.lower()))
@@ -1540,13 +1292,13 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
case Hexagon::A2_iconst: {
Inst.setOpcode(Hexagon::A2_addi);
MCOperand Reg = Inst.getOperand(0);
- MCOperand S16 = Inst.getOperand(1);
- HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
- HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+ MCOperand S27 = Inst.getOperand(1);
+ HexagonMCInstrInfo::setMustNotExtend(*S27.getExpr());
+ HexagonMCInstrInfo::setS27_2_reloc(*S27.getExpr());
Inst.clear();
Inst.addOperand(Reg);
Inst.addOperand(MCOperand::createReg(Hexagon::R0));
- Inst.addOperand(S16);
+ Inst.addOperand(S27);
break;
}
case Hexagon::M4_mpyrr_addr:
@@ -1661,6 +1413,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
// Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) "
case Hexagon::CONST32:
is32bit = true;
+ LLVM_FALLTHROUGH;
// Translate a "$Rx:y = CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) "
case Hexagon::CONST64:
// FIXME: need better way to detect AsmStreamer (upstream removed getKind())
@@ -1756,8 +1509,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
TmpInst.setOpcode(Hexagon::L2_loadrdgp);
TmpInst.addOperand(MO_0);
- TmpInst.addOperand(
- MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext())));
+ TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+ MCSymbolRefExpr::create(Sym, getContext()), getContext())));
Inst = TmpInst;
}
}
@@ -2142,6 +1895,67 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
Inst = TmpInst;
break;
}
+ case Hexagon::PS_loadrubabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrubgp);
+ break;
+ case Hexagon::PS_loadrbabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrbgp);
+ break;
+ case Hexagon::PS_loadruhabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadruhgp);
+ break;
+ case Hexagon::PS_loadrhabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrhgp);
+ break;
+ case Hexagon::PS_loadriabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrigp);
+ break;
+ case Hexagon::PS_loadrdabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+ Inst.setOpcode(Hexagon::L2_loadrdgp);
+ break;
+ case Hexagon::PS_storerbabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerbgp);
+ break;
+ case Hexagon::PS_storerhabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerhgp);
+ break;
+ case Hexagon::PS_storerfabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerfgp);
+ break;
+ case Hexagon::PS_storeriabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerigp);
+ break;
+ case Hexagon::PS_storerdabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerdgp);
+ break;
+ case Hexagon::PS_storerbnewabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerbnewgp);
+ break;
+ case Hexagon::PS_storerhnewabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerhnewgp);
+ break;
+ case Hexagon::PS_storerinewabs:
+ if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+ Inst.setOpcode(Hexagon::S2_storerinewgp);
+ break;
+ case Hexagon::A2_zxtb: {
+ Inst.setOpcode(Hexagon::A2_andir);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, Context)));
+ break;
+ }
} // switch
return Match_Success;
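The A2_zxtb case relies on the identity zxtb(Rs) == and(Rs, #255): zero-extending the low byte is the same as masking with 0xff, so the parser only retargets the opcode and appends the immediate. In plain C++ terms:

    uint32_t zxtb(uint32_t X) { return X & 0xff; } // same as and(X, #255)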
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
index 963fb99..5b02aa3 100644
--- a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp
@@ -65,9 +65,9 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include <iterator>
#include <cassert>
#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -317,6 +317,15 @@ bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
return true;
}
+BT::RegisterCell &BT::RegisterCell::regify(unsigned R) {
+ for (unsigned i = 0, n = width(); i < n; ++i) {
+ const BitValue &V = Bits[i];
+ if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
+ Bits[i].RefI = BitRef(R, i);
+ }
+ return *this;
+}
+
uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
// The general problem is with finding a register class that corresponds
// to a given reference reg:sub. There can be several such classes, and
@@ -338,7 +347,7 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
unsigned PhysS = (RR.Sub == 0) ? PhysR : TRI.getSubReg(PhysR, RR.Sub);
const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(PhysS);
- uint16_t BW = RC->getSize()*8;
+ uint16_t BW = TRI.getRegSizeInBits(*RC);
return BW;
}
@@ -378,12 +387,7 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
return;
assert(RR.Sub == 0 && "Unexpected sub-register in definition");
// Eliminate all ref-to-reg-0 bit values: replace them with "self".
- for (unsigned i = 0, n = RC.width(); i < n; ++i) {
- const BitValue &V = RC[i];
- if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
- RC[i].RefI = BitRef(RR.Reg, i);
- }
- M[RR.Reg] = RC;
+ M[RR.Reg] = RC.regify(RR.Reg);
}
// Check if the cell represents a compile-time integer value.
@@ -1007,12 +1011,7 @@ void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
bool BT::reached(const MachineBasicBlock *B) const {
int BN = B->getNumber();
assert(BN >= 0);
- for (EdgeSetType::iterator I = EdgeExec.begin(), E = EdgeExec.end();
- I != E; ++I) {
- if (I->second == BN)
- return true;
- }
- return false;
+ return ReachedBB.count(BN);
}
// Visit an individual instruction. This could be a newly added instruction,
@@ -1032,6 +1031,8 @@ void BT::reset() {
EdgeExec.clear();
InstrExec.clear();
Map.clear();
+ ReachedBB.clear();
+ ReachedBB.reserve(MF.size());
}
void BT::run() {
@@ -1064,6 +1065,7 @@ void BT::run() {
if (EdgeExec.count(Edge))
continue;
EdgeExec.insert(Edge);
+ ReachedBB.insert(Edge.second);
const MachineBasicBlock &B = *MF.getBlockNumbered(Edge.second);
MachineBasicBlock::const_iterator It = B.begin(), End = B.end();
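reached() used to scan every executed CFG edge; it is now an O(1) membership test against ReachedBB, a DenseSet of block numbers filled in the first time an edge into a block executes (reserve(MF.size()) pre-sizes it to the number of blocks). The caching pattern, sketched:

    // First execution of edge E also records its target block.
    if (!EdgeExec.count(E)) {
      EdgeExec.insert(E);
      ReachedBB.insert(E.second); // reached(B) is now a set lookup
    }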
diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
index 48c5f22..7f49f43 100644
--- a/contrib/llvm/lib/Target/Hexagon/BitTracker.h
+++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
#define LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -68,10 +69,11 @@ private:
typedef std::set<const MachineInstr *> InstrSetType;
typedef std::queue<CFGEdge> EdgeQueueType;
- EdgeSetType EdgeExec; // Executable flow graph edges.
- InstrSetType InstrExec; // Executable instructions.
- EdgeQueueType FlowQ; // Work queue of CFG edges.
- bool Trace; // Enable tracing for debugging.
+ EdgeSetType EdgeExec; // Executable flow graph edges.
+ InstrSetType InstrExec; // Executable instructions.
+ EdgeQueueType FlowQ; // Work queue of CFG edges.
+ DenseSet<unsigned> ReachedBB; // Cache of reached blocks.
+ bool Trace; // Enable tracing for debugging.
const MachineEvaluator &ME;
MachineFunction &MF;
@@ -283,6 +285,9 @@ struct BitTracker::RegisterCell {
return !operator==(RC);
}
+ // Replace the ref-to-reg-0 bit values with the given register.
+ RegisterCell &regify(unsigned R);
+
// Generate a "ref" cell for the corresponding register. In the resulting
// cell each bit will be described as being the same as the corresponding
// bit in register Reg (i.e. the cell is "defined" by register Reg).
diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index c05fbc1..586220d 100644
--- a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -12,12 +12,12 @@
#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCChecker.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
@@ -25,8 +25,8 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -57,11 +57,38 @@ public:
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
-
- void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const;
void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const;
};
+namespace {
+ uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
+ int64_t Value) {
+ MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
+ MCB, HexagonMCInstrInfo::bundleSize(MCB));
+ if (!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
+ return Value;
+ unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+ uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
+ int64_t Bits;
+ bool Success = Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits);
+ assert(Success); (void)Success;
+ uint32_t Upper26 = static_cast<uint32_t>(Bits);
+ uint32_t Operand = Upper26 | Lower6;
+ return Operand;
+ }
+ HexagonDisassembler const &disassembler(void const *Decoder) {
+ return *static_cast<HexagonDisassembler const *>(Decoder);
+ }
+ template <size_t T>
+ void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
+ HexagonDisassembler const &Disassembler = disassembler(Decoder);
+ int64_t FullValue =
+ fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI,
+ SignExtend64<T>(tmp));
+ int64_t Extended = SignExtend64<32>(FullValue);
+ HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
+ }
+}
} // end anonymous namespace
// Forward declare these because the auto-generated code will reference them.
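fullValue reassembles an extended immediate: the preceding constant extender (A4_ext) carries bits [31:6] with the low six bits cleared, while the extended instruction's own field contributes the low six bits, unscaled by the extent alignment. A worked example with Alignment == 0:

    uint32_t Lower6  = 0x00012345 & 0x3f; // bits kept in the instruction
    uint32_t Upper26 = 0x00012340;        // payload of the A4_ext
    uint32_t Operand = Upper26 | Lower6;  // 0x00012345 again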
@@ -70,6 +97,10 @@ public:
static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -79,6 +110,9 @@ static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
+static DecodeStatus
+DecodeGeneralDoubleLow8RegsRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -98,31 +132,10 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
-static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn);
-static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
- void const *Decoder);
-
-static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
- raw_ostream &os);
-
-static unsigned getRegFromSubinstEncoding(unsigned encoded_reg);
-
static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
uint64_t Address, const void *Decoder);
-static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
+static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t /*Address*/, const void *Decoder);
static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
@@ -135,13 +148,12 @@ static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
-static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
- const void *Decoder);
-static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
const void *Decoder);
+#include "HexagonDepDecoders.h"
#include "HexagonGenDisassemblerTables.inc"
static MCDisassembler *createHexagonDisassembler(const Target &T,
@@ -175,20 +187,32 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Size += HEXAGON_INSTR_SIZE;
Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
}
- if(Result == MCDisassembler::Fail)
+ if (Result == MCDisassembler::Fail)
return Result;
- HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo());
- if(!Checker.check())
+ if (Size > HEXAGON_MAX_PACKET_SIZE)
+ return MCDisassembler::Fail;
+ HexagonMCChecker Checker(getContext(), *MCII, STI, MI,
+ *getContext().getRegisterInfo(), false);
+ if (!Checker.check())
return MCDisassembler::Fail;
return MCDisassembler::Success;
}
-static HexagonDisassembler const &disassembler(void const *Decoder) {
- return *static_cast<HexagonDisassembler const *>(Decoder);
+namespace {
+void adjustDuplex(MCInst &MI, MCContext &Context) {
+ switch (MI.getOpcode()) {
+ case Hexagon::SA1_setin1:
+ MI.insert(MI.begin() + 1,
+ MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+ break;
+ case Hexagon::SA1_dec:
+ MI.insert(MI.begin() + 2,
+ MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+ break;
+ default:
+ break;
+ }
}
-
-static MCContext &contextFromDecoder(void const *Decoder) {
- return disassembler(Decoder).getContext();
}
DecodeStatus HexagonDisassembler::getSingleInstruction(
@@ -196,8 +220,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
raw_ostream &os, raw_ostream &cs, bool &Complete) const {
assert(Bytes.size() >= HEXAGON_INSTR_SIZE);
- uint32_t Instruction =
- (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+ uint32_t Instruction = support::endian::read32le(Bytes.data());
auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB);
if ((Instruction & HexagonII::INST_PARSE_MASK) ==
@@ -210,103 +233,92 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
return DecodeStatus::Fail;
}
- DecodeStatus Result = DecodeStatus::Success;
+ MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
+ MCB, HexagonMCInstrInfo::bundleSize(MCB));
+
+ DecodeStatus Result = DecodeStatus::Fail;
if ((Instruction & HexagonII::INST_PARSE_MASK) ==
HexagonII::INST_PARSE_DUPLEX) {
- // Determine the instruction class of each instruction in the duplex.
- unsigned duplexIClass, IClassLow, IClassHigh;
-
+ unsigned duplexIClass;
+ uint8_t const *DecodeLow, *DecodeHigh;
duplexIClass = ((Instruction >> 28) & 0xe) | ((Instruction >> 13) & 0x1);
switch (duplexIClass) {
default:
return MCDisassembler::Fail;
case 0:
- IClassLow = HexagonII::HSIG_L1;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_L132;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 1:
- IClassLow = HexagonII::HSIG_L2;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_L232;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 2:
- IClassLow = HexagonII::HSIG_L2;
- IClassHigh = HexagonII::HSIG_L2;
+ DecodeLow = DecoderTableSUBINSN_L232;
+ DecodeHigh = DecoderTableSUBINSN_L232;
break;
case 3:
- IClassLow = HexagonII::HSIG_A;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_A32;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 4:
- IClassLow = HexagonII::HSIG_L1;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_L132;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 5:
- IClassLow = HexagonII::HSIG_L2;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_L232;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 6:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 7:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_A;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_A32;
break;
case 8:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 9:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_L2;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_L232;
break;
case 10:
- IClassLow = HexagonII::HSIG_S1;
- IClassHigh = HexagonII::HSIG_S1;
+ DecodeLow = DecoderTableSUBINSN_S132;
+ DecodeHigh = DecoderTableSUBINSN_S132;
break;
case 11:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_S1;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_S132;
break;
case 12:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_L1;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_L132;
break;
case 13:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_L2;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_L232;
break;
case 14:
- IClassLow = HexagonII::HSIG_S2;
- IClassHigh = HexagonII::HSIG_S2;
+ DecodeLow = DecoderTableSUBINSN_S232;
+ DecodeHigh = DecoderTableSUBINSN_S232;
break;
}
-
- // Set the MCInst to be a duplex instruction. Which one doesn't matter.
- MI.setOpcode(Hexagon::DuplexIClass0);
-
- // Decode each instruction in the duplex.
- // Create an MCInst for each instruction.
- unsigned instLow = Instruction & 0x1fff;
- unsigned instHigh = (Instruction >> 16) & 0x1fff;
- unsigned opLow;
- if (GetSubinstOpcode(IClassLow, instLow, opLow, os) !=
- MCDisassembler::Success)
- return MCDisassembler::Fail;
- unsigned opHigh;
- if (GetSubinstOpcode(IClassHigh, instHigh, opHigh, os) !=
- MCDisassembler::Success)
- return MCDisassembler::Fail;
+ MI.setOpcode(Hexagon::DuplexIClass0 + duplexIClass);
MCInst *MILow = new (getContext()) MCInst;
- MILow->setOpcode(opLow);
MCInst *MIHigh = new (getContext()) MCInst;
- MIHigh->setOpcode(opHigh);
- addSubinstOperands(MILow, opLow, instLow);
- addSubinstOperands(MIHigh, opHigh, instHigh);
- // see ConvertToSubInst() in
- // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
-
- // Add the duplex instruction MCInsts as operands to the passed in MCInst.
+ Result = decodeInstruction(DecodeLow, *MILow, Instruction & 0x1fff, Address,
+ this, STI);
+ if (Result != DecodeStatus::Success)
+ return DecodeStatus::Fail;
+ adjustDuplex(*MILow, getContext());
+ Result = decodeInstruction(
+ DecodeHigh, *MIHigh, (Instruction >> 16) & 0x1fff, Address, this, STI);
+ if (Result != DecodeStatus::Success)
+ return DecodeStatus::Fail;
+ adjustDuplex(*MIHigh, getContext());
MCOperand OPLow = MCOperand::createInst(MILow);
MCOperand OPHigh = MCOperand::createInst(MIHigh);
MI.addOperand(OPLow);
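
The duplex path above carves one 32-bit word into two 13-bit sub-instructions plus a 4-bit iclass assembled from bits 31:29 and bit 13. A minimal, self-contained sketch of that field extraction, with a made-up encoding:

    // Sketch: carving the duplex fields out of one 32-bit word, mirroring
    // the shifts in getSingleInstruction above. Encoding value is made up.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Instruction = 0x2a153123;                // hypothetical duplex word
      unsigned IClass = ((Instruction >> 28) & 0xe) | ((Instruction >> 13) & 0x1);
      unsigned InstLow = Instruction & 0x1fff;          // bits 12:0, low subinsn
      unsigned InstHigh = (Instruction >> 16) & 0x1fff; // bits 28:16, high subinsn
      std::printf("iclass=%u low=0x%x high=0x%x\n", IClass, InstLow, InstHigh);
      return 0;
    }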
@@ -316,34 +328,23 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
if ((Instruction & HexagonII::INST_PARSE_MASK) ==
HexagonII::INST_PARSE_PACKET_END)
Complete = true;
- // Calling the auto-generated decoder function.
- Result =
- decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI);
- // If a "standard" insn isn't found, check special cases.
- if (MCDisassembler::Success != Result ||
- MI.getOpcode() == Hexagon::A4_ext) {
- Result = decodeImmext(MI, Instruction, this);
- if (MCDisassembler::Success != Result) {
- Result = decodeSpecial(MI, Instruction);
- }
- } else {
- // If the instruction is a compound instruction, register values will
- // follow the duplex model, so the register values in the MCInst are
- // incorrect. If the instruction is a compound, loop through the
- // operands and change registers appropriately.
- if (HexagonMCInstrInfo::getType(*MCII, MI) == HexagonII::TypeCOMPOUND) {
- for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) {
- if (i->isReg()) {
- unsigned reg = i->getReg() - Hexagon::R0;
- i->setReg(getRegFromSubinstEncoding(reg));
- }
- }
- }
- }
+ if (Extender != nullptr)
+ Result = decodeInstruction(DecoderTableMustExtend32, MI, Instruction,
+ Address, this, STI);
+
+ if (Result != MCDisassembler::Success)
+ Result = decodeInstruction(DecoderTable32, MI, Instruction, Address, this,
+ STI);
+
+ if (Result != MCDisassembler::Success &&
+ STI.getFeatureBits()[Hexagon::ExtensionHVX])
+ Result = decodeInstruction(DecoderTableEXT_mmvec32, MI, Instruction,
+ Address, this, STI);
+
}
- switch(MI.getOpcode()) {
+ switch (MI.getOpcode()) {
case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
case Hexagon::J4_cmpeqn1_f_jumpnv_t:
case Hexagon::J4_cmpeqn1_fp0_jump_nt:
@@ -368,7 +369,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
case Hexagon::J4_cmpgtn1_tp0_jump_t:
case Hexagon::J4_cmpgtn1_tp1_jump_nt:
case Hexagon::J4_cmpgtn1_tp1_jump_t:
- MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
+ MI.insert(MI.begin() + 1,
+ MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
break;
default:
break;
@@ -423,13 +425,10 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
return MCDisassembler::Fail;
}
- adjustExtendedInstructions(MI, MCB);
- MCInst const *Extender =
- HexagonMCInstrInfo::extenderForIndex(MCB,
- HexagonMCInstrInfo::bundleSize(MCB));
- if(Extender != nullptr) {
- MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ?
- *MI.getOperand(1).getInst() : MI;
+ if (Extender != nullptr) {
+ MCInst const &Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI)
+ ? *MI.getOperand(1).getInst()
+ : MI;
if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) &&
!HexagonMCInstrInfo::isExtended(*MCII, Inst))
return MCDisassembler::Fail;
@@ -437,68 +436,6 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
return Result;
}
-void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI,
- MCInst const &MCB) const {
- if (!HexagonMCInstrInfo::hasExtenderForIndex(
- MCB, HexagonMCInstrInfo::bundleSize(MCB))) {
- unsigned opcode;
- // This code is used by the disassembler to disambiguate between GP
- // relative and absolute addressing instructions since they both have
- // the same encoding bits. However, an absolute addressing instruction
- // must follow an immediate extender. The disassembler always selects
- // absolute addressing instructions first and uses this code to change
- // them into GP relative instructions in the absence of the corresponding
- // immediate extender.
- switch (MCI.getOpcode()) {
- case Hexagon::PS_storerbabs:
- opcode = Hexagon::S2_storerbgp;
- break;
- case Hexagon::PS_storerhabs:
- opcode = Hexagon::S2_storerhgp;
- break;
- case Hexagon::PS_storerfabs:
- opcode = Hexagon::S2_storerfgp;
- break;
- case Hexagon::PS_storeriabs:
- opcode = Hexagon::S2_storerigp;
- break;
- case Hexagon::PS_storerbnewabs:
- opcode = Hexagon::S2_storerbnewgp;
- break;
- case Hexagon::PS_storerhnewabs:
- opcode = Hexagon::S2_storerhnewgp;
- break;
- case Hexagon::PS_storerinewabs:
- opcode = Hexagon::S2_storerinewgp;
- break;
- case Hexagon::PS_storerdabs:
- opcode = Hexagon::S2_storerdgp;
- break;
- case Hexagon::PS_loadrbabs:
- opcode = Hexagon::L2_loadrbgp;
- break;
- case Hexagon::PS_loadrubabs:
- opcode = Hexagon::L2_loadrubgp;
- break;
- case Hexagon::PS_loadrhabs:
- opcode = Hexagon::L2_loadrhgp;
- break;
- case Hexagon::PS_loadruhabs:
- opcode = Hexagon::L2_loadruhgp;
- break;
- case Hexagon::PS_loadriabs:
- opcode = Hexagon::L2_loadrigp;
- break;
- case Hexagon::PS_loadrdabs:
- opcode = Hexagon::L2_loadrdgp;
- break;
- default:
- opcode = MCI.getOpcode();
- }
- MCI.setOpcode(opcode);
- }
-}
-
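
The helper removed above encoded a single rule: absolute and GP-relative loads/stores share encoding bits, and the absolute form is only legal behind an immediate extender. A self-contained sketch of that rule, with stand-in opcode values rather than the real Hexagon enums, and only two of the opcode pairs shown:

    // Sketch of the retired disambiguation rule: with no extender in the
    // bundle, an "abs" opcode really denotes its GP-relative counterpart.
    // Opcode values are stand-ins, not the real Hexagon enums.
    #include <cassert>

    enum Opcode { PS_storerbabs, S2_storerbgp, PS_loadriabs, L2_loadrigp };

    static Opcode disambiguate(Opcode Op, bool HasExtender) {
      if (HasExtender)
        return Op;                    // ##imm present: keep the absolute form
      switch (Op) {
      case PS_storerbabs: return S2_storerbgp;
      case PS_loadriabs:  return L2_loadrigp;
      default:            return Op;  // remaining pairs as in the switch above
      }
    }

    int main() {
      assert(disambiguate(PS_storerbabs, true) == PS_storerbabs);
      assert(disambiguate(PS_storerbabs, false) == S2_storerbgp);
      return 0;
    }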
static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
ArrayRef<MCPhysReg> Table) {
if (RegNo < Table.size()) {
@@ -530,6 +467,20 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable);
}
+static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ static const MCPhysReg GeneralSubRegDecoderTable[] = {
+ Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3,
+ Hexagon::R4, Hexagon::R5, Hexagon::R6, Hexagon::R7,
+ Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+ Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23,
+ };
+
+ return DecodeRegisterClass(Inst, RegNo, GeneralSubRegDecoderTable);
+}
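
The table above reproduces, in data form, the mapping that the arithmetic helper getRegFromSubinstEncoding (deleted further down) computed: sub-instruction register fields 0-7 select R0-R7 and fields 8-15 select R16-R23. A standalone check of the correspondence:

    // Sketch: encoded sub-register field N names R(N) for N < 8 and
    // R(N + 8) otherwise -- exactly the sixteen entries listed above.
    #include <cassert>

    static unsigned subRegNumber(unsigned N) {
      return N < 8 ? N : N + 8;
    }

    int main() {
      assert(subRegNumber(3) == 3);    // field 3  -> R3
      assert(subRegNumber(8) == 16);   // field 8  -> R16
      assert(subRegNumber(15) == 23);  // field 15 -> R23
      return 0;
    }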
+
static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
@@ -557,6 +508,15 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable);
}
+static DecodeStatus DecodeGeneralDoubleLow8RegsRegisterClass(
+ MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) {
+ static const MCPhysReg GeneralDoubleLow8RegDecoderTable[] = {
+ Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3,
+ Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11};
+
+ return DecodeRegisterClass(Inst, RegNo, GeneralDoubleLow8RegDecoderTable);
+}
+
static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
@@ -590,17 +550,23 @@ static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
+ using namespace Hexagon;
static const MCPhysReg CtrlRegDecoderTable[] = {
- Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
- Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7,
- Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
- Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC
+ /* 0 */ SA0, LC0, SA1, LC1,
+ /* 4 */ P3_0, C5, M0, M1,
+ /* 8 */ USR, PC, UGP, GP,
+ /* 12 */ CS0, CS1, UPCYCLELO, UPCYCLEHI,
+ /* 16 */ FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI,
+ /* 20 */ 0, 0, 0, 0,
+ /* 24 */ 0, 0, 0, 0,
+ /* 28 */ 0, 0, UTIMERLO, UTIMERHI
};
if (RegNo >= array_lengthof(CtrlRegDecoderTable))
return MCDisassembler::Fail;
- if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
+ static_assert(NoRegister == 0, "Expecting NoRegister to be 0");
+ if (CtrlRegDecoderTable[RegNo] == NoRegister)
return MCDisassembler::Fail;
unsigned Register = CtrlRegDecoderTable[RegNo];
@@ -611,20 +577,23 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
+ using namespace Hexagon;
static const MCPhysReg CtrlReg64DecoderTable[] = {
- Hexagon::C1_0, Hexagon::NoRegister,
- Hexagon::C3_2, Hexagon::NoRegister,
- Hexagon::C7_6, Hexagon::NoRegister,
- Hexagon::C9_8, Hexagon::NoRegister,
- Hexagon::C11_10, Hexagon::NoRegister,
- Hexagon::CS, Hexagon::NoRegister,
- Hexagon::UPC, Hexagon::NoRegister
+ /* 0 */ C1_0, 0, C3_2, 0,
+ /* 4 */ C5_4, 0, C7_6, 0,
+ /* 8 */ C9_8, 0, C11_10, 0,
+ /* 12 */ CS, 0, UPCYCLE, 0,
+ /* 16 */ C17_16, 0, PKTCOUNT, 0,
+ /* 20 */ 0, 0, 0, 0,
+ /* 24 */ 0, 0, 0, 0,
+ /* 28 */ 0, 0, UTIMER, 0
};
if (RegNo >= array_lengthof(CtrlReg64DecoderTable))
return MCDisassembler::Fail;
- if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+ static_assert(NoRegister == 0, "Expecting NoRegister to be 0");
+ if (CtrlReg64DecoderTable[RegNo] == NoRegister)
return MCDisassembler::Fail;
unsigned Register = CtrlReg64DecoderTable[RegNo];
@@ -650,132 +619,23 @@ static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
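
The rewritten control-register tables in this hunk use 0 for reserved slots, and the static_assert pins Hexagon::NoRegister to 0 so the sentinel test stays sound. A standalone sketch of the pattern, with stand-in register values:

    // Sketch: sparse decode table with 0 as the "no register" sentinel.
    #include <cassert>

    enum : unsigned { NoRegister = 0, SA0 = 1, LC0 = 2, UTIMERLO = 3 };

    int main() {
      static const unsigned Table[] = {SA0, LC0, 0, UTIMERLO};
      static_assert(NoRegister == 0, "Expecting NoRegister to be 0");
      unsigned RegNo = 2;
      assert(RegNo < sizeof(Table) / sizeof(Table[0]));
      assert(Table[RegNo] == NoRegister);  // reserved slot -> decode fails
      return 0;
    }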
-static uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
- int64_t Value) {
- MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
- MCB, HexagonMCInstrInfo::bundleSize(MCB));
- if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
- return Value;
- unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
- uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
- int64_t Bits;
- bool Success = Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits);
- assert(Success);(void)Success;
- uint32_t Upper26 = static_cast<uint32_t>(Bits);
- uint32_t Operand = Upper26 | Lower6;
- return Operand;
-}
-
-template <size_t T>
-static void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
- HexagonDisassembler const &Disassembler = disassembler(Decoder);
- int64_t FullValue = fullValue(*Disassembler.MCII,
- **Disassembler.CurrentBundle,
- MI, SignExtend64<T>(tmp));
- int64_t Extended = SignExtend64<32>(FullValue);
- HexagonMCInstrInfo::addConstant(MI, Extended,
- Disassembler.getContext());
-}
-
static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/,
const void *Decoder) {
HexagonDisassembler const &Disassembler = disassembler(Decoder);
- int64_t FullValue = fullValue(*Disassembler.MCII,
- **Disassembler.CurrentBundle,
- MI, tmp);
+ int64_t FullValue =
+ fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI, tmp);
assert(FullValue >= 0 && "Negative in unsigned decoder");
HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext());
return MCDisassembler::Success;
}
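
fullValue (its old definition is deleted just above) splices a constant extender with the instruction's own immediate: the extender supplies the upper 26 bits, already shifted left by 6, and the instruction keeps the low 6 bits of its operand after alignment. A self-contained sketch with made-up values:

    // Sketch: composing an extended immediate as the deleted fullValue did.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Upper26 = 0x00012340;  // extender payload (low 6 bits clear)
      int64_t Value = 0x2f;           // instruction's raw immediate
      unsigned Alignment = 0;         // extent alignment of this operand
      uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
      uint32_t Operand = Upper26 | Lower6;
      std::printf("0x%x\n", Operand); // prints 0x1236f
      return 0;
    }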
-static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<16>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<12>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<11>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<13>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<14>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<10>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/,
- const void *Decoder) {
- signedDecoder<8>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<6>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<4>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<5>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<6>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<7>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<10>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-
-static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t /*Address*/, const void *Decoder) {
- signedDecoder<19>(MI, tmp, Decoder);
+ HexagonDisassembler const &Disassembler = disassembler(Decoder);
+ unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI);
+ tmp = SignExtend64(tmp, Bits);
+ signedDecoder<32>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
@@ -787,838 +647,13 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
// r13_2 is not extendable, so if there are no extent bits, it's r13_2
if (Bits == 0)
Bits = 15;
- uint32_t FullValue = fullValue(*Disassembler.MCII,
- **Disassembler.CurrentBundle,
- MI, SignExtend64(tmp, Bits));
+ uint32_t FullValue =
+ fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI,
+ SignExtend64(tmp, Bits));
int64_t Extended = SignExtend64<32>(FullValue) + Address;
- if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true,
- 0, 4))
+ if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, 0, 4))
HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
return MCDisassembler::Success;
}
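
brtargetDecoder then rebases the possibly-extended offset on the packet address. The arithmetic, sketched with made-up numbers:

    // Sketch: branch target = sign-extended 32-bit offset + packet address.
    #include <cstdint>
    #include <cstdio>

    static int64_t signExtend32(uint32_t X) {
      return static_cast<int32_t>(X);  // equivalent of SignExtend64<32>
    }

    int main() {
      uint64_t Address = 0x1000;       // hypothetical packet PC
      uint32_t FullValue = 0xfffffff8; // offset after extension: -8
      int64_t Extended = signExtend32(FullValue) + static_cast<int64_t>(Address);
      std::printf("0x%llx\n", static_cast<unsigned long long>(Extended)); // 0xff8
      return 0;
    }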
-// Addressing mode dependent load store opcode map.
-// - If an insn is preceded by an extender the address is absolute.
-// - memw(##symbol) = r0
-// - If an insn is not preceded by an extender the address is GP relative.
-// - memw(gp + #symbol) = r0
-// Note that the instructions must be ordered in descending order of their
-// opcodes.
-// HexagonII::INST_ICLASS_ST
-static const unsigned int StoreConditionalOpcodeData[][2] = {
- {S4_pstorerdfnew_abs, 0xafc02084},
- {S4_pstorerdtnew_abs, 0xafc02080},
- {S4_pstorerdf_abs, 0xafc00084},
- {S4_pstorerdt_abs, 0xafc00080},
- {S4_pstorerinewfnew_abs, 0xafa03084},
- {S4_pstorerinewtnew_abs, 0xafa03080},
- {S4_pstorerhnewfnew_abs, 0xafa02884},
- {S4_pstorerhnewtnew_abs, 0xafa02880},
- {S4_pstorerbnewfnew_abs, 0xafa02084},
- {S4_pstorerbnewtnew_abs, 0xafa02080},
- {S4_pstorerinewf_abs, 0xafa01084},
- {S4_pstorerinewt_abs, 0xafa01080},
- {S4_pstorerhnewf_abs, 0xafa00884},
- {S4_pstorerhnewt_abs, 0xafa00880},
- {S4_pstorerbnewf_abs, 0xafa00084},
- {S4_pstorerbnewt_abs, 0xafa00080},
- {S4_pstorerifnew_abs, 0xaf802084},
- {S4_pstoreritnew_abs, 0xaf802080},
- {S4_pstorerif_abs, 0xaf800084},
- {S4_pstorerit_abs, 0xaf800080},
- {S4_pstorerhfnew_abs, 0xaf402084},
- {S4_pstorerhtnew_abs, 0xaf402080},
- {S4_pstorerhf_abs, 0xaf400084},
- {S4_pstorerht_abs, 0xaf400080},
- {S4_pstorerbfnew_abs, 0xaf002084},
- {S4_pstorerbtnew_abs, 0xaf002080},
- {S4_pstorerbf_abs, 0xaf000084},
- {S4_pstorerbt_abs, 0xaf000080}};
-// HexagonII::INST_ICLASS_LD
-
-// HexagonII::INST_ICLASS_LD_ST_2
-static unsigned int LoadStoreOpcodeData[][2] = {{PS_loadrdabs, 0x49c00000},
- {PS_loadriabs, 0x49800000},
- {PS_loadruhabs, 0x49600000},
- {PS_loadrhabs, 0x49400000},
- {PS_loadrubabs, 0x49200000},
- {PS_loadrbabs, 0x49000000},
- {PS_storerdabs, 0x48c00000},
- {PS_storerinewabs, 0x48a01000},
- {PS_storerhnewabs, 0x48a00800},
- {PS_storerbnewabs, 0x48a00000},
- {PS_storeriabs, 0x48800000},
- {PS_storerfabs, 0x48600000},
- {PS_storerhabs, 0x48400000},
- {PS_storerbabs, 0x48000000}};
-static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData);
-static const size_t NumLS = array_lengthof(LoadStoreOpcodeData);
-
-static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) {
- unsigned MachineOpcode = 0;
- unsigned LLVMOpcode = 0;
-
- if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) {
- for (size_t i = 0; i < NumCondS; ++i) {
- if ((insn & StoreConditionalOpcodeData[i][1]) ==
- StoreConditionalOpcodeData[i][1]) {
- MachineOpcode = StoreConditionalOpcodeData[i][1];
- LLVMOpcode = StoreConditionalOpcodeData[i][0];
- break;
- }
- }
- }
- if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) {
- for (size_t i = 0; i < NumLS; ++i) {
- if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) {
- MachineOpcode = LoadStoreOpcodeData[i][1];
- LLVMOpcode = LoadStoreOpcodeData[i][0];
- break;
- }
- }
- }
-
- if (MachineOpcode) {
- unsigned Value = 0;
- unsigned shift = 0;
- MI.setOpcode(LLVMOpcode);
- // Remove the parse bits from the insn.
- insn &= ~HexagonII::INST_PARSE_MASK;
-
- switch (LLVMOpcode) {
- default:
- return MCDisassembler::Fail;
- break;
-
- case Hexagon::S4_pstorerdf_abs:
- case Hexagon::S4_pstorerdt_abs:
- case Hexagon::S4_pstorerdfnew_abs:
- case Hexagon::S4_pstorerdtnew_abs:
- // op: Pv
- Value = insn & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 12) & UINT64_C(48);
- Value |= (insn >> 3) & UINT64_C(15);
- MI.addOperand(MCOperand::createImm(Value));
- // op: Rtt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- case Hexagon::S4_pstorerbnewf_abs:
- case Hexagon::S4_pstorerbnewt_abs:
- case Hexagon::S4_pstorerbnewfnew_abs:
- case Hexagon::S4_pstorerbnewtnew_abs:
- case Hexagon::S4_pstorerhnewf_abs:
- case Hexagon::S4_pstorerhnewt_abs:
- case Hexagon::S4_pstorerhnewfnew_abs:
- case Hexagon::S4_pstorerhnewtnew_abs:
- case Hexagon::S4_pstorerinewf_abs:
- case Hexagon::S4_pstorerinewt_abs:
- case Hexagon::S4_pstorerinewfnew_abs:
- case Hexagon::S4_pstorerinewtnew_abs:
- // op: Pv
- Value = insn & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 12) & UINT64_C(48);
- Value |= (insn >> 3) & UINT64_C(15);
- MI.addOperand(MCOperand::createImm(Value));
- // op: Nt
- Value = (insn >> 8) & UINT64_C(7);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- case Hexagon::S4_pstorerbf_abs:
- case Hexagon::S4_pstorerbt_abs:
- case Hexagon::S4_pstorerbfnew_abs:
- case Hexagon::S4_pstorerbtnew_abs:
- case Hexagon::S4_pstorerhf_abs:
- case Hexagon::S4_pstorerht_abs:
- case Hexagon::S4_pstorerhfnew_abs:
- case Hexagon::S4_pstorerhtnew_abs:
- case Hexagon::S4_pstorerif_abs:
- case Hexagon::S4_pstorerit_abs:
- case Hexagon::S4_pstorerifnew_abs:
- case Hexagon::S4_pstoreritnew_abs:
- // op: Pv
- Value = insn & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 12) & UINT64_C(48);
- Value |= (insn >> 3) & UINT64_C(15);
- MI.addOperand(MCOperand::createImm(Value));
- // op: Rt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- case Hexagon::L4_ploadrdf_abs:
- case Hexagon::L4_ploadrdt_abs:
- case Hexagon::L4_ploadrdfnew_abs:
- case Hexagon::L4_ploadrdtnew_abs:
- // op: Rdd
- Value = insn & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- // op: Pt
- Value = ((insn >> 9) & UINT64_C(3));
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = ((insn >> 15) & UINT64_C(62));
- Value |= ((insn >> 8) & UINT64_C(1));
- MI.addOperand(MCOperand::createImm(Value));
- break;
-
- case Hexagon::L4_ploadrbf_abs:
- case Hexagon::L4_ploadrbt_abs:
- case Hexagon::L4_ploadrbfnew_abs:
- case Hexagon::L4_ploadrbtnew_abs:
- case Hexagon::L4_ploadrhf_abs:
- case Hexagon::L4_ploadrht_abs:
- case Hexagon::L4_ploadrhfnew_abs:
- case Hexagon::L4_ploadrhtnew_abs:
- case Hexagon::L4_ploadrubf_abs:
- case Hexagon::L4_ploadrubt_abs:
- case Hexagon::L4_ploadrubfnew_abs:
- case Hexagon::L4_ploadrubtnew_abs:
- case Hexagon::L4_ploadruhf_abs:
- case Hexagon::L4_ploadruht_abs:
- case Hexagon::L4_ploadruhfnew_abs:
- case Hexagon::L4_ploadruhtnew_abs:
- case Hexagon::L4_ploadrif_abs:
- case Hexagon::L4_ploadrit_abs:
- case Hexagon::L4_ploadrifnew_abs:
- case Hexagon::L4_ploadritnew_abs:
- // op: Rd
- Value = insn & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- // op: Pt
- Value = (insn >> 9) & UINT64_C(3);
- DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
- // op: u6
- Value = (insn >> 15) & UINT64_C(62);
- Value |= (insn >> 8) & UINT64_C(1);
- MI.addOperand(MCOperand::createImm(Value));
- break;
-
- // op: g16_2
- case (Hexagon::PS_loadriabs):
- ++shift;
- // op: g16_1
- case Hexagon::PS_loadrhabs:
- case Hexagon::PS_loadruhabs:
- ++shift;
- // op: g16_0
- case Hexagon::PS_loadrbabs:
- case Hexagon::PS_loadrubabs:
- // op: Rd
- Value |= insn & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(511);
- MI.addOperand(MCOperand::createImm(Value << shift));
- break;
-
- case Hexagon::PS_loadrdabs:
- Value = insn & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(511);
- MI.addOperand(MCOperand::createImm(Value << 3));
- break;
-
- case Hexagon::PS_storerdabs:
- // op: g16_3
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(256);
- Value |= insn & UINT64_C(255);
- MI.addOperand(MCOperand::createImm(Value << 3));
- // op: Rtt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- // op: g16_2
- case Hexagon::PS_storerinewabs:
- ++shift;
- // op: g16_1
- case Hexagon::PS_storerhnewabs:
- ++shift;
- // op: g16_0
- case Hexagon::PS_storerbnewabs:
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(256);
- Value |= insn & UINT64_C(255);
- MI.addOperand(MCOperand::createImm(Value << shift));
- // op: Nt
- Value = (insn >> 8) & UINT64_C(7);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
-
- // op: g16_2
- case Hexagon::PS_storeriabs:
- ++shift;
- // op: g16_1
- case Hexagon::PS_storerhabs:
- case Hexagon::PS_storerfabs:
- ++shift;
- // op: g16_0
- case Hexagon::PS_storerbabs:
- Value = (insn >> 11) & UINT64_C(49152);
- Value |= (insn >> 7) & UINT64_C(15872);
- Value |= (insn >> 5) & UINT64_C(256);
- Value |= insn & UINT64_C(255);
- MI.addOperand(MCOperand::createImm(Value << shift));
- // op: Rt
- Value = (insn >> 8) & UINT64_C(31);
- DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
- break;
- }
- return MCDisassembler::Success;
- }
- return MCDisassembler::Fail;
-}
-
-static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
- void const *Decoder) {
- // Instruction class for a constant extender: bits 31:28 = 0x0000
- if ((~insn & 0xf0000000) == 0xf0000000) {
- unsigned Value;
- // 27:16 High 12 bits of 26-bit extender.
- Value = (insn & 0x0fff0000) << 4;
- // 13:0 Low 14 bits of 26-bit extender.
- Value |= ((insn & 0x3fff) << 6);
- MI.setOpcode(Hexagon::A4_ext);
- HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder));
- return MCDisassembler::Success;
- }
- return MCDisassembler::Fail;
-}
-
-// These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td
-enum subInstBinaryValues {
- SA1_addi_BITS = 0x0000,
- SA1_addi_MASK = 0x1800,
- SA1_addrx_BITS = 0x1800,
- SA1_addrx_MASK = 0x1f00,
- SA1_addsp_BITS = 0x0c00,
- SA1_addsp_MASK = 0x1c00,
- SA1_and1_BITS = 0x1200,
- SA1_and1_MASK = 0x1f00,
- SA1_clrf_BITS = 0x1a70,
- SA1_clrf_MASK = 0x1e70,
- SA1_clrfnew_BITS = 0x1a50,
- SA1_clrfnew_MASK = 0x1e70,
- SA1_clrt_BITS = 0x1a60,
- SA1_clrt_MASK = 0x1e70,
- SA1_clrtnew_BITS = 0x1a40,
- SA1_clrtnew_MASK = 0x1e70,
- SA1_cmpeqi_BITS = 0x1900,
- SA1_cmpeqi_MASK = 0x1f00,
- SA1_combine0i_BITS = 0x1c00,
- SA1_combine0i_MASK = 0x1d18,
- SA1_combine1i_BITS = 0x1c08,
- SA1_combine1i_MASK = 0x1d18,
- SA1_combine2i_BITS = 0x1c10,
- SA1_combine2i_MASK = 0x1d18,
- SA1_combine3i_BITS = 0x1c18,
- SA1_combine3i_MASK = 0x1d18,
- SA1_combinerz_BITS = 0x1d08,
- SA1_combinerz_MASK = 0x1d08,
- SA1_combinezr_BITS = 0x1d00,
- SA1_combinezr_MASK = 0x1d08,
- SA1_dec_BITS = 0x1300,
- SA1_dec_MASK = 0x1f00,
- SA1_inc_BITS = 0x1100,
- SA1_inc_MASK = 0x1f00,
- SA1_seti_BITS = 0x0800,
- SA1_seti_MASK = 0x1c00,
- SA1_setin1_BITS = 0x1a00,
- SA1_setin1_MASK = 0x1e40,
- SA1_sxtb_BITS = 0x1500,
- SA1_sxtb_MASK = 0x1f00,
- SA1_sxth_BITS = 0x1400,
- SA1_sxth_MASK = 0x1f00,
- SA1_tfr_BITS = 0x1000,
- SA1_tfr_MASK = 0x1f00,
- SA1_zxtb_BITS = 0x1700,
- SA1_zxtb_MASK = 0x1f00,
- SA1_zxth_BITS = 0x1600,
- SA1_zxth_MASK = 0x1f00,
- SL1_loadri_io_BITS = 0x0000,
- SL1_loadri_io_MASK = 0x1000,
- SL1_loadrub_io_BITS = 0x1000,
- SL1_loadrub_io_MASK = 0x1000,
- SL2_deallocframe_BITS = 0x1f00,
- SL2_deallocframe_MASK = 0x1fc0,
- SL2_jumpr31_BITS = 0x1fc0,
- SL2_jumpr31_MASK = 0x1fc4,
- SL2_jumpr31_f_BITS = 0x1fc5,
- SL2_jumpr31_f_MASK = 0x1fc7,
- SL2_jumpr31_fnew_BITS = 0x1fc7,
- SL2_jumpr31_fnew_MASK = 0x1fc7,
- SL2_jumpr31_t_BITS = 0x1fc4,
- SL2_jumpr31_t_MASK = 0x1fc7,
- SL2_jumpr31_tnew_BITS = 0x1fc6,
- SL2_jumpr31_tnew_MASK = 0x1fc7,
- SL2_loadrb_io_BITS = 0x1000,
- SL2_loadrb_io_MASK = 0x1800,
- SL2_loadrd_sp_BITS = 0x1e00,
- SL2_loadrd_sp_MASK = 0x1f00,
- SL2_loadrh_io_BITS = 0x0000,
- SL2_loadrh_io_MASK = 0x1800,
- SL2_loadri_sp_BITS = 0x1c00,
- SL2_loadri_sp_MASK = 0x1e00,
- SL2_loadruh_io_BITS = 0x0800,
- SL2_loadruh_io_MASK = 0x1800,
- SL2_return_BITS = 0x1f40,
- SL2_return_MASK = 0x1fc4,
- SL2_return_f_BITS = 0x1f45,
- SL2_return_f_MASK = 0x1fc7,
- SL2_return_fnew_BITS = 0x1f47,
- SL2_return_fnew_MASK = 0x1fc7,
- SL2_return_t_BITS = 0x1f44,
- SL2_return_t_MASK = 0x1fc7,
- SL2_return_tnew_BITS = 0x1f46,
- SL2_return_tnew_MASK = 0x1fc7,
- SS1_storeb_io_BITS = 0x1000,
- SS1_storeb_io_MASK = 0x1000,
- SS1_storew_io_BITS = 0x0000,
- SS1_storew_io_MASK = 0x1000,
- SS2_allocframe_BITS = 0x1c00,
- SS2_allocframe_MASK = 0x1e00,
- SS2_storebi0_BITS = 0x1200,
- SS2_storebi0_MASK = 0x1f00,
- SS2_storebi1_BITS = 0x1300,
- SS2_storebi1_MASK = 0x1f00,
- SS2_stored_sp_BITS = 0x0a00,
- SS2_stored_sp_MASK = 0x1e00,
- SS2_storeh_io_BITS = 0x0000,
- SS2_storeh_io_MASK = 0x1800,
- SS2_storew_sp_BITS = 0x0800,
- SS2_storew_sp_MASK = 0x1e00,
- SS2_storewi0_BITS = 0x1000,
- SS2_storewi0_MASK = 0x1f00,
- SS2_storewi1_BITS = 0x1100,
- SS2_storewi1_MASK = 0x1f00
-};
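
Each entry in the enum above is a BITS/MASK pair: a 13-bit sub-instruction matches opcode X exactly when (inst & X_MASK) == X_BITS, which is what the deleted GetSubinstOpcode tested chain-style. A tiny standalone sketch with two entries copied from the table:

    // Sketch: fixed-bits matching as used by the removed GetSubinstOpcode.
    #include <cassert>

    int main() {
      const unsigned SA1_seti_BITS = 0x0800, SA1_seti_MASK = 0x1c00;
      const unsigned SA1_tfr_BITS = 0x1000, SA1_tfr_MASK = 0x1f00;
      unsigned inst = 0x0850;  // hypothetical SA1_seti encoding
      assert((inst & SA1_seti_MASK) == SA1_seti_BITS);  // matches SA1_seti
      assert((inst & SA1_tfr_MASK) != SA1_tfr_BITS);    // not SA1_tfr
      return 0;
    }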
-
-static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
- raw_ostream &os) {
- switch (IClass) {
- case HexagonII::HSIG_L1:
- if ((inst & SL1_loadri_io_MASK) == SL1_loadri_io_BITS)
- op = Hexagon::SL1_loadri_io;
- else if ((inst & SL1_loadrub_io_MASK) == SL1_loadrub_io_BITS)
- op = Hexagon::SL1_loadrub_io;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_L2:
- if ((inst & SL2_deallocframe_MASK) == SL2_deallocframe_BITS)
- op = Hexagon::SL2_deallocframe;
- else if ((inst & SL2_jumpr31_MASK) == SL2_jumpr31_BITS)
- op = Hexagon::SL2_jumpr31;
- else if ((inst & SL2_jumpr31_f_MASK) == SL2_jumpr31_f_BITS)
- op = Hexagon::SL2_jumpr31_f;
- else if ((inst & SL2_jumpr31_fnew_MASK) == SL2_jumpr31_fnew_BITS)
- op = Hexagon::SL2_jumpr31_fnew;
- else if ((inst & SL2_jumpr31_t_MASK) == SL2_jumpr31_t_BITS)
- op = Hexagon::SL2_jumpr31_t;
- else if ((inst & SL2_jumpr31_tnew_MASK) == SL2_jumpr31_tnew_BITS)
- op = Hexagon::SL2_jumpr31_tnew;
- else if ((inst & SL2_loadrb_io_MASK) == SL2_loadrb_io_BITS)
- op = Hexagon::SL2_loadrb_io;
- else if ((inst & SL2_loadrd_sp_MASK) == SL2_loadrd_sp_BITS)
- op = Hexagon::SL2_loadrd_sp;
- else if ((inst & SL2_loadrh_io_MASK) == SL2_loadrh_io_BITS)
- op = Hexagon::SL2_loadrh_io;
- else if ((inst & SL2_loadri_sp_MASK) == SL2_loadri_sp_BITS)
- op = Hexagon::SL2_loadri_sp;
- else if ((inst & SL2_loadruh_io_MASK) == SL2_loadruh_io_BITS)
- op = Hexagon::SL2_loadruh_io;
- else if ((inst & SL2_return_MASK) == SL2_return_BITS)
- op = Hexagon::SL2_return;
- else if ((inst & SL2_return_f_MASK) == SL2_return_f_BITS)
- op = Hexagon::SL2_return_f;
- else if ((inst & SL2_return_fnew_MASK) == SL2_return_fnew_BITS)
- op = Hexagon::SL2_return_fnew;
- else if ((inst & SL2_return_t_MASK) == SL2_return_t_BITS)
- op = Hexagon::SL2_return_t;
- else if ((inst & SL2_return_tnew_MASK) == SL2_return_tnew_BITS)
- op = Hexagon::SL2_return_tnew;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_A:
- if ((inst & SA1_addi_MASK) == SA1_addi_BITS)
- op = Hexagon::SA1_addi;
- else if ((inst & SA1_addrx_MASK) == SA1_addrx_BITS)
- op = Hexagon::SA1_addrx;
- else if ((inst & SA1_addsp_MASK) == SA1_addsp_BITS)
- op = Hexagon::SA1_addsp;
- else if ((inst & SA1_and1_MASK) == SA1_and1_BITS)
- op = Hexagon::SA1_and1;
- else if ((inst & SA1_clrf_MASK) == SA1_clrf_BITS)
- op = Hexagon::SA1_clrf;
- else if ((inst & SA1_clrfnew_MASK) == SA1_clrfnew_BITS)
- op = Hexagon::SA1_clrfnew;
- else if ((inst & SA1_clrt_MASK) == SA1_clrt_BITS)
- op = Hexagon::SA1_clrt;
- else if ((inst & SA1_clrtnew_MASK) == SA1_clrtnew_BITS)
- op = Hexagon::SA1_clrtnew;
- else if ((inst & SA1_cmpeqi_MASK) == SA1_cmpeqi_BITS)
- op = Hexagon::SA1_cmpeqi;
- else if ((inst & SA1_combine0i_MASK) == SA1_combine0i_BITS)
- op = Hexagon::SA1_combine0i;
- else if ((inst & SA1_combine1i_MASK) == SA1_combine1i_BITS)
- op = Hexagon::SA1_combine1i;
- else if ((inst & SA1_combine2i_MASK) == SA1_combine2i_BITS)
- op = Hexagon::SA1_combine2i;
- else if ((inst & SA1_combine3i_MASK) == SA1_combine3i_BITS)
- op = Hexagon::SA1_combine3i;
- else if ((inst & SA1_combinerz_MASK) == SA1_combinerz_BITS)
- op = Hexagon::SA1_combinerz;
- else if ((inst & SA1_combinezr_MASK) == SA1_combinezr_BITS)
- op = Hexagon::SA1_combinezr;
- else if ((inst & SA1_dec_MASK) == SA1_dec_BITS)
- op = Hexagon::SA1_dec;
- else if ((inst & SA1_inc_MASK) == SA1_inc_BITS)
- op = Hexagon::SA1_inc;
- else if ((inst & SA1_seti_MASK) == SA1_seti_BITS)
- op = Hexagon::SA1_seti;
- else if ((inst & SA1_setin1_MASK) == SA1_setin1_BITS)
- op = Hexagon::SA1_setin1;
- else if ((inst & SA1_sxtb_MASK) == SA1_sxtb_BITS)
- op = Hexagon::SA1_sxtb;
- else if ((inst & SA1_sxth_MASK) == SA1_sxth_BITS)
- op = Hexagon::SA1_sxth;
- else if ((inst & SA1_tfr_MASK) == SA1_tfr_BITS)
- op = Hexagon::SA1_tfr;
- else if ((inst & SA1_zxtb_MASK) == SA1_zxtb_BITS)
- op = Hexagon::SA1_zxtb;
- else if ((inst & SA1_zxth_MASK) == SA1_zxth_BITS)
- op = Hexagon::SA1_zxth;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_S1:
- if ((inst & SS1_storeb_io_MASK) == SS1_storeb_io_BITS)
- op = Hexagon::SS1_storeb_io;
- else if ((inst & SS1_storew_io_MASK) == SS1_storew_io_BITS)
- op = Hexagon::SS1_storew_io;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- case HexagonII::HSIG_S2:
- if ((inst & SS2_allocframe_MASK) == SS2_allocframe_BITS)
- op = Hexagon::SS2_allocframe;
- else if ((inst & SS2_storebi0_MASK) == SS2_storebi0_BITS)
- op = Hexagon::SS2_storebi0;
- else if ((inst & SS2_storebi1_MASK) == SS2_storebi1_BITS)
- op = Hexagon::SS2_storebi1;
- else if ((inst & SS2_stored_sp_MASK) == SS2_stored_sp_BITS)
- op = Hexagon::SS2_stored_sp;
- else if ((inst & SS2_storeh_io_MASK) == SS2_storeh_io_BITS)
- op = Hexagon::SS2_storeh_io;
- else if ((inst & SS2_storew_sp_MASK) == SS2_storew_sp_BITS)
- op = Hexagon::SS2_storew_sp;
- else if ((inst & SS2_storewi0_MASK) == SS2_storewi0_BITS)
- op = Hexagon::SS2_storewi0;
- else if ((inst & SS2_storewi1_MASK) == SS2_storewi1_BITS)
- op = Hexagon::SS2_storewi1;
- else {
- os << "<unknown subinstruction>";
- return MCDisassembler::Fail;
- }
- break;
- default:
- os << "<unknown>";
- return MCDisassembler::Fail;
- }
- return MCDisassembler::Success;
-}
-
-static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) {
- if (encoded_reg < 8)
- return Hexagon::R0 + encoded_reg;
- else if (encoded_reg < 16)
- return Hexagon::R0 + encoded_reg + 8;
-
- // patently false value
- return Hexagon::NoRegister;
-}
-
-static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) {
- if (encoded_dreg < 4)
- return Hexagon::D0 + encoded_dreg;
- else if (encoded_dreg < 8)
- return Hexagon::D0 + encoded_dreg + 4;
-
- // patently false value
- return Hexagon::NoRegister;
-}
-void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode,
- unsigned inst) const {
- int64_t operand;
- MCOperand Op;
- switch (opcode) {
- case Hexagon::SL2_deallocframe:
- case Hexagon::SL2_jumpr31:
- case Hexagon::SL2_jumpr31_f:
- case Hexagon::SL2_jumpr31_fnew:
- case Hexagon::SL2_jumpr31_t:
- case Hexagon::SL2_jumpr31_tnew:
- case Hexagon::SL2_return:
- case Hexagon::SL2_return_f:
- case Hexagon::SL2_return_fnew:
- case Hexagon::SL2_return_t:
- case Hexagon::SL2_return_tnew:
- // no operands for these instructions
- break;
- case Hexagon::SS2_allocframe:
- // u 8-4{5_3}
- operand = ((inst & 0x1f0) >> 4) << 3;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL1_loadri_io:
- // Rd 3-0, Rs 7-4, u 11-8{4_2}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf00) >> 6;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL1_loadrub_io:
- // Rd 3-0, Rs 7-4, u 11-8
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf00) >> 8;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadrb_io:
- // Rd 3-0, Rs 7-4, u 10-8
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0x700) >> 8;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadrh_io:
- case Hexagon::SL2_loadruh_io:
- // Rd 3-0, Rs 7-4, u 10-8{3_1}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x700) >> 8) << 1;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadrd_sp:
- // Rdd 2-0, u 7-3{5_3}
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x0f8) >> 3) << 3;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SL2_loadri_sp:
- // Rd 3-0, u 8-4{5_2}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x1f0) >> 4) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_addi:
- // Rx 3-0 (x2), s7 10-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- MI->addOperand(Op);
- operand = SignExtend64<7>((inst & 0x7f0) >> 4);
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_addrx:
- // Rx 3-0 (x2), Rs 7-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SA1_and1:
- case Hexagon::SA1_dec:
- case Hexagon::SA1_inc:
- case Hexagon::SA1_sxtb:
- case Hexagon::SA1_sxth:
- case Hexagon::SA1_tfr:
- case Hexagon::SA1_zxtb:
- case Hexagon::SA1_zxth:
- // Rd 3-0, Rs 7-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SA1_addsp:
- // Rd 3-0, u 9-4{6_2}
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x3f0) >> 4) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_seti:
- // Rd 3-0, u 9-4
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0x3f0) >> 4;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_clrf:
- case Hexagon::SA1_clrfnew:
- case Hexagon::SA1_clrt:
- case Hexagon::SA1_clrtnew:
- case Hexagon::SA1_setin1:
- // Rd 3-0
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- if (opcode == Hexagon::SA1_setin1)
- break;
- MI->addOperand(MCOperand::createReg(Hexagon::P0));
- break;
- case Hexagon::SA1_cmpeqi:
- // Rs 7-4, u 1-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = inst & 0x3;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_combine0i:
- case Hexagon::SA1_combine1i:
- case Hexagon::SA1_combine2i:
- case Hexagon::SA1_combine3i:
- // Rdd 2-0, u 6-5
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0x060) >> 5;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SA1_combinerz:
- case Hexagon::SA1_combinezr:
- // Rdd 2-0, Rs 7-4
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS1_storeb_io:
- // Rs 7-4, u 11-8, Rt 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf00) >> 8;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS1_storew_io:
- // Rs 7-4, u 11-8{4_2}, Rt 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0xf00) >> 8) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS2_storebi0:
- case Hexagon::SS2_storebi1:
- // Rs 7-4, u 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = inst & 0xf;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SS2_storewi0:
- case Hexagon::SS2_storewi1:
- // Rs 7-4, u 3-0{4_2}
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = (inst & 0xf) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- break;
- case Hexagon::SS2_stored_sp:
- // s 8-3{6_3}, Rtt 2-0
- operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3);
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getDRegFromSubinstEncoding(inst & 0x7);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS2_storeh_io:
- // Rs 7-4, u 10-8{3_1}, Rt 3-0
- operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- operand = ((inst & 0x700) >> 8) << 1;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- case Hexagon::SS2_storew_sp:
- // u 8-4{5_2}, Rd 3-0
- operand = ((inst & 0x1f0) >> 4) << 2;
- HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
- operand = getRegFromSubinstEncoding(inst & 0xf);
- Op = MCOperand::createReg(operand);
- MI->addOperand(Op);
- break;
- default:
- // don't crash with an invalid subinstruction
- // llvm_unreachable("Invalid subinstruction in duplex instruction");
- break;
- }
-}
diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
index 0b2b463..4767165 100644
--- a/contrib/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td
@@ -22,14 +22,12 @@ include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// Hexagon Architectures
-def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon V4">;
-def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">;
-def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">;
-def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">;
+include "HexagonDepArch.td"
-def FeatureHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
+// Hexagon ISA Extensions
+def ExtensionHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
"Hexagon HVX instructions">;
-def FeatureHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
+def ExtensionHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
"Hexagon HVX Double instructions">;
def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
"Use constant-extended calls">;
@@ -37,19 +35,14 @@ def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
//===----------------------------------------------------------------------===//
// Hexagon Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
-def HasV5T : Predicate<"HST->hasV5TOps()">;
-def NoV5T : Predicate<"!HST->hasV5TOps()">;
-def HasV55T : Predicate<"HST->hasV55TOps()">,
- AssemblerPredicate<"ArchV55">;
-def HasV60T : Predicate<"HST->hasV60TOps()">,
- AssemblerPredicate<"ArchV60">;
+
def UseMEMOP : Predicate<"HST->useMemOps()">;
def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">;
def UseHVXDbl : Predicate<"HST->useHVXDblOps()">,
- AssemblerPredicate<"FeatureHVXDbl">;
+ AssemblerPredicate<"ExtensionHVXDbl">;
def UseHVXSgl : Predicate<"HST->useHVXSglOps()">;
def UseHVX : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">,
- AssemblerPredicate<"FeatureHVX">;
+ AssemblerPredicate<"ExtensionHVX">;
//===----------------------------------------------------------------------===//
// Classes used for relation maps.
@@ -81,7 +74,7 @@ class IntrinsicsRel;
def getPredOpcode : InstrMapping {
let FilterClass = "PredRel";
// Instructions with the same BaseOpcode and isNVStore values form a row.
- let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"];
+ let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isBrTaken", "isNT"];
// Instructions with the same predicate sense form a column.
let ColFields = ["PredSense"];
// The key column is the unpredicated instructions.
@@ -132,7 +125,7 @@ def getPredNewOpcode : InstrMapping {
//
def getPredOldOpcode : InstrMapping {
let FilterClass = "PredNewRel";
- let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+ let RowFields = ["BaseOpcode", "PredSense", "isNVStore", "isBrTaken"];
let ColFields = ["PNewValue"];
let KeyCol = ["new"];
let ValueCols = [[""]];
@@ -248,11 +241,18 @@ def getRealHWInstr : InstrMapping {
//===----------------------------------------------------------------------===//
include "HexagonSchedule.td"
include "HexagonRegisterInfo.td"
-include "HexagonCallingConv.td"
-include "HexagonInstrInfo.td"
+include "HexagonOperands.td"
+include "HexagonDepOperands.td"
+include "HexagonDepITypes.td"
+include "HexagonInstrFormats.td"
+include "HexagonDepInstrFormats.td"
+include "HexagonDepInstrInfo.td"
+include "HexagonPseudo.td"
include "HexagonPatterns.td"
+include "HexagonDepMappings.td"
include "HexagonIntrinsics.td"
include "HexagonIntrinsicsDerived.td"
+include "HexagonMapAsm2IntrinV62.gen.td"
def HexagonInstrInfo : InstrInfo;
@@ -271,7 +271,9 @@ def : Proc<"hexagonv5", HexagonModelV4,
def : Proc<"hexagonv55", HexagonModelV55,
[ArchV4, ArchV5, ArchV55]>;
def : Proc<"hexagonv60", HexagonModelV60,
- [ArchV4, ArchV5, ArchV55, ArchV60, FeatureHVX]>;
+ [ArchV4, ArchV5, ArchV55, ArchV60, ExtensionHVX]>;
+def : Proc<"hexagonv62", HexagonModelV62,
+ [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, ExtensionHVX]>;
//===----------------------------------------------------------------------===//
// Declare the target which we are implementing
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 54db5ad..e689483 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
#include "HexagonAsmPrinter.h"
+#include "Hexagon.h"
#include "HexagonMachineFunctionInfo.h"
#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
@@ -23,6 +23,7 @@
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -43,7 +44,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
@@ -261,10 +261,34 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
return Sym;
}
+static MCInst ScaleVectorOffset(MCInst &Inst, unsigned OpNo,
+ unsigned VectorSize, MCContext &Ctx) {
+ MCInst T;
+ T.setOpcode(Inst.getOpcode());
+ for (unsigned i = 0, n = Inst.getNumOperands(); i != n; ++i) {
+ if (i != OpNo) {
+ T.addOperand(Inst.getOperand(i));
+ continue;
+ }
+ MCOperand &ImmOp = Inst.getOperand(i);
+ const auto *HE = static_cast<const HexagonMCExpr*>(ImmOp.getExpr());
+ int32_t V = cast<MCConstantExpr>(HE->getExpr())->getValue();
+ auto *NewCE = MCConstantExpr::create(V / int32_t(VectorSize), Ctx);
+ auto *NewHE = HexagonMCExpr::create(NewCE, Ctx);
+ T.addOperand(MCOperand::createExpr(NewHE));
+ }
+ return T;
+}
+
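
ScaleVectorOffset rewrites one immediate operand from bytes into vector-register units, so a 128-byte offset prints as #2 in 64-byte HVX single mode. The scaling itself, sketched with plain integers in place of the MCExpr plumbing:

    // Sketch: byte offset -> vector-count offset; division is exact for
    // any offset the compiler emits (a multiple of the vector size).
    #include <cassert>
    #include <cstdint>

    static int32_t scaleOffset(int32_t Bytes, unsigned VectorSize) {
      return Bytes / static_cast<int32_t>(VectorSize);
    }

    int main() {
      assert(scaleOffset(128, 64) == 2);     // 64-byte HVX single mode
      assert(scaleOffset(-256, 128) == -2);  // 128-byte HVX double mode
      return 0;
    }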
void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
const MachineInstr &MI) {
MCInst &MappedInst = static_cast <MCInst &>(Inst);
const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo();
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ const auto &VecRC = HST.useHVXSglOps() ? Hexagon::VectorRegsRegClass
+ : Hexagon::VectorRegs128BRegClass;
+ unsigned VectorSize = HST.getRegisterInfo()->getSpillSize(VecRC);
switch (Inst.getOpcode()) {
default: return;
@@ -274,7 +298,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
MCOperand Reg = Inst.getOperand(0);
MCOperand S16 = Inst.getOperand(1);
HexagonMCInstrInfo::setMustNotExtend(*S16.getExpr());
- HexagonMCInstrInfo::setS23_2_reloc(*S16.getExpr());
+ HexagonMCInstrInfo::setS27_2_reloc(*S16.getExpr());
Inst.clear();
Inst.addOperand(Reg);
Inst.addOperand(MCOperand::createReg(Hexagon::R0));
@@ -282,6 +306,36 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
break;
}
+ case Hexagon::A2_tfrf: {
+ Inst.setOpcode(Hexagon::A2_paddif);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_tfrt: {
+ Inst.setOpcode(Hexagon::A2_paddit);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_tfrfnew: {
+ Inst.setOpcode(Hexagon::A2_paddifnew);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_tfrtnew: {
+ Inst.setOpcode(Hexagon::A2_padditnew);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+ break;
+ }
+
+ case Hexagon::A2_zxtb: {
+ Inst.setOpcode(Hexagon::A2_andir);
+ Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, OutContext)));
+ break;
+ }
+
// "$dst = CONST64(#$src1)",
case Hexagon::CONST64:
if (!OutStreamer->hasRawTextSupport()) {
@@ -376,6 +430,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI));
return;
}
+ case Hexagon::PS_call_nr:
+ Inst.setOpcode(Hexagon::J2_call);
+ break;
case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
MCOperand &MO = MappedInst.getOperand(2);
int64_t Imm;
@@ -564,6 +621,181 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
return;
}
+ case Hexagon::V6_vL32Ub_pi:
+ case Hexagon::V6_vL32b_cur_pi:
+ case Hexagon::V6_vL32b_nt_cur_pi:
+ case Hexagon::V6_vL32b_pi:
+ case Hexagon::V6_vL32b_nt_pi:
+ case Hexagon::V6_vL32b_nt_tmp_pi:
+ case Hexagon::V6_vL32b_tmp_pi:
+ case Hexagon::V6_vL32Ub_pi_128B:
+ case Hexagon::V6_vL32b_cur_pi_128B:
+ case Hexagon::V6_vL32b_nt_cur_pi_128B:
+ case Hexagon::V6_vL32b_pi_128B:
+ case Hexagon::V6_vL32b_nt_pi_128B:
+ case Hexagon::V6_vL32b_nt_tmp_pi_128B:
+ case Hexagon::V6_vL32b_tmp_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vL32Ub_ai:
+ case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vL32b_cur_ai:
+ case Hexagon::V6_vL32b_nt_ai:
+ case Hexagon::V6_vL32b_nt_cur_ai:
+ case Hexagon::V6_vL32b_nt_tmp_ai:
+ case Hexagon::V6_vL32b_tmp_ai:
+ case Hexagon::V6_vL32Ub_ai_128B:
+ case Hexagon::V6_vL32b_ai_128B:
+ case Hexagon::V6_vL32b_cur_ai_128B:
+ case Hexagon::V6_vL32b_nt_ai_128B:
+ case Hexagon::V6_vL32b_nt_cur_ai_128B:
+ case Hexagon::V6_vL32b_nt_tmp_ai_128B:
+ case Hexagon::V6_vL32b_tmp_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_pi:
+ case Hexagon::V6_vS32b_new_pi:
+ case Hexagon::V6_vS32b_nt_new_pi:
+ case Hexagon::V6_vS32b_nt_pi:
+ case Hexagon::V6_vS32b_pi:
+ case Hexagon::V6_vS32Ub_pi_128B:
+ case Hexagon::V6_vS32b_new_pi_128B:
+ case Hexagon::V6_vS32b_nt_new_pi_128B:
+ case Hexagon::V6_vS32b_nt_pi_128B:
+ case Hexagon::V6_vS32b_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_ai:
+ case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vS32b_new_ai:
+ case Hexagon::V6_vS32b_nt_ai:
+ case Hexagon::V6_vS32b_nt_new_ai:
+ case Hexagon::V6_vS32Ub_ai_128B:
+ case Hexagon::V6_vS32b_ai_128B:
+ case Hexagon::V6_vS32b_new_ai_128B:
+ case Hexagon::V6_vS32b_nt_ai_128B:
+ case Hexagon::V6_vS32b_nt_new_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 1, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vL32b_cur_npred_pi:
+ case Hexagon::V6_vL32b_cur_pred_pi:
+ case Hexagon::V6_vL32b_npred_pi:
+ case Hexagon::V6_vL32b_nt_cur_npred_pi:
+ case Hexagon::V6_vL32b_nt_cur_pred_pi:
+ case Hexagon::V6_vL32b_nt_npred_pi:
+ case Hexagon::V6_vL32b_nt_pred_pi:
+ case Hexagon::V6_vL32b_nt_tmp_npred_pi:
+ case Hexagon::V6_vL32b_nt_tmp_pred_pi:
+ case Hexagon::V6_vL32b_pred_pi:
+ case Hexagon::V6_vL32b_tmp_npred_pi:
+ case Hexagon::V6_vL32b_tmp_pred_pi:
+ case Hexagon::V6_vL32b_cur_npred_pi_128B:
+ case Hexagon::V6_vL32b_cur_pred_pi_128B:
+ case Hexagon::V6_vL32b_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_cur_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_cur_pred_pi_128B:
+ case Hexagon::V6_vL32b_nt_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_pred_pi_128B:
+ case Hexagon::V6_vL32b_nt_tmp_npred_pi_128B:
+ case Hexagon::V6_vL32b_nt_tmp_pred_pi_128B:
+ case Hexagon::V6_vL32b_pred_pi_128B:
+ case Hexagon::V6_vL32b_tmp_npred_pi_128B:
+ case Hexagon::V6_vL32b_tmp_pred_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 4, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vL32b_cur_npred_ai:
+ case Hexagon::V6_vL32b_cur_pred_ai:
+ case Hexagon::V6_vL32b_npred_ai:
+ case Hexagon::V6_vL32b_nt_cur_npred_ai:
+ case Hexagon::V6_vL32b_nt_cur_pred_ai:
+ case Hexagon::V6_vL32b_nt_npred_ai:
+ case Hexagon::V6_vL32b_nt_pred_ai:
+ case Hexagon::V6_vL32b_nt_tmp_npred_ai:
+ case Hexagon::V6_vL32b_nt_tmp_pred_ai:
+ case Hexagon::V6_vL32b_pred_ai:
+ case Hexagon::V6_vL32b_tmp_npred_ai:
+ case Hexagon::V6_vL32b_tmp_pred_ai:
+ case Hexagon::V6_vL32b_cur_npred_ai_128B:
+ case Hexagon::V6_vL32b_cur_pred_ai_128B:
+ case Hexagon::V6_vL32b_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_cur_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_cur_pred_ai_128B:
+ case Hexagon::V6_vL32b_nt_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_pred_ai_128B:
+ case Hexagon::V6_vL32b_nt_tmp_npred_ai_128B:
+ case Hexagon::V6_vL32b_nt_tmp_pred_ai_128B:
+ case Hexagon::V6_vL32b_pred_ai_128B:
+ case Hexagon::V6_vL32b_tmp_npred_ai_128B:
+ case Hexagon::V6_vL32b_tmp_pred_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_npred_pi:
+ case Hexagon::V6_vS32Ub_pred_pi:
+ case Hexagon::V6_vS32b_new_npred_pi:
+ case Hexagon::V6_vS32b_new_pred_pi:
+ case Hexagon::V6_vS32b_npred_pi:
+ case Hexagon::V6_vS32b_nqpred_pi:
+ case Hexagon::V6_vS32b_nt_new_npred_pi:
+ case Hexagon::V6_vS32b_nt_new_pred_pi:
+ case Hexagon::V6_vS32b_nt_npred_pi:
+ case Hexagon::V6_vS32b_nt_nqpred_pi:
+ case Hexagon::V6_vS32b_nt_pred_pi:
+ case Hexagon::V6_vS32b_nt_qpred_pi:
+ case Hexagon::V6_vS32b_pred_pi:
+ case Hexagon::V6_vS32b_qpred_pi:
+ case Hexagon::V6_vS32Ub_npred_pi_128B:
+ case Hexagon::V6_vS32Ub_pred_pi_128B:
+ case Hexagon::V6_vS32b_new_npred_pi_128B:
+ case Hexagon::V6_vS32b_new_pred_pi_128B:
+ case Hexagon::V6_vS32b_npred_pi_128B:
+ case Hexagon::V6_vS32b_nqpred_pi_128B:
+ case Hexagon::V6_vS32b_nt_new_npred_pi_128B:
+ case Hexagon::V6_vS32b_nt_new_pred_pi_128B:
+ case Hexagon::V6_vS32b_nt_npred_pi_128B:
+ case Hexagon::V6_vS32b_nt_nqpred_pi_128B:
+ case Hexagon::V6_vS32b_nt_pred_pi_128B:
+ case Hexagon::V6_vS32b_nt_qpred_pi_128B:
+ case Hexagon::V6_vS32b_pred_pi_128B:
+ case Hexagon::V6_vS32b_qpred_pi_128B:
+ MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+ return;
+
+ case Hexagon::V6_vS32Ub_npred_ai:
+ case Hexagon::V6_vS32Ub_pred_ai:
+ case Hexagon::V6_vS32b_new_npred_ai:
+ case Hexagon::V6_vS32b_new_pred_ai:
+ case Hexagon::V6_vS32b_npred_ai:
+ case Hexagon::V6_vS32b_nqpred_ai:
+ case Hexagon::V6_vS32b_nt_new_npred_ai:
+ case Hexagon::V6_vS32b_nt_new_pred_ai:
+ case Hexagon::V6_vS32b_nt_npred_ai:
+ case Hexagon::V6_vS32b_nt_nqpred_ai:
+ case Hexagon::V6_vS32b_nt_pred_ai:
+ case Hexagon::V6_vS32b_nt_qpred_ai:
+ case Hexagon::V6_vS32b_pred_ai:
+ case Hexagon::V6_vS32b_qpred_ai:
+ case Hexagon::V6_vS32Ub_npred_ai_128B:
+ case Hexagon::V6_vS32Ub_pred_ai_128B:
+ case Hexagon::V6_vS32b_new_npred_ai_128B:
+ case Hexagon::V6_vS32b_new_pred_ai_128B:
+ case Hexagon::V6_vS32b_npred_ai_128B:
+ case Hexagon::V6_vS32b_nqpred_ai_128B:
+ case Hexagon::V6_vS32b_nt_new_npred_ai_128B:
+ case Hexagon::V6_vS32b_nt_new_pred_ai_128B:
+ case Hexagon::V6_vS32b_nt_npred_ai_128B:
+ case Hexagon::V6_vS32b_nt_nqpred_ai_128B:
+ case Hexagon::V6_vS32b_nt_pred_ai_128B:
+ case Hexagon::V6_vS32b_nt_qpred_ai_128B:
+ case Hexagon::V6_vS32b_pred_ai_128B:
+ case Hexagon::V6_vS32b_qpred_ai_128B:
+ MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+ return;
}
}
@@ -578,13 +810,9 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MI->isBundle()) {
const MachineBasicBlock* MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
- unsigned IgnoreCount = 0;
for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
- if (MII->getOpcode() == TargetOpcode::DBG_VALUE ||
- MII->getOpcode() == TargetOpcode::IMPLICIT_DEF)
- ++IgnoreCount;
- else
+ if (!MII->isDebugValue() && !MII->isImplicitDef())
HexagonLowerToMC(MCII, &*MII, MCB, *this);
}
else
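
The operand index passed to ScaleVectorOffset above differs per case group because the offset immediate sits at a different position in each addressing form: operand 4 for the predicated post-increment loads, 3 for the predicated base+offset loads and the post-increment stores, and 2 for the predicated base+offset stores. The assumed effect of the helper is to divide the byte offset by the vector length (64 bytes, or 128 for the _128B opcodes) so the printed instruction carries the offset in vector units. A minimal sketch under that assumption, for the case where the offset operand is a plain immediate (the in-tree helper also has to unwrap Hexagon MC expressions):

    #include "llvm/MC/MCInst.h"
    using namespace llvm;

    // Hypothetical standalone analogue of ScaleVectorOffset: rebuild the
    // MCInst with the immediate at OpNo divided by the vector size.
    static MCInst scaleOffsetSketch(const MCInst &Inst, unsigned OpNo,
                                    int64_t VectorSize) {
      MCInst T;
      T.setOpcode(Inst.getOpcode());
      for (unsigned I = 0, N = Inst.getNumOperands(); I != N; ++I) {
        const MCOperand &Op = Inst.getOperand(I);
        if (I == OpNo && Op.isImm())
          T.addOperand(MCOperand::createImm(Op.getImm() / VectorSize));
        else
          T.addOperand(Op);
      }
      return T;
    }
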
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index fe7278f..d75d95a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -7,14 +7,12 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexbit"
-
#include "HexagonBitTracker.h"
#include "HexagonTargetMachine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -42,10 +40,23 @@
#include <utility>
#include <vector>
+#define DEBUG_TYPE "hexbit"
+
using namespace llvm;
static cl::opt<bool> PreserveTiedOps("hexbit-keep-tied", cl::Hidden,
cl::init(true), cl::desc("Preserve subregisters in tied operands"));
+static cl::opt<bool> GenExtract("hexbit-extract", cl::Hidden,
+ cl::init(true), cl::desc("Generate extract instructions"));
+static cl::opt<bool> GenBitSplit("hexbit-bitsplit", cl::Hidden,
+ cl::init(true), cl::desc("Generate bitsplit instructions"));
+
+static cl::opt<unsigned> MaxExtract("hexbit-max-extract", cl::Hidden,
+ cl::init(UINT_MAX));
+static unsigned CountExtract = 0;
+static cl::opt<unsigned> MaxBitSplit("hexbit-max-bitsplit", cl::Hidden,
+ cl::init(UINT_MAX));
+static unsigned CountBitSplit = 0;
namespace llvm {
@@ -249,8 +260,6 @@ INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
RegisterSet &AVs) {
- MachineDomTreeNode *N = MDT->getNode(&B);
- typedef GraphTraits<MachineDomTreeNode*> GTN;
bool Changed = false;
if (T.TopDown)
@@ -262,10 +271,9 @@ bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
RegisterSet NewAVs = AVs;
NewAVs.insert(Defs);
- for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
- MachineBasicBlock *SB = (*I)->getBlock();
- Changed |= visitBlock(*SB, T, NewAVs);
- }
+ for (auto *DTN : children<MachineDomTreeNode*>(MDT->getNode(&B)))
+ Changed |= visitBlock(*(DTN->getBlock()), T, NewAVs);
+
if (!T.TopDown)
Changed |= T.processBlock(B, AVs);
@@ -399,7 +407,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR,
const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg);
if (RR.Sub == 0) {
Begin = 0;
- Width = RC->getSize()*8;
+ Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC);
return true;
}
@@ -409,7 +417,7 @@ bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR,
case Hexagon::DoubleRegsRegClassID:
case Hexagon::VecDblRegsRegClassID:
case Hexagon::VecDblRegs128BRegClassID:
- Width = RC->getSize()*8 / 2;
+ Width = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 2;
if (RR.Sub == Hexagon::isub_hi || RR.Sub == Hexagon::vsub_hi)
Begin = Width;
break;
@@ -896,6 +904,7 @@ const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
*MRI.getTargetRegisterInfo());
auto VerifySR = [&HRI] (const TargetRegisterClass *RC, unsigned Sub) -> void {
+ (void)HRI;
assert(Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_lo) ||
Sub == HRI.getHexagonSubRegIndex(RC, Hexagon::ps_sub_hi));
};
@@ -983,9 +992,9 @@ bool DeadCodeElimination::isDead(unsigned R) const {
bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
bool Changed = false;
- typedef GraphTraits<MachineDomTreeNode*> GTN;
- for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
- Changed |= runOnNode(*I);
+
+ for (auto *DTN : children<MachineDomTreeNode*>(N))
+ Changed |= runOnNode(DTN);
MachineBasicBlock *B = N->getBlock();
std::vector<MachineInstr*> Instrs;
@@ -1045,8 +1054,8 @@ namespace {
class RedundantInstrElimination : public Transformation {
public:
RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii,
- MachineRegisterInfo &mri)
- : Transformation(true), HII(hii), MRI(mri), BT(bt) {}
+ const HexagonRegisterInfo &hri, MachineRegisterInfo &mri)
+ : Transformation(true), HII(hii), HRI(hri), MRI(mri), BT(bt) {}
bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
@@ -1061,6 +1070,7 @@ namespace {
bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS);
const HexagonInstrInfo &HII;
+ const HexagonRegisterInfo &HRI;
MachineRegisterInfo &MRI;
BitTracker &BT;
};
@@ -1253,7 +1263,7 @@ bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI,
assert(MI.getOperand(OpN).isReg());
BitTracker::RegisterRef RR = MI.getOperand(OpN);
const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI);
- uint16_t Width = RC->getSize()*8;
+ uint16_t Width = HRI.getRegSizeInBits(*RC);
if (!GotBits)
T.set(Begin, Begin+Width);
@@ -1735,10 +1745,11 @@ namespace {
// This is by no means complete
class BitSimplification : public Transformation {
public:
- BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii,
- const HexagonRegisterInfo &hri, MachineRegisterInfo &mri,
- MachineFunction &mf)
- : Transformation(true), HII(hii), HRI(hri), MRI(mri), MF(mf), BT(bt) {}
+ BitSimplification(BitTracker &bt, const MachineDominatorTree &mdt,
+ const HexagonInstrInfo &hii, const HexagonRegisterInfo &hri,
+ MachineRegisterInfo &mri, MachineFunction &mf)
+ : Transformation(true), MDT(mdt), HII(hii), HRI(hri), MRI(mri),
+ MF(mf), BT(bt) {}
bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
@@ -1765,9 +1776,18 @@ namespace {
const BitTracker::RegisterCell &RC);
bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
const BitTracker::RegisterCell &RC);
+ bool genBitSplit(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC, const RegisterSet &AVs);
bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD,
const BitTracker::RegisterCell &RC);
+ bool simplifyExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
+ const BitTracker::RegisterCell &RC, const RegisterSet &AVs);
+ // Cache of created instructions to avoid creating duplicates.
+ // XXX Currently only used by genBitSplit.
+ std::vector<MachineInstr*> NewMIs;
+
+ const MachineDominatorTree &MDT;
const HexagonInstrInfo &HII;
const HexagonRegisterInfo &HRI;
MachineRegisterInfo &MRI;
@@ -1927,8 +1947,10 @@ bool BitSimplification::genStoreImmediate(MachineInstr *MI) {
switch (Opc) {
case Hexagon::S2_storeri_io:
Align++;
+ LLVM_FALLTHROUGH;
case Hexagon::S2_storerh_io:
Align++;
+ LLVM_FALLTHROUGH;
case Hexagon::S2_storerb_io:
break;
default:
@@ -2149,6 +2171,149 @@ bool BitSimplification::genExtractLow(MachineInstr *MI,
return false;
}
+bool BitSimplification::genBitSplit(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC,
+ const RegisterSet &AVs) {
+ if (!GenBitSplit)
+ return false;
+ if (MaxBitSplit.getNumOccurrences()) {
+ if (CountBitSplit >= MaxBitSplit)
+ return false;
+ }
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ case Hexagon::A4_bitsplit:
+ case Hexagon::A4_bitspliti:
+ return false;
+ }
+
+ unsigned W = RC.width();
+ if (W != 32)
+ return false;
+
+ auto ctlz = [] (const BitTracker::RegisterCell &C) -> unsigned {
+ unsigned Z = C.width();
+ while (Z > 0 && C[Z-1].is(0))
+ --Z;
+ return C.width() - Z;
+ };
+
+ // Count the number of leading zeros in the target RC.
+ unsigned Z = ctlz(RC);
+ if (Z == 0 || Z == W)
+ return false;
+
+ // A simplistic analysis: assume the source register (the one being split)
+ // is fully unknown, and that all its bits are self-references.
+ const BitTracker::BitValue &B0 = RC[0];
+ if (B0.Type != BitTracker::BitValue::Ref)
+ return false;
+
+ unsigned SrcR = B0.RefI.Reg;
+ unsigned SrcSR = 0;
+ unsigned Pos = B0.RefI.Pos;
+
+ // All the non-zero bits should be consecutive bits from the same register.
+ for (unsigned i = 1; i < W-Z; ++i) {
+ const BitTracker::BitValue &V = RC[i];
+ if (V.Type != BitTracker::BitValue::Ref)
+ return false;
+ if (V.RefI.Reg != SrcR || V.RefI.Pos != Pos+i)
+ return false;
+ }
+
+ // Now, find the other bitfield among AVs.
+ for (unsigned S = AVs.find_first(); S; S = AVs.find_next(S)) {
+ // The number of leading zeros here should be the number of trailing
+ // non-zeros in RC.
+ if (!BT.has(S))
+ continue;
+ const BitTracker::RegisterCell &SC = BT.lookup(S);
+ if (SC.width() != W || ctlz(SC) != W-Z)
+ continue;
+ // The Z lower bits should now match SrcR.
+ const BitTracker::BitValue &S0 = SC[0];
+ if (S0.Type != BitTracker::BitValue::Ref || S0.RefI.Reg != SrcR)
+ continue;
+ unsigned P = S0.RefI.Pos;
+
+ if (Pos <= P && (Pos + W-Z) != P)
+ continue;
+ if (P < Pos && (P + Z) != Pos)
+ continue;
+ // The starting bitfield position must be at a subregister boundary.
+ if (std::min(P, Pos) != 0 && std::min(P, Pos) != 32)
+ continue;
+
+ unsigned I;
+ for (I = 1; I < Z; ++I) {
+ const BitTracker::BitValue &V = SC[I];
+ if (V.Type != BitTracker::BitValue::Ref)
+ break;
+ if (V.RefI.Reg != SrcR || V.RefI.Pos != P+I)
+ break;
+ }
+ if (I != Z)
+ continue;
+
+ // Generate bitsplit where S is defined.
+ if (MaxBitSplit.getNumOccurrences())
+ CountBitSplit++;
+ MachineInstr *DefS = MRI.getVRegDef(S);
+ assert(DefS != nullptr);
+ DebugLoc DL = DefS->getDebugLoc();
+ MachineBasicBlock &B = *DefS->getParent();
+ auto At = DefS->isPHI() ? B.getFirstNonPHI()
+ : MachineBasicBlock::iterator(DefS);
+ if (MRI.getRegClass(SrcR)->getID() == Hexagon::DoubleRegsRegClassID)
+ SrcSR = (std::min(Pos, P) == 32) ? Hexagon::isub_hi : Hexagon::isub_lo;
+ if (!validateReg({SrcR,SrcSR}, Hexagon::A4_bitspliti, 1))
+ continue;
+ unsigned ImmOp = Pos <= P ? W-Z : Z;
+
+ // Reuse an existing A4_bitspliti with the same source and immediate,
+ // if one is available.
+ unsigned NewR = 0;
+ for (MachineInstr *In : NewMIs) {
+ if (In->getOpcode() != Hexagon::A4_bitspliti)
+ continue;
+ MachineOperand &Op1 = In->getOperand(1);
+ if (Op1.getReg() != SrcR || Op1.getSubReg() != SrcSR)
+ continue;
+ if (In->getOperand(2).getImm() != ImmOp)
+ continue;
+ // Check if the target register is available here.
+ MachineOperand &Op0 = In->getOperand(0);
+ MachineInstr *DefI = MRI.getVRegDef(Op0.getReg());
+ assert(DefI != nullptr);
+ if (!MDT.dominates(DefI, &*At))
+ continue;
+
+ // Found one that can be reused.
+ assert(Op0.getSubReg() == 0);
+ NewR = Op0.getReg();
+ break;
+ }
+ if (!NewR) {
+ NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+ auto NewBS = BuildMI(B, At, DL, HII.get(Hexagon::A4_bitspliti), NewR)
+ .addReg(SrcR, 0, SrcSR)
+ .addImm(ImmOp);
+ NewMIs.push_back(NewBS);
+ }
+ if (Pos <= P) {
+ HBS::replaceRegWithSub(RD.Reg, NewR, Hexagon::isub_lo, MRI);
+ HBS::replaceRegWithSub(S, NewR, Hexagon::isub_hi, MRI);
+ } else {
+ HBS::replaceRegWithSub(S, NewR, Hexagon::isub_lo, MRI);
+ HBS::replaceRegWithSub(RD.Reg, NewR, Hexagon::isub_hi, MRI);
+ }
+ return true;
+ }
+
+ return false;
+}
+
// Check for tstbit simplification opportunity, where the bit being checked
// can be tracked back to another register. For example:
// vreg2 = S2_lsr_i_r vreg1, 5
@@ -2210,6 +2375,203 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI,
return false;
}
+// Detect whether RD is a bitfield extract (sign- or zero-extended) of
+// some register from the AVs set. Create a new corresponding instruction
+// at the location of MI. The intent is to recognize situations where
+// a sequence of instructions performs an operation that is equivalent to
+// an extract operation, such as a shift left followed by a shift right.
+bool BitSimplification::simplifyExtractLow(MachineInstr *MI,
+ BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC,
+ const RegisterSet &AVs) {
+ if (!GenExtract)
+ return false;
+ if (MaxExtract.getNumOccurrences()) {
+ if (CountExtract >= MaxExtract)
+ return false;
+ CountExtract++;
+ }
+
+ unsigned W = RC.width();
+ unsigned RW = W;
+ unsigned Len;
+ bool Signed;
+
+ // The code is mostly class-independent, except for the part that generates
+ // the extract instruction, and establishes the source register (in case it
+ // needs to use a subregister).
+ const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+ if (FRC != &Hexagon::IntRegsRegClass && FRC != &Hexagon::DoubleRegsRegClass)
+ return false;
+ assert(RD.Sub == 0);
+
+ // Observation:
+ // If the cell has a form of 00..0xx..x with k zeros and n remaining
+ // bits, this could be an extractu of the n bits, but it could also be
+ // an extractu of a longer field which happens to have 0s in the top
+ // bit positions.
+ // The same logic applies to sign-extended fields.
+ //
+ // Do not check for the extended extracts, since it would expand the
+ // search space quite a bit. The search may be expensive as it is.
+
+ const BitTracker::BitValue &TopV = RC[W-1];
+
+ // Eliminate candidates that have self-referential bits, since they
+ // cannot be extracts from other registers. Also, skip registers that
+ // have compile-time constant values.
+ bool IsConst = true;
+ for (unsigned I = 0; I != W; ++I) {
+ const BitTracker::BitValue &V = RC[I];
+ if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == RD.Reg)
+ return false;
+ IsConst = IsConst && (V.is(0) || V.is(1));
+ }
+ if (IsConst)
+ return false;
+
+ if (TopV.is(0) || TopV.is(1)) {
+ bool S = TopV.is(1);
+ for (--W; W > 0 && RC[W-1].is(S); --W)
+ ;
+ Len = W;
+ Signed = S;
+ // The sign bit must be a part of the field being extended.
+ if (Signed)
+ ++Len;
+ } else {
+ // This could still be a sign-extended extract.
+ assert(TopV.Type == BitTracker::BitValue::Ref);
+ if (TopV.RefI.Reg == RD.Reg || TopV.RefI.Pos == W-1)
+ return false;
+ for (--W; W > 0 && RC[W-1] == TopV; --W)
+ ;
+ // The top bits of RC are copies of TopV. One occurrence of TopV will
+ // be a part of the field.
+ Len = W + 1;
+ Signed = true;
+ }
+
+ // This would be just a copy. It should be handled elsewhere.
+ if (Len == RW)
+ return false;
+
+ DEBUG({
+ dbgs() << __func__ << " on reg: " << PrintReg(RD.Reg, &HRI, RD.Sub)
+ << ", MI: " << *MI;
+ dbgs() << "Cell: " << RC << '\n';
+ dbgs() << "Expected bitfield size: " << Len << " bits, "
+ << (Signed ? "sign" : "zero") << "-extended\n";
+ });
+
+ bool Changed = false;
+
+ for (unsigned R = AVs.find_first(); R != 0; R = AVs.find_next(R)) {
+ if (!BT.has(R))
+ continue;
+ const BitTracker::RegisterCell &SC = BT.lookup(R);
+ unsigned SW = SC.width();
+
+ // The source can be longer than the destination, as long as its size is
+ // a multiple of the size of the destination. Also, we would need to be
+ // able to refer to the subregister in the source that would be of the
+ // same size as the destination, but only check the sizes here.
+ if (SW < RW || (SW % RW) != 0)
+ continue;
+
+ // The field can start at any offset in SC as long as it contains Len
+ // bits and does not cross a subregister boundary (if the source
+ // register is longer than the destination).
+ unsigned Off = 0;
+ while (Off <= SW-Len) {
+ unsigned OE = (Off+Len)/RW;
+ if (OE != Off/RW) {
+ // The assumption here is that if the source (R) is longer than the
+ // destination, then the source is a sequence of words of size RW,
+ // and each such word in R can be accessed via a subregister.
+ //
+ // If the beginning and the end of the field fall into different
+ // subregisters, advance to the next subregister.
+ Off = OE*RW;
+ continue;
+ }
+ if (HBS::isEqual(RC, 0, SC, Off, Len))
+ break;
+ ++Off;
+ }
+
+ if (Off > SW-Len)
+ continue;
+
+ // Found match.
+ unsigned ExtOpc = 0;
+ if (Off == 0) {
+ if (Len == 8)
+ ExtOpc = Signed ? Hexagon::A2_sxtb : Hexagon::A2_zxtb;
+ else if (Len == 16)
+ ExtOpc = Signed ? Hexagon::A2_sxth : Hexagon::A2_zxth;
+ else if (Len < 10 && !Signed)
+ ExtOpc = Hexagon::A2_andir;
+ }
+ if (ExtOpc == 0) {
+ ExtOpc =
+ Signed ? (RW == 32 ? Hexagon::S4_extract : Hexagon::S4_extractp)
+ : (RW == 32 ? Hexagon::S2_extractu : Hexagon::S2_extractup);
+ }
+ unsigned SR = 0;
+ // This only recognizes isub_lo and isub_hi.
+ if (RW != SW && RW*2 != SW)
+ continue;
+ if (RW != SW)
+ SR = (Off/RW == 0) ? Hexagon::isub_lo : Hexagon::isub_hi;
+ Off = Off % RW;
+
+ if (!validateReg({R,SR}, ExtOpc, 1))
+ continue;
+
+ // Don't generate the same instruction as the one being optimized.
+ if (MI->getOpcode() == ExtOpc) {
+ // All possible ExtOpc's have the source in operand(1).
+ const MachineOperand &SrcOp = MI->getOperand(1);
+ if (SrcOp.getReg() == R)
+ continue;
+ }
+
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock &B = *MI->getParent();
+ unsigned NewR = MRI.createVirtualRegister(FRC);
+ auto At = MI->isPHI() ? B.getFirstNonPHI()
+ : MachineBasicBlock::iterator(MI);
+ auto MIB = BuildMI(B, At, DL, HII.get(ExtOpc), NewR)
+ .addReg(R, 0, SR);
+ switch (ExtOpc) {
+ case Hexagon::A2_sxtb:
+ case Hexagon::A2_zxtb:
+ case Hexagon::A2_sxth:
+ case Hexagon::A2_zxth:
+ break;
+ case Hexagon::A2_andir:
+ MIB.addImm((1u << Len) - 1);
+ break;
+ case Hexagon::S4_extract:
+ case Hexagon::S2_extractu:
+ case Hexagon::S4_extractp:
+ case Hexagon::S2_extractup:
+ MIB.addImm(Len)
+ .addImm(Off);
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+
+ HBS::replaceReg(RD.Reg, NewR, MRI);
+ BT.put(BitTracker::RegisterRef(NewR), RC);
+ Changed = true;
+ break;
+ }
+
+ return Changed;
+}
+
bool BitSimplification::processBlock(MachineBasicBlock &B,
const RegisterSet &AVs) {
if (!BT.reached(&B))
@@ -2247,12 +2609,15 @@ bool BitSimplification::processBlock(MachineBasicBlock &B,
if (FRC->getID() == Hexagon::DoubleRegsRegClassID) {
bool T = genPackhl(MI, RD, RC);
+ T = T || simplifyExtractLow(MI, RD, RC, AVB);
Changed |= T;
continue;
}
if (FRC->getID() == Hexagon::IntRegsRegClassID) {
- bool T = genExtractHalf(MI, RD, RC);
+ bool T = genBitSplit(MI, RD, RC, AVB);
+ T = T || simplifyExtractLow(MI, RD, RC, AVB);
+ T = T || genExtractHalf(MI, RD, RC);
T = T || genCombineHalf(MI, RD, RC);
T = T || genExtractLow(MI, RD, RC);
Changed |= T;
@@ -2294,7 +2659,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
Changed |= visitBlock(Entry, ImmG, AIG);
RegisterSet ARE; // Available registers for RIE.
- RedundantInstrElimination RIE(BT, HII, MRI);
+ RedundantInstrElimination RIE(BT, HII, HRI, MRI);
bool Ried = visitBlock(Entry, RIE, ARE);
if (Ried) {
Changed = true;
@@ -2313,7 +2678,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
BT.run();
RegisterSet ABS; // Available registers for BS.
- BitSimplification BitS(BT, HII, HRI, MRI, MF);
+ BitSimplification BitS(BT, *MDT, HII, HRI, MRI, MF);
Changed |= visitBlock(Entry, BitS, ABS);
Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
@@ -2599,7 +2964,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB,
for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) {
const MachineOperand &Op = SI->getOperand(j);
if (!Op.isReg()) {
- MIB.addOperand(Op);
+ MIB.add(Op);
continue;
}
if (!Op.isUse())
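
genBitSplit above looks for a pair of registers that together partition one 32-bit source: RD holds the low W-Z bits of SrcR zero-extended, and some available register S holds the remaining Z bits; both are then rewritten as the two halves of a single A4_bitspliti. The scalar semantics assumed by that matching, together with the extractu semantics behind the S2_extractu/S4_extract fallback in simplifyExtractLow, sketched for the 32-bit case:

    #include <cstdint>

    // Assumed semantics of Rdd = bitsplit(Rs, #N); 0 < N < 32 holds in
    // genBitSplit because the leading-zero count Z is neither 0 nor W.
    struct BitPair { uint32_t Lo, Hi; };
    static BitPair bitspliti(uint32_t Rs, unsigned N) {
      return { Rs & ((1u << N) - 1),  // low word: low N bits, zero-extended
               Rs >> N };             // high word: the remaining upper bits
    }

    // Assumed semantics of Rd = extractu(Rs, #Len, #Off); Len < 32 here
    // since a full-width extract was already rejected as a plain copy.
    static uint32_t extractu(uint32_t Rs, unsigned Len, unsigned Off) {
      return (Rs >> Off) & ((1u << Len) - 1);
    }
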
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
index 436f88d..3de5310 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
#include "HexagonBitTracker.h"
+#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonTargetMachine.h"
@@ -57,12 +57,10 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
// tion). To avoid the complications with in-memory arguments, only consi-
// der the initial sequence of formal parameters that are known to be
// passed via registers.
- unsigned AttrIdx = 0;
unsigned InVirtReg, InPhysReg = 0;
const Function &F = *MF.getFunction();
typedef Function::const_arg_iterator arg_iterator;
for (arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
- AttrIdx++;
const Argument &Arg = *I;
Type *ATy = Arg.getType();
unsigned Width = 0;
@@ -74,8 +72,7 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
// Module::AnyPointerSize.
if (Width == 0 || Width > 64)
break;
- AttributeSet Attrs = F.getAttributes();
- if (Attrs.hasAttribute(AttrIdx, Attribute::ByVal))
+ if (Arg.hasAttribute(Attribute::ByVal))
continue;
InPhysReg = getNextPhysReg(InPhysReg, Width);
if (!InPhysReg)
@@ -83,9 +80,9 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
InVirtReg = getVirtRegFor(InPhysReg);
if (!InVirtReg)
continue;
- if (Attrs.hasAttribute(AttrIdx, Attribute::SExt))
+ if (Arg.hasAttribute(Attribute::SExt))
VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::SExt, Width)));
- else if (Attrs.hasAttribute(AttrIdx, Attribute::ZExt))
+ else if (Arg.hasAttribute(Attribute::ZExt))
VRX.insert(std::make_pair(InVirtReg, ExtType(ExtType::ZExt, Width)));
}
}
@@ -272,6 +269,9 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
// cases below.
uint16_t W0 = (Reg[0].Reg != 0) ? getRegBitWidth(Reg[0]) : 0;
+ // Register id of the 0th operand. It can be 0.
+ unsigned Reg0 = Reg[0].Reg;
+
switch (Opc) {
// Transfer immediate:
@@ -792,6 +792,17 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
case A2_zxth:
return rr0(eZXT(rc(1), 16), Outputs);
+ // Saturations
+
+ case A2_satb:
+ return rr0(eSXT(RegisterCell::self(0, W0).regify(Reg0), 8), Outputs);
+ case A2_sath:
+ return rr0(eSXT(RegisterCell::self(0, W0).regify(Reg0), 16), Outputs);
+ case A2_satub:
+ return rr0(eZXT(RegisterCell::self(0, W0).regify(Reg0), 8), Outputs);
+ case A2_satuh:
+ return rr0(eZXT(RegisterCell::self(0, W0).regify(Reg0), 16), Outputs);
+
// Bit count:
case S2_cl0:
@@ -926,6 +937,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &BI,
case Hexagon::J2_jumpfnew:
case Hexagon::J2_jumpfnewpt:
Negated = true;
+ LLVM_FALLTHROUGH;
case Hexagon::J2_jumpt:
case Hexagon::J2_jumptpt:
case Hexagon::J2_jumptnew:
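
The new saturation cases record a single invariant rather than the full arithmetic: whatever A2_satb computes, the result always fits in 8 signed bits, so bits 8..31 of the output equal bit 7. Modeling the output cell as a sign extension of its own low byte (self(...) regified to Reg0) captures exactly that, and likewise for the halfword and unsigned variants with zero extension. The invariant for the byte case, as plain arithmetic:

    #include <cstdint>

    // satb clamps to [-128, 127]; the post-condition below is what the
    // eSXT(RegisterCell::self(0, W0).regify(Reg0), 8) modeling relies on.
    static int32_t satb(int32_t X) {
      return X > 127 ? 127 : (X < -128 ? -128 : X);
    }
    // For every X: satb(X) == (int32_t)(int8_t)satb(X),
    // i.e. bits 8..31 of the result are copies of bit 7.
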
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
index adc213c..1640b40 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -219,8 +219,7 @@ HexagonBlockRanges::HexagonBlockRanges(MachineFunction &mf)
TII(*HST.getInstrInfo()), TRI(*HST.getRegisterInfo()),
Reserved(TRI.getReservedRegs(mf)) {
// Consider all non-allocatable registers as reserved.
- for (auto I = TRI.regclass_begin(), E = TRI.regclass_end(); I != E; ++I) {
- auto *RC = *I;
+ for (const TargetRegisterClass *RC : TRI.regclasses()) {
if (RC->isAllocatable())
continue;
for (unsigned R : *RC)
@@ -233,14 +232,16 @@ HexagonBlockRanges::RegisterSet HexagonBlockRanges::getLiveIns(
const TargetRegisterInfo &TRI) {
RegisterSet LiveIns;
RegisterSet Tmp;
+
for (auto I : B.liveins()) {
- if (I.LaneMask.all()) {
- Tmp.insert({I.PhysReg,0});
+ MCSubRegIndexIterator S(I.PhysReg, &TRI);
+ if (I.LaneMask.all() || (I.LaneMask.any() && !S.isValid())) {
+ Tmp.insert({I.PhysReg, 0});
continue;
}
- for (MCSubRegIndexIterator S(I.PhysReg, &TRI); S.isValid(); ++S) {
- LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
- if ((M & I.LaneMask).any())
+ for (; S.isValid(); ++S) {
+ unsigned SI = S.getSubRegIndex();
+ if ((I.LaneMask & TRI.getSubRegIndexLaneMask(SI)).any())
Tmp.insert({S.getSubReg(), 0});
}
}
@@ -307,6 +308,8 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
LastUse[R] = LastDef[R] = IndexType::None;
};
+ RegisterSet Defs, Clobbers;
+
for (auto &In : B) {
if (In.isDebugValue())
continue;
@@ -325,19 +328,67 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
closeRange(S);
}
}
- // Process defs.
+ // Process defs and clobbers.
+ Defs.clear();
+ Clobbers.clear();
for (auto &Op : In.operands()) {
if (!Op.isReg() || !Op.isDef() || Op.isUndef())
continue;
RegisterRef R = { Op.getReg(), Op.getSubReg() };
- if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
- continue;
for (auto S : expandToSubRegs(R, MRI, TRI)) {
- if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
- closeRange(S);
- LastDef[S] = Index;
+ if (TargetRegisterInfo::isPhysicalRegister(S.Reg) && Reserved[S.Reg])
+ continue;
+ if (Op.isDead())
+ Clobbers.insert(S);
+ else
+ Defs.insert(S);
+ }
+ }
+
+ for (auto &Op : In.operands()) {
+ if (!Op.isRegMask())
+ continue;
+ const uint32_t *BM = Op.getRegMask();
+ for (unsigned PR = 1, N = TRI.getNumRegs(); PR != N; ++PR) {
+ // Skip registers that have subregisters. A register is preserved
+ // iff its bit is set in the regmask, so if R1:0 was preserved, both
+ // R1 and R0 would also be present.
+ if (MCSubRegIterator(PR, &TRI, false).isValid())
+ continue;
+ if (Reserved[PR])
+ continue;
+ if (BM[PR/32] & (1u << (PR%32)))
+ continue;
+ RegisterRef R = { PR, 0 };
+ if (!Defs.count(R))
+ Clobbers.insert(R);
}
}
+ // Defs and clobbers can overlap, e.g.
+ // %D0<def,dead> = COPY %vreg5, %R0<imp-def>, %R1<imp-def>
+ for (RegisterRef R : Defs)
+ Clobbers.erase(R);
+
+ // Update maps for defs.
+ for (RegisterRef S : Defs) {
+ // Defs should already be expanded into subregs.
+ assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+ !MCSubRegIterator(S.Reg, &TRI, false).isValid());
+ if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+ closeRange(S);
+ LastDef[S] = Index;
+ }
+ // Update maps for clobbers.
+ for (RegisterRef S : Clobbers) {
+ // Clobbers should already be expanded into subregs.
+ assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+ !MCSubRegIterator(S.Reg, &TRI, false).isValid());
+ if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+ closeRange(S);
+ // Create a single-instruction range.
+ LastDef[S] = LastUse[S] = Index;
+ closeRange(S);
+ }
}
// Collect live-on-exit.
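
The regmask scan in computeInitialLiveRanges depends on the convention for regmask operands: bit PR of the mask is set iff physical register PR is preserved across the call, so a clear bit means the register is clobbered. The bit test from the loop, written out as a small helper for clarity:

    // Regmask convention assumed by the loop over physical registers:
    // a set bit means "preserved", a clear bit means "clobbered".
    static bool isPreservedByMask(const uint32_t *Mask, unsigned PhysReg) {
      return (Mask[PhysReg / 32] >> (PhysReg % 32)) & 1u;
    }
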
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
index 7174803..769ec70 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonBlockRanges.h
@@ -14,8 +14,8 @@
#include <cassert>
#include <map>
#include <set>
-#include <vector>
#include <utility>
+#include <vector>
namespace llvm {
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 2f8fe6e..c7b422e 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -38,6 +38,7 @@ class HexagonCFGOptimizer : public MachineFunctionPass {
private:
void InvertAndChangeJumpTarget(MachineInstr &, MachineBasicBlock *);
+ bool isOnFallThroughPath(MachineBasicBlock *MBB);
public:
static char ID;
@@ -106,6 +107,14 @@ void HexagonCFGOptimizer::InvertAndChangeJumpTarget(
MI.getOperand(1).setMBB(NewTarget);
}
+bool HexagonCFGOptimizer::isOnFallThroughPath(MachineBasicBlock *MBB) {
+ if (MBB->canFallThrough())
+ return true;
+ for (MachineBasicBlock *PB : MBB->predecessors())
+ if (PB->isLayoutSuccessor(MBB) && PB->canFallThrough())
+ return true;
+ return false;
+}
bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(*Fn.getFunction()))
@@ -182,7 +191,6 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
}
if ((NumSuccs == 2) && LayoutSucc && (LayoutSucc->pred_size() == 1)) {
-
// Ensure that BB2 has one instruction -- an unconditional jump.
if ((LayoutSucc->size() == 1) &&
IsUnconditionalJump(LayoutSucc->front().getOpcode())) {
@@ -211,9 +219,8 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
JumpAroundTarget->moveAfter(LayoutSucc);
// Only move a block if it doesn't have a fall-thru. Otherwise
// the CFG will be incorrect.
- if (!UncondTarget->canFallThrough()) {
+ if (!isOnFallThroughPath(UncondTarget))
UncondTarget->moveAfter(JumpAroundTarget);
- }
}
//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
deleted file mode 100644
index e61b2a7..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonCallingConv.td
+++ /dev/null
@@ -1,35 +0,0 @@
-//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This describes the calling conventions for the Hexagon architectures.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Return Value Calling Conventions
-//===----------------------------------------------------------------------===//
-
-// Hexagon 32-bit C return-value convention.
-def RetCC_Hexagon32 : CallingConv<[
- CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
- CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
-
- // Alternatively, they are assigned to the stack in 4-byte aligned units.
- CCAssignToStack<4, 4>
-]>;
-
-// Hexagon 32-bit C Calling convention.
-def CC_Hexagon32 : CallingConv<[
- // All arguments get passed in integer registers if there is space.
- CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
- CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
-
- // Alternatively, they are assigned to the stack in 4-byte aligned units.
- CCAssignToStack<4, 4>
-]>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 489da6b..b5b46f2 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -175,7 +175,8 @@ namespace {
None = 0,
Root = 0x01,
Internal = 0x02,
- Used = 0x04
+ Used = 0x04,
+ InBounds = 0x08
};
uint32_t Flags;
@@ -231,6 +232,11 @@ namespace {
OS << ',';
OS << "used";
}
+ if (GN.Flags & GepNode::InBounds) {
+ if (Comma)
+ OS << ',';
+ OS << "inbounds";
+ }
OS << "} ";
if (GN.Flags & GepNode::Root)
OS << "BaseVal:" << GN.BaseVal->getName() << '(' << GN.BaseVal << ')';
@@ -315,11 +321,8 @@ void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root,
// visited".
Order.push_back(Root);
- DomTreeNode *DTN = DT->getNode(Root);
- typedef GraphTraits<DomTreeNode*> GTN;
- typedef GTN::ChildIteratorType Iter;
- for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
- getBlockTraversalOrder((*I)->getBlock(), Order);
+ for (auto *DTN : children<DomTreeNode*>(DT->getNode(Root)))
+ getBlockTraversalOrder(DTN->getBlock(), Order);
}
bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
@@ -337,10 +340,11 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
DEBUG(dbgs() << "Visiting GEP: " << *GepI << '\n');
GepNode *N = new (*Mem) GepNode;
Value *PtrOp = GepI->getPointerOperand();
+ uint32_t InBounds = GepI->isInBounds() ? GepNode::InBounds : 0;
ValueToNodeMap::iterator F = NM.find(PtrOp);
if (F == NM.end()) {
N->BaseVal = PtrOp;
- N->Flags |= GepNode::Root;
+ N->Flags |= GepNode::Root | InBounds;
} else {
// If PtrOp was a GEP instruction, it must have already been processed.
// The ValueToNodeMap entry for it is the last gep node in the generated
@@ -376,7 +380,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI,
Value *Op = *OI;
GepNode *Nx = new (*Mem) GepNode;
Nx->Parent = PN; // Link Nx to the previous node.
- Nx->Flags |= GepNode::Internal;
+ Nx->Flags |= GepNode::Internal | InBounds;
Nx->PTy = PtrTy;
Nx->Idx = Op;
Nodes.push_back(Nx);
@@ -1084,7 +1088,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
GepNode *RN = NA[0];
assert((RN->Flags & GepNode::Root) && "Creating GEP for non-root");
- Value *NewInst = nullptr;
+ GetElementPtrInst *NewInst = nullptr;
Value *Input = RN->BaseVal;
Value **IdxList = new Value*[Num+1];
unsigned nax = 0;
@@ -1115,6 +1119,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At,
Type *InpTy = Input->getType();
Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType();
NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At);
+ NewInst->setIsInBounds(RN->Flags & GepNode::InBounds);
DEBUG(dbgs() << "new GEP: " << *NewInst << '\n');
Input = NewInst;
} while (nax <= Num);
@@ -1235,11 +1240,8 @@ void HexagonCommonGEP::removeDeadCode() {
for (unsigned i = 0; i < BO.size(); ++i) {
BasicBlock *B = cast<BasicBlock>(BO[i]);
- DomTreeNode *N = DT->getNode(B);
- typedef GraphTraits<DomTreeNode*> GTN;
- typedef GTN::ChildIteratorType Iter;
- for (Iter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
- BO.push_back((*I)->getBlock());
+ for (auto DTN : children<DomTreeNode*>(DT->getNode(B)))
+ BO.push_back(DTN->getBlock());
}
for (unsigned i = BO.size(); i > 0; --i) {
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 783b916..49ddd69 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -2244,6 +2244,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &BrI,
case Hexagon::J2_jumpfnew:
case Hexagon::J2_jumpfnewpt:
Negated = true;
+ LLVM_FALLTHROUGH;
case Hexagon::J2_jumpt:
case Hexagon::J2_jumptnew:
case Hexagon::J2_jumptnewpt:
@@ -2276,7 +2277,7 @@ Undetermined:
goto Undetermined;
uint32_t Props = PredC.properties();
- bool CTrue = false, CFalse = false;;
+ bool CTrue = false, CFalse = false;
if (Props & ConstantProperties::Zero)
CFalse = true;
else if (Props & ConstantProperties::NonZero)
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 3608099..6b4f534 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
-#include "llvm/PassSupport.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -22,6 +21,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/PassSupport.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -440,22 +440,28 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
// Put instructions that last defined integer or double registers into the
// map.
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !Op.isDef() || !Op.getReg())
- continue;
- unsigned Reg = Op.getReg();
- if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
- for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
- LastDef[*SubRegs] = &MI;
- }
- } else if (Hexagon::IntRegsRegClass.contains(Reg))
- LastDef[Reg] = &MI;
+ for (MachineOperand &Op : MI.operands()) {
+ if (Op.isReg()) {
+ if (!Op.isDef() || !Op.getReg())
+ continue;
+ unsigned Reg = Op.getReg();
+ if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ LastDef[*SubRegs] = &MI;
+ } else if (Hexagon::IntRegsRegClass.contains(Reg))
+ LastDef[Reg] = &MI;
+ } else if (Op.isRegMask()) {
+ for (unsigned Reg : Hexagon::IntRegsRegClass)
+ if (Op.clobbersPhysReg(Reg))
+ LastDef[Reg] = &MI;
+ }
}
}
}
bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
if (IsCombinesDisabled) return false;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.h b/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.h
new file mode 100644
index 0000000..1009aa3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.h
@@ -0,0 +1,10 @@
+//===--- HexagonDepArch.h -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+enum HexagonArchEnum { V4, V5, V55, V60, V62 };
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td
new file mode 100644
index 0000000..5b1d02c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepArch.td
@@ -0,0 +1,19 @@
+//===--- HexagonDepArch.td ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "V62", "Enable Hexagon V62 architecture">;
+def HasV62T : Predicate<"HST->hasV62TOps()">, AssemblerPredicate<"ArchV62">;
+def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Enable Hexagon V60 architecture">;
+def HasV60T : Predicate<"HST->hasV60TOps()">, AssemblerPredicate<"ArchV60">;
+def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Enable Hexagon V55 architecture">;
+def HasV55T : Predicate<"HST->hasV55TOps()">, AssemblerPredicate<"ArchV55">;
+def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Enable Hexagon V4 architecture">;
+def HasV4T : Predicate<"HST->hasV4TOps()">, AssemblerPredicate<"ArchV4">;
+def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Enable Hexagon V5 architecture">;
+def HasV5T : Predicate<"HST->hasV5TOps()">, AssemblerPredicate<"ArchV5">;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepDecoders.h b/contrib/llvm/lib/Target/Hexagon/HexagonDepDecoders.h
new file mode 100644
index 0000000..aa9787e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepDecoders.h
@@ -0,0 +1,64 @@
+//===--- HexagonDepDecoders.h ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<4>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<14>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<8>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<7>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<12>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<3>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<13>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<6>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<9>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<5>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<6>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
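
The operand names follow the sA_B convention: a signed immediate with A significant bits, scaled by 2^B. For the small forms the raw field therefore carries A+B bits, which is why s4_3 instantiates signedDecoder<7> and s6_3 instantiates signedDecoder<9>. The wide forms s29_3, s30_2 and s31_1 decode 14-, 13- and 12-bit fields, i.e. an 11-bit base field plus the scale bits, consistent with those immediates being constant-extendable. A sketch of the sign extension signedDecoder<N> is assumed to perform on the raw field (the real helper also attaches the operand to the MCInst and accounts for constant extenders):

    #include <cstdint>

    // Hypothetical standalone analogue: interpret the low N bits of the
    // raw field as a two's-complement value.
    template <unsigned N> static int64_t signExtendField(uint64_t Raw) {
      static_assert(N > 0 && N < 64, "unsupported field width");
      uint64_t Mask = (1ull << N) - 1;
      uint64_t Sign = 1ull << (N - 1);
      return (int64_t)(((Raw & Mask) ^ Sign) - Sign);
    }
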
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
new file mode 100644
index 0000000..1c17882
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -0,0 +1,1143 @@
+//===--- HexagonDepIICHVX.td ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def tc_0317c6ca : InstrItinClass;
+def tc_1b93bdc6 : InstrItinClass;
+def tc_2171ebae : InstrItinClass;
+def tc_28978789 : InstrItinClass;
+def tc_316c637c : InstrItinClass;
+def tc_354299ad : InstrItinClass;
+def tc_35e92f8e : InstrItinClass;
+def tc_38208312 : InstrItinClass;
+def tc_4105d6b5 : InstrItinClass;
+def tc_41f4b64e : InstrItinClass;
+def tc_41f99e1c : InstrItinClass;
+def tc_45453b98 : InstrItinClass;
+def tc_4e2a5159 : InstrItinClass;
+def tc_4fd8566e : InstrItinClass;
+def tc_51cd3aab : InstrItinClass;
+def tc_5a9fc4ec : InstrItinClass;
+def tc_5c120602 : InstrItinClass;
+def tc_5cbf490b : InstrItinClass;
+def tc_644584f8 : InstrItinClass;
+def tc_69b6dd20 : InstrItinClass;
+def tc_6b78cf13 : InstrItinClass;
+def tc_6fd9ad30 : InstrItinClass;
+def tc_71337255 : InstrItinClass;
+def tc_72ad7b54 : InstrItinClass;
+def tc_77a4c701 : InstrItinClass;
+def tc_7c3f55c4 : InstrItinClass;
+def tc_7e9f581b : InstrItinClass;
+def tc_7fa82b08 : InstrItinClass;
+def tc_7fa8b40f : InstrItinClass;
+def tc_85d237e3 : InstrItinClass;
+def tc_8b6a873f : InstrItinClass;
+def tc_908a4c8c : InstrItinClass;
+def tc_9311da3f : InstrItinClass;
+def tc_9777e6bf : InstrItinClass;
+def tc_97c165b9 : InstrItinClass;
+def tc_99093773 : InstrItinClass;
+def tc_9b9642a1 : InstrItinClass;
+def tc_9c267309 : InstrItinClass;
+def tc_a3127e12 : InstrItinClass;
+def tc_a4c9df3b : InstrItinClass;
+def tc_aedb9f9e : InstrItinClass;
+def tc_b06ab583 : InstrItinClass;
+def tc_b712833a : InstrItinClass;
+def tc_b77635b4 : InstrItinClass;
+def tc_bbaf280e : InstrItinClass;
+def tc_bf142ae2 : InstrItinClass;
+def tc_c00bf9c9 : InstrItinClass;
+def tc_c4b515c5 : InstrItinClass;
+def tc_cbf6d1dc : InstrItinClass;
+def tc_cedf314b : InstrItinClass;
+def tc_d2cb81ea : InstrItinClass;
+def tc_d5090f3e : InstrItinClass;
+def tc_d642eff3 : InstrItinClass;
+def tc_d725e5b0 : InstrItinClass;
+def tc_d7bea0ec : InstrItinClass;
+def tc_d98f4d63 : InstrItinClass;
+def tc_da979fb3 : InstrItinClass;
+def tc_db5b9e2f : InstrItinClass;
+def tc_e172d86a : InstrItinClass;
+def tc_e231aa4f : InstrItinClass;
+def tc_e3748cdf : InstrItinClass;
+def tc_e5053c8f : InstrItinClass;
+def tc_e6299d16 : InstrItinClass;
+def tc_eb669007 : InstrItinClass;
+def tc_eda67dcd : InstrItinClass;
+def tc_f3fc3f83 : InstrItinClass;
+
+class DepHVXItinV55 {
+ list<InstrItinData> DepHVXItinV55_list = [
+ InstrItinData <tc_0317c6ca, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1b93bdc6, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_2171ebae, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_28978789, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_316c637c, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_354299ad, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_35e92f8e, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38208312, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4105d6b5, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_41f4b64e, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_41f99e1c, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45453b98, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_4e2a5159, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4fd8566e, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_51cd3aab, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a9fc4ec, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_5c120602, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5cbf490b, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_644584f8, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_69b6dd20, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6b78cf13, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fd9ad30, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_71337255, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_72ad7b54, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_77a4c701, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c3f55c4, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7e9f581b, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7fa82b08, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7fa8b40f, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_85d237e3, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_8b6a873f, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_908a4c8c, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9311da3f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9777e6bf, /*SLOT0,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+ [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_97c165b9, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_99093773, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+ [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9b9642a1, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9c267309, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3127e12, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a4c9df3b, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_aedb9f9e, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b06ab583, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b712833a, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b77635b4, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_bbaf280e, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_bf142ae2, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c00bf9c9, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c4b515c5, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cbf6d1dc, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cedf314b, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_d2cb81ea, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d5090f3e, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d642eff3, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d725e5b0, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7bea0ec, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d98f4d63, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_da979fb3, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db5b9e2f, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e172d86a, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e231aa4f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3748cdf, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e5053c8f, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [],
+ []>,
+
+ InstrItinData <tc_e6299d16, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_eb669007, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eda67dcd, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_f3fc3f83, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>
+ ];
+}
+
+class DepHVXItinV60 {
+ list<InstrItinData> DepHVXItinV60_list = [
+ InstrItinData <tc_0317c6ca, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1b93bdc6, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_2171ebae, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_28978789, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_316c637c, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_354299ad, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_35e92f8e, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38208312, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4105d6b5, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_41f4b64e, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_41f99e1c, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45453b98, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_4e2a5159, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4fd8566e, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_51cd3aab, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a9fc4ec, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_5c120602, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5cbf490b, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_644584f8, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_69b6dd20, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6b78cf13, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fd9ad30, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_71337255, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_72ad7b54, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_77a4c701, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c3f55c4, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7e9f581b, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7fa82b08, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7fa8b40f, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_85d237e3, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_8b6a873f, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_908a4c8c, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9311da3f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9777e6bf, /*SLOT0,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+ [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_97c165b9, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_99093773, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+ [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9b9642a1, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9c267309, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3127e12, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a4c9df3b, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_aedb9f9e, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b06ab583, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b712833a, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b77635b4, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_bbaf280e, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_bf142ae2, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c00bf9c9, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c4b515c5, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cbf6d1dc, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cedf314b, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_d2cb81ea, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d5090f3e, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d642eff3, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d725e5b0, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7bea0ec, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d98f4d63, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_da979fb3, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db5b9e2f, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e172d86a, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e231aa4f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3748cdf, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e5053c8f, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [],
+ []>,
+
+ InstrItinData <tc_e6299d16, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_eb669007, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eda67dcd, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_f3fc3f83, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>
+ ];
+}
+
+class DepHVXItinV62 {
+ list<InstrItinData> DepHVXItinV62_list = [
+ InstrItinData <tc_0317c6ca, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1b93bdc6, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_2171ebae, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_28978789, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_316c637c, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_354299ad, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_35e92f8e, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38208312, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4105d6b5, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_41f4b64e, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_41f99e1c, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45453b98, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_4e2a5159, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4fd8566e, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_51cd3aab, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a9fc4ec, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_5c120602, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5cbf490b, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_644584f8, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_69b6dd20, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6b78cf13, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fd9ad30, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_71337255, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_72ad7b54, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_77a4c701, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c3f55c4, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7e9f581b, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7fa82b08, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7fa8b40f, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_85d237e3, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_8b6a873f, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_908a4c8c, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9311da3f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9777e6bf, /*SLOT0,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+ [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_97c165b9, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_99093773, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+ [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9b9642a1, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9c267309, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3127e12, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a4c9df3b, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_aedb9f9e, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b06ab583, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b712833a, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b77635b4, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_bbaf280e, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_bf142ae2, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c00bf9c9, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c4b515c5, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cbf6d1dc, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cedf314b, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_d2cb81ea, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d5090f3e, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d642eff3, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d725e5b0, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7bea0ec, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_d98f4d63, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_da979fb3, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db5b9e2f, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e172d86a, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e231aa4f, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3748cdf, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e5053c8f, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [],
+ []>,
+
+ InstrItinData <tc_e6299d16, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_eb669007, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eda67dcd, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_f3fc3f83, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>
+ ];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
new file mode 100644
index 0000000..261778b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -0,0 +1,2504 @@
+//===--- HexagonDepIICScalar.td -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def tc_049dfb74 : InstrItinClass;
+def tc_0767081f : InstrItinClass;
+def tc_07ac815d : InstrItinClass;
+def tc_090485bb : InstrItinClass;
+def tc_09c86199 : InstrItinClass;
+def tc_09faec3b : InstrItinClass;
+def tc_0cb867f2 : InstrItinClass;
+def tc_1000eb10 : InstrItinClass;
+def tc_128719e8 : InstrItinClass;
+def tc_136c4786 : InstrItinClass;
+def tc_14da557c : InstrItinClass;
+def tc_1b6011fb : InstrItinClass;
+def tc_1b834fe7 : InstrItinClass;
+def tc_1e062b18 : InstrItinClass;
+def tc_1e69aa99 : InstrItinClass;
+def tc_1f9668cc : InstrItinClass;
+def tc_1fe8323c : InstrItinClass;
+def tc_20a8e109 : InstrItinClass;
+def tc_210b2456 : InstrItinClass;
+def tc_251c87b2 : InstrItinClass;
+def tc_261d9b78 : InstrItinClass;
+def tc_28d296df : InstrItinClass;
+def tc_29c14515 : InstrItinClass;
+def tc_2aaab1e0 : InstrItinClass;
+def tc_2c8fe5ae : InstrItinClass;
+def tc_2d1e6f5c : InstrItinClass;
+def tc_2e55aa16 : InstrItinClass;
+def tc_30665cb0 : InstrItinClass;
+def tc_336e698c : InstrItinClass;
+def tc_34e882a4 : InstrItinClass;
+def tc_35fb9d13 : InstrItinClass;
+def tc_37326008 : InstrItinClass;
+def tc_3993c58b : InstrItinClass;
+def tc_3b4892c6 : InstrItinClass;
+def tc_3bea1824 : InstrItinClass;
+def tc_3c10f809 : InstrItinClass;
+def tc_3d905451 : InstrItinClass;
+def tc_3e61d314 : InstrItinClass;
+def tc_3eab77bd : InstrItinClass;
+def tc_43068634 : InstrItinClass;
+def tc_45631a8d : InstrItinClass;
+def tc_47ab9233 : InstrItinClass;
+def tc_47f0b7ad : InstrItinClass;
+def tc_485bb57c : InstrItinClass;
+def tc_4997da4a : InstrItinClass;
+def tc_511f28f6 : InstrItinClass;
+def tc_537e2013 : InstrItinClass;
+def tc_53ee6546 : InstrItinClass;
+def tc_548f402d : InstrItinClass;
+def tc_5625c6c1 : InstrItinClass;
+def tc_580a779c : InstrItinClass;
+def tc_583510c7 : InstrItinClass;
+def tc_5d806107 : InstrItinClass;
+def tc_5fa2857c : InstrItinClass;
+def tc_5fe9fcd0 : InstrItinClass;
+def tc_6264c5e0 : InstrItinClass;
+def tc_639d93ee : InstrItinClass;
+def tc_63cd9d2d : InstrItinClass;
+def tc_65dc7cc4 : InstrItinClass;
+def tc_69bb508b : InstrItinClass;
+def tc_6c52d277 : InstrItinClass;
+def tc_6c576d46 : InstrItinClass;
+def tc_70cabf66 : InstrItinClass;
+def tc_7639d4b0 : InstrItinClass;
+def tc_7675c0e9 : InstrItinClass;
+def tc_76c4c5ef : InstrItinClass;
+def tc_77781686 : InstrItinClass;
+def tc_78b3c689 : InstrItinClass;
+def tc_7986ba30 : InstrItinClass;
+def tc_7bc567a7 : InstrItinClass;
+def tc_7c2dcd4d : InstrItinClass;
+def tc_7ca2ea10 : InstrItinClass;
+def tc_7d01cbdc : InstrItinClass;
+def tc_7d9a56cd : InstrItinClass;
+def tc_81a23d44 : InstrItinClass;
+def tc_821c4233 : InstrItinClass;
+def tc_82f0f122 : InstrItinClass;
+def tc_84630363 : InstrItinClass;
+def tc_86442910 : InstrItinClass;
+def tc_87601822 : InstrItinClass;
+def tc_88fa2da6 : InstrItinClass;
+def tc_8c8041e6 : InstrItinClass;
+def tc_8cb685d9 : InstrItinClass;
+def tc_8def9c57 : InstrItinClass;
+def tc_8f0a6bad : InstrItinClass;
+def tc_8fab9ac3 : InstrItinClass;
+def tc_92d1833c : InstrItinClass;
+def tc_94e6ffd9 : InstrItinClass;
+def tc_95c54f8b : InstrItinClass;
+def tc_9a13af9d : InstrItinClass;
+def tc_9b73d261 : InstrItinClass;
+def tc_9c18c9a5 : InstrItinClass;
+def tc_9c68db63 : InstrItinClass;
+def tc_9ce7a5ab : InstrItinClass;
+def tc_9da3628f : InstrItinClass;
+def tc_9dafb7d3 : InstrItinClass;
+def tc_9df8b0dc : InstrItinClass;
+def tc_9e86015f : InstrItinClass;
+def tc_9f518242 : InstrItinClass;
+def tc_a12a5971 : InstrItinClass;
+def tc_a1fb80e1 : InstrItinClass;
+def tc_a333d2a9 : InstrItinClass;
+def tc_a4567c39 : InstrItinClass;
+def tc_a87879e8 : InstrItinClass;
+def tc_a9c993d9 : InstrItinClass;
+def tc_aad55963 : InstrItinClass;
+def tc_ab1b5e74 : InstrItinClass;
+def tc_ae0722f7 : InstrItinClass;
+def tc_ae2c2dc2 : InstrItinClass;
+def tc_ae762521 : InstrItinClass;
+def tc_b08b653e : InstrItinClass;
+def tc_b08be45e : InstrItinClass;
+def tc_b0f50e3c : InstrItinClass;
+def tc_b189ad4c : InstrItinClass;
+def tc_b324366f : InstrItinClass;
+def tc_b5bfaa60 : InstrItinClass;
+def tc_b5f5a094 : InstrItinClass;
+def tc_b86c7e8b : InstrItinClass;
+def tc_baccf077 : InstrItinClass;
+def tc_bc5561d8 : InstrItinClass;
+def tc_bcf0e36e : InstrItinClass;
+def tc_bd16579e : InstrItinClass;
+def tc_be995eaf : InstrItinClass;
+def tc_bf6fa601 : InstrItinClass;
+def tc_c0cd91a8 : InstrItinClass;
+def tc_c14739d5 : InstrItinClass;
+def tc_c1dbc916 : InstrItinClass;
+def tc_c58f771a : InstrItinClass;
+def tc_c85212ca : InstrItinClass;
+def tc_c8f9a6f6 : InstrItinClass;
+def tc_ca280e8b : InstrItinClass;
+def tc_cbe45117 : InstrItinClass;
+def tc_cd321066 : InstrItinClass;
+def tc_d108a090 : InstrItinClass;
+def tc_d1b5a4b6 : InstrItinClass;
+def tc_d2609065 : InstrItinClass;
+def tc_d267fa19 : InstrItinClass;
+def tc_d2a33af5 : InstrItinClass;
+def tc_d63b71d1 : InstrItinClass;
+def tc_d6a805a8 : InstrItinClass;
+def tc_d95f4e98 : InstrItinClass;
+def tc_da79106e : InstrItinClass;
+def tc_dbe218dd : InstrItinClass;
+def tc_dcfee7ae : InstrItinClass;
+def tc_e17ce9ad : InstrItinClass;
+def tc_e2480a7f : InstrItinClass;
+def tc_e2c08bb4 : InstrItinClass;
+def tc_e2c31426 : InstrItinClass;
+def tc_e578178f : InstrItinClass;
+def tc_e836c161 : InstrItinClass;
+def tc_e8c7a357 : InstrItinClass;
+def tc_eb07ef6f : InstrItinClass;
+def tc_ecfaae86 : InstrItinClass;
+def tc_ef0ebaaa : InstrItinClass;
+def tc_ef2676fd : InstrItinClass;
+def tc_f027ebe9 : InstrItinClass;
+def tc_f055fbb6 : InstrItinClass;
+def tc_f1240c08 : InstrItinClass;
+def tc_f16d5b17 : InstrItinClass;
+def tc_f1aa2cdb : InstrItinClass;
+def tc_f26aa619 : InstrItinClass;
+def tc_f4608adc : InstrItinClass;
+def tc_faab1248 : InstrItinClass;
+def tc_fcee8723 : InstrItinClass;
+def tc_feb4974b : InstrItinClass;
+
+class DepScalarItinV4 {
+ list<InstrItinData> DepScalarItinV4_list = [
+ InstrItinData <tc_049dfb74, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_0767081f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_07ac815d, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_090485bb, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_09c86199, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_09faec3b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_0cb867f2, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_1000eb10, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_128719e8, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_136c4786, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_14da557c, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_1b6011fb, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1b834fe7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1e062b18, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1e69aa99, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_1f9668cc, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_1fe8323c, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_20a8e109, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_210b2456, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_251c87b2, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_261d9b78, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_28d296df, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_29c14515, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_2aaab1e0, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_2c8fe5ae, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_2d1e6f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_2e55aa16, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_30665cb0, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_336e698c, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_34e882a4, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_35fb9d13, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_37326008, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_3993c58b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_3b4892c6, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_3bea1824, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_3c10f809, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_3d905451, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_3e61d314, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_3eab77bd, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_43068634, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_45631a8d, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_47ab9233, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_47f0b7ad, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_485bb57c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_4997da4a, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_511f28f6, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_537e2013, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_53ee6546, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_548f402d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5625c6c1, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_580a779c, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_583510c7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5d806107, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5fa2857c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5fe9fcd0, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6264c5e0, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_639d93ee, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_63cd9d2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_65dc7cc4, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_69bb508b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6c52d277, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_6c576d46, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_70cabf66, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7639d4b0, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7675c0e9, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_76c4c5ef, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_77781686, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_78b3c689, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7986ba30, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_7bc567a7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7c2dcd4d, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7ca2ea10, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7d01cbdc, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_7d9a56cd, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_81a23d44, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_821c4233, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_82f0f122, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_84630363, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_86442910, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_87601822, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_88fa2da6, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8c8041e6, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8cb685d9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8def9c57, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_8f0a6bad, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_8fab9ac3, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_92d1833c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_94e6ffd9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_95c54f8b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9a13af9d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9b73d261, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_9c18c9a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9c68db63, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9ce7a5ab, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9da3628f, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9dafb7d3, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_9df8b0dc, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9e86015f, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9f518242, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a12a5971, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a1fb80e1, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_a333d2a9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a4567c39, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_a87879e8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a9c993d9, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_aad55963, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ab1b5e74, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ae0722f7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ae2c2dc2, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ae762521, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_b08b653e, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_b08be45e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_b0f50e3c, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_b189ad4c, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_b324366f, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_b5bfaa60, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_b5f5a094, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_b86c7e8b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_baccf077, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_bc5561d8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_bcf0e36e, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_bd16579e, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_be995eaf, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_bf6fa601, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c0cd91a8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_c14739d5, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c1dbc916, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c58f771a, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_c85212ca, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c8f9a6f6, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_ca280e8b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_cbe45117, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_cd321066, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d108a090, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d1b5a4b6, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d2609065, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_d267fa19, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_d2a33af5, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_d63b71d1, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d6a805a8, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_d95f4e98, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_da79106e, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_dbe218dd, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_dcfee7ae, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_e17ce9ad, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e2480a7f, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_e2c08bb4, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e2c31426, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e578178f, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_e836c161, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e8c7a357, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_eb07ef6f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ecfaae86, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_ef0ebaaa, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_ef2676fd, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_f027ebe9, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_f055fbb6, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_f1240c08, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f16d5b17, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f1aa2cdb, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f26aa619, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_f4608adc, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_faab1248, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_fcee8723, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_feb4974b, [InstrStage<1, [SLOT3]>]> ];
+}
+
+class DepScalarItinV5 {
+ list<InstrItinData> DepScalarItinV5_list = [
+ InstrItinData <tc_049dfb74, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_0767081f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_07ac815d, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_090485bb, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_09c86199, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_09faec3b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_0cb867f2, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_1000eb10, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_128719e8, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_136c4786, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_14da557c, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_1b6011fb, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1b834fe7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1e062b18, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1e69aa99, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_1f9668cc, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_1fe8323c, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_20a8e109, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_210b2456, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_251c87b2, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_261d9b78, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_28d296df, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_29c14515, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_2aaab1e0, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_2c8fe5ae, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_2d1e6f5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_2e55aa16, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_30665cb0, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_336e698c, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_34e882a4, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_35fb9d13, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_37326008, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_3993c58b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_3b4892c6, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_3bea1824, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_3c10f809, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_3d905451, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_3e61d314, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_3eab77bd, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_43068634, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_45631a8d, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_47ab9233, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_47f0b7ad, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_485bb57c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_4997da4a, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_511f28f6, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_537e2013, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_53ee6546, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_548f402d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5625c6c1, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_580a779c, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_583510c7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5d806107, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5fa2857c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5fe9fcd0, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6264c5e0, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_639d93ee, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_63cd9d2d, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_65dc7cc4, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_69bb508b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6c52d277, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_6c576d46, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_70cabf66, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7639d4b0, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7675c0e9, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_76c4c5ef, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_77781686, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_78b3c689, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7986ba30, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_7bc567a7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7c2dcd4d, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7ca2ea10, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7d01cbdc, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_7d9a56cd, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_81a23d44, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_821c4233, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_82f0f122, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_84630363, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_86442910, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_87601822, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_88fa2da6, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8c8041e6, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8cb685d9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8def9c57, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_8f0a6bad, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_8fab9ac3, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_92d1833c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_94e6ffd9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_95c54f8b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9a13af9d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9b73d261, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_9c18c9a5, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9c68db63, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9ce7a5ab, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9da3628f, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9dafb7d3, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_9df8b0dc, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9e86015f, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9f518242, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a12a5971, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a1fb80e1, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_a333d2a9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a4567c39, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_a87879e8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a9c993d9, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_aad55963, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ab1b5e74, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ae0722f7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ae2c2dc2, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ae762521, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_b08b653e, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_b08be45e, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_b0f50e3c, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_b189ad4c, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_b324366f, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_b5bfaa60, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_b5f5a094, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_b86c7e8b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_baccf077, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_bc5561d8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_bcf0e36e, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_bd16579e, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_be995eaf, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_bf6fa601, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c0cd91a8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_c14739d5, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c1dbc916, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c58f771a, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_c85212ca, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c8f9a6f6, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_ca280e8b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_cbe45117, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_cd321066, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d108a090, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d1b5a4b6, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d2609065, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_d267fa19, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_d2a33af5, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_d63b71d1, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d6a805a8, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_d95f4e98, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_da79106e, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_dbe218dd, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_dcfee7ae, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_e17ce9ad, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e2480a7f, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_e2c08bb4, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e2c31426, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e578178f, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_e836c161, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e8c7a357, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_eb07ef6f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ecfaae86, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_ef0ebaaa, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_ef2676fd, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_f027ebe9, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_f055fbb6, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_f1240c08, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f16d5b17, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f1aa2cdb, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f26aa619, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_f4608adc, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_faab1248, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_fcee8723, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_feb4974b, [InstrStage<1, [SLOT3]>]> ];
+}
+
+class DepScalarItinV55 {
+ list<InstrItinData> DepScalarItinV55_list = [
+ InstrItinData <tc_049dfb74, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0767081f, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_07ac815d, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_090485bb, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_09c86199, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_09faec3b, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0cb867f2, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1000eb10, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_128719e8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_136c4786, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14da557c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1b6011fb, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1b834fe7, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e062b18, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e69aa99, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1f9668cc, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fe8323c, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20a8e109, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_210b2456, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_251c87b2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_261d9b78, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28d296df, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_29c14515, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2aaab1e0, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c8fe5ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2d1e6f5c, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2e55aa16, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_30665cb0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_336e698c, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_34e882a4, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_35fb9d13, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_37326008, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3993c58b, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3b4892c6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3bea1824, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c10f809, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d905451, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3e61d314, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3eab77bd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_43068634, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45631a8d, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_47ab9233, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_47f0b7ad, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_485bb57c, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4997da4a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_511f28f6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_537e2013, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_53ee6546, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_548f402d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5625c6c1, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_580a779c, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_583510c7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5d806107, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5fa2857c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5fe9fcd0, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6264c5e0, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_639d93ee, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63cd9d2d, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65dc7cc4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_69bb508b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6c52d277, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6c576d46, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_70cabf66, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7639d4b0, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7675c0e9, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76c4c5ef, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77781686, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_78b3c689, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7986ba30, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7bc567a7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c2dcd4d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_7ca2ea10, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d01cbdc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d9a56cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_81a23d44, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_821c4233, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_82f0f122, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84630363, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86442910, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_87601822, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_88fa2da6, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8c8041e6, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8cb685d9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8def9c57, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8f0a6bad, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8fab9ac3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92d1833c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_94e6ffd9, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95c54f8b, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_9a13af9d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b73d261, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c18c9a5, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c68db63, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9ce7a5ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9da3628f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9dafb7d3, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9df8b0dc, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e86015f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f518242, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a12a5971, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1fb80e1, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a333d2a9, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_a4567c39, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a87879e8, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9c993d9, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aad55963, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ab1b5e74, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae0722f7, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae2c2dc2, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae762521, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b08b653e, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b08be45e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b0f50e3c, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b189ad4c, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b324366f, /*tc_2early*/
+ [InstrStage<1, [SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b5bfaa60, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b5f5a094, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b86c7e8b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_baccf077, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bc5561d8, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bcf0e36e, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_bd16579e, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_be995eaf, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf6fa601, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c0cd91a8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c14739d5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c1dbc916, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c58f771a, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c85212ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c8f9a6f6, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ca280e8b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cbe45117, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_cd321066, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d108a090, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d1b5a4b6, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d2609065, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d267fa19, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_d2a33af5, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d63b71d1, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d6a805a8, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d95f4e98, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_da79106e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dbe218dd, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dcfee7ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e17ce9ad, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2480a7f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2c08bb4, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2c31426, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_e578178f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e836c161, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e8c7a357, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eb07ef6f, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ecfaae86, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_ef0ebaaa, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef2676fd, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_f027ebe9, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f055fbb6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1240c08, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f16d5b17, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1aa2cdb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f26aa619, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f4608adc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_faab1248, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fcee8723, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_feb4974b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>
+ ];
+}
+
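+// Scalar instruction itineraries for the Hexagon V60 core. Each
+// InstrItinData entry gives an itinerary class, the packet slots that can
+// issue it, the cycle count for each operand, and the forwarding paths
+// (Hex_FWD) between producing and consuming instructions.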
+class DepScalarItinV60 {
+ list<InstrItinData> DepScalarItinV60_list = [
+ InstrItinData <tc_049dfb74, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0767081f, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_07ac815d, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_090485bb, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_09c86199, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_09faec3b, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0cb867f2, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1000eb10, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_128719e8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_136c4786, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14da557c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1b6011fb, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1b834fe7, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e062b18, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e69aa99, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1f9668cc, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fe8323c, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20a8e109, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_210b2456, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_251c87b2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_261d9b78, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28d296df, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_29c14515, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2aaab1e0, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c8fe5ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2d1e6f5c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2e55aa16, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_30665cb0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_336e698c, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_34e882a4, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_35fb9d13, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_37326008, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3993c58b, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3b4892c6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3bea1824, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c10f809, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d905451, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3e61d314, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3eab77bd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_43068634, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45631a8d, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_47ab9233, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_47f0b7ad, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_485bb57c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4997da4a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_511f28f6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_537e2013, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_53ee6546, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_548f402d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5625c6c1, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_580a779c, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_583510c7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5d806107, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5fa2857c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5fe9fcd0, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6264c5e0, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_639d93ee, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63cd9d2d, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65dc7cc4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_69bb508b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6c52d277, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6c576d46, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_70cabf66, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7639d4b0, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7675c0e9, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76c4c5ef, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77781686, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_78b3c689, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7986ba30, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7bc567a7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c2dcd4d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_7ca2ea10, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d01cbdc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d9a56cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_81a23d44, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_821c4233, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_82f0f122, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84630363, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86442910, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_87601822, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_88fa2da6, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8c8041e6, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8cb685d9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8def9c57, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8f0a6bad, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8fab9ac3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92d1833c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_94e6ffd9, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95c54f8b, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_9a13af9d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b73d261, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c18c9a5, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c68db63, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9ce7a5ab, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9da3628f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9dafb7d3, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9df8b0dc, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e86015f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f518242, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a12a5971, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1fb80e1, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a333d2a9, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_a4567c39, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a87879e8, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9c993d9, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aad55963, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ab1b5e74, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae0722f7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae2c2dc2, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae762521, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b08b653e, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b08be45e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b0f50e3c, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b189ad4c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b324366f, /*tc_2early*/
+ [InstrStage<1, [SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b5bfaa60, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b5f5a094, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b86c7e8b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_baccf077, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bc5561d8, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bcf0e36e, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_bd16579e, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_be995eaf, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf6fa601, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c0cd91a8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c14739d5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c1dbc916, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c58f771a, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c85212ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c8f9a6f6, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ca280e8b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cbe45117, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_cd321066, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d108a090, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d1b5a4b6, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d2609065, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d267fa19, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_d2a33af5, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d63b71d1, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d6a805a8, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d95f4e98, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_da79106e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dbe218dd, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dcfee7ae, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e17ce9ad, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2480a7f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2c08bb4, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2c31426, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_e578178f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e836c161, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e8c7a357, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eb07ef6f, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ecfaae86, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_ef0ebaaa, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef2676fd, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_f027ebe9, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f055fbb6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1240c08, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f16d5b17, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1aa2cdb, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f26aa619, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f4608adc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_faab1248, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fcee8723, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_feb4974b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>
+ ];
+}
+
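+// Scalar instruction itineraries for the Hexagon V62 core: the same
+// itinerary classes as V60, with slot assignments and operand latencies
+// adjusted for this core revision (compare, e.g., tc_0767081f and
+// tc_1b834fe7 against their V60 entries above).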
+class DepScalarItinV62 {
+ list<InstrItinData> DepScalarItinV62_list = [
+ InstrItinData <tc_049dfb74, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0767081f, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_07ac815d, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_090485bb, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_09c86199, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_09faec3b, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0cb867f2, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1000eb10, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_128719e8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_136c4786, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14da557c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1b6011fb, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1b834fe7, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e062b18, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e69aa99, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1f9668cc, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fe8323c, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20a8e109, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_210b2456, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_251c87b2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_261d9b78, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28d296df, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_29c14515, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2aaab1e0, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c8fe5ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2d1e6f5c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2e55aa16, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_30665cb0, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_336e698c, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_34e882a4, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_35fb9d13, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_37326008, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3993c58b, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3b4892c6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3bea1824, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c10f809, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d905451, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3e61d314, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3eab77bd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_43068634, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45631a8d, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_47ab9233, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_47f0b7ad, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_485bb57c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4997da4a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_511f28f6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_537e2013, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_53ee6546, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_548f402d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5625c6c1, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_580a779c, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_583510c7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5d806107, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5fa2857c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5fe9fcd0, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6264c5e0, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_639d93ee, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63cd9d2d, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65dc7cc4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_69bb508b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6c52d277, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6c576d46, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_70cabf66, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7639d4b0, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7675c0e9, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76c4c5ef, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77781686, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_78b3c689, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7986ba30, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7bc567a7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c2dcd4d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_7ca2ea10, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d01cbdc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7d9a56cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_81a23d44, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_821c4233, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_82f0f122, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84630363, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86442910, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_87601822, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_88fa2da6, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8c8041e6, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8cb685d9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8def9c57, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8f0a6bad, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8fab9ac3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92d1833c, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_94e6ffd9, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95c54f8b, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_9a13af9d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b73d261, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c18c9a5, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c68db63, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9ce7a5ab, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9da3628f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9dafb7d3, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9df8b0dc, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e86015f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f518242, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a12a5971, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1fb80e1, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a333d2a9, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_a4567c39, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a87879e8, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9c993d9, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aad55963, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ab1b5e74, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae0722f7, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae2c2dc2, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae762521, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b08b653e, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b08be45e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b0f50e3c, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b189ad4c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b324366f, /*tc_2early*/
+ [InstrStage<1, [SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b5bfaa60, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b5f5a094, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b86c7e8b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_baccf077, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bc5561d8, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bcf0e36e, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_bd16579e, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_be995eaf, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf6fa601, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c0cd91a8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c14739d5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c1dbc916, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c58f771a, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c85212ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c8f9a6f6, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ca280e8b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cbe45117, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_cd321066, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d108a090, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d1b5a4b6, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d2609065, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d267fa19, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_d2a33af5, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d63b71d1, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d6a805a8, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d95f4e98, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_da79106e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dbe218dd, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_dcfee7ae, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e17ce9ad, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2480a7f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2c08bb4, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e2c31426, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_e578178f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e836c161, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e8c7a357, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eb07ef6f, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ecfaae86, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_ef0ebaaa, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef2676fd, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_f027ebe9, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f055fbb6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1240c08, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f16d5b17, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f1aa2cdb, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f26aa619, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f4608adc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_faab1248, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fcee8723, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_feb4974b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>
+ ];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.h b/contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.h
new file mode 100644
index 0000000..be831b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.h
@@ -0,0 +1,52 @@
+//===--- HexagonDepITypes.h -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
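+// Machine instruction type (IType) values for the Hexagon backend. The
+// numbering is discontiguous and mirrors the TableGen IType definitions in
+// HexagonDepITypes.td.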
+namespace llvm {
+namespace HexagonII {
+enum Type {
+ TypeALU32_2op = 0,
+ TypeALU32_3op = 1,
+ TypeALU32_ADDI = 2,
+ TypeALU64 = 3,
+ TypeCJ = 4,
+ TypeCR = 6,
+ TypeCVI_HIST = 10,
+ TypeCVI_VA = 16,
+ TypeCVI_VA_DV = 17,
+ TypeCVI_VINLANESAT = 18,
+ TypeCVI_VM_LD = 19,
+ TypeCVI_VM_NEW_ST = 20,
+ TypeCVI_VM_ST = 21,
+ TypeCVI_VM_STU = 22,
+ TypeCVI_VM_TMP_LD = 23,
+ TypeCVI_VM_VP_LDU = 24,
+ TypeCVI_VP = 25,
+ TypeCVI_VP_VS = 26,
+ TypeCVI_VS = 27,
+ TypeCVI_VX = 29,
+ TypeCVI_VX_DV = 30,
+ TypeCVI_VX_LATE = 31,
+ TypeDUPLEX = 33,
+ TypeENDLOOP = 34,
+ TypeEXTENDER = 35,
+ TypeJ = 36,
+ TypeLD = 37,
+ TypeM = 38,
+ TypeMAPPING = 39,
+ TypeNCJ = 40,
+ TypePSEUDO = 41,
+ TypeST = 42,
+ TypeSUBINSN = 43,
+ TypeS_2op = 44,
+ TypeS_3op = 45,
+ TypeV2LDST = 48,
+ TypeV4LDST = 49
+};
+}
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.td
new file mode 100644
index 0000000..ac1989e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepITypes.td
@@ -0,0 +1,47 @@
+//===--- HexagonDepITypes.td ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
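+// TableGen counterpart of the HexagonII::Type enum in HexagonDepITypes.h;
+// each IType wraps the 6-bit instruction-type encoding used by the
+// instruction definitions.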
+class IType<bits<6> t> { bits<6> Value = t; }
+def TypeALU32_2op : IType<0>;
+def TypeALU32_3op : IType<1>;
+def TypeALU32_ADDI : IType<2>;
+def TypeALU64 : IType<3>;
+def TypeCJ : IType<4>;
+def TypeCR : IType<6>;
+def TypeCVI_HIST : IType<10>;
+def TypeCVI_VA : IType<16>;
+def TypeCVI_VA_DV : IType<17>;
+def TypeCVI_VINLANESAT : IType<18>;
+def TypeCVI_VM_LD : IType<19>;
+def TypeCVI_VM_NEW_ST : IType<20>;
+def TypeCVI_VM_ST : IType<21>;
+def TypeCVI_VM_STU : IType<22>;
+def TypeCVI_VM_TMP_LD : IType<23>;
+def TypeCVI_VM_VP_LDU : IType<24>;
+def TypeCVI_VP : IType<25>;
+def TypeCVI_VP_VS : IType<26>;
+def TypeCVI_VS : IType<27>;
+def TypeCVI_VX : IType<29>;
+def TypeCVI_VX_DV : IType<30>;
+def TypeCVI_VX_LATE : IType<31>;
+def TypeDUPLEX : IType<33>;
+def TypeENDLOOP : IType<34>;
+def TypeEXTENDER : IType<35>;
+def TypeJ : IType<36>;
+def TypeLD : IType<37>;
+def TypeM : IType<38>;
+def TypeMAPPING : IType<39>;
+def TypeNCJ : IType<40>;
+def TypePSEUDO : IType<41>;
+def TypeST : IType<42>;
+def TypeSUBINSN : IType<43>;
+def TypeS_2op : IType<44>;
+def TypeS_3op : IType<45>;
+def TypeV2LDST : IType<48>;
+def TypeV4LDST : IType<49>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
new file mode 100644
index 0000000..1b24be4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -0,0 +1,3241 @@
+//===--- HexagonDepInstrFormats.td ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class Enc_890909 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_527412 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_efaed8 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
+}
+class Enc_a568d4 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_27b757 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_5de85f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_0e41fa : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_802dc0 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_6b197f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1f5d8f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_51436c : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{23-22} = Ii{15-14};
+ let Inst{13-0} = Ii{13-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_c7a204 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_db40cd : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_a1e29d : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_d15d19 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_e90a15 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{22-22} = n1{0-0};
+}
+class Enc_e0a47a : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_140c83 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_7eee72 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_d7dc10 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_736575 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{25-23} = n1{2-0};
+}
+class Enc_8dec2e : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_eaa9f8 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
+}
+class Enc_509701 : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-5} = Ii{11-3};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_830e5d : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <8> II;
+ let Inst{22-16} = II{7-1};
+ let Inst{13-13} = II{0-0};
+ bits <2> Pu4;
+ let Inst{24-23} = Pu4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_79b8c8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_58a8bf : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_041d7b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-23} = n1{3-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_f44229 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{13-13} = Ii{6-6};
+ let Inst{7-3} = Ii{5-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_aad80c : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_87c142 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_86a14b : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{7-3} = Ii{7-3};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_9a33d5 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_a56825 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9ea4cf : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_ee5ed0 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <2> n1;
+ let Inst{9-8} = n1{1-0};
+}
+class Enc_935d9b : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_61f0b0 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_bd6011 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_65d691 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_e8c45e : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{13-13} = Ii{6-6};
+ let Inst{7-3} = Ii{5-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_ca3887 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_a94f3b : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_625deb : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_1f5ba6 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_cd82bc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{21-21} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_399e12 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_d7a65e : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_607661 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6a5972 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{11-8} = Rt16{3-0};
+}
+class Enc_53dca9 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_27fd0e : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_93af4c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-4} = Ii{6-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
+}
+class Enc_5bdd42 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_71f1b4 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14640c : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-22} = n1{3-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_31db33 : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_65f095 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_784502 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6413b6 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-23} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_7a0ea6 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <1> n1;
+ let Inst{9-9} = n1{0-0};
+}
+class Enc_84bff1 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_74aef2 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_78e566 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_437f33 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_0527db : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
+}
+class Enc_420cf3 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_e39bb2 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{9-4} = Ii{5-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_1b64fb : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_c6220b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{2-0} = Nt8{2-0};
+}
+class Enc_322e1b : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{23-23} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_989021 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_178717 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-23} = n1{4-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_78cbf0 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_052c7d : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_fcf7a7 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_55355c : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_211aaa : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6185fe : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_cd4705 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_2ebe3b : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3d5b28 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_5ab2be : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_fef969 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_63eaeb : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{1-0} = Ii{1-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_95441f : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_372c9d : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4dff07 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_04c959 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_b62ef7 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2b518f : OpcodeHexagon {
+ bits <32> Ii;
+ let Inst{27-16} = Ii{31-20};
+ let Inst{13-0} = Ii{19-6};
+}
+class Enc_b388cf : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_ad1c74 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+}
+class Enc_74d4e5 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_c90aca : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_222336 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5e87ce : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{23-22} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_f7ea77 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_245865 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_88d4d9 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_c0cdde : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_226535 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_31aa6a : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_397f23 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_865390 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_98c0b8 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_bfbf03 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_ecbcc8 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_f5e933 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_3fc427 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_01d3d0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_b0e9d8 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_3694bd : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-22} = n1{1-0};
+}
+class Enc_a42857 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-22} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_b7fad3 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{9-8} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_223005 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9e4c3f : OpcodeHexagon {
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rd16;
+ let Inst{19-16} = Rd16{3-0};
+}
+class Enc_8b8d61 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_88c16c : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_770858 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_bd811a : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Cd32;
+ let Inst{4-0} = Cd32{4-0};
+}
+class Enc_b05839 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_bc03e5 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_412ff0 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rxx32;
+ let Inst{12-8} = Rxx32{4-0};
+}
+class Enc_c9a18e : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_be32a5 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_e6abcf : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_6339d5 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_d6990d : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_6c9440 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_0d8adb : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_50e578 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_1cf4ca : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_48b75f : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_b97f71 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9d1247 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_f4413a : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_f7430e : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+}
+class Enc_e7581c : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_2301d6 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_c31910 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{23-21} = Ii{7-5};
+ let Inst{13-13} = Ii{4-4};
+ let Inst{7-5} = Ii{3-1};
+ let Inst{3-3} = Ii{0-0};
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2f2f04 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_8d8a30 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_2d7491 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_a803e0 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_45364e : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_b909d2 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <7> n1;
+ let Inst{28-28} = n1{6-6};
+ let Inst{25-22} = n1{5-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_e6c957 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_fa3ba4 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-5} = Ii{11-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_0d8870 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_9fae8a : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_18c338 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <8> II;
+ let Inst{22-16} = II{7-1};
+ let Inst{13-13} = II{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_5ccba9 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_0ed752 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Cdd32;
+ let Inst{4-0} = Cdd32{4-0};
+}
+class Enc_143445 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_3a3d62 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3e3989 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_152467 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_daea09 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{23-22} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+}
+class Enc_f37377 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_a198f6 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-5} = Ii{6-1};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_3dac0b : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_e38e1f : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_f8ecf9 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{20-16} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_7f1a05 : OpcodeHexagon {
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ry32;
+ let Inst{12-8} = Ry32{4-0};
+}
+class Enc_2df31d : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{9-4} = Ii{7-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_25bef0 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_f82302 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{23-23} = n1{0-0};
+}
+class Enc_83ee64 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_adf111 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
+}
+class Enc_46c951 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_5d6c34 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_4df4e9 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_91b9fe : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_a7b8e8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2b3f60 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <2> Px4;
+ let Inst{6-5} = Px4{1-0};
+}
+class Enc_bd1cbc : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_a30110 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_f3f408 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_690862 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_2a3787 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_d5c73f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3f97c8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_d50cd3 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_729ff7 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_217147 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_b9c5fb : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_f394d3 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_0cb018 : OpcodeHexagon {
+ bits <5> Cs32;
+ let Inst{20-16} = Cs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_541f26 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_724154 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_179b35 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_585242 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-3} = Ii{4-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_cf1927 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b84c4c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <6> II;
+ let Inst{23-21} = II{5-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9ac432 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Pu4;
+ let Inst{7-6} = Pu4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8203bb : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_e66a97 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_8c2412 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_284ebb : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_733b27 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_22c845 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9b0bc1 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_ea4c54 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_b72622 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_569cfe : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_96ce4f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_143a3c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <6> II;
+ let Inst{23-21} = II{5-3};
+ let Inst{7-5} = II{2-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_57a33e : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-3} = Ii{7-3};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_311abd : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_a1640c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_de0214 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a90628 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_fda92c : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_831a7d : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_11a146 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_b15941 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b78edd : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_a27588 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_2a7b91 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_b43b67 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <2> Qx4;
+ let Inst{6-5} = Qx4{1-0};
+}
+class Enc_4aca3a : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <3> n1;
+ let Inst{29-29} = n1{2-2};
+ let Inst{26-25} = n1{1-0};
+}
+class Enc_b38ffc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_cda00a : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{19-16} = Ii{11-8};
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2fbf3c : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_70b24b : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2ae154 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_50b5ac : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_2ea740 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_08d755 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_1178da : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_8dbe85 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5a18b3 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{22-22} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_14d27a : OpcodeHexagon {
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+}
+class Enc_a05677 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_f0cca7 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <6> II;
+ let Inst{20-16} = II{5-1};
+ let Inst{13-13} = II{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_500cb0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_7e5a82 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_12b6e9 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_6f70ca : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{8-4} = Ii{7-3};
+}
+class Enc_7222b7 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_e3b0c4 : OpcodeHexagon {
+}
+class Enc_a255dc : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_cb4b4e : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9cdba7 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_5cd7e9 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_454a26 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_a6853f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <6> n1;
+ let Inst{29-29} = n1{5-5};
+ let Inst{26-25} = n1{4-3};
+ let Inst{23-22} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_c175d0 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_895bd9 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_ea23e4 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4dc228 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <10> II;
+ let Inst{20-16} = II{9-5};
+ let Inst{7-5} = II{4-2};
+ let Inst{1-0} = II{1-0};
+}
+class Enc_10bc21 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1aaec1 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_329361 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_d2c7f1 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
+}
+class Enc_3680c2 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{11-5} = Ii{6-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_1ef990 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_e957fb : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_c9e3bc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+}
+class Enc_2e1979 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_0b2e5b : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_d483b9 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_51635c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_e26546 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_70fb07 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_277737 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{22-21} = Ii{7-6};
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-5} = Ii{4-2};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_5c124a : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_928ca1 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_da664b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_7b7ba8 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_47ee5e : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{2-0} = Nt8{2-0};
+}
+class Enc_8bcba4 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_3a2484 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_a5ed8a : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_cb9321 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{27-21} = Ii{15-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_668704 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-22} = n1{3-0};
+}
+class Enc_a7341a : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_5eac98 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_02553a : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{11-5} = Ii{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_acd6ed : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{10-5} = Ii{8-3};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_8e583a : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_b886fd : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_24a7dc : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_2d829e : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_4f4ed7 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_84b2cd : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_8dbdfe : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_90cd8b : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_bd0b33 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_c7cd90 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_405228 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <3> n1;
+ let Inst{28-28} = n1{2-2};
+ let Inst{24-23} = n1{1-0};
+}
+class Enc_81ac1d : OpcodeHexagon {
+ bits <24> Ii;
+ let Inst{24-16} = Ii{23-15};
+ let Inst{13-1} = Ii{14-2};
+}
+class Enc_395cc4 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_a51a9a : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-8} = Ii{7-3};
+ let Inst{4-2} = Ii{2-0};
+}
+class Enc_d44e31 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_f77fbc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+}
+class Enc_d2216a : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_85bf58 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_71bb9b : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_52a5dd : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5e2823 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_28a2dc : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_5138b3 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_84d359 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{3-0} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_e07374 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_323f2d : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_1a9974 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_1de724 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-22} = n1{2-0};
+}
+class Enc_dd766a : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_0b51ce : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b4e6cf : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_44215c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_a21d47 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{10-5} = Ii{5-0};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_cc449f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_645d54 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_667b39 : OpcodeHexagon {
+ bits <5> Css32;
+ let Inst{20-16} = Css32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_927852 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_163a3c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_b087ac : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_b1e1fb : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_1f19b5 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{9-5} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_b8c967 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_fb6577 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2bae10 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_c4dc92 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_03833b : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_dbd70c : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_f6fe0b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{24-22} = n1{4-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_9e2e1c : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8df4be : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_66bce1 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{11-8} = Rd16{3-0};
+}
+class Enc_b8309d : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{8-3} = Ii{8-3};
+ bits <3> Rtt8;
+ let Inst{2-0} = Rtt8{2-0};
+}
+class Enc_5e8512 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_4f677b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_3d920a : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_e83554 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_ed48be : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{6-5} = Ii{1-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_f8c1c4 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1aa186 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_134437 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qt4;
+ let Inst{23-22} = Qt4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_97d666 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_f82eaf : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{10-5} = Ii{7-2};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_69d63b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_f79415 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_ce6828 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_800e04 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_ad1831 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_0fa531 : OpcodeHexagon {
+ bits <15> Ii;
+ let Inst{21-21} = Ii{14-14};
+ let Inst{13-13} = Ii{13-13};
+ let Inst{11-1} = Ii{12-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_7eaeb6 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_f55a0c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_f20719 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_eafd18 : OpcodeHexagon {
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_7b523d : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_47ef61 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_cc857d : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_7fa7f6 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_0f8bab : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
+}
+class Enc_7eb485 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_864a5a : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_c2b48e : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8c6530 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_448f7f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_da8d43 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-3} = Ii{4-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_a6ce9c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{3-0} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_eca7c8 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_4b39e4 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
new file mode 100644
index 0000000..30ebf89
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -0,0 +1,46012 @@
+//===--- HexagonDepInstrInfo.td -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License.  See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
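+// Reviewer note (annotation only; not part of the generated file): each
+// HInst record below pairs an instruction's operand lists and assembly
+// syntax with a scheduling class (tc_*), an instruction type (Type*),
+// and one of the Enc_* encoding classes added earlier in this patch; the
+// trailing "let" block fixes the opcode bits and sets per-instruction
+// flags such as isPredicable, isCommutable, and prefersSlot3.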
+def A2_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = abs($Rs32)",
+tc_94e6ffd9, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
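+// Worked example (annotation only): A2_abs above uses Enc_5e2823, which
+// places Rs32 in Inst{20-16} and Rd32 in Inst{4-0}; together with the
+// fixed fields Inst{31-21} = 0b10001100100 and Inst{13-5} = 0b000000100,
+// "r1 = abs(r2)" encodes Rs32 = 0b00010 into bits 20-16 and
+// Rd32 = 0b00001 into bits 4-0.  Bits 15-14 are left for the packet
+// parse bits, which are filled in when the packet is assembled.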
+def A2_absp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = abs($Rss32)",
+tc_94e6ffd9, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000100;
+let prefersSlot3 = 1;
+}
+def A2_abssat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = abs($Rs32):sat",
+tc_94e6ffd9, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_add : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_addh_h16_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.h):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_addh_h16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.l):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_addh_h16_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_addh_h16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_addh_h16_sat_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.h):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.l):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_addh_l16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h)",
+tc_7ca2ea10, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_addh_l16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l)",
+tc_7ca2ea10, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_addh_l16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):sat",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_addh_l16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):sat",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = add($Rs32,#$Ii)",
+tc_548f402d, TypeALU32_ADDI>, Enc_cb9321, PredNewRel, ImmRegRel {
+let Inst{31-28} = 0b1011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isPredicable = 1;
+let isAdd = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
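+// Reviewer note (annotation only): the isExtendable/opExtendable/
+// opExtentBits flags on A2_addi above mark operand 2 (the s32_0Imm $Ii)
+// as encodable inline only as a signed 16-bit value; larger immediates
+// require a constant-extender word in the same packet to supply the
+// upper bits.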
+def A2_addp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let isCommutable = 1;
+let isAdd = 1;
+}
+def A2_addpsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let isCommutable = 1;
+}
+def A2_addsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32):sat",
+tc_b0f50e3c, TypeALU32_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_addsp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rs32,$Rtt32)",
+tc_bd16579e, TypeALU64> {
+let isPseudo = 1;
+}
+def A2_addsph : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):raw:hi",
+tc_bd16579e, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_addspl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):raw:lo",
+tc_bd16579e, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_and : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = and($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_and";
+let InputType = "reg";
+let BaseOpcode = "A2_and";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_andir : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = and($Rs32,#$Ii)",
+tc_548f402d, TypeALU32_2op>, Enc_140c83, ImmRegRel {
+let Inst{31-22} = 0b0111011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_and";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_andp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = and($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_aslh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = aslh($Rs32)",
+tc_f16d5b17, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+let isPredicable = 1;
+}
+def A2_asrh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = asrh($Rs32)",
+tc_f16d5b17, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+let isPredicable = 1;
+}
+def A2_combine_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.h,$Rs32.h)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.h,$Rs32.l)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.l,$Rs32.h)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.l,$Rs32.l)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, s8_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+tc_548f402d, TypeALU32_2op>, Enc_18c338 {
+let Inst{31-23} = 0b011111000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_combinew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = combine($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_be32a5, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110101000;
+let InputType = "reg";
+let BaseOpcode = "A2_combinew";
+let isPredicable = 1;
+}
+def A2_max : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = max($Rs32,$Rt32)",
+tc_47ab9233, TypeALU64>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_maxp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = max($Rss32,$Rtt32)",
+tc_47ab9233, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_maxu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = maxu($Rs32,$Rt32)",
+tc_47ab9233, TypeALU64>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_maxup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = maxu($Rss32,$Rtt32)",
+tc_47ab9233, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_min : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = min($Rt32,$Rs32)",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_minp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = min($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_minu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = minu($Rt32,$Rs32)",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_minup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = minu($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_neg : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = neg($Rs32)",
+tc_f16d5b17, TypeALU32_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_negp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = neg($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_negsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = neg($Rs32):sat",
+tc_94e6ffd9, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_nop : HInst<
+(outs),
+(ins),
+"nop",
+tc_e2c31426, TypeALU32_2op>, Enc_e3b0c4 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b0111111100000000;
+}
+def A2_not : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = not($Rs32)",
+tc_f16d5b17, TypeALU32_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_notp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = not($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_or : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = or($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_or";
+let InputType = "reg";
+let BaseOpcode = "A2_or";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_orir : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = or($Rs32,#$Ii)",
+tc_548f402d, TypeALU32_2op>, Enc_140c83, ImmRegRel {
+let Inst{31-22} = 0b0111011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_or";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_orp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = or($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_paddf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = add($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = add($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if (!$Pu4) $Rd32 = add($Rs32,#$Ii)",
+tc_1b6011fb, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddifnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if (!$Pu4.new) $Rd32 = add($Rs32,#$Ii)",
+tc_28d296df, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-23} = 0b011101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if ($Pu4) $Rd32 = add($Rs32,#$Ii)",
+tc_1b6011fb, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011101000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_padditnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if ($Pu4.new) $Rd32 = add($Rs32,#$Ii)",
+tc_28d296df, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-23} = 0b011101000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = add($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = add($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
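+// Reviewer note (annotation only): across the register-form predicated
+// adds above (A2_paddt, A2_paddf, A2_paddtnew, A2_paddfnew), the
+// encodings differ only in two bits: Inst{7-7} selects the predicate
+// sense (0 = if ($Pu4), 1 = if (!$Pu4)) and Inst{13-13} selects the
+// dot-new form that tests a predicate produced in the same packet.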
+def A2_pandf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = and($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_and";
+}
+def A2_pandfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = and($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_and";
+}
+def A2_pandt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = and($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_and";
+}
+def A2_pandtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = and($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_and";
+}
+def A2_porf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = or($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_or";
+}
+def A2_porfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = or($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_or";
+}
+def A2_port : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = or($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_or";
+}
+def A2_portnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = or($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_or";
+}
+def A2_psubf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sub($Rt32,$Rs32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
+tc_28d296df, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sub($Rt32,$Rs32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
+tc_28d296df, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sub";
+}
+def A2_pxorf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = xor($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxorfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxort : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = xor($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxortnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_xor";
+}
+def A2_roundsat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = round($Rss32):sat",
+tc_94e6ffd9, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = sat($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satb($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sath : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sath($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satub($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satuh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satuh($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_sub";
+let InputType = "reg";
+let BaseOpcode = "A2_sub";
+let isPredicable = 1;
+}
+def A2_subh_h16_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.h):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_subh_h16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.l):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_subh_h16_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_subh_h16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):<<16",
+tc_bd16579e, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_subh_h16_sat_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.h):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.l):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):sat:<<16",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_subh_l16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h)",
+tc_7ca2ea10, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_subh_l16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l)",
+tc_7ca2ea10, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A2_subh_l16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):sat",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_subh_l16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):sat",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_subp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = sub($Rtt32,$Rss32)",
+tc_9c18c9a5, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_subri : HInst<
+(outs IntRegs:$Rd32),
+(ins s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = sub(#$Ii,$Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_140c83, PredNewRel, ImmRegRel {
+let Inst{31-22} = 0b0111011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_sub";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_subsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32):sat",
+tc_b0f50e3c, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_svaddh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vaddh($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svaddhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vaddh($Rs32,$Rt32):sat",
+tc_b0f50e3c, TypeALU32_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svadduhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vadduh($Rs32,$Rt32):sat",
+tc_b0f50e3c, TypeALU32_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svavgh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vavgh($Rs32,$Rt32)",
+tc_511f28f6, TypeALU32_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svavghs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vavgh($Rs32,$Rt32):rnd",
+tc_76c4c5ef, TypeALU32_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svnavgh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vnavgh($Rt32,$Rs32)",
+tc_511f28f6, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+}
+def A2_svsubh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubh($Rt32,$Rs32)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_svsubhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubh($Rt32,$Rs32):sat",
+tc_b0f50e3c, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_svsubuhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubuh($Rt32,$Rs32):sat",
+tc_b0f50e3c, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_swiz : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = swiz($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_sxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxtb($Rs32)",
+tc_f16d5b17, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+let isPredicable = 1;
+}
+def A2_sxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxth($Rs32)",
+tc_f16d5b17, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+let isPredicable = 1;
+}
+def A2_sxtw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = sxtw($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_3a3d62 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100010;
+}
+def A2_tfr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = $Rs32",
+tc_f16d5b17, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPredicable = 1;
+}
+def A2_tfrcrr : HInst<
+(outs IntRegs:$Rd32),
+(ins CtrRegs:$Cs32),
+"$Rd32 = $Cs32",
+tc_3b4892c6, TypeCR>, Enc_0cb018 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_tfrf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = $Rs32",
+tc_1b6011fb, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = $Rs32",
+tc_28d296df, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrih : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
+"$Rx32.h = #$Ii",
+tc_548f402d, TypeALU32_2op>, Enc_51436c {
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def A2_tfril : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
+"$Rx32.l = #$Ii",
+tc_548f402d, TypeALU32_2op>, Enc_51436c {
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def A2_tfrp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = $Rss32",
+tc_548f402d, TypeALU32_2op>, PredNewRel {
+let BaseOpcode = "A2_tfrp";
+let isPredicable = 1;
+let isPseudo = 1;
+}
+def A2_tfrpf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if (!$Pu4) $Rdd32 = $Rss32",
+tc_548f402d, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrpfnew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if (!$Pu4.new) $Rdd32 = $Rss32",
+tc_b08be45e, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrpi : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s8_0Imm:$Ii),
+"$Rdd32 = #$Ii",
+tc_548f402d, TypeALU64> {
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isPseudo = 1;
+}
+def A2_tfrpt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if ($Pu4) $Rdd32 = $Rss32",
+tc_548f402d, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrptnew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if ($Pu4.new) $Rdd32 = $Rss32",
+tc_b08be45e, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrrcr : HInst<
+(outs CtrRegs:$Cd32),
+(ins IntRegs:$Rs32),
+"$Cd32 = $Rs32",
+tc_82f0f122, TypeCR>, Enc_bd811a {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_tfrsi : HInst<
+(outs IntRegs:$Rd32),
+(ins s32_0Imm:$Ii),
+"$Rd32 = #$Ii",
+tc_f16d5b17, TypeALU32_2op>, Enc_5e87ce, PredNewRel, ImmRegRel {
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isPredicable = 1;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def A2_tfrt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = $Rs32",
+tc_1b6011fb, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = $Rs32",
+tc_28d296df, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
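+// A2_v*: SIMD operations on 64-bit register pairs (DoubleRegs), packed as
+// bytes (b), halfwords (h), or words (w). "u" denotes unsigned lanes, and
+// ":sat" forms saturate, setting the sticky USR.OVF flag (Defs = [USR_OVF]).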
+def A2_vabsh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsh($Rss32)",
+tc_94e6ffd9, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000010;
+let prefersSlot3 = 1;
+}
+def A2_vabshsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsh($Rss32):sat",
+tc_94e6ffd9, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vabsw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsw($Rss32)",
+tc_94e6ffd9, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000010;
+let prefersSlot3 = 1;
+}
+def A2_vabswsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsw($Rss32):sat",
+tc_94e6ffd9, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vaddb_map : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddb($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vaddh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddh($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddh($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vaddub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddub($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddubs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddub($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vadduhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vadduh($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vaddw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddw($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddws : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddw($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
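+// Vector averaging: plain vavg* forms truncate, ":rnd" forms round, and
+// ":crnd" forms use convergent (round-to-even) rounding.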
+def A2_vavgh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32)",
+tc_cd321066, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavghcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32):crnd",
+tc_63cd9d2d, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavghr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32):rnd",
+tc_37326008, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavgub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgub($Rss32,$Rtt32)",
+tc_cd321066, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavgubr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgub($Rss32,$Rtt32):rnd",
+tc_37326008, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavguh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguh($Rss32,$Rtt32)",
+tc_cd321066, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavguhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguh($Rss32,$Rtt32):rnd",
+tc_37326008, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavguw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguw($Rss32,$Rtt32)",
+tc_cd321066, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_vavguwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguw($Rss32,$Rtt32):rnd",
+tc_37326008, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_vavgw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32)",
+tc_cd321066, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_vavgwcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32):crnd",
+tc_63cd9d2d, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_vavgwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32):rnd",
+tc_37326008, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
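+// A2_vcmp*: lane-wise vector compares. Each lane's result is broadcast to
+// the bits of $Pd4 covering that lane's bytes (8 predicate bits in total).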
+def A2_vcmpbeq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.eq($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b110000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpbgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.gtu($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b111000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpheq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.eq($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmphgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.gt($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmphgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.gtu($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpweq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.eq($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpwgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.gt($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpwgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.gtu($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vconj : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vconj($Rss32):sat",
+tc_94e6ffd9, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vmaxb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxb($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_vmaxh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxh($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_vmaxub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxub($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_vmaxuh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxuh($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_vmaxuw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxuw($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_vmaxw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxw($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_vminb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminb($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+let prefersSlot3 = 1;
+}
+def A2_vminh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminh($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_vminub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminub($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_vminuh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminuh($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_vminuw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminuw($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_vminw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminw($Rtt32,$Rss32)",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+let prefersSlot3 = 1;
+}
+def A2_vnavgh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32)",
+tc_cd321066, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+}
+def A2_vnavghcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32):crnd:sat",
+tc_63cd9d2d, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavghr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32):rnd:sat",
+tc_63cd9d2d, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavgw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32)",
+tc_cd321066, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+}
+def A2_vnavgwcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32):crnd:sat",
+tc_63cd9d2d, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavgwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32):rnd:sat",
+tc_63cd9d2d, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vraddub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vraddub($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def A2_vraddub_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vraddub($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A2_vrsadub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrsadub($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def A2_vrsadub_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrsadub($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A2_vsubb_map : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vsubb($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vsubh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubh($Rtt32,$Rss32)",
+tc_9c18c9a5, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsubhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubh($Rtt32,$Rss32):sat",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vsubub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubub($Rtt32,$Rss32)",
+tc_9c18c9a5, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsububs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubub($Rtt32,$Rss32):sat",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vsubuhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubuh($Rtt32,$Rss32):sat",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vsubw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubw($Rtt32,$Rss32)",
+tc_9c18c9a5, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsubws : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubw($Rtt32,$Rss32):sat",
+tc_47ab9233, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_xor : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = xor($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_5ab2be, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let BaseOpcode = "A2_xor";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_xorp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = xor($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeALU64>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_zxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxtb($Rs32)",
+tc_548f402d, TypeALU32_2op>, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+let isPredicable = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_zxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxth($Rs32)",
+tc_f16d5b17, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+let isPredicable = 1;
+}
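+// A4_*: instructions first available in the Hexagon V4 architecture; the
+// digit in the mnemonic prefix records the ISA revision that introduced it.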
+def A4_addp_c : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
+"$Rdd32 = add($Rss32,$Rtt32,$Px4):carry",
+tc_a87879e8, TypeS_3op>, Enc_2b3f60 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010110;
+let isPredicateLate = 1;
+let Constraints = "$Px4 = $Px4in";
+}
+def A4_andn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = and($Rt32,~$Rs32)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A4_andnp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = and($Rtt32,~$Rss32)",
+tc_9c18c9a5, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+}
+def A4_bitsplit : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = bitsplit($Rs32,$Rt32)",
+tc_7ca2ea10, TypeALU64>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010100001;
+let prefersSlot3 = 1;
+}
+def A4_bitspliti : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rdd32 = bitsplit($Rs32,#$Ii)",
+tc_7ca2ea10, TypeS_2op>, Enc_311abd {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+let prefersSlot3 = 1;
+}
+def A4_boundscheck : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rs32,$Rtt32)",
+tc_c58f771a, TypeALU64> {
+let isPseudo = 1;
+}
+def A4_boundscheck_hi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rss32,$Rtt32):raw:hi",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_boundscheck_lo : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rss32,$Rtt32):raw:lo",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_cmpbeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.eq($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b110000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbeq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpbeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Pd4 = cmpb.eq($Rs32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_08d755, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101000;
+let CextOpcode = "A4_cmpbeq";
+let InputType = "imm";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpbgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.gt($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmpbgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s8_0Imm:$Ii),
+"$Pd4 = cmpb.gt($Rs32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_08d755, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101001;
+let CextOpcode = "A4_cmpbgt";
+let InputType = "imm";
+let isCompare = 1;
+}
+def A4_cmpbgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.gtu($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b111000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmpbgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmpb.gtu($Rs32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_02553a, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011101010;
+let CextOpcode = "A4_cmpbgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+}
+def A4_cmpheq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.eq($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpheq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpheqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmph.eq($Rs32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_08d755, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101000;
+let CextOpcode = "A4_cmpheq";
+let InputType = "imm";
+let isCommutable = 1;
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cmphgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.gt($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmphgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmphgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmph.gt($Rs32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_08d755, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101001;
+let CextOpcode = "A4_cmphgt";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cmphgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.gtu($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmphgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmphgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmph.gtu($Rs32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_02553a, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011101010;
+let CextOpcode = "A4_cmphgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+}
+def A4_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s8_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+tc_548f402d, TypeALU32_2op>, Enc_f0cca7 {
+let Inst{31-21} = 0b01111100100;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def A4_combineir : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rdd32 = combine(#$Ii,$Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_9cdba7 {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011001;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_combineri : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rdd32 = combine($Rs32,#$Ii)",
+tc_548f402d, TypeALU32_2op>, Enc_9cdba7 {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011000;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
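+// Scalar rounding by an immediate or register amount; "cround" is the
+// convergent-rounding form, and the ":sat" round variants below saturate.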
+def A4_cround_ri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = cround($Rs32,#$Ii)",
+tc_63cd9d2d, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_cround_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cround($Rs32,$Rt32)",
+tc_63cd9d2d, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
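+// A4_ext is the constant extender (immext): a separate word in the packet
+// that supplies the upper 26 bits of a 32-bit immediate for the next
+// instruction when it is marked isExtendable (opExtentBits gives the
+// unextended field width).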
+def A4_ext : HInst<
+(outs),
+(ins u26_6Imm:$Ii),
+"immext(#$Ii)",
+tc_9a13af9d, TypeEXTENDER>, Enc_2b518f {
+let Inst{31-28} = 0b0000;
+}
+def A4_modwrapu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = modwrap($Rs32,$Rt32)",
+tc_47ab9233, TypeALU64>, Enc_5ab2be {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_orn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = or($Rt32,~$Rs32)",
+tc_548f402d, TypeALU32_3op>, Enc_bd6011 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A4_ornp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = or($Rtt32,~$Rss32)",
+tc_9c18c9a5, TypeALU64>, Enc_ea23e4 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+}
+def A4_paslhf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = aslh($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslhfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = aslh($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = aslh($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslhtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = aslh($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_aslh";
+}
+def A4_pasrhf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = asrh($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrhfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = asrh($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = asrh($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrhtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = asrh($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_asrh";
+}
+def A4_psxtbf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sxtb($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sxtb($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sxtb($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sxtb($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxthf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sxth($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxthfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sxth($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxtht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sxth($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxthtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sxth($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxth";
+}
+def A4_pzxtbf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = zxtb($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = zxtb($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = zxtb($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = zxtb($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxthf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = zxth($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxthfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = zxth($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxtht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = zxth($Rs32)",
+tc_548f402d, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxthtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = zxth($Rs32)",
+tc_b08be45e, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxth";
+}
+def A4_rcmpeq : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmp.eq($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeq";
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A4_rcmpeqi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = cmp.eq($Rs32,#$Ii)",
+tc_548f402d, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeqi";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_rcmpneq : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = !cmp.eq($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpneq";
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A4_rcmpneqi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = !cmp.eq($Rs32,#$Ii)",
+tc_548f402d, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeqi";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_round_ri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = round($Rs32,#$Ii)",
+tc_63cd9d2d, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_round_ri_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = round($Rs32,#$Ii):sat",
+tc_63cd9d2d, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A4_round_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = round($Rs32,$Rt32)",
+tc_63cd9d2d, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_round_rr_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = round($Rs32,$Rt32):sat",
+tc_63cd9d2d, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A4_subp_c : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
+"$Rdd32 = sub($Rss32,$Rtt32,$Px4):carry",
+tc_a87879e8, TypeS_3op>, Enc_2b3f60 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010111;
+let isPredicateLate = 1;
+let Constraints = "$Px4 = $Px4in";
+}
+def A4_tfrcpp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins CtrRegs64:$Css32),
+"$Rdd32 = $Css32",
+tc_3b4892c6, TypeCR>, Enc_667b39 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101000000;
+}
+def A4_tfrpcp : HInst<
+(outs CtrRegs64:$Cdd32),
+(ins DoubleRegs:$Rss32),
+"$Cdd32 = $Rss32",
+tc_82f0f122, TypeCR>, Enc_0ed752 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100011001;
+}
+def A4_tlbmatch : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Pd4 = tlbmatch($Rss32,$Rt32)",
+tc_e2c08bb4, TypeALU64>, Enc_03833b {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+let isPredicateLate = 1;
+}
+def A4_vcmpbeq_any : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = any8(vcmpb.eq($Rss32,$Rtt32))",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_vcmpbeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u8_0Imm:$Ii),
+"$Pd4 = vcmpb.eq($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_0d8adb {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmpbgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.gt($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_vcmpbgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpb.gt($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_0d8adb {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmpbgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmpb.gtu($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_3680c2 {
+let Inst{4-2} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vcmpheqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmph.eq($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_0d8adb {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmphgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmph.gt($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_0d8adb {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmphgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmph.gtu($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_3680c2 {
+let Inst{4-2} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vcmpweqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpw.eq($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_0d8adb {
+let Inst{4-2} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmpwgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpw.gt($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_0d8adb {
+let Inst{4-2} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmpwgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmpw.gtu($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_3680c2 {
+let Inst{4-2} = 0b100;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vrmaxh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxh($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxuh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxuh($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxuw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxuw($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxw($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminh($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminuh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminuh($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminuw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminuw($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminw($Rss32,$Ru32)",
+tc_2aaab1e0, TypeS_3op>, Enc_412ff0 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
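+// A5_*/A6_*: later-revision instructions, gated by the Requires<...>
+// subtarget predicates (HasV5T, HasV55T, HasV62T) on the defs below.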
+def A5_ACS : HInst<
+(outs DoubleRegs:$Rxx32, PredRegs:$Pe4),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)",
+tc_ae0722f7, TypeM>, Enc_831a7d, Requires<[HasV55T]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A5_vaddhubs : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vaddhub($Rss32,$Rtt32):sat",
+tc_63cd9d2d, TypeS_3op>, Enc_d2216a, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A6_vminub_RdP : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Pe4),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)",
+tc_583510c7, TypeM>, Enc_d2c7f1, Requires<[HasV62T]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+}
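+// C2_*: predicate manipulation (TypeCR logical ops on PredRegs, e.g.
+// all8/any8/and) plus scalar compares that produce predicate results
+// (isCompare).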
+def C2_all8 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = all8($Ps4)",
+tc_81a23d44, TypeCR>, Enc_65d691 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011101000;
+}
+def C2_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = and($Pt4,$Ps4)",
+tc_d63b71d1, TypeCR>, Enc_454a26 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011000000;
+}
+def C2_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = and($Pt4,!$Ps4)",
+tc_d63b71d1, TypeCR>, Enc_454a26 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011011000;
+}
+def C2_any8 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = any8($Ps4)",
+tc_81a23d44, TypeCR>, Enc_65d691 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011100000;
+}
+def C2_bitsclr : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = bitsclr($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111100;
+}
+def C2_bitsclri : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii),
+"$Pd4 = bitsclr($Rs32,#$Ii)",
+tc_5fa2857c, TypeS_2op>, Enc_5d6c34 {
+let Inst{7-2} = 0b000000;
+let Inst{31-21} = 0b10000101100;
+}
+def C2_bitsset : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = bitsset($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111010;
+}
+def C2_ccombinewf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rdd32 = combine($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewnewf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewnewt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
+tc_28d296df, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rdd32 = combine($Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_cmoveif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4) $Rd32 = #$Ii",
+tc_548f402d, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmoveit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4) $Rd32 = #$Ii",
+tc_548f402d, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmovenewif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4.new) $Rd32 = #$Ii",
+tc_b08be45e, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmovenewit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4.new) $Rd32 = #$Ii",
+tc_b08be45e, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmpeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.eq($Rs32,$Rt32)",
+tc_5fe9fcd0, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010000;
+let CextOpcode = "C2_cmpeq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C2_cmpeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmp.eq($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-22} = 0b0111010100;
+let CextOpcode = "C2_cmpeq";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C2_cmpeqp : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.eq($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C2_cmpgei : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s8_0Imm:$Ii),
+"$Pd4 = cmp.ge($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op> {
+let isCompare = 1;
+let isPseudo = 1;
+}
+def C2_cmpgeui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Pd4 = cmp.geu($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op> {
+let isCompare = 1;
+let isPseudo = 1;
+}
+def C2_cmpgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.gt($Rs32,$Rt32)",
+tc_5fe9fcd0, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010010;
+let CextOpcode = "C2_cmpgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C2_cmpgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmp.gt($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-22} = 0b0111010101;
+let CextOpcode = "C2_cmpgt";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C2_cmpgtp : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.gt($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCompare = 1;
+}
+def C2_cmpgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.gtu($Rs32,$Rt32)",
+tc_5fe9fcd0, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010011;
+let CextOpcode = "C2_cmpgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C2_cmpgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmp.gtu($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-21} = 0b01110101100;
+let CextOpcode = "C2_cmpgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def C2_cmpgtup : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.gtu($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCompare = 1;
+}
+def C2_cmplt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.lt($Rs32,$Rt32)",
+tc_9df8b0dc, TypeALU32_3op> {
+let isCompare = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_cmpltu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.ltu($Rs32,$Rt32)",
+tc_9df8b0dc, TypeALU32_3op> {
+let isCompare = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_mask : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4),
+"$Rdd32 = mask($Pt4)",
+tc_b86c7e8b, TypeS_2op>, Enc_78e566 {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b1000011000000000;
+}
+def C2_mux : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mux($Pu4,$Rs32,$Rt32)",
+tc_1b6011fb, TypeALU32_3op>, Enc_ea4c54 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def C2_muxii : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii, s8_0Imm:$II),
+"$Rd32 = mux($Pu4,#$Ii,#$II)",
+tc_1b6011fb, TypeALU32_2op>, Enc_830e5d {
+let Inst{31-25} = 0b0111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_muxir : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = mux($Pu4,$Rs32,#$Ii)",
+tc_1b6011fb, TypeALU32_2op>, Enc_e38e1f {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_muxri : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = mux($Pu4,#$Ii,$Rs32)",
+tc_1b6011fb, TypeALU32_2op>, Enc_e38e1f {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_not : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = not($Ps4)",
+tc_81a23d44, TypeCR>, Enc_65d691 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011110000;
+}
+def C2_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = or($Pt4,$Ps4)",
+tc_d63b71d1, TypeCR>, Enc_454a26 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011001000;
+}
+def C2_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = or($Pt4,!$Ps4)",
+tc_d63b71d1, TypeCR>, Enc_454a26 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011111000;
+}
+def C2_pxfer_map : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = $Ps4",
+tc_d63b71d1, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_tfrpr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Ps4),
+"$Rd32 = $Ps4",
+tc_b86c7e8b, TypeS_2op>, Enc_f5e933 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-18} = 0b10001001010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def C2_tfrrp : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32),
+"$Pd4 = $Rs32",
+tc_47f0b7ad, TypeS_2op>, Enc_48b75f {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-21} = 0b10000101010;
+}
+def C2_vitpack : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Rd32 = vitpack($Ps4,$Pt4)",
+tc_7ca2ea10, TypeS_2op>, Enc_527412 {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b10001001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def C2_vmux : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmux($Pu4,$Rss32,$Rtt32)",
+tc_d1b5a4b6, TypeALU64>, Enc_329361 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010001000;
+}
+def C2_xor : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = xor($Ps4,$Pt4)",
+tc_d63b71d1, TypeCR>, Enc_284ebb {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011010000;
+}
+def C4_addipc : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = add(pc,#$Ii)",
+tc_1fe8323c, TypeCR>, Enc_607661 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0110101001001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def C4_and_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,and($Pt4,$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011000100;
+}
+def C4_and_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,and($Pt4,!$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011100100;
+}
+def C4_and_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,or($Pt4,$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011001100;
+}
+def C4_and_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,or($Pt4,!$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011101100;
+}
+def C4_cmplte : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.gt($Rs32,$Rt32)",
+tc_5fe9fcd0, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010010;
+let CextOpcode = "C4_cmplte";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C4_cmpltei : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = !cmp.gt($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-22} = 0b0111010101;
+let CextOpcode = "C4_cmplte";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C4_cmplteu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.gtu($Rs32,$Rt32)",
+tc_5fe9fcd0, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010011;
+let CextOpcode = "C4_cmplteu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C4_cmplteui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = !cmp.gtu($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-21} = 0b01110101100;
+let CextOpcode = "C4_cmplteu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def C4_cmpneq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.eq($Rs32,$Rt32)",
+tc_5fe9fcd0, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010000;
+let CextOpcode = "C4_cmpneq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C4_cmpneqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = !cmp.eq($Rs32,#$Ii)",
+tc_9df8b0dc, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-22} = 0b0111010100;
+let CextOpcode = "C4_cmpneq";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C4_fastcorner9 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = fastcorner9($Ps4,$Pt4)",
+tc_d63b71d1, TypeCR>, Enc_284ebb {
+let Inst{7-2} = 0b100100;
+let Inst{13-10} = 0b1000;
+let Inst{31-18} = 0b01101011000000;
+}
+def C4_fastcorner9_not : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = !fastcorner9($Ps4,$Pt4)",
+tc_d63b71d1, TypeCR>, Enc_284ebb {
+let Inst{7-2} = 0b100100;
+let Inst{13-10} = 0b1000;
+let Inst{31-18} = 0b01101011000100;
+}
+def C4_nbitsclr : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !bitsclr($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111101;
+}
+def C4_nbitsclri : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii),
+"$Pd4 = !bitsclr($Rs32,#$Ii)",
+tc_5fa2857c, TypeS_2op>, Enc_5d6c34 {
+let Inst{7-2} = 0b000000;
+let Inst{31-21} = 0b10000101101;
+}
+def C4_nbitsset : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !bitsset($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111011;
+}
+def C4_or_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,and($Pt4,$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011010100;
+}
+def C4_or_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,and($Pt4,!$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011110100;
+}
+def C4_or_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,or($Pt4,$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011011100;
+}
+def C4_or_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,or($Pt4,!$Pu4))",
+tc_43068634, TypeCR>, Enc_9ac432 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011111100;
+}
+def F2_conv_d2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_d2df($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_d2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_d2sf($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2d : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2d($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2d_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2d($Rss32):chop",
+tc_e836c161, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2sf($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2ud : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2ud($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2ud_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2ud($Rss32):chop",
+tc_e836c161, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2uw : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2uw($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2uw_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2uw($Rss32):chop",
+tc_e836c161, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2w : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2w($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_df2w_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2w($Rss32):chop",
+tc_e836c161, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2d : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2d($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2d_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2d($Rs32):chop",
+tc_e836c161, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2df($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2ud : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2ud($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2ud_chop : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2ud($Rs32):chop",
+tc_e836c161, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2uw : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2uw($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2uw_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2uw($Rs32):chop",
+tc_e836c161, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2w : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2w($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_sf2w_chop : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2w($Rs32):chop",
+tc_e836c161, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_ud2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_ud2df($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_b9c5fb, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_ud2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_ud2sf($Rss32)",
+tc_e836c161, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_uw2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_uw2df($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_uw2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_uw2sf($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_w2df : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_w2df($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_3a3d62, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_conv_w2sf : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_w2sf($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_dfclass : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Pd4 = dfclass($Rss32,#$Ii)",
+tc_5fa2857c, TypeALU64>, Enc_1f19b5, Requires<[HasV5T]> {
+let Inst{4-2} = 0b100;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b11011100100;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_dfcmpeq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfcmpge : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfcmpgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfcmpuo : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
+tc_c58f771a, TypeALU64>, Enc_fcf7a7, Requires<[HasV5T]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_dfimm_n : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u10_0Imm:$Ii),
+"$Rdd32 = dfmake(#$Ii):neg",
+tc_485bb57c, TypeALU64>, Enc_e6c957, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101100101;
+let prefersSlot3 = 1;
+}
+def F2_dfimm_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u10_0Imm:$Ii),
+"$Rdd32 = dfmake(#$Ii):pos",
+tc_485bb57c, TypeALU64>, Enc_e6c957, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101100100;
+let prefersSlot3 = 1;
+}
+def F2_sfadd : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfadd($Rs32,$Rt32)",
+tc_3bea1824, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+let isCommutable = 1;
+}
+def F2_sfclass : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = sfclass($Rs32,#$Ii)",
+tc_5fa2857c, TypeS_2op>, Enc_83ee64, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101111;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_sfcmpeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.eq($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sfcmpge : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.ge($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sfcmpgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.gt($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sfcmpuo : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.uo($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e, Requires<[HasV5T]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];
+let isCompare = 1;
+}
+def F2_sffixupd : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sffixupd($Rs32,$Rt32)",
+tc_3bea1824, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+}
+def F2_sffixupn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sffixupn($Rs32,$Rt32)",
+tc_3bea1824, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+}
+def F2_sffixupr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sffixupr($Rs32)",
+tc_e836c161, TypeS_2op>, Enc_5e2823, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+}
+def F2_sffma : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += sfmpy($Rs32,$Rt32)",
+tc_2d1e6f5c, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffma_lib : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += sfmpy($Rs32,$Rt32):lib",
+tc_2d1e6f5c, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffma_sc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
+"$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
+tc_2e55aa16, TypeM>, Enc_437f33, Requires<[HasV5T]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffms : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= sfmpy($Rs32,$Rt32)",
+tc_2d1e6f5c, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sffms_lib : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= sfmpy($Rs32,$Rt32):lib",
+tc_2d1e6f5c, TypeM>, Enc_2ae154, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def F2_sfimm_n : HInst<
+(outs IntRegs:$Rd32),
+(ins u10_0Imm:$Ii),
+"$Rd32 = sfmake(#$Ii):neg",
+tc_485bb57c, TypeALU64>, Enc_6c9440, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def F2_sfimm_p : HInst<
+(outs IntRegs:$Rd32),
+(ins u10_0Imm:$Ii),
+"$Rd32 = sfmake(#$Ii):pos",
+tc_485bb57c, TypeALU64>, Enc_6c9440, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def F2_sfinvsqrta : HInst<
+(outs IntRegs:$Rd32, PredRegs:$Pe4),
+(ins IntRegs:$Rs32),
+"$Rd32,$Pe4 = sfinvsqrta($Rs32)",
+tc_f1aa2cdb, TypeS_2op>, Enc_890909, Requires<[HasV5T]> {
+let Inst{13-7} = 0b0000000;
+let Inst{31-21} = 0b10001011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let isPredicateLate = 1;
+}
+def F2_sfmax : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmax($Rs32,$Rt32)",
+tc_f1240c08, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_sfmin : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmin($Rs32,$Rt32)",
+tc_f1240c08, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_sfmpy : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmpy($Rs32,$Rt32)",
+tc_3bea1824, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+let isCommutable = 1;
+}
+def F2_sfrecipa : HInst<
+(outs IntRegs:$Rd32, PredRegs:$Pe4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
+tc_09c86199, TypeM>, Enc_a94f3b, Requires<[HasV5T]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let isPredicateLate = 1;
+}
+def F2_sfsub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfsub($Rs32,$Rt32)",
+tc_3bea1824, TypeM>, Enc_5ab2be, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isFP = 1;
+let Uses = [USR];
+}
+def J2_call : HInst<
+(outs),
+(ins a30_2Imm:$Ii),
+"call $Ii",
+tc_639d93ee, TypeJ>, Enc_81ac1d, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{31-25} = 0b0101101;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let BaseOpcode = "J2_call";
+let isPredicable = 1;
+let hasSideEffects = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 24;
+let opExtentAlign = 2;
+}
+def J2_callf : HInst<
+(outs),
+(ins PredRegs:$Pu4, a30_2Imm:$Ii),
+"if (!$Pu4) call $Ii",
+tc_0767081f, TypeJ>, Enc_daea09, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let BaseOpcode = "J2_call";
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_callr : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"callr $Rs32",
+tc_ecfaae86, TypeJ>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010000101;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let hasSideEffects = 1;
+}
+def J2_callrf : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) callr $Rs32",
+tc_84630363, TypeJ>, Enc_88d4d9 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+}
+def J2_callrt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) callr $Rs32",
+tc_84630363, TypeJ>, Enc_88d4d9 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010001000;
+let isPredicated = 1;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+}
+def J2_callt : HInst<
+(outs),
+(ins PredRegs:$Pu4, a30_2Imm:$Ii),
+"if ($Pu4) call $Ii",
+tc_0767081f, TypeJ>, Enc_daea09, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011101;
+let isPredicated = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];
+let BaseOpcode = "J2_call";
+let hasSideEffects = 1;
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_endloop0 : HInst<
+(outs),
+(ins),
+"endloop0",
+tc_aad55963, TypeJ> {
+let Uses = [LC0, SA0];
+let Defs = [LC0, P3, PC, USR];
+let isBranch = 1;
+let isTerminator = 1;
+let isPseudo = 1;
+}
+def J2_endloop01 : HInst<
+(outs),
+(ins),
+"endloop01",
+tc_aad55963, TypeJ> {
+let Uses = [LC0, LC1, SA0, SA1];
+let Defs = [LC0, LC1, P3, PC, USR];
+let isPseudo = 1;
+}
+def J2_endloop1 : HInst<
+(outs),
+(ins),
+"endloop1",
+tc_aad55963, TypeJ> {
+let Uses = [LC1, SA1];
+let Defs = [LC1, PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPseudo = 1;
+}
+def J2_jump : HInst<
+(outs),
+(ins b30_2Imm:$Ii),
+"jump $Ii",
+tc_a333d2a9, TypeJ>, Enc_81ac1d, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{31-25} = 0b0101100;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isBarrier = 1;
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 24;
+let opExtentAlign = 2;
+}
+def J2_jumpf : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4) jump:nt $Ii",
+tc_1b834fe7, TypeJ>, Enc_daea09, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpf_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, b15_2Imm:$Ii),
+"if (!$Pu4) jump $Ii",
+tc_1b834fe7, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumpfnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4.new) jump:nt $Ii",
+tc_537e2013, TypeJ>, Enc_daea09, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b010;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpfnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4.new) jump:t $Ii",
+tc_537e2013, TypeJ>, Enc_daea09, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b110;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpfpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4) jump:t $Ii",
+tc_b5bfaa60, TypeJ>, Enc_daea09, Requires<[HasV60T]>, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b100;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpr : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"jumpr $Rs32",
+tc_b08b653e, TypeJ>, Enc_ecbcc8, PredNewRel {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010010100;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isBarrier = 1;
+let isPredicable = 1;
+}
+def J2_jumprf : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr:nt $Rs32",
+tc_07ac815d, TypeJ>, Enc_88d4d9, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprf_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr $Rs32",
+tc_07ac815d, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumprfnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) jumpr:nt $Rs32",
+tc_1f9668cc, TypeJ>, Enc_88d4d9, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0010;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprfnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) jumpr:t $Rs32",
+tc_1f9668cc, TypeJ>, Enc_88d4d9, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0110;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprfpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr:t $Rs32",
+tc_a1fb80e1, TypeJ>, Enc_88d4d9, Requires<[HasV60T]>, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0100;
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprgtez : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32>=#0) jump:nt $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000101;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprgtezpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32>=#0) jump:t $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000101;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprltez : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32<=#0) jump:nt $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000111;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprltezpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32<=#0) jump:t $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000111;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprnz : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32==#0) jump:nt $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprnzpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32==#0) jump:t $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr:nt $Rs32",
+tc_07ac815d, TypeJ>, Enc_88d4d9, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprt_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr $Rs32",
+tc_07ac815d, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumprtnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) jumpr:nt $Rs32",
+tc_1f9668cc, TypeJ>, Enc_88d4d9, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0010;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprtnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) jumpr:t $Rs32",
+tc_1f9668cc, TypeJ>, Enc_88d4d9, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0110;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprtpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr:t $Rs32",
+tc_a1fb80e1, TypeJ>, Enc_88d4d9, Requires<[HasV60T]>, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0100;
+let Inst{31-21} = 0b01010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};
+}
+def J2_jumprz : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32!=#0) jump:nt $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumprzpt : HInst<
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32!=#0) jump:t $Ii",
+tc_b324366f, TypeCR>, Enc_0fa531 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};
+}
+def J2_jumpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4) jump:nt $Ii",
+tc_1b834fe7, TypeJ>, Enc_daea09, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpt_nopred_map : HInst<
+(outs),
+(ins PredRegs:$Pu4, b15_2Imm:$Ii),
+"if ($Pu4) jump $Ii",
+tc_1b834fe7, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumptnew : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4.new) jump:nt $Ii",
+tc_537e2013, TypeJ>, Enc_daea09, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b010;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumptnewpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4.new) jump:t $Ii",
+tc_537e2013, TypeJ>, Enc_daea09, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b110;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumptpt : HInst<
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4) jump:t $Ii",
+tc_b5bfaa60, TypeJ>, Enc_daea09, Requires<[HasV60T]>, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b100;
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_loop0i : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"loop0($Ii,#$II)",
+tc_1000eb10, TypeCR>, Enc_4dc228 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001000;
+let Defs = [LC0, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_loop0r : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"loop0($Ii,$Rs32)",
+tc_f055fbb6, TypeCR>, Enc_864a5a {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000000;
+let Defs = [LC0, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_loop1i : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"loop1($Ii,#$II)",
+tc_1000eb10, TypeCR>, Enc_4dc228 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001001;
+let Defs = [LC1, SA1];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_loop1r : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"loop1($Ii,$Rs32)",
+tc_f055fbb6, TypeCR>, Enc_864a5a {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000001;
+let Defs = [LC1, SA1];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_pause : HInst<
+(outs),
+(ins u8_0Imm:$Ii),
+"pause(#$Ii)",
+tc_b189ad4c, TypeJ>, Enc_a51a9a {
+let Inst{1-0} = 0b00;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0101010001000000;
+let isSolo = 1;
+}
+def J2_ploop1si : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp1loop0($Ii,#$II)",
+tc_feb4974b, TypeCR>, Enc_4dc228 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001101;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop1sr : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp1loop0($Ii,$Rs32)",
+tc_d6a805a8, TypeCR>, Enc_864a5a {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000101;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop2si : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp2loop0($Ii,#$II)",
+tc_feb4974b, TypeCR>, Enc_4dc228 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001110;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop2sr : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp2loop0($Ii,$Rs32)",
+tc_d6a805a8, TypeCR>, Enc_864a5a {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000110;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop3si : HInst<
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp3loop0($Ii,#$II)",
+tc_feb4974b, TypeCR>, Enc_4dc228 {
+let Inst{2-2} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01101001111;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_ploop3sr : HInst<
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp3loop0($Ii,$Rs32)",
+tc_d6a805a8, TypeCR>, Enc_864a5a {
+let Inst{2-0} = 0b000;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100000111;
+let isPredicateLate = 1;
+let Defs = [LC0, P3, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+}
+def J2_trap0 : HInst<
+(outs),
+(ins u8_0Imm:$Ii),
+"trap0(#$Ii)",
+tc_cbe45117, TypeJ>, Enc_a51a9a {
+let Inst{1-0} = 0b00;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0101010000000000;
+let isSolo = 1;
+}
+def J4_cmpeq_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeq_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeq_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$II)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$II)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqi";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqi_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqi_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_e90a15, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_5a18b3, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_1de724, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14640c, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_668704, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_800e04, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_4aca3a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_f7ea77, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpeqn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpeqn1_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_405228, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_3a2484, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpeqn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_736575, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpeqn1_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_8e583a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgt_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$II)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$II)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgti_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_3694bd, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_a6853f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_a42857, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_f6fe0b, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_3e3989, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_b909d2, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_f82302, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_6413b6, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtn1_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_b78edd, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_041d7b, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_b1e1fb, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_178717, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
+tc_580a779c, TypeNCJ>, Enc_c9a18e, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtu_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+tc_92d1833c, TypeCJ>, Enc_6a5972, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11;
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
+tc_09faec3b, TypeNCJ>, Enc_eafd18, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_cmpgtui_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:nt $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:t $Ii",
+tc_d108a090, TypeCJ>, Enc_14d27a, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmplt_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmplt_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmplt_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmplt_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_cmpltu_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
+tc_3e61d314, TypeNCJ>, Enc_5de85f, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def J4_hintjumpr : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"hintjr($Rs32)",
+tc_b08b653e, TypeJ>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010010101;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+}
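+// Compound transfer-and-jump: jumpseti/jumpsetr fold an immediate or
+// register move into $Rd16 together with an unconditional jump into a
+// single CJ-type instruction.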
+def J4_jumpseti : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u6_0Imm:$II, b30_2Imm:$Ii),
+"$Rd16 = #$II ; jump $Ii",
+tc_1e062b18, TypeCJ>, Enc_9e4c3f {
+let Inst{0-0} = 0b0;
+let Inst{31-22} = 0b0001011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_jumpsetr : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"$Rd16 = $Rs16 ; jump $Ii",
+tc_1e062b18, TypeCJ>, Enc_66bce1 {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
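+// tstbit0 compound branches: the jumpnv forms test bit 0 of a new-value
+// register directly, while the p0/p1 forms pair a tstbit producing the
+// predicate with a dot-new conditional jump (Uses/Defs carry P0 or P1).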
+def J4_tstbit0_f_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!tstbit($Ns8.new,#0)) jump:nt $Ii",
+tc_dbe218dd, TypeNCJ>, Enc_69d63b {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_f_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!tstbit($Ns8.new,#0)) jump:t $Ii",
+tc_dbe218dd, TypeNCJ>, Enc_69d63b {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_fp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (!p0.new) jump:nt $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (!p0.new) jump:t $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (!p1.new) jump:nt $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (!p1.new) jump:t $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_t_jumpnv_nt : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (tstbit($Ns8.new,#0)) jump:nt $Ii",
+tc_dbe218dd, TypeNCJ>, Enc_69d63b {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_t_jumpnv_t : HInst<
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (tstbit($Ns8.new,#0)) jump:t $Ii",
+tc_dbe218dd, TypeNCJ>, Enc_69d63b {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0;
+}
+def J4_tstbit0_tp0_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (p0.new) jump:nt $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp0_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (p0.new) jump:t $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp1_jump_nt : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (p1.new) jump:nt $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp1_jump_t : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (p1.new) jump:t $Ii",
+tc_eb07ef6f, TypeCJ>, Enc_ad1c74 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC];
+let isTaken = Inst{13};
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
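+// L2 loads. The suffix names the addressing mode of each def below:
+// _io base+immediate offset (BaseImmOffset), _pi/_pr post-increment by
+// immediate or modifier register, _pbr post-increment bit-reversed
+// (:brev), _pci/_pcr post-increment circular (:circ) by immediate or by
+// I, and _zomap a zero-offset pseudo (isPseudo, TypeMAPPING) that maps
+// onto the _io form.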
+def L2_deallocframe : HInst<
+(outs),
+(ins),
+"deallocframe",
+tc_c1dbc916, TypeLD>, Enc_3a3d62 {
+let Inst{4-0} = 0b11110;
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010000000;
+let Inst{20-16} = 0b11110;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [R29, R30, R31];
+}
+def L2_loadalignb_io : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Ryy32 = memb_fifo($Rs32+#$Ii)",
+tc_14da557c, TypeLD>, Enc_a27588 {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L2_loadalignb_pbr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++$Mu2:brev)",
+tc_ae762521, TypeLD>, Enc_1f5d8f {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110100;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pci : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++#$Ii:circ($Mu2))",
+tc_d2a33af5, TypeLD>, Enc_74aef2 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pcr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++I:circ($Mu2))",
+tc_ae762521, TypeLD>, Enc_1f5d8f {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pi : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Ryy32 = memb_fifo($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_6b197f {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_pr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++$Mu2)",
+tc_ae762521, TypeLD>, Enc_1f5d8f {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignb_zomap : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
+"$Ryy32 = memb_fifo($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L2_loadalignh_io : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Ryy32 = memh_fifo($Rs32+#$Ii)",
+tc_14da557c, TypeLD>, Enc_5cd7e9 {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L2_loadalignh_pbr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++$Mu2:brev)",
+tc_ae762521, TypeLD>, Enc_1f5d8f {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110010;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pci : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++#$Ii:circ($Mu2))",
+tc_d2a33af5, TypeLD>, Enc_9e2e1c {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pcr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++I:circ($Mu2))",
+tc_ae762521, TypeLD>, Enc_1f5d8f {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pi : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Ryy32 = memh_fifo($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_bd1cbc {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_pr : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++$Mu2)",
+tc_ae762521, TypeLD>, Enc_1f5d8f {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in";
+}
+def L2_loadalignh_zomap : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
+"$Ryy32 = memh_fifo($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
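+// Widening byte loads: membh (loadbsw2/loadbsw4) sign-extends each
+// loaded byte into a halfword of the destination, filling a 32-bit Rd
+// or 64-bit Rdd; memubh (loadbzw2/loadbzw4 below) zero-extends instead.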
+def L2_loadbsw2_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = membh($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_de0214 {
+let Inst{24-21} = 0b0001;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadbsw2_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_e83554 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = membh($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_152467 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw2_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = membh($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadbsw4_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rdd32 = membh($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_2d7491 {
+let Inst{24-21} = 0b0111;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def L2_loadbsw4_pbr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110111;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pci : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_70b24b {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pcr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rdd32 = membh($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_71f1b4 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_pr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbsw4_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = membh($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadbzw2_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memubh($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_de0214 {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadbzw2_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_e83554 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memubh($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_152467 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw2_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memubh($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadbzw4_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rdd32 = memubh($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_2d7491 {
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def L2_loadbzw4_pbr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110101;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pci : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_70b24b {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pcr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rdd32 = memubh($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_71f1b4 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_pr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadbzw4_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memubh($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrb_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memb($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_211aaa, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def L2_loadrb_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_e0a47a {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Rd32 = memb($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_222336, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrb_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memb($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrbgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memb(gp+#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrb_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def L2_loadrd_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii),
+"$Rdd32 = memd($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_fa3ba4, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+}
+def L2_loadrd_pbr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111110;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pci : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_b05839 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pcr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii),
+"$Rdd32 = memd($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_5bdd42, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_pr : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_7eee72 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrd_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memd($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrdgp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u29_3Imm:$Ii),
+"$Rdd32 = memd(gp+#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_509701, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b01001;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrd_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def L2_loadrh_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memh($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_de0214, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadrh_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_e83554 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memh($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_152467, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrh_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memh($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrhgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memh(gp+#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrh_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def L2_loadri_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rd32 = memw($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_2a3787, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def L2_loadri_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_27fd0e {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rd32 = memw($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_3d920a, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadri_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memw($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrigp : HInst<
+(outs IntRegs:$Rd32),
+(ins u30_2Imm:$Ii),
+"$Rd32 = memw(gp+#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadri_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def L2_loadrub_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memub($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_211aaa, AddrModeRel {
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def L2_loadrub_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_e0a47a {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Rd32 = memub($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_222336, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadrub_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memub($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadrubgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memub(gp+#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrub_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def L2_loadruh_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memuh($Rs32+#$Ii)",
+tc_bf6fa601, TypeLD>, Enc_de0214, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadruh_pbr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++$Mu2:brev)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pci : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++#$Ii:circ($Mu2))",
+tc_3eab77bd, TypeLD>, Enc_e83554 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pcr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++I:circ($Mu2))",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memuh($Rx32++#$Ii)",
+tc_65dc7cc4, TypeLD>, Enc_152467, PredNewRel {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_pr : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++$Mu2)",
+tc_65dc7cc4, TypeLD>, Enc_74d4e5 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_loadruh_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memuh($Rs32)",
+tc_bf6fa601, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_loadruhgp : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memuh(gp+#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadruh_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def L2_loadw_locked : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memw_locked($Rs32)",
+tc_29c14515, TypeLD>, Enc_5e2823 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isSoloAX = 1;
+}
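+// Predicated loads: ploadr*t_* executes when $Pt4 is true, ploadr*f_*
+// when it is false (isPredicatedFalse); the *tnew/*fnew variants test
+// the dot-new predicate $Pt4.new (isPredicatedNew). Only the _io, _pi,
+// and _zomap addressing modes have predicated forms here.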
+def L2_ploadrbf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memb($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrbfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memb($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrbt_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbt_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbt_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memb($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrbtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrbtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrbtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memb($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdf_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdf_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_9d1247, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdf_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rdd32 = memd($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdfnew_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdfnew_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_9d1247, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdfnew_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rdd32 = memd($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdt_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdt_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_9d1247, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdt_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rdd32 = memd($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdtnew_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdtnew_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_9d1247, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdtnew_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rdd32 = memd($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memh($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memh($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrht_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memh($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrht_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memh($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrht_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memh($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memh($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrif_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrif_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_b97f71, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrif_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memw($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrifnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrifnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_b97f71, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrifnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memw($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrit_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if ($Pt4) $Rd32 = memw($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrit_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if ($Pt4) $Rd32 = memw($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_b97f71, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrit_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memw($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadritnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadritnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_b97f71, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadritnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memw($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memub($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memub($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubt_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubt_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubt_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memub($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_f4413a, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memub($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memuh($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memuh($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruht_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh($Rs32+#$Ii)",
+tc_14da557c, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruht_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh($Rx32++#$Ii)",
+tc_ae762521, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruht_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memuh($Rs32)",
+tc_14da557c, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
+tc_65dc7cc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
+tc_e578178f, TypeLD>, Enc_733b27, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memuh($Rs32)",
+tc_65dc7cc4, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
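+// The L4_*_memop* defs below are Hexagon's read-modify-write "memops":
+// each def both loads and stores (mayLoad and mayStore are set). The
+// register forms apply $Rt32 to memory in place; the immediate "i" forms
+// take a u5 immediate (#$II), with iand/ior printed as clrbit/setbit.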
+def L4_add_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) += $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_d44e31 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_add_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) += $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) += $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_163a3c {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_add_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) += $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) += $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_226535 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_add_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) += $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) &= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_d44e31 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_and_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) &= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) &= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_163a3c {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_and_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) &= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) &= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_226535 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_and_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) &= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) += #$II",
+tc_da79106e, TypeV4LDST>, Enc_46c951 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_iadd_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) += #$II",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) += #$II",
+tc_da79106e, TypeV4LDST>, Enc_e66a97 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_iadd_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) += #$II",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) += #$II",
+tc_da79106e, TypeV4LDST>, Enc_84b2cd {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_iadd_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) += #$II",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) = clrbit(#$II)",
+tc_da79106e, TypeV4LDST>, Enc_46c951 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_iand_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) = clrbit(#$II)",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) = clrbit(#$II)",
+tc_da79106e, TypeV4LDST>, Enc_e66a97 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_iand_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) = clrbit(#$II)",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) = clrbit(#$II)",
+tc_da79106e, TypeV4LDST>, Enc_84b2cd {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_iand_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) = clrbit(#$II)",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_ior_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) = setbit(#$II)",
+tc_da79106e, TypeV4LDST>, Enc_46c951 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ior_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) = setbit(#$II)",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_ior_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) = setbit(#$II)",
+tc_da79106e, TypeV4LDST>, Enc_e66a97 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_ior_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) = setbit(#$II)",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_ior_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) = setbit(#$II)",
+tc_da79106e, TypeV4LDST>, Enc_84b2cd {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_ior_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) = setbit(#$II)",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_isub_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) -= #$II",
+tc_da79106e, TypeV4LDST>, Enc_46c951 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_isub_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) -= #$II",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_isub_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) -= #$II",
+tc_da79106e, TypeV4LDST>, Enc_e66a97 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_isub_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) -= #$II",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_isub_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) -= #$II",
+tc_da79106e, TypeV4LDST>, Enc_84b2cd {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_isub_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) -= #$II",
+tc_da79106e, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
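+// The L4 loads that follow cover the remaining addressing modes: _ap is
+// absolute-set (addrMode = AbsoluteSet; the effective address is also
+// written to $Re32), _ur is scaled register plus long immediate
+// (BaseLongOffset), and _rr is base plus scaled register (BaseRegOffset).
+// memb_fifo/memh_fifo shift loaded bytes/halfwords into the 64-bit
+// accumulator $Ryy32; membh/memubh load byte pairs sign-/zero-extended
+// to halfwords; memd_locked is the load-locked half of a locked
+// sequence (note the isSoloAX packet restriction).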
+def L4_loadalignb_ap : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
+(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
+"$Ryy32 = memb_fifo($Re32=#$II)",
+tc_261d9b78, TypeLD>, Enc_f394d3 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010100;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadalignb_ur : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Ryy32 = memb_fifo($Rt32<<#$Ii+#$II)",
+tc_baccf077, TypeLD>, Enc_04c959 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100100;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 4;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadalignh_ap : HInst<
+(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
+(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
+"$Ryy32 = memh_fifo($Re32=#$II)",
+tc_261d9b78, TypeLD>, Enc_f394d3 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010010;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadalignh_ur : HInst<
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Ryy32 = memh_fifo($Rt32<<#$Ii+#$II)",
+tc_baccf077, TypeLD>, Enc_04c959 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100010;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 4;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in";
+}
+def L4_loadbsw2_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = membh($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_323f2d {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw2_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = membh($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_4f677b {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw4_ap : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = membh($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_7fa7f6 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010111;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw4_ur : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = membh($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_6185fe {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100111;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw2_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memubh($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_323f2d {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw2_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memubh($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_4f677b {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw4_ap : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = memubh($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_7fa7f6 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010101;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw4_ur : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = memubh($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_6185fe {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100101;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadd_locked : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memd_locked($Rs32)",
+tc_29c14515, TypeLD>, Enc_3a3d62 {
+let Inst{13-5} = 0b010000000;
+let Inst{31-21} = 0b10010010000;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let isSoloAX = 1;
+}
+def L4_loadrb_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memb($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_323f2d {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrb_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+tc_5625c6c1, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+let isPredicable = 1;
+}
+def L4_loadrb_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memb($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrd_ap : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = memd($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_7fa7f6 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011110;
+let hasNewValue = 1;
+let opNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrd_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+tc_5625c6c1, TypeLD>, Enc_84bff1, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010110;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+let isPredicable = 1;
+}
+def L4_loadrd_ur : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = memd($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_6185fe, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101110;
+let addrMode = BaseLongOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrh_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memh($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_323f2d {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrh_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+tc_5625c6c1, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+let isPredicable = 1;
+}
+def L4_loadrh_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memh($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadri_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memw($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_323f2d {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadri_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+tc_5625c6c1, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+let isPredicable = 1;
+}
+def L4_loadri_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memw($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrub_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memub($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_323f2d {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrub_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+tc_5625c6c1, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+let isPredicable = 1;
+}
+def L4_loadrub_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memub($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadruh_ap : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memuh($Re32=#$II)",
+tc_b5f5a094, TypeLD>, Enc_323f2d {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadruh_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+tc_5625c6c1, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+let isPredicable = 1;
+}
+def L4_loadruh_ur : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memuh($Rt32<<#$Ii+#$II)",
+tc_7d9a56cd, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_or_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) |= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_d44e31 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_or_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) |= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_or_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) |= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_163a3c {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_or_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) |= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_or_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) |= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_226535 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_or_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) |= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
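+// Predicated L4 loads follow: the _abs forms load from an extended
+// absolute address (addrMode = Absolute, DecoderNamespace "MustExtend"),
+// while the _rr forms use base plus scaled register under predicate
+// $Pv4, with the same t/f/new predicate naming as the L2 variants above.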
+def L4_ploadrbf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrbfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrbt_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbt_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrbtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110010000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr";
+}
+def L4_ploadrdf_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2a7b91, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdf_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_98c0b8, AddrModeRel {
+let Inst{31-21} = 0b00110001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrdfnew_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2a7b91, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdfnew_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_98c0b8, AddrModeRel {
+let Inst{31-21} = 0b00110011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrdt_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2a7b91, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdt_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_98c0b8, AddrModeRel {
+let Inst{31-21} = 0b00110000110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrdtnew_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2a7b91, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdtnew_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_98c0b8, AddrModeRel {
+let Inst{31-21} = 0b00110010110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrhf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110001010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrhfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrht_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memh(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrht_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110000010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrhtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110010010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrif_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrif_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrifnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrifnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrit_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memw(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrit_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadritnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadritnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110010100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrubf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubt_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubt_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110010001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadruhf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruhfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruht_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh(#$Ii)",
+tc_136c4786, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruht_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+tc_9dafb7d3, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110000011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruhtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh(#$Ii)",
+tc_b5f5a094, TypeLD>, Enc_2301d6, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+tc_128719e8, TypeLD>, Enc_2e1979, AddrModeRel {
+let Inst{31-21} = 0b00110010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
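+// dealloc_return variants: restore the saved frame through R30 and return,
+// hence mayLoad together with isReturn and Defs of PC/R29/R30/R31. The
+// :t/:nt assembly suffixes on the dot-new forms are taken/not-taken hints.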
+def L4_return : HInst<
+(outs),
+(ins),
+"dealloc_return",
+tc_dcfee7ae, TypeLD>, Enc_3a3d62, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isBarrier = 1;
+let isPredicable = 1;
+let isTaken = 1;
+}
+def L4_return_f : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4) dealloc_return",
+tc_9ce7a5ab, TypeLD>, Enc_b7fad3, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1100;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_fnew_pnt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4.new) dealloc_return:nt",
+tc_3993c58b, TypeLD>, Enc_b7fad3, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_fnew_pt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4.new) dealloc_return:t",
+tc_3993c58b, TypeLD>, Enc_b7fad3, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1110;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_t : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4) dealloc_return",
+tc_9ce7a5ab, TypeLD>, Enc_b7fad3, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0100;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_tnew_pnt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4.new) dealloc_return:nt",
+tc_3993c58b, TypeLD>, Enc_b7fad3, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0010;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_tnew_pt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4.new) dealloc_return:t",
+tc_3993c58b, TypeLD>, Enc_b7fad3, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0110;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
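+// Memop subtract forms: read-modify-write on memory ("mem[bhw](...) -= Rt"),
+// so each _io variant sets both mayLoad and mayStore. The _zomap forms are
+// codegen-only pseudos (TypeMAPPING) for the zero-offset syntax.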
+def L4_sub_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) -= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_d44e31 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_sub_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) -= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_sub_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) -= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_163a3c {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_sub_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) -= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_sub_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) -= $Rt32",
+tc_a9c993d9, TypeV4LDST>, Enc_226535 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let mayStore = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_sub_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) -= $Rt32",
+tc_a9c993d9, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
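+// M-type multiply-accumulate definitions begin here; the Constraints field
+// ties each accumulator output to its input operand ("$Rx32 = $Rx32in").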
+def M2_acci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += add($Rs32,$Rt32)",
+tc_c0cd91a8, TypeM>, Enc_2ae154, ImmRegRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_acci";
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_accii : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 += add($Rs32,#$Ii)",
+tc_c0cd91a8, TypeM>, Enc_c90aca, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_acci";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
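+// Complex multiplies (cmpy): a trailing "*" on $Rt32 marks the conjugate
+// variant, _s1 encodings add the ":<<1" shift, and saturating forms define
+// the sticky overflow bit USR_OVF.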
+def M2_cmaci_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpyi($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacr_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpyr($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32):sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacsc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32*):sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacsc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32*):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmpyi_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpyi($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_cmpyr_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpyr($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_cmpyrs_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrs_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrsc_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32*):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrsc_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32*):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpys_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32):sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpysc_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32*):sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpysc_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32*):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cnacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32):sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacsc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32*):sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacsc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32*):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_nac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_rnd_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_dpmpyss_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_dpmpyuu_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyuu_nac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyuu_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M2_hmmpyh_rs1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.h):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.h):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyl_rs1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.l):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.l):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_maci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyi($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_2ae154, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_maci";
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_macsin : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rx32 -= mpyi($Rs32,#$Ii)",
+tc_a12a5971, TypeM>, Enc_c90aca {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_macsip : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rx32 += mpyi($Rs32,#$Ii)",
+tc_a12a5971, TypeM>, Enc_c90aca, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_maci";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
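+// Vector word-by-halfword multiplies: vmpyweh/vmpywoh (and the unsigned
+// vmpyweuh/vmpywouh) use the even/odd halfwords of $Rtt32; _rs* variants
+// round and saturate (":rnd:sat") while _s* variants only saturate.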
+def M2_mmachs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmpyh_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
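+// Scalar halfword multiplies: the .h/.l operand suffixes select the upper
+// or lower 16 bits of each source, _s1 variants shift the product left by
+// one (":<<1"), and the sat_* accumulating forms also define USR_OVF.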
+def M2_mpy_acc_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_nac_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_rnd_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_sat_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_up_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_up_s1_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpyd_acc_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_nac_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_rnd_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyi($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_5ab2be, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_mpyi";
+let InputType = "reg";
+}
+def M2_mpysin : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Rd32 = -mpyi($Rs32,#$Ii)",
+tc_ae2c2dc2, TypeM>, Enc_b8c967 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpysip : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rd32 = +mpyi($Rs32,#$Ii)",
+tc_ae2c2dc2, TypeM>, Enc_b8c967 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def M2_mpysmi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, m32_0Imm:$Ii),
+"$Rd32 = mpyi($Rs32,#$Ii)",
+tc_ae2c2dc2, TypeM>, ImmRegRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "M2_mpyi";
+let InputType = "imm";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def M2_mpysu_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpysu($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_acc_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_nac_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_acc_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.h)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.h):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.l)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.l):<<1",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_nac_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.h)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.l)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyui : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyui($Rs32,$Rt32)",
+tc_8c8041e6, TypeM> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def M2_nacci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= add($Rs32,$Rt32)",
+tc_c0cd91a8, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_naccii : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 -= add($Rs32,#$Ii)",
+tc_c0cd91a8, TypeM>, Enc_c90aca {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100010100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_subacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rx32 += sub($Rt32,$Rs32)",
+tc_c0cd91a8, TypeM>, Enc_a568d4 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_vabsdiffh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffh($Rtt32,$Rss32)",
+tc_63cd9d2d, TypeM>, Enc_ea23e4 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+}
+def M2_vabsdiffw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffw($Rtt32,$Rss32)",
+tc_63cd9d2d, TypeM>, Enc_ea23e4 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+}
+def M2_vcmac_s0_sat_i : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vcmpyi($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vcmac_s0_sat_r : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vcmpyr($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vcmpy_s0_sat_i : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyi($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s0_sat_r : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyr($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s1_sat_i : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyi($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s1_sat_r : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyr($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpy($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vdmacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpy($Rss32,$Rtt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vdmpyrs_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vdmpy($Rss32,$Rtt32):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_d2216a {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpyrs_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vdmpy($Rss32,$Rtt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_d2216a {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpys_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpy($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpy($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmac2 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2s_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32):sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2s_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2su_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyhsu($Rs32,$Rt32):sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2su_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyhsu($Rs32,$Rt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmpy2es_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyeh($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2es_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyeh($Rss32,$Rtt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyh($Rs32,$Rt32):sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s0pack : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vmpyh($Rs32,$Rt32):rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyh($Rs32,$Rt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s1pack : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vmpyh($Rs32,$Rt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM>, Enc_5ab2be {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2su_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyhsu($Rs32,$Rt32):sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2su_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyhsu($Rs32,$Rt32):<<1:sat",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vraddh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vraddh($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_d2216a {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_vradduh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vradduh($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_d2216a {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_vrcmaci_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyi($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmaci_s0c : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyi($Rss32,$Rtt32*)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmacr_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyr($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmacr_s0c : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyr($Rss32,$Rtt32*)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpyi_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyi($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyi_s0c : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyi($Rss32,$Rtt32*)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyr_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyr($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyr_s0c : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyr($Rss32,$Rtt32*)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpys_acc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += vrcmpys($Rss32,$Rt32):<<1:sat",
+tc_8cb685d9, TypeM> {
+let isPseudo = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_acc_s1_h : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_acc_s1_l : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vrcmpys($Rss32,$Rt32):<<1:sat",
+tc_8c8041e6, TypeM> {
+let isPseudo = 1;
+}
+def M2_vrcmpys_s1_h : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1_l : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1rp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = vrcmpys($Rss32,$Rt32):<<1:rnd:sat",
+tc_8c8041e6, TypeM> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def M2_vrcmpys_s1rp_h : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:hi",
+tc_8c8041e6, TypeM>, Enc_d2216a {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1rp_l : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:lo",
+tc_8c8041e6, TypeM>, Enc_d2216a {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrmac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyh($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrmpy_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyh($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_xor_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= xor($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= and($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_andn : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= and($Rs32,~$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= or($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_xor : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= xor($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_cmpyi_wh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyiwh($Rss32,$Rt32):<<1:rnd:sat",
+tc_8c8041e6, TypeS_3op>, Enc_3d5b28 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyi_whc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
+tc_8c8041e6, TypeS_3op>, Enc_3d5b28, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyr_wh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyrwh($Rss32,$Rt32):<<1:rnd:sat",
+tc_8c8041e6, TypeS_3op>, Enc_3d5b28 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyr_whc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
+tc_8c8041e6, TypeS_3op>, Enc_3d5b28, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_mac_up_s1_sat : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32,$Rt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_mpyri_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii, IntRegs:$Rs32, u6_0Imm:$II),
+"$Rd32 = add(#$Ii,mpyi($Rs32,#$II))",
+tc_a12a5971, TypeALU64>, Enc_322e1b, ImmRegRel {
+let Inst{31-24} = 0b11011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyri_addr";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyri_addr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Ru32, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rd32 = add($Ru32,mpyi($Rs32,#$Ii))",
+tc_a12a5971, TypeALU64>, Enc_420cf3, ImmRegRel {
+let Inst{31-23} = 0b110111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyri_addr";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyri_addr_u2 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Ru32, u6_2Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = add($Ru32,mpyi(#$Ii,$Rs32))",
+tc_69bb508b, TypeALU64>, Enc_277737 {
+let Inst{31-23} = 0b110111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M4_mpyrr_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add(#$Ii,mpyi($Rs32,$Rt32))",
+tc_8cb685d9, TypeALU64>, Enc_a7b8e8, ImmRegRel {
+let Inst{31-23} = 0b110101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyrr_addr";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyrr_addr : HInst<
+(outs IntRegs:$Ry32),
+(ins IntRegs:$Ru32, IntRegs:$Ry32in, IntRegs:$Rs32),
+"$Ry32 = add($Ru32,mpyi($Ry32in,$Rs32))",
+tc_8cb685d9, TypeM>, Enc_7f1a05, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyrr_addr";
+let InputType = "reg";
+let Constraints = "$Ry32 = $Ry32in";
+}
+def M4_nac_up_s1_sat : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32,$Rt32):<<1:sat",
+tc_8cb685d9, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= and($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_andn : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= and($Rs32,~$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= or($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_xor : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= xor($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_pmpyw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = pmpyw($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M4_pmpyw_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 ^= pmpyw($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vpmpyh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vpmpyh($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101110;
+let prefersSlot3 = 1;
+}
+def M4_vpmpyh_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 ^= vpmpyh($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyweh($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_acc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyweh($Rss32,$Rtt32):<<1",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyweh($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyeh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyweh($Rss32,$Rtt32):<<1",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyoh_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpywoh($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyoh_acc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpywoh($Rss32,$Rtt32):<<1",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyoh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpywoh($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyoh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpywoh($Rss32,$Rtt32):<<1",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+}
+def M4_xor_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= and($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_andn : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= and($Rs32,~$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= or($Rs32,$Rt32)",
+tc_3c10f809, TypeM>, Enc_2ae154 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 ^= xor($Rss32,$Rtt32)",
+tc_3c10f809, TypeS_3op>, Enc_88c16c {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vdmacbsu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
+tc_8cb685d9, TypeM>, Enc_88c16c, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vdmpybsu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
+tc_8c8041e6, TypeM>, Enc_a56825, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M5_vmacbsu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpybsu($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vmacbuu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpybu($Rs32,$Rt32)",
+tc_8cb685d9, TypeM>, Enc_61f0b0 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vmpybsu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpybsu($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M5_vmpybuu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpybu($Rs32,$Rt32)",
+tc_8c8041e6, TypeM>, Enc_be32a5 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+}
+def M5_vrmacbsu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpybsu($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vrmacbuu : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpybu($Rss32,$Rtt32)",
+tc_8cb685d9, TypeM>, Enc_88c16c {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vrmpybsu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpybsu($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+}
+def M5_vrmpybuu : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpybu($Rss32,$Rtt32)",
+tc_8c8041e6, TypeM>, Enc_a56825 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+}
+def M6_vabsdiffb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffb($Rtt32,$Rss32)",
+tc_faab1248, TypeM>, Enc_ea23e4, Requires<[HasV62T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+}
+def M6_vabsdiffub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffub($Rtt32,$Rss32)",
+tc_faab1248, TypeM>, Enc_ea23e4, Requires<[HasV62T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+}
+def PS_loadrbabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memb(#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_loadrdabs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins u29_3Imm:$Ii),
+"$Rdd32 = memd(#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_509701, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def PS_loadrhabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memh(#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_loadriabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u30_2Imm:$Ii),
+"$Rd32 = memw(#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def PS_loadrubabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memub(#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_loadruhabs : HInst<
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memuh(#$Ii)",
+tc_70cabf66, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtended = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerbabs : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb(#$Ii) = $Rt32",
+tc_c14739d5, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
+let Inst{24-21} = 0b0000;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_storerbnewabs : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Nt8),
+"memb(#$Ii) = $Nt8.new",
+tc_9e86015f, TypeV2LDST>, Enc_ad1831, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let opNewValue = 1;
+}
+def PS_storerdabs : HInst<
+(outs),
+(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd(#$Ii) = $Rtt32",
+tc_c14739d5, TypeV2LDST>, Enc_5c124a, AddrModeRel {
+let Inst{24-21} = 0b0110;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def PS_storerfabs : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(#$Ii) = $Rt32.h",
+tc_c14739d5, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerhabs : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(#$Ii) = $Rt32",
+tc_c14739d5, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerhnewabs : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Nt8),
+"memh(#$Ii) = $Nt8.new",
+tc_9e86015f, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let opNewValue = 1;
+}
+def PS_storeriabs : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw(#$Ii) = $Rt32",
+tc_c14739d5, TypeV2LDST>, Enc_541f26, AddrModeRel {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def PS_storerinewabs : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Nt8),
+"memw(#$Ii) = $Nt8.new",
+tc_9e86015f, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
+def S2_addasl_rrri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32, u3_0Imm:$Ii),
+"$Rd32 = addasl($Rt32,$Rs32,#$Ii)",
+tc_090485bb, TypeS_3op>, Enc_47ef61 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_allocframe : HInst<
+(outs),
+(ins u11_3Imm:$Ii),
+"allocframe(#$Ii)",
+tc_0cb867f2, TypeST>, Enc_22c845 {
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10100000100;
+let Inst{20-16} = 0b11101;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [R29, R30, R31];
+let Defs = [R29, R30];
+}
+def S2_asl_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asl($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_5eac98 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_asl_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += asl($Rss32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= asl($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= asl($Rss32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= asl($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= asl($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asl($Rs32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asl_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += asl($Rs32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= asl($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= asl($Rs32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= asl($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asl($Rs32,#$Ii):sat",
+tc_47ab9233, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_asl_i_r_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= asl($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vaslh($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_12b6e9 {
+let Inst{7-5} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_asl_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vaslw($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_7e5a82 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_asl_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = asl($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_asl_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += asl($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= asl($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= asl($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= asl($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= asl($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asl($Rs32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asl_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += asl($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= asl($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= asl($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= asl($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asl($Rs32,$Rt32):sat",
+tc_47ab9233, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_asl_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vaslh($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_asl_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vaslw($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_asr_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asr($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_5eac98 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_asr_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += asr($Rss32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= asr($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= asr($Rss32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= asr($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_rnd : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asr($Rss32,#$Ii):rnd",
+tc_63cd9d2d, TypeS_2op>, Enc_5eac98, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b10000000110;
+let prefersSlot3 = 1;
+}
+def S2_asr_i_p_rnd_goodsyntax : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asrrnd($Rss32,#$Ii)",
+tc_63cd9d2d, TypeS_2op>, Requires<[HasV5T]> {
+let isPseudo = 1;
+}
+def S2_asr_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asr($Rs32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += asr($Rs32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= asr($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= asr($Rs32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= asr($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_rnd : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asr($Rs32,#$Ii):rnd",
+tc_63cd9d2d, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_asr_i_r_rnd_goodsyntax : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asrrnd($Rs32,#$Ii)",
+tc_63cd9d2d, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def S2_asr_i_svw_trun : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rd32 = vasrw($Rss32,#$Ii)",
+tc_7ca2ea10, TypeS_2op>, Enc_8dec2e {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_asr_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vasrh($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_12b6e9 {
+let Inst{7-5} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_asr_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vasrw($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_7e5a82 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_asr_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = asr($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_asr_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += asr($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= asr($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= asr($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= asr($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= asr($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asr($Rs32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += asr($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= asr($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= asr($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= asr($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asr($Rs32,$Rt32):sat",
+tc_47ab9233, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_asr_r_svw_trun : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = vasrw($Rss32,$Rt32)",
+tc_7ca2ea10, TypeS_3op>, Enc_3d5b28 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_asr_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vasrh($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_asr_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vasrw($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_brev : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = brev($Rs32)",
+tc_ab1b5e74, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_brevp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = brev($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000110;
+let prefersSlot3 = 1;
+}
+def S2_cabacdecbin : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = decbin($Rss32,$Rtt32)",
+tc_5d806107, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+let Defs = [P0];
+}
+def S2_cl0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = cl0($Rs32)",
+tc_ab1b5e74, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_cl0p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = cl0($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_cl1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = cl1($Rs32)",
+tc_ab1b5e74, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_cl1p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = cl1($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_clb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = clb($Rs32)",
+tc_ab1b5e74, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_clbnorm : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = normamt($Rs32)",
+tc_ab1b5e74, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_clbp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = clb($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_clrbit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = clrbit($Rs32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clrbit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = clrbit($Rs32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = ct0($Rs32)",
+tc_ab1b5e74, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_ct0p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = ct0($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_ct1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = ct1($Rs32)",
+tc_ab1b5e74, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_ct1p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = ct1($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_deinterleave : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = deinterleave($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000110;
+let prefersSlot3 = 1;
+}
+def S2_extractu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rd32 = extractu($Rs32,#$Ii,#$II)",
+tc_c0cd91a8, TypeS_2op>, Enc_b388cf {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_extractu_rp : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rd32 = extractu($Rs32,$Rtt32)",
+tc_87601822, TypeS_3op>, Enc_e07374 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_extractup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rdd32 = extractu($Rss32,#$Ii,#$II)",
+tc_c0cd91a8, TypeS_2op>, Enc_b84c4c {
+let Inst{31-24} = 0b10000001;
+let prefersSlot3 = 1;
+}
+def S2_extractup_rp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = extractu($Rss32,$Rtt32)",
+tc_87601822, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+let prefersSlot3 = 1;
+}
+def S2_insert : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = insert($Rs32,#$Ii,#$II)",
+tc_d95f4e98, TypeS_2op>, Enc_a1e29d {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_insert_rp : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rx32 = insert($Rs32,$Rtt32)",
+tc_3c10f809, TypeS_3op>, Enc_179b35 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_insertp : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rxx32 = insert($Rss32,#$Ii,#$II)",
+tc_d95f4e98, TypeS_2op>, Enc_143a3c {
+let Inst{31-24} = 0b10000011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_insertp_rp : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 = insert($Rss32,$Rtt32)",
+tc_3c10f809, TypeS_3op>, Enc_88c16c {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_interleave : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = interleave($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000110;
+let prefersSlot3 = 1;
+}
+def S2_lfsp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = lfs($Rss32,$Rtt32)",
+tc_87601822, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+let prefersSlot3 = 1;
+}
+def S2_lsl_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = lsl($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_lsl_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += lsl($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= lsl($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= lsl($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= lsl($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= lsl($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = lsl($Rs32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsl_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += lsl($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= lsl($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= lsl($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= lsl($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlslh($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_lsl_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlslw($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_lsr_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = lsr($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_5eac98 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_lsr_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += lsr($Rss32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= lsr($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= lsr($Rss32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= lsr($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= lsr($Rss32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_70fb07 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = lsr($Rs32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsr_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += lsr($Rs32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= lsr($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= lsr($Rs32,#$Ii)",
+tc_c0cd91a8, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= lsr($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= lsr($Rs32,#$Ii)",
+tc_3c10f809, TypeS_2op>, Enc_28a2dc {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vlsrh($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_12b6e9 {
+let Inst{7-5} = 0b001;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_lsr_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vlsrw($Rss32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_7e5a82 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_lsr_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = lsr($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_lsr_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += lsr($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= lsr($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= lsr($Rss32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= lsr($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= lsr($Rss32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = lsr($Rs32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsr_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += lsr($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= lsr($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= lsr($Rs32,$Rt32)",
+tc_c0cd91a8, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= lsr($Rs32,$Rt32)",
+tc_3c10f809, TypeS_3op>, Enc_2ae154 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlsrh($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_lsr_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlsrw($Rss32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_packhl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = packhl($Rs32,$Rt32)",
+tc_548f402d, TypeALU32_3op>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110101100;
+let InputType = "reg";
+}
+def S2_parityp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = parity($Rss32,$Rtt32)",
+tc_87601822, TypeALU64>, Enc_d2216a {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_pstorerbf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32+#$Ii) = $Rt32",
+tc_3d905451, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S2_pstorerbf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rx32++#$Ii) = $Rt32",
+tc_9b73d261, TypeST>, Enc_cc449f, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32) = $Rt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerbfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rx32++#$Ii) = $Rt32",
+tc_7675c0e9, TypeST>, Enc_cc449f, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32+#$Ii) = $Nt8.new",
+tc_9da3628f, TypeV2LDST>, Enc_585242, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S2_pstorerbnewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rx32++#$Ii) = $Nt8.new",
+tc_e2480a7f, TypeST>, Enc_52a5dd, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32) = $Nt8.new",
+tc_9da3628f, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerbnewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
+tc_8fab9ac3, TypeST>, Enc_52a5dd, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32+#$Ii) = $Nt8.new",
+tc_9da3628f, TypeV2LDST>, Enc_585242, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S2_pstorerbnewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rx32++#$Ii) = $Nt8.new",
+tc_e2480a7f, TypeST>, Enc_52a5dd, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32) = $Nt8.new",
+tc_9da3628f, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerbnewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
+tc_8fab9ac3, TypeST>, Enc_52a5dd, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32+#$Ii) = $Rt32",
+tc_3d905451, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S2_pstorerbt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rx32++#$Ii) = $Rt32",
+tc_9b73d261, TypeST>, Enc_cc449f, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32) = $Rt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerbtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rx32++#$Ii) = $Rt32",
+tc_7675c0e9, TypeST>, Enc_cc449f, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32+#$Ii) = $Rtt32",
+tc_3d905451, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S2_pstorerdf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rx32++#$Ii) = $Rtt32",
+tc_9b73d261, TypeST>, Enc_9a33d5, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32) = $Rtt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerdfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
+tc_7675c0e9, TypeST>, Enc_9a33d5, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32+#$Ii) = $Rtt32",
+tc_3d905451, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S2_pstorerdt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rx32++#$Ii) = $Rtt32",
+tc_9b73d261, TypeST>, Enc_9a33d5, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32) = $Rtt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerdtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
+tc_7675c0e9, TypeST>, Enc_9a33d5, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerff_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32.h",
+tc_3d905451, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerff_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32.h",
+tc_9b73d261, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerff_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32) = $Rt32.h",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerffnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
+tc_7675c0e9, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerft_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+#$Ii) = $Rt32.h",
+tc_3d905451, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000011;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerft_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rx32++#$Ii) = $Rt32.h",
+tc_9b73d261, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerft_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32) = $Rt32.h",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerftnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
+tc_7675c0e9, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32",
+tc_3d905451, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerhf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32",
+tc_9b73d261, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32) = $Rt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerhfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32",
+tc_7675c0e9, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Nt8.new",
+tc_9da3628f, TypeV2LDST>, Enc_f44229, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S2_pstorerhnewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Nt8.new",
+tc_e2480a7f, TypeST>, Enc_31aa6a, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32) = $Nt8.new",
+tc_9da3628f, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerhnewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
+tc_8fab9ac3, TypeST>, Enc_31aa6a, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32+#$Ii) = $Nt8.new",
+tc_9da3628f, TypeV2LDST>, Enc_f44229, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S2_pstorerhnewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rx32++#$Ii) = $Nt8.new",
+tc_e2480a7f, TypeST>, Enc_31aa6a, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32) = $Nt8.new",
+tc_9da3628f, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerhnewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
+tc_8fab9ac3, TypeST>, Enc_31aa6a, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerht_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+#$Ii) = $Rt32",
+tc_3d905451, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerht_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rx32++#$Ii) = $Rt32",
+tc_9b73d261, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerht_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32) = $Rt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerhtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32",
+tc_7675c0e9, TypeST>, Enc_b886fd, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerif_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32+#$Ii) = $Rt32",
+tc_3d905451, TypeV2LDST>, Enc_397f23, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S2_pstorerif_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rx32++#$Ii) = $Rt32",
+tc_9b73d261, TypeST>, Enc_7eaeb6, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerif_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32) = $Rt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerifnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rx32++#$Ii) = $Rt32",
+tc_7675c0e9, TypeST>, Enc_7eaeb6, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32+#$Ii) = $Nt8.new",
+tc_9da3628f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S2_pstorerinewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rx32++#$Ii) = $Nt8.new",
+tc_e2480a7f, TypeST>, Enc_65f095, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32) = $Nt8.new",
+tc_9da3628f, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerinewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
+tc_8fab9ac3, TypeST>, Enc_65f095, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32+#$Ii) = $Nt8.new",
+tc_9da3628f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S2_pstorerinewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rx32++#$Ii) = $Nt8.new",
+tc_e2480a7f, TypeST>, Enc_65f095, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32) = $Nt8.new",
+tc_9da3628f, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerinewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
+tc_8fab9ac3, TypeST>, Enc_65f095, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerit_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32+#$Ii) = $Rt32",
+tc_3d905451, TypeV2LDST>, Enc_397f23, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000100;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S2_pstorerit_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rx32++#$Ii) = $Rt32",
+tc_9b73d261, TypeST>, Enc_7eaeb6, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerit_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32) = $Rt32",
+tc_3d905451, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstoreritnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rx32++#$Ii) = $Rt32",
+tc_7675c0e9, TypeST>, Enc_7eaeb6, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_setbit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = setbit($Rs32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_setbit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = setbit($Rs32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_shuffeb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = shuffeb($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffeh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = shuffeh($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffob : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = shuffob($Rtt32,$Rss32)",
+tc_9c18c9a5, TypeS_3op>, Enc_ea23e4 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffoh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = shuffoh($Rtt32,$Rss32)",
+tc_9c18c9a5, TypeS_3op>, Enc_ea23e4 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_storerb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) = $Rt32",
+tc_53ee6546, TypeST>, Enc_448f7f, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def S2_storerb_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++$Mu2:brev) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111000;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pbr";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+tc_251c87b2, TypeST>, Enc_b15941 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++I:circ($Mu2)) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rx32++#$Ii) = $Rt32",
+tc_20a8e109, TypeST>, Enc_10bc21, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++$Mu2) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) = $Rt32",
+tc_53ee6546, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerbgp : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb(gp+#$Ii) = $Rt32",
+tc_c14739d5, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
+let Inst{24-21} = 0b0000;
+let Inst{31-27} = 0b01001;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def S2_storerbnew_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rs32+#$Ii) = $Nt8.new",
+tc_6c576d46, TypeST>, Enc_4df4e9, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S2_storerbnew_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++$Mu2:brev) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101111101;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pbr";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+tc_9c68db63, TypeST>, Enc_96ce4f {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [CS];
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++I:circ($Mu2)) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [CS];
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rx32++#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_c7cd90, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++$Mu2) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerbnew_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memb($Rs32) = $Nt8.new",
+tc_6c576d46, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def S2_storerbnewgp : HInst<
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Nt8),
+"memb(gp+#$Ii) = $Nt8.new",
+tc_9e86015f, TypeV2LDST>, Enc_ad1831, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let opNewValue = 1;
+}
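+// Doubleword stores: 64-bit register pairs $Rtt32 written with memd.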
+def S2_storerd_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rs32+#$Ii) = $Rtt32",
+tc_53ee6546, TypeST>, Enc_ce6828, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+}
+def S2_storerd_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++$Mu2:brev) = $Rtt32",
+tc_20a8e109, TypeST>, Enc_928ca1 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111110;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++#$Ii:circ($Mu2)) = $Rtt32",
+tc_251c87b2, TypeST>, Enc_395cc4 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++I:circ($Mu2)) = $Rtt32",
+tc_20a8e109, TypeST>, Enc_928ca1 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rx32++#$Ii) = $Rtt32",
+tc_20a8e109, TypeST>, Enc_85bf58, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++$Mu2) = $Rtt32",
+tc_20a8e109, TypeST>, Enc_928ca1 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerd_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd($Rs32) = $Rtt32",
+tc_53ee6546, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerdgp : HInst<
+(outs),
+(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd(gp+#$Ii) = $Rtt32",
+tc_c14739d5, TypeV2LDST>, Enc_5c124a, AddrModeRel {
+let Inst{24-21} = 0b0110;
+let Inst{31-27} = 0b01001;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerdabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
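+// "rf" stores write the high halfword of the source register ($Rt32.h).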
+def S2_storerf_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) = $Rt32.h",
+tc_53ee6546, TypeST>, Enc_e957fb, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def S2_storerf_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2:brev) = $Rt32.h",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111011;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32.h",
+tc_251c87b2, TypeST>, Enc_935d9b {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++I:circ($Mu2)) = $Rt32.h",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rx32++#$Ii) = $Rt32.h",
+tc_20a8e109, TypeST>, Enc_052c7d, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2) = $Rt32.h",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerf_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) = $Rt32.h",
+tc_53ee6546, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerfgp : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(gp+#$Ii) = $Rt32.h",
+tc_c14739d5, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerfabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
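+// Halfword stores.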
+def S2_storerh_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) = $Rt32",
+tc_53ee6546, TypeST>, Enc_e957fb, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def S2_storerh_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2:brev) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111010;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pbr";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+tc_251c87b2, TypeST>, Enc_935d9b {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++I:circ($Mu2)) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rx32++#$Ii) = $Rt32",
+tc_20a8e109, TypeST>, Enc_052c7d, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerh_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) = $Rt32",
+tc_53ee6546, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerhgp : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(gp+#$Ii) = $Rt32",
+tc_c14739d5, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
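+// New-value halfword stores.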
+def S2_storerhnew_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Nt8),
+"memh($Rs32+#$Ii) = $Nt8.new",
+tc_6c576d46, TypeST>, Enc_0d8870, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let opNewValue = 2;
+}
+def S2_storerhnew_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++$Mu2:brev) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101111101;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pbr";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+tc_9c68db63, TypeST>, Enc_91b9fe {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [CS];
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++I:circ($Mu2)) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [CS];
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"memh($Rx32++#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_e26546, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b001;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let isPredicable = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++$Mu2) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerhnew_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memh($Rs32) = $Nt8.new",
+tc_6c576d46, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def S2_storerhnewgp : HInst<
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Nt8),
+"memh(gp+#$Ii) = $Nt8.new",
+tc_9e86015f, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let opNewValue = 1;
+}
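+// Word stores.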
+def S2_storeri_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) = $Rt32",
+tc_53ee6546, TypeST>, Enc_143445, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def S2_storeri_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++$Mu2:brev) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111100;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pbr";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+tc_251c87b2, TypeST>, Enc_79b8c8 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++I:circ($Mu2)) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rx32++#$Ii) = $Rt32",
+tc_20a8e109, TypeST>, Enc_db40cd, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++$Mu2) = $Rt32",
+tc_20a8e109, TypeST>, Enc_d5c73f {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storeri_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) = $Rt32",
+tc_53ee6546, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerigp : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw(gp+#$Ii) = $Rt32",
+tc_c14739d5, TypeV2LDST>, Enc_541f26, AddrModeRel {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b01001;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
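+// New-value word stores.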
+def S2_storerinew_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Nt8),
+"memw($Rs32+#$Ii) = $Nt8.new",
+tc_6c576d46, TypeST>, Enc_690862, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+let opNewValue = 2;
+}
+def S2_storerinew_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++$Mu2:brev) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101111101;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pbr";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+tc_9c68db63, TypeST>, Enc_3f97c8 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [CS];
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++I:circ($Mu2)) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [CS];
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"memw($Rx32++#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_223005, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b010;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isPredicable = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++$Mu2) = $Nt8.new",
+tc_c8f9a6f6, TypeST>, Enc_8dbe85 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerinew_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memw($Rs32) = $Nt8.new",
+tc_6c576d46, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def S2_storerinewgp : HInst<
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Nt8),
+"memw(gp+#$Ii) = $Nt8.new",
+tc_9e86015f, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let Uses = [GP];
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+let opNewValue = 1;
+}
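+// Store-conditional: memw_locked writes $Rt32 and sets $Pd4 to report whether
+// the locked store succeeded; the predicate is written late in the cycle
+// (isPredicateLate) and the instruction must issue solo (isSoloAX).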
+def S2_storew_locked : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw_locked($Rs32,$Pd4) = $Rt32",
+tc_7d01cbdc, TypeST>, Enc_c2b48e {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000101;
+let accessSize = WordAccess;
+let isPredicateLate = 1;
+let isSoloAX = 1;
+let mayStore = 1;
+}
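+// Saturate/pack operations; on saturation these set the USR overflow bit
+// (Defs = [USR_OVF]).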
+def S2_svsathb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsathb($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_svsathub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsathub($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
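+// Table-index (extract-and-insert) instructions; the _goodsyntax forms are
+// pseudos that presumably map onto the :raw encodings with adjusted
+// immediates.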
+def S2_tableidxb : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxb($Rs32,#$Ii,#$II):raw",
+tc_d95f4e98, TypeS_2op>, Enc_cd82bc {
+let Inst{31-22} = 0b1000011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxb_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxb($Rs32,#$Ii,#$II)",
+tc_d95f4e98, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxd : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxd($Rs32,#$Ii,#$II):raw",
+tc_d95f4e98, TypeS_2op>, Enc_cd82bc {
+let Inst{31-22} = 0b1000011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxd_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxd($Rs32,#$Ii,#$II)",
+tc_d95f4e98, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxh : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxh($Rs32,#$Ii,#$II):raw",
+tc_d95f4e98, TypeS_2op>, Enc_cd82bc {
+let Inst{31-22} = 0b1000011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxh_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxh($Rs32,#$Ii,#$II)",
+tc_d95f4e98, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxw : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxw($Rs32,#$Ii,#$II):raw",
+tc_d95f4e98, TypeS_2op>, Enc_cd82bc {
+let Inst{31-22} = 0b1000011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_tableidxw_goodsyntax : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxw($Rs32,#$Ii,#$II)",
+tc_d95f4e98, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
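+// Bit toggle and bit test.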
+def S2_togglebit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = togglebit($Rs32,#$Ii)",
+tc_9c18c9a5, TypeS_2op>, Enc_a05677 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_togglebit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = togglebit($Rs32,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_5ab2be {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_tstbit_i : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = tstbit($Rs32,#$Ii)",
+tc_5fa2857c, TypeS_2op>, Enc_83ee64 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101000;
+}
+def S2_tstbit_r : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = tstbit($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111000;
+}
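+// 64-bit vector operations: align/splice, conditional negate, rotate,
+// round/pack, saturate, splat, sign/zero extension, and truncation.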
+def S2_valignib : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, u3_0Imm:$Ii),
+"$Rdd32 = valignb($Rtt32,$Rss32,#$Ii)",
+tc_d1b5a4b6, TypeS_3op>, Enc_729ff7 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000000000;
+}
+def S2_valignrb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, PredRegs:$Pu4),
+"$Rdd32 = valignb($Rtt32,$Rss32,$Pu4)",
+tc_d1b5a4b6, TypeS_3op>, Enc_8c6530 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010000;
+}
+def S2_vcnegh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vcnegh($Rss32,$Rt32)",
+tc_47ab9233, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_vcrotate : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vcrotate($Rss32,$Rt32)",
+tc_63cd9d2d, TypeS_3op>, Enc_927852 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_vrcnegh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += vrcnegh($Rss32,$Rt32)",
+tc_8cb685d9, TypeS_3op>, Enc_1aa186 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_vrndpackwh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vrndwh($Rss32)",
+tc_88fa2da6, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_vrndpackwhs : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vrndwh($Rss32):sat",
+tc_94e6ffd9, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S2_vsathb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsathb($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsathb_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsathb($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsathub : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsathub($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsathub_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsathub($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsatwh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsatwh($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsatwh_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsatwh($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsatwuh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsatwuh($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsatwuh_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsatwuh($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_b9c5fb {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsplatrb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsplatb($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_5e2823 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vsplatrh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsplath($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_3a3d62 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100010;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vspliceib : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, u3_0Imm:$Ii),
+"$Rdd32 = vspliceb($Rss32,$Rtt32,#$Ii)",
+tc_d1b5a4b6, TypeS_3op>, Enc_d50cd3 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000000100;
+}
+def S2_vsplicerb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Pu4),
+"$Rdd32 = vspliceb($Rss32,$Rtt32,$Pu4)",
+tc_d1b5a4b6, TypeS_3op>, Enc_dbd70c {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010100;
+}
+def S2_vsxtbh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsxtbh($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_3a3d62 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vsxthw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsxthw($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_3a3d62 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vtrunehb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vtrunehb($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_vtrunewh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunewh($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_vtrunohb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vtrunohb($Rss32)",
+tc_b86c7e8b, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_vtrunowh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunowh($Rss32,$Rtt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_vzxtbh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vzxtbh($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_3a3d62 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vzxthw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vzxthw($Rs32)",
+tc_b86c7e8b, TypeS_2op>, Enc_3a3d62 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
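+// S4 compound ALU, bit-field extract, and bit-test operations.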
+def S4_addaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, s32_0Imm:$Ii),
+"$Rd32 = add($Rs32,add($Ru32,#$Ii))",
+tc_090485bb, TypeALU64>, Enc_8b8d61 {
+let Inst{31-23} = 0b110110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_addi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = add(#$Ii,asl($Rx32in,#$II))",
+tc_c0cd91a8, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b100;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_addi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = add(#$Ii,lsr($Rx32in,#$II))",
+tc_c0cd91a8, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b100;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_andi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = and(#$Ii,asl($Rx32in,#$II))",
+tc_3c10f809, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b000;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_andi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = and(#$Ii,lsr($Rx32in,#$II))",
+tc_3c10f809, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b000;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_clbaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s6_0Imm:$Ii),
+"$Rd32 = add(clb($Rs32),#$Ii)",
+tc_87601822, TypeS_2op>, Enc_9fae8a {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10001100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_clbpaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, s6_0Imm:$Ii),
+"$Rd32 = add(clb($Rss32),#$Ii)",
+tc_87601822, TypeS_2op>, Enc_a1640c {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_clbpnorm : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = normamt($Rss32)",
+tc_ab1b5e74, TypeS_2op>, Enc_90cd8b {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_extract : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rd32 = extract($Rs32,#$Ii,#$II)",
+tc_c0cd91a8, TypeS_2op>, Enc_b388cf {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_extract_rp : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rd32 = extract($Rs32,$Rtt32)",
+tc_87601822, TypeS_3op>, Enc_e07374 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_extractp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rdd32 = extract($Rss32,#$Ii,#$II)",
+tc_c0cd91a8, TypeS_2op>, Enc_b84c4c {
+let Inst{31-24} = 0b10001010;
+let prefersSlot3 = 1;
+}
+def S4_extractp_rp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = extract($Rss32,$Rtt32)",
+tc_87601822, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+}
+def S4_lsli : HInst<
+(outs IntRegs:$Rd32),
+(ins s6_0Imm:$Ii, IntRegs:$Rt32),
+"$Rd32 = lsl(#$Ii,$Rt32)",
+tc_9c18c9a5, TypeS_3op>, Enc_fef969 {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S4_ntstbit_i : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = !tstbit($Rs32,#$Ii)",
+tc_5fa2857c, TypeS_2op>, Enc_83ee64 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101001;
+}
+def S4_ntstbit_r : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !tstbit($Rs32,$Rt32)",
+tc_c58f771a, TypeS_3op>, Enc_c2b48e {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111001;
+}
+def S4_or_andi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 |= and($Rs32,#$Ii)",
+tc_3c10f809, TypeALU64>, Enc_b0e9d8 {
+let Inst{31-22} = 0b1101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_or_andix : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Ru32, IntRegs:$Rx32in, s32_0Imm:$Ii),
+"$Rx32 = or($Ru32,and($Rx32in,#$Ii))",
+tc_3c10f809, TypeALU64>, Enc_b4e6cf {
+let Inst{31-22} = 0b1101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_or_ori : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 |= or($Rs32,#$Ii)",
+tc_3c10f809, TypeALU64>, Enc_b0e9d8 {
+let Inst{31-22} = 0b1101101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_ori_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = or(#$Ii,asl($Rx32in,#$II))",
+tc_3c10f809, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b010;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_ori_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = or(#$Ii,lsr($Rx32in,#$II))",
+tc_3c10f809, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b010;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_parity : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = parity($Rs32,$Rt32)",
+tc_87601822, TypeALU64>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
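+// Predicated stores. In the names, t/f selects the predicate sense
+// (if ($Pv4) / if (!$Pv4)); "new" directly after t/f means a .new predicate,
+// while "new" after the size letter (e.g. rbnew) means new-value data.
+// _abs/_io/_rr select absolute, base+immediate, and base+register<<shift
+// addressing; _zomap forms are zero-offset pseudos,
+// e.g. if (p0.new) memb(r1) = r2.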
+def S4_pstorerbf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb(#$Ii) = $Rt32",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb(#$Ii) = $Rt32",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = $Rt32",
+tc_20a8e109, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32) = $Rt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerbnewf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb(#$Ii) = $Nt8.new",
+tc_2c8fe5ae, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b000;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_77781686, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb(#$Ii) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeV2LDST>, Enc_585242, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_pstorerbnewfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_8def9c57, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32) = $Nt8.new",
+tc_c8f9a6f6, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerbnewt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb(#$Ii) = $Nt8.new",
+tc_2c8fe5ae, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b000;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_77781686, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb(#$Ii) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeV2LDST>, Enc_585242, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_pstorerbnewtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_8def9c57, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32) = $Nt8.new",
+tc_c8f9a6f6, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerbt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb(#$Ii) = $Rt32",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110100000;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb(#$Ii) = $Rt32",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32+#$Ii) = $Rt32",
+tc_20a8e109, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110110000;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32) = $Rt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerdf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd(#$Ii) = $Rtt32",
+tc_c85212ca, TypeST>, Enc_50b5ac, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+tc_7bc567a7, TypeST>, Enc_1a9974, AddrModeRel {
+let Inst{31-21} = 0b00110101110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd(#$Ii) = $Rtt32",
+tc_336e698c, TypeST>, Enc_50b5ac, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
+tc_20a8e109, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S4_pstorerdfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+tc_7639d4b0, TypeST>, Enc_1a9974, AddrModeRel {
+let Inst{31-21} = 0b00110111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32) = $Rtt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerdt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd(#$Ii) = $Rtt32",
+tc_c85212ca, TypeST>, Enc_50b5ac, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+tc_7bc567a7, TypeST>, Enc_1a9974, AddrModeRel {
+let Inst{31-21} = 0b00110100110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd(#$Ii) = $Rtt32",
+tc_336e698c, TypeST>, Enc_50b5ac, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
+tc_20a8e109, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S4_pstorerdtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+tc_7639d4b0, TypeST>, Enc_1a9974, AddrModeRel {
+let Inst{31-21} = 0b00110110110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32) = $Rtt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerff_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh(#$Ii) = $Rt32.h",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerff_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110101011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerffnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh(#$Ii) = $Rt32.h",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerffnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
+tc_20a8e109, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerffnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerffnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32) = $Rt32.h",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerft_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh(#$Ii) = $Rt32.h",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerft_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110100011;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerftnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh(#$Ii) = $Rt32.h",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerftnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
+tc_20a8e109, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010011;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerftnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110110011;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerftnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32) = $Rt32.h",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerhf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh(#$Ii) = $Rt32",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110101010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh(#$Ii) = $Rt32",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32",
+tc_20a8e109, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerhfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32) = $Rt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerhnewf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh(#$Ii) = $Nt8.new",
+tc_2c8fe5ae, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b001;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_77781686, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh(#$Ii) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeV2LDST>, Enc_f44229, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S4_pstorerhnewfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_8def9c57, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32) = $Nt8.new",
+tc_c8f9a6f6, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerhnewt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh(#$Ii) = $Nt8.new",
+tc_2c8fe5ae, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b001;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_77781686, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh(#$Ii) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerhnewtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeV2LDST>, Enc_f44229, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S4_pstorerhnewtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_8def9c57, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4;
+}
+def S4_pstorerhnewtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32) = $Nt8.new",
+tc_c8f9a6f6, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerht_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh(#$Ii) = $Rt32",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerht_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110100010;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh(#$Ii) = $Rt32",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32",
+tc_20a8e109, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerhtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110110010;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32) = $Rt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerif_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw(#$Ii) = $Rt32",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerif_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110101100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerifnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw(#$Ii) = $Rt32",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerifnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = $Rt32",
+tc_20a8e109, TypeV2LDST>, Enc_397f23, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S4_pstorerifnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerifnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32) = $Rt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerinewf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw(#$Ii) = $Nt8.new",
+tc_2c8fe5ae, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b010;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_77781686, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw(#$Ii) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S4_pstorerinewfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_8def9c57, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32) = $Nt8.new",
+tc_c8f9a6f6, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerinewt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw(#$Ii) = $Nt8.new",
+tc_2c8fe5ae, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b010;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_77781686, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw(#$Ii) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_44215c, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerinewtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
+tc_c8f9a6f6, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S4_pstorerinewtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_8def9c57, TypeST>, Enc_47ee5e, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4;
+}
+def S4_pstorerinewtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32) = $Nt8.new",
+tc_c8f9a6f6, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerit_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw(#$Ii) = $Rt32",
+tc_c85212ca, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerit_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7bc567a7, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110100100;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstoreritnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw(#$Ii) = $Rt32",
+tc_336e698c, TypeST>, Enc_1cf4ca, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstoreritnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32+#$Ii) = $Rt32",
+tc_20a8e109, TypeV2LDST>, Enc_397f23, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010100;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S4_pstoreritnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_7639d4b0, TypeST>, Enc_6339d5, AddrModeRel {
+let Inst{31-21} = 0b00110110100;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstoreritnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32) = $Rt32",
+tc_20a8e109, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_stored_locked : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd_locked($Rs32,$Pd4) = $Rtt32",
+tc_7d01cbdc, TypeST>, Enc_d7dc10 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000111;
+let accessSize = DoubleWordAccess;
+let isPredicateLate = 1;
+let isSoloAX = 1;
+let mayStore = 1;
+}
+def S4_storeirb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"memb($Rs32+#$Ii) = #$II",
+tc_fcee8723, TypeST>, Enc_8203bb, PredNewRel {
+let Inst{31-21} = 0b00111100000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def S4_storeirb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memb($Rs32) = #$II",
+tc_fcee8723, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memb($Rs32+#$Ii) = #$II",
+tc_1e69aa99, TypeST>, Enc_d7a65e, PredNewRel {
+let Inst{31-21} = 0b00111000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memb($Rs32) = #$II",
+tc_1e69aa99, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = #$II",
+tc_8f0a6bad, TypeST>, Enc_d7a65e, PredNewRel {
+let Inst{31-21} = 0b00111001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memb($Rs32) = #$II",
+tc_8f0a6bad, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memb($Rs32+#$Ii) = #$II",
+tc_1e69aa99, TypeST>, Enc_d7a65e, PredNewRel {
+let Inst{31-21} = 0b00111000000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memb($Rs32) = #$II",
+tc_1e69aa99, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirbtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memb($Rs32+#$Ii) = #$II",
+tc_8f0a6bad, TypeST>, Enc_d7a65e, PredNewRel {
+let Inst{31-21} = 0b00111001000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirbtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memb($Rs32) = #$II",
+tc_8f0a6bad, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirh_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"memh($Rs32+#$Ii) = #$II",
+tc_fcee8723, TypeST>, Enc_a803e0, PredNewRel {
+let Inst{31-21} = 0b00111100001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def S4_storeirh_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memh($Rs32) = #$II",
+tc_fcee8723, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirhf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memh($Rs32+#$Ii) = #$II",
+tc_1e69aa99, TypeST>, Enc_f20719, PredNewRel {
+let Inst{31-21} = 0b00111000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirhf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memh($Rs32) = #$II",
+tc_1e69aa99, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirhfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = #$II",
+tc_8f0a6bad, TypeST>, Enc_f20719, PredNewRel {
+let Inst{31-21} = 0b00111001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirhfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memh($Rs32) = #$II",
+tc_8f0a6bad, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirht_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memh($Rs32+#$Ii) = #$II",
+tc_1e69aa99, TypeST>, Enc_f20719, PredNewRel {
+let Inst{31-21} = 0b00111000001;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirht_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memh($Rs32) = #$II",
+tc_1e69aa99, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirhtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memh($Rs32+#$Ii) = #$II",
+tc_8f0a6bad, TypeST>, Enc_f20719, PredNewRel {
+let Inst{31-21} = 0b00111001001;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirhtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memh($Rs32) = #$II",
+tc_8f0a6bad, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeiri_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"memw($Rs32+#$Ii) = #$II",
+tc_fcee8723, TypeST>, Enc_f37377, PredNewRel {
+let Inst{31-21} = 0b00111100010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def S4_storeiri_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memw($Rs32) = #$II",
+tc_fcee8723, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirif_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memw($Rs32+#$Ii) = #$II",
+tc_1e69aa99, TypeST>, Enc_5ccba9, PredNewRel {
+let Inst{31-21} = 0b00111000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirif_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memw($Rs32) = #$II",
+tc_1e69aa99, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirifnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = #$II",
+tc_8f0a6bad, TypeST>, Enc_5ccba9, PredNewRel {
+let Inst{31-21} = 0b00111001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirifnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memw($Rs32) = #$II",
+tc_8f0a6bad, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeirit_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memw($Rs32+#$Ii) = #$II",
+tc_1e69aa99, TypeST>, Enc_5ccba9, PredNewRel {
+let Inst{31-21} = 0b00111000010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeirit_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memw($Rs32) = #$II",
+tc_1e69aa99, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storeiritnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memw($Rs32+#$Ii) = #$II",
+tc_8f0a6bad, TypeST>, Enc_5ccba9, PredNewRel {
+let Inst{31-21} = 0b00111001010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeiritnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memw($Rs32) = #$II",
+tc_8f0a6bad, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_storerb_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memb($Re32=#$II) = $Rt32",
+tc_336e698c, TypeST>, Enc_8bcba4, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerb_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_45631a8d, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011000;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+def S4_storerb_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memb($Ru32<<#$Ii+#$II) = $Rt32",
+tc_a4567c39, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101000;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storerb_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerbnew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memb($Re32=#$II) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_724154, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_storerbnew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_be995eaf, TypeST>, Enc_c6220b, AddrModeRel {
+let Inst{6-3} = 0b0000;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+def S4_storerbnew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memb($Ru32<<#$Ii+#$II) = $Nt8.new",
+tc_210b2456, TypeST>, Enc_7eb485, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S4_storerb_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_storerd_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, DoubleRegs:$Rtt32),
+"memd($Re32=#$II) = $Rtt32",
+tc_336e698c, TypeST>, Enc_c7a204 {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S4_storerd_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerd_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+tc_45631a8d, TypeST>, Enc_55355c, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011110;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+let isPredicable = 1;
+}
+def S4_storerd_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, DoubleRegs:$Rtt32),
+"memd($Ru32<<#$Ii+#$II) = $Rtt32",
+tc_a4567c39, TypeST>, Enc_f79415, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101110;
+let addrMode = BaseLongOffset;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerf_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Re32=#$II) = $Rt32.h",
+tc_336e698c, TypeST>, Enc_8bcba4 {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S4_storerf_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerf_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+tc_45631a8d, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011011;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+let isPredicable = 1;
+}
+def S4_storerf_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Ru32<<#$Ii+#$II) = $Rt32.h",
+tc_a4567c39, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101011;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S4_storerf_rr";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerh_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Re32=#$II) = $Rt32",
+tc_336e698c, TypeST>, Enc_8bcba4, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerh_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_45631a8d, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011010;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+def S4_storerh_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Ru32<<#$Ii+#$II) = $Rt32",
+tc_a4567c39, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101010;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerhnew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memh($Re32=#$II) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_724154, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b001;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_storerhnew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_be995eaf, TypeST>, Enc_c6220b, AddrModeRel {
+let Inst{6-3} = 0b0001;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+def S4_storerhnew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memh($Ru32<<#$Ii+#$II) = $Nt8.new",
+tc_210b2456, TypeST>, Enc_7eb485, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_storeri_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memw($Re32=#$II) = $Rt32",
+tc_336e698c, TypeST>, Enc_8bcba4, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storeri_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+tc_45631a8d, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011100;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+def S4_storeri_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memw($Ru32<<#$Ii+#$II) = $Rt32",
+tc_a4567c39, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101100;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_storerinew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memw($Re32=#$II) = $Nt8.new",
+tc_7986ba30, TypeST>, Enc_724154, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b010;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_storerinew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+tc_be995eaf, TypeST>, Enc_c6220b, AddrModeRel {
+let Inst{6-3} = 0b0010;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+def S4_storerinew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memw($Ru32<<#$Ii+#$II) = $Nt8.new",
+tc_210b2456, TypeST>, Enc_7eb485, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isNewValue = 1;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_subaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Ru32),
+"$Rd32 = add($Rs32,sub(#$Ii,$Ru32))",
+tc_090485bb, TypeALU64>, Enc_8b8d61 {
+let Inst{31-23} = 0b110110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_subi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = sub(#$Ii,asl($Rx32in,#$II))",
+tc_c0cd91a8, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b110;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_subi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = sub(#$Ii,lsr($Rx32in,#$II))",
+tc_c0cd91a8, TypeALU64>, Enc_c31910 {
+let Inst{2-0} = 0b110;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_vrcrotate : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rdd32 = vrcrotate($Rss32,$Rt32,#$Ii)",
+tc_6264c5e0, TypeS_3op>, Enc_645d54 {
+let Inst{7-6} = 0b11;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+}
+def S4_vrcrotate_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rxx32 += vrcrotate($Rss32,$Rt32,#$Ii)",
+tc_bc5561d8, TypeS_3op>, Enc_b72622 {
+let Inst{7-6} = 0b00;
+let Inst{31-21} = 0b11001011101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S4_vxaddsubh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubh($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S4_vxaddsubhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubh($Rss32,$Rtt32):rnd:>>1:sat",
+tc_63cd9d2d, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S4_vxaddsubw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubw($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S4_vxsubaddh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddh($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S4_vxsubaddhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddh($Rss32,$Rtt32):rnd:>>1:sat",
+tc_63cd9d2d, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S4_vxsubaddw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddw($Rss32,$Rtt32):sat",
+tc_47ab9233, TypeS_3op>, Enc_a56825 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S5_asrhub_rnd_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rd32 = vasrhub($Rss32,#$Ii):raw",
+tc_63cd9d2d, TypeS_2op>, Enc_11a146, Requires<[HasV5T]> {
+let Inst{7-5} = 0b100;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S5_asrhub_rnd_sat_goodsyntax : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
+tc_63cd9d2d, TypeS_2op>, Requires<[HasV5T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def S5_asrhub_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rd32 = vasrhub($Rss32,#$Ii):sat",
+tc_63cd9d2d, TypeS_2op>, Enc_11a146, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def S5_popcountp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = popcount($Rss32)",
+tc_ca280e8b, TypeS_2op>, Enc_90cd8b, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S5_vasrhrnd : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vasrh($Rss32,#$Ii):raw",
+tc_63cd9d2d, TypeS_2op>, Enc_12b6e9, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000001;
+let prefersSlot3 = 1;
+}
+def S5_vasrhrnd_goodsyntax : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vasrh($Rss32,#$Ii):rnd",
+tc_63cd9d2d, TypeS_2op>, Requires<[HasV5T]> {
+let isPseudo = 1;
+}
+def S6_rol_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = rol($Rss32,#$Ii)",
+tc_9f518242, TypeS_2op>, Enc_5eac98, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000000000;
+}
+def S6_rol_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += rol($Rss32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= rol($Rss32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= rol($Rss32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= rol($Rss32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_p_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= rol($Rss32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_70fb07, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S6_rol_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = rol($Rs32,#$Ii)",
+tc_9f518242, TypeS_2op>, Enc_a05677, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S6_rol_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += rol($Rs32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= rol($Rs32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= rol($Rs32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= rol($Rs32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_rol_i_r_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= rol($Rs32,#$Ii)",
+tc_e17ce9ad, TypeS_2op>, Enc_28a2dc, Requires<[HasV60T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S6_vsplatrbp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsplatb($Rs32)",
+tc_78b3c689, TypeS_2op>, Enc_3a3d62, Requires<[HasV62T]> {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100010;
+}
+def S6_vtrunehb_ppp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunehb($Rss32,$Rtt32)",
+tc_9f518242, TypeS_3op>, Enc_a56825, Requires<[HasV62T]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S6_vtrunohb_ppp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunohb($Rss32,$Rtt32)",
+tc_9f518242, TypeS_3op>, Enc_a56825, Requires<[HasV62T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
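+// SA1_*, SL1_*, SL2_*, SS1_*, and SS2_* below are the 13-bit duplex
+// sub-instructions (TypeSUBINSN). They are only ever encoded as one half of
+// a 32-bit duplex, hence AsmVariantName = "NonParsable" and the dedicated
+// SUBINSN_* decoder namespaces.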
+def SA1_addi : HInst<
+(outs GeneralSubRegs:$Rx16),
+(ins IntRegs:$Rx16in, s32_0Imm:$Ii),
+"$Rx16 = add($Rx16in,#$Ii)",
+tc_821c4233, TypeSUBINSN>, Enc_93af4c {
+let Inst{12-11} = 0b00;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+let Constraints = "$Rx16 = $Rx16in";
+}
+def SA1_addrx : HInst<
+(outs GeneralSubRegs:$Rx16),
+(ins IntRegs:$Rx16in, GeneralSubRegs:$Rs16),
+"$Rx16 = add($Rx16in,$Rs16)",
+tc_821c4233, TypeSUBINSN>, Enc_0527db {
+let Inst{12-8} = 0b11000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+let Constraints = "$Rx16 = $Rx16in";
+}
+def SA1_addsp : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u6_2Imm:$Ii),
+"$Rd16 = add(r29,#$Ii)",
+tc_d2609065, TypeSUBINSN>, Enc_2df31d {
+let Inst{12-10} = 0b011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_and1 : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = and($Rs16,#1)",
+tc_d2609065, TypeSUBINSN>, Enc_97d666 {
+let Inst{12-8} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrf : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (!p0) $Rd16 = #0",
+tc_7c2dcd4d, TypeSUBINSN>, Enc_1f5ba6 {
+let Inst{12-4} = 0b110100111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrfnew : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (!p0.new) $Rd16 = #0",
+tc_f26aa619, TypeSUBINSN>, Enc_1f5ba6 {
+let Inst{12-4} = 0b110100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrt : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (p0) $Rd16 = #0",
+tc_7c2dcd4d, TypeSUBINSN>, Enc_1f5ba6 {
+let Inst{12-4} = 0b110100110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_clrtnew : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins),
+"if (p0.new) $Rd16 = #0",
+tc_f26aa619, TypeSUBINSN>, Enc_1f5ba6 {
+let Inst{12-4} = 0b110100100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let Uses = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_cmpeqi : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u2_0Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$Ii)",
+tc_e8c7a357, TypeSUBINSN>, Enc_63eaeb {
+let Inst{3-2} = 0b00;
+let Inst{12-8} = 0b11001;
+let AsmVariantName = "NonParsable";
+let Defs = [P0];
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine0i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#0,#$Ii)",
+tc_d2609065, TypeSUBINSN>, Enc_ed48be {
+let Inst{4-3} = 0b00;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine1i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#1,#$Ii)",
+tc_d2609065, TypeSUBINSN>, Enc_ed48be {
+let Inst{4-3} = 0b01;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine2i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#2,#$Ii)",
+tc_d2609065, TypeSUBINSN>, Enc_ed48be {
+let Inst{4-3} = 0b10;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combine3i : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u2_0Imm:$Ii),
+"$Rdd8 = combine(#3,#$Ii)",
+tc_d2609065, TypeSUBINSN>, Enc_ed48be {
+let Inst{4-3} = 0b11;
+let Inst{12-7} = 0b111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combinerz : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins GeneralSubRegs:$Rs16),
+"$Rdd8 = combine($Rs16,#0)",
+tc_d2609065, TypeSUBINSN>, Enc_399e12 {
+let Inst{3-3} = 0b1;
+let Inst{12-8} = 0b11101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_combinezr : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins GeneralSubRegs:$Rs16),
+"$Rdd8 = combine(#0,$Rs16)",
+tc_d2609065, TypeSUBINSN>, Enc_399e12 {
+let Inst{3-3} = 0b0;
+let Inst{12-8} = 0b11101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_dec : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1),
+"$Rd16 = add($Rs16,#$n1)",
+tc_821c4233, TypeSUBINSN>, Enc_ee5ed0 {
+let Inst{12-8} = 0b10011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_inc : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = add($Rs16,#1)",
+tc_d2609065, TypeSUBINSN>, Enc_97d666 {
+let Inst{12-8} = 0b10001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_seti : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u32_0Imm:$Ii),
+"$Rd16 = #$Ii",
+tc_d2609065, TypeSUBINSN>, Enc_e39bb2 {
+let Inst{12-10} = 0b010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def SA1_setin1 : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins n1Const:$n1),
+"$Rd16 = #$n1",
+tc_d2609065, TypeSUBINSN>, Enc_7a0ea6 {
+let Inst{12-4} = 0b110100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_sxtb : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = sxtb($Rs16)",
+tc_d2609065, TypeSUBINSN>, Enc_97d666 {
+let Inst{12-8} = 0b10101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_sxth : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = sxth($Rs16)",
+tc_d2609065, TypeSUBINSN>, Enc_97d666 {
+let Inst{12-8} = 0b10100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_tfr : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = $Rs16",
+tc_d2609065, TypeSUBINSN>, Enc_97d666 {
+let Inst{12-8} = 0b10000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_zxtb : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = and($Rs16,#255)",
+tc_d2609065, TypeSUBINSN>, Enc_97d666 {
+let Inst{12-8} = 0b10111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SA1_zxth : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16),
+"$Rd16 = zxth($Rs16)",
+tc_d2609065, TypeSUBINSN>, Enc_97d666 {
+let Inst{12-8} = 0b10110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let DecoderNamespace = "SUBINSN_A";
+}
+def SL1_loadri_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+"$Rd16 = memw($Rs16+#$Ii)",
+tc_bf6fa601, TypeSUBINSN>, Enc_53dca9 {
+let Inst{12-12} = 0b0;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L1";
+}
+def SL1_loadrub_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+"$Rd16 = memub($Rs16+#$Ii)",
+tc_bf6fa601, TypeSUBINSN>, Enc_c175d0 {
+let Inst{12-12} = 0b1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L1";
+}
+def SL2_deallocframe : HInst<
+(outs),
+(ins),
+"deallocframe",
+tc_86442910, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111100000000;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [R30, R29, R31];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31 : HInst<
+(outs),
+(ins),
+"jumpr r31",
+tc_35fb9d13, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111111000000;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let Uses = [R31];
+let Defs = [PC];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_f : HInst<
+(outs),
+(ins),
+"if (!p0) jumpr r31",
+tc_35fb9d13, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111111000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_fnew : HInst<
+(outs),
+(ins),
+"if (!p0.new) jumpr:nt r31",
+tc_35fb9d13, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111111000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let isReturn = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_t : HInst<
+(outs),
+(ins),
+"if (p0) jumpr r31",
+tc_35fb9d13, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111111000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isReturn = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_jumpr31_tnew : HInst<
+(outs),
+(ins),
+"if (p0.new) jumpr:nt r31",
+tc_35fb9d13, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111111000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let isReturn = 1;
+let Uses = [P0, R31];
+let Defs = [PC];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadrb_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u3_0Imm:$Ii),
+"$Rd16 = memb($Rs16+#$Ii)",
+tc_bf6fa601, TypeSUBINSN>, Enc_2fbf3c {
+let Inst{12-11} = 0b10;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadrd_sp : HInst<
+(outs GeneralDoubleLow8Regs:$Rdd8),
+(ins u5_3Imm:$Ii),
+"$Rdd8 = memd(r29+#$Ii)",
+tc_70cabf66, TypeSUBINSN>, Enc_86a14b {
+let Inst{12-8} = 0b11110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadrh_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
+"$Rd16 = memh($Rs16+#$Ii)",
+tc_bf6fa601, TypeSUBINSN>, Enc_2bae10 {
+let Inst{12-11} = 0b00;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadri_sp : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins u5_2Imm:$Ii),
+"$Rd16 = memw(r29+#$Ii)",
+tc_70cabf66, TypeSUBINSN>, Enc_51635c {
+let Inst{12-9} = 0b1110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_loadruh_io : HInst<
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
+"$Rd16 = memuh($Rs16+#$Ii)",
+tc_bf6fa601, TypeSUBINSN>, Enc_2bae10 {
+let Inst{12-11} = 0b01;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return : HInst<
+(outs),
+(ins),
+"dealloc_return",
+tc_95c54f8b, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111101000000;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [R30];
+let Defs = [PC, R30, R29, R31];
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_f : HInst<
+(outs),
+(ins),
+"if (!p0) dealloc_return",
+tc_95c54f8b, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_fnew : HInst<
+(outs),
+(ins),
+"if (!p0.new) dealloc_return:nt",
+tc_95c54f8b, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_t : HInst<
+(outs),
+(ins),
+"if (p0) dealloc_return",
+tc_95c54f8b, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111101000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SL2_return_tnew : HInst<
+(outs),
+(ins),
+"if (p0.new) dealloc_return:nt",
+tc_95c54f8b, TypeSUBINSN>, Enc_e3b0c4 {
+let Inst{12-0} = 0b1111101000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let isReturn = 1;
+let Uses = [P0, R30];
+let Defs = [PC, R30, R29, R31];
+let isTaken = Inst{4};
+let DecoderNamespace = "SUBINSN_L2";
+}
+def SS1_storeb_io : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii, GeneralSubRegs:$Rt16),
+"memb($Rs16+#$Ii) = $Rt16",
+tc_53ee6546, TypeSUBINSN>, Enc_b38ffc {
+let Inst{12-12} = 0b1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S1";
+}
+def SS1_storew_io : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii, GeneralSubRegs:$Rt16),
+"memw($Rs16+#$Ii) = $Rt16",
+tc_53ee6546, TypeSUBINSN>, Enc_f55a0c {
+let Inst{12-12} = 0b0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S1";
+}
+def SS2_allocframe : HInst<
+(outs),
+(ins u5_3Imm:$Ii),
+"allocframe(#$Ii)",
+tc_f027ebe9, TypeSUBINSN>, Enc_6f70ca {
+let Inst{3-0} = 0b0000;
+let Inst{12-9} = 0b1110;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let Uses = [R30, R29, R31];
+let Defs = [R30, R29];
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storebi0 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+"memb($Rs16+#$Ii) = #0",
+tc_6c52d277, TypeSUBINSN>, Enc_84d359 {
+let Inst{12-8} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storebi1 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+"memb($Rs16+#$Ii) = #1",
+tc_6c52d277, TypeSUBINSN>, Enc_84d359 {
+let Inst{12-8} = 0b10011;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_stored_sp : HInst<
+(outs),
+(ins s6_3Imm:$Ii, GeneralDoubleLow8Regs:$Rtt8),
+"memd(r29+#$Ii) = $Rtt8",
+tc_c14739d5, TypeSUBINSN>, Enc_b8309d {
+let Inst{12-9} = 0b0101;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storeh_io : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii, GeneralSubRegs:$Rt16),
+"memh($Rs16+#$Ii) = $Rt16",
+tc_53ee6546, TypeSUBINSN>, Enc_625deb {
+let Inst{12-11} = 0b00;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storew_sp : HInst<
+(outs),
+(ins u5_2Imm:$Ii, GeneralSubRegs:$Rt16),
+"memw(r29+#$Ii) = $Rt16",
+tc_c14739d5, TypeSUBINSN>, Enc_87c142 {
+let Inst{12-9} = 0b0100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let Uses = [R29];
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storewi0 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+"memw($Rs16+#$Ii) = #0",
+tc_6c52d277, TypeSUBINSN>, Enc_a6ce9c {
+let Inst{12-8} = 0b10000;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
+def SS2_storewi1 : HInst<
+(outs),
+(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+"memw($Rs16+#$Ii) = #1",
+tc_6c52d277, TypeSUBINSN>, Enc_a6ce9c {
+let Inst{12-8} = 0b10001;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let DecoderNamespace = "SUBINSN_S2";
+}
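+// V6_* below are the HVX vector records. Each has a "_128B" twin for the
+// 128-byte vector-length mode; the twins are isCodeGenOnly and live in the
+// EXT_mmvec decoder namespace.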
+def V6_MAP_equb : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equb_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equb_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_ior : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_ior_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equb_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.ub,$Vv32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_ior : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_ior_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equh_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uh,$Vv32.uh)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_ior : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_ior_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_extractw : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rs32),
+"$Rd32 = vextract($Vu32,$Rs32)",
+tc_9777e6bf, TypeLD>, Enc_50e578, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_extractw_128B : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rs32),
+"$Rd32 = vextract($Vu32,$Rs32)",
+tc_9777e6bf, TypeLD>, Enc_50e578, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_extractw_alt : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rs32),
+"$Rd32.w = vextract($Vu32,$Rs32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_extractw_alt_128B : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rs32),
+"$Rd32.w = vextract($Vu32,$Rs32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_hi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vss32),
+"$Vd32 = hi($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_hi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vss32),
+"$Vd32 = hi($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
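+// The V6_ld*0 and V6_ldu0 records are codegen-only pseudos for vmem/vmemu
+// loads with an implicit zero offset; they are presumably lowered to the
+// real vL32b/vL32Ub forms later in the backend.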
+def V6_ld0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ld0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcnp0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcnp0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcnpnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcnpnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcp0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.cur = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcp0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.cur = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcpnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.cur = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldcpnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.cur = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnp0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32 = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnp0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32 = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnpnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnpnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldp0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32 = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldp0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32 = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldpnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldpnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtnp0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtnp0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtnpnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtnpnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtp0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtp0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtpnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldtpnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32):nt",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldu0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmemu($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldu0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmemu($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lo : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vss32),
+"$Vd32 = lo($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lo_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vss32),
+"$Vd32 = lo($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplatb : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.b = vsplat($Rt32)",
+tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplatb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.b = vsplat($Rt32)",
+tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplath : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.h = vsplat($Rt32)",
+tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplath_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.h = vsplat($Rt32)",
+tc_6b78cf13, TypeCVI_VX>, Enc_a5ed8a, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplatw : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vsplat($Rt32)",
+tc_6b78cf13, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplatw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vsplat($Rt32)",
+tc_6b78cf13, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
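+// Logical operations on HVX vector-predicate (Q) registers.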
+def V6_pred_and : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = and($Qs4,$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_and_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = and($Qs4,$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_and_n : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = and($Qs4,!$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_and_n_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = and($Qs4,!$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_not : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4),
+"$Qd4 = not($Qs4)",
+tc_71337255, TypeCVI_VA>, Enc_bfbf03, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_not_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4),
+"$Qd4 = not($Qs4)",
+tc_71337255, TypeCVI_VA>, Enc_bfbf03, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_or : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = or($Qs4,$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_or_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = or($Qs4,$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_or_n : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = or($Qs4,!$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_or_n_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = or($Qs4,!$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_scalar2 : HInst<
+(outs VecPredRegs:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq($Rt32)",
+tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[HasV60T,UseHVX]> {
+let Inst{13-2} = 0b000000010001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_scalar2_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq($Rt32)",
+tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[HasV60T,UseHVX]> {
+let Inst{13-2} = 0b000000010001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_scalar2v2 : HInst<
+(outs VecPredRegs:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq2($Rt32)",
+tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[HasV62T,UseHVX]> {
+let Inst{13-2} = 0b000000010011;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_scalar2v2_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq2($Rt32)",
+tc_4105d6b5, TypeCVI_VP>, Enc_7222b7, Requires<[HasV62T,UseHVX]> {
+let Inst{13-2} = 0b000000010011;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_xor : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = xor($Qs4,$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_xor_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = xor($Qs4,$Qt4)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_shuffeqh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4.b = vshuffe($Qs4.h,$Qt4.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_shuffeqh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4.b = vshuffe($Qs4.h,$Qt4.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_shuffeqw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4.h = vshuffe($Qs4.w,$Qt4.w)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_shuffeqw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4.h = vshuffe($Qs4.w,$Qt4.w)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_134437, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
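+// Compiler-only pseudo stores (isPseudo/isCodeGenOnly, no encoding bits):
+// plain, predicated ($Pv4/$Qv4), new-value (.new), non-temporal (:nt), and
+// unaligned (vmemu) forms addressed by a bare base register.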
+def V6_st0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_st0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stn0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Os8),
+"vmem($Rt32) = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stn0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Os8),
+"vmem($Rt32) = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stnnt0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Os8),
+"vmem($Rt32):nt = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stnnt0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Os8),
+"vmem($Rt32):nt = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stnp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnpnt0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnpnt0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnq0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnq0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnqnt0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnqnt0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnt0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnt0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stpnt0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stpnt0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stq0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stq0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stqnt0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stqnt0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stu0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stu0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stunp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stunp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stup0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stup0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
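+// Real (encoded) instructions resume here: unaligned vector loads (vmemu)
+// with base+immediate (_ai), post-increment immediate (_pi), and
+// post-increment modifier-register (_ppu) addressing.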
+def V6_vL32Ub_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmemu($Rt32+#$Ii)",
+tc_35e92f8e, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32Ub_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmemu($Rt32+#$Ii)",
+tc_35e92f8e, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32Ub_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmemu($Rx32++#$Ii)",
+tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmemu($Rx32++#$Ii)",
+tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmemu($Rx32++$Mu2)",
+tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmemu($Rx32++$Mu2)",
+tc_4fd8566e, TypeCVI_VM_VP_LDU>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
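+// Aligned vector loads (vmem), including .cur and predicated variants.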
+def V6_vL32b_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii)",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii)",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii)",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii)",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
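+// Non-temporal (:nt) aligned vector loads, mirroring the variants above.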
+def V6_vL32b_nt_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii):nt",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii):nt",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii):nt",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii):nt",
+tc_b712833a, TypeCVI_VM_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_cur_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_cur_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let CVINew = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2):nt",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
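+// .tmp variants; per HVX convention the .tmp destination is consumed within
+// the same packet (not stated in this file; see the HVX reference manual).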
+def V6_vL32b_nt_tmp_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii):nt",
+tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii):nt",
+tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_tmp_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_tmp_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii):nt",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii):nt",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2):nt",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2):nt",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_nt_tmp_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_nt_tmp_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isNonTemporal = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2)",
+tc_eb669007, TypeCVI_VM_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+tc_5cbf490b, TypeCVI_VM_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+tc_da979fb3, TypeCVI_VM_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii)",
+tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii)",
+tc_77a4c701, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_tmp_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_tmp_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_npred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_npred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_npred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii)",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii)",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2)",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2)",
+tc_9c267309, TypeCVI_VM_TMP_LD>, Enc_2ebe3b, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_pred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+tc_51cd3aab, TypeCVI_VM_TMP_LD>, Enc_8d8a30, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_tmp_pred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_58a8bf, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_tmp_pred_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+tc_38208312, TypeCVI_VM_TMP_LD>, Enc_f8c1c4, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
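+// From here on: unaligned vector stores (vmemu). They reuse the ai/pi/ppu
+// addressing and pred/npred predication scheme of the loads above, with
+// mayStore in place of mayLoad; the shared BaseOpcode strings appear to
+// group each addressing form with its predicated variants for Hexagon's
+// instruction-relation tables.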
+def V6_vS32Ub_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmemu($Rt32+#$Ii) = $Vs32",
+tc_354299ad, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmemu($Rt32+#$Ii) = $Vs32",
+tc_354299ad, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32Ub_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai";
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32Ub_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"vmemu($Rx32++#$Ii) = $Vs32",
+tc_7fa82b08, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmemu($Rx32++#$Ii) = $Vs32",
+tc_7fa82b08, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"vmemu($Rx32++$Mu2) = $Vs32",
+tc_7fa82b08, TypeCVI_VM_STU>, Enc_d15d19, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"vmemu($Rx32++$Mu2) = $Vs32",
+tc_7fa82b08, TypeCVI_VM_STU>, Enc_d15d19, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai";
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+tc_d642eff3, TypeCVI_VM_STU>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32Ub_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32Ub_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+tc_6fd9ad30, TypeCVI_VM_STU>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
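+// Aligned vector stores (vmem) follow. Unlike the vmemu forms above they
+// also set isNVStorable, i.e. each has a corresponding .new (new-value)
+// store defined below.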
+def V6_vS32b_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rt32+#$Ii) = $Vs32",
+tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rt32+#$Ii) = $Vs32",
+tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
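+// The .new store forms (isNVStore, CVINew) consume the just-produced vector
+// $Os8 as a new-value operand; opNewValue is that operand's position in the
+// combined outs+ins list (2 for the base _ai form, 3 or 4 once a predicate
+// and/or the $Rx32 write-back output are added).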
+def V6_vS32b_new_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rt32+#$Ii) = $Os8.new",
+tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 2;
+}
+def V6_vS32b_new_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rt32+#$Ii) = $Os8.new",
+tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def V6_vS32b_new_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_new_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_new_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001101;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001101;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rx32++#$Ii) = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rx32++#$Ii) = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"vmem($Rx32++$Mu2) = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"vmem($Rx32++$Mu2) = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_new_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_new_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_new_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
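+// The nqpred forms are guarded by the complement of an HVX vector predicate
+// $Qv4 rather than a scalar predicate; note they set neither isPredicated
+// nor BaseOpcode, so they appear to sit outside the scalar-predication
+// relation maps.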
+def V6_vS32b_nqpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nqpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nqpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nqpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nqpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nqpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
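+// Non-temporal (:nt, isNonTemporal = 1) stores mirror the aligned V6_vS32b
+// definitions above; only the major opcode bits in Inst{31-21} and the
+// cache hint differ.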
+def V6_vS32b_nt_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rt32+#$Ii):nt = $Vs32",
+tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rt32+#$Ii):nt = $Vs32",
+tc_e3748cdf, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_new_ai : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rt32+#$Ii):nt = $Os8.new",
+tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 2;
+}
+def V6_vS32b_nt_new_ai_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rt32+#$Ii):nt = $Os8.new",
+tc_1b93bdc6, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def V6_vS32b_nt_new_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001111;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001111;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rx32++#$Ii):nt = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rx32++#$Ii):nt = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"vmem($Rx32++$Mu2):nt = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"vmem($Rx32++$Mu2):nt = $Os8.new",
+tc_db5b9e2f, TypeCVI_VM_NEW_ST>, Enc_cf1927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+tc_d5090f3e, TypeCVI_VM_NEW_ST>, Enc_f7430e, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 3;
+}
+def V6_vS32b_nt_new_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_784502, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001010;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_new_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+tc_8b6a873f, TypeCVI_VM_NEW_ST>, Enc_372c9d, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001010;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let CVINew = 1;
+let isNewValue = 1;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_npred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_npred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_npred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_nqpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_nqpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_nqpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rx32++#$Ii):nt = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rx32++#$Ii):nt = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"vmem($Rx32++$Mu2):nt = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"vmem($Rx32++$Mu2):nt = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_qpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000110;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_nt_qpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_nt_qpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011110;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNonTemporal = 1;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rx32++#$Ii) = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rx32++#$Ii) = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"vmem($Rx32++$Mu2) = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"vmem($Rx32++$Mu2) = $Vs32",
+tc_a4c9df3b, TypeCVI_VM_ST>, Enc_d15d19, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_ai : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_pred_ai_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_85d237e3, TypeCVI_VM_ST>, Enc_27b757, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_pred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_865390, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_pred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_0317c6ca, TypeCVI_VM_ST>, Enc_1ef990, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_ai : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_qpred_ai_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32+#$Ii) = $Vs32",
+tc_aedb9f9e, TypeCVI_VM_ST>, Enc_2ea740, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vS32b_qpred_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_pi_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++#$Ii) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_0b51ce, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_ppu : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vS32b_qpred_ppu_128B : HInst<
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rx32++$Mu2) = $Vs32",
+tc_99093773, TypeCVI_VM_ST>, Enc_4dff07, Requires<[HasV60T,UseHVX]> {
+let Inst{10-5} = 0b000000;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
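+// The HVX ALU definitions that follow repeat a four-record pattern (an
+// observation about this generated file, not a stated rule): a base 64-byte
+// form, an isCodeGenOnly "_128B" twin using the VectorRegs128B class, and
+// "_alt"/"_alt_128B" isPseudo TypeMAPPING records that accept the
+// suffix-less assembler spelling (e.g. "$Vd32 = vabsdiffh($Vu32,$Vv32)") in
+// place of the typed ".uh = vabsdiff(.h,.h)" form.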
+def V6_vabsdiffh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsdiffw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vabsdiffw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsdiffw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vabsdiffw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vabs($Vu32.h)",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vabs($Vu32.h)",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vabs($Vu32.h):sat",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vabs($Vu32.h):sat",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsh_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsh($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsh_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsh($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.w = vabs($Vu32.w)",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.w = vabs($Vu32.w)",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.w = vabs($Vu32.w):sat",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.w = vabs($Vu32.w):sat",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vabsw_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vabsw($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vabsw_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vabsw($Vu32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddb_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddb_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.b += $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.b += $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.b += $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.b += $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.b) $Vx32.b += $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddbsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vadd($Vu32.b,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddbsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddbsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddbsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
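+// V6_vaddcarry below is a two-result record, as far as its fields show:
+// hasNewValue/opNewValue = 0 marks the word sum $Vd32, hasNewValue2/
+// opNewValue2 = 1 marks the carry-out predicate $Qx4, and the Constraints
+// string ties $Qx4 to the carry-in $Qx4in (a read-modify-write predicate).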
+def V6_vaddcarry : HInst<
+(outs VectorRegs:$Vd32, VecPredRegs:$Qx4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, VecPredRegs:$Qx4in),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry",
+tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vaddcarry_128B : HInst<
+(outs VectorRegs128B:$Vd32, VecPredRegs128B:$Qx4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, VecPredRegs128B:$Qx4in),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry",
+tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vaddclbh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddclbh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddclbw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddclbw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddh_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddh_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.h += $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.h += $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.h += $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.h += $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.h) $Vx32.h += $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vadd($Vu32.h,$Vv32.h):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vadd($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vadd($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddhw_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vadd($Vu32.h,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vadd($Vu32.h,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vaddh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubh_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vaddubh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vaddub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddububb_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddububb_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vadduh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vadduh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vadduh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vadduh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhw_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vadduhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vadduw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vadduw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vadduw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vadduw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddw_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.w += $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.w += $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.w += $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.w += $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaddwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign($Vu32,$Vv32,$Rt8)",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign($Vu32,$Vv32,$Rt8)",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_valignbi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = valign($Vu32,$Vv32,#$Ii)",
+tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignbi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = valign($Vu32,$Vv32,#$Ii)",
+tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vand : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vand($Vu32,$Vv32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vand_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vand($Vu32,$Vv32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandnqrt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand(!$Qu4,$Rt32)",
+tc_e231aa4f, TypeCVI_VX>, Enc_7b7ba8, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandnqrt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand(!$Qu4,$Rt32)",
+tc_e231aa4f, TypeCVI_VX>, Enc_7b7ba8, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandnqrt_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand(!$Qu4,$Rt32)",
+tc_9311da3f, TypeCVI_VX>, Enc_895bd9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand(!$Qu4,$Rt32)",
+tc_9311da3f, TypeCVI_VX>, Enc_895bd9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandnqrt_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandnqrt_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand($Qu4,$Rt32)",
+tc_e231aa4f, TypeCVI_VX_LATE>, Enc_7b7ba8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand($Qu4,$Rt32)",
+tc_e231aa4f, TypeCVI_VX_LATE>, Enc_7b7ba8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandqrt_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand($Qu4,$Rt32)",
+tc_9311da3f, TypeCVI_VX_LATE>, Enc_895bd9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand($Qu4,$Rt32)",
+tc_9311da3f, TypeCVI_VX_LATE>, Enc_895bd9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vandqrt_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvnqv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vu32),
+"$Vd32 = vand(!$Qv4,$Vu32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvnqv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vu32),
+"$Vd32 = vand(!$Qv4,$Vu32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandvqv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vu32),
+"$Vd32 = vand($Qv4,$Vu32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvqv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vu32),
+"$Vd32 = vand($Qv4,$Vu32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_c4dc92, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandvrt : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qd4 = vand($Vu32,$Rt32)",
+tc_e231aa4f, TypeCVI_VX_LATE>, Enc_0f8bab, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvrt_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qd4 = vand($Vu32,$Rt32)",
+tc_e231aa4f, TypeCVI_VX_LATE>, Enc_0f8bab, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandvrt_acc : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qx4 |= vand($Vu32,$Rt32)",
+tc_9311da3f, TypeCVI_VX_LATE>, Enc_adf111, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_acc_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qx4 |= vand($Vu32,$Rt32)",
+tc_9311da3f, TypeCVI_VX_LATE>, Enc_adf111, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_acc_alt : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qx4.ub |= vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_acc_alt_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qx4.ub |= vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vandvrt_alt : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qd4.ub = vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvrt_alt_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qd4.ub = vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasl($Vu32.h,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasl($Vu32.h,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vasl($Vu32.h,$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vasl($Vu32.h,$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslhv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaslh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaslh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasl($Vu32.w,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasl($Vu32.w,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslw_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasl($Vu32.w,$Rt32)",
+tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasl($Vu32.w,$Rt32)",
+tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vaslw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vasl($Vu32.w,$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vasl($Vu32.w,$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaslwv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaslw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaslw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasr($Vu32.h,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasr($Vu32.h,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhbrndsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
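+// The :sat and :rnd:sat defs below that carry Requires<[HasV62T,UseHVX]> sit
+// in the 0b00011000 major-opcode space, i.e. these saturating vasr encodings
+// appear to exist only on HVX v62 and later parts.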
+def V6_vasrhbsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhubrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubrndsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhubsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhubsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat",
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrhv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vasr($Vu32.h,$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vasr($Vu32.h,$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrhv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vasrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vasrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasruwuhrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasruwuhrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasr($Vu32.w,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasr($Vu32.w,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrw_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasr($Vu32.w,$Rt32)",
+tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasr($Vu32.w,$Rt32)",
+tc_c00bf9c9, TypeCVI_VS>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vasrw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)",
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwhrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhrndsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat",
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwuhrndsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwuhrndsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwuhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwuhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+tc_7fa8b40f, TypeCVI_VS>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwuhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat",
+tc_7fa8b40f, TypeMAPPING>, Requires<[HasV60T]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def V6_vasrwv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vasr($Vu32.w,$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vasr($Vu32.w,$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vasrwv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vasrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vasrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassign : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = $Vu32",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassign_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = $Vu32",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vassignp : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32),
+"$Vdd32 = $Vuu32",
+CVI_VA, TypeCVI_VA_DV>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassignp_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32),
+"$Vdd32 = $Vuu32",
+CVI_VA, TypeCVI_VA_DV>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavghrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgubrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguhrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgwrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vccombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vccombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0h : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.uh = vcl0($Vu32.uh)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0h_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.uh = vcl0($Vu32.uh)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0h_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vcl0h($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0h_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vcl0h($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.uw = vcl0($Vu32.uw)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.uw = vcl0($Vu32.uw)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0w_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vcl0w($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vcl0w($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcmov : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32),
+"if ($Ps4) $Vd32 = $Vu32",
+tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcmov_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32),
+"if ($Ps4) $Vd32 = $Vu32",
+tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vcombine($Vu32,$Vv32)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isRegSequence = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vcombine($Vu32,$Vv32)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isRegSequence = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vd0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins),
+"$Vd32 = #0",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vd0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins),
+"$Vd32 = #0",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
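+// vdeal is unusual among these defs: it is an in-place two-register shuffle,
+// so both outputs are tied to inputs (note the double Constraints string) and
+// a second new-value operand is flagged via hasNewValue2/opNewValue2.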
+def V6_vdeal : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vdeal($Vy32,$Vx32,$Rt32)",
+tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vdeal_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vdeal($Vy32,$Vx32,$Rt32)",
+tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vdealb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.b = vdeal($Vu32.b)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vdeale($Vu32.b,$Vv32.b)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vdeale($Vu32.b,$Vv32.b)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealb4w_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdealb4w($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdealb4w($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.b = vdeal($Vu32.b)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vdealb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vdealb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vdeal($Vu32.h)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vdeal($Vu32.h)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vdealh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vdealh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealvdd : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealvdd_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdelta : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdelta($Vu32,$Vv32)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdelta_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdelta($Vu32,$Vv32)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus_dv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhb_dv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpyhb_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhisat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhisat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhisat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhisat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhisat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhsat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsuisat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsuisat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_0e41fa, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhsuisat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_cc857d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsuisat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsuisat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsusat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsusat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhsusat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhsusat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhsusat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
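+// V6_vdmpyhvsat below is the vector-by-vector variant: it multiplies by
+// a second vector register ($Vv32) instead of the scalar $Rt32, and its
+// accumulating form uses a different scheduling class (tc_e172d86a
+// rather than tc_d98f4d63).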
+def V6_vdmpyhvsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhvsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhvsat_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhvsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhvsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdsaduh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdsaduh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdsaduh_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdsaduh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdsaduh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
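+// Vector-compare block: the plain vcmp.eq/vcmp.gt forms write a fresh
+// predicate register ($Qd4), while the _and/_or/_xor variants combine
+// the result into an existing predicate ($Qx4 &=/|=/^= ...) and
+// therefore tie the operands with "$Qx4 = $Qx4in". Bits Inst{7-2}
+// select both the comparison and the combining operation.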
+def V6_veqb : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqb_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_veqb_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqb_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_veqh_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqh_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_veqw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_veqw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_veqw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtb_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtb_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtb_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgth_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgth_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgth_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtub_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtub_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtub_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtuh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtuh_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuh_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtuw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtuw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtuw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_95441f, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vgtw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_or : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_or_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vgtw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)",
+tc_a3127e12, TypeCVI_VA>, Enc_eaa9f8, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
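+// vhist takes no operands, so its encoding is a fixed bit pattern;
+// vhistq leaves free only the field for its predicate input $Qv4.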
+def V6_vhist : HInst<
+(outs),
+(ins),
+"vhist",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vhist_128B : HInst<
+(outs),
+(ins),
+"vhist",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vhistq : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vhist($Qv4)",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vhistq_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vhist($Qv4)",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vinsertwr : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, IntRegs:$Rt32),
+"$Vx32.w = vinsert($Rt32)",
+tc_e231aa4f, TypeCVI_VX_LATE>, Enc_569cfe, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b100000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vinsertwr_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"$Vx32.w = vinsert($Rt32)",
+tc_e231aa4f, TypeCVI_VX_LATE>, Enc_569cfe, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b100000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
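+// vlalignb takes the alignment amount in the low bits of a register
+// (IntRegsLow8:$Rt8); vlalignbi encodes it as a 3-bit immediate
+// (u3_0Imm:$Ii) instead.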
+def V6_vlalignb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vlalign($Vu32,$Vv32,$Rt8)",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlalignb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vlalign($Vu32,$Vv32,$Rt8)",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlalignbi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = vlalign($Vu32,$Vv32,#$Ii)",
+tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlalignbi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = vlalign($Vu32,$Vv32,#$Ii)",
+tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
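+// Note the predicate: V6_vlsrb requires HasV62T, i.e. the byte form of
+// vlsr is only available from HVX v62 on, while the .uh/.uw forms below
+// need only HasV60T.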
+def V6_vlsrb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.ub = vlsr($Vu32.ub,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.ub = vlsr($Vu32.ub,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uh = vlsr($Vu32.uh,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uh = vlsr($Vu32.uh,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vlsr($Vu32.h,$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vlsr($Vu32.h,$Vv32.h)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrhv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vlsrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vlsrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vlsr($Vu32.uw,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vlsr($Vu32.uw,$Rt32)",
+tc_41f4b64e, TypeCVI_VS>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vlsr($Vu32.w,$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vlsr($Vu32.w,$Vv32.w)",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlsrwv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vlsrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vlsrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
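+// Vector table lookup: vlut32 (byte) and vlut16 (halfword) index a table in
+// $Vv32 with bytes of $Vu32, with $Rt8 or #$Ii selecting the table slice.
+// The :nomatch variants require V62, and the |= forms (isAccumulator) merge
+// the lookup result into the accumulator register.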
+def V6_vlutvvb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvvb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvvb_nm : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvvb_nm_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch",
+tc_c4b515c5, TypeCVI_VP>, Enc_a30110, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvvb_oracc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_245865, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvb_oracc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_245865, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvb_oracci : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_cd4705, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvb_oracci_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_cd4705, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vlutvvbi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)",
+tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvvbi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)",
+tc_c4b515c5, TypeCVI_VP>, Enc_0b2e5b, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvwh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvwh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvwh_nm : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvwh_nm_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vlutvwh_oracc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_7b523d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwh_oracc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_7b523d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwh_oracci : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_1178da, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwh_oracci_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)",
+tc_cbf6d1dc, TypeCVI_VP_VS>, Enc_1178da, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vlutvwhi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_4b39e4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvwhi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_4b39e4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
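+// Element-wise vector max/min over b/ub/h/uh/w lanes; the signed-byte forms
+// (vmaxb/vminb) require V62, the rest are available from V60.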
+def V6_vmaxb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vmax($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vmax($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmax($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmax($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmax($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmax($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmaxw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmaxw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmaxw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vmin($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vmin($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmin($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmin($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmin($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmin($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vminw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vminw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vminw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
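+// vmpa: multiply-accumulate across element pairs of a double vector register,
+// widening as the type suffixes indicate (.ub*.b -> .h; .h*.b and .uh*.b -> .w);
+// the += forms accumulate into $Vxx32.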
+def V6_vmpabus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpabus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpabus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabusv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabusv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpabusv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vmpabus($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabusv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vmpabus($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabuuv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabuuv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpabuuv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vmpabuu($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpabuuv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vmpabuu($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpahb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpahb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpahb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpahb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpahb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpauhb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpauhb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpauhb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpauhb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
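+// vmpy: widening vector multiplies (.ub*.b and .b*.b -> .h; .h*.h and
+// .h*.uh -> .w; vmpye .w*.uh); saturating, rounding, and shifted variants
+// carry the :sat, :rnd, and :<<1 tags.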
+def V6_vmpybus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybusv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vmpy($Vu32.b,$Vv32.b)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vmpy($Vu32.b,$Vv32.b)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vmpy($Vu32.b,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vmpy($Vu32.b,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpye($Vu32.w,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpye($Vu32.w,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyewuh_64 : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpye($Vu32.w,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_64_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpye($Vu32.w,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyewuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpy($Vu32.h,$Rt32.h)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpy($Vu32.h,$Rt32.h)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsat_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsrs : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsrs_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhsrs_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsrs_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhss_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhvsrs_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyieoh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyieoh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiewh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyiewh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyiewh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiewuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiewuh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiewuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyih : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmpyi($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyih_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmpyi($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyih_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.h += vmpyi($Vu32.h,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.h += vmpyi($Vu32.h,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyih_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyih_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyihb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpyi($Vu32.h,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyihb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpyi($Vu32.h,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyihb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vmpyi($Vu32.h,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vmpyi($Vu32.h,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyihb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyihb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiowh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyio($Vu32.w,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiowh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyio($Vu32.w,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiowh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyiowh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiowh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyiowh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiwb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.h)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.h)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiwh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.h)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.h)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiwub_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiwub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyowh_64_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyo($Vu32.w,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyowh_64_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyo($Vu32.w,$Vv32.h)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyowh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyowh_rnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd_sacc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_rnd_sacc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_rnd_sacc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:rnd:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_rnd_sacc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:rnd:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyowh_sacc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
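+// Unsigned multiplies: vmpyub/vmpyuh take a scalar multiplier in $Rt32, while
+// the "v"-suffixed vmpyubv/vmpyuhv forms multiply two vectors lane by lane (as
+// the .ub/.uh operand suffixes suggest); both widen into a register pair
+// ($Vdd32, or $Vxx32 for the accumulating variants).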
+def V6_vmpyub : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyub_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyub_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyub_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyub_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyubv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyubv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_01d3d0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyuh_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_5e8512, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyuhv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_3fc427, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyuhv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmux : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qt4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmux($Qt4,$Vu32,$Vv32)",
+tc_a3127e12, TypeCVI_VA>, Enc_31db33, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmux_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qt4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmux($Qt4,$Vu32,$Vv32)",
+tc_a3127e12, TypeCVI_VA>, Enc_31db33, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
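+// vnavg family: the asm suffixes (.h/.ub/.w) give the element width; vnavg is
+// assumed here to be the Hexagon "negative average", roughly (Vu-Vv)>>1 per
+// element (semantics not encoded in these defs), with the usual base/_128B/
+// _alt variants.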
+def V6_vnavgh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vnavg($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vnavg($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vnavg($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vnavg($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnccombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnccombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+tc_2171ebae, TypeCVI_VA_DV>, Enc_8c2412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vncmov : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32),
+"if (!$Ps4) $Vd32 = $Vu32",
+tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vncmov_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32),
+"if (!$Ps4) $Vd32 = $Vu32",
+tc_b06ab583, TypeCVI_VA>, Enc_770858, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamth : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vnormamt($Vu32.h)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamth_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vnormamt($Vu32.h)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamth_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnormamth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamth_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnormamth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.w = vnormamt($Vu32.w)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.w = vnormamt($Vu32.w)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamtw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnormamtw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnormamtw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnot : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnot($Vu32)",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnot_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnot($Vu32)",
+tc_71337255, TypeCVI_VA>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vor : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vor($Vu32,$Vv32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vor_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vor($Vu32,$Vv32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpacke($Vu32.h,$Vv32.h)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpacke($Vu32.h,$Vv32.h)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpacke($Vu32.w,$Vv32.w)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpacke($Vu32.w,$Vv32.w)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpack($Vu32.h,$Vv32.h):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpack($Vu32.h,$Vv32.h):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackhb_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackhub_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpacko($Vu32.h,$Vv32.h)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpacko($Vu32.h,$Vv32.h)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackob_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpacko($Vu32.w,$Vv32.w)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpacko($Vu32.w,$Vv32.w)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackoh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpack($Vu32.w,$Vv32.w):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpack($Vu32.w,$Vv32.w):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackwh_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackwuh_sat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vpopcount($Vu32.h)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vpopcount($Vu32.h)",
+tc_d2cb81ea, TypeCVI_VS>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpopcounth_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vpopcounth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vpopcounth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrdelta : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrdelta($Vu32,$Vv32)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrdelta_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrdelta($Vu32,$Vv32)",
+tc_f3fc3f83, TypeCVI_VP>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybus : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybus_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybus_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybus_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybus_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybusi_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpybusi_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybusv_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybusv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vrmpy($Vu32.b,$Vv32.b)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vrmpy($Vu32.b,$Vv32.b)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpybv_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vrmpy($Vu32.b,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vrmpy($Vu32.b,$Vv32.b)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpybv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)",
+tc_69b6dd20, TypeCVI_VX>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpyub_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)",
+tc_d725e5b0, TypeCVI_VX>, Enc_5138b3, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpyubi_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrmpyubi_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)",
+tc_908a4c8c, TypeCVI_VX>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrmpyubv_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)",
+tc_e172d86a, TypeCVI_VX_DV>, Enc_a7341a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vrmpyubv_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vror : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vror($Vu32,$Rt32)",
+tc_bf142ae2, TypeCVI_VP>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vror_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vror($Vu32,$Rt32)",
+tc_bf142ae2, TypeCVI_VP>, Enc_b087ac, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundhb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vround($Vu32.h,$Vv32.h):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vround($Vu32.h,$Vv32.h):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundhb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vround($Vu32.h,$Vv32.h):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vround($Vu32.h,$Vv32.h):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundhub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrounduhub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrounduhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrounduhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrounduwuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrounduwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrounduwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vround($Vu32.w,$Vv32.w):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vround($Vu32.w,$Vv32.w):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vround($Vu32.w,$Vv32.w):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vround($Vu32.w,$Vv32.w):sat",
+tc_45453b98, TypeCVI_VS>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vroundwuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrsadubi : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrsadubi_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_7e9f581b, TypeCVI_VX_DV>, Enc_2f2f04, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vrsadubi_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+tc_41f99e1c, TypeCVI_VX_DV>, Enc_d483b9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vrsadubi_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrsadubi_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrsadub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsathub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
+tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsathub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
+tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsathub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsathub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsathub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsathub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatuwuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatuwuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsatuwuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsatuwuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatuwuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsatuwuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsat($Vu32.w,$Vv32.w)",
+tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsat($Vu32.w,$Vv32.w)",
+tc_9b9642a1, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsatwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsatwh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatwh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsatwh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.h = vsxt($Vu32.b)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.h = vsxt($Vu32.b)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vsxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vsxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.w = vsxt($Vu32.h)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.w = vsxt($Vu32.h)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vsxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vsxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufeh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vshuffe($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufeh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vshuffe($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufeh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufeh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuff : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vshuff($Vy32,$Vx32,$Rt32)",
+tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vshuff_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vshuff($Vy32,$Vx32,$Rt32)",
+tc_5c120602, TypeCVI_VP_VS>, Enc_989021, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vshuffb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.b = vshuff($Vu32.b)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.b = vshuff($Vu32.b)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vshuffb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vshuffb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffeb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vshuffe($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffeb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vshuffe($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffeb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffeb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vshuff($Vu32.h)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vshuff($Vu32.h)",
+tc_e6299d16, TypeCVI_VP>, Enc_e7581c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vshuffh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vshuffh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffob : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vshuffo($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffob_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vshuffo($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshuffob_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffob_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffvdd : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffvdd_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)",
+tc_4e2a5159, TypeCVI_VP_VS>, Enc_24a7dc, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoeb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoeb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vshuffoeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vshuffoeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoeh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vshuffoeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vshuffoeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vshuffo($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vshuffo($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vshufoh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vshuffoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vshuffoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubb_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.b -= $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.b -= $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.b -= $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.b -= $Vu32.b",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.b) $Vx32.b -= $Vu32.b",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubbsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vsub($Vu32.b,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubbsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubbsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubcarry : HInst<
+(outs VectorRegs:$Vd32, VecPredRegs:$Qx4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, VecPredRegs:$Qx4in),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry",
+tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vsubcarry_128B : HInst<
+(outs VectorRegs128B:$Vd32, VecPredRegs128B:$Qx4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, VecPredRegs128B:$Qx4in),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry",
+tc_5a9fc4ec, TypeCVI_VA>, Enc_b43b67, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_vsubh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubh_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.h -= $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.h -= $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.h -= $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.h -= $Vu32.h",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vsub($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vsub($Vu32.h,$Vv32.h)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsububh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsububsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsububsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubububb_sat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubububb_sat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubuh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubuh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)",
+tc_eda67dcd, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuhw_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubuw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubuw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubuwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubuw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubuw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubw_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwnq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.w -= $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwnq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.w -= $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwnq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.w -= $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.w -= $Vu32.w",
+tc_a3127e12, TypeCVI_VA>, Enc_a90628, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vsubwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w):sat",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat",
+tc_97c165b9, TypeCVI_VA_DV>, Enc_f8ecf9, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vsubwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vswap : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecPredRegs:$Qt4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vswap($Qt4,$Vu32,$Vv32)",
+tc_316c637c, TypeCVI_VA_DV>, Enc_3dac0b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vswap_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecPredRegs128B:$Qt4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vswap($Qt4,$Vu32,$Vv32)",
+tc_316c637c, TypeCVI_VA_DV>, Enc_3dac0b, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpyb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpyb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpybus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpybus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpybus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpybus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpybus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyhb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyhb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)",
+tc_7c3f55c4, TypeCVI_VX_DV>, Enc_aad80c, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vtmpyhb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)",
+tc_d98f4d63, TypeCVI_VX_DV>, Enc_d6990d, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vtmpyhb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyhb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtran2x2_map : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vtrans2x2($Vy32,$Vx32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vtran2x2_map_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vtrans2x2($Vy32,$Vx32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vunpackb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.h = vunpack($Vu32.b)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.h = vunpack($Vu32.b)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.w = vunpack($Vu32.h)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.w = vunpack($Vu32.h)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackob : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32.h |= vunpacko($Vu32.b)",
+tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackob_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32.h |= vunpacko($Vu32.b)",
+tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackob_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32 |= vunpackob($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackob_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32 |= vunpackob($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32.w |= vunpacko($Vu32.h)",
+tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32.w |= vunpacko($Vu32.h)",
+tc_72ad7b54, TypeCVI_VP_VS>, Enc_500cb0, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32 |= vunpackoh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackoh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32 |= vunpackoh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vunpackub : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uh = vunpack($Vu32.ub)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackub_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uh = vunpack($Vu32.ub)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackub_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackub($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackub_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackub($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackuh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uw = vunpack($Vu32.uh)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackuh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uw = vunpack($Vu32.uh)",
+tc_d7bea0ec, TypeCVI_VP_VS>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vunpackuh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackuh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackuh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackuh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128 : HInst<
+(outs),
+(ins),
+"vwhist128",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128_128B : HInst<
+(outs),
+(ins),
+"vwhist128",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist128m : HInst<
+(outs),
+(ins u1_0Imm:$Ii),
+"vwhist128(#$Ii)",
+tc_b77635b4, TypeCVI_HIST>, Enc_efaed8, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128m_128B : HInst<
+(outs),
+(ins u1_0Imm:$Ii),
+"vwhist128(#$Ii)",
+tc_b77635b4, TypeCVI_HIST>, Enc_efaed8, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist128q : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist128($Qv4)",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128q_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist128($Qv4)",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10010010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist128qm : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, u1_0Imm:$Ii),
+"vwhist128($Qv4,#$Ii)",
+tc_28978789, TypeCVI_HIST>, Enc_802dc0, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128qm_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, u1_0Imm:$Ii),
+"vwhist128($Qv4,#$Ii)",
+tc_28978789, TypeCVI_HIST>, Enc_802dc0, Requires<[HasV62T,UseHVX]> {
+let Inst{7-0} = 0b10000000;
+let Inst{13-9} = 0b10011;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256 : HInst<
+(outs),
+(ins),
+"vwhist256",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256_128B : HInst<
+(outs),
+(ins),
+"vwhist256",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256_sat : HInst<
+(outs),
+(ins),
+"vwhist256:sat",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256_sat_128B : HInst<
+(outs),
+(ins),
+"vwhist256:sat",
+tc_e5053c8f, TypeCVI_HIST>, Enc_e3b0c4, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{31-16} = 0b0001111000000000;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256q : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist256($Qv4)",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256q_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist256($Qv4)",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vwhist256q_sat : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist256($Qv4):sat",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist256q_sat_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist256($Qv4):sat",
+tc_cedf314b, TypeCVI_HIST>, Enc_217147, Requires<[HasV62T,UseHVX]> {
+let Inst{13-0} = 0b10001110000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vxor : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vxor($Vu32,$Vv32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vxor_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vxor($Vu32,$Vv32)",
+tc_bbaf280e, TypeCVI_VA>, Enc_45364e, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vzb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uh = vzxt($Vu32.ub)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uh = vzxt($Vu32.ub)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vzb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vzxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vzxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uw = vzxt($Vu32.uh)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uw = vzxt($Vu32.uh)",
+tc_644584f8, TypeCVI_VA_DV>, Enc_dd766a, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vzh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vzxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vzxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def Y2_barrier : HInst<
+(outs),
+(ins),
+"barrier",
+tc_ef2676fd, TypeST>, Enc_e3b0c4 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b1010100000000000;
+let isSoloAX = 1;
+let hasSideEffects = 1;
+}
+def Y2_break : HInst<
+(outs),
+(ins),
+"brkpt",
+tc_bcf0e36e, TypeCR>, Enc_e3b0c4 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b0110110000100000;
+let isSolo = 1;
+}
+def Y2_dccleana : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dccleana($Rs32)",
+tc_30665cb0, TypeST>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000000;
+let isSoloAin1 = 1;
+let hasSideEffects = 1;
+}
+def Y2_dccleaninva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dccleaninva($Rs32)",
+tc_30665cb0, TypeST>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000010;
+let isSoloAin1 = 1;
+let hasSideEffects = 1;
+}
+def Y2_dcfetch : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dcfetch($Rs32)",
+tc_34e882a4, TypeMAPPING> {
+let hasSideEffects = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def Y2_dcfetchbo : HInst<
+(outs),
+(ins IntRegs:$Rs32, u11_3Imm:$Ii),
+"dcfetch($Rs32+#$Ii)",
+tc_ef0ebaaa, TypeLD>, Enc_2d829e {
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10010100000;
+let addrMode = BaseImmOffset;
+let hasSideEffects = 1;
+}
+def Y2_dcinva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dcinva($Rs32)",
+tc_30665cb0, TypeST>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000001;
+let isSoloAin1 = 1;
+let hasSideEffects = 1;
+}
+def Y2_dczeroa : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dczeroa($Rs32)",
+tc_30665cb0, TypeST>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b10100000110;
+let isSoloAin1 = 1;
+let hasSideEffects = 1;
+let mayStore = 1;
+}
+def Y2_icinva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"icinva($Rs32)",
+tc_049dfb74, TypeJ>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010110110;
+let isSolo = 1;
+}
+def Y2_isync : HInst<
+(outs),
+(ins),
+"isync",
+tc_d267fa19, TypeJ>, Enc_e3b0c4 {
+let Inst{13-0} = 0b00000000000010;
+let Inst{31-16} = 0b0101011111000000;
+let isSolo = 1;
+}
+def Y2_syncht : HInst<
+(outs),
+(ins),
+"syncht",
+tc_ef2676fd, TypeST>, Enc_e3b0c4 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b1010100001000000;
+let isSolo = 1;
+}
+def Y4_l2fetch : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"l2fetch($Rs32,$Rt32)",
+tc_f4608adc, TypeST>, Enc_ca3887 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100110000;
+let isSoloAX = 1;
+let mayStore = 1;
+let hasSideEffects = 1;
+}
+def Y4_trace : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"trace($Rs32)",
+tc_4997da4a, TypeCR>, Enc_ecbcc8 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01100010010;
+let isSoloAX = 1;
+}
+def Y5_l2fetch : HInst<
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"l2fetch($Rs32,$Rtt32)",
+tc_f4608adc, TypeST>, Enc_e6abcf, Requires<[HasV5T]> {
+let Inst{7-0} = 0b00000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100110100;
+let isSoloAX = 1;
+let mayStore = 1;
+let hasSideEffects = 1;
+}
+def dep_A2_addsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32):sat:deprecated",
+tc_47ab9233, TypeALU64>, Enc_5ab2be {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def dep_A2_subsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32):sat:deprecated",
+tc_47ab9233, TypeALU64>, Enc_bd6011 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def dep_S2_packhl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = packhl($Rs32,$Rt32):deprecated",
+tc_9c18c9a5, TypeALU64>, Enc_be32a5 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010100000;
+}
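+// The three dep_* records above keep the ":deprecated" assembly syntax of
+// A2_addsat, A2_subsat, and S2_packhl parseable as distinct ALU64 encodings.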
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td
new file mode 100644
index 0000000..77a56a9
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepMappings.td
@@ -0,0 +1,654 @@
+//===--- HexagonDepMappings.td --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
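+// Each InstAlias below maps an assembler shorthand onto an existing
+// instruction. For example, A2_negAlias parses "$Rd32=neg($Rs32)" as
+// A2_subri with an immediate of 0 (Rd = 0 - Rs), and the "_zomap" aliases
+// map zero-offset memory forms onto the base-plus-offset "_io" instructions
+// with an offset of #0.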
+def A2_negAlias : InstAlias<"$Rd32=neg($Rs32)", (A2_subri IntRegs:$Rd32, 0, IntRegs:$Rs32)>;
+def A2_notAlias : InstAlias<"$Rd32=not($Rs32)", (A2_subri IntRegs:$Rd32, -1, IntRegs:$Rs32)>;
+def A2_tfrfAlias : InstAlias<"if (!$Pu4) $Rd32=$Rs32", (A2_paddif IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_tfrfnewAlias : InstAlias<"if (!$Pu4.new) $Rd32=$Rs32", (A2_paddifnew IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_tfrtAlias : InstAlias<"if ($Pu4) $Rd32=$Rs32", (A2_paddit IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_tfrtnewAlias : InstAlias<"if ($Pu4.new) $Rd32=$Rs32", (A2_padditnew IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>;
+def A2_vaddb_mapAlias : InstAlias<"$Rdd32=vaddb($Rss32,$Rtt32)", (A2_vaddub DoubleRegs:$Rdd32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>;
+def A2_vsubb_mapAlias : InstAlias<"$Rdd32=vsubb($Rss32,$Rtt32)", (A2_vsubub DoubleRegs:$Rdd32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>;
+def A2_zxtbAlias : InstAlias<"$Rd32=zxtb($Rs32)", (A2_andir IntRegs:$Rd32, IntRegs:$Rs32, 255)>;
+def C2_cmpltAlias : InstAlias<"$Pd4=cmp.lt($Rs32,$Rt32)", (C2_cmpgt PredRegs:$Pd4, IntRegs:$Rt32, IntRegs:$Rs32)>;
+def C2_cmpltuAlias : InstAlias<"$Pd4=cmp.ltu($Rs32,$Rt32)", (C2_cmpgtu PredRegs:$Pd4, IntRegs:$Rt32, IntRegs:$Rs32)>;
+def C2_pxfer_mapAlias : InstAlias<"$Pd4=$Ps4", (C2_or PredRegs:$Pd4, PredRegs:$Ps4, PredRegs:$Ps4)>;
+def J2_jumpf_nopred_mapAlias : InstAlias<"if (!$Pu4) jump $Ii", (J2_jumpf PredRegs:$Pu4, b30_2Imm:$Ii)>;
+def J2_jumprf_nopred_mapAlias : InstAlias<"if (!$Pu4) jumpr $Rs32", (J2_jumprf PredRegs:$Pu4, IntRegs:$Rs32)>;
+def J2_jumprt_nopred_mapAlias : InstAlias<"if ($Pu4) jumpr $Rs32", (J2_jumprt PredRegs:$Pu4, IntRegs:$Rs32)>;
+def J2_jumpt_nopred_mapAlias : InstAlias<"if ($Pu4) jump $Ii", (J2_jumpt PredRegs:$Pu4, b30_2Imm:$Ii)>;
+def L2_loadalignb_zomapAlias : InstAlias<"$Ryy32=memb_fifo($Rs32)", (L2_loadalignb_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
+def L2_loadalignh_zomapAlias : InstAlias<"$Ryy32=memh_fifo($Rs32)", (L2_loadalignh_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
+def L2_loadbsw2_zomapAlias : InstAlias<"$Rd32=membh($Rs32)", (L2_loadbsw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadbsw4_zomapAlias : InstAlias<"$Rdd32=membh($Rs32)", (L2_loadbsw4_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadbzw2_zomapAlias : InstAlias<"$Rd32=memubh($Rs32)", (L2_loadbzw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadbzw4_zomapAlias : InstAlias<"$Rdd32=memubh($Rs32)", (L2_loadbzw4_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadrb_zomapAlias : InstAlias<"$Rd32=memb($Rs32)", (L2_loadrb_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadrd_zomapAlias : InstAlias<"$Rdd32=memd($Rs32)", (L2_loadrd_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadrh_zomapAlias : InstAlias<"$Rd32=memh($Rs32)", (L2_loadrh_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadri_zomapAlias : InstAlias<"$Rd32=memw($Rs32)", (L2_loadri_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadrub_zomapAlias : InstAlias<"$Rd32=memub($Rs32)", (L2_loadrub_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadruh_zomapAlias : InstAlias<"$Rd32=memuh($Rs32)", (L2_loadruh_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_ploadrbf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memb($Rs32)", (L2_ploadrbf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memb($Rs32)", (L2_ploadrbfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbt_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memb($Rs32)", (L2_ploadrbt_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memb($Rs32)", (L2_ploadrbtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdf_zomapAlias : InstAlias<"if (!$Pt4) $Rdd32=memd($Rs32)", (L2_ploadrdf_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rdd32=memd($Rs32)", (L2_ploadrdfnew_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdt_zomapAlias : InstAlias<"if ($Pt4) $Rdd32=memd($Rs32)", (L2_ploadrdt_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rdd32=memd($Rs32)", (L2_ploadrdtnew_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memh($Rs32)", (L2_ploadrhf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memh($Rs32)", (L2_ploadrhfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrht_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memh($Rs32)", (L2_ploadrht_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memh($Rs32)", (L2_ploadrhtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrif_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memw($Rs32)", (L2_ploadrif_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrifnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memw($Rs32)", (L2_ploadrifnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrit_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memw($Rs32)", (L2_ploadrit_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadritnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memw($Rs32)", (L2_ploadritnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memub($Rs32)", (L2_ploadrubf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memub($Rs32)", (L2_ploadrubfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubt_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memub($Rs32)", (L2_ploadrubt_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memub($Rs32)", (L2_ploadrubtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memuh($Rs32)", (L2_ploadruhf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memuh($Rs32)", (L2_ploadruhfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruht_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memuh($Rs32)", (L2_ploadruht_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memuh($Rs32)", (L2_ploadruhtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L4_add_memopb_zomapAlias : InstAlias<"memb($Rs32)+=$Rt32", (L4_add_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_add_memoph_zomapAlias : InstAlias<"memh($Rs32)+=$Rt32", (L4_add_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_add_memopw_zomapAlias : InstAlias<"memw($Rs32)+=$Rt32", (L4_add_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memopb_zomapAlias : InstAlias<"memb($Rs32)&=$Rt32", (L4_and_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memoph_zomapAlias : InstAlias<"memh($Rs32)&=$Rt32", (L4_and_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memopw_zomapAlias : InstAlias<"memw($Rs32)&=$Rt32", (L4_and_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_iadd_memopb_zomapAlias : InstAlias<"memb($Rs32)+=#$II", (L4_iadd_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iadd_memoph_zomapAlias : InstAlias<"memh($Rs32)+=#$II", (L4_iadd_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iadd_memopw_zomapAlias : InstAlias<"memw($Rs32)+=#$II", (L4_iadd_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memopb_zomapAlias : InstAlias<"memb($Rs32)=clrbit(#$II)", (L4_iand_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memoph_zomapAlias : InstAlias<"memh($Rs32)=clrbit(#$II)", (L4_iand_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memopw_zomapAlias : InstAlias<"memw($Rs32)=clrbit(#$II)", (L4_iand_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memopb_zomapAlias : InstAlias<"memb($Rs32)=setbit(#$II)", (L4_ior_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memoph_zomapAlias : InstAlias<"memh($Rs32)=setbit(#$II)", (L4_ior_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memopw_zomapAlias : InstAlias<"memw($Rs32)=setbit(#$II)", (L4_ior_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memopb_zomapAlias : InstAlias<"memb($Rs32)-=#$II", (L4_isub_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memoph_zomapAlias : InstAlias<"memh($Rs32)-=#$II", (L4_isub_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memopw_zomapAlias : InstAlias<"memw($Rs32)-=#$II", (L4_isub_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_or_memopb_zomapAlias : InstAlias<"memb($Rs32)|=$Rt32", (L4_or_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_or_memoph_zomapAlias : InstAlias<"memh($Rs32)|=$Rt32", (L4_or_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_or_memopw_zomapAlias : InstAlias<"memw($Rs32)|=$Rt32", (L4_or_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memopb_zomapAlias : InstAlias<"memb($Rs32)-=$Rt32", (L4_sub_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memoph_zomapAlias : InstAlias<"memh($Rs32)-=$Rt32", (L4_sub_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memopw_zomapAlias : InstAlias<"memw($Rs32)-=$Rt32", (L4_sub_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def M2_mpyuiAlias : InstAlias<"$Rd32=mpyui($Rs32,$Rt32)", (M2_mpyi IntRegs:$Rd32, IntRegs:$Rs32, IntRegs:$Rt32)>;
+def S2_pstorerbf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=$Rt32", (S2_pstorerbf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerbnewf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=$Nt8.new", (S2_pstorerbnewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerbnewt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=$Nt8.new", (S2_pstorerbnewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerbt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=$Rt32", (S2_pstorerbt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerdf_zomapAlias : InstAlias<"if (!$Pv4) memd($Rs32)=$Rtt32", (S2_pstorerdf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_pstorerdt_zomapAlias : InstAlias<"if ($Pv4) memd($Rs32)=$Rtt32", (S2_pstorerdt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_pstorerff_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Rt32.h", (S2_pstorerff_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerft_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Rt32.h", (S2_pstorerft_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerhf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Rt32", (S2_pstorerhf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerhnewf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Nt8.new", (S2_pstorerhnewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerhnewt_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Nt8.new", (S2_pstorerhnewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerht_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Rt32", (S2_pstorerht_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerif_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=$Rt32", (S2_pstorerif_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerinewf_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=$Nt8.new", (S2_pstorerinewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerinewt_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=$Nt8.new", (S2_pstorerinewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerit_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=$Rt32", (S2_pstorerit_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerb_zomapAlias : InstAlias<"memb($Rs32)=$Rt32", (S2_storerb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerbnew_zomapAlias : InstAlias<"memb($Rs32)=$Nt8.new", (S2_storerbnew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_storerd_zomapAlias : InstAlias<"memd($Rs32)=$Rtt32", (S2_storerd_io IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_storerf_zomapAlias : InstAlias<"memh($Rs32)=$Rt32.h", (S2_storerf_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerh_zomapAlias : InstAlias<"memh($Rs32)=$Rt32", (S2_storerh_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerhnew_zomapAlias : InstAlias<"memh($Rs32)=$Nt8.new", (S2_storerhnew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_storeri_zomapAlias : InstAlias<"memw($Rs32)=$Rt32", (S2_storeri_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerinew_zomapAlias : InstAlias<"memw($Rs32)=$Nt8.new", (S2_storerinew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_tableidxb_goodsyntaxAlias : InstAlias<"$Rx32=tableidxb($Rs32,#$Ii,#$II)", (S2_tableidxb IntRegs:$Rx32, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II)>;
+def S4_pstorerbfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=$Rt32", (S4_pstorerbfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerbnewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=$Nt8.new", (S4_pstorerbnewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerbnewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=$Nt8.new", (S4_pstorerbnewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerbtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=$Rt32", (S4_pstorerbtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerdfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memd($Rs32)=$Rtt32", (S4_pstorerdfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S4_pstorerdtnew_zomapAlias : InstAlias<"if ($Pv4.new) memd($Rs32)=$Rtt32", (S4_pstorerdtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S4_pstorerffnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Rt32.h", (S4_pstorerffnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerftnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Rt32.h", (S4_pstorerftnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerhfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Rt32", (S4_pstorerhfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerhnewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Nt8.new", (S4_pstorerhnewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerhnewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Nt8.new", (S4_pstorerhnewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerhtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Rt32", (S4_pstorerhtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerifnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=$Rt32", (S4_pstorerifnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerinewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=$Nt8.new", (S4_pstorerinewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerinewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=$Nt8.new", (S4_pstorerinewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstoreritnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=$Rt32", (S4_pstoreritnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_storeirb_zomapAlias : InstAlias<"memb($Rs32)=#$II", (S4_storeirb_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=#$II", (S4_storeirbf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=#$II", (S4_storeirbfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=#$II", (S4_storeirbt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=#$II", (S4_storeirbtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirh_zomapAlias : InstAlias<"memh($Rs32)=#$II", (S4_storeirh_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=#$II", (S4_storeirhf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=#$II", (S4_storeirhfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirht_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=#$II", (S4_storeirht_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=#$II", (S4_storeirhtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeiri_zomapAlias : InstAlias<"memw($Rs32)=#$II", (S4_storeiri_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirif_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=#$II", (S4_storeirif_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirifnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=#$II", (S4_storeirifnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirit_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=#$II", (S4_storeirit_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeiritnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=#$II", (S4_storeiritnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def V6_MAP_equbAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equhAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equwAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_extractw_altAlias : InstAlias<"$Rd32.w=vextract($Vu32,$Rs32)", (V6_extractw IntRegs:$Rd32, VectorRegs:$Vu32, IntRegs:$Rs32)>, Requires<[UseHVX]>;
+def V6_extractw_alt_128BAlias : InstAlias<"$Rd32.w=vextract($Vu32,$Rs32)", (V6_extractw IntRegs:$Rd32, VectorRegs:$Vu32, IntRegs:$Rs32)>, Requires<[UseHVX]>;
+def V6_ld0Alias : InstAlias<"$Vd32=vmem($Rt32)", (V6_vL32b_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ld0_128BAlias : InstAlias<"$Vd32=vmem($Rt32)", (V6_vL32b_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldnt0Alias : InstAlias<"$Vd32=vmem($Rt32):nt", (V6_vL32b_nt_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldnt0_128BAlias : InstAlias<"$Vd32=vmem($Rt32):nt", (V6_vL32b_nt_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldu0Alias : InstAlias<"$Vd32=vmemu($Rt32)", (V6_vL32Ub_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldu0_128BAlias : InstAlias<"$Vd32=vmemu($Rt32)", (V6_vL32Ub_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_st0Alias : InstAlias<"vmem($Rt32)=$Vs32", (V6_vS32b_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_st0_128BAlias : InstAlias<"vmem($Rt32)=$Vs32", (V6_vS32b_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stn0Alias : InstAlias<"vmem($Rt32)=$Os8.new", (V6_vS32b_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stn0_128BAlias : InstAlias<"vmem($Rt32)=$Os8.new", (V6_vS32b_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnnt0Alias : InstAlias<"vmem($Rt32):nt=$Os8.new", (V6_vS32b_nt_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnnt0_128BAlias : InstAlias<"vmem($Rt32):nt=$Os8.new", (V6_vS32b_nt_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnp0Alias : InstAlias<"if (!$Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnp0_128BAlias : InstAlias<"if (!$Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnpnt0Alias : InstAlias<"if (!$Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnpnt0_128BAlias : InstAlias<"if (!$Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnq0Alias : InstAlias<"if (!$Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnq0_128BAlias : InstAlias<"if (!$Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnqnt0Alias : InstAlias<"if (!$Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnqnt0_128BAlias : InstAlias<"if (!$Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnt0Alias : InstAlias<"vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnt0_128BAlias : InstAlias<"vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stp0Alias : InstAlias<"if ($Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stp0_128BAlias : InstAlias<"if ($Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stpnt0Alias : InstAlias<"if ($Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stpnt0_128BAlias : InstAlias<"if ($Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stq0Alias : InstAlias<"if ($Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stq0_128BAlias : InstAlias<"if ($Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stqnt0Alias : InstAlias<"if ($Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stqnt0_128BAlias : InstAlias<"if ($Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stu0Alias : InstAlias<"vmemu($Rt32)=$Vs32", (V6_vS32Ub_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stu0_128BAlias : InstAlias<"vmemu($Rt32)=$Vs32", (V6_vS32Ub_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stunp0Alias : InstAlias<"if (!$Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stunp0_128BAlias : InstAlias<"if (!$Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stup0Alias : InstAlias<"if ($Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stup0_128BAlias : InstAlias<"if ($Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_vabsdiffh_altAlias : InstAlias<"$Vd32=vabsdiffh($Vu32,$Vv32)", (V6_vabsdiffh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffh_alt_128BAlias : InstAlias<"$Vd32=vabsdiffh($Vu32,$Vv32)", (V6_vabsdiffh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffub_altAlias : InstAlias<"$Vd32=vabsdiffub($Vu32,$Vv32)", (V6_vabsdiffub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffub_alt_128BAlias : InstAlias<"$Vd32=vabsdiffub($Vu32,$Vv32)", (V6_vabsdiffub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffuh_altAlias : InstAlias<"$Vd32=vabsdiffuh($Vu32,$Vv32)", (V6_vabsdiffuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffuh_alt_128BAlias : InstAlias<"$Vd32=vabsdiffuh($Vu32,$Vv32)", (V6_vabsdiffuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffw_altAlias : InstAlias<"$Vd32=vabsdiffw($Vu32,$Vv32)", (V6_vabsdiffw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffw_alt_128BAlias : InstAlias<"$Vd32=vabsdiffw($Vu32,$Vv32)", (V6_vabsdiffw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsh_altAlias : InstAlias<"$Vd32=vabsh($Vu32)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_alt_128BAlias : InstAlias<"$Vd32=vabsh($Vu32)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_sat_altAlias : InstAlias<"$Vd32=vabsh($Vu32):sat", (V6_vabsh_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_sat_alt_128BAlias : InstAlias<"$Vd32=vabsh($Vu32):sat", (V6_vabsh_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuh_altAlias : InstAlias<"$Vd32.uh=vabs($Vu32.h)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuh_alt_128BAlias : InstAlias<"$Vd32.uh=vabs($Vu32.h)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuw_altAlias : InstAlias<"$Vd32.uw=vabs($Vu32.w)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuw_alt_128BAlias : InstAlias<"$Vd32.uw=vabs($Vu32.w)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_altAlias : InstAlias<"$Vd32=vabsw($Vu32)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_alt_128BAlias : InstAlias<"$Vd32=vabsw($Vu32)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_sat_altAlias : InstAlias<"$Vd32=vabsw($Vu32):sat", (V6_vabsw_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_sat_alt_128BAlias : InstAlias<"$Vd32=vabsw($Vu32):sat", (V6_vabsw_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddb_altAlias : InstAlias<"$Vd32=vaddb($Vu32,$Vv32)", (V6_vaddb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddb_alt_128BAlias : InstAlias<"$Vd32=vaddb($Vu32,$Vv32)", (V6_vaddb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddb_dv_altAlias : InstAlias<"$Vdd32=vaddb($Vuu32,$Vvv32)", (V6_vaddb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddb_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddb($Vuu32,$Vvv32)", (V6_vaddb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddbnq_altAlias : InstAlias<"if (!$Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbnq_alt_128BAlias : InstAlias<"if (!$Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbq_altAlias : InstAlias<"if ($Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbq_alt_128BAlias : InstAlias<"if ($Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddh_altAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32)", (V6_vaddh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddh_alt_128BAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32)", (V6_vaddh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddh_dv_altAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32)", (V6_vaddh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddh_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32)", (V6_vaddh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhnq_altAlias : InstAlias<"if (!$Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhnq_alt_128BAlias : InstAlias<"if (!$Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhq_altAlias : InstAlias<"if ($Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhq_alt_128BAlias : InstAlias<"if ($Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_altAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32):sat", (V6_vaddhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_alt_128BAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32):sat", (V6_vaddhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_dv_altAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32):sat", (V6_vaddhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32):sat", (V6_vaddhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhw_altAlias : InstAlias<"$Vdd32=vaddh($Vu32,$Vv32)", (V6_vaddhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhw_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vu32,$Vv32)", (V6_vaddhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubh_altAlias : InstAlias<"$Vdd32=vaddub($Vu32,$Vv32)", (V6_vaddubh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubh_alt_128BAlias : InstAlias<"$Vdd32=vaddub($Vu32,$Vv32)", (V6_vaddubh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_altAlias : InstAlias<"$Vd32=vaddub($Vu32,$Vv32):sat", (V6_vaddubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_alt_128BAlias : InstAlias<"$Vd32=vaddub($Vu32,$Vv32):sat", (V6_vaddubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_dv_altAlias : InstAlias<"$Vdd32=vaddub($Vuu32,$Vvv32):sat", (V6_vaddubsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddub($Vuu32,$Vvv32):sat", (V6_vaddubsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_altAlias : InstAlias<"$Vd32=vadduh($Vu32,$Vv32):sat", (V6_vadduhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_alt_128BAlias : InstAlias<"$Vd32=vadduh($Vu32,$Vv32):sat", (V6_vadduhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_dv_altAlias : InstAlias<"$Vdd32=vadduh($Vuu32,$Vvv32):sat", (V6_vadduhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vadduh($Vuu32,$Vvv32):sat", (V6_vadduhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhw_altAlias : InstAlias<"$Vdd32=vadduh($Vu32,$Vv32)", (V6_vadduhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhw_alt_128BAlias : InstAlias<"$Vdd32=vadduh($Vu32,$Vv32)", (V6_vadduhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_altAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32)", (V6_vaddw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_alt_128BAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32)", (V6_vaddw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_dv_altAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32)", (V6_vaddw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddw_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32)", (V6_vaddw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddwnq_altAlias : InstAlias<"if (!$Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwnq_alt_128BAlias : InstAlias<"if (!$Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwq_altAlias : InstAlias<"if ($Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwq_alt_128BAlias : InstAlias<"if ($Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_altAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32):sat", (V6_vaddwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_alt_128BAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32):sat", (V6_vaddwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_dv_altAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32):sat", (V6_vaddwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32):sat", (V6_vaddwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vandqrt_acc_altAlias : InstAlias<"$Vx32.ub|=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt_acc VectorRegs:$Vx32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandqrt_acc_alt_128BAlias : InstAlias<"$Vx32.ub|=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt_acc VectorRegs:$Vx32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandqrt_altAlias : InstAlias<"$Vd32.ub=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt VectorRegs:$Vd32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandqrt_alt_128BAlias : InstAlias<"$Vd32.ub=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt VectorRegs:$Vd32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_acc_altAlias : InstAlias<"$Qx4.ub|=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt_acc VecPredRegs:$Qx4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_acc_alt_128BAlias : InstAlias<"$Qx4.ub|=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt_acc VecPredRegs:$Qx4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_altAlias : InstAlias<"$Qd4.ub=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt VecPredRegs:$Qd4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_alt_128BAlias : InstAlias<"$Qd4.ub=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt VecPredRegs:$Qd4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslh_altAlias : InstAlias<"$Vd32=vaslh($Vu32,$Rt32)", (V6_vaslh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslh_alt_128BAlias : InstAlias<"$Vd32=vaslh($Vu32,$Rt32)", (V6_vaslh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslhv_altAlias : InstAlias<"$Vd32=vaslh($Vu32,$Vv32)", (V6_vaslhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslhv_alt_128BAlias : InstAlias<"$Vd32=vaslh($Vu32,$Vv32)", (V6_vaslhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslw_acc_altAlias : InstAlias<"$Vx32+=vaslw($Vu32,$Rt32)", (V6_vaslw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_acc_alt_128BAlias : InstAlias<"$Vx32+=vaslw($Vu32,$Rt32)", (V6_vaslw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_altAlias : InstAlias<"$Vd32=vaslw($Vu32,$Rt32)", (V6_vaslw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_alt_128BAlias : InstAlias<"$Vd32=vaslw($Vu32,$Rt32)", (V6_vaslw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslwv_altAlias : InstAlias<"$Vd32=vaslw($Vu32,$Vv32)", (V6_vaslwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslwv_alt_128BAlias : InstAlias<"$Vd32=vaslw($Vu32,$Vv32)", (V6_vaslwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrh_altAlias : InstAlias<"$Vd32=vasrh($Vu32,$Rt32)", (V6_vasrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrh_alt_128BAlias : InstAlias<"$Vd32=vasrh($Vu32,$Rt32)", (V6_vasrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrhbrndsat_altAlias : InstAlias<"$Vd32=vasrhb($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhbrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhubrndsat_altAlias : InstAlias<"$Vd32=vasrhub($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhubrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhubsat_altAlias : InstAlias<"$Vd32=vasrhub($Vu32,$Vv32,$Rt8):sat", (V6_vasrhubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhv_altAlias : InstAlias<"$Vd32=vasrh($Vu32,$Vv32)", (V6_vasrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrhv_alt_128BAlias : InstAlias<"$Vd32=vasrh($Vu32,$Vv32)", (V6_vasrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrw_acc_altAlias : InstAlias<"$Vx32+=vasrw($Vu32,$Rt32)", (V6_vasrw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_acc_alt_128BAlias : InstAlias<"$Vx32+=vasrw($Vu32,$Rt32)", (V6_vasrw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_altAlias : InstAlias<"$Vd32=vasrw($Vu32,$Rt32)", (V6_vasrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_alt_128BAlias : InstAlias<"$Vd32=vasrw($Vu32,$Rt32)", (V6_vasrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrwh_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8)", (V6_vasrwhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwhrndsat_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrwhrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwhsat_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwuhsat_altAlias : InstAlias<"$Vd32=vasrwuh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwv_altAlias : InstAlias<"$Vd32=vasrw($Vu32,$Vv32)", (V6_vasrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrwv_alt_128BAlias : InstAlias<"$Vd32=vasrw($Vu32,$Vv32)", (V6_vasrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgh_altAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32)", (V6_vavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgh_alt_128BAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32)", (V6_vavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavghrnd_altAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32):rnd", (V6_vavghrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavghrnd_alt_128BAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32):rnd", (V6_vavghrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgub_altAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32)", (V6_vavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgub_alt_128BAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32)", (V6_vavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgubrnd_altAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32):rnd", (V6_vavgubrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgubrnd_alt_128BAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32):rnd", (V6_vavgubrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguh_altAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32)", (V6_vavguh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguh_alt_128BAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32)", (V6_vavguh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguhrnd_altAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32):rnd", (V6_vavguhrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguhrnd_alt_128BAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32):rnd", (V6_vavguhrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgw_altAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32)", (V6_vavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgw_alt_128BAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32)", (V6_vavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgwrnd_altAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32):rnd", (V6_vavgwrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgwrnd_alt_128BAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32):rnd", (V6_vavgwrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vcl0h_altAlias : InstAlias<"$Vd32=vcl0h($Vu32)", (V6_vcl0h VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0h_alt_128BAlias : InstAlias<"$Vd32=vcl0h($Vu32)", (V6_vcl0h VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0w_altAlias : InstAlias<"$Vd32=vcl0w($Vu32)", (V6_vcl0w VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0w_alt_128BAlias : InstAlias<"$Vd32=vcl0w($Vu32)", (V6_vcl0w VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vd0Alias : InstAlias<"$Vd32=#0", (V6_vxor VectorRegs:$Vd32, VectorRegs:$Vd32, VectorRegs:$Vd32)>, Requires<[UseHVX]>;
+def V6_vd0_128BAlias : InstAlias<"$Vd32=#0", (V6_vxor VectorRegs:$Vd32, VectorRegs:$Vd32, VectorRegs:$Vd32)>, Requires<[UseHVX]>;
+def V6_vdd0Alias : InstAlias<"$Vdd32=#0", (V6_vsubw_dv VecDblRegs:$Vdd32, W15, W15)>, Requires<[UseHVX]>;
+def V6_vdd0_128BAlias : InstAlias<"$Vdd32=#0", (V6_vsubw_dv VecDblRegs:$Vdd32, W15, W15)>, Requires<[UseHVX]>;
+def V6_vdealb4w_altAlias : InstAlias<"$Vd32=vdealb4w($Vu32,$Vv32)", (V6_vdealb4w VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdealb4w_alt_128BAlias : InstAlias<"$Vd32=vdealb4w($Vu32,$Vv32)", (V6_vdealb4w VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdealb_altAlias : InstAlias<"$Vd32=vdealb($Vu32)", (V6_vdealb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealb_alt_128BAlias : InstAlias<"$Vd32=vdealb($Vu32)", (V6_vdealb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealh_altAlias : InstAlias<"$Vd32=vdealh($Vu32)", (V6_vdealh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealh_alt_128BAlias : InstAlias<"$Vd32=vdealh($Vu32)", (V6_vdealh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_acc_altAlias : InstAlias<"$Vx32+=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_altAlias : InstAlias<"$Vd32=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_alt_128BAlias : InstAlias<"$Vd32=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_acc_altAlias : InstAlias<"$Vxx32+=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_altAlias : InstAlias<"$Vdd32=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_alt_128BAlias : InstAlias<"$Vdd32=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_acc_altAlias : InstAlias<"$Vx32+=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_altAlias : InstAlias<"$Vd32=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_alt_128BAlias : InstAlias<"$Vd32=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_acc_altAlias : InstAlias<"$Vxx32+=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_altAlias : InstAlias<"$Vdd32=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_alt_128BAlias : InstAlias<"$Vdd32=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_altAlias : InstAlias<"$Vd32=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_altAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_acc_altAlias : InstAlias<"$Vx32+=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_altAlias : InstAlias<"$Vd32=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_alt_128BAlias : InstAlias<"$Vd32=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_acc_altAlias : InstAlias<"$Vx32+=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_altAlias : InstAlias<"$Vd32=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_alt_128BAlias : InstAlias<"$Vd32=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_altAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_acc_altAlias : InstAlias<"$Vxx32+=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_altAlias : InstAlias<"$Vdd32=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_alt_128BAlias : InstAlias<"$Vdd32=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrh_altAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Rt32)", (V6_vlsrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrh_alt_128BAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Rt32)", (V6_vlsrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrhv_altAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Vv32)", (V6_vlsrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrhv_alt_128BAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Vv32)", (V6_vlsrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrw_altAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Rt32)", (V6_vlsrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrw_alt_128BAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Rt32)", (V6_vlsrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrwv_altAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Vv32)", (V6_vlsrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrwv_alt_128BAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Vv32)", (V6_vlsrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxh_altAlias : InstAlias<"$Vd32=vmaxh($Vu32,$Vv32)", (V6_vmaxh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxh_alt_128BAlias : InstAlias<"$Vd32=vmaxh($Vu32,$Vv32)", (V6_vmaxh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxub_altAlias : InstAlias<"$Vd32=vmaxub($Vu32,$Vv32)", (V6_vmaxub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxub_alt_128BAlias : InstAlias<"$Vd32=vmaxub($Vu32,$Vv32)", (V6_vmaxub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxuh_altAlias : InstAlias<"$Vd32=vmaxuh($Vu32,$Vv32)", (V6_vmaxuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxuh_alt_128BAlias : InstAlias<"$Vd32=vmaxuh($Vu32,$Vv32)", (V6_vmaxuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxw_altAlias : InstAlias<"$Vd32=vmaxw($Vu32,$Vv32)", (V6_vmaxw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxw_alt_128BAlias : InstAlias<"$Vd32=vmaxw($Vu32,$Vv32)", (V6_vmaxw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminh_altAlias : InstAlias<"$Vd32=vminh($Vu32,$Vv32)", (V6_vminh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminh_alt_128BAlias : InstAlias<"$Vd32=vminh($Vu32,$Vv32)", (V6_vminh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminub_altAlias : InstAlias<"$Vd32=vminub($Vu32,$Vv32)", (V6_vminub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminub_alt_128BAlias : InstAlias<"$Vd32=vminub($Vu32,$Vv32)", (V6_vminub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminuh_altAlias : InstAlias<"$Vd32=vminuh($Vu32,$Vv32)", (V6_vminuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminuh_alt_128BAlias : InstAlias<"$Vd32=vminuh($Vu32,$Vv32)", (V6_vminuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminw_altAlias : InstAlias<"$Vd32=vminw($Vu32,$Vv32)", (V6_vminw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminw_alt_128BAlias : InstAlias<"$Vd32=vminw($Vu32,$Vv32)", (V6_vminw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpabus_acc_altAlias : InstAlias<"$Vxx32+=vmpabus($Vuu32,$Rt32)", (V6_vmpabus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpabus($Vuu32,$Rt32)", (V6_vmpabus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_altAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Rt32)", (V6_vmpabus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_alt_128BAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Rt32)", (V6_vmpabus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabusv_altAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Vvv32)", (V6_vmpabusv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabusv_alt_128BAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Vvv32)", (V6_vmpabusv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabuuv_altAlias : InstAlias<"$Vdd32=vmpabuu($Vuu32,$Vvv32)", (V6_vmpabuuv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabuuv_alt_128BAlias : InstAlias<"$Vdd32=vmpabuu($Vuu32,$Vvv32)", (V6_vmpabuuv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpahb_acc_altAlias : InstAlias<"$Vxx32+=vmpahb($Vuu32,$Rt32)", (V6_vmpahb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpahb($Vuu32,$Rt32)", (V6_vmpahb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_altAlias : InstAlias<"$Vdd32=vmpahb($Vuu32,$Rt32)", (V6_vmpahb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_alt_128BAlias : InstAlias<"$Vdd32=vmpahb($Vuu32,$Rt32)", (V6_vmpahb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_acc_altAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Rt32)", (V6_vmpybus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Rt32)", (V6_vmpybus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_altAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Rt32)", (V6_vmpybus VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_alt_128BAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Rt32)", (V6_vmpybus VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_acc_altAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Vv32)", (V6_vmpybusv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Vv32)", (V6_vmpybusv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_altAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Vv32)", (V6_vmpybusv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_alt_128BAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Vv32)", (V6_vmpybusv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_acc_altAlias : InstAlias<"$Vxx32+=vmpyb($Vu32,$Vv32)", (V6_vmpybv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyb($Vu32,$Vv32)", (V6_vmpybv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_altAlias : InstAlias<"$Vdd32=vmpyb($Vu32,$Vv32)", (V6_vmpybv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_alt_128BAlias : InstAlias<"$Vdd32=vmpyb($Vu32,$Vv32)", (V6_vmpybv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyewuh_altAlias : InstAlias<"$Vd32=vmpyewuh($Vu32,$Vv32)", (V6_vmpyewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyewuh_alt_128BAlias : InstAlias<"$Vd32=vmpyewuh($Vu32,$Vv32)", (V6_vmpyewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyh_altAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Rt32)", (V6_vmpyh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyh_alt_128BAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Rt32)", (V6_vmpyh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsat_acc_altAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Rt32):sat", (V6_vmpyhsat_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsat_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Rt32):sat", (V6_vmpyhsat_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsrs_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:rnd:sat", (V6_vmpyhsrs VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsrs_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:rnd:sat", (V6_vmpyhsrs VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhss_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:sat", (V6_vmpyhss VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhss_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:sat", (V6_vmpyhss VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_acc_altAlias : InstAlias<"$Vxx32+=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_altAlias : InstAlias<"$Vdd32=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_alt_128BAlias : InstAlias<"$Vdd32=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_acc_altAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Vv32)", (V6_vmpyhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Vv32)", (V6_vmpyhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_altAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Vv32)", (V6_vmpyhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_alt_128BAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Vv32)", (V6_vmpyhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhvsrs_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyhvsrs VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhvsrs_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyhvsrs VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewh_acc_altAlias : InstAlias<"$Vx32+=vmpyiewh($Vu32,$Vv32)", (V6_vmpyiewh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiewh($Vu32,$Vv32)", (V6_vmpyiewh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_acc_altAlias : InstAlias<"$Vx32+=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_altAlias : InstAlias<"$Vd32=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_alt_128BAlias : InstAlias<"$Vd32=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_acc_altAlias : InstAlias<"$Vx32+=vmpyih($Vu32,$Vv32)", (V6_vmpyih_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyih($Vu32,$Vv32)", (V6_vmpyih_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_altAlias : InstAlias<"$Vd32=vmpyih($Vu32,$Vv32)", (V6_vmpyih VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_alt_128BAlias : InstAlias<"$Vd32=vmpyih($Vu32,$Vv32)", (V6_vmpyih VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_acc_altAlias : InstAlias<"$Vx32+=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_altAlias : InstAlias<"$Vd32=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_alt_128BAlias : InstAlias<"$Vd32=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiowh_altAlias : InstAlias<"$Vd32=vmpyiowh($Vu32,$Vv32)", (V6_vmpyiowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiowh_alt_128BAlias : InstAlias<"$Vd32=vmpyiowh($Vu32,$Vv32)", (V6_vmpyiowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_acc_altAlias : InstAlias<"$Vx32+=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_altAlias : InstAlias<"$Vd32=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_alt_128BAlias : InstAlias<"$Vd32=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_acc_altAlias : InstAlias<"$Vx32+=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_altAlias : InstAlias<"$Vd32=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_alt_128BAlias : InstAlias<"$Vd32=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_altAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:sat", (V6_vmpyowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_alt_128BAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:sat", (V6_vmpyowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_rnd_altAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyowh_rnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_rnd_alt_128BAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyowh_rnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyub_acc_altAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Rt32)", (V6_vmpyub_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Rt32)", (V6_vmpyub_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_altAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Rt32)", (V6_vmpyub VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_alt_128BAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Rt32)", (V6_vmpyub VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_acc_altAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Vv32)", (V6_vmpyubv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Vv32)", (V6_vmpyubv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_altAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Vv32)", (V6_vmpyubv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_alt_128BAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Vv32)", (V6_vmpyubv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_acc_altAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_altAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_alt_128BAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_acc_altAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_altAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_alt_128BAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgh_altAlias : InstAlias<"$Vd32=vnavgh($Vu32,$Vv32)", (V6_vnavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgh_alt_128BAlias : InstAlias<"$Vd32=vnavgh($Vu32,$Vv32)", (V6_vnavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgub_altAlias : InstAlias<"$Vd32=vnavgub($Vu32,$Vv32)", (V6_vnavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgub_alt_128BAlias : InstAlias<"$Vd32=vnavgub($Vu32,$Vv32)", (V6_vnavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgw_altAlias : InstAlias<"$Vd32=vnavgw($Vu32,$Vv32)", (V6_vnavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgw_alt_128BAlias : InstAlias<"$Vd32=vnavgw($Vu32,$Vv32)", (V6_vnavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnormamth_altAlias : InstAlias<"$Vd32=vnormamth($Vu32)", (V6_vnormamth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamth_alt_128BAlias : InstAlias<"$Vd32=vnormamth($Vu32)", (V6_vnormamth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamtw_altAlias : InstAlias<"$Vd32=vnormamtw($Vu32)", (V6_vnormamtw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamtw_alt_128BAlias : InstAlias<"$Vd32=vnormamtw($Vu32)", (V6_vnormamtw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vpackeb_altAlias : InstAlias<"$Vd32=vpackeb($Vu32,$Vv32)", (V6_vpackeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeb_alt_128BAlias : InstAlias<"$Vd32=vpackeb($Vu32,$Vv32)", (V6_vpackeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeh_altAlias : InstAlias<"$Vd32=vpackeh($Vu32,$Vv32)", (V6_vpackeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeh_alt_128BAlias : InstAlias<"$Vd32=vpackeh($Vu32,$Vv32)", (V6_vpackeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhb_sat_altAlias : InstAlias<"$Vd32=vpackhb($Vu32,$Vv32):sat", (V6_vpackhb_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhb_sat_alt_128BAlias : InstAlias<"$Vd32=vpackhb($Vu32,$Vv32):sat", (V6_vpackhb_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhub_sat_altAlias : InstAlias<"$Vd32=vpackhub($Vu32,$Vv32):sat", (V6_vpackhub_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhub_sat_alt_128BAlias : InstAlias<"$Vd32=vpackhub($Vu32,$Vv32):sat", (V6_vpackhub_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackob_altAlias : InstAlias<"$Vd32=vpackob($Vu32,$Vv32)", (V6_vpackob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackob_alt_128BAlias : InstAlias<"$Vd32=vpackob($Vu32,$Vv32)", (V6_vpackob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackoh_altAlias : InstAlias<"$Vd32=vpackoh($Vu32,$Vv32)", (V6_vpackoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackoh_alt_128BAlias : InstAlias<"$Vd32=vpackoh($Vu32,$Vv32)", (V6_vpackoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwh_sat_altAlias : InstAlias<"$Vd32=vpackwh($Vu32,$Vv32):sat", (V6_vpackwh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwh_sat_alt_128BAlias : InstAlias<"$Vd32=vpackwh($Vu32,$Vv32):sat", (V6_vpackwh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwuh_sat_altAlias : InstAlias<"$Vd32=vpackwuh($Vu32,$Vv32):sat", (V6_vpackwuh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwuh_sat_alt_128BAlias : InstAlias<"$Vd32=vpackwuh($Vu32,$Vv32):sat", (V6_vpackwuh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpopcounth_altAlias : InstAlias<"$Vd32=vpopcounth($Vu32)", (V6_vpopcounth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vpopcounth_alt_128BAlias : InstAlias<"$Vd32=vpopcounth($Vu32)", (V6_vpopcounth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_acc_altAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_altAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_alt_128BAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_acc_altAlias : InstAlias<"$Vxx32+=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_altAlias : InstAlias<"$Vdd32=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_alt_128BAlias : InstAlias<"$Vdd32=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_acc_altAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_altAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_alt_128BAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_acc_altAlias : InstAlias<"$Vx32+=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_altAlias : InstAlias<"$Vd32=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_alt_128BAlias : InstAlias<"$Vd32=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_acc_altAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_altAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_alt_128BAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_acc_altAlias : InstAlias<"$Vxx32+=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_altAlias : InstAlias<"$Vdd32=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_alt_128BAlias : InstAlias<"$Vdd32=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_acc_altAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_altAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_alt_128BAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhb_altAlias : InstAlias<"$Vd32=vroundhb($Vu32,$Vv32):sat", (V6_vroundhb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhb_alt_128BAlias : InstAlias<"$Vd32=vroundhb($Vu32,$Vv32):sat", (V6_vroundhb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhub_altAlias : InstAlias<"$Vd32=vroundhub($Vu32,$Vv32):sat", (V6_vroundhub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhub_alt_128BAlias : InstAlias<"$Vd32=vroundhub($Vu32,$Vv32):sat", (V6_vroundhub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwh_altAlias : InstAlias<"$Vd32=vroundwh($Vu32,$Vv32):sat", (V6_vroundwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwh_alt_128BAlias : InstAlias<"$Vd32=vroundwh($Vu32,$Vv32):sat", (V6_vroundwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwuh_altAlias : InstAlias<"$Vd32=vroundwuh($Vu32,$Vv32):sat", (V6_vroundwuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwuh_alt_128BAlias : InstAlias<"$Vd32=vroundwuh($Vu32,$Vv32):sat", (V6_vroundwuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrsadubi_acc_altAlias : InstAlias<"$Vxx32+=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_altAlias : InstAlias<"$Vdd32=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_alt_128BAlias : InstAlias<"$Vdd32=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vsathub_altAlias : InstAlias<"$Vd32=vsathub($Vu32,$Vv32)", (V6_vsathub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsathub_alt_128BAlias : InstAlias<"$Vd32=vsathub($Vu32,$Vv32)", (V6_vsathub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsatwh_altAlias : InstAlias<"$Vd32=vsatwh($Vu32,$Vv32)", (V6_vsatwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsatwh_alt_128BAlias : InstAlias<"$Vd32=vsatwh($Vu32,$Vv32)", (V6_vsatwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsb_altAlias : InstAlias<"$Vdd32=vsxtb($Vu32)", (V6_vsb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsb_alt_128BAlias : InstAlias<"$Vdd32=vsxtb($Vu32)", (V6_vsb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsh_altAlias : InstAlias<"$Vdd32=vsxth($Vu32)", (V6_vsh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsh_alt_128BAlias : InstAlias<"$Vdd32=vsxth($Vu32)", (V6_vsh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshufeh_altAlias : InstAlias<"$Vd32=vshuffeh($Vu32,$Vv32)", (V6_vshufeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufeh_alt_128BAlias : InstAlias<"$Vd32=vshuffeh($Vu32,$Vv32)", (V6_vshufeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffb_altAlias : InstAlias<"$Vd32=vshuffb($Vu32)", (V6_vshuffb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffb_alt_128BAlias : InstAlias<"$Vd32=vshuffb($Vu32)", (V6_vshuffb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffeb_altAlias : InstAlias<"$Vd32=vshuffeb($Vu32,$Vv32)", (V6_vshuffeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffeb_alt_128BAlias : InstAlias<"$Vd32=vshuffeb($Vu32,$Vv32)", (V6_vshuffeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffh_altAlias : InstAlias<"$Vd32=vshuffh($Vu32)", (V6_vshuffh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffh_alt_128BAlias : InstAlias<"$Vd32=vshuffh($Vu32)", (V6_vshuffh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffob_altAlias : InstAlias<"$Vd32=vshuffob($Vu32,$Vv32)", (V6_vshuffob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffob_alt_128BAlias : InstAlias<"$Vd32=vshuffob($Vu32,$Vv32)", (V6_vshuffob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeb_altAlias : InstAlias<"$Vdd32=vshuffoeb($Vu32,$Vv32)", (V6_vshufoeb VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeb_alt_128BAlias : InstAlias<"$Vdd32=vshuffoeb($Vu32,$Vv32)", (V6_vshufoeb VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeh_altAlias : InstAlias<"$Vdd32=vshuffoeh($Vu32,$Vv32)", (V6_vshufoeh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeh_alt_128BAlias : InstAlias<"$Vdd32=vshuffoeh($Vu32,$Vv32)", (V6_vshufoeh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoh_altAlias : InstAlias<"$Vd32=vshuffoh($Vu32,$Vv32)", (V6_vshufoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoh_alt_128BAlias : InstAlias<"$Vd32=vshuffoh($Vu32,$Vv32)", (V6_vshufoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_altAlias : InstAlias<"$Vd32=vsubb($Vu32,$Vv32)", (V6_vsubb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_alt_128BAlias : InstAlias<"$Vd32=vsubb($Vu32,$Vv32)", (V6_vsubb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_dv_altAlias : InstAlias<"$Vdd32=vsubb($Vuu32,$Vvv32)", (V6_vsubb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubb_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubb($Vuu32,$Vvv32)", (V6_vsubb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubbnq_altAlias : InstAlias<"if (!$Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbnq_alt_128BAlias : InstAlias<"if (!$Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbq_altAlias : InstAlias<"if ($Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbq_alt_128BAlias : InstAlias<"if ($Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubh_altAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32)", (V6_vsubh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubh_alt_128BAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32)", (V6_vsubh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubh_dv_altAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32)", (V6_vsubh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubh_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32)", (V6_vsubh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhnq_altAlias : InstAlias<"if (!$Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhnq_alt_128BAlias : InstAlias<"if (!$Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhq_altAlias : InstAlias<"if ($Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhq_alt_128BAlias : InstAlias<"if ($Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_altAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32):sat", (V6_vsubhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_alt_128BAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32):sat", (V6_vsubhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_dv_altAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32):sat", (V6_vsubhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32):sat", (V6_vsubhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhw_altAlias : InstAlias<"$Vdd32=vsubh($Vu32,$Vv32)", (V6_vsubhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhw_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vu32,$Vv32)", (V6_vsubhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububh_altAlias : InstAlias<"$Vdd32=vsubub($Vu32,$Vv32)", (V6_vsububh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububh_alt_128BAlias : InstAlias<"$Vdd32=vsubub($Vu32,$Vv32)", (V6_vsububh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_altAlias : InstAlias<"$Vd32=vsubub($Vu32,$Vv32):sat", (V6_vsububsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_alt_128BAlias : InstAlias<"$Vd32=vsubub($Vu32,$Vv32):sat", (V6_vsububsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_dv_altAlias : InstAlias<"$Vdd32=vsubub($Vuu32,$Vvv32):sat", (V6_vsububsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubub($Vuu32,$Vvv32):sat", (V6_vsububsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_altAlias : InstAlias<"$Vd32=vsubuh($Vu32,$Vv32):sat", (V6_vsubuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_alt_128BAlias : InstAlias<"$Vd32=vsubuh($Vu32,$Vv32):sat", (V6_vsubuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_dv_altAlias : InstAlias<"$Vdd32=vsubuh($Vuu32,$Vvv32):sat", (V6_vsubuhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubuh($Vuu32,$Vvv32):sat", (V6_vsubuhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhw_altAlias : InstAlias<"$Vdd32=vsubuh($Vu32,$Vv32)", (V6_vsubuhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhw_alt_128BAlias : InstAlias<"$Vdd32=vsubuh($Vu32,$Vv32)", (V6_vsubuhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_altAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32)", (V6_vsubw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_alt_128BAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32)", (V6_vsubw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_dv_altAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32)", (V6_vsubw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubw_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32)", (V6_vsubw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubwnq_altAlias : InstAlias<"if (!$Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwnq_alt_128BAlias : InstAlias<"if (!$Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwq_altAlias : InstAlias<"if ($Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwq_alt_128BAlias : InstAlias<"if ($Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_altAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32):sat", (V6_vsubwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_alt_128BAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32):sat", (V6_vsubwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_dv_altAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32):sat", (V6_vsubwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32):sat", (V6_vsubwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_acc_altAlias : InstAlias<"$Vxx32+=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_altAlias : InstAlias<"$Vdd32=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_alt_128BAlias : InstAlias<"$Vdd32=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_acc_altAlias : InstAlias<"$Vxx32+=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_altAlias : InstAlias<"$Vdd32=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_alt_128BAlias : InstAlias<"$Vdd32=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_acc_altAlias : InstAlias<"$Vxx32+=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_altAlias : InstAlias<"$Vdd32=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_alt_128BAlias : InstAlias<"$Vdd32=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtran2x2_mapAlias : InstAlias<"vtrans2x2($Vy32,$Vx32,$Rt32)", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtran2x2_map_128BAlias : InstAlias<"vtrans2x2($Vy32,$Vx32,$Rt32)", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vunpackb_altAlias : InstAlias<"$Vdd32=vunpackb($Vu32)", (V6_vunpackb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackb_alt_128BAlias : InstAlias<"$Vdd32=vunpackb($Vu32)", (V6_vunpackb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackh_altAlias : InstAlias<"$Vdd32=vunpackh($Vu32)", (V6_vunpackh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackh_alt_128BAlias : InstAlias<"$Vdd32=vunpackh($Vu32)", (V6_vunpackh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackoh_altAlias : InstAlias<"$Vxx32|=vunpackoh($Vu32)", (V6_vunpackoh VecDblRegs:$Vxx32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackoh_alt_128BAlias : InstAlias<"$Vxx32|=vunpackoh($Vu32)", (V6_vunpackoh VecDblRegs:$Vxx32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackub_altAlias : InstAlias<"$Vdd32=vunpackub($Vu32)", (V6_vunpackub VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackub_alt_128BAlias : InstAlias<"$Vdd32=vunpackub($Vu32)", (V6_vunpackub VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackuh_altAlias : InstAlias<"$Vdd32=vunpackuh($Vu32)", (V6_vunpackuh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackuh_alt_128BAlias : InstAlias<"$Vdd32=vunpackuh($Vu32)", (V6_vunpackuh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzb_altAlias : InstAlias<"$Vdd32=vzxtb($Vu32)", (V6_vzb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzb_alt_128BAlias : InstAlias<"$Vdd32=vzxtb($Vu32)", (V6_vzb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzh_altAlias : InstAlias<"$Vdd32=vzxth($Vu32)", (V6_vzh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzh_alt_128BAlias : InstAlias<"$Vdd32=vzxth($Vu32)", (V6_vzh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def Y2_dcfetchAlias : InstAlias<"dcfetch($Rs32)", (Y2_dcfetchbo IntRegs:$Rs32, 0)>;
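The machine-generated alias block above gives every HVX mnemonic an InstAlias record so the assembler accepts the architectural "Vd32=op(...)" spelling. Each alias comes in a 64-byte (_alt) and a 128-byte-mode (_alt_128B) variant, both gated on the UseHVX predicate and both resolving to the same canonical instruction definition. As a rough mental model only (the real TableGen-emitted matcher is considerably more involved), alias resolution amounts to a lookup from a normalized assembly template to a canonical opcode; everything in this sketch is hypothetical, not LLVM API:

    #include <cstdio>
    #include <map>
    #include <string>

    // Hypothetical stand-ins for the canonical instruction opcodes.
    enum HexOpcode { V6_vzb = 1, V6_vzh, V6_vsb, V6_vsh };

    int main() {
      // Normalized alias template -> canonical opcode, mimicking the
      // effect of the V6_*_altAlias records above.
      const std::map<std::string, HexOpcode> AliasTable = {
          {"$Vdd32=vzxtb($Vu32)", V6_vzb},
          {"$Vdd32=vzxth($Vu32)", V6_vzh},
          {"$Vdd32=vsxtb($Vu32)", V6_vsb},
          {"$Vdd32=vsxth($Vu32)", V6_vsh}};
      std::printf("vzxtb -> %d\n", AliasTable.at("$Vdd32=vzxtb($Vu32)"));
    }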
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonDepOperands.td
new file mode 100644
index 0000000..0e83b26
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepOperands.td
@@ -0,0 +1,132 @@
+//===--- HexagonDepOperands.td --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; }
+def s3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
+def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
+def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
+def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
+def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
+def s10_6ImmOperand : AsmOperandClass { let Name = "s10_6Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s10_6Imm : Operand<i32> { let ParserMatchClass = s10_6ImmOperand; let DecoderMethod = "s10_6ImmDecoder"; }
+def s10_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 6>(N->getSExtValue());}]>;
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
+def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
+def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def a30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
+def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
+def s8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
+def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
+def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
+def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
+def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
+def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
+def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
+def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b15_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
+def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
+def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
+def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
+def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
+def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def m32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
+def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u3_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
+def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; }
+def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u1_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
+def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s31_1Imm : Operand<i32> { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; }
+def s31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 1>(N->getSExtValue());}]>;
+def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s30_2Imm : Operand<i32> { let ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; }
+def s30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; }
+def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
+def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; }
+def s6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
+def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
+def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
+def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; }
+def s32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
+def s6_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
+def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; }
+def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
+def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
+def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
+def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
+def s4_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
+def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u16_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
+def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; }
+def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
+def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; }
+def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
+def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
+def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
+def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
+def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u7_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
+def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b13_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
+def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
+def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
+def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
+def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u2_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
+def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
+def s4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
+def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
+def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
+def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
+def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
+def s10_0ImmOperand : AsmOperandClass { let Name = "s10_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s10_0Imm : Operand<i32> { let ParserMatchClass = s10_0ImmOperand; let DecoderMethod = "s10_0ImmDecoder"; }
+def s10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 0>(N->getSExtValue());}]>;
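Each operand in the new file above is a triple: an AsmOperandClass for the assembly parser, an Operand<i32> wiring up the parser and decoder hooks, and a PatLeaf predicate for instruction selection. The sN_M and uN_M names denote an N-bit signed or unsigned immediate scaled by 2^M; the isShiftedInt/isShiftedUInt helpers live in llvm/Support/MathExtras.h. A standalone restatement of the signed check, equivalent in effect but not the LLVM implementation:

    #include <cassert>
    #include <cstdint>

    // A value fits sN_M iff it is a signed (N+S)-bit integer whose low
    // S bits are all zero.
    template <unsigned N, unsigned S>
    bool isShiftedIntLocal(int64_t X) {
      const int64_t Lo = -(INT64_C(1) << (N + S - 1));
      const int64_t Hi = (INT64_C(1) << (N + S - 1)) - 1;
      return X >= Lo && X <= Hi && (X & ((INT64_C(1) << S) - 1)) == 0;
    }

    int main() {
      assert(isShiftedIntLocal<4, 2>(28));   // s4_2: 28 = 7 << 2, in range
      assert(!isShiftedIntLocal<4, 2>(30));  // low bits not clear
      assert(!isShiftedIntLocal<4, 2>(64));  // 16 << 2 is out of range
    }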
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h b/contrib/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
new file mode 100644
index 0000000..5296303
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
@@ -0,0 +1,132 @@
+//===--- HexagonDepTimingClasses.h ----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+static bool is_TC3x(unsigned SchedClass) {
+ switch (SchedClass) {
+ case Hexagon::Sched::tc_1000eb10:
+ case Hexagon::Sched::tc_2aaab1e0:
+ case Hexagon::Sched::tc_4997da4a:
+ case Hexagon::Sched::tc_5d806107:
+ case Hexagon::Sched::tc_6264c5e0:
+ case Hexagon::Sched::tc_69bb508b:
+ case Hexagon::Sched::tc_8c8041e6:
+ case Hexagon::Sched::tc_8cb685d9:
+ case Hexagon::Sched::tc_a12a5971:
+ case Hexagon::Sched::tc_ae0722f7:
+ case Hexagon::Sched::tc_ae2c2dc2:
+ case Hexagon::Sched::tc_bc5561d8:
+ case Hexagon::Sched::tc_d6a805a8:
+ case Hexagon::Sched::tc_f055fbb6:
+ case Hexagon::Sched::tc_feb4974b:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_TC2early(unsigned SchedClass) {
+ switch (SchedClass) {
+ case Hexagon::Sched::tc_35fb9d13:
+ case Hexagon::Sched::tc_cbe45117:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_TC4x(unsigned SchedClass) {
+ switch (SchedClass) {
+ case Hexagon::Sched::tc_09c86199:
+ case Hexagon::Sched::tc_2d1e6f5c:
+ case Hexagon::Sched::tc_2e55aa16:
+ case Hexagon::Sched::tc_3bea1824:
+ case Hexagon::Sched::tc_e836c161:
+ case Hexagon::Sched::tc_f1aa2cdb:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_TC2(unsigned SchedClass) {
+ switch (SchedClass) {
+ case Hexagon::Sched::tc_090485bb:
+ case Hexagon::Sched::tc_1fe8323c:
+ case Hexagon::Sched::tc_37326008:
+ case Hexagon::Sched::tc_3c10f809:
+ case Hexagon::Sched::tc_47ab9233:
+ case Hexagon::Sched::tc_485bb57c:
+ case Hexagon::Sched::tc_511f28f6:
+ case Hexagon::Sched::tc_583510c7:
+ case Hexagon::Sched::tc_63cd9d2d:
+ case Hexagon::Sched::tc_76c4c5ef:
+ case Hexagon::Sched::tc_7ca2ea10:
+ case Hexagon::Sched::tc_87601822:
+ case Hexagon::Sched::tc_88fa2da6:
+ case Hexagon::Sched::tc_94e6ffd9:
+ case Hexagon::Sched::tc_ab1b5e74:
+ case Hexagon::Sched::tc_b0f50e3c:
+ case Hexagon::Sched::tc_bd16579e:
+ case Hexagon::Sched::tc_c0cd91a8:
+ case Hexagon::Sched::tc_ca280e8b:
+ case Hexagon::Sched::tc_cd321066:
+ case Hexagon::Sched::tc_d95f4e98:
+ case Hexagon::Sched::tc_e17ce9ad:
+ case Hexagon::Sched::tc_f1240c08:
+ case Hexagon::Sched::tc_faab1248:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool is_TC1(unsigned SchedClass) {
+ switch (SchedClass) {
+ case Hexagon::Sched::tc_07ac815d:
+ case Hexagon::Sched::tc_1b6011fb:
+ case Hexagon::Sched::tc_1b834fe7:
+ case Hexagon::Sched::tc_1e062b18:
+ case Hexagon::Sched::tc_1f9668cc:
+ case Hexagon::Sched::tc_43068634:
+ case Hexagon::Sched::tc_47f0b7ad:
+ case Hexagon::Sched::tc_537e2013:
+ case Hexagon::Sched::tc_548f402d:
+ case Hexagon::Sched::tc_5fa2857c:
+ case Hexagon::Sched::tc_5fe9fcd0:
+ case Hexagon::Sched::tc_78b3c689:
+ case Hexagon::Sched::tc_7c2dcd4d:
+ case Hexagon::Sched::tc_81a23d44:
+ case Hexagon::Sched::tc_821c4233:
+ case Hexagon::Sched::tc_92d1833c:
+ case Hexagon::Sched::tc_9a13af9d:
+ case Hexagon::Sched::tc_9c18c9a5:
+ case Hexagon::Sched::tc_9df8b0dc:
+ case Hexagon::Sched::tc_9f518242:
+ case Hexagon::Sched::tc_a1fb80e1:
+ case Hexagon::Sched::tc_a333d2a9:
+ case Hexagon::Sched::tc_a87879e8:
+ case Hexagon::Sched::tc_aad55963:
+ case Hexagon::Sched::tc_b08b653e:
+ case Hexagon::Sched::tc_b324366f:
+ case Hexagon::Sched::tc_b5bfaa60:
+ case Hexagon::Sched::tc_b86c7e8b:
+ case Hexagon::Sched::tc_c58f771a:
+ case Hexagon::Sched::tc_d108a090:
+ case Hexagon::Sched::tc_d1b5a4b6:
+ case Hexagon::Sched::tc_d2609065:
+ case Hexagon::Sched::tc_d63b71d1:
+ case Hexagon::Sched::tc_e2c31426:
+ case Hexagon::Sched::tc_e8c7a357:
+ case Hexagon::Sched::tc_eb07ef6f:
+ case Hexagon::Sched::tc_f16d5b17:
+ return true;
+ default:
+ return false;
+ }
+}
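The hash-suffixed scheduling classes above are generated from the Hexagon machine model; these predicates bucket them back into the architecture's coarse timing classes (TC1, TC2, TC2early, TC3x, TC4x). A plausible consumer, sketched here rather than taken from the backend, classifies an instruction by its scheduling class; MCInstrDesc::getSchedClass() is real LLVM API, while the numeric buckets are purely illustrative:

    #include "llvm/CodeGen/MachineInstr.h"

    // Illustrative only: map an instruction to its coarse timing bucket
    // using the predicates defined in this header.
    static unsigned timingBucket(const llvm::MachineInstr &MI) {
      unsigned SC = MI.getDesc().getSchedClass();
      if (is_TC1(SC))      return 1;
      if (is_TC2(SC))      return 2;
      if (is_TC2early(SC)) return 2;  // early-result variant of TC2
      if (is_TC3x(SC))     return 3;
      if (is_TC4x(SC))     return 4;
      return 0;                       // outside the enumerated buckets
    }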
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index a5351cd..8036101 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -59,15 +59,13 @@
// J2_jump <BB#6>, %PC<imp-def,dead>
// Successors according to CFG: BB#6 BB#3
-#define DEBUG_TYPE "hexagon-eif"
-
#include "Hexagon.h"
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -90,6 +88,8 @@
#include <cassert>
#include <iterator>
+#define DEBUG_TYPE "hexagon-eif"
+
using namespace llvm;
namespace llvm {
@@ -105,6 +105,8 @@ namespace {
cl::init(false), cl::desc("Enable branch probability info"));
cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
cl::desc("Size limit in Hexagon early if-conversion"));
+ cl::opt<bool> SkipExitBranches("eif-no-loop-exit", cl::init(false),
+ cl::Hidden, cl::desc("Do not convert branches that may exit the loop"));
struct PrintMB {
PrintMB(const MachineBasicBlock *B) : MB(B) {}
@@ -142,8 +144,8 @@ namespace {
raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) {
OS << "{ SplitB:" << PrintMB(P.FP.SplitB)
<< ", PredR:" << PrintReg(P.FP.PredR, &P.TRI)
- << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:"
- << PrintMB(P.FP.FalseB)
+ << ", TrueB:" << PrintMB(P.FP.TrueB)
+ << ", FalseB:" << PrintMB(P.FP.FalseB)
<< ", JoinB:" << PrintMB(P.FP.JoinB) << " }";
return OS;
}
@@ -187,7 +189,8 @@ namespace {
bool usesUndefVReg(const MachineInstr *MI) const;
bool isValid(const FlowPattern &FP) const;
unsigned countPredicateDefs(const MachineBasicBlock *B) const;
- unsigned computePhiCost(MachineBasicBlock *B) const;
+ unsigned computePhiCost(const MachineBasicBlock *B,
+ const FlowPattern &FP) const;
bool isProfitable(const FlowPattern &FP) const;
bool isPredicableStore(const MachineInstr *MI) const;
bool isSafeToSpeculate(const MachineInstr *MI) const;
@@ -199,6 +202,9 @@ namespace {
MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
unsigned PredR, bool IfTrue);
+ unsigned buildMux(MachineBasicBlock *B, MachineBasicBlock::iterator At,
+ const TargetRegisterClass *DRC, unsigned PredR, unsigned TR,
+ unsigned TSR, unsigned FR, unsigned FSR);
void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP);
void convert(const FlowPattern &FP);
@@ -230,7 +236,7 @@ bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
return false;
MachineBasicBlock *SB = *B->succ_begin();
MachineLoop *L = MLI->getLoopFor(SB);
- return L && SB == L->getHeader();
+ return L && SB == L->getHeader() && MDT->dominates(B, SB);
}
bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
@@ -264,9 +270,6 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
// mark as diamond with both sides equal?
return false;
}
- // Loop could be null for both.
- if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L)
- return false;
// Record the true/false blocks in such a way that "true" means "if (PredR)",
// and "false" means "if (!PredR)".
@@ -289,8 +292,14 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
// it has a single successor. In fact, the block has to end either with
// an unconditional branch (which can be predicated), or with a fall-
// through.
- bool TOk = (TNP == 1) && (TNS == 1);
- bool FOk = (FNP == 1) && (FNS == 1);
+ // Also, skip blocks that do not belong to the same loop.
+ bool TOk = (TNP == 1 && TNS == 1 && MLI->getLoopFor(TB) == L);
+ bool FOk = (FNP == 1 && FNS == 1 && MLI->getLoopFor(FB) == L);
+
+ // If requested (via an option), do not consider branches where the
+ // true and false targets do not belong to the same loop.
+ if (SkipExitBranches && MLI->getLoopFor(TB) != MLI->getLoopFor(FB))
+ return false;
// If neither is predicable, there is nothing interesting.
if (!TOk && !FOk)
@@ -307,17 +316,15 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
// Diamond: "if (P) then TB; else FB;".
} else {
// TOk && !FOk
- if (TSB == FB) {
+ if (TSB == FB)
JB = FB;
- FB = nullptr;
- }
+ FB = nullptr;
}
} else {
// !TOk && FOk (at least one must be true by now).
- if (FSB == TB) {
+ if (FSB == TB)
JB = TB;
- TB = nullptr;
- }
+ TB = nullptr;
}
// Don't try to predicate loop preheaders.
if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
@@ -383,8 +390,14 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
unsigned R = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(R))
continue;
- if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass)
- continue;
+ switch (MRI->getRegClass(R)->getID()) {
+ case Hexagon::PredRegsRegClassID:
+ case Hexagon::VecPredRegsRegClassID:
+ case Hexagon::VecPredRegs128BRegClassID:
+ break;
+ default:
+ continue;
+ }
for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
if (U->getParent()->isPHI())
return false;
@@ -442,24 +455,39 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
return true;
}
-unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
- assert(B->pred_size() <= 2);
+unsigned HexagonEarlyIfConversion::computePhiCost(const MachineBasicBlock *B,
+ const FlowPattern &FP) const {
if (B->pred_size() < 2)
return 0;
unsigned Cost = 0;
- MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI();
- for (I = B->begin(); I != E; ++I) {
- const MachineOperand &RO1 = I->getOperand(1);
- const MachineOperand &RO3 = I->getOperand(3);
- assert(RO1.isReg() && RO3.isReg());
+ for (const MachineInstr &MI : *B) {
+ if (!MI.isPHI())
+ break;
+ // If both incoming blocks are one of the TrueB/FalseB/SplitB, then
+ // a MUX may be needed. Otherwise the PHI will need to be updated at
+ // no extra cost.
+ // Find the interesting PHI operands for further checks.
+ SmallVector<unsigned,2> Inc;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ const MachineBasicBlock *BB = MI.getOperand(i+1).getMBB();
+ if (BB == FP.SplitB || BB == FP.TrueB || BB == FP.FalseB)
+ Inc.push_back(i);
+ }
+ assert(Inc.size() <= 2);
+ if (Inc.size() < 2)
+ continue;
+
+ const MachineOperand &RA = MI.getOperand(1);
+ const MachineOperand &RB = MI.getOperand(3);
+ assert(RA.isReg() && RB.isReg());
// Must have a MUX if the phi uses a subregister.
- if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) {
+ if (RA.getSubReg() != 0 || RB.getSubReg() != 0) {
Cost++;
continue;
}
- MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
- MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
+ const MachineInstr *Def1 = MRI->getVRegDef(RA.getReg());
+ const MachineInstr *Def3 = MRI->getVRegDef(RB.getReg());
if (!HII->isPredicable(*Def1) || !HII->isPredicable(*Def3))
Cost++;
}
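Restating the rule the rewritten computePhiCost implements: a PHI in the join block is free when at most one of its inputs arrives from SplitB, TrueB, or FalseB (the PHI is merely rewritten), and costs one MUX only when two such inputs survive and cannot be folded into predicated definitions. A standalone sketch of that decision, not LLVM API:

    // Per-input facts gathered from a PHI operand and its defining
    // instruction, mirroring the checks in computePhiCost above.
    struct PhiInput { bool HasSubReg; bool DefIsPredicable; };

    unsigned phiCost(bool TwoInterestingInputs, PhiInput A, PhiInput B) {
      if (!TwoInterestingInputs)
        return 0;                        // PHI is just rewritten, free
      if (A.HasSubReg || B.HasSubReg)
        return 1;                        // subregisters force a MUX
      if (!A.DefIsPredicable || !B.DefIsPredicable)
        return 1;                        // cannot fold into predicated defs
      return 0;
    }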
@@ -485,7 +513,6 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
if (FP.TrueB && FP.FalseB) {
-
    // Do not IfConvert if the branch is one-sided.
if (MBPI) {
BranchProbability Prob(9, 10);
@@ -510,18 +537,16 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
// the code size. If the predicated blocks are smaller than a packet size,
// approximate the spare room in the packet that could be filled with the
// predicated/speculated instructions.
- unsigned TS = 0, FS = 0, Spare = 0;
- if (FP.TrueB) {
- TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
- if (TS < HEXAGON_PACKET_SIZE)
- Spare += HEXAGON_PACKET_SIZE-TS;
- }
- if (FP.FalseB) {
- FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
- if (FS < HEXAGON_PACKET_SIZE)
- Spare += HEXAGON_PACKET_SIZE-TS;
- }
- unsigned TotalIn = TS+FS;
+ auto TotalCount = [] (const MachineBasicBlock *B, unsigned &Spare) {
+ if (!B)
+ return 0u;
+ unsigned T = std::distance(B->begin(), B->getFirstTerminator());
+ if (T < HEXAGON_PACKET_SIZE)
+ Spare += HEXAGON_PACKET_SIZE-T;
+ return T;
+ };
+ unsigned Spare = 0;
+ unsigned TotalIn = TotalCount(FP.TrueB, Spare) + TotalCount(FP.FalseB, Spare);
DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
<< TotalIn << ", spare room: " << Spare << "\n");
if (TotalIn >= SizeLimit+Spare)
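Folding the two copies of the spare-room computation into the TotalCount lambda also corrects the slip visible in the removed lines, where the FalseB branch subtracted TS instead of FS when accumulating Spare. The corrected accounting, restated standalone with the packet size assumed to be 4 slots as on the actual hardware (HEXAGON_PACKET_SIZE in the backend):

    constexpr unsigned PacketSize = 4;  // stand-in for HEXAGON_PACKET_SIZE

    // One block's contribution: count its instructions and accumulate
    // the unused slots of a packet.
    unsigned countAndSpare(unsigned NumInstrs, unsigned &Spare) {
      if (NumInstrs < PacketSize)
        Spare += PacketSize - NumInstrs;  // old FalseB path used TS here
      return NumInstrs;
    }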
@@ -536,17 +561,17 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
unsigned TotalPh = 0;
unsigned PredDefs = countPredicateDefs(FP.SplitB);
if (FP.JoinB) {
- TotalPh = computePhiCost(FP.JoinB);
+ TotalPh = computePhiCost(FP.JoinB, FP);
PredDefs += countPredicateDefs(FP.JoinB);
} else {
if (FP.TrueB && FP.TrueB->succ_size() > 0) {
MachineBasicBlock *SB = *FP.TrueB->succ_begin();
- TotalPh += computePhiCost(SB);
+ TotalPh += computePhiCost(SB, FP);
PredDefs += countPredicateDefs(SB);
}
if (FP.FalseB && FP.FalseB->succ_size() > 0) {
MachineBasicBlock *SB = *FP.FalseB->succ_begin();
- TotalPh += computePhiCost(SB);
+ TotalPh += computePhiCost(SB, FP);
PredDefs += countPredicateDefs(SB);
}
}
@@ -680,12 +705,12 @@ void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB,
MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, HII->get(COpc));
MachineInstr::mop_iterator MOI = MI->operands_begin();
if (HII->isPostIncrement(*MI)) {
- MIB.addOperand(*MOI);
+ MIB.add(*MOI);
++MOI;
}
MIB.addReg(PredR);
for (const MachineOperand &MO : make_range(MOI, MI->operands_end()))
- MIB.addOperand(MO);
+ MIB.add(MO);
// Set memory references.
MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
@@ -733,6 +758,43 @@ void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
}
}
+unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B,
+ MachineBasicBlock::iterator At, const TargetRegisterClass *DRC,
+ unsigned PredR, unsigned TR, unsigned TSR, unsigned FR, unsigned FSR) {
+ unsigned Opc = 0;
+ switch (DRC->getID()) {
+ case Hexagon::IntRegsRegClassID:
+ Opc = Hexagon::C2_mux;
+ break;
+ case Hexagon::DoubleRegsRegClassID:
+ Opc = Hexagon::PS_pselect;
+ break;
+ case Hexagon::VectorRegsRegClassID:
+ Opc = Hexagon::PS_vselect;
+ break;
+ case Hexagon::VecDblRegsRegClassID:
+ Opc = Hexagon::PS_wselect;
+ break;
+ case Hexagon::VectorRegs128BRegClassID:
+ Opc = Hexagon::PS_vselect_128B;
+ break;
+ case Hexagon::VecDblRegs128BRegClassID:
+ Opc = Hexagon::PS_wselect_128B;
+ break;
+ default:
+ llvm_unreachable("unexpected register type");
+ }
+ const MCInstrDesc &D = HII->get(Opc);
+
+ DebugLoc DL = B->findBranchDebugLoc();
+ unsigned MuxR = MRI->createVirtualRegister(DRC);
+ BuildMI(*B, At, DL, D, MuxR)
+ .addReg(PredR)
+ .addReg(TR, 0, TSR)
+ .addReg(FR, 0, FSR);
+ return MuxR;
+}
+
void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
const FlowPattern &FP) {
// Visit all PHI nodes in the WhereB block and generate MUX instructions
@@ -759,40 +821,25 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
TR = SR, TSR = SSR;
else if (FR == 0)
FR = SR, FSR = SSR;
- assert(TR && FR);
-
- using namespace Hexagon;
-
- unsigned DR = PN->getOperand(0).getReg();
- const TargetRegisterClass *RC = MRI->getRegClass(DR);
- unsigned Opc = 0;
- if (RC == &IntRegsRegClass)
- Opc = C2_mux;
- else if (RC == &DoubleRegsRegClass)
- Opc = PS_pselect;
- else if (RC == &VectorRegsRegClass)
- Opc = PS_vselect;
- else if (RC == &VecDblRegsRegClass)
- Opc = PS_wselect;
- else if (RC == &VectorRegs128BRegClass)
- Opc = PS_vselect_128B;
- else if (RC == &VecDblRegs128BRegClass)
- Opc = PS_wselect_128B;
- else
- llvm_unreachable("unexpected register type");
- const MCInstrDesc &D = HII->get(Opc);
-
- MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
- DebugLoc DL;
- if (MuxAt != FP.SplitB->end())
- DL = MuxAt->getDebugLoc();
- unsigned MuxR = MRI->createVirtualRegister(RC);
- BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
- .addReg(FP.PredR)
- .addReg(TR, 0, TSR)
- .addReg(FR, 0, FSR);
-
- PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+
+ assert(TR || FR);
+ unsigned MuxR = 0, MuxSR = 0;
+
+ if (TR && FR) {
+ unsigned DR = PN->getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DR);
+ MuxR = buildMux(FP.SplitB, FP.SplitB->getFirstTerminator(), RC,
+ FP.PredR, TR, TSR, FR, FSR);
+ } else if (TR) {
+ MuxR = TR;
+ MuxSR = TSR;
+ } else {
+ MuxR = FR;
+ MuxSR = FSR;
+ }
+
+ PN->addOperand(MachineOperand::CreateReg(MuxR, false, false, false, false,
+ false, false, MuxSR));
PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
}
}
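updatePhiNodes previously asserted that both sides of the PHI were present and always emitted a select; with the opcode table hoisted into buildMux, the single-source case now simply forwards the register and its subregister into the rewritten PHI operand. The select itself has ordinary mux semantics: Hexagon's C2_mux computes Rd = Pu ? Rs : Rt, and the PS_-prefixed opcodes chosen for the wider register classes are backend pseudo-instructions expanded later. A scalar model of the operation:

    #include <cstdint>

    // Scalar model of Hexagon's C2_mux (Rd32 = mux(Pu4, Rs32, Rt32)).
    int32_t mux(bool Pred, int32_t TrueVal, int32_t FalseVal) {
      return Pred ? TrueVal : FalseVal;
    }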
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 8f070d8..a2f6dd6 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -86,8 +86,6 @@
// however, is that finding the locations where the implicit uses need
// to be added, and updating the live ranges will be more involved.
-#define DEBUG_TYPE "expand-condsets"
-
#include "HexagonInstrInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
@@ -116,6 +114,8 @@
#include <set>
#include <utility>
+#define DEBUG_TYPE "expand-condsets"
+
using namespace llvm;
static cl::opt<unsigned> OptTfrLimit("expand-condsets-tfr-limit",
@@ -362,14 +362,16 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
if (Range.empty())
return;
- auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> bool {
+ // Return two booleans: { def-modifes-reg, def-covers-reg }.
+ auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> std::pair<bool,bool> {
if (!Op.isReg() || !Op.isDef())
- return false;
+ return { false, false };
unsigned DR = Op.getReg(), DSR = Op.getSubReg();
if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg)
- return false;
+ return { false, false };
LaneBitmask SLM = getLaneMask(DR, DSR);
- return (SLM & LM).any();
+ LaneBitmask A = SLM & LM;
+ return { A.any(), A == SLM };
};
// The splitting step will create pairs of predicated definitions without
@@ -453,20 +455,27 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
// Remove <dead> flags from all defs that are not dead after live range
// extension, and collect all def operands. They will be used to generate
// the necessary implicit uses.
+ // At the same time, add <dead> flag to all defs that are actually dead.
+ // This can happen, for example, when a mux with identical inputs is
+ // replaced with a COPY: the use of the predicate register disappears and
+ // the def can become dead.
std::set<RegisterRef> DefRegs;
for (auto &Seg : Range) {
if (!Seg.start.isRegister())
continue;
MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
for (auto &Op : DefI->operands()) {
- if (Seg.start.isDead() || !IsRegDef(Op))
- continue;
- DefRegs.insert(Op);
- Op.setIsDead(false);
+ auto P = IsRegDef(Op);
+ if (P.second && Seg.end.isDead()) {
+ Op.setIsDead(true);
+ } else if (P.first) {
+ DefRegs.insert(Op);
+ Op.setIsDead(false);
+ }
}
}
- // Finally, add implicit uses to each predicated def that is reached
+ // Now, add implicit uses to each predicated def that is reached
// by other defs.
for (auto &Seg : Range) {
if (!Seg.start.isRegister() || !Range.liveAt(Seg.start.getPrevSlot()))
@@ -486,6 +495,7 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
for (RegisterRef R : ImpUses)
MachineInstrBuilder(MF, DefI).addReg(R.Reg, RegState::Implicit, R.Sub);
}
+
}
void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
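The IsRegDef lambda now reports two facts per operand: whether the definition writes any of the lanes being tracked, and whether the tracked lanes fully cover the lanes the definition writes. A covering definition whose segment ends dead gets its <dead> flag set; any overlapping definition is collected for implicit-use generation. The lane arithmetic, modeled standalone with plain bitmasks rather than LLVM's LaneBitmask:

    #include <cstdint>

    // { def writes a tracked lane, tracked lanes cover the whole def }.
    struct DefInfo { bool Modifies; bool Covers; };

    DefInfo classifyDef(uint64_t DefLanes, uint64_t TrackedLanes) {
      uint64_t Common = DefLanes & TrackedLanes;
      return { Common != 0, Common == DefLanes };
    }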
@@ -549,16 +559,27 @@ unsigned HexagonExpandCondsets::getCondTfrOpcode(const MachineOperand &SO,
}
unsigned PhysS = (RS.Sub == 0) ? PhysR : TRI->getSubReg(PhysR, RS.Sub);
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysS);
- switch (RC->getSize()) {
- case 4:
+ switch (TRI->getRegSizeInBits(*RC)) {
+ case 32:
return IfTrue ? A2_tfrt : A2_tfrf;
- case 8:
+ case 64:
return IfTrue ? A2_tfrpt : A2_tfrpf;
}
llvm_unreachable("Invalid register operand");
}
- if (SO.isImm() || SO.isFPImm())
- return IfTrue ? C2_cmoveit : C2_cmoveif;
+ switch (SO.getType()) {
+ case MachineOperand::MO_Immediate:
+ case MachineOperand::MO_FPImmediate:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_BlockAddress:
+ return IfTrue ? C2_cmoveit : C2_cmoveif;
+ default:
+ break;
+ }
llvm_unreachable("Unexpected source operand");
}
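Two independent changes land in this hunk: the byte-based TargetRegisterClass::getSize() query gives way to the bit-based TargetRegisterInfo::getRegSizeInBits(), scaling the case labels from 4/8 to 32/64, and the plain immediate test widens into a switch admitting every operand kind a conditional move can materialize. The size dispatch alone, restated with stand-in opcode values (the real ones are Hexagon::A2_tfrt and friends):

    enum { Invalid = 0, A2_tfrt, A2_tfrf, A2_tfrpt, A2_tfrpf };

    unsigned selectTfr(unsigned RegSizeInBits, bool IfTrue) {
      switch (RegSizeInBits) {
      case 32: return IfTrue ? A2_tfrt  : A2_tfrf;   // 4-byte class before
      case 64: return IfTrue ? A2_tfrpt : A2_tfrpf;  // 8-byte class before
      }
      return Invalid;  // the real code hits llvm_unreachable here
    }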
@@ -595,9 +616,9 @@ MachineInstr *HexagonExpandCondsets::genCondTfrFor(MachineOperand &SrcOp,
.addReg(SrcOp.getReg(), SrcState, SrcOp.getSubReg());
} else {
MIB = BuildMI(B, At, DL, HII->get(Opc))
- .addReg(DstR, DstState, DstSR)
- .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
- .addOperand(SrcOp);
+ .addReg(DstR, DstState, DstSR)
+ .addReg(PredOp.getReg(), PredState, PredOp.getSubReg())
+ .add(SrcOp);
}
DEBUG(dbgs() << "created an initial copy: " << *MIB);
@@ -622,6 +643,12 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
bool ReadUndef = MD.isUndef();
MachineBasicBlock::iterator At = MI;
+ auto updateRegs = [&UpdRegs] (const MachineInstr &MI) -> void {
+ for (auto &Op : MI.operands())
+ if (Op.isReg())
+ UpdRegs.insert(Op.getReg());
+ };
+
// If this is a mux of the same register, just replace it with COPY.
// Ideally, this would happen earlier, so that register coalescing would
// see it.
@@ -630,6 +657,8 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
if (ST.isReg() && SF.isReg()) {
RegisterRef RT(ST);
if (RT == RegisterRef(SF)) {
+ // Copy regs to update first.
+ updateRegs(MI);
MI.setDesc(HII->get(TargetOpcode::COPY));
unsigned S = getRegState(ST);
while (MI.getNumOperands() > 1)
@@ -651,9 +680,7 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
LIS->InsertMachineInstrInMaps(*TfrF);
// Will need to recalculate live intervals for all registers in MI.
- for (auto &Op : MI.operands())
- if (Op.isReg())
- UpdRegs.insert(Op.getReg());
+ updateRegs(MI);
removeInstr(MI);
return true;
@@ -828,7 +855,7 @@ void HexagonExpandCondsets::predicateAt(const MachineOperand &DefOp,
while (Ox < NP) {
MachineOperand &MO = MI.getOperand(Ox);
if (!MO.isReg() || !MO.isImplicit())
- MB.addOperand(MO);
+ MB.add(MO);
Ox++;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index dfd1f1d..23d4e26 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -12,10 +12,9 @@
// form.
//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
#include "Hexagon.h"
#include "HexagonTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -190,5 +189,5 @@ void HexagonFixupHwLoops::useExtLoopInstr(MachineFunction &MF,
MIB = BuildMI(*MBB, MII, DL, TII->get(newOp));
for (unsigned i = 0; i < MII->getNumOperands(); ++i)
- MIB.addOperand(MII->getOperand(i));
+ MIB.add(MII->getOperand(i));
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index a3f6273..e5e7519 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -8,10 +8,8 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-pei"
-
-#include "HexagonBlockRanges.h"
#include "HexagonFrameLowering.h"
+#include "HexagonBlockRanges.h"
#include "HexagonInstrInfo.h"
#include "HexagonMachineFunctionInfo.h"
#include "HexagonRegisterInfo.h"
@@ -63,6 +61,8 @@
#include <utility>
#include <vector>
+#define DEBUG_TYPE "hexagon-pei"
+
// Hexagon stack frame layout as defined by the ABI:
//
// Incoming arguments
@@ -178,8 +178,8 @@ static cl::opt<bool> EnableSaveRestoreLong("enable-save-restore-long",
cl::Hidden, cl::desc("Enable long calls for save-restore stubs."),
cl::init(false), cl::ZeroOrMore);
-static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true),
- cl::Hidden, cl::desc("Use allocframe more conservatively"));
+static cl::opt<bool> EliminateFramePointer("hexagon-fp-elim", cl::init(true),
+ cl::Hidden, cl::desc("Refrain from using FP whenever possible"));
static cl::opt<bool> OptimizeSpillSlots("hexagon-opt-spill", cl::Hidden,
cl::init(true), cl::desc("Optimize spill slots"));
@@ -301,16 +301,30 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
// the frame creation/destruction instructions.
if (MO.isFI())
return true;
- if (!MO.isReg())
- continue;
- unsigned R = MO.getReg();
- // Virtual registers will need scavenging, which then may require
- // a stack slot.
- if (TargetRegisterInfo::isVirtualRegister(R))
- return true;
- for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
- if (CSR[*S])
+ if (MO.isReg()) {
+ unsigned R = MO.getReg();
+ // Virtual registers will need scavenging, which then may require
+ // a stack slot.
+ if (TargetRegisterInfo::isVirtualRegister(R))
return true;
+ for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
+ if (CSR[*S])
+ return true;
+ continue;
+ }
+ if (MO.isRegMask()) {
+ // A regmask would normally have all callee-saved registers marked
+ // as preserved, so this check would not be needed, but in case other
+ // regmasks (for other calling conventions) ever appear, make sure
+ // they are processed correctly.
+ const uint32_t *BM = MO.getRegMask();
+ for (int x = CSR.find_first(); x >= 0; x = CSR.find_next(x)) {
+ unsigned R = x;
+ // If this regmask does not preserve a CSR, a frame will be needed.
+ if (!(BM[R/32] & (1u << (R%32))))
+ return true;
+ }
+ }
}
}
return false;
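The regmask test in the hunk above relies on the usual LLVM packing: one bit per physical register, 32 registers per 32-bit word, with a set bit meaning "preserved across the call". A self-contained sketch of the same bit test (the register numbers are made up for illustration):

#include <cassert>
#include <cstdint>

// True if the regmask BM marks physical register R as preserved.
bool isPreserved(const uint32_t *BM, unsigned R) {
  return BM[R / 32] & (1u << (R % 32));
}

int main() {
  uint32_t Mask[2] = {0, 0};
  Mask[1] |= 1u << 3;             // mark register 35 as preserved
  assert(isPreserved(Mask, 35));
  assert(!isPreserved(Mask, 4));  // register 4 is clobbered by the call
}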
@@ -536,7 +550,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HII = *HST.getInstrInfo();
auto &HRI = *HST.getRegisterInfo();
- DebugLoc dl;
unsigned MaxAlign = std::max(MFI.getMaxAlignment(), getStackAlignment());
@@ -570,77 +583,56 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
MI->eraseFromParent();
}
- if (!hasFP(MF))
- return;
-
- // Check for overflow.
- // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
- const unsigned int ALLOCFRAME_MAX = 16384;
-
- // Create a dummy memory operand to avoid allocframe from being treated as
- // a volatile memory reference.
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
- 4, 4);
-
- if (NumBytes >= ALLOCFRAME_MAX) {
- // Emit allocframe(#0).
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
- .addImm(0)
- .addMemOperand(MMO);
+ DebugLoc dl = MBB.findDebugLoc(InsertPt);
- // Subtract offset from frame pointer.
- // We use a caller-saved non-parameter register for that.
- unsigned CallerSavedReg = HRI.getFirstCallerSavedNonParamReg();
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::CONST32),
- CallerSavedReg).addImm(NumBytes);
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_sub), SP)
+ if (hasFP(MF)) {
+ insertAllocframe(MBB, InsertPt, NumBytes);
+ if (AlignStack) {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP)
+ .addReg(SP)
+ .addImm(-int64_t(MaxAlign));
+ }
+ // If the stack-checking is enabled, and we spilled the callee-saved
+ // registers inline (i.e. did not use a spill function), then call
+ // the stack checker directly.
+ if (EnableStackOVFSanitizer && !PrologueStubs)
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk))
+ .addExternalSymbol("__runtime_stack_check");
+ } else if (NumBytes > 0) {
+ assert(alignTo(NumBytes, 8) == NumBytes);
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
.addReg(SP)
- .addReg(CallerSavedReg);
- } else {
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
- .addImm(NumBytes)
- .addMemOperand(MMO);
- }
-
- if (AlignStack) {
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP)
- .addReg(SP)
- .addImm(-int64_t(MaxAlign));
+ .addImm(-int(NumBytes));
}
-
- // If the stack-checking is enabled, and we spilled the callee-saved
- // registers inline (i.e. did not use a spill function), then call
- // the stack checker directly.
- if (EnableStackOVFSanitizer && !PrologueStubs)
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_call_stk))
- .addExternalSymbol("__runtime_stack_check");
}
void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
MachineFunction &MF = *MBB.getParent();
- if (!hasFP(MF))
- return;
-
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &HII = *HST.getInstrInfo();
auto &HRI = *HST.getRegisterInfo();
unsigned SP = HRI.getStackRegister();
+ MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
+ DebugLoc dl = MBB.findDebugLoc(InsertPt);
+
+ if (!hasFP(MF)) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (unsigned NumBytes = MFI.getStackSize()) {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
+ .addReg(SP)
+ .addImm(NumBytes);
+ }
+ return;
+ }
+
MachineInstr *RetI = getReturn(MBB);
unsigned RetOpc = RetI ? RetI->getOpcode() : 0;
- MachineBasicBlock::iterator InsertPt = MBB.getFirstTerminator();
- DebugLoc DL;
- if (InsertPt != MBB.end())
- DL = InsertPt->getDebugLoc();
- else if (!MBB.empty())
- DL = std::prev(MBB.end())->getDebugLoc();
-
// Handle EH_RETURN.
if (RetOpc == Hexagon::EH_RETURN_JMPR) {
- BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
- BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::A2_add), SP)
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe));
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_add), SP)
.addReg(SP)
.addReg(Hexagon::R28);
return;
@@ -685,16 +677,52 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
// otherwise just add deallocframe. The function could be returning via a
// tail call.
if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) {
- BuildMI(MBB, InsertPt, DL, HII.get(Hexagon::L2_deallocframe));
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe));
return;
}
unsigned NewOpc = Hexagon::L4_return;
- MachineInstr *NewI = BuildMI(MBB, RetI, DL, HII.get(NewOpc));
+ MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc));
// Transfer the function live-out registers.
NewI->copyImplicitOps(MF, *RetI);
MBB.erase(RetI);
}
+void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const {
+ MachineFunction &MF = *MBB.getParent();
+ auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ auto &HII = *HST.getInstrInfo();
+ auto &HRI = *HST.getRegisterInfo();
+
+ // Check for overflow.
+ // Hexagon_TODO: Ugh! hardcoding. Is there an API that can be used?
+ const unsigned int ALLOCFRAME_MAX = 16384;
+
+ // Create a dummy memory operand to avoid allocframe from being treated as
+ // a volatile memory reference.
+ auto *MMO = MF.getMachineMemOperand(MachinePointerInfo::getStack(MF, 0),
+ MachineMemOperand::MOStore, 4, 4);
+
+ DebugLoc dl = MBB.findDebugLoc(InsertPt);
+
+ if (NumBytes >= ALLOCFRAME_MAX) {
+ // Emit allocframe(#0).
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+ .addImm(0)
+ .addMemOperand(MMO);
+
+ // Subtract the size from the stack pointer.
+ unsigned SP = HRI.getStackRegister();
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
+ .addReg(SP)
+ .addImm(-int(NumBytes));
+ } else {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+ .addImm(NumBytes)
+ .addMemOperand(MMO);
+ }
+}
+
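The immediate of allocframe is limited, which is what the ALLOCFRAME_MAX check above encodes: frames at or above the limit are built as allocframe(#0) plus an explicit SP decrement. A small sketch of just the size decision, detached from the MachineInstr-building API (r29 is the Hexagon stack pointer; the printed syntax is illustrative only):

#include <cstdio>

// Mirrors the size check in insertAllocframe.
void planAllocframe(unsigned NumBytes) {
  const unsigned ALLOCFRAME_MAX = 16384;
  if (NumBytes >= ALLOCFRAME_MAX)
    printf("allocframe(#0); r29 = add(r29, #-%u)\n", NumBytes);
  else
    printf("allocframe(#%u)\n", NumBytes);
}

int main() {
  planAllocframe(64);    // fits in the immediate field
  planAllocframe(20000); // needs the two-instruction form
}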
void HexagonFrameLowering::updateEntryPaths(MachineFunction &MF,
MachineBasicBlock &SaveB) const {
SetVector<unsigned> Worklist;
@@ -914,12 +942,11 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
}
bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
+ if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+ return false;
+
auto &MFI = MF.getFrameInfo();
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
-
- bool HasFixed = MFI.getNumFixedObjects();
- bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI)
- .getLocalFrameObjectCount();
bool HasExtraAlign = HRI.needsStackRealignment(MF);
bool HasAlloca = MFI.hasVarSizedObjects();
@@ -933,18 +960,23 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const {
// By default we want to use SP (since it's always there). FP requires
// some setup (i.e. ALLOCFRAME).
- // Fixed and preallocated objects need FP if the distance from them to
- // the SP is unknown (as is with alloca or aligna).
- if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign))
+ // Both alloca and stack alignment modify the stack pointer by an
+ // undetermined value, so we need to save it at the entry to the function
+ // (i.e. use allocframe).
+ if (HasAlloca || HasExtraAlign)
return true;
if (MFI.getStackSize() > 0) {
- if (EnableStackOVFSanitizer || UseAllocframe)
+ // If FP-elimination is disabled, we have to use FP at this point.
+ const TargetMachine &TM = MF.getTarget();
+ if (TM.Options.DisableFramePointerElim(MF) || !EliminateFramePointer)
+ return true;
+ if (EnableStackOVFSanitizer)
return true;
}
- if (MFI.hasCalls() ||
- MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR())
+ const auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ if (MFI.hasCalls() || HMFI.hasClobberLR())
return true;
return false;
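Condensed, the reworked hasFP reads as the decision tree below. This is a paraphrase for illustration, with hypothetical booleans standing in for the queries the real function makes, not the actual implementation:

// Sketch of the frame-pointer decision after this change.
bool needsFP(bool Naked, bool HasAlloca, bool HasExtraAlign,
             unsigned StackSize, bool FPElimDisabled,
             bool StackSanitizer, bool HasCalls, bool ClobbersLR) {
  if (Naked)
    return false;
  if (HasAlloca || HasExtraAlign)   // SP moves by an unknown amount
    return true;
  if (StackSize > 0 && (FPElimDisabled || StackSanitizer))
    return true;
  return HasCalls || ClobbersLR;
}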
@@ -1037,10 +1069,27 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
bool HasExtraAlign = HRI.needsStackRealignment(MF);
bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None;
- unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister();
auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
- unsigned AP = HMFI.getStackAlignBasePhysReg();
unsigned FrameSize = MFI.getStackSize();
+ unsigned SP = HRI.getStackRegister();
+ unsigned FP = HRI.getFrameRegister();
+ unsigned AP = HMFI.getStackAlignBasePhysReg();
+ // It may happen that AP will be absent even when HasAlloca && HasExtraAlign
+ // is true. HasExtraAlign may be set because of vector spills, without
+ // aligned locals or aligned outgoing function arguments. Since vector
+ // spills will ultimately be "unaligned", it is safe to use FP as the
+ // base register.
+ // In fact, in such a scenario the stack is actually not required to be
+ // aligned, although it may end up being aligned anyway, since this
+ // particular case is not easily detectable. The alignment will be
+ // unnecessary, but not incorrect.
+ // Unfortunately there is no quick way to verify that the above is
+ // indeed the case (and that it's not a result of an error), so just
+ // assume that missing AP will be replaced by FP.
+ // (A better fix would be to rematerialize AP from FP and always align
+ // vector spills.)
+ if (AP == 0)
+ AP = FP;
bool UseFP = false, UseAP = false; // Default: use SP (except at -O0).
// Use FP at -O0, except when there are objects with extra alignment.
@@ -1105,7 +1154,7 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
// there will be no SP -= FrameSize), so the frame size should not be
// added to the calculated offset.
int RealOffset = Offset;
- if (!UseFP && !UseAP && HasFP)
+ if (!UseFP && !UseAP)
RealOffset = FrameSize+Offset;
return RealOffset;
}
@@ -1411,7 +1460,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
if (!SRegs[S->Reg])
continue;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(S->Reg);
- int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), S->Offset);
+ int FI = MFI.CreateFixedSpillStackObject(TRI->getSpillSize(*RC), S->Offset);
MinOffset = std::min(MinOffset, S->Offset);
CSI.push_back(CalleeSavedInfo(S->Reg, FI));
SRegs[S->Reg] = false;
@@ -1423,11 +1472,12 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
for (int x = SRegs.find_first(); x >= 0; x = SRegs.find_next(x)) {
unsigned R = x;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R);
- int Off = MinOffset - RC->getSize();
- unsigned Align = std::min(RC->getAlignment(), getStackAlignment());
+ unsigned Size = TRI->getSpillSize(*RC);
+ int Off = MinOffset - Size;
+ unsigned Align = std::min(TRI->getSpillAlignment(*RC), getStackAlignment());
assert(isPowerOf2_32(Align));
Off &= -Align;
- int FI = MFI.CreateFixedSpillStackObject(RC->getSize(), Off);
+ int FI = MFI.CreateFixedSpillStackObject(Size, Off);
MinOffset = std::min(MinOffset, Off);
CSI.push_back(CalleeSavedInfo(R, FI));
SRegs[R] = false;
@@ -1473,8 +1523,7 @@ bool HexagonFrameLowering::expandCopy(MachineBasicBlock &B,
return false;
unsigned TmpR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass);
- BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR)
- .addOperand(MI->getOperand(1));
+ BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), TmpR).add(MI->getOperand(1));
BuildMI(B, It, DL, HII.get(TargetOpcode::COPY), DstR)
.addReg(TmpR, RegState::Kill);
@@ -1643,11 +1692,18 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
// defined. From the point of view of the liveness tracking, it is ok to
// store it as a whole, but if we break it up we may end up storing a
// register that is entirely undefined.
- LivePhysRegs LPR(&HRI);
+ LivePhysRegs LPR(HRI);
LPR.addLiveIns(B);
SmallVector<std::pair<unsigned, const MachineOperand*>,2> Clobbers;
- for (auto R = B.begin(); R != It; ++R)
+ for (auto R = B.begin(); R != It; ++R) {
+ Clobbers.clear();
LPR.stepForward(*R, Clobbers);
+ // Dead defs are recorded in Clobbers, but are not automatically removed
+ // from the live set.
+ for (auto &C : Clobbers)
+ if (C.second->isReg() && C.second->isDead())
+ LPR.removeReg(C.first);
+ }
DebugLoc DL = MI->getDebugLoc();
unsigned SrcR = MI->getOperand(2).getReg();
@@ -1657,10 +1713,10 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
int FI = MI->getOperand(0).getIndex();
bool Is128B = HST.useHVXDblOps();
- auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
- : &Hexagon::VectorRegs128BRegClass;
- unsigned Size = RC->getSize();
- unsigned NeedAlign = RC->getAlignment();
+ const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass
+ : Hexagon::VectorRegs128BRegClass;
+ unsigned Size = HRI.getSpillSize(RC);
+ unsigned NeedAlign = HRI.getSpillAlignment(RC);
unsigned HasAlign = MFI.getObjectAlignment(FI);
unsigned StoreOpc;
@@ -1714,10 +1770,10 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B,
int FI = MI->getOperand(1).getIndex();
bool Is128B = HST.useHVXDblOps();
- auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
- : &Hexagon::VectorRegs128BRegClass;
- unsigned Size = RC->getSize();
- unsigned NeedAlign = RC->getAlignment();
+ const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass
+ : Hexagon::VectorRegs128BRegClass;
+ unsigned Size = HRI.getSpillSize(RC);
+ unsigned NeedAlign = HRI.getSpillAlignment(RC);
unsigned HasAlign = MFI.getObjectAlignment(FI);
unsigned LoadOpc;
@@ -1757,16 +1813,16 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B,
if (!MI->getOperand(0).isFI())
return false;
+ auto &HRI = *HST.getRegisterInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned SrcR = MI->getOperand(2).getReg();
bool IsKill = MI->getOperand(2).isKill();
int FI = MI->getOperand(0).getIndex();
bool Is128B = HST.useHVXDblOps();
- auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
- : &Hexagon::VectorRegs128BRegClass;
-
- unsigned NeedAlign = RC->getAlignment();
+ const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass
+ : Hexagon::VectorRegs128BRegClass;
+ unsigned NeedAlign = HRI.getSpillAlignment(RC);
unsigned HasAlign = MFI.getObjectAlignment(FI);
unsigned StoreOpc;
@@ -1795,15 +1851,15 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B,
if (!MI->getOperand(1).isFI())
return false;
+ auto &HRI = *HST.getRegisterInfo();
DebugLoc DL = MI->getDebugLoc();
unsigned DstR = MI->getOperand(0).getReg();
int FI = MI->getOperand(1).getIndex();
bool Is128B = HST.useHVXDblOps();
- auto *RC = !Is128B ? &Hexagon::VectorRegsRegClass
- : &Hexagon::VectorRegs128BRegClass;
-
- unsigned NeedAlign = RC->getAlignment();
+ const auto &RC = !Is128B ? Hexagon::VectorRegsRegClass
+ : Hexagon::VectorRegs128BRegClass;
+ unsigned NeedAlign = HRI.getSpillAlignment(RC);
unsigned HasAlign = MFI.getObjectAlignment(FI);
unsigned LoadOpc;
@@ -1912,7 +1968,7 @@ void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!needToReserveScavengingSpillSlots(MF, HRI, RC))
continue;
unsigned Num = RC == &Hexagon::IntRegsRegClass ? NumberScavengerSlots : 1;
- unsigned S = RC->getSize(), A = RC->getAlignment();
+ unsigned S = HRI.getSpillSize(*RC), A = HRI.getSpillAlignment(*RC);
for (unsigned i = 0; i < Num; i++) {
int NewFI = MFI.CreateSpillStackObject(S, A);
RS->addScavengingFrameIndex(NewFI);
@@ -1985,9 +2041,9 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
// class HaveRC and a new class NewRC. Return nullptr if a common class
// cannot be found, otherwise return the resulting class. If HaveRC is
// nullptr, assume that it is still unset.
- auto getCommonRC = [&HRI] (const TargetRegisterClass *HaveRC,
- const TargetRegisterClass *NewRC)
- -> const TargetRegisterClass* {
+ auto getCommonRC =
+ [](const TargetRegisterClass *HaveRC,
+ const TargetRegisterClass *NewRC) -> const TargetRegisterClass * {
if (HaveRC == nullptr || HaveRC == NewRC)
return NewRC;
// Different classes, both non-null. Pick the more general one.
@@ -2221,7 +2277,7 @@ void HexagonFrameLowering::optimizeSpillSlots(MachineFunction &MF,
if (SrcRR.Reg != FoundR || SrcRR.Sub != 0) {
const DebugLoc &DL = SI.getDebugLoc();
CopyIn = BuildMI(B, StartIt, DL, HII.get(TargetOpcode::COPY), FoundR)
- .addOperand(SrcOp);
+ .add(SrcOp);
}
++StartIt;
@@ -2365,10 +2421,12 @@ void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI,
/// be generated via inline code. If this function returns "true", inline
/// code will be generated. If this function returns "false", additional
/// checks are performed, which may still lead to the inline code.
-bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF,
+bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF,
const CSIVect &CSI) const {
if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
return true;
+ if (!hasFP(MF))
+ return true;
if (!isOptSize(MF) && !isMinSize(MF))
if (MF.getTarget().getOptLevel() > CodeGenOpt::Default)
return true;
@@ -2395,7 +2453,7 @@ bool HexagonFrameLowering::shouldInlineCSR(MachineFunction &MF,
return false;
}
-bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF,
+bool HexagonFrameLowering::useSpillFunction(const MachineFunction &MF,
const CSIVect &CSI) const {
if (shouldInlineCSR(MF, CSI))
return false;
@@ -2408,7 +2466,7 @@ bool HexagonFrameLowering::useSpillFunction(MachineFunction &MF,
return Threshold < NumCSI;
}
-bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF,
+bool HexagonFrameLowering::useRestoreFunction(const MachineFunction &MF,
const CSIVect &CSI) const {
if (shouldInlineCSR(MF, CSI))
return false;
@@ -2433,9 +2491,44 @@ bool HexagonFrameLowering::mayOverflowFrameOffset(MachineFunction &MF) const {
unsigned StackSize = MF.getFrameInfo().estimateStackSize(MF);
auto &HST = MF.getSubtarget<HexagonSubtarget>();
// A fairly simplistic guess as to whether a potential load/store to a
- // stack location could require an extra register. It does not account
- // for store-immediate instructions.
- if (HST.useHVXOps())
- return StackSize > 256;
+ // stack location could require an extra register.
+ if (HST.useHVXOps() && StackSize > 256)
+ return true;
+
+ // Check if the function has store-immediate instructions that access
+ // the stack. Since the offset field is not extendable, if the stack
+ // size exceeds the offset limit (6 bits, shifted), the stores will
+ // require a new base register.
+ bool HasImmStack = false;
+ unsigned MinLS = ~0u; // Log_2 of the memory access size.
+
+ for (const MachineBasicBlock &B : MF) {
+ for (const MachineInstr &MI : B) {
+ unsigned LS = 0;
+ switch (MI.getOpcode()) {
+ case Hexagon::S4_storeirit_io:
+ case Hexagon::S4_storeirif_io:
+ case Hexagon::S4_storeiri_io:
+ ++LS;
+ LLVM_FALLTHROUGH;
+ case Hexagon::S4_storeirht_io:
+ case Hexagon::S4_storeirhf_io:
+ case Hexagon::S4_storeirh_io:
+ ++LS;
+ LLVM_FALLTHROUGH;
+ case Hexagon::S4_storeirbt_io:
+ case Hexagon::S4_storeirbf_io:
+ case Hexagon::S4_storeirb_io:
+ if (MI.getOperand(0).isFI())
+ HasImmStack = true;
+ MinLS = std::min(MinLS, LS);
+ break;
+ }
+ }
+ }
+
+ if (HasImmStack)
+ return !isUInt<6>(StackSize >> MinLS);
+
return false;
}
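The final check encodes the 6-bit unsigned offset field of the store-immediate forms, scaled by the access size (MinLS is the log2 of the smallest access seen). A standalone version of the same test, stated in the "fits" direction rather than the "overflows" direction used above:

#include <cassert>

// True if Offset fits a 6-bit unsigned field scaled by 2^LogSize,
// i.e. isUInt<6>(Offset >> LogSize).
bool fitsScaledU6(unsigned Offset, unsigned LogSize) {
  return (Offset >> LogSize) < 64;
}

int main() {
  assert(fitsScaledU6(252, 2));   // word store: 252/4 = 63, encodable
  assert(!fitsScaledU6(256, 2));  // word store: 256/4 = 64, needs a base reg
  assert(!fitsScaledU6(64, 0));   // byte store: 64 does not fit 6 bits
}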
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
index 529a61d..f4d4e1b 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -48,6 +48,15 @@ public:
return true;
}
+ bool hasReservedCallFrame(const MachineFunction &MF) const override {
+ // We always reserve call frame as a part of the initial stack allocation.
+ return true;
+ }
+ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override {
+ // Override this function to avoid calling hasFP before CSI is set
+ // (the default implementation calls hasFP).
+ return true;
+ }
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
@@ -94,6 +103,8 @@ private:
unsigned SP, unsigned CF) const;
void insertPrologueInBlock(MachineBasicBlock &MBB, bool PrologueStubs) const;
void insertEpilogueInBlock(MachineBasicBlock &MBB) const;
+ void insertAllocframe(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsertPt, unsigned NumBytes) const;
bool insertCSRSpillsInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
const HexagonRegisterInfo &HRI, bool &PrologueStubs) const;
bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI,
@@ -148,9 +159,9 @@ private:
void addCalleeSaveRegistersAsImpOperand(MachineInstr *MI, const CSIVect &CSI,
bool IsDef, bool IsKill) const;
- bool shouldInlineCSR(MachineFunction &MF, const CSIVect &CSI) const;
- bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const;
- bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const;
+ bool shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const;
+ bool useSpillFunction(const MachineFunction &MF, const CSIVect &CSI) const;
+ bool useRestoreFunction(const MachineFunction &MF, const CSIVect &CSI) const;
bool mayOverflowFrameOffset(MachineFunction &MF) const;
};
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
index bb5e379..7c6de6d 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -14,10 +14,10 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -197,13 +197,13 @@ bool HexagonGenExtract::convert(Instruction *In) {
// It is still ok to generate extract, but only if the mask eliminates
// those bits (i.e. M does not have any bits set beyond U).
APInt C = APInt::getHighBitsSet(BW, BW-U);
- if (M.intersects(C) || !APIntOps::isMask(W, M))
+ if (M.intersects(C) || !M.isMask(W))
return false;
} else {
// Check if M starts with a contiguous sequence of W times 1 bits. Get
// the low U bits of M (which eliminates the 0 bits shifted in on the
// left), and check if the result is APInt's "mask":
- if (!APIntOps::isMask(W, M.getLoBits(U)))
+ if (!M.getLoBits(U).isMask(W))
return false;
}
@@ -221,11 +221,8 @@ bool HexagonGenExtract::convert(Instruction *In) {
bool HexagonGenExtract::visitBlock(BasicBlock *B) {
// Depth-first, bottom-up traversal.
- DomTreeNode *DTN = DT->getNode(B);
- typedef GraphTraits<DomTreeNode*> GTN;
- typedef GTN::ChildIteratorType Iter;
- for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
- visitBlock((*I)->getBlock());
+ for (auto *DTN : children<DomTreeNode*>(DT->getNode(B)))
+ visitBlock(DTN->getBlock());
// Allow limiting the number of generated extracts for debugging purposes.
bool HasCutoff = ExtractCutoff.getPosition();
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 5a8e392..0a955ae 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -7,8 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexinsert"
-
#include "BitTracker.h"
#include "HexagonBitTracker.h"
#include "HexagonInstrInfo.h"
@@ -17,9 +15,9 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -34,8 +32,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
#include <cassert>
@@ -44,6 +42,8 @@
#include <utility>
#include <vector>
+#define DEBUG_TYPE "hexinsert"
+
using namespace llvm;
static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U),
@@ -947,11 +947,8 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
BlockDefs.insert(InsDefs);
}
- MachineDomTreeNode *N = MDT->getNode(B);
- typedef GraphTraits<MachineDomTreeNode*> GTN;
- typedef GTN::ChildIteratorType ChildIter;
- for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
- MachineBasicBlock *SB = (*I)->getBlock();
+ for (auto *DTN : children<MachineDomTreeNode*>(MDT->getNode(B))) {
+ MachineBasicBlock *SB = DTN->getBlock();
collectInBlock(SB, AVs);
}
@@ -1422,9 +1419,9 @@ bool HexagonGenInsert::generateInserts() {
bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
bool Changed = false;
- typedef GraphTraits<MachineDomTreeNode*> GTN;
- for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
- Changed |= removeDeadCode(*I);
+
+ for (auto *DTN : children<MachineDomTreeNode*>(N))
+ Changed |= removeDeadCode(DTN);
MachineBasicBlock *B = N->getBlock();
std::vector<MachineInstr*> Instrs;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
index a718df9..5abbcbb 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -40,8 +41,8 @@
#include "llvm/Pass.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
-#include <limits>
#include <iterator>
+#include <limits>
#include <utility>
using namespace llvm;
@@ -59,9 +60,7 @@ namespace {
public:
static char ID;
- HexagonGenMux() : MachineFunctionPass(ID), HII(nullptr), HRI(nullptr) {
- initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry());
- }
+ HexagonGenMux() : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "Hexagon generate mux instructions";
@@ -79,8 +78,8 @@ namespace {
}
private:
- const HexagonInstrInfo *HII;
- const HexagonRegisterInfo *HRI;
+ const HexagonInstrInfo *HII = nullptr;
+ const HexagonRegisterInfo *HRI = nullptr;
struct CondsetInfo {
unsigned PredR = 0;
@@ -134,7 +133,7 @@ namespace {
} // end anonymous namespace
-INITIALIZE_PASS(HexagonGenMux, "hexagon-mux",
+INITIALIZE_PASS(HexagonGenMux, "hexagon-gen-mux",
"Hexagon generate mux instructions", false, false)
void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const {
@@ -235,8 +234,11 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
unsigned DR = MI->getOperand(0).getReg();
if (isRegPair(DR))
continue;
+ MachineOperand &PredOp = MI->getOperand(1);
+ if (PredOp.isUndef())
+ continue;
- unsigned PR = MI->getOperand(1).getReg();
+ unsigned PR = PredOp.getReg();
unsigned Idx = I2X.lookup(MI);
CondsetMap::iterator F = CM.find(DR);
bool IfTrue = HII->isPredicatedTrue(Opc);
@@ -316,22 +318,49 @@ bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) {
ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2));
}
- for (unsigned I = 0, N = ML.size(); I < N; ++I) {
- MuxInfo &MX = ML[I];
- MachineBasicBlock &B = *MX.At->getParent();
- DebugLoc DL = MX.At->getDebugLoc();
+ for (MuxInfo &MX : ML) {
unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF);
if (!MxOpc)
continue;
- BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR)
- .addReg(MX.PredR)
- .addOperand(*MX.SrcT)
- .addOperand(*MX.SrcF);
+ MachineBasicBlock &B = *MX.At->getParent();
+ const DebugLoc &DL = B.findDebugLoc(MX.At);
+ auto NewMux = BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR)
+ .addReg(MX.PredR)
+ .add(*MX.SrcT)
+ .add(*MX.SrcF);
+ NewMux->clearKillInfo();
B.erase(MX.Def1);
B.erase(MX.Def2);
Changed = true;
}
+ // Fix up kill flags.
+
+ LivePhysRegs LPR(*HRI);
+ LPR.addLiveOuts(B);
+ auto IsLive = [&LPR,this] (unsigned Reg) -> bool {
+ for (MCSubRegIterator S(Reg, HRI, true); S.isValid(); ++S)
+ if (LPR.contains(*S))
+ return true;
+ return false;
+ };
+ for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+ // This isn't 100% accurate, but it's safe.
+ // It won't detect (as a kill) a case like this
+ // r0 = add r0, 1 <-- r0 should be "killed"
+ // ... = r0
+ for (MachineOperand &Op : I->operands()) {
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ assert(Op.getSubReg() == 0 && "Should have physical registers only");
+ bool Live = IsLive(Op.getReg());
+ Op.setIsKill(!Live);
+ }
+ LPR.stepBackward(*I);
+ }
+
return Changed;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
index f14c733..2da2115 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -7,8 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "gen-pred"
-
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/SetVector.h"
@@ -35,6 +33,8 @@
#include <set>
#include <utility>
+#define DEBUG_TYPE "gen-pred"
+
using namespace llvm;
namespace llvm {
@@ -334,6 +334,7 @@ bool HexagonGenPredicate::isScalarPred(Register PredReg) {
if (MRI->getRegClass(PR.R) != PredRC)
return false;
// If it is a copy between two predicate registers, fall through.
+ LLVM_FALLTHROUGH;
}
case Hexagon::C2_and:
case Hexagon::C2_andn:
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index e477dcc..86a8089 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -100,6 +100,7 @@ namespace {
MachineRegisterInfo *MRI;
MachineDominatorTree *MDT;
const HexagonInstrInfo *TII;
+ const HexagonRegisterInfo *TRI;
#ifndef NDEBUG
static int Counter;
#endif
@@ -381,7 +382,9 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
MLI = &getAnalysis<MachineLoopInfo>();
MRI = &MF.getRegInfo();
MDT = &getAnalysis<MachineDominatorTree>();
- TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+ const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+ TII = HST.getInstrInfo();
+ TRI = HST.getRegisterInfo();
for (auto &L : *MLI)
if (!L->getParentLoop()) {
@@ -960,24 +963,21 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
/// \brief Return true if the operation is invalid within hardware loop.
bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
bool IsInnerHWLoop) const {
-
// Call is not allowed because the callee may use a hardware loop except for
// the case when the call never returns.
if (MI->getDesc().isCall())
return !TII->doesNotReturn(*MI);
// Check if the instruction defines a hardware loop register.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isDef())
- continue;
- unsigned R = MO.getReg();
- if (IsInnerHWLoop && (R == Hexagon::LC0 || R == Hexagon::SA0 ||
- R == Hexagon::LC1 || R == Hexagon::SA1))
- return true;
- if (!IsInnerHWLoop && (R == Hexagon::LC1 || R == Hexagon::SA1))
+ using namespace Hexagon;
+ static const unsigned Regs01[] = { LC0, SA0, LC1, SA1 };
+ static const unsigned Regs1[] = { LC1, SA1 };
+ auto CheckRegs = IsInnerHWLoop ? makeArrayRef(Regs01, array_lengthof(Regs01))
+ : makeArrayRef(Regs1, array_lengthof(Regs1));
+ for (unsigned R : CheckRegs)
+ if (MI->modifiesRegister(R, TRI))
return true;
- }
+
return false;
}
@@ -1511,7 +1511,7 @@ bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO,
int64_t V1, V2;
if (!checkForImmediate(S1, V1) || !checkForImmediate(S2, V2))
return false;
- TV = V2 | (V1 << 32);
+ TV = V2 | (static_cast<uint64_t>(V1) << 32);
break;
}
case TargetOpcode::REG_SEQUENCE: {
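The cast added in this hunk matters because left-shifting a negative signed value is undefined behavior before C++20. A minimal illustration of the safe packing of two words (it assumes, as the caller appears to, that the low word's upper bits are clear):

#include <cassert>
#include <cstdint>

// Combine Hi:Lo into one 64-bit value. Casting before the shift avoids
// left-shifting a possibly negative signed value.
int64_t packHalves(int64_t Hi, int64_t Lo) {
  return Lo | (static_cast<uint64_t>(Hi) << 32);
}

int main() {
  assert(packHalves(-1, 2) == static_cast<int64_t>(0xFFFFFFFF00000002ULL));
  assert(packHalves(1, 2) == 0x100000002LL);
}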
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIICHVX.td b/contrib/llvm/lib/Target/Hexagon/HexagonIICHVX.td
new file mode 100644
index 0000000..1493d52
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIICHVX.td
@@ -0,0 +1,18 @@
+//===--- HexagonIICHVX.td -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def CVI_VA : InstrItinClass;
+
+class HVXItin {
+ list<InstrItinData> HVXItin_list = [
+ InstrItinData<CVI_VA,
+ [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE,CVI_SHIFT, CVI_MPY0, CVI_MPY1]>],
+ [9, 7, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIICScalar.td b/contrib/llvm/lib/Target/Hexagon/HexagonIICScalar.td
new file mode 100644
index 0000000..5fe7133
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIICScalar.td
@@ -0,0 +1,32 @@
+//===--- HexagonIICScalar.td ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// These itinerary class descriptions are based on the instruction timing
+// classes as per V62. Currently, they are just extracted from
+// HexagonScheduleV62.td but will soon be auto-generated by HexagonGen.py.
+
+class PseudoItin {
+ list<InstrItinData> PseudoItin_list = [
+ InstrItinData<PSEUDO, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>,
+ InstrItinData<DUPLEX, [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+ InstrItinData<tc_ENDLOOP, [InstrStage<1, [SLOT_ENDLOOP]>], [2]>
+ ];
+}
+
+class ScalarItin {
+ list<InstrItinData> ScalarItin_list = [
+ InstrItinData<LD_tc_ld_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>],
+ [3, 1], [Hex_FWD, Hex_FWD]>,
+ InstrItinData<ST_tc_st_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1], [Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index f6012d2..0163b2e 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -71,6 +71,9 @@ public:
return true;
}
+ bool ComplexPatternFuncMutatesDAG() const override {
+ return true;
+ }
void PreprocessISelDAG() override;
void EmitFunctionEntryCode() override;
@@ -81,6 +84,7 @@ public:
inline bool SelectAddrGP(SDValue &N, SDValue &R);
bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP);
bool SelectAddrFI(SDValue &N, SDValue &R);
+ bool DetectUseSxtw(SDValue &N, SDValue &R);
StringRef getPassName() const override {
return "Hexagon DAG->DAG Pattern Instruction Selection";
@@ -106,7 +110,6 @@ public:
void SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl);
void SelectStore(SDNode *N);
void SelectSHL(SDNode *N);
- void SelectMul(SDNode *N);
void SelectZeroExtend(SDNode *N);
void SelectIntrinsicWChain(SDNode *N);
void SelectIntrinsicWOChain(SDNode *N);
@@ -118,11 +121,18 @@ public:
#include "HexagonGenDAGISel.inc"
private:
- bool isValueExtension(const SDValue &Val, unsigned FromBits, SDValue &Src);
+ bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src);
bool isOrEquivalentToAdd(const SDNode *N) const;
bool isAlignedMemNode(const MemSDNode *N) const;
+ bool isSmallStackStore(const StoreSDNode *N) const;
bool isPositiveHalfWord(const SDNode *N) const;
+ // DAG preprocessing functions.
+ void ppSimplifyOrSelect0(std::vector<SDNode*> &&Nodes);
+ void ppAddrReorderAddShl(std::vector<SDNode*> &&Nodes);
+ void ppAddrRewriteAndSrl(std::vector<SDNode*> &&Nodes);
+ void ppHoistZextI1(std::vector<SDNode*> &&Nodes);
+
SmallDenseMap<SDNode *,int> RootWeights;
SmallDenseMap<SDNode *,int> RootHeights;
SmallDenseMap<const Value *,int> GAUsesInFunction;
@@ -231,22 +241,31 @@ void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) {
case MVT::v32i16:
case MVT::v16i32:
case MVT::v8i64:
- if (isAlignedMemNode(LD))
- Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai;
- else
+ if (isAlignedMemNode(LD)) {
+ if (LD->isNonTemporal())
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_nt_pi : Hexagon::V6_vL32b_nt_ai;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai;
+ } else {
Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi : Hexagon::V6_vL32Ub_ai;
+ }
break;
// 128B
case MVT::v128i8:
case MVT::v64i16:
case MVT::v32i32:
case MVT::v16i64:
- if (isAlignedMemNode(LD))
- Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B
- : Hexagon::V6_vL32b_ai_128B;
- else
+ if (isAlignedMemNode(LD)) {
+ if (LD->isNonTemporal())
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_nt_pi_128B
+ : Hexagon::V6_vL32b_nt_ai_128B;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B
+ : Hexagon::V6_vL32b_ai_128B;
+ } else {
Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi_128B
: Hexagon::V6_vL32Ub_ai_128B;
+ }
break;
default:
llvm_unreachable("Unexpected memory type in indexed load");
@@ -519,22 +538,31 @@ void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) {
case MVT::v32i16:
case MVT::v16i32:
case MVT::v8i64:
- if (isAlignedMemNode(ST))
- Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai;
- else
+ if (isAlignedMemNode(ST)) {
+ if (ST->isNonTemporal())
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_nt_pi : Hexagon::V6_vS32b_nt_ai;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai;
+ } else {
Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi : Hexagon::V6_vS32Ub_ai;
+ }
break;
// 128B
case MVT::v128i8:
case MVT::v64i16:
case MVT::v32i32:
case MVT::v16i64:
- if (isAlignedMemNode(ST))
- Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B
- : Hexagon::V6_vS32b_ai_128B;
- else
+ if (isAlignedMemNode(ST)) {
+ if (ST->isNonTemporal())
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_nt_pi_128B
+ : Hexagon::V6_vS32b_nt_ai_128B;
+ else
+ Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B
+ : Hexagon::V6_vS32b_ai_128B;
+ } else {
Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi_128B
: Hexagon::V6_vS32Ub_ai_128B;
+ }
break;
default:
llvm_unreachable("Unexpected memory type in indexed store");
@@ -591,90 +619,6 @@ void HexagonDAGToDAGISel::SelectStore(SDNode *N) {
SelectCode(ST);
}
-void HexagonDAGToDAGISel::SelectMul(SDNode *N) {
- SDLoc dl(N);
-
- // %conv.i = sext i32 %tmp1 to i64
- // %conv2.i = sext i32 %add to i64
- // %mul.i = mul nsw i64 %conv2.i, %conv.i
- //
- // --- match with the following ---
- //
- // %mul.i = mpy (%tmp1, %add)
- //
-
- if (N->getValueType(0) == MVT::i64) {
- // Shifting a i64 signed multiply.
- SDValue MulOp0 = N->getOperand(0);
- SDValue MulOp1 = N->getOperand(1);
-
- SDValue OP0;
- SDValue OP1;
-
- // Handle sign_extend and sextload.
- if (MulOp0.getOpcode() == ISD::SIGN_EXTEND) {
- SDValue Sext0 = MulOp0.getOperand(0);
- if (Sext0.getNode()->getValueType(0) != MVT::i32) {
- SelectCode(N);
- return;
- }
- OP0 = Sext0;
- } else if (MulOp0.getOpcode() == ISD::LOAD) {
- LoadSDNode *LD = cast<LoadSDNode>(MulOp0.getNode());
- if (LD->getMemoryVT() != MVT::i32 ||
- LD->getExtensionType() != ISD::SEXTLOAD ||
- LD->getAddressingMode() != ISD::UNINDEXED) {
- SelectCode(N);
- return;
- }
- SDValue Chain = LD->getChain();
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- OP0 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
- MVT::Other,
- LD->getBasePtr(), TargetConst0,
- Chain), 0);
- } else {
- SelectCode(N);
- return;
- }
-
- // Same goes for the second operand.
- if (MulOp1.getOpcode() == ISD::SIGN_EXTEND) {
- SDValue Sext1 = MulOp1.getOperand(0);
- if (Sext1.getNode()->getValueType(0) != MVT::i32) {
- SelectCode(N);
- return;
- }
- OP1 = Sext1;
- } else if (MulOp1.getOpcode() == ISD::LOAD) {
- LoadSDNode *LD = cast<LoadSDNode>(MulOp1.getNode());
- if (LD->getMemoryVT() != MVT::i32 ||
- LD->getExtensionType() != ISD::SEXTLOAD ||
- LD->getAddressingMode() != ISD::UNINDEXED) {
- SelectCode(N);
- return;
- }
- SDValue Chain = LD->getChain();
- SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
- OP1 = SDValue(CurDAG->getMachineNode(Hexagon::L2_loadri_io, dl, MVT::i32,
- MVT::Other,
- LD->getBasePtr(), TargetConst0,
- Chain), 0);
- } else {
- SelectCode(N);
- return;
- }
-
- // Generate a mpy instruction.
- SDNode *Result = CurDAG->getMachineNode(Hexagon::M2_dpmpyss_s0, dl,
- MVT::i64, OP0, OP1);
- ReplaceNode(N, Result);
- return;
- }
-
- SelectCode(N);
-}
-
void HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
SDLoc dl(N);
SDValue Shl_0 = N->getOperand(0);
@@ -837,7 +781,7 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
SDValue V = N->getOperand(1);
SDValue U;
- if (isValueExtension(V, Bits, U)) {
+ if (keepsLowBits(V, Bits, U)) {
SDValue R = CurDAG->getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
N->getOperand(0), U);
ReplaceNode(N, R.getNode());
@@ -932,55 +876,20 @@ void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
void HexagonDAGToDAGISel::Select(SDNode *N) {
- if (N->isMachineOpcode()) {
- N->setNodeId(-1);
- return; // Already selected.
- }
+ if (N->isMachineOpcode())
+ return N->setNodeId(-1); // Already selected.
switch (N->getOpcode()) {
- case ISD::Constant:
- SelectConstant(N);
- return;
-
- case ISD::ConstantFP:
- SelectConstantFP(N);
- return;
-
- case ISD::FrameIndex:
- SelectFrameIndex(N);
- return;
-
- case ISD::BITCAST:
- SelectBitcast(N);
- return;
-
- case ISD::SHL:
- SelectSHL(N);
- return;
-
- case ISD::LOAD:
- SelectLoad(N);
- return;
-
- case ISD::STORE:
- SelectStore(N);
- return;
-
- case ISD::MUL:
- SelectMul(N);
- return;
-
- case ISD::ZERO_EXTEND:
- SelectZeroExtend(N);
- return;
-
- case ISD::INTRINSIC_W_CHAIN:
- SelectIntrinsicWChain(N);
- return;
-
- case ISD::INTRINSIC_WO_CHAIN:
- SelectIntrinsicWOChain(N);
- return;
+ case ISD::Constant: return SelectConstant(N);
+ case ISD::ConstantFP: return SelectConstantFP(N);
+ case ISD::FrameIndex: return SelectFrameIndex(N);
+ case ISD::BITCAST: return SelectBitcast(N);
+ case ISD::SHL: return SelectSHL(N);
+ case ISD::LOAD: return SelectLoad(N);
+ case ISD::STORE: return SelectStore(N);
+ case ISD::ZERO_EXTEND: return SelectZeroExtend(N);
+ case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N);
+ case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N);
}
SelectCode(N);
@@ -1010,15 +919,52 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
}
-void HexagonDAGToDAGISel::PreprocessISelDAG() {
+static bool isMemOPCandidate(SDNode *I, SDNode *U) {
+ // I is an operand of U. Check if U is an arithmetic (binary) operation
+ // usable in a memop, where the other operand is a loaded value, and the
+ // result of U is stored in the same location.
+
+ if (!U->hasOneUse())
+ return false;
+ unsigned Opc = U->getOpcode();
+ switch (Opc) {
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ break;
+ default:
+ return false;
+ }
+
+ SDValue S0 = U->getOperand(0);
+ SDValue S1 = U->getOperand(1);
+ SDValue SY = (S0.getNode() == I) ? S1 : S0;
+
+ SDNode *UUse = *U->use_begin();
+ if (UUse->getNumValues() != 1)
+ return false;
+
+ // Check if one of the inputs to U is a load instruction and the output
+ // is used by a store instruction. If so and they also have the same
+ // base pointer, then don't preprocess this node sequence as it
+ // can be matched to a memop.
+ SDNode *SYNode = SY.getNode();
+ if (UUse->getOpcode() == ISD::STORE && SYNode->getOpcode() == ISD::LOAD) {
+ SDValue LDBasePtr = cast<MemSDNode>(SYNode)->getBasePtr();
+ SDValue STBasePtr = cast<MemSDNode>(UUse)->getBasePtr();
+ if (LDBasePtr == STBasePtr)
+ return true;
+ }
+ return false;
+}
+
+
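The shape isMemOPCandidate accepts corresponds to a read-modify-write through a single pointer, as in the snippet below: the load and the store share a base, and the arithmetic result feeds straight back into memory, so the sequence can become a single memop (e.g. memw(Rs+#u6:2) += Rt) instead of separate load, add, and store:

// A memop candidate.
void bumpCounter(int *p, int v) {
  *p += v;
}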
+// Transform: (or (select c x 0) z) -> (select c (or x z) z)
+// (or (select c 0 y) z) -> (select c z (or y z))
+void HexagonDAGToDAGISel::ppSimplifyOrSelect0(std::vector<SDNode*> &&Nodes) {
SelectionDAG &DAG = *CurDAG;
- std::vector<SDNode*> Nodes;
- for (SDNode &Node : DAG.allnodes())
- Nodes.push_back(&Node);
- // Simplify: (or (select c x 0) z) -> (select c (or x z) z)
- // (or (select c 0 y) z) -> (select c z (or y z))
- // This may not be the right thing for all targets, so do it here.
for (auto I : Nodes) {
if (I->getOpcode() != ISD::OR)
continue;
@@ -1056,18 +1002,22 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
}
}
}
+}
+
+// Transform: (store ch val (add x (add (shl y c) e)))
+// to: (store ch val (add x (shl (add y d) c))),
+// where e = (shl d c) for some integer d.
+// The purpose of this is to enable generation of loads/stores with
+// shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+// value c must be 0, 1 or 2.
+void HexagonDAGToDAGISel::ppAddrReorderAddShl(std::vector<SDNode*> &&Nodes) {
+ SelectionDAG &DAG = *CurDAG;
- // Transform: (store ch addr (add x (add (shl y c) e)))
- // to: (store ch addr (add x (shl (add y d) c))),
- // where e = (shl d c) for some integer d.
- // The purpose of this is to enable generation of loads/stores with
- // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
- // value c must be 0, 1 or 2.
for (auto I : Nodes) {
if (I->getOpcode() != ISD::STORE)
continue;
- // I matched: (store ch addr Off)
+ // I matched: (store ch val Off)
SDValue Off = I->getOperand(2);
// Off needs to match: (add x (add (shl y c) (shl d c))))
if (Off.getOpcode() != ISD::ADD)
@@ -1109,15 +1059,192 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, NewAdd, C);
ReplaceNode(T0.getNode(), NewShl.getNode());
}
+}
+
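The rewrite above leans on the distributive identity (y << c) + (d << c) == (y + d) << c, which moves the shift next to the index so it can fold into the mem(x+y<<#c) addressing mode. A quick self-check of the identity:

#include <cassert>
#include <cstdint>

int main() {
  // (x + (y << c)) + (d << c) == x + ((y + d) << c), modulo 2^32.
  uint32_t x = 1000, y = 7, d = 3, c = 2;
  assert(x + (y << c) + (d << c) == x + ((y + d) << c));
}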
+// Transform: (load ch (add x (and (srl y c) Mask)))
+// to: (load ch (add x (shl (srl y d) d-c)))
+// where
+// Mask = 00..0 111..1 0.0
+// | | +-- d-c 0s, and d-c is 0, 1 or 2.
+// | +-------- 1s
+// +-------------- at most c 0s
+// Motivating example:
+// DAG combiner optimizes (add x (shl (srl y 5) 2))
+// to (add x (and (srl y 3) 1FFFFFFC))
+// which results in a constant-extended and(##...,lsr). This transformation
+// undoes this simplification for cases where the shl can be folded into
+// an addressing mode.
+void HexagonDAGToDAGISel::ppAddrRewriteAndSrl(std::vector<SDNode*> &&Nodes) {
+ SelectionDAG &DAG = *CurDAG;
+
+ for (SDNode *N : Nodes) {
+ unsigned Opc = N->getOpcode();
+ if (Opc != ISD::LOAD && Opc != ISD::STORE)
+ continue;
+ SDValue Addr = Opc == ISD::LOAD ? N->getOperand(1) : N->getOperand(2);
+ // Addr must match: (add x T0)
+ if (Addr.getOpcode() != ISD::ADD)
+ continue;
+ SDValue T0 = Addr.getOperand(1);
+ // T0 must match: (and T1 Mask)
+ if (T0.getOpcode() != ISD::AND)
+ continue;
+
+ // We have an AND.
+ //
+ // Check the first operand. It must be: (srl y c).
+ SDValue S = T0.getOperand(0);
+ if (S.getOpcode() != ISD::SRL)
+ continue;
+ ConstantSDNode *SN = dyn_cast<ConstantSDNode>(S.getOperand(1).getNode());
+ if (SN == nullptr)
+ continue;
+ if (SN->getAPIntValue().getBitWidth() != 32)
+ continue;
+ uint32_t CV = SN->getZExtValue();
+
+ // Check the second operand: the supposed mask.
+ ConstantSDNode *MN = dyn_cast<ConstantSDNode>(T0.getOperand(1).getNode());
+ if (MN == nullptr)
+ continue;
+ if (MN->getAPIntValue().getBitWidth() != 32)
+ continue;
+ uint32_t Mask = MN->getZExtValue();
+ // Examine the mask.
+ uint32_t TZ = countTrailingZeros(Mask);
+ uint32_t M1 = countTrailingOnes(Mask >> TZ);
+ uint32_t LZ = countLeadingZeros(Mask);
+ // Trailing zeros + middle ones + leading zeros must equal the width.
+ if (TZ + M1 + LZ != 32)
+ continue;
+ // The number of trailing zeros will be encoded in the addressing mode.
+ if (TZ > 2)
+ continue;
+ // The number of leading zeros must be at most c.
+ if (LZ > CV)
+ continue;
+
+ // All looks good.
+ SDValue Y = S.getOperand(0);
+ EVT VT = Addr.getValueType();
+ SDLoc dl(S);
+ // TZ = D-C, so D = TZ+C.
+ SDValue D = DAG.getConstant(TZ+CV, dl, VT);
+ SDValue DC = DAG.getConstant(TZ, dl, VT);
+ SDValue NewSrl = DAG.getNode(ISD::SRL, dl, VT, Y, D);
+ SDValue NewShl = DAG.getNode(ISD::SHL, dl, VT, NewSrl, DC);
+ ReplaceNode(T0.getNode(), NewShl.getNode());
+ }
+}
+
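The mask acceptance test can be exercised in isolation. A sketch using GCC/Clang builtins in place of LLVM's countTrailingZeros/countTrailingOnes/countLeadingZeros (the helper name is made up):

#include <cassert>
#include <cstdint>

// Decompose Mask as [LZ zeros][M1 ones][TZ zeros] and apply the same
// acceptance tests as ppAddrRewriteAndSrl (C is the srl shift amount).
bool maskOk(uint32_t Mask, uint32_t C) {
  if (Mask == 0)
    return false;
  uint32_t TZ = __builtin_ctz(Mask);
  uint32_t LZ = __builtin_clz(Mask);
  uint32_t Rest = ~(Mask >> TZ);
  uint32_t M1 = Rest ? __builtin_ctz(Rest) : 32;  // run of ones above TZ
  return TZ + M1 + LZ == 32 && TZ <= 2 && LZ <= C;
}

int main() {
  assert(maskOk(0x1FFFFFFCu, 3));  // the motivating example (srl y 3)
  assert(!maskOk(0x0000FF0Fu, 3)); // ones are not contiguous
}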
+// Transform: (op ... (zext i1 c) ...) -> (select c (op ... 0 ...)
+// (op ... 1 ...))
+void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode*> &&Nodes) {
+ SelectionDAG &DAG = *CurDAG;
+
+ for (SDNode *N : Nodes) {
+ unsigned Opc = N->getOpcode();
+ if (Opc != ISD::ZERO_EXTEND)
+ continue;
+ SDValue OpI1 = N->getOperand(0);
+ EVT OpVT = OpI1.getValueType();
+ if (!OpVT.isSimple() || OpVT.getSimpleVT() != MVT::i1)
+ continue;
+ for (auto I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDNode *U = *I;
+ if (U->getNumValues() != 1)
+ continue;
+ EVT UVT = U->getValueType(0);
+ if (!UVT.isSimple() || !UVT.isInteger() || UVT.getSimpleVT() == MVT::i1)
+ continue;
+ if (isMemOPCandidate(N, U))
+ continue;
+
+ // Potentially simplifiable operation.
+ unsigned I1N = I.getOperandNo();
+ SmallVector<SDValue,2> Ops(U->getNumOperands());
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i)
+ Ops[i] = U->getOperand(i);
+ EVT BVT = Ops[I1N].getValueType();
+
+ SDLoc dl(U);
+ SDValue C0 = DAG.getConstant(0, dl, BVT);
+ SDValue C1 = DAG.getConstant(1, dl, BVT);
+ SDValue If0, If1;
+
+ if (isa<MachineSDNode>(U)) {
+ unsigned UseOpc = U->getMachineOpcode();
+ Ops[I1N] = C0;
+ If0 = SDValue(DAG.getMachineNode(UseOpc, dl, UVT, Ops), 0);
+ Ops[I1N] = C1;
+ If1 = SDValue(DAG.getMachineNode(UseOpc, dl, UVT, Ops), 0);
+ } else {
+ unsigned UseOpc = U->getOpcode();
+ Ops[I1N] = C0;
+ If0 = DAG.getNode(UseOpc, dl, UVT, Ops);
+ Ops[I1N] = C1;
+ If1 = DAG.getNode(UseOpc, dl, UVT, Ops);
+ }
+ SDValue Sel = DAG.getNode(ISD::SELECT, dl, UVT, OpI1, If1, If0);
+ DAG.ReplaceAllUsesWith(U, Sel.getNode());
+ }
+ }
+}
+
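The hoisting transform is just the scalar identity op(..., zext(c), ...) == select(c, op(..., 1, ...), op(..., 0, ...)) for a boolean c, which moves the i1 value into a select that the predicate registers handle directly. Checked on a plain function:

#include <cassert>

int before(int x, bool c) { return x + (int)c; }        // uses zext(i1 c)
int after(int x, bool c)  { return c ? x + 1 : x + 0; } // select hoisted out

int main() {
  for (int x : {-3, 0, 42})
    for (bool c : {false, true})
      assert(before(x, c) == after(x, c));
}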
+void HexagonDAGToDAGISel::PreprocessISelDAG() {
+ // Repack all nodes before calling each preprocessing function,
+ // because each of them can modify the set of nodes.
+ auto getNodes = [this] () -> std::vector<SDNode*> {
+ std::vector<SDNode*> T;
+ T.reserve(CurDAG->allnodes_size());
+ for (SDNode &N : CurDAG->allnodes())
+ T.push_back(&N);
+ return T;
+ };
+
+ // Transform: (or (select c x 0) z) -> (select c (or x z) z)
+ // (or (select c 0 y) z) -> (select c z (or y z))
+ ppSimplifyOrSelect0(getNodes());
+
+ // Transform: (store ch val (add x (add (shl y c) e)))
+ // to: (store ch val (add x (shl (add y d) c))),
+ // where e = (shl d c) for some integer d.
+ // The purpose of this is to enable generation of loads/stores with
+ // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+ // value c must be 0, 1 or 2.
+ ppAddrReorderAddShl(getNodes());
+
+ // Transform: (load ch (add x (and (srl y c) Mask)))
+ // to: (load ch (add x (shl (srl y d) d-c)))
+ // where
+ // Mask = 00..0 111..1 0.0
+ // | | +-- d-c 0s, and d-c is 0, 1 or 2.
+ // | +-------- 1s
+ // +-------------- at most c 0s
+ // Motivating example:
+ // DAG combiner optimizes (add x (shl (srl y 5) 2))
+ // to (add x (and (srl y 3) 1FFFFFFC))
+ // which results in a constant-extended and(##...,lsr). This transformation
+ // undoes this simplification for cases where the shl can be folded into
+ // an addressing mode.
+ ppAddrRewriteAndSrl(getNodes());
+
+ // Transform: (op ... (zext i1 c) ...) -> (select c (op ... 0 ...)
+ // (op ... 1 ...))
+ ppHoistZextI1(getNodes());
+
+ DEBUG_WITH_TYPE("isel", {
+ dbgs() << "Preprocessed (Hexagon) selection DAG:";
+ CurDAG->dump();
+ });
if (EnableAddressRebalancing) {
rebalanceAddressTrees();
- DEBUG(
- dbgs() << "************* SelectionDAG after preprocessing: ***********\n";
+ DEBUG_WITH_TYPE("isel", {
+ dbgs() << "Address tree balanced selection DAG:";
CurDAG->dump();
- dbgs() << "************* End SelectionDAG after preprocessing ********\n";
- );
+ });
}
}
@@ -1137,7 +1264,7 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
}
// Match a frame index that can be used in an addressing mode.
-bool HexagonDAGToDAGISel::SelectAddrFI(SDValue& N, SDValue &R) {
+bool HexagonDAGToDAGISel::SelectAddrFI(SDValue &N, SDValue &R) {
if (N.getOpcode() != ISD::FrameIndex)
return false;
auto &HFI = *HST->getFrameLowering();
@@ -1198,16 +1325,83 @@ bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R,
return false;
}
-bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
- unsigned FromBits, SDValue &Src) {
+bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
+ // This (complex pattern) function is meant to detect a sign-extension
+ // i32->i64 on a per-operand basis. This would allow writing single
+ // patterns that would cover a number of combinations of different ways
+  // a sign-extension could be written. For example:
+ // (mul (DetectUseSxtw x) (DetectUseSxtw y)) -> (M2_dpmpyss_s0 x y)
+ // could match either one of these:
+ // (mul (sext x) (sext_inreg y))
+ // (mul (sext-load *p) (sext_inreg y))
+ // (mul (sext_inreg x) (sext y))
+ // etc.
+ //
+ // The returned value will have type i64 and its low word will
+ // contain the value being extended. The high bits are not specified.
+ // The returned type is i64 because the original type of N was i64,
+ // but the users of this function should only use the low-word of the
+ // result, e.g.
+ // (mul sxtw:x, sxtw:y) -> (M2_dpmpyss_s0 (LoReg sxtw:x), (LoReg sxtw:y))
+
+ if (N.getValueType() != MVT::i64)
+ return false;
+ EVT SrcVT;
+ unsigned Opc = N.getOpcode();
+ switch (Opc) {
+ case ISD::SIGN_EXTEND:
+ case ISD::SIGN_EXTEND_INREG: {
+ // sext_inreg has the source type as a separate operand.
+ EVT T = Opc == ISD::SIGN_EXTEND
+ ? N.getOperand(0).getValueType()
+ : cast<VTSDNode>(N.getOperand(1))->getVT();
+ if (T.getSizeInBits() != 32)
+ return false;
+ R = N.getOperand(0);
+ break;
+ }
+ case ISD::LOAD: {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ if (L->getExtensionType() != ISD::SEXTLOAD)
+ return false;
+ // All extending loads extend to i32, so even if the value in
+ // memory is shorter than 32 bits, it will be i32 after the load.
+ if (L->getMemoryVT().getSizeInBits() > 32)
+ return false;
+ R = N;
+ break;
+ }
+ default:
+ return false;
+ }
+ EVT RT = R.getValueType();
+ if (RT == MVT::i64)
+ return true;
+ assert(RT == MVT::i32);
+ // This is only to produce a value of type i64. Do not rely on the
+ // high bits produced by this.
+ const SDLoc &dl(N);
+ SDValue Ops[] = {
+ CurDAG->getTargetConstant(Hexagon::DoubleRegsRegClassID, dl, MVT::i32),
+ R, CurDAG->getTargetConstant(Hexagon::isub_hi, dl, MVT::i32),
+ R, CurDAG->getTargetConstant(Hexagon::isub_lo, dl, MVT::i32)
+ };
+ SDNode *T = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl,
+ MVT::i64, Ops);
+ R = SDValue(T, 0);
+ return true;
+}
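+// Illustrative note: for a 32-bit R, the REG_SEQUENCE above yields an i64
+// whose high and low words are both R; callers must use only the low word.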
+
+bool HexagonDAGToDAGISel::keepsLowBits(const SDValue &Val, unsigned NumBits,
+ SDValue &Src) {
unsigned Opc = Val.getOpcode();
switch (Opc) {
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: {
- SDValue const &Op0 = Val.getOperand(0);
+ const SDValue &Op0 = Val.getOperand(0);
EVT T = Op0.getValueType();
- if (T.isInteger() && T.getSizeInBits() == FromBits) {
+ if (T.isInteger() && T.getSizeInBits() == NumBits) {
Src = Op0;
return true;
}
@@ -1218,23 +1412,23 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
case ISD::AssertZext:
if (Val.getOperand(0).getValueType().isInteger()) {
VTSDNode *T = cast<VTSDNode>(Val.getOperand(1));
- if (T->getVT().getSizeInBits() == FromBits) {
+ if (T->getVT().getSizeInBits() == NumBits) {
Src = Val.getOperand(0);
return true;
}
}
break;
case ISD::AND: {
- // Check if this is an AND with "FromBits" of lower bits set to 1.
- uint64_t FromMask = (1 << FromBits) - 1;
+ // Check if this is an AND with NumBits of lower bits set to 1.
+    uint64_t Mask = (1ULL << NumBits) - 1;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
- if (C->getZExtValue() == FromMask) {
+ if (C->getZExtValue() == Mask) {
Src = Val.getOperand(1);
return true;
}
}
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
- if (C->getZExtValue() == FromMask) {
+ if (C->getZExtValue() == Mask) {
Src = Val.getOperand(0);
return true;
}
@@ -1243,16 +1437,16 @@ bool HexagonDAGToDAGISel::isValueExtension(const SDValue &Val,
}
case ISD::OR:
case ISD::XOR: {
- // OR/XOR with the lower "FromBits" bits set to 0.
- uint64_t FromMask = (1 << FromBits) - 1;
+ // OR/XOR with the lower NumBits bits set to 0.
+    uint64_t Mask = (1ULL << NumBits) - 1;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(0))) {
- if ((C->getZExtValue() & FromMask) == 0) {
+ if ((C->getZExtValue() & Mask) == 0) {
Src = Val.getOperand(1);
return true;
}
}
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val.getOperand(1))) {
- if ((C->getZExtValue() & FromMask) == 0) {
+ if ((C->getZExtValue() & Mask) == 0) {
Src = Val.getOperand(0);
return true;
}
@@ -1287,6 +1481,20 @@ bool HexagonDAGToDAGISel::isAlignedMemNode(const MemSDNode *N) const {
return N->getAlignment() >= N->getMemoryVT().getStoreSize();
}
+bool HexagonDAGToDAGISel::isSmallStackStore(const StoreSDNode *N) const {
+ unsigned StackSize = MF->getFrameInfo().estimateStackSize(*MF);
+ switch (N->getMemoryVT().getStoreSize()) {
+ case 1:
+ return StackSize <= 56; // 1*2^6 - 8
+ case 2:
+ return StackSize <= 120; // 2*2^6 - 8
+ case 4:
+ return StackSize <= 248; // 4*2^6 - 8
+ default:
+ return false;
+ }
+}
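+// Note: the limits above are StoreSize * 64 - 8, which presumably keeps the
+// frame offset of such stores within a 6-bit scaled offset field with a
+// little slack; the exact encoding constraint is an assumption here.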
+
// Return true when the given node fits in a positive half word.
bool HexagonDAGToDAGISel::isPositiveHalfWord(const SDNode *N) const {
if (const ConstantSDNode *CN = dyn_cast<const ConstantSDNode>(N)) {
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index e87e1e6..3997702 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "Hexagon.h"
#include "HexagonISelLowering.h"
+#include "Hexagon.h"
#include "HexagonMachineFunctionInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
@@ -26,8 +26,8 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
@@ -256,7 +256,9 @@ static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
}
- if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+ if (LocVT == MVT::i1) {
+ LocVT = MVT::i32;
+ } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
LocVT = MVT::i32;
ValVT = MVT::i32;
if (ArgFlags.isSExt())
@@ -483,9 +485,7 @@ static bool RetCC_Hexagon32(unsigned ValNo, MVT ValVT,
}
}
- unsigned Offset = State.AllocateStack(4, 4);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
+ return true;
}
static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
@@ -498,9 +498,7 @@ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT,
}
}
- unsigned Offset = State.AllocateStack(8, 8);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
+ return true;
}
static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
@@ -511,7 +509,6 @@ static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
bool UseHVX = HST.useHVXOps();
bool UseHVXDbl = HST.useHVXDblOps();
- unsigned OffSiz = 64;
if (LocVT == MVT::v16i32) {
if (unsigned Reg = State.AllocateReg(Hexagon::V0)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
@@ -523,18 +520,14 @@ static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT,
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
- OffSiz = 128;
} else if (LocVT == MVT::v64i32) {
if (unsigned Reg = State.AllocateReg(Hexagon::W0)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
- OffSiz = 256;
}
- unsigned Offset = State.AllocateStack(OffSiz, OffSiz);
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- return false;
+ return true;
}
void HexagonTargetLowering::promoteLdStType(MVT VT, MVT PromotedLdStVT) {
@@ -590,6 +583,16 @@ static bool isHvxVectorType(MVT Ty) {
}
}
+bool
+HexagonTargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_Hexagon);
+}
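+// With the RetCC_Hexagon32/64/Vector handlers above now failing (returning
+// true) instead of allocating a stack slot, CheckReturn rejects such return
+// values and the target-independent code demotes them to an sret argument.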
+
// LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is
// passed by value, the function prototype is modified to return void and
// the value is stored in memory pointed to by a pointer passed by the caller.
@@ -632,7 +635,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, RetOps);
}
-bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+bool HexagonTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// If either no tail call or told not to tail call at all, don't.
auto Attr =
CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
@@ -644,11 +647,11 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
/// LowerCallResult - Lower the result values of an ISD::CALL into the
/// appropriate copies out of appropriate physical registers. This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// Chain/Glue are the input chain/glue to use, and that TheCall is the call
/// being lowered. Returns an SDNode with the same number of values as the
/// ISD::CALL.
SDValue HexagonTargetLowering::LowerCallResult(
- SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
@@ -671,21 +674,24 @@ SDValue HexagonTargetLowering::LowerCallResult(
// predicate register as the call result.
auto &MRI = DAG.getMachineFunction().getRegInfo();
SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
- MVT::i32, InFlag);
+ MVT::i32, Glue);
// FR0 = (Value, Chain, Glue)
unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR,
FR0.getValue(0), FR0.getValue(2));
// TPR = (Chain, Glue)
- RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1,
- TPR.getValue(1));
+ // Don't glue this CopyFromReg, because it copies from a virtual
+ // register. If it is glued to the call, InstrEmitter will add it
+ // as an implicit def to the call (EmitMachineNode).
+ RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1);
+ Glue = TPR.getValue(1);
} else {
RetVal = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
- RVLocs[i].getValVT(), InFlag);
+ RVLocs[i].getValVT(), Glue);
+ Glue = RetVal.getValue(2);
}
InVals.push_back(RetVal.getValue(0));
Chain = RetVal.getValue(1);
- InFlag = RetVal.getValue(2);
}
return Chain;
@@ -710,6 +716,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
auto PtrVT = getPointerTy(MF.getDataLayout());
// Check for varargs.
@@ -826,7 +833,6 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (NeedsArgAlign && Subtarget.hasV60TOps()) {
DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    // V6 vectors passed by value have 64- or 128-byte alignment, depending
    // on whether we are in 64-byte or 128-byte vector mode.
bool UseHVXDbl = Subtarget.useHVXDblOps();
@@ -840,16 +846,16 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+ SDValue Glue;
if (!IsTailCall) {
- SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
- Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
+ Glue = Chain.getValue(1);
}
// Build a sequence of copy-to-reg nodes chained together with token
// chain and flag operands which copy the outgoing args into registers.
// The Glue is necessary since all emitted instructions must be
// stuck together.
- SDValue Glue;
if (!IsTailCall) {
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -902,14 +908,23 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
}
+ const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
if (Glue.getNode())
Ops.push_back(Glue);
if (IsTailCall) {
- MF.getFrameInfo().setHasTailCall();
+ MFI.setHasTailCall();
return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
}
+ // Set this here because we need to know this for "hasFP" in frame lowering.
+ // The target-independent code calls getFrameRegister before setting it, and
+ // getFrameRegister uses hasFP to determine whether the function has FP.
+ MFI.setHasCalls(true);
+
unsigned OpCode = DoesNotReturn ? HexagonISD::CALLnr : HexagonISD::CALL;
Chain = DAG.getNode(OpCode, dl, NodeTys, Ops);
Glue = Chain.getValue(1);
@@ -992,51 +1007,46 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SDValue
HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
MachineFunction &MF = DAG.getMachineFunction();
- auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
- switch (Node->getOpcode()) {
- case ISD::INLINEASM: {
- unsigned NumOps = Node->getNumOperands();
- if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
- --NumOps; // Ignore the flag operand.
-
- for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
- if (FuncInfo.hasClobberLR())
- break;
- unsigned Flags =
- cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
- unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
- ++i; // Skip the ID value.
-
- switch (InlineAsm::getKind(Flags)) {
- default: llvm_unreachable("Bad flags!");
- case InlineAsm::Kind_RegDef:
- case InlineAsm::Kind_RegUse:
- case InlineAsm::Kind_Imm:
- case InlineAsm::Kind_Clobber:
- case InlineAsm::Kind_Mem: {
- for (; NumVals; --NumVals, ++i) {}
- break;
- }
- case InlineAsm::Kind_RegDefEarlyClobber: {
- for (; NumVals; --NumVals, ++i) {
- unsigned Reg =
- cast<RegisterSDNode>(Node->getOperand(i))->getReg();
-
- // Check it to be lr
- const HexagonRegisterInfo *QRI = Subtarget.getRegisterInfo();
- if (Reg == QRI->getRARegister()) {
- FuncInfo.setHasClobberLR(true);
- break;
- }
- }
- break;
- }
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
+ unsigned LR = HRI.getRARegister();
+
+ if (Op.getOpcode() != ISD::INLINEASM || HMFI.hasClobberLR())
+ return Op;
+
+ unsigned NumOps = Op.getNumOperands();
+ if (Op.getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the flag operand.
+
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+ unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ ++i; // Skip the ID value.
+
+ switch (InlineAsm::getKind(Flags)) {
+ default:
+ llvm_unreachable("Bad flags!");
+ case InlineAsm::Kind_RegUse:
+ case InlineAsm::Kind_Imm:
+ case InlineAsm::Kind_Mem:
+ i += NumVals;
+ break;
+ case InlineAsm::Kind_Clobber:
+ case InlineAsm::Kind_RegDef:
+ case InlineAsm::Kind_RegDefEarlyClobber: {
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
+ if (Reg != LR)
+ continue;
+ HMFI.setHasClobberLR(true);
+ return Op;
}
+ break;
}
}
- } // Node->getOpcode
+ }
+
return Op;
}
@@ -1054,6 +1064,18 @@ SDValue HexagonTargetLowering::LowerPREFETCH(SDValue Op,
return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, Chain, Addr, Zero);
}
+// Custom-handle ISD::READCYCLECOUNTER because the target-independent SDNode
+// is marked as having side-effects, while the register read on Hexagon does
+// not have any. TableGen refuses to accept the direct pattern from that node
+// to the A4_tfrcpp.
+SDValue HexagonTargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Chain = Op.getOperand(0);
+ SDLoc dl(Op);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ return DAG.getNode(HexagonISD::READCYCLE, dl, VTs, Chain);
+}
+
SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
@@ -1140,10 +1162,25 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
EVT RegVT = VA.getLocVT();
if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
RegVT == MVT::i32 || RegVT == MVT::f32) {
- unsigned VReg =
+ unsigned VReg =
RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
RegInfo.addLiveIn(VA.getLocReg(), VReg);
- InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+ SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+ // Treat values of type MVT::i1 specially: they are passed in
+ // registers of type i32, but they need to remain as values of
+ // type i1 for consistency of the argument lowering.
+ if (VA.getValVT() == MVT::i1) {
+ // Generate a copy into a predicate register and use the value
+ // of the register as the "InVal".
+ unsigned PReg =
+ RegInfo.createVirtualRegister(&Hexagon::PredRegsRegClass);
+ SDNode *T = DAG.getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
+ Copy.getValue(0));
+ Copy = DAG.getCopyToReg(Copy.getValue(1), dl, PReg, SDValue(T, 0));
+ Copy = DAG.getCopyFromReg(Copy, dl, PReg, MVT::i1);
+ }
+ InVals.push_back(Copy);
+ Chain = Copy.getValue(1);
} else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
unsigned VReg =
RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
@@ -1217,7 +1254,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
InVals.push_back(FIN);
} else {
InVals.push_back(
- DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, MachinePointerInfo()));
+ DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
}
}
}
@@ -1249,18 +1286,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(SV));
}
-// Creates a SPLAT instruction for a constant value VAL.
-static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
- SDValue Val) {
- if (VT.getSimpleVT() == MVT::v4i8)
- return DAG.getNode(HexagonISD::VSPLATB, dl, VT, Val);
-
- if (VT.getSimpleVT() == MVT::v4i16)
- return DAG.getNode(HexagonISD::VSPLATH, dl, VT, Val);
-
- return SDValue();
-}
-
static bool isSExtFree(SDValue N) {
// A sign-extend of a truncate of a sign-extend is free.
if (N.getOpcode() == ISD::TRUNCATE &&
@@ -1272,17 +1297,6 @@ static bool isSExtFree(SDValue N) {
return false;
}
-SDValue HexagonTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDValue InpVal = Op.getOperand(0);
- if (isa<ConstantSDNode>(InpVal)) {
- uint64_t V = cast<ConstantSDNode>(InpVal)->getZExtValue();
- return DAG.getTargetConstant(countPopulation(V), dl, MVT::i64);
- }
- SDValue PopOut = DAG.getNode(HexagonISD::POPCOUNT, dl, MVT::i32, InpVal);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, PopOut);
-}
-
SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -1350,79 +1364,6 @@ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-// Handle only specific vector loads.
-SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
- LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
- SDValue Chain = LoadNode->getChain();
- SDValue Ptr = Op.getOperand(1);
- SDValue LoweredLoad;
- SDValue Result;
- SDValue Base = LoadNode->getBasePtr();
- ISD::LoadExtType Ext = LoadNode->getExtensionType();
- unsigned Alignment = LoadNode->getAlignment();
- SDValue LoadChain;
-
- if(Ext == ISD::NON_EXTLOAD)
- Ext = ISD::ZEXTLOAD;
-
- if (VT == MVT::v4i16) {
- if (Alignment == 2) {
- SDValue Loads[4];
- // Base load.
- Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base,
- LoadNode->getPointerInfo(), MVT::i16, Alignment,
- LoadNode->getMemOperand()->getFlags());
- // Base+2 load.
- SDValue Increment = DAG.getConstant(2, DL, MVT::i32);
- Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
- Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
- LoadNode->getPointerInfo(), MVT::i16, Alignment,
- LoadNode->getMemOperand()->getFlags());
- // SHL 16, then OR base and base+2.
- SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32);
- SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount);
- SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]);
- // Base + 4.
- Increment = DAG.getConstant(4, DL, MVT::i32);
- Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
- Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
- LoadNode->getPointerInfo(), MVT::i16, Alignment,
- LoadNode->getMemOperand()->getFlags());
- // Base + 6.
- Increment = DAG.getConstant(6, DL, MVT::i32);
- Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment);
- Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr,
- LoadNode->getPointerInfo(), MVT::i16, Alignment,
- LoadNode->getMemOperand()->getFlags());
- // SHL 16, then OR base+4 and base+6.
- Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount);
- SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]);
- // Combine to i64. This could be optimised out later if we can
- // affect reg allocation of this code.
- Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2);
- LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- Loads[0].getValue(1), Loads[1].getValue(1),
- Loads[2].getValue(1), Loads[3].getValue(1));
- } else {
- // Perform default type expansion.
- Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
- LoadNode->getAlignment(),
- LoadNode->getMemOperand()->getFlags());
- LoadChain = Result.getValue(1);
- }
- } else
- llvm_unreachable("Custom lowering unsupported load");
-
- Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
- // Since we pretend to lower a load, we need the original chain
- // info attached to the result.
- SDValue Ops[] = { Result, LoadChain };
-
- return DAG.getMergeValues(Ops, DL);
-}
-
SDValue
HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
EVT ValTy = Op.getValueType();
@@ -1571,9 +1512,10 @@ HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG)
SDValue
HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
- GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT, unsigned ReturnReg,
+ GlobalAddressSDNode *GA, SDValue Glue, EVT PtrVT, unsigned ReturnReg,
unsigned char OperandFlags) const {
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SDLoc dl(GA);
SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
@@ -1585,23 +1527,21 @@ HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
// 2. Callee which in this case is the Global address value.
  // 3. Registers live into the call. In this case it's R0, as we
// have just one argument to be passed.
- // 4. InFlag if there is any.
+ // 4. Glue.
// Note: The order is important.
- if (InFlag) {
- SDValue Ops[] = { Chain, TGA,
- DAG.getRegister(Hexagon::R0, PtrVT), *InFlag };
- Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
- } else {
- SDValue Ops[] = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT)};
- Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
- }
+ const auto &HRI = *Subtarget.getRegisterInfo();
+ const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallingConv::C);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ SDValue Ops[] = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT),
+ DAG.getRegisterMask(Mask), Glue };
+ Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
// Inform MFI that function has calls.
MFI.setAdjustsStack(true);
- SDValue Flag = Chain.getValue(1);
- return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+ Glue = Chain.getValue(1);
+ return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
}
//
@@ -1694,8 +1634,13 @@ HexagonTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, Hexagon::R0, Chain, InFlag);
InFlag = Chain.getValue(1);
- return GetDynamicTLSAddr(DAG, Chain, GA, &InFlag, PtrVT,
- Hexagon::R0, HexagonII::MO_GDPLT);
+ unsigned Flags =
+ static_cast<const HexagonSubtarget &>(DAG.getSubtarget()).useLongCalls()
+ ? HexagonII::MO_GDPLT | HexagonII::HMOTF_ConstExtended
+ : HexagonII::MO_GDPLT;
+
+ return GetDynamicTLSAddr(DAG, Chain, GA, InFlag, PtrVT,
+ Hexagon::R0, Flags);
}
//
@@ -1821,6 +1766,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
@@ -1891,17 +1837,18 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i8, Promote);
setOperationAction(ISD::CTPOP, MVT::i16, Promote);
setOperationAction(ISD::CTPOP, MVT::i32, Promote);
- setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i64, Legal);
- // We custom lower i64 to i64 mul, so that it is not considered as a legal
- // operation. There is a pattern that will match i64 mul and transform it
- // to a series of instructions.
- setOperationAction(ISD::MUL, MVT::i64, Expand);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BSWAP, MVT::i64, Legal);
+ setOperationAction(ISD::MUL, MVT::i64, Legal);
for (unsigned IntExpOp :
{ ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM,
ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR,
- ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+ ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
setOperationAction(IntExpOp, MVT::i32, Expand);
setOperationAction(IntExpOp, MVT::i64, Expand);
@@ -1941,18 +1888,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Handling of vector operations.
//
- // Custom lower v4i16 load only. Let v4i16 store to be
- // promoted for now.
promoteLdStType(MVT::v4i8, MVT::i32);
promoteLdStType(MVT::v2i16, MVT::i32);
promoteLdStType(MVT::v8i8, MVT::i64);
+ promoteLdStType(MVT::v4i16, MVT::i64);
promoteLdStType(MVT::v2i32, MVT::i64);
- setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
- setOperationAction(ISD::STORE, MVT::v4i16, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64);
- AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64);
-
// Set the action for vector operations to "expand", then override it with
// either "custom" or "legal" for specific cases.
static const unsigned VectExpOps[] = {
@@ -1967,7 +1908,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
// Floating point arithmetic/math functions:
ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FMA, ISD::FDIV,
ISD::FREM, ISD::FNEG, ISD::FABS, ISD::FSQRT, ISD::FSIN,
- ISD::FCOS, ISD::FPOWI, ISD::FPOW, ISD::FLOG, ISD::FLOG2,
+ ISD::FCOS, ISD::FPOW, ISD::FLOG, ISD::FLOG2,
ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FCEIL, ISD::FTRUNC,
ISD::FRINT, ISD::FNEARBYINT, ISD::FROUND, ISD::FFLOOR,
ISD::FMINNUM, ISD::FMAXNUM, ISD::FSINCOS,
@@ -2268,34 +2209,16 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP";
case HexagonISD::JT: return "HexagonISD::JT";
case HexagonISD::PACKHL: return "HexagonISD::PACKHL";
- case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT";
case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG";
- case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB";
- case HexagonISD::SHUFFEH: return "HexagonISD::SHUFFEH";
- case HexagonISD::SHUFFOB: return "HexagonISD::SHUFFOB";
- case HexagonISD::SHUFFOH: return "HexagonISD::SHUFFOH";
case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN";
- case HexagonISD::VCMPBEQ: return "HexagonISD::VCMPBEQ";
- case HexagonISD::VCMPBGT: return "HexagonISD::VCMPBGT";
- case HexagonISD::VCMPBGTU: return "HexagonISD::VCMPBGTU";
- case HexagonISD::VCMPHEQ: return "HexagonISD::VCMPHEQ";
- case HexagonISD::VCMPHGT: return "HexagonISD::VCMPHGT";
- case HexagonISD::VCMPHGTU: return "HexagonISD::VCMPHGTU";
- case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ";
- case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT";
- case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU";
case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE";
- case HexagonISD::VPACK: return "HexagonISD::VPACK";
- case HexagonISD::VSHLH: return "HexagonISD::VSHLH";
- case HexagonISD::VSHLW: return "HexagonISD::VSHLW";
- case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB";
- case HexagonISD::VSPLATH: return "HexagonISD::VSPLATH";
- case HexagonISD::VSRAH: return "HexagonISD::VSRAH";
- case HexagonISD::VSRAW: return "HexagonISD::VSRAW";
- case HexagonISD::VSRLH: return "HexagonISD::VSRLH";
- case HexagonISD::VSRLW: return "HexagonISD::VSRLW";
- case HexagonISD::VSXTBH: return "HexagonISD::VSXTBH";
- case HexagonISD::VSXTBW: return "HexagonISD::VSXTBW";
+ case HexagonISD::VPACKE: return "HexagonISD::VPACKE";
+ case HexagonISD::VPACKO: return "HexagonISD::VPACKO";
+ case HexagonISD::VASL: return "HexagonISD::VASL";
+ case HexagonISD::VASR: return "HexagonISD::VASR";
+ case HexagonISD::VLSR: return "HexagonISD::VLSR";
+ case HexagonISD::VSPLAT: return "HexagonISD::VSPLAT";
+ case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE";
case HexagonISD::OP_END: break;
}
return nullptr;
@@ -2383,7 +2306,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
// Test if V1 is a SCALAR_TO_VECTOR.
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return createSplat(DAG, dl, VT, V1.getOperand(0));
+ return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
// Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
// (and probably will turn into a SCALAR_TO_VECTOR once legalization
@@ -2398,28 +2321,26 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
}
}
if (IsScalarToVector)
- return createSplat(DAG, dl, VT, V1.getOperand(0));
+ return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0));
}
- return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32));
+ return DAG.getNode(HexagonISD::VSPLAT, dl, VT,
+ DAG.getConstant(Lane, dl, MVT::i32));
}
if (UseHVX) {
ArrayRef<int> Mask = SVN->getMask();
size_t MaskLen = Mask.size();
- int ElemSizeInBits = VT.getScalarSizeInBits();
- if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) ||
- (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) {
- // Return 1 for odd and 2 of even
- StridedLoadKind Pattern = isStridedLoad(Mask);
+ unsigned SizeInBits = VT.getScalarSizeInBits() * MaskLen;
+ if ((Subtarget.useHVXSglOps() && SizeInBits == 64 * 8) ||
+ (Subtarget.useHVXDblOps() && SizeInBits == 128 * 8)) {
+ StridedLoadKind Pattern = isStridedLoad(Mask);
if (Pattern == StridedLoadKind::NoPattern)
return SDValue();
- SDValue Vec0 = Op.getOperand(0);
- SDValue Vec1 = Op.getOperand(1);
- SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32);
- SDValue Ops[] = { Vec1, Vec0, StridePattern };
- return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops);
+ unsigned Opc = Pattern == StridedLoadKind::Even ? HexagonISD::VPACKE
+ : HexagonISD::VPACKO;
+ return DAG.getNode(Opc, dl, VT, {Op.getOperand(1), Op.getOperand(0)});
}
    // We used to assert in the "else" part here, but that is bad for Halide.
// Halide creates intermediate double registers by interleaving two
@@ -2476,13 +2397,13 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
if (VT.getSimpleVT() == MVT::v4i16) {
switch (Op.getOpcode()) {
case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VSRAH, dl, VT, V3, CommonSplat);
+ Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
break;
case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VSHLH, dl, VT, V3, CommonSplat);
+ Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
break;
case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VSRLH, dl, VT, V3, CommonSplat);
+ Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
break;
default:
return SDValue();
@@ -2490,13 +2411,13 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
} else if (VT.getSimpleVT() == MVT::v2i32) {
switch (Op.getOpcode()) {
case ISD::SRA:
- Result = DAG.getNode(HexagonISD::VSRAW, dl, VT, V3, CommonSplat);
+ Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat);
break;
case ISD::SHL:
- Result = DAG.getNode(HexagonISD::VSHLW, dl, VT, V3, CommonSplat);
+ Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat);
break;
case ISD::SRL:
- Result = DAG.getNode(HexagonISD::VSRLW, dl, VT, V3, CommonSplat);
+ Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat);
break;
default:
return SDValue();
@@ -2520,19 +2441,26 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (Size > 64)
return SDValue();
- APInt APSplatBits, APSplatUndef;
- unsigned SplatBitSize;
- bool HasAnyUndefs;
unsigned NElts = BVN->getNumOperands();
// Try to generate a SPLAT instruction.
- if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) &&
- (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
- HasAnyUndefs, 0, true) && SplatBitSize <= 16)) {
- unsigned SplatBits = APSplatBits.getZExtValue();
- int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >>
- (32 - SplatBitSize));
- return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32));
+ if (VT == MVT::v4i8 || VT == MVT::v4i16 || VT == MVT::v2i32) {
+ APInt APSplatBits, APSplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, 0, false)) {
+ if (SplatBitSize == VT.getVectorElementType().getSizeInBits()) {
+ unsigned ZV = APSplatBits.getZExtValue();
+ assert(SplatBitSize <= 32 && "Can only handle up to i32");
+ // Sign-extend the splat value from SplatBitSize to 32.
+ int32_t SV = SplatBitSize < 32
+ ? int32_t(ZV << (32-SplatBitSize)) >> (32-SplatBitSize)
+ : int32_t(ZV);
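+        // E.g. (illustrative) a v4i8 splat of 0xFF gives ZV = 255 and
+        // SV = -1 after sign-extending from 8 to 32 bits.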
+ return DAG.getNode(HexagonISD::VSPLAT, dl, VT,
+ DAG.getConstant(SV, dl, MVT::i32));
+ }
+ }
}
// Try to generate COMBINE to build v2i32 vectors.
@@ -2963,16 +2891,14 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
- // Custom lower some vector loads.
- case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
- case ISD::CTPOP: return LowerCTPOP(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::INLINEASM: return LowerINLINEASM(Op, DAG);
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
}
}
@@ -3026,37 +2952,25 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
}
case 'q': // q0-q3
- switch (VT.SimpleTy) {
+ switch (VT.getSizeInBits()) {
default:
- llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
- case MVT::v1024i1:
- case MVT::v512i1:
- case MVT::v32i16:
- case MVT::v16i32:
- case MVT::v64i8:
- case MVT::v8i64:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled vector size");
+ case 512:
return std::make_pair(0U, &Hexagon::VecPredRegsRegClass);
+ case 1024:
+ return std::make_pair(0U, &Hexagon::VecPredRegs128BRegClass);
}
case 'v': // V0-V31
- switch (VT.SimpleTy) {
+ switch (VT.getSizeInBits()) {
default:
- llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
- case MVT::v16i32:
- case MVT::v32i16:
- case MVT::v64i8:
- case MVT::v8i64:
+ llvm_unreachable("getRegForInlineAsmConstraint Unhandled vector size");
+ case 512:
return std::make_pair(0U, &Hexagon::VectorRegsRegClass);
- case MVT::v32i32:
- case MVT::v64i16:
- case MVT::v16i64:
- case MVT::v128i8:
+ case 1024:
if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl)
return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass);
return std::make_pair(0U, &Hexagon::VecDblRegsRegClass);
- case MVT::v256i8:
- case MVT::v128i16:
- case MVT::v64i32:
- case MVT::v32i64:
+ case 2048:
return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass);
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index a8ed29e..d66cbc9 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -50,42 +50,24 @@ namespace HexagonISD {
JT, // Jump table.
CP, // Constant pool.
- POPCOUNT,
COMBINE,
PACKHL,
- VSPLATB,
- VSPLATH,
- SHUFFEB,
- SHUFFEH,
- SHUFFOB,
- SHUFFOH,
- VSXTBH,
- VSXTBW,
- VSRAW,
- VSRAH,
- VSRLW,
- VSRLH,
- VSHLW,
- VSHLH,
- VCMPBEQ,
- VCMPBGT,
- VCMPBGTU,
- VCMPHEQ,
- VCMPHGT,
- VCMPHGTU,
- VCMPWEQ,
- VCMPWGT,
- VCMPWGTU,
+ VSPLAT,
+ VASL,
+ VASR,
+ VLSR,
INSERT,
INSERTRP,
EXTRACTU,
EXTRACTURP,
VCOMBINE,
- VPACK,
+ VPACKE,
+ VPACKO,
TC_RETURN,
EH_RETURN,
DCFETCH,
+ READCYCLE,
OP_END
};
@@ -146,6 +128,7 @@ namespace HexagonISD {
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
SDValue
@@ -163,7 +146,7 @@ namespace HexagonISD {
SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
SelectionDAG &DAG) const;
SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
- GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT,
+ GlobalAddressSDNode *GA, SDValue InFlag, EVT PtrVT,
unsigned ReturnReg, unsigned char OperandFlags) const;
SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
@@ -179,18 +162,21 @@ namespace HexagonISD {
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const override;
- bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td
deleted file mode 100644
index 7283d94..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td
+++ /dev/null
@@ -1,652 +0,0 @@
-//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// Hexagon Instruction Mappings
-//===----------------------------------------------------------------------===//
-
-
-def : InstAlias<"memb({GP}+#$addr) = $Nt.new",
- (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt.new",
- (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memw({GP}+#$addr) = $Nt.new",
- (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memb({GP}+#$addr) = $Nt",
- (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt",
- (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt.h",
- (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memw({GP}+#$addr) = $Nt",
- (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memd({GP}+#$addr) = $Nt",
- (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>;
-
-def : InstAlias<"$Nt = memb({GP}+#$addr)",
- (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>;
-def : InstAlias<"$Nt = memub({GP}+#$addr)",
- (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>;
-def : InstAlias<"$Nt = memh({GP}+#$addr)",
- (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>;
-def : InstAlias<"$Nt = memuh({GP}+#$addr)",
- (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>;
-def : InstAlias<"$Nt = memw({GP}+#$addr)",
- (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>;
-def : InstAlias<"$Nt = memd({GP}+#$addr)",
- (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>;
-
-// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt
-def : InstAlias<"memb($Rs) = $Rt",
- (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt",
- (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt.h",
- (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memw($Rs) = $Rt",
- (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memb($Rs) = $Rt.new",
- (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt.new",
- (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memw($Rs) = $Rt.new",
- (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memb($Rs) = #$S8",
- (S4_storeirb_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memh($Rs) = #$S8",
- (S4_storeirh_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memw($Rs) = #$S8",
- (S4_storeiri_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memd($Rs) = $Rtt",
- (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"memb($Rs) = setbit(#$U5)",
- (L4_ior_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memh($Rs) = setbit(#$U5)",
- (L4_ior_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memw($Rs) = setbit(#$U5)",
- (L4_ior_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memb($Rs) = clrbit(#$U5)",
- (L4_iand_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memh($Rs) = clrbit(#$U5)",
- (L4_iand_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memw($Rs) = clrbit(#$U5)",
- (L4_iand_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs)
-def : InstAlias<"$Rd = memb($Rs)",
- (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memub($Rs)",
- (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memh($Rs)",
- (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memuh($Rs)",
- (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memw($Rs)",
- (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memd($Rs)",
- (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memubh($Rs)",
- (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memubh($Rs)",
- (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = membh($Rs)",
- (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = membh($Rs)",
- (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memb_fifo($Rs)",
- (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memh_fifo($Rs)",
- (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X)
-// to: if ($Pt) $Rd = memXX($Rs)
-def : InstAlias<"if ($Pt) $Rd = memb($Rs)",
- (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memub($Rs)",
- (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memh($Rs)",
- (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memuh($Rs)",
- (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memw($Rs)",
- (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rdd = memd($Rs)",
- (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt
-// to: if ($Pt) memXX($Rs) = $Rt
-def : InstAlias<"if ($Pt) memb($Rs) = $Rt",
- (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt",
- (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h",
- (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = $Rt",
- (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memd($Rs) = $Rtt",
- (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new",
- (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new",
- (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new",
- (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new",
- (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new",
- (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new",
- (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-
-// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X)
-// to: if (!$Pt) $Rd = memXX($Rs)
-def : InstAlias<"if (!$Pt) $Rd = memb($Rs)",
- (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memub($Rs)",
- (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memh($Rs)",
- (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)",
- (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memw($Rs)",
- (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)",
- (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt
-// to: if (!$Pt) memXX($Rs) = $Rt
-def : InstAlias<"if (!$Pt) memb($Rs) = $Rt",
- (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt",
- (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h",
- (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = $Rt",
- (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt",
- (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new",
- (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new",
- (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new",
- (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new",
- (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new",
- (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new",
- (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memb($Rs) = #$S6",
- (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = #$S6",
- (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = #$S6",
- (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6",
- (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6",
- (S4_storeirhtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6",
- (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memb($Rs) = #$S6",
- (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = #$S6",
- (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = #$S6",
- (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6",
- (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6",
- (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6",
- (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -=
-// to: memXX($Rs) |= $Rt
-def : InstAlias<"memb($Rs) &= $Rt",
- (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) |= $Rt",
- (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) += $Rt",
- (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) -= $Rt",
- (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) += #$U5",
- (L4_iadd_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) -= #$U5",
- (L4_isub_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) &= $Rt",
- (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) |= $Rt",
- (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) += $Rt",
- (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) -= $Rt",
- (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) += #$U5",
- (L4_iadd_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) -= #$U5",
- (L4_isub_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) &= $Rt",
- (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) |= $Rt",
- (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) += $Rt",
- (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) -= $Rt",
- (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) += #$U5",
- (L4_iadd_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) -= #$U5",
- (L4_isub_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
- Requires<[UseMEMOP]>;
-
-//
-// Alias of: if ($Pv.new) memX($Rs) = $Rt
-// to: if (p3.new) memX(r17 + #0) = $Rt
-def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt",
- (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt",
- (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h",
- (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memw($Rs) = $Rt",
- (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt",
- (S4_pstorerdtnew_io
- PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt",
- (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt",
- (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h",
- (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt",
- (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt",
- (S4_pstorerdfnew_io
- PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-//
-// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ...
-// to: if ($Pt.new) $Rd = memub($Rs + #$u6_0)
-def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)",
- (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)",
- (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)",
- (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)",
- (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)",
- (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)",
- (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)",
- (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)",
- (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)",
- (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)",
- (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)",
- (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)",
- (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"dcfetch($Rs)",
- (Y2_dcfetchbo IntRegs:$Rs, 0), 0>;
-
-// Alias of some insn mappings, others must be handled by the parser
-def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
- (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
- (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-
-// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs)
-def : InstAlias<"$Rd = neg($Rs)",
- (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>;
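For example (illustrative registers): r2 = neg(r7) assembles as r2 = sub(#0, r7).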
-
-def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
-def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
-def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
-def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
-
-def : InstAlias<"$Pd = $Ps",
- (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>;
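There is no dedicated predicate-copy opcode here, so the alias synthesizes the copy as an OR of the source with itself; e.g. (illustrative registers) p0 = p1 is emitted as p0 = or(p1, p1).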
-
-def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)",
- (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>;
-
-def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)",
- (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"$Rd = mpyui($Rs,$Rt)",
- (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>;
-
-// maps if (!Pu) jumpr Rs -> if (!Pu) jumpr:nt Rs
-def : InstAlias<"if (!$Pu) jumpr $Rs",
- (J2_jumprf PredRegs:$Pu, IntRegs:$Rs)>,
- Requires<[HasV60T]>;
-
-// maps if (Pu) jumpr Rs -> if (Pu) jumpr:nt Rs
-def : InstAlias<"if ($Pu) jumpr $Rs",
- (J2_jumprt PredRegs:$Pu, IntRegs:$Rs)>,
- Requires<[HasV60T]>;
-
-// maps if (!Pu) jump $r15_2 -> if (!Pu) jump:nt $r15_2
-def : InstAlias<"if (!$Pu) jump $r15_2",
- (J2_jumpf PredRegs:$Pu, brtarget:$r15_2)>,
- Requires<[HasV60T]>;
-
-// maps if (Pu) jump $r15_2 -> if (Pu) jump:nt $r15_2
-def : InstAlias<"if ($Pu) jump $r15_2",
- (J2_jumpt PredRegs:$Pu, brtarget:$r15_2)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($src) jump $r15_2",
- (J2_jumpt PredRegs:$src, brtarget:$r15_2), 0>;
-
-def : InstAlias<"if (!$src) jump $r15_2",
- (J2_jumpf PredRegs:$src, brtarget:$r15_2), 0>;
-
-def : InstAlias<"if ($src1) jumpr $src2",
- (J2_jumprt PredRegs:$src1, IntRegs:$src2), 0>;
-
-def : InstAlias<"if (!$src1) jumpr $src2",
- (J2_jumprf PredRegs:$src1, IntRegs:$src2), 0>;
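The bare jump/jumpr spellings default to the :nt (not-taken) speculation hint; an explicit :t spelling selects the taken hint, so these aliases only need to cover the hint-less forms.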
-
-// maps Vdd = Vss to Vdd = V6_vassignp(Vss)
-def : InstAlias<"$Vdd = $Vss",
- (V6_vassignp VecDblRegs:$Vdd, VecDblRegs:$Vss)>,
- Requires<[HasV60T]>;
-
-// maps Vd = #0 to Vd = vxor(Vd, Vd)
-def : InstAlias<"$Vd = #0",
- (V6_vxor VectorRegs:$Vd, VectorRegs:$Vd, VectorRegs:$Vd)>,
- Requires<[HasV60T]>;
-
-// maps Vdd = #0 to Vdd = vsub(Vdd, Vdd)
-def : InstAlias<"$Vdd = #0",
- (V6_vsubw_dv VecDblRegs:$Vdd, VecDblRegs:$Vdd, VecDblRegs:$Vdd)>,
- Requires<[HasV60T]>;
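Both idioms synthesize a vector #0 without needing an immediate form: x ^ x and x - x are always zero, so e.g. (illustrative registers) v0 = #0 assembles as v0 = vxor(v0,v0), and a register pair is cleared with the pair subtract (V6_vsubw_dv) of the pair with itself.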
-
-// maps "$Qd = vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd = vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd &= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd &= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd |= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd |= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd ^= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)",
- (V6_veqh_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd = vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd = vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqw VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd &= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd &= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqw_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd |= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd |= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqw_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd ^= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)",
- (V6_veqw_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd = vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd = vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd &= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd &= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd |= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd |= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
-
-// maps "$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd ^= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)",
- (V6_veqb_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
- Requires<[HasV60T]>;
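Equality does not depend on signedness, so the unsigned .ub/.uh/.uw spellings can all reuse the signed compare instructions; e.g. (illustrative registers) q0 = vcmp.eq(v1.uw, v2.uw) produces the same V6_veqw encoding as the .w spelling.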
-
-// maps "$Rd.w = vextract($Vu, $Rs)" -> "$Rd = vextract($Vu, $Rs)"
-def : InstAlias<"$Rd.w = vextract($Vu, $Rs)",
- (V6_extractw IntRegs:$Rd, VectorRegs:$Vu, IntRegs:$Rs)>,
- Requires<[HasV60T]>;
-
-// Mapping from vtrans2x2(Vy32,Vx32,Rt32) to vshuff(Vy32,Vx32,Rt32)
-def : InstAlias<"vtrans2x2($Vy, $Vx, $Rt)",
- (V6_vshuff VectorRegs:$Vy, VectorRegs:$Vx, IntRegs:$Rt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmem($Rs)",
- (V6_vL32b_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmem($Rs):nt",
- (V6_vL32b_nt_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs)=$Vt",
- (V6_vS32b_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs)=$Vt.new",
- (V6_vS32b_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs):nt=$Vt.new",
- (V6_vS32b_nt_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Qv) vmem($Rs)=$Vt",
- (V6_vS32b_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Qv) vmem($Rs)=$Vt",
- (V6_vS32b_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Qv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Qv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmem($Rs)=$Vt",
- (V6_vS32b_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmem($Rs)=$Vt",
- (V6_vS32b_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmem($Rs):nt=$Vt",
- (V6_vS32b_nt_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmemu($Rs)",
- (V6_vL32Ub_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"vmemu($Rs)=$Vt",
- (V6_vS32Ub_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmemu($Rs)=$Vt",
- (V6_vS32Ub_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmemu($Rs)=$Vt",
- (V6_vS32Ub_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
- Requires<[HasV60T]>;
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td
deleted file mode 100644
index 280832f..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td
+++ /dev/null
@@ -1,1019 +0,0 @@
-class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { opc{14-4}, src2};
- let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst};
-}
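To illustrate how this template interleaves opcode and operand bits (bits 15-14 are not assigned by this class), here is a minimal C++ sketch; the helper name is mine, and the layout is read directly off the two `let Inst` assignments above:

  #include <cstdint>
  // Enc_COPROC_VX_3op_v packing: Inst{31-16} = opc{14-4}:src2{4-0},
  //                              Inst{13-0}  = opc{3}:src1{4-0}:opc{2-0}:dst{4-0}.
  uint32_t encode3opV(uint32_t opc, uint32_t dst, uint32_t src1, uint32_t src2) {
    uint32_t hi = (((opc >> 4) & 0x7FF) << 5) | (src2 & 0x1F);        // bits 31-16
    uint32_t lo = (((opc >> 3) & 0x1) << 13) | ((src1 & 0x1F) << 8) |
                  ((opc & 0x7) << 5)         | (dst & 0x1F);          // bits 13-0
    return (hi << 16) | lo;
  }

For V6_vtmpyb_enc (opc = 0b000110010000000) the top half therefore becomes 00011001000 followed by the five src2 bits.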
-
-class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>;
-class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>;
-class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>;
-class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>;
-class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>;
-class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>;
-class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>;
-class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>;
-class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>;
-class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>;
-class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>;
-class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>;
-class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>;
-class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>;
-class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>;
-class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>;
-class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>;
-class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>;
-class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>;
-class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>;
-class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>;
-class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>;
-class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>;
-class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>;
-class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>;
-class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>;
-class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>;
-class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>;
-class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>;
-class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>;
-class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>;
-class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>;
-class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>;
-class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>;
-class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>;
-class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>;
-class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>;
-class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>;
-class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>;
-class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>;
-class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>;
-class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>;
-class V6_vasrw_enc : Enc_COPROC_VX_3op_v<0b000110010110101>;
-class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>;
-class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>;
-class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>;
-class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>;
-class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>;
-class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>;
-class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>;
-class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>;
-class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>;
-class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>;
-class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>;
-class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>;
-class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>;
-class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>;
-class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>;
-class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>;
-class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>;
-class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>;
-class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>;
-class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>;
-class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>;
-class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>;
-class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>;
-class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>;
-class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>;
-class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>;
-class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>;
-class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>;
-class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>;
-class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>;
-class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>;
-class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>;
-class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>;
-class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>;
-class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>;
-class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>;
-class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>;
-class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>;
-class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>;
-class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>;
-class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>;
-class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>;
-class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>;
-class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>;
-class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>;
-class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>;
-class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>;
-class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>;
-class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>;
-class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>;
-class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>;
-class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>;
-class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>;
-class V6_vsububsat_enc : Enc_COPROC_VX_3op_v<0b000111000110000>;
-class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>;
-class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>;
-class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>;
-class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>;
-class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>;
-class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>;
-class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>;
-class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>;
-class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>;
-class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>;
-class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>;
-class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>;
-class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>;
-class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>;
-class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>;
-class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>;
-class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>;
-class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>;
-class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>;
-class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>;
-class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>;
-class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>;
-class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>;
-class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>;
-class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>;
-class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>;
-class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>;
-class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>;
-class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>;
-class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>;
-class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>;
-class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>;
-class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>;
-class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>;
-class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>;
-class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>;
-class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>;
-class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>;
-class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>;
-class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>;
-class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>;
-class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>;
-class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>;
-class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>;
-class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>;
-class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>;
-class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>;
-class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>;
-class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>;
-class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>;
-class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>;
-class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>;
-class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>;
-class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>;
-class V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>;
-class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>;
-class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>;
-class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>;
-class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>;
-class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>;
-class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>;
-class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>;
-class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>;
-class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>;
-class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>;
-class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>;
-class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>;
-class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>;
-class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>;
-class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>;
-class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>;
-class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>;
-class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>;
-class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>;
-class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>;
-class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>;
-class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>;
-class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>;
-class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>;
-class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>;
-class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>;
-class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>;
-class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>;
-class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>;
-class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>;
-class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>;
-class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>;
-class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>;
-
-class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon {
- bits<2> dst;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} };
- let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} };
-}
-
-class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>;
-class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>;
-class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>;
-class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>;
-class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>;
-class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>;
-class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>;
-class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>;
-class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>;
-class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>;
-class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>;
-class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>;
-class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>;
-class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>;
-class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>;
-class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>;
-class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>;
-class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>;
-class V6_vgtuh_or_enc : Enc_COPROC_VX_cmp<0b1001001011001>;
-class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>;
-class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>;
-class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>;
-class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>;
-class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>;
-class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>;
-class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>;
-class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>;
-class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>;
-class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>;
-class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>;
-class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>;
-class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>;
-class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>;
-class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>;
-class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>;
-class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>;
-class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>;
-class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>;
-
-class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> dst;
- bits<5> src2;
-
- let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} };
- let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>;
-class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>;
-class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>;
-class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>;
-class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>;
-class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>;
-class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>;
-class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>;
-class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>;
-class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>;
-class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>;
-class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>;
-
-class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
-
- let Inst{31-16} = { 0b00011110000000, opc{5-4} };
- let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>;
-class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>;
-class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>;
-class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>;
-class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>;
-class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>;
-class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>;
-class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>;
-class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>;
-class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>;
-class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>;
-class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>;
-class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>;
-class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>;
-class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>;
-class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>;
-class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>;
-class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>;
-class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>;
-class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>;
-class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>;
-class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>;
-class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>;
-class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>;
-class V6_vassign_enc : Enc_COPROC_VX_2op<0b111111>;
-
-class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<10> src2;
- bits<4> src2_vector;
-
- let src2_vector = src2{9-6};
- let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>;
-class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>;
-class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>;
-class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>;
-class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>;
-class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>;
-class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>;
-
-class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<11> src2;
- bits<4> src2_vector;
-
- let src2_vector = src2{10-7};
- let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
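The byte offset is encoded scaled by the vector size: the 64-byte class keeps only src2{9-6} and the 128-byte class only src2{10-7}, i.e. offset/64 or offset/128. For instance, a 64-byte-mode offset of 192 = 0b0011000000 stores 0b0011 (192/64 = 3) in the four-bit field.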
-
-class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>;
-class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>;
-class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>;
-class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>;
-class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>;
-class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>;
-class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>;
-
-class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<4> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{9-6};
- let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<11> src2;
- bits<4> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{10-7};
- let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>;
-class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>;
-class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>;
-
-class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>;
-class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>;
-class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<4> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{9-6};
- let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>;
-class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>;
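Compared with the plain store encodings above, the data operand here is only three bits (src3{2-0}): a .new store names the in-packet producer of the value rather than an arbitrary vector register, which is presumably why three bits suffice.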
-
-class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<11> src2;
- bits<4> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{10-7};
- let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>;
-class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>;
-
-class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<4> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{9-6};
- let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<11> src3;
- bits<4> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{10-7};
- let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>;
-class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>;
-class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>;
-class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>;
-class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>;
-class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>;
-class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>;
-class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>;
-class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>;
-class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>;
-
-class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>;
-class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>;
-class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>;
-class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>;
-class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>;
-class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>;
-class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>;
-class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>;
-class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>;
-class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<4> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{9-6};
- let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>;
-class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>;
-class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>;
-class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<11> src3;
- bits<4> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{10-7};
- let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>;
-class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>;
-class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>;
-class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>;
-
-// TODO: Change script to generate dst, src1, src2 instead of
-// dst, dst2, src1.
-class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<9> src2;
- bits<3> src2_vector;
-
- let src2_vector = src2{8-6};
- let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>;
-class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>;
-class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>;
-class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>;
-class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>;
-class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>;
-class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>;
-
-class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<10> src2;
- bits<3> src2_vector;
-
- let src2_vector = src2{9-7};
- let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>;
-class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>;
-class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>;
-class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>;
-class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>;
-class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>;
-class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>;
-
-
-// TODO: Change script to generate src1, src2 and src3 instead of
-// dst, src1, src2.
-class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<9> src2;
- bits<3> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{8-6};
- let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
- let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>;
-class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>;
-class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>;
-
-class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<3> src2_vector;
- bits<5> src3;
-
- let src2_vector = src2{9-7};
- let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
- let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>;
-class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>;
-class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>;
-
-// TODO: Change script to generate src1, src2 and src3 instead of
-// dst, src1, src2.
-class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<9> src2;
- bits<3> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{8-6};
- let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>;
-class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<10> src2;
- bits<3> src2_vector;
- bits<3> src3;
-
- let src2_vector = src2{9-7};
- let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>;
-class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>;
-
-// TODO: Change script to generate src1, src2, src3 and src4 instead of
-// dst, src1, src2, src3.
-class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3;
- bits<3> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{8-6};
- let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>;
-class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
-class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
-class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
-class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
-class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
-class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>;
-class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
-class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
-class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
-
-// TODO: Change script to generate src1, src2, src3 and src4 instead of
-// dst, src1, src2, src3.
-class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<3> src3_vector;
- bits<5> src4;
-
- let src3_vector = src3{9-7};
- let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>;
-class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>;
-class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>;
-class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>;
-class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>;
-class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>;
-class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>;
-class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>;
-class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>;
-class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3;
- bits<3> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{8-6};
- let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>;
-class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>;
-class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>;
-class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<10> src3;
- bits<3> src3_vector;
- bits<3> src4;
-
- let src3_vector = src3{9-7};
- let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>;
-class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>;
-class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>;
-class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>;
-
-class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<1> src2;
-
- let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} };
- let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>;
-class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>;
-class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>;
-class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>;
-class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>;
-class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>;
-class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>;
-
-class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<1> src2;
- bits<5> src3;
-
- let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} };
- let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>;
-class V6_vS32Ub_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0111>;
-class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>;
-
-class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<1> src2;
- bits<3> src3;
-
- let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} };
- let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} };
-}
-
-class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>;
-class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>;
-
-class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<1> src3;
- bits<5> src4;
-
- let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} };
- let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>;
-class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>;
-class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>;
-class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>;
-class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>;
-class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>;
-class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>;
-class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>;
-class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>;
-class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> src2;
- bits<1> src3;
- bits<3> src4;
-
- let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} };
- let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>;
-class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>;
-class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>;
-class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>;
-
-
-class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<1> src3;
-
- let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} };
- let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} };
-}
-
-class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>;
-class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>;
-class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>;
-class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>;
-class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>;
-class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>;
-
-class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} };
- let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} };
-}
-
-class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>;
-class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>;
-
-class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon {
- bits<5> src1;
- bits<5> src2;
- bits<5> src3;
-
- let Inst{31-16} = { 0b00011001111, src3{4-0} };
- let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} };
-}
-
-class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>;
-class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>;
-
-
-class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> dst;
- bits<5> src2;
-
- let Inst{31-16} = { 0b0001101000, opc{0}, 0b00000 };
- let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} };
-}
-
-class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>;
-class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>;
-
-class Enc_X_p3op<bits<8> opc> : OpcodeHexagon {
- bits<2> src1;
- bits<5> dst;
- bits<5> src2;
- bits<5> src3;
-
- let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} };
- let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} };
-}
-
-class V6_vnccombine_enc : Enc_X_p3op<0b00001000>;
-class V6_vccombine_enc : Enc_X_p3op<0b00001100>;
-
-class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<3> src3;
-
- let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} };
- let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>;
-class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>;
-class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>;
-class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>;
-class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>;
-class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>;
-class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>;
-class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>;
-class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>;
-class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>;
-class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>;
-class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>;
-class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>;
-class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>;
-class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>;
-
-class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<3> src3;
-
- let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} };
- let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} };
-}
-
-class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>;
-class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>;
-class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>;
-class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>;
-class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>;
-class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>;
-class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>;
-class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>;
-
-class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon {
- bits<2> dst;
- bits<2> src1;
- bits<2> src2;
-
- let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 };
- let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} };
-}
-
-class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>;
-class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>;
-class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>;
-class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>;
-class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>;
-
-class V6_pred_not_enc : OpcodeHexagon {
- bits<2> dst;
- bits<2> src1;
-
- let Inst{31-16} = { 0b0001111000000011 };
- let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} };
-}
-
-class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<5> src3;
-
- let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} };
- let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} };
-}
-
-class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>;
-class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>;
-
-class Enc_X_2op<bits<16> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
-
- let Inst{31-16} = { opc{15-5}, src1{4-0} };
- let Inst{13-0} = { opc{4-3}, 0b0000, opc{2-0}, dst{4-0} };
-}
-
-class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>;
-class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>;
-class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>;
-
-
-class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon {
- bits<2> dst;
- bits<5> src1;
-
- let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} };
- let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} };
-}
-
-class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>;
-class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>;
-
-class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<6> src2;
-
- let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} };
- let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} };
-}
-
-class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>;
-class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>;
-class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>;
-class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>;
-class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>;
-class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>;
-
-class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { opc{14-4}, src1{4-0} };
- let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>;
-class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>;
-class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>;
-class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>;
-class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>;
-class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>;
-class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>;
-class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>;
-
-class Enc_no_operands<bits<25> opc> : OpcodeHexagon {
-
- let Inst{31-16} = { opc{24-10}, 0 };
- let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 };
-}
-
-class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>;
-class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>;
-class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>;
-class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>;
-
-class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon {
- bits<5> src1;
-
- let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} };
- let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 };
-}
-
-class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>;
-class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>;
-
-class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon {
- bits<5> src1;
-
- let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 };
- let Inst{13-0} = { 0, src1{4-0}, 0b00000000 };
-}
-
-class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>;
-class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>;
-
-class A5_ACS_enc : OpcodeHexagon {
- bits<5> dst1;
- bits<2> dst2;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b11101010101, src1{4-0} };
- let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} };
-}
-
-class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<2> src3;
-
- let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} };
- let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} };
-}
-
-class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>;
-class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>;
-class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>;
-
-class V6_vhistq_enc : OpcodeHexagon {
- bits<2> src1;
-
- let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 };
- let Inst{13-0} = { 0b10000010000000 };
-}
-
-// TODO: Change script to generate dst1 instead of dst.
-class A6_vminub_RdP_enc : OpcodeHexagon {
- bits<5> dst1;
- bits<2> dst2;
- bits<5> src1;
- bits<5> src2;
-
- let Inst{31-16} = { 0b11101010111, src2{4-0} };
- let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} };
-}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
index fa3cccb..636a439 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -7,36 +7,6 @@
//
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Hexagon Instruction Flags +
-//
-// *** Must match HexagonBaseInfo.h ***
-//===----------------------------------------------------------------------===//
-
-class IType<bits<5> t> {
- bits<5> Value = t;
-}
-def TypePSEUDO : IType<0>;
-def TypeALU32 : IType<1>;
-def TypeCR : IType<2>;
-def TypeJR : IType<3>;
-def TypeJ : IType<4>;
-def TypeLD : IType<5>;
-def TypeST : IType<6>;
-def TypeSYSTEM : IType<7>;
-def TypeXTYPE : IType<8>;
-def TypeENDLOOP: IType<31>;
-
-// Maintain list of valid subtargets for each instruction.
-class SubTarget<bits<6> value> {
- bits<6> Value = value;
-}
-
-def HasAnySubT : SubTarget<0x3f>; // 111111
-def HasV5SubT : SubTarget<0x3e>; // 111110
-def HasV55SubT : SubTarget<0x3c>; // 111100
-def HasV60SubT : SubTarget<0x38>; // 111000
-
// Addressing modes for load/store instructions
class AddrModeType<bits<3> value> {
bits<3> Value = value;
@@ -54,6 +24,7 @@ class MemAccessSize<bits<4> value> {
bits<4> Value = value;
}
+// MemAccessSize is represented as 1+log2(N) where N is size in bytes.
def NoMemAccess : MemAccessSize<0>;// Not a memory access instruction.
def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb).
def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh).
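With N in bytes the values line up: ByteAccess = 1+log2(1) = 1, HalfWordAccess = 1+log2(2) = 2, and Vector128Access = 1+log2(128) = 8.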
@@ -70,10 +41,9 @@ def Vector128Access : MemAccessSize<8>;// Vector access instruction (memv)
class OpcodeHexagon {
field bits<32> Inst = ?; // Default to an invalid insn.
bits<4> IClass = 0; // ICLASS
+ bits<1> zero = 0;
let Inst{31-28} = IClass;
-
- bits<1> zero = 0;
}
class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
@@ -99,89 +69,89 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
// Instruction type according to the ISA.
IType Type = type;
- let TSFlags{4-0} = Type.Value;
+ let TSFlags{5-0} = Type.Value;
// Solo instructions, i.e., those that cannot be in a packet with others.
bits<1> isSolo = 0;
- let TSFlags{5} = isSolo;
+ let TSFlags{6} = isSolo;
// Packed only with A or X-type instructions.
bits<1> isSoloAX = 0;
- let TSFlags{6} = isSoloAX;
+ let TSFlags{7} = isSoloAX;
// Only A-type instruction in first slot or nothing.
bits<1> isSoloAin1 = 0;
- let TSFlags{7} = isSoloAin1;
+ let TSFlags{8} = isSoloAin1;
// Predicated instructions.
bits<1> isPredicated = 0;
- let TSFlags{8} = isPredicated;
+ let TSFlags{9} = isPredicated;
bits<1> isPredicatedFalse = 0;
- let TSFlags{9} = isPredicatedFalse;
+ let TSFlags{10} = isPredicatedFalse;
bits<1> isPredicatedNew = 0;
- let TSFlags{10} = isPredicatedNew;
+ let TSFlags{11} = isPredicatedNew;
bits<1> isPredicateLate = 0;
- let TSFlags{11} = isPredicateLate; // Late predicate producer insn.
+ let TSFlags{12} = isPredicateLate; // Late predicate producer insn.
// New-value insn helper fields.
bits<1> isNewValue = 0;
- let TSFlags{12} = isNewValue; // New-value consumer insn.
+ let TSFlags{13} = isNewValue; // New-value consumer insn.
bits<1> hasNewValue = 0;
- let TSFlags{13} = hasNewValue; // New-value producer insn.
+ let TSFlags{14} = hasNewValue; // New-value producer insn.
bits<3> opNewValue = 0;
- let TSFlags{16-14} = opNewValue; // New-value produced operand.
+ let TSFlags{17-15} = opNewValue; // New-value produced operand.
bits<1> isNVStorable = 0;
- let TSFlags{17} = isNVStorable; // Store that can become new-value store.
+ let TSFlags{18} = isNVStorable; // Store that can become new-value store.
bits<1> isNVStore = 0;
- let TSFlags{18} = isNVStore; // New-value store insn.
+ let TSFlags{19} = isNVStore; // New-value store insn.
bits<1> isCVLoadable = 0;
- let TSFlags{19} = isCVLoadable; // Load that can become cur-value load.
+ let TSFlags{20} = isCVLoadable; // Load that can become cur-value load.
bits<1> isCVLoad = 0;
- let TSFlags{20} = isCVLoad; // Cur-value load insn.
+ let TSFlags{21} = isCVLoad; // Cur-value load insn.
// Immediate extender helper fields.
bits<1> isExtendable = 0;
- let TSFlags{21} = isExtendable; // Insn may be extended.
+ let TSFlags{22} = isExtendable; // Insn may be extended.
bits<1> isExtended = 0;
- let TSFlags{22} = isExtended; // Insn must be extended.
+ let TSFlags{23} = isExtended; // Insn must be extended.
bits<3> opExtendable = 0;
- let TSFlags{25-23} = opExtendable; // Which operand may be extended.
+ let TSFlags{26-24} = opExtendable; // Which operand may be extended.
bits<1> isExtentSigned = 0;
- let TSFlags{26} = isExtentSigned; // Signed or unsigned range.
+ let TSFlags{27} = isExtentSigned; // Signed or unsigned range.
bits<5> opExtentBits = 0;
- let TSFlags{31-27} = opExtentBits; //Number of bits of range before extending.
+  let TSFlags{32-28} = opExtentBits; // Number of bits of range before extending.
bits<2> opExtentAlign = 0;
- let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
-
- // If an instruction is valid on a subtarget, set the corresponding
- // bit from validSubTargets.
- // By default, instruction is valid on all subtargets.
- SubTarget validSubTargets = HasAnySubT;
- let TSFlags{39-34} = validSubTargets.Value;
+ let TSFlags{34-33} = opExtentAlign; // Alignment exponent before extending.
// Addressing mode for load/store instructions.
AddrModeType addrMode = NoAddrMode;
- let TSFlags{42-40} = addrMode.Value;
+ let TSFlags{43-41} = addrMode.Value;
// Memory access size for mem access instructions (load/store)
MemAccessSize accessSize = NoMemAccess;
- let TSFlags{46-43} = accessSize.Value;
+ let TSFlags{47-44} = accessSize.Value;
bits<1> isTaken = 0;
- let TSFlags {47} = isTaken; // Branch prediction.
+ let TSFlags {48} = isTaken; // Branch prediction.
bits<1> isFP = 0;
- let TSFlags {48} = isFP; // Floating-point.
+ let TSFlags {49} = isFP; // Floating-point.
bits<1> hasNewValue2 = 0;
- let TSFlags{50} = hasNewValue2; // Second New-value producer insn.
+ let TSFlags{51} = hasNewValue2; // Second New-value producer insn.
bits<3> opNewValue2 = 0;
- let TSFlags{53-51} = opNewValue2; // Second New-value produced operand.
+ let TSFlags{54-52} = opNewValue2; // Second New-value produced operand.
bits<1> isAccumulator = 0;
- let TSFlags{54} = isAccumulator;
+ let TSFlags{55} = isAccumulator;
+
+ bits<1> prefersSlot3 = 0;
+  let TSFlags{56} = prefersSlot3; // Complex XU instructions prefer slot 3.
bit cofMax1 = 0;
let TSFlags{60} = cofMax1;
+ bit CVINew = 0;
+ let TSFlags{61} = CVINew;
+
// Fields used for relation models.
bit isNonTemporal = 0;
string isNT = ""; // set to "true" for non-temporal vector stores.
@@ -200,9 +170,13 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
let NValueST = !if(isNVStore, "true", "false");
let isNT = !if(isNonTemporal, "true", "false");
+ let hasSideEffects = 0;
// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
}
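The bit ranges assigned above have matching Pos/Mask constants on the C++ side; a minimal sketch of that decoding pattern (names and values here are illustrative — the authoritative definitions live in MCTargetDesc/HexagonBaseInfo.h):

#include <cstdint>

// Illustrative mirror of two of the TSFlags fields laid out above.
enum : uint64_t {
  TypePos = 0, TypeMask = 0x3f, // TSFlags{5-0}: instruction type
  SoloPos = 6, SoloMask = 0x1   // TSFlags{6}:   solo instruction
};

static inline unsigned getTypeFromTSFlags(uint64_t TSFlags) {
  return (TSFlags >> TypePos) & TypeMask;
}

static inline bool isSoloFromTSFlags(uint64_t TSFlags) {
  return (TSFlags >> SoloPos) & SoloMask;
}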
+class HInst<dag outs, dag ins, string asmstr, InstrItinClass itin, IType type> :
+ InstHexagon<outs, ins, asmstr, [], "", itin, type>;
+
//===----------------------------------------------------------------------===//
// Instruction Classes Definitions +
//===----------------------------------------------------------------------===//
@@ -214,31 +188,10 @@ class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
-let mayLoad = 1 in
-class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : LDInst<outs, ins, asmstr, pattern, cstr>;
-
class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : LDInst<outs, ins, asmstr, pattern, cstr>;
-
-// LD Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : LDInst<outs, ins, asmstr, pattern, cstr>;
-
-let mayLoad = 1 in
-class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+ string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
-let mayLoad = 1 in
-class LD1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
-
// ST Instruction Class in V2/V3 can take SLOT0 only.
// ST Instruction Class in V4 can take SLOT0 & SLOT1.
// Definition of the instruction class CHANGED from V2/V3 to V4.
@@ -247,123 +200,9 @@ class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
-class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : STInst<outs, ins, asmstr, pattern, cstr>;
-
-let mayStore = 1 in
-class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
-
-// Same as ST0Inst but doesn't derive from OpcodeHexagon.
-let mayStore = 1 in
-class ST1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
-
-// ST Instruction Class in V2/V3 can take SLOT0 only.
-// ST Instruction Class in V4 can take SLOT0 & SLOT1.
-// Definition of the instruction class CHANGED from V2/V3 to V4.
-class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
- : STInst<outs, ins, asmstr, pattern, cstr, itin>;
-
-// SYSTEM Instruction Class in V4 can take SLOT0 only
-// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
-class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ST_tc_3stall_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>,
- OpcodeHexagon;
-
-// ALU32 Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>, OpcodeHexagon;
-
-// ALU64 Instruction Class in V2/V3.
-// XTYPE Instruction Class in V4.
-// Definition of the instruction class NOT CHANGED.
-// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
-class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
- OpcodeHexagon;
-
-class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
- : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-
-// M Instruction Class in V2/V3.
-// XTYPE Instruction Class in V4.
-// Definition of the instruction class NOT CHANGED.
-// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
-class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
- OpcodeHexagon;
-
-// Same as above but doesn't derive from OpcodeHexagon
-class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
-
-// M Instruction Class in V2/V3.
-// XTYPE Instruction Class in V4.
-// Definition of the instruction class NOT CHANGED.
-// Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
-class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = M_tc_2_SLOT23>
- : MInst<outs, ins, asmstr, pattern, cstr, itin>;
-
-// S Instruction Class in V2/V3.
-// XTYPE Instruction Class in V4.
-// Definition of the instruction class NOT CHANGED.
-// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
-class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
- OpcodeHexagon;
-
-class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
-
-// S Instruction Class in V2/V3.
-// XTYPE Instruction Class in V4.
-// Definition of the instruction class NOT CHANGED.
-// Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
-class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23>
- : SInst<outs, ins, asmstr, pattern, cstr, itin>;
-
-// J Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
-
-class JInst_CJUMP_UCJUMP<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
-
-// JR Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>, OpcodeHexagon;
-
-// CR Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = CR_tc_2early_SLOT3>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCR>, OpcodeHexagon;
-
let isCodeGenOnly = 1, isPseudo = 1 in
class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = J_tc_2early_SLOT0123>
+ string cstr = "", InstrItinClass itin = tc_ENDLOOP>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeENDLOOP>,
OpcodeHexagon;
@@ -383,47 +222,6 @@ class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
// Instruction Classes Definitions -
//===----------------------------------------------------------------------===//
-
-//
-// ALU32 patterns
-//.
-class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-//
-// ALU64 patterns.
-//
-class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
- : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
- : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-// Post increment ST Instruction.
-class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : STInst<outs, ins, asmstr, pattern, cstr>;
-
-// Post increment LD Instruction.
-class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : LDInst<outs, ins, asmstr, pattern, cstr>;
-
//===----------------------------------------------------------------------===//
// V4 Instruction Format Definitions +
//===----------------------------------------------------------------------===//
@@ -431,7 +229,7 @@ class LDInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
include "HexagonInstrFormatsV4.td"
//===----------------------------------------------------------------------===//
-// V4 Instruction Format Definitions +
+// V55 Instruction Format Definitions +
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -441,5 +239,5 @@ include "HexagonInstrFormatsV4.td"
include "HexagonInstrFormatsV60.td"
//===----------------------------------------------------------------------===//
-// V60 Instruction Format Definitions +
+// V62 Instruction Format Definitions +
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index 493d047..c5fa259 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -1,4 +1,4 @@
-//==- HexagonInstrFormats.td - Hexagon Instruction Formats --*- tablegen -*-==//
+//==- HexagonInstrFormatsV4.td - Hexagon Instruction Formats --*- tablegen -==//
//
// The LLVM Compiler Infrastructure
//
@@ -11,18 +11,6 @@
//
//===----------------------------------------------------------------------===//
-//----------------------------------------------------------------------------//
-// Hexagon Instruction Flags
-//
-// *** Must match BaseInfo.h ***
-//----------------------------------------------------------------------------//
-
-def TypeV4LDST : IType<9>;
-def TypeNV : IType<10>;
-def TypeDUPLEX : IType<11>;
-def TypeCOMPOUND : IType<12>;
-def TypePREFIX : IType<30>;
-
// Duplex Instruction Class Declaration
//===----------------------------------------------------------------------===//
@@ -61,7 +49,7 @@ class InstDuplex<bits<4> iClass, list<dag> pattern = [],
// *** Must match MCTargetDesc/HexagonBaseInfo.h ***
- let TSFlags{4-0} = Type.Value;
+ let TSFlags{5-0} = Type.Value;
// Predicated instructions.
bits<1> isPredicated = 0;
@@ -97,64 +85,3 @@ class InstDuplex<bits<4> iClass, list<dag> pattern = [],
bits<2> opExtentAlign = 0;
let TSFlags{28-27} = opExtentAlign; // Alignment exponent before extending.
}
-
-//----------------------------------------------------------------------------//
-// Instruction Classes Definitions
-//----------------------------------------------------------------------------//
-
-//
-// NV type instructions.
-//
-class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>, OpcodeHexagon;
-
-class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
- : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
-
-// Definition of Post increment new value store.
-class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
- : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
-
-// Post increment ST Instruction.
-let mayStore = 1 in
-class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
- : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
-
-// New-value conditional branch.
-class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : NVInst<outs, ins, asmstr, pattern, cstr>;
-
-let mayLoad = 1, mayStore = 1 in
-class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeV4LDST>,
- OpcodeHexagon;
-
-class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
- : MEMInst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
- : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
- TypePREFIX>, OpcodeHexagon;
-
-class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypeDUPLEX>,
- OpcodeHexagon;
-
-class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>,
- OpcodeHexagon;
-
-class CJInst_JMPSET<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "">
- : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>,
- OpcodeHexagon;
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
index b9f4373..14bda0e 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -12,227 +12,11 @@
//===----------------------------------------------------------------------===//
//----------------------------------------------------------------------------//
-// Hexagon Instruction Flags +
-//
-// *** Must match BaseInfo.h ***
-//----------------------------------------------------------------------------//
-
-def TypeCVI_VA : IType<13>;
-def TypeCVI_VA_DV : IType<14>;
-def TypeCVI_VX : IType<15>;
-def TypeCVI_VX_DV : IType<16>;
-def TypeCVI_VP : IType<17>;
-def TypeCVI_VP_VS : IType<18>;
-def TypeCVI_VS : IType<19>;
-def TypeCVI_VINLANESAT : IType<20>;
-def TypeCVI_VM_LD : IType<21>;
-def TypeCVI_VM_TMP_LD : IType<22>;
-def TypeCVI_VM_CUR_LD : IType<23>;
-def TypeCVI_VM_VP_LDU : IType<24>;
-def TypeCVI_VM_ST : IType<25>;
-def TypeCVI_VM_NEW_ST : IType<26>;
-def TypeCVI_VM_STU : IType<27>;
-def TypeCVI_HIST : IType<28>;
-//----------------------------------------------------------------------------//
// Instruction Classes Definitions +
//----------------------------------------------------------------------------//
-let validSubTargets = HasV60SubT in
-{
class CVI_VA_Resource<dag outs, dag ins, string asmstr,
list<dag> pattern = [], string cstr = "",
InstrItinClass itin = CVI_VA>
: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VA_DV_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VA_DV>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA_DV>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VX_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VX_LONG>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VX_Resource_late<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VX_LATE>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
- Requires<[HasV60T, UseHVX]>;
-
-class CVI_VX_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VX>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VX_DV_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VX_DV>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VX_DV_Slot2_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VX_DV_SLOT2>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VX_DV_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VX_DV_LONG>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VP_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VP_LONG>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VP_VS_Resource_early<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VP_VS_EARLY>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VP_VS_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VP_VS_LONG>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VP_VS_Resource_long_early<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VP_VS_LONG_EARLY>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VS_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VS>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VINLANESAT_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VINLANESAT>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VINLANESAT>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VS_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VS>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_LD_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_LD>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_LD_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_LD>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_TMP_LD_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_TMP_LD>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_TMP_LD_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_TMP_LD>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_CUR_LD_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_CUR_LD>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_CUR_LD>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_VP_LDU_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_VP_LDU>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_VP_LDU_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_VP_LDU>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_ST_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_ST>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_ST_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_ST>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_NEW_ST_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_NEW_ST>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_NEW_ST_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_NEW_ST>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_STU_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_STU>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_VM_STU_Resource_long<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VM_STU>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-
-class CVI_HIST_Resource<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_HIST>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
- OpcodeHexagon, Requires<[HasV60T, UseHVX]>;
-}
-
-let validSubTargets = HasV60SubT in
-{
-class CVI_VA_Resource1<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VA>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>,
- Requires<[HasV60T, UseHVX]>;
-
-class CVI_VX_DV_Resource1<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_VX_DV>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>,
- Requires<[HasV60T, UseHVX]>;
-
-class CVI_HIST_Resource1<dag outs, dag ins, string asmstr,
- list<dag> pattern = [], string cstr = "",
- InstrItinClass itin = CVI_HIST>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>,
- Requires<[HasV60T, UseHVX]>;
-}
-
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 0a7dc6b..c77c669 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "HexagonInstrInfo.h"
#include "Hexagon.h"
#include "HexagonHazardRecognizer.h"
-#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -57,8 +57,9 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#define GET_INSTRMAP_INFO
-#include "HexagonGenInstrInfo.inc"
+#include "HexagonDepTimingClasses.h"
#include "HexagonGenDFAPacketizer.inc"
+#include "HexagonGenInstrInfo.inc"
cl::opt<bool> ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden,
cl::init(false), cl::desc("Do not consider inline-asm a scheduling/"
@@ -93,10 +94,6 @@ static cl::opt<bool> UseDFAHazardRec("dfa-hazard-rec",
///
/// Constants for Hexagon instructions.
///
-const int Hexagon_MEMV_OFFSET_MAX_128B = 896; // #s4: -8*128...7*128
-const int Hexagon_MEMV_OFFSET_MIN_128B = -1024; // #s4
-const int Hexagon_MEMV_OFFSET_MAX = 448; // #s4: -8*64...7*64
-const int Hexagon_MEMV_OFFSET_MIN = -512; // #s4
const int Hexagon_MEMW_OFFSET_MAX = 4095;
const int Hexagon_MEMW_OFFSET_MIN = -4096;
const int Hexagon_MEMD_OFFSET_MAX = 8191;
@@ -152,10 +149,11 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
/// On Hexagon, we have two instructions used to set-up the hardware loop
/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
/// to indicate the end of a loop.
-static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
+static MachineInstr *findLoopInstr(MachineBasicBlock *BB, unsigned EndLoopOp,
+ MachineBasicBlock *TargetBB,
SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
- int LOOPi;
- int LOOPr;
+ unsigned LOOPi;
+ unsigned LOOPr;
if (EndLoopOp == Hexagon::ENDLOOP0) {
LOOPi = Hexagon::J2_loop0i;
LOOPr = Hexagon::J2_loop0r;
@@ -165,26 +163,24 @@ static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
}
// The loop set-up instruction will be in a predecessor block
- for (MachineBasicBlock::pred_iterator PB = BB->pred_begin(),
- PE = BB->pred_end(); PB != PE; ++PB) {
+ for (MachineBasicBlock *PB : BB->predecessors()) {
    // If this block has already been visited, skip it.
- if (!Visited.insert(*PB).second)
+ if (!Visited.insert(PB).second)
continue;
- if (*PB == BB)
+ if (PB == BB)
continue;
- for (MachineBasicBlock::reverse_instr_iterator I = (*PB)->instr_rbegin(),
- E = (*PB)->instr_rend(); I != E; ++I) {
- int Opc = I->getOpcode();
+ for (auto I = PB->instr_rbegin(), E = PB->instr_rend(); I != E; ++I) {
+ unsigned Opc = I->getOpcode();
if (Opc == LOOPi || Opc == LOOPr)
return &*I;
- // We've reached a different loop, which means the loop0 has been removed.
- if (Opc == EndLoopOp)
+      // We've reached a different loop, which means the loop instruction has
+      // been removed.
+ if (Opc == EndLoopOp && I->getOperand(0).getMBB() != TargetBB)
return nullptr;
}
// Check the predecessors for the LOOP instruction.
- MachineInstr *loop = findLoopInstr(*PB, EndLoopOp, Visited);
- if (loop)
- return loop;
+ if (MachineInstr *Loop = findLoopInstr(PB, EndLoopOp, TargetBB, Visited))
+ return Loop;
}
return nullptr;
}
@@ -254,15 +250,19 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
case Hexagon::L2_loadri_io:
case Hexagon::L2_loadrd_io:
case Hexagon::V6_vL32b_ai:
+ case Hexagon::V6_vL32b_nt_ai:
case Hexagon::V6_vL32b_ai_128B:
+ case Hexagon::V6_vL32b_nt_ai_128B:
case Hexagon::V6_vL32Ub_ai:
case Hexagon::V6_vL32Ub_ai_128B:
case Hexagon::LDriw_pred:
case Hexagon::LDriw_mod:
case Hexagon::PS_vloadrq_ai:
case Hexagon::PS_vloadrw_ai:
+ case Hexagon::PS_vloadrw_nt_ai:
case Hexagon::PS_vloadrq_ai_128B:
- case Hexagon::PS_vloadrw_ai_128B: {
+ case Hexagon::PS_vloadrw_ai_128B:
+ case Hexagon::PS_vloadrw_nt_ai_128B: {
const MachineOperand OpFI = MI.getOperand(1);
if (!OpFI.isFI())
return 0;
@@ -597,7 +597,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Since we're adding an ENDLOOP, there better be a LOOP instruction.
// Check for it, and change the BB target if needed.
SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+ MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, Cond[1].getMBB(),
+ VisitedBBs);
assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
Loop->getOperand(0).setMBB(TBB);
    // Add the ENDLOOP after finding the LOOP0.
@@ -637,7 +638,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Since we're adding an ENDLOOP, there better be a LOOP instruction.
// Check for it, and change the BB target if needed.
SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+ MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, Cond[1].getMBB(),
+ VisitedBBs);
assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
Loop->getOperand(0).setMBB(TBB);
    // Add the ENDLOOP after finding the LOOP0.
@@ -687,7 +689,8 @@ unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
DebugLoc DL = Cmp.getDebugLoc();
SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
- MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(), VisitedBBs);
+ MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(),
+ Cmp.getOperand(0).getMBB(), VisitedBBs);
if (!Loop)
return 0;
// If the loop trip count is a compile-time value, then just change the
@@ -867,6 +870,9 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
unsigned KillFlag = getKillRegState(isKill);
+ bool HasAlloca = MFI.hasVarSizedObjects();
+ const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ const HexagonFrameLowering &HFI = *HST.getFrameLowering();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
@@ -897,24 +903,36 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 128 ? Hexagon::V6_vS32Ub_ai_128B
: Hexagon::V6_vS32b_ai_128B;
BuildMI(MBB, I, DL, get(Opc))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 64 ? Hexagon::V6_vS32Ub_ai
: Hexagon::V6_vS32b_ai;
BuildMI(MBB, I, DL, get(Opc))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 64 ? Hexagon::PS_vstorerwu_ai
: Hexagon::PS_vstorerw_ai;
BuildMI(MBB, I, DL, get(Opc))
.addFrameIndex(FI).addImm(0)
.addReg(SrcReg, KillFlag).addMemOperand(MMO);
} else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 128 ? Hexagon::PS_vstorerwu_ai_128B
: Hexagon::PS_vstorerw_ai_128B;
BuildMI(MBB, I, DL, get(Opc))
@@ -933,6 +951,9 @@ void HexagonInstrInfo::loadRegFromStackSlot(
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
+ bool HasAlloca = MFI.hasVarSizedObjects();
+ const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+ const HexagonFrameLowering &HFI = *HST.getFrameLowering();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
@@ -957,21 +978,33 @@ void HexagonInstrInfo::loadRegFromStackSlot(
BuildMI(MBB, I, DL, get(Hexagon::PS_vloadrq_ai), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::VecDblRegs128BRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 128 ? Hexagon::PS_vloadrwu_ai_128B
: Hexagon::PS_vloadrw_ai_128B;
BuildMI(MBB, I, DL, get(Opc), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::VectorRegs128BRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 128 ? Hexagon::V6_vL32Ub_ai_128B
: Hexagon::V6_vL32b_ai_128B;
BuildMI(MBB, I, DL, get(Opc), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::VectorRegsRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 64 ? Hexagon::V6_vL32Ub_ai
: Hexagon::V6_vL32b_ai;
BuildMI(MBB, I, DL, get(Opc), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO);
} else if (Hexagon::VecDblRegsRegClass.hasSubClassEq(RC)) {
+ // If there are variable-sized objects, spills will not be aligned.
+ if (HasAlloca)
+ Align = HFI.getStackAlignment();
unsigned Opc = Align < 64 ? Hexagon::PS_vloadrwu_ai
: Hexagon::PS_vloadrw_ai;
BuildMI(MBB, I, DL, get(Opc), DestReg)
@@ -1074,13 +1107,13 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
MachineInstr *MI1New =
BuildMI(MBB, MI, DL, get(NewOpc))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addImm(MI.getOperand(1).getImm())
.addReg(SrcSubLo)
.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI1New->getOperand(0).setIsKill(false);
BuildMI(MBB, MI, DL, get(NewOpc))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
// The Vectors are indexed in multiples of vector size.
.addImm(MI.getOperand(1).getImm() + Offset)
.addReg(SrcSubHi)
@@ -1106,15 +1139,14 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned DstReg = MI.getOperand(0).getReg();
unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6;
- MachineInstr *MI1New =
- BuildMI(MBB, MI, DL, get(NewOpc),
- HRI.getSubReg(DstReg, Hexagon::vsub_lo))
- .addOperand(MI.getOperand(1))
- .addImm(MI.getOperand(2).getImm());
+ MachineInstr *MI1New = BuildMI(MBB, MI, DL, get(NewOpc),
+ HRI.getSubReg(DstReg, Hexagon::vsub_lo))
+ .add(MI.getOperand(1))
+ .addImm(MI.getOperand(2).getImm())
+ .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
MI1New->getOperand(1).setIsKill(false);
- BuildMI(MBB, MI, DL, get(NewOpc),
- HRI.getSubReg(DstReg, Hexagon::vsub_hi))
- .addOperand(MI.getOperand(1))
+ BuildMI(MBB, MI, DL, get(NewOpc), HRI.getSubReg(DstReg, Hexagon::vsub_hi))
+ .add(MI.getOperand(1))
// The Vectors are indexed in multiples of vector size.
.addImm(MI.getOperand(2).getImm() + Offset)
.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -1222,23 +1254,29 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &Op1 = MI.getOperand(1);
const MachineOperand &Op2 = MI.getOperand(2);
const MachineOperand &Op3 = MI.getOperand(3);
- LivePhysRegs LiveAtMI(&HRI);
+ LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+ unsigned PReg = Op1.getReg();
+ assert(Op1.getSubReg() == 0);
+ unsigned PState = getRegState(Op1);
+
if (Op0.getReg() != Op2.getReg()) {
+ unsigned S = Op0.getReg() != Op3.getReg() ? PState & ~RegState::Kill
+ : PState;
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vcmov))
- .addOperand(Op0)
- .addOperand(Op1)
- .addOperand(Op2);
+ .add(Op0)
+ .addReg(PReg, S)
+ .add(Op2);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
IsDestLive = true;
}
if (Op0.getReg() != Op3.getReg()) {
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vncmov))
- .addOperand(Op0)
- .addOperand(Op1)
- .addOperand(Op3);
+ .add(Op0)
+ .addReg(PReg, PState)
+ .add(Op3);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
}
@@ -1251,18 +1289,24 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineOperand &Op1 = MI.getOperand(1);
MachineOperand &Op2 = MI.getOperand(2);
MachineOperand &Op3 = MI.getOperand(3);
- LivePhysRegs LiveAtMI(&HRI);
+ LivePhysRegs LiveAtMI(HRI);
getLiveRegsAt(LiveAtMI, MI);
bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+ unsigned PReg = Op1.getReg();
+ assert(Op1.getSubReg() == 0);
+ unsigned PState = getRegState(Op1);
if (Op0.getReg() != Op2.getReg()) {
+ unsigned S = Op0.getReg() != Op3.getReg() ? PState & ~RegState::Kill
+ : PState;
unsigned SrcLo = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_lo);
unsigned SrcHi = HRI.getSubReg(Op2.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vccombine))
- .addOperand(Op0)
- .addOperand(Op1)
- .addReg(SrcHi)
- .addReg(SrcLo);
+ .add(Op0)
+ .addReg(PReg, S)
+ .add(Op1)
+ .addReg(SrcHi)
+ .addReg(SrcLo);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
IsDestLive = true;
@@ -1271,10 +1315,10 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned SrcLo = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_lo);
unsigned SrcHi = HRI.getSubReg(Op3.getReg(), Hexagon::vsub_hi);
auto T = BuildMI(MBB, MI, DL, get(Hexagon::V6_vnccombine))
- .addOperand(Op0)
- .addOperand(Op1)
- .addReg(SrcHi)
- .addReg(SrcLo);
+ .add(Op0)
+ .addReg(PReg, PState)
+ .addReg(SrcHi)
+ .addReg(SrcLo);
if (IsDestLive)
T.addReg(Op0.getReg(), RegState::Implicit);
}
@@ -1376,7 +1420,7 @@ bool HexagonInstrInfo::PredicateInstruction(
MachineOperand &Op = MI.getOperand(NOp);
if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
break;
- T.addOperand(Op);
+ T.add(Op);
NOp++;
}
@@ -1386,7 +1430,7 @@ bool HexagonInstrInfo::PredicateInstruction(
assert(GotPredReg);
T.addReg(PredReg, PredRegFlags);
while (NOp < NumOps)
- T.addOperand(MI.getOperand(NOp++));
+ T.add(MI.getOperand(NOp++));
MI.setDesc(get(PredOpc));
while (unsigned n = MI.getNumOperands())
@@ -1413,19 +1457,37 @@ bool HexagonInstrInfo::DefinesPredicate(
auto &HRI = getRegisterInfo();
for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) {
MachineOperand MO = MI.getOperand(oper);
- if (MO.isReg() && MO.isDef()) {
+ if (MO.isReg()) {
+ if (!MO.isDef())
+ continue;
const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
if (RC == &Hexagon::PredRegsRegClass) {
Pred.push_back(MO);
return true;
}
+ continue;
+ } else if (MO.isRegMask()) {
+ for (unsigned PR : Hexagon::PredRegsRegClass) {
+ if (!MI.modifiesRegister(PR, &HRI))
+ continue;
+ Pred.push_back(MO);
+ return true;
+ }
}
}
return false;
}
-bool HexagonInstrInfo::isPredicable(MachineInstr &MI) const {
- return MI.getDesc().isPredicable();
+bool HexagonInstrInfo::isPredicable(const MachineInstr &MI) const {
+ if (!MI.getDesc().isPredicable())
+ return false;
+
+ if (MI.isCall() || isTailCall(MI)) {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ if (!MF.getSubtarget<HexagonSubtarget>().usePredicatedCalls())
+ return false;
+ }
+ return true;
}
bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
@@ -1602,6 +1664,7 @@ unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return getInstrTimingClassLatency(ItinData, MI);
}
+
DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
const TargetSubtargetInfo &STI) const {
const InstrItineraryData *II = STI.getInstrItineraryData();
@@ -1667,6 +1730,39 @@ bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
return false;
}
+std::pair<unsigned, unsigned>
+HexagonInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
+ return std::make_pair(TF & ~HexagonII::MO_Bitmasks,
+ TF & HexagonII::MO_Bitmasks);
+}
+
+ArrayRef<std::pair<unsigned, const char*>>
+HexagonInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
+ using namespace HexagonII;
+ static const std::pair<unsigned, const char*> Flags[] = {
+ {MO_PCREL, "hexagon-pcrel"},
+ {MO_GOT, "hexagon-got"},
+ {MO_LO16, "hexagon-lo16"},
+ {MO_HI16, "hexagon-hi16"},
+ {MO_GPREL, "hexagon-gprel"},
+ {MO_GDGOT, "hexagon-gdgot"},
+ {MO_GDPLT, "hexagon-gdplt"},
+ {MO_IE, "hexagon-ie"},
+ {MO_IEGOT, "hexagon-iegot"},
+ {MO_TPREL, "hexagon-tprel"}
+ };
+ return makeArrayRef(Flags);
+}
+
+ArrayRef<std::pair<unsigned, const char*>>
+HexagonInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
+ using namespace HexagonII;
+ static const std::pair<unsigned, const char*> Flags[] = {
+ {HMOTF_ConstExtended, "hexagon-ext"}
+ };
+ return makeArrayRef(Flags);
+}
+
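A runnable sketch of the decomposition above (the flag values are illustrative; only the split into direct flags and MO_Bitmasks bits is the point):

#include <cassert>
#include <utility>

// Direct flags occupy their own values; bitmask-style flags such as
// HMOTF_ConstExtended are collected in MO_Bitmasks and split off separately.
enum : unsigned { MO_PCREL = 1, HMOTF_ConstExtended = 0x40, MO_Bitmasks = 0x40 };

static std::pair<unsigned, unsigned> decompose(unsigned TF) {
  return std::make_pair(TF & ~MO_Bitmasks, TF & MO_Bitmasks);
}

int main() {
  auto P = decompose(MO_PCREL | HMOTF_ConstExtended);
  assert(P.first == MO_PCREL);             // serialized as "hexagon-pcrel"
  assert(P.second == HMOTF_ConstExtended); // serialized as "hexagon-ext"
  return 0;
}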
unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const {
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *TRC;
@@ -1715,162 +1811,7 @@ bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
// Return true if the instruction is a compound branch instruction.
bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const {
- return (getType(MI) == HexagonII::TypeCOMPOUND && MI.isBranch());
-}
-
-bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
- return (MI.isBranch() && isPredicated(MI)) ||
- isConditionalTransfer(MI) ||
- isConditionalALU32(MI) ||
- isConditionalLoad(MI) ||
- // Predicated stores which don't have a .new on any operands.
- (MI.mayStore() && isPredicated(MI) && !isNewValueStore(MI) &&
- !isPredicatedNew(MI));
-}
-
-bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const {
- switch (MI.getOpcode()) {
- case Hexagon::A2_paddf:
- case Hexagon::A2_paddfnew:
- case Hexagon::A2_paddif:
- case Hexagon::A2_paddifnew:
- case Hexagon::A2_paddit:
- case Hexagon::A2_padditnew:
- case Hexagon::A2_paddt:
- case Hexagon::A2_paddtnew:
- case Hexagon::A2_pandf:
- case Hexagon::A2_pandfnew:
- case Hexagon::A2_pandt:
- case Hexagon::A2_pandtnew:
- case Hexagon::A2_porf:
- case Hexagon::A2_porfnew:
- case Hexagon::A2_port:
- case Hexagon::A2_portnew:
- case Hexagon::A2_psubf:
- case Hexagon::A2_psubfnew:
- case Hexagon::A2_psubt:
- case Hexagon::A2_psubtnew:
- case Hexagon::A2_pxorf:
- case Hexagon::A2_pxorfnew:
- case Hexagon::A2_pxort:
- case Hexagon::A2_pxortnew:
- case Hexagon::A4_paslhf:
- case Hexagon::A4_paslhfnew:
- case Hexagon::A4_paslht:
- case Hexagon::A4_paslhtnew:
- case Hexagon::A4_pasrhf:
- case Hexagon::A4_pasrhfnew:
- case Hexagon::A4_pasrht:
- case Hexagon::A4_pasrhtnew:
- case Hexagon::A4_psxtbf:
- case Hexagon::A4_psxtbfnew:
- case Hexagon::A4_psxtbt:
- case Hexagon::A4_psxtbtnew:
- case Hexagon::A4_psxthf:
- case Hexagon::A4_psxthfnew:
- case Hexagon::A4_psxtht:
- case Hexagon::A4_psxthtnew:
- case Hexagon::A4_pzxtbf:
- case Hexagon::A4_pzxtbfnew:
- case Hexagon::A4_pzxtbt:
- case Hexagon::A4_pzxtbtnew:
- case Hexagon::A4_pzxthf:
- case Hexagon::A4_pzxthfnew:
- case Hexagon::A4_pzxtht:
- case Hexagon::A4_pzxthtnew:
- case Hexagon::C2_ccombinewf:
- case Hexagon::C2_ccombinewt:
- return true;
- }
- return false;
-}
-
-// FIXME - Function name and it's functionality don't match.
-// It should be renamed to hasPredNewOpcode()
-bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const {
- if (!MI.getDesc().mayLoad() || !isPredicated(MI))
- return false;
-
- int PNewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
- // Instruction with valid predicated-new opcode can be promoted to .new.
- return PNewOpcode >= 0;
-}
-
-// Returns true if an instruction is a conditional store.
-//
-// Note: It doesn't include conditional new-value stores as they can't be
-// converted to .new predicate.
-bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const {
- switch (MI.getOpcode()) {
- default: return false;
- case Hexagon::S4_storeirbt_io:
- case Hexagon::S4_storeirbf_io:
- case Hexagon::S4_pstorerbt_rr:
- case Hexagon::S4_pstorerbf_rr:
- case Hexagon::S2_pstorerbt_io:
- case Hexagon::S2_pstorerbf_io:
- case Hexagon::S2_pstorerbt_pi:
- case Hexagon::S2_pstorerbf_pi:
- case Hexagon::S2_pstorerdt_io:
- case Hexagon::S2_pstorerdf_io:
- case Hexagon::S4_pstorerdt_rr:
- case Hexagon::S4_pstorerdf_rr:
- case Hexagon::S2_pstorerdt_pi:
- case Hexagon::S2_pstorerdf_pi:
- case Hexagon::S2_pstorerht_io:
- case Hexagon::S2_pstorerhf_io:
- case Hexagon::S4_storeirht_io:
- case Hexagon::S4_storeirhf_io:
- case Hexagon::S4_pstorerht_rr:
- case Hexagon::S4_pstorerhf_rr:
- case Hexagon::S2_pstorerht_pi:
- case Hexagon::S2_pstorerhf_pi:
- case Hexagon::S2_pstorerit_io:
- case Hexagon::S2_pstorerif_io:
- case Hexagon::S4_storeirit_io:
- case Hexagon::S4_storeirif_io:
- case Hexagon::S4_pstorerit_rr:
- case Hexagon::S4_pstorerif_rr:
- case Hexagon::S2_pstorerit_pi:
- case Hexagon::S2_pstorerif_pi:
-
- // V4 global address store before promoting to dot new.
- case Hexagon::S4_pstorerdt_abs:
- case Hexagon::S4_pstorerdf_abs:
- case Hexagon::S4_pstorerbt_abs:
- case Hexagon::S4_pstorerbf_abs:
- case Hexagon::S4_pstorerht_abs:
- case Hexagon::S4_pstorerhf_abs:
- case Hexagon::S4_pstorerit_abs:
- case Hexagon::S4_pstorerif_abs:
- return true;
-
- // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded
- // from the "Conditional Store" list. Because a predicated new value store
- // would NOT be promoted to a double dot new store.
- // This function returns yes for those stores that are predicated but not
- // yet promoted to predicate dot new instructions.
- }
-}
-
-bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const {
- switch (MI.getOpcode()) {
- case Hexagon::A2_tfrt:
- case Hexagon::A2_tfrf:
- case Hexagon::C2_cmoveit:
- case Hexagon::C2_cmoveif:
- case Hexagon::A2_tfrtnew:
- case Hexagon::A2_tfrfnew:
- case Hexagon::C2_cmovenewit:
- case Hexagon::C2_cmovenewif:
- case Hexagon::A2_tfrpt:
- case Hexagon::A2_tfrpf:
- return true;
-
- default:
- return false;
- }
- return false;
+ return getType(MI) == HexagonII::TypeCJ && MI.isBranch();
}
// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
@@ -1893,7 +1834,7 @@ bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const {
const MachineOperand &MO = MI.getOperand(ExtOpNum);
// Use MO operand flags to determine if MO
// has the HMOTF_ConstExtended flag set.
- if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+ if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
return true;
// If this is a Machine BB address we are talking about, and it is
// not marked as extended, say so.
@@ -1903,9 +1844,6 @@ bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const {
// We could be using an instruction with an extendable immediate and shoehorn
// a global address into it. If it is a global address it will be constant
// extended. We do this for COMBINE.
- // We currently only handle isGlobal() because it is the only kind of
- // object we are going to end up with here for now.
- // In the future we probably should add isSymbol(), etc.
if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() ||
MO.isJTI() || MO.isCPI() || MO.isFPImm())
return true;
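The change from && to & in the target-flags test above is a real bug fix: with logical AND, any operand carrying any target flag tested as const-extended. A minimal illustration (the flag values are made up for the example):

#include <cassert>

int main() {
  unsigned Flags = 0x4;             // some unrelated target flag
  unsigned ConstExtended = 0x40;    // stand-in for HexagonII::HMOTF_ConstExtended
  assert(Flags && ConstExtended);   // logical AND: nonzero && nonzero -> true
  assert(!(Flags & ConstExtended)); // bitwise AND: bits don't overlap -> false
  return 0;
}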
@@ -1930,7 +1868,7 @@ bool HexagonInstrInfo::isDeallocRet(const MachineInstr &MI) const {
case Hexagon::L4_return_fnew_pnt :
case Hexagon::L4_return_tnew_pt :
case Hexagon::L4_return_fnew_pt :
- return true;
+ return true;
}
return false;
}
@@ -1957,12 +1895,12 @@ bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI,
if (RegA == RegB)
return true;
- if (Hexagon::DoubleRegsRegClass.contains(RegA))
+ if (TargetRegisterInfo::isPhysicalRegister(RegA))
for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs)
if (RegB == *SubRegs)
return true;
- if (Hexagon::DoubleRegsRegClass.contains(RegB))
+ if (TargetRegisterInfo::isPhysicalRegister(RegB))
for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs)
if (RegA == *SubRegs)
return true;
@@ -2006,9 +1944,7 @@ bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr &MI) const {
// Multiply
unsigned SchedClass = MI.getDesc().getSchedClass();
- if (SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23)
- return true;
- return false;
+ return is_TC4x(SchedClass) || is_TC3x(SchedClass);
}
bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const {
@@ -2059,11 +1995,9 @@ bool HexagonInstrInfo::isExtended(const MachineInstr &MI) const {
return true;
// Use MO operand flags to determine if one of MI's operands
// has HMOTF_ConstExtended flag set.
- for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
- E = MI.operands_end(); I != E; ++I) {
- if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended)
+ for (const MachineOperand &MO : MI.operands())
+ if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended)
return true;
- }
return false;
}
@@ -2076,7 +2010,7 @@ bool HexagonInstrInfo::isFloat(const MachineInstr &MI) const {
// No V60 HVX VMEM with A_INDIRECT.
bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr &I,
const MachineInstr &J) const {
- if (!isV60VectorInstruction(I))
+ if (!isHVXVec(I))
return false;
if (!I.mayLoad() && !I.mayStore())
return false;
@@ -2129,7 +2063,7 @@ bool HexagonInstrInfo::isJumpR(const MachineInstr &MI) const {
bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr &MI,
unsigned offset) const {
  // This selection of jump instructions matches what
- // AnalyzeBranch can parse, plus NVJ.
+ // analyzeBranch can parse, plus NVJ.
if (isNewValueJump(MI)) // r9:2
return isInt<11>(offset);
@@ -2200,30 +2134,13 @@ bool HexagonInstrInfo::isLateResultInstr(const MachineInstr &MI) const {
}
unsigned SchedClass = MI.getDesc().getSchedClass();
-
- switch (SchedClass) {
- case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
- case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
- case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
- case Hexagon::Sched::ALU64_tc_1_SLOT23:
- case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
- case Hexagon::Sched::S_2op_tc_1_SLOT23:
- case Hexagon::Sched::S_3op_tc_1_SLOT23:
- case Hexagon::Sched::V2LDST_tc_ld_SLOT01:
- case Hexagon::Sched::V2LDST_tc_st_SLOT0:
- case Hexagon::Sched::V2LDST_tc_st_SLOT01:
- case Hexagon::Sched::V4LDST_tc_ld_SLOT01:
- case Hexagon::Sched::V4LDST_tc_st_SLOT0:
- case Hexagon::Sched::V4LDST_tc_st_SLOT01:
- return false;
- }
- return true;
+ return !is_TC1(SchedClass);
}
bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr &MI) const {
// Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply
// resource, but all operands can be received late like an ALU instruction.
- return MI.getDesc().getSchedClass() == Hexagon::Sched::CVI_VX_LATE;
+ return getType(MI) == HexagonII::TypeCVI_VX_LATE;
}
bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
@@ -2466,61 +2383,22 @@ bool HexagonInstrInfo::isTailCall(const MachineInstr &MI) const {
// Returns true when SU has a timing class TC1.
bool HexagonInstrInfo::isTC1(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
- switch (SchedClass) {
- case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123:
- case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123:
- case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123:
- case Hexagon::Sched::ALU64_tc_1_SLOT23:
- case Hexagon::Sched::EXTENDER_tc_1_SLOT0123:
- //case Hexagon::Sched::M_tc_1_SLOT23:
- case Hexagon::Sched::S_2op_tc_1_SLOT23:
- case Hexagon::Sched::S_3op_tc_1_SLOT23:
- return true;
-
- default:
- return false;
- }
+ return is_TC1(SchedClass);
}
bool HexagonInstrInfo::isTC2(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
- switch (SchedClass) {
- case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
- case Hexagon::Sched::ALU64_tc_2_SLOT23:
- case Hexagon::Sched::CR_tc_2_SLOT3:
- case Hexagon::Sched::M_tc_2_SLOT23:
- case Hexagon::Sched::S_2op_tc_2_SLOT23:
- case Hexagon::Sched::S_3op_tc_2_SLOT23:
- return true;
-
- default:
- return false;
- }
+ return is_TC2(SchedClass);
}
bool HexagonInstrInfo::isTC2Early(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
- switch (SchedClass) {
- case Hexagon::Sched::ALU32_2op_tc_2early_SLOT0123:
- case Hexagon::Sched::ALU32_3op_tc_2early_SLOT0123:
- case Hexagon::Sched::ALU64_tc_2early_SLOT23:
- case Hexagon::Sched::CR_tc_2early_SLOT23:
- case Hexagon::Sched::CR_tc_2early_SLOT3:
- case Hexagon::Sched::J_tc_2early_SLOT0123:
- case Hexagon::Sched::J_tc_2early_SLOT2:
- case Hexagon::Sched::J_tc_2early_SLOT23:
- case Hexagon::Sched::S_2op_tc_2early_SLOT23:
- case Hexagon::Sched::S_3op_tc_2early_SLOT23:
- return true;
-
- default:
- return false;
- }
+ return is_TC2early(SchedClass);
}
bool HexagonInstrInfo::isTC4x(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
- return SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23;
+ return is_TC4x(SchedClass);
}
// Schedule this ASAP.
@@ -2542,7 +2420,7 @@ bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1,
return false;
}
-bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr &MI) const {
+bool HexagonInstrInfo::isHVXVec(const MachineInstr &MI) const {
const uint64_t V = getType(MI);
return HexagonII::TypeCVI_FIRST <= V && V <= HexagonII::TypeCVI_LAST;
}
@@ -2599,25 +2477,31 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
switch (Opcode) {
case Hexagon::PS_vstorerq_ai:
case Hexagon::PS_vstorerw_ai:
+ case Hexagon::PS_vstorerw_nt_ai:
case Hexagon::PS_vloadrq_ai:
case Hexagon::PS_vloadrw_ai:
+ case Hexagon::PS_vloadrw_nt_ai:
case Hexagon::V6_vL32b_ai:
case Hexagon::V6_vS32b_ai:
+ case Hexagon::V6_vL32b_nt_ai:
+ case Hexagon::V6_vS32b_nt_ai:
case Hexagon::V6_vL32Ub_ai:
case Hexagon::V6_vS32Ub_ai:
- return (Offset >= Hexagon_MEMV_OFFSET_MIN) &&
- (Offset <= Hexagon_MEMV_OFFSET_MAX);
+ return isShiftedInt<4,6>(Offset);
case Hexagon::PS_vstorerq_ai_128B:
case Hexagon::PS_vstorerw_ai_128B:
+ case Hexagon::PS_vstorerw_nt_ai_128B:
case Hexagon::PS_vloadrq_ai_128B:
case Hexagon::PS_vloadrw_ai_128B:
+ case Hexagon::PS_vloadrw_nt_ai_128B:
case Hexagon::V6_vL32b_ai_128B:
case Hexagon::V6_vS32b_ai_128B:
+ case Hexagon::V6_vL32b_nt_ai_128B:
+ case Hexagon::V6_vS32b_nt_ai_128B:
case Hexagon::V6_vL32Ub_ai_128B:
case Hexagon::V6_vS32Ub_ai_128B:
- return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) &&
- (Offset <= Hexagon_MEMV_OFFSET_MAX_128B);
+ return isShiftedInt<4,7>(Offset);
case Hexagon::J2_loop0i:
case Hexagon::J2_loop1i:
@@ -2656,6 +2540,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::L2_loadrh_io:
case Hexagon::L2_loadruh_io:
case Hexagon::S2_storerh_io:
+ case Hexagon::S2_storerf_io:
return (Offset >= Hexagon_MEMH_OFFSET_MIN) &&
(Offset <= Hexagon_MEMH_OFFSET_MAX);
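// Decoding the vector-offset checks above (a sketch based on the definition
// of LLVM's isShiftedInt<N, S>, which accepts any multiple of 2^S whose
// quotient fits in a signed N-bit field): isShiftedInt<4,6> accepts offsets
// in [-512, 448] that are multiples of 64, and isShiftedInt<4,7> accepts
// [-1024, 896] in steps of 128, matching the 64- and 128-byte HVX modes.
//   static_assert(-8 * 64 == -512 && 7 * 64 == 448, "4-bit field scaled by 64");
//   static_assert(-8 * 128 == -1024 && 7 * 128 == 896, "scaled by 128");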
@@ -2740,7 +2625,7 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
}
bool HexagonInstrInfo::isVecAcc(const MachineInstr &MI) const {
- return isV60VectorInstruction(MI) && isAccumulator(MI);
+ return isHVXVec(MI) && isAccumulator(MI);
}
bool HexagonInstrInfo::isVecALU(const MachineInstr &MI) const {
@@ -2846,7 +2731,7 @@ bool HexagonInstrInfo::isZeroExtendingLoad(const MachineInstr &MI) const {
// Add latency to instruction.
bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
const MachineInstr &MI2) const {
- if (isV60VectorInstruction(MI1) && isV60VectorInstruction(MI2))
+ if (isHVXVec(MI1) && isHVXVec(MI2))
if (!isVecUsableNextPacket(MI1, MI2))
return true;
return false;
@@ -2866,6 +2751,11 @@ bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
/// \brief Can these instructions execute at the same time in a bundle.
bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr &First,
const MachineInstr &Second) const {
+ if (Second.mayStore() && First.getOpcode() == Hexagon::S2_allocframe) {
+ const MachineOperand &Op = Second.getOperand(0);
+ if (Op.isReg() && Op.isUse() && Op.getReg() == Hexagon::R29)
+ return true;
+ }
if (DisableNVSchedule)
return false;
if (mayBeNewStore(Second)) {
@@ -2966,7 +2856,7 @@ bool HexagonInstrInfo::mayBeNewStore(const MachineInstr &MI) const {
bool HexagonInstrInfo::producesStall(const MachineInstr &ProdMI,
const MachineInstr &ConsMI) const {
// There is no stall when ProdMI is not a V60 vector.
- if (!isV60VectorInstruction(ProdMI))
+ if (!isHVXVec(ProdMI))
return false;
// There is no stall when ProdMI and ConsMI are not dependent.
@@ -2984,19 +2874,15 @@ bool HexagonInstrInfo::producesStall(const MachineInstr &ProdMI,
bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
MachineBasicBlock::const_instr_iterator BII) const {
// There is no stall when I is not a V60 vector.
- if (!isV60VectorInstruction(MI))
+ if (!isHVXVec(MI))
return false;
MachineBasicBlock::const_instr_iterator MII = BII;
MachineBasicBlock::const_instr_iterator MIE = MII->getParent()->instr_end();
- if (!MII->isBundle()) {
+ if (!(*MII).isBundle()) {
const MachineInstr &J = *MII;
- if (!isV60VectorInstruction(J))
- return false;
- else if (isVecUsableNextPacket(J, MI))
- return false;
- return true;
+ return producesStall(J, MI);
}
for (++MII; MII != MIE && MII->isInsideBundle(); ++MII) {
@@ -3009,10 +2895,12 @@ bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
unsigned PredReg) const {
- for (unsigned opNum = 0; opNum < MI.getNumOperands(); opNum++) {
- const MachineOperand &MO = MI.getOperand(opNum);
+ for (const MachineOperand &MO : MI.operands()) {
+ // Predicate register must be explicitly defined.
+ if (MO.isRegMask() && MO.clobbersPhysReg(PredReg))
+ return false;
if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg))
- return false; // Predicate register must be explicitly defined.
+ return false;
}
// Hexagon Programmer's Reference says that decbin, memw_locked, and
@@ -3022,12 +2910,14 @@ bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
}
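// Sketch of the register-mask case above (a reading of the intent, not text
// from the commit): a call instruction carries a regmask operand that
// clobbers every register it does not preserve, so a predicate register
// clobbered that way cannot safely feed a .new consumer:
//   if (MO.isRegMask() && MO.clobbersPhysReg(PredReg))  // e.g. P0 killed by
//     return false;                                     // an intervening call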
bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
- return (Opcode == Hexagon::J2_jumpt) ||
- (Opcode == Hexagon::J2_jumpf) ||
- (Opcode == Hexagon::J2_jumptnew) ||
- (Opcode == Hexagon::J2_jumpfnew) ||
- (Opcode == Hexagon::J2_jumptnewpt) ||
- (Opcode == Hexagon::J2_jumpfnewpt);
+ return Opcode == Hexagon::J2_jumpt ||
+ Opcode == Hexagon::J2_jumptpt ||
+ Opcode == Hexagon::J2_jumpf ||
+ Opcode == Hexagon::J2_jumpfpt ||
+ Opcode == Hexagon::J2_jumptnew ||
+ Opcode == Hexagon::J2_jumpfnew ||
+ Opcode == Hexagon::J2_jumptnewpt ||
+ Opcode == Hexagon::J2_jumpfnewpt;
}
bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const {
@@ -3320,18 +3210,55 @@ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
return Hexagon::V6_vL32b_cur_pi;
case Hexagon::V6_vL32b_ai:
return Hexagon::V6_vL32b_cur_ai;
+ case Hexagon::V6_vL32b_nt_pi:
+ return Hexagon::V6_vL32b_nt_cur_pi;
+ case Hexagon::V6_vL32b_nt_ai:
+ return Hexagon::V6_vL32b_nt_cur_ai;
//128B
case Hexagon::V6_vL32b_pi_128B:
return Hexagon::V6_vL32b_cur_pi_128B;
case Hexagon::V6_vL32b_ai_128B:
return Hexagon::V6_vL32b_cur_ai_128B;
+ case Hexagon::V6_vL32b_nt_pi_128B:
+ return Hexagon::V6_vL32b_nt_cur_pi_128B;
+ case Hexagon::V6_vL32b_nt_ai_128B:
+ return Hexagon::V6_vL32b_nt_cur_ai_128B;
}
return 0;
}
+// Return the regular version of the .cur instruction.
+int HexagonInstrInfo::getNonDotCurOp(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default: llvm_unreachable("Unknown .cur type");
+ case Hexagon::V6_vL32b_cur_pi:
+ return Hexagon::V6_vL32b_pi;
+ case Hexagon::V6_vL32b_cur_ai:
+ return Hexagon::V6_vL32b_ai;
+ case Hexagon::V6_vL32b_nt_cur_pi:
+ return Hexagon::V6_vL32b_nt_pi;
+ case Hexagon::V6_vL32b_nt_cur_ai:
+ return Hexagon::V6_vL32b_nt_ai;
+ //128B
+ case Hexagon::V6_vL32b_cur_pi_128B:
+ return Hexagon::V6_vL32b_pi_128B;
+ case Hexagon::V6_vL32b_cur_ai_128B:
+ return Hexagon::V6_vL32b_ai_128B;
+ case Hexagon::V6_vL32b_nt_cur_pi_128B:
+ return Hexagon::V6_vL32b_nt_pi_128B;
+ case Hexagon::V6_vL32b_nt_cur_ai_128B:
+ return Hexagon::V6_vL32b_nt_ai_128B;
+ }
+ return 0;
+}
+
+
// The diagram below shows the steps involved in the conversion of a predicated
// store instruction to its .new predicated new-value form.
//
+// Note: This does not cover conditional new-value stores, as they cannot be
+// converted to the .new predicated form.
+//
// p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ]
// ^ ^
// / \ (not OK. it will cause new-value store to be
@@ -3347,7 +3274,6 @@ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
// p.old store
// [if (p0)memw(R0+#0)=R2]
//
-//
// The following set of instructions further explains the scenario where
// conditional new-value store becomes invalid when promoted to .new predicate
// form.
@@ -3415,7 +3341,9 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
return NVOpcode;
switch (MI.getOpcode()) {
- default: llvm_unreachable("Unknown .new type");
+ default:
+ llvm::report_fatal_error(std::string("Unknown .new type: ") +
+ std::to_string(MI.getOpcode()).c_str());
case Hexagon::S4_storerb_ur:
return Hexagon::S4_storerbnew_ur;
@@ -3453,23 +3381,87 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
// Returns the opcode to use when converting MI, which is a conditional jump,
// into a conditional instruction which uses the .new value of the predicate.
// We also use branch probabilities to add a hint to the jump.
+// If MBPI is null, all edges will be treated as equally likely for the
+// purposes of establishing a predication hint.
int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const {
// We assume that a block can have at most two successors.
- bool taken = false;
const MachineBasicBlock *Src = MI.getParent();
const MachineOperand &BrTarget = MI.getOperand(1);
- const MachineBasicBlock *Dst = BrTarget.getMBB();
+ bool Taken = false;
+ const BranchProbability OneHalf(1, 2);
+
+ auto getEdgeProbability = [MBPI] (const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) {
+ if (MBPI)
+ return MBPI->getEdgeProbability(Src, Dst);
+ return BranchProbability(1, Src->succ_size());
+ };
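// A note on the fallback above: without an MBPI analysis every successor
// edge is treated as equally likely, so BranchProbability(1, succ_size())
// equals OneHalf only for the common two-successor conditional block:
//   succ_size() == 2  ->  P == 1/2  ->  Taken (P >= OneHalf)
//   succ_size() >= 3  ->  P <  1/2  ->  not Taken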
+
+ if (BrTarget.isMBB()) {
+ const MachineBasicBlock *Dst = BrTarget.getMBB();
+ Taken = getEdgeProbability(Src, Dst) >= OneHalf;
+ } else {
+ // The branch target is not a basic block (most likely a function).
+ // Since BPI only gives probabilities for targets that are basic blocks,
+    // try to identify another target of this branch (potentially a
+    // fall-through) and check the probability of that target.
+ //
+ // The only handled branch combinations are:
+ // - one conditional branch,
+ // - one conditional branch followed by one unconditional branch.
+ // Otherwise, assume not-taken.
+ assert(MI.isConditionalBranch());
+ const MachineBasicBlock &B = *MI.getParent();
+ bool SawCond = false, Bad = false;
+ for (const MachineInstr &I : B) {
+ if (!I.isBranch())
+ continue;
+ if (I.isConditionalBranch()) {
+ SawCond = true;
+ if (&I != &MI) {
+ Bad = true;
+ break;
+ }
+ }
+ if (I.isUnconditionalBranch() && !SawCond) {
+ Bad = true;
+ break;
+ }
+ }
+ if (!Bad) {
+ MachineBasicBlock::const_instr_iterator It(MI);
+ MachineBasicBlock::const_instr_iterator NextIt = std::next(It);
+ if (NextIt == B.instr_end()) {
+ // If this branch is the last, look for the fall-through block.
+ for (const MachineBasicBlock *SB : B.successors()) {
+ if (!B.isLayoutSuccessor(SB))
+ continue;
+ Taken = getEdgeProbability(Src, SB) < OneHalf;
+ break;
+ }
+ } else {
+ assert(NextIt->isUnconditionalBranch());
+ // Find the first MBB operand and assume it's the target.
+ const MachineBasicBlock *BT = nullptr;
+ for (const MachineOperand &Op : NextIt->operands()) {
+ if (!Op.isMBB())
+ continue;
+ BT = Op.getMBB();
+ break;
+ }
+ Taken = BT && getEdgeProbability(Src, BT) < OneHalf;
+ }
+ } // if (!Bad)
+ }
- const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst);
- if (Prediction >= BranchProbability(1,2))
- taken = true;
+ // The Taken flag should be set to something reasonable by this point.
switch (MI.getOpcode()) {
case Hexagon::J2_jumpt:
- return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
+ return Taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
case Hexagon::J2_jumpf:
- return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
+ return Taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
default:
llvm_unreachable("Unexpected jump instruction.");
@@ -3479,26 +3471,44 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
// Return .new predicate version for an instruction.
int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const {
- int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
- if (NewOpcode >= 0) // Valid predicate new instruction
- return NewOpcode;
-
switch (MI.getOpcode()) {
// Conditional jumps
case Hexagon::J2_jumpt:
case Hexagon::J2_jumpf:
return getDotNewPredJumpOp(MI, MBPI);
-
- default:
- assert(0 && "Unknown .new type");
}
+
+ int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
+ if (NewOpcode >= 0)
+ return NewOpcode;
return 0;
}
-int HexagonInstrInfo::getDotOldOp(const int opc) const {
- int NewOp = opc;
+int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+ int NewOp = MI.getOpcode();
if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
NewOp = Hexagon::getPredOldOpcode(NewOp);
+ // All Hexagon architectures have prediction bits on dot-new branches,
+ // but only Hexagon V60+ has prediction bits on dot-old ones. Make sure
+ // to pick the right opcode when converting back to dot-old.
+ if (!HST.getFeatureBits()[Hexagon::ArchV60]) {
+ switch (NewOp) {
+ case Hexagon::J2_jumptpt:
+ NewOp = Hexagon::J2_jumpt;
+ break;
+ case Hexagon::J2_jumpfpt:
+ NewOp = Hexagon::J2_jumpf;
+ break;
+ case Hexagon::J2_jumprtpt:
+ NewOp = Hexagon::J2_jumprt;
+ break;
+ case Hexagon::J2_jumprfpt:
+ NewOp = Hexagon::J2_jumprf;
+ break;
+ }
+ }
assert(NewOp >= 0 &&
"Couldn't change predicate new instruction to its old form.");
}
@@ -3507,6 +3517,21 @@ int HexagonInstrInfo::getDotOldOp(const int opc) const {
NewOp = Hexagon::getNonNVStore(NewOp);
assert(NewOp >= 0 && "Couldn't change new-value store to its old form.");
}
+
+ if (HST.hasV60TOps())
+ return NewOp;
+
+ // Subtargets prior to V60 didn't support 'taken' forms of predicated jumps.
+ switch (NewOp) {
+ case Hexagon::J2_jumpfpt:
+ return Hexagon::J2_jumpf;
+ case Hexagon::J2_jumptpt:
+ return Hexagon::J2_jumpt;
+ case Hexagon::J2_jumprfpt:
+ return Hexagon::J2_jumprf;
+ case Hexagon::J2_jumprtpt:
+ return Hexagon::J2_jumprt;
+ }
return NewOp;
}
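// Usage sketch (hypothetical opcode values; the mapping follows the two
// switches above): on a pre-V60 subtarget a taken-hinted .new jump first
// drops its .new form via getPredOldOpcode and then loses the taken hint:
//   J2_jumptnewpt  ->  J2_jumptpt  ->  J2_jumpt   // V55 and earlier
// On V60 and later the intermediate J2_jumptpt form is kept.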
@@ -3858,18 +3883,6 @@ short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr &MI) const {
return Hexagon::getRealHWInstr(MI.getOpcode(), Hexagon::InstrType_Real);
}
-// Return first non-debug instruction in the basic block.
-MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB)
- const {
- for (auto MII = BB->instr_begin(), End = BB->instr_end(); MII != End; MII++) {
- MachineInstr &MI = *MII;
- if (MI.isDebugValue())
- continue;
- return &MI;
- }
- return nullptr;
-}
-
unsigned HexagonInstrInfo::getInstrTimingClassLatency(
const InstrItineraryData *ItinData, const MachineInstr &MI) const {
// Default to one cycle for no itinerary. However, an "empty" itinerary may
@@ -3877,18 +3890,53 @@ unsigned HexagonInstrInfo::getInstrTimingClassLatency(
if (!ItinData)
return getInstrLatency(ItinData, MI);
- // Get the latency embedded in the itinerary. If we're not using timing class
- // latencies or if we using BSB scheduling, then restrict the maximum latency
- // to 1 (that is, either 0 or 1).
if (MI.isTransient())
return 0;
- unsigned Latency = ItinData->getStageLatency(MI.getDesc().getSchedClass());
- if (!EnableTimingClassLatency ||
- MI.getParent()->getParent()->getSubtarget<HexagonSubtarget>().
- useBSBScheduling())
- if (Latency > 1)
- Latency = 1;
- return Latency;
+ return ItinData->getStageLatency(MI.getDesc().getSchedClass());
+}
+
+/// getOperandLatency - Compute and return the use operand latency of a given
+/// pair of def and use.
+/// In most cases, the static scheduling itinerary is enough to determine the
+/// operand latency, but that may not be possible for instructions with a
+/// variable number of defs / uses.
+///
+/// This is a raw interface to the itinerary that may be directly overridden
+/// by a target. Use computeOperandLatency to get the best estimate of the
+/// latency.
+int HexagonInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ auto &RI = getRegisterInfo();
+ // Get DefIdx and UseIdx for super registers.
+ MachineOperand DefMO = DefMI.getOperand(DefIdx);
+
+ if (RI.isPhysicalRegister(DefMO.getReg())) {
+ if (DefMO.isImplicit()) {
+ for (MCSuperRegIterator SR(DefMO.getReg(), &RI); SR.isValid(); ++SR) {
+ int Idx = DefMI.findRegisterDefOperandIdx(*SR, false, false, &RI);
+ if (Idx != -1) {
+ DefIdx = Idx;
+ break;
+ }
+ }
+ }
+
+ MachineOperand UseMO = UseMI.getOperand(UseIdx);
+ if (UseMO.isImplicit()) {
+ for (MCSuperRegIterator SR(UseMO.getReg(), &RI); SR.isValid(); ++SR) {
+ int Idx = UseMI.findRegisterUseOperandIdx(*SR, false, &RI);
+ if (Idx != -1) {
+ UseIdx = Idx;
+ break;
+ }
+ }
+ }
+ }
+
+ return TargetInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx,
+ UseMI, UseIdx);
}
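// Worked example of the remapping above (a sketch; assumes the usual Hexagon
// pairing where a double register such as D0 aliases R1:R0): if DefIdx names
// an implicit def of R0 while the itinerary data is keyed to the explicit
// def of the containing D0, MCSuperRegIterator walks from R0 up to D0,
// findRegisterDefOperandIdx locates D0's operand slot, and the query is
// forwarded to TargetInstrInfo::getOperandLatency with the corrected index.
// The use side is remapped symmetrically.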
// inverts the predication logic.
@@ -4050,11 +4098,6 @@ unsigned HexagonInstrInfo::getUnits(const MachineInstr &MI) const {
return IS.getUnits();
}
-unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const {
- const uint64_t F = get(Opcode).TSFlags;
- return (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask;
-}
-
// Calculate size of the basic block without debug instructions.
unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const {
return nonDbgMICount(BB->instr_begin(), BB->instr_end());
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 2358d4b..0436ce3 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -235,7 +235,7 @@ public:
/// Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
/// Test if the given instruction should be considered a scheduling boundary.
/// This primarily includes labels and terminators.
@@ -288,6 +288,40 @@ public:
/// If the instruction is an increment of a constant value, return the amount.
bool getIncrementValue(const MachineInstr &MI, int &Value) const override;
+ /// getOperandLatency - Compute and return the use operand latency of a given
+ /// pair of def and use.
+  /// In most cases, the static scheduling itinerary is enough to determine
+  /// the operand latency, but that may not be possible for instructions
+  /// with a variable number of defs / uses.
+  ///
+  /// This is a raw interface to the itinerary that may be directly
+  /// overridden by a target. Use computeOperandLatency to get the best
+  /// estimate of the latency.
+ int getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI, unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const override;
+
+  /// Decompose the machine operand's target flags into two values - the
+  /// direct target flag value and any bit flags that are applied.
+ std::pair<unsigned, unsigned>
+ decomposeMachineOperandsTargetFlags(unsigned TF) const override;
+
+ /// Return an array that contains the direct target flag values and their
+ /// names.
+ ///
+ /// MIR Serialization is able to serialize only the target flags that are
+ /// defined by this method.
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableDirectMachineOperandTargetFlags() const override;
+
+ /// Return an array that contains the bitmask target flag values and their
+ /// names.
+ ///
+ /// MIR Serialization is able to serialize only the target flags that are
+ /// defined by this method.
+ ArrayRef<std::pair<unsigned, const char *>>
+ getSerializableBitmaskMachineOperandTargetFlags() const override;
+
bool isTailCall(const MachineInstr &MI) const override;
/// HexagonInstrInfo specifics.
@@ -301,11 +335,6 @@ public:
bool isAccumulator(const MachineInstr &MI) const;
bool isComplex(const MachineInstr &MI) const;
bool isCompoundBranchInstr(const MachineInstr &MI) const;
- bool isCondInst(const MachineInstr &MI) const;
- bool isConditionalALU32 (const MachineInstr &MI) const;
- bool isConditionalLoad(const MachineInstr &MI) const;
- bool isConditionalStore(const MachineInstr &MI) const;
- bool isConditionalTransfer(const MachineInstr &MI) const;
bool isConstExtended(const MachineInstr &MI) const;
bool isDeallocRet(const MachineInstr &MI) const;
bool isDependent(const MachineInstr &ProdMI,
@@ -356,7 +385,7 @@ public:
bool isTC4x(const MachineInstr &MI) const;
bool isToBeScheduledASAP(const MachineInstr &MI1,
const MachineInstr &MI2) const;
- bool isV60VectorInstruction(const MachineInstr &MI) const;
+ bool isHVXVec(const MachineInstr &MI) const;
bool isValidAutoIncImm(const EVT VT, const int Offset) const;
bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const;
bool isVecAcc(const MachineInstr &MI) const;
@@ -399,12 +428,13 @@ public:
const MachineInstr &GB) const;
int getCondOpcode(int Opc, bool sense) const;
int getDotCurOp(const MachineInstr &MI) const;
+ int getNonDotCurOp(const MachineInstr &MI) const;
int getDotNewOp(const MachineInstr &MI) const;
int getDotNewPredJumpOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const;
int getDotNewPredOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const;
- int getDotOldOp(const int opc) const;
+ int getDotOldOp(const MachineInstr &MI) const;
HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr &MI)
const;
short getEquivalentHWInstr(const MachineInstr &MI) const;
@@ -424,7 +454,6 @@ public:
unsigned getSize(const MachineInstr &MI) const;
uint64_t getType(const MachineInstr &MI) const;
unsigned getUnits(const MachineInstr &MI) const;
- unsigned getValidSubTargets(const unsigned Opcode) const;
/// getInstrTimingClassLatency - Compute the instruction latency of a given
/// instruction using Timing Class information, if available.
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
deleted file mode 100644
index c5719ad..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td
+++ /dev/null
@@ -1,4799 +0,0 @@
-//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrFormats.td"
-include "HexagonOperands.td"
-include "HexagonInstrEnc.td"
-
-//===----------------------------------------------------------------------===//
-// Compare
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isCompare = 1, InputType = "imm", isExtendable = 1,
- opExtendable = 2 in
-class T_CMP <string mnemonic, bits<2> MajOp, bit isNot, Operand ImmOp>
- : ALU32Inst <(outs PredRegs:$dst),
- (ins IntRegs:$src1, ImmOp:$src2),
- "$dst = "#!if(isNot, "!","")#mnemonic#"($src1, #$src2)",
- [], "",ALU32_2op_tc_2early_SLOT0123 >, ImmRegRel {
- bits<2> dst;
- bits<5> src1;
- bits<10> src2;
- let CextOpcode = mnemonic;
- let opExtentBits = !if(!eq(mnemonic, "cmp.gtu"), 9, 10);
- let isExtentSigned = !if(!eq(mnemonic, "cmp.gtu"), 0, 1);
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0101;
- let Inst{23-22} = MajOp;
- let Inst{21} = !if(!eq(mnemonic, "cmp.gtu"), 0, src2{9});
- let Inst{20-16} = src1;
- let Inst{13-5} = src2{8-0};
- let Inst{4} = isNot;
- let Inst{3-2} = 0b00;
- let Inst{1-0} = dst;
- }
-
-def C2_cmpeqi : T_CMP <"cmp.eq", 0b00, 0, s10_0Ext>;
-def C2_cmpgti : T_CMP <"cmp.gt", 0b01, 0, s10_0Ext>;
-def C2_cmpgtui : T_CMP <"cmp.gtu", 0b10, 0, u9_0Ext>;
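// Reading of the !if conditions above: cmp.gtu compares against an unsigned
// immediate, so C2_cmpgtui gets a 9-bit zero-extended extent
// (opExtentBits = 9, isExtentSigned = 0) and bit 21 of the encoding is
// forced to 0, while C2_cmpeqi and C2_cmpgti keep the signed 10-bit extent
// with src2{9} in bit 21.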
-
-//===----------------------------------------------------------------------===//
-// ALU32/ALU +
-//===----------------------------------------------------------------------===//
-// Add.
-
-let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in
-class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev,
- bit IsComm>
- : ALU32_rr<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#mnemonic#"($Rs, $Rt)",
- [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredRel {
- let isCommutable = IsComm;
- let BaseOpcode = mnemonic#_rr;
- let CextOpcode = mnemonic;
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
-
- let IClass = 0b1111;
- let Inst{27} = 0b0;
- let Inst{26-24} = MajOp;
- let Inst{23-21} = MinOp;
- let Inst{20-16} = !if(OpsRev,Rt,Rs);
- let Inst{12-8} = !if(OpsRev,Rs,Rt);
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_ALU32_3op_pred<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev, bit PredNot, bit PredNew>
- : ALU32_rr<(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
- "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") "#
- "$Rd = "#mnemonic#"($Rs, $Rt)",
- [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
- let isPredicated = 1;
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = PredNew;
- let BaseOpcode = mnemonic#_rr;
- let CextOpcode = mnemonic;
-
- bits<2> Pu;
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
-
- let IClass = 0b1111;
- let Inst{27} = 0b1;
- let Inst{26-24} = MajOp;
- let Inst{23-21} = MinOp;
- let Inst{20-16} = !if(OpsRev,Rt,Rs);
- let Inst{13} = PredNew;
- let Inst{12-8} = !if(OpsRev,Rs,Rt);
- let Inst{7} = PredNot;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rd;
-}
-
-class T_ALU32_combineh<string Op1, string Op2, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev>
- : T_ALU32_3op<"", MajOp, MinOp, OpsRev, 0> {
- let AsmString = "$Rd = combine($Rs"#Op1#", $Rt"#Op2#")";
-}
-
-def A2_combine_hh : T_ALU32_combineh<".h", ".h", 0b011, 0b100, 1>;
-def A2_combine_hl : T_ALU32_combineh<".h", ".l", 0b011, 0b101, 1>;
-def A2_combine_lh : T_ALU32_combineh<".l", ".h", 0b011, 0b110, 1>;
-def A2_combine_ll : T_ALU32_combineh<".l", ".l", 0b011, 0b111, 1>;
-
-class T_ALU32_3op_sfx<string mnemonic, string suffix, bits<3> MajOp,
- bits<3> MinOp, bit OpsRev, bit IsComm>
- : T_ALU32_3op<"", MajOp, MinOp, OpsRev, IsComm> {
- let AsmString = "$Rd = "#mnemonic#"($Rs, $Rt)"#suffix;
-}
-
-def A2_svaddh : T_ALU32_3op<"vaddh", 0b110, 0b000, 0, 1>;
-def A2_svsubh : T_ALU32_3op<"vsubh", 0b110, 0b100, 1, 0>;
-
-let Defs = [USR_OVF], Itinerary = ALU32_3op_tc_2_SLOT0123 in {
- def A2_svaddhs : T_ALU32_3op_sfx<"vaddh", ":sat", 0b110, 0b001, 0, 1>;
- def A2_addsat : T_ALU32_3op_sfx<"add", ":sat", 0b110, 0b010, 0, 1>;
- def A2_svadduhs : T_ALU32_3op_sfx<"vadduh", ":sat", 0b110, 0b011, 0, 1>;
- def A2_svsubhs : T_ALU32_3op_sfx<"vsubh", ":sat", 0b110, 0b101, 1, 0>;
- def A2_subsat : T_ALU32_3op_sfx<"sub", ":sat", 0b110, 0b110, 1, 0>;
- def A2_svsubuhs : T_ALU32_3op_sfx<"vsubuh", ":sat", 0b110, 0b111, 1, 0>;
-}
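// The :sat variants grouped above model Hexagon's sticky overflow bit by
// listing USR_OVF in Defs; the non-saturating forms outside this block leave
// it untouched. (A reading of the Defs list, not new behavior.)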
-
-let Itinerary = ALU32_3op_tc_2_SLOT0123 in
-def A2_svavghs : T_ALU32_3op_sfx<"vavgh", ":rnd", 0b111, 0b001, 0, 1>;
-
-def A2_svavgh : T_ALU32_3op<"vavgh", 0b111, 0b000, 0, 1>;
-def A2_svnavgh : T_ALU32_3op<"vnavgh", 0b111, 0b011, 1, 0>;
-
-multiclass T_ALU32_3op_p<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev> {
- def t : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 0>;
- def f : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 0>;
- def tnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 1>;
- def fnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 1>;
-}
-
-multiclass T_ALU32_3op_A2<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev, bit IsComm> {
- let isPredicable = 1 in
- def A2_#NAME : T_ALU32_3op <mnemonic, MajOp, MinOp, OpsRev, IsComm>;
- defm A2_p#NAME : T_ALU32_3op_p<mnemonic, MajOp, MinOp, OpsRev>;
-}
-
-defm add : T_ALU32_3op_A2<"add", 0b011, 0b000, 0, 1>;
-defm and : T_ALU32_3op_A2<"and", 0b001, 0b000, 0, 1>;
-defm or : T_ALU32_3op_A2<"or", 0b001, 0b001, 0, 1>;
-defm sub : T_ALU32_3op_A2<"sub", 0b011, 0b001, 1, 0>;
-defm xor : T_ALU32_3op_A2<"xor", 0b001, 0b011, 0, 1>;
-
-// A few special cases producing register pairs:
-let OutOperandList = (outs DoubleRegs:$Rd), hasNewValue = 0 in {
- def S2_packhl : T_ALU32_3op <"packhl", 0b101, 0b100, 0, 0>;
-
- let isPredicable = 1 in
- def A2_combinew : T_ALU32_3op <"combine", 0b101, 0b000, 0, 0>;
-
- // Conditional combinew uses "newt/f" instead of "t/fnew".
- def C2_ccombinewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 0>;
- def C2_ccombinewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 0>;
- def C2_ccombinewnewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 1>;
- def C2_ccombinewnewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 1>;
-}
-
-let hasSideEffects = 0, hasNewValue = 1, isCompare = 1, InputType = "reg" in
-class T_ALU32_3op_cmp<string mnemonic, bits<2> MinOp, bit IsNeg, bit IsComm>
- : ALU32_rr<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#mnemonic#"($Rs, $Rt)",
- [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
- let CextOpcode = mnemonic;
- let isCommutable = IsComm;
- bits<5> Rs;
- bits<5> Rt;
- bits<2> Pd;
-
- let IClass = 0b1111;
- let Inst{27-24} = 0b0010;
- let Inst{22-21} = MinOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4} = IsNeg;
- let Inst{3-2} = 0b00;
- let Inst{1-0} = Pd;
-}
-
-let Itinerary = ALU32_3op_tc_2early_SLOT0123 in {
- def C2_cmpeq : T_ALU32_3op_cmp< "cmp.eq", 0b00, 0, 1>;
- def C2_cmpgt : T_ALU32_3op_cmp< "cmp.gt", 0b10, 0, 0>;
- def C2_cmpgtu : T_ALU32_3op_cmp< "cmp.gtu", 0b11, 0, 0>;
-}
-
-let CextOpcode = "MUX", InputType = "reg", hasNewValue = 1 in
-def C2_mux: ALU32_rr<(outs IntRegs:$Rd),
- (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = mux($Pu, $Rs, $Rt)", [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
- bits<5> Rt;
-
- let CextOpcode = "mux";
- let InputType = "reg";
- let hasSideEffects = 0;
- let IClass = 0b1111;
-
- let Inst{27-24} = 0b0100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rd;
-}
-
-// Combines the two immediates into a double register.
-// Increase the complexity so that it exceeds the complexity of any combine
-// pattern that involves a register.
-
-let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
- isExtentSigned = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 1,
- AddedComplexity = 75 in
-def A2_combineii: ALU32Inst <(outs DoubleRegs:$Rdd), (ins s8_0Ext:$s8, s8_0Imm:$S8),
- "$Rdd = combine(#$s8, #$S8)",
- []> {
- bits<5> Rdd;
- bits<8> s8;
- bits<8> S8;
-
- let IClass = 0b0111;
- let Inst{27-23} = 0b11000;
- let Inst{22-16} = S8{7-1};
- let Inst{13} = S8{0};
- let Inst{12-5} = s8;
- let Inst{4-0} = Rdd;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated ADD of a reg and an Immediate value.
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_Addri_Pred <bit PredNot, bit PredNew>
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
- !if(PredNot, "if (!$Pu", "if ($Pu")#!if(PredNew,".new) $Rd = ",
- ") $Rd = ")#"add($Rs, #$s8)"> {
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
- bits<8> s8;
-
- let isPredicatedNew = PredNew;
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0100;
- let Inst{23} = PredNot;
- let Inst{22-21} = Pu;
- let Inst{20-16} = Rs;
- let Inst{13} = PredNew;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
- }
-
-//===----------------------------------------------------------------------===//
-// A2_addi: Add a signed immediate to a register.
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_Addri <Operand immOp>
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, immOp:$s16),
- "$Rd = add($Rs, #$s16)", [], "", ALU32_ADDI_tc_1_SLOT0123> {
- bits<5> Rd;
- bits<5> Rs;
- bits<16> s16;
-
- let IClass = 0b1011;
-
- let Inst{27-21} = s16{15-9};
- let Inst{20-16} = Rs;
- let Inst{13-5} = s16{8-0};
- let Inst{4-0} = Rd;
- }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for ADD of a register and an immediate value.
-//===----------------------------------------------------------------------===//
-multiclass Addri_Pred<string mnemonic, bit PredNot> {
- let isPredicatedFalse = PredNot in {
- def NAME : T_Addri_Pred<PredNot, 0>;
- // Predicate new
- def NAME#new : T_Addri_Pred<PredNot, 1>;
- }
-}
-
-let isExtendable = 1, isExtentSigned = 1, InputType = "imm" in
-multiclass Addri_base<string mnemonic, SDNode OpNode> {
- let CextOpcode = mnemonic, BaseOpcode = mnemonic#_ri in {
- let opExtendable = 2, opExtentBits = 16, isPredicable = 1, isAdd = 1 in
- def A2_#NAME : T_Addri<s16_0Ext>;
-
- let opExtendable = 3, opExtentBits = 8, isPredicated = 1 in {
- defm A2_p#NAME#t : Addri_Pred<mnemonic, 0>;
- defm A2_p#NAME#f : Addri_Pred<mnemonic, 1>;
- }
- }
-}
-
-defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel;
-
-let hasNewValue = 1, hasSideEffects = 0, isPseudo = 1 in
-def A2_iconst
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins s23_2Imm:$s23_2),
- "$Rd = iconst(#$s23_2)"> {}
-
-//===----------------------------------------------------------------------===//
-// Template class used for the following ALU32 instructions.
-// Rd=and(Rs,#s10)
-// Rd=or(Rs,#s10)
-//===----------------------------------------------------------------------===//
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
-InputType = "imm", hasNewValue = 1 in
-class T_ALU32ri_logical <string mnemonic, SDNode OpNode, bits<2> MinOp>
- : ALU32_ri <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, s10_0Ext:$s10),
- "$Rd = "#mnemonic#"($Rs, #$s10)" ,
- []> {
- bits<5> Rd;
- bits<5> Rs;
- bits<10> s10;
- let CextOpcode = mnemonic;
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0110;
- let Inst{23-22} = MinOp;
- let Inst{21} = s10{9};
- let Inst{20-16} = Rs;
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Rd;
- }
-
-def A2_orir : T_ALU32ri_logical<"or", or, 0b10>, ImmRegRel;
-def A2_andir : T_ALU32ri_logical<"and", and, 0b00>, ImmRegRel;
-
-// Subtract register from immediate
-// Rd32=sub(#s10,Rs32)
-let isExtendable = 1, CextOpcode = "sub", opExtendable = 1, isExtentSigned = 1,
- opExtentBits = 10, InputType = "imm", hasNewValue = 1, hasSideEffects = 0 in
-def A2_subri: ALU32_ri <(outs IntRegs:$Rd), (ins s10_0Ext:$s10, IntRegs:$Rs),
- "$Rd = sub(#$s10, $Rs)", []>, ImmRegRel {
- bits<5> Rd;
- bits<10> s10;
- bits<5> Rs;
-
- let IClass = 0b0111;
-
- let Inst{27-22} = 0b011001;
- let Inst{21} = s10{9};
- let Inst{20-16} = Rs;
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Rd;
- }
-
-// Nop.
-let hasSideEffects = 0 in
-def A2_nop: ALU32Inst <(outs), (ins), "nop" > {
- let IClass = 0b0111;
- let Inst{27-24} = 0b1111;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_tfr16<bit isHi>
- : ALU32Inst <(outs IntRegs:$Rx), (ins IntRegs:$src1, u16_0Imm:$u16),
- "$Rx"#!if(isHi, ".h", ".l")#" = #$u16",
- [], "$src1 = $Rx" > {
- bits<5> Rx;
- bits<16> u16;
-
- let IClass = 0b0111;
- let Inst{27-26} = 0b00;
- let Inst{25-24} = !if(isHi, 0b10, 0b01);
- let Inst{23-22} = u16{15-14};
- let Inst{21} = 0b1;
- let Inst{20-16} = Rx;
- let Inst{13-0} = u16{13-0};
- }
-
-def A2_tfril: T_tfr16<0>;
-def A2_tfrih: T_tfr16<1>;
-
-// Conditional transfer is an alias of the conditional "Rd = add(Rs, #0)".
-let isPredicated = 1, hasNewValue = 1, opNewValue = 0 in
-class T_tfr_pred<bit isPredNot, bit isPredNew>
- : ALU32Inst<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2),
- "if ("#!if(isPredNot, "!", "")#
- "$src1"#!if(isPredNew, ".new", "")#
- ") $dst = $src2"> {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
-
- let isPredicatedFalse = isPredNot;
- let isPredicatedNew = isPredNew;
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0100;
- let Inst{23} = isPredNot;
- let Inst{13} = isPredNew;
- let Inst{12-5} = 0;
- let Inst{4-0} = dst;
- let Inst{22-21} = src1;
- let Inst{20-16} = src2;
- }
-
-let isPredicable = 1 in
-class T_tfr : ALU32Inst<(outs IntRegs:$dst), (ins IntRegs:$src),
- "$dst = $src"> {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b0111;
-
- let Inst{27-21} = 0b0000011;
- let Inst{20-16} = src;
- let Inst{13} = 0b0;
- let Inst{4-0} = dst;
- }
-
-let InputType = "reg", hasNewValue = 1, hasSideEffects = 0 in
-multiclass tfr_base<string CextOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp in {
- def NAME : T_tfr;
-
- // Predicate
- def t : T_tfr_pred<0, 0>;
- def f : T_tfr_pred<1, 0>;
- // Predicate new
- def tnew : T_tfr_pred<0, 1>;
- def fnew : T_tfr_pred<1, 1>;
- }
-}
-
-// Assembler mapped to C2_ccombinew[t|f|newt|newf].
-// Please don't add bits to this instruction as it'll be converted into
-// 'combine' before object code emission.
-let isPredicated = 1 in
-class T_tfrp_pred<bit PredNot, bit PredNew>
- : ALU32_rr <(outs DoubleRegs:$dst),
- (ins PredRegs:$src1, DoubleRegs:$src2),
- "if ("#!if(PredNot, "!", "")#"$src1"
- #!if(PredNew, ".new", "")#") $dst = $src2" > {
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = PredNew;
- }
-
-// Assembler mapped to A2_combinew.
-// Please don't add bits to this instruction as it'll be converted into
-// 'combine' before object code emission.
-class T_tfrp : ALU32Inst <(outs DoubleRegs:$dst),
- (ins DoubleRegs:$src),
- "$dst = $src">;
-
-let hasSideEffects = 0 in
-multiclass TFR64_base<string BaseName> {
- let BaseOpcode = BaseName in {
- let isPredicable = 1 in
- def NAME : T_tfrp;
- // Predicate
- def t : T_tfrp_pred <0, 0>;
- def f : T_tfrp_pred <1, 0>;
- // Predicate new
- def tnew : T_tfrp_pred <0, 1>;
- def fnew : T_tfrp_pred <1, 1>;
- }
-}
-
-let InputType = "imm", isExtendable = 1, isExtentSigned = 1, opExtentBits = 12,
- isMoveImm = 1, opExtendable = 2, BaseOpcode = "TFRI", CextOpcode = "TFR",
- hasSideEffects = 0, isPredicated = 1, hasNewValue = 1 in
-class T_TFRI_Pred<bit PredNot, bit PredNew>
- : ALU32_ri<(outs IntRegs:$Rd), (ins PredRegs:$Pu, s12_0Ext:$s12),
- "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") $Rd = #$s12",
- [], "", ALU32_2op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = PredNew;
-
- bits<5> Rd;
- bits<2> Pu;
- bits<12> s12;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b1110;
- let Inst{23} = PredNot;
- let Inst{22-21} = Pu;
- let Inst{20} = 0b0;
- let Inst{19-16,12-5} = s12;
- let Inst{13} = PredNew;
- let Inst{4-0} = Rd;
-}
-
-def C2_cmoveit : T_TFRI_Pred<0, 0>;
-def C2_cmoveif : T_TFRI_Pred<1, 0>;
-def C2_cmovenewit : T_TFRI_Pred<0, 1>;
-def C2_cmovenewif : T_TFRI_Pred<1, 1>;
-
-let InputType = "imm", isExtendable = 1, isExtentSigned = 1,
- CextOpcode = "TFR", BaseOpcode = "TFRI", hasNewValue = 1, opNewValue = 0,
- isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16, isMoveImm = 1,
- isPredicated = 0, isPredicable = 1, isReMaterializable = 1 in
-def A2_tfrsi : ALU32Inst<(outs IntRegs:$Rd), (ins s16_0Ext:$s16), "$Rd = #$s16",
- [], "", ALU32_2op_tc_1_SLOT0123>,
- ImmRegRel, PredRel {
- bits<5> Rd;
- bits<16> s16;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b1000;
- let Inst{23-22,20-16,13-5} = s16;
- let Inst{4-0} = Rd;
-}
-
-defm A2_tfr : tfr_base<"TFR">, ImmRegRel, PredNewRel;
-let isAsmParserOnly = 1 in
-defm A2_tfrp : TFR64_base<"TFR64">, PredNewRel;
-
-// Assembler mapped
-let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
- isAsmParserOnly = 1 in
-def A2_tfrpi : ALU64_rr<(outs DoubleRegs:$dst), (ins s8_0Imm64:$src1),
- "$dst = #$src1",
- []>;
-
-// TODO: See if this instruction can be deleted.
-let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
- isAsmParserOnly = 1 in {
-def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64_0Imm:$src1),
- "$dst = #$src1">;
-def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst),
- (ins s8_0Ext:$src1, s8_0Imm:$src2),
- "$dst = combine(##$src1, #$src2)">;
-}
-
-//===----------------------------------------------------------------------===//
-// ALU32/ALU -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM +
-//===----------------------------------------------------------------------===//
-// Scalar mux register immediate.
-let hasSideEffects = 0, isExtentSigned = 1, CextOpcode = "MUX",
- InputType = "imm", hasNewValue = 1, isExtendable = 1, opExtentBits = 8 in
-class T_MUX1 <bit MajOp, dag ins, string AsmStr>
- : ALU32Inst <(outs IntRegs:$Rd), ins, AsmStr>, ImmRegRel {
- bits<5> Rd;
- bits<2> Pu;
- bits<8> s8;
- bits<5> Rs;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b0011;
- let Inst{23} = MajOp;
- let Inst{22-21} = Pu;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
-}
-
-let opExtendable = 2 in
-def C2_muxri : T_MUX1<0b1, (ins PredRegs:$Pu, s8_0Ext:$s8, IntRegs:$Rs),
- "$Rd = mux($Pu, #$s8, $Rs)">;
-
-let opExtendable = 3 in
-def C2_muxir : T_MUX1<0b0, (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
- "$Rd = mux($Pu, $Rs, #$s8)">;
-
-// C2_muxii: Scalar mux immediates.
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1,
- opExtentBits = 8, opExtendable = 2 in
-def C2_muxii: ALU32Inst <(outs IntRegs:$Rd),
- (ins PredRegs:$Pu, s8_0Ext:$s8, s8_0Imm:$S8),
- "$Rd = mux($Pu, #$s8, #$S8)" ,
- []> {
- bits<5> Rd;
- bits<2> Pu;
- bits<8> s8;
- bits<8> S8;
-
- let IClass = 0b0111;
-
- let Inst{27-25} = 0b101;
- let Inst{24-23} = Pu;
- let Inst{22-16} = S8{7-1};
- let Inst{13} = S8{0};
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
- }
-
-let isCodeGenOnly = 1, isPseudo = 1 in
-def PS_pselect : ALU64_rr<(outs DoubleRegs:$Rd),
- (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
- ".error \"should not emit\" ", []>;
-
-
-//===----------------------------------------------------------------------===//
-// template class for non-predicated alu32_2op instructions
-// - aslh, asrh, sxtb, sxth, zxth
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, opNewValue = 0 in
-class T_ALU32_2op <string mnemonic, bits<3> minOp> :
- ALU32Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = "#mnemonic#"($Rs)", [] > {
- bits<5> Rd;
- bits<5> Rs;
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0000;
- let Inst{23-21} = minOp;
- let Inst{13} = 0b0;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
-}
-
-//===----------------------------------------------------------------------===//
-// template class for predicated alu32_2op instructions
-// - aslh, asrh, sxtb, sxth, zxtb, zxth
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_ALU32_2op_Pred <string mnemonic, bits<3> minOp, bit isPredNot,
- bit isPredNew > :
- ALU32Inst <(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs),
- !if(isPredNot, "if (!$Pu", "if ($Pu")
- #!if(isPredNew, ".new) ",") ")#"$Rd = "#mnemonic#"($Rs)"> {
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
-
- let IClass = 0b0111;
-
- let Inst{27-24} = 0b0000;
- let Inst{23-21} = minOp;
- let Inst{13} = 0b1;
- let Inst{11} = isPredNot;
- let Inst{10} = isPredNew;
- let Inst{4-0} = Rd;
- let Inst{9-8} = Pu;
- let Inst{20-16} = Rs;
-}
-
-multiclass ALU32_2op_Pred<string mnemonic, bits<3> minOp, bit PredNot> {
- let isPredicatedFalse = PredNot in {
- def NAME : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 0>;
-
- // Predicate new
- let isPredicatedNew = 1 in
- def NAME#new : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 1>;
- }
-}
-
-multiclass ALU32_2op_base<string mnemonic, bits<3> minOp> {
- let BaseOpcode = mnemonic in {
- let isPredicable = 1, hasSideEffects = 0 in
- def A2_#NAME : T_ALU32_2op<mnemonic, minOp>;
-
- let isPredicated = 1, hasSideEffects = 0 in {
- defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
- defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
- }
- }
-}
-
-defm aslh : ALU32_2op_base<"aslh", 0b000>, PredNewRel;
-defm asrh : ALU32_2op_base<"asrh", 0b001>, PredNewRel;
-defm sxtb : ALU32_2op_base<"sxtb", 0b101>, PredNewRel;
-defm sxth : ALU32_2op_base<"sxth", 0b111>, PredNewRel;
-defm zxth : ALU32_2op_base<"zxth", 0b110>, PredNewRel;
-
-// Rd=zxtb(Rs): assembler mapped to Rd=and(Rs,#255).
-// The compiler prefers to generate 'zxtb' instead of 'and' because 'zxtb'
-// has predicated forms while 'and' doesn't. Since the integrated assembler
-// can't handle 'mapped' instructions, we need to encode 'zxtb' the same way
-// as 'and', with the immediate operand set to 255.
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_ZXTB: ALU32Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = zxtb($Rs)", [] > { // Rd = and(Rs,255)
- bits<5> Rd;
- bits<5> Rs;
- bits<10> s10 = 255;
-
- let IClass = 0b0111;
-
- let Inst{27-22} = 0b011000;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
- let Inst{21} = s10{9};
- let Inst{13-5} = s10{8-0};
-}
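// Reading of the encoding above: with s10 hard-wired to 255, T_ZXTB emits
// bit-for-bit the same instruction word as "Rd = and(Rs, #255)" from
// A2_andir (IClass 0111, Inst{27-22} = 011000), which is what lets the
// integrated assembler encode zxtb without a separate mapping.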
-
-// Rd=zxtb(Rs): assembler mapped to "Rd=and(Rs,#255)".
-multiclass ZXTB_base <string mnemonic, bits<3> minOp> {
- let BaseOpcode = mnemonic in {
- let isPredicable = 1, hasSideEffects = 0 in
- def A2_#NAME : T_ZXTB;
-
- let isPredicated = 1, hasSideEffects = 0 in {
- defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
- defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
- }
- }
-}
-
-defm zxtb : ZXTB_base<"zxtb",0b100>, PredNewRel;
-
-//===----------------------------------------------------------------------===//
-// Template class for vector add and avg
-//===----------------------------------------------------------------------===//
-
-class T_VectALU_64 <string opc, bits<3> majOp, bits<3> minOp,
- bit isSat, bit isRnd, bit isCrnd, bit SwapOps >
- : ALU64_rr < (outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = "#opc#"($Rss, $Rtt)"#!if(isRnd, ":rnd", "")
- #!if(isCrnd,":crnd","")
- #!if(isSat, ":sat", ""),
- [], "", ALU64_tc_2_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b0011;
- let Inst{23-21} = majOp;
- let Inst{20-16} = !if (SwapOps, Rtt, Rss);
- let Inst{12-8} = !if (SwapOps, Rss, Rtt);
- let Inst{7-5} = minOp;
- let Inst{4-0} = Rdd;
- }
-
-// ALU64 - Vector add
-// Rdd=vadd[u][bhw](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vaddub : T_VectALU_64 < "vaddub", 0b000, 0b000, 0, 0, 0, 0>;
- def A2_vaddh : T_VectALU_64 < "vaddh", 0b000, 0b010, 0, 0, 0, 0>;
- def A2_vaddw : T_VectALU_64 < "vaddw", 0b000, 0b101, 0, 0, 0, 0>;
-}
-
-// Rdd=vadd[u][bhw](Rss,Rtt):sat
-let Defs = [USR_OVF] in {
- def A2_vaddubs : T_VectALU_64 < "vaddub", 0b000, 0b001, 1, 0, 0, 0>;
- def A2_vaddhs : T_VectALU_64 < "vaddh", 0b000, 0b011, 1, 0, 0, 0>;
- def A2_vadduhs : T_VectALU_64 < "vadduh", 0b000, 0b100, 1, 0, 0, 0>;
- def A2_vaddws : T_VectALU_64 < "vaddw", 0b000, 0b110, 1, 0, 0, 0>;
-}
-
-// ALU64 - Vector average
-// Rdd=vavg[u][bhw](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vavgub : T_VectALU_64 < "vavgub", 0b010, 0b000, 0, 0, 0, 0>;
- def A2_vavgh : T_VectALU_64 < "vavgh", 0b010, 0b010, 0, 0, 0, 0>;
- def A2_vavguh : T_VectALU_64 < "vavguh", 0b010, 0b101, 0, 0, 0, 0>;
- def A2_vavgw : T_VectALU_64 < "vavgw", 0b011, 0b000, 0, 0, 0, 0>;
- def A2_vavguw : T_VectALU_64 < "vavguw", 0b011, 0b011, 0, 0, 0, 0>;
-}
-
-// Rdd=vavg[u][bhw](Rss,Rtt)[:rnd|:crnd]
-def A2_vavgubr : T_VectALU_64 < "vavgub", 0b010, 0b001, 0, 1, 0, 0>;
-def A2_vavghr : T_VectALU_64 < "vavgh", 0b010, 0b011, 0, 1, 0, 0>;
-def A2_vavghcr : T_VectALU_64 < "vavgh", 0b010, 0b100, 0, 0, 1, 0>;
-def A2_vavguhr : T_VectALU_64 < "vavguh", 0b010, 0b110, 0, 1, 0, 0>;
-
-def A2_vavgwr : T_VectALU_64 < "vavgw", 0b011, 0b001, 0, 1, 0, 0>;
-def A2_vavgwcr : T_VectALU_64 < "vavgw", 0b011, 0b010, 0, 0, 1, 0>;
-def A2_vavguwr : T_VectALU_64 < "vavguw", 0b011, 0b100, 0, 1, 0, 0>;
-
-// Rdd=vnavg[bh](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vnavgh : T_VectALU_64 < "vnavgh", 0b100, 0b000, 0, 0, 0, 1>;
- def A2_vnavgw : T_VectALU_64 < "vnavgw", 0b100, 0b011, 0, 0, 0, 1>;
-}
-
-// Rdd=vnavg[bh](Rss,Rtt)[:rnd|:crnd]:sat
-let Defs = [USR_OVF] in {
- def A2_vnavghr : T_VectALU_64 < "vnavgh", 0b100, 0b001, 1, 1, 0, 1>;
- def A2_vnavghcr : T_VectALU_64 < "vnavgh", 0b100, 0b010, 1, 0, 1, 1>;
- def A2_vnavgwr : T_VectALU_64 < "vnavgw", 0b100, 0b100, 1, 1, 0, 1>;
- def A2_vnavgwcr : T_VectALU_64 < "vnavgw", 0b100, 0b110, 1, 0, 1, 1>;
-}
-
-// Rdd=vsub[u][bh](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
- def A2_vsubub : T_VectALU_64 < "vsubub", 0b001, 0b000, 0, 0, 0, 1>;
- def A2_vsubh : T_VectALU_64 < "vsubh", 0b001, 0b010, 0, 0, 0, 1>;
- def A2_vsubw : T_VectALU_64 < "vsubw", 0b001, 0b101, 0, 0, 0, 1>;
-}
-
-// Rdd=vsub[u][bh](Rss,Rtt):sat
-let Defs = [USR_OVF] in {
- def A2_vsububs : T_VectALU_64 < "vsubub", 0b001, 0b001, 1, 0, 0, 1>;
- def A2_vsubhs : T_VectALU_64 < "vsubh", 0b001, 0b011, 1, 0, 0, 1>;
- def A2_vsubuhs : T_VectALU_64 < "vsubuh", 0b001, 0b100, 1, 0, 0, 1>;
- def A2_vsubws : T_VectALU_64 < "vsubw", 0b001, 0b110, 1, 0, 0, 1>;
-}
-
-// Rdd=vmax[u][bhw](Rss,Rtt)
-def A2_vmaxb : T_VectALU_64 < "vmaxb", 0b110, 0b110, 0, 0, 0, 1>;
-def A2_vmaxub : T_VectALU_64 < "vmaxub", 0b110, 0b000, 0, 0, 0, 1>;
-def A2_vmaxh : T_VectALU_64 < "vmaxh", 0b110, 0b001, 0, 0, 0, 1>;
-def A2_vmaxuh : T_VectALU_64 < "vmaxuh", 0b110, 0b010, 0, 0, 0, 1>;
-def A2_vmaxw : T_VectALU_64 < "vmaxw", 0b110, 0b011, 0, 0, 0, 1>;
-def A2_vmaxuw : T_VectALU_64 < "vmaxuw", 0b101, 0b101, 0, 0, 0, 1>;
-
-// Rdd=vmin[u][bhw](Rss,Rtt)
-def A2_vminb : T_VectALU_64 < "vminb", 0b110, 0b111, 0, 0, 0, 1>;
-def A2_vminub : T_VectALU_64 < "vminub", 0b101, 0b000, 0, 0, 0, 1>;
-def A2_vminh : T_VectALU_64 < "vminh", 0b101, 0b001, 0, 0, 0, 1>;
-def A2_vminuh : T_VectALU_64 < "vminuh", 0b101, 0b010, 0, 0, 0, 1>;
-def A2_vminw : T_VectALU_64 < "vminw", 0b101, 0b011, 0, 0, 0, 1>;
-def A2_vminuw : T_VectALU_64 < "vminuw", 0b101, 0b100, 0, 0, 0, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for vector compare
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_vcmp <string Str, bits<4> minOp>
- : ALU64_rr <(outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Pd = "#Str#"($Rss, $Rtt)", [],
- "", ALU64_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00100;
- let Inst{13} = minOp{3};
- let Inst{7-5} = minOp{2-0};
- let Inst{1-0} = Pd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector compare bytes
-def A2_vcmpbeq : T_vcmp <"vcmpb.eq", 0b0110>;
-def A2_vcmpbgtu : T_vcmp <"vcmpb.gtu", 0b0111>;
-
-// Vector compare halfwords
-def A2_vcmpheq : T_vcmp <"vcmph.eq", 0b0011>;
-def A2_vcmphgt : T_vcmp <"vcmph.gt", 0b0100>;
-def A2_vcmphgtu : T_vcmp <"vcmph.gtu", 0b0101>;
-
-// Vector compare words
-def A2_vcmpweq : T_vcmp <"vcmpw.eq", 0b0000>;
-def A2_vcmpwgt : T_vcmp <"vcmpw.gt", 0b0001>;
-def A2_vcmpwgtu : T_vcmp <"vcmpw.gtu", 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PRED +
-//===----------------------------------------------------------------------===//
-// No bits needed. If cmp.ge is found, the assembler parser will
-// transform it into cmp.gt, subtracting 1 from the immediate.
-let isPseudo = 1 in {
-def C2_cmpgei: ALU32Inst <
- (outs PredRegs:$Pd), (ins IntRegs:$Rs, s8_0Ext:$s8),
- "$Pd = cmp.ge($Rs, #$s8)">;
-def C2_cmpgeui: ALU32Inst <
- (outs PredRegs:$Pd), (ins IntRegs:$Rs, u8_0Ext:$s8),
- "$Pd = cmp.geu($Rs, #$s8)">;
-}
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PRED -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU +
-//===----------------------------------------------------------------------===//
-// Add.
-//===----------------------------------------------------------------------===//
-// Template Class
-// Add/Subtract halfword
-// Rd=add(Rt.L,Rs.[HL])[:sat]
-// Rd=sub(Rt.L,Rs.[HL])[:sat]
-// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
-// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_XTYPE_ADD_SUB <bits<2> LHbits, bit isSat, bit hasShift, bit isSub>
- : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
- "$Rd = "#!if(isSub,"sub","add")#"($Rt."
- #!if(hasShift, !if(LHbits{1},"h","l"),"l") #", $Rs."
- #!if(hasShift, !if(LHbits{0},"h)","l)"), !if(LHbits{1},"h)","l)"))
- #!if(isSat,":sat","")
- #!if(hasShift,":<<16",""), [], "", ALU64_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rt;
- bits<5> Rs;
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b01010;
- let Inst{22} = hasShift;
- let Inst{21} = isSub;
- let Inst{7} = isSat;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rd;
- let Inst{12-8} = Rt;
- let Inst{20-16} = Rs;
- }
-
-//Rd=sub(Rt.L,Rs.[LH])
-def A2_subh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 1>;
-def A2_subh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 1>;
-
-//Rd=add(Rt.L,Rs.[LH])
-def A2_addh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 0>;
-def A2_addh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 0>;
-
-let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
- //Rd=sub(Rt.L,Rs.[LH]):sat
- def A2_subh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 1>;
- def A2_subh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 1>;
-
- //Rd=add(Rt.L,Rs.[LH]):sat
- def A2_addh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 0>;
- def A2_addh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 0>;
-}
-
-//Rd=sub(Rt.[LH],Rs.[LH]):<<16
-def A2_subh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 1>;
-def A2_subh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 1>;
-def A2_subh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 1>;
-def A2_subh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 1>;
-
-//Rd=add(Rt.[LH],Rs.[LH]):<<16
-def A2_addh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 0>;
-def A2_addh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 0>;
-def A2_addh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 0>;
-def A2_addh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 0>;
-
-let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
- //Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
- def A2_subh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 1>;
- def A2_subh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 1>;
- def A2_subh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 1>;
- def A2_subh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 1>;
-
- //Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
- def A2_addh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 0>;
- def A2_addh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 0>;
- def A2_addh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 0>;
- def A2_addh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 0>;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S2_parityp: ALU64Inst<(outs IntRegs:$Rd),
- (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0000;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, opNewValue = 0, hasSideEffects = 0 in
-class T_XTYPE_MIN_MAX < bit isMax, bit isUnsigned >
- : ALU64Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
- "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
- #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rt;
- bits<5> Rs;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b01011;
- let Inst{22-21} = !if(isMax, 0b10, 0b01);
- let Inst{7} = isUnsigned;
- let Inst{4-0} = Rd;
- let Inst{12-8} = !if(isMax, Rs, Rt);
- let Inst{20-16} = !if(isMax, Rt, Rs);
- }
-
-def A2_min : T_XTYPE_MIN_MAX < 0, 0 >;
-def A2_minu : T_XTYPE_MIN_MAX < 0, 1 >;
-def A2_max : T_XTYPE_MIN_MAX < 1, 0 >;
-def A2_maxu : T_XTYPE_MIN_MAX < 1, 1 >;
-
-class T_cmp64_rr<string mnemonic, bits<3> MinOp, bit IsComm>
- : ALU64_rr<(outs PredRegs:$Pd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", ALU64_tc_2early_SLOT23> {
- let isCompare = 1;
- let isCommutable = IsComm;
- let hasSideEffects = 0;
-
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0010100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{1-0} = Pd;
-}
-
-def C2_cmpeqp : T_cmp64_rr<"cmp.eq", 0b000, 1>;
-def C2_cmpgtp : T_cmp64_rr<"cmp.gt", 0b010, 0>;
-def C2_cmpgtup : T_cmp64_rr<"cmp.gtu", 0b100, 0>;
-
-def C2_vmux : ALU64_rr<(outs DoubleRegs:$Rd),
- (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Rd = vmux($Pu, $Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
- let hasSideEffects = 0;
-
- bits<5> Rd;
- bits<2> Pu;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0001;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rd;
-}
-
-class T_ALU64_rr<string mnemonic, string suffix, bits<4> RegType,
- bits<3> MajOp, bits<3> MinOp, bit OpsRev, bit IsComm,
- string Op2Pfx>
- : ALU64_rr<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
- "$Rd = " #mnemonic# "($Rs, " #Op2Pfx# "$Rt)" #suffix, [],
- "", ALU64_tc_1_SLOT23> {
- let hasSideEffects = 0;
- let isCommutable = IsComm;
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
-
- let IClass = 0b1101;
- let Inst{27-24} = RegType;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = !if (OpsRev,Rt,Rs);
- let Inst{12-8} = !if (OpsRev,Rs,Rt);
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-class T_ALU64_arith<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit IsSat,
- bit OpsRev, bit IsComm>
- : T_ALU64_rr<mnemonic, !if(IsSat,":sat",""), 0b0011, MajOp, MinOp, OpsRev,
- IsComm, "">;
-
-let isAdd = 1 in
-def A2_addp : T_ALU64_arith<"add", 0b000, 0b111, 0, 0, 1>;
-def A2_subp : T_ALU64_arith<"sub", 0b001, 0b111, 0, 1, 0>;
-
-class T_ALU64_logical<string mnemonic, bits<3> MinOp, bit OpsRev, bit IsComm,
- bit IsNeg>
- : T_ALU64_rr<mnemonic, "", 0b0011, 0b111, MinOp, OpsRev, IsComm,
- !if(IsNeg,"~","")>;
-
-def A2_andp : T_ALU64_logical<"and", 0b000, 0, 1, 0>;
-def A2_orp : T_ALU64_logical<"or", 0b010, 0, 1, 0>;
-def A2_xorp : T_ALU64_logical<"xor", 0b100, 0, 1, 0>;
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/BIT +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/PERM +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// CR +
-//===----------------------------------------------------------------------===//
-// Logical reductions on predicates.
-
-// Looping instructions.
-
-// Pipelined looping instructions.
-
-// Logical operations on predicates.
-let hasSideEffects = 0 in
-class T_LOGICAL_1OP<string MnOp, bits<2> OpBits>
- : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps),
- "$Pd = " # MnOp # "($Ps)", [], "", CR_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<2> Ps;
-
- let IClass = 0b0110;
- let Inst{27-23} = 0b10111;
- let Inst{22-21} = OpBits;
- let Inst{20} = 0b0;
- let Inst{17-16} = Ps;
- let Inst{13} = 0b0;
- let Inst{1-0} = Pd;
-}
-
-def C2_any8 : T_LOGICAL_1OP<"any8", 0b00>;
-def C2_all8 : T_LOGICAL_1OP<"all8", 0b01>;
-def C2_not : T_LOGICAL_1OP<"not", 0b10>;
-
-let hasSideEffects = 0 in
-class T_LOGICAL_2OP<string MnOp, bits<3> OpBits, bit IsNeg, bit Rev>
- : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps, PredRegs:$Pt),
- "$Pd = " # MnOp # "($Ps, " # !if (IsNeg,"!","") # "$Pt)",
- [], "", CR_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<2> Ps;
- bits<2> Pt;
-
- let IClass = 0b0110;
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = OpBits;
- let Inst{20} = 0b0;
- let Inst{17-16} = !if(Rev,Pt,Ps); // Rs and Rt are reversed for some
- let Inst{13} = 0b0; // instructions.
- let Inst{9-8} = !if(Rev,Ps,Pt);
- let Inst{1-0} = Pd;
-}
-
-def C2_and : T_LOGICAL_2OP<"and", 0b000, 0, 1>;
-def C2_or : T_LOGICAL_2OP<"or", 0b001, 0, 1>;
-def C2_xor : T_LOGICAL_2OP<"xor", 0b010, 0, 0>;
-def C2_andn : T_LOGICAL_2OP<"and", 0b011, 1, 1>;
-def C2_orn : T_LOGICAL_2OP<"or", 0b111, 1, 1>;
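-
-// With IsNeg set, the second operand is printed negated, so C2_andn above is
-// "$Pd = and($Ps, !$Pt)"; Rev additionally swaps the Ps/Pt encoding fields.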
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def C2_vitpack : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps, PredRegs:$Pt),
- "$Rd = vitpack($Ps, $Pt)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<2> Ps;
- bits<2> Pt;
-
- let IClass = 0b1000;
- let Inst{27-24} = 0b1001;
- let Inst{22-21} = 0b00;
- let Inst{17-16} = Ps;
- let Inst{9-8} = Pt;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def C2_mask : SInst<(outs DoubleRegs:$Rd), (ins PredRegs:$Pt),
- "$Rd = mask($Pt)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<2> Pt;
-
- let IClass = 0b1000;
- let Inst{27-24} = 0b0110;
- let Inst{9-8} = Pt;
- let Inst{4-0} = Rd;
-}
-
-// User control register transfer.
-//===----------------------------------------------------------------------===//
-// CR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// JR +
-//===----------------------------------------------------------------------===//
-
-class CondStr<string CReg, bit True, bit New> {
- string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
-}
-class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
- string S = Mnemonic # !if(Taken, ":t", ":nt");
-}
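-
-// Illustrative expansions of the string helpers above:
-//   CondStr<"$src", 0, 1>.S    --> "if (!$src.new) "
-//   JumpOpcStr<"jump", 1, 1>.S --> "jump:t"
-// so T_JMP_c<1, 1, 1, ""> below prints as "if (!$src.new) jump:t $dst".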
-
-let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
- isPredicable = 1,
- isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
- opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
-class T_JMP<string ExtStr>
- : JInst_CJUMP_UCJUMP<(outs), (ins brtarget:$dst),
- "jump " # ExtStr # "$dst",
- [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
- bits<24> dst;
- let IClass = 0b0101;
-
- let Inst{27-25} = 0b100;
- let Inst{24-16} = dst{23-15};
- let Inst{13-1} = dst{14-2};
-}
-
-let isBranch = 1, Defs = [PC], hasSideEffects = 0, isPredicated = 1,
- isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
- opExtentBits = 17, opExtentAlign = 2, InputType = "imm" in
-class T_JMP_c<bit PredNot, bit isPredNew, bit isTak, string ExtStr>
- : JInst_CJUMP_UCJUMP<(outs), (ins PredRegs:$src, brtarget:$dst),
- CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
- JumpOpcStr<"jump", isPredNew, isTak>.S # " " #
- ExtStr # "$dst",
- [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>, ImmRegRel {
- let isTaken = isTak;
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = isPredNew;
- bits<2> src;
- bits<17> dst;
-
- let IClass = 0b0101;
-
- let Inst{27-24} = 0b1100;
- let Inst{21} = PredNot;
- let Inst{12} = isTak;
- let Inst{11} = isPredNew;
- let Inst{9-8} = src;
- let Inst{23-22} = dst{16-15};
- let Inst{20-16} = dst{14-10};
- let Inst{13} = dst{9};
- let Inst{7-1} = dst{8-2};
- }
-
-multiclass JMP_Pred<bit PredNot, string ExtStr> {
- def NAME : T_JMP_c<PredNot, 0, 0, ExtStr>; // not taken
- // Predicate new
- def NAME#newpt : T_JMP_c<PredNot, 1, 1, ExtStr>; // taken
- def NAME#new : T_JMP_c<PredNot, 1, 0, ExtStr>; // not taken
-}
-
-multiclass JMP_base<string BaseOp, string ExtStr> {
- let BaseOpcode = BaseOp in {
- def NAME : T_JMP<ExtStr>;
- defm t : JMP_Pred<0, ExtStr>;
- defm f : JMP_Pred<1, ExtStr>;
- }
-}
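-
-// For reference, "defm J2_jump : JMP_base<"JMP", "">" further below expands
-// to the unconditional J2_jump plus the predicated variants J2_jumpt,
-// J2_jumptnew, J2_jumptnewpt, J2_jumpf, J2_jumpfnew and J2_jumpfnewpt.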
-
-// Jumps to address stored in a register, JUMPR_MISC
-// if ([[!]P[.new]]) jumpr[:t/nt] Rs
-let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
- isPredicable = 1, hasSideEffects = 0, InputType = "reg" in
-class T_JMPr
- : JRInst<(outs), (ins IntRegs:$dst),
- "jumpr $dst", [], "", J_tc_2early_SLOT2> {
- bits<5> dst;
-
- let IClass = 0b0101;
- let Inst{27-21} = 0b0010100;
- let Inst{20-16} = dst;
-}
-
-let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
- hasSideEffects = 0, InputType = "reg" in
-class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
- : JRInst <(outs), (ins PredRegs:$src, IntRegs:$dst),
- CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
- JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst", [],
- "", J_tc_2early_SLOT2> {
-
- let isTaken = isTak;
- let isPredicatedFalse = PredNot;
- let isPredicatedNew = isPredNew;
- bits<2> src;
- bits<5> dst;
-
- let IClass = 0b0101;
-
- let Inst{27-22} = 0b001101;
- let Inst{21} = PredNot;
- let Inst{20-16} = dst;
- let Inst{12} = isTak;
- let Inst{11} = isPredNew;
- let Inst{9-8} = src;
-}
-
-multiclass JMPR_Pred<bit PredNot> {
- def NAME : T_JMPr_c<PredNot, 0, 0>; // not taken
- // Predicate new
- def NAME#newpt : T_JMPr_c<PredNot, 1, 1>; // taken
- def NAME#new : T_JMPr_c<PredNot, 1, 0>; // not taken
-}
-
-multiclass JMPR_base<string BaseOp> {
- let BaseOpcode = BaseOp in {
- def NAME : T_JMPr;
- defm t : JMPR_Pred<0>;
- defm f : JMPR_Pred<1>;
- }
-}
-
-let isCall = 1, hasSideEffects = 1 in
-class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
- dag InputDag = (ins IntRegs:$Rs)>
- : JRInst<(outs), InputDag,
- !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
- "if ($Pu) callr $Rs"),
- "callr $Rs"),
- [], "", J_tc_2early_SLOT2> {
- bits<5> Rs;
- bits<2> Pu;
- let isPredicated = isPred;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b0101;
- let Inst{27-25} = 0b000;
- let Inst{24-23} = !if (isPred, 0b10, 0b01);
- let Inst{22} = 0;
- let Inst{21} = isPredNot;
- let Inst{9-8} = !if (isPred, Pu, 0b00);
- let Inst{20-16} = Rs;
-
- }
-
-let Defs = VolatileV3.Regs in {
- def J2_callrt : JUMPR_MISC_CALLR<1, 0, (ins PredRegs:$Pu, IntRegs:$Rs)>;
- def J2_callrf : JUMPR_MISC_CALLR<1, 1, (ins PredRegs:$Pu, IntRegs:$Rs)>;
-}
-
-let isTerminator = 1, hasSideEffects = 0 in {
- defm J2_jump : JMP_base<"JMP", "">, PredNewRel;
-
- defm J2_jumpr : JMPR_base<"JMPr">, PredNewRel;
-
- let isReturn = 1, isPseudo = 1, isCodeGenOnly = 1 in
- defm PS_jmpret : JMPR_base<"JMPret">, PredNewRel;
-}
-
-let validSubTargets = HasV60SubT in
-multiclass JMPpt_base<string BaseOp> {
- let BaseOpcode = BaseOp in {
- def tpt : T_JMP_c <0, 0, 1, "">; // Predicate true - taken
- def fpt : T_JMP_c <1, 0, 1, "">; // Predicate false - taken
- }
-}
-
-let validSubTargets = HasV60SubT in
-multiclass JMPRpt_base<string BaseOp> {
- let BaseOpcode = BaseOp in {
- def tpt : T_JMPr_c<0, 0, 1>; // predicate true - taken
- def fpt : T_JMPr_c<1, 0, 1>; // predicate false - taken
- }
-}
-
-defm J2_jumpr : JMPRpt_base<"JMPr">;
-defm J2_jump : JMPpt_base<"JMP">;
-
-// A return through builtin_eh_return.
-let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
- isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
-def EH_RETURN_JMPR : T_JMPr;
-
-//===----------------------------------------------------------------------===//
-// JR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// LD +
-//===----------------------------------------------------------------------===//
-
-// Load - Base with Immediate offset addressing mode
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, AddedComplexity = 20 in
-class T_load_io <string mnemonic, RegisterClass RC, bits<4> MajOp,
- Operand ImmOp>
- : LDInst<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
- "$dst = "#mnemonic#"($src1 + #$offset)", []>, AddrModeRel {
- bits<4> name;
- bits<5> dst;
- bits<5> src1;
- bits<14> offset;
- bits<11> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), offset{13-3},
- !if (!eq(ImmOpStr, "s11_2Ext"), offset{12-2},
- !if (!eq(ImmOpStr, "s11_1Ext"), offset{11-1},
- /* s11_0Ext */ offset{10-0})));
- let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
- !if (!eq(ImmOpStr, "s11_2Ext"), 13,
- !if (!eq(ImmOpStr, "s11_1Ext"), 12,
- /* s11_0Ext */ 11)));
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13-5} = offsetBits{8-0};
- let Inst{4-0} = dst;
- }
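-
-// Worked example: for "$dst = memw($src1 + #40)" the word variant uses
-// s11_2Ext, so offsetBits = offset{12-2} = 10 (the low two bits of the byte
-// offset are implied zero) and opExtentBits is 13 for the extended form.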
-
-let opExtendable = 3, isExtentSigned = 0, isPredicated = 1 in
-class T_pload_io <string mnemonic, RegisterClass RC, bits<4>MajOp,
- Operand ImmOp, bit isNot, bit isPredNew>
- : LDInst<(outs RC:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- "if ("#!if(isNot, "!$src1", "$src1")
- #!if(isPredNew, ".new", "")
- #") $dst = "#mnemonic#"($src2 + #$offset)",
- [],"", V2LDST_tc_ld_SLOT01> , AddrModeRel {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<9> offset;
- bits<6> offsetBits;
- string ImmOpStr = !cast<string>(ImmOp);
-
- let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), offset{8-3},
- !if (!eq(ImmOpStr, "u6_2Ext"), offset{7-2},
- !if (!eq(ImmOpStr, "u6_1Ext"), offset{6-1},
- /* u6_0Ext */ offset{5-0})));
- let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
- !if (!eq(ImmOpStr, "u6_2Ext"), 8,
- !if (!eq(ImmOpStr, "u6_1Ext"), 7,
- /* u6_0Ext */ 6)));
- let hasNewValue = !if (!eq(ImmOpStr, "u6_3Ext"), 0, 1);
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isNot;
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b0;
- let Inst{26} = isNot;
- let Inst{25} = isPredNew;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b0;
- let Inst{12-11} = src1;
- let Inst{10-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-let isExtendable = 1, hasSideEffects = 0, addrMode = BaseImmOffset in
-multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, Operand predImmOp, bits<4>MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
- let isPredicable = 1 in
- def L2_#NAME#_io : T_load_io <mnemonic, RC, MajOp, ImmOp>;
-
- // Predicated
- def L2_p#NAME#t_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 0>;
- def L2_p#NAME#f_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 0>;
-
- // Predicated new
- def L2_p#NAME#tnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 1>;
- def L2_p#NAME#fnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 1>;
- }
-}
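-
-// For example, "defm loadrb" below yields L2_loadrb_io plus the predicated
-// forms L2_ploadrbt_io, L2_ploadrbf_io, L2_ploadrbtnew_io and
-// L2_ploadrbfnew_io.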
-
-let accessSize = ByteAccess in {
- defm loadrb: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext, 0b1000>;
- defm loadrub: LD_Idxd <"memub", "LDriub", IntRegs, s11_0Ext, u6_0Ext, 0b1001>;
-}
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- defm loadrh: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext, 0b1010>;
- defm loadruh: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext, 0b1011>;
-}
-
-let accessSize = WordAccess, opExtentAlign = 2 in
-defm loadri: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext, 0b1100>;
-
-let accessSize = DoubleWordAccess, opExtentAlign = 3 in
-defm loadrd: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext, 0b1110>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- def L2_loadbsw2_io: T_load_io<"membh", IntRegs, 0b0001, s11_1Ext>;
- def L2_loadbzw2_io: T_load_io<"memubh", IntRegs, 0b0011, s11_1Ext>;
-}
-
-let accessSize = WordAccess, opExtentAlign = 2 in {
- def L2_loadbzw4_io: T_load_io<"memubh", DoubleRegs, 0b0101, s11_2Ext>;
- def L2_loadbsw4_io: T_load_io<"membh", DoubleRegs, 0b0111, s11_2Ext>;
-}
-
-let addrMode = BaseImmOffset, isExtendable = 1, hasSideEffects = 0,
- opExtendable = 3, isExtentSigned = 1 in
-class T_loadalign_io <string str, bits<4> MajOp, Operand ImmOp>
- : LDInst<(outs DoubleRegs:$dst),
- (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- "$dst = "#str#"($src2 + #$offset)", [],
- "$src1 = $dst">, AddrModeRel {
- bits<4> name;
- bits<5> dst;
- bits<5> src2;
- bits<12> offset;
- bits<11> offsetBits;
-
- let offsetBits = !if (!eq(!cast<string>(ImmOp), "s11_1Ext"), offset{11-1},
- /* s11_0Ext */ offset{10-0});
- let IClass = 0b1001;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13-5} = offsetBits{8-0};
- let Inst{4-0} = dst;
- }
-
-let accessSize = HalfWordAccess, opExtentBits = 12, opExtentAlign = 1 in
-def L2_loadalignh_io: T_loadalign_io <"memh_fifo", 0b0010, s11_1Ext>;
-
-let accessSize = ByteAccess, opExtentBits = 11 in
-def L2_loadalignb_io: T_loadalign_io <"memb_fifo", 0b0100, s11_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// Post increment load
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp >
- : LDInstPI <(outs RC:$dst, IntRegs:$dst2),
- (ins IntRegs:$src1, ImmOp:$offset),
- "$dst = "#mnemonic#"($src1++#$offset)" ,
- [],
- "$src1 = $dst2" > ,
- PredNewRel {
- bits<5> dst;
- bits<5> src1;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13-12} = 0b00;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_pload_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp, bit isPredNot, bit isPredNew >
- : LDInst <(outs RC:$dst, IntRegs:$dst2),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
- [] ,
- "$src2 = $dst2" > ,
- PredNewRel {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<7> offset;
- bits<4> offsetBits;
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b1;
- let Inst{12} = isPredNew;
- let Inst{11} = isPredNot;
- let Inst{10-9} = src1;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-
-multiclass LD_PostInc <string mnemonic, string BaseOp, RegisterClass RC,
- Operand ImmOp, bits<4> MajOp> {
- let BaseOpcode = "POST_"#BaseOp in {
- let isPredicable = 1 in
- def L2_#NAME#_pi : T_load_pi < mnemonic, RC, ImmOp, MajOp>;
-
- // Predicated
- def L2_p#NAME#t_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 0>;
- def L2_p#NAME#f_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 0>;
-
- // Predicated new
- def L2_p#NAME#tnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 1>;
- def L2_p#NAME#fnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 1>;
- }
-}
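-
-// For example, "defm loadrb" below yields L2_loadrb_pi, printed as
-// "$dst = memb($src1++#$offset)", plus predicated forms such as
-// L2_ploadrbtnew_pi, printed as "if ($src1.new) $dst = memb($src2++#$offset)".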
-
-// post increment byte loads with immediate offset
-let accessSize = ByteAccess in {
- defm loadrb : LD_PostInc <"memb", "LDrib", IntRegs, s4_0Imm, 0b1000>;
- defm loadrub : LD_PostInc <"memub", "LDriub", IntRegs, s4_0Imm, 0b1001>;
-}
-
-// post increment halfword loads with immediate offset
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- defm loadrh : LD_PostInc <"memh", "LDrih", IntRegs, s4_1Imm, 0b1010>;
- defm loadruh : LD_PostInc <"memuh", "LDriuh", IntRegs, s4_1Imm, 0b1011>;
-}
-
-// post increment word loads with immediate offset
-let accessSize = WordAccess, opExtentAlign = 2 in
-defm loadri : LD_PostInc <"memw", "LDriw", IntRegs, s4_2Imm, 0b1100>;
-
-// post increment doubleword loads with immediate offset
-let accessSize = DoubleWordAccess, opExtentAlign = 3 in
-defm loadrd : LD_PostInc <"memd", "LDrid", DoubleRegs, s4_3Imm, 0b1110>;
-
-// Rd=memb[u]h(Rx++#s4:1)
-// Rdd=memb[u]h(Rx++#s4:2)
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
- def L2_loadbsw2_pi : T_load_pi <"membh", IntRegs, s4_1Imm, 0b0001>;
- def L2_loadbzw2_pi : T_load_pi <"memubh", IntRegs, s4_1Imm, 0b0011>;
-}
-let accessSize = WordAccess, opExtentAlign = 2, hasNewValue = 0 in {
- def L2_loadbsw4_pi : T_load_pi <"membh", DoubleRegs, s4_2Imm, 0b0111>;
- def L2_loadbzw4_pi : T_load_pi <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment fifo loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pi <string mnemonic, Operand ImmOp, bits<4> MajOp >
- : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$dst2),
- (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
- "$dst = "#mnemonic#"($src2++#$offset)" ,
- [], "$src2 = $dst2, $src1 = $dst" > ,
- PredNewRel {
- bits<5> dst;
- bits<5> src2;
- bits<5> offset;
- bits<4> offsetBits;
-
- let offsetBits = !if (!eq(!cast<string>(ImmOp), "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0});
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13-12} = 0b00;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-// Ryy=memh_fifo(Rx++#s4:1)
-// Ryy=memb_fifo(Rx++#s4:0)
-let accessSize = ByteAccess in
-def L2_loadalignb_pi : T_loadalign_pi <"memb_fifo", s4_0Imm, 0b0100>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in
-def L2_loadalignh_pi : T_loadalign_pi <"memh_fifo", s4_1Imm, 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment loads with register offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pr <string mnemonic, RegisterClass RC, bits<4> MajOp,
- MemAccessSize AccessSz>
- : LDInstPI <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2),
- "$dst = "#mnemonic#"($src1++$src2)" ,
- [], "$src1 = $_dst_" > {
- bits<5> dst;
- bits<5> src1;
- bits<1> src2;
-
- let accessSize = AccessSz;
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b110;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2;
- let Inst{12} = 0b0;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1 in {
- def L2_loadrb_pr : T_load_pr <"memb", IntRegs, 0b1000, ByteAccess>;
- def L2_loadrub_pr : T_load_pr <"memub", IntRegs, 0b1001, ByteAccess>;
- def L2_loadrh_pr : T_load_pr <"memh", IntRegs, 0b1010, HalfWordAccess>;
- def L2_loadruh_pr : T_load_pr <"memuh", IntRegs, 0b1011, HalfWordAccess>;
- def L2_loadri_pr : T_load_pr <"memw", IntRegs, 0b1100, WordAccess>;
-
- def L2_loadbzw2_pr : T_load_pr <"memubh", IntRegs, 0b0011, HalfWordAccess>;
-}
-
-def L2_loadrd_pr : T_load_pr <"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
-def L2_loadbzw4_pr : T_load_pr <"memubh", DoubleRegs, 0b0101, WordAccess>;
-
-// Load predicate.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_pred : LDInst<(outs PredRegs:$dst),
- (ins IntRegs:$addr, s11_2Ext:$off),
- ".error \"should not emit\"", []>;
-// Load modifier.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_mod : LDInst<(outs ModRegs:$dst),
- (ins IntRegs:$addr, s11_2Ext:$off),
- ".error \"should not emit\"", []>;
-
-let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
- def L2_deallocframe : LDInst<(outs), (ins),
- "deallocframe",
- []> {
- let IClass = 0b1001;
-
- let Inst{27-16} = 0b000000011110;
- let Inst{13} = 0b0;
- let Inst{4-0} = 0b11110;
-}
-
-// Load / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
-class T_load_pcr<string mnemonic, RegisterClass RC, bits<4> MajOp>
- : LDInst <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
- "$Rz = $_dst_" > {
- bits<5> dst;
- bits<5> Rz;
- bit Mu;
-
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{9} = 0b1;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
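-
-// The "++ I:circ($Mu)" form advances $Rz within a circular buffer whose
-// parameters live in the modifier/CS register pair, which is why these
-// instructions carry "Uses = [CS]" above.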
-
-let accessSize = ByteAccess in {
- def L2_loadrb_pcr : T_load_pcr <"memb", IntRegs, 0b1000>;
- def L2_loadrub_pcr : T_load_pcr <"memub", IntRegs, 0b1001>;
-}
-
-let accessSize = HalfWordAccess in {
- def L2_loadrh_pcr : T_load_pcr <"memh", IntRegs, 0b1010>;
- def L2_loadruh_pcr : T_load_pcr <"memuh", IntRegs, 0b1011>;
- def L2_loadbsw2_pcr : T_load_pcr <"membh", IntRegs, 0b0001>;
- def L2_loadbzw2_pcr : T_load_pcr <"memubh", IntRegs, 0b0011>;
-}
-
-let accessSize = WordAccess in {
- def L2_loadri_pcr : T_load_pcr <"memw", IntRegs, 0b1100>;
- let hasNewValue = 0 in {
- def L2_loadbzw4_pcr : T_load_pcr <"memubh", DoubleRegs, 0b0101>;
- def L2_loadbsw4_pcr : T_load_pcr <"membh", DoubleRegs, 0b0111>;
- }
-}
-
-let accessSize = DoubleWordAccess in
-def L2_loadrd_pcr : T_load_pcr <"memd", DoubleRegs, 0b1110>;
-
-// Load / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pcr<string mnemonic, bits<4> MajOp, MemAccessSize AccessSz >
- : LDInst <(outs DoubleRegs:$dst, IntRegs:$_dst_),
- (ins DoubleRegs:$_src_, IntRegs:$Rz, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
- "$Rz = $_dst_, $dst = $_src_" > {
- bits<5> dst;
- bits<5> Rz;
- bit Mu;
-
- let accessSize = AccessSz;
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{9} = 0b1;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
-def L2_loadalignb_pcr : T_loadalign_pcr <"memb_fifo", 0b0100, ByteAccess>;
-def L2_loadalignh_pcr : T_loadalign_pcr <"memh_fifo", 0b0010, HalfWordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Circular loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let Uses = [CS], mayLoad = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_load_pci <string mnemonic, RegisterClass RC,
- Operand ImmOp, bits<4> MajOp>
- : LDInstPI<(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$Rz, ImmOp:$offset, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ #$offset:circ($Mu))", [],
- "$Rz = $_dst_"> {
- bits<5> dst;
- bits<5> Rz;
- bits<1> Mu;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- let IClass = 0b1001;
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{9} = 0b0;
- let Inst{8-5} = offsetBits;
- let Inst{4-0} = dst;
- }
-
-// Byte variants of circ load
-let accessSize = ByteAccess in {
- def L2_loadrb_pci : T_load_pci <"memb", IntRegs, s4_0Imm, 0b1000>;
- def L2_loadrub_pci : T_load_pci <"memub", IntRegs, s4_0Imm, 0b1001>;
-}
-
-// Half word variants of circ load
-let accessSize = HalfWordAccess in {
- def L2_loadrh_pci : T_load_pci <"memh", IntRegs, s4_1Imm, 0b1010>;
- def L2_loadruh_pci : T_load_pci <"memuh", IntRegs, s4_1Imm, 0b1011>;
- def L2_loadbzw2_pci : T_load_pci <"memubh", IntRegs, s4_1Imm, 0b0011>;
- def L2_loadbsw2_pci : T_load_pci <"membh", IntRegs, s4_1Imm, 0b0001>;
-}
-
-// Word variants of circ load
-let accessSize = WordAccess in
-def L2_loadri_pci : T_load_pci <"memw", IntRegs, s4_2Imm, 0b1100>;
-
-let accessSize = WordAccess, hasNewValue = 0 in {
- def L2_loadbzw4_pci : T_load_pci <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
- def L2_loadbsw4_pci : T_load_pci <"membh", DoubleRegs, s4_2Imm, 0b0111>;
-}
-
-let accessSize = DoubleWordAccess, hasNewValue = 0 in
-def L2_loadrd_pci : T_load_pci <"memd", DoubleRegs, s4_3Imm, 0b1110>;
-
-
-// TODO: memb_fifo and memh_fifo must take destination register as input.
-// One-off circ loads - not enough in common to break into a class.
-let accessSize = ByteAccess in
-def L2_loadalignb_pci : T_load_pci <"memb_fifo", DoubleRegs, s4_0Imm, 0b0100>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in
-def L2_loadalignh_pci : T_load_pci <"memh_fifo", DoubleRegs, s4_1Imm, 0b0010>;
-
-// L[24]_load[wd]_locked: Load word/double with lock.
-let isSoloAX = 1 in
-class T_load_locked <string mnemonic, RegisterClass RC>
- : LD0Inst <(outs RC:$dst),
- (ins IntRegs:$src),
- "$dst = "#mnemonic#"($src)"> {
- bits<5> dst;
- bits<5> src;
- let IClass = 0b1001;
- let Inst{27-21} = 0b0010000;
- let Inst{20-16} = src;
- let Inst{13-12} = !if (!eq(mnemonic, "memd_locked"), 0b01, 0b00);
- let Inst{5} = 0;
- let Inst{4-0} = dst;
-}
-let hasNewValue = 1, accessSize = WordAccess, opNewValue = 0 in
- def L2_loadw_locked : T_load_locked <"memw_locked", IntRegs>;
-let accessSize = DoubleWordAccess in
- def L4_loadd_locked : T_load_locked <"memd_locked", DoubleRegs>;
-
-// S[24]_store[wd]_locked: Store word/double conditionally.
-let isSoloAX = 1, isPredicateLate = 1 in
-class T_store_locked <string mnemonic, RegisterClass RC>
- : ST0Inst <(outs PredRegs:$Pd), (ins IntRegs:$Rs, RC:$Rt),
- mnemonic#"($Rs, $Pd) = $Rt"> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1010;
- let Inst{27-23} = 0b00001;
- let Inst{22} = !if (!eq(mnemonic, "memw_locked"), 0b0, 0b1);
- let Inst{21} = 0b1;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{1-0} = Pd;
-}
-
-let accessSize = WordAccess in
-def S2_storew_locked : T_store_locked <"memw_locked", IntRegs>;
-
-let accessSize = DoubleWordAccess in
-def S4_stored_locked : T_store_locked <"memd_locked", DoubleRegs>;
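-
-// Editorial sketch of the usual LL/SC retry loop these enable (register
-// numbers are illustrative, not from this file):
-//   loop:  r1 = memw_locked(r0)        // load word and set the lock
-//          r1 = add(r1, #1)
-//          memw_locked(r0, p0) = r1    // store succeeds only if still locked
-//          if (!p0) jump loop          // otherwise retry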
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed loads with auto-increment register
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pbr<string mnemonic, RegisterClass RC,
- MemAccessSize addrSize, bits<4> majOp>
- : LDInst
- <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu),
- "$dst = "#mnemonic#"($Rz ++ $Mu:brev)" ,
- [] , "$Rz = $_dst_" > {
-
- let accessSize = addrSize;
-
- bits<5> dst;
- bits<5> Rz;
- bits<1> Mu;
-
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b111;
- let Inst{24-21} = majOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12} = 0b0;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1, opNewValue = 0 in {
- def L2_loadrb_pbr : T_load_pbr <"memb", IntRegs, ByteAccess, 0b1000>;
- def L2_loadrub_pbr : T_load_pbr <"memub", IntRegs, ByteAccess, 0b1001>;
- def L2_loadrh_pbr : T_load_pbr <"memh", IntRegs, HalfWordAccess, 0b1010>;
- def L2_loadruh_pbr : T_load_pbr <"memuh", IntRegs, HalfWordAccess, 0b1011>;
- def L2_loadbsw2_pbr : T_load_pbr <"membh", IntRegs, HalfWordAccess, 0b0001>;
- def L2_loadbzw2_pbr : T_load_pbr <"memubh", IntRegs, HalfWordAccess, 0b0011>;
- def L2_loadri_pbr : T_load_pbr <"memw", IntRegs, WordAccess, 0b1100>;
-}
-
-def L2_loadbzw4_pbr : T_load_pbr <"memubh", DoubleRegs, WordAccess, 0b0101>;
-def L2_loadbsw4_pbr : T_load_pbr <"membh", DoubleRegs, WordAccess, 0b0111>;
-def L2_loadrd_pbr : T_load_pbr <"memd", DoubleRegs, DoubleWordAccess, 0b1110>;
-
-def L2_loadalignb_pbr :T_load_pbr <"memb_fifo", DoubleRegs, ByteAccess, 0b0100>;
-def L2_loadalignh_pbr :T_load_pbr <"memh_fifo", DoubleRegs,
- HalfWordAccess, 0b0010>;
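-
-// The ":brev" form applies the $Mu increment to $Rz in bit-reversed fashion,
-// typically used for FFT-style access patterns.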
-
-//===----------------------------------------------------------------------===//
-// LD -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/ALU +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/COMPLEX +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/COMPLEX -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYH +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multiply signed/unsigned halfwords
-//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_M2_mpy < bits<2> LHbits, bit isSat, bit isRnd,
- bit hasShift, bit isUnsigned>
- : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1","")
- #!if(isRnd,":rnd","")
- #!if(isSat,":sat",""),
- [], "", M_tc_3x_SLOT23 > {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1100;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isRnd;
- let Inst{7} = isSat;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
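-
-// Illustrative expansion of the asm string above: M2_mpy_lh_s1 (LHbits=0b01,
-// hasShift=1) prints as "$Rd = mpy($Rs.l, $Rt.h):<<1", and M2_mpyu_hh_s0 as
-// "$Rd = mpyu($Rs.h, $Rt.h)".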
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 0>;
-def M2_mpy_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 0>;
-def M2_mpy_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 0>;
-def M2_mpy_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 0>;
-def M2_mpy_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 0>;
-def M2_mpy_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 0>;
-def M2_mpy_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 0>;
-def M2_mpy_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 0>;
-
-//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 1>;
-def M2_mpyu_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 1>;
-def M2_mpyu_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 1>;
-def M2_mpyu_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 1>;
-def M2_mpyu_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 1>;
-def M2_mpyu_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 1>;
-def M2_mpyu_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 1>;
-def M2_mpyu_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 1>;
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]:rnd
-def M2_mpy_rnd_ll_s1: T_M2_mpy <0b00, 0, 1, 1, 0>;
-def M2_mpy_rnd_ll_s0: T_M2_mpy <0b00, 0, 1, 0, 0>;
-def M2_mpy_rnd_lh_s1: T_M2_mpy <0b01, 0, 1, 1, 0>;
-def M2_mpy_rnd_lh_s0: T_M2_mpy <0b01, 0, 1, 0, 0>;
-def M2_mpy_rnd_hl_s1: T_M2_mpy <0b10, 0, 1, 1, 0>;
-def M2_mpy_rnd_hl_s0: T_M2_mpy <0b10, 0, 1, 0, 0>;
-def M2_mpy_rnd_hh_s1: T_M2_mpy <0b11, 0, 1, 1, 0>;
-def M2_mpy_rnd_hh_s0: T_M2_mpy <0b11, 0, 1, 0, 0>;
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
-let Defs = [USR_OVF] in {
- def M2_mpy_sat_ll_s1: T_M2_mpy <0b00, 1, 0, 1, 0>;
- def M2_mpy_sat_ll_s0: T_M2_mpy <0b00, 1, 0, 0, 0>;
- def M2_mpy_sat_lh_s1: T_M2_mpy <0b01, 1, 0, 1, 0>;
- def M2_mpy_sat_lh_s0: T_M2_mpy <0b01, 1, 0, 0, 0>;
- def M2_mpy_sat_hl_s1: T_M2_mpy <0b10, 1, 0, 1, 0>;
- def M2_mpy_sat_hl_s0: T_M2_mpy <0b10, 1, 0, 0, 0>;
- def M2_mpy_sat_hh_s1: T_M2_mpy <0b11, 1, 0, 1, 0>;
- def M2_mpy_sat_hh_s0: T_M2_mpy <0b11, 1, 0, 0, 0>;
-
- def M2_mpy_sat_rnd_ll_s1: T_M2_mpy <0b00, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_ll_s0: T_M2_mpy <0b00, 1, 1, 0, 0>;
- def M2_mpy_sat_rnd_lh_s1: T_M2_mpy <0b01, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_lh_s0: T_M2_mpy <0b01, 1, 1, 0, 0>;
- def M2_mpy_sat_rnd_hl_s1: T_M2_mpy <0b10, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_hl_s0: T_M2_mpy <0b10, 1, 1, 0, 0>;
- def M2_mpy_sat_rnd_hh_s1: T_M2_mpy <0b11, 1, 1, 1, 0>;
- def M2_mpy_sat_rnd_hh_s0: T_M2_mpy <0b11, 1, 1, 0, 0>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multiply signed/unsigned halfwords and add the result to, or
-// subtract it from, the accumulator.
-//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_M2_mpy_acc < bits<2> LHbits, bit isSat, bit isNac,
- bit hasShift, bit isUnsigned >
- : MInst_acc<(outs IntRegs:$Rx), (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
- #"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1","")
- #!if(isSat,":sat",""),
- [], "$dst2 = $Rx", M_tc_3x_SLOT23 > {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
- let Inst{27-24} = 0b1110;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isNac;
- let Inst{7} = isSat;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
-
-//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 0>;
-def M2_mpy_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 0>;
-def M2_mpy_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 0>;
-def M2_mpy_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 0>;
-def M2_mpy_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 0>;
-def M2_mpy_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 0>;
-def M2_mpy_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 0>;
-def M2_mpy_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 0>;
-
-//Rx += mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 1>;
-def M2_mpyu_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 1>;
-def M2_mpyu_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 1>;
-def M2_mpyu_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 1>;
-def M2_mpyu_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 1>;
-def M2_mpyu_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 1>;
-def M2_mpyu_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 1>;
-def M2_mpyu_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 1>;
-
-//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 0>;
-def M2_mpy_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 0>;
-def M2_mpy_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 0>;
-def M2_mpy_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 0>;
-def M2_mpy_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 0>;
-def M2_mpy_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 0>;
-def M2_mpy_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 0>;
-def M2_mpy_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 0>;
-
-//Rx -= mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 1>;
-def M2_mpyu_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 1>;
-def M2_mpyu_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 1>;
-def M2_mpyu_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 1>;
-def M2_mpyu_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 1>;
-def M2_mpyu_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 1>;
-def M2_mpyu_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 1>;
-def M2_mpyu_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 1>;
-
-//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
-def M2_mpy_acc_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 0, 0, 0>;
-
-//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
-def M2_mpy_nac_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 1, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multiply signed/unsigned halfwords and add the result to, or
-// subtract it from, the 64-bit destination register.
-//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-class T_M2_mpyd_acc < bits<2> LHbits, bit isNac, bit hasShift, bit isUnsigned>
- : MInst_acc<(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rxx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
- #"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1",""),
- [], "$dst2 = $Rxx", M_tc_3x_SLOT23 > {
- bits<5> Rxx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0110;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isNac;
- let Inst{7} = 0;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
-
-def M2_mpyd_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 0>;
-def M2_mpyd_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 0>;
-def M2_mpyd_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 0>;
-def M2_mpyd_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 0>;
-
-def M2_mpyd_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 0>;
-def M2_mpyd_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 0>;
-def M2_mpyd_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 0>;
-def M2_mpyd_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 0>;
-
-def M2_mpyd_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 0>;
-def M2_mpyd_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 0>;
-def M2_mpyd_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 0>;
-def M2_mpyd_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 0>;
-
-def M2_mpyd_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 0>;
-def M2_mpyd_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 0>;
-def M2_mpyd_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 0>;
-def M2_mpyd_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 0>;
-
-def M2_mpyud_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 1>;
-def M2_mpyud_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 1>;
-def M2_mpyud_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 1>;
-def M2_mpyud_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 1>;
-
-def M2_mpyud_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 1>;
-def M2_mpyud_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 1>;
-def M2_mpyud_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 1>;
-def M2_mpyud_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 1>;
-
-def M2_mpyud_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 1>;
-def M2_mpyud_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 1>;
-def M2_mpyud_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 1>;
-def M2_mpyud_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 1>;
-
-def M2_mpyud_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 1>;
-def M2_mpyud_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 1>;
-def M2_mpyud_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 1>;
-def M2_mpyud_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Vector Multiply
-// Used for complex multiply real or imaginary, dual multiply and even halfwords
-//===----------------------------------------------------------------------===//
-class T_M2_vmpy < string opc, bits<3> MajOp, bits<3> MinOp, bit hasShift,
- bit isRnd, bit isSat >
- : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
- #!if(isRnd,":rnd","")
- #!if(isSat,":sat",""),
- [] > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
-let Defs = [USR_OVF] in {
-def M2_vcmpy_s1_sat_i: T_M2_vmpy <"vcmpyi", 0b110, 0b110, 1, 0, 1>;
-def M2_vcmpy_s0_sat_i: T_M2_vmpy <"vcmpyi", 0b010, 0b110, 0, 0, 1>;
-
-// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
-def M2_vcmpy_s1_sat_r: T_M2_vmpy <"vcmpyr", 0b101, 0b110, 1, 0, 1>;
-def M2_vcmpy_s0_sat_r: T_M2_vmpy <"vcmpyr", 0b001, 0b110, 0, 0, 1>;
-
-// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
-def M2_vdmpys_s1: T_M2_vmpy <"vdmpy", 0b100, 0b100, 1, 0, 1>;
-def M2_vdmpys_s0: T_M2_vmpy <"vdmpy", 0b000, 0b100, 0, 0, 1>;
-
-// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
-def M2_vmpy2es_s1: T_M2_vmpy <"vmpyeh", 0b100, 0b110, 1, 0, 1>;
-def M2_vmpy2es_s0: T_M2_vmpy <"vmpyeh", 0b000, 0b110, 0, 0, 1>;
-
-//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyh_s0: T_M2_vmpy <"vmpywoh", 0b000, 0b111, 0, 0, 1>;
-def M2_mmpyh_s1: T_M2_vmpy <"vmpywoh", 0b100, 0b111, 1, 0, 1>;
-def M2_mmpyh_rs0: T_M2_vmpy <"vmpywoh", 0b001, 0b111, 0, 1, 1>;
-def M2_mmpyh_rs1: T_M2_vmpy <"vmpywoh", 0b101, 0b111, 1, 1, 1>;
-
-//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyl_s0: T_M2_vmpy <"vmpyweh", 0b000, 0b101, 0, 0, 1>;
-def M2_mmpyl_s1: T_M2_vmpy <"vmpyweh", 0b100, 0b101, 1, 0, 1>;
-def M2_mmpyl_rs0: T_M2_vmpy <"vmpyweh", 0b001, 0b101, 0, 1, 1>;
-def M2_mmpyl_rs1: T_M2_vmpy <"vmpyweh", 0b101, 0b101, 1, 1, 1>;
-
-//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyuh_s0: T_M2_vmpy <"vmpywouh", 0b010, 0b111, 0, 0, 1>;
-def M2_mmpyuh_s1: T_M2_vmpy <"vmpywouh", 0b110, 0b111, 1, 0, 1>;
-def M2_mmpyuh_rs0: T_M2_vmpy <"vmpywouh", 0b011, 0b111, 0, 1, 1>;
-def M2_mmpyuh_rs1: T_M2_vmpy <"vmpywouh", 0b111, 0b111, 1, 1, 1>;
-
-//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyul_s0: T_M2_vmpy <"vmpyweuh", 0b010, 0b101, 0, 0, 1>;
-def M2_mmpyul_s1: T_M2_vmpy <"vmpyweuh", 0b110, 0b101, 1, 0, 1>;
-def M2_mmpyul_rs0: T_M2_vmpy <"vmpyweuh", 0b011, 0b101, 0, 1, 1>;
-def M2_mmpyul_rs1: T_M2_vmpy <"vmpyweuh", 0b111, 0b101, 1, 1, 1>;
-}
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_MType_mpy <string mnemonic, bits<4> RegTyBits, RegisterClass RC,
- bits<3> MajOp, bits<3> MinOp, bit isSat = 0, bit isRnd = 0,
- string op2Suffix = "", bit isRaw = 0, bit isHi = 0 >
- : MInst <(outs IntRegs:$dst), (ins RC:$src1, RC:$src2),
- "$dst = "#mnemonic
- #"($src1, $src2"#op2Suffix#")"
- #!if(MajOp{2}, ":<<1", "")
- #!if(isRnd, ":rnd", "")
- #!if(isSat, ":sat", "")
- #!if(isRaw, !if(isHi, ":raw:hi", ":raw:lo"), ""), [] > {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = 0b0;
- let Inst{12-8} = src2;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_MType_vrcmpy <string mnemonic, bits<3> MajOp, bits<3> MinOp, bit isHi>
- : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, 1, 1, "", 1, isHi>;
-
-class T_MType_dd <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0 >
- : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, isSat, isRnd>;
-
-class T_MType_rr1 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0 >
- : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd>;
-
-class T_MType_rr2 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0, string op2str = "" >
- : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd, op2str>;
-
-def M2_vradduh : T_MType_dd <"vradduh", 0b000, 0b001, 0, 0>;
-def M2_vdmpyrs_s0 : T_MType_dd <"vdmpy", 0b000, 0b000, 1, 1>;
-def M2_vdmpyrs_s1 : T_MType_dd <"vdmpy", 0b100, 0b000, 1, 1>;
-
-let CextOpcode = "mpyi", InputType = "reg" in
-def M2_mpyi : T_MType_rr1 <"mpyi", 0b000, 0b000>, ImmRegRel;
-
-def M2_mpy_up : T_MType_rr1 <"mpy", 0b000, 0b001>;
-def M2_mpyu_up : T_MType_rr1 <"mpyu", 0b010, 0b001>;
-
-def M2_dpmpyss_rnd_s0 : T_MType_rr1 <"mpy", 0b001, 0b001, 0, 1>;
-
-def M2_vmpy2s_s0pack : T_MType_rr1 <"vmpyh", 0b001, 0b111, 1, 1>;
-def M2_vmpy2s_s1pack : T_MType_rr1 <"vmpyh", 0b101, 0b111, 1, 1>;
-
-def M2_hmmpyh_rs1 : T_MType_rr2 <"mpy", 0b101, 0b100, 1, 1, ".h">;
-def M2_hmmpyl_rs1 : T_MType_rr2 <"mpy", 0b111, 0b100, 1, 1, ".l">;
-
-def M2_cmpyrs_s0 : T_MType_rr2 <"cmpy", 0b001, 0b110, 1, 1>;
-def M2_cmpyrs_s1 : T_MType_rr2 <"cmpy", 0b101, 0b110, 1, 1>;
-def M2_cmpyrsc_s0 : T_MType_rr2 <"cmpy", 0b011, 0b110, 1, 1, "*">;
-def M2_cmpyrsc_s1 : T_MType_rr2 <"cmpy", 0b111, 0b110, 1, 1, "*">;
-
-// V4 Instructions
-def M2_vraddh : T_MType_dd <"vraddh", 0b001, 0b111, 0>;
-def M2_mpysu_up : T_MType_rr1 <"mpysu", 0b011, 0b001, 0>;
-def M2_mpy_up_s1 : T_MType_rr1 <"mpy", 0b101, 0b010, 0>;
-def M2_mpy_up_s1_sat : T_MType_rr1 <"mpy", 0b111, 0b000, 1>;
-
-def M2_hmmpyh_s1 : T_MType_rr2 <"mpy", 0b101, 0b000, 1, 0, ".h">;
-def M2_hmmpyl_s1 : T_MType_rr2 <"mpy", 0b101, 0b001, 1, 0, ".l">;
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_MType_mpy_ri <bit isNeg, Operand ImmOp, list<dag> pattern>
- : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, ImmOp:$u8),
- "$Rd ="#!if(isNeg, "- ", "+ ")#"mpyi($Rs, #$u8)" ,
- pattern, "", M_tc_3x_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<8> u8;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0000;
- let Inst{23} = isNeg;
- let Inst{13} = 0b0;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rs;
- let Inst{12-5} = u8;
- }
-
-let isExtendable = 1, opExtentBits = 8, opExtendable = 2 in
-def M2_mpysip : T_MType_mpy_ri <0, u8_0Ext, []>;
-
-def M2_mpysin : T_MType_mpy_ri <1, u8_0Imm, []>;
-
-// Assembler mapped to M2_mpyi
-let isAsmParserOnly = 1 in
-def M2_mpyui : MInst<(outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2),
- "$dst = mpyui($src1, $src2)">;
-
-// Rd=mpyi(Rs,#m9)
-// s9 is NOT the same as m9, but it works... so far.
-// Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8)
-// depending on the value of m9. See Arch Spec.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
- CextOpcode = "mpyi", InputType = "imm", hasNewValue = 1,
- isAsmParserOnly = 1 in
-def M2_mpysmi : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9_0Ext:$src2),
- "$dst = mpyi($src1, #$src2)", []>, ImmRegRel;
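-
-// For example (editorial), "$dst = mpyi($src1, #-3)" is accepted here and is
-// emitted through M2_mpysin with the magnitude #3.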
-
-let hasNewValue = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 3,
- InputType = "imm" in
-class T_MType_acc_ri <string mnemonic, bits<3> MajOp, Operand ImmOp,
- list<dag> pattern = []>
- : MInst < (outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, ImmOp:$src3),
- "$dst "#mnemonic#"($src2, #$src3)",
- pattern, "$src1 = $dst", M_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src2;
- bits<8> src3;
-
- let IClass = 0b1110;
-
- let Inst{27-26} = 0b00;
- let Inst{25-23} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b0;
- let Inst{12-5} = src3;
- let Inst{4-0} = dst;
- }
-
-let InputType = "reg", hasNewValue = 1 in
-class T_MType_acc_rr <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSwap = 0, list<dag> pattern = [], bit hasNot = 0,
- bit isSat = 0, bit isShift = 0>
- : MInst < (outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "$dst "#mnemonic#"($src2, "#!if(hasNot, "~$src3)","$src3)")
- #!if(isShift, ":<<1", "")
- #!if(isSat, ":sat", ""),
- pattern, "$src1 = $dst", M_tc_2_SLOT23 > {
- bits<5> dst;
- bits<5> src2;
- bits<5> src3;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = !if(isSwap, src3, src2);
- let Inst{13} = 0b0;
- let Inst{12-8} = !if(isSwap, src2, src3);
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
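-
-// e.g. M2_acci below prints as "$dst += add($src2, $src3)" and M2_xor_xacc
-// as "$dst ^= xor($src2, $src3)".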
-
-let CextOpcode = "MPYI_acc", Itinerary = M_tc_3x_SLOT23 in {
- def M2_macsip : T_MType_acc_ri <"+= mpyi", 0b010, u8_0Ext, []>, ImmRegRel;
-
- def M2_maci : T_MType_acc_rr <"+= mpyi", 0b000, 0b000, 0, []>, ImmRegRel;
-}
-
-let CextOpcode = "ADD_acc" in {
- let isExtentSigned = 1 in
- def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8_0Ext, []>, ImmRegRel;
-
- def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0, []>, ImmRegRel;
-}
-
-let CextOpcode = "SUB_acc" in {
- let isExtentSigned = 1 in
- def M2_naccii : T_MType_acc_ri <"-= add", 0b101, s8_0Ext>, ImmRegRel;
-
- def M2_nacci : T_MType_acc_rr <"-= add", 0b100, 0b001, 0>, ImmRegRel;
-}
-
-let Itinerary = M_tc_3x_SLOT23 in
-def M2_macsin : T_MType_acc_ri <"-= mpyi", 0b011, u8_0Ext>;
-
-def M2_xor_xacc : T_MType_acc_rr < "^= xor", 0b100, 0b011, 0>;
-def M2_subacc : T_MType_acc_rr <"+= sub", 0b000, 0b011, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- XType Vector Instructions
-//===----------------------------------------------------------------------===//
-class T_XTYPE_Vect < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
- : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
- [] > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-class T_XTYPE_Vect_acc < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
- : MInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd += "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
- [], "$dst2 = $Rdd",M_tc_3x_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-class T_XTYPE_Vect_diff < bits<3> MajOp, string opc >
- : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rtt, DoubleRegs:$Rss),
- "$Rdd = "#opc#"($Rtt, $Rss)",
- [], "",M_tc_2_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector reduce add unsigned bytes: Rdd32=vrmpybu(Rss32,Rtt32)
-def A2_vraddub: T_XTYPE_Vect <"vraddub", 0b010, 0b001, 0>;
-def A2_vraddub_acc: T_XTYPE_Vect_acc <"vraddub", 0b010, 0b001, 0>;
-
-// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
-def A2_vrsadub: T_XTYPE_Vect <"vrsadub", 0b010, 0b010, 0>;
-def A2_vrsadub_acc: T_XTYPE_Vect_acc <"vrsadub", 0b010, 0b010, 0>;
-
-// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
-def M2_vabsdiffh: T_XTYPE_Vect_diff<0b011, "vabsdiffh">;
-
-// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
-def M2_vabsdiffw: T_XTYPE_Vect_diff<0b001, "vabsdiffw">;
-
-// Vector reduce complex multiply real or imaginary:
-// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
-def M2_vrcmpyi_s0: T_XTYPE_Vect <"vrcmpyi", 0b000, 0b000, 0>;
-def M2_vrcmpyi_s0c: T_XTYPE_Vect <"vrcmpyi", 0b010, 0b000, 1>;
-def M2_vrcmaci_s0: T_XTYPE_Vect_acc <"vrcmpyi", 0b000, 0b000, 0>;
-def M2_vrcmaci_s0c: T_XTYPE_Vect_acc <"vrcmpyi", 0b010, 0b000, 1>;
-
-def M2_vrcmpyr_s0: T_XTYPE_Vect <"vrcmpyr", 0b000, 0b001, 0>;
-def M2_vrcmpyr_s0c: T_XTYPE_Vect <"vrcmpyr", 0b011, 0b001, 1>;
-def M2_vrcmacr_s0: T_XTYPE_Vect_acc <"vrcmpyr", 0b000, 0b001, 0>;
-def M2_vrcmacr_s0c: T_XTYPE_Vect_acc <"vrcmpyr", 0b011, 0b001, 1>;
-
-// Vector reduce halfwords:
-// Rdd[+]=vrmpyh(Rss,Rtt)
-def M2_vrmpy_s0: T_XTYPE_Vect <"vrmpyh", 0b000, 0b010, 0>;
-def M2_vrmac_s0: T_XTYPE_Vect_acc <"vrmpyh", 0b000, 0b010, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Vector Multiply with accumulation.
-// Used for complex multiply real or imaginary, dual multiply and even halfwords
-//===----------------------------------------------------------------------===//
-let Defs = [USR_OVF] in
-class T_M2_vmpy_acc_sat < string opc, bits<3> MajOp, bits<3> MinOp,
- bit hasShift, bit isRnd >
- : MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
- #!if(isRnd,":rnd","")#":sat",
- [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-class T_M2_vmpy_acc < string opc, bits<3> MajOp, bits<3> MinOp,
- bit hasShift, bit isRnd >
- : MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
- #!if(isRnd,":rnd",""),
- [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// Vector multiply word by signed half with accumulation
-// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmacls_s1: T_M2_vmpy_acc_sat <"vmpyweh", 0b100, 0b101, 1, 0>;
-def M2_mmacls_s0: T_M2_vmpy_acc_sat <"vmpyweh", 0b000, 0b101, 0, 0>;
-def M2_mmacls_rs1: T_M2_vmpy_acc_sat <"vmpyweh", 0b101, 0b101, 1, 1>;
-def M2_mmacls_rs0: T_M2_vmpy_acc_sat <"vmpyweh", 0b001, 0b101, 0, 1>;
-
-def M2_mmachs_s1: T_M2_vmpy_acc_sat <"vmpywoh", 0b100, 0b111, 1, 0>;
-def M2_mmachs_s0: T_M2_vmpy_acc_sat <"vmpywoh", 0b000, 0b111, 0, 0>;
-def M2_mmachs_rs1: T_M2_vmpy_acc_sat <"vmpywoh", 0b101, 0b111, 1, 1>;
-def M2_mmachs_rs0: T_M2_vmpy_acc_sat <"vmpywoh", 0b001, 0b111, 0, 1>;
-
-// Vector multiply word by unsigned half with accumulation
-// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmaculs_s1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b110, 0b101, 1, 0>;
-def M2_mmaculs_s0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b010, 0b101, 0, 0>;
-def M2_mmaculs_rs1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b111, 0b101, 1, 1>;
-def M2_mmaculs_rs0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b011, 0b101, 0, 1>;
-
-def M2_mmacuhs_s1: T_M2_vmpy_acc_sat <"vmpywouh", 0b110, 0b111, 1, 0>;
-def M2_mmacuhs_s0: T_M2_vmpy_acc_sat <"vmpywouh", 0b010, 0b111, 0, 0>;
-def M2_mmacuhs_rs1: T_M2_vmpy_acc_sat <"vmpywouh", 0b111, 0b111, 1, 1>;
-def M2_mmacuhs_rs0: T_M2_vmpy_acc_sat <"vmpywouh", 0b011, 0b111, 0, 1>;
-
-// Vector multiply even halfwords with accumulation
-// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
-def M2_vmac2es: T_M2_vmpy_acc <"vmpyeh", 0b001, 0b010, 0, 0>;
-def M2_vmac2es_s1: T_M2_vmpy_acc_sat <"vmpyeh", 0b100, 0b110, 1, 0>;
-def M2_vmac2es_s0: T_M2_vmpy_acc_sat <"vmpyeh", 0b000, 0b110, 0, 0>;
-
-// Vector dual multiply with accumulation
-// Rxx+=vdmpy(Rss,Rtt)[:sat]
-def M2_vdmacs_s1: T_M2_vmpy_acc_sat <"vdmpy", 0b100, 0b100, 1, 0>;
-def M2_vdmacs_s0: T_M2_vmpy_acc_sat <"vdmpy", 0b000, 0b100, 0, 0>;
-
-// Vector complex multiply real or imaginary with accumulation
-// Rxx+=vcmpy[ir](Rss,Rtt):sat
-def M2_vcmac_s0_sat_r: T_M2_vmpy_acc_sat <"vcmpyr", 0b001, 0b100, 0, 0>;
-def M2_vcmac_s0_sat_i: T_M2_vmpy_acc_sat <"vcmpyi", 0b010, 0b100, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Multiply signed/unsigned halfwords with and without
-// saturation and rounding
-//===----------------------------------------------------------------------===//
-class T_M2_mpyd < bits<2> LHbits, bit isRnd, bit hasShift, bit isUnsigned >
- : MInst < (outs DoubleRegs:$Rdd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rdd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
- #", $Rt."#!if(LHbits{0},"h)","l)")
- #!if(hasShift,":<<1","")
- #!if(isRnd,":rnd",""),
- [] > {
- bits<5> Rdd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0100;
- let Inst{23} = hasShift;
- let Inst{22} = isUnsigned;
- let Inst{21} = isRnd;
- let Inst{6-5} = LHbits;
- let Inst{4-0} = Rdd;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
-}
-
-def M2_mpyd_hh_s0: T_M2_mpyd<0b11, 0, 0, 0>;
-def M2_mpyd_hl_s0: T_M2_mpyd<0b10, 0, 0, 0>;
-def M2_mpyd_lh_s0: T_M2_mpyd<0b01, 0, 0, 0>;
-def M2_mpyd_ll_s0: T_M2_mpyd<0b00, 0, 0, 0>;
-
-def M2_mpyd_hh_s1: T_M2_mpyd<0b11, 0, 1, 0>;
-def M2_mpyd_hl_s1: T_M2_mpyd<0b10, 0, 1, 0>;
-def M2_mpyd_lh_s1: T_M2_mpyd<0b01, 0, 1, 0>;
-def M2_mpyd_ll_s1: T_M2_mpyd<0b00, 0, 1, 0>;
-
-def M2_mpyd_rnd_hh_s0: T_M2_mpyd<0b11, 1, 0, 0>;
-def M2_mpyd_rnd_hl_s0: T_M2_mpyd<0b10, 1, 0, 0>;
-def M2_mpyd_rnd_lh_s0: T_M2_mpyd<0b01, 1, 0, 0>;
-def M2_mpyd_rnd_ll_s0: T_M2_mpyd<0b00, 1, 0, 0>;
-
-def M2_mpyd_rnd_hh_s1: T_M2_mpyd<0b11, 1, 1, 0>;
-def M2_mpyd_rnd_hl_s1: T_M2_mpyd<0b10, 1, 1, 0>;
-def M2_mpyd_rnd_lh_s1: T_M2_mpyd<0b01, 1, 1, 0>;
-def M2_mpyd_rnd_ll_s1: T_M2_mpyd<0b00, 1, 1, 0>;
-
-// Rdd=mpyu(Rs.[HL],Rt.[HL])[:<<1]
-def M2_mpyud_hh_s0: T_M2_mpyd<0b11, 0, 0, 1>;
-def M2_mpyud_hl_s0: T_M2_mpyd<0b10, 0, 0, 1>;
-def M2_mpyud_lh_s0: T_M2_mpyd<0b01, 0, 0, 1>;
-def M2_mpyud_ll_s0: T_M2_mpyd<0b00, 0, 0, 1>;
-
-def M2_mpyud_hh_s1: T_M2_mpyd<0b11, 0, 1, 1>;
-def M2_mpyud_hl_s1: T_M2_mpyd<0b10, 0, 1, 1>;
-def M2_mpyud_lh_s1: T_M2_mpyd<0b01, 0, 1, 1>;
-def M2_mpyud_ll_s1: T_M2_mpyd<0b00, 0, 1, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class for xtype mpy:
-// Vector multiply
-// Complex multiply
-// Multiply 32x32 and use the full result
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_XTYPE_mpy64 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat, bit hasShift, bit isConj>
- : MInst <(outs DoubleRegs:$Rdd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rdd = "#mnemonic#"($Rs, $Rt"#!if(isConj,"*)",")")
- #!if(hasShift,":<<1","")
- #!if(isSat,":sat",""),
- [] > {
- bits<5> Rdd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- }
-
-//===----------------------------------------------------------------------===//
-// Template Class for xtype mpy with accumulation into 64-bit:
-// Vector multiply
-// Complex multiply
-// Multiply 32x32 and use the full result
-//===----------------------------------------------------------------------===//
-class T_XTYPE_mpy64_acc <string op1, string op2, bits<3> MajOp, bits<3> MinOp,
- bit isSat, bit hasShift, bit isConj>
- : MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rxx "#op2#"= "#op1#"($Rs, $Rt"#!if(isConj,"*)",")")
- #!if(hasShift,":<<1","")
- #!if(isSat,":sat",""),
- [], "$dst2 = $Rxx" > {
- bits<5> Rxx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b0111;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rxx;
- }
-
-// MPY - Multiply and use full result
-// Rdd = mpy[u](Rs,Rt)
-def M2_dpmpyss_s0 : T_XTYPE_mpy64 < "mpy", 0b000, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_s0 : T_XTYPE_mpy64 < "mpyu", 0b010, 0b000, 0, 0, 0>;
-
-// Rxx[+-]= mpy[u](Rs,Rt)
-def M2_dpmpyss_acc_s0 : T_XTYPE_mpy64_acc < "mpy", "+", 0b000, 0b000, 0, 0, 0>;
-def M2_dpmpyss_nac_s0 : T_XTYPE_mpy64_acc < "mpy", "-", 0b001, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_acc_s0 : T_XTYPE_mpy64_acc < "mpyu", "+", 0b010, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_nac_s0 : T_XTYPE_mpy64_acc < "mpyu", "-", 0b011, 0b000, 0, 0, 0>;
-
-// Complex multiply real or imaginary
-// Rdd=cmpy[ir](Rs,Rt)
-def M2_cmpyi_s0 : T_XTYPE_mpy64 < "cmpyi", 0b000, 0b001, 0, 0, 0>;
-def M2_cmpyr_s0 : T_XTYPE_mpy64 < "cmpyr", 0b000, 0b010, 0, 0, 0>;
-
-// Rxx+=cmpy[ir](Rs,Rt)
-def M2_cmaci_s0 : T_XTYPE_mpy64_acc < "cmpyi", "+", 0b000, 0b001, 0, 0, 0>;
-def M2_cmacr_s0 : T_XTYPE_mpy64_acc < "cmpyr", "+", 0b000, 0b010, 0, 0, 0>;
-
-// Complex multiply
-// Rdd=cmpy(Rs,Rt)[:<<1]:sat
-def M2_cmpys_s0 : T_XTYPE_mpy64 < "cmpy", 0b000, 0b110, 1, 0, 0>;
-def M2_cmpys_s1 : T_XTYPE_mpy64 < "cmpy", 0b100, 0b110, 1, 1, 0>;
-
-// Rdd=cmpy(Rs,Rt*)[:<<1]:sat
-def M2_cmpysc_s0 : T_XTYPE_mpy64 < "cmpy", 0b010, 0b110, 1, 0, 1>;
-def M2_cmpysc_s1 : T_XTYPE_mpy64 < "cmpy", 0b110, 0b110, 1, 1, 1>;
-
-// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
-def M2_cmacs_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b000, 0b110, 1, 0, 0>;
-def M2_cnacs_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b000, 0b111, 1, 0, 0>;
-def M2_cmacs_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b100, 0b110, 1, 1, 0>;
-def M2_cnacs_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b100, 0b111, 1, 1, 0>;
-
-// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
-def M2_cmacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b010, 0b110, 1, 0, 1>;
-def M2_cnacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b010, 0b111, 1, 0, 1>;
-def M2_cmacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b110, 0b110, 1, 1, 1>;
-def M2_cnacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b110, 0b111, 1, 1, 1>;
-
-// Vector multiply halfwords
-// Rdd=vmpyh(Rs,Rt)[:<<1]:sat
-//let Defs = [USR_OVF] in {
- def M2_vmpy2s_s1 : T_XTYPE_mpy64 < "vmpyh", 0b100, 0b101, 1, 1, 0>;
- def M2_vmpy2s_s0 : T_XTYPE_mpy64 < "vmpyh", 0b000, 0b101, 1, 0, 0>;
-//}
-
-// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
-def M2_vmac2 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b001, 0b001, 0, 0, 0>;
-def M2_vmac2s_s1 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b100, 0b101, 1, 1, 0>;
-def M2_vmac2s_s0 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b000, 0b101, 1, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYS +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYS -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/VB +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/VB -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/VH +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/VH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ST +
-//===----------------------------------------------------------------------===//
-// Post-increment stores.
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp, bit isHalf >
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
- mnemonic#"($src1++#$offset) = $src2"#!if(isHalf, ".h", ""),
- [], "$src1 = $_dst_" >,
- AddrModeRel {
- bits<5> src1;
- bits<5> src2;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
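- // The immediate is scaled by the access size, so only the offset bits
- // above the alignment are encoded (e.g. s4_3Imm keeps offset{6-3}).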
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = 0b0;
- let Inst{12-8} = src2;
- let Inst{7} = 0b0;
- let Inst{6-3} = offsetBits;
- let Inst{1} = 0b0;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew>
- : STInst <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2++#$offset) = $src3"#!if(isHalf, ".h", ""),
- [], "$src2 = $_dst_" >,
- AddrModeRel {
- bits<2> src1;
- bits<5> src2;
- bits<7> offset;
- bits<5> src3;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
- !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0})));
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b1010;
-
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = 0b1;
- let Inst{12-8} = src3;
- let Inst{7} = isPredNew;
- let Inst{6-3} = offsetBits;
- let Inst{2} = isPredNot;
- let Inst{1-0} = src1;
- }
-
-multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
- Operand ImmOp, bits<4> MajOp, bit isHalf = 0 > {
-
- let BaseOpcode = "POST_"#BaseOp in {
- def S2_#NAME#_pi : T_store_pi <mnemonic, RC, ImmOp, MajOp, isHalf>;
-
- // Predicated
- def S2_p#NAME#t_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 0, 0>;
- def S2_p#NAME#f_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 1, 0>;
-
- // Predicated new
- def S2_p#NAME#tnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
- isHalf, 0, 1>;
- def S2_p#NAME#fnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
- isHalf, 1, 1>;
- }
-}
-
-let accessSize = ByteAccess in
-defm storerb: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm, 0b1000>;
-
-let accessSize = HalfWordAccess in
-defm storerh: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm, 0b1010>;
-
-let accessSize = WordAccess in
-defm storeri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm, 0b1100>;
-
-let accessSize = DoubleWordAccess in
-defm storerd: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm, 0b1110>;
-
-let accessSize = HalfWordAccess, isNVStorable = 0 in
-defm storerf: ST_PostInc <"memh", "STrih_H", IntRegs, s4_1Imm, 0b1011, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment stores with register offset.
-//===----------------------------------------------------------------------===//
-class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
- MemAccessSize AccessSz, bit isHalf = 0>
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, RC:$src3),
- mnemonic#"($src1++$src2) = $src3"#!if(isHalf, ".h", ""),
- [], "$src1 = $_dst_" > {
- bits<5> src1;
- bits<1> src2;
- bits<5> src3;
- let accessSize = AccessSz;
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2;
- let Inst{12-8} = src3;
- let Inst{7} = 0b0;
- }
-
-def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>;
-def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>;
-def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>;
-def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>;
-def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>;
-
-let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
-class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp, bit isH = 0>
- : STInst <(outs),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>,
- AddrModeRel, ImmRegRel {
- bits<5> src1;
- bits<14> src2; // Actual address offset
- bits<5> src3;
- bits<11> offsetBits; // Represents offset encoding
-
- string ImmOpStr = !cast<string>(ImmOp);
-
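- // The signed 11-bit offset is scaled by the access size, so the
- // constant-extender extent grows with the alignment (11 to 14 bits).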
- let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
- !if (!eq(ImmOpStr, "s11_2Ext"), 13,
- !if (!eq(ImmOpStr, "s11_1Ext"), 12,
- /* s11_0Ext */ 11)));
- let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), src2{13-3},
- !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2},
- !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1},
- /* s11_0Ext */ src2{10-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
- let IClass = 0b1010;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24} = 0b1;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = offsetBits{8};
- let Inst{12-8} = src3;
- let Inst{7-0} = offsetBits{7-0};
- }
-
-let opExtendable = 2, isPredicated = 1 in
-class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3>MajOp, bit PredNot, bit isPredNew, bit isH = 0>
- : STInst <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2+#$src3) = $src4"#!if(isH,".h",""),
- [],"",V2LDST_tc_st_SLOT01 >,
- AddrModeRel, ImmRegRel {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3; // Actual address offset
- bits<5> src4;
- bits<6> offsetBits; // Represents offset encoding
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = PredNot;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
- !if (!eq(ImmOpStr, "u6_2Ext"), 8,
- !if (!eq(ImmOpStr, "u6_1Ext"), 7,
- /* u6_0Ext */ 6)));
- let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), src3{8-3},
- !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2},
- !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1},
- /* u6_0Ext */ src3{5-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b0;
- let Inst{26} = PredNot;
- let Inst{25} = isPredNew;
- let Inst{24} = 0b0;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = offsetBits{5};
- let Inst{12-8} = src4;
- let Inst{7-3} = offsetBits{4-0};
- let Inst{1-0} = src1;
- }
-
-let isExtendable = 1, hasSideEffects = 0 in
-multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
- def S2_#NAME#_io : T_store_io <mnemonic, RC, ImmOp, MajOp, isH>;
-
- // Predicated
- def S2_p#NAME#t_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 0, 0, isH>;
- def S2_p#NAME#f_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 1, 0, isH>;
-
- // Predicated new
- def S4_p#NAME#tnew_io : T_pstore_io <mnemonic, RC, predImmOp,
- MajOp, 0, 1, isH>;
- def S4_p#NAME#fnew_io : T_pstore_io <mnemonic, RC, predImmOp,
- MajOp, 1, 1, isH>;
- }
-}
-
-let addrMode = BaseImmOffset, InputType = "imm" in {
- let accessSize = ByteAccess in
- defm storerb: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext, u6_0Ext, 0b000>;
-
- let accessSize = HalfWordAccess, opExtentAlign = 1 in
- defm storerh: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext, u6_1Ext, 0b010>;
-
- let accessSize = WordAccess, opExtentAlign = 2 in
- defm storeri: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext, u6_2Ext, 0b100>;
-
- let accessSize = DoubleWordAccess, isNVStorable = 0, opExtentAlign = 3 in
- defm storerd: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
- u6_3Ext, 0b110>;
-
- let accessSize = HalfWordAccess, opExtentAlign = 1 in
- defm storerf: ST_Idxd < "memh", "STrif", IntRegs, s11_1Ext,
- u6_1Ext, 0b011, 1>;
-}
-
-// Store predicate.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_pred : STInst<(outs),
- (ins IntRegs:$addr, s11_2Ext:$off, PredRegs:$src1),
- ".error \"should not emit\"", []>;
-// Store modifier.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_mod : STInst<(outs),
- (ins IntRegs:$addr, s11_2Ext:$off, ModRegs:$src1),
- ".error \"should not emit\"", []>;
-
-// S2_allocframe: Allocate stack frame.
-let Defs = [R29, R30], Uses = [R29, R31, R30],
- hasSideEffects = 0, accessSize = DoubleWordAccess in
-def S2_allocframe: ST0Inst <
- (outs), (ins u11_3Imm:$u11_3),
- "allocframe(#$u11_3)" > {
- bits<14> u11_3;
-
- let IClass = 0b1010;
- let Inst{27-16} = 0b000010011101;
- let Inst{13-11} = 0b000;
- let Inst{10-0} = u11_3{13-3};
- }
-
-// S2_storer[bhfid]_pci: circular stores with auto-increment immediate.
-// New-value forms (S2_storer[bhi]new_pci) are defined below.
-let Uses = [CS], addrMode = PostInc in
-class T_store_pci <string mnemonic, RegisterClass RC,
- Operand Imm, bits<4>MajOp,
- MemAccessSize AlignSize, string RegSrc = "Rt">
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, RC:$Rt),
- #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $"#RegSrc#"",
- [] ,
- "$Rz = $_dst_" > {
- bits<5> Rz;
- bits<7> offset;
- bits<1> Mu;
- bits<5> Rt;
- let accessSize = AlignSize;
- let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
- !if(!eq(RegSrc,"Rt.h"), 0, 1));
-
- let IClass = 0b1010;
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b0;
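- // As with the other post-increment forms, the offset encoding drops the
- // alignment bits implied by the access size.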
- let Inst{6-3} =
- !if (!eq(!cast<string>(AlignSize), "DoubleWordAccess"), offset{6-3},
- !if (!eq(!cast<string>(AlignSize), "WordAccess"), offset{5-2},
- !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
- /* ByteAccess */ offset{3-0})));
- let Inst{1} = 0b0;
- }
-
-def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000,
- ByteAccess>;
-def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010,
- HalfWordAccess>;
-def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011,
- HalfWordAccess, "Rt.h">;
-def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
- WordAccess>;
-def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
- DoubleWordAccess>;
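-// Example assembly syntax: memw(r0 ++ #8:circ(m0)) = r1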
-
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4,
- addrMode = PostInc in
-class T_storenew_pci <string mnemonic, Operand Imm,
- bits<2>MajOp, MemAccessSize AlignSize>
- : NVInst < (outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, IntRegs:$Nt),
- #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $Nt.new",
- [],
- "$Rz = $_dst_"> {
- bits<5> Rz;
- bits<6> offset;
- bits<1> Mu;
- bits<3> Nt;
-
- let accessSize = AlignSize;
-
- let IClass = 0b1010;
- let Inst{27-21} = 0b1001101;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = Nt;
- let Inst{7} = 0b0;
- let Inst{6-3} =
- !if (!eq(!cast<string>(AlignSize), "WordAccess"), offset{5-2},
- !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
- /* ByteAccess */ offset{3-0}));
- let Inst{1} = 0b0;
- }
-
-def S2_storerbnew_pci : T_storenew_pci <"memb", s4_0Imm, 0b00, ByteAccess>;
-def S2_storerhnew_pci : T_storenew_pci <"memh", s4_1Imm, 0b01, HalfWordAccess>;
-def S2_storerinew_pci : T_storenew_pci <"memw", s4_2Imm, 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Circular stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let Uses = [CS], addrMode = PostInc in
-class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
- MemAccessSize AlignSize, string RegSrc = "Rt">
- : STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, RC:$Rt),
- #mnemonic#"($Rz ++ I:circ($Mu)) = $"#RegSrc#"",
- [],
- "$Rz = $_dst_" > {
- bits<5> Rz;
- bits<1> Mu;
- bits<5> Rt;
-
- let accessSize = AlignSize;
- let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
- !if(!eq(RegSrc,"Rt.h"), 0, 1));
-
- let IClass = 0b1010;
- let Inst{27-25} = 0b100;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b0;
- let Inst{1} = 0b1;
- }
-
-def S2_storerb_pcr : T_store_pcr<"memb", IntRegs, 0b1000, ByteAccess>;
-def S2_storerh_pcr : T_store_pcr<"memh", IntRegs, 0b1010, HalfWordAccess>;
-def S2_storeri_pcr : T_store_pcr<"memw", IntRegs, 0b1100, WordAccess>;
-def S2_storerd_pcr : T_store_pcr<"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
-def S2_storerf_pcr : T_store_pcr<"memh", IntRegs, 0b1011,
- HalfWordAccess, "Rt.h">;
-
-//===----------------------------------------------------------------------===//
-// Circular .new stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
- addrMode = PostInc in
-class T_storenew_pcr <string mnemonic, bits<2>MajOp,
- MemAccessSize AlignSize>
- : NVInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
- #mnemonic#"($Rz ++ I:circ($Mu)) = $Nt.new" ,
- [] ,
- "$Rz = $_dst_"> {
- bits<5> Rz;
- bits<1> Mu;
- bits<3> Nt;
-
- let accessSize = AlignSize;
-
- let IClass = 0b1010;
- let Inst{27-21} = 0b1001101;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = Nt;
- let Inst{7} = 0b0;
- let Inst{1} = 0b1;
- }
-
-def S2_storerbnew_pcr : T_storenew_pcr <"memb", 0b00, ByteAccess>;
-def S2_storerhnew_pcr : T_storenew_pcr <"memh", 0b01, HalfWordAccess>;
-def S2_storerinew_pcr : T_storenew_pcr <"memw", 0b10, WordAccess>;
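-// Example assembly syntax: memb(r0 ++ I:circ(m0)) = r1.new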
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_store_pbr<string mnemonic, RegisterClass RC,
- MemAccessSize addrSize, bits<3> majOp,
- bit isHalf = 0>
- : STInst
- <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, RC:$src),
- #mnemonic#"($Rz ++ $Mu:brev) = $src"#!if (!eq(isHalf, 1), ".h", ""),
- [], "$Rz = $_dst_" > {
-
- let accessSize = addrSize;
-
- bits<5> Rz;
- bits<1> Mu;
- bits<5> src;
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = majOp;
- let Inst{7} = 0b0;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{12-8} = src;
- }
-
-let isNVStorable = 1 in {
- let BaseOpcode = "S2_storerb_pbr" in
- def S2_storerb_pbr : T_store_pbr<"memb", IntRegs, ByteAccess,
- 0b000>, NewValueRel;
- let BaseOpcode = "S2_storerh_pbr" in
- def S2_storerh_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess,
- 0b010>, NewValueRel;
- let BaseOpcode = "S2_storeri_pbr" in
- def S2_storeri_pbr : T_store_pbr<"memw", IntRegs, WordAccess,
- 0b100>, NewValueRel;
-}
-
-def S2_storerf_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess, 0b011, 1>;
-def S2_storerd_pbr : T_store_pbr<"memd", DoubleRegs, DoubleWordAccess, 0b110>;
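-// The .h (upper-half) and doubleword forms have no new-value counterparts.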
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed .new stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
- hasSideEffects = 0, addrMode = PostInc in
-class T_storenew_pbr<string mnemonic, MemAccessSize addrSize, bits<2> majOp>
- : NVInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
- #mnemonic#"($Rz ++ $Mu:brev) = $Nt.new", [],
- "$Rz = $_dst_">, NewValueRel {
- let accessSize = addrSize;
- bits<5> Rz;
- bits<1> Mu;
- bits<3> Nt;
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1111101;
- let Inst{12-11} = majOp;
- let Inst{7} = 0b0;
- let Inst{20-16} = Rz;
- let Inst{13} = Mu;
- let Inst{10-8} = Nt;
- }
-
-let BaseOpcode = "S2_storerb_pbr" in
-def S2_storerbnew_pbr : T_storenew_pbr<"memb", ByteAccess, 0b00>;
-
-let BaseOpcode = "S2_storerh_pbr" in
-def S2_storerhnew_pbr : T_storenew_pbr<"memh", HalfWordAccess, 0b01>;
-
-let BaseOpcode = "S2_storeri_pbr" in
-def S2_storerinew_pbr : T_storenew_pbr<"memw", WordAccess, 0b10>;
-
-//===----------------------------------------------------------------------===//
-// ST -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template class for S_2op instructions.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S2op_1 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
- RegisterClass RCIn, bits<2> MajOp, bits<3> MinOp, bit isSat>
- : SInst <(outs RCOut:$dst), (ins RCIn:$src),
- "$dst = "#mnemonic#"($src)"#!if(isSat, ":sat", ""),
- [], "", S_2op_tc_1_SLOT23 > {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23-22} = MajOp;
- let Inst{21} = 0b0;
- let Inst{20-16} = src;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_S2op_1_di <string mnemonic, bits<2> MajOp, bits<3> MinOp>
- : T_S2op_1 <mnemonic, 0b0100, DoubleRegs, IntRegs, MajOp, MinOp, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_1_id <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
- : T_S2op_1 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, isSat>;
-
-let hasNewValue = 1 in
-class T_S2op_1_ii <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
- : T_S2op_1 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp, isSat>;
-
-// Vector sign/zero extend
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
- def S2_vsxtbh : T_S2op_1_di <"vsxtbh", 0b00, 0b000>;
- def S2_vsxthw : T_S2op_1_di <"vsxthw", 0b00, 0b100>;
- def S2_vzxtbh : T_S2op_1_di <"vzxtbh", 0b00, 0b010>;
- def S2_vzxthw : T_S2op_1_di <"vzxthw", 0b00, 0b110>;
-}
-
-// Vector splat bytes/halfwords
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
- def S2_vsplatrb : T_S2op_1_ii <"vsplatb", 0b01, 0b111>;
- def S2_vsplatrh : T_S2op_1_di <"vsplath", 0b01, 0b010>;
-}
-
-// Sign extend word to doubleword
-def A2_sxtw : T_S2op_1_di <"sxtw", 0b01, 0b000>;
-
-// Vector saturate and pack
-let Defs = [USR_OVF] in {
- def S2_svsathb : T_S2op_1_ii <"vsathb", 0b10, 0b000>;
- def S2_svsathub : T_S2op_1_ii <"vsathub", 0b10, 0b010>;
- def S2_vsathb : T_S2op_1_id <"vsathb", 0b00, 0b110>;
- def S2_vsathub : T_S2op_1_id <"vsathub", 0b00, 0b000>;
- def S2_vsatwh : T_S2op_1_id <"vsatwh", 0b00, 0b010>;
- def S2_vsatwuh : T_S2op_1_id <"vsatwuh", 0b00, 0b100>;
-}
-
-// Vector truncate
-def S2_vtrunohb : T_S2op_1_id <"vtrunohb", 0b10, 0b000>;
-def S2_vtrunehb : T_S2op_1_id <"vtrunehb", 0b10, 0b010>;
-
-// Swizzle the bytes of a word
-def A2_swiz : T_S2op_1_ii <"swiz", 0b10, 0b111>;
-
-// Saturate
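-// On saturation these set the sticky overflow bit in USR (USR_OVF).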
-let Defs = [USR_OVF] in {
- def A2_sat : T_S2op_1_id <"sat", 0b11, 0b000>;
- def A2_satb : T_S2op_1_ii <"satb", 0b11, 0b111>;
- def A2_satub : T_S2op_1_ii <"satub", 0b11, 0b110>;
- def A2_sath : T_S2op_1_ii <"sath", 0b11, 0b100>;
- def A2_satuh : T_S2op_1_ii <"satuh", 0b11, 0b101>;
- def A2_roundsat : T_S2op_1_id <"round", 0b11, 0b001, 0b1>;
-}
-
-let Itinerary = S_2op_tc_2_SLOT23 in {
- // Vector round and pack
- def S2_vrndpackwh : T_S2op_1_id <"vrndwh", 0b10, 0b100>;
-
- let Defs = [USR_OVF] in
- def S2_vrndpackwhs : T_S2op_1_id <"vrndwh", 0b10, 0b110, 1>;
-
- // Bit reverse
- def S2_brev : T_S2op_1_ii <"brev", 0b01, 0b110>;
-
- // Absolute value word
- def A2_abs : T_S2op_1_ii <"abs", 0b10, 0b100>;
-
- let Defs = [USR_OVF] in
- def A2_abssat : T_S2op_1_ii <"abs", 0b10, 0b101, 1>;
-
- // Negate with saturation
- let Defs = [USR_OVF] in
- def A2_negsat : T_S2op_1_ii <"neg", 0b10, 0b110, 1>;
-}
-
-class T_S2op_2 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
- RegisterClass RCIn, bits<3> MajOp, bits<3> MinOp,
- bit isSat, bit isRnd, list<dag> pattern = []>
- : SInst <(outs RCOut:$dst),
- (ins RCIn:$src, u5_0Imm:$u5),
- "$dst = "#mnemonic#"($src, #$u5)"#!if(isSat, ":sat", "")
- #!if(isRnd, ":rnd", ""),
- pattern, "", S_2op_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src;
- bits<5> u5;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src;
- let Inst{13} = 0b0;
- let Inst{12-8} = u5;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_S2op_2_di <string mnemonic, bits<3> MajOp, bits<3> MinOp>
- : T_S2op_2 <mnemonic, 0b1000, DoubleRegs, IntRegs, MajOp, MinOp, 0, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_2_id <string mnemonic, bits<3> MajOp, bits<3> MinOp>
- : T_S2op_2 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, 0, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_2_ii <string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit isSat = 0, bit isRnd = 0, list<dag> pattern = []>
- : T_S2op_2 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp,
- isSat, isRnd, pattern>;
-
-class T_S2op_shift <string mnemonic, bits<3> MajOp, bits<3> MinOp, SDNode OpNd>
- : T_S2op_2_ii <mnemonic, MajOp, MinOp, 0, 0, []>;
-
-// Vector arithmetic shift right by immediate with truncate and pack
-def S2_asr_i_svw_trun : T_S2op_2_id <"vasrw", 0b110, 0b010>;
-
-// Arithmetic/logical shift right/left by immediate
-let Itinerary = S_2op_tc_1_SLOT23 in {
- def S2_asr_i_r : T_S2op_shift <"asr", 0b000, 0b000, sra>;
- def S2_lsr_i_r : T_S2op_shift <"lsr", 0b000, 0b001, srl>;
- def S2_asl_i_r : T_S2op_shift <"asl", 0b000, 0b010, shl>;
-}
-
-// Shift left by immediate with saturation
-let Defs = [USR_OVF] in
-def S2_asl_i_r_sat : T_S2op_2_ii <"asl", 0b010, 0b010, 1>;
-
-// Shift right by immediate with rounding
-def S2_asr_i_r_rnd : T_S2op_2_ii <"asr", 0b010, 0b000, 0, 1>;
-
-let isAsmParserOnly = 1 in
-def S2_asr_i_r_rnd_goodsyntax
- : SInst <(outs IntRegs:$dst), (ins IntRegs:$src, u5_0Imm:$u5),
- "$dst = asrrnd($src, #$u5)",
- [], "", S_2op_tc_1_SLOT23>;
-
-let isAsmParserOnly = 1 in
-def A2_not: ALU32_rr<(outs IntRegs:$dst),(ins IntRegs:$src),
- "$dst = not($src)">;
-
-class T_S2op_3<string opc, bits<2>MajOp, bits<3>minOp, bits<1> sat = 0>
- : SInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
- "$Rdd = "#opc#"($Rss)"#!if(!eq(sat, 1),":sat","")> {
- bits<5> Rss;
- bits<5> Rdd;
- let IClass = 0b1000;
- let Inst{27-24} = 0;
- let Inst{23-22} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{7-5} = minOp;
- let Inst{4-0} = Rdd;
-}
-
-def A2_absp : T_S2op_3 <"abs", 0b10, 0b110>;
-def A2_negp : T_S2op_3 <"neg", 0b10, 0b101>;
-def A2_notp : T_S2op_3 <"not", 0b10, 0b100>;
-
-// Interleave/deinterleave
-def S2_interleave : T_S2op_3 <"interleave", 0b11, 0b101>;
-def S2_deinterleave : T_S2op_3 <"deinterleave", 0b11, 0b100>;
-
-// Vector complex conjugate
-def A2_vconj : T_S2op_3 <"vconj", 0b10, 0b111, 1>;
-
-// Vector saturate without pack
-def S2_vsathb_nopack : T_S2op_3 <"vsathb", 0b00, 0b111>;
-def S2_vsathub_nopack : T_S2op_3 <"vsathub", 0b00, 0b100>;
-def S2_vsatwh_nopack : T_S2op_3 <"vsatwh", 0b00, 0b110>;
-def S2_vsatwuh_nopack : T_S2op_3 <"vsatwuh", 0b00, 0b101>;
-
-// Vector absolute value halfwords with and without saturation
-// Rdd64=vabsh(Rss64)[:sat]
-def A2_vabsh : T_S2op_3 <"vabsh", 0b01, 0b100>;
-def A2_vabshsat : T_S2op_3 <"vabsh", 0b01, 0b101, 1>;
-
-// Vector absolute value words with and without saturation
-def A2_vabsw : T_S2op_3 <"vabsw", 0b01, 0b110>;
-def A2_vabswsat : T_S2op_3 <"vabsw", 0b01, 0b111, 1>;
-
-//===----------------------------------------------------------------------===//
-// STYPE/BIT +
-//===----------------------------------------------------------------------===//
-// Bit count
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_COUNT_LEADING<string MnOp, bits<3> MajOp, bits<3> MinOp, bit Is32,
- dag Out, dag Inp>
- : SInst<Out, Inp, "$Rd = "#MnOp#"($Rs)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rs;
- bits<5> Rd;
- let IClass = 0b1000;
- let Inst{27} = 0b1;
- let Inst{26} = Is32;
- let Inst{25-24} = 0b00;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-class T_COUNT_LEADING_32<string MnOp, bits<3> MajOp, bits<3> MinOp>
- : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b1,
- (outs IntRegs:$Rd), (ins IntRegs:$Rs)>;
-
-class T_COUNT_LEADING_64<string MnOp, bits<3> MajOp, bits<3> MinOp>
- : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b0,
- (outs IntRegs:$Rd), (ins DoubleRegs:$Rs)>;
-
-def S2_cl0 : T_COUNT_LEADING_32<"cl0", 0b000, 0b101>;
-def S2_cl1 : T_COUNT_LEADING_32<"cl1", 0b000, 0b110>;
-def S2_ct0 : T_COUNT_LEADING_32<"ct0", 0b010, 0b100>;
-def S2_ct1 : T_COUNT_LEADING_32<"ct1", 0b010, 0b101>;
-def S2_cl0p : T_COUNT_LEADING_64<"cl0", 0b010, 0b010>;
-def S2_cl1p : T_COUNT_LEADING_64<"cl1", 0b010, 0b100>;
-def S2_clb : T_COUNT_LEADING_32<"clb", 0b000, 0b100>;
-def S2_clbp : T_COUNT_LEADING_64<"clb", 0b010, 0b000>;
-def S2_clbnorm : T_COUNT_LEADING_32<"normamt", 0b000, 0b111>;
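-// cl0/cl1 count leading zeros/ones and ct0/ct1 count trailing zeros/ones;
-// clb counts leading bits, i.e. the maximum of cl0 and cl1.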
-
-// The 64-bit counts leading/trailing are defined in HexagonInstrInfoV4.td.
-
-// Bit set/clear/toggle
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_SCT_BIT_IMM<string MnOp, bits<3> MinOp>
- : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, u5_0Imm:$u5),
- "$Rd = "#MnOp#"($Rs, #$u5)", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> u5;
- let IClass = 0b1000;
- let Inst{27-21} = 0b1100110;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = u5;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_SCT_BIT_REG<string MnOp, bits<2> MinOp>
- : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#MnOp#"($Rs, $Rt)", [], "", S_3op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
- let IClass = 0b1100;
- let Inst{27-22} = 0b011010;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-6} = MinOp;
- let Inst{4-0} = Rd;
-}
-
-def S2_clrbit_i : T_SCT_BIT_IMM<"clrbit", 0b001>;
-def S2_setbit_i : T_SCT_BIT_IMM<"setbit", 0b000>;
-def S2_togglebit_i : T_SCT_BIT_IMM<"togglebit", 0b010>;
-def S2_clrbit_r : T_SCT_BIT_REG<"clrbit", 0b01>;
-def S2_setbit_r : T_SCT_BIT_REG<"setbit", 0b00>;
-def S2_togglebit_r : T_SCT_BIT_REG<"togglebit", 0b10>;
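-// Example assembly syntax: r0 = setbit(r1, #5) or r0 = clrbit(r1, r2)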
-
-// Bit test
-
-let hasSideEffects = 0 in
-class T_TEST_BIT_IMM<string MnOp, bits<3> MajOp>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u5_0Imm:$u5),
- "$Pd = "#MnOp#"($Rs, #$u5)",
- [], "", S_2op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> u5;
- let IClass = 0b1000;
- let Inst{27-24} = 0b0101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = 0;
- let Inst{12-8} = u5;
- let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0 in
-class T_TEST_BIT_REG<string MnOp, bit IsNeg>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#MnOp#"($Rs, $Rt)",
- [], "", S_3op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
- let IClass = 0b1100;
- let Inst{27-22} = 0b011100;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{1-0} = Pd;
-}
-
-def S2_tstbit_i : T_TEST_BIT_IMM<"tstbit", 0b000>;
-def S2_tstbit_r : T_TEST_BIT_REG<"tstbit", 0>;
-
-let hasSideEffects = 0 in
-class T_TEST_BITS_IMM<string MnOp, bits<2> MajOp, bit IsNeg>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u6_0Imm:$u6),
- "$Pd = "#MnOp#"($Rs, #$u6)",
- [], "", S_2op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<6> u6;
- let IClass = 0b1000;
- let Inst{27-24} = 0b0101;
- let Inst{23-22} = MajOp;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{13-8} = u6;
- let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0 in
-class T_TEST_BITS_REG<string MnOp, bits<2> MajOp, bit IsNeg>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#MnOp#"($Rs, $Rt)",
- [], "", S_3op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
- let IClass = 0b1100;
- let Inst{27-24} = 0b0111;
- let Inst{23-22} = MajOp;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{1-0} = Pd;
-}
-
-def C2_bitsclri : T_TEST_BITS_IMM<"bitsclr", 0b10, 0>;
-def C2_bitsclr : T_TEST_BITS_REG<"bitsclr", 0b10, 0>;
-def C2_bitsset : T_TEST_BITS_REG<"bitsset", 0b01, 0>;
-
-//===----------------------------------------------------------------------===//
-// STYPE/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PERM +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/PRED +
-//===----------------------------------------------------------------------===//
-
-// Predicate transfer.
-let hasSideEffects = 0, hasNewValue = 1 in
-def C2_tfrpr : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps),
- "$Rd = $Ps", [], "", S_2op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<2> Ps;
-
- let IClass = 0b1000;
- let Inst{27-24} = 0b1001;
- let Inst{22} = 0b1;
- let Inst{17-16} = Ps;
- let Inst{4-0} = Rd;
-}
-
-// Transfer general register to predicate.
-let hasSideEffects = 0 in
-def C2_tfrrp: SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs),
- "$Pd = $Rs", [], "", S_2op_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
-
- let IClass = 0b1000;
- let Inst{27-21} = 0b0101010;
- let Inst{20-16} = Rs;
- let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0, isCodeGenOnly = 1 in
-def C2_pxfer_map: SInst<(outs PredRegs:$dst), (ins PredRegs:$src),
- "$dst = $src">;
-
-//===----------------------------------------------------------------------===//
-// STYPE/PRED -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/SHIFT +
-//===----------------------------------------------------------------------===//
-class S_2OpInstImm<string Mnemonic, bits<3>MajOp, bits<3>MinOp,
- Operand Imm, list<dag> pattern = [], bit isRnd = 0>
- : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, Imm:$src2),
- "$dst = "#Mnemonic#"($src1, #$src2)"#!if(isRnd, ":rnd", ""),
- pattern> {
- bits<5> src1;
- bits<5> dst;
- let IClass = 0b1000;
- let Inst{27-24} = 0;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
-}
-
-class S_2OpInstImmI6<string Mnemonic, SDNode OpNode, bits<3>MinOp>
- : S_2OpInstImm<Mnemonic, 0b000, MinOp, u6_0Imm, []> {
- bits<6> src2;
- let Inst{13-8} = src2;
-}
-
-// Shift by immediate.
-def S2_asr_i_p : S_2OpInstImmI6<"asr", sra, 0b000>;
-def S2_asl_i_p : S_2OpInstImmI6<"asl", shl, 0b010>;
-def S2_lsr_i_p : S_2OpInstImmI6<"lsr", srl, 0b001>;
-
-// Shift left by small amount and add.
-let AddedComplexity = 100, hasNewValue = 1, hasSideEffects = 0 in
-def S2_addasl_rrri: SInst <(outs IntRegs:$Rd),
- (ins IntRegs:$Rt, IntRegs:$Rs, u3_0Imm:$u3),
- "$Rd = addasl($Rt, $Rs, #$u3)" , [],
- "", S_3op_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rt;
- bits<5> Rs;
- bits<3> u3;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b0100000;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7-5} = u3;
- let Inst{4-0} = Rd;
- }
-
-//===----------------------------------------------------------------------===//
-// STYPE/SHIFT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/VH +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/VH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/VW +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/VW -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/SUPER +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/USER +
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 1, isSoloAX = 1 in
-def Y2_barrier : SYSInst<(outs), (ins), "barrier", [],"",ST_tc_st_SLOT0> {
- let Inst{31-28} = 0b1010;
- let Inst{27-21} = 0b1000000;
-}
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/SUPER -
-//===----------------------------------------------------------------------===//
-
-// Generate frame-index addresses. The main reason for the offset operand is
-// that every instruction that is allowed to have a frame index as an operand
-// will then have that operand followed by an immediate (offset) operand.
-// This simplifies the frame-index elimination code.
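-// (Illustrative: after frame-index elimination, PS_fi typically becomes an
-// add of the frame/stack pointer and the folded constant offset.)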
-//
-let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
- isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
- def PS_fi : ALU32_ri<(outs IntRegs:$Rd),
- (ins IntRegs:$fi, s32_0Imm:$off), "">;
- def PS_fia : ALU32_ri<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
-}
-
-//===----------------------------------------------------------------------===//
-// CRUSER - Type.
-//===----------------------------------------------------------------------===//
-// HW loop
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, hasSideEffects = 0 in
-class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0>
- : CRInst<(outs), (ins brOp:$offset, u10_0Imm:$src2),
- #mnemonic#"($offset, #$src2)",
- [], "" , CR_tc_3x_SLOT3> {
- bits<9> offset;
- bits<10> src2;
-
- let IClass = 0b0110;
-
- let Inst{27-22} = 0b100100;
- let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
- let Inst{20-16} = src2{9-5};
- let Inst{12-8} = offset{8-4};
- let Inst{7-5} = src2{4-2};
- let Inst{4-3} = offset{3-2};
- let Inst{1-0} = src2{1-0};
-}
-
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, hasSideEffects = 0 in
-class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0>
- : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
- #mnemonic#"($offset, $src2)",
- [], "" ,CR_tc_3x_SLOT3> {
- bits<9> offset;
- bits<5> src2;
-
- let IClass = 0b0110;
-
- let Inst{27-22} = 0b000000;
- let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
- let Inst{20-16} = src2;
- let Inst{12-8} = offset{8-4};
- let Inst{4-3} = offset{3-2};
- }
-
-multiclass LOOP_ri<string mnemonic> {
- def i : LOOP_iBase<mnemonic, brtarget>;
- def r : LOOP_rBase<mnemonic, brtarget>;
-
- let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
- def iext: LOOP_iBase<mnemonic, brtargetExt, 1>;
- def rext: LOOP_rBase<mnemonic, brtargetExt, 1>;
- }
-}
-
-
-let Defs = [SA0, LC0, USR] in
-defm J2_loop0 : LOOP_ri<"loop0">;
-
-// Interestingly, only loop0 appears to set usr.lpcfg.
-let Defs = [SA1, LC1] in
-defm J2_loop1 : LOOP_ri<"loop1">;
-
-let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
- Defs = [PC, LC0], Uses = [SA0, LC0] in {
-def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
- ":endloop0",
- []>;
-}
-
-let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
- Defs = [PC, LC1], Uses = [SA1, LC1] in {
-def ENDLOOP1 : Endloop<(outs), (ins brtarget:$offset),
- ":endloop1",
- []>;
-}
-
-// Pipelined loop instructions, sp[123]loop0
-let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
- isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, isPredicateLate = 1 in
-class SPLOOP_iBase<string SP, bits<2> op>
- : CRInst <(outs), (ins brtarget:$r7_2, u10_0Imm:$U10),
- "p3 = sp"#SP#"loop0($r7_2, #$U10)" > {
- bits<9> r7_2;
- bits<10> U10;
-
- let IClass = 0b0110;
-
- let Inst{22-21} = op;
- let Inst{27-23} = 0b10011;
- let Inst{20-16} = U10{9-5};
- let Inst{12-8} = r7_2{8-4};
- let Inst{7-5} = U10{4-2};
- let Inst{4-3} = r7_2{3-2};
- let Inst{1-0} = U10{1-0};
- }
-
-let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
- isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
- opExtendable = 0, isPredicateLate = 1 in
-class SPLOOP_rBase<string SP, bits<2> op>
- : CRInst <(outs), (ins brtarget:$r7_2, IntRegs:$Rs),
- "p3 = sp"#SP#"loop0($r7_2, $Rs)" > {
- bits<9> r7_2;
- bits<5> Rs;
-
- let IClass = 0b0110;
-
- let Inst{22-21} = op;
- let Inst{27-23} = 0b00001;
- let Inst{20-16} = Rs;
- let Inst{12-8} = r7_2{8-4};
- let Inst{4-3} = r7_2{3-2};
- }
-
-multiclass SPLOOP_ri<string mnemonic, bits<2> op> {
- def i : SPLOOP_iBase<mnemonic, op>;
- def r : SPLOOP_rBase<mnemonic, op>;
-}
-
-defm J2_ploop1s : SPLOOP_ri<"1", 0b01>;
-defm J2_ploop2s : SPLOOP_ri<"2", 0b10>;
-defm J2_ploop3s : SPLOOP_ri<"3", 0b11>;
-
-// if (Rs[!=,>=,==,<=]#0) jump[:t/:nt]
-let Defs = [PC], isPredicated = 1, isBranch = 1, hasSideEffects = 0 in
-class J2_jump_0_Base<string compare, bit isTak, bits<2> op>
- : CRInst <(outs), (ins IntRegs:$Rs, brtarget:$r13_2),
- "if ($Rs"#compare#"#0) jump"#!if(isTak, ":t", ":nt")#" $r13_2" > {
- bits<5> Rs;
- bits<15> r13_2;
-
- let IClass = 0b0110;
-
- let Inst{27-24} = 0b0001;
- let Inst{23-22} = op;
- let Inst{12} = isTak;
- let Inst{21} = r13_2{14};
- let Inst{20-16} = Rs;
- let Inst{11-1} = r13_2{12-2};
- let Inst{13} = r13_2{13};
- }
-
-multiclass J2_jump_compare_0<string compare, bits<2> op> {
- def NAME : J2_jump_0_Base<compare, 0, op>;
- def NAME#pt : J2_jump_0_Base<compare, 1, op>;
-}
-
-defm J2_jumprz : J2_jump_compare_0<"!=", 0b00>;
-defm J2_jumprgtez : J2_jump_compare_0<">=", 0b01>;
-defm J2_jumprnz : J2_jump_compare_0<"==", 0b10>;
-defm J2_jumprltez : J2_jump_compare_0<"<=", 0b11>;
-
-// Transfer between GPRs and control/guest registers.
-let hasSideEffects = 0 in
-class TFR_CR_RS_base<RegisterClass CTRC, RegisterClass RC, bit isDouble>
- : CRInst <(outs CTRC:$dst), (ins RC:$src),
- "$dst = $src", [], "", CR_tc_3x_SLOT3> {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b0110;
-
- let Inst{27-25} = 0b001;
- let Inst{24} = isDouble;
- let Inst{23-21} = 0b001;
- let Inst{20-16} = src;
- let Inst{4-0} = dst;
- }
-
-def A2_tfrrcr : TFR_CR_RS_base<CtrRegs, IntRegs, 0b0>;
-def A4_tfrpcp : TFR_CR_RS_base<CtrRegs64, DoubleRegs, 0b1>;
-def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
-def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
-
-let hasSideEffects = 0 in
-class TFR_RD_CR_base<RegisterClass RC, RegisterClass CTRC, bit isSingle>
- : CRInst <(outs RC:$dst), (ins CTRC:$src),
- "$dst = $src", [], "", CR_tc_3x_SLOT3> {
- bits<5> dst;
- bits<5> src;
-
- let IClass = 0b0110;
-
- let Inst{27-26} = 0b10;
- let Inst{25} = isSingle;
- let Inst{24-21} = 0b0000;
- let Inst{20-16} = src;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1, opNewValue = 0 in
-def A2_tfrcrr : TFR_RD_CR_base<IntRegs, CtrRegs, 1>;
-def A4_tfrcpp : TFR_RD_CR_base<DoubleRegs, CtrRegs64, 0>;
-def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
-def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
-
-// Y4_trace: Send value to ETM trace.
-let isSoloAX = 1, hasSideEffects = 0 in
-def Y4_trace: CRInst <(outs), (ins IntRegs:$Rs),
- "trace($Rs)"> {
- bits<5> Rs;
-
- let IClass = 0b0110;
- let Inst{27-21} = 0b0010010;
- let Inst{20-16} = Rs;
- }
-
-// HI/LO Instructions
-let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
- hasNewValue = 1, opNewValue = 0 in
-class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
- : ALU32_ri<(outs IntRegs:$dst),
- (ins u16_0Imm:$imm_value),
- "$dst"#RegHalf#" = $imm_value", []> {
- bits<5> dst;
- bits<32> imm_value;
- let IClass = 0b0111;
-
- let Inst{27} = Rs;
- let Inst{26-24} = MajOp;
- let Inst{21} = MinOp;
- let Inst{20-16} = dst;
- let Inst{23-22} = imm_value{15-14};
- let Inst{13-0} = imm_value{13-0};
-}
-
-let isAsmParserOnly = 1 in {
- def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
- def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>;
-}
-
-let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in {
- def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
- "$Rd = CONST32(#$v)", []>;
- def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
- "$Rd = CONST64(#$v)", []>;
-}
-
-let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
- isCodeGenOnly = 1 in
-def PS_true : SInst<(outs PredRegs:$dst), (ins), "", []>;
-
-let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
- isCodeGenOnly = 1 in
-def PS_false : SInst<(outs PredRegs:$dst), (ins), "", []>;
-
-let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- ".error \"should not emit\" ", []>;
-
-let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
-def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- ".error \"should not emit\" ", []>;
-
-// Call subroutine indirectly.
-let Defs = VolatileV3.Regs in
-def J2_callr : JUMPR_MISC_CALLR<0, 1>;
-
-// Indirect tail-call.
-let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
- isTerminator = 1, isCodeGenOnly = 1 in
-def PS_tailcall_r : T_JMPr;
-
-// Direct tail-calls.
-let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
- isTerminator = 1, isCodeGenOnly = 1 in
-def PS_tailcall_i : JInst<(outs), (ins calltarget:$dst), "", []>;
-
-// The reason for the custom inserter is to record all ALLOCA instructions
-// in MachineFunctionInfo.
-let Defs = [R29], isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 1 in
-def PS_alloca: ALU32Inst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u32_0Imm:$A), "", []>;
-
-let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in
-def PS_aligna : ALU32Inst<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
-
-// XTYPE/SHIFT
-//
-//===----------------------------------------------------------------------===//
-// Template Class
-// Shift by immediate/register and accumulate/logical
-//===----------------------------------------------------------------------===//
-
-// Rx[+-&|]=asr(Rs,#u5)
-// Rx[+-&|^]=lsr(Rs,#u5)
-// Rx[+-&|^]=asl(Rs,#u5)
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_shift_imm_acc_r <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp>
- : SInst_acc<(outs IntRegs:$Rx),
- (ins IntRegs:$src1, IntRegs:$Rs, u5_0Imm:$u5),
- "$Rx "#opc2#opc1#"($Rs, #$u5)", [],
- "$src1 = $Rx", S_2op_tc_2_SLOT23> {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> u5;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b1110;
- let Inst{23-22} = majOp{2-1};
- let Inst{13} = 0b0;
- let Inst{7} = majOp{0};
- let Inst{6-5} = minOp;
- let Inst{4-0} = Rx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = u5;
- }
-
-// Rx[+-&|]=asr(Rs,Rt)
-// Rx[+-&|^]=lsr(Rs,Rt)
-// Rx[+-&|^]=asl(Rs,Rt)
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_shift_reg_acc_r <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<2> majOp, bits<2> minOp>
- : SInst_acc<(outs IntRegs:$Rx),
- (ins IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt),
- "$Rx "#opc2#opc1#"($Rs, $Rt)", [],
- "$src1 = $Rx", S_3op_tc_2_SLOT23 > {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b1100;
- let Inst{23-22} = majOp;
- let Inst{7-6} = minOp;
- let Inst{4-0} = Rx;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- }
-
-// Rxx[+-&|]=asr(Rss,#u6)
-// Rxx[+-&|^]=lsr(Rss,#u6)
-// Rxx[+-&|^]=asl(Rss,#u6)
-
-class T_shift_imm_acc_p <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp>
- : SInst_acc<(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$src1, DoubleRegs:$Rss, u6_0Imm:$u6),
- "$Rxx "#opc2#opc1#"($Rss, #$u6)", [],
- "$src1 = $Rxx", S_2op_tc_2_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<6> u6;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b0010;
- let Inst{23-22} = majOp{2-1};
- let Inst{7} = majOp{0};
- let Inst{6-5} = minOp;
- let Inst{4-0} = Rxx;
- let Inst{20-16} = Rss;
- let Inst{13-8} = u6;
- }
-
-
-// Rxx[+-&|]=asr(Rss,Rt)
-// Rxx[+-&|^]=lsr(Rss,Rt)
-// Rxx[+-&|^]=asl(Rss,Rt)
-// Rxx[+-&|^]=lsl(Rss,Rt)
-
-class T_shift_reg_acc_p <string opc1, string opc2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp>
- : SInst_acc<(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$src1, DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rxx "#opc2#opc1#"($Rss, $Rt)", [],
- "$src1 = $Rxx", S_3op_tc_2_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = majOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rt;
- let Inst{7-6} = minOp;
- let Inst{4-0} = Rxx;
- }
-
-//===----------------------------------------------------------------------===//
-// Multi-class for the shift instructions with logical/arithmetic operators.
-//===----------------------------------------------------------------------===//
-
-multiclass xtype_imm_base<string OpcStr1, string OpcStr2, SDNode OpNode1,
- SDNode OpNode2, bits<3> majOp, bits<2> minOp > {
- def _i_r#NAME : T_shift_imm_acc_r< OpcStr1, OpcStr2, OpNode1,
- OpNode2, majOp, minOp >;
- def _i_p#NAME : T_shift_imm_acc_p< OpcStr1, OpcStr2, OpNode1,
- OpNode2, majOp, minOp >;
-}
-
-multiclass xtype_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
- let AddedComplexity = 100 in
- defm _acc : xtype_imm_base< opc1, "+= ", OpNode, add, 0b001, minOp>;
-
- defm _nac : xtype_imm_base< opc1, "-= ", OpNode, sub, 0b000, minOp>;
- defm _and : xtype_imm_base< opc1, "&= ", OpNode, and, 0b010, minOp>;
- defm _or : xtype_imm_base< opc1, "|= ", OpNode, or, 0b011, minOp>;
-}
-
-multiclass xtype_xor_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
-let AddedComplexity = 100 in
- defm _xacc : xtype_imm_base< opc1, "^= ", OpNode, xor, 0b100, minOp>;
-}
-
-defm S2_asr : xtype_imm_acc<"asr", sra, 0b00>;
-
-defm S2_lsr : xtype_imm_acc<"lsr", srl, 0b01>,
- xtype_xor_imm_acc<"lsr", srl, 0b01>;
-
-defm S2_asl : xtype_imm_acc<"asl", shl, 0b10>,
- xtype_xor_imm_acc<"asl", shl, 0b10>;
-
-multiclass xtype_reg_acc_r<string opc1, SDNode OpNode, bits<2>minOp> {
- let AddedComplexity = 100 in
- def _acc : T_shift_reg_acc_r <opc1, "+= ", OpNode, add, 0b11, minOp>;
-
- def _nac : T_shift_reg_acc_r <opc1, "-= ", OpNode, sub, 0b10, minOp>;
- def _and : T_shift_reg_acc_r <opc1, "&= ", OpNode, and, 0b01, minOp>;
- def _or : T_shift_reg_acc_r <opc1, "|= ", OpNode, or, 0b00, minOp>;
-}
-
-multiclass xtype_reg_acc_p<string opc1, SDNode OpNode, bits<2>minOp> {
- let AddedComplexity = 100 in
- def _acc : T_shift_reg_acc_p <opc1, "+= ", OpNode, add, 0b110, minOp>;
-
- def _nac : T_shift_reg_acc_p <opc1, "-= ", OpNode, sub, 0b100, minOp>;
- def _and : T_shift_reg_acc_p <opc1, "&= ", OpNode, and, 0b010, minOp>;
- def _or : T_shift_reg_acc_p <opc1, "|= ", OpNode, or, 0b000, minOp>;
- def _xor : T_shift_reg_acc_p <opc1, "^= ", OpNode, xor, 0b011, minOp>;
-}
-
-multiclass xtype_reg_acc<string OpcStr, SDNode OpNode, bits<2> minOp > {
- defm _r_r : xtype_reg_acc_r <OpcStr, OpNode, minOp>;
- defm _r_p : xtype_reg_acc_p <OpcStr, OpNode, minOp>;
-}
-
-defm S2_asl : xtype_reg_acc<"asl", shl, 0b10>;
-defm S2_asr : xtype_reg_acc<"asr", sra, 0b00>;
-defm S2_lsr : xtype_reg_acc<"lsr", srl, 0b01>;
-defm S2_lsl : xtype_reg_acc<"lsl", shl, 0b11>;
-
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S3op_1 <string mnemonic, RegisterClass RC, bits<2> MajOp, bits<3> MinOp,
- bit SwapOps, bit isSat = 0, bit isRnd = 0, bit hasShift = 0>
- : SInst <(outs RC:$dst),
- (ins DoubleRegs:$src1, DoubleRegs:$src2),
- "$dst = "#mnemonic#"($src1, $src2)"#!if(isRnd, ":rnd", "")
- #!if(hasShift,":>>1","")
- #!if(isSat, ":sat", ""),
- [], "", S_3op_tc_2_SLOT23 > {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0001;
- let Inst{23-22} = MajOp;
- let Inst{20-16} = !if (SwapOps, src2, src1);
- let Inst{12-8} = !if (SwapOps, src1, src2);
- let Inst{7-5} = MinOp;
- let Inst{4-0} = dst;
- }
-
-class T_S3op_64 <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit SwapOps,
- bit isSat = 0, bit isRnd = 0, bit hasShift = 0 >
- : T_S3op_1 <mnemonic, DoubleRegs, MajOp, MinOp, SwapOps,
- isSat, isRnd, hasShift>;
-
-let Itinerary = S_3op_tc_1_SLOT23 in {
- def S2_shuffeb : T_S3op_64 < "shuffeb", 0b00, 0b010, 0>;
- def S2_shuffeh : T_S3op_64 < "shuffeh", 0b00, 0b110, 0>;
- def S2_shuffob : T_S3op_64 < "shuffob", 0b00, 0b100, 1>;
- def S2_shuffoh : T_S3op_64 < "shuffoh", 0b10, 0b000, 1>;
-
- def S2_vtrunewh : T_S3op_64 < "vtrunewh", 0b10, 0b010, 0>;
- def S2_vtrunowh : T_S3op_64 < "vtrunowh", 0b10, 0b100, 0>;
-}
-
-def S2_lfsp : T_S3op_64 < "lfs", 0b10, 0b110, 0>;
-
-let hasSideEffects = 0 in
-class T_S3op_2 <string mnemonic, bits<3> MajOp, bit SwapOps>
- : SInst < (outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
- "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu)",
- [], "", S_3op_tc_1_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
- bits<2> Pu;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = !if (SwapOps, Rtt, Rss);
- let Inst{12-8} = !if (SwapOps, Rss, Rtt);
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rdd;
- }
-
-def S2_valignrb : T_S3op_2 < "valignb", 0b000, 1>;
-def S2_vsplicerb : T_S3op_2 < "vspliceb", 0b100, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template class used by vector shift, vector rotate, vector neg,
-// 32-bit shift, 64-bit shifts, etc.
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0 in
-class T_S3op_3 <string mnemonic, RegisterClass RC, bits<2> MajOp,
- bits<2> MinOp, bit isSat = 0, list<dag> pattern = [] >
- : SInst <(outs RC:$dst),
- (ins RC:$src1, IntRegs:$src2),
- "$dst = "#mnemonic#"($src1, $src2)"#!if(isSat, ":sat", ""),
- pattern, "", S_3op_tc_1_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b0110, 0b0011);
- let Inst{23-22} = MajOp;
- let Inst{20-16} = src1;
- let Inst{12-8} = src2;
- let Inst{7-6} = MinOp;
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1 in
-class T_S3op_shift32 <string mnemonic, SDNode OpNode, bits<2> MinOp>
- : T_S3op_3 <mnemonic, IntRegs, 0b01, MinOp, 0, []>;
-
-let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in
-class T_S3op_shift32_Sat <string mnemonic, bits<2> MinOp>
- : T_S3op_3 <mnemonic, IntRegs, 0b00, MinOp, 1, []>;
-
-
-class T_S3op_shift64 <string mnemonic, SDNode OpNode, bits<2> MinOp>
- : T_S3op_3 <mnemonic, DoubleRegs, 0b10, MinOp, 0, []>;
-
-
-class T_S3op_shiftVect <string mnemonic, bits<2> MajOp, bits<2> MinOp>
- : T_S3op_3 <mnemonic, DoubleRegs, MajOp, MinOp, 0, []>;
-
-
-// Shift by register
-// Rdd=[asr|lsr|asl|lsl](Rss,Rt)
-
-def S2_asr_r_p : T_S3op_shift64 < "asr", sra, 0b00>;
-def S2_lsr_r_p : T_S3op_shift64 < "lsr", srl, 0b01>;
-def S2_asl_r_p : T_S3op_shift64 < "asl", shl, 0b10>;
-def S2_lsl_r_p : T_S3op_shift64 < "lsl", shl, 0b11>;
-
-// Rd=[asr|lsr|asl|lsl](Rs,Rt)
-
-def S2_asr_r_r : T_S3op_shift32<"asr", sra, 0b00>;
-def S2_lsr_r_r : T_S3op_shift32<"lsr", srl, 0b01>;
-def S2_asl_r_r : T_S3op_shift32<"asl", shl, 0b10>;
-def S2_lsl_r_r : T_S3op_shift32<"lsl", shl, 0b11>;
-
-// Shift by register with saturation
-// Rd=asr(Rs,Rt):sat
-// Rd=asl(Rs,Rt):sat
-
-let Defs = [USR_OVF] in {
- def S2_asr_r_r_sat : T_S3op_shift32_Sat<"asr", 0b00>;
- def S2_asl_r_r_sat : T_S3op_shift32_Sat<"asl", 0b10>;
-}
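
For the saturating register-shift forms just defined, a hedged C sketch of Rd = asl(Rs, Rt):sat in the simple case 0 <= Rt < 32; the USR.OVF side effect from the Defs list is modeled by an out-parameter, and the helper is hypothetical:

    #include <stdint.h>

    static inline int32_t asl_sat(int32_t rs, uint32_t rt, int *ovf) {
        /* rs << rt computed in 64 bits (as a multiply, to avoid shifting
           a negative value), then clamped to the 32-bit signed range. */
        int64_t wide = (int64_t)rs * ((int64_t)1 << (rt & 31));
        if (wide > INT32_MAX) { *ovf = 1; return INT32_MAX; }
        if (wide < INT32_MIN) { *ovf = 1; return INT32_MIN; }
        return (int32_t)wide;
    }
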
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_S3op_8 <string opc, bits<3> MinOp, bit isSat, bit isRnd, bit hasShift, bit hasSplat = 0>
- : SInst < (outs IntRegs:$Rd),
- (ins DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rd = "#opc#"($Rss, $Rt"#!if(hasSplat, "*", "")#")"
- #!if(hasShift, ":<<1", "")
- #!if(isRnd, ":rnd", "")
- #!if(isSat, ":sat", ""),
- [], "", S_3op_tc_1_SLOT23 > {
- bits<5> Rd;
- bits<5> Rss;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0101;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-def S2_asr_r_svw_trun : T_S3op_8<"vasrw", 0b010, 0, 0, 0>;
-
-let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def S2_vcrotate : T_S3op_shiftVect < "vcrotate", 0b11, 0b00>;
-
-let hasSideEffects = 0 in
-class T_S3op_7 <string mnemonic, bit MajOp >
- : SInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, u3_0Imm:$u3),
- "$Rdd = "#mnemonic#"($Rss, $Rtt, #$u3)" ,
- [], "", S_3op_tc_1_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
- bits<3> u3;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0000;
- let Inst{23} = MajOp;
- let Inst{20-16} = !if(MajOp, Rss, Rtt);
- let Inst{12-8} = !if(MajOp, Rtt, Rss);
- let Inst{7-5} = u3;
- let Inst{4-0} = Rdd;
- }
-
-def S2_valignib : T_S3op_7 < "valignb", 0>;
-def S2_vspliceib : T_S3op_7 < "vspliceb", 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for 'insert bitfield' instructions
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S3op_insert <string mnemonic, RegisterClass RC>
- : SInst <(outs RC:$dst),
- (ins RC:$src1, RC:$src2, DoubleRegs:$src3),
- "$dst = "#mnemonic#"($src2, $src3)" ,
- [], "$src1 = $dst", S_3op_tc_1_SLOT23 > {
- bits<5> dst;
- bits<5> src2;
- bits<5> src3;
-
- let IClass = 0b1100;
-
- let Inst{27-26} = 0b10;
- let Inst{25-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b00, 0b10);
- let Inst{23} = 0b0;
- let Inst{20-16} = src2;
- let Inst{12-8} = src3;
- let Inst{4-0} = dst;
- }
-
-let hasSideEffects = 0 in
-class T_S2op_insert <bits<4> RegTyBits, RegisterClass RC, Operand ImmOp>
- : SInst <(outs RC:$dst), (ins RC:$dst2, RC:$src1, ImmOp:$src2, ImmOp:$src3),
- "$dst = insert($src1, #$src2, #$src3)",
- [], "$dst2 = $dst", S_2op_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<6> src2;
- bits<6> src3;
- bit bit23;
- bit bit13;
- string ImmOpStr = !cast<string>(ImmOp);
-
- let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5}, 0);
- let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23} = bit23;
- let Inst{22-21} = src3{4-3};
- let Inst{20-16} = src1;
- let Inst{13} = bit13;
- let Inst{12-8} = src2{4-0};
- let Inst{7-5} = src3{2-0};
- let Inst{4-0} = dst;
- }
-
-// Rx=insert(Rs,Rtt)
-// Rx=insert(Rs,#u5,#U5)
-let hasNewValue = 1 in {
- def S2_insert_rp : T_S3op_insert <"insert", IntRegs>;
- def S2_insert : T_S2op_insert <0b1111, IntRegs, u5_0Imm>;
-}
-
-// Rxx=insert(Rss,Rtt)
-// Rxx=insert(Rss,#u6,#U6)
-def S2_insertp_rp : T_S3op_insert<"insert", DoubleRegs>;
-def S2_insertp : T_S2op_insert <0b0011, DoubleRegs, u6_0Imm>;
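
A hedged C sketch of the insert semantics: Rx = insert(Rs, #width, #offset) replaces #width bits of Rx, starting at bit #offset, with the low bits of Rs. The helper is hypothetical and assumes width > 0 and width + offset <= 32:

    #include <stdint.h>

    static inline uint32_t insert32(uint32_t rx, uint32_t rs,
                                    unsigned width, unsigned offset) {
        uint32_t mask = (width >= 32) ? 0xFFFFFFFFu : ((1u << width) - 1u);
        return (rx & ~(mask << offset)) | ((rs & mask) << offset);
    }
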
-
-
-//===----------------------------------------------------------------------===//
-// Template class for 'extract bitfield' instructions
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_S3op_extract <string mnemonic, bits<2> MinOp>
- : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
- "$Rd = "#mnemonic#"($Rs, $Rtt)",
- [], "", S_3op_tc_2_SLOT23 > {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rtt;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b100100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rtt;
- let Inst{7-6} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-let hasSideEffects = 0 in
-class T_S2op_extract <string mnemonic, bits<4> RegTyBits,
- RegisterClass RC, Operand ImmOp>
- : SInst <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2, ImmOp:$src3),
- "$dst = "#mnemonic#"($src1, #$src2, #$src3)",
- [], "", S_2op_tc_2_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<6> src2;
- bits<6> src3;
- bit bit23;
- bit bit13;
- string ImmOpStr = !cast<string>(ImmOp);
-
- let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5},
- !if (!eq(mnemonic, "extractu"), 0, 1));
-
- let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
-
- let IClass = 0b1000;
-
- let Inst{27-24} = RegTyBits;
- let Inst{23} = bit23;
- let Inst{22-21} = src3{4-3};
- let Inst{20-16} = src1;
- let Inst{13} = bit13;
- let Inst{12-8} = src2{4-0};
- let Inst{7-5} = src3{2-0};
- let Inst{4-0} = dst;
- }
-
-// Extract bitfield
-
-// Rdd=extractu(Rss,Rtt)
-// Rdd=extractu(Rss,#u6,#U6)
-def S2_extractup_rp : T_S3op_64 < "extractu", 0b00, 0b000, 0>;
-def S2_extractup : T_S2op_extract <"extractu", 0b0001, DoubleRegs, u6_0Imm>;
-
-// Rd=extractu(Rs,Rtt)
-// Rd=extractu(Rs,#u5,#U5)
-let hasNewValue = 1 in {
- def S2_extractu_rp : T_S3op_extract<"extractu", 0b00>;
- def S2_extractu : T_S2op_extract <"extractu", 0b1101, IntRegs, u5_0Imm>;
-}
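
Correspondingly, extractu(Rs, #width, #offset) reads an unsigned bitfield back out. A minimal sketch under the same width + offset <= 32 assumption (hypothetical helper):

    #include <stdint.h>

    static inline uint32_t extractu32(uint32_t rs, unsigned width,
                                      unsigned offset) {
        uint32_t mask = (width >= 32) ? 0xFFFFFFFFu : ((1u << width) - 1u);
        return (rs >> offset) & mask;
    }
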
-
-//===----------------------------------------------------------------------===//
-// :raw form of the tableidx[bdhw] instructions
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class tableidxRaw<string OpStr, bits<2>MinOp>
- : SInst <(outs IntRegs:$Rx),
- (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, s6_0Imm:$S6),
- "$Rx = "#OpStr#"($Rs, #$u4, #$S6):raw",
- [], "$Rx = $_dst_" > {
- bits<5> Rx;
- bits<5> Rs;
- bits<4> u4;
- bits<6> S6;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b0111;
- let Inst{23-22} = MinOp;
- let Inst{21} = u4{3};
- let Inst{20-16} = Rs;
- let Inst{13-8} = S6;
- let Inst{7-5} = u4{2-0};
- let Inst{4-0} = Rx;
- }
-
-def S2_tableidxb : tableidxRaw<"tableidxb", 0b00>;
-def S2_tableidxh : tableidxRaw<"tableidxh", 0b01>;
-def S2_tableidxw : tableidxRaw<"tableidxw", 0b10>;
-def S2_tableidxd : tableidxRaw<"tableidxd", 0b11>;
-
-//===----------------------------------------------------------------------===//
-// Template class for 'table index' instructions which are assembler mapped
-// to their :raw format.
-//===----------------------------------------------------------------------===//
-let isPseudo = 1 in
-class tableidx_goodsyntax <string mnemonic>
- : SInst <(outs IntRegs:$Rx),
- (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, u5_0Imm:$u5),
- "$Rx = "#mnemonic#"($Rs, #$u4, #$u5)",
- [], "$Rx = $_dst_" >;
-
-def S2_tableidxb_goodsyntax : tableidx_goodsyntax<"tableidxb">;
-def S2_tableidxh_goodsyntax : tableidx_goodsyntax<"tableidxh">;
-def S2_tableidxw_goodsyntax : tableidx_goodsyntax<"tableidxw">;
-def S2_tableidxd_goodsyntax : tableidx_goodsyntax<"tableidxd">;
-
-//===----------------------------------------------------------------------===//
-// V3 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV3.td"
-
-//===----------------------------------------------------------------------===//
-// V3 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V4 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV4.td"
-
-//===----------------------------------------------------------------------===//
-// V4 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V5 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV5.td"
-
-//===----------------------------------------------------------------------===//
-// V5 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V60 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV60.td"
-
-//===----------------------------------------------------------------------===//
-// V60 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU32/64/Vector +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoVector.td"
-
-include "HexagonInstrAlias.td"
-include "HexagonSystemInst.td"
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
deleted file mode 100644
index 225f944..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV3.td
+++ /dev/null
@@ -1,215 +0,0 @@
-//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V3 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// J +
-//===----------------------------------------------------------------------===//
-// Call subroutine.
-let isCall = 1, hasSideEffects = 1, isPredicable = 1,
- isExtended = 0, isExtendable = 1, opExtendable = 0,
- isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
-class T_Call<bit CSR, string ExtStr>
- : JInst<(outs), (ins calltarget:$dst),
- "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
- let BaseOpcode = "call";
- bits<24> dst;
-
- let Defs = !if (CSR, VolatileV3.Regs, []);
- let IClass = 0b0101;
- let Inst{27-25} = 0b101;
- let Inst{24-16,13-1} = dst{23-2};
- let Inst{0} = 0b0;
-}
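
Per the encoding above (opExtentBits = 24, opExtentAlign = 2, instruction bits taken from dst{23-2}), a call target is a 24-bit signed, 4-byte-aligned offset of which only 22 bits are stored. A hedged C sketch of that round trip (illustrative helpers; the final uint-to-int conversion is two's complement on practical targets):

    #include <stdint.h>

    static inline uint32_t pack_call_offset(int32_t off24) {
        return ((uint32_t)off24 >> 2) & 0x3FFFFFu;   /* dst{23-2} */
    }
    static inline int32_t unpack_call_offset(uint32_t field22) {
        uint32_t off = (field22 << 2) & 0xFFFFFFu;   /* restore alignment */
        if (off & 0x800000u) off |= 0xFF000000u;     /* sign-extend bit 23 */
        return (int32_t)off;
    }
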
-
-let isCall = 1, hasSideEffects = 1, isPredicated = 1,
- isExtended = 0, isExtendable = 1, opExtendable = 1,
- isExtentSigned = 1, opExtentBits = 17, opExtentAlign = 2 in
-class T_CallPred<bit CSR, bit IfTrue, string ExtStr>
- : JInst<(outs), (ins PredRegs:$Pu, calltarget:$dst),
- CondStr<"$Pu", IfTrue, 0>.S # "call " # ExtStr # "$dst",
- [], "", J_tc_2early_SLOT23> {
- let BaseOpcode = "call";
- let isPredicatedFalse = !if(IfTrue,0,1);
- bits<2> Pu;
- bits<17> dst;
-
- let Defs = !if (CSR, VolatileV3.Regs, []);
- let IClass = 0b0101;
- let Inst{27-24} = 0b1101;
- let Inst{23-22,20-16,13,7-1} = dst{16-2};
- let Inst{21} = !if(IfTrue,0,1);
- let Inst{11} = 0b0;
- let Inst{9-8} = Pu;
-}
-
-multiclass T_Calls<bit CSR, string ExtStr> {
- def NAME : T_Call<CSR, ExtStr>;
- def t : T_CallPred<CSR, 1, ExtStr>;
- def f : T_CallPred<CSR, 0, ExtStr>;
-}
-
-defm J2_call: T_Calls<1, "">, PredRel;
-
-let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
- Defs = VolatileV3.Regs in
-def PS_call_nr : T_Call<1, "">, PredRel;
-
-let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
- Defs = [PC, R31, R6, R7, P0] in
-def PS_call_stk : T_Call<0, "">, PredRel;
-
-//===----------------------------------------------------------------------===//
-// J -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// JR +
-//===----------------------------------------------------------------------===//
-// Call subroutine from register.
-
-let isCodeGenOnly = 1, Defs = VolatileV3.Regs in {
- def PS_callr_nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
-}
-
-//===----------------------------------------------------------------------===//
-// JR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU +
-//===----------------------------------------------------------------------===//
-
-let Defs = [USR_OVF], Itinerary = ALU64_tc_2_SLOT23 in
-def A2_addpsat : T_ALU64_arith<"add", 0b011, 0b101, 1, 0, 1>;
-
-class T_ALU64_addsp_hl<string suffix, bits<3> MinOp>
- : T_ALU64_rr<"add", suffix, 0b0011, 0b011, MinOp, 0, 0, "">;
-
-def A2_addspl : T_ALU64_addsp_hl<":raw:lo", 0b110>;
-def A2_addsph : T_ALU64_addsp_hl<":raw:hi", 0b111>;
-
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def A2_addsp : ALU64_rr<(outs DoubleRegs:$Rd),
- (ins IntRegs:$Rs, DoubleRegs:$Rt), "$Rd = add($Rs, $Rt)", [],
- "", ALU64_tc_1_SLOT23>;
-
-
-let hasSideEffects = 0 in
-class T_XTYPE_MIN_MAX_P<bit isMax, bit isUnsigned>
- : ALU64Inst<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rt, DoubleRegs:$Rs),
- "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
- #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00111;
- let Inst{22-21} = !if(isMax, 0b10, 0b01);
- let Inst{20-16} = !if(isMax, Rt, Rs);
- let Inst{12-8} = !if(isMax, Rs, Rt);
- let Inst{7} = 0b1;
- let Inst{6} = !if(isMax, 0b0, 0b1);
- let Inst{5} = isUnsigned;
- let Inst{4-0} = Rd;
-}
-
-def A2_minp : T_XTYPE_MIN_MAX_P<0, 0>;
-def A2_minup : T_XTYPE_MIN_MAX_P<0, 1>;
-def A2_maxp : T_XTYPE_MIN_MAX_P<1, 0>;
-def A2_maxup : T_XTYPE_MIN_MAX_P<1, 1>;
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// :raw form of vrcmpys:hi/lo insns
-//===----------------------------------------------------------------------===//
-// Vector reduce complex multiply by scalar.
-let Defs = [USR_OVF], hasSideEffects = 0 in
-class T_vrcmpRaw<string HiLo, bits<3>MajOp>:
- MInst<(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rdd = vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, []> {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{7-5} = 0b100;
- let Inst{4-0} = Rdd;
-}
-
-def M2_vrcmpys_s1_h: T_vrcmpRaw<"hi", 0b101>;
-def M2_vrcmpys_s1_l: T_vrcmpRaw<"lo", 0b111>;
-
-// Assembler mapped to M2_vrcmpys_s1_h or M2_vrcmpys_s1_l
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def M2_vrcmpys_s1
- : MInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rdd=vrcmpys($Rss,$Rt):<<1:sat">;
-
-// Vector reduce complex multiply by scalar with accumulation.
-let Defs = [USR_OVF], hasSideEffects = 0 in
-class T_vrcmpys_acc<string HiLo, bits<3>MajOp>:
- MInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$_src_, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx += vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, [],
- "$Rxx = $_src_"> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{7-5} = 0b100;
- let Inst{4-0} = Rxx;
- }
-
-def M2_vrcmpys_acc_s1_h: T_vrcmpys_acc<"hi", 0b101>;
-def M2_vrcmpys_acc_s1_l: T_vrcmpys_acc<"lo", 0b111>;
-
-// Assembler mapped to M2_vrcmpys_acc_s1_h or M2_vrcmpys_acc_s1_l
-
-let isAsmParserOnly = 1 in
-def M2_vrcmpys_acc_s1
- : MInst <(outs DoubleRegs:$dst),
- (ins DoubleRegs:$dst2, DoubleRegs:$src1, IntRegs:$src2),
- "$dst += vrcmpys($src1, $src2):<<1:sat", [],
- "$dst2 = $dst">;
-
-def M2_vrcmpys_s1rp_h : T_MType_vrcmpy <"vrcmpys", 0b101, 0b110, 1>;
-def M2_vrcmpys_s1rp_l : T_MType_vrcmpy <"vrcmpys", 0b101, 0b111, 0>;
-
-// Assembler mapped to M2_vrcmpys_s1rp_h or M2_vrcmpys_s1rp_l
-let isAsmParserOnly = 1 in
-def M2_vrcmpys_s1rp
- : MInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rd=vrcmpys($Rss,$Rt):<<1:rnd:sat">;
-
-
-// S2_cabacdecbin: CABAC (context-adaptive binary arithmetic coding) decode bin.
-let Defs = [P0], isPredicateLate = 1, Itinerary = S_3op_tc_1_SLOT23 in
-def S2_cabacdecbin : T_S3op_64 < "decbin", 0b11, 0b110, 0>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
deleted file mode 100644
index 18943a0..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ /dev/null
@@ -1,3301 +0,0 @@
-//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V4 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-def DuplexIClass0: InstDuplex < 0 >;
-def DuplexIClass1: InstDuplex < 1 >;
-def DuplexIClass2: InstDuplex < 2 >;
-let isExtendable = 1 in {
- def DuplexIClass3: InstDuplex < 3 >;
- def DuplexIClass4: InstDuplex < 4 >;
- def DuplexIClass5: InstDuplex < 5 >;
- def DuplexIClass6: InstDuplex < 6 >;
- def DuplexIClass7: InstDuplex < 7 >;
-}
-def DuplexIClass8: InstDuplex < 8 >;
-def DuplexIClass9: InstDuplex < 9 >;
-def DuplexIClassA: InstDuplex < 0xA >;
-def DuplexIClassB: InstDuplex < 0xB >;
-def DuplexIClassC: InstDuplex < 0xC >;
-def DuplexIClassD: InstDuplex < 0xD >;
-def DuplexIClassE: InstDuplex < 0xE >;
-def DuplexIClassF: InstDuplex < 0xF >;
-
-let hasSideEffects = 0 in
-class T_Immext<Operand ImmType>
- : EXTENDERInst<(outs), (ins ImmType:$imm),
- "immext(#$imm)", []> {
- bits<32> imm;
- let IClass = 0b0000;
-
- let Inst{27-16} = imm{31-20};
- let Inst{13-0} = imm{19-6};
- }
-
-def A4_ext : T_Immext<u26_6Imm>;
-let isCodeGenOnly = 1 in {
- let isBranch = 1 in
- def A4_ext_b : T_Immext<brtarget>;
- let isCall = 1 in
- def A4_ext_c : T_Immext<calltarget>;
- def A4_ext_g : T_Immext<globaladdress>;
-}
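
From the encoding above, immext carries imm{31-6} of a 32-bit constant (Inst{27-16} = imm{31-20}, Inst{13-0} = imm{19-6}); the instruction being extended supplies the remaining low 6 bits. A one-line C sketch of the reassembly (hypothetical helper):

    #include <stdint.h>

    static inline uint32_t apply_extender(uint32_t ext_imm31_6, uint32_t low6) {
        return (ext_imm31_6 << 6) | (low6 & 0x3Fu);
    }
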
-
-// The Hexagon V4 architecture spec defines the following instruction classes:
-// LD, ST, ALU32, XTYPE, J, JR, MEMOP, NV, CR and SYSTEM (SYSTEM is not
-// implemented in the compiler).
-
-// LD Instructions:
-// ========================================
-// Loads (8/16/32/64 bit)
-// Deallocframe
-
-// ST Instructions:
-// ========================================
-// Stores (8/16/32/64 bit)
-// Allocframe
-
-// ALU32 Instructions:
-// ========================================
-// Arithmetic / Logical (32 bit)
-// Vector Halfword
-
-// XTYPE Instructions (32/64 bit):
-// ========================================
-// Arithmetic, Logical, Bit Manipulation
-// Multiply (Integer, Fractional, Complex)
-// Permute / Vector Permute Operations
-// Predicate Operations
-// Shift / Shift with Add/Sub/Logical
-// Vector Byte ALU
-// Vector Halfword (ALU, Shift, Multiply)
-// Vector Word (ALU, Shift)
-
-// J Instructions:
-// ========================================
-// Jump/Call PC-relative
-
-// JR Instructions:
-// ========================================
-// Jump/Call Register
-
-// MEMOP Instructions:
-// ========================================
-// Operation on memory (8/16/32 bit)
-
-// NV Instructions:
-// ========================================
-// New-value Jumps
-// New-value Stores
-
-// CR Instructions:
-// ========================================
-// Control-Register Transfers
-// Hardware Loop Setup
-// Predicate Logicals & Reductions
-
-// SYSTEM Instructions (not implemented in the compiler):
-// ========================================
-// Prefetch
-// Cache Maintenance
-// Bus Operations
-
-
-//===----------------------------------------------------------------------===//
-// ALU32 +
-//===----------------------------------------------------------------------===//
-
-class T_ALU32_3op_not<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- bit OpsRev>
- : T_ALU32_3op<mnemonic, MajOp, MinOp, OpsRev, 0> {
- let AsmString = "$Rd = "#mnemonic#"($Rs, ~$Rt)";
-}
-
-let BaseOpcode = "andn_rr", CextOpcode = "andn" in
-def A4_andn : T_ALU32_3op_not<"and", 0b001, 0b100, 1>;
-let BaseOpcode = "orn_rr", CextOpcode = "orn" in
-def A4_orn : T_ALU32_3op_not<"or", 0b001, 0b101, 1>;
-
-let CextOpcode = "rcmp.eq" in
-def A4_rcmpeq : T_ALU32_3op<"cmp.eq", 0b011, 0b010, 0, 1>;
-let CextOpcode = "!rcmp.eq" in
-def A4_rcmpneq : T_ALU32_3op<"!cmp.eq", 0b011, 0b011, 0, 1>;
-
-def C4_cmpneq : T_ALU32_3op_cmp<"!cmp.eq", 0b00, 1, 1>;
-def C4_cmplte : T_ALU32_3op_cmp<"!cmp.gt", 0b10, 1, 0>;
-def C4_cmplteu : T_ALU32_3op_cmp<"!cmp.gtu", 0b11, 1, 0>;
-
-class T_CMP_rrbh<string mnemonic, bits<3> MinOp, bit IsComm>
- : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", S_3op_tc_2early_SLOT23>,
- ImmRegRel {
- let InputType = "reg";
- let CextOpcode = mnemonic;
- let isCompare = 1;
- let isCommutable = IsComm;
- let hasSideEffects = 0;
-
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1100;
- let Inst{27-21} = 0b0111110;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{1-0} = Pd;
-}
-
-def A4_cmpbeq : T_CMP_rrbh<"cmpb.eq", 0b110, 1>;
-def A4_cmpbgt : T_CMP_rrbh<"cmpb.gt", 0b010, 0>;
-def A4_cmpbgtu : T_CMP_rrbh<"cmpb.gtu", 0b111, 0>;
-def A4_cmpheq : T_CMP_rrbh<"cmph.eq", 0b011, 1>;
-def A4_cmphgt : T_CMP_rrbh<"cmph.gt", 0b100, 0>;
-def A4_cmphgtu : T_CMP_rrbh<"cmph.gtu", 0b101, 0>;
-
-class T_CMP_ribh<string mnemonic, bits<2> MajOp, bit IsHalf, bit IsComm,
- Operand ImmType, bit IsImmExt, bit IsImmSigned, int ImmBits>
- : ALU64Inst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, ImmType:$Imm),
- "$Pd = "#mnemonic#"($Rs, #$Imm)", [], "", ALU64_tc_2early_SLOT23>,
- ImmRegRel {
- let InputType = "imm";
- let CextOpcode = mnemonic;
- let isCompare = 1;
- let isCommutable = IsComm;
- let hasSideEffects = 0;
- let isExtendable = IsImmExt;
- let opExtendable = !if (IsImmExt, 2, 0);
- let isExtentSigned = IsImmSigned;
- let opExtentBits = ImmBits;
-
- bits<2> Pd;
- bits<5> Rs;
- bits<8> Imm;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b1101;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-5} = Imm;
- let Inst{4} = 0b0;
- let Inst{3} = IsHalf;
- let Inst{1-0} = Pd;
-}
-
-def A4_cmpbeqi : T_CMP_ribh<"cmpb.eq", 0b00, 0, 1, u8_0Imm, 0, 0, 8>;
-def A4_cmpbgti : T_CMP_ribh<"cmpb.gt", 0b01, 0, 0, s8_0Imm, 0, 1, 8>;
-def A4_cmpbgtui : T_CMP_ribh<"cmpb.gtu", 0b10, 0, 0, u7_0Ext, 1, 0, 7>;
-def A4_cmpheqi : T_CMP_ribh<"cmph.eq", 0b00, 1, 1, s8_0Ext, 1, 1, 8>;
-def A4_cmphgti : T_CMP_ribh<"cmph.gt", 0b01, 1, 0, s8_0Ext, 1, 1, 8>;
-def A4_cmphgtui : T_CMP_ribh<"cmph.gtu", 0b10, 1, 0, u7_0Ext, 1, 0, 7>;
-
-class T_RCMP_EQ_ri<string mnemonic, bit IsNeg>
- : ALU32_ri<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s8_0Ext:$s8),
- "$Rd = "#mnemonic#"($Rs, #$s8)", [], "", ALU32_2op_tc_1_SLOT0123>,
- ImmRegRel {
- let InputType = "imm";
- let CextOpcode = !if (IsNeg, "!rcmp.eq", "rcmp.eq");
- let isExtendable = 1;
- let opExtendable = 2;
- let isExtentSigned = 1;
- let opExtentBits = 8;
- let hasNewValue = 1;
-
- bits<5> Rd;
- bits<5> Rs;
- bits<8> s8;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b0011;
- let Inst{22} = 0b1;
- let Inst{21} = IsNeg;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b1;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rd;
-}
-
-def A4_rcmpeqi : T_RCMP_EQ_ri<"cmp.eq", 0>;
-def A4_rcmpneqi : T_RCMP_EQ_ri<"!cmp.eq", 1>;
-
-//===----------------------------------------------------------------------===//
-// ALU32 -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM +
-//===----------------------------------------------------------------------===//
-
-// Combine a word and an immediate into a register pair.
-let hasSideEffects = 0, isExtentSigned = 1, isExtendable = 1,
- opExtentBits = 8 in
-class T_Combine1 <bits<2> MajOp, dag ins, string AsmStr>
- : ALU32Inst <(outs DoubleRegs:$Rdd), ins, AsmStr> {
- bits<5> Rdd;
- bits<5> Rs;
- bits<8> s8;
-
- let IClass = 0b0111;
- let Inst{27-24} = 0b0011;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b1;
- let Inst{12-5} = s8;
- let Inst{4-0} = Rdd;
- }
-
-let opExtendable = 2 in
-def A4_combineri : T_Combine1<0b00, (ins IntRegs:$Rs, s8_0Ext:$s8),
- "$Rdd = combine($Rs, #$s8)">;
-
-let opExtendable = 1 in
-def A4_combineir : T_Combine1<0b01, (ins s8_0Ext:$s8, IntRegs:$Rs),
- "$Rdd = combine(#$s8, $Rs)">;
-
-// A4_combineii: Set two small immediates.
-let hasSideEffects = 0, isExtendable = 1, opExtentBits = 6, opExtendable = 2 in
-def A4_combineii: ALU32Inst<(outs DoubleRegs:$Rdd), (ins s8_0Imm:$s8, u6_0Ext:$U6),
- "$Rdd = combine(#$s8, #$U6)"> {
- bits<5> Rdd;
- bits<8> s8;
- bits<6> U6;
-
- let IClass = 0b0111;
- let Inst{27-23} = 0b11001;
- let Inst{20-16} = U6{5-1};
- let Inst{13} = U6{0};
- let Inst{12-5} = s8;
- let Inst{4-0} = Rdd;
- }
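
A hedged sketch of the combine semantics: the first operand becomes the high word of the 64-bit pair and the second the low word, so A4_combineri and A4_combineir differ only in operand order. Hypothetical helper; the small immediate is sign-extended into its word:

    #include <stdint.h>

    static inline uint64_t combine_ri(uint32_t rs, int8_t s8) {
        return ((uint64_t)rs << 32) | (uint32_t)(int32_t)s8;
    }
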
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// LD +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template class for load instructions with Absolute set addressing mode.
-//===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 2, opExtentBits = 6, addrMode = AbsoluteSet,
- hasSideEffects = 0 in
-class T_LD_abs_set<string mnemonic, RegisterClass RC, bits<4>MajOp>:
- LDInst<(outs RC:$dst1, IntRegs:$dst2),
- (ins u6_0Ext:$addr),
- "$dst1 = "#mnemonic#"($dst2 = #$addr)",
- []> {
- bits<7> name;
- bits<5> dst1;
- bits<5> dst2;
- bits<6> addr;
-
- let IClass = 0b1001;
- let Inst{27-25} = 0b101;
- let Inst{24-21} = MajOp;
- let Inst{13-12} = 0b01;
- let Inst{4-0} = dst1;
- let Inst{20-16} = dst2;
- let Inst{11-8} = addr{5-2};
- let Inst{6-5} = addr{1-0};
-}
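
The absolute-set syntax "$dst1 = memb($dst2 = #$addr)" means one instruction both performs the load and writes the absolute address into a second register. A C sketch of just that semantics (illustrative only; it dereferences a raw address):

    #include <stdint.h>

    static inline int32_t load_byte_abs_set(uint32_t addr, uint32_t *dst2) {
        *dst2 = addr;                               /* address side effect */
        return *(const int8_t *)(uintptr_t)addr;    /* sign-extending load */
    }
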
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
- def L4_loadrb_ap : T_LD_abs_set <"memb", IntRegs, 0b1000>;
- def L4_loadrub_ap : T_LD_abs_set <"memub", IntRegs, 0b1001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
- def L4_loadrh_ap : T_LD_abs_set <"memh", IntRegs, 0b1010>;
- def L4_loadruh_ap : T_LD_abs_set <"memuh", IntRegs, 0b1011>;
- def L4_loadbsw2_ap : T_LD_abs_set <"membh", IntRegs, 0b0001>;
- def L4_loadbzw2_ap : T_LD_abs_set <"memubh", IntRegs, 0b0011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
- def L4_loadri_ap : T_LD_abs_set <"memw", IntRegs, 0b1100>;
-
-let accessSize = WordAccess in {
- def L4_loadbzw4_ap : T_LD_abs_set <"memubh", DoubleRegs, 0b0101>;
- def L4_loadbsw4_ap : T_LD_abs_set <"membh", DoubleRegs, 0b0111>;
-}
-
-let accessSize = DoubleWordAccess in
-def L4_loadrd_ap : T_LD_abs_set <"memd", DoubleRegs, 0b1110>;
-
-let accessSize = ByteAccess in
- def L4_loadalignb_ap : T_LD_abs_set <"memb_fifo", DoubleRegs, 0b0100>;
-
-let accessSize = HalfWordAccess in
-def L4_loadalignh_ap : T_LD_abs_set <"memh_fifo", DoubleRegs, 0b0010>;
-
-// Load - Indirect with long offset
-let InputType = "imm", addrMode = BaseLongOffset, isExtended = 1,
-opExtentBits = 6, opExtendable = 3 in
-class T_LoadAbsReg <string mnemonic, string CextOp, RegisterClass RC,
- bits<4> MajOp>
- : LDInst <(outs RC:$dst), (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3),
- "$dst = "#mnemonic#"($src1<<#$src2 + #$src3)",
- [] >, ImmRegShl {
- bits<5> dst;
- bits<5> src1;
- bits<2> src2;
- bits<6> src3;
- let CextOpcode = CextOp;
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
- let IClass = 0b1001;
- let Inst{27-25} = 0b110;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2{1};
- let Inst{12} = 0b1;
- let Inst{11-8} = src3{5-2};
- let Inst{7} = src2{0};
- let Inst{6-5} = src3{1-0};
- let Inst{4-0} = dst;
- }
-
-let accessSize = ByteAccess in {
- def L4_loadrb_ur : T_LoadAbsReg<"memb", "LDrib", IntRegs, 0b1000>;
- def L4_loadrub_ur : T_LoadAbsReg<"memub", "LDriub", IntRegs, 0b1001>;
- def L4_loadalignb_ur : T_LoadAbsReg<"memb_fifo", "LDrib_fifo",
- DoubleRegs, 0b0100>;
-}
-
-let accessSize = HalfWordAccess in {
- def L4_loadrh_ur : T_LoadAbsReg<"memh", "LDrih", IntRegs, 0b1010>;
- def L4_loadruh_ur : T_LoadAbsReg<"memuh", "LDriuh", IntRegs, 0b1011>;
- def L4_loadbsw2_ur : T_LoadAbsReg<"membh", "LDribh2", IntRegs, 0b0001>;
- def L4_loadbzw2_ur : T_LoadAbsReg<"memubh", "LDriubh2", IntRegs, 0b0011>;
- def L4_loadalignh_ur : T_LoadAbsReg<"memh_fifo", "LDrih_fifo",
- DoubleRegs, 0b0010>;
-}
-
-let accessSize = WordAccess in {
- def L4_loadri_ur : T_LoadAbsReg<"memw", "LDriw", IntRegs, 0b1100>;
- def L4_loadbsw4_ur : T_LoadAbsReg<"membh", "LDribh4", DoubleRegs, 0b0111>;
- def L4_loadbzw4_ur : T_LoadAbsReg<"memubh", "LDriubh4", DoubleRegs, 0b0101>;
-}
-
-let accessSize = DoubleWordAccess in
-def L4_loadrd_ur : T_LoadAbsReg<"memd", "LDrid", DoubleRegs, 0b1110>;
-
-
-//===----------------------------------------------------------------------===//
-// Template classes for the non-predicated load instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-class T_load_rr <string mnemonic, RegisterClass RC, bits<3> MajOp>:
- LDInst<(outs RC:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2_0Imm:$u2),
- "$dst = "#mnemonic#"($src1 + $src2<<#$u2)",
- [], "", V4LDST_tc_ld_SLOT01>, ImmRegShl, AddrModeRel {
- bits<5> dst;
- bits<5> src1;
- bits<5> src2;
- bits<2> u2;
-
- let IClass = 0b0011;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{12-8} = src2;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{4-0} = dst;
- }
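
The two scaled addressing modes above compute their effective addresses as follows; a minimal sketch with hypothetical helper names:

    #include <stdint.h>

    /* T_LoadAbsReg: memX($src1<<#$src2 + #$src3) -- long-offset form. */
    static inline uint32_t ea_long_offset(uint32_t ru, unsigned u2, uint32_t u6) {
        return (ru << (u2 & 3)) + u6;
    }
    /* T_load_rr: memX($src1 + $src2<<#$u2) -- base + scaled register offset. */
    static inline uint32_t ea_base_index(uint32_t rs, uint32_t ru, unsigned u2) {
        return rs + (ru << (u2 & 3));
    }
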
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated load instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_pload_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit isNot, bit isPredNew>:
- LDInst <(outs RC:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2_0Imm:$u2),
- !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$u2)",
- [], "", V4LDST_tc_ld_SLOT01>, AddrModeRel {
- bits<5> dst;
- bits<2> src1;
- bits<5> src2;
- bits<5> src3;
- bits<2> u2;
-
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
-
- let IClass = 0b0011;
-
- let Inst{27-26} = 0b00;
- let Inst{25} = isPredNew;
- let Inst{24} = isNot;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{12-8} = src3;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{6-5} = src1;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// multiclass for load instructions with base + register offset
-// addressing mode
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = BaseRegOffset in
-multiclass ld_idxd_shl <string mnemonic, string CextOp, RegisterClass RC,
- bits<3> MajOp > {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl,
- InputType = "reg" in {
- let isPredicable = 1 in
- def L4_#NAME#_rr : T_load_rr <mnemonic, RC, MajOp>;
-
- // Predicated
- def L4_p#NAME#t_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 0>;
- def L4_p#NAME#f_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 0>;
-
- // Predicated new
- def L4_p#NAME#tnew_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 1>;
- def L4_p#NAME#fnew_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 1>;
- }
-}
-
-let hasNewValue = 1, accessSize = ByteAccess in {
- defm loadrb : ld_idxd_shl<"memb", "LDrib", IntRegs, 0b000>;
- defm loadrub : ld_idxd_shl<"memub", "LDriub", IntRegs, 0b001>;
-}
-
-let hasNewValue = 1, accessSize = HalfWordAccess in {
- defm loadrh : ld_idxd_shl<"memh", "LDrih", IntRegs, 0b010>;
- defm loadruh : ld_idxd_shl<"memuh", "LDriuh", IntRegs, 0b011>;
-}
-
-let hasNewValue = 1, accessSize = WordAccess in
-defm loadri : ld_idxd_shl<"memw", "LDriw", IntRegs, 0b100>;
-
-let accessSize = DoubleWordAccess in
-defm loadrd : ld_idxd_shl<"memd", "LDrid", DoubleRegs, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// LD -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ST +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for store instructions with Absolute set addressing mode.
-//===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 1, opExtentBits = 6,
- addrMode = AbsoluteSet in
-class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
- bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
- : STInst<(outs IntRegs:$dst),
- (ins u6_0Ext:$addr, RC:$src),
- mnemonic#"($dst = #$addr) = $src"#!if(isHalf, ".h","")>, NewValueRel {
- bits<5> dst;
- bits<6> addr;
- bits<5> src;
- let accessSize = AccessSz;
- let BaseOpcode = BaseOp#"_AbsSet";
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = dst;
- let Inst{13} = 0b0;
- let Inst{12-8} = src;
- let Inst{7} = 0b1;
- let Inst{5-0} = addr;
- }
-
-def S4_storerb_ap : T_ST_absset <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
-def S4_storerh_ap : T_ST_absset <"memh", "STrih", IntRegs, 0b010,
- HalfWordAccess>;
-def S4_storeri_ap : T_ST_absset <"memw", "STriw", IntRegs, 0b100, WordAccess>;
-
-let isNVStorable = 0 in {
- def S4_storerf_ap : T_ST_absset <"memh", "STrif", IntRegs,
- 0b011, HalfWordAccess, 1>;
- def S4_storerd_ap : T_ST_absset <"memd", "STrid", DoubleRegs,
- 0b110, DoubleWordAccess>;
-}
-
-let opExtendable = 1, isNewValue = 1, isNVStore = 1, opNewValue = 2,
-isExtended = 1, opExtentBits= 6 in
-class T_ST_absset_nv <string mnemonic, string BaseOp, bits<2> MajOp,
- MemAccessSize AccessSz >
- : NVInst <(outs IntRegs:$dst),
- (ins u6_0Ext:$addr, IntRegs:$src),
- mnemonic#"($dst = #$addr) = $src.new">, NewValueRel {
- bits<5> dst;
- bits<6> addr;
- bits<3> src;
- let accessSize = AccessSz;
- let BaseOpcode = BaseOp#"_AbsSet";
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = dst;
- let Inst{13-11} = 0b000;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src;
- let Inst{7} = 0b1;
- let Inst{5-0} = addr;
- }
-
-let mayStore = 1, addrMode = AbsoluteSet in {
- def S4_storerbnew_ap : T_ST_absset_nv <"memb", "STrib", 0b00, ByteAccess>;
- def S4_storerhnew_ap : T_ST_absset_nv <"memh", "STrih", 0b01, HalfWordAccess>;
- def S4_storerinew_ap : T_ST_absset_nv <"memw", "STriw", 0b10, WordAccess>;
-}
-
-let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm",
- addrMode = BaseLongOffset, AddedComplexity = 40 in
-class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
- bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
- : STInst<(outs),
- (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, RC:$src4),
- mnemonic#"($src1<<#$src2 + #$src3) = $src4"#!if(isHalf, ".h",""),
- []>, ImmRegShl, NewValueRel {
-
- bits<5> src1;
- bits<2> src2;
- bits<6> src3;
- bits<5> src4;
-
- let accessSize = AccessSz;
- let CextOpcode = CextOp;
- let BaseOpcode = CextOp#"_shl";
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} =0b1101;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = src1;
- let Inst{13} = src2{1};
- let Inst{12-8} = src4;
- let Inst{7} = 0b1;
- let Inst{6} = src2{0};
- let Inst{5-0} = src3;
-}
-
-def S4_storerb_ur : T_StoreAbsReg <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
-def S4_storerh_ur : T_StoreAbsReg <"memh", "STrih", IntRegs, 0b010,
- HalfWordAccess>;
-def S4_storerf_ur : T_StoreAbsReg <"memh", "STrif", IntRegs, 0b011,
- HalfWordAccess, 1>;
-def S4_storeri_ur : T_StoreAbsReg <"memw", "STriw", IntRegs, 0b100, WordAccess>;
-def S4_storerd_ur : T_StoreAbsReg <"memd", "STrid", DoubleRegs, 0b110,
- DoubleWordAccess>;
-
-let mayStore = 1, isNVStore = 1, isExtended = 1, addrMode = BaseLongOffset,
- opExtentBits = 6, isNewValue = 1, opNewValue = 3, opExtendable = 2 in
-class T_StoreAbsRegNV <string mnemonic, string CextOp, bits<2> MajOp,
- MemAccessSize AccessSz>
- : NVInst <(outs ),
- (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, IntRegs:$src4),
- mnemonic#"($src1<<#$src2 + #$src3) = $src4.new">, NewValueRel {
- bits<5> src1;
- bits<2> src2;
- bits<6> src3;
- bits<3> src4;
-
- let CextOpcode = CextOp;
- let BaseOpcode = CextOp#"_shl";
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1101101;
- let Inst{12-11} = 0b00;
- let Inst{7} = 0b1;
- let Inst{20-16} = src1;
- let Inst{13} = src2{1};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src4;
- let Inst{6} = src2{0};
- let Inst{5-0} = src3;
- }
-
-def S4_storerbnew_ur : T_StoreAbsRegNV <"memb", "STrib", 0b00, ByteAccess>;
-def S4_storerhnew_ur : T_StoreAbsRegNV <"memh", "STrih", 0b01, HalfWordAccess>;
-def S4_storerinew_ur : T_StoreAbsRegNV <"memw", "STriw", 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Template classes for the non-predicated store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicable = 1 in
-class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH>
- : STInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
- mnemonic#"($Rs + $Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
- [],"",V4LDST_tc_st_SLOT01>, ImmRegShl, AddrModeRel {
-
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<5> Rt;
-
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
- let IClass = 0b0011;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{4-0} = Rt;
- }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit isNot, bit isPredNew, bit isH>
- : STInst <(outs),
- (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
- !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
- [], "", V4LDST_tc_st_SLOT01> , AddrModeRel{
- bits<2> Pv;
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<5> Rt;
-
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
- let IClass = 0b0011;
-
- let Inst{27-26} = 0b01;
- let Inst{25} = isPredNew;
- let Inst{24} = isNot;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{6-5} = Pv;
- let Inst{4-0} = Rt;
- }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the new-value store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, isNewValue = 1, opNewValue = 3 in
-class T_store_new_rr <string mnemonic, bits<2> MajOp> :
- NVInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
- mnemonic#"($Rs + $Ru<<#$u2) = $Nt.new",
- [],"",V4LDST_tc_st_SLOT0>, ImmRegShl, AddrModeRel {
-
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<3> Nt;
-
- let IClass = 0b0011;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{4-3} = MajOp;
- let Inst{2-0} = Nt;
- }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated new-value store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, isNewValue = 1, opNewValue = 4 in
-class T_pstore_new_rr <string mnemonic, bits<2> MajOp, bit isNot, bit isPredNew>
- : NVInst<(outs),
- (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
- !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Nt.new",
- [], "", V4LDST_tc_st_SLOT0>, AddrModeRel {
- bits<2> Pv;
- bits<5> Rs;
- bits<5> Ru;
- bits<2> u2;
- bits<3> Nt;
-
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
-
- let IClass = 0b0011;
- let Inst{27-26} = 0b01;
- let Inst{25} = isPredNew;
- let Inst{24} = isNot;
- let Inst{23-21} = 0b101;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Ru;
- let Inst{13} = u2{1};
- let Inst{7} = u2{0};
- let Inst{6-5} = Pv;
- let Inst{4-3} = MajOp;
- let Inst{2-0} = Nt;
- }
-
-//===----------------------------------------------------------------------===//
-// multiclass for store instructions with base + register offset addressing
-// mode
-//===----------------------------------------------------------------------===//
-let isNVStorable = 1 in
-multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC,
- bits<3> MajOp, bit isH = 0> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
- def S4_#NAME#_rr : T_store_rr <mnemonic, RC, MajOp, isH>;
-
- // Predicated
- def S4_p#NAME#t_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 0, isH>;
- def S4_p#NAME#f_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 0, isH>;
-
- // Predicated new
- def S4_p#NAME#tnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 1, isH>;
- def S4_p#NAME#fnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 1, isH>;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass for new-value store instructions with base + register offset
-// addressing mode.
-//===----------------------------------------------------------------------===//
-let mayStore = 1, isNVStore = 1 in
-multiclass ST_Idxd_shl_nv <string mnemonic, string CextOp, RegisterClass RC,
- bits<2> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
- def S4_#NAME#new_rr : T_store_new_rr<mnemonic, MajOp>;
-
- // Predicated
- def S4_p#NAME#newt_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 0>;
- def S4_p#NAME#newf_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 0>;
-
- // Predicated new
- def S4_p#NAME#newtnew_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 1>;
- def S4_p#NAME#newfnew_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 1>;
- }
-}
-
-let addrMode = BaseRegOffset, InputType = "reg", hasSideEffects = 0 in {
- let accessSize = ByteAccess in
- defm storerb: ST_Idxd_shl<"memb", "STrib", IntRegs, 0b000>,
- ST_Idxd_shl_nv<"memb", "STrib", IntRegs, 0b00>;
-
- let accessSize = HalfWordAccess in
- defm storerh: ST_Idxd_shl<"memh", "STrih", IntRegs, 0b010>,
- ST_Idxd_shl_nv<"memh", "STrih", IntRegs, 0b01>;
-
- let accessSize = WordAccess in
- defm storeri: ST_Idxd_shl<"memw", "STriw", IntRegs, 0b100>,
- ST_Idxd_shl_nv<"memw", "STriw", IntRegs, 0b10>;
-
- let isNVStorable = 0, accessSize = DoubleWordAccess in
- defm storerd: ST_Idxd_shl<"memd", "STrid", DoubleRegs, 0b110>;
-
- let isNVStorable = 0, accessSize = HalfWordAccess in
- defm storerf: ST_Idxd_shl<"memh", "STrif", IntRegs, 0b011, 1>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for store instructions with base + immediate offset
-// addressing mode and an immediate stored value
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
- opExtendable = 2 in
-class T_StoreImm <string mnemonic, Operand OffsetOp, bits<2> MajOp >
- : STInst <(outs ), (ins IntRegs:$Rs, OffsetOp:$offset, s8_0Ext:$S8),
- mnemonic#"($Rs+#$offset)=#$S8",
- [], "", V4LDST_tc_st_SLOT01>,
- ImmRegRel, PredNewRel {
- bits<5> Rs;
- bits<8> S8;
- bits<8> offset;
- bits<6> offsetBits;
-
- string OffsetOpStr = !cast<string>(OffsetOp);
- let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
- !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
- /* u6_0Imm */ offset{5-0}));
-
- let IClass = 0b0011;
-
- let Inst{27-25} = 0b110;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{12-7} = offsetBits;
- let Inst{13} = S8{7};
- let Inst{6-0} = S8{6-0};
- }
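
The offsetBits selection above encodes the byte offset scaled down by the access size (u6_2Imm keeps offset{7-2}, u6_1Imm offset{6-1}, u6_0Imm offset{5-0}), so the offset must be a multiple of the access size. A small sketch of that scaling (hypothetical helper):

    /* log2size = 0, 1 or 2 for byte, halfword or word stores. */
    static inline unsigned encode_store_offset(unsigned byte_off,
                                               unsigned log2size) {
        return (byte_off >> log2size) & 0x3Fu;   /* the 6 encoded bits */
    }
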
-
-let isPredicated = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 6,
- opExtendable = 3 in
-class T_StoreImm_pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
- bit isPredNot, bit isPredNew >
- : STInst <(outs ),
- (ins PredRegs:$Pv, IntRegs:$Rs, OffsetOp:$offset, s6_0Ext:$S6),
- !if(isPredNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($Rs+#$offset)=#$S6",
- [], "", V4LDST_tc_st_SLOT01>,
- ImmRegRel, PredNewRel {
- bits<2> Pv;
- bits<5> Rs;
- bits<6> S6;
- bits<8> offset;
- bits<6> offsetBits;
-
- string OffsetOpStr = !cast<string>(OffsetOp);
- let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
- !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
- /* u6_0Imm */ offset{5-0}));
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b0011;
-
- let Inst{27-25} = 0b100;
- let Inst{24} = isPredNew;
- let Inst{23} = isPredNot;
- let Inst{22-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = S6{5};
- let Inst{12-7} = offsetBits;
- let Inst{6-5} = Pv;
- let Inst{4-0} = S6{4-0};
- }
-
-
-//===----------------------------------------------------------------------===//
-// multiclass for store instructions with base + immediate offset
-// addressing mode and immediate stored value.
-// mem[bhw](Rs+#u6:[012])=#S8
-// if ([!]Pv[.new]) mem[bhw](Rs+#u6:[012])=#S6
-//===----------------------------------------------------------------------===//
-
-multiclass ST_Imm_Pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
- bit PredNot> {
- def _io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 0>;
- // Predicate new
- def new_io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 1>;
-}
-
-multiclass ST_Imm <string mnemonic, string CextOp, Operand OffsetOp,
- bits<2> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
- def _io : T_StoreImm <mnemonic, OffsetOp, MajOp>;
-
- defm t : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 0>;
- defm f : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 1>;
- }
-}
-
-let hasSideEffects = 0, addrMode = BaseImmOffset,
- InputType = "imm" in {
- let accessSize = ByteAccess in
- defm S4_storeirb : ST_Imm<"memb", "STrib", u6_0Imm, 0b00>;
-
- let accessSize = HalfWordAccess in
- defm S4_storeirh : ST_Imm<"memh", "STrih", u6_1Imm, 0b01>;
-
- let accessSize = WordAccess in
- defm S4_storeiri : ST_Imm<"memw", "STriw", u6_2Imm, 0b10>;
-}
-
-//===----------------------------------------------------------------------===//
-// ST -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// NV/ST +
-//===----------------------------------------------------------------------===//
-
-let opNewValue = 2, opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
-class T_store_io_nv <string mnemonic, RegisterClass RC,
- Operand ImmOp, bits<2>MajOp>
- : NVInst_V4 <(outs),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1+#$src2) = $src3.new",
- [],"",ST_tc_st_SLOT0> {
- bits<5> src1;
- bits<13> src2; // Actual address offset
- bits<3> src3;
- bits<11> offsetBits; // Represents offset encoding
-
- let opExtentBits = !if (!eq(mnemonic, "memb"), 11,
- !if (!eq(mnemonic, "memh"), 12,
- !if (!eq(mnemonic, "memw"), 13, 0)));
-
- let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
- !if (!eq(mnemonic, "memh"), 1,
- !if (!eq(mnemonic, "memw"), 2, 0)));
-
- let offsetBits = !if (!eq(mnemonic, "memb"), src2{10-0},
- !if (!eq(mnemonic, "memh"), src2{11-1},
- !if (!eq(mnemonic, "memw"), src2{12-2}, 0)));
-
- let IClass = 0b1010;
-
- let Inst{27} = 0b0;
- let Inst{26-25} = offsetBits{10-9};
- let Inst{24-21} = 0b1101;
- let Inst{20-16} = src1;
- let Inst{13} = offsetBits{8};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src3;
- let Inst{7-0} = offsetBits{7-0};
- }
-
-let opExtendable = 2, opNewValue = 3, isPredicated = 1 in
-class T_pstore_io_nv <string mnemonic, RegisterClass RC, Operand predImmOp,
- bits<2>MajOp, bit PredNot, bit isPredNew>
- : NVInst_V4 <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC:$src4),
- !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2+#$src3) = $src4.new",
- [],"",V2LDST_tc_st_SLOT0> {
- bits<2> src1;
- bits<5> src2;
- bits<9> src3;
- bits<3> src4;
- bits<6> offsetBits; // Represents offset encoding
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = PredNot;
- let opExtentBits = !if (!eq(mnemonic, "memb"), 6,
- !if (!eq(mnemonic, "memh"), 7,
- !if (!eq(mnemonic, "memw"), 8, 0)));
-
- let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
- !if (!eq(mnemonic, "memh"), 1,
- !if (!eq(mnemonic, "memw"), 2, 0)));
-
- let offsetBits = !if (!eq(mnemonic, "memb"), src3{5-0},
- !if (!eq(mnemonic, "memh"), src3{6-1},
- !if (!eq(mnemonic, "memw"), src3{7-2}, 0)));
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b0;
- let Inst{26} = PredNot;
- let Inst{25} = isPredNew;
- let Inst{24-21} = 0b0101;
- let Inst{20-16} = src2;
- let Inst{13} = offsetBits{5};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src4;
- let Inst{7-3} = offsetBits{4-0};
- let Inst{2} = 0b0;
- let Inst{1-0} = src1;
- }
-
-// multiclass for new-value store instructions with base + immediate offset.
-//
-let mayStore = 1, isNVStore = 1, isNewValue = 1, hasSideEffects = 0,
- isExtendable = 1 in
-multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, Operand predImmOp, bits<2> MajOp> {
-
- let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
- def S2_#NAME#new_io : T_store_io_nv <mnemonic, RC, ImmOp, MajOp>;
- // Predicated
- def S2_p#NAME#newt_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 0, 0>;
- def S2_p#NAME#newf_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 1, 0>;
- // Predicated new
- def S4_p#NAME#newtnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
- MajOp, 0, 1>;
- def S4_p#NAME#newfnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
- MajOp, 1, 1>;
- }
-}
-
-let addrMode = BaseImmOffset, InputType = "imm" in {
- let accessSize = ByteAccess in
- defm storerb: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
- u6_0Ext, 0b00>, AddrModeRel;
-
- let accessSize = HalfWordAccess, opExtentAlign = 1 in
- defm storerh: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
- u6_1Ext, 0b01>, AddrModeRel;
-
- let accessSize = WordAccess, opExtentAlign = 2 in
- defm storeri: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
- u6_2Ext, 0b10>, AddrModeRel;
-}
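-// Illustrative expansion (a sketch, not extra definitions; names follow from
-// the #NAME# pastes in ST_Idxd_nv): defm storerb yields S2_storerbnew_io and
-// the predicated S2_pstorerbnewt_io, S2_pstorerbnewf_io,
-// S4_pstorerbnewtnew_io and S4_pstorerbnewfnew_io; storerh and storeri
-// expand analogously for memh/memw.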
-
-//===----------------------------------------------------------------------===//
-// Post increment loads with register offset.
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1 in
-def L2_loadbsw2_pr : T_load_pr <"membh", IntRegs, 0b0001, HalfWordAccess>;
-
-def L2_loadbsw4_pr : T_load_pr <"membh", DoubleRegs, 0b0111, WordAccess>;
-
-let hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pr <string mnemonic, bits<4> MajOp, MemAccessSize AccessSz>
- : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$_dst_),
- (ins DoubleRegs:$src1, IntRegs:$src2, ModRegs:$src3),
- "$dst = "#mnemonic#"($src2++$src3)", [],
- "$src1 = $dst, $src2 = $_dst_"> {
- bits<5> dst;
- bits<5> src2;
- bits<1> src3;
-
- let accessSize = AccessSz;
- let IClass = 0b1001;
-
- let Inst{27-25} = 0b110;
- let Inst{24-21} = MajOp;
- let Inst{20-16} = src2;
- let Inst{13} = src3;
- let Inst{12} = 0b0;
- let Inst{7} = 0b0;
- let Inst{4-0} = dst;
- }
-
-def L2_loadalignb_pr : T_loadalign_pr <"memb_fifo", 0b0100, ByteAccess>;
-def L2_loadalignh_pr : T_loadalign_pr <"memh_fifo", 0b0010, HalfWordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment .new stores
-// mem[bhwd](Rx++#s4:[0123])=Nt.new
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
- isNewValue = 1, opNewValue = 3 in
-class T_StorePI_nv <string mnemonic, Operand ImmOp, bits<2> MajOp >
- : NVInstPI_V4 <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$offset, IntRegs:$src2),
- mnemonic#"($src1++#$offset) = $src2.new",
- [], "$src1 = $_dst_">,
- AddrModeRel {
- bits<5> src1;
- bits<3> src2;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0}));
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = src1;
- let Inst{13} = 0b0;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src2;
- let Inst{7} = 0b0;
- let Inst{6-3} = offsetBits;
- let Inst{1} = 0b0;
- }
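-// Worked example of the offsetBits scaling above (illustrative): for the
-// halfword form (s4_1Imm) the offset must be even, and an offset of #6
-// (0b0000110) is encoded as offset{4-1} = 0b0011, i.e. offset >> 1.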
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment .new stores
-// if([!]Pv[.new]) mem[bhwd](Rx++#s4:[0123])=Nt.new
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
- isNewValue = 1, opNewValue = 4 in
-class T_StorePI_nv_pred <string mnemonic, Operand ImmOp,
- bits<2> MajOp, bit isPredNot, bit isPredNew >
- : NVInstPI_V4 <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2,
- ImmOp:$offset, IntRegs:$src3),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#mnemonic#"($src2++#$offset) = $src3.new",
- [], "$src2 = $_dst_">,
- AddrModeRel {
- bits<2> src1;
- bits<5> src2;
- bits<3> src3;
- bits<7> offset;
- bits<4> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
- !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
- /* s4_0Imm */ offset{3-0}));
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = src2;
- let Inst{13} = 0b1;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src3;
- let Inst{7} = isPredNew;
- let Inst{6-3} = offsetBits;
- let Inst{2} = isPredNot;
- let Inst{1-0} = src1;
- }
-
-multiclass ST_PostInc_Pred_nv<string mnemonic, Operand ImmOp,
- bits<2> MajOp, bit PredNot> {
- def _pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 0>;
-
- // Predicated new
- def new_pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 1>;
-}
-
-multiclass ST_PostInc_nv<string mnemonic, string BaseOp, Operand ImmOp,
- bits<2> MajOp> {
- let BaseOpcode = "POST_"#BaseOp in {
- def S2_#NAME#_pi : T_StorePI_nv <mnemonic, ImmOp, MajOp>;
-
- // Predicated
- defm S2_p#NAME#t : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 0>;
- defm S2_p#NAME#f : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 1>;
- }
-}
-
-let accessSize = ByteAccess in
-defm storerbnew: ST_PostInc_nv <"memb", "STrib", s4_0Imm, 0b00>;
-
-let accessSize = HalfWordAccess in
-defm storerhnew: ST_PostInc_nv <"memh", "STrih", s4_1Imm, 0b01>;
-
-let accessSize = WordAccess in
-defm storerinew: ST_PostInc_nv <"memw", "STriw", s4_2Imm, 0b10>;
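-// Illustrative expansion of the defms above (a sketch; names follow from the
-// S2_#NAME# pastes in ST_PostInc_nv, they are not extra definitions):
-//   def S2_storerbnew_pi      : T_StorePI_nv      <"memb", s4_0Imm, 0b00>;
-//   def S2_pstorerbnewt_pi    : T_StorePI_nv_pred <"memb", s4_0Imm, 0b00, 0, 0>;
-//   def S2_pstorerbnewtnew_pi : T_StorePI_nv_pred <"memb", s4_0Imm, 0b00, 0, 1>;
-// and likewise the f/fnew forms and the memh/memw variants.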
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment .new stores with register offset
-//===----------------------------------------------------------------------===//
-let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
-class T_StorePI_RegNV <string mnemonic, bits<2> MajOp, MemAccessSize AccessSz>
- : NVInstPI_V4 <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, IntRegs:$src3),
- mnemonic#"($src1++$src2) = $src3.new",
- [], "$src1 = $_dst_"> {
- bits<5> src1;
- bits<1> src2;
- bits<3> src3;
- let accessSize = AccessSz;
-
- let IClass = 0b1010;
-
- let Inst{27-21} = 0b1101101;
- let Inst{20-16} = src1;
- let Inst{13} = src2;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src3;
- let Inst{7} = 0b0;
- }
-
-def S2_storerbnew_pr : T_StorePI_RegNV<"memb", 0b00, ByteAccess>;
-def S2_storerhnew_pr : T_StorePI_RegNV<"memh", 0b01, HalfWordAccess>;
-def S2_storerinew_pr : T_StorePI_RegNV<"memw", 0b10, WordAccess>;
-
-// memb(Rx++#s4:0:circ(Mu))=Nt.new
-// memb(Rx++I:circ(Mu))=Nt.new
-// memb(Rx++Mu:brev)=Nt.new
-// memh(Rx++#s4:1:circ(Mu))=Nt.new
-// memh(Rx++I:circ(Mu))=Nt.new
-// memh(Rx++Mu)=Nt.new
-// memh(Rx++Mu:brev)=Nt.new
-
-// memw(Rx++#s4:2:circ(Mu))=Nt.new
-// memw(Rx++I:circ(Mu))=Nt.new
-// memw(Rx++Mu)=Nt.new
-// memw(Rx++Mu:brev)=Nt.new
-
-//===----------------------------------------------------------------------===//
-// NV/ST -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// NV/J +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// multiclass/template class for the new-value compare jumps with register
-// operands.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
- opExtentAlign = 2 in
-class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
- bit isNegCond, bit isTak>
- : NVInst_V4<(outs),
- (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
- "if ("#!if(isNegCond, "!","")#mnemonic#
- "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
- "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
- #!if(isTak, "t","nt")#" $offset", []> {
-
- bits<5> src1;
- bits<5> src2;
- bits<3> Ns; // New-Value Operand
- bits<5> RegOp; // Non-New-Value Operand
- bits<11> offset;
-
- let isTaken = isTak;
- let isPredicatedFalse = isNegCond;
- let opNewValue{0} = NvOpNum;
-
- let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
- let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
-
- let IClass = 0b0010;
- let Inst{27-26} = 0b00;
- let Inst{25-23} = majOp;
- let Inst{22} = isNegCond;
- let Inst{18-16} = Ns;
- let Inst{13} = isTak;
- let Inst{12-8} = RegOp;
- let Inst{21-20} = offset{10-9};
- let Inst{7-1} = offset{8-2};
-}
-
-
-multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
- bit isNegCond> {
- // Branch not taken:
- def _nt: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
- // Branch taken:
- def _t : NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
-}
-
-// NvOpNum = 0 -> First Operand is a new-value Register
-// NvOpNum = 1 -> Second Operand is a new-value Register
-
-multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
- bit NvOpNum> {
- let BaseOpcode = BaseOp#_NVJ in {
- defm _t_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
- defm _f_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
- }
-}
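-// For example (illustrative, per the template above): J4_cmplt below is
-// defined with mnemonic "cmp.gt" and NvOpNum = 1, so the second operand is
-// the new-value register and the taken, true-condition form prints as
-// "if (cmp.gt($src1, $src2.new)) jump:t $offset".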
-
-// if ([!]cmp.eq(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Rt,Ns.new)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
- Defs = [PC], hasSideEffects = 0 in {
- defm J4_cmpeq : NVJrr_base<"cmp.eq", "CMPEQ", 0b000, 0>, PredRel;
- defm J4_cmpgt : NVJrr_base<"cmp.gt", "CMPGT", 0b001, 0>, PredRel;
- defm J4_cmpgtu : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
- defm J4_cmplt : NVJrr_base<"cmp.gt", "CMPLT", 0b011, 1>, PredRel;
- defm J4_cmpltu : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass/template class for the new-value compare jump instructions
-// with a register and an unsigned immediate (U5) operand.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
- opExtentAlign = 2 in
-class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
- bit isTak>
- : NVInst_V4<(outs),
- (ins IntRegs:$src1, u5_0Imm:$src2, brtarget:$offset),
- "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
- #!if(isTak, "t","nt")#" $offset", []> {
-
- let isTaken = isTak;
- let isPredicatedFalse = isNegCond;
-
- bits<3> src1;
- bits<5> src2;
- bits<11> offset;
-
- let IClass = 0b0010;
- let Inst{26} = 0b1;
- let Inst{25-23} = majOp;
- let Inst{22} = isNegCond;
- let Inst{18-16} = src1;
- let Inst{13} = isTak;
- let Inst{12-8} = src2;
- let Inst{21-20} = offset{10-9};
- let Inst{7-1} = offset{8-2};
-}
-
-multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
- // Branch not taken:
- def _nt: NVJri_template<mnemonic, majOp, isNegCond, 0>;
- // Branch taken:
- def _t : NVJri_template<mnemonic, majOp, isNegCond, 1>;
-}
-
-multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
- let BaseOpcode = BaseOp#_NVJri in {
- defm _t_jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
- defm _f_jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
- }
-}
-
-// if ([!]cmp.eq(Ns.new,#U5)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,#U5)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
- Defs = [PC], hasSideEffects = 0 in {
- defm J4_cmpeqi : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
- defm J4_cmpgti : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
- defm J4_cmpgtui : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass/template class for the new-value compare jump instructions
-// with a register and a hardcoded 0/-1 immediate value.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 11,
- opExtentAlign = 2 in
-class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
- bit isNegCond, bit isTak>
- : NVInst_V4<(outs),
- !if(!eq(ImmVal, "{-1}"),
- (ins IntRegs:$src1, n1Const:$n1, brtarget:$offset),
- (ins IntRegs:$src1, brtarget:$offset)),
- "if ("#!if(isNegCond, "!","")#mnemonic
- #"($src1.new, #" # !if(!eq(ImmVal, "{-1}"), "$n1", ImmVal) # ")) jump:"
- #!if(isTak, "t","nt")#" $offset", []> {
-
- let isTaken = isTak;
- let isPredicatedFalse = isNegCond;
- let opExtendable = !if(!eq(ImmVal, "{-1}"), 2, 1);
-
- bits<3> src1;
- bits<11> offset;
- let IClass = 0b0010;
- let Inst{26} = 0b1;
- let Inst{25-23} = majOp;
- let Inst{22} = isNegCond;
- let Inst{18-16} = src1;
- let Inst{13} = isTak;
- let Inst{21-20} = offset{10-9};
- let Inst{7-1} = offset{8-2};
-}
-
-multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
- bit isNegCond> {
- // Branch not taken:
- def _nt: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
- // Branch taken:
- def _t : NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
-}
-
-multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
- string ImmVal> {
- let BaseOpcode = BaseOp#_NVJ_ConstImm in {
- defm _t_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True
- defm _f_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False
- }
-}
-
-// if ([!]tstbit(Ns.new,#0)) jump:[n]t #r9:2
-// if ([!]cmp.eq(Ns.new,#-1)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
- Defs = [PC], hasSideEffects = 0 in {
- defm J4_tstbit0 : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
- defm J4_cmpeqn1 : NVJ_ConstImm_base<"cmp.eq", "CMPEQ", 0b100, "{-1}">, PredRel;
- defm J4_cmpgtn1 : NVJ_ConstImm_base<"cmp.gt", "CMPGT", 0b101, "{-1}">, PredRel;
-}
-
-// J4_hintjumpr: Hint indirect conditional jump.
-let isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def J4_hintjumpr: JRInst <
- (outs),
- (ins IntRegs:$Rs),
- "hintjr($Rs)"> {
- bits<5> Rs;
- let IClass = 0b0101;
- let Inst{27-21} = 0b0010101;
- let Inst{20-16} = Rs;
- }
-
-//===----------------------------------------------------------------------===//
-// NV/J -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// CR +
-//===----------------------------------------------------------------------===//
-
-// PC-relative add
-let hasNewValue = 1, isExtendable = 1, opExtendable = 1,
- isExtentSigned = 0, opExtentBits = 6, hasSideEffects = 0, Uses = [PC] in
-def C4_addipc : CRInst <(outs IntRegs:$Rd), (ins u6_0Ext:$u6),
- "$Rd = add(pc, #$u6)", [], "", CR_tc_2_SLOT3 > {
- bits<5> Rd;
- bits<6> u6;
-
- let IClass = 0b0110;
- let Inst{27-16} = 0b101001001001;
- let Inst{12-7} = u6;
- let Inst{4-0} = Rd;
- }
-
-
-
-let hasSideEffects = 0 in
-class T_LOGICAL_3OP<string MnOp1, string MnOp2, bits<2> OpBits, bit IsNeg>
- : CRInst<(outs PredRegs:$Pd),
- (ins PredRegs:$Ps, PredRegs:$Pt, PredRegs:$Pu),
- "$Pd = " # MnOp1 # "($Ps, " # MnOp2 # "($Pt, " #
- !if (IsNeg,"!","") # "$Pu))",
- [], "", CR_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<2> Ps;
- bits<2> Pt;
- bits<2> Pu;
-
- let IClass = 0b0110;
- let Inst{27-24} = 0b1011;
- let Inst{23} = IsNeg;
- let Inst{22-21} = OpBits;
- let Inst{20} = 0b1;
- let Inst{17-16} = Ps;
- let Inst{13} = 0b0;
- let Inst{9-8} = Pt;
- let Inst{7-6} = Pu;
- let Inst{1-0} = Pd;
-}
-
-def C4_and_and : T_LOGICAL_3OP<"and", "and", 0b00, 0>;
-def C4_and_or : T_LOGICAL_3OP<"and", "or", 0b01, 0>;
-def C4_or_and : T_LOGICAL_3OP<"or", "and", 0b10, 0>;
-def C4_or_or : T_LOGICAL_3OP<"or", "or", 0b11, 0>;
-def C4_and_andn : T_LOGICAL_3OP<"and", "and", 0b00, 1>;
-def C4_and_orn : T_LOGICAL_3OP<"and", "or", 0b01, 1>;
-def C4_or_andn : T_LOGICAL_3OP<"or", "and", 0b10, 1>;
-def C4_or_orn : T_LOGICAL_3OP<"or", "or", 0b11, 1>;
-
-//===----------------------------------------------------------------------===//
-// CR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/ALU +
-//===----------------------------------------------------------------------===//
-
-// Logical with-not instructions.
-def A4_andnp : T_ALU64_logical<"and", 0b001, 1, 0, 1>;
-def A4_ornp : T_ALU64_logical<"or", 0b011, 1, 0, 1>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def S4_parity: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0101111;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-// Add and accumulate.
-// Rd=add(Rs,add(Ru,#s6))
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 6,
- opExtendable = 3 in
-def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Ru, s6_0Ext:$s6),
- "$Rd = add($Rs, add($Ru, #$s6))" , [],
- "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Ru;
- bits<6> s6;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b10110;
- let Inst{22-21} = s6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = s6{3};
- let Inst{12-8} = Rd;
- let Inst{7-5} = s6{2-0};
- let Inst{4-0} = Ru;
- }
-
-let isExtentSigned = 1, hasSideEffects = 0, hasNewValue = 1, isExtendable = 1,
- opExtentBits = 6, opExtendable = 2 in
-def S4_subaddi: ALU64Inst <(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, s6_0Ext:$s6, IntRegs:$Ru),
- "$Rd = add($Rs, sub(#$s6, $Ru))",
- [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<6> s6;
- bits<5> Ru;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b10111;
- let Inst{22-21} = s6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = s6{3};
- let Inst{12-8} = Rd;
- let Inst{7-5} = s6{2-0};
- let Inst{4-0} = Ru;
- }
-
-def S4_extractp_rp : T_S3op_64 < "extract", 0b11, 0b100, 0>;
-def S4_extractp : T_S2op_extract <"extract", 0b1010, DoubleRegs, u6_0Imm>;
-
-let hasNewValue = 1 in {
- def S4_extract_rp : T_S3op_extract<"extract", 0b01>;
- def S4_extract : T_S2op_extract <"extract", 0b1101, IntRegs, u5_0Imm>;
-}
-
-// Complex add/sub halfwords/words
-let Defs = [USR_OVF] in {
- def S4_vxaddsubh : T_S3op_64 < "vxaddsubh", 0b01, 0b100, 0, 1>;
- def S4_vxaddsubw : T_S3op_64 < "vxaddsubw", 0b01, 0b000, 0, 1>;
- def S4_vxsubaddh : T_S3op_64 < "vxsubaddh", 0b01, 0b110, 0, 1>;
- def S4_vxsubaddw : T_S3op_64 < "vxsubaddw", 0b01, 0b010, 0, 1>;
-}
-
-let Defs = [USR_OVF] in {
- def S4_vxaddsubhr : T_S3op_64 < "vxaddsubh", 0b11, 0b000, 0, 1, 1, 1>;
- def S4_vxsubaddhr : T_S3op_64 < "vxsubaddh", 0b11, 0b010, 0, 1, 1, 1>;
-}
-
-let Itinerary = M_tc_3x_SLOT23, Defs = [USR_OVF] in {
- def M4_mac_up_s1_sat: T_MType_acc_rr<"+= mpy", 0b011, 0b000, 0, [], 0, 1, 1>;
- def M4_nac_up_s1_sat: T_MType_acc_rr<"-= mpy", 0b011, 0b001, 0, [], 0, 1, 1>;
-}
-
-// Logical xor with xor accumulation.
-// Rxx^=xor(Rss,Rtt)
-let hasSideEffects = 0 in
-def M4_xor_xacc
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Rxx ^= xor($Rss, $Rtt)", [],
- "$dst2 = $Rxx", S_3op_tc_1_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b101010;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rxx;
- }
-
-// Rotate and reduce bytes
-// Rdd=vrcrotate(Rss,Rt,#u2)
-let hasSideEffects = 0 in
-def S4_vrcrotate
- : SInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
- "$Rdd = vrcrotate($Rss, $Rt, #$u2)",
- [], "", S_3op_tc_3x_SLOT23> {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rt;
- bits<2> u2;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b001111;
- let Inst{20-16} = Rss;
- let Inst{13} = u2{1};
- let Inst{12-8} = Rt;
- let Inst{7-6} = 0b11;
- let Inst{5} = u2{0};
- let Inst{4-0} = Rdd;
- }
-
-// Rotate and reduce bytes with accumulation
-// Rxx+=vrcrotate(Rss,Rt,#u2)
-let hasSideEffects = 0 in
-def S4_vrcrotate_acc
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
- "$Rxx += vrcrotate($Rss, $Rt, #$u2)", [],
- "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rt;
- bits<2> u2;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = Rss;
- let Inst{13} = u2{1};
- let Inst{12-8} = Rt;
- let Inst{5} = u2{0};
- let Inst{4-0} = Rxx;
- }
-
-// Vector reduce conditional negate halfwords
-let hasSideEffects = 0 in
-def S2_vrcnegh
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rxx += vrcnegh($Rss, $Rt)", [],
- "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b1011001;
- let Inst{20-16} = Rss;
- let Inst{13} = 0b1;
- let Inst{12-8} = Rt;
- let Inst{7-5} = 0b111;
- let Inst{4-0} = Rxx;
- }
-
-// Split bitfield
-def A4_bitspliti : T_S2op_2_di <"bitsplit", 0b110, 0b100>;
-
-// Arithmetic/Convergent round
-def A4_cround_ri : T_S2op_2_ii <"cround", 0b111, 0b000>;
-
-def A4_round_ri : T_S2op_2_ii <"round", 0b111, 0b100>;
-
-let Defs = [USR_OVF] in
-def A4_round_ri_sat : T_S2op_2_ii <"round", 0b111, 0b110, 1>;
-
-// Logical-logical words.
-// Compound or-and -- Rx=or(Ru,and(Rx,#s10))
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 10,
- opExtendable = 3 in
-def S4_or_andix:
- ALU64Inst<(outs IntRegs:$Rx),
- (ins IntRegs:$Ru, IntRegs:$_src_, s10_0Ext:$s10),
- "$Rx = or($Ru, and($_src_, #$s10))" , [] ,
- "$_src_ = $Rx", ALU64_tc_2_SLOT23> {
- bits<5> Rx;
- bits<5> Ru;
- bits<10> s10;
-
- let IClass = 0b1101;
-
- let Inst{27-22} = 0b101001;
- let Inst{20-16} = Rx;
- let Inst{21} = s10{9};
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Ru;
- }
-
-// Miscellaneous ALU64 instructions.
-//
-let hasNewValue = 1, hasSideEffects = 0 in
-def A4_modwrapu: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = modwrap($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0011111;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7-5} = 0b111;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def A4_bitsplit: ALU64Inst<(outs DoubleRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = bitsplit($Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0100;
- let Inst{21} = 0b1;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def dep_S2_packhl: ALU64Inst<(outs DoubleRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = packhl($Rs, $Rt):deprecated", [], "", ALU64_tc_1_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b0100;
- let Inst{21} = 0b0;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def dep_A2_addsat: ALU64Inst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = add($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0101100;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b0;
- let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def dep_A2_subsat: ALU64Inst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = sub($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b0101100;
- let Inst{20-16} = Rt;
- let Inst{12-8} = Rs;
- let Inst{7} = 0b1;
- let Inst{4-0} = Rd;
-}
-
-// Rx[&|]=xor(Rs,Rt)
-def M4_or_xor : T_MType_acc_rr < "|= xor", 0b110, 0b001, 0>;
-def M4_and_xor : T_MType_acc_rr < "&= xor", 0b010, 0b010, 0>;
-
-// Rx[&|^]=or(Rs,Rt)
-def M4_xor_or : T_MType_acc_rr < "^= or", 0b110, 0b011, 0>;
-
-let CextOpcode = "ORr_ORr" in
-def M4_or_or : T_MType_acc_rr < "|= or", 0b110, 0b000, 0>;
-def M4_and_or : T_MType_acc_rr < "&= or", 0b010, 0b001, 0>;
-
-// Rx[&|^]=and(Rs,Rt)
-def M4_xor_and : T_MType_acc_rr < "^= and", 0b110, 0b010, 0>;
-
-let CextOpcode = "ORr_ANDr" in
-def M4_or_and : T_MType_acc_rr < "|= and", 0b010, 0b011, 0>;
-def M4_and_and : T_MType_acc_rr < "&= and", 0b010, 0b000, 0>;
-
-// Rx[&|^]=and(Rs,~Rt)
-def M4_xor_andn : T_MType_acc_rr < "^= and", 0b001, 0b010, 0, [], 1>;
-def M4_or_andn : T_MType_acc_rr < "|= and", 0b001, 0b000, 0, [], 1>;
-def M4_and_andn : T_MType_acc_rr < "&= and", 0b001, 0b001, 0, [], 1>;
-
-// Compound or-or and or-and
-let isExtentSigned = 1, InputType = "imm", hasNewValue = 1, isExtendable = 1,
- opExtentBits = 10, opExtendable = 3 in
-class T_CompOR <string mnemonic, bits<2> MajOp, SDNode OpNode>
- : MInst_acc <(outs IntRegs:$Rx),
- (ins IntRegs:$src1, IntRegs:$Rs, s10_0Ext:$s10),
- "$Rx |= "#mnemonic#"($Rs, #$s10)", [],
- "$src1 = $Rx", ALU64_tc_2_SLOT23>, ImmRegRel {
- bits<5> Rx;
- bits<5> Rs;
- bits<10> s10;
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b1010;
- let Inst{23-22} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{21} = s10{9};
- let Inst{13-5} = s10{8-0};
- let Inst{4-0} = Rx;
- }
-
-let CextOpcode = "ORr_ANDr" in
-def S4_or_andi : T_CompOR <"and", 0b00, and>;
-
-let CextOpcode = "ORr_ORr" in
-def S4_or_ori : T_CompOR <"or", 0b10, or>;
-
-// Modulo wrap
-// Rd=modwrap(Rs,Rt)
-// Round
-// Rd=cround(Rs,#u5)
-// Rd=cround(Rs,Rt)
-// Rd=round(Rs,#u5)[:sat]
-// Rd=round(Rs,Rt)[:sat]
-// Vector reduce add unsigned halfwords
-// Rd=vraddh(Rss,Rtt)
-// Vector add bytes
-// Rdd=vaddb(Rss,Rtt)
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-// Rxx+=vrcnegh(Rss,Rt)
-// Vector maximum bytes
-// Rdd=vmaxb(Rtt,Rss)
-// Vector reduce maximum halfwords
-// Rxx=vrmaxh(Rss,Ru)
-// Rxx=vrmaxuh(Rss,Ru)
-// Vector reduce maximum words
-// Rxx=vrmaxuw(Rss,Ru)
-// Rxx=vrmaxw(Rss,Ru)
-// Vector minimum bytes
-// Rdd=vminb(Rtt,Rss)
-// Vector reduce minimum halfwords
-// Rxx=vrminh(Rss,Ru)
-// Rxx=vrminuh(Rss,Ru)
-// Vector reduce minimum words
-// Rxx=vrminuw(Rss,Ru)
-// Rxx=vrminw(Rss,Ru)
-// Vector subtract bytes
-// Rdd=vsubb(Rss,Rtt)
-
-//===----------------------------------------------------------------------===//
-// XTYPE/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/BIT +
-//===----------------------------------------------------------------------===//
-
-// Bit reverse
-def S2_brevp : T_S2op_3 <"brev", 0b11, 0b110>;
-
-// Bit count
-def S2_ct0p : T_COUNT_LEADING_64<"ct0", 0b111, 0b010>;
-def S2_ct1p : T_COUNT_LEADING_64<"ct1", 0b111, 0b100>;
-def S4_clbpnorm : T_COUNT_LEADING_64<"normamt", 0b011, 0b000>;
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S4_clbaddi : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s6_0Imm:$s6),
- "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
- bits<5> Rs;
- bits<5> Rd;
- bits<6> s6;
- let IClass = 0b1000;
- let Inst{27-24} = 0b1100;
- let Inst{23-21} = 0b001;
- let Inst{20-16} = Rs;
- let Inst{13-8} = s6;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S4_clbpaddi : SInst<(outs IntRegs:$Rd), (ins DoubleRegs:$Rs, s6_0Imm:$s6),
- "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
- bits<5> Rs;
- bits<5> Rd;
- bits<6> s6;
- let IClass = 0b1000;
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = 0b011;
- let Inst{20-16} = Rs;
- let Inst{13-8} = s6;
- let Inst{7-5} = 0b010;
- let Inst{4-0} = Rd;
-}
-
-
-// Bit test/set/clear
-def S4_ntstbit_i : T_TEST_BIT_IMM<"!tstbit", 0b001>;
-def S4_ntstbit_r : T_TEST_BIT_REG<"!tstbit", 1>;
-
-def C4_nbitsset : T_TEST_BITS_REG<"!bitsset", 0b01, 1>;
-def C4_nbitsclr : T_TEST_BITS_REG<"!bitsclr", 0b10, 1>;
-def C4_nbitsclri : T_TEST_BITS_IMM<"!bitsclr", 0b10, 1>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY +
-//===----------------------------------------------------------------------===//
-
-// Rd=add(#u6,mpyi(Rs,#U6)) -- Multiply by immediate and add immediate.
-
-let hasNewValue = 1, isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
-def M4_mpyri_addi : MInst<(outs IntRegs:$Rd),
- (ins u6_0Ext:$u6, IntRegs:$Rs, u6_0Imm:$U6),
- "$Rd = add(#$u6, mpyi($Rs, #$U6))" , [],"",ALU64_tc_3x_SLOT23> {
- bits<5> Rd;
- bits<6> u6;
- bits<5> Rs;
- bits<6> U6;
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b1000;
- let Inst{23} = U6{5};
- let Inst{22-21} = u6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = u6{3};
- let Inst{12-8} = Rd;
- let Inst{7-5} = u6{2-0};
- let Inst{4-0} = U6{4-0};
- }
-
-// Rd=add(#u6,mpyi(Rs,Rt))
-let CextOpcode = "ADD_MPY", InputType = "imm", hasNewValue = 1,
- isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
-def M4_mpyrr_addi : MInst <(outs IntRegs:$Rd),
- (ins u6_0Ext:$u6, IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = add(#$u6, mpyi($Rs, $Rt))" , [], "", ALU64_tc_3x_SLOT23>, ImmRegRel {
- bits<5> Rd;
- bits<6> u6;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b01110;
- let Inst{22-21} = u6{5-4};
- let Inst{20-16} = Rs;
- let Inst{13} = u6{3};
- let Inst{12-8} = Rt;
- let Inst{7-5} = u6{2-0};
- let Inst{4-0} = Rd;
- }
-
-let hasNewValue = 1 in
-class T_AddMpy <bit MajOp, PatLeaf ImmPred, dag ins>
- : ALU64Inst <(outs IntRegs:$dst), ins,
- "$dst = add($src1, mpyi("#!if(MajOp,"$src3, #$src2))",
- "#$src2, $src3))"), [],
- "", ALU64_tc_3x_SLOT23> {
- bits<5> dst;
- bits<5> src1;
- bits<8> src2;
- bits<5> src3;
-
- let IClass = 0b1101;
-
- bits<6> ImmValue = !if(MajOp, src2{5-0}, src2{7-2});
-
- let Inst{27-24} = 0b1111;
- let Inst{23} = MajOp;
- let Inst{22-21} = ImmValue{5-4};
- let Inst{20-16} = src3;
- let Inst{13} = ImmValue{3};
- let Inst{12-8} = dst;
- let Inst{7-5} = ImmValue{2-0};
- let Inst{4-0} = src1;
- }
-
-def M4_mpyri_addr_u2 : T_AddMpy<0b0, u6_2ImmPred,
- (ins IntRegs:$src1, u6_2Imm:$src2, IntRegs:$src3)>;
-
-let isExtendable = 1, opExtentBits = 6, opExtendable = 3,
- CextOpcode = "ADD_MPY", InputType = "imm" in
-def M4_mpyri_addr : T_AddMpy<0b1, u32_0ImmPred,
- (ins IntRegs:$src1, IntRegs:$src3, u6_0Ext:$src2)>, ImmRegRel;
-
-// Rx=add(Ru,mpyi(Rx,Rs))
-let CextOpcode = "ADD_MPY", InputType = "reg", hasNewValue = 1 in
-def M4_mpyrr_addr: MInst_acc <(outs IntRegs:$Rx),
- (ins IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs),
- "$Rx = add($Ru, mpyi($_src_, $Rs))", [],
- "$_src_ = $Rx", M_tc_3x_SLOT23>, ImmRegRel {
- bits<5> Rx;
- bits<5> Ru;
- bits<5> Rs;
-
- let IClass = 0b1110;
-
- let Inst{27-21} = 0b0011000;
- let Inst{12-8} = Rx;
- let Inst{4-0} = Ru;
- let Inst{20-16} = Rs;
- }
-
-
-// Vector reduce multiply word by signed half (32x16)
-//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-def M4_vrmpyeh_s0 : T_M2_vmpy<"vrmpyweh", 0b010, 0b100, 0, 0, 0>;
-def M4_vrmpyeh_s1 : T_M2_vmpy<"vrmpyweh", 0b110, 0b100, 1, 0, 0>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def M4_vrmpyoh_s0 : T_M2_vmpy<"vrmpywoh", 0b001, 0b010, 0, 0, 0>;
-def M4_vrmpyoh_s1 : T_M2_vmpy<"vrmpywoh", 0b101, 0b010, 1, 0, 0>;
-
-//Rxx+=vrmpyweh(Rss,Rtt)[:<<1]
-def M4_vrmpyeh_acc_s0: T_M2_vmpy_acc<"vrmpyweh", 0b001, 0b110, 0, 0>;
-def M4_vrmpyeh_acc_s1: T_M2_vmpy_acc<"vrmpyweh", 0b101, 0b110, 1, 0>;
-
-//Rxx+=vrmpywoh(Rss,Rtt)[:<<1]
-def M4_vrmpyoh_acc_s0: T_M2_vmpy_acc<"vrmpywoh", 0b011, 0b110, 0, 0>;
-def M4_vrmpyoh_acc_s1: T_M2_vmpy_acc<"vrmpywoh", 0b111, 0b110, 1, 0>;
-
-// Vector multiply halfwords, signed by unsigned
-// Rdd=vmpyhsu(Rs,Rt)[:<<1]:sat
-def M2_vmpy2su_s0 : T_XTYPE_mpy64 < "vmpyhsu", 0b000, 0b111, 1, 0, 0>;
-def M2_vmpy2su_s1 : T_XTYPE_mpy64 < "vmpyhsu", 0b100, 0b111, 1, 1, 0>;
-
-// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
-def M2_vmac2su_s0 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b011, 0b101, 1, 0, 0>;
-def M2_vmac2su_s1 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b111, 0b101, 1, 1, 0>;
-
-// Vector polynomial multiply halfwords
-// Rdd=vpmpyh(Rs,Rt)
-def M4_vpmpyh : T_XTYPE_mpy64 < "vpmpyh", 0b110, 0b111, 0, 0, 0>;
-
-// Rxx^=vpmpyh(Rs,Rt)
-def M4_vpmpyh_acc : T_XTYPE_mpy64_acc < "vpmpyh", "^", 0b101, 0b111, 0, 0, 0>;
-
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-def M4_pmpyw : T_XTYPE_mpy64 < "pmpyw", 0b010, 0b111, 0, 0, 0>;
-
-// Rxx^=pmpyw(Rs,Rt)
-def M4_pmpyw_acc : T_XTYPE_mpy64_acc < "pmpyw", "^", 0b001, 0b111, 0, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/Vector compare
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for vector compare
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0 in
-class T_vcmpImm <string Str, bits<2> cmpOp, bits<2> minOp, Operand ImmOprnd>
- : ALU64_rr <(outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, ImmOprnd:$Imm),
- "$Pd = "#Str#"($Rss, #$Imm)",
- [], "", ALU64_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rss;
- bits<32> Imm;
- bits<8> ImmBits;
- let ImmBits{6-0} = Imm{6-0};
- let ImmBits{7} = !if (!eq(cmpOp,0b10), 0b0, Imm{7}); // 0 for vcmp[bhw].gtu
-
- let IClass = 0b1101;
-
- let Inst{27-24} = 0b1100;
- let Inst{22-21} = cmpOp;
- let Inst{20-16} = Rss;
- let Inst{12-5} = ImmBits;
- let Inst{4-3} = minOp;
- let Inst{1-0} = Pd;
- }
-
-// Vector compare bytes
-def A4_vcmpbgt : T_vcmp <"vcmpb.gt", 0b1010>;
-
-let AsmString = "$Pd = any8(vcmpb.eq($Rss, $Rtt))" in
-def A4_vcmpbeq_any : T_vcmp <"any8(vcmpb.gt", 0b1000>;
-
-def A4_vcmpbeqi : T_vcmpImm <"vcmpb.eq", 0b00, 0b00, u8_0Imm>;
-def A4_vcmpbgti : T_vcmpImm <"vcmpb.gt", 0b01, 0b00, s8_0Imm>;
-def A4_vcmpbgtui : T_vcmpImm <"vcmpb.gtu", 0b10, 0b00, u7_0Imm>;
-
-// Vector compare halfwords
-def A4_vcmpheqi : T_vcmpImm <"vcmph.eq", 0b00, 0b01, s8_0Imm>;
-def A4_vcmphgti : T_vcmpImm <"vcmph.gt", 0b01, 0b01, s8_0Imm>;
-def A4_vcmphgtui : T_vcmpImm <"vcmph.gtu", 0b10, 0b01, u7_0Imm>;
-
-// Vector compare words
-def A4_vcmpweqi : T_vcmpImm <"vcmpw.eq", 0b00, 0b10, s8_0Imm>;
-def A4_vcmpwgti : T_vcmpImm <"vcmpw.gt", 0b01, 0b10, s8_0Imm>;
-def A4_vcmpwgtui : T_vcmpImm <"vcmpw.gtu", 0b10, 0b10, u7_0Imm>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/SHIFT +
-//===----------------------------------------------------------------------===//
-// Shift by immediate and accumulate/logical.
-// Rx=add(#u8,asl(Rx,#U5)) Rx=add(#u8,lsr(Rx,#U5))
-// Rx=sub(#u8,asl(Rx,#U5)) Rx=sub(#u8,lsr(Rx,#U5))
-// Rx=and(#u8,asl(Rx,#U5)) Rx=and(#u8,lsr(Rx,#U5))
-// Rx=or(#u8,asl(Rx,#U5)) Rx=or(#u8,lsr(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
- hasNewValue = 1, opNewValue = 0 in
-class T_S4_ShiftOperate<string MnOp, string MnSh, bit asl_lsr,
- bits<2> MajOp, InstrItinClass Itin>
- : MInst_acc<(outs IntRegs:$Rd), (ins u8_0Ext:$u8, IntRegs:$Rx, u5_0Imm:$U5),
- "$Rd = "#MnOp#"(#$u8, "#MnSh#"($Rx, #$U5))",
- [], "$Rd = $Rx", Itin> {
-
- bits<5> Rd;
- bits<8> u8;
- bits<5> Rx;
- bits<5> U5;
-
- let IClass = 0b1101;
- let Inst{27-24} = 0b1110;
- let Inst{23-21} = u8{7-5};
- let Inst{20-16} = Rd;
- let Inst{13} = u8{4};
- let Inst{12-8} = U5;
- let Inst{7-5} = u8{3-1};
- let Inst{4} = asl_lsr;
- let Inst{3} = u8{0};
- let Inst{2-1} = MajOp;
-}
-
-multiclass T_ShiftOperate<string mnemonic, bits<2> MajOp, InstrItinClass Itin> {
- def _asl_ri : T_S4_ShiftOperate<mnemonic, "asl", 0, MajOp, Itin>;
- def _lsr_ri : T_S4_ShiftOperate<mnemonic, "lsr", 1, MajOp, Itin>;
-}
-
-defm S4_addi : T_ShiftOperate<"add", 0b10, ALU64_tc_2_SLOT23>;
-defm S4_andi : T_ShiftOperate<"and", 0b00, ALU64_tc_2_SLOT23>;
-defm S4_ori : T_ShiftOperate<"or", 0b01, ALU64_tc_1_SLOT23>;
-defm S4_subi : T_ShiftOperate<"sub", 0b11, ALU64_tc_1_SLOT23>;
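-// Illustrative expansion (a sketch of what T_ShiftOperate produces): defm
-// S4_addi yields S4_addi_asl_ri and S4_addi_lsr_ri, printed as
-// "$Rd = add(#$u8, asl($Rx, #$U5))" and "$Rd = add(#$u8, lsr($Rx, #$U5))".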
-
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def S2_vcnegh : T_S3op_shiftVect < "vcnegh", 0b11, 0b01>;
-
-// Rd=[cround|round](Rs,Rt)
-let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in {
- def A4_cround_rr : T_S3op_3 < "cround", IntRegs, 0b11, 0b00>;
- def A4_round_rr : T_S3op_3 < "round", IntRegs, 0b11, 0b10>;
-}
-
-// Rd=round(Rs,Rt):sat
-let hasNewValue = 1, Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def A4_round_rr_sat : T_S3op_3 < "round", IntRegs, 0b11, 0b11, 1>;
-
-// Rd=[cmpyiwh|cmpyrwh](Rss,Rt):<<1:rnd:sat
-let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
- def M4_cmpyi_wh : T_S3op_8<"cmpyiwh", 0b100, 1, 1, 1>;
- def M4_cmpyr_wh : T_S3op_8<"cmpyrwh", 0b110, 1, 1, 1>;
-}
-
-// Rdd=[add|sub](Rss,Rtt,Px):carry
-let isPredicateLate = 1, hasSideEffects = 0 in
-class T_S3op_carry <string mnemonic, bits<3> MajOp>
- : SInst < (outs DoubleRegs:$Rdd, PredRegs:$Px),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
- "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu):carry",
- [], "$Px = $Pu", S_3op_tc_1_SLOT23 > {
- bits<5> Rdd;
- bits<5> Rss;
- bits<5> Rtt;
- bits<2> Pu;
-
- let IClass = 0b1100;
-
- let Inst{27-24} = 0b0010;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rdd;
- }
-
-def A4_addp_c : T_S3op_carry < "add", 0b110 >;
-def A4_subp_c : T_S3op_carry < "sub", 0b111 >;
-
-let Itinerary = S_3op_tc_3_SLOT23, hasSideEffects = 0 in
-class T_S3op_6 <string mnemonic, bits<3> MinOp, bit isUnsigned>
- : SInst <(outs DoubleRegs:$Rxx),
- (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Ru),
- "$Rxx = "#mnemonic#"($Rss, $Ru)" ,
- [] , "$dst2 = $Rxx"> {
- bits<5> Rxx;
- bits<5> Rss;
- bits<5> Ru;
-
- let IClass = 0b1100;
-
- let Inst{27-21} = 0b1011001;
- let Inst{20-16} = Rss;
- let Inst{13} = isUnsigned;
- let Inst{12-8} = Rxx;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Ru;
- }
-
-// Vector reduce maximum halfwords
-// Rxx=vrmax[u]h(Rss,Ru)
-def A4_vrmaxh : T_S3op_6 < "vrmaxh", 0b001, 0>;
-def A4_vrmaxuh : T_S3op_6 < "vrmaxuh", 0b001, 1>;
-
-// Vector reduce maximum words
-// Rxx=vrmax[u]w(Rss,Ru)
-def A4_vrmaxw : T_S3op_6 < "vrmaxw", 0b010, 0>;
-def A4_vrmaxuw : T_S3op_6 < "vrmaxuw", 0b010, 1>;
-
-// Vector reduce minimum halfwords
-// Rxx=vrmin[u]h(Rss,Ru)
-def A4_vrminh : T_S3op_6 < "vrminh", 0b101, 0>;
-def A4_vrminuh : T_S3op_6 < "vrminuh", 0b101, 1>;
-
-// Vector reduce minimum words
-// Rxx=vrmin[u]w(Rss,Ru)
-def A4_vrminw : T_S3op_6 < "vrminw", 0b110, 0>;
-def A4_vrminuw : T_S3op_6 < "vrminuw", 0b110, 1>;
-
-// Shift an immediate left by register amount.
-let hasNewValue = 1, hasSideEffects = 0 in
-def S4_lsli: SInst <(outs IntRegs:$Rd), (ins s6_0Imm:$s6, IntRegs:$Rt),
- "$Rd = lsl(#$s6, $Rt)" , [], "", S_3op_tc_1_SLOT23> {
- bits<5> Rd;
- bits<6> s6;
- bits<5> Rt;
-
- let IClass = 0b1100;
-
- let Inst{27-22} = 0b011010;
- let Inst{20-16} = s6{5-1};
- let Inst{12-8} = Rt;
- let Inst{7-6} = 0b11;
- let Inst{4-0} = Rd;
- let Inst{5} = s6{0};
- }
-
-//===----------------------------------------------------------------------===//
-// XTYPE/SHIFT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MEMOP
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// Template class for MemOp instructions with the register value.
-//===----------------------------------------------------------------------===//
-class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
- string memOp, bits<2> memOpBits> :
- MEMInst_V4<(outs),
- (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
- opc#"($base+#$offset)"#memOp#"$delta",
- []>,
- Requires<[UseMEMOP]> {
-
- bits<5> base;
- bits<5> delta;
- bits<32> offset;
- bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
-
- let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
- !if (!eq(opcBits, 0b01), offset{6-1},
- !if (!eq(opcBits, 0b10), offset{7-2},0)));
-
- let opExtentAlign = opcBits;
- let IClass = 0b0011;
- let Inst{27-24} = 0b1110;
- let Inst{22-21} = opcBits;
- let Inst{20-16} = base;
- let Inst{13} = 0b0;
- let Inst{12-7} = offsetBits;
- let Inst{6-5} = memOpBits;
- let Inst{4-0} = delta;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for MemOp instructions with the immediate value.
-//===----------------------------------------------------------------------===//
-class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
- string memOp, bits<2> memOpBits> :
- MEMInst_V4 <(outs),
- (ins IntRegs:$base, ImmOp:$offset, u5_0Imm:$delta),
- opc#"($base+#$offset)"#memOp#"#$delta"
- #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
- []>,
- Requires<[UseMEMOP]> {
-
- bits<5> base;
- bits<5> delta;
- bits<32> offset;
- bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
-
- let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
- !if (!eq(opcBits, 0b01), offset{6-1},
- !if (!eq(opcBits, 0b10), offset{7-2},0)));
-
- let opExtentAlign = opcBits;
- let IClass = 0b0011;
- let Inst{27-24} = 0b1111;
- let Inst{22-21} = opcBits;
- let Inst{20-16} = base;
- let Inst{13} = 0b0;
- let Inst{12-7} = offsetBits;
- let Inst{6-5} = memOpBits;
- let Inst{4-0} = delta;
-}
-
-// multiclass to define MemOp instructions with a register operand.
-multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
- def L4_add#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
- def L4_sub#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
- def L4_and#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
- def L4_or#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
-}
-
-// multiclass to define MemOp instructions with an immediate operand.
-multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
- def L4_iadd#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
- def L4_isub#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
- def L4_iand#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = clrbit(", 0b10>;
- def L4_ior#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = setbit(", 0b11>;
-}
-
-multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
- defm _#NAME : MemOp_rr <opc, opcBits, ImmOp>;
- defm _#NAME : MemOp_ri <opc, opcBits, ImmOp>;
-}
-
-// Define MemOp instructions.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in {
- let opExtentBits = 6, accessSize = ByteAccess in
- defm memopb_io : MemOp_base <"memb", 0b00, u6_0Ext>;
-
- let opExtentBits = 7, accessSize = HalfWordAccess in
- defm memoph_io : MemOp_base <"memh", 0b01, u6_1Ext>;
-
- let opExtentBits = 8, accessSize = WordAccess in
- defm memopw_io : MemOp_base <"memw", 0b10, u6_2Ext>;
-}
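-// Illustrative expansion (names only, following the #NAME# pastes above):
-// defm memopb_io yields L4_add_memopb_io ("memb($base+#$offset) += $delta"),
-// L4_sub_memopb_io, L4_and_memopb_io and L4_or_memopb_io from MemOp_rr, and
-// L4_iadd_memopb_io through L4_ior_memopb_io
-// ("memb($base+#$offset) = setbit(#$delta)") from MemOp_ri.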
-
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PRED +
-//===----------------------------------------------------------------------===//
-
-// Hexagon V4 only supports these flavors of byte/half compare instructions:
-// EQ/GT/GTU. The remaining flavors (GE/GEU/LT/LTU/LE/LEU) have no hardware
-// support, but the compiler can still synthesize them from the implemented
-// EQ/GT/GTU patterns by swapping operands and/or negating the result.
-
-// The following instruction is not extended, because extending it would
-// produce incorrect code for negative numbers.
-// Pd=cmpb.eq(Rs,#u8)
-
-// p=!cmp.eq(r1,#s10)
-def C4_cmpneqi : T_CMP <"cmp.eq", 0b00, 1, s10_0Ext>;
-def C4_cmpltei : T_CMP <"cmp.gt", 0b01, 1, s10_0Ext>;
-def C4_cmplteui : T_CMP <"cmp.gtu", 0b10, 1, u9_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PRED -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Multiclass for DeallocReturn
-//===----------------------------------------------------------------------===//
-class L4_RETURN<string mnemonic, bit isNot, bit isPredNew, bit isTak>
- : LD0Inst<(outs), (ins PredRegs:$src),
- !if(isNot, "if (!$src", "if ($src")#
- !if(isPredNew, ".new) ", ") ")#mnemonic#
- !if(isPredNew, !if(isTak,":t", ":nt"),""),
- [], "", LD_tc_3or4stall_SLOT0> {
-
- bits<2> src;
- let BaseOpcode = "L4_RETURN";
- let isPredicatedFalse = isNot;
- let isPredicatedNew = isPredNew;
- let isTaken = isTak;
- let IClass = 0b1001;
-
- let Inst{27-16} = 0b011000011110;
-
- let Inst{13} = isNot;
- let Inst{12} = isTak;
- let Inst{11} = isPredNew;
- let Inst{10} = 0b0;
- let Inst{9-8} = src;
- let Inst{4-0} = 0b11110;
- }
-
-// Produce all predicated forms: p, !p, p.new, !p.new, :t, :nt
-multiclass L4_RETURN_PRED<string mnemonic, bit PredNot> {
- let isPredicated = 1 in {
- def _#NAME# : L4_RETURN <mnemonic, PredNot, 0, 1>;
- def _#NAME#new_pnt : L4_RETURN <mnemonic, PredNot, 1, 0>;
- def _#NAME#new_pt : L4_RETURN <mnemonic, PredNot, 1, 1>;
- }
-}
-
-multiclass LD_MISC_L4_RETURN<string mnemonic> {
- let isBarrier = 1, isPredicable = 1 in
- def NAME : LD0Inst <(outs), (ins), mnemonic, [], "",
- LD_tc_3or4stall_SLOT0> {
- let BaseOpcode = "L4_RETURN";
- let IClass = 0b1001;
- let Inst{27-16} = 0b011000011110;
- let Inst{13-10} = 0b0000;
- let Inst{4-0} = 0b11110;
- }
- defm t : L4_RETURN_PRED<mnemonic, 0 >;
- defm f : L4_RETURN_PRED<mnemonic, 1 >;
-}
-
-let isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], Uses = [R30], hasSideEffects = 0 in
-defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
-
-// Restore registers and dealloc return function call.
-let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
- Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
- def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
-
- let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
- def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
- }
-}
-
-// Restore registers and dealloc frame before a tail call.
-let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<0, "">, PredRel;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<0, "">, PredRel;
-
- let isExtended = 1, opExtendable = 0 in
- def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
- }
-}
-
-// Save registers function call.
-let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
- def SAVE_REGISTERS_CALL_V4 : T_Call<0, "">, PredRel;
-
- let isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4_EXT : T_Call<0, "">, PredRel;
-
- let Defs = [P0] in
- def SAVE_REGISTERS_CALL_V4STK : T_Call<0, "">, PredRel;
-
- let Defs = [P0], isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28] in
- def SAVE_REGISTERS_CALL_V4_PIC : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28, P0] in
- def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<0, "">, PredRel;
-
- let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
- def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<0, "">, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicable = 1 in
-class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<2>MajOp, bit isAbs, bit isHalf>
- : STInst<(outs), (ins ImmOp:$addr, RC:$src),
- mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""),
- [], "", V2LDST_tc_st_SLOT01> {
- bits<19> addr;
- bits<5> src;
- bits<16> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
- !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
- !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
- /* u16_0Imm */ addr{15-0})));
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
- let Uses = !if (isAbs, [], [GP]);
-
- let IClass = 0b0100;
- let Inst{27} = 1;
- let Inst{26-25} = offsetBits{15-14};
- let Inst{24} = 0b0;
- let Inst{23-22} = MajOp;
- let Inst{21} = isHalf;
- let Inst{20-16} = offsetBits{13-9};
- let Inst{13} = offsetBits{8};
- let Inst{12-8} = src;
- let Inst{7-0} = offsetBits{7-0};
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in
-class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
- bit isHalf, bit isNot, bit isNew>
- : STInst<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, RC: $src2),
- !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
- ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""),
- [], "", ST_tc_st_SLOT01>, AddrModeRel {
- bits<2> src1;
- bits<6> absaddr;
- bits<5> src2;
-
- let isPredicatedNew = isNew;
- let isPredicatedFalse = isNot;
- // Store upper-half and store doubleword cannot be NV.
- let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-22} = MajOp;
- let Inst{21} = isHalf;
- let Inst{17-16} = absaddr{5-4};
- let Inst{13} = isNew;
- let Inst{12-8} = src2;
- let Inst{7} = 0b1;
- let Inst{6-3} = absaddr{3-0};
- let Inst{2} = isNot;
- let Inst{1-0} = src1;
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<2> MajOp, bit isHalf>
- : T_StoreAbsGP <mnemonic, RC, u32_0MustExt, MajOp, 1, isHalf>,
- AddrModeRel {
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
- !if (!eq(ImmOpStr, "u16_2Imm"), 18,
- !if (!eq(ImmOpStr, "u16_1Imm"), 17,
- /* u16_0Imm */ 16)));
-
- let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
- !if (!eq(ImmOpStr, "u16_2Imm"), 2,
- !if (!eq(ImmOpStr, "u16_1Imm"), 1,
- /* u16_0Imm */ 0)));
-}
-
-//===----------------------------------------------------------------------===//
-// Multiclass for store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-let addrMode = Absolute, isExtended = 1 in
-multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, bits<2> MajOp, bit isHalf = 0> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
- let opExtendable = 0, isPredicable = 1 in
- def PS_#NAME#abs : T_StoreAbs <mnemonic, RC, ImmOp, MajOp, isHalf>;
-
- // Predicated
- def S4_p#NAME#t_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 0>;
- def S4_p#NAME#f_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 0>;
-
- // .new Predicated
- def S4_p#NAME#tnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 1>;
- def S4_p#NAME#fnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 1>;
- }
-}
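-// For illustration (a sketch of the expansion, not extra definitions):
-// "defm storerb" below yields PS_storerbabs plus the predicated
-// S4_pstorerbt_abs, S4_pstorerbf_abs, S4_pstorerbtnew_abs and
-// S4_pstorerbfnew_abs.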
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated new-value store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1,
- isNewValue = 1, opNewValue = 1 in
-class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp>
- : NVInst_V4<(outs), (ins ImmOp:$addr, IntRegs:$src),
- mnemonic #"(#$addr) = $src.new",
- [], "", V2LDST_tc_st_SLOT0> {
- bits<19> addr;
- bits<3> src;
- bits<16> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
- !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
- !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
- /* u16_0Imm */ addr{15-0})));
- let IClass = 0b0100;
-
- let Inst{27} = 1;
- let Inst{26-25} = offsetBits{15-14};
- let Inst{24-21} = 0b0101;
- let Inst{20-16} = offsetBits{13-9};
- let Inst{13} = offsetBits{8};
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src;
- let Inst{7-0} = offsetBits{7-0};
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated new-value store instructions with
-// absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicated = 1, mayStore = 1, isNVStore = 1,
- isNewValue = 1, opNewValue = 2, opExtentBits = 6, opExtendable = 1 in
-class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
- : NVInst_V4<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, IntRegs:$src2),
- !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
- ") ")#mnemonic#"(#$absaddr) = $src2.new",
- [], "", ST_tc_st_SLOT0>, AddrModeRel {
- bits<2> src1;
- bits<6> absaddr;
- bits<3> src2;
-
- let isPredicatedNew = isNew;
- let isPredicatedFalse = isNot;
-
- let IClass = 0b1010;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = 0b101;
- let Inst{17-16} = absaddr{5-4};
- let Inst{13} = isNew;
- let Inst{12-11} = MajOp;
- let Inst{10-8} = src2;
- let Inst{7} = 0b1;
- let Inst{6-3} = absaddr{3-0};
- let Inst{2} = isNot;
- let Inst{1-0} = src1;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated new-value store instructions with
-// absolute addressing.
-//===----------------------------------------------------------------------===//
-class T_StoreAbs_NV <string mnemonic, Operand ImmOp, bits<2> MajOp>
- : T_StoreAbsGP_NV <mnemonic, u32_0MustExt, MajOp>, AddrModeRel {
-
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
- !if (!eq(ImmOpStr, "u16_2Imm"), 18,
- !if (!eq(ImmOpStr, "u16_1Imm"), 17,
- /* u16_0Imm */ 16)));
-
- let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
- !if (!eq(ImmOpStr, "u16_2Imm"), 2,
- !if (!eq(ImmOpStr, "u16_1Imm"), 1,
- /* u16_0Imm */ 0)));
-}
-
-//===----------------------------------------------------------------------===//
-// Multiclass for new-value store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-let addrMode = Absolute, isExtended = 1 in
-multiclass ST_Abs_NV <string mnemonic, string CextOp, Operand ImmOp,
- bits<2> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
- let opExtendable = 0, isPredicable = 1 in
- def PS_#NAME#newabs : T_StoreAbs_NV <mnemonic, ImmOp, MajOp>;
-
- // Predicated
- def S4_p#NAME#newt_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 0>;
- def S4_p#NAME#newf_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 0>;
-
- // .new Predicated
- def S4_p#NAME#newtnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 1>;
- def S4_p#NAME#newfnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 1>;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Stores with absolute addressing
-//===----------------------------------------------------------------------===//
-let accessSize = ByteAccess in
-defm storerb : ST_Abs <"memb", "STrib", IntRegs, u16_0Imm, 0b00>,
- ST_Abs_NV <"memb", "STrib", u16_0Imm, 0b00>;
-
-let accessSize = HalfWordAccess in
-defm storerh : ST_Abs <"memh", "STrih", IntRegs, u16_1Imm, 0b01>,
- ST_Abs_NV <"memh", "STrih", u16_1Imm, 0b01>;
-
-let accessSize = WordAccess in
-defm storeri : ST_Abs <"memw", "STriw", IntRegs, u16_2Imm, 0b10>,
- ST_Abs_NV <"memw", "STriw", u16_2Imm, 0b10>;
-
-let isNVStorable = 0, accessSize = DoubleWordAccess in
-defm storerd : ST_Abs <"memd", "STrid", DoubleRegs, u16_3Imm, 0b11>;
-
-let isNVStorable = 0, accessSize = HalfWordAccess in
-defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
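
As a reading aid, not part of the original file: expanding the new-value
half of the `defm storerb` instantiation above by hand, the
ST_Abs_NV<"memb", "STrib", u16_0Imm, 0b00> invocation produces roughly:

// Sketch only; record names follow the PS_/S4_ prefixes in the multiclass.
//   def PS_storerbnewabs       : T_StoreAbs_NV<"memb", u16_0Imm, 0b00>;
//   def S4_pstorerbnewt_abs    : T_StoreAbs_NV_Pred<"memb", 0b00, 0, 0>;
//   def S4_pstorerbnewf_abs    : T_StoreAbs_NV_Pred<"memb", 0b00, 1, 0>;
//   def S4_pstorerbnewtnew_abs : T_StoreAbs_NV_Pred<"memb", 0b00, 0, 1>;
//   def S4_pstorerbnewfnew_abs : T_StoreAbs_NV_Pred<"memb", 0b00, 1, 1>;
// (The ST_Abs half of the defm, defined earlier in the file, expands
// analogously to non-new-value stores.)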
-
-//===----------------------------------------------------------------------===//
-// GP-relative stores.
-// mem[bhwd](#global)=Rt
-// Once predicated, these instructions map to absolute addressing mode.
-// if ([!]Pv[.new]) mem[bhwd](##global)=Rt
-//===----------------------------------------------------------------------===//
-
-let Uses = [GP], isAsmParserOnly = 1 in
-class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
- Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
- : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> {
-  // Set BaseOpcode to match the absolute-addressing instructions so that
-  // non-predicated GP-relative instructions can be related to their
-  // predicated absolute counterparts.
- let BaseOpcode = BaseOp#_abs;
- }
-
-let Uses = [GP], isAsmParserOnly = 1 in
-multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
- bits<2> MajOp, bit isHalf = 0> {
-  // Set BaseOpcode to match the absolute-addressing instructions so that
-  // non-predicated GP-relative instructions can be related to their
-  // predicated absolute counterparts.
- let BaseOpcode = BaseOp#_abs in {
- def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
- 0, isHalf>;
- // New-value store
-    def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp>;
- }
-}
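
A sketch, not part of the original file: with the `defm S2_storerb`
instantiation below, this multiclass yields one plain GP-relative store
plus its new-value twin:

// Sketch only, derived mechanically from ST_GP with NAME = S2_storerb:
//   def S2_storerbgp    : T_StoreAbsGP<"memb", IntRegs, u16_0Imm, 0b00, 0, 0>;
//   def S2_storerbnewgp : T_StoreAbsGP_NV<"memb", u16_0Imm, 0b00>;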
-
-let accessSize = ByteAccess in
-defm S2_storerb : ST_GP<"memb", "STrib", u16_0Imm, 0b00>, NewValueRel;
-
-let accessSize = HalfWordAccess in
-defm S2_storerh : ST_GP<"memh", "STrih", u16_1Imm, 0b01>, NewValueRel;
-
-let accessSize = WordAccess in
-defm S2_storeri : ST_GP<"memw", "STriw", u16_2Imm, 0b10>, NewValueRel;
-
-let isNVStorable = 0, accessSize = DoubleWordAccess in
-def S2_storerdgp : T_StoreGP <"memd", "STrid", DoubleRegs,
- u16_3Imm, 0b11>, PredNewRel;
-
-let isNVStorable = 0, accessSize = HalfWordAccess in
-def S2_storerfgp : T_StoreGP <"memh", "STrif", IntRegs,
- u16_1Imm, 0b01, 1>, PredNewRel;
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated load instructions with
-// absolute addressing mode.
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0 in
-class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp>
- : LDInst <(outs RC:$dst), (ins ImmOp:$addr),
- "$dst = "#mnemonic# "(#$addr)",
- [], "", V2LDST_tc_ld_SLOT01> {
- bits<5> dst;
- bits<19> addr;
- bits<16> offsetBits;
-
- string ImmOpStr = !cast<string>(ImmOp);
- let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
- !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
- !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
- /* u16_0Imm */ addr{15-0})));
-
- let IClass = 0b0100;
-
- let Inst{27} = 0b1;
- let Inst{26-25} = offsetBits{15-14};
- let Inst{24} = 0b1;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = offsetBits{13-9};
- let Inst{13-5} = offsetBits{8-0};
- let Inst{4-0} = dst;
- }
-
-class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp>
- : T_LoadAbsGP <mnemonic, RC, u32_0MustExt, MajOp>, AddrModeRel {
-
- string ImmOpStr = !cast<string>(ImmOp);
- let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
- !if (!eq(ImmOpStr, "u16_2Imm"), 18,
- !if (!eq(ImmOpStr, "u16_1Imm"), 17,
- /* u16_0Imm */ 16)));
-
- let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
- !if (!eq(ImmOpStr, "u16_2Imm"), 2,
- !if (!eq(ImmOpStr, "u16_1Imm"), 1,
- /* u16_0Imm */ 0)));
- }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated load instructions with
-// absolute addressing mode.
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6,
- opExtendable = 2 in
-class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit isPredNot, bit isPredNew>
- : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32_0MustExt:$absaddr),
- !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
- ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel {
- bits<5> dst;
- bits<2> src1;
- bits<6> absaddr;
-
- let isPredicatedNew = isPredNew;
- let isPredicatedFalse = isPredNot;
- let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
- let IClass = 0b1001;
-
- let Inst{27-24} = 0b1111;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = absaddr{5-1};
- let Inst{13} = 0b1;
- let Inst{12} = isPredNew;
- let Inst{11} = isPredNot;
- let Inst{10-9} = src1;
- let Inst{8} = absaddr{0};
- let Inst{7} = 0b1;
- let Inst{4-0} = dst;
- }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for load instructions with absolute addressing mode.
-//===----------------------------------------------------------------------===//
-multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bits<3> MajOp,
- bit PredNot> {
- def _abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 0>;
- // Predicate new
- def new_abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 1>;
-}
-
-let addrMode = Absolute, isExtended = 1 in
-multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC,
- Operand ImmOp, bits<3> MajOp> {
- let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
- let opExtendable = 1, isPredicable = 1 in
- def PS_#NAME#abs: T_LoadAbs <mnemonic, RC, ImmOp, MajOp>;
-
- // Predicated
- defm L4_p#NAME#t : LD_Abs_Pred<mnemonic, RC, MajOp, 0>;
- defm L4_p#NAME#f : LD_Abs_Pred<mnemonic, RC, MajOp, 1>;
- }
-}
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
- defm loadrb : LD_Abs<"memb", "LDrib", IntRegs, u16_0Imm, 0b000>;
- defm loadrub : LD_Abs<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
- defm loadrh : LD_Abs<"memh", "LDrih", IntRegs, u16_1Imm, 0b010>;
- defm loadruh : LD_Abs<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
-defm loadri : LD_Abs<"memw", "LDriw", IntRegs, u16_2Imm, 0b100>;
-
-let accessSize = DoubleWordAccess in
-defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
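
As a reading aid, not part of the original file: expanding `defm loadrb`
above through LD_Abs and LD_Abs_Pred gives roughly:

// Sketch only; one predicable load plus four predicated variants.
//   def PS_loadrbabs        : T_LoadAbs<"memb", IntRegs, u16_0Imm, 0b000>;
//   def L4_ploadrbt_abs     : T_LoadAbs_Pred<"memb", IntRegs, 0b000, 0, 0>;
//   def L4_ploadrbtnew_abs  : T_LoadAbs_Pred<"memb", IntRegs, 0b000, 0, 1>;
//   def L4_ploadrbf_abs     : T_LoadAbs_Pred<"memb", IntRegs, 0b000, 1, 0>;
//   def L4_ploadrbfnew_abs  : T_LoadAbs_Pred<"memb", IntRegs, 0b000, 1, 1>;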
-
-//===----------------------------------------------------------------------===//
-// Multiclass for load instructions with GP-relative addressing mode.
-// Rx=mem[bhwd](##global)
-// Once predicated, these instructions map to absolute addressing mode.
-// if ([!]Pv[.new]) Rx=mem[bhwd](##global)
-//===----------------------------------------------------------------------===//
-
-let isAsmParserOnly = 1, Uses = [GP] in
-class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp,
- bits<3> MajOp>
- : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel {
- let BaseOpcode = BaseOp#_abs;
- }
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
- def L2_loadrbgp : T_LoadGP<"memb", "LDrib", IntRegs, u16_0Imm, 0b000>;
- def L2_loadrubgp : T_LoadGP<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
- def L2_loadrhgp : T_LoadGP<"memh", "LDrih", IntRegs, u16_1Imm, 0b010>;
- def L2_loadruhgp : T_LoadGP<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
-def L2_loadrigp : T_LoadGP<"memw", "LDriw", IntRegs, u16_2Imm, 0b100>;
-
-let accessSize = DoubleWordAccess in
-def L2_loadrdgp : T_LoadGP<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// :raw form of boundscheck:hi:lo insns
-//===----------------------------------------------------------------------===//
-
-// A4_boundscheck_lo: Detect if a register is within bounds.
-let hasSideEffects = 0 in
-def A4_boundscheck_lo: ALU64Inst <
- (outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Pd = boundscheck($Rss, $Rtt):raw:lo"> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00100;
- let Inst{13} = 0b1;
- let Inst{7-5} = 0b100;
- let Inst{1-0} = Pd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-// A4_boundscheck_hi: Detect if a register is within bounds.
-let hasSideEffects = 0 in
-def A4_boundscheck_hi: ALU64Inst <
- (outs PredRegs:$Pd),
- (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
- "$Pd = boundscheck($Rss, $Rtt):raw:hi"> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> Rtt;
-
- let IClass = 0b1101;
-
- let Inst{27-23} = 0b00100;
- let Inst{13} = 0b1;
- let Inst{7-5} = 0b101;
- let Inst{1-0} = Pd;
- let Inst{20-16} = Rss;
- let Inst{12-8} = Rtt;
- }
-
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def A4_boundscheck : MInst <
- (outs PredRegs:$Pd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
- "$Pd=boundscheck($Rs,$Rtt)">;
-
-// A4_tlbmatch: Detect if a VA/ASID matches a TLB entry.
-let isPredicateLate = 1, hasSideEffects = 0 in
-def A4_tlbmatch : ALU64Inst<(outs PredRegs:$Pd),
- (ins DoubleRegs:$Rs, IntRegs:$Rt),
- "$Pd = tlbmatch($Rs, $Rt)",
- [], "", ALU64_tc_2early_SLOT23> {
- bits<2> Pd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1101;
- let Inst{27-23} = 0b00100;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b1;
- let Inst{12-8} = Rt;
- let Inst{7-5} = 0b011;
- let Inst{1-0} = Pd;
- }
-
-// Use LD0Inst for dcfetch, but set "mayLoad" to 0 because this doesn't
-// really do a load.
-let hasSideEffects = 1, mayLoad = 0 in
-def Y2_dcfetchbo : LD0Inst<(outs), (ins IntRegs:$Rs, u11_3Imm:$u11_3),
- "dcfetch($Rs + #$u11_3)",
- [], "", LD_tc_ld_SLOT0> {
- bits<5> Rs;
- bits<14> u11_3;
-
- let IClass = 0b1001;
- let Inst{27-21} = 0b0100000;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{10-0} = u11_3{13-3};
-}
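
A note on the offset encoding above, inferred from the bit slice rather
than stated in the file (treat as an assumption):

// Sketch: Inst{10-0} = u11_3{13-3} means the 14-bit byte offset has its
// low three bits implied zero, i.e. only offset >> 3 is encoded, so
// dcfetch offsets are 8-byte aligned:
//   encoded = offset >> 3              // 11 bits in Inst{10-0}
//   offset  = 0, 8, 16, ..., 2047 * 8  // at most 16376 bytes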
-
-
-//===----------------------------------------------------------------------===//
-// Compound instructions
-//===----------------------------------------------------------------------===//
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
- isPredicated = 1, isPredicatedNew = 1, isExtendable = 1,
- opExtentBits = 11, opExtentAlign = 2, opExtendable = 1,
- isTerminator = 1 in
-class CJInst_tstbit_R0<string px, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
- ""#px#" = tstbit($Rs, #0); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-26} = 0b00;
- let Inst{25} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
- let Inst{24-23} = 0b11;
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- let Inst{9-8} = 0b11;
- let Inst{7-1} = r9_2{8-2};
-}
-
-let Defs = [PC, P0], Uses = [P0] in {
- def J4_tstbit0_tp0_jump_nt : CJInst_tstbit_R0<"p0", 0, "nt">;
- def J4_tstbit0_tp0_jump_t : CJInst_tstbit_R0<"p0", 0, "t">;
- def J4_tstbit0_fp0_jump_nt : CJInst_tstbit_R0<"p0", 1, "nt">;
- def J4_tstbit0_fp0_jump_t : CJInst_tstbit_R0<"p0", 1, "t">;
-}
-
-let Defs = [PC, P1], Uses = [P1] in {
- def J4_tstbit0_tp1_jump_nt : CJInst_tstbit_R0<"p1", 0, "nt">;
- def J4_tstbit0_tp1_jump_t : CJInst_tstbit_R0<"p1", 0, "t">;
- def J4_tstbit0_fp1_jump_nt : CJInst_tstbit_R0<"p1", 1, "nt">;
- def J4_tstbit0_fp1_jump_t : CJInst_tstbit_R0<"p1", 1, "t">;
-}
-
-
-let isBranch = 1, hasSideEffects = 0,
- isExtentSigned = 1, isPredicated = 1, isPredicatedNew = 1,
- isExtendable = 1, opExtentBits = 11, opExtentAlign = 2,
- opExtendable = 2, isTerminator = 1 in
-class CJInst_RR<string px, string op, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2),
- ""#px#" = cmp."#op#"($Rs, $Rt); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<4> Rt;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-23} = !if (!eq(op, "eq"), 0b01000,
- !if (!eq(op, "gt"), 0b01001,
- !if (!eq(op, "gtu"), 0b01010, 0)));
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- // px: Predicate reg 0/1
- let Inst{12} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
- let Inst{11-8} = Rt;
- let Inst{7-1} = r9_2{8-2};
-}
-
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_RR<string op, bit np> {
- let Defs = [PC, P0], Uses = [P0] in {
- def NAME#p0_jump_nt : CJInst_RR<"p0", op, np, "nt">;
- def NAME#p0_jump_t : CJInst_RR<"p0", op, np, "t">;
- }
- let Defs = [PC, P1], Uses = [P1] in {
- def NAME#p1_jump_nt : CJInst_RR<"p1", op, np, "nt">;
- def NAME#p1_jump_t : CJInst_RR<"p1", op, np, "t">;
- }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_RR<string op>{
- defm J4_cmp#NAME#_t : T_tnt_CJInst_RR<op, 0>;
- defm J4_cmp#NAME#_f : T_tnt_CJInst_RR<op, 1>;
-}
-// TypeCJ instructions: compare register with register and jump.
-defm eq : T_pnp_CJInst_RR<"eq">;
-defm gt : T_pnp_CJInst_RR<"gt">;
-defm gtu : T_pnp_CJInst_RR<"gtu">;
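
As a reading aid, not part of the original file: each `defm` above fans
out to eight records through T_pnp_CJInst_RR and T_tnt_CJInst_RR.
Expanding `defm eq` by hand gives roughly:

// Sketch only:
//   def J4_cmpeq_tp0_jump_nt : CJInst_RR<"p0", "eq", 0, "nt">;
//   def J4_cmpeq_tp0_jump_t  : CJInst_RR<"p0", "eq", 0, "t">;
//   def J4_cmpeq_tp1_jump_nt : CJInst_RR<"p1", "eq", 0, "nt">;
//   def J4_cmpeq_tp1_jump_t  : CJInst_RR<"p1", "eq", 0, "t">;
// plus the four J4_cmpeq_f* records with np = 1.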
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
- isPredicated = 1, isPredicatedNew = 1, isExtendable = 1, opExtentBits = 11,
- opExtentAlign = 2, opExtendable = 2, isTerminator = 1 in
-class CJInst_RU5<string px, string op, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, u5_0Imm:$U5, brtarget:$r9_2),
- ""#px#" = cmp."#op#"($Rs, #$U5); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<5> U5;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-26} = 0b00;
- // px: Predicate reg 0/1
- let Inst{25} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
- let Inst{24-23} = !if (!eq(op, "eq"), 0b00,
- !if (!eq(op, "gt"), 0b01,
- !if (!eq(op, "gtu"), 0b10, 0)));
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- let Inst{12-8} = U5;
- let Inst{7-1} = r9_2{8-2};
-}
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_RU5<string op, bit np> {
- let Defs = [PC, P0], Uses = [P0] in {
- def NAME#p0_jump_nt : CJInst_RU5<"p0", op, np, "nt">;
- def NAME#p0_jump_t : CJInst_RU5<"p0", op, np, "t">;
- }
- let Defs = [PC, P1], Uses = [P1] in {
- def NAME#p1_jump_nt : CJInst_RU5<"p1", op, np, "nt">;
- def NAME#p1_jump_t : CJInst_RU5<"p1", op, np, "t">;
- }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_RU5<string op>{
- defm J4_cmp#NAME#i_t : T_tnt_CJInst_RU5<op, 0>;
- defm J4_cmp#NAME#i_f : T_tnt_CJInst_RU5<op, 1>;
-}
-// TypeCJ instructions: compare register with immediate and jump.
-defm eq : T_pnp_CJInst_RU5<"eq">;
-defm gt : T_pnp_CJInst_RU5<"gt">;
-defm gtu : T_pnp_CJInst_RU5<"gtu">;
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
- isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1,
- isExtendable = 1, opExtentBits = 11, opExtentAlign = 2, opExtendable = 2,
- isTerminator = 1 in
-class CJInst_Rn1<string px, string op, bit np, string tnt>
- : InstHexagon<(outs), (ins IntRegs:$Rs, n1Const:$n1, brtarget:$r9_2),
- ""#px#" = cmp."#op#"($Rs,#$n1); if ("
- #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
- [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
- bits<4> Rs;
- bits<11> r9_2;
-
- // np: !p[01]
- let isPredicatedFalse = np;
- // tnt: Taken/Not Taken
- let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
- let isTaken = !if (!eq(tnt, "t"), 1, 0);
-
- let IClass = 0b0001;
- let Inst{27-26} = 0b00;
- let Inst{25} = !if (!eq(px, "!p1"), 1,
- !if (!eq(px, "p1"), 1, 0));
-
- let Inst{24-23} = 0b11;
- let Inst{22} = np;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rs;
- let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
- let Inst{9-8} = !if (!eq(op, "eq"), 0b00,
- !if (!eq(op, "gt"), 0b01, 0));
- let Inst{7-1} = r9_2{8-2};
-}
-
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_Rn1<string op, bit np> {
- let Defs = [PC, P0], Uses = [P0] in {
- def NAME#p0_jump_nt : CJInst_Rn1<"p0", op, np, "nt">;
- def NAME#p0_jump_t : CJInst_Rn1<"p0", op, np, "t">;
- }
- let Defs = [PC, P1], Uses = [P1] in {
- def NAME#p1_jump_nt : CJInst_Rn1<"p1", op, np, "nt">;
- def NAME#p1_jump_t : CJInst_Rn1<"p1", op, np, "t">;
- }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_Rn1<string op>{
- defm J4_cmp#NAME#n1_t : T_tnt_CJInst_Rn1<op, 0>;
- defm J4_cmp#NAME#n1_f : T_tnt_CJInst_Rn1<op, 1>;
-}
-// TypeCJ instructions: compare register with -1 and jump.
-defm eq : T_pnp_CJInst_Rn1<"eq">;
-defm gt : T_pnp_CJInst_Rn1<"gt">;
-
-// J4_jumpseti: Direct unconditional jump and set register to immediate.
-let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
- isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
- opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpseti: CJInst_JMPSET <
- (outs IntRegs:$Rd),
- (ins u6_0Imm:$U6, brtarget:$r9_2),
- "$Rd = #$U6 ; jump $r9_2"> {
- bits<4> Rd;
- bits<6> U6;
- bits<11> r9_2;
-
- let IClass = 0b0001;
- let Inst{27-24} = 0b0110;
- let Inst{21-20} = r9_2{10-9};
- let Inst{19-16} = Rd;
- let Inst{13-8} = U6;
- let Inst{7-1} = r9_2{8-2};
- }
-
-// J4_jumpsetr: Direct unconditional jump and transfer register.
-let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
- isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
- opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpsetr: CJInst_JMPSET <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, brtarget:$r9_2),
- "$Rd = $Rs ; jump $r9_2"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<11> r9_2;
-
- let IClass = 0b0001;
- let Inst{27-24} = 0b0111;
- let Inst{21-20} = r9_2{10-9};
- let Inst{11-8} = Rd;
- let Inst{19-16} = Rs;
- let Inst{7-1} = r9_2{8-2};
- }
-
-//===----------------------------------------------------------------------===//
-// Duplex instructions
-//===----------------------------------------------------------------------===//
-include "HexagonIsetDx.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
deleted file mode 100644
index cd19b69..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td
+++ /dev/null
@@ -1,497 +0,0 @@
-//=- HexagonInstrInfoV5.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V5 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY
-//===----------------------------------------------------------------------===//
-
-// Rdd[+]=vrmpybsu(Rss,Rtt)
-let Predicates = [HasV5T] in {
- def M5_vrmpybsu: T_XTYPE_Vect<"vrmpybsu", 0b110, 0b001, 0>;
- def M5_vrmacbsu: T_XTYPE_Vect_acc<"vrmpybsu", 0b110, 0b001, 0>;
-
-  // Rdd[+]=vrmpybu(Rss,Rtt)
- def M5_vrmpybuu: T_XTYPE_Vect<"vrmpybu", 0b100, 0b001, 0>;
- def M5_vrmacbuu: T_XTYPE_Vect_acc<"vrmpybu", 0b100, 0b001, 0>;
-
- def M5_vdmpybsu: T_M2_vmpy<"vdmpybsu", 0b101, 0b001, 0, 0, 1>;
- def M5_vdmacbsu: T_M2_vmpy_acc_sat <"vdmpybsu", 0b001, 0b001, 0, 0>;
-}
-
-// Vector multiply bytes
-// Rdd=vmpyb[s]u(Rs,Rt)
-let Predicates = [HasV5T] in {
- def M5_vmpybsu: T_XTYPE_mpy64 <"vmpybsu", 0b010, 0b001, 0, 0, 0>;
- def M5_vmpybuu: T_XTYPE_mpy64 <"vmpybu", 0b100, 0b001, 0, 0, 0>;
-
- // Rxx+=vmpyb[s]u(Rs,Rt)
- def M5_vmacbsu: T_XTYPE_mpy64_acc <"vmpybsu", "+", 0b110, 0b001, 0, 0, 0>;
- def M5_vmacbuu: T_XTYPE_mpy64_acc <"vmpybu", "+", 0b100, 0b001, 0, 0, 0>;
-
- // Rd=vaddhub(Rss,Rtt):sat
- let hasNewValue = 1, opNewValue = 0 in
- def A5_vaddhubs: T_S3op_1 <"vaddhub", IntRegs, 0b01, 0b001, 0, 1>;
-}
-
-def S2_asr_i_p_rnd : S_2OpInstImm<"asr", 0b110, 0b111, u6_0Imm, [], 1>,
- Requires<[HasV5T]> {
- bits<6> src2;
- let Inst{13-8} = src2;
-}
-
-let isAsmParserOnly = 1 in
-def S2_asr_i_p_rnd_goodsyntax
- : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6_0Imm:$src2),
- "$dst = asrrnd($src1, #$src2)">;
-
-def C4_fastcorner9 : T_LOGICAL_2OP<"fastcorner9", 0b000, 0, 0>,
- Requires<[HasV5T]> {
- let Inst{13,7,4} = 0b111;
-}
-
-def C4_fastcorner9_not : T_LOGICAL_2OP<"!fastcorner9", 0b000, 0, 0>,
- Requires<[HasV5T]> {
- let Inst{20,13,7,4} = 0b1111;
-}
-
-let hasNewValue = 1, validSubTargets = HasV5SubT in
-def S5_popcountp : ALU64_rr<(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
- "$Rd = popcount($Rss)", [], "", S_2op_tc_2_SLOT23>,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rss;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1000011;
- let Inst{7-5} = 0b011;
- let Inst{4-0} = Rd;
- let Inst{20-16} = Rss;
- }
-
-let isFP = 1, hasNewValue = 1, opNewValue = 0 in
-class T_MInstFloat <string mnemonic, bits<3> MajOp, bits<3> MinOp>
- : MInst<(outs IntRegs:$Rd),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd = "#mnemonic#"($Rs, $Rt)", [],
- "" , M_tc_3or4x_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-let isCommutable = 1 in {
- def F2_sfadd : T_MInstFloat < "sfadd", 0b000, 0b000>;
- def F2_sfmpy : T_MInstFloat < "sfmpy", 0b010, 0b000>;
-}
-
-def F2_sfsub : T_MInstFloat < "sfsub", 0b000, 0b001>;
-
-let Itinerary = M_tc_3x_SLOT23 in {
- def F2_sfmax : T_MInstFloat < "sfmax", 0b100, 0b000>;
- def F2_sfmin : T_MInstFloat < "sfmin", 0b100, 0b001>;
-}
-
-let Itinerary = M_tc_3or4x_SLOT23 in {
-def F2_sffixupn : T_MInstFloat < "sffixupn", 0b110, 0b000>;
-def F2_sffixupd : T_MInstFloat < "sffixupd", 0b110, 0b001>;
-}
-
-// F2_sfrecipa: Reciprocal approximation for division.
-let Uses = [USR], isPredicateLate = 1, isFP = 1,
- hasSideEffects = 0, hasNewValue = 1, Itinerary = M_tc_3or4x_SLOT23 in
-def F2_sfrecipa: MInst <
- (outs IntRegs:$Rd, PredRegs:$Pe),
- (ins IntRegs:$Rs, IntRegs:$Rt),
- "$Rd, $Pe = sfrecipa($Rs, $Rt)">,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<2> Pe;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
- let Inst{27-21} = 0b1011111;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b1;
- let Inst{6-5} = Pe;
- let Inst{4-0} = Rd;
- }
-
-// F2_dfcmpeq: Floating point compare for equal.
-let Uses = [USR], isCompare = 1, isFP = 1 in
-class T_fcmp <string mnemonic, RegisterClass RC, bits<3> MinOp,
- list<dag> pattern = [] >
- : ALU64Inst <(outs PredRegs:$dst), (ins RC:$src1, RC:$src2),
- "$dst = "#mnemonic#"($src1, $src2)", pattern,
- "" , ALU64_tc_2early_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<2> dst;
- bits<5> src1;
- bits<5> src2;
-
- let IClass = 0b1101;
-
- let Inst{27-21} = 0b0010111;
- let Inst{20-16} = src1;
- let Inst{12-8} = src2;
- let Inst{7-5} = MinOp;
- let Inst{1-0} = dst;
- }
-
-class T_fcmp64 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
- : T_fcmp <mnemonic, DoubleRegs, MinOp, []> {
- let IClass = 0b1101;
- let Inst{27-21} = 0b0010111;
-}
-
-class T_fcmp32 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
- : T_fcmp <mnemonic, IntRegs, MinOp, []> {
- let IClass = 0b1100;
- let Inst{27-21} = 0b0111111;
-}
-
-def F2_dfcmpeq : T_fcmp64<"dfcmp.eq", setoeq, 0b000>;
-def F2_dfcmpgt : T_fcmp64<"dfcmp.gt", setogt, 0b001>;
-def F2_dfcmpge : T_fcmp64<"dfcmp.ge", setoge, 0b010>;
-def F2_dfcmpuo : T_fcmp64<"dfcmp.uo", setuo, 0b011>;
-
-def F2_sfcmpge : T_fcmp32<"sfcmp.ge", setoge, 0b000>;
-def F2_sfcmpuo : T_fcmp32<"sfcmp.uo", setuo, 0b001>;
-def F2_sfcmpeq : T_fcmp32<"sfcmp.eq", setoeq, 0b011>;
-def F2_sfcmpgt : T_fcmp32<"sfcmp.gt", setogt, 0b100>;
-
-// F2 convert template classes:
-let Uses = [USR], isFP = 1 in
-class F2_RDD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
- string chop ="">
- : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
- "$Rdd = "#mnemonic#"($Rss)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rdd;
- bits<5> Rss;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b0000111;
- let Inst{20-16} = Rss;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- }
-
-let Uses = [USR], isFP = 1 in
-class F2_RDD_RS_CONVERT<string mnemonic, bits<3> MinOp,
- string chop ="">
- : SInst <(outs DoubleRegs:$Rdd), (ins IntRegs:$Rs),
- "$Rdd = "#mnemonic#"($Rs)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rdd;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b0100100;
- let Inst{20-16} = Rs;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rdd;
- }
-
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class F2_RD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
- string chop ="">
- : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
- "$Rd = "#mnemonic#"($Rss)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rd;
- bits<5> Rss;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b1000;
- let Inst{23-21} = MinOp;
- let Inst{20-16} = Rss;
- let Inst{7-5} = 0b001;
- let Inst{4-0} = Rd;
- }
-
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class F2_RD_RS_CONVERT<string mnemonic, bits<3> MajOp, bits<3> MinOp,
- string chop ="">
- : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = "#mnemonic#"($Rs)"#chop, [], "",
- S_2op_tc_3or4x_SLOT23> {
- bits<5> Rd;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-24} = 0b1011;
- let Inst{23-21} = MajOp;
- let Inst{20-16} = Rs;
- let Inst{7-5} = MinOp;
- let Inst{4-0} = Rd;
- }
-
-// Convert single precision to double precision and vice-versa.
-def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000>;
-def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000>;
-
-// Convert Integer to Floating Point.
-def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010>;
-def F2_conv_ud2sf : F2_RD_RSS_CONVERT <"convert_ud2sf", 0b001>;
-def F2_conv_uw2sf : F2_RD_RS_CONVERT <"convert_uw2sf", 0b001, 0b000>;
-def F2_conv_w2sf : F2_RD_RS_CONVERT <"convert_w2sf", 0b010, 0b000>;
-def F2_conv_d2df : F2_RDD_RSS_CONVERT <"convert_d2df", 0b011>;
-def F2_conv_ud2df : F2_RDD_RSS_CONVERT <"convert_ud2df", 0b010>;
-def F2_conv_uw2df : F2_RDD_RS_CONVERT <"convert_uw2df", 0b001>;
-def F2_conv_w2df : F2_RDD_RS_CONVERT <"convert_w2df", 0b010>;
-
-// Convert Floating Point to Integer.
-def F2_conv_df2uw_chop : F2_RD_RSS_CONVERT <"convert_df2uw", 0b101, ":chop">;
-def F2_conv_df2w_chop : F2_RD_RSS_CONVERT <"convert_df2w", 0b111, ":chop">;
-def F2_conv_sf2uw_chop : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b001,
- ":chop">;
-def F2_conv_sf2w_chop : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b001,
- ":chop">;
-def F2_conv_df2d_chop : F2_RDD_RSS_CONVERT <"convert_df2d", 0b110, ":chop">;
-def F2_conv_df2ud_chop : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b111, ":chop">;
-def F2_conv_sf2d_chop : F2_RDD_RS_CONVERT <"convert_sf2d", 0b110, ":chop">;
-def F2_conv_sf2ud_chop : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b101, ":chop">;
-
-// Convert Floating Point to Integer: non-chopped.
-let AddedComplexity = 20, Predicates = [HasV5T] in {
- def F2_conv_df2d : F2_RDD_RSS_CONVERT <"convert_df2d", 0b000>;
- def F2_conv_df2ud : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b001>;
- def F2_conv_sf2ud : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b011>;
- def F2_conv_sf2d : F2_RDD_RS_CONVERT <"convert_sf2d", 0b100>;
- def F2_conv_df2uw : F2_RD_RSS_CONVERT <"convert_df2uw", 0b011>;
- def F2_conv_df2w : F2_RD_RSS_CONVERT <"convert_df2w", 0b100>;
- def F2_conv_sf2uw : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b000>;
- def F2_conv_sf2w : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b000>;
-}
-
-// Fix up radicand.
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-def F2_sffixupr: SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs),
- "$Rd = sffixupr($Rs)",
- [], "" , S_2op_tc_3or4x_SLOT23>, Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1011101;
- let Inst{20-16} = Rs;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rd;
- }
-
-// F2_sffma: Floating-point fused multiply add.
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class T_sfmpy_acc <bit isSub, bit isLib>
- : MInst<(outs IntRegs:$Rx),
- (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
- "$Rx "#!if(isSub, "-=","+=")#" sfmpy($Rs, $Rt)"#!if(isLib, ":lib",""),
- [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
-
- let IClass = 0b1110;
-
- let Inst{27-21} = 0b1111000;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b1;
- let Inst{6} = isLib;
- let Inst{5} = isSub;
- let Inst{4-0} = Rx;
- }
-
-def F2_sffma: T_sfmpy_acc <0, 0>;
-def F2_sffms: T_sfmpy_acc <1, 0>;
-def F2_sffma_lib: T_sfmpy_acc <0, 1>;
-def F2_sffms_lib: T_sfmpy_acc <1, 1>;
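
For orientation (a sketch read off the !if selectors in the template
string above, not text from the original file), the four defs assemble as:

//   F2_sffma:      Rx += sfmpy(Rs, Rt)
//   F2_sffms:      Rx -= sfmpy(Rs, Rt)
//   F2_sffma_lib:  Rx += sfmpy(Rs, Rt):lib
//   F2_sffms_lib:  Rx -= sfmpy(Rs, Rt):lib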
-
-// Floating-point fused multiply add w/ additional scaling (2**pu).
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-def F2_sffma_sc: MInst <
- (outs IntRegs:$Rx),
- (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt, PredRegs:$Pu),
- "$Rx += sfmpy($Rs, $Rt, $Pu):scale" ,
- [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
- Requires<[HasV5T]> {
- bits<5> Rx;
- bits<5> Rs;
- bits<5> Rt;
- bits<2> Pu;
-
- let IClass = 0b1110;
-
- let Inst{27-21} = 0b1111011;
- let Inst{20-16} = Rs;
- let Inst{13} = 0b0;
- let Inst{12-8} = Rt;
- let Inst{7} = 0b1;
- let Inst{6-5} = Pu;
- let Inst{4-0} = Rx;
- }
-
-//===----------------------------------------------------------------------===//
-// :natural forms of vasrh and vasrhub insns
-//===----------------------------------------------------------------------===//
-// S5_asrhub_rnd_sat: Vector arithmetic shift right by immediate with round,
-// saturate, and pack.
-let Defs = [USR_OVF], hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_ASRHUB<bit isSat>
- : SInst <(outs IntRegs:$Rd),
- (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rd = vasrhub($Rss, #$u4):"#!if(isSat, "sat", "raw"),
- [], "", S_2op_tc_2_SLOT23>,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<5> Rss;
- bits<4> u4;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1000011;
- let Inst{20-16} = Rss;
- let Inst{13-12} = 0b00;
- let Inst{11-8} = u4;
- let Inst{7-6} = 0b10;
- let Inst{5} = isSat;
- let Inst{4-0} = Rd;
- }
-
-def S5_asrhub_rnd_sat : T_ASRHUB <0>;
-def S5_asrhub_sat : T_ASRHUB <1>;
-
-let isAsmParserOnly = 1 in
-def S5_asrhub_rnd_sat_goodsyntax
- : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rd = vasrhub($Rss, #$u4):rnd:sat">, Requires<[HasV5T]>;
-
-// S5_vasrhrnd: Vector arithmetic shift right by immediate with round.
-let hasSideEffects = 0 in
-def S5_vasrhrnd : SInst <(outs DoubleRegs:$Rdd),
- (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rdd = vasrh($Rss, #$u4):raw">,
- Requires<[HasV5T]> {
- bits<5> Rdd;
- bits<5> Rss;
- bits<4> u4;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b0000001;
- let Inst{20-16} = Rss;
- let Inst{13-12} = 0b00;
- let Inst{11-8} = u4;
- let Inst{7-5} = 0b000;
- let Inst{4-0} = Rdd;
- }
-
-let isAsmParserOnly = 1 in
-def S5_vasrhrnd_goodsyntax
- : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
- "$Rdd = vasrh($Rss,#$u4):rnd">, Requires<[HasV5T]>;
-
-// Floating point reciprocal square root approximation
-let Uses = [USR], isPredicateLate = 1, isFP = 1,
- hasSideEffects = 0, hasNewValue = 1, opNewValue = 0,
- validSubTargets = HasV5SubT in
-def F2_sfinvsqrta: SInst <
- (outs IntRegs:$Rd, PredRegs:$Pe),
- (ins IntRegs:$Rs),
- "$Rd, $Pe = sfinvsqrta($Rs)" > ,
- Requires<[HasV5T]> {
- bits<5> Rd;
- bits<2> Pe;
- bits<5> Rs;
-
- let IClass = 0b1000;
-
- let Inst{27-21} = 0b1011111;
- let Inst{20-16} = Rs;
- let Inst{7} = 0b0;
- let Inst{6-5} = Pe;
- let Inst{4-0} = Rd;
- }
-
-// Complex multiply 32x16
-let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
- def M4_cmpyi_whc : T_S3op_8<"cmpyiwh", 0b101, 1, 1, 1, 1>;
- def M4_cmpyr_whc : T_S3op_8<"cmpyrwh", 0b111, 1, 1, 1, 1>;
-}
-
-// Classify floating-point value
-let Uses = [USR], isFP = 1 in
-def F2_sfclass : T_TEST_BIT_IMM<"sfclass", 0b111>, Requires<[HasV5T]>;
-
-let Uses = [USR], isFP = 1 in
-def F2_dfclass: ALU64Inst<(outs PredRegs:$Pd), (ins DoubleRegs:$Rss, u5_0Imm:$u5),
- "$Pd = dfclass($Rss, #$u5)",
- [], "" , ALU64_tc_2early_SLOT23 > , Requires<[HasV5T]> {
- bits<2> Pd;
- bits<5> Rss;
- bits<5> u5;
-
- let IClass = 0b1101;
- let Inst{27-21} = 0b1100100;
- let Inst{20-16} = Rss;
- let Inst{12-10} = 0b000;
- let Inst{9-5} = u5;
- let Inst{4-3} = 0b10;
- let Inst{1-0} = Pd;
- }
-
-// Instructions to create floating point constant
-class T_fimm <string mnemonic, RegisterClass RC, bits<4> RegType, bit isNeg>
- : ALU64Inst<(outs RC:$dst), (ins u10_0Imm:$src),
- "$dst = "#mnemonic#"(#$src)"#!if(isNeg, ":neg", ":pos"),
- [], "", ALU64_tc_2_SLOT23>, Requires<[HasV5T]> {
- bits<5> dst;
- bits<10> src;
-
- let IClass = 0b1101;
- let Inst{27-24} = RegType;
- let Inst{23} = 0b0;
- let Inst{22} = isNeg;
- let Inst{21} = src{9};
- let Inst{13-5} = src{8-0};
- let Inst{4-0} = dst;
- }
-
-let hasNewValue = 1, opNewValue = 0 in {
- def F2_sfimm_p : T_fimm <"sfmake", IntRegs, 0b0110, 0>;
- def F2_sfimm_n : T_fimm <"sfmake", IntRegs, 0b0110, 1>;
-}
-
-def F2_dfimm_p : T_fimm <"dfmake", DoubleRegs, 0b1001, 0>;
-def F2_dfimm_n : T_fimm <"dfmake", DoubleRegs, 0b1001, 1>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td
deleted file mode 100644
index c50141b..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td
+++ /dev/null
@@ -1,2068 +0,0 @@
-//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V60 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-// Vector load
-let Predicates = [HasV60T, UseHVX] in
-let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
- class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = CVI_VM_LD,
- IType type = TypeCVI_VM_LD>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-// Vector store
-let Predicates = [HasV60T, UseHVX] in
-let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
-class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = CVI_VM_ST,
- IType type = TypeCVI_VM_ST>
-: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-//===----------------------------------------------------------------------===//
-// Vector loads with base + immediate offset
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, accessSize = Vector64Access in
-class T_vload_ai<string asmStr>
- : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2),
- asmStr>;
-
-let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in
-class T_vload_ai_128B<string asmStr>
- : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2),
- asmStr>;
-
-let isCVLoadable = 1, hasNewValue = 1 in {
- def V6_vL32b_ai : T_vload_ai <"$dst = vmem($src1+#$src2)">,
- V6_vL32b_ai_enc;
- def V6_vL32b_nt_ai : T_vload_ai <"$dst = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_ai_enc;
- // 128B
- def V6_vL32b_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">,
- V6_vL32b_ai_128B_enc;
- def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in {
- def V6_vL32Ub_ai : T_vload_ai <"$dst = vmemu($src1+#$src2)">,
- V6_vL32Ub_ai_enc;
- def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">,
- V6_vL32Ub_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1,
- hasNewValue = 1 in {
- def V6_vL32b_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">,
- V6_vL32b_cur_ai_enc;
- def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_cur_ai_enc;
- // 128B
- def V6_vL32b_cur_ai_128B : T_vload_ai_128B
- <"$dst.cur = vmem($src1+#$src2)">,
- V6_vL32b_cur_ai_128B_enc;
- def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B
- <"$dst.cur = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_cur_ai_128B_enc;
-}
-
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in {
- def V6_vL32b_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">,
- V6_vL32b_tmp_ai_enc;
- def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_tmp_ai_enc;
- // 128B
- def V6_vL32b_tmp_ai_128B : T_vload_ai_128B
- <"$dst.tmp = vmem($src1+#$src2)">,
- V6_vL32b_tmp_ai_128B_enc;
- def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B
-                                       <"$dst.tmp = vmem($src1+#$src2):nt">,
- V6_vL32b_nt_tmp_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - unconditional
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, accessSize = Vector64Access, isPredicable = 1 in
-class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isNT>
- : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
-
-let isNVStorable = 1 in {
- def V6_vS32b_ai : T_vstore_ai_64B <"vmem", "vS32b_ai">,
- V6_vS32b_ai_enc;
- def V6_vS32b_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai">,
- V6_vS32b_ai_128B_enc;
-}
-
-let isNVStorable = 1, isNonTemporal = 1 in {
- def V6_vS32b_nt_ai : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_nt_ai_enc;
- def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_nt_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_ai_enc;
- def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_ai_128B_enc;
-}
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - unconditional new
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1,
- isPredicable = 1, Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
-class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
- : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_ai_64B <string baseOp, bit isNT = 0>
- : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_ai_128B <string baseOp, bit isNT = 0>
- : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
-
-def V6_vS32b_new_ai : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc;
-def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">,
- V6_vS32b_new_ai_128B_enc;
-
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_ai : T_vstore_new_ai_64B<"vS32b_ai", 1>,
- V6_vS32b_nt_new_ai_enc;
- def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>,
- V6_vS32b_nt_new_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - conditional
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isPredicated = 1 in
-class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) "
- #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pred_ai_64B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pred_ai_128B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-let isNVStorable = 1 in {
- def V6_vS32b_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">,
- V6_vS32b_pred_ai_enc;
- def V6_vS32b_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_npred_ai_enc;
- // 128B
- def V6_vS32b_pred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">,
- V6_vS32b_pred_ai_128B_enc;
- def V6_vS32b_npred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>,
- V6_vS32b_npred_ai_128B_enc;
-}
-
-
-let isNVStorable = 1, isNonTemporal = 1 in {
- def V6_vS32b_nt_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>,
- V6_vS32b_nt_pred_ai_enc;
- def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>,
- V6_vS32b_nt_npred_ai_enc;
- // 128B
- def V6_vS32b_nt_pred_ai_128B : T_vstore_pred_ai_128B
- <"vmem", "vS32b_ai", 0, 1>,
- V6_vS32b_nt_pred_ai_128B_enc;
- def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B
- <"vmem", "vS32b_ai", 1, 1>,
- V6_vS32b_nt_npred_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_pred_ai_enc;
- def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>,
- V6_vS32Ub_npred_ai_enc;
- // 128B
- def V6_vS32Ub_pred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">,
- V6_vS32Ub_pred_ai_128B_enc;
- def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>,
- V6_vS32Ub_npred_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset in
-class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC,
- bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs),
- (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
- #!if(isNT, ":nt", "")#" = $src4"> {
- let isPredicatedFalse = isPredNot;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>;
-
-def V6_vS32b_qpred_ai : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc;
-def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>,
- V6_vS32b_nqpred_ai_enc;
-def V6_vS32b_nt_qpred_ai : T_vstore_qpred_ai_64B <0, 1>,
- V6_vS32b_nt_qpred_ai_enc;
-def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>,
- V6_vS32b_nt_nqpred_ai_enc;
-// 128B
-def V6_vS32b_qpred_ai_128B : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc;
-def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>,
- V6_vS32b_nqpred_ai_128B_enc;
-def V6_vS32b_nt_qpred_ai_128B : T_vstore_qpred_ai_128B<0, 1>,
- V6_vS32b_nt_qpred_ai_128B_enc;
-def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>,
- V6_vS32b_nt_nqpred_ai_128B_enc;
-
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - conditional new
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3,
- isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in
-class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC,
- bit isPredNot, bit isNT>
- : V6_STInst <(outs),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
- #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-
-def V6_vS32b_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai">,
- V6_vS32b_new_pred_ai_enc;
-def V6_vS32b_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>,
- V6_vS32b_new_npred_ai_enc;
-// 128B
-def V6_vS32b_new_pred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai">,
- V6_vS32b_new_pred_ai_128B_enc;
-def V6_vS32b_new_npred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>,
- V6_vS32b_new_npred_ai_128B_enc;
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>,
- V6_vS32b_nt_new_pred_ai_enc;
- def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>,
- V6_vS32b_nt_new_npred_ai_enc;
- // 128B
- def V6_vS32b_nt_new_pred_ai_128B : T_vstore_new_pred_ai_128B
- <"vS32b_ai", 0, 1>,
- V6_vS32b_nt_new_pred_ai_128B_enc;
- def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B
- <"vS32b_ai", 1, 1>,
- V6_vS32b_nt_new_npred_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, hasNewValue = 1 in
-class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC>
- : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$src2), asmStr, [],
- "$src1 = $_dst_">;
-
-let accessSize = Vector64Access in
-class T_vload_pi_64B <string asmStr>
- : T_vload_pi <asmStr, s3_6Imm, VectorRegs>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vload_pi_128B <string asmStr>
- : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>;
-
-let isCVLoadable = 1 in {
- def V6_vL32b_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">,
- V6_vL32b_pi_enc;
- def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_pi_enc;
- // 128B
- def V6_vL32b_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">,
- V6_vL32b_pi_128B_enc;
- def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in {
- def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">,
- V6_vL32Ub_pi_enc;
- // 128B
- def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">,
- V6_vL32Ub_pi_128B_enc;
-}
-
-let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in {
- def V6_vL32b_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">,
- V6_vL32b_cur_pi_enc;
- def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_cur_pi_enc;
- // 128B
- def V6_vL32b_cur_pi_128B : T_vload_pi_128B
- <"$dst.cur = vmem($src1++#$src2)">,
- V6_vL32b_cur_pi_128B_enc;
- def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B
- <"$dst.cur = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_cur_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
- def V6_vL32b_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">,
- V6_vL32b_tmp_pi_enc;
- def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_tmp_pi_enc;
-  // 128B
- def V6_vL32b_tmp_pi_128B : T_vload_pi_128B
- <"$dst.tmp = vmem($src1++#$src2)">,
- V6_vL32b_tmp_pi_128B_enc;
- def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B
- <"$dst.tmp = vmem($src1++#$src2):nt">,
- V6_vL32b_nt_tmp_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, isPredicable = 1 in
-class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isNT>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
- "$src1 = $_dst_">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0>
- : T_vstore_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
-
-let isNVStorable = 1 in {
- def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc;
- def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">,
- V6_vS32b_pi_128B_enc;
-}
-
-let isNVStorable = 1 , isNonTemporal = 1 in {
- def V6_vS32b_nt_pi : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_nt_pi_enc;
- def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_nt_pi_128B_enc;
-}
-
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pi : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pi_enc;
- def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment unconditional .new vector stores with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, isNVStore = 1 in
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
-    isPredicable = 1, opNewValue = 3 in
-class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
- "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
- "$src1 = $_dst_">, NewValueRel {
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pi_64B <string baseOp, bit isNT = 0>
- : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pi_128B <string baseOp, bit isNT = 0>
- : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
-
-
-def V6_vS32b_new_pi : T_vstore_new_pi_64B <"vS32b_pi">,
- V6_vS32b_new_pi_enc;
-def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">,
- V6_vS32b_new_pi_128B_enc;
-
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_pi : T_vstore_new_pi_64B <"vS32b_pi", 1>,
- V6_vS32b_nt_new_pi_enc;
- def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>,
- V6_vS32b_nt_new_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional vector stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, addrMode = PostInc in
-class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp,
- RegisterClass RC, bit isPredNot, bit isNT>
- : V6_STInst<(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">, NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pred_pi_64B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pred_pi_128B <string mnemonic, string baseOp,
- bit isPredNot = 0, bit isNT = 0>
- : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-let isNVStorable = 1 in {
- def V6_vS32b_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">,
- V6_vS32b_pred_pi_enc;
- def V6_vS32b_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_npred_pi_enc;
- // 128B
- def V6_vS32b_pred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">,
- V6_vS32b_pred_pi_128B_enc;
- def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>,
- V6_vS32b_npred_pi_128B_enc;
-}
-let isNVStorable = 1, isNonTemporal = 1 in {
- def V6_vS32b_nt_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>,
- V6_vS32b_nt_pred_pi_enc;
- def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>,
- V6_vS32b_nt_npred_pi_enc;
- // 128B
- def V6_vS32b_nt_pred_pi_128B : T_vstore_pred_pi_128B
- <"vmem", "vS32b_pi", 0, 1>,
- V6_vS32b_nt_pred_pi_128B_enc;
- def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B
- <"vmem", "vS32b_pi", 1, 1>,
- V6_vS32b_nt_npred_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pred_pi_enc;
- def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>,
- V6_vS32Ub_npred_pi_enc;
- // 128B
- def V6_vS32Ub_pred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">,
- V6_vS32Ub_pred_pi_128B_enc;
- def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>,
- V6_vS32Ub_npred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with immediate offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc in
-class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0,
- bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">;
-
-let accessSize = Vector64Access in
-class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0>
- : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>;
-
-def V6_vS32b_qpred_pi : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc;
-def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc;
-// 128B
-def V6_vS32b_qpred_pi_128B : T_vstore_qpred_pi_128B,
- V6_vS32b_qpred_pi_128B_enc;
-def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>,
- V6_vS32b_nqpred_pi_128B_enc;
-
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_qpred_pi : T_vstore_qpred_pi_64B <0, 1>,
- V6_vS32b_nt_qpred_pi_enc;
- def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>,
- V6_vS32b_nt_nqpred_pi_enc;
- // 128B
- def V6_vS32b_nt_qpred_pi_128B : T_vstore_qpred_pi_128B<0, 1>,
- V6_vS32b_nt_qpred_pi_128B_enc;
- def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>,
- V6_vS32b_nt_nqpred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional .new vector stores with immediate offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
- isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in
-class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC,
- bit isPredNot, bit isNT>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
- "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
- #!if(isNT, ":nt", "")#" = $src4.new", [],
- "$src2 = $_dst_"> , NewValueRel {
- let isPredicatedFalse = isPredNot;
- let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
- : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B,
- isPredNot, isNT>;
-
-def V6_vS32b_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi">,
- V6_vS32b_new_pred_pi_enc;
-def V6_vS32b_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>,
- V6_vS32b_new_npred_pi_enc;
-// 128B
-def V6_vS32b_new_pred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi">,
- V6_vS32b_new_pred_pi_128B_enc;
-def V6_vS32b_new_npred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>,
- V6_vS32b_new_npred_pi_128B_enc;
-let isNonTemporal = 1 in {
- def V6_vS32b_nt_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>,
- V6_vS32b_nt_new_pred_pi_enc;
- def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>,
- V6_vS32b_nt_new_npred_pi_enc;
- // 128B
- def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B
- <"vS32b_pi", 0, 1>,
- V6_vS32b_nt_new_pred_pi_128B_enc;
- def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B
- <"vS32b_pi", 1, 1>,
- V6_vS32b_nt_new_npred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector loads with register offset
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1 in
-class T_vload_ppu<string asmStr>
- : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2), asmStr, [],
- "$src1 = $_dst_">, NewValueRel;
-
-let isCVLoadable = 1 in {
- def V6_vL32b_ppu : T_vload_ppu <"$dst = vmem($src1++$src2)">,
- V6_vL32b_ppu_enc;
- def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">,
- V6_vL32b_nt_ppu_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in
-def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">,
- V6_vL32Ub_ppu_enc;
-
-let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in {
- def V6_vL32b_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">,
- V6_vL32b_cur_ppu_enc;
- def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">,
- V6_vL32b_nt_cur_ppu_enc;
-}
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
- def V6_vL32b_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">,
- V6_vL32b_tmp_ppu_enc;
- def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">,
- V6_vL32b_nt_tmp_ppu_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with register offset
-//===----------------------------------------------------------------------===//
-let isPredicable = 1 in
-class T_vstore_ppu <string mnemonic, bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
- mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
- "$src1 = $_dst_">, NewValueRel;
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
- def V6_vS32b_ppu : T_vstore_ppu <"vmem">,
- V6_vS32b_ppu_enc;
- let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in
- def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>,
- V6_vS32b_nt_ppu_enc;
-}
-
-let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in
-def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment .new vector stores with register offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
- isPredicable = 1, opNewValue = 3, isNVStore = 1 in
-class T_vstore_new_ppu <bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
- "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
- "$src1 = $_dst_">, NewValueRel;
-
-let BaseOpcode = "vS32b_ppu" in
-def V6_vS32b_new_ppu : T_vstore_new_ppu, V6_vS32b_new_ppu_enc;
-
-let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in
-def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional vector stores with register offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0>
- : V6_STInst<(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">, NewValueRel {
- let isPredicatedFalse = isPredNot;
-}
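-
-// Annotation (not in the original source): the !if string concatenation
-// above folds all four predicate/temporal forms into one class. Assuming
-// mnemonic = "vmem", isPredNot = 1 and isNT = 1, the asm string resolves
-// to:
-//   if (!$src1) vmem($src2++$src3):nt = $src4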
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
- def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc;
- def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc;
-}
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
- def V6_vS32b_nt_pred_ppu : T_vstore_pred_ppu <"vmem", 0, 1>,
- V6_vS32b_nt_pred_ppu_enc;
- def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>,
- V6_vS32b_nt_npred_ppu_enc;
-}
-
-let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU,
- Type = TypeCVI_VM_STU in {
- def V6_vS32Ub_pred_ppu : T_vstore_pred_ppu <"vmemu">,
- V6_vS32Ub_pred_ppu_enc;
- def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>,
- V6_vS32Ub_npred_ppu_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with register offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
- "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
- #!if(isNT, ":nt", "")#" = $src4", [],
- "$src2 = $_dst_">, NewValueRel;
-
-def V6_vS32b_qpred_ppu : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc;
-def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc;
-def V6_vS32b_nt_qpred_ppu : T_vstore_qpred_ppu<0, 1>,
- V6_vS32b_nt_qpred_ppu_enc;
-def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>,
- V6_vS32b_nt_nqpred_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional .new vector stores with register offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
- isNewValue = 1, opNewValue = 4, isNVStore = 1 in
-class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0>
- : V6_STInst <(outs IntRegs:$_dst_),
- (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
- "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
- #!if(isNT, ":nt", "")#" = $src4.new", [],
- "$src2 = $_dst_">, NewValueRel {
- let isPredicatedFalse = isPredNot;
-}
-
-let BaseOpcode = "vS32b_ppu" in {
- def V6_vS32b_new_pred_ppu : T_vstore_new_pred_ppu,
- V6_vS32b_new_pred_ppu_enc;
- def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>,
- V6_vS32b_new_npred_ppu_enc;
-}
-
-let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
-def V6_vS32b_nt_new_pred_ppu : T_vstore_new_pred_ppu<0, 1>,
- V6_vS32b_nt_new_pred_ppu_enc;
-def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>,
- V6_vS32b_nt_new_npred_ppu_enc;
-}
-
-
-// Vector load/store pseudos
-
-let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
-class STrivv_template<RegisterClass RC>
- : V6_STInst<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src), "", []>;
-
-def PS_vstorerw_ai: STrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vstorerwu_ai: STrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-
-
-let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
-class LDrivv_template<RegisterClass RC>
- : V6_LDInst<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off), "", []>;
-
-def PS_vloadrw_ai: LDrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vloadrwu_ai: LDrivv_template<VecDblRegs>,
- Requires<[HasV60T,UseHVXSgl]>;
-def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B>,
- Requires<[HasV60T,UseHVXDbl]>;
-
-// Store vector predicate pseudo.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
- isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
- def PS_vstorerq_ai : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_vstorerq_ai_128B : STInst<(outs),
- (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXDbl]>;
-}
-
-// Load vector predicate pseudo.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
- opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
- def PS_vloadrq_ai : LDInst<(outs VecPredRegs:$dst),
- (ins IntRegs:$base, s32_0Imm:$offset),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_vloadrq_ai_128B : LDInst<(outs VecPredRegs128B:$dst),
- (ins IntRegs:$base, s32_0Imm:$offset),
- ".error \"should not emit\"", []>,
- Requires<[HasV60T,UseHVXDbl]>;
-}
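-
-// Annotation: the ".error \"should not emit\"" asm string is a guard, not
-// real syntax. These pseudos appear to exist only for spilling/reloading
-// vector predicates during register allocation and are expected to be
-// expanded earlier in the backend; if one survives to the printer, the
-// assembler flags the bug.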
-
-class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
- string cstr = "", InstrItinClass itin = CVI_VA_DV,
- IType type = TypeCVI_VA_DV>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
- def PS_vselect: VSELInst<(outs VectorRegs:$dst),
- (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), "", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
- (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
- "", []>, Requires<[HasV60T,UseHVXDbl]>;
- def PS_wselect: VSELInst<(outs VecDblRegs:$dst),
- (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), "", []>,
- Requires<[HasV60T,UseHVXSgl]>;
- def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
- (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
- "", []>, Requires<[HasV60T,UseHVXDbl]>;
-}
-
-let hasNewValue = 1 in
-class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
- asmString >;
-
-multiclass T_vmpy <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_vmpy <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
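-
-// Sketch (annotation): each defm below creates a pair of records. For an
-// instantiation such as (encoding mix-ins omitted)
-//   defm V6_vtmpyb : T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">;
-// TableGen emits V6_vtmpyb over VecDblRegs (64-byte mode) plus an
-// isCodeGenOnly twin V6_vtmpyb_128B over VecDblRegs128B, obtained through
-// the !cast lookups on the register-class names.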
-
-multiclass T_vmpy_VV <string asmString>:
- T_vmpy <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_vmpy_WW <string asmString>:
- T_vmpy <asmString, VecDblRegs, VecDblRegs>;
-
-multiclass T_vmpy_VW <string asmString>:
- T_vmpy <asmString, VectorRegs, VecDblRegs>;
-
-multiclass T_vmpy_WV <string asmString>:
- T_vmpy <asmString, VecDblRegs, VectorRegs>;
-
-defm V6_vtmpyb :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc;
-defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc;
-defm V6_vdsaduh :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc;
-defm V6_vmpybus :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc;
-defm V6_vmpabus :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc;
-defm V6_vmpahb :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc;
-defm V6_vmpyh :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc;
-defm V6_vmpyuh :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc;
-defm V6_vmpyiwh :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc;
-defm V6_vtmpyhb :T_vmpy_WW<"$dst.w = vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc;
-defm V6_vmpyub :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc;
-
-let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in
-defm V6_vmpyihb :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc;
-
-defm V6_vdmpybus_dv :
- T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc;
-defm V6_vdmpyhsusat :
- T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc;
-defm V6_vdmpyhsuisat :
- T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc;
-defm V6_vdmpyhsat :
- T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc;
-defm V6_vdmpyhisat :
- T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc;
-defm V6_vdmpyhb_dv :
- T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc;
-defm V6_vmpyhss :
- T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc;
-defm V6_vmpyhsrs :
- T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in
-defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc;
-
-let Itinerary = CVI_VX, Type = TypeCVI_VX in {
-defm V6_vdmpyhb : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc;
-defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc;
-defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc;
-defm V6_vmpyiwb : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc;
-defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vasrw : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc;
-defm V6_vasrh : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc;
-defm V6_vaslw : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc;
-defm V6_vaslh : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc;
-defm V6_vlsrw : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc;
-defm V6_vlsrh : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc;
-}
-
-let hasNewValue = 1 in
-class T_HVX_alu <string asmString, InstrItinClass itin,
- RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
- asmString >{
- let Itinerary = itin;
- let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_alu <string asmString, RegisterClass RCout,
- RegisterClass RCin, InstrItinClass itin> {
- def NAME : T_HVX_alu <asmString, itin, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_alu <asmString, itin,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_alu_VV <string asmString>:
- T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>;
-
-multiclass T_HVX_alu_WW <string asmString>:
- T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>;
-
-multiclass T_HVX_alu_WV <string asmString>:
- T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>;
-
-
-let Itinerary = CVI_VX, Type = TypeCVI_VX in {
-defm V6_vrmpyubv :
- T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc;
-defm V6_vrmpybv :
- T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc;
-defm V6_vrmpybusv :
- T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc;
-defm V6_vabsdiffub :
- T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc;
-defm V6_vabsdiffh :
- T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc;
-defm V6_vabsdiffuh :
- T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc;
-defm V6_vabsdiffw :
- T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc;
-}
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vdmpyhvsat :
- T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc;
-defm V6_vmpyhvsrs :
- T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc;
-defm V6_vmpyih :
- T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc;
-}
-
-defm V6_vand :
- T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc;
-defm V6_vor :
- T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc;
-defm V6_vxor :
- T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc;
-defm V6_vaddw :
- T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc;
-defm V6_vaddubsat :
- T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc;
-defm V6_vadduhsat :
- T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc;
-defm V6_vaddhsat :
- T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc;
-defm V6_vaddwsat :
- T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc;
-defm V6_vsubb :
- T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc;
-defm V6_vsubh :
- T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc;
-defm V6_vsubw :
- T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc;
-defm V6_vsububsat :
- T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc;
-defm V6_vsubuhsat :
- T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc;
-defm V6_vsubhsat :
- T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc;
-defm V6_vsubwsat :
- T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc;
-defm V6_vavgub :
- T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc;
-defm V6_vavguh :
- T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc;
-defm V6_vavgh :
- T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc;
-defm V6_vavgw :
- T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc;
-defm V6_vnavgub :
- T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc;
-defm V6_vnavgh :
- T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc;
-defm V6_vnavgw :
- T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc;
-defm V6_vavgubrnd :
- T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc;
-defm V6_vavguhrnd :
- T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc;
-defm V6_vavghrnd :
- T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc;
-defm V6_vavgwrnd :
- T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc;
-
-defm V6_vmpybv :
- T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc;
-defm V6_vmpyubv :
- T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc;
-defm V6_vmpybusv :
- T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc;
-defm V6_vmpyhv :
- T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc;
-defm V6_vmpyuhv :
- T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc;
-defm V6_vmpyhus :
- T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc;
-defm V6_vaddubh :
- T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc;
-defm V6_vadduhw :
- T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc;
-defm V6_vaddhw :
- T_HVX_alu_WV <"$dst.w = vadd($src1.h,$src2.h)">, V6_vaddhw_enc;
-defm V6_vsububh :
- T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc;
-defm V6_vsubuhw :
- T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc;
-defm V6_vsubhw :
- T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc;
-
-defm V6_vaddb_dv :
- T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc;
-defm V6_vaddh_dv :
- T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc;
-defm V6_vaddw_dv :
- T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc;
-defm V6_vaddubsat_dv :
- T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc;
-defm V6_vadduhsat_dv :
- T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc;
-defm V6_vaddhsat_dv :
- T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc;
-defm V6_vaddwsat_dv :
- T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc;
-defm V6_vsubb_dv :
- T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc;
-defm V6_vsubh_dv :
- T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc;
-defm V6_vsubw_dv :
- T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc;
-defm V6_vsububsat_dv :
- T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc;
-defm V6_vsubuhsat_dv :
- T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc;
-defm V6_vsubhsat_dv :
- T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc;
-defm V6_vsubwsat_dv :
- T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc;
-
-let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in {
-defm V6_vmpabusv :
- T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc;
-defm V6_vmpabuuv :
- T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc;
-}
-
-let isAccumulator = 1, hasNewValue = 1 in
-class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout,
- RegisterClass RCin1, RegisterClass RCin2>
- : CVI_VA_Resource1 <(outs RCout:$dst),
- (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString,
- [], "$dst = $_src_" > {
- let Itinerary = itin;
- let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout,
- RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > {
- def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vmpyacc <asmString, itin,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin1#"128B"),
- !cast<RegisterClass>(RCin2#
- !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>;
-}
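-
-// Annotation: the !if(!eq(!cast<string>(RCin2), "IntRegs"), "", "128B")
-// guard leaves a scalar IntRegs operand unchanged in the _128B variant;
-// only the vector register classes get the "128B" suffix, since there is
-// no IntRegs128B class to cast to.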
-
-multiclass T_HVX_vmpyacc_VVR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>;
-
-multiclass T_HVX_vmpyacc_VWR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WVR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WWR <string asmString>:
- T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_VVV <string asmString>:
- T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WVV <string asmString>:
- T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
-
-
-defm V6_vtmpyb_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.b,$src2.b)">,
- V6_vtmpyb_acc_enc;
-defm V6_vtmpybus_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">,
- V6_vtmpybus_acc_enc;
-defm V6_vtmpyhb_acc :
- T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">,
- V6_vtmpyhb_acc_enc;
-defm V6_vdmpyhb_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">,
- V6_vdmpyhb_acc_enc;
-defm V6_vrmpyub_acc :
- T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
- V6_vrmpyub_acc_enc;
-defm V6_vrmpybus_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">,
- V6_vrmpybus_acc_enc;
-defm V6_vdmpybus_acc :
- T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
- V6_vdmpybus_acc_enc;
-defm V6_vdmpybus_dv_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
- V6_vdmpybus_dv_acc_enc;
-defm V6_vdmpyhsuisat_acc :
- T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">,
- V6_vdmpyhsuisat_acc_enc;
-defm V6_vdmpyhisat_acc :
- T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
- V6_vdmpyhisat_acc_enc;
-defm V6_vdmpyhb_dv_acc :
- T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">,
- V6_vdmpyhb_dv_acc_enc;
-defm V6_vmpybus_acc :
- T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">,
- V6_vmpybus_acc_enc;
-defm V6_vmpabus_acc :
- T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">,
- V6_vmpabus_acc_enc;
-defm V6_vmpahb_acc :
- T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">,
- V6_vmpahb_acc_enc;
-defm V6_vmpyhsat_acc :
- T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">,
- V6_vmpyhsat_acc_enc;
-defm V6_vmpyuh_acc :
- T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
- V6_vmpyuh_acc_enc;
-defm V6_vmpyiwb_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">,
- V6_vmpyiwb_acc_enc;
-defm V6_vdsaduh_acc :
- T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">,
- V6_vdsaduh_acc_enc;
-defm V6_vmpyihb_acc :
- T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">,
- V6_vmpyihb_acc_enc;
-defm V6_vmpyub_acc :
- T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
- V6_vmpyub_acc_enc;
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vdmpyhsusat_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">,
- V6_vdmpyhsusat_acc_enc;
-defm V6_vdmpyhsat_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
- V6_vdmpyhsat_acc_enc;
-defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR
- <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vaslw_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc;
-defm V6_vasrw_acc :
- T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc;
-}
-
-defm V6_vdmpyhvsat_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
- V6_vdmpyhvsat_acc_enc;
-defm V6_vmpybusv_acc :
- T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">,
- V6_vmpybusv_acc_enc;
-defm V6_vmpybv_acc :
- T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc;
-defm V6_vmpyhus_acc :
- T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc;
-defm V6_vmpyhv_acc :
- T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc;
-defm V6_vmpyiewh_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">,
- V6_vmpyiewh_acc_enc;
-defm V6_vmpyiewuh_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">,
- V6_vmpyiewuh_acc_enc;
-defm V6_vmpyih_acc :
- T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc;
-defm V6_vmpyowh_rnd_sacc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">,
- V6_vmpyowh_rnd_sacc_enc;
-defm V6_vmpyowh_sacc :
- T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">,
- V6_vmpyowh_sacc_enc;
-defm V6_vmpyubv_acc :
- T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
- V6_vmpyubv_acc_enc;
-defm V6_vmpyuhv_acc :
- T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
- V6_vmpyuhv_acc_enc;
-defm V6_vrmpybusv_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">,
- V6_vrmpybusv_acc_enc;
-defm V6_vrmpybv_acc :
- T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc;
-defm V6_vrmpyubv_acc :
- T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
- V6_vrmpyubv_acc_enc;
-
-
-class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString,
- [], "$dst = $_src_" > {
- let Itinerary = CVI_VA;
- let Type = TypeCVI_VA;
-}
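-
-// Annotation: the "$dst = $_src_" constraint ties the result predicate to
-// the extra $_src_ input, so the compares below read the live predicate
-// and accumulate into it (the &=, |=, ^= forms) instead of overwriting it.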
-
-multiclass T_HVX_vcmp <string asmString> {
- def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_veqb_and :
- T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc;
-defm V6_veqh_and :
- T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc;
-defm V6_veqw_and :
- T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc;
-defm V6_vgtb_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc;
-defm V6_vgth_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc;
-defm V6_vgtw_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc;
-defm V6_vgtub_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc;
-defm V6_vgtuh_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc;
-defm V6_vgtuw_and :
- T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc;
-defm V6_veqb_or :
- T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc;
-defm V6_veqh_or :
- T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc;
-defm V6_veqw_or :
- T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc;
-defm V6_vgtb_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc;
-defm V6_vgth_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc;
-defm V6_vgtw_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc;
-defm V6_vgtub_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc;
-defm V6_vgtuh_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc;
-defm V6_vgtuw_or :
- T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc;
-defm V6_veqb_xor :
- T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc;
-defm V6_veqh_xor :
- T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc;
-defm V6_veqw_xor :
- T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc;
-defm V6_vgtb_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc;
-defm V6_vgth_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc;
-defm V6_vgtw_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc;
-defm V6_vgtub_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc;
-defm V6_vgtuh_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc;
-defm V6_vgtuw_xor :
- T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc;
-
-defm V6_vminub :
- T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc;
-defm V6_vminuh :
- T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc;
-defm V6_vminh :
- T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc;
-defm V6_vminw :
- T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc;
-defm V6_vmaxub :
- T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc;
-defm V6_vmaxuh :
- T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc;
-defm V6_vmaxh :
- T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc;
-defm V6_vmaxw :
- T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc;
-defm V6_vshuffeb :
- T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc;
-defm V6_vshuffob :
- T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc;
-defm V6_vshufeh :
- T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc;
-defm V6_vshufoh :
- T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc;
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vmpyowh_rnd :
- T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">,
- V6_vmpyowh_rnd_enc;
-defm V6_vmpyiewuh :
- T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc;
-defm V6_vmpyewuh :
- T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc;
-defm V6_vmpyowh :
- T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc;
-defm V6_vmpyiowh :
- T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc;
-}
-let Itinerary = CVI_VX, Type = TypeCVI_VX in
-defm V6_vmpyieoh :
- T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in {
-defm V6_vshufoeh :
- T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc;
-defm V6_vshufoeb :
- T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc;
-}
-
-let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
-defm V6_vcombine :
- T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc;
-
-let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in {
-defm V6_vsathub :
- T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc;
-defm V6_vsatwh :
- T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vroundwh :
- T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc;
-defm V6_vroundwuh :
- T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc;
-defm V6_vroundhb :
- T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc;
-defm V6_vroundhub :
- T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc;
-defm V6_vasrwv :
- T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc;
-defm V6_vlsrwv :
- T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc;
-defm V6_vlsrhv :
- T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc;
-defm V6_vasrhv :
- T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc;
-defm V6_vaslwv :
- T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc;
-defm V6_vaslhv :
- T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc;
-}
-
-defm V6_vaddb :
- T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc;
-defm V6_vaddh :
- T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in {
-defm V6_vdelta :
- T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc;
-defm V6_vrdelta :
- T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc;
-defm V6_vdealb4w :
- T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc;
-defm V6_vpackeb :
- T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc;
-defm V6_vpackeh :
- T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc;
-defm V6_vpackhub_sat :
- T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc;
-defm V6_vpackhb_sat :
- T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc;
-defm V6_vpackwuh_sat :
- T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc;
-defm V6_vpackwh_sat :
- T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc;
-defm V6_vpackob :
- T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc;
-defm V6_vpackoh :
- T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2>
- : CVI_VA_Resource1 <(outs RC2:$dst),
- (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString,
- [], "$dst = $_src_" > {
- let Itinerary = CVI_VA;
- let Type = TypeCVI_VA;
-}
-
-multiclass T_HVX_condALU <string asmString> {
- def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_vaddbq : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">,
- V6_vaddbq_enc;
-defm V6_vaddhq : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">,
- V6_vaddhq_enc;
-defm V6_vaddwq : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">,
- V6_vaddwq_enc;
-defm V6_vsubbq : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">,
- V6_vsubbq_enc;
-defm V6_vsubhq : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">,
- V6_vsubhq_enc;
-defm V6_vsubwq : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">,
- V6_vsubwq_enc;
-defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">,
- V6_vaddbnq_enc;
-defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">,
- V6_vaddhnq_enc;
-defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">,
- V6_vaddwnq_enc;
-defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">,
- V6_vsubbnq_enc;
-defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">,
- V6_vsubhnq_enc;
-defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">,
- V6_vsubwnq_enc;
-
-let hasNewValue = 1 in
-class T_HVX_alu_2op <string asmString, InstrItinClass itin,
- RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1),
- asmString >{
- let Itinerary = itin;
- let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout,
- RegisterClass RCin, InstrItinClass itin> {
- def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_alu_2op <asmString, itin,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-let hasNewValue = 1 in
-multiclass T_HVX_alu_2op_VV <string asmString>:
- T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>;
-
-multiclass T_HVX_alu_2op_WV <string asmString>:
- T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>;
-
-
-defm V6_vabsh : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">,
- V6_vabsh_enc;
-defm V6_vabsw : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">,
- V6_vabsw_enc;
-defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">,
- V6_vabsh_sat_enc;
-defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">,
- V6_vabsw_sat_enc;
-defm V6_vnot : T_HVX_alu_2op_VV <"$dst = vnot($src1)">,
- V6_vnot_enc;
-defm V6_vassign : T_HVX_alu_2op_VV <"$dst = $src1">,
- V6_vassign_enc;
-
-defm V6_vzb : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">,
- V6_vzb_enc;
-defm V6_vzh : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">,
- V6_vzh_enc;
-defm V6_vsb : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">,
- V6_vsb_enc;
-defm V6_vsh : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">,
- V6_vsh_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in {
-defm V6_vdealh : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">,
- V6_vdealh_enc;
-defm V6_vdealb : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">,
- V6_vdealb_enc;
-defm V6_vshuffh : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">,
- V6_vshuffh_enc;
-defm V6_vshuffb : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">,
- V6_vshuffb_enc;
-}
-
-let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in {
-defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">,
- V6_vunpackub_enc;
-defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">,
- V6_vunpackuh_enc;
-defm V6_vunpackb : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">,
- V6_vunpackb_enc;
-defm V6_vunpackh : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">,
- V6_vunpackh_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vcl0w : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">,
- V6_vcl0w_enc;
-defm V6_vcl0h : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">,
- V6_vcl0h_enc;
-defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">,
- V6_vnormamtw_enc;
-defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">,
- V6_vnormamth_enc;
-defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">,
- V6_vpopcounth_enc;
-}
-
-let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG,
- Type = TypeCVI_VX_DV in
-class T_HVX_vmpyacc2 <string asmString, RegisterClass RC>
- : CVI_VA_Resource1 <(outs RC:$dst),
- (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
- asmString, [], "$dst = $_src_" > ;
-
-
-multiclass T_HVX_vmpyacc2 <string asmString> {
- def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>;
-}
-
-defm V6_vrmpybusi_acc :
- T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">,
- V6_vrmpybusi_acc_enc;
-defm V6_vrsadubi_acc :
- T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">,
- V6_vrsadubi_acc_enc;
-defm V6_vrmpyubi_acc :
- T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">,
- V6_vrmpyubi_acc_enc;
-
-
-let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in
-class T_HVX_vmpy2 <string asmString, RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
- asmString>;
-
-
-multiclass T_HVX_vmpy2 <string asmString> {
- def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
-}
-
-defm V6_vrmpybusi :
- T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
-defm V6_vrsadubi :
- T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
-defm V6_vrmpyubi :
- T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
-
-
-let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
- hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
-class T_HVX_perm <string asmString, RegisterClass RC>
- : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
- (ins RC:$src1, RC:$src2, IntRegs:$src3),
- asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >;
-
-multiclass T_HVX_perm <string asmString> {
- def NAME : T_HVX_perm <asmString, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
-}
-
-let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
- defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
- defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
-}
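-
-// Annotation: T_HVX_perm constrains both outputs back onto the two vector
-// inputs ("$_dst1_ = $src1, $_dst2_ = $src2"), so vshuff/vdeal permute a
-// register pair in place, steered by the scalar $src3.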
-
-// Conditional vector move.
-let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_HVX_cmov <bit isPredNot, RegisterClass RC>
- : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
- "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> {
- let isPredicatedFalse = isPredNot;
-}
-
-multiclass T_HVX_cmov <bit isPredNot = 0> {
- def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
-}
-
-defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
-defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
-
-// Conditional vector combine.
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1,
- hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 < (outs RCout:$dst),
- (ins PredRegs:$src1, RCin:$src2, RCin:$src3),
- "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> {
- let isPredicatedFalse = isPredNot;
-}
-
-multiclass T_HVX_ccombine <bit isPredNot = 0> {
- def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>;
-}
-
-defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc;
-defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc;
-
-let hasNewValue = 1 in
-class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_DV_Resource1<(outs RCout:$dst),
- (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
- asmString >;
-
-multiclass T_HVX_shift <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_HVX_shift <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_shift_VV <string asmString>:
- T_HVX_shift <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_shift_WV <string asmString>:
- T_HVX_shift <asmString, VecDblRegs, VectorRegs>;
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in {
-defm V6_valignb :
- T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc;
-defm V6_vlalignb :
- T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vasrwh :
- T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc;
-defm V6_vasrwhsat :
- T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">,
- V6_vasrwhsat_enc;
-defm V6_vasrwhrndsat :
- T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">,
- V6_vasrwhrndsat_enc;
-defm V6_vasrwuhsat :
- T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">,
- V6_vasrwuhsat_enc;
-defm V6_vasrhubsat :
- T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">,
- V6_vasrhubsat_enc;
-defm V6_vasrhubrndsat :
- T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">,
- V6_vasrhubrndsat_enc;
-defm V6_vasrhbrndsat :
- T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">,
- V6_vasrhbrndsat_enc;
-}
-
-// Assembler-mapped instruction -- should this be an alias instead?
-//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc;
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in {
-defm V6_vshuffvdd :
- T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc;
-defm V6_vdealvdd :
- T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc;
-}
-
-let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in
-class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1),
- asmString, [], "$dst = $_src_">;
-
-multiclass T_HVX_unpack <string asmString> {
- def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>;
-}
-
-defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc;
-defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc;
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1,
- hasSideEffects = 0 in
-class T_HVX_valign <string asmString, RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3_0Imm:$src3),
- asmString>;
-
-multiclass T_HVX_valign <string asmString> {
- def NAME : T_HVX_valign <asmString, VectorRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>;
-}
-
-defm V6_valignbi :
- T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc;
-defm V6_vlalignbi :
- T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
-class T_HVX_predAlu <string asmString, RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2),
- asmString>;
-
-multiclass T_HVX_predAlu <string asmString> {
- def NAME : T_HVX_predAlu <asmString, VecPredRegs>;
-
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>;
-}
-
-defm V6_pred_and : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc;
-defm V6_pred_or : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc;
-defm V6_pred_xor : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc;
-defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc;
-defm V6_pred_and_n :
- T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA in
-class T_HVX_prednot <RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1),
- "$dst = not($src1)">, V6_pred_not_enc;
-
-def V6_pred_not : T_HVX_prednot <VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>;
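-
-// Annotation: here the encoding mix-in (V6_pred_not_enc) rides on the
-// class itself, so the 64B and 128B defs above share it; the
-// multiclass-based templates earlier in the file instead attach a *_enc
-// record per defm.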
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA in
-class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
- asmString >;
-
-multiclass T_HVX_vcmp2 <string asmString> {
- def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_veqb : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc;
-defm V6_veqh : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc;
-defm V6_veqw : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc;
-defm V6_vgtb : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc;
-defm V6_vgth : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc;
-defm V6_vgtw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc;
-defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc;
-defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc;
-defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc;
-
-let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
- "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc;
-
-def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>;
-
-let isAccumulator = 1 in
-class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
- "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc;
-
-def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst),
- (ins RCin:$src1, IntRegs:$src2),
- "$dst = vand($src1,$src2)" >, V6_vandqrt_enc;
-
-def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_lvsplatw <RegisterClass RC>
- : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1),
- "$dst = vsplat($src1)" >, V6_lvsplatw_enc;
-
-def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>;
-
-
-let hasNewValue = 1 in
-class T_V6_vinsertwr <RegisterClass RC>
- : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1),
- "$dst.w = vinsert($src1)", [], "$dst = $_src_">,
- V6_vinsertwr_enc;
-
-def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>;
-
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in
-class T_V6_pred_scalar2 <RegisterClass RC>
- : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1),
- "$dst = vsetq($src1)">, V6_pred_scalar2_enc;
-
-def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>;
-
-class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin>
- : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
- "$dst = vand($src1,$src2)">, V6_vandvrt_enc;
-
-def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>;
-
-let validSubTargets = HasV60SubT in
-class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp >
- : SInst2 <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2), asmString>;
-
-class T_HVX_rol_R <string asmString>
- : T_HVX_rol <asmString, IntRegs, u5_0Imm>;
-class T_HVX_rol_P <string asmString>
- : T_HVX_rol <asmString, DoubleRegs, u6_0Imm>;
-
-def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc;
-let hasNewValue = 1, opNewValue = 0 in
-def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc;
-
-let validSubTargets = HasV60SubT in
-class T_HVX_rol_acc <string asmString, RegisterClass RC, Operand ImmOp>
- : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
- asmString, [], "$dst = $_src_" >;
-
-class T_HVX_rol_acc_P <string asmString>
- : T_HVX_rol_acc <asmString, DoubleRegs, u6_0Imm>;
-
-class T_HVX_rol_acc_R <string asmString>
- : T_HVX_rol_acc <asmString, IntRegs, u5_0Imm>;
-
-def S6_rol_i_p_nac :
- T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
-def S6_rol_i_p_acc :
- T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
-def S6_rol_i_p_and :
- T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
-def S6_rol_i_p_or :
- T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
-def S6_rol_i_p_xacc :
- T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
-
-let hasNewValue = 1, opNewValue = 0 in {
-def S6_rol_i_r_nac :
- T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
-def S6_rol_i_r_acc :
- T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
-def S6_rol_i_r_and :
- T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
-def S6_rol_i_r_or :
- T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
-def S6_rol_i_r_xacc :
- T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
-}
-
-let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
-class T_V6_extractw <RegisterClass RC>
- : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
- "$dst = vextract($src1,$src2)">, V6_extractw_enc;
-
-def V6_extractw : T_V6_extractw <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
-
-let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT in
-class T_sys0op <string asmString>
- : ST1Inst <(outs), (ins), asmString>;
-
-let isSolo = 1, validSubTargets = HasV55SubT in {
-def Y5_l2gunlock : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
-def Y5_l2gclean : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
-def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
-}
-
-class T_sys1op <string asmString, RegisterClass RC>
- : ST1Inst <(outs), (ins RC:$src1), asmString>;
-
-class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
-class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
-
-let isSoloAX = 1, validSubTargets = HasV55SubT in
-def Y5_l2unlocka : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
-
-let isSolo = 1, validSubTargets = HasV60SubT in {
-def Y6_l2gcleanpa : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
-def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
-}
-
-let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
- validSubTargets = HasV55SubT in
-def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
- "$dst = l2locka($src1)">, Y5_l2locka_enc;
-
-// Not defined on the etc side -- why?
-// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
-
-let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1,
- hasSideEffects = 0,
-validSubTargets = HasV55SubT in
-def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2),
- (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
- "$dst1,$dst2 = vacsh($src1,$src2)", [],
- "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
- hasSideEffects = 0 in
-class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
- RegisterClass RCin2>
- : CVI_VA_Resource1<(outs RCout:$dst),
- (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
-
-multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
- def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
- VecPredRegs128B, VectorRegs128B>;
-}
-
-multiclass T_HVX_alu2_V <string asmString> :
- T_HVX_alu2 <asmString, VectorRegs>;
-
-multiclass T_HVX_alu2_W <string asmString> :
- T_HVX_alu2 <asmString, VecDblRegs>;
-
-defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
- hasSideEffects = 0 in
-defm V6_vmux : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
-
-class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
- : CVI_VA_Resource1<(outs RCout:$dst),
- (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
-
-multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_vlutb_V <string asmString> :
- T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_vlutb_W <string asmString> :
- T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
-
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
-class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
- RegisterClass RCin>
- : CVI_VA_Resource1<(outs RCout:$dst),
- (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
- asmString, [], "$dst = $_src_">;
-
-multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
- RegisterClass RCin> {
- def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
- let isCodeGenOnly = 1 in
- def NAME#_128B : T_HVX_vlutb_acc<asmString,
- !cast<RegisterClass>(RCout#"128B"),
- !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_vlutb_acc_V <string asmString> :
- T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_vlutb_acc_W <string asmString> :
- T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
-
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
-defm V6_vlutvvb:
- T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
-
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
-defm V6_vlutvwh:
- T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
-
-let hasNewValue = 1 in {
- defm V6_vlutvvb_oracc:
- T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
- V6_vlutvvb_oracc_enc;
- defm V6_vlutvwh_oracc:
- T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
- V6_vlutvwh_oracc_enc;
-}
-
-// Is this a fake instruction that should not be defined?
-def S2_cabacencbin
- : SInst2<(outs DoubleRegs:$dst),
- (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
-
-// Vhist instructions
-def V6_vhistq
- : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
- "vhist($src1)">, V6_vhistq_enc;
-
-def V6_vhist
- : CVI_HIST_Resource1 <(outs), (ins),
- "vhist" >, V6_vhist_enc;
-
-
-let isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
- def V6_vd0: CVI_VA_Resource<(outs VectorRegs:$dst), (ins), "$dst = #0", []>;
- def V6_vd0_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst), (ins),
- "$dst = #0", []>;
-
- def V6_vassignp: CVI_VA_Resource<(outs VecDblRegs:$dst),
- (ins VecDblRegs:$src), "", []>;
- def V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst),
- (ins VecDblRegs128B:$src), "", []>;
-
- def V6_lo: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
- "", []>;
- def V6_lo_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
- (ins VecDblRegs128B:$src1), "", []>;
-
- def V6_hi: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
- "", []>;
- def V6_hi_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
- (ins VecDblRegs128B:$src1), "", []>;
-}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
deleted file mode 100644
index e3520bd..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ /dev/null
@@ -1,69 +0,0 @@
-//===- HexagonInstrInfoVector.td - Hexagon Vector Patterns -*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon Vector instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-// Vector shift support. Vector shifting in Hexagon is rather different
-// from LLVM's internal representation.
-// LLVM assumes that all vector shifts have the form
-//   <VT> = SHL/SRA/SRL <VT> by <VT>
-// while Hexagon uses the form
-//   <VT> = SHL/SRA/SRL <VT> by <IT/i32>
-// As a result, special care is needed to guarantee correctness and
-// performance.
-class vshift_v4i16<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
- : S_2OpInstImm<Str, MajOp, MinOp, u4_0Imm, []> {
- bits<4> src2;
- let Inst{11-8} = src2;
-}
-
-class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
- : S_2OpInstImm<Str, MajOp, MinOp, u5_0Imm, []> {
- bits<5> src2;
- let Inst{12-8} = src2;
-}
-
-def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>;
-def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>;
-def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>;
-
-def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>;
-def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>;
-def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>;
-
-// Vector shift words by register
-def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>;
-def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>;
-def S2_asl_r_vw : T_S3op_shiftVect < "vaslw", 0b00, 0b10>;
-def S2_lsl_r_vw : T_S3op_shiftVect < "vlslw", 0b00, 0b11>;
-
-// Vector shift halfwords by register
-def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>;
-def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>;
-def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>;
-def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>;
-
-
-// Hexagon doesn't have a vector multiply with C semantics.
-// Instead, generate a pseudo instruction that gets expanded into two
-// scalar MPYI instructions.
-// This expansion is done by ExpandPostRAPseudos.
-let isPseudo = 1 in
-def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
- (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>;
-
-let isPseudo = 1 in
-def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
- (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
- "$Rd = $Rx">;
-
-
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
index d4f303b..104a286 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -1347,6 +1347,37 @@ def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw, s4_2ImmPred, I32>;
def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std, s4_3ImmPred, I64>;
def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
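+
+// Map each HVX masked-store intrinsic onto its store instruction, for both
+// the single and 128B double vector-length modes (UseHVXSgl/UseHVXDbl).
+// The immediate offset of the *_ai instruction form is fixed at #0 here.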
+multiclass MaskedStore <InstHexagon MI, Intrinsic IntID> {
+ def : Pat<(IntID VecPredRegs:$src1, IntRegs:$src2, VectorRegs:$src3),
+ (MI VecPredRegs:$src1, IntRegs:$src2, #0, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+
+ def : Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2,
+ VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1,
+ IntRegs:$src2, #0,
+ VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+defm : MaskedStore <V6_vS32b_qpred_ai, int_hexagon_V6_vmaskedstoreq>;
+defm : MaskedStore <V6_vS32b_nqpred_ai, int_hexagon_V6_vmaskedstorenq>;
+defm : MaskedStore <V6_vS32b_nt_qpred_ai, int_hexagon_V6_vmaskedstorentq>;
+defm : MaskedStore <V6_vS32b_nt_nqpred_ai, int_hexagon_V6_vmaskedstorentnq>;
+
+//*******************************************************************
+// SYSTEM
+//*******************************************************************
+
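+// Cache-maintenance and prefetch operations: dccleana, dccleaninva and
+// dcinva operate on the data-cache line addressed by a register, dczeroa
+// zeroes a line, and the l2fetch forms initiate an L2 prefetch.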
+def: T_R_pat<Y2_dccleana, int_hexagon_Y2_dccleana>;
+def: T_R_pat<Y2_dccleaninva, int_hexagon_Y2_dccleaninva>;
+def: T_R_pat<Y2_dcinva, int_hexagon_Y2_dcinva>;
+def: T_R_pat<Y2_dczeroa, int_hexagon_Y2_dczeroa>;
+
+def: T_RR_pat<Y4_l2fetch, int_hexagon_Y4_l2fetch>;
+def: T_RP_pat<Y5_l2fetch, int_hexagon_Y5_l2fetch>;
+
include "HexagonIntrinsicsV3.td"
include "HexagonIntrinsicsV4.td"
include "HexagonIntrinsicsV5.td"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
index a45e1c9..f438b3e 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -790,7 +790,7 @@ def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>;
defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
-def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
+//def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
def: Pat<(v64i16 (trunc v64i32:$Vdd)),
(v64i16 (V6_vpackwh_sat_128B
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td b/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td
deleted file mode 100644
index ebedf2c..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonIsetDx.td
+++ /dev/null
@@ -1,728 +0,0 @@
-//=- HexagonIsetDx.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon duplex instructions.
-//
-//===----------------------------------------------------------------------===//
-
-// SA1_combine1i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine1i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#1, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b01;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SL2_jumpr31_f: Indirect conditional jump if false.
-// SL2_jumpr31_f -> SL2_jumpr31_fnew
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_f: SUBInst <
- (outs ),
- (ins ),
- "if (!p0) jumpr r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b101;
- }
-
-// SL2_deallocframe: Deallocate stack frame.
-let Defs = [R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
-def SL2_deallocframe: SUBInst <
- (outs ),
- (ins ),
- "deallocframe"> {
- let Inst{12-6} = 0b1111100;
- let Inst{2} = 0b0;
- }
-
-// SL2_return_f: Deallocate stack frame and return.
-// SL2_return_f -> SL2_return_fnew
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_f: SUBInst <
- (outs ),
- (ins ),
- "if (!p0) dealloc_return"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b101;
- }
-
-// SA1_combine3i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine3i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#3, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b11;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SS2_storebi0: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS2_storebi0: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0),
- "memb($Rs + #$u4_0)=#0"> {
- bits<4> Rs;
- bits<4> u4_0;
-
- let Inst{12-8} = 0b10010;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_0;
- }
-
-// SA1_clrtnew: Clear if true.
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrtnew: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if ($Pu.new) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b100;
- let Inst{3-0} = Rd;
- }
-
-// SL2_loadruh_io: Load half.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadruh_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u3_1Imm:$u3_1),
- "$Rd = memuh($Rs + #$u3_1)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<4> u3_1;
-
- let Inst{12-11} = 0b01;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_1{3-1};
- }
-
-// SL2_jumpr31_tnew: Indirect conditional jump if true.
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_tnew: SUBInst <
- (outs ),
- (ins ),
- "if (p0.new) jumpr:nt r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b110;
- }
-
-// SA1_addi: Add.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 1, opExtentBits = 7, opExtendable = 2 in
-def SA1_addi: SUBInst <
- (outs IntRegs:$Rx),
- (ins IntRegs:$_src_, s7_0Ext:$s7),
- "$Rx = add($_src_, #$s7)" ,
- [] ,
- "$_src_ = $Rx"> {
- bits<4> Rx;
- bits<7> s7;
-
- let Inst{12-11} = 0b00;
- let Inst{3-0} = Rx;
- let Inst{10-4} = s7;
- }
-
-// SL1_loadrub_io: Load byte.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
-def SL1_loadrub_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0),
- "$Rd = memub($Rs + #$u4_0)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<4> u4_0;
-
- let Inst{12} = 0b1;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_0;
- }
-
-// SL1_loadri_io: Load word.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL1_loadri_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2),
- "$Rd = memw($Rs + #$u4_2)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<6> u4_2;
-
- let Inst{12} = 0b0;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_2{5-2};
- }
-
-// SA1_cmpeqi: Compare immed.
-let Defs = [P0], isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_cmpeqi: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u2_0Imm:$u2),
- "p0 = cmp.eq($Rs, #$u2)"> {
- bits<4> Rs;
- bits<2> u2;
-
- let Inst{12-8} = 0b11001;
- let Inst{7-4} = Rs;
- let Inst{1-0} = u2;
- }
-
-// SA1_combinerz: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combinerz: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins IntRegs:$Rs),
- "$Rdd = combine($Rs, #0)"> {
- bits<3> Rdd;
- bits<4> Rs;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b1;
- let Inst{3} = 0b1;
- let Inst{2-0} = Rdd;
- let Inst{7-4} = Rs;
- }
-
-// SL2_return_t: Deallocate stack frame and return.
-// SL2_return_t -> SL2_return_tnew
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_t: SUBInst <
- (outs ),
- (ins ),
- "if (p0) dealloc_return"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b100;
- }
-
-// SS2_allocframe: Allocate stack frame.
-let Defs = [R29, R30], Uses = [R30, R31, R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
-def SS2_allocframe: SUBInst <
- (outs ),
- (ins u5_3Imm:$u5_3),
- "allocframe(#$u5_3)"> {
- bits<8> u5_3;
-
- let Inst{12-9} = 0b1110;
- let Inst{8-4} = u5_3{7-3};
- }
-
-// SS2_storeh_io: Store half.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = HalfWordAccess in
-def SS2_storeh_io: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u3_1Imm:$u3_1, IntRegs:$Rt),
- "memh($Rs + #$u3_1) = $Rt"> {
- bits<4> Rs;
- bits<4> u3_1;
- bits<4> Rt;
-
- let Inst{12-11} = 0b00;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_1{3-1};
- let Inst{3-0} = Rt;
- }
-
-// SS2_storewi0: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storewi0: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2),
- "memw($Rs + #$u4_2)=#0"> {
- bits<4> Rs;
- bits<6> u4_2;
-
- let Inst{12-8} = 0b10000;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_2{5-2};
- }
-
-// SS2_storewi1: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storewi1: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2),
- "memw($Rs + #$u4_2)=#1"> {
- bits<4> Rs;
- bits<6> u4_2;
-
- let Inst{12-8} = 0b10001;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_2{5-2};
- }
-
-// SL2_jumpr31: Indirect unconditional jump.
-let Defs = [PC], Uses = [R31], isCodeGenOnly = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31: SUBInst <
- (outs ),
- (ins ),
- "jumpr r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2} = 0b0;
- }
-
-// SA1_combinezr: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combinezr: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins IntRegs:$Rs),
- "$Rdd = combine(#0, $Rs)"> {
- bits<3> Rdd;
- bits<4> Rs;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b1;
- let Inst{3} = 0b0;
- let Inst{2-0} = Rdd;
- let Inst{7-4} = Rs;
- }
-
-// SL2_loadrh_io: Load half.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadrh_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u3_1Imm:$u3_1),
- "$Rd = memh($Rs + #$u3_1)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<4> u3_1;
-
- let Inst{12-11} = 0b00;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_1{3-1};
- }
-
-// SA1_addrx: Add.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_addrx: SUBInst <
- (outs IntRegs:$Rx),
- (ins IntRegs:$_src_, IntRegs:$Rs),
- "$Rx = add($_src_, $Rs)" ,
- [] ,
- "$_src_ = $Rx"> {
- bits<4> Rx;
- bits<4> Rs;
-
- let Inst{12-8} = 0b11000;
- let Inst{3-0} = Rx;
- let Inst{7-4} = Rs;
- }
-
-// SA1_setin1: Set to -1.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_setin1: SUBInst <
- (outs IntRegs:$Rd),
- (ins ),
- "$Rd = #{-1}"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6} = 0b0;
- let Inst{3-0} = Rd;
- }
-
-// SA1_sxth: Sxth.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_sxth: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = sxth($Rs)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10100;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_combine0i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine0i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#0, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b00;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SA1_combine2i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine2i: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u2_0Imm:$u2),
- "$Rdd = combine(#2, #$u2)"> {
- bits<3> Rdd;
- bits<2> u2;
-
- let Inst{12-10} = 0b111;
- let Inst{8} = 0b0;
- let Inst{4-3} = 0b10;
- let Inst{2-0} = Rdd;
- let Inst{6-5} = u2;
- }
-
-// SA1_sxtb: Sxtb.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_sxtb: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = sxtb($Rs)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10101;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_clrf: Clear if false.
-// SA1_clrf -> SA1_clrfnew
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrf: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if (!$Pu) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b111;
- let Inst{3-0} = Rd;
- }
-
-// SL2_loadrb_io: Load byte.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadrb_io: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs, u3_0Imm:$u3_0),
- "$Rd = memb($Rs + #$u3_0)"> {
- bits<4> Rd;
- bits<4> Rs;
- bits<3> u3_0;
-
- let Inst{12-11} = 0b10;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- let Inst{10-8} = u3_0;
- }
-
-// SA1_tfr: Tfr.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_tfr: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = $Rs"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10000;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SL2_loadrd_sp: Load dword.
-let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
-def SL2_loadrd_sp: SUBInst <
- (outs DoubleRegs:$Rdd),
- (ins u5_3Imm:$u5_3),
- "$Rdd = memd(r29 + #$u5_3)"> {
- bits<3> Rdd;
- bits<8> u5_3;
-
- let Inst{12-8} = 0b11110;
- let Inst{2-0} = Rdd;
- let Inst{7-3} = u5_3{7-3};
- }
-
-// SA1_and1: And #1.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_and1: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = and($Rs, #1)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10010;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SS2_storebi1: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS2_storebi1: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0),
- "memb($Rs + #$u4_0)=#1"> {
- bits<4> Rs;
- bits<4> u4_0;
-
- let Inst{12-8} = 0b10011;
- let Inst{7-4} = Rs;
- let Inst{3-0} = u4_0;
- }
-
-// SA1_inc: Inc.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_inc: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = add($Rs, #1)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10001;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SS2_stored_sp: Store dword.
-let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
-def SS2_stored_sp: SUBInst <
- (outs ),
- (ins s6_3Imm:$s6_3, DoubleRegs:$Rtt),
- "memd(r29 + #$s6_3) = $Rtt"> {
- bits<9> s6_3;
- bits<3> Rtt;
-
- let Inst{12-9} = 0b0101;
- let Inst{8-3} = s6_3{8-3};
- let Inst{2-0} = Rtt;
- }
-
-// SS2_storew_sp: Store word.
-let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storew_sp: SUBInst <
- (outs ),
- (ins u5_2Imm:$u5_2, IntRegs:$Rt),
- "memw(r29 + #$u5_2) = $Rt"> {
- bits<7> u5_2;
- bits<4> Rt;
-
- let Inst{12-9} = 0b0100;
- let Inst{8-4} = u5_2{6-2};
- let Inst{3-0} = Rt;
- }
-
-// SL2_jumpr31_fnew: Indirect conditional jump if false.
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_fnew: SUBInst <
- (outs ),
- (ins ),
- "if (!p0.new) jumpr:nt r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b111;
- }
-
-// SA1_clrt: Clear if true.
-// SA1_clrt -> SA1_clrtnew
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrt: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if ($Pu) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b110;
- let Inst{3-0} = Rd;
- }
-
-// SL2_return: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return: SUBInst <
- (outs ),
- (ins ),
- "dealloc_return"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2} = 0b0;
- }
-
-// SA1_dec: Dec.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_dec: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = add($Rs,#{-1})"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10011;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_seti: Set immed.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 0, opExtentBits = 6, opExtendable = 1 in
-def SA1_seti: SUBInst <
- (outs IntRegs:$Rd),
- (ins u6_0Ext:$u6),
- "$Rd = #$u6"> {
- bits<4> Rd;
- bits<6> u6;
-
- let Inst{12-10} = 0b010;
- let Inst{3-0} = Rd;
- let Inst{9-4} = u6;
- }
-
-// SL2_jumpr31_t: Indirect conditional jump if true.
-// SL2_jumpr31_t -> SL2_jumpr31_tnew
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_t: SUBInst <
- (outs ),
- (ins ),
- "if (p0) jumpr r31"> {
- let Inst{12-6} = 0b1111111;
- let Inst{2-0} = 0b100;
- }
-
-// SA1_clrfnew: Clear if false.
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrfnew: SUBInst <
- (outs IntRegs:$Rd),
- (ins PredRegs:$Pu),
- "if (!$Pu.new) $Rd = #0"> {
- bits<4> Rd;
-
- let Inst{12-9} = 0b1101;
- let Inst{6-4} = 0b101;
- let Inst{3-0} = Rd;
- }
-
-// SS1_storew_io: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS1_storew_io: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_2Imm:$u4_2, IntRegs:$Rt),
- "memw($Rs + #$u4_2) = $Rt"> {
- bits<4> Rs;
- bits<6> u4_2;
- bits<4> Rt;
-
- let Inst{12} = 0b0;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_2{5-2};
- let Inst{3-0} = Rt;
- }
-
-// SA1_zxtb: Zxtb.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_zxtb: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = and($Rs, #255)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10111;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
-// SA1_addsp: Add.
-let Uses = [R29], isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_addsp: SUBInst <
- (outs IntRegs:$Rd),
- (ins u6_2Imm:$u6_2),
- "$Rd = add(r29, #$u6_2)"> {
- bits<4> Rd;
- bits<8> u6_2;
-
- let Inst{12-10} = 0b011;
- let Inst{3-0} = Rd;
- let Inst{9-4} = u6_2{7-2};
- }
-
-// SL2_loadri_sp: Load word.
-let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadri_sp: SUBInst <
- (outs IntRegs:$Rd),
- (ins u5_2Imm:$u5_2),
- "$Rd = memw(r29 + #$u5_2)"> {
- bits<4> Rd;
- bits<7> u5_2;
-
- let Inst{12-9} = 0b1110;
- let Inst{3-0} = Rd;
- let Inst{8-4} = u5_2{6-2};
- }
-
-// SS1_storeb_io: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS1_storeb_io: SUBInst <
- (outs ),
- (ins IntRegs:$Rs, u4_0Imm:$u4_0, IntRegs:$Rt),
- "memb($Rs + #$u4_0) = $Rt"> {
- bits<4> Rs;
- bits<4> u4_0;
- bits<4> Rt;
-
- let Inst{12} = 0b1;
- let Inst{7-4} = Rs;
- let Inst{11-8} = u4_0;
- let Inst{3-0} = Rt;
- }
-
-// SL2_return_tnew: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_tnew: SUBInst <
- (outs ),
- (ins ),
- "if (p0.new) dealloc_return:nt"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b110;
- }
-
-// SL2_return_fnew: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_fnew: SUBInst <
- (outs ),
- (ins ),
- "if (!p0.new) dealloc_return:nt"> {
- let Inst{12-6} = 0b1111101;
- let Inst{2-0} = 0b111;
- }
-
-// SA1_zxth: Zxth.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_zxth: SUBInst <
- (outs IntRegs:$Rd),
- (ins IntRegs:$Rs),
- "$Rd = zxth($Rs)"> {
- bits<4> Rd;
- bits<4> Rs;
-
- let Inst{12-8} = 0b10110;
- let Inst{3-0} = Rd;
- let Inst{7-4} = Rs;
- }
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
new file mode 100644
index 0000000..f82ad6c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -0,0 +1,2341 @@
+//===--- HexagonLoopIdiomRecognition.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "hexagon-lir"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <algorithm>
+#include <array>
+#include <deque>
+#include <functional>
+#include <set>
+#include <vector>
+
+using namespace llvm;
+
+static cl::opt<bool> DisableMemcpyIdiom("disable-memcpy-idiom",
+ cl::Hidden, cl::init(false),
+ cl::desc("Disable generation of memcpy in loop idiom recognition"));
+
+static cl::opt<bool> DisableMemmoveIdiom("disable-memmove-idiom",
+ cl::Hidden, cl::init(false),
+ cl::desc("Disable generation of memmove in loop idiom recognition"));
+
+static cl::opt<unsigned> RuntimeMemSizeThreshold("runtime-mem-idiom-threshold",
+ cl::Hidden, cl::init(0), cl::desc("Threshold (in bytes) for the runtime "
+ "check guarding the memmove."));
+
+static cl::opt<unsigned> CompileTimeMemSizeThreshold(
+ "compile-time-mem-idiom-threshold", cl::Hidden, cl::init(64),
+ cl::desc("Threshold (in bytes) to perform the transformation, if the "
+ "runtime loop count (mem transfer size) is known at compile-time."));
+
+static cl::opt<bool> OnlyNonNestedMemmove("only-nonnested-memmove-idiom",
+ cl::Hidden, cl::init(true),
+ cl::desc("Only enable generating memmove in non-nested loops"));
+
+cl::opt<bool> HexagonVolatileMemcpy("disable-hexagon-volatile-memcpy",
+ cl::Hidden, cl::init(false),
+ cl::desc("Enable Hexagon-specific memcpy for volatile destination."));
+
+static cl::opt<unsigned> SimplifyLimit("hlir-simplify-limit", cl::init(10000),
+ cl::Hidden, cl::desc("Maximum number of simplification steps in HLIR"));
+
+static const char *HexagonVolatileMemcpyName
+ = "hexagon_memcpy_forward_vp4cp4n2";
+
+
+namespace llvm {
+ void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
+ Pass *createHexagonLoopIdiomPass();
+}
+
+namespace {
+ class HexagonLoopIdiomRecognize : public LoopPass {
+ public:
+ static char ID;
+ explicit HexagonLoopIdiomRecognize() : LoopPass(ID) {
+ initializeHexagonLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override {
+ return "Recognize Hexagon-specific loop idioms";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ private:
+ unsigned getStoreSizeInBytes(StoreInst *SI);
+ int getSCEVStride(const SCEVAddRecExpr *StoreEv);
+ bool isLegalStore(Loop *CurLoop, StoreInst *SI);
+ void collectStores(Loop *CurLoop, BasicBlock *BB,
+ SmallVectorImpl<StoreInst*> &Stores);
+ bool processCopyingStore(Loop *CurLoop, StoreInst *SI, const SCEV *BECount);
+ bool coverLoop(Loop *L, SmallVectorImpl<Instruction*> &Insts) const;
+ bool runOnLoopBlock(Loop *CurLoop, BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock*> &ExitBlocks);
+ bool runOnCountableLoop(Loop *L);
+
+ AliasAnalysis *AA;
+ const DataLayout *DL;
+ DominatorTree *DT;
+ LoopInfo *LF;
+ const TargetLibraryInfo *TLI;
+ ScalarEvolution *SE;
+ bool HasMemcpy, HasMemmove;
+ };
+}
+
+char HexagonLoopIdiomRecognize::ID = 0;
+
+INITIALIZE_PASS_BEGIN(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
+ "Recognize Hexagon-specific loop idioms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
+ "Recognize Hexagon-specific loop idioms", false, false)
+
+
+namespace {
+ struct Simplifier {
+ typedef std::function<Value* (Instruction*, LLVMContext&)> Rule;
+
+ void addRule(const Rule &R) { Rules.push_back(R); }
+
+ private:
+ struct WorkListType {
+ WorkListType() = default;
+
+ void push_back(Value* V) {
+ // Do not push back duplicates.
+ if (!S.count(V)) { Q.push_back(V); S.insert(V); }
+ }
+ Value *pop_front_val() {
+ Value *V = Q.front(); Q.pop_front(); S.erase(V);
+ return V;
+ }
+ bool empty() const { return Q.empty(); }
+
+ private:
+ std::deque<Value*> Q;
+ std::set<Value*> S;
+ };
+
+ typedef std::set<Value*> ValueSetType;
+ std::vector<Rule> Rules;
+
+ public:
+ struct Context {
+ typedef DenseMap<Value*,Value*> ValueMapType;
+
+ Value *Root;
+ ValueSetType Used; // The set of all cloned values used by Root.
+ ValueSetType Clones; // The set of all cloned values.
+ LLVMContext &Ctx;
+
+ Context(Instruction *Exp)
+ : Ctx(Exp->getParent()->getParent()->getContext()) {
+ initialize(Exp);
+ }
+ ~Context() { cleanup(); }
+ void print(raw_ostream &OS, const Value *V) const;
+
+ Value *materialize(BasicBlock *B, BasicBlock::iterator At);
+
+ private:
+ void initialize(Instruction *Exp);
+ void cleanup();
+
+ template <typename FuncT> void traverse(Value *V, FuncT F);
+ void record(Value *V);
+ void use(Value *V);
+ void unuse(Value *V);
+
+ bool equal(const Instruction *I, const Instruction *J) const;
+ Value *find(Value *Tree, Value *Sub) const;
+ Value *subst(Value *Tree, Value *OldV, Value *NewV);
+ void replace(Value *OldV, Value *NewV);
+ void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
+
+ friend struct Simplifier;
+ };
+
+ Value *simplify(Context &C);
+ };
+
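+  // Debug-printing helper: wraps a Context (and optionally one value in it)
+  // so that an unlinked expression tree can be streamed to a raw_ostream,
+  // e.g. dbgs() << PE(C).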
+ struct PE {
+ PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
+ const Simplifier::Context &C;
+ const Value *V;
+ };
+
+ raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
+ raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
+ P.C.print(OS, P.V ? P.V : P.C.Root);
+ return OS;
+ }
+}
+
+
+template <typename FuncT>
+void Simplifier::Context::traverse(Value *V, FuncT F) {
+ WorkListType Q;
+ Q.push_back(V);
+
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+ if (!U || U->getParent())
+ continue;
+ if (!F(U))
+ continue;
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+}
+
+
+void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
+ const auto *U = dyn_cast<const Instruction>(V);
+ if (!U) {
+ OS << V << '(' << *V << ')';
+ return;
+ }
+
+ if (U->getParent()) {
+ OS << U << '(';
+ U->printAsOperand(OS, true);
+ OS << ')';
+ return;
+ }
+
+ unsigned N = U->getNumOperands();
+ if (N != 0)
+ OS << U << '(';
+ OS << U->getOpcodeName();
+ for (const Value *Op : U->operands()) {
+ OS << ' ';
+ print(OS, Op);
+ }
+ if (N != 0)
+ OS << ')';
+}
+
+
+void Simplifier::Context::initialize(Instruction *Exp) {
+  // Perform a deep clone of the expression, set Root to the root
+  // of the clone, and build a map from the original values to the
+  // cloned ones.
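+  // The clones are created detached (they have no parent block); they are
+  // inserted into a block only when materialize() is called on this context,
+  // and cleanup() deletes any clones that were never linked in.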
+ ValueMapType M;
+ BasicBlock *Block = Exp->getParent();
+ WorkListType Q;
+ Q.push_back(Exp);
+
+ while (!Q.empty()) {
+ Value *V = Q.pop_front_val();
+ if (M.find(V) != M.end())
+ continue;
+ if (Instruction *U = dyn_cast<Instruction>(V)) {
+ if (isa<PHINode>(U) || U->getParent() != Block)
+ continue;
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ M.insert({U, U->clone()});
+ }
+ }
+
+ for (std::pair<Value*,Value*> P : M) {
+ Instruction *U = cast<Instruction>(P.second);
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+ auto F = M.find(U->getOperand(i));
+ if (F != M.end())
+ U->setOperand(i, F->second);
+ }
+ }
+
+ auto R = M.find(Exp);
+ assert(R != M.end());
+ Root = R->second;
+
+ record(Root);
+ use(Root);
+}
+
+
+void Simplifier::Context::record(Value *V) {
+ auto Record = [this](Instruction *U) -> bool {
+ Clones.insert(U);
+ return true;
+ };
+ traverse(V, Record);
+}
+
+
+void Simplifier::Context::use(Value *V) {
+ auto Use = [this](Instruction *U) -> bool {
+ Used.insert(U);
+ return true;
+ };
+ traverse(V, Use);
+}
+
+
+void Simplifier::Context::unuse(Value *V) {
+ if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
+ return;
+
+ auto Unuse = [this](Instruction *U) -> bool {
+ if (!U->use_empty())
+ return false;
+ Used.erase(U);
+ return true;
+ };
+ traverse(V, Unuse);
+}
+
+
+Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
+ if (Tree == OldV)
+ return NewV;
+ if (OldV == NewV)
+ return Tree;
+
+ WorkListType Q;
+ Q.push_back(Tree);
+ while (!Q.empty()) {
+ Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+ // If U is not an instruction, or it's not a clone, skip it.
+ if (!U || U->getParent())
+ continue;
+ for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+ Value *Op = U->getOperand(i);
+ if (Op == OldV) {
+ U->setOperand(i, NewV);
+ unuse(OldV);
+ } else {
+ Q.push_back(Op);
+ }
+ }
+ }
+ return Tree;
+}
+
+
+void Simplifier::Context::replace(Value *OldV, Value *NewV) {
+ if (Root == OldV) {
+ Root = NewV;
+ use(Root);
+ return;
+ }
+
+ // NewV may be a complex tree that has just been created by one of the
+ // transformation rules. We need to make sure that it is commoned with
+ // the existing Root to the maximum extent possible.
+ // Identify all subtrees of NewV (including NewV itself) that have
+ // equivalent counterparts in Root, and replace those subtrees with
+ // these counterparts.
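+  // For example, if a rule produced NewV = or(shl(a, 8), b) and Root already
+  // contains a clone equivalent to shl(a, 8), that copy inside NewV is
+  // replaced with the existing one, keeping the tree maximally shared.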
+ WorkListType Q;
+ Q.push_back(NewV);
+ while (!Q.empty()) {
+ Value *V = Q.pop_front_val();
+ Instruction *U = dyn_cast<Instruction>(V);
+ if (!U || U->getParent())
+ continue;
+ if (Value *DupV = find(Root, V)) {
+ if (DupV != V)
+ NewV = subst(NewV, V, DupV);
+ } else {
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ }
+
+ // Now, simply replace OldV with NewV in Root.
+ Root = subst(Root, OldV, NewV);
+ use(Root);
+}
+
+
+void Simplifier::Context::cleanup() {
+ for (Value *V : Clones) {
+ Instruction *U = cast<Instruction>(V);
+ if (!U->getParent())
+ U->dropAllReferences();
+ }
+
+ for (Value *V : Clones) {
+ Instruction *U = cast<Instruction>(V);
+ if (!U->getParent())
+ U->deleteValue();
+ }
+}
+
+
+bool Simplifier::Context::equal(const Instruction *I,
+ const Instruction *J) const {
+ if (I == J)
+ return true;
+ if (!I->isSameOperationAs(J))
+ return false;
+ if (isa<PHINode>(I))
+ return I->isIdenticalTo(J);
+
+ for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+ Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
+ if (OpI == OpJ)
+ continue;
+ auto *InI = dyn_cast<const Instruction>(OpI);
+ auto *InJ = dyn_cast<const Instruction>(OpJ);
+ if (InI && InJ) {
+ if (!equal(InI, InJ))
+ return false;
+ } else if (InI != InJ || !InI)
+ return false;
+ }
+ return true;
+}
+
+
+Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
+ Instruction *SubI = dyn_cast<Instruction>(Sub);
+ WorkListType Q;
+ Q.push_back(Tree);
+
+ while (!Q.empty()) {
+ Value *V = Q.pop_front_val();
+ if (V == Sub)
+ return V;
+ Instruction *U = dyn_cast<Instruction>(V);
+ if (!U || U->getParent())
+ continue;
+ if (SubI && equal(SubI, U))
+ return U;
+ assert(!isa<PHINode>(U));
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ return nullptr;
+}
+
+
+void Simplifier::Context::link(Instruction *I, BasicBlock *B,
+ BasicBlock::iterator At) {
+ if (I->getParent())
+ return;
+
+ for (Value *Op : I->operands()) {
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ link(OpI, B, At);
+ }
+
+ B->getInstList().insert(At, I);
+}
+
+
+Value *Simplifier::Context::materialize(BasicBlock *B,
+ BasicBlock::iterator At) {
+ if (Instruction *RootI = dyn_cast<Instruction>(Root))
+ link(RootI, B, At);
+ return Root;
+}
+
+
+Value *Simplifier::simplify(Context &C) {
+ WorkListType Q;
+ Q.push_back(C.Root);
+ unsigned Count = 0;
+ const unsigned Limit = SimplifyLimit;
+
+ while (!Q.empty()) {
+ if (Count++ >= Limit)
+ break;
+ Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+ if (!U || U->getParent() || !C.Used.count(U))
+ continue;
+ bool Changed = false;
+ for (Rule &R : Rules) {
+ Value *W = R(U, C.Ctx);
+ if (!W)
+ continue;
+ Changed = true;
+ C.record(W);
+ C.replace(U, W);
+ Q.push_back(C.Root);
+ break;
+ }
+ if (!Changed) {
+ for (Value *Op : U->operands())
+ Q.push_back(Op);
+ }
+ }
+ return Count < Limit ? C.Root : nullptr;
+}
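+
+// A usage sketch of the engine above (hypothetical rule, for illustration
+// only): a rule returns a replacement value, or null to decline.
+//
+//   Simplifier S;
+//   S.addRule([](Instruction *I, LLVMContext &Ctx) -> Value* {
+//     using namespace PatternMatch;
+//     Value *X;
+//     if (match(I, m_Xor(m_Value(X), m_Zero())))
+//       return X;        // Fold "xor X, 0" to X.
+//     return nullptr;    // No simplification applies.
+//   });
+//   Simplifier::Context C(SomeRootInstruction);
+//   if (Value *Simplified = S.simplify(C))
+//     C.materialize(Block, InsertPt);   // Link the simplified tree in.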
+
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of PolynomialMultiplyRecognize
+//
+//===----------------------------------------------------------------------===//
+
+namespace {
+ class PolynomialMultiplyRecognize {
+ public:
+ explicit PolynomialMultiplyRecognize(Loop *loop, const DataLayout &dl,
+ const DominatorTree &dt, const TargetLibraryInfo &tli,
+ ScalarEvolution &se)
+ : CurLoop(loop), DL(dl), DT(dt), TLI(tli), SE(se) {}
+
+ bool recognize();
+ private:
+ typedef SetVector<Value*> ValueSeq;
+
+ IntegerType *getPmpyType() const {
+ LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
+ return IntegerType::get(Ctx, 32);
+ }
+ bool isPromotableTo(Value *V, IntegerType *Ty);
+ void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
+ bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
+
+ Value *getCountIV(BasicBlock *BB);
+ bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
+ void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
+ ValueSeq &Late);
+ bool classifyInst(Instruction *UseI, ValueSeq &Early, ValueSeq &Late);
+ bool commutesWithShift(Instruction *I);
+ bool highBitsAreZero(Value *V, unsigned IterCount);
+ bool keepsHighBitsZero(Value *V, unsigned IterCount);
+ bool isOperandShifted(Instruction *I, Value *Op);
+ bool convertShiftsToLeft(BasicBlock *LoopB, BasicBlock *ExitB,
+ unsigned IterCount);
+ void cleanupLoopBody(BasicBlock *LoopB);
+
+ struct ParsedValues {
+ ParsedValues() : M(nullptr), P(nullptr), Q(nullptr), R(nullptr),
+ X(nullptr), Res(nullptr), IterCount(0), Left(false), Inv(false) {}
+ Value *M, *P, *Q, *R, *X;
+ Instruction *Res;
+ unsigned IterCount;
+ bool Left, Inv;
+ };
+
+ bool matchLeftShift(SelectInst *SelI, Value *CIV, ParsedValues &PV);
+ bool matchRightShift(SelectInst *SelI, ParsedValues &PV);
+ bool scanSelect(SelectInst *SI, BasicBlock *LoopB, BasicBlock *PrehB,
+ Value *CIV, ParsedValues &PV, bool PreScan);
+ unsigned getInverseMxN(unsigned QP);
+ Value *generate(BasicBlock::iterator At, ParsedValues &PV);
+
+ void setupSimplifier();
+
+ Simplifier Simp;
+ Loop *CurLoop;
+ const DataLayout &DL;
+ const DominatorTree &DT;
+ const TargetLibraryInfo &TLI;
+ ScalarEvolution &SE;
+ };
+}
+
+
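+// Return the counting induction variable of the block: a PHI that starts at
+// zero on entry from the preheader and is incremented by one on each
+// iteration around BB, or null if no such PHI exists.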
+Value *PolynomialMultiplyRecognize::getCountIV(BasicBlock *BB) {
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+ if (std::distance(PI, PE) != 2)
+ return nullptr;
+ BasicBlock *PB = (*PI == BB) ? *std::next(PI) : *PI;
+
+ for (auto I = BB->begin(), E = BB->end(); I != E && isa<PHINode>(I); ++I) {
+ auto *PN = cast<PHINode>(I);
+ Value *InitV = PN->getIncomingValueForBlock(PB);
+ if (!isa<ConstantInt>(InitV) || !cast<ConstantInt>(InitV)->isZero())
+ continue;
+ Value *IterV = PN->getIncomingValueForBlock(BB);
+ if (!isa<BinaryOperator>(IterV))
+ continue;
+ auto *BO = dyn_cast<BinaryOperator>(IterV);
+ if (BO->getOpcode() != Instruction::Add)
+ continue;
+ Value *IncV = nullptr;
+ if (BO->getOperand(0) == PN)
+ IncV = BO->getOperand(1);
+ else if (BO->getOperand(1) == PN)
+ IncV = BO->getOperand(0);
+ if (IncV == nullptr)
+ continue;
+
+ if (auto *T = dyn_cast<ConstantInt>(IncV))
+ if (T->getZExtValue() == 1)
+ return PN;
+ }
+ return nullptr;
+}
+
+
+static void replaceAllUsesOfWithIn(Value *I, Value *J, BasicBlock *BB) {
+ for (auto UI = I->user_begin(), UE = I->user_end(); UI != UE;) {
+ Use &TheUse = UI.getUse();
+ ++UI;
+ if (auto *II = dyn_cast<Instruction>(TheUse.getUser()))
+ if (BB == II->getParent())
+ II->replaceUsesOfWith(I, J);
+ }
+}
+
+
+bool PolynomialMultiplyRecognize::matchLeftShift(SelectInst *SelI,
+ Value *CIV, ParsedValues &PV) {
+ // Match the following:
+ // select (X & (1 << i)) != 0 ? R ^ (Q << i) : R
+ // select (X & (1 << i)) == 0 ? R : R ^ (Q << i)
+  // The condition may also check for equality with the masked value, i.e.
+ // select (X & (1 << i)) == (1 << i) ? R ^ (Q << i) : R
+ // select (X & (1 << i)) != (1 << i) ? R : R ^ (Q << i);
+
+ Value *CondV = SelI->getCondition();
+ Value *TrueV = SelI->getTrueValue();
+ Value *FalseV = SelI->getFalseValue();
+
+ using namespace PatternMatch;
+
+ CmpInst::Predicate P;
+ Value *A = nullptr, *B = nullptr, *C = nullptr;
+
+ if (!match(CondV, m_ICmp(P, m_And(m_Value(A), m_Value(B)), m_Value(C))) &&
+ !match(CondV, m_ICmp(P, m_Value(C), m_And(m_Value(A), m_Value(B)))))
+ return false;
+ if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
+ return false;
+ // Matched: select (A & B) == C ? ... : ...
+ // select (A & B) != C ? ... : ...
+
+ Value *X = nullptr, *Sh1 = nullptr;
+ // Check (A & B) for (X & (1 << i)):
+ if (match(A, m_Shl(m_One(), m_Specific(CIV)))) {
+ Sh1 = A;
+ X = B;
+ } else if (match(B, m_Shl(m_One(), m_Specific(CIV)))) {
+ Sh1 = B;
+ X = A;
+ } else {
+ // TODO: Could also check for an induction variable containing single
+ // bit shifted left by 1 in each iteration.
+ return false;
+ }
+
+ bool TrueIfZero;
+
+ // Check C against the possible values for comparison: 0 and (1 << i):
+ if (match(C, m_Zero()))
+ TrueIfZero = (P == CmpInst::ICMP_EQ);
+ else if (C == Sh1)
+ TrueIfZero = (P == CmpInst::ICMP_NE);
+ else
+ return false;
+
+ // So far, matched:
+ // select (X & (1 << i)) ? ... : ...
+ // including variations of the check against zero/non-zero value.
+
+ Value *ShouldSameV = nullptr, *ShouldXoredV = nullptr;
+ if (TrueIfZero) {
+ ShouldSameV = TrueV;
+ ShouldXoredV = FalseV;
+ } else {
+ ShouldSameV = FalseV;
+ ShouldXoredV = TrueV;
+ }
+
+ Value *Q = nullptr, *R = nullptr, *Y = nullptr, *Z = nullptr;
+ Value *T = nullptr;
+ if (match(ShouldXoredV, m_Xor(m_Value(Y), m_Value(Z)))) {
+ // Matched: select +++ ? ... : Y ^ Z
+ // select +++ ? Y ^ Z : ...
+ // where +++ denotes previously checked matches.
+ if (ShouldSameV == Y)
+ T = Z;
+ else if (ShouldSameV == Z)
+ T = Y;
+ else
+ return false;
+ R = ShouldSameV;
+ // Matched: select +++ ? R : R ^ T
+ // select +++ ? R ^ T : R
+ // depending on TrueIfZero.
+
+ } else if (match(ShouldSameV, m_Zero())) {
+ // Matched: select +++ ? 0 : ...
+ // select +++ ? ... : 0
+ if (!SelI->hasOneUse())
+ return false;
+ T = ShouldXoredV;
+ // Matched: select +++ ? 0 : T
+ // select +++ ? T : 0
+
+ Value *U = *SelI->user_begin();
+ if (!match(U, m_Xor(m_Specific(SelI), m_Value(R))) &&
+ !match(U, m_Xor(m_Value(R), m_Specific(SelI))))
+ return false;
+ // Matched: xor (select +++ ? 0 : T), R
+ // xor (select +++ ? T : 0), R
+ } else
+ return false;
+
+ // The xor input value T is isolated into its own match so that it could
+ // be checked against an induction variable containing a shifted bit
+ // (todo).
+ // For now, check against (Q << i).
+ if (!match(T, m_Shl(m_Value(Q), m_Specific(CIV))) &&
+ !match(T, m_Shl(m_ZExt(m_Value(Q)), m_ZExt(m_Specific(CIV)))))
+ return false;
+ // Matched: select +++ ? R : R ^ (Q << i)
+ // select +++ ? R ^ (Q << i) : R
+
+ PV.X = X;
+ PV.Q = Q;
+ PV.R = R;
+ PV.Left = true;
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::matchRightShift(SelectInst *SelI,
+ ParsedValues &PV) {
+ // Match the following:
+ // select (X & 1) != 0 ? (R >> 1) ^ Q : (R >> 1)
+ // select (X & 1) == 0 ? (R >> 1) : (R >> 1) ^ Q
+  // The condition may also check for equality with the masked value, i.e.
+ // select (X & 1) == 1 ? (R >> 1) ^ Q : (R >> 1)
+ // select (X & 1) != 1 ? (R >> 1) : (R >> 1) ^ Q
+
+ Value *CondV = SelI->getCondition();
+ Value *TrueV = SelI->getTrueValue();
+ Value *FalseV = SelI->getFalseValue();
+
+ using namespace PatternMatch;
+
+ Value *C = nullptr;
+ CmpInst::Predicate P;
+ bool TrueIfZero;
+
+ if (match(CondV, m_ICmp(P, m_Value(C), m_Zero())) ||
+ match(CondV, m_ICmp(P, m_Zero(), m_Value(C)))) {
+ if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
+ return false;
+ // Matched: select C == 0 ? ... : ...
+ // select C != 0 ? ... : ...
+ TrueIfZero = (P == CmpInst::ICMP_EQ);
+ } else if (match(CondV, m_ICmp(P, m_Value(C), m_One())) ||
+ match(CondV, m_ICmp(P, m_One(), m_Value(C)))) {
+ if (P != CmpInst::ICMP_EQ && P != CmpInst::ICMP_NE)
+ return false;
+ // Matched: select C == 1 ? ... : ...
+ // select C != 1 ? ... : ...
+ TrueIfZero = (P == CmpInst::ICMP_NE);
+ } else
+ return false;
+
+ Value *X = nullptr;
+ if (!match(C, m_And(m_Value(X), m_One())) &&
+ !match(C, m_And(m_One(), m_Value(X))))
+ return false;
+ // Matched: select (X & 1) == +++ ? ... : ...
+ // select (X & 1) != +++ ? ... : ...
+
+ Value *R = nullptr, *Q = nullptr;
+ if (TrueIfZero) {
+ // The select's condition is true if the tested bit is 0.
+ // TrueV must be the shift, FalseV must be the xor.
+ if (!match(TrueV, m_LShr(m_Value(R), m_One())))
+ return false;
+ // Matched: select +++ ? (R >> 1) : ...
+ if (!match(FalseV, m_Xor(m_Specific(TrueV), m_Value(Q))) &&
+ !match(FalseV, m_Xor(m_Value(Q), m_Specific(TrueV))))
+ return false;
+ // Matched: select +++ ? (R >> 1) : (R >> 1) ^ Q
+ // with commuting ^.
+ } else {
+ // The select's condition is true if the tested bit is 1.
+ // TrueV must be the xor, FalseV must be the shift.
+ if (!match(FalseV, m_LShr(m_Value(R), m_One())))
+ return false;
+ // Matched: select +++ ? ... : (R >> 1)
+ if (!match(TrueV, m_Xor(m_Specific(FalseV), m_Value(Q))) &&
+ !match(TrueV, m_Xor(m_Value(Q), m_Specific(FalseV))))
+ return false;
+ // Matched: select +++ ? (R >> 1) ^ Q : (R >> 1)
+ // with commuting ^.
+ }
+
+ PV.X = X;
+ PV.Q = Q;
+ PV.R = R;
+ PV.Left = false;
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
+ BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
+ bool PreScan) {
+ using namespace PatternMatch;
+ // The basic pattern for R = P.Q is:
+ // for i = 0..31
+ // R = phi (0, R')
+ // if (P & (1 << i)) ; test-bit(P, i)
+ // R' = R ^ (Q << i)
+ //
+ // Similarly, the basic pattern for R = (P/Q).Q - P
+ // for i = 0..31
+ // R = phi(P, R')
+ // if (R & (1 << i))
+ // R' = R ^ (Q << i)
+
+ // There exist idioms, where instead of Q being shifted left, P is shifted
+ // right. This produces a result that is shifted right by 32 bits (the
+ // non-shifted result is 64-bit).
+ //
+ // For R = P.Q, this would be:
+ // for i = 0..31
+ // R = phi (0, R')
+ // if ((P >> i) & 1)
+ // R' = (R >> 1) ^ Q ; R is cycled through the loop, so it must
+ // else ; be shifted by 1, not i.
+ // R' = R >> 1
+ //
+ // And for the inverse:
+ // for i = 0..31
+ // R = phi (P, R')
+ // if (R & 1)
+ // R' = (R >> 1) ^ Q
+ // else
+ // R' = R >> 1
+
+ // The left-shifting idioms share the same pattern:
+ // select (X & (1 << i)) ? R ^ (Q << i) : R
+ // Similarly for right-shifting idioms:
+ // select (X & 1) ? (R >> 1) ^ Q
+
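+  // As a concrete illustration (hypothetical C source, assuming 32-bit
+  // unsigned operands), the left-shifting form typically comes from:
+  //   unsigned R = 0;
+  //   for (unsigned i = 0; i < 32; ++i)
+  //     if (P & (1u << i))
+  //       R ^= Q << i;
+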
+ if (matchLeftShift(SelI, CIV, PV)) {
+ // If this is a pre-scan, getting this far is sufficient.
+ if (PreScan)
+ return true;
+
+ // Need to make sure that the SelI goes back into R.
+ auto *RPhi = dyn_cast<PHINode>(PV.R);
+ if (!RPhi)
+ return false;
+ if (SelI != RPhi->getIncomingValueForBlock(LoopB))
+ return false;
+ PV.Res = SelI;
+
+ // If X is loop invariant, it must be the input polynomial, and the
+ // idiom is the basic polynomial multiply.
+ if (CurLoop->isLoopInvariant(PV.X)) {
+ PV.P = PV.X;
+ PV.Inv = false;
+ } else {
+ // X is not loop invariant. If X == R, this is the inverse pmpy.
+ // Otherwise, check for an xor with an invariant value. If the
+ // variable argument to the xor is R, then this is still a valid
+ // inverse pmpy.
+ PV.Inv = true;
+ if (PV.X != PV.R) {
+ Value *Var = nullptr, *Inv = nullptr, *X1 = nullptr, *X2 = nullptr;
+ if (!match(PV.X, m_Xor(m_Value(X1), m_Value(X2))))
+ return false;
+ auto *I1 = dyn_cast<Instruction>(X1);
+ auto *I2 = dyn_cast<Instruction>(X2);
+ if (!I1 || I1->getParent() != LoopB) {
+ Var = X2;
+ Inv = X1;
+ } else if (!I2 || I2->getParent() != LoopB) {
+ Var = X1;
+ Inv = X2;
+ } else
+ return false;
+ if (Var != PV.R)
+ return false;
+ PV.M = Inv;
+ }
+ // The input polynomial P still needs to be determined. It will be
+ // the entry value of R.
+ Value *EntryP = RPhi->getIncomingValueForBlock(PrehB);
+ PV.P = EntryP;
+ }
+
+ return true;
+ }
+
+ if (matchRightShift(SelI, PV)) {
+ // If this is an inverse pattern, the Q polynomial must be known at
+ // compile time.
+ if (PV.Inv && !isa<ConstantInt>(PV.Q))
+ return false;
+ if (PreScan)
+ return true;
+ // There is no exact matching of right-shift pmpy.
+ return false;
+ }
+
+ return false;
+}
+
+
+bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
+ IntegerType *DestTy) {
+ IntegerType *T = dyn_cast<IntegerType>(Val->getType());
+ if (!T || T->getBitWidth() > DestTy->getBitWidth())
+ return false;
+ if (T->getBitWidth() == DestTy->getBitWidth())
+ return true;
+ // Non-instructions are promotable. The reason why an instruction may not
+ // be promotable is that it may produce a different result if its operands
+ // and the result are promoted, for example, it may produce more non-zero
+ // bits. While it would still be possible to represent the proper result
+ // in a wider type, it may require adding additional instructions (which
+ // we don't want to do).
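+  // For example, an "add" on the narrow type may carry into bit positions
+  // that exist only after promotion; the switch below therefore accepts an
+  // add only when it can neither sign- nor unsigned-wrap.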
+ Instruction *In = dyn_cast<Instruction>(Val);
+ if (!In)
+ return true;
+ // The bitwidth of the source type is smaller than the destination.
+ // Check if the individual operation can be promoted.
+ switch (In->getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::ZExt:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::LShr: // Shift right is ok.
+ case Instruction::Select:
+ return true;
+ case Instruction::ICmp:
+ if (CmpInst *CI = cast<CmpInst>(In))
+ return CI->isEquality() || CI->isUnsigned();
+ llvm_unreachable("Cast failed unexpectedly");
+ case Instruction::Add:
+ return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
+ }
+ return false;
+}
+
+
+void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
+ IntegerType *DestTy, BasicBlock *LoopB) {
+ // Leave boolean values alone.
+ if (!In->getType()->isIntegerTy(1))
+ In->mutateType(DestTy);
+ unsigned DestBW = DestTy->getBitWidth();
+
+ // Handle PHIs.
+ if (PHINode *P = dyn_cast<PHINode>(In)) {
+ unsigned N = P->getNumIncomingValues();
+ for (unsigned i = 0; i != N; ++i) {
+ BasicBlock *InB = P->getIncomingBlock(i);
+ if (InB == LoopB)
+ continue;
+ Value *InV = P->getIncomingValue(i);
+ IntegerType *Ty = cast<IntegerType>(InV->getType());
+ // Do not promote values in PHI nodes of type i1.
+ if (Ty != P->getType()) {
+ // If the value type does not match the PHI type, the PHI type
+ // must have been promoted.
+ assert(Ty->getBitWidth() < DestBW);
+ InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
+ P->setIncomingValue(i, InV);
+ }
+ }
+ } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
+ Value *Op = Z->getOperand(0);
+ if (Op->getType() == Z->getType())
+ Z->replaceAllUsesWith(Op);
+ Z->eraseFromParent();
+ return;
+ }
+
+ // Promote immediates.
+ for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
+ if (CI->getType()->getBitWidth() < DestBW)
+ In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
+ }
+}
+
+
+bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
+ BasicBlock *ExitB) {
+ assert(LoopB);
+ // Skip loops where the exit block has more than one predecessor. The values
+ // coming from the loop block will be promoted to another type, and so the
+ // values coming into the exit block from other predecessors would also have
+ // to be promoted.
+ if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
+ return false;
+ IntegerType *DestTy = getPmpyType();
+ // Check if the exit values have types that are no wider than the type
+ // that we want to promote to.
+ unsigned DestBW = DestTy->getBitWidth();
+ for (Instruction &In : *ExitB) {
+ PHINode *P = dyn_cast<PHINode>(&In);
+ if (!P)
+ break;
+ if (P->getNumIncomingValues() != 1)
+ return false;
+ assert(P->getIncomingBlock(0) == LoopB);
+ IntegerType *T = dyn_cast<IntegerType>(P->getType());
+ if (!T || T->getBitWidth() > DestBW)
+ return false;
+ }
+
+ // Check all instructions in the loop.
+ for (Instruction &In : *LoopB)
+ if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
+ return false;
+
+ // Perform the promotion.
+ std::vector<Instruction*> LoopIns;
+ std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
+ [](Instruction &In) { return &In; });
+ for (Instruction *In : LoopIns)
+ promoteTo(In, DestTy, LoopB);
+
+ // Fix up the PHI nodes in the exit block.
+ Instruction *EndI = ExitB->getFirstNonPHI();
+ BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
+ for (auto I = ExitB->begin(); I != End; ++I) {
+ PHINode *P = dyn_cast<PHINode>(I);
+ if (!P)
+ break;
+ Type *Ty0 = P->getIncomingValue(0)->getType();
+ Type *PTy = P->getType();
+ if (PTy != Ty0) {
+ assert(Ty0 == DestTy);
+ // In order to create the trunc, P must have the promoted type.
+ P->mutateType(Ty0);
+ Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
+ // In order for the RAUW to work, the types of P and T must match.
+ P->mutateType(PTy);
+ P->replaceAllUsesWith(T);
+      // Final update of P's type.
+ P->mutateType(Ty0);
+ cast<Instruction>(T)->setOperand(0, P);
+ }
+ }
+
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
+ ValueSeq &Cycle) {
+ // Out = ..., In, ...
+ if (Out == In)
+ return true;
+
+ auto *BB = cast<Instruction>(Out)->getParent();
+ bool HadPhi = false;
+
+ for (auto U : Out->users()) {
+ auto *I = dyn_cast<Instruction>(&*U);
+ if (I == nullptr || I->getParent() != BB)
+ continue;
+ // Make sure that there are no multi-iteration cycles, e.g.
+ // p1 = phi(p2)
+ // p2 = phi(p1)
+ // The cycle p1->p2->p1 would span two loop iterations.
+ // Check that there is only one phi in the cycle.
+ bool IsPhi = isa<PHINode>(I);
+ if (IsPhi && HadPhi)
+ return false;
+ HadPhi |= IsPhi;
+ if (Cycle.count(I))
+ return false;
+ Cycle.insert(I);
+ if (findCycle(I, In, Cycle))
+ break;
+ Cycle.remove(I);
+ }
+ return !Cycle.empty();
+}
+
+
+void PolynomialMultiplyRecognize::classifyCycle(Instruction *DivI,
+ ValueSeq &Cycle, ValueSeq &Early, ValueSeq &Late) {
+ // All the values in the cycle that are between the phi node and the
+ // divider instruction will be classified as "early", all other values
+ // will be "late".
+
+ bool IsE = true;
+ unsigned I, N = Cycle.size();
+ for (I = 0; I < N; ++I) {
+ Value *V = Cycle[I];
+ if (DivI == V)
+ IsE = false;
+ else if (!isa<PHINode>(V))
+ continue;
+    // Stop once either one is found.
+ break;
+ }
+ // "I" is the index of either DivI or the phi node, whichever was first.
+ // "E" is "false" or "true" respectively.
+ ValueSeq &First = !IsE ? Early : Late;
+ for (unsigned J = 0; J < I; ++J)
+ First.insert(Cycle[J]);
+
+ ValueSeq &Second = IsE ? Early : Late;
+ Second.insert(Cycle[I]);
+ for (++I; I < N; ++I) {
+ Value *V = Cycle[I];
+ if (DivI == V || isa<PHINode>(V))
+ break;
+ Second.insert(V);
+ }
+
+ for (; I < N; ++I)
+ First.insert(Cycle[I]);
+}
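+
+// For example, for a Cycle ordered as [v1, phi, v2, DivI, v3], the scan
+// above stops at the phi, so Late gets {v1}, Early gets {phi, v2}, and the
+// remaining values {DivI, v3} are classified as Late.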
+
+
+bool PolynomialMultiplyRecognize::classifyInst(Instruction *UseI,
+ ValueSeq &Early, ValueSeq &Late) {
+ // Select is an exception, since the condition value does not have to be
+ // classified in the same way as the true/false values. The true/false
+ // values do have to be both early or both late.
+ if (UseI->getOpcode() == Instruction::Select) {
+ Value *TV = UseI->getOperand(1), *FV = UseI->getOperand(2);
+ if (Early.count(TV) || Early.count(FV)) {
+ if (Late.count(TV) || Late.count(FV))
+ return false;
+ Early.insert(UseI);
+ } else if (Late.count(TV) || Late.count(FV)) {
+ if (Early.count(TV) || Early.count(FV))
+ return false;
+ Late.insert(UseI);
+ }
+ return true;
+ }
+
+  // It is not clear what an example of this would be, but the code below
+  // relies on having at least one operand.
+ if (UseI->getNumOperands() == 0)
+ return true;
+
+ bool AE = true, AL = true;
+ for (auto &I : UseI->operands()) {
+ if (Early.count(&*I))
+ AL = false;
+ else if (Late.count(&*I))
+ AE = false;
+ }
+ // If the operands appear "all early" and "all late" at the same time,
+ // then it means that none of them are actually classified as either.
+ // This is harmless.
+ if (AE && AL)
+ return true;
+ // Conversely, if they are neither "all early" nor "all late", then
+ // we have a mixture of early and late operands that is not a known
+ // exception.
+ if (!AE && !AL)
+ return false;
+
+ // Check that we have covered the two special cases.
+ assert(AE != AL);
+
+ if (AE)
+ Early.insert(UseI);
+ else
+ Late.insert(UseI);
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::commutesWithShift(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::Select:
+ case Instruction::ICmp:
+ case Instruction::PHI:
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
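+
+// These opcodes distribute over a uniform shift of their operands, e.g.
+// (x & y) << c == (x << c) & (y << c), and similarly for or/xor. For icmp
+// and the shifts themselves this relies on the high-bit checks performed
+// below, which guarantee that no meaningful bits are shifted out.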
+
+
+bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V,
+ unsigned IterCount) {
+ auto *T = dyn_cast<IntegerType>(V->getType());
+ if (!T)
+ return false;
+
+ KnownBits Known(T->getBitWidth());
+ computeKnownBits(V, Known, DL);
+ return Known.countMinLeadingZeros() >= IterCount;
+}
+
+
+bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
+ unsigned IterCount) {
+ // Assume that all inputs to the value have the high bits zero.
+ // Check if the value itself preserves the zeros in the high bits.
+ if (auto *C = dyn_cast<ConstantInt>(V))
+ return C->getValue().countLeadingZeros() >= IterCount;
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ switch (I->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::LShr:
+ case Instruction::Select:
+ case Instruction::ICmp:
+ case Instruction::PHI:
+ case Instruction::ZExt:
+ return true;
+ }
+ }
+
+ return false;
+}
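+
+// For example, the 32-bit constant 0x000000FF has 24 leading zeros, so it
+// satisfies this check for iteration counts of up to 24.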
+
+
+bool PolynomialMultiplyRecognize::isOperandShifted(Instruction *I, Value *Op) {
+ unsigned Opc = I->getOpcode();
+ if (Opc == Instruction::Shl || Opc == Instruction::LShr)
+ return Op != I->getOperand(1);
+ return true;
+}
+
+
+bool PolynomialMultiplyRecognize::convertShiftsToLeft(BasicBlock *LoopB,
+ BasicBlock *ExitB, unsigned IterCount) {
+ Value *CIV = getCountIV(LoopB);
+ if (CIV == nullptr)
+ return false;
+ auto *CIVTy = dyn_cast<IntegerType>(CIV->getType());
+ if (CIVTy == nullptr)
+ return false;
+
+ ValueSeq RShifts;
+ ValueSeq Early, Late, Cycled;
+
+ // Find all value cycles that contain logical right shifts by 1.
+ for (Instruction &I : *LoopB) {
+ using namespace PatternMatch;
+ Value *V = nullptr;
+ if (!match(&I, m_LShr(m_Value(V), m_One())))
+ continue;
+ ValueSeq C;
+ if (!findCycle(&I, V, C))
+ continue;
+
+ // Found a cycle.
+ C.insert(&I);
+ classifyCycle(&I, C, Early, Late);
+ Cycled.insert(C.begin(), C.end());
+ RShifts.insert(&I);
+ }
+
+ // Find the set of all values affected by the shift cycles, i.e. all
+ // cycled values, and (recursively) all their users.
+ ValueSeq Users(Cycled.begin(), Cycled.end());
+ for (unsigned i = 0; i < Users.size(); ++i) {
+ Value *V = Users[i];
+ if (!isa<IntegerType>(V->getType()))
+ return false;
+ auto *R = cast<Instruction>(V);
+ // If the instruction does not commute with shifts, the loop cannot
+ // be unshifted.
+ if (!commutesWithShift(R))
+ return false;
+ for (auto I = R->user_begin(), E = R->user_end(); I != E; ++I) {
+ auto *T = cast<Instruction>(*I);
+ // Skip users from outside of the loop. They will be handled later.
+ // Also, skip the right-shifts and phi nodes, since they mix early
+ // and late values.
+ if (T->getParent() != LoopB || RShifts.count(T) || isa<PHINode>(T))
+ continue;
+
+ Users.insert(T);
+ if (!classifyInst(T, Early, Late))
+ return false;
+ }
+ }
+
+  if (Users.empty())
+ return false;
+
+ // Verify that high bits remain zero.
+ ValueSeq Internal(Users.begin(), Users.end());
+ ValueSeq Inputs;
+ for (unsigned i = 0; i < Internal.size(); ++i) {
+ auto *R = dyn_cast<Instruction>(Internal[i]);
+ if (!R)
+ continue;
+ for (Value *Op : R->operands()) {
+ auto *T = dyn_cast<Instruction>(Op);
+ if (T && T->getParent() != LoopB)
+ Inputs.insert(Op);
+ else
+ Internal.insert(Op);
+ }
+ }
+ for (Value *V : Inputs)
+ if (!highBitsAreZero(V, IterCount))
+ return false;
+ for (Value *V : Internal)
+ if (!keepsHighBitsZero(V, IterCount))
+ return false;
+
+ // Finally, the work can be done. Unshift each user.
+ IRBuilder<> IRB(LoopB);
+ std::map<Value*,Value*> ShiftMap;
+ typedef std::map<std::pair<Value*,Type*>,Value*> CastMapType;
+ CastMapType CastMap;
+
+ auto upcast = [] (CastMapType &CM, IRBuilder<> &IRB, Value *V,
+ IntegerType *Ty) -> Value* {
+ auto H = CM.find(std::make_pair(V, Ty));
+ if (H != CM.end())
+ return H->second;
+ Value *CV = IRB.CreateIntCast(V, Ty, false);
+ CM.insert(std::make_pair(std::make_pair(V, Ty), CV));
+ return CV;
+ };
+
+ for (auto I = LoopB->begin(), E = LoopB->end(); I != E; ++I) {
+ if (isa<PHINode>(I) || !Users.count(&*I))
+ continue;
+ using namespace PatternMatch;
+ // Match lshr x, 1.
+ Value *V = nullptr;
+ if (match(&*I, m_LShr(m_Value(V), m_One()))) {
+ replaceAllUsesOfWithIn(&*I, V, LoopB);
+ continue;
+ }
+ // For each non-cycled operand, replace it with the corresponding
+ // value shifted left.
+ for (auto &J : I->operands()) {
+ Value *Op = J.get();
+ if (!isOperandShifted(&*I, Op))
+ continue;
+ if (Users.count(Op))
+ continue;
+ // Skip shifting zeros.
+ if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
+ continue;
+ // Check if we have already generated a shift for this value.
+ auto F = ShiftMap.find(Op);
+ Value *W = (F != ShiftMap.end()) ? F->second : nullptr;
+ if (W == nullptr) {
+ IRB.SetInsertPoint(&*I);
+          // First, the shift amount will be CIV or CIV+1, depending on
+          // whether the value is early or late. Instead of creating CIV+1,
+          // shift the value left by 1 first, then by CIV.
+ Value *ShAmt = CIV, *ShVal = Op;
+ auto *VTy = cast<IntegerType>(ShVal->getType());
+ auto *ATy = cast<IntegerType>(ShAmt->getType());
+ if (Late.count(&*I))
+ ShVal = IRB.CreateShl(Op, ConstantInt::get(VTy, 1));
+ // Second, the types of the shifted value and the shift amount
+ // must match.
+ if (VTy != ATy) {
+ if (VTy->getBitWidth() < ATy->getBitWidth())
+ ShVal = upcast(CastMap, IRB, ShVal, ATy);
+ else
+ ShAmt = upcast(CastMap, IRB, ShAmt, VTy);
+ }
+ // Ready to generate the shift and memoize it.
+ W = IRB.CreateShl(ShVal, ShAmt);
+ ShiftMap.insert(std::make_pair(Op, W));
+ }
+ I->replaceUsesOfWith(Op, W);
+ }
+ }
+
+  // Update the users outside of the loop to account for the left shifts.
+  // The values would normally be shifted right in the loop, so shift them
+  // right after the loop exit.
+ // Take advantage of the loop-closed SSA form, which has all the post-
+ // loop values in phi nodes.
+ IRB.SetInsertPoint(ExitB, ExitB->getFirstInsertionPt());
+ for (auto P = ExitB->begin(), Q = ExitB->end(); P != Q; ++P) {
+ if (!isa<PHINode>(P))
+ break;
+ auto *PN = cast<PHINode>(P);
+ Value *U = PN->getIncomingValueForBlock(LoopB);
+ if (!Users.count(U))
+ continue;
+ Value *S = IRB.CreateLShr(PN, ConstantInt::get(PN->getType(), IterCount));
+ PN->replaceAllUsesWith(S);
+ // The above RAUW will create
+ // S = lshr S, IterCount
+ // so we need to fix it back into
+ // S = lshr PN, IterCount
+ cast<User>(S)->replaceUsesOfWith(S, PN);
+ }
+
+ return true;
+}
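+
+// The net effect: a loop that right-shifts a value by 1 on every iteration
+// is rewritten so that external inputs are pre-shifted left by the count IV
+// (or by one extra bit for "late" values), the right shifts disappear, and
+// the loop-exit phis are shifted right by IterCount to recover the original
+// results.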
+
+
+void PolynomialMultiplyRecognize::cleanupLoopBody(BasicBlock *LoopB) {
+ for (auto &I : *LoopB)
+ if (Value *SV = SimplifyInstruction(&I, {DL, &TLI, &DT}))
+ I.replaceAllUsesWith(SV);
+
+ for (auto I = LoopB->begin(), N = I; I != LoopB->end(); I = N) {
+ N = std::next(I);
+ RecursivelyDeleteTriviallyDeadInstructions(&*I, &TLI);
+ }
+}
+
+
+unsigned PolynomialMultiplyRecognize::getInverseMxN(unsigned QP) {
+ // Arrays of coefficients of Q and the inverse, C.
+ // Q[i] = coefficient at x^i.
+ std::array<char,32> Q, C;
+
+ for (unsigned i = 0; i < 32; ++i) {
+ Q[i] = QP & 1;
+ QP >>= 1;
+ }
+ assert(Q[0] == 1);
+
+ // Find C, such that
+ // (Q[n]*x^n + ... + Q[1]*x + Q[0]) * (C[n]*x^n + ... + C[1]*x + C[0]) = 1
+ //
+ // For it to have a solution, Q[0] must be 1. Since this is Z2[x], the
+ // operations * and + are & and ^ respectively.
+ //
+ // Find C[i] recursively, by comparing i-th coefficient in the product
+ // with 0 (or 1 for i=0).
+ //
+  // C[0] = 1, since C[0]*Q[0] = 1 and Q[0] = 1.
+ C[0] = 1;
+ for (unsigned i = 1; i < 32; ++i) {
+ // Solve for C[i] in:
+ // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i]Q[0] = 0
+ // This is equivalent to
+ // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] ^ C[i] = 0
+ // which is
+ // C[0]Q[i] ^ C[1]Q[i-1] ^ ... ^ C[i-1]Q[1] = C[i]
+ unsigned T = 0;
+ for (unsigned j = 0; j < i; ++j)
+ T = T ^ (C[j] & Q[i-j]);
+ C[i] = T;
+ }
+
+ unsigned QV = 0;
+ for (unsigned i = 0; i < 32; ++i)
+ if (C[i])
+ QV |= (1 << i);
+
+ return QV;
+}
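+
+// For example, for QP = 0b11 (Q = x + 1), the recurrence gives C[i] = 1 for
+// all i, i.e. the function returns 0xFFFFFFFF: over Z2[x],
+// (1 + x)(1 + x + ... + x^31) = 1 + x^32 = 1 (mod x^32).
+// A minimal self-check sketch (hypothetical helper, not defined elsewhere):
+//   static uint32_t pmpy32(uint32_t A, uint32_t B) {
+//     uint32_t R = 0;
+//     for (unsigned i = 0; i < 32; ++i)    // carry-less multiply in Z2[x]
+//       if (B & (1u << i))
+//         R ^= A << i;
+//     return R;
+//   }
+//   // pmpy32(QP, getInverseMxN(QP)) == 1 should hold for any odd QP.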
+
+
+Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
+ ParsedValues &PV) {
+ IRBuilder<> B(&*At);
+ Module *M = At->getParent()->getParent()->getParent();
+ Value *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw);
+
+ Value *P = PV.P, *Q = PV.Q, *P0 = P;
+ unsigned IC = PV.IterCount;
+
+ if (PV.M != nullptr)
+ P0 = P = B.CreateXor(P, PV.M);
+
+ // Create a bit mask to clear the high bits beyond IterCount.
+ auto *BMI = ConstantInt::get(P->getType(), APInt::getLowBitsSet(32, IC));
+
+ if (PV.IterCount != 32)
+ P = B.CreateAnd(P, BMI);
+
+ if (PV.Inv) {
+ auto *QI = dyn_cast<ConstantInt>(PV.Q);
+ assert(QI && QI->getBitWidth() <= 32);
+
+ // Again, clearing bits beyond IterCount.
+ unsigned M = (1 << PV.IterCount) - 1;
+ unsigned Tmp = (QI->getZExtValue() | 1) & M;
+ unsigned QV = getInverseMxN(Tmp) & M;
+ auto *QVI = ConstantInt::get(QI->getType(), QV);
+ P = B.CreateCall(PMF, {P, QVI});
+ P = B.CreateTrunc(P, QI->getType());
+ if (IC != 32)
+ P = B.CreateAnd(P, BMI);
+ }
+
+ Value *R = B.CreateCall(PMF, {P, Q});
+
+ if (PV.M != nullptr)
+ R = B.CreateXor(R, B.CreateIntCast(P0, R->getType(), false));
+
+ return R;
+}
+
+
+void PolynomialMultiplyRecognize::setupSimplifier() {
+ Simp.addRule(
+ // Sink zext past bitwise operations.
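+    // zext (binop x y) -> (binop (zext x) (zext y))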
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::ZExt)
+ return nullptr;
+ Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
+ if (!T)
+ return nullptr;
+ switch (T->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ default:
+ return nullptr;
+ }
+ IRBuilder<> B(Ctx);
+ return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
+ B.CreateZExt(T->getOperand(0), I->getType()),
+ B.CreateZExt(T->getOperand(1), I->getType()));
+ });
+ Simp.addRule(
+ // (xor (and x a) (and y a)) -> (and (xor x y) a)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::Xor)
+ return nullptr;
+ Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
+ Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!And0 || !And1)
+ return nullptr;
+ if (And0->getOpcode() != Instruction::And ||
+ And1->getOpcode() != Instruction::And)
+ return nullptr;
+ if (And0->getOperand(1) != And1->getOperand(1))
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
+ And0->getOperand(1));
+ });
+ Simp.addRule(
+ // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
+ // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
+ if (!BO)
+ return nullptr;
+ Instruction::BinaryOps Op = BO->getOpcode();
+ if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
+ IRBuilder<> B(Ctx);
+ Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
+ Value *Z = BO->getOperand(1);
+ return B.CreateSelect(Sel->getCondition(),
+ B.CreateBinOp(Op, X, Z),
+ B.CreateBinOp(Op, Y, Z));
+ }
+ if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
+ IRBuilder<> B(Ctx);
+ Value *X = BO->getOperand(0);
+ Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
+ return B.CreateSelect(Sel->getCondition(),
+ B.CreateBinOp(Op, X, Y),
+ B.CreateBinOp(Op, X, Z));
+ }
+ return nullptr;
+ });
+ Simp.addRule(
+ // (select c (select c x y) z) -> (select c x z)
+ // (select c x (select c y z)) -> (select c x z)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ SelectInst *Sel = dyn_cast<SelectInst>(I);
+ if (!Sel)
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ Value *C = Sel->getCondition();
+ if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
+ if (Sel0->getCondition() == C)
+ return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
+ }
+ if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
+ if (Sel1->getCondition() == C)
+ return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
+ }
+ return nullptr;
+ });
+ Simp.addRule(
+ // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::Or)
+ return nullptr;
+ Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
+ if (!LShr || LShr->getOpcode() != Instruction::LShr)
+ return nullptr;
+ ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
+ if (!One || One->getZExtValue() != 1)
+ return nullptr;
+ ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
+ return nullptr;
+ return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
+ });
+ Simp.addRule(
+ // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ if (I->getOpcode() != Instruction::LShr)
+ return nullptr;
+ BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
+ if (!BitOp)
+ return nullptr;
+ switch (BitOp->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ default:
+ return nullptr;
+ }
+ IRBuilder<> B(Ctx);
+ Value *S = I->getOperand(1);
+ return B.CreateBinOp(BitOp->getOpcode(),
+ B.CreateLShr(BitOp->getOperand(0), S),
+ B.CreateLShr(BitOp->getOperand(1), S));
+ });
+ Simp.addRule(
+ // (BitOp1 (BitOp2 x a) b) -> (BitOp2 x (BitOp1 a b))
+ [](Instruction *I, LLVMContext &Ctx) -> Value* {
+ auto IsBitOp = [](unsigned Op) -> bool {
+ switch (Op) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return true;
+ }
+ return false;
+ };
+ BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
+ if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
+ return nullptr;
+ BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
+ if (!BitOp2 || !IsBitOp(BitOp2->getOpcode()))
+ return nullptr;
+ ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
+ ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
+ if (!CA || !CB)
+ return nullptr;
+ IRBuilder<> B(Ctx);
+ Value *X = BitOp2->getOperand(0);
+ return B.CreateBinOp(BitOp2->getOpcode(), X,
+ B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
+ });
+}
+
+
+bool PolynomialMultiplyRecognize::recognize() {
+ DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
+ << *CurLoop << '\n');
+ // Restrictions:
+ // - The loop must consist of a single block.
+ // - The iteration count must be known at compile-time.
+ // - The loop must have an induction variable starting from 0, and
+ // incremented in each iteration of the loop.
+ BasicBlock *LoopB = CurLoop->getHeader();
+ DEBUG(dbgs() << "Loop header:\n" << *LoopB);
+
+ if (LoopB != CurLoop->getLoopLatch())
+ return false;
+ BasicBlock *ExitB = CurLoop->getExitBlock();
+ if (ExitB == nullptr)
+ return false;
+ BasicBlock *EntryB = CurLoop->getLoopPreheader();
+ if (EntryB == nullptr)
+ return false;
+
+ unsigned IterCount = 0;
+ const SCEV *CT = SE.getBackedgeTakenCount(CurLoop);
+ if (isa<SCEVCouldNotCompute>(CT))
+ return false;
+ if (auto *CV = dyn_cast<SCEVConstant>(CT))
+ IterCount = CV->getValue()->getZExtValue() + 1;
+
+ Value *CIV = getCountIV(LoopB);
+ ParsedValues PV;
+ PV.IterCount = IterCount;
+ DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
+
+ setupSimplifier();
+
+ // Perform a preliminary scan of select instructions to see if any of them
+ // looks like a generator of the polynomial multiply steps. Assume that a
+ // loop can only contain a single transformable operation, so stop the
+  // traversal after the first reasonable candidate is found.
+ // XXX: Currently this approach can modify the loop before being 100% sure
+ // that the transformation can be carried out.
+ bool FoundPreScan = false;
+ for (Instruction &In : *LoopB) {
+ SelectInst *SI = dyn_cast<SelectInst>(&In);
+ if (!SI)
+ continue;
+
+ Simplifier::Context C(SI);
+ Value *T = Simp.simplify(C);
+ SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
+ DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
+ if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
+ FoundPreScan = true;
+ if (SelI != SI) {
+ Value *NewSel = C.materialize(LoopB, SI->getIterator());
+ SI->replaceAllUsesWith(NewSel);
+ RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
+ }
+ break;
+ }
+ }
+
+ if (!FoundPreScan) {
+ DEBUG(dbgs() << "Have not found candidates for pmpy\n");
+ return false;
+ }
+
+ if (!PV.Left) {
+ // The right shift version actually only returns the higher bits of
+ // the result (each iteration discards the LSB). If we want to convert it
+ // to a left-shifting loop, the working data type must be at least as
+ // wide as the target's pmpy instruction.
+ if (!promoteTypes(LoopB, ExitB))
+ return false;
+ if (!convertShiftsToLeft(LoopB, ExitB, IterCount))
+ return false;
+ cleanupLoopBody(LoopB);
+ }
+
+ // Scan the loop again, find the generating select instruction.
+ bool FoundScan = false;
+ for (Instruction &In : *LoopB) {
+ SelectInst *SelI = dyn_cast<SelectInst>(&In);
+ if (!SelI)
+ continue;
+ DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
+ FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
+ if (FoundScan)
+ break;
+ }
+ assert(FoundScan);
+
+ DEBUG({
+ StringRef PP = (PV.M ? "(P+M)" : "P");
+ if (!PV.Inv)
+ dbgs() << "Found pmpy idiom: R = " << PP << ".Q\n";
+ else
+ dbgs() << "Found inverse pmpy idiom: R = (" << PP << "/Q).Q) + "
+ << PP << "\n";
+ dbgs() << " Res:" << *PV.Res << "\n P:" << *PV.P << "\n";
+ if (PV.M)
+ dbgs() << " M:" << *PV.M << "\n";
+ dbgs() << " Q:" << *PV.Q << "\n";
+ dbgs() << " Iteration count:" << PV.IterCount << "\n";
+ });
+
+ BasicBlock::iterator At(EntryB->getTerminator());
+ Value *PM = generate(At, PV);
+ if (PM == nullptr)
+ return false;
+
+ if (PM->getType() != PV.Res->getType())
+ PM = IRBuilder<>(&*At).CreateIntCast(PM, PV.Res->getType(), false);
+
+ PV.Res->replaceAllUsesWith(PM);
+ PV.Res->eraseFromParent();
+ return true;
+}
+
+
+unsigned HexagonLoopIdiomRecognize::getStoreSizeInBytes(StoreInst *SI) {
+ uint64_t SizeInBits = DL->getTypeSizeInBits(SI->getValueOperand()->getType());
+  assert((SizeInBits & 7) == 0 && (SizeInBits >> 32) == 0 &&
+         "Don't overflow unsigned.");
+ return (unsigned)SizeInBits >> 3;
+}
+
+
+int HexagonLoopIdiomRecognize::getSCEVStride(const SCEVAddRecExpr *S) {
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getOperand(1)))
+ return SC->getAPInt().getSExtValue();
+ return 0;
+}
+
+
+bool HexagonLoopIdiomRecognize::isLegalStore(Loop *CurLoop, StoreInst *SI) {
+ // Allow volatile stores if HexagonVolatileMemcpy is enabled.
+ if (!(SI->isVolatile() && HexagonVolatileMemcpy) && !SI->isSimple())
+ return false;
+
+ Value *StoredVal = SI->getValueOperand();
+ Value *StorePtr = SI->getPointerOperand();
+
+ // Reject stores that are so large that they overflow an unsigned.
+ uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
+ if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ auto *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return false;
+
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ int Stride = getSCEVStride(StoreEv);
+ if (Stride == 0)
+ return false;
+ unsigned StoreSize = getStoreSizeInBytes(SI);
+ if (StoreSize != unsigned(std::abs(Stride)))
+ return false;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ Value *LoadPtr = LI->getPointerOperand();
+ auto *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LoadPtr));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return false;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return false;
+
+ // Success. This store can be converted into a memcpy.
+ return true;
+}
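+
+// The shape being matched is essentially
+//   for (i = 0; i < n; ++i)
+//     p[i] = q[i];
+// where the store and the load advance by the same stride, equal to the
+// size of the stored element.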
+
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access. The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+ const SCEV *BECount, unsigned StoreSize,
+ AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Ignored) {
+ // Get the location that may be stored across the loop. Since the access
+ // is strided positively through memory, we say that the modified location
+ // starts at the pointer and has infinite size.
+ uint64_t AccessSize = MemoryLocation::UnknownSize;
+
+  // If the loop iterates a fixed number of times, we can refine the access
+  // size to be exactly the size of the accessed region, which is
+  // (BECount+1)*StoreSize.
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ AccessSize = (BECst->getValue()->getZExtValue() + 1) * StoreSize;
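+  // E.g. with BECount = 99 and StoreSize = 4, AccessSize = 100 * 4 = 400.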
+
+  // TODO: For this to be really effective, we have to dive into the pointer
+  // operand in the store. A store to &A[i] over 100 iterations will always
+  // be reported as may-alias with a store to &A[100]; we need StoreLoc to
+  // be "A" with a size of 100, which will then no-alias a store to &A[100].
+ MemoryLocation StoreLoc(Ptr, AccessSize);
+
+ for (auto *B : L->blocks())
+ for (auto &I : *B)
+ if (Ignored.count(&I) == 0 && (AA.getModRefInfo(&I, StoreLoc) & Access))
+ return true;
+
+ return false;
+}
+
+
+void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
+ SmallVectorImpl<StoreInst*> &Stores) {
+ Stores.clear();
+ for (Instruction &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ if (isLegalStore(CurLoop, SI))
+ Stores.push_back(SI);
+}
+
+
+bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
+ StoreInst *SI, const SCEV *BECount) {
+ assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
+ "Expected only non-volatile stores, or Hexagon-specific memcpy"
+ "to volatile destination.");
+
+ Value *StorePtr = SI->getPointerOperand();
+ auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ unsigned Stride = getSCEVStride(StoreEv);
+ unsigned StoreSize = getStoreSizeInBytes(SI);
+ if (Stride != StoreSize)
+ return false;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+ auto *LoadEv = cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+  // The trip count of the loop and the base pointer of the addrec SCEV are
+  // guaranteed to be loop invariant, which means that they should dominate
+  // the header. This allows us to insert code for them in the preheader.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ Instruction *ExpPt = Preheader->getTerminator();
+ IRBuilder<> Builder(ExpPt);
+ SCEVExpander Expander(*SE, *DL, "hexagon-loop-idiom");
+
+ Type *IntPtrTy = Builder.getIntPtrTy(*DL, SI->getPointerAddressSpace());
+
+ // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+ // this into a memcpy/memmove in the loop preheader now if we want. However,
+ // this would be unsafe to do if there is anything else in the loop that may
+ // read or write the memory region we're storing to. For memcpy, this
+ // includes the load that feeds the stores. Check for an alias by generating
+ // the base address and checking everything.
+ Value *StoreBasePtr = Expander.expandCodeFor(StoreEv->getStart(),
+ Builder.getInt8PtrTy(SI->getPointerAddressSpace()), ExpPt);
+ Value *LoadBasePtr = nullptr;
+
+ bool Overlap = false;
+ bool DestVolatile = SI->isVolatile();
+ Type *BECountTy = BECount->getType();
+
+ if (DestVolatile) {
+ // The trip count must fit in i32, since it is the type of the "num_words"
+ // argument to hexagon_memcpy_forward_vp4cp4n2.
+ if (StoreSize != 4 || DL->getTypeSizeInBits(BECountTy) > 32) {
+CleanupAndExit:
+ // If we generated new code for the base pointer, clean up.
+ Expander.clear();
+ if (StoreBasePtr && (LoadBasePtr != StoreBasePtr)) {
+ RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ StoreBasePtr = nullptr;
+ }
+ if (LoadBasePtr) {
+ RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
+ LoadBasePtr = nullptr;
+ }
+ return false;
+ }
+ }
+
+ SmallPtrSet<Instruction*, 2> Ignore1;
+ Ignore1.insert(SI);
+ if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+ StoreSize, *AA, Ignore1)) {
+ // Check if the load is the offending instruction.
+ Ignore1.insert(LI);
+ if (mayLoopAccessLocation(StoreBasePtr, MRI_ModRef, CurLoop, BECount,
+ StoreSize, *AA, Ignore1)) {
+ // Still bad. Nothing we can do.
+ goto CleanupAndExit;
+ }
+ // It worked with the load ignored.
+ Overlap = true;
+ }
+
+ if (!Overlap) {
+ if (DisableMemcpyIdiom || !HasMemcpy)
+ goto CleanupAndExit;
+ } else {
+ // Don't generate memmove if this function will be inlined. This is
+ // because the caller will undergo this transformation after inlining.
+ Function *Func = CurLoop->getHeader()->getParent();
+ if (Func->hasFnAttribute(Attribute::AlwaysInline))
+ goto CleanupAndExit;
+
+    // In case of a memmove, the call to memmove will be executed instead
+    // of the loop, so we need to make sure that there is nothing else in
+    // the loop other than the load, the store, and the instructions that
+    // these two depend on.
+ SmallVector<Instruction*,2> Insts;
+ Insts.push_back(SI);
+ Insts.push_back(LI);
+ if (!coverLoop(CurLoop, Insts))
+ goto CleanupAndExit;
+
+ if (DisableMemmoveIdiom || !HasMemmove)
+ goto CleanupAndExit;
+    bool IsNested = CurLoop->getParentLoop() != nullptr;
+ if (IsNested && OnlyNonNestedMemmove)
+ goto CleanupAndExit;
+ }
+
+ // For a memcpy, we have to make sure that the input array is not being
+ // mutated by the loop.
+ LoadBasePtr = Expander.expandCodeFor(LoadEv->getStart(),
+ Builder.getInt8PtrTy(LI->getPointerAddressSpace()), ExpPt);
+
+ SmallPtrSet<Instruction*, 2> Ignore2;
+ Ignore2.insert(SI);
+ if (mayLoopAccessLocation(LoadBasePtr, MRI_Mod, CurLoop, BECount, StoreSize,
+ *AA, Ignore2))
+ goto CleanupAndExit;
+
+ // Check the stride.
+ bool StridePos = getSCEVStride(LoadEv) >= 0;
+
+ // Currently, the volatile memcpy only emulates traversing memory forward.
+ if (!StridePos && DestVolatile)
+ goto CleanupAndExit;
+
+ bool RuntimeCheck = (Overlap || DestVolatile);
+
+ BasicBlock *ExitB;
+ if (RuntimeCheck) {
+ // The runtime check needs a single exit block.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1)
+ goto CleanupAndExit;
+ ExitB = ExitBlocks[0];
+ }
+
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ LLVMContext &Ctx = SI->getContext();
+ BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
+ unsigned Alignment = std::min(SI->getAlignment(), LI->getAlignment());
+ DebugLoc DLoc = SI->getDebugLoc();
+
+ const SCEV *NumBytesS =
+ SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+ if (StoreSize != 1)
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
+ SCEV::FlagNUW);
+ Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, ExpPt);
+ if (Instruction *In = dyn_cast<Instruction>(NumBytes))
+ if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
+ NumBytes = Simp;
+
+ CallInst *NewCall;
+
+ if (RuntimeCheck) {
+ unsigned Threshold = RuntimeMemSizeThreshold;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes)) {
+ uint64_t C = CI->getZExtValue();
+ if (Threshold != 0 && C < Threshold)
+ goto CleanupAndExit;
+ if (C < CompileTimeMemSizeThreshold)
+ goto CleanupAndExit;
+ }
+
+ BasicBlock *Header = CurLoop->getHeader();
+ Function *Func = Header->getParent();
+ Loop *ParentL = LF->getLoopFor(Preheader);
+ StringRef HeaderName = Header->getName();
+
+ // Create a new (empty) preheader, and update the PHI nodes in the
+ // header to use the new preheader.
+ BasicBlock *NewPreheader = BasicBlock::Create(Ctx, HeaderName+".rtli.ph",
+ Func, Header);
+ if (ParentL)
+ ParentL->addBasicBlockToLoop(NewPreheader, *LF);
+ IRBuilder<>(NewPreheader).CreateBr(Header);
+ for (auto &In : *Header) {
+ PHINode *PN = dyn_cast<PHINode>(&In);
+ if (!PN)
+ break;
+ int bx = PN->getBasicBlockIndex(Preheader);
+ if (bx >= 0)
+ PN->setIncomingBlock(bx, NewPreheader);
+ }
+ DT->addNewBlock(NewPreheader, Preheader);
+ DT->changeImmediateDominator(Header, NewPreheader);
+
+ // Check for safe conditions to execute memmove.
+ // If stride is positive, copying things from higher to lower addresses
+ // is equivalent to memmove. For negative stride, it's the other way
+ // around. Copying forward in memory with positive stride may not be
+    // the same as memmove since we may be copying values that we just
+    // stored in some previous iteration.
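+    // E.g. for a positive stride, a copy with the store region starting
+    // below the load region always moves each byte before it is
+    // overwritten; otherwise the regions must be at least NumBytes apart.
+    // The two comparisons below encode exactly these two safe cases.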
+ Value *LA = Builder.CreatePtrToInt(LoadBasePtr, IntPtrTy);
+ Value *SA = Builder.CreatePtrToInt(StoreBasePtr, IntPtrTy);
+ Value *LowA = StridePos ? SA : LA;
+ Value *HighA = StridePos ? LA : SA;
+ Value *CmpA = Builder.CreateICmpULT(LowA, HighA);
+ Value *Cond = CmpA;
+
+ // Check for distance between pointers.
+ Value *Dist = Builder.CreateSub(HighA, LowA);
+ Value *CmpD = Builder.CreateICmpSLT(NumBytes, Dist);
+ Value *CmpEither = Builder.CreateOr(Cond, CmpD);
+ Cond = CmpEither;
+
+ if (Threshold != 0) {
+ Type *Ty = NumBytes->getType();
+ Value *Thr = ConstantInt::get(Ty, Threshold);
+ Value *CmpB = Builder.CreateICmpULT(Thr, NumBytes);
+ Value *CmpBoth = Builder.CreateAnd(Cond, CmpB);
+ Cond = CmpBoth;
+ }
+ BasicBlock *MemmoveB = BasicBlock::Create(Ctx, Header->getName()+".rtli",
+ Func, NewPreheader);
+ if (ParentL)
+ ParentL->addBasicBlockToLoop(MemmoveB, *LF);
+ Instruction *OldT = Preheader->getTerminator();
+ Builder.CreateCondBr(Cond, MemmoveB, NewPreheader);
+ OldT->eraseFromParent();
+ Preheader->setName(Preheader->getName()+".old");
+ DT->addNewBlock(MemmoveB, Preheader);
+ // Find the new immediate dominator of the exit block.
+ BasicBlock *ExitD = Preheader;
+ for (auto PI = pred_begin(ExitB), PE = pred_end(ExitB); PI != PE; ++PI) {
+ BasicBlock *PB = *PI;
+ ExitD = DT->findNearestCommonDominator(ExitD, PB);
+ if (!ExitD)
+ break;
+ }
+ // If the prior immediate dominator of ExitB was dominated by the
+ // old preheader, then the old preheader becomes the new immediate
+ // dominator. Otherwise don't change anything (because the newly
+ // added blocks are dominated by the old preheader).
+ if (ExitD && DT->dominates(Preheader, ExitD)) {
+ DomTreeNode *BN = DT->getNode(ExitB);
+ DomTreeNode *DN = DT->getNode(ExitD);
+ BN->setIDom(DN);
+ }
+
+ // Add a call to memmove to the conditional block.
+ IRBuilder<> CondBuilder(MemmoveB);
+ CondBuilder.CreateBr(ExitB);
+ CondBuilder.SetInsertPoint(MemmoveB->getTerminator());
+
+ if (DestVolatile) {
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int32PtrTy = Type::getInt32PtrTy(Ctx);
+ Type *VoidTy = Type::getVoidTy(Ctx);
+ Module *M = Func->getParent();
+ Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
+ Int32PtrTy, Int32PtrTy, Int32Ty);
+ Function *Fn = cast<Function>(CF);
+ Fn->setLinkage(Function::ExternalLinkage);
+
+ const SCEV *OneS = SE->getConstant(Int32Ty, 1);
+ const SCEV *BECount32 = SE->getTruncateOrZeroExtend(BECount, Int32Ty);
+ const SCEV *NumWordsS = SE->getAddExpr(BECount32, OneS, SCEV::FlagNUW);
+ Value *NumWords = Expander.expandCodeFor(NumWordsS, Int32Ty,
+ MemmoveB->getTerminator());
+ if (Instruction *In = dyn_cast<Instruction>(NumWords))
+ if (Value *Simp = SimplifyInstruction(In, {*DL, TLI, DT}))
+ NumWords = Simp;
+
+ Value *Op0 = (StoreBasePtr->getType() == Int32PtrTy)
+ ? StoreBasePtr
+ : CondBuilder.CreateBitCast(StoreBasePtr, Int32PtrTy);
+ Value *Op1 = (LoadBasePtr->getType() == Int32PtrTy)
+ ? LoadBasePtr
+ : CondBuilder.CreateBitCast(LoadBasePtr, Int32PtrTy);
+ NewCall = CondBuilder.CreateCall(Fn, {Op0, Op1, NumWords});
+ } else {
+ NewCall = CondBuilder.CreateMemMove(StoreBasePtr, LoadBasePtr,
+ NumBytes, Alignment);
+ }
+ } else {
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr,
+ NumBytes, Alignment);
+ // Okay, the memcpy has been formed. Zap the original store and
+ // anything that feeds into it.
+ RecursivelyDeleteTriviallyDeadInstructions(SI, TLI);
+ }
+
+ NewCall->setDebugLoc(DLoc);
+
+ DEBUG(dbgs() << " Formed " << (Overlap ? "memmove: " : "memcpy: ")
+ << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI << "\n");
+
+ return true;
+}
+
+
+// \brief Check if the instructions in Insts, together with their dependencies,
+// cover the loop in the sense that the loop could be safely eliminated once
+// the instructions in Insts are removed.
+bool HexagonLoopIdiomRecognize::coverLoop(Loop *L,
+ SmallVectorImpl<Instruction*> &Insts) const {
+ SmallSet<BasicBlock*,8> LoopBlocks;
+ for (auto *B : L->blocks())
+ LoopBlocks.insert(B);
+
+ SetVector<Instruction*> Worklist(Insts.begin(), Insts.end());
+
+ // Collect all instructions from the loop that the instructions in Insts
+ // depend on (plus their dependencies, etc.). These instructions will
+ // constitute the expression trees that feed those in Insts, but the trees
+ // will be limited only to instructions contained in the loop.
+ for (unsigned i = 0; i < Worklist.size(); ++i) {
+ Instruction *In = Worklist[i];
+ for (auto I = In->op_begin(), E = In->op_end(); I != E; ++I) {
+ Instruction *OpI = dyn_cast<Instruction>(I);
+ if (!OpI)
+ continue;
+ BasicBlock *PB = OpI->getParent();
+ if (!LoopBlocks.count(PB))
+ continue;
+ Worklist.insert(OpI);
+ }
+ }
+
+ // Scan all instructions in the loop, if any of them have a user outside
+ // of the loop, or outside of the expressions collected above, then either
+ // the loop has a side-effect visible outside of it, or there are
+ // instructions in it that are not involved in the original set Insts.
+ for (auto *B : L->blocks()) {
+ for (auto &In : *B) {
+ if (isa<BranchInst>(In) || isa<DbgInfoIntrinsic>(In))
+ continue;
+ if (!Worklist.count(&In) && In.mayHaveSideEffects())
+ return false;
+ for (const auto &K : In.users()) {
+ Instruction *UseI = dyn_cast<Instruction>(K);
+ if (!UseI)
+ continue;
+ BasicBlock *UseB = UseI->getParent();
+ if (LF->getLoopFor(UseB) != L)
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count. This block is known to be in the current
+/// loop and not in any subloops.
+bool HexagonLoopIdiomRecognize::runOnLoopBlock(Loop *CurLoop, BasicBlock *BB,
+ const SCEV *BECount, SmallVectorImpl<BasicBlock*> &ExitBlocks) {
+ // We can only promote stores in this block if they are unconditionally
+ // executed in the loop. For a block to be unconditionally executed, it has
+ // to dominate all the exit blocks of the loop. Verify this now.
+ auto DominatedByBB = [this,BB] (BasicBlock *EB) -> bool {
+ return DT->dominates(BB, EB);
+ };
+ if (!std::all_of(ExitBlocks.begin(), ExitBlocks.end(), DominatedByBB))
+ return false;
+
+ bool MadeChange = false;
+ // Look for store instructions, which may be optimized to memset/memcpy.
+ SmallVector<StoreInst*,8> Stores;
+ collectStores(CurLoop, BB, Stores);
+
+  // Optimize the store into a memcpy, if it feeds a similarly strided load.
+ for (auto &SI : Stores)
+ MadeChange |= processCopyingStore(CurLoop, SI, BECount);
+
+ return MadeChange;
+}
+
+
+bool HexagonLoopIdiomRecognize::runOnCountableLoop(Loop *L) {
+ PolynomialMultiplyRecognize PMR(L, *DL, *DT, *TLI, *SE);
+ if (PMR.recognize())
+ return true;
+
+ if (!HasMemcpy && !HasMemmove)
+ return false;
+
+ const SCEV *BECount = SE->getBackedgeTakenCount(L);
+ assert(!isa<SCEVCouldNotCompute>(BECount) &&
+ "runOnCountableLoop() called on a loop without a predictable"
+ "backedge-taken count");
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ bool Changed = false;
+
+ // Scan all the blocks in the loop that are not in subloops.
+ for (auto *BB : L->getBlocks()) {
+ // Ignore blocks in subloops.
+ if (LF->getLoopFor(BB) != L)
+ continue;
+ Changed |= runOnLoopBlock(L, BB, BECount, ExitBlocks);
+ }
+
+ return Changed;
+}
+
+
+bool HexagonLoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+ const Module &M = *L->getHeader()->getParent()->getParent();
+ if (Triple(M.getTargetTriple()).getArch() != Triple::hexagon)
+ return false;
+
+ if (skipLoop(L))
+ return false;
+
+  // If the loop could not be converted to canonical form, it probably has
+  // an indirectbr in it; just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+  // Disable loop idiom recognition for functions that are themselves
+  // implementations of these routines, to avoid self-recursive calls.
+ StringRef Name = L->getHeader()->getParent()->getName();
+ if (Name == "memset" || Name == "memcpy" || Name == "memmove")
+ return false;
+
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DL = &L->getHeader()->getModule()->getDataLayout();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LF = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ HasMemcpy = TLI->has(LibFunc_memcpy);
+ HasMemmove = TLI->has(LibFunc_memmove);
+
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop(L);
+ return false;
+}
+
+
+Pass *llvm::createHexagonLoopIdiomPass() {
+ return new HexagonLoopIdiomRecognize();
+}
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
index a5dc002..072501d 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -39,7 +39,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
// Populate the relocation type based on Hexagon target flags
// set on an operand
MCSymbolRefExpr::VariantKind RelocationType;
- switch (MO.getTargetFlags()) {
+ switch (MO.getTargetFlags() & ~HexagonII::HMOTF_ConstExtended) {
default:
RelocationType = MCSymbolRefExpr::VK_None;
break;
@@ -109,11 +109,14 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
+ case MachineOperand::MO_RegisterMask:
+ continue;
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
- if (MO.isImplicit()) continue;
+ if (MO.isImplicit())
+ continue;
MCO = MCOperand::createReg(MO.getReg());
break;
case MachineOperand::MO_FPImmediate: {
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 9ff9d93..1a26805 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -49,7 +49,7 @@ static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden,
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
namespace {
class HexagonCallMutation : public ScheduleDAGMutation {
@@ -74,7 +74,9 @@ bool HexagonCallMutation::shouldTFRICallBind(const HexagonInstrInfo &HII,
return false;
// TypeXTYPE are 64 bit operations.
- if (HII.getType(*Inst2.getInstr()) == HexagonII::TypeXTYPE)
+ unsigned Type = HII.getType(*Inst2.getInstr());
+ if (Type == HexagonII::TypeS_2op || Type == HexagonII::TypeS_3op ||
+ Type == HexagonII::TypeALU64 || Type == HexagonII::TypeM)
return true;
return false;
}
@@ -561,40 +563,33 @@ void ConvergingVLIWScheduler::readyQueueVerboseDump(
}
#endif
-/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
-/// of SU, return it, otherwise return null.
-static SUnit *getSingleUnscheduledPred(SUnit *SU) {
- SUnit *OnlyAvailablePred = nullptr;
- for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
- I != E; ++I) {
- SUnit &Pred = *I->getSUnit();
- if (!Pred.isScheduled) {
- // We found an available, but not scheduled, predecessor. If it's the
- // only one we have found, keep track of it... otherwise give up.
- if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
- return nullptr;
- OnlyAvailablePred = &Pred;
- }
+/// isSingleUnscheduledPred - If SU2 is the only unscheduled predecessor
+/// of SU, return true (we may have duplicates)
+static inline bool isSingleUnscheduledPred(SUnit *SU, SUnit *SU2) {
+ if (SU->NumPredsLeft == 0)
+ return false;
+
+ for (auto &Pred : SU->Preds) {
+ // We found an available, but not scheduled, predecessor.
+ if (!Pred.getSUnit()->isScheduled && (Pred.getSUnit() != SU2))
+ return false;
}
- return OnlyAvailablePred;
+
+ return true;
}
-/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
-/// of SU, return it, otherwise return null.
-static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
- SUnit *OnlyAvailableSucc = nullptr;
- for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
- I != E; ++I) {
- SUnit &Succ = *I->getSUnit();
- if (!Succ.isScheduled) {
- // We found an available, but not scheduled, successor. If it's the
- // only one we have found, keep track of it... otherwise give up.
- if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
- return nullptr;
- OnlyAvailableSucc = &Succ;
- }
+/// isSingleUnscheduledSucc - If SU2 is the only unscheduled successor
+/// of SU, return true (we may have duplicates)
+static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) {
+ if (SU->NumSuccsLeft == 0)
+ return false;
+
+ for (auto &Succ : SU->Succs) {
+ // We found an available, but not scheduled, successor.
+ if (!Succ.getSUnit()->isScheduled && (Succ.getSUnit() != SU2))
+ return false;
}
- return OnlyAvailableSucc;
+ return true;
}
// Constants used to denote relative importance of
@@ -671,12 +666,12 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// Count the number of nodes that
// this node is the sole unscheduled node for.
for (const SDep &SI : SU->Succs)
- if (getSingleUnscheduledPred(SI.getSUnit()) == SU)
+ if (isSingleUnscheduledPred(SI.getSUnit(), SU))
++NumNodesBlocking;
} else {
// How many unscheduled predecessors block this node?
for (const SDep &PI : SU->Preds)
- if (getSingleUnscheduledSucc(PI.getSUnit()) == SU)
+ if (isSingleUnscheduledSucc(PI.getSUnit(), SU))
++NumNodesBlocking;
}
ResCount += (NumNodesBlocking * ScaleTwo);
@@ -742,7 +737,7 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
// Give less preference to an instruction that will cause a stall with
// an instruction in the previous packet.
- if (QII.isV60VectorInstruction(Instr)) {
+ if (QII.isHVXVec(Instr)) {
// Check for stalls in the previous packet.
if (Q.getID() == TopQID) {
for (auto J : Top.ResourceModel->OldPacket)
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
index dc10028..810abf3 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -32,14 +32,10 @@
using namespace llvm;
namespace llvm {
-//===----------------------------------------------------------------------===//
-// ConvergingVLIWScheduler - Implementation of the standard
-// MachineSchedStrategy.
-//===----------------------------------------------------------------------===//
class VLIWResourceModel {
/// ResourcesModel - Represents VLIW state.
- /// Not limited to VLIW targets per say, but assumes
+ /// Not limited to VLIW targets per se, but assumes
/// definition of DFA by a target.
DFAPacketizer *ResourcesModel;
@@ -110,6 +106,11 @@ public:
void schedule() override;
};
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
/// to balance the schedule.
class ConvergingVLIWScheduler : public MachineSchedStrategy {
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
new file mode 100644
index 0000000..0b4ac14
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
@@ -0,0 +1,204 @@
+//===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+multiclass T_VR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2),
+ (MI VectorRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
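+
+// Each of these multiclasses instantiates two selection patterns for an
+// intrinsic/instruction pair: one for 64-byte HVX single mode (UseHVXSgl)
+// and one for 128-byte double mode (UseHVXDbl), formed by appending "_128B"
+// to both the intrinsic and the instruction name.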
+
+multiclass T_VVL_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegsLow8:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, IntRegsLow8:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegsLow8:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegsLow8:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2),
+ (MI VectorRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WW_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2),
+ (MI VecDblRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WWR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+ (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2, IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_ZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2),
+ (MI VecPredRegs:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, IntRegs:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, IntRegs:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3),
+ (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VecPredRegs128B:$src2, IntRegs:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VecPredRegs128B:$src2, IntRegs:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_ZV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2),
+ (MI VecPredRegs:$src1, VectorRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, VectorRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, VectorRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_R_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID IntRegs:$src1),
+ (MI IntRegs:$src1)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
+ (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_ZZ_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2),
+ (MI VecPredRegs:$src1, VecPredRegs:$src2)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, VecPredRegs128B:$src2),
+ (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, VecPredRegs128B:$src2)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3),
+ (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, imm:$src3),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, imm:$src3)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_VVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4),
+ (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4)>,
+ Requires<[UseHVXDbl]>;
+}
+
+multiclass T_WVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+ def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4),
+ (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4)>,
+ Requires<[UseHVXSgl]>;
+ def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4),
+ (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4)>,
+ Requires<[UseHVXDbl]>;
+}
+
+def : T_R_pat <S6_vsplatrbp, int_hexagon_S6_vsplatrbp>;
+def : T_PP_pat <M6_vabsdiffb, int_hexagon_M6_vabsdiffb>;
+def : T_PP_pat <M6_vabsdiffub, int_hexagon_M6_vabsdiffub>;
+def : T_PP_pat <S6_vtrunehb_ppp, int_hexagon_S6_vtrunehb_ppp>;
+def : T_PP_pat <S6_vtrunohb_ppp, int_hexagon_S6_vtrunohb_ppp>;
+
+defm : T_VR_HVX_gen_pat <V6_vlsrb, int_hexagon_V6_vlsrb>;
+defm : T_VR_HVX_gen_pat <V6_vmpyiwub, int_hexagon_V6_vmpyiwub>;
+defm : T_VVL_HVX_gen_pat <V6_vasrwuhrndsat, int_hexagon_V6_vasrwuhrndsat>;
+defm : T_VVL_HVX_gen_pat <V6_vasruwuhrndsat, int_hexagon_V6_vasruwuhrndsat>;
+defm : T_VVL_HVX_gen_pat <V6_vasrhbsat, int_hexagon_V6_vasrhbsat>;
+defm : T_VVL_HVX_gen_pat <V6_vlutvvb_nm, int_hexagon_V6_vlutvvb_nm>;
+defm : T_VVL_HVX_gen_pat <V6_vlutvwh_nm, int_hexagon_V6_vlutvwh_nm>;
+defm : T_VV_HVX_gen_pat <V6_vrounduwuh, int_hexagon_V6_vrounduwuh>;
+defm : T_VV_HVX_gen_pat <V6_vrounduhub, int_hexagon_V6_vrounduhub>;
+defm : T_VV_HVX_gen_pat <V6_vadduwsat, int_hexagon_V6_vadduwsat>;
+defm : T_VV_HVX_gen_pat <V6_vsubuwsat, int_hexagon_V6_vsubuwsat>;
+defm : T_VV_HVX_gen_pat <V6_vaddbsat, int_hexagon_V6_vaddbsat>;
+defm : T_VV_HVX_gen_pat <V6_vsubbsat, int_hexagon_V6_vsubbsat>;
+defm : T_VV_HVX_gen_pat <V6_vaddububb_sat, int_hexagon_V6_vaddububb_sat>;
+defm : T_VV_HVX_gen_pat <V6_vsubububb_sat, int_hexagon_V6_vsubububb_sat>;
+defm : T_VV_HVX_gen_pat <V6_vmpyewuh_64, int_hexagon_V6_vmpyewuh_64>;
+defm : T_VV_HVX_gen_pat <V6_vmaxb, int_hexagon_V6_vmaxb>;
+defm : T_VV_HVX_gen_pat <V6_vminb, int_hexagon_V6_vminb>;
+defm : T_VV_HVX_gen_pat <V6_vsatuwuh, int_hexagon_V6_vsatuwuh>;
+defm : T_VV_HVX_gen_pat <V6_vaddclbw, int_hexagon_V6_vaddclbw>;
+defm : T_VV_HVX_gen_pat <V6_vaddclbh, int_hexagon_V6_vaddclbh>;
+defm : T_WW_HVX_gen_pat <V6_vadduwsat_dv, int_hexagon_V6_vadduwsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vsubuwsat_dv, int_hexagon_V6_vsubuwsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vaddbsat_dv, int_hexagon_V6_vaddbsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vsubbsat_dv, int_hexagon_V6_vsubbsat_dv>;
+defm : T_WVV_HVX_gen_pat <V6_vaddhw_acc, int_hexagon_V6_vaddhw_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vadduhw_acc, int_hexagon_V6_vadduhw_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vaddubh_acc, int_hexagon_V6_vaddubh_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vmpyowh_64_acc, int_hexagon_V6_vmpyowh_64_acc>;
+defm : T_WR_HVX_gen_pat <V6_vmpauhb, int_hexagon_V6_vmpauhb>;
+defm : T_WWR_HVX_gen_pat <V6_vmpauhb_acc, int_hexagon_V6_vmpauhb_acc>;
+defm : T_VVR_HVX_gen_pat <V6_vmpyiwub_acc, int_hexagon_V6_vmpyiwub_acc>;
+defm : T_ZR_HVX_gen_pat <V6_vandnqrt, int_hexagon_V6_vandnqrt>;
+defm : T_VZR_HVX_gen_pat <V6_vandnqrt_acc, int_hexagon_V6_vandnqrt_acc>;
+defm : T_ZV_HVX_gen_pat <V6_vandvqv, int_hexagon_V6_vandvqv>;
+defm : T_ZV_HVX_gen_pat <V6_vandvnqv, int_hexagon_V6_vandvnqv>;
+defm : T_R_HVX_gen_pat <V6_pred_scalar2v2, int_hexagon_V6_pred_scalar2v2>;
+defm : T_R_HVX_gen_pat <V6_lvsplath, int_hexagon_V6_lvsplath>;
+defm : T_R_HVX_gen_pat <V6_lvsplatb, int_hexagon_V6_lvsplatb>;
+defm : T_ZZ_HVX_gen_pat <V6_shuffeqw, int_hexagon_V6_shuffeqw>;
+defm : T_ZZ_HVX_gen_pat <V6_shuffeqh, int_hexagon_V6_shuffeqh>;
+defm : T_VVI_HVX_gen_pat <V6_vlutvvbi, int_hexagon_V6_vlutvvbi>;
+defm : T_VVI_HVX_gen_pat <V6_vlutvwhi, int_hexagon_V6_vlutvwhi>;
+defm : T_VVVI_HVX_gen_pat <V6_vlutvvb_oracci, int_hexagon_V6_vlutvvb_oracci>;
+defm : T_WVVI_HVX_gen_pat <V6_vlutvwh_oracci, int_hexagon_V6_vlutvwh_oracci>;
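Each *_HVX_gen_pat multiclass above stamps out two selection patterns from one definition: a 64-byte-mode pattern guarded by UseHVXSgl, and a 128-byte-mode twin obtained by !cast-ing the intrinsic and instruction names with a "_128B" suffix, guarded by UseHVXDbl. A rough C++ analogy of that suffix-based dispatch (the registry and names here are hypothetical, purely illustrative of the naming convention):

    #include <functional>
    #include <map>
    #include <string>

    // Hypothetical registry keyed by intrinsic name; appending "_128B"
    // selects the double-vector variant, mirroring !cast<...>(IntID#"_128B").
    static std::map<std::string, std::function<void()>> Registry;

    void selectFor(const std::string &IntID, bool Use128B) {
      Registry.at(Use128B ? IntID + "_128B" : IntID)(); // throws if absent
    }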
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 72d8011..e93f075 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -69,9 +69,7 @@ namespace {
public:
static char ID;
- HexagonNewValueJump() : MachineFunctionPass(ID) {
- initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
- }
+ HexagonNewValueJump() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBranchProbabilityInfo>();
@@ -130,6 +128,8 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
if (II->getOpcode() == TargetOpcode::KILL)
return false;
+ if (II->isImplicitDef())
+ return false;
// Make sure there is no 'def' or 'use' of any of the uses of the
// feeder insn between its definition, this MI, and the jump, jmpInst
@@ -443,8 +443,6 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
unsigned predReg = 0; // predicate reg of the jump.
unsigned cmpReg1 = 0;
int cmpOp2 = 0;
- bool MO1IsKill = false;
- bool MO2IsKill = false;
MachineBasicBlock::iterator jmpPos;
MachineBasicBlock::iterator cmpPos;
MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
@@ -546,14 +544,10 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// We need cmpReg1 and cmpOp2 (imm or reg) while building the
// new value jump instruction.
cmpReg1 = MI.getOperand(1).getReg();
- if (MI.getOperand(1).isKill())
- MO1IsKill = true;
- if (isSecondOpReg) {
+ if (isSecondOpReg)
cmpOp2 = MI.getOperand(2).getReg();
- if (MI.getOperand(2).isKill())
- MO2IsKill = true;
- } else
+ else
cmpOp2 = MI.getOperand(2).getImm();
continue;
}
@@ -603,11 +597,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
if ((COp == Hexagon::C2_cmpeq || COp == Hexagon::C4_cmpneq) &&
(feederReg == (unsigned) cmpOp2)) {
unsigned tmp = cmpReg1;
- bool tmpIsKill = MO1IsKill;
cmpReg1 = cmpOp2;
- MO1IsKill = MO2IsKill;
cmpOp2 = tmp;
- MO2IsKill = tmpIsKill;
}
// Now we have swapped the operands, all we need to check is,
@@ -621,31 +612,33 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
// make sure we are respecting the kill values of
// the operands of the feeder.
- bool updatedIsKill = false;
- for (unsigned i = 0; i < MI.getNumOperands(); i++) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.isReg() && MO.isUse()) {
- unsigned feederReg = MO.getReg();
- for (MachineBasicBlock::iterator localII = feederPos,
- end = jmpPos; localII != end; localII++) {
- MachineInstr &localMI = *localII;
- for (unsigned j = 0; j < localMI.getNumOperands(); j++) {
- MachineOperand &localMO = localMI.getOperand(j);
- if (localMO.isReg() && localMO.isUse() &&
- localMO.isKill() && feederReg == localMO.getReg()) {
- // We found that there is kill of a use register
- // Set up a kill flag on the register
- localMO.setIsKill(false);
- MO.setIsKill();
- updatedIsKill = true;
- break;
- }
+ auto TransferKills = [jmpPos,cmpPos] (MachineInstr &MI) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned UseR = MO.getReg();
+ for (auto I = std::next(MI.getIterator()); I != jmpPos; ++I) {
+ if (I == cmpPos)
+ continue;
+ for (MachineOperand &Op : I->operands()) {
+ if (!Op.isReg() || !Op.isUse() || !Op.isKill())
+ continue;
+ if (Op.getReg() != UseR)
+ continue;
+            // Found a kill of the use register between here and the jump;
+            // transfer the kill flag to this operand.
+ Op.setIsKill(false);
+ MO.setIsKill(true);
+ return;
}
- if (updatedIsKill) break;
}
}
- if (updatedIsKill) break;
- }
+ };
+
+ TransferKills(*feederPos);
+ TransferKills(*cmpPos);
+ bool MO1IsKill = cmpPos->killsRegister(cmpReg1, QRI);
+ bool MO2IsKill = isSecondOpReg && cmpPos->killsRegister(cmpOp2, QRI);
MBB->splice(jmpPos, MI.getParent(), MI);
MBB->splice(jmpPos, MI.getParent(), cmpInstr);
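The rewritten loop above is folded into a TransferKills lambda that moves a downstream kill flag onto the corresponding use operand of the feeder or compare before the two instructions are spliced next to the jump, so the final MO1IsKill/MO2IsKill values become simple queries on the compare. A simplified sketch of the question killsRegister answers (the real MachineInstr API also considers register aliases via the TRI argument):

    #include "llvm/CodeGen/MachineInstr.h"

    // Simplified: true if MI has a use operand of Reg carrying a kill flag.
    static bool killsRegisterSimple(const llvm::MachineInstr &MI, unsigned Reg) {
      for (const llvm::MachineOperand &MO : MI.operands())
        if (MO.isReg() && MO.isUse() && MO.isKill() && MO.getReg() == Reg)
          return true;
      return false;
    }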
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
index 9833105..f80e0ef 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td
@@ -1,298 +1,33 @@
-//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===//
+//===--- HexagonOperands.td -----------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
-// This file is distributed under the University of Illnois Open Source
+// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; }
-def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; }
-def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; }
-def s8_0Imm64Operand : AsmOperandClass { let Name = "s8_0Imm64"; }
-def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; }
-def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; }
-def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; }
-def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; }
-def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; }
-def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; }
-def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; }
-def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; }
-def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; }
-def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; }
-def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; }
-def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; }
-def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; }
-def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; }
-def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; }
-def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; }
-def u9_0ImmOperand : AsmOperandClass { let Name = "u9_0Imm"; }
-def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; }
-def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; }
-def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; }
-def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; }
-def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; }
-def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; }
-def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; }
-def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; }
-def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; }
-def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; }
-def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; }
-def n8_0ImmOperand : AsmOperandClass { let Name = "n8_0Imm"; }
-// Immediate operands.
-
-let OperandType = "OPERAND_IMMEDIATE",
- DecoderMethod = "unsignedImmDecoder" in {
- def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand;
- let DecoderMethod = "s32_0ImmDecoder"; }
- def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
- def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand;
- let DecoderMethod = "s8_0ImmDecoder"; }
- def s8_0Imm64 : Operand<i64> { let ParserMatchClass = s8_0Imm64Operand;
- let DecoderMethod = "s8_0ImmDecoder"; }
- def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand;
- let DecoderMethod = "s6_0ImmDecoder"; }
- def s6_3Imm : Operand<i32>;
- def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand;
- let DecoderMethod = "s4_0ImmDecoder"; }
- def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand;
- let DecoderMethod = "s4_1ImmDecoder"; }
- def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand;
- let DecoderMethod = "s4_2ImmDecoder"; }
- def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand;
- let DecoderMethod = "s4_3ImmDecoder"; }
- def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
- def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; }
- def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; }
- def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; }
- def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; }
- def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; }
- def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; }
- def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; }
- def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; }
- def u9_0Imm : Operand<i32> { let ParserMatchClass = u9_0ImmOperand; }
- def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; }
- def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; }
- def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; }
- def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; }
- def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; }
- def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; }
- def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; }
- def u5_1Imm : Operand<i32>;
- def u5_2Imm : Operand<i32>;
- def u5_3Imm : Operand<i32>;
- def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; }
- def u4_1Imm : Operand<i32>;
- def u4_2Imm : Operand<i32>;
- def u4_3Imm : Operand<i32>;
- def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; }
- def u3_1Imm : Operand<i32>;
- def u3_2Imm : Operand<i32>;
- def u3_3Imm : Operand<i32>;
- def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; }
- def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; }
- def n8_0Imm : Operand<i32> { let ParserMatchClass = n8_0ImmOperand; }
-}
-
-let OperandType = "OPERAND_IMMEDIATE" in {
- def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand;
- let PrintMethod = "prints4_6ImmOperand";
- let DecoderMethod = "s4_6ImmDecoder";}
- def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand";
- let DecoderMethod = "s4_6ImmDecoder";}
- def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand;
- let PrintMethod = "prints3_6ImmOperand";
- let DecoderMethod = "s3_6ImmDecoder";}
- def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand";
- let DecoderMethod = "s3_6ImmDecoder";}
-}
-def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
-def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
-
-//
-// Immediate predicates
-//
-def s32_0ImmPred : PatLeaf<(i32 imm), [{
+def f32ImmOperand : AsmOperandClass { let Name = "f32Imm"; }
+def f32Imm : Operand<f32> { let ParserMatchClass = f32ImmOperand; }
+def f64ImmOperand : AsmOperandClass { let Name = "f64Imm"; }
+def f64Imm : Operand<f64> { let ParserMatchClass = f64ImmOperand; }
+def s8_0Imm64Pred : PatLeaf<(i64 imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+def s9_0ImmOperand : AsmOperandClass { let Name = "s9_0Imm"; }
+def s9_0Imm : Operand<i32> { let ParserMatchClass = s9_0ImmOperand; }
+def s27_2ImmOperand : AsmOperandClass { let Name = "s27_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s27_2Imm : Operand<i32> { let ParserMatchClass = s27_2ImmOperand; }
+def r32_0ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isInt<32>(v);
}]>;
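r32_0ImmPred and the other *ImmPred leaves in this file are thin wrappers over LLVM's signed-range helpers. A minimal reimplementation under the same semantics (isInt<N> checks an N-bit signed range; isShiftedInt<N,S> additionally requires the value to be a multiple of 2^S):

    #include <cstdint>

    template <unsigned N> bool isIntSim(int64_t v) {
      return v >= -(INT64_C(1) << (N - 1)) && v < (INT64_C(1) << (N - 1));
    }
    template <unsigned N, unsigned S> bool isShiftedIntSim(int64_t v) {
      // A multiple of 2^S whose quotient fits in N signed bits.
      return v % (INT64_C(1) << S) == 0 && isIntSim<N>(v >> S);
    }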
-
-def s31_1ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<31,1>(v);
-}]>;
-
-def s30_2ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<30,2>(v);
-}]>;
-
-def s29_3ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<29,3>(v);
-}]>;
-
-def s10_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<10>(v);
-}]>;
-
-def s8_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<8>(v);
-}]>;
-
-def s8_0Imm64Pred : PatLeaf<(i64 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<8>(v);
-}]>;
-
-def s6_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<6>(v);
-}]>;
-
-def s4_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isInt<4>(v);
-}]>;
-
-def s4_1ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<4,1>(v);
-}]>;
-
-def s4_2ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<4,2>(v);
-}]>;
-
-def s4_3ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedInt<4,3>(v);
-}]>;
-
-def u32_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<32>(v);
-}]>;
-
-def u16_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<16>(v);
-}]>;
-
-def u11_3ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedUInt<11,3>(v);
-}]>;
-
def u9_0ImmPred : PatLeaf<(i32 imm), [{
int64_t v = (int64_t)N->getSExtValue();
return isUInt<9>(v);
}]>;
-
-def u8_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<8>(v);
-}]>;
-
-def u6_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<6>(v);
-}]>;
-
-def u6_1ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedUInt<6,1>(v);
-}]>;
-
-def u6_2ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isShiftedUInt<6,2>(v);
-}]>;
-
-def u5_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<5>(v);
-}]>;
-
-def u4_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<4>(v);
-}]>;
-
-def u3_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<3>(v);
-}]>;
-
-def u2_0ImmPred : PatLeaf<(i32 imm), [{
- int64_t v = (int64_t)N->getSExtValue();
- return isUInt<2>(v);
-}]>;
-
-// Extendable immediate operands.
-def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; }
-def s16_0ExtOperand : AsmOperandClass { let Name = "s16_0Ext"; }
-def s12_0ExtOperand : AsmOperandClass { let Name = "s12_0Ext"; }
-def s10_0ExtOperand : AsmOperandClass { let Name = "s10_0Ext"; }
-def s9_0ExtOperand : AsmOperandClass { let Name = "s9_0Ext"; }
-def s8_0ExtOperand : AsmOperandClass { let Name = "s8_0Ext"; }
-def s7_0ExtOperand : AsmOperandClass { let Name = "s7_0Ext"; }
-def s6_0ExtOperand : AsmOperandClass { let Name = "s6_0Ext"; }
-def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; }
-def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; }
-def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; }
-def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; }
-def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; }
-def u7_0ExtOperand : AsmOperandClass { let Name = "u7_0Ext"; }
-def u8_0ExtOperand : AsmOperandClass { let Name = "u8_0Ext"; }
-def u9_0ExtOperand : AsmOperandClass { let Name = "u9_0Ext"; }
-def u10_0ExtOperand : AsmOperandClass { let Name = "u10_0Ext"; }
-def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; }
-def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; }
-def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; }
-def u32_0MustExtOperand : AsmOperandClass { let Name = "u32_0MustExt"; }
-
-
-
-let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand",
- DecoderMethod = "unsignedImmDecoder" in {
- def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; }
- def s16_0Ext : Operand<i32> { let ParserMatchClass = s16_0ExtOperand;
- let DecoderMethod = "s16_0ImmDecoder"; }
- def s12_0Ext : Operand<i32> { let ParserMatchClass = s12_0ExtOperand;
- let DecoderMethod = "s12_0ImmDecoder"; }
- def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand;
- let DecoderMethod = "s11_0ImmDecoder"; }
- def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand;
- let DecoderMethod = "s11_1ImmDecoder"; }
- def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand;
- let DecoderMethod = "s11_2ImmDecoder"; }
- def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand;
- let DecoderMethod = "s11_3ImmDecoder"; }
- def s10_0Ext : Operand<i32> { let ParserMatchClass = s10_0ExtOperand;
- let DecoderMethod = "s10_0ImmDecoder"; }
- def s9_0Ext : Operand<i32> { let ParserMatchClass = s9_0ExtOperand;
- let DecoderMethod = "s9_0ImmDecoder"; }
- def s8_0Ext : Operand<i32> { let ParserMatchClass = s8_0ExtOperand;
- let DecoderMethod = "s8_0ImmDecoder"; }
- def s7_0Ext : Operand<i32> { let ParserMatchClass = s7_0ExtOperand; }
- def s6_0Ext : Operand<i32> { let ParserMatchClass = s6_0ExtOperand;
- let DecoderMethod = "s6_0ImmDecoder"; }
- def u7_0Ext : Operand<i32> { let ParserMatchClass = u7_0ExtOperand; }
- def u8_0Ext : Operand<i32> { let ParserMatchClass = u8_0ExtOperand; }
- def u9_0Ext : Operand<i32> { let ParserMatchClass = u9_0ExtOperand; }
- def u10_0Ext : Operand<i32> { let ParserMatchClass = u10_0ExtOperand; }
- def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; }
- def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; }
- def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; }
- def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; }
- def u32_0MustExt : Operand<i32> { let ParserMatchClass = u32_0MustExtOperand; }
-}
-
+def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; let RenderMethod = "addImmOperands"; }
+def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
+def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
+def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
// This complex pattern exists only to create a machine instruction operand
// of type "frame index". There doesn't seem to be a way to do that directly
@@ -305,28 +40,6 @@ def AddrFI : ComplexPattern<i32, 1, "SelectAddrFI", [frameindex], []>;
def AddrGA : ComplexPattern<i32, 1, "SelectAddrGA", [], []>;
def AddrGP : ComplexPattern<i32, 1, "SelectAddrGP", [], []>;
-// Address operands.
-
-let PrintMethod = "printGlobalOperand" in {
- def globaladdress : Operand<i32>;
- def globaladdressExt : Operand<i32>;
-}
-
-let PrintMethod = "printJumpTable" in
-def jumptablebase : Operand<i32>;
-
-def brtarget : Operand<OtherVT> {
- let DecoderMethod = "brtargetDecoder";
- let PrintMethod = "printBrtarget";
-}
-def brtargetExt : Operand<OtherVT> {
- let DecoderMethod = "brtargetDecoder";
- let PrintMethod = "printBrtarget";
-}
-def calltarget : Operand<i32> {
- let DecoderMethod = "brtargetDecoder";
- let PrintMethod = "printBrtarget";
-}
def bblabel : Operand<i32>;
def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 89db467..374ffa3 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -10,8 +10,6 @@
// load/store instructions.
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "opt-addr-mode"
-
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
@@ -35,7 +33,8 @@
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
-#include <map>
+
+#define DEBUG_TYPE "opt-addr-mode"
static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit",
cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode "
@@ -45,10 +44,8 @@ using namespace llvm;
using namespace rdf;
namespace llvm {
-
FunctionPass *createHexagonOptAddrMode();
- void initializeHexagonOptAddrModePass(PassRegistry &);
-
+ void initializeHexagonOptAddrModePass(PassRegistry&);
} // end namespace llvm
namespace {
@@ -59,10 +56,7 @@ public:
HexagonOptAddrMode()
: MachineFunctionPass(ID), HII(nullptr), MDT(nullptr), DFG(nullptr),
- LV(nullptr) {
- PassRegistry &R = *PassRegistry::getPassRegistry();
- initializeHexagonOptAddrModePass(R);
- }
+ LV(nullptr) {}
StringRef getPassName() const override {
return "Optimize addressing mode of load/store";
@@ -84,7 +78,6 @@ private:
MachineDominatorTree *MDT;
DataFlowGraph *DFG;
DataFlowGraph::DefStackMap DefM;
- std::map<RegisterRef, std::map<NodeId, NodeId>> RDefMap;
Liveness *LV;
MISetType Deleted;
@@ -99,8 +92,6 @@ private:
void getAllRealUses(NodeAddr<StmtNode *> SN, NodeList &UNodeList);
bool allValidCandidates(NodeAddr<StmtNode *> SA, NodeList &UNodeList);
short getBaseWithLongOffset(const MachineInstr &MI) const;
- void updateMap(NodeAddr<InstrNode *> IA);
- bool constructDefMap(MachineBasicBlock *B);
bool changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
unsigned ImmOpNum);
bool changeLoad(MachineInstr *OldMI, MachineOperand ImmOp, unsigned ImmOpNum);
@@ -112,11 +103,11 @@ private:
char HexagonOptAddrMode::ID = 0;
-INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "opt-amode",
+INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "amode-opt",
"Optimize addressing mode", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
-INITIALIZE_PASS_END(HexagonOptAddrMode, "opt-amode", "Optimize addressing mode",
+INITIALIZE_PASS_END(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode",
false, false)
bool HexagonOptAddrMode::hasRepForm(MachineInstr &MI, unsigned TfrDefR) {
@@ -173,8 +164,11 @@ bool HexagonOptAddrMode::canRemoveAddasl(NodeAddr<StmtNode *> AddAslSN,
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UA = *I;
NodeAddr<InstrNode *> IA = UA.Addr->getOwner(*DFG);
- if ((UA.Addr->getFlags() & NodeAttrs::PhiRef) ||
- RDefMap[OffsetRR][IA.Id] != OffsetRegRD)
+ if (UA.Addr->getFlags() & NodeAttrs::PhiRef)
+ return false;
+ NodeAddr<RefNode*> AA = LV->getNearestAliasedRef(OffsetRR, IA);
+ if ((DFG->IsDef(AA) && AA.Id != OffsetRegRD) ||
+ AA.Addr->getReachingDef() != OffsetRegRD)
return false;
MachineInstr &UseMI = *NodeAddr<StmtNode *>(IA).Addr->getCode();
@@ -208,7 +202,16 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
NodeAddr<UseNode *> UN = *I;
RegisterRef UR = UN.Addr->getRegRef(*DFG);
NodeSet Visited, Defs;
- const auto &ReachingDefs = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
+ const auto &P = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
+ if (!P.second) {
+ DEBUG({
+ dbgs() << "*** Unable to collect all reaching defs for use ***\n"
+ << PrintNode<UseNode*>(UN, *DFG) << '\n'
+ << "The program's complexity may exceed the limits.\n";
+ });
+ return false;
+ }
+ const auto &ReachingDefs = P.first;
if (ReachingDefs.size() > 1) {
DEBUG({
dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
@@ -230,7 +233,7 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DA, *DFG)
<< "\n");
- RegisterRef DR = DFG->normalizeRef(DA.Addr->getRegRef(*DFG));
+ RegisterRef DR = DFG->getPRI().normalize(DA.Addr->getRegRef(*DFG));
auto UseSet = LV->getAllReachedUses(DR, DA);
@@ -250,7 +253,7 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
<< Print<Liveness::RefMap>(phiUse, *DFG) << "\n");
if (!phiUse.empty()) {
for (auto I : phiUse) {
- if (DR.Reg != I.first)
+ if (!DFG->getPRI().alias(RegisterRef(I.first), DR))
continue;
auto phiUseSet = I.second;
for (auto phiUI : phiUseSet) {
@@ -333,17 +336,17 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(0));
- MIB.addOperand(OldMI->getOperand(2));
- MIB.addOperand(OldMI->getOperand(3));
- MIB.addOperand(ImmOp);
+ MIB.add(OldMI->getOperand(0));
+ MIB.add(OldMI->getOperand(2));
+ MIB.add(OldMI->getOperand(3));
+ MIB.add(ImmOp);
OpStart = 4;
Changed = true;
} else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
short NewOpCode = HII->getAbsoluteForm(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode))
- .addOperand(OldMI->getOperand(0));
+ .add(OldMI->getOperand(0));
const GlobalValue *GV = ImmOp.getGlobal();
int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(2).getImm();
@@ -359,9 +362,9 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->xformRegToImmOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(0));
- MIB.addOperand(OldMI->getOperand(1));
- MIB.addOperand(ImmOp);
+ MIB.add(OldMI->getOperand(0));
+ MIB.add(OldMI->getOperand(1));
+ MIB.add(ImmOp);
OpStart = 4;
Changed = true;
DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
@@ -370,7 +373,7 @@ bool HexagonOptAddrMode::changeLoad(MachineInstr *OldMI, MachineOperand ImmOp,
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
- MIB.addOperand(OldMI->getOperand(i));
+ MIB.add(OldMI->getOperand(i));
return Changed;
}
@@ -390,10 +393,10 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->getBaseWithLongOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(1));
- MIB.addOperand(OldMI->getOperand(2));
- MIB.addOperand(ImmOp);
- MIB.addOperand(OldMI->getOperand(3));
+ MIB.add(OldMI->getOperand(1));
+ MIB.add(OldMI->getOperand(2));
+ MIB.add(ImmOp);
+ MIB.add(OldMI->getOperand(3));
OpStart = 4;
} else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
short NewOpCode = HII->getAbsoluteForm(*OldMI);
@@ -402,7 +405,7 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
const GlobalValue *GV = ImmOp.getGlobal();
int64_t Offset = ImmOp.getOffset() + OldMI->getOperand(1).getImm();
MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
- MIB.addOperand(OldMI->getOperand(2));
+ MIB.add(OldMI->getOperand(2));
OpStart = 3;
}
Changed = true;
@@ -412,9 +415,9 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
short NewOpCode = HII->xformRegToImmOffset(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
MIB = BuildMI(*BB, InsertPt, OldMI->getDebugLoc(), HII->get(NewOpCode));
- MIB.addOperand(OldMI->getOperand(0));
- MIB.addOperand(ImmOp);
- MIB.addOperand(OldMI->getOperand(1));
+ MIB.add(OldMI->getOperand(0));
+ MIB.add(ImmOp);
+ MIB.add(OldMI->getOperand(1));
OpStart = 2;
Changed = true;
DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
@@ -422,7 +425,7 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
}
if (Changed)
for (unsigned i = OpStart; i < OpEnd; ++i)
- MIB.addOperand(OldMI->getOperand(i));
+ MIB.add(OldMI->getOperand(i));
return Changed;
}
@@ -473,26 +476,26 @@ bool HexagonOptAddrMode::changeAddAsl(NodeAddr<UseNode *> AddAslUN,
BuildMI(*BB, InsertPt, UseMI->getDebugLoc(), HII->get(NewOpCode));
// change mem(Rs + #) -> mem(Rt << # + ##)
if (UseMID.mayLoad()) {
- MIB.addOperand(UseMI->getOperand(0));
- MIB.addOperand(AddAslMI->getOperand(2));
- MIB.addOperand(AddAslMI->getOperand(3));
+ MIB.add(UseMI->getOperand(0));
+ MIB.add(AddAslMI->getOperand(2));
+ MIB.add(AddAslMI->getOperand(3));
const GlobalValue *GV = ImmOp.getGlobal();
- MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm(),
+ MIB.addGlobalAddress(GV, UseMI->getOperand(2).getImm()+ImmOp.getOffset(),
ImmOp.getTargetFlags());
OpStart = 3;
} else if (UseMID.mayStore()) {
- MIB.addOperand(AddAslMI->getOperand(2));
- MIB.addOperand(AddAslMI->getOperand(3));
+ MIB.add(AddAslMI->getOperand(2));
+ MIB.add(AddAslMI->getOperand(3));
const GlobalValue *GV = ImmOp.getGlobal();
- MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm(),
+ MIB.addGlobalAddress(GV, UseMI->getOperand(1).getImm()+ImmOp.getOffset(),
ImmOp.getTargetFlags());
- MIB.addOperand(UseMI->getOperand(2));
+ MIB.add(UseMI->getOperand(2));
OpStart = 3;
} else
llvm_unreachable("Unhandled instruction");
for (unsigned i = OpStart; i < OpEnd; ++i)
- MIB.addOperand(UseMI->getOperand(i));
+ MIB.add(UseMI->getOperand(i));
Deleted.insert(UseMI);
}
@@ -532,9 +535,9 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
!MI->getOperand(1).isGlobal())
continue;
- DEBUG(dbgs() << "[Analyzing A2_tfrsi]: " << *MI << "\n");
- DEBUG(dbgs() << "\t[InstrNode]: " << Print<NodeAddr<InstrNode *>>(IA, *DFG)
- << "\n");
+ DEBUG(dbgs() << "[Analyzing " << HII->getName(MI->getOpcode()) << "]: "
+ << *MI << "\n\t[InstrNode]: "
+ << Print<NodeAddr<InstrNode *>>(IA, *DFG) << '\n');
NodeList UNodeList;
getAllRealUses(SA, UNodeList);
@@ -588,47 +591,10 @@ bool HexagonOptAddrMode::processBlock(NodeAddr<BlockNode *> BA) {
return Changed;
}
-void HexagonOptAddrMode::updateMap(NodeAddr<InstrNode *> IA) {
- RegisterSet RRs;
- for (NodeAddr<RefNode *> RA : IA.Addr->members(*DFG))
- RRs.insert(RA.Addr->getRegRef(*DFG));
- bool Common = false;
- for (auto &R : RDefMap) {
- if (!RRs.count(R.first))
- continue;
- Common = true;
- break;
- }
- if (!Common)
- return;
-
- for (auto &R : RDefMap) {
- auto F = DefM.find(R.first.Reg);
- if (F == DefM.end() || F->second.empty())
- continue;
- R.second[IA.Id] = F->second.top()->Id;
- }
-}
-
-bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) {
- bool Changed = false;
- auto BA = DFG->getFunc().Addr->findBlock(B, *DFG);
- DFG->markBlock(BA.Id, DefM);
-
- for (NodeAddr<InstrNode *> IA : BA.Addr->members(*DFG)) {
- updateMap(IA);
- DFG->pushDefs(IA, DefM);
- }
-
- MachineDomTreeNode *N = MDT->getNode(B);
- for (auto I : *N)
- Changed |= constructDefMap(I->getBlock());
-
- DFG->releaseBlock(BA.Id, DefM);
- return Changed;
-}
-
bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
bool Changed = false;
auto &HST = MF.getSubtarget<HexagonSubtarget>();
auto &MRI = MF.getRegInfo();
@@ -639,15 +605,15 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
const TargetOperandInfo TOI(*HII);
DataFlowGraph G(MF, *HII, TRI, *MDT, MDF, TOI);
- G.build();
+ // Need to keep dead phis because we can propagate uses of registers into
+ // nodes dominated by those would-be phis.
+ G.build(BuildOptions::KeepDeadPhis);
DFG = &G;
Liveness L(MRI, *DFG);
L.computePhiInfo();
LV = &L;
- constructDefMap(&DFG->getMF().front());
-
Deleted.clear();
NodeAddr<FuncNode *> FA = DFG->getFunc();
DEBUG(dbgs() << "==== [RefMap#]=====:\n "
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
index ad81287..804a547 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -1,13 +1,14 @@
+//==- HexagonPatterns.td - Target Description for Hexagon -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
// Pattern fragment that combines the value type and the register class
// into a single parameter.
-// The pat frags in the definitions below need to have a named register,
-// otherwise i32 will be assumed regardless of the register class. The
-// name of the register does not matter.
-def I1 : PatLeaf<(i1 PredRegs:$R)>;
-def I32 : PatLeaf<(i32 IntRegs:$R)>;
-def I64 : PatLeaf<(i64 DoubleRegs:$R)>;
-def F32 : PatLeaf<(f32 IntRegs:$R)>;
-def F64 : PatLeaf<(f64 DoubleRegs:$R)>;
// Pattern fragments to extract the low and high subregisters from a
// 64-bit value.
@@ -17,6 +18,16 @@ def HiReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_hi)>;
def IsOrAdd: PatFrag<(ops node:$Addr, node:$off),
(or node:$Addr, node:$off), [{ return isOrEquivalentToAdd(N); }]>;
+def Iss4_6 : PatLeaf<(i32 imm), [{
+ int32_t V = N->getSExtValue();
+ return isShiftedInt<4,6>(V);
+}]>;
+
+def Iss4_7 : PatLeaf<(i32 imm), [{
+ int32_t V = N->getSExtValue();
+ return isShiftedInt<4,7>(V);
+}]>;
+
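Concretely, Iss4_6 admits vector-memory offsets that are whole 64-byte vectors with a 4-bit signed count, i.e. multiples of 64 in [-512, 448]; Iss4_7 is the same for 128-byte vectors, multiples of 128 in [-1024, 896]. A quick check against the real helper:

    #include <cassert>
    #include "llvm/Support/MathExtras.h"

    int main() {
      assert(llvm::isShiftedInt<4, 6>(448));   // 7 * 64: largest positive
      assert(llvm::isShiftedInt<4, 6>(-512));  // -8 * 64: most negative
      assert(!llvm::isShiftedInt<4, 6>(512));  // 8 * 64 is out of range
      assert(!llvm::isShiftedInt<4, 6>(100));  // not a multiple of 64
    }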
def IsPow2_32 : PatLeaf<(i32 imm), [{
uint32_t V = N->getZExtValue();
return isPowerOf2_32(V);
@@ -89,6 +100,11 @@ def LogN2_64 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(Log2_64(NV), SDLoc(N), MVT::i32);
}]>;
+def ToZext64: OutPatFrag<(ops node:$Rs),
+ (i64 (A4_combineir 0, (i32 $Rs)))>;
+def ToSext64: OutPatFrag<(ops node:$Rs),
+ (i64 (A2_sxtw (i32 $Rs)))>;
+
class T_CMP_pat <InstHexagon MI, PatFrag OpNode, PatLeaf ImmPred>
: Pat<(i1 (OpNode I32:$src1, ImmPred:$src2)),
@@ -153,8 +169,12 @@ def: Pat<(sub s32_0ImmPred:$s10, IntRegs:$Rs),
def: Pat<(not I32:$src1),
(A2_subri -1, IntRegs:$src1)>;
+def TruncI64ToI32: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
def: Pat<(s32_0ImmPred:$s16), (A2_tfrsi imm:$s16)>;
-def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi imm:$s8)>;
+def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi (TruncI64ToI32 $s8))>;
def : Pat<(select I1:$Pu, s32_0ImmPred:$s8, I32:$Rs),
(C2_muxri I1:$Pu, imm:$s8, I32:$Rs)>;
@@ -274,7 +294,7 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>;
-def: Pat<(br bb:$dst), (J2_jump brtarget:$dst)>;
+def: Pat<(br bb:$dst), (J2_jump b30_2Imm:$dst)>;
def: Pat<(brcond I1:$src1, bb:$block), (J2_jumpt PredRegs:$src1, bb:$block)>;
def: Pat<(brind I32:$dst), (J2_jumpr IntRegs:$dst)>;
@@ -334,7 +354,7 @@ def: Pat<(add (mul IntRegs:$src2, u32_0ImmPred:$src3), IntRegs:$src1),
(M2_macsip IntRegs:$src1, IntRegs:$src2, imm:$src3)>;
def: Pat<(add (mul I32:$src2, I32:$src3), I32:$src1),
(M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
-def: Pat<(add (add IntRegs:$src2, u32_0ImmPred:$src3), IntRegs:$src1),
+def: Pat<(add (add IntRegs:$src2, s32_0ImmPred:$src3), IntRegs:$src1),
(M2_accii IntRegs:$src1, IntRegs:$src2, imm:$src3)>;
def: Pat<(add (add I32:$src2, I32:$src3), I32:$src1),
(M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
@@ -371,48 +391,47 @@ def: T_MType_acc_pat3 <M4_or_andn, and, or>;
def: T_MType_acc_pat3 <M4_and_andn, and, and>;
def: T_MType_acc_pat3 <M4_xor_andn, and, xor>;
+// This complex pattern is really only to detect various forms of
+// sign-extension i32->i64. The selected value will be of type i64
+// whose low word is the value being extended. The high word is
+// unspecified.
+def Usxtw : ComplexPattern<i64, 1, "DetectUseSxtw", [], []>;
+
def Aext64: PatFrag<(ops node:$Rs), (i64 (anyext node:$Rs))>;
-def Sext64: PatFrag<(ops node:$Rs), (i64 (sext node:$Rs))>;
def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>;
+def Sext64: PatLeaf<(i64 Usxtw:$Rs)>;
-// Return true if for a 32 to 64-bit sign-extended load.
-def Sext64Ld : PatLeaf<(i64 DoubleRegs:$src1), [{
- LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
- if (!LD)
- return false;
- return LD->getExtensionType() == ISD::SEXTLOAD &&
- LD->getMemoryVT().getScalarType() == MVT::i32;
-}]>;
-
-def: Pat<(mul (Aext64 I32:$src1), (Aext64 I32:$src2)),
- (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(i32 (trunc (sra (mul Sext64:$Rs, Sext64:$Rt), (i32 32)))),
+ (M2_mpy_up (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
+def: Pat<(i32 (trunc (srl (mul Sext64:$Rs, Sext64:$Rt), (i32 32)))),
+ (M2_mpy_up (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
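These two M2_mpy_up patterns recognize the upper word of a widening 32x32 signed multiply, whether the DAG truncates an arithmetic or a logical 32-bit shift of the 64-bit product; both reduce to:

    #include <cstdint>

    // High 32 bits of the signed 64-bit product of two 32-bit values,
    // the computation matched by the Sext64/mul/trunc patterns above.
    int32_t mpy_up(int32_t a, int32_t b) {
      return (int32_t)(((int64_t)a * (int64_t)b) >> 32);
    }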
-def: Pat<(mul (Sext64 I32:$src1), (Sext64 I32:$src2)),
- (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>;
+def: Pat<(mul (Aext64 I32:$Rs), (Aext64 I32:$Rt)),
+ (M2_dpmpyuu_s0 I32:$Rs, I32:$Rt)>;
-def: Pat<(mul Sext64Ld:$src1, Sext64Ld:$src2),
- (M2_dpmpyss_s0 (LoReg DoubleRegs:$src1), (LoReg DoubleRegs:$src2))>;
+def: Pat<(mul Sext64:$Rs, Sext64:$Rt),
+ (M2_dpmpyss_s0 (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
// Multiply and accumulate, use full result.
// Rxx[+-]=mpy(Rs,Rt)
-def: Pat<(add I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
- (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)),
+ (M2_dpmpyss_acc_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
-def: Pat<(sub I64:$src1, (mul (Sext64 I32:$src2), (Sext64 I32:$src3))),
- (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(sub I64:$Rx, (mul Sext64:$Rs, Sext64:$Rt)),
+ (M2_dpmpyss_nac_s0 I64:$Rx, (LoReg Sext64:$Rs), (LoReg Sext64:$Rt))>;
-def: Pat<(add I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
- (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))),
+ (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
-def: Pat<(add I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
- (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(add I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))),
+ (M2_dpmpyuu_acc_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
-def: Pat<(sub I64:$src1, (mul (Aext64 I32:$src2), (Aext64 I32:$src3))),
- (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(sub I64:$Rx, (mul (Aext64 I32:$Rs), (Aext64 I32:$Rt))),
+ (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
-def: Pat<(sub I64:$src1, (mul (Zext64 I32:$src2), (Zext64 I32:$src3))),
- (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>;
+def: Pat<(sub I64:$Rx, (mul (Zext64 I32:$Rs), (Zext64 I32:$Rt))),
+ (M2_dpmpyuu_nac_s0 I64:$Rx, I32:$Rs, I32:$Rt)>;
class Storepi_pat<PatFrag Store, PatFrag Value, PatFrag Offset,
InstHexagon MI>
@@ -534,7 +553,8 @@ def: Storexm_simple_pat<truncstorei8, I64, LoReg, S2_storerb_io>;
def: Storexm_simple_pat<truncstorei16, I64, LoReg, S2_storerh_io>;
def: Storexm_simple_pat<truncstorei32, I64, LoReg, S2_storeri_io>;
-def: Pat <(Sext64 I32:$src), (A2_sxtw I32:$src)>;
+def: Pat <(i64 (sext I32:$src)), (A2_sxtw I32:$src)>;
+def: Pat <(i64 (sext_inreg I64:$src, i32)), (A2_sxtw (LoReg I64:$src))>;
def: Pat<(select (i1 (setlt I32:$src, 0)), (sub 0, I32:$src), I32:$src),
(A2_abs IntRegs:$src)>;
@@ -668,6 +688,8 @@ def I32toI1: OutPatFrag<(ops node:$Rs),
defm: Storexm_pat<store, I1, s32_0ImmPred, I1toI32, S2_storerb_io>;
def: Storexm_simple_pat<store, I1, I1toI32, S2_storerb_io>;
+def: Pat<(sra (add (sra I64:$src, u6_0ImmPred:$u6), 1), (i32 1)),
+ (S2_asr_i_p_rnd DoubleRegs:$src, imm:$u6)>, Requires<[HasV5T]>;
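This new pattern catches the round-half-up shift idiom and selects the :rnd form instead of the plain arithmetic shift matched just below; in scalar terms the matched DAG is:

    #include <cstdint>

    // ((x >> n) + 1) >> 1: arithmetic shift with round-half-up, the idiom
    // the S2_asr_i_p_rnd pattern above recognizes (0 <= n <= 63).
    int64_t asr_rnd(int64_t x, unsigned n) {
      return ((x >> n) + 1) >> 1;
    }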
def: Pat<(sra I64:$src, u6_0ImmPred:$u6),
(S2_asr_i_p DoubleRegs:$src, imm:$u6)>;
def: Pat<(srl I64:$src, u6_0ImmPred:$u6),
@@ -695,15 +717,16 @@ def HexagonCONST32 : SDNode<"HexagonISD::CONST32", SDTHexagonCONST32>;
def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>;
// Map TLS addresses to A2_tfrsi.
-def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s16_0Ext:$addr)>;
-def: Pat<(HexagonCONST32 bbl:$label), (A2_tfrsi s16_0Ext:$label)>;
+def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s32_0Imm:$addr)>;
+def: Pat<(HexagonCONST32 bbl:$label), (A2_tfrsi s32_0Imm:$label)>;
def: Pat<(i64 imm:$v), (CONST64 imm:$v)>;
def: Pat<(i1 0), (PS_false)>;
def: Pat<(i1 1), (PS_true)>;
// Pseudo instructions.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -721,8 +744,8 @@ def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def: Pat<(callseq_start timm:$amt),
- (ADJCALLSTACKDOWN imm:$amt)>;
+def: Pat<(callseq_start timm:$amt, timm:$amt2),
+ (ADJCALLSTACKDOWN imm:$amt, imm:$amt2)>;
def: Pat<(callseq_end timm:$amt1, timm:$amt2),
(ADJCALLSTACKUP imm:$amt1, imm:$amt2)>;
@@ -779,27 +802,19 @@ def: Pat<(i64 (sext_inreg I64:$src1, i16)),
def: Pat<(i64 (sext_inreg I64:$src1, i8)),
(A2_sxtw (A2_sxtb (LoReg DoubleRegs:$src1)))>;
-// We want to prevent emitting pnot's as much as possible.
-// Map brcond with an unsupported setcc to a J2_jumpf.
-def : Pat <(brcond (i1 (setne I32:$src1, I32:$src2)),
- bb:$offset),
- (J2_jumpf (C2_cmpeq I32:$src1, I32:$src2),
- bb:$offset)>;
-
-def : Pat <(brcond (i1 (setne I32:$src1, s10_0ImmPred:$src2)),
- bb:$offset),
- (J2_jumpf (C2_cmpeqi I32:$src1, s10_0ImmPred:$src2), bb:$offset)>;
-
-def: Pat<(brcond (i1 (setne I1:$src1, (i1 -1))), bb:$offset),
- (J2_jumpf PredRegs:$src1, bb:$offset)>;
-
-def: Pat<(brcond (i1 (setne I1:$src1, (i1 0))), bb:$offset),
- (J2_jumpt PredRegs:$src1, bb:$offset)>;
+def: Pat<(brcond (i1 (setne I32:$Rs, I32:$Rt)), bb:$offset),
+ (J2_jumpf (C2_cmpeq I32:$Rs, I32:$Rt), bb:$offset)>;
+def: Pat<(brcond (i1 (setne I32:$Rs, s10_0ImmPred:$s10)), bb:$offset),
+ (J2_jumpf (C2_cmpeqi I32:$Rs, imm:$s10), bb:$offset)>;
+def: Pat<(brcond (i1 (setne I1:$Pu, (i1 -1))), bb:$offset),
+ (J2_jumpf PredRegs:$Pu, bb:$offset)>;
+def: Pat<(brcond (i1 (setne I1:$Pu, (i1 0))), bb:$offset),
+ (J2_jumpt PredRegs:$Pu, bb:$offset)>;
// cmp.lt(Rs, Imm) -> !cmp.ge(Rs, Imm) -> !cmp.gt(Rs, Imm-1)
-def: Pat<(brcond (i1 (setlt I32:$src1, s8_0ImmPred:$src2)), bb:$offset),
- (J2_jumpf (C2_cmpgti IntRegs:$src1, (SDEC1 s8_0ImmPred:$src2)),
- bb:$offset)>;
+def: Pat<(brcond (i1 (setlt I32:$Rs, s8_0ImmPred:$s8)), bb:$offset),
+ (J2_jumpf (C2_cmpgti IntRegs:$Rs, (SDEC1 imm:$s8)), bb:$offset)>;
+
// Map from a 64-bit select to an emulated 64-bit mux.
// Hexagon does not support 64-bit MUXes; so emulate with combines.
@@ -853,15 +868,13 @@ def: Pat<(i1 (setne I1:$src1, I1:$src2)),
def: Pat<(i1 (setne I64:$src1, I64:$src2)),
(C2_not (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2))>;
-// Map cmpge(Rs, Rt) -> !cmpgt(Rs, Rt).
-// rs >= rt -> !(rt > rs).
-def : Pat <(i1 (setge I32:$src1, I32:$src2)),
- (i1 (C2_not (i1 (C2_cmpgt I32:$src2, I32:$src1))))>;
+// rs >= rt -> rt <= rs
+def: Pat<(i1 (setge I32:$Rs, I32:$Rt)),
+ (C4_cmplte I32:$Rt, I32:$Rs)>;
-// cmpge(Rs, Imm) -> cmpgt(Rs, Imm-1)
let AddedComplexity = 30 in
-def: Pat<(i1 (setge I32:$src1, s32_0ImmPred:$src2)),
- (C2_cmpgti IntRegs:$src1, (SDEC1 s32_0ImmPred:$src2))>;
+def: Pat<(i1 (setge I32:$Rs, s32_0ImmPred:$s10)),
+ (C2_cmpgti IntRegs:$Rs, (SDEC1 imm:$s10))>;
// Map cmpge(Rss, Rtt) -> !cmpgt(Rtt, Rss).
// rss >= rtt -> !(rtt > rss).
@@ -898,26 +911,35 @@ def: Pat<(i1 (setule I64:$src1, I64:$src2)),
(C2_not (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>;
// Sign extends.
-// i1 -> i32
-def: Pat<(i32 (sext I1:$src1)),
- (C2_muxii PredRegs:$src1, -1, 0)>;
+// sext i1->i32
+def: Pat<(i32 (sext I1:$Pu)),
+ (C2_muxii I1:$Pu, -1, 0)>;
-// i1 -> i64
-def: Pat<(i64 (sext I1:$src1)),
- (A2_combinew (A2_tfrsi -1), (C2_muxii PredRegs:$src1, -1, 0))>;
+// sext i1->i64
+def: Pat<(i64 (sext I1:$Pu)),
+ (A2_combinew (C2_muxii PredRegs:$Pu, -1, 0),
+ (C2_muxii PredRegs:$Pu, -1, 0))>;
// Zero extends.
-// i1 -> i32
-def: Pat<(i32 (zext I1:$src1)),
- (C2_muxii PredRegs:$src1, 1, 0)>;
+// zext i1->i32
+def: Pat<(i32 (zext I1:$Pu)),
+ (C2_muxii PredRegs:$Pu, 1, 0)>;
+
+// zext i1->i64
+def: Pat<(i64 (zext I1:$Pu)),
+ (ToZext64 (C2_muxii PredRegs:$Pu, 1, 0))>;
+
+// zext i32->i64
+def: Pat<(Zext64 I32:$Rs),
+ (ToZext64 IntRegs:$Rs)>;
// Map from Rd = Pd to Rd = mux(Pd, #1, #0)
-def: Pat<(i32 (anyext I1:$src1)),
- (C2_muxii PredRegs:$src1, 1, 0)>;
+def: Pat<(i32 (anyext I1:$Pu)),
+ (C2_muxii PredRegs:$Pu, 1, 0)>;
-// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
-def: Pat<(i64 (anyext I1:$src1)),
- (A2_sxtw (C2_muxii PredRegs:$src1, 1, 0))>;
+// Map from Rss = Pd to Rdd = combine(#0, (mux(Pd, #1, #0)))
+def: Pat<(i64 (anyext I1:$Pu)),
+ (ToZext64 (C2_muxii PredRegs:$Pu, 1, 0))>;
// Clear the sign bit in a 64-bit register.
def ClearSign : OutPatFrag<(ops node:$Rss),
@@ -1138,8 +1160,8 @@ multiclass MinMax_pats_p<PatFrag Op, InstHexagon Inst, InstHexagon SwapInst> {
defm: T_MinMax_pats<Op, I64, Inst, SwapInst>;
}
-def: Pat<(add (Sext64 I32:$Rs), I64:$Rt),
- (A2_addsp IntRegs:$Rs, DoubleRegs:$Rt)>;
+def: Pat<(add Sext64:$Rs, I64:$Rt),
+ (A2_addsp (LoReg Sext64:$Rs), DoubleRegs:$Rt)>;
let AddedComplexity = 200 in {
defm: MinMax_pats_p<setge, A2_maxp, A2_minp>;
@@ -1244,11 +1266,6 @@ def: Pat<(HexagonCOMBINE s32_0ImmPred:$s8, s8_0ImmPred:$S8),
}
-def ToZext64: OutPatFrag<(ops node:$Rs),
- (i64 (A4_combineir 0, (i32 $Rs)))>;
-def ToSext64: OutPatFrag<(ops node:$Rs),
- (i64 (A2_sxtw (i32 $Rs)))>;
-
// Patterns to generate indexed loads with different forms of the address:
// - frameindex,
// - base + offset,
@@ -1349,14 +1366,6 @@ let AddedComplexity = 20 in {
def: Loadxs_simple_pat<load, i64, L4_loadrd_rr>;
}
-// zext i1->i64
-def: Pat<(i64 (zext I1:$src1)),
- (ToZext64 (C2_muxii PredRegs:$src1, 1, 0))>;
-
-// zext i32->i64
-def: Pat<(Zext64 I32:$src1),
- (ToZext64 IntRegs:$src1)>;
-
let AddedComplexity = 40 in
multiclass T_StoreAbsReg_Pats <InstHexagon MI, RegisterClass RC, ValueType VT,
PatFrag stOp> {
@@ -1466,16 +1475,22 @@ def i32in8ImmPred: PatLeaf<(i32 imm), [{
return v == (int64_t)(int8_t)v;
}]>;
+class SmallStackStore<PatFrag Store>
+ : PatFrag<(ops node:$Val, node:$Addr), (Store node:$Val, node:$Addr), [{
+ return isSmallStackStore(cast<StoreSDNode>(N));
+}]>;
let AddedComplexity = 40 in {
// Even though the offset is not extendable in the store-immediate, we
// can still generate the fi# in the base address. If the final offset
// is not valid for the instruction, we will replace it with a scratch
// register.
-// def: Storexm_fi_pat <truncstorei8, s32_0ImmPred, ToImmByte, S4_storeirb_io>;
-// def: Storexm_fi_pat <truncstorei16, i16in8ImmPred, ToImmHalf,
-// S4_storeirh_io>;
-// def: Storexm_fi_pat <store, i32in8ImmPred, ToImmWord, S4_storeiri_io>;
+ def: Storexm_fi_pat <SmallStackStore<truncstorei8>, s32_0ImmPred,
+ ToImmByte, S4_storeirb_io>;
+ def: Storexm_fi_pat <SmallStackStore<truncstorei16>, i16in8ImmPred,
+ ToImmHalf, S4_storeirh_io>;
+ def: Storexm_fi_pat <SmallStackStore<store>, i32in8ImmPred,
+ ToImmWord, S4_storeiri_io>;
// defm: Storexm_fi_add_pat <truncstorei8, s32_0ImmPred, u6_0ImmPred, ToImmByte,
// S4_storeirb_io>;
@@ -1587,6 +1602,15 @@ def: Pat<(i64 (cttz I64:$Rss)), (ToZext64 (S2_ct0p I64:$Rss))>;
def: Pat<(i64 (ctlz (not I64:$Rss))), (ToZext64 (S2_cl1p I64:$Rss))>;
def: Pat<(i64 (cttz (not I64:$Rss))), (ToZext64 (S2_ct1p I64:$Rss))>;
+def: Pat<(i64 (ctpop I64:$Rss)), (ToZext64 (S5_popcountp I64:$Rss))>;
+def: Pat<(i32 (ctpop I32:$Rs)), (S5_popcountp (A4_combineir 0, I32:$Rs))>;
+
+def: Pat<(bitreverse I32:$Rs), (S2_brev I32:$Rs)>;
+def: Pat<(bitreverse I64:$Rss), (S2_brevp I64:$Rss)>;
+
+def: Pat<(bswap I32:$Rs), (A2_swiz I32:$Rs)>;
+def: Pat<(bswap I64:$Rss), (A2_combinew (A2_swiz (LoReg $Rss)),
+ (A2_swiz (HiReg $Rss)))>;
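The ctpop and bswap lowerings above lean on the wider scalar unit: a 32-bit popcount is a 64-bit popcount of a zero-extended value, and a 64-bit byte swap is a 32-bit swizzle of each half with the halves exchanged (A2_combinew puts its first operand in the high word). Equivalently, assuming the usual compiler builtins:

    #include <cstdint>

    uint64_t bswap64(uint64_t x) {
      uint32_t lo = (uint32_t)x, hi = (uint32_t)(x >> 32);
      // Swizzle each half, then exchange the halves.
      return ((uint64_t)__builtin_bswap32(lo) << 32) | __builtin_bswap32(hi);
    }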
let AddedComplexity = 20 in { // Complexity greater than cmp reg-imm.
def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
@@ -1622,9 +1646,14 @@ def: Pat<(i1 (setne (and I32:$Rs, I32:$Rt), I32:$Rt)),
def: Pat<(add (mul I32:$Rs, u6_0ImmPred:$U6), u32_0ImmPred:$u6),
(M4_mpyri_addi imm:$u6, IntRegs:$Rs, imm:$U6)>;
+def: Pat<(add (mul I32:$Rs, u6_0ImmPred:$U6),
+ (HexagonCONST32 tglobaladdr:$global)),
+ (M4_mpyri_addi tglobaladdr:$global, IntRegs:$Rs, imm:$U6)>;
def: Pat<(add (mul I32:$Rs, I32:$Rt), u32_0ImmPred:$u6),
(M4_mpyrr_addi imm:$u6, IntRegs:$Rs, IntRegs:$Rt)>;
-
+def: Pat<(add (mul I32:$Rs, I32:$Rt),
+ (HexagonCONST32 tglobaladdr:$global)),
+ (M4_mpyrr_addi tglobaladdr:$global, IntRegs:$Rs, IntRegs:$Rt)>;
def: Pat<(add I32:$src1, (mul I32:$src3, u6_2ImmPred:$src2)),
(M4_mpyri_addr_u2 IntRegs:$src1, imm:$src2, IntRegs:$src3)>;
def: Pat<(add I32:$src1, (mul I32:$src3, u32_0ImmPred:$src2)),
@@ -2117,6 +2146,11 @@ let AddedComplexity = 30 in {
def: Storea_pat<truncstorei8, I32, u32_0ImmPred, PS_storerbabs>;
def: Storea_pat<truncstorei16, I32, u32_0ImmPred, PS_storerhabs>;
def: Storea_pat<store, I32, u32_0ImmPred, PS_storeriabs>;
+ def: Storea_pat<store, I64, u32_0ImmPred, PS_storerdabs>;
+
+ def: Stoream_pat<truncstorei8, I64, u32_0ImmPred, LoReg, PS_storerbabs>;
+ def: Stoream_pat<truncstorei16, I64, u32_0ImmPred, LoReg, PS_storerhabs>;
+ def: Stoream_pat<truncstorei32, I64, u32_0ImmPred, LoReg, PS_storeriabs>;
}
let AddedComplexity = 30 in {
@@ -2125,6 +2159,19 @@ let AddedComplexity = 30 in {
def: Loada_pat<zextloadi8, i32, u32_0ImmPred, PS_loadrubabs>;
def: Loada_pat<sextloadi16, i32, u32_0ImmPred, PS_loadrhabs>;
def: Loada_pat<zextloadi16, i32, u32_0ImmPred, PS_loadruhabs>;
+ def: Loada_pat<load, i64, u32_0ImmPred, PS_loadrdabs>;
+
+ def: Loadam_pat<extloadi8, i64, u32_0ImmPred, ToZext64, PS_loadrubabs>;
+ def: Loadam_pat<sextloadi8, i64, u32_0ImmPred, ToSext64, PS_loadrbabs>;
+ def: Loadam_pat<zextloadi8, i64, u32_0ImmPred, ToZext64, PS_loadrubabs>;
+
+ def: Loadam_pat<extloadi16, i64, u32_0ImmPred, ToZext64, PS_loadruhabs>;
+ def: Loadam_pat<sextloadi16, i64, u32_0ImmPred, ToSext64, PS_loadrhabs>;
+ def: Loadam_pat<zextloadi16, i64, u32_0ImmPred, ToZext64, PS_loadruhabs>;
+
+ def: Loadam_pat<extloadi32, i64, u32_0ImmPred, ToZext64, PS_loadriabs>;
+ def: Loadam_pat<sextloadi32, i64, u32_0ImmPred, ToSext64, PS_loadriabs>;
+ def: Loadam_pat<zextloadi32, i64, u32_0ImmPred, ToZext64, PS_loadriabs>;
}
// Indexed store word - global address.
@@ -2203,6 +2250,12 @@ def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, PS_storerhabs>;
def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, PS_storeriabs>;
def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, PS_storerdabs>;
+// Prefer this pattern to S2_asl_i_p_or for the special case of joining
+// two 32-bit words into a 64-bit word.
+let AddedComplexity = 200 in
+def: Pat<(or (shl (Aext64 I32:$a), (i32 32)), (Zext64 I32:$b)),
+ (A2_combinew I32:$a, I32:$b)>;
+
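The dedicated combinew pattern above exists because or(shl(anyext a, 32), zext b) is exactly a register-pair formation, and one A2_combinew is cheaper than shifting into a pair with S2_asl_i_p_or. In scalar form:

    #include <cstdint>

    // (a << 32) | zext(b): joining two 32-bit words into one 64-bit word,
    // which is what A2_combinew does in a single instruction.
    uint64_t combinew(uint32_t a, uint32_t b) {
      return ((uint64_t)a << 32) | b;
    }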
def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)),
(i64 (zext (i32 (and I32:$a, (i32 65535)))))),
(shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))),
@@ -2235,12 +2288,6 @@ def ftoi : SDNodeXForm<fpimm, [{
def: Pat<(sra (i64 (add (sra I64:$src1, u6_0ImmPred:$src2), 1)), (i32 1)),
(S2_asr_i_p_rnd I64:$src1, imm:$src2)>;
-def SDTHexagonI32I64: SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
- SDTCisVT<1, i64>]>;
-def HexagonPOPCOUNT: SDNode<"HexagonISD::POPCOUNT", SDTHexagonI32I64>;
-
-def: Pat<(HexagonPOPCOUNT I64:$Rss), (S5_popcountp I64:$Rss)>;
-
let AddedComplexity = 20 in {
defm: Loadx_pat<load, f32, s30_2ImmPred, L2_loadri_io>;
defm: Loadx_pat<load, f64, s29_3ImmPred, L2_loadrd_io>;
@@ -2701,6 +2748,15 @@ def: Pat<(fneg F64:$Rs),
(S2_togglebit_i (HiReg $Rs), 31), isub_hi,
(i32 (LoReg $Rs)), isub_lo)>;
+def: Pat<(mul I64:$Rss, I64:$Rtt),
+ (A2_combinew
+ (M2_maci (M2_maci (HiReg (M2_dpmpyuu_s0 (LoReg $Rss), (LoReg $Rtt))),
+ (LoReg $Rss),
+ (HiReg $Rtt)),
+ (LoReg $Rtt),
+ (HiReg $Rss)),
+ (LoReg (M2_dpmpyuu_s0 (LoReg $Rss), (LoReg $Rtt))))>;
+
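This expansion computes the low 64 bits of a 64x64 multiply from 32-bit halves: the full unsigned product of the low words supplies the low result word, and the two cross products are accumulated (mod 2^32) into its high word via M2_maci. As plain arithmetic:

    #include <cstdint>

    // Low 64 bits of a 64x64 product, mirroring the M2_dpmpyuu_s0 /
    // M2_maci / A2_combinew expansion above.
    uint64_t mul64lo(uint64_t a, uint64_t b) {
      uint32_t al = (uint32_t)a, ah = (uint32_t)(a >> 32);
      uint32_t bl = (uint32_t)b, bh = (uint32_t)(b >> 32);
      uint64_t p = (uint64_t)al * bl;                        // full low product
      uint32_t hi = (uint32_t)(p >> 32) + al * bh + ah * bl; // wraps mod 2^32
      return ((uint64_t)hi << 32) | (uint32_t)p;
    }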
def alignedload : PatFrag<(ops node:$addr), (load $addr), [{
return isAlignedMemNode(dyn_cast<MemSDNode>(N));
}]>;
@@ -2718,19 +2774,11 @@ def unalignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [
}]>;
-def s4_6ImmPred: PatLeaf<(i32 imm), [{
- int64_t V = N->getSExtValue();
- return isShiftedInt<4,6>(V);
-}]>;
-
-def s4_7ImmPred: PatLeaf<(i32 imm), [{
- int64_t V = N->getSExtValue();
- return isShiftedInt<4,7>(V);
-}]>;
-
-
multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Aligned stores
+ def : Pat<(alignednontemporalstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
+ (V6_vS32b_nt_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
(V6_vS32b_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>,
Requires<[UseHVXSgl]>;
@@ -2739,6 +2787,9 @@ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
Requires<[UseHVXSgl]>;
// 128B Aligned stores
+ def : Pat<(alignednontemporalstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
+ (V6_vS32b_nt_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr),
(V6_vS32b_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>,
Requires<[UseHVXDbl]>;
@@ -2748,26 +2799,36 @@ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Fold Add R+OFF into vector store.
let AddedComplexity = 10 in {
+ def : Pat<(alignednontemporalstore (VTSgl VectorRegs:$src1),
+ (add IntRegs:$src2, Iss4_6:$offset)),
+ (V6_vS32b_nt_ai IntRegs:$src2, Iss4_6:$offset,
+ (VTSgl VectorRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
def : Pat<(alignedstore (VTSgl VectorRegs:$src1),
- (add IntRegs:$src2, s4_6ImmPred:$offset)),
- (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_6:$offset)),
+ (V6_vS32b_ai IntRegs:$src2, Iss4_6:$offset,
(VTSgl VectorRegs:$src1))>,
Requires<[UseHVXSgl]>;
def : Pat<(unalignedstore (VTSgl VectorRegs:$src1),
- (add IntRegs:$src2, s4_6ImmPred:$offset)),
- (V6_vS32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_6:$offset)),
+ (V6_vS32Ub_ai IntRegs:$src2, Iss4_6:$offset,
(VTSgl VectorRegs:$src1))>,
Requires<[UseHVXSgl]>;
// Fold Add R+OFF into vector store 128B.
+ def : Pat<(alignednontemporalstore (VTDbl VectorRegs128B:$src1),
+ (add IntRegs:$src2, Iss4_7:$offset)),
+ (V6_vS32b_nt_ai_128B IntRegs:$src2, Iss4_7:$offset,
+ (VTDbl VectorRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1),
- (add IntRegs:$src2, s4_7ImmPred:$offset)),
- (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_7:$offset)),
+ (V6_vS32b_ai_128B IntRegs:$src2, Iss4_7:$offset,
(VTDbl VectorRegs128B:$src1))>,
Requires<[UseHVXDbl]>;
def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1),
- (add IntRegs:$src2, s4_7ImmPred:$offset)),
- (V6_vS32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+ (add IntRegs:$src2, Iss4_7:$offset)),
+ (V6_vS32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset,
(VTDbl VectorRegs128B:$src1))>,
Requires<[UseHVXDbl]>;
}
@@ -2781,6 +2842,9 @@ defm : vS32b_ai_pats <v8i64, v16i64>;
multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Aligned loads
+ def : Pat < (VTSgl (alignednontemporalload IntRegs:$addr)),
+ (V6_vL32b_nt_ai IntRegs:$addr, 0) >,
+ Requires<[UseHVXSgl]>;
def : Pat < (VTSgl (alignedload IntRegs:$addr)),
(V6_vL32b_ai IntRegs:$addr, 0) >,
Requires<[UseHVXSgl]>;
@@ -2789,6 +2853,9 @@ multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
Requires<[UseHVXSgl]>;
// 128B Load
+ def : Pat < (VTDbl (alignednontemporalload IntRegs:$addr)),
+ (V6_vL32b_nt_ai_128B IntRegs:$addr, 0) >,
+ Requires<[UseHVXDbl]>;
def : Pat < (VTDbl (alignedload IntRegs:$addr)),
(V6_vL32b_ai_128B IntRegs:$addr, 0) >,
Requires<[UseHVXDbl]>;
@@ -2798,18 +2865,24 @@ multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
// Fold Add R+OFF into vector load.
let AddedComplexity = 10 in {
- def : Pat<(VTDbl (alignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
- (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ def : Pat<(VTDbl (alignednontemporalload (add IntRegs:$src2, Iss4_7:$offset))),
+ (V6_vL32b_nt_ai_128B IntRegs:$src2, Iss4_7:$offset)>,
+ Requires<[UseHVXDbl]>;
+ def : Pat<(VTDbl (alignedload (add IntRegs:$src2, Iss4_7:$offset))),
+ (V6_vL32b_ai_128B IntRegs:$src2, Iss4_7:$offset)>,
Requires<[UseHVXDbl]>;
- def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
- (V6_vL32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+ def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, Iss4_7:$offset))),
+ (V6_vL32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset)>,
Requires<[UseHVXDbl]>;
- def : Pat<(VTSgl (alignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
- (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ def : Pat<(VTSgl (alignednontemporalload (add IntRegs:$src2, Iss4_6:$offset))),
+ (V6_vL32b_nt_ai IntRegs:$src2, Iss4_6:$offset)>,
+ Requires<[UseHVXSgl]>;
+ def : Pat<(VTSgl (alignedload (add IntRegs:$src2, Iss4_6:$offset))),
+ (V6_vL32b_ai IntRegs:$src2, Iss4_6:$offset)>,
Requires<[UseHVXSgl]>;
- def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
- (V6_vL32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+ def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, Iss4_6:$offset))),
+ (V6_vL32Ub_ai IntRegs:$src2, Iss4_6:$offset)>,
Requires<[UseHVXSgl]>;
}
}
@@ -2820,6 +2893,9 @@ defm : vL32b_ai_pats <v16i32, v32i32>;
defm : vL32b_ai_pats <v8i64, v16i64>;
multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+ def : Pat<(alignednontemporalstore (VTSgl VecDblRegs:$src1), IntRegs:$addr),
+ (PS_vstorerw_nt_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>,
+ Requires<[UseHVXSgl]>;
def : Pat<(alignedstore (VTSgl VecDblRegs:$src1), IntRegs:$addr),
(PS_vstorerw_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>,
Requires<[UseHVXSgl]>;
@@ -2827,6 +2903,10 @@ multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> {
(PS_vstorerwu_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>,
Requires<[UseHVXSgl]>;
+ def : Pat<(alignednontemporalstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr),
+ (PS_vstorerw_nt_ai_128B IntRegs:$addr, 0,
+ (VTDbl VecDblRegs128B:$src1))>,
+ Requires<[UseHVXDbl]>;
def : Pat<(alignedstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr),
(PS_vstorerw_ai_128B IntRegs:$addr, 0,
(VTDbl VecDblRegs128B:$src1))>,
@@ -2843,6 +2923,9 @@ defm : STrivv_pats <v32i32, v64i32>;
defm : STrivv_pats <v16i64, v32i64>;
multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> {
+ def : Pat<(VTSgl (alignednontemporalload I32:$addr)),
+ (PS_vloadrw_nt_ai I32:$addr, 0)>,
+ Requires<[UseHVXSgl]>;
def : Pat<(VTSgl (alignedload I32:$addr)),
(PS_vloadrw_ai I32:$addr, 0)>,
Requires<[UseHVXSgl]>;
@@ -2850,6 +2933,9 @@ multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> {
(PS_vloadrwu_ai I32:$addr, 0)>,
Requires<[UseHVXSgl]>;
+ def : Pat<(VTDbl (alignednontemporalload I32:$addr)),
+ (PS_vloadrw_nt_ai_128B I32:$addr, 0)>,
+ Requires<[UseHVXDbl]>;
def : Pat<(VTDbl (alignedload I32:$addr)),
(PS_vloadrw_ai_128B I32:$addr, 0)>,
Requires<[UseHVXDbl]>;
@@ -2891,45 +2977,40 @@ def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs),
(V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
Requires<[UseHVXDbl]>;
-def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>,
- SDTCisInt<3>]>;
-
-def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>;
-
-// 0 as the last argument denotes vpacke. 1 denotes vpacko
-def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs),
- (v64i8 VectorRegs:$Vt), (i32 0))),
- (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>,
- Requires<[UseHVXSgl]>;
-def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs),
- (v64i8 VectorRegs:$Vt), (i32 1))),
- (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>,
- Requires<[UseHVXSgl]>;
-def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs),
- (v32i16 VectorRegs:$Vt), (i32 0))),
- (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>,
- Requires<[UseHVXSgl]>;
-def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs),
- (v32i16 VectorRegs:$Vt), (i32 1))),
- (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>,
- Requires<[UseHVXSgl]>;
-
-def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs),
- (v128i8 VecDblRegs:$Vt), (i32 0))),
- (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
- Requires<[UseHVXDbl]>;
-def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs),
- (v128i8 VecDblRegs:$Vt), (i32 1))),
- (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
- Requires<[UseHVXDbl]>;
-def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs),
- (v64i16 VecDblRegs:$Vt), (i32 0))),
- (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
- Requires<[UseHVXDbl]>;
-def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs),
- (v64i16 VecDblRegs:$Vt), (i32 1))),
- (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>,
- Requires<[UseHVXDbl]>;
+def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>;
+
+def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>;
+def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>;
+
+let Predicates = [UseHVXSgl] in {
+ def: Pat<(v64i8 (HexagonVPACKE (v64i8 VectorRegs:$Vs),
+ (v64i8 VectorRegs:$Vt))),
+ (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>;
+ def: Pat<(v64i8 (HexagonVPACKO (v64i8 VectorRegs:$Vs),
+ (v64i8 VectorRegs:$Vt))),
+ (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>;
+ def: Pat<(v32i16 (HexagonVPACKE (v32i16 VectorRegs:$Vs),
+ (v32i16 VectorRegs:$Vt))),
+ (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>;
+ def: Pat<(v32i16 (HexagonVPACKO (v32i16 VectorRegs:$Vs),
+ (v32i16 VectorRegs:$Vt))),
+ (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>;
+}
+
+let Predicates = [UseHVXDbl] in {
+ def: Pat<(v128i8 (HexagonVPACKE (v128i8 VecDblRegs:$Vs),
+ (v128i8 VecDblRegs:$Vt))),
+ (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>;
+ def: Pat<(v128i8 (HexagonVPACKO (v128i8 VecDblRegs:$Vs),
+ (v128i8 VecDblRegs:$Vt))),
+ (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>;
+ def: Pat<(v64i16 (HexagonVPACKE (v64i16 VecDblRegs:$Vs),
+ (v64i16 VecDblRegs:$Vt))),
+ (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>;
+ def: Pat<(v64i16 (HexagonVPACKO (v64i16 VecDblRegs:$Vs),
+ (v64i16 VecDblRegs:$Vt))),
+ (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>;
+}
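// Editorial toy in plain C++ (the lane mapping is an assumption inferred
// from the even/odd pack naming, not taken from the patch) of the semantics
// behind HexagonVPACKE/HexagonVPACKO: the "even" form keeps the even-indexed
// elements of both inputs, the "odd" form the odd-indexed ones.
#include <cstddef>
#include <cstdint>

void vpacke(uint8_t out[8], const uint8_t vv[8], const uint8_t vu[8]) {
  for (size_t i = 0; i < 4; ++i) {
    out[i]     = vv[2 * i];  // even bytes of one input fill the low half
    out[i + 4] = vu[2 * i];  // even bytes of the other fill the high half
  }
}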
def V2I1: PatLeaf<(v2i1 PredRegs:$R)>;
def V4I1: PatLeaf<(v4i1 PredRegs:$R)>;
@@ -2982,16 +3063,20 @@ def : Pat<(v2i16 (add (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
def : Pat<(v2i16 (sub (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))),
(A2_svsubh IntRegs:$src1, IntRegs:$src2)>;
-def HexagonVSPLATB: SDNode<"HexagonISD::VSPLATB", SDTUnaryOp>;
-def HexagonVSPLATH: SDNode<"HexagonISD::VSPLATH", SDTUnaryOp>;
+def SDTHexagonVSPLAT: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+def HexagonVSPLAT: SDNode<"HexagonISD::VSPLAT", SDTHexagonVSPLAT>;
// Replicate the low 8 bits of the 32-bit input register into each of the
// four bytes of the 32-bit destination register.
-def: Pat<(v4i8 (HexagonVSPLATB I32:$Rs)), (S2_vsplatrb I32:$Rs)>;
+def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>;
// Replicate the low 16 bits of the 32-bit input register into each of the
// four halfwords of the 64-bit destination register.
-def: Pat<(v4i16 (HexagonVSPLATH I32:$Rs)), (S2_vsplatrh I32:$Rs)>;
+def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>;
+
+def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)),
+ (A2_combineii imm:$s8, imm:$s8)>;
+def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (A2_combinew I32:$Rs, I32:$Rs)>;
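// Editorial illustration in plain C++ (not part of the patch) of the splat
// semantics the patterns above select: S2_vsplatrb replicates the low 8 bits
// into all four bytes of a 32-bit result, and S2_vsplatrh replicates the low
// 16 bits into all four halfwords of a 64-bit result.
#include <cstdint>

uint32_t vsplatrb(uint32_t rs) { return (rs & 0xFF) * 0x01010101u; }
uint64_t vsplatrh(uint32_t rs) { return (rs & 0xFFFFull) * 0x0001000100010001ull; }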
class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type>
@@ -3019,94 +3104,51 @@ def: VArith_pat <A2_xorp, xor, V8I8>;
def: VArith_pat <A2_xorp, xor, V4I16>;
def: VArith_pat <A2_xorp, xor, V2I32>;
-def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
- (i32 u5_0ImmPred:$c))))),
+def: Pat<(v2i32 (sra V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))),
(S2_asr_i_vw V2I32:$b, imm:$c)>;
-def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
- (i32 u5_0ImmPred:$c))))),
+def: Pat<(v2i32 (srl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))),
(S2_lsr_i_vw V2I32:$b, imm:$c)>;
-def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c),
- (i32 u5_0ImmPred:$c))))),
+def: Pat<(v2i32 (shl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))),
(S2_asl_i_vw V2I32:$b, imm:$c)>;
-def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))),
(S2_asr_i_vh V4I16:$b, imm:$c)>;
-def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))),
(S2_lsr_i_vh V4I16:$b, imm:$c)>;
-def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))),
+def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))),
(S2_asl_i_vh V4I16:$b, imm:$c)>;
-def SDTHexagon_v2i32_v2i32_i32 : SDTypeProfile<1, 2,
- [SDTCisSameAs<0, 1>, SDTCisVT<0, v2i32>, SDTCisInt<2>]>;
-def SDTHexagon_v4i16_v4i16_i32 : SDTypeProfile<1, 2,
- [SDTCisSameAs<0, 1>, SDTCisVT<0, v4i16>, SDTCisInt<2>]>;
+def SDTHexagonVShift
+ : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVec<0>, SDTCisVT<2, i32>]>;
-def HexagonVSRAW: SDNode<"HexagonISD::VSRAW", SDTHexagon_v2i32_v2i32_i32>;
-def HexagonVSRAH: SDNode<"HexagonISD::VSRAH", SDTHexagon_v4i16_v4i16_i32>;
-def HexagonVSRLW: SDNode<"HexagonISD::VSRLW", SDTHexagon_v2i32_v2i32_i32>;
-def HexagonVSRLH: SDNode<"HexagonISD::VSRLH", SDTHexagon_v4i16_v4i16_i32>;
-def HexagonVSHLW: SDNode<"HexagonISD::VSHLW", SDTHexagon_v2i32_v2i32_i32>;
-def HexagonVSHLH: SDNode<"HexagonISD::VSHLH", SDTHexagon_v4i16_v4i16_i32>;
+def HexagonVASL: SDNode<"HexagonISD::VASL", SDTHexagonVShift>;
+def HexagonVASR: SDNode<"HexagonISD::VASR", SDTHexagonVShift>;
+def HexagonVLSR: SDNode<"HexagonISD::VLSR", SDTHexagonVShift>;
-def: Pat<(v2i32 (HexagonVSRAW V2I32:$Rs, u5_0ImmPred:$u5)),
+def: Pat<(v2i32 (HexagonVASL V2I32:$Rs, u5_0ImmPred:$u5)),
+ (S2_asl_i_vw V2I32:$Rs, imm:$u5)>;
+def: Pat<(v4i16 (HexagonVASL V4I16:$Rs, u4_0ImmPred:$u4)),
+ (S2_asl_i_vh V4I16:$Rs, imm:$u4)>;
+def: Pat<(v2i32 (HexagonVASR V2I32:$Rs, u5_0ImmPred:$u5)),
(S2_asr_i_vw V2I32:$Rs, imm:$u5)>;
-def: Pat<(v4i16 (HexagonVSRAH V4I16:$Rs, u4_0ImmPred:$u4)),
+def: Pat<(v4i16 (HexagonVASR V4I16:$Rs, u4_0ImmPred:$u4)),
(S2_asr_i_vh V4I16:$Rs, imm:$u4)>;
-def: Pat<(v2i32 (HexagonVSRLW V2I32:$Rs, u5_0ImmPred:$u5)),
+def: Pat<(v2i32 (HexagonVLSR V2I32:$Rs, u5_0ImmPred:$u5)),
(S2_lsr_i_vw V2I32:$Rs, imm:$u5)>;
-def: Pat<(v4i16 (HexagonVSRLH V4I16:$Rs, u4_0ImmPred:$u4)),
+def: Pat<(v4i16 (HexagonVLSR V4I16:$Rs, u4_0ImmPred:$u4)),
(S2_lsr_i_vh V4I16:$Rs, imm:$u4)>;
-def: Pat<(v2i32 (HexagonVSHLW V2I32:$Rs, u5_0ImmPred:$u5)),
- (S2_asl_i_vw V2I32:$Rs, imm:$u5)>;
-def: Pat<(v4i16 (HexagonVSHLH V4I16:$Rs, u4_0ImmPred:$u4)),
- (S2_asl_i_vh V4I16:$Rs, imm:$u4)>;
class vshift_rr_pat<InstHexagon MI, SDNode Op, PatFrag Value>
: Pat <(Op Value:$Rs, I32:$Rt),
(MI Value:$Rs, I32:$Rt)>;
-def: vshift_rr_pat <S2_asr_r_vw, HexagonVSRAW, V2I32>;
-def: vshift_rr_pat <S2_asr_r_vh, HexagonVSRAH, V4I16>;
-def: vshift_rr_pat <S2_lsr_r_vw, HexagonVSRLW, V2I32>;
-def: vshift_rr_pat <S2_lsr_r_vh, HexagonVSRLH, V4I16>;
-def: vshift_rr_pat <S2_asl_r_vw, HexagonVSHLW, V2I32>;
-def: vshift_rr_pat <S2_asl_r_vh, HexagonVSHLH, V4I16>;
-
-
-def SDTHexagonVecCompare_v8i8 : SDTypeProfile<1, 2,
- [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v8i8>]>;
-def SDTHexagonVecCompare_v4i16 : SDTypeProfile<1, 2,
- [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v4i16>]>;
-def SDTHexagonVecCompare_v2i32 : SDTypeProfile<1, 2,
- [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v2i32>]>;
-
-def HexagonVCMPBEQ: SDNode<"HexagonISD::VCMPBEQ", SDTHexagonVecCompare_v8i8>;
-def HexagonVCMPBGT: SDNode<"HexagonISD::VCMPBGT", SDTHexagonVecCompare_v8i8>;
-def HexagonVCMPBGTU: SDNode<"HexagonISD::VCMPBGTU", SDTHexagonVecCompare_v8i8>;
-def HexagonVCMPHEQ: SDNode<"HexagonISD::VCMPHEQ", SDTHexagonVecCompare_v4i16>;
-def HexagonVCMPHGT: SDNode<"HexagonISD::VCMPHGT", SDTHexagonVecCompare_v4i16>;
-def HexagonVCMPHGTU: SDNode<"HexagonISD::VCMPHGTU", SDTHexagonVecCompare_v4i16>;
-def HexagonVCMPWEQ: SDNode<"HexagonISD::VCMPWEQ", SDTHexagonVecCompare_v2i32>;
-def HexagonVCMPWGT: SDNode<"HexagonISD::VCMPWGT", SDTHexagonVecCompare_v2i32>;
-def HexagonVCMPWGTU: SDNode<"HexagonISD::VCMPWGTU", SDTHexagonVecCompare_v2i32>;
-
-
-class vcmp_i1_pat<InstHexagon MI, SDNode Op, PatFrag Value>
- : Pat <(i1 (Op Value:$Rs, Value:$Rt)),
- (MI Value:$Rs, Value:$Rt)>;
-
-def: vcmp_i1_pat<A2_vcmpbeq, HexagonVCMPBEQ, V8I8>;
-def: vcmp_i1_pat<A4_vcmpbgt, HexagonVCMPBGT, V8I8>;
-def: vcmp_i1_pat<A2_vcmpbgtu, HexagonVCMPBGTU, V8I8>;
-
-def: vcmp_i1_pat<A2_vcmpheq, HexagonVCMPHEQ, V4I16>;
-def: vcmp_i1_pat<A2_vcmphgt, HexagonVCMPHGT, V4I16>;
-def: vcmp_i1_pat<A2_vcmphgtu, HexagonVCMPHGTU, V4I16>;
-
-def: vcmp_i1_pat<A2_vcmpweq, HexagonVCMPWEQ, V2I32>;
-def: vcmp_i1_pat<A2_vcmpwgt, HexagonVCMPWGT, V2I32>;
-def: vcmp_i1_pat<A2_vcmpwgtu, HexagonVCMPWGTU, V2I32>;
+def: vshift_rr_pat <S2_asl_r_vw, HexagonVASL, V2I32>;
+def: vshift_rr_pat <S2_asl_r_vh, HexagonVASL, V4I16>;
+def: vshift_rr_pat <S2_asr_r_vw, HexagonVASR, V2I32>;
+def: vshift_rr_pat <S2_asr_r_vh, HexagonVASR, V4I16>;
+def: vshift_rr_pat <S2_lsr_r_vw, HexagonVLSR, V2I32>;
+def: vshift_rr_pat <S2_lsr_r_vh, HexagonVLSR, V4I16>;
class vcmp_vi1_pat<InstHexagon MI, PatFrag Op, PatFrag InVal, ValueType OutTy>
@@ -3216,13 +3258,6 @@ def: Pat<(v4i8 (trunc V4I16:$Rs)),
def: Pat<(v2i16 (trunc V2I32:$Rs)),
(LoReg (S2_packhl (HiReg $Rs), (LoReg $Rs)))>;
-
-def HexagonVSXTBH : SDNode<"HexagonISD::VSXTBH", SDTUnaryOp>;
-def HexagonVSXTBW : SDNode<"HexagonISD::VSXTBW", SDTUnaryOp>;
-
-def: Pat<(i64 (HexagonVSXTBH I32:$Rs)), (S2_vsxtbh I32:$Rs)>;
-def: Pat<(i64 (HexagonVSXTBW I32:$Rs)), (S2_vsxthw I32:$Rs)>;
-
def: Pat<(v4i16 (zext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
def: Pat<(v2i32 (zext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
def: Pat<(v4i16 (anyext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>;
@@ -3253,8 +3288,8 @@ def vmpyh: OutPatFrag<(ops node:$Rs, node:$Rt),
(M2_vmpy2s_s0 (i32 $Rs), (i32 $Rt))>;
def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)),
- (LoReg (S2_vtrunewh (v2i32 (A2_combineii 0, 0)),
- (v2i32 (vmpyh V2I16:$Rs, V2I16:$Rt))))>;
+ (LoReg (S2_vtrunewh (A2_combineii 0, 0),
+ (vmpyh V2I16:$Rs, V2I16:$Rt)))>;
// Multiplies two v4i16 vectors.
def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
@@ -3283,31 +3318,6 @@ def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
(A2_combinew (S2_vtrunehb (VMPYB_no_V5 (HiReg $Rs), (HiReg $Rt))),
(S2_vtrunehb (VMPYB_no_V5 (LoReg $Rs), (LoReg $Rt))))>;
-def SDTHexagonBinOp64 : SDTypeProfile<1, 2,
- [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>]>;
-
-def HexagonSHUFFEB: SDNode<"HexagonISD::SHUFFEB", SDTHexagonBinOp64>;
-def HexagonSHUFFEH: SDNode<"HexagonISD::SHUFFEH", SDTHexagonBinOp64>;
-def HexagonSHUFFOB: SDNode<"HexagonISD::SHUFFOB", SDTHexagonBinOp64>;
-def HexagonSHUFFOH: SDNode<"HexagonISD::SHUFFOH", SDTHexagonBinOp64>;
-
-class ShufflePat<InstHexagon MI, SDNode Op>
- : Pat<(i64 (Op DoubleRegs:$src1, DoubleRegs:$src2)),
- (i64 (MI DoubleRegs:$src1, DoubleRegs:$src2))>;
-
-// Shuffles even bytes for i=0..3: A[2*i].b = C[2*i].b; A[2*i+1].b = B[2*i].b
-def: ShufflePat<S2_shuffeb, HexagonSHUFFEB>;
-
-// Shuffles odd bytes for i=0..3: A[2*i].b = C[2*i+1].b; A[2*i+1].b = B[2*i+1].b
-def: ShufflePat<S2_shuffob, HexagonSHUFFOB>;
-
-// Shuffles even half for i=0,1: A[2*i].h = C[2*i].h; A[2*i+1].h = B[2*i].h
-def: ShufflePat<S2_shuffeh, HexagonSHUFFEH>;
-
-// Shuffles odd half for i=0,1: A[2*i].h = C[2*i+1].h; A[2*i+1].h = B[2*i+1].h
-def: ShufflePat<S2_shuffoh, HexagonSHUFFOH>;
-
-
// Truncated store from v4i16 to v4i8.
def truncstorev4i8: PatFrag<(ops node:$val, node:$ptr),
(truncstore node:$val, node:$ptr),
@@ -3345,3 +3355,11 @@ def: Pat<(v2i32 (zextloadv2i8 I32:$Rs)),
def: Pat<(v2i32 (sextloadv2i8 I32:$Rs)),
(S2_vsxthw (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0)))))>;
+
+// Read cycle counter.
+//
+def SDTInt64Leaf: SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
+ [SDNPHasChain]>;
+
+def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
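// Editorial sketch (mirrors the usual ISelLowering shape; the exact in-tree
// code may differ, and the surrounding Hexagon target headers are assumed)
// of how ISD::READCYCLECOUNTER would be lowered to the HexagonISD::READCYCLE
// node that the pattern above matches. The chain operand is threaded through
// because the counter read must stay ordered with respect to other
// side-effecting operations (hence SDNPHasChain on the node).
SDValue HexagonTargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Chain = Op.getOperand(0);
  return DAG.getNode(HexagonISD::READCYCLE, dl,
                     DAG.getVTList(MVT::i64, MVT::Other), Chain);
}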
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
index ee32093..7d961a2 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -100,9 +100,6 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
}
-
- private:
- void ChangeOpInto(MachineOperand &Dst, MachineOperand &Src);
};
}
@@ -132,7 +129,9 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
PeepholeDoubleRegsMap.clear();
// Traverse the basic block.
- for (MachineInstr &MI : *MBB) {
+ for (auto I = MBB->begin(), E = MBB->end(), NextI = I; I != E; I = NextI) {
+ NextI = std::next(I);
+ MachineInstr &MI = *I;
// Look for sign extends:
// %vreg170<def> = SXTW %vreg166
if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) {
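// Editorial sketch (using std::list as a stand-in for the MachineBasicBlock
// instruction list) of the iteration idiom introduced above: the successor
// is computed before the loop body runs, so the body may erase the current
// instruction without invalidating the traversal.
#include <iterator>
#include <list>

void eraseMatching(std::list<int> &L) {
  for (auto I = L.begin(), E = L.end(), NextI = I; I != E; I = NextI) {
    NextI = std::next(I);  // saved first; unaffected by erasing *I
    if (*I % 2 != 0)
      L.erase(I);          // safe: the loop resumes at NextI
  }
}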
@@ -280,14 +279,13 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
if (NewOp) {
unsigned PSrc = MI.getOperand(PR).getReg();
if (unsigned POrig = PeepholeMap.lookup(PSrc)) {
- MI.getOperand(PR).setReg(POrig);
+ BuildMI(*MBB, MI.getIterator(), MI.getDebugLoc(),
+ QII->get(NewOp), MI.getOperand(0).getReg())
+ .addReg(POrig)
+ .add(MI.getOperand(S2))
+ .add(MI.getOperand(S1));
MRI->clearKillFlags(POrig);
- MI.setDesc(QII->get(NewOp));
- // Swap operands S1 and S2.
- MachineOperand Op1 = MI.getOperand(S1);
- MachineOperand Op2 = MI.getOperand(S2);
- ChangeOpInto(MI.getOperand(S1), Op2);
- ChangeOpInto(MI.getOperand(S2), Op1);
+ MI.eraseFromParent();
}
} // if (NewOp)
} // if (!Done)
@@ -299,40 +297,6 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-void HexagonPeephole::ChangeOpInto(MachineOperand &Dst, MachineOperand &Src) {
- assert (&Dst != &Src && "Cannot duplicate into itself");
- switch (Dst.getType()) {
- case MachineOperand::MO_Register:
- if (Src.isReg()) {
- Dst.setReg(Src.getReg());
- Dst.setSubReg(Src.getSubReg());
- MRI->clearKillFlags(Src.getReg());
- } else if (Src.isImm()) {
- Dst.ChangeToImmediate(Src.getImm());
- } else {
- llvm_unreachable("Unexpected src operand type");
- }
- break;
-
- case MachineOperand::MO_Immediate:
- if (Src.isImm()) {
- Dst.setImm(Src.getImm());
- } else if (Src.isReg()) {
- Dst.ChangeToRegister(Src.getReg(), Src.isDef(), Src.isImplicit(),
- false, Src.isDead(), Src.isUndef(),
- Src.isDebug());
- Dst.setSubReg(Src.getSubReg());
- } else {
- llvm_unreachable("Unexpected src operand type");
- }
- break;
-
- default:
- llvm_unreachable("Unexpected dst operand type");
- break;
- }
-}
-
FunctionPass *llvm::createHexagonPeephole() {
return new HexagonPeephole();
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td b/contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td
new file mode 100644
index 0000000..b42c1ab
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonPseudo.td
@@ -0,0 +1,529 @@
+//===--- HexagonPseudo.td -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// The pat frags in the definitions below need to have a named register,
+// otherwise i32 will be assumed regardless of the register class. The
+// name of the register does not matter.
+def I1 : PatLeaf<(i1 PredRegs:$R)>;
+def I32 : PatLeaf<(i32 IntRegs:$R)>;
+def I64 : PatLeaf<(i64 DoubleRegs:$R)>;
+def F32 : PatLeaf<(f32 IntRegs:$R)>;
+def F64 : PatLeaf<(f64 DoubleRegs:$R)>;
+
+let PrintMethod = "printGlobalOperand" in {
+ def globaladdress : Operand<i32>;
+ def globaladdressExt : Operand<i32>;
+}
+
+let isPseudo = 1 in {
+let isCodeGenOnly = 0 in
+def A2_iconst : Pseudo<(outs IntRegs:$Rd32),
+ (ins s27_2Imm:$Ii), "${Rd32}=iconst(#${Ii})">;
+
+def DUPLEX_Pseudo : InstHexagon<(outs),
+ (ins s32_0Imm:$offset), "DUPLEX", [], "", DUPLEX, TypePSEUDO>;
+}
+
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+ isAsmParserOnly = 1 in
+def TFRI64_V2_ext : InstHexagon<(outs DoubleRegs:$dst),
+ (ins s32_0Imm:$src1, s8_0Imm:$src2),
+ "$dst=combine(#$src1,#$src2)", [], "",
+ A2_combineii.Itinerary, TypeALU32_2op>, OpcodeHexagon;
+
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+ hasNewValue = 1, opNewValue = 0 in
+class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp,
+ InstHexagon rootInst>
+ : InstHexagon<(outs IntRegs:$dst),
+ (ins u16_0Imm:$imm_value),
+ "$dst"#RegHalf#"=#$imm_value", [], "",
+ rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
+ bits<5> dst;
+ bits<32> imm_value;
+
+ let Inst{27} = Rs;
+ let Inst{26-24} = MajOp;
+ let Inst{21} = MinOp;
+ let Inst{20-16} = dst;
+ let Inst{23-22} = imm_value{15-14};
+ let Inst{13-0} = imm_value{13-0};
+}
+
+let isAsmParserOnly = 1 in {
+ def LO : REG_IMMED<".l", 0b0, 0b001, 0b1, A2_tfril>;
+ def HI : REG_IMMED<".h", 0b0, 0b010, 0b1, A2_tfrih>;
+}
+
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in {
+ def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
+ "$Rd = CONST32(#$v)", []>;
+ def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
+ "$Rd = CONST64(#$v)", []>;
+}
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in
+def PS_true : InstHexagon<(outs PredRegs:$dst), (ins), "",
+ [(set I1:$dst, 1)], "", C2_orn.Itinerary, TypeCR>;
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+ isCodeGenOnly = 1 in
+def PS_false : InstHexagon<(outs PredRegs:$dst), (ins), "",
+ [(set I1:$dst, 0)], "", C2_andn.Itinerary, TypeCR>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ ".error \"should not emit\" ", []>;
+
+let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ ".error \"should not emit\" ", []>;
+
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC0], Uses = [SA0, LC0] in {
+def ENDLOOP0 : Endloop<(outs), (ins b30_2Imm:$offset),
+ ":endloop0",
+ []>;
+}
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+ Defs = [PC, LC1], Uses = [SA1, LC1] in {
+def ENDLOOP1 : Endloop<(outs), (ins b30_2Imm:$offset),
+ ":endloop1",
+ []>;
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, hasSideEffects = 0 in
+class LOOP_iBase<string mnemonic, InstHexagon rootInst>
+ : InstHexagon <(outs), (ins b30_2Imm:$offset, u10_0Imm:$src2),
+ #mnemonic#"($offset,#$src2)",
+ [], "", rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
+ bits<9> offset;
+ bits<10> src2;
+
+ let IClass = 0b0110;
+
+ let Inst{27-22} = 0b100100;
+ let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+ let Inst{20-16} = src2{9-5};
+ let Inst{12-8} = offset{8-4};
+ let Inst{7-5} = src2{4-2};
+ let Inst{4-3} = offset{3-2};
+ let Inst{1-0} = src2{1-0};
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+ opExtendable = 0, hasSideEffects = 0 in
+class LOOP_rBase<string mnemonic, InstHexagon rootInst>
+ : InstHexagon<(outs), (ins b30_2Imm:$offset, IntRegs:$src2),
+ #mnemonic#"($offset,$src2)",
+ [], "", rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
+ bits<9> offset;
+ bits<5> src2;
+
+ let IClass = 0b0110;
+
+ let Inst{27-22} = 0b000000;
+ let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+ let Inst{20-16} = src2;
+ let Inst{12-8} = offset{8-4};
+ let Inst{4-3} = offset{3-2};
+ }
+
+let Defs = [SA0, LC0, USR], isCodeGenOnly = 1, isExtended = 1,
+ opExtendable = 0 in {
+ def J2_loop0iext : LOOP_iBase<"loop0", J2_loop0i>;
+ def J2_loop1iext : LOOP_iBase<"loop1", J2_loop1i>;
+}
+
+// Interestingly, only loop0s appear to set usr.lpcfg.
+let Defs = [SA1, LC1], isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
+ def J2_loop0rext : LOOP_rBase<"loop0", J2_loop0r>;
+ def J2_loop1rext : LOOP_rBase<"loop1", J2_loop1r>;
+}
+
+let isCall = 1, hasSideEffects = 1, isPredicable = 0,
+ isExtended = 0, isExtendable = 1, opExtendable = 0,
+ isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
+class T_Call<string ExtStr>
+ : InstHexagon<(outs), (ins a30_2Imm:$dst),
+ "call " # ExtStr # "$dst", [], "", J2_call.Itinerary, TypeJ>,
+ OpcodeHexagon {
+ let BaseOpcode = "call";
+ bits<24> dst;
+
+ let IClass = 0b0101;
+ let Inst{27-25} = 0b101;
+ let Inst{24-16,13-1} = dst{23-2};
+ let Inst{0} = 0b0;
+}
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1, Defs = [R16],
+ isPredicable = 0 in
+def CALLProfile : T_Call<"">;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
+ Defs = [PC, R31, R6, R7, P0] in
+def PS_call_stk : T_Call<"">;
+
+// Call, no return.
+let isCall = 1, hasSideEffects = 1, cofMax1 = 1, isCodeGenOnly = 1 in
+def PS_callr_nr: InstHexagon<(outs), (ins IntRegs:$Rs),
+ "callr $Rs", [], "", J2_callr.Itinerary, TypeJ>, OpcodeHexagon {
+ bits<5> Rs;
+ bits<2> Pu;
+ let isPredicatedFalse = 1;
+
+ let IClass = 0b0101;
+ let Inst{27-21} = 0b0000101;
+ let Inst{20-16} = Rs;
+ }
+
+let isCall = 1, hasSideEffects = 1,
+ isExtended = 0, isExtendable = 1, opExtendable = 0, isCodeGenOnly = 1,
+ BaseOpcode = "PS_call_nr", isExtentSigned = 1, opExtentAlign = 2 in
+class Call_nr<bits<5> nbits, bit isPred, bit isFalse, dag iops,
+ InstrItinClass itin>
+ : Pseudo<(outs), iops, "">, PredRel {
+ bits<2> Pu;
+ bits<17> dst;
+ let opExtentBits = nbits;
+ let isPredicable = 0; // !if(isPred, 0, 1);
+ let isPredicated = 0; // isPred;
+ let isPredicatedFalse = isFalse;
+}
+
+def PS_call_nr : Call_nr<24, 0, 0, (ins s32_0Imm:$Ii), J2_call.Itinerary>;
+//def PS_call_nrt: Call_nr<17, 1, 0, (ins PredRegs:$Pu, s32_0Imm:$dst),
+// J2_callt.Itinerary>;
+//def PS_call_nrf: Call_nr<17, 1, 1, (ins PredRegs:$Pu, s32_0Imm:$dst),
+// J2_callf.Itinerary>;
+
+let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
+ isPredicable = 1, hasSideEffects = 0, InputType = "reg",
+ cofMax1 = 1 in
+class T_JMPr <InstHexagon rootInst>
+ : InstHexagon<(outs), (ins IntRegs:$dst), "jumpr $dst", [],
+ "", rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
+ bits<5> dst;
+
+ let IClass = 0b0101;
+ let Inst{27-21} = 0b0010100;
+ let Inst{20-16} = dst;
+}
+
+// A return through builtin_eh_return.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
+ isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
+def EH_RETURN_JMPR : T_JMPr<J2_jumpr>;
+
+// Indirect tail-call.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+ isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_r : T_JMPr<J2_jumpr>;
+
+//
+// Direct tail-calls.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+ isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_i : Pseudo<(outs), (ins a30_2Imm:$dst), "", []>;
+
+let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in
+def PS_aligna : Pseudo<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
+
+// Generate frameindex addresses. The main reason for the offset operand is
+// that every instruction that is allowed to have frame index as an operand
+// will then have that operand followed by an immediate operand (the offset).
+// This simplifies the frame-index elimination code.
+//
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+ isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def PS_fi : Pseudo<(outs IntRegs:$Rd),
+ (ins IntRegs:$fi, s32_0Imm:$off), "">;
+ def PS_fia : Pseudo<(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
+}
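// Editorial sketch (a hypothetical helper, not the in-tree
// eliminateFrameIndex; assumes the usual LLVM CodeGen headers) of why the
// "frame index is always followed by an immediate offset" convention above
// simplifies frame-index elimination: the rewrite becomes a fixed
// two-operand update, with no per-opcode logic to locate the offset.
#include "llvm/CodeGen/MachineInstr.h"

void rewriteFrameIndexSketch(llvm::MachineInstr &MI, unsigned FIOp,
                             unsigned BaseReg, int64_t Offset) {
  llvm::MachineOperand &FI = MI.getOperand(FIOp);
  llvm::MachineOperand &Imm = MI.getOperand(FIOp + 1); // guaranteed by layout
  FI.ChangeToRegister(BaseReg, /*isDef=*/false);       // FI -> base register
  Imm.setImm(Imm.getImm() + Offset);                   // fold the displacement
}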
+
+class CondStr<string CReg, bit True, bit New> {
+ string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
+}
+class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
+ string S = Mnemonic # !if(Taken, ":t", ":nt");
+}
+let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
+ hasSideEffects = 0, InputType = "reg", cofMax1 = 1 in
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak, InstHexagon rootInst>
+ : InstHexagon<(outs), (ins PredRegs:$src, IntRegs:$dst),
+ CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+ JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst",
+ [], "", rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
+
+ let isTaken = isTak;
+ let isPredicatedFalse = PredNot;
+ let isPredicatedNew = isPredNew;
+ bits<2> src;
+ bits<5> dst;
+
+ let IClass = 0b0101;
+
+ let Inst{27-22} = 0b001101;
+ let Inst{21} = PredNot;
+ let Inst{20-16} = dst;
+ let Inst{12} = isTak;
+ let Inst{11} = isPredNew;
+ let Inst{9-8} = src;
+}
+
+let isTerminator = 1, hasSideEffects = 0, isReturn = 1, isCodeGenOnly = 1,
+ isBarrier = 1, BaseOpcode = "JMPret" in {
+ def PS_jmpret : T_JMPr<J2_jumpr>, PredNewRel;
+ def PS_jmprett : T_JMPr_c<0, 0, 0, J2_jumprt>, PredNewRel;
+ def PS_jmpretf : T_JMPr_c<1, 0, 0, J2_jumprf>, PredNewRel;
+ def PS_jmprettnew : T_JMPr_c<0, 1, 0, J2_jumprtnew>, PredNewRel;
+ def PS_jmpretfnew : T_JMPr_c<1, 1, 0, J2_jumprfnew>, PredNewRel;
+ def PS_jmprettnewpt : T_JMPr_c<0, 1, 1, J2_jumprtnewpt>, PredNewRel;
+ def PS_jmpretfnewpt : T_JMPr_c<1, 1, 1, J2_jumprfnewpt>, PredNewRel;
+}
+
+//defm V6_vtran2x2_map : HexagonMapping<(outs VectorRegs:$Vy32, VectorRegs:$Vx32), (ins VectorRegs:$Vx32in, IntRegs:$Rt32), "vtrans2x2(${Vy32},${Vx32},${Rt32})", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, VectorRegs:$Vx32in, IntRegs:$Rt32)>;
+
+// The reason for the custom inserter is to record all ALLOCA instructions
+// in MachineFunctionInfo.
+let Defs = [R29], hasSideEffects = 1 in
+def PS_alloca: Pseudo <(outs IntRegs:$Rd),
+ (ins IntRegs:$Rs, u32_0Imm:$A), "", []>;
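// Editorial sketch (assumes a recording hook on the Hexagon function info
// and the surrounding target headers; the in-tree names may differ) of the
// custom inserter the comment above refers to: PS_alloca is left in place,
// and the instruction is merely recorded in MachineFunctionInfo for later
// frame lowering.
MachineBasicBlock *
HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                   MachineBasicBlock *BB) const {
  assert(MI.getOpcode() == Hexagon::PS_alloca && "expected PS_alloca");
  auto *FuncInfo = BB->getParent()->getInfo<HexagonMachineFunctionInfo>();
  FuncInfo->addAllocaAdjustInst(&MI); // assumed recording hook
  return BB;                          // no control-flow surgery needed
}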
+
+// Load predicate.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_pred : LDInst<(outs PredRegs:$dst),
+ (ins IntRegs:$addr, s32_0Imm:$off),
+ ".error \"should not emit\"", []>;
+
+// Load modifier.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_mod : LDInst<(outs ModRegs:$dst),
+ (ins IntRegs:$addr, s32_0Imm:$off),
+ ".error \"should not emit\"", []>;
+
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+def PS_pselect: InstHexagon<(outs DoubleRegs:$Rd),
+ (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+ ".error \"should not emit\" ", [], "", A2_tfrpt.Itinerary, TypeALU32_2op>;
+
+let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
+ isPredicable = 1,
+ isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
+ opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
+class T_JMP: InstHexagon<(outs), (ins b30_2Imm:$dst),
+ "jump $dst",
+ [], "", J2_jump.Itinerary, TypeJ>, OpcodeHexagon {
+ bits<24> dst;
+ let IClass = 0b0101;
+
+ let Inst{27-25} = 0b100;
+ let Inst{24-16} = dst{23-15};
+ let Inst{13-1} = dst{14-2};
+}
+
+// Restore registers and dealloc return function call.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+ Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
+ def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP;
+ }
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<"">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<"">, PredRel;
+ }
+}
+
+// Save registers function call.
+let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
+ def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel;
+
+ let isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel;
+
+ let Defs = [P0] in
+ def SAVE_REGISTERS_CALL_V4STK : T_Call<"">, PredRel;
+
+ let Defs = [P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28] in
+ def SAVE_REGISTERS_CALL_V4_PIC : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28, P0] in
+ def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<"">, PredRel;
+
+ let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
+ def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<"">, PredRel;
+}
+
+// Vector store pseudos
+let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+ mayStore = 1, hasSideEffects = 0 in
+class STrivv_template<RegisterClass RC, InstHexagon rootInst>
+ : InstHexagon<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src),
+ "", [], "", rootInst.Itinerary, rootInst.Type>;
+
+def PS_vstorerw_ai: STrivv_template<VecDblRegs, V6_vS32b_ai>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32b_ai_128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+def PS_vstorerw_nt_ai: STrivv_template<VecDblRegs, V6_vS32b_nt_ai>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerw_nt_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32b_nt_ai_128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+def PS_vstorerwu_ai: STrivv_template<VecDblRegs, V6_vS32Ub_ai>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32Ub_ai_128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+let isPseudo = 1, isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0 in {
+ def PS_vstorerq_ai: Pseudo<(outs),
+ (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs:$Qt), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vstorerq_ai_128B: Pseudo<(outs),
+ (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs128B:$Qt), "", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Vector load pseudos
+let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1,
+ mayLoad = 1, hasSideEffects = 0 in
+class LDrivv_template<RegisterClass RC, InstHexagon rootInst>
+ : InstHexagon<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off),
+ "", [], "", rootInst.Itinerary, rootInst.Type>;
+
+def PS_vloadrw_ai: LDrivv_template<VecDblRegs, V6_vL32b_ai>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32b_ai_128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+def PS_vloadrw_nt_ai: LDrivv_template<VecDblRegs, V6_vL32b_nt_ai>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrw_nt_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32b_nt_ai_128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+def PS_vloadrwu_ai: LDrivv_template<VecDblRegs, V6_vL32Ub_ai>,
+ Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32Ub_ai_128B>,
+ Requires<[HasV60T,UseHVXDbl]>;
+
+let isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in {
+ def PS_vloadrq_ai: Pseudo<(outs VecPredRegs:$Qd),
+ (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
+ Requires<[HasV60T,UseHVXSgl]>;
+ def PS_vloadrq_ai_128B: Pseudo<(outs VecPredRegs128B:$Qd),
+ (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>,
+ Requires<[HasV60T,UseHVXDbl]>;
+}
+
+
+let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+class VSELInst<dag outs, dag ins, InstHexagon rootInst>
+ : InstHexagon<outs, ins, "", [], "", rootInst.Itinerary, rootInst.Type>;
+
+def PS_vselect: VSELInst<(outs VectorRegs:$dst),
+ (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+ V6_vcmov>, Requires<[HasV60T,UseHVXSgl]>;
+def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
+ (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+ V6_vcmov>, Requires<[HasV60T,UseHVXDbl]>;
+
+def PS_wselect: VSELInst<(outs VecDblRegs:$dst),
+ (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3),
+ V6_vccombine>, Requires<[HasV60T,UseHVXSgl]>;
+def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
+ (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
+ V6_vccombine>, Requires<[HasV60T,UseHVXDbl]>;
+
+// Store predicate.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_pred : STInst<(outs),
+ (ins IntRegs:$addr, s32_0Imm:$off, PredRegs:$src1),
+ ".error \"should not emit\"", []>;
+// Store modifier.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+ isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_mod : STInst<(outs),
+ (ins IntRegs:$addr, s32_0Imm:$off, ModRegs:$src1),
+ ".error \"should not emit\"", []>;
+
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+ isAsmParserOnly = 1 in
+def TFRI64_V4 : InstHexagon<(outs DoubleRegs:$dst),
+ (ins u64_0Imm:$src1),
+ "$dst = #$src1", [], "",
+ A2_combineii.Itinerary, TypeALU32_2op>, OpcodeHexagon;
+
+// Hexagon doesn't have a vector multiply with C semantics.
+// Instead, generate a pseudo instruction that gets expanded into two
+// scalar MPYI instructions.
+// This is expanded by ExpandPostRAPseudos.
+let isPseudo = 1 in
+def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
+ (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>;
+
+let isPseudo = 1 in
+def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
+ (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
+ "$Rd = $Rx">;
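// Editorial illustration in plain C++ (not part of the patch) of the
// "multiply with C semantics" that PS_vmulw models: each 32-bit lane wraps
// independently modulo 2^32, which is why the post-RA expansion can use two
// ordinary scalar MPYI instructions.
#include <cstdint>

void vmulw(uint32_t rd[2], const uint32_t rs[2], const uint32_t rt[2]) {
  rd[0] = rs[0] * rt[0]; // expands to one MPYI
  rd[1] = rs[1] * rt[1]; // expands to a second MPYI
}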
+
+def DuplexIClass0: InstDuplex < 0 >;
+def DuplexIClass1: InstDuplex < 1 >;
+def DuplexIClass2: InstDuplex < 2 >;
+let isExtendable = 1 in {
+ def DuplexIClass3: InstDuplex < 3 >;
+ def DuplexIClass4: InstDuplex < 4 >;
+ def DuplexIClass5: InstDuplex < 5 >;
+ def DuplexIClass6: InstDuplex < 6 >;
+ def DuplexIClass7: InstDuplex < 7 >;
+}
+def DuplexIClass8: InstDuplex < 8 >;
+def DuplexIClass9: InstDuplex < 9 >;
+def DuplexIClassA: InstDuplex < 0xA >;
+def DuplexIClassB: InstDuplex < 0xB >;
+def DuplexIClassC: InstDuplex < 0xC >;
+def DuplexIClassD: InstDuplex < 0xD >;
+def DuplexIClassE: InstDuplex < 0xE >;
+def DuplexIClassF: InstDuplex < 0xF >;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 30640e1..b3aba50 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -94,7 +94,7 @@ struct HexagonDCE : public DeadCodeElimination {
bool HexagonCP::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
- auto mapRegs = [MI,&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
+ auto mapRegs = [&EM] (RegisterRef DstR, RegisterRef SrcR) -> void {
EM.insert(std::make_pair(DstR, SrcR));
};
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index d3f230d..1fc1579 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -36,6 +36,9 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#define GET_REGINFO_TARGET_DESC
+#include "HexagonGenRegisterInfo.inc"
+
using namespace llvm;
HexagonRegisterInfo::HexagonRegisterInfo()
@@ -47,11 +50,6 @@ bool HexagonRegisterInfo::isEHReturnCalleeSaveReg(unsigned R) const {
R == Hexagon::R3 || R == Hexagon::D0 || R == Hexagon::D1;
}
-bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const {
- return Hexagon::R16 <= Reg && Reg <= Hexagon::R27;
-}
-
-
const MCPhysReg *
HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF,
const TargetRegisterClass *RC) const {
@@ -125,6 +123,7 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case HexagonSubtarget::V5:
case HexagonSubtarget::V55:
case HexagonSubtarget::V60:
+ case HexagonSubtarget::V62:
return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
}
@@ -133,25 +132,47 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
+const uint32_t *HexagonRegisterInfo::getCallPreservedMask(
+ const MachineFunction &MF, CallingConv::ID) const {
+ return HexagonCSR_RegMask;
+}
+
+
BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
const {
BitVector Reserved(getNumRegs());
Reserved.set(Hexagon::R29);
Reserved.set(Hexagon::R30);
Reserved.set(Hexagon::R31);
- Reserved.set(Hexagon::PC);
- Reserved.set(Hexagon::D14);
- Reserved.set(Hexagon::D15);
- Reserved.set(Hexagon::LC0);
- Reserved.set(Hexagon::LC1);
- Reserved.set(Hexagon::SA0);
- Reserved.set(Hexagon::SA1);
- Reserved.set(Hexagon::UGP);
- Reserved.set(Hexagon::GP);
- Reserved.set(Hexagon::CS0);
- Reserved.set(Hexagon::CS1);
- Reserved.set(Hexagon::CS);
- Reserved.set(Hexagon::USR);
+ // Control registers.
+ Reserved.set(Hexagon::SA0); // C0
+ Reserved.set(Hexagon::LC0); // C1
+ Reserved.set(Hexagon::SA1); // C2
+ Reserved.set(Hexagon::LC1); // C3
+ Reserved.set(Hexagon::P3_0); // C4
+ Reserved.set(Hexagon::USR); // C8
+ Reserved.set(Hexagon::PC); // C9
+ Reserved.set(Hexagon::UGP); // C10
+ Reserved.set(Hexagon::GP); // C11
+ Reserved.set(Hexagon::CS0); // C12
+ Reserved.set(Hexagon::CS1); // C13
+ Reserved.set(Hexagon::UPCYCLELO); // C14
+ Reserved.set(Hexagon::UPCYCLEHI); // C15
+ Reserved.set(Hexagon::FRAMELIMIT); // C16
+ Reserved.set(Hexagon::FRAMEKEY); // C17
+ Reserved.set(Hexagon::PKTCOUNTLO); // C18
+ Reserved.set(Hexagon::PKTCOUNTHI); // C19
+ Reserved.set(Hexagon::UTIMERLO); // C30
+ Reserved.set(Hexagon::UTIMERHI); // C31
+ // Out of the control registers, only C8 is explicitly defined in
+ // HexagonRegisterInfo.td. If others are defined, make sure to add
+ // them here as well.
+ Reserved.set(Hexagon::C8);
+ Reserved.set(Hexagon::USR_OVF);
+
+ for (int x = Reserved.find_first(); x >= 0; x = Reserved.find_next(x))
+ markSuperRegs(Reserved, x);
+
return Reserved;
}
@@ -267,6 +288,3 @@ unsigned HexagonRegisterInfo::getFirstCallerSavedNonParamReg() const {
return Hexagon::R6;
}
-
-#define GET_REGINFO_TARGET_DESC
-#include "HexagonGenRegisterInfo.inc"
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
index 1fb295b..5f65fad 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -35,7 +35,8 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF)
const override;
-
+ const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
@@ -76,7 +77,6 @@ public:
unsigned getFirstCallerSavedNonParamReg() const;
bool isEHReturnCalleeSaveReg(unsigned Reg) const;
- bool isCalleeSaveReg(unsigned Reg) const;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
index a75f351..45dbb3a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -122,12 +122,6 @@ let Namespace = "Hexagon" in {
def P2 : Rp<2, "p2">, DwarfRegNum<[65]>;
def P3 : Rp<3, "p3">, DwarfRegNum<[66]>;
- // Modifier registers.
- // C6 and C7 can also be M0 and M1, but register names must be unique, even
- // if belonging to different register classes.
- def M0 : Mx<0, "m0">, DwarfRegNum<[72]>;
- def M1 : Mx<1, "m1">, DwarfRegNum<[73]>;
-
// Fake register to represent USR.OVF bit. Arithmetic/saturating instruc-
// tions modify this bit, and multiple such instructions are allowed in the
// same packet. We need to ignore output dependencies on this bit, but not
@@ -140,41 +134,54 @@ let Namespace = "Hexagon" in {
}
// Control registers.
- def SA0 : Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>;
- def LC0 : Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>;
- def SA1 : Rc<2, "sa1", ["c2"]>, DwarfRegNum<[69]>;
- def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>;
- def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>,
- DwarfRegNum<[71]>;
- def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use
- def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>;
- def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>;
+ def SA0: Rc<0, "sa0", ["c0"]>, DwarfRegNum<[67]>;
+ def LC0: Rc<1, "lc0", ["c1"]>, DwarfRegNum<[68]>;
+ def SA1: Rc<2, "sa1", ["c2"]>, DwarfRegNum<[69]>;
+ def LC1: Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>;
+ def P3_0: Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>,
+ DwarfRegNum<[71]>;
+ // When defining more Cn registers, make sure to explicitly mark them
+ // as reserved in HexagonRegisterInfo.cpp.
+ def C5: Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>;
+ def M0: Rc<6, "m0", ["c6"]>, DwarfRegNum<[73]>;
+ def M1: Rc<7, "m1", ["c7"]>, DwarfRegNum<[74]>;
// Define C8 separately and make it aliased with USR.
// The problem is that USR has subregisters (e.g. overflow). If USR was
// specified as a subregister of C9_8, it would imply that subreg_overflow
// and isub_lo can be composed, which leads to all kinds of issues
// with lane masks.
- def C8 : Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>;
- def PC : Rc<9, "pc">, DwarfRegNum<[76]>;
- def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>;
- def GP : Rc<11, "gp", ["c11"]>, DwarfRegNum<[78]>;
- def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>;
- def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>;
- def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>;
- def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>;
+ def C8: Rc<8, "c8", [], [USR]>, DwarfRegNum<[75]>;
+ def PC: Rc<9, "pc", ["c9"]>, DwarfRegNum<[76]>;
+ def UGP: Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>;
+ def GP: Rc<11, "gp", ["c11"]>, DwarfRegNum<[78]>;
+ def CS0: Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>;
+ def CS1: Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>;
+ def UPCYCLELO: Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>;
+ def UPCYCLEHI: Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>;
+ def FRAMELIMIT: Rc<16, "framelimit", ["c16"]>, DwarfRegNum<[83]>;
+ def FRAMEKEY: Rc<17, "framekey", ["c17"]>, DwarfRegNum<[84]>;
+ def PKTCOUNTLO: Rc<18, "pktcountlo", ["c18"]>, DwarfRegNum<[85]>;
+ def PKTCOUNTHI: Rc<19, "pktcounthi", ["c19"]>, DwarfRegNum<[86]>;
+ def UTIMERLO: Rc<30, "utimerlo", ["c30"]>, DwarfRegNum<[97]>;
+ def UTIMERHI: Rc<31, "utimerhi", ["c31"]>, DwarfRegNum<[98]>;
}
// Control registers pairs.
let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
- def C1_0 : Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
- def C3_2 : Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
- def C5_4 : Rcc<4, "c5:4", [P3_0, C5]>, DwarfRegNum<[71]>;
- def C7_6 : Rcc<6, "c7:6", [C6, C7], ["m1:0"]>, DwarfRegNum<[72]>;
+ def C1_0: Rcc<0, "c1:0", [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
+ def C3_2: Rcc<2, "c3:2", [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
+ def C5_4: Rcc<4, "c5:4", [P3_0, C5]>, DwarfRegNum<[71]>;
+ def C7_6: Rcc<6, "c7:6", [M0, M1], ["m1:0"]>, DwarfRegNum<[72]>;
// Use C8 instead of USR as a subregister of C9_8.
- def C9_8 : Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>;
- def C11_10 : Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>;
- def CS : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>;
- def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>;
+ def C9_8: Rcc<8, "c9:8", [C8, PC]>, DwarfRegNum<[74]>;
+ def C11_10: Rcc<10, "c11:10", [UGP, GP]>, DwarfRegNum<[76]>;
+ def CS: Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>, DwarfRegNum<[78]>;
+ def UPCYCLE: Rcc<14, "c15:14", [UPCYCLELO, UPCYCLEHI]>, DwarfRegNum<[80]>;
+ def C17_16: Rcc<16, "c17:16", [FRAMELIMIT, FRAMEKEY]>, DwarfRegNum<[83]>;
+ def PKTCOUNT: Rcc<18, "c19:18", [PKTCOUNTLO, PKTCOUNTHI], ["pktcount"]>,
+ DwarfRegNum<[85]>;
+ def UTIMER: Rcc<30, "c31:30", [UTIMERLO, UTIMERHI], ["utimer"]>,
+ DwarfRegNum<[97]>;
}
foreach i = 0-31 in {
@@ -219,6 +226,10 @@ def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
}
// Registers are listed in reverse order for allocation preference reasons.
+def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
+ (add R23, R22, R21, R20, R19, R18, R17,
+ R16, R7, R6, R5, R4, R3, R2, R1, R0)>;
+
def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
(add R7, R6, R5, R4, R3, R2, R1, R0)> ;
@@ -226,6 +237,10 @@ def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
(add (sequence "D%u", 0, 4),
(sequence "D%u", 6, 13), D5, D14, D15)>;
+def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
+ (add D11, D10, D9, D8, D3, D2, D1,
+ D0)>;
+
def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512,
(add (sequence "V%u", 0, 31))>;
@@ -259,28 +274,28 @@ def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>;
let Size = 32, isAllocatable = 0 in
def CtrRegs : RegisterClass<"Hexagon", [i32], 32,
- (add LC0, SA0, LC1, SA1,
- P3_0, C5,
- M0, M1, C6, C7, C8, CS0, CS1, UPCL, UPCH,
- USR, UGP, GP, PC)>;
+ (add LC0, SA0, LC1, SA1, P3_0, C5, C8, PC, UGP, GP, CS0, CS1,
+ UPCYCLELO, UPCYCLEHI,
+ FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI,
+ M0, M1, USR)>;
let isAllocatable = 0 in
def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>;
let Size = 64, isAllocatable = 0 in
def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
- (add C1_0, C3_2, C7_6, C9_8, C11_10, CS, UPC)>;
-
-def VolatileV3 {
- list<Register> Regs = [D0, D1, D2, D3, D4, D5, D6, D7,
- R28, R31,
- P0, P1, P2, P3,
- M0, M1,
- LC0, LC1, SA0, SA1, USR, USR_OVF, CS0, CS1,
- V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11,
- V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
- V22, V23, V24, V25, V26, V27, V28, V29, V30, V31,
- W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11,
- W12, W13, W14, W15,
- Q0, Q1, Q2, Q3];
-}
+ (add C1_0, C3_2, C5_4, C7_6, C9_8, C11_10, CS, UPCYCLE, C17_16,
+ PKTCOUNT, UTIMER)>;
+
+// These registers are new for v62 and onward.
+// The function RegisterMatchesArch() uses this list for validation.
+let isAllocatable = 0 in
+def V62Regs : RegisterClass<"Hexagon", [i32], 32,
+ (add FRAMELIMIT, FRAMEKEY, C17_16,
+ PKTCOUNTLO, PKTCOUNTHI, PKTCOUNT,
+ UTIMERLO, UTIMERHI, UTIMER)>;
+
+
+def HexagonCSR
+ : CalleeSavedRegs<(add R16, R17, R18, R19, R20, R21, R22, R23,
+ R24, R25, R26, R27)>;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
index 6e4987b..ffee03e7 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -7,6 +7,55 @@
//
//===----------------------------------------------------------------------===//
+def Hex_FWD : Bypass;
+def HVX_FWD : Bypass;
+
+// Functional Units.
+def SLOT0 : FuncUnit;
+def SLOT1 : FuncUnit;
+def SLOT2 : FuncUnit;
+def SLOT3 : FuncUnit;
+// Endloop is a pseudo instruction that is encoded with 2 bits in a packet
+// rather than taking an execution slot. This special unit is needed
+// to schedule an ENDLOOP with 4 other instructions.
+def SLOT_ENDLOOP: FuncUnit;
+
+// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec".
+def CVI_ST : FuncUnit;
+def CVI_XLANE : FuncUnit;
+def CVI_SHIFT : FuncUnit;
+def CVI_MPY0 : FuncUnit;
+def CVI_MPY1 : FuncUnit;
+def CVI_LD : FuncUnit;
+
+// Combined functional units.
+def CVI_XLSHF : FuncUnit;
+def CVI_MPY01 : FuncUnit;
+def CVI_ALL : FuncUnit;
+def CVI_ALL_NOMEM : FuncUnit;
+
+// Combined functional unit data.
+def HexagonComboFuncsV60 :
+ ComboFuncUnits<[
+ ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>,
+ ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>,
+ ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT,
+ CVI_MPY0, CVI_MPY1, CVI_LD]>,
+ ComboFuncData<CVI_ALL_NOMEM, [CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1]>
+ ]>;
+
+// Itinerary classes.
+def PSEUDO : InstrItinClass;
+def PSEUDOM : InstrItinClass;
+def DUPLEX : InstrItinClass;
+def tc_ENDLOOP : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Auto-generated itinerary classes
+//===----------------------------------------------------------------------===//
+include "HexagonDepIICScalar.td"
+include "HexagonDepIICHVX.td"
+
//===----------------------------------------------------------------------===//
// V4 Machine Info +
//===----------------------------------------------------------------------===//
@@ -20,5 +69,13 @@ include "HexagonScheduleV55.td"
// V60 Machine Info -
//===----------------------------------------------------------------------===//
+include "HexagonIICScalar.td"
+include "HexagonIICHVX.td"
include "HexagonScheduleV60.td"
+//===----------------------------------------------------------------------===//
+// V62 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV62.td"
+
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
index 7416baa..69b704a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -7,188 +7,31 @@
//
//===----------------------------------------------------------------------===//
-// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine.
-// This file describes that machine information.
-
-//
-// |===========|==================================================|
-// | PIPELINE | Instruction Classes |
-// |===========|==================================================|
-// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM |
-// |-----------|--------------------------------------------------|
-// | SLOT1 | LD ST ALU32 |
-// |-----------|--------------------------------------------------|
-// | SLOT2 | XTYPE ALU32 J JR |
-// |-----------|--------------------------------------------------|
-// | SLOT3 | XTYPE ALU32 J CR |
-// |===========|==================================================|
-
-// Functional Units.
-def SLOT0 : FuncUnit;
-def SLOT1 : FuncUnit;
-def SLOT2 : FuncUnit;
-def SLOT3 : FuncUnit;
-// Endloop is a pseudo instruction that is encoded with 2 bits in a packet
-// rather than taking an execution slot. This special unit is needed
-// to schedule an ENDLOOP with 4 other instructions.
-def SLOT_ENDLOOP: FuncUnit;
-
-// Itinerary classes.
-def PSEUDO : InstrItinClass;
-def PSEUDOM : InstrItinClass;
-// ALU64/M/S Instruction classes of V2 are collectively known as XTYPE in V4.
-def DUPLEX : InstrItinClass;
-def PREFIX : InstrItinClass;
-def COMPOUND_CJ_ARCHDEPSLOT : InstrItinClass;
-def COMPOUND : InstrItinClass;
+def LD_tc_ld_SLOT01 : InstrItinClass;
+def ST_tc_st_SLOT01 : InstrItinClass;
+
+class HexagonV4PseudoItin {
+ list<InstrItinData> V4PseudoItin_list = [
+ InstrItinData<PSEUDO, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData<DUPLEX, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData<tc_ENDLOOP, [InstrStage<1, [SLOT_ENDLOOP]>]>
+ ];
+}
-def ALU32_2op_tc_1_SLOT0123 : InstrItinClass;
-def ALU32_2op_tc_2early_SLOT0123 : InstrItinClass;
-def ALU32_3op_tc_2early_SLOT0123 : InstrItinClass;
-def ALU32_3op_tc_1_SLOT0123 : InstrItinClass;
-def ALU32_3op_tc_2_SLOT0123 : InstrItinClass;
-def ALU32_ADDI_tc_1_SLOT0123 : InstrItinClass;
-def ALU64_tc_1_SLOT23 : InstrItinClass;
-def ALU64_tc_2_SLOT23 : InstrItinClass;
-def ALU64_tc_2early_SLOT23 : InstrItinClass;
-def ALU64_tc_3x_SLOT23 : InstrItinClass;
-def CR_tc_2_SLOT3 : InstrItinClass;
-def CR_tc_2early_SLOT23 : InstrItinClass;
-def CR_tc_2early_SLOT3 : InstrItinClass;
-def CR_tc_3x_SLOT23 : InstrItinClass;
-def CR_tc_3x_SLOT3 : InstrItinClass;
-def J_tc_2early_SLOT23 : InstrItinClass;
-def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT : InstrItinClass;
-def J_tc_2early_SLOT2 : InstrItinClass;
-def LD_tc_ld_SLOT01 : InstrItinClass;
-def LD_tc_ld_SLOT0 : InstrItinClass;
-def LD_tc_3or4stall_SLOT0 : InstrItinClass;
-def M_tc_2_SLOT23 : InstrItinClass;
-def M_tc_3_SLOT23 : InstrItinClass;
-def M_tc_1_SLOT23 : InstrItinClass;
-def M_tc_3x_SLOT23 : InstrItinClass;
-def M_tc_3or4x_SLOT23 : InstrItinClass;
-def ST_tc_st_SLOT01 : InstrItinClass;
-def ST_tc_st_SLOT0 : InstrItinClass;
-def ST_tc_ld_SLOT0 : InstrItinClass;
-def ST_tc_3stall_SLOT0 : InstrItinClass;
-def S_2op_tc_1_SLOT23 : InstrItinClass;
-def S_2op_tc_2_SLOT23 : InstrItinClass;
-def S_2op_tc_2early_SLOT23 : InstrItinClass;
-def S_2op_tc_3or4x_SLOT23 : InstrItinClass;
-def S_3op_tc_1_SLOT23 : InstrItinClass;
-def S_3op_tc_2_SLOT23 : InstrItinClass;
-def S_3op_tc_2early_SLOT23 : InstrItinClass;
-def S_3op_tc_3_SLOT23 : InstrItinClass;
-def S_3op_tc_3x_SLOT23 : InstrItinClass;
-def NCJ_tc_3or4stall_SLOT0 : InstrItinClass;
-def V2LDST_tc_ld_SLOT01 : InstrItinClass;
-def V2LDST_tc_st_SLOT0 : InstrItinClass;
-def V2LDST_tc_st_SLOT01 : InstrItinClass;
-def V4LDST_tc_ld_SLOT01 : InstrItinClass;
-def V4LDST_tc_st_SLOT0 : InstrItinClass;
-def V4LDST_tc_st_SLOT01 : InstrItinClass;
-def J_tc_2early_SLOT0123 : InstrItinClass;
-def EXTENDER_tc_1_SLOT0123 : InstrItinClass;
-def S_3op_tc_3stall_SLOT23 : InstrItinClass;
+def HexagonV4ItinList : DepScalarItinV4, HexagonV4PseudoItin {
+ list<InstrItinData> V4Itin_list = [
+ InstrItinData<LD_tc_ld_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData<ST_tc_st_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>
+ ];
+ list<InstrItinData> ItinList =
+ !listconcat(V4Itin_list, DepScalarItinV4_list, V4PseudoItin_list);
+}
def HexagonItinerariesV4 :
- ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
- // ALU32
- InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_2op_tc_2early_SLOT0123,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_3op_tc_2early_SLOT0123,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-
- // ALU64
- InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
-
- // CR -> System
- InstrItinData<CR_tc_2_SLOT3 , [InstrStage<1, [SLOT3]>]>,
- InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>]>,
- InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<1, [SLOT3]>]>,
-
- // Jump (conditional/unconditional/return etc)
- // CR
- InstrItinData<CR_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- // J
- InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
- // JR
- InstrItinData<J_tc_2early_SLOT2 , [InstrStage<1, [SLOT2]>]>,
-
- //Load
- InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<LD_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
-
- // M
- InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
-
- // Store
- // ST
- InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- // ST0
- InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>]>,
-
- // S
- InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3stall_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
-
- // SYS
- InstrItinData<ST_tc_3stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
-
- // New Value Compare Jump
- InstrItinData<NCJ_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
-
- // Mem ops - MEM_V4
- InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
-
- InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
-
- // ENDLOOP
- InstrItinData<J_tc_2early_SLOT0123 , [InstrStage<1, [SLOT_ENDLOOP]>]>,
-
- // Extender/PREFIX
- InstrItinData<EXTENDER_tc_1_SLOT0123,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-
- InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [SLOT2, SLOT3]>]>
- ]>;
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP],
+ [Hex_FWD], HexagonV4ItinList.ItinList>;
def HexagonModelV4 : SchedMachineModel {
// Max issue per cycle == bundle width.
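A note on the restructuring visible in this and the following schedule files:
each CPU's ProcessorItineraries is now assembled by !listconcat from shared,
auto-generated lists (the DepScalarItin*/DepHVXItin* classes) plus a small
hand-written remainder, instead of one monolithic table per CPU. A rough C++
analogue of that composition (toy types, not LLVM's representation):

    #include <string>
    #include <vector>

    struct ItinData { std::string Class; int Cycles; };

    // Stand-in for an auto-generated list such as DepScalarItinV4_list.
    std::vector<ItinData> depScalarItinV4() {
      return {{"tc_1_SLOT0123", 1}, {"tc_3x_SLOT23", 3}};
    }

    // Stand-in for the hand-written V4Itin_list above.
    std::vector<ItinData> v4Extras() {
      return {{"LD_tc_ld_SLOT01", 1}, {"ST_tc_st_SLOT01", 1}};
    }

    // The !listconcat analogue: the final table is plain concatenation.
    std::vector<ItinData> hexagonV4ItinList() {
      std::vector<ItinData> All = v4Extras();
      std::vector<ItinData> Dep = depScalarItinV4();
      All.insert(All.end(), Dep.begin(), Dep.end());
      return All;
    }

    int main() { return hexagonV4ItinList().size() == 4 ? 0 : 1; }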
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
index b2a75f7..ca738be 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -1,4 +1,4 @@
-//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=//
+//=-HexagonScheduleV55.td - HexagonV55 Scheduling Definitions -*- tablegen -*=//
//
// The LLVM Compiler Infrastructure
//
@@ -7,179 +7,33 @@
//
//===----------------------------------------------------------------------===//
-// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine.
-// This file describes that machine information.
-//
-// |===========|==================================================|
-// | PIPELINE | Instruction Classes |
-// |===========|==================================================|
-// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM |
-// |-----------|--------------------------------------------------|
-// | SLOT1 | LD ST ALU32 |
-// |-----------|--------------------------------------------------|
-// | SLOT2 | XTYPE ALU32 J JR |
-// |-----------|--------------------------------------------------|
-// | SLOT3 | XTYPE ALU32 J CR |
-// |===========|==================================================|
+class HexagonV55PseudoItin {
+ list<InstrItinData> V55PseudoItin_list = [
+ InstrItinData<PSEUDO, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+ [1, 1, 1]>,
+ InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>,
+ InstrItinData<DUPLEX, [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+ InstrItinData<tc_ENDLOOP, [InstrStage<1, [SLOT_ENDLOOP]>], [2]>
+ ];
+}
-def CJ_tc_1_SLOT23 : InstrItinClass;
-def CJ_tc_2early_SLOT23 : InstrItinClass;
-def COPROC_VMEM_vtc_long_SLOT01 : InstrItinClass;
-def COPROC_VX_vtc_long_SLOT23 : InstrItinClass;
-def COPROC_VX_vtc_SLOT23 : InstrItinClass;
-def J_tc_3stall_SLOT2 : InstrItinClass;
-def MAPPING_tc_1_SLOT0123 : InstrItinClass;
-def M_tc_3stall_SLOT23 : InstrItinClass;
+def HexagonV55ItinList : DepScalarItinV55,
+ HexagonV55PseudoItin {
+ list<InstrItinData> V55Itin_list = [
+ InstrItinData<LD_tc_ld_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>], [2, 1]>,
+ InstrItinData<ST_tc_st_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>],
+ [1, 1, 1]>
+ ];
+ list<InstrItinData> ItinList =
+ !listconcat(V55Itin_list, DepScalarItinV55_list,
+ V55PseudoItin_list);
+}
def HexagonItinerariesV55 :
- ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
- // ALU32
- InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
- InstrItinData<ALU32_2op_tc_2early_SLOT0123,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
- InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
- InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
- InstrItinData<ALU32_3op_tc_2early_SLOT0123,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
- InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
-
- // ALU64
- InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1]>,
- InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1]>,
- InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
-
- // CR -> System
- InstrItinData<CR_tc_2_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
- InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
- InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<1, [SLOT3]>], [3, 1, 1]>,
-
- // Jump (conditional/unconditional/return etc)
- InstrItinData<CR_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1, 1]>,
- InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1, 1]>,
- InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1, 1]>,
- InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1, 1]>,
- InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1, 1]>,
- InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT,
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 1, 1, 1]>,
-
- // JR
- InstrItinData<J_tc_2early_SLOT2 , [InstrStage<1, [SLOT2]>], [2, 1, 1]>,
- InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<1, [SLOT2]>], [3, 1, 1]>,
-
- // Extender
- InstrItinData<EXTENDER_tc_1_SLOT0123,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
-
- // Load
- InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
- [2, 1]>,
- InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1]>,
- InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1]>,
-
- // M
- InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<M_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1]>,
- InstrItinData<M_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<M_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
- InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
- InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
-
- // Store
- InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
- [1, 1, 1]>,
- InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
- InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
- InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
-
- // S
- InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1]>,
- InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1]>,
- InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
- InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1]>,
- InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
- [2, 1, 1]>,
- InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
- InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
- InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
- [3, 1, 1]>,
-
- // New Value Compare Jump
- InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>],
- [3, 1, 1, 1]>,
-
- // Mem ops
- InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
- [1, 1, 1, 1]>,
- InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
- [2, 1, 1, 1]>,
- InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
- [1, 1, 1, 1]>,
- InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>],
- [1, 1, 1, 1]>,
- InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
- [3, 1, 1, 1]>,
- InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
- [1, 1, 1, 1]>,
-
- // Endloop
- InstrItinData<J_tc_2early_SLOT0123, [InstrStage<1, [SLOT_ENDLOOP]>],
- [2]>,
-
- // Vector
- InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 1]>,
- InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
- InstrItinData<COPROC_VX_vtc_SLOT23 ,
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1, 1]>,
- InstrItinData<MAPPING_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
- [1, 1, 1, 1]>,
-
- // Misc
- InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
- InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
- [1, 1, 1]>,
- InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>
- ]>;
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP],
+ [Hex_FWD], HexagonV55ItinList.ItinList>;
def HexagonModelV55 : SchedMachineModel {
// Max issue per cycle == bundle width.
@@ -190,5 +44,5 @@ def HexagonModelV55 : SchedMachineModel {
}
//===----------------------------------------------------------------------===//
-// Hexagon V4 Resource Definitions -
+// Hexagon V55 Resource Definitions -
//===----------------------------------------------------------------------===//
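The bracketed lists that now follow V55 entries, e.g. [2, 1] on
LD_tc_ld_SLOT01, are per-operand pipeline cycles: the first number is the
cycle in which the result is defined, the rest are the cycles in which source
operands are read. Def-to-use latencies are derived from these; a minimal
sketch of that computation, ignoring forwarding adjustments (it mirrors the
general shape of InstrItineraryData::getOperandLatency, not its exact code):

    #include <vector>

    // Per-class operand cycles: element 0 is the def (result) cycle, the
    // rest are use (source) cycles; LD_tc_ld_SLOT01 above carries {2, 1}.
    std::vector<int> LD_tc_ld_SLOT01 = {2, 1};
    std::vector<int> ST_tc_st_SLOT01 = {1, 1, 1};

    // Result ready at DefCycle, consumed at UseCycle, so the def-to-use
    // latency is DefCycle - UseCycle + 1 (clamped to at least 1 here).
    int operandLatency(const std::vector<int> &DefClass, unsigned DefIdx,
                       const std::vector<int> &UseClass, unsigned UseIdx) {
      int Lat = DefClass[DefIdx] - UseClass[UseIdx] + 1;
      return Lat > 0 ? Lat : 1;
    }

    int main() {
      // A load result (def cycle 2) feeding a store's data operand (use
      // cycle 1) yields a latency of 2 under this model.
      return operandLatency(LD_tc_ld_SLOT01, 0, ST_tc_st_SLOT01, 2) == 2 ? 0
                                                                         : 1;
    }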
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
index dc2ce43..a2544c9 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -7,56 +7,6 @@
//
//===----------------------------------------------------------------------===//
-// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec".
-def CVI_ST : FuncUnit;
-def CVI_XLANE : FuncUnit;
-def CVI_SHIFT : FuncUnit;
-def CVI_MPY0 : FuncUnit;
-def CVI_MPY1 : FuncUnit;
-def CVI_LD : FuncUnit;
-
-// Combined functional units.
-def CVI_XLSHF : FuncUnit;
-def CVI_MPY01 : FuncUnit;
-def CVI_ALL : FuncUnit;
-
-// Combined functional unit data.
-def HexagonComboFuncsV60 :
- ComboFuncUnits<[
- ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>,
- ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>,
- ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT,
- CVI_MPY0, CVI_MPY1, CVI_LD]>
- ]>;
-
-// Note: When adding additional vector scheduling classes, add the
-// corresponding methods to the class HexagonInstrInfo.
-def CVI_VA : InstrItinClass;
-def CVI_VA_DV : InstrItinClass;
-def CVI_VX_LONG : InstrItinClass;
-def CVI_VX_LATE : InstrItinClass;
-def CVI_VX : InstrItinClass;
-def CVI_VX_DV_LONG : InstrItinClass;
-def CVI_VX_DV : InstrItinClass;
-def CVI_VX_DV_SLOT2 : InstrItinClass;
-def CVI_VP : InstrItinClass;
-def CVI_VP_LONG : InstrItinClass;
-def CVI_VP_VS_EARLY : InstrItinClass;
-def CVI_VP_VS_LONG_EARLY : InstrItinClass;
-def CVI_VP_VS_LONG : InstrItinClass;
-def CVI_VP_VS : InstrItinClass;
-def CVI_VP_DV : InstrItinClass;
-def CVI_VS : InstrItinClass;
-def CVI_VINLANESAT : InstrItinClass;
-def CVI_VM_LD : InstrItinClass;
-def CVI_VM_TMP_LD : InstrItinClass;
-def CVI_VM_CUR_LD : InstrItinClass;
-def CVI_VM_VP_LDU : InstrItinClass;
-def CVI_VM_ST : InstrItinClass;
-def CVI_VM_NEW_ST : InstrItinClass;
-def CVI_VM_STU : InstrItinClass;
-def CVI_HIST : InstrItinClass;
-def CVI_VA_EXT : InstrItinClass;
// There are four SLOTS (four parallel pipelines) in Hexagon V60 machine.
// This file describes that machine information.
@@ -103,190 +53,20 @@ def CVI_VA_EXT : InstrItinClass;
// S0123| CVI_VA_EXT Extract |
// |=====================================================================|
+def HexagonV60ItinList : DepScalarItinV60, ScalarItin,
+ DepHVXItinV60,
+ HVXItin, PseudoItin {
+ list<InstrItinData> ItinList =
+ !listconcat(DepScalarItinV60_list, ScalarItin_list,
+ DepHVXItinV60_list, HVXItin_list, PseudoItin_list);
+}
+
def HexagonItinerariesV60 :
ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
- CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL], [], [
- // ALU32
- InstrItinData<ALU32_2op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_2op_tc_2early_SLOT0123,
- [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_3op_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_3op_tc_2_SLOT0123 ,
- [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_3op_tc_2early_SLOT0123,
- [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<ALU32_ADDI_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-
- // ALU64
- InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
-
- // CR -> System
- InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>,
- InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>,
- InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>,
-
- // Jump (conditional/unconditional/return etc)
- InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
-
- // JR
- InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>,
- InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>,
-
- // Extender
- InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1,
- [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-
- // Load
- InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
- InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
- InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
-
- // M
- InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
- InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
-
- // Store
- InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
- InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>,
- InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
-
- // S
- InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60.
- InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>,
-
- // New Value Compare Jump
- InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
-
- // Mem ops
- InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>,
- InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>,
- InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>,
-
- // Endloop
- InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>,
-
- // Vector
- InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
- [InstrStage<3, [SLOT0, SLOT1]>]>,
- InstrItinData<COPROC_VX_vtc_long_SLOT23 ,
- [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<COPROC_VX_vtc_SLOT23 ,
- [InstrStage<3, [SLOT2, SLOT3]>]>,
- InstrItinData<MAPPING_tc_1_SLOT0123 ,
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-
- // Duplex and Compound
- InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
- InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
- // Misc
- InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData<PSEUDOM , [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [SLOT2, SLOT3]>]>,
-
- // Latest CVI spec definitions.
- InstrItinData<CVI_VA,[InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLANE,CVI_SHIFT,
- CVI_MPY0, CVI_MPY1]>]>,
- InstrItinData<CVI_VA_DV,
- [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLSHF, CVI_MPY01]>]>,
- InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
- InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
- InstrItinData<CVI_VX,[InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>,
- InstrItinData<CVI_VX_DV_LONG,
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_MPY01]>]>,
- InstrItinData<CVI_VX_DV,
- [InstrStage<1, [SLOT2, SLOT3], 0>,
- InstrStage<1, [CVI_MPY01]>]>,
- InstrItinData<CVI_VX_DV_SLOT2,
- [InstrStage<1, [SLOT2], 0>,
- InstrStage<1, [CVI_MPY01]>]>,
- InstrItinData<CVI_VP, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLANE]>]>,
- InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLANE]>]>,
- InstrItinData<CVI_VP_VS_EARLY,
- [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLSHF]>]>,
- InstrItinData<CVI_VP_VS_LONG,
- [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLSHF]>]>,
- InstrItinData<CVI_VP_VS,
- [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLSHF]>]>,
- InstrItinData<CVI_VP_VS_LONG_EARLY,
- [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLSHF]>]>,
- InstrItinData<CVI_VP_DV , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_XLSHF]>]>,
- InstrItinData<CVI_VS,
- [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_SHIFT]>]>,
- InstrItinData<CVI_VINLANESAT,
- [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_SHIFT]>]>,
- InstrItinData<CVI_VM_LD , [InstrStage<1, [SLOT0, SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_XLANE, CVI_SHIFT,
- CVI_MPY0, CVI_MPY1]>]>,
- InstrItinData<CVI_VM_TMP_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
- InstrStage<1, [CVI_LD]>]>,
- InstrItinData<CVI_VM_CUR_LD,[InstrStage<1,[SLOT0, SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_XLANE, CVI_SHIFT,
- CVI_MPY0, CVI_MPY1]>]>,
- InstrItinData<CVI_VM_VP_LDU,[InstrStage<1,[SLOT0], 0>,
- InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_LD], 0>,
- InstrStage<1, [CVI_XLANE]>]>,
- InstrItinData<CVI_VM_ST , [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [CVI_ST], 0>,
- InstrStage<1, [CVI_XLANE, CVI_SHIFT,
- CVI_MPY0, CVI_MPY1]>]>,
- InstrItinData<CVI_VM_NEW_ST,[InstrStage<1,[SLOT0], 0>,
- InstrStage<1, [CVI_ST]>]>,
- InstrItinData<CVI_VM_STU , [InstrStage<1, [SLOT0], 0>,
- InstrStage<1, [SLOT1], 0>,
- InstrStage<1, [CVI_ST], 0>,
- InstrStage<1, [CVI_XLANE]>]>,
- InstrItinData<CVI_HIST , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
- InstrStage<1, [CVI_ALL]>]>
- ]>;
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+ CVI_ALL_NOMEM],
+ [Hex_FWD, HVX_FWD], HexagonV60ItinList.ItinList>;
def HexagonModelV60 : SchedMachineModel {
// Max issue per cycle == bundle width.
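The new third argument to ProcessorItineraries, [Hex_FWD, HVX_FWD], names
forwarding (bypass) networks: when a producer's def and a consumer's use agree
on a bypass, the modeled latency shrinks by one cycle. Extending the toy model
from the V55 note above (an assumption about the mechanism's shape, not LLVM's
exact code):

    // Toy forwarding adjustment: if def and use share a bypass network
    // (0 meaning none), the value arrives a cycle earlier than a register
    // read would deliver it.
    int operandLatencyFwd(int DefCycle, int UseCycle, int DefFwd, int UseFwd) {
      if (DefFwd != 0 && DefFwd == UseFwd)
        ++UseCycle;                 // One cycle saved by the bypass.
      int Lat = DefCycle - UseCycle + 1;
      return Lat > 0 ? Lat : 1;
    }

    int main() {
      // Def cycle 2, use cycle 1: latency 2 without a shared bypass, 1 when
      // both sit on the same assumed network (id 1 standing in for Hex_FWD).
      return (operandLatencyFwd(2, 1, 1, 2) == 2 &&
              operandLatencyFwd(2, 1, 1, 1) == 1) ? 0 : 1;
    }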
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV62.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV62.td
new file mode 100644
index 0000000..a0a8595
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV62.td
@@ -0,0 +1,37 @@
+//=-HexagonScheduleV62.td - HexagonV62 Scheduling Definitions *- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// ScalarItin contains some old itineraries still used by a
+// handful of instructions. Hopefully, we will be able to get rid of them soon.
+
+def HexagonV62ItinList : DepScalarItinV62, ScalarItin,
+ DepHVXItinV62, HVXItin, PseudoItin {
+ list<InstrItinData> ItinList =
+ !listconcat(DepScalarItinV62_list, ScalarItin_list,
+ DepHVXItinV62_list, HVXItin_list, PseudoItin_list);
+}
+
+def HexagonItinerariesV62 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+ CVI_ALL_NOMEM],
+ [Hex_FWD, HVX_FWD], HexagonV62ItinList.ItinList>;
+
+def HexagonModelV62 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV62;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V62 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 1073053..002e87f 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -51,11 +51,12 @@ SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getTargetExternalSymbol(SpecialMemcpyName,
- TLI.getPointerTy(DAG.getDataLayout()), Flags),
- std::move(Args))
+ .setLibCallee(
+ TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getTargetExternalSymbol(
+ SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout()), Flags),
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
index 2c93721..4fa929a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -13,8 +13,8 @@
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -131,13 +131,15 @@ namespace {
INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double",
"Hexagon Split Double Registers", false, false)
-void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os,
const USet &Part, const TargetRegisterInfo &TRI) {
dbgs() << '{';
for (auto I : Part)
dbgs() << ' ' << PrintReg(I, &TRI);
dbgs() << " }";
}
+#endif
bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const {
for (auto I : IRM) {
@@ -348,6 +350,8 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
MI->getOperand(2).getImm());
case Hexagon::A4_combineri:
ImmX++;
+ // Fall through into A4_combineir.
+ LLVM_FALLTHROUGH;
case Hexagon::A4_combineir: {
ImmX++;
int64_t V = MI->getOperand(ImmX).getImm();
@@ -391,7 +395,7 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
const {
- unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0;
+ unsigned FixedNum = 0, LoopPhiNum = 0;
int32_t TotalP = 0;
for (unsigned DR : Part) {
@@ -428,7 +432,6 @@ bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
LoopPhiNum++;
}
// Splittable instruction.
- SplitNum++;
int32_t P = profit(UseI);
if (P == std::numeric_limits<int>::min())
return false;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 8c23a24..0aada8a 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -1,4 +1,4 @@
-//===-- HexagonSubtarget.cpp - Hexagon Subtarget Information --------------===//
+//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,13 +11,23 @@
//
//===----------------------------------------------------------------------===//
-#include "HexagonSubtarget.h"
#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
#include <map>
using namespace llvm;
@@ -73,6 +83,10 @@ static cl::opt<bool> OverrideLongCalls("hexagon-long-calls",
cl::Hidden, cl::ZeroOrMore, cl::init(false),
cl::desc("If present, forces/disables the use of long calls"));
+static cl::opt<bool> EnablePredicatedCalls("hexagon-pred-calls",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Consider calls to be predicable"));
+
void HexagonSubtarget::initializeEnvironment() {
UseMemOps = false;
ModeIEEERndNear = false;
@@ -88,6 +102,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
{ "hexagonv5", V5 },
{ "hexagonv55", V55 },
{ "hexagonv60", V60 },
+ { "hexagonv62", V62 },
};
auto foundIt = CpuTable.find(CPUString);
@@ -114,9 +129,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
StringRef FS, const TargetMachine &TM)
: HexagonGenSubtargetInfo(TT, CPU, FS), CPUString(CPU),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- FrameLowering() {
-
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {
initializeEnvironment();
// Initialize scheduling itinerary for the specified CPU.
@@ -138,6 +151,58 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
UseBSBScheduling = hasV60TOps() && EnableBSBSched;
}
+/// \brief Perform target specific adjustments to the latency of a schedule
+/// dependency.
+void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
+ SDep &Dep) const {
+ MachineInstr *SrcInst = Src->getInstr();
+ MachineInstr *DstInst = Dst->getInstr();
+ if (!Src->isInstr() || !Dst->isInstr())
+ return;
+
+ const HexagonInstrInfo *QII = getInstrInfo();
+
+ // Instructions with .new operands have zero latency.
+ SmallSet<SUnit *, 4> ExclSrc;
+ SmallSet<SUnit *, 4> ExclDst;
+ if (QII->canExecuteInBundle(*SrcInst, *DstInst) &&
+ isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) {
+ Dep.setLatency(0);
+ return;
+ }
+
+ if (!hasV60TOps())
+ return;
+
+ // If it's a REG_SEQUENCE, use its destination instruction to determine
+ // the correct latency.
+ if (DstInst->isRegSequence() && Dst->NumSuccs == 1) {
+ unsigned RSeqReg = DstInst->getOperand(0).getReg();
+ MachineInstr *RSeqDst = Dst->Succs[0].getSUnit()->getInstr();
+ unsigned UseIdx = -1;
+ for (unsigned OpNum = 0; OpNum < RSeqDst->getNumOperands(); OpNum++) {
+ const MachineOperand &MO = RSeqDst->getOperand(OpNum);
+ if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == RSeqReg) {
+ UseIdx = OpNum;
+ break;
+ }
+ }
+ unsigned RSeqLatency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst,
+ 0, *RSeqDst, UseIdx));
+ Dep.setLatency(RSeqLatency);
+ }
+
+ // Try to schedule uses near definitions to generate .cur.
+ ExclSrc.clear();
+ ExclDst.clear();
+ if (EnableDotCurSched && QII->isToBeScheduledASAP(*SrcInst, *DstInst) &&
+ isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) {
+ Dep.setLatency(0);
+ return;
+ }
+
+ updateLatency(*SrcInst, *DstInst, Dep);
+}
void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
for (auto &SU : DAG->SUnits) {
@@ -153,19 +218,19 @@ void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
for (auto &SU : DAG->SUnits) {
// Update the latency of chain edges between v60 vector load or store
- // instructions to be 1. These instructions cannot be scheduled in the
+ // instructions to be 1. These instructions cannot be scheduled in the
// same packet.
MachineInstr &MI1 = *SU.getInstr();
auto *QII = static_cast<const HexagonInstrInfo*>(DAG->TII);
bool IsStoreMI1 = MI1.mayStore();
bool IsLoadMI1 = MI1.mayLoad();
- if (!QII->isV60VectorInstruction(MI1) || !(IsStoreMI1 || IsLoadMI1))
+ if (!QII->isHVXVec(MI1) || !(IsStoreMI1 || IsLoadMI1))
continue;
for (auto &SI : SU.Succs) {
if (SI.getKind() != SDep::Order || SI.getLatency() != 0)
continue;
MachineInstr &MI2 = *SI.getSUnit()->getInstr();
- if (!QII->isV60VectorInstruction(MI2))
+ if (!QII->isHVXVec(MI2))
continue;
if ((IsStoreMI1 && MI2.mayStore()) || (IsLoadMI1 && MI2.mayLoad())) {
SI.setLatency(1);
@@ -182,18 +247,18 @@ void HexagonSubtarget::HexagonDAGMutation::apply(ScheduleDAGInstrs *DAG) {
}
}
-
void HexagonSubtarget::getPostRAMutations(
- std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(
+ llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>());
}
void HexagonSubtarget::getSMSMutations(
- std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(make_unique<HexagonSubtarget::HexagonDAGMutation>());
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
+ Mutations.push_back(
+ llvm::make_unique<HexagonSubtarget::HexagonDAGMutation>());
}
-
// Pin the vtable to this file.
void HexagonSubtarget::anchor() {}
@@ -203,69 +268,99 @@ bool HexagonSubtarget::enableMachineScheduler() const {
return true;
}
-bool HexagonSubtarget::enableSubRegLiveness() const {
- return EnableSubregLiveness;
+bool HexagonSubtarget::usePredicatedCalls() const {
+ return EnablePredicatedCalls;
}
-// This helper function is responsible for increasing the latency only.
void HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
MachineInstr &DstInst, SDep &Dep) const {
+ if (Dep.isArtificial()) {
+ Dep.setLatency(1);
+ return;
+ }
+
if (!hasV60TOps())
return;
auto &QII = static_cast<const HexagonInstrInfo&>(*getInstrInfo());
- if (EnableVecFrwdSched && QII.addLatencyToSchedule(SrcInst, DstInst)) {
- // Vec frwd scheduling.
- Dep.setLatency(Dep.getLatency() + 1);
- } else if (useBSBScheduling() &&
- QII.isLateInstrFeedsEarlyInstr(SrcInst, DstInst)) {
- // BSB scheduling.
- Dep.setLatency(Dep.getLatency() + 1);
- } else if (EnableTCLatencySched) {
- // TClass latency scheduling.
- // Check if SrcInst produces in 2C an operand of DstInst taken in stage 2B.
- if (QII.isTC1(SrcInst) || QII.isTC2(SrcInst))
- if (!QII.isTC1(DstInst) && !QII.isTC2(DstInst))
- Dep.setLatency(Dep.getLatency() + 1);
- }
+ // BSB scheduling.
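+ // Halving rounds up: (Lat + 1) >> 1 maps 3 -> 2 and 4 -> 2.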
+ if (QII.isHVXVec(SrcInst) || useBSBScheduling())
+ Dep.setLatency((Dep.getLatency() + 1) >> 1);
}
-/// If the SUnit has a zero latency edge, return the other SUnit.
-static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
- for (auto &I : Deps)
- if (I.isAssignedRegDep() && I.getLatency() == 0 &&
- !I.getSUnit()->getInstr()->isPseudo())
- return I.getSUnit();
- return nullptr;
+void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
+ MachineInstr *SrcI = Src->getInstr();
+ for (auto &I : Src->Succs) {
+ if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
+ continue;
+ unsigned DepR = I.getReg();
+ int DefIdx = -1;
+ for (unsigned OpNum = 0; OpNum < SrcI->getNumOperands(); OpNum++) {
+ const MachineOperand &MO = SrcI->getOperand(OpNum);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == DepR)
+ DefIdx = OpNum;
+ }
+ assert(DefIdx >= 0 && "Def Reg not found in Src MI");
+ MachineInstr *DstI = Dst->getInstr();
+ for (unsigned OpNum = 0; OpNum < DstI->getNumOperands(); OpNum++) {
+ const MachineOperand &MO = DstI->getOperand(OpNum);
+ if (MO.isReg() && MO.isUse() && MO.getReg() == DepR) {
+ int Latency = (InstrInfo.getOperandLatency(&InstrItins, *SrcI,
+ DefIdx, *DstI, OpNum));
+
+ // For some instructions (e.g. COPY), we might end up with a latency <= 0,
+ // as they don't have any itinerary class associated with them.
+ if (Latency <= 0)
+ Latency = 1;
+
+ I.setLatency(Latency);
+ updateLatency(*SrcI, *DstI, I);
+ }
+ }
+
+ // Update the latency of the opposite edge too.
+ for (auto &J : Dst->Preds) {
+ if (J.getSUnit() != Src)
+ continue;
+ J.setLatency(I.getLatency());
+ }
+ }
}
/// Change the latency between the two SUnits.
-void HexagonSubtarget::changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps,
- SUnit *Dst, unsigned Lat) const {
- MachineInstr &SrcI = *Src->getInstr();
- for (auto &I : Deps) {
+void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
+ const {
+ for (auto &I : Src->Succs) {
if (I.getSUnit() != Dst)
continue;
+ SDep T = I;
I.setLatency(Lat);
- SUnit *UpdateDst = I.getSUnit();
- updateLatency(SrcI, *UpdateDst->getInstr(), I);
+
// Update the latency of opposite edge too.
- for (auto &PI : UpdateDst->Preds) {
- if (PI.getSUnit() != Src || !PI.isAssignedRegDep())
- continue;
- PI.setLatency(Lat);
- updateLatency(SrcI, *UpdateDst->getInstr(), PI);
- }
+ T.setSUnit(Src);
+ auto F = std::find(Dst->Preds.begin(), Dst->Preds.end(), T);
+ assert(F != Dst->Preds.end());
+ F->setLatency(I.getLatency());
}
}
+/// If the SUnit has a zero latency edge, return the other SUnit.
+static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
+ for (auto &I : Deps)
+ if (I.isAssignedRegDep() && I.getLatency() == 0 &&
+ !I.getSUnit()->getInstr()->isPseudo())
+ return I.getSUnit();
+ return nullptr;
+}
+
// Return true if these are the best two instructions to schedule
// together with a zero latency. Only one dependence should have a zero
// latency. If there are multiple choices, choose the best, and change
-// ther others, if needed.
+// the others, if needed.
bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
- const HexagonInstrInfo *TII) const {
+ const HexagonInstrInfo *TII, SmallSet<SUnit*, 4> &ExclSrc,
+ SmallSet<SUnit*, 4> &ExclDst) const {
MachineInstr &SrcInst = *Src->getInstr();
MachineInstr &DstInst = *Dst->getInstr();
@@ -276,6 +371,16 @@ bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
if (SrcInst.isPHI() || DstInst.isPHI())
return false;
+ if (!TII->isToBeScheduledASAP(SrcInst, DstInst) &&
+ !TII->canExecuteInBundle(SrcInst, DstInst))
+ return false;
+
+ // The architecture doesn't allow three dependent instructions in the same
+ // packet. So, if the destination has a zero latency successor, then it's
+ // not a candidate for a zero latency predecessor.
+ if (getZeroLatency(Dst, Dst->Succs) != nullptr)
+ return false;
+
// Check if the Dst instruction is the best candidate first.
SUnit *Best = nullptr;
SUnit *DstBest = nullptr;
@@ -289,98 +394,53 @@ bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
if (Best != Dst)
return false;
- // The caller frequents adds the same dependence twice. If so, then
+ // The caller frequently adds the same dependence twice. If so, then
// return true for this case too.
- if (Src == SrcBest && Dst == DstBest)
+ if ((Src == SrcBest && Dst == DstBest) ||
+ (SrcBest == nullptr && Dst == DstBest) ||
+ (Src == SrcBest && Dst == nullptr))
return true;
// Reassign the latency for the previous bests, which requires setting
// the dependence edge in both directions.
- if (SrcBest != nullptr)
- changeLatency(SrcBest, SrcBest->Succs, Dst, 1);
- if (DstBest != nullptr)
- changeLatency(Src, Src->Succs, DstBest, 1);
- // If there is an edge from SrcBest to DstBst, then try to change that
- // to 0 now.
- if (SrcBest && DstBest)
- changeLatency(SrcBest, SrcBest->Succs, DstBest, 0);
-
- return true;
-}
-
-// Update the latency of a Phi when the Phi bridges two instructions that
-// require a multi-cycle latency.
-void HexagonSubtarget::changePhiLatency(MachineInstr &SrcInst, SUnit *Dst,
- SDep &Dep) const {
- if (!SrcInst.isPHI() || Dst->NumPreds == 0 || Dep.getLatency() != 0)
- return;
-
- for (const SDep &PI : Dst->Preds) {
- if (PI.getLatency() != 0)
- continue;
- Dep.setLatency(2);
- break;
+ if (SrcBest != nullptr) {
+ if (!hasV60TOps())
+ changeLatency(SrcBest, Dst, 1);
+ else
+ restoreLatency(SrcBest, Dst);
}
-}
-
-/// \brief Perform target specific adjustments to the latency of a schedule
-/// dependency.
-void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
- SDep &Dep) const {
- MachineInstr *SrcInst = Src->getInstr();
- MachineInstr *DstInst = Dst->getInstr();
- if (!Src->isInstr() || !Dst->isInstr())
- return;
-
- const HexagonInstrInfo *QII = static_cast<const HexagonInstrInfo *>(getInstrInfo());
-
- // Instructions with .new operands have zero latency.
- if (QII->canExecuteInBundle(*SrcInst, *DstInst) &&
- isBestZeroLatency(Src, Dst, QII)) {
- Dep.setLatency(0);
- return;
- }
-
- if (!hasV60TOps())
- return;
-
- // Don't adjust the latency of post-increment part of the instruction.
- if (QII->isPostIncrement(*SrcInst) && Dep.isAssignedRegDep()) {
- if (SrcInst->mayStore())
- return;
- if (Dep.getReg() != SrcInst->getOperand(0).getReg())
- return;
- } else if (QII->isPostIncrement(*DstInst) && Dep.getKind() == SDep::Anti) {
- if (DstInst->mayStore())
- return;
- if (Dep.getReg() != DstInst->getOperand(0).getReg())
- return;
- } else if (QII->isPostIncrement(*DstInst) && DstInst->mayStore() &&
- Dep.isAssignedRegDep()) {
- MachineOperand &Op = DstInst->getOperand(DstInst->getNumOperands() - 1);
- if (Op.isReg() && Dep.getReg() != Op.getReg())
- return;
- }
-
- // Check if we need to change any the latency values when Phis are added.
- if (useBSBScheduling() && SrcInst->isPHI()) {
- changePhiLatency(*SrcInst, Dst, Dep);
- return;
+ if (DstBest != nullptr) {
+ if (!hasV60TOps())
+ changeLatency(Src, DstBest, 1);
+ else
+ restoreLatency(Src, DstBest);
}
- // If it's a REG_SEQUENCE, use its destination instruction to determine
- // the correct latency.
- if (DstInst->isRegSequence() && Dst->NumSuccs == 1)
- DstInst = Dst->Succs[0].getSUnit()->getInstr();
-
- // Try to schedule uses near definitions to generate .cur.
- if (EnableDotCurSched && QII->isToBeScheduledASAP(*SrcInst, *DstInst) &&
- isBestZeroLatency(Src, Dst, QII)) {
- Dep.setLatency(0);
- return;
+ // Attempt to find another opportunity for zero latency in a different
+ // dependence.
+ if (SrcBest && DstBest)
+ // If there is an edge from SrcBest to DstBest, then try to change that
+ // to 0 now.
+ changeLatency(SrcBest, DstBest, 0);
+ else if (DstBest) {
+ // Check if the previous best destination instruction has a new zero
+ // latency dependence opportunity.
+ ExclSrc.insert(Src);
+ for (auto &I : DstBest->Preds)
+ if (ExclSrc.count(I.getSUnit()) == 0 &&
+ isBestZeroLatency(I.getSUnit(), DstBest, TII, ExclSrc, ExclDst))
+ changeLatency(I.getSUnit(), DstBest, 0);
+ } else if (SrcBest) {
+ // Check if the previous best source instruction has a new zero latency
+ // dependence opportunity.
+ ExclDst.insert(Dst);
+ for (auto &I : SrcBest->Succs)
+ if (ExclDst.count(I.getSUnit()) == 0 &&
+ isBestZeroLatency(SrcBest, I.getSUnit(), TII, ExclSrc, ExclDst))
+ changeLatency(SrcBest, I.getSUnit(), 0);
}
- updateLatency(*SrcInst, *DstInst, Dep);
+ return true;
}
unsigned HexagonSubtarget::getL1CacheLineSize() const {
@@ -391,3 +451,6 @@ unsigned HexagonSubtarget::getL1PrefetchDistance() const {
return 32;
}
+bool HexagonSubtarget::enableSubRegLiveness() const {
+ return EnableSubregLiveness;
+}
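A compressed view of the pairing rule that isBestZeroLatency() enforces with
the new ExclSrc/ExclDst sets: since a packet cannot hold three dependent
instructions, a node may anchor at most one zero-latency edge, and granting a
new pair demotes any previous one. A standalone sketch of just that invariant
(simplified data model and assumed semantics, not the code above):

    #include <vector>

    struct Edge { int Dst; unsigned Latency; };

    struct Node {
      std::vector<Edge> Succs;
      bool hasZeroLatencySucc() const {
        for (const Edge &E : Succs)
          if (E.Latency == 0)
            return true;
        return false;
      }
    };

    // Grant Src->Dst zero latency only if Dst does not already head another
    // zero-latency pair; any earlier zero-latency successor of Src drops
    // back to latency 1.
    bool makeZeroLatencyPair(Node &Src, const Node &Dst, int DstId) {
      if (Dst.hasZeroLatencySucc())
        return false;               // Would chain three instructions.
      for (Edge &E : Src.Succs) {
        if (E.Dst == DstId)
          E.Latency = 0;
        else if (E.Latency == 0)
          E.Latency = 1;            // Demote the previous pairing.
      }
      return true;
    }

    int main() {
      Node A{{{1, 1}, {2, 0}}};     // A currently pairs with node 2.
      Node B;                       // Node 1 heads no zero-latency pair.
      bool OK = makeZeroLatencyPair(A, B, 1) &&
                A.Succs[0].Latency == 0 && A.Succs[1].Latency == 1;
      return OK ? 0 : 1;
    }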
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index f2b9cda..753dca00 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -1,4 +1,4 @@
-//===-- HexagonSubtarget.h - Define Subtarget for the Hexagon ---*- C++ -*-===//
+//===- HexagonSubtarget.h - Define Subtarget for the Hexagon ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,12 +15,17 @@
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
#include "HexagonFrameLowering.h"
-#include "HexagonISelLowering.h"
#include "HexagonInstrInfo.h"
+#include "HexagonISelLowering.h"
#include "HexagonSelectionDAGInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <memory>
#include <string>
+#include <vector>
#define GET_SUBTARGETINFO_HEADER
#include "HexagonGenSubtargetInfo.inc"
@@ -30,6 +35,12 @@
namespace llvm {
+class MachineInstr;
+class SDep;
+class SUnit;
+class TargetMachine;
+class Triple;
+
class HexagonSubtarget : public HexagonGenSubtargetInfo {
virtual void anchor();
@@ -38,9 +49,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
bool ModeIEEERndNear;
public:
- enum HexagonArchEnum {
- V4, V5, V55, V60
- };
+#include "HexagonDepArch.h"
HexagonArchEnum HexagonArchVersion;
/// True if the target should use Back-Skip-Back scheduling. This is the
@@ -59,6 +68,7 @@ private:
HexagonSelectionDAGInfo TSInfo;
HexagonFrameLowering FrameLowering;
InstrItineraryData InstrItins;
+
void initializeEnvironment();
public:
@@ -98,14 +108,19 @@ public:
bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; }
bool hasV60TOps() const { return getHexagonArchVersion() >= V60; }
bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; }
+ bool hasV62TOps() const { return getHexagonArchVersion() >= V62; }
+ bool hasV62TOpsOnly() const { return getHexagonArchVersion() == V62; }
+
bool modeIEEERndNear() const { return ModeIEEERndNear; }
bool useHVXOps() const { return UseHVXOps; }
bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; }
bool useHVXSglOps() const { return UseHVXOps && !UseHVXDblOps; }
bool useLongCalls() const { return UseLongCalls; }
+ bool usePredicatedCalls() const;
bool useBSBScheduling() const { return UseBSBScheduling; }
bool enableMachineScheduler() const override;
+
// Always use the TargetLowering default scheduler.
// FIXME: This will use the vliw scheduler which is probably just hurting
// compiler time and will be removed eventually anyway.
@@ -122,6 +137,7 @@ public:
unsigned getSmallDataThreshold() const {
return Hexagon_SMALL_DATA_THRESHOLD;
}
+
const HexagonArchEnum &getHexagonArchVersion() const {
return HexagonArchVersion;
}
@@ -145,13 +161,12 @@ private:
// Helper function responsible for increasing the latency only.
void updateLatency(MachineInstr &SrcInst, MachineInstr &DstInst, SDep &Dep)
const;
- void changeLatency(SUnit *Src, SmallVector<SDep, 4> &Deps, SUnit *Dst,
- unsigned Lat) const;
- bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII)
- const;
- void changePhiLatency(MachineInstr &SrcInst, SUnit *Dst, SDep &Dep) const;
+ void restoreLatency(SUnit *Src, SUnit *Dst) const;
+ void changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat) const;
+ bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII,
+ SmallSet<SUnit*, 4> &ExclSrc, SmallSet<SUnit*, 4> &ExclDst) const;
};
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td b/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td
deleted file mode 100644
index 629a987..0000000
--- a/contrib/llvm/lib/Target/Hexagon/HexagonSystemInst.td
+++ /dev/null
@@ -1,134 +0,0 @@
-//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Cache manipulation instructions.
-//===----------------------------------------------------------------------===//
-let mayStore = 1 in
-class ST_MISC_CACHEOP<dag outs, dag ins,
- string asmstr, list<dag> pattern = [],
- bits<3> amode, bits<3> type, bits<1> un>
- : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> {
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
- let Inst{31-28} = 0b1010;
- let Inst{27-25} = amode;
- let Inst{24-22} = type;
- let Inst{21} = un;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-let mayStore = 1 in
-class ST_MISC_CACHEOP_SYS<dag outs, dag ins,
- string asmstr, list<dag> pattern = [],
- bits<3> amode, bits<3> type, bits<1> un>
- : SYSInst<outs, ins, asmstr, pattern, ""> {
-
- bits<5> Rs;
- bits<5> Rt;
- bits<5> Rd;
- let Inst{31-28} = 0b1010;
- let Inst{27-25} = amode;
- let Inst{24-22} = type;
- let Inst{21} = un;
- let Inst{20-16} = Rs;
- let Inst{12-8} = Rt;
- let Inst{4-0} = Rd;
-}
-
-
-let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in {
-def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins),
- "syncht" , [], 0b100, 0b001, 0b0>;
-}
-
-let Rt = 0, Rd = 0 in {
-let isSoloAin1 = 1 in {
- def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
- "dccleana($Rs)", [], 0b000, 0b000, 0b0>;
- def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
- "dcinva($Rs)", [], 0b000, 0b000, 0b1>;
- def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
- "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>;
- }
-}
-
-let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in {
- def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt),
- "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>;
- def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt),
- "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>;
-}
-
-let hasSideEffects = 0, isSolo = 1 in
-class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp>
- : JRInst <
- (outs), (ins IntRegs:$Rs),
- #mnemonic#"($Rs)" > {
- bits<5> Rs;
-
- let IClass = 0b0101;
- let Inst{27-21} = 0b0110110;
- let Inst{20-16} = Rs;
- let Inst{13-12} = 0b00;
- let Inst{11} = MajOp;
- }
-// Instruction cache invalidate
-def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>;
-
-// Zero an aligned 32-byte cacheline.
-let isSoloAin1 = 1 in
-def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs),
- "dczeroa($Rs)"> {
- bits<5> Rs;
- let IClass = 0b1010;
- let Inst{27-21} = 0b0000110;
- let Inst{13} = 0b0;
- let Inst{20-16} = Rs;
- }
-
-// Memory synchronization.
-let hasSideEffects = 0, isSolo = 1 in
-def Y2_isync: JRInst <(outs), (ins),
- "isync"> {
- let IClass = 0b0101;
- let Inst{27-16} = 0b011111000000;
- let Inst{13} = 0b0;
- let Inst{9-0} = 0b0000000010;
- }
-
-//===----------------------------------------------------------------------===//
-// System/User instructions.
-//===----------------------------------------------------------------------===//
-// traps and pause
-let hasSideEffects = 0, isSolo = 1 in
-class J2_MISC_TRAP_PAUSE<string mnemonic, bits<2> MajOp>
- : JRInst
- <(outs), (ins u8_0Imm:$u8),
- #mnemonic#"(#$u8)"> {
- bits<8> u8;
-
- let IClass = 0b0101;
- let Inst{27-24} = 0b0100;
- let Inst{23-22} = MajOp;
- let Inst{12-8} = u8{7-3};
- let Inst{4-2} = u8{2-0};
- }
-def J2_trap0 : J2_MISC_TRAP_PAUSE<"trap0", 0b00>;
-def J2_trap1 : J2_MISC_TRAP_PAUSE<"trap1", 0b10>;
-def J2_pause : J2_MISC_TRAP_PAUSE<"pause", 0b01>;
-
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 132d12a..7d88b51 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -98,11 +99,6 @@ static cl::opt<bool> EnableVectorPrint("enable-hexagon-vector-print",
extern "C" int HexagonTargetMachineModule;
int HexagonTargetMachineModule = 0;
-extern "C" void LLVMInitializeHexagonTarget() {
- // Register the target.
- RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
-}
-
static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
return new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>());
}
@@ -114,6 +110,12 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
namespace llvm {
extern char &HexagonExpandCondsetsID;
void initializeHexagonExpandCondsetsPass(PassRegistry&);
+ void initializeHexagonGenMuxPass(PassRegistry&);
+ void initializeHexagonLoopIdiomRecognizePass(PassRegistry&);
+ void initializeHexagonNewValueJumpPass(PassRegistry&);
+ void initializeHexagonOptAddrModePass(PassRegistry&);
+ void initializeHexagonPacketizerPass(PassRegistry&);
+ Pass *createHexagonLoopIdiomPass();
FunctionPass *createHexagonBitSimplify();
FunctionPass *createHexagonBranchRelaxation();
@@ -150,6 +152,18 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
return *RM;
}
+extern "C" void LLVMInitializeHexagonTarget() {
+ // Register the target.
+ RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget());
+
+ PassRegistry &PR = *PassRegistry::getPassRegistry();
+ initializeHexagonGenMuxPass(PR);
+ initializeHexagonLoopIdiomRecognizePass(PR);
+ initializeHexagonNewValueJumpPass(PR);
+ initializeHexagonOptAddrModePass(PR);
+ initializeHexagonPacketizerPass(PR);
+}
+
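
A minimal sketch (not part of the patch; standard LLVM host-tool usage assumed) of why registering the passes here is sufficient: generic target selection reaches LLVMInitializeHexagonTarget() once at startup.

    #include "llvm/Support/TargetSelect.h"

    int main() {
      // InitializeAllTargets() calls LLVMInitializeHexagonTarget() among the
      // configured targets, so the registrations above run exactly once,
      // before any pass object is constructed.
      llvm::InitializeAllTargets();
      return 0;
    }
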
HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
@@ -172,11 +186,11 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
const HexagonSubtarget *
HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
- AttributeSet FnAttrs = F.getAttributes();
+ AttributeList FnAttrs = F.getAttributes();
Attribute CPUAttr =
- FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+ FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-cpu");
Attribute FSAttr =
- FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+ FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-features");
std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
? CPUAttr.getValueAsString().str()
@@ -196,6 +210,14 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
return I.get();
}
+void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
+ PMB.addExtension(
+ PassManagerBuilder::EP_LateLoopOptimizations,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createHexagonLoopIdiomPass());
+ });
+}
+
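
A minimal sketch, assuming a standalone driver, of how the EP_LateLoopOptimizations extension registered above fires: PassManagerBuilder invokes each registered callback when it reaches that extension point while populating the legacy pipeline.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Target/TargetMachine.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    void buildPipeline(llvm::TargetMachine &TM) {
      llvm::PassManagerBuilder PMB;
      TM.adjustPassManager(PMB);          // registers the loop-idiom callback
      llvm::legacy::PassManager PM;
      PMB.populateModulePassManager(PM);  // callback fires at the late-loop
                                          // optimization extension point
    }
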
TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
return TargetTransformInfo(HexagonTTIImpl(this, F));
@@ -209,7 +231,7 @@ namespace {
/// Hexagon Code Generator Pass Configuration Options.
class HexagonPassConfig : public TargetPassConfig {
public:
- HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
+ HexagonPassConfig(HexagonTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
HexagonTargetMachine &getHexagonTargetMachine() const {
@@ -231,14 +253,14 @@ public:
} // namespace
TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new HexagonPassConfig(this, PM);
+ return new HexagonPassConfig(*this, PM);
}
void HexagonPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
if (!NoOpt) {
if (EnableLoopPrefetch)
addPass(createLoopDataPrefetchPass());
@@ -262,27 +284,26 @@ bool HexagonPassConfig::addInstSelector() {
if (!NoOpt) {
// Create logical operations on predicate registers.
if (EnableGenPred)
- addPass(createHexagonGenPredicate(), false);
+ addPass(createHexagonGenPredicate());
// Rotate loops to expose bit-simplification opportunities.
if (EnableLoopResched)
- addPass(createHexagonLoopRescheduling(), false);
+ addPass(createHexagonLoopRescheduling());
// Split double registers.
if (!DisableHSDR)
addPass(createHexagonSplitDoubleRegs());
// Bit simplification.
if (EnableBitSimplify)
- addPass(createHexagonBitSimplify(), false);
+ addPass(createHexagonBitSimplify());
addPass(createHexagonPeephole());
- printAndVerify("After hexagon peephole pass");
// Constant propagation.
if (!DisableHCP) {
- addPass(createHexagonConstPropagationPass(), false);
- addPass(&UnreachableMachineBlockElimID, false);
+ addPass(createHexagonConstPropagationPass());
+ addPass(&UnreachableMachineBlockElimID);
}
if (EnableGenInsert)
- addPass(createHexagonGenInsert(), false);
+ addPass(createHexagonGenInsert());
if (EnableEarlyIf)
- addPass(createHexagonEarlyIfConversion(), false);
+ addPass(createHexagonEarlyIfConversion());
}
return false;
@@ -293,9 +314,9 @@ void HexagonPassConfig::addPreRegAlloc() {
if (EnableExpandCondsets)
insertPass(&RegisterCoalescerID, &HexagonExpandCondsetsID);
if (!DisableStoreWidening)
- addPass(createHexagonStoreWidening(), false);
+ addPass(createHexagonStoreWidening());
if (!DisableHardwareLoops)
- addPass(createHexagonHardwareLoops(), false);
+ addPass(createHexagonHardwareLoops());
}
if (TM->getOptLevel() >= CodeGenOpt::Default)
addPass(&MachinePipelinerID);
@@ -306,16 +327,16 @@ void HexagonPassConfig::addPostRegAlloc() {
if (EnableRDFOpt)
addPass(createHexagonRDFOpt());
if (!DisableHexagonCFGOpt)
- addPass(createHexagonCFGOptimizer(), false);
+ addPass(createHexagonCFGOptimizer());
if (!DisableAModeOpt)
- addPass(createHexagonOptAddrMode(), false);
+ addPass(createHexagonOptAddrMode());
}
}
void HexagonPassConfig::addPreSched2() {
- addPass(createHexagonCopyToCombine(), false);
+ addPass(createHexagonCopyToCombine());
if (getOptLevel() != CodeGenOpt::None)
- addPass(&IfConverterID, false);
+ addPass(&IfConverterID);
addPass(createHexagonSplitConst32AndConst64());
}
@@ -323,17 +344,17 @@ void HexagonPassConfig::addPreEmitPass() {
bool NoOpt = (getOptLevel() == CodeGenOpt::None);
if (!NoOpt)
- addPass(createHexagonNewValueJump(), false);
+ addPass(createHexagonNewValueJump());
- addPass(createHexagonBranchRelaxation(), false);
+ addPass(createHexagonBranchRelaxation());
// Create Packets.
if (!NoOpt) {
if (!DisableHardwareLoops)
- addPass(createHexagonFixupHwLoops(), false);
+ addPass(createHexagonFixupHwLoops());
// Generate MUX from pairs of conditional transfers.
if (EnableGenMux)
- addPass(createHexagonGenMux(), false);
+ addPass(createHexagonGenMux());
addPass(createHexagonPacketizer(), false);
}
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
index 70835c0..3d01929 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -37,6 +37,7 @@ public:
static unsigned getModuleMatchQuality(const Module &M);
+ void adjustPassManager(PassManagerBuilder &PMB) override;
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index c9c4f95..ea86c9c 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalObject.h"
@@ -28,7 +29,6 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -49,6 +49,14 @@ static cl::opt<bool> TraceGVPlacement("trace-gv-placement",
cl::Hidden, cl::init(false),
cl::desc("Trace global value placement"));
+static cl::opt<bool>
+ EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false),
+ cl::desc("Emit hexagon jump tables in function section"));
+
+static cl::opt<bool>
+ EmitLutInText("hexagon-emit-lut-text", cl::Hidden, cl::init(false),
+ cl::desc("Emit hexagon lookup tables in function section"));
+
// TraceGVPlacement controls messages for all builds. For builds with assertions
// (debug or release), messages are also controlled by the usual debug flags
// (e.g. -debug and -debug-only=globallayout)
@@ -132,6 +140,13 @@ MCSection *HexagonTargetObjectFile::SelectSectionForGlobal(
<< (Kind.isBSS() ? "kind_bss " : "" )
<< (Kind.isBSSLocal() ? "kind_bss_local " : "" ));
+ // If the lookup table is used by more than one function, do not place
+ // it in the text section.
+ if (EmitLutInText && GO->getName().startswith("switch.table")) {
+ if (const Function *Fn = getLutUsedFunction(GO))
+ return selectSectionForLookupTable(GO, TM, Fn);
+ }
+
if (isGlobalInSmallSection(GO, TM))
return selectSmallSectionForGlobal(GO, Kind, TM);
@@ -256,6 +271,11 @@ unsigned HexagonTargetObjectFile::getSmallDataSize() const {
return SmallDataThreshold;
}
+bool HexagonTargetObjectFile::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ return EmitJtInText;
+}
+
/// Descends any type down to "elementary" components,
/// discovering the smallest addressable one.
/// If zero is returned, declaration will not be modified.
@@ -393,3 +413,39 @@ MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal(
// Otherwise, we work the same as ELF.
return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM);
}
+
+// Return the function that uses the lookup table. If more than one live
+// function uses this lookup table, bail out and place the lookup table in
+// the default section.
+const Function *
+HexagonTargetObjectFile::getLutUsedFunction(const GlobalObject *GO) const {
+ const Function *ReturnFn = nullptr;
+ for (auto U : GO->users()) {
+ // Validate that each user is an instruction in a live function.
+ auto *I = dyn_cast<Instruction>(U);
+ if (!I)
+ continue;
+ auto *Bb = I->getParent();
+ if (!Bb)
+ continue;
+ auto *UserFn = Bb->getParent();
+ if (!ReturnFn)
+ ReturnFn = UserFn;
+ else if (ReturnFn != UserFn)
+ return nullptr;
+ }
+ return ReturnFn;
+}
+
+MCSection *HexagonTargetObjectFile::selectSectionForLookupTable(
+ const GlobalObject *GO, const TargetMachine &TM, const Function *Fn) const {
+
+ SectionKind Kind = SectionKind::getText();
+ // If the function has an explicit section, place the lookup table in
+ // that section.
+ if (Fn->hasSection())
+ return getExplicitSectionGlobal(Fn, Kind, TM);
+
+ const auto *FuncObj = dyn_cast<GlobalObject>(Fn);
+ return SelectSectionForGlobal(FuncObj, Kind, TM);
+}
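
A hedged restatement of the placement rule added above (the helper name and flattened signature are illustrative only): a table goes to the using function's section exactly when the flag is set, the global carries the "switch.table" prefix that SimplifyCFG gives the tables it creates, and getLutUsedFunction found a single live user.

    #include "llvm/IR/Function.h"
    #include "llvm/IR/GlobalObject.h"

    bool placeLutInText(const llvm::GlobalObject *GO,
                        const llvm::Function *SingleUser, bool EmitLutInText) {
      return EmitLutInText && GO->getName().startswith("switch.table") &&
             SingleUser != nullptr;
    }
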
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
index 58dff2b..eff44f0 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -33,6 +33,11 @@ namespace llvm {
unsigned getSmallDataSize() const;
+ bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference,
+ const Function &F) const override;
+
+ const Function *getLutUsedFunction(const GlobalObject *GO) const;
+
private:
MCSectionELF *SmallDataSection;
MCSectionELF *SmallBSSSection;
@@ -43,6 +48,10 @@ namespace llvm {
MCSection *selectSmallSectionForGlobal(const GlobalObject *GO,
SectionKind Kind,
const TargetMachine &TM) const;
+
+ MCSection *selectSectionForLookupTable(const GlobalObject *GO,
+ const TargetMachine &TM,
+ const Function *Fn) const;
};
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index d578bfa..aac810e 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -21,6 +21,10 @@ using namespace llvm;
#define DEBUG_TYPE "hexagontti"
+static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables",
+ cl::init(true), cl::Hidden,
+ cl::desc("Control lookup table emission on Hexagon target"));
+
TargetTransformInfo::PopcntSupportKind
HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
// Return Fast Hardware support as every input < 64 bits will be promoted
@@ -29,7 +33,7 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
}
// The Hexagon target can unroll loops with run-time trip counts.
-void HexagonTTIImpl::getUnrollingPreferences(Loop *L,
+void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Runtime = UP.Partial = true;
}
@@ -46,8 +50,9 @@ unsigned HexagonTTIImpl::getCacheLineSize() const {
return getST()->getL1CacheLineSize();
}
-int HexagonTTIImpl::getUserCost(const User *U) {
- auto isCastFoldedIntoLoad = [] (const CastInst *CI) -> bool {
+int HexagonTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands) {
+ auto isCastFoldedIntoLoad = [](const CastInst *CI) -> bool {
if (!CI->isIntegerCast())
return false;
const LoadInst *LI = dyn_cast<const LoadInst>(CI->getOperand(0));
@@ -67,5 +72,9 @@ int HexagonTTIImpl::getUserCost(const User *U) {
if (const CastInst *CI = dyn_cast<const CastInst>(U))
if (isCastFoldedIntoLoad(CI))
return TargetTransformInfo::TCC_Free;
- return BaseT::getUserCost(U);
+ return BaseT::getUserCost(U, Operands);
+}
+
+bool HexagonTTIImpl::shouldBuildLookupTables() const {
+ return EmitLookupTables;
}
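
For context, a simplified sketch of how the new hook is consulted through TargetTransformInfo; the real call site is SimplifyCFG's switch-to-table conversion, which is not part of this patch.

    #include "llvm/Analysis/TargetTransformInfo.h"

    bool mayBuildTable(const llvm::TargetTransformInfo &TTI) {
      // With -hexagon-emit-lookup-tables=false this returns false on Hexagon,
      // so switches stay as compare/branch sequences.
      return TTI.shouldBuildLookupTables();
    }
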
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 8414bfc..ab5a6e0 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -46,7 +46,8 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
// The Hexagon target can unroll loops with run-time trip counts.
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
// L1 cache prefetch.
unsigned getPrefetchDistance() const;
@@ -61,7 +62,10 @@ public:
/// @}
- int getUserCost(const User *U);
+ int getUserCost(const User *U, ArrayRef<const Value *> Operands);
+
+ // Hexagon specific decision to generate a lookup table.
+ bool shouldBuildLookupTables() const;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 7b1247d..a3021e3 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -16,10 +16,10 @@
// prune the dependence.
//
//===----------------------------------------------------------------------===//
+#include "HexagonVLIWPacketizer.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "HexagonTargetMachine.h"
-#include "HexagonVLIWPacketizer.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -60,9 +60,7 @@ namespace {
class HexagonPacketizer : public MachineFunctionPass {
public:
static char ID;
- HexagonPacketizer() : MachineFunctionPass(ID) {
- initializeHexagonPacketizerPass(*PassRegistry::getPassRegistry());
- }
+ HexagonPacketizer() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
@@ -89,14 +87,14 @@ namespace {
char HexagonPacketizer::ID = 0;
}
-INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer",
- false, false)
+INITIALIZE_PASS_BEGIN(HexagonPacketizer, "hexagon-packetizer",
+ "Hexagon Packetizer", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer",
- false, false)
+INITIALIZE_PASS_END(HexagonPacketizer, "hexagon-packetizer",
+ "Hexagon Packetizer", false, false)
HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF,
MachineLoopInfo &MLI, AliasAnalysis *AA,
@@ -214,12 +212,12 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
for (auto &MB : MF) {
auto Begin = MB.begin(), End = MB.end();
while (Begin != End) {
- // First the first non-boundary starting from the end of the last
+ // Find the first non-boundary starting from the end of the last
// scheduling region.
MachineBasicBlock::iterator RB = Begin;
while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF))
++RB;
- // First the first boundary starting from the beginning of the new
+ // Find the first boundary starting from the beginning of the new
// region.
MachineBasicBlock::iterator RE = RB;
while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF))
@@ -273,25 +271,17 @@ bool HexagonPacketizerList::isCallDependent(const MachineInstr &MI,
if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister())
return true;
- // Check if this is a predicate dependence.
- const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg);
- if (RC == &Hexagon::PredRegsRegClass)
- return true;
-
- // Assumes that the first operand of the CALLr is the function address.
- if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) {
- const MachineOperand MO = MI.getOperand(0);
- if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg))
- return true;
+ // Call-like instructions can be packetized with preceding instructions
+ // that define registers implicitly used or modified by the call. Explicit
+ // uses are still prohibited, as in the case of indirect calls:
+ // r0 = ...
+ // J2_jumpr r0
+ if (DepType == SDep::Data) {
+ for (const MachineOperand MO : MI.operands())
+ if (MO.isReg() && MO.getReg() == DepReg && !MO.isImplicit())
+ return true;
}
- if (HII->isJumpR(MI)) {
- const MachineOperand &MO = HII->isPredicated(MI) ? MI.getOperand(1)
- : MI.getOperand(0);
- assert(MO.isReg() && MO.isUse());
- if (MO.getReg() == DepReg)
- return true;
- }
return false;
}
@@ -333,11 +323,13 @@ bool HexagonPacketizerList::isNewifiable(const MachineInstr &MI,
const TargetRegisterClass *NewRC) {
// Vector stores can be predicated, and can be new-value stores, but
// they cannot be predicated on a .new predicate value.
- if (NewRC == &Hexagon::PredRegsRegClass)
- if (HII->isV60VectorInstruction(MI) && MI.mayStore())
+ if (NewRC == &Hexagon::PredRegsRegClass) {
+ if (HII->isHVXVec(MI) && MI.mayStore())
return false;
- return HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn() ||
- HII->mayBeNewStore(MI);
+ return HII->isPredicated(MI) && HII->getDotNewPredOp(MI, nullptr) > 0;
+ }
+ // If the class is not PredRegs, it could only apply to new-value stores.
+ return HII->mayBeNewStore(MI);
}
// Promote an instruction to its .cur form.
@@ -356,7 +348,7 @@ void HexagonPacketizerList::cleanUpDotCur() {
MachineInstr *MI = nullptr;
for (auto BI : CurrentPacketMIs) {
DEBUG(dbgs() << "Cleanup packet has "; BI->dump(););
- if (BI->getOpcode() == Hexagon::V6_vL32b_cur_ai) {
+ if (HII->isDotCurInst(*BI)) {
MI = BI;
continue;
}
@@ -369,7 +361,7 @@ void HexagonPacketizerList::cleanUpDotCur() {
if (!MI)
return;
// We did not find a use of the CUR, so de-cur it.
- MI->setDesc(HII->get(Hexagon::V6_vL32b_ai));
+ MI->setDesc(HII->get(HII->getNonDotCurOp(*MI)));
DEBUG(dbgs() << "Demoted CUR "; MI->dump(););
}
@@ -377,9 +369,9 @@ void HexagonPacketizerList::cleanUpDotCur() {
bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr &MI,
const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII,
const TargetRegisterClass *RC) {
- if (!HII->isV60VectorInstruction(MI))
+ if (!HII->isHVXVec(MI))
return false;
- if (!HII->isV60VectorInstruction(*MII))
+ if (!HII->isHVXVec(*MII))
return false;
// Already a dot new instruction.
@@ -440,7 +432,7 @@ bool HexagonPacketizerList::promoteToDotNew(MachineInstr &MI,
}
bool HexagonPacketizerList::demoteToDotOld(MachineInstr &MI) {
- int NewOpcode = HII->getDotOldOp(MI.getOpcode());
+ int NewOpcode = HII->getDotOldOp(MI);
MI.setDesc(HII->get(NewOpcode));
return true;
}
@@ -720,6 +712,8 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
// %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
// S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
for (auto &MO : PacketMI.operands()) {
+ if (MO.isRegMask() && MO.clobbersPhysReg(DepReg))
+ return false;
if (!MO.isReg() || !MO.isDef() || !MO.isImplicit())
continue;
unsigned R = MO.getReg();
@@ -758,10 +752,16 @@ bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI,
return false;
}
-static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) {
- for (auto &MO : I.operands())
- if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit())
+static bool isImplicitDependency(const MachineInstr &I, bool CheckDef,
+ unsigned DepReg) {
+ for (auto &MO : I.operands()) {
+ if (CheckDef && MO.isRegMask() && MO.clobbersPhysReg(DepReg))
+ return true;
+ if (!MO.isReg() || MO.getReg() != DepReg || !MO.isImplicit())
+ continue;
+ if (CheckDef == MO.isDef())
return true;
+ }
return false;
}
@@ -793,7 +793,8 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
// If the dependency is through an implicitly defined register, we should not
// newify the use.
- if (isImplicitDependency(PI, DepReg))
+ if (isImplicitDependency(PI, true, DepReg) ||
+ isImplicitDependency(MI, false, DepReg))
return false;
const MCInstrDesc& MCID = PI.getDesc();
@@ -803,8 +804,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
// predicate .new
if (RC == &Hexagon::PredRegsRegClass)
- if (HII->isCondInst(MI) || HII->isJumpR(MI) || MI.isReturn())
- return HII->predCanBeUsedAsDotNew(PI, DepReg);
+ return HII->predCanBeUsedAsDotNew(PI, DepReg);
if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI))
return false;
@@ -1046,7 +1046,9 @@ static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
// XTYPE instructions. Since there is no convenient way of identifying fp
// XTYPE instructions, only allow grouping with ALU32 for now.
unsigned TJ = HII.getType(MJ);
- if (TJ != HexagonII::TypeALU32)
+ if (TJ != HexagonII::TypeALU32_2op &&
+ TJ != HexagonII::TypeALU32_3op &&
+ TJ != HexagonII::TypeALU32_ADDI)
return true;
break;
}
@@ -1171,6 +1173,36 @@ bool HexagonPacketizerList::hasControlDependence(const MachineInstr &I,
(J.isBranch() || J.isCall() || J.isBarrier());
}
+bool HexagonPacketizerList::hasRegMaskDependence(const MachineInstr &I,
+ const MachineInstr &J) {
+ // Adding I to a packet that has J.
+
+ // Regmasks are not reflected in the scheduling dependency graph, so
+ // we need to check them manually. This code assumes that regmasks only
+ // occur on calls, and the problematic case is when we add an instruction
+ // defining a register R to a packet that has a call that clobbers R via
+ // a regmask. Those cannot be packetized together, because the call will
+ // be executed last. That's also a reason why it is ok to add a call
+ // clobbering R to a packet that defines R.
+
+ // Look for regmasks in J.
+ for (const MachineOperand &OpJ : J.operands()) {
+ if (!OpJ.isRegMask())
+ continue;
+ assert((J.isCall() || HII->isTailCall(J)) && "Regmask on a non-call");
+ for (const MachineOperand &OpI : I.operands()) {
+ if (OpI.isReg()) {
+ if (OpJ.clobbersPhysReg(OpI.getReg()))
+ return true;
+ } else if (OpI.isRegMask()) {
+ // Both are regmasks. Assume that they intersect.
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
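
A minimal sketch of the primitive hasRegMaskDependence leans on (an illustrative helper, not in the patch): a call's regmask operand encodes its calling-convention clobbers and is queried per register.

    #include "llvm/CodeGen/MachineInstr.h"

    bool regMaskClobbers(const llvm::MachineInstr &Call, unsigned Reg) {
      for (const llvm::MachineOperand &MO : Call.operands())
        if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
          return true;
      return false;
    }
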
bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr &I,
const MachineInstr &J) {
bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
@@ -1217,6 +1249,14 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
if (Dependence)
return false;
+ // Regmasks are not accounted for in the scheduling graph, so we need
+ // to explicitly check for dependencies caused by them. They should only
+ // appear on calls, so it's not too pessimistic to reject all regmask
+ // dependencies.
+ Dependence = hasRegMaskDependence(I, J);
+ if (Dependence)
+ return false;
+
// V4 allows dual stores. It does not allow second store, if the first
// store is not in SLOT0. New value store, new value jump, dealloc_return
// and memop always take SLOT0. Arch spec 3.4.4.2.
@@ -1320,7 +1360,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// Data dependence is ok if we have load.cur.
if (DepType == SDep::Data && HII->isDotCurInst(J)) {
- if (HII->isV60VectorInstruction(I))
+ if (HII->isHVXVec(I))
continue;
}
@@ -1329,6 +1369,8 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
if (canPromoteToDotNew(I, SUJ, DepReg, II, RC)) {
if (promoteToDotNew(I, DepType, II, RC)) {
PromotedToDotNew = true;
+ if (cannotCoexist(I, J))
+ FoundSequentialDependence = true;
continue;
}
}
@@ -1373,26 +1415,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
DepType != SDep::Output)
continue;
- // Ignore output dependences due to superregs. We can write to two
- // different subregisters of R1:0 for instance in the same cycle.
-
- // If neither I nor J defines DepReg, then this is a superfluous output
- // dependence. The dependence must be of the form:
- // R0 = ...
- // R1 = ...
- // and there is an output dependence between the two instructions with
- // DepReg = D0.
- // We want to ignore these dependences. Ideally, the dependence
- // constructor should annotate such dependences. We can then avoid this
- // relatively expensive check.
- //
if (DepType == SDep::Output) {
- // DepReg is the register that's responsible for the dependence.
- unsigned DepReg = SUJ->Succs[i].getReg();
-
- // Check if I and J really defines DepReg.
- if (!I.definesRegister(DepReg) && !J.definesRegister(DepReg))
- continue;
FoundSequentialDependence = true;
break;
}
@@ -1465,13 +1488,19 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// R0 = ... ; SUI
// Those cannot be packetized together, since the call will observe
// the effect of the assignment to R0.
- if (DepType == SDep::Anti && J.isCall()) {
+ if ((DepType == SDep::Anti || DepType == SDep::Output) && J.isCall()) {
// Check if I defines any volatile register. We should also check
// registers that the call may read, but these happen to be a
// subset of the volatile register set.
- for (const MCPhysReg *P = J.getDesc().ImplicitDefs; P && *P; ++P) {
- if (!I.modifiesRegister(*P, HRI))
+ for (const MachineOperand &Op : I.operands()) {
+ if (Op.isReg() && Op.isDef()) {
+ unsigned R = Op.getReg();
+ if (!J.readsRegister(R, HRI) && !J.modifiesRegister(R, HRI))
+ continue;
+ } else if (!Op.isRegMask()) {
+ // If I has a regmask assume dependency.
continue;
+ }
FoundSequentialDependence = true;
break;
}
@@ -1502,10 +1531,9 @@ bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {
MachineInstr &I = *SUI->getInstr();
MachineInstr &J = *SUJ->getInstr();
- if (cannotCoexist(I, J))
- return false;
+ bool Coexist = !cannotCoexist(I, J);
- if (!Dependence)
+ if (Coexist && !Dependence)
return true;
// Check if the instruction was promoted to a dot-new. If so, demote it
@@ -1528,14 +1556,13 @@ MachineBasicBlock::iterator
HexagonPacketizerList::addToPacket(MachineInstr &MI) {
MachineBasicBlock::iterator MII = MI.getIterator();
MachineBasicBlock *MBB = MI.getParent();
- if (MI.isImplicitDef()) {
- unsigned R = MI.getOperand(0).getReg();
- if (Hexagon::IntRegsRegClass.contains(R)) {
- MCSuperRegIterator S(R, HRI, false);
- MI.addOperand(MachineOperand::CreateReg(*S, true, true));
- }
+
+ if (CurrentPacketMIs.size() == 0)
+ PacketStalls = false;
+ PacketStalls |= producesStall(MI);
+
+ if (MI.isImplicitDef())
return MII;
- }
assert(ResourceTracker->canReserveResources(MI));
bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI);
@@ -1609,23 +1636,13 @@ bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
}
-// Return true when ConsMI uses a register defined by ProdMI.
-static bool isDependent(const MachineInstr &ProdMI,
- const MachineInstr &ConsMI) {
- if (!ProdMI.getOperand(0).isReg())
- return false;
- unsigned DstReg = ProdMI.getOperand(0).getReg();
-
- for (auto &Op : ConsMI.operands())
- if (Op.isReg() && Op.isUse() && Op.getReg() == DstReg)
- // The MIs depend on each other.
- return true;
-
- return false;
-}
-
// V60 forward scheduling.
bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
+ // If the packet already stalls, then ignore the stall from a subsequent
+ // instruction in the same packet.
+ if (PacketStalls)
+ return false;
+
// Check whether the previous packet is in a different loop. If this is the
// case, there is little point in trying to avoid a stall because that would
// favor the rare case (loop entry) over the common case (loop iteration).
@@ -1640,40 +1657,58 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
return false;
}
- // Check for stall between two vector instructions.
- if (HII->isV60VectorInstruction(I)) {
- for (auto J : OldPacketMIs) {
- if (!HII->isV60VectorInstruction(*J))
- continue;
- if (isDependent(*J, I) && !HII->isVecUsableNextPacket(*J, I))
- return true;
- }
- return false;
+ SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)];
+
+ // Check if the latency is 0 between this instruction and any instruction
+ // in the current packet. If so, we disregard any potential stalls due to
+ // the instructions in the previous packet. Most of the instruction pairs
+ // that can go together in the same packet have 0 latency between them.
+ // The only exceptions are newValueJumps, as they're generated much later
+ // and the latencies can't be changed at that point. Another is a .cur
+ // instruction whose consumer has a 0 latency successor (such as .new).
+ // In this case, the latency between .cur and the consumer stays non-zero
+ // even though we can have both .cur and .new in the same packet. Changing
+ // the latency to 0 is not an option, as it causes the software pipeliner
+ // to skip pipelining in some cases.
+
+ // For Example:
+ // {
+ // I1: v6.cur = vmem(r0++#1)
+ // I2: v7 = valign(v6,v4,r2)
+ // I3: vmem(r5++#1) = v7.new
+ // }
+ // Here I2 and I3 have 0 cycle latency, but I1 and I2 have 2.
+
+ for (auto J : CurrentPacketMIs) {
+ SUnit *SUJ = MIToSUnit[J];
+ for (auto &Pred : SUI->Preds)
+ if (Pred.getSUnit() == SUJ &&
+ (Pred.getLatency() == 0 || HII->isNewValueJump(I) ||
+ HII->isToBeScheduledASAP(*J, I)))
+ return false;
}
- // Check for stall between two scalar instructions. First, check that
- // there is no definition of a use in the current packet, because it
- // may be a candidate for .new.
- for (auto J : CurrentPacketMIs)
- if (!HII->isV60VectorInstruction(*J) && isDependent(*J, I))
- return false;
+ // Check if the latency is greater than one between this instruction and any
+ // instruction in the previous packet.
+ for (auto J : OldPacketMIs) {
+ SUnit *SUJ = MIToSUnit[J];
+ for (auto &Pred : SUI->Preds)
+ if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
+ return true;
+ }
- // Check for stall between I and instructions in the previous packet.
- if (MF.getSubtarget<HexagonSubtarget>().useBSBScheduling()) {
- for (auto J : OldPacketMIs) {
- if (HII->isV60VectorInstruction(*J))
- continue;
- if (!HII->isLateInstrFeedsEarlyInstr(*J, I))
- continue;
- if (isDependent(*J, I) && !HII->canExecuteInBundle(*J, I))
+ // Check if the latency is greater than one between this instruction and any
+ // instruction in the previous packet.
+ for (auto J : OldPacketMIs) {
+ SUnit *SUJ = MIToSUnit[J];
+ for (auto &Pred : SUI->Preds)
+ if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
return true;
- }
}
return false;
}
-
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index b28b926..adb92b6 100644
--- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -7,6 +7,9 @@
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
namespace llvm {
+class HexagonInstrInfo;
+class HexagonRegisterInfo;
+
class HexagonPacketizerList : public VLIWPacketizerList {
// Vector of instructions assigned to the packet that has just been created.
std::vector<MachineInstr*> OldPacketMIs;
@@ -31,6 +34,10 @@ class HexagonPacketizerList : public VLIWPacketizerList {
// Track MIs with ignored dependence.
std::vector<MachineInstr*> IgnoreDepMIs;
+ // Set to true if the packet contains an instruction that stalls with an
+ // instruction from the previous packet.
+ bool PacketStalls = false;
+
protected:
/// \brief A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
@@ -109,6 +116,7 @@ protected:
void reserveResourcesForConstExt();
bool hasDeadDependence(const MachineInstr &I, const MachineInstr &J);
bool hasControlDependence(const MachineInstr &I, const MachineInstr &J);
+ bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J);
bool hasV4SpecificDependence(const MachineInstr &I, const MachineInstr &J);
bool producesStall(const MachineInstr &MI);
};
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index c140bd1..2a0edda 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -9,12 +9,12 @@
#include "Hexagon.h"
#include "HexagonFixupKinds.h"
-#include "HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCChecker.h"
#include "MCTargetDesc/HexagonMCCodeEmitter.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
@@ -58,10 +58,12 @@ class HexagonAsmBackend : public MCAsmBackend {
RF.getContents() = Code;
RF.getFixups() = Fixups;
}
+
public:
- HexagonAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) :
- OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *),
- Extender(nullptr) {}
+ HexagonAsmBackend(const Target &T, const Triple &TT, uint8_t OSABI,
+ StringRef CPU) :
+ OSABI(OSABI), CPU(CPU), MCII(T.createMCInstrInfo()),
+ RelaxTarget(new MCInst *), Extender(nullptr) {}
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
return createHexagonELFObjectWriter(OS, OSABI, CPU);
@@ -88,101 +90,105 @@ public:
// This table *must* be in the same order as the fixup_* kinds in
// HexagonFixupKinds.h.
//
- // namei offset bits flags
- { "fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_LO16", 0, 32, 0 },
- { "fixup_Hexagon_HI16", 0, 32, 0 },
- { "fixup_Hexagon_32", 0, 32, 0 },
- { "fixup_Hexagon_16", 0, 32, 0 },
- { "fixup_Hexagon_8", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_0", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_1", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_2", 0, 32, 0 },
- { "fixup_Hexagon_GPREL16_3", 0, 32, 0 },
- { "fixup_Hexagon_HL16", 0, 32, 0 },
- { "fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_16_X", 0, 32, 0 },
- { "fixup_Hexagon_12_X", 0, 32, 0 },
- { "fixup_Hexagon_11_X", 0, 32, 0 },
- { "fixup_Hexagon_10_X", 0, 32, 0 },
- { "fixup_Hexagon_9_X", 0, 32, 0 },
- { "fixup_Hexagon_8_X", 0, 32, 0 },
- { "fixup_Hexagon_7_X", 0, 32, 0 },
- { "fixup_Hexagon_6_X", 0, 32, 0 },
- { "fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_COPY", 0, 32, 0 },
- { "fixup_Hexagon_GLOB_DAT", 0, 32, 0 },
- { "fixup_Hexagon_JMP_SLOT", 0, 32, 0 },
- { "fixup_Hexagon_RELATIVE", 0, 32, 0 },
- { "fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_GOTREL_LO16", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_HI16", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_32", 0, 32, 0 },
- { "fixup_Hexagon_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_DTPMOD_32", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_LO16", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_HI16", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_32", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_16", 0, 32, 0 },
- { "fixup_Hexagon_GD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_LD_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_GD_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_IE_LO16", 0, 32, 0 },
- { "fixup_Hexagon_IE_HI16", 0, 32, 0 },
- { "fixup_Hexagon_IE_32", 0, 32, 0 },
- { "fixup_Hexagon_IE_16", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_LO16", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_HI16", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_32", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_16", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_LO16", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_HI16", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_32", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_16", 0, 32, 0 },
- { "fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_16_X", 0, 32, 0 },
- { "fixup_Hexagon_GOTREL_11_X", 0, 32, 0 },
- { "fixup_Hexagon_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_16_X", 0, 32, 0 },
- { "fixup_Hexagon_DTPREL_11_X", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_GD_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_LD_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_16_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_16_X", 0, 32, 0 },
- { "fixup_Hexagon_IE_GOT_11_X", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_32_6_X", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_16_X", 0, 32, 0 },
- { "fixup_Hexagon_TPREL_11_X", 0, 32, 0 }
+ // name offset bits flags
+ { "fixup_Hexagon_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_32", 0, 32, 0 },
+ { "fixup_Hexagon_16", 0, 32, 0 },
+ { "fixup_Hexagon_8", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_0", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_1", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_2", 0, 32, 0 },
+ { "fixup_Hexagon_GPREL16_3", 0, 32, 0 },
+ { "fixup_Hexagon_HL16", 0, 32, 0 },
+ { "fixup_Hexagon_B13_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B32_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_B22_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B15_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B13_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B9_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_B7_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_12_X", 0, 32, 0 },
+ { "fixup_Hexagon_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_10_X", 0, 32, 0 },
+ { "fixup_Hexagon_9_X", 0, 32, 0 },
+ { "fixup_Hexagon_8_X", 0, 32, 0 },
+ { "fixup_Hexagon_7_X", 0, 32, 0 },
+ { "fixup_Hexagon_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_32_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_COPY", 0, 32, 0 },
+ { "fixup_Hexagon_GLOB_DAT", 0, 32, 0 },
+ { "fixup_Hexagon_JMP_SLOT", 0, 32, 0 },
+ { "fixup_Hexagon_RELATIVE", 0, 32, 0 },
+ { "fixup_Hexagon_PLT_B22_PCREL", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPMOD_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_PLT_B22_PCREL",0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LD_PLT_B22_PCREL",0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_LO16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_HI16", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16", 0, 32, 0 },
+ { "fixup_Hexagon_6_PCREL_X", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GOTREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOTREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_DTPREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_LD_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_IE_GOT_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_32_6_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_16_X", 0, 32, 0 },
+ { "fixup_Hexagon_TPREL_11_X", 0, 32, 0 },
+ { "fixup_Hexagon_GD_PLT_B22_PCREL_X",0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_GD_PLT_B32_PCREL_X",0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LD_PLT_B22_PCREL_X",0, 32, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_Hexagon_LD_PLT_B32_PCREL_X",0, 32, MCFixupKindInfo::FKF_IsPCRel }
};
if (Kind < FirstTargetFixupKind)
@@ -193,13 +199,8 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- /// processFixupValue - Target hook to adjust the literal value of a fixup
- /// if necessary. IsResolved signals whether the caller believes a relocation
- /// is needed; the target can modify the value. The default does nothing.
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
MCFixupKind Kind = Fixup.getKind();
switch((unsigned)Kind) {
@@ -289,9 +290,13 @@ public:
case fixup_Hexagon_32_PCREL:
case fixup_Hexagon_6_PCREL_X:
case fixup_Hexagon_23_REG:
+ case fixup_Hexagon_27_REG:
+ case fixup_Hexagon_GD_PLT_B22_PCREL_X:
+ case fixup_Hexagon_GD_PLT_B32_PCREL_X:
+ case fixup_Hexagon_LD_PLT_B22_PCREL_X:
+ case fixup_Hexagon_LD_PLT_B32_PCREL_X:
// These relocations should always have a relocation recorded
- IsResolved = false;
- return;
+ return true;
case fixup_Hexagon_B22_PCREL:
//IsResolved = false;
@@ -308,7 +313,7 @@ public:
case fixup_Hexagon_B7_PCREL:
case fixup_Hexagon_B7_PCREL_X:
if (DisableFixup)
- IsResolved = false;
+ return true;
break;
case FK_Data_1:
@@ -317,8 +322,9 @@ public:
case FK_PCRel_4:
case fixup_Hexagon_32:
// Leave these relocations alone as they are used for EH.
- return;
+ return false;
}
+ return false;
}
/// getFixupKindNumBytes - The number of bytes the fixup may change.
@@ -345,6 +351,8 @@ public:
case fixup_Hexagon_B9_PCREL_X:
case fixup_Hexagon_B7_PCREL:
case fixup_Hexagon_B7_PCREL_X:
+ case fixup_Hexagon_GD_PLT_B32_PCREL_X:
+ case fixup_Hexagon_LD_PLT_B32_PCREL_X:
return 4;
}
}
@@ -372,6 +380,8 @@ public:
break;
case fixup_Hexagon_B32_PCREL_X:
+ case fixup_Hexagon_GD_PLT_B32_PCREL_X:
+ case fixup_Hexagon_LD_PLT_B32_PCREL_X:
Value >>= 6;
break;
}
@@ -400,8 +410,9 @@ public:
/// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t FixupValue, bool IsPCRel) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t FixupValue, bool IsResolved) const override {
// When FixupValue is 0 the relocation is external and there
// is nothing for us to do.
@@ -416,8 +427,8 @@ public:
// to a real offset before we can use it.
uint32_t Offset = Fixup.getOffset();
unsigned NumBytes = getFixupKindNumBytes(Kind);
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
- char *InstAddr = Data + Offset;
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ char *InstAddr = Data.data() + Offset;
Value = adjustFixupValue(Kind, FixupValue);
if(!Value)
@@ -431,6 +442,7 @@ public:
case fixup_Hexagon_B7_PCREL:
if (!(isIntN(7, sValue)))
HandleFixupError(7, 2, (int64_t)FixupValue, "B7_PCREL");
+ LLVM_FALLTHROUGH;
case fixup_Hexagon_B7_PCREL_X:
InstMask = 0x00001f18; // Word32_B7
Reloc = (((Value >> 2) & 0x1f) << 8) | // Value 6-2 = Target 12-8
@@ -440,6 +452,7 @@ public:
case fixup_Hexagon_B9_PCREL:
if (!(isIntN(9, sValue)))
HandleFixupError(9, 2, (int64_t)FixupValue, "B9_PCREL");
+ LLVM_FALLTHROUGH;
case fixup_Hexagon_B9_PCREL_X:
InstMask = 0x003000fe; // Word32_B9
Reloc = (((Value >> 7) & 0x3) << 20) | // Value 8-7 = Target 21-20
@@ -451,6 +464,7 @@ public:
case fixup_Hexagon_B13_PCREL:
if (!(isIntN(13, sValue)))
HandleFixupError(13, 2, (int64_t)FixupValue, "B13_PCREL");
+ LLVM_FALLTHROUGH;
case fixup_Hexagon_B13_PCREL_X:
InstMask = 0x00202ffe; // Word32_B13
Reloc = (((Value >> 12) & 0x1) << 21) | // Value 12 = Target 21
@@ -461,6 +475,7 @@ public:
case fixup_Hexagon_B15_PCREL:
if (!(isIntN(15, sValue)))
HandleFixupError(15, 2, (int64_t)FixupValue, "B15_PCREL");
+ LLVM_FALLTHROUGH;
case fixup_Hexagon_B15_PCREL_X:
InstMask = 0x00df20fe; // Word32_B15
Reloc = (((Value >> 13) & 0x3) << 22) | // Value 14-13 = Target 23-22
@@ -472,6 +487,7 @@ public:
case fixup_Hexagon_B22_PCREL:
if (!(isIntN(22, sValue)))
HandleFixupError(22, 2, (int64_t)FixupValue, "B22_PCREL");
+ LLVM_FALLTHROUGH;
case fixup_Hexagon_B22_PCREL_X:
InstMask = 0x01ff3ffe; // Word32_B22
Reloc = (((Value >> 13) & 0x1ff) << 16) | // Value 21-13 = Target 24-16
@@ -501,7 +517,7 @@ public:
dbgs() << "\tBValue=0x"; dbgs().write_hex(Value) <<
": AValue=0x"; dbgs().write_hex(FixupValue) <<
": Offset=" << Offset <<
- ": Size=" << DataSize <<
+ ": Size=" << Data.size() <<
": OInst=0x"; dbgs().write_hex(OldData) <<
": Reloc=0x"; dbgs().write_hex(Reloc););
@@ -524,10 +540,9 @@ public:
bool Relaxable = false;
// Branches and loop-setup insns are handled as necessary by relaxation.
if (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeJ ||
- (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) ==
- HexagonII::TypeCOMPOUND &&
+ (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCJ &&
MCID.isBranch()) ||
- (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNV &&
+ (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNCJ &&
MCID.isBranch()) ||
(llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCR &&
HMI.getOpcode() != Hexagon::C4_addipc))
@@ -710,21 +725,24 @@ public:
break;
}
case MCFragment::FT_Relaxable: {
+ MCContext &Context = Asm.getContext();
auto &RF = cast<MCRelaxableFragment>(*K);
auto &Inst = const_cast<MCInst &>(RF.getInst());
while (Size > 0 && HexagonMCInstrInfo::bundleSize(Inst) < 4) {
- MCInst *Nop = new (Asm.getContext()) MCInst;
+ MCInst *Nop = new (Context) MCInst;
Nop->setOpcode(Hexagon::A2_nop);
Inst.addOperand(MCOperand::createInst(Nop));
Size -= 4;
if (!HexagonMCChecker(
- *MCII, RF.getSubtargetInfo(), Inst, Inst,
- *Asm.getContext().getRegisterInfo()).check()) {
+ Context, *MCII, RF.getSubtargetInfo(), Inst,
+ *Context.getRegisterInfo(), false)
+ .check()) {
Inst.erase(Inst.end() - 1);
Size = 0;
}
}
- bool Error = HexagonMCShuffle(*MCII, RF.getSubtargetInfo(), Inst);
+ bool Error = HexagonMCShuffle(Context, true, *MCII,
+ RF.getSubtargetInfo(), Inst);
//assert(!Error);
(void)Error;
ReplaceInstruction(Asm.getEmitter(), RF, Inst);
@@ -739,15 +757,17 @@ public:
}
}
}
-};
-} // end anonymous namespace
+}; // class HexagonAsmBackend
-namespace llvm {
-MCAsmBackend *createHexagonAsmBackend(Target const &T,
+} // namespace
+
+// MCAsmBackend
+MCAsmBackend *llvm::createHexagonAsmBackend(Target const &T,
MCRegisterInfo const & /*MRI*/,
const Triple &TT, StringRef CPU,
const MCTargetOptions &Options) {
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
- return new HexagonAsmBackend(T, OSABI, CPU);
-}
+
+ StringRef CPUString = Hexagon_MC::selectHexagonCPU(TT, CPU);
+ return new HexagonAsmBackend(T, TT, OSABI, CPUString);
}
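
For orientation, a hedged sketch of how this factory is typically wired up; the actual registration lives in HexagonMCTargetDesc.cpp (not shown in this diff), and the function name below is hypothetical.

    #include "MCTargetDesc/HexagonMCTargetDesc.h" // declares getTheHexagonTarget()
    #include "llvm/Support/TargetRegistry.h"

    void registerHexagonAsmBackendExample() {
      // Assumed pattern: map the Hexagon target to the factory above so MC
      // consumers obtain the backend via triple lookup.
      llvm::TargetRegistry::RegisterMCAsmBackend(llvm::getTheHexagonTarget(),
                                                 llvm::createHexagonAsmBackend);
    }
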
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 4292f6b..7f90e83 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -17,6 +17,7 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
+#include "HexagonDepITypes.h"
#include "HexagonMCTargetDesc.h"
#include "llvm/Support/ErrorHandling.h"
#include <stdint.h>
@@ -27,57 +28,14 @@ namespace llvm {
/// instruction info tracks.
///
namespace HexagonII {
- // *** The code below must match HexagonInstrFormat*.td *** //
-
- // Insn types.
- // *** Must match HexagonInstrFormat*.td ***
- enum Type {
- TypePSEUDO = 0,
- TypeALU32 = 1,
- TypeCR = 2,
- TypeJR = 3,
- TypeJ = 4,
- TypeLD = 5,
- TypeST = 6,
- TypeSYSTEM = 7,
- TypeXTYPE = 8,
- TypeV4LDST = 9,
- TypeNV = 10,
- TypeDUPLEX = 11,
- TypeCOMPOUND = 12,
- TypeCVI_FIRST = 13,
- TypeCVI_VA = TypeCVI_FIRST,
- TypeCVI_VA_DV = 14,
- TypeCVI_VX = 15,
- TypeCVI_VX_DV = 16,
- TypeCVI_VP = 17,
- TypeCVI_VP_VS = 18,
- TypeCVI_VS = 19,
- TypeCVI_VINLANESAT= 20,
- TypeCVI_VM_LD = 21,
- TypeCVI_VM_TMP_LD = 22,
- TypeCVI_VM_CUR_LD = 23,
- TypeCVI_VM_VP_LDU = 24,
- TypeCVI_VM_ST = 25,
- TypeCVI_VM_NEW_ST = 26,
- TypeCVI_VM_STU = 27,
- TypeCVI_HIST = 28,
- TypeCVI_LAST = TypeCVI_HIST,
- TypePREFIX = 30, // Such as extenders.
- TypeENDLOOP = 31 // Such as end of a HW loop.
- };
+ unsigned const TypeCVI_FIRST = TypeCVI_HIST;
+ unsigned const TypeCVI_LAST = TypeCVI_VX_LATE;
enum SubTarget {
- HasV2SubT = 0xf,
- HasV2SubTOnly = 0x1,
- NoV2SubT = 0x0,
- HasV3SubT = 0xe,
- HasV3SubTOnly = 0x2,
- NoV3SubT = 0x1,
- HasV4SubT = 0xc,
- NoV4SubT = 0x3,
- HasV5SubT = 0x8,
- NoV5SubT = 0x7
+ HasV4SubT = 0x3f,
+ HasV5SubT = 0x3e,
+ HasV55SubT = 0x3c,
+ HasV60SubT = 0x38,
};
enum AddrMode {
@@ -107,118 +65,117 @@ namespace HexagonII {
enum {
// This 5-bit field describes the insn type.
TypePos = 0,
- TypeMask = 0x1f,
+ TypeMask = 0x3f,
// Solo instructions.
- SoloPos = 5,
+ SoloPos = 6,
SoloMask = 0x1,
// Packed only with A or X-type instructions.
- SoloAXPos = 6,
+ SoloAXPos = 7,
SoloAXMask = 0x1,
// Only A-type instruction in first slot or nothing.
- SoloAin1Pos = 7,
+ SoloAin1Pos = 8,
SoloAin1Mask = 0x1,
// Predicated instructions.
- PredicatedPos = 8,
+ PredicatedPos = 9,
PredicatedMask = 0x1,
- PredicatedFalsePos = 9,
+ PredicatedFalsePos = 10,
PredicatedFalseMask = 0x1,
- PredicatedNewPos = 10,
+ PredicatedNewPos = 11,
PredicatedNewMask = 0x1,
- PredicateLatePos = 11,
+ PredicateLatePos = 12,
PredicateLateMask = 0x1,
// New-Value consumer instructions.
- NewValuePos = 12,
+ NewValuePos = 13,
NewValueMask = 0x1,
// New-Value producer instructions.
- hasNewValuePos = 13,
+ hasNewValuePos = 14,
hasNewValueMask = 0x1,
// Which operand consumes or produces a new value.
- NewValueOpPos = 14,
+ NewValueOpPos = 15,
NewValueOpMask = 0x7,
// Stores that can become new-value stores.
- mayNVStorePos = 17,
+ mayNVStorePos = 18,
mayNVStoreMask = 0x1,
// New-value store instructions.
- NVStorePos = 18,
+ NVStorePos = 19,
NVStoreMask = 0x1,
// Loads that can become current-value loads.
- mayCVLoadPos = 19,
+ mayCVLoadPos = 20,
mayCVLoadMask = 0x1,
// Current-value load instructions.
- CVLoadPos = 20,
+ CVLoadPos = 21,
CVLoadMask = 0x1,
// Extendable insns.
- ExtendablePos = 21,
+ ExtendablePos = 22,
ExtendableMask = 0x1,
// Insns must be extended.
- ExtendedPos = 22,
+ ExtendedPos = 23,
ExtendedMask = 0x1,
// Which operand may be extended.
- ExtendableOpPos = 23,
+ ExtendableOpPos = 24,
ExtendableOpMask = 0x7,
// Signed or unsigned range.
- ExtentSignedPos = 26,
+ ExtentSignedPos = 27,
ExtentSignedMask = 0x1,
// Number of bits of range before extending operand.
- ExtentBitsPos = 27,
+ ExtentBitsPos = 28,
ExtentBitsMask = 0x1f,
// Alignment power-of-two before extending operand.
- ExtentAlignPos = 32,
+ ExtentAlignPos = 33,
ExtentAlignMask = 0x3,
- // Valid subtargets
- validSubTargetPos = 34,
- validSubTargetMask = 0xf,
-
// Addressing mode for load/store instructions.
- AddrModePos = 40,
+ AddrModePos = 41,
AddrModeMask = 0x7,
// Access size for load/store instructions.
- MemAccessSizePos = 43,
+ MemAccessSizePos = 44,
MemAccesSizeMask = 0xf,
// Branch predicted taken.
- TakenPos = 47,
+ TakenPos = 48,
TakenMask = 0x1,
// Floating-point instructions.
- FPPos = 48,
+ FPPos = 49,
FPMask = 0x1,
// New-Value producer-2 instructions.
- hasNewValuePos2 = 50,
+ hasNewValuePos2 = 51,
hasNewValueMask2 = 0x1,
-
// Which operand consumes or produces a new value.
- NewValueOpPos2 = 51,
+ NewValueOpPos2 = 52,
NewValueOpMask2 = 0x7,
// Accumulator instructions.
- AccumulatorPos = 54,
+ AccumulatorPos = 55,
AccumulatorMask = 0x1,
// Complex XU, prevent xu competition by preferring slot3
- PrefersSlot3Pos = 55,
+ PrefersSlot3Pos = 56,
PrefersSlot3Mask = 0x1,
CofMax1Pos = 60,
- CofMax1Mask = 0x1
+ CofMax1Mask = 0x1,
+
+ CVINewPos = 61,
+ CVINewMask = 0x1
};
// *** The code above must match HexagonInstrFormat*.td *** //
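
A small sketch of how these Pos/Mask pairs are consumed (an assumed helper mirroring the accessors in HexagonInstrInfo; the constants are the ones defined above, so the sketch belongs conceptually inside this header):

    inline unsigned getAddrMode(uint64_t TSFlags) {
      // Shift the packed 64-bit TSFlags word down to the field's position,
      // then mask off the field's width.
      return (TSFlags >> HexagonII::AddrModePos) & HexagonII::AddrModeMask;
    }
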
// Hexagon specific MO operand flag mask.
enum HexagonMOTargetFlagVal {
- //===------------------------------------------------------------------===//
- // Hexagon Specific MachineOperand flags.
+ // Hexagon-specific MachineOperand target flags.
+ //
+ // When changing these, make sure to update
+ // getSerializableDirectMachineOperandTargetFlags and
+ // getSerializableBitmaskMachineOperandTargetFlags if needed.
MO_NO_FLAG,
- HMOTF_ConstExtended = 1,
-
/// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
/// Used for computing a global address for PIC compilations
MO_PCREL,
@@ -250,7 +207,15 @@ namespace HexagonII {
// MO_TPREL - indicates relocation for TLS
// local Executable method
- MO_TPREL
+ MO_TPREL,
+
+ // HMOTF_ConstExtended
+ // Addendum to the above: indicates a constant-extended operand.
+ // Can be used as a mask.
+ HMOTF_ConstExtended = 0x80,
+
+ // Union of all bitmasks (currently only HMOTF_ConstExtended).
+ MO_Bitmasks = HMOTF_ConstExtended
};
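
Since HMOTF_ConstExtended is a bitmask layered over the enumerated flags, readers separate the two before comparing; a sketch with assumed helper names:

    #include "MCTargetDesc/HexagonBaseInfo.h"
    #include "llvm/CodeGen/MachineOperand.h"

    unsigned baseTargetFlag(const llvm::MachineOperand &MO) {
      // Strip bitmask flags (currently only HMOTF_ConstExtended) to recover
      // the enumerated flag such as MO_PCREL or MO_GOT.
      return MO.getTargetFlags() & ~unsigned(llvm::HexagonII::MO_Bitmasks);
    }

    bool isConstExtended(const llvm::MachineOperand &MO) {
      return MO.getTargetFlags() & llvm::HexagonII::HMOTF_ConstExtended;
    }
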
// Hexagon Sub-instruction classes.
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index 944e235..b975e31 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -284,6 +284,16 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_HEX_TPREL_11_X;
case fixup_Hexagon_23_REG:
return ELF::R_HEX_23_REG;
+ case fixup_Hexagon_27_REG:
+ return ELF::R_HEX_27_REG;
+ case fixup_Hexagon_GD_PLT_B22_PCREL_X:
+ return ELF::R_HEX_GD_PLT_B22_PCREL_X;
+ case fixup_Hexagon_GD_PLT_B32_PCREL_X:
+ return ELF::R_HEX_GD_PLT_B32_PCREL_X;
+ case fixup_Hexagon_LD_PLT_B22_PCREL_X:
+ return ELF::R_HEX_LD_PLT_B22_PCREL_X;
+ case fixup_Hexagon_LD_PLT_B32_PCREL_X:
+ return ELF::R_HEX_LD_PLT_B32_PCREL_X;
}
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
index 4c97ebb..3473276 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonFixupKinds.h
@@ -111,6 +111,11 @@ enum Fixups {
fixup_Hexagon_TPREL_16_X,
fixup_Hexagon_TPREL_11_X,
fixup_Hexagon_23_REG,
+ fixup_Hexagon_27_REG,
+ fixup_Hexagon_GD_PLT_B22_PCREL_X,
+ fixup_Hexagon_GD_PLT_B32_PCREL_X,
+ fixup_Hexagon_LD_PLT_B22_PCREL_X,
+ fixup_Hexagon_LD_PLT_B32_PCREL_X,
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 42fcc5a..1929152 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "HexagonAsmPrinter.h"
#include "HexagonInstPrinter.h"
+#include "HexagonAsmPrinter.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -125,46 +125,6 @@ void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo,
O << -1;
}
-void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<9>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
- O << formatImm(Imm/64);
-}
-
-void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<10>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
- O << formatImm(Imm/128);
-}
-
-void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<10>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
- O << formatImm(Imm/64);
-}
-
-void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const {
- int64_t Imm;
- bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
- Imm = SignExtend64<11>(Imm);
- assert(Success); (void)Success;
- assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
- O << formatImm(Imm/128);
-}
-
void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const {
printOperand(MI, OpNo, O);
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index 5f42118..ac8e391 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -44,14 +44,6 @@ public:
raw_ostream &O) const;
void printNOneImmOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const;
- void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
- void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
- void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
- void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
- raw_ostream &O) const;
void printBranchOperand(MCInst const *MI, unsigned OpNo,
raw_ostream &O) const;
void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index c619c36..446b3b2 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -23,6 +23,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
Data32bitsDirective = "\t.word\t";
Data64bitsDirective = nullptr; // .xword is only supported by V9.
CommentString = "//";
+ SupportsDebugInformation = true;
LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
InlineAsmStart = "# InlineAsm Start";
@@ -30,8 +31,8 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
ZeroDirective = "\t.space\t";
AscizDirective = "\t.string\t";
- SupportsDebugInformation = true;
MinInstAlignment = 4;
UsesELFSectionDirectiveForBSS = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
+ UseLogicalShr = false;
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index 07c9ad9..3bb658b 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -16,23 +16,27 @@
#include "HexagonBaseInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-static cl::opt<bool> RelaxNVChecks("relax-nv-checks", cl::init(false),
- cl::ZeroOrMore, cl::Hidden, cl::desc("Relax checks of new-value validity"));
+static cl::opt<bool>
+ RelaxNVChecks("relax-nv-checks", cl::init(false), cl::ZeroOrMore,
+ cl::Hidden, cl::desc("Relax checks of new-value validity"));
const HexagonMCChecker::PredSense
- HexagonMCChecker::Unconditional(Hexagon::NoRegister, false);
+ HexagonMCChecker::Unconditional(Hexagon::NoRegister, false);
void HexagonMCChecker::init() {
// Initialize read-only registers set.
ReadOnly.insert(Hexagon::PC);
+ ReadOnly.insert(Hexagon::C9_8);
// Figure out the loop-registers definitions.
if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
@@ -46,42 +50,49 @@ void HexagonMCChecker::init() {
if (HexagonMCInstrInfo::isBundle(MCB))
// Unfurl a bundle.
- for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- init(*I.getInst());
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ MCInst const &Inst = *I.getInst();
+ if (HexagonMCInstrInfo::isDuplex(MCII, Inst)) {
+ init(*Inst.getOperand(0).getInst());
+ init(*Inst.getOperand(1).getInst());
+ } else
+ init(Inst);
}
else
init(MCB);
}
-void HexagonMCChecker::init(MCInst const& MCI) {
- const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
+void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg,
+ bool &isTrue) {
+ if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+ // Note a used predicate register.
+ PredReg = R;
+ isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+ // Note use of new predicate register.
+ if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+ NewPreds.insert(PredReg);
+ } else
+ // Note register use. Super-registers are not tracked directly;
+ // only their components are.
+ for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid(); ++SRI)
+ if (!MCSubRegIterator(*SRI, &RI).isValid())
+ // Skip super-registers used indirectly.
+ Uses.insert(*SRI);
+}
+
+void HexagonMCChecker::init(MCInst const &MCI) {
+ const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
unsigned PredReg = Hexagon::NoRegister;
bool isTrue = false;
// Get used registers.
for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
- if (MCI.getOperand(i).isReg()) {
- unsigned R = MCI.getOperand(i).getReg();
-
- if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
- // Note an used predicate register.
- PredReg = R;
- isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
-
- // Note use of new predicate register.
- if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
- NewPreds.insert(PredReg);
- }
- else
- // Note register use. Super-registers are not tracked directly,
- // but their components.
- for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
- SRI.isValid();
- ++SRI)
- if (!MCSubRegIterator(*SRI, &RI).isValid())
- // Skip super-registers used indirectly.
- Uses.insert(*SRI);
- }
+ if (MCI.getOperand(i).isReg())
+ initReg(MCI, MCI.getOperand(i).getReg(), PredReg, isTrue);
+ for (unsigned i = 0; i < MCID.getNumImplicitUses(); ++i)
+ initReg(MCI, MCID.getImplicitUses()[i], PredReg, isTrue);
// Get implicit register definitions.
if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
@@ -99,10 +110,10 @@ void HexagonMCChecker::init(MCInst const& MCI) {
if (Hexagon::USR_OVF == R)
// Many insns change the USR implicitly, but only one or another flag.
- // The instruction table models the USR.OVF flag, which can be implicitly
- // modified more than once, but cannot be modified in the same packet
- // with an instruction that modifies is explicitly. Deal with such situ-
- // ations individually.
+ // The instruction table models the USR.OVF flag, which can be
+ // implicitly modified more than once, but cannot be modified in the
+ // same packet with an instruction that modifies it explicitly. Deal
+ // with such situations individually.
SoftDefs.insert(R);
else if (isPredicateRegister(R) &&
HexagonMCInstrInfo::isPredicateLate(MCII, MCI))
@@ -114,8 +125,7 @@ void HexagonMCChecker::init(MCInst const& MCI) {
// Figure out explicit register definitions.
for (unsigned i = 0; i < MCID.getNumDefs(); ++i) {
- unsigned R = MCI.getOperand(i).getReg(),
- S = Hexagon::NoRegister;
+ unsigned R = MCI.getOperand(i).getReg(), S = Hexagon::NoRegister;
// USR has subregisters (while C8 does not for technical reasons), so
// reset R to USR, since we know how to handle multiple defs of USR,
// taking into account its subregisters.
@@ -124,9 +134,8 @@ void HexagonMCChecker::init(MCInst const& MCI) {
// Note register definitions, direct ones as well as indirect side-effects.
// Super-registers are not tracked directly, only their components.
- for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
- SRI.isValid();
- ++SRI) {
+ for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid(); ++SRI) {
if (MCSubRegIterator(*SRI, &RI).isValid())
// Skip super-registers defined indirectly.
continue;
@@ -146,22 +155,19 @@ void HexagonMCChecker::init(MCInst const& MCI) {
// Only an explicit definition of P3:0 is noted as such; if a
// side-effect, then note as a soft definition.
SoftDefs.insert(*SRI);
- else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) && isPredicateRegister(*SRI))
+ else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) &&
+ isPredicateRegister(*SRI))
// Some insns produce predicates too late to be used in the same packet.
LatePreds.insert(*SRI);
- else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_CUR_LD)
- // Current loads should be used in the same packet.
- // TODO: relies on the impossibility of a current and a temporary loads
- // in the same packet.
- CurDefs.insert(*SRI), Defs[*SRI].insert(PredSense(PredReg, isTrue));
- else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_TMP_LD)
+ else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) ==
+ HexagonII::TypeCVI_VM_TMP_LD)
// Temporary loads should be used in the same packet, but don't commit
// results, so it should be disregarded if another insn changes the same
// register.
// TODO: relies on the impossibility of current and temporary loads
// in the same packet.
TmpDefs.insert(*SRI);
- else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI) )
+ else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI))
// vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and
// destination registers with this instruction. Same for vdeal(Vx, Vy, Rx).
Uses.insert(*SRI);
@@ -177,25 +183,25 @@ void HexagonMCChecker::init(MCInst const& MCI) {
if (HexagonMCInstrInfo::isCompound(MCII, MCI))
compoundRegisterMap(R); // Compound insns have a limited register range.
- for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
- SRI.isValid();
- ++SRI)
+ for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+ SRI.isValid(); ++SRI)
if (!MCSubRegIterator(*SRI, &RI).isValid())
// No super-registers defined indirectly.
- NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
- HexagonMCInstrInfo::isFloat(MCII, MCI)));
+ NewDefs[*SRI].push_back(NewSense::Def(
+ PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+ HexagonMCInstrInfo::isFloat(MCII, MCI)));
// For fairly unique 2-dot-new producers, example:
// vdeal(V1, V9, R0) V1.new and V9.new can be used by consumers.
if (HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) {
unsigned R2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, MCI).getReg();
- for(MCRegAliasIterator SRI(R2, &RI, !MCSubRegIterator(R2, &RI).isValid());
- SRI.isValid();
- ++SRI)
+ bool HasSubRegs = MCSubRegIterator(R2, &RI).isValid();
+ for (MCRegAliasIterator SRI(R2, &RI, !HasSubRegs); SRI.isValid(); ++SRI)
if (!MCSubRegIterator(*SRI, &RI).isValid())
- NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
- HexagonMCInstrInfo::isFloat(MCII, MCI)));
+ NewDefs[*SRI].push_back(NewSense::Def(
+ PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI),
+ HexagonMCInstrInfo::isFloat(MCII, MCI)));
}
}
@@ -216,39 +222,142 @@ void HexagonMCChecker::init(MCInst const& MCI) {
if (!MCSubRegIterator(N, &RI).isValid()) {
// Super-registers cannot use new values.
if (MCID.isBranch())
- NewUses[N] = NewSense::Jmp(llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+ NewUses[N] = NewSense::Jmp(
+ llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNCJ);
else
- NewUses[N] = NewSense::Use(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+ NewUses[N] = NewSense::Use(
+ PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
}
}
}
-HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCInst &mcbdx,
- MCRegisterInfo const &ri)
- : MCB(mcb), MCBDX(mcbdx), RI(ri), MCII(MCII), STI(STI),
- bLoadErrInfo(false) {
+HexagonMCChecker::HexagonMCChecker(MCContext &Context, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &mcb,
+ MCRegisterInfo const &ri, bool ReportErrors)
+ : Context(Context), MCB(mcb), RI(ri), MCII(MCII), STI(STI),
+ ReportErrors(ReportErrors) {
init();
}
-bool HexagonMCChecker::check() {
+bool HexagonMCChecker::check(bool FullCheck) {
bool chkB = checkBranches();
bool chkP = checkPredicates();
bool chkNV = checkNewValues();
bool chkR = checkRegisters();
+ bool chkRRO = checkRegistersReadOnly();
+ bool chkELB = checkEndloopBranches();
+ checkRegisterCurDefs();
bool chkS = checkSolo();
- bool chkSh = checkShuffle();
- bool chkSl = checkSlots();
- bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl;
+ bool chkSh = true;
+ if (FullCheck)
+ chkSh = checkShuffle();
+ bool chkSl = true;
+ if (FullCheck)
+ chkSl = checkSlots();
+ bool chkAXOK = checkAXOK();
+ bool chk = chkB && chkP && chkNV && chkR && chkRRO && chkELB && chkS &&
+ chkSh && chkSl && chkAXOK;
return chk;
}
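A hedged sketch of how a caller drives the reworked interface; the variable names here are assumptions, only the constructor and check() signatures come from this change:

    HexagonMCChecker Checker(Ctx, MCII, STI, Bundle, MRI, /*ReportErrors=*/true);
    if (!Checker.check(/*FullCheck=*/true))
      return false; // diagnostics already went through Ctx.reportError()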
-bool HexagonMCChecker::checkSlots()
+bool HexagonMCChecker::checkEndloopBranches() {
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
+ MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I);
+ if (Desc.isBranch() || Desc.isCall()) {
+ auto Inner = HexagonMCInstrInfo::isInnerLoop(MCB);
+ if (Inner || HexagonMCInstrInfo::isOuterLoop(MCB)) {
+ reportError(I.getLoc(),
+ llvm::Twine("packet marked with `:endloop") +
+ (Inner ? "0" : "1") + "' " +
+ "cannot contain instructions that modify register " +
+ "`" + llvm::Twine(RI.getName(Hexagon::PC)) + "'");
+ return false;
+ }
+ }
+ }
+ return true;
+}
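An illustrative packet that this check rejects (Hexagon assembly, example only): the endloop hardware-loop sequence already modifies PC, so no branch or call may share the packet.

    { jump somewhere }:endloop0   // error: branch inside an :endloop0 packet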
-{
+namespace {
+bool isDuplexAGroup(unsigned Opcode) {
+ switch (Opcode) {
+ case Hexagon::SA1_addi:
+ case Hexagon::SA1_addrx:
+ case Hexagon::SA1_addsp:
+ case Hexagon::SA1_and1:
+ case Hexagon::SA1_clrf:
+ case Hexagon::SA1_clrfnew:
+ case Hexagon::SA1_clrt:
+ case Hexagon::SA1_clrtnew:
+ case Hexagon::SA1_cmpeqi:
+ case Hexagon::SA1_combine0i:
+ case Hexagon::SA1_combine1i:
+ case Hexagon::SA1_combine2i:
+ case Hexagon::SA1_combine3i:
+ case Hexagon::SA1_combinerz:
+ case Hexagon::SA1_combinezr:
+ case Hexagon::SA1_dec:
+ case Hexagon::SA1_inc:
+ case Hexagon::SA1_seti:
+ case Hexagon::SA1_setin1:
+ case Hexagon::SA1_sxtb:
+ case Hexagon::SA1_sxth:
+ case Hexagon::SA1_tfr:
+ case Hexagon::SA1_zxtb:
+ case Hexagon::SA1_zxth:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool isNeitherAnorX(MCInstrInfo const &MCII, MCInst const &ID) {
+ unsigned Result = 0;
+ unsigned Type = HexagonMCInstrInfo::getType(MCII, ID);
+ if (Type == HexagonII::TypeDUPLEX) {
+ unsigned subInst0Opcode = ID.getOperand(0).getInst()->getOpcode();
+ unsigned subInst1Opcode = ID.getOperand(1).getInst()->getOpcode();
+ Result += !isDuplexAGroup(subInst0Opcode);
+ Result += !isDuplexAGroup(subInst1Opcode);
+ } else
+ Result +=
+ Type != HexagonII::TypeALU32_2op && Type != HexagonII::TypeALU32_3op &&
+ Type != HexagonII::TypeALU32_ADDI && Type != HexagonII::TypeS_2op &&
+ Type != HexagonII::TypeS_3op &&
+ (Type != HexagonII::TypeALU64 || HexagonMCInstrInfo::isFloat(MCII, ID));
+ return Result != 0;
+}
+} // namespace
+
+bool HexagonMCChecker::checkAXOK() {
+ MCInst const *HasSoloAXInst = nullptr;
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
+ if (HexagonMCInstrInfo::isSoloAX(MCII, I)) {
+ HasSoloAXInst = &I;
+ }
+ }
+ if (!HasSoloAXInst)
+ return true;
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
+ if (&I != HasSoloAXInst && isNeitherAnorX(MCII, I)) {
+ reportError(
+ HasSoloAXInst->getLoc(),
+ llvm::Twine("Instruction can only be in a packet with ALU or "
+ "non-FPU XTYPE instructions"));
+ reportError(I.getLoc(),
+ llvm::Twine("Not an ALU or non-FPU XTYPE instruction"));
+ return false;
+ }
+ }
+ return true;
+}
+
+bool HexagonMCChecker::checkSlots() {
unsigned slotsUsed = 0;
- for (auto HMI: HexagonMCInstrInfo::bundleInstructions(MCBDX)) {
- MCInst const& MCI = *HMI.getInst();
+ for (auto HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ MCInst const &MCI = *HMI.getInst();
if (HexagonMCInstrInfo::isImmext(MCI))
continue;
if (HexagonMCInstrInfo::isDuplex(MCII, MCI))
@@ -258,9 +367,7 @@ bool HexagonMCChecker::checkSlots()
}
if (slotsUsed > HEXAGON_PACKET_SIZE) {
- HexagonMCErrInfo errInfo;
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NOSLOTS);
- addErrInfo(errInfo);
+ reportError("invalid instruction packet: out of slots");
return false;
}
return true;
@@ -268,11 +375,9 @@ bool HexagonMCChecker::checkSlots()
// Check legal use of branches.
bool HexagonMCChecker::checkBranches() {
- HexagonMCErrInfo errInfo;
if (HexagonMCInstrInfo::isBundle(MCB)) {
bool hasConditional = false;
- unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0,
- NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
+ unsigned Branches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE;
for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
@@ -284,12 +389,6 @@ bool HexagonMCChecker::checkBranches() {
if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() ||
HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) {
++Branches;
- if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() &&
- HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
- ++NewIndirectBranches;
- if (HexagonMCInstrInfo::isNewValue(MCII, MCI))
- ++NewValueBranches;
-
if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
hasConditional = true;
@@ -298,25 +397,14 @@ bool HexagonMCChecker::checkBranches() {
Unconditional = i; // Record the position of the unconditional branch.
}
}
- if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() &&
- HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad())
- ++Returns;
}
- if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too?
- if (HexagonMCInstrInfo::isInnerLoop(MCB) ||
- HexagonMCInstrInfo::isOuterLoop(MCB)) {
- // Error out if there's any branch in a loop-end packet.
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_ENDLOOP, Hexagon::PC);
- addErrInfo(errInfo);
- return false;
- }
if (Branches > 1)
if (!hasConditional || Conditional > Unconditional) {
// Error out if more than one unconditional branch or
// the conditional branch appears after the unconditional one.
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_BRANCHES);
- addErrInfo(errInfo);
+ reportError(
+ "unconditional branch cannot precede another branch in packet");
return false;
}
}
@@ -326,31 +414,28 @@ bool HexagonMCChecker::checkBranches() {
// Check legal use of predicate registers.
bool HexagonMCChecker::checkPredicates() {
- HexagonMCErrInfo errInfo;
// Check for proper use of new predicate registers.
- for (const auto& I : NewPreds) {
+ for (const auto &I : NewPreds) {
unsigned P = I;
if (!Defs.count(P) || LatePreds.count(P)) {
// Error out if the new predicate register is not defined,
// or defined "late"
// (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }").
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWP, P);
- addErrInfo(errInfo);
+ reportErrorNewValue(P);
return false;
}
}
// Check for proper use of auto-anded of predicate registers.
- for (const auto& I : LatePreds) {
+ for (const auto &I : LatePreds) {
unsigned P = I;
if (LatePreds.count(P) > 1 || Defs.count(P)) {
// Error out if predicate register defined "late" multiple times or
// defined late and regularly defined
// (e.g., "{ p3 = sp1loop0(...); p3 = cmp.eq(...) }".
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, P);
- addErrInfo(errInfo);
+ reportErrorRegisters(P);
return false;
}
}
@@ -360,15 +445,12 @@ bool HexagonMCChecker::checkPredicates() {
// Check legal use of new values.
bool HexagonMCChecker::checkNewValues() {
- HexagonMCErrInfo errInfo;
- memset(&errInfo, 0, sizeof(errInfo));
- for (auto& I : NewUses) {
+ for (auto &I : NewUses) {
unsigned R = I.first;
NewSense &US = I.second;
if (!hasValidNewValueDef(US, NewDefs[R])) {
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWV, R);
- addErrInfo(errInfo);
+ reportErrorNewValue(R);
return false;
}
}
@@ -376,25 +458,61 @@ bool HexagonMCChecker::checkNewValues() {
return true;
}
+bool HexagonMCChecker::checkRegistersReadOnly() {
+ for (auto I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ MCInst const &Inst = *I.getInst();
+ unsigned Defs = HexagonMCInstrInfo::getDesc(MCII, Inst).getNumDefs();
+ for (unsigned j = 0; j < Defs; ++j) {
+ MCOperand const &Operand = Inst.getOperand(j);
+ assert(Operand.isReg() && "Def is not a register");
+ unsigned Register = Operand.getReg();
+ if (ReadOnly.find(Register) != ReadOnly.end()) {
+ reportError(Inst.getLoc(), "Cannot write to read-only register `" +
+ llvm::Twine(RI.getName(Register)) + "'");
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+bool HexagonMCChecker::registerUsed(unsigned Register) {
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB))
+ for (unsigned j = HexagonMCInstrInfo::getDesc(MCII, I).getNumDefs(),
+ n = I.getNumOperands();
+ j < n; ++j) {
+ MCOperand const &Operand = I.getOperand(j);
+ if (Operand.isReg() && Operand.getReg() == Register)
+ return true;
+ }
+ return false;
+}
+
+void HexagonMCChecker::checkRegisterCurDefs() {
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
+ if (HexagonMCInstrInfo::isCVINew(MCII, I) &&
+ HexagonMCInstrInfo::getDesc(MCII, I).mayLoad()) {
+ unsigned Register = I.getOperand(0).getReg();
+ if (!registerUsed(Register))
+ reportWarning("Register `" + llvm::Twine(RI.getName(Register)) +
+ "' used with `.cur' "
+ "but not used in the same packet");
+ }
+ }
+}
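Illustrative packets for the new `.cur' diagnostic (Hexagon HVX assembly, examples only; exact mnemonics are assumptions):

    { v0.cur = vmem(r0++#1); v1 = vand(v0,v0) }  // ok: v0 read in the packet
    { v0.cur = vmem(r0++#1) }                    // warns: v0 never used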
+
// Check for legal register uses and definitions.
bool HexagonMCChecker::checkRegisters() {
- HexagonMCErrInfo errInfo;
// Check for proper register definitions.
- for (const auto& I : Defs) {
+ for (const auto &I : Defs) {
unsigned R = I.first;
- if (ReadOnly.count(R)) {
- // Error out for definitions of read-only registers.
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_READONLY, R);
- addErrInfo(errInfo);
- return false;
- }
if (isLoopRegister(R) && Defs.count(R) > 1 &&
(HexagonMCInstrInfo::isInnerLoop(MCB) ||
HexagonMCInstrInfo::isOuterLoop(MCB))) {
// Error out for definitions of loop registers at the end of a loop.
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_LOOP, R);
- addErrInfo(errInfo);
+ reportError("loop-setup and some branch instructions "
+ "cannot be in the same packet");
return false;
}
if (SoftDefs.count(R)) {
@@ -402,8 +520,7 @@ bool HexagonMCChecker::checkRegisters() {
// (e.g., "{ usr = r0; r0 = sfadd(...) }").
unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:.
unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
- addErrInfo(errInfo);
+ reportErrorRegisters(BadR);
return false;
}
if (!isPredicateRegister(R) && Defs[R].size() > 1) {
@@ -416,20 +533,18 @@ bool HexagonMCChecker::checkRegisters() {
// changes, conditional or not.
unsigned UsrR = Hexagon::USR;
unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R;
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR);
- addErrInfo(errInfo);
+ reportErrorRegisters(BadR);
return false;
}
// Check for multiple conditional register definitions.
- for (const auto& J : PM) {
+ for (const auto &J : PM) {
PredSense P = J;
// Check for multiple uses of the same condition.
if (PM.count(P) > 1) {
// Error out on conditional changes based on the same predicate
// (e.g., "{ if (!p0) r0 =...; if (!p0) r0 =... }").
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
- addErrInfo(errInfo);
+ reportErrorRegisters(R);
return false;
}
// Check for the use of the complementary condition.
@@ -437,44 +552,33 @@ bool HexagonMCChecker::checkRegisters() {
if (PM.count(P) && PM.size() > 2) {
// Error out on conditional changes based on the same predicate
// multiple times
- // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =... }; if (!p0) r0 =... }").
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R);
- addErrInfo(errInfo);
+ // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =...; if (!p0) r0 =... }").
+ reportErrorRegisters(R);
return false;
}
}
}
}
- // Check for use of current definitions.
- for (const auto& I : CurDefs) {
- unsigned R = I;
-
- if (!Uses.count(R)) {
- // Warn on an unused current definition.
- errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_CURRENT, R);
- addErrInfo(errInfo);
- return true;
- }
- }
-
// Check for use of temporary definitions.
- for (const auto& I : TmpDefs) {
+ for (const auto &I : TmpDefs) {
unsigned R = I;
if (!Uses.count(R)) {
// special case for vhist
bool vHistFound = false;
- for (auto const&HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- if(llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) == HexagonII::TypeCVI_HIST) {
- vHistFound = true; // vhist() implicitly uses ALL REGxx.tmp
+ for (auto const &HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+ if (llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) ==
+ HexagonII::TypeCVI_HIST) {
+ vHistFound = true; // vhist() implicitly uses ALL REGxx.tmp
break;
}
}
// Warn on an unused temporary definition.
if (vHistFound == false) {
- errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_TEMPORARY, R);
- addErrInfo(errInfo);
+ reportWarning("register `" + llvm::Twine(RI.getName(R)) +
+ "' used with `.tmp' "
+ "but not used in the same packet");
return true;
}
}
@@ -485,45 +589,25 @@ bool HexagonMCChecker::checkRegisters() {
// Check for legal use of solo insns.
bool HexagonMCChecker::checkSolo() {
- HexagonMCErrInfo errInfo;
- if (HexagonMCInstrInfo::isBundle(MCB) &&
- HexagonMCInstrInfo::bundleSize(MCB) > 1) {
- for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- if (llvm::HexagonMCInstrInfo::isSolo(MCII, *I.getInst())) {
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SOLO);
- addErrInfo(errInfo);
+ if (HexagonMCInstrInfo::bundleSize(MCB) > 1)
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
+ if (llvm::HexagonMCInstrInfo::isSolo(MCII, I)) {
+ reportError(I.getLoc(), "Instruction is marked `isSolo' and "
+ "cannot have other instructions in "
+ "the same packet");
return false;
}
}
- }
return true;
}
bool HexagonMCChecker::checkShuffle() {
- HexagonMCErrInfo errInfo;
- // Branch info is lost when duplexing. The unduplexed insns must be
- // checked and only branch errors matter for this case.
- HexagonMCShuffler MCS(MCII, STI, MCB);
- if (!MCS.check()) {
- if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
- errInfo.setShuffleError(MCS.getError());
- addErrInfo(errInfo);
- return false;
- }
- }
- HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
- if (!MCSDX.check()) {
- errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
- errInfo.setShuffleError(MCSDX.getError());
- addErrInfo(errInfo);
- return false;
- }
- return true;
+ HexagonMCShuffler MCSDX(Context, ReportErrors, MCII, STI, MCB);
+ return MCSDX.check();
}
-void HexagonMCChecker::compoundRegisterMap(unsigned& Register) {
+void HexagonMCChecker::compoundRegisterMap(unsigned &Register) {
switch (Register) {
default:
break;
@@ -555,7 +639,7 @@ void HexagonMCChecker::compoundRegisterMap(unsigned& Register) {
}
bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use,
- const NewSenseList &Defs) const {
+ const NewSenseList &Defs) const {
bool Strict = !RelaxNVChecks;
for (unsigned i = 0, n = Defs.size(); i < n; ++i) {
@@ -583,3 +667,30 @@ bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use,
return false;
}
+void HexagonMCChecker::reportErrorRegisters(unsigned Register) {
+ reportError("register `" + llvm::Twine(RI.getName(Register)) +
+ "' modified more than once");
+}
+
+void HexagonMCChecker::reportErrorNewValue(unsigned Register) {
+ reportError("register `" + llvm::Twine(RI.getName(Register)) +
+ "' used with `.new' "
+ "but not validly modified in the same packet");
+}
+
+void HexagonMCChecker::reportError(llvm::Twine const &Msg) {
+ reportError(MCB.getLoc(), Msg);
+}
+
+void HexagonMCChecker::reportError(SMLoc Loc, llvm::Twine const &Msg) {
+ if (ReportErrors)
+ Context.reportError(Loc, Msg);
+}
+
+void HexagonMCChecker::reportWarning(llvm::Twine const &Msg) {
+ if (ReportErrors) {
+ auto SM = Context.getSourceManager();
+ if (SM)
+ SM->PrintMessage(MCB.getLoc(), SourceMgr::DK_Warning, Msg);
+ }
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index 33e2279..027f78b 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -24,59 +24,14 @@ using namespace llvm;
namespace llvm {
class MCOperandInfo;
-typedef struct {
- unsigned Error, Warning, ShuffleError;
- unsigned Register;
-} ErrInfo_T;
-
-class HexagonMCErrInfo {
-public:
- enum {
- CHECK_SUCCESS = 0,
- // Errors.
- CHECK_ERROR_BRANCHES = 0x00001,
- CHECK_ERROR_NEWP = 0x00002,
- CHECK_ERROR_NEWV = 0x00004,
- CHECK_ERROR_REGISTERS = 0x00008,
- CHECK_ERROR_READONLY = 0x00010,
- CHECK_ERROR_LOOP = 0x00020,
- CHECK_ERROR_ENDLOOP = 0x00040,
- CHECK_ERROR_SOLO = 0x00080,
- CHECK_ERROR_SHUFFLE = 0x00100,
- CHECK_ERROR_NOSLOTS = 0x00200,
- CHECK_ERROR_UNKNOWN = 0x00400,
- // Warnings.
- CHECK_WARN_CURRENT = 0x10000,
- CHECK_WARN_TEMPORARY = 0x20000
- };
- ErrInfo_T s;
-
- void reset() {
- s.Error = CHECK_SUCCESS;
- s.Warning = CHECK_SUCCESS;
- s.ShuffleError = HexagonShuffler::SHUFFLE_SUCCESS;
- s.Register = Hexagon::NoRegister;
- };
- HexagonMCErrInfo() {
- reset();
- };
-
- void setError(unsigned e, unsigned r = Hexagon::NoRegister)
- { s.Error = e; s.Register = r; };
- void setWarning(unsigned w, unsigned r = Hexagon::NoRegister)
- { s.Warning = w; s.Register = r; };
- void setShuffleError(unsigned e) { s.ShuffleError = e; };
-};
-
/// Check for a valid bundle.
class HexagonMCChecker {
- /// Insn bundle.
- MCInst& MCB;
- MCInst& MCBDX;
- const MCRegisterInfo& RI;
+ MCContext &Context;
+ MCInst &MCB;
+ const MCRegisterInfo &RI;
MCInstrInfo const &MCII;
MCSubtargetInfo const &STI;
- bool bLoadErrInfo;
+ bool ReportErrors;
/// Set of definitions: register #, if predicated, if predicated true.
typedef std::pair<unsigned, bool> PredSense;
@@ -99,23 +54,23 @@ class HexagonMCChecker {
bool IsFloat, IsNVJ, Cond;
// The special-case "constructors":
static NewSense Jmp(bool isNVJ) {
- NewSense NS = { /*PredReg=*/ 0, /*IsFloat=*/ false, /*IsNVJ=*/ isNVJ,
- /*Cond=*/ false };
+ NewSense NS = {/*PredReg=*/0, /*IsFloat=*/false, /*IsNVJ=*/isNVJ,
+ /*Cond=*/false};
return NS;
}
static NewSense Use(unsigned PR, bool True) {
- NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ false, /*IsNVJ=*/ false,
- /*Cond=*/ True };
+ NewSense NS = {/*PredReg=*/PR, /*IsFloat=*/false, /*IsNVJ=*/false,
+ /*Cond=*/True};
return NS;
}
static NewSense Def(unsigned PR, bool True, bool Float) {
- NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ Float, /*IsNVJ=*/ false,
- /*Cond=*/ True };
+ NewSense NS = {/*PredReg=*/PR, /*IsFloat=*/Float, /*IsNVJ=*/false,
+ /*Cond=*/True};
return NS;
}
};
/// Set of definitions that produce new register:
- typedef llvm::SmallVector<NewSense,2> NewSenseList;
+ typedef llvm::SmallVector<NewSense, 2> NewSenseList;
typedef llvm::DenseMap<unsigned, NewSenseList>::iterator NewDefsIterator;
llvm::DenseMap<unsigned, NewSenseList> NewDefs;
@@ -123,10 +78,6 @@ class HexagonMCChecker {
typedef std::set<unsigned>::iterator SoftDefsIterator;
std::set<unsigned> SoftDefs;
- /// Set of current definitions committed to the register file.
- typedef std::set<unsigned>::iterator CurDefsIterator;
- std::set<unsigned> CurDefs;
-
/// Set of temporary definitions not committed to the register file.
typedef std::set<unsigned>::iterator TmpDefsIterator;
std::set<unsigned> TmpDefs;
@@ -151,67 +102,51 @@ class HexagonMCChecker {
typedef std::set<unsigned>::iterator ReadOnlyIterator;
std::set<unsigned> ReadOnly;
- std::queue<ErrInfo_T> ErrInfoQ;
- HexagonMCErrInfo CrntErrInfo;
-
- void getErrInfo() {
- if (bLoadErrInfo == true) {
- if (ErrInfoQ.empty()) {
- CrntErrInfo.reset();
- } else {
- CrntErrInfo.s = ErrInfoQ.front();
- ErrInfoQ.pop();
- }
- }
- bLoadErrInfo = false;
- }
-
void init();
- void init(MCInst const&);
+ void init(MCInst const &);
+ void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue);
+
+ bool registerUsed(unsigned Register);
// Checks performed.
bool checkBranches();
bool checkPredicates();
bool checkNewValues();
bool checkRegisters();
+ bool checkRegistersReadOnly();
+ bool checkEndloopBranches();
+ void checkRegisterCurDefs();
bool checkSolo();
bool checkShuffle();
bool checkSlots();
+ bool checkAXOK();
- static void compoundRegisterMap(unsigned&);
+ static void compoundRegisterMap(unsigned &);
bool isPredicateRegister(unsigned R) const {
- return (Hexagon::P0 == R || Hexagon::P1 == R ||
- Hexagon::P2 == R || Hexagon::P3 == R);
+ return (Hexagon::P0 == R || Hexagon::P1 == R || Hexagon::P2 == R ||
+ Hexagon::P3 == R);
};
bool isLoopRegister(unsigned R) const {
- return (Hexagon::SA0 == R || Hexagon::LC0 == R ||
- Hexagon::SA1 == R || Hexagon::LC1 == R);
+ return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R ||
+ Hexagon::LC1 == R);
};
- bool hasValidNewValueDef(const NewSense &Use,
- const NewSenseList &Defs) const;
+ bool hasValidNewValueDef(const NewSense &Use, const NewSenseList &Defs) const;
- public:
- explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx,
- const MCRegisterInfo& ri);
-
- bool check();
-
- /// add a new error/warning
- void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); };
-
- /// Return the error code for the last operation in the insn bundle.
- unsigned getError() { getErrInfo(); return CrntErrInfo.s.Error; };
- unsigned getWarning() { getErrInfo(); return CrntErrInfo.s.Warning; };
- unsigned getShuffleError() { getErrInfo(); return CrntErrInfo.s.ShuffleError; };
- unsigned getErrRegister() { getErrInfo(); return CrntErrInfo.s.Register; };
- bool getNextErrInfo() {
- bLoadErrInfo = true;
- return (ErrInfoQ.empty()) ? false : (getErrInfo(), true);
- }
+public:
+ explicit HexagonMCChecker(MCContext &Context, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &mcb,
+ const MCRegisterInfo &ri, bool ReportErrors = true);
+
+ bool check(bool FullCheck = true);
+ void reportErrorRegisters(unsigned Register);
+ void reportErrorNewValue(unsigned Register);
+ void reportError(SMLoc Loc, llvm::Twine const &Msg);
+ void reportError(llvm::Twine const &Msg);
+ void reportWarning(llvm::Twine const &Msg);
};
-}
+} // namespace llvm
#endif // HEXAGONMCCHECKER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 2645a17..50f00d1 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -7,10 +7,10 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/HexagonMCCodeEmitter.h"
#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonFixupKinds.h"
-#include "MCTargetDesc/HexagonMCCodeEmitter.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
@@ -35,38 +35,40 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
MCContext &aMCT)
: MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
- Extended(new bool(false)), CurrentBundle(new MCInst const *) {}
+ Extended(new bool(false)), CurrentBundle(new MCInst const *),
+ CurrentIndex(new size_t(0)) {}
-uint32_t HexagonMCCodeEmitter::parseBits(size_t Instruction, size_t Last,
+uint32_t HexagonMCCodeEmitter::parseBits(size_t Last,
MCInst const &MCB,
MCInst const &MCI) const {
bool Duplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
- if (Instruction == 0) {
+ if (*CurrentIndex == 0) {
if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
assert(!Duplex);
- assert(Instruction != Last);
+ assert(*CurrentIndex != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
- if (Instruction == 1) {
+ if (*CurrentIndex == 1) {
if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
assert(!Duplex);
- assert(Instruction != Last);
+ assert(*CurrentIndex != Last);
return HexagonII::INST_PARSE_LOOP_END;
}
}
if (Duplex) {
- assert(Instruction == Last);
+ assert(*CurrentIndex == Last);
return HexagonII::INST_PARSE_DUPLEX;
}
- if(Instruction == Last)
+ if (*CurrentIndex == Last)
return HexagonII::INST_PARSE_PACKET_END;
return HexagonII::INST_PARSE_NOT_END;
}
-void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
+/// encodeInstruction - Emit the bundle.
+void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- MCSubtargetInfo const &STI) const {
+ const MCSubtargetInfo &STI) const {
MCInst &HMB = const_cast<MCInst &>(MI);
assert(HexagonMCInstrInfo::isBundle(HMB));
@@ -74,7 +76,7 @@ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
*Addend = 0;
*Extended = false;
*CurrentBundle = &MI;
- size_t Instruction = 0;
+ *CurrentIndex = 0;
size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
MCInst &HMI = const_cast<MCInst &>(*I.getInst());
@@ -82,11 +84,10 @@ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
computeAvailableFeatures(STI.getFeatureBits()));
EncodeSingleInstruction(HMI, OS, Fixups, STI,
- parseBits(Instruction, Last, HMB, HMI),
- Instruction);
+ parseBits(Last, HMB, HMI));
*Extended = HexagonMCInstrInfo::isImmext(HMI);
*Addend += HEXAGON_INSTR_SIZE;
- ++Instruction;
+ ++*CurrentIndex;
}
return;
}
@@ -107,165 +108,44 @@ static bool RegisterMatches(unsigned Consumer, unsigned Producer,
/// EncodeSingleInstruction - Emit a single instruction.
void HexagonMCCodeEmitter::EncodeSingleInstruction(
const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI, uint32_t Parse, size_t Index) const {
- MCInst HMB = MI;
- assert(!HexagonMCInstrInfo::isBundle(HMB));
+ const MCSubtargetInfo &STI, uint32_t Parse) const {
+ assert(!HexagonMCInstrInfo::isBundle(MI));
uint64_t Binary;
- // Compound instructions are limited to using registers 0-7 and 16-23
- // and here we make a map 16-23 to 8-15 so they can be correctly encoded.
- static unsigned RegMap[8] = {Hexagon::R8, Hexagon::R9, Hexagon::R10,
- Hexagon::R11, Hexagon::R12, Hexagon::R13,
- Hexagon::R14, Hexagon::R15};
-
// Pseudo instructions don't get encoded and shouldn't be here
// in the first place!
- assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() &&
+ assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo() &&
"pseudo-instruction found");
DEBUG(dbgs() << "Encoding insn"
- " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+ " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
"\n");
- if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) {
- for (unsigned i = 0; i < HMB.getNumOperands(); ++i)
- if (HMB.getOperand(i).isReg()) {
- unsigned Reg =
- MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg());
- if ((Reg <= 23) && (Reg >= 16))
- HMB.getOperand(i).setReg(RegMap[Reg - 16]);
- }
- }
-
- if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) {
- // Calculate the new value distance to the associated producer
- MCOperand &MCO =
- HMB.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, HMB));
- unsigned SOffset = 0;
- unsigned VOffset = 0;
- unsigned Register = MCO.getReg();
- unsigned Register1;
- unsigned Register2;
- auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
- auto i = Instructions.begin() + Index - 1;
- for (;; --i) {
- assert(i != Instructions.begin() - 1 && "Couldn't find producer");
- MCInst const &Inst = *i->getInst();
- if (HexagonMCInstrInfo::isImmext(Inst))
- continue;
- ++SOffset;
- if (HexagonMCInstrInfo::isVector(MCII, Inst))
- // Vector instructions don't count scalars
- ++VOffset;
- Register1 =
- HexagonMCInstrInfo::hasNewValue(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- Register2 =
- HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
- ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
- : static_cast<unsigned>(Hexagon::NoRegister);
- if (!RegisterMatches(Register, Register1, Register2))
- // This isn't the register we're looking for
- continue;
- if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
- // Producer is unpredicated
- break;
- assert(HexagonMCInstrInfo::isPredicated(MCII, HMB) &&
- "Unpredicated consumer depending on predicated producer");
- if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
- HexagonMCInstrInfo::isPredicatedTrue(MCII, HMB))
- // Producer predicate sense matched ours
- break;
- }
- // Hexagon PRM 10.11 Construct Nt from distance
- unsigned Offset =
- HexagonMCInstrInfo::isVector(MCII, HMB) ? VOffset : SOffset;
- Offset <<= 1;
- Offset |=
- HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
- MCO.setReg(Offset + Hexagon::R0);
- }
-
- Binary = getBinaryCodeForInstr(HMB, Fixups, STI);
+ Binary = getBinaryCodeForInstr(MI, Fixups, STI);
// Check for unimplemented instructions. Immediate extenders
// are encoded as zero, so they need to be accounted for.
- if ((!Binary) &&
- ((HMB.getOpcode() != DuplexIClass0) && (HMB.getOpcode() != A4_ext) &&
- (HMB.getOpcode() != A4_ext_b) && (HMB.getOpcode() != A4_ext_c) &&
- (HMB.getOpcode() != A4_ext_g))) {
+ if (!Binary &&
+ MI.getOpcode() != DuplexIClass0 &&
+ MI.getOpcode() != A4_ext) {
DEBUG(dbgs() << "Unimplemented inst: "
- " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+ " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
"\n");
llvm_unreachable("Unimplemented Instruction");
}
Binary |= Parse;
// if we need to emit a duplexed instruction
- if (HMB.getOpcode() >= Hexagon::DuplexIClass0 &&
- HMB.getOpcode() <= Hexagon::DuplexIClassF) {
+ if (MI.getOpcode() >= Hexagon::DuplexIClass0 &&
+ MI.getOpcode() <= Hexagon::DuplexIClassF) {
assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
"Emitting duplex without duplex parse bits");
- unsigned dupIClass;
- switch (HMB.getOpcode()) {
- case Hexagon::DuplexIClass0:
- dupIClass = 0;
- break;
- case Hexagon::DuplexIClass1:
- dupIClass = 1;
- break;
- case Hexagon::DuplexIClass2:
- dupIClass = 2;
- break;
- case Hexagon::DuplexIClass3:
- dupIClass = 3;
- break;
- case Hexagon::DuplexIClass4:
- dupIClass = 4;
- break;
- case Hexagon::DuplexIClass5:
- dupIClass = 5;
- break;
- case Hexagon::DuplexIClass6:
- dupIClass = 6;
- break;
- case Hexagon::DuplexIClass7:
- dupIClass = 7;
- break;
- case Hexagon::DuplexIClass8:
- dupIClass = 8;
- break;
- case Hexagon::DuplexIClass9:
- dupIClass = 9;
- break;
- case Hexagon::DuplexIClassA:
- dupIClass = 10;
- break;
- case Hexagon::DuplexIClassB:
- dupIClass = 11;
- break;
- case Hexagon::DuplexIClassC:
- dupIClass = 12;
- break;
- case Hexagon::DuplexIClassD:
- dupIClass = 13;
- break;
- case Hexagon::DuplexIClassE:
- dupIClass = 14;
- break;
- case Hexagon::DuplexIClassF:
- dupIClass = 15;
- break;
- default:
- llvm_unreachable("Unimplemented DuplexIClass");
- break;
- }
+ unsigned dupIClass = MI.getOpcode() - Hexagon::DuplexIClass0;
// 29 is the bit position.
// 0b1110 =0xE bits are masked off and down shifted by 1 bit.
// Last bit is moved to bit position 13
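    // Worked example (illustrative): dupIClass = 0xB (0b1011) places the
    // upper bits 0b101 in Binary[31:29] and the low bit in Binary[13].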
Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
- const MCInst *subInst0 = HMB.getOperand(0).getInst();
- const MCInst *subInst1 = HMB.getOperand(1).getInst();
+ const MCInst *subInst0 = MI.getOperand(0).getInst();
+ const MCInst *subInst1 = MI.getOperand(1).getInst();
// get subinstruction slot 0
unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
@@ -293,14 +173,13 @@ void raise_relocation_error(unsigned bits, unsigned kind) {
/// getFixupNoBits - Some insns are not extended and thus have no
/// bits. These cases require a more brute force method for determining
/// the correct relocation.
-namespace {
-Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
- const MCOperand &MO,
- const MCSymbolRefExpr::VariantKind kind) {
+Hexagon::Fixups HexagonMCCodeEmitter::getFixupNoBits(
+ MCInstrInfo const &MCII, const MCInst &MI, const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind kind) const {
const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
unsigned insnType = llvm::HexagonMCInstrInfo::getType(MCII, MI);
- if (insnType == HexagonII::TypePREFIX) {
+ if (insnType == HexagonII::TypeEXTENDER) {
switch (kind) {
case MCSymbolRefExpr::VK_GOTREL:
return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
@@ -319,11 +198,26 @@ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
case MCSymbolRefExpr::VK_Hexagon_PCREL:
- case MCSymbolRefExpr::VK_None:
- if (MCID.isBranch())
- return Hexagon::fixup_Hexagon_B32_PCREL_X;
- else
- return Hexagon::fixup_Hexagon_32_6_X;
+ return Hexagon::fixup_Hexagon_B32_PCREL_X;
+ case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+ return Hexagon::fixup_Hexagon_GD_PLT_B32_PCREL_X;
+ case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+ return Hexagon::fixup_Hexagon_LD_PLT_B32_PCREL_X;
+
+ case MCSymbolRefExpr::VK_None: {
+ auto Insts = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ for (auto I = Insts.begin(), N = Insts.end(); I != N; ++I) {
+ if (I->getInst() == &MI) {
+ const MCInst &NextI = *(I+1)->getInst();
+ const MCInstrDesc &D = HexagonMCInstrInfo::getDesc(MCII, NextI);
+ if (D.isBranch() || D.isCall() ||
+ HexagonMCInstrInfo::getType(MCII, NextI) == HexagonII::TypeCR)
+ return Hexagon::fixup_Hexagon_B32_PCREL_X;
+ return Hexagon::fixup_Hexagon_32_6_X;
+ }
+ }
+ raise_relocation_error(0, kind);
+ }
default:
raise_relocation_error(0, kind);
}
@@ -406,7 +300,6 @@ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
}
llvm_unreachable("Relocation exit not taken");
}
-}
namespace llvm {
extern const MCInstrDesc HexagonInsts[];
@@ -430,6 +323,8 @@ namespace {
case fixup_Hexagon_PLT_B22_PCREL:
case fixup_Hexagon_GD_PLT_B22_PCREL:
case fixup_Hexagon_LD_PLT_B22_PCREL:
+ case fixup_Hexagon_GD_PLT_B22_PCREL_X:
+ case fixup_Hexagon_LD_PLT_B22_PCREL_X:
case fixup_Hexagon_6_PCREL_X:
return true;
default:
@@ -450,7 +345,8 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
int64_t Value;
if (ME->evaluateAsAbsolute(Value))
return Value;
- assert(ME->getKind() == MCExpr::SymbolRef || ME->getKind() == MCExpr::Binary);
+ assert(ME->getKind() == MCExpr::SymbolRef ||
+ ME->getKind() == MCExpr::Binary);
if (ME->getKind() == MCExpr::Binary) {
MCBinaryExpr const *Binary = cast<MCBinaryExpr>(ME);
getExprOpValue(MI, MO, Binary->getLHS(), Fixups, STI);
@@ -525,10 +421,12 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
case 22:
switch (kind) {
case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
- FixupKind = Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL;
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL_X
+ : Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL;
break;
case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
- FixupKind = Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL;
+ FixupKind = *Extended ? Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL_X
+ : Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL;
break;
case MCSymbolRefExpr::VK_None:
FixupKind = *Extended ? Hexagon::fixup_Hexagon_B22_PCREL_X
@@ -578,10 +476,33 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
} else
switch (kind) {
case MCSymbolRefExpr::VK_None: {
- if (HexagonMCInstrInfo::s23_2_reloc(*MO.getExpr()))
- FixupKind = Hexagon::fixup_Hexagon_23_REG;
+ if (HexagonMCInstrInfo::s27_2_reloc(*MO.getExpr()))
+ FixupKind = Hexagon::fixup_Hexagon_27_REG;
else
- raise_relocation_error(bits, kind);
+ if (MCID.mayStore() || MCID.mayLoad()) {
+ for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses;
+ ++ImpUses) {
+ if (*ImpUses != Hexagon::GP)
+ continue;
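+          // Assumption (Hexagon ELF ABI): R_HEX_GPREL16_n shifts the
+          // relocated value right by n, so the fixup chosen below must
+          // match the memory access size.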
+ switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) {
+ case HexagonII::MemAccessSize::ByteAccess:
+ FixupKind = fixup_Hexagon_GPREL16_0;
+ break;
+ case HexagonII::MemAccessSize::HalfWordAccess:
+ FixupKind = fixup_Hexagon_GPREL16_1;
+ break;
+ case HexagonII::MemAccessSize::WordAccess:
+ FixupKind = fixup_Hexagon_GPREL16_2;
+ break;
+ case HexagonII::MemAccessSize::DoubleWordAccess:
+ FixupKind = fixup_Hexagon_GPREL16_3;
+ break;
+ default:
+ raise_relocation_error(bits, kind);
+ }
+ }
+ } else
+ raise_relocation_error(bits, kind);
break;
}
case MCSymbolRefExpr::VK_DTPREL:
@@ -681,6 +602,12 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
FixupKind = Hexagon::fixup_Hexagon_LD_GOT_11_X;
break;
+ case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+ FixupKind = Hexagon::fixup_Hexagon_GD_PLT_B22_PCREL_X;
+ break;
+ case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+ FixupKind = Hexagon::fixup_Hexagon_LD_PLT_B22_PCREL_X;
+ break;
case MCSymbolRefExpr::VK_None:
FixupKind = Hexagon::fixup_Hexagon_11_X;
break;
@@ -795,19 +722,72 @@ unsigned
HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const {
+#ifndef NDEBUG
+ size_t OperandNumber = ~0U;
+ for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i)
+ if (&MI.getOperand(i) == &MO) {
+ OperandNumber = i;
+ break;
+ }
+ assert((OperandNumber != ~0U) && "Operand not found");
+#endif
+
+ if (HexagonMCInstrInfo::isNewValue(MCII, MI) &&
+ &MO == &MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI))) {
+ // Calculate the new value distance to the associated producer
+ MCOperand const &MCO =
+ MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI));
+ unsigned SOffset = 0;
+ unsigned VOffset = 0;
+ unsigned Register = MCO.getReg();
+ unsigned Register1;
+ unsigned Register2;
+ auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+ auto i = Instructions.begin() + *CurrentIndex - 1;
+ for (;; --i) {
+ assert(i != Instructions.begin() - 1 && "Couldn't find producer");
+ MCInst const &Inst = *i->getInst();
+ if (HexagonMCInstrInfo::isImmext(Inst))
+ continue;
+ ++SOffset;
+ if (HexagonMCInstrInfo::isVector(MCII, Inst))
+ // Vector instructions don't count scalars
+ ++VOffset;
+ Register1 =
+ HexagonMCInstrInfo::hasNewValue(MCII, Inst)
+ ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
+ : static_cast<unsigned>(Hexagon::NoRegister);
+ Register2 =
+ HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
+ ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
+ : static_cast<unsigned>(Hexagon::NoRegister);
+ if (!RegisterMatches(Register, Register1, Register2))
+ // This isn't the register we're looking for
+ continue;
+ if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+ // Producer is unpredicated
+ break;
+ assert(HexagonMCInstrInfo::isPredicated(MCII, MI) &&
+ "Unpredicated consumer depending on predicated producer");
+ if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
+ HexagonMCInstrInfo::isPredicatedTrue(MCII, MI))
+ // Producer predicate sense matched ours
+ break;
+ }
+ // Hexagon PRM 10.11 Construct Nt from distance
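+      // Worked example (illustrative): a producer one instruction back,
+      // with no subregister select bit, encodes as (1 << 1) | 0 = 0b010.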
+ unsigned Offset =
+ HexagonMCInstrInfo::isVector(MCII, MI) ? VOffset : SOffset;
+ Offset <<= 1;
+ Offset |=
+ HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
+ return Offset;
+ }
assert(!MO.isImm());
if (MO.isReg()) {
unsigned Reg = MO.getReg();
- if (HexagonMCInstrInfo::isSubInstruction(MI))
+ if (HexagonMCInstrInfo::isSubInstruction(MI) ||
+ llvm::HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCJ)
return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg);
- switch(MI.getOpcode()){
- case Hexagon::A2_tfrrcr:
- case Hexagon::A2_tfrcrr:
- if(Reg == Hexagon::M0)
- Reg = Hexagon::C6;
- if(Reg == Hexagon::M1)
- Reg = Hexagon::C7;
- }
return MCT.getRegisterInfo()->getEncodingValue(Reg);
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 8e0667d..c3a4bee 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -15,6 +15,7 @@
#ifndef HEXAGONMCCODEEMITTER_H
#define HEXAGONMCCODEEMITTER_H
+#include "MCTargetDesc/HexagonFixupKinds.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -31,18 +32,22 @@ class HexagonMCCodeEmitter : public MCCodeEmitter {
std::unique_ptr<unsigned> Addend;
std::unique_ptr<bool> Extended;
std::unique_ptr<MCInst const *> CurrentBundle;
+ std::unique_ptr<size_t> CurrentIndex;
// helper routine for getMachineOpValue()
unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+ const MCOperand &MO,
+ const MCSymbolRefExpr::VariantKind kind) const;
+
public:
HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
// Return parse bits for instruction `MCI' inside bundle `MCB'
- uint32_t parseBits(size_t Instruction, size_t Last, MCInst const &MCB,
- MCInst const &MCI) const;
+ uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
void encodeInstruction(MCInst const &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -51,7 +56,7 @@ public:
void EncodeSingleInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI,
- uint32_t Parse, size_t Index) const;
+ uint32_t Parse) const;
// \brief TableGen'erated function for getting the
// binary encoding for an instruction.
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 9a09a17..127c97e 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -14,6 +14,7 @@
#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
@@ -396,7 +397,7 @@ static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
/// is found, update the contents of the bundle with the compound insn.
/// If a compound instruction is found then the bundle will have one
/// additional slot.
-void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
+void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCI) {
assert(HexagonMCInstrInfo::isBundle(MCI) &&
"Non-Bundle where Bundle expected");
@@ -405,8 +406,24 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
if (MCI.size() < 2)
return;
+ bool StartedValid = llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI);
+
+ // Work on a copy of the bundle; this keeps the order of jump instructions.
+ MCInst CheckList(MCI);
+
// Look for compounds until none are found; only update the bundle when
// a compound is found.
- while (lookForCompound(MCII, Context, MCI))
- ;
+ while (lookForCompound(MCII, Context, CheckList)) {
+ // Keep the original bundle around in case the shuffle fails.
+ MCInst OriginalBundle(MCI);
+
+ // Need to update the bundle.
+ MCI = CheckList;
+
+ if (StartedValid &&
+ !llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI)) {
+ DEBUG(dbgs() << "Found ERROR\n");
+ MCI = OriginalBundle;
+ }
+ }
}
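The loop above applies a speculative transform and keeps it only if the packet still shuffles. The idiom in miniature (a sketch reusing the names from this hunk, with `Bundle` standing in for the caller's packet MCInst):

    // Sketch of the transform-validate-rollback idiom in tryCompound.
    MCInst Working(Bundle);                  // mutate a copy first
    while (lookForCompound(MCII, Context, Working)) {
      MCInst Saved(Bundle);                  // snapshot before committing
      Bundle = Working;                      // accept the compound
      if (StartedValid &&
          !HexagonMCShuffle(Context, false, MCII, STI, Bundle))
        Bundle = Saved;                      // compound broke the packet: undo
    }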
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 413f052..c7114c7 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -262,6 +263,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
case Hexagon::EH_RETURN_JMPR:
case Hexagon::J2_jumpr:
+ case Hexagon::PS_jmpret:
// jumpr r31
// Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
DstReg = MCI.getOperand(0).getReg();
@@ -275,6 +277,12 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
case Hexagon::J2_jumprfnew:
case Hexagon::J2_jumprtnewpt:
case Hexagon::J2_jumprfnewpt:
+ case Hexagon::PS_jmprett:
+ case Hexagon::PS_jmpretf:
+ case Hexagon::PS_jmprettnew:
+ case Hexagon::PS_jmpretfnew:
+ case Hexagon::PS_jmprettnewpt:
+ case Hexagon::PS_jmpretfnewpt:
DstReg = MCI.getOperand(1).getReg();
SrcReg = MCI.getOperand(0).getReg();
// [if ([!]p0[.new])] jumpr r31
@@ -284,15 +292,10 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
}
break;
case Hexagon::L4_return_t:
-
case Hexagon::L4_return_f:
-
case Hexagon::L4_return_tnew_pnt:
-
case Hexagon::L4_return_fnew_pnt:
-
case Hexagon::L4_return_tnew_pt:
-
case Hexagon::L4_return_fnew_pt:
// [if ([!]p0[.new])] dealloc_return
SrcReg = MCI.getOperand(0).getReg();
@@ -565,7 +568,8 @@ bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
MCInst const &MIa, bool ExtendedA,
MCInst const &MIb, bool ExtendedB,
- bool bisReversable) {
+ bool bisReversable,
+ MCSubtargetInfo const &STI) {
// Slot 1 cannot be extended in duplexes PRM 10.5
if (ExtendedA)
return false;
@@ -625,11 +629,16 @@ bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
return false;
}
- // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
- // therefore, not duplexable if slot 1 is a store, and slot 0 is not.
- if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
- if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
- return false;
+ if (STI.getCPU().equals_lower("hexagonv4") ||
+ STI.getCPU().equals_lower("hexagonv5") ||
+ STI.getCPU().equals_lower("hexagonv55") ||
+ STI.getCPU().equals_lower("hexagonv60")) {
+ // If a store appears, it must be in slot 0 (MIa) first, then in slot 1 (MIb);
+ // therefore, it is not duplexable if slot 1 is a store and slot 0 is not.
+ if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
+ if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
+ return false;
+ }
}
return (isDuplexPairMatch(MIaG, MIbG));
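The CPU check introduced above gates the store-ordering rule to older cores: only through V60 must a slot-1 store be paired with a slot-0 store. As a standalone predicate (a sketch, assuming LLVM's StringRef as already used in the hunk; helper name hypothetical):

    // Sketch: cores through V60 keep the slot-0-store-first duplex rule.
    bool needsStoreOrderRule(StringRef CPU) {
      return CPU.equals_lower("hexagonv4") || CPU.equals_lower("hexagonv5") ||
             CPU.equals_lower("hexagonv55") || CPU.equals_lower("hexagonv60");
    }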
@@ -692,32 +701,32 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
break;
case Hexagon::A2_addi:
Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value);
- assert(Absolute);(void)Absolute;
- if (Value == 1) {
- Result.setOpcode(Hexagon::SA1_inc);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 1);
- break;
- } // 1,2 SUBInst $Rd = add($Rs, #1)
- else if (Value == -1) {
- Result.setOpcode(Hexagon::SA1_dec);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 1);
- break;
- } // 1,2 SUBInst $Rd = add($Rs,#-1)
- else if (Inst.getOperand(1).getReg() == Hexagon::R29) {
- Result.setOpcode(Hexagon::SA1_addsp);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 2);
- break;
- } // 1,3 SUBInst $Rd = add(r29, #$u6_2)
- else {
- Result.setOpcode(Hexagon::SA1_addi);
- addOps(Result, Inst, 0);
- addOps(Result, Inst, 1);
- addOps(Result, Inst, 2);
- break;
- } // 1,2,3 SUBInst $Rx = add($Rx, #$s7)
+ if (Absolute) {
+ if (Value == 1) {
+ Result.setOpcode(Hexagon::SA1_inc);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ break;
+ } // 1,2 SUBInst $Rd = add($Rs, #1)
+ if (Value == -1) {
+ Result.setOpcode(Hexagon::SA1_dec);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break;
+ } // 1,2 SUBInst $Rd = add($Rs,#-1)
+ if (Inst.getOperand(1).getReg() == Hexagon::R29) {
+ Result.setOpcode(Hexagon::SA1_addsp);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 2);
+ break;
+ } // 1,3 SUBInst $Rd = add(r29, #$u6_2)
+ }
+ Result.setOpcode(Hexagon::SA1_addi);
+ addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
+ addOps(Result, Inst, 2);
+ break; // 1,2,3 SUBInst $Rx = add($Rx, #$s7)
case Hexagon::A2_add:
Result.setOpcode(Hexagon::SA1_addrx);
addOps(Result, Inst, 0);
@@ -806,20 +815,27 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
break; // none SUBInst deallocframe
case Hexagon::EH_RETURN_JMPR:
case Hexagon::J2_jumpr:
+ case Hexagon::PS_jmpret:
Result.setOpcode(Hexagon::SL2_jumpr31);
break; // none SUBInst jumpr r31
case Hexagon::J2_jumprf:
+ case Hexagon::PS_jmpretf:
Result.setOpcode(Hexagon::SL2_jumpr31_f);
break; // none SUBInst if (!p0) jumpr r31
case Hexagon::J2_jumprfnew:
case Hexagon::J2_jumprfnewpt:
+ case Hexagon::PS_jmpretfnewpt:
+ case Hexagon::PS_jmpretfnew:
Result.setOpcode(Hexagon::SL2_jumpr31_fnew);
break; // none SUBInst if (!p0.new) jumpr:nt r31
case Hexagon::J2_jumprt:
+ case Hexagon::PS_jmprett:
Result.setOpcode(Hexagon::SL2_jumpr31_t);
break; // none SUBInst if (p0) jumpr r31
case Hexagon::J2_jumprtnew:
case Hexagon::J2_jumprtnewpt:
+ case Hexagon::PS_jmprettnewpt:
+ case Hexagon::PS_jmprettnew:
Result.setOpcode(Hexagon::SL2_jumpr31_tnew);
break; // none SUBInst if (p0.new) jumpr:nt r31
case Hexagon::L2_loadrb_io:
@@ -966,6 +982,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
if (Absolute && Value == -1) {
Result.setOpcode(Hexagon::SA1_setin1);
addOps(Result, Inst, 0);
+ addOps(Result, Inst, 1);
break; // 2 1 SUBInst $Rd = #-1
} else {
Result.setOpcode(Hexagon::SA1_seti);
@@ -1005,6 +1022,7 @@ static bool isStoreInst(unsigned opCode) {
SmallVector<DuplexCandidate, 8>
HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
MCInst const &MCB) {
assert(isBundle(MCB));
SmallVector<DuplexCandidate, 8> duplexToTry;
@@ -1033,7 +1051,7 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
*MCB.getOperand(j).getInst(),
HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
- bisReversable)) {
+ bisReversable, STI)) {
// Get iClass.
unsigned iClass = iClassOfDuplexPair(
getDuplexCandidateGroup(*MCB.getOperand(k).getInst()),
@@ -1058,7 +1076,7 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
*MCB.getOperand(k).getInst(),
HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
- bisReversable)) {
+ bisReversable, STI)) {
// Get iClass.
unsigned iClass = iClassOfDuplexPair(
getDuplexCandidateGroup(*MCB.getOperand(j).getInst()),
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 226470c..47007e0 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -29,7 +30,6 @@
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
@@ -37,30 +37,19 @@
using namespace llvm;
-static cl::opt<unsigned>
- GPSize("gpsize", cl::NotHidden,
- cl::desc("Global Pointer Addressing Size. The default size is 8."),
- cl::Prefix, cl::init(8));
-
-void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
- const MCSubtargetInfo &STI) {
- MCInst HMI = HexagonMCInstrInfo::createBundle();
- MCInst *MCB;
-
- if (MCK.getOpcode() != Hexagon::BUNDLE) {
- HMI.addOperand(MCOperand::createInst(&MCK));
- MCB = &HMI;
- } else
- MCB = const_cast<MCInst *>(&MCK);
-
- // Examines packet and pad the packet, if needed, when an
- // end-loop is in the bundle.
- HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
- HexagonMCShuffle(*MCII, STI, *MCB);
-
- assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
+static cl::opt<unsigned> GPSize
+ ("gpsize", cl::NotHidden,
+ cl::desc("Global Pointer Addressing Size. The default size is 8."),
+ cl::Prefix,
+ cl::init(8));
+
+void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
+ const MCSubtargetInfo &STI, bool) {
+ assert(MCB.getOpcode() == Hexagon::BUNDLE);
+ assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE);
+ assert(HexagonMCInstrInfo::bundleSize(MCB) > 0);
bool Extended = false;
- for (auto &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+ for (auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
MCInst *MCI = const_cast<MCInst *>(I.getInst());
if (Extended) {
if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
@@ -77,11 +66,12 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
// At this point, MCB is a bundle
// Iterate through the bundle and assign addends for the instructions
- for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
MCInst *MCI = const_cast<MCInst *>(I.getInst());
EmitSymbol(*MCI);
}
- MCObjectStreamer::EmitInstruction(*MCB, STI);
+
+ MCObjectStreamer::EmitInstruction(MCB, STI);
}
void HexagonMCELFStreamer::EmitSymbol(const MCInst &Inst) {
@@ -119,9 +109,11 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
MCSectionSubPair P = getCurrentSection();
SwitchSection(&Section);
- EmitValueToAlignment(ByteAlignment, 0, 1, 0);
- EmitLabel(Symbol);
- EmitZeros(Size);
+ if (ELFSymbol->isUndefined(false)) {
+ EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+ EmitLabel(Symbol);
+ EmitZeros(Size);
+ }
// Update the maximum alignment of the section if necessary.
if (ByteAlignment > Section.getAlignment())
@@ -144,9 +136,10 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
ELFSymbol->setSize(MCConstantExpr::create(Size, getContext()));
}
-void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
- MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment,
- unsigned AccessSize) {
+void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
+ uint64_t Size,
+ unsigned ByteAlignment,
+ unsigned AccessSize) {
getAssembler().registerSymbol(*Symbol);
auto ELFSymbol = cast<MCSymbolELF>(Symbol);
ELFSymbol->setBinding(ELF::STB_LOCAL);
@@ -154,11 +147,12 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
}
-namespace llvm {
-MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_pwrite_stream &OS, MCCodeEmitter *CE) {
- return new HexagonMCELFStreamer(Context, MAB, OS, CE);
-}
+namespace llvm {
+ MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *CE) {
+ return new HexagonMCELFStreamer(Context, MAB, OS, CE);
+ }
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
index 0ac1a68..024dff1 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -27,7 +27,15 @@ public:
: MCELFStreamer(Context, TAB, OS, Emitter),
MCII(createHexagonMCInstrInfo()) {}
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ HexagonMCELFStreamer(MCContext &Context,
+ MCAsmBackend &TAB,
+ raw_pwrite_stream &OS, MCCodeEmitter *Emitter,
+ MCAssembler *Assembler) :
+ MCELFStreamer(Context, TAB, OS, Emitter),
+ MCII (createHexagonMCInstrInfo()) {}
+
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override;
void EmitSymbol(const MCInst &Inst);
void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
@@ -36,8 +44,9 @@ public:
unsigned ByteAlignment, unsigned AccessSize);
};
-MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
- raw_pwrite_stream &OS, MCCodeEmitter *CE);
+MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
+ MCAsmBackend &MAB, raw_pwrite_stream &OS,
+ MCCodeEmitter *CE);
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
index e93906a..9fbe299 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -11,7 +11,9 @@
#include "HexagonMCExpr.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -36,7 +38,47 @@ MCFragment *llvm::HexagonMCExpr::findAssociatedFragment() const {
return Expr->findAssociatedFragment();
}
-void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Cannot handle nested target MCExpr");
+ break;
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *be = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(be->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(be->getRHS(), Asm);
+ break;
+ }
+ case MCExpr::SymbolRef: {
+ const MCSymbolRefExpr &symRef = *cast<MCSymbolRefExpr>(Expr);
+ switch (symRef.getKind()) {
+ default:
+ return;
+ case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+ case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+ case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+ case MCSymbolRefExpr::VK_Hexagon_IE:
+ case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+ case MCSymbolRefExpr::VK_TPREL:
+ break;
+ }
+ cast<MCSymbolELF>(symRef.getSymbol()).setType(ELF::STT_TLS);
+ break;
+ }
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ auto expr = getExpr();
+ fixELFSymbolsInTLSFixupsImpl(expr, Asm);
+}
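The walk added above recurses through Binary and Unary nodes and forces the ELF type of any TLS-model symbol reference to STT_TLS. The set of variants it recognizes reads naturally as a predicate (a sketch using the kinds from this hunk; helper name hypothetical):

    // Sketch: true for the TLS-model variant kinds handled above.
    static bool isHexagonTLSKind(MCSymbolRefExpr::VariantKind K) {
      switch (K) {
      case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
      case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
      case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
      case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
      case MCSymbolRefExpr::VK_Hexagon_IE:
      case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
      case MCSymbolRefExpr::VK_TPREL:
        return true;
      default:
        return false;
      }
    }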
MCExpr const *HexagonMCExpr::getExpr() const { return Expr; }
@@ -52,9 +94,9 @@ void HexagonMCExpr::setMustNotExtend(bool Val) {
}
bool HexagonMCExpr::mustNotExtend() const { return MustNotExtend; }
-bool HexagonMCExpr::s23_2_reloc() const { return S23_2_reloc; }
-void HexagonMCExpr::setS23_2_reloc(bool Val) {
- S23_2_reloc = Val;
+bool HexagonMCExpr::s27_2_reloc() const { return S27_2_reloc; }
+void HexagonMCExpr::setS27_2_reloc(bool Val) {
+ S27_2_reloc = Val;
}
bool HexagonMCExpr::classof(MCExpr const *E) {
@@ -62,7 +104,7 @@ bool HexagonMCExpr::classof(MCExpr const *E) {
}
HexagonMCExpr::HexagonMCExpr(MCExpr const *Expr)
- : Expr(Expr), MustNotExtend(false), MustExtend(false), S23_2_reloc(false),
+ : Expr(Expr), MustNotExtend(false), MustExtend(false), S27_2_reloc(false),
SignMismatch(false) {}
void HexagonMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
@@ -75,4 +117,4 @@ void HexagonMCExpr::setSignMismatch(bool Val) {
bool HexagonMCExpr::signMismatch() const {
return SignMismatch;
-}
\ No newline at end of file
+}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
index bca40cf..acfd996 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -29,8 +29,8 @@ public:
bool mustExtend() const;
void setMustNotExtend(bool Val = true);
bool mustNotExtend() const;
- void setS23_2_reloc(bool Val = true);
- bool s23_2_reloc() const;
+ void setS27_2_reloc(bool Val = true);
+ bool s27_2_reloc() const;
void setSignMismatch(bool Val = true);
bool signMismatch() const;
@@ -39,7 +39,7 @@ private:
MCExpr const *Expr;
bool MustNotExtend;
bool MustExtend;
- bool S23_2_reloc;
+ bool S27_2_reloc;
bool SignMismatch;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index e627f02..5fe638a 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -16,13 +16,55 @@
#include "Hexagon.h"
#include "HexagonBaseInfo.h"
#include "HexagonMCChecker.h"
-
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
+
+Hexagon::PacketIterator::PacketIterator(MCInstrInfo const &MCII,
+ MCInst const &Inst)
+ : MCII(MCII), BundleCurrent(Inst.begin() +
+ HexagonMCInstrInfo::bundleInstructionsOffset),
+ BundleEnd(Inst.end()), DuplexCurrent(Inst.end()), DuplexEnd(Inst.end()) {}
+
+Hexagon::PacketIterator::PacketIterator(MCInstrInfo const &MCII,
+ MCInst const &Inst, std::nullptr_t)
+ : MCII(MCII), BundleCurrent(Inst.end()), BundleEnd(Inst.end()),
+ DuplexCurrent(Inst.end()), DuplexEnd(Inst.end()) {}
+
+Hexagon::PacketIterator &Hexagon::PacketIterator::operator++() {
+ if (DuplexCurrent != DuplexEnd) {
+ ++DuplexCurrent;
+ if (DuplexCurrent == DuplexEnd) {
+ DuplexCurrent = BundleEnd;
+ DuplexEnd = BundleEnd;
+ }
+ return *this;
+ }
+ ++BundleCurrent;
+ if (BundleCurrent != BundleEnd) {
+ MCInst const &Inst = *BundleCurrent->getInst();
+ if (HexagonMCInstrInfo::isDuplex(MCII, Inst)) {
+ DuplexCurrent = Inst.begin();
+ DuplexEnd = Inst.end();
+ }
+ }
+ return *this;
+}
+
+MCInst const &Hexagon::PacketIterator::operator*() const {
+ if (DuplexCurrent != DuplexEnd)
+ return *DuplexCurrent->getInst();
+ return *BundleCurrent->getInst();
+}
+
+bool Hexagon::PacketIterator::operator==(PacketIterator const &Other) const {
+ return BundleCurrent == Other.BundleCurrent && BundleEnd == Other.BundleEnd &&
+ DuplexCurrent == Other.DuplexCurrent && DuplexEnd == Other.DuplexEnd;
+}
+
void HexagonMCInstrInfo::addConstant(MCInst &MI, uint64_t Value,
MCContext &Context) {
MI.addOperand(MCOperand::createExpr(MCConstantExpr::create(Value, Context)));
@@ -42,6 +84,14 @@ void HexagonMCInstrInfo::addConstExtender(MCContext &Context,
MCB.addOperand(MCOperand::createInst(XMCI));
}
+iterator_range<Hexagon::PacketIterator>
+HexagonMCInstrInfo::bundleInstructions(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ assert(isBundle(MCI));
+ return make_range(Hexagon::PacketIterator(MCII, MCI),
+ Hexagon::PacketIterator(MCII, MCI, nullptr));
+}
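With this overload, callers can walk a packet without special-casing duplexes: PacketIterator expands a duplex container into its two sub-instructions in order. A usage sketch (assuming a bundle `MCB` and an MCInstrInfo `MCII`, as elsewhere in this file):

    // Sketch: visit every (sub-)instruction in a bundle, duplexes included.
    for (MCInst const &Inst :
         HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
      // Inst is never a duplex container; those are expanded in place.
      (void)Inst;
    }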
+
iterator_range<MCInst::const_iterator>
HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) {
assert(isBundle(MCI));
@@ -59,31 +109,36 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCB,
HexagonMCChecker *Check) {
- // Examine the packet and convert pairs of instructions to compound
- // instructions when possible.
- if (!HexagonDisableCompound)
- HexagonMCInstrInfo::tryCompound(MCII, Context, MCB);
// Check the bundle for errors.
- bool CheckOk = Check ? Check->check() : true;
+ bool CheckOk = Check ? Check->check(false) : true;
if (!CheckOk)
return false;
- HexagonMCShuffle(MCII, STI, MCB);
+ // Examine the packet and convert pairs of instructions to compound
+ // instructions when possible.
+ if (!HexagonDisableCompound)
+ HexagonMCInstrInfo::tryCompound(MCII, STI, Context, MCB);
+ HexagonMCShuffle(Context, false, MCII, STI, MCB);
// Examine the packet and convert pairs of instructions to duplex
// instructions when possible.
MCInst InstBundlePreDuplex = MCInst(MCB);
if (!HexagonDisableDuplex) {
SmallVector<DuplexCandidate, 8> possibleDuplexes;
- possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB);
- HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes);
+ possibleDuplexes =
+ HexagonMCInstrInfo::getDuplexPossibilties(MCII, STI, MCB);
+ HexagonMCShuffle(Context, MCII, STI, MCB, possibleDuplexes);
}
// Examine the packet and pad it, if needed, when an
// end-loop is in the bundle.
- HexagonMCInstrInfo::padEndloop(Context, MCB);
+ HexagonMCInstrInfo::padEndloop(MCB, Context);
// If compounding and duplexing didn't reduce the size to four or
// fewer instructions, the packet is too big.
if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE)
return false;
- HexagonMCShuffle(MCII, STI, MCB);
+ // Check the bundle for errors.
+ CheckOk = Check ? Check->check(true) : true;
+ if (!CheckOk)
+ return false;
+ HexagonMCShuffle(Context, true, MCII, STI, MCB);
return true;
}
@@ -111,32 +166,14 @@ MCInst HexagonMCInstrInfo::createBundle() {
return Result;
}
-MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
- MCInst const &inst0,
- MCInst const &inst1) {
- assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
- MCInst *duplexInst = new (Context) MCInst;
- duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
-
- MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
- MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
- duplexInst->addOperand(MCOperand::createInst(SubInst0));
- duplexInst->addOperand(MCOperand::createInst(SubInst1));
- return duplexInst;
-}
-
MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
MCInst const &Inst,
MCOperand const &MO) {
assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) ||
HexagonMCInstrInfo::isExtended(MCII, Inst));
- MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst);
MCInst XMI;
- XMI.setOpcode((Desc.isBranch() || Desc.isCall() ||
- HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR)
- ? Hexagon::A4_ext_b
- : Hexagon::A4_ext);
+ XMI.setOpcode(Hexagon::A4_ext);
if (MO.isImm())
XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f)));
else if (MO.isExpr())
@@ -146,6 +183,20 @@ MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
return XMI;
}
+MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
+ MCInst const &inst0,
+ MCInst const &inst1) {
+ assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
+ MCInst *duplexInst = new (Context) MCInst;
+ duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
+
+ MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
+ MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
+ duplexInst->addOperand(MCOperand::createInst(SubInst0));
+ duplexInst->addOperand(MCOperand::createInst(SubInst1));
+ return duplexInst;
+}
+
MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
size_t Index) {
assert(Index <= bundleSize(MCB));
@@ -173,22 +224,9 @@ HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) {
HexagonII::MemAccesSizeMask));
}
-unsigned HexagonMCInstrInfo::getBitCount(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
-}
-
-// Return constant extended operand number.
-unsigned short HexagonMCInstrInfo::getCExtOpNum(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
-}
-
MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
MCInst const &MCI) {
- return (MCII.get(MCI.getOpcode()));
+ return MCII.get(MCI.getOpcode());
}
unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(unsigned Reg) {
@@ -276,38 +314,36 @@ unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
}
-// Return the max value that a constant extendable operand can have
-// without being extended.
+/// Return the maximum value of an extendable operand.
int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- unsigned isSigned =
- (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- if (isSigned) // if value is signed
- return ~(-1U << (bits - 1));
- else
- return ~(-1U << bits);
+ assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+ HexagonMCInstrInfo::isExtended(MCII, MCI));
+
+ if (S) // if value is signed
+ return (1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1)) - 1;
+ return (1 << HexagonMCInstrInfo::getExtentBits(MCII, MCI)) - 1;
}
-// Return the min value that a constant extendable operand can have
-// without being extended.
+/// Return the minimum value of an extendable operand.
int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- unsigned isSigned =
- (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
- if (isSigned) // if value is signed
- return -1U << (bits - 1);
- else
- return 0;
+ assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+ HexagonMCInstrInfo::isExtended(MCII, MCI));
+
+ if (S) // if value is signed
+ return -(1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1));
+ return 0;
}
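The rewritten accessors above derive the unextended range purely from the extent width: N signed bits span [-(2^(N-1)), 2^(N-1)-1], and N unsigned bits span [0, 2^N - 1]. A standalone sketch of the same arithmetic (helper names hypothetical):

    // Sketch: unextended range of an operand with Bits opcode bits.
    int maxUnextended(unsigned Bits, bool Signed) {
      return Signed ? (1 << (Bits - 1)) - 1 : (1 << Bits) - 1;
    }
    int minUnextended(unsigned Bits, bool Signed) {
      return Signed ? -(1 << (Bits - 1)) : 0;
    }
    // e.g. Bits = 8: signed gives [-128, 127], unsigned gives [0, 255].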
StringRef HexagonMCInstrInfo::getName(MCInstrInfo const &MCII,
- MCInst const &MCI) {
+ MCInst const &MCI) {
return MCII.getName(MCI.getOpcode());
}
@@ -319,9 +355,7 @@ unsigned short HexagonMCInstrInfo::getNewValueOp(MCInstrInfo const &MCII,
MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII,
MCInst const &MCI) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- unsigned const O =
- (F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
+ unsigned O = HexagonMCInstrInfo::getNewValueOp(MCII, MCI);
MCOperand const &MCO = MCI.getOperand(O);
assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
@@ -349,45 +383,54 @@ HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII,
return (MCO);
}
-int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-
- HexagonII::SubTarget Target = static_cast<HexagonII::SubTarget>(
- (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask);
-
- switch (Target) {
- default:
- return Hexagon::ArchV4;
- case HexagonII::HasV5SubT:
- return Hexagon::ArchV5;
- }
-}
-
-// Return the Hexagon ISA class for the insn.
+/// Return the Hexagon ISA class for the insn.
unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-
+ const uint64_t F = MCII.get(MCI.getOpcode()).TSFlags;
return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
}
+/// Return the slots this instruction can execute out of.
unsigned HexagonMCInstrInfo::getUnits(MCInstrInfo const &MCII,
MCSubtargetInfo const &STI,
MCInst const &MCI) {
-
const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
return ((II[SchedClass].FirstStage + HexagonStages)->getUnits());
}
-bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+/// Return the slots this instruction consumes in addition to
+/// the slot(s) it can execute out of.
+
+unsigned HexagonMCInstrInfo::getOtherReservedSlots(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCInst const &MCI) {
+ const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
+ int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+ unsigned Slots = 0;
+
+ // FirstStage lists the slots this instruction can execute in.
+ // FirstStage+1 lists the slots that are also consumed by this instruction.
+ // For example: vmemu can only execute in slot 0 but also consumes slot 1.
+ for (unsigned Stage = II[SchedClass].FirstStage + 1;
+ Stage < II[SchedClass].LastStage; ++Stage) {
+ unsigned Units = (Stage + HexagonStages)->getUnits();
+ if (Units > HexagonGetLastSlot())
+ break;
+ // Note: getUnits() returns 0x1, 0x2, 0x4, or 0x8.
+ Slots |= Units;
+ }
+
+ // If 0 is returned, no additional slots are consumed by this instruction.
+ return Slots;
+}
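Slot masks here are one bit per packet slot (0x1, 0x2, 0x4, 0x8 for slots 0 through 3), so the full set of slots an instruction ties up is the OR of the two queries. A sketch, assuming an MCInstrInfo `MCII`, MCSubtargetInfo `STI`, and MCInst `MCI` as in this hunk:

    // Sketch: every slot an instruction occupies in a packet.
    // For vmemu: getUnits() == 0x1 (slot 0) and
    // getOtherReservedSlots() == 0x2 (slot 1), so the result is 0x3.
    unsigned SlotsUsed =
        HexagonMCInstrInfo::getUnits(MCII, STI, MCI) |
        HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, MCI);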
+
+bool HexagonMCInstrInfo::hasDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
if (!HexagonMCInstrInfo::isBundle(MCI))
return false;
- for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
- auto MI = I.getInst();
- if (isImmext(*MI))
+ for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCI)) {
+ if (HexagonMCInstrInfo::isDuplex(MCII, I))
return true;
}
@@ -398,7 +441,19 @@ bool HexagonMCInstrInfo::hasExtenderForIndex(MCInst const &MCB, size_t Index) {
return extenderForIndex(MCB, Index) != nullptr;
}
-// Return whether the instruction is a legal new-value producer.
+bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+ if (!HexagonMCInstrInfo::isBundle(MCI))
+ return false;
+
+ for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
+ if (isImmext(*I.getInst()))
+ return true;
+ }
+
+ return false;
+}
+
+/// Return whether the insn produces a new value.
bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -418,46 +473,19 @@ MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
return *MCB.getOperand(bundleInstructionsOffset + Index).getInst();
}
+/// Return whether the instruction is an accumulator.
+bool HexagonMCInstrInfo::isAccumulator(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
bool HexagonMCInstrInfo::isBundle(MCInst const &MCI) {
auto Result = Hexagon::BUNDLE == MCI.getOpcode();
assert(!Result || (MCI.size() > 0 && MCI.getOperand(0).isImm()));
return Result;
}
-// Return whether the insn is an actual insn.
-bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
- return (!HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
- !HexagonMCInstrInfo::isPrefix(MCII, MCI) &&
- HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP);
-}
-
-bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask);
-}
-
-bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND);
-}
-
-bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
- return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
- (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
-}
-
-bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
- return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
-}
-
-// Return whether the instruction needs to be constant extended.
-// 1) Always return true if the instruction has 'isExtended' flag set.
-//
-// isExtendable:
-// 2) For immediate extended operands, return true only if the value is
-// out-of-range.
-// 3) For global address, always return true.
-
bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
MCInst const &MCI) {
if (HexagonMCInstrInfo::isExtended(MCII, MCI))
@@ -470,9 +498,9 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
return true;
// Branch insns are handled as necessary by relaxation.
if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) ||
- (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND &&
+ (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCJ &&
HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) ||
- (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV &&
+ (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNCJ &&
HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()))
return false;
// Otherwise loop instructions and other CR insts are handled by relaxation
@@ -492,6 +520,35 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
return (MinValue > Value || Value > MaxValue);
}
+bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
+ return !HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
+ !HexagonMCInstrInfo::isPrefix(MCII, MCI);
+}
+
+bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask);
+}
+
+bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ return (getType(MCII, MCI) == HexagonII::TypeCJ);
+}
+
+bool HexagonMCInstrInfo::isCVINew(MCInstrInfo const &MCII, MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::CVINewPos) & HexagonII::CVINewMask);
+}
+
+bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
+ return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
+ (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
+}
+
+bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
+ return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
+}
+
bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
MCInst const &MCI) {
uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -510,9 +567,7 @@ bool HexagonMCInstrInfo::isFloat(MCInstrInfo const &MCII, MCInst const &MCI) {
}
bool HexagonMCInstrInfo::isImmext(MCInst const &MCI) {
- auto Op = MCI.getOpcode();
- return (Op == Hexagon::A4_ext_b || Op == Hexagon::A4_ext_c ||
- Op == Hexagon::A4_ext_g || Op == Hexagon::A4_ext);
+ return MCI.getOpcode() == Hexagon::A4_ext;
}
bool HexagonMCInstrInfo::isInnerLoop(MCInst const &MCI) {
@@ -530,20 +585,17 @@ bool HexagonMCInstrInfo::isIntRegForSubInst(unsigned Reg) {
(Reg >= Hexagon::R16 && Reg <= Hexagon::R23));
}
-// Return whether the insn is a new-value consumer.
+/// Return whether the insn expects a newly produced value.
bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
}
-// Return whether the operand can be constant extended.
-bool HexagonMCInstrInfo::isOperandExtended(MCInstrInfo const &MCII,
- MCInst const &MCI,
- unsigned short OperandNum) {
- uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) ==
- OperandNum;
+/// Return whether the operand is extendable.
+bool HexagonMCInstrInfo::isOpExtendable(MCInstrInfo const &MCII,
+ MCInst const &MCI, unsigned short O) {
+ return (O == HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
}
bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) {
@@ -558,6 +610,10 @@ bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
}
+bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
+ return HexagonII::TypeEXTENDER == HexagonMCInstrInfo::getType(MCII, MCI);
+}
+
bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -582,12 +638,22 @@ bool HexagonMCInstrInfo::isPredReg(unsigned Reg) {
return (Reg >= Hexagon::P0 && Reg <= Hexagon::P3_0);
}
-bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
- return (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypePREFIX);
+/// Return whether the insn can be packaged only with A and X-type insns.
+bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
}
-bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+/// Return whether the insn can be packaged only with an A-type insn in slot #1.
+bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
+}
+
+/// Return whether the insn is solo, i.e., cannot be in a packet.
+bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+ const uint64_t F = MCII.get(MCI.getOpcode()).TSFlags;
return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
}
@@ -663,17 +729,6 @@ bool HexagonMCInstrInfo::isSubInstruction(MCInst const &MCI) {
}
}
-bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
-}
-
-bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
- MCInst const &MCI) {
- const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
- return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
-}
-
bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) &&
(getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST))
@@ -705,16 +760,26 @@ bool HexagonMCInstrInfo::mustExtend(MCExpr const &Expr) {
return HExpr.mustExtend();
}
void HexagonMCInstrInfo::setMustNotExtend(MCExpr const &Expr, bool Val) {
- HexagonMCExpr &HExpr =
- const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
+ HexagonMCExpr &HExpr = const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
HExpr.setMustNotExtend(Val);
}
bool HexagonMCInstrInfo::mustNotExtend(MCExpr const &Expr) {
HexagonMCExpr const &HExpr = cast<HexagonMCExpr>(Expr);
return HExpr.mustNotExtend();
}
+void HexagonMCInstrInfo::setS27_2_reloc(MCExpr const &Expr, bool Val) {
+ HexagonMCExpr &HExpr =
+ const_cast<HexagonMCExpr &>(*llvm::cast<HexagonMCExpr>(&Expr));
+ HExpr.setS27_2_reloc(Val);
+}
+bool HexagonMCInstrInfo::s27_2_reloc(MCExpr const &Expr) {
+ HexagonMCExpr const *HExpr = llvm::dyn_cast<HexagonMCExpr>(&Expr);
+ if (!HExpr)
+ return false;
+ return HExpr->s27_2_reloc();
+}
-void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
+void HexagonMCInstrInfo::padEndloop(MCInst &MCB, MCContext &Context) {
MCInst Nop;
Nop.setOpcode(Hexagon::A2_nop);
assert(isBundle(MCB));
@@ -727,22 +792,8 @@ void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII,
MCInst const &MCI) {
- if (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR)
- return false;
-
- unsigned SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
- switch (SchedClass) {
- case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
- case Hexagon::Sched::ALU64_tc_2_SLOT23:
- case Hexagon::Sched::ALU64_tc_3x_SLOT23:
- case Hexagon::Sched::M_tc_2_SLOT23:
- case Hexagon::Sched::M_tc_3x_SLOT23:
- case Hexagon::Sched::S_2op_tc_2_SLOT23:
- case Hexagon::Sched::S_3op_tc_2_SLOT23:
- case Hexagon::Sched::S_3op_tc_3x_SLOT23:
- return true;
- }
- return false;
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return (F >> HexagonII::PrefersSlot3Pos) & HexagonII::PrefersSlot3Mask;
}
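prefersSlot3 now reads a dedicated TSFlags bit instead of enumerating scheduling classes; the same shift-and-mask shape recurs throughout this file. A generic sketch of the accessor pattern (Pos/Mask constants live in HexagonBaseInfo.h; helper name hypothetical):

    // Sketch: extract a TSFlags field; each property defines a Pos and Mask.
    static uint64_t tsField(uint64_t TSFlags, unsigned Pos, uint64_t Mask) {
      return (TSFlags >> Pos) & Mask;
    }
    // e.g. prefersSlot3 == tsField(F, PrefersSlot3Pos, PrefersSlot3Mask)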
void HexagonMCInstrInfo::replaceDuplex(MCContext &Context, MCInst &MCB,
@@ -778,15 +829,6 @@ void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) {
Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask);
assert(isMemStoreReorderEnabled(MCI));
}
-void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
- HexagonMCExpr &HExpr =
- const_cast<HexagonMCExpr &>(*llvm::cast<HexagonMCExpr>(&Expr));
- HExpr.setS23_2_reloc(Val);
-}
-bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
- HexagonMCExpr const &HExpr = *llvm::cast<HexagonMCExpr>(&Expr);
- return HExpr.s23_2_reloc();
-}
void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
assert(isBundle(MCI));
@@ -806,4 +848,4 @@ unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer,
return 0x1;
return 0;
}
-}
+} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index d701c3a..ca44c3a 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -19,11 +19,8 @@
namespace llvm {
class HexagonMCChecker;
-class MCContext;
class MCInstrDesc;
class MCInstrInfo;
-class MCInst;
-class MCOperand;
class MCSubtargetInfo;
namespace HexagonII {
enum class MemAccessSize;
@@ -34,6 +31,25 @@ public:
DuplexCandidate(unsigned i, unsigned j, unsigned iClass)
: packetIndexI(i), packetIndexJ(j), iClass(iClass) {}
};
+namespace Hexagon {
+class PacketIterator {
+ MCInstrInfo const &MCII;
+ MCInst::const_iterator BundleCurrent;
+ MCInst::const_iterator BundleEnd;
+ MCInst::const_iterator DuplexCurrent;
+ MCInst::const_iterator DuplexEnd;
+
+public:
+ PacketIterator(MCInstrInfo const &MCII, MCInst const &Inst);
+ PacketIterator(MCInstrInfo const &MCII, MCInst const &Inst, std::nullptr_t);
+ PacketIterator &operator++();
+ MCInst const &operator*() const;
+ bool operator==(PacketIterator const &Other) const;
+ bool operator!=(PacketIterator const &Other) const {
+ return !(*this == Other);
+ }
+};
+} // namespace Hexagon
namespace HexagonMCInstrInfo {
size_t const innerLoopOffset = 0;
int64_t const innerLoopMask = 1 << innerLoopOffset;
@@ -57,6 +73,8 @@ void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
MCInst const &MCI);
// Returns an iterator range of instructions in this bundle
+iterator_range<Hexagon::PacketIterator>
+bundleInstructions(MCInstrInfo const &MCII, MCInst const &MCI);
iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI);
// Returns the number of instructions in the bundle
@@ -67,16 +85,6 @@ bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCB,
HexagonMCChecker *Checker);
-// Clamp off upper 26 bits of extendable operand for emission
-void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
-
-MCInst createBundle();
-
-// Return the extender for instruction at Index or nullptr if none
-MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
-void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
- MCInst const &MCI);
-
// Create a duplex instruction given the two subinsts
MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
MCInst const &inst1);
@@ -86,27 +94,28 @@ MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
// Convert this instruction in to a duplex subinst
MCInst deriveSubInst(MCInst const &Inst);
+// Clamp off upper 26 bits of extendable operand for emission
+void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+
+MCInst createBundle();
+
// Return the extender for instruction at Index or nullptr if none
MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+ MCInst const &MCI);
// Return memory access size
HexagonII::MemAccessSize getAccessSize(MCInstrInfo const &MCII,
MCInst const &MCI);
-
-// Return number of bits in the constant extended operand.
-unsigned getBitCount(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return constant extended operand number.
-unsigned short getCExtOpNum(MCInstrInfo const &MCII, MCInst const &MCI);
-
MCInstrDesc const &getDesc(MCInstrInfo const &MCII, MCInst const &MCI);
// Return which duplex group this instruction belongs to
unsigned getDuplexCandidateGroup(MCInst const &MI);
// Return a list of all possible instruction duplex combinations
-SmallVector<DuplexCandidate, 8> getDuplexPossibilties(MCInstrInfo const &MCII,
- MCInst const &MCB);
+SmallVector<DuplexCandidate, 8>
+getDuplexPossibilties(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst const &MCB);
unsigned getDuplexRegisterNumbering(unsigned Reg);
MCExpr const &getExpr(MCExpr const &Expr);
@@ -144,14 +153,15 @@ unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI);
MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII,
MCInst const &MCI);
-int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI);
-
// Return the Hexagon ISA class for the insn.
unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
/// Return the slots used by the insn.
unsigned getUnits(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCInst const &MCI);
+unsigned getOtherReservedSlots(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst const &MCI);
+bool hasDuplex(MCInstrInfo const &MCII, MCInst const &MCI);
// Does the packet have an extender for the instruction at Index
bool hasExtenderForIndex(MCInst const &MCB, size_t Index);
@@ -161,19 +171,6 @@ bool hasImmExt(MCInst const &MCI);
// Return whether the instruction is a legal new-value producer.
bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return the instruction at Index
-MCInst const &instruction(MCInst const &MCB, size_t Index);
-
-// Returns whether this MCInst is a wellformed bundle
-bool isBundle(MCInst const &MCI);
-
-// Return whether the insn is an actual insn.
-bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
-bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
-bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return the duplex iclass given the two duplex classes
unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb);
int64_t minConstant(MCInst const &MCI, size_t Index);
@@ -189,8 +186,21 @@ template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) {
return isUInt<N>(minConstant(MCI, Index));
}
+// Return the instruction at Index
+MCInst const &instruction(MCInst const &MCB, size_t Index);
+bool isAccumulator(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Returns whether this MCInst is a wellformed bundle
+bool isBundle(MCInst const &MCI);
+
+// Return whether the insn is an actual insn.
+bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
+
// Return whether the instruction needs to be constant extended.
bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCVINew(MCInstrInfo const &MCII, MCInst const &MCI);
// Is this double register suitable for use in a duplex subinst
bool isDblRegForSubInst(unsigned Reg);
@@ -229,15 +239,12 @@ bool isMemStoreReorderEnabled(MCInst const &MCI);
// Return whether the insn is a new-value consumer.
bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return true if the operand can be constant extended.
-bool isOperandExtended(MCInstrInfo const &MCII, MCInst const &MCI,
- unsigned short OperandNum);
+bool isOpExtendable(MCInstrInfo const &MCII, MCInst const &MCI, unsigned short);
// Can these two instructions be duplexed
bool isOrderedDuplexPair(MCInstrInfo const &MCII, MCInst const &MIa,
bool ExtendedA, MCInst const &MIb, bool ExtendedB,
- bool bisReversable);
+ bool bisReversable, MCSubtargetInfo const &STI);
// Returns whether this bundle is an endloop1
bool isOuterLoop(MCInst const &MCI);
@@ -270,21 +277,20 @@ bool mustExtend(MCExpr const &Expr);
bool mustNotExtend(MCExpr const &Expr);
// Pad the bundle with nops to satisfy endloop requirements
-void padEndloop(MCContext &Context, MCInst &MCI);
-
+void padEndloop(MCInst &MCI, MCContext &Context);
bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
// Replace the instructions inside MCB, represented by Candidate
-void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
+void replaceDuplex(MCContext &Context, MCInst &MCI, DuplexCandidate Candidate);
-bool s23_2_reloc(MCExpr const &Expr);
+bool s27_2_reloc(MCExpr const &Expr);
// Marks a bundle as endloop0
void setInnerLoop(MCInst &MCI);
void setMemReorderDisabled(MCInst &MCI);
void setMemStoreReorderEnabled(MCInst &MCI);
void setMustExtend(MCExpr const &Expr, bool Val = true);
void setMustNotExtend(MCExpr const &Expr, bool Val = true);
-void setS23_2_reloc(MCExpr const &Expr, bool Val = true);
+void setS27_2_reloc(MCExpr const &Expr, bool Val = true);
// Marks a bundle as endloop1
void setOuterLoop(MCInst &MCI);
@@ -295,8 +301,9 @@ unsigned SubregisterBit(unsigned Consumer, unsigned Producer,
unsigned Producer2);
// Attempt to find and replace compound pairs
-void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
-}
-}
+void tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &MCI);
+} // namespace HexagonMCInstrInfo
+} // namespace llvm
#endif // LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 7f8e7a4..b2c7f15 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -14,9 +14,9 @@
#define DEBUG_TYPE "hexagon-shuffle"
+#include "MCTargetDesc/HexagonMCShuffler.h"
#include "Hexagon.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "MCTargetDesc/HexagonMCShuffler.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -33,58 +33,58 @@ void HexagonMCShuffler::init(MCInst &MCB) {
MCInst const *Extender = nullptr;
// Copy the bundle for the shuffling.
for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
- MCInst *MI = const_cast<MCInst *>(I.getInst());
+ MCInst &MI = *const_cast<MCInst *>(I.getInst());
+ DEBUG(dbgs() << "Shuffling: " << MCII.getName(MI.getOpcode()) << '\n');
+ assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo());
- if (!HexagonMCInstrInfo::isImmext(*MI)) {
- append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
- false);
+ if (!HexagonMCInstrInfo::isImmext(MI)) {
+ append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, MI));
Extender = nullptr;
} else
- Extender = MI;
+ Extender = &MI;
}
}
+ Loc = MCB.getLoc();
BundleFlags = MCB.getOperand(0).getImm();
}
-void HexagonMCShuffler::init(MCInst &MCB, MCInst const *AddMI,
+void HexagonMCShuffler::init(MCInst &MCB, MCInst const &AddMI,
bool bInsertAtFront) {
if (HexagonMCInstrInfo::isBundle(MCB)) {
- if (bInsertAtFront && AddMI)
- append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
- false);
+ if (bInsertAtFront)
+ append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, AddMI));
MCInst const *Extender = nullptr;
// Copy the bundle for the shuffling.
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
- MCInst *MI = const_cast<MCInst *>(I.getInst());
- if (!HexagonMCInstrInfo::isImmext(*MI)) {
- append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
- false);
+ MCInst &MI = *const_cast<MCInst *>(I.getInst());
+ if (!HexagonMCInstrInfo::isImmext(MI)) {
+ append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, MI));
Extender = nullptr;
} else
- Extender = MI;
+ Extender = &MI;
}
- if (!bInsertAtFront && AddMI)
- append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
- false);
+ if (!bInsertAtFront)
+ append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, AddMI));
}
+ Loc = MCB.getLoc();
BundleFlags = MCB.getOperand(0).getImm();
}
void HexagonMCShuffler::copyTo(MCInst &MCB) {
MCB.clear();
MCB.addOperand(MCOperand::createImm(BundleFlags));
+ MCB.setLoc(Loc);
// Copy the results into the bundle.
for (HexagonShuffler::iterator I = begin(); I != end(); ++I) {
- MCInst const *MI = I->getDesc();
+ MCInst const &MI = I->getDesc();
MCInst const *Extender = I->getExtender();
if (Extender)
MCB.addOperand(MCOperand::createInst(Extender));
- MCB.addOperand(MCOperand::createInst(MI));
+ MCB.addOperand(MCOperand::createInst(&MI));
}
}
@@ -92,15 +92,16 @@ bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
if (shuffle()) {
// Copy the results into the bundle.
copyTo(MCB);
- } else
- DEBUG(MCB.dump());
-
- return (!getError());
+ return true;
+ }
+ DEBUG(MCB.dump());
+ return false;
}
-bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+bool llvm::HexagonMCShuffle(MCContext &Context, bool Fatal,
+ MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCInst &MCB) {
- HexagonMCShuffler MCS(MCII, STI, MCB);
+ HexagonMCShuffler MCS(Context, Fatal, MCII, STI, MCB);
if (DisableShuffle)
// Ignore if user chose so.
@@ -120,40 +121,16 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
return false;
}
- // Reorder the bundle and copy the result.
- if (!MCS.reshuffleTo(MCB)) {
- // Unless there is any error, which should not happen at this point.
- unsigned shuffleError = MCS.getError();
- switch (shuffleError) {
- default:
- llvm_unreachable("unknown error");
- case HexagonShuffler::SHUFFLE_ERROR_INVALID:
- llvm_unreachable("invalid packet");
- case HexagonShuffler::SHUFFLE_ERROR_STORES:
- llvm_unreachable("too many stores");
- case HexagonShuffler::SHUFFLE_ERROR_LOADS:
- llvm_unreachable("too many loads");
- case HexagonShuffler::SHUFFLE_ERROR_BRANCHES:
- llvm_unreachable("too many branches");
- case HexagonShuffler::SHUFFLE_ERROR_NOSLOTS:
- llvm_unreachable("no suitable slot");
- case HexagonShuffler::SHUFFLE_ERROR_SLOTS:
- llvm_unreachable("over-subscribed slots");
- case HexagonShuffler::SHUFFLE_SUCCESS: // Single instruction case.
- return true;
- }
- }
-
- return true;
+ return MCS.reshuffleTo(MCB);
}
-unsigned
-llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCContext &Context, MCInst &MCB,
+bool
+llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &MCB,
SmallVector<DuplexCandidate, 8> possibleDuplexes) {
if (DisableShuffle)
- return HexagonShuffler::SHUFFLE_SUCCESS;
+ return false;
if (!HexagonMCInstrInfo::bundleSize(MCB)) {
// There once was a bundle:
@@ -163,47 +140,45 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
// After the IMPLICIT_DEFs were removed by the asm printer, the bundle
// became empty.
DEBUG(dbgs() << "Skipping empty bundle");
- return HexagonShuffler::SHUFFLE_SUCCESS;
+ return false;
} else if (!HexagonMCInstrInfo::isBundle(MCB)) {
DEBUG(dbgs() << "Skipping stand-alone insn");
- return HexagonShuffler::SHUFFLE_SUCCESS;
+ return false;
}
bool doneShuffling = false;
- unsigned shuffleError;
while (possibleDuplexes.size() > 0 && (!doneShuffling)) {
// case of Duplex Found
DuplexCandidate duplexToTry = possibleDuplexes.pop_back_val();
MCInst Attempt(MCB);
HexagonMCInstrInfo::replaceDuplex(Context, Attempt, duplexToTry);
- HexagonMCShuffler MCS(MCII, STI, Attempt); // copy packet to the shuffler
+ // Copy the packet to the shuffler.
+ HexagonMCShuffler MCS(Context, false, MCII, STI, Attempt);
if (MCS.size() == 1) { // case of one duplex
// copy the created duplex in the shuffler to the bundle
MCS.copyTo(MCB);
- return HexagonShuffler::SHUFFLE_SUCCESS;
+ return false;
}
// try shuffle with this duplex
doneShuffling = MCS.reshuffleTo(MCB);
- shuffleError = MCS.getError();
if (doneShuffling)
break;
}
if (doneShuffling == false) {
- HexagonMCShuffler MCS(MCII, STI, MCB);
+ HexagonMCShuffler MCS(Context, false, MCII, STI, MCB);
doneShuffling = MCS.reshuffleTo(MCB); // shuffle
- shuffleError = MCS.getError();
}
if (!doneShuffling)
- return shuffleError;
+ return true;
- return HexagonShuffler::SHUFFLE_SUCCESS;
+ return false;
}
-bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &MCB, MCInst const *AddMI, int fixupCount) {
- if (!HexagonMCInstrInfo::isBundle(MCB) || !AddMI)
+bool llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &MCB,
+ MCInst const &AddMI, int fixupCount) {
+ if (!HexagonMCInstrInfo::isBundle(MCB))
return false;
// if fixups present, make sure we don't insert too many nops that would
@@ -211,8 +186,15 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
unsigned int bundleSize = HexagonMCInstrInfo::bundleSize(MCB);
if (bundleSize >= HEXAGON_PACKET_SIZE)
return false;
+ bool bhasDuplex = HexagonMCInstrInfo::hasDuplex(MCII, MCB);
if (fixupCount >= 2) {
- return false;
+ if (bhasDuplex) {
+ if (bundleSize >= HEXAGON_PACKET_SIZE - 1) {
+ return false;
+ }
+ } else {
+ return false;
+ }
} else {
if (bundleSize == HEXAGON_PACKET_SIZE - 1 && fixupCount)
return false;
@@ -221,16 +203,15 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
if (DisableShuffle)
return false;
- HexagonMCShuffler MCS(MCII, STI, MCB, AddMI);
- if (!MCS.reshuffleTo(MCB)) {
- unsigned shuffleError = MCS.getError();
- switch (shuffleError) {
- default:
- return false;
- case HexagonShuffler::SHUFFLE_SUCCESS: // single instruction case
- return true;
- }
- }
+ // mgl: temporary code (the shuffler does not account for a duplex taking
+ // two slots; for example, three nops can be put into a packet that already
+ // contains a duplex, over-subscribing the slots by one).
+ unsigned maxBundleSize = (HexagonMCInstrInfo::hasImmExt(MCB))
+ ? HEXAGON_PACKET_SIZE
+ : HEXAGON_PACKET_SIZE - 1;
+ if (bhasDuplex && bundleSize >= maxBundleSize)
+ return false;
- return true;
+ HexagonMCShuffler MCS(Context, false, MCII, STI, MCB, AddMI, false);
+ return MCS.reshuffleTo(MCB);
}
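
Both HexagonMCShuffle entry points above now return a plain bool and route
diagnostics through MCContext, replacing the old SHUFFLE_ERROR_* switch. A
minimal caller sketch under that reading (finishBundle and its surroundings
are hypothetical, not part of this patch):

    // Hypothetical caller: Fatal = true asks the shuffler to emit a
    // diagnostic via Context.reportError at the bundle's location when the
    // packet is illegal; the return value alone signals success.
    static bool finishBundle(MCContext &Ctx, MCInstrInfo const &MCII,
                             MCSubtargetInfo const &STI, MCInst &Bundle) {
      return llvm::HexagonMCShuffle(Ctx, /*Fatal=*/true, MCII, STI, Bundle);
    }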
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
index a21cce1..dbe85b4 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -18,25 +18,20 @@
#include "MCTargetDesc/HexagonShuffler.h"
namespace llvm {
-
class MCInst;
-
// Insn bundle shuffler.
class HexagonMCShuffler : public HexagonShuffler {
- bool immext_present;
- bool duplex_present;
-
public:
- HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &MCB)
- : HexagonShuffler(MCII, STI) {
+ HexagonMCShuffler(MCContext &Context, bool Fatal, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &MCB)
+ : HexagonShuffler(Context, Fatal, MCII, STI) {
init(MCB);
};
- HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &MCB, const MCInst *AddMI,
- bool bInsertAtFront = false)
- : HexagonShuffler(MCII, STI) {
- init(MCB, AddMI, bInsertAtFront);
+ HexagonMCShuffler(MCContext &Context, bool Fatal, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &MCB,
+ MCInst const &AddMI, bool InsertAtFront)
+ : HexagonShuffler(Context, Fatal, MCII, STI) {
+ init(MCB, AddMI, InsertAtFront);
};
// Copy reordered bundle to another.
@@ -44,22 +39,20 @@ public:
// Reorder and copy result to another.
bool reshuffleTo(MCInst &MCB);
- bool immextPresent() const { return immext_present; };
- bool duplexPresent() const { return duplex_present; };
-
private:
void init(MCInst &MCB);
- void init(MCInst &MCB, const MCInst *AddMI, bool bInsertAtFront = false);
+ void init(MCInst &MCB, MCInst const &AddMI, bool InsertAtFront);
};
// Invocation of the shuffler.
-bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &);
-bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCInst &, const MCInst *, int);
-unsigned HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
- MCContext &Context, MCInst &,
- SmallVector<DuplexCandidate, 8>);
-}
+bool HexagonMCShuffle(MCContext &Context, bool Fatal, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &);
+bool HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &, MCInst const &,
+ int);
+bool HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &,
+ SmallVector<DuplexCandidate, 8>);
+} // namespace llvm
#endif // HEXAGONMCSHUFFLER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 694cf58..1a36154 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -11,25 +11,26 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "Hexagon.h"
#include "HexagonTargetStreamer.h"
#include "MCTargetDesc/HexagonInstPrinter.h"
#include "MCTargetDesc/HexagonMCAsmInfo.h"
#include "MCTargetDesc/HexagonMCELFStreamer.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
#include <new>
@@ -66,6 +67,12 @@ static cl::opt<bool> HexagonV55ArchVariant("mv55", cl::Hidden, cl::init(false),
static cl::opt<bool> HexagonV60ArchVariant("mv60", cl::Hidden, cl::init(false),
cl::desc("Build for Hexagon V60"));
+static cl::opt<bool> HexagonV62ArchVariant("mv62", cl::Hidden, cl::init(false),
+ cl::desc("Build for Hexagon V62"));
+
+static cl::opt<bool> EnableHVX("mhvx", cl::Hidden, cl::init(false),
+ cl::desc("Enable Hexagon Vector Extension (HVX)"));
+
static StringRef DefaultArch = "hexagonv60";
static StringRef HexagonGetArchVariant() {
@@ -77,6 +84,8 @@ static StringRef HexagonGetArchVariant() {
return "hexagonv55";
if (HexagonV60ArchVariant)
return "hexagonv60";
+ if (HexagonV62ArchVariant)
+ return "hexagonv62";
return "";
}
@@ -95,31 +104,16 @@ StringRef Hexagon_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) {
return ArchV;
}
-MCInstrInfo *llvm::createHexagonMCInstrInfo() {
- MCInstrInfo *X = new MCInstrInfo();
- InitHexagonMCInstrInfo(X);
- return X;
-}
-
-static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
- MCRegisterInfo *X = new MCRegisterInfo();
- InitHexagonMCRegisterInfo(X, Hexagon::R31);
- return X;
-}
-
-static MCSubtargetInfo *
-createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- CPU = Hexagon_MC::selectHexagonCPU(TT, CPU);
- return createHexagonMCSubtargetInfoImpl(TT, CPU, FS);
-}
+unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV4FU::SLOT3; }
namespace {
class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
public:
HexagonTargetAsmStreamer(MCStreamer &S,
- formatted_raw_ostream &, bool,
- MCInstPrinter &)
+ formatted_raw_ostream &OS,
+ bool isVerboseAsm,
+ MCInstPrinter &IP)
: HexagonTargetStreamer(S) {}
void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS,
@@ -156,24 +150,15 @@ public:
class HexagonTargetELFStreamer : public HexagonTargetStreamer {
public:
+ MCELFStreamer &getStreamer() {
+ return static_cast<MCELFStreamer &>(Streamer);
+ }
HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
: HexagonTargetStreamer(S) {
- auto Bits = STI.getFeatureBits();
- unsigned Flags = 0;
- if (Bits[Hexagon::ArchV60])
- Flags = ELF::EF_HEXAGON_MACH_V60;
- else if (Bits[Hexagon::ArchV55])
- Flags = ELF::EF_HEXAGON_MACH_V55;
- else if (Bits[Hexagon::ArchV5])
- Flags = ELF::EF_HEXAGON_MACH_V5;
- else if (Bits[Hexagon::ArchV4])
- Flags = ELF::EF_HEXAGON_MACH_V4;
- getStreamer().getAssembler().setELFHeaderEFlags(Flags);
+ MCAssembler &MCA = getStreamer().getAssembler();
+ MCA.setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
}
- MCELFStreamer &getStreamer() {
- return static_cast<MCELFStreamer &>(Streamer);
- }
void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
@@ -196,13 +181,26 @@ public:
} // end anonymous namespace
+llvm::MCInstrInfo *llvm::createHexagonMCInstrInfo() {
+ MCInstrInfo *X = new MCInstrInfo();
+ InitHexagonMCInstrInfo(X);
+ return X;
+}
+
+static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitHexagonMCRegisterInfo(X, Hexagon::R31);
+ return X;
+}
+
static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT) {
MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
// VirtualFP = (R30 + #0).
MCCFIInstruction Inst =
- MCCFIInstruction::createDefCfa(nullptr, Hexagon::R30, 0);
+ MCCFIInstruction::createDefCfa(nullptr,
+ MRI.getDwarfRegNum(Hexagon::R30, true), 0);
MAI->addInitialFrameState(Inst);
return MAI;
@@ -212,31 +210,138 @@ static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
const MCInstrInfo &MII,
- const MCRegisterInfo &MRI) {
+ const MCRegisterInfo &MRI)
+{
if (SyntaxVariant == 0)
- return (new HexagonInstPrinter(MAI, MII, MRI));
+ return new HexagonInstPrinter(MAI, MII, MRI);
else
return nullptr;
}
-static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
- formatted_raw_ostream &OS,
- MCInstPrinter *InstPrint,
- bool IsVerboseAsm) {
- return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, *InstPrint);
+static MCTargetStreamer *
+createMCAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+ MCInstPrinter *IP, bool IsVerboseAsm) {
+ return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, *IP);
}
-static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
- MCAsmBackend &MAB, raw_pwrite_stream &OS,
- MCCodeEmitter *Emitter, bool RelaxAll) {
- return createHexagonELFStreamer(Context, MAB, OS, Emitter);
+static MCStreamer *createMCStreamer(Triple const &T,
+ MCContext &Context,
+ MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll) {
+ return createHexagonELFStreamer(T, Context, MAB, OS, Emitter);
}
static MCTargetStreamer *
-createHexagonObjectTargetStreamer(MCStreamer &S, MCSubtargetInfo const &STI) {
+createHexagonObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
return new HexagonTargetELFStreamer(S, STI);
}
+static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo* STI, uint64_t F) {
+ uint64_t FB = STI->getFeatureBits().to_ullong();
+ if (FB & (1ULL << F))
+ STI->ToggleFeature(F);
+}
+
+static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo* STI, uint64_t F) {
+ uint64_t FB = STI->getFeatureBits().to_ullong();
+ return (FB & (1ULL << F)) != 0;
+}
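+
+Both helpers above treat F as a feature *index* and peek at the low 64 bits
+of the feature set; they are marked LLVM_ATTRIBUTE_UNUSED and not called in
+this patch. A hedged sketch of an assumed call site:
+
+    // Assumed call site: flip ExtensionHVXDbl off only if it is set, so
+    // ToggleFeature never turns the feature on by accident.
+    if (checkFeature(STI, Hexagon::ExtensionHVXDbl))
+      clearFeature(STI, Hexagon::ExtensionHVXDbl);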
+
+StringRef Hexagon_MC::ParseHexagonTriple(const Triple &TT, StringRef CPU) {
+ StringRef CPUName = Hexagon_MC::selectHexagonCPU(TT, CPU);
+ StringRef FS = "";
+ if (EnableHVX) {
+ if (CPUName.equals_lower("hexagonv60") ||
+ CPUName.equals_lower("hexagonv62"))
+ FS = "+hvx";
+ }
+ return FS;
+}
+
+static bool isCPUValid(std::string CPU) {
+  std::vector<std::string> table {
+ "hexagonv4",
+ "hexagonv5",
+ "hexagonv55",
+ "hexagonv60",
+ "hexagonv62",
+ };
+
+ return std::find(table.begin(), table.end(), CPU) != table.end();
+}
+
+MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
+ StringRef CPU,
+ StringRef FS) {
+ StringRef ArchFS = (FS.size()) ? FS : Hexagon_MC::ParseHexagonTriple(TT, CPU);
+ StringRef CPUName = Hexagon_MC::selectHexagonCPU(TT, CPU);
+ if (!isCPUValid(CPUName.str())) {
+ errs() << "error: invalid CPU \"" << CPUName.str().c_str()
+ << "\" specified\n";
+ return nullptr;
+ }
+
+ MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS);
+ if (X->getFeatureBits()[Hexagon::ExtensionHVXDbl]) {
+ llvm::FeatureBitset Features = X->getFeatureBits();
+ X->setFeatureBits(Features.set(Hexagon::ExtensionHVX));
+ }
+ return X;
+}
+
+unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
+ static std::map<StringRef,unsigned> ElfFlags = {
+ {"hexagonv4", ELF::EF_HEXAGON_MACH_V4},
+ {"hexagonv5", ELF::EF_HEXAGON_MACH_V5},
+ {"hexagonv55", ELF::EF_HEXAGON_MACH_V55},
+ {"hexagonv60", ELF::EF_HEXAGON_MACH_V60},
+ {"hexagonv62", ELF::EF_HEXAGON_MACH_V62},
+ };
+
+ auto F = ElfFlags.find(STI.getCPU());
+ assert(F != ElfFlags.end() && "Unrecognized Architecture");
+ return F->second;
+}
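+
+A worked lookup for the table above (illustrative, not from the patch): a
+subtarget built for "hexagonv62" maps to the V62 machine flag, and any CPU
+missing from the table trips the assert.
+
+    // STI.getCPU() == "hexagonv62" here:
+    unsigned Flags = Hexagon_MC::GetELFFlags(STI);
+    assert(Flags == ELF::EF_HEXAGON_MACH_V62);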
+
+namespace {
+class HexagonMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ HexagonMCInstrAnalysis(MCInstrInfo const *Info) : MCInstrAnalysis(Info) {}
+
+ bool isUnconditionalBranch(MCInst const &Inst) const override {
+ //assert(!HexagonMCInstrInfo::isBundle(Inst));
+ return MCInstrAnalysis::isUnconditionalBranch(Inst);
+ }
+
+ bool isConditionalBranch(MCInst const &Inst) const override {
+ //assert(!HexagonMCInstrInfo::isBundle(Inst));
+ return MCInstrAnalysis::isConditionalBranch(Inst);
+ }
+
+ bool evaluateBranch(MCInst const &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const override {
+ //assert(!HexagonMCInstrInfo::isBundle(Inst));
+ if (!HexagonMCInstrInfo::isExtendable(*Info, Inst))
+ return false;
+ auto const &Extended(HexagonMCInstrInfo::getExtendableOperand(*Info, Inst));
+ assert(Extended.isExpr());
+ int64_t Value;
+ if (!Extended.getExpr()->evaluateAsAbsolute(Value))
+ return false;
+ Target = Value;
+ return true;
+ }
+};
+}
+
+static MCInstrAnalysis *createHexagonMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new HexagonMCInstrAnalysis(Info);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeHexagonTargetMC() {
// Register the MC asm info.
@@ -252,7 +357,7 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
// Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(getTheHexagonTarget(),
- createHexagonMCSubtargetInfo);
+ Hexagon_MC::createHexagonMCSubtargetInfo);
// Register the MC Code Emitter
TargetRegistry::RegisterMCCodeEmitter(getTheHexagonTarget(),
@@ -262,8 +367,18 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
TargetRegistry::RegisterMCAsmBackend(getTheHexagonTarget(),
createHexagonAsmBackend);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(getTheHexagonTarget(),
+ createHexagonMCInstrAnalysis);
+
// Register the obj streamer
- TargetRegistry::RegisterELFStreamer(getTheHexagonTarget(), createMCStreamer);
+ TargetRegistry::RegisterELFStreamer(getTheHexagonTarget(),
+ createMCStreamer);
+
+ // Register the obj target streamer
+ TargetRegistry::RegisterObjectTargetStreamer(getTheHexagonTarget(),
+ createHexagonObjectTargetStreamer);
// Register the asm streamer
TargetRegistry::RegisterAsmTargetStreamer(getTheHexagonTarget(),
@@ -272,7 +387,4 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
// Register the MC Inst Printer
TargetRegistry::RegisterMCInstPrinter(getTheHexagonTarget(),
createHexagonMCInstPrinter);
-
- TargetRegistry::RegisterObjectTargetStreamer(
- getTheHexagonTarget(), createHexagonObjectTargetStreamer);
}
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 6e677e9..6bb69be 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -41,6 +41,18 @@ extern cl::opt<bool> HexagonDisableDuplex;
extern const InstrStage HexagonStages[];
MCInstrInfo *createHexagonMCInstrInfo();
+MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT);
+
+namespace Hexagon_MC {
+ StringRef ParseHexagonTriple(const Triple &TT, StringRef CPU);
+ StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
+
+ /// Create a Hexagon MCSubtargetInfo instance. This is exposed so Asm parser,
+ /// etc. do not need to go through TargetRegistry.
+ MCSubtargetInfo *createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU,
+ StringRef FS);
+ unsigned GetELFFlags(const MCSubtargetInfo &STI);
+}
MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -54,13 +66,9 @@ MCAsmBackend *createHexagonAsmBackend(const Target &T,
MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS,
uint8_t OSABI, StringRef CPU);
-namespace Hexagon_MC {
-
- StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
-
-} // end namespace Hexagon_MC
+unsigned HexagonGetLastSlot();
-} // end namespace llvm
+} // End llvm namespace
// Define symbolic names for Hexagon registers. This defines a mapping from
// register name to register number.
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 88f37d6..1604e7c 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -14,16 +14,18 @@
#define DEBUG_TYPE "hexagon-shuffle"
-#include <algorithm>
-#include <utility>
+#include "HexagonShuffler.h"
#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
-#include "HexagonShuffler.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <utility>
using namespace llvm;
@@ -37,16 +39,16 @@ class HexagonBid {
unsigned Bid;
public:
- HexagonBid() : Bid(0){};
- HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; };
+ HexagonBid() : Bid(0) {}
+ HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; }
// Check if the insn priority is overflowed.
- bool isSold() const { return (Bid >= MAX); };
+ bool isSold() const { return (Bid >= MAX); }
HexagonBid &operator+=(const HexagonBid &B) {
Bid += B.Bid;
return *this;
- };
+ }
};
// Slot shuffling allocation.
@@ -56,7 +58,7 @@ class HexagonUnitAuction {
unsigned isSold : HEXAGON_PACKET_SIZE;
public:
- HexagonUnitAuction() : isSold(0){};
+ HexagonUnitAuction(unsigned cs = 0) : isSold(cs) {}
// Allocate slots.
bool bid(unsigned B) {
@@ -70,29 +72,29 @@ public:
isSold |= Scores[i].isSold() << i;
}
return true;
- ;
} else
// Error if the desired slots are already full.
return false;
- };
+ }
};
} // end anonymous namespace
unsigned HexagonResource::setWeight(unsigned s) {
const unsigned SlotWeight = 8;
const unsigned MaskWeight = SlotWeight - 1;
- bool Key = (1 << s) & getUnits();
-
- // TODO: Improve this API so that we can prevent misuse statically.
- assert(SlotWeight * s < 32 && "Argument to setWeight too large.");
+ unsigned Units = getUnits();
+ unsigned Key = ((1u << s) & Units) != 0;
// Calculate relative weight of the insn for the given slot, weighing it the
// heavier the more restrictive the insn is and the lowest the slots that the
// insn may be executed in.
- Weight =
- (Key << (SlotWeight * s)) * ((MaskWeight - countPopulation(getUnits()))
- << countTrailingZeros(getUnits()));
- return (Weight);
+ if (Key == 0 || Units == 0 || (SlotWeight * s >= 32))
+ return Weight = 0;
+
+ unsigned Ctpop = countPopulation(Units);
+ unsigned Cttz = countTrailingZeros(Units);
+ Weight = (1u << (SlotWeight * s)) * ((MaskWeight - Ctpop) << Cttz);
+ return Weight;
}
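
A worked example of the weight formula above, using the constants defined in
setWeight (SlotWeight = 8, MaskWeight = 7); the standalone main exists only
to check the arithmetic:

    #include <cassert>
    int main() {
      // Insn restricted to slot 3 only (Units = 0x8), queried for s = 3:
      // Ctpop = 1, Cttz = 3.
      unsigned Restricted = (1u << 24) * ((7 - 1) << 3); // 48 << 24
      // Insn that may use any slot (Units = 0xF): Ctpop = 4, Cttz = 0.
      unsigned Flexible = (1u << 24) * ((7 - 4) << 0);   // 3 << 24
      assert(Restricted > Flexible); // more restrictive => heavier weight
    }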
void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
@@ -100,16 +102,18 @@ void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
(*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2);
(*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1);
+ (*TUL)[HexagonII::TypeCVI_VX_LATE] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1);
(*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2);
(*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1);
(*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2);
(*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1);
- (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1);
+ (*TUL)[HexagonII::TypeCVI_VINLANESAT] =
+ (CPU == "hexagonv60")
+ ? UnitsAndLanes(CVI_SHIFT, 1)
+ : UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
(*TUL)[HexagonII::TypeCVI_VM_LD] =
UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
(*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0);
- (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] =
- UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
(*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1);
(*TUL)[HexagonII::TypeCVI_VM_ST] =
UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
@@ -141,9 +145,45 @@ HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL,
}
}
-HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
+struct CVIUnits {
+ unsigned Units;
+ unsigned Lanes;
+};
+typedef SmallVector<struct CVIUnits, 8> HVXInstsT;
+
+static unsigned makeAllBits(unsigned startBit, unsigned Lanes) {
+ for (unsigned i = 1; i < Lanes; ++i)
+ startBit = (startBit << 1) | startBit;
+ return startBit;
+}
+
+static bool checkHVXPipes(const HVXInstsT &hvxInsts, unsigned startIdx,
+ unsigned usedUnits) {
+ if (startIdx < hvxInsts.size()) {
+ if (!hvxInsts[startIdx].Units)
+ return checkHVXPipes(hvxInsts, startIdx + 1, usedUnits);
+ for (unsigned b = 0x1; b <= 0x8; b <<= 1) {
+ if ((hvxInsts[startIdx].Units & b) == 0)
+ continue;
+ unsigned allBits = makeAllBits(b, hvxInsts[startIdx].Lanes);
+ if ((allBits & usedUnits) == 0) {
+ if (checkHVXPipes(hvxInsts, startIdx + 1, usedUnits | allBits))
+ return true;
+ }
+ }
+ return false;
+ }
+ return true;
+}
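+
+To make the recursion above concrete: makeAllBits(0x1, 2) yields 0x3 and
+makeAllBits(0x4, 2) yields 0xC, i.e. a run of Lanes bits starting at the
+chosen pipe, and checkHVXPipes backtracks over each legal pipe choice. A
+small sketch with illustrative values:
+
+    HVXInstsT Insts;
+    Insts.push_back({/*Units=*/0x4, /*Lanes=*/1}); // one lane, pipe 2 only
+    Insts.push_back({/*Units=*/0x1, /*Lanes=*/2}); // two lanes on pipes 0-1
+    bool Fits = checkHVXPipes(Insts, /*startIdx=*/0, /*usedUnits=*/0);
+    // Fits == true: pipe 2 and pipes 0-1 do not collide. Changing the
+    // second inst's Units to 0x4 would make every placement collide and
+    // checkHVXPipes would return false.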
+
+HexagonShuffler::HexagonShuffler(MCContext &Context, bool ReportErrors,
+ MCInstrInfo const &MCII,
MCSubtargetInfo const &STI)
- : MCII(MCII), STI(STI) {
+ : Context(Context), MCII(MCII), STI(STI), ReportErrors(ReportErrors) {
reset();
HexagonCVIResource::SetupTUL(&TUL, STI.getCPU());
}
@@ -151,130 +191,183 @@ HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
void HexagonShuffler::reset() {
Packet.clear();
BundleFlags = 0;
- Error = SHUFFLE_SUCCESS;
}
-void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender,
- unsigned S, bool X) {
- HexagonInstr PI(&TUL, MCII, ID, Extender, S, X);
+void HexagonShuffler::append(MCInst const &ID, MCInst const *Extender,
+ unsigned S) {
+ HexagonInstr PI(&TUL, MCII, &ID, Extender, S);
Packet.push_back(PI);
}
+static struct {
+ unsigned first;
+ unsigned second;
+} jumpSlots[] = {{8, 4}, {8, 2}, {8, 1}, {4, 2}, {4, 1}, {2, 1}};
+#define MAX_JUMP_SLOTS (sizeof(jumpSlots) / sizeof(jumpSlots[0]))
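+
+The pairs above are ordered so that .first is always the higher slot mask,
+which keeps the earlier branch in the higher slot; check() below walks the
+table until both branches' unit masks accept a pair. A hypothetical probe of
+that search (findJumpPair is not part of the patch):
+
+    static int findJumpPair(unsigned B0, unsigned B1) {
+      for (unsigned i = 0; i < MAX_JUMP_SLOTS; ++i)
+        if ((jumpSlots[i].first & B0) && (jumpSlots[i].second & B1))
+          return int(i);
+      return -1; // no ordering keeps both branches in distinct slots
+    }
+    // findJumpPair(0xC, 0xC) == 0 (pair {8, 4});
+    // findJumpPair(0x4, 0x4) == -1 (both branches need slot 2).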
+
/// Check that the packet is legal and enforce relative insn order.
bool HexagonShuffler::check() {
// Descriptive slot masks.
const unsigned slotSingleLoad = 0x1, slotSingleStore = 0x1, slotOne = 0x2,
- slotThree = 0x8, slotFirstJump = 0x8, slotLastJump = 0x4,
+ slotThree = 0x8, // slotFirstJump = 0x8,
slotFirstLoadStore = 0x2, slotLastLoadStore = 0x1;
// Highest slots for branches and stores used to keep their original order.
- unsigned slotJump = slotFirstJump;
+ // unsigned slotJump = slotFirstJump;
unsigned slotLoadStore = slotFirstLoadStore;
// Number of branches, solo branches, indirect branches.
unsigned jumps = 0, jump1 = 0;
// Number of memory operations, loads, solo loads, stores, solo stores, single
// stores.
unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0;
- // Number of HVX loads, HVX stores.
- unsigned CVIloads = 0, CVIstores = 0;
- // Number of duplex insns, solo insns.
- unsigned duplex = 0, solo = 0;
- // Number of insns restricting other insns in the packet to A and X types,
- // which is neither A or X types.
- unsigned onlyAX = 0, neitherAnorX = 0;
+ // Number of duplex insns
+ unsigned duplex = 0;
// Number of insns restricting other insns in slot #1 to A type.
unsigned onlyAin1 = 0;
// Number of insns restricting any insn in slot #1, except A2_nop.
unsigned onlyNo1 = 0;
- unsigned xtypeFloat = 0;
unsigned pSlot3Cnt = 0;
+ unsigned nvstores = 0;
+ unsigned memops = 0;
+ unsigned deallocs = 0;
iterator slot3ISJ = end();
+ std::vector<iterator> foundBranches;
+ unsigned reservedSlots = 0;
// Collect information from the insns in the packet.
for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const *ID = ISJ->getDesc();
-
- if (HexagonMCInstrInfo::isSolo(MCII, *ID))
- solo += !ISJ->isSoloException();
- else if (HexagonMCInstrInfo::isSoloAX(MCII, *ID))
- onlyAX += !ISJ->isSoloException();
- else if (HexagonMCInstrInfo::isSoloAin1(MCII, *ID))
- onlyAin1 += !ISJ->isSoloException();
- if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32 &&
- HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeXTYPE)
- ++neitherAnorX;
- if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID)) {
+ MCInst const &ID = ISJ->getDesc();
+
+ if (HexagonMCInstrInfo::isSoloAin1(MCII, ID))
+ ++onlyAin1;
+ if (HexagonMCInstrInfo::prefersSlot3(MCII, ID)) {
++pSlot3Cnt;
slot3ISJ = ISJ;
}
- if (HexagonMCInstrInfo::isCofMax1(MCII, *ID))
+ reservedSlots |= HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, ID);
+ if (HexagonMCInstrInfo::isCofMax1(MCII, ID))
++jump1;
- switch (HexagonMCInstrInfo::getType(MCII, *ID)) {
- case HexagonII::TypeXTYPE:
- if (HexagonMCInstrInfo::isFloat(MCII, *ID))
- ++xtypeFloat;
+ switch (HexagonMCInstrInfo::getType(MCII, ID)) {
+ case HexagonII::TypeS_2op:
+ case HexagonII::TypeS_3op:
+ case HexagonII::TypeALU64:
break;
- case HexagonII::TypeJR:
case HexagonII::TypeJ:
++jumps;
+ foundBranches.push_back(ISJ);
break;
case HexagonII::TypeCVI_VM_VP_LDU:
++onlyNo1;
+ LLVM_FALLTHROUGH;
case HexagonII::TypeCVI_VM_LD:
case HexagonII::TypeCVI_VM_TMP_LD:
- case HexagonII::TypeCVI_VM_CUR_LD:
- ++CVIloads;
case HexagonII::TypeLD:
++loads;
++memory;
- if (ISJ->Core.getUnits() == slotSingleLoad)
+ if (ISJ->Core.getUnits() == slotSingleLoad ||
+ HexagonMCInstrInfo::getType(MCII, ID) == HexagonII::TypeCVI_VM_VP_LDU)
++load0;
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn())
- ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).isReturn()) {
+ ++deallocs, ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+ foundBranches.push_back(ISJ);
+ }
break;
case HexagonII::TypeCVI_VM_STU:
++onlyNo1;
+ LLVM_FALLTHROUGH;
case HexagonII::TypeCVI_VM_ST:
case HexagonII::TypeCVI_VM_NEW_ST:
- ++CVIstores;
case HexagonII::TypeST:
++stores;
++memory;
- if (ISJ->Core.getUnits() == slotSingleStore)
+ if (ISJ->Core.getUnits() == slotSingleStore ||
+ HexagonMCInstrInfo::getType(MCII, ID) == HexagonII::TypeCVI_VM_STU)
++store0;
break;
case HexagonII::TypeV4LDST:
++loads;
++stores;
++store1;
+ ++memops;
++memory;
break;
- case HexagonII::TypeNV:
+ case HexagonII::TypeNCJ:
++memory; // NV insns are memory-like.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch())
- ++jumps, ++jump1;
+ ++jumps, ++jump1;
+ foundBranches.push_back(ISJ);
+ break;
+ case HexagonII::TypeV2LDST:
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
+ ++loads;
+ ++memory;
+ if (ISJ->Core.getUnits() == slotSingleLoad ||
+ HexagonMCInstrInfo::getType(MCII, ID) ==
+ HexagonII::TypeCVI_VM_VP_LDU)
+ ++load0;
+ } else {
+ assert(HexagonMCInstrInfo::getDesc(MCII, ID).mayStore());
+ ++memory;
+ ++stores;
+ if (HexagonMCInstrInfo::isNewValue(MCII, ID))
+ ++nvstores;
+ }
break;
case HexagonII::TypeCR:
// Legacy conditional branch predicated on a register.
- case HexagonII::TypeSYSTEM:
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad())
- ++loads;
+ case HexagonII::TypeCJ:
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).isBranch()) {
+ ++jumps;
+ foundBranches.push_back(ISJ);
+ }
+ break;
+ case HexagonII::TypeDUPLEX: {
+ ++duplex;
+ MCInst const &Inst0 = *ID.getOperand(0).getInst();
+ MCInst const &Inst1 = *ID.getOperand(1).getInst();
+ if (HexagonMCInstrInfo::isCofMax1(MCII, Inst0))
+ ++jump1;
+ if (HexagonMCInstrInfo::isCofMax1(MCII, Inst1))
+ ++jump1;
+ if (HexagonMCInstrInfo::getDesc(MCII, Inst0).isBranch()) {
+ ++jumps;
+ foundBranches.push_back(ISJ);
+ }
+ if (HexagonMCInstrInfo::getDesc(MCII, Inst1).isBranch()) {
+ ++jumps;
+ foundBranches.push_back(ISJ);
+ }
+ if (HexagonMCInstrInfo::getDesc(MCII, Inst0).isReturn()) {
+ ++deallocs, ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+ foundBranches.push_back(ISJ);
+ }
+ if (HexagonMCInstrInfo::getDesc(MCII, Inst1).isReturn()) {
+ ++deallocs, ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+ foundBranches.push_back(ISJ);
+ }
break;
}
+ }
}
// Check if the packet is legal.
- if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) ||
- (duplex > 1 || (duplex && memory)) || (solo && size() > 1) ||
- (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) {
- Error = SHUFFLE_ERROR_INVALID;
+ if ((load0 > 1 || store0 > 1) ||
+ (duplex > 1 || (duplex && memory))) {
+ reportError(llvm::Twine("invalid instruction packet"));
return false;
}
if (jump1 && jumps > 1) {
// Error if single branch with another branch.
- Error = SHUFFLE_ERROR_BRANCHES;
+ reportError(llvm::Twine("too many branches in packet"));
+ return false;
+ }
+ if ((nvstores || memops) && stores > 1) {
+ reportError(llvm::Twine("slot 0 instruction does not allow slot 1 store"));
+ return false;
+ }
+ if (deallocs && stores) {
+ reportError(llvm::Twine("slot 0 instruction does not allow slot 1 store"));
return false;
}
@@ -282,56 +375,42 @@ bool HexagonShuffler::check() {
// TODO: need to reserve slots #0 and #1 for duplex insns.
bool bOnlySlot3 = false;
for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const *ID = ISJ->getDesc();
+ MCInst const &ID = ISJ->getDesc();
if (!ISJ->Core.getUnits()) {
// Error if insn may not be executed in any slot.
- Error = SHUFFLE_ERROR_UNKNOWN;
return false;
}
// Exclude from slot #1 any insn but A2_nop.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).getOpcode() != Hexagon::A2_nop)
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).getOpcode() != Hexagon::A2_nop)
if (onlyNo1)
ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
// Exclude from slot #1 any insn but A-type.
- if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32)
+ if (HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_2op &&
+ HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_3op &&
+ HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_ADDI)
if (onlyAin1)
ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
- // Branches must keep the original order.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch() ||
- HexagonMCInstrInfo::getDesc(MCII, *ID).isCall())
- if (jumps > 1) {
- if (slotJump < slotLastJump) {
- // Error if indirect branch with another branch or
- // no more slots available for branches.
- Error = SHUFFLE_ERROR_BRANCHES;
- return false;
- }
- // Pin the branch to the highest slot available to it.
- ISJ->Core.setUnits(ISJ->Core.getUnits() & slotJump);
- // Update next highest slot available to branches.
- slotJump >>= 1;
- }
-
// A single load must use slot #0.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad()) {
- if (loads == 1 && loads == memory)
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
+ if (loads == 1 && loads == memory && memops == 0)
// Pin the load to slot #0.
ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
}
// A single store must use slot #0.
- if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayStore()) {
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayStore()) {
if (!store0) {
if (stores == 1)
ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
else if (stores > 1) {
if (slotLoadStore < slotLastLoadStore) {
// Error if no more slots available for stores.
- Error = SHUFFLE_ERROR_STORES;
+ reportError(
+ llvm::Twine("invalid instruction packet: too many stores"));
return false;
}
// Pin the store to the highest slot available to it.
@@ -342,30 +421,77 @@ bool HexagonShuffler::check() {
}
if (store1 && stores > 1) {
// Error if a single store with another store.
- Error = SHUFFLE_ERROR_STORES;
+ reportError(llvm::Twine("invalid instruction packet: too many stores"));
return false;
}
}
- // flag if an instruction can only be executed in slot 3
+ // Flag if an instruction is required to be in slot 3.
if (ISJ->Core.getUnits() == slotThree)
bOnlySlot3 = true;
if (!ISJ->Core.getUnits()) {
// Error if insn may not be executed in any slot.
- Error = SHUFFLE_ERROR_NOSLOTS;
+ reportError(llvm::Twine("invalid instruction packet: out of slots"));
return false;
}
}
+ // preserve branch order
bool validateSlots = true;
- if (bOnlySlot3 == false && pSlot3Cnt == 1 && slot3ISJ != end()) {
+ if (jumps > 1) {
+ if (foundBranches.size() > 2) {
+ reportError(llvm::Twine("too many branches in packet"));
+ return false;
+ }
+
+ // try all possible choices
+ for (unsigned int i = 0; i < MAX_JUMP_SLOTS; ++i) {
+ // validate first jump with this slot rule
+ if (!(jumpSlots[i].first & foundBranches[0]->Core.getUnits()))
+ continue;
+
+ // validate second jump with this slot rule
+ if (!(jumpSlots[i].second & foundBranches[1]->Core.getUnits()))
+ continue;
+
+ // both valid for this configuration, set new slot rules
+ PacketSave = Packet;
+ foundBranches[0]->Core.setUnits(jumpSlots[i].first);
+ foundBranches[1]->Core.setUnits(jumpSlots[i].second);
+
+ HexagonUnitAuction AuctionCore(reservedSlots);
+ std::sort(begin(), end(), HexagonInstr::lessCore);
+
+ // See if things are OK with the branches pinned to these slots.
+ bool bFail = false;
+ for (iterator I = begin(); I != end() && bFail != true; ++I)
+ if (!AuctionCore.bid(I->Core.getUnits()))
+ bFail = true;
+
+ // If so, keep the assignment; if not, restore the original slot masks.
+ if (!bFail) {
+ validateSlots = false; // all good, no need to re-do auction
+ break;
+ } else
+ // restore original values
+ Packet = PacketSave;
+ }
+ if (validateSlots == true) {
+ reportError(llvm::Twine("invalid instruction packet: out of slots"));
+ return false;
+ }
+ }
+
+ if (jumps <= 1 && bOnlySlot3 == false && pSlot3Cnt == 1 &&
+ slot3ISJ != end()) {
+ validateSlots = true;
// save off slot mask of instruction marked with A_PREFER_SLOT3
// and then pin it to slot #3
unsigned saveUnits = slot3ISJ->Core.getUnits();
slot3ISJ->Core.setUnits(saveUnits & slotThree);
- HexagonUnitAuction AuctionCore;
+ HexagonUnitAuction AuctionCore(reservedSlots);
std::sort(begin(), end(), HexagonInstr::lessCore);
// see if things ok with that instruction being pinned to slot #3
@@ -379,40 +505,49 @@ bool HexagonShuffler::check() {
validateSlots = false; // all good, no need to re-do auction
else
for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const *ID = ISJ->getDesc();
- if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID))
+ MCInst const &ID = ISJ->getDesc();
+ if (HexagonMCInstrInfo::prefersSlot3(MCII, ID))
ISJ->Core.setUnits(saveUnits);
}
}
- // Check if any slot, core, is over-subscribed.
+ // Check if any slot, core or CVI, is over-subscribed.
// Verify the core slot subscriptions.
if (validateSlots) {
- HexagonUnitAuction AuctionCore;
+ HexagonUnitAuction AuctionCore(reservedSlots);
std::sort(begin(), end(), HexagonInstr::lessCore);
for (iterator I = begin(); I != end(); ++I)
if (!AuctionCore.bid(I->Core.getUnits())) {
- Error = SHUFFLE_ERROR_SLOTS;
+ reportError(llvm::Twine("invalid instruction packet: slot error"));
return false;
}
}
// Verify the CVI slot subscriptions.
- {
- HexagonUnitAuction AuctionCVI;
-
- std::sort(begin(), end(), HexagonInstr::lessCVI);
-
- for (iterator I = begin(); I != end(); ++I)
- for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid?
- if (!AuctionCVI.bid(I->CVI.getUnits() << i)) {
- Error = SHUFFLE_ERROR_SLOTS;
- return false;
- }
+ std::sort(begin(), end(), HexagonInstr::lessCVI);
+ // create vector of hvx instructions to check
+ HVXInstsT hvxInsts;
+ hvxInsts.clear();
+ for (iterator I = begin(); I != end(); ++I) {
+ struct CVIUnits inst;
+ inst.Units = I->CVI.getUnits();
+ inst.Lanes = I->CVI.getLanes();
+ if (inst.Units == 0)
+ continue; // not an HVX inst, or an HVX inst that doesn't use any pipes
+ hvxInsts.push_back(inst);
+ }
+ // if there are any hvx instructions in this packet, check pipe usage
+ if (hvxInsts.size() > 0) {
+ unsigned startIdx, usedUnits;
+ startIdx = usedUnits = 0x0;
+ if (checkHVXPipes(hvxInsts, startIdx, usedUnits) == false) {
+ // Too many pipes are in use for the packet to be valid.
+ reportError(llvm::Twine("invalid instruction packet: slot error"));
+ return false;
+ }
}
- Error = SHUFFLE_SUCCESS;
return true;
}
@@ -420,12 +555,13 @@ bool HexagonShuffler::shuffle() {
if (size() > HEXAGON_PACKET_SIZE) {
// Ignore a packet with more insns than a packet can hold
// or with compound or duplex insns for now.
- Error = SHUFFLE_ERROR_INVALID;
+ reportError(llvm::Twine("invalid instruction packet"));
return false;
}
// Check and prepare packet.
- if (size() > 1 && check())
+ bool Ok = true;
+ if (size() > 1 && (Ok = check()))
// Reorder the handles for each slot.
for (unsigned nSlot = 0, emptySlots = 0; nSlot < HEXAGON_PACKET_SIZE;
++nSlot) {
@@ -452,12 +588,19 @@ bool HexagonShuffler::shuffle() {
}
for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
- DEBUG(dbgs().write_hex(ISJ->Core.getUnits());
- dbgs() << ':'
- << HexagonMCInstrInfo::getDesc(MCII, *ISJ->getDesc())
- .getOpcode();
+ DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
+ dbgs() << '/';
+ dbgs().write_hex(ISJ->CVI.getUnits()) << '|';
+ dbgs() << ISJ->CVI.getLanes();
+ } dbgs() << ':'
+ << HexagonMCInstrInfo::getDesc(MCII, ISJ->getDesc()).getOpcode();
dbgs() << '\n');
DEBUG(dbgs() << '\n');
- return (!getError());
+ return Ok;
+}
+
+void HexagonShuffler::reportError(llvm::Twine const &Msg) {
+ if (ReportErrors)
+ Context.reportError(Loc, Msg);
}
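
Note the gating above: a shuffler constructed with ReportErrors = false, as
in the duplex-probing paths, stays silent on failure, while the final
attempt passes true so the user sees a single diagnostic at the bundle's
SMLoc. A sketch of the two modes, reusing names from the callers earlier in
this patch:

    HexagonMCShuffler Probe(Context, false, MCII, STI, Attempt); // quiet
    HexagonMCShuffler Final(Context, true, MCII, STI, MCB);      // diagnoses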
diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index a093f85..10a9590 100644
--- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -35,7 +35,8 @@ public:
HexagonResource(unsigned s) { setUnits(s); };
void setUnits(unsigned s) {
- Slots = s & ~(~0U << HEXAGON_PACKET_SIZE);
+ Slots = s & ((1u << HEXAGON_PACKET_SIZE) - 1);
+ setWeight(s);
};
unsigned setWeight(unsigned s);
@@ -86,10 +87,10 @@ public:
unsigned s, MCInst const *id);
static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU);
- bool isValid() const { return (Valid); };
- unsigned getLanes() const { return (Lanes); };
- bool mayLoad() const { return (Load); };
- bool mayStore() const { return (Store); };
+ bool isValid() const { return Valid; };
+ unsigned getLanes() const { return Lanes; };
+ bool mayLoad() const { return Load; };
+ bool mayStore() const { return Store; };
};
// Handle to an insn used by the shuffling algorithm.
@@ -100,21 +101,17 @@ class HexagonInstr {
MCInst const *Extender;
HexagonResource Core;
HexagonCVIResource CVI;
- bool SoloException;
public:
HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T,
MCInstrInfo const &MCII, MCInst const *id,
- MCInst const *Extender, unsigned s, bool x = false)
- : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id),
- SoloException(x) {};
+ MCInst const *Extender, unsigned s)
+ : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id) {};
- MCInst const *getDesc() const { return (ID); };
+ MCInst const &getDesc() const { return *ID; };
MCInst const *getExtender() const { return Extender; }
- unsigned isSoloException() const { return (SoloException); };
-
// Check if the handles are in ascending order for shuffling purposes.
bool operator<(const HexagonInstr &B) const {
return (HexagonResource::lessWeight(B.Core, Core));
@@ -136,34 +133,23 @@ class HexagonShuffler {
// Insn handles in a bundle.
HexagonPacket Packet;
-
- // Shuffling error code.
- unsigned Error;
+ HexagonPacket PacketSave;
HexagonCVIResource::TypeUnitsAndLanes TUL;
protected:
+ MCContext &Context;
int64_t BundleFlags;
MCInstrInfo const &MCII;
MCSubtargetInfo const &STI;
+ SMLoc Loc;
+ bool ReportErrors;
public:
typedef HexagonPacket::iterator iterator;
- enum {
- SHUFFLE_SUCCESS = 0, ///< Successful operation.
- SHUFFLE_ERROR_INVALID, ///< Invalid bundle.
- SHUFFLE_ERROR_STORES, ///< No free slots for store insns.
- SHUFFLE_ERROR_LOADS, ///< No free slots for load insns.
- SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns.
- SHUFFLE_ERROR_NOSLOTS, ///< No free slots for other insns.
- SHUFFLE_ERROR_SLOTS, ///< Over-subscribed slots.
- SHUFFLE_ERROR_ERRATA2, ///< Errata violation (v60).
- SHUFFLE_ERROR_STORE_LOAD_CONFLICT, ///< store/load conflict
- SHUFFLE_ERROR_UNKNOWN ///< Unknown error.
- };
-
- explicit HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI);
+ HexagonShuffler(MCContext &Context, bool ReportErrors,
+ MCInstrInfo const &MCII, MCSubtargetInfo const &STI);
// Reset to initial state.
void reset();
@@ -178,13 +164,11 @@ public:
iterator end() { return (Packet.end()); };
// Add insn handle to the bundle .
- void append(MCInst const *ID, MCInst const *Extender, unsigned S,
- bool X = false);
+ void append(MCInst const &ID, MCInst const *Extender, unsigned S);
// Return the error code for the last check or shuffling of the bundle.
- void setError(unsigned Err) { Error = Err; };
- unsigned getError() const { return (Error); };
+ void reportError(llvm::Twine const &Msg);
};
-}
+} // namespace llvm
#endif // HEXAGONSHUFFLER_H
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
index 3928716..ea86ffb 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -11,6 +11,7 @@
#include "RDFCopy.h"
#include "RDFGraph.h"
+#include "RDFLiveness.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -53,47 +54,12 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) {
CopyMap.insert(std::make_pair(SA.Id, EM));
Copies.push_back(SA.Id);
-
- for (auto I : EM) {
- auto FS = DefM.find(I.second.Reg);
- if (FS == DefM.end() || FS->second.empty())
- continue; // Undefined source
- RDefMap[I.second][SA.Id] = FS->second.top()->Id;
- // Insert DstR into the map.
- RDefMap[I.first];
- }
-}
-
-
-void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) {
- RegisterSet RRs;
- for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
- RRs.insert(RA.Addr->getRegRef(DFG));
- bool Common = false;
- for (auto &R : RDefMap) {
- if (!RRs.count(R.first))
- continue;
- Common = true;
- break;
- }
- if (!Common)
- return;
-
- for (auto &R : RDefMap) {
- if (!RRs.count(R.first))
- continue;
- auto F = DefM.find(R.first.Reg);
- if (F == DefM.end() || F->second.empty())
- continue;
- R.second[IA.Id] = F->second.top()->Id;
- }
}
bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
bool Changed = false;
- auto BA = DFG.getFunc().Addr->findBlock(B, DFG);
- DFG.markBlock(BA.Id, DefM);
+ NodeAddr<BlockNode*> BA = DFG.findBlock(B);
for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
@@ -102,20 +68,30 @@ bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
if (interpretAsCopy(SA.Addr->getCode(), EM))
recordCopy(SA, EM);
}
-
- updateMap(IA);
- DFG.pushDefs(IA, DefM);
}
MachineDomTreeNode *N = MDT.getNode(B);
for (auto I : *N)
Changed |= scanBlock(I->getBlock());
- DFG.releaseBlock(BA.Id, DefM);
return Changed;
}
+NodeId CopyPropagation::getLocalReachingDef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA) {
+ NodeAddr<RefNode*> RA = L.getNearestAliasedRef(RefRR, IA);
+ if (RA.Id != 0) {
+ if (RA.Addr->getKind() == NodeAttrs::Def)
+ return RA.Id;
+ assert(RA.Addr->getKind() == NodeAttrs::Use);
+ if (NodeId RD = RA.Addr->getReachingDef())
+ return RD;
+ }
+ return 0;
+}
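+
+To illustrate the lookup above: the nearest aliased ref that Liveness finds
+for RefRR at IA is either a def (returned directly) or a use (whose own
+reaching def is returned). Sketched in pseudo-RDF terms:
+
+    // d1: R0 = ...    (Def node)
+    // u1: ... = R0    (Use node; its reaching def is d1)
+    // IA: getLocalReachingDef(R0, IA)
+    //   nearest ref is d1 -> returns d1's id directly
+    //   nearest ref is u1 -> follows u1's reaching def, returning d1's id
+    //   no aliased ref    -> returns 0 (no local reaching def)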
+
+
bool CopyPropagation::run() {
scanBlock(&DFG.getMF().front());
@@ -129,14 +105,6 @@ bool CopyPropagation::run() {
<< Print<RegisterRef>(J.second, DFG);
dbgs() << " }\n";
}
- dbgs() << "\nRDef map:\n";
- for (auto R : RDefMap) {
- dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {";
- for (auto &M : R.second)
- dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':'
- << Print<NodeId>(M.second, DFG);
- dbgs() << " }\n";
- }
}
bool Changed = false;
@@ -176,8 +144,7 @@ bool CopyPropagation::run() {
if (DR == SR)
continue;
- auto &RDefSR = RDefMap[SR];
- NodeId RDefSR_SA = RDefSR[SA.Id];
+ NodeId AtCopy = getLocalReachingDef(SR, SA);
for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
auto UA = DFG.addr<UseNode*>(N);
@@ -190,7 +157,8 @@ bool CopyPropagation::run() {
NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
- if (RDefSR[IA.Id] != RDefSR_SA)
+ NodeId AtUse = getLocalReachingDef(SR, IA);
+ if (AtCopy != AtUse)
continue;
MachineOperand &Op = UA.Addr->getOp();
@@ -206,8 +174,8 @@ bool CopyPropagation::run() {
Op.setReg(NewReg);
Op.setSubReg(0);
DFG.unlinkUse(UA, false);
- if (RDefSR_SA != 0) {
- UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(RDefSR_SA));
+ if (AtCopy != 0) {
+ UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(AtCopy));
} else {
UA.Addr->setReachingDef(0);
UA.Addr->setSibling(0);
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h
index 5ece11b..bbd625c 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFCopy.h
+++ b/contrib/llvm/lib/Target/Hexagon/RDFCopy.h
@@ -11,6 +11,9 @@
#define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
#include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
#include <map>
#include <vector>
@@ -24,7 +27,7 @@ namespace rdf {
struct CopyPropagation {
CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
- Trace(false) {}
+ L(dfg.getMF().getRegInfo(), dfg), Trace(false) {}
virtual ~CopyPropagation() = default;
@@ -39,18 +42,16 @@ namespace rdf {
private:
const MachineDominatorTree &MDT;
DataFlowGraph &DFG;
- DataFlowGraph::DefStackMap DefM;
+ Liveness L;
bool Trace;
- // map: register -> (map: stmt -> reaching def)
- std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap;
// map: statement -> (map: dst reg -> src reg)
std::map<NodeId, EqualityMap> CopyMap;
std::vector<NodeId> Copies;
void recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM);
- void updateMap(NodeAddr<InstrNode*> IA);
bool scanBlock(MachineBasicBlock *B);
+ NodeId getLocalReachingDef(RegisterRef RefRR, NodeAddr<InstrNode*> IA);
};
} // end namespace rdf
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
index 63177d5..60a12dc 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -9,9 +9,9 @@
//
// RDF-based generic dead code elimination.
+#include "RDFDeadCode.h"
#include "RDFGraph.h"
#include "RDFLiveness.h"
-#include "RDFDeadCode.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -62,9 +62,19 @@ bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const {
return true;
if (MI->isPHI())
return false;
- for (auto &Op : MI->operands())
+ for (auto &Op : MI->operands()) {
if (Op.isReg() && MRI.isReserved(Op.getReg()))
return true;
+ if (Op.isRegMask()) {
+ const uint32_t *BM = Op.getRegMask();
+ for (unsigned R = 0, RN = DFG.getTRI().getNumRegs(); R != RN; ++R) {
+ if (BM[R/32] & (1u << (R%32)))
+ continue;
+ if (MRI.isReserved(R))
+ return true;
+ }
+ }
+ }
return false;
}
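
In a call's register mask a set bit means the register is preserved, so the
loop added above looks for a reserved register whose bit is clear, i.e. one
the call clobbers. The same indexing in isolation (a minimal sketch, not
from the patch):

    // Clear bit => register R is clobbered by the regmask operand.
    static bool clobbersReg(const uint32_t *BM, unsigned R) {
      return (BM[R / 32] & (1u << (R % 32))) == 0;
    }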
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
index fa272ea..8d12723 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.cpp
@@ -10,8 +10,8 @@
// Target-independent, SSA-based data flow graph for register data flow (RDF).
//
#include "RDFGraph.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -276,7 +276,7 @@ raw_ostream &operator<< (raw_ostream &OS,
MachineBasicBlock *BB = P.Obj.Addr->getCode();
unsigned NP = BB->pred_size();
std::vector<int> Ns;
- auto PrintBBs = [&OS,&P] (std::vector<int> Ns) -> void {
+ auto PrintBBs = [&OS] (std::vector<int> Ns) -> void {
unsigned N = Ns.size();
for (int I : Ns) {
OS << "BB#" << I;
@@ -424,7 +424,7 @@ RegisterRef RefNode::getRegRef(const DataFlowGraph &G) const {
if (NodeAttrs::flags(Attrs) & NodeAttrs::PhiRef)
return G.unpack(Ref.PR);
assert(Ref.Op != nullptr);
- return G.makeRegRef(Ref.Op->getReg(), Ref.Op->getSubReg());
+ return G.makeRegRef(*Ref.Op);
}
// Set the register reference in the reference node directly (for references
@@ -617,8 +617,12 @@ bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
// Check if the definition of RR produces an unspecified value.
bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
const {
+ const MachineOperand &Op = In.getOperand(OpNum);
+ if (Op.isRegMask())
+ return true;
+ assert(Op.isReg());
if (In.isCall())
- if (In.getOperand(OpNum).isImplicit())
+ if (Op.isDef() && Op.isDead())
return true;
return false;
}
@@ -654,109 +658,6 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
return false;
}
-RegisterRef RegisterAggr::normalize(RegisterRef RR) const {
- RegisterId SuperReg = RR.Reg;
- while (true) {
- MCSuperRegIterator SR(SuperReg, &TRI, false);
- if (!SR.isValid())
- break;
- SuperReg = *SR;
- }
-
- const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
- LaneBitmask Common = RR.Mask & RC.LaneMask;
- uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
- LaneBitmask SuperMask = TRI.composeSubRegIndexLaneMask(Sub, Common);
- return RegisterRef(SuperReg, SuperMask);
-}
-
-bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
- RegisterRef NR = normalize(RR);
- auto F = Masks.find(NR.Reg);
- if (F != Masks.end()) {
- if ((F->second & NR.Mask).any())
- return true;
- }
- if (CheckUnits) {
- for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U)
- if (ExpAliasUnits.test(*U))
- return true;
- }
- return false;
-}
-
-bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
- // Always have a cover for empty lane mask.
- RegisterRef NR = normalize(RR);
- if (NR.Mask.none())
- return true;
- auto F = Masks.find(NR.Reg);
- if (F == Masks.end())
- return false;
- return (NR.Mask & F->second) == NR.Mask;
-}
-
-RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
- RegisterRef NR = normalize(RR);
- auto F = Masks.find(NR.Reg);
- if (F == Masks.end())
- Masks.insert({NR.Reg, NR.Mask});
- else
- F->second |= NR.Mask;
-
- // Visit all register units to see if there are any that were created
- // by explicit aliases. Add those that were to the bit vector.
- for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U) {
- MCRegUnitRootIterator R(*U, &TRI);
- ++R;
- if (!R.isValid())
- continue;
- ExpAliasUnits.set(*U);
- CheckUnits = true;
- }
- return *this;
-}
-
-RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
- for (std::pair<RegisterId,LaneBitmask> P : RG.Masks)
- insert(RegisterRef(P.first, P.second));
- return *this;
-}
-
-RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
- RegisterRef NR = normalize(RR);
- auto F = Masks.find(NR.Reg);
- if (F == Masks.end())
- return *this;
- LaneBitmask NewM = F->second & ~NR.Mask;
- if (NewM.none())
- Masks.erase(F);
- else
- F->second = NewM;
- return *this;
-}
-
-RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
- for (std::pair<RegisterId,LaneBitmask> P : RG.Masks)
- clear(RegisterRef(P.first, P.second));
- return *this;
-}
-
-RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
- RegisterAggr T(TRI);
- T.insert(RR).clear(*this);
- if (T.empty())
- return RegisterRef();
- return RegisterRef(T.begin()->first, T.begin()->second);
-}
-
-void RegisterAggr::print(raw_ostream &OS) const {
- OS << '{';
- for (auto I : Masks)
- OS << ' ' << PrintReg(I.first, &TRI) << PrintLaneMaskOpt(I.second);
- OS << " }";
-}
-
//
// The data flow graph construction.
//
@@ -764,7 +665,8 @@ void RegisterAggr::print(raw_ostream &OS) const {
DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
- : MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
+ : MF(mf), TII(tii), TRI(tri), PRI(tri, mf), MDT(mdt), MDF(mdf), TOI(toi),
+ LiveIns(PRI) {
}
// The implementation of the definition stack.
@@ -857,17 +759,6 @@ unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
// Register information.
-// Get the list of references aliased to RR. Lane masks are ignored.
-RegisterSet DataFlowGraph::getAliasSet(RegisterId Reg) const {
- // Do not include RR in the alias set.
- RegisterSet AS;
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
-
- for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
- AS.insert(RegisterRef(*AI));
- return AS;
-}
-
RegisterSet DataFlowGraph::getLandingPadLiveIns() const {
RegisterSet LR;
const Function &F = *MF.getFunction();
@@ -1010,11 +901,22 @@ void DataFlowGraph::build(unsigned Options) {
BlockRefsMap RefM;
buildBlockRefs(EA, RefM);
- // Add function-entry phi nodes.
+ // Collect function live-ins and entry block live-ins.
MachineRegisterInfo &MRI = MF.getRegInfo();
- for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
+ MachineBasicBlock &EntryB = *EA.Addr->getCode();
+ assert(EntryB.pred_empty() && "Function entry block has predecessors");
+ for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I)
+ LiveIns.insert(RegisterRef(I->first));
+ if (MRI.tracksLiveness()) {
+ for (auto I : EntryB.liveins())
+ LiveIns.insert(RegisterRef(I.PhysReg, I.LaneMask));
+ }
+
+ // Add function-entry phi nodes for the live-in registers.
+ //for (std::pair<RegisterId,LaneBitmask> P : LiveIns) {
+ for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
+ RegisterRef RR = *I;
NodeAddr<PhiNode*> PA = newPhi(EA);
- RegisterRef RR = RegisterRef(I->first);
uint16_t PhiFlags = NodeAttrs::PhiRef | NodeAttrs::Preserving;
NodeAddr<DefNode*> DA = newDef(PA, RR, PhiFlags);
PA.Addr->addMember(DA, *this);
@@ -1071,27 +973,19 @@ void DataFlowGraph::build(unsigned Options) {
}
RegisterRef DataFlowGraph::makeRegRef(unsigned Reg, unsigned Sub) const {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(PhysicalRegisterInfo::isRegMaskId(Reg) ||
+ TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Reg != 0);
if (Sub != 0)
Reg = TRI.getSubReg(Reg, Sub);
return RegisterRef(Reg);
}
-RegisterRef DataFlowGraph::normalizeRef(RegisterRef RR) const {
- // FIXME copied from RegisterAggr
- RegisterId SuperReg = RR.Reg;
- while (true) {
- MCSuperRegIterator SR(SuperReg, &TRI, false);
- if (!SR.isValid())
- break;
- SuperReg = *SR;
- }
-
- uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
- const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(RR.Reg);
- LaneBitmask SuperMask = RR.Mask &
- TRI.composeSubRegIndexLaneMask(Sub, RC.LaneMask);
- return RegisterRef(SuperReg, SuperMask);
+RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const {
+ assert(Op.isReg() || Op.isRegMask());
+ if (Op.isReg())
+ return makeRegRef(Op.getReg(), Op.getSubReg());
+ return RegisterRef(PRI.getRegMaskId(Op.getRegMask()), LaneBitmask::getAll());
}
RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
@@ -1100,13 +994,13 @@ RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
return M.any() ? RegisterRef(AR.Reg, M) : RegisterRef();
}
#ifndef NDEBUG
- RegisterRef NAR = normalizeRef(AR);
- RegisterRef NBR = normalizeRef(BR);
- assert(NAR.Reg != NBR.Reg);
+// RegisterRef NAR = PRI.normalize(AR);
+// RegisterRef NBR = PRI.normalize(BR);
+// assert(NAR.Reg != NBR.Reg);
#endif
// This isn't strictly correct, because the overlap may happen in the
// part masked out.
- if (TRI.regsOverlap(AR.Reg, BR.Reg))
+ if (PRI.alias(AR, BR))
return AR;
return RegisterRef();
}
@@ -1137,11 +1031,61 @@ void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
// Push all definitions from the instruction node IA to an appropriate
// stack in DefM.
+void DataFlowGraph::pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ pushClobbers(IA, DefM);
+ pushDefs(IA, DefM);
+}
+
+// Push all clobbering definitions from the instruction node IA to the
+// appropriate stacks in DefM.
+void DataFlowGraph::pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+ NodeSet Visited;
+ std::set<RegisterId> Defined;
+
+ // The important objectives of this function are:
+ // - to be able to handle instructions both while the graph is being
+ // constructed, and after the graph has been constructed, and
+  // - to maintain proper ordering of definitions on the stack for each
+ // register reference:
+ // - if there are two or more related defs in IA (i.e. coming from
+ // the same machine operand), then only push one def on the stack,
+ // - if there are multiple unrelated defs of non-overlapping
+ // subregisters of S, then the stack for S will have both (in an
+ // unspecified order), but the order does not matter from the data-
+ // -flow perspective.
+
+ for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
+ if (Visited.count(DA.Id))
+ continue;
+ if (!(DA.Addr->getFlags() & NodeAttrs::Clobbering))
+ continue;
+
+ NodeList Rel = getRelatedRefs(IA, DA);
+ NodeAddr<DefNode*> PDA = Rel.front();
+ RegisterRef RR = PDA.Addr->getRegRef(*this);
+
+ // Push the definition on the stack for the register and all aliases.
+ // The def stack traversal in linkNodeUp will check the exact aliasing.
+ DefM[RR.Reg].push(DA);
+ Defined.insert(RR.Reg);
+ for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
+ // Check that we don't push the same def twice.
+ assert(A != RR.Reg);
+ if (!Defined.count(A))
+ DefM[A].push(DA);
+ }
+ // Mark all the related defs as visited.
+ for (NodeAddr<NodeBase*> T : Rel)
+ Visited.insert(T.Id);
+ }
+}
+
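The push discipline shared by pushClobbers and pushDefs above is small enough to model on its own. A minimal sketch with plain STL containers standing in for DefStackMap and the RDF node types (pushDef, DefId and the precomputed alias set are illustrative assumptions, not the committed API):

    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    using RegisterId = uint32_t;
    using DefId = uint32_t;
    // One stack of reaching defs per register, as in DefStackMap.
    using DefStacks = std::map<RegisterId, std::vector<DefId>>;

    // Push def D for register R onto R's stack and onto the stacks of all
    // of R's aliases, never pushing the same def twice onto one stack.
    void pushDef(DefStacks &DefM, RegisterId R, DefId D,
                 const std::set<RegisterId> &Aliases) {
      DefM[R].push_back(D);
      for (RegisterId A : Aliases)
        if (A != R)
          DefM[A].push_back(D);
    }

The clobber pass above additionally consults a Defined set before pushing onto an alias's stack, which, roughly, avoids duplicate pushes when clobbered registers alias each other within one instruction.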
+// Push all non-clobbering definitions from the instruction node IA to the
+// appropriate stacks in DefM.
void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
- NodeList Defs = IA.Addr->members_if(IsDef, *this);
NodeSet Visited;
#ifndef NDEBUG
- RegisterSet Defined;
+ std::set<RegisterId> Defined;
#endif
// The important objectives of this function are:
@@ -1156,9 +1100,11 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
// unspecified order), but the order does not matter from the data-
// -flow perspective.
- for (NodeAddr<DefNode*> DA : Defs) {
+ for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
if (Visited.count(DA.Id))
continue;
+ if (DA.Addr->getFlags() & NodeAttrs::Clobbering)
+ continue;
NodeList Rel = getRelatedRefs(IA, DA);
NodeAddr<DefNode*> PDA = Rel.front();
@@ -1166,7 +1112,7 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
#ifndef NDEBUG
// Assert if the register is defined in two or more unrelated defs.
// This could happen if there are two or more def operands defining it.
- if (!Defined.insert(RR).second) {
+ if (!Defined.insert(RR.Reg).second) {
MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
dbgs() << "Multiple definitions of register: "
<< Print<RegisterRef>(RR, *this) << " in\n " << *MI
@@ -1177,10 +1123,10 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
// Push the definition on the stack for the register and all aliases.
// The def stack traversal in linkNodeUp will check the exact aliasing.
DefM[RR.Reg].push(DA);
- for (RegisterRef A : getAliasSet(RR.Reg /*FIXME? use RegisterRef*/)) {
+ for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
// Check that we don't push the same def twice.
- assert(A != RR);
- DefM[A.Reg].push(DA);
+ assert(A != RR.Reg);
+ DefM[A].push(DA);
}
// Mark all the related defs as visited.
for (NodeAddr<NodeBase*> T : Rel)
@@ -1203,59 +1149,6 @@ NodeList DataFlowGraph::getRelatedRefs(NodeAddr<InstrNode*> IA,
return Refs;
}
-// Return true if RA and RB overlap, false otherwise.
-bool DataFlowGraph::alias(RegisterRef RA, RegisterRef RB) const {
- assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
- assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
-
- MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
- MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
- // Reg units are returned in the numerical order.
- while (UMA.isValid() && UMB.isValid()) {
- std::pair<uint32_t,LaneBitmask> PA = *UMA;
- std::pair<uint32_t,LaneBitmask> PB = *UMB;
- if (PA.first == PB.first) {
- // Lane mask of 0 (given by the iterator) should be treated as "full".
- // This can happen when the register has only one unit, or when the
- // unit corresponds to explicit aliasing. In such cases, the lane mask
- // from RegisterRef should be ignored.
- if (PA.second.none() || PB.second.none())
- return true;
-
- // At this point the common unit corresponds to a subregister. The lane
- // masks correspond to the lane mask of that unit within the original
- // register, for example assuming register quadruple q0 = r3:0, and
- // a register pair d1 = r3:2, the lane mask of r2 in q0 may be 0b0100,
- // while the lane mask of r2 in d1 may be 0b0001.
- LaneBitmask LA = PA.second & RA.Mask;
- LaneBitmask LB = PB.second & RB.Mask;
- if (LA.any() && LB.any()) {
- unsigned Root = *MCRegUnitRootIterator(PA.first, &TRI);
- // If register units were guaranteed to only have 1 bit in any lane
- // mask, the code below would not be necessary. This is because LA
- // and LB would have at most 1 bit set each, and that bit would be
- // guaranteed to correspond to the given register unit.
- uint32_t SubA = TRI.getSubRegIndex(RA.Reg, Root);
- uint32_t SubB = TRI.getSubRegIndex(RB.Reg, Root);
- const TargetRegisterClass &RC = *TRI.getMinimalPhysRegClass(Root);
- LaneBitmask MaskA = TRI.reverseComposeSubRegIndexLaneMask(SubA, LA);
- LaneBitmask MaskB = TRI.reverseComposeSubRegIndexLaneMask(SubB, LB);
- if ((MaskA & MaskB & RC.LaneMask).any())
- return true;
- }
-
- ++UMA;
- ++UMB;
- continue;
- }
- if (PA.first < PB.first)
- ++UMA;
- else if (PB.first < PA.first)
- ++UMB;
- }
- return false;
-}
-
// Clear all information in the graph.
void DataFlowGraph::reset() {
Memory.clear();
@@ -1370,58 +1263,53 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
if (In.isCall())
return true;
// Is tail call?
- if (In.isBranch())
+ if (In.isBranch()) {
for (const MachineOperand &Op : In.operands())
if (Op.isGlobal() || Op.isSymbol())
return true;
+      // Assume indirect branches are calls so that their implicit operands
+      // are preserved. This is harmless for intra-function indirect
+      // branches.
+ if (In.isIndirectBranch())
+ return true;
+ }
return false;
};
auto isDefUndef = [this] (const MachineInstr &In, RegisterRef DR) -> bool {
// This instruction defines DR. Check if there is a use operand that
// would make DR live on entry to the instruction.
- for (const MachineOperand &UseOp : In.operands()) {
- if (!UseOp.isReg() || !UseOp.isUse() || UseOp.isUndef())
+ for (const MachineOperand &Op : In.operands()) {
+ if (!Op.isReg() || Op.getReg() == 0 || !Op.isUse() || Op.isUndef())
continue;
- RegisterRef UR = makeRegRef(UseOp.getReg(), UseOp.getSubReg());
- if (alias(DR, UR))
+ RegisterRef UR = makeRegRef(Op);
+ if (PRI.alias(DR, UR))
return false;
}
return true;
};
- // Collect a set of registers that this instruction implicitly uses
- // or defines. Implicit operands from an instruction will be ignored
- // unless they are listed here.
- RegisterSet ImpUses, ImpDefs;
- if (const uint16_t *ImpD = In.getDesc().getImplicitDefs())
- while (uint16_t R = *ImpD++)
- ImpDefs.insert(RegisterRef(R));
- if (const uint16_t *ImpU = In.getDesc().getImplicitUses())
- while (uint16_t R = *ImpU++)
- ImpUses.insert(RegisterRef(R));
-
bool IsCall = isCall(In);
- bool NeedsImplicit = IsCall || In.isInlineAsm() || In.isReturn();
- bool IsPredicated = TII.isPredicated(In);
unsigned NumOps = In.getNumOperands();
// Avoid duplicate implicit defs. This will not detect cases of implicit
// defs that define registers that overlap, but it is not clear how to
// interpret that in the absence of explicit defs. Overlapping explicit
// defs are likely illegal already.
- RegisterSet DoneDefs;
+ BitVector DoneDefs(TRI.getNumRegs());
// Process explicit defs first.
for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isDef() || Op.isImplicit())
continue;
- RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
+ unsigned R = Op.getReg();
+ if (!R || !TargetRegisterInfo::isPhysicalRegister(R))
+ continue;
uint16_t Flags = NodeAttrs::None;
if (TOI.isPreserving(In, OpN)) {
Flags |= NodeAttrs::Preserving;
// If the def is preserving, check if it is also undefined.
- if (isDefUndef(In, RR))
+ if (isDefUndef(In, makeRegRef(Op)))
Flags |= NodeAttrs::Undef;
}
if (TOI.isClobbering(In, OpN))
@@ -1432,7 +1320,25 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
Flags |= NodeAttrs::Dead;
NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
SA.Addr->addMember(DA, *this);
- DoneDefs.insert(RR);
+ assert(!DoneDefs.test(R));
+ DoneDefs.set(R);
+ }
+
+ // Process reg-masks (as clobbers).
+ BitVector DoneClobbers(TRI.getNumRegs());
+ for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
+ MachineOperand &Op = In.getOperand(OpN);
+ if (!Op.isRegMask())
+ continue;
+ uint16_t Flags = NodeAttrs::Clobbering | NodeAttrs::Fixed |
+ NodeAttrs::Dead;
+ NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
+ SA.Addr->addMember(DA, *this);
+    // Record all clobbered registers in DoneClobbers.
+ const uint32_t *RM = Op.getRegMask();
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i)
+ if (!(RM[i/32] & (1u << (i%32))))
+ DoneClobbers.set(i);
}
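The bit tests above rely on the regmask encoding: one bit per physical register packed into 32-bit words, where a set bit means the register is preserved across the call and a clear bit means it may be clobbered. A standalone illustration with plain STL types (not the MachineOperand API):

    #include <cstdint>
    #include <vector>

    // Bit set = preserved across the call; bit clear = may be clobbered.
    bool isPreservedByMask(const std::vector<uint32_t> &Mask, unsigned Reg) {
      return Mask[Reg / 32] & (1u << (Reg % 32));
    }

    // Collect every clobbered register, mirroring the loop above.
    std::vector<unsigned> clobberedRegs(const std::vector<uint32_t> &Mask,
                                        unsigned NumRegs) {
      std::vector<unsigned> Clobbered;
      for (unsigned R = 1; R != NumRegs; ++R)  // register 0 is invalid
        if (!isPreservedByMask(Mask, R))
          Clobbered.push_back(R);
      return Clobbered;
    }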
// Process implicit defs, skipping those that have already been added
@@ -1441,11 +1347,10 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isDef() || !Op.isImplicit())
continue;
- RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
- if (!NeedsImplicit && !ImpDefs.count(RR))
- continue;
- if (DoneDefs.count(RR))
+ unsigned R = Op.getReg();
+ if (!R || !TargetRegisterInfo::isPhysicalRegister(R) || DoneDefs.test(R))
continue;
+ RegisterRef RR = makeRegRef(Op);
uint16_t Flags = NodeAttrs::None;
if (TOI.isPreserving(In, OpN)) {
Flags |= NodeAttrs::Preserving;
@@ -1457,24 +1362,22 @@ void DataFlowGraph::buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In) {
Flags |= NodeAttrs::Clobbering;
if (TOI.isFixedReg(In, OpN))
Flags |= NodeAttrs::Fixed;
- if (IsCall && Op.isDead())
+ if (IsCall && Op.isDead()) {
+ if (DoneClobbers.test(R))
+ continue;
Flags |= NodeAttrs::Dead;
+ }
NodeAddr<DefNode*> DA = newDef(SA, Op, Flags);
SA.Addr->addMember(DA, *this);
- DoneDefs.insert(RR);
+ DoneDefs.set(R);
}
for (unsigned OpN = 0; OpN < NumOps; ++OpN) {
MachineOperand &Op = In.getOperand(OpN);
if (!Op.isReg() || !Op.isUse())
continue;
- RegisterRef RR = makeRegRef(Op.getReg(), Op.getSubReg());
- // Add implicit uses on return and call instructions, and on predicated
- // instructions regardless of whether or not they appear in the instruction
- // descriptor's list.
- bool Implicit = Op.isImplicit();
- bool TakeImplicit = NeedsImplicit || IsPredicated;
- if (Implicit && !TakeImplicit && !ImpUses.count(RR))
+ unsigned R = Op.getReg();
+ if (!R || !TargetRegisterInfo::isPhysicalRegister(R))
continue;
uint16_t Flags = NodeAttrs::None;
if (Op.isUndef())
@@ -1570,7 +1473,7 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
auto MaxCoverIn = [this] (RegisterRef RR, RegisterSet &RRs) -> RegisterRef {
for (RegisterRef I : RRs)
- if (I != RR && RegisterAggr::isCoverOf(I, RR, TRI))
+ if (I != RR && RegisterAggr::isCoverOf(I, RR, PRI))
RR = I;
return RR;
};
@@ -1597,7 +1500,7 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, BlockRefsMap &RefM,
auto Aliased = [this,&MaxRefs](RegisterRef RR,
std::vector<unsigned> &Closure) -> bool {
for (unsigned I : Closure)
- if (alias(RR, MaxRefs[I]))
+ if (PRI.alias(RR, MaxRefs[I]))
return true;
return false;
};
@@ -1708,7 +1611,7 @@ void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
NodeAddr<T> TAP;
// References from the def stack that have been examined so far.
- RegisterAggr Defs(TRI);
+ RegisterAggr Defs(PRI);
for (auto I = DS.top(), E = DS.bottom(); I != E; I.down()) {
RegisterRef QR = I->Addr->getRegRef(*this);
@@ -1744,13 +1647,15 @@ void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
}
// Create data-flow links for all reference nodes in the statement node SA.
-void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) {
+template <typename Predicate>
+void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA,
+ Predicate P) {
#ifndef NDEBUG
RegisterSet Defs;
#endif
// Link all nodes (upwards in the data-flow) with their reaching defs.
- for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) {
+ for (NodeAddr<RefNode*> RA : SA.Addr->members_if(P, *this)) {
uint16_t Kind = RA.Addr->getKind();
assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
RegisterRef RR = RA.Addr->getRegRef(*this);
@@ -1779,6 +1684,13 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
// Push block delimiters.
markBlock(BA.Id, DefM);
+ auto IsClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+ return IsDef(RA) && (RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+ auto IsNoClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+ return IsDef(RA) && !(RA.Addr->getFlags() & NodeAttrs::Clobbering);
+ };
+
assert(BA.Addr && "block node address is needed to create a data-flow link");
// For each non-phi instruction in the block, link all the defs and uses
// to their reaching defs. For any member of the block (including phis),
@@ -1786,10 +1698,17 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
// Ignore phi nodes here. They will be linked part by part from the
// predecessors.
- if (IA.Addr->getKind() == NodeAttrs::Stmt)
- linkStmtRefs(DefM, IA);
+ if (IA.Addr->getKind() == NodeAttrs::Stmt) {
+ linkStmtRefs(DefM, IA, IsUse);
+ linkStmtRefs(DefM, IA, IsClobber);
+ }
// Push the definitions on the stack.
+ pushClobbers(IA, DefM);
+
+ if (IA.Addr->getKind() == NodeAttrs::Stmt)
+ linkStmtRefs(DefM, IA, IsNoClobber);
+
pushDefs(IA, DefM);
}
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h
index 49d78a8..52f3903 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFGraph.h
+++ b/contrib/llvm/lib/Target/Hexagon/RDFGraph.h
@@ -225,6 +225,7 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
+#include "RDFRegisters.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/LaneBitmask.h"
@@ -260,7 +261,6 @@ namespace llvm {
namespace rdf {
typedef uint32_t NodeId;
- typedef uint32_t RegisterId;
struct DataFlowGraph;
@@ -412,25 +412,6 @@ namespace rdf {
AllocatorTy MemPool;
};
- struct RegisterRef {
- RegisterId Reg;
- LaneBitmask Mask;
-
- RegisterRef() : RegisterRef(0) {}
- explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
- : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
-
- operator bool() const { return Reg != 0 && Mask.any(); }
- bool operator== (const RegisterRef &RR) const {
- return Reg == RR.Reg && Mask == RR.Mask;
- }
- bool operator!= (const RegisterRef &RR) const {
- return !operator==(RR);
- }
- bool operator< (const RegisterRef &RR) const {
- return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
- }
- };
typedef std::set<RegisterRef> RegisterSet;
struct TargetOperandInfo {
@@ -450,39 +431,6 @@ namespace rdf {
uint32_t MaskId;
};
- // Template class for a map translating uint32_t into arbitrary types.
- // The map will act like an indexed set: upon insertion of a new object,
- // it will automatically assign a new index to it. Index of 0 is treated
- // as invalid and is never allocated.
- template <typename T, unsigned N = 32>
- struct IndexedSet {
- IndexedSet() : Map() { Map.reserve(N); }
-
- T get(uint32_t Idx) const {
- // Index Idx corresponds to Map[Idx-1].
- assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
- return Map[Idx-1];
- }
-
- uint32_t insert(T Val) {
- // Linear search.
- auto F = llvm::find(Map, Val);
- if (F != Map.end())
- return F - Map.begin() + 1;
- Map.push_back(Val);
- return Map.size(); // Return actual_index + 1.
- }
-
- uint32_t find(T Val) const {
- auto F = llvm::find(Map, Val);
- assert(F != Map.end());
- return F - Map.begin();
- }
-
- private:
- std::vector<T> Map;
- };
-
struct LaneMaskIndex : private IndexedSet<LaneBitmask> {
LaneMaskIndex() = default;
@@ -497,55 +445,6 @@ namespace rdf {
assert(LM.any());
return LM.all() ? 0 : find(LM);
}
-
- PackedRegisterRef pack(RegisterRef RR) {
- return { RR.Reg, getIndexForLaneMask(RR.Mask) };
- }
- PackedRegisterRef pack(RegisterRef RR) const {
- return { RR.Reg, getIndexForLaneMask(RR.Mask) };
- }
-
- RegisterRef unpack(PackedRegisterRef PR) const {
- return RegisterRef(PR.Reg, getLaneMaskForIndex(PR.MaskId));
- }
- };
-
- struct RegisterAggr {
- RegisterAggr(const TargetRegisterInfo &tri)
- : ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false), TRI(tri) {}
- RegisterAggr(const RegisterAggr &RG) = default;
-
- bool empty() const { return Masks.empty(); }
- bool hasAliasOf(RegisterRef RR) const;
- bool hasCoverOf(RegisterRef RR) const;
- static bool isCoverOf(RegisterRef RA, RegisterRef RB,
- const TargetRegisterInfo &TRI) {
- return RegisterAggr(TRI).insert(RA).hasCoverOf(RB);
- }
-
- RegisterAggr &insert(RegisterRef RR);
- RegisterAggr &insert(const RegisterAggr &RG);
- RegisterAggr &clear(RegisterRef RR);
- RegisterAggr &clear(const RegisterAggr &RG);
-
- RegisterRef clearIn(RegisterRef RR) const;
-
- void print(raw_ostream &OS) const;
-
- private:
- typedef std::unordered_map<RegisterId, LaneBitmask> MapType;
-
- public:
- typedef MapType::const_iterator iterator;
- iterator begin() const { return Masks.begin(); }
- iterator end() const { return Masks.end(); }
- RegisterRef normalize(RegisterRef RR) const;
-
- private:
- MapType Masks;
- BitVector ExpAliasUnits; // Register units for explicit aliases.
- bool CheckUnits;
- const TargetRegisterInfo &TRI;
};
struct NodeBase {
@@ -609,7 +508,8 @@ namespace rdf {
static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize,
"NodeBase must be at most NodeAllocator::NodeMemSize bytes");
- typedef std::vector<NodeAddr<NodeBase*>> NodeList;
+// typedef std::vector<NodeAddr<NodeBase*>> NodeList;
+ typedef SmallVector<NodeAddr<NodeBase*>,4> NodeList;
typedef std::set<NodeId> NodeSet;
struct RefNode : public NodeBase {
@@ -761,8 +661,10 @@ namespace rdf {
MachineFunction &getMF() const { return MF; }
const TargetInstrInfo &getTII() const { return TII; }
const TargetRegisterInfo &getTRI() const { return TRI; }
+ const PhysicalRegisterInfo &getPRI() const { return PRI; }
const MachineDominatorTree &getDT() const { return MDT; }
const MachineDominanceFrontier &getDF() const { return MDF; }
+ const RegisterAggr &getLiveIns() const { return LiveIns; }
struct DefStack {
DefStack() = default;
@@ -828,15 +730,22 @@ namespace rdf {
typedef std::unordered_map<RegisterId,DefStack> DefStackMap;
void build(unsigned Options = BuildOptions::None);
- void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ void pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
void markBlock(NodeId B, DefStackMap &DefM);
void releaseBlock(NodeId B, DefStackMap &DefM);
- PackedRegisterRef pack(RegisterRef RR) { return LMI.pack(RR); }
- PackedRegisterRef pack(RegisterRef RR) const { return LMI.pack(RR); }
- RegisterRef unpack(PackedRegisterRef PR) const { return LMI.unpack(PR); }
+ PackedRegisterRef pack(RegisterRef RR) {
+ return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
+ }
+ PackedRegisterRef pack(RegisterRef RR) const {
+ return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
+ }
+ RegisterRef unpack(PackedRegisterRef PR) const {
+ return RegisterRef(PR.Reg, LMI.getLaneMaskForIndex(PR.MaskId));
+ }
+
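The pack/unpack pair above trades a full lane mask for a small index into a table of distinct masks, with index 0 reserved for the all-lanes value so the common case stays free. A self-contained sketch of the scheme, using uint64_t to approximate LaneBitmask (the table type is an assumption modeled on IndexedSet):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct LaneMaskTable {
      std::vector<uint64_t> Masks;     // distinct, non-trivial masks

      uint32_t indexFor(uint64_t M) {  // index 0 encodes "all lanes"
        if (M == ~uint64_t(0))
          return 0;
        for (uint32_t I = 0, E = Masks.size(); I != E; ++I)
          if (Masks[I] == M)
            return I + 1;
        Masks.push_back(M);
        return Masks.size();
      }

      uint64_t maskFor(uint32_t Idx) const {
        if (Idx == 0)
          return ~uint64_t(0);
        assert(Idx - 1 < Masks.size());
        return Masks[Idx - 1];
      }
    };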
RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
- RegisterRef normalizeRef(RegisterRef RR) const;
+ RegisterRef makeRegRef(const MachineOperand &Op) const;
RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
@@ -853,6 +762,10 @@ namespace rdf {
NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
NodeAddr<RefNode*> RA) const;
+ NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) const {
+ return BlockNodes.at(BB);
+ }
+
void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
unlinkUseDF(UA);
if (RemoveFromOwner)
@@ -898,13 +811,9 @@ namespace rdf {
return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
}
- // Register aliasing.
- bool alias(RegisterRef RA, RegisterRef RB) const;
-
private:
void reset();
- RegisterSet getAliasSet(RegisterId Reg) const;
RegisterSet getLandingPadLiveIns() const;
NodeAddr<NodeBase*> newNode(uint16_t Attrs);
@@ -940,9 +849,12 @@ namespace rdf {
NodeAddr<BlockNode*> BA);
void removeUnusedPhis();
+ void pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+ void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
NodeAddr<T> TA, DefStack &DS);
- void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA);
+ template <typename Predicate> void linkStmtRefs(DefStackMap &DefM,
+ NodeAddr<StmtNode*> SA, Predicate P);
void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
void unlinkUseDF(NodeAddr<UseNode*> UA);
@@ -953,23 +865,21 @@ namespace rdf {
IA.Addr->removeMember(RA, *this);
}
- NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) {
- return BlockNodes[BB];
- }
+ MachineFunction &MF;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const PhysicalRegisterInfo PRI;
+ const MachineDominatorTree &MDT;
+ const MachineDominanceFrontier &MDF;
+ const TargetOperandInfo &TOI;
+ RegisterAggr LiveIns;
NodeAddr<FuncNode*> Func;
NodeAllocator Memory;
// Local map: MachineBasicBlock -> NodeAddr<BlockNode*>
std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes;
// Lane mask map.
LaneMaskIndex LMI;
-
- MachineFunction &MF;
- const TargetInstrInfo &TII;
- const TargetRegisterInfo &TRI;
- const MachineDominatorTree &MDT;
- const MachineDominanceFrontier &MDF;
- const TargetOperandInfo &TOI;
}; // struct DataFlowGraph
template <typename Predicate>
@@ -1013,12 +923,6 @@ namespace rdf {
return MM;
}
- // Optionally print the lane mask, if it is not ~0.
- struct PrintLaneMaskOpt {
- PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
- LaneBitmask Mask;
- };
- raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
template <typename T> struct Print;
template <typename T>
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
index e74c4bf..83e8968 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.cpp
@@ -23,19 +23,23 @@
// and Embedded Architectures and Compilers", 8 (4),
// <10.1145/2086696.2086706>. <hal-00647369>
//
-#include "RDFGraph.h"
#include "RDFLiveness.h"
+#include "RDFGraph.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
using namespace rdf;
+static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25),
+ cl::Hidden, cl::desc("Maximum recursion level"));
+
namespace llvm {
namespace rdf {
template<>
@@ -85,7 +89,8 @@ namespace rdf {
// the data-flow.
NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, bool FullChain, const RegisterAggr &DefRRs) {
+ NodeAddr<RefNode*> RefA, bool TopShadows, bool FullChain,
+ const RegisterAggr &DefRRs) {
NodeList RDefs; // Return value.
SetVector<NodeId> DefQ;
SetVector<NodeId> Owners;
@@ -105,6 +110,11 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
auto SNA = DFG.addr<RefNode*>(Start);
if (NodeId RD = SNA.Addr->getReachingDef())
DefQ.insert(RD);
+ if (TopShadows) {
+ for (auto S : DFG.getRelatedRefs(RefA.Addr->getOwner(DFG), RefA))
+ if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
+ DefQ.insert(RD);
+ }
// Collect all the reaching defs, going up until a phi node is encountered,
// or there are no more reaching defs. From this set, the actual set of
@@ -119,7 +129,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
// Stop at the covering/overwriting def of the initial register reference.
RegisterRef RR = TA.Addr->getRegRef(DFG);
if (!DFG.IsPreservingDef(TA))
- if (RegisterAggr::isCoverOf(RR, RefRR, TRI))
+ if (RegisterAggr::isCoverOf(RR, RefRR, PRI))
continue;
// Get the next level of reaching defs. This will include multiple
// reaching defs for shadows.
@@ -134,7 +144,7 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
for (NodeId N : DefQ) {
auto TA = DFG.addr<DefNode*>(N);
bool IsPhi = TA.Addr->getFlags() & NodeAttrs::PhiRef;
- if (!IsPhi && !DFG.alias(RefRR, TA.Addr->getRegRef(DFG)))
+ if (!IsPhi && !PRI.alias(RefRR, TA.Addr->getRegRef(DFG)))
continue;
Defs.insert(TA.Id);
Owners.insert(TA.Addr->getOwner(DFG).Id);
@@ -241,20 +251,30 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
}
-NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs) {
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ NodeSet &Visited, const NodeSet &Defs) {
+ return getAllReachingDefsRecImpl(RefRR, RefA, Visited, Defs, 0, MaxRecNest);
+}
+
+
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+ NodeSet &Visited, const NodeSet &Defs, unsigned Nest, unsigned MaxNest) {
+ if (Nest > MaxNest)
+ return { NodeSet(), false };
// Collect all defined registers. Do not consider phis to be defining
// anything, only collect "real" definitions.
- RegisterAggr DefRRs(TRI);
+ RegisterAggr DefRRs(PRI);
for (NodeId D : Defs) {
const auto DA = DFG.addr<const DefNode*>(D);
if (!(DA.Addr->getFlags() & NodeAttrs::PhiRef))
DefRRs.insert(DA.Addr->getRegRef(DFG));
}
- NodeList RDs = getAllReachingDefs(RefRR, RefA, true, DefRRs);
+ NodeList RDs = getAllReachingDefs(RefRR, RefA, false, true, DefRRs);
if (RDs.empty())
- return Defs;
+ return { Defs, true };
// Make a copy of the preexisting definitions and add the newly found ones.
NodeSet TmpDefs = Defs;
@@ -273,12 +293,74 @@ NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
Visited.insert(PA.Id);
// Go over all phi uses and get the reaching defs for each use.
for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
- const auto &T = getAllReachingDefsRec(RefRR, U, Visited, TmpDefs);
- Result.insert(T.begin(), T.end());
+ const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs,
+ Nest+1, MaxNest);
+ if (!T.second)
+ return { T.first, false };
+ Result.insert(T.first.begin(), T.first.end());
}
}
- return Result;
+ return { Result, true };
+}
+
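The pair<NodeSet,bool> interface introduced above is a bounded-recursion pattern: the flag tells a genuinely empty result apart from a traversal cut short at the rdf-liveness-max-rec limit. A generic sketch of the pattern over a toy successor graph (not the RDF API):

    #include <set>
    #include <utility>
    #include <vector>

    // Depth-first collection that gives up beyond MaxNest levels. The
    // second member of the result is false when the walk was cut short.
    std::pair<std::set<int>, bool>
    collectRec(int N, const std::vector<std::vector<int>> &Succ,
               std::set<int> &Visited, unsigned Nest, unsigned MaxNest) {
      if (Nest > MaxNest)
        return {std::set<int>(), false};
      std::set<int> Result{N};
      for (int S : Succ[N]) {
        if (!Visited.insert(S).second)
          continue;
        auto T = collectRec(S, Succ, Visited, Nest + 1, MaxNest);
        if (!T.second)
          return {T.first, false};    // propagate the failure upwards
        Result.insert(T.first.begin(), T.first.end());
      }
      return {Result, true};
    }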
+/// Find the nearest ref node aliased to RefRR, going upwards in the data
+/// flow, starting from the instruction immediately preceding IA.
+NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA) {
+ NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+ NodeList Ins = BA.Addr->members(DFG);
+ NodeId FindId = IA.Id;
+ auto E = Ins.rend();
+ auto B = std::find_if(Ins.rbegin(), E,
+ [FindId] (const NodeAddr<InstrNode*> T) {
+ return T.Id == FindId;
+ });
+ // Do not scan IA (which is what B would point to).
+ if (B != E)
+ ++B;
+
+ do {
+ // Process the range of instructions from B to E.
+ for (NodeAddr<InstrNode*> I : make_range(B, E)) {
+ NodeList Refs = I.Addr->members(DFG);
+ NodeAddr<RefNode*> Clob, Use;
+ // Scan all the refs in I aliased to RefRR, and return the one that
+ // is the closest to the output of I, i.e. def > clobber > use.
+ for (NodeAddr<RefNode*> R : Refs) {
+ if (!PRI.alias(R.Addr->getRegRef(DFG), RefRR))
+ continue;
+ if (DFG.IsDef(R)) {
+ // If it's a non-clobbering def, just return it.
+ if (!(R.Addr->getFlags() & NodeAttrs::Clobbering))
+ return R;
+ Clob = R;
+ } else {
+ Use = R;
+ }
+ }
+ if (Clob.Id != 0)
+ return Clob;
+ if (Use.Id != 0)
+ return Use;
+ }
+
+ // Go up to the immediate dominator, if any.
+ MachineBasicBlock *BB = BA.Addr->getCode();
+ BA = NodeAddr<BlockNode*>();
+ if (MachineDomTreeNode *N = MDT.getNode(BB)) {
+ if ((N = N->getIDom()))
+ BA = DFG.findBlock(N->getBlock());
+ }
+ if (!BA.Id)
+ break;
+
+ Ins = BA.Addr->members(DFG);
+ B = Ins.rbegin();
+ E = Ins.rend();
+ } while (true);
+
+ return NodeAddr<RefNode*>();
}
@@ -299,7 +381,7 @@ NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
auto UA = DFG.addr<UseNode*>(U);
if (!(UA.Addr->getFlags() & NodeAttrs::Undef)) {
RegisterRef UR = UA.Addr->getRegRef(DFG);
- if (DFG.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
+ if (PRI.alias(RefRR, UR) && !DefRRs.hasCoverOf(UR))
Uses.insert(U);
}
U = UA.Addr->getSibling();
@@ -312,7 +394,7 @@ NodeSet Liveness::getAllReachedUses(RegisterRef RefRR,
RegisterRef DR = DA.Addr->getRegRef(DFG);
// If this def is already covered, it cannot reach anything new.
// Similarly, skip it if it is not aliased to the interesting register.
- if (DefRRs.hasCoverOf(DR) || !DFG.alias(RefRR, DR))
+ if (DefRRs.hasCoverOf(DR) || !PRI.alias(RefRR, DR))
continue;
NodeSet T;
if (DFG.IsPreservingDef(DA)) {
@@ -343,6 +425,7 @@ void Liveness::computePhiInfo() {
// phi use -> (map: reaching phi -> set of registers defined in between)
std::map<NodeId,std::map<NodeId,RegisterAggr>> PhiUp;
std::vector<NodeId> PhiUQ; // Work list of phis for upward propagation.
+ std::map<NodeId,RegisterAggr> PhiDRs; // Phi -> registers defined by it.
// Go over all phis.
for (NodeAddr<PhiNode*> PhiA : Phis) {
@@ -355,12 +438,15 @@ void Liveness::computePhiInfo() {
// For each def, add to the queue all reached (non-phi) defs.
SetVector<NodeId> DefQ;
NodeSet PhiDefs;
+ RegisterAggr DRs(PRI);
for (NodeAddr<RefNode*> R : PhiRefs) {
if (!DFG.IsRef<NodeAttrs::Def>(R))
continue;
+ DRs.insert(R.Addr->getRegRef(DFG));
DefQ.insert(R.Id);
PhiDefs.insert(R.Id);
}
+ PhiDRs.insert(std::make_pair(PhiA.Id, DRs));
// Collect the super-set of all possible reached uses. This set will
// contain all uses reached from this phi, either directly from the
@@ -377,9 +463,9 @@ void Liveness::computePhiInfo() {
NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN);
uint16_t F = A.Addr->getFlags();
if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
- RegisterRef R = DFG.normalizeRef(getRestrictedRegRef(A));
+ RegisterRef R = PRI.normalize(A.Addr->getRegRef(DFG));
RealUses[R.Reg].insert({A.Id,R.Mask});
- }
+ }
UN = A.Addr->getSibling();
}
// Visit all reached defs, and add them to the queue. These defs may
@@ -411,30 +497,33 @@ void Liveness::computePhiInfo() {
// = R1:0 u6 Not reached by d1 (covered collectively
// by d3 and d5), but following reached
// defs and uses from d1 will lead here.
- auto InPhiDefs = [&PhiDefs] (NodeAddr<DefNode*> DA) -> bool {
- return PhiDefs.count(DA.Id);
- };
for (auto UI = RealUses.begin(), UE = RealUses.end(); UI != UE; ) {
// For each reached register UI->first, there is a set UI->second, of
// uses of it. For each such use, check if it is reached by this phi,
// i.e. check if the set of its reaching uses intersects the set of
// this phi's defs.
- NodeRefSet &Uses = UI->second;
- for (auto I = Uses.begin(), E = Uses.end(); I != E; ) {
- auto UA = DFG.addr<UseNode*>(I->first);
+ NodeRefSet Uses = UI->second;
+ UI->second.clear();
+ for (std::pair<NodeId,LaneBitmask> I : Uses) {
+ auto UA = DFG.addr<UseNode*>(I.first);
// Undef flag is checked above.
assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
- RegisterRef R(UI->first, I->second);
- NodeList RDs = getAllReachingDefs(R, UA);
- if (any_of(RDs, InPhiDefs))
- ++I;
- else
- I = Uses.erase(I);
+ RegisterRef R(UI->first, I.second);
+ // Calculate the exposed part of the reached use.
+ RegisterAggr Covered(PRI);
+ for (NodeAddr<DefNode*> DA : getAllReachingDefs(R, UA)) {
+ if (PhiDefs.count(DA.Id))
+ break;
+ Covered.insert(DA.Addr->getRegRef(DFG));
+ }
+ if (RegisterRef RC = Covered.clearIn(R)) {
+ // We are updating the map for register UI->first, so we need
+ // to map RC to be expressed in terms of that register.
+ RegisterRef S = PRI.mapTo(RC, UI->first);
+ UI->second.insert({I.first, S.Mask});
+ }
}
- if (Uses.empty())
- UI = RealUses.erase(UI);
- else
- ++UI;
+ UI = UI->second.empty() ? RealUses.erase(UI) : std::next(UI);
}
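The covered/exposed split above is lane-mask arithmetic: walking up from the use, a def belonging to the phi stops the walk, any other def shadows the lanes it writes, and the lanes left unshadowed are what the phi actually reaches. A minimal sketch with uint64_t standing in for LaneBitmask:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Defs are listed from the use upwards: .first marks a def belonging
    // to the phi, .second holds the lanes that def writes.
    uint64_t exposedLanes(uint64_t UseLanes,
                          const std::vector<std::pair<bool, uint64_t>> &Defs) {
      uint64_t Covered = 0;
      for (const auto &D : Defs) {
        if (D.first)
          break;                 // reached one of the phi's own defs
        Covered |= D.second;     // lanes shadowed by a closer def
      }
      return UseLanes & ~Covered;
    }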
// If this phi reaches some "real" uses, add it to the queue for upward
@@ -452,32 +541,29 @@ void Liveness::computePhiInfo() {
for (auto I : PhiRefs) {
if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
continue;
- NodeAddr<UseNode*> UA = I;
-
- // Given a phi use UA, traverse all related phi uses (including UA).
- // The related phi uses may reach different phi nodes or may reach the
- // same phi node. If multiple uses reach the same phi P, the intervening
- // defs must be accumulated for all such uses. To group all such uses
- // into one set, map their node ids to the first use id that reaches P.
- std::map<NodeId,NodeId> FirstUse; // Phi reached up -> first phi use.
-
- for (NodeAddr<UseNode*> VA : DFG.getRelatedRefs(PhiA, UA)) {
- SeenUses.insert(VA.Id);
- RegisterAggr DefRRs(TRI);
- for (NodeAddr<DefNode*> DA : getAllReachingDefs(VA)) {
- if (DA.Addr->getFlags() & NodeAttrs::PhiRef) {
- NodeId RP = DA.Addr->getOwner(DFG).Id;
- NodeId FU = FirstUse.insert({RP,VA.Id}).first->second;
- std::map<NodeId,RegisterAggr> &M = PhiUp[FU];
- auto F = M.find(RP);
- if (F == M.end())
- M.insert(std::make_pair(RP, DefRRs));
- else
- F->second.insert(DefRRs);
- }
- DefRRs.insert(DA.Addr->getRegRef(DFG));
+ NodeAddr<PhiUseNode*> PUA = I;
+ if (PUA.Addr->getReachingDef() == 0)
+ continue;
+
+ RegisterRef UR = PUA.Addr->getRegRef(DFG);
+ NodeList Ds = getAllReachingDefs(UR, PUA, true, false, NoRegs);
+ RegisterAggr DefRRs(PRI);
+
+ for (NodeAddr<DefNode*> D : Ds) {
+ if (D.Addr->getFlags() & NodeAttrs::PhiRef) {
+ NodeId RP = D.Addr->getOwner(DFG).Id;
+ std::map<NodeId,RegisterAggr> &M = PhiUp[PUA.Id];
+ auto F = M.find(RP);
+ if (F == M.end())
+ M.insert(std::make_pair(RP, DefRRs));
+ else
+ F->second.insert(DefRRs);
}
+ DefRRs.insert(D.Addr->getRegRef(DFG));
}
+
+ for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PhiA, PUA))
+ SeenUses.insert(T.Id);
}
}
@@ -522,7 +608,7 @@ void Liveness::computePhiInfo() {
for (NodeAddr<UseNode*> UA : PUs) {
std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id];
- RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(UA));
+ RegisterRef UR = PRI.normalize(UA.Addr->getRegRef(DFG));
for (const std::pair<NodeId,RegisterAggr> &P : PUM) {
bool Changed = false;
const RegisterAggr &MidDefs = P.second;
@@ -540,14 +626,19 @@ void Liveness::computePhiInfo() {
// then add (R-MidDefs,U) to RealUseMap[P]
//
for (const std::pair<RegisterId,NodeRefSet> &T : RUM) {
- RegisterRef R = DFG.restrictRef(RegisterRef(T.first), UR);
- if (!R)
+ RegisterRef R(T.first);
+ // The current phi (PA) could be a phi for a regmask. It could
+ // reach a whole variety of uses that are not related to the
+ // specific upward phi (P.first).
+ const RegisterAggr &DRs = PhiDRs.at(P.first);
+ if (!DRs.hasAliasOf(R))
continue;
+ R = PRI.mapTo(DRs.intersectWith(R), T.first);
for (std::pair<NodeId,LaneBitmask> V : T.second) {
- RegisterRef S = DFG.restrictRef(RegisterRef(R.Reg, V.second), R);
- if (!S)
+ LaneBitmask M = R.Mask & V.second;
+ if (M.none())
continue;
- if (RegisterRef SS = MidDefs.clearIn(S)) {
+ if (RegisterRef SS = MidDefs.clearIn(RegisterRef(R.Reg, M))) {
NodeRefSet &RS = RealUseMap[P.first][SS.Reg];
Changed |= RS.insert({V.first,SS.Mask}).second;
}
@@ -645,30 +736,48 @@ void Liveness::computeLiveIns() {
if (RUs.empty())
continue;
+ NodeSet SeenUses;
for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+ if (!SeenUses.insert(U.Id).second)
+ continue;
NodeAddr<PhiUseNode*> PUA = U;
if (PUA.Addr->getReachingDef() == 0)
continue;
- // Mark all reached "real" uses of P as live on exit in the
- // predecessor.
- // Remap all the RUs so that they have a correct reaching def.
+ // Each phi has some set (possibly empty) of reached "real" uses,
+ // that is, uses that are part of the compiled program. Such a use
+ // may be located in some farther block, but following a chain of
+ // reaching defs will eventually lead to this phi.
+ // Any chain of reaching defs may fork at a phi node, but there
+ // will be a path upwards that will lead to this phi. Now, this
+ // chain will need to fork at this phi, since some of the reached
+ // uses may have definitions joining in from multiple predecessors.
+ // For each reached "real" use, identify the set of reaching defs
+ // coming from each predecessor P, and add them to PhiLOX[P].
+ //
auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor());
RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
- RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(PUA));
- for (const std::pair<RegisterId,NodeRefSet> &T : RUs) {
- // Check if T.first aliases UR?
- LaneBitmask M;
- for (std::pair<NodeId,LaneBitmask> P : T.second)
- M |= P.second;
-
- RegisterRef S = DFG.restrictRef(RegisterRef(T.first, M), UR);
- if (!S)
- continue;
- for (NodeAddr<DefNode*> D : getAllReachingDefs(S, PUA))
- LOX[S.Reg].insert({D.Id, S.Mask});
+ for (const std::pair<RegisterId,NodeRefSet> &RS : RUs) {
+ // We need to visit each individual use.
+ for (std::pair<NodeId,LaneBitmask> P : RS.second) {
+ // Create a register ref corresponding to the use, and find
+ // all reaching defs starting from the phi use, and treating
+ // all related shadows as a single use cluster.
+ RegisterRef S(RS.first, P.second);
+ NodeList Ds = getAllReachingDefs(S, PUA, true, false, NoRegs);
+ for (NodeAddr<DefNode*> D : Ds) {
+ // Calculate the mask corresponding to the visited def.
+ RegisterAggr TA(PRI);
+ TA.insert(D.Addr->getRegRef(DFG)).intersect(S);
+ LaneBitmask TM = TA.makeRegRef().Mask;
+ LOX[S.Reg].insert({D.Id, TM});
+ }
+ }
}
+
+ for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PA, PUA))
+ SeenUses.insert(T.Id);
} // for U : phi uses
} // for P : Phis
} // for B : Blocks
@@ -684,9 +793,7 @@ void Liveness::computeLiveIns() {
traverse(&MF.front(), LiveIn);
// Add function live-ins to the live-in set of the function entry block.
- auto &EntryIn = LiveMap[&MF.front()];
- for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I)
- EntryIn.insert(RegisterRef(I->first));
+ LiveMap[&MF.front()].insert(DFG.getLiveIns());
if (Trace) {
// Dump the liveness map
@@ -702,19 +809,9 @@ void Liveness::computeLiveIns() {
//dbgs() << "\tcomp = " << Print<RegisterAggr>(LiveMap[&B], DFG) << '\n';
LV.clear();
- for (std::pair<RegisterId,LaneBitmask> P : LiveMap[&B]) {
- MCSubRegIndexIterator S(P.first, &TRI);
- if (!S.isValid()) {
- LV.push_back(RegisterRef(P.first));
- continue;
- }
- do {
- LaneBitmask M = TRI.getSubRegIndexLaneMask(S.getSubRegIndex());
- if ((M & P.second).any())
- LV.push_back(RegisterRef(S.getSubReg()));
- ++S;
- } while (S.isValid());
- }
+ const RegisterAggr &LG = LiveMap[&B];
+ for (auto I = LG.rr_begin(), E = LG.rr_end(); I != E; ++I)
+ LV.push_back(*I);
std::sort(LV.begin(), LV.end());
dbgs() << "\tcomp = {";
for (auto I : LV)
@@ -735,9 +832,10 @@ void Liveness::resetLiveIns() {
for (auto I : T)
B.removeLiveIn(I);
// Add the newly computed live-ins.
- auto &LiveIns = LiveMap[&B];
- for (auto I : LiveIns) {
- B.addLiveIn({MCPhysReg(I.first), I.second});
+ const RegisterAggr &LiveIns = LiveMap[&B];
+ for (auto I = LiveIns.rr_begin(), E = LiveIns.rr_end(); I != E; ++I) {
+ RegisterRef R = *I;
+ B.addLiveIn({MCPhysReg(R.Reg), R.Mask});
}
}
}
@@ -791,7 +889,7 @@ void Liveness::resetKills(MachineBasicBlock *B) {
Live.reset(*SR);
}
for (auto &Op : MI->operands()) {
- if (!Op.isReg() || !Op.isUse())
+ if (!Op.isReg() || !Op.isUse() || Op.isUndef())
continue;
unsigned R = Op.getReg();
if (!TargetRegisterInfo::isPhysicalRegister(R))
@@ -803,9 +901,8 @@ void Liveness::resetKills(MachineBasicBlock *B) {
IsLive = true;
break;
}
- if (IsLive)
- continue;
- Op.setIsKill(true);
+ if (!IsLive)
+ Op.setIsKill(true);
for (MCSubRegIterator SR(R, &TRI, true); SR.isValid(); ++SR)
Live.set(*SR);
}
@@ -813,17 +910,6 @@ void Liveness::resetKills(MachineBasicBlock *B) {
}
-RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const {
- assert(DFG.IsRef<NodeAttrs::Use>(RA));
- if (RA.Addr->getFlags() & NodeAttrs::Shadow) {
- NodeId RD = RA.Addr->getReachingDef();
- assert(RD);
- RA = DFG.addr<DefNode*>(RD);
- }
- return RA.Addr->getRegRef(DFG);
-}
-
-
// Helper function to obtain the basic block containing the reaching def
// of the given use.
MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
@@ -921,7 +1007,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
// propagated upwards. This only applies to non-preserving defs,
// and to the parts of the register actually covered by those defs.
// (Note that phi defs should always be preserving.)
- RegisterAggr RRs(TRI);
+ RegisterAggr RRs(PRI);
LRef.Mask = OR.second;
if (!DFG.IsPreservingDef(DA)) {
@@ -949,10 +1035,9 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
// registers are not covering LRef. The first def from the
// upward chain will be live.
// Subtract all accumulated defs (RRs) from LRef.
- RegisterAggr L(TRI);
- L.insert(LRef).clear(RRs);
- assert(!L.empty());
- NewDefs.insert({TA.Id,L.begin()->second});
+ RegisterRef T = RRs.clearIn(LRef);
+ assert(T);
+ NewDefs.insert({TA.Id,T.Mask});
break;
}
@@ -983,7 +1068,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
if (UA.Addr->getFlags() & NodeAttrs::Undef)
continue;
- RegisterRef RR = DFG.normalizeRef(UA.Addr->getRegRef(DFG));
+ RegisterRef RR = PRI.normalize(UA.Addr->getRegRef(DFG));
for (NodeAddr<DefNode*> D : getAllReachingDefs(UA))
if (getBlockWithRef(D.Id) != B)
LiveIn[RR.Reg].insert({D.Id,RR.Mask});
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
index c88396f..6f2615b 100644
--- a/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
+++ b/contrib/llvm/lib/Target/Hexagon/RDFLiveness.h
@@ -33,7 +33,7 @@ namespace rdf {
// This is really a std::map, except that it provides a non-trivial
// default constructor to the element accessed via [].
struct LiveMapType {
- LiveMapType(const TargetRegisterInfo &tri) : Empty(tri) {}
+ LiveMapType(const PhysicalRegisterInfo &pri) : Empty(pri) {}
RegisterAggr &operator[] (MachineBasicBlock *B) {
return Map.emplace(B, Empty).first->second;
@@ -49,26 +49,31 @@ namespace rdf {
typedef std::map<RegisterId,NodeRefSet> RefMap;
Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
- : DFG(g), TRI(g.getTRI()), MDT(g.getDT()), MDF(g.getDF()),
- MRI(mri), LiveMap(g.getTRI()), Empty(), NoRegs(g.getTRI()),
- Trace(false) {}
+ : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
+ MDF(g.getDF()), LiveMap(g.getPRI()), Empty(),
+ NoRegs(g.getPRI()), Trace(false) {}
NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- bool FullChain, const RegisterAggr &DefRRs);
+ bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false, NoRegs);
+ return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false,
+ false, NoRegs);
}
NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefRR, RefA, false, NoRegs);
+ return getAllReachingDefs(RefRR, RefA, false, false, NoRegs);
}
- NodeSet getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- NodeSet &Visited, const NodeSet &Defs);
NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
const RegisterAggr &DefRRs);
NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
return getAllReachedUses(RefRR, DefA, NoRegs);
}
+ std::pair<NodeSet,bool> getAllReachingDefsRec(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs);
+
+ NodeAddr<RefNode*> getNearestAliasedRef(RegisterRef RefRR,
+ NodeAddr<InstrNode*> IA);
+
LiveMapType &getLiveMap() { return LiveMap; }
const LiveMapType &getLiveMap() const { return LiveMap; }
const RefMap &getRealUses(NodeId P) const {
@@ -87,9 +92,9 @@ namespace rdf {
private:
const DataFlowGraph &DFG;
const TargetRegisterInfo &TRI;
+ const PhysicalRegisterInfo &PRI;
const MachineDominatorTree &MDT;
const MachineDominanceFrontier &MDF;
- MachineRegisterInfo &MRI;
LiveMapType LiveMap;
const RefMap Empty;
const RegisterAggr NoRegs;
@@ -121,12 +126,13 @@ namespace rdf {
// the dominator tree), create a map: block -> set of uses live on exit.
std::map<MachineBasicBlock*,RefMap> PhiLOX;
- bool isRestrictedToRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
- RegisterRef RR) const;
- RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const;
MachineBasicBlock *getBlockWithRef(NodeId RN) const;
void traverse(MachineBasicBlock *B, RefMap &LiveIn);
void emptify(RefMap &M);
+
+ std::pair<NodeSet,bool> getAllReachingDefsRecImpl(RegisterRef RefRR,
+ NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs,
+ unsigned Nest, unsigned MaxNest);
};
} // namespace rdf
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFRegisters.cpp b/contrib/llvm/lib/Target/Hexagon/RDFRegisters.cpp
new file mode 100644
index 0000000..2aabf4e
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFRegisters.cpp
@@ -0,0 +1,372 @@
+//===--- RDFRegisters.cpp ---------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RDFRegisters.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
+using namespace llvm;
+using namespace rdf;
+
+PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
+ const MachineFunction &mf)
+ : TRI(tri) {
+ RegInfos.resize(TRI.getNumRegs());
+
+ BitVector BadRC(TRI.getNumRegs());
+ for (const TargetRegisterClass *RC : TRI.regclasses()) {
+ for (MCPhysReg R : *RC) {
+ RegInfo &RI = RegInfos[R];
+ if (RI.RegClass != nullptr && !BadRC[R]) {
+ if (RC->LaneMask != RI.RegClass->LaneMask) {
+ BadRC.set(R);
+ RI.RegClass = nullptr;
+ }
+ } else
+ RI.RegClass = RC;
+ }
+ }
+
+ UnitInfos.resize(TRI.getNumRegUnits());
+
+ for (uint32_t U = 0, NU = TRI.getNumRegUnits(); U != NU; ++U) {
+ if (UnitInfos[U].Reg != 0)
+ continue;
+ MCRegUnitRootIterator R(U, &TRI);
+ assert(R.isValid());
+ RegisterId F = *R;
+ ++R;
+ if (R.isValid()) {
+ UnitInfos[U].Mask = LaneBitmask::getAll();
+ UnitInfos[U].Reg = F;
+ } else {
+ for (MCRegUnitMaskIterator I(F, &TRI); I.isValid(); ++I) {
+ std::pair<uint32_t,LaneBitmask> P = *I;
+ UnitInfo &UI = UnitInfos[P.first];
+ UI.Reg = F;
+ if (P.second.any()) {
+ UI.Mask = P.second;
+ } else {
+ if (const TargetRegisterClass *RC = RegInfos[F].RegClass)
+ UI.Mask = RC->LaneMask;
+ else
+ UI.Mask = LaneBitmask::getAll();
+ }
+ }
+ }
+ }
+
+ for (const uint32_t *RM : TRI.getRegMasks())
+ RegMasks.insert(RM);
+ for (const MachineBasicBlock &B : mf)
+ for (const MachineInstr &In : B)
+ for (const MachineOperand &Op : In.operands())
+ if (Op.isRegMask())
+ RegMasks.insert(Op.getRegMask());
+
+ MaskInfos.resize(RegMasks.size()+1);
+ for (uint32_t M = 1, NM = RegMasks.size(); M <= NM; ++M) {
+ BitVector PU(TRI.getNumRegUnits());
+ const uint32_t *MB = RegMasks.get(M);
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
+ if (!(MB[i/32] & (1u << (i%32))))
+ continue;
+ for (MCRegUnitIterator U(i, &TRI); U.isValid(); ++U)
+ PU.set(*U);
+ }
+ MaskInfos[M].Units = PU.flip();
+ }
+}
+
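The MaskInfos loop in the constructor above derives, for each regmask, the set of register units it can clobber: mark the units of every preserved register, then flip the vector. The same computation with ordinary containers (unitsOf stands in for MCRegUnitIterator and is an assumption):

    #include <cstdint>
    #include <set>
    #include <vector>

    using RegUnit = uint32_t;

    std::set<RegUnit> clobberedUnits(const std::vector<uint32_t> &Mask,
                                     unsigned NumRegs, unsigned NumUnits,
                                     std::set<RegUnit> (*unitsOf)(unsigned)) {
      std::vector<bool> Preserved(NumUnits, false);
      for (unsigned R = 1; R != NumRegs; ++R)
        if (Mask[R / 32] & (1u << (R % 32)))   // R is preserved
          for (RegUnit U : unitsOf(R))
            Preserved[U] = true;
      std::set<RegUnit> Clobbered;             // the "flip" step
      for (RegUnit U = 0; U != NumUnits; ++U)
        if (!Preserved[U])
          Clobbered.insert(U);
      return Clobbered;
    }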
+RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const {
+ return RR;
+}
+
+std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
+  // Do not include Reg itself in the alias set.
+ std::set<RegisterId> AS;
+ assert(isRegMaskId(Reg) || TargetRegisterInfo::isPhysicalRegister(Reg));
+ if (isRegMaskId(Reg)) {
+ // XXX SLOW
+ const uint32_t *MB = getRegMaskBits(Reg);
+ for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
+ if (MB[i/32] & (1u << (i%32)))
+ continue;
+ AS.insert(i);
+ }
+ for (const uint32_t *RM : RegMasks) {
+ RegisterId MI = getRegMaskId(RM);
+ if (MI != Reg && aliasMM(RegisterRef(Reg), RegisterRef(MI)))
+ AS.insert(MI);
+ }
+ return AS;
+ }
+
+ for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
+ AS.insert(*AI);
+ for (const uint32_t *RM : RegMasks) {
+ RegisterId MI = getRegMaskId(RM);
+ if (aliasRM(RegisterRef(Reg), RegisterRef(MI)))
+ AS.insert(MI);
+ }
+ return AS;
+}
+
+bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
+ assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
+
+ MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
+ MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
+  // Reg units are returned in numerical order.
+ while (UMA.isValid() && UMB.isValid()) {
+ // Skip units that are masked off in RA.
+ std::pair<RegisterId,LaneBitmask> PA = *UMA;
+ if (PA.second.any() && (PA.second & RA.Mask).none()) {
+ ++UMA;
+ continue;
+ }
+ // Skip units that are masked off in RB.
+ std::pair<RegisterId,LaneBitmask> PB = *UMB;
+ if (PB.second.any() && (PB.second & RB.Mask).none()) {
+ ++UMB;
+ continue;
+ }
+
+ if (PA.first == PB.first)
+ return true;
+ if (PA.first < PB.first)
+ ++UMA;
+ else if (PB.first < PA.first)
+ ++UMB;
+ }
+ return false;
+}
+
+bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(RR.Reg) && isRegMaskId(RM.Reg));
+ const uint32_t *MB = getRegMaskBits(RM.Reg);
+ bool Preserved = MB[RR.Reg/32] & (1u << (RR.Reg%32));
+ // If the lane mask information is "full", e.g. when the given lane mask
+ // is a superset of the lane mask from the register class, check the regmask
+ // bit directly.
+ if (RR.Mask == LaneBitmask::getAll())
+ return !Preserved;
+ const TargetRegisterClass *RC = RegInfos[RR.Reg].RegClass;
+ if (RC != nullptr && (RR.Mask & RC->LaneMask) == RC->LaneMask)
+ return !Preserved;
+
+ // Otherwise, check all subregisters whose lane mask overlaps the given
+ // mask. For each such register, if it is preserved by the regmask, then
+ // clear the corresponding bits in the given mask. If at the end, all
+ // bits have been cleared, the register does not alias the regmask (i.e.
+  // it is preserved by it).
+ LaneBitmask M = RR.Mask;
+ for (MCSubRegIndexIterator SI(RR.Reg, &TRI); SI.isValid(); ++SI) {
+ LaneBitmask SM = TRI.getSubRegIndexLaneMask(SI.getSubRegIndex());
+ if ((SM & RR.Mask).none())
+ continue;
+ unsigned SR = SI.getSubReg();
+ if (!(MB[SR/32] & (1u << (SR%32))))
+ continue;
+ // The subregister SR is preserved.
+ M &= ~SM;
+ if (M.none())
+ return false;
+ }
+
+ return true;
+}
+
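The lane-subtraction loop above can be modeled compactly: start from the queried lanes, clear the lanes of every preserved subregister, and report an alias only if some lane survives. A sketch with hypothetical subregister data (SubRegInfo replaces MCSubRegIndexIterator):

    #include <cstdint>
    #include <vector>

    struct SubRegInfo {
      unsigned SubReg;    // subregister number
      uint64_t Lanes;     // its lanes within the enclosing register
    };

    // True if some queried lane of the register can be clobbered by the
    // mask, i.e. is not covered by any preserved subregister.
    bool aliasesRegMask(uint64_t QueryLanes,
                        const std::vector<SubRegInfo> &SubRegs,
                        const std::vector<uint32_t> &Mask) {
      uint64_t M = QueryLanes;
      for (const SubRegInfo &S : SubRegs) {
        if ((S.Lanes & QueryLanes) == 0)
          continue;                              // lanes not queried
        if (!(Mask[S.SubReg / 32] & (1u << (S.SubReg % 32))))
          continue;                              // subreg not preserved
        M &= ~S.Lanes;                           // preserved: clear lanes
        if (M == 0)
          return false;                          // fully preserved
      }
      return true;
    }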
+bool PhysicalRegisterInfo::aliasMM(RegisterRef RM, RegisterRef RN) const {
+ assert(isRegMaskId(RM.Reg) && isRegMaskId(RN.Reg));
+ unsigned NumRegs = TRI.getNumRegs();
+ const uint32_t *BM = getRegMaskBits(RM.Reg);
+ const uint32_t *BN = getRegMaskBits(RN.Reg);
+
+ for (unsigned w = 0, nw = NumRegs/32; w != nw; ++w) {
+ // Intersect the negations of both words. Disregard reg=0,
+ // i.e. 0th bit in the 0th word.
+ uint32_t C = ~BM[w] & ~BN[w];
+ if (w == 0)
+ C &= ~1;
+ if (C)
+ return true;
+ }
+
+ // Check the remaining registers in the last word.
+ unsigned TailRegs = NumRegs % 32;
+ if (TailRegs == 0)
+ return false;
+ unsigned TW = NumRegs / 32;
+ uint32_t TailMask = (1u << TailRegs) - 1;
+ if (~BM[TW] & ~BN[TW] & TailMask)
+ return true;
+
+ return false;
+}
+
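aliasMM above asks whether any register other than the invalid register 0 is clobbered by both masks; its word-wise loop is an optimization of this naive per-register check (a sketch, not the committed code):

    #include <cstdint>
    #include <vector>

    bool masksAlias(const std::vector<uint32_t> &A,
                    const std::vector<uint32_t> &B, unsigned NumRegs) {
      for (unsigned R = 1; R != NumRegs; ++R) {
        bool ClobberedByA = !(A[R / 32] & (1u << (R % 32)));
        bool ClobberedByB = !(B[R / 32] & (1u << (R % 32)));
        if (ClobberedByA && ClobberedByB)
          return true;
      }
      return false;
    }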
+RegisterRef PhysicalRegisterInfo::mapTo(RegisterRef RR, unsigned R) const {
+ if (RR.Reg == R)
+ return RR;
+ if (unsigned Idx = TRI.getSubRegIndex(R, RR.Reg))
+ return RegisterRef(R, TRI.composeSubRegIndexLaneMask(Idx, RR.Mask));
+ if (unsigned Idx = TRI.getSubRegIndex(RR.Reg, R)) {
+ const RegInfo &RI = RegInfos[R];
+ LaneBitmask RCM = RI.RegClass ? RI.RegClass->LaneMask
+ : LaneBitmask::getAll();
+ LaneBitmask M = TRI.reverseComposeSubRegIndexLaneMask(Idx, RR.Mask);
+ return RegisterRef(R, M & RCM);
+ }
+ llvm_unreachable("Invalid arguments: unrelated registers?");
+}
+
+
+bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg))
+ return Units.anyCommon(PRI.getMaskUnits(RR.Reg));
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (Units.test(P.first))
+ return true;
+ }
+ return false;
+}
+
+bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
+ BitVector T(PRI.getMaskUnits(RR.Reg));
+ return T.reset(Units).none();
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ if (!Units.test(P.first))
+ return false;
+ }
+ return true;
+}
+
+RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
+ if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
+ Units |= PRI.getMaskUnits(RR.Reg);
+ return *this;
+ }
+
+ for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+ std::pair<uint32_t,LaneBitmask> P = *U;
+ if (P.second.none() || (P.second & RR.Mask).any())
+ Units.set(P.first);
+ }
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::insert(const RegisterAggr &RG) {
+ Units |= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::intersect(RegisterRef RR) {
+ return intersect(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::intersect(const RegisterAggr &RG) {
+ Units &= RG.Units;
+ return *this;
+}
+
+RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
+ return clear(RegisterAggr(PRI).insert(RR));
+}
+
+RegisterAggr &RegisterAggr::clear(const RegisterAggr &RG) {
+ Units.reset(RG.Units);
+ return *this;
+}
+
+RegisterRef RegisterAggr::intersectWith(RegisterRef RR) const {
+ RegisterAggr T(PRI);
+ T.insert(RR).intersect(*this);
+ if (T.empty())
+ return RegisterRef();
+ RegisterRef NR = T.makeRegRef();
+ assert(NR);
+ return NR;
+}
+
+RegisterRef RegisterAggr::clearIn(RegisterRef RR) const {
+ return RegisterAggr(PRI).insert(RR).clear(*this).makeRegRef();
+}
+
+RegisterRef RegisterAggr::makeRegRef() const {
+ int U = Units.find_first();
+ if (U < 0)
+ return RegisterRef();
+
+ auto AliasedRegs = [this] (uint32_t Unit, BitVector &Regs) {
+ for (MCRegUnitRootIterator R(Unit, &PRI.getTRI()); R.isValid(); ++R)
+ for (MCSuperRegIterator S(*R, &PRI.getTRI(), true); S.isValid(); ++S)
+ Regs.set(*S);
+ };
+
+ // Find the set of all registers that are aliased to all the units
+ // in this aggregate.
+
+ // Get all the registers aliased to the first unit in the bit vector.
+ BitVector Regs(PRI.getTRI().getNumRegs());
+ AliasedRegs(U, Regs);
+ U = Units.find_next(U);
+
+  // For each other unit, intersect it with the set of all registers
+  // aliased to that unit.
+ while (U >= 0) {
+ BitVector AR(PRI.getTRI().getNumRegs());
+ AliasedRegs(U, AR);
+ Regs &= AR;
+ U = Units.find_next(U);
+ }
+
+ // If there is at least one register remaining, pick the first one,
+ // and consolidate the masks of all of its units contained in this
+ // aggregate.
+
+ int F = Regs.find_first();
+ if (F <= 0)
+ return RegisterRef();
+
+ LaneBitmask M;
+ for (MCRegUnitMaskIterator I(F, &PRI.getTRI()); I.isValid(); ++I) {
+ std::pair<uint32_t,LaneBitmask> P = *I;
+ if (Units.test(P.first))
+ M |= P.second.none() ? LaneBitmask::getAll() : P.second;
+ }
+ return RegisterRef(F, M);
+}
+
+void RegisterAggr::print(raw_ostream &OS) const {
+ OS << '{';
+ for (int U = Units.find_first(); U >= 0; U = Units.find_next(U))
+ OS << ' ' << PrintRegUnit(U, &PRI.getTRI());
+ OS << " }";
+}
+
+RegisterAggr::rr_iterator::rr_iterator(const RegisterAggr &RG,
+ bool End)
+ : Owner(&RG) {
+ for (int U = RG.Units.find_first(); U >= 0; U = RG.Units.find_next(U)) {
+ RegisterRef R = RG.PRI.getRefForUnit(U);
+ Masks[R.Reg] |= R.Mask;
+ }
+ Pos = End ? Masks.end() : Masks.begin();
+ Index = End ? Masks.size() : 0;
+}
+
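A note on the regmask convention the aliasRM/aliasMM code above depends on: a register mask is an array of 32-bit words in which a set bit marks the register as preserved by the call and a clear bit marks it as clobbered. A minimal standalone sketch (not part of the patch) of the bit test written inline above as MB[SR/32] & (1u << (SR%32)):

  #include <cstdint>

  // A set bit in a regmask means the register is preserved across the call.
  static bool isPreserved(const uint32_t *Mask, unsigned Reg) {
    return Mask[Reg / 32] & (1u << (Reg % 32));  // word Reg/32, bit Reg%32
  }
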
diff --git a/contrib/llvm/lib/Target/Hexagon/RDFRegisters.h b/contrib/llvm/lib/Target/Hexagon/RDFRegisters.h
new file mode 100644
index 0000000..09b733c
--- /dev/null
+++ b/contrib/llvm/lib/Target/Hexagon/RDFRegisters.h
@@ -0,0 +1,219 @@
+//===--- RDFRegisters.h -----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
+#define LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+namespace rdf {
+
+ typedef uint32_t RegisterId;
+
+ // Template class for a map translating uint32_t into arbitrary types.
+ // The map will act like an indexed set: upon insertion of a new object,
+ // it will automatically assign a new index to it. Index of 0 is treated
+ // as invalid and is never allocated.
+ template <typename T, unsigned N = 32>
+ struct IndexedSet {
+ IndexedSet() : Map() { Map.reserve(N); }
+
+ T get(uint32_t Idx) const {
+ // Index Idx corresponds to Map[Idx-1].
+ assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
+ return Map[Idx-1];
+ }
+
+ uint32_t insert(T Val) {
+ // Linear search.
+ auto F = llvm::find(Map, Val);
+ if (F != Map.end())
+ return F - Map.begin() + 1;
+ Map.push_back(Val);
+ return Map.size(); // Return actual_index + 1.
+ }
+
+ uint32_t find(T Val) const {
+ auto F = llvm::find(Map, Val);
+ assert(F != Map.end());
+ return F - Map.begin() + 1;
+ }
+
+ uint32_t size() const { return Map.size(); }
+
+ typedef typename std::vector<T>::const_iterator const_iterator;
+ const_iterator begin() const { return Map.begin(); }
+ const_iterator end() const { return Map.end(); }
+
+ private:
+ std::vector<T> Map;
+ };
+
+ struct RegisterRef {
+ RegisterId Reg = 0;
+ LaneBitmask Mask = LaneBitmask::getNone();
+
+ RegisterRef() = default;
+ explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
+ : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
+
+ operator bool() const {
+ return Reg != 0 && Mask.any();
+ }
+ bool operator== (const RegisterRef &RR) const {
+ return Reg == RR.Reg && Mask == RR.Mask;
+ }
+ bool operator!= (const RegisterRef &RR) const {
+ return !operator==(RR);
+ }
+ bool operator< (const RegisterRef &RR) const {
+ return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
+ }
+ };
+
+
+ struct PhysicalRegisterInfo {
+ PhysicalRegisterInfo(const TargetRegisterInfo &tri,
+ const MachineFunction &mf);
+
+ static bool isRegMaskId(RegisterId R) {
+ return TargetRegisterInfo::isStackSlot(R);
+ }
+ RegisterId getRegMaskId(const uint32_t *RM) const {
+ return TargetRegisterInfo::index2StackSlot(RegMasks.find(RM));
+ }
+ const uint32_t *getRegMaskBits(RegisterId R) const {
+ return RegMasks.get(TargetRegisterInfo::stackSlot2Index(R));
+ }
+ RegisterRef normalize(RegisterRef RR) const;
+
+ bool alias(RegisterRef RA, RegisterRef RB) const {
+ if (!isRegMaskId(RA.Reg))
+ return !isRegMaskId(RB.Reg) ? aliasRR(RA, RB) : aliasRM(RA, RB);
+ return !isRegMaskId(RB.Reg) ? aliasRM(RB, RA) : aliasMM(RA, RB);
+ }
+ std::set<RegisterId> getAliasSet(RegisterId Reg) const;
+
+ RegisterRef getRefForUnit(uint32_t U) const {
+ return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask);
+ }
+ const BitVector &getMaskUnits(RegisterId MaskId) const {
+ return MaskInfos[TargetRegisterInfo::stackSlot2Index(MaskId)].Units;
+ }
+ RegisterRef mapTo(RegisterRef RR, unsigned R) const;
+
+ const TargetRegisterInfo &getTRI() const { return TRI; }
+
+ private:
+ struct RegInfo {
+ const TargetRegisterClass *RegClass = nullptr;
+ };
+ struct UnitInfo {
+ RegisterId Reg = 0;
+ LaneBitmask Mask;
+ };
+ struct MaskInfo {
+ BitVector Units;
+ };
+
+ const TargetRegisterInfo &TRI;
+ IndexedSet<const uint32_t*> RegMasks;
+ std::vector<RegInfo> RegInfos;
+ std::vector<UnitInfo> UnitInfos;
+ std::vector<MaskInfo> MaskInfos;
+
+ bool aliasRR(RegisterRef RA, RegisterRef RB) const;
+ bool aliasRM(RegisterRef RR, RegisterRef RM) const;
+ bool aliasMM(RegisterRef RM, RegisterRef RN) const;
+ };
+
+
+ struct RegisterAggr {
+ RegisterAggr(const PhysicalRegisterInfo &pri)
+ : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {}
+ RegisterAggr(const RegisterAggr &RG) = default;
+
+ bool empty() const { return Units.none(); }
+ bool hasAliasOf(RegisterRef RR) const;
+ bool hasCoverOf(RegisterRef RR) const;
+ static bool isCoverOf(RegisterRef RA, RegisterRef RB,
+ const PhysicalRegisterInfo &PRI) {
+ return RegisterAggr(PRI).insert(RA).hasCoverOf(RB);
+ }
+
+ RegisterAggr &insert(RegisterRef RR);
+ RegisterAggr &insert(const RegisterAggr &RG);
+ RegisterAggr &intersect(RegisterRef RR);
+ RegisterAggr &intersect(const RegisterAggr &RG);
+ RegisterAggr &clear(RegisterRef RR);
+ RegisterAggr &clear(const RegisterAggr &RG);
+
+ RegisterRef intersectWith(RegisterRef RR) const;
+ RegisterRef clearIn(RegisterRef RR) const;
+ RegisterRef makeRegRef() const;
+
+ void print(raw_ostream &OS) const;
+
+ struct rr_iterator {
+ typedef std::map<RegisterId,LaneBitmask> MapType;
+ private:
+ MapType Masks;
+ MapType::iterator Pos;
+ unsigned Index;
+ const RegisterAggr *Owner;
+ public:
+ rr_iterator(const RegisterAggr &RG, bool End);
+ RegisterRef operator*() const {
+ return RegisterRef(Pos->first, Pos->second);
+ }
+ rr_iterator &operator++() {
+ ++Pos;
+ ++Index;
+ return *this;
+ }
+ bool operator==(const rr_iterator &I) const {
+ assert(Owner == I.Owner);
+ return Index == I.Index;
+ }
+ bool operator!=(const rr_iterator &I) const {
+ return !(*this == I);
+ }
+ };
+
+ rr_iterator rr_begin() const {
+ return rr_iterator(*this, false);
+ }
+ rr_iterator rr_end() const {
+ return rr_iterator(*this, true);
+ }
+
+ private:
+ BitVector Units;
+ const PhysicalRegisterInfo &PRI;
+ };
+
+
+ // Optionally print the lane mask, if it is not ~0.
+ struct PrintLaneMaskOpt {
+ PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
+ LaneBitmask Mask;
+ };
+ raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
+
+} // namespace rdf
+} // namespace llvm
+
+#endif
+
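A usage note on the IndexedSet template declared above: indices are 1-based because 0 is reserved as the invalid id, insert() returns the existing index for duplicates, and get() inverts insert(). An illustrative sketch (not part of the patch), using int as the payload type:

  #include <cassert>
  #include <cstdint>
  #include "RDFRegisters.h"

  static void indexedSetDemo() {
    rdf::IndexedSet<int> S;
    uint32_t A = S.insert(7);  // first element gets index 1; index 0 is invalid
    uint32_t B = S.insert(9);  // second distinct value gets index 2
    assert(S.insert(7) == A);  // duplicates map back to the existing index
    assert(S.get(B) == 9);     // get() is the inverse of insert()
  }
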
diff --git a/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 57ead97..1394ac7 100644
--- a/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -28,8 +28,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -787,6 +787,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseImmediate() {
case AsmToken::Dot:
if (!Parser.parseExpression(ExprVal))
return LanaiOperand::createImm(ExprVal, Start, End);
+ LLVM_FALLTHROUGH;
default:
return nullptr;
}
@@ -1096,7 +1097,7 @@ StringRef LanaiAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
return Mnemonic;
}
-bool IsMemoryAssignmentError(const OperandVector &Operands) {
+static bool IsMemoryAssignmentError(const OperandVector &Operands) {
// Detects if a memory operation has an erroneous base register modification.
// Memory operations are detected by matching the types of operands.
//
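This is one of several hunks in this import that mark file-local helpers static (IsMemoryAssignmentError here; FixupKind and adjustPqBits in the Lanai MC code emitter below). The point is internal linkage: a non-static helper at namespace scope in a .cpp file is visible to the linker and can collide with a same-named symbol in another translation unit. A two-line illustration:

  static bool localHelper() { return true; }  // internal linkage: private to this TU
  bool exportedHelper() { return true; }      // external linkage: can clash at link time
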
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index d156294..0a9cac2 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "LanaiISelLowering.h"
#include "Lanai.h"
#include "LanaiCondCode.h"
-#include "LanaiISelLowering.h"
#include "LanaiMachineFunctionInfo.h"
#include "LanaiSubtarget.h"
#include "LanaiTargetObjectFile.h"
@@ -38,10 +38,11 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
@@ -649,10 +650,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo(
ByValArgs.push_back(FIPtr);
}
- Chain = DAG.getCALLSEQ_START(
- Chain,
- DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1502,3 +1500,24 @@ SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue();
}
+
+void LanaiTargetLowering::computeKnownBitsForTargetNode(
+ const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
+ const SelectionDAG &DAG, unsigned Depth) const {
+ unsigned BitWidth = Known.getBitWidth();
+ switch (Op.getOpcode()) {
+ default:
+ break;
+ case LanaiISD::SETCC:
+ Known = KnownBits(BitWidth);
+ Known.Zero.setBits(1, BitWidth);
+ break;
+ case LanaiISD::SELECT_CC:
+ KnownBits Known2;
+ DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
+ break;
+ }
+}
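The SETCC case above records that LanaiISD::SETCC materializes a boolean 0 or 1, so every bit above bit 0 is known zero; the SELECT_CC case keeps only the bits known in both operands. A sketch of the boolean case using the KnownBits helper this import introduces:

  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // Known bits for a value that is provably 0 or 1.
  static KnownBits boolKnownBits(unsigned BitWidth) {
    KnownBits Known(BitWidth);
    Known.Zero.setBits(1, BitWidth);  // bits [1, BitWidth) are known zero
    return Known;                     // bit 0 stays unknown: it may be 0 or 1
  }
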
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
index c2fba4f..49ad52a 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
+++ b/contrib/llvm/lib/Target/Lanai/LanaiISelLowering.h
@@ -106,6 +106,11 @@ public:
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
+
private:
SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool IsVarArg,
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index fcd5da8..a7c9a7a 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -518,7 +518,7 @@ LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
const MCInstrDesc &DefDesc = DefMI->getDesc();
for (unsigned i = 1, e = DefDesc.getNumOperands();
i != e && !DefDesc.OpInfo[i].isPredicate(); ++i)
- NewMI.addOperand(DefMI->getOperand(i));
+ NewMI.add(DefMI->getOperand(i));
unsigned CondCode = MI.getOperand(3).getImm();
if (Invert)
@@ -531,7 +531,7 @@ LanaiInstrInfo::optimizeSelect(MachineInstr &MI,
// register operand tied to the first def. The tie makes the register
// allocator ensure the FalseReg is allocated the same register as operand 0.
FalseReg.setImplicit();
- NewMI.addOperand(FalseReg);
+ NewMI.add(FalseReg);
NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
// Update SeenMIs set: register newly created MI and erase removed DefMI.
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
index 285fca1..776fee1 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
+++ b/contrib/llvm/lib/Target/Lanai/LanaiInstrInfo.td
@@ -22,7 +22,8 @@ include "LanaiInstrFormats.td"
// -------------------------------------------------- //
// These are target-independent nodes, but have target-specific formats.
-def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
@@ -750,9 +751,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SP.
let Defs = [SP], Uses = [SP] in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "#ADJCALLSTACKDOWN $amt",
- [(CallSeqStart timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(CallSeqStart timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP $amt1 $amt2",
[(CallSeqEnd timm:$amt1, timm:$amt2)]>;
@@ -770,9 +771,6 @@ let Uses = [SR] in {
[(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>;
}
-// SCC's output is already 1-bit so and'ing with 1 is redundant.
-def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>;
-
// Select with hardware support
let Uses = [SR], isSelect = 1 in {
def SELECT : InstRR<0b111, (outs GPR:$Rd),
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
index 39c6335..90ede65 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiMCInstLower.cpp
@@ -130,7 +130,7 @@ void LanaiMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
break;
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
}
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
index 12a2571..fe54589 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -264,12 +264,6 @@ LanaiRegisterInfo::getFrameRegister(const MachineFunction & /*MF*/) const {
unsigned LanaiRegisterInfo::getBaseRegister() const { return Lanai::R14; }
-bool LanaiRegisterInfo::canRealignStack(const MachineFunction &MF) const {
- if (!TargetRegisterInfo::canRealignStack(MF))
- return false;
- return true;
-}
-
unsigned LanaiRegisterInfo::getEHExceptionRegister() const {
llvm_unreachable("no exception support");
return 0;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
index c6e4590..d88a191 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
+++ b/contrib/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
@@ -41,8 +41,6 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo {
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
- bool canRealignStack(const MachineFunction &MF) const override;
-
// Debug information queries.
unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const override;
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
index 2a9bc25..a2f005c 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -76,7 +76,7 @@ namespace {
// Lanai Code Generator Pass Configuration Options.
class LanaiPassConfig : public TargetPassConfig {
public:
- LanaiPassConfig(LanaiTargetMachine *TM, PassManagerBase *PassManager)
+ LanaiPassConfig(LanaiTargetMachine &TM, PassManagerBase *PassManager)
: TargetPassConfig(TM, *PassManager) {}
LanaiTargetMachine &getLanaiTargetMachine() const {
@@ -91,7 +91,7 @@ public:
TargetPassConfig *
LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) {
- return new LanaiPassConfig(this, &PassManager);
+ return new LanaiPassConfig(*this, &PassManager);
}
// Install an instruction selector pass.
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h
index 5278c70..083ba6f 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetMachine.h
@@ -49,6 +49,10 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
} // namespace llvm
diff --git a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
index 7475dbd..38e7510 100644
--- a/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -10,13 +10,13 @@
#include "LanaiSubtarget.h"
#include "LanaiTargetMachine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index a04fe81..bbce5f6 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -49,8 +49,9 @@ public:
LanaiAsmBackend(const Target &T, Triple::OSType OST)
: MCAsmBackend(), OSType(OST) {}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -88,9 +89,10 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned /*DataSize*/, uint64_t Value,
- bool /*IsPCRel*/) const {
+void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool /*IsResolved*/) const {
MCFixupKind Kind = Fixup.getKind();
Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
index e02bba5..64cd334 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiELFObjectWriter.cpp
@@ -9,8 +9,8 @@
#include "MCTargetDesc/LanaiBaseInfo.h"
#include "MCTargetDesc/LanaiFixupKinds.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index f5b5335..c372741 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -19,8 +19,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -89,7 +89,7 @@ public:
} // end anonymous namespace
-Lanai::Fixups FixupKind(const MCExpr *Expr) {
+static Lanai::Fixups FixupKind(const MCExpr *Expr) {
if (isa<MCSymbolRefExpr>(Expr))
return Lanai::FIXUP_LANAI_21;
if (const LanaiMCExpr *McExpr = dyn_cast<LanaiMCExpr>(Expr)) {
@@ -134,8 +134,8 @@ unsigned LanaiMCCodeEmitter::getMachineOpValue(
}
// Helper function to adjust P and Q bits on load and store instructions.
-unsigned adjustPqBits(const MCInst &Inst, unsigned Value, unsigned PBitShift,
- unsigned QBitShift) {
+static unsigned adjustPqBits(const MCInst &Inst, unsigned Value,
+ unsigned PBitShift, unsigned QBitShift) {
const MCOperand AluOp = Inst.getOperand(3);
unsigned AluCode = AluOp.getImm();
diff --git a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index a47ff9f..bcbde2b 100644
--- a/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "LanaiMCAsmInfo.h"
#include "LanaiMCTargetDesc.h"
#include "InstPrinter/LanaiInstPrinter.h"
+#include "LanaiMCAsmInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCInst.h"
diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index c26b308..82e6731 100644
--- a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
void MSP430MCAsmInfo::anchor() { }
MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT) {
- PointerSize = CalleeSaveStackSlotSize = 2;
+ CodePointerSize = CalleeSaveStackSlotSize = 2;
CommentString = ";";
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430.td b/contrib/llvm/lib/Target/MSP430/MSP430.td
index dfea669..203864d 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430.td
+++ b/contrib/llvm/lib/Target/MSP430/MSP430.td
@@ -22,6 +22,18 @@ def FeatureX
: SubtargetFeature<"ext", "ExtendedInsts", "true",
"Enable MSP430-X extensions">;
+def FeatureHWMult16
+ : SubtargetFeature<"hwmult16", "HWMultMode", "HWMult16",
+ "Enable 16-bit hardware multiplier">;
+
+def FeatureHWMult32
+ : SubtargetFeature<"hwmult32", "HWMultMode", "HWMult32",
+ "Enable 32-bit hardware multiplier">;
+
+def FeatureHWMultF5
+ : SubtargetFeature<"hwmultf5", "HWMultMode", "HWMultF5",
+ "Enable F5 series hardware multiplier">;
+
//===----------------------------------------------------------------------===//
// MSP430 supported processors.
//===----------------------------------------------------------------------===//
@@ -29,6 +41,8 @@ class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
def : Proc<"generic", []>;
+def : Proc<"msp430", []>;
+def : Proc<"msp430x", [FeatureX]>;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index abf062f..f39c21f 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "MSP430.h"
#include "InstPrinter/MSP430InstPrinter.h"
+#include "MSP430.h"
#include "MSP430InstrInfo.h"
#include "MSP430MCInstLower.h"
#include "MSP430TargetMachine.h"
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
index 5fd6b63..424b5ae 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -194,8 +194,8 @@ bool MSP430BSel::expandBranches(OffsetVector &BlockOffsets) {
// Jump over the long branch on the opposite condition
TII->reverseBranchCondition(Cond);
MI = BuildMI(*MBB, MI, dl, TII->get(MSP430::JCC))
- .addMBB(NextMBB)
- .addOperand(Cond[0]);
+ .addMBB(NextMBB)
+ .add(Cond[0]);
InstrSizeDiff += TII->getInstSizeInBytes(*MI);
++MI;
}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
index b38f578..0434f8a 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
+++ b/contrib/llvm/lib/Target/MSP430/MSP430CallingConv.td
@@ -13,11 +13,11 @@
// MSP430 Return Value Calling Convention
//===----------------------------------------------------------------------===//
def RetCC_MSP430 : CallingConv<[
- // i8 are returned in registers R15B, R14B, R13B, R12B
- CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+ // i8 are returned in registers R12B, R13B, R14B, R15B
+ CCIfType<[i8], CCAssignToReg<[R12B, R13B, R14B, R15B]>>,
- // i16 are returned in registers R15, R14, R13, R12
- CCIfType<[i16], CCAssignToReg<[R15, R14, R13, R12]>>
+ // i16 are returned in registers R12, R13, R14, R15
+ CCIfType<[i16], CCAssignToReg<[R12, R13, R14, R15]>>
]>;
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
index f1cb0b6..b4ff8f6 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -236,7 +236,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
// adjcallstackdown instruction into 'add SP, <amt>'
// TODO: consider using push / pop instead of sub + store / add
MachineInstr &Old = *I;
- uint64_t Amount = Old.getOperand(0).getImm();
+ uint64_t Amount = TII.getFrameSize(Old);
if (Amount != 0) {
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
@@ -252,8 +252,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else {
assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode());
// factor out the amount the callee already popped.
- uint64_t CalleeAmt = Old.getOperand(1).getImm();
- Amount -= CalleeAmt;
+ Amount -= TII.getFramePoppedByCallee(Old);
if (Amount)
New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri),
MSP430::SP)
@@ -272,7 +271,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
} else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) {
// If we are performing frame pointer elimination and if the callee pops
// something off the stack pointer, add it back.
- if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ if (uint64_t CalleeAmt = TII.getFramePoppedByCallee(*I)) {
MachineInstr &Old = *I;
MachineInstr *New =
BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP)
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 6e481b6..0b02f79 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -61,7 +61,8 @@ namespace {
return GV != nullptr || CP != nullptr || ES != nullptr || JT != -1;
}
- void dump() {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() {
errs() << "MSP430ISelAddressMode " << this << '\n';
if (BaseType == RegBase && Base.Reg.getNode() != nullptr) {
errs() << "Base.Reg ";
@@ -83,6 +84,7 @@ namespace {
} else if (JT != -1)
errs() << " JT" << JT << " Align" << Align << '\n';
}
+#endif
};
}
@@ -401,12 +403,12 @@ void MSP430DAGToDAGISel::Select(SDNode *Node) {
int FI = cast<FrameIndexSDNode>(Node)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
if (Node->hasOneUse()) {
- CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI,
+ CurDAG->SelectNodeTo(Node, MSP430::ADDframe, MVT::i16, TFI,
CurDAG->getTargetConstant(0, dl, MVT::i16));
return;
}
ReplaceNode(Node, CurDAG->getMachineNode(
- MSP430::ADD16ri, dl, MVT::i16, TFI,
+ MSP430::ADDframe, dl, MVT::i16, TFI,
CurDAG->getTargetConstant(0, dl, MVT::i16)));
return;
}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 73346b9..dae14fd 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -38,24 +38,6 @@ using namespace llvm;
#define DEBUG_TYPE "msp430-lower"
-typedef enum {
- NoHWMult,
- HWMultIntr,
- HWMultNoIntr
-} HWMultUseMode;
-
-static cl::opt<HWMultUseMode>
-HWMultMode("msp430-hwmult-mode", cl::Hidden,
- cl::desc("Hardware multiplier use mode"),
- cl::init(HWMultNoIntr),
- cl::values(
- clEnumValN(NoHWMult, "no",
- "Do not use hardware multiplier"),
- clEnumValN(HWMultIntr, "interrupts",
- "Assume hardware multiplier can be used inside interrupts"),
- clEnumValN(HWMultNoIntr, "use",
- "Assume hardware multiplier cannot be used inside interrupts")));
-
MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
const MSP430Subtarget &STI)
: TargetLowering(TM) {
@@ -131,29 +113,29 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
// FIXME: Implement efficiently multiplication by a constant
- setOperationAction(ISD::MUL, MVT::i8, Expand);
- setOperationAction(ISD::MULHS, MVT::i8, Expand);
- setOperationAction(ISD::MULHU, MVT::i8, Expand);
- setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
- setOperationAction(ISD::MUL, MVT::i16, Expand);
+ setOperationAction(ISD::MUL, MVT::i8, Promote);
+ setOperationAction(ISD::MULHS, MVT::i8, Promote);
+ setOperationAction(ISD::MULHU, MVT::i8, Promote);
+ setOperationAction(ISD::SMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i8, Promote);
+ setOperationAction(ISD::MUL, MVT::i16, LibCall);
setOperationAction(ISD::MULHS, MVT::i16, Expand);
setOperationAction(ISD::MULHU, MVT::i16, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
- setOperationAction(ISD::UDIV, MVT::i8, Expand);
- setOperationAction(ISD::UDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::UREM, MVT::i8, Expand);
- setOperationAction(ISD::SDIV, MVT::i8, Expand);
- setOperationAction(ISD::SDIVREM, MVT::i8, Expand);
- setOperationAction(ISD::SREM, MVT::i8, Expand);
- setOperationAction(ISD::UDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UDIV, MVT::i8, Promote);
+ setOperationAction(ISD::UDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::UREM, MVT::i8, Promote);
+ setOperationAction(ISD::SDIV, MVT::i8, Promote);
+ setOperationAction(ISD::SDIVREM, MVT::i8, Promote);
+ setOperationAction(ISD::SREM, MVT::i8, Promote);
+ setOperationAction(ISD::UDIV, MVT::i16, LibCall);
setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::UREM, MVT::i16, Expand);
- setOperationAction(ISD::SDIV, MVT::i16, Expand);
+ setOperationAction(ISD::UREM, MVT::i16, LibCall);
+ setOperationAction(ISD::SDIV, MVT::i16, LibCall);
setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
- setOperationAction(ISD::SREM, MVT::i16, Expand);
+ setOperationAction(ISD::SREM, MVT::i16, LibCall);
// varargs support
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -162,15 +144,183 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::JumpTable, MVT::i16, Custom);
- // Libcalls names.
- if (HWMultMode == HWMultIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
- } else if (HWMultMode == HWMultNoIntr) {
- setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint");
- setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+ // EABI Libcalls - EABI Section 6.2
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ const ISD::CondCode Cond;
+ } LibraryCalls[] = {
+ // Floating point conversions - EABI Table 6
+ { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID },
+ { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID },
+ { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID },
+ // The following is NOT implemented in libgcc
+ //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID },
+ { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID },
+ { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID },
+ // TODO The following IS implemented in libgcc
+ //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID },
+ { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID },
+ // The following IS implemented in libgcc but is not in the EABI
+ { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID },
+
+ // Floating point comparisons - EABI Table 7
+ { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ },
+ { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE },
+ { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE },
+ { RTLIB::OLT_F64, "__mspabi_cmpd", ISD::SETLT },
+ { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE },
+ { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT },
+ { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ },
+ { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE },
+ { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE },
+ { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT },
+ { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE },
+ { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT },
+
+ // Floating point arithmetic - EABI Table 8
+ { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID },
+ { RTLIB::ADD_F32, "__mspabi_addf", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID },
+ { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID },
+ { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID },
+ { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID },
+ // The following are NOT implemented in libgcc
+ // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID },
+ // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID },
+
+ // TODO: SLL/SRA/SRL are in libgcc, RLL isn't
+
+ // Universal Integer Operations - EABI Table 9
+ { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID },
+ { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID },
+ { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID },
+ { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID },
+ { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID },
+
+ };
+
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ if (LC.Cond != ISD::SETCC_INVALID)
+ setCmpLibcallCC(LC.Op, LC.Cond);
}
+ if (STI.hasHWMult16()) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw" },
+ // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (STI.hasHWMult32()) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_hw32" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_hw32" },
+ // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else if (STI.hasHWMultF5()) {
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi_f5hw" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl_f5hw" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll_f5hw" },
+ // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc
+ // TODO The __mspabi_mpyul*_f5hw functions ARE implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ } else { // NoHWMult
+ const struct {
+ const RTLIB::Libcall Op;
+ const char * const Name;
+ } LibraryCalls[] = {
+ // Integer Multiply - EABI Table 9
+ { RTLIB::MUL_I16, "__mspabi_mpyi" },
+ { RTLIB::MUL_I32, "__mspabi_mpyl" },
+ { RTLIB::MUL_I64, "__mspabi_mpyll" },
+ // The __mspabi_mpysl* functions are NOT implemented in libgcc
+ // The __mspabi_mpyul* functions are NOT implemented in libgcc
+ };
+ for (const auto &LC : LibraryCalls) {
+ setLibcallName(LC.Op, LC.Name);
+ }
+ setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN);
+ }
+
+ // Several of the runtime library functions use a special calling conv
+ setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
+ setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
+ // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
+
setMinFunctionAlignment(1);
setPrefFunctionAlignment(2);
}
@@ -245,13 +395,20 @@ MSP430TargetLowering::getRegForInlineAsmConstraint(
template<typename ArgT>
static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
SmallVectorImpl<unsigned> &Out) {
- unsigned CurrentArgIndex = ~0U;
- for (unsigned i = 0, e = Args.size(); i != e; i++) {
- if (CurrentArgIndex == Args[i].OrigArgIndex) {
- Out.back()++;
+ unsigned CurrentArgIndex;
+
+ if (Args.empty())
+ return;
+
+ CurrentArgIndex = Args[0].OrigArgIndex;
+ Out.push_back(0);
+
+ for (auto &Arg : Args) {
+ if (CurrentArgIndex == Arg.OrigArgIndex) {
+ Out.back() += 1;
} else {
Out.push_back(1);
- CurrentArgIndex++;
+ CurrentArgIndex = Arg.OrigArgIndex;
}
}
}
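The rewrite of ParseFunctionArgs above also removes a latent assumption: the old loop advanced CurrentArgIndex with ++ rather than reloading it from the argument, so it miscounted whenever OrigArgIndex values were not consecutive. What the fixed function computes is simply the number of lowered pieces per original argument; a standalone sketch of the same counting logic over a plain vector:

  #include <vector>

  // {0,0,1,2,2,2} -> {2,1,3}: piece counts per original argument index.
  static std::vector<unsigned> countParts(const std::vector<unsigned> &OrigIdx) {
    std::vector<unsigned> Out;
    if (OrigIdx.empty())
      return Out;
    unsigned Cur = OrigIdx[0];
    Out.push_back(0);
    for (unsigned I : OrigIdx) {
      if (I == Cur) {
        Out.back() += 1;   // another piece of the current argument
      } else {
        Out.push_back(1);  // first piece of the next argument
        Cur = I;
      }
    }
    return Out;
  }
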
@@ -274,10 +431,27 @@ template<typename ArgT>
static void AnalyzeArguments(CCState &State,
SmallVectorImpl<CCValAssign> &ArgLocs,
const SmallVectorImpl<ArgT> &Args) {
- static const MCPhysReg RegList[] = {
- MSP430::R15, MSP430::R14, MSP430::R13, MSP430::R12
+ static const MCPhysReg CRegList[] = {
+ MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+ };
+ static const unsigned CNbRegs = array_lengthof(CRegList);
+ static const MCPhysReg BuiltinRegList[] = {
+ MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
};
- static const unsigned NbRegs = array_lengthof(RegList);
+ static const unsigned BuiltinNbRegs = array_lengthof(BuiltinRegList);
+
+ ArrayRef<MCPhysReg> RegList;
+ unsigned NbRegs;
+
+ bool Builtin = (State.getCallingConv() == CallingConv::MSP430_BUILTIN);
+ if (Builtin) {
+ RegList = BuiltinRegList;
+ NbRegs = BuiltinNbRegs;
+ } else {
+ RegList = CRegList;
+ NbRegs = CNbRegs;
+ }
if (State.isVarArg()) {
AnalyzeVarArgs(State, Args);
@@ -287,8 +461,13 @@ static void AnalyzeArguments(CCState &State,
SmallVector<unsigned, 4> ArgsParts;
ParseFunctionArgs(Args, ArgsParts);
+ if (Builtin) {
+ assert(ArgsParts.size() == 2 &&
+ "Builtin calling convention requires two arguments");
+ }
+
unsigned RegsLeft = NbRegs;
- bool UseStack = false;
+ bool UsedStack = false;
unsigned ValNo = 0;
for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
@@ -316,20 +495,27 @@ static void AnalyzeArguments(CCState &State,
unsigned Parts = ArgsParts[i];
- if (!UseStack && Parts <= RegsLeft) {
- unsigned FirstVal = ValNo;
+ if (Builtin) {
+ assert(Parts == 4 &&
+ "Builtin calling convention requires 64-bit arguments");
+ }
+
+ if (!UsedStack && Parts == 2 && RegsLeft == 1) {
+ // Special case for 32-bit register split, see EABI section 3.3.3
+ unsigned Reg = State.AllocateReg(RegList);
+ State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+ RegsLeft -= 1;
+
+ UsedStack = true;
+ CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+ } else if (Parts <= RegsLeft) {
for (unsigned j = 0; j < Parts; j++) {
unsigned Reg = State.AllocateReg(RegList);
State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
RegsLeft--;
}
-
- // Reverse the order of the pieces to agree with the "big endian" format
- // required in the calling convention ABI.
- SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
- std::reverse(B, B + Parts);
} else {
- UseStack = true;
+ UsedStack = true;
for (unsigned j = 0; j < Parts; j++)
CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
}
@@ -351,10 +537,6 @@ static void AnalyzeReturnValues(CCState &State,
SmallVectorImpl<CCValAssign> &RVLocs,
const SmallVectorImpl<ArgT> &Args) {
AnalyzeRetResult(State, Args);
-
- // Reverse splitted return values to get the "big endian" format required
- // to agree with the calling convention ABI.
- std::reverse(RVLocs.begin(), RVLocs.end());
}
SDValue MSP430TargetLowering::LowerFormalArguments(
@@ -395,6 +577,7 @@ MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
switch (CallConv) {
default:
llvm_unreachable("Unsupported calling convention");
+ case CallingConv::MSP430_BUILTIN:
case CallingConv::Fast:
case CallingConv::C:
return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
@@ -496,9 +679,33 @@ SDValue MSP430TargetLowering::LowerCCCArguments(
}
}
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ if (Ins[i].Flags.isSRet()) {
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+ if (!Reg) {
+ Reg = MF.getRegInfo().createVirtualRegister(
+ getRegClassFor(MVT::i16));
+ FuncInfo->setSRetReturnReg(Reg);
+ }
+ SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+ }
+ }
+
return Chain;
}
+bool
+MSP430TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_MSP430);
+}
+
SDValue
MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -506,6 +713,8 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
// CCValAssign - represent the assignment of the return value to a location
SmallVector<CCValAssign, 16> RVLocs;
@@ -537,6 +746,22 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
+ if (MF.getFunction()->hasStructRetAttr()) {
+ MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+ unsigned Reg = FuncInfo->getSRetReturnReg();
+
+ if (!Reg)
+ llvm_unreachable("sret virtual register not created in entry block");
+
+ SDValue Val =
+ DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy(DAG.getDataLayout()));
+ unsigned R12 = MSP430::R12;
+
+ Chain = DAG.getCopyToReg(Chain, dl, R12, Val, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(R12, getPointerTy(DAG.getDataLayout())));
+ }
+
unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
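Taken together, the two sret hunks above implement struct return for MSP430: LowerCCCArguments stashes the incoming hidden pointer in SRetReturnReg, and LowerReturn copies it back out in R12 so the caller receives the buffer address. In source terms this covers functions that return an aggregate through a hidden pointer, e.g. (illustrative C++, not part of the patch):

  struct S { int a, b, c; };  // assumed to be returned via a hidden sret pointer

  S makeS() {                 // callee fills the caller-provided buffer and
    return S{1, 2, 3};        // hands the pointer back in R12
  }
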
@@ -551,7 +776,6 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// LowerCCCCallTo - functions arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-// TODO: sret.
SDValue MSP430TargetLowering::LowerCCCCallTo(
SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -568,8 +792,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(
unsigned NumBytes = CCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1219,7 +1442,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
BB->end());
RemBB->transferSuccessorsAndUpdatePHIs(BB);
- // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+ // Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
BB->addSuccessor(LoopBB);
BB->addSuccessor(RemBB);
LoopBB->addSuccessor(RemBB);
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
index 8864807..3a72962 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.h
@@ -158,6 +158,12 @@ namespace llvm {
LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF,
+ bool IsVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
+
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
index e3259bd..d81f17e 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -85,6 +85,12 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
+
+ int64_t getFramePoppedByCallee(const MachineInstr &I) const {
+ assert(isFrameInstr(I) && "Not a frame instruction");
+ assert(I.getOperand(1).getImm() >= 0 && "Size must not be negative");
+ return I.getOperand(1).getImm();
+ }
};
}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
index c0c29b9..cec4304 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.td
@@ -23,7 +23,8 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
// Type Profiles.
//===----------------------------------------------------------------------===//
def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
-def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>,
+ SDTCisVT<1, i16>]>;
def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisPtrTy<0>]>;
@@ -113,15 +114,21 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber SR.
let Defs = [SP, SR], Uses = [SP] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKDOWN",
- [(MSP430callseq_start timm:$amt)]>;
+ [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
"#ADJCALLSTACKUP",
[(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
}
+let Defs = [SR], Uses = [SP] in {
+def ADDframe : Pseudo<(outs GR16:$dst), (ins i16imm:$base, i16imm:$offset),
+ "# ADDframe PSEUDO", []>;
+}
+
let usesCustomInserter = 1 in {
+ let Uses = [SR] in {
def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc),
"# Select8 PSEUDO",
[(set GR8:$dst,
@@ -130,6 +137,7 @@ let usesCustomInserter = 1 in {
"# Select16 PSEUDO",
[(set GR16:$dst,
(MSP430selectcc GR16:$src, GR16:$src2, imm:$cc))]>;
+ }
let Defs = [SR] in {
def Shl8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
"# Shl8 PSEUDO",
@@ -207,7 +215,7 @@ let isCall = 1 in
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead. Uses for argument
// registers are added manually.
- let Defs = [R12, R13, R14, R15, SR],
+ let Defs = [R11, R12, R13, R14, R15, SR],
Uses = [SP] in {
def CALLi : II16i<0x0,
(outs), (ins i16imm:$dst),
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
index 47b0e27..e771638 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -119,7 +119,7 @@ void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index 2d93731..fcaa8a1 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -33,15 +33,23 @@ class MSP430MachineFunctionInfo : public MachineFunctionInfo {
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex;
+ /// SRetReturnReg - Some subtargets require that sret lowering includes
+ /// returning the value of the returned struct in a register. This field
+ /// holds the virtual register into which the sret argument is passed.
+ unsigned SRetReturnReg;
+
public:
MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
explicit MSP430MachineFunctionInfo(MachineFunction &MF)
- : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+ : CalleeSavedFrameSize(0), ReturnAddrIndex(0), SRetReturnReg(0) {}
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+ unsigned getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
int getRAIndex() const { return ReturnAddrIndex; }
void setRAIndex(int Index) { ReturnAddrIndex = Index; }
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 81cd9d1..7a3b7a8 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -41,12 +41,12 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const Function* F = MF->getFunction();
static const MCPhysReg CalleeSavedRegs[] = {
MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsFP[] = {
MSP430::R5, MSP430::R6, MSP430::R7,
- MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+ MSP430::R8, MSP430::R9, MSP430::R10,
0
};
static const MCPhysReg CalleeSavedRegsIntr[] = {
@@ -127,7 +127,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Fold imm into offset
Offset += MI.getOperand(FIOperandNum + 1).getImm();
- if (MI.getOpcode() == MSP430::ADD16ri) {
+ if (MI.getOpcode() == MSP430::ADDframe) {
// This is actually "load effective address" of the stack slot
// instruction. We have only two-address instructions, thus we need to
// expand it into mov + add
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
index 6216348..776a9dc 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -19,6 +19,20 @@ using namespace llvm;
#define DEBUG_TYPE "msp430-subtarget"
+static cl::opt<MSP430Subtarget::HWMultEnum>
+HWMultModeOption("mhwmult", cl::Hidden,
+ cl::desc("Hardware multiplier use mode for MSP430"),
+ cl::init(MSP430Subtarget::NoHWMult),
+ cl::values(
+ clEnumValN(MSP430Subtarget::NoHWMult, "none",
+ "Do not use hardware multiplier"),
+ clEnumValN(MSP430Subtarget::HWMult16, "16bit",
+ "Use 16-bit hardware multiplier"),
+ clEnumValN(MSP430Subtarget::HWMult32, "32bit",
+ "Use 32-bit hardware multiplier"),
+ clEnumValN(MSP430Subtarget::HWMultF5, "f5series",
+ "Use F5 series hardware multiplier")));
+
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#include "MSP430GenSubtargetInfo.inc"
@@ -27,7 +41,18 @@ void MSP430Subtarget::anchor() { }
MSP430Subtarget &
MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
- ParseSubtargetFeatures("generic", FS);
+ ExtendedInsts = false;
+ HWMultMode = NoHWMult;
+
+ std::string CPUName = CPU;
+ if (CPUName.empty())
+ CPUName = "msp430";
+
+ ParseSubtargetFeatures(CPUName, FS);
+
+ if (HWMultModeOption != NoHWMult)
+ HWMultMode = HWMultModeOption;
+
return *this;
}
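Editor's note: with this hook in place the multiplier mode can be selected on the command line, e.g. llc -march=msp430 -mhwmult=16bit (usage sketch). Because the option is initialized to NoHWMult and only copied into HWMultMode when it differs from that default, a plain invocation leaves whatever mode the feature string selected.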
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
index 1a00d85..8828dfd 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
+++ b/contrib/llvm/lib/Target/MSP430/MSP430Subtarget.h
@@ -30,8 +30,15 @@ namespace llvm {
class StringRef;
class MSP430Subtarget : public MSP430GenSubtargetInfo {
+public:
+ enum HWMultEnum {
+ NoHWMult, HWMult16, HWMult32, HWMultF5
+ };
+
+private:
virtual void anchor();
bool ExtendedInsts;
+ HWMultEnum HWMultMode;
MSP430FrameLowering FrameLowering;
MSP430InstrInfo InstrInfo;
MSP430TargetLowering TLInfo;
@@ -50,6 +57,10 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ bool hasHWMult16() const { return HWMultMode == HWMult16; }
+ bool hasHWMult32() const { return HWMultMode == HWMult32; }
+ bool hasHWMultF5() const { return HWMultMode == HWMultF5; }
+
const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
diff --git a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index bebe5fa..982c6fe 100644
--- a/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -32,16 +32,20 @@ static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
return *RM;
}
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+ const TargetOptions &Options) {
+ return "e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16";
+}
+
MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, "e-m:e-p:16:16-i32:16:32-a:16-n8:16", TT, CPU, FS,
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS,
Options, getEffectiveRelocModel(RM), CM, OL),
TLOF(make_unique<TargetLoweringObjectFileELF>()),
- // FIXME: Check DataLayout string.
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
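Editor's note: for reference, the fields of the new data-layout string, per LLVM's DataLayout syntax: e = little-endian; m:e = ELF-style name mangling; p:16:16 = 16-bit pointers with 16-bit alignment; i32:16, i64:16, f32:16, f64:16 = 16-bit ABI alignment for those types; a:8 = 8-bit aggregate alignment; n8:16 = native integer widths of 8 and 16 bits; S16 = 16-bit stack alignment.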
@@ -52,7 +56,7 @@ namespace {
/// MSP430 Code Generator Pass Configuration Options.
class MSP430PassConfig : public TargetPassConfig {
public:
- MSP430PassConfig(MSP430TargetMachine *TM, PassManagerBase &PM)
+ MSP430PassConfig(MSP430TargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
MSP430TargetMachine &getMSP430TargetMachine() const {
@@ -65,7 +69,7 @@ public:
} // namespace
TargetPassConfig *MSP430TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new MSP430PassConfig(this, PM);
+ return new MSP430PassConfig(*this, PM);
}
bool MSP430PassConfig::addInstSelector() {
diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index d054578..e12188e 100644
--- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -7,47 +7,68 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsABIFlagsSection.h"
#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
-#include "MipsRegisterInfo.h"
-#include "MipsTargetObjectFile.h"
#include "MipsTargetStreamer.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
#include <memory>
+#include <string>
+#include <utility>
using namespace llvm;
#define DEBUG_TYPE "mips-asm-parser"
namespace llvm {
+
class MCInstrInfo;
-}
+
+} // end namespace llvm
namespace {
+
class MipsAssemblerOptions {
public:
- MipsAssemblerOptions(const FeatureBitset &Features_) :
- ATReg(1), Reorder(true), Macro(true), Features(Features_) {}
+ MipsAssemblerOptions(const FeatureBitset &Features_) : Features(Features_) {}
MipsAssemblerOptions(const MipsAssemblerOptions *Opts) {
ATReg = Opts->getATRegIndex();
@@ -84,12 +105,13 @@ public:
static const FeatureBitset AllArchRelatedMask;
private:
- unsigned ATReg;
- bool Reorder;
- bool Macro;
+ unsigned ATReg = 1;
+ bool Reorder = true;
+ bool Macro = true;
FeatureBitset Features;
};
-}
+
+} // end anonymous namespace
const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = {
Mips::FeatureMips1, Mips::FeatureMips2, Mips::FeatureMips3,
@@ -103,6 +125,7 @@ const FeatureBitset MipsAssemblerOptions::AllArchRelatedMask = {
};
namespace {
+
class MipsAsmParser : public MCTargetAsmParser {
MipsTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
@@ -147,6 +170,8 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseBracketSuffix(StringRef Name, OperandVector &Operands);
+ bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID);
+
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -192,9 +217,15 @@ class MipsAsmParser : public MCTargetAsmParser {
unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
+ bool emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, MCSymbol *Sym);
+
bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
+ bool expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU,
+ SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
const MCOperand &Offset, bool Is32BitAddress,
SMLoc IDLoc, MCStreamer &Out,
@@ -252,6 +283,18 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
+ bool expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, bool IsLoad);
@@ -279,6 +322,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseDirectiveSet();
bool parseDirectiveOption();
bool parseInsnDirective();
+ bool parseRSectionDirective(StringRef Section);
bool parseSSectionDirective(StringRef Section, unsigned Type);
bool parseSetAtDirective();
@@ -299,6 +343,8 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetPushDirective();
bool parseSetSoftFloatDirective();
bool parseSetHardFloatDirective();
+ bool parseSetMtDirective();
+ bool parseSetNoMtDirective();
bool parseSetAssignment();
@@ -339,6 +385,8 @@ class MipsAsmParser : public MCTargetAsmParser {
/// This should be used in pseudo-instruction expansions which need AT.
unsigned getATReg(SMLoc Loc);
+ bool canUseATReg();
+
bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
@@ -466,9 +514,11 @@ public:
bool isGP64bit() const {
return getSTI().getFeatureBits()[Mips::FeatureGP64Bit];
}
+
bool isFP64bit() const {
return getSTI().getFeatureBits()[Mips::FeatureFP64Bit];
}
+
const MipsABIInfo &getABI() const { return ABI; }
bool isABI_N32() const { return ABI.IsN32(); }
bool isABI_N64() const { return ABI.IsN64(); }
@@ -484,48 +534,63 @@ public:
bool inMicroMipsMode() const {
return getSTI().getFeatureBits()[Mips::FeatureMicroMips];
}
+
bool hasMips1() const {
return getSTI().getFeatureBits()[Mips::FeatureMips1];
}
+
bool hasMips2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips2];
}
+
bool hasMips3() const {
return getSTI().getFeatureBits()[Mips::FeatureMips3];
}
+
bool hasMips4() const {
return getSTI().getFeatureBits()[Mips::FeatureMips4];
}
+
bool hasMips5() const {
return getSTI().getFeatureBits()[Mips::FeatureMips5];
}
+
bool hasMips32() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32];
}
+
bool hasMips64() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64];
}
+
bool hasMips32r2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32r2];
}
+
bool hasMips64r2() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64r2];
}
+
bool hasMips32r3() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]);
}
+
bool hasMips64r3() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]);
}
+
bool hasMips32r5() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]);
}
+
bool hasMips64r5() const {
return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]);
}
+
bool hasMips32r6() const {
return getSTI().getFeatureBits()[Mips::FeatureMips32r6];
}
+
bool hasMips64r6() const {
return getSTI().getFeatureBits()[Mips::FeatureMips64r6];
}
@@ -533,15 +598,19 @@ public:
bool hasDSP() const {
return getSTI().getFeatureBits()[Mips::FeatureDSP];
}
+
bool hasDSPR2() const {
return getSTI().getFeatureBits()[Mips::FeatureDSPR2];
}
+
bool hasDSPR3() const {
return getSTI().getFeatureBits()[Mips::FeatureDSPR3];
}
+
bool hasMSA() const {
return getSTI().getFeatureBits()[Mips::FeatureMSA];
}
+
bool hasCnMips() const {
return (getSTI().getFeatureBits()[Mips::FeatureCnMips]);
}
@@ -561,6 +630,9 @@ public:
bool useSoftFloat() const {
return getSTI().getFeatureBits()[Mips::FeatureSoftFloat];
}
+ bool hasMT() const {
+ return getSTI().getFeatureBits()[Mips::FeatureMT];
+ }
/// Warn if RegIndex is the same as the current AT.
void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc);
@@ -627,9 +699,6 @@ public:
}
}
};
-}
-
-namespace {
/// MipsOperand - Instances of this class represent a parsed Mips machine
/// instruction.
@@ -671,6 +740,22 @@ public:
MipsOperand(KindTy K, MipsAsmParser &Parser)
: MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+ ~MipsOperand() override {
+ switch (Kind) {
+ case k_Immediate:
+ break;
+ case k_Memory:
+ delete Mem.Base;
+ break;
+ case k_RegList:
+ delete RegList.List;
+ case k_RegisterIndex:
+ case k_Token:
+ case k_RegPair:
+ break;
+ }
+ }
+
private:
/// For diagnostics, and checking the assembler temporary
MipsAsmParser &AsmParser;
@@ -716,7 +801,7 @@ private:
const MCRegisterInfo *RegInfo,
SMLoc S, SMLoc E,
MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_RegisterIndex, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_RegisterIndex, Parser);
Op->RegIdx.Index = Index;
Op->RegIdx.RegInfo = RegInfo;
Op->RegIdx.Kind = RegKind;
@@ -896,6 +981,16 @@ public:
/// Render the operand to an MCInst as a GPR32
/// Asserts if the wrong number of operands are requested, or the operand
/// is not a k_RegisterIndex compatible with RegKind_GPR
+ void addGPR32ZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
+ }
+
+ void addGPR32NonZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
+ }
+
void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
@@ -929,6 +1024,16 @@ public:
Inst.addOperand(MCOperand::createReg(getAFGR64Reg()));
}
+ void addStrictlyAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getAFGR64Reg()));
+ }
+
+ void addStrictlyFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getFGR64Reg()));
+ }
+
void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFGR64Reg()));
@@ -945,6 +1050,15 @@ public:
"registers");
}
+ void addStrictlyFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getFGR32Reg()));
+ // FIXME: We ought to do this for -integrated-as without -via-file-asm too.
+ if (!AsmParser.useOddSPReg() && RegIdx.Index & 1)
+ AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
+ "registers");
+ }
+
void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getFGRH32Reg()));
@@ -1104,45 +1218,58 @@ public:
// $0/$zero here so that MCK_ZERO works correctly.
return isGPRAsmReg() && RegIdx.Index == 0;
}
+
bool isRegIdx() const { return Kind == k_RegisterIndex; }
bool isImm() const override { return Kind == k_Immediate; }
+
bool isConstantImm() const {
int64_t Res;
return isImm() && getImm()->evaluateAsAbsolute(Res);
}
+
bool isConstantImmz() const {
return isConstantImm() && getConstantImm() == 0;
}
+
template <unsigned Bits, int Offset = 0> bool isConstantUImm() const {
return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset);
}
+
template <unsigned Bits> bool isSImm() const {
return isConstantImm() ? isInt<Bits>(getConstantImm()) : isImm();
}
+
template <unsigned Bits> bool isUImm() const {
return isConstantImm() ? isUInt<Bits>(getConstantImm()) : isImm();
}
+
template <unsigned Bits> bool isAnyImm() const {
return isConstantImm() ? (isInt<Bits>(getConstantImm()) ||
isUInt<Bits>(getConstantImm()))
: isImm();
}
+
template <unsigned Bits, int Offset = 0> bool isConstantSImm() const {
return isConstantImm() && isInt<Bits>(getConstantImm() - Offset);
}
+
template <unsigned Bottom, unsigned Top> bool isConstantUImmRange() const {
return isConstantImm() && getConstantImm() >= Bottom &&
getConstantImm() <= Top;
}
+
bool isToken() const override {
// Note: It's not possible to pretend that other operand kinds are tokens.
// The matcher emitter checks tokens first.
return Kind == k_Token;
}
+
bool isMem() const override { return Kind == k_Memory; }
+
bool isConstantMemOff() const {
return isMem() && isa<MCConstantExpr>(getMemOff());
}
+
// Allow relocation operators.
// FIXME: This predicate and others need to look through binary expressions
// and determine whether a Value is a constant or not.
@@ -1160,28 +1287,34 @@ public:
bool IsReloc = getMemOff()->evaluateAsRelocatable(Res, nullptr, nullptr);
return IsReloc && isShiftedInt<Bits, ShiftAmount>(Res.getConstant());
}
+
bool isMemWithGRPMM16Base() const {
return isMem() && getMemBase()->isMM16AsmReg();
}
+
template <unsigned Bits> bool isMemWithUimmOffsetSP() const {
return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
&& getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP);
}
+
template <unsigned Bits> bool isMemWithUimmWordAlignedOffsetSP() const {
return isMem() && isConstantMemOff() && isUInt<Bits>(getConstantMemOff())
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::SP);
}
+
template <unsigned Bits> bool isMemWithSimmWordAlignedOffsetGP() const {
return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff())
&& (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx()
&& (getMemBase()->getGPR32Reg() == Mips::GP);
}
+
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledUImm() const {
return isConstantImm() &&
isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm());
}
+
template <unsigned Bits, unsigned ShiftLeftAmount>
bool isScaledSImm() const {
if (isConstantImm() && isShiftedInt<Bits, ShiftLeftAmount>(getConstantImm()))
@@ -1193,6 +1326,7 @@ public:
bool Success = getImm()->evaluateAsRelocatable(Res, nullptr, nullptr);
return Success && isShiftedInt<Bits, ShiftLeftAmount>(Res.getConstant());
}
+
bool isRegList16() const {
if (!isRegList())
return false;
@@ -1217,14 +1351,18 @@ public:
return true;
}
+
bool isInvNum() const { return Kind == k_Immediate; }
+
bool isLSAImm() const {
if (!isConstantImm())
return false;
int64_t Val = getConstantImm();
return 1 <= Val && Val <= 4;
}
+
bool isRegList() const { return Kind == k_RegList; }
+
bool isMovePRegPair() const {
if (Kind != k_RegList || RegList.List->size() != 2)
return false;
@@ -1257,6 +1395,7 @@ public:
assert(Kind == k_Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
}
+
bool isRegPair() const {
return Kind == k_RegPair && RegIdx.Index <= 30;
}
@@ -1310,7 +1449,7 @@ public:
static std::unique_ptr<MipsOperand> CreateToken(StringRef Str, SMLoc S,
MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_Token, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_Token, Parser);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -1385,7 +1524,7 @@ public:
static std::unique_ptr<MipsOperand>
CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_Immediate, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_Immediate, Parser);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -1395,7 +1534,7 @@ public:
static std::unique_ptr<MipsOperand>
CreateMem(std::unique_ptr<MipsOperand> Base, const MCExpr *Off, SMLoc S,
SMLoc E, MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_Memory, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_Memory, Parser);
Op->Mem.Base = Base.release();
Op->Mem.Off = Off;
Op->StartLoc = S;
@@ -1406,9 +1545,9 @@ public:
static std::unique_ptr<MipsOperand>
CreateRegList(SmallVectorImpl<unsigned> &Regs, SMLoc StartLoc, SMLoc EndLoc,
MipsAsmParser &Parser) {
- assert (Regs.size() > 0 && "Empty list not allowed");
+ assert(Regs.size() > 0 && "Empty list not allowed");
- auto Op = make_unique<MipsOperand>(k_RegList, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_RegList, Parser);
Op->RegList.List = new SmallVector<unsigned, 10>(Regs.begin(), Regs.end());
Op->StartLoc = StartLoc;
Op->EndLoc = EndLoc;
@@ -1418,7 +1557,7 @@ public:
static std::unique_ptr<MipsOperand> CreateRegPair(const MipsOperand &MOP,
SMLoc S, SMLoc E,
MipsAsmParser &Parser) {
- auto Op = make_unique<MipsOperand>(k_RegPair, Parser);
+ auto Op = llvm::make_unique<MipsOperand>(k_RegPair, Parser);
Op->RegIdx.Index = MOP.RegIdx.Index;
Op->RegIdx.RegInfo = MOP.RegIdx.RegInfo;
Op->RegIdx.Kind = MOP.RegIdx.Kind;
@@ -1427,14 +1566,25 @@ public:
return Op;
}
+ bool isGPRZeroAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index == 0;
+ }
+
+ bool isGPRNonZeroAsmReg() const {
+ return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index > 0 &&
+ RegIdx.Index <= 31;
+ }
+
bool isGPRAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
}
+
bool isMM16AsmReg() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return ((RegIdx.Index >= 2 && RegIdx.Index <= 7)
|| RegIdx.Index == 16 || RegIdx.Index == 17);
}
+
bool isMM16AsmRegZero() const {
if (!(isRegIdx() && RegIdx.Kind))
@@ -1443,42 +1593,58 @@ public:
(RegIdx.Index >= 2 && RegIdx.Index <= 7) ||
RegIdx.Index == 17);
}
+
bool isMM16AsmRegMoveP() const {
if (!(isRegIdx() && RegIdx.Kind))
return false;
return (RegIdx.Index == 0 || (RegIdx.Index >= 2 && RegIdx.Index <= 3) ||
(RegIdx.Index >= 16 && RegIdx.Index <= 20));
}
+
bool isFGRAsmReg() const {
// AFGR64 is $0-$15 but we handle this in getAFGR64()
return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
}
+
+ bool isStrictlyFGRAsmReg() const {
+ // AFGR64 is $0-$15 but we handle this in getAFGR64()
+ return isRegIdx() && RegIdx.Kind == RegKind_FGR && RegIdx.Index <= 31;
+ }
+
bool isHWRegsAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31;
}
+
bool isCCRAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_CCR && RegIdx.Index <= 31;
}
+
bool isFCCAsmReg() const {
if (!(isRegIdx() && RegIdx.Kind & RegKind_FCC))
return false;
return RegIdx.Index <= 7;
}
+
bool isACCAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_ACC && RegIdx.Index <= 3;
}
+
bool isCOP0AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP0 && RegIdx.Index <= 31;
}
+
bool isCOP2AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31;
}
+
bool isCOP3AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31;
}
+
bool isMSA128AsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31;
}
+
bool isMSACtrlAsmReg() const {
return isRegIdx() && RegIdx.Kind & RegKind_MSACtrl && RegIdx.Index <= 7;
}
@@ -1488,22 +1654,6 @@ public:
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const override { return EndLoc; }
- virtual ~MipsOperand() {
- switch (Kind) {
- case k_Immediate:
- break;
- case k_Memory:
- delete Mem.Base;
- break;
- case k_RegList:
- delete RegList.List;
- case k_RegisterIndex:
- case k_Token:
- case k_RegPair:
- break;
- }
- }
-
void print(raw_ostream &OS) const override {
switch (Kind) {
case k_Immediate:
@@ -1553,11 +1703,15 @@ public:
}
}
}; // class MipsOperand
-} // namespace
+
+} // end anonymous namespace
namespace llvm {
+
extern const MCInstrDesc MipsInsts[];
-}
+
+} // end namespace llvm
+
static const MCInstrDesc &getInstDesc(unsigned Opcode) {
return MipsInsts[Opcode];
}
@@ -1785,6 +1939,62 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
}
}
+  // Warn on division by zero. We do the check here because every instruction
+  // gets processed here, not just the macros that need expansion.
+ //
+  // The MIPS backend models most of the division instructions and macros as
+  // three-operand instructions. The pre-R6 divide instructions, however, have
+ // two operands and explicitly define HI/LO as part of the instruction,
+ // not in the operands.
+ unsigned FirstOp = 1;
+ unsigned SecondOp = 2;
+ switch (Inst.getOpcode()) {
+ default:
+ break;
+ case Mips::SDivIMacro:
+ case Mips::UDivIMacro:
+ case Mips::DSDivIMacro:
+ case Mips::DUDivIMacro:
+ if (Inst.getOperand(2).getImm() == 0) {
+ if (Inst.getOperand(1).getReg() == Mips::ZERO ||
+ Inst.getOperand(1).getReg() == Mips::ZERO_64)
+ Warning(IDLoc, "dividing zero by zero");
+ else
+ Warning(IDLoc, "division by zero");
+ }
+ break;
+ case Mips::DSDIV:
+ case Mips::SDIV:
+ case Mips::UDIV:
+ case Mips::DUDIV:
+ case Mips::UDIV_MM:
+ case Mips::SDIV_MM:
+ FirstOp = 0;
+ SecondOp = 1;
+ LLVM_FALLTHROUGH;
+ case Mips::SDivMacro:
+ case Mips::DSDivMacro:
+ case Mips::UDivMacro:
+ case Mips::DUDivMacro:
+ case Mips::DIV:
+ case Mips::DIVU:
+ case Mips::DDIV:
+ case Mips::DDIVU:
+ case Mips::DIVU_MMR6:
+ case Mips::DDIVU_MM64R6:
+ case Mips::DIV_MMR6:
+ case Mips::DDIV_MM64R6:
+ if (Inst.getOperand(SecondOp).getReg() == Mips::ZERO ||
+ Inst.getOperand(SecondOp).getReg() == Mips::ZERO_64) {
+ if (Inst.getOperand(FirstOp).getReg() == Mips::ZERO ||
+ Inst.getOperand(FirstOp).getReg() == Mips::ZERO_64)
+ Warning(IDLoc, "dividing zero by zero");
+ else
+ Warning(IDLoc, "division by zero");
+ }
+ break;
+ }
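Editor's note: concretely, the check above warns as follows (examples derived from the operand cases it distinguishes; register numbers are arbitrary):

  div $4, $5, 0      # division by zero (immediate macro form)
  div $4, $zero      # division by zero (two-operand pre-R6 form)
  div $zero, $zero   # dividing zero by zero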
+
// For PIC code convert unconditional jump to unconditional branch.
if ((Inst.getOpcode() == Mips::J || Inst.getOpcode() == Mips::J_MM) &&
inPicMode()) {
@@ -2135,6 +2345,8 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return expandJalWithRegs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BneImm:
case Mips::BeqImm:
+ case Mips::BEQLImmMacro:
+ case Mips::BNELImmMacro:
return expandBranchImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::BLT:
case Mips::BLE:
@@ -2170,15 +2382,19 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::BGTULImmMacro:
return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SDivMacro:
+ case Mips::SDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
: MER_Success;
case Mips::DSDivMacro:
+ case Mips::DSDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
: MER_Success;
case Mips::UDivMacro:
+ case Mips::UDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
: MER_Success;
case Mips::DUDivMacro:
+ case Mips::DUDivIMacro:
return expandDiv(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
: MER_Success;
case Mips::PseudoTRUNC_W_S:
@@ -2190,6 +2406,27 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::PseudoTRUNC_W_D:
return expandTrunc(Inst, true, true, IDLoc, Out, STI) ? MER_Fail
: MER_Success;
+
+ case Mips::LoadImmSingleGPR:
+ return expandLoadImmReal(Inst, true, true, false, IDLoc, Out, STI)
+ ? MER_Fail
+ : MER_Success;
+ case Mips::LoadImmSingleFGR:
+ return expandLoadImmReal(Inst, true, false, false, IDLoc, Out, STI)
+ ? MER_Fail
+ : MER_Success;
+ case Mips::LoadImmDoubleGPR:
+ return expandLoadImmReal(Inst, false, true, false, IDLoc, Out, STI)
+ ? MER_Fail
+ : MER_Success;
+ case Mips::LoadImmDoubleFGR:
+ return expandLoadImmReal(Inst, false, false, true, IDLoc, Out, STI)
+ ? MER_Fail
+ : MER_Success;
+ case Mips::LoadImmDoubleFGR_32:
+ return expandLoadImmReal(Inst, false, false, false, IDLoc, Out, STI)
+ ? MER_Fail
+ : MER_Success;
case Mips::Ulh:
return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::Ulhu:
@@ -2200,11 +2437,24 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::Usw:
return expandUxw(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::NORImm:
+ case Mips::NORImm64:
+ return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SLTImm64:
+ if (isInt<16>(Inst.getOperand(2).getImm())) {
+ Inst.setOpcode(Mips::SLTi64);
+ return MER_NotAMacro;
+ }
+ return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SLTUImm64:
+ if (isInt<16>(Inst.getOperand(2).getImm())) {
+ Inst.setOpcode(Mips::SLTiu64);
+ return MER_NotAMacro;
+ }
return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
- case Mips::ADDi:
- case Mips::ADDiu:
- case Mips::SLTi:
- case Mips::SLTiu:
+ case Mips::ADDi: case Mips::ADDi_MM:
+ case Mips::ADDiu: case Mips::ADDiu_MM:
+ case Mips::SLTi: case Mips::SLTi_MM:
+ case Mips::SLTiu: case Mips::SLTiu_MM:
if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
int64_t ImmValue = Inst.getOperand(2).getImm();
@@ -2214,9 +2464,9 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
: MER_Success;
}
return MER_NotAMacro;
- case Mips::ANDi:
- case Mips::ORi:
- case Mips::XORi:
+ case Mips::ANDi: case Mips::ANDi_MM: case Mips::ANDi64:
+ case Mips::ORi: case Mips::ORi_MM: case Mips::ORi64:
+ case Mips::XORi: case Mips::XORi_MM: case Mips::XORi64:
if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
int64_t ImmValue = Inst.getOperand(2).getImm();
@@ -2240,6 +2490,17 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return expandDRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::ABSMacro:
return expandAbs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::MULImmMacro:
+ case Mips::DMULImmMacro:
+ return expandMulImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::MULOMacro:
+ case Mips::DMULOMacro:
+ return expandMulO(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::MULOUMacro:
+ case Mips::DMULOUMacro:
+ return expandMulOU(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::DMULMacro:
+ return expandDMULMacro(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::LDMacro:
case Mips::SDMacro:
return expandLoadStoreDMacro(Inst, IDLoc, Out, STI,
@@ -2392,7 +2653,6 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
uint16_t Bits15To0 = ImmValue & 0xffff;
-
if (!Is32BitImm && !isInt<32>(ImmValue)) {
// Traditional behaviour seems to special case this particular value. It's
// not clear why other masks are handled differently.
@@ -2535,6 +2795,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
bool Is32BitSym, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
+ // FIXME: These expansions do not respect -mxgot.
MipsTargetStreamer &TOut = getTargetStreamer();
bool UseSrcReg = SrcReg != Mips::NoRegister;
warnIfNoMacro(IDLoc);
@@ -2554,8 +2815,12 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
// symbol in the final relocation is external and not modified with a
// constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT16.
if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
- Res.getConstant() == 0 && !Res.getSymA()->getSymbol().isInSection() &&
- !Res.getSymA()->getSymbol().isTemporary()) {
+ Res.getConstant() == 0 &&
+ !(Res.getSymA()->getSymbol().isInSection() ||
+ Res.getSymA()->getSymbol().isTemporary() ||
+ (Res.getSymA()->getSymbol().isELF() &&
+ cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
+ ELF::STB_LOCAL))) {
const MCExpr *CallExpr =
MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
TOut.emitRRX(Mips::LW, DstReg, ABI.GetGlobalPtr(),
@@ -2611,6 +2876,85 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
return false;
}
+ if (inPicMode() && ABI.ArePtrs64bit()) {
+ MCValue Res;
+ if (!SymExpr->evaluateAsRelocatable(Res, nullptr, nullptr)) {
+ Error(IDLoc, "expected relocatable expression");
+ return true;
+ }
+ if (Res.getSymB() != nullptr) {
+ Error(IDLoc, "expected relocatable expression with only one symbol");
+ return true;
+ }
+
+ // The case where the result register is $25 is somewhat special. If the
+ // symbol in the final relocation is external and not modified with a
+ // constant then we must use R_MIPS_CALL16 instead of R_MIPS_GOT_DISP.
+ if ((DstReg == Mips::T9 || DstReg == Mips::T9_64) && !UseSrcReg &&
+ Res.getConstant() == 0 &&
+ !(Res.getSymA()->getSymbol().isInSection() ||
+ Res.getSymA()->getSymbol().isTemporary() ||
+ (Res.getSymA()->getSymbol().isELF() &&
+ cast<MCSymbolELF>(Res.getSymA()->getSymbol()).getBinding() ==
+ ELF::STB_LOCAL))) {
+ const MCExpr *CallExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT_CALL, SymExpr, getContext());
+ TOut.emitRRX(Mips::LD, DstReg, ABI.GetGlobalPtr(),
+ MCOperand::createExpr(CallExpr), IDLoc, STI);
+ return false;
+ }
+
+ // The remaining cases are:
+ // Small offset: ld $tmp, %got_disp(symbol)($gp)
+ // >daddiu $tmp, $tmp, offset
+ // >daddu $rd, $tmp, $rs
+  // The instructions marked with a '>' may be omitted if they are redundant. If
+ // this happens then the last instruction must use $rd as the result
+ // register.
+ const MipsMCExpr *GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP,
+ Res.getSymA(),
+ getContext());
+ const MCExpr *LoExpr = nullptr;
+ if (Res.getConstant() != 0) {
+ // Symbols fully resolve with just the %got_disp(symbol) but we
+ // must still account for any offset to the symbol for
+ // expressions like symbol+8.
+ LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
+
+ // FIXME: Offsets greater than 16 bits are not yet implemented.
+ // FIXME: The correct range is a 32-bit sign-extended number.
+ if (Res.getConstant() < -0x8000 || Res.getConstant() > 0x7fff) {
+ Error(IDLoc, "macro instruction uses large offset, which is not "
+ "currently supported");
+ return true;
+ }
+ }
+
+ unsigned TmpReg = DstReg;
+ if (UseSrcReg &&
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
+ SrcReg)) {
+ // If $rs is the same as $rd, we need to use AT.
+ // If it is not available we exit.
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ TmpReg = ATReg;
+ }
+
+ TOut.emitRRX(Mips::LD, TmpReg, ABI.GetGlobalPtr(),
+ MCOperand::createExpr(GotExpr), IDLoc, STI);
+
+ if (LoExpr)
+ TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+
+ if (UseSrcReg)
+ TOut.emitRRR(Mips::DADDu, DstReg, TmpReg, SrcReg, IDLoc, STI);
+
+ return false;
+ }
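Editor's note: a worked example of this N64 PIC path, for a hypothetical symbol sym and an offset inside the supported 16-bit range: dla $4, sym+8($5) expands to

  ld     $4, %got_disp(sym)($gp)
  daddiu $4, $4, 8
  daddu  $4, $4, $5

($at is substituted as the temporary only when the destination register overlaps the source register.)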
+
const MipsMCExpr *HiExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HI, SymExpr, getContext());
const MipsMCExpr *LoExpr =
@@ -2618,20 +2962,24 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
// This is the 64-bit symbol address expansion.
if (ABI.ArePtrs64bit() && isGP64bit()) {
- // We always need AT for the 64-bit expansion.
- // If it is not available we exit.
- unsigned ATReg = getATReg(IDLoc);
- if (!ATReg)
- return true;
+    // We need AT for the 64-bit expansion when the optional source register
+    // is the destination register, and for the superscalar-scheduled form.
+    //
+    // If AT is not available and the destination is the same as the source
+    // register, we exit with an error.
const MipsMCExpr *HighestExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
const MipsMCExpr *HigherExpr =
MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
- if (UseSrcReg &&
- getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
- SrcReg)) {
+ bool RdRegIsRsReg =
+ getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg);
+
+ if (canUseATReg() && UseSrcReg && RdRegIsRsReg) {
+ unsigned ATReg = getATReg(IDLoc);
+
// If $rs is the same as $rd:
// (d)la $rd, sym($rd) => lui $at, %highest(sym)
// daddiu $at, $at, %higher(sym)
@@ -2653,29 +3001,65 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI);
return false;
- }
+ } else if (canUseATReg() && !RdRegIsRsReg) {
+ unsigned ATReg = getATReg(IDLoc);
- // Otherwise, if the $rs is different from $rd or if $rs isn't specified:
- // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
- // lui $at, %hi(sym)
- // daddiu $rd, $rd, %higher(sym)
- // daddiu $at, $at, %lo(sym)
- // dsll32 $rd, $rd, 0
- // daddu $rd, $rd, $at
- // (daddu $rd, $rd, $rs)
- TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
- STI);
- TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
- TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
- MCOperand::createExpr(HigherExpr), IDLoc, STI);
- TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
- IDLoc, STI);
- TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
- TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
- if (UseSrcReg)
- TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+ // If the $rs is different from $rd or if $rs isn't specified and we
+ // have $at available:
+ // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
+ // lui $at, %hi(sym)
+ // daddiu $rd, $rd, %higher(sym)
+ // daddiu $at, $at, %lo(sym)
+ // dsll32 $rd, $rd, 0
+ // daddu $rd, $rd, $at
+ // (daddu $rd, $rd, $rs)
+ //
+ // Which is preferred for superscalar issue.
+ TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ STI);
+ TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(HigherExpr), IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+ IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
+ if (UseSrcReg)
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
- return false;
+ return false;
+ } else if (!canUseATReg() && !RdRegIsRsReg) {
+ // Otherwise, synthesize the address in the destination register
+ // serially:
+ // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym)
+ // daddiu $rd, $rd, %higher(sym)
+ // dsll $rd, $rd, 16
+ // daddiu $rd, $rd, %hi(sym)
+ // dsll $rd, $rd, 16
+ // daddiu $rd, $rd, %lo(sym)
+ TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(HigherExpr), IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(HiExpr), IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
+ if (UseSrcReg)
+ TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+
+ return false;
+ } else {
+ // We have a case where SrcReg == DstReg and we don't have $at
+ // available. We can't expand this case, so error out appropriately.
+ assert(SrcReg == DstReg && !canUseATReg() &&
+ "Could have expanded dla but didn't?");
+ reportParseError(IDLoc,
+ "pseudo-instruction requires $at, which is not available");
+ return true;
+ }
}
// And now, the 32-bit symbol address expansion:
@@ -2711,6 +3095,302 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
return false;
}
+// Each double-precision register D0-D15 overlaps with two of the single-
+// precision registers F0-F31. As an example, all of the following hold true:
+// D0 + 1 == F1, F1 + 1 == D1, F1 + 1 == F2, depending on the context.
+static unsigned nextReg(unsigned Reg) {
+ if (MipsMCRegisterClasses[Mips::FGR32RegClassID].contains(Reg))
+ return Reg == (unsigned)Mips::F31 ? (unsigned)Mips::F0 : Reg + 1;
+ switch (Reg) {
+ default: llvm_unreachable("Unknown register in assembly macro expansion!");
+ case Mips::ZERO: return Mips::AT;
+ case Mips::AT: return Mips::V0;
+ case Mips::V0: return Mips::V1;
+ case Mips::V1: return Mips::A0;
+ case Mips::A0: return Mips::A1;
+ case Mips::A1: return Mips::A2;
+ case Mips::A2: return Mips::A3;
+ case Mips::A3: return Mips::T0;
+ case Mips::T0: return Mips::T1;
+ case Mips::T1: return Mips::T2;
+ case Mips::T2: return Mips::T3;
+ case Mips::T3: return Mips::T4;
+ case Mips::T4: return Mips::T5;
+ case Mips::T5: return Mips::T6;
+ case Mips::T6: return Mips::T7;
+ case Mips::T7: return Mips::S0;
+ case Mips::S0: return Mips::S1;
+ case Mips::S1: return Mips::S2;
+ case Mips::S2: return Mips::S3;
+ case Mips::S3: return Mips::S4;
+ case Mips::S4: return Mips::S5;
+ case Mips::S5: return Mips::S6;
+ case Mips::S6: return Mips::S7;
+ case Mips::S7: return Mips::T8;
+ case Mips::T8: return Mips::T9;
+ case Mips::T9: return Mips::K0;
+ case Mips::K0: return Mips::K1;
+ case Mips::K1: return Mips::GP;
+ case Mips::GP: return Mips::SP;
+ case Mips::SP: return Mips::FP;
+ case Mips::FP: return Mips::RA;
+ case Mips::RA: return Mips::ZERO;
+ case Mips::D0: return Mips::F1;
+ case Mips::D1: return Mips::F3;
+ case Mips::D2: return Mips::F5;
+ case Mips::D3: return Mips::F7;
+ case Mips::D4: return Mips::F9;
+ case Mips::D5: return Mips::F11;
+ case Mips::D6: return Mips::F13;
+ case Mips::D7: return Mips::F15;
+ case Mips::D8: return Mips::F17;
+ case Mips::D9: return Mips::F19;
+ case Mips::D10: return Mips::F21;
+ case Mips::D11: return Mips::F23;
+ case Mips::D12: return Mips::F25;
+ case Mips::D13: return Mips::F27;
+ case Mips::D14: return Mips::F29;
+ case Mips::D15: return Mips::F31;
+ }
+}
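Editor's note: the D-register rows of the switch encode the rule that D(n)'s successor among the 32-bit FPU registers is F(2n + 1). A standalone check of that arithmetic (illustrative only; these are register indices, not LLVM's enum values):

  #include <cassert>

  // D0 -> F1, D1 -> F3, ..., D15 -> F31, matching the switch above.
  static unsigned nextRegOfDouble(unsigned N) { return 2 * N + 1; }

  int main() {
    assert(nextRegOfDouble(0) == 1);   // D0 -> F1
    assert(nextRegOfDouble(7) == 15);  // D7 -> F15
    assert(nextRegOfDouble(15) == 31); // D15 -> F31
    return 0;
  }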
+
+// FIXME: This method is too general. In principle we should compute the number
+// of instructions required to synthesize the immediate inline compared to
+// synthesizing the address inline and relying on non .text sections.
+// For static O32 and N32 this may yield a small benefit; for static N64 it is
+// likely to yield a much larger benefit, as we have to synthesize a 64-bit
+// address to load a 64-bit value.
+bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
+ MCSymbol *Sym) {
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+  if (IsPicEnabled) {
+ const MCExpr *GotSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *GotExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext());
+
+    if (isABI_O32() || isABI_N32()) {
+      TOut.emitRRX(Mips::LW, ATReg, Mips::GP, MCOperand::createExpr(GotExpr),
+                   IDLoc, STI);
+    } else { // isABI_N64()
+      TOut.emitRRX(Mips::LD, ATReg, Mips::GP, MCOperand::createExpr(GotExpr),
+                   IDLoc, STI);
+    }
+  } else { // !IsPicEnabled
+ const MCExpr *HiSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *HiExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_HI, HiSym, getContext());
+
+ // FIXME: This is technically correct but gives a different result to gas,
+ // but gas is incomplete there (it has a fixme noting it doesn't work with
+ // 64-bit addresses).
+ // FIXME: With -msym32 option, the address expansion for N64 should probably
+ // use the O32 / N32 case. It's safe to use the 64 address expansion as the
+ // symbol's value is considered sign extended.
+    if (isABI_O32() || isABI_N32()) {
+      TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+    } else { // isABI_N64()
+ const MCExpr *HighestSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *HighestExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, HighestSym, getContext());
+ const MCExpr *HigherSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *HigherExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, HigherSym, getContext());
+
+ TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
+ STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
+ MCOperand::createExpr(HigherExpr), IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr),
+ IDLoc, STI);
+ TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+ }
+ }
+ return false;
+}
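Editor's note: in the N64 non-PIC branch above, emitPartialAddress leaves $at holding everything but the low 16 bits of the address; the caller applies the final %lo. For a hypothetical symbol sym the emitted sequence is:

  lui    $at, %highest(sym)
  daddiu $at, $at, %higher(sym)
  dsll   $at, $at, 16
  daddiu $at, $at, %hi(sym)
  dsll   $at, $at, 16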
+
+bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR,
+ bool Is64FPU, SMLoc IDLoc,
+ MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ assert(Inst.getNumOperands() == 2 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
+ "Invalid instruction operand.");
+
+ unsigned FirstReg = Inst.getOperand(0).getReg();
+ uint64_t ImmOp64 = Inst.getOperand(1).getImm();
+
+ uint32_t HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
+  // If ImmOp64 came in as an AsmToken::Integer (all bits of the
+  // double-precision exponent field are zero), convert it to a double
+  // (e.g. 1 becomes 1.0).
+ if ((HiImmOp64 & 0x7ff00000) == 0) {
+ APFloat RealVal(APFloat::IEEEdouble(), ImmOp64);
+ ImmOp64 = RealVal.bitcastToAPInt().getZExtValue();
+ }
+
+ uint32_t LoImmOp64 = ImmOp64 & 0xffffffff;
+ HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
+
+ if (IsSingle) {
+    // Convert a double held in a uint64_t to a float held in a uint32_t,
+    // retaining the float's bit pattern.
+ uint32_t ImmOp32;
+ double doubleImm = BitsToDouble(ImmOp64);
+ float tmp_float = static_cast<float>(doubleImm);
+ ImmOp32 = FloatToBits(tmp_float);
+
+ if (IsGPR) {
+ if (loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, true, IDLoc,
+ Out, STI))
+ return true;
+ return false;
+ } else {
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ if (LoImmOp64 == 0) {
+ if (loadImmediate(ImmOp32, ATReg, Mips::NoRegister, true, true, IDLoc,
+ Out, STI))
+ return true;
+ TOut.emitRR(Mips::MTC1, FirstReg, ATReg, IDLoc, STI);
+ return false;
+ }
+
+ MCSection *CS = getStreamer().getCurrentSectionOnly();
+ // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
+ // where appropriate.
+ MCSection *ReadOnlySection = getContext().getELFSection(
+ ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+ MCSymbol *Sym = getContext().createTempSymbol();
+ const MCExpr *LoSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *LoExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+ getStreamer().SwitchSection(ReadOnlySection);
+ getStreamer().EmitLabel(Sym, IDLoc);
+ getStreamer().EmitIntValue(ImmOp32, 4);
+ getStreamer().SwitchSection(CS);
+
+      if (emitPartialAddress(TOut, IDLoc, Sym))
+ return true;
+ TOut.emitRRX(Mips::LWC1, FirstReg, ATReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
+ }
+ return false;
+ }
+
+  // if (!IsSingle)
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ if (IsGPR) {
+ if (LoImmOp64 == 0) {
+      if (isABI_N32() || isABI_N64()) {
+ if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, false, true,
+ IDLoc, Out, STI))
+ return true;
+ return false;
+ } else {
+ if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, true, true,
+ IDLoc, Out, STI))
+ return true;
+
+ if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, true,
+ IDLoc, Out, STI))
+ return true;
+ return false;
+ }
+ }
+
+ MCSection *CS = getStreamer().getCurrentSectionOnly();
+ MCSection *ReadOnlySection = getContext().getELFSection(
+ ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+ MCSymbol *Sym = getContext().createTempSymbol();
+ const MCExpr *LoSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *LoExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+ getStreamer().SwitchSection(ReadOnlySection);
+ getStreamer().EmitLabel(Sym, IDLoc);
+ getStreamer().EmitIntValue(HiImmOp64, 4);
+ getStreamer().EmitIntValue(LoImmOp64, 4);
+ getStreamer().SwitchSection(CS);
+
+    if (emitPartialAddress(TOut, IDLoc, Sym))
+      return true;
+    if (isABI_N64())
+ TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
+ else
+ TOut.emitRRX(Mips::ADDiu, ATReg, ATReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
+
+    if (isABI_N32() || isABI_N64())
+ TOut.emitRRI(Mips::LD, FirstReg, ATReg, 0, IDLoc, STI);
+ else {
+ TOut.emitRRI(Mips::LW, FirstReg, ATReg, 0, IDLoc, STI);
+ TOut.emitRRI(Mips::LW, nextReg(FirstReg), ATReg, 4, IDLoc, STI);
+ }
+ return false;
+  } else { // if (!IsGPR && !IsSingle)
+ if ((LoImmOp64 == 0) &&
+ !((HiImmOp64 & 0xffff0000) && (HiImmOp64 & 0x0000ffff))) {
+ // FIXME: In the case where the constant is zero, we can load the
+ // register directly from the zero register.
+ if (loadImmediate(HiImmOp64, ATReg, Mips::NoRegister, true, true, IDLoc,
+ Out, STI))
+ return true;
+ if (isABI_N32() || isABI_N64())
+ TOut.emitRR(Mips::DMTC1, FirstReg, ATReg, IDLoc, STI);
+ else if (hasMips32r2()) {
+ TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
+ TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, ATReg, IDLoc, STI);
+ } else {
+ TOut.emitRR(Mips::MTC1, nextReg(FirstReg), ATReg, IDLoc, STI);
+ TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
+ }
+ return false;
+ }
+
+ MCSection *CS = getStreamer().getCurrentSectionOnly();
+ // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
+ // where appropriate.
+ MCSection *ReadOnlySection = getContext().getELFSection(
+ ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+ MCSymbol *Sym = getContext().createTempSymbol();
+ const MCExpr *LoSym =
+ MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ const MipsMCExpr *LoExpr =
+ MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+ getStreamer().SwitchSection(ReadOnlySection);
+ getStreamer().EmitLabel(Sym, IDLoc);
+ getStreamer().EmitIntValue(HiImmOp64, 4);
+ getStreamer().EmitIntValue(LoImmOp64, 4);
+ getStreamer().SwitchSection(CS);
+
+    if (emitPartialAddress(TOut, IDLoc, Sym))
+ return true;
+ TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, ATReg,
+ MCOperand::createExpr(LoExpr), IDLoc, STI);
+ }
+ return false;
+}
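Editor's note: the single-precision path narrows the immediate through an actual double-to-float conversion rather than bit truncation. A standalone equivalent of the BitsToDouble/FloatToBits round trip used above (standard C++ only):

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // Reinterpret 64 bits as a double, narrow to float, return the float's bits.
  static uint32_t narrowImmBits(uint64_t Bits64) {
    double D;
    std::memcpy(&D, &Bits64, sizeof(D)); // BitsToDouble
    float F = static_cast<float>(D);     // may round to the nearest float
    uint32_t Bits32;
    std::memcpy(&Bits32, &F, sizeof(F)); // FloatToBits
    return Bits32;
  }

  int main() {
    assert(narrowImmBits(0x3ff0000000000000ULL) == 0x3f800000u); // 1.0 -> 1.0f
    return 0;
  }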
+
bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
@@ -2769,6 +3449,8 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
assert((MemOffsetOp.isImm() || MemOffsetOp.isExpr()) &&
"expected immediate or expression operand");
+ bool IsLikely = false;
+
unsigned OpCode = 0;
switch(Inst.getOpcode()) {
case Mips::BneImm:
@@ -2777,16 +3459,29 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::BeqImm:
OpCode = Mips::BEQ;
break;
+ case Mips::BEQLImmMacro:
+ OpCode = Mips::BEQL;
+ IsLikely = true;
+ break;
+ case Mips::BNELImmMacro:
+ OpCode = Mips::BNEL;
+ IsLikely = true;
+ break;
default:
llvm_unreachable("Unknown immediate branch pseudo-instruction.");
break;
}
int64_t ImmValue = ImmOp.getImm();
- if (ImmValue == 0)
- TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
- STI);
- else {
+ if (ImmValue == 0) {
+ if (IsLikely) {
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO,
+ MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
+ TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+ } else
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
+ STI);
+ } else {
warnIfNoMacro(IDLoc);
unsigned ATReg = getATReg(IDLoc);
@@ -2797,7 +3492,12 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
IDLoc, Out, STI))
return true;
- TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
+ if (IsLikely) {
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg,
+ MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
+ TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+ } else
+ TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
}
return false;
}
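Editor's note: for the likely-branch variants the expansion must fill the annulled delay slot itself, hence the explicit sll $zero, $zero, 0 nop after each branch. For example (label is hypothetical):

  beql $4, 0, label   =>   beql $4, $zero, label
                           sll  $zero, $zero, 0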
@@ -2904,9 +3604,9 @@ bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
unsigned Opcode = Inst.getOpcode();
unsigned NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM32_MM : Mips::LWM32_MM;
- assert (Inst.getOperand(OpNum - 1).isImm() &&
- Inst.getOperand(OpNum - 2).isReg() &&
- Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand.");
+ assert(Inst.getOperand(OpNum - 1).isImm() &&
+ Inst.getOperand(OpNum - 2).isReg() &&
+ Inst.getOperand(OpNum - 3).isReg() && "Invalid instruction operand.");
if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 &&
Inst.getOperand(OpNum - 1).getImm() >= 0 &&
@@ -3185,6 +3885,14 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
return false;
}
+// Expand an integer division macro.
+//
+// Notably we don't have to emit a warning when encountering $rt as the $zero
+// register, or 0 as an immediate. processInstruction() has already done that.
+//
+// The destination register can only be $zero when expanding (S)DivIMacro or
+// D(S)DivMacro.
+
bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI, const bool IsMips64,
const bool Signed) {
@@ -3200,67 +3908,88 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
assert(RsRegOp.isReg() && "expected register operand kind");
unsigned RsReg = RsRegOp.getReg();
- const MCOperand &RtRegOp = Inst.getOperand(2);
- assert(RtRegOp.isReg() && "expected register operand kind");
- unsigned RtReg = RtRegOp.getReg();
+ unsigned RtReg;
+ int64_t ImmValue;
+
+ const MCOperand &RtOp = Inst.getOperand(2);
+ assert((RtOp.isReg() || RtOp.isImm()) &&
+ "expected register or immediate operand kind");
+ if (RtOp.isReg())
+ RtReg = RtOp.getReg();
+ else
+ ImmValue = RtOp.getImm();
+
unsigned DivOp;
unsigned ZeroReg;
+ unsigned SubOp;
if (IsMips64) {
DivOp = Signed ? Mips::DSDIV : Mips::DUDIV;
ZeroReg = Mips::ZERO_64;
+ SubOp = Mips::DSUB;
} else {
DivOp = Signed ? Mips::SDIV : Mips::UDIV;
ZeroReg = Mips::ZERO;
+ SubOp = Mips::SUB;
}
bool UseTraps = useTraps();
- if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) {
- if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)
- Warning(IDLoc, "dividing zero by zero");
- if (IsMips64) {
- if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) {
- if (UseTraps) {
- TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
- return false;
- }
+ if (RtOp.isImm()) {
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+ if (ImmValue == 0) {
+ if (UseTraps)
+ TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
+ else
TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
- return false;
- }
+ return false;
+ }
+
+ if (ImmValue == 1) {
+ TOut.emitRRR(Mips::OR, RdReg, RsReg, Mips::ZERO, IDLoc, STI);
+ return false;
+ } else if (Signed && ImmValue == -1) {
+ TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI);
+ return false;
} else {
- TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+ if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue),
+ false, Inst.getLoc(), Out, STI))
+ return true;
+ TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI);
+ TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
return false;
}
+ return true;
}
+ // If the macro expansion of (d)div(u) would always trap or break, insert
+ // the trap/break and exit. This gives a different result to GAS. GAS has
+ // an inconsistency/missed optimization in that not all cases are handled
+ // equivalently. As the observed behaviour is the same, we're ok.
if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
- Warning(IDLoc, "division by zero");
- if (Signed) {
- if (UseTraps) {
- TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
- return false;
- }
-
- TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+ if (UseTraps) {
+ TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
return false;
}
+ TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+ return false;
}
- // FIXME: The values for these two BranchTarget variables may be different in
- // micromips. These magic numbers need to be removed.
- unsigned BranchTargetNoTraps;
- unsigned BranchTarget;
+  // Temporary label for the first branch target.
+ MCContext &Context = TOut.getStreamer().getContext();
+ MCSymbol *BrTarget;
+ MCOperand LabelOp;
if (UseTraps) {
- BranchTarget = IsMips64 ? 12 : 8;
TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
} else {
- BranchTarget = IsMips64 ? 20 : 16;
- BranchTargetNoTraps = 8;
// Branch to the li instruction.
- TOut.emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, STI);
+ BrTarget = Context.createTempSymbol();
+ LabelOp = MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+ TOut.emitRRX(Mips::BNE, RtReg, ZeroReg, LabelOp, IDLoc, STI);
}
TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
@@ -3269,6 +3998,9 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
if (!Signed) {
+ if (!UseTraps)
+ TOut.getStreamer().EmitLabel(BrTarget);
+
TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
return false;
}
@@ -3277,15 +4009,23 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (!ATReg)
return true;
+ if (!UseTraps)
+ TOut.getStreamer().EmitLabel(BrTarget);
+
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
+
+ // Temporary label for the second branch target.
+ MCSymbol *BrTargetEnd = Context.createTempSymbol();
+ MCOperand LabelOpEnd =
+ MCOperand::createExpr(MCSymbolRefExpr::create(BrTargetEnd, Context));
+
+ // Branch to the mflo instruction.
+ TOut.emitRRX(Mips::BNE, RtReg, ATReg, LabelOpEnd, IDLoc, STI);
+
if (IsMips64) {
- // Branch to the mflo instruction.
- TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
} else {
- // Branch to the mflo instruction.
- TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
}
@@ -3293,10 +4033,12 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI);
else {
// Branch to the mflo instruction.
- TOut.emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, STI);
+ TOut.emitRRX(Mips::BNE, RsReg, ATReg, LabelOpEnd, IDLoc, STI);
TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
}
+
+ TOut.getStreamer().EmitLabel(BrTargetEnd);
TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
return false;
}
@@ -3503,10 +4245,10 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
const MCSubtargetInfo *STI) {
MipsTargetStreamer &TOut = getTargetStreamer();
- assert (Inst.getNumOperands() == 3 && "Invalid operand count");
- assert (Inst.getOperand(0).isReg() &&
- Inst.getOperand(1).isReg() &&
- Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isImm() && "Invalid instruction operand.");
unsigned ATReg = Mips::NoRegister;
unsigned FinalDstReg = Mips::NoRegister;
@@ -3514,7 +4256,7 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
unsigned SrcReg = Inst.getOperand(1).getReg();
int64_t ImmValue = Inst.getOperand(2).getImm();
- bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue);
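+ // On 64-bit GPRs an unsigned 32-bit immediate is not interchangeable with
+ // its sign-extended form, so only accept the isUInt<32> case when the GPRs
+ // are 32 bits wide.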
+ bool Is32Bit = isInt<32>(ImmValue) || (!isGP64bit() && isUInt<32>(ImmValue));
unsigned FinalOpcode = Inst.getOpcode();
@@ -3530,30 +4272,69 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
switch (FinalOpcode) {
default:
llvm_unreachable("unimplemented expansion");
- case (Mips::ADDi):
+ case Mips::ADDi:
FinalOpcode = Mips::ADD;
break;
- case (Mips::ADDiu):
+ case Mips::ADDiu:
FinalOpcode = Mips::ADDu;
break;
- case (Mips::ANDi):
+ case Mips::ANDi:
FinalOpcode = Mips::AND;
break;
- case (Mips::NORImm):
+ case Mips::NORImm:
FinalOpcode = Mips::NOR;
break;
- case (Mips::ORi):
+ case Mips::ORi:
FinalOpcode = Mips::OR;
break;
- case (Mips::SLTi):
+ case Mips::SLTi:
FinalOpcode = Mips::SLT;
break;
- case (Mips::SLTiu):
+ case Mips::SLTiu:
FinalOpcode = Mips::SLTu;
break;
- case (Mips::XORi):
+ case Mips::XORi:
FinalOpcode = Mips::XOR;
break;
+ case Mips::ADDi_MM:
+ FinalOpcode = Mips::ADD_MM;
+ break;
+ case Mips::ADDiu_MM:
+ FinalOpcode = Mips::ADDu_MM;
+ break;
+ case Mips::ANDi_MM:
+ FinalOpcode = Mips::AND_MM;
+ break;
+ case Mips::ORi_MM:
+ FinalOpcode = Mips::OR_MM;
+ break;
+ case Mips::SLTi_MM:
+ FinalOpcode = Mips::SLT_MM;
+ break;
+ case Mips::SLTiu_MM:
+ FinalOpcode = Mips::SLTu_MM;
+ break;
+ case Mips::XORi_MM:
+ FinalOpcode = Mips::XOR_MM;
+ break;
+ case Mips::ANDi64:
+ FinalOpcode = Mips::AND64;
+ break;
+ case Mips::NORImm64:
+ FinalOpcode = Mips::NOR64;
+ break;
+ case Mips::ORi64:
+ FinalOpcode = Mips::OR64;
+ break;
+ case Mips::SLTImm64:
+ FinalOpcode = Mips::SLT64;
+ break;
+ case Mips::SLTUImm64:
+ FinalOpcode = Mips::SLTu64;
+ break;
+ case Mips::XORi64:
+ FinalOpcode = Mips::XOR64;
+ break;
}
if (FinalDstReg == Mips::NoRegister)
@@ -3578,7 +4359,6 @@ bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
unsigned SecondShift = Mips::NOP;
if (hasMips32r2()) {
-
if (DReg == SReg) {
TmpReg = getATReg(Inst.getLoc());
if (!TmpReg)
@@ -3600,7 +4380,6 @@ bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
}
if (hasMips32()) {
-
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
@@ -3642,7 +4421,6 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
unsigned SecondShift = Mips::NOP;
if (hasMips32r2()) {
-
if (Inst.getOpcode() == Mips::ROLImm) {
uint64_t MaxShift = 32;
uint64_t ShiftValue = ImmValue;
@@ -3661,7 +4439,6 @@ bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc,
}
if (hasMips32()) {
-
if (ImmValue == 0) {
TOut.emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
@@ -3707,7 +4484,6 @@ bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
unsigned SecondShift = Mips::NOP;
if (hasMips64r2()) {
-
if (TmpReg == SReg) {
TmpReg = getATReg(Inst.getLoc());
if (!TmpReg)
@@ -3729,7 +4505,6 @@ bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
}
if (hasMips64()) {
-
switch (Inst.getOpcode()) {
default:
llvm_unreachable("unexpected instruction opcode");
@@ -3773,7 +4548,6 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
MCInst TmpInst;
if (hasMips64r2()) {
-
unsigned FinalOpcode = Mips::NOP;
if (ImmValue == 0)
FinalOpcode = Mips::DROTR;
@@ -3801,7 +4575,6 @@ bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc,
}
if (hasMips64()) {
-
if (ImmValue == 0) {
TOut.emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), STI);
return false;
@@ -3871,43 +4644,117 @@ bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
-static unsigned nextReg(unsigned Reg) {
- switch (Reg) {
- case Mips::ZERO: return Mips::AT;
- case Mips::AT: return Mips::V0;
- case Mips::V0: return Mips::V1;
- case Mips::V1: return Mips::A0;
- case Mips::A0: return Mips::A1;
- case Mips::A1: return Mips::A2;
- case Mips::A2: return Mips::A3;
- case Mips::A3: return Mips::T0;
- case Mips::T0: return Mips::T1;
- case Mips::T1: return Mips::T2;
- case Mips::T2: return Mips::T3;
- case Mips::T3: return Mips::T4;
- case Mips::T4: return Mips::T5;
- case Mips::T5: return Mips::T6;
- case Mips::T6: return Mips::T7;
- case Mips::T7: return Mips::S0;
- case Mips::S0: return Mips::S1;
- case Mips::S1: return Mips::S2;
- case Mips::S2: return Mips::S3;
- case Mips::S3: return Mips::S4;
- case Mips::S4: return Mips::S5;
- case Mips::S5: return Mips::S6;
- case Mips::S6: return Mips::S7;
- case Mips::S7: return Mips::T8;
- case Mips::T8: return Mips::T9;
- case Mips::T9: return Mips::K0;
- case Mips::K0: return Mips::K1;
- case Mips::K1: return Mips::GP;
- case Mips::GP: return Mips::SP;
- case Mips::SP: return Mips::FP;
- case Mips::FP: return Mips::RA;
- case Mips::RA: return Mips::ZERO;
- default: return 0;
+bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ int32_t ImmValue = Inst.getOperand(2).getImm();
+
+ ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
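+ // Load the immediate into AT, multiply it with the source register, and
+ // move the low part of the product from LO into the destination.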
+ loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out, STI);
+
+ TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT,
+ SrcReg, ATReg, IDLoc, STI);
+
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+ return false;
+}
+
+bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned TmpReg = Inst.getOperand(2).getReg();
+
+ ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+
+ TOut.emitRR(Inst.getOpcode() == Mips::MULOMacro ? Mips::MULT : Mips::DMULT,
+ SrcReg, TmpReg, IDLoc, STI);
+
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
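+ // Signed overflow check: arithmetically shifting the low word right by 31
+ // (63 via DSRA32 for dmulo) leaves only its sign bits, which must match
+ // the high word read from HI below if the product fits in one register.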
+ TOut.emitRRI(Inst.getOpcode() == Mips::MULOMacro ? Mips::SRA : Mips::DSRA32,
+ DstReg, DstReg, 0x1F, IDLoc, STI);
+
+ TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
+
+ if (useTraps()) {
+ TOut.emitRRI(Mips::TNE, DstReg, ATReg, 6, IDLoc, STI);
+ } else {
+ MCContext &Context = TOut.getStreamer().getContext();
+ MCSymbol *BrTarget = Context.createTempSymbol();
+ MCOperand LabelOp =
+ MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+
+ TOut.emitRRX(Mips::BEQ, DstReg, ATReg, LabelOp, IDLoc, STI);
+ if (AssemblerOptions.back()->isReorder())
+ TOut.emitNop(IDLoc, STI);
+ TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
+
+ TOut.getStreamer().EmitLabel(BrTarget);
+ }
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+ return false;
+}
+
+bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned ATReg = Mips::NoRegister;
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned TmpReg = Inst.getOperand(2).getReg();
+
+ ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ TOut.emitRR(Inst.getOpcode() == Mips::MULOUMacro ? Mips::MULTu : Mips::DMULTu,
+ SrcReg, TmpReg, IDLoc, STI);
+
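+ // An unsigned multiply overflows exactly when HI is non-zero, so trap or
+ // break unless AT (holding HI) is zero.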
+ TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+ if (useTraps()) {
+ TOut.emitRRI(Mips::TNE, ATReg, Mips::ZERO, 6, IDLoc, STI);
+ } else {
+ MCContext &Context = TOut.getStreamer().getContext();
+ MCSymbol *BrTarget = Context.createTempSymbol();
+ MCOperand LabelOp =
+ MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+
+ TOut.emitRRX(Mips::BEQ, ATReg, Mips::ZERO, LabelOp, IDLoc, STI);
+ if (AssemblerOptions.back()->isReorder())
+ TOut.emitNop(IDLoc, STI);
+ TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
+
+ TOut.getStreamer().EmitLabel(BrTarget);
}
+ return false;
+}
+
+bool MipsAsmParser::expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned TmpReg = Inst.getOperand(2).getReg();
+
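+ // Plain 'dmul' keeps only the low 64 bits of the product: expand to
+ // dmultu followed by mflo and ignore any overflow.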
+ TOut.emitRR(Mips::DMULTu, SrcReg, TmpReg, IDLoc, STI);
+ TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+ return false;
}
// Expand 'ld $<reg> offset($reg2)' to 'lw $<reg>, offset($reg2);
@@ -3985,7 +4832,6 @@ bool MipsAsmParser::expandSeq(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI) {
-
warnIfNoMacro(IDLoc);
MipsTargetStreamer &TOut = getTargetStreamer();
@@ -4158,17 +5004,15 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
-
MCInst Inst;
unsigned MatchResult =
MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
switch (MatchResult) {
- case Match_Success: {
+ case Match_Success:
if (processInstruction(Inst, IDLoc, Out, STI))
return true;
return false;
- }
case Match_MissingFeature:
Error(IDLoc, "instruction requires a CPU feature not currently enabled");
return true;
@@ -4353,7 +5197,7 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
CC = StringSwitch<unsigned>(Name)
.Case("zero", 0)
- .Case("at", 1)
+ .Cases("at", "AT", 1)
.Case("a0", 4)
.Case("a1", 5)
.Case("a2", 6)
@@ -4441,7 +5285,6 @@ int MipsAsmParser::matchHWRegsRegisterName(StringRef Name) {
}
int MipsAsmParser::matchFPURegisterName(StringRef Name) {
-
if (Name[0] == 'f') {
StringRef NumString = Name.substr(1);
unsigned IntVal;
@@ -4455,7 +5298,6 @@ int MipsAsmParser::matchFPURegisterName(StringRef Name) {
}
int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
-
if (Name.startswith("fcc")) {
StringRef NumString = Name.substr(3);
unsigned IntVal;
@@ -4469,7 +5311,6 @@ int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
}
int MipsAsmParser::matchACRegisterName(StringRef Name) {
-
if (Name.startswith("ac")) {
StringRef NumString = Name.substr(2);
unsigned IntVal;
@@ -4511,6 +5352,10 @@ int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
return CC;
}
+bool MipsAsmParser::canUseATReg() {
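+ // AT is usable unless '.set noat' is in effect, which sets its index to 0.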
+ return AssemblerOptions.back()->getATRegIndex() != 0;
+}
+
unsigned MipsAsmParser::getATReg(SMLoc Loc) {
unsigned ATIndex = AssemblerOptions.back()->getATRegIndex();
if (ATIndex == 0) {
@@ -4589,7 +5434,6 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
}
bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
-
switch (Expr->getKind()) {
case MCExpr::Constant:
return true;
@@ -5491,6 +6335,39 @@ bool MipsAsmParser::parseSetNoOddSPRegDirective() {
return false;
}
+bool MipsAsmParser::parseSetMtDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "mt".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ setFeatureBits(Mips::FeatureMT, "mt");
+ getTargetStreamer().emitDirectiveSetMt();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool MipsAsmParser::parseSetNoMtDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "nomt".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureMT, "mt");
+
+ getTargetStreamer().emitDirectiveSetNoMt();
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
bool MipsAsmParser::parseSetPopDirective() {
MCAsmParser &Parser = getParser();
SMLoc Loc = getLexer().getLoc();
@@ -5522,7 +6399,7 @@ bool MipsAsmParser::parseSetPushDirective() {
// Create a copy of the current assembler options environment and push it.
AssemblerOptions.push_back(
- make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
+ llvm::make_unique<MipsAssemblerOptions>(AssemblerOptions.back().get()));
getTargetStreamer().emitDirectiveSetPush();
return false;
@@ -5914,6 +6791,14 @@ bool MipsAsmParser::parseDirectiveSet() {
return parseSetAtDirective();
} else if (Tok.getString() == "arch") {
return parseSetArchDirective();
+ } else if (Tok.getString() == "bopt") {
+ Warning(Tok.getLoc(), "'bopt' feature is unsupported");
+ getParser().Lex();
+ return false;
+ } else if (Tok.getString() == "nobopt") {
+ // We're already running in nobopt mode, so nothing to do.
+ getParser().Lex();
+ return false;
} else if (Tok.getString() == "fp") {
return parseSetFpDirective();
} else if (Tok.getString() == "oddspreg") {
@@ -5983,6 +6868,10 @@ bool MipsAsmParser::parseDirectiveSet() {
return parseSetMsaDirective();
} else if (Tok.getString() == "nomsa") {
return parseSetNoMsaDirective();
+ } else if (Tok.getString() == "mt") {
+ return parseSetMtDirective();
+ } else if (Tok.getString() == "nomt") {
+ return parseSetNoMtDirective();
} else if (Tok.getString() == "softfloat") {
return parseSetSoftFloatDirective();
} else if (Tok.getString() == "hardfloat") {
@@ -6001,7 +6890,7 @@ bool MipsAsmParser::parseDirectiveSet() {
bool MipsAsmParser::parseDataDirective(unsigned Size, SMLoc L) {
MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
return true;
@@ -6191,6 +7080,23 @@ bool MipsAsmParser::parseInsnDirective() {
return false;
}
+/// parseRSectionDirective
+/// ::= .rdata
+bool MipsAsmParser::parseRSectionDirective(StringRef Section) {
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
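+ // '.rdata' is handled as an alias for the read-only data section; the
+ // caller passes '.rodata' as the section name.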
+ MCSection *ELFSection = getContext().getELFSection(
+ Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ getParser().getStreamer().SwitchSection(ELFSection);
+
+ getParser().Lex(); // Eat EndOfStatement token.
+ return false;
+}
+
/// parseSSectionDirective
/// ::= .sbss
/// ::= .sdata
@@ -6215,6 +7121,7 @@ bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) {
/// ::= .module fp=value
/// ::= .module softfloat
/// ::= .module hardfloat
+/// ::= .module mt
bool MipsAsmParser::parseDirectiveModule() {
MCAsmParser &Parser = getParser();
MCAsmLexer &Lexer = getLexer();
@@ -6314,6 +7221,25 @@ bool MipsAsmParser::parseDirectiveModule() {
}
return false; // parseDirectiveModule has finished successfully.
+ } else if (Option == "mt") {
+ setModuleFeatureBits(Mips::FeatureMT, "mt");
+
+ // Synchronize the ABI Flags information with the FeatureBits information we
+ // updated above.
+ getTargetStreamer().updateABIInfo(*this);
+
+ // If printing assembly, use the recently updated ABI Flags information.
+ // If generating ELF, don't do anything (the .MIPS.abiflags section gets
+ // emitted later).
+ getTargetStreamer().emitDirectiveModuleMT();
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ return false; // parseDirectiveModule has finished successfully.
} else {
return Error(L, "'" + Twine(Option) + "' is not a valid .module option.");
}
@@ -6738,6 +7664,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
parseInsnDirective();
return false;
}
+ if (IDVal == ".rdata") {
+ parseRSectionDirective(".rodata");
+ return false;
+ }
if (IDVal == ".sbss") {
parseSSectionDirective(IDVal, ELF::SHT_NOBITS);
return false;
@@ -6773,3 +7703,15 @@ extern "C" void LLVMInitializeMipsAsmParser() {
#define GET_REGISTER_MATCHER
#define GET_MATCHER_IMPLEMENTATION
#include "MipsGenAsmMatcher.inc"
+
+bool MipsAsmParser::mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) {
+ // Find the appropriate table for this asm variant.
+ const MatchEntry *Start, *End;
+ switch (VariantID) {
+ default: llvm_unreachable("invalid variant!");
+ case 0: Start = std::begin(MatchTable0); End = std::end(MatchTable0); break;
+ }
+ // Search the table.
+ auto MnemonicRange = std::equal_range(Start, End, Mnemonic, LessOpcode());
+ return MnemonicRange.first != MnemonicRange.second;
+}
diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index f80efb1..b0b9943 100644
--- a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -1,4 +1,4 @@
-//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===//
+//===- MipsDisassembler.cpp - Disassembler for Mips -----------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,15 +12,21 @@
//===----------------------------------------------------------------------===//
#include "Mips.h"
-#include "MipsRegisterInfo.h"
-#include "MipsSubtarget.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -33,6 +39,7 @@ namespace {
class MipsDisassembler : public MCDisassembler {
bool IsMicroMips;
bool IsBigEndian;
+
public:
MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool IsBigEndian)
: MCDisassembler(STI, Ctx),
@@ -42,9 +49,11 @@ public:
bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; }
bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; }
bool hasMips32() const { return STI.getFeatureBits()[Mips::FeatureMips32]; }
+
bool hasMips32r6() const {
return STI.getFeatureBits()[Mips::FeatureMips32r6];
}
+
bool isFP64() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; }
bool isGP64() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; }
@@ -527,11 +536,13 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
const void *Decoder);
namespace llvm {
+
Target &getTheMipselTarget();
Target &getTheMipsTarget();
Target &getTheMips64Target();
Target &getTheMips64elTarget();
-}
+
+} // end namespace llvm
static MCDisassembler *createMipsDisassembler(
const Target &T,
@@ -1106,6 +1117,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
raw_ostream &CStream) const {
uint32_t Insn;
DecodeStatus Result;
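+ // Start with no bytes consumed so that the failure paths below report a
+ // size of zero to the caller.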
+ Size = 0;
if (IsMicroMips) {
Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
@@ -1168,98 +1180,88 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
}
}
- // This is an invalid instruction. Let the disassembler move forward by the
- // minimum instruction size.
+ // This is an invalid instruction. Claim that the Size is 2 bytes. Since
+ // microMIPS instructions have a minimum alignment of 2, the next 2 bytes
+ // could form a valid instruction. The two bytes we rejected as an
+ // instruction could have actually been an inline constant pool that is
+ // unconditionally branched over.
Size = 2;
return MCDisassembler::Fail;
}
+ // Attempt to read the instruction so that we can decode it. If
+ // the buffer is not 4 bytes long, let the higher level logic figure out
+ // what to do with a size of zero and MCDisassembler::Fail.
Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
- if (Result == MCDisassembler::Fail) {
- Size = 4;
+ if (Result == MCDisassembler::Fail)
return MCDisassembler::Fail;
- }
+
+ // The only instruction size for standard encoded MIPS.
+ Size = 4;
if (hasCOP3()) {
DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
Result =
decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips32r6() && isGP64()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips32r6() && isPTR64()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips32r6()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasMips2() && isPTR64()) {
DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (hasCnMips()) {
DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
if (isGP64()) {
DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
}
DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
// Calling the auto-generated decoder function.
Result =
decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
- if (Result != MCDisassembler::Fail) {
- Size = 4;
+ if (Result != MCDisassembler::Fail)
return Result;
- }
- Size = 4;
return MCDisassembler::Fail;
}
@@ -1267,16 +1269,13 @@ static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
const void *Decoder) {
-
return MCDisassembler::Fail;
-
}
static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
const void *Decoder) {
-
if (RegNo > 31)
return MCDisassembler::Fail;
@@ -1620,7 +1619,7 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
switch(Inst.getOpcode())
{
default:
- assert (0 && "Unexpected instruction");
+ assert(false && "Unexpected instruction");
return MCDisassembler::Fail;
break;
case Mips::LD_B:
@@ -1980,7 +1979,6 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
if (RegNo > 30 || RegNo %2)
return MCDisassembler::Fail;
- ;
unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
Inst.addOperand(MCOperand::createReg(Reg));
return MCDisassembler::Success;
@@ -2128,7 +2126,6 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
unsigned Insn,
uint64_t Address,
const void *Decoder) {
-
unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2;
Inst.addOperand(MCOperand::createImm(JumpOffset));
return MCDisassembler::Success;
@@ -2267,7 +2264,14 @@ static DecodeStatus DecodeInsSize(MCInst &Inst,
const void *Decoder) {
// First we need to grab the pos(lsb) from MCInst.
int Pos = Inst.getOperand(2).getImm();
- int Size = (int) Insn - Pos + 1;
+ if (Inst.getOpcode() == Mips::DINSU)
+ Pos += 32;
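+ // DINSM and DINSU encode msb-32 in the size field, so recover the field
+ // width with an extra bias of 32 (plus 1 for the inclusive msb-lsb range).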
+ int Size;
+ if (Inst.getOpcode() == Mips::DINSM ||
+ Inst.getOpcode() == Mips::DINSU)
+ Size = (int) Insn - Pos + 33;
+ else
+ Size = (int) Insn - Pos + 1;
Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size)));
return MCDisassembler::Success;
}
@@ -2363,7 +2367,6 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
-
unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
switch (RegPair) {
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
index 932d38a..4a2b75b 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -1,4 +1,4 @@
-//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===//
+//===- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,11 @@
//
//===----------------------------------------------------------------------===//
-#include "MipsABIFlagsSection.h"
+#include "MCTargetDesc/MipsABIFlagsSection.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MipsABIFlags.h"
using namespace llvm;
@@ -51,6 +55,7 @@ uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
}
namespace llvm {
+
MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
// Write out an Elf_Internal_ABIFlags_v0 struct
OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version
@@ -66,4 +71,5 @@ MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2
return OS;
}
-}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index 3966cae..9abd4f1 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -1,4 +1,4 @@
-//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===//
+//===- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,9 +10,10 @@
#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MipsABIFlags.h"
+#include <cstdint>
namespace llvm {
@@ -23,36 +24,32 @@ struct MipsABIFlagsSection {
enum class FpABIKind { ANY, XX, S32, S64, SOFT };
// Version of flags structure.
- uint16_t Version;
+ uint16_t Version = 0;
// The level of the ISA: 1-5, 32, 64.
- uint8_t ISALevel;
+ uint8_t ISALevel = 0;
// The revision of ISA: 0 for MIPS V and below, 1-n otherwise.
- uint8_t ISARevision;
+ uint8_t ISARevision = 0;
// The size of general purpose registers.
- Mips::AFL_REG GPRSize;
+ Mips::AFL_REG GPRSize = Mips::AFL_REG_NONE;
// The size of co-processor 1 registers.
- Mips::AFL_REG CPR1Size;
+ Mips::AFL_REG CPR1Size = Mips::AFL_REG_NONE;
// The size of co-processor 2 registers.
- Mips::AFL_REG CPR2Size;
+ Mips::AFL_REG CPR2Size = Mips::AFL_REG_NONE;
// Processor-specific extension.
- Mips::AFL_EXT ISAExtension;
+ Mips::AFL_EXT ISAExtension = Mips::AFL_EXT_NONE;
// Mask of ASEs used.
- uint32_t ASESet;
+ uint32_t ASESet = 0;
- bool OddSPReg;
+ bool OddSPReg = false;
- bool Is32BitABI;
+ bool Is32BitABI = false;
protected:
// The floating-point ABI.
- FpABIKind FpABI;
+ FpABIKind FpABI = FpABIKind::ANY;
public:
- MipsABIFlagsSection()
- : Version(0), ISALevel(0), ISARevision(0), GPRSize(Mips::AFL_REG_NONE),
- CPR1Size(Mips::AFL_REG_NONE), CPR2Size(Mips::AFL_REG_NONE),
- ISAExtension(Mips::AFL_EXT_NONE), ASESet(0), OddSPReg(false),
- Is32BitABI(false), FpABI(FpABIKind::ANY) {}
+ MipsABIFlagsSection() = default;
uint16_t getVersionValue() { return (uint16_t)Version; }
uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
@@ -80,6 +77,7 @@ public:
FpABI = Value;
Is32BitABI = IsABI32Bit;
}
+
StringRef getFpABIString(FpABIKind Value);
template <class PredicateLibrary>
@@ -161,6 +159,8 @@ public:
ASESet |= Mips::AFL_ASE_MICROMIPS;
if (P.inMips16Mode())
ASESet |= Mips::AFL_ASE_MIPS16;
+ if (P.hasMT())
+ ASESet |= Mips::AFL_ASE_MT;
}
template <class PredicateLibrary>
@@ -195,6 +195,7 @@ public:
};
MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 38b11f7..a1ed0ea 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
//
-#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsAsmBackend.h"
+#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
@@ -34,7 +34,7 @@ using namespace llvm;
// Prepare the value for the target space.
static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- MCContext *Ctx = nullptr) {
+ MCContext &Ctx) {
unsigned Kind = Fixup.getKind();
@@ -74,8 +74,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// address range. Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 4;
// We now check if Value can be encoded as a 16-bit signed immediate.
- if (!isInt<16>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+ if (!isInt<16>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC16 fixup");
return 0;
}
break;
@@ -84,8 +84,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 4;
// We now check if Value can be encoded as a 19-bit signed immediate.
- if (!isInt<19>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup");
+ if (!isInt<19>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC19 fixup");
return 0;
}
break;
@@ -121,8 +121,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 2;
// We now check if Value can be encoded as a 7-bit signed immediate.
- if (!isInt<7>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup");
+ if (!isInt<7>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC7 fixup");
return 0;
}
break;
@@ -131,8 +131,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 2;
// We now check if Value can be encoded as a 10-bit signed immediate.
- if (!isInt<10>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup");
+ if (!isInt<10>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC10 fixup");
return 0;
}
break;
@@ -141,8 +141,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 16-bit signed immediate.
- if (!isInt<16>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+ if (!isInt<16>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC16 fixup");
return 0;
}
break;
@@ -150,21 +150,21 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 8;
// We now check if Value can be encoded as an 18-bit signed immediate.
- if (!isInt<18>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ if (!isInt<18>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
return 0;
}
break;
case Mips::fixup_MICROMIPS_PC18_S3:
// Check alignment.
- if ((Value & 7) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ if ((Value & 7)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
}
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 8;
// We now check if Value can be encoded as an 18-bit signed immediate.
- if (!isInt<18>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+ if (!isInt<18>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
return 0;
}
break;
@@ -172,8 +172,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 4;
// We now check if Value can be encoded as a 21-bit signed immediate.
- if (!isInt<21>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+ if (!isInt<21>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC21 fixup");
return 0;
}
break;
@@ -181,8 +181,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t) Value / 4;
// We now check if Value can be encoded as a 26-bit signed immediate.
- if (!isInt<26>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup");
+ if (!isInt<26>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC26 fixup");
return 0;
}
break;
@@ -190,8 +190,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 26-bit signed immediate.
- if (!isInt<26>(Value) && Ctx) {
- Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
+ if (!isInt<26>(Value)) {
+ Ctx.reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
return 0;
}
break;
@@ -199,8 +199,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
// Forcing a signed division because Value can be negative.
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 21-bit signed immediate.
- if (!isInt<21>(Value) && Ctx) {
- Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+ if (!isInt<21>(Value)) {
+ Ctx.reportError(Fixup.getLoc(), "out of range PC21 fixup");
return 0;
}
break;
@@ -235,11 +235,13 @@ static unsigned calculateMMLEIndex(unsigned i) {
/// ApplyFixup - Apply the \p Value for the given \p Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
-void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
MCFixupKind Kind = Fixup.getKind();
- Value = adjustFixupValue(Fixup, Value);
+ MCContext &Ctx = Asm.getContext();
+ Value = adjustFixupValue(Fixup, Value, Ctx);
if (!Value)
return; // Doesn't change encoding.
@@ -366,6 +368,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_MICROMIPS_TLS_LDM", 0, 16, 0 },
{ "fixup_MICROMIPS_TLS_DTPREL_HI16", 0, 16, 0 },
{ "fixup_MICROMIPS_TLS_DTPREL_LO16", 0, 16, 0 },
+ { "fixup_MICROMIPS_GOTTPREL", 0, 16, 0 },
{ "fixup_MICROMIPS_TLS_TPREL_HI16", 0, 16, 0 },
{ "fixup_MICROMIPS_TLS_TPREL_LO16", 0, 16, 0 },
{ "fixup_Mips_SUB", 0, 64, 0 },
@@ -437,6 +440,7 @@ getFixupKindInfo(MCFixupKind Kind) const {
{ "fixup_MICROMIPS_TLS_LDM", 16, 16, 0 },
{ "fixup_MICROMIPS_TLS_DTPREL_HI16", 16, 16, 0 },
{ "fixup_MICROMIPS_TLS_DTPREL_LO16", 16, 16, 0 },
+ { "fixup_MICROMIPS_GOTTPREL", 16, 16, 0 },
{ "fixup_MICROMIPS_TLS_TPREL_HI16", 16, 16, 0 },
{ "fixup_MICROMIPS_TLS_TPREL_LO16", 16, 16, 0 },
{ "fixup_Mips_SUB", 0, 64, 0 },
@@ -471,24 +475,6 @@ bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-/// processFixupValue - Target hook to process the literal value of a fixup
-/// if necessary.
-void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFixup &Fixup,
- const MCFragment *DF,
- const MCValue &Target,
- uint64_t &Value,
- bool &IsResolved) {
- // At this point we'll ignore the value returned by adjustFixupValue as
- // we are only checking if the fixup can be applied correctly. We have
- // access to MCContext from here which allows us to report a fatal error
- // with *possibly* a source code location.
- // The caller will also ignore any changes we make to Value
- // (recordRelocation() overwrites it with it's own calculation).
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
-}
-
// MCAsmBackend
MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
const MCRegisterInfo &MRI,
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index f260cfa..8ebde3b 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -38,8 +38,9 @@ public:
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -82,11 +83,6 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
-
}; // class MipsAsmBackend
} // namespace
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index b2efd72..d116ac3 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -7,33 +7,38 @@
//
//===----------------------------------------------------------------------===//
-#include <algorithm>
-#include <list>
-#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsFixupKinds.h"
-#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <list>
+#include <utility>
#define DEBUG_TYPE "mips-elf-object-writer"
using namespace llvm;
namespace {
+
/// Holds additional information needed by the relocation ordering algorithm.
struct MipsRelocationEntry {
const ELFRelocationEntry R; ///< The relocation.
- bool Matched; ///< Is this relocation part of a match.
+ bool Matched = false; ///< Is this relocation part of a match.
- MipsRelocationEntry(const ELFRelocationEntry &R) : R(R), Matched(false) {}
+ MipsRelocationEntry(const ELFRelocationEntry &R) : R(R) {}
void print(raw_ostream &Out) const {
R.print(Out);
@@ -53,23 +58,33 @@ public:
MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64,
bool IsLittleEndian);
- ~MipsELFObjectWriter() override;
+ ~MipsELFObjectWriter() override = default;
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
bool needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const override;
- virtual void sortRelocs(const MCAssembler &Asm,
- std::vector<ELFRelocationEntry> &Relocs) override;
+ void sortRelocs(const MCAssembler &Asm,
+ std::vector<ELFRelocationEntry> &Relocs) override;
+};
+
+/// The possible results of the Predicate function used by find_best.
+enum FindBestPredicateResult {
+ FindBest_NoMatch = 0, ///< The current element is not a match.
+ FindBest_Match, ///< The current element is a match but better ones are
+ /// possible.
+ FindBest_PerfectMatch, ///< The current element is an unbeatable match.
};
+} // end anonymous namespace
+
/// Copy elements in the range [First, Last) to d1 when the predicate is true or
/// d2 when the predicate is false. This is essentially both std::copy_if and
/// std::remove_copy_if combined into a single pass.
template <class InputIt, class OutputIt1, class OutputIt2, class UnaryPredicate>
-std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
- OutputIt1 d1, OutputIt2 d2,
- UnaryPredicate Predicate) {
+static std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
+ OutputIt1 d1, OutputIt2 d2,
+ UnaryPredicate Predicate) {
for (InputIt I = First; I != Last; ++I) {
if (Predicate(*I)) {
*d1 = *I;
@@ -83,14 +98,6 @@ std::pair<OutputIt1, OutputIt2> copy_if_else(InputIt First, InputIt Last,
return std::make_pair(d1, d2);
}
-/// The possible results of the Predicate function used by find_best.
-enum FindBestPredicateResult {
- FindBest_NoMatch = 0, ///< The current element is not a match.
- FindBest_Match, ///< The current element is a match but better ones are
- /// possible.
- FindBest_PerfectMatch, ///< The current element is an unbeatable match.
-};
-
/// Find the best match in the range [First, Last).
///
/// An element matches when Predicate(X) returns FindBest_Match or
@@ -101,8 +108,8 @@ enum FindBestPredicateResult {
/// This is similar to std::find_if but finds the best of multiple possible
/// matches.
template <class InputIt, class UnaryPredicate, class Comparator>
-InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
- Comparator BetterThan) {
+static InputIt find_best(InputIt First, InputIt Last, UnaryPredicate Predicate,
+ Comparator BetterThan) {
InputIt Best = Last;
for (InputIt I = First; I != Last; ++I) {
@@ -202,16 +209,12 @@ static void dumpRelocs(const char *Prefix, const Container &Relocs) {
}
#endif
-} // end anonymous namespace
-
MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
bool _isN64, bool IsLittleEndian)
: MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
/*HasRelocationAddend*/ _isN64,
/*IsN64*/ _isN64) {}
-MipsELFObjectWriter::~MipsELFObjectWriter() {}
-
unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
const MCFixup &Fixup,
@@ -371,6 +374,8 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_MICROMIPS_TLS_DTPREL_HI16;
case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
return ELF::R_MICROMIPS_TLS_DTPREL_LO16;
+ case Mips::fixup_MICROMIPS_GOTTPREL:
+ return ELF::R_MICROMIPS_TLS_GOTTPREL;
case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
return ELF::R_MICROMIPS_TLS_TPREL_HI16;
case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
@@ -419,7 +424,6 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
/// always match using the expressions from the source.
void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm,
std::vector<ELFRelocationEntry> &Relocs) {
-
// We do not need to sort the relocation table for RELA relocations which
// N32/N64 uses as the relocation addend contains the value we require,
// rather than it being split across a pair of relocations.
@@ -524,6 +528,8 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MIPS_GOT16:
case ELF::R_MIPS16_GOT16:
case ELF::R_MICROMIPS_GOT16:
+ case ELF::R_MIPS_HIGHER:
+ case ELF::R_MIPS_HIGHEST:
case ELF::R_MIPS_HI16:
case ELF::R_MIPS16_HI16:
case ELF::R_MICROMIPS_HI16:
@@ -567,8 +573,6 @@ bool MipsELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
case ELF::R_MIPS_INSERT_A:
case ELF::R_MIPS_INSERT_B:
case ELF::R_MIPS_DELETE:
- case ELF::R_MIPS_HIGHER:
- case ELF::R_MIPS_HIGHEST:
case ELF::R_MIPS_CALL_HI16:
case ELF::R_MIPS_CALL_LO16:
case ELF::R_MIPS_SCN_DISP:
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index e7d687e..f658aad 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -8,15 +8,19 @@
//===----------------------------------------------------------------------===//
#include "MipsELFStreamer.h"
+#include "MipsOptionRecord.h"
#include "MipsTargetStreamer.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/Casting.h"
using namespace llvm;
void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI, bool) {
MCELFStreamer::EmitInstruction(Inst, STI);
MCContext &Context = getContext();
@@ -51,7 +55,7 @@ void MipsELFStreamer::createPendingLabelRelocs() {
Labels.clear();
}
-void MipsELFStreamer::EmitLabel(MCSymbol *Symbol) {
+void MipsELFStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
MCELFStreamer::EmitLabel(Symbol);
Labels.push_back(Symbol);
}
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index a241cde..f5eda11 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -1,4 +1,4 @@
-//===-------- MipsELFStreamer.h - ELF Object Output -----------------------===//
+//===- MipsELFStreamer.h - ELF Object Output --------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -21,6 +21,7 @@
#include <memory>
namespace llvm {
+
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
@@ -31,12 +32,10 @@ class MipsELFStreamer : public MCELFStreamer {
MipsRegInfoRecord *RegInfoRecord;
SmallVector<MCSymbol*, 4> Labels;
-
public:
MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
MCCodeEmitter *Emitter)
: MCELFStreamer(Context, MAB, OS, Emitter) {
-
RegInfoRecord = new MipsRegInfoRecord(this, Context);
MipsOptionRecords.push_back(
std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
@@ -46,12 +45,13 @@ public:
/// \p Inst is actually emitted. For example, we can inspect the operands and
/// gather sufficient information that allows us to reason about the register
/// usage for the translation unit.
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool = false) override;
/// Overriding this function allows us to record all labels that should be
/// marked as microMIPS. Based on this data marking is done in
/// EmitInstruction.
- void EmitLabel(MCSymbol *Symbol) override;
+ void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
/// Overriding this function allows us to dismiss all labels that are
/// candidates for marking as microMIPS when .section directive is processed.
@@ -72,5 +72,6 @@ public:
MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_pwrite_stream &OS,
MCCodeEmitter *Emitter, bool RelaxAll);
-} // namespace llvm.
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index 1492962..6148a1b 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -203,6 +203,9 @@ namespace Mips {
// resulting in - R_MICROMIPS_TLS_DTPREL_LO16
fixup_MICROMIPS_TLS_DTPREL_LO16,
+ // resulting in - R_MICROMIPS_TLS_GOTTPREL.
+ fixup_MICROMIPS_GOTTPREL,
+
// resulting in - R_MICROMIPS_TLS_TPREL_HI16
fixup_MICROMIPS_TLS_TPREL_HI16,
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index a44a35f..11411d9 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -19,13 +19,11 @@ using namespace llvm;
void MipsMCAsmInfo::anchor() { }
MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
- if ((TheTriple.getArch() == Triple::mips) ||
- (TheTriple.getArch() == Triple::mips64))
- IsLittleEndian = false;
+ IsLittleEndian = TheTriple.isLittleEndian();
if ((TheTriple.getArch() == Triple::mips64el) ||
(TheTriple.getArch() == Triple::mips64)) {
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
}
// FIXME: This condition isn't quite right but it's the best we can do until
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 0614316..0330824 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -10,22 +10,29 @@
// This file implements the MipsMCCodeEmitter class.
//
//===----------------------------------------------------------------------===//
-//
#include "MipsMCCodeEmitter.h"
#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsMCExpr.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
@@ -34,6 +41,7 @@
#undef GET_INSTRMAP_INFO
namespace llvm {
+
MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx) {
@@ -45,12 +53,12 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
MCContext &Ctx) {
return new MipsMCCodeEmitter(MCII, Ctx, true);
}
-} // End of namespace llvm.
+
+} // end namespace llvm
// If the D<shift> instruction has a shift amount that is greater
// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
static void LowerLargeShift(MCInst& Inst) {
-
assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
assert(Inst.getOperand(2).isImm());
@@ -103,24 +111,25 @@ static void LowerDins(MCInst& InstIn) {
assert(InstIn.getOperand(3).isImm());
int64_t size = InstIn.getOperand(3).getImm();
- if (size <= 32) {
- if (pos < 32) // DINS, do nothing
- return;
+ assert((pos + size) <= 64 &&
+ "DINS cannot have position plus size over 64");
+ if (pos < 32) {
+ if ((pos + size) > 0 && (pos + size) <= 32)
+ return; // DINS, do nothing
+ else if ((pos + size) > 32) {
+ // DINSM
+ InstIn.getOperand(3).setImm(size - 32);
+ InstIn.setOpcode(Mips::DINSM);
+ }
+ } else if ((pos + size) > 32 && (pos + size) <= 64) {
// DINSU
InstIn.getOperand(2).setImm(pos - 32);
InstIn.setOpcode(Mips::DINSU);
- return;
}
- // DINSM
- assert(pos < 32 && "DINS cannot have both size and pos > 32");
- InstIn.getOperand(3).setImm(size - 32);
- InstIn.setOpcode(Mips::DINSM);
- return;
}
// Fix a bad compact branch encoding for beqc/bnec.
void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
-
// The encoding may be illegal, i.e. !(rs < rt), but this situation is
// easily fixed.
unsigned RegOp0 = Inst.getOperand(0).getReg();
@@ -146,7 +155,6 @@ void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
Inst.getOperand(0).setReg(RegOp1);
Inst.getOperand(1).setReg(RegOp0);
-
}
bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
@@ -186,7 +194,6 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const
{
-
// Non-pseudo instructions that get changed for direct object
// only based on operand values.
// If this list of instructions get much longer we will move
@@ -272,7 +279,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -295,7 +301,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -318,7 +323,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -342,7 +346,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueLsl2MMR6(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -366,7 +369,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -388,7 +390,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -410,7 +411,6 @@ unsigned MipsMCCodeEmitter::
getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -433,7 +433,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -456,7 +455,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -479,7 +477,6 @@ unsigned MipsMCCodeEmitter::
getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
@@ -501,7 +498,6 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
@@ -525,7 +521,6 @@ unsigned MipsMCCodeEmitter::
getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) return MO.getImm();
@@ -544,7 +539,6 @@ unsigned MipsMCCodeEmitter::
getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 4.
if (MO.isImm()) return MO.getImm()>>2;
@@ -562,7 +556,6 @@ unsigned MipsMCCodeEmitter::
getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
// If the destination is an immediate, divide by 2.
if (MO.isImm()) return MO.getImm() >> 1;
@@ -580,7 +573,6 @@ unsigned MipsMCCodeEmitter::
getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
// The immediate is encoded as 'immediate << 2'.
@@ -599,7 +591,6 @@ unsigned MipsMCCodeEmitter::
getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
int Value = MO.getImm();
@@ -613,7 +604,6 @@ unsigned MipsMCCodeEmitter::
getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
unsigned Value = MO.getImm();
@@ -627,7 +617,6 @@ unsigned MipsMCCodeEmitter::
getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
const MCOperand &MO = MI.getOperand(OpNo);
if (MO.isImm()) {
unsigned Binary = (MO.getImm() >> 2) & 0x0000ffff;
@@ -680,7 +669,8 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
: Mips::fixup_Mips_DTPREL_LO;
break;
case MipsMCExpr::MEK_GOTTPREL:
- FixupKind = Mips::fixup_Mips_GOTTPREL;
+ FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOTTPREL
+ : Mips::fixup_Mips_GOTTPREL;
break;
case MipsMCExpr::MEK_GOT:
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_GOT16
@@ -711,7 +701,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
case MipsMCExpr::MEK_GPREL:
FixupKind = Mips::fixup_Mips_GPREL16;
break;
- case MipsMCExpr::MEK_LO: {
+ case MipsMCExpr::MEK_LO:
// Check for %lo(%neg(%gp_rel(X)))
if (MipsExpr->isGpOff()) {
FixupKind = Mips::fixup_Mips_GPOFF_LO;
@@ -720,7 +710,6 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
: Mips::fixup_Mips_LO16;
break;
- }
case MipsMCExpr::MEK_HIGHEST:
FixupKind = Mips::fixup_Mips_HIGHEST;
break;
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 2d041dc..d12d319 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -1,4 +1,4 @@
-//===-- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code -----------===//
+//===- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,29 +10,25 @@
// This file defines the MipsMCCodeEmitter class.
//
//===----------------------------------------------------------------------===//
-//
#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/Support/DataTypes.h"
-
-using namespace llvm;
+#include <cstdint>
namespace llvm {
+
class MCContext;
class MCExpr;
+class MCFixup;
class MCInst;
class MCInstrInfo;
-class MCFixup;
class MCOperand;
class MCSubtargetInfo;
class raw_ostream;
class MipsMCCodeEmitter : public MCCodeEmitter {
- MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
- void operator=(const MipsMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
MCContext &Ctx;
bool IsLittleEndian;
@@ -43,8 +39,9 @@ class MipsMCCodeEmitter : public MCCodeEmitter {
public:
MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle)
: MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
-
- ~MipsMCCodeEmitter() override {}
+ MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
+ MipsMCCodeEmitter &operator=(const MipsMCCodeEmitter &) = delete;
+ ~MipsMCCodeEmitter() override = default;
void EmitByte(unsigned char C, raw_ostream &OS) const;
@@ -270,9 +267,11 @@ public:
unsigned getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- private:
+
+private:
void LowerCompactBranch(MCInst& Inst) const;
-}; // class MipsMCCodeEmitter
-} // namespace llvm.
+};
+
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 082bb87..aad6bf3 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -8,12 +8,18 @@
//===----------------------------------------------------------------------===//
#include "MipsMCExpr.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index d1a4334..495d525 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -1,4 +1,4 @@
-//===-- MipsMCExpr.h - Mips specific MC expression classes ------*- C++ -*-===//
+//===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -70,6 +70,7 @@ public:
bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
const MCFixup *Fixup) const override;
void visitUsedExpr(MCStreamer &Streamer) const override;
+
MCFragment *findAssociatedFragment() const override {
return getSubExpr()->findAssociatedFragment();
}
@@ -86,6 +87,7 @@ public:
return isGpOff(Kind);
}
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index aef9bd3..9266f0e 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -20,7 +20,11 @@
#include "Mips.h"
#include "MipsELFStreamer.h"
#include "MipsMCNaCl.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
using namespace llvm;
@@ -38,14 +42,14 @@ class MipsNaClELFStreamer : public MipsELFStreamer {
public:
MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
- : MipsELFStreamer(Context, TAB, OS, Emitter), PendingCall(false) {}
+ : MipsELFStreamer(Context, TAB, OS, Emitter) {}
- ~MipsNaClELFStreamer() override {}
+ ~MipsNaClELFStreamer() override = default;
private:
// Whether we started the sandboxing sequence for calls. Calls are bundled
// with branch delays and aligned to the bundle end.
- bool PendingCall;
+ bool PendingCall = false;
bool isIndirectJump(const MCInst &MI) {
if (MI.getOpcode() == Mips::JALR) {
@@ -135,8 +139,8 @@ private:
public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to mask dangerous instructions.
- void EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) override {
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override {
// Sandbox indirect jumps.
if (isIndirectJump(Inst)) {
if (PendingCall)
@@ -265,4 +269,4 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
return S;
}
-}
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index 24b6028..2d84528 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -1,4 +1,4 @@
-//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===//
+//===- MipsOptionRecord.cpp - Abstraction for storing information ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,9 +8,15 @@
//===----------------------------------------------------------------------===//
#include "MipsOptionRecord.h"
+#include "MipsABIInfo.h"
#include "MipsELFStreamer.h"
#include "MipsTargetStreamer.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionELF.h"
+#include <cassert>
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 7f79eb4..2907b77 100644
--- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -13,16 +13,17 @@
#include "MipsTargetStreamer.h"
#include "InstPrinter/MipsInstPrinter.h"
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsELFStreamer.h"
#include "MipsMCExpr.h"
#include "MipsMCTargetDesc.h"
#include "MipsTargetObjectFile.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
@@ -49,6 +50,8 @@ void MipsTargetStreamer::emitDirectiveSetMacro() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoMacro() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMt() {}
+void MipsTargetStreamer::emitDirectiveSetNoMt() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) {
forbidModuleDirective();
@@ -117,6 +120,7 @@ void MipsTargetStreamer::emitDirectiveModuleOddSPReg() {
}
void MipsTargetStreamer::emitDirectiveModuleSoftFloat() {}
void MipsTargetStreamer::emitDirectiveModuleHardFloat() {}
+void MipsTargetStreamer::emitDirectiveModuleMT() {}
void MipsTargetStreamer::emitDirectiveSetFp(
MipsABIFlagsSection::FpABIKind Value) {
forbidModuleDirective();
@@ -391,6 +395,16 @@ void MipsTargetAsmStreamer::emitDirectiveSetNoMsa() {
MipsTargetStreamer::emitDirectiveSetNoMsa();
}
+void MipsTargetAsmStreamer::emitDirectiveSetMt() {
+ OS << "\t.set\tmt\n";
+ MipsTargetStreamer::emitDirectiveSetMt();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMt() {
+ OS << "\t.set\tnomt\n";
+ MipsTargetStreamer::emitDirectiveSetNoMt();
+}
+
void MipsTargetAsmStreamer::emitDirectiveSetAt() {
OS << "\t.set\tat\n";
MipsTargetStreamer::emitDirectiveSetAt();
@@ -655,6 +669,10 @@ void MipsTargetAsmStreamer::emitDirectiveModuleHardFloat() {
OS << "\t.module\thardfloat\n";
}
+void MipsTargetAsmStreamer::emitDirectiveModuleMT() {
+ OS << "\t.module\tmt\n";
+}
+
// This part is for ELF object output.
MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
@@ -685,6 +703,17 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
// issues as well.
unsigned EFlags = MCA.getELFHeaderEFlags();
+ // FIXME: Fix a dependency issue by instantiating the ABI object to some
+ // default based on the triple. The triple doesn't describe the target
+ // fully, but any external user of the API that uses the MCTargetStreamer
+ // would otherwise crash with an assertion failure.
+
+ ABI = MipsABIInfo(
+ STI.getTargetTriple().getArch() == Triple::ArchType::mipsel ||
+ STI.getTargetTriple().getArch() == Triple::ArchType::mips
+ ? MipsABIInfo::O32()
+ : MipsABIInfo::N64());
+
// Architecture
if (Features[Mips::FeatureMips64r6])
EFlags |= ELF::EF_MIPS_ARCH_64R6;
@@ -721,23 +750,18 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
if (Features[Mips::FeatureNaN2008])
EFlags |= ELF::EF_MIPS_NAN2008;
- // -mabicalls and -mplt are not implemented but we should act as if they were
- // given.
- EFlags |= ELF::EF_MIPS_CPIC;
-
MCA.setELFHeaderEFlags(EFlags);
}
void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
auto *Symbol = cast<MCSymbolELF>(S);
- if (!isMicroMipsEnabled())
- return;
getStreamer().getAssembler().registerSymbol(*Symbol);
uint8_t Type = Symbol->getType();
if (Type != ELF::STT_FUNC)
return;
- Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
+ if (isMicroMipsEnabled())
+ Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
}
void MipsTargetELFStreamer::finish() {
@@ -795,10 +819,13 @@ void MipsTargetELFStreamer::finish() {
} else if (Features[Mips::FeatureMips64r2] || Features[Mips::FeatureMips64])
EFlags |= ELF::EF_MIPS_32BITMODE;
- // If we've set the cpic eflag and we're n64, go ahead and set the pic
- // one as well.
- if (EFlags & ELF::EF_MIPS_CPIC && getABI().IsN64())
- EFlags |= ELF::EF_MIPS_PIC;
+ // -mplt is not implemented, but we should act as if it were
+ // given.
+ if (!Features[Mips::FeatureNoABICalls])
+ EFlags |= ELF::EF_MIPS_CPIC;
+
+ if (Pic)
+ EFlags |= ELF::EF_MIPS_PIC | ELF::EF_MIPS_CPIC;
MCA.setELFHeaderEFlags(EFlags);
@@ -904,10 +931,10 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
const MCExpr *Size = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context),
ExprRef, Context);
- int64_t AbsSize;
- if (!Size->evaluateAsAbsolute(AbsSize, MCA))
- llvm_unreachable("Function size must be evaluatable as absolute");
- Size = MCConstantExpr::create(AbsSize, Context);
+
+ // The ELFObjectWriter can determine the absolute size as it has access to
+ // the layout information of the assembly file, so a size expression rather
+ // than an absolute value is ok here.
static_cast<MCSymbolELF *>(Sym)->setSize(Size);
}
diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td
index 05aad51..38b09d1 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td
@@ -475,29 +475,11 @@ defm : MaterializeImms<i64, ZERO_64, DADDIU_MM64R6, LUi64, ORi64>;
//
//===----------------------------------------------------------------------===//
-def : MipsPat<(MipsLo tglobaladdr:$in),
- (DADDIU_MM64R6 ZERO_64, tglobaladdr:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tblockaddress:$in),
- (DADDIU_MM64R6 ZERO_64, tblockaddress:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tjumptable:$in),
- (DADDIU_MM64R6 ZERO_64, tjumptable:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tconstpool:$in),
- (DADDIU_MM64R6 ZERO_64, tconstpool:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo tglobaltlsaddr:$in),
- (DADDIU_MM64R6 ZERO_64, tglobaltlsaddr:$in)>, ISA_MICROMIPS64R6;
-def : MipsPat<(MipsLo texternalsym:$in),
- (DADDIU_MM64R6 ZERO_64, texternalsym:$in)>, ISA_MICROMIPS64R6;
-
-def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tglobaladdr:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tblockaddress:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tjumptable:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tconstpool:$lo)>, ISA_MICROMIPS64R6;
-def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (DADDIU_MM64R6 GPR64:$hi, tglobaltlsaddr:$lo)>, ISA_MICROMIPS64R6;
+defm : MipsHiLoRelocs<LUi64, DADDIU_MM64R6, ZERO_64, GPR64Opnd>, SYM_32,
+ ISA_MICROMIPS64R6;
+
+defm : MipsHighestHigherHiLoRelocs<LUi64, DADDIU_MM64R6>, SYM_64,
+ ISA_MICROMIPS64R6;
def : MipsPat<(addc GPR64:$lhs, GPR64:$rhs),
(DADDU_MM64R6 GPR64:$lhs, GPR64:$rhs)>, ISA_MICROMIPS64R6;
@@ -566,3 +548,15 @@ def : MipsInstAlias<"dnegu $rt, $rs",
def : MipsInstAlias<"dnegu $rt",
(DSUBU_MM64R6 GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rt), 1>,
ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsll $rd, $rt, $rs",
+ (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt,
+ GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsrl $rd, $rt, $rs",
+ (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rt,
+ GPR32Opnd:$rs), 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsrl $rd, $rt",
+ (DSRLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd,
+ GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6;
+def : MipsInstAlias<"dsll $rd, $rt",
+ (DSLLV_MM64R6 GPR64Opnd:$rd, GPR64Opnd:$rd,
+ GPR32Opnd:$rt), 0>, ISA_MICROMIPS64R6;
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
index c0de9e7..ee554bc 100644
--- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1136,12 +1136,6 @@ let Predicates = [InMicroMips] in {
def : MipsInstAlias<
"sgtu $rs, $rt",
(SLTu_MM GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
- def : MipsInstAlias<"slt $rs, $rt, $imm",
- (SLTi_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
- simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<"sltu $rs, $rt, $imm",
- (SLTiu_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
- simm32_relaxed:$imm), 0>;
def : MipsInstAlias<"sll $rd, $rt, $rs",
(SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<"sra $rd, $rt, $rs",
@@ -1163,18 +1157,21 @@ let Predicates = [InMicroMips] in {
def : MipsInstAlias<"rotr $rt, $imm",
(ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
- def : MipsInstAlias<"and $rs, $rt, $imm",
- (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
- def : MipsInstAlias<"and $rs, $imm",
- (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
- def : MipsInstAlias<"or $rs, $rt, $imm",
- (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
- def : MipsInstAlias<"or $rs, $imm",
- (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
- def : MipsInstAlias<"xor $rs, $rt, $imm",
- (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
- def : MipsInstAlias<"xor $rs, $imm",
- (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+
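+ // Each defm below expands to both the three-operand form of the macro
+ // (e.g. "and $rs, $rt, $imm") and the two-operand form ("and $rs, $imm"),
+ // matching the hand-written alias pairs it replaces.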
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi_MM>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu_MM>;
+
def : MipsInstAlias<"not $rt, $rs",
(NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
def : MipsInstAlias<"not $rt",
diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp b/contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
new file mode 100644
index 0000000..35948e3
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -0,0 +1,392 @@
+//===-- MicroMipsSizeReduction.cpp - MicroMips size reduction pass -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass is used to reduce the size of instructions where applicable.
+///
+/// TODO: Implement microMIPS64 support.
+/// TODO: Implement support for reducing into the lwp/swp instructions.
+//===----------------------------------------------------------------------===//
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "micromips-reduce-size"
+
+STATISTIC(NumReduced, "Number of 32-bit instructions reduced to 16-bit ones");
+
+namespace {
+
+/// Order of operands to transfer
+// TODO: Will be extended when additional optimizations are added
+enum OperandTransfer {
+ OT_NA, ///< Not applicable
+ OT_OperandsAll, ///< Transfer all operands
+};
+
+/// Reduction type
+// TODO: Will be extended when additional optimizations are added
+enum ReduceType {
+ RT_OneInstr ///< Reduce one instruction into a smaller instruction
+};
+
+// Information about immediate field restrictions
+struct ImmField {
+ ImmField() : ImmFieldOperand(-1), Shift(0), LBound(0), HBound(0) {}
+ ImmField(uint8_t Shift, int16_t LBound, int16_t HBound,
+ int8_t ImmFieldOperand)
+ : ImmFieldOperand(ImmFieldOperand), Shift(Shift), LBound(LBound),
+ HBound(HBound) {}
+ int8_t ImmFieldOperand; // Immediate operand, -1 if it does not exist
+ uint8_t Shift; // Shift value
+ int16_t LBound; // Low bound of the immediate operand
+ int16_t HBound; // High bound of the immediate operand
+};
+
+/// Information about operands
+// TODO: Will be extended when additional optimizations are added
+struct OpInfo {
+ OpInfo(enum OperandTransfer TransferOperands)
+ : TransferOperands(TransferOperands) {}
+ OpInfo() : TransferOperands(OT_NA) {}
+
+ enum OperandTransfer
+ TransferOperands; ///< Operands to transfer to the new instruction
+};
+
+// Information about opcodes
+struct OpCodes {
+ OpCodes(unsigned WideOpc, unsigned NarrowOpc)
+ : WideOpc(WideOpc), NarrowOpc(NarrowOpc) {}
+
+ unsigned WideOpc; ///< Wide opcode
+ unsigned NarrowOpc; ///< Narrow opcode
+};
+
+/// ReduceTable - A static table with information on mapping from wide
+/// opcodes to narrow
+struct ReduceEntry {
+
+ enum ReduceType eRType; ///< Reduction type
+ bool (*ReduceFunction)(
+ MachineInstr *MI,
+ const ReduceEntry &Entry); ///< Pointer to reduce function
+ struct OpCodes Ops; ///< All relevant OpCodes
+ struct OpInfo OpInf; ///< Characteristics of operands
+ struct ImmField Imm; ///< Characteristics of immediate field
+
+ ReduceEntry(enum ReduceType RType, struct OpCodes Op,
+ bool (*F)(MachineInstr *MI, const ReduceEntry &Entry),
+ struct OpInfo OpInf, struct ImmField Imm)
+ : eRType(RType), ReduceFunction(F), Ops(Op), OpInf(OpInf), Imm(Imm) {}
+
+ unsigned NarrowOpc() const { return Ops.NarrowOpc; }
+ unsigned WideOpc() const { return Ops.WideOpc; }
+ int16_t LBound() const { return Imm.LBound; }
+ int16_t HBound() const { return Imm.HBound; }
+ uint8_t Shift() const { return Imm.Shift; }
+ int8_t ImmField() const { return Imm.ImmFieldOperand; }
+ enum OperandTransfer TransferOperands() const {
+ return OpInf.TransferOperands;
+ }
+ enum ReduceType RType() const { return eRType; }
+
+ // operator used by std::equal_range
+ bool operator<(const unsigned int r) const { return (WideOpc() < r); }
+
+ // operator used by std::equal_range
+ friend bool operator<(const unsigned int r, const struct ReduceEntry &re) {
+ return (r < re.WideOpc());
+ }
+};
+
+class MicroMipsSizeReduce : public MachineFunctionPass {
+public:
+ static char ID;
+ MicroMipsSizeReduce();
+
+ static const MipsInstrInfo *MipsII;
+ const MipsSubtarget *Subtarget;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ llvm::StringRef getPassName() const override {
+ return "microMIPS instruction size reduction pass";
+ }
+
+private:
+ /// Reduces width of instructions in the specified basic block.
+ bool ReduceMBB(MachineBasicBlock &MBB);
+
+ /// Attempts to reduce MI, returns true on success.
+ bool ReduceMI(const MachineBasicBlock::instr_iterator &MII);
+
+ // Attempts to reduce LW/SW instruction into LWSP/SWSP,
+ // returns true on success.
+ static bool ReduceXWtoXWSP(MachineInstr *MI, const ReduceEntry &Entry);
+
+ // Attempts to reduce LBU/LHU instruction into LBU16/LHU16,
+ // returns true on success.
+ static bool ReduceLXUtoLXU16(MachineInstr *MI, const ReduceEntry &Entry);
+
+ // Attempts to reduce SB/SH instruction into SB16/SH16,
+ // returns true on success.
+ static bool ReduceSXtoSX16(MachineInstr *MI, const ReduceEntry &Entry);
+
+ // Attempts to reduce arithmetic instructions, returns true on success
+ static bool ReduceArithmeticInstructions(MachineInstr *MI,
+ const ReduceEntry &Entry);
+
+ // Changes opcode of an instruction
+ static bool ReplaceInstruction(MachineInstr *MI, const ReduceEntry &Entry);
+
+ // Table with transformation rules for each instruction
+ static llvm::SmallVector<ReduceEntry, 16> ReduceTable;
+};
+
+char MicroMipsSizeReduce::ID = 0;
+const MipsInstrInfo *MicroMipsSizeReduce::MipsII;
+
+// This table must be sorted by WideOpc as a main criterion and
+// ReduceType as a sub-criterion (when wide opcodes are the same)
+llvm::SmallVector<ReduceEntry, 16> MicroMipsSizeReduce::ReduceTable = {
+
+ // ReduceType, OpCodes, ReduceFunction,
+ // OpInfo(TransferOperands),
+ // ImmField(Shift, LBound, HBound, ImmFieldPosition)
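+ //
+ // For example, the (Mips::LW, Mips::LWSP_MM) entries use ImmField(2, 0, 32, 2):
+ // operand 2 must be a multiple of 4 in [0, 128) to fit the scaled 5-bit field.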
+ {RT_OneInstr, OpCodes(Mips::ADDu, Mips::ADDU16_MM),
+ ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
+ ImmField(0, 0, 0, -1)},
+ {RT_OneInstr, OpCodes(Mips::ADDu_MM, Mips::ADDU16_MM),
+ ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
+ ImmField(0, 0, 0, -1)},
+ {RT_OneInstr, OpCodes(Mips::LBu, Mips::LBU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)},
+ {RT_OneInstr, OpCodes(Mips::LBu_MM, Mips::LBU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(0, -1, 15, 2)},
+ {RT_OneInstr, OpCodes(Mips::LHu, Mips::LHU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::LHu_MM, Mips::LHU16_MM), ReduceLXUtoLXU16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::LW, Mips::LWSP_MM), ReduceXWtoXWSP,
+ OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_OneInstr, OpCodes(Mips::LW_MM, Mips::LWSP_MM), ReduceXWtoXWSP,
+ OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_OneInstr, OpCodes(Mips::SB, Mips::SB16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::SB_MM, Mips::SB16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(0, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::SH, Mips::SH16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::SH_MM, Mips::SH16_MM), ReduceSXtoSX16,
+ OpInfo(OT_OperandsAll), ImmField(1, 0, 16, 2)},
+ {RT_OneInstr, OpCodes(Mips::SUBu, Mips::SUBU16_MM),
+ ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
+ ImmField(0, 0, 0, -1)},
+ {RT_OneInstr, OpCodes(Mips::SUBu_MM, Mips::SUBU16_MM),
+ ReduceArithmeticInstructions, OpInfo(OT_OperandsAll),
+ ImmField(0, 0, 0, -1)},
+ {RT_OneInstr, OpCodes(Mips::SW, Mips::SWSP_MM), ReduceXWtoXWSP,
+ OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+ {RT_OneInstr, OpCodes(Mips::SW_MM, Mips::SWSP_MM), ReduceXWtoXWSP,
+ OpInfo(OT_OperandsAll), ImmField(2, 0, 32, 2)},
+};
+} // end anonymous namespace
+
+// Returns true if the machine operand MO is register SP
+static bool IsSP(const MachineOperand &MO) {
+ if (MO.isReg() && MO.getReg() == Mips::SP)
+ return true;
+ return false;
+}
+
+// Returns true if the machine operand MO is register $16, $17, or $2-$7.
+static bool isMMThreeBitGPRegister(const MachineOperand &MO) {
+ if (MO.isReg() && Mips::GPRMM16RegClass.contains(MO.getReg()))
+ return true;
+ return false;
+}
+
+// Returns true if the machine operand MO is register $0, $17, or $2-$7.
+static bool isMMSourceRegister(const MachineOperand &MO) {
+ if (MO.isReg() && Mips::GPRMM16ZeroRegClass.contains(MO.getReg()))
+ return true;
+ return false;
+}
+
+// Returns true if the operand Op is an immediate value
+// and writes the immediate value into variable Imm
+static bool GetImm(MachineInstr *MI, unsigned Op, int64_t &Imm) {
+
+ if (!MI->getOperand(Op).isImm())
+ return false;
+ Imm = MI->getOperand(Op).getImm();
+ return true;
+}
+
+// Returns true if Value has at least Shift trailing zero bits and the
+// shifted value lies within the half-open range [LBound, HBound)
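+// For example, InRange(20, 2, 0, 32) computes Value2 = 20 >> 2 = 5, checks
+// (5 << 2) == 20 and 0 <= 5 < 32, and therefore returns true.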
+static bool InRange(int64_t Value, unsigned short Shift, int LBound,
+ int HBound) {
+ int64_t Value2 = Value >> Shift;
+ if ((Value2 << Shift) == Value && (Value2 >= LBound) && (Value2 < HBound))
+ return true;
+ return false;
+}
+
+// Returns true if the immediate operand is in range
+static bool ImmInRange(MachineInstr *MI, const ReduceEntry &Entry) {
+
+ int64_t offset;
+
+ if (!GetImm(MI, Entry.ImmField(), offset))
+ return false;
+
+ if (!InRange(offset, Entry.Shift(), Entry.LBound(), Entry.HBound()))
+ return false;
+
+ return true;
+}
+
+MicroMipsSizeReduce::MicroMipsSizeReduce() : MachineFunctionPass(ID) {}
+
+bool MicroMipsSizeReduce::ReduceMI(
+ const MachineBasicBlock::instr_iterator &MII) {
+
+ MachineInstr *MI = &*MII;
+ unsigned Opcode = MI->getOpcode();
+
+ // Search the table.
+ llvm::SmallVector<ReduceEntry, 16>::const_iterator Start =
+ std::begin(ReduceTable);
+ llvm::SmallVector<ReduceEntry, 16>::const_iterator End =
+ std::end(ReduceTable);
+
+ std::pair<llvm::SmallVector<ReduceEntry, 16>::const_iterator,
+ llvm::SmallVector<ReduceEntry, 16>::const_iterator>
+ Range = std::equal_range(Start, End, Opcode);
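+ // ReduceTable is sorted by WideOpc, so equal_range returns the full run of
+ // entries whose wide opcode matches; each candidate is tried in order below.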
+
+ if (Range.first == Range.second)
+ return false;
+
+ for (llvm::SmallVector<ReduceEntry, 16>::const_iterator Entry = Range.first;
+ Entry != Range.second; ++Entry)
+ if (((*Entry).ReduceFunction)(&(*MII), *Entry))
+ return true;
+
+ return false;
+}
+
+bool MicroMipsSizeReduce::ReduceXWtoXWSP(MachineInstr *MI,
+ const ReduceEntry &Entry) {
+
+ if (!ImmInRange(MI, Entry))
+ return false;
+
+ if (!IsSP(MI->getOperand(1)))
+ return false;
+
+ return ReplaceInstruction(MI, Entry);
+}
+
+bool MicroMipsSizeReduce::ReduceArithmeticInstructions(
+ MachineInstr *MI, const ReduceEntry &Entry) {
+
+ if (!isMMThreeBitGPRegister(MI->getOperand(0)) ||
+ !isMMThreeBitGPRegister(MI->getOperand(1)) ||
+ !isMMThreeBitGPRegister(MI->getOperand(2)))
+ return false;
+
+ return ReplaceInstruction(MI, Entry);
+}
+
+bool MicroMipsSizeReduce::ReduceLXUtoLXU16(MachineInstr *MI,
+ const ReduceEntry &Entry) {
+
+ if (!ImmInRange(MI, Entry))
+ return false;
+
+ if (!isMMThreeBitGPRegister(MI->getOperand(0)) ||
+ !isMMThreeBitGPRegister(MI->getOperand(1)))
+ return false;
+
+ return ReplaceInstruction(MI, Entry);
+}
+
+bool MicroMipsSizeReduce::ReduceSXtoSX16(MachineInstr *MI,
+ const ReduceEntry &Entry) {
+
+ if (!ImmInRange(MI, Entry))
+ return false;
+
+ if (!isMMSourceRegister(MI->getOperand(0)) ||
+ !isMMThreeBitGPRegister(MI->getOperand(1)))
+ return false;
+
+ return ReplaceInstruction(MI, Entry);
+}
+
+bool MicroMipsSizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+ MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),
+ E = MBB.instr_end();
+ MachineBasicBlock::instr_iterator NextMII;
+
+ // Iterate through the instructions in the basic block
+ for (; MII != E; MII = NextMII) {
+ NextMII = std::next(MII);
+ MachineInstr *MI = &*MII;
+
+ // Don't reduce bundled instructions or pseudo operations
+ if (MI->isBundle() || MI->isTransient())
+ continue;
+
+ // Try to reduce 32-bit instruction into 16-bit instruction
+ Modified |= ReduceMI(MII);
+ }
+
+ return Modified;
+}
+
+bool MicroMipsSizeReduce::ReplaceInstruction(MachineInstr *MI,
+ const ReduceEntry &Entry) {
+
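+ // Every current table entry uses OT_OperandsAll, so the operands are kept
+ // as-is and only the instruction descriptor (opcode) is swapped here.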
+ MI->setDesc(MipsII->get(Entry.NarrowOpc()));
+ DEBUG(dbgs() << "Converted into 16-bit: " << *MI);
+ ++NumReduced;
+ return true;
+}
+
+bool MicroMipsSizeReduce::runOnMachineFunction(MachineFunction &MF) {
+
+ Subtarget = &static_cast<const MipsSubtarget &>(MF.getSubtarget());
+
+ // TODO: Add support for other subtargets:
+ // microMIPS32r6 and microMIPS64r6
+ if (!Subtarget->inMicroMipsMode() || !Subtarget->hasMips32r2())
+ return false;
+
+ MipsII = static_cast<const MipsInstrInfo *>(Subtarget->getInstrInfo());
+
+ bool Modified = false;
+ MachineFunction::iterator I = MF.begin(), E = MF.end();
+
+ for (; I != E; ++I)
+ Modified |= ReduceMBB(*I);
+ return Modified;
+}
+
+/// Returns an instance of the MicroMips size reduction pass.
+FunctionPass *llvm::createMicroMipsSizeReductionPass() {
+ return new MicroMipsSizeReduce();
+}
diff --git a/contrib/llvm/lib/Target/Mips/Mips.h b/contrib/llvm/lib/Target/Mips/Mips.h
index d9faf33..008b950 100644
--- a/contrib/llvm/lib/Target/Mips/Mips.h
+++ b/contrib/llvm/lib/Target/Mips/Mips.h
@@ -23,15 +23,16 @@ namespace llvm {
class ModulePass;
class FunctionPass;
- ModulePass *createMipsOs16Pass(MipsTargetMachine &TM);
- ModulePass *createMips16HardFloatPass(MipsTargetMachine &TM);
+ ModulePass *createMipsOs16Pass();
+ ModulePass *createMips16HardFloatPass();
- FunctionPass *createMipsModuleISelDagPass(MipsTargetMachine &TM);
- FunctionPass *createMipsOptimizePICCallPass(MipsTargetMachine &TM);
- FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsModuleISelDagPass();
+ FunctionPass *createMipsOptimizePICCallPass();
+ FunctionPass *createMipsDelaySlotFillerPass();
FunctionPass *createMipsHazardSchedule();
- FunctionPass *createMipsLongBranchPass(MipsTargetMachine &TM);
+ FunctionPass *createMipsLongBranchPass();
FunctionPass *createMipsConstantIslandPass();
+ FunctionPass *createMicroMipsSizeReductionPass();
} // end namespace llvm;
#endif
diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td
index 670272d..6ceb055 100644
--- a/contrib/llvm/lib/Target/Mips/Mips.td
+++ b/contrib/llvm/lib/Target/Mips/Mips.td
@@ -156,6 +156,8 @@ def FeatureMips64r6 : SubtargetFeature<"mips64r6", "MipsArchVersion",
"Mips64r6 ISA Support [experimental]",
[FeatureMips32r6, FeatureMips64r5,
FeatureNaN2008]>;
+def FeatureSym32 : SubtargetFeature<"sym32", "HasSym32", "true",
+ "Symbols are 32 bit on Mips64">;
def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true",
"Mips16 mode">;
@@ -183,6 +185,14 @@ def FeatureUseTCCInDIV : SubtargetFeature<
"UseTCCInDIV", "false",
"Force the assembler to use trapping">;
+def FeatureMadd4 : SubtargetFeature<"nomadd4", "DisableMadd4", "true",
+ "Disable 4-operand madd.fmt and related instructions">;
+
+def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">;
+
+def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true",
+ "Disable use of the jal instruction">;
+
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
index e7ceca9..09e41e1 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -1,4 +1,4 @@
-//===-- Mips16FrameLowering.cpp - Mips16 Frame Information ----------------===//
+//===- Mips16FrameLowering.cpp - Mips16 Frame Information -----------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,14 +17,23 @@
#include "MipsInstrInfo.h"
#include "MipsRegisterInfo.h"
#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
using namespace llvm;
@@ -63,7 +72,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
- if (CSI.size()) {
+ if (!CSI.empty()) {
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
@@ -80,7 +89,6 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
if (hasFP(MF))
BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
.addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup);
-
}
void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
index 191006d..3c24261 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
@@ -28,14 +29,16 @@ namespace {
public:
static char ID;
- Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID), TM(TM_) {}
+ Mips16HardFloat() : ModulePass(ID) {}
StringRef getPassName() const override { return "MIPS16 Hard Float Pass"; }
- bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ ModulePass::getAnalysisUsage(AU);
+ }
- protected:
- const MipsTargetMachine &TM;
+ bool runOnModule(Module &M) override;
};
static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) {
@@ -405,7 +408,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
"__mips16_ret_dc"
};
const char *Name = Helper[RV];
- AttributeSet A;
+ AttributeList A;
Value *Params[] = {RVal};
Modified = true;
//
@@ -414,13 +417,13 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
// during call setup, the proper call lowering to the helper
// functions will take place.
//
- A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ A = A.addAttribute(C, AttributeList::FunctionIndex,
"__Mips16RetHelper");
- A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ A = A.addAttribute(C, AttributeList::FunctionIndex,
Attribute::ReadNone);
- A = A.addAttribute(C, AttributeSet::FunctionIndex,
+ A = A.addAttribute(C, AttributeList::FunctionIndex,
Attribute::NoInline);
- Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr));
+ Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T));
CallInst::Create(F, Params, "", &I);
} else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
FunctionType *FT = CI->getFunctionType();
@@ -490,15 +493,14 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
// remove the use-soft-float attribute
//
static void removeUseSoftFloat(Function &F) {
- AttributeSet A;
+ AttrBuilder B;
DEBUG(errs() << "removing -use-soft-float\n");
- A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
- "use-soft-float", "false");
- F.removeAttributes(AttributeSet::FunctionIndex, A);
+ B.addAttribute("use-soft-float", "false");
+ F.removeAttributes(AttributeList::FunctionIndex, B);
if (F.hasFnAttribute("use-soft-float")) {
DEBUG(errs() << "still has -use-soft-float\n");
}
- F.addAttributes(AttributeSet::FunctionIndex, A);
+ F.addAttributes(AttributeList::FunctionIndex, B);
}
@@ -521,6 +523,8 @@ static void removeUseSoftFloat(Function &F) {
// during call lowering but it should be moved here in the future.
//
bool Mips16HardFloat::runOnModule(Module &M) {
+ auto &TM = static_cast<const MipsTargetMachine &>(
+ getAnalysis<TargetPassConfig>().getTM<TargetMachine>());
DEBUG(errs() << "Run on Module Mips16HardFloat\n");
bool Modified = false;
for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
@@ -542,6 +546,6 @@ bool Mips16HardFloat::runOnModule(Module &M) {
}
-ModulePass *llvm::createMips16HardFloatPass(MipsTargetMachine &TM) {
- return new Mips16HardFloat(TM);
+ModulePass *llvm::createMips16HardFloatPass() {
+ return new Mips16HardFloat();
}
diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
index 021fb86..52bf690 100644
--- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -766,6 +766,7 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> {
let hasDelaySlot = 1;
let isTerminator=1;
let isBarrier=1;
+ let isReturn=1;
}
def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
@@ -773,6 +774,7 @@ def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
let isIndirectBranch = 1;
let isTerminator=1;
let isBarrier=1;
+ let isReturn=1;
}
def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 1b4d73b..7daea16 100644
--- a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -326,9 +326,9 @@ class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd, II_AUIPC>;
class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
InstrItinClass itin = NoItinerary>
: MipsR6Arch<instr_asm> {
- dag OutOperandList = (outs GPROpnd:$rs);
- dag InOperandList = (ins GPROpnd:$rt, uimm16:$imm);
- string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+ dag OutOperandList = (outs GPROpnd:$rt);
+ dag InOperandList = (ins GPROpnd:$rs, uimm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm");
list<dag> Pattern = [];
InstrItinClass Itinerary = itin;
}
@@ -917,6 +917,12 @@ def : MipsInstAlias<"jrc $rs", (JIC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"jalrc $rs", (JIALC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
}
+
+def : MipsInstAlias<"div $rs, $rt", (DIV GPR32Opnd:$rs, GPR32Opnd:$rs,
+ GPR32Opnd:$rt)>, ISA_MIPS32R6;
+def : MipsInstAlias<"divu $rs, $rt", (DIVU GPR32Opnd:$rs, GPR32Opnd:$rs,
+ GPR32Opnd:$rt)>, ISA_MIPS32R6;
+
//===----------------------------------------------------------------------===//
//
// Patterns and Pseudo Instructions
diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
index 521e22f..3dba7ce 100644
--- a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -326,6 +326,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
EXT_FM<5>, ISA_MIPS64R2;
}
+let isCodeGenOnly = 1, AdditionalPredicates = [NotInMicroMips] in {
+ def DEXT64_32 : InstSE<(outs GPR64Opnd:$rt),
+ (ins GPR32Opnd:$rs, uimm5_report_uimm6:$pos,
+ uimm5_plus1:$size),
+ "dext $rt, $rs, $pos, $size", [], II_EXT, FrmR, "dext">,
+ EXT_FM<3>, ISA_MIPS64R2;
+}
+
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
"dsll\t$rd, $rt, 32", [], II_DSLL>;
@@ -356,11 +364,11 @@ class Count1s<string opstr, RegisterOperand RO>:
let TwoOperandAliasConstraint = "$rd = $rs";
}
-class ExtsCins<string opstr, InstrItinClass itin,
- SDPatternOperator Op = null_frag>:
- InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
- !strconcat(opstr, " $rt, $rs, $pos, $lenm1"),
- [(set GPR64Opnd:$rt, (Op GPR64Opnd:$rs, imm:$pos, imm:$lenm1))],
+class ExtsCins<string opstr, InstrItinClass itin, RegisterOperand RO,
+ PatFrag PosImm, SDPatternOperator Op = null_frag>:
+ InstSE<(outs RO:$rt), (ins RO:$rs, uimm5:$pos, uimm5:$lenm1),
+ !strconcat(opstr, "\t$rt, $rs, $pos, $lenm1"),
+ [(set RO:$rt, (Op RO:$rs, PosImm:$pos, imm:$lenm1))],
itin, FrmR, opstr> {
let TwoOperandAliasConstraint = "$rt = $rs";
}
@@ -424,13 +432,28 @@ def DMUL : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
let Defs = [HI0, LO0, P0, P1, P2];
}
-// Extract a signed bit field /+32
-def EXTS : ExtsCins<"exts", II_EXT>, EXTS_FM<0x3a>, ASE_CNMIPS;
-def EXTS32: ExtsCins<"exts32", II_EXT>, EXTS_FM<0x3b>, ASE_CNMIPS;
-
-// Clear and insert a bit field /+32
-def CINS : ExtsCins<"cins", II_INS>, EXTS_FM<0x32>, ASE_CNMIPS;
-def CINS32: ExtsCins<"cins32", II_INS>, EXTS_FM<0x33>, ASE_CNMIPS;
+let AdditionalPredicates = [NotInMicroMips] in {
+ // Extract a signed bit field /+32
+ def EXTS : ExtsCins<"exts", II_EXT, GPR64Opnd, immZExt5>, EXTS_FM<0x3a>,
+ ASE_MIPS64_CNMIPS;
+ def EXTS32: ExtsCins<"exts32", II_EXT, GPR64Opnd, immZExt5Plus32>,
+ EXTS_FM<0x3b>, ASE_MIPS64_CNMIPS;
+
+ // Clear and insert a bit field /+32
+ def CINS : ExtsCins<"cins", II_INS, GPR64Opnd, immZExt5, MipsCIns>,
+ EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+ def CINS32: ExtsCins<"cins32", II_INS, GPR64Opnd, immZExt5Plus32, MipsCIns>,
+ EXTS_FM<0x33>, ASE_MIPS64_CNMIPS;
+ let isCodeGenOnly = 1 in {
+ def CINS_i32 : ExtsCins<"cins", II_INS, GPR32Opnd, immZExt5, MipsCIns>,
+ EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+ def CINS64_32 :InstSE<(outs GPR64Opnd:$rt),
+ (ins GPR32Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
+ "cins\t$rt, $rs, $pos, $lenm1", [], II_INS, FrmR,
+ "cins">,
+ EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+ }
+}
// Move to multiplier/product register
def MTM0 : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>,
@@ -513,41 +536,87 @@ def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
// hi/lo relocs
-def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
-def : MipsPat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>;
-def : MipsPat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>;
-def : MipsPat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>;
-def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi64 tglobaltlsaddr:$in)>;
-def : MipsPat<(MipsHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+let AdditionalPredicates = [NotInMicroMips] in
+defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, SYM_32;
+
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi64 texternalsym:$in)>;
+
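+// Roughly speaking, these patterns select the pieces of the usual 64-bit
+// symbolic address materialization sequence (a sketch, assuming the classic
+// GAS-style expansion through $1/$at):
+//   lui    $1, %highest(sym)
+//   daddiu $1, $1, %higher(sym)
+//   dsll   $1, $1, 16
+//   daddiu $1, $1, %hi(sym)
+//   dsll   $1, $1, 16
+//   daddiu $1, $1, %lo(sym)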
+multiclass MipsHighestHigherHiLoRelocs<Instruction Lui, Instruction Daddiu> {
+ def : MipsPat<(MipsJmpLink (i64 texternalsym:$dst)),
+ (JAL texternalsym:$dst)>;
+ def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)),
+ (Lui tglobaladdr:$in)>;
+ def : MipsPat<(MipsHighest (i64 tblockaddress:$in)),
+ (Lui tblockaddress:$in)>;
+ def : MipsPat<(MipsHighest (i64 tjumptable:$in)),
+ (Lui tjumptable:$in)>;
+ def : MipsPat<(MipsHighest (i64 tconstpool:$in)),
+ (Lui tconstpool:$in)>;
+ def : MipsPat<(MipsHighest (i64 tglobaltlsaddr:$in)),
+ (Lui tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsHighest (i64 texternalsym:$in)),
+ (Lui texternalsym:$in)>;
+
+ def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)),
+ (Daddiu ZERO_64, tglobaladdr:$in)>;
+ def : MipsPat<(MipsHigher (i64 tblockaddress:$in)),
+ (Daddiu ZERO_64, tblockaddress:$in)>;
+ def : MipsPat<(MipsHigher (i64 tjumptable:$in)),
+ (Daddiu ZERO_64, tjumptable:$in)>;
+ def : MipsPat<(MipsHigher (i64 tconstpool:$in)),
+ (Daddiu ZERO_64, tconstpool:$in)>;
+ def : MipsPat<(MipsHigher (i64 tglobaltlsaddr:$in)),
+ (Daddiu ZERO_64, tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsHigher (i64 texternalsym:$in)),
+ (Daddiu ZERO_64, texternalsym:$in)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tblockaddress:$lo))),
+ (Daddiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tjumptable:$lo))),
+ (Daddiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tconstpool:$lo))),
+ (Daddiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaltlsaddr:$lo))),
+ (Daddiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tblockaddress:$lo))),
+ (Daddiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tjumptable:$lo))),
+ (Daddiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tconstpool:$lo))),
+ (Daddiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaltlsaddr:$lo))),
+ (Daddiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tblockaddress:$lo))),
+ (Daddiu GPR64:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tjumptable:$lo))),
+ (Daddiu GPR64:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tconstpool:$lo))),
+ (Daddiu GPR64:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaltlsaddr:$lo))),
+ (Daddiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+}
+
+// highest/higher/hi/lo relocs
+let AdditionalPredicates = [NotInMicroMips] in
+defm : MipsHighestHigherHiLoRelocs<LUi64, DADDiu>, SYM_64;
+
+def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
+def : WrapperPat<tconstpool, DADDiu, GPR64>;
+def : WrapperPat<texternalsym, DADDiu, GPR64>;
+def : WrapperPat<tblockaddress, DADDiu, GPR64>;
+def : WrapperPat<tjumptable, DADDiu, GPR64>;
+def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
-let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>;
- def : MipsPat<(MipsLo tblockaddress:$in),
- (DADDiu ZERO_64, tblockaddress:$in)>;
- def : MipsPat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>;
- def : MipsPat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>;
- def : MipsPat<(MipsLo tglobaltlsaddr:$in),
- (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
- def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
-
- def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
- (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
- (DADDiu GPR64:$hi, tblockaddress:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
- (DADDiu GPR64:$hi, tjumptable:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
- (DADDiu GPR64:$hi, tconstpool:$lo)>;
- def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
-
- def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
- def : WrapperPat<tconstpool, DADDiu, GPR64>;
- def : WrapperPat<texternalsym, DADDiu, GPR64>;
- def : WrapperPat<tblockaddress, DADDiu, GPR64>;
- def : WrapperPat<tjumptable, DADDiu, GPR64>;
- def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
-}
defm : BrcondPats<GPR64, BEQ64, BEQ, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
ZERO_64>;
@@ -600,6 +669,14 @@ def : MipsPat<(i64 (anyext GPR32:$src)),
def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsPat<(i64 (zext GPR32:$src)), (DEXT64_32 GPR32:$src, 0, 32)>,
+ ISA_MIPS64R2;
+ def : MipsPat<(i64 (zext (i32 (shl GPR32:$rt, immZExt5:$imm)))),
+ (CINS64_32 GPR32:$rt, imm:$imm, (immZExt5To31 imm:$imm))>,
+ ASE_MIPS64_CNMIPS;
+}
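+// With the patterns above (on MIPS64r2 and cnMIPS respectively), a plain
+// (i64 (zext GPR32:$src)) now selects "dext $rt, $rs, 0, 32" instead of the
+// DSLL64_32/DSRL pair, and a zero-extended shift folds into a single cins.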
+
// Sign extend in register
def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
(SLL64_64 GPR64:$src)>;
@@ -661,10 +738,16 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"daddu $rs, $imm",
(DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
0>, ISA_MIPS3;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi64, GPR64Opnd, imm64>,
+ GPR_64;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi64, GPR64Opnd, imm64>,
+ GPR_64;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>,
+ GPR_64;
}
-def : MipsInstAlias<"dsll $rd, $rt, $rs",
- (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
- ISA_MIPS3;
let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"dneg $rt, $rs",
(DSUB GPR64Opnd:$rt, ZERO_64, GPR64Opnd:$rs), 1>,
@@ -707,9 +790,18 @@ def : MipsInstAlias<"dsra $rd, $rt, $rs",
(DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
ISA_MIPS3;
let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dsll $rd, $rt, $rs",
+ (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+ ISA_MIPS3;
def : MipsInstAlias<"dsrl $rd, $rt, $rs",
(DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
ISA_MIPS3;
+ def : MipsInstAlias<"dsrl $rd, $rt",
+ (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>,
+ ISA_MIPS3;
+ def : MipsInstAlias<"dsll $rd, $rt",
+ (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rd, GPR32Opnd:$rt), 0>,
+ ISA_MIPS3;
// Two operand (implicit 0 selector) versions:
def : MipsInstAlias<"dmtc0 $rt, $rd",
@@ -741,21 +833,21 @@ def : MipsInstAlias<"bbit1 $rs, $p, $offset",
def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1",
(EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
def : MipsInstAlias<"exts $rt, $pos, $lenm1",
(EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
 // cins with $pos 32-63 is converted to cins32 with $pos 0-31
def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1",
(CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
def : MipsInstAlias<"cins $rt, $pos, $lenm1",
(CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
- ASE_CNMIPS;
+ ASE_MIPS64_CNMIPS;
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
@@ -770,3 +862,81 @@ def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr),
"dla\t$rt, $addr">;
def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64),
"dla\t$rt, $imm64">;
+
+def DMULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ simm32_relaxed:$imm),
+ "dmul\t$rs, $rt, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+def DMULOMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ GPR64Opnd:$rd),
+ "dmulo\t$rs, $rt, $rd">,
+ ISA_MIPS3_NOT_32R6_64R6;
+def DMULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ GPR64Opnd:$rd),
+ "dmulou\t$rs, $rt, $rd">,
+ ISA_MIPS3_NOT_32R6_64R6;
+
+def DMULMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+ GPR64Opnd:$rd),
+ "dmul\t$rs, $rt, $rd"> {
+ let InsnPredicates = [HasMips3, NotMips64r6, NotCnMips];
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def DSDivMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "ddiv\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DSDivIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "ddiv\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DUDivMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+ "ddivu\t$rd, $rs, $rt">,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def DUDivIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "ddivu\t$rd, $rs, $imm">,
+ ISA_MIPS3_NOT_32R6_64R6;
+
+ // GAS expands 'div' and 'ddiv' differently when the destination
+ // register is $zero and the instruction is in the two operand
+ // form. 'ddiv' gets expanded, while 'div' is not expanded.
+
+ def : MipsInstAlias<"ddiv $rs, $rt", (DSDivMacro GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ GPR64Opnd:$rt), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"ddiv $rd, $imm", (DSDivIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ imm64:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+
+ // GAS expands 'divu' and 'ddivu' differently when the destination
+ // register is $zero and the instruction is in the two operand
+ // form. 'ddivu' gets expanded, while 'divu' is not expanded.
+
+ def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR64Opnd:$rt,
+ GPR64Opnd:$rt,
+ GPR64Opnd:$rs), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+ def : MipsInstAlias<"ddivu $rd, $imm", (DUDivIMacro GPR64Opnd:$rd,
+ GPR64Opnd:$rd,
+ imm64:$imm), 0>,
+ ISA_MIPS3_NOT_32R6_64R6;
+}
+
+def NORImm64 : NORIMM_DESC_BASE<GPR64Opnd, imm64>, GPR_64;
+def : MipsInstAlias<"nor\t$rs, $imm", (NORImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+ imm64:$imm)>, GPR_64;
+def SLTImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs),
+ (ins GPR64Opnd:$rt, imm64:$imm),
+ "slt\t$rs, $rt, $imm">, GPR_64;
+def : MipsInstAlias<"slt\t$rs, $imm", (SLTImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+ imm64:$imm)>, GPR_64;
+def SLTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs),
+ (ins GPR64Opnd:$rt, imm64:$imm),
+ "sltu\t$rs, $rt, $imm">, GPR_64;
+def : MipsInstAlias<"sltu\t$rs, $imm", (SLTUImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+ imm64:$imm)>, GPR_64;
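
The two-operand immediate aliases above (nor/slt/sltu) reuse the destination register as the first source. A minimal host-side sketch of that normalization, with hypothetical mnemonic handling (this is not the actual MipsAsmParser code):

    #include <string>
    #include <vector>

    struct ParsedInst {
      std::string Mnemonic;
      std::vector<std::string> Operands;
    };

    // Rewrite "slt $rs, $imm" style input into the canonical three-operand
    // form "slt $rs, $rs, $imm" by repeating the destination as first source.
    void normalizeTwoOperandForm(ParsedInst &I) {
      if ((I.Mnemonic == "slt" || I.Mnemonic == "sltu" || I.Mnemonic == "nor") &&
          I.Operands.size() == 2) {
        std::string Dst = I.Operands[0]; // copy before insert to avoid aliasing
        I.Operands.insert(I.Operands.begin() + 1, Dst);
      }
    }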
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 04d6529..f7ff7c3 100644
--- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -12,17 +12,18 @@
//
//===----------------------------------------------------------------------===//
+#include "MipsAsmPrinter.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsMCNaCl.h"
#include "Mips.h"
-#include "MipsAsmPrinter.h"
#include "MipsInstrInfo.h"
#include "MipsMCInstLower.h"
#include "MipsTargetMachine.h"
#include "MipsTargetStreamer.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -39,10 +40,10 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -79,6 +80,9 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
NaClAlignIndirectJumpTargets(MF);
AsmPrinter::runOnMachineFunction(MF);
+
+ emitXRayTable();
+
return true;
}
@@ -132,6 +136,7 @@ void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MipsTargetStreamer &TS = getTargetStreamer();
+ unsigned Opc = MI->getOpcode();
TS.forbidModuleDirective();
if (MI->isDebugValue()) {
@@ -143,20 +148,20 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
// If we just ended a constant pool, mark it as such.
- if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+ if (InConstantPool && Opc != Mips::CONSTPOOL_ENTRY) {
OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
InConstantPool = false;
}
- if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+ if (Opc == Mips::CONSTPOOL_ENTRY) {
// CONSTPOOL_ENTRY - This instruction represents a floating
- //constant pool in the function. The first operand is the ID#
+ // constant pool in the function. The first operand is the ID#
// for this instruction, the second is the index into the
// MachineConstantPool that this is, the third is the size in
// bytes of this constant pool entry.
// The required alignment is specified on the basic block holding this MI.
//
unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
- unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
+ unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
// If this is the first entry of the pool, mark it.
if (!InConstantPool) {
@@ -174,6 +179,17 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
+ switch (Opc) {
+ case Mips::PATCHABLE_FUNCTION_ENTER:
+ LowerPATCHABLE_FUNCTION_ENTER(*MI);
+ return;
+ case Mips::PATCHABLE_FUNCTION_EXIT:
+ LowerPATCHABLE_FUNCTION_EXIT(*MI);
+ return;
+ case Mips::PATCHABLE_TAIL_CALL:
+ LowerPATCHABLE_TAIL_CALL(*MI);
+ return;
+ }
MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -257,9 +273,9 @@ void MipsAsmPrinter::printSavedRegsBitmask() {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
// size of stack area to which FP callee-saved regs are saved.
- unsigned CPURegSize = Mips::GPR32RegClass.getSize();
- unsigned FGR32RegSize = Mips::FGR32RegClass.getSize();
- unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize();
+ unsigned CPURegSize = TRI->getRegSizeInBits(Mips::GPR32RegClass) / 8;
+ unsigned FGR32RegSize = TRI->getRegSizeInBits(Mips::FGR32RegClass) / 8;
+ unsigned AFGR64RegSize = TRI->getRegSizeInBits(Mips::AFGR64RegClass) / 8;
bool HasAFGR64Reg = false;
unsigned CSFPRegsSize = 0;
@@ -574,6 +590,8 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
case MipsII::MO_GOT: O << "%got("; break;
case MipsII::MO_ABS_HI: O << "%hi("; break;
case MipsII::MO_ABS_LO: O << "%lo("; break;
+ case MipsII::MO_HIGHER: O << "%higher("; break;
+ case MipsII::MO_HIGHEST: O << "%highest("; break;
case MipsII::MO_TLSGD: O << "%tlsgd("; break;
case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
@@ -698,7 +716,7 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
// Ideally it should test for properties of the ABI and not the ABI
// itself.
// For the moment, I'm only correcting enough to make MIPS-IV work.
- if (!isPositionIndependent() && !ABI.IsN64())
+ if (!isPositionIndependent() && STI.hasSym32())
TS.emitDirectiveOptionPic0();
}
@@ -1032,6 +1050,116 @@ void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
OutStreamer->SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
}
+void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
+ const uint8_t NoopsInSledCount = Subtarget->isGP64bit() ? 15 : 11;
+ // For mips32 we want to emit the following pattern:
+ //
+ // .Lxray_sled_N:
+ // ALIGN
+ // B .tmpN
+ // 11 NOP instructions (44 bytes)
+ // ADDIU T9, T9, 52
+ // .tmpN
+ //
+ // We need the 44 bytes (11 instructions) because at runtime, we'd
+ // be patching over the full 48 bytes (12 instructions) with the following
+ // pattern:
+ //
+ // ADDIU SP, SP, -8
+ // NOP
+ // SW RA, 4(SP)
+ // SW T9, 0(SP)
+ // LUI T9, %hi(__xray_FunctionEntry/Exit)
+ // ORI T9, T9, %lo(__xray_FunctionEntry/Exit)
+ // LUI T0, %hi(function_id)
+ // JALR T9
+ // ORI T0, T0, %lo(function_id)
+ // LW T9, 0(SP)
+ // LW RA, 4(SP)
+ // ADDIU SP, SP, 8
+ //
+ // We add 52 bytes to t9 because we want to adjust the function pointer to
+ // the actual start of the function, i.e. the address just after the nop
+ // sled. We do this because the gp displacement relocation is emitted at
+ // the start of the function, i.e. after the nop sled, and to correctly
+ // calculate the global offset table address, t9 must hold the address of
+ // the instruction containing the gp displacement relocation.
+ // FIXME: Is this correct for the static relocation model?
+ //
+ // For mips64 we want to emit the following pattern:
+ //
+ // .Lxray_sled_N:
+ // ALIGN
+ // B .tmpN
+ // 15 NOP instructions (60 bytes)
+ // .tmpN
+ //
+ // We need the 60 bytes (15 instructions) because at runtime, we'd
+ // be patching over the full 64 bytes (16 instructions) with the following
+ // pattern:
+ //
+ // DADDIU SP, SP, -16
+ // NOP
+ // SD RA, 8(SP)
+ // SD T9, 0(SP)
+ // LUI T9, %highest(__xray_FunctionEntry/Exit)
+ // ORI T9, T9, %higher(__xray_FunctionEntry/Exit)
+ // DSLL T9, T9, 16
+ // ORI T9, T9, %hi(__xray_FunctionEntry/Exit)
+ // DSLL T9, T9, 16
+ // ORI T9, T9, %lo(__xray_FunctionEntry/Exit)
+ // LUI T0, %hi(function_id)
+ // JALR T9
+ // ADDIU T0, T0, %lo(function_id)
+ // LD T9, 0(SP)
+ // LD RA, 8(SP)
+ // DADDIU SP, SP, 16
+ //
+ OutStreamer->EmitCodeAlignment(4);
+ auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+ OutStreamer->EmitLabel(CurSled);
+ auto Target = OutContext.createTempSymbol();
+
+ // Emit "B .tmpN" instruction, which jumps over the nop sled to the actual
+ // start of function
+ const MCExpr *TargetExpr = MCSymbolRefExpr::create(
+ Target, MCSymbolRefExpr::VariantKind::VK_None, OutContext);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Mips::BEQ)
+ .addReg(Mips::ZERO)
+ .addReg(Mips::ZERO)
+ .addExpr(TargetExpr));
+
+ for (int8_t I = 0; I < NoopsInSledCount; I++)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(Mips::SLL)
+ .addReg(Mips::ZERO)
+ .addReg(Mips::ZERO)
+ .addImm(0));
+
+ OutStreamer->EmitLabel(Target);
+
+ if (!Subtarget->isGP64bit()) {
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(Mips::ADDiu)
+ .addReg(Mips::T9)
+ .addReg(Mips::T9)
+ .addImm(0x34));
+ }
+
+ recordSled(CurSled, MI, Kind);
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) {
+ EmitSled(MI, SledKind::FUNCTION_ENTER);
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) {
+ EmitSled(MI, SledKind::FUNCTION_EXIT);
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) {
+ EmitSled(MI, SledKind::TAIL_CALL);
+}
+
void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
raw_ostream &OS) {
// TODO: implement
@@ -1039,7 +1167,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
// Emit .dtprelword or .dtpreldword directive
// and value for debug thread local expression.
-void MipsAsmPrinter::EmitDebugValue(const MCExpr *Value,
+void MipsAsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
unsigned Size) const {
switch (Size) {
case 4:
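
The EmitSled comments above encode several byte counts (44/48/52 for mips32, 60/64 for mips64). A quick standalone check of that arithmetic, assuming 4-byte instructions throughout:

    #include <cstdio>

    int main() {
      const unsigned InsnBytes = 4;                 // every MIPS insn is 4 bytes
      for (bool GP64 : {false, true}) {
        unsigned Noops = GP64 ? 15u : 11u;          // NoopsInSledCount
        unsigned Patched = (1 + Noops) * InsnBytes; // branch + nops overwritten
        unsigned T9Adjust = GP64 ? 0 : (2 + Noops) * InsnBytes; // + trailing ADDIU
        std::printf("mips%s: patch %u bytes, t9 += %u\n",
                    GP64 ? "64" : "32", Patched, T9Adjust);
      }
      return 0;
    }
    // prints: mips32: patch 48 bytes, t9 += 52
    //         mips64: patch 64 bytes, t9 += 0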
diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
index c5cf524..4699e1b 100644
--- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
+++ b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -35,7 +35,21 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+ //===------------------------------------------------------------------===//
+ // XRay implementation
+ //===------------------------------------------------------------------===//
+public:
+ // XRay-specific lowering for Mips.
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+ void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+ void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+ // Helper function that emits the XRay sleds we've collected for a particular
+ // function.
+ void EmitXRayTable();
+
private:
+ void EmitSled(const MachineInstr &MI, SledKind Kind);
+
// tblgen'erated function.
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
@@ -140,7 +154,7 @@ public:
void EmitStartOfAsmFile(Module &M) override;
void EmitEndOfAsmFile(Module &M) override;
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
- void EmitDebugValue(const MCExpr *Value, unsigned Size) const override;
+ void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const override;
};
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp
index 7af988c..6a03ee9 100644
--- a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -38,7 +38,7 @@ static bool isF128SoftLibCall(const char *CallSym) {
/// This function returns true if Ty is fp128, {f128} or i128 which was
/// originally a fp128.
-static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) {
+static bool originalTypeIsF128(const Type *Ty, const char *Func) {
if (Ty->isFP128Ty())
return true;
@@ -46,12 +46,25 @@ static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) {
Ty->getStructElementType(0)->isFP128Ty())
return true;
- const ExternalSymbolSDNode *ES =
- dyn_cast_or_null<const ExternalSymbolSDNode>(CallNode);
-
// If the Ty is i128 and the function being called is a long double emulation
// routine, then the original type is f128.
- return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
+ return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func));
+}
+
+/// Return true if the original type was vXfXX.
+static bool originalEVTTypeIsVectorFloat(EVT Ty) {
+ if (Ty.isVector() && Ty.getVectorElementType().isFloatingPoint())
+ return true;
+
+ return false;
+}
+
+/// Return true if the original type was vXfXX.
+static bool originalTypeIsVectorFloat(const Type *Ty) {
+ if (Ty->isVectorTy() && Ty->isFPOrFPVectorTy())
+ return true;
+
+ return false;
}
MipsCCState::SpecialCallingConvType
@@ -73,16 +86,16 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee,
void MipsCCState::PreAnalyzeCallResultForF128(
const SmallVectorImpl<ISD::InputArg> &Ins,
- const TargetLowering::CallLoweringInfo &CLI) {
+ const Type *RetTy, const char *Call) {
for (unsigned i = 0; i < Ins.size(); ++i) {
OriginalArgWasF128.push_back(
- originalTypeIsF128(CLI.RetTy, CLI.Callee.getNode()));
- OriginalArgWasFloat.push_back(CLI.RetTy->isFloatingPointTy());
+ originalTypeIsF128(RetTy, Call));
+ OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy());
}
}
-/// Identify lowered values that originated from f128 arguments and record
-/// this for use by RetCC_MipsN.
+/// Identify lowered values that originated from f128 or float arguments and
+/// record this for use by RetCC_MipsN.
void MipsCCState::PreAnalyzeReturnForF128(
const SmallVectorImpl<ISD::OutputArg> &Outs) {
const MachineFunction &MF = getMachineFunction();
@@ -94,23 +107,44 @@ void MipsCCState::PreAnalyzeReturnForF128(
}
}
-/// Identify lowered values that originated from f128 arguments and record
+/// Identify lowered values that originated from vXfXX and record
+/// this.
+void MipsCCState::PreAnalyzeCallResultForVectorFloat(
+ const SmallVectorImpl<ISD::InputArg> &Ins, const Type *RetTy) {
+ for (unsigned i = 0; i < Ins.size(); ++i) {
+ OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy));
+ }
+}
+
+/// Identify lowered values that originated from vXfXX arguments and record
/// this.
+void MipsCCState::PreAnalyzeReturnForVectorFloat(
+ const SmallVectorImpl<ISD::OutputArg> &Outs) {
+ for (unsigned i = 0; i < Outs.size(); ++i) {
+ ISD::OutputArg Out = Outs[i];
+ OriginalRetWasFloatVector.push_back(
+ originalEVTTypeIsVectorFloat(Out.ArgVT));
+ }
+}
+
+/// Identify lowered values that originated from f128, float and sret to vXfXX
+/// arguments and record this.
void MipsCCState::PreAnalyzeCallOperands(
const SmallVectorImpl<ISD::OutputArg> &Outs,
std::vector<TargetLowering::ArgListEntry> &FuncArgs,
- const SDNode *CallNode) {
+ const char *Func) {
for (unsigned i = 0; i < Outs.size(); ++i) {
- OriginalArgWasF128.push_back(
- originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, CallNode));
- OriginalArgWasFloat.push_back(
- FuncArgs[Outs[i].OrigArgIndex].Ty->isFloatingPointTy());
+ TargetLowering::ArgListEntry FuncArg = FuncArgs[Outs[i].OrigArgIndex];
+
+ OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func));
+ OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy());
+ OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy());
CallOperandIsFixed.push_back(Outs[i].IsFixed);
}
}
-/// Identify lowered values that originated from f128 arguments and record
-/// this.
+/// Identify lowered values that originated from f128, float and vXfXX arguments
+/// and record this.
void MipsCCState::PreAnalyzeFormalArgumentsForF128(
const SmallVectorImpl<ISD::InputArg> &Ins) {
const MachineFunction &MF = getMachineFunction();
@@ -123,6 +157,7 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128(
if (Ins[i].Flags.isSRet()) {
OriginalArgWasF128.push_back(false);
OriginalArgWasFloat.push_back(false);
+ OriginalArgWasFloatVector.push_back(false);
continue;
}
@@ -132,5 +167,10 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128(
OriginalArgWasF128.push_back(
originalTypeIsF128(FuncArg->getType(), nullptr));
OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy());
+
+ // The MIPS vector ABI has a quirk: if the first argument is actually an
+ // SRet pointer to a vector, then the next argument slot is $a2.
+ OriginalArgWasFloatVector.push_back(FuncArg->getType()->isVectorTy());
}
}
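
The refactor above replaces SDNode-based callee identification with a plain symbol name. A self-contained restatement of the originalTypeIsF128 rule; the libcall table here is a hypothetical subset for illustration only:

    #include <cstring>

    // Hypothetical subset of the f128 soft-float libcall table.
    static bool isF128SoftLibCall(const char *CallSym) {
      static const char *const LibCalls[] = {"__addtf3", "__divtf3", "__multf3"};
      for (const char *S : LibCalls)
        if (std::strcmp(CallSym, S) == 0)
          return true;
      return false;
    }

    struct TypeInfo { bool IsFP128; bool IsI128; };

    // i128 only counts as an original f128 when the callee is a known
    // long-double emulation routine, now identified by name alone.
    static bool originalTypeIsF128(const TypeInfo &Ty, const char *Func) {
      if (Ty.IsFP128)
        return true;
      return Func && Ty.IsI128 && isF128SoftLibCall(Func);
    }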
diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.h b/contrib/llvm/lib/Target/Mips/MipsCCState.h
index 081c393..2790169 100644
--- a/contrib/llvm/lib/Target/Mips/MipsCCState.h
+++ b/contrib/llvm/lib/Target/Mips/MipsCCState.h
@@ -31,7 +31,7 @@ private:
/// Identify lowered values that originated from f128 arguments and record
/// this for use by RetCC_MipsN.
void PreAnalyzeCallResultForF128(const SmallVectorImpl<ISD::InputArg> &Ins,
- const TargetLowering::CallLoweringInfo &CLI);
+ const Type *RetTy, const char *Func);
/// Identify lowered values that originated from f128 arguments and record
/// this for use by RetCC_MipsN.
@@ -42,19 +42,36 @@ private:
void
PreAnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
std::vector<TargetLowering::ArgListEntry> &FuncArgs,
- const SDNode *CallNode);
+ const char *Func);
/// Identify lowered values that originated from f128 arguments and record
- /// this.
+ /// this for use by RetCC_MipsN.
void
PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins);
+ void
+ PreAnalyzeCallResultForVectorFloat(const SmallVectorImpl<ISD::InputArg> &Ins,
+ const Type *RetTy);
+
+ void PreAnalyzeFormalArgumentsForVectorFloat(
+ const SmallVectorImpl<ISD::InputArg> &Ins);
+
+ void
+ PreAnalyzeReturnForVectorFloat(const SmallVectorImpl<ISD::OutputArg> &Outs);
+
/// Records whether the value has been lowered from an f128.
SmallVector<bool, 4> OriginalArgWasF128;
/// Records whether the value has been lowered from float.
SmallVector<bool, 4> OriginalArgWasFloat;
+ /// Records whether the value has been lowered from a floating point vector.
+ SmallVector<bool, 4> OriginalArgWasFloatVector;
+
+ /// Records whether the return value has been lowered from a floating point
+ /// vector.
+ SmallVector<bool, 4> OriginalRetWasFloatVector;
+
/// Records whether the value was a fixed argument.
/// See ISD::OutputArg::IsFixed,
SmallVector<bool, 4> CallOperandIsFixed;
@@ -73,11 +90,12 @@ public:
AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
CCAssignFn Fn,
std::vector<TargetLowering::ArgListEntry> &FuncArgs,
- const SDNode *CallNode) {
- PreAnalyzeCallOperands(Outs, FuncArgs, CallNode);
+ const char *Func) {
+ PreAnalyzeCallOperands(Outs, FuncArgs, Func);
CCState::AnalyzeCallOperands(Outs, Fn);
OriginalArgWasF128.clear();
OriginalArgWasFloat.clear();
+ OriginalArgWasFloatVector.clear();
CallOperandIsFixed.clear();
}
@@ -96,31 +114,38 @@ public:
CCState::AnalyzeFormalArguments(Ins, Fn);
OriginalArgWasFloat.clear();
OriginalArgWasF128.clear();
+ OriginalArgWasFloatVector.clear();
}
void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
- CCAssignFn Fn,
- const TargetLowering::CallLoweringInfo &CLI) {
- PreAnalyzeCallResultForF128(Ins, CLI);
+ CCAssignFn Fn, const Type *RetTy,
+ const char *Func) {
+ PreAnalyzeCallResultForF128(Ins, RetTy, Func);
+ PreAnalyzeCallResultForVectorFloat(Ins, RetTy);
CCState::AnalyzeCallResult(Ins, Fn);
OriginalArgWasFloat.clear();
OriginalArgWasF128.clear();
+ OriginalArgWasFloatVector.clear();
}
void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
CCAssignFn Fn) {
PreAnalyzeReturnForF128(Outs);
+ PreAnalyzeReturnForVectorFloat(Outs);
CCState::AnalyzeReturn(Outs, Fn);
OriginalArgWasFloat.clear();
OriginalArgWasF128.clear();
+ OriginalArgWasFloatVector.clear();
}
bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
CCAssignFn Fn) {
PreAnalyzeReturnForF128(ArgsFlags);
+ PreAnalyzeReturnForVectorFloat(ArgsFlags);
bool Return = CCState::CheckReturn(ArgsFlags, Fn);
OriginalArgWasFloat.clear();
OriginalArgWasF128.clear();
+ OriginalArgWasFloatVector.clear();
return Return;
}
@@ -128,6 +153,12 @@ public:
bool WasOriginalArgFloat(unsigned ValNo) {
return OriginalArgWasFloat[ValNo];
}
+ bool WasOriginalArgVectorFloat(unsigned ValNo) const {
+ return OriginalArgWasFloatVector[ValNo];
+ }
+ bool WasOriginalRetVectorFloat(unsigned ValNo) const {
+ return OriginalRetWasFloatVector[ValNo];
+ }
bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
};
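
The Analyze* wrappers above all follow the same record/query/clear cycle: a pre-analysis step fills the Original*Was* vectors, the calling-convention tables query them by value number, and the wrapper clears them afterwards. A toy model of that lifecycle (sketch only; the real class is driven by CCState and the generated CC tables):

    #include <cassert>
    #include <vector>

    struct MiniCCState {
      std::vector<bool> OriginalRetWasFloatVector;

      // Pre-analysis records one flag per lowered return value; the tables
      // query it by ValNo while analysis runs; then the flags are cleared.
      void analyzeReturn(const std::vector<bool> &PieceWasVecFloat) {
        OriginalRetWasFloatVector = PieceWasVecFloat;
        // ... table-driven assignment would call wasOriginalRetVectorFloat ...
        OriginalRetWasFloatVector.clear();
      }

      bool wasOriginalRetVectorFloat(unsigned ValNo) const {
        assert(ValNo < OriginalRetWasFloatVector.size());
        return OriginalRetWasFloatVector[ValNo];
      }
    };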
diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
index a57cb7b..b5df78f 100644
--- a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
+++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td
@@ -37,6 +37,10 @@ class CCIfOrigArgWasF128<CCAction A>
class CCIfArgIsVarArg<CCAction A>
: CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
+/// Match if the return was not a floating point vector.
+class CCIfOrigArgWasNotVectorFloat<CCAction A>
+ : CCIf<"!static_cast<MipsCCState *>(&State)"
+ "->WasOriginalRetVectorFloat(ValNo)", A>;
/// Match if the special calling conv is the specified value.
class CCIfSpecialCallingConv<string CC, CCAction A>
@@ -93,8 +97,10 @@ def RetCC_MipsO32 : CallingConv<[
// Promote i1/i8/i16 return values to i32.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
- // i32 are returned in registers V0, V1, A0, A1
- CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>,
+ // i32 are returned in registers V0, V1, A0, A1, unless the original return
+ // type was a vector of floats.
+ CCIfOrigArgWasNotVectorFloat<CCIfType<[i32],
+ CCAssignToReg<[V0, V1, A0, A1]>>>,
// f32 are returned in registers F0, F2
CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
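
The new predicate gates the i32 return-register rule on the original return type. A plain-C++ restatement of the intended effect (a sketch; the real assignment happens through the generated calling-convention tables):

    enum Reg { V0, V1, A0, A1, NoReg };

    // i32 pieces go to V0/V1/A0/A1 only when the original return type was not
    // a vector of floats; otherwise later rules handle the value.
    Reg assignI32RetPiece(unsigned ValNo, bool RetWasVectorFloat) {
      static const Reg Order[] = {V0, V1, A0, A1};
      if (RetWasVectorFloat || ValNo >= 4)
        return NoReg;
      return Order[ValNo];
    }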
diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
index 08b8ed3..ff43a39 100644
--- a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
//
-//
 // This pass is used to make PC-relative loads of constants.
// For now, only Mips16 will use this.
//
@@ -19,30 +18,43 @@
// This can be particularly helpful in static relocation mode for embedded
// non-linux targets.
//
-//
+//===----------------------------------------------------------------------===//
#include "Mips.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
#include "Mips16InstrInfo.h"
#include "MipsMachineFunction.h"
-#include "MipsTargetMachine.h"
+#include "MipsSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <new>
+#include <vector>
using namespace llvm;
@@ -58,7 +70,6 @@ static cl::opt<bool>
AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
cl::desc("Align constant islands in code"));
-
 // Rather than run "make check" tests with huge amounts of code, we force
 // the test to use this amount.
//
@@ -178,7 +189,6 @@ static unsigned int branchMaxOffsets(unsigned int Opcode) {
namespace {
-
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
@@ -195,7 +205,6 @@ namespace {
/// tracks a list of users.
class MipsConstantIslands : public MachineFunctionPass {
-
/// BasicBlockInfo - Information about the offset and size of a single
/// basic block.
struct BasicBlockInfo {
@@ -208,14 +217,16 @@ namespace {
///
/// Because worst case padding is used, the computed offset of an aligned
/// block may not actually be aligned.
- unsigned Offset;
+ unsigned Offset = 0;
/// Size - Size of the basic block in bytes. If the block contains
/// inline assembly, this is a worst case estimate.
///
/// The size does not include any alignment padding whether from the
/// beginning of the block, or from an aligned jump table at the end.
- unsigned Size;
+ unsigned Size = 0;
+
+ BasicBlockInfo() = default;
// FIXME: ignore LogAlign for this patch
//
@@ -223,9 +234,6 @@ namespace {
unsigned PO = Offset + Size;
return PO;
}
-
- BasicBlockInfo() : Offset(0), Size(0) {}
-
};
std::vector<BasicBlockInfo> BBInfo;
@@ -257,13 +265,16 @@ namespace {
MachineInstr *MI;
MachineInstr *CPEMI;
MachineBasicBlock *HighWaterMark;
+
private:
unsigned MaxDisp;
unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
// with different displacements
unsigned LongFormOpcode;
+
public:
bool NegOk;
+
CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
bool neg,
unsigned longformmaxdisp, unsigned longformopcode)
@@ -272,18 +283,22 @@ namespace {
NegOk(neg){
HighWaterMark = CPEMI->getParent();
}
+
/// getMaxDisp - Returns the maximum displacement supported by MI.
unsigned getMaxDisp() const {
unsigned xMaxDisp = ConstantIslandsSmallOffset?
ConstantIslandsSmallOffset: MaxDisp;
return xMaxDisp;
}
+
void setMaxDisp(unsigned val) {
MaxDisp = val;
}
+
unsigned getLongFormMaxDisp() const {
return LongFormMaxDisp;
}
+
unsigned getLongFormOpcode() const {
return LongFormOpcode;
}
@@ -300,6 +315,7 @@ namespace {
MachineInstr *CPEMI;
unsigned CPI;
unsigned RefCount;
+
CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
: CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
};
@@ -309,7 +325,7 @@ namespace {
/// existed upon entry to this pass), it keeps a vector of entries.
/// Original elements are cloned as we go along; the clones are
/// put in the vector of the original element, but have distinct CPIs.
- std::vector<std::vector<CPEntry> > CPEntries;
+ std::vector<std::vector<CPEntry>> CPEntries;
/// ImmBranch - One per immediate branch, keeping the machine instruction
/// pointer, conditional or unconditional, the max displacement,
@@ -320,6 +336,7 @@ namespace {
unsigned MaxDisp : 31;
bool isCond : 1;
int UncondBr;
+
ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
: MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
};
@@ -332,29 +349,27 @@ namespace {
/// the branch fix up pass.
bool HasFarJump;
- const MipsSubtarget *STI;
+ const MipsSubtarget *STI = nullptr;
const Mips16InstrInfo *TII;
MipsFunctionInfo *MFI;
- MachineFunction *MF;
- MachineConstantPool *MCP;
+ MachineFunction *MF = nullptr;
+ MachineConstantPool *MCP = nullptr;
unsigned PICLabelUId;
- bool PrescannedForConstants;
+ bool PrescannedForConstants = false;
void initPICLabelUId(unsigned UId) {
PICLabelUId = UId;
}
-
unsigned createPICLabelUId() {
return PICLabelUId++;
}
public:
static char ID;
- MipsConstantIslands()
- : MachineFunctionPass(ID), STI(nullptr), MF(nullptr), MCP(nullptr),
- PrescannedForConstants(false) {}
+
+ MipsConstantIslands() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "Mips Constant Islands"; }
@@ -403,13 +418,11 @@ namespace {
bool fixupUnconditionalBr(ImmBranch &Br);
void prescanForConstants();
-
- private:
-
};
char MipsConstantIslands::ID = 0;
-} // end of anonymous namespace
+
+} // end anonymous namespace
bool MipsConstantIslands::isOffsetInRange
(unsigned UserOffset, unsigned TrialOffset,
@@ -417,20 +430,17 @@ bool MipsConstantIslands::isOffsetInRange
return isOffsetInRange(UserOffset, TrialOffset,
U.getMaxDisp(), U.NegOk);
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
-void MipsConstantIslands::dumpBBs() {
- DEBUG({
- for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
- const BasicBlockInfo &BBI = BBInfo[J];
- dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
- << format(" size=%#x\n", BBInfo[J].Size);
- }
- });
-}
-/// Returns a pass that converts branches to long branches.
-FunctionPass *llvm::createMipsConstantIslandPass() {
- return new MipsConstantIslands();
+LLVM_DUMP_METHOD void MipsConstantIslands::dumpBBs() {
+ for (unsigned J = 0, E = BBInfo.size(); J != E; ++J) {
+ const BasicBlockInfo &BBI = BBInfo[J];
+ dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+ << format(" size=%#x\n", BBInfo[J].Size);
+ }
}
+#endif
bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// The intention is for this to be a mips16 only pass for now
@@ -527,7 +537,6 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
MF->push_back(BB);
-
// MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
@@ -647,7 +656,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
computeBlockSize(&*I);
-
// Compute block offsets.
adjustBBOffsetsAfter(&MF->front());
@@ -737,7 +745,6 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
if (Opc == Mips::CONSTPOOL_ENTRY)
continue;
-
// Scan the instructions for constant pool operands.
for (unsigned op = 0, e = MI.getNumOperands(); op != e; ++op)
if (MI.getOperand(op).isCPI()) {
@@ -784,12 +791,9 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
// Instructions can only use one CP entry, don't bother scanning the
// rest of the operands.
break;
-
}
-
}
}
-
}
/// computeBlockSize - Compute the size and some alignment information for MBB.
@@ -921,8 +925,6 @@ MipsConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) {
return NewBB;
}
-
-
/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
/// reference) is within MaxDisp of TrialOffset (a proposed location of a
/// constant pool entry).
@@ -1337,7 +1339,6 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
if (result==1) return false;
else if (result==2) return true;
-
// Look for water where we can place this CPE.
MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
MachineBasicBlock *NewMBB;
@@ -1371,7 +1372,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// it. Check for this so it will be removed from the WaterList.
// Also remove any entry from NewWaterList.
MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
- IP = find(WaterList, WaterBB);
+ IP = llvm::find(WaterList, WaterBB);
if (IP != WaterList.end())
NewWaterList.erase(WaterBB);
@@ -1473,9 +1474,7 @@ bool MipsConstantIslands::removeUnusedCPEntries() {
/// specific BB can fit in MI's displacement field.
bool MipsConstantIslands::isBBInRange
(MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) {
-
-unsigned PCAdj = 4;
-
+ unsigned PCAdj = 4;
unsigned BrOffset = getOffsetOf(MI) + PCAdj;
unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
@@ -1553,7 +1552,6 @@ MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
return true;
}
-
/// fixupConditionalBr - Fix up a conditional branch whose destination is too
/// far away to fit in its displacement field. It is converted to an inverse
/// conditional branch + an unconditional branch to the destination.
@@ -1614,7 +1612,6 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
}
}
-
if (NeedSplit) {
splitBlockBeforeInstr(*MI);
// No need for the branch to the next block. We're adding an unconditional
@@ -1654,7 +1651,6 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
return true;
}
-
void MipsConstantIslands::prescanForConstants() {
unsigned J = 0;
(void)J;
@@ -1667,11 +1663,11 @@ void MipsConstantIslands::prescanForConstants() {
PrescannedForConstants = true;
DEBUG(dbgs() << "constant island constant " << *I << "\n");
J = I->getNumOperands();
- DEBUG(dbgs() << "num operands " << J << "\n");
+ DEBUG(dbgs() << "num operands " << J << "\n");
MachineOperand& Literal = I->getOperand(1);
if (Literal.isImm()) {
int64_t V = Literal.getImm();
- DEBUG(dbgs() << "literal " << V << "\n");
+ DEBUG(dbgs() << "literal " << V << "\n");
Type *Int32Ty =
Type::getInt32Ty(MF->getFunction()->getContext());
const Constant *C = ConstantInt::get(Int32Ty, V);
@@ -1692,3 +1688,8 @@ void MipsConstantIslands::prescanForConstants() {
}
}
}
+
+/// Returns a pass that converts branches to long branches.
+FunctionPass *llvm::createMipsConstantIslandPass() {
+ return new MipsConstantIslands();
+}
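
Much of this pass revolves around isOffsetInRange. A minimal model of the displacement check, under the assumption that NegOk permits the constant pool entry to sit before its user:

    // Accept a trial placement when its distance from the user fits MaxDisp;
    // NegOk additionally allows the entry to precede the user.
    static bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
                                unsigned MaxDisp, bool NegOk) {
      if (TrialOffset >= UserOffset)
        return TrialOffset - UserOffset <= MaxDisp;
      return NegOk && UserOffset - TrialOffset <= MaxDisp;
    }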
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
index ac9a81b..c238a65 100644
--- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -19,6 +19,7 @@ def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
+def immSExt10 : ImmLeaf<i32, [{return isInt<10>(Imm);}]>;
// Mips-specific dsp nodes
def SDT_MipsExtr : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
@@ -851,8 +852,8 @@ class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, uimm8,
immZExt8, NoItinerary, DSPROpnd>;
-class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, uimm10,
- immZExt10, NoItinerary, DSPROpnd>;
+class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, simm10,
+ immSExt10, NoItinerary, DSPROpnd>;
class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
NoItinerary, DSPROpnd, GPR32Opnd>;
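
The repl.ph operand change swaps a zero-extended 10-bit immediate for a sign-extended one, moving the accepted range from 0..1023 to -512..511. A standalone check of the two predicates:

    #include <cstdio>

    static bool isUInt10(int I) { return I >= 0 && I <= 1023; }  // old uimm10
    static bool isInt10(int I) { return I >= -512 && I <= 511; } // new simm10

    int main() {
      for (int V : {-512, -1, 0, 511, 512, 1023})
        std::printf("%5d  uimm10=%d  simm10=%d\n", V, isUInt10(V), isInt10(V));
      return 0;
    }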
diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index c821084..4a34e31 100644
--- a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -14,21 +14,39 @@
#include "MCTargetDesc/MipsMCNaCl.h"
#include "Mips.h"
#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerUnion.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
using namespace llvm;
@@ -84,6 +102,7 @@ static cl::opt<CompactBranchPolicy> MipsCompactBranchPolicy(
);
namespace {
+
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
@@ -91,6 +110,7 @@ namespace {
class RegDefsUses {
public:
RegDefsUses(const TargetRegisterInfo &TRI);
+
void init(const MachineInstr &MI);
/// This function sets all caller-saved registers in Defs.
@@ -120,18 +140,18 @@ namespace {
/// Base class for inspecting loads and stores.
class InspectMemInstr {
public:
- InspectMemInstr(bool ForbidMemInstr_)
- : OrigSeenLoad(false), OrigSeenStore(false), SeenLoad(false),
- SeenStore(false), ForbidMemInstr(ForbidMemInstr_) {}
+ InspectMemInstr(bool ForbidMemInstr_) : ForbidMemInstr(ForbidMemInstr_) {}
+ virtual ~InspectMemInstr() = default;
/// Return true if MI cannot be moved to delay slot.
bool hasHazard(const MachineInstr &MI);
- virtual ~InspectMemInstr() {}
-
protected:
/// Flags indicating whether loads or stores have been seen.
- bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+ bool OrigSeenLoad = false;
+ bool OrigSeenStore = false;
+ bool SeenLoad = false;
+ bool SeenStore = false;
/// Memory instructions are not allowed to move to delay slot if this flag
/// is true.
@@ -145,6 +165,7 @@ namespace {
class NoMemInstr : public InspectMemInstr {
public:
NoMemInstr() : InspectMemInstr(true) {}
+
private:
bool hasHazard_(const MachineInstr &MI) override { return true; }
};
@@ -153,6 +174,7 @@ namespace {
class LoadFromStackOrConst : public InspectMemInstr {
public:
LoadFromStackOrConst() : InspectMemInstr(false) {}
+
private:
bool hasHazard_(const MachineInstr &MI) override;
};
@@ -183,17 +205,18 @@ namespace {
/// Flags indicating whether loads or stores with no underlying objects have
/// been seen.
- bool SeenNoObjLoad, SeenNoObjStore;
+ bool SeenNoObjLoad = false;
+ bool SeenNoObjStore = false;
};
class Filler : public MachineFunctionPass {
public:
- Filler(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm) { }
+ Filler() : MachineFunctionPass(ID), TM(nullptr) {}
StringRef getPassName() const override { return "Mips Delay Slot Filler"; }
bool runOnMachineFunction(MachineFunction &F) override {
+ TM = &F.getTarget();
bool Changed = false;
for (MachineFunction::iterator FI = F.begin(), FE = F.end();
FI != FE; ++FI)
@@ -267,12 +290,14 @@ namespace {
bool terminateSearch(const MachineInstr &Candidate) const;
- TargetMachine &TM;
+ const TargetMachine *TM;
static char ID;
};
+
char Filler::ID = 0;
-} // end of anonymous namespace
+
+} // end anonymous namespace
static bool hasUnoccupiedSlot(const MachineInstr *MI) {
return MI->hasDelaySlot() && !MI->isBundledWithSucc();
@@ -361,7 +386,7 @@ void RegDefsUses::setCallerSaved(const MachineInstr &MI) {
void RegDefsUses::setUnallocatableRegs(const MachineFunction &MF) {
BitVector AllocSet = TRI.getAllocatableSet(MF);
- for (int R = AllocSet.find_first(); R != -1; R = AllocSet.find_next(R))
+ for (unsigned R : AllocSet.set_bits())
for (MCRegAliasIterator AI(R, &TRI, false); AI.isValid(); ++AI)
AllocSet.set(*AI);
@@ -458,8 +483,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
}
MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_)
- : InspectMemInstr(false), MFI(MFI_), DL(DL), SeenNoObjLoad(false),
- SeenNoObjStore(false) {}
+ : InspectMemInstr(false), MFI(MFI_), DL(DL) {}
bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
bool HasHazard = false;
@@ -540,7 +564,7 @@ Iter Filler::replaceWithCompactBranch(MachineBasicBlock &MBB, Iter Branch,
// For given opcode returns opcode of corresponding instruction with short
// delay slot.
-// For the pseudo TAILCALL*_MM instrunctions return the short delay slot
+// For the pseudo TAILCALL*_MM instructions return the short delay slot
// form. Unfortunately, TAILCALL<->b16 is denied as b16 has a limited range
// that is too short to make use of for tail calls.
static int getEquivalentCallShort(int Opcode) {
@@ -586,7 +610,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
Changed = true;
// Delay slot filling is disabled at -O0.
- if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None)) {
+ if (!DisableDelaySlotFiller && (TM->getOptLevel() != CodeGenOpt::None)) {
bool Filled = false;
if (MipsCompactBranchPolicy.getValue() != CB_Always ||
@@ -646,12 +670,6 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
return Changed;
}
-/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
-/// slots in Mips MachineFunctions
-FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
- return new Filler(tm);
-}
-
template<typename IterTy>
bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot,
@@ -889,3 +907,7 @@ bool Filler::terminateSearch(const MachineInstr &Candidate) const {
Candidate.isPosition() || Candidate.isInlineAsm() ||
Candidate.hasUnmodeledSideEffects());
}
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass() { return new Filler(); }
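
The Filler constructor change and this last hunk belong together: the TargetMachine is now fetched from the function on each run rather than captured at pass construction, which is what lets createMipsDelaySlotFillerPass() take no arguments. A skeletal restatement of the pattern (names are illustrative):

    struct TargetMachine { int OptLevel; };

    struct Function {
      const TargetMachine *TM;
      const TargetMachine &getTarget() const { return *TM; }
    };

    struct Filler {
      const TargetMachine *TM = nullptr; // no longer bound at construction

      bool runOnFunction(const Function &F) {
        TM = &F.getTarget();             // resolved once per function instead
        return TM->OptLevel != 0;
      }
    };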
diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
index a44192f..f79cb0e 100644
--- a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -1,4 +1,4 @@
-//===-- MipsFastISel.cpp - Mips FastISel implementation --------------------===//
+//===-- MipsFastISel.cpp - Mips FastISel implementation -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,24 +14,62 @@
///
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
#include "MipsCCState.h"
-#include "MipsInstrInfo.h"
#include "MipsISelLowering.h"
+#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
-#include "MipsRegisterInfo.h"
#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <new>
#define DEBUG_TYPE "mips-fastisel"
@@ -47,35 +85,40 @@ class MipsFastISel final : public FastISel {
typedef enum { RegBase, FrameIndexBase } BaseKind;
private:
- BaseKind Kind;
+ BaseKind Kind = RegBase;
union {
unsigned Reg;
int FI;
} Base;
- int64_t Offset;
+ int64_t Offset = 0;
- const GlobalValue *GV;
+ const GlobalValue *GV = nullptr;
public:
// Innocuous defaults for our address.
- Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
+ Address() { Base.Reg = 0; }
+
void setKind(BaseKind K) { Kind = K; }
BaseKind getKind() const { return Kind; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
+
void setReg(unsigned Reg) {
assert(isRegBase() && "Invalid base register access!");
Base.Reg = Reg;
}
+
unsigned getReg() const {
assert(isRegBase() && "Invalid base register access!");
return Base.Reg;
}
+
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
Base.FI = FI;
}
+
unsigned getFI() const {
assert(isFIBase() && "Invalid base frame index access!");
return Base.FI;
@@ -165,14 +208,17 @@ private:
MachineInstrBuilder emitInst(unsigned Opc) {
return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
}
+
MachineInstrBuilder emitInst(unsigned Opc, unsigned DstReg) {
return BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
DstReg);
}
+
MachineInstrBuilder emitInstStore(unsigned Opc, unsigned SrcReg,
unsigned MemReg, int64_t MemOffset) {
return emitInst(Opc).addReg(SrcReg).addReg(MemReg).addImm(MemOffset);
}
+
MachineInstrBuilder emitInstLoad(unsigned Opc, unsigned DstReg,
unsigned MemReg, int64_t MemOffset) {
return emitInst(Opc, DstReg).addReg(MemReg).addImm(MemOffset);
@@ -198,6 +244,7 @@ private:
bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
unsigned &NumBytes);
bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
+
const MipsABIInfo &getABI() const {
return static_cast<const MipsTargetMachine &>(TM).getABI();
}
@@ -220,7 +267,8 @@ public:
#include "MipsGenFastISel.inc"
};
-} // end anonymous namespace.
+
+} // end anonymous namespace
static bool CC_Mips(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
@@ -414,7 +462,6 @@ unsigned MipsFastISel::fastMaterializeConstant(const Constant *C) {
}
bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
-
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
@@ -432,10 +479,9 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
switch (Opcode) {
default:
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
// Look through bitcasts.
return computeAddress(U->getOperand(0), Addr);
- }
case Instruction::GetElementPtr: {
Address SavedAddr = Addr;
int64_t TmpOffset = Addr.getOffset();
@@ -451,7 +497,7 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
+ while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
@@ -613,14 +659,12 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
emitInst(Mips::SLTu, ResultReg).addReg(Mips::ZERO).addReg(TempReg);
break;
}
- case CmpInst::ICMP_UGT: {
+ case CmpInst::ICMP_UGT:
emitInst(Mips::SLTu, ResultReg).addReg(RightReg).addReg(LeftReg);
break;
- }
- case CmpInst::ICMP_ULT: {
+ case CmpInst::ICMP_ULT:
emitInst(Mips::SLTu, ResultReg).addReg(LeftReg).addReg(RightReg);
break;
- }
case CmpInst::ICMP_UGE: {
unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLTu, TempReg).addReg(LeftReg).addReg(RightReg);
@@ -633,14 +677,12 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
break;
}
- case CmpInst::ICMP_SGT: {
+ case CmpInst::ICMP_SGT:
emitInst(Mips::SLT, ResultReg).addReg(RightReg).addReg(LeftReg);
break;
- }
- case CmpInst::ICMP_SLT: {
+ case CmpInst::ICMP_SLT:
emitInst(Mips::SLT, ResultReg).addReg(LeftReg).addReg(RightReg);
break;
- }
case CmpInst::ICMP_SGE: {
unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLT, TempReg).addReg(LeftReg).addReg(RightReg);
@@ -709,6 +751,7 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
}
return true;
}
+
bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
unsigned Alignment) {
//
@@ -716,35 +759,30 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
//
unsigned Opc;
switch (VT.SimpleTy) {
- case MVT::i32: {
+ case MVT::i32:
ResultReg = createResultReg(&Mips::GPR32RegClass);
Opc = Mips::LW;
break;
- }
- case MVT::i16: {
+ case MVT::i16:
ResultReg = createResultReg(&Mips::GPR32RegClass);
Opc = Mips::LHu;
break;
- }
- case MVT::i8: {
+ case MVT::i8:
ResultReg = createResultReg(&Mips::GPR32RegClass);
Opc = Mips::LBu;
break;
- }
- case MVT::f32: {
+ case MVT::f32:
if (UnsupportedFPMode)
return false;
ResultReg = createResultReg(&Mips::FGR32RegClass);
Opc = Mips::LWC1;
break;
- }
- case MVT::f64: {
+ case MVT::f64:
if (UnsupportedFPMode)
return false;
ResultReg = createResultReg(&Mips::AFGR64RegClass);
Opc = Mips::LDC1;
break;
- }
default:
return false;
}
@@ -1095,7 +1133,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
if (NumBytes < 16)
NumBytes = 16;
- emitInst(Mips::ADJCALLSTACKDOWN).addImm(16);
+ emitInst(Mips::ADJCALLSTACKDOWN).addImm(16).addImm(0);
// Process the args.
MVT firstMVT;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -1222,8 +1260,11 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
emitInst(Mips::ADJCALLSTACKUP).addImm(16).addImm(0);
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
- CCInfo.AnalyzeCallResult(RetVT, RetCC_Mips);
+ MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
+
+ CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy,
+ CLI.Symbol ? CLI.Symbol->getName().data()
+ : nullptr);
// Only handle a single return value.
if (RVLocs.size() != 1)
@@ -1286,11 +1327,10 @@ bool MipsFastISel::fastLowerArguments() {
// Only handle simple cases. i.e. All arguments are directly mapped to
// registers of the appropriate type.
SmallVector<AllocatedReg, 4> Allocation;
- unsigned Idx = 1;
for (const auto &FormalArg : F->args()) {
- if (F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
- F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
- F->getAttributes().hasAttribute(Idx, Attribute::ByVal)) {
+ if (FormalArg.hasAttribute(Attribute::InReg) ||
+ FormalArg.hasAttribute(Attribute::StructRet) ||
+ FormalArg.hasAttribute(Attribute::ByVal)) {
DEBUG(dbgs() << ".. gave up (inreg, structret, byval)\n");
return false;
}
@@ -1302,7 +1342,8 @@ bool MipsFastISel::fastLowerArguments() {
}
EVT ArgVT = TLI.getValueType(DL, ArgTy);
- DEBUG(dbgs() << ".. " << (Idx - 1) << ": " << ArgVT.getEVTString() << "\n");
+ DEBUG(dbgs() << ".. " << FormalArg.getArgNo() << ": "
+ << ArgVT.getEVTString() << "\n");
if (!ArgVT.isSimple()) {
DEBUG(dbgs() << ".. .. gave up (not a simple type)\n");
return false;
@@ -1312,8 +1353,8 @@ bool MipsFastISel::fastLowerArguments() {
case MVT::i1:
case MVT::i8:
case MVT::i16:
- if (!F->getAttributes().hasAttribute(Idx, Attribute::SExt) &&
- !F->getAttributes().hasAttribute(Idx, Attribute::ZExt)) {
+ if (!FormalArg.hasAttribute(Attribute::SExt) &&
+ !FormalArg.hasAttribute(Attribute::ZExt)) {
 // It must be any-extend; this shouldn't happen for clang-generated IR,
 // so just fall back on SelectionDAG.
DEBUG(dbgs() << ".. .. gave up (i8/i16 arg is not extended)\n");
@@ -1334,7 +1375,7 @@ bool MipsFastISel::fastLowerArguments() {
break;
case MVT::i32:
- if (F->getAttributes().hasAttribute(Idx, Attribute::ZExt)) {
+ if (FormalArg.hasAttribute(Attribute::ZExt)) {
// The O32 ABI does not permit a zero-extended i32.
DEBUG(dbgs() << ".. .. gave up (i32 arg is zero extended)\n");
return false;
@@ -1397,23 +1438,20 @@ bool MipsFastISel::fastLowerArguments() {
DEBUG(dbgs() << ".. .. gave up (unknown type)\n");
return false;
}
-
- ++Idx;
}
- Idx = 0;
for (const auto &FormalArg : F->args()) {
- unsigned SrcReg = Allocation[Idx].Reg;
- unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[Idx].RC);
+ unsigned ArgNo = FormalArg.getArgNo();
+ unsigned SrcReg = Allocation[ArgNo].Reg;
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[ArgNo].RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
- unsigned ResultReg = createResultReg(Allocation[Idx].RC);
+ unsigned ResultReg = createResultReg(Allocation[ArgNo].RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(DstReg, getKillRegState(true));
updateValueMap(&FormalArg, ResultReg);
- ++Idx;
}
// Calculate the size of the incoming arguments area.
@@ -1730,6 +1768,7 @@ bool MipsFastISel::selectTrunc(const Instruction *I) {
updateValueMap(I, SrcReg);
return true;
}
+
bool MipsFastISel::selectIntExt(const Instruction *I) {
Type *DestTy = I->getType();
Value *Src = I->getOperand(0);
@@ -1757,6 +1796,7 @@ bool MipsFastISel::selectIntExt(const Instruction *I) {
updateValueMap(I, ResultReg);
return true;
}
+
bool MipsFastISel::emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
unsigned DestReg) {
unsigned ShiftAmt;
@@ -2074,8 +2114,10 @@ unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
}
namespace llvm {
+
FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) {
return new MipsFastISel(funcInfo, libInfo);
}
-}
+
+} // end namespace llvm
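
The emitCmp cases shown in this diff lower unsigned integer compares onto sltu in a uniform way. A host-side restatement of the three mappings visible above:

    #include <cassert>
    #include <cstdint>

    static uint32_t sltu(uint32_t A, uint32_t B) { return A < B ? 1u : 0u; }

    // ICMP_UGT x,y  ->  sltu res, y, x
    // ICMP_ULT x,y  ->  sltu res, x, y
    // ICMP_UGE x,y  ->  sltu tmp, x, y ; xori res, tmp, 1
    static uint32_t icmpUGT(uint32_t L, uint32_t R) { return sltu(R, L); }
    static uint32_t icmpULT(uint32_t L, uint32_t R) { return sltu(L, R); }
    static uint32_t icmpUGE(uint32_t L, uint32_t R) { return sltu(L, R) ^ 1u; }

    int main() {
      assert(icmpUGT(2, 1) && !icmpUGT(1, 2));
      assert(icmpULT(1, 2) && !icmpULT(2, 1));
      assert(icmpUGE(2, 2) && !icmpUGE(1, 2));
      return 0;
    }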
diff --git a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
index b2cf039..ef05166 100644
--- a/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsFrameLowering.cpp
@@ -119,7 +119,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
// Conservatively assume all callee-saved registers will be saved.
for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
- unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
+ unsigned Size = TRI.getSpillSize(*TRI.getMinimalPhysRegClass(*R));
Offset = alignTo(Offset + Size, Size);
}
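
estimateStackSize rounds each callee-saved spill up to its own spill size before accumulating, and the hunk above only changes how that size is obtained. The equivalent arithmetic, with alignTo reimplemented locally for the sketch:

    #include <cstdint>

    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    // Accumulate callee-saved spill slots, aligning each to its own size.
    static uint64_t accumulateCSRSize(const unsigned *SpillSizes, unsigned N) {
      uint64_t Offset = 0;
      for (unsigned I = 0; I < N; ++I)
        Offset = alignTo(Offset + SpillSizes[I], SpillSizes[I]);
      return Offset;
    }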
diff --git a/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp b/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
index 31b8612..f6fcf6e 100644
--- a/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsHazardSchedule.cpp
@@ -36,7 +36,7 @@
///
/// A) A previous pass has created a compact branch directly.
/// B) Transforming a delay slot branch into compact branch. This case can be
-/// difficult to process as lookahead for hazards is insufficent, as
+/// difficult to process as lookahead for hazards is insufficient, as
/// backwards delay slot filling can also produce hazards in previously
/// processed instructions.
///
@@ -103,23 +103,24 @@ static Iter getNextMachineInstrInBB(Iter Position) {
// Find the next real instruction from the current position, looking through
// basic block boundaries.
-static Iter getNextMachineInstr(Iter Position, MachineBasicBlock *Parent) {
+static std::pair<Iter, bool> getNextMachineInstr(Iter Position,
+                                                 MachineBasicBlock *Parent) {
if (Position == Parent->end()) {
- MachineBasicBlock *Succ = Parent->getNextNode();
- if (Succ != nullptr && Parent->isSuccessor(Succ)) {
- Position = Succ->begin();
- Parent = Succ;
- } else {
- llvm_unreachable(
- "Should have identified the end of the function earlier!");
- }
+ do {
+ MachineBasicBlock *Succ = Parent->getNextNode();
+ if (Succ != nullptr && Parent->isSuccessor(Succ)) {
+ Position = Succ->begin();
+ Parent = Succ;
+ } else {
+ return std::make_pair(Position, true);
+ }
+ } while (Parent->empty());
}
Iter Instr = getNextMachineInstrInBB(Position);
if (Instr == Parent->end()) {
return getNextMachineInstr(Instr, Parent);
}
- return Instr;
+ return std::make_pair(Instr, false);
}
bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
@@ -145,7 +146,9 @@ bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
bool LastInstInFunction =
std::next(I) == FI->end() && std::next(FI) == MF.end();
if (!LastInstInFunction) {
- Inst = getNextMachineInstr(std::next(I), &*FI);
+ std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI);
+ LastInstInFunction |= Res.second;
+ Inst = Res.first;
}
if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 9c511bd..20319f8 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -22,12 +22,12 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
@@ -71,6 +71,48 @@ static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
return true;
}
+// The MIPS MSA ABI passes vector arguments in the integer register set.
+// The number of integer registers used is dependent on the ABI used.
+MVT MipsTargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
+ if (VT.isVector() && Subtarget.hasMSA())
+ return Subtarget.isABI_O32() ? MVT::i32 : MVT::i64;
+ return MipsTargetLowering::getRegisterType(VT);
+}
+
+MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT.isVector()) {
+ if (Subtarget.isABI_O32()) {
+ return MVT::i32;
+ } else {
+ return (VT.getSizeInBits() == 32) ? MVT::i32 : MVT::i64;
+ }
+ }
+ return MipsTargetLowering::getRegisterType(Context, VT);
+}
+
+unsigned MipsTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const {
+ if (VT.isVector())
+ return std::max((VT.getSizeInBits() / (Subtarget.isABI_O32() ? 32 : 64)),
+ 1U);
+ return MipsTargetLowering::getNumRegisters(Context, VT);
+}
+
+unsigned MipsTargetLowering::getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const {
+
+ // Break down vector types to either 2 i64s or 4 i32s.
+  RegisterVT = getRegisterTypeForCallingConv(Context, VT);
+ IntermediateVT = RegisterVT;
+ NumIntermediates = VT.getSizeInBits() < RegisterVT.getSizeInBits()
+ ? VT.getVectorNumElements()
+ : VT.getSizeInBits() / RegisterVT.getSizeInBits();
+
+ return NumIntermediates;
+}
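
A small self-contained sketch of the breakdown arithmetic above, assuming only the vector bit width and the ABI as inputs (the function and struct names are illustrative, not the LLVM hooks):

    // Sketch of how an MSA vector maps onto GPR-sized integers for the
    // calling convention: i32 pieces on O32, i64 pieces otherwise.
    #include <algorithm>
    #include <cstdio>

    struct Breakdown { unsigned RegBits, NumRegs; };

    static Breakdown breakDownVector(unsigned VecBits, bool IsO32) {
      unsigned RegBits = IsO32 ? 32 : (VecBits == 32 ? 32 : 64);
      unsigned NumRegs = std::max(VecBits / RegBits, 1u);
      return {RegBits, NumRegs};
    }

    int main() {
      // v4i32 (128 bits): four i32 GPRs on O32, two i64 GPRs otherwise.
      Breakdown O32 = breakDownVector(128, true);
      Breakdown N64 = breakDownVector(128, false);
      std::printf("O32: %u x i%u, N64: %u x i%u\n",
                  O32.NumRegs, O32.RegBits, N64.NumRegs, N64.RegBits);
    }
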
+
SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
@@ -112,8 +154,11 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::FIRST_NUMBER: break;
case MipsISD::JmpLink: return "MipsISD::JmpLink";
case MipsISD::TailCall: return "MipsISD::TailCall";
+ case MipsISD::Highest: return "MipsISD::Highest";
+ case MipsISD::Higher: return "MipsISD::Higher";
case MipsISD::Hi: return "MipsISD::Hi";
case MipsISD::Lo: return "MipsISD::Lo";
+ case MipsISD::GotHi: return "MipsISD::GotHi";
case MipsISD::GPRel: return "MipsISD::GPRel";
case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer";
case MipsISD::Ret: return "MipsISD::Ret";
@@ -144,6 +189,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::Sync: return "MipsISD::Sync";
case MipsISD::Ext: return "MipsISD::Ext";
case MipsISD::Ins: return "MipsISD::Ins";
+ case MipsISD::CIns: return "MipsISD::CIns";
case MipsISD::LWL: return "MipsISD::LWL";
case MipsISD::LWR: return "MipsISD::LWR";
case MipsISD::SWL: return "MipsISD::SWL";
@@ -318,6 +364,18 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
+ if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) {
+ setOperationAction(ISD::ADDC, MVT::i32, Expand);
+ setOperationAction(ISD::ADDE, MVT::i32, Expand);
+ }
+
+ setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ setOperationAction(ISD::ADDE, MVT::i64, Expand);
+ setOperationAction(ISD::SUBC, MVT::i32, Expand);
+ setOperationAction(ISD::SUBE, MVT::i32, Expand);
+ setOperationAction(ISD::SUBC, MVT::i64, Expand);
+ setOperationAction(ISD::SUBE, MVT::i64, Expand);
+
// Operations not directly supported by Mips.
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
@@ -358,7 +416,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- setOperationAction(ISD::FPOWI, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FLOG, MVT::f32, Expand);
@@ -424,7 +481,9 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::AssertZext);
+ setTargetDAGCombine(ISD::SHL);
if (ABI.IsO32()) {
// These libcalls are not available in 32-bit.
@@ -466,8 +525,9 @@ MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
!Subtarget.hasMips32r6() && !Subtarget.inMips16Mode() &&
!Subtarget.inMicroMipsMode();
- // Disable if we don't generate PIC or the ABI isn't O32.
- if (!TM.isPositionIndependent() || !TM.getABI().IsO32())
+  // Disable if any of the following is true:
+  // We do not generate PIC, the ABI is not O32, or LargeGOT is in use.
+ if (!TM.isPositionIndependent() || !TM.getABI().IsO32() || LargeGOT)
UseFastISel = false;
return UseFastISel ? Mips::createFastISel(funcInfo, libInfo) : nullptr;
@@ -699,41 +759,81 @@ static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget &Subtarget) {
- // Pattern match EXT.
- // $dst = and ((sra or srl) $src , pos), (2**size - 1)
- // => ext $dst, $src, size, pos
if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
return SDValue();
- SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
- unsigned ShiftRightOpc = ShiftRight.getOpcode();
-
- // Op's first operand must be a shift right.
- if (ShiftRightOpc != ISD::SRA && ShiftRightOpc != ISD::SRL)
- return SDValue();
+ SDValue FirstOperand = N->getOperand(0);
+ unsigned FirstOperandOpc = FirstOperand.getOpcode();
+ SDValue Mask = N->getOperand(1);
+ EVT ValTy = N->getValueType(0);
+ SDLoc DL(N);
- // The second operand of the shift must be an immediate.
+ uint64_t Pos = 0, SMPos, SMSize;
ConstantSDNode *CN;
- if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
- return SDValue();
-
- uint64_t Pos = CN->getZExtValue();
- uint64_t SMPos, SMSize;
+ SDValue NewOperand;
+ unsigned Opc;
// Op's second operand must be a shifted mask.
if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
!isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
return SDValue();
- // Return if the shifted mask does not start at bit 0 or the sum of its size
- // and Pos exceeds the word's size.
- EVT ValTy = N->getValueType(0);
- if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
- return SDValue();
+ if (FirstOperandOpc == ISD::SRA || FirstOperandOpc == ISD::SRL) {
+ // Pattern match EXT.
+ // $dst = and ((sra or srl) $src , pos), (2**size - 1)
+ // => ext $dst, $src, pos, size
+
+ // The second operand of the shift must be an immediate.
+ if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
+ return SDValue();
+
+ Pos = CN->getZExtValue();
+
+    // Return if the shifted mask does not start at bit 0 or the sum of its
+    // size and Pos exceeds the word's size.
+ if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
+ return SDValue();
+
+ Opc = MipsISD::Ext;
+ NewOperand = FirstOperand.getOperand(0);
+ } else if (FirstOperandOpc == ISD::SHL && Subtarget.hasCnMips()) {
+ // Pattern match CINS.
+ // $dst = and (shl $src , pos), mask
+ // => cins $dst, $src, pos, size
+ // mask is a shifted mask with consecutive 1's, pos = shift amount,
+ // size = population count.
+
+ // The second operand of the shift must be an immediate.
+ if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
+ return SDValue();
+
+ Pos = CN->getZExtValue();
+
+ if (SMPos != Pos || Pos >= ValTy.getSizeInBits() || SMSize >= 32 ||
+ Pos + SMSize > ValTy.getSizeInBits())
+ return SDValue();
+
+ NewOperand = FirstOperand.getOperand(0);
+    // The encoded operand is the field's last bit position (size - 1),
+    // not the size.
+ SMSize--;
+ Opc = MipsISD::CIns;
+ } else {
+ // Pattern match EXT.
+    // $dst = and $src, (2**size - 1), if size > 16
+    // => ext $dst, $src, pos, size, pos = 0
- SDLoc DL(N);
- return DAG.getNode(MipsISD::Ext, DL, ValTy,
- ShiftRight.getOperand(0),
+ // If the mask is <= 0xffff, andi can be used instead.
+ if (CN->getZExtValue() <= 0xffff)
+ return SDValue();
+
+ // Return if the mask doesn't start at position 0.
+ if (SMPos)
+ return SDValue();
+
+ Opc = MipsISD::Ext;
+ NewOperand = FirstOperand;
+ }
+ return DAG.getNode(Opc, DL, ValTy, NewOperand,
DAG.getConstant(Pos, DL, MVT::i32),
DAG.getConstant(SMSize, DL, MVT::i32));
}
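
Each arm of the combine reduces the AND to one bitfield operation: ext computes (src >> pos) & (2**size - 1), and cins clears everything above the field before shifting it into place. A standalone illustration, assuming plain 32/64-bit integer semantics rather than SelectionDAG nodes:

    // Models isShiftedMask plus the ext pattern the combine recognizes.
    #include <cassert>
    #include <cstdint>

    // A shifted mask is a run of consecutive 1s, possibly shifted up.
    static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
      if (I == 0)
        return false;
      Pos = 0;
      while (!(I & (1ULL << Pos)))
        ++Pos;
      Size = 0;
      while (Pos + Size < 64 && (I & (1ULL << (Pos + Size))))
        ++Size;
      uint64_t Run = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
      return (I >> Pos) == Run;
    }

    static uint32_t ext(uint32_t Src, unsigned Pos, unsigned Size) {
      return (Src >> Pos) & ((1u << Size) - 1); // extract bitfield
    }

    int main() {
      uint64_t Pos, Size;
      // EXT arm: and (srl x, 4), 0xff  ==>  ext x, 4, 8
      assert(isShiftedMask(0xff, Pos, Size) && Pos == 0 && Size == 8);
      uint32_t X = 0xDEADBEEF;
      assert(((X >> 4) & 0xff) == ext(X, 4, 8));
      // CINS arm: and (shl x, 8), 0x00ffff00 has SMPos == shift amount.
      assert(isShiftedMask(0x00ffff00, Pos, Size) && Pos == 8 && Size == 16);
      return 0;
    }
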
@@ -750,7 +850,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
- ConstantSDNode *CN;
+ ConstantSDNode *CN, *CN1;
// See if Op's first operand matches (and $src1 , mask0).
if (And0.getOpcode() != ISD::AND)
@@ -761,47 +861,202 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// See if Op's second operand matches (and (shl $src, pos), mask1).
- if (And1.getOpcode() != ISD::AND)
+ if (And1.getOpcode() == ISD::AND &&
+ And1.getOperand(0).getOpcode() == ISD::SHL) {
+
+ if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
+ !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+ return SDValue();
+
+ // The shift masks must have the same position and size.
+ if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+ return SDValue();
+
+ SDValue Shl = And1.getOperand(0);
+
+ if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+ return SDValue();
+
+ unsigned Shamt = CN->getZExtValue();
+
+    // Return if the shift amount and the first bit position of the mask
+    // are not the same.
+ EVT ValTy = N->getValueType(0);
+ if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
+ return SDValue();
+
+ SDLoc DL(N);
+ return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
+ DAG.getConstant(SMPos0, DL, MVT::i32),
+ DAG.getConstant(SMSize0, DL, MVT::i32),
+ And0.getOperand(0));
+ } else {
+ // Pattern match DINS.
+ // $dst = or (and $src, mask0), mask1
+ // where mask0 = ((1 << SMSize0) -1) << SMPos0
+ // => dins $dst, $src, pos, size
+ if (~CN->getSExtValue() == ((((int64_t)1 << SMSize0) - 1) << SMPos0) &&
+ ((SMSize0 + SMPos0 <= 64 && Subtarget.hasMips64r2()) ||
+ (SMSize0 + SMPos0 <= 32))) {
+ // Check if AND instruction has constant as argument
+      // Check if the AND instruction has a constant argument.
+ if (And1.getOpcode() == ISD::AND) {
+ if (!(CN1 = dyn_cast<ConstantSDNode>(And1->getOperand(1))))
+ return SDValue();
+ } else {
+ if (!(CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1))))
+ return SDValue();
+ }
+      // Don't generate INS if the constant OR operand doesn't fit into the
+      // bits cleared by the constant AND operand.
+ if (CN->getSExtValue() & CN1->getSExtValue())
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ValTy = N->getOperand(0)->getValueType(0);
+ SDValue Const1;
+ SDValue SrlX;
+ if (!isConstCase) {
+ Const1 = DAG.getConstant(SMPos0, DL, MVT::i32);
+ SrlX = DAG.getNode(ISD::SRL, DL, And1->getValueType(0), And1, Const1);
+ }
+ return DAG.getNode(
+ MipsISD::Ins, DL, N->getValueType(0),
+ isConstCase
+ ? DAG.getConstant(CN1->getSExtValue() >> SMPos0, DL, ValTy)
+ : SrlX,
+ DAG.getConstant(SMPos0, DL, MVT::i32),
+ DAG.getConstant(ValTy.getSizeInBits() / 8 < 8 ? SMSize0 & 31
+ : SMSize0,
+ DL, MVT::i32),
+ And0->getOperand(0));
+    }
return SDValue();
+ }
+}
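
The DINS case folds (or (and $src, mask0), bits) into a single bitfield insert when the OR'd bits fall entirely inside the region mask0 clears. A plain model of the ins semantics, assuming 32-bit operands:

    // ins: replace bits [Pos, Pos+Size) of Dst with the low Size bits of Src.
    #include <cassert>
    #include <cstdint>

    static uint32_t ins(uint32_t Dst, uint32_t Src, unsigned Pos,
                        unsigned Size) {
      uint32_t Mask = ((1u << Size) - 1) << Pos;
      return (Dst & ~Mask) | ((Src << Pos) & Mask);
    }

    int main() {
      uint32_t Src = 0xAAAA5555;
      // (or (and src, ~(0xff << 8)), (0x3C << 8)): the OR constant fits in
      // the bits cleared by the AND constant, so it is one insert of 0x3C.
      uint32_t Folded = (Src & ~(0xffu << 8)) | (0x3Cu << 8);
      assert(Folded == ins(Src, 0x3C, 8, 8));
      return 0;
    }
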
- if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
- !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG,
+ const MipsSubtarget &Subtarget) {
+ // ROOTNode must have a multiplication as an operand for the match to be
+ // successful.
+ if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL &&
+ ROOTNode->getOperand(1).getOpcode() != ISD::MUL)
return SDValue();
- // The shift masks must have the same position and size.
- if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+ // We don't handle vector types here.
+ if (ROOTNode->getValueType(0).isVector())
return SDValue();
- SDValue Shl = And1.getOperand(0);
- if (Shl.getOpcode() != ISD::SHL)
+  // For MIPS64, madd / msub instructions are inefficient to use with 64-bit
+ // arithmetic. E.g.
+ // (add (mul a b) c) =>
+  // let res = (madd (mthi (drotr c 32)) (mtlo c) a b) in
+  // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32))
+ // or
+ // MIPS64R2: (dins (mflo res) (mfhi res) 32 32)
+ //
+ // The overhead of setting up the Hi/Lo registers and reassembling the
+  // result makes this a dubious optimization for MIPS64. The core of the
+ // problem is that Hi/Lo contain the upper and lower 32 bits of the
+ // operand and result.
+ //
+ // It requires a chain of 4 add/mul for MIPS64R2 to get better code
+ // density than doing it naively, 5 for MIPS64. Additionally, using
+ // madd/msub on MIPS64 requires the operands actually be 32 bit sign
+ // extended operands, not true 64 bit values.
+ //
+ // FIXME: For the moment, disable this completely for MIPS64.
+ if (Subtarget.hasMips64())
return SDValue();
- if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+ SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(0)
+ : ROOTNode->getOperand(1);
+
+ SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(1)
+ : ROOTNode->getOperand(0);
+
+ // Transform this to a MADD only if the user of this node is the add.
+ // If there are other users of the mul, this function returns here.
+ if (!Mult.hasOneUse())
return SDValue();
- unsigned Shamt = CN->getZExtValue();
+ // maddu and madd are unusual instructions in that on MIPS64 bits 63..31
+ // must be in canonical form, i.e. sign extended. For MIPS32, the operands
+ // of the multiply must have 32 or more sign bits, otherwise we cannot
+ // perform this optimization. We have to check this here as we're performing
+ // this optimization pre-legalization.
+ SDValue MultLHS = Mult->getOperand(0);
+ SDValue MultRHS = Mult->getOperand(1);
- // Return if the shift amount and the first bit position of mask are not the
- // same.
- EVT ValTy = N->getValueType(0);
- if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
+ bool IsSigned = MultLHS->getOpcode() == ISD::SIGN_EXTEND &&
+ MultRHS->getOpcode() == ISD::SIGN_EXTEND;
+ bool IsUnsigned = MultLHS->getOpcode() == ISD::ZERO_EXTEND &&
+ MultRHS->getOpcode() == ISD::ZERO_EXTEND;
+
+ if (!IsSigned && !IsUnsigned)
return SDValue();
- SDLoc DL(N);
- return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0),
- DAG.getConstant(SMPos0, DL, MVT::i32),
- DAG.getConstant(SMSize0, DL, MVT::i32),
- And0.getOperand(0));
+ // Initialize accumulator.
+ SDLoc DL(ROOTNode);
+ SDValue TopHalf;
+ SDValue BottomHalf;
+ BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(0, DL));
+
+ TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(1, DL));
+ SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+ BottomHalf,
+ TopHalf);
+
+ // Create MipsMAdd(u) / MipsMSub(u) node.
+ bool IsAdd = ROOTNode->getOpcode() == ISD::ADD;
+ unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd)
+ : (IsUnsigned ? MipsISD::MSubu : MipsISD::MSub);
+ SDValue MAddOps[3] = {
+ CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)),
+ CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn};
+ EVT VTs[2] = {MVT::i32, MVT::i32};
+ SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps);
+
+ SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
+ SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
+ SDValue Combined =
+ CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi);
+ return Combined;
+}
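
The node chain above moves the 64-bit accumulator into HI/LO, accumulates the full product there, and reassembles the halves with build_pair. An arithmetic sketch of that data flow in standalone C++, not target code:

    // madd: the accumulator lives in two 32-bit halves (HI/LO) and absorbs
    // a 64-bit product of two sign-extended 32-bit operands.
    #include <cassert>
    #include <cstdint>

    static int64_t madd(int32_t A, int32_t B, int64_t Acc) {
      uint32_t Lo = static_cast<uint32_t>(Acc);       // mtlo
      uint32_t Hi = static_cast<uint32_t>(Acc >> 32); // mthi
      int64_t Sum = ((int64_t)Hi << 32 | Lo) + (int64_t)A * (int64_t)B;
      // mflo / mfhi, reassembled as with ISD::BUILD_PAIR:
      return (int64_t)(uint32_t)(Sum >> 32) << 32 | (uint32_t)Sum;
    }

    int main() {
      int64_t C = 0x123456789LL;
      assert(madd(-7, 100000, C) == C + (int64_t)-7 * 100000);
      return 0;
    }
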
+
+static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) {
+ // (sub v0 (mul v1, v2)) => (msub v1, v2, v0)
+ if (DCI.isBeforeLegalizeOps()) {
+ if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
+ !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64)
+ return performMADD_MSUBCombine(N, DAG, Subtarget);
+
+ return SDValue();
+ }
+
+ return SDValue();
}
static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const MipsSubtarget &Subtarget) {
- // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
+ // (add v0 (mul v1, v2)) => (madd v1, v2, v0)
+ if (DCI.isBeforeLegalizeOps()) {
+ if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
+ !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64)
+ return performMADD_MSUBCombine(N, DAG, Subtarget);
- if (DCI.isBeforeLegalizeOps())
return SDValue();
+ }
+ // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt))
SDValue Add = N->getOperand(1);
if (Add.getOpcode() != ISD::ADD)
@@ -852,6 +1107,58 @@ static SDValue performAssertZextCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+
+static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget &Subtarget) {
+ // Pattern match CINS.
+ // $dst = shl (and $src , imm), pos
+ // => cins $dst, $src, pos, size
+
+ if (DCI.isBeforeLegalizeOps() || !Subtarget.hasCnMips())
+ return SDValue();
+
+ SDValue FirstOperand = N->getOperand(0);
+ unsigned FirstOperandOpc = FirstOperand.getOpcode();
+ SDValue SecondOperand = N->getOperand(1);
+ EVT ValTy = N->getValueType(0);
+ SDLoc DL(N);
+
+ uint64_t Pos = 0, SMPos, SMSize;
+ ConstantSDNode *CN;
+ SDValue NewOperand;
+
+ // The second operand of the shift must be an immediate.
+ if (!(CN = dyn_cast<ConstantSDNode>(SecondOperand)))
+ return SDValue();
+
+ Pos = CN->getZExtValue();
+
+ if (Pos >= ValTy.getSizeInBits())
+ return SDValue();
+
+ if (FirstOperandOpc != ISD::AND)
+ return SDValue();
+
+ // AND's second operand must be a shifted mask.
+ if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))) ||
+ !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+ return SDValue();
+
+ // Return if the shifted mask does not start at bit 0 or the sum of its size
+ // and Pos exceeds the word's size.
+ if (SMPos != 0 || SMSize > 32 || Pos + SMSize > ValTy.getSizeInBits())
+ return SDValue();
+
+ NewOperand = FirstOperand.getOperand(0);
+  // The encoded operand is the field's last bit position (size - 1),
+  // not the size.
+ SMSize--;
+
+ return DAG.getNode(MipsISD::CIns, DL, ValTy, NewOperand,
+ DAG.getConstant(Pos, DL, MVT::i32),
+ DAG.getConstant(SMSize, DL, MVT::i32));
+}
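
As in the AND combine, the emitted operand is size - 1, i.e. the last bit position of the field. A quick standalone check of the fold, using a 32-bit model and illustrative names:

    // cins: zero-extend a low bitfield of Src, then shift it to Pos.
    #include <cassert>
    #include <cstdint>

    static uint32_t cins(uint32_t Src, unsigned Pos, unsigned SizeMinus1) {
      unsigned Size = SizeMinus1 + 1; // the node stores size - 1
      return (Src & ((1u << Size) - 1)) << Pos;
    }

    int main() {
      uint32_t X = 0xCAFEF00D;
      // (shl (and x, 0x1f), 3)  ==>  cins x, 3, 4  (5-bit field at pos 3)
      assert(((X & 0x1f) << 3) == cins(X, 3, 4));
      return 0;
    }
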
+
SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
SelectionDAG &DAG = DCI.DAG;
@@ -875,6 +1182,10 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return performADDCombine(N, DAG, DCI, Subtarget);
case ISD::AssertZext:
return performAssertZextCombine(N, DAG, DCI, Subtarget);
+ case ISD::SHL:
+ return performSHLCombine(N, DAG, DCI, Subtarget);
+ case ISD::SUB:
+ return performSUBCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
@@ -1733,7 +2044,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = N->getGlobal();
- if (!isPositionIndependent() && !ABI.IsN64()) {
+ if (!isPositionIndependent()) {
const MipsTargetObjectFile *TLOF =
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
@@ -1742,8 +2053,10 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
// %gp_rel relocation
return getAddrGPRel(N, SDLoc(N), Ty, DAG);
- // %hi/%lo relocation
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ // %hi/%lo relocation
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ // %highest/%higher/%hi/%lo relocation
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
}
// Every other architecture would use shouldAssumeDSOLocal in here, but
@@ -1777,8 +2090,9 @@ SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
EVT Ty = Op.getValueType();
- if (!isPositionIndependent() && !ABI.IsN64())
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ if (!isPositionIndependent())
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
}
@@ -1820,8 +2134,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
+ CLI.setDebugLoc(DL)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
SDValue Ret = CallResult.first;
@@ -1870,8 +2185,9 @@ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
EVT Ty = Op.getValueType();
- if (!isPositionIndependent() && !ABI.IsN64())
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ if (!isPositionIndependent())
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
}
@@ -1882,7 +2198,7 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
EVT Ty = Op.getValueType();
- if (!isPositionIndependent() && !ABI.IsN64()) {
+ if (!isPositionIndependent()) {
const MipsTargetObjectFile *TLOF =
static_cast<const MipsTargetObjectFile *>(
getTargetMachine().getObjFileLowering());
@@ -1892,10 +2208,11 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
// %gp_rel relocation
return getAddrGPRel(N, SDLoc(N), Ty, DAG);
- return getAddrNonPIC(N, SDLoc(N), Ty, DAG);
+ return Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
}
- return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
+ return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64());
}
SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -2410,6 +2727,11 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
// yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
// not used, it must be shadowed. If only A3 is available, shadow it and
// go to stack.
+// vXiX - Received as scalarized i32s, passed in A0 - A3 and the stack.
+// vXf32 - Passed in either a pair of registers {A0, A1}, {A2, A3} or {A0 - A3}
+// with the remainder spilled to the stack.
+// vXf64 - Passed in either {A0, A1, A2, A3} or {A2, A3} and in both cases
+// spilling the remainder to the stack.
//
// For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
//===----------------------------------------------------------------------===//
@@ -2421,8 +2743,13 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
State.getMachineFunction().getSubtarget());
static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+
+  const MipsCCState *MipsState = static_cast<MipsCCState *>(&State);
+
static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
+ static const MCPhysReg FloatVectorIntRegs[] = { Mips::A0, Mips::A2 };
+
// Do not process byval args here.
if (ArgFlags.isByVal())
return true;
@@ -2460,8 +2787,26 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
State.getFirstUnallocated(F32Regs) != ValNo;
unsigned OrigAlign = ArgFlags.getOrigAlign();
bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
-
- if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
+ bool isVectorFloat = MipsState->WasOriginalArgVectorFloat(ValNo);
+
+ // The MIPS vector ABI for floats passes them in a pair of registers
+ if (ValVT == MVT::i32 && isVectorFloat) {
+    // This is the start of a vector that was scalarized into an unknown number
+    // of components. It doesn't matter how many there are. Allocate one of the
+    // notional 8-byte-aligned registers that map onto the argument stack, and
+ // shadow the register lost to alignment requirements.
+ if (ArgFlags.isSplit()) {
+ Reg = State.AllocateReg(FloatVectorIntRegs);
+ if (Reg == Mips::A2)
+ State.AllocateReg(Mips::A1);
+ else if (Reg == 0)
+ State.AllocateReg(Mips::A3);
+ } else {
+ // If we're an intermediate component of the split, we can just attempt to
+ // allocate a register directly.
+ Reg = State.AllocateReg(IntRegs);
+ }
+  } else if (ValVT == MVT::i32 ||
+             (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
Reg = State.AllocateReg(IntRegs);
// If this is the first part of an i64 arg,
// the allocated register must be either A0 or A2.
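
The pairing rules can be simulated with a toy allocator: only A0 or A2 may start a float-vector pair, and the register skipped for alignment is shadowed so later pieces do not reuse it. A sketch under those assumptions, not the CCState API:

    #include <cstdio>

    enum Reg { A0, A1, A2, A3, Stack };
    static bool Used[4];

    static Reg allocOne() { // plain scalar allocation, A0..A3 then stack
      for (int R = A0; R <= A3; ++R)
        if (!Used[R]) { Used[R] = true; return (Reg)R; }
      return Stack;
    }

    static Reg allocVectorFloatStart() {
      // Only A0 or A2 may start a pair; shadow the register lost to
      // the 8-byte alignment requirement.
      if (!Used[A0]) { Used[A0] = true; return A0; }
      if (!Used[A2]) { Used[A2] = true; Used[A1] = true; return A2; }
      Used[A3] = true; // no pair left: shadow A3 and go to the stack
      return Stack;
    }

    int main() {
      allocOne();                          // an i32 takes A0
      Reg Start = allocVectorFloatStart(); // vector start gets A2, kills A1
      Reg Next = allocOne();               // next vector piece lands in A3
      std::printf("start=%d next=%d\n", Start, Next); // start=2 next=3
    }
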
@@ -2645,7 +2990,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// caller side but removing it breaks the frame size calculation.
CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
- CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), Callee.getNode());
+ const ExternalSymbolSDNode *ES =
+ dyn_cast_or_null<const ExternalSymbolSDNode>(Callee.getNode());
+ CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(),
+ ES ? ES->getSymbol() : nullptr);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NextStackOffset = CCInfo.getNextStackOffset();
@@ -2679,7 +3027,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true);
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL);
SDValue StackPtr =
DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? Mips::SP_64 : Mips::SP,
@@ -2796,14 +3144,27 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
- bool IsPICCall = (ABI.IsN64() || IsPIC); // true if calls are translated to
- // jalr $25
+
SDValue CalleeLo;
EVT Ty = Callee.getValueType();
bool GlobalOrExternal = false, IsCallReloc = false;
+ // The long-calls feature is ignored in case of PIC.
+ // While we do not support -mshared / -mno-shared properly,
+  // Since we do not support -mshared / -mno-shared properly,
+  // we also ignore long-calls in case of -mabicalls.
+ // Get the address of the callee into a register to prevent
+    // use of the `jal` instruction for the direct call.
+ if (auto *N = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
+ else if (auto *N = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG)
+ : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG);
+ }
+
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- if (IsPICCall) {
+ if (IsPIC) {
const GlobalValue *Val = G->getGlobal();
InternalLinkage = Val->hasInternalLinkage();
@@ -2828,7 +3189,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
- if (!ABI.IsN64() && !IsPIC) // !N64 && static
+ if (!IsPIC) // static
Callee = DAG.getTargetExternalSymbol(
Sym, getPointerTy(DAG.getDataLayout()), MipsII::MO_NO_FLAG);
else if (LargeGOT) {
@@ -2836,7 +3197,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MipsII::MO_CALL_LO16, Chain,
FuncInfo->callPtrInfo(Sym));
IsCallReloc = true;
- } else { // N64 || PIC
+ } else { // PIC
Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
FuncInfo->callPtrInfo(Sym));
IsCallReloc = true;
@@ -2848,7 +3209,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<SDValue, 8> Ops(1, Chain);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- getOpndList(Ops, RegsToPass, IsPICCall, GlobalOrExternal, InternalLinkage,
+ getOpndList(Ops, RegsToPass, IsPIC, GlobalOrExternal, InternalLinkage,
IsCallReloc, CLI, Callee, Chain);
if (IsTailCall) {
@@ -2881,7 +3242,11 @@ SDValue MipsTargetLowering::LowerCallResult(
SmallVector<CCValAssign, 16> RVLocs;
MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI);
+
+ const ExternalSymbolSDNode *ES =
+ dyn_cast_or_null<const ExternalSymbolSDNode>(CLI.Callee.getNode());
+ CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy,
+ ES ? ES->getSymbol() : nullptr);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -3683,7 +4048,9 @@ bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
}
unsigned MipsTargetLowering::getJumpTableEncoding() const {
- if (ABI.IsN64())
+
+  // FIXME: For space reasons this should be EK_GPRel32BlockAddress.
+ if (ABI.IsN64() && isPositionIndependent())
return MachineJumpTableInfo::EK_GPRel64BlockAddress;
return TargetLowering::getJumpTableEncoding();
diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
index cddf090..0e47ed3 100644
--- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -37,14 +37,23 @@ namespace llvm {
// Tail call
TailCall,
- // Get the Higher 16 bits from a 32-bit immediate
+ // Get the Highest (63-48) 16 bits from a 64-bit immediate
+ Highest,
+
+ // Get the Higher (47-32) 16 bits from a 64-bit immediate
+ Higher,
+
+ // Get the High 16 bits from a 32/64-bit immediate
// No relation with Mips Hi register
Hi,
- // Get the Lower 16 bits from a 32-bit immediate
+ // Get the Lower 16 bits from a 32/64-bit immediate
// No relation with Mips Lo register
Lo,
+ // Get the High 16 bits from a 32 bit immediate for accessing the GOT.
+ GotHi,
+
// Handle gp_rel (small data/bss sections) relocation.
GPRel,
@@ -107,6 +116,7 @@ namespace llvm {
Ext,
Ins,
+ CIns,
// EXTR.W intrinsic nodes.
EXTP,
@@ -238,6 +248,33 @@ namespace llvm {
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
+ /// Return the register type for a given MVT, ensuring vectors are treated
+ /// as a series of gpr sized integers.
+ virtual MVT getRegisterTypeForCallingConv(MVT VT) const override;
+
+ /// Return the register type for a given MVT, ensuring vectors are treated
+ /// as a series of gpr sized integers.
+ virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
+ /// Return the number of registers for a given MVT, ensuring vectors are
+ /// treated as a series of gpr sized integers.
+ virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+ EVT VT) const override;
+
+ /// Break down vectors to the correct number of gpr sized integers.
+ virtual unsigned getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
+ /// Return the correct alignment for the current calling convention.
+ virtual unsigned
+ getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const override {
+ if (ArgTy->isVectorTy())
+ return std::min(DL.getABITypeAlignment(ArgTy), 8U);
+ return DL.getABITypeAlignment(ArgTy);
+ }
+
ISD::NodeType getExtendForAtomicOps() const override {
return ISD::SIGN_EXTEND;
}
@@ -297,7 +334,7 @@ namespace llvm {
}
bool isJumpTableRelative() const override {
- return getTargetMachine().isPositionIndependent() || ABI.IsN64();
+ return getTargetMachine().isPositionIndependent();
}
protected:
@@ -344,8 +381,8 @@ namespace llvm {
SelectionDAG &DAG, unsigned HiFlag,
unsigned LoFlag, SDValue Chain,
const MachinePointerInfo &PtrInfo) const {
- SDValue Hi =
- DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(N, Ty, DAG, HiFlag));
+ SDValue Hi = DAG.getNode(MipsISD::GotHi, DL, Ty,
+ getTargetNode(N, Ty, DAG, HiFlag));
Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
getTargetNode(N, Ty, DAG, LoFlag));
@@ -356,6 +393,8 @@ namespace llvm {
// computing a symbol's address in non-PIC mode:
//
// (add %hi(sym), %lo(sym))
+ //
+ // This method covers O32, N32 and N64 in sym32 mode.
template <class NodeTy>
SDValue getAddrNonPIC(NodeTy *N, const SDLoc &DL, EVT Ty,
SelectionDAG &DAG) const {
@@ -364,7 +403,37 @@ namespace llvm {
return DAG.getNode(ISD::ADD, DL, Ty,
DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
- }
+ }
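
Because %lo is applied by a sign-extending 16-bit add, the %hi half pre-compensates whenever bit 15 of the symbol's address is set. A worked example in plain C++:

    // lui/addiu materialization: hi is rounded so that adding the
    // sign-extended lo reproduces the original 32-bit address.
    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Sym = 0x1234ABCD;            // bit 15 is set
      uint32_t Hi = (Sym + 0x8000) >> 16;   // %hi(sym), pre-compensated
      int32_t Lo = (int16_t)(Sym & 0xffff); // %lo(sym), sign-extended
      assert((Hi << 16) + (uint32_t)Lo == Sym);
      return 0;
    }
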
+
+ // This method creates the following nodes, which are necessary for
+ // computing a symbol's address in non-PIC mode for N64.
+ //
+  // (add (shl (add (shl (add %highest(sym), %higher(sym)), 16), %hi(sym)),
+  // 16), %lo(sym))
+ //
+  // FIXME: This method is not efficient for (micro)MIPS64R6.
+ template <class NodeTy>
+ SDValue getAddrNonPICSym64(NodeTy *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG) const {
+ SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+ SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+
+ SDValue Highest =
+ DAG.getNode(MipsISD::Highest, DL, Ty,
+ getTargetNode(N, Ty, DAG, MipsII::MO_HIGHEST));
+ SDValue Higher = getTargetNode(N, Ty, DAG, MipsII::MO_HIGHER);
+ SDValue HigherPart =
+ DAG.getNode(ISD::ADD, DL, Ty, Highest,
+ DAG.getNode(MipsISD::Higher, DL, Ty, Higher));
+ SDValue Cst = DAG.getConstant(16, DL, MVT::i32);
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, Ty, HigherPart, Cst);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, Ty, Shift,
+ DAG.getNode(MipsISD::Hi, DL, Ty, Hi));
+ SDValue Shift2 = DAG.getNode(ISD::SHL, DL, Ty, Add, Cst);
+
+ return DAG.getNode(ISD::ADD, DL, Ty, Shift2,
+ DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+ }
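
The chain computes highest<<48 + higher<<32 + hi<<16 + lo through the two shift-by-16 steps. A worked check in plain C++ with simple unsigned 16-bit chunks (the real relocations additionally carry sign adjustments between adjacent chunks):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Sym = 0x1122334455667788ULL;
      uint64_t Highest = (Sym >> 48) & 0xffff; // %highest
      uint64_t Higher  = (Sym >> 32) & 0xffff; // %higher
      uint64_t Hi      = (Sym >> 16) & 0xffff; // %hi
      uint64_t Lo      = Sym & 0xffff;         // %lo
      // ((((highest << 16) + higher) << 16) + hi) << 16, then + lo:
      uint64_t Addr = (((((Highest << 16) + Higher) << 16) + Hi) << 16) + Lo;
      assert(Addr == Sym);
      return 0;
    }
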
// This method creates the following nodes, which are necessary for
// computing a symbol's address using gp-relative addressing:
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
index df42d56..0333fe6 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -443,8 +443,17 @@ let AdditionalPredicates = [NotInMicroMips] in {
}
def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
bitconvert>, MFC1_FM<0>;
+def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>,
+ FGR_64 {
+ let DecoderNamespace = "Mips64";
+}
def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
bitconvert>, MFC1_FM<4>;
+def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>,
+ FGR_64 {
+ let DecoderNamespace = "Mips64";
+}
+
let AdditionalPredicates = [NotInMicroMips] in {
def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>,
MFC1_FM<3>, ISA_MIPS32R2, FGR_32;
@@ -557,11 +566,11 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
defm FSUB : ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
- MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+ MADDS_FM<4, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6, MADD4;
def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
- MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
+ MADDS_FM<5, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6, MADD4;
-let AdditionalPredicates = [NoNaNsFPMath] in {
+let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
MADDS_FM<6, 0>, INSN_MIPS4_32R2_NOT_32R6_64R6;
def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
@@ -569,11 +578,11 @@ let AdditionalPredicates = [NoNaNsFPMath] in {
}
def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32, MADD4;
def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32, MADD4;
-let AdditionalPredicates = [NoNaNsFPMath] in {
+let AdditionalPredicates = [NoNaNsFPMath, HasMadd4] in {
def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32;
def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
@@ -582,12 +591,12 @@ let AdditionalPredicates = [NoNaNsFPMath] in {
let DecoderNamespace = "Mips64" in {
def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
- MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ MADDS_FM<4, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64, MADD4;
def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
- MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
+ MADDS_FM<5, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64, MADD4;
}
-let AdditionalPredicates = [NoNaNsFPMath],
+let AdditionalPredicates = [NoNaNsFPMath, HasMadd4],
DecoderNamespace = "Mips64" in {
def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
MADDS_FM<6, 1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_64;
@@ -681,6 +690,29 @@ def PseudoTRUNC_W_D : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
"trunc.w.d\t$fd, $fs, $rs">,
FGR_64, HARDFLOAT;
+def LoadImmSingleGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins imm64:$fpimm),
+ "li.s\t$rd, $fpimm">;
+
+def LoadImmSingleFGR : MipsAsmPseudoInst<(outs StrictlyFGR32Opnd:$rd),
+ (ins imm64:$fpimm),
+ "li.s\t$rd, $fpimm">,
+ HARDFLOAT;
+
+def LoadImmDoubleGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins imm64:$fpimm),
+ "li.d\t$rd, $fpimm">;
+
+def LoadImmDoubleFGR_32 : MipsAsmPseudoInst<(outs StrictlyAFGR64Opnd:$rd),
+ (ins imm64:$fpimm),
+ "li.d\t$rd, $fpimm">,
+ FGR_32, HARDFLOAT;
+
+def LoadImmDoubleFGR : MipsAsmPseudoInst<(outs StrictlyFGR64Opnd:$rd),
+ (ins imm64:$fpimm),
+ "li.d\t$rd, $fpimm">,
+ FGR_64, HARDFLOAT;
+
//===----------------------------------------------------------------------===//
// InstAliases.
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
index 19af191..4adf77f 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -103,12 +103,9 @@ void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
for (unsigned i = 1; i < Cond.size(); ++i) {
- if (Cond[i].isReg())
- MIB.addReg(Cond[i].getReg());
- else if (Cond[i].isImm())
- MIB.addImm(Cond[i].getImm());
- else
- assert(false && "Cannot copy operand");
+ assert((Cond[i].isImm() || Cond[i].isReg()) &&
+ "Cannot copy operand for conditional branch!");
+ MIB.add(Cond[i]);
}
MIB.addMBB(TBB);
}
@@ -482,7 +479,7 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
MIB->RemoveOperand(0);
for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J) {
- MIB.addOperand(I->getOperand(J));
+ MIB.add(I->getOperand(J));
}
MIB.addImm(0);
@@ -492,7 +489,7 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
if (BranchWithZeroOperand && (unsigned)ZeroOperandPosition == J)
continue;
- MIB.addOperand(I->getOperand(J));
+ MIB.add(I->getOperand(J));
}
}
@@ -501,3 +498,31 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
return MIB;
}
+
+bool MipsInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ assert(!MI.isBundle() &&
+ "TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MCID.isCommutable())
+ return false;
+
+ switch (MI.getOpcode()) {
+ case Mips::DPADD_U_H:
+ case Mips::DPADD_U_W:
+ case Mips::DPADD_U_D:
+ case Mips::DPADD_S_H:
+ case Mips::DPADD_S_W:
+ case Mips::DPADD_S_D: {
+ // The first operand is both input and output, so it should not commute
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
+ return false;
+ return true;
+ }
+ }
+ return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+}
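
Only the multiplicand operands (indices 2 and 3) of the dot-product-accumulate may swap; operand 1 is tied to the destination accumulator and must stay put. A tiny arithmetic illustration, not MachineInstr-level code:

    // dpadd computes acc + a*b elementwise; the accumulator is tied.
    #include <cassert>
    #include <cstdint>

    static int32_t dpadd(int32_t Acc, int16_t A0, int16_t A1,
                         int16_t B0, int16_t B1) {
      return Acc + (int32_t)A0 * B0 + (int32_t)A1 * B1;
    }

    int main() {
      // Swapping the two source vectors (operands 2 and 3) is safe...
      assert(dpadd(10, 3, -4, 5, 6) == dpadd(10, 5, 6, 3, -4));
      // ...swapping the accumulator with a source operand is not.
      assert(dpadd(10, 3, -4, 5, 6) != dpadd(3, 10, -4, 5, 6));
      return 0;
    }
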
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
index 347b918..45d700d 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -135,6 +135,9 @@ public:
MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
MachineBasicBlock::iterator I) const;
+ bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const override;
+
protected:
bool isZeroImm(const MachineOperand &op) const;
diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
index 5bc4833..89a5854 100644
--- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -21,7 +21,7 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>,
SDTCisSameAs<3, 4>,
SDTCisInt<4>]>;
-def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
@@ -59,10 +59,20 @@ def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink,
// Hi and Lo nodes are used to handle global addresses. Used on
// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol
// static model. (nothing to do with Mips Registers Hi and Lo)
+
+// Hi is the odd node out: on MIPS64 it can expand to either daddiu when
+// using static relocations with 64 bit symbols, or lui when using 32 bit
+// symbols.
+def MipsHigher : SDNode<"MipsISD::Higher", SDTIntUnaryOp>;
+def MipsHighest : SDNode<"MipsISD::Highest", SDTIntUnaryOp>;
def MipsHi : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
def MipsLo : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+
def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+// Hi node for accessing the GOT.
+def MipsGotHi : SDNode<"MipsISD::GotHi", SDTIntUnaryOp>;
+
// TlsGd node is used to handle General Dynamic TLS
def MipsTlsGd : SDNode<"MipsISD::TlsGd", SDTIntUnaryOp>;
@@ -128,6 +138,7 @@ def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>;
def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>;
+def MipsCIns : SDNode<"MipsISD::CIns", SDT_Ext>;
def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -205,6 +216,10 @@ def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
AssemblerPredicate<"FeatureCnMips">;
def NotCnMips : Predicate<"!Subtarget->hasCnMips()">,
AssemblerPredicate<"!FeatureCnMips">;
+def IsSym32 : Predicate<"Subtarget->hasSym32()">,
+              AssemblerPredicate<"FeatureSym32">;
+def IsSym64 : Predicate<"!Subtarget->hasSym32()">,
+ AssemblerPredicate<"!FeatureSym32">;
def RelocNotPIC : Predicate<"!TM.isPositionIndependent()">;
def RelocPIC : Predicate<"TM.isPositionIndependent()">;
def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
@@ -223,7 +238,10 @@ def HasEVA : Predicate<"Subtarget->hasEVA()">,
AssemblerPredicate<"FeatureEVA,FeatureMips32r2">;
def HasMSA : Predicate<"Subtarget->hasMSA()">,
AssemblerPredicate<"FeatureMSA">;
-
+def HasMadd4 : Predicate<"!Subtarget->disableMadd4()">,
+ AssemblerPredicate<"!FeatureMadd4">;
+def HasMT : Predicate<"Subtarget->hasMT()">,
+ AssemblerPredicate<"FeatureMT">;
//===----------------------------------------------------------------------===//
// Mips GPR size adjectives.
@@ -237,6 +255,14 @@ class PTR_32 { list<Predicate> PTRPredicates = [IsPTR32bit]; }
class PTR_64 { list<Predicate> PTRPredicates = [IsPTR64bit]; }
//===----------------------------------------------------------------------===//
+// Mips Symbol size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class SYM_32 { list<Predicate> SYMPredicates = [IsSym32]; }
+class SYM_64 { list<Predicate> SYMPredicates = [IsSym64]; }
+
+//===----------------------------------------------------------------------===//
// Mips ISA/ASE membership and instruction group membership adjectives.
// They are mutually exclusive.
//===----------------------------------------------------------------------===//
@@ -357,6 +383,10 @@ class ASE_MSA64 {
list<Predicate> InsnPredicates = [HasMSA, HasMips64];
}
+class ASE_MT {
+  list<Predicate> InsnPredicates = [HasMT];
+}
+
// Class used for separating microMIPSr6 and microMIPS (r3) instruction.
// It can be used only on instructions that don't inherit PredicateControl.
class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl {
@@ -367,6 +397,10 @@ class ASE_NOT_DSP {
list<Predicate> InsnPredicates = [NotDSP];
}
+class MADD4 {
+ list<Predicate> AdditionalPredicates = [HasMadd4];
+}
+
//===----------------------------------------------------------------------===//
class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
@@ -519,7 +553,7 @@ def UImm32CoercedAsmOperandClass : UImmAnyAsmOperandClass<33, []> {
def SImm32RelaxedAsmOperandClass
: SImmAsmOperandClass<32, [UImm32CoercedAsmOperandClass]> {
let Name = "SImm32_Relaxed";
- let PredicateMethod = "isAnyImm<32>";
+ let PredicateMethod = "isAnyImm<33>";
let DiagnosticType = "SImm32_Relaxed";
}
def SImm32AsmOperandClass
@@ -1150,6 +1184,10 @@ def immZExt5Plus33 : PatLeaf<(imm), [{
return isUInt<5>(N->getZExtValue() - 33);
}]>;
+def immZExt5To31 : SDNodeXForm<imm, [{
+ return getImm(N, 31 - N->getZExtValue());
+}]>;
+
// True if (N + 1) fits in 16-bit field.
def immSExt16Plus1 : PatLeaf<(imm), [{
return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
@@ -1692,8 +1730,8 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in {
}
let Defs = [SP], Uses = [SP], hasSideEffects = 1 in {
-def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt),
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -2281,9 +2319,38 @@ def SEQIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
def : MipsInstAlias<"seq $rd, $imm",
(SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>,
NOT_ASE_CNMIPS;
+
+def MULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+ simm32_relaxed:$imm),
+ "mul\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MULOMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+ GPR32Opnd:$rt),
+ "mulo\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+def MULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+ GPR32Opnd:$rt),
+ "mulou\t$rd, $rs, $rt">,
+ ISA_MIPS1_NOT_32R6_64R6;
+
//===----------------------------------------------------------------------===//
// Instruction aliases
//===----------------------------------------------------------------------===//
+
+multiclass OneOrTwoOperandMacroImmediateAlias<string Mnemonic,
+ Instruction Opcode,
+ RegisterOperand RO = GPR32Opnd,
+ Operand Imm = simm32_relaxed> {
+  def : MipsInstAlias<!strconcat(Mnemonic, " $rs, $rt, $imm"),
+ (Opcode RO:$rs,
+ RO:$rt,
+ Imm:$imm), 0>;
+  def : MipsInstAlias<!strconcat(Mnemonic, " $rs, $imm"),
+ (Opcode RO:$rs,
+ RO:$rs,
+ Imm:$imm), 0>;
+}
+
def : MipsInstAlias<"move $dst, $src",
(OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
GPR_32 {
@@ -2296,26 +2363,7 @@ def : MipsInstAlias<"move $dst, $src",
}
def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
- "addu $rs, $rt, $imm",
- (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
- "addu $rs, $imm",
- (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
- "add $rs, $rt, $imm",
- (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
- "add $rs, $imm",
- (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>,
- ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
- "and $rs, $rt, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
- "and $rs, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+
def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
let Predicates = [NotInMicroMips] in {
def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
@@ -2343,36 +2391,26 @@ let AdditionalPredicates = [NotInMicroMips] in {
"sgtu $$rs, $rt",
(SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
def : MipsInstAlias<
- "slt $rs, $rt, $imm",
- (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "sltu $rt, $rs, $imm",
- (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "and $rs, $rt, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "and $rs, $imm",
- (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "xor $rs, $rt, $imm",
- (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "xor $rs, $imm",
- (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "or $rs, $rt, $imm",
- (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
- "or $rs, $imm",
- (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
- def : MipsInstAlias<
"not $rt, $rs",
(NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
def : MipsInstAlias<
"not $rt",
(NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi>, ISA_MIPS1_NOT_32R6_64R6;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu>;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi>, GPR_32;
+
+ defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu>, GPR_32;
}
def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
@@ -2445,6 +2483,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
def : MipsInstAlias<"sync",
(SYNC 0), 1>, ISA_MIPS2;
+
+def : MipsInstAlias<"mulo $rs, $rt",
+ (MULOMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"mulou $rs, $rt",
+ (MULOUMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -2472,9 +2518,12 @@ def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
"jal\t$rs"> ;
-def NORImm : MipsAsmPseudoInst<
- (outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm32:$imm),
- "nor\t$rs, $rt, $imm"> ;
+class NORIMM_DESC_BASE<RegisterOperand RO, DAGOperand Imm> :
+ MipsAsmPseudoInst<(outs RO:$rs), (ins RO:$rt, Imm:$imm),
+ "nor\t$rs, $rt, $imm">;
+def NORImm : NORIMM_DESC_BASE<GPR32Opnd, simm32_relaxed>, GPR_32;
+def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs,
+ simm32_relaxed:$imm)>, GPR_32;
let hasDelaySlot = 1, isCTI = 1 in {
def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
@@ -2512,6 +2561,9 @@ class CondBranchImmPseudo<string instr_asm> :
MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
!strconcat(instr_asm, "\t$rs, $imm, $offset")>;
+def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6;
+def BNELImmMacro : CondBranchImmPseudo<"bnel">, ISA_MIPS2_NOT_32R6_64R6;
+
def BLTImmMacro : CondBranchImmPseudo<"blt">;
def BLEImmMacro : CondBranchImmPseudo<"ble">;
def BGEImmMacro : CondBranchImmPseudo<"bge">;
@@ -2535,34 +2587,46 @@ def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
// Once the tablegen-erated errors are made better, this needs to be fixed and
// predicates need to be restored.
-def SDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+def SDivMacro : MipsAsmPseudoInst<(outs GPR32NonZeroOpnd:$rd),
(ins GPR32Opnd:$rs, GPR32Opnd:$rt),
"div\t$rd, $rs, $rt">,
ISA_MIPS1_NOT_32R6_64R6;
+def SDivIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32:$imm),
+ "div\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
def UDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
(ins GPR32Opnd:$rs, GPR32Opnd:$rt),
"divu\t$rd, $rs, $rt">,
ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"div $rt, $rs", (SDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
- GPR32Opnd:$rs), 0>,
+def UDivIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32:$imm),
+ "divu\t$rd, $rs, $imm">,
+ ISA_MIPS1_NOT_32R6_64R6;
+
+def : MipsInstAlias<"div $rs, $rt", (SDIV GPR32ZeroOpnd:$rs,
+ GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rs, $rt", (SDivMacro GPR32NonZeroOpnd:$rs,
+ GPR32NonZeroOpnd:$rs,
+ GPR32Opnd:$rt), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rd, $imm", (SDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32:$imm), 0>,
ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+
+def : MipsInstAlias<"divu $rt, $rs", (UDIV GPR32ZeroOpnd:$rt,
+ GPR32Opnd:$rs), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32NonZeroOpnd:$rt,
+ GPR32NonZeroOpnd:$rt,
GPR32Opnd:$rs), 0>,
ISA_MIPS1_NOT_32R6_64R6;
-def DSDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
- (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "ddiv\t$rd, $rs, $rt">,
- ISA_MIPS64_NOT_64R6;
-def DUDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
- (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
- "ddivu\t$rd, $rs, $rt">,
- ISA_MIPS64_NOT_64R6;
-def : MipsInstAlias<"ddiv $rt, $rs", (DSDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
- GPR32Opnd:$rs), 0>,
- ISA_MIPS64_NOT_64R6;
-def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
- GPR32Opnd:$rs), 0>,
- ISA_MIPS64_NOT_64R6;
+
+def : MipsInstAlias<"divu $rd, $imm", (UDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+ simm32:$imm), 0>,
+ ISA_MIPS1_NOT_32R6_64R6;
def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
"ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
@@ -2647,30 +2711,40 @@ def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
(TAILCALL texternalsym:$dst)>;
// hi/lo relocs
-def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
-def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
-def : MipsPat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
-def : MipsPat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
-def : MipsPat<(MipsHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
-def : MipsPat<(MipsHi texternalsym:$in), (LUi texternalsym:$in)>;
-
-def : MipsPat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
-def : MipsPat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
-def : MipsPat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
-def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
-def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
-def : MipsPat<(MipsLo texternalsym:$in), (ADDiu ZERO, texternalsym:$in)>;
-
-def : MipsPat<(add GPR32:$hi, (MipsLo tglobaladdr:$lo)),
- (ADDiu GPR32:$hi, tglobaladdr:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tblockaddress:$lo)),
- (ADDiu GPR32:$hi, tblockaddress:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tjumptable:$lo)),
- (ADDiu GPR32:$hi, tjumptable:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tconstpool:$lo)),
- (ADDiu GPR32:$hi, tconstpool:$lo)>;
-def : MipsPat<(add GPR32:$hi, (MipsLo tglobaltlsaddr:$lo)),
- (ADDiu GPR32:$hi, tglobaltlsaddr:$lo)>;
+multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
+ Register ZeroReg, RegisterOperand GPROpnd> {
+ def : MipsPat<(MipsHi tglobaladdr:$in), (Lui tglobaladdr:$in)>;
+ def : MipsPat<(MipsHi tblockaddress:$in), (Lui tblockaddress:$in)>;
+ def : MipsPat<(MipsHi tjumptable:$in), (Lui tjumptable:$in)>;
+ def : MipsPat<(MipsHi tconstpool:$in), (Lui tconstpool:$in)>;
+ def : MipsPat<(MipsHi tglobaltlsaddr:$in), (Lui tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsHi texternalsym:$in), (Lui texternalsym:$in)>;
+
+ def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>;
+ def : MipsPat<(MipsLo tblockaddress:$in),
+ (Addiu ZeroReg, tblockaddress:$in)>;
+ def : MipsPat<(MipsLo tjumptable:$in), (Addiu ZeroReg, tjumptable:$in)>;
+ def : MipsPat<(MipsLo tconstpool:$in), (Addiu ZeroReg, tconstpool:$in)>;
+ def : MipsPat<(MipsLo tglobaltlsaddr:$in),
+ (Addiu ZeroReg, tglobaltlsaddr:$in)>;
+ def : MipsPat<(MipsLo texternalsym:$in), (Addiu ZeroReg, texternalsym:$in)>;
+
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)),
+ (Addiu GPROpnd:$hi, tglobaladdr:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tblockaddress:$lo)),
+ (Addiu GPROpnd:$hi, tblockaddress:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tjumptable:$lo)),
+ (Addiu GPROpnd:$hi, tjumptable:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tconstpool:$lo)),
+ (Addiu GPROpnd:$hi, tconstpool:$lo)>;
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaltlsaddr:$lo)),
+ (Addiu GPROpnd:$hi, tglobaltlsaddr:$lo)>;
+}
+
+defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>;
+
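+// The multiclass keeps these patterns reusable across register widths. A
+// 64-bit instantiation would, hypothetically, look like:
+//
+//   defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>;
+//
+// (illustrative only; the real 64-bit patterns live in Mips64InstrInfo.td).
+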
+def : MipsPat<(MipsGotHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : MipsPat<(MipsGotHi texternalsym:$in), (LUi texternalsym:$in)>;
// gp_rel relocs
def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
@@ -2850,6 +2924,10 @@ include "MipsMSAInstrInfo.td"
include "MipsEVAInstrFormats.td"
include "MipsEVAInstrInfo.td"
+// MT
+include "MipsMTInstrFormats.td"
+include "MipsMTInstrInfo.td"
+
// Micromips
include "MicroMipsInstrFormats.td"
include "MicroMipsInstrInfo.td"
diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
index 1087d0e..b95f115 100644
--- a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp
@@ -13,20 +13,31 @@
// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
//===----------------------------------------------------------------------===//
-#include "Mips.h"
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MCTargetDesc/MipsBaseInfo.h"
#include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -47,24 +58,25 @@ static cl::opt<bool> ForceLongBranch(
cl::Hidden);
namespace {
+
typedef MachineBasicBlock::iterator Iter;
typedef MachineBasicBlock::reverse_iterator ReverseIter;
struct MBBInfo {
- uint64_t Size, Address;
- bool HasLongBranch;
- MachineInstr *Br;
+ uint64_t Size = 0;
+ uint64_t Address;
+ bool HasLongBranch = false;
+ MachineInstr *Br = nullptr;
- MBBInfo() : Size(0), HasLongBranch(false), Br(nullptr) {}
+ MBBInfo() = default;
};
class MipsLongBranch : public MachineFunctionPass {
-
public:
static char ID;
- MipsLongBranch(TargetMachine &tm)
- : MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()),
- ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
+
+ MipsLongBranch()
+ : MachineFunctionPass(ID), ABI(MipsABIInfo::Unknown()) {}
StringRef getPassName() const override { return "Mips Long Branch"; }
@@ -83,7 +95,6 @@ namespace {
MachineBasicBlock *MBBOpnd);
void expandToLongBranch(MBBInfo &Info);
- const TargetMachine &TM;
MachineFunction *MF;
SmallVector<MBBInfo, 16> MBBInfos;
bool IsPIC;
@@ -92,13 +103,8 @@ namespace {
};
char MipsLongBranch::ID = 0;
-} // end of anonymous namespace
-/// createMipsLongBranchPass - Returns a pass that converts branches to long
-/// branches.
-FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
- return new MipsLongBranch(tm);
-}
+} // end anonymous namespace
/// Iterate over list of Br's operands and search for a MachineBasicBlock
/// operand.
@@ -461,6 +467,12 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
static_cast<const MipsSubtarget &>(F.getSubtarget());
const MipsInstrInfo *TII =
static_cast<const MipsInstrInfo *>(STI.getInstrInfo());
+
+
+ const TargetMachine& TM = F.getTarget();
+ IsPIC = TM.isPositionIndependent();
+ ABI = static_cast<const MipsTargetMachine &>(TM).getABI();
+
LongBranchSeqSize =
!IsPIC ? 2 : (ABI.IsN64() ? 10 : (!STI.isTargetNaCl() ? 9 : 10));
@@ -530,3 +542,7 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
return true;
}
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsLongBranchPass() { return new MipsLongBranch(); }
diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
index 8b04fcb..bf79f0f 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -3781,6 +3781,80 @@ let Predicates = [HasMSA] in {
ISA_MIPS1_NOT_32R6_64R6;
}
+def vsplati64_imm_eq_63 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+ APInt Imm;
+ SDNode *BV = N->getOperand(0).getNode();
+ EVT EltTy = N->getValueType(0).getVectorElementType();
+
+ return selectVSplat(BV, Imm, EltTy.getSizeInBits()) &&
+ Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 63;
+}]>;
+
+def immi32Cst7 : ImmLeaf<i32, [{return isUInt<32>(Imm) && Imm == 7;}]>;
+def immi32Cst15 : ImmLeaf<i32, [{return isUInt<32>(Imm) && Imm == 15;}]>;
+def immi32Cst31 : ImmLeaf<i32, [{return isUInt<32>(Imm) && Imm == 31;}]>;
+
+def vsplati8imm7 : PatFrag<(ops node:$wt),
+ (and node:$wt, (vsplati8 immi32Cst7))>;
+def vsplati16imm15 : PatFrag<(ops node:$wt),
+ (and node:$wt, (vsplati16 immi32Cst15))>;
+def vsplati32imm31 : PatFrag<(ops node:$wt),
+ (and node:$wt, (vsplati32 immi32Cst31))>;
+def vsplati64imm63 : PatFrag<(ops node:$wt),
+ (and node:$wt, vsplati64_imm_eq_63)>;
+
+class MSAShiftPat<SDNode Node, ValueType VT, MSAInst Insn, dag Vec> :
+ MSAPat<(VT (Node VT:$ws, (VT (and VT:$wt, Vec)))),
+ (VT (Insn VT:$ws, VT:$wt))>;
+
+class MSABitPat<SDNode Node, ValueType VT, MSAInst Insn, PatFrag Frag> :
+ MSAPat<(VT (Node VT:$ws, (shl vsplat_imm_eq_1, (Frag VT:$wt)))),
+ (VT (Insn VT:$ws, VT:$wt))>;
+
+multiclass MSAShiftPats<SDNode Node, string Insn> {
+ def : MSAShiftPat<Node, v16i8, !cast<MSAInst>(Insn#_B),
+ (vsplati8 immi32Cst7)>;
+ def : MSAShiftPat<Node, v8i16, !cast<MSAInst>(Insn#_H),
+ (vsplati16 immi32Cst15)>;
+ def : MSAShiftPat<Node, v4i32, !cast<MSAInst>(Insn#_W),
+ (vsplati32 immi32Cst31)>;
+ def : MSAPat<(v2i64 (Node v2i64:$ws, (v2i64 (and v2i64:$wt,
+ vsplati64_imm_eq_63)))),
+ (v2i64 (!cast<MSAInst>(Insn#_D) v2i64:$ws, v2i64:$wt))>;
+}
+
+multiclass MSABitPats<SDNode Node, string Insn> {
+ def : MSABitPat<Node, v16i8, !cast<MSAInst>(Insn#_B), vsplati8imm7>;
+ def : MSABitPat<Node, v8i16, !cast<MSAInst>(Insn#_H), vsplati16imm15>;
+ def : MSABitPat<Node, v4i32, !cast<MSAInst>(Insn#_W), vsplati32imm31>;
+ def : MSAPat<(Node v2i64:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+ (vsplati64imm63 v2i64:$wt))),
+ (v2i64 (!cast<MSAInst>(Insn#_D) v2i64:$ws, v2i64:$wt))>;
+}
+
+defm : MSAShiftPats<shl, "SLL">;
+defm : MSAShiftPats<srl, "SRL">;
+defm : MSAShiftPats<sra, "SRA">;
+defm : MSABitPats<xor, "BNEG">;
+defm : MSABitPats<or, "BSET">;
+
+def : MSAPat<(and v16i8:$ws, (xor (shl vsplat_imm_eq_1,
+ (vsplati8imm7 v16i8:$wt)),
+ immAllOnesV)),
+ (v16i8 (BCLR_B v16i8:$ws, v16i8:$wt))>;
+def : MSAPat<(and v8i16:$ws, (xor (shl vsplat_imm_eq_1,
+ (vsplati16imm15 v8i16:$wt)),
+ immAllOnesV)),
+ (v8i16 (BCLR_H v8i16:$ws, v8i16:$wt))>;
+def : MSAPat<(and v4i32:$ws, (xor (shl vsplat_imm_eq_1,
+ (vsplati32imm31 v4i32:$wt)),
+ immAllOnesV)),
+ (v4i32 (BCLR_W v4i32:$ws, v4i32:$wt))>;
+def : MSAPat<(and v2i64:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
+ (vsplati64imm63 v2i64:$wt)),
+ (bitconvert (v4i32 immAllOnesV)))),
+ (v2i64 (BCLR_D v2i64:$ws, v2i64:$wt))>;
+
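+// Worked example for the shift patterns above (a sketch, not taken from the
+// test suite): v4i32 shift amounts are taken modulo 32, so IR of the form
+//
+//   %m = and <4 x i32> %wt, <i32 31, i32 31, i32 31, i32 31>
+//   %r = shl <4 x i32> %ws, %m
+//
+// matches the v4i32 MSAShiftPat and selects a single "sll.w $wd, $ws, $wt",
+// folding away the redundant mask.
+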
// Vector extraction with fixed index.
//
// Extracting 32-bit values on MSA32 should always use COPY_S_W rather than
diff --git a/contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td
new file mode 100644
index 0000000..64bee5b
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMTInstrFormats.td
@@ -0,0 +1,78 @@
+//===-- MipsMTInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe the MIPS MT instructions format
+//
+// opcode - operation code.
+// rt - destination register
+//
+//===----------------------------------------------------------------------===//
+
+class MipsMTInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+ PredicateControl {
+ let DecoderNamespace = "Mips";
+ let EncodingPredicates = [HasStdEnc];
+}
+
+class OPCODE1<bits<1> Val> {
+ bits<1> Value = Val;
+}
+
+def OPCODE_SC_D : OPCODE1<0b0>;
+def OPCODE_SC_E : OPCODE1<0b1>;
+
+class FIELD5<bits<5> Val> {
+ bits<5> Value = Val;
+}
+
+def FIELD5_1_DMT_EMT : FIELD5<0b00001>;
+def FIELD5_2_DMT_EMT : FIELD5<0b01111>;
+def FIELD5_1_2_DVPE_EVPE : FIELD5<0b00000>;
+
+class COP0_MFMC0_MT<FIELD5 Op1, FIELD5 Op2, OPCODE1 sc> : MipsMTInst {
+ bits<32> Inst;
+
+ bits<5> rt;
+ let Inst{31-26} = 0b010000; // COP0
+ let Inst{25-21} = 0b01011; // MFMC0
+ let Inst{20-16} = rt;
+ let Inst{15-11} = Op1.Value;
+ let Inst{10-6} = Op2.Value;
+ let Inst{5} = sc.Value;
+ let Inst{4-3} = 0b00;
+ let Inst{2-0} = 0b001;
+}
+
+class SPECIAL3_MT_FORK : MipsMTInst {
+ bits<32> Inst;
+
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> rd;
+ let Inst{31-26} = 0b011111; // SPECIAL3
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0b00000;
+ let Inst{5-0} = 0b001000; // FORK
+}
+
+class SPECIAL3_MT_YIELD : MipsMTInst {
+ bits<32> Inst;
+
+ bits<5> rs;
+ bits<5> rd;
+ let Inst{31-26} = 0b011111; // SPECIAL3
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = 0b00000;
+  let Inst{5-0} = 0b001001; // YIELD
+}
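+
+// For example, with rt = $zero the fields above yield 0x41600bc1 for dmt and
+// 0x41600be1 for emt (derived by hand from this layout; not verified against
+// a disassembler here).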
diff --git a/contrib/llvm/lib/Target/Mips/MipsMTInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMTInstrInfo.td
new file mode 100644
index 0000000..ab6693f
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/MipsMTInstrInfo.td
@@ -0,0 +1,98 @@
+//===-- MipsMTInstrInfo.td - Mips MT Instruction Infos -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MIPS MT Instruction Encodings
+//===----------------------------------------------------------------------===//
+
+class DMT_ENC : COP0_MFMC0_MT<FIELD5_1_DMT_EMT, FIELD5_2_DMT_EMT,
+ OPCODE_SC_D>;
+
+class EMT_ENC : COP0_MFMC0_MT<FIELD5_1_DMT_EMT, FIELD5_2_DMT_EMT,
+ OPCODE_SC_E>;
+
+class DVPE_ENC : COP0_MFMC0_MT<FIELD5_1_2_DVPE_EVPE, FIELD5_1_2_DVPE_EVPE,
+ OPCODE_SC_D>;
+
+class EVPE_ENC : COP0_MFMC0_MT<FIELD5_1_2_DVPE_EVPE, FIELD5_1_2_DVPE_EVPE,
+ OPCODE_SC_E>;
+
+class FORK_ENC : SPECIAL3_MT_FORK;
+
+class YIELD_ENC : SPECIAL3_MT_YIELD;
+
+//===----------------------------------------------------------------------===//
+// MIPS MT Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+class MT_1R_DESC_BASE<string instr_asm, InstrItinClass Itin = NoItinerary> {
+ dag OutOperandList = (outs GPR32Opnd:$rt);
+ dag InOperandList = (ins);
+ string AsmString = !strconcat(instr_asm, "\t$rt");
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = Itin;
+}
+
+class FORK_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rs, GPR32Opnd:$rd);
+ dag InOperandList = (ins GPR32Opnd:$rt);
+ string AsmString = "fork\t$rd, $rs, $rt";
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_FORK;
+}
+
+class YIELD_DESC {
+ dag OutOperandList = (outs GPR32Opnd:$rd);
+ dag InOperandList = (ins GPR32Opnd:$rs);
+ string AsmString = "yield\t$rd, $rs";
+ list<dag> Pattern = [];
+ InstrItinClass Itinerary = II_YIELD;
+}
+
+class DMT_DESC : MT_1R_DESC_BASE<"dmt", II_DMT>;
+
+class EMT_DESC : MT_1R_DESC_BASE<"emt", II_EMT>;
+
+class DVPE_DESC : MT_1R_DESC_BASE<"dvpe", II_DVPE>;
+
+class EVPE_DESC : MT_1R_DESC_BASE<"evpe", II_EVPE>;
+
+//===----------------------------------------------------------------------===//
+// MIPS MT Instruction Definitions
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 1, isNotDuplicable = 1,
+ AdditionalPredicates = [NotInMicroMips] in {
+ def DMT : DMT_ENC, DMT_DESC, ASE_MT;
+
+ def EMT : EMT_ENC, EMT_DESC, ASE_MT;
+
+ def DVPE : DVPE_ENC, DVPE_DESC, ASE_MT;
+
+ def EVPE : EVPE_ENC, EVPE_DESC, ASE_MT;
+
+ def FORK : FORK_ENC, FORK_DESC, ASE_MT;
+
+ def YIELD : YIELD_ENC, YIELD_DESC, ASE_MT;
+}
+
+//===----------------------------------------------------------------------===//
+// MIPS MT Instruction Definitions
+//===----------------------------------------------------------------------===//
+
+let AdditionalPredicates = [NotInMicroMips] in {
+ def : MipsInstAlias<"dmt", (DMT ZERO), 1>, ASE_MT;
+
+ def : MipsInstAlias<"emt", (EMT ZERO), 1>, ASE_MT;
+
+ def : MipsInstAlias<"dvpe", (DVPE ZERO), 1>, ASE_MT;
+
+ def : MipsInstAlias<"evpe", (EVPE ZERO), 1>, ASE_MT;
+
+ def : MipsInstAlias<"yield $rs", (YIELD ZERO, GPR32Opnd:$rs), 1>, ASE_MT;
+}
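+
+// Example assembly accepted by these definitions (a sketch; the MT ASE must
+// be enabled, e.g. with -mattr=+mt):
+//
+//   dmt              # alias of "dmt $zero": disable multi-threading
+//   emt $4           # re-enable; previous control value returned in $4
+//   fork $5, $6, $7  # allocate a thread context (operand roles per the
+//                    # MT ASE specification)
+//   yield $2, $3     # yield with qualifier in $3, result in $2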
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
index d0609b1..e01c03d 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -7,16 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsBaseInfo.h"
-#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
@@ -24,7 +23,7 @@ static cl::opt<bool>
FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
cl::desc("Always use $gp as the global base register."));
-MipsFunctionInfo::~MipsFunctionInfo() {}
+MipsFunctionInfo::~MipsFunctionInfo() = default;
bool MipsFunctionInfo::globalBaseRegSet() const {
return GlobalBaseReg;
@@ -54,14 +53,15 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {
}
void MipsFunctionInfo::createEhDataRegsFI() {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
for (int I = 0; I < 4; ++I) {
- const TargetRegisterClass *RC =
+ const TargetRegisterClass &RC =
static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64()
- ? &Mips::GPR64RegClass
- : &Mips::GPR32RegClass;
+ ? Mips::GPR64RegClass
+ : Mips::GPR32RegClass;
- EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(RC->getSize(),
- RC->getAlignment(), false);
+ EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC),
+ TRI.getSpillAlignment(RC), false);
}
}
@@ -70,11 +70,12 @@ void MipsFunctionInfo::createISRRegFI() {
// The current implementation only supports Mips32r2+ not Mips64rX. Status
  // is always 32 bits, ErrorPC is 32 or 64 bits depending on the architecture;
  // however, Mips32r2+ is the supported architecture.
- const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+ const TargetRegisterClass &RC = Mips::GPR32RegClass;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
for (int I = 0; I < 2; ++I)
ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject(
- RC->getSize(), RC->getAlignment(), false);
+ TRI.getSpillSize(RC), TRI.getSpillAlignment(RC), false);
}
bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
@@ -94,11 +95,12 @@ MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) {
}
int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
if (MoveF64ViaSpillFI == -1) {
MoveF64ViaSpillFI = MF.getFrameInfo().CreateStackObject(
- RC->getSize(), RC->getAlignment(), false);
+ TRI.getSpillSize(*RC), TRI.getSpillAlignment(*RC), false);
}
return MoveF64ViaSpillFI;
}
-void MipsFunctionInfo::anchor() { }
+void MipsFunctionInfo::anchor() {}
diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
index c9e5fdd..553a667 100644
--- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
+++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h
@@ -1,4 +1,4 @@
-//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//===- MipsMachineFunctionInfo.h - Private data used for Mips ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,12 +15,8 @@
#define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
#include "Mips16HardFloatInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetMachine.h"
#include <map>
namespace llvm {
@@ -29,12 +25,9 @@ namespace llvm {
/// Mips target-specific information for each MachineFunction.
class MipsFunctionInfo : public MachineFunctionInfo {
public:
- MipsFunctionInfo(MachineFunction &MF)
- : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0),
- CallsEhReturn(false), IsISR(false), SaveS2(false),
- MoveF64ViaSpillFI(-1) {}
+ MipsFunctionInfo(MachineFunction &MF) : MF(MF) {}
- ~MipsFunctionInfo();
+ ~MipsFunctionInfo() override;
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
@@ -81,25 +74,26 @@ public:
int getMoveF64ViaSpillFI(const TargetRegisterClass *RC);
- std::map<const char *, const llvm::Mips16HardFloatInfo::FuncSignature *>
+ std::map<const char *, const Mips16HardFloatInfo::FuncSignature *>
StubsNeeded;
private:
virtual void anchor();
MachineFunction& MF;
+
/// SRetReturnReg - Some subtargets require that sret lowering includes
/// returning the value of the returned struct in a register. This field
/// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg;
+ unsigned SRetReturnReg = 0;
/// GlobalBaseReg - keeps track of the virtual register initialized for
/// use as the global base register. This is used for PIC in some PIC
/// relocation models.
- unsigned GlobalBaseReg;
+ unsigned GlobalBaseReg = 0;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
+ int VarArgsFrameIndex = 0;
/// True if function has a byval argument.
bool HasByvalArg;
@@ -108,25 +102,25 @@ private:
unsigned IncomingArgSize;
/// CallsEhReturn - Whether the function calls llvm.eh.return.
- bool CallsEhReturn;
+ bool CallsEhReturn = false;
/// Frame objects for spilling eh data registers.
int EhDataRegFI[4];
/// ISR - Whether the function is an Interrupt Service Routine.
- bool IsISR;
+ bool IsISR = false;
/// Frame objects for spilling C0_STATUS, C0_EPC
int ISRDataRegFI[2];
// saveS2
- bool SaveS2;
+ bool SaveS2 = false;
/// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the
/// O32 FPXX ABI is enabled. -1 is used to denote invalid index.
- int MoveF64ViaSpillFI;
+ int MoveF64ViaSpillFI = -1;
};
-} // end of namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
index cf85eb3..ceacaa4 100644
--- a/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -10,6 +10,7 @@
#include "Mips.h"
#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -22,18 +23,19 @@ namespace {
public:
static char ID;
- explicit MipsModuleDAGToDAGISel(MipsTargetMachine &TM_)
- : MachineFunctionPass(ID), TM(TM_) {}
+ MipsModuleDAGToDAGISel() : MachineFunctionPass(ID) {}
// Pass Name
StringRef getPassName() const override {
return "MIPS DAG->DAG Pattern Instruction Selection";
}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
- protected:
- MipsTargetMachine &TM;
+ bool runOnMachineFunction(MachineFunction &MF) override;
};
char MipsModuleDAGToDAGISel::ID = 0;
@@ -41,10 +43,12 @@ namespace {
bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
DEBUG(errs() << "In MipsModuleDAGToDAGISel::runMachineFunction\n");
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<MipsTargetMachine>();
TM.resetSubtarget(&MF);
return false;
}
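+// Fetching the TargetMachine through the TargetPassConfig analysis is what
+// allows the argument-free constructor above; the same getAnalysis pattern
+// works for any MachineFunctionPass that no longer stores its TM.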
-llvm::FunctionPass *llvm::createMipsModuleISelDagPass(MipsTargetMachine &TM) {
- return new MipsModuleDAGToDAGISel(TM);
+llvm::FunctionPass *llvm::createMipsModuleISelDagPass() {
+ return new MipsModuleDAGToDAGISel();
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
index f33857f..79c8395 100644
--- a/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "Mips.h"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips.h"
#include "MipsMachineFunction.h"
#include "MipsTargetMachine.h"
#include "llvm/ADT/ScopedHashTable.h"
@@ -59,7 +59,7 @@ private:
class OptimizePICCall : public MachineFunctionPass {
public:
- OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {}
+ OptimizePICCall() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "Mips OptimizePICCall"; }
@@ -116,9 +116,10 @@ static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) {
/// Return type of register Reg.
static MVT::SimpleValueType getRegTy(unsigned Reg, MachineFunction &MF) {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
- assert(RC->vt_end() - RC->vt_begin() == 1);
- return *RC->vt_begin();
+ assert(TRI.legalclasstypes_end(*RC) - TRI.legalclasstypes_begin(*RC) == 1);
+ return *TRI.legalclasstypes_begin(*RC);
}
/// Do the following transformation:
@@ -256,7 +257,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
// Get the instruction that loads the function address from the GOT.
Reg = MO->getReg();
- Val = (Value*)nullptr;
+ Val = nullptr;
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
MachineInstr *DefMI = MRI.getVRegDef(Reg);
@@ -296,6 +297,6 @@ void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) {
}
/// Return an OptimizeCall object.
-FunctionPass *llvm::createMipsOptimizePICCallPass(MipsTargetMachine &TM) {
- return new OptimizePICCall(TM);
+FunctionPass *llvm::createMipsOptimizePICCallPass() {
+ return new OptimizePICCall();
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
index 23f0b70..4708784 100644
--- a/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
+++ b/contrib/llvm/lib/Target/Mips/MipsOptionRecord.h
@@ -1,4 +1,4 @@
-//===-- MipsOptionRecord.h - Abstraction for storing information ----------===//
+//===- MipsOptionRecord.h - Abstraction for storing information -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -23,14 +23,16 @@
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include <cstdint>
namespace llvm {
+
class MipsELFStreamer;
-class MCSubtargetInfo;
class MipsOptionRecord {
public:
- virtual ~MipsOptionRecord(){};
+ virtual ~MipsOptionRecord() = default;
+
virtual void EmitMipsOptionRecord() = 0;
};
@@ -53,7 +55,8 @@ public:
COP2RegClass = &(TRI->getRegClass(Mips::COP2RegClassID));
COP3RegClass = &(TRI->getRegClass(Mips::COP3RegClassID));
}
- ~MipsRegInfoRecord() override {}
+
+ ~MipsRegInfoRecord() override = default;
void EmitMipsOptionRecord() override;
void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo);
@@ -74,5 +77,7 @@ private:
uint32_t ri_cprmask[4];
int64_t ri_gp_value;
};
-} // namespace llvm
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
index 51ac562..7ee45c2 100644
--- a/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsOs16.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/Instructions.h"
#include "Mips.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -57,7 +57,7 @@ static bool needsFPFromSig(Function &F) {
;
}
  if (F.arg_size() >= 1) {
- Argument &Arg = F.getArgumentList().front();
+ Argument &Arg = *F.arg_begin();
switch (Arg.getType()->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
@@ -155,6 +155,4 @@ bool MipsOs16::runOnModule(Module &M) {
return modified;
}
-ModulePass *llvm::createMipsOs16Pass(MipsTargetMachine &TM) {
- return new MipsOs16;
-}
+ModulePass *llvm::createMipsOs16Pass() { return new MipsOs16(); }
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
index 65be350..de3389b 100644
--- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -286,7 +286,9 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
<< "spOffset : " << spOffset << "\n"
- << "stackSize : " << stackSize << "\n");
+ << "stackSize : " << stackSize << "\n"
+ << "alignment : "
+ << MF.getFrameInfo().getObjectAlignment(FrameIndex) << "\n");
eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
index 8c82239..08fb3d7 100644
--- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
+++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.td
@@ -290,6 +290,25 @@ class GPR32Class<list<ValueType> regTypes> :
K0, K1, GP, SP, FP, RA)>;
def GPR32 : GPR32Class<[i32]>;
+
+def GPR32ZERO : RegisterClass<"Mips", [i32], 32, (add
+ // Reserved
+ ZERO)>;
+
+def GPR32NONZERO : RegisterClass<"Mips", [i32], 32, (add
+ // Reserved
+ AT,
+ // Return Values and Arguments
+ V0, V1, A0, A1, A2, A3,
+ // Not preserved across procedure calls
+ T0, T1, T2, T3, T4, T5, T6, T7,
+ // Callee save
+ S0, S1, S2, S3, S4, S5, S6, S7,
+ // Not preserved across procedure calls
+ T8, T9,
+ // Reserved
+ K0, K1, GP, SP, FP, RA)>;
+
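+// Splitting $zero out into its own class lets the assembler pick, purely by
+// operand class, between the raw SDIV/UDIV instruction (first operand $zero)
+// and the mflo-expanding SDivMacro/UDivMacro (first operand non-zero); see
+// the "div"/"divu" aliases in MipsInstrInfo.td.
+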
def DSPR : GPR32Class<[v4i8, v2i16]>;
def GPRMM16 : RegisterClass<"Mips", [i32], 32, (add
@@ -317,7 +336,7 @@ def GPRMM16MoveP : RegisterClass<"Mips", [i32], 32, (add
S0, S2, S3, S4)>;
def GPR64 : RegisterClass<"Mips", [i64], 64, (add
-// Reserved
+ // Reserved
ZERO_64, AT_64,
// Return Values and Arguments
V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
@@ -479,6 +498,16 @@ def GPR64AsmOperand : MipsAsmRegOperand {
let PredicateMethod = "isGPRAsmReg";
}
+def GPR32ZeroAsmOperand : MipsAsmRegOperand {
+ let Name = "GPR32ZeroAsmReg";
+ let PredicateMethod = "isGPRZeroAsmReg";
+}
+
+def GPR32NonZeroAsmOperand : MipsAsmRegOperand {
+ let Name = "GPR32NonZeroAsmReg";
+ let PredicateMethod = "isGPRNonZeroAsmReg";
+}
+
def GPR32AsmOperand : MipsAsmRegOperand {
let Name = "GPR32AsmReg";
let PredicateMethod = "isGPRAsmReg";
@@ -523,16 +552,31 @@ def AFGR64AsmOperand : MipsAsmRegOperand {
let PredicateMethod = "isFGRAsmReg";
}
+def StrictlyAFGR64AsmOperand : MipsAsmRegOperand {
+ let Name = "StrictlyAFGR64AsmReg";
+ let PredicateMethod = "isStrictlyFGRAsmReg";
+}
+
def FGR64AsmOperand : MipsAsmRegOperand {
let Name = "FGR64AsmReg";
let PredicateMethod = "isFGRAsmReg";
}
+def StrictlyFGR64AsmOperand : MipsAsmRegOperand {
+ let Name = "StrictlyFGR64AsmReg";
+ let PredicateMethod = "isStrictlyFGRAsmReg";
+}
+
def FGR32AsmOperand : MipsAsmRegOperand {
let Name = "FGR32AsmReg";
let PredicateMethod = "isFGRAsmReg";
}
+def StrictlyFGR32AsmOperand : MipsAsmRegOperand {
+ let Name = "StrictlyFGR32AsmReg";
+ let PredicateMethod = "isStrictlyFGRAsmReg";
+}
+
def FGRH32AsmOperand : MipsAsmRegOperand {
let Name = "FGRH32AsmReg";
let PredicateMethod = "isFGRAsmReg";
@@ -550,6 +594,14 @@ def MSACtrlAsmOperand : MipsAsmRegOperand {
let Name = "MSACtrlAsmReg";
}
+def GPR32ZeroOpnd : RegisterOperand<GPR32ZERO> {
+ let ParserMatchClass = GPR32ZeroAsmOperand;
+}
+
+def GPR32NonZeroOpnd : RegisterOperand<GPR32NONZERO> {
+ let ParserMatchClass = GPR32NonZeroAsmOperand;
+}
+
def GPR32Opnd : RegisterOperand<GPR32> {
let ParserMatchClass = GPR32AsmOperand;
}
@@ -602,14 +654,26 @@ def AFGR64Opnd : RegisterOperand<AFGR64> {
let ParserMatchClass = AFGR64AsmOperand;
}
+def StrictlyAFGR64Opnd : RegisterOperand<AFGR64> {
+ let ParserMatchClass = StrictlyAFGR64AsmOperand;
+}
+
def FGR64Opnd : RegisterOperand<FGR64> {
let ParserMatchClass = FGR64AsmOperand;
}
+def StrictlyFGR64Opnd : RegisterOperand<FGR64> {
+ let ParserMatchClass = StrictlyFGR64AsmOperand;
+}
+
def FGR32Opnd : RegisterOperand<FGR32> {
let ParserMatchClass = FGR32AsmOperand;
}
+def StrictlyFGR32Opnd : RegisterOperand<FGR32> {
+ let ParserMatchClass = StrictlyFGR32AsmOperand;
+}
+
def FGRCCOpnd : RegisterOperand<FGRCC> {
// The assembler doesn't use register classes so we can re-use
// FGR32AsmOperand.
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index 4996d07..102ebb2 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -12,26 +12,41 @@
//===----------------------------------------------------------------------===//
#include "MipsSEFrameLowering.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
#include "MipsSEInstrInfo.h"
#include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
using namespace llvm;
-namespace {
-typedef MachineBasicBlock::iterator Iter;
-
static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
if (Mips::ACC64RegClass.contains(Src))
return std::make_pair((unsigned)Mips::PseudoMFHI,
@@ -47,6 +62,8 @@ static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
return std::make_pair(0, 0);
}
+namespace {
+
/// Helper class to expand pseudos.
class ExpandPseudo {
public:
@@ -54,6 +71,8 @@ public:
bool expand();
private:
+ typedef MachineBasicBlock::iterator Iter;
+
bool expandInstr(MachineBasicBlock &MBB, Iter I);
void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
@@ -74,7 +93,8 @@ private:
const MipsSEInstrInfo &TII;
const MipsRegisterInfo &RegInfo;
};
-}
+
+} // end anonymous namespace
ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
: MF(MF_), MRI(MF.getRegInfo()),
@@ -240,7 +260,8 @@ bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
// copy dst_hi, $vr1
unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
- unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
+ const TargetRegisterClass *DstRC = RegInfo.getMinimalPhysRegClass(Dst);
+ unsigned VRegSize = RegInfo.getRegSizeInBits(*DstRC) / 16;
const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
unsigned VR0 = MRI.createVirtualRegister(RC);
unsigned VR1 = MRI.createVirtualRegister(RC);
@@ -419,7 +440,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
- if (CSI.size()) {
+ if (!CSI.empty()) {
// Find the instruction past the last instruction that saves a callee-saved
// register to the stack.
for (unsigned i = 0; i < CSI.size(); ++i)
@@ -471,7 +492,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
} else {
// Reg is either in GPR32 or FGR32.
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, MRI->getDwarfRegNum(Reg, 1), Offset));
+ nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -534,7 +555,6 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
void MipsSEFrameLowering::emitInterruptPrologueStub(
MachineFunction &MF, MachineBasicBlock &MBB) const {
-
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -722,7 +742,6 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
void MipsSEFrameLowering::emitInterruptEpilogueStub(
MachineFunction &MF, MachineBasicBlock &MBB) const {
-
MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -820,7 +839,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
bool
MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
-
// Reserve call frame if the size of the maximum call frame fits into 16-bit
// immediate field and there are no variable sized objects on the stack.
// Make sure the second register scavenger spill slot can be accessed with one
@@ -841,6 +859,7 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
MipsABIInfo ABI = STI.getABI();
unsigned FP = ABI.GetFramePtr();
@@ -866,10 +885,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (ExpandPseudo(MF).expand()) {
// The spill slot should be half the size of the accumulator. If target is
  // mips64, it should be 64-bit, otherwise it should be 32-bit.
- const TargetRegisterClass *RC = STI.hasMips64() ?
- &Mips::GPR64RegClass : &Mips::GPR32RegClass;
- int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(),
- RC->getAlignment(), false);
+ const TargetRegisterClass &RC = STI.hasMips64() ?
+ Mips::GPR64RegClass : Mips::GPR32RegClass;
+ int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC),
+ TRI->getSpillAlignment(RC),
+ false);
RS->addScavengingFrameIndex(FI);
}
@@ -880,10 +900,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (isInt<16>(MaxSPOffset))
return;
- const TargetRegisterClass *RC =
- ABI.ArePtrs64bit() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
- int FI = MF.getFrameInfo().CreateStackObject(RC->getSize(),
- RC->getAlignment(), false);
+ const TargetRegisterClass &RC =
+ ABI.ArePtrs64bit() ? Mips::GPR64RegClass : Mips::GPR32RegClass;
+ int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC),
+ TRI->getSpillAlignment(RC),
+ false);
RS->addScavengingFrameIndex(FI);
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
index 63cd3ce..bf30deb 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -1,4 +1,4 @@
-//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//===- MipsSEFrameLowering.h - Mips32/64 frame lowering ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,6 +15,8 @@
#define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
#include "MipsFrameLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <vector>
namespace llvm {
@@ -47,6 +49,7 @@ private:
void emitInterruptPrologueStub(MachineFunction &MF,
MachineBasicBlock &MBB) const;
};
-} // End llvm namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 92d3c00..4be26dd 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -24,11 +24,11 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -97,11 +97,13 @@ bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
// Check if MI is "addiu $dst, $zero, 0" or "daddiu $dst, $zero, 0".
if ((MI.getOpcode() == Mips::ADDiu) &&
(MI.getOperand(1).getReg() == Mips::ZERO) &&
+ (MI.getOperand(2).isImm()) &&
(MI.getOperand(2).getImm() == 0)) {
DstReg = MI.getOperand(0).getReg();
ZeroReg = Mips::ZERO;
} else if ((MI.getOpcode() == Mips::DADDiu) &&
(MI.getOperand(1).getReg() == Mips::ZERO_64) &&
+ (MI.getOperand(2).isImm()) &&
(MI.getOperand(2).getImm() == 0)) {
DstReg = MI.getOperand(0).getReg();
ZeroReg = Mips::ZERO_64;
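+    // The isImm() guards are needed because operand 2 is not always an
+    // immediate: MipsLo materialization emits "addiu $dst, $zero, %lo(sym)",
+    // whose second operand is a symbol, and getImm() on it would assert.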
@@ -243,46 +245,64 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
}
}
-void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
- SDValue CmpLHS, const SDLoc &DL,
- SDNode *Node) const {
- unsigned Opc = InFlag.getOpcode(); (void)Opc;
-
- assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
- (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&
- "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");
-
- unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu;
- if (Subtarget->isGP64bit()) {
- SLTuOp = Mips::SLTu64;
- ADDuOp = Mips::DADDu;
- }
-
- SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
+void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const {
+ SDValue InFlag = Node->getOperand(2);
+ unsigned Opc = InFlag.getOpcode();
SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
- SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops);
-
- if (Subtarget->isGP64bit()) {
- // On 64-bit targets, sltu produces an i64 but our backend currently says
- // that SLTu64 produces an i32. We need to fix this in the long run but for
- // now, just make the DAG type-correct by asserting the upper bits are zero.
- Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT,
- CurDAG->getTargetConstant(0, DL, VT),
- SDValue(Carry, 0),
- CurDAG->getTargetConstant(Mips::sub_32, DL,
- VT));
+ // In the base case, we can rely on the carry bit from the addsc
+ // instruction.
+ if (Opc == ISD::ADDC) {
+ SDValue Ops[3] = {LHS, RHS, InFlag};
+ CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops);
+ return;
}
- // Generate a second addition only if we know that RHS is not a
- // constant-zero node.
- SDNode *AddCarry = Carry;
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
- if (!C || C->getZExtValue())
- AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
+ assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!");
+
+
+  const TargetMachine &TM = F.getTarget();
+ // The addwc instruction does not write to the carry bit, instead it writes
+ // to bit 20 of the dsp control register. To match this series of nodes, each
+ // intermediate adde node must be expanded to write the carry bit before the
+ // addition.
+
+ // Start by reading the overflow field for addsc and moving the value to the
+ // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP
+ // corresponds to reading/writing the entire control register to/from a GPR.
+
+ SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32);
+
+ SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32);
+
+ SDNode *DSPCtrlField =
+ CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag);
+
+ SDNode *Carry = CurDAG->getMachineNode(
+ Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne);
- CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
+ SDValue Ops[4] = {SDValue(DSPCtrlField, 0),
+ CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne,
+ SDValue(Carry, 0)};
+ SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops);
+
+  // My reading of the MIPS DSP 3.01 specification isn't as clear as I
+ // would like about whether bit 20 always gets overwritten by addwc.
+ // Hence take an extremely conservative view and presume it's sticky. We
+ // therefore need to clear it.
+
+ SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)};
+ SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps);
+
+ SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue,
+ SDValue(DSPCtrlFinal, 0), CstOne);
+
+ SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)};
+ CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands);
}
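+
+// For reference, the ISD::ADDE expansion above emits, conceptually:
+//   RDDSP  - read the DSP control register
+//   EXT    - extract the ouflag bit (20) that addsc set
+//   INS    - transfer that value into the carry field
+//   INS    - clear the possibly-sticky ouflag bit using $zero
+//   WRDSP  - write the control register back
+//   ADDWC  - perform the add, consuming the carry bit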
/// Match frameindex
@@ -690,7 +710,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
// as the original value.
if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
- Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation() - 1, SDLoc(N),
EltTy);
return true;
}
@@ -722,7 +742,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
// Extract the run of set bits starting with bit zero, and test that the
// result is the same as the original value
if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
- Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+ Imm = CurDAG->getTargetConstant(ImmValue.countPopulation() - 1, SDLoc(N),
EltTy);
return true;
}
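+
+  // Worked example (illustrative): a splat of 0x000000ff is a run of eight
+  // set bits from bit zero, so countPopulation() == 8 and the encoded
+  // immediate is 7; the mask operand encodes the index of the most
+  // significant covered bit, hence the "- 1".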
@@ -763,19 +783,8 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
switch(Opcode) {
default: break;
- case ISD::SUBE: {
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
- selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
- return true;
- }
-
case ISD::ADDE: {
- if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
- break;
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ? Mips::DADDu : Mips::ADDu;
- selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node);
+ selectAddE(Node, DL);
return true;
}
@@ -932,6 +941,9 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
  // same set of registers. Similarly, ldi.h isn't capable of producing {
// 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+ const MipsABIInfo &ABI =
+ static_cast<const MipsTargetMachine &>(TM).getABI();
+
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
@@ -969,13 +981,233 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
break;
}
- if (!SplatValue.isSignedIntN(10))
- return false;
+ SDNode *Res;
- SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
- ViaVecTy.getVectorElementType());
-
- SDNode *Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+ // If we have a signed 10 bit integer, we can splat it directly.
+ //
+ // If we have something bigger we can synthesize the value into a GPR and
+ // splat from there.
+ if (SplatValue.isSignedIntN(10)) {
+ SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
+ ViaVecTy.getVectorElementType());
+
+ Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+ } else if (SplatValue.isSignedIntN(16) &&
+ ((ABI.IsO32() && SplatBitSize < 64) ||
+ (ABI.IsN32() || ABI.IsN64()))) {
+ // Only handle signed 16 bit values when the element size is GPR width.
+ // MIPS64 can handle all the cases but MIPS32 would need to handle
+ // negative cases specifically here. Instead, handle those cases as
+ // 64bit values.
+
+ bool Is32BitSplat = ABI.IsO32() || SplatBitSize < 64;
+ const unsigned ADDiuOp = Is32BitSplat ? Mips::ADDiu : Mips::DADDiu;
+ const MVT SplatMVT = Is32BitSplat ? MVT::i32 : MVT::i64;
+ SDValue ZeroVal = CurDAG->getRegister(
+ Is32BitSplat ? Mips::ZERO : Mips::ZERO_64, SplatMVT);
+
+ const unsigned FILLOp =
+ SplatBitSize == 16
+ ? Mips::FILL_H
+ : (SplatBitSize == 32 ? Mips::FILL_W
+ : (SplatBitSize == 64 ? Mips::FILL_D : 0));
+
+ assert(FILLOp != 0 && "Unknown FILL Op for splat synthesis!");
+ assert((!ABI.IsO32() || (FILLOp != Mips::FILL_D)) &&
+ "Attempting to use fill.d on MIPS32!");
+
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, SplatMVT);
+
+ Res = CurDAG->getMachineNode(ADDiuOp, DL, SplatMVT, ZeroVal, LoVal);
+ Res = CurDAG->getMachineNode(FILLOp, DL, ViaVecTy, SDValue(Res, 0));
+
+ } else if (SplatValue.isSignedIntN(32) && SplatBitSize == 32) {
+ // Only handle the cases where the splat size agrees with the size
+ // of the SplatValue here.
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+ SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+ SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+
+ if (Hi)
+ Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+ if (Lo)
+ Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+ assert((Hi || Lo) && "Zero case reached 32 bit case splat synthesis!");
+ Res = CurDAG->getMachineNode(Mips::FILL_W, DL, MVT::v4i32, SDValue(Res, 0));
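+      // e.g. a v4i32 splat of 0x00012345 becomes, as a sketch:
+      //   lui $r, 0x1;  ori $r, $r, 0x2345;  fill.w $wd, $r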
+
+ } else if (SplatValue.isSignedIntN(32) && SplatBitSize == 64 &&
+ (ABI.IsN32() || ABI.IsN64())) {
+ // N32 and N64 can perform some tricks that O32 can't for signed 32 bit
+ // integers due to having 64bit registers. lui will cause the necessary
+ // zero/sign extension.
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+ SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+ SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+
+ if (Hi)
+ Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+ if (Lo)
+ Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+ Res = CurDAG->getMachineNode(
+ Mips::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(((Hi >> 15) & 0x1), DL, MVT::i64),
+ SDValue(Res, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+ Res =
+ CurDAG->getMachineNode(Mips::FILL_D, DL, MVT::v2i64, SDValue(Res, 0));
+
+ } else if (SplatValue.isSignedIntN(64)) {
+ // If we have a 64 bit Splat value, we perform a similar sequence to the
+ // above:
+ //
+ // MIPS32: MIPS64:
+ // lui $res, %highest(val) lui $res, %highest(val)
+ // ori $res, $res, %higher(val) ori $res, $res, %higher(val)
+ // lui $res2, %hi(val) lui $res2, %hi(val)
+ // ori $res2, %res2, %lo(val) ori $res2, %res2, %lo(val)
+ // $res3 = fill $res2 dinsu $res, $res2, 0, 32
+ // $res4 = insert.w $res3[1], $res fill.d $res
+ // splat.d $res4, 0
+ //
+ // The ability to use dinsu is guaranteed as MSA requires MIPSR5. This saves
+ // having to materialize the value by shifts and ors.
+ //
+ // FIXME: Implement the preferred sequence for MIPS64R6:
+ //
+ // MIPS64R6:
+ // ori $res, $zero, %lo(val)
+ // daui $res, $res, %hi(val)
+ // dahi $res, $res, %higher(val)
+  //   dati $res, $res, %highest(val)
+ // fill.d $res
+ //
+
+ const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+ const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+ const unsigned Higher = SplatValue.lshr(32).getLoBits(16).getZExtValue();
+ const unsigned Highest = SplatValue.lshr(48).getLoBits(16).getZExtValue();
+
+ SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+ SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+ SDValue HigherVal = CurDAG->getTargetConstant(Higher, DL, MVT::i32);
+ SDValue HighestVal = CurDAG->getTargetConstant(Highest, DL, MVT::i32);
+ SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ // Independent of whether we're targeting MIPS64 or not, the basic
+ // operations are the same. Also, directly use the $zero register if
+ // the 16 bit chunk is zero.
+ //
+ // For optimization purposes we always synthesize the splat value as
+    // an i32 value, then if we're targeting MIPS64, use SUBREG_TO_REG
+ // just before combining the values with dinsu to produce an i64. This
+ // enables SelectionDAG to aggressively share components of splat values
+ // where possible.
+ //
+ // FIXME: This is the general constant synthesis problem. This code
+ // should be factored out into a class shared between all the
+ // classes that need it. Specifically, for a splat size of 64
+ // bits that's a negative number we can do better than LUi/ORi
+ // for the upper 32bits.
+
+ if (Hi)
+ Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+ if (Lo)
+ Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+ SDNode *HiRes;
+ if (Highest)
+ HiRes = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HighestVal);
+
+ if (Higher)
+ HiRes = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+ Highest ? SDValue(HiRes, 0) : ZeroVal,
+ HigherVal);
+
+ if (ABI.IsO32()) {
+ Res = CurDAG->getMachineNode(Mips::FILL_W, DL, MVT::v4i32,
+ (Hi || Lo) ? SDValue(Res, 0) : ZeroVal);
+
+ Res = CurDAG->getMachineNode(
+ Mips::INSERT_W, DL, MVT::v4i32, SDValue(Res, 0),
+ (Highest || Higher) ? SDValue(HiRes, 0) : ZeroVal,
+ CurDAG->getTargetConstant(1, DL, MVT::i32));
+
+ const TargetLowering *TLI = getTargetLowering();
+ const TargetRegisterClass *RC =
+ TLI->getRegClassFor(ViaVecTy.getSimpleVT());
+
+ Res = CurDAG->getMachineNode(
+ Mips::COPY_TO_REGCLASS, DL, ViaVecTy, SDValue(Res, 0),
+ CurDAG->getTargetConstant(RC->getID(), DL, MVT::i32));
+
+ Res = CurDAG->getMachineNode(
+ Mips::SPLATI_D, DL, MVT::v2i64, SDValue(Res, 0),
+ CurDAG->getTargetConstant(0, DL, MVT::i32));
+ } else if (ABI.IsN64() || ABI.IsN32()) {
+
+ SDValue Zero64Val = CurDAG->getRegister(Mips::ZERO_64, MVT::i64);
+ const bool HiResNonZero = Highest || Higher;
+ const bool ResNonZero = Hi || Lo;
+
+ if (HiResNonZero)
+ HiRes = CurDAG->getMachineNode(
+ Mips::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(((Highest >> 15) & 0x1), DL, MVT::i64),
+ SDValue(HiRes, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+ if (ResNonZero)
+ Res = CurDAG->getMachineNode(
+ Mips::SUBREG_TO_REG, DL, MVT::i64,
+ CurDAG->getTargetConstant(((Hi >> 15) & 0x1), DL, MVT::i64),
+ SDValue(Res, 0),
+ CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+ // We have 3 cases:
+ // The HiRes is nonzero but Res is $zero => dsll32 HiRes, 0
+ // The Res is nonzero but HiRes is $zero => dinsu Res, $zero, 32, 32
+ // Both are non zero => dinsu Res, HiRes, 32, 32
+ //
+ // The obvious "missing" case is when both are zero, but that case is
+ // handled by the ldi case.
+ if (ResNonZero) {
+ SDValue Ops[4] = {HiResNonZero ? SDValue(HiRes, 0) : Zero64Val,
+ CurDAG->getTargetConstant(64, DL, MVT::i32),
+ CurDAG->getTargetConstant(32, DL, MVT::i32),
+ SDValue(Res, 0)};
+
+ Res = CurDAG->getMachineNode(Mips::DINSU, DL, MVT::i64, Ops);
+ } else if (HiResNonZero) {
+ Res = CurDAG->getMachineNode(
+ Mips::DSLL32, DL, MVT::i64, SDValue(HiRes, 0),
+ CurDAG->getTargetConstant(0, DL, MVT::i32));
+ } else
+ llvm_unreachable(
+ "Zero splat value handled by non-zero 64bit splat synthesis!");
+
+ Res = CurDAG->getMachineNode(Mips::FILL_D, DL, MVT::v2i64, SDValue(Res, 0));
+ } else
+ llvm_unreachable("Unknown ABI in MipsISelDAGToDAG!");
+
+ } else
+ return false;
if (ResVecTy != ViaVecTy) {
// If LdiOp is writing to a different register class to ResVecTy, then
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
index f89a350..6f38289 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -41,8 +41,7 @@ private:
const SDLoc &dl, EVT Ty, bool HasLo,
bool HasHi);
- void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
- const SDLoc &DL, SDNode *Node) const;
+ void selectAddE(SDNode *Node, const SDLoc &DL) const;
bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const;
bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index f28e8b3..72b2738 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -179,8 +179,6 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::STORE, MVT::i32, Custom);
- setTargetDAGCombine(ISD::ADDE);
- setTargetDAGCombine(ISD::SUBE);
setTargetDAGCombine(ISD::MUL);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -421,163 +419,6 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
return MipsTargetLowering::LowerOperation(Op, DAG);
}
-// selectMADD -
-// Transforms a subgraph in CurDAG if the following pattern is found:
-// (addc multLo, Lo0), (adde multHi, Hi0),
-// where,
-// multHi/Lo: product of multiplication
-// Lo0: initial value of Lo register
-// Hi0: initial value of Hi register
-// Return true if pattern matching was successful.
-static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
- // ADDENode's second operand must be a flag output of an ADDC node in order
- // for the matching to be successful.
- SDNode *ADDCNode = ADDENode->getOperand(2).getNode();
-
- if (ADDCNode->getOpcode() != ISD::ADDC)
- return false;
-
- SDValue MultHi = ADDENode->getOperand(0);
- SDValue MultLo = ADDCNode->getOperand(0);
- SDNode *MultNode = MultHi.getNode();
- unsigned MultOpc = MultHi.getOpcode();
-
- // MultHi and MultLo must be generated by the same node,
- if (MultLo.getNode() != MultNode)
- return false;
-
- // and it must be a multiplication.
- if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
- return false;
-
- // MultLo and MultHi must be the first and second output of MultNode
- // respectively.
- if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
- return false;
-
- // Transform this to a MADD only if ADDENode and ADDCNode are the only users
- // of the values of MultNode, in which case MultNode will be removed in later
- // phases.
- // If there exist users other than ADDENode or ADDCNode, this function returns
- // here, which will result in MultNode being mapped to a single MULT
- // instruction node rather than a pair of MULT and MADD instructions being
- // produced.
- if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
- return false;
-
- SDLoc DL(ADDENode);
-
- // Initialize accumulator.
- SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
- ADDCNode->getOperand(1),
- ADDENode->getOperand(1));
-
- // create MipsMAdd(u) node
- MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MAddu : MipsISD::MAdd;
-
- SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped,
- MultNode->getOperand(0),// Factor 0
- MultNode->getOperand(1),// Factor 1
- ACCIn);
-
- // replace uses of adde and addc here
- if (!SDValue(ADDCNode, 0).use_empty()) {
- SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
- }
- if (!SDValue(ADDENode, 0).use_empty()) {
- SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
- }
-
- return true;
-}
-
-// selectMSUB -
-// Transforms a subgraph in CurDAG if the following pattern is found:
-// (addc Lo0, multLo), (sube Hi0, multHi),
-// where,
-// multHi/Lo: product of multiplication
-// Lo0: initial value of Lo register
-// Hi0: initial value of Hi register
-// Return true if pattern matching was successful.
-static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
- // SUBENode's second operand must be a flag output of an SUBC node in order
- // for the matching to be successful.
- SDNode *SUBCNode = SUBENode->getOperand(2).getNode();
-
- if (SUBCNode->getOpcode() != ISD::SUBC)
- return false;
-
- SDValue MultHi = SUBENode->getOperand(1);
- SDValue MultLo = SUBCNode->getOperand(1);
- SDNode *MultNode = MultHi.getNode();
- unsigned MultOpc = MultHi.getOpcode();
-
- // MultHi and MultLo must be generated by the same node,
- if (MultLo.getNode() != MultNode)
- return false;
-
- // and it must be a multiplication.
- if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI)
- return false;
-
- // MultLo and MultHi must be the first and second output of MultNode
- // respectively.
- if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0)
- return false;
-
- // Transform this to a MSUB only if SUBENode and SUBCNode are the only users
- // of the values of MultNode, in which case MultNode will be removed in later
- // phases.
- // If there exist users other than SUBENode or SUBCNode, this function returns
- // here, which will result in MultNode being mapped to a single MULT
- // instruction node rather than a pair of MULT and MSUB instructions being
- // produced.
- if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
- return false;
-
- SDLoc DL(SUBENode);
-
- // Initialize accumulator.
- SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
- SUBCNode->getOperand(0),
- SUBENode->getOperand(0));
-
- // create MipsSub(u) node
- MultOpc = MultOpc == ISD::UMUL_LOHI ? MipsISD::MSubu : MipsISD::MSub;
-
- SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue,
- MultNode->getOperand(0),// Factor 0
- MultNode->getOperand(1),// Factor 1
- ACCIn);
-
- // replace uses of sube and subc here
- if (!SDValue(SUBCNode, 0).use_empty()) {
- SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
- }
- if (!SDValue(SUBENode, 0).use_empty()) {
- SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
- }
-
- return true;
-}
-
-static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget &Subtarget) {
- if (DCI.isBeforeLegalize())
- return SDValue();
-
- if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() &&
- N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG))
- return SDValue(N, 0);
-
- return SDValue();
-}
-
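
For reference, the removed selectMADD/selectMSUB combines matched a
32x32->64 multiply whose product was added to, or subtracted from, an
accumulator held as a {Hi, Lo} register pair. A scalar C++ sketch of the
semantics being pattern-matched (illustrative, not part of the patch):

    #include <cstdint>

    // (addc multLo, Lo0), (adde multHi, Hi0)  ->  maddu semantics
    uint64_t maddu(uint32_t A, uint32_t B, uint32_t Hi0, uint32_t Lo0) {
      uint64_t Acc = (uint64_t(Hi0) << 32) | Lo0;
      return Acc + uint64_t(A) * uint64_t(B);
    }

    // (subc Lo0, multLo), (sube Hi0, multHi)  ->  msubu semantics
    uint64_t msubu(uint32_t A, uint32_t B, uint32_t Hi0, uint32_t Lo0) {
      uint64_t Acc = (uint64_t(Hi0) << 32) | Lo0;
      return Acc - uint64_t(A) * uint64_t(B);
    }
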
// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
//
// Performs the following transformations:
@@ -820,19 +661,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const MipsSubtarget &Subtarget) {
- if (DCI.isBeforeLegalize())
- return SDValue();
-
- if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 &&
- selectMSUB(N, &DAG))
- return SDValue(N, 0);
-
- return SDValue();
-}
-
static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT,
EVT ShiftTy, SelectionDAG &DAG) {
// Clear the upper (64 - VT.sizeInBits) bits.
@@ -1110,20 +938,17 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Val;
switch (N->getOpcode()) {
- case ISD::ADDE:
- return performADDECombine(N, DAG, DCI, Subtarget);
case ISD::AND:
Val = performANDCombine(N, DAG, DCI, Subtarget);
break;
case ISD::OR:
Val = performORCombine(N, DAG, DCI, Subtarget);
break;
- case ISD::SUBE:
- return performSUBECombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
return performMULCombine(N, DAG, DCI, this);
case ISD::SHL:
- return performSHLCombine(N, DAG, DCI, Subtarget);
+ Val = performSHLCombine(N, DAG, DCI, Subtarget);
+ break;
case ISD::SRA:
return performSRACombine(N, DAG, DCI, Subtarget);
case ISD::SRL:
@@ -1432,19 +1257,22 @@ static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
EVT ResVecTy = Op->getValueType(0);
EVT ViaVecTy = ResVecTy;
+ bool BigEndian = !DAG.getSubtarget().getTargetTriple().isLittleEndian();
SDLoc DL(Op);
// When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
// LaneB is the lower 32 bits. Otherwise LaneA and LaneB are alternating
// lanes.
- SDValue LaneA;
- SDValue LaneB = Op->getOperand(2);
+ SDValue LaneA = Op->getOperand(OpNr);
+ SDValue LaneB;
if (ResVecTy == MVT::v2i64) {
- LaneA = DAG.getConstant(0, DL, MVT::i32);
+ LaneB = DAG.getConstant(0, DL, MVT::i32);
ViaVecTy = MVT::v4i32;
+ if (BigEndian)
+ std::swap(LaneA, LaneB);
} else
- LaneA = LaneB;
+ LaneB = LaneA;
SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
@@ -1452,8 +1280,11 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
SDValue Result = DAG.getBuildVector(
ViaVecTy, DL, makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
- if (ViaVecTy != ResVecTy)
- Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+ if (ViaVecTy != ResVecTy) {
+ SDValue One = DAG.getConstant(1, DL, ViaVecTy);
+ Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy,
+ DAG.getNode(ISD::AND, DL, ViaVecTy, Result, One));
+ }
return Result;
}
@@ -1546,11 +1377,24 @@ static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm);
}
+static SDValue truncateVecElts(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT ResTy = Op->getValueType(0);
+ SDValue Vec = Op->getOperand(2);
+ bool BigEndian = !DAG.getSubtarget().getTargetTriple().isLittleEndian();
+ MVT ResEltTy = ResTy == MVT::v2i64 ? MVT::i64 : MVT::i32;
+ SDValue ConstValue = DAG.getConstant(Vec.getScalarValueSizeInBits() - 1,
+ DL, ResEltTy);
+ SDValue SplatVec = getBuildVectorSplat(ResTy, ConstValue, BigEndian, DAG);
+
+ return DAG.getNode(ISD::AND, DL, ResTy, Vec, SplatVec);
+}
+
static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
EVT ResTy = Op->getValueType(0);
SDLoc DL(Op);
SDValue One = DAG.getConstant(1, DL, ResTy);
- SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2));
+ SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, truncateVecElts(Op, DAG));
return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1),
DAG.getNOT(DL, Bit, ResTy));
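
The truncateVecElts helper added above masks each element of the
shift-amount vector with (element size - 1), matching the MSA behaviour
where only the low log2(#bits) bits of a shift amount are significant.
A scalar analogue, assuming 32-bit elements (illustrative C++):

    #include <cstdint>

    uint32_t msaShiftLeft32(uint32_t V, uint32_t Amt) {
      return V << (Amt & 31); // only the low 5 bits of Amt participate
    }
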
@@ -1643,7 +1487,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
report_fatal_error("Immediate out of range");
APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
- Op->getConstantOperandVal(3));
+ Op->getConstantOperandVal(3) + 1);
return DAG.getNode(ISD::VSELECT, DL, VecTy,
DAG.getConstant(Mask, DL, VecTy, true),
Op->getOperand(2), Op->getOperand(1));
@@ -1658,7 +1502,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
report_fatal_error("Immediate out of range");
APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
- Op->getConstantOperandVal(3));
+ Op->getConstantOperandVal(3) + 1);
return DAG.getNode(ISD::VSELECT, DL, VecTy,
DAG.getConstant(Mask, DL, VecTy, true),
Op->getOperand(2), Op->getOperand(1));
@@ -1686,7 +1530,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
DAG.getNode(ISD::SHL, DL, VecTy, One,
- Op->getOperand(2)));
+ truncateVecElts(Op, DAG)));
}
case Intrinsic::mips_bnegi_b:
case Intrinsic::mips_bnegi_h:
@@ -1722,7 +1566,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
DAG.getNode(ISD::SHL, DL, VecTy, One,
- Op->getOperand(2)));
+ truncateVecElts(Op, DAG)));
}
case Intrinsic::mips_bseti_b:
case Intrinsic::mips_bseti_h:
@@ -2209,7 +2053,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_sll_w:
case Intrinsic::mips_sll_d:
return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1),
- Op->getOperand(2));
+ truncateVecElts(Op, DAG));
case Intrinsic::mips_slli_b:
case Intrinsic::mips_slli_h:
case Intrinsic::mips_slli_w:
@@ -2239,7 +2083,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_sra_w:
case Intrinsic::mips_sra_d:
return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1),
- Op->getOperand(2));
+ truncateVecElts(Op, DAG));
case Intrinsic::mips_srai_b:
case Intrinsic::mips_srai_h:
case Intrinsic::mips_srai_w:
@@ -2269,7 +2113,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::mips_srl_w:
case Intrinsic::mips_srl_d:
return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1),
- Op->getOperand(2));
+ truncateVecElts(Op, DAG));
case Intrinsic::mips_srli_b:
case Intrinsic::mips_srli_h:
case Intrinsic::mips_srli_w:
@@ -2529,11 +2373,10 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
SplatBitSize != 64)
return SDValue();
- // If the value fits into a simm10 then we can use ldi.[bhwd]
- // However, if it isn't an integer type we will have to bitcast from an
- // integer type first. Also, if there are any undefs, we must lower them
- // to defined values first.
- if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+ // If the value isn't an integer type we will have to bitcast
+ // from an integer type first. Also, if there are any undefs, we must
+ // lower them to defined values first.
+ if (ResTy.isInteger() && !HasAnyUndefs)
return Op;
EVT ViaVecTy;
@@ -3577,9 +3420,17 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI,
: (Subtarget.isABI_O32() ? &Mips::GPR32RegClass
: &Mips::GPR64RegClass);
const bool UsingMips32 = RC == &Mips::GPR32RegClass;
- unsigned Rs = RegInfo.createVirtualRegister(RC);
+ unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass);
BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0);
+ if (!UsingMips32) {
+ unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass);
+ BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp)
+ .addImm(0)
+ .addReg(Rs)
+ .addImm(Mips::sub_32);
+ Rs = Tmp;
+ }
BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::SH : Mips::SH64))
.addReg(Rs)
.addReg(Rt)
@@ -3628,7 +3479,13 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI,
MachineInstrBuilder MIB =
BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? Mips::LH : Mips::LH64), Rt);
for (unsigned i = 1; i < MI.getNumOperands(); i++)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
+
+ if (!UsingMips32) {
+ unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32);
+ Rt = Tmp;
+ }
BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt);
@@ -3697,6 +3554,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
assert(Subtarget.hasMSA() && Subtarget.hasMips32r2());
bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64;
+ bool IsFGR64onMips32 = !Subtarget.hasMips64() && IsFGR64;
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
@@ -3707,7 +3565,9 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
const TargetRegisterClass *GPRRC =
IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
- unsigned MFC1Opc = IsFGR64onMips64 ? Mips::DMFC1 : Mips::MFC1;
+ unsigned MFC1Opc = IsFGR64onMips64
+ ? Mips::DMFC1
+ : (IsFGR64onMips32 ? Mips::MFC1_D64 : Mips::MFC1);
unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W;
// Perform the register class copy as mentioned above.
@@ -3716,7 +3576,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI,
BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp);
unsigned WPHI = Wtemp;
- if (!Subtarget.hasMips64() && IsFGR64) {
+ if (IsFGR64onMips32) {
unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC);
BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs);
unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
@@ -3810,7 +3670,9 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI,
MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
const TargetRegisterClass *GPRRC =
IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
- unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : Mips::MTC1;
+ unsigned MTC1Opc = IsFGR64onMips64
+ ? Mips::DMTC1
+ : (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1);
unsigned COPYOpc = IsFGR64onMips64 ? Mips::COPY_S_D : Mips::COPY_S_W;
unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index ea703d0..ee07479 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -207,13 +207,16 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Opc = Mips::SDC1;
else if (Mips::FGR64RegClass.hasSubClassEq(RC))
Opc = Mips::SDC164;
- else if (RC->hasType(MVT::v16i8))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8))
Opc = Mips::ST_B;
- else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) ||
+ TRI->isTypeLegalForClass(*RC, MVT::v8f16))
Opc = Mips::ST_H;
- else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) ||
+ TRI->isTypeLegalForClass(*RC, MVT::v4f32))
Opc = Mips::ST_W;
- else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) ||
+ TRI->isTypeLegalForClass(*RC, MVT::v2f64))
Opc = Mips::ST_D;
else if (Mips::LO32RegClass.hasSubClassEq(RC))
Opc = Mips::SW;
@@ -280,13 +283,16 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
Opc = Mips::LDC1;
else if (Mips::FGR64RegClass.hasSubClassEq(RC))
Opc = Mips::LDC164;
- else if (RC->hasType(MVT::v16i8))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v16i8))
Opc = Mips::LD_B;
- else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v8i16) ||
+ TRI->isTypeLegalForClass(*RC, MVT::v8f16))
Opc = Mips::LD_H;
- else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v4i32) ||
+ TRI->isTypeLegalForClass(*RC, MVT::v4f32))
Opc = Mips::LD_W;
- else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+ else if (TRI->isTypeLegalForClass(*RC, MVT::v2i64) ||
+ TRI->isTypeLegalForClass(*RC, MVT::v2f64))
Opc = Mips::LD_D;
else if (Mips::HI32RegClass.hasSubClassEq(RC))
Opc = Mips::LW;
@@ -540,11 +546,20 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
+
+ MachineInstrBuilder MIB;
if (Subtarget.isGP64bit())
- BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
- .addReg(Mips::RA_64);
+ MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
+ .addReg(Mips::RA_64, RegState::Undef);
else
- BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA);
+ MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn))
+ .addReg(Mips::RA, RegState::Undef);
+
+ // Retain any imp-use flags.
+ for (auto &MO : I->operands()) {
+ if (MO.isImplicit())
+ MIB.add(MO);
+ }
}
void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB,
@@ -558,8 +573,8 @@ MipsSEInstrInfo::compareOpndSize(unsigned Opc,
const MCInstrDesc &Desc = get(Opc);
assert(Desc.NumOperands == 2 && "Unary instruction expected.");
const MipsRegisterInfo *RI = &getRegisterInfo();
- unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize();
- unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize();
+ unsigned DstRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 0, RI, MF));
+ unsigned SrcRegSize = RI->getRegSizeInBits(*getRegClass(Desc, 1, RI, MF));
return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
index c0de59b..c2947bb 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSchedule.td
+++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td
@@ -84,6 +84,7 @@ def II_DIVU : InstrItinClass;
def II_DIV_D : InstrItinClass;
def II_DIV_S : InstrItinClass;
def II_DMFC0 : InstrItinClass;
+def II_DMT : InstrItinClass;
def II_DMTC0 : InstrItinClass;
def II_DMFC1 : InstrItinClass;
def II_DMTC1 : InstrItinClass;
@@ -113,8 +114,12 @@ def II_DSBH : InstrItinClass;
def II_DSHD : InstrItinClass;
def II_DSUBU : InstrItinClass;
def II_DSUB : InstrItinClass;
+def II_DVPE : InstrItinClass;
+def II_EMT : InstrItinClass;
+def II_EVPE : InstrItinClass;
def II_EXT : InstrItinClass; // Any EXT instruction
def II_FLOOR : InstrItinClass;
+def II_FORK : InstrItinClass;
def II_INS : InstrItinClass; // Any INS instruction
def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo.
def II_J : InstrItinClass;
@@ -345,6 +350,7 @@ def II_WRPGPR : InstrItinClass;
def II_RDPGPR : InstrItinClass;
def II_DVP : InstrItinClass;
def II_EVP : InstrItinClass;
+def II_YIELD : InstrItinClass;
//===----------------------------------------------------------------------===//
// Mips Generic instruction itineraries.
@@ -386,6 +392,7 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DMOD , [InstrStage<17, [IMULDIV]>]>,
InstrItinData<II_DMODU , [InstrStage<17, [IMULDIV]>]>,
+ InstrItinData<II_DMT , [InstrStage<2, [ALU]>]>,
InstrItinData<II_DSLL , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSLL32 , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DSRL , [InstrStage<1, [ALU]>]>,
@@ -404,7 +411,11 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_DSHD , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DCLO , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_DVPE , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_EMT , [InstrStage<2, [ALU]>]>,
+ InstrItinData<II_EVPE , [InstrStage<2, [ALU]>]>,
InstrItinData<II_EXT , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_FORK , [InstrStage<1, [ALU]>]>,
InstrItinData<II_INS , [InstrStage<1, [ALU]>]>,
InstrItinData<II_LUI , [InstrStage<1, [ALU]>]>,
InstrItinData<II_MOVE , [InstrStage<1, [ALU]>]>,
@@ -670,5 +681,6 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
InstrItinData<II_WRPGPR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_RDPGPR , [InstrStage<1, [ALU]>]>,
InstrItinData<II_DVP , [InstrStage<1, [ALU]>]>,
- InstrItinData<II_EVP , [InstrStage<1, [ALU]>]>
+ InstrItinData<II_EVP , [InstrStage<1, [ALU]>]>,
+ InstrItinData<II_YIELD , [InstrStage<5, [ALU]>]>
]>;
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
index 15a0401..89cda67 100644
--- a/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleGeneric.td
@@ -187,7 +187,11 @@ def GenericIssueCOP0 : ProcResource<1> { let Super = GenericCOP0; }
def GenericWriteCOP0TLB : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 4; }
def GenericWriteCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 3; }
def GenericReadCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 2; }
-def GnereicReadWritePGPR : SchedWriteRes<[GenericIssueCOP0]>;
+def GenericReadWritePGPR : SchedWriteRes<[GenericIssueCOP0]>;
+def GenericReadWriteCOP0Long : SchedWriteRes<[GenericIssueCOP0]> {
+ let Latency = 5;
+}
+def GenericWriteCOP0Short : SchedWriteRes<[GenericIssueCOP0]>;
def : ItinRW<[GenericWriteCOP0TLB], [II_TLBP, II_TLBR, II_TLBWI, II_TLBWR]>;
def : ItinRW<[GenericWriteCOP0TLB], [II_TLBINV, II_TLBINVF]>;
@@ -261,6 +265,14 @@ def : ItinRW<[GenericWriteLoad], [II_LBE, II_LBUE, II_LHE, II_LHUE, II_LWE,
def : ItinRW<[GenericWriteLoad], [II_LWLE, II_LWRE]>;
+// MIPS MT instructions
+// ====================
+
+def : ItinRW<[GenericWriteMove], [II_DMT, II_DVPE, II_EMT, II_EVPE]>;
+
+def : ItinRW<[GenericReadWriteCOP0Long], [II_YIELD]>;
+def : ItinRW<[GenericWriteCOP0Short], [II_FORK]>;
+
// MIPS32R6 and MIPS16e
// ====================
diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
index 882a241..fedfac2 100644
--- a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
+++ b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td
@@ -19,7 +19,7 @@ def MipsP5600Model : SchedMachineModel {
HasMips64, HasMips64r2, HasCnMips,
InMicroMips, InMips16Mode,
HasMicroMips32r6, HasMicroMips64r6,
- HasDSP, HasDSPR2];
+ HasDSP, HasDSPR2, HasMT];
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
index 3e7570f..eba21e0 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
#include "Mips.h"
+#include "MipsMachineFunction.h"
#include "MipsRegisterInfo.h"
-#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -59,9 +59,8 @@ static cl::opt<bool>
void MipsSubtarget::anchor() { }
-MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
- const std::string &FS, bool little,
- const MipsTargetMachine &TM)
+MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
+ bool little, const MipsTargetMachine &TM)
: MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault),
IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false),
NoABICalls(false), IsFP64bit(false), UseOddSPReg(true),
@@ -70,15 +69,14 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false),
InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false),
HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16),
- Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM),
+ Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false),
+ HasEVA(false), DisableMadd4(false), HasMT(false), TM(TM),
TargetTriple(TT), TSInfo(),
InstrInfo(
MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))),
FrameLowering(MipsFrameLowering::create(*this)),
TLInfo(MipsTargetLowering::create(TM, *this)) {
- PreviousInMips16Mode = InMips16Mode;
-
if (MipsArchVersion == MipsDefault)
MipsArchVersion = Mips32;
@@ -117,6 +115,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU,
if (NoABICalls && TM.isPositionIndependent())
report_fatal_error("position-independent code requires '-mabicalls'");
+ if (isABI_N64() && !TM.isPositionIndependent() && !hasSym32())
+ NoABICalls = true;
+
// Set UseSmallSection.
UseSmallSection = GPOpt;
if (!NoABICalls && GPOpt) {
diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
index 38d3cee..cce3b8c 100644
--- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
+++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -78,7 +78,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// IsNan2008 - IEEE 754-2008 NaN encoding.
bool IsNaN2008bit;
- // IsFP64bit - General-purpose registers are 64 bits wide
+ // IsGP64bit - General-purpose registers are 64 bits wide
bool IsGP64bit;
// IsPTR64bit - Pointers are 64 bit wide
@@ -119,9 +119,6 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// Mips16 hard float
bool InMips16HardFloat;
- // PreviousInMips16 -- the function we just processed was in Mips 16 Mode
- bool PreviousInMips16Mode;
-
// InMicroMips -- can process MicroMips instructions
bool InMicroMipsMode;
@@ -142,8 +139,21 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// UseTCCInDIV -- Enables the use of trapping in the assembler.
bool UseTCCInDIV;
+ // Sym32 -- On Mips64 symbols are 32 bits.
+ bool HasSym32;
+
// HasEVA -- supports EVA ASE.
bool HasEVA;
+
+ // nomadd4 - disables generation of 4-operand madd.s, madd.d and
+ // related instructions.
+ bool DisableMadd4;
+
+ // HasMT -- supports MT ASE.
+ bool HasMT;
+
+ // Disable use of the `jal` instruction.
+ bool UseLongCalls = false;
InstrItineraryData InstrItins;
@@ -175,8 +185,8 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
- MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
- bool little, const MipsTargetMachine &TM);
+ MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little,
+ const MipsTargetMachine &TM);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -229,7 +239,11 @@ public:
unsigned getGPRSizeInBytes() const { return isGP64bit() ? 8 : 4; }
bool isPTR64bit() const { return IsPTR64bit; }
bool isPTR32bit() const { return !IsPTR64bit; }
+ bool hasSym32() const {
+ return (HasSym32 && isABI_N64()) || isABI_N32() || isABI_O32();
+ }
bool isSingleFloat() const { return IsSingleFloat; }
+ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool hasVFPU() const { return HasVFPU; }
bool inMips16Mode() const { return InMips16Mode; }
bool inMips16ModeDefault() const {
@@ -249,13 +263,17 @@ public:
bool hasDSPR2() const { return HasDSPR2; }
bool hasDSPR3() const { return HasDSPR3; }
bool hasMSA() const { return HasMSA; }
+ bool disableMadd4() const { return DisableMadd4; }
bool hasEVA() const { return HasEVA; }
+ bool hasMT() const { return HasMT; }
bool useSmallSection() const { return UseSmallSection; }
bool hasStandardEncoding() const { return !inMips16Mode(); }
bool useSoftFloat() const { return IsSoftFloat; }
+ bool useLongCalls() const { return UseLongCalls; }
+
bool enableLongBranchPass() const {
return hasStandardEncoding() || allowMixed16_32();
}
@@ -271,6 +289,8 @@ public:
bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+ bool isXRaySupported() const override { return true; }
+
// for now constant islands are on for the whole compilation unit but we only
// really use them if in addition we are in mips16 mode
static bool useConstantIslands();
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index bb48188..330ae19 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -12,26 +12,29 @@
//===----------------------------------------------------------------------===//
#include "MipsTargetMachine.h"
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "Mips.h"
-#include "Mips16FrameLowering.h"
#include "Mips16ISelDAGToDAG.h"
-#include "Mips16ISelLowering.h"
-#include "Mips16InstrInfo.h"
-#include "MipsFrameLowering.h"
-#include "MipsInstrInfo.h"
-#include "MipsSEFrameLowering.h"
#include "MipsSEISelDAGToDAG.h"
-#include "MipsSEISelLowering.h"
-#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
#include "MipsTargetObjectFile.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetOptions.h"
+#include <string>
using namespace llvm;
@@ -48,7 +51,7 @@ extern "C" void LLVMInitializeMipsTarget() {
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
const TargetOptions &Options,
bool isLittle) {
- std::string Ret = "";
+ std::string Ret;
MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions);
// There are both little and big endian mips.
@@ -102,7 +105,7 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
CPU, FS, Options, getEffectiveRelocModel(CM, RM), CM,
OL),
- isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()),
+ isLittle(isLittle), TLOF(llvm::make_unique<MipsTargetObjectFile>()),
ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this),
NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
@@ -113,9 +116,9 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-MipsTargetMachine::~MipsTargetMachine() {}
+MipsTargetMachine::~MipsTargetMachine() = default;
-void MipsebTargetMachine::anchor() { }
+void MipsebTargetMachine::anchor() {}
MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -125,7 +128,7 @@ MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
-void MipselTargetMachine::anchor() { }
+void MipselTargetMachine::anchor() {}
MipselTargetMachine::MipselTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
@@ -151,6 +154,11 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
bool hasNoMips16Attr =
!F.getFnAttribute("nomips16").hasAttribute(Attribute::None);
+ bool HasMicroMipsAttr =
+ !F.getFnAttribute("micromips").hasAttribute(Attribute::None);
+ bool HasNoMicroMipsAttr =
+ !F.getFnAttribute("nomicromips").hasAttribute(Attribute::None);
+
// FIXME: This is related to the code below to reset the target options,
// we need to know whether or not the soft float flag is set on the
// function, so we can enable it as a subtarget feature.
@@ -162,6 +170,10 @@ MipsTargetMachine::getSubtargetImpl(const Function &F) const {
FS += FS.empty() ? "+mips16" : ",+mips16";
else if (hasNoMips16Attr)
FS += FS.empty() ? "-mips16" : ",-mips16";
+ if (HasMicroMipsAttr)
+ FS += FS.empty() ? "+micromips" : ",+micromips";
+ else if (HasNoMicroMipsAttr)
+ FS += FS.empty() ? "-micromips" : ",-micromips";
if (softFloat)
FS += FS.empty() ? "+soft-float" : ",+soft-float";
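
The feature string is assembled attribute by attribute; for example, a
function carrying the "micromips" attribute and compiled with soft float
would accumulate as in this sketch (assumed inputs, illustrative C++):

    #include <string>

    std::string buildFS() {
      std::string FS;
      FS += FS.empty() ? "+micromips" : ",+micromips";   // "micromips" attr
      FS += FS.empty() ? "+soft-float" : ",+soft-float"; // soft-float mode
      return FS; // "+micromips,+soft-float"
    }
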
@@ -182,14 +194,14 @@ void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(*MF->getFunction()));
MF->setSubtarget(Subtarget);
- return;
}
namespace {
+
/// Mips Code Generator Pass Configuration Options.
class MipsPassConfig : public TargetPassConfig {
public:
- MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
+ MipsPassConfig(MipsTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
// The current implementation of long branch pass requires a scratch
// register ($at) to be available before branch instructions. Tail merging
@@ -209,35 +221,34 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
void addPreEmitPass() override;
-
void addPreRegAlloc() override;
-
};
-} // namespace
+
+} // end anonymous namespace
TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new MipsPassConfig(this, PM);
+ return new MipsPassConfig(*this, PM);
}
void MipsPassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
- addPass(createAtomicExpandPass(&getMipsTargetMachine()));
+ addPass(createAtomicExpandPass());
if (getMipsSubtarget().os16())
- addPass(createMipsOs16Pass(getMipsTargetMachine()));
+ addPass(createMipsOs16Pass());
if (getMipsSubtarget().inMips16HardFloat())
- addPass(createMips16HardFloatPass(getMipsTargetMachine()));
+ addPass(createMips16HardFloatPass());
}
// Install an instruction selector pass using
// the ISelDag to gen Mips code.
bool MipsPassConfig::addInstSelector() {
- addPass(createMipsModuleISelDagPass(getMipsTargetMachine()));
+ addPass(createMipsModuleISelDagPass());
addPass(createMips16ISelDag(getMipsTargetMachine(), getOptLevel()));
addPass(createMipsSEISelDag(getMipsTargetMachine(), getOptLevel()));
return false;
}
void MipsPassConfig::addPreRegAlloc() {
- addPass(createMipsOptimizePICCallPass(getMipsTargetMachine()));
+ addPass(createMipsOptimizePICCallPass());
}
TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
@@ -257,14 +268,14 @@ TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
// machine code is emitted. return true if -print-machineinstrs should
// print out the code after the passes.
void MipsPassConfig::addPreEmitPass() {
- MipsTargetMachine &TM = getMipsTargetMachine();
+ addPass(createMicroMipsSizeReductionPass());
// The delay slot filler pass can potentially create forbidden slot (FS)
// hazards for MIPSR6 which the hazard schedule pass (HSP) will fix. Any
// (new) pass that creates compact branches after the HSP must handle FS
// hazards itself or be pipelined before the HSP.
- addPass(createMipsDelaySlotFillerPass(TM));
+ addPass(createMipsDelaySlotFillerPass());
addPass(createMipsHazardSchedule());
- addPass(createMipsLongBranchPass(TM));
+ addPass(createMipsLongBranchPass());
addPass(createMipsConstantIslandPass());
}
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
index e4cf17e..a346286 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.h
@@ -1,4 +1,4 @@
-//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//===- MipsTargetMachine.h - Define TargetMachine for Mips ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -16,15 +16,14 @@
#include "MCTargetDesc/MipsABIInfo.h"
#include "MipsSubtarget.h"
-#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
-class formatted_raw_ostream;
-class MipsRegisterInfo;
class MipsTargetMachine : public LLVMTargetMachine {
bool isLittle;
@@ -67,12 +66,17 @@ public:
bool isLittleEndian() const { return isLittle; }
const MipsABIInfo &getABI() const { return ABI; }
+
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
/// Mips32/64 big endian target machine.
///
class MipsebTargetMachine : public MipsTargetMachine {
virtual void anchor();
+
public:
MipsebTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -84,6 +88,7 @@ public:
///
class MipselTargetMachine : public MipsTargetMachine {
virtual void anchor();
+
public:
MipselTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -91,6 +96,6 @@ public:
CodeGenOpt::Level OL);
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
index c5d6a05..4d73c39 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -10,13 +10,13 @@
#include "MipsTargetObjectFile.h"
#include "MipsSubtarget.h"
#include "MipsTargetMachine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
index 41ebe41..7d9f99c 100644
--- a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
+++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -40,6 +40,8 @@ public:
virtual void emitDirectiveSetNoMacro();
virtual void emitDirectiveSetMsa();
virtual void emitDirectiveSetNoMsa();
+ virtual void emitDirectiveSetMt();
+ virtual void emitDirectiveSetNoMt();
virtual void emitDirectiveSetAt();
virtual void emitDirectiveSetAtWithArg(unsigned RegNo);
virtual void emitDirectiveSetNoAt();
@@ -96,6 +98,7 @@ public:
virtual void emitDirectiveModuleOddSPReg();
virtual void emitDirectiveModuleSoftFloat();
virtual void emitDirectiveModuleHardFloat();
+ virtual void emitDirectiveModuleMT();
virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value);
virtual void emitDirectiveSetOddSPReg();
virtual void emitDirectiveSetNoOddSPReg();
@@ -204,6 +207,8 @@ public:
void emitDirectiveSetNoMacro() override;
void emitDirectiveSetMsa() override;
void emitDirectiveSetNoMsa() override;
+ void emitDirectiveSetMt() override;
+ void emitDirectiveSetNoMt() override;
void emitDirectiveSetAt() override;
void emitDirectiveSetAtWithArg(unsigned RegNo) override;
void emitDirectiveSetNoAt() override;
@@ -267,6 +272,7 @@ public:
void emitDirectiveModuleOddSPReg() override;
void emitDirectiveModuleSoftFloat() override;
void emitDirectiveModuleHardFloat() override;
+ void emitDirectiveModuleMT() override;
void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override;
void emitDirectiveSetOddSPReg() override;
void emitDirectiveSetNoOddSPReg() override;
diff --git a/contrib/llvm/lib/Target/Mips/Relocation.txt b/contrib/llvm/lib/Target/Mips/Relocation.txt
new file mode 100644
index 0000000..f1a6fd8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Mips/Relocation.txt
@@ -0,0 +1,125 @@
+MIPS Relocation Principles
+
+In LLVM, there are several elements of the llvm::ISD::NodeType enum
+that deal with addresses and/or relocations. These are defined in
+include/llvm/Target/TargetSelectionDAG.td, namely:
+ GlobalAddress, GlobalTLSAddress, JumpTable, ConstantPool,
+ ExternalSymbol, BlockAddress
+The MIPS backend uses several principles to handle these.
+
+1. Code for lowering address references to machine-dependent code is
+factored into common code for generating different address forms and
+is called by the relocation model specific lowering function, using
+templated functions. For example:
+
+ // lib/Target/Mips/MipsISelLowering.cpp
+ SDValue MipsTargetLowering::
+ lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
+
+calls
+
+ template <class NodeTy> // lib/Target/Mips/MipsISelLowering.h
+ SDValue getAddrLocal(NodeTy *N, const SDLoc &DL, EVT Ty,
+ SelectionDAG &DAG, bool IsN32OrN64) const
+
+which calls the overloaded function:
+
+ // lib/Target/Mips/MipsISelLowering.h
+ SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+
+2. Generic address nodes are lowered to some combination of target
+independent and machine specific SDNodes (for example:
+MipsISD::{Highest, Higher, Hi, Lo}) depending upon relocation model,
+ABI, and compilation options.
+
+The choice of specific instructions is delegated to ISel, which in
+turn relies on TableGen patterns to choose subtarget-specific
+instructions. For example, in getAddrLocal, the pseudo-code
+generated is:
+
+ (add (load (wrapper $gp, %got(sym))), %lo(sym))
+
+where "%lo" represents an instance of an SDNode with opcode
+"MipsISD::Lo", "wrapper" indicates one with opcode "MipsISD::Wrapper",
+and "%got" the global table pointer "getGlobalReg(...)". The "add" is
+"ISD::ADD", not a target dependent one.
+
+3. A TableGen multiclass pattern "MipsHiLoRelocs" is used to define a
+template pattern parameterized over the load upper immediate
+instruction, add operation, the zero register, and register class.
+Here the instantiation of MipsHiLoRelocs in MipsInstrInfo.td is used
+for MIPS32 to compute addresses under the static relocation model.
+
+ // lib/Target/Mips/MipsInstrInfo.td
+ multiclass MipsHiLoRelocs<Instruction Lui, Instruction Addiu,
+ Register ZeroReg, RegisterOperand GPROpnd> {
+ def : MipsPat<(MipsHi tglobaladdr:$in), (Lui tglobaladdr:$in)>;
+ ...
+ def : MipsPat<(MipsLo tglobaladdr:$in), (Addiu ZeroReg, tglobaladdr:$in)>;
+ ...
+ def : MipsPat<(add GPROpnd:$hi, (MipsLo tglobaladdr:$lo)),
+ (Addiu GPROpnd:$hi, tglobaladdr:$lo)>;
+ ...
+ }
+ defm : MipsHiLoRelocs<LUi, ADDiu, ZERO, GPR32Opnd>;
+
+ // lib/Target/Mips/Mips64InstrInfo.td
+ defm : MipsHiLoRelocs<LUi64, DADDiu, ZERO_64, GPR64Opnd>, SYM_32;
+
+The instantiation in Mips64InstrInfo.td is used for MIPS64 in ILP32
+mode, as guarded by the predicate "SYM_32" and also for a submode of
+LP64 where symbols are assumed to be 32 bits wide. A similar
+multiclass for MIPS64 in LP64 mode is also defined:
+
+ // lib/Target/Mips/Mips64InstrInfo.td
+ multiclass MipsHighestHigherHiLoRelocs<Instruction Lui,
+ Instruction Daddiu> {
+ ...
+ def : MipsPat<(MipsHighest (i64 tglobaladdr:$in)),
+ (Lui tglobaladdr:$in)>;
+ ...
+ def : MipsPat<(MipsHigher (i64 tglobaladdr:$in)),
+ (Daddiu ZERO_64, tglobaladdr:$in)>;
+ ...
+ def : MipsPat<(add GPR64:$hi, (MipsHigher (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ ...
+ def : MipsPat<(add GPR64:$hi, (MipsHi (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ ...
+ def : MipsPat<(add GPR64:$hi, (MipsLo (i64 tglobaladdr:$lo))),
+ (Daddiu GPR64:$hi, tglobaladdr:$lo)>;
+ }
+
+and it is instantiated twice:
+
+ // lib/Target/Mips/Mips64InstrInfo.td
+ defm : MipsHighestHigherHiLoRelocs<LUi64, DADDiu>, SYM_64;
+ // lib/Target/Mips/MicroMips64r6InstrInfo.td
+ defm : MipsHighestHigherHiLoRelocs<LUi64, DADDIU_MM64R6>, SYM_64,
+ ISA_MICROMIPS64R6;
+
+These patterns are used during instruction selection to match
+MipsISD::{Highest, Higher, Hi, Lo} to a specific machine instruction
+and operands.
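+
+As a worked illustration of how a 64-bit address decomposes into the
+Highest/Higher/Hi/Lo pieces these patterns consume, the usual relocation
+arithmetic (as defined, for example, in binutils; the helper names here
+are illustrative) folds in the carry produced by the sign-extended
+16-bit immediates of the lower additions:
+
+  #include <cstdint>
+
+  uint16_t reloLo(uint64_t A)      { return uint16_t(A); }
+  uint16_t reloHi(uint64_t A)      { return uint16_t((A + 0x8000) >> 16); }
+  uint16_t reloHigher(uint64_t A)  { return uint16_t((A + 0x80008000ULL) >> 32); }
+  uint16_t reloHighest(uint64_t A) { return uint16_t((A + 0x800080008000ULL) >> 48); }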
+
+More details on how multiclasses in TableGen work can be found in the
+section "Multiclass definitions and instances" in the document
+"TableGen Language Introduction"
+
+4. Instruction definitions are multiply defined to cover the different
+register classes. In some cases, such as LW/LW64, this also accounts
+for the difference in the results of instruction execution. On MIPS32,
+"lw" loads a 32 bit value from memory. On MIPS64, "lw" loads a 32 bit
+value from memory and sign extends the value to 64 bits.
+
+ // lib/Target/Mips/MipsInstrInfo.td
+ def LUi : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16_relaxed>, LUI_FM;
+ // lib/Target/Mips/Mips64InstrInfo.td
+ def LUi64 : LoadUpper<"lui", GPR64Opnd, uimm16_64_relaxed>, LUI_FM;
+
+defines two names "LUi" and "LUi64" with two different register
+classes, but with the same encoding---"LUI_FM". These instructions load a
+16-bit immediate into bits 31-16 and clear the lower 16 bits. On MIPS64,
+the result is sign-extended to 64 bits.
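+
+To make the load-upper semantics concrete, an illustrative C++ model of
+the two instructions (not LLVM code) is:
+
+  #include <cstdint>
+
+  // MIPS32 lui: set bits 31-16 from the immediate, clear bits 15-0.
+  uint32_t lui32(uint16_t Imm) { return uint32_t(Imm) << 16; }
+
+  // MIPS64 lui: same 32-bit result, then sign-extended to 64 bits.
+  int64_t lui64(uint16_t Imm) {
+    return int64_t(int32_t(uint32_t(Imm) << 16));
+  }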
diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index 4594c22..b774fe1 100644
--- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -61,6 +61,12 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
case 6:
OS << "%fd";
break;
+ case 7:
+ OS << "%h";
+ break;
+ case 8:
+ OS << "%hh";
+ break;
}
unsigned VReg = RegNo & 0x0FFFFFFF;
@@ -247,8 +253,12 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
O << "s";
else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
O << "u";
- else
+ else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
+ O << "b";
+ else if (Imm == NVPTX::PTXLdStInstCode::Float)
O << "f";
+ else
+ llvm_unreachable("Unknown register type");
} else if (!strcmp(Modifier, "vec")) {
if (Imm == NVPTX::PTXLdStInstCode::V2)
O << ".v2";
diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 78bdf4e..bdd0f15 100644
--- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -27,7 +27,7 @@ void NVPTXMCAsmInfo::anchor() {}
NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple) {
if (TheTriple.getArch() == Triple::nvptx64) {
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
}
CommentString = "//";
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
index c455a43..902d1b2 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h
@@ -45,10 +45,8 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM,
llvm::CodeGenOpt::Level OptLevel);
ModulePass *createNVPTXAssignValidGlobalNamesPass();
ModulePass *createGenericToNVVMPass();
-FunctionPass *createNVPTXInferAddressSpacesPass();
FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
FunctionPass *createNVVMReflectPass();
-FunctionPass *createNVVMReflectPass(const StringMap<int> &Mapping);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
@@ -108,7 +106,8 @@ enum AddressSpace {
enum FromType {
Unsigned = 0,
Signed,
- Float
+ Float,
+ Untyped
};
enum VecType {
Scalar = 1,
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 3c2594c..0139646 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -12,11 +12,11 @@
//
//===----------------------------------------------------------------------===//
+#include "NVPTXAsmPrinter.h"
#include "InstPrinter/NVPTXInstPrinter.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "MCTargetDesc/NVPTXMCAsmInfo.h"
#include "NVPTX.h"
-#include "NVPTXAsmPrinter.h"
#include "NVPTXMCExpr.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXRegisterInfo.h"
@@ -73,8 +73,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -320,6 +320,10 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
switch (Cnt->getType()->getTypeID()) {
default: report_fatal_error("Unsupported FP type"); break;
+ case Type::HalfTyID:
+ MCOp = MCOperand::createExpr(
+ NVPTXFloatMCExpr::createConstantFPHalf(Val, OutContext));
+ break;
case Type::FloatTyID:
MCOp = MCOperand::createExpr(
NVPTXFloatMCExpr::createConstantFPSingle(Val, OutContext));
@@ -357,6 +361,10 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
Ret = (5 << 28);
} else if (RC == &NVPTX::Float64RegsRegClass) {
Ret = (6 << 28);
+ } else if (RC == &NVPTX::Float16RegsRegClass) {
+ Ret = (7 << 28);
+ } else if (RC == &NVPTX::Float16x2RegsRegClass) {
+ Ret = (8 << 28);
} else {
report_fatal_error("Bad register class");
}
@@ -396,12 +404,15 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
unsigned size = 0;
if (auto *ITy = dyn_cast<IntegerType>(Ty)) {
size = ITy->getBitWidth();
- if (size < 32)
- size = 32;
} else {
assert(Ty->isFloatingPointTy() && "Floating point type expected here");
size = Ty->getPrimitiveSizeInBits();
}
+ // PTX ABI requires all scalar return values to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type in
+ // PTX, so its size must be adjusted here, too.
+ if (size < 32)
+ size = 32;
O << ".param .b" << size << " func_retval0";
} else if (isa<PointerType>(Ty)) {
@@ -1221,7 +1232,8 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
else
O << " .align " << GVar->getAlignment();
- if (ETy->isFloatingPointTy() || ETy->isIntegerTy() || ETy->isPointerTy()) {
+ if (ETy->isFloatingPointTy() || ETy->isPointerTy() ||
+ (ETy->isIntegerTy() && ETy->getScalarSizeInBits() <= 64)) {
O << " .";
// Special case: ABI requires that we use .u8 for predicates
if (ETy->isIntegerTy(1))
@@ -1262,6 +1274,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
// targets that support these high level field accesses. Structs, arrays
// and vectors are lowered into arrays of bytes.
switch (ETy->getTypeID()) {
+ case Type::IntegerTyID: // Integers larger than 64 bits
case Type::StructTyID:
case Type::ArrayTyID:
case Type::VectorTyID:
@@ -1376,6 +1389,9 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const {
}
break;
}
+ case Type::HalfTyID:
+ // fp16 is stored as .b16 for compatibility with pre-sm_53 PTX assembly.
+ return "b16";
case Type::FloatTyID:
return "f32";
case Type::DoubleTyID:
@@ -1477,7 +1493,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
const DataLayout &DL = getDataLayout();
- const AttributeSet &PAL = F->getAttributes();
+ const AttributeList &PAL = F->getAttributes();
const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
Function::const_arg_iterator I, E;
unsigned paramIndex = 0;
@@ -1534,12 +1550,12 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
}
- if (!PAL.hasAttribute(paramIndex + 1, Attribute::ByVal)) {
+ if (!PAL.hasParamAttribute(paramIndex, Attribute::ByVal)) {
if (Ty->isAggregateType() || Ty->isVectorTy()) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
- unsigned align = PAL.getParamAlignment(paramIndex + 1);
+ unsigned align = PAL.getParamAlignment(paramIndex);
if (align == 0)
align = DL.getABITypeAlignment(Ty);
@@ -1601,6 +1617,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
sz = 32;
} else if (isa<PointerType>(Ty))
sz = thePointerTy.getSizeInBits();
+ else if (Ty->isHalfTy())
+ // PTX ABI requires all scalar parameters to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type
+ // in PTX, so its size must be adjusted here, too.
+ sz = 32;
else
sz = Ty->getPrimitiveSizeInBits();
if (isABI)
@@ -1620,7 +1641,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
- unsigned align = PAL.getParamAlignment(paramIndex + 1);
+ unsigned align = PAL.getParamAlignment(paramIndex);
if (align == 0)
align = DL.getABITypeAlignment(ETy);
// Work around a bug in ptxas. When PTX code takes address of
@@ -1977,6 +1998,17 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
const DataLayout &DL = getDataLayout();
int Bytes;
+ // Integers of arbitrary width
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+ APInt Val = CI->getValue();
+ for (unsigned I = 0, E = DL.getTypeAllocSize(CPV->getType()); I < E; ++I) {
+ uint8_t Byte = Val.getLoBits(8).getZExtValue();
+ aggBuffer->addBytes(&Byte, 1, 1);
+ Val.lshrInPlace(8);
+ }
+ return;
+ }
+
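The arbitrary-width path above emits the constant one byte at a time,
least-significant byte first, for the type's alloc size. A standalone
sketch of the same serialization (using the compiler's __int128 extension
in place of APInt; illustrative only):

    #include <cstdint>
    #include <vector>

    void emitWideInt(unsigned __int128 V, unsigned N,
                     std::vector<uint8_t> &Out) {
      for (unsigned I = 0; I < N; ++I) {
        Out.push_back(uint8_t(V)); // low 8 bits, like getLoBits(8)
        V >>= 8;                   // like lshrInPlace(8)
      }
    }
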
// Old constants
if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV)) {
if (CPV->getNumOperands())
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 3907762..916b0e1 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "NVPTX.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
#include "NVPTXUtilities.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Constants.h"
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 43c478f..2749772 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -26,23 +26,6 @@ using namespace llvm;
#define DEBUG_TYPE "nvptx-isel"
-static cl::opt<int> UsePrecDivF32(
- "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
- " IEEE Compliant F32 div.rnd if available."),
- cl::init(2));
-
-static cl::opt<bool>
-UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
- cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
- cl::init(true));
-
-static cl::opt<bool>
-FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
- cl::init(false));
-
-
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
@@ -57,45 +40,20 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
}
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
- Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
- return SelectionDAGISel::runOnMachineFunction(MF);
+ Subtarget = &static_cast<const NVPTXSubtarget &>(MF.getSubtarget());
+ return SelectionDAGISel::runOnMachineFunction(MF);
}
int NVPTXDAGToDAGISel::getDivF32Level() const {
- if (UsePrecDivF32.getNumOccurrences() > 0) {
- // If nvptx-prec-div32=N is used on the command-line, always honor it
- return UsePrecDivF32;
- } else {
- // Otherwise, use div.approx if fast math is enabled
- if (TM.Options.UnsafeFPMath)
- return 0;
- else
- return 2;
- }
+ return Subtarget->getTargetLowering()->getDivF32Level();
}
bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
- if (UsePrecSqrtF32.getNumOccurrences() > 0) {
- // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
- return UsePrecSqrtF32;
- } else {
- // Otherwise, use sqrt.approx if fast math is enabled
- return !TM.Options.UnsafeFPMath;
- }
+ return Subtarget->getTargetLowering()->usePrecSqrtF32();
}
bool NVPTXDAGToDAGISel::useF32FTZ() const {
- if (FtzEnabled.getNumOccurrences() > 0) {
- // If nvptx-f32ftz is used on the command-line, always honor it
- return FtzEnabled;
- } else {
- const Function *F = MF->getFunction();
- // Otherwise, check for an nvptx-f32ftz attribute on the function
- if (F->hasFnAttribute("nvptx-f32ftz"))
- return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
- else
- return false;
- }
+ return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}
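The hunks above retire the pass-local cl::opt flags and defer to NVPTXTargetLowering, which now owns the same decisions. For reference, the "explicit flag wins, otherwise derive from fast-math" pattern the removed code implemented looks like this; the flag name demo-prec-sqrtf32 and the helper usePrecSqrt are made up for illustration:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> DemoPrecSqrtF32(
    "demo-prec-sqrtf32", llvm::cl::Hidden,
    llvm::cl::desc("Sketch of the removed nvptx-prec-sqrtf32 flag."),
    llvm::cl::init(true));

static bool usePrecSqrt(bool UnsafeFPMath) {
  if (DemoPrecSqrtF32.getNumOccurrences() > 0)
    return DemoPrecSqrtF32;  // an explicit command-line setting always wins
  return !UnsafeFPMath;      // otherwise sqrt.approx only under fast math
}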
bool NVPTXDAGToDAGISel::allowFMA() const {
@@ -103,6 +61,11 @@ bool NVPTXDAGToDAGISel::allowFMA() const {
return TL->allowFMA(*MF, OptLevel);
}
+bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
+ const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
+ return TL->allowUnsafeFPMath(*MF);
+}
+
/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {
@@ -121,6 +84,14 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStore(N))
return;
break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (tryEXTRACT_VECTOR_ELEMENT(N))
+ return;
+ break;
+ case NVPTXISD::SETP_F16X2:
+ SelectSETP_F16X2(N);
+ return;
+
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
if (tryLoadVector(N))
@@ -515,6 +486,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
case ISD::ADDRSPACECAST:
SelectAddrSpaceCast(N);
return;
+ case ISD::ConstantFP:
+ if (tryConstantFP16(N))
+ return;
+ break;
default:
break;
}
@@ -536,6 +511,140 @@ bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
}
}
+// There's no way to specify FP16 immediates in .f16 ops, so we have to
+// load them into an .f16 register first.
+bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
+ if (N->getValueType(0) != MVT::f16)
+ return false;
+ SDValue Val = CurDAG->getTargetConstantFP(
+ cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
+ SDNode *LoadConstF16 =
+ CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N), MVT::f16, Val);
+ ReplaceNode(N, LoadConstF16);
+ return true;
+}
+
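tryConstantFP16 above materializes the immediate through the LOAD_CONST_F16 pseudo rather than encoding it inline. Whatever instruction that pseudo ultimately prints (its expansion is not part of this hunk), the payload it carries is the raw IEEE-754 half bit pattern held by the ConstantFPSDNode. A quick standalone check of that encoding via APFloat:

#include "llvm/ADT/APFloat.h"
#include <cstdio>

int main() {
  // 1.0 in IEEE half precision; APFloat is the value class behind the
  // ConstantFPSDNode that tryConstantFP16 re-wraps as a target constant.
  llvm::APFloat One(llvm::APFloat::IEEEhalf(), "1.0");
  std::printf("0x%04llx\n", // prints 0x3c00
              (unsigned long long)One.bitcastToAPInt().getZExtValue());
}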
+// Map an ISD::CondCode value to the appropriate CmpMode expected by
+// NVPTXInstPrinter::printCmpMode().
+static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
+ using NVPTX::PTXCmpMode::CmpMode;
+ unsigned PTXCmpMode = [](ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unexpected condition code.");
+ case ISD::SETOEQ:
+ return CmpMode::EQ;
+ case ISD::SETOGT:
+ return CmpMode::GT;
+ case ISD::SETOGE:
+ return CmpMode::GE;
+ case ISD::SETOLT:
+ return CmpMode::LT;
+ case ISD::SETOLE:
+ return CmpMode::LE;
+ case ISD::SETONE:
+ return CmpMode::NE;
+ case ISD::SETO:
+ return CmpMode::NUM;
+ case ISD::SETUO:
+ return CmpMode::NotANumber;
+ case ISD::SETUEQ:
+ return CmpMode::EQU;
+ case ISD::SETUGT:
+ return CmpMode::GTU;
+ case ISD::SETUGE:
+ return CmpMode::GEU;
+ case ISD::SETULT:
+ return CmpMode::LTU;
+ case ISD::SETULE:
+ return CmpMode::LEU;
+ case ISD::SETUNE:
+ return CmpMode::NEU;
+ case ISD::SETEQ:
+ return CmpMode::EQ;
+ case ISD::SETGT:
+ return CmpMode::GT;
+ case ISD::SETGE:
+ return CmpMode::GE;
+ case ISD::SETLT:
+ return CmpMode::LT;
+ case ISD::SETLE:
+ return CmpMode::LE;
+ case ISD::SETNE:
+ return CmpMode::NE;
+ }
+ }(CondCode.get());
+
+ if (FTZ)
+ PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
+
+ return PTXCmpMode;
+}
+
+bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
+ unsigned PTXCmpMode =
+ getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
+ SDLoc DL(N);
+ SDNode *SetP = CurDAG->getMachineNode(
+ NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
+ N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
+ ReplaceNode(N, SetP);
+ return true;
+}
+
+// Find all instances of extract_vector_elt that use this v2f16 vector
+// and coalesce them into a scattering move instruction.
+bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
+ SDValue Vector = N->getOperand(0);
+
+ // We only care about f16x2 as it's the only real vector type we
+ // need to deal with.
+ if (Vector.getSimpleValueType() != MVT::v2f16)
+ return false;
+
+ // Find and record all uses of this vector that extract element 0 or 1.
+ SmallVector<SDNode *, 4> E0, E1;
+ for (const auto &U : Vector.getNode()->uses()) {
+ if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ continue;
+ if (U->getOperand(0) != Vector)
+ continue;
+ if (const ConstantSDNode *IdxConst =
+ dyn_cast<ConstantSDNode>(U->getOperand(1))) {
+ if (IdxConst->getZExtValue() == 0)
+ E0.push_back(U);
+ else if (IdxConst->getZExtValue() == 1)
+ E1.push_back(U);
+ else
+ llvm_unreachable("Invalid vector index.");
+ }
+ }
+
+ // There's no point scattering f16x2 if we only ever access one
+ // element of it.
+ if (E0.empty() || E1.empty())
+ return false;
+
+ unsigned Op = NVPTX::SplitF16x2;
+ // If the vector has been BITCAST'ed from i32, we can use the original
+ // value directly and avoid a register-to-register move.
+ SDValue Source = Vector;
+ if (Vector->getOpcode() == ISD::BITCAST) {
+ Op = NVPTX::SplitI32toF16x2;
+ Source = Vector->getOperand(0);
+ }
+ // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
+ // into f16,f16 SplitF16x2(V)
+ SDNode *ScatterOp =
+ CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
+ for (auto *Node : E0)
+ ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
+ for (auto *Node : E1)
+ ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
+
+ return true;
+}
+
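The rewrite above is essentially common-subexpression elimination over element extracts: one split node produces both halves, and every former extract-0/extract-1 use is rewired to the matching result. A rough scalar analogue, assuming the v2f16 is packed low element first into 32 bits (the lane order here is illustrative):

#include <cstdint>
#include <utility>

static std::pair<uint16_t, uint16_t> splitF16x2(uint32_t Packed) {
  // Compute the split once; both halves are then available to all users.
  return {static_cast<uint16_t>(Packed & 0xFFFFu), // element 0
          static_cast<uint16_t>(Packed >> 16)};    // element 1
}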
static unsigned int getCodeAddrSpace(MemSDNode *N) {
const Value *Src = N->getMemOperand()->getValue();
@@ -681,6 +790,35 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
}
}
+// Helper function to reduce the amount of boilerplate code needed for
+// opcode selection.
+static Optional<unsigned> pickOpcodeForVT(
+ MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
+ unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
+ unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
+ switch (VT) {
+ case MVT::i1:
+ case MVT::i8:
+ return Opcode_i8;
+ case MVT::i16:
+ return Opcode_i16;
+ case MVT::i32:
+ return Opcode_i32;
+ case MVT::i64:
+ return Opcode_i64;
+ case MVT::f16:
+ return Opcode_f16;
+ case MVT::v2f16:
+ return Opcode_f16x2;
+ case MVT::f32:
+ return Opcode_f32;
+ case MVT::f64:
+ return Opcode_f64;
+ default:
+ return None;
+ }
+}
+
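The contract of the helper above: each caller passes the opcode variant for every type it supports, Optional-typed slots cover forms that may not exist, and None propagates out so the caller can bail instead of asserting. A reduced two-variant sketch with hypothetical parameter names:

#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineValueType.h"

static llvm::Optional<unsigned>
pickDemo(llvm::MVT::SimpleValueType VT, unsigned OpcodeI32,
         llvm::Optional<unsigned> OpcodeF64) {
  switch (VT) {
  case llvm::MVT::i32:
    return OpcodeI32;
  case llvm::MVT::f64:
    return OpcodeF64;  // may itself be None, e.g. no f64 form of a v4 load
  default:
    return llvm::None; // unsupported type: caller returns false
  }
}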
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc dl(N);
LoadSDNode *LD = cast<LoadSDNode>(N);
@@ -709,33 +847,32 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
isVolatile = false;
- // Vector Setting
- MVT SimpleVT = LoadedVT.getSimpleVT();
- unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
- if (SimpleVT.isVector()) {
- unsigned num = SimpleVT.getVectorNumElements();
- if (num == 2)
- vecType = NVPTX::PTXLdStInstCode::V2;
- else if (num == 4)
- vecType = NVPTX::PTXLdStInstCode::V4;
- else
- return false;
- }
-
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
// Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
// type is integer
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+ MVT SimpleVT = LoadedVT.getSimpleVT();
MVT ScalarVT = SimpleVT.getScalarType();
// Read at least 8 bits (predicates are stored as 8-bit values)
unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
unsigned int fromType;
+
+ // Vector Setting
+ unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+ if (SimpleVT.isVector()) {
+ assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
+ // v2f16 is loaded using ld.b32
+ fromTypeWidth = 32;
+ }
+
if ((LD->getExtensionType() == ISD::SEXTLOAD))
fromType = NVPTX::PTXLdStInstCode::Signed;
else if (ScalarVT.isFloatingPoint())
- fromType = NVPTX::PTXLdStInstCode::Float;
+ // f16 uses .b16 as its storage type.
+ fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
fromType = NVPTX::PTXLdStInstCode::Unsigned;
@@ -744,169 +881,72 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDValue N1 = N->getOperand(1);
SDValue Addr;
SDValue Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
if (SelectDirectAddr(N1, Addr)) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_avar;
- break;
- default:
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
+ NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
+ NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Addr, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_asi;
- break;
- default:
+ Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
+ NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
+ NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
+ NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
- if (TM.is64Bit()) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_ari_64;
- break;
- default:
- return false;
- }
- } else {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_ari;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
+ NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
+ NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
+ else
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
+ NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
+ NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
} else {
- if (TM.is64Bit()) {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_areg_64;
- break;
- default:
- return false;
- }
- } else {
- switch (TargetVT) {
- case MVT::i8:
- Opcode = NVPTX::LD_i8_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::LD_i16_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::LD_i32_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::LD_i64_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::LD_f32_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::LD_f64_areg;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
+ NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
+ NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
+ NVPTX::LD_f64_areg_64);
+ else
+ Opcode = pickOpcodeForVT(
+ TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
+ NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
+ NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
getI32Imm(vecType, dl), getI32Imm(fromType, dl),
getI32Imm(fromTypeWidth, dl), N1, Chain };
- NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+ NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+ MVT::Other, Ops);
}
if (!NVPTXLD)
@@ -925,7 +965,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
MemSDNode *MemSD = cast<MemSDNode>(N);
@@ -968,7 +1008,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
if (ExtensionType == ISD::SEXTLOAD)
FromType = NVPTX::PTXLdStInstCode::Signed;
else if (ScalarVT.isFloatingPoint())
- FromType = NVPTX::PTXLdStInstCode::Float;
+ FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
FromType = NVPTX::PTXLdStInstCode::Unsigned;
@@ -987,111 +1028,67 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
EVT EltVT = N->getValueType(0);
+ // v8f16 is a special case. PTX doesn't have an ld.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // load them with ld.v4.b32.
+ if (EltVT == MVT::v2f16) {
+ assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
+ EltVT = MVT::i32;
+ FromType = NVPTX::PTXLdStInstCode::Untyped;
+ FromTypeWidth = 32;
+ }
+
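The layout assumption behind the special case above: eight halves occupy exactly four 32-bit lanes, each lane holding one adjacent v2f16 pair, so a single ld.v4.b32 covers the whole v8f16. A host-side restatement of that reinterpretation, purely illustrative:

#include <cstdint>
#include <cstring>

static void loadV8f16AsV4b32(const uint16_t Halves[8], uint32_t Lanes[4]) {
  // Each 32-bit lane carries two adjacent f16 values, i.e. one v2f16 chunk.
  std::memcpy(Lanes, Halves, 8 * sizeof(uint16_t));
}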
if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
+ NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
+ NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
+ NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_avar;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
+ NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
+ NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
+ NVPTX::LDV_f32_v4_avar, None);
break;
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Addr, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_asi;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
+ NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
+ NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
+ NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_asi;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
+ NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
+ NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
+ NVPTX::LDV_f32_v4_asi, None);
break;
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
if (TM.is64Bit()) {
@@ -1099,46 +1096,19 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
+ NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
+ NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
+ NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
+ NVPTX::LDV_f64_v2_ari_64);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
+ NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
+ NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
+ NVPTX::LDV_f32_v4_ari_64, None);
break;
}
} else {
@@ -1146,101 +1116,47 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_ari;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
+ NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
+ NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
+ NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_ari;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
+ NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
+ NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
+ NVPTX::LDV_f32_v4_ari, None);
break;
}
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
} else {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
+ NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
+ NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
+ NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
+ NVPTX::LDV_f64_v2_areg_64);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
+ NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
+ NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
+ NVPTX::LDV_f32_v4_areg_64, None);
break;
}
} else {
@@ -1248,54 +1164,28 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
default:
return false;
case NVPTXISD::LoadV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v2_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v2_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v2_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::LDV_i64_v2_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v2_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::LDV_f64_v2_areg;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
+ NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
+ NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
+ NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
+ NVPTX::LDV_f64_v2_areg);
break;
case NVPTXISD::LoadV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::LDV_i8_v4_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::LDV_i16_v4_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::LDV_i32_v4_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::LDV_f32_v4_areg;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
+ NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
+ NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
+ NVPTX::LDV_f32_v4_areg, None);
break;
}
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL), Op1, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
@@ -1338,7 +1228,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
Mem = cast<MemSDNode>(N);
}
- unsigned Opcode;
+ Optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
SDValue Base, Offset, Addr;
@@ -1366,142 +1256,72 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
default:
return false;
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
break;
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
break;
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
break;
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { Addr, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
: SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
if (TM.is64Bit()) {
@@ -1510,139 +1330,68 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
break;
}
} else {
@@ -1651,146 +1400,75 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
break;
}
}
-
- SDValue Ops[] = { Base, Offset, Chain };
-
- LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ if (!Opcode)
+ return false;
+ SDValue Ops[] = {Base, Offset, Chain};
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
switch (N->getOpcode()) {
@@ -1798,139 +1476,68 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
break;
}
} else {
@@ -1939,145 +1546,75 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
return false;
case ISD::LOAD:
case ISD::INTRINSIC_W_CHAIN:
- if (IsLDG) {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
- break;
- }
- } else {
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
- break;
- }
- }
+ if (IsLDG)
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
+ else
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
break;
case NVPTXISD::LoadV2:
case NVPTXISD::LDGV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
break;
case NVPTXISD::LDUV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
- break;
- case MVT::i64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
- break;
- case MVT::f64:
- Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
break;
case NVPTXISD::LoadV4:
case NVPTXISD::LDGV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
+ NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
break;
case NVPTXISD::LDUV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
- break;
- case MVT::i16:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
- break;
- case MVT::i32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
- break;
- case MVT::f32:
- Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
+ NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
break;
}
}
-
+ if (!Opcode)
+ return false;
SDValue Ops[] = { Op1, Chain };
- LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+ LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
}
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
@@ -2151,24 +1688,23 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
- if (SimpleVT.isVector()) {
- unsigned num = SimpleVT.getVectorNumElements();
- if (num == 2)
- vecType = NVPTX::PTXLdStInstCode::V2;
- else if (num == 4)
- vecType = NVPTX::PTXLdStInstCode::V4;
- else
- return false;
- }
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
//
MVT ScalarVT = SimpleVT.getScalarType();
unsigned toTypeWidth = ScalarVT.getSizeInBits();
+ if (SimpleVT.isVector()) {
+ assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
+ // v2f16 is stored using st.b32
+ toTypeWidth = 32;
+ }
+
unsigned int toType;
if (ScalarVT.isFloatingPoint())
- toType = NVPTX::PTXLdStInstCode::Float;
+ // f16 uses .b16 as its storage type.
+ toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
toType = NVPTX::PTXLdStInstCode::Unsigned;
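Same storage-type rule as the load path: floating-point stores are typed, except f16, which goes through an untyped .b16 slot. A standalone restatement of that selection; the enum values here are illustrative, the real constants live in the NVPTX::PTXLdStInstCode namespace:

enum LdStType : unsigned { Unsigned = 0, Float = 1, Untyped = 2 };

static unsigned pickToType(bool IsFP, bool IsF16) {
  if (IsFP)
    return IsF16 ? Untyped : Float; // f16 stores use untyped .b16
  return Unsigned;                  // integer stores always use 'u<N>'
}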
@@ -2178,173 +1714,73 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDValue N2 = N->getOperand(2);
SDValue Addr;
SDValue Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
if (SelectDirectAddr(N2, Addr)) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_avar;
- break;
- default:
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
+ NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
+ NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
+ NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
: SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_asi;
- break;
- default:
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
+ NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
+ NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
+ NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
+ if (!Opcode)
return false;
- }
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
: SelectADDRri(N2.getNode(), N2, Base, Offset)) {
- if (TM.is64Bit()) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_ari_64;
- break;
- default:
- return false;
- }
- } else {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_ari;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode = pickOpcodeForVT(
+ SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
+ NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
+ NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
+ else
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
+ NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
+ NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
+ NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
+ if (!Opcode)
+ return false;
+
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
Offset, Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
} else {
- if (TM.is64Bit()) {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_areg_64;
- break;
- default:
- return false;
- }
- } else {
- switch (SourceVT) {
- case MVT::i8:
- Opcode = NVPTX::ST_i8_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::ST_i16_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::ST_i32_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::ST_i64_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::ST_f32_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::ST_f64_areg;
- break;
- default:
- return false;
- }
- }
+ if (TM.is64Bit())
+ Opcode =
+ pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
+ NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
+ NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
+ NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
+ else
+ Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
+ NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
+ NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
+ NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
+ if (!Opcode)
+ return false;
SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
Chain };
- NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+ NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
}
if (!NVPTXST)
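The refactoring in this hunk funnels every opcode choice through pickOpcodeForVT, whose definition lives in an earlier hunk of this patch and is not shown here. Reconstructed from its call sites (a sketch, not a quote), it presumably looks like:

    static Optional<unsigned> pickOpcodeForVT(
        MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
        unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
        unsigned Opcode_f16x2, unsigned Opcode_f32,
        Optional<unsigned> Opcode_f64) {
      switch (VT) {
      case MVT::i1: // the removed switches mapped i1 and i8 to the same opcode
      case MVT::i8:
        return Opcode_i8;
      case MVT::i16:
        return Opcode_i16;
      case MVT::i32:
        return Opcode_i32;
      case MVT::i64:
        return Opcode_i64;
      case MVT::f16:
        return Opcode_f16;
      case MVT::v2f16:
        return Opcode_f16x2;
      case MVT::f32:
        return Opcode_f32;
      case MVT::f64:
        return Opcode_f64;
      default:
        return None;
      }
    }

Returning Optional<unsigned> lets call sites pass None where no instruction exists; the st.v4 variants pass None in the i64 and f64 slots because a v4 store of 64-bit elements would be a 256-bit access that PTX does not provide. The shared "if (!Opcode) return false;" then replaces the per-switch default cases.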
@@ -2361,7 +1797,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SDValue Chain = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Addr, Offset, Base;
- unsigned Opcode;
+ Optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *ST;
EVT EltVT = Op1.getValueType();
@@ -2391,7 +1827,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
unsigned ToTypeWidth = ScalarVT.getSizeInBits();
unsigned ToType;
if (ScalarVT.isFloatingPoint())
- ToType = NVPTX::PTXLdStInstCode::Float;
+ ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+ : NVPTX::PTXLdStInstCode::Float;
else
ToType = NVPTX::PTXLdStInstCode::Unsigned;
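As with the scalar stores earlier, f16 maps to Untyped so the emitted PTX uses a bit-preserving .bNN suffix rather than a typed one. Illustratively:

    // Assumed PTX output, for illustration only (not taken from a test in
    // this patch):
    //   st.global.b16 [%rd1], %h1;   f16    -> Untyped  => .b16
    //   st.global.b32 [%rd1], %r1;   v2f16  -> Untyped  => .b32
    //   st.global.f32 [%rd1], %f1;   f32    -> Float    => .f32
    //   st.global.u32 [%rd1], %r2;   i32    -> Unsigned => .u32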
@@ -2418,6 +1855,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
return false;
}
+ // v8f16 is a special case. PTX doesn't have st.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // store them with st.v4.b32.
+ if (EltVT == MVT::v2f16) {
+ assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
+ EltVT = MVT::i32;
+ ToType = NVPTX::PTXLdStInstCode::Untyped;
+ ToTypeWidth = 32;
+ }
+
StOps.push_back(getI32Imm(IsVolatile, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
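The reinterpretation above is sound because two IEEE-754 half values occupy exactly the 32 bits of an i32, so each v2f16 chunk of a v8f16 store travels through one lane of st.v4.b32. A standalone illustration of the packing (not part of the patch):

    #include <cstdint>

    // Pack two half-precision bit patterns into the 32-bit word that one
    // lane of st.v4.b32 would store; lane 0 occupies the low 16 bits.
    static uint32_t packHalves(uint16_t Lo, uint16_t Hi) {
      return static_cast<uint32_t>(Lo) | (static_cast<uint32_t>(Hi) << 16);
    }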
@@ -2429,46 +1876,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_avar;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_avar;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_avar;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
+ NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
+ NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
+ NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_avar;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_avar;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_avar;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_avar;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
+ NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
+ NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
+ NVPTX::STV_f32_v4_avar, None);
break;
}
StOps.push_back(Addr);
@@ -2478,46 +1897,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_asi;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_asi;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_asi;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
+ NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
+ NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
+ NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_asi;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_asi;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_asi;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_asi;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
+ NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
+ NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
+ NVPTX::STV_f32_v4_asi, None);
break;
}
StOps.push_back(Base);
@@ -2529,46 +1920,19 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_ari_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_ari_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
+ NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
+ NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
+ NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
+ NVPTX::STV_f64_v2_ari_64);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_ari_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_ari_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_ari_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_ari_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
+ NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
+ NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
+ NVPTX::STV_f32_v4_ari_64, None);
break;
}
} else {
@@ -2576,46 +1940,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_ari;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_ari;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_ari;
- break;
- }
+ Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+ NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
+ NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
+ NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
+ NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_ari;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_ari;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_ari;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_ari;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
+ NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
+ NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
+ NVPTX::STV_f32_v4_ari, None);
break;
}
}
@@ -2627,46 +1963,19 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_areg_64;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_areg_64;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
+ NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
+ NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
+ NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
+ NVPTX::STV_f64_v2_areg_64);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_areg_64;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_areg_64;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_areg_64;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_areg_64;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
+ NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
+ NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
+ NVPTX::STV_f32_v4_areg_64, None);
break;
}
} else {
@@ -2674,55 +1983,31 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
default:
return false;
case NVPTXISD::StoreV2:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v2_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v2_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v2_areg;
- break;
- case MVT::i64:
- Opcode = NVPTX::STV_i64_v2_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v2_areg;
- break;
- case MVT::f64:
- Opcode = NVPTX::STV_f64_v2_areg;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
+ NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
+ NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
+ NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
+ NVPTX::STV_f64_v2_areg);
break;
case NVPTXISD::StoreV4:
- switch (EltVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- Opcode = NVPTX::STV_i8_v4_areg;
- break;
- case MVT::i16:
- Opcode = NVPTX::STV_i16_v4_areg;
- break;
- case MVT::i32:
- Opcode = NVPTX::STV_i32_v4_areg;
- break;
- case MVT::f32:
- Opcode = NVPTX::STV_f32_v4_areg;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
+ NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
+ NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
+ NVPTX::STV_f32_v4_areg, None);
break;
}
}
StOps.push_back(N2);
}
+ if (!Opcode)
+ return false;
+
StOps.push_back(Chain);
- ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, StOps);
+ ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
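Note the shape of tryStoreVector's failure handling: unlike tryStore, which checks and bails inside each addressing-mode branch, this function defers a single null check until after all branches have run (the "if (!Opcode) return false;" just before the Chain operand is pushed). The same pattern in miniature, with placeholder opcodes:

    #include "llvm/ADT/Optional.h"
    using llvm::Optional;

    static bool trySelect(int AddrMode, bool HasF16) {
      Optional<unsigned> Opcode; // stays empty unless a branch picks something
      switch (AddrMode) {
      case 0:
        if (HasF16)
          Opcode = 1u;
        break;
      case 1:
        if (HasF16)
          Opcode = 2u;
        break;
      default:
        return false;
      }
      if (!Opcode) // a single bail-out point shared by every branch
        return false;
      return Opcode.getValue() != 0; // getValue() is safe after the check
    }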
@@ -2757,87 +2042,36 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
EVT EltVT = Node->getValueType(0);
EVT MemVT = Mem->getMemoryVT();
- unsigned Opc = 0;
+ Optional<unsigned> Opcode;
switch (VecSize) {
default:
return false;
case 1:
- switch (MemVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opc = NVPTX::LoadParamMemI8;
- break;
- case MVT::i8:
- Opc = NVPTX::LoadParamMemI8;
- break;
- case MVT::i16:
- Opc = NVPTX::LoadParamMemI16;
- break;
- case MVT::i32:
- Opc = NVPTX::LoadParamMemI32;
- break;
- case MVT::i64:
- Opc = NVPTX::LoadParamMemI64;
- break;
- case MVT::f32:
- Opc = NVPTX::LoadParamMemF32;
- break;
- case MVT::f64:
- Opc = NVPTX::LoadParamMemF64;
- break;
- }
+ Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
+ NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
+ NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
+ NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
+ NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
break;
case 2:
- switch (MemVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opc = NVPTX::LoadParamMemV2I8;
- break;
- case MVT::i8:
- Opc = NVPTX::LoadParamMemV2I8;
- break;
- case MVT::i16:
- Opc = NVPTX::LoadParamMemV2I16;
- break;
- case MVT::i32:
- Opc = NVPTX::LoadParamMemV2I32;
- break;
- case MVT::i64:
- Opc = NVPTX::LoadParamMemV2I64;
- break;
- case MVT::f32:
- Opc = NVPTX::LoadParamMemV2F32;
- break;
- case MVT::f64:
- Opc = NVPTX::LoadParamMemV2F64;
- break;
- }
+ Opcode =
+ pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
+ NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
+ NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
+ NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
+ NVPTX::LoadParamMemV2F64);
break;
case 4:
- switch (MemVT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opc = NVPTX::LoadParamMemV4I8;
- break;
- case MVT::i8:
- Opc = NVPTX::LoadParamMemV4I8;
- break;
- case MVT::i16:
- Opc = NVPTX::LoadParamMemV4I16;
- break;
- case MVT::i32:
- Opc = NVPTX::LoadParamMemV4I32;
- break;
- case MVT::f32:
- Opc = NVPTX::LoadParamMemV4F32;
- break;
- }
+ Opcode = pickOpcodeForVT(
+ MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
+ NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
+ NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
+ NVPTX::LoadParamMemV4F32, None);
break;
}
+ if (!Opcode)
+ return false;
SDVTList VTs;
if (VecSize == 1) {
@@ -2856,7 +2090,7 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
Ops.push_back(Chain);
Ops.push_back(Flag);
- ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops));
+ ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
return true;
}
@@ -2893,89 +2127,36 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
// Determine target opcode
// If we have an i1, use an 8-bit store. The lowering code in
// NVPTXISelLowering will have already emitted an upcast.
- unsigned Opcode = 0;
+ Optional<unsigned> Opcode;
switch (NumElts) {
default:
return false;
case 1:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreRetvalI8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreRetvalI8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreRetvalI16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreRetvalI32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreRetvalI64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreRetvalF32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreRetvalF64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
+ NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
+ NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
+ NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
break;
case 2:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreRetvalV2I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreRetvalV2I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreRetvalV2I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreRetvalV2I32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreRetvalV2I64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreRetvalV2F32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreRetvalV2F64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
+ NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
+ NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
+ NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
break;
case 4:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreRetvalV4I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreRetvalV4I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreRetvalV4I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreRetvalV4I32;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreRetvalV4F32;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
+ NVPTX::StoreRetvalV4I32, None,
+ NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
+ NVPTX::StoreRetvalV4F32, None);
break;
}
+ if (!Opcode)
+ return false;
- SDNode *Ret =
- CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+ SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
@@ -3024,88 +2205,36 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
// Determine target opcode
// If we have an i1, use an 8-bit store. The lowering code in
// NVPTXISelLowering will have already emitted an upcast.
- unsigned Opcode = 0;
+ Optional<unsigned> Opcode;
switch (N->getOpcode()) {
default:
switch (NumElts) {
default:
return false;
case 1:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreParamI8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreParamI8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreParamI16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamI32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamI64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreParamF32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreParamF64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreParamI8, NVPTX::StoreParamI16,
+ NVPTX::StoreParamI32, NVPTX::StoreParamI64,
+ NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
+ NVPTX::StoreParamF32, NVPTX::StoreParamF64);
break;
case 2:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreParamV2I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreParamV2I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreParamV2I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamV2I32;
- break;
- case MVT::i64:
- Opcode = NVPTX::StoreParamV2I64;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreParamV2F32;
- break;
- case MVT::f64:
- Opcode = NVPTX::StoreParamV2F64;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
+ NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
+ NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
+ NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
break;
case 4:
- switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i1:
- Opcode = NVPTX::StoreParamV4I8;
- break;
- case MVT::i8:
- Opcode = NVPTX::StoreParamV4I8;
- break;
- case MVT::i16:
- Opcode = NVPTX::StoreParamV4I16;
- break;
- case MVT::i32:
- Opcode = NVPTX::StoreParamV4I32;
- break;
- case MVT::f32:
- Opcode = NVPTX::StoreParamV4F32;
- break;
- }
+ Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+ NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
+ NVPTX::StoreParamV4I32, None,
+ NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
+ NVPTX::StoreParamV4F32, None);
break;
}
+ if (!Opcode)
+ return false;
break;
// Special case: if we have a sign-extend/zero-extend node, insert the
// conversion instruction first, and use that as the value operand to
@@ -3132,7 +2261,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
SDNode *Ret =
- CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+ CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 0591035..8fc38e7 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -34,6 +34,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool usePrecSqrtF32() const;
bool useF32FTZ() const;
bool allowFMA() const;
+ bool allowUnsafeFPMath() const;
public:
explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
@@ -69,6 +70,9 @@ private:
bool tryTextureIntrinsic(SDNode *N);
bool trySurfaceIntrinsic(SDNode *N);
bool tryBFE(SDNode *N);
+ bool tryConstantFP16(SDNode *N);
+ bool SelectSETP_F16X2(SDNode *N);
+ bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
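The three new declarations above appear only as prototypes in this hunk. For orientation, a hypothetical sketch of tryConstantFP16, inferred from its name and the f16 focus of the patch rather than quoted from it, would materialize an f16 immediate through a dedicated 16-bit move:

    // Hypothetical sketch; the actual body is in another hunk of this patch.
    bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
      if (N->getValueType(0) != MVT::f16)
        return false;
      // Rewrap the constant as a target constant and emit it via a move;
      // LOAD_CONST_F16 is an assumed pseudo-instruction name.
      SDValue Val = CurDAG->getTargetConstantFP(
          cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), MVT::f16);
      SDNode *Mov = CurDAG->getMachineNode(NVPTX::LOAD_CONST_F16, SDLoc(N),
                                           MVT::f16, Val);
      ReplaceNode(N, Mov);
      return true;
    }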
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7a760fd..f800d91 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1,4653 +1,4667 @@
-//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interfaces that NVPTX uses to lower LLVM code into a
-// selection DAG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/NVPTXBaseInfo.h"
-#include "NVPTX.h"
-#include "NVPTXISelLowering.h"
-#include "NVPTXSection.h"
-#include "NVPTXSubtarget.h"
-#include "NVPTXTargetMachine.h"
-#include "NVPTXTargetObjectFile.h"
-#include "NVPTXUtilities.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineValueType.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetCallingConv.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "nvptx-lower"
-
-using namespace llvm;
-
-static unsigned int uniqueCallSite = 0;
-
-static cl::opt<bool> sched4reg(
- "nvptx-sched4reg",
- cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
-
-static cl::opt<unsigned>
-FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
- " 1: do it 2: do it aggressively"),
- cl::init(2));
-
-static bool IsPTXVectorType(MVT VT) {
- switch (VT.SimpleTy) {
- default:
- return false;
- case MVT::v2i1:
- case MVT::v4i1:
- case MVT::v2i8:
- case MVT::v4i8:
- case MVT::v2i16:
- case MVT::v4i16:
- case MVT::v2i32:
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v2f32:
- case MVT::v4f32:
- case MVT::v2f64:
- return true;
- }
-}
-
-/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
-/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
-/// into their primitive components.
-/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
-/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
-/// LowerCall, and LowerReturn.
-static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
- Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
- SmallVectorImpl<uint64_t> *Offsets = nullptr,
- uint64_t StartingOffset = 0) {
- SmallVector<EVT, 16> TempVTs;
- SmallVector<uint64_t, 16> TempOffsets;
-
- ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
- for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
- EVT VT = TempVTs[i];
- uint64_t Off = TempOffsets[i];
- if (VT.isVector())
- for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
- ValueVTs.push_back(VT.getVectorElementType());
- if (Offsets)
- Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
- }
- else {
- ValueVTs.push_back(VT);
- if (Offsets)
- Offsets->push_back(Off);
- }
- }
-}
-
-// NVPTXTargetLowering Constructor.
-NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
- const NVPTXSubtarget &STI)
- : TargetLowering(TM), nvTM(&TM), STI(STI) {
- // always lower memset, memcpy, and memmove intrinsics to load/store
- // instructions, rather
- // then generating calls to memset, mempcy or memmove.
- MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
- MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
- MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
-
- setBooleanContents(ZeroOrNegativeOneBooleanContent);
- setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
-
- // Jump is Expensive. Don't create extra control flow for 'and', 'or'
- // condition branches.
- setJumpIsExpensive(true);
-
- // Wide divides are _very_ slow. Try to reduce the width of the divide if
- // possible.
- addBypassSlowDiv(64, 32);
-
- // By default, use the Source scheduling
- if (sched4reg)
- setSchedulingPreference(Sched::RegPressure);
- else
- setSchedulingPreference(Sched::Source);
-
- addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
- addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
- addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
- addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
- addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
- addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
-
- // Operations not directly supported by NVPTX.
- setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- setOperationAction(ISD::BR_CC, MVT::f64, Expand);
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::BR_CC, MVT::i8, Expand);
- setOperationAction(ISD::BR_CC, MVT::i16, Expand);
- setOperationAction(ISD::BR_CC, MVT::i32, Expand);
- setOperationAction(ISD::BR_CC, MVT::i64, Expand);
- // Some SIGN_EXTEND_INREG can be done using cvt instruction.
- // For others we will expand to a SHL/SRA pair.
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
-
- setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
- setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
- setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
- setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
-
- if (STI.hasROT64()) {
- setOperationAction(ISD::ROTL, MVT::i64, Legal);
- setOperationAction(ISD::ROTR, MVT::i64, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i64, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
- }
- if (STI.hasROT32()) {
- setOperationAction(ISD::ROTL, MVT::i32, Legal);
- setOperationAction(ISD::ROTR, MVT::i32, Legal);
- } else {
- setOperationAction(ISD::ROTL, MVT::i32, Expand);
- setOperationAction(ISD::ROTR, MVT::i32, Expand);
- }
-
- setOperationAction(ISD::ROTL, MVT::i16, Expand);
- setOperationAction(ISD::ROTR, MVT::i16, Expand);
- setOperationAction(ISD::ROTL, MVT::i8, Expand);
- setOperationAction(ISD::ROTR, MVT::i8, Expand);
- setOperationAction(ISD::BSWAP, MVT::i16, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- setOperationAction(ISD::BSWAP, MVT::i64, Expand);
-
- // Indirect branch is not supported.
- // This also disables Jump Table creation.
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
-
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-
- // We want to legalize constant related memmove and memcopy
- // intrinsics.
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
-
- // Turn FP extload into load/fpextend
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
- // Turn FP truncstore into trunc + store.
- // FIXME: vector types should also be expanded
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
- // PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::i1, Custom);
-
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setTruncStoreAction(VT, MVT::i1, Expand);
- }
-
- // This is legal in NVPTX
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
-
- // TRAP can be lowered to PTX trap
- setOperationAction(ISD::TRAP, MVT::Other, Legal);
-
- setOperationAction(ISD::ADDC, MVT::i64, Expand);
- setOperationAction(ISD::ADDE, MVT::i64, Expand);
-
- // Register custom handling for vector loads/stores
- for (MVT VT : MVT::vector_valuetypes()) {
- if (IsPTXVectorType(VT)) {
- setOperationAction(ISD::LOAD, VT, Custom);
- setOperationAction(ISD::STORE, VT, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
- }
- }
-
- // Custom handling for i8 intrinsics
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
- setOperationAction(ISD::CTLZ, MVT::i16, Legal);
- setOperationAction(ISD::CTLZ, MVT::i32, Legal);
- setOperationAction(ISD::CTLZ, MVT::i64, Legal);
- setOperationAction(ISD::CTTZ, MVT::i16, Expand);
- setOperationAction(ISD::CTTZ, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ, MVT::i64, Expand);
- setOperationAction(ISD::CTPOP, MVT::i16, Legal);
- setOperationAction(ISD::CTPOP, MVT::i32, Legal);
- setOperationAction(ISD::CTPOP, MVT::i64, Legal);
-
- // PTX does not directly support SELP of i1, so promote to i32 first
- setOperationAction(ISD::SELECT, MVT::i1, Custom);
-
- // PTX cannot multiply two i64s in a single instruction.
- setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-
- // We have some custom DAG combine patterns for these nodes
- setTargetDAGCombine(ISD::ADD);
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::SELECT);
- setTargetDAGCombine(ISD::SREM);
- setTargetDAGCombine(ISD::UREM);
-
- // Library functions. These default to Expand, but we have instructions
- // for them.
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FCEIL, MVT::f64, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f64, Legal);
- setOperationAction(ISD::FROUND, MVT::f32, Legal);
- setOperationAction(ISD::FROUND, MVT::f64, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-
- // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
- // No FPOW or FREM in PTX.
-
- // Now deduce the information based on the above mentioned
- // actions
- computeRegisterProperties(STI.getRegisterInfo());
-}
-
-const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
- switch ((NVPTXISD::NodeType)Opcode) {
- case NVPTXISD::FIRST_NUMBER:
- break;
- case NVPTXISD::CALL:
- return "NVPTXISD::CALL";
- case NVPTXISD::RET_FLAG:
- return "NVPTXISD::RET_FLAG";
- case NVPTXISD::LOAD_PARAM:
- return "NVPTXISD::LOAD_PARAM";
- case NVPTXISD::Wrapper:
- return "NVPTXISD::Wrapper";
- case NVPTXISD::DeclareParam:
- return "NVPTXISD::DeclareParam";
- case NVPTXISD::DeclareScalarParam:
- return "NVPTXISD::DeclareScalarParam";
- case NVPTXISD::DeclareRet:
- return "NVPTXISD::DeclareRet";
- case NVPTXISD::DeclareScalarRet:
- return "NVPTXISD::DeclareScalarRet";
- case NVPTXISD::DeclareRetParam:
- return "NVPTXISD::DeclareRetParam";
- case NVPTXISD::PrintCall:
- return "NVPTXISD::PrintCall";
- case NVPTXISD::PrintConvergentCall:
- return "NVPTXISD::PrintConvergentCall";
- case NVPTXISD::PrintCallUni:
- return "NVPTXISD::PrintCallUni";
- case NVPTXISD::PrintConvergentCallUni:
- return "NVPTXISD::PrintConvergentCallUni";
- case NVPTXISD::LoadParam:
- return "NVPTXISD::LoadParam";
- case NVPTXISD::LoadParamV2:
- return "NVPTXISD::LoadParamV2";
- case NVPTXISD::LoadParamV4:
- return "NVPTXISD::LoadParamV4";
- case NVPTXISD::StoreParam:
- return "NVPTXISD::StoreParam";
- case NVPTXISD::StoreParamV2:
- return "NVPTXISD::StoreParamV2";
- case NVPTXISD::StoreParamV4:
- return "NVPTXISD::StoreParamV4";
- case NVPTXISD::StoreParamS32:
- return "NVPTXISD::StoreParamS32";
- case NVPTXISD::StoreParamU32:
- return "NVPTXISD::StoreParamU32";
- case NVPTXISD::CallArgBegin:
- return "NVPTXISD::CallArgBegin";
- case NVPTXISD::CallArg:
- return "NVPTXISD::CallArg";
- case NVPTXISD::LastCallArg:
- return "NVPTXISD::LastCallArg";
- case NVPTXISD::CallArgEnd:
- return "NVPTXISD::CallArgEnd";
- case NVPTXISD::CallVoid:
- return "NVPTXISD::CallVoid";
- case NVPTXISD::CallVal:
- return "NVPTXISD::CallVal";
- case NVPTXISD::CallSymbol:
- return "NVPTXISD::CallSymbol";
- case NVPTXISD::Prototype:
- return "NVPTXISD::Prototype";
- case NVPTXISD::MoveParam:
- return "NVPTXISD::MoveParam";
- case NVPTXISD::StoreRetval:
- return "NVPTXISD::StoreRetval";
- case NVPTXISD::StoreRetvalV2:
- return "NVPTXISD::StoreRetvalV2";
- case NVPTXISD::StoreRetvalV4:
- return "NVPTXISD::StoreRetvalV4";
- case NVPTXISD::PseudoUseParam:
- return "NVPTXISD::PseudoUseParam";
- case NVPTXISD::RETURN:
- return "NVPTXISD::RETURN";
- case NVPTXISD::CallSeqBegin:
- return "NVPTXISD::CallSeqBegin";
- case NVPTXISD::CallSeqEnd:
- return "NVPTXISD::CallSeqEnd";
- case NVPTXISD::CallPrototype:
- return "NVPTXISD::CallPrototype";
- case NVPTXISD::LoadV2:
- return "NVPTXISD::LoadV2";
- case NVPTXISD::LoadV4:
- return "NVPTXISD::LoadV4";
- case NVPTXISD::LDGV2:
- return "NVPTXISD::LDGV2";
- case NVPTXISD::LDGV4:
- return "NVPTXISD::LDGV4";
- case NVPTXISD::LDUV2:
- return "NVPTXISD::LDUV2";
- case NVPTXISD::LDUV4:
- return "NVPTXISD::LDUV4";
- case NVPTXISD::StoreV2:
- return "NVPTXISD::StoreV2";
- case NVPTXISD::StoreV4:
- return "NVPTXISD::StoreV4";
- case NVPTXISD::FUN_SHFL_CLAMP:
- return "NVPTXISD::FUN_SHFL_CLAMP";
- case NVPTXISD::FUN_SHFR_CLAMP:
- return "NVPTXISD::FUN_SHFR_CLAMP";
- case NVPTXISD::IMAD:
- return "NVPTXISD::IMAD";
- case NVPTXISD::Dummy:
- return "NVPTXISD::Dummy";
- case NVPTXISD::MUL_WIDE_SIGNED:
- return "NVPTXISD::MUL_WIDE_SIGNED";
- case NVPTXISD::MUL_WIDE_UNSIGNED:
- return "NVPTXISD::MUL_WIDE_UNSIGNED";
- case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
- case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
- case NVPTXISD::Tex1DFloatFloatLevel:
- return "NVPTXISD::Tex1DFloatFloatLevel";
- case NVPTXISD::Tex1DFloatFloatGrad:
- return "NVPTXISD::Tex1DFloatFloatGrad";
- case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
- case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
- case NVPTXISD::Tex1DS32FloatLevel:
- return "NVPTXISD::Tex1DS32FloatLevel";
- case NVPTXISD::Tex1DS32FloatGrad:
- return "NVPTXISD::Tex1DS32FloatGrad";
- case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
- case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
- case NVPTXISD::Tex1DU32FloatLevel:
- return "NVPTXISD::Tex1DU32FloatLevel";
- case NVPTXISD::Tex1DU32FloatGrad:
- return "NVPTXISD::Tex1DU32FloatGrad";
- case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
- case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
- case NVPTXISD::Tex1DArrayFloatFloatLevel:
- return "NVPTXISD::Tex1DArrayFloatFloatLevel";
- case NVPTXISD::Tex1DArrayFloatFloatGrad:
- return "NVPTXISD::Tex1DArrayFloatFloatGrad";
- case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
- case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
- case NVPTXISD::Tex1DArrayS32FloatLevel:
- return "NVPTXISD::Tex1DArrayS32FloatLevel";
- case NVPTXISD::Tex1DArrayS32FloatGrad:
- return "NVPTXISD::Tex1DArrayS32FloatGrad";
- case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
- case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
- case NVPTXISD::Tex1DArrayU32FloatLevel:
- return "NVPTXISD::Tex1DArrayU32FloatLevel";
- case NVPTXISD::Tex1DArrayU32FloatGrad:
- return "NVPTXISD::Tex1DArrayU32FloatGrad";
- case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
- case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
- case NVPTXISD::Tex2DFloatFloatLevel:
- return "NVPTXISD::Tex2DFloatFloatLevel";
- case NVPTXISD::Tex2DFloatFloatGrad:
- return "NVPTXISD::Tex2DFloatFloatGrad";
- case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
- case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
- case NVPTXISD::Tex2DS32FloatLevel:
- return "NVPTXISD::Tex2DS32FloatLevel";
- case NVPTXISD::Tex2DS32FloatGrad:
- return "NVPTXISD::Tex2DS32FloatGrad";
- case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
- case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
- case NVPTXISD::Tex2DU32FloatLevel:
- return "NVPTXISD::Tex2DU32FloatLevel";
- case NVPTXISD::Tex2DU32FloatGrad:
- return "NVPTXISD::Tex2DU32FloatGrad";
- case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
- case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
- case NVPTXISD::Tex2DArrayFloatFloatLevel:
- return "NVPTXISD::Tex2DArrayFloatFloatLevel";
- case NVPTXISD::Tex2DArrayFloatFloatGrad:
- return "NVPTXISD::Tex2DArrayFloatFloatGrad";
- case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
- case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
- case NVPTXISD::Tex2DArrayS32FloatLevel:
- return "NVPTXISD::Tex2DArrayS32FloatLevel";
- case NVPTXISD::Tex2DArrayS32FloatGrad:
- return "NVPTXISD::Tex2DArrayS32FloatGrad";
- case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
- case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
- case NVPTXISD::Tex2DArrayU32FloatLevel:
- return "NVPTXISD::Tex2DArrayU32FloatLevel";
- case NVPTXISD::Tex2DArrayU32FloatGrad:
- return "NVPTXISD::Tex2DArrayU32FloatGrad";
- case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
- case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
- case NVPTXISD::Tex3DFloatFloatLevel:
- return "NVPTXISD::Tex3DFloatFloatLevel";
- case NVPTXISD::Tex3DFloatFloatGrad:
- return "NVPTXISD::Tex3DFloatFloatGrad";
- case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
- case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
- case NVPTXISD::Tex3DS32FloatLevel:
- return "NVPTXISD::Tex3DS32FloatLevel";
- case NVPTXISD::Tex3DS32FloatGrad:
- return "NVPTXISD::Tex3DS32FloatGrad";
- case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
- case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
- case NVPTXISD::Tex3DU32FloatLevel:
- return "NVPTXISD::Tex3DU32FloatLevel";
- case NVPTXISD::Tex3DU32FloatGrad:
- return "NVPTXISD::Tex3DU32FloatGrad";
- case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
- case NVPTXISD::TexCubeFloatFloatLevel:
- return "NVPTXISD::TexCubeFloatFloatLevel";
- case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
- case NVPTXISD::TexCubeS32FloatLevel:
- return "NVPTXISD::TexCubeS32FloatLevel";
- case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
- case NVPTXISD::TexCubeU32FloatLevel:
- return "NVPTXISD::TexCubeU32FloatLevel";
- case NVPTXISD::TexCubeArrayFloatFloat:
- return "NVPTXISD::TexCubeArrayFloatFloat";
- case NVPTXISD::TexCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexCubeArrayFloatFloatLevel";
- case NVPTXISD::TexCubeArrayS32Float:
- return "NVPTXISD::TexCubeArrayS32Float";
- case NVPTXISD::TexCubeArrayS32FloatLevel:
- return "NVPTXISD::TexCubeArrayS32FloatLevel";
- case NVPTXISD::TexCubeArrayU32Float:
- return "NVPTXISD::TexCubeArrayU32Float";
- case NVPTXISD::TexCubeArrayU32FloatLevel:
- return "NVPTXISD::TexCubeArrayU32FloatLevel";
- case NVPTXISD::Tld4R2DFloatFloat:
- return "NVPTXISD::Tld4R2DFloatFloat";
- case NVPTXISD::Tld4G2DFloatFloat:
- return "NVPTXISD::Tld4G2DFloatFloat";
- case NVPTXISD::Tld4B2DFloatFloat:
- return "NVPTXISD::Tld4B2DFloatFloat";
- case NVPTXISD::Tld4A2DFloatFloat:
- return "NVPTXISD::Tld4A2DFloatFloat";
- case NVPTXISD::Tld4R2DS64Float:
- return "NVPTXISD::Tld4R2DS64Float";
- case NVPTXISD::Tld4G2DS64Float:
- return "NVPTXISD::Tld4G2DS64Float";
- case NVPTXISD::Tld4B2DS64Float:
- return "NVPTXISD::Tld4B2DS64Float";
- case NVPTXISD::Tld4A2DS64Float:
- return "NVPTXISD::Tld4A2DS64Float";
- case NVPTXISD::Tld4R2DU64Float:
- return "NVPTXISD::Tld4R2DU64Float";
- case NVPTXISD::Tld4G2DU64Float:
- return "NVPTXISD::Tld4G2DU64Float";
- case NVPTXISD::Tld4B2DU64Float:
- return "NVPTXISD::Tld4B2DU64Float";
- case NVPTXISD::Tld4A2DU64Float:
- return "NVPTXISD::Tld4A2DU64Float";
-
- case NVPTXISD::TexUnified1DFloatS32:
- return "NVPTXISD::TexUnified1DFloatS32";
- case NVPTXISD::TexUnified1DFloatFloat:
- return "NVPTXISD::TexUnified1DFloatFloat";
- case NVPTXISD::TexUnified1DFloatFloatLevel:
- return "NVPTXISD::TexUnified1DFloatFloatLevel";
- case NVPTXISD::TexUnified1DFloatFloatGrad:
- return "NVPTXISD::TexUnified1DFloatFloatGrad";
- case NVPTXISD::TexUnified1DS32S32:
- return "NVPTXISD::TexUnified1DS32S32";
- case NVPTXISD::TexUnified1DS32Float:
- return "NVPTXISD::TexUnified1DS32Float";
- case NVPTXISD::TexUnified1DS32FloatLevel:
- return "NVPTXISD::TexUnified1DS32FloatLevel";
- case NVPTXISD::TexUnified1DS32FloatGrad:
- return "NVPTXISD::TexUnified1DS32FloatGrad";
- case NVPTXISD::TexUnified1DU32S32:
- return "NVPTXISD::TexUnified1DU32S32";
- case NVPTXISD::TexUnified1DU32Float:
- return "NVPTXISD::TexUnified1DU32Float";
- case NVPTXISD::TexUnified1DU32FloatLevel:
- return "NVPTXISD::TexUnified1DU32FloatLevel";
- case NVPTXISD::TexUnified1DU32FloatGrad:
- return "NVPTXISD::TexUnified1DU32FloatGrad";
- case NVPTXISD::TexUnified1DArrayFloatS32:
- return "NVPTXISD::TexUnified1DArrayFloatS32";
- case NVPTXISD::TexUnified1DArrayFloatFloat:
- return "NVPTXISD::TexUnified1DArrayFloatFloat";
- case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified1DArrayS32S32:
- return "NVPTXISD::TexUnified1DArrayS32S32";
- case NVPTXISD::TexUnified1DArrayS32Float:
- return "NVPTXISD::TexUnified1DArrayS32Float";
- case NVPTXISD::TexUnified1DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
- case NVPTXISD::TexUnified1DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
- case NVPTXISD::TexUnified1DArrayU32S32:
- return "NVPTXISD::TexUnified1DArrayU32S32";
- case NVPTXISD::TexUnified1DArrayU32Float:
- return "NVPTXISD::TexUnified1DArrayU32Float";
- case NVPTXISD::TexUnified1DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
- case NVPTXISD::TexUnified1DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
- case NVPTXISD::TexUnified2DFloatS32:
- return "NVPTXISD::TexUnified2DFloatS32";
- case NVPTXISD::TexUnified2DFloatFloat:
- return "NVPTXISD::TexUnified2DFloatFloat";
- case NVPTXISD::TexUnified2DFloatFloatLevel:
- return "NVPTXISD::TexUnified2DFloatFloatLevel";
- case NVPTXISD::TexUnified2DFloatFloatGrad:
- return "NVPTXISD::TexUnified2DFloatFloatGrad";
- case NVPTXISD::TexUnified2DS32S32:
- return "NVPTXISD::TexUnified2DS32S32";
- case NVPTXISD::TexUnified2DS32Float:
- return "NVPTXISD::TexUnified2DS32Float";
- case NVPTXISD::TexUnified2DS32FloatLevel:
- return "NVPTXISD::TexUnified2DS32FloatLevel";
- case NVPTXISD::TexUnified2DS32FloatGrad:
- return "NVPTXISD::TexUnified2DS32FloatGrad";
- case NVPTXISD::TexUnified2DU32S32:
- return "NVPTXISD::TexUnified2DU32S32";
- case NVPTXISD::TexUnified2DU32Float:
- return "NVPTXISD::TexUnified2DU32Float";
- case NVPTXISD::TexUnified2DU32FloatLevel:
- return "NVPTXISD::TexUnified2DU32FloatLevel";
- case NVPTXISD::TexUnified2DU32FloatGrad:
- return "NVPTXISD::TexUnified2DU32FloatGrad";
- case NVPTXISD::TexUnified2DArrayFloatS32:
- return "NVPTXISD::TexUnified2DArrayFloatS32";
- case NVPTXISD::TexUnified2DArrayFloatFloat:
- return "NVPTXISD::TexUnified2DArrayFloatFloat";
- case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
- return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
- case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
- return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
- case NVPTXISD::TexUnified2DArrayS32S32:
- return "NVPTXISD::TexUnified2DArrayS32S32";
- case NVPTXISD::TexUnified2DArrayS32Float:
- return "NVPTXISD::TexUnified2DArrayS32Float";
- case NVPTXISD::TexUnified2DArrayS32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
- case NVPTXISD::TexUnified2DArrayS32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
- case NVPTXISD::TexUnified2DArrayU32S32:
- return "NVPTXISD::TexUnified2DArrayU32S32";
- case NVPTXISD::TexUnified2DArrayU32Float:
- return "NVPTXISD::TexUnified2DArrayU32Float";
- case NVPTXISD::TexUnified2DArrayU32FloatLevel:
- return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
- case NVPTXISD::TexUnified2DArrayU32FloatGrad:
- return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
- case NVPTXISD::TexUnified3DFloatS32:
- return "NVPTXISD::TexUnified3DFloatS32";
- case NVPTXISD::TexUnified3DFloatFloat:
- return "NVPTXISD::TexUnified3DFloatFloat";
- case NVPTXISD::TexUnified3DFloatFloatLevel:
- return "NVPTXISD::TexUnified3DFloatFloatLevel";
- case NVPTXISD::TexUnified3DFloatFloatGrad:
- return "NVPTXISD::TexUnified3DFloatFloatGrad";
- case NVPTXISD::TexUnified3DS32S32:
- return "NVPTXISD::TexUnified3DS32S32";
- case NVPTXISD::TexUnified3DS32Float:
- return "NVPTXISD::TexUnified3DS32Float";
- case NVPTXISD::TexUnified3DS32FloatLevel:
- return "NVPTXISD::TexUnified3DS32FloatLevel";
- case NVPTXISD::TexUnified3DS32FloatGrad:
- return "NVPTXISD::TexUnified3DS32FloatGrad";
- case NVPTXISD::TexUnified3DU32S32:
- return "NVPTXISD::TexUnified3DU32S32";
- case NVPTXISD::TexUnified3DU32Float:
- return "NVPTXISD::TexUnified3DU32Float";
- case NVPTXISD::TexUnified3DU32FloatLevel:
- return "NVPTXISD::TexUnified3DU32FloatLevel";
- case NVPTXISD::TexUnified3DU32FloatGrad:
- return "NVPTXISD::TexUnified3DU32FloatGrad";
- case NVPTXISD::TexUnifiedCubeFloatFloat:
- return "NVPTXISD::TexUnifiedCubeFloatFloat";
- case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeS32Float:
- return "NVPTXISD::TexUnifiedCubeS32Float";
- case NVPTXISD::TexUnifiedCubeS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeU32Float:
- return "NVPTXISD::TexUnifiedCubeU32Float";
- case NVPTXISD::TexUnifiedCubeU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
- case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayS32Float:
- return "NVPTXISD::TexUnifiedCubeArrayS32Float";
- case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
- case NVPTXISD::TexUnifiedCubeArrayU32Float:
- return "NVPTXISD::TexUnifiedCubeArrayU32Float";
- case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
- return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
- case NVPTXISD::Tld4UnifiedR2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
- case NVPTXISD::Tld4UnifiedG2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
- case NVPTXISD::Tld4UnifiedB2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
- case NVPTXISD::Tld4UnifiedA2DFloatFloat:
- return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
- case NVPTXISD::Tld4UnifiedR2DS64Float:
- return "NVPTXISD::Tld4UnifiedR2DS64Float";
- case NVPTXISD::Tld4UnifiedG2DS64Float:
- return "NVPTXISD::Tld4UnifiedG2DS64Float";
- case NVPTXISD::Tld4UnifiedB2DS64Float:
- return "NVPTXISD::Tld4UnifiedB2DS64Float";
- case NVPTXISD::Tld4UnifiedA2DS64Float:
- return "NVPTXISD::Tld4UnifiedA2DS64Float";
- case NVPTXISD::Tld4UnifiedR2DU64Float:
- return "NVPTXISD::Tld4UnifiedR2DU64Float";
- case NVPTXISD::Tld4UnifiedG2DU64Float:
- return "NVPTXISD::Tld4UnifiedG2DU64Float";
- case NVPTXISD::Tld4UnifiedB2DU64Float:
- return "NVPTXISD::Tld4UnifiedB2DU64Float";
- case NVPTXISD::Tld4UnifiedA2DU64Float:
- return "NVPTXISD::Tld4UnifiedA2DU64Float";
-
- case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
- case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
- case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
- case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
- case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
- case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
- case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
- case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
- case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
- case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
- case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
-
- case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
- case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
- case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
- case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
- case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
- case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
- case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
- case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
- case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
- case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
- case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
-
- case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
- case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
- case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
- case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
- case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
- case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
- case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
- case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
- case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
- case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
- case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
-
- case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
- case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
- case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
- case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
- case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
- case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
- case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
- case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
- case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
- case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
- case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
-
- case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
- case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
- case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
- case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
- case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
- case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
- case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
- case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
- case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
- case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
- case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
-
- case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
- case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
- case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
- case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
- case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
- case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
- case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
- case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
- case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
- case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
- case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
-
- case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
- case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
- case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
- case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
- case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
- case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
- case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
- case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
- case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
- case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
- case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
-
- case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
- case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
- case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
- case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
- case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
- case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
- case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
- case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
- case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
- case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
- case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
-
- case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
- case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
- case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
- case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
- case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
- case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
- case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
- case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
- case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
- case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
- case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
-
- case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
- case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
- case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
- case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
- case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
- case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
- case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
- case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
- case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
- case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
- case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
-
- case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
- case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
- case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
- case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
- case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
- case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
- case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
- case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
- case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
- case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
- case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
-
- case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
- case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
- case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
- case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
- case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
- case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
- case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
- case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
- case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
- case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
- case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
-
- case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
- case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
- case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
- case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
- case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
- case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
- case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
- case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
- case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
- case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
- case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
-
- case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
- case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
- case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
- case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
- case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
- case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
- case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
- case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
- case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
- case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
- case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
-
- case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
- case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
- case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
- case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
- case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
- case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
- case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
- case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
- case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
- case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
- case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
- }
- return nullptr;
-}
-
-TargetLoweringBase::LegalizeTypeAction
-NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
- if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
- return TypeSplitVector;
-
- return TargetLoweringBase::getPreferredVectorAction(VT);
-}
-
-SDValue
-NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
- return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
-}
-
-std::string NVPTXTargetLowering::getPrototype(
- const DataLayout &DL, Type *retTy, const ArgListTy &Args,
- const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
- const ImmutableCallSite *CS) const {
- auto PtrVT = getPointerTy(DL);
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return "";
-
- std::stringstream O;
- O << "prototype_" << uniqueCallSite << " : .callprototype ";
-
- if (retTy->getTypeID() == Type::VoidTyID) {
- O << "()";
- } else {
- O << "(";
- if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
- unsigned size = 0;
- if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
- size = ITy->getBitWidth();
- if (size < 32)
- size = 32;
- } else {
- assert(retTy->isFloatingPointTy() &&
- "Floating point type expected here");
- size = retTy->getPrimitiveSizeInBits();
- }
-
- O << ".param .b" << size << " _";
- } else if (isa<PointerType>(retTy)) {
- O << ".param .b" << PtrVT.getSizeInBits() << " _";
- } else if ((retTy->getTypeID() == Type::StructTyID) ||
- isa<VectorType>(retTy)) {
- auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
- O << ".param .align " << retAlignment << " .b8 _["
- << DL.getTypeAllocSize(retTy) << "]";
- } else {
- llvm_unreachable("Unknown return type");
- }
- O << ") ";
- }
- O << "_ (";
-
- bool first = true;
-
- unsigned OIdx = 0;
- for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
- Type *Ty = Args[i].Ty;
- if (!first) {
- O << ", ";
- }
- first = false;
-
- if (!Outs[OIdx].Flags.isByVal()) {
- if (Ty->isAggregateType() || Ty->isVectorTy()) {
- unsigned align = 0;
- const CallInst *CallI = cast<CallInst>(CS->getInstruction());
- // +1 because index 0 is reserved for return type alignment
- if (!getAlign(*CallI, i + 1, align))
- align = DL.getABITypeAlignment(Ty);
- unsigned sz = DL.getTypeAllocSize(Ty);
- O << ".param .align " << align << " .b8 ";
- O << "_";
- O << "[" << sz << "]";
- // update the index for Outs
- SmallVector<EVT, 16> vtparts;
- ComputeValueVTs(*this, DL, Ty, vtparts);
- if (unsigned len = vtparts.size())
- OIdx += len - 1;
- continue;
- }
- // i8 types in IR will be i16 types in SDAG
- assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
- (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
- "type mismatch between callee prototype and arguments");
- // scalar type
- unsigned sz = 0;
- if (isa<IntegerType>(Ty)) {
- sz = cast<IntegerType>(Ty)->getBitWidth();
- if (sz < 32)
- sz = 32;
- } else if (isa<PointerType>(Ty))
- sz = PtrVT.getSizeInBits();
- else
- sz = Ty->getPrimitiveSizeInBits();
- O << ".param .b" << sz << " ";
- O << "_";
- continue;
- }
- auto *PTy = dyn_cast<PointerType>(Ty);
- assert(PTy && "Param with byval attribute should be a pointer type");
- Type *ETy = PTy->getElementType();
-
- unsigned align = Outs[OIdx].Flags.getByValAlign();
- unsigned sz = DL.getTypeAllocSize(ETy);
- O << ".param .align " << align << " .b8 ";
- O << "_";
- O << "[" << sz << "]";
- }
- O << ");";
- return O.str();
-}
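
For illustration, a minimal standalone C++ sketch of the kind of .callprototype string the routine above emits; the signature float(int, char) and call-site number 0 are made up, and sub-32-bit scalars widen to .b32 exactly as in the widening logic above:

    #include <iostream>
    #include <sstream>

    // Prototype string for a hypothetical "float f(int, char)" at call site 0:
    // the float return and both integer params occupy .b32 slots, because
    // integer sizes below 32 bits are rounded up above.
    int main() {
      std::stringstream O;
      O << "prototype_0 : .callprototype ";
      O << "(.param .b32 _) ";   // float return value
      O << "_ (";
      O << ".param .b32 _";      // int param
      O << ", .param .b32 _";    // char param, widened from 8 to 32 bits
      O << ");";
      std::cout << O.str() << '\n';
    }
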
-
-unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
- const ImmutableCallSite *CS,
- Type *Ty, unsigned Idx,
- const DataLayout &DL) const {
- if (!CS) {
- // CallSite is zero, fall back to the ABI type alignment.
- return DL.getABITypeAlignment(Ty);
- }
-
- unsigned Align = 0;
- const Value *DirectCallee = CS->getCalledFunction();
-
- if (!DirectCallee) {
- // We don't have a direct function symbol, but that may be because of
- // constant cast instructions in the call.
- const Instruction *CalleeI = CS->getInstruction();
- assert(CalleeI && "Call target is not a function or derived value?");
-
- // With bitcast'd call targets, the instruction will be the call
- if (isa<CallInst>(CalleeI)) {
- // Check if we have call alignment metadata
- if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
- return Align;
-
- const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
- // Ignore any bitcast instructions
- while (isa<ConstantExpr>(CalleeV)) {
- const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
- if (!CE->isCast())
- break;
- // Look through the bitcast
- CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
- }
-
- // We have now looked past all of the bitcasts. Do we finally have a
- // Function?
- if (isa<Function>(CalleeV))
- DirectCallee = CalleeV;
- }
- }
-
- // Check for function alignment information if we found that the
- // ultimate target is a Function
- if (DirectCallee)
- if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
- return Align;
-
- // Call is indirect or alignment information is not available, fall back to
- // the ABI type alignment
- return DL.getABITypeAlignment(Ty);
-}
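
A minimal sketch of the same three-step fallback in plain C++, with std::optional standing in for the getAlign() queries (the function and parameter names are illustrative, not the LLVM API):

    #include <optional>

    // Alignment lookup order used above: explicit alignment metadata on the
    // call site, then alignment recorded on the resolved direct callee, then
    // the ABI type alignment as the final fallback.
    unsigned argumentAlignment(std::optional<unsigned> CallSiteAlign,
                               std::optional<unsigned> CalleeAlign,
                               unsigned ABITypeAlign) {
      if (CallSiteAlign)
        return *CallSiteAlign;
      if (CalleeAlign)
        return *CalleeAlign;
      return ABITypeAlign;
    }
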
-
-SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const {
- SelectionDAG &DAG = CLI.DAG;
- SDLoc dl = CLI.DL;
- SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
- SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
- SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- SDValue Chain = CLI.Chain;
- SDValue Callee = CLI.Callee;
- bool &isTailCall = CLI.IsTailCall;
- ArgListTy &Args = CLI.getArgs();
- Type *retTy = CLI.RetTy;
- ImmutableCallSite *CS = CLI.CS;
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
- MachineFunction &MF = DAG.getMachineFunction();
- const Function *F = MF.getFunction();
- auto &DL = MF.getDataLayout();
-
- SDValue tempChain = Chain;
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(uniqueCallSite, dl, true),
- dl);
- SDValue InFlag = Chain.getValue(1);
-
- unsigned paramCount = 0;
- // Args.size() and Outs.size() need not match.
- // Outs.size() will be larger
- // * if there is an aggregate argument with multiple fields (each field
- // showing up separately in Outs)
- // * if there is a vector argument with more elements than the typical
- // vector length (generally more than 4), where each vector element is
- // individually present in Outs.
- // So a different index should be used for indexing into Outs/OutVals.
- // See similar issue in LowerFormalArguments.
- unsigned OIdx = 0;
- // Declare the .param or .reg variables needed to pass values
- // to the function.
- for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
- EVT VT = Outs[OIdx].VT;
- Type *Ty = Args[i].Ty;
-
- if (!Outs[OIdx].Flags.isByVal()) {
- if (Ty->isAggregateType()) {
- // aggregate
- SmallVector<EVT, 16> vtparts;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
- 0);
-
- unsigned align =
- getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
- // declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = DL.getTypeAllocSize(Ty);
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
- MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- EVT elemtype = vtparts[j];
- unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
- if (elemtype.isInteger() && (sz < 8))
- sz = 8;
- SDValue StVal = OutVals[OIdx];
- if (elemtype.getSizeInBits() < 16) {
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(Offsets[j], dl, MVT::i32),
- StVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
- CopyParamVTs, CopyParamOps,
- elemtype, MachinePointerInfo(),
- ArgAlign);
- InFlag = Chain.getValue(1);
- ++OIdx;
- }
- if (vtparts.size() > 0)
- --OIdx;
- ++paramCount;
- continue;
- }
- if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, Ty);
- unsigned align =
- getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
- // declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = DL.getTypeAllocSize(Ty);
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareParamOps[] = { Chain,
- DAG.getConstant(align, dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- unsigned NumElts = ObjectVT.getVectorNumElements();
- EVT EltVT = ObjectVT.getVectorElementType();
- EVT MemVT = EltVT;
- bool NeedExtend = false;
- if (EltVT.getSizeInBits() < 16) {
- NeedExtend = true;
- EltVT = MVT::i16;
- }
-
- // V1 store
- if (NumElts == 1) {
- SDValue Elt = OutVals[OIdx++];
- if (NeedExtend)
- Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
-
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), Elt,
- InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
- CopyParamVTs, CopyParamOps,
- MemVT, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- } else if (NumElts == 2) {
- SDValue Elt0 = OutVals[OIdx++];
- SDValue Elt1 = OutVals[OIdx++];
- if (NeedExtend) {
- Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
- Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
- }
-
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), Elt0,
- Elt1, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
- CopyParamVTs, CopyParamOps,
- MemVT, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- } else {
- unsigned curOffset = 0;
- // V4 stores
- // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
- // vector will be expanded to a power of 2 elements, so we know we can
- // always round up to the next multiple of 4 when creating the vector
- // stores.
- // e.g. 4 elem => 1 st.v4
- // 6 elem => 2 st.v4
- // 8 elem => 2 st.v4
- // 11 elem => 3 st.v4
- unsigned VecSize = 4;
- if (EltVT.getSizeInBits() == 64)
- VecSize = 2;
-
- // This is potentially only part of a vector, so assume all elements
- // are packed together.
- unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
-
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- // Get values
- SDValue StoreVal;
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(Chain);
- Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
- Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
-
- unsigned Opc = NVPTXISD::StoreParamV2;
-
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- Ops.push_back(StoreVal);
-
- if (i + 1 < NumElts) {
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(EltVT);
- }
- Ops.push_back(StoreVal);
-
- if (VecSize == 4) {
- Opc = NVPTXISD::StoreParamV4;
- if (i + 2 < NumElts) {
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(EltVT);
- }
- Ops.push_back(StoreVal);
-
- if (i + 3 < NumElts) {
- StoreVal = OutVals[OIdx++];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(EltVT);
- }
- Ops.push_back(StoreVal);
- }
-
- Ops.push_back(InFlag);
-
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
- MemVT, MachinePointerInfo());
- InFlag = Chain.getValue(1);
- curOffset += PerStoreOffset;
- }
- }
- ++paramCount;
- --OIdx;
- continue;
- }
- // Plain scalar
- // for ABI, declare .param .b<size> .param<n>;
- unsigned sz = VT.getSizeInBits();
- bool needExtend = false;
- if (VT.isInteger()) {
- if (sz < 16)
- needExtend = true;
- if (sz < 32)
- sz = 32;
- }
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- SDValue OutV = OutVals[OIdx];
- if (needExtend) {
- // zext/sext i1 to i16
- unsigned opc = ISD::ZERO_EXTEND;
- if (Outs[OIdx].Flags.isSExt())
- opc = ISD::SIGN_EXTEND;
- OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), OutV,
- InFlag };
-
- unsigned opcode = NVPTXISD::StoreParam;
- if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
- opcode = NVPTXISD::StoreParamU32;
- else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
- opcode = NVPTXISD::StoreParamS32;
- Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
- VT, MachinePointerInfo());
-
- InFlag = Chain.getValue(1);
- ++paramCount;
- continue;
- }
- // struct or vector
- SmallVector<EVT, 16> vtparts;
- SmallVector<uint64_t, 16> Offsets;
- auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
- assert(PTy && "Type of a byval parameter should be pointer");
- ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
- vtparts, &Offsets, 0);
-
- // declare .param .align <align> .b8 .param<n>[<size>];
- unsigned sz = Outs[OIdx].Flags.getByValSize();
- SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
- // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
- // so we don't need to worry about whether the type is naturally aligned.
- // See TargetLowering::LowerCallTo().
-
- // Enforce a minimum alignment of 4 to work around a ptxas miscompile
- // for sm_50+. See corresponding alignment adjustment in
- // emitFunctionParamList() for details.
- if (ArgAlign < 4)
- ArgAlign = 4;
- SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32), InFlag};
- Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
- DeclareParamOps);
- InFlag = Chain.getValue(1);
- for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
- EVT elemtype = vtparts[j];
- int curOffset = Offsets[j];
- unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
- DAG.getConstant(curOffset, dl, PtrVT));
- SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
- MachinePointerInfo(), PartAlign);
- if (elemtype.getSizeInBits() < 16) {
- theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
- }
- SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CopyParamOps[] = { Chain,
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(curOffset, dl, MVT::i32),
- theVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
- CopyParamOps, elemtype,
- MachinePointerInfo());
-
- InFlag = Chain.getValue(1);
- }
- ++paramCount;
- }
-
- GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
- unsigned retAlignment = 0;
-
- // Handle Result
- if (Ins.size() > 0) {
- SmallVector<EVT, 16> resvtparts;
- ComputeValueVTs(*this, DL, retTy, resvtparts);
-
- // Declare
- // .param .align 16 .b8 retval0[<size-in-bytes>], or
- // .param .b<size-in-bits> retval0
- unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
- // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
- // these three types to match the logic in
- // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
- // Plus, this behavior is consistent with nvcc's.
- if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
- retTy->isPointerTy()) {
- // Scalar needs to be at least 32bit wide
- if (resultsz < 32)
- resultsz = 32;
- SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(resultsz, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
- DeclareRetOps);
- InFlag = Chain.getValue(1);
- } else {
- retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL);
- SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareRetOps[] = { Chain,
- DAG.getConstant(retAlignment, dl, MVT::i32),
- DAG.getConstant(resultsz / 8, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
- DeclareRetOps);
- InFlag = Chain.getValue(1);
- }
- }
-
- if (!Func) {
- // This is the indirect function call case: PTX requires a prototype of the
- // form
- // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
- // to be emitted, and the label has to be used as the last arg of the call
- // instruction.
- // The prototype is embedded in a string and put as the operand for a
- // CallPrototype SDNode which will print out to the value of the string.
- SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string Proto =
- getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
- const char *ProtoStr =
- nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
- SDValue ProtoOps[] = {
- Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
- };
- Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
- InFlag = Chain.getValue(1);
- }
- // Op to just print "call"
- SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue PrintCallOps[] = {
- Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
- };
- // We model convergent calls as separate opcodes.
- unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
- if (CLI.IsConvergent)
- Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
- : NVPTXISD::PrintConvergentCall;
- Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
- InFlag = Chain.getValue(1);
-
- // Ops to print out the function name
- SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallVoidOps[] = { Chain, Callee, InFlag };
- Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
- InFlag = Chain.getValue(1);
-
- // Ops to print out the param list
- SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgBeginOps[] = { Chain, InFlag };
- Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
- CallArgBeginOps);
- InFlag = Chain.getValue(1);
-
- for (unsigned i = 0, e = paramCount; i != e; ++i) {
- unsigned opcode;
- if (i == (e - 1))
- opcode = NVPTXISD::LastCallArg;
- else
- opcode = NVPTXISD::CallArg;
- SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(i, dl, MVT::i32), InFlag };
- Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
- InFlag = Chain.getValue(1);
- }
- SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue CallArgEndOps[] = { Chain,
- DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
- InFlag = Chain.getValue(1);
-
- if (!Func) {
- SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue PrototypeOps[] = { Chain,
- DAG.getConstant(uniqueCallSite, dl, MVT::i32),
- InFlag };
- Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
- InFlag = Chain.getValue(1);
- }
-
- // Generate loads from param memory/moves from registers for result
- if (Ins.size() > 0) {
- if (retTy && retTy->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, retTy);
- unsigned NumElts = ObjectVT.getVectorNumElements();
- EVT EltVT = ObjectVT.getVectorElementType();
- assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
- ObjectVT) == NumElts &&
- "Vector was not scalarized");
- unsigned sz = EltVT.getSizeInBits();
- bool needTruncate = sz < 8;
-
- if (NumElts == 1) {
- // Just a simple load
- SmallVector<EVT, 4> LoadRetVTs;
- if (EltVT == MVT::i1 || EltVT == MVT::i8) {
- // If loading i1/i8 result, generate
- // load.b8 i16
- // if i1
- // trunc i16 to i1
- LoadRetVTs.push_back(MVT::i16);
- } else
- LoadRetVTs.push_back(EltVT);
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- NVPTXISD::LoadParam, dl,
- DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
- Chain = retval.getValue(1);
- InFlag = retval.getValue(2);
- SDValue Ret0 = retval;
- if (needTruncate)
- Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
- InVals.push_back(Ret0);
- } else if (NumElts == 2) {
- // LoadV2
- SmallVector<EVT, 4> LoadRetVTs;
- if (EltVT == MVT::i1 || EltVT == MVT::i8) {
- // If loading i1/i8 result, generate
- // load.b8 i16
- // if i1
- // trunc i16 to i1
- LoadRetVTs.push_back(MVT::i16);
- LoadRetVTs.push_back(MVT::i16);
- } else {
- LoadRetVTs.push_back(EltVT);
- LoadRetVTs.push_back(EltVT);
- }
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- NVPTXISD::LoadParamV2, dl,
- DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
- Chain = retval.getValue(2);
- InFlag = retval.getValue(3);
- SDValue Ret0 = retval.getValue(0);
- SDValue Ret1 = retval.getValue(1);
- if (needTruncate) {
- Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
- InVals.push_back(Ret0);
- Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
- InVals.push_back(Ret1);
- } else {
- InVals.push_back(Ret0);
- InVals.push_back(Ret1);
- }
- } else {
- // Split into N LoadV4
- unsigned Ofst = 0;
- unsigned VecSize = 4;
- unsigned Opc = NVPTXISD::LoadParamV4;
- if (EltVT.getSizeInBits() == 64) {
- VecSize = 2;
- Opc = NVPTXISD::LoadParamV2;
- }
- EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- SmallVector<EVT, 8> LoadRetVTs;
- if (EltVT == MVT::i1 || EltVT == MVT::i8) {
- // If loading i1/i8 result, generate
- // load.b8 i16
- // if i1
- // trunc i16 to i1
- for (unsigned j = 0; j < VecSize; ++j)
- LoadRetVTs.push_back(MVT::i16);
- } else {
- for (unsigned j = 0; j < VecSize; ++j)
- LoadRetVTs.push_back(EltVT);
- }
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- Opc, dl, DAG.getVTList(LoadRetVTs),
- LoadRetOps, EltVT, MachinePointerInfo());
- if (VecSize == 2) {
- Chain = retval.getValue(2);
- InFlag = retval.getValue(3);
- } else {
- Chain = retval.getValue(4);
- InFlag = retval.getValue(5);
- }
-
- for (unsigned j = 0; j < VecSize; ++j) {
- if (i + j >= NumElts)
- break;
- SDValue Elt = retval.getValue(j);
- if (needTruncate)
- Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
- InVals.push_back(Elt);
- }
- Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
- }
- }
- } else {
- SmallVector<EVT, 16> VTs;
- SmallVector<uint64_t, 16> Offsets;
- auto &DL = DAG.getDataLayout();
- ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0);
- assert(VTs.size() == Ins.size() && "Bad value decomposition");
- unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL);
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
- unsigned sz = VTs[i].getSizeInBits();
- unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
- bool needTruncate = false;
- if (VTs[i].isInteger() && sz < 8) {
- sz = 8;
- needTruncate = true;
- }
-
- SmallVector<EVT, 4> LoadRetVTs;
- EVT TheLoadType = VTs[i];
- if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
- // This is for integer types only, and specifically not for
- // aggregates.
- LoadRetVTs.push_back(MVT::i32);
- TheLoadType = MVT::i32;
- needTruncate = true;
- } else if (sz < 16) {
- // If loading i1/i8 result, generate
- // load i8 (-> i16)
- // trunc i16 to i1/i8
-
- // FIXME: Do we need to set needTruncate to true here, too? We could
- // not figure out what this branch is for in D17872, so we left it
- // alone. The comment above about loading i1/i8 may be wrong, as the
- // branch above seems to cover integers of size < 32.
- LoadRetVTs.push_back(MVT::i16);
- } else
- LoadRetVTs.push_back(Ins[i].VT);
- LoadRetVTs.push_back(MVT::Other);
- LoadRetVTs.push_back(MVT::Glue);
-
- SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
- DAG.getConstant(Offsets[i], dl, MVT::i32),
- InFlag};
- SDValue retval = DAG.getMemIntrinsicNode(
- NVPTXISD::LoadParam, dl,
- DAG.getVTList(LoadRetVTs), LoadRetOps,
- TheLoadType, MachinePointerInfo(), AlignI);
- Chain = retval.getValue(1);
- InFlag = retval.getValue(2);
- SDValue Ret0 = retval.getValue(0);
- if (needTruncate)
- Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
- InVals.push_back(Ret0);
- }
- }
- }
-
- Chain = DAG.getCALLSEQ_END(Chain,
- DAG.getIntPtrConstant(uniqueCallSite, dl, true),
- DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
- true),
- InFlag, dl);
- uniqueCallSite++;
-
- // set isTailCall to false for now, until we figure out how to express
- // tail call optimization in PTX
- isTailCall = false;
- return Chain;
-}
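
Two bits of arithmetic from the body above, restated as a self-checking C++ sketch: the st.v4 count is plain ceiling division, and the per-element store alignment GreatestCommonDivisor64(align, offset) is the usual gcd bound (std::gcd is used here in its place):

    #include <cassert>
    #include <numeric>

    // Number of st.v<VecSize> ops for NumElts elements: ceiling division,
    // matching the "4 elem => 1 st.v4, ..., 11 elem => 3 st.v4" table above.
    unsigned numVectorStores(unsigned NumElts, unsigned VecSize) {
      return (NumElts + VecSize - 1) / VecSize;
    }

    // Provable alignment of a base aligned to Align plus Offset bytes.
    unsigned long long partAlign(unsigned long long Align,
                                 unsigned long long Offset) {
      return std::gcd(Align, Offset);  // gcd(A, 0) == A, so offset 0 keeps A
    }

    int main() {
      assert(numVectorStores(6, 4) == 2 && numVectorStores(11, 4) == 3);
      assert(partAlign(8, 0) == 8 && partAlign(8, 4) == 4 && partAlign(8, 6) == 2);
    }
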
-
-// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
-// (see LegalizeDAG.cpp). This is slow and uses local memory.
- // We use extract/insert/build vector instead, just as LegalizeOp() did in llvm 2.5.
-SDValue
-NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- SmallVector<SDValue, 8> Ops;
- unsigned NumOperands = Node->getNumOperands();
- for (unsigned i = 0; i < NumOperands; ++i) {
- SDValue SubOp = Node->getOperand(i);
- EVT VVT = SubOp.getNode()->getValueType(0);
- EVT EltVT = VVT.getVectorElementType();
- unsigned NumSubElem = VVT.getVectorNumElements();
- for (unsigned j = 0; j < NumSubElem; ++j) {
- Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
- DAG.getIntPtrConstant(j, dl)));
- }
- }
- return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
-}
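
The same extract-then-rebuild flow on ordinary containers, as a sketch of the data movement only (std::vector<float> stands in for the SDNode operands):

    #include <vector>

    // Concatenate by pulling every element of every operand out in order
    // and rebuilding one flat vector, mirroring the EXTRACT_VECTOR_ELT
    // loop plus the single getBuildVector() call above.
    std::vector<float>
    concatVectors(const std::vector<std::vector<float>> &Operands) {
      std::vector<float> Ops;
      for (const auto &SubOp : Operands)
        for (float Elem : SubOp)
          Ops.push_back(Elem);
      return Ops;
    }
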
-
-/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
- /// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
- /// amount, or
- /// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
- /// amount.
-SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
-
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
- unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
-
- if (VTBits == 32 && STI.getSmVersion() >= 35) {
- // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
- // {dHi, dLo} = {aHi, aLo} >> Amt
- // dHi = aHi >> Amt
- // dLo = shf.r.clamp aLo, aHi, Amt
-
- SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
- ShAmt);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- else {
- // {dHi, dLo} = {aHi, aLo} >> Amt
- // - if (Amt>=size) then
- // dLo = aHi >> (Amt-size)
- // dHi = aHi >> Amt (this is either all 0 or all 1)
- // else
- // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
- // dHi = aHi >> Amt
-
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32));
- SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
-
- SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ISD::SETGE);
- SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
- SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
-}
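
A standalone C++ rendering of the non-funnel-shift expansion above for the logical case, checked against a native 64-bit shift; Amt == 0 is special-cased only because a 32-bit shift by 32 is undefined in C++:

    #include <cassert>
    #include <cstdint>

    // 64-bit logical right shift built from two 32-bit halves.
    void shiftRightParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                         uint32_t &OutLo, uint32_t &OutHi) {
      if (Amt == 0) {              // avoid the undefined "Hi << 32" below
        OutLo = Lo; OutHi = Hi;
      } else if (Amt >= 32) {      // dLo = aHi >> (Amt-size); dHi all zero
        OutLo = Hi >> (Amt - 32);
        OutHi = 0;
      } else {                     // dLo = (aLo >> Amt) | (aHi << (size-Amt))
        OutLo = (Lo >> Amt) | (Hi << (32 - Amt));
        OutHi = Hi >> Amt;
      }
    }

    int main() {
      uint64_t V = 0x123456789ABCDEF0ULL;
      for (unsigned Amt = 0; Amt < 64; ++Amt) {
        uint32_t Lo, Hi;
        shiftRightParts((uint32_t)V, (uint32_t)(V >> 32), Amt, Lo, Hi);
        assert((((uint64_t)Hi << 32) | Lo) == (V >> Amt));
      }
    }
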
-
-/// LowerShiftLeftParts - Lower SHL_PARTS, which
- /// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
- /// amount, or
- /// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
- /// amount.
-SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Op.getNumOperands() == 3 && "Not a double-shift!");
- assert(Op.getOpcode() == ISD::SHL_PARTS);
-
- EVT VT = Op.getValueType();
- unsigned VTBits = VT.getSizeInBits();
- SDLoc dl(Op);
- SDValue ShOpLo = Op.getOperand(0);
- SDValue ShOpHi = Op.getOperand(1);
- SDValue ShAmt = Op.getOperand(2);
-
- if (VTBits == 32 && STI.getSmVersion() >= 35) {
- // For 32-bit shifts on sm_35 and later, we can use the funnel shift 'shf' instruction.
- // {dHi, dLo} = {aHi, aLo} << Amt
- // dHi = shf.l.clamp aLo, aHi, Amt
- // dLo = aLo << Amt
-
- SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
- ShAmt);
- SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
- else {
- // {dHi, dLo} = {aHi, aLo} << Amt
- // - if (Amt>=size) then
- // dLo = aLo << Amt (all 0)
- // dHi = aLo << (Amt-size)
- // else
- // dLo = aLo << Amt
- // dHi = (aHi << Amt) | (aLo >> (size-Amt))
-
- SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ShAmt);
- SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
- SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32));
- SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
- SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
-
- SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
- DAG.getConstant(VTBits, dl, MVT::i32),
- ISD::SETGE);
- SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
- SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
-
- SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, dl);
- }
-}
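
For the sm_35 path, a sketch emulating the clamped funnel shift used above: concatenate {Hi, Lo}, shift left by the amount clamped to at most 32, and keep the upper word (this models shf.l.clamp only as far as the lowering needs it):

    #include <cassert>
    #include <cstdint>

    // dHi = shf.l.clamp(aLo, aHi, Amt), emulated with 64-bit arithmetic.
    uint32_t shf_l_clamp(uint32_t Lo, uint32_t Hi, unsigned Amt) {
      unsigned C = Amt < 32 ? Amt : 32;          // clamp the shift amount
      uint64_t Cat = ((uint64_t)Hi << 32) | Lo;  // {aHi, aLo}
      return (uint32_t)((Cat << C) >> 32);       // upper 32 bits
    }

    int main() {
      uint64_t V = 0xDEADBEEFCAFEF00DULL;
      for (unsigned Amt = 0; Amt < 32; ++Amt)
        assert(shf_l_clamp((uint32_t)V, (uint32_t)(V >> 32), Amt) ==
               (uint32_t)((V << Amt) >> 32));
    }
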
-
-SDValue
-NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- switch (Op.getOpcode()) {
- case ISD::RETURNADDR:
- return SDValue();
- case ISD::FRAMEADDR:
- return SDValue();
- case ISD::GlobalAddress:
- return LowerGlobalAddress(Op, DAG);
- case ISD::INTRINSIC_W_CHAIN:
- return Op;
- case ISD::BUILD_VECTOR:
- case ISD::EXTRACT_SUBVECTOR:
- return Op;
- case ISD::CONCAT_VECTORS:
- return LowerCONCAT_VECTORS(Op, DAG);
- case ISD::STORE:
- return LowerSTORE(Op, DAG);
- case ISD::LOAD:
- return LowerLOAD(Op, DAG);
- case ISD::SHL_PARTS:
- return LowerShiftLeftParts(Op, DAG);
- case ISD::SRA_PARTS:
- case ISD::SRL_PARTS:
- return LowerShiftRightParts(Op, DAG);
- case ISD::SELECT:
- return LowerSelect(Op, DAG);
- default:
- llvm_unreachable("Custom lowering not defined for operation");
- }
-}
-
-SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
- SDValue Op0 = Op->getOperand(0);
- SDValue Op1 = Op->getOperand(1);
- SDValue Op2 = Op->getOperand(2);
- SDLoc DL(Op.getNode());
-
- assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
-
- Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
- Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
- SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
- SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
-
- return Trunc;
-}
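
The widen/select/truncate shape above, reduced to plain integers (a sketch of the pattern, not of the DAG nodes):

    #include <cassert>
    #include <cstdint>

    // i1 select lowered as any_extend -> 32-bit select -> truncate.
    bool selectI1(bool Cond, bool A, bool B) {
      uint32_t Op1 = A, Op2 = B;        // any_extend i1 -> i32
      uint32_t Sel = Cond ? Op1 : Op2;  // the legal 32-bit select
      return (Sel & 1) != 0;            // truncate i32 -> i1
    }

    int main() {
      assert(selectI1(true, true, false) && !selectI1(false, true, false));
    }
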
-
-SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::i1)
- return LowerLOADi1(Op, DAG);
- else
- return SDValue();
-}
-
-// v = ld i1* addr
-// =>
-// v1 = ld i8* addr (-> i16)
-// v = trunc i16 to i1
-SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- LoadSDNode *LD = cast<LoadSDNode>(Node);
- SDLoc dl(Node);
- assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
- assert(Node->getValueType(0) == MVT::i1 &&
- "Custom lowering for i1 load only");
- SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
- LD->getPointerInfo(), LD->getAlignment(),
- LD->getMemOperand()->getFlags());
- SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
- // The legalizer (the caller) is expecting two values from the legalized
- // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
- // in LegalizeDAG.cpp which also uses MergeValues.
- SDValue Ops[] = { result, LD->getChain() };
- return DAG.getMergeValues(Ops, dl);
-}
-
-SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- EVT ValVT = Op.getOperand(1).getValueType();
- if (ValVT == MVT::i1)
- return LowerSTOREi1(Op, DAG);
- else if (ValVT.isVector())
- return LowerSTOREVector(Op, DAG);
- else
- return SDValue();
-}
-
-SDValue
-NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
- SDNode *N = Op.getNode();
- SDValue Val = N->getOperand(1);
- SDLoc DL(N);
- EVT ValVT = Val.getValueType();
-
- if (ValVT.isVector()) {
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 stores of <2 x double> here
- // but I'm leaving that as a TODO for now.
- if (!ValVT.isSimple())
- return SDValue();
- switch (ValVT.getSimpleVT().SimpleTy) {
- default:
- return SDValue();
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f32:
- // This is a "native" vector type
- break;
- }
-
- MemSDNode *MemSD = cast<MemSDNode>(N);
- const DataLayout &TD = DAG.getDataLayout();
-
- unsigned Align = MemSD->getAlignment();
- unsigned PrefAlign =
- TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
- if (Align < PrefAlign) {
- // This store is not sufficiently aligned, so bail out and let this vector
- // store be scalarized. Note that we may still be able to emit smaller
- // vector stores. For example, if we are storing a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return SDValue();
- }
-
- unsigned Opcode = 0;
- EVT EltVT = ValVT.getVectorElementType();
- unsigned NumElts = ValVT.getVectorNumElements();
-
- // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // stored type to i16 and propagate the "real" type as the memory type.
- bool NeedExt = false;
- if (EltVT.getSizeInBits() < 16)
- NeedExt = true;
-
- switch (NumElts) {
- default:
- return SDValue();
- case 2:
- Opcode = NVPTXISD::StoreV2;
- break;
- case 4:
- Opcode = NVPTXISD::StoreV4;
- break;
- }
-
- SmallVector<SDValue, 8> Ops;
-
- // First is the chain
- Ops.push_back(N->getOperand(0));
-
- // Then the split values
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
- DAG.getIntPtrConstant(i, DL));
- if (NeedExt)
- ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
- Ops.push_back(ExtVal);
- }
-
- // Then any remaining arguments
- Ops.append(N->op_begin() + 2, N->op_end());
-
- SDValue NewSt = DAG.getMemIntrinsicNode(
- Opcode, DL, DAG.getVTList(MVT::Other), Ops,
- MemSD->getMemoryVT(), MemSD->getMemOperand());
-
- //return DCI.CombineTo(N, NewSt, true);
- return NewSt;
- }
-
- return SDValue();
-}
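
The bail-out conditions above, condensed into one plain C++ predicate (the names and the preferred-alignment parameter are illustrative):

    // A vector store stays vectorized only if the vector has a "native"
    // element count (2 -> StoreV2, 4 -> StoreV4) and the store is at least
    // as aligned as the type's preferred alignment; otherwise we return
    // false so the legalizer can scalarize or re-split it.
    bool canUseVectorStore(unsigned NumElts, unsigned Align, unsigned PrefAlign) {
      if (Align < PrefAlign)
        return false;
      return NumElts == 2 || NumElts == 4;
    }
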
-
-// st i1 v, addr
-// =>
-// v1 = zxt v to i16
-// st.u8 i16, addr
-SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
- SDNode *Node = Op.getNode();
- SDLoc dl(Node);
- StoreSDNode *ST = cast<StoreSDNode>(Node);
- SDValue Tmp1 = ST->getChain();
- SDValue Tmp2 = ST->getBasePtr();
- SDValue Tmp3 = ST->getValue();
- assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
- Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
- SDValue Result =
- DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
- ST->getAlignment(), ST->getMemOperand()->getFlags());
- return Result;
-}
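
Together with LowerLOADi1 above, the i1 path is a byte round-trip; sketched here with std::memcpy standing in for the st.u8/ld.u8 pair:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // st i1: zero-extend to i16, then store only the low byte (st.u8).
    void storeI1(uint8_t *Addr, bool V) {
      uint16_t Ext = V;                  // zext i1 -> i16
      uint8_t Byte = (uint8_t)Ext;       // truncating 8-bit store
      std::memcpy(Addr, &Byte, 1);
    }

    // ld i1: load a byte widened to i16, then truncate back to i1.
    bool loadI1(const uint8_t *Addr) {
      uint8_t Byte;
      std::memcpy(&Byte, Addr, 1);
      uint16_t Wide = Byte;              // ld.u8 -> i16
      return (Wide & 1) != 0;            // trunc i16 -> i1
    }

    int main() {
      uint8_t Mem = 0;
      storeI1(&Mem, true);
      assert(loadI1(&Mem));
    }
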
-
-SDValue
-NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
- std::string ParamSym;
- raw_string_ostream ParamStr(ParamSym);
-
- ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
- ParamStr.flush();
-
- std::string *SavedStr =
- nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
- return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
-}
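
The naming scheme is simply <function>_param_<index>; a trivial sketch (the function name "kernel" is made up):

    #include <sstream>
    #include <string>

    // Build the parameter symbol name: paramSymbol("kernel", 0)
    // yields "kernel_param_0", as the raw_string_ostream code above does.
    std::string paramSymbol(const std::string &FuncName, int Idx) {
      std::ostringstream ParamStr;
      ParamStr << FuncName << "_param_" << Idx;
      return ParamStr.str();
    }
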
-
-// Check to see if the kernel argument is image*_t or sampler_t
-
-static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
- static const char *const specialTypes[] = { "struct._image2d_t",
- "struct._image3d_t",
- "struct._sampler_t" };
-
- Type *Ty = arg->getType();
- auto *PTy = dyn_cast<PointerType>(Ty);
-
- if (!PTy)
- return false;
-
- if (!context)
- return false;
-
- auto *STy = dyn_cast<StructType>(PTy->getElementType());
- if (!STy || STy->isLiteral())
- return false;
-
- return std::find(std::begin(specialTypes), std::end(specialTypes),
- STy->getName()) != std::end(specialTypes);
-}
-
-SDValue NVPTXTargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const DataLayout &DL = DAG.getDataLayout();
- auto PtrVT = getPointerTy(DAG.getDataLayout());
-
- const Function *F = MF.getFunction();
- const AttributeSet &PAL = F->getAttributes();
- const TargetLowering *TLI = STI.getTargetLowering();
-
- SDValue Root = DAG.getRoot();
- std::vector<SDValue> OutChains;
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
-
- std::vector<Type *> argTypes;
- std::vector<const Argument *> theArgs;
- for (const Argument &I : F->args()) {
- theArgs.push_back(&I);
- argTypes.push_back(I.getType());
- }
- // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
- // Ins.size() will be larger
- // * if there is an aggregate argument with multiple fields (each field
- // showing up separately in Ins)
- // * if there is a vector argument with more elements than the typical
- // vector length (generally more than 4), where each vector element is
- // individually present in Ins.
- // So a different index should be used for indexing into Ins.
- // See similar issue in LowerCall.
- unsigned InsIdx = 0;
-
- int idx = 0;
- for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
- Type *Ty = argTypes[i];
-
- // If the kernel argument is image*_t or sampler_t, convert it to
- // an i32 constant holding the parameter position. This can later be
- // matched in the AsmPrinter to output the correct mangled name.
- if (isImageOrSamplerVal(
- theArgs[i],
- (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
- : nullptr))) {
- assert(isKernelFunction(*F) &&
- "Only kernels can have image/sampler params");
- InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
- continue;
- }
-
- if (theArgs[i]->use_empty()) {
- // argument is dead
- if (Ty->isAggregateType()) {
- SmallVector<EVT, 16> vtparts;
-
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
- assert(vtparts.size() > 0 && "empty aggregate type not expected");
- for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
- ++parti) {
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- ++InsIdx;
- }
- if (vtparts.size() > 0)
- --InsIdx;
- continue;
- }
- if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, Ty);
- unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
- for (unsigned parti = 0; parti < NumRegs; ++parti) {
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- ++InsIdx;
- }
- if (NumRegs > 0)
- --InsIdx;
- continue;
- }
- InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
- continue;
- }
-
- // In the following cases, assign a node order of "idx+1"
- // to newly created nodes. The SDNodes for params have to
- // appear in the same order as they appear in the original
- // function. "idx+1" holds that order.
- if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
- if (Ty->isAggregateType()) {
- SmallVector<EVT, 16> vtparts;
- SmallVector<uint64_t, 16> offsets;
-
- // NOTE: Here, we lose the ability to issue vector loads for vectors
- // that are a part of a struct. This should be investigated in the
- // future.
- ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
- 0);
- assert(vtparts.size() > 0 && "empty aggregate type not expected");
- bool aggregateIsPacked = false;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- aggregateIsPacked = STy->isPacked();
-
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
- ++parti) {
- EVT partVT = vtparts[parti];
- Value *srcValue = Constant::getNullValue(
- PointerType::get(partVT.getTypeForEVT(F->getContext()),
- ADDRESS_SPACE_PARAM));
- SDValue srcAddr =
- DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
- DAG.getConstant(offsets[parti], dl, PtrVT));
- unsigned partAlign = aggregateIsPacked
- ? 1
- : DL.getABITypeAlignment(
- partVT.getTypeForEVT(F->getContext()));
- SDValue p;
- if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
- ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
- ISD::SEXTLOAD : ISD::ZEXTLOAD;
- p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
- MachinePointerInfo(srcValue), partVT, partAlign);
- } else {
- p = DAG.getLoad(partVT, dl, Root, srcAddr,
- MachinePointerInfo(srcValue), partAlign);
- }
- if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
- InVals.push_back(p);
- ++InsIdx;
- }
- if (vtparts.size() > 0)
- --InsIdx;
- continue;
- }
- if (Ty->isVectorTy()) {
- EVT ObjectVT = getValueType(DL, Ty);
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- unsigned NumElts = ObjectVT.getVectorNumElements();
- assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
- "Vector was not scalarized");
- EVT EltVT = ObjectVT.getVectorElementType();
-
- // V1 load
- // f32 = load ...
- if (NumElts == 1) {
- // We only have one element, so just directly load it
- Value *SrcValue = Constant::getNullValue(PointerType::get(
- EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue P = DAG.getLoad(
- EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
- DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
-
- if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
- P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
- InVals.push_back(P);
- ++InsIdx;
- } else if (NumElts == 2) {
- // V2 load
- // f32,f32 = load ...
- EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
- Value *SrcValue = Constant::getNullValue(PointerType::get(
- VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue P = DAG.getLoad(
- VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
-
- SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
- DAG.getIntPtrConstant(0, dl));
- SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
- DAG.getIntPtrConstant(1, dl));
-
- if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
- Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
- Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
- }
-
- InVals.push_back(Elt0);
- InVals.push_back(Elt1);
- InsIdx += 2;
- } else {
- // V4 loads
- // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
- // the vector will be expanded to a power of 2 elements, so we know we
- // can always round up to the next multiple of 4 when creating the
- // vector loads.
- // e.g. 4 elem => 1 ld.v4
- // 6 elem => 2 ld.v4
- // 8 elem => 2 ld.v4
- // 11 elem => 3 ld.v4
- unsigned VecSize = 4;
- if (EltVT.getSizeInBits() == 64) {
- VecSize = 2;
- }
- EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
- unsigned Ofst = 0;
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- Value *SrcValue = Constant::getNullValue(
- PointerType::get(VecVT.getTypeForEVT(F->getContext()),
- ADDRESS_SPACE_PARAM));
- SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
- DAG.getConstant(Ofst, dl, PtrVT));
- SDValue P = DAG.getLoad(
- VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue),
- DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
- if (P.getNode())
- P.getNode()->setIROrder(idx + 1);
-
- for (unsigned j = 0; j < VecSize; ++j) {
- if (i + j >= NumElts)
- break;
- SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
- DAG.getIntPtrConstant(j, dl));
- if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
- Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
- InVals.push_back(Elt);
- }
- Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
- }
- InsIdx += NumElts;
- }
-
- if (NumElts > 0)
- --InsIdx;
- continue;
- }
- // A plain scalar.
- EVT ObjectVT = getValueType(DL, Ty);
- // If ABI, load from the param symbol
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- Value *srcValue = Constant::getNullValue(PointerType::get(
- ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
- SDValue p;
- if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
- ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
- ISD::SEXTLOAD : ISD::ZEXTLOAD;
- p = DAG.getExtLoad(
- ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
- ObjectVT,
- DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
- } else {
- p = DAG.getLoad(
- Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
- DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
- }
- if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
- InVals.push_back(p);
- continue;
- }
-
- // Param has ByVal attribute
- // Return MoveParam(param symbol).
- // Ideally, the param symbol could be returned directly,
- // but when the SDNode builder decides to use it in a CopyToReg(),
- // building the machine instruction fails because a TargetExternalSymbol
- // (not lowered) is target dependent, and CopyToReg assumes
- // the source is lowered.
- EVT ObjectVT = getValueType(DL, Ty);
- assert(ObjectVT == Ins[InsIdx].VT &&
- "Ins type did not match function type");
- SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
- SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
- if (p.getNode())
- p.getNode()->setIROrder(idx + 1);
- InVals.push_back(p);
- }
-
- // Clang will check for explicit varargs and issue an error if any are
- // present. However, Clang will let code with an implicit vararg list,
- // such as f(), pass. See bug 617733. We treat this case as if the
- // argument list were empty.
- // if (F.isVarArg()) {
- // assert(0 && "VarArg not supported yet!");
- //}
-
- if (!OutChains.empty())
- DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
-
- return Chain;
-}
-
-SDValue
-NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SDLoc &dl, SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- const Function *F = MF.getFunction();
- Type *RetTy = F->getReturnType();
- const DataLayout &TD = DAG.getDataLayout();
-
- bool isABI = (STI.getSmVersion() >= 20);
- assert(isABI && "Non-ABI compilation is not supported");
- if (!isABI)
- return Chain;
-
- if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
- // If we have a vector type, the OutVals array will hold the scalarized
- // components, and we have to combine them into one or more vector stores.
- unsigned NumElts = VTy->getNumElements();
- assert(NumElts == Outs.size() && "Bad scalarization of return value");
-
- EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
- bool NeedExtend = false;
- if (EltVT.getSizeInBits() < 16)
- NeedExtend = true;
-
- // V1 store
- if (NumElts == 1) {
- SDValue StoreVal = OutVals[0];
- // We only have one element, so just directly store it
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
- SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
- DAG.getVTList(MVT::Other), Ops,
- EltVT, MachinePointerInfo());
- } else if (NumElts == 2) {
- // V2 store
- SDValue StoreVal0 = OutVals[0];
- SDValue StoreVal1 = OutVals[1];
-
- if (NeedExtend) {
- StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
- StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
- }
-
- SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
- StoreVal1 };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
- DAG.getVTList(MVT::Other), Ops,
- EltVT, MachinePointerInfo());
- } else {
- // V4 stores
- // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
- // vector will be expanded to a power of 2 elements, so we know we can
- // always round up to the next multiple of 4 when creating the vector
- // stores.
- // e.g. 4 elem => 1 st.v4
- // 6 elem => 2 st.v4
- // 8 elem => 2 st.v4
- // 11 elem => 3 st.v4
-
- unsigned VecSize = 4;
- if (OutVals[0].getValueSizeInBits() == 64)
- VecSize = 2;
-
- unsigned Offset = 0;
-
- EVT VecVT =
- EVT::getVectorVT(F->getContext(), EltVT, VecSize);
- unsigned PerStoreOffset =
- TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
-
- for (unsigned i = 0; i < NumElts; i += VecSize) {
- // Get values
- SDValue StoreVal;
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(Chain);
- Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
- unsigned Opc = NVPTXISD::StoreRetvalV2;
- EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
-
- StoreVal = OutVals[i];
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- Ops.push_back(StoreVal);
-
- if (i + 1 < NumElts) {
- StoreVal = OutVals[i + 1];
- if (NeedExtend)
- StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(ExtendedVT);
- }
- Ops.push_back(StoreVal);
-
- if (VecSize == 4) {
- Opc = NVPTXISD::StoreRetvalV4;
- if (i + 2 < NumElts) {
- StoreVal = OutVals[i + 2];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(ExtendedVT);
- }
- Ops.push_back(StoreVal);
-
- if (i + 3 < NumElts) {
- StoreVal = OutVals[i + 3];
- if (NeedExtend)
- StoreVal =
- DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
- } else {
- StoreVal = DAG.getUNDEF(ExtendedVT);
- }
- Ops.push_back(StoreVal);
- }
-
- Chain =
- DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
- EltVT, MachinePointerInfo());
- Offset += PerStoreOffset;
- }
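- // Editorial note: when NumElts is not a multiple of VecSize, the trailing
- // lanes of the final store are filled with UNDEF above, so e.g. a
- // 6-element return with VecSize = 4 emits two st.v4, the second carrying
- // two undef lanes.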
- }
- } else {
- SmallVector<EVT, 16> ValVTs;
- SmallVector<uint64_t, 16> Offsets;
- ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
- assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
-
- for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
- SDValue theVal = OutVals[i];
- EVT TheValType = theVal.getValueType();
- unsigned numElems = 1;
- if (TheValType.isVector())
- numElems = TheValType.getVectorNumElements();
- for (unsigned j = 0, je = numElems; j != je; ++j) {
- SDValue TmpVal = theVal;
- if (TheValType.isVector())
- TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- TheValType.getVectorElementType(), TmpVal,
- DAG.getIntPtrConstant(j, dl));
- EVT TheStoreType = ValVTs[i];
- if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
- // The following zero-extension is for integer types only, and
- // specifically not for aggregates.
- TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
- TheStoreType = MVT::i32;
- }
- else if (TmpVal.getValueSizeInBits() < 16)
- TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
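- // For example (illustrative): an i8 return value takes the first branch
- // above, being zero-extended and stored as an i32, since PTX widens
- // integer return parameters narrower than 32 bits to 32 bits.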
-
- SDValue Ops[] = {
- Chain,
- DAG.getConstant(Offsets[i], dl, MVT::i32),
- TmpVal };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
- DAG.getVTList(MVT::Other), Ops,
- TheStoreType,
- MachinePointerInfo());
- }
- }
- }
-
- return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
-}
-
-void NVPTXTargetLowering::LowerAsmOperandForConstraint(
- SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
- SelectionDAG &DAG) const {
- if (Constraint.length() > 1)
- return;
- TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
-}
-
-static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
- switch (Intrinsic) {
- default:
- return 0;
-
- case Intrinsic::nvvm_tex_1d_v4f32_s32:
- return NVPTXISD::Tex1DFloatS32;
- case Intrinsic::nvvm_tex_1d_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloat;
- case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
- return NVPTXISD::Tex1DFloatFloatGrad;
- case Intrinsic::nvvm_tex_1d_v4s32_s32:
- return NVPTXISD::Tex1DS32S32;
- case Intrinsic::nvvm_tex_1d_v4s32_f32:
- return NVPTXISD::Tex1DS32Float;
- case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
- return NVPTXISD::Tex1DS32FloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
- return NVPTXISD::Tex1DS32FloatGrad;
- case Intrinsic::nvvm_tex_1d_v4u32_s32:
- return NVPTXISD::Tex1DU32S32;
- case Intrinsic::nvvm_tex_1d_v4u32_f32:
- return NVPTXISD::Tex1DU32Float;
- case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
- return NVPTXISD::Tex1DU32FloatLevel;
- case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
- return NVPTXISD::Tex1DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
- return NVPTXISD::Tex1DArrayFloatS32;
- case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloat;
- case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
- return NVPTXISD::Tex1DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
- return NVPTXISD::Tex1DArrayS32S32;
- case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32Float;
- case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
- return NVPTXISD::Tex1DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
- return NVPTXISD::Tex1DArrayU32S32;
- case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32Float;
- case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
- return NVPTXISD::Tex1DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_2d_v4f32_s32:
- return NVPTXISD::Tex2DFloatS32;
- case Intrinsic::nvvm_tex_2d_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloat;
- case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
- return NVPTXISD::Tex2DFloatFloatGrad;
- case Intrinsic::nvvm_tex_2d_v4s32_s32:
- return NVPTXISD::Tex2DS32S32;
- case Intrinsic::nvvm_tex_2d_v4s32_f32:
- return NVPTXISD::Tex2DS32Float;
- case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
- return NVPTXISD::Tex2DS32FloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
- return NVPTXISD::Tex2DS32FloatGrad;
- case Intrinsic::nvvm_tex_2d_v4u32_s32:
- return NVPTXISD::Tex2DU32S32;
- case Intrinsic::nvvm_tex_2d_v4u32_f32:
- return NVPTXISD::Tex2DU32Float;
- case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
- return NVPTXISD::Tex2DU32FloatLevel;
- case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
- return NVPTXISD::Tex2DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
- return NVPTXISD::Tex2DArrayFloatS32;
- case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloat;
- case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
- return NVPTXISD::Tex2DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
- return NVPTXISD::Tex2DArrayS32S32;
- case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32Float;
- case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
- return NVPTXISD::Tex2DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
- return NVPTXISD::Tex2DArrayU32S32;
- case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32Float;
- case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
- return NVPTXISD::Tex2DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_3d_v4f32_s32:
- return NVPTXISD::Tex3DFloatS32;
- case Intrinsic::nvvm_tex_3d_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloat;
- case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
- return NVPTXISD::Tex3DFloatFloatGrad;
- case Intrinsic::nvvm_tex_3d_v4s32_s32:
- return NVPTXISD::Tex3DS32S32;
- case Intrinsic::nvvm_tex_3d_v4s32_f32:
- return NVPTXISD::Tex3DS32Float;
- case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
- return NVPTXISD::Tex3DS32FloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
- return NVPTXISD::Tex3DS32FloatGrad;
- case Intrinsic::nvvm_tex_3d_v4u32_s32:
- return NVPTXISD::Tex3DU32S32;
- case Intrinsic::nvvm_tex_3d_v4u32_f32:
- return NVPTXISD::Tex3DU32Float;
- case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
- return NVPTXISD::Tex3DU32FloatLevel;
- case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
- return NVPTXISD::Tex3DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_cube_v4f32_f32:
- return NVPTXISD::TexCubeFloatFloat;
- case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
- return NVPTXISD::TexCubeFloatFloatLevel;
- case Intrinsic::nvvm_tex_cube_v4s32_f32:
- return NVPTXISD::TexCubeS32Float;
- case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
- return NVPTXISD::TexCubeS32FloatLevel;
- case Intrinsic::nvvm_tex_cube_v4u32_f32:
- return NVPTXISD::TexCubeU32Float;
- case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
- return NVPTXISD::TexCubeU32FloatLevel;
-
- case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
- return NVPTXISD::TexCubeArrayFloatFloat;
- case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
- return NVPTXISD::TexCubeArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
- return NVPTXISD::TexCubeArrayS32Float;
- case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
- return NVPTXISD::TexCubeArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
- return NVPTXISD::TexCubeArrayU32Float;
- case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
- return NVPTXISD::TexCubeArrayU32FloatLevel;
-
- case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
- return NVPTXISD::Tld4R2DFloatFloat;
- case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
- return NVPTXISD::Tld4G2DFloatFloat;
- case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
- return NVPTXISD::Tld4B2DFloatFloat;
- case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
- return NVPTXISD::Tld4A2DFloatFloat;
- case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
- return NVPTXISD::Tld4R2DS64Float;
- case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
- return NVPTXISD::Tld4G2DS64Float;
- case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
- return NVPTXISD::Tld4B2DS64Float;
- case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
- return NVPTXISD::Tld4A2DS64Float;
- case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
- return NVPTXISD::Tld4R2DU64Float;
- case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
- return NVPTXISD::Tld4G2DU64Float;
- case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
- return NVPTXISD::Tld4B2DU64Float;
- case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
- return NVPTXISD::Tld4A2DU64Float;
-
- case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
- return NVPTXISD::TexUnified1DFloatS32;
- case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloat;
- case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
- return NVPTXISD::TexUnified1DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
- return NVPTXISD::TexUnified1DS32S32;
- case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
- return NVPTXISD::TexUnified1DS32Float;
- case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
- return NVPTXISD::TexUnified1DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
- return NVPTXISD::TexUnified1DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
- return NVPTXISD::TexUnified1DU32S32;
- case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
- return NVPTXISD::TexUnified1DU32Float;
- case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
- return NVPTXISD::TexUnified1DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
- return NVPTXISD::TexUnified1DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
- return NVPTXISD::TexUnified1DArrayFloatS32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
- return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
- return NVPTXISD::TexUnified1DArrayS32S32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32Float;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
- return NVPTXISD::TexUnified1DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
- return NVPTXISD::TexUnified1DArrayU32S32;
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32Float;
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
- return NVPTXISD::TexUnified1DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
- return NVPTXISD::TexUnified2DFloatS32;
- case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloat;
- case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
- return NVPTXISD::TexUnified2DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
- return NVPTXISD::TexUnified2DS32S32;
- case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
- return NVPTXISD::TexUnified2DS32Float;
- case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
- return NVPTXISD::TexUnified2DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
- return NVPTXISD::TexUnified2DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
- return NVPTXISD::TexUnified2DU32S32;
- case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
- return NVPTXISD::TexUnified2DU32Float;
- case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
- return NVPTXISD::TexUnified2DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
- return NVPTXISD::TexUnified2DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
- return NVPTXISD::TexUnified2DArrayFloatS32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
- return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
- return NVPTXISD::TexUnified2DArrayS32S32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32Float;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
- return NVPTXISD::TexUnified2DArrayS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
- return NVPTXISD::TexUnified2DArrayU32S32;
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32Float;
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
- return NVPTXISD::TexUnified2DArrayU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
- return NVPTXISD::TexUnified3DFloatS32;
- case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloat;
- case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
- return NVPTXISD::TexUnified3DFloatFloatGrad;
- case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
- return NVPTXISD::TexUnified3DS32S32;
- case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
- return NVPTXISD::TexUnified3DS32Float;
- case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
- return NVPTXISD::TexUnified3DS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
- return NVPTXISD::TexUnified3DS32FloatGrad;
- case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
- return NVPTXISD::TexUnified3DU32S32;
- case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
- return NVPTXISD::TexUnified3DU32Float;
- case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
- return NVPTXISD::TexUnified3DU32FloatLevel;
- case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
- return NVPTXISD::TexUnified3DU32FloatGrad;
-
- case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeFloatFloat;
- case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeS32Float;
- case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeU32Float;
- case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeU32FloatLevel;
-
- case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
- return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeArrayS32Float;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
- return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
- case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeArrayU32Float;
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
- return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
-
- case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedR2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedG2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedB2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
- return NVPTXISD::Tld4UnifiedA2DFloatFloat;
- case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedR2DS64Float;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedG2DS64Float;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedB2DS64Float;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
- return NVPTXISD::Tld4UnifiedA2DS64Float;
- case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedR2DU64Float;
- case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedG2DU64Float;
- case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedB2DU64Float;
- case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
- return NVPTXISD::Tld4UnifiedA2DU64Float;
- }
-}
-
-static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
- switch (Intrinsic) {
- default:
- return 0;
- case Intrinsic::nvvm_suld_1d_i8_clamp:
- return NVPTXISD::Suld1DI8Clamp;
- case Intrinsic::nvvm_suld_1d_i16_clamp:
- return NVPTXISD::Suld1DI16Clamp;
- case Intrinsic::nvvm_suld_1d_i32_clamp:
- return NVPTXISD::Suld1DI32Clamp;
- case Intrinsic::nvvm_suld_1d_i64_clamp:
- return NVPTXISD::Suld1DI64Clamp;
- case Intrinsic::nvvm_suld_1d_v2i8_clamp:
- return NVPTXISD::Suld1DV2I8Clamp;
- case Intrinsic::nvvm_suld_1d_v2i16_clamp:
- return NVPTXISD::Suld1DV2I16Clamp;
- case Intrinsic::nvvm_suld_1d_v2i32_clamp:
- return NVPTXISD::Suld1DV2I32Clamp;
- case Intrinsic::nvvm_suld_1d_v2i64_clamp:
- return NVPTXISD::Suld1DV2I64Clamp;
- case Intrinsic::nvvm_suld_1d_v4i8_clamp:
- return NVPTXISD::Suld1DV4I8Clamp;
- case Intrinsic::nvvm_suld_1d_v4i16_clamp:
- return NVPTXISD::Suld1DV4I16Clamp;
- case Intrinsic::nvvm_suld_1d_v4i32_clamp:
- return NVPTXISD::Suld1DV4I32Clamp;
- case Intrinsic::nvvm_suld_1d_array_i8_clamp:
- return NVPTXISD::Suld1DArrayI8Clamp;
- case Intrinsic::nvvm_suld_1d_array_i16_clamp:
- return NVPTXISD::Suld1DArrayI16Clamp;
- case Intrinsic::nvvm_suld_1d_array_i32_clamp:
- return NVPTXISD::Suld1DArrayI32Clamp;
- case Intrinsic::nvvm_suld_1d_array_i64_clamp:
- return NVPTXISD::Suld1DArrayI64Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
- return NVPTXISD::Suld1DArrayV2I8Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
- return NVPTXISD::Suld1DArrayV2I16Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
- return NVPTXISD::Suld1DArrayV2I32Clamp;
- case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
- return NVPTXISD::Suld1DArrayV2I64Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
- return NVPTXISD::Suld1DArrayV4I8Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
- return NVPTXISD::Suld1DArrayV4I16Clamp;
- case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
- return NVPTXISD::Suld1DArrayV4I32Clamp;
- case Intrinsic::nvvm_suld_2d_i8_clamp:
- return NVPTXISD::Suld2DI8Clamp;
- case Intrinsic::nvvm_suld_2d_i16_clamp:
- return NVPTXISD::Suld2DI16Clamp;
- case Intrinsic::nvvm_suld_2d_i32_clamp:
- return NVPTXISD::Suld2DI32Clamp;
- case Intrinsic::nvvm_suld_2d_i64_clamp:
- return NVPTXISD::Suld2DI64Clamp;
- case Intrinsic::nvvm_suld_2d_v2i8_clamp:
- return NVPTXISD::Suld2DV2I8Clamp;
- case Intrinsic::nvvm_suld_2d_v2i16_clamp:
- return NVPTXISD::Suld2DV2I16Clamp;
- case Intrinsic::nvvm_suld_2d_v2i32_clamp:
- return NVPTXISD::Suld2DV2I32Clamp;
- case Intrinsic::nvvm_suld_2d_v2i64_clamp:
- return NVPTXISD::Suld2DV2I64Clamp;
- case Intrinsic::nvvm_suld_2d_v4i8_clamp:
- return NVPTXISD::Suld2DV4I8Clamp;
- case Intrinsic::nvvm_suld_2d_v4i16_clamp:
- return NVPTXISD::Suld2DV4I16Clamp;
- case Intrinsic::nvvm_suld_2d_v4i32_clamp:
- return NVPTXISD::Suld2DV4I32Clamp;
- case Intrinsic::nvvm_suld_2d_array_i8_clamp:
- return NVPTXISD::Suld2DArrayI8Clamp;
- case Intrinsic::nvvm_suld_2d_array_i16_clamp:
- return NVPTXISD::Suld2DArrayI16Clamp;
- case Intrinsic::nvvm_suld_2d_array_i32_clamp:
- return NVPTXISD::Suld2DArrayI32Clamp;
- case Intrinsic::nvvm_suld_2d_array_i64_clamp:
- return NVPTXISD::Suld2DArrayI64Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
- return NVPTXISD::Suld2DArrayV2I8Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
- return NVPTXISD::Suld2DArrayV2I16Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
- return NVPTXISD::Suld2DArrayV2I32Clamp;
- case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
- return NVPTXISD::Suld2DArrayV2I64Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
- return NVPTXISD::Suld2DArrayV4I8Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
- return NVPTXISD::Suld2DArrayV4I16Clamp;
- case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
- return NVPTXISD::Suld2DArrayV4I32Clamp;
- case Intrinsic::nvvm_suld_3d_i8_clamp:
- return NVPTXISD::Suld3DI8Clamp;
- case Intrinsic::nvvm_suld_3d_i16_clamp:
- return NVPTXISD::Suld3DI16Clamp;
- case Intrinsic::nvvm_suld_3d_i32_clamp:
- return NVPTXISD::Suld3DI32Clamp;
- case Intrinsic::nvvm_suld_3d_i64_clamp:
- return NVPTXISD::Suld3DI64Clamp;
- case Intrinsic::nvvm_suld_3d_v2i8_clamp:
- return NVPTXISD::Suld3DV2I8Clamp;
- case Intrinsic::nvvm_suld_3d_v2i16_clamp:
- return NVPTXISD::Suld3DV2I16Clamp;
- case Intrinsic::nvvm_suld_3d_v2i32_clamp:
- return NVPTXISD::Suld3DV2I32Clamp;
- case Intrinsic::nvvm_suld_3d_v2i64_clamp:
- return NVPTXISD::Suld3DV2I64Clamp;
- case Intrinsic::nvvm_suld_3d_v4i8_clamp:
- return NVPTXISD::Suld3DV4I8Clamp;
- case Intrinsic::nvvm_suld_3d_v4i16_clamp:
- return NVPTXISD::Suld3DV4I16Clamp;
- case Intrinsic::nvvm_suld_3d_v4i32_clamp:
- return NVPTXISD::Suld3DV4I32Clamp;
- case Intrinsic::nvvm_suld_1d_i8_trap:
- return NVPTXISD::Suld1DI8Trap;
- case Intrinsic::nvvm_suld_1d_i16_trap:
- return NVPTXISD::Suld1DI16Trap;
- case Intrinsic::nvvm_suld_1d_i32_trap:
- return NVPTXISD::Suld1DI32Trap;
- case Intrinsic::nvvm_suld_1d_i64_trap:
- return NVPTXISD::Suld1DI64Trap;
- case Intrinsic::nvvm_suld_1d_v2i8_trap:
- return NVPTXISD::Suld1DV2I8Trap;
- case Intrinsic::nvvm_suld_1d_v2i16_trap:
- return NVPTXISD::Suld1DV2I16Trap;
- case Intrinsic::nvvm_suld_1d_v2i32_trap:
- return NVPTXISD::Suld1DV2I32Trap;
- case Intrinsic::nvvm_suld_1d_v2i64_trap:
- return NVPTXISD::Suld1DV2I64Trap;
- case Intrinsic::nvvm_suld_1d_v4i8_trap:
- return NVPTXISD::Suld1DV4I8Trap;
- case Intrinsic::nvvm_suld_1d_v4i16_trap:
- return NVPTXISD::Suld1DV4I16Trap;
- case Intrinsic::nvvm_suld_1d_v4i32_trap:
- return NVPTXISD::Suld1DV4I32Trap;
- case Intrinsic::nvvm_suld_1d_array_i8_trap:
- return NVPTXISD::Suld1DArrayI8Trap;
- case Intrinsic::nvvm_suld_1d_array_i16_trap:
- return NVPTXISD::Suld1DArrayI16Trap;
- case Intrinsic::nvvm_suld_1d_array_i32_trap:
- return NVPTXISD::Suld1DArrayI32Trap;
- case Intrinsic::nvvm_suld_1d_array_i64_trap:
- return NVPTXISD::Suld1DArrayI64Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
- return NVPTXISD::Suld1DArrayV2I8Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
- return NVPTXISD::Suld1DArrayV2I16Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
- return NVPTXISD::Suld1DArrayV2I32Trap;
- case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
- return NVPTXISD::Suld1DArrayV2I64Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
- return NVPTXISD::Suld1DArrayV4I8Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
- return NVPTXISD::Suld1DArrayV4I16Trap;
- case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
- return NVPTXISD::Suld1DArrayV4I32Trap;
- case Intrinsic::nvvm_suld_2d_i8_trap:
- return NVPTXISD::Suld2DI8Trap;
- case Intrinsic::nvvm_suld_2d_i16_trap:
- return NVPTXISD::Suld2DI16Trap;
- case Intrinsic::nvvm_suld_2d_i32_trap:
- return NVPTXISD::Suld2DI32Trap;
- case Intrinsic::nvvm_suld_2d_i64_trap:
- return NVPTXISD::Suld2DI64Trap;
- case Intrinsic::nvvm_suld_2d_v2i8_trap:
- return NVPTXISD::Suld2DV2I8Trap;
- case Intrinsic::nvvm_suld_2d_v2i16_trap:
- return NVPTXISD::Suld2DV2I16Trap;
- case Intrinsic::nvvm_suld_2d_v2i32_trap:
- return NVPTXISD::Suld2DV2I32Trap;
- case Intrinsic::nvvm_suld_2d_v2i64_trap:
- return NVPTXISD::Suld2DV2I64Trap;
- case Intrinsic::nvvm_suld_2d_v4i8_trap:
- return NVPTXISD::Suld2DV4I8Trap;
- case Intrinsic::nvvm_suld_2d_v4i16_trap:
- return NVPTXISD::Suld2DV4I16Trap;
- case Intrinsic::nvvm_suld_2d_v4i32_trap:
- return NVPTXISD::Suld2DV4I32Trap;
- case Intrinsic::nvvm_suld_2d_array_i8_trap:
- return NVPTXISD::Suld2DArrayI8Trap;
- case Intrinsic::nvvm_suld_2d_array_i16_trap:
- return NVPTXISD::Suld2DArrayI16Trap;
- case Intrinsic::nvvm_suld_2d_array_i32_trap:
- return NVPTXISD::Suld2DArrayI32Trap;
- case Intrinsic::nvvm_suld_2d_array_i64_trap:
- return NVPTXISD::Suld2DArrayI64Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
- return NVPTXISD::Suld2DArrayV2I8Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
- return NVPTXISD::Suld2DArrayV2I16Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
- return NVPTXISD::Suld2DArrayV2I32Trap;
- case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
- return NVPTXISD::Suld2DArrayV2I64Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
- return NVPTXISD::Suld2DArrayV4I8Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
- return NVPTXISD::Suld2DArrayV4I16Trap;
- case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
- return NVPTXISD::Suld2DArrayV4I32Trap;
- case Intrinsic::nvvm_suld_3d_i8_trap:
- return NVPTXISD::Suld3DI8Trap;
- case Intrinsic::nvvm_suld_3d_i16_trap:
- return NVPTXISD::Suld3DI16Trap;
- case Intrinsic::nvvm_suld_3d_i32_trap:
- return NVPTXISD::Suld3DI32Trap;
- case Intrinsic::nvvm_suld_3d_i64_trap:
- return NVPTXISD::Suld3DI64Trap;
- case Intrinsic::nvvm_suld_3d_v2i8_trap:
- return NVPTXISD::Suld3DV2I8Trap;
- case Intrinsic::nvvm_suld_3d_v2i16_trap:
- return NVPTXISD::Suld3DV2I16Trap;
- case Intrinsic::nvvm_suld_3d_v2i32_trap:
- return NVPTXISD::Suld3DV2I32Trap;
- case Intrinsic::nvvm_suld_3d_v2i64_trap:
- return NVPTXISD::Suld3DV2I64Trap;
- case Intrinsic::nvvm_suld_3d_v4i8_trap:
- return NVPTXISD::Suld3DV4I8Trap;
- case Intrinsic::nvvm_suld_3d_v4i16_trap:
- return NVPTXISD::Suld3DV4I16Trap;
- case Intrinsic::nvvm_suld_3d_v4i32_trap:
- return NVPTXISD::Suld3DV4I32Trap;
- case Intrinsic::nvvm_suld_1d_i8_zero:
- return NVPTXISD::Suld1DI8Zero;
- case Intrinsic::nvvm_suld_1d_i16_zero:
- return NVPTXISD::Suld1DI16Zero;
- case Intrinsic::nvvm_suld_1d_i32_zero:
- return NVPTXISD::Suld1DI32Zero;
- case Intrinsic::nvvm_suld_1d_i64_zero:
- return NVPTXISD::Suld1DI64Zero;
- case Intrinsic::nvvm_suld_1d_v2i8_zero:
- return NVPTXISD::Suld1DV2I8Zero;
- case Intrinsic::nvvm_suld_1d_v2i16_zero:
- return NVPTXISD::Suld1DV2I16Zero;
- case Intrinsic::nvvm_suld_1d_v2i32_zero:
- return NVPTXISD::Suld1DV2I32Zero;
- case Intrinsic::nvvm_suld_1d_v2i64_zero:
- return NVPTXISD::Suld1DV2I64Zero;
- case Intrinsic::nvvm_suld_1d_v4i8_zero:
- return NVPTXISD::Suld1DV4I8Zero;
- case Intrinsic::nvvm_suld_1d_v4i16_zero:
- return NVPTXISD::Suld1DV4I16Zero;
- case Intrinsic::nvvm_suld_1d_v4i32_zero:
- return NVPTXISD::Suld1DV4I32Zero;
- case Intrinsic::nvvm_suld_1d_array_i8_zero:
- return NVPTXISD::Suld1DArrayI8Zero;
- case Intrinsic::nvvm_suld_1d_array_i16_zero:
- return NVPTXISD::Suld1DArrayI16Zero;
- case Intrinsic::nvvm_suld_1d_array_i32_zero:
- return NVPTXISD::Suld1DArrayI32Zero;
- case Intrinsic::nvvm_suld_1d_array_i64_zero:
- return NVPTXISD::Suld1DArrayI64Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
- return NVPTXISD::Suld1DArrayV2I8Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
- return NVPTXISD::Suld1DArrayV2I16Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
- return NVPTXISD::Suld1DArrayV2I32Zero;
- case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
- return NVPTXISD::Suld1DArrayV2I64Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
- return NVPTXISD::Suld1DArrayV4I8Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
- return NVPTXISD::Suld1DArrayV4I16Zero;
- case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
- return NVPTXISD::Suld1DArrayV4I32Zero;
- case Intrinsic::nvvm_suld_2d_i8_zero:
- return NVPTXISD::Suld2DI8Zero;
- case Intrinsic::nvvm_suld_2d_i16_zero:
- return NVPTXISD::Suld2DI16Zero;
- case Intrinsic::nvvm_suld_2d_i32_zero:
- return NVPTXISD::Suld2DI32Zero;
- case Intrinsic::nvvm_suld_2d_i64_zero:
- return NVPTXISD::Suld2DI64Zero;
- case Intrinsic::nvvm_suld_2d_v2i8_zero:
- return NVPTXISD::Suld2DV2I8Zero;
- case Intrinsic::nvvm_suld_2d_v2i16_zero:
- return NVPTXISD::Suld2DV2I16Zero;
- case Intrinsic::nvvm_suld_2d_v2i32_zero:
- return NVPTXISD::Suld2DV2I32Zero;
- case Intrinsic::nvvm_suld_2d_v2i64_zero:
- return NVPTXISD::Suld2DV2I64Zero;
- case Intrinsic::nvvm_suld_2d_v4i8_zero:
- return NVPTXISD::Suld2DV4I8Zero;
- case Intrinsic::nvvm_suld_2d_v4i16_zero:
- return NVPTXISD::Suld2DV4I16Zero;
- case Intrinsic::nvvm_suld_2d_v4i32_zero:
- return NVPTXISD::Suld2DV4I32Zero;
- case Intrinsic::nvvm_suld_2d_array_i8_zero:
- return NVPTXISD::Suld2DArrayI8Zero;
- case Intrinsic::nvvm_suld_2d_array_i16_zero:
- return NVPTXISD::Suld2DArrayI16Zero;
- case Intrinsic::nvvm_suld_2d_array_i32_zero:
- return NVPTXISD::Suld2DArrayI32Zero;
- case Intrinsic::nvvm_suld_2d_array_i64_zero:
- return NVPTXISD::Suld2DArrayI64Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
- return NVPTXISD::Suld2DArrayV2I8Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
- return NVPTXISD::Suld2DArrayV2I16Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
- return NVPTXISD::Suld2DArrayV2I32Zero;
- case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
- return NVPTXISD::Suld2DArrayV2I64Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
- return NVPTXISD::Suld2DArrayV4I8Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
- return NVPTXISD::Suld2DArrayV4I16Zero;
- case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
- return NVPTXISD::Suld2DArrayV4I32Zero;
- case Intrinsic::nvvm_suld_3d_i8_zero:
- return NVPTXISD::Suld3DI8Zero;
- case Intrinsic::nvvm_suld_3d_i16_zero:
- return NVPTXISD::Suld3DI16Zero;
- case Intrinsic::nvvm_suld_3d_i32_zero:
- return NVPTXISD::Suld3DI32Zero;
- case Intrinsic::nvvm_suld_3d_i64_zero:
- return NVPTXISD::Suld3DI64Zero;
- case Intrinsic::nvvm_suld_3d_v2i8_zero:
- return NVPTXISD::Suld3DV2I8Zero;
- case Intrinsic::nvvm_suld_3d_v2i16_zero:
- return NVPTXISD::Suld3DV2I16Zero;
- case Intrinsic::nvvm_suld_3d_v2i32_zero:
- return NVPTXISD::Suld3DV2I32Zero;
- case Intrinsic::nvvm_suld_3d_v2i64_zero:
- return NVPTXISD::Suld3DV2I64Zero;
- case Intrinsic::nvvm_suld_3d_v4i8_zero:
- return NVPTXISD::Suld3DV4I8Zero;
- case Intrinsic::nvvm_suld_3d_v4i16_zero:
- return NVPTXISD::Suld3DV4I16Zero;
- case Intrinsic::nvvm_suld_3d_v4i32_zero:
- return NVPTXISD::Suld3DV4I32Zero;
- }
-}
-
- // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
- // TgtMemIntrinsic because we need information that is only available in
- // the "Value" type of the destination pointer, in particular its address
- // space.
-bool NVPTXTargetLowering::getTgtMemIntrinsic(
- IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
- switch (Intrinsic) {
- default:
- return false;
-
- case Intrinsic::nvvm_atomic_load_add_f32:
- case Intrinsic::nvvm_atomic_load_inc_32:
- case Intrinsic::nvvm_atomic_load_dec_32:
-
- case Intrinsic::nvvm_atomic_add_gen_f_cta:
- case Intrinsic::nvvm_atomic_add_gen_f_sys:
- case Intrinsic::nvvm_atomic_add_gen_i_cta:
- case Intrinsic::nvvm_atomic_add_gen_i_sys:
- case Intrinsic::nvvm_atomic_and_gen_i_cta:
- case Intrinsic::nvvm_atomic_and_gen_i_sys:
- case Intrinsic::nvvm_atomic_cas_gen_i_cta:
- case Intrinsic::nvvm_atomic_cas_gen_i_sys:
- case Intrinsic::nvvm_atomic_dec_gen_i_cta:
- case Intrinsic::nvvm_atomic_dec_gen_i_sys:
- case Intrinsic::nvvm_atomic_inc_gen_i_cta:
- case Intrinsic::nvvm_atomic_inc_gen_i_sys:
- case Intrinsic::nvvm_atomic_max_gen_i_cta:
- case Intrinsic::nvvm_atomic_max_gen_i_sys:
- case Intrinsic::nvvm_atomic_min_gen_i_cta:
- case Intrinsic::nvvm_atomic_min_gen_i_sys:
- case Intrinsic::nvvm_atomic_or_gen_i_cta:
- case Intrinsic::nvvm_atomic_or_gen_i_sys:
- case Intrinsic::nvvm_atomic_exch_gen_i_cta:
- case Intrinsic::nvvm_atomic_exch_gen_i_sys:
- case Intrinsic::nvvm_atomic_xor_gen_i_cta:
- case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
- auto &DL = I.getModule()->getDataLayout();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = true;
- Info.align = 0;
- return true;
- }
-
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p: {
- auto &DL = I.getModule()->getDataLayout();
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
-
- return true;
- }
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p: {
- auto &DL = I.getModule()->getDataLayout();
-
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
- Info.memVT = getPointerTy(DL);
- else
- Info.memVT = getValueType(DL, I.getType());
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
-
- return true;
- }
-
- case Intrinsic::nvvm_tex_1d_v4f32_s32:
- case Intrinsic::nvvm_tex_1d_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_v4f32_s32:
- case Intrinsic::nvvm_tex_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_v4f32_s32:
- case Intrinsic::nvvm_tex_3d_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
- Info.opc = getOpcForTextureInstr(Intrinsic);
- Info.memVT = MVT::v4f32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_tex_1d_v4s32_s32:
- case Intrinsic::nvvm_tex_1d_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_v4s32_s32:
- case Intrinsic::nvvm_tex_2d_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_v4s32_s32:
- case Intrinsic::nvvm_tex_3d_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_cube_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
- case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_v4u32_s32:
- case Intrinsic::nvvm_tex_1d_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_v4u32_s32:
- case Intrinsic::nvvm_tex_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_v4u32_s32:
- case Intrinsic::nvvm_tex_3d_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
- case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
- case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
- case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
- case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
- case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
- Info.opc = getOpcForTextureInstr(Intrinsic);
- Info.memVT = MVT::v4i32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i8_clamp:
- case Intrinsic::nvvm_suld_1d_v2i8_clamp:
- case Intrinsic::nvvm_suld_1d_v4i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
- case Intrinsic::nvvm_suld_2d_i8_clamp:
- case Intrinsic::nvvm_suld_2d_v2i8_clamp:
- case Intrinsic::nvvm_suld_2d_v4i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
- case Intrinsic::nvvm_suld_3d_i8_clamp:
- case Intrinsic::nvvm_suld_3d_v2i8_clamp:
- case Intrinsic::nvvm_suld_3d_v4i8_clamp:
- case Intrinsic::nvvm_suld_1d_i8_trap:
- case Intrinsic::nvvm_suld_1d_v2i8_trap:
- case Intrinsic::nvvm_suld_1d_v4i8_trap:
- case Intrinsic::nvvm_suld_1d_array_i8_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
- case Intrinsic::nvvm_suld_2d_i8_trap:
- case Intrinsic::nvvm_suld_2d_v2i8_trap:
- case Intrinsic::nvvm_suld_2d_v4i8_trap:
- case Intrinsic::nvvm_suld_2d_array_i8_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
- case Intrinsic::nvvm_suld_3d_i8_trap:
- case Intrinsic::nvvm_suld_3d_v2i8_trap:
- case Intrinsic::nvvm_suld_3d_v4i8_trap:
- case Intrinsic::nvvm_suld_1d_i8_zero:
- case Intrinsic::nvvm_suld_1d_v2i8_zero:
- case Intrinsic::nvvm_suld_1d_v4i8_zero:
- case Intrinsic::nvvm_suld_1d_array_i8_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
- case Intrinsic::nvvm_suld_2d_i8_zero:
- case Intrinsic::nvvm_suld_2d_v2i8_zero:
- case Intrinsic::nvvm_suld_2d_v4i8_zero:
- case Intrinsic::nvvm_suld_2d_array_i8_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
- case Intrinsic::nvvm_suld_3d_i8_zero:
- case Intrinsic::nvvm_suld_3d_v2i8_zero:
- case Intrinsic::nvvm_suld_3d_v4i8_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i8;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i16_clamp:
- case Intrinsic::nvvm_suld_1d_v2i16_clamp:
- case Intrinsic::nvvm_suld_1d_v4i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
- case Intrinsic::nvvm_suld_2d_i16_clamp:
- case Intrinsic::nvvm_suld_2d_v2i16_clamp:
- case Intrinsic::nvvm_suld_2d_v4i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
- case Intrinsic::nvvm_suld_3d_i16_clamp:
- case Intrinsic::nvvm_suld_3d_v2i16_clamp:
- case Intrinsic::nvvm_suld_3d_v4i16_clamp:
- case Intrinsic::nvvm_suld_1d_i16_trap:
- case Intrinsic::nvvm_suld_1d_v2i16_trap:
- case Intrinsic::nvvm_suld_1d_v4i16_trap:
- case Intrinsic::nvvm_suld_1d_array_i16_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
- case Intrinsic::nvvm_suld_2d_i16_trap:
- case Intrinsic::nvvm_suld_2d_v2i16_trap:
- case Intrinsic::nvvm_suld_2d_v4i16_trap:
- case Intrinsic::nvvm_suld_2d_array_i16_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
- case Intrinsic::nvvm_suld_3d_i16_trap:
- case Intrinsic::nvvm_suld_3d_v2i16_trap:
- case Intrinsic::nvvm_suld_3d_v4i16_trap:
- case Intrinsic::nvvm_suld_1d_i16_zero:
- case Intrinsic::nvvm_suld_1d_v2i16_zero:
- case Intrinsic::nvvm_suld_1d_v4i16_zero:
- case Intrinsic::nvvm_suld_1d_array_i16_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
- case Intrinsic::nvvm_suld_2d_i16_zero:
- case Intrinsic::nvvm_suld_2d_v2i16_zero:
- case Intrinsic::nvvm_suld_2d_v4i16_zero:
- case Intrinsic::nvvm_suld_2d_array_i16_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
- case Intrinsic::nvvm_suld_3d_i16_zero:
- case Intrinsic::nvvm_suld_3d_v2i16_zero:
- case Intrinsic::nvvm_suld_3d_v4i16_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i16;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i32_clamp:
- case Intrinsic::nvvm_suld_1d_v2i32_clamp:
- case Intrinsic::nvvm_suld_1d_v4i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
- case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
- case Intrinsic::nvvm_suld_2d_i32_clamp:
- case Intrinsic::nvvm_suld_2d_v2i32_clamp:
- case Intrinsic::nvvm_suld_2d_v4i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
- case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
- case Intrinsic::nvvm_suld_3d_i32_clamp:
- case Intrinsic::nvvm_suld_3d_v2i32_clamp:
- case Intrinsic::nvvm_suld_3d_v4i32_clamp:
- case Intrinsic::nvvm_suld_1d_i32_trap:
- case Intrinsic::nvvm_suld_1d_v2i32_trap:
- case Intrinsic::nvvm_suld_1d_v4i32_trap:
- case Intrinsic::nvvm_suld_1d_array_i32_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
- case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
- case Intrinsic::nvvm_suld_2d_i32_trap:
- case Intrinsic::nvvm_suld_2d_v2i32_trap:
- case Intrinsic::nvvm_suld_2d_v4i32_trap:
- case Intrinsic::nvvm_suld_2d_array_i32_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
- case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
- case Intrinsic::nvvm_suld_3d_i32_trap:
- case Intrinsic::nvvm_suld_3d_v2i32_trap:
- case Intrinsic::nvvm_suld_3d_v4i32_trap:
- case Intrinsic::nvvm_suld_1d_i32_zero:
- case Intrinsic::nvvm_suld_1d_v2i32_zero:
- case Intrinsic::nvvm_suld_1d_v4i32_zero:
- case Intrinsic::nvvm_suld_1d_array_i32_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
- case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
- case Intrinsic::nvvm_suld_2d_i32_zero:
- case Intrinsic::nvvm_suld_2d_v2i32_zero:
- case Intrinsic::nvvm_suld_2d_v4i32_zero:
- case Intrinsic::nvvm_suld_2d_array_i32_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
- case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
- case Intrinsic::nvvm_suld_3d_i32_zero:
- case Intrinsic::nvvm_suld_3d_v2i32_zero:
- case Intrinsic::nvvm_suld_3d_v4i32_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i32;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
-
- case Intrinsic::nvvm_suld_1d_i64_clamp:
- case Intrinsic::nvvm_suld_1d_v2i64_clamp:
- case Intrinsic::nvvm_suld_1d_array_i64_clamp:
- case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
- case Intrinsic::nvvm_suld_2d_i64_clamp:
- case Intrinsic::nvvm_suld_2d_v2i64_clamp:
- case Intrinsic::nvvm_suld_2d_array_i64_clamp:
- case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
- case Intrinsic::nvvm_suld_3d_i64_clamp:
- case Intrinsic::nvvm_suld_3d_v2i64_clamp:
- case Intrinsic::nvvm_suld_1d_i64_trap:
- case Intrinsic::nvvm_suld_1d_v2i64_trap:
- case Intrinsic::nvvm_suld_1d_array_i64_trap:
- case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
- case Intrinsic::nvvm_suld_2d_i64_trap:
- case Intrinsic::nvvm_suld_2d_v2i64_trap:
- case Intrinsic::nvvm_suld_2d_array_i64_trap:
- case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
- case Intrinsic::nvvm_suld_3d_i64_trap:
- case Intrinsic::nvvm_suld_3d_v2i64_trap:
- case Intrinsic::nvvm_suld_1d_i64_zero:
- case Intrinsic::nvvm_suld_1d_v2i64_zero:
- case Intrinsic::nvvm_suld_1d_array_i64_zero:
- case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
- case Intrinsic::nvvm_suld_2d_i64_zero:
- case Intrinsic::nvvm_suld_2d_v2i64_zero:
- case Intrinsic::nvvm_suld_2d_array_i64_zero:
- case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
- case Intrinsic::nvvm_suld_3d_i64_zero:
- case Intrinsic::nvvm_suld_3d_v2i64_zero:
- Info.opc = getOpcForSurfaceInstr(Intrinsic);
- Info.memVT = MVT::i64;
- Info.ptrVal = nullptr;
- Info.offset = 0;
- Info.vol = false;
- Info.readMem = true;
- Info.writeMem = false;
- Info.align = 16;
- return true;
- }
- return false;
-}
-
-/// isLegalAddressingMode - Return true if the addressing mode represented
-/// by AM is legal for this target, for a load/store of the specified type.
-/// Used to guide target specific optimizations, like loop strength reduction
-/// (LoopStrengthReduce.cpp) and memory optimization for address mode
-/// (CodeGenPrepare.cpp)
-bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
- const AddrMode &AM, Type *Ty,
- unsigned AS) const {
- // AddrMode - This represents an addressing mode of:
- // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
- //
- // The legal address modes are
- // - [avar]
- // - [areg]
- // - [areg+immoff]
- // - [immAddr]
-
- if (AM.BaseGV) {
- return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
- }
-
- switch (AM.Scale) {
- case 0: // "r", "r+i" or "i" is allowed
- break;
- case 1:
- if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
- return false;
- // Otherwise we have r+i.
- break;
- default:
- // No scale > 1 is allowed
- return false;
- }
- return true;
-}
-
-//===----------------------------------------------------------------------===//
-// NVPTX Inline Assembly Support
-//===----------------------------------------------------------------------===//
-
-/// getConstraintType - Given a constraint letter, return the type of
-/// constraint it is for this target.
-NVPTXTargetLowering::ConstraintType
-NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- default:
- break;
- case 'b':
- case 'r':
- case 'h':
- case 'c':
- case 'l':
- case 'f':
- case 'd':
- case '0':
- case 'N':
- return C_RegisterClass;
- }
- }
- return TargetLowering::getConstraintType(Constraint);
-}
-
-std::pair<unsigned, const TargetRegisterClass *>
-NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
- StringRef Constraint,
- MVT VT) const {
- if (Constraint.size() == 1) {
- switch (Constraint[0]) {
- case 'b':
- return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
- case 'c':
- return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
- case 'h':
- return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
- case 'r':
- return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
- case 'l':
- case 'N':
- return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
- case 'f':
- return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
- case 'd':
- return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
- }
- }
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-}
-
-//===----------------------------------------------------------------------===//
-// NVPTX DAG Combining
-//===----------------------------------------------------------------------===//
-
-bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
- CodeGenOpt::Level OptLevel) const {
- const Function *F = MF.getFunction();
- const TargetOptions &TO = MF.getTarget().Options;
-
- // Always honor command-line argument
- if (FMAContractLevelOpt.getNumOccurrences() > 0) {
- return FMAContractLevelOpt > 0;
- } else if (OptLevel == 0) {
- // Do not contract if we're not optimizing the code
- return false;
- } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
- // Honor TargetOptions flags that explicitly say fusion is okay
- return true;
- } else if (F->hasFnAttribute("unsafe-fp-math")) {
- // Check for unsafe-fp-math=true coming from Clang
- Attribute Attr = F->getFnAttribute("unsafe-fp-math");
- StringRef Val = Attr.getValueAsString();
- if (Val == "true")
- return true;
- }
-
- // We did not have a clear indication that fusion is allowed, so assume not
- return false;
-}
-
-/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
-/// operands N0 and N1. This is a helper for PerformADDCombine that is
-/// called with the default operands, and if that fails, with commuted
-/// operands.
-static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
- TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget,
- CodeGenOpt::Level OptLevel) {
- SelectionDAG &DAG = DCI.DAG;
- // Skip the vector case; this combine handles scalar types only.
- EVT VT = N0.getValueType();
- if (VT.isVector())
- return SDValue();
-
- // fold (add (mul a, b), c) -> (mad a, b, c)
- //
- if (N0.getOpcode() == ISD::MUL) {
- assert(VT.isInteger());
- // For integer:
- // Integer multiply-add costs the same as an integer multiply but
- // more than an integer add, so do the fusion only when the mul's
- // sole use is the add.
- if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
- !N0.getNode()->hasOneUse())
- return SDValue();
-
- // Do the folding
- return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
- }
- else if (N0.getOpcode() == ISD::FMUL) {
- if (VT == MVT::f32 || VT == MVT::f64) {
- const auto *TLI = static_cast<const NVPTXTargetLowering *>(
- &DAG.getTargetLoweringInfo());
- if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
- return SDValue();
-
- // For floating point:
- // Do the fusion only when the mul has fewer than 5 uses and all
- // of them are adds.
- // The heuristic is that if a use is not an add, that use cannot
- // be fused into an fma, so the mul is still needed anyway.
- // If there are more than 4 uses, fusing them (even if they are
- // all adds) will increase register pressure.
- //
- int numUses = 0;
- int nonAddCount = 0;
- for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
- UE = N0.getNode()->use_end();
- UI != UE; ++UI) {
- numUses++;
- SDNode *User = *UI;
- if (User->getOpcode() != ISD::FADD)
- ++nonAddCount;
- }
- if (numUses >= 5)
- return SDValue();
- if (nonAddCount) {
- int orderNo = N->getIROrder();
- int orderNo2 = N0.getNode()->getIROrder();
- // A simple heuristic for potential register pressure: the
- // difference in IR order approximates the distance between the
- // def and the use, and a longer distance is more likely to
- // cause register pressure.
- if (orderNo - orderNo2 < 500)
- return SDValue();
-
- // Now, check if at least one of the FMUL's operands is live beyond
- // node N, which guarantees that the FMA will not increase register
- // pressure at node N.
- bool opIsLive = false;
- const SDNode *left = N0.getOperand(0).getNode();
- const SDNode *right = N0.getOperand(1).getNode();
-
- if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
- opIsLive = true;
-
- if (!opIsLive)
- for (SDNode::use_iterator UI = left->use_begin(), UE = left->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
- }
-
- if (!opIsLive)
- for (SDNode::use_iterator UI = right->use_begin(), UE = right->use_end();
- UI != UE; ++UI) {
- SDNode *User = *UI;
- int orderNo3 = User->getIROrder();
- if (orderNo3 > orderNo) {
- opIsLive = true;
- break;
- }
- }
-
- if (!opIsLive)
- return SDValue();
- }
-
- return DAG.getNode(ISD::FMA, SDLoc(N), VT,
- N0.getOperand(0), N0.getOperand(1), N1);
- }
- }
-
- return SDValue();
-}
-
-/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
-///
-static SDValue PerformADDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const NVPTXSubtarget &Subtarget,
- CodeGenOpt::Level OptLevel) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- // First try with the default operand order.
- if (SDValue Result =
- PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
- return Result;
-
- // If that didn't work, try again with the operands commuted.
- return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
-}
-
-static SDValue PerformANDCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // The type legalizer turns a vector load of i8 values into a zextload to i16
- // registers, optionally ANY_EXTENDs it (if target type is integer),
- // and ANDs off the high 8 bits. Since we turn this load into a
- // target-specific DAG node, the DAG combiner fails to eliminate these AND
- // nodes. Do that here.
- SDValue Val = N->getOperand(0);
- SDValue Mask = N->getOperand(1);
-
- if (isa<ConstantSDNode>(Val)) {
- std::swap(Val, Mask);
- }
-
- SDValue AExt;
- // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
- if (Val.getOpcode() == ISD::ANY_EXTEND) {
- AExt = Val;
- Val = Val->getOperand(0);
- }
-
- if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
- Val = Val->getOperand(0);
- }
-
- if (Val->getOpcode() == NVPTXISD::LoadV2 ||
- Val->getOpcode() == NVPTXISD::LoadV4) {
- ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
- if (!MaskCnst) {
- // Not an AND with a constant
- return SDValue();
- }
-
- uint64_t MaskVal = MaskCnst->getZExtValue();
- if (MaskVal != 0xff) {
- // Not an AND that chops off top 8 bits
- return SDValue();
- }
-
- MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
- if (!Mem) {
- // Not a MemSDNode?!?
- return SDValue();
- }
-
- EVT MemVT = Mem->getMemoryVT();
- if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
- // We only handle the i8 case
- return SDValue();
- }
-
- unsigned ExtType =
- cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
- getZExtValue();
- if (ExtType == ISD::SEXTLOAD) {
- // If for some reason the load is a sextload, the and is needed to zero
- // out the high 8 bits
- return SDValue();
- }
-
- bool AddTo = false;
- if (AExt.getNode() != nullptr) {
- // Re-insert the ext as a zext.
- Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
- AExt.getValueType(), Val);
- AddTo = true;
- }
-
- // If we get here, the AND is unnecessary. Just replace it with the load
- DCI.CombineTo(N, Val, AddTo);
- }
-
- return SDValue();
-}
-
-static SDValue PerformSELECTCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- // Currently this detects patterns for integer min and max and
- // lowers them to PTX-specific intrinsics that enable hardware
- // support.
-
- const SDValue Cond = N->getOperand(0);
- if (Cond.getOpcode() != ISD::SETCC) return SDValue();
-
- const SDValue LHS = Cond.getOperand(0);
- const SDValue RHS = Cond.getOperand(1);
- const SDValue True = N->getOperand(1);
- const SDValue False = N->getOperand(2);
- if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
- return SDValue();
-
- const EVT VT = N->getValueType(0);
- if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
-
- const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- SDValue Larger; // The larger of LHS and RHS when condition is true.
- switch (CC) {
- case ISD::SETULT:
- case ISD::SETULE:
- case ISD::SETLT:
- case ISD::SETLE:
- Larger = RHS;
- break;
-
- case ISD::SETGT:
- case ISD::SETGE:
- case ISD::SETUGT:
- case ISD::SETUGE:
- Larger = LHS;
- break;
-
- default:
- return SDValue();
- }
- const bool IsMax = (Larger == True);
- const bool IsSigned = ISD::isSignedIntSetCC(CC);
-
- unsigned IntrinsicId;
- if (VT == MVT::i32) {
- if (IsSigned)
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
- else
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
- } else {
- assert(VT == MVT::i64);
- if (IsSigned)
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
- else
- IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
- }
-
- SDLoc DL(N);
- return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
-}
-
-static SDValue PerformREMCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
-
- // Don't do anything at less than -O2.
- if (OptLevel < CodeGenOpt::Default)
- return SDValue();
-
- SelectionDAG &DAG = DCI.DAG;
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
- bool IsSigned = N->getOpcode() == ISD::SREM;
- unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
-
- const SDValue &Num = N->getOperand(0);
- const SDValue &Den = N->getOperand(1);
-
- for (const SDNode *U : Num->uses()) {
- if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
- U->getOperand(1) == Den) {
- // Num % Den -> Num - (Num / Den) * Den
- return DAG.getNode(ISD::SUB, DL, VT, Num,
- DAG.getNode(ISD::MUL, DL, VT,
- DAG.getNode(DivOpc, DL, VT, Num, Den),
- Den));
- }
- }
- return SDValue();
-}
-
-enum OperandSignedness {
- Signed = 0,
- Unsigned,
- Unknown
-};
-
-/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
-/// that can be demoted to \p OptSize bits without loss of information. The
-/// signedness of the operand, if determinable, is placed in \p S.
-static bool IsMulWideOperandDemotable(SDValue Op,
- unsigned OptSize,
- OperandSignedness &S) {
- S = Unknown;
-
- if (Op.getOpcode() == ISD::SIGN_EXTEND ||
- Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
- EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getSizeInBits() <= OptSize) {
- S = Signed;
- return true;
- }
- } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
- EVT OrigVT = Op.getOperand(0).getValueType();
- if (OrigVT.getSizeInBits() <= OptSize) {
- S = Unsigned;
- return true;
- }
- }
-
- return false;
-}
-
-/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
-/// be demoted to \p OptSize bits without loss of information. If the operands
-/// contain a constant, it should appear as the RHS operand. The signedness of
-/// the operands is placed in \p IsSigned.
-static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
- unsigned OptSize,
- bool &IsSigned) {
- OperandSignedness LHSSign;
-
- // The LHS operand must be a demotable op
- if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
- return false;
-
- // We should have been able to determine the signedness from the LHS
- if (LHSSign == Unknown)
- return false;
-
- IsSigned = (LHSSign == Signed);
-
- // The RHS can be a demotable op or a constant
- if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
- const APInt &Val = CI->getAPIntValue();
- if (LHSSign == Unsigned) {
- return Val.isIntN(OptSize);
- } else {
- return Val.isSignedIntN(OptSize);
- }
- } else {
- OperandSignedness RHSSign;
- if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
- return false;
-
- return LHSSign == RHSSign;
- }
-}
-
-/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
-/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
-/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
-/// amount.
-static SDValue TryMULWIDECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
- EVT MulType = N->getValueType(0);
- if (MulType != MVT::i32 && MulType != MVT::i64) {
- return SDValue();
- }
-
- SDLoc DL(N);
- unsigned OptSize = MulType.getSizeInBits() >> 1;
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- // Canonicalize the multiply so the constant (if any) is on the right
- if (N->getOpcode() == ISD::MUL) {
- if (isa<ConstantSDNode>(LHS)) {
- std::swap(LHS, RHS);
- }
- }
-
- // If we have a SHL, determine the actual multiply amount
- if (N->getOpcode() == ISD::SHL) {
- ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
- if (!ShlRHS) {
- return SDValue();
- }
-
- APInt ShiftAmt = ShlRHS->getAPIntValue();
- unsigned BitWidth = MulType.getSizeInBits();
- if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
- APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
- RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
- } else {
- return SDValue();
- }
- }
-
- bool Signed;
- // Verify that our operands are demotable
- if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
- return SDValue();
- }
-
- EVT DemotedVT;
- if (MulType == MVT::i32) {
- DemotedVT = MVT::i16;
- } else {
- DemotedVT = MVT::i32;
- }
-
- // Truncate the operands to the correct size. Note that these are just for
- // type consistency and will (likely) be eliminated in later phases.
- SDValue TruncLHS =
- DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
- SDValue TruncRHS =
- DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
-
- unsigned Opc;
- if (Signed) {
- Opc = NVPTXISD::MUL_WIDE_SIGNED;
- } else {
- Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
- }
-
- return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
-}
-
-/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
-static SDValue PerformMULCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- if (OptLevel > 0) {
- // Try mul.wide combining at OptLevel > 0
- if (SDValue Ret = TryMULWIDECombine(N, DCI))
- return Ret;
- }
-
- return SDValue();
-}
-
-/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
-static SDValue PerformSHLCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- CodeGenOpt::Level OptLevel) {
- if (OptLevel > 0) {
- // Try mul.wide combining at OptLevel > 0
- if (SDValue Ret = TryMULWIDECombine(N, DCI))
- return Ret;
- }
-
- return SDValue();
-}
-
-SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
- switch (N->getOpcode()) {
- default: break;
- case ISD::ADD:
- case ISD::FADD:
- return PerformADDCombine(N, DCI, STI, OptLevel);
- case ISD::MUL:
- return PerformMULCombine(N, DCI, OptLevel);
- case ISD::SHL:
- return PerformSHLCombine(N, DCI, OptLevel);
- case ISD::AND:
- return PerformANDCombine(N, DCI);
- case ISD::SELECT:
- return PerformSELECTCombine(N, DCI);
- case ISD::UREM:
- case ISD::SREM:
- return PerformREMCombine(N, DCI, OptLevel);
- }
- return SDValue();
-}
-
-/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
-static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results) {
- EVT ResVT = N->getValueType(0);
- SDLoc DL(N);
-
- assert(ResVT.isVector() && "Vector load must have vector type");
-
- // We only handle "native" vector sizes for now, e.g. <4 x double> is not
- // legal. We can (and should) split that into 2 loads of <2 x double> here
- // but I'm leaving that as a TODO for now.
- assert(ResVT.isSimple() && "Can only handle simple types");
- switch (ResVT.getSimpleVT().SimpleTy) {
- default:
- return;
- case MVT::v2i8:
- case MVT::v2i16:
- case MVT::v2i32:
- case MVT::v2i64:
- case MVT::v2f32:
- case MVT::v2f64:
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v4i32:
- case MVT::v4f32:
- // This is a "native" vector type
- break;
- }
-
- LoadSDNode *LD = cast<LoadSDNode>(N);
-
- unsigned Align = LD->getAlignment();
- auto &TD = DAG.getDataLayout();
- unsigned PrefAlign =
- TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
- if (Align < PrefAlign) {
- // This load is not sufficiently aligned, so bail out and let this vector
- // load be scalarized. Note that we may still be able to emit smaller
- // vector loads. For example, if we are loading a <4 x float> with an
- // alignment of 8, this check will fail but the legalizer will try again
- // with 2 x <2 x float>, which will succeed with an alignment of 8.
- return;
- }
-
- EVT EltVT = ResVT.getVectorElementType();
- unsigned NumElts = ResVT.getVectorNumElements();
-
- // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- bool NeedTrunc = false;
- if (EltVT.getSizeInBits() < 16) {
- EltVT = MVT::i16;
- NeedTrunc = true;
- }
-
- unsigned Opcode = 0;
- SDVTList LdResVTs;
-
- switch (NumElts) {
- default:
- return;
- case 2:
- Opcode = NVPTXISD::LoadV2;
- LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
- break;
- case 4: {
- Opcode = NVPTXISD::LoadV4;
- EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- }
-
- // Copy regular operands
- SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
-
- // The select routine does not have access to the LoadSDNode instance, so
- // pass along the extension information
- OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
-
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- LD->getMemoryVT(),
- LD->getMemOperand());
-
- SmallVector<SDValue, 4> ScalarRes;
-
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
- }
-
- SDValue LoadChain = NewLD.getValue(NumElts);
-
- SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
-
- Results.push_back(BuildVec);
- Results.push_back(LoadChain);
-}
-
-static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &Results) {
- SDValue Chain = N->getOperand(0);
- SDValue Intrin = N->getOperand(1);
- SDLoc DL(N);
-
- // Get the intrinsic ID
- unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p: {
- EVT ResVT = N->getValueType(0);
-
- if (ResVT.isVector()) {
- // Vector LDG/LDU
-
- unsigned NumElts = ResVT.getVectorNumElements();
- EVT EltVT = ResVT.getVectorElementType();
-
- // Since LDU/LDG are target nodes, we cannot rely on DAG type
- // legalization.
- // Therefore, we must ensure the type is legal. For i1 and i8, we set the
- // loaded type to i16 and propagate the "real" type as the memory type.
- bool NeedTrunc = false;
- if (EltVT.getSizeInBits() < 16) {
- EltVT = MVT::i16;
- NeedTrunc = true;
- }
-
- unsigned Opcode = 0;
- SDVTList LdResVTs;
-
- switch (NumElts) {
- default:
- return;
- case 2:
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- Opcode = NVPTXISD::LDGV2;
- break;
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
- Opcode = NVPTXISD::LDUV2;
- break;
- }
- LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
- break;
- case 4: {
- switch (IntrinNo) {
- default:
- return;
- case Intrinsic::nvvm_ldg_global_i:
- case Intrinsic::nvvm_ldg_global_f:
- case Intrinsic::nvvm_ldg_global_p:
- Opcode = NVPTXISD::LDGV4;
- break;
- case Intrinsic::nvvm_ldu_global_i:
- case Intrinsic::nvvm_ldu_global_f:
- case Intrinsic::nvvm_ldu_global_p:
- Opcode = NVPTXISD::LDUV4;
- break;
- }
- EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
- LdResVTs = DAG.getVTList(ListVTs);
- break;
- }
- }
-
- SmallVector<SDValue, 8> OtherOps;
-
- // Copy regular operands
-
- OtherOps.push_back(Chain); // Chain
- // Skip operand 1 (intrinsic ID)
- // Others
- OtherOps.append(N->op_begin() + 2, N->op_end());
-
- MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
-
- SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
- MemSD->getMemoryVT(),
- MemSD->getMemOperand());
-
- SmallVector<SDValue, 4> ScalarRes;
-
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Res = NewLD.getValue(i);
- if (NeedTrunc)
- Res =
- DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
- ScalarRes.push_back(Res);
- }
-
- SDValue LoadChain = NewLD.getValue(NumElts);
-
- SDValue BuildVec =
- DAG.getBuildVector(ResVT, DL, ScalarRes);
-
- Results.push_back(BuildVec);
- Results.push_back(LoadChain);
- } else {
- // i8 LDG/LDU
- assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
- "Custom handling of non-i8 ldu/ldg?");
-
- // Just copy all operands as-is
- SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
-
- // Force output to i16
- SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
-
- MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
-
- // We make sure the memory type is i8, which will be used during isel
- // to select the proper instruction.
- SDValue NewLD =
- DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
- MVT::i8, MemSD->getMemOperand());
-
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
- NewLD.getValue(0)));
- Results.push_back(NewLD.getValue(1));
- }
- }
- }
-}
-
-void NVPTXTargetLowering::ReplaceNodeResults(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
- switch (N->getOpcode()) {
- default:
- report_fatal_error("Unhandled custom legalization");
- case ISD::LOAD:
- ReplaceLoadVector(N, DAG, Results);
- return;
- case ISD::INTRINSIC_W_CHAIN:
- ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
- return;
- }
-}
-
-// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
-void NVPTXSection::anchor() {}
-
-NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
- delete static_cast<NVPTXSection *>(TextSection);
- delete static_cast<NVPTXSection *>(DataSection);
- delete static_cast<NVPTXSection *>(BSSSection);
- delete static_cast<NVPTXSection *>(ReadOnlySection);
-
- delete static_cast<NVPTXSection *>(StaticCtorSection);
- delete static_cast<NVPTXSection *>(StaticDtorSection);
- delete static_cast<NVPTXSection *>(LSDASection);
- delete static_cast<NVPTXSection *>(EHFrameSection);
- delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
- delete static_cast<NVPTXSection *>(DwarfInfoSection);
- delete static_cast<NVPTXSection *>(DwarfLineSection);
- delete static_cast<NVPTXSection *>(DwarfFrameSection);
- delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
- delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
- delete static_cast<NVPTXSection *>(DwarfStrSection);
- delete static_cast<NVPTXSection *>(DwarfLocSection);
- delete static_cast<NVPTXSection *>(DwarfARangesSection);
- delete static_cast<NVPTXSection *>(DwarfRangesSection);
- delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
-}
-
-MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
- const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- return getDataSection();
-}
+//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that NVPTX uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXISelLowering.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "NVPTX.h"
+#include "NVPTXSection.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "NVPTXTargetObjectFile.h"
+#include "NVPTXUtilities.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetCallingConv.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "nvptx-lower"
+
+using namespace llvm;
+
+static unsigned int uniqueCallSite = 0;
+
+static cl::opt<bool> sched4reg(
+ "nvptx-sched4reg",
+ cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
+
+static cl::opt<unsigned>
+FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
+ " 1: do it 2: do it aggressively"),
+ cl::init(2));
+
+static cl::opt<int> UsePrecDivF32(
+ "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
+ " IEEE Compliant F32 div.rnd if available."),
+ cl::init(2));
+
+static cl::opt<bool> UsePrecSqrtF32(
+ "nvptx-prec-sqrtf32", cl::Hidden,
+ cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+ cl::init(true));
+
+static cl::opt<bool> FtzEnabled(
+ "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+ cl::init(false));
+
+int NVPTXTargetLowering::getDivF32Level() const {
+ if (UsePrecDivF32.getNumOccurrences() > 0) {
+ // If nvptx-prec-divf32=N is used on the command-line, always honor it
+ return UsePrecDivF32;
+ } else {
+ // Otherwise, use div.approx if fast math is enabled
+ if (getTargetMachine().Options.UnsafeFPMath)
+ return 0;
+ else
+ return 2;
+ }
+}
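+
+// For example (an illustrative invocation, not taken from this file):
+// running llc with -nvptx-prec-divf32=0 always selects div.approx.f32,
+// while the built-in default of 2 requests IEEE-compliant f32 division
+// when it is available.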
+
+bool NVPTXTargetLowering::usePrecSqrtF32() const {
+ if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+ // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+ return UsePrecSqrtF32;
+ } else {
+ // Otherwise, use sqrt.approx if fast math is enabled
+ return !getTargetMachine().Options.UnsafeFPMath;
+ }
+}
+
+bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
+ // TODO: Get rid of this flag; there can be only one way to do this.
+ if (FtzEnabled.getNumOccurrences() > 0) {
+ // If nvptx-f32ftz is used on the command-line, always honor it
+ return FtzEnabled;
+ } else {
+ const Function *F = MF.getFunction();
+ // Otherwise, check for an nvptx-f32ftz attribute on the function
+ if (F->hasFnAttribute("nvptx-f32ftz"))
+ return F->getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
+ else
+ return false;
+ }
+}
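+
+// For example (illustrative IR, not from this file): a function whose
+// attributes include "nvptx-f32ftz"="true" will have its f32 operations
+// emitted with the .ftz modifier, flushing subnormal results to
+// sign-preserving zero.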
+
+static bool IsPTXVectorType(MVT VT) {
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::v2i1:
+ case MVT::v4i1:
+ case MVT::v2i8:
+ case MVT::v4i8:
+ case MVT::v2i16:
+ case MVT::v4i16:
+ case MVT::v2i32:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v2f16:
+ case MVT::v4f16:
+ case MVT::v8f16: // <4 x f16x2>
+ case MVT::v2f32:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return true;
+ }
+}
+
+/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
+/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
+/// into their primitive components.
+/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+/// LowerCall, and LowerReturn.
+static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<uint64_t> *Offsets = nullptr,
+ uint64_t StartingOffset = 0) {
+ SmallVector<EVT, 16> TempVTs;
+ SmallVector<uint64_t, 16> TempOffsets;
+
+ ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
+ for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
+ EVT VT = TempVTs[i];
+ uint64_t Off = TempOffsets[i];
+ // Split vectors into individual elements, except for v2f16, which
+ // we will pass as a single scalar.
+ if (VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ // Vectors with an even number of f16 elements will be passed to
+ // us as an array of v2f16 elements. We must match this so we
+ // stay in sync with Ins/Outs.
+ if (EltVT == MVT::f16 && NumElts % 2 == 0) {
+ EltVT = MVT::v2f16;
+ NumElts /= 2;
+ }
+ for (unsigned j = 0; j != NumElts; ++j) {
+ ValueVTs.push_back(EltVT);
+ if (Offsets)
+ Offsets->push_back(Off + j * EltVT.getStoreSize());
+ }
+ } else {
+ ValueVTs.push_back(VT);
+ if (Offsets)
+ Offsets->push_back(Off);
+ }
+ }
+}
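+
+// For example (an illustrative walkthrough of the routine above): a
+// <4 x float> value yields four f32 pieces at offsets 0, 4, 8, and 12,
+// while a <4 x half> value yields two v2f16 pieces at offsets 0 and 4,
+// since even-length f16 vectors are kept as f16x2 pairs.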
+
+// Check whether we can merge loads/stores of some of the pieces of a
+// flattened function parameter or return value into a single vector
+// load/store.
+//
+// The flattened parameter is represented as a list of EVTs and
+// offsets, and the whole structure is aligned to ParamAlignment. This
+// function determines whether we can load/store pieces of the
+// parameter starting at index Idx using a single vectorized op of
+// size AccessSize. If so, it returns the number of param pieces
+// covered by the vector op. Otherwise, it returns 1.
+static unsigned CanMergeParamLoadStoresStartingAt(
+ unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
+ const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
+ assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
+
+ // Can't vectorize if param alignment is not sufficient.
+ if (AccessSize > ParamAlignment)
+ return 1;
+ // Can't vectorize if offset is not aligned.
+ if (Offsets[Idx] & (AccessSize - 1))
+ return 1;
+
+ EVT EltVT = ValueVTs[Idx];
+ unsigned EltSize = EltVT.getStoreSize();
+
+ // Element is too large to vectorize.
+ if (EltSize >= AccessSize)
+ return 1;
+
+ unsigned NumElts = AccessSize / EltSize;
+ // Can't vectorize if AccessSize is not a multiple of EltSize.
+ if (AccessSize != EltSize * NumElts)
+ return 1;
+
+ // We don't have enough elements to vectorize.
+ if (Idx + NumElts > ValueVTs.size())
+ return 1;
+
+ // PTX ISA can only deal with 2- and 4-element vector ops.
+ if (NumElts != 4 && NumElts != 2)
+ return 1;
+
+ for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
+ // Types do not match.
+ if (ValueVTs[j] != EltVT)
+ return 1;
+
+ // Elements are not contiguous.
+ if (Offsets[j] - Offsets[j - 1] != EltSize)
+ return 1;
+ }
+ // OK. We can vectorize ValueVTs[Idx..Idx+NumElts).
+ return NumElts;
+}
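+
+// For example (an illustrative walkthrough): with ValueVTs = {f32, f32,
+// f32, f32}, Offsets = {0, 4, 8, 12}, and ParamAlignment = 16, a query at
+// Idx = 0 with AccessSize = 16 returns 4, since one 128-bit access covers
+// all four pieces; with ParamAlignment = 4 it returns 1.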
+
+// Flags for tracking per-element vectorization state of loads/stores
+// of a flattened function parameter or return value.
+enum ParamVectorizationFlags {
+ PVF_INNER = 0x0, // Middle elements of a vector.
+ PVF_FIRST = 0x1, // First element of the vector.
+ PVF_LAST = 0x2, // Last element of the vector.
+ // Scalar is effectively a 1-element vector.
+ PVF_SCALAR = PVF_FIRST | PVF_LAST
+};
+
+// Computes whether and how we can vectorize the loads/stores of a
+// flattened function parameter or return value.
+//
+// The flattened parameter is represented as the list of ValueVTs and
+// Offsets, and is aligned to ParamAlignment bytes. We return a vector
+// of the same size as ValueVTs indicating how each piece should be
+// loaded/stored (i.e. as a scalar, or as part of a vector
+// load/store).
+static SmallVector<ParamVectorizationFlags, 16>
+VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
+ const SmallVectorImpl<uint64_t> &Offsets,
+ unsigned ParamAlignment) {
+ // Set vector size to match ValueVTs and mark all elements as
+ // scalars by default.
+ SmallVector<ParamVectorizationFlags, 16> VectorInfo;
+ VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
+
+ // Check what we can vectorize using 128/64/32-bit accesses.
+ for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
+ // Skip elements we've already processed.
+ assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
+ for (unsigned AccessSize : {16, 8, 4, 2}) {
+ unsigned NumElts = CanMergeParamLoadStoresStartingAt(
+ I, AccessSize, ValueVTs, Offsets, ParamAlignment);
+ // Mark vectorized elements.
+ switch (NumElts) {
+ default:
+ llvm_unreachable("Unexpected return value");
+ case 1:
+ // Can't vectorize using this size, try next smaller size.
+ continue;
+ case 2:
+ assert(I + 1 < E && "Not enough elements.");
+ VectorInfo[I] = PVF_FIRST;
+ VectorInfo[I + 1] = PVF_LAST;
+ I += 1;
+ break;
+ case 4:
+ assert(I + 3 < E && "Not enough elements.");
+ VectorInfo[I] = PVF_FIRST;
+ VectorInfo[I + 1] = PVF_INNER;
+ VectorInfo[I + 2] = PVF_INNER;
+ VectorInfo[I + 3] = PVF_LAST;
+ I += 3;
+ break;
+ }
+ // Break out of the inner loop because we've already succeeded
+ // at the largest possible AccessSize.
+ break;
+ }
+ }
+ return VectorInfo;
+}
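+
+// For example (an illustrative walkthrough): the four contiguous f32
+// pieces above, aligned to 16 bytes, come back as {PVF_FIRST, PVF_INNER,
+// PVF_INNER, PVF_LAST}, i.e. a single 128-bit vector op; with only 8-byte
+// alignment they come back as two 64-bit pairs, {PVF_FIRST, PVF_LAST,
+// PVF_FIRST, PVF_LAST}.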
+
+// NVPTXTargetLowering Constructor.
+NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
+ const NVPTXSubtarget &STI)
+ : TargetLowering(TM), nvTM(&TM), STI(STI) {
+ // Always lower memset, memcpy, and memmove intrinsics to load/store
+ // instructions, rather than generating calls to memset, memcpy, or
+ // memmove.
+ MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
+ MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
+ MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
+
+ setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
+ // Jump is Expensive. Don't create extra control flow for 'and', 'or'
+ // condition branches.
+ setJumpIsExpensive(true);
+
+ // Wide divides are _very_ slow. Try to reduce the width of the divide if
+ // possible.
+ addBypassSlowDiv(64, 32);
+
+ // By default, use Source scheduling; sched4reg selects
+ // register-pressure scheduling instead.
+ if (sched4reg)
+ setSchedulingPreference(Sched::RegPressure);
+ else
+ setSchedulingPreference(Sched::Source);
+
+ auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
+ LegalizeAction NoF16Action) {
+ setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
+ };
+
+ addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
+ addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
+ addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
+ addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
+ addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
+ addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
+ addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
+ addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
+
+ // Conversion to/from FP16/FP16x2 is always legal.
+ setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+ setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
+ setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
+
+ // Operations not directly supported by NVPTX.
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i8, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ // Some SIGN_EXTEND_INREG operations can be done using the cvt
+ // instruction. For the others we will expand to a SHL/SRA pair.
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+ setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+ setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+
+ if (STI.hasROT64()) {
+ setOperationAction(ISD::ROTL, MVT::i64, Legal);
+ setOperationAction(ISD::ROTR, MVT::i64, Legal);
+ } else {
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+ }
+ if (STI.hasROT32()) {
+ setOperationAction(ISD::ROTL, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, MVT::i32, Legal);
+ } else {
+ setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ }
+
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::i8, Expand);
+ setOperationAction(ISD::ROTR, MVT::i8, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i16, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+ setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
+ // Indirect branch is not supported.
+ // This also disables Jump Table creation.
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
+ // We want to legalize constant-related memmove and memcpy
+ // intrinsics.
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+ // Turn FP extload into load/fpextend
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+ // Turn FP truncstore into trunc + store.
+ // FIXME: vector types should also be expanded
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+ // PTX does not support loads/stores of predicate registers
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setTruncStoreAction(VT, MVT::i1, Expand);
+ }
+
+ // This is legal in NVPTX
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
+
+ // TRAP can be lowered to PTX trap
+ setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+ setOperationAction(ISD::ADDC, MVT::i64, Expand);
+ setOperationAction(ISD::ADDE, MVT::i64, Expand);
+
+ // Register custom handling for vector loads/stores
+ for (MVT VT : MVT::vector_valuetypes()) {
+ if (IsPTXVectorType(VT)) {
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
+ }
+ }
+
+ // Custom handling for i8 intrinsics
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+ for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
+ setOperationAction(ISD::ABS, Ty, Legal);
+ setOperationAction(ISD::SMIN, Ty, Legal);
+ setOperationAction(ISD::SMAX, Ty, Legal);
+ setOperationAction(ISD::UMIN, Ty, Legal);
+ setOperationAction(ISD::UMAX, Ty, Legal);
+
+ setOperationAction(ISD::CTPOP, Ty, Legal);
+ setOperationAction(ISD::CTLZ, Ty, Legal);
+ }
+
+ setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+ setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+
+ // PTX does not directly support SELP of i1, so promote to i32 first
+ setOperationAction(ISD::SELECT, MVT::i1, Custom);
+
+ // PTX cannot multiply two i64s in a single instruction.
+ setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+
+ // We have some custom DAG combine patterns for these nodes
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SREM);
+ setTargetDAGCombine(ISD::UREM);
+
+ // setcc for f16x2 needs special handling to prevent the legalizer
+ // from scalarizing it, since v2i1 is not a legal type.
+ if (STI.allowFP16Math())
+ setTargetDAGCombine(ISD::SETCC);
+
+ // Promote fp16 arithmetic if fp16 hardware isn't available or the
+ // user passed --nvptx-no-fp16-math. The flag is useful because,
+ // although sm_53+ GPUs have some sort of FP16 support in hardware,
+ // only sm_53 and sm_60 have a full implementation. The others have
+ // only a token amount of FP16 hardware and are likely to run
+ // faster by using the fp32 units instead.
+ for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
+ setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+ setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
+ }
+
+ // There's no neg.f16 instruction. Expand to (0-x).
+ setOperationAction(ISD::FNEG, MVT::f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
+
+ // (would be) Library functions.
+
+ // These map to conversion instructions for scalar FP types.
+ for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
+ ISD::FROUND, ISD::FTRUNC}) {
+ setOperationAction(Op, MVT::f16, Legal);
+ setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::f64, Legal);
+ setOperationAction(Op, MVT::v2f16, Expand);
+ }
+
+ // 'Expand' implements FCOPYSIGN without calling an external library.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+
+ // These map to corresponding instructions for f32/f64. f16 must be
+ // promoted to f32. v2f16 is expanded to f16, which is then promoted
+ // to f32.
+ for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
+ ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
+ setOperationAction(Op, MVT::f16, Promote);
+ setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::f64, Legal);
+ setOperationAction(Op, MVT::v2f16, Expand);
+ }
+ setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+ setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
+
+ // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
+ // No FPOW or FREM in PTX.
+
+ // Now deduce the remaining register properties from the actions
+ // set above.
+ computeRegisterProperties(STI.getRegisterInfo());
+}
+
+const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch ((NVPTXISD::NodeType)Opcode) {
+ case NVPTXISD::FIRST_NUMBER:
+ break;
+ case NVPTXISD::CALL:
+ return "NVPTXISD::CALL";
+ case NVPTXISD::RET_FLAG:
+ return "NVPTXISD::RET_FLAG";
+ case NVPTXISD::LOAD_PARAM:
+ return "NVPTXISD::LOAD_PARAM";
+ case NVPTXISD::Wrapper:
+ return "NVPTXISD::Wrapper";
+ case NVPTXISD::DeclareParam:
+ return "NVPTXISD::DeclareParam";
+ case NVPTXISD::DeclareScalarParam:
+ return "NVPTXISD::DeclareScalarParam";
+ case NVPTXISD::DeclareRet:
+ return "NVPTXISD::DeclareRet";
+ case NVPTXISD::DeclareScalarRet:
+ return "NVPTXISD::DeclareScalarRet";
+ case NVPTXISD::DeclareRetParam:
+ return "NVPTXISD::DeclareRetParam";
+ case NVPTXISD::PrintCall:
+ return "NVPTXISD::PrintCall";
+ case NVPTXISD::PrintConvergentCall:
+ return "NVPTXISD::PrintConvergentCall";
+ case NVPTXISD::PrintCallUni:
+ return "NVPTXISD::PrintCallUni";
+ case NVPTXISD::PrintConvergentCallUni:
+ return "NVPTXISD::PrintConvergentCallUni";
+ case NVPTXISD::LoadParam:
+ return "NVPTXISD::LoadParam";
+ case NVPTXISD::LoadParamV2:
+ return "NVPTXISD::LoadParamV2";
+ case NVPTXISD::LoadParamV4:
+ return "NVPTXISD::LoadParamV4";
+ case NVPTXISD::StoreParam:
+ return "NVPTXISD::StoreParam";
+ case NVPTXISD::StoreParamV2:
+ return "NVPTXISD::StoreParamV2";
+ case NVPTXISD::StoreParamV4:
+ return "NVPTXISD::StoreParamV4";
+ case NVPTXISD::StoreParamS32:
+ return "NVPTXISD::StoreParamS32";
+ case NVPTXISD::StoreParamU32:
+ return "NVPTXISD::StoreParamU32";
+ case NVPTXISD::CallArgBegin:
+ return "NVPTXISD::CallArgBegin";
+ case NVPTXISD::CallArg:
+ return "NVPTXISD::CallArg";
+ case NVPTXISD::LastCallArg:
+ return "NVPTXISD::LastCallArg";
+ case NVPTXISD::CallArgEnd:
+ return "NVPTXISD::CallArgEnd";
+ case NVPTXISD::CallVoid:
+ return "NVPTXISD::CallVoid";
+ case NVPTXISD::CallVal:
+ return "NVPTXISD::CallVal";
+ case NVPTXISD::CallSymbol:
+ return "NVPTXISD::CallSymbol";
+ case NVPTXISD::Prototype:
+ return "NVPTXISD::Prototype";
+ case NVPTXISD::MoveParam:
+ return "NVPTXISD::MoveParam";
+ case NVPTXISD::StoreRetval:
+ return "NVPTXISD::StoreRetval";
+ case NVPTXISD::StoreRetvalV2:
+ return "NVPTXISD::StoreRetvalV2";
+ case NVPTXISD::StoreRetvalV4:
+ return "NVPTXISD::StoreRetvalV4";
+ case NVPTXISD::PseudoUseParam:
+ return "NVPTXISD::PseudoUseParam";
+ case NVPTXISD::RETURN:
+ return "NVPTXISD::RETURN";
+ case NVPTXISD::CallSeqBegin:
+ return "NVPTXISD::CallSeqBegin";
+ case NVPTXISD::CallSeqEnd:
+ return "NVPTXISD::CallSeqEnd";
+ case NVPTXISD::CallPrototype:
+ return "NVPTXISD::CallPrototype";
+ case NVPTXISD::LoadV2:
+ return "NVPTXISD::LoadV2";
+ case NVPTXISD::LoadV4:
+ return "NVPTXISD::LoadV4";
+ case NVPTXISD::LDGV2:
+ return "NVPTXISD::LDGV2";
+ case NVPTXISD::LDGV4:
+ return "NVPTXISD::LDGV4";
+ case NVPTXISD::LDUV2:
+ return "NVPTXISD::LDUV2";
+ case NVPTXISD::LDUV4:
+ return "NVPTXISD::LDUV4";
+ case NVPTXISD::StoreV2:
+ return "NVPTXISD::StoreV2";
+ case NVPTXISD::StoreV4:
+ return "NVPTXISD::StoreV4";
+ case NVPTXISD::FUN_SHFL_CLAMP:
+ return "NVPTXISD::FUN_SHFL_CLAMP";
+ case NVPTXISD::FUN_SHFR_CLAMP:
+ return "NVPTXISD::FUN_SHFR_CLAMP";
+ case NVPTXISD::IMAD:
+ return "NVPTXISD::IMAD";
+ case NVPTXISD::SETP_F16X2:
+ return "NVPTXISD::SETP_F16X2";
+ case NVPTXISD::Dummy:
+ return "NVPTXISD::Dummy";
+ case NVPTXISD::MUL_WIDE_SIGNED:
+ return "NVPTXISD::MUL_WIDE_SIGNED";
+ case NVPTXISD::MUL_WIDE_UNSIGNED:
+ return "NVPTXISD::MUL_WIDE_UNSIGNED";
+ case NVPTXISD::Tex1DFloatS32: return "NVPTXISD::Tex1DFloatS32";
+ case NVPTXISD::Tex1DFloatFloat: return "NVPTXISD::Tex1DFloatFloat";
+ case NVPTXISD::Tex1DFloatFloatLevel:
+ return "NVPTXISD::Tex1DFloatFloatLevel";
+ case NVPTXISD::Tex1DFloatFloatGrad:
+ return "NVPTXISD::Tex1DFloatFloatGrad";
+ case NVPTXISD::Tex1DS32S32: return "NVPTXISD::Tex1DS32S32";
+ case NVPTXISD::Tex1DS32Float: return "NVPTXISD::Tex1DS32Float";
+ case NVPTXISD::Tex1DS32FloatLevel:
+ return "NVPTXISD::Tex1DS32FloatLevel";
+ case NVPTXISD::Tex1DS32FloatGrad:
+ return "NVPTXISD::Tex1DS32FloatGrad";
+ case NVPTXISD::Tex1DU32S32: return "NVPTXISD::Tex1DU32S32";
+ case NVPTXISD::Tex1DU32Float: return "NVPTXISD::Tex1DU32Float";
+ case NVPTXISD::Tex1DU32FloatLevel:
+ return "NVPTXISD::Tex1DU32FloatLevel";
+ case NVPTXISD::Tex1DU32FloatGrad:
+ return "NVPTXISD::Tex1DU32FloatGrad";
+ case NVPTXISD::Tex1DArrayFloatS32: return "NVPTXISD::Tex1DArrayFloatS32";
+ case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
+ case NVPTXISD::Tex1DArrayFloatFloatLevel:
+ return "NVPTXISD::Tex1DArrayFloatFloatLevel";
+ case NVPTXISD::Tex1DArrayFloatFloatGrad:
+ return "NVPTXISD::Tex1DArrayFloatFloatGrad";
+ case NVPTXISD::Tex1DArrayS32S32: return "NVPTXISD::Tex1DArrayS32S32";
+ case NVPTXISD::Tex1DArrayS32Float: return "NVPTXISD::Tex1DArrayS32Float";
+ case NVPTXISD::Tex1DArrayS32FloatLevel:
+ return "NVPTXISD::Tex1DArrayS32FloatLevel";
+ case NVPTXISD::Tex1DArrayS32FloatGrad:
+ return "NVPTXISD::Tex1DArrayS32FloatGrad";
+ case NVPTXISD::Tex1DArrayU32S32: return "NVPTXISD::Tex1DArrayU32S32";
+ case NVPTXISD::Tex1DArrayU32Float: return "NVPTXISD::Tex1DArrayU32Float";
+ case NVPTXISD::Tex1DArrayU32FloatLevel:
+ return "NVPTXISD::Tex1DArrayU32FloatLevel";
+ case NVPTXISD::Tex1DArrayU32FloatGrad:
+ return "NVPTXISD::Tex1DArrayU32FloatGrad";
+ case NVPTXISD::Tex2DFloatS32: return "NVPTXISD::Tex2DFloatS32";
+ case NVPTXISD::Tex2DFloatFloat: return "NVPTXISD::Tex2DFloatFloat";
+ case NVPTXISD::Tex2DFloatFloatLevel:
+ return "NVPTXISD::Tex2DFloatFloatLevel";
+ case NVPTXISD::Tex2DFloatFloatGrad:
+ return "NVPTXISD::Tex2DFloatFloatGrad";
+ case NVPTXISD::Tex2DS32S32: return "NVPTXISD::Tex2DS32S32";
+ case NVPTXISD::Tex2DS32Float: return "NVPTXISD::Tex2DS32Float";
+ case NVPTXISD::Tex2DS32FloatLevel:
+ return "NVPTXISD::Tex2DS32FloatLevel";
+ case NVPTXISD::Tex2DS32FloatGrad:
+ return "NVPTXISD::Tex2DS32FloatGrad";
+ case NVPTXISD::Tex2DU32S32: return "NVPTXISD::Tex2DU32S32";
+ case NVPTXISD::Tex2DU32Float: return "NVPTXISD::Tex2DU32Float";
+ case NVPTXISD::Tex2DU32FloatLevel:
+ return "NVPTXISD::Tex2DU32FloatLevel";
+ case NVPTXISD::Tex2DU32FloatGrad:
+ return "NVPTXISD::Tex2DU32FloatGrad";
+ case NVPTXISD::Tex2DArrayFloatS32: return "NVPTXISD::Tex2DArrayFloatS32";
+ case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+ case NVPTXISD::Tex2DArrayFloatFloatLevel:
+ return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+ case NVPTXISD::Tex2DArrayFloatFloatGrad:
+ return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+ case NVPTXISD::Tex2DArrayS32S32: return "NVPTXISD::Tex2DArrayS32S32";
+ case NVPTXISD::Tex2DArrayS32Float: return "NVPTXISD::Tex2DArrayS32Float";
+ case NVPTXISD::Tex2DArrayS32FloatLevel:
+ return "NVPTXISD::Tex2DArrayS32FloatLevel";
+ case NVPTXISD::Tex2DArrayS32FloatGrad:
+ return "NVPTXISD::Tex2DArrayS32FloatGrad";
+ case NVPTXISD::Tex2DArrayU32S32: return "NVPTXISD::Tex2DArrayU32S32";
+ case NVPTXISD::Tex2DArrayU32Float: return "NVPTXISD::Tex2DArrayU32Float";
+ case NVPTXISD::Tex2DArrayU32FloatLevel:
+ return "NVPTXISD::Tex2DArrayU32FloatLevel";
+ case NVPTXISD::Tex2DArrayU32FloatGrad:
+ return "NVPTXISD::Tex2DArrayU32FloatGrad";
+ case NVPTXISD::Tex3DFloatS32: return "NVPTXISD::Tex3DFloatS32";
+ case NVPTXISD::Tex3DFloatFloat: return "NVPTXISD::Tex3DFloatFloat";
+ case NVPTXISD::Tex3DFloatFloatLevel:
+ return "NVPTXISD::Tex3DFloatFloatLevel";
+ case NVPTXISD::Tex3DFloatFloatGrad:
+ return "NVPTXISD::Tex3DFloatFloatGrad";
+ case NVPTXISD::Tex3DS32S32: return "NVPTXISD::Tex3DS32S32";
+ case NVPTXISD::Tex3DS32Float: return "NVPTXISD::Tex3DS32Float";
+ case NVPTXISD::Tex3DS32FloatLevel:
+ return "NVPTXISD::Tex3DS32FloatLevel";
+ case NVPTXISD::Tex3DS32FloatGrad:
+ return "NVPTXISD::Tex3DS32FloatGrad";
+ case NVPTXISD::Tex3DU32S32: return "NVPTXISD::Tex3DU32S32";
+ case NVPTXISD::Tex3DU32Float: return "NVPTXISD::Tex3DU32Float";
+ case NVPTXISD::Tex3DU32FloatLevel:
+ return "NVPTXISD::Tex3DU32FloatLevel";
+ case NVPTXISD::Tex3DU32FloatGrad:
+ return "NVPTXISD::Tex3DU32FloatGrad";
+ case NVPTXISD::TexCubeFloatFloat: return "NVPTXISD::TexCubeFloatFloat";
+ case NVPTXISD::TexCubeFloatFloatLevel:
+ return "NVPTXISD::TexCubeFloatFloatLevel";
+ case NVPTXISD::TexCubeS32Float: return "NVPTXISD::TexCubeS32Float";
+ case NVPTXISD::TexCubeS32FloatLevel:
+ return "NVPTXISD::TexCubeS32FloatLevel";
+ case NVPTXISD::TexCubeU32Float: return "NVPTXISD::TexCubeU32Float";
+ case NVPTXISD::TexCubeU32FloatLevel:
+ return "NVPTXISD::TexCubeU32FloatLevel";
+ case NVPTXISD::TexCubeArrayFloatFloat:
+ return "NVPTXISD::TexCubeArrayFloatFloat";
+ case NVPTXISD::TexCubeArrayFloatFloatLevel:
+ return "NVPTXISD::TexCubeArrayFloatFloatLevel";
+ case NVPTXISD::TexCubeArrayS32Float:
+ return "NVPTXISD::TexCubeArrayS32Float";
+ case NVPTXISD::TexCubeArrayS32FloatLevel:
+ return "NVPTXISD::TexCubeArrayS32FloatLevel";
+ case NVPTXISD::TexCubeArrayU32Float:
+ return "NVPTXISD::TexCubeArrayU32Float";
+ case NVPTXISD::TexCubeArrayU32FloatLevel:
+ return "NVPTXISD::TexCubeArrayU32FloatLevel";
+ case NVPTXISD::Tld4R2DFloatFloat:
+ return "NVPTXISD::Tld4R2DFloatFloat";
+ case NVPTXISD::Tld4G2DFloatFloat:
+ return "NVPTXISD::Tld4G2DFloatFloat";
+ case NVPTXISD::Tld4B2DFloatFloat:
+ return "NVPTXISD::Tld4B2DFloatFloat";
+ case NVPTXISD::Tld4A2DFloatFloat:
+ return "NVPTXISD::Tld4A2DFloatFloat";
+ case NVPTXISD::Tld4R2DS64Float:
+ return "NVPTXISD::Tld4R2DS64Float";
+ case NVPTXISD::Tld4G2DS64Float:
+ return "NVPTXISD::Tld4G2DS64Float";
+ case NVPTXISD::Tld4B2DS64Float:
+ return "NVPTXISD::Tld4B2DS64Float";
+ case NVPTXISD::Tld4A2DS64Float:
+ return "NVPTXISD::Tld4A2DS64Float";
+ case NVPTXISD::Tld4R2DU64Float:
+ return "NVPTXISD::Tld4R2DU64Float";
+ case NVPTXISD::Tld4G2DU64Float:
+ return "NVPTXISD::Tld4G2DU64Float";
+ case NVPTXISD::Tld4B2DU64Float:
+ return "NVPTXISD::Tld4B2DU64Float";
+ case NVPTXISD::Tld4A2DU64Float:
+ return "NVPTXISD::Tld4A2DU64Float";
+
+ case NVPTXISD::TexUnified1DFloatS32:
+ return "NVPTXISD::TexUnified1DFloatS32";
+ case NVPTXISD::TexUnified1DFloatFloat:
+ return "NVPTXISD::TexUnified1DFloatFloat";
+ case NVPTXISD::TexUnified1DFloatFloatLevel:
+ return "NVPTXISD::TexUnified1DFloatFloatLevel";
+ case NVPTXISD::TexUnified1DFloatFloatGrad:
+ return "NVPTXISD::TexUnified1DFloatFloatGrad";
+ case NVPTXISD::TexUnified1DS32S32:
+ return "NVPTXISD::TexUnified1DS32S32";
+ case NVPTXISD::TexUnified1DS32Float:
+ return "NVPTXISD::TexUnified1DS32Float";
+ case NVPTXISD::TexUnified1DS32FloatLevel:
+ return "NVPTXISD::TexUnified1DS32FloatLevel";
+ case NVPTXISD::TexUnified1DS32FloatGrad:
+ return "NVPTXISD::TexUnified1DS32FloatGrad";
+ case NVPTXISD::TexUnified1DU32S32:
+ return "NVPTXISD::TexUnified1DU32S32";
+ case NVPTXISD::TexUnified1DU32Float:
+ return "NVPTXISD::TexUnified1DU32Float";
+ case NVPTXISD::TexUnified1DU32FloatLevel:
+ return "NVPTXISD::TexUnified1DU32FloatLevel";
+ case NVPTXISD::TexUnified1DU32FloatGrad:
+ return "NVPTXISD::TexUnified1DU32FloatGrad";
+ case NVPTXISD::TexUnified1DArrayFloatS32:
+ return "NVPTXISD::TexUnified1DArrayFloatS32";
+ case NVPTXISD::TexUnified1DArrayFloatFloat:
+ return "NVPTXISD::TexUnified1DArrayFloatFloat";
+ case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
+ case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
+ return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
+ case NVPTXISD::TexUnified1DArrayS32S32:
+ return "NVPTXISD::TexUnified1DArrayS32S32";
+ case NVPTXISD::TexUnified1DArrayS32Float:
+ return "NVPTXISD::TexUnified1DArrayS32Float";
+ case NVPTXISD::TexUnified1DArrayS32FloatLevel:
+ return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
+ case NVPTXISD::TexUnified1DArrayS32FloatGrad:
+ return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
+ case NVPTXISD::TexUnified1DArrayU32S32:
+ return "NVPTXISD::TexUnified1DArrayU32S32";
+ case NVPTXISD::TexUnified1DArrayU32Float:
+ return "NVPTXISD::TexUnified1DArrayU32Float";
+ case NVPTXISD::TexUnified1DArrayU32FloatLevel:
+ return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
+ case NVPTXISD::TexUnified1DArrayU32FloatGrad:
+ return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
+ case NVPTXISD::TexUnified2DFloatS32:
+ return "NVPTXISD::TexUnified2DFloatS32";
+ case NVPTXISD::TexUnified2DFloatFloat:
+ return "NVPTXISD::TexUnified2DFloatFloat";
+ case NVPTXISD::TexUnified2DFloatFloatLevel:
+ return "NVPTXISD::TexUnified2DFloatFloatLevel";
+ case NVPTXISD::TexUnified2DFloatFloatGrad:
+ return "NVPTXISD::TexUnified2DFloatFloatGrad";
+ case NVPTXISD::TexUnified2DS32S32:
+ return "NVPTXISD::TexUnified2DS32S32";
+ case NVPTXISD::TexUnified2DS32Float:
+ return "NVPTXISD::TexUnified2DS32Float";
+ case NVPTXISD::TexUnified2DS32FloatLevel:
+ return "NVPTXISD::TexUnified2DS32FloatLevel";
+ case NVPTXISD::TexUnified2DS32FloatGrad:
+ return "NVPTXISD::TexUnified2DS32FloatGrad";
+ case NVPTXISD::TexUnified2DU32S32:
+ return "NVPTXISD::TexUnified2DU32S32";
+ case NVPTXISD::TexUnified2DU32Float:
+ return "NVPTXISD::TexUnified2DU32Float";
+ case NVPTXISD::TexUnified2DU32FloatLevel:
+ return "NVPTXISD::TexUnified2DU32FloatLevel";
+ case NVPTXISD::TexUnified2DU32FloatGrad:
+ return "NVPTXISD::TexUnified2DU32FloatGrad";
+ case NVPTXISD::TexUnified2DArrayFloatS32:
+ return "NVPTXISD::TexUnified2DArrayFloatS32";
+ case NVPTXISD::TexUnified2DArrayFloatFloat:
+ return "NVPTXISD::TexUnified2DArrayFloatFloat";
+ case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
+ case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
+ return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
+ case NVPTXISD::TexUnified2DArrayS32S32:
+ return "NVPTXISD::TexUnified2DArrayS32S32";
+ case NVPTXISD::TexUnified2DArrayS32Float:
+ return "NVPTXISD::TexUnified2DArrayS32Float";
+ case NVPTXISD::TexUnified2DArrayS32FloatLevel:
+ return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
+ case NVPTXISD::TexUnified2DArrayS32FloatGrad:
+ return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
+ case NVPTXISD::TexUnified2DArrayU32S32:
+ return "NVPTXISD::TexUnified2DArrayU32S32";
+ case NVPTXISD::TexUnified2DArrayU32Float:
+ return "NVPTXISD::TexUnified2DArrayU32Float";
+ case NVPTXISD::TexUnified2DArrayU32FloatLevel:
+ return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
+ case NVPTXISD::TexUnified2DArrayU32FloatGrad:
+ return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
+ case NVPTXISD::TexUnified3DFloatS32:
+ return "NVPTXISD::TexUnified3DFloatS32";
+ case NVPTXISD::TexUnified3DFloatFloat:
+ return "NVPTXISD::TexUnified3DFloatFloat";
+ case NVPTXISD::TexUnified3DFloatFloatLevel:
+ return "NVPTXISD::TexUnified3DFloatFloatLevel";
+ case NVPTXISD::TexUnified3DFloatFloatGrad:
+ return "NVPTXISD::TexUnified3DFloatFloatGrad";
+ case NVPTXISD::TexUnified3DS32S32:
+ return "NVPTXISD::TexUnified3DS32S32";
+ case NVPTXISD::TexUnified3DS32Float:
+ return "NVPTXISD::TexUnified3DS32Float";
+ case NVPTXISD::TexUnified3DS32FloatLevel:
+ return "NVPTXISD::TexUnified3DS32FloatLevel";
+ case NVPTXISD::TexUnified3DS32FloatGrad:
+ return "NVPTXISD::TexUnified3DS32FloatGrad";
+ case NVPTXISD::TexUnified3DU32S32:
+ return "NVPTXISD::TexUnified3DU32S32";
+ case NVPTXISD::TexUnified3DU32Float:
+ return "NVPTXISD::TexUnified3DU32Float";
+ case NVPTXISD::TexUnified3DU32FloatLevel:
+ return "NVPTXISD::TexUnified3DU32FloatLevel";
+ case NVPTXISD::TexUnified3DU32FloatGrad:
+ return "NVPTXISD::TexUnified3DU32FloatGrad";
+ case NVPTXISD::TexUnifiedCubeFloatFloat:
+ return "NVPTXISD::TexUnifiedCubeFloatFloat";
+ case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
+ return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
+ case NVPTXISD::TexUnifiedCubeS32Float:
+ return "NVPTXISD::TexUnifiedCubeS32Float";
+ case NVPTXISD::TexUnifiedCubeS32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeU32Float:
+ return "NVPTXISD::TexUnifiedCubeU32Float";
+ case NVPTXISD::TexUnifiedCubeU32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
+ return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
+ case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayS32Float:
+ return "NVPTXISD::TexUnifiedCubeArrayS32Float";
+ case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
+ case NVPTXISD::TexUnifiedCubeArrayU32Float:
+ return "NVPTXISD::TexUnifiedCubeArrayU32Float";
+ case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
+ return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
+ case NVPTXISD::Tld4UnifiedR2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedG2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedB2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedA2DFloatFloat:
+ return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
+ case NVPTXISD::Tld4UnifiedR2DS64Float:
+ return "NVPTXISD::Tld4UnifiedR2DS64Float";
+ case NVPTXISD::Tld4UnifiedG2DS64Float:
+ return "NVPTXISD::Tld4UnifiedG2DS64Float";
+ case NVPTXISD::Tld4UnifiedB2DS64Float:
+ return "NVPTXISD::Tld4UnifiedB2DS64Float";
+ case NVPTXISD::Tld4UnifiedA2DS64Float:
+ return "NVPTXISD::Tld4UnifiedA2DS64Float";
+ case NVPTXISD::Tld4UnifiedR2DU64Float:
+ return "NVPTXISD::Tld4UnifiedR2DU64Float";
+ case NVPTXISD::Tld4UnifiedG2DU64Float:
+ return "NVPTXISD::Tld4UnifiedG2DU64Float";
+ case NVPTXISD::Tld4UnifiedB2DU64Float:
+ return "NVPTXISD::Tld4UnifiedB2DU64Float";
+ case NVPTXISD::Tld4UnifiedA2DU64Float:
+ return "NVPTXISD::Tld4UnifiedA2DU64Float";
+
+ case NVPTXISD::Suld1DI8Clamp: return "NVPTXISD::Suld1DI8Clamp";
+ case NVPTXISD::Suld1DI16Clamp: return "NVPTXISD::Suld1DI16Clamp";
+ case NVPTXISD::Suld1DI32Clamp: return "NVPTXISD::Suld1DI32Clamp";
+ case NVPTXISD::Suld1DI64Clamp: return "NVPTXISD::Suld1DI64Clamp";
+ case NVPTXISD::Suld1DV2I8Clamp: return "NVPTXISD::Suld1DV2I8Clamp";
+ case NVPTXISD::Suld1DV2I16Clamp: return "NVPTXISD::Suld1DV2I16Clamp";
+ case NVPTXISD::Suld1DV2I32Clamp: return "NVPTXISD::Suld1DV2I32Clamp";
+ case NVPTXISD::Suld1DV2I64Clamp: return "NVPTXISD::Suld1DV2I64Clamp";
+ case NVPTXISD::Suld1DV4I8Clamp: return "NVPTXISD::Suld1DV4I8Clamp";
+ case NVPTXISD::Suld1DV4I16Clamp: return "NVPTXISD::Suld1DV4I16Clamp";
+ case NVPTXISD::Suld1DV4I32Clamp: return "NVPTXISD::Suld1DV4I32Clamp";
+
+ case NVPTXISD::Suld1DArrayI8Clamp: return "NVPTXISD::Suld1DArrayI8Clamp";
+ case NVPTXISD::Suld1DArrayI16Clamp: return "NVPTXISD::Suld1DArrayI16Clamp";
+ case NVPTXISD::Suld1DArrayI32Clamp: return "NVPTXISD::Suld1DArrayI32Clamp";
+ case NVPTXISD::Suld1DArrayI64Clamp: return "NVPTXISD::Suld1DArrayI64Clamp";
+ case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
+ case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
+ case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
+ case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
+ case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
+ case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
+ case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
+
+ case NVPTXISD::Suld2DI8Clamp: return "NVPTXISD::Suld2DI8Clamp";
+ case NVPTXISD::Suld2DI16Clamp: return "NVPTXISD::Suld2DI16Clamp";
+ case NVPTXISD::Suld2DI32Clamp: return "NVPTXISD::Suld2DI32Clamp";
+ case NVPTXISD::Suld2DI64Clamp: return "NVPTXISD::Suld2DI64Clamp";
+ case NVPTXISD::Suld2DV2I8Clamp: return "NVPTXISD::Suld2DV2I8Clamp";
+ case NVPTXISD::Suld2DV2I16Clamp: return "NVPTXISD::Suld2DV2I16Clamp";
+ case NVPTXISD::Suld2DV2I32Clamp: return "NVPTXISD::Suld2DV2I32Clamp";
+ case NVPTXISD::Suld2DV2I64Clamp: return "NVPTXISD::Suld2DV2I64Clamp";
+ case NVPTXISD::Suld2DV4I8Clamp: return "NVPTXISD::Suld2DV4I8Clamp";
+ case NVPTXISD::Suld2DV4I16Clamp: return "NVPTXISD::Suld2DV4I16Clamp";
+ case NVPTXISD::Suld2DV4I32Clamp: return "NVPTXISD::Suld2DV4I32Clamp";
+
+ case NVPTXISD::Suld2DArrayI8Clamp: return "NVPTXISD::Suld2DArrayI8Clamp";
+ case NVPTXISD::Suld2DArrayI16Clamp: return "NVPTXISD::Suld2DArrayI16Clamp";
+ case NVPTXISD::Suld2DArrayI32Clamp: return "NVPTXISD::Suld2DArrayI32Clamp";
+ case NVPTXISD::Suld2DArrayI64Clamp: return "NVPTXISD::Suld2DArrayI64Clamp";
+ case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
+ case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
+ case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
+ case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
+ case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
+ case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
+ case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
+
+ case NVPTXISD::Suld3DI8Clamp: return "NVPTXISD::Suld3DI8Clamp";
+ case NVPTXISD::Suld3DI16Clamp: return "NVPTXISD::Suld3DI16Clamp";
+ case NVPTXISD::Suld3DI32Clamp: return "NVPTXISD::Suld3DI32Clamp";
+ case NVPTXISD::Suld3DI64Clamp: return "NVPTXISD::Suld3DI64Clamp";
+ case NVPTXISD::Suld3DV2I8Clamp: return "NVPTXISD::Suld3DV2I8Clamp";
+ case NVPTXISD::Suld3DV2I16Clamp: return "NVPTXISD::Suld3DV2I16Clamp";
+ case NVPTXISD::Suld3DV2I32Clamp: return "NVPTXISD::Suld3DV2I32Clamp";
+ case NVPTXISD::Suld3DV2I64Clamp: return "NVPTXISD::Suld3DV2I64Clamp";
+ case NVPTXISD::Suld3DV4I8Clamp: return "NVPTXISD::Suld3DV4I8Clamp";
+ case NVPTXISD::Suld3DV4I16Clamp: return "NVPTXISD::Suld3DV4I16Clamp";
+ case NVPTXISD::Suld3DV4I32Clamp: return "NVPTXISD::Suld3DV4I32Clamp";
+
+ case NVPTXISD::Suld1DI8Trap: return "NVPTXISD::Suld1DI8Trap";
+ case NVPTXISD::Suld1DI16Trap: return "NVPTXISD::Suld1DI16Trap";
+ case NVPTXISD::Suld1DI32Trap: return "NVPTXISD::Suld1DI32Trap";
+ case NVPTXISD::Suld1DI64Trap: return "NVPTXISD::Suld1DI64Trap";
+ case NVPTXISD::Suld1DV2I8Trap: return "NVPTXISD::Suld1DV2I8Trap";
+ case NVPTXISD::Suld1DV2I16Trap: return "NVPTXISD::Suld1DV2I16Trap";
+ case NVPTXISD::Suld1DV2I32Trap: return "NVPTXISD::Suld1DV2I32Trap";
+ case NVPTXISD::Suld1DV2I64Trap: return "NVPTXISD::Suld1DV2I64Trap";
+ case NVPTXISD::Suld1DV4I8Trap: return "NVPTXISD::Suld1DV4I8Trap";
+ case NVPTXISD::Suld1DV4I16Trap: return "NVPTXISD::Suld1DV4I16Trap";
+ case NVPTXISD::Suld1DV4I32Trap: return "NVPTXISD::Suld1DV4I32Trap";
+
+ case NVPTXISD::Suld1DArrayI8Trap: return "NVPTXISD::Suld1DArrayI8Trap";
+ case NVPTXISD::Suld1DArrayI16Trap: return "NVPTXISD::Suld1DArrayI16Trap";
+ case NVPTXISD::Suld1DArrayI32Trap: return "NVPTXISD::Suld1DArrayI32Trap";
+ case NVPTXISD::Suld1DArrayI64Trap: return "NVPTXISD::Suld1DArrayI64Trap";
+ case NVPTXISD::Suld1DArrayV2I8Trap: return "NVPTXISD::Suld1DArrayV2I8Trap";
+ case NVPTXISD::Suld1DArrayV2I16Trap: return "NVPTXISD::Suld1DArrayV2I16Trap";
+ case NVPTXISD::Suld1DArrayV2I32Trap: return "NVPTXISD::Suld1DArrayV2I32Trap";
+ case NVPTXISD::Suld1DArrayV2I64Trap: return "NVPTXISD::Suld1DArrayV2I64Trap";
+ case NVPTXISD::Suld1DArrayV4I8Trap: return "NVPTXISD::Suld1DArrayV4I8Trap";
+ case NVPTXISD::Suld1DArrayV4I16Trap: return "NVPTXISD::Suld1DArrayV4I16Trap";
+ case NVPTXISD::Suld1DArrayV4I32Trap: return "NVPTXISD::Suld1DArrayV4I32Trap";
+
+ case NVPTXISD::Suld2DI8Trap: return "NVPTXISD::Suld2DI8Trap";
+ case NVPTXISD::Suld2DI16Trap: return "NVPTXISD::Suld2DI16Trap";
+ case NVPTXISD::Suld2DI32Trap: return "NVPTXISD::Suld2DI32Trap";
+ case NVPTXISD::Suld2DI64Trap: return "NVPTXISD::Suld2DI64Trap";
+ case NVPTXISD::Suld2DV2I8Trap: return "NVPTXISD::Suld2DV2I8Trap";
+ case NVPTXISD::Suld2DV2I16Trap: return "NVPTXISD::Suld2DV2I16Trap";
+ case NVPTXISD::Suld2DV2I32Trap: return "NVPTXISD::Suld2DV2I32Trap";
+ case NVPTXISD::Suld2DV2I64Trap: return "NVPTXISD::Suld2DV2I64Trap";
+ case NVPTXISD::Suld2DV4I8Trap: return "NVPTXISD::Suld2DV4I8Trap";
+ case NVPTXISD::Suld2DV4I16Trap: return "NVPTXISD::Suld2DV4I16Trap";
+ case NVPTXISD::Suld2DV4I32Trap: return "NVPTXISD::Suld2DV4I32Trap";
+
+ case NVPTXISD::Suld2DArrayI8Trap: return "NVPTXISD::Suld2DArrayI8Trap";
+ case NVPTXISD::Suld2DArrayI16Trap: return "NVPTXISD::Suld2DArrayI16Trap";
+ case NVPTXISD::Suld2DArrayI32Trap: return "NVPTXISD::Suld2DArrayI32Trap";
+ case NVPTXISD::Suld2DArrayI64Trap: return "NVPTXISD::Suld2DArrayI64Trap";
+ case NVPTXISD::Suld2DArrayV2I8Trap: return "NVPTXISD::Suld2DArrayV2I8Trap";
+ case NVPTXISD::Suld2DArrayV2I16Trap: return "NVPTXISD::Suld2DArrayV2I16Trap";
+ case NVPTXISD::Suld2DArrayV2I32Trap: return "NVPTXISD::Suld2DArrayV2I32Trap";
+ case NVPTXISD::Suld2DArrayV2I64Trap: return "NVPTXISD::Suld2DArrayV2I64Trap";
+ case NVPTXISD::Suld2DArrayV4I8Trap: return "NVPTXISD::Suld2DArrayV4I8Trap";
+ case NVPTXISD::Suld2DArrayV4I16Trap: return "NVPTXISD::Suld2DArrayV4I16Trap";
+ case NVPTXISD::Suld2DArrayV4I32Trap: return "NVPTXISD::Suld2DArrayV4I32Trap";
+
+ case NVPTXISD::Suld3DI8Trap: return "NVPTXISD::Suld3DI8Trap";
+ case NVPTXISD::Suld3DI16Trap: return "NVPTXISD::Suld3DI16Trap";
+ case NVPTXISD::Suld3DI32Trap: return "NVPTXISD::Suld3DI32Trap";
+ case NVPTXISD::Suld3DI64Trap: return "NVPTXISD::Suld3DI64Trap";
+ case NVPTXISD::Suld3DV2I8Trap: return "NVPTXISD::Suld3DV2I8Trap";
+ case NVPTXISD::Suld3DV2I16Trap: return "NVPTXISD::Suld3DV2I16Trap";
+ case NVPTXISD::Suld3DV2I32Trap: return "NVPTXISD::Suld3DV2I32Trap";
+ case NVPTXISD::Suld3DV2I64Trap: return "NVPTXISD::Suld3DV2I64Trap";
+ case NVPTXISD::Suld3DV4I8Trap: return "NVPTXISD::Suld3DV4I8Trap";
+ case NVPTXISD::Suld3DV4I16Trap: return "NVPTXISD::Suld3DV4I16Trap";
+ case NVPTXISD::Suld3DV4I32Trap: return "NVPTXISD::Suld3DV4I32Trap";
+
+ case NVPTXISD::Suld1DI8Zero: return "NVPTXISD::Suld1DI8Zero";
+ case NVPTXISD::Suld1DI16Zero: return "NVPTXISD::Suld1DI16Zero";
+ case NVPTXISD::Suld1DI32Zero: return "NVPTXISD::Suld1DI32Zero";
+ case NVPTXISD::Suld1DI64Zero: return "NVPTXISD::Suld1DI64Zero";
+ case NVPTXISD::Suld1DV2I8Zero: return "NVPTXISD::Suld1DV2I8Zero";
+ case NVPTXISD::Suld1DV2I16Zero: return "NVPTXISD::Suld1DV2I16Zero";
+ case NVPTXISD::Suld1DV2I32Zero: return "NVPTXISD::Suld1DV2I32Zero";
+ case NVPTXISD::Suld1DV2I64Zero: return "NVPTXISD::Suld1DV2I64Zero";
+ case NVPTXISD::Suld1DV4I8Zero: return "NVPTXISD::Suld1DV4I8Zero";
+ case NVPTXISD::Suld1DV4I16Zero: return "NVPTXISD::Suld1DV4I16Zero";
+ case NVPTXISD::Suld1DV4I32Zero: return "NVPTXISD::Suld1DV4I32Zero";
+
+ case NVPTXISD::Suld1DArrayI8Zero: return "NVPTXISD::Suld1DArrayI8Zero";
+ case NVPTXISD::Suld1DArrayI16Zero: return "NVPTXISD::Suld1DArrayI16Zero";
+ case NVPTXISD::Suld1DArrayI32Zero: return "NVPTXISD::Suld1DArrayI32Zero";
+ case NVPTXISD::Suld1DArrayI64Zero: return "NVPTXISD::Suld1DArrayI64Zero";
+ case NVPTXISD::Suld1DArrayV2I8Zero: return "NVPTXISD::Suld1DArrayV2I8Zero";
+ case NVPTXISD::Suld1DArrayV2I16Zero: return "NVPTXISD::Suld1DArrayV2I16Zero";
+ case NVPTXISD::Suld1DArrayV2I32Zero: return "NVPTXISD::Suld1DArrayV2I32Zero";
+ case NVPTXISD::Suld1DArrayV2I64Zero: return "NVPTXISD::Suld1DArrayV2I64Zero";
+ case NVPTXISD::Suld1DArrayV4I8Zero: return "NVPTXISD::Suld1DArrayV4I8Zero";
+ case NVPTXISD::Suld1DArrayV4I16Zero: return "NVPTXISD::Suld1DArrayV4I16Zero";
+ case NVPTXISD::Suld1DArrayV4I32Zero: return "NVPTXISD::Suld1DArrayV4I32Zero";
+
+ case NVPTXISD::Suld2DI8Zero: return "NVPTXISD::Suld2DI8Zero";
+ case NVPTXISD::Suld2DI16Zero: return "NVPTXISD::Suld2DI16Zero";
+ case NVPTXISD::Suld2DI32Zero: return "NVPTXISD::Suld2DI32Zero";
+ case NVPTXISD::Suld2DI64Zero: return "NVPTXISD::Suld2DI64Zero";
+ case NVPTXISD::Suld2DV2I8Zero: return "NVPTXISD::Suld2DV2I8Zero";
+ case NVPTXISD::Suld2DV2I16Zero: return "NVPTXISD::Suld2DV2I16Zero";
+ case NVPTXISD::Suld2DV2I32Zero: return "NVPTXISD::Suld2DV2I32Zero";
+ case NVPTXISD::Suld2DV2I64Zero: return "NVPTXISD::Suld2DV2I64Zero";
+ case NVPTXISD::Suld2DV4I8Zero: return "NVPTXISD::Suld2DV4I8Zero";
+ case NVPTXISD::Suld2DV4I16Zero: return "NVPTXISD::Suld2DV4I16Zero";
+ case NVPTXISD::Suld2DV4I32Zero: return "NVPTXISD::Suld2DV4I32Zero";
+
+ case NVPTXISD::Suld2DArrayI8Zero: return "NVPTXISD::Suld2DArrayI8Zero";
+ case NVPTXISD::Suld2DArrayI16Zero: return "NVPTXISD::Suld2DArrayI16Zero";
+ case NVPTXISD::Suld2DArrayI32Zero: return "NVPTXISD::Suld2DArrayI32Zero";
+ case NVPTXISD::Suld2DArrayI64Zero: return "NVPTXISD::Suld2DArrayI64Zero";
+ case NVPTXISD::Suld2DArrayV2I8Zero: return "NVPTXISD::Suld2DArrayV2I8Zero";
+ case NVPTXISD::Suld2DArrayV2I16Zero: return "NVPTXISD::Suld2DArrayV2I16Zero";
+ case NVPTXISD::Suld2DArrayV2I32Zero: return "NVPTXISD::Suld2DArrayV2I32Zero";
+ case NVPTXISD::Suld2DArrayV2I64Zero: return "NVPTXISD::Suld2DArrayV2I64Zero";
+ case NVPTXISD::Suld2DArrayV4I8Zero: return "NVPTXISD::Suld2DArrayV4I8Zero";
+ case NVPTXISD::Suld2DArrayV4I16Zero: return "NVPTXISD::Suld2DArrayV4I16Zero";
+ case NVPTXISD::Suld2DArrayV4I32Zero: return "NVPTXISD::Suld2DArrayV4I32Zero";
+
+ case NVPTXISD::Suld3DI8Zero: return "NVPTXISD::Suld3DI8Zero";
+ case NVPTXISD::Suld3DI16Zero: return "NVPTXISD::Suld3DI16Zero";
+ case NVPTXISD::Suld3DI32Zero: return "NVPTXISD::Suld3DI32Zero";
+ case NVPTXISD::Suld3DI64Zero: return "NVPTXISD::Suld3DI64Zero";
+ case NVPTXISD::Suld3DV2I8Zero: return "NVPTXISD::Suld3DV2I8Zero";
+ case NVPTXISD::Suld3DV2I16Zero: return "NVPTXISD::Suld3DV2I16Zero";
+ case NVPTXISD::Suld3DV2I32Zero: return "NVPTXISD::Suld3DV2I32Zero";
+ case NVPTXISD::Suld3DV2I64Zero: return "NVPTXISD::Suld3DV2I64Zero";
+ case NVPTXISD::Suld3DV4I8Zero: return "NVPTXISD::Suld3DV4I8Zero";
+ case NVPTXISD::Suld3DV4I16Zero: return "NVPTXISD::Suld3DV4I16Zero";
+ case NVPTXISD::Suld3DV4I32Zero: return "NVPTXISD::Suld3DV4I32Zero";
+ }
+ return nullptr;
+}
+
+TargetLoweringBase::LegalizeTypeAction
+NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
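+  // i1 vectors are split into scalar elements, since PTX has no packed
+  // predicate type; v2f16 stays legal so a pair of f16 values can live in a
+  // single 32-bit register.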
+ if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
+ return TypeSplitVector;
+ if (VT == MVT::v2f16)
+ return TypeLegal;
+ return TargetLoweringBase::getPreferredVectorAction(VT);
+}
+
+SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+ int Enabled, int &ExtraSteps,
+ bool &UseOneConst,
+ bool Reciprocal) const {
+ if (!(Enabled == ReciprocalEstimate::Enabled ||
+ (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
+ return SDValue();
+
+ if (ExtraSteps == ReciprocalEstimate::Unspecified)
+ ExtraSteps = 0;
+
+ SDLoc DL(Operand);
+ EVT VT = Operand.getValueType();
+ bool Ftz = useF32FTZ(DAG.getMachineFunction());
+
+ auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(IID, DL, MVT::i32), Operand);
+ };
+
+ // The sqrt and rsqrt refinement processes assume we always start out with an
+ // approximation of the rsqrt. Therefore, if we're going to do any refinement
+ // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
+ // any refinement, we must return a regular sqrt.
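+  // For example (roughly), with ExtraSteps == 1 and Reciprocal == false, the
+  // generic refinement code applies a Newton-Raphson iteration to the rsqrt
+  // approximation returned here and then multiplies by x to form sqrt(x).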
+ if (Reciprocal || ExtraSteps > 0) {
+ if (VT == MVT::f32)
+ return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
+ : Intrinsic::nvvm_rsqrt_approx_f);
+ else if (VT == MVT::f64)
+ return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
+ else
+ return SDValue();
+ } else {
+ if (VT == MVT::f32)
+ return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
+ : Intrinsic::nvvm_sqrt_approx_f);
+ else {
+ // There's no sqrt.approx.f64 instruction, so we emit
+ // reciprocal(rsqrt(x)). This is faster than
+ // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
+ // x * rsqrt(x).)
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
+ MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
+ }
+ }
+}
+
+SDValue
+NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+ Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
+ return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
+}
+
+std::string NVPTXTargetLowering::getPrototype(
+ const DataLayout &DL, Type *retTy, const ArgListTy &Args,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
+ const ImmutableCallSite *CS) const {
+ auto PtrVT = getPointerTy(DL);
+
+ bool isABI = (STI.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return "";
+
+ std::stringstream O;
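+  // The emitted string has the form (illustrative):
+  //   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _);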
+ O << "prototype_" << uniqueCallSite << " : .callprototype ";
+
+ if (retTy->getTypeID() == Type::VoidTyID) {
+ O << "()";
+ } else {
+ O << "(";
+ if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
+ unsigned size = 0;
+ if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
+ size = ITy->getBitWidth();
+ } else {
+ assert(retTy->isFloatingPointTy() &&
+ "Floating point type expected here");
+ size = retTy->getPrimitiveSizeInBits();
+ }
+ // PTX ABI requires all scalar return values to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type in
+ // PTX, so its size must be adjusted here, too.
+ if (size < 32)
+ size = 32;
+
+ O << ".param .b" << size << " _";
+ } else if (isa<PointerType>(retTy)) {
+ O << ".param .b" << PtrVT.getSizeInBits() << " _";
+ } else if (retTy->isAggregateType() || retTy->isVectorTy()) {
+ auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
+ O << ".param .align " << retAlignment << " .b8 _["
+ << DL.getTypeAllocSize(retTy) << "]";
+ } else {
+ llvm_unreachable("Unknown return type");
+ }
+ O << ") ";
+ }
+ O << "_ (";
+
+ bool first = true;
+
+ unsigned OIdx = 0;
+ for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+ Type *Ty = Args[i].Ty;
+ if (!first) {
+ O << ", ";
+ }
+ first = false;
+
+ if (!Outs[OIdx].Flags.isByVal()) {
+ if (Ty->isAggregateType() || Ty->isVectorTy()) {
+ unsigned align = 0;
+ const CallInst *CallI = cast<CallInst>(CS->getInstruction());
+ // +1 because index 0 is reserved for return type alignment
+ if (!getAlign(*CallI, i + 1, align))
+ align = DL.getABITypeAlignment(Ty);
+ unsigned sz = DL.getTypeAllocSize(Ty);
+ O << ".param .align " << align << " .b8 ";
+ O << "_";
+ O << "[" << sz << "]";
+ // update the index for Outs
+ SmallVector<EVT, 16> vtparts;
+ ComputeValueVTs(*this, DL, Ty, vtparts);
+ if (unsigned len = vtparts.size())
+ OIdx += len - 1;
+ continue;
+ }
+ // i8 types in IR will be i16 types in SDAG
+ assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
+ (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+ "type mismatch between callee prototype and arguments");
+ // scalar type
+ unsigned sz = 0;
+ if (isa<IntegerType>(Ty)) {
+ sz = cast<IntegerType>(Ty)->getBitWidth();
+ if (sz < 32)
+ sz = 32;
+ } else if (isa<PointerType>(Ty)) {
+ sz = PtrVT.getSizeInBits();
+ } else if (Ty->isHalfTy())
+ // PTX ABI requires all scalar parameters to be at least 32
+ // bits in size. fp16 normally uses .b16 as its storage type
+ // in PTX, so its size must be adjusted here, too.
+ sz = 32;
+ else
+ sz = Ty->getPrimitiveSizeInBits();
+ O << ".param .b" << sz << " ";
+ O << "_";
+ continue;
+ }
+ auto *PTy = dyn_cast<PointerType>(Ty);
+ assert(PTy && "Param with byval attribute should be a pointer type");
+ Type *ETy = PTy->getElementType();
+
+ unsigned align = Outs[OIdx].Flags.getByValAlign();
+ unsigned sz = DL.getTypeAllocSize(ETy);
+ O << ".param .align " << align << " .b8 ";
+ O << "_";
+ O << "[" << sz << "]";
+ }
+ O << ");";
+ return O.str();
+}
+
+unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+ const ImmutableCallSite *CS,
+ Type *Ty, unsigned Idx,
+ const DataLayout &DL) const {
+ if (!CS) {
+ // CallSite is zero, fallback to ABI type alignment
+ return DL.getABITypeAlignment(Ty);
+ }
+
+ unsigned Align = 0;
+ const Value *DirectCallee = CS->getCalledFunction();
+
+ if (!DirectCallee) {
+ // We don't have a direct function symbol, but that may be because of
+ // constant cast instructions in the call.
+ const Instruction *CalleeI = CS->getInstruction();
+ assert(CalleeI && "Call target is not a function or derived value?");
+
+ // With bitcast'd call targets, the instruction will be the call
+ if (isa<CallInst>(CalleeI)) {
+ // Check if we have call alignment metadata
+ if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+ return Align;
+
+ const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+ // Ignore any bitcast instructions
+ while (isa<ConstantExpr>(CalleeV)) {
+ const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+ if (!CE->isCast())
+ break;
+ // Look through the bitcast
+ CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
+ }
+
+ // We have now looked past all of the bitcasts. Do we finally have a
+ // Function?
+ if (isa<Function>(CalleeV))
+ DirectCallee = CalleeV;
+ }
+ }
+
+ // Check for function alignment information if we found that the
+ // ultimate target is a Function
+ if (DirectCallee)
+ if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
+ return Align;
+
+  // The call is indirect or alignment information is not available; fall
+  // back to the ABI type alignment.
+  return DL.getABITypeAlignment(Ty);
+}
+
+SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc dl = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &isTailCall = CLI.IsTailCall;
+ ArgListTy &Args = CLI.getArgs();
+ Type *RetTy = CLI.RetTy;
+ ImmutableCallSite *CS = CLI.CS;
+ const DataLayout &DL = DAG.getDataLayout();
+
+ bool isABI = (STI.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+
+ SDValue tempChain = Chain;
+ Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
+ SDValue InFlag = Chain.getValue(1);
+
+ unsigned paramCount = 0;
+ // Args.size() and Outs.size() need not match.
+ // Outs.size() will be larger
+ // * if there is an aggregate argument with multiple fields (each field
+ // showing up separately in Outs)
+ // * if there is a vector argument with more than typical vector-length
+ // elements (generally if more than 4) where each vector element is
+ // individually present in Outs.
+ // So a different index should be used for indexing into Outs/OutVals.
+ // See similar issue in LowerFormalArguments.
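+  // For example, an aggregate argument of type {i32, float} is a single
+  // entry in Args but two entries (i32 and f32) in Outs.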
+ unsigned OIdx = 0;
+  // Declare the .param or .reg variables needed to pass values
+  // to the function.
+ for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+ EVT VT = Outs[OIdx].VT;
+ Type *Ty = Args[i].Ty;
+
+ if (!Outs[OIdx].Flags.isByVal()) {
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
+ unsigned ArgAlign =
+ getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+ unsigned AllocSize = DL.getTypeAllocSize(Ty);
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ bool NeedAlign; // Does argument declaration specify alignment?
+ if (Ty->isAggregateType() || Ty->isVectorTy()) {
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ SDValue DeclareParamOps[] = {
+ Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ NeedAlign = true;
+ } else {
+ // declare .param .b<size> .param<n>;
+ if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
+ // PTX ABI requires integral types to be at least 32 bits in
+ // size. FP16 is loaded/stored using i16, so it's handled
+ // here as well.
+ AllocSize = 4;
+ }
+ SDValue DeclareScalarParamOps[] = {
+ Chain, DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(AllocSize * 8, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag};
+ Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+ DeclareScalarParamOps);
+ NeedAlign = false;
+ }
+ InFlag = Chain.getValue(1);
+
+ // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
+ // than 32-bits are sign extended or zero extended, depending on
+ // whether they are signed or unsigned types. This case applies
+ // only to scalar parameters and not to aggregate values.
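+      // For example, an i8 scalar argument was declared above as a 32-bit
+      // .param, and its value is sign- or zero-extended to i32 by the
+      // stores emitted below.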
+ bool ExtendIntegerParam =
+ Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
+
+ auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+ SmallVector<SDValue, 6> StoreOperands;
+ for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+ // New store.
+ if (VectorInfo[j] & PVF_FIRST) {
+          assert(StoreOperands.empty() && "Unfinished preceding store.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+ StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
+ }
+
+ EVT EltVT = VTs[j];
+ SDValue StVal = OutVals[OIdx];
+ if (ExtendIntegerParam) {
+ assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
+ // zext/sext to i32
+ StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND,
+ dl, MVT::i32, StVal);
+ } else if (EltVT.getSizeInBits() < 16) {
+ // Use 16-bit registers for small stores as it's the
+ // smallest general purpose register size supported by NVPTX.
+ StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+ }
+
+ // Record the value to store.
+ StoreOperands.push_back(StVal);
+
+ if (VectorInfo[j] & PVF_LAST) {
+ unsigned NumElts = StoreOperands.size() - 3;
+ NVPTXISD::NodeType Op;
+ switch (NumElts) {
+ case 1:
+ Op = NVPTXISD::StoreParam;
+ break;
+ case 2:
+ Op = NVPTXISD::StoreParamV2;
+ break;
+ case 4:
+ Op = NVPTXISD::StoreParamV4;
+ break;
+ default:
+ llvm_unreachable("Invalid vector info.");
+ }
+
+ StoreOperands.push_back(InFlag);
+
+ // Adjust type of the store op if we've extended the scalar
+ // return value.
+ EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
+ unsigned EltAlign =
+ NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
+
+ Chain = DAG.getMemIntrinsicNode(
+ Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
+ TheStoreType, MachinePointerInfo(), EltAlign,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
+ InFlag = Chain.getValue(1);
+
+ // Cleanup.
+ StoreOperands.clear();
+ }
+ ++OIdx;
+ }
+ assert(StoreOperands.empty() && "Unfinished parameter store.");
+ if (VTs.size() > 0)
+ --OIdx;
+ ++paramCount;
+ continue;
+ }
+
+ // ByVal arguments
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
+ assert(PTy && "Type of a byval parameter should be pointer");
+ ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
+
+ // declare .param .align <align> .b8 .param<n>[<size>];
+ unsigned sz = Outs[OIdx].Flags.getByValSize();
+ SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+  // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
+ // so we don't need to worry about natural alignment or not.
+ // See TargetLowering::LowerCallTo().
+
+  // Enforce minimum alignment of 4 to work around ptxas miscompile
+ // for sm_50+. See corresponding alignment adjustment in
+ // emitFunctionParamList() for details.
+ if (ArgAlign < 4)
+ ArgAlign = 4;
+ SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(sz, dl, MVT::i32), InFlag};
+ Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+ DeclareParamOps);
+ InFlag = Chain.getValue(1);
+ for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+ EVT elemtype = VTs[j];
+ int curOffset = Offsets[j];
+ unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+ auto PtrVT = getPointerTy(DL);
+ SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
+ DAG.getConstant(curOffset, dl, PtrVT));
+ SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+ MachinePointerInfo(), PartAlign);
+ if (elemtype.getSizeInBits() < 16) {
+ theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+ }
+ SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CopyParamOps[] = { Chain,
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(curOffset, dl, MVT::i32),
+ theVal, InFlag };
+ Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+ CopyParamOps, elemtype,
+ MachinePointerInfo(), /* Align */ 0,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
+
+ InFlag = Chain.getValue(1);
+ }
+ ++paramCount;
+ }
+
+ GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
+ unsigned retAlignment = 0;
+
+ // Handle Result
+ if (Ins.size() > 0) {
+ SmallVector<EVT, 16> resvtparts;
+ ComputeValueVTs(*this, DL, RetTy, resvtparts);
+
+ // Declare
+ // .param .align 16 .b8 retval0[<size-in-bytes>], or
+ // .param .b<size-in-bits> retval0
+ unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
+ // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
+ // these three types to match the logic in
+ // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
+ // Plus, this behavior is consistent with nvcc's.
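+    // For example (illustrative): an i16 return value is declared as
+    //   .param .b32 retval0
+    // while a 32-byte aggregate with 16-byte alignment is declared as
+    //   .param .align 16 .b8 retval0[32]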
+ if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() ||
+ RetTy->isPointerTy()) {
+ // Scalar needs to be at least 32bit wide
+ if (resultsz < 32)
+ resultsz = 32;
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(resultsz, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+ DeclareRetOps);
+ InFlag = Chain.getValue(1);
+ } else {
+ retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+ SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue DeclareRetOps[] = { Chain,
+ DAG.getConstant(retAlignment, dl, MVT::i32),
+ DAG.getConstant(resultsz / 8, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag };
+ Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+ DeclareRetOps);
+ InFlag = Chain.getValue(1);
+ }
+ }
+
+ if (!Func) {
+    // This is the indirect function call case: PTX requires a prototype of
+    // the form
+    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
+    // to be emitted, and the label has to be used as the last arg of the
+    // call instruction.
+ // The prototype is embedded in a string and put as the operand for a
+ // CallPrototype SDNode which will print out to the value of the string.
+ SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
+ const char *ProtoStr =
+ nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+ SDValue ProtoOps[] = {
+ Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
+ };
+ Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
+ InFlag = Chain.getValue(1);
+ }
+ // Op to just print "call"
+ SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrintCallOps[] = {
+ Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
+ };
+ // We model convergent calls as separate opcodes.
+ unsigned Opcode = Func ? NVPTXISD::PrintCallUni : NVPTXISD::PrintCall;
+ if (CLI.IsConvergent)
+ Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
+ : NVPTXISD::PrintConvergentCall;
+ Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the function name
+ SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallVoidOps[] = { Chain, Callee, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
+ InFlag = Chain.getValue(1);
+
+ // Ops to print out the param list
+ SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgBeginOps[] = { Chain, InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
+ CallArgBeginOps);
+ InFlag = Chain.getValue(1);
+
+ for (unsigned i = 0, e = paramCount; i != e; ++i) {
+ unsigned opcode;
+ if (i == (e - 1))
+ opcode = NVPTXISD::LastCallArg;
+ else
+ opcode = NVPTXISD::CallArg;
+ SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(i, dl, MVT::i32), InFlag };
+ Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
+ InFlag = Chain.getValue(1);
+ }
+ SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue CallArgEndOps[] = { Chain,
+ DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
+ InFlag = Chain.getValue(1);
+
+ if (!Func) {
+ SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue PrototypeOps[] = { Chain,
+ DAG.getConstant(uniqueCallSite, dl, MVT::i32),
+ InFlag };
+ Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Generate loads from param memory/moves from registers for result
+ if (Ins.size() > 0) {
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
+ assert(VTs.size() == Ins.size() && "Bad value decomposition");
+
+ unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+ auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
+
+ SmallVector<EVT, 6> LoadVTs;
+ int VecIdx = -1; // Index of the first element of the vector.
+
+ // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+ // 32-bits are sign extended or zero extended, depending on whether
+ // they are signed or unsigned types.
+ bool ExtendIntegerRetVal =
+ RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ bool needTruncate = false;
+ EVT TheLoadType = VTs[i];
+ EVT EltType = Ins[i].VT;
+ unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+ if (ExtendIntegerRetVal) {
+ TheLoadType = MVT::i32;
+ EltType = MVT::i32;
+ needTruncate = true;
+ } else if (TheLoadType.getSizeInBits() < 16) {
+ if (VTs[i].isInteger())
+ needTruncate = true;
+ EltType = MVT::i16;
+ }
+
+ // Record index of the very first element of the vector.
+ if (VectorInfo[i] & PVF_FIRST) {
+ assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+ VecIdx = i;
+ }
+
+ LoadVTs.push_back(EltType);
+
+ if (VectorInfo[i] & PVF_LAST) {
+ unsigned NumElts = LoadVTs.size();
+ LoadVTs.push_back(MVT::Other);
+ LoadVTs.push_back(MVT::Glue);
+ NVPTXISD::NodeType Op;
+ switch (NumElts) {
+ case 1:
+ Op = NVPTXISD::LoadParam;
+ break;
+ case 2:
+ Op = NVPTXISD::LoadParamV2;
+ break;
+ case 4:
+ Op = NVPTXISD::LoadParamV4;
+ break;
+ default:
+ llvm_unreachable("Invalid vector info.");
+ }
+
+ SDValue LoadOperands[] = {
+ Chain, DAG.getConstant(1, dl, MVT::i32),
+ DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
+ SDValue RetVal = DAG.getMemIntrinsicNode(
+ Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
+ MachinePointerInfo(), EltAlign, /* Volatile */ false,
+ /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0);
+
+ for (unsigned j = 0; j < NumElts; ++j) {
+ SDValue Ret = RetVal.getValue(j);
+ if (needTruncate)
+ Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
+ InVals.push_back(Ret);
+ }
+ Chain = RetVal.getValue(NumElts);
+ InFlag = RetVal.getValue(NumElts + 1);
+
+ // Cleanup
+ VecIdx = -1;
+ LoadVTs.clear();
+ }
+ }
+ }
+
+ Chain = DAG.getCALLSEQ_END(Chain,
+ DAG.getIntPtrConstant(uniqueCallSite, dl, true),
+ DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
+ true),
+ InFlag, dl);
+ uniqueCallSite++;
+
+  // Set isTailCall to false for now, until we figure out how to express
+  // tail call optimization in PTX.
+ isTailCall = false;
+ return Chain;
+}
+
+// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
+// (see LegalizeDAG.cpp). This is slow and uses local memory.
+// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
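+// For example, (v4f32) concat_vectors(v2f32 A, v2f32 B) becomes
+// build_vector(A[0], A[1], B[0], B[1]) using four extract_vector_elt nodes.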
+SDValue
+NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+ SmallVector<SDValue, 8> Ops;
+ unsigned NumOperands = Node->getNumOperands();
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ SDValue SubOp = Node->getOperand(i);
+ EVT VVT = SubOp.getNode()->getValueType(0);
+ EVT EltVT = VVT.getVectorElementType();
+ unsigned NumSubElem = VVT.getVectorNumElements();
+ for (unsigned j = 0; j < NumSubElem; ++j) {
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
+ DAG.getIntPtrConstant(j, dl)));
+ }
+ }
+ return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
+}
+
+// We can initialize a constant f16x2 with a single .b32 move. Normally it
+// would get lowered to two constant loads and a vector-packing move:
+// mov.b16 %h1, 0x4000;
+// mov.b16 %h2, 0x3C00;
+// mov.b32 %hh2, {%h2, %h1};
+// Instead we want just a constant move:
+// mov.b32 %hh2, 0x40003C00
+//
+// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
+// generates good SASS in both cases.
+SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (!(Op->getValueType(0) == MVT::v2f16 &&
+ isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+ isa<ConstantFPSDNode>(Op->getOperand(1))))
+ return Op;
+
+ APInt E0 =
+ cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
+ APInt E1 =
+ cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
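+  // Pack the two f16 bit patterns into the low and high halves of an i32.
+  // For the example above, E0 = 0x3C00 and E1 = 0x4000 give 0x40003C00.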
+ SDValue Const =
+ DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
+}
+
+SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Index = Op->getOperand(1);
+ // Constant index will be matched by tablegen.
+ if (isa<ConstantSDNode>(Index.getNode()))
+ return Op;
+
+ // Extract individual elements and select one of them.
+ SDValue Vector = Op->getOperand(0);
+ EVT VectorVT = Vector.getValueType();
+ assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
+ EVT EltVT = VectorVT.getVectorElementType();
+
+ SDLoc dl(Op.getNode());
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+ DAG.getIntPtrConstant(0, dl));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+ DAG.getIntPtrConstant(1, dl));
+ return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
+ ISD::CondCode::SETEQ);
+}
+
+/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which either
+/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+ unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+ if (VTBits == 32 && STI.getSmVersion() >= 35) {
+    // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf'
+    // instruction.
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // dHi = aHi >> Amt
+ // dLo = shf.r.clamp aLo, aHi, Amt
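+    // shf.r.clamp extracts 32 bits starting at bit Amt of the 64-bit value
+    // {aHi, aLo}; e.g. for Amt = 8, dLo receives bits [39:8].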
+
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ else {
+ // {dHi, dLo} = {aHi, aLo} >> Amt
+ // - if (Amt>=size) then
+ // dLo = aHi >> (Amt-size)
+ // dHi = aHi >> Amt (this is either all 0 or all 1)
+ // else
+ // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
+ // dHi = aHi >> Amt
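+    // For example, with VTBits = 64 and Amt = 70: dLo = aHi >> 6, and dHi
+    // is aHi >> 70, i.e. all zeros (SRL) or all sign bits (SRA).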
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, dl, MVT::i32),
+ ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, dl, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, dl, MVT::i32),
+ ISD::SETGE);
+ SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which either
+/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
+/// amount, or
+/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
+/// amount.
+SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+ assert(Op.getOpcode() == ISD::SHL_PARTS);
+
+ EVT VT = Op.getValueType();
+ unsigned VTBits = VT.getSizeInBits();
+ SDLoc dl(Op);
+ SDValue ShOpLo = Op.getOperand(0);
+ SDValue ShOpHi = Op.getOperand(1);
+ SDValue ShAmt = Op.getOperand(2);
+
+ if (VTBits == 32 && STI.getSmVersion() >= 35) {
+    // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf'
+    // instruction.
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // dHi = shf.l.clamp aLo, aHi, Amt
+ // dLo = aLo << Amt
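+    // shf.l.clamp yields the high 32 bits of {aHi, aLo} << Amt; e.g. for
+    // Amt = 8, dHi receives bits [55:24] of the original 64-bit value.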
+
+ SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
+ ShAmt);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+ else {
+ // {dHi, dLo} = {aHi, aLo} << Amt
+ // - if (Amt>=size) then
+ // dLo = aLo << Amt (all 0)
+    //      dHi = aLo << (Amt-size)
+ // else
+ // dLo = aLo << Amt
+ // dHi = (aHi << Amt) | (aLo >> (size-Amt))
+
+ SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(VTBits, dl, MVT::i32),
+ ShAmt);
+ SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+ SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+ DAG.getConstant(VTBits, dl, MVT::i32));
+ SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+ SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+ SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
+ DAG.getConstant(VTBits, dl, MVT::i32),
+ ISD::SETGE);
+ SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+ SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
+
+ SDValue Ops[2] = { Lo, Hi };
+ return DAG.getMergeValues(Ops, dl);
+ }
+}
+
+SDValue
+NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ case ISD::RETURNADDR:
+ return SDValue();
+ case ISD::FRAMEADDR:
+ return SDValue();
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN:
+ return Op;
+ case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR:
+ return Op;
+ case ISD::EXTRACT_VECTOR_ELT:
+ return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::CONCAT_VECTORS:
+ return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::STORE:
+ return LowerSTORE(Op, DAG);
+ case ISD::LOAD:
+ return LowerLOAD(Op, DAG);
+ case ISD::SHL_PARTS:
+ return LowerShiftLeftParts(Op, DAG);
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS:
+ return LowerShiftRightParts(Op, DAG);
+ case ISD::SELECT:
+ return LowerSelect(Op, DAG);
+ default:
+ llvm_unreachable("Custom lowering not defined for operation");
+ }
+}
+
+SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
+ SDValue Op0 = Op->getOperand(0);
+ SDValue Op1 = Op->getOperand(1);
+ SDValue Op2 = Op->getOperand(2);
+ SDLoc DL(Op.getNode());
+
+ assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
+
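+  // Widen the i1 operands to i32, select in i32, and truncate the result
+  // back to i1; PTX's selp instruction has no predicate-typed variant.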
+ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+ SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
+
+ return Trunc;
+}
+
+SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType() == MVT::i1)
+ return LowerLOADi1(Op, DAG);
+
+  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
+  // loads; we have to handle them here.
+ if (Op.getValueType() == MVT::v2f16) {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
+ EVT MemVT = Load->getMemoryVT();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ Load->getAddressSpace(), Load->getAlignment())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, SDLoc(Op));
+ }
+ }
+
+ return SDValue();
+}
+
+// v = ld i1* addr
+// =>
+// v1 = ld i8* addr (-> i16)
+// v = trunc i16 to i1
+SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ SDLoc dl(Node);
+ assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+ assert(Node->getValueType(0) == MVT::i1 &&
+ "Custom lowering for i1 load only");
+ SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(), LD->getAlignment(),
+ LD->getMemOperand()->getFlags());
+ SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+ // The legalizer (the caller) is expecting two values from the legalized
+ // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+ // in LegalizeDAG.cpp which also uses MergeValues.
+ SDValue Ops[] = { result, LD->getChain() };
+ return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ StoreSDNode *Store = cast<StoreSDNode>(Op);
+ EVT VT = Store->getMemoryVT();
+
+ if (VT == MVT::i1)
+ return LowerSTOREi1(Op, DAG);
+
+  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
+  // stores; we have to handle them here.
+ if (VT == MVT::v2f16 &&
+ !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ Store->getAddressSpace(), Store->getAlignment()))
+ return expandUnalignedStore(Store, DAG);
+
+ if (VT.isVector())
+ return LowerSTOREVector(Op, DAG);
+
+ return SDValue();
+}
+
+SDValue
+NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *N = Op.getNode();
+ SDValue Val = N->getOperand(1);
+ SDLoc DL(N);
+ EVT ValVT = Val.getValueType();
+
+ if (ValVT.isVector()) {
+ // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+ // legal. We can (and should) split that into 2 stores of <2 x double> here
+ // but I'm leaving that as a TODO for now.
+ if (!ValVT.isSimple())
+ return SDValue();
+ switch (ValVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i8:
+ case MVT::v2i16:
+ case MVT::v2i32:
+ case MVT::v2i64:
+ case MVT::v2f16:
+ case MVT::v2f32:
+ case MVT::v2f64:
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v4i32:
+ case MVT::v4f16:
+ case MVT::v4f32:
+ case MVT::v8f16: // <4 x f16x2>
+ // This is a "native" vector type
+ break;
+ }
+
+ MemSDNode *MemSD = cast<MemSDNode>(N);
+ const DataLayout &TD = DAG.getDataLayout();
+
+ unsigned Align = MemSD->getAlignment();
+ unsigned PrefAlign =
+ TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
+ if (Align < PrefAlign) {
+ // This store is not sufficiently aligned, so bail out and let this vector
+ // store be scalarized. Note that we may still be able to emit smaller
+ // vector stores. For example, if we are storing a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return SDValue();
+ }
+
+ unsigned Opcode = 0;
+ EVT EltVT = ValVT.getVectorElementType();
+ unsigned NumElts = ValVT.getVectorNumElements();
+
+ // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // stored type to i16 and propagate the "real" type as the memory type.
+ bool NeedExt = false;
+ if (EltVT.getSizeInBits() < 16)
+ NeedExt = true;
+
+ bool StoreF16x2 = false;
+ switch (NumElts) {
+ default:
+ return SDValue();
+ case 2:
+ Opcode = NVPTXISD::StoreV2;
+ break;
+ case 4:
+ Opcode = NVPTXISD::StoreV4;
+ break;
+ case 8:
+ // v8f16 is a special case. PTX doesn't have st.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // store them with st.v4.b32.
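+      // The resulting store looks like (illustrative):
+      //   st.v4.b32 [addr], {%hh0, %hh1, %hh2, %hh3};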
+ assert(EltVT == MVT::f16 && "Wrong type for the vector.");
+ Opcode = NVPTXISD::StoreV4;
+ StoreF16x2 = true;
+ break;
+ }
+
+ SmallVector<SDValue, 8> Ops;
+
+ // First is the chain
+ Ops.push_back(N->getOperand(0));
+
+ if (StoreF16x2) {
+ // Combine f16,f16 -> v2f16
+ NumElts /= 2;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+ DAG.getIntPtrConstant(i * 2, DL));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+ DAG.getIntPtrConstant(i * 2 + 1, DL));
+ SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
+ Ops.push_back(V2);
+ }
+ } else {
+ // Then the split values
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+ DAG.getIntPtrConstant(i, DL));
+ if (NeedExt)
+ ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+ Ops.push_back(ExtVal);
+ }
+ }
+
+ // Then any remaining arguments
+ Ops.append(N->op_begin() + 2, N->op_end());
+
+ SDValue NewSt =
+ DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+ MemSD->getMemoryVT(), MemSD->getMemOperand());
+
+ return NewSt;
+ }
+
+ return SDValue();
+}
+
+// st i1 v, addr
+// =>
+// v1 = zxt v to i16
+// st.u8 i16, addr
+SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ SDLoc dl(Node);
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ SDValue Tmp1 = ST->getChain();
+ SDValue Tmp2 = ST->getBasePtr();
+ SDValue Tmp3 = ST->getValue();
+ assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+ Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
+ SDValue Result =
+ DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
+ ST->getAlignment(), ST->getMemOperand()->getFlags());
+ return Result;
+}
+
+SDValue
+NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
+ std::string ParamSym;
+ raw_string_ostream ParamStr(ParamSym);
+
+ ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
+ ParamStr.flush();
+
+ std::string *SavedStr =
+ nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
+ return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
+}
+
+ // Check whether the kernel argument is an image*_t or sampler_t.
+static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
+ static const char *const specialTypes[] = { "struct._image2d_t",
+ "struct._image3d_t",
+ "struct._sampler_t" };
+
+ Type *Ty = arg->getType();
+ auto *PTy = dyn_cast<PointerType>(Ty);
+
+ if (!PTy)
+ return false;
+
+ if (!context)
+ return false;
+
+ auto *STy = dyn_cast<StructType>(PTy->getElementType());
+ if (!STy || STy->isLiteral())
+ return false;
+
+ return std::find(std::begin(specialTypes), std::end(specialTypes),
+ STy->getName()) != std::end(specialTypes);
+}
+
+SDValue NVPTXTargetLowering::LowerFormalArguments(
+ SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout &DL = DAG.getDataLayout();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ const Function *F = MF.getFunction();
+ const AttributeList &PAL = F->getAttributes();
+ const TargetLowering *TLI = STI.getTargetLowering();
+
+ SDValue Root = DAG.getRoot();
+ std::vector<SDValue> OutChains;
+
+ bool isABI = (STI.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+
+ std::vector<Type *> argTypes;
+ std::vector<const Argument *> theArgs;
+ for (const Argument &I : F->args()) {
+ theArgs.push_back(&I);
+ argTypes.push_back(I.getType());
+ }
+ // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+ // Ins.size() will be larger
+ // * if there is an aggregate argument with multiple fields (each field
+ // showing up separately in Ins)
+ // * if there is a vector argument with more than typical vector-length
+ // elements (generally if more than 4) where each vector element is
+ // individually present in Ins.
+ // So a different index should be used for indexing into Ins.
+ // See similar issue in LowerCall.
+ unsigned InsIdx = 0;
+
+ int idx = 0;
+ for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
+ Type *Ty = argTypes[i];
+
+ // If the kernel argument is image*_t or sampler_t, convert it to
+ // an i32 constant holding the parameter position. This can later be
+ // matched in the AsmPrinter to output the correct mangled name.
+ if (isImageOrSamplerVal(
+ theArgs[i],
+ (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
+ : nullptr))) {
+ assert(isKernelFunction(*F) &&
+ "Only kernels can have image/sampler params");
+ InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
+ continue;
+ }
+
+ if (theArgs[i]->use_empty()) {
+ // argument is dead
+ if (Ty->isAggregateType()) {
+ SmallVector<EVT, 16> vtparts;
+
+ ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
+ assert(vtparts.size() > 0 && "empty aggregate type not expected");
+ for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+ ++parti) {
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ ++InsIdx;
+ }
+ if (vtparts.size() > 0)
+ --InsIdx;
+ continue;
+ }
+ if (Ty->isVectorTy()) {
+ EVT ObjectVT = getValueType(DL, Ty);
+ unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+ for (unsigned parti = 0; parti < NumRegs; ++parti) {
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ ++InsIdx;
+ }
+ if (NumRegs > 0)
+ --InsIdx;
+ continue;
+ }
+ InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+ continue;
+ }
+
+ // In the following cases, assign a node order of "idx+1"
+ // to newly created nodes. The SDNodes for params have to
+ // appear in the same order as their order of appearance
+ // in the original function. "idx+1" holds that order.
+ if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
+ bool aggregateIsPacked = false;
+ if (StructType *STy = dyn_cast<StructType>(Ty))
+ aggregateIsPacked = STy->isPacked();
+
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
+ assert(VTs.size() > 0 && "Unexpected empty type.");
+ auto VectorInfo =
+ VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
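+ // VectorInfo tags each scalar VT with PVF_FIRST/PVF_LAST to mark where a
+ // vectorizable group of parameter pieces begins and ends; the loop below
+ // emits one load per such group.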
+
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ int VecIdx = -1; // Index of the first element of the current vector.
+ for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
+ if (VectorInfo[parti] & PVF_FIRST) {
+ assert(VecIdx == -1 && "Orphaned vector.");
+ VecIdx = parti;
+ }
+
+ // That's the last element of this load op.
+ if (VectorInfo[parti] & PVF_LAST) {
+ unsigned NumElts = parti - VecIdx + 1;
+ EVT EltVT = VTs[parti];
+ // i1 is loaded/stored as i8.
+ EVT LoadVT = EltVT;
+ if (EltVT == MVT::i1)
+ LoadVT = MVT::i8;
+ else if (EltVT == MVT::v2f16)
+ // getLoad needs a vector type, but it can't handle
+ // vectors which contain v2f16 elements. So we must load
+ // using i32 here and then bitcast back.
+ LoadVT = MVT::i32;
+
+ EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
+ SDValue VecAddr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+ DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
+ Value *srcValue = Constant::getNullValue(PointerType::get(
+ EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+ SDValue P =
+ DAG.getLoad(VecVT, dl, Root, VecAddr,
+ MachinePointerInfo(srcValue), aggregateIsPacked,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+ if (P.getNode())
+ P.getNode()->setIROrder(idx + 1);
+ for (unsigned j = 0; j < NumElts; ++j) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
+ DAG.getIntPtrConstant(j, dl));
+ // We've loaded i1 as an i8 and now must truncate it back to i1
+ if (EltVT == MVT::i1)
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
+ // v2f16 was loaded as an i32. Now we must bitcast it back.
+ else if (EltVT == MVT::v2f16)
+ Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+ // Extend the element if necessary (e.g. an i8 is loaded
+ // into an i16 register)
+ if (Ins[InsIdx].VT.isInteger() &&
+ Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
+ unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+ Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
+ }
+ InVals.push_back(Elt);
+ }
+
+ // Reset vector tracking state.
+ VecIdx = -1;
+ }
+ ++InsIdx;
+ }
+ if (VTs.size() > 0)
+ --InsIdx;
+ continue;
+ }
+
+ // Param has the ByVal attribute.
+ // Return MoveParam(param symbol).
+ // Ideally, the param symbol could be returned directly, but when the
+ // SDNode builder decides to use it in a CopyToReg(), the machine
+ // instruction fails because a TargetExternalSymbol (not lowered) is
+ // target dependent, and CopyToReg assumes the source is lowered.
+ EVT ObjectVT = getValueType(DL, Ty);
+ assert(ObjectVT == Ins[InsIdx].VT &&
+ "Ins type did not match function type");
+ SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+ SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+ if (p.getNode())
+ p.getNode()->setIROrder(idx + 1);
+ InVals.push_back(p);
+ }
+
+ // Clang will check explicit VarArg and issue an error if any. However,
+ // Clang will let code with an implicit vararg like f() pass. See bug 617733.
+ // We treat this case as if the arg list is empty.
+ // if (F.isVarArg()) {
+ //   assert(0 && "VarArg not supported yet!");
+ // }
+
+ if (!OutChains.empty())
+ DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
+
+ return Chain;
+}
+
+SDValue
+NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &dl, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ Type *RetTy = MF.getFunction()->getReturnType();
+
+ bool isABI = (STI.getSmVersion() >= 20);
+ assert(isABI && "Non-ABI compilation is not supported");
+ if (!isABI)
+ return Chain;
+
+ const DataLayout &DL = DAG.getDataLayout();
+ SmallVector<EVT, 16> VTs;
+ SmallVector<uint64_t, 16> Offsets;
+ ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
+ assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+
+ auto VectorInfo = VectorizePTXValueVTs(
+ VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
+
+ // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+ // 32 bits are sign extended or zero extended, depending on whether
+ // they are signed or unsigned types.
+ bool ExtendIntegerRetVal =
+ RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
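+ // For example, an i8 return value is widened here and later stored with a
+ // 32-bit StoreRetval.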
+
+ SmallVector<SDValue, 6> StoreOperands;
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ // New load/store. Record chain and offset operands.
+ if (VectorInfo[i] & PVF_FIRST) {
+ assert(StoreOperands.empty() && "Orphaned operand list.");
+ StoreOperands.push_back(Chain);
+ StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+ }
+
+ SDValue RetVal = OutVals[i];
+ if (ExtendIntegerRetVal) {
+ RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND,
+ dl, MVT::i32, RetVal);
+ } else if (RetVal.getValueSizeInBits() < 16) {
+ // Use 16-bit registers for small load-stores as it's the
+ // smallest general purpose register size supported by NVPTX.
+ RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
+ }
+
+ // Record the value to return.
+ StoreOperands.push_back(RetVal);
+
+ // That's the last element of this store op.
+ if (VectorInfo[i] & PVF_LAST) {
+ NVPTXISD::NodeType Op;
+ unsigned NumElts = StoreOperands.size() - 2;
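+ // StoreOperands is laid out as [Chain, Offset, Val0, Val1, ...], so
+ // everything past the first two entries is a value being stored.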
+ switch (NumElts) {
+ case 1:
+ Op = NVPTXISD::StoreRetval;
+ break;
+ case 2:
+ Op = NVPTXISD::StoreRetvalV2;
+ break;
+ case 4:
+ Op = NVPTXISD::StoreRetvalV4;
+ break;
+ default:
+ llvm_unreachable("Invalid vector info.");
+ }
+
+ // Adjust type of load/store op if we've extended the scalar
+ // return value.
+ EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+ Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
+ StoreOperands, TheStoreType,
+ MachinePointerInfo(), /* Align */ 1,
+ /* Volatile */ false, /* ReadMem */ false,
+ /* WriteMem */ true, /* Size */ 0);
+ // Cleanup vector state.
+ StoreOperands.clear();
+ }
+ }
+
+ return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+void NVPTXTargetLowering::LowerAsmOperandForConstraint(
+ SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ // Only single-character constraints are handled here; those are deferred
+ // to the default implementation. Multi-character constraints are ignored.
+ if (Constraint.length() > 1)
+   return;
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ default:
+ return 0;
+
+ case Intrinsic::nvvm_tex_1d_v4f32_s32:
+ return NVPTXISD::Tex1DFloatS32;
+ case Intrinsic::nvvm_tex_1d_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloat;
+ case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+ return NVPTXISD::Tex1DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_1d_v4s32_s32:
+ return NVPTXISD::Tex1DS32S32;
+ case Intrinsic::nvvm_tex_1d_v4s32_f32:
+ return NVPTXISD::Tex1DS32Float;
+ case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+ return NVPTXISD::Tex1DS32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+ return NVPTXISD::Tex1DS32FloatGrad;
+ case Intrinsic::nvvm_tex_1d_v4u32_s32:
+ return NVPTXISD::Tex1DU32S32;
+ case Intrinsic::nvvm_tex_1d_v4u32_f32:
+ return NVPTXISD::Tex1DU32Float;
+ case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+ return NVPTXISD::Tex1DU32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+ return NVPTXISD::Tex1DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+ return NVPTXISD::Tex1DArrayFloatS32;
+ case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+ return NVPTXISD::Tex1DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+ return NVPTXISD::Tex1DArrayS32S32;
+ case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32Float;
+ case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+ return NVPTXISD::Tex1DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+ return NVPTXISD::Tex1DArrayU32S32;
+ case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32Float;
+ case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+ return NVPTXISD::Tex1DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_2d_v4f32_s32:
+ return NVPTXISD::Tex2DFloatS32;
+ case Intrinsic::nvvm_tex_2d_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloat;
+ case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+ return NVPTXISD::Tex2DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_2d_v4s32_s32:
+ return NVPTXISD::Tex2DS32S32;
+ case Intrinsic::nvvm_tex_2d_v4s32_f32:
+ return NVPTXISD::Tex2DS32Float;
+ case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+ return NVPTXISD::Tex2DS32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+ return NVPTXISD::Tex2DS32FloatGrad;
+ case Intrinsic::nvvm_tex_2d_v4u32_s32:
+ return NVPTXISD::Tex2DU32S32;
+ case Intrinsic::nvvm_tex_2d_v4u32_f32:
+ return NVPTXISD::Tex2DU32Float;
+ case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+ return NVPTXISD::Tex2DU32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+ return NVPTXISD::Tex2DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+ return NVPTXISD::Tex2DArrayFloatS32;
+ case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+ return NVPTXISD::Tex2DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+ return NVPTXISD::Tex2DArrayS32S32;
+ case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32Float;
+ case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+ return NVPTXISD::Tex2DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+ return NVPTXISD::Tex2DArrayU32S32;
+ case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32Float;
+ case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+ return NVPTXISD::Tex2DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_3d_v4f32_s32:
+ return NVPTXISD::Tex3DFloatS32;
+ case Intrinsic::nvvm_tex_3d_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloat;
+ case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+ return NVPTXISD::Tex3DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_3d_v4s32_s32:
+ return NVPTXISD::Tex3DS32S32;
+ case Intrinsic::nvvm_tex_3d_v4s32_f32:
+ return NVPTXISD::Tex3DS32Float;
+ case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+ return NVPTXISD::Tex3DS32FloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+ return NVPTXISD::Tex3DS32FloatGrad;
+ case Intrinsic::nvvm_tex_3d_v4u32_s32:
+ return NVPTXISD::Tex3DU32S32;
+ case Intrinsic::nvvm_tex_3d_v4u32_f32:
+ return NVPTXISD::Tex3DU32Float;
+ case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+ return NVPTXISD::Tex3DU32FloatLevel;
+ case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+ return NVPTXISD::Tex3DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_cube_v4f32_f32:
+ return NVPTXISD::TexCubeFloatFloat;
+ case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+ return NVPTXISD::TexCubeFloatFloatLevel;
+ case Intrinsic::nvvm_tex_cube_v4s32_f32:
+ return NVPTXISD::TexCubeS32Float;
+ case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+ return NVPTXISD::TexCubeS32FloatLevel;
+ case Intrinsic::nvvm_tex_cube_v4u32_f32:
+ return NVPTXISD::TexCubeU32Float;
+ case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+ return NVPTXISD::TexCubeU32FloatLevel;
+
+ case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+ return NVPTXISD::TexCubeArrayFloatFloat;
+ case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+ return NVPTXISD::TexCubeArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+ return NVPTXISD::TexCubeArrayS32Float;
+ case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+ return NVPTXISD::TexCubeArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+ return NVPTXISD::TexCubeArrayU32Float;
+ case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+ return NVPTXISD::TexCubeArrayU32FloatLevel;
+
+ case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+ return NVPTXISD::Tld4R2DFloatFloat;
+ case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+ return NVPTXISD::Tld4G2DFloatFloat;
+ case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+ return NVPTXISD::Tld4B2DFloatFloat;
+ case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+ return NVPTXISD::Tld4A2DFloatFloat;
+ case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+ return NVPTXISD::Tld4R2DS64Float;
+ case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+ return NVPTXISD::Tld4G2DS64Float;
+ case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+ return NVPTXISD::Tld4B2DS64Float;
+ case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+ return NVPTXISD::Tld4A2DS64Float;
+ case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+ return NVPTXISD::Tld4R2DU64Float;
+ case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+ return NVPTXISD::Tld4G2DU64Float;
+ case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+ return NVPTXISD::Tld4B2DU64Float;
+ case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+ return NVPTXISD::Tld4A2DU64Float;
+
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+ return NVPTXISD::TexUnified1DFloatS32;
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified1DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+ return NVPTXISD::TexUnified1DS32S32;
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32Float;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified1DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+ return NVPTXISD::TexUnified1DU32S32;
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32Float;
+ case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified1DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+ return NVPTXISD::TexUnified1DArrayFloatS32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+ return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+ return NVPTXISD::TexUnified1DArrayS32S32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+ return NVPTXISD::TexUnified1DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+ return NVPTXISD::TexUnified1DArrayU32S32;
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+ return NVPTXISD::TexUnified1DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+ return NVPTXISD::TexUnified2DFloatS32;
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified2DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+ return NVPTXISD::TexUnified2DS32S32;
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32Float;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified2DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+ return NVPTXISD::TexUnified2DU32S32;
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32Float;
+ case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified2DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+ return NVPTXISD::TexUnified2DArrayFloatS32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+ return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+ return NVPTXISD::TexUnified2DArrayS32S32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+ return NVPTXISD::TexUnified2DArrayS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+ return NVPTXISD::TexUnified2DArrayU32S32;
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+ return NVPTXISD::TexUnified2DArrayU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+ return NVPTXISD::TexUnified3DFloatS32;
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloat;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+ return NVPTXISD::TexUnified3DFloatFloatGrad;
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+ return NVPTXISD::TexUnified3DS32S32;
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32Float;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+ return NVPTXISD::TexUnified3DS32FloatGrad;
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+ return NVPTXISD::TexUnified3DU32S32;
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32Float;
+ case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+ return NVPTXISD::TexUnified3DU32FloatGrad;
+
+ case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeFloatFloat;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeS32Float;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeU32Float;
+ case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeU32FloatLevel;
+
+ case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayS32Float;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
+ case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayU32Float;
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+ return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
+
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedR2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedG2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedB2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+ return NVPTXISD::Tld4UnifiedA2DFloatFloat;
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedR2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedG2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedB2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+ return NVPTXISD::Tld4UnifiedA2DS64Float;
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedR2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedG2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedB2DU64Float;
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+ return NVPTXISD::Tld4UnifiedA2DU64Float;
+ }
+}
+
+static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ default:
+ return 0;
+ case Intrinsic::nvvm_suld_1d_i8_clamp:
+ return NVPTXISD::Suld1DI8Clamp;
+ case Intrinsic::nvvm_suld_1d_i16_clamp:
+ return NVPTXISD::Suld1DI16Clamp;
+ case Intrinsic::nvvm_suld_1d_i32_clamp:
+ return NVPTXISD::Suld1DI32Clamp;
+ case Intrinsic::nvvm_suld_1d_i64_clamp:
+ return NVPTXISD::Suld1DI64Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+ return NVPTXISD::Suld1DV2I8Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+ return NVPTXISD::Suld1DV2I16Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+ return NVPTXISD::Suld1DV2I32Clamp;
+ case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+ return NVPTXISD::Suld1DV2I64Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+ return NVPTXISD::Suld1DV4I8Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+ return NVPTXISD::Suld1DV4I16Clamp;
+ case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+ return NVPTXISD::Suld1DV4I32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+ return NVPTXISD::Suld1DArrayI8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+ return NVPTXISD::Suld1DArrayI16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+ return NVPTXISD::Suld1DArrayI32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+ return NVPTXISD::Suld1DArrayI64Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+ return NVPTXISD::Suld1DArrayV2I8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+ return NVPTXISD::Suld1DArrayV2I16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+ return NVPTXISD::Suld1DArrayV2I32Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+ return NVPTXISD::Suld1DArrayV2I64Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+ return NVPTXISD::Suld1DArrayV4I8Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+ return NVPTXISD::Suld1DArrayV4I16Clamp;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+ return NVPTXISD::Suld1DArrayV4I32Clamp;
+ case Intrinsic::nvvm_suld_2d_i8_clamp:
+ return NVPTXISD::Suld2DI8Clamp;
+ case Intrinsic::nvvm_suld_2d_i16_clamp:
+ return NVPTXISD::Suld2DI16Clamp;
+ case Intrinsic::nvvm_suld_2d_i32_clamp:
+ return NVPTXISD::Suld2DI32Clamp;
+ case Intrinsic::nvvm_suld_2d_i64_clamp:
+ return NVPTXISD::Suld2DI64Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+ return NVPTXISD::Suld2DV2I8Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+ return NVPTXISD::Suld2DV2I16Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+ return NVPTXISD::Suld2DV2I32Clamp;
+ case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+ return NVPTXISD::Suld2DV2I64Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+ return NVPTXISD::Suld2DV4I8Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+ return NVPTXISD::Suld2DV4I16Clamp;
+ case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+ return NVPTXISD::Suld2DV4I32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+ return NVPTXISD::Suld2DArrayI8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+ return NVPTXISD::Suld2DArrayI16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+ return NVPTXISD::Suld2DArrayI32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+ return NVPTXISD::Suld2DArrayI64Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+ return NVPTXISD::Suld2DArrayV2I8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+ return NVPTXISD::Suld2DArrayV2I16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+ return NVPTXISD::Suld2DArrayV2I32Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+ return NVPTXISD::Suld2DArrayV2I64Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+ return NVPTXISD::Suld2DArrayV4I8Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+ return NVPTXISD::Suld2DArrayV4I16Clamp;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+ return NVPTXISD::Suld2DArrayV4I32Clamp;
+ case Intrinsic::nvvm_suld_3d_i8_clamp:
+ return NVPTXISD::Suld3DI8Clamp;
+ case Intrinsic::nvvm_suld_3d_i16_clamp:
+ return NVPTXISD::Suld3DI16Clamp;
+ case Intrinsic::nvvm_suld_3d_i32_clamp:
+ return NVPTXISD::Suld3DI32Clamp;
+ case Intrinsic::nvvm_suld_3d_i64_clamp:
+ return NVPTXISD::Suld3DI64Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+ return NVPTXISD::Suld3DV2I8Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+ return NVPTXISD::Suld3DV2I16Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+ return NVPTXISD::Suld3DV2I32Clamp;
+ case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+ return NVPTXISD::Suld3DV2I64Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+ return NVPTXISD::Suld3DV4I8Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+ return NVPTXISD::Suld3DV4I16Clamp;
+ case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+ return NVPTXISD::Suld3DV4I32Clamp;
+ case Intrinsic::nvvm_suld_1d_i8_trap:
+ return NVPTXISD::Suld1DI8Trap;
+ case Intrinsic::nvvm_suld_1d_i16_trap:
+ return NVPTXISD::Suld1DI16Trap;
+ case Intrinsic::nvvm_suld_1d_i32_trap:
+ return NVPTXISD::Suld1DI32Trap;
+ case Intrinsic::nvvm_suld_1d_i64_trap:
+ return NVPTXISD::Suld1DI64Trap;
+ case Intrinsic::nvvm_suld_1d_v2i8_trap:
+ return NVPTXISD::Suld1DV2I8Trap;
+ case Intrinsic::nvvm_suld_1d_v2i16_trap:
+ return NVPTXISD::Suld1DV2I16Trap;
+ case Intrinsic::nvvm_suld_1d_v2i32_trap:
+ return NVPTXISD::Suld1DV2I32Trap;
+ case Intrinsic::nvvm_suld_1d_v2i64_trap:
+ return NVPTXISD::Suld1DV2I64Trap;
+ case Intrinsic::nvvm_suld_1d_v4i8_trap:
+ return NVPTXISD::Suld1DV4I8Trap;
+ case Intrinsic::nvvm_suld_1d_v4i16_trap:
+ return NVPTXISD::Suld1DV4I16Trap;
+ case Intrinsic::nvvm_suld_1d_v4i32_trap:
+ return NVPTXISD::Suld1DV4I32Trap;
+ case Intrinsic::nvvm_suld_1d_array_i8_trap:
+ return NVPTXISD::Suld1DArrayI8Trap;
+ case Intrinsic::nvvm_suld_1d_array_i16_trap:
+ return NVPTXISD::Suld1DArrayI16Trap;
+ case Intrinsic::nvvm_suld_1d_array_i32_trap:
+ return NVPTXISD::Suld1DArrayI32Trap;
+ case Intrinsic::nvvm_suld_1d_array_i64_trap:
+ return NVPTXISD::Suld1DArrayI64Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+ return NVPTXISD::Suld1DArrayV2I8Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+ return NVPTXISD::Suld1DArrayV2I16Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+ return NVPTXISD::Suld1DArrayV2I32Trap;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+ return NVPTXISD::Suld1DArrayV2I64Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+ return NVPTXISD::Suld1DArrayV4I8Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+ return NVPTXISD::Suld1DArrayV4I16Trap;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+ return NVPTXISD::Suld1DArrayV4I32Trap;
+ case Intrinsic::nvvm_suld_2d_i8_trap:
+ return NVPTXISD::Suld2DI8Trap;
+ case Intrinsic::nvvm_suld_2d_i16_trap:
+ return NVPTXISD::Suld2DI16Trap;
+ case Intrinsic::nvvm_suld_2d_i32_trap:
+ return NVPTXISD::Suld2DI32Trap;
+ case Intrinsic::nvvm_suld_2d_i64_trap:
+ return NVPTXISD::Suld2DI64Trap;
+ case Intrinsic::nvvm_suld_2d_v2i8_trap:
+ return NVPTXISD::Suld2DV2I8Trap;
+ case Intrinsic::nvvm_suld_2d_v2i16_trap:
+ return NVPTXISD::Suld2DV2I16Trap;
+ case Intrinsic::nvvm_suld_2d_v2i32_trap:
+ return NVPTXISD::Suld2DV2I32Trap;
+ case Intrinsic::nvvm_suld_2d_v2i64_trap:
+ return NVPTXISD::Suld2DV2I64Trap;
+ case Intrinsic::nvvm_suld_2d_v4i8_trap:
+ return NVPTXISD::Suld2DV4I8Trap;
+ case Intrinsic::nvvm_suld_2d_v4i16_trap:
+ return NVPTXISD::Suld2DV4I16Trap;
+ case Intrinsic::nvvm_suld_2d_v4i32_trap:
+ return NVPTXISD::Suld2DV4I32Trap;
+ case Intrinsic::nvvm_suld_2d_array_i8_trap:
+ return NVPTXISD::Suld2DArrayI8Trap;
+ case Intrinsic::nvvm_suld_2d_array_i16_trap:
+ return NVPTXISD::Suld2DArrayI16Trap;
+ case Intrinsic::nvvm_suld_2d_array_i32_trap:
+ return NVPTXISD::Suld2DArrayI32Trap;
+ case Intrinsic::nvvm_suld_2d_array_i64_trap:
+ return NVPTXISD::Suld2DArrayI64Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+ return NVPTXISD::Suld2DArrayV2I8Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+ return NVPTXISD::Suld2DArrayV2I16Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+ return NVPTXISD::Suld2DArrayV2I32Trap;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+ return NVPTXISD::Suld2DArrayV2I64Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+ return NVPTXISD::Suld2DArrayV4I8Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+ return NVPTXISD::Suld2DArrayV4I16Trap;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+ return NVPTXISD::Suld2DArrayV4I32Trap;
+ case Intrinsic::nvvm_suld_3d_i8_trap:
+ return NVPTXISD::Suld3DI8Trap;
+ case Intrinsic::nvvm_suld_3d_i16_trap:
+ return NVPTXISD::Suld3DI16Trap;
+ case Intrinsic::nvvm_suld_3d_i32_trap:
+ return NVPTXISD::Suld3DI32Trap;
+ case Intrinsic::nvvm_suld_3d_i64_trap:
+ return NVPTXISD::Suld3DI64Trap;
+ case Intrinsic::nvvm_suld_3d_v2i8_trap:
+ return NVPTXISD::Suld3DV2I8Trap;
+ case Intrinsic::nvvm_suld_3d_v2i16_trap:
+ return NVPTXISD::Suld3DV2I16Trap;
+ case Intrinsic::nvvm_suld_3d_v2i32_trap:
+ return NVPTXISD::Suld3DV2I32Trap;
+ case Intrinsic::nvvm_suld_3d_v2i64_trap:
+ return NVPTXISD::Suld3DV2I64Trap;
+ case Intrinsic::nvvm_suld_3d_v4i8_trap:
+ return NVPTXISD::Suld3DV4I8Trap;
+ case Intrinsic::nvvm_suld_3d_v4i16_trap:
+ return NVPTXISD::Suld3DV4I16Trap;
+ case Intrinsic::nvvm_suld_3d_v4i32_trap:
+ return NVPTXISD::Suld3DV4I32Trap;
+ case Intrinsic::nvvm_suld_1d_i8_zero:
+ return NVPTXISD::Suld1DI8Zero;
+ case Intrinsic::nvvm_suld_1d_i16_zero:
+ return NVPTXISD::Suld1DI16Zero;
+ case Intrinsic::nvvm_suld_1d_i32_zero:
+ return NVPTXISD::Suld1DI32Zero;
+ case Intrinsic::nvvm_suld_1d_i64_zero:
+ return NVPTXISD::Suld1DI64Zero;
+ case Intrinsic::nvvm_suld_1d_v2i8_zero:
+ return NVPTXISD::Suld1DV2I8Zero;
+ case Intrinsic::nvvm_suld_1d_v2i16_zero:
+ return NVPTXISD::Suld1DV2I16Zero;
+ case Intrinsic::nvvm_suld_1d_v2i32_zero:
+ return NVPTXISD::Suld1DV2I32Zero;
+ case Intrinsic::nvvm_suld_1d_v2i64_zero:
+ return NVPTXISD::Suld1DV2I64Zero;
+ case Intrinsic::nvvm_suld_1d_v4i8_zero:
+ return NVPTXISD::Suld1DV4I8Zero;
+ case Intrinsic::nvvm_suld_1d_v4i16_zero:
+ return NVPTXISD::Suld1DV4I16Zero;
+ case Intrinsic::nvvm_suld_1d_v4i32_zero:
+ return NVPTXISD::Suld1DV4I32Zero;
+ case Intrinsic::nvvm_suld_1d_array_i8_zero:
+ return NVPTXISD::Suld1DArrayI8Zero;
+ case Intrinsic::nvvm_suld_1d_array_i16_zero:
+ return NVPTXISD::Suld1DArrayI16Zero;
+ case Intrinsic::nvvm_suld_1d_array_i32_zero:
+ return NVPTXISD::Suld1DArrayI32Zero;
+ case Intrinsic::nvvm_suld_1d_array_i64_zero:
+ return NVPTXISD::Suld1DArrayI64Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+ return NVPTXISD::Suld1DArrayV2I8Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+ return NVPTXISD::Suld1DArrayV2I16Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+ return NVPTXISD::Suld1DArrayV2I32Zero;
+ case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+ return NVPTXISD::Suld1DArrayV2I64Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+ return NVPTXISD::Suld1DArrayV4I8Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+ return NVPTXISD::Suld1DArrayV4I16Zero;
+ case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+ return NVPTXISD::Suld1DArrayV4I32Zero;
+ case Intrinsic::nvvm_suld_2d_i8_zero:
+ return NVPTXISD::Suld2DI8Zero;
+ case Intrinsic::nvvm_suld_2d_i16_zero:
+ return NVPTXISD::Suld2DI16Zero;
+ case Intrinsic::nvvm_suld_2d_i32_zero:
+ return NVPTXISD::Suld2DI32Zero;
+ case Intrinsic::nvvm_suld_2d_i64_zero:
+ return NVPTXISD::Suld2DI64Zero;
+ case Intrinsic::nvvm_suld_2d_v2i8_zero:
+ return NVPTXISD::Suld2DV2I8Zero;
+ case Intrinsic::nvvm_suld_2d_v2i16_zero:
+ return NVPTXISD::Suld2DV2I16Zero;
+ case Intrinsic::nvvm_suld_2d_v2i32_zero:
+ return NVPTXISD::Suld2DV2I32Zero;
+ case Intrinsic::nvvm_suld_2d_v2i64_zero:
+ return NVPTXISD::Suld2DV2I64Zero;
+ case Intrinsic::nvvm_suld_2d_v4i8_zero:
+ return NVPTXISD::Suld2DV4I8Zero;
+ case Intrinsic::nvvm_suld_2d_v4i16_zero:
+ return NVPTXISD::Suld2DV4I16Zero;
+ case Intrinsic::nvvm_suld_2d_v4i32_zero:
+ return NVPTXISD::Suld2DV4I32Zero;
+ case Intrinsic::nvvm_suld_2d_array_i8_zero:
+ return NVPTXISD::Suld2DArrayI8Zero;
+ case Intrinsic::nvvm_suld_2d_array_i16_zero:
+ return NVPTXISD::Suld2DArrayI16Zero;
+ case Intrinsic::nvvm_suld_2d_array_i32_zero:
+ return NVPTXISD::Suld2DArrayI32Zero;
+ case Intrinsic::nvvm_suld_2d_array_i64_zero:
+ return NVPTXISD::Suld2DArrayI64Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+ return NVPTXISD::Suld2DArrayV2I8Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+ return NVPTXISD::Suld2DArrayV2I16Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+ return NVPTXISD::Suld2DArrayV2I32Zero;
+ case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+ return NVPTXISD::Suld2DArrayV2I64Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+ return NVPTXISD::Suld2DArrayV4I8Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+ return NVPTXISD::Suld2DArrayV4I16Zero;
+ case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+ return NVPTXISD::Suld2DArrayV4I32Zero;
+ case Intrinsic::nvvm_suld_3d_i8_zero:
+ return NVPTXISD::Suld3DI8Zero;
+ case Intrinsic::nvvm_suld_3d_i16_zero:
+ return NVPTXISD::Suld3DI16Zero;
+ case Intrinsic::nvvm_suld_3d_i32_zero:
+ return NVPTXISD::Suld3DI32Zero;
+ case Intrinsic::nvvm_suld_3d_i64_zero:
+ return NVPTXISD::Suld3DI64Zero;
+ case Intrinsic::nvvm_suld_3d_v2i8_zero:
+ return NVPTXISD::Suld3DV2I8Zero;
+ case Intrinsic::nvvm_suld_3d_v2i16_zero:
+ return NVPTXISD::Suld3DV2I16Zero;
+ case Intrinsic::nvvm_suld_3d_v2i32_zero:
+ return NVPTXISD::Suld3DV2I32Zero;
+ case Intrinsic::nvvm_suld_3d_v2i64_zero:
+ return NVPTXISD::Suld3DV2I64Zero;
+ case Intrinsic::nvvm_suld_3d_v4i8_zero:
+ return NVPTXISD::Suld3DV4I8Zero;
+ case Intrinsic::nvvm_suld_3d_v4i16_zero:
+ return NVPTXISD::Suld3DV4I16Zero;
+ case Intrinsic::nvvm_suld_3d_v4i32_zero:
+ return NVPTXISD::Suld3DV4I32Zero;
+ }
+}
+
+ // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
+ // TgtMemIntrinsic because we need the information that is only available
+ // in the "Value" type of the destination pointer. In particular, the
+ // address space information.
+bool NVPTXTargetLowering::getTgtMemIntrinsic(
+ IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
+ switch (Intrinsic) {
+ default:
+ return false;
+
+ case Intrinsic::nvvm_atomic_load_add_f32:
+ case Intrinsic::nvvm_atomic_load_inc_32:
+ case Intrinsic::nvvm_atomic_load_dec_32:
+
+ case Intrinsic::nvvm_atomic_add_gen_f_cta:
+ case Intrinsic::nvvm_atomic_add_gen_f_sys:
+ case Intrinsic::nvvm_atomic_add_gen_i_cta:
+ case Intrinsic::nvvm_atomic_add_gen_i_sys:
+ case Intrinsic::nvvm_atomic_and_gen_i_cta:
+ case Intrinsic::nvvm_atomic_and_gen_i_sys:
+ case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+ case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+ case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+ case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+ case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+ case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+ case Intrinsic::nvvm_atomic_max_gen_i_cta:
+ case Intrinsic::nvvm_atomic_max_gen_i_sys:
+ case Intrinsic::nvvm_atomic_min_gen_i_cta:
+ case Intrinsic::nvvm_atomic_min_gen_i_sys:
+ case Intrinsic::nvvm_atomic_or_gen_i_cta:
+ case Intrinsic::nvvm_atomic_or_gen_i_sys:
+ case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+ case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+ case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+ case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
+ auto &DL = I.getModule()->getDataLayout();
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = getValueType(DL, I.getType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = false;
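+ // These atomics are read-modify-write operations, so they are modeled as
+ // both reading and writing memory.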
+ Info.readMem = true;
+ Info.writeMem = true;
+ Info.align = 0;
+ return true;
+ }
+
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p: {
+ auto &DL = I.getModule()->getDataLayout();
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
+   Info.memVT = getPointerTy(DL);
+ else
+   Info.memVT = getValueType(DL, I.getType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+
+ return true;
+ }
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p: {
+ auto &DL = I.getModule()->getDataLayout();
+
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
+   Info.memVT = getPointerTy(DL);
+ else
+   Info.memVT = getValueType(DL, I.getType());
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+
+ return true;
+ }
+
+ case Intrinsic::nvvm_tex_1d_v4f32_s32:
+ case Intrinsic::nvvm_tex_1d_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_v4f32_s32:
+ case Intrinsic::nvvm_tex_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_v4f32_s32:
+ case Intrinsic::nvvm_tex_3d_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
+ Info.opc = getOpcForTextureInstr(Intrinsic);
+ Info.memVT = MVT::v4f32;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+
+ case Intrinsic::nvvm_tex_1d_v4s32_s32:
+ case Intrinsic::nvvm_tex_1d_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_v4s32_s32:
+ case Intrinsic::nvvm_tex_2d_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_v4s32_s32:
+ case Intrinsic::nvvm_tex_3d_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_cube_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_v4u32_s32:
+ case Intrinsic::nvvm_tex_1d_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_v4u32_s32:
+ case Intrinsic::nvvm_tex_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_v4u32_s32:
+ case Intrinsic::nvvm_tex_3d_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
+ case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
+ case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
+ case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
+ case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
+ Info.opc = getOpcForTextureInstr(Intrinsic);
+ Info.memVT = MVT::v4i32;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+
+ case Intrinsic::nvvm_suld_1d_i8_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
+ case Intrinsic::nvvm_suld_2d_i8_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
+ case Intrinsic::nvvm_suld_3d_i8_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i8_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i8_clamp:
+ case Intrinsic::nvvm_suld_1d_i8_trap:
+ case Intrinsic::nvvm_suld_1d_v2i8_trap:
+ case Intrinsic::nvvm_suld_1d_v4i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+ case Intrinsic::nvvm_suld_2d_i8_trap:
+ case Intrinsic::nvvm_suld_2d_v2i8_trap:
+ case Intrinsic::nvvm_suld_2d_v4i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+ case Intrinsic::nvvm_suld_3d_i8_trap:
+ case Intrinsic::nvvm_suld_3d_v2i8_trap:
+ case Intrinsic::nvvm_suld_3d_v4i8_trap:
+ case Intrinsic::nvvm_suld_1d_i8_zero:
+ case Intrinsic::nvvm_suld_1d_v2i8_zero:
+ case Intrinsic::nvvm_suld_1d_v4i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
+ case Intrinsic::nvvm_suld_2d_i8_zero:
+ case Intrinsic::nvvm_suld_2d_v2i8_zero:
+ case Intrinsic::nvvm_suld_2d_v4i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
+ case Intrinsic::nvvm_suld_3d_i8_zero:
+ case Intrinsic::nvvm_suld_3d_v2i8_zero:
+ case Intrinsic::nvvm_suld_3d_v4i8_zero:
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i8;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+
+ case Intrinsic::nvvm_suld_1d_i16_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
+ case Intrinsic::nvvm_suld_2d_i16_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
+ case Intrinsic::nvvm_suld_3d_i16_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i16_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i16_clamp:
+ case Intrinsic::nvvm_suld_1d_i16_trap:
+ case Intrinsic::nvvm_suld_1d_v2i16_trap:
+ case Intrinsic::nvvm_suld_1d_v4i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+ case Intrinsic::nvvm_suld_2d_i16_trap:
+ case Intrinsic::nvvm_suld_2d_v2i16_trap:
+ case Intrinsic::nvvm_suld_2d_v4i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+ case Intrinsic::nvvm_suld_3d_i16_trap:
+ case Intrinsic::nvvm_suld_3d_v2i16_trap:
+ case Intrinsic::nvvm_suld_3d_v4i16_trap:
+ case Intrinsic::nvvm_suld_1d_i16_zero:
+ case Intrinsic::nvvm_suld_1d_v2i16_zero:
+ case Intrinsic::nvvm_suld_1d_v4i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
+ case Intrinsic::nvvm_suld_2d_i16_zero:
+ case Intrinsic::nvvm_suld_2d_v2i16_zero:
+ case Intrinsic::nvvm_suld_2d_v4i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
+ case Intrinsic::nvvm_suld_3d_i16_zero:
+ case Intrinsic::nvvm_suld_3d_v2i16_zero:
+ case Intrinsic::nvvm_suld_3d_v4i16_zero:
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i16;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+
+ case Intrinsic::nvvm_suld_1d_i32_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_1d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
+ case Intrinsic::nvvm_suld_2d_i32_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_2d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
+ case Intrinsic::nvvm_suld_3d_i32_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i32_clamp:
+ case Intrinsic::nvvm_suld_3d_v4i32_clamp:
+ case Intrinsic::nvvm_suld_1d_i32_trap:
+ case Intrinsic::nvvm_suld_1d_v2i32_trap:
+ case Intrinsic::nvvm_suld_1d_v4i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+ case Intrinsic::nvvm_suld_2d_i32_trap:
+ case Intrinsic::nvvm_suld_2d_v2i32_trap:
+ case Intrinsic::nvvm_suld_2d_v4i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+ case Intrinsic::nvvm_suld_3d_i32_trap:
+ case Intrinsic::nvvm_suld_3d_v2i32_trap:
+ case Intrinsic::nvvm_suld_3d_v4i32_trap:
+ case Intrinsic::nvvm_suld_1d_i32_zero:
+ case Intrinsic::nvvm_suld_1d_v2i32_zero:
+ case Intrinsic::nvvm_suld_1d_v4i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
+ case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
+ case Intrinsic::nvvm_suld_2d_i32_zero:
+ case Intrinsic::nvvm_suld_2d_v2i32_zero:
+ case Intrinsic::nvvm_suld_2d_v4i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
+ case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
+ case Intrinsic::nvvm_suld_3d_i32_zero:
+ case Intrinsic::nvvm_suld_3d_v2i32_zero:
+ case Intrinsic::nvvm_suld_3d_v4i32_zero:
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i32;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+
+ case Intrinsic::nvvm_suld_1d_i64_clamp:
+ case Intrinsic::nvvm_suld_1d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_1d_array_i64_clamp:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
+ case Intrinsic::nvvm_suld_2d_i64_clamp:
+ case Intrinsic::nvvm_suld_2d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_2d_array_i64_clamp:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
+ case Intrinsic::nvvm_suld_3d_i64_clamp:
+ case Intrinsic::nvvm_suld_3d_v2i64_clamp:
+ case Intrinsic::nvvm_suld_1d_i64_trap:
+ case Intrinsic::nvvm_suld_1d_v2i64_trap:
+ case Intrinsic::nvvm_suld_1d_array_i64_trap:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
+ case Intrinsic::nvvm_suld_2d_i64_trap:
+ case Intrinsic::nvvm_suld_2d_v2i64_trap:
+ case Intrinsic::nvvm_suld_2d_array_i64_trap:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
+ case Intrinsic::nvvm_suld_3d_i64_trap:
+ case Intrinsic::nvvm_suld_3d_v2i64_trap:
+ case Intrinsic::nvvm_suld_1d_i64_zero:
+ case Intrinsic::nvvm_suld_1d_v2i64_zero:
+ case Intrinsic::nvvm_suld_1d_array_i64_zero:
+ case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
+ case Intrinsic::nvvm_suld_2d_i64_zero:
+ case Intrinsic::nvvm_suld_2d_v2i64_zero:
+ case Intrinsic::nvvm_suld_2d_array_i64_zero:
+ case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
+ case Intrinsic::nvvm_suld_3d_i64_zero:
+ case Intrinsic::nvvm_suld_3d_v2i64_zero:
+ Info.opc = getOpcForSurfaceInstr(Intrinsic);
+ Info.memVT = MVT::i64;
+ Info.ptrVal = nullptr;
+ Info.offset = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = false;
+ Info.align = 16;
+ return true;
+ }
+ return false;
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+/// Used to guide target-specific optimizations, like loop strength reduction
+/// (LoopStrengthReduce.cpp) and address-mode optimization
+/// (CodeGenPrepare.cpp).
+bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
+ // AddrMode - This represents an addressing mode of:
+ // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+ //
+ // The legal address modes are
+ // - [avar]
+ // - [areg]
+ // - [areg+immoff]
+ // - [immAddr]
+
+ if (AM.BaseGV) {
+ return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
+ }
+
+ switch (AM.Scale) {
+ case 0: // "r", "r+i" or "i" is allowed
+ break;
+ case 1:
+ if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
+ return false;
+ // Otherwise we have r+i.
+ break;
+ default:
+ // No scale > 1 is allowed
+ return false;
+ }
+ return true;
+}
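
To make the accepted shapes concrete, here is a minimal standalone model of the same predicate (an illustrative sketch, not code from this patch; the field names mirror TargetLowering::AddrMode):

    #include <cstdint>

    // Model of the legality check above: PTX accepts [avar], [areg],
    // [areg+immoff] and [immAddr], and nothing with two base values or a
    // scale greater than 1.
    struct AddrModeModel {
      bool HasBaseGV = false;   // stands in for AM.BaseGV != nullptr
      int64_t BaseOffs = 0;
      bool HasBaseReg = false;
      int64_t Scale = 0;
    };

    bool isLegalPTXAddrMode(const AddrModeModel &AM) {
      if (AM.HasBaseGV)             // [avar]: a global, and nothing else
        return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
      if (AM.Scale == 0)            // [areg], [areg+immoff], [immAddr]
        return true;
      if (AM.Scale == 1)            // the scaled reg may act as the base,
        return !AM.HasBaseReg;      // but only if no other base reg exists
      return false;                 // no scale > 1 is allowed
    }
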
+
+//===----------------------------------------------------------------------===//
+// NVPTX Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+NVPTXTargetLowering::ConstraintType
+NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default:
+ break;
+ case 'b':
+ case 'r':
+ case 'h':
+ case 'c':
+ case 'l':
+ case 'f':
+ case 'd':
+ case '0':
+ case 'N':
+ return C_RegisterClass;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'b':
+ return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
+ case 'c':
+ return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+ case 'h':
+ return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
+ case 'r':
+ return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
+ case 'l':
+ case 'N':
+ return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
+ case 'f':
+ return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
+ case 'd':
+ return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
+ }
+ }
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+}
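
For context, these are the constraint letters used by PTX inline assembly in CUDA device code. A minimal, hypothetical use (assuming nvcc or clang CUDA) binds 'r' to a 32-bit integer register, matching the Int32Regs entry above:

    // Sketch only: add two ints through inline PTX.
    __device__ int add_via_ptx(int a, int b) {
      int c;
      // "=r"/"r" select 32-bit integer registers; "l" would select 64-bit
      // integer, "f"/"d" 32-/64-bit float registers.
      asm("add.s32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b));
      return c;
    }
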
+
+//===----------------------------------------------------------------------===//
+// NVPTX DAG Combining
+//===----------------------------------------------------------------------===//
+
+bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
+ CodeGenOpt::Level OptLevel) const {
+ // Always honor command-line argument
+ if (FMAContractLevelOpt.getNumOccurrences() > 0)
+ return FMAContractLevelOpt > 0;
+
+ // Do not contract if we're not optimizing the code.
+ if (OptLevel == 0)
+ return false;
+
+ // Honor TargetOptions flags that explicitly say fusion is okay.
+ if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
+ return true;
+
+ return allowUnsafeFPMath(MF);
+}
+
+bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
+ // Honor TargetOptions flags that explicitly say unsafe math is okay.
+ if (MF.getTarget().Options.UnsafeFPMath)
+ return true;
+
+ // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
+ const Function *F = MF.getFunction();
+ if (F->hasFnAttribute("unsafe-fp-math")) {
+ Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+ StringRef Val = Attr.getValueAsString();
+ if (Val == "true")
+ return true;
+ }
+
+ return false;
+}
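
The "unsafe-fp-math" string attribute consulted here is the per-function counterpart of the global UnsafeFPMath flag. A frontend or test harness could opt a function in roughly as follows (a sketch against the standard Function API; the helper name is invented):

    #include "llvm/IR/Function.h"

    // Make allowUnsafeFPMath() above return true for F.
    void enableUnsafeFPMath(llvm::Function &F) {
      F.addFnAttr("unsafe-fp-math", "true"); // string attribute, value "true"
    }
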
+
+/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
+/// operands N0 and N1. This is a helper for PerformADDCombine that is
+/// called with the default operands, and if that fails, with commuted
+/// operands.
+static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SelectionDAG &DAG = DCI.DAG;
+  // Skip the vector case; this combine handles scalar values only.
+  EVT VT = N0.getValueType();
+ if (VT.isVector())
+ return SDValue();
+
+ // fold (add (mul a, b), c) -> (mad a, b, c)
+ //
+ if (N0.getOpcode() == ISD::MUL) {
+    assert(VT.isInteger());
+    // For integer:
+    // Since integer multiply-add costs the same as integer multiply
+    // but is more costly than integer add, perform the fusion only when
+    // the mul's single use is this add.
+    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
+        !N0.getNode()->hasOneUse())
+ return SDValue();
+
+ // Do the folding
+ return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+  } else if (N0.getOpcode() == ISD::FMUL) {
+ if (VT == MVT::f32 || VT == MVT::f64) {
+ const auto *TLI = static_cast<const NVPTXTargetLowering *>(
+ &DAG.getTargetLoweringInfo());
+ if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
+ return SDValue();
+
+      // For floating point:
+      // Do the fusion only when the mul has fewer than 5 uses and all of
+      // them are adds.
+      // The heuristic is that if a use is not an add, that use cannot be
+      // fused into an fma, so the mul is still needed anyway.
+      // If there are more than 4 uses, even if they are all adds, fusing
+      // them will increase register pressure.
+      //
+ int numUses = 0;
+ int nonAddCount = 0;
+      for (SDNode *User : N0.getNode()->uses()) {
+        numUses++;
+        if (User->getOpcode() != ISD::FADD)
+          ++nonAddCount;
+      }
+ if (numUses >= 5)
+ return SDValue();
+ if (nonAddCount) {
+ int orderNo = N->getIROrder();
+ int orderNo2 = N0.getNode()->getIROrder();
+          // Simple heuristic for estimating potential register pressure:
+          // the IR-order difference approximates the distance between def
+          // and use, and the longer the distance, the more likely it is to
+          // cause register pressure.
+ if (orderNo - orderNo2 < 500)
+ return SDValue();
+
+          // Now, check if at least one of the FMUL's operands is live
+          // beyond node N, which guarantees that the FMA will not increase
+          // register pressure at N.
+ bool opIsLive = false;
+ const SDNode *left = N0.getOperand(0).getNode();
+ const SDNode *right = N0.getOperand(1).getNode();
+
+ if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
+ opIsLive = true;
+
+ if (!opIsLive)
+            for (SDNode *User : left->uses()) {
+              int orderNo3 = User->getIROrder();
+              if (orderNo3 > orderNo) {
+                opIsLive = true;
+                break;
+              }
+            }
+
+ if (!opIsLive)
+            for (SDNode *User : right->uses()) {
+              int orderNo3 = User->getIROrder();
+              if (orderNo3 > orderNo) {
+                opIsLive = true;
+                break;
+              }
+            }
+
+ if (!opIsLive)
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+ }
+
+ return SDValue();
+}
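
In scalar terms, the two folds above are the following identities (an illustrative sketch; the floating-point form is a contraction with one rounding instead of two, which is why it is gated on allowFMA):

    #include <cmath>
    #include <cstdint>

    // (add (mul a, b), c) -> (mad a, b, c): always exact for integers.
    int32_t imad(int32_t a, int32_t b, int32_t c) { return a * b + c; }

    // (fadd (fmul a, b), c) -> (fma a, b, c): may differ in the last bit
    // from the unfused expression, hence the allowFMA() gate.
    float fmad(float a, float b, float c) { return std::fmaf(a, b, c); }
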
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+///
+static SDValue PerformADDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const NVPTXSubtarget &Subtarget,
+ CodeGenOpt::Level OptLevel) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // First try with the default operand order.
+ if (SDValue Result =
+ PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
+ return Result;
+
+ // If that didn't work, try again with the operands commuted.
+ return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
+}
+
+static SDValue PerformANDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // The type legalizer turns a vector load of i8 values into a zextload to i16
+ // registers, optionally ANY_EXTENDs it (if target type is integer),
+ // and ANDs off the high 8 bits. Since we turn this load into a
+ // target-specific DAG node, the DAG combiner fails to eliminate these AND
+ // nodes. Do that here.
+ SDValue Val = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+
+ if (isa<ConstantSDNode>(Val)) {
+ std::swap(Val, Mask);
+ }
+
+ SDValue AExt;
+ // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
+ if (Val.getOpcode() == ISD::ANY_EXTEND) {
+ AExt = Val;
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
+ Val = Val->getOperand(0);
+ }
+
+ if (Val->getOpcode() == NVPTXISD::LoadV2 ||
+ Val->getOpcode() == NVPTXISD::LoadV4) {
+ ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
+ if (!MaskCnst) {
+ // Not an AND with a constant
+ return SDValue();
+ }
+
+ uint64_t MaskVal = MaskCnst->getZExtValue();
+ if (MaskVal != 0xff) {
+ // Not an AND that chops off top 8 bits
+ return SDValue();
+ }
+
+ MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
+ if (!Mem) {
+ // Not a MemSDNode?!?
+ return SDValue();
+ }
+
+ EVT MemVT = Mem->getMemoryVT();
+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
+ // We only handle the i8 case
+ return SDValue();
+ }
+
+ unsigned ExtType =
+ cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
+ getZExtValue();
+ if (ExtType == ISD::SEXTLOAD) {
+ // If for some reason the load is a sextload, the and is needed to zero
+ // out the high 8 bits
+ return SDValue();
+ }
+
+ bool AddTo = false;
+ if (AExt.getNode() != nullptr) {
+ // Re-insert the ext as a zext.
+ Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+ AExt.getValueType(), Val);
+ AddTo = true;
+ }
+
+ // If we get here, the AND is unnecessary. Just replace it with the load
+ DCI.CombineTo(N, Val, AddTo);
+ }
+
+ return SDValue();
+}
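
The observation behind this combine is plain integer semantics: after a zero-extending byte load, the high bits are already clear, so masking with 0xff cannot change the value. A scalar sketch:

    #include <cstdint>

    // The AND this combine removes, in C++ terms.
    uint16_t load_byte_zext(const uint8_t *p) {
      uint16_t v = *p;  // zextload i8 -> i16: bits 15..8 are already zero
      return v & 0xff;  // no-op mask; only a sign-extending load needs it
    }
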
+
+static SDValue PerformREMCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
+
+ // Don't do anything at less than -O2.
+ if (OptLevel < CodeGenOpt::Default)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ bool IsSigned = N->getOpcode() == ISD::SREM;
+ unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
+
+ const SDValue &Num = N->getOperand(0);
+ const SDValue &Den = N->getOperand(1);
+
+ for (const SDNode *U : Num->uses()) {
+ if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
+ U->getOperand(1) == Den) {
+ // Num % Den -> Num - (Num / Den) * Den
+ return DAG.getNode(ISD::SUB, DL, VT, Num,
+ DAG.getNode(ISD::MUL, DL, VT,
+ DAG.getNode(DivOpc, DL, VT, Num, Den),
+ Den));
+ }
+ }
+ return SDValue();
+}
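
The rewrite uses the identity num % den == num - (num / den) * den and fires only when a matching division already exists, so the expensive quotient is computed once and shared by both results. A scalar sketch of the post-combine shape (assuming den != 0):

    #include <cstdint>

    uint32_t divmod(uint32_t num, uint32_t den, uint32_t &rem) {
      uint32_t q = num / den;  // the pre-existing udiv
      rem = num - q * den;     // urem rewritten as sub(num, mul(q, den))
      return q;
    }
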
+
+enum OperandSignedness {
+ Signed = 0,
+ Unsigned,
+ Unknown
+};
+
+/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
+/// that can be demoted to \p OptSize bits without loss of information. The
+/// signedness of the operand, if determinable, is placed in \p S.
+static bool IsMulWideOperandDemotable(SDValue Op,
+ unsigned OptSize,
+ OperandSignedness &S) {
+ S = Unknown;
+
+ if (Op.getOpcode() == ISD::SIGN_EXTEND ||
+ Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() <= OptSize) {
+ S = Signed;
+ return true;
+ }
+ } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
+ EVT OrigVT = Op.getOperand(0).getValueType();
+ if (OrigVT.getSizeInBits() <= OptSize) {
+ S = Unsigned;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
+/// be demoted to \p OptSize bits without loss of information. If the operands
+/// contain a constant, it should appear as the RHS operand. The signedness of
+/// the operands is placed in \p IsSigned.
+static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
+ unsigned OptSize,
+ bool &IsSigned) {
+ OperandSignedness LHSSign;
+
+ // The LHS operand must be a demotable op
+ if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
+ return false;
+
+ // We should have been able to determine the signedness from the LHS
+ if (LHSSign == Unknown)
+ return false;
+
+ IsSigned = (LHSSign == Signed);
+
+ // The RHS can be a demotable op or a constant
+ if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
+ const APInt &Val = CI->getAPIntValue();
+ if (LHSSign == Unsigned) {
+ return Val.isIntN(OptSize);
+ } else {
+ return Val.isSignedIntN(OptSize);
+ }
+ } else {
+ OperandSignedness RHSSign;
+ if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
+ return false;
+
+ return LHSSign == RHSSign;
+ }
+}
+
+/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
+/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
+/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
+/// amount.
+static SDValue TryMULWIDECombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT MulType = N->getValueType(0);
+ if (MulType != MVT::i32 && MulType != MVT::i64) {
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ unsigned OptSize = MulType.getSizeInBits() >> 1;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Canonicalize the multiply so the constant (if any) is on the right
+ if (N->getOpcode() == ISD::MUL) {
+ if (isa<ConstantSDNode>(LHS)) {
+ std::swap(LHS, RHS);
+ }
+ }
+
+ // If we have a SHL, determine the actual multiply amount
+ if (N->getOpcode() == ISD::SHL) {
+ ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
+ if (!ShlRHS) {
+ return SDValue();
+ }
+
+ APInt ShiftAmt = ShlRHS->getAPIntValue();
+ unsigned BitWidth = MulType.getSizeInBits();
+ if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
+ APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
+ RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
+ } else {
+ return SDValue();
+ }
+ }
+
+ bool Signed;
+ // Verify that our operands are demotable
+ if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
+ return SDValue();
+ }
+
+ EVT DemotedVT;
+ if (MulType == MVT::i32) {
+ DemotedVT = MVT::i16;
+ } else {
+ DemotedVT = MVT::i32;
+ }
+
+ // Truncate the operands to the correct size. Note that these are just for
+ // type consistency and will (likely) be eliminated in later phases.
+ SDValue TruncLHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
+ SDValue TruncRHS =
+ DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
+
+ unsigned Opc;
+ if (Signed) {
+ Opc = NVPTXISD::MUL_WIDE_SIGNED;
+ } else {
+ Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
+ }
+
+ return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
+}
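
The payoff is that PTX's mul.wide produces the full M-bit product of two M/2-bit inputs in a single instruction. A scalar model of the two demoted forms for the i32 case (illustration only; the SHL path lands here as well because x << c equals x * (1 << c)):

    #include <cstdint>

    // MUL_WIDE_SIGNED / MUL_WIDE_UNSIGNED: 16-bit inputs, exact 32-bit product.
    int32_t mul_wide_s(int16_t a, int16_t b) {
      return int32_t(a) * int32_t(b);
    }
    uint32_t mul_wide_u(uint16_t a, uint16_t b) {
      return uint32_t(a) * uint32_t(b);
    }
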
+
+/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
+static SDValue PerformMULCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ if (SDValue Ret = TryMULWIDECombine(N, DCI))
+ return Ret;
+ }
+
+ return SDValue();
+}
+
+/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
+static SDValue PerformSHLCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ CodeGenOpt::Level OptLevel) {
+ if (OptLevel > 0) {
+ // Try mul.wide combining at OptLevel > 0
+ if (SDValue Ret = TryMULWIDECombine(N, DCI))
+ return Ret;
+ }
+
+ return SDValue();
+}
+
+static SDValue PerformSETCCCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT CCType = N->getValueType(0);
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+
+ if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ SDLoc DL(N);
+ // setp.f16x2 returns two scalar predicates, which we need to
+ // convert back to v2i1. The returned result will be scalarized by
+ // the legalizer, but the comparison will remain a single vector
+ // instruction.
+ SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
+ DCI.DAG.getVTList(MVT::i1, MVT::i1),
+ {A, B, N->getOperand(2)});
+ return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
+ CCNode.getValue(1));
+}
+
+SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::FADD:
+ return PerformADDCombine(N, DCI, STI, OptLevel);
+ case ISD::MUL:
+ return PerformMULCombine(N, DCI, OptLevel);
+ case ISD::SHL:
+ return PerformSHLCombine(N, DCI, OptLevel);
+ case ISD::AND:
+ return PerformANDCombine(N, DCI);
+ case ISD::UREM:
+ case ISD::SREM:
+ return PerformREMCombine(N, DCI, OptLevel);
+ case ISD::SETCC:
+ return PerformSETCCCombine(N, DCI);
+ }
+ return SDValue();
+}
+
+/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
+static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) {
+ EVT ResVT = N->getValueType(0);
+ SDLoc DL(N);
+
+ assert(ResVT.isVector() && "Vector load must have vector type");
+
+ // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+ // legal. We can (and should) split that into 2 loads of <2 x double> here
+ // but I'm leaving that as a TODO for now.
+ assert(ResVT.isSimple() && "Can only handle simple types");
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ default:
+ return;
+ case MVT::v2i8:
+ case MVT::v2i16:
+ case MVT::v2i32:
+ case MVT::v2i64:
+ case MVT::v2f16:
+ case MVT::v2f32:
+ case MVT::v2f64:
+ case MVT::v4i8:
+ case MVT::v4i16:
+ case MVT::v4i32:
+ case MVT::v4f16:
+ case MVT::v4f32:
+ case MVT::v8f16: // <4 x f16x2>
+ // This is a "native" vector type
+ break;
+ }
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+
+ unsigned Align = LD->getAlignment();
+ auto &TD = DAG.getDataLayout();
+ unsigned PrefAlign =
+ TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
+ if (Align < PrefAlign) {
+ // This load is not sufficiently aligned, so bail out and let this vector
+ // load be scalarized. Note that we may still be able to emit smaller
+ // vector loads. For example, if we are loading a <4 x float> with an
+ // alignment of 8, this check will fail but the legalizer will try again
+ // with 2 x <2 x float>, which will succeed with an alignment of 8.
+ return;
+ }
+
+ EVT EltVT = ResVT.getVectorElementType();
+ unsigned NumElts = ResVT.getVectorNumElements();
+
+ // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+ // Therefore, we must ensure the type is legal. For i1 and i8, we set the
+ // loaded type to i16 and propagate the "real" type as the memory type.
+ bool NeedTrunc = false;
+ if (EltVT.getSizeInBits() < 16) {
+ EltVT = MVT::i16;
+ NeedTrunc = true;
+ }
+
+ unsigned Opcode = 0;
+ SDVTList LdResVTs;
+ bool LoadF16x2 = false;
+
+ switch (NumElts) {
+ default:
+ return;
+ case 2:
+ Opcode = NVPTXISD::LoadV2;
+ LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+ break;
+ case 4: {
+ Opcode = NVPTXISD::LoadV4;
+ EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
+ case 8: {
+    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
+ // instruction. Instead, we split the vector into v2f16 chunks and
+ // load them with ld.v4.b32.
+ assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
+ LoadF16x2 = true;
+ Opcode = NVPTXISD::LoadV4;
+ EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
+ MVT::Other};
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
+ }
+
+ // Copy regular operands
+ SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
+
+ // The select routine does not have access to the LoadSDNode instance, so
+ // pass along the extension information
+ OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+ SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+ LD->getMemoryVT(),
+ LD->getMemOperand());
+
+ SmallVector<SDValue, 8> ScalarRes;
+ if (LoadF16x2) {
+ // Split v2f16 subvectors back into individual elements.
+ NumElts /= 2;
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue SubVector = NewLD.getValue(i);
+ SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+ DAG.getIntPtrConstant(1, DL));
+ ScalarRes.push_back(E0);
+ ScalarRes.push_back(E1);
+ }
+ } else {
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewLD.getValue(i);
+ if (NeedTrunc)
+ Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+ ScalarRes.push_back(Res);
+ }
+ }
+
+ SDValue LoadChain = NewLD.getValue(NumElts);
+
+ SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+
+ Results.push_back(BuildVec);
+ Results.push_back(LoadChain);
+}
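
Net effect: a native-width vector load with sufficient alignment becomes a single LoadV2/LoadV4 node (PTX ld.v2/ld.v4) whose scalar results are re-packed with BUILD_VECTOR, while under-aligned loads are left for the legalizer. A rough scalar model of the aligned <4 x float> path (illustration only):

    #include <cstring>

    // One 16-byte access standing in for NVPTXISD::LoadV4 (ld.v4.f32);
    // only reached when the load meets the preferred alignment.
    void load_v4f32(const float *p, float out[4]) {
      std::memcpy(out, p, 4 * sizeof(float));
    }
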
+
+static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &Results) {
+ SDValue Chain = N->getOperand(0);
+ SDValue Intrin = N->getOperand(1);
+ SDLoc DL(N);
+
+ // Get the intrinsic ID
+ unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
+ switch (IntrinNo) {
+ default:
+ return;
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p:
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p: {
+ EVT ResVT = N->getValueType(0);
+
+ if (ResVT.isVector()) {
+ // Vector LDG/LDU
+
+ unsigned NumElts = ResVT.getVectorNumElements();
+ EVT EltVT = ResVT.getVectorElementType();
+
+      // Since LDU/LDG are target nodes, we cannot rely on DAG type
+      // legalization. Therefore, we must ensure the type is legal. For i1
+      // and i8, we set the loaded type to i16 and propagate the "real"
+      // type as the memory type.
+ bool NeedTrunc = false;
+ if (EltVT.getSizeInBits() < 16) {
+ EltVT = MVT::i16;
+ NeedTrunc = true;
+ }
+
+ unsigned Opcode = 0;
+ SDVTList LdResVTs;
+
+ switch (NumElts) {
+ default:
+ return;
+ case 2:
+ switch (IntrinNo) {
+ default:
+ return;
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p:
+ Opcode = NVPTXISD::LDGV2;
+ break;
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p:
+ Opcode = NVPTXISD::LDUV2;
+ break;
+ }
+ LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+ break;
+ case 4: {
+ switch (IntrinNo) {
+ default:
+ return;
+ case Intrinsic::nvvm_ldg_global_i:
+ case Intrinsic::nvvm_ldg_global_f:
+ case Intrinsic::nvvm_ldg_global_p:
+ Opcode = NVPTXISD::LDGV4;
+ break;
+ case Intrinsic::nvvm_ldu_global_i:
+ case Intrinsic::nvvm_ldu_global_f:
+ case Intrinsic::nvvm_ldu_global_p:
+ Opcode = NVPTXISD::LDUV4;
+ break;
+ }
+ EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
+ LdResVTs = DAG.getVTList(ListVTs);
+ break;
+ }
+ }
+
+      SmallVector<SDValue, 8> OtherOps;
+
+      // Copy regular operands, skipping operand 1 (the intrinsic ID).
+      OtherOps.push_back(Chain);
+      OtherOps.append(N->op_begin() + 2, N->op_end());
+
+ MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+ SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+ MemSD->getMemoryVT(),
+ MemSD->getMemOperand());
+
+ SmallVector<SDValue, 4> ScalarRes;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Res = NewLD.getValue(i);
+ if (NeedTrunc)
+ Res =
+ DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+ ScalarRes.push_back(Res);
+ }
+
+ SDValue LoadChain = NewLD.getValue(NumElts);
+
+      SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+
+ Results.push_back(BuildVec);
+ Results.push_back(LoadChain);
+ } else {
+ // i8 LDG/LDU
+ assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
+ "Custom handling of non-i8 ldu/ldg?");
+
+ // Just copy all operands as-is
+ SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
+
+ // Force output to i16
+ SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
+
+ MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
+
+ // We make sure the memory type is i8, which will be used during isel
+ // to select the proper instruction.
+ SDValue NewLD =
+ DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+ MVT::i8, MemSD->getMemOperand());
+
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+ NewLD.getValue(0)));
+ Results.push_back(NewLD.getValue(1));
+ }
+ }
+ }
+}
+
+void NVPTXTargetLowering::ReplaceNodeResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ switch (N->getOpcode()) {
+ default:
+ report_fatal_error("Unhandled custom legalization");
+ case ISD::LOAD:
+ ReplaceLoadVector(N, DAG, Results);
+ return;
+ case ISD::INTRINSIC_W_CHAIN:
+ ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
+ return;
+ }
+}
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+ delete static_cast<NVPTXSection *>(TextSection);
+ delete static_cast<NVPTXSection *>(DataSection);
+ delete static_cast<NVPTXSection *>(BSSSection);
+ delete static_cast<NVPTXSection *>(ReadOnlySection);
+
+ delete static_cast<NVPTXSection *>(StaticCtorSection);
+ delete static_cast<NVPTXSection *>(StaticDtorSection);
+ delete static_cast<NVPTXSection *>(LSDASection);
+ delete static_cast<NVPTXSection *>(EHFrameSection);
+ delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
+ delete static_cast<NVPTXSection *>(DwarfInfoSection);
+ delete static_cast<NVPTXSection *>(DwarfLineSection);
+ delete static_cast<NVPTXSection *>(DwarfFrameSection);
+ delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
+ delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
+ delete static_cast<NVPTXSection *>(DwarfStrSection);
+ delete static_cast<NVPTXSection *>(DwarfLocSection);
+ delete static_cast<NVPTXSection *>(DwarfARangesSection);
+ delete static_cast<NVPTXSection *>(DwarfRangesSection);
+ delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
+}
+
+MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ return getDataSection();
+}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index e433aed..9d7b70d 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -56,6 +56,7 @@ enum NodeType : unsigned {
MUL_WIDE_SIGNED,
MUL_WIDE_UNSIGNED,
IMAD,
+ SETP_F16X2,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -73,7 +74,7 @@ enum NodeType : unsigned {
StoreParamV2,
StoreParamV4,
StoreParamS32, // to sext and store a <32bit value, not used currently
- StoreParamU32, // to zext and store a <32bit value, not used currently
+ StoreParamU32, // to zext and store a <32bit value, not used currently
StoreRetval,
StoreRetvalV2,
StoreRetvalV4,
@@ -510,17 +511,48 @@ public:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
+ // Get the degree of precision we want from 32-bit floating point division
+ // operations.
+ //
+ // 0 - Use ptx div.approx
+ // 1 - Use ptx div.full (approximate, but less so than div.approx)
+ // 2 - Use IEEE-compliant div instructions, if available.
+ int getDivF32Level() const;
+
+ // Get whether we should use a precise or approximate 32-bit floating point
+ // sqrt instruction.
+ bool usePrecSqrtF32() const;
+
+ // Get whether we should use instructions that flush floating-point denormals
+ // to sign-preserving zero.
+ bool useF32FTZ(const MachineFunction &MF) const;
+
+ SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ int &ExtraSteps, bool &UseOneConst,
+ bool Reciprocal) const override;
+
+ unsigned combineRepeatedFPDivisors() const override { return 2; }
+
bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
+ bool allowUnsafeFPMath(MachineFunction &MF) const;
bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+ // The default is to transform llvm.ctlz(x, false) (where false indicates that
+ // x == 0 is not undefined behavior) into a branch that checks whether x is 0
+ // and avoids calling ctlz in that case. We have a dedicated ctlz
+ // instruction, so we say that ctlz is cheap to speculate.
+ bool isCheapToSpeculateCtlz() const override { return true; }
+
private:
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index 8d00bbb..f12ed81b 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -96,9 +96,7 @@ bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
// This is an OpenCL sampler, so it must be a samplerref
replaceWith(&I, ConstantInt::getTrue(I.getContext()));
return true;
- } else if (isImageWriteOnly(*TexHandle) ||
- isImageReadWrite(*TexHandle) ||
- isImageReadOnly(*TexHandle)) {
+ } else if (isImage(*TexHandle)) {
// This is an OpenCL image, so it cannot be a samplerref
replaceWith(&I, ConstantInt::getFalse(I.getContext()));
return true;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
deleted file mode 100644
index f4940c9..0000000
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInferAddressSpaces.cpp
+++ /dev/null
@@ -1,583 +0,0 @@
-//===-- NVPTXInferAddressSpace.cpp - ---------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// CUDA C/C++ includes memory space designation as variable type qualifers (such
-// as __global__ and __shared__). Knowing the space of a memory access allows
-// CUDA compilers to emit faster PTX loads and stores. For example, a load from
-// shared memory can be translated to `ld.shared` which is roughly 10% faster
-// than a generic `ld` on an NVIDIA Tesla K40c.
-//
-// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
-// compilers must infer the memory space of an address expression from
-// type-qualified variables.
-//
-// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
-// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
-// places only type-qualified variables in specific address spaces, and then
-// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
-// (so-called the generic address space) for other instructions to use.
-//
-// For example, the Clang translates the following CUDA code
-// __shared__ float a[10];
-// float v = a[i];
-// to
-// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
-// %v = load float, float* %1 ; emits ld.f32
-// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
-// redirected to %0 (the generic version of @a).
-//
-// The optimization implemented in this file propagates specific address spaces
-// from type-qualified variable declarations to its users. For example, it
-// optimizes the above IR to
-// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
-// propagating the addrspace(3) from @a to %1. As the result, the NVPTX
-// codegen is able to emit ld.shared.f32 for %v.
-//
-// Address space inference works in two steps. First, it uses a data-flow
-// analysis to infer as many generic pointers as possible to point to only one
-// specific address space. In the above example, it can prove that %1 only
-// points to addrspace(3). This algorithm was published in
-// CUDA: Compiling and optimizing for a GPU platform
-// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
-// ICCS 2012
-//
-// Then, address space inference replaces all refinable generic pointers with
-// equivalent specific pointers.
-//
-// The major challenge of implementing this optimization is handling PHINodes,
-// which may create loops in the data flow graph. This brings two complications.
-//
-// First, the data flow analysis in Step 1 needs to be circular. For example,
-// %generic.input = addrspacecast float addrspace(3)* %input to float*
-// loop:
-// %y = phi [ %generic.input, %y2 ]
-// %y2 = getelementptr %y, 1
-// %v = load %y2
-// br ..., label %loop, ...
-// proving %y specific requires proving both %generic.input and %y2 specific,
-// but proving %y2 specific circles back to %y. To address this complication,
-// the data flow analysis operates on a lattice:
-// uninitialized > specific address spaces > generic.
-// All address expressions (our implementation only considers phi, bitcast,
-// addrspacecast, and getelementptr) start with the uninitialized address space.
-// The monotone transfer function moves the address space of a pointer down a
-// lattice path from uninitialized to specific and then to generic. A join
-// operation of two different specific address spaces pushes the expression down
-// to the generic address space. The analysis completes once it reaches a fixed
-// point.
-//
-// Second, IR rewriting in Step 2 also needs to be circular. For example,
-// converting %y to addrspace(3) requires the compiler to know the converted
-// %y2, but converting %y2 needs the converted %y. To address this complication,
-// we break these cycles using "undef" placeholders. When converting an
-// instruction `I` to a new address space, if its operand `Op` is not converted
-// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
-// For instance, our algorithm first converts %y to
-// %y' = phi float addrspace(3)* [ %input, undef ]
-// Then, it converts %y2 to
-// %y2' = getelementptr %y', 1
-// Finally, it fixes the undef in %y' so that
-// %y' = phi float addrspace(3)* [ %input, %y2' ]
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "nvptx-infer-addrspace"
-
-#include "NVPTX.h"
-#include "MCTargetDesc/NVPTXBaseInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-
-using namespace llvm;
-
-namespace {
-const unsigned ADDRESS_SPACE_UNINITIALIZED = (unsigned)-1;
-
-using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
-
-/// \brief NVPTXInferAddressSpaces
-class NVPTXInferAddressSpaces: public FunctionPass {
-public:
- static char ID;
-
- NVPTXInferAddressSpaces() : FunctionPass(ID) {}
-
- bool runOnFunction(Function &F) override;
-
-private:
- // Returns the new address space of V if updated; otherwise, returns None.
- Optional<unsigned>
- updateAddressSpace(const Value &V,
- const ValueToAddrSpaceMapTy &InferredAddrSpace);
-
- // Tries to infer the specific address space of each address expression in
- // Postorder.
- void inferAddressSpaces(const std::vector<Value *> &Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace);
-
- // Changes the generic address expressions in function F to point to specific
- // address spaces if InferredAddrSpace says so. Postorder is the postorder of
- // all generic address expressions in the use-def graph of function F.
- bool
- rewriteWithNewAddressSpaces(const std::vector<Value *> &Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace,
- Function *F);
-};
-} // end anonymous namespace
-
-char NVPTXInferAddressSpaces::ID = 0;
-
-namespace llvm {
-void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
-}
-INITIALIZE_PASS(NVPTXInferAddressSpaces, "nvptx-infer-addrspace",
- "Infer address spaces",
- false, false)
-
-// Returns true if V is an address expression.
-// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
-// getelementptr operators.
-static bool isAddressExpression(const Value &V) {
- if (!isa<Operator>(V))
- return false;
-
- switch (cast<Operator>(V).getOpcode()) {
- case Instruction::PHI:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::GetElementPtr:
- return true;
- default:
- return false;
- }
-}
-
-// Returns the pointer operands of V.
-//
-// Precondition: V is an address expression.
-static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
- assert(isAddressExpression(V));
- const Operator& Op = cast<Operator>(V);
- switch (Op.getOpcode()) {
- case Instruction::PHI: {
- auto IncomingValues = cast<PHINode>(Op).incoming_values();
- return SmallVector<Value *, 2>(IncomingValues.begin(),
- IncomingValues.end());
- }
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::GetElementPtr:
- return {Op.getOperand(0)};
- default:
- llvm_unreachable("Unexpected instruction type.");
- }
-}
-
-// If V is an unvisited generic address expression, appends V to PostorderStack
-// and marks it as visited.
-static void appendsGenericAddressExpressionToPostorderStack(
- Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
- DenseSet<Value *> *Visited) {
- assert(V->getType()->isPointerTy());
- if (isAddressExpression(*V) &&
- V->getType()->getPointerAddressSpace() ==
- AddressSpace::ADDRESS_SPACE_GENERIC) {
- if (Visited->insert(V).second)
- PostorderStack->push_back(std::make_pair(V, false));
- }
-}
-
-// Returns all generic address expressions in function F. The elements are
-// ordered in postorder.
-static std::vector<Value *> collectGenericAddressExpressions(Function &F) {
- // This function implements a non-recursive postorder traversal of a partial
- // use-def graph of function F.
- std::vector<std::pair<Value*, bool>> PostorderStack;
- // The set of visited expressions.
- DenseSet<Value*> Visited;
- // We only explore address expressions that are reachable from loads and
- // stores for now because we aim at generating faster loads and stores.
- for (Instruction &I : instructions(F)) {
- if (isa<LoadInst>(I)) {
- appendsGenericAddressExpressionToPostorderStack(
- I.getOperand(0), &PostorderStack, &Visited);
- } else if (isa<StoreInst>(I)) {
- appendsGenericAddressExpressionToPostorderStack(
- I.getOperand(1), &PostorderStack, &Visited);
- }
- }
-
- std::vector<Value *> Postorder; // The resultant postorder.
- while (!PostorderStack.empty()) {
- // If the operands of the expression on the top are already explored,
- // adds that expression to the resultant postorder.
- if (PostorderStack.back().second) {
- Postorder.push_back(PostorderStack.back().first);
- PostorderStack.pop_back();
- continue;
- }
- // Otherwise, adds its operands to the stack and explores them.
- PostorderStack.back().second = true;
- for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
- appendsGenericAddressExpressionToPostorderStack(
- PtrOperand, &PostorderStack, &Visited);
- }
- }
- return Postorder;
-}
-
-// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
-// of OperandUse.get() in the new address space. If the clone is not ready yet,
-// returns an undef in the new address space as a placeholder.
-static Value *operandWithNewAddressSpaceOrCreateUndef(
- const Use &OperandUse, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) {
- Value *Operand = OperandUse.get();
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
- return NewOperand;
-
- UndefUsesToFix->push_back(&OperandUse);
- return UndefValue::get(
- Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace));
-}
-
-// Returns a clone of `I` with its operands converted to those specified in
-// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
-// operand whose address space needs to be modified might not exist in
-// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
-// adds that operand use to UndefUsesToFix so that caller can fix them later.
-//
-// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
-// from a pointer whose type already matches. Therefore, this function returns a
-// Value* instead of an Instruction*.
-static Value *cloneInstructionWithNewAddressSpace(
- Instruction *I, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) {
- Type *NewPtrType =
- I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (I->getOpcode() == Instruction::AddrSpaceCast) {
- Value *Src = I->getOperand(0);
- // Because `I` is generic, the source address space must be specific.
- // Therefore, the inferred address space must be the source space, according
- // to our algorithm.
- assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
- if (Src->getType() != NewPtrType)
- return new BitCastInst(Src, NewPtrType);
- return Src;
- }
-
- // Computes the converted pointer operands.
- SmallVector<Value *, 4> NewPointerOperands;
- for (const Use &OperandUse : I->operands()) {
- if (!OperandUse.get()->getType()->isPointerTy())
- NewPointerOperands.push_back(nullptr);
- else
- NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
- OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
- }
-
- switch (I->getOpcode()) {
- case Instruction::BitCast:
- return new BitCastInst(NewPointerOperands[0], NewPtrType);
- case Instruction::PHI: {
- assert(I->getType()->isPointerTy());
- PHINode *PHI = cast<PHINode>(I);
- PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
- for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
- unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
- NewPHI->addIncoming(NewPointerOperands[OperandNo],
- PHI->getIncomingBlock(Index));
- }
- return NewPHI;
- }
- case Instruction::GetElementPtr: {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
- GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
- GEP->getSourceElementType(), NewPointerOperands[0],
- SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
- NewGEP->setIsInBounds(GEP->isInBounds());
- return NewGEP;
- }
- default:
- llvm_unreachable("Unexpected opcode");
- }
-}
-
-// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
-// constant expression `CE` with its operands replaced as specified in
-// ValueWithNewAddrSpace.
-static Value *cloneConstantExprWithNewAddressSpace(
- ConstantExpr *CE, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace) {
- Type *TargetType =
- CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (CE->getOpcode() == Instruction::AddrSpaceCast) {
- // Because CE is generic, the source address space must be specific.
- // Therefore, the inferred address space must be the source space according
- // to our algorithm.
- assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
- NewAddrSpace);
- return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
- }
-
- // Computes the operands of the new constant expression.
- SmallVector<Constant *, 4> NewOperands;
- for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
- Constant *Operand = CE->getOperand(Index);
- // If the address space of `Operand` needs to be modified, the new operand
- // with the new address space should already be in ValueWithNewAddrSpace
- // because (1) the constant expressions we consider (i.e. addrspacecast,
- // bitcast, and getelementptr) do not incur cycles in the data flow graph
- // and (2) this function is called on constant expressions in postorder.
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
- NewOperands.push_back(cast<Constant>(NewOperand));
- } else {
- // Otherwise, reuses the old operand.
- NewOperands.push_back(Operand);
- }
- }
-
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- // Needs to specify the source type while constructing a getelementptr
- // constant expression.
- return CE->getWithOperands(
- NewOperands, TargetType, /*OnlyIfReduced=*/false,
- NewOperands[0]->getType()->getPointerElementType());
- }
-
- return CE->getWithOperands(NewOperands, TargetType);
-}
-
-// Returns a clone of the value `V`, with its operands replaced as specified in
-// ValueWithNewAddrSpace. This function is called on every generic address
-// expression whose address space needs to be modified, in postorder.
-//
-// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
-static Value *
-cloneValueWithNewAddressSpace(Value *V, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) {
- // All values in Postorder are generic address expressions.
- assert(isAddressExpression(*V) &&
- V->getType()->getPointerAddressSpace() ==
- AddressSpace::ADDRESS_SPACE_GENERIC);
-
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- Value *NewV = cloneInstructionWithNewAddressSpace(
- I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
- if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
- if (NewI->getParent() == nullptr) {
- NewI->insertBefore(I);
- NewI->takeName(I);
- }
- }
- return NewV;
- }
-
- return cloneConstantExprWithNewAddressSpace(
- cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
-}
-
-// Defines the join operation on the address space lattice (see the file header
-// comments).
-static unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) {
- if (AS1 == AddressSpace::ADDRESS_SPACE_GENERIC ||
- AS2 == AddressSpace::ADDRESS_SPACE_GENERIC)
- return AddressSpace::ADDRESS_SPACE_GENERIC;
-
- if (AS1 == ADDRESS_SPACE_UNINITIALIZED)
- return AS2;
- if (AS2 == ADDRESS_SPACE_UNINITIALIZED)
- return AS1;
-
- // The join of two different specific address spaces is generic.
- return AS1 == AS2 ? AS1 : (unsigned)AddressSpace::ADDRESS_SPACE_GENERIC;
-}
-
-bool NVPTXInferAddressSpaces::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- // Collects all generic address expressions in postorder.
- std::vector<Value *> Postorder = collectGenericAddressExpressions(F);
-
- // Runs a data-flow analysis to refine the address spaces of every expression
- // in Postorder.
- ValueToAddrSpaceMapTy InferredAddrSpace;
- inferAddressSpaces(Postorder, &InferredAddrSpace);
-
- // Changes the address spaces of the generic address expressions who are
- // inferred to point to a specific address space.
- return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
-}
-
-void NVPTXInferAddressSpaces::inferAddressSpaces(
- const std::vector<Value *> &Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) {
- SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
- // Initially, all expressions are in the uninitialized address space.
- for (Value *V : Postorder)
- (*InferredAddrSpace)[V] = ADDRESS_SPACE_UNINITIALIZED;
-
- while (!Worklist.empty()) {
- Value* V = Worklist.pop_back_val();
-
- // Tries to update the address space of the stack top according to the
- // address spaces of its operands.
- DEBUG(dbgs() << "Updating the address space of\n"
- << " " << *V << "\n");
- Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
- if (!NewAS.hasValue())
- continue;
- // If any updates are made, grabs its users to the worklist because
- // their address spaces can also be possibly updated.
- DEBUG(dbgs() << " to " << NewAS.getValue() << "\n");
- (*InferredAddrSpace)[V] = NewAS.getValue();
-
- for (Value *User : V->users()) {
- // Skip if User is already in the worklist.
- if (Worklist.count(User))
- continue;
-
- auto Pos = InferredAddrSpace->find(User);
- // Our algorithm only updates the address spaces of generic address
- // expressions, which are those in InferredAddrSpace.
- if (Pos == InferredAddrSpace->end())
- continue;
-
- // Function updateAddressSpace moves the address space down a lattice
- // path. Therefore, nothing to do if User is already inferred as
- // generic (the bottom element in the lattice).
- if (Pos->second == AddressSpace::ADDRESS_SPACE_GENERIC)
- continue;
-
- Worklist.insert(User);
- }
- }
-}
-
-Optional<unsigned> NVPTXInferAddressSpaces::updateAddressSpace(
- const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) {
- assert(InferredAddrSpace.count(&V));
-
- // The new inferred address space equals the join of the address spaces
- // of all its pointer operands.
- unsigned NewAS = ADDRESS_SPACE_UNINITIALIZED;
- for (Value *PtrOperand : getPointerOperands(V)) {
- unsigned OperandAS;
- if (InferredAddrSpace.count(PtrOperand))
- OperandAS = InferredAddrSpace.lookup(PtrOperand);
- else
- OperandAS = PtrOperand->getType()->getPointerAddressSpace();
- NewAS = joinAddressSpaces(NewAS, OperandAS);
- // join(generic, *) = generic. So we can break if NewAS is already generic.
- if (NewAS == AddressSpace::ADDRESS_SPACE_GENERIC)
- break;
- }
-
- unsigned OldAS = InferredAddrSpace.lookup(&V);
- assert(OldAS != AddressSpace::ADDRESS_SPACE_GENERIC);
- if (OldAS == NewAS)
- return None;
- return NewAS;
-}
-
-bool NVPTXInferAddressSpaces::rewriteWithNewAddressSpaces(
- const std::vector<Value *> &Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) {
- // For each address expression to be modified, creates a clone of it with its
- // pointer operands converted to the new address space. Since the pointer
- // operands are converted, the clone is naturally in the new address space by
- // construction.
- ValueToValueMapTy ValueWithNewAddrSpace;
- SmallVector<const Use *, 32> UndefUsesToFix;
- for (Value* V : Postorder) {
- unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
- if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
- ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
- V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
- }
- }
-
- if (ValueWithNewAddrSpace.empty())
- return false;
-
- // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
- for (const Use* UndefUse : UndefUsesToFix) {
- User *V = UndefUse->getUser();
- User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
- unsigned OperandNo = UndefUse->getOperandNo();
- assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
- NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
- }
-
- // Replaces the uses of the old address expressions with the new ones.
- for (Value *V : Postorder) {
- Value *NewV = ValueWithNewAddrSpace.lookup(V);
- if (NewV == nullptr)
- continue;
-
- SmallVector<Use *, 4> Uses;
- for (Use &U : V->uses())
- Uses.push_back(&U);
- DEBUG(dbgs() << "Replacing the uses of " << *V << "\n to\n " << *NewV
- << "\n");
- for (Use *U : Uses) {
- if (isa<LoadInst>(U->getUser()) ||
- (isa<StoreInst>(U->getUser()) && U->getOperandNo() == 1)) {
- // If V is used as the pointer operand of a load/store, sets the pointer
- // operand to NewV. This replacement does not change the element type,
- // so the resultant load/store is still valid.
- U->set(NewV);
- } else if (isa<Instruction>(U->getUser())) {
- // Otherwise, replaces the use with generic(NewV).
- // TODO: Some optimization opportunities are missed. For example, in
- // %0 = icmp eq float* %p, %q
- // if both p and q are inferred to be shared, we can rewrite %0 as
- // %0 = icmp eq float addrspace(3)* %new_p, %new_q
- // instead of currently
- // %generic_p = addrspacecast float addrspace(3)* %new_p to float*
- // %generic_q = addrspacecast float addrspace(3)* %new_q to float*
- // %0 = icmp eq float* %generic_p, %generic_q
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- BasicBlock::iterator InsertPos = std::next(I->getIterator());
- while (isa<PHINode>(InsertPos))
- ++InsertPos;
- U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
- } else {
- U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
- V->getType()));
- }
- }
- }
- if (V->use_empty())
- RecursivelyDeleteTriviallyDeadInstructions(V);
- }
-
- return true;
-}
-
-FunctionPass *llvm::createNVPTXInferAddressSpacesPass() {
- return new NVPTXInferAddressSpaces();
-}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 7f89742..da563f0 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "NVPTX.h"
#include "NVPTXInstrInfo.h"
+#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -38,7 +38,7 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
- if (DestRC->getSize() != SrcRC->getSize())
+ if (RegInfo.getRegSizeInBits(*DestRC) != RegInfo.getRegSizeInBits(*SrcRC))
report_fatal_error("Copy one register into another with a different width");
unsigned Op;
@@ -52,6 +52,11 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else if (DestRC == &NVPTX::Int64RegsRegClass) {
Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr
: NVPTX::BITCONVERT_64_F2I);
+ } else if (DestRC == &NVPTX::Float16RegsRegClass) {
+ Op = (SrcRC == &NVPTX::Float16RegsRegClass ? NVPTX::FMOV16rr
+ : NVPTX::BITCONVERT_16_I2F);
+ } else if (DestRC == &NVPTX::Float16x2RegsRegClass) {
+ Op = NVPTX::IMOV32rr;
} else if (DestRC == &NVPTX::Float32RegsRegClass) {
Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
: NVPTX::BITCONVERT_32_I2F);
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 0fbb044..b5b5ea1 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1,2807 +1,3165 @@
-//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the PTX instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-include "NVPTXInstrFormats.td"
-
-// A NOP instruction
-let hasSideEffects = 0 in {
- def NOP : NVPTXInst<(outs), (ins), "", []>;
-}
-
-// List of vector specific properties
-def isVecLD : VecInstTypeEnum<1>;
-def isVecST : VecInstTypeEnum<2>;
-def isVecBuild : VecInstTypeEnum<3>;
-def isVecShuffle : VecInstTypeEnum<4>;
-def isVecExtract : VecInstTypeEnum<5>;
-def isVecInsert : VecInstTypeEnum<6>;
-def isVecDest : VecInstTypeEnum<7>;
-def isVecOther : VecInstTypeEnum<15>;
-
-//===----------------------------------------------------------------------===//
-// NVPTX Operand Definitions.
-//===----------------------------------------------------------------------===//
-
-def brtarget : Operand<OtherVT>;
-
-// CVT conversion modes
-// These must match the enum in NVPTX.h
-def CvtNONE : PatLeaf<(i32 0x0)>;
-def CvtRNI : PatLeaf<(i32 0x1)>;
-def CvtRZI : PatLeaf<(i32 0x2)>;
-def CvtRMI : PatLeaf<(i32 0x3)>;
-def CvtRPI : PatLeaf<(i32 0x4)>;
-def CvtRN : PatLeaf<(i32 0x5)>;
-def CvtRZ : PatLeaf<(i32 0x6)>;
-def CvtRM : PatLeaf<(i32 0x7)>;
-def CvtRP : PatLeaf<(i32 0x8)>;
-
-def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
-def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
-def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
-def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
-def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
-def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
-def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
-def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
-def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
-
-def CvtSAT : PatLeaf<(i32 0x20)>;
-def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
-
-def CvtMode : Operand<i32> {
- let PrintMethod = "printCvtMode";
-}
-
-// Compare modes
-// These must match the enum in NVPTX.h
-def CmpEQ : PatLeaf<(i32 0)>;
-def CmpNE : PatLeaf<(i32 1)>;
-def CmpLT : PatLeaf<(i32 2)>;
-def CmpLE : PatLeaf<(i32 3)>;
-def CmpGT : PatLeaf<(i32 4)>;
-def CmpGE : PatLeaf<(i32 5)>;
-def CmpEQU : PatLeaf<(i32 10)>;
-def CmpNEU : PatLeaf<(i32 11)>;
-def CmpLTU : PatLeaf<(i32 12)>;
-def CmpLEU : PatLeaf<(i32 13)>;
-def CmpGTU : PatLeaf<(i32 14)>;
-def CmpGEU : PatLeaf<(i32 15)>;
-def CmpNUM : PatLeaf<(i32 16)>;
-def CmpNAN : PatLeaf<(i32 17)>;
-
-def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
-def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
-def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
-def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
-def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
-def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
-def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
-def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
-def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
-def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
-def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
-def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
-def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
-def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
-
-def CmpMode : Operand<i32> {
- let PrintMethod = "printCmpMode";
-}
-
-//===----------------------------------------------------------------------===//
-// NVPTX Instruction Predicate Definitions
-//===----------------------------------------------------------------------===//
-
-
-def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
-def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
-def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
-def useAtomRedG32forGen32 :
- Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
-def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
-def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
-def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
-def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
-def useAtomRedG64forGen64 :
- Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
-def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
-def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
-def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
-def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
-def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
-def hasVote : Predicate<"Subtarget->hasVote()">;
-def hasDouble : Predicate<"Subtarget->hasDouble()">;
-def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
-def hasLDG : Predicate<"Subtarget->hasLDG()">;
-def hasLDU : Predicate<"Subtarget->hasLDU()">;
-def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
-
-def doF32FTZ : Predicate<"useF32FTZ()">;
-def doNoF32FTZ : Predicate<"!useF32FTZ()">;
-
-def doMulWide : Predicate<"doMulWide">;
-
-def allowFMA : Predicate<"allowFMA()">;
-def noFMA : Predicate<"!allowFMA()">;
-
-def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
-def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
-
-def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
-def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
-
-def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
-def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
-
-def true : Predicate<"true">;
-
-def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
-
-
-//===----------------------------------------------------------------------===//
-// Some Common Instruction Class Templates
-//===----------------------------------------------------------------------===//
-
-// Template for instructions which take three int64, int32, or int16 args.
-// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
-multiclass I3<string OpcStr, SDNode OpNode> {
- def i64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
- def i64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def i16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
- def i16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
-}
-
-// Template for instructions which take 3 int32 args. The instructions are
-// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
-multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
-}
-
-// Template for instructions which take three fp64 or fp32 args. The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
-//
-// Also defines ftz (flush subnormal inputs and results to sign-preserving
-// zero) variants for fp32 functions.
-//
-// This multiclass should be used for nodes that cannot be folded into FMAs.
-// For nodes that can be folded into FMAs (i.e. adds and muls), use
-// F3_fma_component.
-multiclass F3<string OpcStr, SDNode OpNode> {
- def f64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
- def f64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
- def f32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ]>;
- def f32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ]>;
- def f32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
- def f32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
-}
-
-// Template for instructions which take three fp64 or fp32 args. The
-// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
-//
-// Also defines ftz (flush subnormal inputs and results to sign-preserving
-// zero) variants for fp32 functions.
-//
-// This multiclass should be used for nodes that can be folded to make fma ops.
-// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
-// just like the non ".rn" op, but prevents ptxas from creating FMAs.
-multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
- def f64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[allowFMA]>;
- def f64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
- def f32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA, doF32FTZ]>;
- def f32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[allowFMA]>;
- def f32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[allowFMA]>;
-
- // These have strange names so we don't perturb existing MIR tests.
- def _rnf64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
- Requires<[noFMA]>;
- def _rnf64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
- def _rnf32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def _rnf32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA, doF32FTZ]>;
- def _rnf32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[noFMA]>;
- def _rnf32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
- Requires<[noFMA]>;
-}
-
-// Template for operations which take two f32 or f64 operands. Provides three
-// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
-// subnormal inputs and results to zero).
-multiclass F2<string OpcStr, SDNode OpNode> {
- def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
- !strconcat(OpcStr, ".f64 \t$dst, $a;"),
- [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
- def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
- Requires<[doF32FTZ]>;
- def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
- !strconcat(OpcStr, ".f32 \t$dst, $a;"),
- [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
-}
-
-//===----------------------------------------------------------------------===//
-// NVPTX Instructions.
-//===----------------------------------------------------------------------===//
-
-//-----------------------------------
-// Type Conversion
-//-----------------------------------
-
-let hasSideEffects = 0 in {
- // Generate a cvt to the given type from all possible types. Each instance
- // takes a CvtMode immediate that defines the conversion mode to use. It can
- // be CvtNONE to omit a conversion mode.
- multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
- def _s8 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s8\t$dst, $src;"), []>;
- def _u8 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u8\t$dst, $src;"), []>;
- def _s16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s16\t$dst, $src;"), []>;
- def _u16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u16\t$dst, $src;"), []>;
- def _f16 :
- NVPTXInst<(outs RC:$dst),
- (ins Int16Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f16\t$dst, $src;"), []>;
- def _s32 :
- NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s32\t$dst, $src;"), []>;
- def _u32 :
- NVPTXInst<(outs RC:$dst),
- (ins Int32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u32\t$dst, $src;"), []>;
- def _s64 :
- NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".s64\t$dst, $src;"), []>;
- def _u64 :
- NVPTXInst<(outs RC:$dst),
- (ins Int64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".u64\t$dst, $src;"), []>;
- def _f32 :
- NVPTXInst<(outs RC:$dst),
- (ins Float32Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f32\t$dst, $src;"), []>;
- def _f64 :
- NVPTXInst<(outs RC:$dst),
- (ins Float64Regs:$src, CvtMode:$mode),
- !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
- FromName, ".f64\t$dst, $src;"), []>;
- }
-
- // Generate cvts from all types to all types.
- defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
- defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
- defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
- defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
- defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
- defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
- defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
- defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
- defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
- defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
- defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
-
- // These cvts are different from those above: The source and dest registers
- // are of the same type.
- def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.s16.s8 \t$dst, $src;", []>;
- def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s8 \t$dst, $src;", []>;
- def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "cvt.s32.s16 \t$dst, $src;", []>;
- def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s8 \t$dst, $src;", []>;
- def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s16 \t$dst, $src;", []>;
- def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "cvt.s64.s32 \t$dst, $src;", []>;
-}
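The CVT_INREG forms sign-extend a narrow value held in the low bits of a register back to the full register width (the job of ISD::SIGN_EXTEND_INREG). In C++ terms, a minimal sketch:

#include <cstdint>

// cvt.s32.s8 %r, %r: sign-extend bits [7:0] of a 32-bit register in place.
int32_t cvt_inreg_s32_s8(int32_t X) {
  return static_cast<int32_t>(static_cast<int8_t>(X));
}

// cvt.s64.s32 %rd, %rd: sign-extend bits [31:0] of a 64-bit register.
int64_t cvt_inreg_s64_s32(int64_t X) {
  return static_cast<int64_t>(static_cast<int32_t>(X));
}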
-
-//-----------------------------------
-// Integer Arithmetic
-//-----------------------------------
-
-// Template for xor masquerading as int1 arithmetic.
-multiclass ADD_SUB_i1<SDNode OpNode> {
- def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
- def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- "xor.pred \t$dst, $a, $b;",
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
-}
-
-// int1 addition and subtraction are both just xor.
-defm ADD_i1 : ADD_SUB_i1<add>;
-defm SUB_i1 : ADD_SUB_i1<sub>;
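Addition and subtraction modulo 2 both coincide with exclusive-or, which is why the two defms above expand to the same xor.pred instruction. A quick exhaustive check:

#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B) {
      assert(((A + B) & 1) == (A ^ B)); // i1 add == xor
      assert(((A - B) & 1) == (A ^ B)); // i1 sub == xor
    }
  return 0;
}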
-
- // int16, int32, and int64 signed addition. Since nvptx is 2's complement, we
-// also use these for unsigned arithmetic.
-defm ADD : I3<"add.s", add>;
-defm SUB : I3<"sub.s", sub>;
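On a two's-complement machine the signed and unsigned sum of the same bit patterns is itself the same bit pattern, so a single add.s/sub.s instruction serves both interpretations. A small illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  int32_t SA = -7, SB = 100;
  uint32_t UA, UB;
  std::memcpy(&UA, &SA, sizeof(UA)); // reinterpret the same bits as unsigned
  std::memcpy(&UB, &SB, sizeof(UB));
  int32_t SSum = SA + SB;            // "add.s32"
  uint32_t USum = UA + UB;           // the would-be "add.u32"
  assert(std::memcmp(&SSum, &USum, sizeof(SSum)) == 0); // identical bits
  return 0;
}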
-
-// int32 addition and subtraction with carry-out.
-// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
-defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
-defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
-
-// int32 addition and subtraction with carry-in and carry-out.
-defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
-defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
-
-defm MULT : I3<"mul.lo.s", mul>;
-
-defm MULTHS : I3<"mul.hi.s", mulhs>;
-defm MULTHU : I3<"mul.hi.u", mulhu>;
-
-defm SDIV : I3<"div.s", sdiv>;
-defm UDIV : I3<"div.u", udiv>;
-
- // The ri versions of rem.s and rem.u won't be selected; DAGCombiner::visitSREM
- // lowers rem-by-immediate before instruction selection.
-defm SREM : I3<"rem.s", srem>;
-defm UREM : I3<"rem.u", urem>;
-
-
-//
-// Wide multiplication
-//
-def MULWIDES64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-def MULWIDES64Imm64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.s32 \t$dst, $a, $b;", []>;
-
-def MULWIDEU64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-def MULWIDEU64Imm64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
- "mul.wide.u32 \t$dst, $a, $b;", []>;
-
-def MULWIDES32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-def MULWIDES32Imm32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.s16 \t$dst, $a, $b;", []>;
-
-def MULWIDEU32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-def MULWIDEU32Imm32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- "mul.wide.u16 \t$dst, $a, $b;", []>;
-
-def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
-def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
-def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
-
-// Matchers for signed, unsigned mul.wide ISD nodes.
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
- (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
- (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
- (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
- (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
- Requires<[doMulWide]>;
-
-// Predicates used for converting some patterns to mul.wide.
-def SInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(32);
-}]>;
-
-def UInt32Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(32);
-}]>;
-
-def SInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isSignedIntN(16);
-}]>;
-
-def UInt16Const : PatLeaf<(imm), [{
- const APInt &v = N->getAPIntValue();
- return v.isIntN(16);
-}]>;
-
-def Int5Const : PatLeaf<(imm), [{
- // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(32);
-}]>;
-
-def Int4Const : PatLeaf<(imm), [{
- // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
- const APInt &v = N->getAPIntValue();
- return v.sge(0) && v.slt(16);
-}]>;
-
-def SHL2MUL32 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(32, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
-}]>;
-
-def SHL2MUL16 : SDNodeXForm<imm, [{
- const APInt &v = N->getAPIntValue();
- APInt temp(16, 1);
- return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
-}]>;
-
-// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
-def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
- (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
-def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
- (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
- (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
-def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
- (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
- Requires<[doMulWide]>;
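These patterns rest on the identity x << v == x * 2^v: after sign- or zero-extension, a left shift by a small constant is a widening multiply by a power of two, which SHL2MUL32/SHL2MUL16 materialize as the mul.wide immediate. A sketch of the identity:

#include <cassert>
#include <cstdint>

int main() {
  int32_t X = -12345;
  unsigned V = 5;                                   // 0 <= V < 32 (Int5Const)
  int64_t Shifted = static_cast<int64_t>(X) << V;   // shl (sext x), v
  int64_t Widened = static_cast<int64_t>(X) * (int64_t{1} << V); // mul.wide
  assert(Shifted == Widened);
  return 0;
}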
-
-// Convert "sign/zero-extend then multiply" to mul.wide.
-def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
- (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
- (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
- (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
- (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
- (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
- (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
- Requires<[doMulWide]>;
-
-def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
- (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
- Requires<[doMulWide]>;
-def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
- (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
- Requires<[doMulWide]>;
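mul.wide produces the full double-width product of two narrow operands, so a multiply whose operands are both extensions from the narrow type can skip the wide mul.lo entirely. The C++ equivalents of the signed and unsigned cases:

#include <cassert>
#include <cstdint>

int main() {
  // Signed: the i32 product would overflow, the widening product does not.
  int32_t A = 100000, B = 90000;
  int64_t Wide = static_cast<int64_t>(A) * static_cast<int64_t>(B);
  assert(Wide == 9000000000LL);                 // mul.wide.s32-style result

  // Unsigned 16 -> 32.
  uint16_t UA = 60000, UB = 50000;
  uint32_t UWide = static_cast<uint32_t>(UA) * static_cast<uint32_t>(UB);
  assert(UWide == 3000000000u);                 // mul.wide.u16-style result
  return 0;
}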
-
-//
-// Integer multiply-add
-//
-def SDTIMAD :
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
- SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
-def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
-
-def MAD16rrr :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
-def MAD16rri :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
-def MAD16rir :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
-def MAD16rii :
- NVPTXInst<(outs Int16Regs:$dst),
- (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
- "mad.lo.s16 \t$dst, $a, $b, $c;",
- [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD32rrr :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
-def MAD32rri :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
-def MAD32rir :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
-def MAD32rii :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
- "mad.lo.s32 \t$dst, $a, $b, $c;",
- [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
-
-def MAD64rrr :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
-def MAD64rri :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
-def MAD64rir :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
-def MAD64rii :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
- "mad.lo.s64 \t$dst, $a, $b, $c;",
- [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
-
-def INEG16 :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "neg.s16 \t$dst, $src;",
- [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
-def INEG32 :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "neg.s32 \t$dst, $src;",
- [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
-def INEG64 :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "neg.s64 \t$dst, $src;",
- [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
-
-//-----------------------------------
-// Floating Point Arithmetic
-//-----------------------------------
-
-// Constant 1.0f
-def FloatConst1 : PatLeaf<(fpimm), [{
- return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
- N->getValueAPF().convertToFloat() == 1.0f;
-}]>;
-// Constant 1.0 (double)
-def DoubleConst1 : PatLeaf<(fpimm), [{
- return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
- N->getValueAPF().convertToDouble() == 1.0;
-}]>;
-
-defm FADD : F3_fma_component<"add", fadd>;
-defm FSUB : F3_fma_component<"sub", fsub>;
-defm FMUL : F3_fma_component<"mul", fmul>;
-
-defm FMIN : F3<"min", fminnum>;
-defm FMAX : F3<"max", fmaxnum>;
-
-defm FABS : F2<"abs", fabs>;
-defm FNEG : F2<"neg", fneg>;
-defm FSQRT : F2<"sqrt.rn", fsqrt>;
-
-//
-// F64 division
-//
-def FDIV641r :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins f64imm:$a, Float64Regs:$b),
- "rcp.rn.f64 \t$dst, $b;",
- [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
-def FDIV64rr :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, Float64Regs:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
-def FDIV64ri :
- NVPTXInst<(outs Float64Regs:$dst),
- (ins Float64Regs:$a, f64imm:$b),
- "div.rn.f64 \t$dst, $a, $b;",
- [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
-
-//
-// F32 Approximate reciprocal
-//
-def FDIV321r_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV321r :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-//
-// F32 Approximate division
-//
-def FDIV32approxrr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX, doF32FTZ]>;
-def FDIV32approxrr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-def FDIV32approxri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.approx.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_APPROX]>;
-//
-// F32 Semi-accurate reciprocal
-//
-// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
-//
-def FDIV321r_approx_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV321r_approx :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.approx.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-//
-// F32 Semi-accurate division
-//
-def FDIV32rr_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32ri_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL, doF32FTZ]>;
-def FDIV32rr :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-def FDIV32ri :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.full.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[do_DIVF32_FULL]>;
-//
-// F32 Accurate reciprocal
-//
-def FDIV321r_prec_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.ftz.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20, doF32FTZ]>;
-def FDIV321r_prec :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins f32imm:$a, Float32Regs:$b),
- "rcp.rn.f32 \t$dst, $b;",
- [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
-//
-// F32 Accurate division
-//
-def FDIV32rr_prec_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32ri_prec_ftz :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.ftz.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[doF32FTZ, reqPTX20]>;
-def FDIV32rr_prec :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, Float32Regs:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
- Requires<[reqPTX20]>;
-def FDIV32ri_prec :
- NVPTXInst<(outs Float32Regs:$dst),
- (ins Float32Regs:$a, f32imm:$b),
- "div.rn.f32 \t$dst, $a, $b;",
- [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
- Requires<[reqPTX20]>;
-
-//
-// F32 rsqrt
-//
-
-def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
- "rsqrt.approx.f32 \t$dst, $b;", []>;
-
-// Convert 1.0f/sqrt(x) to rsqrt.approx.f32. (There is an rsqrt.approx.f64, but
-// it's emulated in software.)
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
- (RSQRTF32approx1r Float32Regs:$b)>,
- Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
-
-multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
- def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
- Requires<[Pred]>;
- def rri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, ImmCls:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
- Requires<[Pred]>;
- def rir : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, RC:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
- Requires<[Pred]>;
- def rii : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, ImmCls:$c),
- !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
- [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
- Requires<[Pred]>;
-}
-
-defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
-defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
-defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
-
-// sin/cos
-def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "sin.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>;
-def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "cos.approx.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
-
-// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
-// i.e. "poor man's fmod()"
-
-// frem - f32 FTZ
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
- (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
- Float32Regs:$y))>,
- Requires<[doF32FTZ]>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
- (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
- fpimm:$y))>,
- Requires<[doF32FTZ]>;
-
-// frem - f32
-def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
- (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
- (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
- Float32Regs:$y))>;
-def : Pat<(frem Float32Regs:$x, fpimm:$y),
- (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
- (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
- fpimm:$y))>;
-
-// frem - f64
-def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
- (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
- (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
- Float64Regs:$y))>;
-def : Pat<(frem Float64Regs:$x, fpimm:$y),
- (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
- (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
- fpimm:$y))>;
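Written out, the expansion above computes x - floor(x/y) * y, since CvtRMI is a round-toward-minus-infinity conversion. Note that this agrees with fmod when x/y is nonnegative but yields the floored-division remainder when the signs differ, which is presumably why the comment calls it a poor man's fmod. A sketch:

#include <cassert>
#include <cmath>

// x - floor(x/y) * y, the expansion used by the frem patterns above
// (CvtRMI = convert, rounding toward minus infinity, i.e. floor).
float frem_lowered(float X, float Y) {
  return X - std::floor(X / Y) * Y;
}

int main() {
  assert(frem_lowered(7.5f, 2.0f) == std::fmod(7.5f, 2.0f)); // both 1.5
  assert(frem_lowered(-7.5f, 2.0f) == 0.5f);  // fmod would give -1.5
  return 0;
}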
-
-//-----------------------------------
-// Bitwise operations
-//-----------------------------------
-
- // Template for three-arg bitwise operations. Takes three args and creates .b16,
-// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
-multiclass BITWISE<string OpcStr, SDNode OpNode> {
- def b1rr :
- NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
- def b1ri :
- NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
- !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
- [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
- def b16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
- def b16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
- !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
- def b32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def b32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
- def b64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
- def b64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
- !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
-}
-
-defm OR : BITWISE<"or", or>;
-defm AND : BITWISE<"and", and>;
-defm XOR : BITWISE<"xor", xor>;
-
-def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
- "not.pred \t$dst, $src;",
- [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
-def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "not.b16 \t$dst, $src;",
- [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
-def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
- "not.b32 \t$dst, $src;",
- [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
-def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
- "not.b64 \t$dst, $src;",
- [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
-
-// Template for left/right shifts. Takes three operands,
-// [dest (reg), src (reg), shift (reg or imm)].
-// dest and src may be int64, int32, or int16, but shift is always int32.
-//
-// This template also defines a 32-bit shift (imm, imm) instruction.
-multiclass SHIFT<string OpcStr, SDNode OpNode> {
- def i64rr :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
- def i64ri :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
- [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
- def i32rr :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
- def i32ri :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
- def i32ii :
- NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
- !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
- [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
- def i16rr :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
- def i16ri :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
- !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
- [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
-}
-
-defm SHL : SHIFT<"shl.b", shl>;
-defm SRA : SHIFT<"shr.s", sra>;
-defm SRL : SHIFT<"shr.u", srl>;
-
-//
-// Rotate: Use ptx shf instruction if available.
-//
-
- // 32-bit r2 = rotl r1, n
-// =>
-// r2 = shf.l r1, r1, n
-def ROTL32imm_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTL32reg_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
-
- // 32-bit r2 = rotr r1, n
-// =>
-// r2 = shf.r r1, r1, n
-def ROTR32imm_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
- Requires<[hasHWROT32]>;
-
-def ROTR32reg_hw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[hasHWROT32]>;
-
-// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
-def ROT32imm_sw :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- "shl.b32 \t%lhs, $src, $amt1;\n\t"
- "shr.b32 \t%rhs, $src, $amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- []>;
-
-def SUB_FRM_32 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
- Requires<[noHWROT32]>;
-def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
- (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
- Requires<[noHWROT32]>;
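The software expansion uses rotl(x, n) == (x << n) | (x >> (32 - n)); the PTX emits add.u32 rather than or, which is equivalent because the two shifted halves occupy disjoint bits. In C++:

#include <cassert>
#include <cstdint>

// The ROT32imm_sw expansion; Amt2 must equal 32 - Amt1, with 0 < Amt1 < 32.
uint32_t rotl32_sw(uint32_t X, unsigned Amt1, unsigned Amt2) {
  uint32_t Lhs = X << Amt1;  // shl.b32
  uint32_t Rhs = X >> Amt2;  // shr.b32
  return Lhs + Rhs;          // add.u32 == or here: the halves are disjoint
}

int main() {
  assert(rotl32_sw(0x80000001u, 1, 31) == 0x00000003u);
  assert(rotl32_sw(0x12345678u, 8, 24) == 0x34567812u);
  return 0;
}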
-
-// 32-bit software rotate left by register.
-def ROTL32reg_sw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- ".reg .b32 %amt2;\n\t"
- "shl.b32 \t%lhs, $src, $amt;\n\t"
- "sub.s32 \t%amt2, 32, $amt;\n\t"
- "shr.b32 \t%rhs, $src, %amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-// 32-bit software rotate right by register.
-def ROTR32reg_sw :
- NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b32 %lhs;\n\t"
- ".reg .b32 %rhs;\n\t"
- ".reg .b32 %amt2;\n\t"
- "shr.b32 \t%lhs, $src, $amt;\n\t"
- "sub.s32 \t%amt2, 32, $amt;\n\t"
- "shl.b32 \t%rhs, $src, %amt2;\n\t"
- "add.u32 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
- Requires<[noHWROT32]>;
-
-// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
-def ROT64imm_sw :
- NVPTXInst<(outs Int64Regs:$dst),
- (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- "shl.b64 \t%lhs, $src, $amt1;\n\t"
- "shr.b64 \t%rhs, $src, $amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- []>;
-
-def SUB_FRM_64 : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
-}]>;
-
-def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
- (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
-def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
- (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
-
-// 64-bit software rotate left by register.
-def ROTL64reg_sw :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- ".reg .u32 %amt2;\n\t"
- "shl.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
- "shr.b64 \t%rhs, $src, %amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
-
-def ROTR64reg_sw :
- NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
- "{{\n\t"
- ".reg .b64 %lhs;\n\t"
- ".reg .b64 %rhs;\n\t"
- ".reg .u32 %amt2;\n\t"
- "shr.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
- "shl.b64 \t%rhs, $src, %amt2;\n\t"
- "add.u64 \t$dst, %lhs, %rhs;\n\t"
- "}}",
- [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
-
-//
-// Funnnel shift in clamp mode
-//
-
-// Create SDNodes so they can be used in the DAG code, e.g.
-// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
-def SDTIntShiftDOp :
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
- SDTCisInt<0>, SDTCisInt<3>]>;
-def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
-def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
-
-def FUNSHFLCLAMP :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
-
-def FUNSHFRCLAMP :
- NVPTXInst<(outs Int32Regs:$dst),
- (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
- "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
- [(set Int32Regs:$dst,
- (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
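As I read the PTX spec (an assumption on my part, not something this file states), shf.l.clamp shifts the 64-bit concatenation {hi, lo} left by the amount clamped to 32 and returns the high 32 bits; this is the primitive LowerShiftLeftParts/LowerShiftRightParts build 64-bit shifts from. A C++ model:

#include <cassert>
#include <cstdint>

// Model of shf.l.clamp.b32 d, lo, hi, amt: shift {hi:lo} left by
// min(amt, 32) and keep the high 32 bits of the result.
uint32_t shf_l_clamp(uint32_t Lo, uint32_t Hi, uint32_t Amt) {
  uint32_t N = Amt < 32 ? Amt : 32;                // clamp mode
  uint64_t Wide = (static_cast<uint64_t>(Hi) << 32) | Lo;
  return static_cast<uint32_t>((Wide << N) >> 32);
}

int main() {
  // The new high word mixes the old high and low words.
  assert(shf_l_clamp(0xDEADBEEFu, 0x00000001u, 4) == 0x1Du);
  // Amounts >= 32 clamp to 32: the high word becomes the old low word.
  assert(shf_l_clamp(0xDEADBEEFu, 0x00000001u, 50) == 0xDEADBEEFu);
  return 0;
}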
-
-//
-// BFE - bit-field extract
-//
-
-// Template for BFE instructions. Takes four args,
-// [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
-// Start may be an imm only if end is also an imm. FIXME: Is this a
-// restriction in PTX?
-//
-// dest and src may be int32 or int64, but start and end are always int32.
-multiclass BFE<string TyStr, RegisterClass RC> {
- def rrr
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
- !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
- def rri
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, Int32Regs:$b, i32imm:$c),
- !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
- def rii
- : NVPTXInst<(outs RC:$d),
- (ins RC:$a, i32imm:$b, i32imm:$c),
- !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
-}
-
-let hasSideEffects = 0 in {
- defm BFE_S32 : BFE<"s32", Int32Regs>;
- defm BFE_U32 : BFE<"u32", Int32Regs>;
- defm BFE_S64 : BFE<"s64", Int64Regs>;
- defm BFE_U64 : BFE<"u64", Int64Regs>;
-}
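bfe extracts a bit field given a start position and a field length: shift right by the start, then mask to the length (the signed forms instead sign-extend from the field's top bit). A sketch of the u32 form, ignoring the spec's clamping edge cases:

#include <cassert>
#include <cstdint>

// bfe.u32 d, a, b, c: extract the C-bit field of A starting at bit B
// (sketch valid for 0 < C < 32; PTX also clamps out-of-range positions).
uint32_t bfe_u32(uint32_t A, uint32_t B, uint32_t C) {
  return (A >> B) & ((1u << C) - 1u);
}

int main() {
  assert(bfe_u32(0xABCD1234u, 8, 8) == 0x12u);      // second byte
  assert(bfe_u32(0xABCD1234u, 16, 16) == 0xABCDu);  // high halfword
  return 0;
}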
-
-//-----------------------------------
-// Comparison instructions (setp, set)
-//-----------------------------------
-
-// FIXME: This doesn't cover versions of set and setp that combine with a
-// boolean predicate, e.g. setp.eq.and.b16.
-
-let hasSideEffects = 0 in {
- multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr :
- NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- "\t$dst, $a, $b;"), []>;
- def ri :
- NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- "\t$dst, $a, $b;"), []>;
- def ir :
- NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
- "\t$dst, $a, $b;"), []>;
- }
-}
-
-defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
-defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
-defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
-defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
-defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
-defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
-defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
-defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
-defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
-defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
-defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
-
-// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
-// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
-// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
-
-let hasSideEffects = 0 in {
- multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
- def ri : NVPTXInst<(outs Int32Regs:$dst),
- (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
- def ir : NVPTXInst<(outs Int32Regs:$dst),
- (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
- !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
- }
-}
-
-defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
-defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
-defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
-defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
-defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
-defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
-defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
-defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
-defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
-defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
-defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
-
-//-----------------------------------
-// Selection instructions (selp)
-//-----------------------------------
-
-// FIXME: Missing slct
-
-// selp instructions that don't have any pattern matches; we explicitly use
-// them within this file.
-let hasSideEffects = 0 in {
- multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
- def rr : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ri : NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ir : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- def ii : NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
- }
-
- multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
- SDNode ImmNode> {
- def rr :
- NVPTXInst<(outs RC:$dst),
- (ins RC:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
- def ri :
- NVPTXInst<(outs RC:$dst),
- (ins RC:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
- def ir :
- NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, RC:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
- def ii :
- NVPTXInst<(outs RC:$dst),
- (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
- !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
- [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
- }
-}
-
-// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
-// good.
-defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
-defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
-defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
-defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
-defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
-defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
-defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
-defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
-defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
-defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
-defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
-
-//-----------------------------------
-// Data Movement (Load / Store, Move)
-//-----------------------------------
-
-def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
- [SDNPWantRoot]>;
-def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
- [SDNPWantRoot]>;
-
-def MEMri : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops Int32Regs, i32imm);
-}
-def MEMri64 : Operand<i64> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops Int64Regs, i64imm);
-}
-
-def imem : Operand<iPTR> {
- let PrintMethod = "printOperand";
-}
-
-def imemAny : Operand<iPTRAny> {
- let PrintMethod = "printOperand";
-}
-
-def LdStCode : Operand<i32> {
- let PrintMethod = "printLdStCode";
-}
-
-def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
-def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
-
-// Load a memory address into a u32 or u64 register.
-def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
- "mov.u32 \t$dst, $a;",
- [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
- "mov.u64 \t$dst, $a;",
- [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
-
-// Get pointer to local stack.
-let hasSideEffects = 0 in {
- def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
- "mov.u32 \t$d, __local_depot$num;", []>;
- def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
- "mov.u64 \t$d, __local_depot$num;", []>;
-}
-
-
- // copyPhysReg is hard-coded in NVPTXInstrInfo.cpp
-let IsSimpleMove=1, hasSideEffects=0 in {
- def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
- "mov.pred \t$dst, $sss;", []>;
- def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
- "mov.u16 \t$dst, $sss;", []>;
- def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
- "mov.u32 \t$dst, $sss;", []>;
- def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
- "mov.u64 \t$dst, $sss;", []>;
-
- def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
- "mov.f32 \t$dst, $src;", []>;
- def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
- "mov.f64 \t$dst, $src;", []>;
-}
-
-def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
- "mov.pred \t$dst, $src;",
- [(set Int1Regs:$dst, imm:$src)]>;
-def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
- "mov.u16 \t$dst, $src;",
- [(set Int16Regs:$dst, imm:$src)]>;
-def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
- "mov.u32 \t$dst, $src;",
- [(set Int32Regs:$dst, imm:$src)]>;
-def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
- "mov.u64 \t$dst, $src;",
- [(set Int64Regs:$dst, imm:$src)]>;
-
-def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
- "mov.f32 \t$dst, $src;",
- [(set Float32Regs:$dst, fpimm:$src)]>;
-def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
- "mov.f64 \t$dst, $src;",
- [(set Float64Regs:$dst, fpimm:$src)]>;
-
-def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
-
-//---- Copy Frame Index ----
-def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
- "add.u32 \t$dst, ${addr:add};",
- [(set Int32Regs:$dst, ADDRri:$addr)]>;
-def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
- "add.u64 \t$dst, ${addr:add};",
- [(set Int64Regs:$dst, ADDRri64:$addr)]>;
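-
-// For illustration only: a frame-index address selected through LEA_ADDRi
-// should print as roughly "add.u32 %r1, %SP, 4;" (register and offset are
-// invented), i.e. the stack-pointer base plus a constant offset.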
-
-//-----------------------------------
-// Comparison and Selection
-//-----------------------------------
-
-multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
- Instruction setp_16rr,
- Instruction setp_16ri,
- Instruction setp_16ir,
- Instruction setp_32rr,
- Instruction setp_32ri,
- Instruction setp_32ir,
- Instruction setp_64rr,
- Instruction setp_64ri,
- Instruction setp_64ir,
- Instruction set_16rr,
- Instruction set_16ri,
- Instruction set_16ir,
- Instruction set_32rr,
- Instruction set_32ri,
- Instruction set_32ir,
- Instruction set_64rr,
- Instruction set_64ri,
- Instruction set_64ir> {
- // i16 -> pred
- def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
- (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
- (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
- (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
- // i32 -> pred
- def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
- (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
- (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
- (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
- // i64 -> pred
- def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
- (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
- (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
- def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
- (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
-
- // i16 -> i32
- def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
- (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
- (set_16ri Int16Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
- (set_16ir imm:$a, Int16Regs:$b, Mode)>;
- // i32 -> i32
- def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
- (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
- (set_32ri Int32Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
- (set_32ir imm:$a, Int32Regs:$b, Mode)>;
- // i64 -> i32
- def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
- (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
- (set_64ri Int64Regs:$a, imm:$b, Mode)>;
- def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
- (set_64ir imm:$a, Int64Regs:$b, Mode)>;
-}
-
-multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
- : ISET_FORMAT<OpNode, Mode,
- SETP_s16rr, SETP_s16ri, SETP_s16ir,
- SETP_s32rr, SETP_s32ri, SETP_s32ir,
- SETP_s64rr, SETP_s64ri, SETP_s64ir,
- SET_s16rr, SET_s16ri, SET_s16ir,
- SET_s32rr, SET_s32ri, SET_s32ir,
- SET_s64rr, SET_s64ri, SET_s64ir> {
- // TableGen doesn't like empty multiclasses.
- def : PatLeaf<(i32 0)>;
-}
-
-multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
- : ISET_FORMAT<OpNode, Mode,
- SETP_u16rr, SETP_u16ri, SETP_u16ir,
- SETP_u32rr, SETP_u32ri, SETP_u32ir,
- SETP_u64rr, SETP_u64ri, SETP_u64ir,
- SET_u16rr, SET_u16ri, SET_u16ir,
- SET_u32rr, SET_u32ri, SET_u32ir,
- SET_u64rr, SET_u64ri, SET_u64ir> {
- // TableGen doesn't like empty multiclasses.
- def : PatLeaf<(i32 0)>;
-}
-
-defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
-defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
-defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
-defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
-defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
-defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
-defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
-defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
-defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
-defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
-defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
-defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
-
-// i1 compares
-def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
- (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
- (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
-
-def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
- (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
- (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-
-// i1 compare -> i32
-def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
- (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
-def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
-          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
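-
-// For illustration: with the pattern above, (i32 (seteq i1 %a, %b)) should
-// lower to roughly the following PTX (register names are invented):
-//   xor.pred %p3, %p1, %p2;
-//   selp.u32 %r1, 0, -1, %p3;   // equal => %p3 is false => selects -1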
-
-
-
-multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
- // f32 -> pred
- def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
- (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
- (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
- (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
- (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
-
- // f64 -> pred
- def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
- (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
- def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
- (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
- (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
-
- // f32 -> i32
- def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
- (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
- (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
- (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
- (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
- Requires<[doF32FTZ]>;
- def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
- (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
-
- // f64 -> i32
- def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
- (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
- def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
- (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
- def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
- (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
-}
-
-defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
-defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
-defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
-defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
-defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
-defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
-
-defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
-defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
-defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
-defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
-defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
-defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
-
-defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
-defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
-defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
-defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
-defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
-defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
-
-defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
-defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
-
-// FIXME: What is this doing here? Can it be deleted?
-// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
-// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-
-def SDTDeclareParamProfile :
- SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTDeclareScalarParamProfile :
- SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
-def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
-def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
-def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
-def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
-def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
-def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
-def SDTCallValProfile : SDTypeProfile<1, 0, []>;
-def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
-def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
-def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
-def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
-def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
-
-def DeclareParam :
- SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareScalarParam :
- SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRetParam :
- SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def DeclareRet :
- SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LoadParam :
- SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV2 :
- SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def LoadParamV4 :
- SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
- [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
-def PrintCall :
- SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintConvergentCall :
- SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintCallUni :
- SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def PrintConvergentCallUni :
- SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParam :
- SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV2 :
- SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamV4 :
- SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamU32 :
- SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def StoreParamS32 :
- SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgBegin :
- SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArg :
- SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def LastCallArg :
- SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallArgEnd :
- SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVoid :
- SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def Prototype :
- SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def CallVal :
- SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def MoveParam :
- SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
-def StoreRetval :
- SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV2 :
- SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def StoreRetvalV4 :
- SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
- [SDNPHasChain, SDNPSideEffect]>;
-def PseudoUseParam :
- SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def RETURNNode :
- SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
- [SDNPHasChain, SDNPSideEffect]>;
-
-let mayLoad = 1 in {
- class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat(!strconcat("ld.param", opstr),
- "\t$dst, [retval0+$b];"),
- []>;
-
- class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
- !strconcat("ld.param.v2", opstr,
- "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
-
- class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins i32imm:$b),
- !strconcat("ld.param.v4", opstr,
- "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
- []>;
-}
-
-class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
- !strconcat("mov", opstr, "\t$dst, retval$b;"),
- [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
-
-let mayStore = 1 in {
- class StoreParamInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
- !strconcat("st.param", opstr, "\t[param$a+$b], $val;"),
- []>;
-
- class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
- i32imm:$a, i32imm:$b),
- !strconcat("st.param.v2", opstr,
- "\t[param$a+$b], {{$val, $val2}};"),
- []>;
-
- class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
- regclass:$val4, i32imm:$a,
- i32imm:$b),
- !strconcat("st.param.v4", opstr,
- "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
- []>;
-
- class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
- !strconcat("st.param", opstr, "\t[func_retval0+$a], $val;"),
- []>;
-
- class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
- !strconcat("st.param.v2", opstr,
- "\t[func_retval0+$a], {{$val, $val2}};"),
- []>;
-
- class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
- NVPTXInst<(outs),
- (ins regclass:$val, regclass:$val2, regclass:$val3,
- regclass:$val4, i32imm:$a),
- !strconcat("st.param.v4", opstr,
- "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
- []>;
-}
-
-let isCall=1 in {
- multiclass CALL<string OpcStr, SDNode OpNode> {
- def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
- def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
- def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
- def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
- def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
- [(OpNode (i32 4))]>;
- def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
- [(OpNode (i32 5))]>;
- def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
- "retval5), "),
- [(OpNode (i32 6))]>;
- def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
- "retval5, retval6), "),
- [(OpNode (i32 7))]>;
- def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
- !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
- "retval5, retval6, retval7), "),
- [(OpNode (i32 8))]>;
- }
-}
-
-defm Call : CALL<"call", PrintCall>;
-defm CallUni : CALL<"call.uni", PrintCallUni>;
-
-// Convergent call instructions. These are identical to regular calls, except
-// they have the isConvergent bit set.
-let isConvergent=1 in {
- defm ConvergentCall : CALL<"call", PrintConvergentCall>;
- defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
-}
-
-def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
-def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
-def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
-def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
-def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
-def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
-def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
-def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
-def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
-def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
-def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
-def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
-def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
-def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
-def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
-def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
-
-def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
-def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
-
-def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
-def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
-def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
-def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
-def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
-def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
-
-def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
-def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
-def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
-
-def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
-def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
-def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
-def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
-def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
-
-def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
-def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
-def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
-def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
-def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
-def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
-def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
-def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">;
-def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
-def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
-def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
-
-def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
-def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
-def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
-def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
-def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
-
-def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
-def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
-def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
-def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
-
-class CallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a, ",
- [(CallArg (i32 0), regclass:$a)]>;
-
-class LastCallArgInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$a), "$a",
- [(LastCallArg (i32 0), regclass:$a)]>;
-
-def CallArgI64 : CallArgInst<Int64Regs>;
-def CallArgI32 : CallArgInst<Int32Regs>;
-def CallArgI16 : CallArgInst<Int16Regs>;
-def CallArgF64 : CallArgInst<Float64Regs>;
-def CallArgF32 : CallArgInst<Float32Regs>;
-
-def LastCallArgI64 : LastCallArgInst<Int64Regs>;
-def LastCallArgI32 : LastCallArgInst<Int32Regs>;
-def LastCallArgI16 : LastCallArgInst<Int16Regs>;
-def LastCallArgF64 : LastCallArgInst<Float64Regs>;
-def LastCallArgF32 : LastCallArgInst<Float32Regs>;
-
-def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
- [(CallArg (i32 0), (i32 imm:$a))]>;
-def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
- [(LastCallArg (i32 0), (i32 imm:$a))]>;
-
-def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
- [(CallArg (i32 1), (i32 imm:$a))]>;
-def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
- [(LastCallArg (i32 1), (i32 imm:$a))]>;
-
-def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
- [(CallVoid (Wrapper tglobaladdr:$addr))]>;
-def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
- [(CallVoid Int32Regs:$addr)]>;
-def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
- [(CallVoid Int64Regs:$addr)]>;
-def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
- [(Prototype (i32 imm:$val))]>;
-
-def DeclareRetMemInst :
- NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
- ".param .align $align .b8 retval$num[$size];",
- [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetScalarInst :
- NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".param .b$size retval$num;",
- [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
-def DeclareRetRegInst :
- NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
- ".reg .b$size retval$num;",
- [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
-
-def DeclareParamInst :
- NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
- ".param .align $align .b8 param$a[$size];",
- [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
-def DeclareScalarParamInst :
- NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".param .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
-def DeclareScalarRegInst :
- NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
- ".reg .b$size param$a;",
- [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
-
-class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
- NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
- !strconcat("mov", asmstr, "\t$dst, $src;"),
- [(set regclass:$dst, (MoveParam regclass:$src))]>;
-
-def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
-def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
-def MoveParamI16 :
- NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
- "cvt.u16.u32\t$dst, $src;",
- [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
-def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
-def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
-
-class PseudoUseParamInst<NVPTXRegClass regclass> :
- NVPTXInst<(outs), (ins regclass:$src),
- "// Pseudo use of $src",
- [(PseudoUseParam regclass:$src)]>;
-
-def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
-def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
-def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
-def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
-def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
-
-
-//
-// Load / Store Handling
-//
-multiclass LD<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];", []>;
- def _areg : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];", []>;
- def _areg_64 : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];", []>;
- def _ari : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
- def _ari_64 : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
- def _asi : NVPTXInst<
- (outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr+$offset];", []>;
-}
-
-let mayLoad=1, hasSideEffects=0 in {
- defm LD_i8 : LD<Int16Regs>;
- defm LD_i16 : LD<Int16Regs>;
- defm LD_i32 : LD<Int32Regs>;
- defm LD_i64 : LD<Int64Regs>;
- defm LD_f32 : LD<Float32Regs>;
- defm LD_f64 : LD<Float64Regs>;
-}
-
-multiclass ST<NVPTXRegClass regclass> {
- def _avar : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr], $src;", []>;
- def _areg : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr], $src;", []>;
- def _areg_64 : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr], $src;", []>;
- def _ari : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
- def _ari_64 : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
- def _asi : NVPTXInst<
- (outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
- " \t[$addr+$offset], $src;", []>;
-}
-
-let mayStore=1, hasSideEffects=0 in {
- defm ST_i8 : ST<Int16Regs>;
- defm ST_i16 : ST<Int16Regs>;
- defm ST_i32 : ST<Int32Regs>;
- defm ST_i64 : ST<Int64Regs>;
- defm ST_f32 : ST<Float32Regs>;
- defm ST_f64 : ST<Float64Regs>;
-}
-
-// The following instructions are used only during and after vector
-// elementization. Vector elementization happens at the machine-instruction
-// level, so these instructions never appear in the DAG.
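-// For illustration: a v2 load from the LD_VEC multiclass below should print
-// as roughly "ld.global.v2.u32 {%r1, %r2}, [%rd1];" (invented operands).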
-multiclass LD_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];", []>;
- def _v2_areg : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];", []>;
- def _v2_areg_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];", []>;
- def _v2_ari : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
- def _v2_ari_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
- def _v2_asi : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
- def _v4_avar : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
- def _v4_areg : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
- def _v4_areg_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
- def _v4_ari : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
- def _v4_ari_64 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
- def _v4_asi : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
-}
-let mayLoad=1, hasSideEffects=0 in {
- defm LDV_i8 : LD_VEC<Int16Regs>;
- defm LDV_i16 : LD_VEC<Int16Regs>;
- defm LDV_i32 : LD_VEC<Int32Regs>;
- defm LDV_i64 : LD_VEC<Int64Regs>;
- defm LDV_f32 : LD_VEC<Float32Regs>;
- defm LDV_f64 : LD_VEC<Float64Regs>;
-}
-
-multiclass ST_VEC<NVPTXRegClass regclass> {
- def _v2_avar : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2}};", []>;
- def _v2_areg : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2}};", []>;
- def _v2_areg_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2}};", []>;
- def _v2_ari : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
- def _v2_ari_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
- def _v2_asi : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
- i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2}};", []>;
- def _v4_avar : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_areg : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_areg_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_ari : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_ari_64 : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
- "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
- def _v4_asi : NVPTXInst<
- (outs),
- (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
- "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
-}
-
-let mayStore=1, hasSideEffects=0 in {
- defm STV_i8 : ST_VEC<Int16Regs>;
- defm STV_i16 : ST_VEC<Int16Regs>;
- defm STV_i32 : ST_VEC<Int32Regs>;
- defm STV_i64 : ST_VEC<Int64Regs>;
- defm STV_f32 : ST_VEC<Float32Regs>;
- defm STV_f64 : ST_VEC<Float64Regs>;
-}
-
-
-//---- Conversion ----
-
-class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
- NVPTXRegClass regclassOut> :
- NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
- !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
- [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
-
-def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
-def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
-def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
-def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
-
-// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
-// TableGen where we cannot specify floating-point literals in isel patterns.
-// Therefore, we use an integer selp to select either 1 or 0, and then cvt
-// to floating-point.
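-//
-// For example, (f32 (sint_to_fp i1 %p)) should lower to approximately the
-// following PTX (register names are invented):
-//   selp.u32 %r1, 1, 0, %p1;
-//   cvt.rn.f32.s32 %f1, %r1;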
-
-// sint -> f32
-def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
- (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
- (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
- (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
- (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
-
-// uint -> f32
-def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
- (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
- (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
- (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
- (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
-
-// sint -> f64
-def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
- (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
- (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
- (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
- (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
-
-// uint -> f64
-def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
- (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
- (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
- (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
-def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
- (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
-
-
-// f32 -> sint
-def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
- (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
- (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
- (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
- (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
- (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
- (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
-
-// f32 -> uint
-def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
- (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
- (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
- (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
- (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
- (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
- (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
- (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
-
-// f64 -> sint
-def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
- (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
- (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
- (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
-
-// f64 -> uint
-def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
- (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
-def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
- (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
- (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
-def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
- (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
-
-// sext i1
-def : Pat<(i16 (sext Int1Regs:$a)),
- (SELP_s16ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (sext Int1Regs:$a)),
- (SELP_s32ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (sext Int1Regs:$a)),
- (SELP_s64ii -1, 0, Int1Regs:$a)>;
-
-// zext i1
-def : Pat<(i16 (zext Int1Regs:$a)),
- (SELP_u16ii 1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (zext Int1Regs:$a)),
- (SELP_u32ii 1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (zext Int1Regs:$a)),
- (SELP_u64ii 1, 0, Int1Regs:$a)>;
-
-// anyext i1
-def : Pat<(i16 (anyext Int1Regs:$a)),
- (SELP_u16ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i32 (anyext Int1Regs:$a)),
- (SELP_u32ii -1, 0, Int1Regs:$a)>;
-def : Pat<(i64 (anyext Int1Regs:$a)),
- (SELP_u64ii -1, 0, Int1Regs:$a)>;
-
-// sext i16
-def : Pat<(i32 (sext Int16Regs:$a)),
- (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (sext Int16Regs:$a)),
- (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
-
-// zext i16
-def : Pat<(i32 (zext Int16Regs:$a)),
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (zext Int16Regs:$a)),
- (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
-
-// anyext i16
-def : Pat<(i32 (anyext Int16Regs:$a)),
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i64 (anyext Int16Regs:$a)),
- (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
-
-// sext i32
-def : Pat<(i64 (sext Int32Regs:$a)),
- (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
-
-// zext i32
-def : Pat<(i64 (zext Int32Regs:$a)),
- (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
-
-// anyext i32
-def : Pat<(i64 (anyext Int32Regs:$a)),
- (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
-
-
-// truncate i64
-def : Pat<(i32 (trunc Int64Regs:$a)),
- (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
-def : Pat<(i16 (trunc Int64Regs:$a)),
- (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
-def : Pat<(i1 (trunc Int64Regs:$a)),
- (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
-
-// truncate i32
-def : Pat<(i16 (trunc Int32Regs:$a)),
- (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
-def : Pat<(i1 (trunc Int32Regs:$a)),
- (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
-
-// truncate i16
-def : Pat<(i1 (trunc Int16Regs:$a)),
- (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
-
-// sext_inreg
-def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
-def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
-def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
-def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
-
-
-// Select instructions with 32-bit predicates
-def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
- (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
- (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
- (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
- (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
-def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
- (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
- (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
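-
-// For illustration: a select on an i32 "predicate" should thus emit roughly
-// the following PTX (register names are invented):
-//   and.b32  %r3, %r1, 1;
-//   setp.eq.b32 %p1, %r3, 1;
-//   selp.b32 %r4, %r5, %r6, %p1;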
-
-
-let hasSideEffects = 0 in {
- // pack a set of smaller int registers to a larger int register
- def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2,
- Int16Regs:$s3, Int16Regs:$s4),
- "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>;
- def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
- (ins Int16Regs:$s1, Int16Regs:$s2),
- "mov.b32\t$d, {{$s1, $s2}};", []>;
- def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
- (ins Int32Regs:$s1, Int32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};", []>;
- def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
- (ins Float32Regs:$s1, Float32Regs:$s2),
- "mov.b64\t$d, {{$s1, $s2}};", []>;
-
- // unpack a larger int register to a set of smaller int registers
- def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
- Int16Regs:$d3, Int16Regs:$d4),
- (ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>;
- def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
- (ins Int32Regs:$s),
- "mov.b32\t{{$d1, $d2}}, $s;", []>;
- def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
- (ins Int64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;", []>;
- def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
- (ins Float64Regs:$s),
- "mov.b64\t{{$d1, $d2}}, $s;", []>;
-}
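-
-// For illustration: V2I32toI64 packs as "mov.b64 %rd1, {%r1, %r2};" and
-// I64toV2I32 unpacks as "mov.b64 {%r1, %r2}, %rd1;" (invented registers).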
-
-// Count leading zeros
-let hasSideEffects = 0 in {
- def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "clz.b32\t$d, $a;", []>;
- def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "clz.b64\t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit, so we zero-extend it
-// to 64-bit to match the LLVM semantics.
-def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
-
-// For 16-bit, we zero-extend to 32-bit, then truncate the result back
-// to 16 bits (ctlz of a 16-bit value is guaranteed to require fewer
-// than 16 bits to store). We also need to subtract 16 because the
-// high-order 16 zeros were counted.
-def : Pat<(ctlz Int16Regs:$a),
- (SUBi16ri (CVT_u16_u32 (CLZr32
- (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
- CvtNONE), 16)>;
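-
-// For illustration, the i16 ctlz pattern above corresponds to roughly the
-// following PTX (register names are invented):
-//   cvt.u32.u16 %r1, %rs1;
-//   clz.b32     %r2, %r1;
-//   cvt.u16.u32 %rs2, %r2;
-//   sub.s16     %rs3, %rs2, 16;  // discount the 16 zeros added by widening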
-
-// Population count
-let hasSideEffects = 0 in {
- def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
- "popc.b32\t$d, $a;", []>;
- def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
- "popc.b64\t$d, $a;", []>;
-}
-
-// 32-bit has a direct PTX instruction
-def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
-
-// For 64-bit, the result in PTX is actually 32-bit, so we zero-extend it
-// to 64-bit to match the LLVM semantics.
-def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
-
-// For 16-bit, we zero-extend to 32-bit, then truncate the result back
-// to 16 bits (ctpop of a 16-bit value is guaranteed to require fewer
-// than 16 bits to store).
-def : Pat<(ctpop Int16Regs:$a),
- (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
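-
-// For illustration, the i16 ctpop pattern above corresponds to roughly:
-//   cvt.u32.u16 %r1, %rs1;
-//   popc.b32    %r2, %r1;
-//   cvt.u16.u32 %rs2, %r2;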
-
-// fpround f64 -> f32
-def : Pat<(f32 (fpround Float64Regs:$a)),
- (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fpround Float64Regs:$a)),
- (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
-
-// fpextend f32 -> f64
-def : Pat<(f64 (fpextend Float32Regs:$a)),
- (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f64 (fpextend Float32Regs:$a)),
- (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
-
-def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
- [SDNPHasChain, SDNPOptInGlue]>;
-
-// fceil, ffloor, fround, ftrunc.
-
-def : Pat<(fceil Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(fceil Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fceil Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
-
-def : Pat<(ffloor Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(ffloor Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(ffloor Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
-
-def : Pat<(fround Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(f32 (fround Float32Regs:$a)),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(f64 (fround Float64Regs:$a)),
- (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
-
-def : Pat<(ftrunc Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(ftrunc Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(ftrunc Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
-
-// nearbyint and rint are implemented as rounding to nearest even. This isn't
-// strictly correct, because it causes us to ignore the rounding mode. But it
-// matches what CUDA's "libm" does.
-
-def : Pat<(fnearbyint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(fnearbyint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(fnearbyint Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
-
-def : Pat<(frint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(frint Float32Regs:$a),
- (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
-def : Pat<(frint Float64Regs:$a),
- (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
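-
-// For illustration: with this mapping, nearbyint and rint on f32 both emit
-// roughly "cvt.rni.f32.f32 %f2, %f1;" (".rni" = round to nearest integer,
-// ties to even), regardless of the dynamic rounding mode.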
-
-
-//-----------------------------------
-// Control-flow
-//-----------------------------------
-
-let isTerminator=1 in {
- let isReturn=1, isBarrier=1 in
- def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
-
- let isBranch=1 in
- def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@$a bra \t$target;",
- [(brcond Int1Regs:$a, bb:$target)]>;
- let isBranch=1 in
- def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
- "@!$a bra \t$target;", []>;
-
- let isBranch=1, isBarrier=1 in
- def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
- "bra.uni \t$target;", [(br bb:$target)]>;
-}
-
-def : Pat<(brcond Int32Regs:$a, bb:$target),
- (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
-
-// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
-// conditional branch if the target block is the next block, so that the code
-// can fall through to the target block. The inversion is done by 'xor
-// condition, 1', which will be translated to (setne condition, -1). Since PTX
-// supports '@!pred bra target', we should use it.
-def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
- (CBranchOther Int1Regs:$a, bb:$target)>;
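-
-// For illustration: thanks to the pattern above, the inverted branch prints
-// as roughly "@!%p1 bra LBB0_2;" (invented operands) instead of requiring a
-// separate xor.pred to negate the predicate first.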
-
-// Call
-def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
-def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-
-def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
- [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
-def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
- SDNPSideEffect]>;
-
-def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
-def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def calltarget : Operand<i32>;
-let isCall=1 in {
- def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
-}
-
-def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
-def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
-
-// Pseudo instructions.
-class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : NVPTXInst<outs, ins, asmstr, pattern>;
-
-def Callseq_Start :
- NVPTXInst<(outs), (ins i32imm:$amt),
- "\\{ // callseq $amt\n"
- "\t.reg .b32 temp_param_reg;",
- [(callseq_start timm:$amt)]>;
-def Callseq_End :
- NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
- "\\} // callseq $amt1",
- [(callseq_end timm:$amt1, timm:$amt2)]>;
-
-// trap instruction
-def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
-
-// Call prototype wrapper
-def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def CallPrototype :
- SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
- [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def ProtoIdent : Operand<i32> {
- let PrintMethod = "printProtoIdent";
-}
-def CALL_PROTOTYPE :
- NVPTXInst<(outs), (ins ProtoIdent:$ident),
- "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
-
-
-include "NVPTXIntrinsics.td"
-
-
-//-----------------------------------
-// Notes
-//-----------------------------------
-// BSWAP is currently expanded. The following would be more efficient:
-// - for < sm_20, use vector scalar mov, since Tesla supports native 16-bit
-//   registers.
-// - for sm_20, use prmt (with vector scalar mov for the pack and unpack).
-//   sm_20 supports native 32-bit registers, but not native 16-bit registers.
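-// For illustration (an assumption, not verified output): on sm_20 a 32-bit
-// BSWAP could then be a single byte permute, e.g.
-// "prmt.b32 %r2, %r1, 0, 0x0123;".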
+//===- NVPTXInstrInfo.td - NVPTX Instruction defs -------------*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "NVPTXInstrFormats.td"
+
+// A NOP instruction
+let hasSideEffects = 0 in {
+ def NOP : NVPTXInst<(outs), (ins), "", []>;
+}
+
+let OperandType = "OPERAND_IMMEDIATE" in {
+ def f16imm : Operand<f16>;
+}
+
+// List of vector-specific properties
+def isVecLD : VecInstTypeEnum<1>;
+def isVecST : VecInstTypeEnum<2>;
+def isVecBuild : VecInstTypeEnum<3>;
+def isVecShuffle : VecInstTypeEnum<4>;
+def isVecExtract : VecInstTypeEnum<5>;
+def isVecInsert : VecInstTypeEnum<6>;
+def isVecDest : VecInstTypeEnum<7>;
+def isVecOther : VecInstTypeEnum<15>;
+
+//===----------------------------------------------------------------------===//
+// NVPTX Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+def brtarget : Operand<OtherVT>;
+
+// CVT conversion modes
+// These must match the enum in NVPTX.h
+def CvtNONE : PatLeaf<(i32 0x0)>;
+def CvtRNI : PatLeaf<(i32 0x1)>;
+def CvtRZI : PatLeaf<(i32 0x2)>;
+def CvtRMI : PatLeaf<(i32 0x3)>;
+def CvtRPI : PatLeaf<(i32 0x4)>;
+def CvtRN : PatLeaf<(i32 0x5)>;
+def CvtRZ : PatLeaf<(i32 0x6)>;
+def CvtRM : PatLeaf<(i32 0x7)>;
+def CvtRP : PatLeaf<(i32 0x8)>;
+
+def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
+def CvtRZI_FTZ : PatLeaf<(i32 0x12)>;
+def CvtRMI_FTZ : PatLeaf<(i32 0x13)>;
+def CvtRPI_FTZ : PatLeaf<(i32 0x14)>;
+def CvtRN_FTZ : PatLeaf<(i32 0x15)>;
+def CvtRZ_FTZ : PatLeaf<(i32 0x16)>;
+def CvtRM_FTZ : PatLeaf<(i32 0x17)>;
+def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
+
+def CvtSAT : PatLeaf<(i32 0x20)>;
+def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
+
+def CvtMode : Operand<i32> {
+ let PrintMethod = "printCvtMode";
+}
+
+// Compare modes
+// These must match the enum in NVPTX.h
+def CmpEQ : PatLeaf<(i32 0)>;
+def CmpNE : PatLeaf<(i32 1)>;
+def CmpLT : PatLeaf<(i32 2)>;
+def CmpLE : PatLeaf<(i32 3)>;
+def CmpGT : PatLeaf<(i32 4)>;
+def CmpGE : PatLeaf<(i32 5)>;
+def CmpEQU : PatLeaf<(i32 10)>;
+def CmpNEU : PatLeaf<(i32 11)>;
+def CmpLTU : PatLeaf<(i32 12)>;
+def CmpLEU : PatLeaf<(i32 13)>;
+def CmpGTU : PatLeaf<(i32 14)>;
+def CmpGEU : PatLeaf<(i32 15)>;
+def CmpNUM : PatLeaf<(i32 16)>;
+def CmpNAN : PatLeaf<(i32 17)>;
+
+def CmpEQ_FTZ : PatLeaf<(i32 0x100)>;
+def CmpNE_FTZ : PatLeaf<(i32 0x101)>;
+def CmpLT_FTZ : PatLeaf<(i32 0x102)>;
+def CmpLE_FTZ : PatLeaf<(i32 0x103)>;
+def CmpGT_FTZ : PatLeaf<(i32 0x104)>;
+def CmpGE_FTZ : PatLeaf<(i32 0x105)>;
+def CmpEQU_FTZ : PatLeaf<(i32 0x10A)>;
+def CmpNEU_FTZ : PatLeaf<(i32 0x10B)>;
+def CmpLTU_FTZ : PatLeaf<(i32 0x10C)>;
+def CmpLEU_FTZ : PatLeaf<(i32 0x10D)>;
+def CmpGTU_FTZ : PatLeaf<(i32 0x10E)>;
+def CmpGEU_FTZ : PatLeaf<(i32 0x10F)>;
+def CmpNUM_FTZ : PatLeaf<(i32 0x110)>;
+def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
+
+def CmpMode : Operand<i32> {
+ let PrintMethod = "printCmpMode";
+}
+def VecElement : Operand<i32> {
+ let PrintMethod = "printVecElement";
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instruction Predicate Definitions
+//===----------------------------------------------------------------------===//
+
+
+def hasAtomRedG32 : Predicate<"Subtarget->hasAtomRedG32()">;
+def hasAtomRedS32 : Predicate<"Subtarget->hasAtomRedS32()">;
+def hasAtomRedGen32 : Predicate<"Subtarget->hasAtomRedGen32()">;
+def useAtomRedG32forGen32 :
+ Predicate<"!Subtarget->hasAtomRedGen32() && Subtarget->hasAtomRedG32()">;
+def hasBrkPt : Predicate<"Subtarget->hasBrkPt()">;
+def hasAtomRedG64 : Predicate<"Subtarget->hasAtomRedG64()">;
+def hasAtomRedS64 : Predicate<"Subtarget->hasAtomRedS64()">;
+def hasAtomRedGen64 : Predicate<"Subtarget->hasAtomRedGen64()">;
+def useAtomRedG64forGen64 :
+ Predicate<"!Subtarget->hasAtomRedGen64() && Subtarget->hasAtomRedG64()">;
+def hasAtomAddF32 : Predicate<"Subtarget->hasAtomAddF32()">;
+def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
+def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
+def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
+def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
+def hasVote : Predicate<"Subtarget->hasVote()">;
+def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def reqPTX20 : Predicate<"Subtarget->reqPTX20()">;
+def hasLDG : Predicate<"Subtarget->hasLDG()">;
+def hasLDU : Predicate<"Subtarget->hasLDU()">;
+def hasGenericLdSt : Predicate<"Subtarget->hasGenericLdSt()">;
+
+def doF32FTZ : Predicate<"useF32FTZ()">;
+def doNoF32FTZ : Predicate<"!useF32FTZ()">;
+
+def doMulWide : Predicate<"doMulWide">;
+
+def allowFMA : Predicate<"allowFMA()">;
+def noFMA : Predicate<"!allowFMA()">;
+def allowUnsafeFPMath : Predicate<"allowUnsafeFPMath()">;
+
+def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
+
+def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
+
+def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">;
+def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
+
+def true : Predicate<"true">;
+
+def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
+
+def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
+
+//===----------------------------------------------------------------------===//
+// Some Common Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Template for instructions which take three int64, int32, or int16 args.
+// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
+multiclass I3<string OpcStr, SDNode OpNode> {
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+}
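
As a concrete instance, `defm ADD : I3<"add.s", add>;` further down hand-expands, for the i32rr case, to roughly the following (the five sibling defs cover the i64/i16 and register/immediate combinations):

    def ADDi32rr :
      NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
                "add.s32 \t$dst, $a, $b;",
                [(set Int32Regs:$dst, (add Int32Regs:$a, Int32Regs:$b))]>;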
+
+// Template for instructions which take 3 int32 args. The instructions are
+// named "<OpcStr>.s32" (e.g. "addc.cc.s32").
+multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".s32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+}
+
+// Template for instructions which take three fp64 or fp32 args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "min.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32 functions.
+//
+// This multiclass should be used for nodes that cannot be folded into FMAs.
+// For nodes that can be folded into FMAs (i.e. adds and muls), use
+// F3_fma_component.
+multiclass F3<string OpcStr, SDNode OpNode> {
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+}
+
+// Template for instructions which take three FP args. The
+// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
+//
+// Also defines ftz (flush subnormal inputs and results to sign-preserving
+// zero) variants for fp32/fp16 functions.
+//
+// This multiclass should be used for nodes that can be folded to make fma ops.
+// In this case, we use the ".rn" variant when FMA is disabled, as this behaves
+// just like the non-".rn" op, but prevents ptxas from creating FMAs.
+multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
+ def f64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+ def f32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA, doF32FTZ]>;
+ def f32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[allowFMA]>;
+ def f32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[allowFMA]>;
+
+ def f16rr_ftz :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+ def f16rr :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA]>;
+
+ def f16x2rr_ftz :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+ def f16x2rr :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, allowFMA]>;
+
+ // These have strange names so we don't perturb existing MIR tests.
+ def _rnf64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
+ Requires<[noFMA]>;
+ def _rnf64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ !strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+ def _rnf32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def _rnf32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.ftz.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA, doF32FTZ]>;
+ def _rnf32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[noFMA]>;
+ def _rnf32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ !strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
+ Requires<[noFMA]>;
+ def _rnf16rr_ftz :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, noFMA, doF32FTZ]>;
+ def _rnf16rr :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, noFMA]>;
+ def _rnf16x2rr_ftz :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, noFMA, doF32FTZ]>;
+ def _rnf16x2rr :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, noFMA]>;
+}
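
The net effect, sketched for `defm FADD : F3_fma_component<"add", fadd>;` (defined below): the same fadd node selects one of two mutually exclusive f32rr instructions depending on the FMA predicate.

    // Hand-expanded sketch of the f32rr pair produced by the defm:
    //   FADDf32rr    : "add.f32 \t$dst, $a, $b;"     Requires<[allowFMA]>
    //   FADD_rnf32rr : "add.rn.f32 \t$dst, $a, $b;"  Requires<[noFMA]>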
+
+// Template for operations which take two f32 or f64 operands. Provides three
+// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
+// subnormal inputs and results to zero).
+multiclass F2<string OpcStr, SDNode OpNode> {
+ def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
+ !strconcat(OpcStr, ".f64 \t$dst, $a;"),
+ [(set Float64Regs:$dst, (OpNode Float64Regs:$a))]>;
+ def f32_ftz : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".ftz.f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>,
+ Requires<[doF32FTZ]>;
+ def f32 : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$a),
+ !strconcat(OpcStr, ".f32 \t$dst, $a;"),
+ [(set Float32Regs:$dst, (OpNode Float32Regs:$a))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// NVPTX Instructions.
+//===----------------------------------------------------------------------===//
+
+//-----------------------------------
+// Type Conversion
+//-----------------------------------
+
+let hasSideEffects = 0 in {
+ // Generate a cvt to the given type from all possible types. Each instance
+ // takes a CvtMode immediate that defines the conversion mode to use. It can
+ // be CvtNONE to omit a conversion mode.
+ multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+ def _s8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s8 \t$dst, $src;"), []>;
+ def _u8 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u8 \t$dst, $src;"), []>;
+ def _s16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s16 \t$dst, $src;"), []>;
+ def _u16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u16 \t$dst, $src;"), []>;
+ def _s32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s32 \t$dst, $src;"), []>;
+ def _u32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u32 \t$dst, $src;"), []>;
+ def _s64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".s64 \t$dst, $src;"), []>;
+ def _u64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Int64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".u64 \t$dst, $src;"), []>;
+ def _f16 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float16Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f16 \t$dst, $src;"), []>;
+ def _f32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f32 \t$dst, $src;"), []>;
+ def _f64 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float64Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+ FromName, ".f64 \t$dst, $src;"), []>;
+ }
+
+ // Generate cvts from all types to all types.
+ defm CVT_s8 : CVT_FROM_ALL<"s8", Int16Regs>;
+ defm CVT_u8 : CVT_FROM_ALL<"u8", Int16Regs>;
+ defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+ defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+ defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+ defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+ defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+ defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+ defm CVT_f16 : CVT_FROM_ALL<"f16", Float16Regs>;
+ defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+ defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+ // These cvts are different from those above: The source and dest registers
+ // are of the same type.
+ def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.s16.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "cvt.s32.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s8 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s16 \t$dst, $src;", []>;
+ def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "cvt.s64.s32 \t$dst, $src;", []>;
+}
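
Naming sketch for the generated conversions: each defm above yields CVT_<dtype>_<stype> instructions, which a selection pattern pairs with one of the mode leaves. Patterns of this shape appear later in the file; an illustrative instance:

    // Illustrative use of a generated cvt: f32 -> s32, round toward zero.
    def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
              (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
    // Emits: cvt.rzi.s32.f32 \t$dst, $src;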
+
+//-----------------------------------
+// Integer Arithmetic
+//-----------------------------------
+
+// Template for xor masquerading as int1 arithmetic.
+multiclass ADD_SUB_i1<SDNode OpNode> {
+ def _rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def _ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ "xor.pred \t$dst, $a, $b;",
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, (imm):$b))]>;
+}
+
+// int1 addition and subtraction are both just xor.
+defm ADD_i1 : ADD_SUB_i1<add>;
+defm SUB_i1 : ADD_SUB_i1<sub>;
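
A quick sanity check on the mod-2 claim:

    // Over i1, + and - coincide with xor: 0+0=0, 0+1=1, 1+0=1, 1+1=0 (mod 2),
    // which is exactly the xor truth table, so one xor.pred serves both defms.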
+
+// int16, int32, and int64 signed addition. Since NVPTX is two's complement, we
+// also use these for unsigned arithmetic.
+defm ADD : I3<"add.s", add>;
+defm SUB : I3<"sub.s", sub>;
+
+// int32 addition and subtraction with carry-out.
+// FIXME: PTX 4.3 adds a 64-bit add.cc (and maybe also 64-bit addc.cc?).
+defm ADDCC : ADD_SUB_INT_32<"add.cc", addc>;
+defm SUBCC : ADD_SUB_INT_32<"sub.cc", subc>;
+
+// int32 addition and subtraction with carry-in and carry-out.
+defm ADDCCC : ADD_SUB_INT_32<"addc.cc", adde>;
+defm SUBCCC : ADD_SUB_INT_32<"subc.cc", sube>;
+
+defm MULT : I3<"mul.lo.s", mul>;
+
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+// The ri versions of rem.s and rem.u won't be selected; DAGCombiner's
+// visitSREM/visitUREM will lower them first.
+defm SREM : I3<"rem.s", srem>;
+defm UREM : I3<"rem.u", urem>;
+
+// Integer absolute value. The pattern matches the generic abs node directly,
+// so each instantiation simply emits the native PTX abs instruction for its
+// register class (e.g. abs.s32); no bit-twiddling expansion is involved.
+multiclass ABS<RegisterClass RC, string SizeName> {
+ def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
+ !strconcat("abs", SizeName, " \t$dst, $a;"),
+ [(set RC:$dst, (abs RC:$a))]>;
+}
+defm ABS_16 : ABS<Int16Regs, ".s16">;
+defm ABS_32 : ABS<Int32Regs, ".s32">;
+defm ABS_64 : ABS<Int64Regs, ".s64">;
+
+// Integer min/max.
+defm SMAX : I3<"max.s", smax>;
+defm UMAX : I3<"max.u", umax>;
+defm SMIN : I3<"min.s", smin>;
+defm UMIN : I3<"min.u", umin>;
+
+//
+// Wide multiplication
+//
+def MULWIDES64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+def MULWIDES64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.s32 \t$dst, $a, $b;", []>;
+
+def MULWIDEU64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+def MULWIDEU64Imm64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i64imm:$b),
+ "mul.wide.u32 \t$dst, $a, $b;", []>;
+
+def MULWIDES32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+def MULWIDES32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.s16 \t$dst, $a, $b;", []>;
+
+def MULWIDEU32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+def MULWIDEU32Imm32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ "mul.wide.u16 \t$dst, $a, $b;", []>;
+
+def SDTMulWide : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>;
+def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>;
+
+// Matchers for signed, unsigned mul.wide ISD nodes.
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_signed Int16Regs:$a, imm:$b)),
+ (MULWIDES32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i32 (mul_wide_unsigned Int16Regs:$a, imm:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_signed Int32Regs:$a, imm:$b)),
+ (MULWIDES64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(i64 (mul_wide_unsigned Int32Regs:$a, imm:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, imm:$b)>,
+ Requires<[doMulWide]>;
+
+// Predicates used for converting some patterns to mul.wide.
+def SInt32Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ return v.isSignedIntN(32);
+}]>;
+
+def UInt32Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ return v.isIntN(32);
+}]>;
+
+def SInt16Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ return v.isSignedIntN(16);
+}]>;
+
+def UInt16Const : PatLeaf<(imm), [{
+ const APInt &v = N->getAPIntValue();
+ return v.isIntN(16);
+}]>;
+
+def Int5Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 32; only then will the result of (x << v) be an int32.
+ const APInt &v = N->getAPIntValue();
+ return v.sge(0) && v.slt(32);
+}]>;
+
+def Int4Const : PatLeaf<(imm), [{
+ // Check if 0 <= v < 16; only then will the result of (x << v) be an int16.
+ const APInt &v = N->getAPIntValue();
+ return v.sge(0) && v.slt(16);
+}]>;
+
+def SHL2MUL32 : SDNodeXForm<imm, [{
+ const APInt &v = N->getAPIntValue();
+ APInt temp(32, 1);
+ return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i32);
+}]>;
+
+def SHL2MUL16 : SDNodeXForm<imm, [{
+ const APInt &v = N->getAPIntValue();
+ APInt temp(16, 1);
+ return CurDAG->getTargetConstant(temp.shl(v), SDLoc(N), MVT::i16);
+}]>;
+
+// Convert "sign/zero-extend, then shift left by an immediate" to mul.wide.
+def : Pat<(shl (sext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDES64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int32Regs:$a), (i32 Int5Const:$b)),
+ (MULWIDEU64Imm Int32Regs:$a, (SHL2MUL32 node:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(shl (sext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDES32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
+def : Pat<(shl (zext Int16Regs:$a), (i16 Int4Const:$b)),
+ (MULWIDEU32Imm Int16Regs:$a, (SHL2MUL16 node:$b))>,
+ Requires<[doMulWide]>;
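
Worked example, hand-traced through the first pattern above (register name illustrative):

    // For i32 %x:  (shl (sext %x to i64), 3)  matches with Int5Const b = 3;
    // SHL2MUL32 rewrites the shift amount into the multiplier (1 << 3) = 8,
    // so ISel emits:
    //   mul.wide.s32 \t%rd, %x, 8;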
+
+// Convert "sign/zero-extend then multiply" to mul.wide.
+def : Pat<(mul (sext Int32Regs:$a), (sext Int32Regs:$b)),
+ (MULWIDES64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (sext Int32Regs:$a), (i64 SInt32Const:$b)),
+ (MULWIDES64Imm64 Int32Regs:$a, (i64 SInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int32Regs:$a), (zext Int32Regs:$b)),
+ (MULWIDEU64 Int32Regs:$a, Int32Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (zext Int32Regs:$a), (i64 UInt32Const:$b)),
+ (MULWIDEU64Imm64 Int32Regs:$a, (i64 UInt32Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (sext Int16Regs:$a), (sext Int16Regs:$b)),
+ (MULWIDES32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (sext Int16Regs:$a), (i32 SInt16Const:$b)),
+ (MULWIDES32Imm32 Int16Regs:$a, (i32 SInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+def : Pat<(mul (zext Int16Regs:$a), (zext Int16Regs:$b)),
+ (MULWIDEU32 Int16Regs:$a, Int16Regs:$b)>,
+ Requires<[doMulWide]>;
+def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
+ (MULWIDEU32Imm32 Int16Regs:$a, (i32 UInt16Const:$b))>,
+ Requires<[doMulWide]>;
+
+//
+// Integer multiply-add
+//
+def SDTIMAD :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def imad : SDNode<"NVPTXISD::IMAD", SDTIMAD>;
+
+def MAD16rrr :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, Int16Regs:$c))]>;
+def MAD16rri :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, Int16Regs:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, Int16Regs:$b, imm:$c))]>;
+def MAD16rir :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, Int16Regs:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, Int16Regs:$c))]>;
+def MAD16rii :
+ NVPTXInst<(outs Int16Regs:$dst),
+ (ins Int16Regs:$a, i16imm:$b, i16imm:$c),
+ "mad.lo.s16 \t$dst, $a, $b, $c;",
+ [(set Int16Regs:$dst, (imad Int16Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD32rrr :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, Int32Regs:$c))]>;
+def MAD32rri :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, Int32Regs:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, Int32Regs:$b, imm:$c))]>;
+def MAD32rir :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, Int32Regs:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, Int32Regs:$c))]>;
+def MAD32rii :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$a, i32imm:$b, i32imm:$c),
+ "mad.lo.s32 \t$dst, $a, $b, $c;",
+ [(set Int32Regs:$dst, (imad Int32Regs:$a, imm:$b, imm:$c))]>;
+
+def MAD64rrr :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, Int64Regs:$c))]>;
+def MAD64rri :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, Int64Regs:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, Int64Regs:$b, imm:$c))]>;
+def MAD64rir :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, Int64Regs:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, Int64Regs:$c))]>;
+def MAD64rii :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$a, i64imm:$b, i64imm:$c),
+ "mad.lo.s64 \t$dst, $a, $b, $c;",
+ [(set Int64Regs:$dst, (imad Int64Regs:$a, imm:$b, imm:$c))]>;
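
Sketch of the end result (assuming the target's DAG combines have already formed the imad node from a mul feeding an add):

    // (imad %a, %b, %c) on i32 selects MAD32rrr and emits
    //   mad.lo.s32 \t%r, %a, %b, %c;
    // mad.lo keeps the low 32 bits of the product, matching i32 mul-then-add.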
+
+def INEG16 :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "neg.s16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
+def INEG32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "neg.s32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (ineg Int32Regs:$src))]>;
+def INEG64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "neg.s64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (ineg Int64Regs:$src))]>;
+
+//-----------------------------------
+// Floating Point Arithmetic
+//-----------------------------------
+
+// Constant 1.0f
+def FloatConst1 : PatLeaf<(fpimm), [{
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEsingle() &&
+ N->getValueAPF().convertToFloat() == 1.0f;
+}]>;
+// Constant 1.0 (double)
+def DoubleConst1 : PatLeaf<(fpimm), [{
+ return &N->getValueAPF().getSemantics() == &llvm::APFloat::IEEEdouble() &&
+ N->getValueAPF().convertToDouble() == 1.0;
+}]>;
+
+// Loads FP16 constant into a register.
+//
+// ptxas does not have hex representation for fp16, so we can't use
+// fp16 immediate values in .f16 instructions. Instead we have to load
+// the constant into a register using mov.b16.
+def LOAD_CONST_F16 :
+ NVPTXInst<(outs Float16Regs:$dst), (ins f16imm:$a),
+ "mov.b16 \t$dst, $a;", []>;
+
+defm FADD : F3_fma_component<"add", fadd>;
+defm FSUB : F3_fma_component<"sub", fsub>;
+defm FMUL : F3_fma_component<"mul", fmul>;
+
+defm FMIN : F3<"min", fminnum>;
+defm FMAX : F3<"max", fmaxnum>;
+
+defm FABS : F2<"abs", fabs>;
+defm FNEG : F2<"neg", fneg>;
+defm FSQRT : F2<"sqrt.rn", fsqrt>;
+
+//
+// F64 division
+//
+def FDIV641r :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins f64imm:$a, Float64Regs:$b),
+ "rcp.rn.f64 \t$dst, $b;",
+ [(set Float64Regs:$dst, (fdiv DoubleConst1:$a, Float64Regs:$b))]>;
+def FDIV64rr :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, Float64Regs:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, Float64Regs:$b))]>;
+def FDIV64ri :
+ NVPTXInst<(outs Float64Regs:$dst),
+ (ins Float64Regs:$a, f64imm:$b),
+ "div.rn.f64 \t$dst, $a, $b;",
+ [(set Float64Regs:$dst, (fdiv Float64Regs:$a, fpimm:$b))]>;
+
+//
+// F32 Approximate reciprocal
+//
+def FDIV321r_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV321r :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Approximate division
+//
+def FDIV32approxrr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX, doF32FTZ]>;
+def FDIV32approxrr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+def FDIV32approxri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.approx.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_APPROX]>;
+//
+// F32 Semi-accurate reciprocal
+//
+// rcp.approx gives the same result as div.full(1.0f, a) and is faster.
+//
+def FDIV321r_approx_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV321r_approx :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.approx.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Semi-accurate division
+//
+def FDIV32rr_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32ri_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL, doF32FTZ]>;
+def FDIV32rr :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+def FDIV32ri :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.full.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[do_DIVF32_FULL]>;
+//
+// F32 Accurate reciprocal
+//
+def FDIV321r_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.ftz.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20, doF32FTZ]>;
+def FDIV321r_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins f32imm:$a, Float32Regs:$b),
+ "rcp.rn.f32 \t$dst, $b;",
+ [(set Float32Regs:$dst, (fdiv FloatConst1:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+//
+// F32 Accurate division
+//
+def FDIV32rr_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32ri_prec_ftz :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.ftz.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[doF32FTZ, reqPTX20]>;
+def FDIV32rr_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, Float32Regs:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, Float32Regs:$b))]>,
+ Requires<[reqPTX20]>;
+def FDIV32ri_prec :
+ NVPTXInst<(outs Float32Regs:$dst),
+ (ins Float32Regs:$a, f32imm:$b),
+ "div.rn.f32 \t$dst, $a, $b;",
+ [(set Float32Regs:$dst, (fdiv Float32Regs:$a, fpimm:$b))]>,
+ Requires<[reqPTX20]>;
+
+//
+// FMA
+//
+
+multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
+ def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+ def rir : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, RC:$c))]>,
+ Requires<[Pred]>;
+ def rii : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, ImmCls:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
+ Requires<[Pred]>;
+}
+
+multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
+ def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
+ !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
+ [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
+ Requires<[useFP16Math, Pred]>;
+}
+
+defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
+defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>;
+defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
+defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
+defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
+defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
+defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
+
+// sin/cos
+def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "sin.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
+ Requires<[allowUnsafeFPMath]>;
+def COSF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "cos.approx.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>,
+ Requires<[allowUnsafeFPMath]>;
+
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y)),
+// i.e. "poor man's fmod()"
+
+// frem - f32 FTZ
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+ (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+ (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
+ Float32Regs:$y))>,
+ Requires<[doF32FTZ]>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+ (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+ (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
+ fpimm:$y))>,
+ Requires<[doF32FTZ]>;
+
+// frem - f32
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+ (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+ (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
+ Float32Regs:$y))>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+ (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+ (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
+ fpimm:$y))>;
+
+// frem - f64
+def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+ (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+ (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
+ Float64Regs:$y))>;
+def : Pat<(frem Float64Regs:$x, fpimm:$y),
+ (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+ (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
+ fpimm:$y))>;
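
Hand-traced PTX for the non-FTZ f32 case above (register names illustrative):

    // (frem %x, %y)  =>  x - floor(x/y) * y
    //   div.rn.f32       %t, %x, %y;    // FDIV32rr_prec
    //   cvt.rmi.f32.f32  %t, %t;        // CvtRMI: round toward -inf = floor
    //   mul.f32          %t, %t, %y;    // FMULf32rr
    //   sub.f32          %d, %x, %t;    // FSUBf32rr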
+
+//-----------------------------------
+// Bitwise operations
+//-----------------------------------
+
+// Template for three-arg bitwise operations. Takes three args and creates .b16,
+// .b32, .b64, and .pred (predicate registers -- i.e., i1) versions of OpcStr.
+multiclass BITWISE<string OpcStr, SDNode OpNode> {
+ def b1rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, Int1Regs:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, Int1Regs:$b))]>;
+ def b1ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
+ !strconcat(OpcStr, ".pred \t$dst, $a, $b;"),
+ [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
+ def b16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
+ def b16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
+ !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
+ def b32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def b32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, ".b32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
+ def b64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
+ def b64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
+ !strconcat(OpcStr, ".b64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
+}
+
+defm OR : BITWISE<"or", or>;
+defm AND : BITWISE<"and", and>;
+defm XOR : BITWISE<"xor", xor>;
+
+def NOT1 : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
+ "not.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
+def NOT16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "not.b16 \t$dst, $src;",
+ [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
+def NOT32 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+ "not.b32 \t$dst, $src;",
+ [(set Int32Regs:$dst, (not Int32Regs:$src))]>;
+def NOT64 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+ "not.b64 \t$dst, $src;",
+ [(set Int64Regs:$dst, (not Int64Regs:$src))]>;
+
+// Template for left/right shifts. Takes three operands,
+// [dest (reg), src (reg), shift (reg or imm)].
+// dest and src may be int64, int32, or int16, but shift is always int32.
+//
+// This template also defines a 32-bit shift (imm, imm) instruction.
+multiclass SHIFT<string OpcStr, SDNode OpNode> {
+ def i64rr :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, Int32Regs:$b))]>;
+ def i64ri :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
+ [(set Int64Regs:$dst, (OpNode Int64Regs:$a, (i32 imm:$b)))]>;
+ def i32rr :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
+ def i32ri :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode Int32Regs:$a, (i32 imm:$b)))]>;
+ def i32ii :
+ NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, i32imm:$b),
+ !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
+ [(set Int32Regs:$dst, (OpNode (i32 imm:$a), (i32 imm:$b)))]>;
+ def i16rr :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int32Regs:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, Int32Regs:$b))]>;
+ def i16ri :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i32imm:$b),
+ !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
+ [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>;
+}
+
+defm SHL : SHIFT<"shl.b", shl>;
+defm SRA : SHIFT<"shr.s", sra>;
+defm SRL : SHIFT<"shr.u", srl>;
+
+// Bit-reverse
+def BREV32 :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a),
+ "brev.b32 \t$dst, $a;",
+ [(set Int32Regs:$dst, (bitreverse Int32Regs:$a))]>;
+def BREV64 :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a),
+ "brev.b64 \t$dst, $a;",
+ [(set Int64Regs:$dst, (bitreverse Int64Regs:$a))]>;
+
+//
+// Rotate: Use ptx shf instruction if available.
+//
+
+// 32-bit r2 = rotl r1, n
+// =>
+// r2 = shf.l r1, r1, n
+def ROTL32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTL32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.l.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32-bit r2 = rotr r1, n
+// =>
+// r2 = shf.r r1, r1, n
+def ROTR32imm_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, i32imm:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, (i32 imm:$amt)))]>,
+ Requires<[hasHWROT32]>;
+
+def ROTR32reg_hw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "shf.r.wrap.b32 \t$dst, $src, $src, $amt;",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[hasHWROT32]>;
+
+// 32-bit software rotate by immediate. $amt2 should equal 32 - $amt1.
+def ROT32imm_sw :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ "shl.b32 \t%lhs, $src, $amt1;\n\t"
+ "shr.b32 \t%rhs, $src, $amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
+
+def SUB_FRM_32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(32 - N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, imm:$amt, (SUB_FRM_32 node:$amt))>,
+ Requires<[noHWROT32]>;
+def : Pat<(rotr Int32Regs:$src, (i32 imm:$amt)),
+ (ROT32imm_sw Int32Regs:$src, (SUB_FRM_32 node:$amt), imm:$amt)>,
+ Requires<[noHWROT32]>;
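
Worked example for the software path (with amt2 computed by SUB_FRM_32):

    // (rotl %x, 5) with no hardware shf selects ROT32imm_sw, amt2 = 32 - 5:
    //   shl.b32 \t%lhs, %x, 5;
    //   shr.b32 \t%rhs, %x, 27;
    //   add.u32 \t%dst, %lhs, %rhs;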
+
+// 32-bit software rotate left by register.
+def ROTL32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shl.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shr.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotl Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 32-bit software rotate right by register.
+def ROTR32reg_sw :
+ NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b32 %lhs;\n\t"
+ ".reg .b32 %rhs;\n\t"
+ ".reg .b32 %amt2;\n\t"
+ "shr.b32 \t%lhs, $src, $amt;\n\t"
+ "sub.s32 \t%amt2, 32, $amt;\n\t"
+ "shl.b32 \t%rhs, $src, %amt2;\n\t"
+ "add.u32 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int32Regs:$dst, (rotr Int32Regs:$src, Int32Regs:$amt))]>,
+ Requires<[noHWROT32]>;
+
+// 64-bit software rotate by immediate. $amt2 should equal 64 - $amt1.
+def ROT64imm_sw :
+ NVPTXInst<(outs Int64Regs:$dst),
+ (ins Int64Regs:$src, i32imm:$amt1, i32imm:$amt2),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ "shl.b64 \t%lhs, $src, $amt1;\n\t"
+ "shr.b64 \t%rhs, $src, $amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ []>;
+
+def SUB_FRM_64 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(64-N->getZExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
+def : Pat<(rotl Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, imm:$amt, (SUB_FRM_64 node:$amt))>;
+def : Pat<(rotr Int64Regs:$src, (i32 imm:$amt)),
+ (ROT64imm_sw Int64Regs:$src, (SUB_FRM_64 node:$amt), imm:$amt)>;
+
+// 64-bit software rotate left by register.
+def ROTL64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shl.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shr.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotl Int64Regs:$src, Int32Regs:$amt))]>;
+
+def ROTR64reg_sw :
+ NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src, Int32Regs:$amt),
+ "{{\n\t"
+ ".reg .b64 %lhs;\n\t"
+ ".reg .b64 %rhs;\n\t"
+ ".reg .u32 %amt2;\n\t"
+ "shr.b64 \t%lhs, $src, $amt;\n\t"
+ "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "shl.b64 \t%rhs, $src, %amt2;\n\t"
+ "add.u64 \t$dst, %lhs, %rhs;\n\t"
+ "}}",
+ [(set Int64Regs:$dst, (rotr Int64Regs:$src, Int32Regs:$amt))]>;
+
+//
+// Funnel shift in clamp mode
+//
+
+// Create SDNodes so they can be used in the DAG code, e.g.
+// NVPTXISelLowering (LowerShiftLeftParts and LowerShiftRightParts)
+def SDTIntShiftDOp :
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+def FUN_SHFL_CLAMP : SDNode<"NVPTXISD::FUN_SHFL_CLAMP", SDTIntShiftDOp, []>;
+def FUN_SHFR_CLAMP : SDNode<"NVPTXISD::FUN_SHFR_CLAMP", SDTIntShiftDOp, []>;
+
+def FUNSHFLCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.l.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFL_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
+
+def FUNSHFRCLAMP :
+ NVPTXInst<(outs Int32Regs:$dst),
+ (ins Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt),
+ "shf.r.clamp.b32 \t$dst, $lo, $hi, $amt;",
+ [(set Int32Regs:$dst,
+ (FUN_SHFR_CLAMP Int32Regs:$lo, Int32Regs:$hi, Int32Regs:$amt))]>;
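
Sketch of the intended use from the lowering side (per the comment above; the exact register shuffling lives in NVPTXISelLowering):

    // A 64-bit left shift of {%lo, %hi} (two i32 halves) by %n, 0 < n < 32,
    // can form its high word with one clamped funnel shift:
    //   shf.l.clamp.b32 \t%hi2, %lo, %hi, %n;
    //   // %hi2 = (%hi << n) | (%lo >> (32 - n))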
+
+//
+// BFE - bit-field extract
+//
+
+// Template for BFE instructions. Takes four args,
+// [dest (reg), src (reg), start (reg or imm), end (reg or imm)].
+// Start may be an imm only if end is also an imm. FIXME: Is this a
+// restriction in PTX?
+//
+// dest and src may be int32 or int64, but start and end are always int32.
+multiclass BFE<string TyStr, RegisterClass RC> {
+ def rrr
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, Int32Regs:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rri
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, Int32Regs:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+ def rii
+ : NVPTXInst<(outs RC:$d),
+ (ins RC:$a, i32imm:$b, i32imm:$c),
+ !strconcat("bfe.", TyStr, " \t$d, $a, $b, $c;"), []>;
+}
+
+let hasSideEffects = 0 in {
+ defm BFE_S32 : BFE<"s32", Int32Regs>;
+ defm BFE_U32 : BFE<"u32", Int32Regs>;
+ defm BFE_S64 : BFE<"s64", Int64Regs>;
+ defm BFE_U64 : BFE<"u64", Int64Regs>;
+}
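
Per the PTX ISA, the third and fourth operands are the bit position and field length; a sketch of a generated instance:

    // BFE_U32rii with b = 8, c = 4 emits
    //   bfe.u32 \t%d, %a, 8, 4;
    // extracting the 4-bit field at bit 8, zero-extended (the s32/s64 forms
    // sign-extend from the field's top bit).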
+
+//-----------------------------------
+// Comparison instructions (setp, set)
+//-----------------------------------
+
+// FIXME: This doesn't cover versions of set and setp that combine with a
+// boolean predicate, e.g. setp.eq.and.b16.
+
+let hasSideEffects = 0 in {
+ multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ " \t$dst, $a, $b;"), []>;
+ def ri :
+ NVPTXInst<(outs Int1Regs:$dst), (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ " \t$dst, $a, $b;"), []>;
+ def ir :
+ NVPTXInst<(outs Int1Regs:$dst), (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr,
+ " \t$dst, $a, $b;"), []>;
+ }
+}
+
+defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
+defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
+defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
+defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
+defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
+defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
+defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
+defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
+defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
+defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
+defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
+def SETP_f16rr :
+ NVPTXInst<(outs Int1Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
+ []>, Requires<[useFP16Math]>;
+
+def SETP_f16x2rr :
+ NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
+ "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
+ []>,
+ Requires<[useFP16Math]>;
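
How the CmpMode suboperands print, sketched for two of the defms above:

    // SETP_s32rr with mode CmpLT:       setp.lt.s32 \t$dst, $a, $b;
    // SETP_f32rr with mode CmpLTU_FTZ:  setp.ltu.ftz.f32 \t$dst, $a, $b;
    // (${cmp:base} supplies ".lt"/".ltu"; ${cmp:ftz} supplies ".ftz" or "".)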
+
+
+// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
+// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
+// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
+
+let hasSideEffects = 0 in {
+ multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
+ def ri : NVPTXInst<(outs Int32Regs:$dst),
+ (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
+ def ir : NVPTXInst<(outs Int32Regs:$dst),
+ (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+ !strconcat("set$cmp.", TypeStr, " \t$dst, $a, $b;"), []>;
+ }
+}
+
+defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
+defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
+defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
+defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
+defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
+defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
+defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
+defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
+defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+defm SET_f16 : SET<"f16", Float16Regs, f16imm>;
+defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
+defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
+
+//-----------------------------------
+// Selection instructions (selp)
+//-----------------------------------
+
+// FIXME: Missing slct
+
+// selp instructions that don't have any pattern matches; we explicitly use
+// them within this file.
+let hasSideEffects = 0 in {
+ multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+ def rr : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+ def ri : NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+ def ir : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+ def ii : NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
+ }
+
+ multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+ SDNode ImmNode> {
+ def rr :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+ def ri :
+ NVPTXInst<(outs RC:$dst),
+ (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+ def ir :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+ def ii :
+ NVPTXInst<(outs RC:$dst),
+ (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+ !strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
+ [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+ }
+}
+
+// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
+// good.
+defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
+defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
+defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
+defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
+defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
+defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
+defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
+defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
+defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
+defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
+defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
+
+def SELP_f16x2rr :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
+ "selp.b32 \t$dst, $a, $b, $p;",
+ [(set Float16x2Regs:$dst,
+ (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
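
Sketch of a selected instance (register names illustrative):

    // (select i1 %p, i32 %a, i32 %b) matches SELP_b32rr and emits
    //   selp.b32 \t%d, %a, %b, %p;     // %d = %p ? %a : %b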
+
+//-----------------------------------
+// Data Movement (Load / Store, Move)
+//-----------------------------------
+
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
+ [SDNPWantRoot]>;
+def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
+ [SDNPWantRoot]>;
+
+def MEMri : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops Int32Regs, i32imm);
+}
+def MEMri64 : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops Int64Regs, i64imm);
+}
+
+def imem : Operand<iPTR> {
+ let PrintMethod = "printOperand";
+}
+
+def imemAny : Operand<iPTRAny> {
+ let PrintMethod = "printOperand";
+}
+
+def LdStCode : Operand<i32> {
+ let PrintMethod = "printLdStCode";
+}
+
+def SDTWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+def Wrapper : SDNode<"NVPTXISD::Wrapper", SDTWrapper>;
+
+// Load a memory address into a u32 or u64 register.
+def MOV_ADDR : NVPTXInst<(outs Int32Regs:$dst), (ins imem:$a),
+ "mov.u32 \t$dst, $a;",
+ [(set Int32Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
+ "mov.u64 \t$dst, $a;",
+ [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
+
+// Get pointer to local stack.
+let hasSideEffects = 0 in {
+ def MOV_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+ "mov.u32 \t$d, __local_depot$num;", []>;
+ def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+ "mov.u64 \t$d, __local_depot$num;", []>;
+}
+
+
+// copyPhysReg is hard-coded in NVPTXInstrInfo.cpp
+let IsSimpleMove=1, hasSideEffects=0 in {
+ def IMOV1rr : NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
+ "mov.pred \t$dst, $sss;", []>;
+ def IMOV16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
+ "mov.u16 \t$dst, $sss;", []>;
+ def IMOV32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
+ "mov.u32 \t$dst, $sss;", []>;
+ def IMOV64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$sss),
+ "mov.u64 \t$dst, $sss;", []>;
+
+ def FMOV16rr : NVPTXInst<(outs Float16Regs:$dst), (ins Float16Regs:$src),
+ // We have to use .b16 here as there's no mov.f16.
+ "mov.b16 \t$dst, $src;", []>;
+ def FMOV32rr : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
+ "mov.f32 \t$dst, $src;", []>;
+ def FMOV64rr : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
+ "mov.f64 \t$dst, $src;", []>;
+}
+
+def IMOV1ri : NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
+ "mov.pred \t$dst, $src;",
+ [(set Int1Regs:$dst, imm:$src)]>;
+def IMOV16ri : NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
+ "mov.u16 \t$dst, $src;",
+ [(set Int16Regs:$dst, imm:$src)]>;
+def IMOV32ri : NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$src),
+ "mov.u32 \t$dst, $src;",
+ [(set Int32Regs:$dst, imm:$src)]>;
+def IMOV64i : NVPTXInst<(outs Int64Regs:$dst), (ins i64imm:$src),
+ "mov.u64 \t$dst, $src;",
+ [(set Int64Regs:$dst, imm:$src)]>;
+
+def FMOV32ri : NVPTXInst<(outs Float32Regs:$dst), (ins f32imm:$src),
+ "mov.f32 \t$dst, $src;",
+ [(set Float32Regs:$dst, fpimm:$src)]>;
+def FMOV64ri : NVPTXInst<(outs Float64Regs:$dst), (ins f64imm:$src),
+ "mov.f64 \t$dst, $src;",
+ [(set Float64Regs:$dst, fpimm:$src)]>;
+
+def : Pat<(i32 (Wrapper texternalsym:$dst)), (IMOV32ri texternalsym:$dst)>;
+
+//---- Copy Frame Index ----
+def LEA_ADDRi : NVPTXInst<(outs Int32Regs:$dst), (ins MEMri:$addr),
+ "add.u32 \t$dst, ${addr:add};",
+ [(set Int32Regs:$dst, ADDRri:$addr)]>;
+def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
+ "add.u64 \t$dst, ${addr:add};",
+ [(set Int64Regs:$dst, ADDRri64:$addr)]>;
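+// A sketch of the result for a frame-index address, assuming the ${addr:add}
+// modifier prints as "register, offset" (operand values invented):
+//   add.u64  %rd1, %SPL, 8;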
+
+//-----------------------------------
+// Comparison and Selection
+//-----------------------------------
+
+multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
+ Instruction setp_16rr,
+ Instruction setp_16ri,
+ Instruction setp_16ir,
+ Instruction setp_32rr,
+ Instruction setp_32ri,
+ Instruction setp_32ir,
+ Instruction setp_64rr,
+ Instruction setp_64ri,
+ Instruction setp_64ir,
+ Instruction set_16rr,
+ Instruction set_16ri,
+ Instruction set_16ir,
+ Instruction set_32rr,
+ Instruction set_32ri,
+ Instruction set_32ir,
+ Instruction set_64rr,
+ Instruction set_64ri,
+ Instruction set_64ir> {
+ // i16 -> pred
+ def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+ (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
+ (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
+ (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+ // i32 -> pred
+ def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+ (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
+ (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
+ (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+ // i64 -> pred
+ def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
+ (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
+ (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
+ (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+
+ // i16 -> i32
+ def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+ (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
+ (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
+ (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+ // i32 -> i32
+ def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+ (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
+ (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
+ (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+ // i64 -> i32
+ def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
+ (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
+ (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+ def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
+ (set_64ir imm:$a, Int64Regs:$b, Mode)>;
+}
+
+multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
+ : ISET_FORMAT<OpNode, Mode,
+ SETP_s16rr, SETP_s16ri, SETP_s16ir,
+ SETP_s32rr, SETP_s32ri, SETP_s32ir,
+ SETP_s64rr, SETP_s64ri, SETP_s64ir,
+ SET_s16rr, SET_s16ri, SET_s16ir,
+ SET_s32rr, SET_s32ri, SET_s32ir,
+ SET_s64rr, SET_s64ri, SET_s64ir> {
+ // TableGen doesn't like empty multiclasses.
+ def : PatLeaf<(i32 0)>;
+}
+
+multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
+ : ISET_FORMAT<OpNode, Mode,
+ SETP_u16rr, SETP_u16ri, SETP_u16ir,
+ SETP_u32rr, SETP_u32ri, SETP_u32ir,
+ SETP_u64rr, SETP_u64ri, SETP_u64ir,
+ SET_u16rr, SET_u16ri, SET_u16ir,
+ SET_u32rr, SET_u32ri, SET_u32ir,
+ SET_u64rr, SET_u64ri, SET_u64ir> {
+ // TableGen doesn't like empty multiclasses.
+ def : PatLeaf<(i32 0)>;
+}
+
+defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
+defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
+defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
+defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
+defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
+defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
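+// As a concrete sketch, (i1 (setgt Int32Regs:$a, Int32Regs:$b)) matches
+// SETP_s32rr above and emits roughly "setp.gt.s32 %p1, %r1, %r2;", while the
+// i32-producing form matches SET_s32rr, which yields 0 or -1 in a 32-bit
+// register (register names invented).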
+
+// i1 compares
+def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+ (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+ (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
+def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+ (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+ (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+// i1 compare -> i32
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+ (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (seteq Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
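+// For example, the setne form expands to roughly (registers invented):
+//   xor.pred %p3, %p1, %p2;
+//   selp.u32 %r1, -1, 0, %p3;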
+
+multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+ // f16 -> pred
+ def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math,doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+ Requires<[useFP16Math,doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math,doF32FTZ]>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+
+ // f32 -> pred
+ def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+ // f64 -> pred
+ def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+ (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+ def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+ (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+ (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+ // f16 -> i32
+ def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),
+ (SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),
+ (SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
+ Requires<[useFP16Math]>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),
+ (SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
+ Requires<[useFP16Math]>;
+
+ // f32 -> i32
+ def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+ (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+ (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+ Requires<[doF32FTZ]>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+ (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+ // f64 -> i32
+ def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+ (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+ def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+ (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+ def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+ (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+}
+
+defm FSetOGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetOLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetOGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetOLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetOEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetONE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+
+defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
+defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+
+defm FSetGT : FSET_FORMAT<setgt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setlt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setle, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<seteq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setne, CmpNE, CmpNE_FTZ>;
+
+defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
+defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
+
+// FIXME: What is this doing here? Can it be deleted?
+// def ld_param : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
+// [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDTDeclareParamProfile :
+ SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTDeclareScalarParamProfile :
+ SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
+def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
+def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
+def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
+def SDTCallValProfile : SDTypeProfile<1, 0, []>;
+def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
+def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
+def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
+def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
+
+def DeclareParam :
+ SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareScalarParam :
+ SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRetParam :
+ SDNode<"NVPTXISD::DeclareRetParam", SDTDeclareParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def DeclareRet :
+ SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LoadParam :
+ SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2 :
+ SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4 :
+ SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+ [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def PrintCall :
+ SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCall :
+ SDNode<"NVPTXISD::PrintConvergentCall", SDTPrintCallProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintCallUni :
+ SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def PrintConvergentCallUni :
+ SDNode<"NVPTXISD::PrintConvergentCallUni", SDTPrintCallUniProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParam :
+ SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 :
+ SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 :
+ SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamU32 :
+ SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamS32 :
+ SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgBegin :
+ SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArg :
+ SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def LastCallArg :
+ SDNode<"NVPTXISD::LastCallArg", SDTCallArgProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallArgEnd :
+ SDNode<"NVPTXISD::CallArgEnd", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVoid :
+ SDNode<"NVPTXISD::CallVoid", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def Prototype :
+ SDNode<"NVPTXISD::Prototype", SDTCallVoidProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def CallVal :
+ SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def MoveParam :
+ SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>;
+def StoreRetval :
+ SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2 :
+ SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4 :
+ SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+ [SDNPHasChain, SDNPSideEffect]>;
+def PseudoUseParam :
+ SDNode<"NVPTXISD::PseudoUseParam", SDTPseudoUseParamProfile,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def RETURNNode :
+ SDNode<"NVPTXISD::RETURN", SDTCallArgMarkProfile,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+let mayLoad = 1 in {
+ class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat("ld.param", opstr, " \t$dst, [retval0+$b];"),
+ []>;
+
+ class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+ !strconcat("ld.param.v2", opstr,
+ " \t{{$dst, $dst2}}, [retval0+$b];"), []>;
+
+ class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins i32imm:$b),
+ !strconcat("ld.param.v4", opstr,
+ " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"),
+ []>;
+}
+
+class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
+ !strconcat("mov", opstr, " \t$dst, retval$b;"),
+ [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
+
+let mayStore = 1 in {
+ class StoreParamInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
+ !strconcat("st.param", opstr, " \t[param$a+$b], $val;"),
+ []>;
+
+ class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+ i32imm:$a, i32imm:$b),
+ !strconcat("st.param.v2", opstr,
+ " \t[param$a+$b], {{$val, $val2}};"),
+ []>;
+
+ class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a,
+ i32imm:$b),
+ !strconcat("st.param.v4", opstr,
+ " \t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+ []>;
+
+ class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
+ !strconcat("st.param", opstr, " \t[func_retval0+$a], $val;"),
+ []>;
+
+ class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+ !strconcat("st.param.v2", opstr,
+ " \t[func_retval0+$a], {{$val, $val2}};"),
+ []>;
+
+ class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+ NVPTXInst<(outs),
+ (ins regclass:$val, regclass:$val2, regclass:$val3,
+ regclass:$val4, i32imm:$a),
+ !strconcat("st.param.v4", opstr,
+ " \t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+ []>;
+}
+
+let isCall=1 in {
+ multiclass CALL<string OpcStr, SDNode OpNode> {
+ def PrintCallNoRetInst : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " "), [(OpNode (i32 0))]>;
+ def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0), "), [(OpNode (i32 1))]>;
+ def PrintCallRetInst2 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1), "), [(OpNode (i32 2))]>;
+ def PrintCallRetInst3 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2), "), [(OpNode (i32 3))]>;
+ def PrintCallRetInst4 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3), "),
+ [(OpNode (i32 4))]>;
+ def PrintCallRetInst5 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4), "),
+ [(OpNode (i32 5))]>;
+ def PrintCallRetInst6 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5), "),
+ [(OpNode (i32 6))]>;
+ def PrintCallRetInst7 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5, retval6), "),
+ [(OpNode (i32 7))]>;
+ def PrintCallRetInst8 : NVPTXInst<(outs), (ins),
+ !strconcat(OpcStr, " (retval0, retval1, retval2, retval3, retval4, "
+ "retval5, retval6, retval7), "),
+ [(OpNode (i32 8))]>;
+ }
+}
+
+defm Call : CALL<"call", PrintCall>;
+defm CallUni : CALL<"call.uni", PrintCallUni>;
+
+// Convergent call instructions. These are identical to regular calls, except
+// they have the isConvergent bit set.
+let isConvergent=1 in {
+ defm ConvergentCall : CALL<"call", PrintConvergentCall>;
+ defm ConvergentCallUni : CALL<"call.uni", PrintConvergentCallUni>;
+}
+
+def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">;
+def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">;
+def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">;
+def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">;
+def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">;
+def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">;
+def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">;
+def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
+def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
+def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
+def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
+def LoadParamMemF16 : LoadParamMemInst<Float16Regs, ".b16">;
+def LoadParamMemF16x2 : LoadParamMemInst<Float16x2Regs, ".b32">;
+def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
+def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
+def LoadParamMemV2F16 : LoadParamV2MemInst<Float16Regs, ".b16">;
+def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
+def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
+def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F16 : LoadParamV4MemInst<Float16Regs, ".b16">;
+def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
+def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
+
+def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
+def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">;
+
+def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">;
+def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">;
+def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">;
+def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">;
+def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">;
+
+def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
+def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
+
+def StoreParamF16 : StoreParamInst<Float16Regs, ".b16">;
+def StoreParamF16x2 : StoreParamInst<Float16x2Regs, ".b32">;
+def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
+def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV2F16 : StoreParamV2Inst<Float16Regs, ".b16">;
+def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">;
+def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
+def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
+def StoreParamV4F16 : StoreParamV4Inst<Float16Regs, ".b16">;
+def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">;
+def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
+
+def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
+def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
+def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">;
+def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">;
+def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">;
+def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">;
+def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">;
+def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
+def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
+def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
+
+def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
+def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
+def StoreRetvalF16 : StoreRetvalInst<Float16Regs, ".b16">;
+def StoreRetvalF16x2 : StoreRetvalInst<Float16x2Regs, ".b32">;
+def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
+def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV2F16 : StoreRetvalV2Inst<Float16Regs, ".b16">;
+def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
+def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F16 : StoreRetvalV4Inst<Float16Regs, ".b16">;
+def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
+
+def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
+def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
+def CallArgEndInst0 : NVPTXInst<(outs), (ins), ")", [(CallArgEnd (i32 0))]>;
+def RETURNInst : NVPTXInst<(outs), (ins), "ret;", [(RETURNNode)]>;
+
+class CallArgInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$a), "$a, ",
+ [(CallArg (i32 0), regclass:$a)]>;
+
+class LastCallArgInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$a), "$a",
+ [(LastCallArg (i32 0), regclass:$a)]>;
+
+def CallArgI64 : CallArgInst<Int64Regs>;
+def CallArgI32 : CallArgInst<Int32Regs>;
+def CallArgI16 : CallArgInst<Int16Regs>;
+def CallArgF64 : CallArgInst<Float64Regs>;
+def CallArgF32 : CallArgInst<Float32Regs>;
+
+def LastCallArgI64 : LastCallArgInst<Int64Regs>;
+def LastCallArgI32 : LastCallArgInst<Int32Regs>;
+def LastCallArgI16 : LastCallArgInst<Int16Regs>;
+def LastCallArgF64 : LastCallArgInst<Float64Regs>;
+def LastCallArgF32 : LastCallArgInst<Float32Regs>;
+
+def CallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a, ",
+ [(CallArg (i32 0), (i32 imm:$a))]>;
+def LastCallArgI32imm : NVPTXInst<(outs), (ins i32imm:$a), "$a",
+ [(LastCallArg (i32 0), (i32 imm:$a))]>;
+
+def CallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a, ",
+ [(CallArg (i32 1), (i32 imm:$a))]>;
+def LastCallArgParam : NVPTXInst<(outs), (ins i32imm:$a), "param$a",
+ [(LastCallArg (i32 1), (i32 imm:$a))]>;
+
+def CallVoidInst : NVPTXInst<(outs), (ins imem:$addr), "$addr, ",
+ [(CallVoid (Wrapper tglobaladdr:$addr))]>;
+def CallVoidInstReg : NVPTXInst<(outs), (ins Int32Regs:$addr), "$addr, ",
+ [(CallVoid Int32Regs:$addr)]>;
+def CallVoidInstReg64 : NVPTXInst<(outs), (ins Int64Regs:$addr), "$addr, ",
+ [(CallVoid Int64Regs:$addr)]>;
+def PrototypeInst : NVPTXInst<(outs), (ins i32imm:$val), ", prototype_$val;",
+ [(Prototype (i32 imm:$val))]>;
+
+def DeclareRetMemInst :
+ NVPTXInst<(outs), (ins i32imm:$align, i32imm:$size, i32imm:$num),
+ ".param .align $align .b8 retval$num[$size];",
+ [(DeclareRetParam (i32 imm:$align), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetScalarInst :
+ NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".param .b$size retval$num;",
+ [(DeclareRet (i32 1), (i32 imm:$size), (i32 imm:$num))]>;
+def DeclareRetRegInst :
+ NVPTXInst<(outs), (ins i32imm:$size, i32imm:$num),
+ ".reg .b$size retval$num;",
+ [(DeclareRet (i32 2), (i32 imm:$size), (i32 imm:$num))]>;
+
+def DeclareParamInst :
+ NVPTXInst<(outs), (ins i32imm:$align, i32imm:$a, i32imm:$size),
+ ".param .align $align .b8 param$a[$size];",
+ [(DeclareParam (i32 imm:$align), (i32 imm:$a), (i32 imm:$size))]>;
+def DeclareScalarParamInst :
+ NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".param .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
+def DeclareScalarRegInst :
+ NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
+ ".reg .b$size param$a;",
+ [(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;
+
+class MoveParamInst<NVPTXRegClass regclass, string asmstr> :
+ NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
+ !strconcat("mov", asmstr, " \t$dst, $src;"),
+ [(set regclass:$dst, (MoveParam regclass:$src))]>;
+
+def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;
+def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
+def MoveParamI16 :
+ NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+ "cvt.u16.u32 \t$dst, $src;",
+ [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
+def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
+def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
+def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;
+
+class PseudoUseParamInst<NVPTXRegClass regclass> :
+ NVPTXInst<(outs), (ins regclass:$src),
+ "// Pseudo use of $src",
+ [(PseudoUseParam regclass:$src)]>;
+
+def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
+def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
+def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
+def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
+def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
+
+
+//
+// Load / Store Handling
+//
+multiclass LD<NVPTXRegClass regclass> {
+ def _avar : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _areg : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _areg_64 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr];", []>;
+ def _ari : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
+ def _ari_64 : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
+ def _asi : NVPTXInst<
+ (outs regclass:$dst),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr+$offset];", []>;
+}
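+// The suffixes encode the addressing mode: _avar takes a symbolic address,
+// _areg/_areg_64 a 32-/64-bit address register, _ari/_ari_64 a register plus
+// immediate offset, and _asi a symbol plus immediate offset; the ST, LD_VEC,
+// and ST_VEC multiclasses below follow the same scheme. For example,
+// LD_i32_ari might print (operands invented):
+//   ld.global.u32 %r1, [%r2+4];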
+
+let mayLoad=1, hasSideEffects=0 in {
+ defm LD_i8 : LD<Int16Regs>;
+ defm LD_i16 : LD<Int16Regs>;
+ defm LD_i32 : LD<Int32Regs>;
+ defm LD_i64 : LD<Int64Regs>;
+ defm LD_f16 : LD<Float16Regs>;
+ defm LD_f16x2 : LD<Float16x2Regs>;
+ defm LD_f32 : LD<Float32Regs>;
+ defm LD_f64 : LD<Float64Regs>;
+}
+
+multiclass ST<NVPTXRegClass regclass> {
+ def _avar : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _areg : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _areg_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr], $src;", []>;
+ def _ari : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
+ def _ari_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
+ def _asi : NVPTXInst<
+ (outs),
+ (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ " \t[$addr+$offset], $src;", []>;
+}
+
+let mayStore=1, hasSideEffects=0 in {
+ defm ST_i8 : ST<Int16Regs>;
+ defm ST_i16 : ST<Int16Regs>;
+ defm ST_i32 : ST<Int32Regs>;
+ defm ST_i64 : ST<Int64Regs>;
+ defm ST_f16 : ST<Float16Regs>;
+ defm ST_f16x2 : ST<Float16x2Regs>;
+ defm ST_f32 : ST<Float32Regs>;
+ defm ST_f64 : ST<Float64Regs>;
+}
+
+// The following instructions are used only during and after vector
+// elementization. Vector elementization happens at the machine-instruction
+// level, so these instructions never appear in the DAG.
+multiclass LD_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr];", []>;
+ def _v2_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v2_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
+ def _v4_avar : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
+ def _v4_ari : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+ def _v4_asi : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
+ (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
+}
+let mayLoad=1, hasSideEffects=0 in {
+ defm LDV_i8 : LD_VEC<Int16Regs>;
+ defm LDV_i16 : LD_VEC<Int16Regs>;
+ defm LDV_i32 : LD_VEC<Int32Regs>;
+ defm LDV_i64 : LD_VEC<Int64Regs>;
+ defm LDV_f16 : LD_VEC<Float16Regs>;
+ defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
+ defm LDV_f32 : LD_VEC<Float32Regs>;
+ defm LDV_f64 : LD_VEC<Float64Regs>;
+}
+
+multiclass ST_VEC<NVPTXRegClass regclass> {
+ def _v2_avar : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_areg_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2}};", []>;
+ def _v2_ari : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_ari_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v2_asi : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
+ i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2}};", []>;
+ def _v4_avar : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_areg_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_ari_64 : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+ def _v4_asi : NVPTXInst<
+ (outs),
+ (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
+ LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
+}
+
+let mayStore=1, hasSideEffects=0 in {
+ defm STV_i8 : ST_VEC<Int16Regs>;
+ defm STV_i16 : ST_VEC<Int16Regs>;
+ defm STV_i32 : ST_VEC<Int32Regs>;
+ defm STV_i64 : ST_VEC<Int64Regs>;
+ defm STV_f16 : ST_VEC<Float16Regs>;
+ defm STV_f16x2 : ST_VEC<Float16x2Regs>;
+ defm STV_f32 : ST_VEC<Float32Regs>;
+ defm STV_f64 : ST_VEC<Float64Regs>;
+}
+
+//---- Conversion ----
+
+class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
+ NVPTXRegClass regclassOut> :
+ NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
+ !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")),
+ [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
+
+def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
+def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>;
+def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
+def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
+def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
+def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
+def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
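+// Each of these is a plain bitwise move; e.g. BITCONVERT_32_F2I prints
+// "mov.b32 %r1, %f1;" (register names invented).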
+
+// NOTE: pred->fp conversions are currently sub-optimal due to an issue in
+// TableGen where we cannot specify floating-point literals in isel patterns.
+// Therefore, we use an integer selp to select either 1 or 0 and then cvt to
+// floating-point.
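+// For example, (f32 (uint_to_fp Int1Regs:$a)) becomes approximately
+// (registers invented):
+//   selp.u32       %r1, 1, 0, %p1;
+//   cvt.rn.f32.u32 %f1, %r1;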
+
+// sint -> f16
+def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
+ (CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f16 (sint_to_fp Int16Regs:$a)),
+ (CVT_f16_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp Int32Regs:$a)),
+ (CVT_f16_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f16 (sint_to_fp Int64Regs:$a)),
+ (CVT_f16_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f16
+def : Pat<(f16 (uint_to_fp Int1Regs:$a)),
+ (CVT_f16_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f16 (uint_to_fp Int16Regs:$a)),
+ (CVT_f16_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp Int32Regs:$a)),
+ (CVT_f16_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f16 (uint_to_fp Int64Regs:$a)),
+ (CVT_f16_u64 Int64Regs:$a, CvtRN)>;
+
+// sint -> f32
+def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
+ (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
+ (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
+ (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
+ (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f32
+def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
+ (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
+ (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
+ (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
+ (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+
+// sint -> f64
+def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
+ (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
+ (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
+ (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
+ (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f64
+def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
+ (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
+ (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
+ (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
+ (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+
+
+// f16 -> sint
+def : Pat<(i1 (fp_to_sint Float16Regs:$a)),
+ (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
+ (CVT_s16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float16Regs:$a)),
+ (CVT_s16_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
+ (CVT_s32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float16Regs:$a)),
+ (CVT_s32_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
+ (CVT_s64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float16Regs:$a)),
+ (CVT_s64_f16 Float16Regs:$a, CvtRZI)>;
+
+// f16 -> uint
+def : Pat<(i1 (fp_to_uint Float16Regs:$a)),
+ (SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
+ (CVT_u16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float16Regs:$a)),
+ (CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
+ (CVT_u32_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float16Regs:$a)),
+ (CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
+ (CVT_u64_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float16Regs:$a)),
+ (CVT_u64_f16 Float16Regs:$a, CvtRZI)>;
+
+// f32 -> sint
+def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+ (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+ (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+ (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+ (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f32 -> uint
+def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
+ (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+ (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+ (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+ (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+ (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f64 -> sint
+def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
+ (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
+ (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
+ (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+
+// f64 -> uint
+def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
+ (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
+ (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
+ (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
+ (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+
+// sext i1
+def : Pat<(i16 (sext Int1Regs:$a)),
+ (SELP_s16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (sext Int1Regs:$a)),
+ (SELP_s32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (sext Int1Regs:$a)),
+ (SELP_s64ii -1, 0, Int1Regs:$a)>;
+
+// zext i1
+def : Pat<(i16 (zext Int1Regs:$a)),
+ (SELP_u16ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (zext Int1Regs:$a)),
+ (SELP_u32ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (zext Int1Regs:$a)),
+ (SELP_u64ii 1, 0, Int1Regs:$a)>;
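+// E.g., zext i1 -> i32 emits roughly "selp.u32 %r1, 1, 0, %p1;" (registers
+// invented).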
+
+// anyext i1
+def : Pat<(i16 (anyext Int1Regs:$a)),
+ (SELP_u16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (anyext Int1Regs:$a)),
+ (SELP_u32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (anyext Int1Regs:$a)),
+ (SELP_u64ii -1, 0, Int1Regs:$a)>;
+
+// sext i16
+def : Pat<(i32 (sext Int16Regs:$a)),
+ (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (sext Int16Regs:$a)),
+ (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+
+// zext i16
+def : Pat<(i32 (zext Int16Regs:$a)),
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (zext Int16Regs:$a)),
+ (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// anyext i16
+def : Pat<(i32 (anyext Int16Regs:$a)),
+ (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (anyext Int16Regs:$a)),
+ (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// sext i32
+def : Pat<(i64 (sext Int32Regs:$a)),
+ (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+
+// zext i32
+def : Pat<(i64 (zext Int32Regs:$a)),
+ (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+// anyext i32
+def : Pat<(i64 (anyext Int32Regs:$a)),
+ (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+
+// truncate i64
+def : Pat<(i32 (trunc Int64Regs:$a)),
+ (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i16 (trunc Int64Regs:$a)),
+ (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int64Regs:$a)),
+ (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i32
+def : Pat<(i16 (trunc Int32Regs:$a)),
+ (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int32Regs:$a)),
+ (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i16
+def : Pat<(i1 (trunc Int16Regs:$a)),
+ (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
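+// Truncation to i1 tests the low bit; e.g. for i32 the expansion is roughly
+// (registers invented):
+//   and.b32     %r2, %r1, 1;
+//   setp.eq.b32 %p1, %r2, 1;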
+
+// sext_inreg
+def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+
+
+// Select instructions with 32-bit predicates
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+ (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+ (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+ (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
+ (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+ (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+ (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
+ (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
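+// For instance, selecting between two f32 values with an i32 predicate lowers
+// to roughly (registers invented):
+//   and.b32     %r2, %r1, 1;
+//   setp.eq.b32 %p1, %r2, 1;
+//   selp.f32    %f3, %f1, %f2, %p1;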
+
+
+let hasSideEffects = 0 in {
+ // pack a set of smaller int registers to a larger int register
+ def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2,
+ Int16Regs:$s3, Int16Regs:$s4),
+ "mov.b64 \t$d, {{$s1, $s2, $s3, $s4}};", []>;
+ def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
+ (ins Int16Regs:$s1, Int16Regs:$s2),
+ "mov.b32 \t$d, {{$s1, $s2}};", []>;
+ def V2I32toI64 : NVPTXInst<(outs Int64Regs:$d),
+ (ins Int32Regs:$s1, Int32Regs:$s2),
+ "mov.b64 \t$d, {{$s1, $s2}};", []>;
+ def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
+ (ins Float32Regs:$s1, Float32Regs:$s2),
+ "mov.b64 \t$d, {{$s1, $s2}};", []>;
+
+ // unpack a larger int register to a set of smaller int registers
+ def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
+ Int16Regs:$d3, Int16Regs:$d4),
+ (ins Int64Regs:$s),
+ "mov.b64 \t{{$d1, $d2, $d3, $d4}}, $s;", []>;
+ def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
+ (ins Int32Regs:$s),
+ "mov.b32 \t{{$d1, $d2}}, $s;", []>;
+ def I64toV2I32 : NVPTXInst<(outs Int32Regs:$d1, Int32Regs:$d2),
+ (ins Int64Regs:$s),
+ "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+ def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
+ (ins Float64Regs:$s),
+ "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+
+}
+
+let hasSideEffects = 0 in {
+  // Extract an element of an f16x2 register. PTX does not provide any way
+  // to access the elements of an f16x2 vector directly, so we extract them
+  // via a temporary register.
+ def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16x2Regs:$src),
+ "{{ .reg .b16 \t%tmp_hi;\n\t"
+ " mov.b32 \t{$dst, %tmp_hi}, $src; }}",
+ [(set Float16Regs:$dst,
+ (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
+ def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16x2Regs:$src),
+ "{{ .reg .b16 \t%tmp_lo;\n\t"
+ " mov.b32 \t{%tmp_lo, $dst}, $src; }}",
+ [(set Float16Regs:$dst,
+ (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
+
+ // Coalesce two f16 registers into f16x2
+ def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ "mov.b32 \t$dst, {{$a, $b}};",
+ [(set Float16x2Regs:$dst,
+ (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
+
+  // Directly initializing the underlying b32 register takes one less SASS
+  // instruction than a vector-packing move.
+ def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
+ "mov.b32 \t$dst, $src;",
+ []>;
+
+ // Split f16x2 into two f16 registers.
+ def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+ (ins Float16x2Regs:$src),
+ "mov.b32 \t{{$lo, $hi}}, $src;",
+ []>;
+ // Split an i32 into two f16
+ def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+ (ins Int32Regs:$src),
+ "mov.b32 \t{{$lo, $hi}}, $src;",
+ []>;
+}
+
+// Count leading zeros
+let hasSideEffects = 0 in {
+ def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "clz.b32 \t$d, $a;", []>;
+ def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "clz.b64 \t$d, $a;", []>;
+}
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
+
+// The return type of the ctlz ISD node is the same as its input, but the PTX
+// clz instruction always returns a 32-bit value. For ctlz.i64, convert the
+// PTX value to 64 bits to match the ISD node's semantics, unless we know we're
+// truncating back down to 32 bits.
+def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
+
+// For 16-bit ctlz, we zero-extend to 32 bits, perform the count, then
+// truncate the result back to 16 bits if necessary. We also subtract 16
+// because the high-order 16 zeros were counted.
+//
+// TODO: NVPTX has a mov.b32 b32reg, {imm, b16reg} instruction, which we could
+// use to save one SASS instruction (on sm_35 anyway):
+//
+// mov.b32 $tmp, {0xffff, $a}
+// ctlz.b32 $result, $tmp
+//
+// That is, instead of zero-extending the input to 32 bits, we'd "one-extend"
+// and then ctlz that value. This way we don't have to subtract 16 from the
+// result. Unfortunately today we don't have a way to generate
+// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
+def : Pat<(ctlz Int16Regs:$a),
+ (SUBi16ri (CVT_u16_u32
+ (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+ (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
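+// With the first pattern above, a plain 16-bit ctlz expands to approximately
+// (registers invented):
+//   cvt.u32.u16 %r1, %rs1;
+//   clz.b32     %r2, %r1;
+//   cvt.u16.u32 %rs2, %r2;
+//   sub.s16     %rs3, %rs2, 16;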
+
+// Population count
+let hasSideEffects = 0 in {
+ def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+ "popc.b32 \t$d, $a;", []>;
+ def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+ "popc.b64 \t$d, $a;", []>;
+}
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit, so we zero-extend it to
+// 64 bits to match the LLVM semantics. Just as with ctlz.i64, we provide a
+// second pattern that avoids the type conversion if we're truncating the
+// result to i32 anyway.
+def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
+
+// For 16-bit ctpop, we zero-extend to 32 bits, then truncate the result back
+// to 16 bits. If we know the result is zero-extended to i32 anyway, we can
+// avoid the final truncation.
+def : Pat<(ctpop Int16Regs:$a),
+ (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
+ (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
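+// E.g., (i32 (zext (ctpop Int16Regs:$a))) needs only (registers invented):
+//   cvt.u32.u16 %r1, %rs1;
+//   popc.b32    %r2, %r1;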
+
+// fpround f32 -> f16
+def : Pat<(f16 (fpround Float32Regs:$a)),
+ (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f16 (fpround Float32Regs:$a)),
+ (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+
+// fpround f64 -> f16
+def : Pat<(f16 (fpround Float64Regs:$a)),
+ (CVT_f16_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f16 (fpround Float64Regs:$a)),
+ (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+
+// fpround f64 -> f32
+def : Pat<(f32 (fpround Float64Regs:$a)),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpround Float64Regs:$a)),
+ (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+
+// fpextend f16 -> f32
+def : Pat<(f32 (fpextend Float16Regs:$a)),
+ (CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fpextend Float16Regs:$a)),
+ (CVT_f32_f16 Float16Regs:$a, CvtNONE)>;
+
+// fpextend f16 -> f64
+def : Pat<(f64 (fpextend Float16Regs:$a)),
+ (CVT_f64_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fpextend Float16Regs:$a)),
+ (CVT_f64_f16 Float16Regs:$a, CvtNONE)>;
+
+// fpextend f32 -> f64
+def : Pat<(f64 (fpextend Float32Regs:$a)),
+ (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fpextend Float32Regs:$a)),
+ (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
+
+def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+// fceil, ffloor, fround, ftrunc.
+
+def : Pat<(fceil Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fceil Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fceil Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fceil Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRPI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fceil Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(ffloor Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ffloor Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ffloor Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ffloor Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRMI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ffloor Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
+
+def : Pat<(fround Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f16 (fround Float16Regs:$a)),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fround Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fround Float32Regs:$a)),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(f64 (fround Float64Regs:$a)),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(ftrunc Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ftrunc Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ftrunc Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(ftrunc Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRZI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(ftrunc Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
+
+// nearbyint and rint are implemented as rounding to nearest even. This isn't
+// strictly correct, because it causes us to ignore the rounding mode. But it
+// matches what CUDA's "libm" does.
+
+def : Pat<(fnearbyint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fnearbyint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fnearbyint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(fnearbyint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(fnearbyint Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
+
+def : Pat<(frint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(frint Float16Regs:$a),
+ (CVT_f16_f16 Float16Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(frint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(frint Float32Regs:$a),
+ (CVT_f32_f32 Float32Regs:$a, CvtRNI)>, Requires<[doNoF32FTZ]>;
+def : Pat<(frint Float64Regs:$a),
+ (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
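+// As a sketch, both nodes therefore lower to the same conversion; e.g. for
+// f32 in the non-FTZ case:
+//
+//   cvt.rni.f32.f32  %f2, %f1;   // round to nearest even, regardless of the
+//                                // dynamic rounding mode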
+
+
+//-----------------------------------
+// Control-flow
+//-----------------------------------
+
+let isTerminator=1 in {
+ let isReturn=1, isBarrier=1 in
+ def Return : NVPTXInst<(outs), (ins), "ret;", [(retflag)]>;
+
+ let isBranch=1 in
+ def CBranch : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@$a bra \t$target;",
+ [(brcond Int1Regs:$a, bb:$target)]>;
+ let isBranch=1 in
+ def CBranchOther : NVPTXInst<(outs), (ins Int1Regs:$a, brtarget:$target),
+ "@!$a bra \t$target;", []>;
+
+ let isBranch=1, isBarrier=1 in
+ def GOTO : NVPTXInst<(outs), (ins brtarget:$target),
+ "bra.uni \t$target;", [(br bb:$target)]>;
+}
+
+def : Pat<(brcond Int32Regs:$a, bb:$target),
+ (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
+
+// SelectionDAGBuilder::visitSwitchCase() will invert the condition of a
+// conditional branch if the target block is the next block, so that the code
+// can fall through to the target block. The inversion is done by 'xor
+// condition, 1', which will be translated to (setne condition, -1). Since PTX
+// supports '@!pred bra target', we should use it.
+def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target),
+ (CBranchOther Int1Regs:$a, bb:$target)>;
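+// Illustrative sketch: when the true target is the fall-through block,
+// instead of materializing the inverted predicate we branch on its negation:
+//
+//   @!%p1 bra LBB0_2;   // taken when %p1 is false; falls through otherwise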
+
+// Call
+def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
+def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart,
+ [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_NVPTXCallSeqEnd,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPSideEffect]>;
+
+def SDT_NVPTXCall : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call : SDNode<"NVPTXISD::CALL", SDT_NVPTXCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def calltarget : Operand<i32>;
+let isCall=1 in {
+ def CALL : NVPTXInst<(outs), (ins calltarget:$dst), "call \t$dst, (1);", []>;
+}
+
+def : Pat<(call tglobaladdr:$dst), (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst), (CALL texternalsym:$dst)>;
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : NVPTXInst<outs, ins, asmstr, pattern>;
+
+def Callseq_Start :
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\{ // callseq $amt1, $amt2\n"
+ "\t.reg .b32 temp_param_reg;",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
+def Callseq_End :
+ NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "\\} // callseq $amt1",
+ [(callseq_end timm:$amt1, timm:$amt2)]>;
+
+// trap instruction
+def trapinst : NVPTXInst<(outs), (ins), "trap;", [(trap)]>;
+
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype :
+ SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+ [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+ let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE :
+ NVPTXInst<(outs), (ins ProtoIdent:$ident),
+ "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+include "NVPTXIntrinsics.td"
+
+
+//-----------------------------------
+// Notes
+//-----------------------------------
+// BSWAP is currently expanded. The following would be more efficient:
+// - for < sm_20, use vector scalar mov, as Tesla supports native 16-bit
+//   registers
+// - for sm_20, use prmt (use vector scalar mov to get the pack and
+//   unpack). sm_20 supports native 32-bit registers, but not native 16-bit
+//   registers.
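+//
+// Sketch of the prmt-based i32 byte swap (selector 0x0123 reverses the four
+// bytes of the first source; the same trick as CUDA's
+// __byte_perm(x, 0, 0x0123)):
+//
+//   prmt.b32  %r2, %r1, 0, 0x0123;   // %r2 = bswap(%r1)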
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b0408f1..8d228a9 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -36,33 +36,39 @@ let isConvergent = 1 in {
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",
[(int_nvvm_barrier0)]>;
+def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
+ "bar.sync \t$src1;",
+ [(int_nvvm_barrier_n Int32Regs:$src1)]>;
+def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
+ "bar.sync \t$src1, $src2;",
+ [(int_nvvm_barrier Int32Regs:$src1, Int32Regs:$src2)]>;
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
- !strconcat(".reg .pred \t%p1; \n\t",
- !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
- !strconcat("bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
- !strconcat("}}", ""))))),
+ ".reg .pred \t%p1; \n\t",
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ "bar.red.popc.u32 \t$dst, 0, %p1; \n\t",
+ "}}"),
[(set Int32Regs:$dst, (int_nvvm_barrier0_popc Int32Regs:$pred))]>;
def INT_BARRIER0_AND : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
- !strconcat(".reg .pred \t%p1; \n\t",
- !strconcat(".reg .pred \t%p2; \n\t",
- !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
- !strconcat("bar.red.and.pred \t%p2, 0, %p1; \n\t",
- !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
- !strconcat("}}", ""))))))),
+ ".reg .pred \t%p1; \n\t",
+ ".reg .pred \t%p2; \n\t",
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ "bar.red.and.pred \t%p2, 0, %p1; \n\t",
+ "selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ "}}"),
[(set Int32Regs:$dst, (int_nvvm_barrier0_and Int32Regs:$pred))]>;
def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
- !strconcat(".reg .pred \t%p1; \n\t",
- !strconcat(".reg .pred \t%p2; \n\t",
- !strconcat("setp.ne.u32 \t%p1, $pred, 0; \n\t",
- !strconcat("bar.red.or.pred \t%p2, 0, %p1; \n\t",
- !strconcat("selp.u32 \t$dst, 1, 0, %p2; \n\t",
- !strconcat("}}", ""))))))),
+ ".reg .pred \t%p1; \n\t",
+ ".reg .pred \t%p2; \n\t",
+ "setp.ne.u32 \t%p1, $pred, 0; \n\t",
+ "bar.red.or.pred \t%p2, 0, %p1; \n\t",
+ "selp.u32 \t$dst, 1, 0, %p2; \n\t",
+ "}}"),
[(set Int32Regs:$dst, (int_nvvm_barrier0_or Int32Regs:$pred))]>;
-def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync\t$i;",
+def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
[(int_nvvm_bar_sync imm:$i)]>;
// shfl.{up,down,bfly,idx}.b32
@@ -187,16 +193,6 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass,
// MISC
//
-def INT_NVVM_CLZ_I : F_MATH_1<"clz.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_clz_i>;
-def INT_NVVM_CLZ_LL : F_MATH_1<"clz.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
- int_nvvm_clz_ll>;
-
-def INT_NVVM_POPC_I : F_MATH_1<"popc.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_popc_i>;
-def INT_NVVM_POPC_LL : F_MATH_1<"popc.b64 \t$dst, $src0;", Int32Regs, Int64Regs,
- int_nvvm_popc_ll>;
-
def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
Int32Regs, Int32Regs, Int32Regs, int_nvvm_prmt>;
@@ -204,26 +200,6 @@ def INT_NVVM_PRMT : F_MATH_3<"prmt.b32 \t$dst, $src0, $src1, $src2;", Int32Regs,
// Min Max
//
-def INT_NVVM_MIN_I : F_MATH_2<"min.s32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_min_i>;
-def INT_NVVM_MIN_UI : F_MATH_2<"min.u32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_min_ui>;
-
-def INT_NVVM_MIN_LL : F_MATH_2<"min.s64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_min_ll>;
-def INT_NVVM_MIN_ULL : F_MATH_2<"min.u64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_min_ull>;
-
-def INT_NVVM_MAX_I : F_MATH_2<"max.s32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_max_i>;
-def INT_NVVM_MAX_UI : F_MATH_2<"max.u32 \t$dst, $src0, $src1;", Int32Regs,
- Int32Regs, Int32Regs, int_nvvm_max_ui>;
-
-def INT_NVVM_MAX_LL : F_MATH_2<"max.s64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_max_ll>;
-def INT_NVVM_MAX_ULL : F_MATH_2<"max.u64 \t$dst, $src0, $src1;", Int64Regs,
- Int64Regs, Int64Regs, int_nvvm_max_ull>;
-
def INT_NVVM_FMIN_F : F_MATH_2<"min.f32 \t$dst, $src0, $src1;", Float32Regs,
Float32Regs, Float32Regs, int_nvvm_fmin_f>;
def INT_NVVM_FMIN_FTZ_F : F_MATH_2<"min.ftz.f32 \t$dst, $src0, $src1;",
@@ -239,6 +215,7 @@ def INT_NVVM_FMIN_D : F_MATH_2<"min.f64 \t$dst, $src0, $src1;", Float64Regs,
def INT_NVVM_FMAX_D : F_MATH_2<"max.f64 \t$dst, $src0, $src1;", Float64Regs,
Float64Regs, Float64Regs, int_nvvm_fmax_d>;
+
//
// Multiplication
//
@@ -321,15 +298,6 @@ def INT_NVVM_DIV_RP_D : F_MATH_2<"div.rp.f64 \t$dst, $src0, $src1;",
Float64Regs, Float64Regs, Float64Regs, int_nvvm_div_rp_d>;
//
-// Brev
-//
-
-def INT_NVVM_BREV32 : F_MATH_1<"brev.b32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_brev32>;
-def INT_NVVM_BREV64 : F_MATH_1<"brev.b64 \t$dst, $src0;", Int64Regs, Int64Regs,
- int_nvvm_brev64>;
-
-//
// Sad
//
@@ -360,11 +328,6 @@ def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
// Abs
//
-def INT_NVVM_ABS_I : F_MATH_1<"abs.s32 \t$dst, $src0;", Int32Regs, Int32Regs,
- int_nvvm_abs_i>;
-def INT_NVVM_ABS_LL : F_MATH_1<"abs.s64 \t$dst, $src0;", Int64Regs, Int64Regs,
- int_nvvm_abs_ll>;
-
def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs,
Float32Regs, int_nvvm_fabs_ftz_f>;
def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs,
@@ -703,16 +666,18 @@ def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
-def INT_NVVM_D2I_LO : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b32 %temp; \n\t",
- !strconcat("mov.b64 \t{$dst, %temp}, $src0;\n\t",
- "}}"))),
- Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
-def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b32 %temp; \n\t",
- !strconcat("mov.b64 \t{%temp, $dst}, $src0;\n\t",
- "}}"))),
- Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
+def INT_NVVM_D2I_LO : F_MATH_1<
+ !strconcat("{{\n\t",
+ ".reg .b32 %temp; \n\t",
+ "mov.b64 \t{$dst, %temp}, $src0;\n\t",
+ "}}"),
+ Int32Regs, Float64Regs, int_nvvm_d2i_lo>;
+def INT_NVVM_D2I_HI : F_MATH_1<
+ !strconcat("{{\n\t",
+ ".reg .b32 %temp; \n\t",
+ "mov.b64 \t{%temp, $dst}, $src0;\n\t",
+ "}}"),
+ Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
(CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
@@ -803,49 +768,10 @@ def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
(CVT_f64_u64 Int64Regs:$a, CvtRP)>;
-// FIXME: Ideally, we could use these patterns instead of the scope-creating
-// patterns, but ptxas does not like these since .s16 is not compatible with
-// .f16. The solution is to use .bXX for all integer register types, but we
-// are not there yet.
-//def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
-// (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
-//def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
-// (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
-//
-//def : Pat<(int_nvvm_h2f Int16Regs:$a),
-// (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
-
-def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b16 %temp;\n\t",
- !strconcat("cvt.rn.ftz.f16.f32 \t%temp, $src0;\n\t",
- !strconcat("mov.b16 \t$dst, %temp;\n",
- "}}")))),
- Int16Regs, Float32Regs, int_nvvm_f2h_rn_ftz>;
-def INT_NVVM_F2H_RN : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b16 %temp;\n\t",
- !strconcat("cvt.rn.f16.f32 \t%temp, $src0;\n\t",
- !strconcat("mov.b16 \t$dst, %temp;\n",
- "}}")))),
- Int16Regs, Float32Regs, int_nvvm_f2h_rn>;
-
-def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
- !strconcat(".reg .b16 %temp;\n\t",
- !strconcat("mov.b16 \t%temp, $src0;\n\t",
- !strconcat("cvt.f32.f16 \t$dst, %temp;\n\t",
- "}}")))),
- Float32Regs, Int16Regs, int_nvvm_h2f>;
-
-def : Pat<(f32 (f16_to_fp Int16Regs:$a)),
- (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
- (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
-def : Pat<(i16 (fp_to_f16 Float32Regs:$a)),
- (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
-
-def : Pat<(f64 (f16_to_fp Int16Regs:$a)),
- (CVT_f64_f16 Int16Regs:$a, CvtNONE)>;
-def : Pat<(i16 (fp_to_f16 Float64Regs:$a)),
- (CVT_f16_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
+ (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ))>;
+def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
+ (BITCONVERT_16_F2I (CVT_f16_f32 Float32Regs:$a, CvtRN))>;
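+// With f16 now a legal type, the intrinsic lowers to a plain conversion plus
+// a bitcast back to the i16 the intrinsic returns; roughly (illustrative
+// registers):
+//
+//   cvt.rn.f16.f32  %h1, %f1;    // f32 -> f16
+//   mov.b16         %rs1, %h1;   // reinterpret as i16 (BITCONVERT_16_F2I)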
//
// Bitcast
@@ -882,20 +808,12 @@ multiclass F_ATOMIC_2_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
Operand IMMType, SDNode IMM, Predicate Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
Requires<[Pred]>;
def imm : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, IMMType:$b),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, IMM:$b))]>,
Requires<[Pred]>;
}
multiclass F_ATOMIC_2<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
@@ -911,21 +829,13 @@ multiclass F_ATOMIC_2_NEG_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
Operand IMMType, Predicate Pred> {
def reg : NVPTXInst<(outs regclass:$dst), (ins ptrclass:$addr, regclass:$b),
- !strconcat("{{ \n\t",
- !strconcat(".reg \t.s",
- !strconcat(TypeStr,
- !strconcat(" temp; \n\t",
- !strconcat("neg.s",
- !strconcat(TypeStr,
- !strconcat(" \ttemp, $b; \n\t",
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(".u",
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], temp; \n\t",
- !strconcat("}}", "")))))))))))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
+ !strconcat(
+ "{{ \n\t",
+ ".reg \t.s", TypeStr, " temp; \n\t",
+ "neg.s", TypeStr, " \ttemp, $b; \n\t",
+ "atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t",
+ "}}"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b))]>,
Requires<[Pred]>;
}
multiclass F_ATOMIC_2_NEG<NVPTXRegClass regclass, string SpaceStr,
@@ -943,40 +853,26 @@ multiclass F_ATOMIC_3_imp<NVPTXRegClass ptrclass, NVPTXRegClass regclass,
Operand IMMType, Predicate Pred> {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst,
- (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
- Requires<[Pred]>;
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, regclass:$c))]>,
+ Requires<[Pred]>;
+
def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, regclass:$c))]>,
Requires<[Pred]>;
+
def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, regclass:$b, imm:$c))]>,
Requires<[Pred]>;
+
def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
- !strconcat("atom",
- !strconcat(SpaceStr,
- !strconcat(OpcStr,
- !strconcat(TypeStr,
- !strconcat(" \t$dst, [$addr], $b, $c;", ""))))),
- [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
+ !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ [(set regclass:$dst, (IntOp ptrclass:$addr, imm:$b, imm:$c))]>,
Requires<[Pred]>;
}
multiclass F_ATOMIC_3<NVPTXRegClass regclass, string SpaceStr, string TypeStr,
@@ -1607,6 +1503,8 @@ defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f16 : LDU_G<"b16 \t$result, [$src];", Float16Regs>;
+defm INT_PTX_LDU_GLOBAL_f16x2 : LDU_G<"b32 \t$result, [$src];", Float16x2Regs>;
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
@@ -1657,6 +1555,10 @@ defm INT_PTX_LDU_G_v2i16_ELE
: VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDU_G_v2i32_ELE
: VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f16_ELE
+ : VLDU_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
+defm INT_PTX_LDU_G_v2f16x2_ELE
+ : VLDU_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
defm INT_PTX_LDU_G_v2f32_ELE
: VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
defm INT_PTX_LDU_G_v2i64_ELE
@@ -1671,6 +1573,12 @@ defm INT_PTX_LDU_G_v4i16_ELE
defm INT_PTX_LDU_G_v4i32_ELE
: VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Int32Regs>;
+defm INT_PTX_LDU_G_v4f16_ELE
+ : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Float16Regs>;
+defm INT_PTX_LDU_G_v4f16x2_ELE
+ : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+ Float16x2Regs>;
defm INT_PTX_LDU_G_v4f32_ELE
: VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
Float32Regs>;
@@ -1710,6 +1618,10 @@ defm INT_PTX_LDG_GLOBAL_i32
: LDG_G<"u32 \t$result, [$src];", Int32Regs>;
defm INT_PTX_LDG_GLOBAL_i64
: LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f16
+ : LDG_G<"b16 \t$result, [$src];", Float16Regs>;
+defm INT_PTX_LDG_GLOBAL_f16x2
+ : LDG_G<"b32 \t$result, [$src];", Float16x2Regs>;
defm INT_PTX_LDG_GLOBAL_f32
: LDG_G<"f32 \t$result, [$src];", Float32Regs>;
defm INT_PTX_LDG_GLOBAL_f64
@@ -1765,6 +1677,10 @@ defm INT_PTX_LDG_G_v2i16_ELE
: VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v2i32_ELE
: VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f16_ELE
+ : VLDG_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
+defm INT_PTX_LDG_G_v2f16x2_ELE
+ : VLDG_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
defm INT_PTX_LDG_G_v2f32_ELE
: VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
defm INT_PTX_LDG_G_v2i64_ELE
@@ -1777,17 +1693,21 @@ defm INT_PTX_LDG_G_v4i16_ELE
: VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
defm INT_PTX_LDG_G_v4i32_ELE
: VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f16_ELE
+ : VLDG_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16Regs>;
+defm INT_PTX_LDG_G_v4f16x2_ELE
+ : VLDG_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16x2Regs>;
defm INT_PTX_LDG_G_v4f32_ELE
: VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
multiclass NG_TO_G<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- !strconcat("cvta.", !strconcat(Str, ".u32 \t$result, $src;")),
+ !strconcat("cvta.", Str, ".u32 \t$result, $src;"),
[(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
Requires<[hasGenericLdSt]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- !strconcat("cvta.", !strconcat(Str, ".u64 \t$result, $src;")),
+ !strconcat("cvta.", Str, ".u64 \t$result, $src;"),
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
Requires<[hasGenericLdSt]>;
@@ -1821,11 +1741,11 @@ multiclass NG_TO_G<string Str, Intrinsic Intrin> {
multiclass G_TO_NG<string Str, Intrinsic Intrin> {
def _yes : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- !strconcat("cvta.to.", !strconcat(Str, ".u32 \t$result, $src;")),
+ !strconcat("cvta.to.", Str, ".u32 \t$result, $src;"),
[(set Int32Regs:$result, (Intrin Int32Regs:$src))]>,
Requires<[hasGenericLdSt]>;
def _yes_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- !strconcat("cvta.to.", !strconcat(Str, ".u64 \t$result, $src;")),
+ !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"),
[(set Int64Regs:$result, (Intrin Int64Regs:$src))]>,
Requires<[hasGenericLdSt]>;
def _no : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
@@ -1983,7 +1903,7 @@ def ISSPACEP_SHARED_64
// Special register reads
def MOV_SPECIAL : NVPTXInst<(outs Int32Regs:$d),
(ins SpecialRegs:$r),
- "mov.b32\t$d, $r;", []>;
+ "mov.b32 \t$d, $r;", []>;
def : Pat<(int_nvvm_read_ptx_sreg_envreg0), (MOV_SPECIAL ENVREG0)>;
def : Pat<(int_nvvm_read_ptx_sreg_envreg1), (MOV_SPECIAL ENVREG1)>;
@@ -2046,20 +1966,18 @@ def : Pat<(int_nvvm_rotate_b32 Int32Regs:$src, Int32Regs:$amt),
Requires<[noHWROT32]> ;
let hasSideEffects = 0 in {
- def GET_LO_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{$dst,%dummy}, $src;\n\t",
- !strconcat("}}", "")))),
+ def GET_LO_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ ".reg .b32 %dummy;\n\t",
+ "mov.b64 \t{$dst,%dummy}, $src;\n\t",
+ "}}"),
[]> ;
- def GET_HI_INT64
- : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
- !strconcat("{{\n\t",
- !strconcat(".reg .b32 %dummy;\n\t",
- !strconcat("mov.b64 \t{%dummy,$dst}, $src;\n\t",
- !strconcat("}}", "")))),
+ def GET_HI_INT64 : NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$src),
+ !strconcat("{{\n\t",
+ ".reg .b32 %dummy;\n\t",
+ "mov.b64 \t{%dummy,$dst}, $src;\n\t",
+ "}}"),
[]> ;
}
@@ -2164,19 +2082,19 @@ def TEX_1D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_F32_F32_LEVEL
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
- "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], $lod;",
[]>;
def TEX_1D_F32_F32_GRAD
@@ -2184,27 +2102,27 @@ def TEX_1D_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_S32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], $lod;",
[]>;
def TEX_1D_S32_F32_GRAD
@@ -2212,27 +2130,27 @@ def TEX_1D_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
- "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
- "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+ "tex.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
[]>;
def TEX_1D_U32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], $lod;",
[]>;
def TEX_1D_U32_F32_GRAD
@@ -2240,7 +2158,7 @@ def TEX_1D_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -2248,14 +2166,14 @@ def TEX_1D_ARRAY_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_F32_F32_LEVEL
@@ -2263,7 +2181,7 @@ def TEX_1D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], $lod;",
[]>;
def TEX_1D_ARRAY_F32_F32_GRAD
@@ -2271,21 +2189,21 @@ def TEX_1D_ARRAY_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_ARRAY_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_S32_F32_LEVEL
@@ -2293,7 +2211,7 @@ def TEX_1D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], $lod;",
[]>;
def TEX_1D_ARRAY_S32_F32_GRAD
@@ -2301,21 +2219,21 @@ def TEX_1D_ARRAY_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_1D_ARRAY_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}];",
[]>;
def TEX_1D_ARRAY_U32_F32_LEVEL
@@ -2323,7 +2241,7 @@ def TEX_1D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], $lod;",
[]>;
def TEX_1D_ARRAY_U32_F32_GRAD
@@ -2331,7 +2249,7 @@ def TEX_1D_ARRAY_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -2339,14 +2257,14 @@ def TEX_2D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_F32_F32_LEVEL
@@ -2354,7 +2272,7 @@ def TEX_2D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], $lod;",
[]>;
def TEX_2D_F32_F32_GRAD
@@ -2363,7 +2281,7 @@ def TEX_2D_F32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2371,14 +2289,14 @@ def TEX_2D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_S32_F32_LEVEL
@@ -2386,7 +2304,7 @@ def TEX_2D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], $lod;",
[]>;
def TEX_2D_S32_F32_GRAD
@@ -2395,7 +2313,7 @@ def TEX_2D_S32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2403,14 +2321,14 @@ def TEX_2D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TEX_2D_U32_F32_LEVEL
@@ -2418,7 +2336,7 @@ def TEX_2D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], $lod;",
[]>;
def TEX_2D_U32_F32_GRAD
@@ -2427,7 +2345,7 @@ def TEX_2D_U32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2437,7 +2355,7 @@ def TEX_2D_ARRAY_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_F32_F32
@@ -2445,7 +2363,7 @@ def TEX_2D_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_F32_F32_LEVEL
@@ -2453,7 +2371,7 @@ def TEX_2D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_2D_ARRAY_F32_F32_GRAD
@@ -2462,7 +2380,7 @@ def TEX_2D_ARRAY_F32_F32_GRAD
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2471,7 +2389,7 @@ def TEX_2D_ARRAY_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_S32_F32
@@ -2479,7 +2397,7 @@ def TEX_2D_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_S32_F32_LEVEL
@@ -2487,7 +2405,7 @@ def TEX_2D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_2D_ARRAY_S32_F32_GRAD
@@ -2497,7 +2415,7 @@ def TEX_2D_ARRAY_S32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2506,7 +2424,7 @@ def TEX_2D_ARRAY_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_U32_F32
@@ -2514,7 +2432,7 @@ def TEX_2D_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_2D_ARRAY_U32_F32_LEVEL
@@ -2522,7 +2440,7 @@ def TEX_2D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_2D_ARRAY_U32_F32_GRAD
@@ -2532,7 +2450,7 @@ def TEX_2D_ARRAY_U32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -2542,7 +2460,7 @@ def TEX_3D_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_F32_F32
@@ -2550,7 +2468,7 @@ def TEX_3D_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_F32_F32_LEVEL
@@ -2558,7 +2476,7 @@ def TEX_3D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_3D_F32_F32_GRAD
@@ -2569,7 +2487,7 @@ def TEX_3D_F32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -2579,7 +2497,7 @@ def TEX_3D_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_S32_F32
@@ -2587,7 +2505,7 @@ def TEX_3D_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_S32_F32_LEVEL
@@ -2595,7 +2513,7 @@ def TEX_3D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_3D_S32_F32_GRAD
@@ -2606,7 +2524,7 @@ def TEX_3D_S32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -2616,7 +2534,7 @@ def TEX_3D_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_U32_F32
@@ -2624,7 +2542,7 @@ def TEX_3D_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_3D_U32_F32_LEVEL
@@ -2632,7 +2550,7 @@ def TEX_3D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_3D_U32_F32_GRAD
@@ -2643,7 +2561,7 @@ def TEX_3D_U32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -2654,7 +2572,7 @@ def TEX_CUBE_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_CUBE_F32_F32_LEVEL
@@ -2663,7 +2581,7 @@ def TEX_CUBE_F32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_CUBE_S32_F32
@@ -2671,7 +2589,7 @@ def TEX_CUBE_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_CUBE_S32_F32_LEVEL
@@ -2680,7 +2598,7 @@ def TEX_CUBE_S32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_CUBE_U32_F32
@@ -2688,7 +2606,7 @@ def TEX_CUBE_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_CUBE_U32_F32_LEVEL
@@ -2697,7 +2615,7 @@ def TEX_CUBE_U32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
@@ -2706,7 +2624,7 @@ def TEX_CUBE_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_CUBE_ARRAY_F32_F32_LEVEL
@@ -2715,7 +2633,7 @@ def TEX_CUBE_ARRAY_F32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_CUBE_ARRAY_S32_F32
@@ -2723,7 +2641,7 @@ def TEX_CUBE_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_CUBE_ARRAY_S32_F32_LEVEL
@@ -2732,7 +2650,7 @@ def TEX_CUBE_ARRAY_S32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_CUBE_ARRAY_U32_F32
@@ -2740,7 +2658,7 @@ def TEX_CUBE_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_CUBE_ARRAY_U32_F32_LEVEL
@@ -2749,7 +2667,7 @@ def TEX_CUBE_ARRAY_U32_F32_LEVEL
(ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, $s, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
@@ -2757,84 +2675,84 @@ def TLD4_R_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_G_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_B_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_A_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_R_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_G_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_B_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_A_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_R_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_G_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_B_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
def TLD4_A_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, $s, \\{$x, $y\\}];",
[]>;
}
@@ -2847,19 +2765,19 @@ def TEX_UNIFIED_1D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_F32_F32_LEVEL
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$lod),
- "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_F32_F32_GRAD
@@ -2867,27 +2785,27 @@ def TEX_UNIFIED_1D_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_S32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_S32_F32_GRAD
@@ -2895,27 +2813,27 @@ def TEX_UNIFIED_1D_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x),
- "tex.1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x),
- "tex.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
+ "tex.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, [$t, \\{$x\\}];",
[]>;
def TEX_UNIFIED_1D_U32_F32_LEVEL
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_U32_F32_GRAD
@@ -2923,7 +2841,7 @@ def TEX_UNIFIED_1D_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -2931,14 +2849,14 @@ def TEX_UNIFIED_1D_ARRAY_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
@@ -2946,7 +2864,7 @@ def TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
@@ -2954,21 +2872,21 @@ def TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
@@ -2976,7 +2894,7 @@ def TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
@@ -2984,21 +2902,21 @@ def TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x),
- "tex.a1d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x),
- "tex.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}];",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
@@ -3006,7 +2924,7 @@ def TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$lod),
- "tex.level.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], $lod;",
[]>;
def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
@@ -3014,7 +2932,7 @@ def TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$gradx, Float32Regs:$grady),
- "tex.grad.a1d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a1d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
[]>;
@@ -3022,14 +2940,14 @@ def TEX_UNIFIED_2D_F32_S32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_F32_F32_LEVEL
@@ -3037,7 +2955,7 @@ def TEX_UNIFIED_2D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_F32_F32_GRAD
@@ -3046,7 +2964,7 @@ def TEX_UNIFIED_2D_F32_F32_GRAD
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3054,14 +2972,14 @@ def TEX_UNIFIED_2D_S32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_S32_F32_LEVEL
@@ -3069,7 +2987,7 @@ def TEX_UNIFIED_2D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_S32_F32_GRAD
@@ -3078,7 +2996,7 @@ def TEX_UNIFIED_2D_S32_F32_GRAD
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3086,14 +3004,14 @@ def TEX_UNIFIED_2D_U32_S32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y),
- "tex.2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tex.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TEX_UNIFIED_2D_U32_F32_LEVEL
@@ -3101,7 +3019,7 @@ def TEX_UNIFIED_2D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$lod),
- "tex.level.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_U32_F32_GRAD
@@ -3110,7 +3028,7 @@ def TEX_UNIFIED_2D_U32_F32_GRAD
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3120,7 +3038,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_F32_F32
@@ -3128,7 +3046,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
@@ -3136,7 +3054,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
@@ -3145,7 +3063,7 @@ def TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3154,7 +3072,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_S32_F32
@@ -3162,7 +3080,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
@@ -3170,7 +3088,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
@@ -3180,7 +3098,7 @@ def TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3189,7 +3107,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Int32Regs:$x,
Int32Regs:$y),
- "tex.a2d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_U32_F32
@@ -3197,7 +3115,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y),
- "tex.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}];",
[]>;
def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
@@ -3205,7 +3123,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l, Float32Regs:$x,
Float32Regs:$y, Float32Regs:$lod),
- "tex.level.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], $lod;",
[]>;
def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
@@ -3215,7 +3133,7 @@ def TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
Float32Regs:$y,
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$grady0, Float32Regs:$grady1),
- "tex.grad.a2d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.a2d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
"\\{$grady0, $grady1\\};",
[]>;
@@ -3225,7 +3143,7 @@ def TEX_UNIFIED_3D_F32_S32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_F32_F32
@@ -3233,7 +3151,7 @@ def TEX_UNIFIED_3D_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_F32_F32_LEVEL
@@ -3241,7 +3159,7 @@ def TEX_UNIFIED_3D_F32_F32_LEVEL
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_3D_F32_F32_GRAD
@@ -3252,7 +3170,7 @@ def TEX_UNIFIED_3D_F32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -3262,7 +3180,7 @@ def TEX_UNIFIED_3D_S32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_S32_F32
@@ -3270,7 +3188,7 @@ def TEX_UNIFIED_3D_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_S32_F32_LEVEL
@@ -3278,7 +3196,7 @@ def TEX_UNIFIED_3D_S32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_3D_S32_F32_GRAD
@@ -3289,7 +3207,7 @@ def TEX_UNIFIED_3D_S32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -3299,7 +3217,7 @@ def TEX_UNIFIED_3D_U32_S32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$x, Int32Regs:$y,
Int32Regs:$z),
- "tex.3d.v4.u32.s32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.s32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_U32_F32
@@ -3307,7 +3225,7 @@ def TEX_UNIFIED_3D_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z),
- "tex.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_3D_U32_F32_LEVEL
@@ -3315,7 +3233,7 @@ def TEX_UNIFIED_3D_U32_F32_LEVEL
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y,
Float32Regs:$z, Float32Regs:$lod),
- "tex.level.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_3D_U32_F32_GRAD
@@ -3326,7 +3244,7 @@ def TEX_UNIFIED_3D_U32_F32_GRAD
Float32Regs:$gradx0, Float32Regs:$gradx1,
Float32Regs:$gradx2, Float32Regs:$grady0,
Float32Regs:$grady1, Float32Regs:$grady2),
- "tex.grad.3d.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.grad.3d.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], "
"\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
"\\{$grady0, $grady1, $grady2, $grady2\\};",
@@ -3337,7 +3255,7 @@ def TEX_UNIFIED_CUBE_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_F32_F32_LEVEL
@@ -3346,7 +3264,7 @@ def TEX_UNIFIED_CUBE_F32_F32_LEVEL
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_S32_F32
@@ -3354,7 +3272,7 @@ def TEX_UNIFIED_CUBE_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_S32_F32_LEVEL
@@ -3363,7 +3281,7 @@ def TEX_UNIFIED_CUBE_S32_F32_LEVEL
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_U32_F32
@@ -3371,7 +3289,7 @@ def TEX_UNIFIED_CUBE_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_U32_F32_LEVEL
@@ -3380,7 +3298,7 @@ def TEX_UNIFIED_CUBE_U32_F32_LEVEL
(ins Int64Regs:$t,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.cube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.cube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$x, $y, $z, $z\\}], $lod;",
[]>;
@@ -3389,7 +3307,7 @@ def TEX_UNIFIED_CUBE_ARRAY_F32_F32
Float32Regs:$b, Float32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
@@ -3398,7 +3316,7 @@ def TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.f32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_S32_F32
@@ -3406,7 +3324,7 @@ def TEX_UNIFIED_CUBE_ARRAY_S32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
@@ -3415,7 +3333,7 @@ def TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.s32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_U32_F32
@@ -3423,7 +3341,7 @@ def TEX_UNIFIED_CUBE_ARRAY_U32_F32
Int32Regs:$b, Int32Regs:$a),
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z),
- "tex.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}];",
[]>;
def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
@@ -3432,7 +3350,7 @@ def TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
(ins Int64Regs:$t, Int32Regs:$l,
Float32Regs:$x, Float32Regs:$y, Float32Regs:$z,
Float32Regs:$lod),
- "tex.level.acube.v4.u32.f32\t\\{$r, $g, $b, $a\\}, "
+ "tex.level.acube.v4.u32.f32 \t\\{$r, $g, $b, $a\\}, "
"[$t, \\{$l, $x, $y, $z\\}], $lod;",
[]>;
@@ -3440,84 +3358,84 @@ def TLD4_UNIFIED_R_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_G_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_B_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_A_2D_F32_F32
: NVPTXInst<(outs Float32Regs:$v0, Float32Regs:$v1,
Float32Regs:$v2, Float32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.f32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.f32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_R_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_G_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_B_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_A_2D_S32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.s32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.s32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_R_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.r.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.r.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_G_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.g.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.g.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_B_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.b.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.b.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
def TLD4_UNIFIED_A_2D_U32_F32
: NVPTXInst<(outs Int32Regs:$v0, Int32Regs:$v1,
Int32Regs:$v2, Int32Regs:$v3),
(ins Int64Regs:$t, Float32Regs:$x, Float32Regs:$y),
- "tld4.a.2d.v4.u32.f32\t\\{$v0, $v1, $v2, $v3\\}, "
+ "tld4.a.2d.v4.u32.f32 \t\\{$v0, $v1, $v2, $v3\\}, "
"[$t, \\{$x, $y\\}];",
[]>;
}
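// Editorial sketch (register numbers assumed): after operand substitution,
// a definition such as TLD4_UNIFIED_A_2D_U32_F32 above prints PTX like
//   tld4.a.2d.v4.u32.f32 {%r0, %r1, %r2, %r3}, [%rd1, {%f1, %f2}];
// The doubled braces ("\\{", "\\}") in the format strings are escapes that
// survive TableGen and the asm printer as literal '{' and '}'.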
@@ -7172,12 +7090,12 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
class PTX_READ_SREG_R64<string regname, Intrinsic intop>
: NVPTXInst<(outs Int64Regs:$d), (ins),
- !strconcat(!strconcat("mov.u64\t$d, %", regname), ";"),
+ !strconcat("mov.u64 \t$d, %", regname, ";"),
[(set Int64Regs:$d, (intop))]>;
class PTX_READ_SREG_R32<string regname, Intrinsic intop>
: NVPTXInst<(outs Int32Regs:$d), (ins),
- !strconcat(!strconcat("mov.u32\t$d, %", regname), ";"),
+ !strconcat("mov.u32 \t$d, %", regname, ";"),
[(set Int32Regs:$d, (intop))]>;
// TODO Add read vector-version of special registers
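// Editorial sketch (assumed instantiation; the concrete defs live further
// down in this file): each special register gets a def such as
//   def INT_PTX_SREG_TID_X :
//       PTX_READ_SREG_R32<"tid.x", int_nvvm_read_ptx_sreg_tid_x>;
// for which the variadic !strconcat above yields "mov.u32 \t$d, %tid.x;".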
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index b925b63..989f0a3 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "NVPTXLowerAggrCopies.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -26,6 +27,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
#define DEBUG_TYPE "nvptx"
@@ -41,6 +43,7 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addPreserved<StackProtector>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
bool runOnFunction(Function &F) override;
@@ -54,194 +57,14 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
char NVPTXLowerAggrCopies::ID = 0;
-// Lower memcpy to loop.
-void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
- bool DstIsVolatile, LLVMContext &Context,
- Function &F) {
- Type *TypeOfCopyLen = CopyLen->getType();
-
- BasicBlock *OrigBB = ConvertedInst->getParent();
- BasicBlock *NewBB =
- ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
- BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
-
- OrigBB->getTerminator()->setSuccessor(0, LoopBB);
- IRBuilder<> Builder(OrigBB->getTerminator());
-
- // SrcAddr and DstAddr are expected to be pointer types,
- // so no check is made here.
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
- // Cast pointers to (char *)
- SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
- DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
- // load from SrcAddr+LoopIndex
- // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
- // word-sized loads and stores.
- Value *Element =
- LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
- LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
- SrcIsVolatile);
- // store at DstAddr+LoopIndex
- LoopBuilder.CreateStore(Element,
- LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
- DstAddr, LoopIndex),
- DstIsVolatile);
-
- // The value for LoopIndex coming from the backedge is (LoopIndex + 1)
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
- NewBB);
-}
-
-// Lower memmove to IR. memmove is required to correctly copy overlapping memory
-// regions; therefore, it has to check the relative positions of the source and
-// destination pointers and choose the copy direction accordingly.
-//
-// The code below is an IR rendition of this C function:
-//
-// void* memmove(void* dst, const void* src, size_t n) {
-// unsigned char* d = dst;
-// const unsigned char* s = src;
-// if (s < d) {
-// // copy backwards
-// while (n--) {
-// d[n] = s[n];
-// }
-// } else {
-// // copy forward
-// for (size_t i = 0; i < n; ++i) {
-// d[i] = s[i];
-// }
-// }
-// return dst;
-// }
-void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
- bool DstIsVolatile, LLVMContext &Context,
- Function &F) {
- Type *TypeOfCopyLen = CopyLen->getType();
- BasicBlock *OrigBB = ConvertedInst->getParent();
-
- // Create a comparison of src and dst, based on which we jump to either
- // the forward-copy part of the function (if src >= dst) or the backwards-copy
- // part (if src < dst).
- // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
- // structure. Its block terminators (unconditional branches) are replaced by
- // the appropriate conditional branches when the loop is built.
- ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT,
- SrcAddr, DstAddr, "compare_src_dst");
- TerminatorInst *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm,
- &ElseTerm);
-
- // Each part of the function consists of two blocks:
- // copy_backwards: used to skip the loop when n == 0
- // copy_backwards_loop: the actual backwards loop BB
- // copy_forward: used to skip the loop when n == 0
- // copy_forward_loop: the actual forward loop BB
- BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
- CopyBackwardsBB->setName("copy_backwards");
- BasicBlock *CopyForwardBB = ElseTerm->getParent();
- CopyForwardBB->setName("copy_forward");
- BasicBlock *ExitBB = ConvertedInst->getParent();
- ExitBB->setName("memmove_done");
-
- // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
- // between both backwards and forward copy clauses.
- ICmpInst *CompareN =
- new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
- ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
-
- // Copying backwards.
- BasicBlock *LoopBB =
- BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB);
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- Value *IndexPtr = LoopBuilder.CreateSub(
- LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
- Value *Element = LoopBuilder.CreateLoad(
- LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
- LoopBuilder.CreateStore(Element,
- LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
- ExitBB, LoopBB);
- LoopPhi->addIncoming(IndexPtr, LoopBB);
- LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
- BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
- ThenTerm->eraseFromParent();
-
- // Copying forward.
- BasicBlock *FwdLoopBB =
- BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB);
- IRBuilder<> FwdLoopBuilder(FwdLoopBB);
- PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
- Value *FwdElement = FwdLoopBuilder.CreateLoad(
- FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
- FwdLoopBuilder.CreateStore(
- FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
- Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
- FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
- FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
- ExitBB, FwdLoopBB);
- FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
- FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
- BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
- ElseTerm->eraseFromParent();
-}
-
-// Lower memset to loop.
-void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr,
- Value *CopyLen, Value *SetValue, LLVMContext &Context,
- Function &F) {
- BasicBlock *OrigBB = ConvertedInst->getParent();
- BasicBlock *NewBB =
- ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
- BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
-
- OrigBB->getTerminator()->setSuccessor(0, LoopBB);
- IRBuilder<> Builder(OrigBB->getTerminator());
-
- // Cast the pointer to the type of the value being stored
- unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
- DstAddr = Builder.CreateBitCast(DstAddr,
- PointerType::get(SetValue->getType(), dstAS));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
- LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
-
- LoopBuilder.CreateStore(
- SetValue,
- LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
- false);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
- NewBB);
-}
-
bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
SmallVector<LoadInst *, 4> AggrLoads;
SmallVector<MemIntrinsic *, 4> MemCalls;
const DataLayout &DL = F.getParent()->getDataLayout();
LLVMContext &Context = F.getParent()->getContext();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// Collect all aggregate loads and mem* calls.
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
@@ -285,15 +108,26 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
Value *SrcAddr = LI->getOperand(0);
Value *DstAddr = SI->getOperand(1);
unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
- Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
-
- convertMemCpyToLoop(/* ConvertedInst */ SI,
- /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
- /* CopyLen */ CopyLen,
- /* SrcIsVolatile */ LI->isVolatile(),
- /* DstIsVolatile */ SI->isVolatile(),
- /* Context */ Context,
- /* Function F */ F);
+ ConstantInt *CopyLen =
+ ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
+
+ if (!TTI.useWideIRMemcpyLoopLowering()) {
+ createMemCpyLoop(/* ConvertedInst */ SI,
+ /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+ /* CopyLen */ CopyLen,
+ /* SrcAlign */ LI->getAlignment(),
+ /* DestAlign */ SI->getAlignment(),
+ /* SrcIsVolatile */ LI->isVolatile(),
+ /* DstIsVolatile */ SI->isVolatile());
+ } else {
+ createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
+ /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+ /* CopyLen */ CopyLen,
+ /* SrcAlign */ LI->getAlignment(),
+ /* DestAlign */ SI->getAlignment(),
+ /* SrcIsVolatile */ LI->isVolatile(),
+ /* DstIsVolatile */ SI->isVolatile(), TTI);
+ }
SI->eraseFromParent();
LI->eraseFromParent();
@@ -302,31 +136,11 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
// Transform mem* intrinsic calls.
for (MemIntrinsic *MemCall : MemCalls) {
if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
- convertMemCpyToLoop(/* ConvertedInst */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ Memcpy->getLength(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
- /* Context */ Context,
- /* Function F */ F);
+ expandMemCpyAsLoop(Memcpy, TTI);
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
- convertMemMoveToLoop(/* ConvertedInst */ Memmove,
- /* SrcAddr */ Memmove->getRawSource(),
- /* DstAddr */ Memmove->getRawDest(),
- /* CopyLen */ Memmove->getLength(),
- /* SrcIsVolatile */ Memmove->isVolatile(),
- /* DstIsVolatile */ Memmove->isVolatile(),
- /* Context */ Context,
- /* Function F */ F);
-
+ expandMemMoveAsLoop(Memmove);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
- convertMemSetToLoop(/* ConvertedInst */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Context */ Context,
- /* Function F */ F);
+ expandMemSetAsLoop(Memset);
}
MemCall->eraseFromParent();
}
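// Editorial sketch (semantics taken from the deleted convertMemCpyToLoop
// above): both the old helper and the new createMemCpyLoop lower a memcpy
// of CopyLen bytes to a byte-wise loop, roughly
//   for (i = 0; i != CopyLen; ++i)
//     DstAddr[i] = SrcAddr[i];
// while createMemCpyLoopKnownSize may use wider loads and stores when the
// useWideIRMemcpyLoopLowering() TTI hook enables it.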
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 3f0c7be..139dc7f 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -90,8 +90,8 @@
//===----------------------------------------------------------------------===//
#include "NVPTX.h"
-#include "NVPTXUtilities.h"
#include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -159,11 +159,12 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
assert(PType && "Expecting pointer type in handleByValParam");
Type *StructType = PType->getElementType();
- AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
+ unsigned AS = Func->getParent()->getDataLayout().getAllocaAddrSpace();
+ AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
// Set the alignment to alignment of the byval parameter. This is because,
// later load/stores assume that alignment, and we are going to replace
// the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
+ AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo()));
Arg->replaceAllUsesWith(AllocA);
Value *ArgInParam = new AddrSpaceCastInst(
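// Editorial note (assumed from the API changes in this hunk): the new
// AllocaInst constructor takes an explicit alloca address space, and
// getParamAlignment() now takes a 0-based argument number instead of the
// old 1-based attribute index, hence Arg->getArgNo() without the "+ 1".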
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
index eab5ee8..86a28f7 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -27,6 +27,13 @@ void NVPTXFloatMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
switch (Kind) {
default: llvm_unreachable("Invalid kind!");
+ case VK_NVPTX_HALF_PREC_FLOAT:
+ // ptxas does not have a way to specify half-precision floats.
+ // Instead we have to print and load fp16 constants as .b16
+ OS << "0x";
+ NumHex = 4;
+ APF.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored);
+ break;
case VK_NVPTX_SINGLE_PREC_FLOAT:
OS << "0f";
NumHex = 8;
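// Illustrative output (values assumed): half-precision 1.0 is printed as
// the raw .b16 pattern "0x3C00", whereas single-precision 1.0 keeps the
// "0f" prefix, e.g. "0f3F800000".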
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
index 7f833c4..95741d9 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -22,8 +22,9 @@ class NVPTXFloatMCExpr : public MCTargetExpr {
public:
enum VariantKind {
VK_NVPTX_None,
- VK_NVPTX_SINGLE_PREC_FLOAT, // FP constant in single-precision
- VK_NVPTX_DOUBLE_PREC_FLOAT // FP constant in double-precision
+ VK_NVPTX_HALF_PREC_FLOAT, // FP constant in half-precision
+ VK_NVPTX_SINGLE_PREC_FLOAT, // FP constant in single-precision
+ VK_NVPTX_DOUBLE_PREC_FLOAT // FP constant in double-precision
};
private:
@@ -40,6 +41,11 @@ public:
static const NVPTXFloatMCExpr *create(VariantKind Kind, const APFloat &Flt,
MCContext &Ctx);
+ static const NVPTXFloatMCExpr *createConstantFPHalf(const APFloat &Flt,
+ MCContext &Ctx) {
+ return create(VK_NVPTX_HALF_PREC_FLOAT, Flt, Ctx);
+ }
+
static const NVPTXFloatMCExpr *createConstantFPSingle(const APFloat &Flt,
MCContext &Ctx) {
return create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
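// Minimal usage sketch (assumed caller with an MCContext &Ctx in scope):
//   const MCExpr *E = NVPTXFloatMCExpr::createConstantFPHalf(
//       APFloat(APFloat::IEEEhalf(), "1.0"), Ctx);
// printImpl() then renders E as a four-digit .b16 hex constant.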
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
index 49e6397..4e902c0 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -36,8 +36,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
@@ -113,7 +113,7 @@ static void CombineCVTAToLocal(MachineInstr &Root) {
BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
Root.getOperand(0).getReg())
.addReg(NVPTX::VRFrameLocal)
- .addOperand(Prev.getOperand(2));
+ .add(Prev.getOperand(2));
MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
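// Editorial note (assumed): MachineInstrBuilder::addOperand(MachineOperand)
// was renamed to add(MachineOperand) upstream; the operand copied from Prev
// and the resulting instruction are unchanged.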
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 6cbf060..8d46694 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -27,12 +27,19 @@ using namespace llvm;
namespace llvm {
std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
- if (RC == &NVPTX::Float32RegsRegClass) {
+ if (RC == &NVPTX::Float32RegsRegClass)
return ".f32";
- }
- if (RC == &NVPTX::Float64RegsRegClass) {
+ if (RC == &NVPTX::Float16RegsRegClass)
+ // Ideally fp16 registers should be .f16, but this syntax is only
+ // supported on sm_53+. On the other hand, .b16 registers are
+ // accepted for all supported fp16 instructions on all GPU
+ // variants, so we can use them instead.
+ return ".b16";
+ if (RC == &NVPTX::Float16x2RegsRegClass)
+ return ".b32";
+ if (RC == &NVPTX::Float64RegsRegClass)
return ".f64";
- } else if (RC == &NVPTX::Int64RegsRegClass) {
+ if (RC == &NVPTX::Int64RegsRegClass)
// We use untyped (.b) integer registers here as NVCC does.
// Correctness of generated code does not depend on register type,
// but using .s/.u registers runs into a ptxas bug that prevents
@@ -52,40 +59,37 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
// add.f16v2 rb32,rb32,rb32; // OK
// add.f16v2 rs32,rs32,rs32; // OK
return ".b64";
- } else if (RC == &NVPTX::Int32RegsRegClass) {
+ if (RC == &NVPTX::Int32RegsRegClass)
return ".b32";
- } else if (RC == &NVPTX::Int16RegsRegClass) {
+ if (RC == &NVPTX::Int16RegsRegClass)
return ".b16";
- } else if (RC == &NVPTX::Int1RegsRegClass) {
+ if (RC == &NVPTX::Int1RegsRegClass)
return ".pred";
- } else if (RC == &NVPTX::SpecialRegsRegClass) {
+ if (RC == &NVPTX::SpecialRegsRegClass)
return "!Special!";
- } else {
- return "INTERNAL";
- }
- return "";
+ return "INTERNAL";
}
std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
- if (RC == &NVPTX::Float32RegsRegClass) {
+ if (RC == &NVPTX::Float32RegsRegClass)
return "%f";
- }
- if (RC == &NVPTX::Float64RegsRegClass) {
+ if (RC == &NVPTX::Float16RegsRegClass)
+ return "%h";
+ if (RC == &NVPTX::Float16x2RegsRegClass)
+ return "%hh";
+ if (RC == &NVPTX::Float64RegsRegClass)
return "%fd";
- } else if (RC == &NVPTX::Int64RegsRegClass) {
+ if (RC == &NVPTX::Int64RegsRegClass)
return "%rd";
- } else if (RC == &NVPTX::Int32RegsRegClass) {
+ if (RC == &NVPTX::Int32RegsRegClass)
return "%r";
- } else if (RC == &NVPTX::Int16RegsRegClass) {
+ if (RC == &NVPTX::Int16RegsRegClass)
return "%rs";
- } else if (RC == &NVPTX::Int1RegsRegClass) {
+ if (RC == &NVPTX::Int1RegsRegClass)
return "%p";
- } else if (RC == &NVPTX::SpecialRegsRegClass) {
+ if (RC == &NVPTX::SpecialRegsRegClass)
return "!Special!";
- } else {
- return "INTERNAL";
- }
- return "";
+ return "INTERNAL";
}
}
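// Illustrative emission (register counts assumed): these two helpers drive
// the virtual-register declarations in the PTX prologue, e.g.
//   .reg .b16 %h<5>;    // Float16Regs
//   .reg .b32 %hh<5>;   // Float16x2Regs
//   .reg .f32 %f<5>;    // Float32Regs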
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
index ff6ccc4..f04764a 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -36,6 +36,8 @@ foreach i = 0-4 in {
def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
def R#i : NVPTXReg<"%r"#i>; // 32-bit
def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
+ def H#i : NVPTXReg<"%h"#i>; // 16-bit float
+ def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float
def F#i : NVPTXReg<"%f"#i>; // 32-bit float
def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
@@ -57,6 +59,8 @@ def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Float16Regs : NVPTXRegClass<[f16], 16, (add (sequence "H%u", 0, 4))>;
+def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>;
def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
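// Editorial sketch (mechanical expansion of the foreach above): the new f16
// entries enumerate
//   def H0  : NVPTXReg<"%h0">;  ...  def H4  : NVPTXReg<"%h4">;
//   def HH0 : NVPTXReg<"%hh0">; ...  def HH4 : NVPTXReg<"%hh4">;
// so Float16Regs carries f16 values and Float16x2Regs carries v2f16 pairs.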
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
index b0472de..d736eaa 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h
@@ -31,7 +31,7 @@ public:
/// Override this as NVPTX has its own way of printing the switch
/// to a section.
- void PrintSwitchToSection(const MCAsmInfo &MAI,
+ void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const override {}
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 6e1f427..acbee86 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -23,6 +23,11 @@ using namespace llvm;
#define GET_SUBTARGETINFO_CTOR
#include "NVPTXGenSubtargetInfo.inc"
+static cl::opt<bool>
+ NoF16Math("nvptx-no-f16-math", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("NVPTX Specific: Disable generation of f16 math ops."),
+ cl::init(false));
+
// Pin the vtable to this file.
void NVPTXSubtarget::anchor() {}
@@ -57,3 +62,7 @@ bool NVPTXSubtarget::hasImageHandles() const {
// Disabled, otherwise
return false;
}
+
+bool NVPTXSubtarget::allowFP16Math() const {
+ return hasFP16Math() && NoF16Math == false;
+}
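// Editorial note (assumed): sm_53 is the first target with fp16 arithmetic
// (see hasFP16Math() in NVPTXSubtarget.h), so allowFP16Math() combines that
// capability check with the -nvptx-no-f16-math escape hatch above, e.g.
// (hypothetical invocation):
//   llc -march=nvptx64 -mcpu=sm_60 -nvptx-no-f16-math foo.ll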
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index da020a9..96618cf 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -101,6 +101,8 @@ public:
inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
inline bool hasROT64() const { return SmVersion >= 20; }
bool hasImageHandles() const;
+ bool hasFP16Math() const { return SmVersion >= 53; }
+ bool allowFP16Math() const;
unsigned int getSmVersion() const { return SmVersion; }
std::string getTargetName() const { return TargetName; }
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index eb357e0..2b6ba8c 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
+#include "NVPTXTargetMachine.h"
#include "NVPTX.h"
#include "NVPTXAllocaHoisting.h"
#include "NVPTXLowerAggrCopies.h"
-#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXTargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
@@ -28,6 +28,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
@@ -50,7 +51,6 @@ void initializeNVVMReflectPass(PassRegistry&);
void initializeGenericToNVVMPass(PassRegistry&);
void initializeNVPTXAllocaHoistingPass(PassRegistry &);
void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
-void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
void initializeNVPTXLowerArgsPass(PassRegistry &);
void initializeNVPTXLowerAllocaPass(PassRegistry &);
@@ -70,7 +70,6 @@ extern "C" void LLVMInitializeNVPTXTarget() {
initializeGenericToNVVMPass(PR);
initializeNVPTXAllocaHoistingPass(PR);
initializeNVPTXAssignValidGlobalNamesPass(PR);
- initializeNVPTXInferAddressSpacesPass(PR);
initializeNVPTXLowerArgsPass(PR);
initializeNVPTXLowerAllocaPass(PR);
initializeNVPTXLowerAggrCopiesPass(PR);
@@ -133,7 +132,7 @@ namespace {
class NVPTXPassConfig : public TargetPassConfig {
public:
- NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
+ NVPTXPassConfig(NVPTXTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
NVPTXTargetMachine &getNVPTXTargetMachine() const {
@@ -164,12 +163,16 @@ private:
} // end anonymous namespace
TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new NVPTXPassConfig(this, PM);
+ return new NVPTXPassConfig(*this, PM);
}
-void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
- PM.add(createNVVMReflectPass());
- PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
+void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+ Builder.addExtension(
+ PassManagerBuilder::EP_EarlyAsPossible,
+ [&](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ PM.add(createNVVMReflectPass());
+ PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
+ });
}
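// Minimal usage sketch (assumed driver code): frontends reach the NVVM
// passes through the PassManagerBuilder instead of a dedicated hook:
//   PassManagerBuilder PMB;
//   TM->adjustPassManager(PMB);          // installs the extension above
//   PMB.populateModulePassManager(MPM);  // runs it at EP_EarlyAsPossible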
TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
@@ -190,7 +193,7 @@ void NVPTXPassConfig::addAddressSpaceInferencePasses() {
// be eliminated by SROA.
addPass(createSROAPass());
addPass(createNVPTXLowerAllocaPass());
- addPass(createNVPTXInferAddressSpacesPass());
+ addPass(createInferAddressSpacesPass());
}
void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 78a0538..2f3981b 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -61,9 +61,13 @@ public:
return TLOF.get();
}
- void addEarlyAsPossiblePasses(PassManagerBase &PM) override;
+ void adjustPassManager(PassManagerBuilder &) override;
+
TargetIRAnalysis getTargetIRAnalysis() override;
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
}; // NVPTXTargetMachine.
class NVPTXTargetMachine32 : public NVPTXTargetMachine {
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index dd77070..a64d955 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -141,9 +141,9 @@ int NVPTXTTIImpl::getArithmeticInstrCost(
}
}
-void NVPTXTTIImpl::getUnrollingPreferences(Loop *L,
+void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
// Enable partial unrolling and runtime unrolling, but reduce the
// threshold. This partially unrolls small loops which are often
diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index b6c271a..f987892 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -45,6 +45,10 @@ public:
bool isSourceOfDivergence(const Value *V);
+ unsigned getFlatAddressSpace() const {
+ return AddressSpace::ADDRESS_SPACE_GENERIC;
+ }
+
// Increase the inlining cost threshold by a factor of 5, reflecting that
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() { return 5; }
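// Editorial note (assumed linkage): advertising ADDRESS_SPACE_GENERIC as the
// flat address space is what lets the target-independent InferAddressSpaces
// pass, scheduled in NVPTXTargetMachine.cpp in place of the removed
// NVPTXInferAddressSpaces, rewrite generic-space accesses into specific ones.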
@@ -57,7 +61,8 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
index 9c71a2e..11277f5 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp
@@ -15,8 +15,8 @@
#include "NVPTX.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
index c639c4d..152b665 100644
--- a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp
@@ -10,11 +10,10 @@
// This pass replaces occurrences of __nvvm_reflect("foo") and llvm.nvvm.reflect
// with an integer.
//
-// We choose the value we use by looking, in this order, at:
-//
-// * the -nvvm-reflect-list flag, which has the format "foo=1,bar=42",
-// * the StringMap passed to the pass's constructor, and
-// * metadata in the module itself.
+// We choose the value we use by looking at metadata in the module itself. Note
+// that we intentionally only have one way to choose these values, because other
+// parts of LLVM (particularly, InstCombineCall) rely on being able to predict
+// the values chosen by this pass.
//
// If we see an unknown string, we replace its call with 0.
//
@@ -49,30 +48,17 @@ namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
namespace {
class NVVMReflect : public FunctionPass {
-private:
- StringMap<int> VarMap;
-
public:
static char ID;
- NVVMReflect() : NVVMReflect(StringMap<int>()) {}
-
- NVVMReflect(const StringMap<int> &Mapping)
- : FunctionPass(ID), VarMap(Mapping) {
+ NVVMReflect() : FunctionPass(ID) {
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
- setVarMap();
}
bool runOnFunction(Function &) override;
-
-private:
- void setVarMap();
};
}
FunctionPass *llvm::createNVVMReflectPass() { return new NVVMReflect(); }
-FunctionPass *llvm::createNVVMReflectPass(const StringMap<int> &Mapping) {
- return new NVVMReflect(Mapping);
-}
static cl::opt<bool>
NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
@@ -83,35 +69,6 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
"Replace occurrences of __nvvm_reflect() calls with 0/1", false,
false)
-static cl::list<std::string>
-ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
- cl::desc("A list of string=num assignments"),
- cl::ValueRequired);
-
-/// The command line can look as follows:
-/// -nvvm-reflect-list a=1,b=2 -nvvm-reflect-list c=3,d=0 -R e=2
-/// The strings "a=1,b=2", "c=3,d=0", "e=2" are available in the
-/// ReflectList vector. First, each ReflectList[i] is split using
-/// "," as the delimiter, then each piece is split using "=" as
-/// the delimiter.
-void NVVMReflect::setVarMap() {
- for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) {
- DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n");
- SmallVector<StringRef, 4> NameValList;
- StringRef(ReflectList[i]).split(NameValList, ',');
- for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) {
- SmallVector<StringRef, 2> NameValPair;
- NameValList[j].split(NameValPair, '=');
- assert(NameValPair.size() == 2 && "name=val expected");
- std::stringstream ValStream(NameValPair[1]);
- int Val;
- ValStream >> Val;
- assert((!(ValStream.fail())) && "integer value expected");
- VarMap[NameValPair[0]] = Val;
- }
- }
-}
-
bool NVVMReflect::runOnFunction(Function &F) {
if (!NVVMReflectEnabled)
return false;
@@ -199,11 +156,10 @@ bool NVVMReflect::runOnFunction(Function &F) {
DEBUG(dbgs() << "Arg of _reflect : " << ReflectArg << "\n");
int ReflectVal = 0; // The default value is 0
- auto Iter = VarMap.find(ReflectArg);
- if (Iter != VarMap.end())
- ReflectVal = Iter->second;
- else if (ReflectArg == "__CUDA_FTZ") {
- // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag.
+ if (ReflectArg == "__CUDA_FTZ") {
+ // Try to pull __CUDA_FTZ from the nvvm-reflect-ftz module flag. Our
+ // choice here must be kept in sync with AutoUpgrade, which uses the same
+ // technique to detect whether ftz is enabled.
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-ftz")))
ReflectVal = Flag->getSExtValue();
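// Illustrative IR (module-flag shape assumed): given
//   !llvm.module.flags = !{!0}
//   !0 = !{i32 4, !"nvvm-reflect-ftz", i32 1}
// a call to __nvvm_reflect("__CUDA_FTZ") folds to the constant 1, while any
// unrecognized reflect string still folds to 0.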
diff --git a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp
new file mode 100644
index 0000000..d913166
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp
@@ -0,0 +1,25 @@
+//===-- Nios2MCTargetDesc.cpp - Nios2 Target Descriptions -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Nios2 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Nios2MCTargetDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_MC_DESC
+#include "Nios2GenInstrInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "Nios2GenRegisterInfo.inc"
+
+extern "C" void LLVMInitializeNios2TargetMC() {}
diff --git a/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
new file mode 100644
index 0000000..d426062
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h
@@ -0,0 +1,34 @@
+//===-- Nios2MCTargetDesc.h - Nios2 Target Descriptions ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Nios2 specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H
+#define LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H
+
+namespace llvm {
+class Target;
+class Triple;
+
+Target &getTheNios2Target();
+
+} // namespace llvm
+
+// Defines symbolic names for Nios2 registers, i.e. a mapping from register
+// name to register number.
+#define GET_REGINFO_ENUM
+#include "Nios2GenRegisterInfo.inc"
+
+// Defines symbolic names for the Nios2 instructions.
+#define GET_INSTRINFO_ENUM
+#include "Nios2GenInstrInfo.inc"
+
+#endif
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2.h b/contrib/llvm/lib/Target/Nios2/Nios2.h
new file mode 100644
index 0000000..87202f4
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/Nios2.h
@@ -0,0 +1,25 @@
+//===-- Nios2.h - Top-level interface for Nios2 representation --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM Nios2 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2_H
+#define LLVM_LIB_TARGET_NIOS2_NIOS2_H
+
+#include "MCTargetDesc/Nios2MCTargetDesc.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class Nios2TargetMachine;
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2.td b/contrib/llvm/lib/Target/Nios2/Nios2.td
new file mode 100644
index 0000000..e8abba8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/Nios2.td
@@ -0,0 +1,29 @@
+//===-- Nios2.td - Describe the Nios2 Target Machine -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Target-dependent interfaces
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "Nios2RegisterInfo.td"
+include "Nios2InstrInfo.td"
+
+def Nios2InstrInfo : InstrInfo;
+
+def Nios2 : Target { let InstructionSet = Nios2InstrInfo; }
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td b/contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td
new file mode 100644
index 0000000..79868be
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/Nios2InstrFormats.td
@@ -0,0 +1,117 @@
+//===-- Nios2InstrFormats.td - Nios2 Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Describe the Nios2 instruction formats
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<3> val> {
+ bits<3> Value = val;
+}
+
+def Pseudo : Format<0>;
+def FrmI : Format<1>;
+def FrmR : Format<2>;
+def FrmJ : Format<3>;
+def FrmOther : Format<4>; // Instruction w/ a custom format
+
+// Generic Nios2 Format
+class Nios2Inst<dag outs, dag ins, string asmstr, list<dag> pattern, Format f>
+ : Instruction {
+ field bits<32> Inst;
+ Format Form = f;
+
+ let Namespace = "Nios2";
+
+ let Size = 4;
+
+ bits<6> Opcode = 0;
+
+ // Bottom 6 bits are the 'opcode' field
+ let Inst{5 - 0} = Opcode;
+
+ let OutOperandList = outs;
+ let InOperandList = ins;
+
+ let AsmString = asmstr;
+ let Pattern = pattern;
+
+ //
+ // Attributes specific to Nios2 instructions:
+ //
+ bits<3> FormBits = Form.Value;
+
+ // TSFlags layout should be kept in sync with Nios2InstrInfo.h.
+ let TSFlags{2 - 0} = FormBits;
+
+ let DecoderNamespace = "Nios2";
+}
+
+// Nios2 Instruction Format
+class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern, Format f>
+ : Nios2Inst<outs, ins, asmstr, pattern, f> {
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Nios2 : <|A|B|immediate|opcode|>
+//===----------------------------------------------------------------------===//
+
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSE<outs, ins, asmstr, pattern, FrmI> {
+ bits<5> rA;
+ bits<5> rB;
+ bits<16> imm;
+
+ let Opcode = op;
+
+ let Inst{31 - 27} = rA;
+ let Inst{26 - 22} = rB;
+ let Inst{21 - 6} = imm;
+}
+
+//===----------------------------------------------------------------------===//
+// Format R instruction : <|A|B|C|opx|imm|opcode|>
+//===----------------------------------------------------------------------===//
+
+class FR<bits<6> opx, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSE<outs, ins, asmstr, pattern, FrmR> {
+ bits<5> rA;
+ bits<5> rB;
+ bits<5> rC;
+ bits<5> imm = 0;
+
+  // The opcode is always 0x3a for R-type instructions.
+ let Opcode = 0x3a;
+
+ let Inst{31 - 27} = rA;
+ let Inst{26 - 22} = rB;
+ let Inst{21 - 17} = rC;
+ // opx stands for opcode extension
+ let Inst{16 - 11} = opx;
+ // optional 5-bit immediate value
+ let Inst{10 - 6} = imm;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Nios2 : <|address|opcode|>
+//===----------------------------------------------------------------------===//
+
+class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSE<outs, ins, asmstr, pattern, FrmJ> {
+ bits<26> addr;
+
+ let Opcode = op;
+
+ let Inst{31 - 6} = addr;
+}
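As a cross-check of the bit layout above, a short C++ sketch of how an
I-format word would be packed; encodeFI is an illustrative name, not part of
the backend:

#include <cstdint>

// |A(5)|B(5)|imm(16)|opcode(6)|, mirroring the FI class fields above.
static uint32_t encodeFI(unsigned RA, unsigned RB, uint16_t Imm, unsigned Op) {
  return (uint32_t(RA & 0x1f) << 27) | (uint32_t(RB & 0x1f) << 22) |
         (uint32_t(Imm) << 6) | (Op & 0x3f);
}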
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td
new file mode 100644
index 0000000..5e4815a
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/Nios2InstrInfo.td
@@ -0,0 +1,50 @@
+//===- Nios2InstrInfo.td - Target Description for Nios2 ------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Nios2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "Nios2InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Nios2 Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+def simm16 : Operand<i32> {
+  let DecoderMethod = "DecodeSimm16";
+}
+
+// An immediate node that fits as a 16-bit sign-extended value on the target,
+// e.g. for addi and andi.
+def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic and logical instructions with 2 register operands.
+class ArithLogicI<bits<6> op, string instr_asm, SDNode OpNode,
+ Operand Od, PatLeaf imm_type, RegisterClass RC> :
+ FI<op, (outs RC:$rB), (ins RC:$rA, Od:$imm16),
+ !strconcat(instr_asm, "\t$rB, $rA, $imm16"),
+ [(set RC:$rB, (OpNode RC:$rA, imm_type:$imm16))]> {
+ let isReMaterializable = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Nios2 R1 Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def ADDi : ArithLogicI<0x04, "addi", add, simm16, immSExt16, CPURegs>;
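The immSExt16 predicate above is simply isInt<16> applied to the constant's
sign-extended value; written out as a plain C++ sketch (fitsSImm16 is an
illustrative name):

#include <cstdint>

// True when Imm can be encoded in the 16-bit signed immediate field of
// instructions such as addi.
static bool fitsSImm16(int64_t Imm) {
  return Imm >= -32768 && Imm <= 32767;
}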
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2RegisterInfo.td b/contrib/llvm/lib/Target/Nios2/Nios2RegisterInfo.td
new file mode 100644
index 0000000..1808815
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/Nios2RegisterInfo.td
@@ -0,0 +1,60 @@
+//===-- Nios2RegisterInfo.td - Nios2 Register defs ---------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// We have a bank of 32 registers.
+class Nios2Reg<string n> : Register<n> {
+ field bits<5> Num;
+ let Namespace = "Nios2";
+}
+
+// Nios2 CPU Registers
+class Nios2GPRReg<bits<5> num, string n> : Nios2Reg<n> {
+ let Num = num;
+}
+
+//===----------------------------------------------------------------------===//
+// Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Nios2" in {
+ // General Purpose Registers
+ def ZERO : Nios2GPRReg<0, "zero">, DwarfRegNum<[ 0 ]>;
+ def AT : Nios2GPRReg<1, "at">, DwarfRegNum<[ 1 ]>;
+ foreach RegNum = 2 - 23 in {
+ def R #RegNum : Nios2GPRReg<RegNum, "r" #RegNum>, DwarfRegNum<[ RegNum ]>;
+ }
+ def ET : Nios2GPRReg<24, "et">, DwarfRegNum<[ 24 ]>;
+ def BT : Nios2GPRReg<25, "bt">, DwarfRegNum<[ 25 ]>;
+ def GP : Nios2GPRReg<26, "gp">, DwarfRegNum<[ 26 ]>;
+ def SP : Nios2GPRReg<27, "sp">, DwarfRegNum<[ 27 ]>;
+ def FP : Nios2GPRReg<28, "fp">, DwarfRegNum<[ 28 ]>;
+ def EA : Nios2GPRReg<29, "ea">, DwarfRegNum<[ 29 ]>;
+ def BA : Nios2GPRReg<30, "ba">, DwarfRegNum<[ 30 ]>;
+ def RA : Nios2GPRReg<31, "ra">, DwarfRegNum<[ 31 ]>;
+ def PC : Nios2Reg<"pc">, DwarfRegNum<[ 32 ]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+def CPURegs : RegisterClass<"Nios2", [ i32 ], 32,
+ (add
+ // Reserved
+ ZERO,
+ AT,
+ // Return Values and Arguments
+ (sequence "R%u", 2, 7),
+ // Not preserved across procedure calls
+ // Caller saved
+ (sequence "R%u", 8, 15),
+ // Callee saved
+ (sequence "R%u", 16, 23),
+ // Reserved
+ ET, BT, GP, SP, FP, EA, BA, RA, PC)>;
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.cpp b/contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.cpp
new file mode 100644
index 0000000..16d4eab
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.cpp
@@ -0,0 +1,46 @@
+//===-- Nios2TargetMachine.cpp - Define TargetMachine for Nios2 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about the Nios2 target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Nios2TargetMachine.h"
+#include "Nios2.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nios2"
+
+extern "C" void LLVMInitializeNios2Target() {
+ // Register the target.
+}
+
+static std::string computeDataLayout(const Triple &TT, StringRef CPU,
+ const TargetOptions &Options) {
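+  // Data layout, piece by piece: "e" = little-endian; "p:32:32:32" = 32-bit
+  // pointers with 32-bit ABI and preferred alignment; "i8:8:32"/"i16:16:32" =
+  // i8/i16 keep their natural ABI alignment but prefer 32-bit alignment;
+  // "n32" = 32 bits is the native integer width.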
+ return "e-p:32:32:32-i8:8:32-i16:16:32-n32";
+}
+
+static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM,
+ Optional<Reloc::Model> RM) {
+ if (!RM.hasValue() || CM == CodeModel::JITDefault)
+ return Reloc::Static;
+ return *RM;
+}
+
+Nios2TargetMachine::Nios2TargetMachine(const Target &T, const Triple &TT,
+ StringRef CPU, StringRef FS,
+ const TargetOptions &Options,
+ Optional<Reloc::Model> RM,
+ CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS,
+ Options, getEffectiveRelocModel(CM, RM), CM, OL) {}
+
+Nios2TargetMachine::~Nios2TargetMachine() {}
diff --git a/contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.h b/contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.h
new file mode 100644
index 0000000..7f145c8
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/Nios2TargetMachine.h
@@ -0,0 +1,30 @@
+//===-- Nios2TargetMachine.h - Define TargetMachine for Nios2 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Nios2 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H
+#define LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class Nios2TargetMachine : public LLVMTargetMachine {
+public:
+ Nios2TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+ StringRef FS, const TargetOptions &Options,
+ Optional<Reloc::Model> RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL);
+ ~Nios2TargetMachine() override;
+};
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp b/contrib/llvm/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp
new file mode 100644
index 0000000..e317686
--- /dev/null
+++ b/contrib/llvm/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp
@@ -0,0 +1,24 @@
+//===-- Nios2TargetInfo.cpp - Nios2 Target Implementation -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Nios2.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+Target &llvm::getTheNios2Target() {
+ static Target TheNios2Target;
+ return TheNios2Target;
+}
+
+extern "C" void LLVMInitializeNios2TargetInfo() {
+ RegisterTarget<Triple::nios2,
+ /*HasJIT=*/true>
+ X(getTheNios2Target(), "nios2", "Nios2");
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 12ffbfd..11d2237 100644
--- a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -204,6 +204,17 @@ static const unsigned G8Regs[] = {
PPC::X28, PPC::X29, PPC::X30, PPC::X31
};
+static const unsigned G80Regs[] = {
+ PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3,
+ PPC::X4, PPC::X5, PPC::X6, PPC::X7,
+ PPC::X8, PPC::X9, PPC::X10, PPC::X11,
+ PPC::X12, PPC::X13, PPC::X14, PPC::X15,
+ PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+ PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+ PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+ PPC::X28, PPC::X29, PPC::X30, PPC::X31
+};
+
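+// G80Regs matches G8Regs except at index 0, where ZERO8 stands in for X0;
+// the G8RC_NOX0 decoder below uses it for operands (typically memory base
+// registers) where an encoding of 0 denotes the constant zero rather than r0.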
static const unsigned QFRegs[] = {
PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3,
PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7,
@@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, G8Regs);
}
+static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, G80Regs);
+}
+
#define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass
#define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass
diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 609d959..baf5902 100644
--- a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -12,9 +12,9 @@
//===----------------------------------------------------------------------===//
#include "PPCInstPrinter.h"
-#include "PPCInstrInfo.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPCInstrInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
return;
}
- if (MI->getOpcode() == PPC::RLDICR) {
+ if (MI->getOpcode() == PPC::RLDICR ||
+ MI->getOpcode() == PPC::RLDICR_32) {
unsigned char SH = MI->getOperand(2).getImm();
unsigned char ME = MI->getOperand(3).getImm();
// rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
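The alias condition the comment states, written out as a sketch (printsAsSLDI
is an illustrative name; the printer performs this check before emitting the
sldi form):

// rldicr clears the low bits after the rotate, so it is a pure left shift
// exactly when the mask-end operand satisfies ME == 63 - SH.
static bool printsAsSLDI(unsigned SH, unsigned ME) { return ME == 63u - SH; }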
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5847b3a..bdad2fe 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -7,8 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCFixupKinds.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -18,9 +20,7 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -113,8 +113,9 @@ public:
return (IsLittleEndian? InfosLE : InfosBE)[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
@@ -130,12 +131,11 @@ public:
}
}
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
switch ((PPC::Fixups)Fixup.getKind()) {
- default: break;
+ default:
+ return false;
case PPC::fixup_ppc_br24:
case PPC::fixup_ppc_br24abs:
// If the target symbol has a local entry point we must not attempt
@@ -148,10 +148,10 @@ public:
// and thus the shift to pack it.
unsigned Other = S->getOther() << 2;
if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0)
- IsResolved = false;
+ return true;
}
}
- break;
+ return false;
}
}
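A minimal sketch of the st_other test used above; hasLocalEntry is an
illustrative name:

#include "llvm/BinaryFormat/ELF.h"

// The ELFv2 ABI encodes the distance between a function's global and local
// entry points in the top three bits of st_other; MCSymbolELF::getOther()
// returns the field shifted down by two, hence the "<< 2" in the code above.
static bool hasLocalEntry(unsigned StOther) {
  return (StOther & llvm::ELF::STO_PPC64_LOCAL_MASK) != 0;
}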
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index fd279c6..1488bd5 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCFixupKinds.h"
#include "MCTargetDesc/PPCMCExpr.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index ae43e59d..dce4439 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -17,35 +17,31 @@
namespace llvm {
namespace PPC {
enum Fixups {
- // fixup_ppc_br24 - 24-bit PC relative relocation for direct branches like 'b'
- // and 'bl'.
+ // 24-bit PC relative relocation for direct branches like 'b' and 'bl'.
fixup_ppc_br24 = FirstTargetFixupKind,
-
- /// fixup_ppc_brcond14 - 14-bit PC relative relocation for conditional
- /// branches.
+
+ /// 14-bit PC relative relocation for conditional branches.
fixup_ppc_brcond14,
-
- /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches
- /// like 'ba' and 'bla'.
+
+ /// 24-bit absolute relocation for direct branches like 'ba' and 'bla'.
fixup_ppc_br24abs,
- /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional
- /// branches.
+ /// 14-bit absolute relocation for conditional branches.
fixup_ppc_brcond14abs,
- /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo)
- /// or ha16(_foo) for instrs like 'li' or 'addis'.
+ /// A 16-bit fixup corresponding to lo16(_foo) or ha16(_foo) for instrs like
+ /// 'li' or 'addis'.
fixup_ppc_half16,
-
- /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with
- /// implied 2 zero bits for instrs like 'std'.
+
+ /// A 14-bit fixup corresponding to lo16(_foo) with implied 2 zero bits for
+ /// instrs like 'std'.
fixup_ppc_half16ds,
- /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call
- /// to __tls_get_addr for the TLS general and local dynamic models,
- /// or inserts the thread-pointer register number.
+ /// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
+ /// TLS general and local dynamic models, or inserts the thread-pointer
+ /// register number.
fixup_ppc_nofixup,
-
+
// Marker
LastTargetFixupKind,
NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
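For context, a sketch of how one of these kinds is attached during encoding,
in the style of PPCMCCodeEmitter; addBr24Fixup is an illustrative wrapper:

#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCFixup.h"

static void addBr24Fixup(llvm::SmallVectorImpl<llvm::MCFixup> &Fixups,
                         const llvm::MCExpr *Expr) {
  // Offset 0: the fixup patches bits inside the instruction word itself.
  Fixups.push_back(llvm::MCFixup::create(
      0, Expr, (llvm::MCFixupKind)llvm::PPC::fixup_ppc_br24));
}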
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index d8fab5b..d30bf1a 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -20,7 +20,7 @@ void PPCMCAsmInfoDarwin::anchor() { }
PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
if (is64Bit) {
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
}
IsLittleEndian = false;
@@ -50,7 +50,7 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
NeedsLocalForSize = true;
if (is64Bit) {
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
}
IsLittleEndian = T.getArch() == Triple::ppc64le;
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 017d21a..92c8c22 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -11,22 +11,28 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
-#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCFixupKinds.h"
+#include "PPCInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOpcodes.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
@@ -34,10 +40,8 @@ using namespace llvm;
STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
-class PPCMCCodeEmitter : public MCCodeEmitter {
- PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
- void operator=(const PPCMCCodeEmitter &) = delete;
+class PPCMCCodeEmitter : public MCCodeEmitter {
const MCInstrInfo &MCII;
const MCContext &CTX;
bool IsLittleEndian;
@@ -46,8 +50,9 @@ public:
PPCMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
: MCII(mcii), CTX(ctx),
IsLittleEndian(ctx.getAsmInfo()->isLittleEndian()) {}
-
- ~PPCMCCodeEmitter() override {}
+ PPCMCCodeEmitter(const PPCMCCodeEmitter &) = delete;
+ void operator=(const PPCMCCodeEmitter &) = delete;
+ ~PPCMCCodeEmitter() override = default;
unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -103,6 +108,7 @@ public:
uint64_t getBinaryCodeForInstr(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override {
@@ -137,7 +143,7 @@ public:
}
break;
default:
- llvm_unreachable ("Invalid instruction size");
+ llvm_unreachable("Invalid instruction size");
}
++MCNumEmitted; // Keep track of the # of mi's emitted.
@@ -238,7 +244,6 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
return RegBits;
}
-
unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -266,7 +271,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12;
const MCOperand &MO = MI.getOperand(OpNo);
- assert(MO.isImm());
+ assert(MO.isImm() && !(MO.getImm() % 16) &&
+ "Expecting an immediate that is a multiple of 16");
return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits;
}
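The encoding rule enforced by the strengthened assert, as a standalone sketch
(encodeDQDisp is an illustrative name):

#include <cassert>
#include <cstdint>

// DQ-form displacements are 16-byte aligned, so the value is scaled down by
// 16 and only 12 bits of it are stored in the instruction.
static unsigned encodeDQDisp(int64_t Imm) {
  assert((Imm & 15) == 0 && "DQ-form displacement must be a multiple of 16");
  return unsigned(Imm >> 4) & 0xFFF;
}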
@@ -286,7 +292,6 @@ unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
return reverseBits(Imm | RegBits) >> 22;
}
-
unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI)
@@ -302,7 +307,6 @@ unsigned PPCMCCodeEmitter::getSPE4DisEncoding(const MCInst &MI, unsigned OpNo,
return reverseBits(Imm | RegBits) >> 22;
}
-
unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI)
@@ -318,7 +322,6 @@ unsigned PPCMCCodeEmitter::getSPE2DisEncoding(const MCInst &MI, unsigned OpNo,
return reverseBits(Imm | RegBits) >> 22;
}
-
unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -383,7 +386,5 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
return MO.getImm();
}
-
-
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "PPCGenMCCodeEmitter.inc"
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index 6b97d4c..54f6643 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCFixupKinds.h"
#include "PPCMCExpr.h"
+#include "PPCFixupKinds.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index bbd10e5..e8f220e 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -11,23 +11,30 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "InstPrinter/PPCInstPrinter.h"
-#include "PPCMCAsmInfo.h"
+#include "MCTargetDesc/PPCMCAsmInfo.h"
#include "PPCTargetStreamer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -41,9 +48,10 @@ using namespace llvm;
#include "PPCGenRegisterInfo.inc"
// Pin the vtable to this file.
-PPCTargetStreamer::~PPCTargetStreamer() {}
PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+PPCTargetStreamer::~PPCTargetStreamer() = default;
+
static MCInstrInfo *createPPCMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitPPCMCInstrInfo(X);
@@ -96,12 +104,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
}
namespace {
+
class PPCTargetAsmStreamer : public PPCTargetStreamer {
formatted_raw_ostream &OS;
public:
PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
: PPCTargetStreamer(S), OS(OS) {}
+
void emitTCEntry(const MCSymbol &S) override {
OS << "\t.tc ";
OS << S.getName();
@@ -109,12 +119,15 @@ public:
OS << S.getName();
OS << '\n';
}
+
void emitMachine(StringRef CPU) override {
OS << "\t.machine " << CPU << '\n';
}
+
void emitAbiVersion(int AbiVersion) override {
OS << "\t.abiversion " << AbiVersion << '\n';
}
+
void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
@@ -129,18 +142,22 @@ public:
class PPCTargetELFStreamer : public PPCTargetStreamer {
public:
PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+
MCELFStreamer &getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
}
+
void emitTCEntry(const MCSymbol &S) override {
// Creates a R_PPC64_TOC relocation
Streamer.EmitValueToAlignment(8);
Streamer.EmitSymbolValue(&S, 8);
}
+
void emitMachine(StringRef CPU) override {
// FIXME: Is there anything to do in here or does this directive only
// limit the parser?
}
+
void emitAbiVersion(int AbiVersion) override {
MCAssembler &MCA = getStreamer().getAssembler();
unsigned Flags = MCA.getELFHeaderEFlags();
@@ -148,6 +165,7 @@ public:
Flags |= (AbiVersion & ELF::EF_PPC64_ABI);
MCA.setELFHeaderEFlags(Flags);
}
+
void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
MCAssembler &MCA = getStreamer().getAssembler();
@@ -170,6 +188,7 @@ public:
if ((Flags & ELF::EF_PPC64_ABI) == 0)
MCA.setELFHeaderEFlags(Flags | 2);
}
+
void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
auto *Symbol = cast<MCSymbolELF>(S);
// When encoding an assignment to set symbol A to symbol B, also copy
@@ -188,21 +207,26 @@ public:
class PPCTargetMachOStreamer : public PPCTargetStreamer {
public:
PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
+
void emitTCEntry(const MCSymbol &S) override {
llvm_unreachable("Unknown pseudo-op: .tc");
}
+
void emitMachine(StringRef CPU) override {
// FIXME: We should update the CPUType, CPUSubType in the Object file if
// the new values are different from the defaults.
}
+
void emitAbiVersion(int AbiVersion) override {
llvm_unreachable("Unknown pseudo-op: .abiversion");
}
+
void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
llvm_unreachable("Unknown pseudo-op: .localentry");
}
};
-}
+
+} // end anonymous namespace
static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 0989e0c..893233e 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -17,23 +17,22 @@
// GCC #defines PPC on Linux but we use it as our namespace name
#undef PPC
-#include "llvm/Support/DataTypes.h"
#include "llvm/Support/MathExtras.h"
+#include <cstdint>
namespace llvm {
+
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
class MCObjectWriter;
class MCRegisterInfo;
-class MCSubtargetInfo;
class MCTargetOptions;
class Target;
class Triple;
class StringRef;
class raw_pwrite_stream;
-class raw_ostream;
Target &getThePPC32Target();
Target &getThePPC64Target();
@@ -83,7 +82,7 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
return false;
}
-} // End llvm namespace
+} // end namespace llvm
// Generated files will use "namespace PPC". To avoid symbol clash,
// undefine PPC here. PPC may be predefined on some hosts.
@@ -103,4 +102,4 @@ static inline bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
#define GET_SUBTARGETINFO_ENUM
#include "PPCGenSubtargetInfo.inc"
-#endif
+#endif // LLVM_LIB_TARGET_POWERPC_MCTARGETDESC_PPCMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index 1f38a8c..d550627 100644
--- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -7,9 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCFixupKinds.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -18,7 +19,6 @@
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/MachO.h"
using namespace llvm;
@@ -151,7 +151,7 @@ static void makeRelocationInfo(MachO::any_relocation_info &MRE,
// The bitfield offsets that work (as determined by trial-and-error)
// are different than what is documented in the mach-o manuals.
// This appears to be an endianness issue; reversing the order of the
- // documented bitfields in <llvm/Support/MachO.h> fixes this (but
+ // documented bitfields in <llvm/BinaryFormat/MachO.h> fixes this (but
// breaks x86/ARM assembly).
MRE.r_word1 = ((Index << 8) | // was << 0
(IsPCRel << 7) | // was << 24
@@ -219,11 +219,11 @@ bool PPCMachObjectWriter::recordScatteredRelocation(
const MCSymbol *SB = &B->getSymbol();
if (!SB->getFragment())
- report_fatal_error("symbol '" + B->getSymbol().getName() +
+ report_fatal_error("symbol '" + SB->getName() +
"' can not be undefined in a subtraction expression");
- // FIXME: is Type correct? see include/llvm/Support/MachO.h
- Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ // FIXME: is Type correct? see include/llvm/BinaryFormat/MachO.h
+ Value2 = Writer->getSymbolAddress(*SB, Layout);
FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
}
// FIXME: does FixedValue get used??
diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h
index e01f49d..ad92ac8 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPC.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPC.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPC_H
#define LLVM_LIB_TARGET_POWERPC_PPC_H
+#include "llvm/Support/CodeGen.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
// GCC #defines PPC on Linux but we use it as our namespace name
@@ -24,12 +25,11 @@ namespace llvm {
class PPCTargetMachine;
class PassRegistry;
class FunctionPass;
- class ImmutablePass;
class MachineInstr;
class AsmPrinter;
class MCInst;
- FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
+ FunctionPass *createPPCCTRLoops();
#ifndef NDEBUG
FunctionPass *createPPCCTRLoopsVerify();
#endif
@@ -42,14 +42,17 @@ namespace llvm {
FunctionPass *createPPCMIPeepholePass();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCQPXLoadSplatPass();
- FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+ FunctionPass *createPPCISelDag(PPCTargetMachine &TM, CodeGenOpt::Level OL);
FunctionPass *createPPCTLSDynamicCallPass();
FunctionPass *createPPCBoolRetToIntPass();
+ FunctionPass *createPPCExpandISELPass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP, bool isDarwin);
void initializePPCVSXFMAMutatePass(PassRegistry&);
void initializePPCBoolRetToIntPass(PassRegistry&);
+ void initializePPCExpandISELPass(PassRegistry &);
+ void initializePPCTLSDynamicCallPass(PassRegistry &);
extern char &PPCVSXFMAMutateID;
namespace PPCII {
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f0e0ebc..841b8c5 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -16,11 +16,11 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
-#include "PPCInstrInfo.h"
#include "InstPrinter/PPCInstPrinter.h"
#include "MCTargetDesc/PPCMCExpr.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
@@ -29,6 +29,8 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -55,11 +57,9 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
@@ -112,7 +112,9 @@ public:
void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
bool runOnMachineFunction(MachineFunction &MF) override {
Subtarget = &MF.getSubtarget<PPCSubtarget>();
- return AsmPrinter::runOnMachineFunction(MF);
+ bool Changed = AsmPrinter::runOnMachineFunction(MF);
+ emitXRayTable();
+ return Changed;
}
};
@@ -134,6 +136,7 @@ public:
void EmitFunctionBodyStart() override;
void EmitFunctionBodyEnd() override;
+ void EmitInstruction(const MachineInstr *MI) override;
};
/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
@@ -402,7 +405,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
.addImm(CallTarget & 0xFFFF));
// Save the current TOC pointer before the remote call.
- int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+ int TOCSaveOffset = Subtarget->getFrameLowering()->getTOCSaveOffset();
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
.addReg(PPC::X2)
.addImm(TOCSaveOffset)
@@ -1046,6 +1049,97 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}
+void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ if (!Subtarget->isPPC64())
+ return PPCAsmPrinter::EmitInstruction(MI);
+
+ switch (MI->getOpcode()) {
+ default:
+ return PPCAsmPrinter::EmitInstruction(MI);
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
+ // .begin:
+    //   b .end # lis 0, FuncId[16..31]
+ // nop # li 0, FuncId[0..15]
+ // std 0, -8(1)
+ // mflr 0
+ // bl __xray_FunctionEntry
+ // mtlr 0
+ // .end:
+ //
+    // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when the
+    // number of instructions changes.
+ MCSymbol *BeginOfSled = OutContext.createTempSymbol();
+ MCSymbol *EndOfSled = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(BeginOfSled);
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::B).addExpr(
+ MCSymbolRefExpr::create(EndOfSled, OutContext)));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::BL8_NOP)
+ .addExpr(MCSymbolRefExpr::create(
+ OutContext.getOrCreateSymbol("__xray_FunctionEntry"),
+ OutContext)));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
+ OutStreamer->EmitLabel(EndOfSled);
+ recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER);
+ break;
+ }
+ case TargetOpcode::PATCHABLE_FUNCTION_EXIT: {
+ // .p2align 3
+ // .begin:
+    //   b(lr)? # lis 0, FuncId[16..31]
+ // nop # li 0, FuncId[0..15]
+ // std 0, -8(1)
+ // mflr 0
+ // bl __xray_FunctionExit
+ // mtlr 0
+ // .end:
+ // b(lr)?
+ //
+    // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when the
+    // number of instructions changes.
+ const MachineInstr *Next = [&] {
+ MachineBasicBlock::const_iterator It(MI);
+ assert(It != MI->getParent()->end());
+ ++It;
+ assert(It->isReturn());
+ return &*It;
+ }();
+ OutStreamer->EmitCodeAlignment(8);
+ MCSymbol *BeginOfSled = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(BeginOfSled);
+ MCInst TmpInst;
+ LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false);
+ EmitToStreamer(*OutStreamer, TmpInst);
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+ EmitToStreamer(
+ *OutStreamer,
+ MCInstBuilder(PPC::STD).addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
+ EmitToStreamer(*OutStreamer,
+ MCInstBuilder(PPC::BL8_NOP)
+ .addExpr(MCSymbolRefExpr::create(
+ OutContext.getOrCreateSymbol("__xray_FunctionExit"),
+ OutContext)));
+ EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
+ recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT);
+ break;
+ }
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ case TargetOpcode::PATCHABLE_RET:
+ // PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really
+ // lower to a PPC::B instruction. The PPC::B instruction is generated
+ // before it, and handled by the normal case.
+ llvm_unreachable("Tail call is handled in the normal case. See comments"
+ "around this assert.");
+ }
+}
+
void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) {
PPCTargetStreamer *TS =
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index 93c201d..55e105d 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -7,15 +7,15 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements converting i1 values to i32 if they could be more
+// This file implements converting i1 values to i32/i64 if they could be more
// profitably allocated as GPRs rather than CRs. This pass will become totally
// unnecessary if Register Bank Allocation and Global Instruction Selection ever
// go upstream.
//
-// Presently, the pass converts i1 Constants, and Arguments to i32 if the
+// Presently, the pass converts i1 Constants, and Arguments to i32/i64 if the
// transitive closure of their uses includes only PHINodes, CallInsts, and
// ReturnInsts. The rationale is that arguments are generally passed and returned
-// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will
+// in GPRs rather than CRs, so casting them to i32/i64 at the LLVM IR level will
// actually save casts at the Machine Instruction level.
//
// It might be useful to expand this pass to add bit-wise operations to the list
@@ -33,11 +33,12 @@
//===----------------------------------------------------------------------===//
#include "PPC.h"
+#include "PPCTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
@@ -50,8 +51,9 @@
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
#include "llvm/Pass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Casting.h"
#include <cassert>
using namespace llvm;
@@ -87,17 +89,19 @@ class PPCBoolRetToInt : public FunctionPass {
return Defs;
}
- // Translate a i1 value to an equivalent i32 value:
- static Value *translate(Value *V) {
- Type *Int32Ty = Type::getInt32Ty(V->getContext());
+ // Translate a i1 value to an equivalent i32/i64 value:
+ Value *translate(Value *V) {
+ Type *IntTy = ST->isPPC64() ? Type::getInt64Ty(V->getContext())
+ : Type::getInt32Ty(V->getContext());
+
if (auto *C = dyn_cast<Constant>(V))
- return ConstantExpr::getZExt(C, Int32Ty);
+ return ConstantExpr::getZExt(C, IntTy);
if (auto *P = dyn_cast<PHINode>(V)) {
// Temporarily set the operands to 0. We'll fix this later in
// runOnUse.
- Value *Zero = Constant::getNullValue(Int32Ty);
+ Value *Zero = Constant::getNullValue(IntTy);
PHINode *Q =
- PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P);
+ PHINode::Create(IntTy, P->getNumIncomingValues(), P->getName(), P);
for (unsigned i = 0; i < P->getNumOperands(); ++i)
Q->addIncoming(Zero, P->getIncomingBlock(i));
return Q;
@@ -109,7 +113,7 @@ class PPCBoolRetToInt : public FunctionPass {
auto InstPt =
A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode();
- return new ZExtInst(V, Int32Ty, "", InstPt);
+ return new ZExtInst(V, IntTy, "", InstPt);
}
typedef SmallPtrSet<const PHINode *, 8> PHINodeSet;
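Roughly what translate() produces for a plain instruction operand on PPC64,
sketched with IRBuilder; zextToGPR is an illustrative name and assumes the
value is not a block terminator:

#include "llvm/IR/IRBuilder.h"

// Materialize the i1 value as an i64 zero-extension placed immediately after
// the defining instruction, mirroring the ZExtInst created above.
static llvm::Value *zextToGPR(llvm::Instruction *I) {
  llvm::IRBuilder<> B(I->getNextNode()); // insert point: right after I
  return B.CreateZExt(I, B.getInt64Ty());
}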
@@ -185,6 +189,13 @@ class PPCBoolRetToInt : public FunctionPass {
if (skipFunction(F))
return false;
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<PPCTargetMachine>();
+ ST = TM.getSubtargetImpl(F);
+
PHINodeSet PromotablePHINodes = getPromotablePHINodes(F);
B2IMap Bool2IntMap;
bool Changed = false;
@@ -205,7 +216,7 @@ class PPCBoolRetToInt : public FunctionPass {
return Changed;
}
- static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes,
+ bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes,
B2IMap &BoolToIntMap) {
auto Defs = findAllDefs(U);
@@ -262,13 +273,16 @@ class PPCBoolRetToInt : public FunctionPass {
AU.addPreserved<DominatorTreeWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
+
+private:
+ const PPCSubtarget *ST;
};
} // end anonymous namespace
char PPCBoolRetToInt::ID = 0;
INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int",
- "Convert i1 constants to i32 if they are returned",
+ "Convert i1 constants to i32/i64 if they are returned",
false, false)
FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
index ae76386..d0b66f9 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -15,8 +15,8 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
#include "PPCSubtarget.h"
@@ -78,7 +78,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
BlockSizes.resize(Fn.getNumBlockIDs());
auto GetAlignmentAdjustment =
- [TII](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
+ [](MachineBasicBlock &MBB, unsigned Offset) -> unsigned {
unsigned Align = MBB.getAlignment();
if (!Align)
return 0;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
index 2c62a0f..53f33ac 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -23,14 +23,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "PPC.h"
+#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
@@ -43,6 +44,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -81,10 +83,7 @@ namespace {
public:
static char ID;
- PPCCTRLoops() : FunctionPass(ID), TM(nullptr) {
- initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
- }
- PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
+ PPCCTRLoops() : FunctionPass(ID) {
initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
}
@@ -99,16 +98,18 @@ namespace {
}
private:
- bool mightUseCTR(const Triple &TT, BasicBlock *BB);
+ bool mightUseCTR(BasicBlock *BB);
bool convertToCTRLoop(Loop *L);
private:
- PPCTargetMachine *TM;
+ const PPCTargetMachine *TM;
+ const PPCSubtarget *STI;
+ const PPCTargetLowering *TLI;
+ const DataLayout *DL;
+ const TargetLibraryInfo *LibInfo;
LoopInfo *LI;
ScalarEvolution *SE;
- const DataLayout *DL;
DominatorTree *DT;
- const TargetLibraryInfo *LibInfo;
bool PreserveLCSSA;
};
@@ -149,9 +150,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
-FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
- return new PPCCTRLoops(TM);
-}
+FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
#ifndef NDEBUG
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
@@ -169,6 +168,14 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ TM = &TPC->getTM<PPCTargetMachine>();
+ STI = TM->getSubtargetImpl(F);
+ TLI = STI->getTargetLowering();
+
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
@@ -198,8 +205,7 @@ static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
// Determining the address of a TLS variable results in a function call in
// certain TLS models.
-static bool memAddrUsesCTR(const PPCTargetMachine *TM,
- const Value *MemAddr) {
+static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
const auto *GV = dyn_cast<GlobalValue>(MemAddr);
if (!GV) {
// Recurse to check for constants that refer to TLS global variables.
@@ -213,35 +219,35 @@ static bool memAddrUsesCTR(const PPCTargetMachine *TM,
if (!GV->isThreadLocal())
return false;
- if (!TM)
- return true;
- TLSModel::Model Model = TM->getTLSModel(GV);
+ TLSModel::Model Model = TM.getTLSModel(GV);
return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
}
-bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
+// Loop through the inline asm constraints and look for something that clobbers
+// ctr.
+static bool asmClobbersCTR(InlineAsm *IA) {
+ InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+ for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+ InlineAsm::ConstraintInfo &C = CIV[i];
+ if (C.Type != InlineAsm::isInput)
+ for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+ if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+ return true;
+ }
+ return false;
+}
+
+bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
J != JE; ++J) {
if (CallInst *CI = dyn_cast<CallInst>(J)) {
+ // Inline ASM is okay, unless it clobbers the ctr register.
if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
- // Inline ASM is okay, unless it clobbers the ctr register.
- InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
- for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
- InlineAsm::ConstraintInfo &C = CIV[i];
- if (C.Type != InlineAsm::isInput)
- for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
- if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
- return true;
- }
-
+ if (asmClobbersCTR(IA))
+ return true;
continue;
}
- if (!TM)
- return true;
- const TargetLowering *TLI =
- TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
-
if (Function *F = CI->getCalledFunction()) {
// Most intrinsics don't become function calls, but some might.
// sin, cos, exp and log are always calls.
@@ -298,15 +304,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
return true;
else
continue; // ISD::FCOPYSIGN is never a library call.
- case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
- case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
- case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
- case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
- case Intrinsic::rint: Opcode = ISD::FRINT; break;
- case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
- case Intrinsic::round: Opcode = ISD::FROUND; break;
- case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
- case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
+ case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
+ case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
+ case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
}
}
@@ -315,7 +323,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
// (i.e. soft float or atomics). If adapting for targets that do,
// additional care is required here.
- LibFunc::Func Func;
+ LibFunc Func;
if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
LibInfo->getLibFunc(F->getName(), Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
@@ -329,58 +337,57 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
switch (Func) {
default: return true;
- case LibFunc::copysign:
- case LibFunc::copysignf:
+ case LibFunc_copysign:
+ case LibFunc_copysignf:
continue; // ISD::FCOPYSIGN is never a library call.
- case LibFunc::copysignl:
+ case LibFunc_copysignl:
return true;
- case LibFunc::fabs:
- case LibFunc::fabsf:
- case LibFunc::fabsl:
+ case LibFunc_fabs:
+ case LibFunc_fabsf:
+ case LibFunc_fabsl:
continue; // ISD::FABS is never a library call.
- case LibFunc::sqrt:
- case LibFunc::sqrtf:
- case LibFunc::sqrtl:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtf:
+ case LibFunc_sqrtl:
Opcode = ISD::FSQRT; break;
- case LibFunc::floor:
- case LibFunc::floorf:
- case LibFunc::floorl:
+ case LibFunc_floor:
+ case LibFunc_floorf:
+ case LibFunc_floorl:
Opcode = ISD::FFLOOR; break;
- case LibFunc::nearbyint:
- case LibFunc::nearbyintf:
- case LibFunc::nearbyintl:
+ case LibFunc_nearbyint:
+ case LibFunc_nearbyintf:
+ case LibFunc_nearbyintl:
Opcode = ISD::FNEARBYINT; break;
- case LibFunc::ceil:
- case LibFunc::ceilf:
- case LibFunc::ceill:
+ case LibFunc_ceil:
+ case LibFunc_ceilf:
+ case LibFunc_ceill:
Opcode = ISD::FCEIL; break;
- case LibFunc::rint:
- case LibFunc::rintf:
- case LibFunc::rintl:
+ case LibFunc_rint:
+ case LibFunc_rintf:
+ case LibFunc_rintl:
Opcode = ISD::FRINT; break;
- case LibFunc::round:
- case LibFunc::roundf:
- case LibFunc::roundl:
+ case LibFunc_round:
+ case LibFunc_roundf:
+ case LibFunc_roundl:
Opcode = ISD::FROUND; break;
- case LibFunc::trunc:
- case LibFunc::truncf:
- case LibFunc::truncl:
+ case LibFunc_trunc:
+ case LibFunc_truncf:
+ case LibFunc_truncl:
Opcode = ISD::FTRUNC; break;
- case LibFunc::fmin:
- case LibFunc::fminf:
- case LibFunc::fminl:
+ case LibFunc_fmin:
+ case LibFunc_fminf:
+ case LibFunc_fminl:
Opcode = ISD::FMINNUM; break;
- case LibFunc::fmax:
- case LibFunc::fmaxf:
- case LibFunc::fmaxl:
+ case LibFunc_fmax:
+ case LibFunc_fmaxf:
+ case LibFunc_fmaxl:
Opcode = ISD::FMAXNUM; break;
}
}
if (Opcode) {
- auto &DL = CI->getModule()->getDataLayout();
- MVT VTy = TLI->getSimpleValueType(DL, CI->getArgOperand(0)->getType(),
- true);
+ MVT VTy = TLI->getSimpleValueType(
+ *DL, CI->getArgOperand(0)->getType(), true);
if (VTy == MVT::Other)
return true;
@@ -404,17 +411,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
CastInst *CI = cast<CastInst>(J);
if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
- isLargeIntegerTy(TT.isArch32Bit(), CI->getSrcTy()->getScalarType()) ||
- isLargeIntegerTy(TT.isArch32Bit(), CI->getDestTy()->getScalarType()))
+ isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
+ isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
return true;
- } else if (isLargeIntegerTy(TT.isArch32Bit(),
+ } else if (isLargeIntegerTy(!TM->isPPC64(),
J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::UDiv ||
J->getOpcode() == Instruction::SDiv ||
J->getOpcode() == Instruction::URem ||
J->getOpcode() == Instruction::SRem)) {
return true;
- } else if (TT.isArch32Bit() &&
+ } else if (!TM->isPPC64() &&
isLargeIntegerTy(false, J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::Shl ||
J->getOpcode() == Instruction::AShr ||
@@ -426,16 +433,11 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
// On PowerPC, indirect jumps use the counter register.
return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
- if (!TM)
- return true;
- const TargetLowering *TLI =
- TM->getSubtargetImpl(*BB->getParent())->getTargetLowering();
-
if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
return true;
}
- if (TM->getSubtargetImpl(*BB->getParent())->getTargetLowering()->useSoftFloat()) {
+ if (STI->useSoftFloat()) {
switch(J->getOpcode()) {
case Instruction::FAdd:
case Instruction::FSub:
@@ -454,7 +456,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
}
for (Value *Operand : J->operands())
- if (memAddrUsesCTR(TM, Operand))
+ if (memAddrUsesCTR(*TM, Operand))
return true;
}
@@ -464,11 +466,6 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
bool MadeChange = false;
- const Triple TT =
- Triple(L->getHeader()->getParent()->getParent()->getTargetTriple());
- if (!TT.isArch32Bit() && !TT.isArch64Bit())
- return MadeChange; // Unknown arch. type.
-
// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);
@@ -493,7 +490,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// want to use the counter register if the loop contains calls.
for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
I != IE; ++I)
- if (mightUseCTR(TT, *I))
+ if (mightUseCTR(*I))
return MadeChange;
SmallVector<BasicBlock*, 4> ExitingBlocks;
@@ -515,7 +512,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
} else if (!SE->isLoopInvariant(EC, L))
continue;
- if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32))
+ if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
continue;
// We now have a loop-invariant count of loop iterations (which is not the
@@ -569,7 +566,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// preheader, then we can use it (except if the preheader contains a use of
// the CTR register because some such uses might be reordered by the
// selection DAG after the mtctr instruction).
- if (!Preheader || mightUseCTR(TT, Preheader))
+ if (!Preheader || mightUseCTR(Preheader))
Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
if (!Preheader)
return MadeChange;
@@ -580,10 +577,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// selected branch.
MadeChange = true;
- SCEVExpander SCEVE(*SE, Preheader->getModule()->getDataLayout(), "loopcnt");
+ SCEVExpander SCEVE(*SE, *DL, "loopcnt");
LLVMContext &C = SE->getContext();
- Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
- Type::getInt32Ty(C);
+ Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
if (!ExitCount->getType()->isPointerTy() &&
ExitCount->getType() != CountType)
ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
@@ -611,7 +607,10 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
// The old condition may be dead now, and may have even created a dead PHI
// (the original induction variable).
RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- DeleteDeadPHIs(CountedExitBlock);
+ // Run through the basic blocks of the loop and see if any of them have dead
+ // PHIs that can be removed.
+ for (auto I : L->blocks())
+ DeleteDeadPHIs(I);
++NumCTRLoops;
return MadeChange;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
index 6bd2296..811e4dd 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp b/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
new file mode 100644
index 0000000..41e3190
--- /dev/null
+++ b/contrib/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -0,0 +1,458 @@
+//===------------- PPCExpandISEL.cpp - Expand ISEL instruction ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass that expands the ISEL instruction into an if-then-else sequence.
+// This pass must be run post-RA since all operands must be physical registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-expand-isel"
+
+STATISTIC(NumExpanded, "Number of ISEL instructions expanded");
+STATISTIC(NumRemoved, "Number of ISEL instructions removed");
+STATISTIC(NumFolded, "Number of ISEL instructions folded");
+
+// If -ppc-gen-isel=false is set, we will disable generating the ISEL
+// instruction on all PPC targets. Otherwise, if the user sets the -misel
+// option or the platform supports ISEL by default, still generate the
+// ISEL instruction, else expand it.
+static cl::opt<bool>
+ GenerateISEL("ppc-gen-isel",
+ cl::desc("Enable generating the ISEL instruction."),
+ cl::init(true), cl::Hidden);
+
+namespace {
+class PPCExpandISEL : public MachineFunctionPass {
+ DebugLoc dl;
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ bool IsTrueBlockRequired;
+ bool IsFalseBlockRequired;
+ MachineBasicBlock *TrueBlock;
+ MachineBasicBlock *FalseBlock;
+ MachineBasicBlock *NewSuccessor;
+ MachineBasicBlock::iterator TrueBlockI;
+ MachineBasicBlock::iterator FalseBlockI;
+
+ typedef SmallVector<MachineInstr *, 4> BlockISELList;
+ typedef SmallDenseMap<int, BlockISELList> ISELInstructionList;
+
+ // A map of MBB numbers to their lists of contained ISEL instructions.
+ ISELInstructionList ISELInstructions;
+
+ /// Initialize the object.
+ void initialize(MachineFunction &MFParam);
+
+ void handleSpecialCases(BlockISELList &BIL, MachineBasicBlock *MBB);
+ void reorganizeBlockLayout(BlockISELList &BIL, MachineBasicBlock *MBB);
+ void populateBlocks(BlockISELList &BIL);
+ void expandMergeableISELs(BlockISELList &BIL);
+ void expandAndMergeISELs();
+
+ bool canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI);
+
+ /// Is this instruction an ISEL or ISEL8?
+ static bool isISEL(const MachineInstr &MI) {
+ return (MI.getOpcode() == PPC::ISEL || MI.getOpcode() == PPC::ISEL8);
+ }
+
+ /// Is this instruction an ISEL8?
+ static bool isISEL8(const MachineInstr &MI) {
+ return (MI.getOpcode() == PPC::ISEL8);
+ }
+
+ /// Are the two operands using the same register?
+ bool useSameRegister(const MachineOperand &Op1, const MachineOperand &Op2) {
+ return (Op1.getReg() == Op2.getReg());
+ }
+
+ ///
+ /// Collect all ISEL instructions from the current function.
+ ///
+ /// Walk the current function and collect all the ISEL instructions that are
+ /// found. The instructions are placed in the ISELInstructions map.
+ ///
+ /// \return true if any ISEL instructions were found, false otherwise
+ ///
+ bool collectISELInstructions();
+
+public:
+ static char ID;
+ PPCExpandISEL() : MachineFunctionPass(ID) {
+ initializePPCExpandISELPass(*PassRegistry::getPassRegistry());
+ }
+
+ ///
+ /// Determine whether to generate the ISEL instruction or expand it.
+ ///
+ /// Expand the ISEL instruction into an if-then-else sequence when one of
+ /// the following two conditions holds:
+ /// (1) -ppc-gen-isel=false
+ /// (2) hasISEL() returns false
+ /// Otherwise, still generate the ISEL instruction.
+ /// The -ppc-gen-isel option is set to true by default, which means the ISEL
+ /// instruction is still generated on targets that support it.
+ ///
+ /// \return true if ISEL should be expanded into an if-then-else code
+ /// sequence; false if the ISEL instruction should be generated, i.e. not
+ /// expanded.
+ ///
+ static bool isExpandISELEnabled(const MachineFunction &MF);
+
+#ifndef NDEBUG
+ void DumpISELInstructions() const;
+#endif
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!isExpandISELEnabled(MF))
+ return false;
+
+ DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+ initialize(MF);
+
+ if (!collectISELInstructions()) {
+ DEBUG(dbgs() << "No ISEL instructions in this function\n");
+ return false;
+ }
+
+#ifndef NDEBUG
+ DumpISELInstructions();
+#endif
+
+ expandAndMergeISELs();
+
+ return true;
+ }
+};
+} // end anonymous namespace
+
+void PPCExpandISEL::initialize(MachineFunction &MFParam) {
+ MF = &MFParam;
+ TII = MF->getSubtarget().getInstrInfo();
+ ISELInstructions.clear();
+}
+
+bool PPCExpandISEL::isExpandISELEnabled(const MachineFunction &MF) {
+ return !GenerateISEL || !MF.getSubtarget<PPCSubtarget>().hasISEL();
+}
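+// For reference, the decision table implied by the check above:
+//   -ppc-gen-isel=true,  hasISEL()  -> keep ISEL
+//   -ppc-gen-isel=true,  !hasISEL() -> expand
+//   -ppc-gen-isel=false, any target -> expand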
+
+bool PPCExpandISEL::collectISELInstructions() {
+ for (MachineBasicBlock &MBB : *MF) {
+ BlockISELList thisBlockISELs;
+ for (MachineInstr &MI : MBB)
+ if (isISEL(MI))
+ thisBlockISELs.push_back(&MI);
+ if (!thisBlockISELs.empty())
+ ISELInstructions.insert(std::make_pair(MBB.getNumber(), thisBlockISELs));
+ }
+ return !ISELInstructions.empty();
+}
+
+#ifndef NDEBUG
+void PPCExpandISEL::DumpISELInstructions() const {
+ for (const auto &I : ISELInstructions) {
+ DEBUG(dbgs() << "BB#" << I.first << ":\n");
+ for (const auto &VI : I.second)
+ DEBUG(dbgs() << " "; VI->print(dbgs()));
+ }
+}
+#endif
+
+/// Contiguous ISELs that have the same condition can be merged.
+bool PPCExpandISEL::canMerge(MachineInstr *PrevPushedMI, MachineInstr *MI) {
+ // Same Condition Register?
+ if (!useSameRegister(PrevPushedMI->getOperand(3), MI->getOperand(3)))
+ return false;
+
+ MachineBasicBlock::iterator PrevPushedMBBI = *PrevPushedMI;
+ MachineBasicBlock::iterator MBBI = *MI;
+ return (std::prev(MBBI) == PrevPushedMBBI); // Contiguous ISELs?
+}
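+// Illustrative example (operands assumed): these two ISELs are contiguous and
+// test the same condition register bit, so canMerge returns true and both can
+// be expanded into a single if-then-else diamond:
+//   ISEL %r3, %r4, %r5, %cr0gt
+//   ISEL %r6, %r7, %r8, %cr0gt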
+
+void PPCExpandISEL::expandAndMergeISELs() {
+ for (auto &BlockList : ISELInstructions) {
+ DEBUG(dbgs() << "Expanding ISEL instructions in BB#" << BlockList.first
+ << "\n");
+
+ BlockISELList &CurrentISELList = BlockList.second;
+ auto I = CurrentISELList.begin();
+ auto E = CurrentISELList.end();
+
+ while (I != E) {
+ BlockISELList SubISELList;
+
+ SubISELList.push_back(*I++);
+
+ // Collect the ISELs that can be merged together.
+ while (I != E && canMerge(SubISELList.back(), *I))
+ SubISELList.push_back(*I++);
+
+ expandMergeableISELs(SubISELList);
+ }
+ }
+}
+
+void PPCExpandISEL::handleSpecialCases(BlockISELList &BIL,
+ MachineBasicBlock *MBB) {
+ IsTrueBlockRequired = false;
+ IsFalseBlockRequired = false;
+
+ auto MI = BIL.begin();
+ while (MI != BIL.end()) {
+ assert(isISEL(**MI) && "Expecting an ISEL instruction");
+ DEBUG(dbgs() << "ISEL: " << **MI << "\n");
+
+ MachineOperand &Dest = (*MI)->getOperand(0);
+ MachineOperand &TrueValue = (*MI)->getOperand(1);
+ MachineOperand &FalseValue = (*MI)->getOperand(2);
+
+ // If at least one of the ISEL instructions satisfies the following
+ // condition, we need the True Block:
+ //   The Dest Register and True Value Register are not the same.
+ // Similarly, if at least one of the ISEL instructions satisfies the
+ // following condition, we need the False Block:
+ //   The Dest Register and False Value Register are not the same.
+
+ bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue);
+ bool IsORIInstRequired = !useSameRegister(Dest, FalseValue);
+
+ // Special case 1, all registers used by ISEL are the same one.
+ if (!IsADDIInstRequired && !IsORIInstRequired) {
+ DEBUG(dbgs() << "Remove redudant ISEL instruction.");
+ NumRemoved++;
+ (*MI)->eraseFromParent();
+ // Setting MI to the result of erase() keeps the iterator valid and advanced.
+ MI = BIL.erase(MI);
+ continue;
+ }
+
+ // Special case 2, the two input registers used by ISEL are the same.
+ // Note 1: We favor merging ISEL expansions over folding a single one. If
+ // the passed list has multiple mergeable ISELs, we won't fold any.
+ // Note 2: There is no need to test for PPC::R0/PPC::X0 because PPC::ZERO/
+ // PPC::ZERO8 will be used for the first operand if the value is meant to
+ // be zero. In this case, the useSameRegister method will return false,
+ // thereby preventing this ISEL from being folded.
+
+ if (useSameRegister(TrueValue, FalseValue) && (BIL.size() == 1)) {
+ DEBUG(dbgs() << "Fold the ISEL instruction to an unconditonal copy.");
+ NumFolded++;
+ BuildMI(*MBB, (*MI), dl, TII->get(isISEL8(**MI) ? PPC::ADDI8 : PPC::ADDI))
+ .add(Dest)
+ .add(TrueValue)
+ .add(MachineOperand::CreateImm(0));
+ (*MI)->eraseFromParent();
+ // Setting MI to the result of erase() keeps the iterator valid and advanced.
+ MI = BIL.erase(MI);
+ continue;
+ }
+
+ IsTrueBlockRequired |= IsADDIInstRequired;
+ IsFalseBlockRequired |= IsORIInstRequired;
+ MI++;
+ }
+}
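+// Examples of the two special cases above (illustrative only):
+//   ISEL %r3, %r3, %r3, %cr0gt -> removed; all three registers are the same.
+//   ISEL %r3, %r5, %r5, %cr0gt -> folded to ADDI %r3, %r5, 0, provided it is
+//                                 the only ISEL in the list.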
+
+void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL,
+ MachineBasicBlock *MBB) {
+ if (BIL.empty())
+ return;
+
+ assert((IsTrueBlockRequired || IsFalseBlockRequired) &&
+ "Should have been handled by special cases earlier!");
+
+ MachineBasicBlock *Successor = nullptr;
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineBasicBlock::iterator MBBI = (*BIL.back());
+ NewSuccessor = (MBBI != MBB->getLastNonDebugInstr() || !MBB->canFallThrough())
+ // Another BB is needed to move the instructions that
+ // follow this ISEL. If the ISEL is the last instruction
+ // in a block that can't fall through, we also need a block
+ // to branch to.
+ ? MF->CreateMachineBasicBlock(LLVM_BB)
+ : nullptr;
+
+ MachineFunction::iterator It = MBB->getIterator();
+ ++It; // Point to the successor block of MBB.
+
+ // If NewSuccessor is NULL then the last ISEL in this group is the last
+ // non-debug instruction in this block. Find the fall-through successor
+ // of this block to use when updating the CFG below.
+ if (!NewSuccessor) {
+ for (auto &Succ : MBB->successors()) {
+ if (MBB->isLayoutSuccessor(Succ)) {
+ Successor = Succ;
+ break;
+ }
+ }
+ } else
+ Successor = NewSuccessor;
+
+ // The FalseBlock and TrueBlock are inserted after the MBB block but before
+ // its successor.
+ // Note this needs to be done *after* the Successor-setting code above.
+ if (IsFalseBlockRequired) {
+ FalseBlock = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, FalseBlock);
+ }
+
+ if (IsTrueBlockRequired) {
+ TrueBlock = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, TrueBlock);
+ }
+
+ if (NewSuccessor) {
+ MF->insert(It, NewSuccessor);
+
+ // Transfer the rest of this block into the new successor block.
+ NewSuccessor->splice(NewSuccessor->end(), MBB,
+ std::next(MachineBasicBlock::iterator(BIL.back())),
+ MBB->end());
+ NewSuccessor->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Copy the original liveIns of MBB to NewSuccessor.
+ for (auto &LI : MBB->liveins())
+ NewSuccessor->addLiveIn(LI);
+
+ // After splitting the NewSuccessor block, Regs defined but not killed
+ // in MBB should be treated as liveins of NewSuccessor.
+ // Note: we cannot use stepBackward here since we are using the register
+ // liveness state at the end of MBB (the live-outs of MBB) as the live-ins
+ // of NewSuccessor; doing otherwise would create a cyclic dependence.
+ LivePhysRegs LPR(*MF->getSubtarget<PPCSubtarget>().getRegisterInfo());
+ SmallVector<std::pair<unsigned, const MachineOperand *>, 2> Clobbers;
+ for (MachineInstr &MI : *MBB)
+ LPR.stepForward(MI, Clobbers);
+ for (auto &LI : LPR)
+ NewSuccessor->addLiveIn(LI);
+ } else {
+ // Remove successor from MBB.
+ MBB->removeSuccessor(Successor);
+ }
+
+ // Note that this needs to be done *after* transferring the successors from MBB
+ // to the NewSuccessor block, otherwise these blocks will also be transferred
+ // as successors!
+ MBB->addSuccessor(IsTrueBlockRequired ? TrueBlock : Successor);
+ MBB->addSuccessor(IsFalseBlockRequired ? FalseBlock : Successor);
+
+ if (IsTrueBlockRequired) {
+ TrueBlockI = TrueBlock->begin();
+ TrueBlock->addSuccessor(Successor);
+ }
+
+ if (IsFalseBlockRequired) {
+ FalseBlockI = FalseBlock->begin();
+ FalseBlock->addSuccessor(Successor);
+ }
+
+ // Conditional branch to the TrueBlock or Successor
+ BuildMI(*MBB, BIL.back(), dl, TII->get(PPC::BC))
+ .add(BIL.back()->getOperand(3))
+ .addMBB(IsTrueBlockRequired ? TrueBlock : Successor);
+
+ // Jump over the true block to the new successor if the condition is false.
+ BuildMI(*(IsFalseBlockRequired ? FalseBlock : MBB),
+ (IsFalseBlockRequired ? FalseBlockI : BIL.back()), dl,
+ TII->get(PPC::B))
+ .addMBB(Successor);
+
+ if (IsFalseBlockRequired)
+ FalseBlockI = FalseBlock->begin(); // get the position of PPC::B
+}
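+// Sketch of the layout this function produces when both the true and false
+// blocks are required (illustrative): the fallthrough order becomes
+//   MBB -> FalseBlock -> TrueBlock -> Successor
+// where MBB ends in BC <cond>, TrueBlock; FalseBlock ends in B Successor
+// (jumping over TrueBlock); and TrueBlock falls through to Successor.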
+
+void PPCExpandISEL::populateBlocks(BlockISELList &BIL) {
+ for (auto &MI : BIL) {
+ assert(isISEL(*MI) && "Expecting an ISEL instruction");
+
+ MachineOperand &Dest = MI->getOperand(0); // location to store to
+ MachineOperand &TrueValue = MI->getOperand(1); // Value to store if
+ // condition is true
+ MachineOperand &FalseValue = MI->getOperand(2); // Value to store if
+ // condition is false
+ MachineOperand &ConditionRegister = MI->getOperand(3); // Condition
+
+ DEBUG(dbgs() << "Dest: " << Dest << "\n");
+ DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n");
+ DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n");
+ DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n");
+
+ // If the Dest Register and True Value Register are not the same one, we
+ // need the True Block.
+ bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue);
+ bool IsORIInstRequired = !useSameRegister(Dest, FalseValue);
+
+ if (IsADDIInstRequired) {
+ // Copy the result into the destination if the condition is true.
+ BuildMI(*TrueBlock, TrueBlockI, dl,
+ TII->get(isISEL8(*MI) ? PPC::ADDI8 : PPC::ADDI))
+ .add(Dest)
+ .add(TrueValue)
+ .add(MachineOperand::CreateImm(0));
+
+ // Add the LiveIn registers required by true block.
+ TrueBlock->addLiveIn(TrueValue.getReg());
+ }
+
+ if (IsORIInstRequired) {
+ // Add the LiveIn registers required by false block.
+ FalseBlock->addLiveIn(FalseValue.getReg());
+ }
+
+ if (NewSuccessor) {
+ // Add the LiveIn registers required by NewSuccessor block.
+ NewSuccessor->addLiveIn(Dest.getReg());
+ NewSuccessor->addLiveIn(TrueValue.getReg());
+ NewSuccessor->addLiveIn(FalseValue.getReg());
+ NewSuccessor->addLiveIn(ConditionRegister.getReg());
+ }
+
+ // Copy the value into the destination if the condition is false.
+ if (IsORIInstRequired)
+ BuildMI(*FalseBlock, FalseBlockI, dl,
+ TII->get(isISEL8(*MI) ? PPC::ORI8 : PPC::ORI))
+ .add(Dest)
+ .add(FalseValue)
+ .add(MachineOperand::CreateImm(0));
+
+ MI->eraseFromParent(); // Remove the ISEL instruction.
+
+ NumExpanded++;
+ }
+}
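+// Putting the pieces together, a lone ISEL %r3, %r4, %r5, %cr0gt expands to
+// roughly the following (illustrative, register names assumed):
+//   MBB:        bc %cr0gt, TrueBlock
+//   FalseBlock: ori %r3, %r5, 0 ; b Successor
+//   TrueBlock:  addi %r3, %r4, 0
+//   Successor:  ...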
+
+void PPCExpandISEL::expandMergeableISELs(BlockISELList &BIL) {
+ // At this stage all the ISELs of BIL are in the same MBB.
+ MachineBasicBlock *MBB = BIL.back()->getParent();
+
+ handleSpecialCases(BIL, MBB);
+ reorganizeBlockLayout(BIL, MBB);
+ populateBlocks(BIL);
+}
+
+INITIALIZE_PASS(PPCExpandISEL, DEBUG_TYPE, "PowerPC Expand ISEL Generation",
+ false, false)
+char PPCExpandISEL::ID = 0;
+
+FunctionPass *llvm::createPPCExpandISELPass() { return new PPCExpandISEL(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 9b91b9a..bc99571 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -13,10 +13,10 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
-#include "PPCCallingConv.h"
+#include "PPC.h"
#include "PPCCCState.h"
+#include "PPCCallingConv.h"
#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCSubtarget.h"
@@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
// Issue CALLSEQ_START.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TII.getCallFrameSetupOpcode()))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Prepare to assign register arguments. Every argument uses up a
// GPR protocol register even if it's passed in a floating-point
@@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
}
case PPC::EXTSW:
+ case PPC::EXTSW_32:
case PPC::EXTSW_32_64: {
if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
return false;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index f9ea871..b49c334 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -433,25 +433,21 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
unsigned MaxAlign = MFI.getMaxAlignment(); // algmt required by data in frame
unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
-
- // If we are a leaf function, and use up to 224 bytes of stack space,
- // don't have a frame pointer, calls, or dynamic alloca then we do not need
- // to adjust the stack pointer (we fit in the Red Zone).
- // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate
- // stackless code if all local vars are reg-allocated.
- bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+
unsigned LR = RegInfo->getRARegister();
- if (!DisableRedZone &&
- (Subtarget.isPPC64() || // 32-bit SVR4, no stack-
- !Subtarget.isSVR4ABI() || // allocated locals.
- FrameSize == 0) &&
- FrameSize <= 224 && // Fits in red zone.
- !MFI.hasVarSizedObjects() && // No dynamic alloca.
- !MFI.adjustsStack() && // No calls.
- !MustSaveLR(MF, LR) &&
- !RegInfo->hasBasePointer(MF)) { // No special alignment.
+ bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+ bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca.
+ !MFI.adjustsStack() && // No calls.
+ !MustSaveLR(MF, LR) && // No need to save LR.
+ !RegInfo->hasBasePointer(MF); // No special alignment.
+
+ // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless
+ // code if all local vars are reg-allocated.
+ bool FitsInRedZone = FrameSize <= Subtarget.getRedZoneSize();
+
+ // Check whether we can skip adjusting the stack pointer (by using red zone)
+ if (!DisableRedZone && CanUseRedZone && FitsInRedZone) {
// No need for frame
if (UpdateMF)
MFI.setStackSize(0);
@@ -519,11 +515,10 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
unsigned FPReg = is31 ? PPC::R31 : PPC::R1;
unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
bool HasBP = RegInfo->hasBasePointer(MF);
unsigned BPReg = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
- unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
+ unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FP8Reg;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI)
@@ -616,8 +611,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
return true;
// Get the list of callee-saved registers for the target.
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent());
// Get all the available registers in the block.
@@ -663,8 +657,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
// and the stack frame is large, we need two scratch registers.
bool
PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MachineFunction &MF = *(MBB->getParent());
bool HasBP = RegInfo->hasBasePointer(MF);
unsigned FrameSize = determineFrameLayout(MF, false);
@@ -694,10 +687,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo &MFI = MF.getFrameInfo();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
@@ -1221,10 +1212,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
if (MBBI != MBB.end())
dl = MBBI->getDebugLoc();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Get alignment info so we know how to restore the SP.
const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1549,8 +1538,7 @@ void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
if (MBBI != MBB.end())
dl = MBBI->getDebugLoc();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
// Create branch instruction for pseudo tail call return instruction
unsigned RetOpcode = MBBI->getOpcode();
@@ -1588,8 +1576,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
// Save and clear the LR state.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -1791,8 +1778,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
HasGPSaveArea = true;
}
- const PPCRegisterInfo *RegInfo =
- static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
if (RegInfo->hasBasePointer(MF)) {
int FI = PFI->getBasePointerSaveIndex();
assert(FI && "No Base Pointer Save Slot!");
@@ -1880,8 +1866,13 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
}
if (HasVRSaveArea) {
- // Insert alignment padding, we need 16-byte alignment.
- LowerBound = (LowerBound - 15) & ~(15);
+ // Insert alignment padding; we need 16-byte alignment. Note: for a
+ // positive number the alignment formula is y = (x + (n-1)) & ~(n-1). But
+ // since we are using a negative number here (the stack grows downward),
+ // we should use the formula y = x & ~(n-1), where x is the size before
+ // aligning, n is the alignment size (n = 16 here) and y is the size after
+ // aligning.
+ assert(LowerBound <= 0 && "Expect LowerBound to have a non-positive value!");
+ LowerBound &= ~(15);
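+ // Worked example (illustrative): with x = -100 and n = 16,
+ // y = -100 & ~15 = -112, i.e. 12 bytes of alignment padding are inserted.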
for (unsigned i = 0, e = VRegs.size(); i != e; ++i) {
int FI = VRegs[i].getFrameIdx();
@@ -1913,12 +1904,13 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
MachineFrameInfo &MFI = MF.getFrameInfo();
if (MFI.hasVarSizedObjects() || spillsCR(MF) || spillsVRSAVE(MF) ||
hasNonRISpills(MF) || (hasSpills(MF) && !isInt<16>(StackSize))) {
- const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
- const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC;
- RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
- RC->getAlignment(),
- false));
+ const TargetRegisterClass &GPRC = PPC::GPRCRegClass;
+ const TargetRegisterClass &G8RC = PPC::G8RCRegClass;
+ const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC;
+ const TargetRegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ unsigned Size = TRI.getSpillSize(RC);
+ unsigned Align = TRI.getSpillAlignment(RC);
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
// Might we have over-aligned allocas?
bool HasAlVars = MFI.hasVarSizedObjects() &&
@@ -1926,9 +1918,7 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
// These kinds of spills might need two registers.
if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars)
- RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
- RC->getAlignment(),
- false));
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
}
}
@@ -1945,8 +1935,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return false;
MachineFunction *MF = MBB.getParent();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
DebugLoc DL;
bool CRSpilled = false;
MachineInstrBuilder CRMIB;
@@ -2087,8 +2076,7 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return false;
MachineFunction *MF = MBB.getParent();
- const PPCInstrInfo &TII =
- *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
bool CR2Spilled = false;
bool CR3Spilled = false;
bool CR4Spilled = false;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 1e51c1f..901539b 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -12,34 +12,76 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
+#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
+#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <new>
+#include <tuple>
+#include <utility>
+
using namespace llvm;
#define DEBUG_TYPE "ppc-codegen"
+STATISTIC(NumSextSetcc,
+ "Number of (sext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(NumZextSetcc,
+ "Number of (zext(setcc)) nodes expanded into GPR sequence.");
+STATISTIC(SignExtensionsAdded,
+ "Number of sign extensions for compare inputs added.");
+STATISTIC(ZeroExtensionsAdded,
+ "Number of zero extensions for compare inputs added.");
+STATISTIC(NumLogicOpsOnComparison,
+ "Number of logical ops on i1 values calculated in GPR.");
+STATISTIC(OmittedForNonExtendUses,
+ "Number of compares not eliminated as they have non-extending uses.");
+
// FIXME: Remove this once the bug has been fixed!
cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
@@ -60,6 +102,7 @@ static cl::opt<bool> EnableBranchHint(
cl::Hidden);
namespace {
+
//===--------------------------------------------------------------------===//
/// PPCDAGToDAGISel - PPC specific code to select PPC machine
/// instructions for SelectionDAG operations.
@@ -69,9 +112,10 @@ namespace {
const PPCSubtarget *PPCSubTarget;
const PPCTargetLowering *PPCLowering;
unsigned GlobalBaseReg;
+
public:
- explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
- : SelectionDAGISel(tm), TM(tm) {}
+ explicit PPCDAGToDAGISel(PPCTargetMachine &tm, CodeGenOpt::Level OptLevel)
+ : SelectionDAGISel(tm, OptLevel), TM(tm) {}
bool runOnMachineFunction(MachineFunction &MF) override {
// Make sure we re-emit a set of the global base reg if necessary
@@ -134,7 +178,7 @@ namespace {
/// a base register plus a signed 16-bit displacement [r+imm].
bool SelectAddrImm(SDValue N, SDValue &Disp,
SDValue &Base) {
- return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0);
}
/// SelectAddrImmOffs - Return true if the operand is valid for a preinc
@@ -167,7 +211,11 @@ namespace {
/// a base register plus a signed 16-bit displacement that is a multiple of 4.
/// Suitable for use by STD and friends.
bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
- return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4);
+ }
+
+ bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) {
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16);
}
// Select an address into a single register.
@@ -184,7 +232,6 @@ namespace {
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override {
-
switch(ConstraintID) {
default:
errs() << "ConstraintID: " << ConstraintID << "\n";
@@ -223,7 +270,34 @@ namespace {
#include "PPCGenDAGISel.inc"
private:
+ // Conversion type for interpreting results of a 32-bit instruction as
+ // a 64-bit value or vice versa.
+ enum ExtOrTruncConversion { Ext, Trunc };
+
+ // Modifiers to guide how an ISD::SETCC node's result is to be computed
+ // in a GPR.
+ // ZExtOrig - use the original condition code, zero-extend value
+ // ZExtInvert - invert the condition code, zero-extend value
+ // SExtOrig - use the original condition code, sign-extend value
+ // SExtInvert - invert the condition code, sign-extend value
+ enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
+
bool trySETCC(SDNode *N);
+ bool tryEXTEND(SDNode *N);
+ bool tryLogicOpOfCompares(SDNode *N);
+ SDValue computeLogicOpInGPR(SDValue LogicOp);
+ SDValue signExtendInputIfNeeded(SDValue Input);
+ SDValue zeroExtendInputIfNeeded(SDValue Input);
+ SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
+ SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get64BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue get64BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl);
+ SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
void PeepholePPC64();
void PeepholePPC64ZExt();
@@ -235,9 +309,11 @@ private:
bool AllUsersSelectZero(SDNode *N);
void SwapAllSelectUsers(SDNode *N);
+ bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
void transferMemOperands(SDNode *N, SDNode *Result);
};
-}
+
+} // end anonymous namespace
/// InsertVRSaveCode - Once the entire function has been instruction selected,
/// all virtual registers are created and all machine instructions are built,
@@ -303,7 +379,6 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
}
}
-
/// getGlobalBaseReg - Output the instructions required to put the
/// base address to use for accessing globals into a register.
///
@@ -349,26 +424,6 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
.getNode();
}
-/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
-/// or 64-bit immediate, and if the value can be accurately represented as a
-/// sign extension from a 16-bit value. If so, this returns true and the
-/// immediate.
-static bool isIntS16Immediate(SDNode *N, short &Imm) {
- if (N->getOpcode() != ISD::Constant)
- return false;
-
- Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
- if (N->getValueType(0) == MVT::i32)
- return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
- else
- return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
-}
-
-static bool isIntS16Immediate(SDValue Op, short &Imm) {
- return isIntS16Immediate(Op.getNode(), Imm);
-}
-
-
/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
/// operand. If so Imm will receive the 32-bit value.
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
@@ -515,12 +570,12 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
SDValue Op1 = N->getOperand(1);
SDLoc dl(N);
- APInt LKZ, LKO, RKZ, RKO;
- CurDAG->computeKnownBits(Op0, LKZ, LKO);
- CurDAG->computeKnownBits(Op1, RKZ, RKO);
+ KnownBits LKnown, RKnown;
+ CurDAG->computeKnownBits(Op0, LKnown);
+ CurDAG->computeKnownBits(Op1, RKnown);
- unsigned TargetMask = LKZ.getZExtValue();
- unsigned InsertMask = RKZ.getZExtValue();
+ unsigned TargetMask = LKnown.Zero.getZExtValue();
+ unsigned InsertMask = RKnown.Zero.getZExtValue();
if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
unsigned Op0Opc = Op0.getOpcode();
@@ -563,9 +618,9 @@ bool PPCDAGToDAGISel::tryBitfieldInsert(SDNode *N) {
// The AND mask might not be a constant, and we need to make sure that
// if we're going to fold the masking with the insert, all bits not
 // known to be zero in the mask are known to be one.
- APInt MKZ, MKO;
- CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO);
- bool CanFoldMask = InsertMask == MKO.getZExtValue();
+ KnownBits MKnown;
+ CurDAG->computeKnownBits(Op1.getOperand(1), MKnown);
+ bool CanFoldMask = InsertMask == MKnown.One.getZExtValue();
unsigned SHOpc = Op1.getOperand(0).getOpcode();
if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
@@ -659,7 +714,10 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) {
static unsigned getInt64Count(int64_t Imm) {
unsigned Count = getInt64CountDirect(Imm);
- if (Count == 1)
+
+ // If the instruction count is 1 or 2, we do not need further analysis
+ // since rotate + load constant requires at least 2 instructions.
+ if (Count <= 2)
return Count;
for (unsigned r = 1; r < 63; ++r) {
@@ -769,7 +827,10 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl,
static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) {
unsigned Count = getInt64CountDirect(Imm);
- if (Count == 1)
+
+ // If the instruction count is 1 or 2, we do not need further analysis
+ // since rotate + load constant requires at least 2 instructions.
+ if (Count <= 2)
return getInt64Direct(CurDAG, dl, Imm);
unsigned RMin = 0;
@@ -833,6 +894,7 @@ static SDNode *getInt64(SelectionDAG *CurDAG, SDNode *N) {
}
namespace {
+
class BitPermutationSelector {
struct ValueBit {
SDValue V;
@@ -898,14 +960,12 @@ class BitPermutationSelector {
// associated with each) used to choose the lowering method.
struct ValueRotInfo {
SDValue V;
- unsigned RLAmt;
- unsigned NumGroups;
- unsigned FirstGroupStartIdx;
- bool Repl32;
+ unsigned RLAmt = std::numeric_limits<unsigned>::max();
+ unsigned NumGroups = 0;
+ unsigned FirstGroupStartIdx = std::numeric_limits<unsigned>::max();
+ bool Repl32 = false;
- ValueRotInfo()
- : RLAmt(UINT32_MAX), NumGroups(0), FirstGroupStartIdx(UINT32_MAX),
- Repl32(false) {}
+ ValueRotInfo() = default;
// For sorting (in reverse order) by NumGroups, and then by
// FirstGroupStartIdx.
@@ -1985,7 +2045,8 @@ public:
return RNLM;
}
};
-} // anonymous namespace
+
+} // end anonymous namespace
bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) {
if (N->getValueType(0) != MVT::i32 &&
@@ -2057,7 +2118,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
getI32Imm(Imm & 0xFFFF, dl)), 0);
Opc = PPC::CMPLW;
} else {
- short SImm;
+ int16_t SImm;
if (isIntS16Immediate(RHS, SImm))
return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
getI32Imm((int)SImm & 0xFFFF,
@@ -2104,7 +2165,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
getI64Imm(Imm & 0xFFFF, dl)), 0);
Opc = PPC::CMPLD;
} else {
- short SImm;
+ int16_t SImm;
if (isIntS16Immediate(RHS, SImm))
return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
getI64Imm(SImm & 0xFFFF, dl)),
@@ -2443,6 +2504,525 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
return true;
}
+// Is this opcode a bitwise logical operation?
+static bool isLogicOp(unsigned Opc) {
+ return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR;
+}
+
+/// If this node is a sign/zero extension of an integer comparison,
+/// it can usually be computed in GPRs rather than using comparison
+/// instructions and ISEL. We only do this on 64-bit targets for now
+/// as the code is specialized for 64-bit (it uses 64-bit instructions
+/// and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
+ if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+ return false;
+ assert((N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) &&
+ "Expecting a zero/sign extend node!");
+
+ SDValue WideRes;
+ // If we are zero-extending the result of a logical operation on i1
+ // values, we can keep the values in GPRs.
+ if (isLogicOp(N->getOperand(0).getOpcode()) &&
+ N->getOperand(0).getValueType() == MVT::i1 &&
+ N->getOpcode() == ISD::ZERO_EXTEND)
+ WideRes = computeLogicOpInGPR(N->getOperand(0));
+ else if (N->getOperand(0).getOpcode() != ISD::SETCC)
+ return false;
+ else
+ WideRes =
+ getSETCCInGPR(N->getOperand(0),
+ N->getOpcode() == ISD::SIGN_EXTEND ?
+ SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+
+ if (!WideRes)
+ return false;
+
+ SDLoc dl(N);
+ bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32;
+ bool Output32Bit = N->getValueType(0) == MVT::i32;
+
+ NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
+ NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
+
+ SDValue ConvOp = WideRes;
+ if (Inputs32Bit != Output32Bit)
+ ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext :
+ ExtOrTruncConversion::Trunc);
+ ReplaceNode(N, ConvOp.getNode());
+
+ return true;
+}
+
+// Lower a logical operation on i1 values into a GPR sequence if possible.
+// The result can be kept in a GPR if requested.
+// Three types of inputs can be handled:
+// - SETCC
+// - TRUNCATE
+// - Logical operation (AND/OR/XOR)
+// There is also a special case that is handled (namely a complement operation
+// achieved with xor %a, -1).
+SDValue PPCDAGToDAGISel::computeLogicOpInGPR(SDValue LogicOp) {
+ assert(isLogicOp(LogicOp.getOpcode()) &&
+ "Can only handle logic operations here.");
+ assert(LogicOp.getValueType() == MVT::i1 &&
+ "Can only handle logic operations on i1 values here.");
+ SDLoc dl(LogicOp);
+ SDValue LHS, RHS;
+
+ // Special case: xor %a, -1
+ bool IsBitwiseNegation = isBitwiseNot(LogicOp);
+
+ // Produces a GPR sequence for each operand of the binary logic operation.
+ // For SETCC, it produces the respective comparison, for TRUNCATE it truncates
+ // the value in a GPR and for logic operations, it will recursively produce
+ // a GPR sequence for the operation.
+ auto getLogicOperand = [&] (SDValue Operand) -> SDValue {
+ unsigned OperandOpcode = Operand.getOpcode();
+ if (OperandOpcode == ISD::SETCC)
+ return getSETCCInGPR(Operand, SetccInGPROpts::ZExtOrig);
+ else if (OperandOpcode == ISD::TRUNCATE) {
+ SDValue InputOp = Operand.getOperand(0);
+ EVT InVT = InputOp.getValueType();
+ return
+ SDValue(CurDAG->getMachineNode(InVT == MVT::i32 ? PPC::RLDICL_32 :
+ PPC::RLDICL, dl, InVT, InputOp,
+ getI64Imm(0, dl), getI64Imm(63, dl)), 0);
+ } else if (isLogicOp(OperandOpcode))
+ return computeLogicOpInGPR(Operand);
+ return SDValue();
+ };
+ LHS = getLogicOperand(LogicOp.getOperand(0));
+ RHS = getLogicOperand(LogicOp.getOperand(1));
+
+ // If a GPR sequence can't be produced for the LHS we can't proceed.
+ // Not producing a GPR sequence for the RHS is only a problem if this isn't
+ // a bitwise negation operation.
+ if (!LHS || (!RHS && !IsBitwiseNegation))
+ return SDValue();
+
+ NumLogicOpsOnComparison++;
+
+ // We will use the inputs as 64-bit values.
+ if (LHS.getValueType() == MVT::i32)
+ LHS = addExtOrTrunc(LHS, ExtOrTruncConversion::Ext);
+ if (!IsBitwiseNegation && RHS.getValueType() == MVT::i32)
+ RHS = addExtOrTrunc(RHS, ExtOrTruncConversion::Ext);
+
+ unsigned NewOpc;
+ switch (LogicOp.getOpcode()) {
+ default: llvm_unreachable("Unknown logic operation.");
+ case ISD::AND: NewOpc = PPC::AND8; break;
+ case ISD::OR: NewOpc = PPC::OR8; break;
+ case ISD::XOR: NewOpc = PPC::XOR8; break;
+ }
+
+ if (IsBitwiseNegation) {
+ RHS = getI64Imm(1, dl);
+ NewOpc = PPC::XORI8;
+ }
+
+ return SDValue(CurDAG->getMachineNode(NewOpc, dl, MVT::i64, LHS, RHS), 0);
+}
+
+/// Try performing logical operations on results of comparisons in GPRs.
+/// It is typically preferred from a performance perspective over performing
+/// the operations on individual bits in the CR. We only do this on 64-bit
+/// targets for now as the code is specialized for 64-bit (it uses 64-bit
+/// instructions and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryLogicOpOfCompares(SDNode *N) {
+ if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+ return false;
+ if (N->getValueType(0) != MVT::i1)
+ return false;
+ assert(isLogicOp(N->getOpcode()) &&
+ "Expected a logic operation on setcc results.");
+ SDValue LoweredLogical = computeLogicOpInGPR(SDValue(N, 0));
+ if (!LoweredLogical)
+ return false;
+
+ SDLoc dl(N);
+ bool IsBitwiseNegate = LoweredLogical.getMachineOpcode() == PPC::XORI8;
+ unsigned SubRegToExtract = IsBitwiseNegate ? PPC::sub_eq : PPC::sub_gt;
+ SDValue CR0Reg = CurDAG->getRegister(PPC::CR0, MVT::i32);
+ SDValue LHS = LoweredLogical.getOperand(0);
+ SDValue RHS = LoweredLogical.getOperand(1);
+ SDValue WideOp;
+ SDValue OpToConvToRecForm;
+
+ // Look through any 32-bit to 64-bit implicit extend nodes to find the opcode
+ // that is input to the XORI.
+ if (IsBitwiseNegate &&
+ LoweredLogical.getOperand(0).getMachineOpcode() == PPC::INSERT_SUBREG)
+ OpToConvToRecForm = LoweredLogical.getOperand(0).getOperand(1);
+ else if (IsBitwiseNegate)
+ // If the input to the XORI isn't an extension, that's what we're after.
+ OpToConvToRecForm = LoweredLogical.getOperand(0);
+ else
+ // If this is not an XORI, it is a reg-reg logical op and we can convert it
+ // to record-form.
+ OpToConvToRecForm = LoweredLogical;
+
+ // Get the record-form version of the node we're looking to use to get the
+ // CR result from.
+ uint16_t NonRecOpc = OpToConvToRecForm.getMachineOpcode();
+ int NewOpc = PPCInstrInfo::getRecordFormOpcode(NonRecOpc);
+
+ // Convert the right node to record-form. This is either the logical we're
+ // looking at or it is the input node to the negation (if we're looking at
+ // a bitwise negation).
+ if (NewOpc != -1 && IsBitwiseNegate) {
+ // The input to the XORI has a record-form. Use it.
+ assert(LoweredLogical.getConstantOperandVal(1) == 1 &&
+ "Expected a PPC::XORI8 only for bitwise negation.");
+ // Emit the record-form instruction.
+ std::vector<SDValue> Ops;
+ for (int i = 0, e = OpToConvToRecForm.getNumOperands(); i < e; i++)
+ Ops.push_back(OpToConvToRecForm.getOperand(i));
+
+ WideOp =
+ SDValue(CurDAG->getMachineNode(NewOpc, dl,
+ OpToConvToRecForm.getValueType(),
+ MVT::Glue, Ops), 0);
+ } else {
+ assert((NewOpc != -1 || !IsBitwiseNegate) &&
+ "No record form available for AND8/OR8/XOR8?");
+ WideOp =
+ SDValue(CurDAG->getMachineNode(NewOpc == -1 ? PPC::ANDIo8 : NewOpc, dl,
+ MVT::i64, MVT::Glue, LHS, RHS), 0);
+ }
+
+ // Select this node to a single bit from CR0 set by the record-form node
+ // just created. For bitwise negation, use the EQ bit which is the equivalent
+ // of negating the result (i.e. it is a bit set when the result of the
+ // operation is zero).
+ SDValue SRIdxVal =
+ CurDAG->getTargetConstant(SubRegToExtract, dl, MVT::i32);
+ SDValue CRBit =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
+ MVT::i1, CR0Reg, SRIdxVal,
+ WideOp.getValue(1)), 0);
+ ReplaceNode(N, CRBit.getNode());
+ return true;
+}
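+// Illustrative example (names assumed): for (xor (setcc %a, %b, seteq), -1),
+// the XORI8 produced by computeLogicOpInGPR is dropped, its input is
+// re-emitted in record form so that it sets CR0, and the EQ bit of CR0 (set
+// when the result is zero) directly yields the negated i1 value.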
+
+/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only sign-extend 32-bit values here.");
+ unsigned Opc = Input.getOpcode();
+
+ // The value was sign-extended and then truncated to 32 bits. No need to
+ // sign-extend it again.
+ if (Opc == ISD::TRUNCATE &&
+ (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
+ Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
+ return Input;
+
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ // The input is a sign-extending load. No reason to sign-extend.
+ if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ // We don't sign-extend constants or already sign-extended values.
+ if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG ||
+ Opc == ISD::SIGN_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ SignExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0);
+}
+
+/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) {
+ assert(Input.getValueType() == MVT::i32 &&
+ "Can only zero-extend 32-bit values here.");
+ LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+ unsigned Opc = Input.getOpcode();
+
+ // No need to zero-extend loaded values (unless they're loaded with
+ // a sign-extending load).
+ if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
+ return Input;
+
+ ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+ bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0;
+ // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG, so we have to
+ // conservatively clear the high bits here. We also don't need to
+ // zero-extend constants or values that are already zero-extended.
+ if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND)
+ return Input;
+
+ SDLoc dl(Input);
+ ZeroExtensionsAdded++;
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input,
+ getI64Imm(0, dl), getI64Imm(32, dl)),
+ 0);
+}
+
+// Handle a 32-bit value in a 64-bit register and vice-versa. These are of
+// course not actual zero/sign extensions that will generate machine code,
+// they're just a way to reinterpret a 32-bit value in a register as a
+// 64-bit value and vice-versa.
+SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes,
+ ExtOrTruncConversion Conv) {
+ SDLoc dl(NatWidthRes);
+
+ // For reinterpreting 32-bit values as 64-bit values, we generate
+ // INSERT_SUBREG IMPLICIT_DEF:i64, <input>, TargetConstant:i32<1>
+ if (Conv == ExtOrTruncConversion::Ext) {
+ SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
+ ImDef, NatWidthRes, SubRegIdx), 0);
+ }
+
+ assert(Conv == ExtOrTruncConversion::Trunc &&
+ "Unknown convertion between 32 and 64 bit values.");
+ // For reinterpreting 64-bit values as 32-bit values, we just need to
+ // EXTRACT_SUBREG (i.e. extract the low word).
+ SDValue SubRegIdx =
+ CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+ return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
+ NatWidthRes, SubRegIdx), 0);
+}
+
+/// Produces a zero-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
+ // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
+ getI32Imm(31, dl) };
+ return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+ ShiftOps), 0);
+ }
+ case ISD::SETNE: {
+ // (zext (setcc %a, %b, setne)) -> (xor (lshr (cntlzw (xor %a, %b)), 5), 1)
+ // (zext (setcc %a, 0, setne)) -> (xor (lshr (cntlzw %a), 5), 1)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
+ getI32Imm(31, dl) };
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift,
+ getI32Imm(1, dl)), 0);
+ }
+ }
+}
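+
+// An illustrative walkthrough of the seteq sequence above (values assumed,
+// not from a real compilation): when %a == %b the xor yields 0, cntlzw of 0
+// is 32, and 32 >> 5 == 1; when %a != %b the count is in [0, 31] and the
+// shift produces 0. The rlwinm with SH=27, MB=5, ME=31 is the logical right
+// shift by 5, so the emitted code is roughly:
+//   xor    r5, r3, r4
+//   cntlzw r5, r5
+//   rlwinm r5, r5, 27, 5, 31    # r5 >>= 5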
+
+/// Produces a sign-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (sext (setcc %a, %b, seteq)) ->
+ // (ashr (shl (ctlz (xor %a, %b)), 58), 63)
+ // (sext (setcc %a, 0, seteq)) ->
+ // (ashr (shl (ctlz %a), 58), 63)
+ SDValue CountInput = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Cntlzw =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
+ SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) };
+ SDValue Sldi =
+ SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi,
+ getI32Imm(63, dl)), 0);
+ }
+ case ISD::SETNE: {
+ // Bitwise xor the operands, count leading zeros, shift right by 5 bits and
+ // flip the bit, finally take 2's complement.
+ // (sext (setcc %a, %b, setne)) ->
+ // (neg (xor (lshr (ctlz (xor %a, %b)), 5), 1))
+ // Same as above, but the first xor is not needed.
+ // (sext (setcc %a, 0, setne)) ->
+ // (neg (xor (lshr (ctlz %a), 5), 1))
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+ SDValue ShiftOps[] =
+ { Clz, getI32Imm(27, dl), getI32Imm(5, dl), getI32Imm(31, dl) };
+ SDValue Shift =
+ SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, ShiftOps), 0);
+ SDValue Xori =
+ SDValue(CurDAG->getMachineNode(PPC::XORI, dl, MVT::i32, Shift,
+ getI32Imm(1, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Xori), 0);
+ }
+ }
+}
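+
+// Worked through with assumed values: for seteq with %a == %b, cntlzw
+// produces 32, whose only set bit (bit 5) is moved into the sign position by
+// the shift left of 58; the arithmetic shift right by 63 then smears it into
+// -1. For %a != %b the count is at most 31, bit 5 is clear, and the result
+// is 0.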
+
+/// Produces a zero-extended result of comparing two 64-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get64BitZExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // (zext (setcc %a, %b, seteq)) -> (lshr (ctlz (xor %a, %b)), 6)
+ // (zext (setcc %a, 0, seteq)) -> (lshr (ctlz %a), 6)
+ SDValue Xor = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue Clz =
+ SDValue(CurDAG->getMachineNode(PPC::CNTLZD, dl, MVT::i64, Xor), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::RLDICL, dl, MVT::i64, Clz,
+ getI64Imm(58, dl), getI64Imm(63, dl)),
+ 0);
+ }
+ }
+}
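+
+// As an assumed example: cntlzd of (xor %a, %b) is 64 exactly when the
+// operands are equal, and 64 >> 6 == 1 while any count in [0, 63] shifts to
+// 0. The RLDICL with SH=58, MB=63 is, for these values, the logical right
+// shift by 6.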
+
+/// Produces a sign-extended result of comparing two 64-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get64BitSExtCompare(SDValue LHS, SDValue RHS,
+ ISD::CondCode CC,
+ int64_t RHSValue, SDLoc dl) {
+ bool IsRHSZero = RHSValue == 0;
+ switch (CC) {
+ default: return SDValue();
+ case ISD::SETEQ: {
+ // {addc.reg, addc.CA} = (addcarry (xor %a, %b), -1)
+ // (sext (setcc %a, %b, seteq)) -> (sube addc.reg, addc.reg, addc.CA)
+ // {addcz.reg, addcz.CA} = (addcarry %a, -1)
+ // (sext (setcc %a, 0, seteq)) -> (sube addcz.reg, addcz.reg, addcz.CA)
+ SDValue AddInput = IsRHSZero ? LHS :
+ SDValue(CurDAG->getMachineNode(PPC::XOR8, dl, MVT::i64, LHS, RHS), 0);
+ SDValue Addic =
+ SDValue(CurDAG->getMachineNode(PPC::ADDIC8, dl, MVT::i64, MVT::Glue,
+ AddInput, getI32Imm(~0U, dl)), 0);
+ return SDValue(CurDAG->getMachineNode(PPC::SUBFE8, dl, MVT::i64, Addic,
+ Addic, Addic.getValue(1)), 0);
+ }
+ }
+}
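+
+// The carry trick above, worked through (illustrative): ADDIC8 computes
+// x + (-1), which sets CA exactly when x != 0. SUBFE8 r, r, r evaluates
+// ~r + r + CA == -1 + CA, i.e. all ones when %a == %b and 0 otherwise.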
+
+/// Does this SDValue have any uses for which keeping the value in a GPR is
+/// appropriate? This is meant to be used on values that have type i1, since
+/// it is somewhat meaningless to ask whether values of other types can be
+/// kept in GPRs.
+static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) {
+ assert(Compare.getOpcode() == ISD::SETCC &&
+ "An ISD::SETCC node required here.");
+
+  // For values that have a single use, the caller should obviously already
+  // have checked whether that use is an extending use. We check the other
+  // uses here.
+ if (Compare.hasOneUse())
+ return true;
+ // We want the value in a GPR if it is being extended, used for a select, or
+ // used in logical operations.
+ for (auto CompareUse : Compare.getNode()->uses())
+ if (CompareUse->getOpcode() != ISD::SIGN_EXTEND &&
+ CompareUse->getOpcode() != ISD::ZERO_EXTEND &&
+ CompareUse->getOpcode() != ISD::SELECT &&
+ !isLogicOp(CompareUse->getOpcode())) {
+ OmittedForNonExtendUses++;
+ return false;
+ }
+ return true;
+}
+
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or false
+/// value is a power of two while the other is zero.
+SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
+ SetccInGPROpts ConvOpts) {
+ assert((Compare.getOpcode() == ISD::SETCC ||
+ Compare.getOpcode() == ISD::SELECT_CC) &&
+ "An ISD::SETCC node required here.");
+
+ // Don't convert this comparison to a GPR sequence because there are uses
+ // of the i1 result (i.e. uses that require the result in the CR).
+ if ((Compare.getOpcode() == ISD::SETCC) && !allUsesExtend(Compare, CurDAG))
+ return SDValue();
+
+ SDValue LHS = Compare.getOperand(0);
+ SDValue RHS = Compare.getOperand(1);
+
+ // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+ int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+ EVT InputVT = LHS.getValueType();
+ if (InputVT != MVT::i32 && InputVT != MVT::i64)
+ return SDValue();
+
+ if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+ ConvOpts == SetccInGPROpts::SExtInvert)
+ CC = ISD::getSetCCInverse(CC, true);
+
+ bool Inputs32Bit = InputVT == MVT::i32;
+ if (ISD::isSignedIntSetCC(CC) && Inputs32Bit) {
+ LHS = signExtendInputIfNeeded(LHS);
+ RHS = signExtendInputIfNeeded(RHS);
+ } else if (ISD::isUnsignedIntSetCC(CC) && Inputs32Bit) {
+ LHS = zeroExtendInputIfNeeded(LHS);
+ RHS = zeroExtendInputIfNeeded(RHS);
+ }
+
+ SDLoc dl(Compare);
+ ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+ int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
+ bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
+ ConvOpts == SetccInGPROpts::SExtInvert;
+
+ if (IsSext && Inputs32Bit)
+ return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+ else if (Inputs32Bit)
+ return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+ else if (IsSext)
+ return get64BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+ return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+}
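+
+// As an assumed usage note: the *Invert variants let a caller that needs the
+// complement of a comparison request the inverted condition code up front,
+// rather than emitting the plain sequence followed by an extra xori to flip
+// the result.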
+
+/// Does this node represent a load/store whose address can be represented
+/// with a register plus an immediate that's a multiple of \p Val?
+bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
+ LoadSDNode *LDN = dyn_cast<LoadSDNode>(N);
+ StoreSDNode *STN = dyn_cast<StoreSDNode>(N);
+ SDValue AddrOp;
+ if (LDN)
+ AddrOp = LDN->getOperand(1);
+ else if (STN)
+ AddrOp = STN->getOperand(2);
+
+ short Imm = 0;
+ if (AddrOp.getOpcode() == ISD::ADD)
+ return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val);
+
+  // If the address comes from the outside (i.e. a CopyFromReg), the offset
+  // will be zero.
+ return AddrOp.getOpcode() == ISD::CopyFromReg;
+}
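+
+// Presumably useful when selecting DS/DQ-form memory accesses (e.g. ld and
+// std, whose 16-bit displacement must be a multiple of 4), where an
+// unsuitable immediate would force an indexed-form instruction instead.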
+
void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
// Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -2450,7 +3030,6 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
}
-
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -2474,19 +3053,24 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
- case ISD::Constant: {
+ case ISD::Constant:
if (N->getValueType(0) == MVT::i64) {
ReplaceNode(N, getInt64(CurDAG, N));
return;
}
break;
- }
- case ISD::SETCC: {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ if (tryEXTEND(N))
+ return;
+ break;
+
+ case ISD::SETCC:
if (trySETCC(N))
return;
break;
- }
+
case PPCISD::GlobalBaseReg:
ReplaceNode(N, getGlobalBaseReg());
return;
@@ -2502,11 +3086,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
- case PPCISD::READ_TIME_BASE: {
+ case PPCISD::READ_TIME_BASE:
ReplaceNode(N, CurDAG->getMachineNode(PPC::ReadTB, dl, MVT::i32, MVT::i32,
MVT::Other, N->getOperand(0)));
return;
- }
case PPCISD::SRA_ADDZE: {
SDValue N0 = N->getOperand(0);
@@ -2626,6 +3209,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
case ISD::AND: {
+ if (tryLogicOpOfCompares(N))
+ return;
+
unsigned Imm, Imm2, SH, MB, ME;
uint64_t Imm64;
@@ -2690,6 +3276,19 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
return;
}
+    // If this is a negated 64-bit zero-extension mask, i.e. the immediate is
+    // a sequence of ones from the most significant side and all zeroes for
+    // the remainder, we should use rldicr.
+ if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+ isMask_64(~Imm64)) {
+ SDValue Val = N->getOperand(0);
+ MB = 63 - countTrailingOnes(~Imm64);
+ SH = 0;
+ SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+ CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+ return;
+ }
+
// AND X, 0 -> 0, not "rlwinm 32".
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
ReplaceUses(SDValue(N, 0), N->getOperand(1));
@@ -2732,15 +3331,18 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
if (tryBitfieldInsert(N))
return;
- short Imm;
+ if (tryLogicOpOfCompares(N))
+ return;
+
+ int16_t Imm;
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
isIntS16Immediate(N->getOperand(1), Imm)) {
- APInt LHSKnownZero, LHSKnownOne;
- CurDAG->computeKnownBits(N->getOperand(0), LHSKnownZero, LHSKnownOne);
+ KnownBits LHSKnown;
+ CurDAG->computeKnownBits(N->getOperand(0), LHSKnown);
// If this is equivalent to an add, then we can fold it with the
// FrameIndex calculation.
- if ((LHSKnownZero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) {
+ if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)Imm) == ~0ULL) {
selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
return;
}
@@ -2749,8 +3351,13 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// Other cases are autogenerated.
break;
}
+ case ISD::XOR: {
+ if (tryLogicOpOfCompares(N))
+ return;
+ break;
+ }
case ISD::ADD: {
- short Imm;
+ int16_t Imm;
if (N->getOperand(0)->getOpcode() == ISD::FrameIndex &&
isIntS16Immediate(N->getOperand(1), Imm)) {
selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm);
@@ -2911,8 +3518,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
return;
}
-
break;
+
case ISD::VECTOR_SHUFFLE:
if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
N->getValueType(0) == MVT::v2i64)) {
@@ -2940,7 +3547,11 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) {
SDValue Chain = LD->getChain();
SDValue Ops[] = { Base, Offset, Chain };
- CurDAG->SelectNodeTo(N, PPC::LXVDSX, N->getValueType(0), Ops);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = LD->getMemOperand();
+ SDNode *NewN = CurDAG->SelectNodeTo(N, PPC::LXVDSX,
+ N->getValueType(0), Ops);
+ cast<MachineSDNode>(NewN)->setMemRefs(MemOp, MemOp + 1);
return;
}
}
@@ -3088,7 +3699,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SDValue(Tmp, 0), GA));
return;
}
- case PPCISD::PPC32_PICGOT: {
+ case PPCISD::PPC32_PICGOT:
// Generate a PIC-safe GOT reference.
assert(!PPCSubTarget->isPPC64() && PPCSubTarget->isSVR4ABI() &&
"PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
@@ -3096,7 +3707,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
PPCLowering->getPointerTy(CurDAG->getDataLayout()),
MVT::i32);
return;
- }
+
case PPCISD::VADD_SPLAT: {
// This expands into one of three sequences, depending on whether
// the first operand is odd or even, positive or negative.
@@ -3139,7 +3750,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
SDValue TmpVal = SDValue(Tmp, 0);
ReplaceNode(N, CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal));
return;
-
} else if (Elt > 0) {
// Elt is odd and positive, in the range [17,31].
//
@@ -3154,7 +3764,6 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
ReplaceNode(N, CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
SDValue(Tmp2, 0)));
return;
-
} else {
// Elt is odd and negative, in the range [-31,-17].
//
@@ -3199,7 +3808,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
EVT VT = N->getValueType(0);
SDValue RHS, LHS;
- bool BytesFound[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+ bool BytesFound[8] = {false, false, false, false, false, false, false, false};
uint64_t Mask = 0, Alt = 0;
auto IsByteSelectCC = [this](SDValue O, unsigned &b,
@@ -3436,11 +4045,13 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
O0.getNode(), O1.getNode());
};
+ // FIXME: When the semantics of the interaction between select and undef
+ // are clearly defined, it may turn out to be unnecessary to break here.
SDValue TrueRes = TryFold(ConstTrue);
- if (!TrueRes)
+ if (!TrueRes || TrueRes.isUndef())
break;
SDValue FalseRes = TryFold(ConstFalse);
- if (!FalseRes)
+ if (!FalseRes || FalseRes.isUndef())
break;
// For us to materialize these using one instruction, we must be able to
@@ -3499,7 +4110,6 @@ void PPCDAGToDAGISel::PreprocessISelDAG() {
/// PostprocessISelDAG - Perform some late peephole optimizations
/// on the DAG representation.
void PPCDAGToDAGISel::PostprocessISelDAG() {
-
// Skip peepholes at -O0.
if (TM.getOptLevel() == CodeGenOpt::None)
return;
@@ -3515,10 +4125,6 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
// be folded with the isel so that we don't need to materialize a register
// containing zero.
bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
- // If we're not using isel, then this does not matter.
- if (!PPCSubTarget->hasISEL())
- return false;
-
for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
UI != UE; ++UI) {
SDNode *User = *UI;
@@ -4520,10 +5126,10 @@ void PPCDAGToDAGISel::PeepholePPC64() {
}
}
-
/// createPPCISelDag - This pass converts a legalized DAG into a
/// PowerPC-specific DAG, ready for instruction scheduling.
///
-FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
- return new PPCDAGToDAGISel(TM);
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new PPCDAGToDAGISel(TM, OptLevel);
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2b9195b..b3a3c73 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -13,37 +13,87 @@
#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
-#include "PPCCallingConv.h"
+#include "PPC.h"
#include "PPCCCState.h"
+#include "PPCCallingConv.h"
+#include "PPCFrameLowering.h"
+#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
+#include "PPCRegisterInfo.h"
+#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
-#include "PPCTargetObjectFile.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
#include <list>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -86,7 +136,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
}
- // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+ // Match BITREVERSE to customized fast code sequence in the td file.
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+
+ // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
@@ -125,7 +179,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
}
- // PowerPC does not support direct load / store of condition registers
+ // PowerPC does not support direct load/store of condition registers.
setOperationAction(ISD::LOAD, MVT::i1, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -154,11 +208,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
setOperationAction(ISD::FREM, MVT::ppcf128, Expand);
- // PowerPC has no SREM/UREM instructions
- setOperationAction(ISD::SREM, MVT::i32, Expand);
- setOperationAction(ISD::UREM, MVT::i32, Expand);
- setOperationAction(ISD::SREM, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
+  // PowerPC has no SREM/UREM instructions unless we are on P9. On P9, we may
+  // use a hardware instruction to compute the remainder. The instructions are
+  // not legalized directly because in the cases where the result of both the
+  // remainder and the division is required, it is more efficient to compute
+  // the remainder from the result of the division rather than use the
+  // remainder instruction.
+ if (Subtarget.isISA3_0()) {
+ setOperationAction(ISD::SREM, MVT::i32, Custom);
+ setOperationAction(ISD::UREM, MVT::i32, Custom);
+ setOperationAction(ISD::SREM, MVT::i64, Custom);
+ setOperationAction(ISD::UREM, MVT::i64, Custom);
+ } else {
+ setOperationAction(ISD::SREM, MVT::i32, Expand);
+ setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
+ }
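+
+  // For instance (illustrative, not taken from this change): on an ISA 3.0
+  // target a standalone "a % b" can select modsw/modsd (or moduw/modud for
+  // unsigned), while "a / b" and "a % b" together are better lowered as the
+  // division plus a multiply-and-subtract.
+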
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
@@ -360,6 +426,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// To handle counter-based loop conditions.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
// Comparisons that require checking two conditions.
setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
@@ -484,7 +555,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FABS, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
setOperationAction(ISD::FFLOOR, VT, Expand);
setOperationAction(ISD::FCEIL, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -634,6 +704,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRA, MVT::v2i64, Legal);
setOperationAction(ISD::SRL, MVT::v2i64, Legal);
+    // 128-bit shifts can be accomplished via 3 instructions for SHL and
+    // SRL, but not for SRA because of the instructions available:
+    // VS{RL} and VS{RL}O. However, due to direct move costs, it's not
+    // worth doing.
+ setOperationAction(ISD::SHL, MVT::v1i128, Expand);
+ setOperationAction(ISD::SRL, MVT::v1i128, Expand);
+ setOperationAction(ISD::SRA, MVT::v1i128, Expand);
+
setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
}
else {
@@ -687,6 +765,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (Subtarget.hasP9Vector()) {
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+      // 128-bit shifts can be accomplished via 3 instructions for SHL and
+ // SRL, but not for SRA because of the instructions available:
+ // VS{RL} and VS{RL}O.
+ setOperationAction(ISD::SHL, MVT::v1i128, Legal);
+ setOperationAction(ISD::SRL, MVT::v1i128, Legal);
+ setOperationAction(ISD::SRA, MVT::v1i128, Expand);
}
}
@@ -728,7 +813,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FABS , MVT::v4f64, Legal);
setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
- setOperationAction(ISD::FPOWI , MVT::v4f64, Expand);
setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
@@ -774,7 +858,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FABS , MVT::v4f32, Legal);
setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
- setOperationAction(ISD::FPOWI , MVT::v4f32, Expand);
setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
@@ -873,6 +956,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
// We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -971,6 +1057,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
MaxStoresPerMemset = 128;
MaxStoresPerMemcpy = 128;
MaxStoresPerMemmove = 128;
+ MaxLoadsPerMemcmp = 128;
+ } else {
+ MaxLoadsPerMemcmp = 8;
+ MaxLoadsPerMemcmpOptSize = 4;
}
}
@@ -1042,6 +1132,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
case PPCISD::XXINSERT: return "PPCISD::XXINSERT";
+ case PPCISD::XXREVERSE: return "PPCISD::XXREVERSE";
+ case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
case PPCISD::CMPB: return "PPCISD::CMPB";
case PPCISD::Hi: return "PPCISD::Hi";
@@ -1080,6 +1172,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
case PPCISD::STXSIX: return "PPCISD::STXSIX";
case PPCISD::VEXTS: return "PPCISD::VEXTS";
+ case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
@@ -1523,21 +1616,47 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
return true;
}
-bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
- unsigned &InsertAtByte, bool &Swap, bool IsLE) {
-
- // Check that the mask is shuffling words
- for (unsigned i = 0; i < 4; ++i) {
- unsigned B0 = N->getMaskElt(i*4);
- unsigned B1 = N->getMaskElt(i*4+1);
- unsigned B2 = N->getMaskElt(i*4+2);
- unsigned B3 = N->getMaskElt(i*4+3);
- if (B0 % 4)
+/// Check that the mask is shuffling N-byte elements. Within each N-byte
+/// element of the mask, the indices can be in either increasing or
+/// decreasing order as long as they are consecutive.
+/// \param[in] N the shuffle vector SD Node to analyze
+/// \param[in] Width the element width in bytes, one of 2/4/8/16 (HalfWord/
+/// Word/DoubleWord/QuadWord)
+/// \param[in] StepLen the delta between consecutive indices within an
+/// element: 1 if the mask is increasing, -1 if it is decreasing
+/// \return true iff the mask is shuffling N-byte elements.
+static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
+ int StepLen) {
+ assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
+ "Unexpected element width.");
+  assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");
+
+ unsigned NumOfElem = 16 / Width;
+ unsigned MaskVal[16]; // Width is never greater than 16
+ for (unsigned i = 0; i < NumOfElem; ++i) {
+ MaskVal[0] = N->getMaskElt(i * Width);
+ if ((StepLen == 1) && (MaskVal[0] % Width)) {
return false;
- if (B1 != B0+1 || B2 != B1+1 || B3 != B2+1)
+ } else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
return false;
+ }
+
+ for (unsigned int j = 1; j < Width; ++j) {
+ MaskVal[j] = N->getMaskElt(i * Width + j);
+ if (MaskVal[j] != MaskVal[j-1] + StepLen) {
+ return false;
+ }
+ }
}
+ return true;
+}
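+
+// Illustrative masks (assumed, not from a regression test): with Width == 4
+// and StepLen == 1, <4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11> passes, since
+// every word is four consecutive increasing bytes starting on a word
+// boundary; with StepLen == -1, a byte-reversing mask such as
+// <3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12> passes instead.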
+
+bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ unsigned &InsertAtByte, bool &Swap, bool IsLE) {
+ if (!isNByteElemShuffleMask(N, 4, 1))
+ return false;
+
// Now we look at mask elements 0,4,8,12
unsigned M0 = N->getMaskElt(0) / 4;
unsigned M1 = N->getMaskElt(4) / 4;
@@ -1608,6 +1727,158 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
return false;
}
+bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ bool &Swap, bool IsLE) {
+ assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
+ // Ensure each byte index of the word is consecutive.
+ if (!isNByteElemShuffleMask(N, 4, 1))
+ return false;
+
+ // Now we look at mask elements 0,4,8,12, which are the beginning of words.
+ unsigned M0 = N->getMaskElt(0) / 4;
+ unsigned M1 = N->getMaskElt(4) / 4;
+ unsigned M2 = N->getMaskElt(8) / 4;
+ unsigned M3 = N->getMaskElt(12) / 4;
+
+ // If both vector operands for the shuffle are the same vector, the mask will
+ // contain only elements from the first one and the second one will be undef.
+ if (N->getOperand(1).isUndef()) {
+ assert(M0 < 4 && "Indexing into an undef vector?");
+ if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
+ return false;
+
+ ShiftElts = IsLE ? (4 - M0) % 4 : M0;
+ Swap = false;
+ return true;
+ }
+
+ // Ensure each word index of the ShuffleVector Mask is consecutive.
+ if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
+ return false;
+
+ if (IsLE) {
+ if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
+ // Input vectors don't need to be swapped if the leading element
+      // of the result is one of the 3 leftmost elements of the second vector
+ // (or if there is no shift to be done at all).
+ Swap = false;
+ ShiftElts = (8 - M0) % 8;
+ } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
+ // Input vectors need to be swapped if the leading element
+      // of the result is one of the 3 leftmost elements of the first vector
+ // (or if we're shifting by 4 - thereby simply swapping the vectors).
+ Swap = true;
+ ShiftElts = (4 - M0) % 4;
+ }
+
+ return true;
+ } else { // BE
+ if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
+ // Input vectors don't need to be swapped if the leading element
+ // of the result is one of the 4 elements of the first vector.
+ Swap = false;
+ ShiftElts = M0;
+ } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
+ // Input vectors need to be swapped if the leading element
+      // of the result is one of the 4 elements of the second vector.
+ Swap = true;
+ ShiftElts = M0 - 4;
+ }
+
+ return true;
+ }
+}
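+
+// An assumed little-endian example: a mask whose words are <6,7,0,1> (bytes
+// <24..27, 28..31, 0..3, 4..7>) passes the checks above with M0 == 6, so the
+// inputs are not swapped and ShiftElts = (8 - 6) % 8 == 2.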
+
+static bool isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
+ assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
+
+ if (!isNByteElemShuffleMask(N, Width, -1))
+ return false;
+
+ for (int i = 0; i < 16; i += Width)
+ if (N->getMaskElt(i) != i + Width - 1)
+ return false;
+
+ return true;
+}
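+
+// For example (illustrative): isXXBRHShuffleMask accepts the halfword
+// byte-reversal mask <1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14>, where
+// each 2-byte element is decreasing-consecutive and mask[i] == i + Width - 1
+// at each element start.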
+
+bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 2);
+}
+
+bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 4);
+}
+
+bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 8);
+}
+
+bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
+ return isXXBRShuffleMaskHelper(N, 16);
+}
+
+/// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
+/// if the inputs to the instruction should be swapped and set \p DM to the
+/// value for the immediate.
+/// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
+/// AND element 0 of the result comes from the first input (LE) or second input
+/// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
+/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
+/// mask.
+bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
+ bool &Swap, bool IsLE) {
+ assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
+
+ // Ensure each byte index of the double word is consecutive.
+ if (!isNByteElemShuffleMask(N, 8, 1))
+ return false;
+
+ unsigned M0 = N->getMaskElt(0) / 8;
+ unsigned M1 = N->getMaskElt(8) / 8;
+ assert(((M0 | M1) < 4) && "A mask element out of bounds?");
+
+ // If both vector operands for the shuffle are the same vector, the mask will
+ // contain only elements from the first one and the second one will be undef.
+ if (N->getOperand(1).isUndef()) {
+ if ((M0 | M1) < 2) {
+ DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
+ Swap = false;
+ return true;
+ } else
+ return false;
+ }
+
+ if (IsLE) {
+ if (M0 > 1 && M1 < 2) {
+ Swap = false;
+ } else if (M0 < 2 && M1 > 1) {
+ M0 = (M0 + 2) % 4;
+ M1 = (M1 + 2) % 4;
+ Swap = true;
+ } else
+ return false;
+
+    // Note: if control flow reaches here, Swap has already been set above.
+ DM = (((~M1) & 1) << 1) + ((~M0) & 1);
+ return true;
+ } else { // BE
+ if (M0 < 2 && M1 > 1) {
+ Swap = false;
+ } else if (M0 > 1 && M1 < 2) {
+ M0 = (M0 + 2) % 4;
+ M1 = (M1 + 2) % 4;
+ Swap = true;
+ } else
+ return false;
+
+    // Note: if control flow reaches here, Swap has already been set above.
+ DM = (M0 << 1) + (M1 & 1);
+ return true;
+ }
+}
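+
+// A worked (assumed) little-endian example: a byte mask of <16..23, 0..7>
+// (doubleword 2 of the pair followed by doubleword 0) gives M0 == 2 and
+// M1 == 0, so the inputs are not swapped and
+// DM = (((~M1) & 1) << 1) + ((~M0) & 1) == 3.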
+
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
@@ -1643,7 +1914,6 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
// If the element isn't a constant, bail fully out.
if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
-
if (!UniquedVals[i&(Multiple-1)].getNode())
UniquedVals[i&(Multiple-1)] = N->getOperand(i);
else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
@@ -1763,17 +2033,17 @@ int PPC::isQVALIGNIShuffleMask(SDNode *N) {
/// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value. If so, this returns true and the
/// immediate.
-static bool isIntS16Immediate(SDNode *N, short &Imm) {
+bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
if (!isa<ConstantSDNode>(N))
return false;
- Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+ Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
if (N->getValueType(0) == MVT::i32)
return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
else
return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}
-static bool isIntS16Immediate(SDValue Op, short &Imm) {
+bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
@@ -1783,7 +2053,7 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) {
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
- short imm = 0;
+ int16_t imm = 0;
if (N.getOpcode() == ISD::ADD) {
if (isIntS16Immediate(N.getOperand(1), imm))
return false; // r+i
@@ -1800,17 +2070,14 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are provably
// disjoint.
- APInt LHSKnownZero, LHSKnownOne;
- APInt RHSKnownZero, RHSKnownOne;
- DAG.computeKnownBits(N.getOperand(0),
- LHSKnownZero, LHSKnownOne);
-
- if (LHSKnownZero.getBoolValue()) {
- DAG.computeKnownBits(N.getOperand(1),
- RHSKnownZero, RHSKnownOne);
+ KnownBits LHSKnown, RHSKnown;
+ DAG.computeKnownBits(N.getOperand(0), LHSKnown);
+
+ if (LHSKnown.Zero.getBoolValue()) {
+ DAG.computeKnownBits(N.getOperand(1), RHSKnown);
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
- if (~(LHSKnownZero | RHSKnownZero) == 0) {
+ if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
@@ -1863,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
-/// represented as reg+reg. If Aligned is true, only accept displacements
-/// suitable for STD and friends, i.e. multiples of 4.
+/// represented as reg+reg. If \p Alignment is non-zero, only accept
+/// displacements that are multiples of that value.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
SDValue &Base,
SelectionDAG &DAG,
- bool Aligned) const {
+ unsigned Alignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
// If this can be more profitably realized as r+r, fail.
@@ -1876,9 +2143,9 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
return false;
if (N.getOpcode() == ISD::ADD) {
- short imm = 0;
+ int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!Aligned || (imm & 3) == 0)) {
+ (!Alignment || (imm % Alignment) == 0)) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
@@ -1900,16 +2167,16 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
return true; // [&g+r]
}
} else if (N.getOpcode() == ISD::OR) {
- short imm = 0;
+ int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!Aligned || (imm & 3) == 0)) {
+ (!Alignment || (imm % Alignment) == 0)) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
- APInt LHSKnownZero, LHSKnownOne;
- DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
+ KnownBits LHSKnown;
+ DAG.computeKnownBits(N.getOperand(0), LHSKnown);
- if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
+ if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
// If all of the bits are known zero on the LHS or RHS, the add won't
// carry.
if (FrameIndexSDNode *FI =
@@ -1928,8 +2195,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// If this address fits entirely in a 16-bit sext immediate field, codegen
// this as "d, 0"
- short Imm;
- if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
+ int16_t Imm;
+ if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
@@ -1939,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
- (!Aligned || (CN->getZExtValue() & 3) == 0)) {
+ (!Alignment || (CN->getZExtValue() % Alignment) == 0)) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
@@ -1973,10 +2240,15 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
if (SelectAddressRegReg(N, Base, Index, DAG))
return true;
- // If the operand is an addition, always emit this as [r+r], since this is
- // better (for code size, and execution, as the memop does the add for free)
- // than emitting an explicit add.
- if (N.getOpcode() == ISD::ADD) {
+ // If the address is the result of an add, we will utilize the fact that the
+ // address calculation includes an implicit add. However, we can reduce
+ // register pressure if we do not materialize a constant just for use as the
+  // index register. We only get rid of the add if it is not an add of a
+  // value and a 16-bit signed constant where both operands have a single use.
+ int16_t imm = 0;
+ if (N.getOpcode() == ISD::ADD &&
+ (!isIntS16Immediate(N.getOperand(1), imm) ||
+ !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
@@ -2026,7 +2298,6 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
}
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
-
// Common code will reject creating a pre-inc form if the base pointer
// is a frame index, or if N is a store and the base pointer is either
// the same as or a predecessor of the value being stored. Check for
@@ -2050,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
// LDU/STU can only handle immediates that are a multiple of 4.
if (VT != MVT::i64) {
- if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
return false;
} else {
// LDU/STU need an address with at least 4-byte alignment.
if (Alignment < 4)
return false;
- if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
return false;
}
@@ -2277,7 +2548,6 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
-
// FIXME: TLS addresses currently use medium model code sequences,
// which is the most useful form. Eventually support for small and
// large models could be added if users need it, at the cost of
@@ -2300,8 +2570,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
PPCII::MO_TPREL_HA);
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
PPCII::MO_TPREL_LO);
- SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
- is64bit ? MVT::i64 : MVT::i32);
+ SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
+ : DAG.getRegister(PPC::R2, MVT::i32);
+
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
}
@@ -2602,10 +2873,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__trampoline_setup", PtrVT),
- std::move(Args));
+ CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+ CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.second;
@@ -2737,7 +3007,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
return false;
}
-bool
+bool
llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
@@ -2752,7 +3022,7 @@ llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
unsigned RegNum = State.getFirstUnallocated(ArgRegs);
int RegsLeft = NumArgRegs - RegNum;
- // Skip if there is not enough registers left for long double type (4 gpr regs
+  // Skip if there are not enough registers left for long double type (4 gpr regs
// in soft float mode) and put long double argument on the stack.
if (RegNum != NumArgRegs && RegsLeft < 4) {
for (int i = 0; i < RegsLeft; i++) {
@@ -4066,7 +4336,7 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
static bool
hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
- if (CS->arg_size() != CallerFn->getArgumentList().size())
+ if (CS->arg_size() != CallerFn->arg_size())
return false;
ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
@@ -4222,11 +4492,12 @@ namespace {
struct TailCallArgumentInfo {
SDValue Arg;
SDValue FrameIdxOp;
- int FrameIdx;
+ int FrameIdx = 0;
- TailCallArgumentInfo() : FrameIdx(0) {}
+ TailCallArgumentInfo() = default;
};
-}
+
+} // end anonymous namespace
/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void StoreTailCallArgumentsToStackSlot(
@@ -4406,7 +4677,6 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
ImmutableCallSite *CS, const PPCSubtarget &Subtarget) {
-
bool isPPC64 = Subtarget.isPPC64();
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
@@ -4602,7 +4872,6 @@ SDValue PPCTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
-
SmallVector<CCValAssign, 16> RVLocs;
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
@@ -4649,7 +4918,6 @@ SDValue PPCTargetLowering::FinishCall(
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals, ImmutableCallSite *CS) const {
-
std::vector<EVT> NodeTys;
SmallVector<SDValue, 8> Ops;
unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, CallSeqStart, dl,
@@ -4909,8 +5177,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be moved somewhere else
@@ -4960,9 +5227,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
Flags, DAG, dl);
// This must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
Chain = CallSeqStart = NewCallSeqStart;
@@ -5043,9 +5309,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
// The MEMCPY must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1),
- SDLoc(MemcpyCall));
+ int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
+ SDLoc(MemcpyCall));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
NewCallSeqStart.getNode());
return NewCallSeqStart;
@@ -5059,7 +5325,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite *CS) const {
-
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
@@ -5105,10 +5370,30 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
};
const unsigned NumGPRs = array_lengthof(GPR);
- const unsigned NumFPRs = 13;
+ const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
const unsigned NumVRs = array_lengthof(VR);
const unsigned NumQFPRs = NumFPRs;
+ // On ELFv2, we can avoid allocating the parameter area if all the arguments
+ // can be passed to the callee in registers.
+ // For the fast calling convention, there is another check below.
+ // Note: We should keep consistent with LowerFormalArguments_64SVR4()
+ bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
+ if (!HasParameterArea) {
+ unsigned ParamAreaSize = NumGPRs * PtrByteSize;
+ unsigned AvailableFPRs = NumFPRs;
+ unsigned AvailableVRs = NumVRs;
+ unsigned NumBytesTmp = NumBytes;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ if (Outs[i].Flags.isNest()) continue;
+ if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
+ PtrByteSize, LinkageSize, ParamAreaSize,
+ NumBytesTmp, AvailableFPRs, AvailableVRs,
+ Subtarget.hasQPX()))
+ HasParameterArea = true;
+ }
+ }
+
// When using the fast calling convention, we don't provide backing for
// arguments that will be in registers.
unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
@@ -5176,13 +5461,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
unsigned NumBytesActuallyUsed = NumBytes;
- // The prolog code of the callee may store up to 8 GPR argument registers to
+ // In the old ELFv1 ABI,
+ // the prolog code of the callee may store up to 8 GPR argument registers to
// the stack, allowing va_start to index over them in memory if it is varargs.
// Because we cannot tell if this is needed on the caller side, we have to
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
- // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
- NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+ // In the ELFv2 ABI, we allocate the parameter area iff a callee
+ // really requires memory operands, e.g. a vararg function.
+ if (HasParameterArea)
+ NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+ else
+ NumBytes = LinkageSize;
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
@@ -5204,8 +5494,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getIntPtrConstant(NumBytes, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
@@ -5401,6 +5690,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (CallConv == CallingConv::Fast)
ComputePtrOff();
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
@@ -5486,6 +5777,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
}
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, false, MemOpChains,
TailCallArguments, dl);
@@ -5520,6 +5813,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// GPRs when within range. For now, we always put the value in both
// locations (or even all three).
if (isVarArg) {
+ assert(HasParameterArea &&
+ "Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
@@ -5552,6 +5847,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (CallConv == CallingConv::Fast)
ComputePtrOff();
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
TailCallArguments, dl);
@@ -5572,6 +5869,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v4i1: {
bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
if (isVarArg) {
+ assert(HasParameterArea &&
+ "Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
// entirely in R registers. Maybe later.
SDValue Store =
@@ -5604,6 +5903,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (CallConv == CallingConv::Fast)
ComputePtrOff();
+ assert(HasParameterArea &&
+ "Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
true, isTailCall, true, MemOpChains,
TailCallArguments, dl);
@@ -5618,7 +5919,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
}
}
- assert(NumBytesActuallyUsed == ArgOffset);
+ assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
+ "mismatch in size of parameter area");
(void)NumBytesActuallyUsed;
if (!MemOpChains.empty())
@@ -5673,7 +5975,6 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
ImmutableCallSite *CS) const {
-
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -5752,8 +6053,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SDValue CallSeqStart = Chain;
// Load the return address and frame pointer so it can be move somewhere else
@@ -6065,7 +6365,6 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &dl, SelectionDAG &DAG) const {
-
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
@@ -6133,7 +6432,7 @@ PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
- // Get the corect type for integers.
+ // Get the correct type for integers.
EVT IntVT = Op.getValueType();
// Get the inputs.
@@ -6150,7 +6449,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
// When we pop the dynamic allocation we need to restore the SP link.
SDLoc dl(Op);
- // Get the corect type for pointers.
+ // Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Construct the stack pointer operand.
@@ -6225,7 +6524,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Size = Op.getOperand(1);
SDLoc dl(Op);
- // Get the corect type for pointers.
+ // Get the correct type for pointers.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
// Negate the size.
SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
@@ -6356,6 +6655,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
+ LLVM_FALLTHROUGH;
case ISD::SETEQ:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
@@ -6367,6 +6667,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETULT:
case ISD::SETLT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
+ LLVM_FALLTHROUGH;
case ISD::SETOGE:
case ISD::SETGE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
@@ -6375,6 +6676,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETUGT:
case ISD::SETGT:
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
+ LLVM_FALLTHROUGH;
case ISD::SETOLE:
case ISD::SETLE:
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
@@ -6388,8 +6690,9 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
default: break; // SETUO etc aren't handled by fsel.
case ISD::SETNE:
std::swap(TV, FV);
+ LLVM_FALLTHROUGH;
case ISD::SETEQ:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
@@ -6399,25 +6702,25 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
case ISD::SETULT:
case ISD::SETLT:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOGE:
case ISD::SETGE:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
case ISD::SETUGT:
case ISD::SETGT:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
case ISD::SETOLE:
case ISD::SETLE:
- Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags);
+ Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
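The fsel lowering above works because fsel is natively a setge select: fsel(c, t, f) yields t when c >= 0.0, and NaNs are unordered, hence the "SETUO etc aren't handled" escape. A minimal stand-alone model of the SETLT case, with invented names, purely for illustration:

    #include <cassert>

    // Illustrative model only: real fsel ignores NaN ordering, which is why
    // the unordered predicates bail out above.
    static double fsel(double C, double T, double F) { return C >= 0.0 ? T : F; }

    int main() {
      double A = 1.0, B = 2.0, TV = 10.0, FV = 20.0;
      // select_cc setlt: (A < B) ? TV : FV  ==  fsel(A - B, FV, TV)
      assert(((A < B) ? TV : FV) == fsel(A - B, FV, TV));
      return 0;
    }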
@@ -6585,6 +6888,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
+// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
SDValue NewResChain,
SelectionDAG &DAG) const {
@@ -7585,6 +7889,53 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+
+ if (Subtarget.hasVSX() &&
+ PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
+ if (Swap)
+ std::swap(V1, V2);
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue Conv2 =
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);
+
+ SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
+ }
+
+ if (Subtarget.hasVSX() &&
+ PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
+ if (Swap)
+ std::swap(V1, V2);
+ SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
+ SDValue Conv2 =
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);
+
+ SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
+ DAG.getConstant(ShiftElts, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
+ }
+
+ if (Subtarget.hasP9Vector()) {
+ if (PPC::isXXBRHShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
+ SDValue ReveHWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v8i16, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
+ } else if (PPC::isXXBRWShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
+ SDValue ReveWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v4i32, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
+ } else if (PPC::isXXBRDShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
+ SDValue ReveDWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v2i64, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
+ } else if (PPC::isXXBRQShuffleMask(SVOp)) {
+ SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
+ SDValue ReveQWord = DAG.getNode(PPCISD::XXREVERSE, dl, MVT::v1i128, Conv);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
+ }
+ }
+
if (Subtarget.hasVSX()) {
if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG);
@@ -7612,7 +7963,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
}
-
}
if (Subtarget.hasQPX()) {
@@ -7792,24 +8142,39 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
bool &isDot, const PPCSubtarget &Subtarget) {
unsigned IntrinsicID =
- cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+ cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
CompareOpc = -1;
isDot = false;
switch (IntrinsicID) {
- default: return false;
- // Comparison predicates.
- case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
+ default:
+ return false;
+ // Comparison predicates.
+ case Intrinsic::ppc_altivec_vcmpbfp_p:
+ CompareOpc = 966;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpeqfp_p:
+ CompareOpc = 198;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequb_p:
+ CompareOpc = 6;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequh_p:
+ CompareOpc = 70;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequw_p:
+ CompareOpc = 134;
+ isDot = true;
+ break;
case Intrinsic::ppc_altivec_vcmpequd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 199;
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
case Intrinsic::ppc_altivec_vcmpneb_p:
case Intrinsic::ppc_altivec_vcmpneh_p:
@@ -7818,45 +8183,80 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpnezh_p:
case Intrinsic::ppc_altivec_vcmpnezw_p:
if (Subtarget.hasP9Altivec()) {
- switch(IntrinsicID) {
- default: llvm_unreachable("Unknown comparison intrinsic.");
- case Intrinsic::ppc_altivec_vcmpneb_p: CompareOpc = 7; break;
- case Intrinsic::ppc_altivec_vcmpneh_p: CompareOpc = 71; break;
- case Intrinsic::ppc_altivec_vcmpnew_p: CompareOpc = 135; break;
- case Intrinsic::ppc_altivec_vcmpnezb_p: CompareOpc = 263; break;
- case Intrinsic::ppc_altivec_vcmpnezh_p: CompareOpc = 327; break;
- case Intrinsic::ppc_altivec_vcmpnezw_p: CompareOpc = 391; break;
+ switch (IntrinsicID) {
+ default:
+ llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpneb_p:
+ CompareOpc = 7;
+ break;
+ case Intrinsic::ppc_altivec_vcmpneh_p:
+ CompareOpc = 71;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnew_p:
+ CompareOpc = 135;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezb_p:
+ CompareOpc = 263;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezh_p:
+ CompareOpc = 327;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezw_p:
+ CompareOpc = 391;
+ break;
}
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
- case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgefp_p:
+ CompareOpc = 454;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtfp_p:
+ CompareOpc = 710;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsb_p:
+ CompareOpc = 774;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsh_p:
+ CompareOpc = 838;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsw_p:
+ CompareOpc = 902;
+ isDot = true;
+ break;
case Intrinsic::ppc_altivec_vcmpgtsd_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 967;
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
- case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
- case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
+ case Intrinsic::ppc_altivec_vcmpgtub_p:
+ CompareOpc = 518;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuh_p:
+ CompareOpc = 582;
+ isDot = true;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuw_p:
+ CompareOpc = 646;
+ isDot = true;
+ break;
case Intrinsic::ppc_altivec_vcmpgtud_p:
if (Subtarget.hasP8Altivec()) {
CompareOpc = 711;
- isDot = 1;
+ isDot = true;
} else
return false;
-
break;
- // VSX predicate comparisons use the same infrastructure
+
+ // VSX predicate comparisons use the same infrastructure
case Intrinsic::ppc_vsx_xvcmpeqdp_p:
case Intrinsic::ppc_vsx_xvcmpgedp_p:
case Intrinsic::ppc_vsx_xvcmpgtdp_p:
@@ -7865,33 +8265,51 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_vsx_xvcmpgtsp_p:
if (Subtarget.hasVSX()) {
switch (IntrinsicID) {
- case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break;
- case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break;
- case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break;
- case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break;
- case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break;
- case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break;
+ case Intrinsic::ppc_vsx_xvcmpeqdp_p:
+ CompareOpc = 99;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgedp_p:
+ CompareOpc = 115;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgtdp_p:
+ CompareOpc = 107;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpeqsp_p:
+ CompareOpc = 67;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgesp_p:
+ CompareOpc = 83;
+ break;
+ case Intrinsic::ppc_vsx_xvcmpgtsp_p:
+ CompareOpc = 75;
+ break;
}
- isDot = 1;
- }
- else
+ isDot = true;
+ } else
return false;
-
break;
- // Normal Comparisons.
- case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break;
+ // Normal Comparisons.
+ case Intrinsic::ppc_altivec_vcmpbfp:
+ CompareOpc = 966;
+ break;
+ case Intrinsic::ppc_altivec_vcmpeqfp:
+ CompareOpc = 198;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequb:
+ CompareOpc = 6;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequh:
+ CompareOpc = 70;
+ break;
+ case Intrinsic::ppc_altivec_vcmpequw:
+ CompareOpc = 134;
+ break;
case Intrinsic::ppc_altivec_vcmpequd:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasP8Altivec())
CompareOpc = 199;
- isDot = 0;
- } else
+ else
return false;
-
break;
case Intrinsic::ppc_altivec_vcmpneb:
case Intrinsic::ppc_altivec_vcmpneh:
@@ -7899,43 +8317,67 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
case Intrinsic::ppc_altivec_vcmpnezb:
case Intrinsic::ppc_altivec_vcmpnezh:
case Intrinsic::ppc_altivec_vcmpnezw:
- if (Subtarget.hasP9Altivec()) {
+ if (Subtarget.hasP9Altivec())
switch (IntrinsicID) {
- default: llvm_unreachable("Unknown comparison intrinsic.");
- case Intrinsic::ppc_altivec_vcmpneb: CompareOpc = 7; break;
- case Intrinsic::ppc_altivec_vcmpneh: CompareOpc = 71; break;
- case Intrinsic::ppc_altivec_vcmpnew: CompareOpc = 135; break;
- case Intrinsic::ppc_altivec_vcmpnezb: CompareOpc = 263; break;
- case Intrinsic::ppc_altivec_vcmpnezh: CompareOpc = 327; break;
- case Intrinsic::ppc_altivec_vcmpnezw: CompareOpc = 391; break;
+ default:
+ llvm_unreachable("Unknown comparison intrinsic.");
+ case Intrinsic::ppc_altivec_vcmpneb:
+ CompareOpc = 7;
+ break;
+ case Intrinsic::ppc_altivec_vcmpneh:
+ CompareOpc = 71;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnew:
+ CompareOpc = 135;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezb:
+ CompareOpc = 263;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezh:
+ CompareOpc = 327;
+ break;
+ case Intrinsic::ppc_altivec_vcmpnezw:
+ CompareOpc = 391;
+ break;
}
- isDot = 0;
- } else
+ else
return false;
break;
- case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgefp:
+ CompareOpc = 454;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtfp:
+ CompareOpc = 710;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsb:
+ CompareOpc = 774;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsh:
+ CompareOpc = 838;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtsw:
+ CompareOpc = 902;
+ break;
case Intrinsic::ppc_altivec_vcmpgtsd:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasP8Altivec())
CompareOpc = 967;
- isDot = 0;
- } else
+ else
return false;
-
break;
- case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break;
- case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break;
+ case Intrinsic::ppc_altivec_vcmpgtub:
+ CompareOpc = 518;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuh:
+ CompareOpc = 582;
+ break;
+ case Intrinsic::ppc_altivec_vcmpgtuw:
+ CompareOpc = 646;
+ break;
case Intrinsic::ppc_altivec_vcmpgtud:
- if (Subtarget.hasP8Altivec()) {
+ if (Subtarget.hasP8Altivec())
CompareOpc = 711;
- isDot = 0;
- } else
+ else
return false;
-
break;
}
return true;
@@ -7950,9 +8392,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (IntrinsicID == Intrinsic::thread_pointer) {
// Reads the thread pointer register, used for __builtin_thread_pointer.
- bool is64bit = Subtarget.isPPC64();
- return DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
- is64bit ? MVT::i64 : MVT::i32);
+ if (Subtarget.isPPC64())
+ return DAG.getRegister(PPC::X13, MVT::i64);
+ return DAG.getRegister(PPC::R2, MVT::i32);
}
// If this is a lowered altivec predicate compare, CompareOpc is set to the
@@ -8019,6 +8461,40 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return Flags;
}
+SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
+ SelectionDAG &DAG) const {
+  // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain at
+  // the beginning of the argument list.
+ int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
+ SDLoc DL(Op);
+ switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
+ case Intrinsic::ppc_cfence: {
+ assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
+ assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
+ return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
+ Op.getOperand(ArgStart + 1)),
+ Op.getOperand(0)),
+ 0);
+ }
+ default:
+ break;
+ }
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
+ // Check for a DIV with the same operands as this REM.
+ for (auto UI : Op.getOperand(1)->uses()) {
+ if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
+ (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
+ if (UI->getOperand(0) == Op.getOperand(0) &&
+ UI->getOperand(1) == Op.getOperand(1))
+ return SDValue();
+ }
+ return Op;
+}
+
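LowerREM returns the node unchanged only when no DIV shares its operands, so a lone remainder can reach the new hardware modulo patterns; when a matching DIV exists it returns SDValue() and the default expansion reuses the quotient. A small sanity check of the identity that expansion relies on (our example, not from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t A = -37, B = 5;
      int64_t Quot = A / B; // the existing SDIV
      // When a DIV with the same operands exists, srem is expanded as
      // a - (a / b) * b instead of emitting a second division.
      assert(A % B == A - Quot * B);
      return 0;
    }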
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -8044,7 +8520,7 @@ SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
}
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG) const {
SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
@@ -8484,6 +8960,12 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
// Frame & Return address.
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+
+ case ISD::INTRINSIC_VOID:
+ return LowerINTRINSIC_VOID(Op, DAG);
+ case ISD::SREM:
+ case ISD::UREM:
+ return LowerREM(Op, DAG);
}
}
@@ -8575,9 +9057,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) {
 // The mappings for emitLeading/TrailingFence are taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
-Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
+Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -8585,15 +9067,22 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
return nullptr;
}
-Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
- AtomicOrdering Ord, bool IsStore,
- bool IsLoad) const {
- if (IsLoad && isAcquireOrStronger(Ord))
+Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
+ Instruction *Inst,
+ AtomicOrdering Ord) const {
+ if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
+ // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
+ // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+ // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+ if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
+ return Builder.CreateCall(
+ Intrinsic::getDeclaration(
+ Builder.GetInsertBlock()->getParent()->getParent(),
+ Intrinsic::ppc_cfence, {Inst->getType()}),
+ {Inst});
+      // FIXME: Can use isync for rmw operations.
return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
- // FIXME: this is too conservative, a dependent branch + isync is enough.
- // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
- // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
- // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
+ }
return nullptr;
}
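Read together with LowerINTRINSIC_VOID above, the trailing-fence policy appears to be: acquire-or-stronger plain loads on PPC64 take the lighter dependent-compare-plus-isync path through llvm.ppc.cfence, while other instructions with an atomic load keep lwsync. A sketch with invented names, assuming that reading is right:

    // Sketch only; the authoritative logic is the C++ in the hunk above.
    enum class TrailingFence { None, Lwsync, CtrlIsync };

    TrailingFence pickTrailingFence(bool HasAtomicLoad, bool AcquireOrStronger,
                                    bool IsPlainLoad, bool IsPPC64) {
      if (!HasAtomicLoad || !AcquireOrStronger)
        return TrailingFence::None;
      if (IsPlainLoad && IsPPC64)
        return TrailingFence::CtrlIsync; // lowered through llvm.ppc.cfence
      return TrailingFence::Lwsync;      // rmw/cmpxchg still use lwsync (see FIXME)
    }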
@@ -8889,6 +9378,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -8902,7 +9392,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
unsigned DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
@@ -8985,7 +9475,6 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
// Setup
MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
- const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
MIB.addRegMask(TRI->getNoPreservedMask());
BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
@@ -9174,10 +9663,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineFunction *F = BB->getParent();
- if (Subtarget.hasISEL() &&
- (MI.getOpcode() == PPC::SELECT_CC_I4 ||
+ if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8 ||
- MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8)) {
+ MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8) {
SmallVector<MachineOperand, 2> Cond;
if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
MI.getOpcode() == PPC::SELECT_CC_I8)
@@ -9417,7 +9905,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB = EmitAtomicBinary(MI, BB, 4, 0);
else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
BB = EmitAtomicBinary(MI, BB, 8, 0);
-
else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
(Subtarget.hasPartwordAtomics() &&
@@ -10028,14 +10515,12 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
return false;
}
-
/// This function is called when we have proved that a SETCC node can be replaced
/// by subtraction (and other supporting instructions) so that the result of
/// the comparison is kept in a GPR instead of a CR. This function is purely for
/// codegen purposes and has some flags to guide the codegen process.
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
bool Swap, SDLoc &DL, SelectionDAG &DAG) {
-
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
// Zero extend the operands to the largest legal integer. Originally, they
@@ -10068,7 +10553,6 @@ static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
DAGCombinerInfo &DCI) const {
-
assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
SelectionDAG &DAG = DCI.DAG;
@@ -10155,17 +10639,16 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
} else {
// This is neither a signed nor an unsigned comparison, just make sure
// that the high bits are equal.
- APInt Op1Zero, Op1One;
- APInt Op2Zero, Op2One;
- DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
- DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
+ KnownBits Op1Known, Op2Known;
+ DAG.computeKnownBits(N->getOperand(0), Op1Known);
+ DAG.computeKnownBits(N->getOperand(1), Op2Known);
// We don't really care about what is known about the first bit (if
// anything), so clear it in all masks prior to comparing them.
- Op1Zero.clearBit(0); Op1One.clearBit(0);
- Op2Zero.clearBit(0); Op2One.clearBit(0);
+ Op1Known.Zero.clearBit(0); Op1Known.One.clearBit(0);
+ Op2Known.Zero.clearBit(0); Op2Known.One.clearBit(0);
- if (Op1Zero != Op2Zero || Op1One != Op2One)
+ if (Op1Known.Zero != Op2Known.Zero || Op1Known.One != Op2Known.One)
return SDValue();
}
}
@@ -10842,6 +11325,132 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// This function adds the vector_shuffle needed to get the elements of the
+// vector extract into the correct positions, as specified by the
+// CorrectElems encoding.
+static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
+ SDValue Input, uint64_t Elems,
+ uint64_t CorrectElems) {
+ SDLoc dl(N);
+
+ unsigned NumElems = Input.getValueType().getVectorNumElements();
+ SmallVector<int, 16> ShuffleMask(NumElems, -1);
+
+ // Knowing the element indices being extracted from the original
+ // vector and the order in which they're being inserted, just put
+  // them at the element indices required for the instruction.
+ for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ if (DAG.getDataLayout().isLittleEndian())
+ ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
+ else
+ ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
+ CorrectElems = CorrectElems >> 8;
+ Elems = Elems >> 8;
+ }
+
+ SDValue Shuffle =
+ DAG.getVectorShuffle(Input.getValueType(), dl, Input,
+ DAG.getUNDEF(Input.getValueType()), ShuffleMask);
+
+ EVT Ty = N->getValueType(0);
+ SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
+ return BV;
+}
+
+// Look for build vector patterns where input operands come from sign
+// extended vector_extract elements of specific indices. If the correct indices
+// aren't used, add a vector shuffle to fix up the indices and create a new
+// PPCISD::SExtVElems node, which selects the vector sign extend instructions
+// during instruction selection.
+static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
+ // This array encodes the indices that the vector sign extend instructions
+ // extract from when extending from one type to another for both BE and LE.
+  // The right nibble of each byte corresponds to the LE indices,
+  // and the left nibble of each byte corresponds to the BE indices.
+ // For example: 0x3074B8FC byte->word
+ // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
+ // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
+ // For example: 0x000070F8 byte->double word
+ // For LE: the allowed indices are: 0x0,0x8
+ // For BE: the allowed indices are: 0x7,0xF
+ uint64_t TargetElems[] = {
+ 0x3074B8FC, // b->w
+ 0x000070F8, // b->d
+ 0x10325476, // h->w
+ 0x00003074, // h->d
+ 0x00001032, // w->d
+ };
+
+ uint64_t Elems = 0;
+ int Index;
+ SDValue Input;
+
+ auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
+ if (!Op)
+ return false;
+ if (Op.getOpcode() != ISD::SIGN_EXTEND)
+ return false;
+
+ SDValue Extract = Op.getOperand(0);
+ if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return false;
+
+ ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
+ if (!ExtOp)
+ return false;
+
+ Index = ExtOp->getZExtValue();
+ if (Input && Input != Extract.getOperand(0))
+ return false;
+
+ if (!Input)
+ Input = Extract.getOperand(0);
+
+ Elems = Elems << 8;
+ Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
+ Elems |= Index;
+
+ return true;
+ };
+
+  // If the build vector operands aren't sign-extended vector extracts
+  // of the same input vector, then return.
+ for (unsigned i = 0; i < N->getNumOperands(); i++) {
+ if (!isSExtOfVecExtract(N->getOperand(i))) {
+ return SDValue();
+ }
+ }
+
+  // If the vector extract indices are not correct, add the appropriate
+ // vector_shuffle.
+ int TgtElemArrayIdx;
+ int InputSize = Input.getValueType().getScalarSizeInBits();
+ int OutputSize = N->getValueType(0).getScalarSizeInBits();
+ if (InputSize + OutputSize == 40)
+ TgtElemArrayIdx = 0;
+ else if (InputSize + OutputSize == 72)
+ TgtElemArrayIdx = 1;
+ else if (InputSize + OutputSize == 48)
+ TgtElemArrayIdx = 2;
+ else if (InputSize + OutputSize == 80)
+ TgtElemArrayIdx = 3;
+ else if (InputSize + OutputSize == 96)
+ TgtElemArrayIdx = 4;
+ else
+ return SDValue();
+
+ uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
+ CorrectElems = DAG.getDataLayout().isLittleEndian()
+ ? CorrectElems & 0x0F0F0F0F0F0F0F0F
+ : CorrectElems & 0xF0F0F0F0F0F0F0F0;
+ if (Elems != CorrectElems) {
+ return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
+ }
+
+ // Regular lowering will catch cases where a shuffle is not needed.
+ return SDValue();
+}
+
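To make the nibble encoding concrete, here is a small stand-alone decoder (illustrative only, names are ours) that recovers the allowed extract indices from one TargetElems entry for either endianness:

    #include <cstdint>
    #include <cstdio>

    // Decodes an entry such as 0x3074B8FC (byte->word): low nibbles hold the
    // little-endian indices, high nibbles the big-endian ones.
    static void printAllowedIndices(uint64_t Encoded, bool IsLittleEndian) {
      while (Encoded) {
        unsigned Idx = IsLittleEndian ? (Encoded & 0xF) : ((Encoded & 0xF0) >> 4);
        std::printf("0x%X ", Idx);
        Encoded >>= 8;
      }
      std::printf("\n");
    }

    int main() {
      printAllowedIndices(0x3074B8FC, true);  // LE: 0xC 0x8 0x4 0x0
      printAllowedIndices(0x3074B8FC, false); // BE: 0xF 0xB 0x7 0x3
      return 0;
    }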
SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::BUILD_VECTOR &&
@@ -10869,6 +11478,15 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
if (Reduced)
return Reduced;
+  // If we're building a vector out of extended elements from another vector,
+  // try to use the P9 vector integer extend instructions.
+ if (Subtarget.hasP9Altivec()) {
+ Reduced = combineBVOfVecSExt(N, DAG);
+ if (Reduced)
+ return Reduced;
+ }
+
if (N->getValueType(0) != MVT::v2f64)
return SDValue();
@@ -11053,6 +11671,14 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
}
MVT VecTy = N->getValueType(0).getSimpleVT();
+
+  // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
+  // 16-byte aligned and the type is a vector with elements up to 4 bytes.
+  if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
+      VecTy.getScalarSizeInBits() <= 32) {
+ return SDValue();
+ }
+
SDValue LoadOps[] = { Chain, Base };
SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
DAG.getVTList(MVT::v2f64, MVT::Other),
@@ -11117,6 +11743,13 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
SDValue Src = N->getOperand(SrcOpnd);
MVT VecTy = Src.getValueType().getSimpleVT();
+  // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the store is
+  // 16-byte aligned and the type is a vector with elements up to 4 bytes.
+  if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment() % 16) &&
+      VecTy.getScalarSizeInBits() <= 32) {
+ return SDValue();
+ }
+
// All stores are done as v2f64 and possible bit cast.
if (VecTy != MVT::v2f64) {
Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
@@ -11141,6 +11774,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
SDLoc dl(N);
switch (N->getOpcode()) {
default: break;
+ case ISD::SHL:
+ return combineSHL(N, DCI);
+ case ISD::SRA:
+ return combineSRA(N, DCI);
+ case ISD::SRL:
+ return combineSRL(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -11227,9 +11866,20 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (BSwapOp.getValueType() == MVT::i16)
BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
+      // If the type of the BSWAP operand is wider than the stored memory
+      // width, it needs to be shifted right before STBRX.
+ EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
+ if (Op1VT.bitsGT(mVT)) {
+ int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
+ BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
+ DAG.getConstant(Shift, dl, MVT::i32));
+ // Need to truncate if this is a bswap of i64 stored as i32/i16.
+ if (Op1VT == MVT::i64)
+ BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
+ }
+
SDValue Ops[] = {
- N->getOperand(0), BSwapOp, N->getOperand(2),
- DAG.getValueType(N->getOperand(1).getValueType())
+ N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
};
return
DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
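The shift-then-truncate step preserves exactly the bytes the original truncating store would have written. A quick host-side check of that identity, our example using compiler byte-swap builtins:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t X = 0x0102030405060708ULL;
      // Bytes an i32 truncating store of bswap(i64 X) would write...
      uint32_t ViaTruncStore = static_cast<uint32_t>(__builtin_bswap64(X));
      // ...equal an stbrx-style byte-reversed store of (X >> 32) truncated.
      uint32_t ViaSTBRX = __builtin_bswap32(static_cast<uint32_t>(X >> 32));
      assert(ViaTruncStore == ViaSTBRX); // both are 0x04030201 here
      return 0;
    }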
@@ -11570,7 +12220,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
- case ISD::INTRINSIC_W_CHAIN: {
+ case ISD::INTRINSIC_W_CHAIN:
// For little endian, VSX loads require generating lxvd2x/xxswapd.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
if (Subtarget.needsSwapsForVSXMemOps()) {
@@ -11583,8 +12233,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
- }
- case ISD::INTRINSIC_VOID: {
+ case ISD::INTRINSIC_VOID:
// For little endian, VSX stores require generating xxswapd/stxvd2x.
// Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
if (Subtarget.needsSwapsForVSXMemOps()) {
@@ -11597,7 +12246,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
}
}
break;
- }
case ISD::BSWAP:
// Turn BSWAP (LOAD) -> lhbrx/lwbrx.
if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
@@ -11635,9 +12283,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
// Return N so it doesn't get rechecked!
return SDValue(N, 0);
}
-
break;
- case PPCISD::VCMP: {
+ case PPCISD::VCMP:
// If a VCMPo node already exists with exactly the same operands as this
// node, use its result instead of this node (VCMPo computes both a CR6 and
// a normal output).
@@ -11687,7 +12334,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue(VCMPoNode, 0);
}
break;
- }
case ISD::BRCOND: {
SDValue Cond = N->getOperand(1);
SDValue Target = N->getOperand(2);
@@ -11845,17 +12491,17 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
//===----------------------------------------------------------------------===//
void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+ Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case PPCISD::LBRX: {
// lhbrx is known to have the top bits cleared out.
if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
- KnownZero = 0xFFFF0000;
+ Known.Zero = 0xFFFF0000;
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -11877,7 +12523,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case Intrinsic::ppc_altivec_vcmpgtuh_p:
case Intrinsic::ppc_altivec_vcmpgtuw_p:
case Intrinsic::ppc_altivec_vcmpgtud_p:
- KnownZero = ~1U; // All bits but the low one are known to be zero.
+ Known.Zero = ~1U; // All bits but the low one are known to be zero.
break;
}
}
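The KnownBits conversion here folds the old KnownZero/KnownOne pair into one struct. A rough usage sketch against the KnownBits API of this LLVM snapshot (assumed, not taken from the patch):

    #include "llvm/Support/KnownBits.h"
    #include <cassert>
    using namespace llvm;

    void lbrxKnownBitsDemo() {
      KnownBits Known(32);
      Known.Zero = 0xFFFF0000; // what the LBRX case above records for i16 loads
      assert(!Known.hasConflict() && Known.countMinLeadingZeros() == 16);
    }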
@@ -12295,7 +12941,6 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
unsigned Intrinsic) const {
-
switch (Intrinsic) {
case Intrinsic::ppc_qpx_qvlfd:
case Intrinsic::ppc_qpx_qvlfs:
@@ -12753,7 +13398,6 @@ void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
}
bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-
if (!VT.isSimple() || !Subtarget.hasVSX())
return false;
@@ -12768,3 +13412,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
return Imm.isPosZero();
}
}
+
+// For vector shift operation op, fold
+// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
+static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+ unsigned Opcode = N->getOpcode();
+ unsigned TargetOpcode;
+
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unexpected shift operation");
+ case ISD::SHL:
+ TargetOpcode = PPCISD::SHL;
+ break;
+ case ISD::SRL:
+ TargetOpcode = PPCISD::SRL;
+ break;
+ case ISD::SRA:
+ TargetOpcode = PPCISD::SRA;
+ break;
+ }
+
+ if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
+ N1->getOpcode() == ISD::AND)
+ if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
+ if (Mask->getZExtValue() == OpSizeInBits - 1)
+ return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
+ if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
+ return Value;
+
+ return SDValue();
+}
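The fold is justified because PPC vector shifts already reduce each element's shift amount modulo the element width (see the vsld note added to PPCISelLowering.h below), making the explicit AND redundant. A per-element model, names ours:

    #include <cassert>
    #include <cstdint>

    // Per-element model of a v4i32 shift: the hardware consumes only the low
    // log2(32) = 5 bits of the shift amount, so masking with 31 is a no-op.
    static uint32_t vslwElem(uint32_t X, uint32_t Amt) { return X << (Amt & 31); }

    int main() {
      for (uint32_t Amt = 0; Amt < 64; ++Amt)
        assert(vslwElem(1, Amt & 31) == vslwElem(1, Amt)); // the AND can be dropped
      return 0;
    }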
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 05acd25..49d7d82 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -17,13 +17,26 @@
#include "PPC.h"
#include "PPCInstrInfo.h"
-#include "PPCRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
#include "llvm/Target/TargetLowering.h"
+#include <utility>
namespace llvm {
+
namespace PPCISD {
+
enum NodeType : unsigned {
// Start the numbering where the builtin ops and target ops leave off.
FIRST_NUMBER = ISD::BUILTIN_OP_END,
@@ -54,6 +67,10 @@ namespace llvm {
     /// VSFRC that is sign-extended from ByteWidth to a 64-bit integer.
VEXTS,
+ /// SExtVElems, takes an input vector of a smaller type and sign
+ /// extends to an output vector of a larger type.
+ SExtVElems,
+
/// Reciprocal estimate instructions (unary FP ops).
FRE, FRSQRTE,
@@ -73,10 +90,18 @@ namespace llvm {
///
XXINSERT,
+ /// XXREVERSE - The PPC VSX reverse instruction
+ ///
+ XXREVERSE,
+
/// VECSHL - The PPC VSX shift left instruction
///
VECSHL,
+ /// XXPERMDI - The PPC XXPERMDI instruction
+ ///
+ XXPERMDI,
+
/// The CMPB instruction (takes two operands of i32 or i64).
CMPB,
@@ -104,9 +129,13 @@ namespace llvm {
/// at function entry, used for PIC code.
GlobalBaseReg,
- /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
- /// shift amounts. These nodes are generated by the multi-precision shift
- /// code.
+ /// These nodes represent PPC shifts.
+ ///
+ /// For scalar types, only the last `n + 1` bits of the shift amounts
+ /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc.
+ /// for exact behaviors.
+ ///
+ /// For vector types, only the last n bits are used. See vsld.
SRL, SRA, SHL,
      /// The combination of sra[wd]i and addze used to implement signed
@@ -398,10 +427,12 @@ namespace llvm {
/// the last operand.
TOC_ENTRY
};
- }
+
+ } // end namespace PPCISD
/// Define some predicates that are used for node matching.
namespace PPC {
+
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
/// VPKUHUM instruction.
bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
@@ -431,7 +462,32 @@ namespace llvm {
/// a VMRGEW or VMRGOW instruction
bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
unsigned ShuffleKind, SelectionDAG &DAG);
-
+ /// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXSLDWI instruction.
+ bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ bool &Swap, bool IsLE);
+
+ /// isXXBRHShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRH instruction.
+ bool isXXBRHShuffleMask(ShuffleVectorSDNode *N);
+
+ /// isXXBRWShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRW instruction.
+ bool isXXBRWShuffleMask(ShuffleVectorSDNode *N);
+
+ /// isXXBRDShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRD instruction.
+ bool isXXBRDShuffleMask(ShuffleVectorSDNode *N);
+
+ /// isXXBRQShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXBRQ instruction.
+ bool isXXBRQShuffleMask(ShuffleVectorSDNode *N);
+
+ /// isXXPERMDIShuffleMask - Return true if this is a shuffle mask suitable
+ /// for a XXPERMDI instruction.
+ bool isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
+ bool &Swap, bool IsLE);
+
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the
/// shift amount, otherwise return -1.
int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
@@ -465,7 +521,8 @@ namespace llvm {
/// If this is a qvaligni shuffle mask, return the shift
/// amount, otherwise return -1.
int isQVALIGNIShuffleMask(SDNode *N);
- }
+
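For reference, the masks these predicates accept are plain byte reversals within each element; a worked example (ours) of the XXBRW case:

    // v16i8 shuffle mask recognized by PPC::isXXBRWShuffleMask (illustrative):
    // byte-reverse each 4-byte word of the 16-byte vector.
    static const int XXBRWMask[16] = {3,  2,  1, 0, 7,  6,  5,  4,
                                      11, 10, 9, 8, 15, 14, 13, 12};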
+ } // end namespace PPC
class PPCTargetLowering : public TargetLowering {
const PPCSubtarget &Subtarget;
@@ -492,6 +549,7 @@ namespace llvm {
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+
bool useSoftFloat() const override;
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
@@ -514,6 +572,10 @@ namespace llvm {
return true;
}
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
bool supportSplitCSR(MachineFunction *MF) const override {
return
MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
@@ -554,7 +616,7 @@ namespace llvm {
/// is not better represented as reg+reg. If Aligned is true, only accept
/// displacements suitable for STD and friends, i.e. multiples of 4.
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
- SelectionDAG &DAG, bool Aligned) const;
+ SelectionDAG &DAG, unsigned Alignment) const;
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
@@ -585,8 +647,8 @@ namespace llvm {
SelectionDAG &DAG) const override;
void computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
@@ -596,10 +658,10 @@ namespace llvm {
return true;
}
- Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
- Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord,
- bool IsStore, bool IsLoad) const override;
+ Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
@@ -694,6 +756,10 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool convertSelectOfConstantsToMath() const override {
+ return true;
+ }
+
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
bool getTgtMemIntrinsic(IntrinsicInfo &Info,
@@ -785,15 +851,13 @@ namespace llvm {
SDValue Chain;
SDValue ResChain;
MachinePointerInfo MPI;
- bool IsDereferenceable;
- bool IsInvariant;
- unsigned Alignment;
+ bool IsDereferenceable = false;
+ bool IsInvariant = false;
+ unsigned Alignment = 0;
AAMDNodes AAInfo;
- const MDNode *Ranges;
+ const MDNode *Ranges = nullptr;
- ReuseLoadInfo()
- : IsDereferenceable(false), IsInvariant(false), Alignment(0),
- Ranges(nullptr) {}
+ ReuseLoadInfo() = default;
MachineMemOperand::Flags MMOFlags() const {
MachineMemOperand::Flags F = MachineMemOperand::MONone;
@@ -878,6 +942,8 @@ namespace llvm {
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
@@ -906,15 +972,13 @@ namespace llvm {
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue
- LowerCall(TargetLowering::CallLoweringInfo &CLI,
- SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
- bool
- CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const override;
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -978,6 +1042,9 @@ namespace llvm {
SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
@@ -994,14 +1061,16 @@ namespace llvm {
CCAssignFn *useFastISelCCs(unsigned Flag) const;
SDValue
- combineElementTruncationToVectorTruncation(SDNode *N,
- DAGCombinerInfo &DCI) const;
+ combineElementTruncationToVectorTruncation(SDNode *N,
+ DAGCombinerInfo &DCI) const;
};
namespace PPC {
+
FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo);
- }
+
+ } // end namespace PPC
bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
CCValAssign::LocInfo &LocInfo,
@@ -1026,6 +1095,10 @@ namespace llvm {
CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags,
CCState &State);
-}
-#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+ bool isIntS16Immediate(SDNode *N, int16_t &Imm);
+ bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index fbec878..e2af5e5 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -253,11 +253,11 @@ def LDAT : X_RD5_RS5_IM5<31, 614, (outs g8rc:$rD), (ins g8rc:$rA, u5imm:$FC),
Requires<[IsISA3_0]>;
}
-let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in
+let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDCX : XForm_1<31, 214, (outs), (ins g8rc:$rS, memrr:$dst),
"stdcx. $rS, $dst", IIC_LdStSTDCX, []>, isDOT;
-let mayStore = 1, hasSideEffects = 0 in
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STDAT : X_RD5_RS5_IM5<31, 742, (outs), (ins g8rc:$rS, g8rc:$rA, u5imm:$FC),
"stdat $rS, $rA, $FC", IIC_LdStStore>, isPPC64,
Requires<[IsISA3_0]>;
@@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS),
"extsw", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext i32:$rS))]>, isPPC64;
+let isCodeGenOnly = 1 in
+def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS),
+ "extsw $rA, $rS", IIC_IntSimple,
+ []>, isPPC64;
defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
"sradi", "$rA, $rS, $SH", IIC_IntRotateDI,
[(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH),
+ "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64;
+
defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
"cntlzd", "$rA, $rS", IIC_IntGeneral,
[(set i64:$rA, (ctlz i64:$rS))]>;
@@ -674,6 +683,16 @@ def DIVDE : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divde $rT, $rA, $rB", IIC_IntDivD,
[(set i64:$rT, (int_ppc_divde g8rc:$rA, g8rc:$rB))]>,
isPPC64, Requires<[HasExtDiv]>;
+
+let Predicates = [IsISA3_0] in {
+def MODSD : XForm_8<31, 777, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "modsd $rT, $rA, $rB", IIC_IntDivW,
+ [(set i64:$rT, (srem i64:$rA, i64:$rB))]>;
+def MODUD : XForm_8<31, 265, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
+ "modud $rT, $rA, $rB", IIC_IntDivW,
+ [(set i64:$rT, (urem i64:$rA, i64:$rB))]>;
+}
+
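Together with LowerREM in PPCISelLowering.cpp, these patterns let a lone 64-bit remainder select modsd/modud directly on ISA 3.0 targets. An expected mapping, not captured compiler output:

    // On a POWER9 (-mcpu=pwr9) build this is expected to become a single modsd.
    long long rem64(long long A, long long B) { return A % B; }

    // A paired quotient keeps the divide-multiply-subtract expansion instead,
    // since LowerREM bails out when a matching DIV exists.
    long long divRem64(long long A, long long B) { return A / B + A % B; }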
let Defs = [CR0] in
def DIVDEo : XOForm_1<31, 425, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divde. $rT, $rA, $rB", IIC_IntDivD,
@@ -721,15 +740,26 @@ defm RLDICL : MDForm_1r<30, 0,
// For fast-isel:
let isCodeGenOnly = 1 in
def RLDICL_32_64 : MDForm_1<30, 0,
- (outs g8rc:$rA),
+ (outs g8rc:$rA),
+ (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
+// End fast-isel.
+let Interpretation64Bit = 1, isCodeGenOnly = 1 in
+defm RLDICL_32 : MDForm_1r<30, 0,
+ (outs gprc:$rA),
(ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
- "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ "rldicl", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
-// End fast-isel.
defm RLDICR : MDForm_1r<30, 1,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
[]>, isPPC64;
+let isCodeGenOnly = 1 in
+def RLDICR_32 : MDForm_1<30, 1,
+ (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI,
+ []>, isPPC64;
defm RLDIC : MDForm_1r<30, 2,
(outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
"rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI,
@@ -942,13 +972,15 @@ def LDMX : XForm_1<31, 309, (outs g8rc:$rD), (ins memrr:$src),
// Support for medium and large code model.
let hasSideEffects = 0 in {
+let isReMaterializable = 1 in {
def ADDIStocHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
"#ADDIStocHA", []>, isPPC64;
+def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+ "#ADDItocL", []>, isPPC64;
+}
let mayLoad = 1 in
def LDtocL: Pseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg),
"#LDtocL", []>, isPPC64;
-def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
- "#ADDItocL", []>, isPPC64;
}
// Support for thread-local storage.
@@ -963,6 +995,10 @@ def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
[(set i64:$rD,
(PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
isPPC64;
+
+let isPseudo = 1, Defs = [CR7], Itinerary = IIC_LdStSync in
+def CFENCE8 : Pseudo<(outs), (ins g8rc:$cr), "#CFENCE8", []>;
+
def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
(ADD8TLS $in, tglobaltlsaddr:$g)>;
def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
@@ -977,7 +1013,9 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
isPPC64;
// LR8 is a true define, while the rest of the Defs are clobbers. X3 is
// explicitly defined when this op is created, so not mentioned here.
-let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1,
+// This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be
+// correct because the branch select pass relies on it.
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8,
Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in
def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
"#GETtlsADDR",
@@ -1082,7 +1120,7 @@ def STDBRX: XForm_8<31, 660, (outs), (ins g8rc:$rS, memrr:$dst),
}
// Stores with Update (pre-inc).
-let PPC970_Unit = 2, mayStore = 1 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
def STBU8 : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins g8rc:$rS, memri:$dst),
"stbu $rS, $dst", IIC_LdStStoreUpd, []>,
@@ -1232,6 +1270,10 @@ def : Pat<(srl i64:$rS, i32:$rB),
def : Pat<(shl i64:$rS, i32:$rB),
(SLD $rS, $rB)>;
+// SUBFIC
+def : Pat<(sub imm64SExt16:$imm, i64:$in),
+ (SUBFIC8 $in, imm:$imm)>;
+
// SHL/SRL
def : Pat<(shl i64:$in, (i32 imm:$imm)),
(RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index 5c02274..5465b5f 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -407,7 +407,7 @@ def MTVSCR : VXForm_5<1604, (outs), (ins vrrc:$vB),
"mtvscr $vB", IIC_LdStLoad,
[(int_ppc_altivec_mtvscr v4i32:$vB)]>;
-let PPC970_Unit = 2 in { // Loads.
+let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in { // Loads.
def LVEBX: XForm_1<31, 7, (outs vrrc:$vD), (ins memrr:$src),
"lvebx $vD, $src", IIC_LdStLoad,
[(set v16i8:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
@@ -434,7 +434,7 @@ def LVSR : XForm_1<31, 38, (outs vrrc:$vD), (ins memrr:$src),
[(set v16i8:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
PPC970_Unit_LSU;
-let PPC970_Unit = 2 in { // Stores.
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in { // Stores.
def STVEBX: XForm_8<31, 135, (outs), (ins vrrc:$rS, memrr:$dst),
"stvebx $rS, $dst", IIC_LdStStore,
[(int_ppc_altivec_stvebx v16i8:$rS, xoaddr:$dst)]>;
@@ -851,6 +851,10 @@ def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins),
// Additional Altivec Patterns
//
+// Extended mnemonics
+def : InstAlias<"vmr $vD, $vA", (VOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
+def : InstAlias<"vnot $vD, $vA", (VNOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
+
// Loads.
def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
@@ -983,6 +987,16 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSLH $vA, $vB))>;
def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSLW $vA, $vB))>;
+def : Pat<(v1i128 (shl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
+def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSLB $vA, $vB))>;
+def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSLH $vA, $vB))>;
+def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSLW $vA, $vB))>;
+def : Pat<(v1i128 (PPCshl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSL (VSLO $vA, $vB), (VSPLTB 15, $vB)))>;
def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRB $vA, $vB))>;
@@ -990,6 +1004,16 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRH $vA, $vB))>;
def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRW $vA, $vB))>;
+def : Pat<(v1i128 (srl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
+def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRW $vA, $vB))>;
+def : Pat<(v1i128 (PPCsrl v1i128:$vA, v1i128:$vB)),
+ (v1i128 (VSR (VSRO $vA, $vB), (VSPLTB 15, $vB)))>;
def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)),
(v16i8 (VSRAB $vA, $vB))>;
@@ -997,6 +1021,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)),
(v8i16 (VSRAH $vA, $vB))>;
def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)),
(v4i32 (VSRAW $vA, $vB))>;
+def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)),
+ (v16i8 (VSRAB $vA, $vB))>;
+def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)),
+ (v8i16 (VSRAH $vA, $vB))>;
+def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)),
+ (v4i32 (VSRAW $vA, $vB))>;
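These PPCshl/PPCsrl/PPCsra patterns match the shift documentation added to PPCISelLowering.h: scalar forms consume n + 1 bits of the shift amount while vector elements wrap modulo the element width. A compact model of that difference, ISA semantics as we understand them, not tested against hardware:

    #include <cassert>
    #include <cstdint>

    // slw: consumes the low n + 1 = 6 bits; amounts 32..63 clear the result.
    static uint32_t slw(uint32_t X, uint32_t Amt) {
      Amt &= 0x3F;
      return Amt > 31 ? 0 : X << Amt;
    }

    int main() {
      assert(slw(1, 33) == 0);         // scalar form zeroes oversized shifts
      assert((1u << (33 & 31)) == 2u); // vector elements wrap modulo 32 instead
      return 0;
    }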
// Float to integer and integer to float conversions
def : Pat<(v4i32 (fp_to_sint v4f32:$vA)),
@@ -1068,14 +1098,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB),
// Vector shifts
def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>;
def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsld $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>;
+ "vsld $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrd $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>;
+ "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>;
def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
- "vsrad $vD, $vA, $vB", IIC_VecGeneral,
- [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>;
+ "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>;
+
+def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSLD $vA, $vB))>;
+def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRD $vA, $vB))>;
+def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
+def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)),
+ (v2i64 (VSRAD $vA, $vB))>;
// Vector Integer Arithmetic Instructions
let isCommutable = 1 in {
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 2e0b935..e74ba38 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -65,7 +65,9 @@ UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
void PPCInstrInfo::anchor() {}
PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
- : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+ : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
+ /* CatchRetOpcode */ -1,
+ STI.isPPC64() ? PPC::BLR8 : PPC::BLR),
Subtarget(STI), RI(STI.getTargetMachine()) {}
/// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
@@ -290,6 +292,29 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
return 0;
}
+// For opcodes with the ReMaterializable flag set, this function is called to
+// verify that the instruction is really rematerializable.
+bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const {
+ switch (MI.getOpcode()) {
+ default:
+ // This function should only be called for opcodes with the ReMaterializable
+ // flag set.
+ llvm_unreachable("Unknown rematerializable operation!");
+ break;
+ case PPC::LI:
+ case PPC::LI8:
+ case PPC::LIS:
+ case PPC::LIS8:
+ case PPC::QVGPCI:
+ case PPC::ADDIStocHA:
+ case PPC::ADDItocL:
+ case PPC::LOAD_STACK_GUARD:
+ return true;
+ }
+ return false;
+}
+
unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
// Note: This list must be kept consistent with StoreRegToStackSlot.
@@ -438,8 +463,8 @@ void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(Opcode));
}
-/// getNoopForMachoTarget - Return the noop instruction to use for a noop.
-void PPCInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+/// Return the noop instruction to use for a noop.
+void PPCInstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(PPC::NOP);
}
@@ -662,12 +687,14 @@ unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
- BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
- BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB);
else // Conditional branch
BuildMI(&MBB, DL, get(PPC::BCC))
- .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+ .addImm(Cond[0].getImm())
+ .add(Cond[1])
+ .addMBB(TBB);
return 1;
}
@@ -677,12 +704,14 @@ unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
(isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_SET)
- BuildMI(&MBB, DL, get(PPC::BC)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BC)).add(Cond[1]).addMBB(TBB);
else if (Cond[0].getImm() == PPC::PRED_BIT_UNSET)
- BuildMI(&MBB, DL, get(PPC::BCn)).addOperand(Cond[1]).addMBB(TBB);
+ BuildMI(&MBB, DL, get(PPC::BCn)).add(Cond[1]).addMBB(TBB);
else
BuildMI(&MBB, DL, get(PPC::BCC))
- .addImm(Cond[0].getImm()).addOperand(Cond[1]).addMBB(TBB);
+ .addImm(Cond[0].getImm())
+ .add(Cond[1])
+ .addMBB(TBB);
BuildMI(&MBB, DL, get(PPC::B)).addMBB(FBB);
return 2;
}
@@ -692,9 +721,6 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
- if (!Subtarget.hasISEL())
- return false;
-
if (Cond.size() != 2)
return false;
@@ -736,9 +762,6 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
assert(Cond.size() == 2 &&
"PPC branch conditions have two components!");
- assert(Subtarget.hasISEL() &&
- "Cannot insert select on target without ISEL support");
-
// Get the register classes.
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
@@ -1493,7 +1516,7 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
return Found;
}
-bool PPCInstrInfo::isPredicable(MachineInstr &MI) const {
+bool PPCInstrInfo::isPredicable(const MachineInstr &MI) const {
unsigned OpC = MI.getOpcode();
switch (OpC) {
default:
@@ -1533,6 +1556,8 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case PPC::FCMPUD:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
+ Value = 0;
+ Mask = 0;
return true;
}
}
@@ -1591,9 +1616,12 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// We can perform this optimization, equality only, if MI is
// zero-extending.
+ // FIXME: Other possible target instructions include ANDISo and
+ // RLWINM aliases, such as ROTRWI, EXTLWI, SLWI and SRWI.
if (MIOpC == PPC::CNTLZW || MIOpC == PPC::CNTLZWo ||
MIOpC == PPC::SLW || MIOpC == PPC::SLWo ||
MIOpC == PPC::SRW || MIOpC == PPC::SRWo ||
+ MIOpC == PPC::ANDIo ||
isZeroExtendingRotate) {
noSub = true;
equalityOnly = true;
@@ -1607,8 +1635,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (equalityOnly) {
// We need to check the uses of the condition register in order to reject
// non-equality comparisons.
- for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg),
- IE = MRI->use_instr_end(); I != IE; ++I) {
+ for (MachineRegisterInfo::use_instr_iterator
+ I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end();
+ I != IE; ++I) {
MachineInstr *UseMI = &*I;
if (UseMI->getOpcode() == PPC::BCC) {
unsigned Pred = UseMI->getOperand(0).getImm();
@@ -1630,8 +1659,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL;
++I) {
bool FoundUse = false;
- for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg),
- JE = MRI->use_instr_end(); J != JE; ++J)
+ for (MachineRegisterInfo::use_instr_iterator
+ J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end();
+ J != JE; ++J)
if (&*J == &*I) {
FoundUse = true;
break;
@@ -1641,6 +1671,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
break;
}
+ SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
+ SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
+
// There are two possible candidates which can be changed to set CR[01].
// One is MI, the other is a SUB instruction.
// For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
@@ -1652,9 +1685,37 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// same BB as the comparison. This is to allow the check below to avoid calls
// (and other explicit clobbers); instead we should really check for these
// more explicitly (in at least a few predecessors).
- else if (MI->getParent() != CmpInstr.getParent() || Value != 0) {
- // PPC does not have a record-form SUBri.
+ else if (MI->getParent() != CmpInstr.getParent())
return false;
+ else if (Value != 0) {
+ // The record-form instructions set the CR bit based on a signed comparison
+ // against 0, so we try to convert a compare against 1 or -1 into a compare
+ // against 0.
+ bool Success = false;
+ if (!equalityOnly && MRI->hasOneUse(CRReg)) {
+ MachineInstr *UseMI = &*MRI->use_instr_begin(CRReg);
+ if (UseMI->getOpcode() == PPC::BCC) {
+ PPC::Predicate Pred = (PPC::Predicate)UseMI->getOperand(0).getImm();
+ int16_t Immed = (int16_t)Value;
+
+ if (Immed == -1 && Pred == PPC::PRED_GT) {
+ // We convert "greater than -1" into "greater than or equal to 0",
+ // which is safe because !equalityOnly implies a signed comparison.
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+ PPC::PRED_GE));
+ Success = true;
+ }
+ else if (Immed == 1 && Pred == PPC::PRED_LT) {
+ // We convert "less than 1" into "less than or equal to 0".
+ PredsToUpdate.push_back(std::make_pair(&(UseMI->getOperand(0)),
+ PPC::PRED_LE));
+ Success = true;
+ }
+ }
+ }
+
+ // PPC does not have a record-form SUBri.
+ if (!Success)
+ return false;
}
// Search for Sub.
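
An editorial aside (not part of the patch): the predicate rewrite above is sound because, for signed integers, "greater than -1" selects exactly the same values as "greater than or equal to 0", and "less than 1" the same values as "less than or equal to 0". A minimal C++ check of both equivalences:

    #include <cassert>
    #include <cstdint>

    int main() {
      // PRED_GT vs. -1 -> PRED_GE vs. 0, and PRED_LT vs. 1 -> PRED_LE vs. 0,
      // verified on a few boundary values.
      for (int32_t X : {INT32_MIN, -2, -1, 0, 1, 2, INT32_MAX}) {
        assert((X > -1) == (X >= 0));
        assert((X < 1) == (X <= 0));
      }
      return 0;
    }
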
@@ -1720,15 +1781,14 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
if (NewOpC == -1)
return false;
- SmallVector<std::pair<MachineOperand*, PPC::Predicate>, 4> PredsToUpdate;
- SmallVector<std::pair<MachineOperand*, unsigned>, 4> SubRegsToUpdate;
-
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code based on CMP
// needs to be updated to be based on SUB. Push the condition code
// operands to OperandsToUpdate. If it is safe to remove CmpInstr, the
// condition code of these operands will be modified.
+ // Here, Value == 0 means we have not converted a comparison against 1 or -1
+ // into a comparison against 0, a conversion that may change the predicate.
bool ShouldSwap = false;
- if (Sub) {
+ if (Sub && Value == 0) {
ShouldSwap = SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
Sub->getOperand(2).getReg() == SrcReg;
@@ -1765,6 +1825,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
} else // We need to abort on a user we don't understand.
return false;
}
+ assert(!(Value != 0 && ShouldSwap) &&
+ "Non-zero immediate support and ShouldSwap"
+ "may conflict in updating predicate");
// Create a new virtual register to hold the value of the CR set by the
// record-form instruction. If the instruction was not previously in
@@ -1836,8 +1899,7 @@ unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
PatchPointOpers Opers(&MI);
return Opers.getNumPatchBytes();
} else {
- const MCInstrDesc &Desc = get(Opcode);
- return Desc.getSize();
+ return get(Opcode).getSize();
}
}
@@ -1874,6 +1936,8 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
}
bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ auto &MBB = *MI.getParent();
+ auto DL = MI.getDebugLoc();
switch (MI.getOpcode()) {
case TargetOpcode::LOAD_STACK_GUARD: {
assert(Subtarget.isTargetLinux() &&
@@ -1892,6 +1956,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case PPC::DFSTOREf64: {
assert(Subtarget.hasP9Vector() &&
"Invalid D-Form Pseudo-ops on non-P9 target.");
+ assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() &&
+ "D-form op must have register and immediate operands");
unsigned UpperOpcode, LowerOpcode;
switch (MI.getOpcode()) {
case PPC::DFLOADf32:
@@ -1921,6 +1987,17 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(Opcode));
return true;
}
+ case PPC::CFENCE8: {
+ auto Val = MI.getOperand(0).getReg();
+ BuildMI(MBB, MI, DL, get(PPC::CMPD), PPC::CR7).addReg(Val).addReg(Val);
+ BuildMI(MBB, MI, DL, get(PPC::CTRL_DEP))
+ .addImm(PPC::PRED_NE_MINUS)
+ .addReg(PPC::CR7)
+ .addImm(1);
+ MI.setDesc(get(PPC::ISYNC));
+ MI.RemoveOperand(0);
+ return true;
+ }
}
return false;
}
@@ -1931,3 +2008,7 @@ PPCInstrInfo::updatedRC(const TargetRegisterClass *RC) const {
return &PPC::VSRCRegClass;
return RC;
}
+
+int PPCInstrInfo::getRecordFormOpcode(unsigned Opcode) {
+ return PPC::getRecordFormOpcode(Opcode);
+}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 32b2f00..b0629c8 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -162,6 +162,8 @@ public:
unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
+ AliasAnalysis *AA) const override;
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -253,7 +255,7 @@ public:
bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
// Comparison optimization.
@@ -269,7 +271,7 @@ public:
///
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
- void getNoopForMachoTarget(MCInst &NopInst) const override;
+ void getNoop(MCInst &NopInst) const override;
std::pair<unsigned, unsigned>
decomposeMachineOperandsTargetFlags(unsigned TF) const override;
@@ -290,6 +292,7 @@ public:
return Reg >= PPC::V0 && Reg <= PPC::V31;
}
const TargetRegisterClass *updatedRC(const TargetRegisterClass *RC) const;
+ static int getRecordFormOpcode(unsigned Opcode);
};
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index f615cc7..dd7fc26 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -32,8 +32,12 @@ def SDT_PPCstxsix : SDTypeProfile<0, 3, [
def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
+def SDT_PPCSExtVElems : SDTypeProfile<1, 1, [
+ SDTCisVec<0>, SDTCisVec<1>
+]>;
-def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
def SDT_PPCvperm : SDTypeProfile<1, 3, [
@@ -45,13 +49,21 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
]>;
def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
- SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
]>;
def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
]>;
+def SDT_PPCVecReverse: SDTypeProfile<1, 1, [ SDTCisVec<0>,
+ SDTCisVec<1>
+]>;
+
+def SDT_PPCxxpermdi: SDTypeProfile<1, 3, [ SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>
+]>;
+
def SDT_PPCvcmp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
]>;
@@ -114,14 +126,15 @@ def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>;
def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
[SDNPHasChain, SDNPMayStore]>;
def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
[SDNPHasChain, SDNPMayLoad]>;
def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
[SDNPHasChain, SDNPMayStore]>;
def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
+def PPCSExtVElems : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>;
// Extract FPSCR (not modeled at the DAG level).
def PPCmffs : SDNode<"PPCISD::MFFS",
@@ -169,6 +182,8 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
def PPCxxinsert : SDNode<"PPCISD::XXINSERT", SDT_PPCVecInsert, []>;
+def PPCxxreverse : SDNode<"PPCISD::XXREVERSE", SDT_PPCVecReverse, []>;
+def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
def PPCqvfperm : SDNode<"PPCISD::QVFPERM", SDT_PPCqvfperm, []>;
@@ -243,7 +258,7 @@ def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
[SDNPHasChain, SDNPOptInGlue]>;
def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
[SDNPHasChain, SDNPMayStore]>;
@@ -390,6 +405,25 @@ def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() < 4;
}]>;
+// This is a somewhat weaker condition than actually checking for 16-byte
+// alignment. It is simply checking that the displacement can be represented
+// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form
+// instructions).
+def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return isOffsetMultipleOf(N, 16);
+}]>;
+def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return isOffsetMultipleOf(N, 16);
+}]>;
+def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return !isOffsetMultipleOf(N, 16);
+}]>;
+def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return !isOffsetMultipleOf(N, 16);
+}]>;
+
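
As background for the PatFrags above (editorial, and a sketch rather than the actual isOffsetMultipleOf helper): a DQ-Form instruction encodes a signed 12-bit field shifted left by 4, so the displacements it can reach are the 16-bit signed immediates that are multiples of 16.

    #include <cstdint>

    // Hypothetical standalone check with the same intent as the predicate.
    bool isValidDQFormDisplacement(int64_t Imm) {
      return Imm >= INT16_MIN && Imm <= INT16_MAX && (Imm & 15) == 0;
    }
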
//===----------------------------------------------------------------------===//
// PowerPC Flag Definitions.
@@ -770,9 +804,10 @@ def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
}
// A single-register address. This is used with the SjLj
-// pseudo-instructions.
+// pseudo-instructions, which translate to LD/LWZ. These instructions require
+// G8RC_NOX0 registers.
def memr : Operand<iPTR> {
- let MIOperandInfo = (ops ptr_rc:$ptrreg);
+ let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
}
def PPCTLSRegOperand : AsmOperandClass {
let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
@@ -799,7 +834,8 @@ def pred : Operand<OtherVT> {
def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>;
def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>;
def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
-def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std"
+def iqaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
// The address in a single register. This is used with the SjLj
// pseudo-instructions.
@@ -1098,9 +1134,11 @@ multiclass AForm_3r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
let hasCtrlDep = 1 in {
let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKDOWN $amt1 $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+ "#ADJCALLSTACKUP $amt1 $amt2",
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -1219,9 +1257,15 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
// FIXME: should be able to write a pattern for PPCcondbranch, but can't use
// a two-value operand where a dag node expects two operands. :(
let isCodeGenOnly = 1 in {
- def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
- "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
- /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+ class BCC_class : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+ "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
+ /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+ def BCC : BCC_class;
+
+ // The same as BCC, except that it is not a terminator. Used for introducing
+ // a control-flow dependency without creating new blocks.
+ let isTerminator = 0 in def CTRL_DEP : BCC_class;
+
def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
"b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
@@ -1648,7 +1692,7 @@ let usesCustomInserter = 1 in {
}
// Instructions to support atomic operations
-let mayLoad = 1, hasSideEffects = 0 in {
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
"lbarx $rD, $src", IIC_LdStLWARX, []>,
Requires<[HasPartwordAtomics]>;
@@ -1681,7 +1725,7 @@ def LWAT : X_RD5_RS5_IM5<31, 582, (outs gprc:$rD), (ins gprc:$rA, u5imm:$FC),
Requires<[IsISA3_0]>;
}
-let Defs = [CR0], mayStore = 1, hasSideEffects = 0 in {
+let Defs = [CR0], mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
def STBCX : XForm_1<31, 694, (outs), (ins gprc:$rS, memrr:$dst),
"stbcx. $rS, $dst", IIC_LdStSTWCX, []>,
isDOT, Requires<[HasPartwordAtomics]>;
@@ -1694,7 +1738,7 @@ def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
"stwcx. $rS, $dst", IIC_LdStSTWCX, []>, isDOT;
}
-let mayStore = 1, hasSideEffects = 0 in
+let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
def STWAT : X_RD5_RS5_IM5<31, 710, (outs), (ins gprc:$rS, gprc:$rA, u5imm:$FC),
"stwat $rS, $rA, $FC", IIC_LdStStore>,
Requires<[IsISA3_0]>;
@@ -1740,7 +1784,7 @@ def LFD : DForm_1<50, (outs f8rc:$rD), (ins memri:$src),
// Unindexed (r+i) Loads with Update (preinc).
-let mayLoad = 1, hasSideEffects = 0 in {
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
def LBZU : DForm_1<35, (outs gprc:$rD, ptr_rc_nor0:$ea_result), (ins memri:$addr),
"lbzu $rD, $addr", IIC_LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
@@ -1813,7 +1857,7 @@ def LFDUX : XForm_1<31, 631, (outs f8rc:$rD, ptr_rc_nor0:$ea_result),
// Indexed (r+r) Loads.
//
-let PPC970_Unit = 2 in {
+let PPC970_Unit = 2, mayLoad = 1, mayStore = 0 in {
def LBZX : XForm_1<31, 87, (outs gprc:$rD), (ins memrr:$src),
"lbzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (zextloadi8 xaddr:$src))]>;
@@ -1827,8 +1871,6 @@ def LHZX : XForm_1<31, 279, (outs gprc:$rD), (ins memrr:$src),
def LWZX : XForm_1<31, 23, (outs gprc:$rD), (ins memrr:$src),
"lwzx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (load xaddr:$src))]>;
-
-
def LHBRX : XForm_1<31, 790, (outs gprc:$rD), (ins memrr:$src),
"lhbrx $rD, $src", IIC_LdStLoad,
[(set i32:$rD, (PPClbrx xoaddr:$src, i16))]>;
@@ -1860,7 +1902,7 @@ def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
//
// Unindexed (r+i) Stores.
-let PPC970_Unit = 2 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STB : DForm_1<38, (outs), (ins gprc:$rS, memri:$src),
"stb $rS, $src", IIC_LdStStore,
[(truncstorei8 i32:$rS, iaddr:$src)]>;
@@ -1879,7 +1921,7 @@ def STFD : DForm_1<54, (outs), (ins f8rc:$rS, memri:$dst),
}
// Unindexed (r+i) Stores with Update (preinc).
-let PPC970_Unit = 2, mayStore = 1 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STBU : DForm_1<39, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memri:$dst),
"stbu $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.reg = $ea_res">, NoEncode<"$ea_res">;
@@ -1948,7 +1990,7 @@ def STFDX : XForm_28<31, 727, (outs), (ins f8rc:$frS, memrr:$dst),
}
// Indexed (r+r) Stores with Update (preinc).
-let PPC970_Unit = 2, mayStore = 1 in {
+let PPC970_Unit = 2, mayStore = 1, mayLoad = 0 in {
def STBUX : XForm_8<31, 247, (outs ptr_rc_nor0:$ea_res), (ins gprc:$rS, memrr:$dst),
"stbux $rS, $dst", IIC_LdStStoreUpd, []>,
RegConstraint<"$dst.ptrreg = $ea_res">, NoEncode<"$ea_res">,
@@ -2531,6 +2573,14 @@ let Uses = [RM] in {
"mffs. $rT", IIC_IntMFFS, []>, isDOT;
}
+let Predicates = [IsISA3_0] in {
+def MODSW : XForm_8<31, 779, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "modsw $rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (srem i32:$rA, i32:$rB))]>;
+def MODUW : XForm_8<31, 267, (outs gprc:$rT), (ins gprc:$rA, gprc:$rB),
+ "moduw $rT, $rA, $rB", IIC_IntDivW,
+ [(set i32:$rT, (urem i32:$rA, i32:$rB))]>;
+}
let PPC970_Unit = 1, hasSideEffects = 0 in { // FXU Operations.
// XO-Form instructions. Arithmetic instructions that can set overflow bit
@@ -4164,6 +4214,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0
def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi $rA, $rS, $n",
+ (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>;
def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b",
@@ -4422,3 +4474,190 @@ def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>;
def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>;
} // IsISA3_0
+
+// Fast 32-bit reverse bits algorithm:
+// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
+// n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA);
+// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit):
+// n = ((n >> 2) & 0x33333333) | ((n << 2) & 0xCCCCCCCC);
+// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit):
+// n = ((n >> 4) & 0x0F0F0F0F) | ((n << 4) & 0xF0F0F0F0);
+// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4]):
+// Step 4.1: Put B4,B2 in the right position (rotate left 3 bytes):
+// n' = (n rotl 24); After which n' = [B4, B1, B2, B3]
+// Step 4.2: Insert B3 to the right position:
+// n' = rlwimi n', n, 8, 8, 15; After which n' = [B4, B3, B2, B3]
+// Step 4.3: Insert B1 to the right position:
+// n' = rlwimi n', n, 8, 24, 31; After which n' = [B4, B3, B2, B1]
+def MaskValues {
+ dag Lo1 = (ORI (LIS 0x5555), 0x5555);
+ dag Hi1 = (ORI (LIS 0xAAAA), 0xAAAA);
+ dag Lo2 = (ORI (LIS 0x3333), 0x3333);
+ dag Hi2 = (ORI (LIS 0xCCCC), 0xCCCC);
+ dag Lo4 = (ORI (LIS 0x0F0F), 0x0F0F);
+ dag Hi4 = (ORI (LIS 0xF0F0), 0xF0F0);
+}
+
+def Shift1 {
+ dag Right = (RLWINM $A, 31, 1, 31);
+ dag Left = (RLWINM $A, 1, 0, 30);
+}
+
+def Swap1 {
+ dag Bit = (OR (AND Shift1.Right, MaskValues.Lo1),
+ (AND Shift1.Left, MaskValues.Hi1));
+}
+
+def Shift2 {
+ dag Right = (RLWINM Swap1.Bit, 30, 2, 31);
+ dag Left = (RLWINM Swap1.Bit, 2, 0, 29);
+}
+
+def Swap2 {
+ dag Bits = (OR (AND Shift2.Right, MaskValues.Lo2),
+ (AND Shift2.Left, MaskValues.Hi2));
+}
+
+def Shift4 {
+ dag Right = (RLWINM Swap2.Bits, 28, 4, 31);
+ dag Left = (RLWINM Swap2.Bits, 4, 0, 27);
+}
+
+def Swap4 {
+ dag Bits = (OR (AND Shift4.Right, MaskValues.Lo4),
+ (AND Shift4.Left, MaskValues.Hi4));
+}
+
+def Rotate {
+ dag Left3Bytes = (RLWINM Swap4.Bits, 24, 0, 31);
+}
+
+def RotateInsertByte3 {
+ dag Left = (RLWIMI Rotate.Left3Bytes, Swap4.Bits, 8, 8, 15);
+}
+
+def RotateInsertByte1 {
+ dag Left = (RLWIMI RotateInsertByte3.Left, Swap4.Bits, 8, 24, 31);
+}
+
+def : Pat<(i32 (bitreverse i32:$A)),
+ (RLDICL_32 RotateInsertByte1.Left, 0, 32)>;
+
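
As a reading aid, here is the 32-bit algorithm above rendered as plain C++ (an editorial sketch, not patch code). The three swap statements correspond to the RLWINM/AND/OR groups, and the rlwimi inserts are modeled as mask-and-merge:

    #include <cstdint>

    uint32_t bitreverse32(uint32_t N) {
      N = ((N >> 1) & 0x55555555u) | ((N << 1) & 0xAAAAAAAAu); // Step 1: 1-bit swap
      N = ((N >> 2) & 0x33333333u) | ((N << 2) & 0xCCCCCCCCu); // Step 2: 2-bit swap
      N = ((N >> 4) & 0x0F0F0F0Fu) | ((N << 4) & 0xF0F0F0F0u); // Step 3: 4-bit swap
      uint32_t Rot8 = (N << 8) | (N >> 24);          // rotl(N, 8) = [B2,B3,B4,B1]
      uint32_t R = (N << 24) | (N >> 8);             // Step 4.1: [B4,B1,B2,B3]
      R = (R & ~0x00FF0000u) | (Rot8 & 0x00FF0000u); // Step 4.2: insert B3
      R = (R & ~0x000000FFu) | (Rot8 & 0x000000FFu); // Step 4.3: insert B1
      return R;                                      // [B4,B3,B2,B1]
    }
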
+// Fast 64-bit reverse bits algorithm:
+// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
+// n = ((n >> 1) & 0x5555555555555555) | ((n << 1) & 0xAAAAAAAAAAAAAAAA);
+// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit):
+// n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC);
+// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit):
+// n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0);
+// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4,B5,B6,B7,B8]):
+// Apply the byte-reverse algorithm from the fast 32-bit reverse above to
+// both the high 32 bits and the low 32 bits of the 64-bit value, then OR
+// the two results together to get the final value.
+def MaskValues64 {
+ dag Lo1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo1, sub_32));
+ dag Hi1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi1, sub_32));
+ dag Lo2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo2, sub_32));
+ dag Hi2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi2, sub_32));
+ dag Lo4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo4, sub_32));
+ dag Hi4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi4, sub_32));
+}
+
+def DWMaskValues {
+ dag Lo1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo1, 32, 31), 0x5555), 0x5555);
+ dag Hi1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi1, 32, 31), 0xAAAA), 0xAAAA);
+ dag Lo2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo2, 32, 31), 0x3333), 0x3333);
+ dag Hi2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi2, 32, 31), 0xCCCC), 0xCCCC);
+ dag Lo4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo4, 32, 31), 0x0F0F), 0x0F0F);
+ dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0);
+}
+
+def DWShift1 {
+ dag Right = (RLDICL $A, 63, 1);
+ dag Left = (RLDICR $A, 1, 62);
+}
+
+def DWSwap1 {
+ dag Bit = (OR8 (AND8 DWShift1.Right, DWMaskValues.Lo1),
+ (AND8 DWShift1.Left, DWMaskValues.Hi1));
+}
+
+def DWShift2 {
+ dag Right = (RLDICL DWSwap1.Bit, 62, 2);
+ dag Left = (RLDICR DWSwap1.Bit, 2, 61);
+}
+
+def DWSwap2 {
+ dag Bits = (OR8 (AND8 DWShift2.Right, DWMaskValues.Lo2),
+ (AND8 DWShift2.Left, DWMaskValues.Hi2));
+}
+
+def DWShift4 {
+ dag Right = (RLDICL DWSwap2.Bits, 60, 4);
+ dag Left = (RLDICR DWSwap2.Bits, 4, 59);
+}
+
+def DWSwap4 {
+ dag Bits = (OR8 (AND8 DWShift4.Right, DWMaskValues.Lo4),
+ (AND8 DWShift4.Left, DWMaskValues.Hi4));
+}
+
+// The bit swap is done; now start the byte swap.
+def DWExtractLo32 {
+ dag SubReg = (i32 (EXTRACT_SUBREG DWSwap4.Bits, sub_32));
+}
+
+def DWRotateLo32 {
+ dag Left24 = (RLWINM DWExtractLo32.SubReg, 24, 0, 31);
+}
+
+def DWLo32RotateInsertByte3 {
+ dag Left = (RLWIMI DWRotateLo32.Left24, DWExtractLo32.SubReg, 8, 8, 15);
+}
+
+// Lower 32 bits in the right order
+def DWLo32RotateInsertByte1 {
+ dag Left =
+ (RLWIMI DWLo32RotateInsertByte3.Left, DWExtractLo32.SubReg, 8, 24, 31);
+}
+
+def ExtendLo32 {
+ dag To64Bit =
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ DWLo32RotateInsertByte1.Left, sub_32));
+}
+
+def DWShiftHi32 { // SRDI DWSwap4.Bits, 32
+ dag ToLo32 = (RLDICL DWSwap4.Bits, 32, 32);
+}
+
+def DWExtractHi32 {
+ dag SubReg = (i32 (EXTRACT_SUBREG DWShiftHi32.ToLo32, sub_32));
+}
+
+def DWRotateHi32 {
+ dag Left24 = (RLWINM DWExtractHi32.SubReg, 24, 0, 31);
+}
+
+def DWHi32RotateInsertByte3 {
+ dag Left = (RLWIMI DWRotateHi32.Left24, DWExtractHi32.SubReg, 8, 8, 15);
+}
+
+// High 32 bits in the right order, but in the low 32-bit position
+def DWHi32RotateInsertByte1 {
+ dag Left =
+ (RLWIMI DWHi32RotateInsertByte3.Left, DWExtractHi32.SubReg, 8, 24, 31);
+}
+
+def ExtendHi32 {
+ dag To64Bit =
+ (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ DWHi32RotateInsertByte1.Left, sub_32));
+}
+
+def DWShiftLo32 { // SLDI ExtendHi32.To64Bit, 32
+ dag ToHi32 = (RLDICR ExtendHi32.To64Bit, 32, 31);
+}
+
+def : Pat<(i64 (bitreverse i64:$A)),
+ (OR8 DWShiftLo32.ToHi32, ExtendLo32.To64Bit)>;
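
And the matching 64-bit sketch (again editorial, and deliberately not a dag-for-dag mirror of the defs above): after the three swap stages, each 32-bit half is byte-reversed, and completing the full 64-bit byte reversal places the reversed low word in the high half of the result.

    #include <cstdint>

    static uint32_t ByteRev32(uint32_t W) {
      return (W << 24) | ((W & 0xFF00u) << 8) | ((W >> 8) & 0xFF00u) | (W >> 24);
    }

    uint64_t bitreverse64(uint64_t N) {
      N = ((N >> 1) & 0x5555555555555555ULL) | ((N << 1) & 0xAAAAAAAAAAAAAAAAULL);
      N = ((N >> 2) & 0x3333333333333333ULL) | ((N << 2) & 0xCCCCCCCCCCCCCCCCULL);
      N = ((N >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((N << 4) & 0xF0F0F0F0F0F0F0F0ULL);
      uint64_t Lo = ByteRev32(static_cast<uint32_t>(N));       // low word reversed
      uint64_t Hi = ByteRev32(static_cast<uint32_t>(N >> 32)); // high word reversed
      return (Lo << 32) | Hi; // the byte reversal swaps the two words
    }
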
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 0d9e345..942e8b3 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -62,7 +62,7 @@ def SDTVecConv : SDTypeProfile<1, 2, [
]>;
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
[SDNPHasChain, SDNPMayStore]>;
def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>;
@@ -117,7 +117,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
let Uses = [RM] in {
// Load indexed instructions
- let mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
def LXSDX : XX1Form<31, 588,
(outs vsfrc:$XT), (ins memrr:$src),
@@ -138,11 +138,11 @@ let Uses = [RM] in {
def LXVW4X : XX1Form<31, 780,
(outs vsrc:$XT), (ins memrr:$src),
"lxvw4x $XT, $src", IIC_LdStLFD,
- [(set v4i32:$XT, (int_ppc_vsx_lxvw4x xoaddr:$src))]>;
+ []>;
} // mayLoad
// Store indexed instructions
- let mayStore = 1 in {
+ let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
def STXSDX : XX1Form<31, 716,
(outs), (ins vsfrc:$XT, memrr:$dst),
@@ -160,7 +160,7 @@ let Uses = [RM] in {
def STXVW4X : XX1Form<31, 908,
(outs), (ins vsrc:$XT, memrr:$dst),
"stxvw4x $XT, $dst", IIC_LdStSTFD,
- [(store v4i32:$XT, xoaddr:$dst)]>;
+ []>;
}
} // mayStore
@@ -843,7 +843,9 @@ let Uses = [RM] in {
def XXPERMDI : XX3Form_2<60, 10,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, u2imm:$DM),
- "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm, []>;
+ "xxpermdi $XT, $XA, $XB, $DM", IIC_VecPerm,
+ [(set v2i64:$XT, (PPCxxpermdi v2i64:$XA, v2i64:$XB,
+ imm32SExt16:$DM))]>;
let isCodeGenOnly = 1 in
def XXPERMDIs : XX3Form_2s<60, 10, (outs vsrc:$XT), (ins vsfrc:$XA, u2imm:$DM),
"xxpermdi $XT, $XA, $XA, $DM", IIC_VecPerm, []>;
@@ -1041,8 +1043,6 @@ let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
// Stores.
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
(STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
(STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
@@ -1053,8 +1053,12 @@ let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>;
def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+ (STXVW4X $rS, xoaddr:$dst)>;
}
// Permutes.
@@ -1064,6 +1068,10 @@ def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+// PPCvecshl XT, XA, XA, 2 can be selected as either XXSLDWI XT,XA,XA,2 or
+// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2); the latter is more profitable.
+def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>;
+
// Selects.
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
(SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
@@ -1197,7 +1205,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
[(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
// VSX scalar loads introduced in ISA 2.07
- let mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
def LXSSPX : XX1Form<31, 524, (outs vssrc:$XT), (ins memrr:$src),
"lxsspx $XT, $src", IIC_LdStLFD,
@@ -1211,7 +1219,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
} // mayLoad
// VSX scalar stores introduced in ISA 2.07
- let mayStore = 1 in {
+ let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
def STXSSPX : XX1Form<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
"stxsspx $XT, $dst", IIC_LdStSTFD,
@@ -1410,6 +1418,11 @@ let Predicates = [HasDirectMove] in {
"mfvsrd $rA, $XT", IIC_VecGeneral,
[(set i64:$rA, (PPCmfvsr f64:$XT))]>,
Requires<[In64BitMode]>;
+ let isCodeGenOnly = 1 in
+ def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vrrc:$XT),
+ "mfvsrd $rA, $XT", IIC_VecGeneral,
+ []>,
+ Requires<[In64BitMode]>;
def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
"mfvsrwz $rA, $XT", IIC_VecGeneral,
[(set i32:$rA, (PPCmfvsr f64:$XT))]>;
@@ -1429,7 +1442,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrws $XT, $rA", IIC_VecGeneral, []>;
- def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB),
"mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
[]>, Requires<[In64BitMode]>;
@@ -1440,6 +1453,13 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
} // IsISA3_0, HasDirectMove
} // UseVSXReg = 1
+// We want to parse this from asm, but we don't want to emit it, as it would
+// be emitted with a VSX register. So leave Emit = 0 here.
+def : InstAlias<"mfvrd $rA, $XT",
+ (MFVRD g8rc:$rA, vrrc:$XT), 0>;
+def : InstAlias<"mffprd $rA, $src",
+ (MFVSRD g8rc:$rA, f8rc:$src)>;
+
/* Direct moves of various widths from GPR's into VSR's. Each move lines
the value up into element 0 (both BE and LE). Namely, entities smaller than
a doubleword are shifted left and moved for BE. For LE, they're moved, then
@@ -1878,8 +1898,100 @@ let Predicates = [IsLittleEndian, HasVSX] in
def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
(f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
- def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
- def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+
+// Variable index unsigned vector_extract on Power9
+let Predicates = [HasP9Altivec, IsLittleEndian] in {
+ def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
+ (VEXTUBRX $Idx, $S)>;
+
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
+ (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
+ (VEXTUHRX (LI8 0), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
+ (VEXTUHRX (LI8 2), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
+ (VEXTUHRX (LI8 4), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
+ (VEXTUHRX (LI8 6), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
+ (VEXTUHRX (LI8 8), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
+ (VEXTUHRX (LI8 10), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
+ (VEXTUHRX (LI8 12), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
+ (VEXTUHRX (LI8 14), $S)>;
+
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
+ (VEXTUWRX (LI8 0), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
+ (VEXTUWRX (LI8 4), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
+ (VEXTUWRX (LI8 8), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
+ (VEXTUWRX (LI8 12), $S)>;
+
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
+ (EXTSW (VEXTUWRX (LI8 0), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
+ (EXTSW (VEXTUWRX (LI8 4), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
+ (EXTSW (VEXTUWRX (LI8 8), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
+ (EXTSW (VEXTUWRX (LI8 12), $S))>;
+}
+let Predicates = [HasP9Altivec, IsBigEndian] in {
+ def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
+ (VEXTUBLX $Idx, $S)>;
+
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
+ (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
+ (VEXTUHLX (LI8 0), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
+ (VEXTUHLX (LI8 2), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
+ (VEXTUHLX (LI8 4), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
+ (VEXTUHLX (LI8 6), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
+ (VEXTUHLX (LI8 8), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
+ (VEXTUHLX (LI8 10), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
+ (VEXTUHLX (LI8 12), $S)>;
+ def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
+ (VEXTUHLX (LI8 14), $S)>;
+
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
+ (VEXTUWLX (LI8 0), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
+ (VEXTUWLX (LI8 4), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
+ (VEXTUWLX (LI8 8), $S)>;
+ def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
+ (VEXTUWLX (LI8 12), $S)>;
+
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
+ (EXTSW (VEXTUWLX (LI8 0), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
+ (EXTSW (VEXTUWLX (LI8 4), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
+ (EXTSW (VEXTUWLX (LI8 8), $S))>;
+ def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
+ (EXTSW (VEXTUWLX (LI8 12), $S))>;
+}
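
A note on the RLWINM8 operands in the variable-index patterns above (editorial): vextub[lr]x/vextuh[lr]x/vextuw[lr]x take a byte offset, so the element index must be scaled to bytes. The rotate-and-mask pairs compute exactly the following (hypothetical helper names):

    #include <cstdint>

    // rotate left 1, keep bits 28-30  =>  (Idx * 2) & 0xE, for halfword elements
    uint64_t halfwordByteOffset(uint64_t Idx) { return (Idx << 1) & 0xE; }
    // rotate left 2, keep bits 28-29  =>  (Idx * 4) & 0xC, for word elements
    uint64_t wordByteOffset(uint64_t Idx)     { return (Idx << 2) & 0xC; }
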
let Predicates = [IsLittleEndian, HasDirectMove] in {
// v16i8 scalar <-> vector conversions (LE)
@@ -2186,7 +2298,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
} // UseVSXReg = 1
// Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
- // seperate pattern so that it can convert the input register class from
+ // separate pattern so that it can convert the input register class from
// VRRC(v8i16) to VSRC.
def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
(v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
@@ -2320,6 +2432,16 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc, []>;
def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
+ // Vector Reverse
+ def : Pat<(v8i16 (PPCxxreverse v8i16 :$A)),
+ (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
+ def : Pat<(v4i32 (PPCxxreverse v4i32 :$A)),
+ (v4i32 (XXBRW $A))>;
+ def : Pat<(v2i64 (PPCxxreverse v2i64 :$A)),
+ (v2i64 (XXBRD $A))>;
+ def : Pat<(v1i128 (PPCxxreverse v1i128 :$A)),
+ (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
+
// Vector Permute
def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
IIC_VecPerm, []>;
@@ -2335,7 +2457,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
// PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
- let mayLoad = 1 in {
+ let mayLoad = 1, mayStore = 0 in {
// Load Vector
def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
"lxv $XT, $src", IIC_LdStLFD, []>, UseVSXReg;
@@ -2365,8 +2487,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Load Vector Indexed
def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
- [(set v2f64:$XT, (load xoaddr:$src))]>;
-
+ [(set v2f64:$XT, (load xaddr:$src))]>;
// Load Vector (Left-justified) with Length
def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
"lxvl $XT, $src, $rB", IIC_LdStLoad,
@@ -2383,7 +2504,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
// PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
- let mayStore = 1 in {
+ let mayStore = 1, mayLoad = 0 in {
// Store Vector
def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
"stxv $XT, $dst", IIC_LdStSTFD, []>, UseVSXReg;
@@ -2416,7 +2537,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
// Store Vector Indexed
def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
- [(store v2f64:$XT, xoaddr:$dst)]>;
+ [(store v2f64:$XT, xaddr:$dst)]>;
// Store Vector (Left-justified) with Length
def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB),
@@ -2484,21 +2605,42 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
(v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
} // IsLittleEndian, HasP9Vector
- def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>;
+ // D-Form Load/Store
+ def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>;
+ def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>;
+
+ def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst),
+ (STXV $rS, memrix16:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst),
+ (STXV $rS, memrix16:$dst)>;
+
+
+ def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
+ def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+ def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
(STXVX $rS, xoaddr:$dst)>;
-
def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
(v4i32 (LXVWSX xoaddr:$src))>;
def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
@@ -2650,21 +2792,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
let isPseudo = 1 in {
def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src),
"#DFLOADf32",
- [(set f32:$XT, (load iaddr:$src))]>;
+ [(set f32:$XT, (load ixaddr:$src))]>;
def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src),
"#DFLOADf64",
- [(set f64:$XT, (load iaddr:$src))]>;
+ [(set f64:$XT, (load ixaddr:$src))]>;
def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst),
"#DFSTOREf32",
- [(store f32:$XT, iaddr:$dst)]>;
+ [(store f32:$XT, ixaddr:$dst)]>;
def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
"#DFSTOREf64",
- [(store f64:$XT, iaddr:$dst)]>;
+ [(store f64:$XT, ixaddr:$dst)]>;
}
- def : Pat<(f64 (extloadf32 iaddr:$src)),
- (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>;
- def : Pat<(f32 (fpround (extloadf32 iaddr:$src))),
- (f32 (DFLOADf32 iaddr:$src))>;
+ def : Pat<(f64 (extloadf32 ixaddr:$src)),
+ (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>;
+ def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))),
+ (f32 (DFLOADf32 ixaddr:$src))>;
} // end HasP9Vector, AddedComplexity
// Integer extend helper dags 32 -> 64
@@ -2681,6 +2823,58 @@ def DblToFlt {
dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0))));
dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
}
+
+def ByteToWord {
+ dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8));
+ dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8));
+ dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8));
+ dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8));
+ dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8));
+ dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8));
+ dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8));
+ dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8));
+}
+
+def ByteToDWord {
+ dag LE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8));
+ dag LE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8));
+ dag BE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8));
+ dag BE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8));
+}
+
+def HWordToWord {
+ dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16));
+ dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16));
+ dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16));
+ dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16));
+ dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16));
+ dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16));
+ dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16));
+ dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16));
+}
+
+def HWordToDWord {
+ dag LE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16));
+ dag LE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16));
+ dag BE_A0 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16));
+ dag BE_A1 = (i64 (sext_inreg
+ (i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16));
+}
+
+def WordToDWord {
+ dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0))));
+ dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2))));
+ dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1))));
+ dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3))));
+}
+
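
The helper dags above each pick one element per destination lane and sign-extend it in place, which is what the vexts* instructions compute. A small scalar model of the byte-to-word case (editorial; the function name and layout are illustrative only):

    #include <cstdint>

    void extendByteToWord(const int8_t Src[16], int32_t Dst[4], bool IsLittleEndian) {
      for (int I = 0; I < 4; ++I) {
        int Lane = IsLittleEndian ? 4 * I : 4 * I + 3; // LE_A0..A3 vs. BE_A0..A3
        Dst[I] = Src[Lane];                            // implicit sign extension
      }
    }
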
def FltToIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (extloadf32 xoaddr:$A)))));
}
@@ -2690,9 +2884,15 @@ def FltToUIntLoad {
def FltToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A)))));
}
+def FltToLongLoadP9 {
+ dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A)))));
+}
def FltToULongLoad {
dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A)))));
}
+def FltToULongLoadP9 {
+ dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A)))));
+}
def FltToLong {
dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A))));
}
@@ -2714,9 +2914,15 @@ def DblToULong {
def DblToIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A)))));
}
+def DblToIntLoadP9 {
+ dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A)))));
+}
def DblToUIntLoad {
dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A)))));
}
+def DblToUIntLoadP9 {
+ dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A)))));
+}
def DblToLongLoad {
dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A)))));
}
@@ -2884,19 +3090,19 @@ let AddedComplexity = 400 in {
(v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>;
def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
(v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>;
- def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
+ def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
- def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
+ (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>;
+ def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)),
(v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>;
- def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
+ (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>;
+ def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
- (DFLOADf32 iaddr:$A),
+ (DFLOADf32 ixaddr:$A),
VSFRC)), 0))>;
- def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
+ def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)),
(v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
- (DFLOADf32 iaddr:$A),
+ (DFLOADf32 ixaddr:$A),
VSFRC)), 0))>;
}
@@ -2921,4 +3127,49 @@ let AddedComplexity = 400 in {
(VMRGOW (COPY_TO_REGCLASS (MTVSRDD AnyExts.D, AnyExts.B), VSRC),
(COPY_TO_REGCLASS (MTVSRDD AnyExts.C, AnyExts.A), VSRC))>;
}
+ // P9 Altivec instructions that can be used to build vectors.
+ // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td so that they
+ // compete with the complexities of the existing build vector patterns in
+ // this file.
+ let Predicates = [HasP9Altivec, IsLittleEndian] in {
+ def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)),
+ (v2i64 (VEXTSW2D $A))>;
+ def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)),
+ (v2i64 (VEXTSH2D $A))>;
+ def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1,
+ HWordToWord.LE_A2, HWordToWord.LE_A3)),
+ (v4i32 (VEXTSH2W $A))>;
+ def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1,
+ ByteToWord.LE_A2, ByteToWord.LE_A3)),
+ (v4i32 (VEXTSB2W $A))>;
+ def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)),
+ (v2i64 (VEXTSB2D $A))>;
+ }
+
+ let Predicates = [HasP9Altivec, IsBigEndian] in {
+ def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)),
+ (v2i64 (VEXTSW2D $A))>;
+ def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)),
+ (v2i64 (VEXTSH2D $A))>;
+ def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1,
+ HWordToWord.BE_A2, HWordToWord.BE_A3)),
+ (v4i32 (VEXTSH2W $A))>;
+ def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1,
+ ByteToWord.BE_A2, ByteToWord.BE_A3)),
+ (v4i32 (VEXTSB2W $A))>;
+ def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)),
+ (v2i64 (VEXTSB2D $A))>;
+ }
+
+ let Predicates = [HasP9Altivec] in {
+ def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)),
+ (v2i64 (VEXTSB2D $A))>;
+ def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)),
+ (v2i64 (VEXTSH2D $A))>;
+ def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)),
+ (v2i64 (VEXTSW2D $A))>;
+ def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)),
+ (v4i32 (VEXTSB2W $A))>;
+ def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)),
+ (v4i32 (VEXTSH2W $A))>;
+ }
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
index 2c3e755..a349fa1 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp
@@ -39,6 +39,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -72,9 +73,10 @@ namespace {
public:
static char ID; // Pass ID, replacement for typeid
- PPCLoopPreIncPrep() : FunctionPass(ID), TM(nullptr) {
+ PPCLoopPreIncPrep() : FunctionPass(ID) {
initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
}
+
PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry());
}
@@ -93,7 +95,7 @@ namespace {
bool rotateLoop(Loop *L);
private:
- PPCTargetMachine *TM;
+ PPCTargetMachine *TM = nullptr;
DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index e527b01..b310493 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
#include "MCTargetDesc/PPCMCExpr.h"
+#include "PPC.h"
#include "PPCSubtarget.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
@@ -148,7 +148,7 @@ void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
assert(!MO.getSubReg() && "Subregs should be eliminated!");
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 2413af3..ff5f17c 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -19,9 +19,9 @@
//
//===---------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
#include "PPC.h"
#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -147,9 +147,9 @@ bool PPCMIPeephole::simplifyCode(void) {
<< "Optimizing load-and-splat/splat "
"to load-and-splat/copy: ");
DEBUG(MI.dump());
- BuildMI(MBB, &MI, MI.getDebugLoc(),
- TII->get(PPC::COPY), MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1));
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
ToErase = &MI;
Simplified = true;
}
@@ -169,9 +169,9 @@ bool PPCMIPeephole::simplifyCode(void) {
<< "Optimizing splat/swap or splat/splat "
"to splat/copy: ");
DEBUG(MI.dump());
- BuildMI(MBB, &MI, MI.getDebugLoc(),
- TII->get(PPC::COPY), MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1));
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
ToErase = &MI;
Simplified = true;
}
@@ -194,9 +194,9 @@ bool PPCMIPeephole::simplifyCode(void) {
else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
DEBUG(MI.dump());
- BuildMI(MBB, &MI, MI.getDebugLoc(),
- TII->get(PPC::COPY), MI.getOperand(0).getReg())
- .addOperand(DefMI->getOperand(1));
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(DefMI->getOperand(1));
ToErase = &MI;
Simplified = true;
}
@@ -251,7 +251,7 @@ bool PPCMIPeephole::simplifyCode(void) {
DEBUG(MI.dump());
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(OpNo));
+ .add(MI.getOperand(OpNo));
ToErase = &MI;
Simplified = true;
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index 9d91e31..bc2d9a0 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -8,14 +8,13 @@
//===----------------------------------------------------------------------===//
#include "PPCMachineFunctionInfo.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
-void PPCFunctionInfo::anchor() { }
+void PPCFunctionInfo::anchor() {}
MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
const DataLayout &DL = MF.getDataLayout();
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 4c29aa0..202e100 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
@@ -26,17 +27,17 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// FramePointerSaveIndex - Frame index of where the old frame pointer is
/// stored. Also used as an anchor for instructions that need to be altered
/// when using frame pointers (dyna_add, dyna_sub.)
- int FramePointerSaveIndex;
+ int FramePointerSaveIndex = 0;
/// ReturnAddrSaveIndex - Frame index of where the return address is stored.
///
- int ReturnAddrSaveIndex;
+ int ReturnAddrSaveIndex = 0;
/// Frame index where the old base pointer is stored.
- int BasePointerSaveIndex;
+ int BasePointerSaveIndex = 0;
/// Frame index where the old PIC base pointer is stored.
- int PICBasePointerSaveIndex;
+ int PICBasePointerSaveIndex = 0;
/// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
/// function. This is only valid after the initial scan of the function by
@@ -44,54 +45,58 @@ class PPCFunctionInfo : public MachineFunctionInfo {
bool MustSaveLR;
/// Does this function have any stack spills.
- bool HasSpills;
+ bool HasSpills = false;
/// Does this function spill using instructions with only r+r (not r+i)
/// forms.
- bool HasNonRISpills;
+ bool HasNonRISpills = false;
/// SpillsCR - Indicates whether CR is spilled in the current function.
- bool SpillsCR;
+ bool SpillsCR = false;
/// Indicates whether VRSAVE is spilled in the current function.
- bool SpillsVRSAVE;
+ bool SpillsVRSAVE = false;
/// LRStoreRequired - The bool indicates whether there is some explicit use of
/// the LR/LR8 stack slot that is not obvious from scanning the code. This
/// requires that the code generator produce a store of LR to the stack on
/// entry, even though LR may otherwise apparently not be used.
- bool LRStoreRequired;
+ bool LRStoreRequired = false;
/// This function makes use of the PPC64 ELF TOC base pointer (register r2).
- bool UsesTOCBasePtr;
+ bool UsesTOCBasePtr = false;
/// MinReservedArea - This is the frame size that is at least reserved in a
/// potential caller (parameter+linkage area).
- unsigned MinReservedArea;
+ unsigned MinReservedArea = 0;
/// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
/// amount the stack pointer is adjusted to make the frame bigger for tail
/// calls. Used for creating an area before the register spill area.
- int TailCallSPDelta;
+ int TailCallSPDelta = 0;
/// HasFastCall - Does this function contain a fast call. Used to determine
/// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
- bool HasFastCall;
+ bool HasFastCall = false;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
- int VarArgsFrameIndex;
+ int VarArgsFrameIndex = 0;
+
/// VarArgsStackOffset - StackOffset for start of stack
/// arguments.
- int VarArgsStackOffset;
+
+ int VarArgsStackOffset = 0;
+
/// VarArgsNumGPR - Index of the first unused integer
/// register for parameter passing.
- unsigned VarArgsNumGPR;
+ unsigned VarArgsNumGPR = 0;
+
/// VarArgsNumFPR - Index of the first unused double
/// register for parameter passing.
- unsigned VarArgsNumFPR;
+ unsigned VarArgsNumFPR = 0;
/// CRSpillFrameIndex - FrameIndex for CR spill slot for 32-bit SVR4.
- int CRSpillFrameIndex;
+ int CRSpillFrameIndex = 0;
/// If any of CR[2-4] need to be saved in the prologue and restored in the
/// epilogue then they are added to this array. This is used for the
@@ -102,35 +107,14 @@ class PPCFunctionInfo : public MachineFunctionInfo {
MachineFunction &MF;
/// Whether this uses the PIC Base register or not.
- bool UsesPICBase;
+ bool UsesPICBase = false;
/// True if this function has a subset of CSRs that is handled explicitly via
/// copies
- bool IsSplitCSR;
+ bool IsSplitCSR = false;
public:
- explicit PPCFunctionInfo(MachineFunction &MF)
- : FramePointerSaveIndex(0),
- ReturnAddrSaveIndex(0),
- BasePointerSaveIndex(0),
- PICBasePointerSaveIndex(0),
- HasSpills(false),
- HasNonRISpills(false),
- SpillsCR(false),
- SpillsVRSAVE(false),
- LRStoreRequired(false),
- UsesTOCBasePtr(false),
- MinReservedArea(0),
- TailCallSPDelta(0),
- HasFastCall(false),
- VarArgsFrameIndex(0),
- VarArgsStackOffset(0),
- VarArgsNumGPR(0),
- VarArgsNumFPR(0),
- CRSpillFrameIndex(0),
- MF(MF),
- UsesPICBase(0),
- IsSplitCSR(false) {}
+ explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {}
int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
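For readers less familiar with the idiom: the long constructor initializer list above is replaced by C++11 in-class default member initializers, so the constructor only needs to bind MF. A minimal standalone sketch of the pattern (names invented for illustration, not the real PPCFunctionInfo fields):

#include <cassert>

struct Example {
  int SaveIndex = 0;      // was SaveIndex(0) in the init list
  bool HasSpills = false; // was HasSpills(false)
  int &Anchor;            // references still need the init list
  explicit Example(int &A) : Anchor(A) {}
};

int main() {
  int X = 0;
  Example E(X);
  assert(E.SaveIndex == 0 && !E.HasSpills);
}

Every field not mentioned in the initializer list picks up its default, which is exactly how the constructor above shrinks to a single MF(MF) binding.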
@@ -211,7 +195,6 @@ public:
MCSymbol *getTOCOffsetSymbol() const;
};
-} // end of namespace llvm
-
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_POWERPC_PPCMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index e492014..9207165 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -209,89 +209,84 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// The ZERO register is not really a register, but the representation of r0
// when used in instructions that treat r0 as the constant 0.
- Reserved.set(PPC::ZERO);
- Reserved.set(PPC::ZERO8);
+ markSuperRegs(Reserved, PPC::ZERO);
// The FP register is also not really a register, but is the representation
// of the frame pointer register used by ISD::FRAMEADDR.
- Reserved.set(PPC::FP);
- Reserved.set(PPC::FP8);
+ markSuperRegs(Reserved, PPC::FP);
// The BP register is also not really a register, but is the representation
// of the base pointer register used by setjmp.
- Reserved.set(PPC::BP);
- Reserved.set(PPC::BP8);
+ markSuperRegs(Reserved, PPC::BP);
// The counter registers must be reserved so that counter-based loops can
// be correctly formed (and the mtctr instructions are not DCE'd).
- Reserved.set(PPC::CTR);
- Reserved.set(PPC::CTR8);
+ markSuperRegs(Reserved, PPC::CTR);
+ markSuperRegs(Reserved, PPC::CTR8);
- Reserved.set(PPC::R1);
- Reserved.set(PPC::LR);
- Reserved.set(PPC::LR8);
- Reserved.set(PPC::RM);
+ markSuperRegs(Reserved, PPC::R1);
+ markSuperRegs(Reserved, PPC::LR);
+ markSuperRegs(Reserved, PPC::LR8);
+ markSuperRegs(Reserved, PPC::RM);
if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
- Reserved.set(PPC::VRSAVE);
+ markSuperRegs(Reserved, PPC::VRSAVE);
// The SVR4 ABI reserves r2 and r13
if (Subtarget.isSVR4ABI()) {
- Reserved.set(PPC::R2); // System-reserved register
- Reserved.set(PPC::R13); // Small Data Area pointer register
+ // We only reserve r2 if we need to use the TOC pointer. If we have no
+ // explicit uses of the TOC pointer (meaning we're a leaf function with
+ // no constant-pool loads, etc.) and we have no potential uses inside an
+ // inline asm block, then we can treat r2 as an ordinary callee-saved
+ // register.
+ const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ if (!TM.isPPC64() || FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
+ markSuperRegs(Reserved, PPC::R2); // System-reserved register
+ markSuperRegs(Reserved, PPC::R13); // Small Data Area pointer register
}
// On PPC64, r13 is the thread pointer. Never allocate this register.
- if (TM.isPPC64()) {
- Reserved.set(PPC::R13);
-
- Reserved.set(PPC::X1);
- Reserved.set(PPC::X13);
-
- if (TFI->needsFP(MF))
- Reserved.set(PPC::X31);
-
- if (hasBasePointer(MF))
- Reserved.set(PPC::X30);
-
- // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
- if (Subtarget.isSVR4ABI()) {
- // We only reserve r2 if we need to use the TOC pointer. If we have no
- // explicit uses of the TOC pointer (meaning we're a leaf function with
- // no constant-pool loads, etc.) and we have no potential uses inside an
- // inline asm block, then we can treat r2 has an ordinary callee-saved
- // register.
- const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- if (FuncInfo->usesTOCBasePtr() || MF.hasInlineAsm())
- Reserved.set(PPC::X2);
- else
- Reserved.reset(PPC::R2);
- }
- }
+ if (TM.isPPC64())
+ markSuperRegs(Reserved, PPC::R13);
if (TFI->needsFP(MF))
- Reserved.set(PPC::R31);
+ markSuperRegs(Reserved, PPC::R31);
bool IsPositionIndependent = TM.isPositionIndependent();
if (hasBasePointer(MF)) {
if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
- Reserved.set(PPC::R29);
+ markSuperRegs(Reserved, PPC::R29);
else
- Reserved.set(PPC::R30);
+ markSuperRegs(Reserved, PPC::R30);
}
if (Subtarget.isSVR4ABI() && !TM.isPPC64() && IsPositionIndependent)
- Reserved.set(PPC::R30);
+ markSuperRegs(Reserved, PPC::R30);
// Reserve Altivec registers when Altivec is unavailable.
if (!Subtarget.hasAltivec())
for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
IE = PPC::VRRCRegClass.end(); I != IE; ++I)
- Reserved.set(*I);
+ markSuperRegs(Reserved, *I);
+ assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
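markSuperRegs reserves a register together with every register that contains it, which is why paired calls such as Reserved.set(PPC::ZERO) / Reserved.set(PPC::ZERO8) collapse into a single call, and why the new closing assert can verify that no super-register was missed. A hedged standalone sketch of the idea (the register indices and alias table are invented for illustration, not LLVM's real encoding):

#include <bitset>
#include <vector>

// Hypothetical universe: for each register, the registers that contain it.
static const std::vector<std::vector<unsigned>> SuperRegsOf = {
    /*R0*/ {2}, /*R1*/ {3}, /*X0*/ {}, /*X1*/ {}};

// Mark Reg and, recursively, every super-register of Reg.
void markSuperRegs(std::bitset<4> &Reserved, unsigned Reg) {
  Reserved.set(Reg);
  for (unsigned Super : SuperRegsOf[Reg])
    markSuperRegs(Reserved, Super);
}

In this toy model, markSuperRegs(Reserved, 0) sets bit 0 (R0) and bit 2 (its 64-bit container), mirroring the old two-call pattern in one step.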
+bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg,
+ const MachineFunction &MF) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+ if (TM.isELFv2ABI() && PhysReg == PPC::X2) {
+ // X2 is guaranteed to be preserved within a function if it is reserved.
+ // The reason it's reserved is that it's the TOC pointer (and the function
+ // uses the TOC). In functions where it isn't reserved (i.e. leaf functions
+ // with no TOC access), we can't claim that it is preserved.
+ return (getReservedRegs(MF).test(PPC::X2));
+ } else {
+ return false;
+ }
+}
+
unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const PPCFrameLowering *TFI = getFrameLowering(MF);
@@ -394,9 +389,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
- BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
- .addReg(PPC::R31)
- .addImm(FrameSize);
+ if (LP64)
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), Reg)
+ .addReg(PPC::X31)
+ .addImm(FrameSize);
+ else
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+ .addReg(PPC::R31)
+ .addImm(FrameSize);
} else if (LP64) {
BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
.addImm(0)
@@ -483,8 +483,10 @@ void PPCRegisterInfo::lowerDynamicAreaOffset(
const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+ bool is64Bit = TM.isPPC64();
DebugLoc dl = MI.getDebugLoc();
- BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg())
+ BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LI8 : PPC::LI),
+ MI.getOperand(0).getReg())
.addImm(maxCallFrameSize);
MBB.erase(II);
}
@@ -752,19 +754,31 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
return false;
}
-// Figure out if the offset in the instruction must be a multiple of 4.
-// This is true for instructions like "STD".
-static bool usesIXAddr(const MachineInstr &MI) {
+// If the offset must be a multiple of some value, return what that value is.
+static unsigned offsetMinAlign(const MachineInstr &MI) {
unsigned OpC = MI.getOpcode();
switch (OpC) {
default:
- return false;
+ return 1;
case PPC::LWA:
case PPC::LWA_32:
case PPC::LD:
+ case PPC::LDU:
case PPC::STD:
- return true;
+ case PPC::STDU:
+ case PPC::DFLOADf32:
+ case PPC::DFLOADf64:
+ case PPC::DFSTOREf32:
+ case PPC::DFSTOREf64:
+ case PPC::LXSD:
+ case PPC::LXSSP:
+ case PPC::STXSD:
+ case PPC::STXSSP:
+ return 4;
+ case PPC::LXV:
+ case PPC::STXV:
+ return 16;
}
}
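The old usesIXAddr predicate only knew about DS-form instructions, whose 16-bit displacement must be a multiple of 4; offsetMinAlign generalizes this so DQ-form vector accesses (LXV/STXV), whose displacement must be a multiple of 16, are handled as well. A hedged standalone sketch of the legality check that eliminateFrameIndex and isFrameOffsetLegal now share (the form names are placeholders for the real opcode switch):

#include <cstdint>

enum class Form { D, DS, DQ }; // stand-ins for D-, DS- and DQ-form opcodes

unsigned offsetMinAlign(Form F) {
  switch (F) {
  case Form::DS: return 4;  // e.g. LD/STD: displacement shifted left by 2
  case Form::DQ: return 16; // e.g. LXV/STXV: displacement shifted left by 4
  default:       return 1;  // plain D-form: any signed 16-bit displacement
  }
}

bool offsetIsLegal(Form F, int64_t Offset) {
  bool FitsIn16 = Offset >= -32768 && Offset <= 32767; // isInt<16>(Offset)
  return FitsIn16 && (Offset % offsetMinAlign(F)) == 0;
}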
@@ -850,9 +864,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(FIOperandNum).ChangeToRegister(
FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false);
- // Figure out if the offset in the instruction is shifted right two bits.
- bool isIXAddr = usesIXAddr(MI);
-
// If the instruction is not present in ImmToIdxMap, then it has no immediate
// form (and must be r+r).
bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP &&
@@ -881,7 +892,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// happen in invalid code.
assert(OpC != PPC::DBG_VALUE &&
"This should be handled in a target-independent way");
- if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) ||
+ if (!noImmForm && ((isInt<16>(Offset) &&
+ ((Offset % offsetMinAlign(MI)) == 0)) ||
OpC == TargetOpcode::STACKMAP ||
OpC == TargetOpcode::PATCHPOINT)) {
MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
@@ -1074,5 +1086,5 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
MI->getOpcode() == TargetOpcode::STACKMAP ||
MI->getOpcode() == TargetOpcode::PATCHPOINT ||
- (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
+ (isInt<16>(Offset) && (Offset % offsetMinAlign(*MI)) == 0);
}
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 4a96327..0bbb71f 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -83,6 +83,7 @@ public:
void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const override;
/// We require the register scavenger.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td
index 8e52da5..79963dd 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP8.td
@@ -377,7 +377,7 @@ def P8Itineraries : ProcessorItineraries<
InstrStage<1, [P8_FPU1, P8_FPU2]>],
[7, 1, 1]>,
InstrItinData<IIC_VecPerm , [InstrStage<1, [P8_DU1, P8_DU2], 0>,
- InstrStage<1, [P8_FPU2, P8_FPU2]>],
+ InstrStage<1, [P8_FPU1, P8_FPU2]>],
[3, 1, 1]>
]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
index a9c1bd7..a01995a 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/contrib/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -260,8 +260,8 @@ let SchedModel = P9Model in {
// ***************** Defining Itinerary Class Resources *****************
- def : ItinRW<[P9_DFU_76C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntSimple,
- IIC_IntGeneral]>;
+ def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C],
+ [IIC_IntSimple, IIC_IntGeneral]>;
def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C],
[IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>;
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index e8a87e7..ccf0f80 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -220,8 +220,8 @@ bool PPCSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
}
-unsigned char PPCSubtarget::classifyGlobalReference(
- const GlobalValue *GV) const {
+unsigned char
+PPCSubtarget::classifyGlobalReference(const GlobalValue *GV) const {
// Note that currently we don't generate non-pic references.
// If a caller wants that, this will have to be updated.
@@ -229,23 +229,9 @@ unsigned char PPCSubtarget::classifyGlobalReference(
if (TM.getCodeModel() == CodeModel::Large)
return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
- unsigned char flags = PPCII::MO_PIC_FLAG;
-
- // Only if the relocation mode is PIC do we have to worry about
- // interposition. In all other cases we can use a slightly looser standard to
- // decide how to access the symbol.
- if (TM.getRelocationModel() == Reloc::PIC_) {
- // If it's local, or it's non-default, it can't be interposed.
- if (!GV->hasLocalLinkage() &&
- GV->hasDefaultVisibility()) {
- flags |= PPCII::MO_NLP_FLAG;
- }
- return flags;
- }
-
- if (GV->isStrongDefinitionForLinker())
- return flags;
- return flags | PPCII::MO_NLP_FLAG;
+ if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return PPCII::MO_PIC_FLAG;
+ return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG;
}
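The rewrite replaces the hand-rolled interposition logic with TargetMachine::shouldAssumeDSOLocal: a global known to resolve inside this module can be referenced directly with a PIC-relative access, while anything potentially interposable keeps the non-lazy-pointer (NLP) indirection. A hedged sketch of the resulting decision table (the flag bit values here are illustrative, not the real PPCII encodings):

#include <cstdint>

constexpr uint8_t MO_PIC_FLAG = 1 << 0; // illustrative values only
constexpr uint8_t MO_NLP_FLAG = 1 << 1;

uint8_t classifyGlobalReference(bool LargeCodeModel, bool DSOLocal) {
  if (LargeCodeModel)
    return MO_PIC_FLAG | MO_NLP_FLAG; // large code model: always indirect
  if (DSOLocal)
    return MO_PIC_FLAG;               // resolves locally: direct PIC access
  return MO_PIC_FLAG | MO_NLP_FLAG;   // may be interposed: indirect via GOT
}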
bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 7fd9079..90d11f4 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -272,6 +272,13 @@ public:
return 16;
}
+
+ // The Darwin ABI has a 224-byte red zone. The 32-bit SVR4 ABI (non-Darwin)
+ // has no red zone, and the 64-bit SVR4 ABI has a 288-byte red zone.
+ unsigned getRedZoneSize() const {
+ return isDarwinABI() ? 224 : (isPPC64() ? 288 : 0);
+ }
+
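A quick standalone restatement of the three cases the new accessor encodes, with assertions matching the comment above (a sketch, not the LLVM API):

#include <cassert>

unsigned redZoneSize(bool IsDarwin, bool IsPPC64) {
  return IsDarwin ? 224 : (IsPPC64 ? 288 : 0);
}

int main() {
  assert(redZoneSize(true, false) == 224); // Darwin ABI
  assert(redZoneSize(false, true) == 288); // 64-bit SVR4 ABI
  assert(redZoneSize(false, false) == 0);  // 32-bit SVR4 ABI: no red zone
}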
bool hasHTM() const { return HasHTM; }
bool hasFusion() const { return HasFusion; }
bool hasFloat128() const { return HasFloat128; }
@@ -298,7 +305,9 @@ public:
bool isSVR4ABI() const { return !isDarwinABI(); }
bool isELFv2ABI() const;
- bool enableEarlyIfConversion() const override { return hasISEL(); }
+ /// Originally, this function returned hasISEL(). Now we always enable
+ /// early if-conversion, but the ISEL instruction may be expanded later.
+ bool enableEarlyIfConversion() const override { return true; }
// Scheduling customization.
bool enableMachineScheduler() const override;
@@ -316,6 +325,8 @@ public:
/// classifyGlobalReference - Classify a global variable reference for the
/// current subtarget according to how we should reference it.
unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+ bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; }
};
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 0c1260a..5f8085f 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -21,9 +21,9 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
#include "PPC.h"
#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -52,6 +52,7 @@ namespace {
protected:
bool processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
+ bool NeedFence = true;
bool Is64Bit = MBB.getParent()->getSubtarget<PPCSubtarget>().isPPC64();
for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
@@ -62,6 +63,16 @@ protected:
MI.getOpcode() != PPC::ADDItlsldLADDR &&
MI.getOpcode() != PPC::ADDItlsgdLADDR32 &&
MI.getOpcode() != PPC::ADDItlsldLADDR32) {
+
+ // Although we create ADJCALLSTACKDOWN and ADJCALLSTACKUP as
+ // scheduling fences, we skip creating new ones if we are already
+ // inside an existing ADJCALLSTACKDOWN/UP pair, since nesting them
+ // causes a verification error with -verify-machineinstrs.
+ if (MI.getOpcode() == PPC::ADJCALLSTACKDOWN)
+ NeedFence = false;
+ else if (MI.getOpcode() == PPC::ADJCALLSTACKUP)
+ NeedFence = true;
+
++I;
continue;
}
@@ -96,10 +107,15 @@ protected:
break;
}
- // Don't really need to save data to the stack - the clobbered
+ // We create ADJCALLSTACKDOWN and ADJCALLSTACKUP around _tls_get_addr
+ // as scheduling fences so that it cannot be scheduled before mflr in
+ // the prologue, which would clobber the address held in LR (PR25839).
+ // We don't really need to save data to the stack - the clobbered
// registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr)
// gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR).
- BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0);
+ if (NeedFence)
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0)
+ .addImm(0);
// Expand into two ops built prior to the existing instruction.
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3)
@@ -115,7 +131,8 @@ protected:
.addReg(GPR3));
Call->addOperand(MI.getOperand(3));
- BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
+ if (NeedFence)
+ BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKUP)).addImm(0).addImm(0);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::COPY), OutReg)
.addReg(GPR3);
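Taken together, NeedFence is a one-bit state machine over the block: an ADJCALLSTACKDOWN turns fencing off and an ADJCALLSTACKUP turns it back on, so a TLS pseudo that already sits inside a call-frame region does not get a second, nested pair. A hedged standalone model of that bookkeeping:

#include <vector>

enum Op { CALLSTACK_DOWN, CALLSTACK_UP, TLS_ADDR, OTHER };

// For each TLS_ADDR in the block, record whether its expansion should
// emit its own ADJCALLSTACKDOWN/UP pair.
std::vector<bool> fenceDecisions(const std::vector<Op> &Block) {
  std::vector<bool> Decisions;
  bool NeedFence = true; // true = not currently inside a call-frame region
  for (Op O : Block) {
    if (O == CALLSTACK_DOWN)
      NeedFence = false;
    else if (O == CALLSTACK_UP)
      NeedFence = true;
    else if (O == TLS_ADDR)
      Decisions.push_back(NeedFence);
  }
  return Decisions;
}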
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
index 7c53a56..17345b6 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -61,8 +61,8 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 91b1d24..fe092cc 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -12,20 +12,32 @@
//===----------------------------------------------------------------------===//
#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPC.h"
+#include "PPCSubtarget.h"
#include "PPCTargetObjectFile.h"
#include "PPCTargetTransformInfo.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <memory>
+#include <string>
+
using namespace llvm;
static cl::
@@ -74,12 +86,14 @@ EnableMachineCombinerPass("ppc-machine-combiner",
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
- RegisterTargetMachine<PPC32TargetMachine> A(getThePPC32Target());
- RegisterTargetMachine<PPC64TargetMachine> B(getThePPC64Target());
- RegisterTargetMachine<PPC64TargetMachine> C(getThePPC64LETarget());
+ RegisterTargetMachine<PPCTargetMachine> A(getThePPC32Target());
+ RegisterTargetMachine<PPCTargetMachine> B(getThePPC64Target());
+ RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializePPCBoolRetToIntPass(PR);
+ initializePPCExpandISELPass(PR);
+ initializePPCTLSDynamicCallPass(PR);
}
/// Return the datalayout string of a subtarget.
@@ -149,9 +163,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
// If it isn't a Mach-O file then it's going to be a linux ELF
// object file.
if (TT.isOSDarwin())
- return make_unique<TargetLoweringObjectFileMachO>();
+ return llvm::make_unique<TargetLoweringObjectFileMachO>();
- return make_unique<PPC64LinuxTargetObjectFile>();
+ return llvm::make_unique<PPC64LinuxTargetObjectFile>();
}
static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
@@ -164,32 +178,34 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
assert(Options.MCOptions.getABIName().empty() &&
"Unknown target-abi option!");
- if (!TT.isMacOSX()) {
- switch (TT.getArch()) {
- case Triple::ppc64le:
- return PPCTargetMachine::PPC_ABI_ELFv2;
- case Triple::ppc64:
- return PPCTargetMachine::PPC_ABI_ELFv1;
- default:
- // Fallthrough.
- ;
- }
+ if (TT.isMacOSX())
+ return PPCTargetMachine::PPC_ABI_UNKNOWN;
+
+ switch (TT.getArch()) {
+ case Triple::ppc64le:
+ return PPCTargetMachine::PPC_ABI_ELFv2;
+ case Triple::ppc64:
+ return PPCTargetMachine::PPC_ABI_ELFv1;
+ default:
+ return PPCTargetMachine::PPC_ABI_UNKNOWN;
}
- return PPCTargetMachine::PPC_ABI_UNKNOWN;
}
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
Optional<Reloc::Model> RM) {
- if (!RM.hasValue()) {
- if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
- if (!TT.isOSBinFormatMachO() && !TT.isMacOSX())
- return Reloc::PIC_;
- }
- if (TT.isOSDarwin())
- return Reloc::DynamicNoPIC;
- return Reloc::Static;
- }
- return *RM;
+ if (RM.hasValue())
+ return *RM;
+
+ // Darwin defaults to dynamic-no-pic.
+ if (TT.isOSDarwin())
+ return Reloc::DynamicNoPIC;
+
+ // Non-Darwin 64-bit platforms are PIC by default.
+ if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le)
+ return Reloc::PIC_;
+
+ // 32-bit is static by default.
+ return Reloc::Static;
}
// The FeatureString here is a little subtle. We are modifying the feature
@@ -205,33 +221,11 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
computeFSAdditions(FS, OL, TT), Options,
getEffectiveRelocModel(TT, RM), CM, OL),
TLOF(createTLOF(getTargetTriple())),
- TargetABI(computeTargetABI(TT, Options)),
- Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
-
+ TargetABI(computeTargetABI(TT, Options)) {
initAsmInfo();
}
-PPCTargetMachine::~PPCTargetMachine() {}
-
-void PPC32TargetMachine::anchor() { }
-
-PPC32TargetMachine::PPC32TargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
-
-void PPC64TargetMachine::anchor() { }
-
-PPC64TargetMachine::PPC64TargetMachine(const Target &T, const Triple &TT,
- StringRef CPU, StringRef FS,
- const TargetOptions &Options,
- Optional<Reloc::Model> RM,
- CodeModel::Model CM,
- CodeGenOpt::Level OL)
- : PPCTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+PPCTargetMachine::~PPCTargetMachine() = default;
const PPCSubtarget *
PPCTargetMachine::getSubtargetImpl(const Function &F) const {
@@ -281,10 +275,11 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const {
//===----------------------------------------------------------------------===//
namespace {
+
/// PPC Code Generator Pass Configuration Options.
class PPCPassConfig : public TargetPassConfig {
public:
- PPCPassConfig(PPCTargetMachine *TM, PassManagerBase &PM)
+ PPCPassConfig(PPCTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
PPCTargetMachine &getPPCTargetMachine() const {
@@ -300,16 +295,17 @@ public:
void addPreSched2() override;
void addPreEmitPass() override;
};
-} // namespace
+
+} // end anonymous namespace
TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new PPCPassConfig(this, PM);
+ return new PPCPassConfig(*this, PM);
}
void PPCPassConfig::addIRPasses() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createPPCBoolRetToIntPass());
- addPass(createAtomicExpandPass(&getPPCTargetMachine()));
+ addPass(createAtomicExpandPass());
// For the BG/Q (or if explicitly requested), add explicit data prefetch
// intrinsics.
@@ -341,7 +337,7 @@ bool PPCPassConfig::addPreISel() {
addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
- addPass(createPPCCTRLoops(getPPCTargetMachine()));
+ addPass(createPPCCTRLoops());
return false;
}
@@ -357,7 +353,7 @@ bool PPCPassConfig::addILPOpts() {
bool PPCPassConfig::addInstSelector() {
// Install an instruction selector.
- addPass(createPPCISelDag(getPPCTargetMachine()));
+ addPass(createPPCISelDag(getPPCTargetMachine(), getOptLevel()));
#ifndef NDEBUG
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
@@ -393,7 +389,7 @@ void PPCPassConfig::addPreRegAlloc() {
// FIXME: We probably don't need to run these for -fPIE.
if (getPPCTargetMachine().isPositionIndependent()) {
// FIXME: LiveVariables should not be necessary here!
- // PPCTLSDYnamicCallPass uses LiveIntervals which previously dependet on
+ // PPCTLSDynamicCallPass uses LiveIntervals, which previously depended on
// LiveVariables. This (unnecessary) dependency has been removed now,
// however a stage-2 clang build fails without LiveVariables computed here.
addPass(&LiveVariablesID, false);
@@ -416,6 +412,8 @@ void PPCPassConfig::addPreSched2() {
}
void PPCPassConfig::addPreEmitPass() {
+ addPass(createPPCExpandISELPass());
+
if (getOptLevel() != CodeGenOpt::None)
addPass(createPPCEarlyReturnPass(), false);
// Must run branch selection immediately preceding the asm printer.
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
index 59b4f1e..be70550 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.h
@@ -23,13 +23,12 @@ namespace llvm {
/// Common code between 32-bit and 64-bit PowerPC targets.
///
-class PPCTargetMachine : public LLVMTargetMachine {
+class PPCTargetMachine final : public LLVMTargetMachine {
public:
enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 };
private:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
PPCABI TargetABI;
- PPCSubtarget Subtarget;
mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
@@ -42,6 +41,9 @@ public:
~PPCTargetMachine() override;
const PPCSubtarget *getSubtargetImpl(const Function &F) const override;
+ // The no-argument getSubtargetImpl, while it exists on some targets, is
+ // deprecated and should not be used.
+ const PPCSubtarget *getSubtargetImpl() const = delete;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
@@ -56,30 +58,11 @@ public:
const Triple &TT = getTargetTriple();
return (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le);
};
-};
-
-/// PowerPC 32-bit target machine.
-///
-class PPC32TargetMachine : public PPCTargetMachine {
- virtual void anchor();
-public:
- PPC32TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
-};
-/// PowerPC 64-bit target machine.
-///
-class PPC64TargetMachine : public PPCTargetMachine {
- virtual void anchor();
-public:
- PPC64TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
- StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
-
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
index dbe7617..310fea9 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -1,4 +1,4 @@
-//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===//
+//===- PPCTargetStreamer.h - PPC Target Streamer ----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,18 +10,26 @@
#ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
#define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCStreamer.h"
namespace llvm {
+
+class MCExpr;
+class MCSymbol;
+class MCSymbolELF;
+
class PPCTargetStreamer : public MCTargetStreamer {
public:
PPCTargetStreamer(MCStreamer &S);
~PPCTargetStreamer() override;
+
virtual void emitTCEntry(const MCSymbol &S) = 0;
virtual void emitMachine(StringRef CPU) = 0;
virtual void emitAbiVersion(int AbiVersion) = 0;
virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
};
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f94d1ea..6110706 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -189,7 +189,7 @@ int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
return PPCTTIImpl::getIntImmCost(Imm, Ty);
}
-void PPCTTIImpl::getUnrollingPreferences(Loop *L,
+void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
if (ST->getDarwinDirective() == PPC::DIR_A2) {
// The A2 is in-order with a deep pipeline, and concatenation unrolling
@@ -201,7 +201,7 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L,
UP.AllowExpensiveTripCount = true;
}
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
}
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
@@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
return LoopHasReductions;
}
+bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+ MaxLoadSize = 8;
+ return true;
+}
+
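Returning true with MaxLoadSize = 8 lets the generic memcmp expansion turn small fixed-size comparisons into a handful of 8-byte loads instead of a libcall. As a rough C++ illustration of what an equality-only memcmp(a, b, 16) expands to (real codegen works on IR and also handles the ordering case; this sketch only shows the load-and-compare shape):

#include <cstdint>
#include <cstring>

// memcmp(A, B, 16) == 0 lowered to two 8-byte loads per operand.
bool equal16(const void *A, const void *B) {
  uint64_t A0, A1, B0, B1;
  std::memcpy(&A0, A, 8); // memcpy models an unaligned 8-byte load (ld)
  std::memcpy(&A1, static_cast<const char *>(A) + 8, 8);
  std::memcpy(&B0, B, 8);
  std::memcpy(&B1, static_cast<const char *>(B) + 8, 8);
  return ((A0 ^ B0) | (A1 ^ B1)) == 0;
}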
bool PPCTTIImpl::enableInterleavedAccessVectorization() {
return true;
}
@@ -225,7 +230,7 @@ unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
return ST->hasVSX() ? 64 : 32;
}
-unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasQPX()) return 256;
if (ST->hasAltivec()) return 128;
@@ -239,9 +244,18 @@ unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
}
unsigned PPCTTIImpl::getCacheLineSize() {
- // This is currently only used for the data prefetch pass which is only
- // enabled for BG/Q by default.
- return CacheLineSize;
+ // Check first if the user specified a custom line size.
+ if (CacheLineSize.getNumOccurrences() > 0)
+ return CacheLineSize;
+
+ // On P7, P8 or P9 we have a cache line size of 128.
+ unsigned Directive = ST->getDarwinDirective();
+ if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
+ Directive == PPC::DIR_PWR9)
+ return 128;
+
+ // On other processors return a default of 64 bytes.
+ return 64;
}
unsigned PPCTTIImpl::getPrefetchDistance() {
@@ -302,14 +316,16 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first;
}
-int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -352,7 +368,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
}
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -401,6 +417,10 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
if (IsVSXType || (ST->hasVSX() && IsAltivecType))
return Cost;
+ // Newer PPC cores support unaligned memory accesses.
+ if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
+ return Cost;
+
// PPC in general does not support unaligned loads and stores. They'll need
// to be decomposed based on the alignment factor.
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 30ee281..99ca639 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -52,7 +52,8 @@ public:
Type *Ty);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
/// @}
@@ -60,9 +61,10 @@ public:
/// @{
bool enableAggressiveInterleaving(bool LoopHasReductions);
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
unsigned getMaxInterleaveFactor(unsigned VF);
@@ -74,11 +76,13 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor,
ArrayRef<unsigned> Indices,
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
index 3b5d8f0..93fe323 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#include "PPC.h"
#include "MCTargetDesc/PPCPredicates.h"
+#include "PPC.h"
#include "PPCHazardRecognizers.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
@@ -112,7 +112,7 @@ protected:
TII->get(TargetOpcode::SUBREG_TO_REG), NewVReg)
.addImm(1) // add 1, not 0, because there is no implicit clearing
// of the high bits.
- .addOperand(SrcMO)
+ .add(SrcMO)
.addImm(PPC::sub_64);
// The source of the original copy is now the new virtual register.
@@ -132,7 +132,7 @@ protected:
unsigned NewVReg = MRI.createVirtualRegister(DstRC);
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
NewVReg)
- .addOperand(SrcMO);
+ .add(SrcMO);
// Transform the original copy into a subregister extraction copy.
SrcMO.setReg(NewVReg);
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index f6d20ce..a57484e 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -12,10 +12,10 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
index 8197285..7d34efd 100644
--- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -42,9 +42,9 @@
//
//===---------------------------------------------------------------------===//
-#include "PPCInstrInfo.h"
#include "PPC.h"
#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
@@ -195,8 +195,10 @@ public:
return false;
// If we don't have VSX on the subtarget, don't do anything.
+ // Also, on Power9 the load and store ops preserve element order and so
+ // the swaps are not required.
const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
- if (!STI.hasVSX())
+ if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps())
return false;
bool Changed = false;
@@ -522,7 +524,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() {
if (RelevantFunction) {
DEBUG(dbgs() << "Swap vector when first built\n\n");
- dumpSwapVector();
+ DEBUG(dumpSwapVector());
}
return RelevantFunction;
@@ -731,7 +733,7 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
}
DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
- dumpSwapVector();
+ DEBUG(dumpSwapVector());
}
// Walk the swap vector entries looking for swaps fed by permuting loads
@@ -936,9 +938,9 @@ bool PPCVSXSwapRemoval::removeSwaps() {
Changed = true;
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
MachineBasicBlock *MBB = MI->getParent();
- BuildMI(*MBB, MI, MI->getDebugLoc(),
- TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addOperand(MI->getOperand(1));
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ MI->getOperand(0).getReg())
+ .add(MI->getOperand(1));
DEBUG(dbgs() << format("Replaced %d with copy: ",
SwapVector[EntryIdx].VSEId));
@@ -951,77 +953,78 @@ bool PPCVSXSwapRemoval::removeSwaps() {
return Changed;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// For debug purposes, dump the contents of the swap vector.
-void PPCVSXSwapRemoval::dumpSwapVector() {
+LLVM_DUMP_METHOD void PPCVSXSwapRemoval::dumpSwapVector() {
for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
int ID = SwapVector[EntryIdx].VSEId;
- DEBUG(dbgs() << format("%6d", ID));
- DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID)));
- DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber()));
- DEBUG(dbgs() << format(" %14s ",
- TII->getName(MI->getOpcode()).str().c_str()));
+ dbgs() << format("%6d", ID);
+ dbgs() << format("%6d", EC->getLeaderValue(ID));
+ dbgs() << format(" BB#%3d", MI->getParent()->getNumber());
+ dbgs() << format(" %14s ", TII->getName(MI->getOpcode()).str().c_str());
if (SwapVector[EntryIdx].IsLoad)
- DEBUG(dbgs() << "load ");
+ dbgs() << "load ";
if (SwapVector[EntryIdx].IsStore)
- DEBUG(dbgs() << "store ");
+ dbgs() << "store ";
if (SwapVector[EntryIdx].IsSwap)
- DEBUG(dbgs() << "swap ");
+ dbgs() << "swap ";
if (SwapVector[EntryIdx].MentionsPhysVR)
- DEBUG(dbgs() << "physreg ");
+ dbgs() << "physreg ";
if (SwapVector[EntryIdx].MentionsPartialVR)
- DEBUG(dbgs() << "partialreg ");
+ dbgs() << "partialreg ";
if (SwapVector[EntryIdx].IsSwappable) {
- DEBUG(dbgs() << "swappable ");
+ dbgs() << "swappable ";
switch(SwapVector[EntryIdx].SpecialHandling) {
default:
- DEBUG(dbgs() << "special:**unknown**");
+ dbgs() << "special:**unknown**";
break;
case SH_NONE:
break;
case SH_EXTRACT:
- DEBUG(dbgs() << "special:extract ");
+ dbgs() << "special:extract ";
break;
case SH_INSERT:
- DEBUG(dbgs() << "special:insert ");
+ dbgs() << "special:insert ";
break;
case SH_NOSWAP_LD:
- DEBUG(dbgs() << "special:load ");
+ dbgs() << "special:load ";
break;
case SH_NOSWAP_ST:
- DEBUG(dbgs() << "special:store ");
+ dbgs() << "special:store ";
break;
case SH_SPLAT:
- DEBUG(dbgs() << "special:splat ");
+ dbgs() << "special:splat ";
break;
case SH_XXPERMDI:
- DEBUG(dbgs() << "special:xxpermdi ");
+ dbgs() << "special:xxpermdi ";
break;
case SH_COPYWIDEN:
- DEBUG(dbgs() << "special:copywiden ");
+ dbgs() << "special:copywiden ";
break;
}
}
if (SwapVector[EntryIdx].WebRejected)
- DEBUG(dbgs() << "rejected ");
+ dbgs() << "rejected ";
if (SwapVector[EntryIdx].WillRemove)
- DEBUG(dbgs() << "remove ");
+ dbgs() << "remove ";
- DEBUG(dbgs() << "\n");
+ dbgs() << "\n";
// For no-asserts builds.
(void)MI;
(void)ID;
}
- DEBUG(dbgs() << "\n");
+ dbgs() << "\n";
}
+#endif
} // end anonymous namespace
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index f8ef142..be83efc 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -12,10 +12,10 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -32,8 +32,9 @@ public:
: MCAsmBackend(), OSABI(OSABI), Is64Bit(Is64Bit) {}
~RISCVAsmBackend() override {}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -69,9 +70,10 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
-void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
return;
}
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
index b164df8..d622911 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
void RISCVMCAsmInfo::anchor() {}
RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) {
- PointerSize = CalleeSaveStackSlotSize = TT.isArch64Bit() ? 8 : 4;
+ CodePointerSize = CalleeSaveStackSlotSize = TT.isArch64Bit() ? 8 : 4;
CommentString = "#";
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index b2ed137..9309d49 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -13,13 +13,13 @@
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 4fc69a7..41be0a2 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -44,13 +44,12 @@ static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) {
static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT) {
- MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
- return MAI;
+ return new RISCVMCAsmInfo(TT);
}
extern "C" void LLVMInitializeRISCVTargetMC() {
for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
- RegisterMCAsmInfoFn X(*T, createRISCVMCAsmInfo);
+ TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
TargetRegistry::RegisterMCInstrInfo(*T, createRISCVMCInstrInfo);
TargetRegistry::RegisterMCRegInfo(*T, createRISCVMCRegisterInfo);
TargetRegistry::RegisterMCAsmBackend(*T, createRISCVAsmBackend);
diff --git a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index ddc3bf3..7c98b1c 100644
--- a/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H
#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVMCTARGETDESC_H
+#include "llvm/Config/config.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/DataTypes.h"
-#include "llvm/Config/config.h"
namespace llvm {
class MCAsmBackend;
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 1e9bc3b..3fab712 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/contrib/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -44,8 +44,9 @@ class RISCVInst<dag outs, dag ins, string asmstr, list<dag> pattern>
// Pseudo instructions
class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
- : RISCVInst<outs, ins, asmstr, pattern> {
+ : RISCVInst<outs, ins, "", pattern> {
let isPseudo = 1;
+ let isCodeGenOnly = 1;
}
class FR<bits<7> funct7, bits<3> funct3, bits<7> opcode, dag outs, dag ins,
diff --git a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index afbbe00..744d7b8 100644
--- a/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -13,10 +13,10 @@
#include "RISCVTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
@@ -32,7 +32,7 @@ static std::string computeDataLayout(const Triple &TT) {
return "e-m:e-i64:64-n32:64-S128";
} else {
assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
- return "e-m:e-i64:64-n32-S128";
+ return "e-m:e-p:32:32-i64:64-n32-S128";
}
}
@@ -51,8 +51,10 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(TT, RM), CM, OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()) {}
+ TLOF(make_unique<TargetLoweringObjectFileELF>()) {
+ initAsmInfo();
+}
TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new TargetPassConfig(this, PM);
+ return new TargetPassConfig(*this, PM);
}
diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index e775aa6..087c037 100644
--- a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -10,31 +10,48 @@
#include "MCTargetDesc/SparcMCExpr.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
using namespace llvm;
// The generated AsmMatcher SparcGenAsmMatcher uses "Sparc" as the target
// namespace. But SPARC backend uses "SP" as its namespace.
namespace llvm {
- namespace Sparc {
+namespace Sparc {
+
using namespace SP;
- }
-}
+
+} // end namespace Sparc
+} // end namespace llvm
namespace {
+
class SparcOperand;
-class SparcAsmParser : public MCTargetAsmParser {
+class SparcAsmParser : public MCTargetAsmParser {
MCAsmParser &Parser;
/// @name Auto-generated Match Functions
@@ -95,9 +112,10 @@ public:
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
}
-
};
+} // end anonymous namespace
+
static const MCPhysReg IntRegs[32] = {
Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
@@ -166,6 +184,8 @@ public:
Sparc::C16_C17, Sparc::C18_C19, Sparc::C20_C21, Sparc::C22_C23,
Sparc::C24_C25, Sparc::C26_C27, Sparc::C28_C29, Sparc::C30_C31};
+namespace {
+
/// SparcOperand - Instances of this class represent a parsed Sparc machine
/// instruction.
class SparcOperand : public MCParsedAsmOperand {
@@ -219,6 +239,7 @@ private:
struct ImmOp Imm;
struct MemOp Mem;
};
+
public:
SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -464,7 +485,7 @@ public:
}
};
-} // end namespace
+} // end anonymous namespace
bool SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
SmallVectorImpl<MCInst> &Instructions) {
@@ -591,9 +612,8 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
llvm_unreachable("Implement any new match types added!");
}
-bool SparcAsmParser::
-ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
-{
+bool SparcAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
const AsmToken &Tok = Parser.getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
@@ -695,7 +715,7 @@ ParseDirective(AsmToken DirectiveID)
bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for (;;) {
+ while (true) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
return true;
@@ -717,7 +737,6 @@ bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
OperandMatchResultTy
SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
-
SMLoc S, E;
unsigned BaseReg = 0;
@@ -824,7 +843,6 @@ SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
OperandMatchResultTy
SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
bool isCall) {
-
SMLoc S = Parser.getTok().getLoc();
SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
const MCExpr *EVal;
@@ -910,11 +928,9 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
OperandMatchResultTy
SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
-
// parse (,a|,pn|,pt)+
while (getLexer().is(AsmToken::Comma)) {
-
Parser.Lex(); // Eat the comma
if (!getLexer().is(AsmToken::Identifier))
@@ -929,10 +945,8 @@ SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
return MatchOperand_Success;
}
-bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
- unsigned &RegNo,
- unsigned &RegKind)
-{
+bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
+ unsigned &RegKind) {
int64_t intVal = 0;
RegNo = 0;
RegKind = SparcOperand::rk_None;
@@ -1211,8 +1225,7 @@ static bool hasGOTReference(const MCExpr *Expr) {
const SparcMCExpr *
SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
- const MCExpr *subExpr)
-{
+ const MCExpr *subExpr) {
// When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
// If the expression contains _GLOBAL_OFFSET_TABLE_, it is
// actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
@@ -1236,8 +1249,7 @@ SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
}
bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
- SMLoc &EndLoc)
-{
+ SMLoc &EndLoc) {
AsmToken Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
return false;
diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
index 6f9cc31..df819cc 100644
--- a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -96,7 +96,7 @@ namespace {
/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
/// slots in Sparc MachineFunctions
///
-FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+FunctionPass *llvm::createSparcDelaySlotFillerPass() {
return new Filler;
}
diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index da7e0b7..8e298e8 100644
--- a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -14,11 +14,11 @@
#include "Sparc.h"
#include "SparcRegisterInfo.h"
#include "SparcSubtarget.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp b/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp
index 0acc287..ca6a0dc 100755
--- a/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp
+++ b/contrib/llvm/lib/Target/Sparc/LeonPasses.cpp
@@ -21,9 +21,6 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-LEONMachineFunctionPass::LEONMachineFunctionPass(TargetMachine &tm, char &ID)
- : MachineFunctionPass(ID) {}
-
LEONMachineFunctionPass::LEONMachineFunctionPass(char &ID)
: MachineFunctionPass(ID) {}
@@ -72,8 +69,7 @@ int LEONMachineFunctionPass::getUnusedFPRegister(MachineRegisterInfo &MRI) {
//
char InsertNOPLoad::ID = 0;
-InsertNOPLoad::InsertNOPLoad(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+InsertNOPLoad::InsertNOPLoad() : LEONMachineFunctionPass(ID) {}
bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -114,7 +110,7 @@ bool InsertNOPLoad::runOnMachineFunction(MachineFunction &MF) {
//
char FixFSMULD::ID = 0;
-FixFSMULD::FixFSMULD(TargetMachine &tm) : LEONMachineFunctionPass(tm, ID) {}
+FixFSMULD::FixFSMULD() : LEONMachineFunctionPass(ID) {}
bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -203,8 +199,7 @@ bool FixFSMULD::runOnMachineFunction(MachineFunction &MF) {
//
char ReplaceFMULS::ID = 0;
-ReplaceFMULS::ReplaceFMULS(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+ReplaceFMULS::ReplaceFMULS() : LEONMachineFunctionPass(ID) {}
bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -287,8 +282,7 @@ bool ReplaceFMULS::runOnMachineFunction(MachineFunction &MF) {
char DetectRoundChange::ID = 0;
-DetectRoundChange::DetectRoundChange(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+DetectRoundChange::DetectRoundChange() : LEONMachineFunctionPass(ID) {}
bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
@@ -338,8 +332,7 @@ bool DetectRoundChange::runOnMachineFunction(MachineFunction &MF) {
//
char FixAllFDIVSQRT::ID = 0;
-FixAllFDIVSQRT::FixAllFDIVSQRT(TargetMachine &tm)
- : LEONMachineFunctionPass(tm, ID) {}
+FixAllFDIVSQRT::FixAllFDIVSQRT() : LEONMachineFunctionPass(ID) {}
bool FixAllFDIVSQRT::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<SparcSubtarget>();
diff --git a/contrib/llvm/lib/Target/Sparc/LeonPasses.h b/contrib/llvm/lib/Target/Sparc/LeonPasses.h
index 2158cb6..99cdfc4 100755
--- a/contrib/llvm/lib/Target/Sparc/LeonPasses.h
+++ b/contrib/llvm/lib/Target/Sparc/LeonPasses.h
@@ -32,7 +32,6 @@ protected:
std::vector<int> UsedRegisters;
protected:
- LEONMachineFunctionPass(TargetMachine &tm, char &ID);
LEONMachineFunctionPass(char &ID);
int GetRegIndexForOperand(MachineInstr &MI, int OperandIndex);
@@ -48,7 +47,7 @@ class LLVM_LIBRARY_VISIBILITY InsertNOPLoad : public LEONMachineFunctionPass {
public:
static char ID;
- InsertNOPLoad(TargetMachine &tm);
+ InsertNOPLoad();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -62,7 +61,7 @@ class LLVM_LIBRARY_VISIBILITY FixFSMULD : public LEONMachineFunctionPass {
public:
static char ID;
- FixFSMULD(TargetMachine &tm);
+ FixFSMULD();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -74,7 +73,7 @@ class LLVM_LIBRARY_VISIBILITY ReplaceFMULS : public LEONMachineFunctionPass {
public:
static char ID;
- ReplaceFMULS(TargetMachine &tm);
+ ReplaceFMULS();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -89,7 +88,7 @@ class LLVM_LIBRARY_VISIBILITY DetectRoundChange
public:
static char ID;
- DetectRoundChange(TargetMachine &tm);
+ DetectRoundChange();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -102,7 +101,7 @@ class LLVM_LIBRARY_VISIBILITY FixAllFDIVSQRT : public LEONMachineFunctionPass {
public:
static char ID;
- FixAllFDIVSQRT(TargetMachine &tm);
+ FixAllFDIVSQRT();
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 6106a6c..0a72a44 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCAsmBackend.h"
#include "MCTargetDesc/SparcFixupKinds.h"
#include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -61,14 +61,6 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case Sparc::fixup_sparc_lo10:
return Value & 0x3ff;
- case Sparc::fixup_sparc_tls_ldo_hix22:
- case Sparc::fixup_sparc_tls_le_hix22:
- return (~Value >> 10) & 0x3fffff;
-
- case Sparc::fixup_sparc_tls_ldo_lox10:
- case Sparc::fixup_sparc_tls_le_lox10:
- return (~(~Value & 0x3ff)) & 0x1fff;
-
case Sparc::fixup_sparc_h44:
return (Value >> 22) & 0x3fffff;
@@ -84,6 +76,13 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case Sparc::fixup_sparc_hm:
return (Value >> 32) & 0x3ff;
+ case Sparc::fixup_sparc_tls_ldo_hix22:
+ case Sparc::fixup_sparc_tls_le_hix22:
+ case Sparc::fixup_sparc_tls_ldo_lox10:
+ case Sparc::fixup_sparc_tls_le_lox10:
+ assert(Value == 0 && "Sparc TLS relocs expect zero Value");
+ return 0;
+
case Sparc::fixup_sparc_tls_gd_add:
case Sparc::fixup_sparc_tls_gd_call:
case Sparc::fixup_sparc_tls_ldm_add:
@@ -203,15 +202,15 @@ namespace {
return InfosBE[Kind - FirstTargetFixupKind];
}
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
switch ((Sparc::Fixups)Fixup.getKind()) {
- default: break;
+ default:
+ return false;
case Sparc::fixup_sparc_wplt30:
if (Target.getSymA()->getSymbol().isTemporary())
- return;
+ return false;
+ LLVM_FALLTHROUGH;
case Sparc::fixup_sparc_tls_gd_hi22:
case Sparc::fixup_sparc_tls_gd_lo10:
case Sparc::fixup_sparc_tls_gd_add:
@@ -229,7 +228,8 @@ namespace {
case Sparc::fixup_sparc_tls_ie_ldx:
case Sparc::fixup_sparc_tls_ie_add:
case Sparc::fixup_sparc_tls_le_hix22:
- case Sparc::fixup_sparc_tls_le_lox10: IsResolved = false; break;
+ case Sparc::fixup_sparc_tls_le_lox10:
+ return true;
}
}
@@ -273,8 +273,9 @@ namespace {
ELFSparcAsmBackend(const Target &T, Triple::OSType OSType) :
SparcAsmBackend(T), OSType(OSType) { }
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override {
Value = adjustFixupValue(Fixup.getKind(), Value);
if (!Value) return; // Doesn't change encoding.
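
The two hunks above are linked: the TLS hi/lo fixups no longer have values folded in by adjustFixupValue, because shouldForceRelocation now always forces them out to relocations, while the ordinary %hi/%lo split keeps its bit arithmetic. A minimal standalone sketch of that split, assuming the usual sethi/or pair (not code from the patch):

#include <cassert>
#include <cstdint>

// hi22 feeds sethi with bits 31..10; lo10 feeds or with bits 9..0.
static uint32_t hi22(uint32_t Value) { return (Value >> 10) & 0x3fffff; }
static uint32_t lo10(uint32_t Value) { return Value & 0x3ff; }

int main() {
  uint32_t Addr = 0x12345678;
  // "sethi %hi(Addr), %g1; or %g1, %lo(Addr), %g1" recombines losslessly.
  assert(((hi22(Addr) << 10) | lo10(Addr)) == Addr);
}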
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 280c6d7..50e8825 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//===- SparcMCAsmInfo.cpp - Sparc asm properties --------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -14,7 +14,10 @@
#include "SparcMCAsmInfo.h"
#include "SparcMCExpr.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetOptions.h"
using namespace llvm;
@@ -25,7 +28,7 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Triple &TheTriple) {
IsLittleEndian = (TheTriple.getArch() == Triple::sparcel);
if (isV9) {
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
}
Data16bitsDirective = "\t.half\t";
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index ad44122..5e8d0cb 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -1,4 +1,4 @@
-//===-- SparcMCAsmInfo.h - Sparc asm properties ----------------*- C++ -*--===//
+//===- SparcMCAsmInfo.h - Sparc asm properties -----------------*- C++ -*--===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,7 @@
#include "llvm/MC/MCAsmInfoELF.h"
namespace llvm {
+
class Triple;
class SparcELFMCAsmInfo : public MCAsmInfoELF {
@@ -24,6 +25,7 @@ class SparcELFMCAsmInfo : public MCAsmInfoELF {
public:
explicit SparcELFMCAsmInfo(const Triple &TheTriple);
+
const MCExpr*
getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
MCStreamer &Streamer) const override;
@@ -33,6 +35,6 @@ public:
};
-} // namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 86341c6..684f669 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,20 +11,29 @@
//
//===----------------------------------------------------------------------===//
-#include "SparcMCExpr.h"
#include "MCTargetDesc/SparcFixupKinds.h"
+#include "SparcMCExpr.h"
#include "SparcMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -33,17 +42,17 @@ using namespace llvm;
STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
+
class SparcMCCodeEmitter : public MCCodeEmitter {
- SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
- void operator=(const SparcMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
MCContext &Ctx;
public:
SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
: MCII(mcii), Ctx(ctx) {}
-
- ~SparcMCCodeEmitter() override {}
+ SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
+ SparcMCCodeEmitter &operator=(const SparcMCCodeEmitter &) = delete;
+ ~SparcMCCodeEmitter() override = default;
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -79,13 +88,8 @@ private:
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
};
-} // end anonymous namespace
-MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new SparcMCCodeEmitter(MCII, Ctx);
-}
+} // end anonymous namespace
void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
@@ -121,12 +125,10 @@ void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++MCNumEmitted; // Keep track of the # of mi's emitted.
}
-
unsigned SparcMCCodeEmitter::
getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
-
if (MO.isReg())
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
@@ -209,6 +211,7 @@ getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
(MCFixupKind)Sparc::fixup_sparc_br19));
return 0;
}
+
unsigned SparcMCCodeEmitter::
getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -227,3 +230,9 @@ getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SparcGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new SparcMCCodeEmitter(MCII, Ctx);
+}
diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index e85a8cd..a77f760 100644
--- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -19,7 +19,6 @@
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Object/ELF.h"
-
using namespace llvm;
#define DEBUG_TYPE "sparcmcexpr"
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.h b/contrib/llvm/lib/Target/Sparc/Sparc.h
index 0a8272d..4135e4e 100644
--- a/contrib/llvm/lib/Target/Sparc/Sparc.h
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.h
@@ -28,7 +28,7 @@ namespace llvm {
class MachineInstr;
FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
- FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+ FunctionPass *createSparcDelaySlotFillerPass();
void LowerSparcMachineInstrToMCInst(const MachineInstr *MI,
MCInst &OutMI,
diff --git a/contrib/llvm/lib/Target/Sparc/Sparc.td b/contrib/llvm/lib/Target/Sparc/Sparc.td
index 11004c5..91cab00 100644
--- a/contrib/llvm/lib/Target/Sparc/Sparc.td
+++ b/contrib/llvm/lib/Target/Sparc/Sparc.td
@@ -20,6 +20,10 @@ include "llvm/Target/Target.td"
// SPARC Subtarget features.
//
+def FeatureSoftMulDiv
+ : SubtargetFeature<"soft-mul-div", "UseSoftMulDiv", "true",
+ "Use software emulation for integer multiply and divide">;
+
def FeatureV9
: SubtargetFeature<"v9", "IsV9", "true",
"Enable SPARC-V9 instructions">;
@@ -75,7 +79,7 @@ class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
def : Proc<"generic", []>;
-def : Proc<"v7", []>;
+def : Proc<"v7", [FeatureSoftMulDiv]>;
def : Proc<"v8", []>;
def : Proc<"supersparc", []>;
def : Proc<"sparclite", []>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index 31a128a..19fb945 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -12,9 +12,9 @@
//
//===----------------------------------------------------------------------===//
-#include "Sparc.h"
#include "InstPrinter/SparcInstPrinter.h"
#include "MCTargetDesc/SparcMCExpr.h"
+#include "Sparc.h"
#include "SparcInstrInfo.h"
#include "SparcTargetMachine.h"
#include "SparcTargetStreamer.h"
diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
index 122f830..c07cc21 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -288,11 +288,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
{
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
- if (!MRI->reg_nodbg_empty(reg))
+ if (MRI->isPhysRegUsed(reg))
return false;
for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
- if (!MRI->reg_nodbg_empty(reg))
+ if (MRI->isPhysRegUsed(reg))
return false;
return true;
@@ -305,8 +305,8 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
MachineFrameInfo &MFI = MF.getFrameInfo();
return !(MFI.hasCalls() // has calls
- || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
- || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
+ || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
+ || MRI.isPhysRegUsed(SP::O6) // %SP is used
|| hasFP(MF)); // need %FP
}
@@ -314,11 +314,10 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
// Remap %i[0-7] to %o[0-7].
for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
- if (MRI.reg_nodbg_empty(reg))
+ if (!MRI.isPhysRegUsed(reg))
continue;
unsigned mapped_reg = reg - SP::I0 + SP::O0;
- assert(MRI.reg_nodbg_empty(mapped_reg));
// Replace I register with O register.
MRI.replaceRegWith(reg, mapped_reg);
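
The remapping arithmetic relies on the %i and %o banks being contiguous runs in the register enum. A toy model of that invariant (the enum values here are illustrative assumptions, not the TableGen-generated SP::* constants):

#include <cassert>

enum Reg { I0 = 10, I1, I2, I3, I4, I5, I6, I7,
           O0 = 30, O1, O2, O3, O4, O5, O6, O7 };

int main() {
  for (unsigned reg = I0; reg <= I7; ++reg) {
    unsigned mapped_reg = reg - I0 + O0; // %iN -> %oN, same index N
    assert(mapped_reg - O0 == reg - I0);
  }
}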
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 2ac9aae..6767a59 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -30,6 +30,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -772,8 +773,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
}
}
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true),
- dl);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
@@ -1164,8 +1164,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Adjust the stack pointer to make room for the arguments.
// FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
// with more than 6 arguments.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
// Collect the set of registers to pass to the function and their values.
// This will be emitted as a sequence of CopyToReg nodes glued to the call
@@ -1690,6 +1689,19 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MULHS, MVT::i32, Expand);
setOperationAction(ISD::MUL, MVT::i32, Expand);
+ if (Subtarget->useSoftMulDiv()) {
+ // .umul works for both signed and unsigned
+ setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+ setLibcallName(RTLIB::MUL_I32, ".umul");
+
+ setOperationAction(ISD::SDIV, MVT::i32, Expand);
+ setLibcallName(RTLIB::SDIV_I32, ".div");
+
+ setOperationAction(ISD::UDIV, MVT::i32, Expand);
+ setLibcallName(RTLIB::UDIV_I32, ".udiv");
+ }
+
if (Subtarget->is64Bit()) {
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
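
The new useSoftMulDiv() block expands 32-bit multiply and divide nodes and routes them to the classic SPARC support routines; as the comment notes, .umul serves both signednesses. A hedged source-level illustration of the effect (the function and the -mcpu flag are assumptions for the example, not part of the patch):

#include <cassert>

extern "C" int mix(int a, int b, unsigned c, unsigned d) {
  // On a soft-mul-div subtarget (e.g. -mcpu=v7) these operators lower to
  // calls to .umul, .div and .udiv; V7 hardware lacks SMUL/SDIV.
  return a * b + a / b + int(c / d);
}

int main() { assert(mix(6, 3, 8, 2) == 24); }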
@@ -1875,24 +1887,24 @@ EVT SparcTargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
/// combiner.
void SparcTargetLowering::computeKnownBitsForTargetNode
(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- APInt KnownZero2, KnownOne2;
- KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+ KnownBits Known2;
+ Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case SPISD::SELECT_ICC:
case SPISD::SELECT_XCC:
case SPISD::SELECT_FCC:
- DAG.computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
- DAG.computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+ DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
+ DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
// Only known if known in both the LHS and RHS.
- KnownOne &= KnownOne2;
- KnownZero &= KnownZero2;
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
break;
}
}
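
The rewritten computeKnownBitsForTargetNode expresses the old rule in the new KnownBits vocabulary: for a select, a bit is known only when both candidate operands agree on it. A self-contained model of that intersection (the struct is a toy stand-in for llvm::KnownBits):

#include <cassert>
#include <cstdint>

struct Known { uint32_t One = 0, Zero = 0; };

// Mirrors "Known.One &= Known2.One; Known.Zero &= Known2.Zero;" above.
static Known selectMerge(Known A, Known B) {
  return {A.One & B.One, A.Zero & B.Zero};
}

int main() {
  Known Lhs{0b0011, 0b0100}; // bits 0-1 known one, bit 2 known zero
  Known Rhs{0b0001, 0b1100}; // bit 0 known one, bits 2-3 known zero
  Known M = selectMerge(Lhs, Rhs);
  assert(M.One == 0b0001 && M.Zero == 0b0100); // only agreement survives
}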
@@ -2057,7 +2069,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
SDValue Chain = DAG.getEntryNode();
SDValue InFlag;
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
InFlag = Chain.getValue(1);
SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -2177,8 +2189,8 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
Entry.Node = RetPtr;
Entry.Ty = PointerType::getUnqual(RetTy);
if (!Subtarget->is64Bit())
- Entry.isSRet = true;
- Entry.isReturned = false;
+ Entry.IsSRet = true;
+ Entry.IsReturned = false;
Args.push_back(Entry);
RetTyABI = Type::getVoidTy(*DAG.getContext());
}
@@ -3233,6 +3245,7 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const {
DebugLoc DL = MI.getDebugLoc();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -3244,7 +3257,8 @@ SparcTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
unsigned DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
+ (void)TRI;
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
@@ -3383,7 +3397,10 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default: break;
- case 'r': return C_RegisterClass;
+ case 'r':
+ case 'f':
+ case 'e':
+ return C_RegisterClass;
case 'I': // SIMM13
return C_Other;
}
@@ -3462,6 +3479,24 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &SP::IntPairRegClass);
else
return std::make_pair(0U, &SP::IntRegsRegClass);
+ case 'f':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::LowDFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::LowQFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for f-register-type!");
+ break;
+ case 'e':
+ if (VT == MVT::f32)
+ return std::make_pair(0U, &SP::FPRegsRegClass);
+ else if (VT == MVT::f64)
+ return std::make_pair(0U, &SP::DFPRegsRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SP::QFPRegsRegClass);
+ llvm_unreachable("Unknown ValueType for e-register-type!");
+ break;
}
} else if (!Constraint.empty() && Constraint.size() <= 5
&& Constraint[0] == '{' && *(Constraint.end()-1) == '}') {
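
With 'f' and 'e' now treated as register-class constraints and mapped to the FP classes above, SPARC inline assembly can ask for float registers directly. A hedged usage sketch in GCC-style inline asm (fadds is the single-precision add; this compiles only for a SPARC target):

// 'f' picks a single or low double/quad FP register; 'e' allows the
// full V9 range.
float fadd(float a, float b) {
  float r;
  asm("fadds %1, %2, %0" : "=f"(r) : "f"(a), "f"(b));
  return r;
}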
diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
index e0a421b..cc6386b 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -66,8 +66,8 @@ namespace llvm {
/// in Mask are known to be either zero or one and return them in the
/// Known.Zero/Known.One bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
index 5a19c62..3194ad4 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -27,6 +27,9 @@ def Is32Bit : Predicate<"!Subtarget->is64Bit()">;
// True when generating 64-bit code. This also implies HasV9.
def Is64Bit : Predicate<"Subtarget->is64Bit()">;
+def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">,
+ AssemblerPredicate<"FeatureSoftMulDiv">;
+
// HasV9 - This predicate is true when the target processor supports V9
// instructions. Note that the machine may be running in 32-bit mode.
def HasV9 : Predicate<"Subtarget->isV9()">,
@@ -195,7 +198,8 @@ def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP",
[SDNPHasChain, SDNPSideEffect]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -404,9 +408,9 @@ let Defs = [O7] in {
}
let Defs = [O6], Uses = [O6] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- "!ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ "!ADJCALLSTACKDOWN $amt1, $amt2",
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"!ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
index a3cedcb..a784124 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "Sparc.h"
#include "MCTargetDesc/SparcMCExpr.h"
+#include "Sparc.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
index 6ecfddf..6625eaa 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -346,11 +346,13 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
// Floating point register classes.
def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
-
def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
-
def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
+// The Low?FPRegs classes are used only for inline-asm constraints.
+def LowDFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
+def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>;
+
// Floating point control register classes.
def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index 43ddef3..daac56a 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -28,6 +28,7 @@ void SparcSubtarget::anchor() { }
SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
+ UseSoftMulDiv = false;
IsV9 = false;
IsLeon = false;
V8DeprecatedInsts = false;
diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
index fa42da4..d181399 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
+++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h
@@ -32,6 +32,7 @@ class StringRef;
class SparcSubtarget : public SparcGenSubtargetInfo {
Triple TargetTriple;
virtual void anchor();
+ bool UseSoftMulDiv;
bool IsV9;
bool IsLeon;
bool V8DeprecatedInsts;
@@ -76,6 +77,7 @@ public:
bool enableMachineScheduler() const override;
+ bool useSoftMulDiv() const { return UseSoftMulDiv; }
bool isV9() const { return IsV9; }
bool isLeon() const { return IsLeon; }
bool isVIS() const { return IsVIS; }
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index 4ae6406..c7a1ca2 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -11,9 +11,9 @@
//===----------------------------------------------------------------------===//
#include "SparcTargetMachine.h"
-#include "SparcTargetObjectFile.h"
-#include "Sparc.h"
#include "LeonPasses.h"
+#include "Sparc.h"
+#include "SparcTargetObjectFile.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -114,7 +114,7 @@ namespace {
/// Sparc Code Generator Pass Configuration Options.
class SparcPassConfig : public TargetPassConfig {
public:
- SparcPassConfig(SparcTargetMachine *TM, PassManagerBase &PM)
+ SparcPassConfig(SparcTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
SparcTargetMachine &getSparcTargetMachine() const {
@@ -128,11 +128,11 @@ public:
} // namespace
TargetPassConfig *SparcTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new SparcPassConfig(this, PM);
+ return new SparcPassConfig(*this, PM);
}
void SparcPassConfig::addIRPasses() {
- addPass(createAtomicExpandPass(&getSparcTargetMachine()));
+ addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
}
@@ -143,26 +143,26 @@ bool SparcPassConfig::addInstSelector() {
}
void SparcPassConfig::addPreEmitPass(){
- addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
+ addPass(createSparcDelaySlotFillerPass());
if (this->getSparcTargetMachine().getSubtargetImpl()->insertNOPLoad())
{
- addPass(new InsertNOPLoad(getSparcTargetMachine()));
+ addPass(new InsertNOPLoad());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->fixFSMULD())
{
- addPass(new FixFSMULD(getSparcTargetMachine()));
+ addPass(new FixFSMULD());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->replaceFMULS())
{
- addPass(new ReplaceFMULS(getSparcTargetMachine()));
+ addPass(new ReplaceFMULS());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->detectRoundChange()) {
- addPass(new DetectRoundChange(getSparcTargetMachine()));
+ addPass(new DetectRoundChange());
}
if (this->getSparcTargetMachine().getSubtargetImpl()->fixAllFDIVSQRT())
{
- addPass(new FixAllFDIVSQRT(getSparcTargetMachine()));
+ addPass(new FixAllFDIVSQRT());
}
}
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
index 48193fe..faf714c 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetMachine.h
@@ -40,6 +40,10 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
/// Sparc 32-bit target machine
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index 8fdde15..2c040dc 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -9,12 +9,18 @@
#include "SparcTargetObjectFile.h"
#include "MCTargetDesc/SparcMCExpr.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
+void SparcELFTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
MachineModuleInfo *MMI, MCStreamer &Streamer) const {
diff --git a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
index fe88006..3b1b345 100644
--- a/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -23,6 +23,8 @@ public:
TargetLoweringObjectFileELF()
{}
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
unsigned Encoding,
const TargetMachine &TM,
diff --git a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index a94717c..3368078 100644
--- a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -9,15 +9,30 @@
#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
#include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
using namespace llvm;
@@ -31,6 +46,7 @@ static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
}
namespace {
+
enum RegisterKind {
GR32Reg,
GRH32Reg,
@@ -45,6 +61,7 @@ enum RegisterKind {
VR64Reg,
VR128Reg,
AR32Reg,
+ CR64Reg,
};
enum MemoryKind {
@@ -56,7 +73,6 @@ enum MemoryKind {
};
class SystemZOperand : public MCParsedAsmOperand {
-public:
private:
enum OperandKind {
KindInvalid,
@@ -140,12 +156,14 @@ public:
SMLoc EndLoc) {
return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
}
+
static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
Op->Token.Data = Str.data();
Op->Token.Length = Str.size();
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
@@ -153,12 +171,14 @@ public:
Op->Reg.Num = Num;
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
Op->Imm = Expr;
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base,
const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm,
@@ -175,6 +195,7 @@ public:
Op->Mem.Length.Reg = LengthReg;
return Op;
}
+
static std::unique_ptr<SystemZOperand>
createImmTLS(const MCExpr *Imm, const MCExpr *Sym,
SMLoc StartLoc, SMLoc EndLoc) {
@@ -242,6 +263,9 @@ public:
bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const {
return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287);
}
+ bool isMemDisp12Len4(RegisterKind RegKind) const {
+ return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10);
+ }
bool isMemDisp12Len8(RegisterKind RegKind) const {
return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100);
}
@@ -251,6 +275,10 @@ public:
SMLoc getEndLoc() const override { return EndLoc; }
void print(raw_ostream &OS) const override;
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
+ SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
// Used by the TableGen code to add particular types of operand
// to an instruction.
void addRegOperands(MCInst &Inst, unsigned N) const {
@@ -320,6 +348,7 @@ public:
bool isVF128() const { return false; }
bool isVR128() const { return isReg(VR128Reg); }
bool isAR32() const { return isReg(AR32Reg); }
+ bool isCR64() const { return isReg(CR64Reg); }
bool isAnyReg() const { return (isReg() || isImm(0, 15)); }
bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); }
bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); }
@@ -327,6 +356,7 @@ public:
bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); }
bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); }
bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); }
+ bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); }
bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); }
bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); }
@@ -355,7 +385,8 @@ private:
RegGR,
RegFP,
RegV,
- RegAR
+ RegAR,
+ RegCR
};
struct Register {
RegisterGroup Group;
@@ -463,6 +494,9 @@ public:
OperandMatchResultTy parseAR32(OperandVector &Operands) {
return parseRegister(Operands, RegAR, SystemZMC::AR32Regs, AR32Reg);
}
+ OperandMatchResultTy parseCR64(OperandVector &Operands) {
+ return parseRegister(Operands, RegCR, SystemZMC::CR64Regs, CR64Reg);
+ }
OperandMatchResultTy parseAnyReg(OperandVector &Operands) {
return parseAnyRegister(Operands);
}
@@ -503,6 +537,7 @@ public:
return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true);
}
};
+
} // end anonymous namespace
#define GET_REGISTER_MATCHER
@@ -623,6 +658,8 @@ bool SystemZAsmParser::parseRegister(Register &Reg) {
Reg.Group = RegV;
else if (Prefix == 'a' && Reg.Num < 16)
Reg.Group = RegAR;
+ else if (Prefix == 'c' && Reg.Num < 16)
+ Reg.Group = RegCR;
else
return Error(Reg.StartLoc, "invalid register");
@@ -716,6 +753,10 @@ SystemZAsmParser::parseAnyRegister(OperandVector &Operands) {
Kind = AR32Reg;
RegNo = SystemZMC::AR32Regs[Reg.Num];
}
+ else if (Reg.Group == RegCR) {
+ Kind = CR64Reg;
+ RegNo = SystemZMC::CR64Regs[Reg.Num];
+ }
else {
return MatchOperand_ParseFail;
}
@@ -1031,6 +1072,8 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
RegNo = SystemZMC::VR128Regs[Reg.Num];
else if (Reg.Group == RegAR)
RegNo = SystemZMC::AR32Regs[Reg.Num];
+ else if (Reg.Group == RegCR)
+ RegNo = SystemZMC::CR64Regs[Reg.Num];
StartLoc = Reg.StartLoc;
EndLoc = Reg.EndLoc;
return false;
@@ -1125,6 +1168,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands,
return false;
}
+std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS);
+
bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -1170,8 +1215,13 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(ErrorLoc, "invalid operand for instruction");
}
- case Match_MnemonicFail:
- return Error(IDLoc, "invalid instruction");
+ case Match_MnemonicFail: {
+ uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits());
+ std::string Suggestion = SystemZMnemonicSpellCheck(
+ ((SystemZOperand &)*Operands[0]).getToken(), FBS);
+ return Error(IDLoc, "invalid instruction" + Suggestion,
+ ((SystemZOperand &)*Operands[0]).getLocRange());
+ }
}
llvm_unreachable("Unexpected match type");
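
SystemZMnemonicSpellCheck itself is generated by TableGen from the instruction tables; the parser only declares and calls it here. As a rough illustration of what such a suggester does, a toy edit-distance version (not the generated code):

#include <algorithm>
#include <string>
#include <vector>

static unsigned editDistance(const std::string &A, const std::string &B) {
  std::vector<unsigned> Row(B.size() + 1);
  for (unsigned j = 0; j <= B.size(); ++j) Row[j] = j;
  for (unsigned i = 1; i <= A.size(); ++i) {
    unsigned Prev = Row[0]; Row[0] = i;
    for (unsigned j = 1; j <= B.size(); ++j) {
      unsigned Cur = Row[j];
      Row[j] = std::min({Cur + 1, Row[j - 1] + 1,
                         Prev + (A[i - 1] != B[j - 1] ? 1u : 0u)});
      Prev = Cur;
    }
  }
  return Row[B.size()];
}

static std::string suggest(const std::string &Bad,
                           const std::vector<std::string> &Mnemonics) {
  std::string Best; unsigned BestD = 3; // only offer close matches
  for (const auto &M : Mnemonics) {
    unsigned D = editDistance(Bad, M);
    if (D < BestD) { BestD = D; Best = M; }
  }
  return Best.empty() ? "" : ", did you mean '" + Best + "'?";
}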
diff --git a/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 1806e01..8903b57 100644
--- a/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -7,12 +7,16 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "SystemZ.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -21,17 +25,19 @@ using namespace llvm;
typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
+
class SystemZDisassembler : public MCDisassembler {
public:
SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
- ~SystemZDisassembler() override {}
+ ~SystemZDisassembler() override = default;
DecodeStatus getInstruction(MCInst &instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
};
+
} // end anonymous namespace
static MCDisassembler *createSystemZDisassembler(const Target &T,
@@ -156,6 +162,12 @@ static DecodeStatus DecodeAR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
return decodeRegisterClass(Inst, RegNo, SystemZMC::AR32Regs, 16);
}
+static DecodeStatus DecodeCR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeRegisterClass(Inst, RegNo, SystemZMC::CR64Regs, 16);
+}
+
template<unsigned N>
static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) {
if (!isUInt<N>(Imm))
@@ -321,6 +333,18 @@ static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
return MCDisassembler::Success;
}
+static DecodeStatus decodeBDLAddr12Len4Operand(MCInst &Inst, uint64_t Field,
+ const unsigned *Regs) {
+ uint64_t Length = Field >> 16;
+ uint64_t Base = (Field >> 12) & 0xf;
+ uint64_t Disp = Field & 0xfff;
+ assert(Length < 16 && "Invalid BDLAddr12Len4");
+ Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base]));
+ Inst.addOperand(MCOperand::createImm(Disp));
+ Inst.addOperand(MCOperand::createImm(Length + 1));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
const unsigned *Regs) {
uint64_t Length = Field >> 16;
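
The new Len4 decoder mirrors the Len8 one directly above it: the field packs length-minus-one above a 4-bit base nibble and a 12-bit displacement, and the decoder re-adds the one. A standalone round trip of that packing, using the same shifts:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 5, Disp = 0x0AB, Length = 4;                 // operands
  uint64_t Field = ((Length - 1) << 16) | (Base << 12) | Disp; // encode
  assert((Field >> 16) + 1 == Length);   // length bits, re-add the 1
  assert(((Field >> 12) & 0xf) == Base); // base register nibble
  assert((Field & 0xfff) == Disp);       // 12-bit displacement
}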
@@ -393,6 +417,13 @@ static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
}
+static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst,
+ uint64_t Field,
+ uint64_t Address,
+ const void *Decoder) {
+ return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
uint64_t Field,
uint64_t Address,
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index 1207c7b..6cd12e1 100644
--- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===//
+//===- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax -===//
//
// The LLVM Compiler Infrastructure
//
@@ -10,10 +10,13 @@
#include "SystemZInstPrinter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 6336f5e..d65c661 100644
--- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -15,8 +15,10 @@
#define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
+#include <cstdint>
namespace llvm {
+
class MCOperand;
class SystemZInstPrinter : public MCInstPrinter {
@@ -70,6 +72,7 @@ private:
// This forms part of the instruction name rather than the operand list.
void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O);
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 9192448..51ac410 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "MCTargetDesc/SystemZMCFixups.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
@@ -50,8 +50,9 @@ public:
return SystemZ::NumTargetFixupKinds;
}
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
bool mayNeedRelaxation(const MCInst &Inst) const override {
return false;
}
@@ -89,15 +90,17 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
return Infos[Kind - FirstTargetFixupKind];
}
-void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
MCFixupKind Kind = Fixup.getKind();
unsigned Offset = Fixup.getOffset();
unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
unsigned Size = (BitSize + 7) / 8;
- assert(Offset + Size <= DataSize && "Invalid fixup offset!");
+ assert(Offset + Size <= Data.size() && "Invalid fixup offset!");
// Big-endian insertion of Size bytes.
Value = extractBitsForFixup(Kind, Value);
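
The comment announces a big-endian insertion of Size bytes; the loop itself falls outside the hunk. A standalone sketch of what such an insertion does, under the assumption that fixup bits are OR-ed into the instruction bytes most-significant byte first:

#include <cassert>
#include <cstdint>
#include <vector>

static void insertBE(std::vector<uint8_t> &Data, unsigned Offset,
                     unsigned BitSize, uint64_t Value) {
  unsigned Size = (BitSize + 7) / 8; // round bits up to whole bytes
  assert(Offset + Size <= Data.size() && "Invalid fixup offset!");
  for (unsigned I = 0; I != Size; ++I) // most-significant byte first
    Data[Offset + I] |= uint8_t(Value >> ((Size - 1 - I) * 8));
}

int main() {
  std::vector<uint8_t> Buf(6, 0);
  insertBE(Buf, 2, 32, 0x11223344);
  assert(Buf[2] == 0x11 && Buf[5] == 0x44);
}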
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index b17977d..6e00981 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -14,7 +14,7 @@
using namespace llvm;
SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) {
- PointerSize = 8;
+ CodePointerSize = 8;
CalleeSaveStackSlotSize = 8;
IsLittleEndian = false;
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index 7082aba..d188f56 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -11,20 +11,28 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "MCTargetDesc/SystemZMCFixups.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
namespace {
+
class SystemZMCCodeEmitter : public MCCodeEmitter {
const MCInstrInfo &MCII;
MCContext &Ctx;
@@ -34,7 +42,7 @@ public:
: MCII(mcii), Ctx(ctx) {
}
- ~SystemZMCCodeEmitter() override {}
+ ~SystemZMCCodeEmitter() override = default;
// Override MCCodeEmitter.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -69,6 +77,9 @@ private:
uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint64_t getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -137,13 +148,8 @@ private:
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
};
-} // end anonymous namespace
-MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new SystemZMCCodeEmitter(MCII, Ctx);
-}
+} // end anonymous namespace
void SystemZMCCodeEmitter::
encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -217,6 +223,17 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
}
uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI);
+ uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI);
+ uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1;
+ assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len));
+ return (Len << 16) | (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -282,3 +299,9 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "SystemZGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new SystemZMCCodeEmitter(MCII, Ctx);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 43a96e8..df0a816 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -7,35 +7,38 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "MCTargetDesc/SystemZMCFixups.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
namespace {
+
class SystemZObjectWriter : public MCELFObjectTargetWriter {
public:
SystemZObjectWriter(uint8_t OSABI);
-
- ~SystemZObjectWriter() override;
+ ~SystemZObjectWriter() override = default;
protected:
// Override MCELFObjectTargetWriter.
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
};
+
} // end anonymous namespace
SystemZObjectWriter::SystemZObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit=*/true, OSABI, ELF::EM_S390,
/*HasRelocationAddend=*/ true) {}
-SystemZObjectWriter::~SystemZObjectWriter() {
-}
-
// Return the relocation type for an absolute value of MCFixupKind Kind.
static unsigned getAbsoluteReloc(unsigned Kind) {
switch (Kind) {
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index dfea7e3..727ab92 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -116,6 +116,13 @@ const unsigned SystemZMC::AR32Regs[16] = {
SystemZ::A12, SystemZ::A13, SystemZ::A14, SystemZ::A15
};
+const unsigned SystemZMC::CR64Regs[16] = {
+ SystemZ::C0, SystemZ::C1, SystemZ::C2, SystemZ::C3,
+ SystemZ::C4, SystemZ::C5, SystemZ::C6, SystemZ::C7,
+ SystemZ::C8, SystemZ::C9, SystemZ::C10, SystemZ::C11,
+ SystemZ::C12, SystemZ::C13, SystemZ::C14, SystemZ::C15
+};
+
unsigned SystemZMC::getFirstReg(unsigned Reg) {
static unsigned Map[SystemZ::NUM_TARGET_REGS];
static bool Initialized = false;
diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index d9926c7..dbca348 100644
--- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -55,6 +55,7 @@ extern const unsigned VR32Regs[32];
extern const unsigned VR64Regs[32];
extern const unsigned VR128Regs[32];
extern const unsigned AR32Regs[16];
+extern const unsigned CR64Regs[16];
// Return the 0-based number of the first architectural register that
// contains the given LLVM register. E.g. R1D -> 1.
diff --git a/contrib/llvm/lib/Target/SystemZ/README.txt b/contrib/llvm/lib/Target/SystemZ/README.txt
index 86a1322..9b71415 100644
--- a/contrib/llvm/lib/Target/SystemZ/README.txt
+++ b/contrib/llvm/lib/Target/SystemZ/README.txt
@@ -63,7 +63,12 @@ via a register.)
--
-We don't use ICM or STCM.
+We don't use ICM, STCM, or CLM.
+
+--
+
+We don't use ADD (LOGICAL) HIGH, SUBTRACT (LOGICAL) HIGH,
+or COMPARE (LOGICAL) HIGH yet.
--
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZ.td b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
index 6bdfd4d..41300a1 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZ.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZ.td
@@ -54,6 +54,9 @@ include "SystemZInstrFormats.td"
include "SystemZInstrInfo.td"
include "SystemZInstrVector.td"
include "SystemZInstrFP.td"
+include "SystemZInstrHFP.td"
+include "SystemZInstrDFP.td"
+include "SystemZInstrSystem.td"
def SystemZInstrInfo : InstrInfo {}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
index b4c843f..d70f9e9 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -13,15 +13,23 @@
//
//===----------------------------------------------------------------------===//
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
#include "SystemZTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -33,11 +41,11 @@ STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
namespace {
+
// Represents the references to a particular register in one or more
// instructions.
struct Reference {
- Reference()
- : Def(false), Use(false) {}
+ Reference() = default;
Reference &operator|=(const Reference &Other) {
Def |= Other.Def;
@@ -49,15 +57,16 @@ struct Reference {
// True if the register is defined or used in some form, either directly or
// via a sub- or super-register.
- bool Def;
- bool Use;
+ bool Def = false;
+ bool Use = false;
};
class SystemZElimCompare : public MachineFunctionPass {
public:
static char ID;
+
SystemZElimCompare(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {}
+ : MachineFunctionPass(ID) {}
StringRef getPassName() const override {
return "SystemZ Comparison Elimination";
@@ -65,6 +74,7 @@ public:
bool processBlock(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &F) override;
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -84,16 +94,13 @@ private:
bool fuseCompareOperations(MachineInstr &Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
- const SystemZInstrInfo *TII;
- const TargetRegisterInfo *TRI;
+ const SystemZInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
};
char SystemZElimCompare::ID = 0;
-} // end anonymous namespace
-FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
- return new SystemZElimCompare(TM);
-}
+} // end anonymous namespace
// Return true if CC is live out of MBB.
static bool isCCLiveOut(MachineBasicBlock &MBB) {
@@ -167,7 +174,7 @@ static unsigned getCompareSourceReg(MachineInstr &Compare) {
reg = Compare.getOperand(0).getReg();
else if (isLoadAndTestAsCmp(Compare))
reg = Compare.getOperand(1).getReg();
- assert (reg);
+ assert(reg);
return reg;
}
@@ -216,9 +223,7 @@ bool SystemZElimCompare::convertToBRCT(
Branch->RemoveOperand(0);
Branch->setDesc(TII->get(BRCT));
MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
- MIB.addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(Target);
+ MIB.add(MI.getOperand(0)).add(MI.getOperand(1)).add(Target);
// Add a CC def to BRCT(G), since we may have to split them again if the
// branch displacement overflows. BRCTH has a 32-bit displacement, so
// this is not necessary there.
@@ -261,10 +266,10 @@ bool SystemZElimCompare::convertToLoadAndTrap(
Branch->RemoveOperand(0);
Branch->setDesc(TII->get(LATOpcode));
MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
- .addOperand(MI.getOperand(3));
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
MI.eraseFromParent();
return true;
}
@@ -368,10 +373,8 @@ static bool isCompareZero(MachineInstr &Compare) {
return true;
default:
-
if (isLoadAndTestAsCmp(Compare))
return true;
-
return Compare.getNumExplicitOperands() == 2 &&
Compare.getOperand(1).isImm() && Compare.getOperand(1).getImm() == 0;
}
@@ -502,15 +505,15 @@ bool SystemZElimCompare::fuseCompareOperations(
Branch->setDesc(TII->get(FusedOpcode));
MachineInstrBuilder MIB(*Branch->getParent()->getParent(), Branch);
for (unsigned I = 0; I < SrcNOps; I++)
- MIB.addOperand(Compare.getOperand(I));
- MIB.addOperand(CCMask);
+ MIB.add(Compare.getOperand(I));
+ MIB.add(CCMask);
if (Type == SystemZII::CompareAndBranch) {
// Only conditional branches define CC, as they may be converted back
// to a non-fused branch because of a long displacement. Conditional
// returns don't have that problem.
- MIB.addOperand(Target)
- .addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+ MIB.add(Target).addReg(SystemZ::CC,
+ RegState::ImplicitDefine | RegState::Dead);
}
if (Type == SystemZII::CompareAndSibcall)
@@ -573,3 +576,7 @@ bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
return Changed;
}
+
+FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
+ return new SystemZElimCompare(TM);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
index 92ce808..d02db9a 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZExpandPseudo.cpp
@@ -74,7 +74,7 @@ bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB,
unsigned CCValid = MI.getOperand(3).getImm();
unsigned CCMask = MI.getOperand(4).getImm();
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
+ LivePhysRegs LiveRegs(TII->getRegisterInfo());
LiveRegs.addLiveOuts(MBB);
for (auto I = std::prev(MBB.end()); I != MBBI; --I)
LiveRegs.stepBackward(*I);
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
index 716e5ad..fda9c30 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -68,6 +68,21 @@ def FeaturePopulationCount : SystemZFeature<
"Assume that the population-count facility is installed"
>;
+def FeatureMessageSecurityAssist3 : SystemZFeature<
+ "message-security-assist-extension3", "MessageSecurityAssist3",
+ "Assume that the message-security-assist extension facility 3 is installed"
+>;
+
+def FeatureMessageSecurityAssist4 : SystemZFeature<
+ "message-security-assist-extension4", "MessageSecurityAssist4",
+ "Assume that the message-security-assist extension facility 4 is installed"
+>;
+
+def FeatureResetReferenceBitsMultiple : SystemZFeature<
+ "reset-reference-bits-multiple", "ResetReferenceBitsMultiple",
+ "Assume that the reset-reference-bits-multiple facility is installed"
+>;
+
def Arch9NewFeatures : SystemZFeatureList<[
FeatureDistinctOps,
FeatureFastSerialization,
@@ -75,7 +90,10 @@ def Arch9NewFeatures : SystemZFeatureList<[
FeatureHighWord,
FeatureInterlockedAccess1,
FeatureLoadStoreOnCond,
- FeaturePopulationCount
+ FeaturePopulationCount,
+ FeatureMessageSecurityAssist3,
+ FeatureMessageSecurityAssist4,
+ FeatureResetReferenceBitsMultiple
]>;
//===----------------------------------------------------------------------===//
@@ -109,12 +127,24 @@ def FeatureTransactionalExecution : SystemZFeature<
"Assume that the transactional-execution facility is installed"
>;
+def FeatureDFPZonedConversion : SystemZFeature<
+ "dfp-zoned-conversion", "DFPZonedConversion",
+ "Assume that the DFP zoned-conversion facility is installed"
+>;
+
+def FeatureEnhancedDAT2 : SystemZFeature<
+ "enhanced-dat-2", "EnhancedDAT2",
+ "Assume that the enhanced-DAT facility 2 is installed"
+>;
+
def Arch10NewFeatures : SystemZFeatureList<[
FeatureExecutionHint,
FeatureLoadAndTrap,
FeatureMiscellaneousExtensions,
FeatureProcessorAssist,
- FeatureTransactionalExecution
+ FeatureTransactionalExecution,
+ FeatureDFPZonedConversion,
+ FeatureEnhancedDAT2
]>;
//===----------------------------------------------------------------------===//
@@ -133,6 +163,16 @@ def FeatureLoadStoreOnCond2 : SystemZFeature<
"Assume that the load/store-on-condition facility 2 is installed"
>;
+def FeatureMessageSecurityAssist5 : SystemZFeature<
+ "message-security-assist-extension5", "MessageSecurityAssist5",
+ "Assume that the message-security-assist extension facility 5 is installed"
+>;
+
+def FeatureDFPPackedConversion : SystemZFeature<
+ "dfp-packed-conversion", "DFPPackedConversion",
+ "Assume that the DFP packed-conversion facility is installed"
+>;
+
def FeatureVector : SystemZFeature<
"vector", "Vector",
"Assume that the vectory facility is installed"
@@ -142,11 +182,65 @@ def FeatureNoVector : SystemZMissingFeature<"Vector">;
def Arch11NewFeatures : SystemZFeatureList<[
FeatureLoadAndZeroRightmostByte,
FeatureLoadStoreOnCond2,
+ FeatureMessageSecurityAssist5,
+ FeatureDFPPackedConversion,
FeatureVector
]>;
//===----------------------------------------------------------------------===//
//
+// New features added in the Twelfth Edition of the z/Architecture
+//
+//===----------------------------------------------------------------------===//
+
+def FeatureMiscellaneousExtensions2 : SystemZFeature<
+ "miscellaneous-extensions-2", "MiscellaneousExtensions2",
+ "Assume that the miscellaneous-extensions facility 2 is installed"
+>;
+
+def FeatureGuardedStorage : SystemZFeature<
+ "guarded-storage", "GuardedStorage",
+ "Assume that the guarded-storage facility is installed"
+>;
+
+def FeatureMessageSecurityAssist7 : SystemZFeature<
+ "message-security-assist-extension7", "MessageSecurityAssist7",
+ "Assume that the message-security-assist extension facility 7 is installed"
+>;
+
+def FeatureMessageSecurityAssist8 : SystemZFeature<
+ "message-security-assist-extension8", "MessageSecurityAssist8",
+ "Assume that the message-security-assist extension facility 8 is installed"
+>;
+
+def FeatureVectorEnhancements1 : SystemZFeature<
+ "vector-enhancements-1", "VectorEnhancements1",
+ "Assume that the vector enhancements facility 1 is installed"
+>;
+def FeatureNoVectorEnhancements1 : SystemZMissingFeature<"VectorEnhancements1">;
+
+def FeatureVectorPackedDecimal : SystemZFeature<
+ "vector-packed-decimal", "VectorPackedDecimal",
+ "Assume that the vector packed decimal facility is installed"
+>;
+
+def FeatureInsertReferenceBitsMultiple : SystemZFeature<
+ "insert-reference-bits-multiple", "InsertReferenceBitsMultiple",
+ "Assume that the insert-reference-bits-multiple facility is installed"
+>;
+
+def Arch12NewFeatures : SystemZFeatureList<[
+ FeatureMiscellaneousExtensions2,
+ FeatureGuardedStorage,
+ FeatureMessageSecurityAssist7,
+ FeatureMessageSecurityAssist8,
+ FeatureVectorEnhancements1,
+ FeatureVectorPackedDecimal,
+ FeatureInsertReferenceBitsMultiple
+]>;
+
+//===----------------------------------------------------------------------===//
+//
// Cumulative supported and unsupported feature sets
//
//===----------------------------------------------------------------------===//
@@ -159,9 +253,13 @@ def Arch10SupportedFeatures
: SystemZFeatureAdd<Arch9SupportedFeatures.List, Arch10NewFeatures.List>;
def Arch11SupportedFeatures
: SystemZFeatureAdd<Arch10SupportedFeatures.List, Arch11NewFeatures.List>;
+def Arch12SupportedFeatures
+ : SystemZFeatureAdd<Arch11SupportedFeatures.List, Arch12NewFeatures.List>;
-def Arch11UnsupportedFeatures
+def Arch12UnsupportedFeatures
: SystemZFeatureList<[]>;
+def Arch11UnsupportedFeatures
+ : SystemZFeatureAdd<Arch12UnsupportedFeatures.List, Arch12NewFeatures.List>;
def Arch10UnsupportedFeatures
: SystemZFeatureAdd<Arch11UnsupportedFeatures.List, Arch11NewFeatures.List>;
def Arch9UnsupportedFeatures
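A reading aid for the cumulative sets above: supported features accumulate forward (each architecture adds its new features to its predecessor's set), while unsupported features accumulate backward (each architecture inherits everything its successor rejects plus that successor's new features), which is why Arch11UnsupportedFeatures is now defined in terms of Arch12. A minimal standalone C++ sketch of the same bookkeeping, with abbreviated feature names and a hand-rolled add() standing in for SystemZFeatureAdd:

#include <set>
#include <string>

using FeatureSet = std::set<std::string>;

// Set union, standing in for SystemZFeatureAdd.
static FeatureSet add(FeatureSet a, const FeatureSet &b) {
  a.insert(b.begin(), b.end());
  return a;
}

int main() {
  FeatureSet arch11New = {"load-store-on-cond-2", "vector"};
  FeatureSet arch12New = {"guarded-storage", "vector-enhancements-1"};

  FeatureSet arch11Supported = {"distinct-ops", "population-count"};
  FeatureSet arch12Supported = add(arch11Supported, arch12New);

  FeatureSet arch12Unsupported = {};  // the newest arch rejects nothing
  FeatureSet arch11Unsupported = add(arch12Unsupported, arch12New);
  // arch10Unsupported would be add(arch11Unsupported, arch11New), and so on.
  return arch11Unsupported.count("guarded-storage") ? 0 : 1;
}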
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index a28a91e..0cb2b5a 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -277,8 +277,21 @@ void SystemZFrameLowering::
processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- uint64_t MaxReach = (MFFrame.estimateStackSize(MF) +
- SystemZMC::CallFrameSize * 2);
+ // Get the size of our stack frame to be allocated ...
+ uint64_t StackSize = (MFFrame.estimateStackSize(MF) +
+ SystemZMC::CallFrameSize);
+ // ... and the maximum offset we may need to reach into the
+ // caller's frame to access the save area or stack arguments.
+ int64_t MaxArgOffset = SystemZMC::CallFrameSize;
+ for (int I = MFFrame.getObjectIndexBegin(); I != 0; ++I)
+ if (MFFrame.getObjectOffset(I) >= 0) {
+ int64_t ArgOffset = SystemZMC::CallFrameSize +
+ MFFrame.getObjectOffset(I) +
+ MFFrame.getObjectSize(I);
+ MaxArgOffset = std::max(MaxArgOffset, ArgOffset);
+ }
+
+ uint64_t MaxReach = StackSize + MaxArgOffset;
if (!isUInt<12>(MaxReach)) {
// We may need register scavenging slots if some parts of the frame
// are outside the reach of an unsigned 12-bit displacement.
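With concrete (invented) numbers, the new reach computation behaves like this sketch; CallFrameSize is the SystemZ call-frame size of 160 bytes, everything else is hypothetical:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t CallFrameSize = 160;       // SystemZMC::CallFrameSize
  uint64_t EstimatedStackSize = 3000;      // hypothetical estimateStackSize()
  uint64_t StackSize = EstimatedStackSize + CallFrameSize;

  // One hypothetical incoming-argument object at offset 24, size 8.
  int64_t MaxArgOffset = CallFrameSize;
  MaxArgOffset = std::max(MaxArgOffset, CallFrameSize + 24 + 8);

  uint64_t MaxReach = StackSize + MaxArgOffset;  // 3160 + 192 = 3352
  std::cout << (MaxReach < 4096) << '\n';  // 1: fits a 12-bit displacement
}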
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
index fe4b52b..73a1036 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.cpp
@@ -26,7 +26,7 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
// This is the limit of processor resource usage at which the
// scheduler should try to look for other instructions (not using the
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
index 8fa54ee..0c755c9 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZHazardRecognizer.h
@@ -25,10 +25,10 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZHAZARDRECOGNIZER_H
#include "SystemZSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/raw_ostream.h"
#include <string>
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 920b6e4..cd2f708 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -15,6 +15,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -711,9 +712,9 @@ bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
// The inner check covers all cases but is more expensive.
uint64_t Used = allOnes(Op.getValueSizeInBits());
if (Used != (AndMask | InsertMask)) {
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(Op.getOperand(0), KnownZero, KnownOne);
- if (Used != (AndMask | InsertMask | KnownZero.getZExtValue()))
+ KnownBits Known;
+ CurDAG->computeKnownBits(Op.getOperand(0), Known);
+ if (Used != (AndMask | InsertMask | Known.Zero.getZExtValue()))
return false;
}
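This hunk (and the two that follow) belong to the LLVM-wide migration from a (KnownZero, KnownOne) APInt pair to the KnownBits struct. A toy stand-in showing just the access pattern; this is not the real llvm::KnownBits class, and the masks are invented:

#include <cstdint>

// Toy stand-in for llvm::KnownBits (the real class holds APInts).
struct KnownBits {
  uint64_t Zero = 0;  // bits known to be zero
  uint64_t One = 0;   // bits known to be one
};

int main() {
  // Mirrors the detectOrAndInsertion check: every used bit must be covered
  // by the AND mask, the insertion mask, or a known-zero bit.
  uint64_t Used = 0xFFFFFFFF;  // hypothetical 32-bit value width
  uint64_t AndMask = 0x0000FFF0, InsertMask = 0x0000000F;
  KnownBits Known;
  Known.Zero = 0xFFFF0000;  // upper half known zero (assumed)
  return Used == (AndMask | InsertMask | Known.Zero) ? 0 : 1;
}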
@@ -770,9 +771,9 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
// If some bits of Input are already known zeros, those bits will have
// been removed from the mask. See if adding them back in makes the
// mask suitable.
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
- Mask |= KnownZero.getZExtValue();
+ KnownBits Known;
+ CurDAG->computeKnownBits(Input, Known);
+ Mask |= Known.Zero.getZExtValue();
if (!refineRxSBGMask(RxSBG, Mask))
return false;
}
@@ -794,9 +795,9 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
// If some bits of Input are already known ones, those bits will have
// been removed from the mask. See if adding them back in makes the
// mask suitable.
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
- Mask &= ~KnownOne.getZExtValue();
+ KnownBits Known;
+ CurDAG->computeKnownBits(Input, Known);
+ Mask &= ~Known.One.getZExtValue();
if (!refineRxSBGMask(RxSBG, Mask))
return false;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2d0a06a..2d916d2 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -20,8 +20,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
#include <cctype>
using namespace llvm;
@@ -100,7 +101,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
}
- addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
+ if (Subtarget.hasVectorEnhancements1())
+ addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
+ else
+ addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
if (Subtarget.hasVector()) {
addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
@@ -194,6 +198,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMUL_LOHI, VT, Custom);
// Only z196 and above have native support for conversions to unsigned.
+ // On z10, promoting to i64 doesn't generate an inexact condition for
+ // values that are outside the i32 range but in the i64 range, so use
+ // the default expansion.
if (!Subtarget.hasFPExtension())
setOperationAction(ISD::FP_TO_UINT, VT, Expand);
}
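For reference, the default FP_TO_UINT expansion the new comment relies on builds the unsigned conversion out of signed ones; a scalar sketch of the arithmetic (not the DAG form LLVM actually emits):

#include <cassert>
#include <cstdint>

// Values below 2^63 convert directly through the signed path; larger
// values are rebased by 2^63 first and the bit is added back.
static uint64_t fptoui64(double x) {
  const double Two63 = 9223372036854775808.0;  // 2^63
  if (x < Two63)
    return (uint64_t)(int64_t)x;
  return (uint64_t)(int64_t)(x - Two63) + (1ULL << 63);
}

int main() {
  assert(fptoui64(5.0) == 5);
  assert(fptoui64(9223372036854775808.0) == (1ULL << 63));
}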
@@ -312,7 +319,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::AND, VT, Legal);
setOperationAction(ISD::OR, VT, Legal);
setOperationAction(ISD::XOR, VT, Legal);
- setOperationAction(ISD::CTPOP, VT, Custom);
+ if (Subtarget.hasVectorEnhancements1())
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ else
+ setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Legal);
setOperationAction(ISD::CTLZ, VT, Legal);
@@ -344,9 +354,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// There should be no need to check for float types other than v2f64
// since <2 x f32> isn't a legal type.
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
}
// Handle floating-point types.
@@ -406,10 +420,60 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
}
+ // The vector enhancements facility 1 has instructions for these.
+ if (Subtarget.hasVectorEnhancements1()) {
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FABS, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::f64, Legal);
+
+ setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal);
+
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
+
+ setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::FMAXNUM, MVT::f128, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f128, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
+ setOperationAction(ISD::FMINNAN, MVT::f128, Legal);
+ }
+
// We have fused multiply-addition for f32 and f64 but not f128.
setOperationAction(ISD::FMA, MVT::f32, Legal);
setOperationAction(ISD::FMA, MVT::f64, Legal);
- setOperationAction(ISD::FMA, MVT::f128, Expand);
+ if (Subtarget.hasVectorEnhancements1())
+ setOperationAction(ISD::FMA, MVT::f128, Legal);
+ else
+ setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+ // We don't have a copysign instruction on vector registers.
+ if (Subtarget.hasVectorEnhancements1())
+ setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
// Needed so that we don't try to implement f128 constant loads using
// a load-and-extend of a f80 constant (in cases where the constant
@@ -417,6 +481,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
for (MVT VT : MVT::fp_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+  // We don't have an extending load instruction on vector registers.
+ if (Subtarget.hasVectorEnhancements1()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
+ }
+
// Floating-point truncation and stores need to be done separately.
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
@@ -481,7 +551,7 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
case MVT::f64:
return true;
case MVT::f128:
- return false;
+ return Subtarget.hasVectorEnhancements1();
default:
break;
}
@@ -822,7 +892,7 @@ bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
return isTruncateFree(FromType, ToType);
}
-bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+bool SystemZTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
}
@@ -1102,9 +1172,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Mark the start of the call.
if (!IsTailCall)
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, DL, PtrVT, true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
// Copy argument values to their designated locations.
SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -1316,11 +1384,6 @@ SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
}
-SDValue SystemZTargetLowering::prepareVolatileOrAtomicLoad(
- SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const {
- return DAG.getNode(SystemZISD::SERIALIZE, DL, MVT::Other, Chain);
-}
-
// Return true if Op is an intrinsic node with chain that returns the CC value
// as its only (other) argument. Provide the associated SystemZISD opcode and
// the mask of valid CC values if so.
@@ -1461,21 +1524,25 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
return true;
case Intrinsic::s390_vfcedbs:
+ case Intrinsic::s390_vfcesbs:
Opcode = SystemZISD::VFCMPES;
CCValid = SystemZ::CCMASK_VCMP;
return true;
case Intrinsic::s390_vfchdbs:
+ case Intrinsic::s390_vfchsbs:
Opcode = SystemZISD::VFCMPHS;
CCValid = SystemZ::CCMASK_VCMP;
return true;
case Intrinsic::s390_vfchedbs:
+ case Intrinsic::s390_vfchesbs:
Opcode = SystemZISD::VFCMPHES;
CCValid = SystemZ::CCMASK_VCMP;
return true;
case Intrinsic::s390_vftcidb:
+ case Intrinsic::s390_vftcisb:
Opcode = SystemZISD::VFTCI;
CCValid = SystemZ::CCMASK_VCMP;
return true;
@@ -2053,6 +2120,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
if (NewC.ICmpType != SystemZICMP::SignedOnly &&
NewC.Op0.getOpcode() == ISD::SHL &&
isSimpleShift(NewC.Op0, ShiftVal) &&
+ (MaskVal >> ShiftVal != 0) &&
(NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
MaskVal >> ShiftVal,
CmpVal >> ShiftVal,
@@ -2062,6 +2130,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
} else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
NewC.Op0.getOpcode() == ISD::SRL &&
isSimpleShift(NewC.Op0, ShiftVal) &&
+ (MaskVal << ShiftVal != 0) &&
(NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
MaskVal << ShiftVal,
CmpVal << ShiftVal,
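The two added (MaskVal >> ShiftVal != 0) / (MaskVal << ShiftVal != 0) guards reject masks that the shift clears entirely; a quick standalone illustration of the SHL case, with invented values:

#include <cassert>
#include <cstdint>

int main() {
  // Rewriting ((x << 12) & Mask) == Cmp as (x & (Mask >> 12)) == (Cmp >> 12)
  // is only meaningful if the shifted mask is nonzero; here it collapses,
  // so the new guard makes adjustForTestUnderMask skip the transformation.
  uint64_t Mask = 0x0FFF, ShiftVal = 12;
  assert((Mask >> ShiftVal) == 0);
}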
@@ -2221,15 +2290,12 @@ static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
// Lower a binary operation that produces two VT results, one in each
// half of a GR128 pair. Op0 and Op1 are the VT operands to the operation,
-// Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
-// on the extended Op0 and (unextended) Op1. Store the even register result
+// and Opcode performs the GR128 operation. Store the even register result
// in Even and the odd register result in Odd.
static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
- unsigned Extend, unsigned Opcode, SDValue Op0,
- SDValue Op1, SDValue &Even, SDValue &Odd) {
- SDNode *In128 = DAG.getMachineNode(Extend, DL, MVT::Untyped, Op0);
- SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
- SDValue(In128, 0), Op1);
+ unsigned Opcode, SDValue Op0, SDValue Op1,
+ SDValue &Even, SDValue &Odd) {
+ SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped, Op0, Op1);
bool Is32Bit = is32Bit(VT);
Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
@@ -2316,11 +2382,15 @@ static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
// producing a result of type VT.
-static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL,
- EVT VT, SDValue CmpOp0, SDValue CmpOp1) {
- // There is no hardware support for v4f32, so extend the vector into
- // two v2f64s and compare those.
- if (CmpOp0.getValueType() == MVT::v4f32) {
+SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode,
+ const SDLoc &DL, EVT VT,
+ SDValue CmpOp0,
+ SDValue CmpOp1) const {
+ // There is no hardware support for v4f32 (unless we have the vector
+ // enhancements facility 1), so extend the vector into two v2f64s
+ // and compare those.
+ if (CmpOp0.getValueType() == MVT::v4f32 &&
+ !Subtarget.hasVectorEnhancements1()) {
SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0);
SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0);
SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1);
@@ -2334,9 +2404,11 @@ static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL,
// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
// an integer mask of type VT.
-static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
- ISD::CondCode CC, SDValue CmpOp0,
- SDValue CmpOp1) {
+SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG,
+ const SDLoc &DL, EVT VT,
+ ISD::CondCode CC,
+ SDValue CmpOp0,
+ SDValue CmpOp1) const {
bool IsFP = CmpOp0.getValueType().isFloatingPoint();
bool Invert = false;
SDValue Cmp;
@@ -2344,6 +2416,7 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// Handle tests for order using (or (ogt y x) (oge x y)).
case ISD::SETUO:
Invert = true;
+ LLVM_FALLTHROUGH;
case ISD::SETO: {
assert(IsFP && "Unexpected integer comparison");
SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
@@ -2355,6 +2428,7 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// Handle <> tests using (or (ogt y x) (ogt x y)).
case ISD::SETUEQ:
Invert = true;
+ LLVM_FALLTHROUGH;
case ISD::SETONE: {
assert(IsFP && "Unexpected integer comparison");
SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0);
@@ -2789,8 +2863,9 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
// but we need this case for bitcasts that are created during lowering
// and which are then lowered themselves.
if (auto *LoadN = dyn_cast<LoadSDNode>(In))
- return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
- LoadN->getMemOperand());
+ if (ISD::isNormalLoad(LoadN))
+ return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
+ LoadN->getMemOperand());
if (InVT == MVT::i32 && ResVT == MVT::f32) {
SDValue In64;
@@ -2957,8 +3032,14 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
// We define this so that it can be used for constant division.
lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
Op.getOperand(1), Ops[1], Ops[0]);
+ else if (Subtarget.hasMiscellaneousExtensions2())
+ // SystemZISD::SMUL_LOHI returns the low result in the odd register and
+ // the high result in the even register. ISD::SMUL_LOHI is defined to
+ // return the low half first, so the results are in reverse order.
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI,
+ Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
else {
- // Do a full 128-bit multiplication based on UMUL_LOHI64:
+ // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI:
//
// (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
//
@@ -2976,10 +3057,10 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
SDValue RL = Op.getOperand(1);
SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
- // UMUL_LOHI64 returns the low result in the odd register and the high
- // result in the even register. SMUL_LOHI is defined to return the
- // low half first, so the results are in reverse order.
- lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+ // SystemZISD::UMUL_LOHI returns the low result in the odd register and
+ // the high result in the even register. ISD::SMUL_LOHI is defined to
+ // return the low half first, so the results are in reverse order.
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
LL, RL, Ops[1], Ops[0]);
SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
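The two AND terms mirror the standard identity for recovering the signed high half of a 64x64->128 product from the unsigned one: smulh(a,b) = umulh(a,b) - (a < 0 ? b : 0) - (b < 0 ? a : 0). A standalone check of that identity (relies on the GCC/Clang __int128 extension and arithmetic right shift of negative values):

#include <cassert>
#include <cstdint>

int main() {
  int64_t ll = -7, rl = 9;
  // Sign masks, as in the DAG sequence: all-ones iff the operand is negative.
  uint64_t lh = (uint64_t)(ll >> 63);
  uint64_t rh = (uint64_t)(rl >> 63);
  unsigned __int128 u = (unsigned __int128)(uint64_t)ll * (uint64_t)rl;
  uint64_t hi = (uint64_t)(u >> 64);
  hi -= ((uint64_t)ll & rh) + ((uint64_t)rl & lh);  // the two AND terms
  assert(hi == (uint64_t)(((__int128)ll * rl) >> 64));
}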
@@ -3000,10 +3081,10 @@ SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
Op.getOperand(1), Ops[1], Ops[0]);
else
- // UMUL_LOHI64 returns the low result in the odd register and the high
- // result in the even register. UMUL_LOHI is defined to return the
- // low half first, so the results are in reverse order.
- lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+ // SystemZISD::UMUL_LOHI returns the low result in the odd register and
+ // the high result in the even register. ISD::UMUL_LOHI is defined to
+ // return the low half first, so the results are in reverse order.
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
return DAG.getMergeValues(Ops, DL);
}
@@ -3014,24 +3095,19 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
SDValue Op1 = Op.getOperand(1);
EVT VT = Op.getValueType();
SDLoc DL(Op);
- unsigned Opcode;
- // We use DSGF for 32-bit division.
- if (is32Bit(VT)) {
+ // We use DSGF for 32-bit division. This means the first operand must
+ // always be 64-bit, and the second operand should be 32-bit whenever
+ // that is possible, to improve performance.
+ if (is32Bit(VT))
Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
- Opcode = SystemZISD::SDIVREM32;
- } else if (DAG.ComputeNumSignBits(Op1) > 32) {
+ else if (DAG.ComputeNumSignBits(Op1) > 32)
Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
- Opcode = SystemZISD::SDIVREM32;
- } else
- Opcode = SystemZISD::SDIVREM64;
- // DSG(F) takes a 64-bit dividend, so the even register in the GR128
- // input is "don't care". The instruction returns the remainder in
- // the even register and the quotient in the odd register.
+ // DSG(F) returns the remainder in the even register and the
+ // quotient in the odd register.
SDValue Ops[2];
- lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode,
- Op0, Op1, Ops[1], Ops[0]);
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::SDIVREM, Op0, Op1, Ops[1], Ops[0]);
return DAG.getMergeValues(Ops, DL);
}
@@ -3040,16 +3116,11 @@ SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
EVT VT = Op.getValueType();
SDLoc DL(Op);
- // DL(G) uses a double-width dividend, so we need to clear the even
- // register in the GR128 input. The instruction returns the remainder
- // in the even register and the quotient in the odd register.
+ // DL(G) returns the remainder in the even register and the
+ // quotient in the odd register.
SDValue Ops[2];
- if (is32Bit(VT))
- lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_32, SystemZISD::UDIVREM32,
- Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
- else
- lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_64, SystemZISD::UDIVREM64,
- Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::UDIVREM,
+ Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
return DAG.getMergeValues(Ops, DL);
}
@@ -3058,14 +3129,14 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
// Get the known-zero masks for each operand.
SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
- APInt KnownZero[2], KnownOne[2];
- DAG.computeKnownBits(Ops[0], KnownZero[0], KnownOne[0]);
- DAG.computeKnownBits(Ops[1], KnownZero[1], KnownOne[1]);
+ KnownBits Known[2];
+ DAG.computeKnownBits(Ops[0], Known[0]);
+ DAG.computeKnownBits(Ops[1], Known[1]);
// See if the upper 32 bits of one operand and the lower 32 bits of the
// other are known zero. They are the low and high operands respectively.
- uint64_t Masks[] = { KnownZero[0].getZExtValue(),
- KnownZero[1].getZExtValue() };
+ uint64_t Masks[] = { Known[0].Zero.getZExtValue(),
+ Known[1].Zero.getZExtValue() };
unsigned High, Low;
if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
High = 1, Low = 0;
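Concretely: an operand whose upper 32 bits are all known zero can only contribute the low word of the OR, and vice versa. With invented masks:

#include <cstdint>

int main() {
  // Known-zero masks of the two OR operands (hypothetical).
  uint64_t Masks[2] = {0xFFFFFFFF00000000ULL,   // op0: upper half known zero
                       0x00000000FFFFFFFFULL};  // op1: lower half known zero
  unsigned High = 0, Low = 1;
  if ((Masks[0] >> 32) == 0xffffffff && (uint32_t)Masks[1] == 0xffffffff) {
    High = 1;  // op1 supplies the high word
    Low = 0;   // op0 supplies the low word
  }
  return (High == 1 && Low == 0) ? 0 : 1;
}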
@@ -3150,9 +3221,9 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
}
// Get the known-zero mask for the operand.
- APInt KnownZero, KnownOne;
- DAG.computeKnownBits(Op, KnownZero, KnownOne);
- unsigned NumSignificantBits = (~KnownZero).getActiveBits();
+ KnownBits Known;
+ DAG.computeKnownBits(Op, Known);
+ unsigned NumSignificantBits = (~Known.Zero).getActiveBits();
if (NumSignificantBits == 0)
return DAG.getConstant(0, DL, VT);
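The significant-bit count works because only bits not known zero can contribute to the population count; e.g. when everything above the low byte is known zero, the byte-wise POPCNT result needs no cross-byte summation. A sketch of (~Known.Zero).getActiveBits() with an assumed mask:

#include <cstdint>

int main() {
  uint64_t KnownZero = ~0xFFULL;   // upper 56 bits known zero (assumed)
  uint64_t MaybeSet = ~KnownZero;  // 0xFF: only the low byte may be set
  unsigned ActiveBits = 0;
  while (MaybeSet >> ActiveBits)
    ++ActiveBits;
  return ActiveBits == 8 ? 0 : 1;  // at most 8 significant bits
}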
@@ -3189,13 +3260,13 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
SDLoc DL(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
- FenceScope == CrossThread) {
+ FenceSSID == SyncScope::System) {
return SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other,
Op.getOperand(0)),
0);
@@ -3205,12 +3276,15 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}
-// Op is an atomic load. Lower it into a normal volatile load.
+// Op is an atomic load. Lower it into a serialization followed
+// by a normal volatile load.
SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
+ SDValue Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
+ MVT::Other, Node->getChain()), 0);
return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
- Node->getChain(), Node->getBasePtr(),
+ Chain, Node->getBasePtr(),
Node->getMemoryVT(), Node->getMemOperand());
}
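The shape of the new lowering, in a rough portable analogue (assumed correspondence only; LLVM emits a SystemZ Serialize machine node rather than a C++ fence):

#include <atomic>

int shared = 0;

int atomic_load_like(const volatile int *p) {
  std::atomic_thread_fence(std::memory_order_seq_cst);  // ~ the serialization
  return *p;                                            // plain volatile load
}

int main() { return atomic_load_like(&shared); }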
@@ -3802,7 +3876,7 @@ namespace {
struct GeneralShuffle {
GeneralShuffle(EVT vt) : VT(vt) {}
void addUndef();
- void add(SDValue, unsigned);
+ bool add(SDValue, unsigned);
SDValue getNode(SelectionDAG &, const SDLoc &);
// The operands of the shuffle.
@@ -3828,8 +3902,10 @@ void GeneralShuffle::addUndef() {
// Add an extra element to the shuffle, taking it from element Elem of Op.
// A null Op indicates a vector input whose value will be calculated later;
// there is at most one such input per shuffle and it always has the same
-// type as the result.
-void GeneralShuffle::add(SDValue Op, unsigned Elem) {
+// type as the result. Aborts and returns false if the source vector elements
+// of an EXTRACT_VECTOR_ELT are narrower than the destination elements; LLVM
+// semantics implicitly extend them, but this case is rare and not optimized.
+bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
// The source vector can have wider elements than the result,
@@ -3837,8 +3913,12 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
// We want the least significant part.
EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
- assert(FromBytesPerElement >= BytesPerElement &&
- "Invalid EXTRACT_VECTOR_ELT");
+
+ // Return false if the source elements are smaller than their destination
+ // elements.
+ if (FromBytesPerElement < BytesPerElement)
+ return false;
+
unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
(FromBytesPerElement - BytesPerElement));
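A worked instance of the byte computation above, big-endian as on SystemZ: taking element 1 of a v4i32 source for a v8i16 result lands on the least significant two bytes of that element:

#include <cassert>

int main() {
  const unsigned VectorBytes = 16;   // SystemZ::VectorBytes
  unsigned BytesPerElement = 2;      // v8i16 destination elements
  unsigned FromBytesPerElement = 4;  // v4i32 source elements
  unsigned Elem = 1;
  unsigned Byte = (Elem * FromBytesPerElement) % VectorBytes +
                  (FromBytesPerElement - BytesPerElement);
  assert(Byte == 6);  // bytes 6..7: the low half of element 1 (big-endian)
}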
@@ -3856,13 +3936,13 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
break;
if (NewByte < 0) {
addUndef();
- return;
+ return true;
}
Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
Byte = unsigned(NewByte) % SystemZ::VectorBytes;
} else if (Op.isUndef()) {
addUndef();
- return;
+ return true;
} else
break;
}
@@ -3879,6 +3959,8 @@ void GeneralShuffle::add(SDValue Op, unsigned Elem) {
unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
for (unsigned I = 0; I < BytesPerElement; ++I)
Bytes.push_back(Base + I);
+
+ return true;
}
// Return SDNodes for the completed shuffle.
@@ -4110,12 +4192,14 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op.getOperand(1).getOpcode() == ISD::Constant) {
unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- GS.add(Op.getOperand(0), Elem);
+ if (!GS.add(Op.getOperand(0), Elem))
+ return SDValue();
FoundOne = true;
} else if (Op.isUndef()) {
GS.addUndef();
} else {
- GS.add(SDValue(), ResidueOps.size());
+ if (!GS.add(SDValue(), ResidueOps.size()))
+ return SDValue();
ResidueOps.push_back(BVN->getOperand(I));
}
}
@@ -4172,12 +4256,20 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD))
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
+ // If all elements are loads, use VLREP/VLEs (below).
+ bool AllLoads = true;
+ for (auto Elem : Elems)
+ if (Elem.getOpcode() != ISD::LOAD || cast<LoadSDNode>(Elem)->isIndexed()) {
+ AllLoads = false;
+ break;
+ }
+
// The best way of building a v2i64 from two i64s is to use VLVGP.
- if (VT == MVT::v2i64)
+ if (VT == MVT::v2i64 && !AllLoads)
return joinDwords(DAG, DL, Elems[0], Elems[1]);
// Use a 64-bit merge high to combine two doubles.
- if (VT == MVT::v2f64)
+ if (VT == MVT::v2f64 && !AllLoads)
return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
// Build v4f32 values directly from the FPRs:
@@ -4187,7 +4279,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// <ABxx> <CDxx>
// V VMRHG
// <ABCD>
- if (VT == MVT::v4f32) {
+ if (VT == MVT::v4f32 && !AllLoads) {
SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
// Avoid unnecessary undefs by reusing the other operand.
@@ -4229,23 +4321,37 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
Result = DAG.getBuildVector(VT, DL, Constants);
} else {
- // Otherwise try to use VLVGP to start the sequence in order to
+ // Otherwise try to use VLREP or VLVGP to start the sequence in order to
// avoid a false dependency on any previous contents of the vector
- // register. This only makes sense if one of the associated elements
- // is defined.
- unsigned I1 = NumElements / 2 - 1;
- unsigned I2 = NumElements - 1;
- bool Def1 = !Elems[I1].isUndef();
- bool Def2 = !Elems[I2].isUndef();
- if (Def1 || Def2) {
- SDValue Elem1 = Elems[Def1 ? I1 : I2];
- SDValue Elem2 = Elems[Def2 ? I2 : I1];
- Result = DAG.getNode(ISD::BITCAST, DL, VT,
- joinDwords(DAG, DL, Elem1, Elem2));
- Done[I1] = true;
- Done[I2] = true;
- } else
- Result = DAG.getUNDEF(VT);
+ // register.
+
+ // Use a VLREP if at least one element is a load.
+ unsigned LoadElIdx = UINT_MAX;
+ for (unsigned I = 0; I < NumElements; ++I)
+ if (Elems[I].getOpcode() == ISD::LOAD &&
+ cast<LoadSDNode>(Elems[I])->isUnindexed()) {
+ LoadElIdx = I;
+ break;
+ }
+ if (LoadElIdx != UINT_MAX) {
+ Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, Elems[LoadElIdx]);
+ Done[LoadElIdx] = true;
+ } else {
+ // Try to use VLVGP.
+ unsigned I1 = NumElements / 2 - 1;
+ unsigned I2 = NumElements - 1;
+ bool Def1 = !Elems[I1].isUndef();
+ bool Def2 = !Elems[I2].isUndef();
+ if (Def1 || Def2) {
+ SDValue Elem1 = Elems[Def1 ? I1 : I2];
+ SDValue Elem2 = Elems[Def2 ? I2 : I1];
+ Result = DAG.getNode(ISD::BITCAST, DL, VT,
+ joinDwords(DAG, DL, Elem1, Elem2));
+ Done[I1] = true;
+ Done[I2] = true;
+ } else
+ Result = DAG.getUNDEF(VT);
+ }
}
// Use VLVGx to insert the other elements.
@@ -4354,9 +4460,9 @@ SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
int Elt = VSN->getMaskElt(I);
if (Elt < 0)
GS.addUndef();
- else
- GS.add(Op.getOperand(unsigned(Elt) / NumElements),
- unsigned(Elt) % NumElements);
+ else if (!GS.add(Op.getOperand(unsigned(Elt) / NumElements),
+ unsigned(Elt) % NumElements))
+ return SDValue();
}
return GS.getNode(DAG, SDLoc(VSN));
}
@@ -4630,11 +4736,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(SELECT_CCMASK);
OPCODE(ADJDYNALLOC);
OPCODE(POPCNT);
- OPCODE(UMUL_LOHI64);
- OPCODE(SDIVREM32);
- OPCODE(SDIVREM64);
- OPCODE(UDIVREM32);
- OPCODE(UDIVREM64);
+ OPCODE(SMUL_LOHI);
+ OPCODE(UMUL_LOHI);
+ OPCODE(SDIVREM);
+ OPCODE(UDIVREM);
OPCODE(MVC);
OPCODE(MVC_LOOP);
OPCODE(NC);
@@ -4649,7 +4754,6 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(STRCMP);
OPCODE(SEARCH_STRING);
OPCODE(IPM);
- OPCODE(SERIALIZE);
OPCODE(MEMBARRIER);
OPCODE(TBEGIN);
OPCODE(TBEGIN_NOFLOAT);
@@ -4722,9 +4826,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
// Return true if VT is a vector whose elements are a whole number of bytes
-// in width.
-static bool canTreatAsByteVector(EVT VT) {
- return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0;
+// in width. Also check for the presence of vector support.
+bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const {
+ if (!Subtarget.hasVector())
+ return false;
+
+ return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple();
}
// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
@@ -4986,6 +5093,10 @@ SDValue SystemZTargetLowering::combineSTORE(
SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
SDNode *N, DAGCombinerInfo &DCI) const {
+
+ if (!Subtarget.hasVector())
+ return SDValue();
+
// Try to simplify a vector extraction.
if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
SDValue Op0 = N->getOperand(0);
@@ -5233,7 +5344,7 @@ static unsigned forceReg(MachineInstr &MI, MachineOperand &Base,
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
- .addOperand(Base)
+ .add(Base)
.addImm(0)
.addReg(0);
return Reg;
@@ -5321,9 +5432,24 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) {
if (Invert)
CCMask ^= CCValid;
+
+ // ISel pattern matching also adds a load memory operand of the same
+ // address, so take special care to find the storing memory operand.
+ MachineMemOperand *MMO = nullptr;
+ for (auto *I : MI.memoperands())
+ if (I->isStore()) {
+ MMO = I;
+ break;
+ }
+
BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
- .addReg(SrcReg).addOperand(Base).addImm(Disp)
- .addImm(CCValid).addImm(CCMask);
+ .addReg(SrcReg)
+ .add(Base)
+ .addImm(Disp)
+ .addImm(CCValid)
+ .addImm(CCMask)
+ .addMemOperand(MMO);
+
MI.eraseFromParent();
return MBB;
}
@@ -5350,7 +5476,10 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
// # fallthrough to JoinMBB
MBB = FalseMBB;
BuildMI(MBB, DL, TII->get(StoreOpcode))
- .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
+ .addReg(SrcReg)
+ .add(Base)
+ .addImm(Disp)
+ .addReg(IndexReg);
MBB->addSuccessor(JoinMBB);
MI.eraseFromParent();
@@ -5415,8 +5544,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
// %OrigVal = L Disp(%Base)
// # fall through to LoopMMB
MBB = StartMBB;
- BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
- .addOperand(Base).addImm(Disp).addReg(0);
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
@@ -5437,8 +5565,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
if (Invert) {
// Perform the operation normally and then invert every bit of the field.
unsigned Tmp = MRI.createVirtualRegister(RC);
- BuildMI(MBB, DL, TII->get(BinOpcode), Tmp)
- .addReg(RotatedOldVal).addOperand(Src2);
+ BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2);
if (BitSize <= 32)
// XILF with the upper BitSize bits set.
BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
@@ -5454,7 +5581,8 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
} else if (BinOpcode)
// A simple binary operation.
BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
- .addReg(RotatedOldVal).addOperand(Src2);
+ .addReg(RotatedOldVal)
+ .add(Src2);
else if (IsSubWord)
// Use RISBG to rotate Src2 into position and use it to replace the
// field in RotatedOldVal.
@@ -5465,7 +5593,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
.addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
- .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ .addReg(OldVal)
+ .addReg(NewVal)
+ .add(Base)
+ .addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
@@ -5533,8 +5664,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
// %OrigVal = L Disp(%Base)
// # fall through to LoopMMB
MBB = StartMBB;
- BuildMI(MBB, DL, TII->get(LOpcode), OrigVal)
- .addOperand(Base).addImm(Disp).addReg(0);
+ BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
@@ -5581,7 +5711,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
.addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
- .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
+ .addReg(OldVal)
+ .addReg(NewVal)
+ .add(Base)
+ .addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
@@ -5642,7 +5775,9 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
// # fall through to LoopMMB
MBB = StartMBB;
BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
- .addOperand(Base).addImm(Disp).addReg(0);
+ .add(Base)
+ .addImm(Disp)
+ .addReg(0);
MBB->addSuccessor(LoopMBB);
// LoopMBB:
@@ -5696,7 +5831,10 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
.addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
- .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
+ .addReg(OldVal)
+ .addReg(StoreVal)
+ .add(Base)
+ .addImm(Disp);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
MBB->addSuccessor(LoopMBB);
@@ -5706,14 +5844,12 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
return DoneMBB;
}
-// Emit an extension from a GR32 or GR64 to a GR128. ClearEven is true
+// Emit an extension from a GR64 to a GR128. ClearEven is true
// if the high register of the GR128 value must be cleared or false if
-// it's "don't care". SubReg is subreg_l32 when extending a GR32
-// and subreg_l64 when extending a GR64.
+// it's "don't care".
MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
MachineBasicBlock *MBB,
- bool ClearEven,
- unsigned SubReg) const {
+ bool ClearEven) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@@ -5736,7 +5872,7 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
In128 = NewIn128;
}
BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
- .addReg(In128).addReg(Src).addImm(SubReg);
+ .addReg(In128).addReg(Src).addImm(SystemZ::subreg_l64);
MI.eraseFromParent();
return MBB;
@@ -5869,7 +6005,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
if (!isUInt<12>(DestDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .addOperand(DestBase)
+ .add(DestBase)
.addImm(DestDisp)
.addReg(0);
DestBase = MachineOperand::CreateReg(Reg, false);
@@ -5878,15 +6014,19 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
if (!isUInt<12>(SrcDisp)) {
unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .addOperand(SrcBase)
+ .add(SrcBase)
.addImm(SrcDisp)
.addReg(0);
SrcBase = MachineOperand::CreateReg(Reg, false);
SrcDisp = 0;
}
BuildMI(*MBB, MI, DL, TII->get(Opcode))
- .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
- .addOperand(SrcBase).addImm(SrcDisp);
+ .add(DestBase)
+ .addImm(DestDisp)
+ .addImm(ThisLength)
+ .add(SrcBase)
+ .addImm(SrcDisp)
+ ->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
DestDisp += ThisLength;
SrcDisp += ThisLength;
Length -= ThisLength;
@@ -6057,6 +6197,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::SelectF32:
case SystemZ::SelectF64:
case SystemZ::SelectF128:
+ case SystemZ::SelectVR128:
return emitSelect(MI, MBB, 0);
case SystemZ::CondStore8Mux:
@@ -6096,12 +6237,10 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::CondStoreF64Inv:
return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
- case SystemZ::AEXT128_64:
- return emitExt128(MI, MBB, false, SystemZ::subreg_l64);
- case SystemZ::ZEXT128_32:
- return emitExt128(MI, MBB, true, SystemZ::subreg_l32);
- case SystemZ::ZEXT128_64:
- return emitExt128(MI, MBB, true, SystemZ::subreg_l64);
+ case SystemZ::AEXT128:
+ return emitExt128(MI, MBB, false);
+ case SystemZ::ZEXT128:
+ return emitExt128(MI, MBB, true);
case SystemZ::ATOMIC_SWAPW:
return emitAtomicLoadBinary(MI, MBB, 0, 0);
@@ -6310,3 +6449,12 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
llvm_unreachable("Unexpected instr type to insert");
}
}
+
+// This is used only by the isel schedulers, and is needed only to prevent
+// the compiler from crashing when the list-ilp scheduler is used.
+const TargetRegisterClass *
+SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
+ if (VT == MVT::Untyped)
+ return &SystemZ::ADDR128BitRegClass;
+ return TargetLowering::getRepRegClassFor(VT);
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 7a21a47..abe8b72 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -86,14 +86,12 @@ enum NodeType : unsigned {
// Count number of bits set in operand 0 per byte.
POPCNT,
- // Wrappers around the ISD opcodes of the same name. The output and
- // first input operands are GR128s. The trailing numbers are the
- // widths of the second operand in bits.
- UMUL_LOHI64,
- SDIVREM32,
- SDIVREM64,
- UDIVREM32,
- UDIVREM64,
+ // Wrappers around the ISD opcodes of the same name. The output is GR128.
+ // Input operands may be GR64 or GR32, depending on the instruction.
+ SMUL_LOHI,
+ UMUL_LOHI,
+ SDIVREM,
+ UDIVREM,
// Use a series of MVCs to copy bytes from one memory location to another.
// The operands are:
@@ -139,9 +137,6 @@ enum NodeType : unsigned {
// Store the CC value in bits 29 and 28 of an integer.
IPM,
- // Perform a serialization operation. (BCR 15,0 or BCR 14,0.)
- SERIALIZE,
-
// Compiler barrier only; generate a no-op.
MEMBARRIER,
@@ -454,7 +449,7 @@ public:
MachineBasicBlock *BB) const override;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
bool allowTruncateForTailCall(Type *, Type *) const override;
- bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -471,8 +466,6 @@ public:
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
- SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
- SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
ISD::NodeType getExtendForAtomicOps() const override {
@@ -487,6 +480,12 @@ private:
const SystemZSubtarget &Subtarget;
// Implement LowerOperation for individual opcodes.
+ SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode,
+ const SDLoc &DL, EVT VT,
+ SDValue CmpOp0, SDValue CmpOp1) const;
+ SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT, ISD::CondCode CC,
+ SDValue CmpOp0, SDValue CmpOp1) const;
SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
@@ -522,7 +521,6 @@ private:
unsigned Opcode) const;
SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
@@ -537,6 +535,7 @@ private:
unsigned UnpackHigh) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
+ bool canTreatAsByteVector(EVT VT) const;
SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
unsigned Index, DAGCombinerInfo &DCI,
bool Force) const;
@@ -567,7 +566,7 @@ private:
unsigned StoreOpcode, unsigned STOCOpcode,
bool Invert) const;
MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB,
- bool ClearEven, unsigned SubReg) const;
+ bool ClearEven) const;
MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI,
MachineBasicBlock *BB,
unsigned BinOpcode, unsigned BitSize,
@@ -589,6 +588,8 @@ private:
MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned Opcode) const;
+
+ const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
};
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrDFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrDFP.td
new file mode 100644
index 0000000..08ab2d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrDFP.td
@@ -0,0 +1,231 @@
+//==- SystemZInstrDFP.td - Floating-point SystemZ instructions -*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The instructions in this file implement SystemZ decimal floating-point
+// arithmetic. These instructions are not currently used for code generation;
+// they are provided for use with the assembler and disassembler only. If LLVM
+// ever supports decimal floating-point types (_Decimal64 etc.), they can
+// also be used for code generation for those types.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and test.
+let Defs = [CC] in {
+ def LTDTR : UnaryRRE<"ltdtr", 0xB3D6, null_frag, FP64, FP64>;
+ def LTXTR : UnaryRRE<"ltxtr", 0xB3DE, null_frag, FP128, FP128>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations. The destination
+// of LDXTR is a 128-bit value, but only the first register of the pair is used.
+def LEDTR : TernaryRRFe<"ledtr", 0xB3D5, FP32, FP64>;
+def LDXTR : TernaryRRFe<"ldxtr", 0xB3DD, FP128, FP128>;
+
+// Extend floating-point values to wider representations.
+def LDETR : BinaryRRFd<"ldetr", 0xB3D4, FP64, FP32>;
+def LXDTR : BinaryRRFd<"lxdtr", 0xB3DC, FP128, FP64>;
+
+// Convert a signed integer value to a floating-point one.
+def CDGTR : UnaryRRE<"cdgtr", 0xB3F1, null_frag, FP64, GR64>;
+def CXGTR : UnaryRRE<"cxgtr", 0xB3F9, null_frag, FP128, GR64>;
+let Predicates = [FeatureFPExtension] in {
+ def CDGTRA : TernaryRRFe<"cdgtra", 0xB3F1, FP64, GR64>;
+ def CXGTRA : TernaryRRFe<"cxgtra", 0xB3F9, FP128, GR64>;
+ def CDFTR : TernaryRRFe<"cdftr", 0xB951, FP64, GR32>;
+ def CXFTR : TernaryRRFe<"cxftr", 0xB959, FP128, GR32>;
+}
+
+// Convert an unsigned integer value to a floating-point one.
+let Predicates = [FeatureFPExtension] in {
+ def CDLGTR : TernaryRRFe<"cdlgtr", 0xB952, FP64, GR64>;
+ def CXLGTR : TernaryRRFe<"cxlgtr", 0xB95A, FP128, GR64>;
+ def CDLFTR : TernaryRRFe<"cdlftr", 0xB953, FP64, GR32>;
+ def CXLFTR : TernaryRRFe<"cxlftr", 0xB95B, FP128, GR32>;
+}
+
+// Convert a floating-point value to a signed integer value.
+let Defs = [CC] in {
+ def CGDTR : BinaryRRFe<"cgdtr", 0xB3E1, GR64, FP64>;
+ def CGXTR : BinaryRRFe<"cgxtr", 0xB3E9, GR64, FP128>;
+ let Predicates = [FeatureFPExtension] in {
+ def CGDTRA : TernaryRRFe<"cgdtra", 0xB3E1, GR64, FP64>;
+ def CGXTRA : TernaryRRFe<"cgxtra", 0xB3E9, GR64, FP128>;
+ def CFDTR : TernaryRRFe<"cfdtr", 0xB941, GR32, FP64>;
+ def CFXTR : TernaryRRFe<"cfxtr", 0xB949, GR32, FP128>;
+ }
+}
+
+// Convert a floating-point value to an unsigned integer value.
+let Defs = [CC] in {
+ let Predicates = [FeatureFPExtension] in {
+ def CLGDTR : TernaryRRFe<"clgdtr", 0xB942, GR64, FP64>;
+ def CLGXTR : TernaryRRFe<"clgxtr", 0xB94A, GR64, FP128>;
+ def CLFDTR : TernaryRRFe<"clfdtr", 0xB943, GR32, FP64>;
+ def CLFXTR : TernaryRRFe<"clfxtr", 0xB94B, GR32, FP128>;
+ }
+}
+
+// Convert a packed value to a floating-point one.
+def CDSTR : UnaryRRE<"cdstr", 0xB3F3, null_frag, FP64, GR64>;
+def CXSTR : UnaryRRE<"cxstr", 0xB3FB, null_frag, FP128, GR128>;
+def CDUTR : UnaryRRE<"cdutr", 0xB3F2, null_frag, FP64, GR64>;
+def CXUTR : UnaryRRE<"cxutr", 0xB3FA, null_frag, FP128, GR128>;
+
+// Convert a floating-point value to a packed value.
+def CSDTR : BinaryRRFd<"csdtr", 0xB3E3, GR64, FP64>;
+def CSXTR : BinaryRRFd<"csxtr", 0xB3EB, GR128, FP128>;
+def CUDTR : UnaryRRE<"cudtr", 0xB3E2, null_frag, GR64, FP64>;
+def CUXTR : UnaryRRE<"cuxtr", 0xB3EA, null_frag, GR128, FP128>;
+
+// Convert from/to memory values in the zoned format.
+let Predicates = [FeatureDFPZonedConversion] in {
+ def CDZT : BinaryRSL<"cdzt", 0xEDAA, FP64>;
+ def CXZT : BinaryRSL<"cxzt", 0xEDAB, FP128>;
+ def CZDT : StoreBinaryRSL<"czdt", 0xEDA8, FP64>;
+ def CZXT : StoreBinaryRSL<"czxt", 0xEDA9, FP128>;
+}
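+// (These operate on decimal digits held in memory in zoned form; as a
+// purely illustrative example, "cdzt %f0, 0(8,%r1), 0" would convert an
+// 8-byte zoned field at 0(%r1) into a DFP64 value in %f0.)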
+
+// Convert from/to memory values in the packed format.
+let Predicates = [FeatureDFPPackedConversion] in {
+ def CDPT : BinaryRSL<"cdpt", 0xEDAE, FP64>;
+ def CXPT : BinaryRSL<"cxpt", 0xEDAF, FP128>;
+ def CPDT : StoreBinaryRSL<"cpdt", 0xEDAC, FP64>;
+ def CPXT : StoreBinaryRSL<"cpxt", 0xEDAD, FP128>;
+}
+
+// Perform floating-point operation.
+let Defs = [CC, R1L, F0Q], Uses = [R0L, F4Q] in
+ def PFPO : SideEffectInherentE<"pfpo", 0x010A>;
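+// (PFPO selects its actual operation via a function code in general
+// register 0 and reports a return code in general register 1, with the
+// floating-point operand and result passed in the register pairs listed
+// above; this is why the instruction has implicit rather than explicit
+// operands.)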
+
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Round to an integer, with the second operand (M3) specifying the rounding
+// mode. M4 can be set to 4 to suppress detection of inexact conditions.
+def FIDTR : TernaryRRFe<"fidtr", 0xB3D7, FP64, FP64>;
+def FIXTR : TernaryRRFe<"fixtr", 0xB3DF, FP128, FP128>;
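+// For example (illustrative only), "fidtr %f1, 5, %f2, 4" rounds %f2 into
+// %f1 using rounding mode 5, with M4 = 4 suppressing inexact detection.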
+
+// Extract biased exponent.
+def EEDTR : UnaryRRE<"eedtr", 0xB3E5, null_frag, FP64, FP64>;
+def EEXTR : UnaryRRE<"eextr", 0xB3ED, null_frag, FP128, FP128>;
+
+// Extract significance.
+def ESDTR : UnaryRRE<"esdtr", 0xB3E7, null_frag, FP64, FP64>;
+def ESXTR : UnaryRRE<"esxtr", 0xB3EF, null_frag, FP128, FP128>;
+
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC] in {
+ let isCommutable = 1 in {
+ def ADTR : BinaryRRFa<"adtr", 0xB3D2, null_frag, FP64, FP64, FP64>;
+ def AXTR : BinaryRRFa<"axtr", 0xB3DA, null_frag, FP128, FP128, FP128>;
+ }
+ let Predicates = [FeatureFPExtension] in {
+ def ADTRA : TernaryRRFa<"adtra", 0xB3D2, FP64, FP64, FP64>;
+ def AXTRA : TernaryRRFa<"axtra", 0xB3DA, FP128, FP128, FP128>;
+ }
+}
+
+// Subtraction.
+let Defs = [CC] in {
+ def SDTR : BinaryRRFa<"sdtr", 0xB3D3, null_frag, FP64, FP64, FP64>;
+ def SXTR : BinaryRRFa<"sxtr", 0xB3DB, null_frag, FP128, FP128, FP128>;
+ let Predicates = [FeatureFPExtension] in {
+ def SDTRA : TernaryRRFa<"sdtra", 0xB3D3, FP64, FP64, FP64>;
+ def SXTRA : TernaryRRFa<"sxtra", 0xB3DB, FP128, FP128, FP128>;
+ }
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+ def MDTR : BinaryRRFa<"mdtr", 0xB3D0, null_frag, FP64, FP64, FP64>;
+ def MXTR : BinaryRRFa<"mxtr", 0xB3D8, null_frag, FP128, FP128, FP128>;
+}
+let Predicates = [FeatureFPExtension] in {
+ def MDTRA : TernaryRRFa<"mdtra", 0xB3D0, FP64, FP64, FP64>;
+ def MXTRA : TernaryRRFa<"mxtra", 0xB3D8, FP128, FP128, FP128>;
+}
+
+// Division.
+def DDTR : BinaryRRFa<"ddtr", 0xB3D1, null_frag, FP64, FP64, FP64>;
+def DXTR : BinaryRRFa<"dxtr", 0xB3D9, null_frag, FP128, FP128, FP128>;
+let Predicates = [FeatureFPExtension] in {
+ def DDTRA : TernaryRRFa<"ddtra", 0xB3D1, FP64, FP64, FP64>;
+ def DXTRA : TernaryRRFa<"dxtra", 0xB3D9, FP128, FP128, FP128>;
+}
+
+// Quantize.
+def QADTR : TernaryRRFb<"qadtr", 0xB3F5, FP64, FP64, FP64>;
+def QAXTR : TernaryRRFb<"qaxtr", 0xB3FD, FP128, FP128, FP128>;
+
+// Reround.
+def RRDTR : TernaryRRFb<"rrdtr", 0xB3F7, FP64, FP64, FP64>;
+def RRXTR : TernaryRRFb<"rrxtr", 0xB3FF, FP128, FP128, FP128>;
+
+// Shift significand left/right.
+def SLDT : BinaryRXF<"sldt", 0xED40, null_frag, FP64, FP64, null_frag, 0>;
+def SLXT : BinaryRXF<"slxt", 0xED48, null_frag, FP128, FP128, null_frag, 0>;
+def SRDT : BinaryRXF<"srdt", 0xED41, null_frag, FP64, FP64, null_frag, 0>;
+def SRXT : BinaryRXF<"srxt", 0xED49, null_frag, FP128, FP128, null_frag, 0>;
+
+// Insert biased exponent.
+def IEDTR : BinaryRRFb<"iedtr", 0xB3F6, null_frag, FP64, FP64, FP64>;
+def IEXTR : BinaryRRFb<"iextr", 0xB3FE, null_frag, FP128, FP128, FP128>;
+
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare.
+let Defs = [CC] in {
+ def CDTR : CompareRRE<"cdtr", 0xB3E4, null_frag, FP64, FP64>;
+ def CXTR : CompareRRE<"cxtr", 0xB3EC, null_frag, FP128, FP128>;
+}
+
+// Compare and signal.
+let Defs = [CC] in {
+ def KDTR : CompareRRE<"kdtr", 0xB3E0, null_frag, FP64, FP64>;
+ def KXTR : CompareRRE<"kxtr", 0xB3E8, null_frag, FP128, FP128>;
+}
+
+// Compare biased exponent.
+let Defs = [CC] in {
+ def CEDTR : CompareRRE<"cedtr", 0xB3F4, null_frag, FP64, FP64>;
+ def CEXTR : CompareRRE<"cextr", 0xB3FC, null_frag, FP128, FP128>;
+}
+
+// Test Data Class.
+let Defs = [CC] in {
+ def TDCET : TestRXE<"tdcet", 0xED50, null_frag, FP32>;
+ def TDCDT : TestRXE<"tdcdt", 0xED54, null_frag, FP64>;
+ def TDCXT : TestRXE<"tdcxt", 0xED58, null_frag, FP128>;
+}
+
+// Test Data Group.
+let Defs = [CC] in {
+ def TDGET : TestRXE<"tdget", 0xED51, null_frag, FP32>;
+ def TDGDT : TestRXE<"tdgdt", 0xED55, null_frag, FP64>;
+ def TDGXT : TestRXE<"tdgxt", 0xED59, null_frag, FP128>;
+}
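+// (For both kinds of test the second operand is formed like an address but
+// is not dereferenced: the computed value itself supplies the mask of
+// classes or groups to test, as described for the "Test" instruction
+// classes elsewhere in this patch.)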
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index bb6d27e..02aeaad 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -12,9 +12,12 @@
//===----------------------------------------------------------------------===//
// C's ?: operator for floating-point operands.
-def SelectF32 : SelectWrapper<FP32>;
-def SelectF64 : SelectWrapper<FP64>;
-def SelectF128 : SelectWrapper<FP128>;
+def SelectF32 : SelectWrapper<f32, FP32>;
+def SelectF64 : SelectWrapper<f64, FP64>;
+let Predicates = [FeatureNoVectorEnhancements1] in
+ def SelectF128 : SelectWrapper<f128, FP128>;
+let Predicates = [FeatureVectorEnhancements1] in
+ def SelectVR128 : SelectWrapper<f128, VR128>;
defm CondStoreF32 : CondStores<FP32, nonvolatile_store,
nonvolatile_load, bdxaddr20only>;
@@ -69,8 +72,9 @@ let Defs = [CC], usesCustomInserter = 1 in {
let Predicates = [FeatureVector] in {
defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>;
defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>;
- defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>;
}
+let Predicates = [FeatureVector, FeatureNoVectorEnhancements1] in
+ defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>;
// Moves between 64-bit integer and floating-point registers.
def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>;
@@ -83,8 +87,12 @@ let isCodeGenOnly = 1 in {
}
// The sign of an FP128 is in the high register.
-def : Pat<(fcopysign FP32:$src1, FP128:$src2),
- (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+let Predicates = [FeatureNoVectorEnhancements1] in
+ def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 FP128:$src2)))),
+ (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+let Predicates = [FeatureVectorEnhancements1] in
+ def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 VR128:$src2)))),
+ (CPSDRsd FP32:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>;
// fcopysign with an FP64 result.
let isCodeGenOnly = 1 in
@@ -92,8 +100,12 @@ let isCodeGenOnly = 1 in
def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>;
// The sign of an FP128 is in the high register.
-def : Pat<(fcopysign FP64:$src1, FP128:$src2),
- (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+let Predicates = [FeatureNoVectorEnhancements1] in
+ def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 FP128:$src2)))),
+ (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+let Predicates = [FeatureVectorEnhancements1] in
+ def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 VR128:$src2)))),
+ (CPSDRdd FP64:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>;
// fcopysign with an FP128 result. Use "upper" as the high half and leave
// the low half as-is.
@@ -101,12 +113,14 @@ class CopySign128<RegisterOperand cls, dag upper>
: Pat<(fcopysign FP128:$src1, cls:$src2),
(INSERT_SUBREG FP128:$src1, upper, subreg_h64)>;
-def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
- FP32:$src2)>;
-def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
- FP64:$src2)>;
-def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
- (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+let Predicates = [FeatureNoVectorEnhancements1] in {
+ def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+ FP32:$src2)>;
+ def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+ FP64:$src2)>;
+ def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+ (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+}
defm LoadStoreF32 : MVCLoadStore<load, f32, MVCSequence, 4>;
defm LoadStoreF64 : MVCLoadStore<load, f64, MVCSequence, 8>;
@@ -121,7 +135,8 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
// For z13 we prefer LDE over LE to avoid partial register dependencies.
- def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
+ let isCodeGenOnly = 1 in
+ def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
// These instructions are split after register allocation, so we don't
// want a custom inserter.
@@ -165,20 +180,32 @@ def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>,
def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>,
Requires<[FeatureFPExtension]>;
-def : Pat<(f32 (fpround FP128:$src)),
- (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>;
-def : Pat<(f64 (fpround FP128:$src)),
- (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
+let Predicates = [FeatureNoVectorEnhancements1] in {
+ def : Pat<(f32 (fpround FP128:$src)),
+ (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>;
+ def : Pat<(f64 (fpround FP128:$src)),
+ (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
+}
// Extend register floating-point values to wider representations.
-def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>;
-def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>;
-def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>;
+def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>;
+def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>;
+def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>;
+let Predicates = [FeatureNoVectorEnhancements1] in {
+ def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>;
+ def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>;
+}
// Extend memory floating-point values to wider representations.
def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>;
-def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>;
-def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>;
+def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>;
+def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>;
+let Predicates = [FeatureNoVectorEnhancements1] in {
+ def : Pat<(f128 (extloadf32 bdxaddr12only:$src)),
+ (LXEB bdxaddr12only:$src)>;
+ def : Pat<(f128 (extloadf64 bdxaddr12only:$src)),
+ (LXDB bdxaddr12only:$src)>;
+}
// Convert a signed integer register value to a floating-point one.
def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>;
@@ -425,30 +452,32 @@ def : Pat<(fmul (f64 (fpextend FP32:$src1)),
// f128 multiplication of two FP64 registers.
def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
-def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
- (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
- FP64:$src1, subreg_h64), FP64:$src2)>;
+let Predicates = [FeatureNoVectorEnhancements1] in
+ def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))),
+ (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
+ FP64:$src1, subreg_h64), FP64:$src2)>;
// f128 multiplication of an FP64 register and an f64 memory.
def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
-def : Pat<(fmul (f128 (fpextend FP64:$src1)),
- (f128 (extloadf64 bdxaddr12only:$addr))),
- (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
- bdxaddr12only:$addr)>;
+let Predicates = [FeatureNoVectorEnhancements1] in
+ def : Pat<(fmul (f128 (fpextend FP64:$src1)),
+ (f128 (extloadf64 bdxaddr12only:$addr))),
+ (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
+ bdxaddr12only:$addr)>;
// Fused multiply-add.
-def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32>;
-def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64>;
+def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>;
+def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64, FP64>;
-def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load, 4>;
-def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load, 8>;
+def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, FP32, load, 4>;
+def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, FP64, load, 8>;
// Fused multiply-subtract.
-def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32>;
-def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64>;
+def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32, FP32>;
+def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64, FP64>;
-def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load, 4>;
-def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load, 8>;
+def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, FP32, load, 4>;
+def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, FP64, load, 8>;
// Division.
def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32, FP32>;
@@ -458,6 +487,12 @@ def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
+// Divide to integer.
+let Defs = [CC] in {
+ def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>;
+ def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>;
+}
+
//===----------------------------------------------------------------------===//
// Comparisons
//===----------------------------------------------------------------------===//
@@ -469,6 +504,13 @@ let Defs = [CC], CCValues = 0xF in {
def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
+
+ def KEBR : CompareRRE<"kebr", 0xB308, null_frag, FP32, FP32>;
+ def KDBR : CompareRRE<"kdbr", 0xB318, null_frag, FP64, FP64>;
+ def KXBR : CompareRRE<"kxbr", 0xB348, null_frag, FP128, FP128>;
+
+ def KEB : CompareRXE<"keb", 0xED08, null_frag, FP32, load, 4>;
+ def KDB : CompareRXE<"kdb", 0xED18, null_frag, FP64, load, 8>;
}
// Test Data Class.
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index c727f48..033a0a8 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -527,6 +527,22 @@ class InstRRFc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{3-0} = R2;
}
+class InstRRFd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<4, outs, ins, asmstr, pattern> {
+ field bits<32> Inst;
+ field bits<32> SoftFail = 0;
+
+ bits<4> R1;
+ bits<4> R2;
+ bits<4> M4;
+
+ let Inst{31-16} = op;
+ let Inst{15-12} = 0;
+ let Inst{11-8} = M4;
+ let Inst{7-4} = R1;
+ let Inst{3-0} = R2;
+}
+
class InstRRFe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<4, outs, ins, asmstr, pattern> {
field bits<32> Inst;
@@ -710,6 +726,37 @@ class InstRSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = RI2;
}
+class InstRSLa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = 0;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstRSLb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<24> BDL2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-16} = BDL2;
+ let Inst{15-12} = R1;
+ let Inst{11-8} = M3;
+ let Inst{7-0} = op{7-0};
+}
+
class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -817,6 +864,37 @@ class InstSSa<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD2;
}
+class InstSSb<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<20> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = BDL2{19-16};
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BDL2{15-0};
+}
+
+class InstSSc<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<20> BDL1;
+ bits<16> BD2;
+ bits<4> I3;
+
+ let Inst{47-40} = op;
+ let Inst{39-36} = BDL1{19-16};
+ let Inst{35-32} = I3;
+ let Inst{31-16} = BDL1{15-0};
+ let Inst{15-0} = BD2;
+}
+
class InstSSd<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -850,6 +928,20 @@ class InstSSe<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-0} = BD4;
}
+class InstSSf<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<16> BD1;
+ bits<24> BDL2;
+
+ let Inst{47-40} = op;
+ let Inst{39-32} = BDL2{23-16};
+ let Inst{31-16} = BD1;
+ let Inst{15-0} = BDL2{15-0};
+}
+
class InstSSE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -999,6 +1091,94 @@ class InstVRIe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{7-0} = op{7-0};
}
+class InstVRIf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<5> V3;
+ bits<8> I4;
+ bits<4> M5;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = V1{3-0};
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-28} = V3{3-0};
+ let Inst{27-24} = 0;
+ let Inst{23-20} = M5;
+ let Inst{19-12} = I4;
+ let Inst{11} = V1{4};
+ let Inst{10} = V2{4};
+ let Inst{9} = V3{4};
+ let Inst{8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstVRIg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<8> I3;
+ bits<8> I4;
+ bits<4> M5;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = V1{3-0};
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-24} = I4;
+ let Inst{23-20} = M5;
+ let Inst{19-12} = I3;
+ let Inst{11} = V1{4};
+ let Inst{10} = V2{4};
+ let Inst{9-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstVRIh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<16> I2;
+ bits<4> I3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = V1{3-0};
+ let Inst{35-32} = 0;
+ let Inst{31-16} = I2;
+ let Inst{15-12} = I3;
+ let Inst{11} = V1{4};
+ let Inst{10-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstVRIi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<4> R2;
+ bits<8> I3;
+ bits<4> M4;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = V1{3-0};
+ let Inst{35-32} = R2;
+ let Inst{31-24} = 0;
+ let Inst{23-20} = M4;
+ let Inst{19-12} = I3;
+ let Inst{11} = V1{4};
+ let Inst{10-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
// Depending on the instruction mnemonic, certain bits may be or-ed into
// the M4 value provided as explicit operand. These are passed as m4or.
class InstVRRa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern,
@@ -1167,6 +1347,67 @@ class InstVRRf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{7-0} = op{7-0};
}
+class InstVRRg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = 0;
+ let Inst{35-32} = V1{3-0};
+ let Inst{31-12} = 0;
+ let Inst{11} = 0;
+ let Inst{10} = V1{4};
+ let Inst{9-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstVRRh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<5> V2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = 0;
+ let Inst{35-32} = V1{3-0};
+ let Inst{31-28} = V2{3-0};
+ let Inst{27-24} = 0;
+ let Inst{23-20} = M3;
+ let Inst{19-12} = 0;
+ let Inst{11} = 0;
+ let Inst{10} = V1{4};
+ let Inst{9} = V2{4};
+ let Inst{8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
+class InstVRRi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<4> R1;
+ bits<5> V2;
+ bits<4> M3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = R1;
+ let Inst{35-32} = V2{3-0};
+ let Inst{31-24} = 0;
+ let Inst{23-20} = M3;
+ let Inst{19-12} = 0;
+ let Inst{11} = 0;
+ let Inst{10} = V2{4};
+ let Inst{9-8} = 0;
+ let Inst{7-0} = op{7-0};
+}
+
class InstVRSa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -1229,6 +1470,25 @@ class InstVRSc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{7-0} = op{7-0};
}
+class InstVRSd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<16> BD2;
+ bits<4> R3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-36} = 0;
+ let Inst{35-32} = R3;
+ let Inst{31-16} = BD2;
+ let Inst{15-12} = V1{3-0};
+ let Inst{11-9} = 0;
+ let Inst{8} = V1{4};
+ let Inst{7-0} = op{7-0};
+}
+
class InstVRV<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
: InstSystemZ<6, outs, ins, asmstr, pattern> {
field bits<48> Inst;
@@ -1266,6 +1526,24 @@ class InstVRX<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{7-0} = op{7-0};
}
+class InstVSI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+ : InstSystemZ<6, outs, ins, asmstr, pattern> {
+ field bits<48> Inst;
+ field bits<48> SoftFail = 0;
+
+ bits<5> V1;
+ bits<16> BD2;
+ bits<8> I3;
+
+ let Inst{47-40} = op{15-8};
+ let Inst{39-32} = I3;
+ let Inst{31-16} = BD2;
+ let Inst{15-12} = V1{3-0};
+ let Inst{11-9} = 0;
+ let Inst{8} = V1{4};
+ let Inst{7-0} = op{7-0};
+}
+
//===----------------------------------------------------------------------===//
// Instruction classes for .insn directives
//===----------------------------------------------------------------------===//
@@ -1567,6 +1845,9 @@ class ICV<string name>
// Inherent:
// One register output operand and no input operands.
//
+// InherentDual:
+// Two register output operands and no input operands.
+//
// StoreInherent:
// One address operand. The instruction stores to the address.
//
@@ -1642,8 +1923,9 @@ class ICV<string name>
// Two input operands and an implicit CC output operand.
//
// Test:
-// Two input operands and an implicit CC output operand. The second
-// input operand is an "address" operand used as a test class mask.
+// One or two input operands and an implicit CC output operand. If
+// present, the second input operand is an "address" operand used as
+// a test class mask.
//
// Ternary:
// One register output operand and three input operands.
@@ -1691,6 +1973,10 @@ class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
let R2 = 0;
}
+class InherentDualRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRRE<opcode, (outs cls:$R1, cls:$R2), (ins),
+ mnemonic#"\t$R1, $R2", []>;
+
class InherentVRIa<string mnemonic, bits<16> opcode, bits<16> value>
: InstVRIa<opcode, (outs VR128:$V1), (ins), mnemonic#"\t$V1", []> {
let I2 = value;
@@ -1714,6 +2000,12 @@ class SideEffectInherentS<string mnemonic, bits<16> opcode,
let BD2 = 0;
}
+class SideEffectInherentRRE<string mnemonic, bits<16> opcode>
+ : InstRRE<opcode, (outs), (ins), mnemonic, []> {
+ let R1 = 0;
+ let R2 = 0;
+}
+
// Allow an optional TLS marker symbol to generate TLS call relocations.
class CallRI<string mnemonic, bits<12> opcode>
: InstRIb<opcode, (outs), (ins GR64:$R1, brtarget16tls:$RI2),
@@ -1804,6 +2096,25 @@ class FixedCondBranchRX<CondVariant V, string mnemonic, bits<8> opcode>
let M1 = V.ccmask;
}
+class CondBranchRXY<string mnemonic, bits<16> opcode>
+ : InstRXYb<opcode, (outs), (ins cond4:$valid, cond4:$M1, bdxaddr20only:$XBD2),
+ !subst("#", "${M1}", mnemonic)#"\t$XBD2", []> {
+ let CCMaskFirst = 1;
+}
+
+class AsmCondBranchRXY<string mnemonic, bits<16> opcode>
+ : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2),
+ mnemonic#"\t$M1, $XBD2", []>;
+
+class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode,
+ SDPatternOperator operator = null_frag>
+ : InstRXYb<opcode, (outs), (ins bdxaddr20only:$XBD2),
+ !subst("#", V.suffix, mnemonic)#"\t$XBD2",
+ [(operator (load bdxaddr20only:$XBD2))]> {
+ let isAsmParserOnly = V.alternate;
+ let M1 = V.ccmask;
+}
+
class CmpBranchRIEa<string mnemonic, bits<16> opcode,
RegisterOperand cls, Immediate imm>
: InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3),
@@ -2084,6 +2395,13 @@ multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
}
}
+class LoadMultipleSSe<string mnemonic, bits<8> opcode, RegisterOperand cls>
+ : InstSSe<opcode, (outs cls:$R1, cls:$R3),
+ (ins bdaddr12only:$BD2, bdaddr12only:$BD4),
+ mnemonic#"\t$R1, $R3, $BD2, $BD4", []> {
+ let mayLoad = 1;
+}
+
class LoadMultipleVRSa<string mnemonic, bits<16> opcode>
: InstVRSa<opcode, (outs VR128:$V1, VR128:$V3), (ins bdaddr12only:$BD2),
mnemonic#"\t$V1, $V3, $BD2", []> {
@@ -2159,6 +2477,24 @@ class StoreLengthVRSb<string mnemonic, bits<16> opcode,
let AccessBytes = bytes;
}
+class StoreLengthVRSd<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, bits<5> bytes>
+ : InstVRSd<opcode, (outs), (ins VR128:$V1, GR32:$R3, bdaddr12only:$BD2),
+ mnemonic#"\t$V1, $R3, $BD2",
+ [(operator VR128:$V1, GR32:$R3, bdaddr12only:$BD2)]> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+class StoreLengthVSI<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, bits<5> bytes>
+ : InstVSI<opcode, (outs), (ins VR128:$V1, bdaddr12only:$BD2, imm32zx8:$I3),
+ mnemonic#"\t$V1, $BD2, $I3",
+ [(operator VR128:$V1, imm32zx8:$I3, bdaddr12only:$BD2)]> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
class StoreMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
AddressingMode mode = bdaddr12only>
: InstRSa<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2),
@@ -2355,6 +2691,23 @@ class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let OpType = "reg";
}
+class UnaryTiedRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src),
+ mnemonic#"\t$R1", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let R2 = 0;
+}
+
+class UnaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let M3 = 0;
+}
+
class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
RegisterOperand cls, Immediate imm>
: InstRIa<opcode, (outs cls:$R1), (ins imm:$I2),
@@ -2570,6 +2923,11 @@ class SideEffectBinaryRX<string mnemonic, bits<8> opcode,
: InstRXa<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
mnemonic##"\t$R1, $XBD2", []>;
+class SideEffectBinaryRXY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls>
+ : InstRXYa<opcode, (outs), (ins cls:$R1, bdxaddr20only:$XBD2),
+ mnemonic##"\t$R1, $XBD2", []>;
+
class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode,
RegisterOperand cls>
: InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
@@ -2580,16 +2938,90 @@ class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode,
let AddedComplexity = 7;
}
+class SideEffectBinaryRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []>;
+
+class SideEffectBinaryRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []> {
+ let R3 = 0;
+ let M4 = 0;
+}
+
+class SideEffectBinaryRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []> {
+ let M3 = 0;
+}
+
class SideEffectBinaryIE<string mnemonic, bits<16> opcode,
Immediate imm1, Immediate imm2>
: InstIE<opcode, (outs), (ins imm1:$I1, imm2:$I2),
mnemonic#"\t$I1, $I2", []>;
+class SideEffectBinarySI<string mnemonic, bits<8> opcode, Operand imm>
+ : InstSI<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+ mnemonic#"\t$BD1, $I2", []>;
+
class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
SDPatternOperator operator, Immediate imm>
: InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
mnemonic#"\t$BD1, $I2", [(operator bdaddr12only:$BD1, imm:$I2)]>;
+class SideEffectBinarySSa<string mnemonic, bits<8> opcode>
+ : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1, bdaddr12only:$BD2),
+ mnemonic##"\t$BDL1, $BD2", []>;
+
+class SideEffectBinarySSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []>;
+
+class SideEffectBinarySSf<string mnemonic, bits<8> opcode>
+ : InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2),
+ mnemonic##"\t$BD1, $BDL2", []>;
+
+class SideEffectBinarySSE<string mnemonic, bits<16> opcode>
+ : InstSSE<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2),
+ mnemonic#"\t$BD1, $BD2", []>;
+
+class SideEffectBinaryMemMemRR<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRR<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls2:$R2), (ins cls1:$R1, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R2 = $R2src";
+ let DisableEncoding = "$R2src";
+}
+
+class SideEffectBinaryMemMemRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+class SideEffectBinaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2), (ins cls1:$R1src, cls2:$R2src),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+ let M3 = 0;
+}
+
class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2>
: InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
@@ -2612,6 +3044,15 @@ class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let DisableEncoding = "$R1src";
}
+class BinaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRD<opcode, (outs cls1:$R1), (ins cls2:$R3, cls2:$R2),
+ mnemonic#"\t$R1, $R3, $R2",
+ [(set cls1:$R1, (operator cls2:$R3, cls2:$R2))]> {
+ let OpKey = mnemonic#cls;
+ let OpType = "reg";
+}
+
class BinaryRRFa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls1, RegisterOperand cls2,
RegisterOperand cls3>
@@ -2654,6 +3095,25 @@ class BinaryRRFb<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M4 = 0;
}
+class BinaryMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2, Immediate imm>
+ : InstRRFc<opcode, (outs cls2:$R2, cls1:$R1), (ins cls1:$R1src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
+multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2> {
+ def "" : BinaryMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : UnaryMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
+class BinaryRRFd<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+ RegisterOperand cls2>
+ : InstRRFd<opcode, (outs cls1:$R1), (ins cls2:$R2, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R2, $M4", []>;
+
class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2),
@@ -2804,6 +3264,13 @@ multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
}
}
+class BinaryRSL<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRSLb<opcode, (outs cls:$R1),
+ (ins bdladdr12onlylen8:$BDL2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $BDL2, $M3", []> {
+ let mayLoad = 1;
+}
+
class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
AddressingMode mode = bdxaddr12only>
@@ -2833,6 +3300,18 @@ class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M3 = 0;
}
+class BinaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2,
+ SDPatternOperator load, bits<5> bytes>
+ : InstRXF<opcode, (outs cls1:$R1), (ins cls2:$R3, bdxaddr12only:$XBD2),
+ mnemonic#"\t$R1, $R3, $XBD2",
+ [(set cls1:$R1, (operator cls2:$R3, (load bdxaddr12only:$XBD2)))]> {
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
AddressingMode mode = bdxaddr20only>
@@ -2937,6 +3416,11 @@ class BinaryVRIeFloatGeneric<string mnemonic, bits<16> opcode>
(ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5),
mnemonic#"\t$V1, $V2, $I3, $M4, $M5", []>;
+class BinaryVRIh<string mnemonic, bits<16> opcode>
+ : InstVRIh<opcode, (outs VR128:$V1),
+ (ins imm32zx16:$I2, imm32zx4:$I3),
+ mnemonic#"\t$V1, $I2, $I3", []>;
+
class BinaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0>
: InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx4:$M5),
@@ -3065,6 +3549,10 @@ class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator,
mnemonic#"\t$V1, $R2, $R3",
[(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>;
+class BinaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $V2, $M3", []>;
+
class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type>
: InstVRSa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, shift12only:$BD2),
@@ -3102,6 +3590,15 @@ class BinaryVRScGeneric<string mnemonic, bits<16> opcode>
(ins VR128:$V3, shift12only:$BD2, imm32zx4: $M4),
mnemonic#"\t$R1, $V3, $BD2, $M4", []>;
+class BinaryVRSd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ bits<5> bytes>
+ : InstVRSd<opcode, (outs VR128:$V1), (ins GR32:$R3, bdaddr12only:$BD2),
+ mnemonic#"\t$V1, $R3, $BD2",
+ [(set VR128:$V1, (operator GR32:$R3, bdaddr12only:$BD2))]> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<5> bytes>
: InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3),
@@ -3112,6 +3609,50 @@ class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let AccessBytes = bytes;
}
+class StoreBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+class StoreBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayStore = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass StoreBinaryRSPair<string mnemonic, bits<8> rsOpcode,
+ bits<16> rsyOpcode, RegisterOperand cls,
+ bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : StoreBinaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : StoreBinaryRSY<mnemonic#"y", rsyOpcode, cls, bytes,
+ bdaddr20pair>;
+ }
+}
+
+class StoreBinaryRSL<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstRSLb<opcode, (outs),
+ (ins cls:$R1, bdladdr12onlylen8:$BDL2, imm32zx4:$M3),
+ mnemonic#"\t$R1, $BDL2, $M3", []> {
+ let mayStore = 1;
+}
+
+class BinaryVSI<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ bits<5> bytes>
+ : InstVSI<opcode, (outs VR128:$V1), (ins bdaddr12only:$BD2, imm32zx8:$I3),
+ mnemonic#"\t$V1, $BD2, $I3",
+ [(set VR128:$V1, (operator imm32zx8:$I3, bdaddr12only:$BD2))]> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
Immediate index>
: InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
@@ -3237,6 +3778,40 @@ multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
}
}
+class CompareRS<string mnemonic, bits<8> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr12only>
+ : InstRSb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+class CompareRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
+ bits<5> bytes, AddressingMode mode = bdaddr20only>
+ : InstRSYb<opcode, (outs), (ins cls:$R1, imm32zx4:$M3, mode:$BD2),
+ mnemonic#"\t$R1, $M3, $BD2", []> {
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+}
+
+multiclass CompareRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
+ RegisterOperand cls, bits<5> bytes> {
+ let DispKey = mnemonic ## #cls in {
+ let DispSize = "12" in
+ def "" : CompareRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
+ let DispSize = "20" in
+ def Y : CompareRSY<mnemonic#"y", rsyOpcode, cls, bytes, bdaddr20pair>;
+ }
+}
+
+class CompareSSb<string mnemonic, bits<8> opcode>
+ : InstSSb<opcode,
+ (outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
+ mnemonic##"\t$BDL1, $BDL2", []> {
+ let isCompare = 1;
+ let mayLoad = 1;
+}
+
class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
SDPatternOperator load, Immediate imm,
AddressingMode mode = bdaddr12only>
@@ -3305,6 +3880,12 @@ class CompareVRRaFloatGeneric<string mnemonic, bits<16> opcode>
let M5 = 0;
}
+class CompareVRRh<string mnemonic, bits<16> opcode>
+ : InstVRRh<opcode, (outs), (ins VR128:$V1, VR128:$V2, imm32zx4:$M3),
+ mnemonic#"\t$V1, $V2, $M3", []> {
+ let isCompare = 1;
+}
+
class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
RegisterOperand cls>
: InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
@@ -3313,29 +3894,112 @@ class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M3 = 0;
}
+class TestRSL<string mnemonic, bits<16> opcode>
+ : InstRSLa<opcode, (outs), (ins bdladdr12onlylen4:$BDL1),
+ mnemonic#"\t$BDL1", []> {
+ let mayLoad = 1;
+}
+
+class TestVRRg<string mnemonic, bits<16> opcode>
+ : InstVRRg<opcode, (outs), (ins VR128:$V1),
+ mnemonic#"\t$V1", []>;
+
+class SideEffectTernarySSc<string mnemonic, bits<8> opcode>
+ : InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1,
+ shift12only:$BD2, imm32zx4:$I3),
+ mnemonic##"\t$BDL1, $BD2, $I3", []>;
+
+class SideEffectTernaryRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3),
+ mnemonic#"\t$R1, $R2, $R3", []> {
+ let M4 = 0;
+}
+
+class SideEffectTernaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3),
+ mnemonic#"\t$R1, $R3, $R2", []> {
+ let M4 = 0;
+}
+
+class SideEffectTernaryMemMemMemRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls2:$R2, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2src, cls3:$R3src),
+ mnemonic#"\t$R1, $R3, $R2", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R2src, $R3src";
+ let M4 = 0;
+}
+
class SideEffectTernaryRRFc<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
Immediate imm>
: InstRRFc<opcode, (outs), (ins cls1:$R1, cls2:$R2, imm:$M3),
mnemonic#"\t$R1, $R2, $M3", []>;
+multiclass SideEffectTernaryRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2> {
+ def "" : SideEffectTernaryRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : SideEffectBinaryRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
+class SideEffectTernaryMemMemRRFc<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ Immediate imm>
+ : InstRRFc<opcode, (outs cls1:$R1, cls2:$R2),
+ (ins cls1:$R1src, cls2:$R2src, imm:$M3),
+ mnemonic#"\t$R1, $R2, $M3", []> {
+ let Constraints = "$R1 = $R1src, $R2 = $R2src";
+ let DisableEncoding = "$R1src, $R2src";
+}
+
+multiclass SideEffectTernaryMemMemRRFcOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2> {
+ def "" : SideEffectTernaryMemMemRRFc<mnemonic, opcode, cls1, cls2, imm32zx4>;
+ def Opt : SideEffectBinaryMemMemRRFc<mnemonic, opcode, cls1, cls2>;
+}
+
class SideEffectTernarySSF<string mnemonic, bits<12> opcode,
RegisterOperand cls>
: InstSSF<opcode, (outs),
(ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3),
mnemonic#"\t$BD1, $BD2, $R3", []>;
+class TernaryRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R2, $R3, $M4", []>;
+
+class TernaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs cls1:$R1, cls3:$R3),
+ (ins cls1:$R1src, cls2:$R2, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R3, $R2, $M4", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+}
+
class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
RegisterOperand cls2>
: InstRRFe<opcode, (outs cls1:$R1),
(ins imm32zx4:$M3, cls2:$R2, imm32zx4:$M4),
mnemonic#"\t$R1, $M3, $R2, $M4", []>;
-class TernaryRRD<string mnemonic, bits<16> opcode,
- SDPatternOperator operator, RegisterOperand cls>
- : InstRRD<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, cls:$R2),
+class TernaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRD<opcode, (outs cls1:$R1), (ins cls2:$R1src, cls2:$R3, cls2:$R2),
mnemonic#"\t$R1, $R3, $R2",
- [(set cls:$R1, (operator cls:$R1src, cls:$R3, cls:$R2))]> {
+ [(set cls1:$R1, (operator cls2:$R1src, cls2:$R3, cls2:$R2))]> {
let OpKey = mnemonic#cls;
let OpType = "reg";
let Constraints = "$R1 = $R1src";
@@ -3376,13 +4040,44 @@ multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
}
}
+class SideEffectTernaryRS<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSa<opcode, (outs),
+ (ins cls1:$R1, cls2:$R3, bdaddr12only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []>;
+
+class SideEffectTernaryRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSYa<opcode, (outs),
+ (ins cls1:$R1, cls2:$R3, bdaddr20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []>;
+
+class SideEffectTernaryMemMemRS<string mnemonic, bits<8> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift12only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
+class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRSYa<opcode, (outs cls1:$R1, cls2:$R3),
+ (ins cls1:$R1src, cls2:$R3src, shift20only:$BD2),
+ mnemonic#"\t$R1, $R3, $BD2", []> {
+ let Constraints = "$R1 = $R1src, $R3 = $R3src";
+ let DisableEncoding = "$R1src, $R3src";
+}
+
class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
- : InstRXF<opcode, (outs cls:$R1),
- (ins cls:$R1src, cls:$R3, bdxaddr12only:$XBD2),
+ RegisterOperand cls1, RegisterOperand cls2,
+ SDPatternOperator load, bits<5> bytes>
+ : InstRXF<opcode, (outs cls1:$R1),
+ (ins cls2:$R1src, cls2:$R3, bdxaddr12only:$XBD2),
mnemonic#"\t$R1, $R3, $XBD2",
- [(set cls:$R1, (operator cls:$R1src, cls:$R3,
- (load bdxaddr12only:$XBD2)))]> {
+ [(set cls1:$R1, (operator cls2:$R1src, cls2:$R3,
+ (load bdxaddr12only:$XBD2)))]> {
let OpKey = mnemonic#"r"#cls;
let OpType = "mem";
let Constraints = "$R1 = $R1src";
@@ -3412,6 +4107,11 @@ class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M5 = type;
}
+class TernaryVRIi<string mnemonic, bits<16> opcode, RegisterOperand cls>
+ : InstVRIi<opcode, (outs VR128:$V1),
+ (ins cls:$R2, imm32zx8:$I3, imm32zx4:$M4),
+ mnemonic#"\t$V1, $R2, $I3, $M4", []>;
+
class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or>
: InstVRRa<opcode, (outs tr1.op:$V1),
@@ -3484,6 +4184,25 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M6 = 0;
}
+class TernaryVRRcFloat<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, TypedReg tr1, TypedReg tr2,
+ bits<4> type = 0, bits<4> m5 = 0>
+ : InstVRRc<opcode, (outs tr1.op:$V1),
+ (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M6),
+ mnemonic#"\t$V1, $V2, $V3, $M6",
+ [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
+ (tr2.vt tr2.op:$V3),
+ imm32zx4:$M6)))]> {
+ let M4 = type;
+ let M5 = m5;
+}
+
+class TernaryVRRcFloatGeneric<string mnemonic, bits<16> opcode>
+ : InstVRRc<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5,
+ imm32zx4:$M6),
+ mnemonic#"\t$V1, $V2, $V3, $M4, $M5, $M6", []>;
+
class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0>
: InstVRRd<opcode, (outs tr1.op:$V1),
@@ -3589,20 +4308,38 @@ class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode>
let DisableEncoding = "$V1src";
}
+class QuaternaryVRIf<string mnemonic, bits<16> opcode>
+ : InstVRIf<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, VR128:$V3,
+ imm32zx8:$I4, imm32zx4:$M5),
+ mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>;
+
+class QuaternaryVRIg<string mnemonic, bits<16> opcode>
+ : InstVRIg<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, imm32zx8:$I3,
+ imm32zx8:$I4, imm32zx4:$M5),
+ mnemonic#"\t$V1, $V2, $I3, $I4, $M5", []>;
+
class QuaternaryVRRd<string mnemonic, bits<16> opcode,
SDPatternOperator operator, TypedReg tr1, TypedReg tr2,
- bits<4> type, SDPatternOperator m6mask, bits<4> m6or>
+ TypedReg tr3, TypedReg tr4, bits<4> type,
+ SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0>
: InstVRRd<opcode, (outs tr1.op:$V1),
- (ins tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, m6mask:$M6),
+ (ins tr2.op:$V2, tr3.op:$V3, tr4.op:$V4, m6mask:$M6),
mnemonic#"\t$V1, $V2, $V3, $V4, $M6",
[(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2),
- (tr2.vt tr2.op:$V3),
- (tr2.vt tr2.op:$V4),
+ (tr3.vt tr3.op:$V3),
+ (tr4.vt tr4.op:$V4),
m6mask:$M6)))],
m6or> {
let M5 = type;
}
+class QuaternaryVRRdGeneric<string mnemonic, bits<16> opcode>
+ : InstVRRd<opcode, (outs VR128:$V1),
+ (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, imm32zx4:$M6),
+ mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>;
+
// Declare a pair of instructions, one which sets CC and one which doesn't.
// The CC-setting form ends with "S" and sets the low bit of M6.
// Also create aliases to make use of M6 operand optional in assembler.
@@ -3611,13 +4348,15 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode,
SDPatternOperator operator_cc,
TypedReg tr1, TypedReg tr2, bits<4> type,
bits<4> modifier = 0> {
- def "" : QuaternaryVRRd<mnemonic, opcode, operator, tr1, tr2, type,
+ def "" : QuaternaryVRRd<mnemonic, opcode, operator,
+ tr1, tr2, tr2, tr2, type,
imm32zx4even, !and (modifier, 14)>;
def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4",
(!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, tr2.op:$V4, 0)>;
let Defs = [CC] in
- def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+ def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc,
+ tr1, tr2, tr2, tr2, type,
imm32zx4even, !add (!and (modifier, 14), 1)>;
def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4",
(!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
@@ -3625,15 +4364,41 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode,
}
multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> {
- def "" : InstVRRd<opcode, (outs VR128:$V1),
- (ins VR128:$V2, VR128:$V3, VR128:$V4,
- imm32zx4:$M5, imm32zx4:$M6),
- mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>;
+ def "" : QuaternaryVRRdGeneric<mnemonic, opcode>;
def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5",
(!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3,
VR128:$V4, imm32zx4:$M5, 0)>;
}
+class SideEffectQuaternaryRRFa<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFa<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R2, $R3, $M4", []>;
+
+multiclass SideEffectQuaternaryRRFaOptOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3> {
+ def "" : SideEffectQuaternaryRRFa<mnemonic, opcode, cls1, cls2, cls3>;
+ def Opt : SideEffectTernaryRRFa<mnemonic, opcode, cls1, cls2, cls3>;
+ def OptOpt : SideEffectBinaryRRFa<mnemonic, opcode, cls1, cls2>;
+}
+
+class SideEffectQuaternaryRRFb<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2,
+ RegisterOperand cls3>
+ : InstRRFb<opcode, (outs), (ins cls1:$R1, cls2:$R2, cls3:$R3, imm32zx4:$M4),
+ mnemonic#"\t$R1, $R3, $R2, $M4", []>;
+
+multiclass SideEffectQuaternaryRRFbOpt<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1,
+ RegisterOperand cls2,
+ RegisterOperand cls3> {
+ def "" : SideEffectQuaternaryRRFb<mnemonic, opcode, cls1, cls2, cls3>;
+ def Opt : SideEffectTernaryRRFb<mnemonic, opcode, cls1, cls2, cls3>;
+}
+
class SideEffectQuaternarySSe<string mnemonic, bits<8> opcode,
RegisterOperand cls>
: InstSSe<opcode, (outs),
@@ -3649,6 +4414,16 @@ class LoadAndOpRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let mayStore = 1;
}
+class CmpSwapRRE<string mnemonic, bits<16> opcode,
+ RegisterOperand cls1, RegisterOperand cls2>
+ : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+ mnemonic#"\t$R1, $R2", []> {
+ let Constraints = "$R1 = $R1src";
+ let DisableEncoding = "$R1src";
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
RegisterOperand cls, AddressingMode mode = bdaddr12only>
: InstRSa<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
@@ -3897,10 +4672,10 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
// Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
// the value of the PSW's 2-bit condition code field.
-class SelectWrapper<RegisterOperand cls>
+class SelectWrapper<ValueType vt, RegisterOperand cls>
: Pseudo<(outs cls:$dst),
(ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc),
- [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2,
+ [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2,
imm32zx4:$valid, imm32zx4:$cc))]> {
let usesCustomInserter = 1;
// Although the instructions used by these nodes do not in themselves
@@ -3981,9 +4756,7 @@ class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
// another instruction to handle the excess.
multiclass MemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
- def "" : InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
- bdaddr12only:$BD2),
- mnemonic##"\t$BDL1, $BD2", []>;
+ def "" : SideEffectBinarySSa<mnemonic, opcode>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
imm64:$length),
@@ -4003,13 +4776,8 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
// the full loop (the main instruction plus the branch on CC==3).
multiclass StringRRE<string mnemonic, bits<16> opcode,
SDPatternOperator operator> {
- def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
- (ins GR64:$R1src, GR64:$R2src),
- mnemonic#"\t$R1, $R2", []> {
- let Uses = [R0L];
- let Constraints = "$R1 = $R1src, $R2 = $R2src";
- let DisableEncoding = "$R1src, $R2src";
- }
+ let Uses = [R0L] in
+ def "" : SideEffectBinaryMemMemRRE<mnemonic, opcode, GR64, GR64>;
let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in
def Loop : Pseudo<(outs GR64:$end),
(ins GR64:$start1, GR64:$start2, GR32:$char),
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrHFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
new file mode 100644
index 0000000..6d5b4b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrHFP.td
@@ -0,0 +1,240 @@
+//==- SystemZInstrHFP.td - Floating-point SystemZ instructions -*- tblgen-*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The instructions in this file implement SystemZ hexadecimal floating-point
+// arithmetic. Since this format is not mapped to any source-language data
+// type, these instructions are not used for code generation, but are provided
+// for use with the assembler and disassembler only.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and test.
+let Defs = [CC] in {
+ def LTER : UnaryRR <"lter", 0x32, null_frag, FP32, FP32>;
+ def LTDR : UnaryRR <"ltdr", 0x22, null_frag, FP64, FP64>;
+ def LTXR : UnaryRRE<"ltxr", 0xB362, null_frag, FP128, FP128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations.
+def LEDR : UnaryRR <"ledr", 0x35, null_frag, FP32, FP64>;
+def LEXR : UnaryRRE<"lexr", 0xB366, null_frag, FP32, FP128>;
+def LDXR : UnaryRR <"ldxr", 0x25, null_frag, FP64, FP128>;
+let isAsmParserOnly = 1 in {
+ def LRER : UnaryRR <"lrer", 0x35, null_frag, FP32, FP64>;
+ def LRDR : UnaryRR <"lrdr", 0x25, null_frag, FP64, FP128>;
+}
+
+// Extend floating-point values to wider representations.
+def LDER : UnaryRRE<"lder", 0xB324, null_frag, FP64, FP32>;
+def LXER : UnaryRRE<"lxer", 0xB326, null_frag, FP128, FP32>;
+def LXDR : UnaryRRE<"lxdr", 0xB325, null_frag, FP128, FP64>;
+
+def LDE : UnaryRXE<"lde", 0xED24, null_frag, FP64, 4>;
+def LXE : UnaryRXE<"lxe", 0xED26, null_frag, FP128, 4>;
+def LXD : UnaryRXE<"lxd", 0xED25, null_frag, FP128, 8>;
+
+// Convert a signed integer register value to a floating-point one.
+def CEFR : UnaryRRE<"cefr", 0xB3B4, null_frag, FP32, GR32>;
+def CDFR : UnaryRRE<"cdfr", 0xB3B5, null_frag, FP64, GR32>;
+def CXFR : UnaryRRE<"cxfr", 0xB3B6, null_frag, FP128, GR32>;
+
+def CEGR : UnaryRRE<"cegr", 0xB3C4, null_frag, FP32, GR64>;
+def CDGR : UnaryRRE<"cdgr", 0xB3C5, null_frag, FP64, GR64>;
+def CXGR : UnaryRRE<"cxgr", 0xB3C6, null_frag, FP128, GR64>;
+
+// Convert a floating-point register value to a signed integer value,
+// with the second operand (modifier M3) specifying the rounding mode.
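+// For example, "cfdr %r2, 5, %f0" converts the long HFP value in %f0 to a
+// 32-bit signed integer in %r2, using the rounding mode selected by M3 = 5.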
+let Defs = [CC] in {
+ def CFER : BinaryRRFe<"cfer", 0xB3B8, GR32, FP32>;
+ def CFDR : BinaryRRFe<"cfdr", 0xB3B9, GR32, FP64>;
+ def CFXR : BinaryRRFe<"cfxr", 0xB3BA, GR32, FP128>;
+
+ def CGER : BinaryRRFe<"cger", 0xB3C8, GR64, FP32>;
+ def CGDR : BinaryRRFe<"cgdr", 0xB3C9, GR64, FP64>;
+ def CGXR : BinaryRRFe<"cgxr", 0xB3CA, GR64, FP128>;
+}
+
+// Convert BFP to HFP.
+let Defs = [CC] in {
+ def THDER : UnaryRRE<"thder", 0xB358, null_frag, FP64, FP32>;
+ def THDR : UnaryRRE<"thdr", 0xB359, null_frag, FP64, FP64>;
+}
+
+// Convert HFP to BFP.
+let Defs = [CC] in {
+ def TBEDR : BinaryRRFe<"tbedr", 0xB350, FP32, FP64>;
+ def TBDR : BinaryRRFe<"tbdr", 0xB351, FP64, FP64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Negation (Load Complement).
+let Defs = [CC] in {
+ def LCER : UnaryRR <"lcer", 0x33, null_frag, FP32, FP32>;
+ def LCDR : UnaryRR <"lcdr", 0x23, null_frag, FP64, FP64>;
+ def LCXR : UnaryRRE<"lcxr", 0xB363, null_frag, FP128, FP128>;
+}
+
+// Absolute value (Load Positive).
+let Defs = [CC] in {
+ def LPER : UnaryRR <"lper", 0x30, null_frag, FP32, FP32>;
+ def LPDR : UnaryRR <"lpdr", 0x20, null_frag, FP64, FP64>;
+ def LPXR : UnaryRRE<"lpxr", 0xB360, null_frag, FP128, FP128>;
+}
+
+// Negative absolute value (Load Negative).
+let Defs = [CC] in {
+ def LNER : UnaryRR <"lner", 0x31, null_frag, FP32, FP32>;
+ def LNDR : UnaryRR <"lndr", 0x21, null_frag, FP64, FP64>;
+ def LNXR : UnaryRRE<"lnxr", 0xB361, null_frag, FP128, FP128>;
+}
+
+// Halve.
+def HER : UnaryRR <"her", 0x34, null_frag, FP32, FP32>;
+def HDR : UnaryRR <"hdr", 0x24, null_frag, FP64, FP64>;
+
+// Square root.
+def SQER : UnaryRRE<"sqer", 0xB245, null_frag, FP32, FP32>;
+def SQDR : UnaryRRE<"sqdr", 0xB244, null_frag, FP64, FP64>;
+def SQXR : UnaryRRE<"sqxr", 0xB336, null_frag, FP128, FP128>;
+
+def SQE : UnaryRXE<"sqe", 0xED34, null_frag, FP32, 4>;
+def SQD : UnaryRXE<"sqd", 0xED35, null_frag, FP64, 8>;
+
+// Round to an integer (rounding towards zero).
+def FIER : UnaryRRE<"fier", 0xB377, null_frag, FP32, FP32>;
+def FIDR : UnaryRRE<"fidr", 0xB37F, null_frag, FP64, FP64>;
+def FIXR : UnaryRRE<"fixr", 0xB367, null_frag, FP128, FP128>;
+
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC] in {
+ let isCommutable = 1 in {
+ def AER : BinaryRR<"aer", 0x3A, null_frag, FP32, FP32>;
+ def ADR : BinaryRR<"adr", 0x2A, null_frag, FP64, FP64>;
+ def AXR : BinaryRR<"axr", 0x36, null_frag, FP128, FP128>;
+ }
+ def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, load, 4>;
+ def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, load, 8>;
+}
+
+// Addition (unnormalized).
+let Defs = [CC] in {
+ let isCommutable = 1 in {
+ def AUR : BinaryRR<"aur", 0x3E, null_frag, FP32, FP32>;
+ def AWR : BinaryRR<"awr", 0x2E, null_frag, FP64, FP64>;
+ }
+ def AU : BinaryRX<"au", 0x7E, null_frag, FP32, load, 4>;
+ def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, load, 8>;
+}
+
+// Subtraction.
+let Defs = [CC] in {
+ def SER : BinaryRR<"ser", 0x3B, null_frag, FP32, FP32>;
+ def SDR : BinaryRR<"sdr", 0x2B, null_frag, FP64, FP64>;
+ def SXR : BinaryRR<"sxr", 0x37, null_frag, FP128, FP128>;
+
+ def SE : BinaryRX<"se", 0x7B, null_frag, FP32, load, 4>;
+ def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, load, 8>;
+}
+
+// Subtraction (unnormalized).
+let Defs = [CC] in {
+ def SUR : BinaryRR<"sur", 0x3F, null_frag, FP32, FP32>;
+ def SWR : BinaryRR<"swr", 0x2F, null_frag, FP64, FP64>;
+
+ def SU : BinaryRX<"su", 0x7F, null_frag, FP32, load, 4>;
+ def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, load, 8>;
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+ def MEER : BinaryRRE<"meer", 0xB337, null_frag, FP32, FP32>;
+ def MDR : BinaryRR <"mdr", 0x2C, null_frag, FP64, FP64>;
+ def MXR : BinaryRR <"mxr", 0x26, null_frag, FP128, FP128>;
+}
+def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, load, 4>;
+def MD : BinaryRX <"md", 0x6C, null_frag, FP64, load, 8>;
+
+// Extending multiplication (f32 x f32 -> f64).
+def MDER : BinaryRR<"mder", 0x3C, null_frag, FP64, FP32>;
+def MDE : BinaryRX<"mde", 0x7C, null_frag, FP64, load, 4>;
+let isAsmParserOnly = 1 in {
+ def MER : BinaryRR<"mer", 0x3C, null_frag, FP64, FP32>;
+ def ME : BinaryRX<"me", 0x7C, null_frag, FP64, load, 4>;
+}
+
+// Extending multiplication (f64 x f64 -> f128).
+def MXDR : BinaryRR<"mxdr", 0x27, null_frag, FP128, FP64>;
+def MXD : BinaryRX<"mxd", 0x67, null_frag, FP128, load, 8>;
+
+// Fused multiply-add.
+def MAER : TernaryRRD<"maer", 0xB32E, null_frag, FP32, FP32>;
+def MADR : TernaryRRD<"madr", 0xB33E, null_frag, FP64, FP64>;
+def MAE : TernaryRXF<"mae", 0xED2E, null_frag, FP32, FP32, load, 4>;
+def MAD : TernaryRXF<"mad", 0xED3E, null_frag, FP64, FP64, load, 8>;
+
+// Fused multiply-subtract.
+def MSER : TernaryRRD<"mser", 0xB32F, null_frag, FP32, FP32>;
+def MSDR : TernaryRRD<"msdr", 0xB33F, null_frag, FP64, FP64>;
+def MSE : TernaryRXF<"mse", 0xED2F, null_frag, FP32, FP32, load, 4>;
+def MSD : TernaryRXF<"msd", 0xED3F, null_frag, FP64, FP64, load, 8>;
+
+// Multiplication (unnormalized).
+def MYR : BinaryRRD<"myr", 0xB33B, null_frag, FP128, FP64>;
+def MYHR : BinaryRRD<"myhr", 0xB33D, null_frag, FP64, FP64>;
+def MYLR : BinaryRRD<"mylr", 0xB339, null_frag, FP64, FP64>;
+def MY : BinaryRXF<"my", 0xED3B, null_frag, FP128, FP64, load, 8>;
+def MYH : BinaryRXF<"myh", 0xED3D, null_frag, FP64, FP64, load, 8>;
+def MYL : BinaryRXF<"myl", 0xED39, null_frag, FP64, FP64, load, 8>;
+
+// Fused multiply-add (unnormalized).
+def MAYR : TernaryRRD<"mayr", 0xB33A, null_frag, FP128, FP64>;
+def MAYHR : TernaryRRD<"mayhr", 0xB33C, null_frag, FP64, FP64>;
+def MAYLR : TernaryRRD<"maylr", 0xB338, null_frag, FP64, FP64>;
+def MAY : TernaryRXF<"may", 0xED3A, null_frag, FP128, FP64, load, 8>;
+def MAYH : TernaryRXF<"mayh", 0xED3C, null_frag, FP64, FP64, load, 8>;
+def MAYL : TernaryRXF<"mayl", 0xED38, null_frag, FP64, FP64, load, 8>;
+
+// Division.
+def DER : BinaryRR <"der", 0x3D, null_frag, FP32, FP32>;
+def DDR : BinaryRR <"ddr", 0x2D, null_frag, FP64, FP64>;
+def DXR : BinaryRRE<"dxr", 0xB22D, null_frag, FP128, FP128>;
+def DE : BinaryRX <"de", 0x7D, null_frag, FP32, load, 4>;
+def DD : BinaryRX <"dd", 0x6D, null_frag, FP64, load, 8>;
+
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
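+// These set CC to 0 (operands equal), 1 (first operand low) or 2 (first
+// operand high); for example, "cdr %f0, %f2" compares two long HFP values.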
+let Defs = [CC] in {
+ def CER : CompareRR <"cer", 0x39, null_frag, FP32, FP32>;
+ def CDR : CompareRR <"cdr", 0x29, null_frag, FP64, FP64>;
+ def CXR : CompareRRE<"cxr", 0xB369, null_frag, FP128, FP128>;
+
+ def CE : CompareRX<"ce", 0x79, null_frag, FP32, load, 4>;
+ def CD : CompareRX<"cd", 0x69, null_frag, FP64, load, 8>;
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 3565d5f..4533f4f 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -12,11 +12,32 @@
//===----------------------------------------------------------------------===//
#include "SystemZInstrInfo.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZ.h"
#include "SystemZInstrBuilder.h"
-#include "SystemZTargetMachine.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -58,20 +79,34 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI);
MBB->insert(MI, EarlierMI);
- // Set up the two 64-bit registers.
+ // Set up the two 64-bit registers and remember super reg and its flags.
MachineOperand &HighRegOp = EarlierMI->getOperand(0);
MachineOperand &LowRegOp = MI->getOperand(0);
+ unsigned Reg128 = LowRegOp.getReg();
+ unsigned Reg128Killed = getKillRegState(LowRegOp.isKill());
+ unsigned Reg128Undef = getUndefRegState(LowRegOp.isUndef());
HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
+ if (MI->mayStore()) {
+ // Add implicit uses of the super register in case one of the subregs is
+ // undefined. We could track liveness and skip storing an undefined
+ // subreg, but this is hopefully rare (discovered with llvm-stress).
+ // If Reg128 was killed, set kill flag on MI.
+ unsigned Reg128UndefImpl = (Reg128Undef | RegState::Implicit);
+ MachineInstrBuilder(MF, EarlierMI).addReg(Reg128, Reg128UndefImpl);
+ MachineInstrBuilder(MF, MI).addReg(Reg128, (Reg128UndefImpl | Reg128Killed));
+ }
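+  // Schematically, a 128-bit store of %r0q becomes two 64-bit stores of
+  // %r0d and %r1d, each carrying an implicit use of %r0q (killed on the
+  // second), so liveness of the full pair is preserved even when one of
+  // the subregs is undef.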
+
// The address in the first (high) instruction is already correct.
// Adjust the offset in the second (low) instruction.
MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
MachineOperand &LowOffsetOp = MI->getOperand(2);
LowOffsetOp.setImm(LowOffsetOp.getImm() + 8);
- // Clear the kill flags for the base and index registers in the first
- // instruction.
+ // Clear the kill flags on the registers in the first instruction.
+ if (EarlierMI->getOperand(0).isReg() && EarlierMI->getOperand(0).isUse())
+ EarlierMI->getOperand(0).setIsKill(false);
EarlierMI->getOperand(1).setIsKill(false);
EarlierMI->getOperand(3).setIsKill(false);
@@ -131,7 +166,8 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr &MI, unsigned LowOpcode,
MI.setDesc(get(LowOpcodeK));
else {
emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(), DestReg, SrcReg,
- SystemZ::LR, 32, MI.getOperand(1).isKill());
+ SystemZ::LR, 32, MI.getOperand(1).isKill(),
+ MI.getOperand(1).isUndef());
MI.setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
MI.getOperand(1).setReg(DestReg);
MI.tieOperands(0, 1);
@@ -185,41 +221,45 @@ void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
// are low registers, otherwise use RISB[LH]G.
void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const {
- emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
- MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
- Size, MI.getOperand(1).isKill());
+ MachineInstrBuilder MIB =
+ emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
+ MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
+ Size, MI.getOperand(1).isKill(), MI.getOperand(1).isUndef());
+
+ // Keep the remaining operands as-is.
+ for (unsigned I = 2; I < MI.getNumOperands(); ++I)
+ MIB.add(MI.getOperand(I));
+
MI.eraseFromParent();
}
void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction &MF = *MBB->getParent();
- const unsigned Reg = MI->getOperand(0).getReg();
+ const unsigned Reg64 = MI->getOperand(0).getReg();
+ const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32);
- // Conveniently, all 4 instructions are cloned from LOAD_STACK_GUARD,
- // so they already have operand 0 set to reg.
+  // EAR can only load into the low 32-bit subregister, so use a shift to
+  // move %a0 into the high half and produce the GR containing %a0 and %a1.
// ear <reg>, %a0
- MachineInstr *Ear1MI = MF.CloneMachineInstr(MI);
- MBB->insert(MI, Ear1MI);
- Ear1MI->setDesc(get(SystemZ::EAR));
- MachineInstrBuilder(MF, Ear1MI).addReg(SystemZ::A0);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32)
+ .addReg(SystemZ::A0)
+ .addReg(Reg64, RegState::ImplicitDefine);
// sllg <reg>, <reg>, 32
- MachineInstr *SllgMI = MF.CloneMachineInstr(MI);
- MBB->insert(MI, SllgMI);
- SllgMI->setDesc(get(SystemZ::SLLG));
- MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::SLLG), Reg64)
+ .addReg(Reg64)
+ .addReg(0)
+ .addImm(32);
// ear <reg>, %a1
- MachineInstr *Ear2MI = MF.CloneMachineInstr(MI);
- MBB->insert(MI, Ear2MI);
- Ear2MI->setDesc(get(SystemZ::EAR));
- MachineInstrBuilder(MF, Ear2MI).addReg(SystemZ::A1);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32)
+ .addReg(SystemZ::A1);
// lg <reg>, 40(<reg>)
MI->setDesc(get(SystemZ::LG));
- MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0);
+ MachineInstrBuilder(MF, MI).addReg(Reg64).addImm(40).addReg(0);
}
// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
@@ -227,11 +267,13 @@ void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
// are low registers, otherwise use RISB[LH]G. Size is the number of bits
// taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
// KillSrc is true if this move is the last use of SrcReg.
-void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, unsigned LowLowOpcode,
- unsigned Size, bool KillSrc) const {
+MachineInstrBuilder
+SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg,
+ unsigned SrcReg, unsigned LowLowOpcode,
+ unsigned Size, bool KillSrc,
+ bool UndefSrc) const {
unsigned Opcode;
bool DestIsHigh = isHighReg(DestReg);
bool SrcIsHigh = isHighReg(SrcReg);
@@ -242,18 +284,16 @@ void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
else if (!DestIsHigh && SrcIsHigh)
Opcode = SystemZ::RISBLH;
else {
- BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- return;
+ return BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc));
}
unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
- BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+ return BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
.addReg(DestReg, RegState::Undef)
- .addReg(SrcReg, getKillRegState(KillSrc))
+ .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc))
.addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
}
-
MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
bool NewMI,
unsigned OpIdx1,
@@ -282,7 +322,6 @@ MachineInstr *SystemZInstrInfo::commuteInstructionImpl(MachineInstr &MI,
}
}
-
// If MI is a simple load or store for a frame object, return the register
// it loads or stores and set FrameIndex to the index of the frame object.
// Return 0 otherwise.
@@ -586,7 +625,6 @@ bool SystemZInstrInfo::optimizeCompareInstr(
removeIPMBasedCompare(Compare, SrcReg, MRI, &RI);
}
-
bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Pred,
unsigned TrueReg, unsigned FalseReg,
@@ -640,6 +678,12 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
else {
Opc = SystemZ::LOCR;
MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass);
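+      // LOCR only operates on low 32-bit registers, so route the operands
+      // through fresh GR32 virtual registers and let the register allocator
+      // insert any moves needed out of high registers.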
+ unsigned TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+ unsigned FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+ BuildMI(MBB, I, DL, get(TargetOpcode::COPY), TReg).addReg(TrueReg);
+ BuildMI(MBB, I, DL, get(TargetOpcode::COPY), FReg).addReg(FalseReg);
+ TrueReg = TReg;
+ FalseReg = FReg;
}
} else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC))
Opc = SystemZ::LOCGR;
@@ -706,7 +750,7 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
return true;
}
-bool SystemZInstrInfo::isPredicable(MachineInstr &MI) const {
+bool SystemZInstrInfo::isPredicable(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
if (Opcode == SystemZ::Return ||
Opcode == SystemZ::Trap ||
@@ -780,10 +824,11 @@ bool SystemZInstrInfo::PredicateInstruction(
MI.RemoveOperand(0);
MI.setDesc(get(SystemZ::CallBRCL));
MachineInstrBuilder(*MI.getParent()->getParent(), MI)
- .addImm(CCValid).addImm(CCMask)
- .addOperand(FirstOp)
- .addRegMask(RegMask)
- .addReg(SystemZ::CC, RegState::Implicit);
+ .addImm(CCValid)
+ .addImm(CCMask)
+ .add(FirstOp)
+ .addRegMask(RegMask)
+ .addReg(SystemZ::CC, RegState::Implicit);
return true;
}
if (Opcode == SystemZ::CallBR) {
@@ -803,17 +848,55 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
- // Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too.
+ // Split 128-bit GPR moves into two 64-bit moves. Add implicit uses of the
+ // super register in case one of the subregs is undefined.
+ // This handles ADDR128 too.
if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64),
RI.getSubReg(SrcReg, SystemZ::subreg_h64), KillSrc);
+ MachineInstrBuilder(*MBB.getParent(), std::prev(MBBI))
+ .addReg(SrcReg, RegState::Implicit);
copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_l64),
RI.getSubReg(SrcReg, SystemZ::subreg_l64), KillSrc);
+ MachineInstrBuilder(*MBB.getParent(), std::prev(MBBI))
+ .addReg(SrcReg, (getKillRegState(KillSrc) | RegState::Implicit));
return;
}
if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
- emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
+ emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc,
+ false);
+ return;
+ }
+
+ // Move 128-bit floating-point values between VR128 and FP128.
+ if (SystemZ::VR128BitRegClass.contains(DestReg) &&
+ SystemZ::FP128BitRegClass.contains(SrcReg)) {
+ unsigned SrcRegHi =
+ RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_h64),
+ SystemZ::subreg_r64, &SystemZ::VR128BitRegClass);
+ unsigned SrcRegLo =
+ RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_l64),
+ SystemZ::subreg_r64, &SystemZ::VR128BitRegClass);
+
+ BuildMI(MBB, MBBI, DL, get(SystemZ::VMRHG), DestReg)
+ .addReg(SrcRegHi, getKillRegState(KillSrc))
+ .addReg(SrcRegLo, getKillRegState(KillSrc));
+ return;
+ }
+ if (SystemZ::FP128BitRegClass.contains(DestReg) &&
+ SystemZ::VR128BitRegClass.contains(SrcReg)) {
+ unsigned DestRegHi =
+ RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_h64),
+ SystemZ::subreg_r64, &SystemZ::VR128BitRegClass);
+ unsigned DestRegLo =
+ RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_l64),
+ SystemZ::subreg_r64, &SystemZ::VR128BitRegClass);
+
+ if (DestRegHi != SrcReg)
+ copyPhysReg(MBB, MBBI, DL, DestRegHi, SrcReg, false);
+ BuildMI(MBB, MBBI, DL, get(SystemZ::VREPG), DestRegLo)
+ .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1);
return;
}
@@ -888,15 +971,19 @@ static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
}
namespace {
+
struct LogicOp {
- LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {}
+ LogicOp() = default;
LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
: RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
explicit operator bool() const { return RegSize; }
- unsigned RegSize, ImmLSB, ImmSize;
+ unsigned RegSize = 0;
+ unsigned ImmLSB = 0;
+ unsigned ImmSize = 0;
};
+
} // end anonymous namespace
static LogicOp interpretAndImmediate(unsigned Opcode) {
@@ -976,12 +1063,12 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress(
MachineInstrBuilder MIB(
*MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(),
/*NoImplicit=*/true));
- MIB.addOperand(Dest);
+ MIB.add(Dest);
// Keep the kill state, but drop the tied flag.
MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
// Keep the remaining operands as-is.
for (unsigned I = 2; I < NumOps; ++I)
- MIB.addOperand(MI.getOperand(I));
+ MIB.add(MI.getOperand(I));
MBB->insert(MI, MIB);
return finishConvertToThreeAddress(&MI, MIB, LV);
}
@@ -1009,7 +1096,7 @@ MachineInstr *SystemZInstrInfo::convertToThreeAddress(
MachineOperand &Src = MI.getOperand(1);
MachineInstrBuilder MIB =
BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpcode))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addReg(Src.getReg(), getKillRegState(Src.isKill()),
Src.getSubReg())
@@ -1040,7 +1127,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit);
++CCUnit;
- assert (!CCUnit.isValid() && "CC only has one reg unit.");
+ assert(!CCUnit.isValid() && "CC only has one reg unit.");
SlotIndex MISlot =
LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
if (!CCLiveRange.liveAt(MISlot)) {
@@ -1063,10 +1150,9 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
return nullptr;
unsigned OpNum = Ops[0];
- assert(Size ==
- MF.getRegInfo()
- .getRegClass(MI.getOperand(OpNum).getReg())
- ->getSize() &&
+ assert(Size * 8 ==
+ TRI->getRegSizeInBits(*MF.getRegInfo()
+ .getRegClass(MI.getOperand(OpNum).getReg())) &&
"Invalid size combination");
if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) && OpNum == 0 &&
@@ -1091,7 +1177,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(StoreOpcode))
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addFrameIndex(FrameIndex)
.addImm(0)
.addReg(0);
@@ -1100,12 +1186,12 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// destination register instead.
if (OpNum == 1) {
unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
- unsigned Dest = MI.getOperand(0).getReg();
return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
- get(LoadOpcode), Dest)
- .addFrameIndex(FrameIndex)
- .addImm(0)
- .addReg(0);
+ get(LoadOpcode))
+ .add(MI.getOperand(0))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addReg(0);
}
}
@@ -1132,7 +1218,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
.addFrameIndex(FrameIndex)
.addImm(0)
.addImm(Size)
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(MI.getOperand(2).getImm())
.addMemOperand(MMO);
}
@@ -1140,7 +1226,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
if (isSimpleBD12Move(&MI, SystemZII::SimpleBDXStore)) {
return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
get(SystemZ::MVC))
- .addOperand(MI.getOperand(1))
+ .add(MI.getOperand(1))
.addImm(MI.getOperand(2).getImm())
.addImm(Size)
.addFrameIndex(FrameIndex)
@@ -1164,7 +1250,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
MI.getDebugLoc(), get(MemOpcode));
for (unsigned I = 0; I < OpNum; ++I)
- MIB.addOperand(MI.getOperand(I));
+ MIB.add(MI.getOperand(I));
MIB.addFrameIndex(FrameIndex).addImm(Offset);
if (MemDesc.TSFlags & SystemZII::HasIndex)
MIB.addReg(0);
@@ -1379,6 +1465,7 @@ SystemZII::Branch
SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case SystemZ::BR:
+ case SystemZ::BI:
case SystemZ::J:
case SystemZ::JG:
return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 794b193..b8be1f5 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -16,16 +16,22 @@
#include "SystemZ.h"
#include "SystemZRegisterInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <cstdint>
#define GET_INSTRINFO_HEADER
#include "SystemZGenInstrInfo.inc"
namespace llvm {
-class SystemZTargetMachine;
+class SystemZSubtarget;
namespace SystemZII {
+
enum {
// See comments in SystemZInstrFormats.td.
SimpleBDXLoad = (1 << 0),
@@ -43,12 +49,15 @@ enum {
CCMaskLast = (1 << 19),
IsLogical = (1 << 20)
};
+
static inline unsigned getAccessSize(unsigned int Flags) {
return (Flags & AccessSizeMask) >> AccessSizeShift;
}
+
static inline unsigned getCCValues(unsigned int Flags) {
return (Flags & CCValuesMask) >> CCValuesShift;
}
+
static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift;
}
@@ -64,6 +73,7 @@ enum {
// @INDNTPOFF
MO_INDNTPOFF = (2 << 0)
};
+
// Classifies a branch.
enum BranchType {
// An instruction that branches on the current value of CC.
@@ -93,6 +103,7 @@ enum BranchType {
// the result is nonzero.
BranchCTG
};
+
// Information about a branch instruction.
struct Branch {
// The type of the branch.
@@ -111,6 +122,7 @@ struct Branch {
const MachineOperand *target)
: Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
};
+
// Kinds of fused compares in compare-and-* instructions. Together with type
// of the converted compare, this identifies the compare-and-*
// instruction.
@@ -127,9 +139,9 @@ enum FusedCompareType {
// Trap
CompareAndTrap
};
+
} // end namespace SystemZII
-class SystemZSubtarget;
class SystemZInstrInfo : public SystemZGenInstrInfo {
const SystemZRegisterInfo RI;
SystemZSubtarget &STI;
@@ -149,9 +161,13 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
unsigned Size) const;
void expandLoadStackGuard(MachineInstr *MI) const;
- void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
- unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+
+ MachineInstrBuilder
+ emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ unsigned LowLowOpcode, unsigned Size, bool KillSrc,
+ bool UndefSrc) const;
+
virtual void anchor();
protected:
@@ -203,7 +219,7 @@ public:
unsigned FalseReg) const override;
bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
MachineRegisterInfo *MRI) const override;
- bool isPredicable(MachineInstr &MI) const override;
+ bool isPredicable(const MachineInstr &MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const override;
@@ -304,6 +320,7 @@ public:
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZINSTRINFO_H
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index d63525f..f64c0d1 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
let hasNoSchedulingInfo = 1 in {
- def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
- [(callseq_start timm:$amt)]>;
+ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+ [(callseq_start timm:$amt1, timm:$amt2)]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -48,6 +48,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
let isIndirectBranch = 1 in {
def BC : CondBranchRX<"b#", 0x47>;
def BCR : CondBranchRR<"b#r", 0x07>;
+ def BIC : CondBranchRXY<"bi#", 0xe347>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
}
}
@@ -58,6 +60,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
let isIndirectBranch = 1 in {
def BCAsm : AsmCondBranchRX<"bc", 0x47>;
def BCRAsm : AsmCondBranchRR<"bcr", 0x07>;
+ def BICAsm : AsmCondBranchRXY<"bic", 0xe347>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
}
// Define AsmParser extended mnemonics for each general condition-code mask
@@ -69,6 +73,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in {
let isIndirectBranch = 1 in {
def BAsm#V : FixedCondBranchRX <CV<V>, "b#", 0x47>;
def BRAsm#V : FixedCondBranchRR <CV<V>, "b#r", 0x07>;
+ def BIAsm#V : FixedCondBranchRXY<CV<V>, "bi#", 0xe347>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
}
}
}
@@ -81,6 +87,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isIndirectBranch = 1 in {
def B : FixedCondBranchRX<CondAlways, "b", 0x47>;
def BR : FixedCondBranchRR<CondAlways, "br", 0x07, brind>;
+ def BI : FixedCondBranchRXY<CondAlways, "bi", 0xe347, brind>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
}
}
@@ -189,18 +197,15 @@ let isBranch = 1, isTerminator = 1 in {
//===----------------------------------------------------------------------===//
// Unconditional trap.
-// FIXME: This trap instruction should be marked as isTerminator, but there is
-// currently a general bug that allows non-terminators to be placed between
-// terminators. Temporarily leave this unmarked until the bug is fixed.
-let isBarrier = 1, hasCtrlDep = 1 in
+let hasCtrlDep = 1 in
def Trap : Alias<4, (outs), (ins), [(trap)]>;
// Conditional trap.
-let isTerminator = 1, hasCtrlDep = 1, Uses = [CC] in
+let hasCtrlDep = 1, Uses = [CC] in
def CondTrap : Alias<4, (outs), (ins cond4:$valid, cond4:$R1), []>;
// Fused compare-and-trap instructions.
-let isTerminator = 1, hasCtrlDep = 1 in {
+let hasCtrlDep = 1 in {
// These patterns work the same way as for compare-and-branch.
defm CRT : CmpBranchRRFcPair<"crt", 0xB972, GR32>;
defm CGRT : CmpBranchRRFcPair<"cgrt", 0xB960, GR64>;
@@ -319,9 +324,9 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in {
// Select instructions
//===----------------------------------------------------------------------===//
-def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
-def Select32 : SelectWrapper<GR32>;
-def Select64 : SelectWrapper<GR64>;
+def Select32Mux : SelectWrapper<i32, GRX32>, Requires<[FeatureHighWord]>;
+def Select32 : SelectWrapper<i32, GR32>;
+def Select64 : SelectWrapper<i64, GR64>;
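+// The explicit value type keeps these select patterns unambiguous for
+// register classes that can hold more than one type (the same wrapper is
+// also instantiated for the floating-point selects).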
// We don't define 32-bit Mux stores if we don't have STOCFH, because the
// low-only STOC should then always be used if possible.
@@ -464,6 +469,11 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>;
// Memory-to-memory moves.
let mayLoad = 1, mayStore = 1 in
defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>;
+ def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>;
+ def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
+}
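+// For example, "mvcl %r2, %r4" copies a variable-length block: the %r2/%r3
+// pair holds the destination address and length, the %r4/%r5 pair the source
+// address, length and pad byte, and CC reports how the lengths compared.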
// String moves.
let mayLoad = 1, mayStore = 1, Defs = [CC] in
@@ -675,6 +685,22 @@ let Predicates = [FeatureLoadAndTrap] in {
def LLGTAT : UnaryRXY<"llgtat", 0xE39C, null_frag, GR64, 4>;
}
+// Extend GR64s to GR128s.
+let usesCustomInserter = 1 in
+ def ZEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+
+//===----------------------------------------------------------------------===//
+// "Any" extensions
+//===----------------------------------------------------------------------===//
+
+// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
+def : Pat<(i64 (anyext GR32:$src)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
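+// No extension code is emitted for the high 32 bits: the GR32 value is
+// simply placed in the low subregister of an otherwise undefined register.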
+
+// Extend GR64s to GR128s.
+let usesCustomInserter = 1 in
+ def AEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
+
//===----------------------------------------------------------------------===//
// Truncations
//===----------------------------------------------------------------------===//
@@ -707,6 +733,10 @@ def : StoreGR64PC<STHRL, aligned_truncstorei16>;
defm : StoreGR64Pair<ST, STY, truncstorei32>;
def : StoreGR64PC<STRL, aligned_truncstorei32>;
+// Store characters under mask -- not (yet) used for codegen.
+defm STCM : StoreBinaryRSPair<"stcm", 0xBE, 0xEB2D, GR32, 0>;
+def STCMH : StoreBinaryRSY<"stcmh", 0xEB2C, GRH32, 0>;
+
//===----------------------------------------------------------------------===//
// Multi-register moves
//===----------------------------------------------------------------------===//
@@ -715,6 +745,7 @@ def : StoreGR64PC<STRL, aligned_truncstorei32>;
defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>;
def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>;
def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>;
+def LMD : LoadMultipleSSe<"lmd", 0xEF, GR64>;
// Multi-register stores.
defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>;
@@ -742,6 +773,10 @@ def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>;
def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>;
def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>;
+// Byte-swapping memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+ def MVCIN : SideEffectBinarySSa<"mvcin", 0xE8>;
+
//===----------------------------------------------------------------------===//
// Load address instructions
//===----------------------------------------------------------------------===//
@@ -816,6 +851,7 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>;
defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
+// Insert characters under mask -- not (yet) used for codegen.
let Defs = [CC] in {
defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>;
def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>;
@@ -871,6 +907,12 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
}
def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
+ // Addition to a high register.
+ def AHHHR : BinaryRRFa<"ahhhr", 0xB9C8, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def AHHLR : BinaryRRFa<"ahhlr", 0xB9D8, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Addition of signed 16-bit immediates.
defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
defm AHI : BinaryRIAndK<"ahi", 0xA7A, 0xECD8, add, GR32, imm32sx16>;
@@ -887,6 +929,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
// Addition of memory.
defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>;
+ def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>;
@@ -905,6 +949,12 @@ let Defs = [CC] in {
}
def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
+ // Addition to a high register.
+ def ALHHHR : BinaryRRFa<"alhhhr", 0xB9CA, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def ALHHLR : BinaryRRFa<"alhhlr", 0xB9DA, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Addition of signed 16-bit immediates.
def ALHSIK : BinaryRIE<"alhsik", 0xECDA, addc, GR32, imm32sx16>,
Requires<[FeatureDistinctOps]>;
@@ -915,10 +965,18 @@ let Defs = [CC] in {
def ALFI : BinaryRIL<"alfi", 0xC2B, addc, GR32, uimm32>;
def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
+ // Addition of signed 32-bit immediates.
+ def ALSIH : BinaryRIL<"alsih", 0xCCA, null_frag, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
+
// Addition of memory.
defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>;
+
+ // Addition to memory.
+ def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>;
+ def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>;
}
defm : ZXB<addc, GR64, ALGFR>;
@@ -933,6 +991,10 @@ let Defs = [CC], Uses = [CC] in {
def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
}
+// Addition that does not modify the condition code.
+def ALSIHN : BinaryRIL<"alsihn", 0xCCB, null_frag, GRH32, simm32>,
+ Requires<[FeatureHighWord]>;
+
//===----------------------------------------------------------------------===//
// Subtraction
//===----------------------------------------------------------------------===//
@@ -945,9 +1007,17 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
defm SGR : BinaryRREAndK<"sgr", 0xB909, 0xB9E9, sub, GR64, GR64>;
+ // Subtraction from a high register.
+ def SHHHR : BinaryRRFa<"shhhr", 0xB9C9, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def SHHLR : BinaryRRFa<"shhlr", 0xB9D9, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Subtraction of memory.
defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
+ def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>;
}
@@ -960,6 +1030,12 @@ let Defs = [CC] in {
def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
defm SLGR : BinaryRREAndK<"slgr", 0xB90B, 0xB9EB, subc, GR64, GR64>;
+ // Subtraction from a high register.
+ def SLHHHR : BinaryRRFa<"slhhhr", 0xB9CB, null_frag, GRH32, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def SLHHLR : BinaryRRFa<"slhhlr", 0xB9DB, null_frag, GRH32, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Subtraction of unsigned 32-bit immediates. These don't match
// subc because we prefer addc for constants.
def SLFI : BinaryRIL<"slfi", 0xC25, null_frag, GR32, uimm32>;
@@ -1143,6 +1219,15 @@ defm : RMWIByte<xor, bdaddr20pair, XIY>;
// Multiplication
//===----------------------------------------------------------------------===//
+// Multiplication of a register, setting the condition code. We prefer these
+// over MS(G)R if available, even though we cannot use the condition code,
+// since they are three-operand instructions.
+let Predicates = [FeatureMiscellaneousExtensions2],
+ Defs = [CC], isCommutable = 1 in {
+ def MSRKC : BinaryRRFa<"msrkc", 0xB9FD, mul, GR32, GR32, GR32>;
+ def MSGRKC : BinaryRRFa<"msgrkc", 0xB9ED, mul, GR64, GR64, GR64>;
+}
+
// Multiplication of a register.
let isCommutable = 1 in {
def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>;
@@ -1162,14 +1247,39 @@ def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
// Multiplication of memory.
defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
+def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
+// Multiplication of memory, setting the condition code.
+let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in {
+ def MSC : BinaryRXY<"msc", 0xE353, null_frag, GR32, load, 4>;
+ def MSGC : BinaryRXY<"msgc", 0xE383, null_frag, GR64, load, 8>;
+}
+
// Multiplication of a register, producing two results.
-def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
+def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>;
+def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
+def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>;
+def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>;
+def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2),
+ (MGRK GR64:$src1, GR64:$src2)>;
+def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2),
+ (MLGR (AEXT128 GR64:$src1), GR64:$src2)>;
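+// MGRK writes the 128-bit product of its two 64-bit register operands
+// directly, while MLGR multiplies the low half of the GR128 pair by the
+// second operand -- hence the AEXT128 wrapper around $src1.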
// Multiplication of memory, producing two results.
-def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
+def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>;
+def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>;
+def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>,
+ Requires<[FeatureMiscellaneousExtensions2]>;
+def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>;
+def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>;
+def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+ (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
+def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+ (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
//===----------------------------------------------------------------------===//
// Division and remainder
@@ -1177,39 +1287,69 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
let hasSideEffects = 1 in { // Do not speculatively execute.
// Division and remainder, from registers.
- def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>;
- def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>;
- def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>;
- def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>;
+ def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>;
+ def DSGFR : BinaryRRE<"dsgfr", 0xB91D, null_frag, GR128, GR32>;
+ def DSGR : BinaryRRE<"dsgr", 0xB90D, null_frag, GR128, GR64>;
+ def DLR : BinaryRRE<"dlr", 0xB997, null_frag, GR128, GR32>;
+ def DLGR : BinaryRRE<"dlgr", 0xB987, null_frag, GR128, GR64>;
// Division and remainder, from memory.
- def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
- def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>;
- def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>;
- def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load, 8>;
-}
+ def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>;
+ def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, load, 4>;
+ def DSG : BinaryRXY<"dsg", 0xE30D, null_frag, GR128, load, 8>;
+ def DL : BinaryRXY<"dl", 0xE397, null_frag, GR128, load, 4>;
+ def DLG : BinaryRXY<"dlg", 0xE387, null_frag, GR128, load, 8>;
+}
+def : Pat<(z_sdivrem GR64:$src1, GR32:$src2),
+ (DSGFR (AEXT128 GR64:$src1), GR32:$src2)>;
+def : Pat<(z_sdivrem GR64:$src1, (i32 (load bdxaddr20only:$src2))),
+ (DSGF (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
+def : Pat<(z_sdivrem GR64:$src1, GR64:$src2),
+ (DSGR (AEXT128 GR64:$src1), GR64:$src2)>;
+def : Pat<(z_sdivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+ (DSG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>;
+
+def : Pat<(z_udivrem GR32:$src1, GR32:$src2),
+ (DLR (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
+ subreg_l32)), GR32:$src2)>;
+def : Pat<(z_udivrem GR32:$src1, (i32 (load bdxaddr20only:$src2))),
+ (DL (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1,
+ subreg_l32)), bdxaddr20only:$src2)>;
+def : Pat<(z_udivrem GR64:$src1, GR64:$src2),
+ (DLGR (ZEXT128 GR64:$src1), GR64:$src2)>;
+def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))),
+ (DLG (ZEXT128 GR64:$src1), bdxaddr20only:$src2)>;
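+// The divide instructions take their dividend from the GR128 pair, so the
+// patterns widen $src1 first: AEXT128 for the signed forms, whose low half
+// is all DSG(F)(R) reads, and ZEXT128 for the unsigned forms, which consume
+// the whole pair (with a subregister insert for 32-bit dividends).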
//===----------------------------------------------------------------------===//
// Shifts
//===----------------------------------------------------------------------===//
-// Shift left.
+// Logical shift left.
let hasSideEffects = 0 in {
defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
- defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>;
+ def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>;
+}
+
+// Arithmetic shift left.
+let Defs = [CC] in {
+ defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>;
+ def SLAG : BinaryRSY<"slag", 0xEB0B, null_frag, GR64>;
+ def SLDA : BinaryRS<"slda", 0x8F, null_frag, GR128>;
}
// Logical shift right.
let hasSideEffects = 0 in {
defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>;
+ def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>;
}
// Arithmetic shift right.
let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>;
+ def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>;
}
// Rotate left.
@@ -1266,6 +1406,12 @@ let Defs = [CC], CCValues = 0xE in {
def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>;
def CGR : CompareRRE<"cgr", 0xB920, z_scmp, GR64, GR64>;
+ // Comparison with a high register.
+ def CHHR : CompareRRE<"chhr", 0xB9CD, null_frag, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def CHLR : CompareRRE<"chlr", 0xB9DD, null_frag, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Comparison with a signed 16-bit immediate. CHIMux expands to CHI or CIH,
// depending on the choice of register.
def CHIMux : CompareRIPseudo<z_scmp, GRX32, imm32sx16>,
@@ -1312,6 +1458,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>;
def CLGR : CompareRRE<"clgr", 0xB921, z_ucmp, GR64, GR64>;
+ // Comparison with a high register.
+ def CLHHR : CompareRRE<"clhhr", 0xB9CF, null_frag, GRH32, GRH32>,
+ Requires<[FeatureHighWord]>;
+ def CLHLR : CompareRRE<"clhlr", 0xB9DF, null_frag, GRH32, GR32>,
+ Requires<[FeatureHighWord]>;
+
// Comparison with an unsigned 32-bit immediate. CLFIMux expands to CLFI
// or CLIH, depending on the choice of register.
def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
@@ -1351,8 +1503,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
defm : ZXB<z_ucmp, GR64, CLGFR>;
// Memory-to-memory comparison.
-let mayLoad = 1, Defs = [CC] in
+let mayLoad = 1, Defs = [CC] in {
defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+ def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
+ def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
+ def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
+}
// String comparison.
let mayLoad = 1, Defs = [CC] in
@@ -1381,6 +1537,12 @@ let Defs = [CC] in {
def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>;
def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>;
+// Compare logical characters under mask -- not (yet) used for codegen.
+let Defs = [CC] in {
+ defm CLM : CompareRSPair<"clm", 0xBD, 0xEB21, GR32, 0>;
+ def CLMH : CompareRSY<"clmh", 0xEB20, GRH32, 0>;
+}
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -1404,7 +1566,7 @@ let Predicates = [FeatureExecutionHint] in {
// A serialization instruction that acts as a barrier for all memory
// accesses, which expands to "bcr 14, 0".
let hasSideEffects = 1 in
-def Serialize : Alias<2, (outs), (ins), [(z_serialize)]>;
+def Serialize : Alias<2, (outs), (ins), []>;
// A pseudo instruction that serves as a compiler barrier.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in
@@ -1581,6 +1743,136 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in {
}
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
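+// TR translates bytes in place. For example, "tr 0(16,%r2), 0(%r3)"
+// replaces each of the 16 bytes at 0(%r2) with the byte it selects from
+// the 256-byte table at 0(%r3).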
+let mayLoad = 1, mayStore = 1 in
+ def TR : SideEffectBinarySSa<"tr", 0xDC>;
+
+let mayLoad = 1, Defs = [CC, R0L, R1D] in {
+ def TRT : SideEffectBinarySSa<"trt", 0xDD>;
+ def TRTR : SideEffectBinarySSa<"trtr", 0xD0>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L] in
+ def TRE : SideEffectBinaryMemMemRRE<"tre", 0xB2A5, GR128, GR64>;
+
+let mayLoad = 1, Uses = [R1D], Defs = [CC] in {
+ defm TRTE : BinaryMemRRFcOpt<"trte", 0xB9BF, GR128, GR64>;
+ defm TRTRE : BinaryMemRRFcOpt<"trtre", 0xB9BD, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ defm TROO : SideEffectTernaryMemMemRRFcOpt<"troo", 0xB993, GR128, GR64>;
+ defm TROT : SideEffectTernaryMemMemRRFcOpt<"trot", 0xB992, GR128, GR64>;
+ defm TRTO : SideEffectTernaryMemMemRRFcOpt<"trto", 0xB991, GR128, GR64>;
+ defm TRTT : SideEffectTernaryMemMemRRFcOpt<"trtt", 0xB990, GR128, GR64>;
+}
+
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ defm CU12 : SideEffectTernaryMemMemRRFcOpt<"cu12", 0xB2A7, GR128, GR128>;
+ defm CU14 : SideEffectTernaryMemMemRRFcOpt<"cu14", 0xB9B0, GR128, GR128>;
+ defm CU21 : SideEffectTernaryMemMemRRFcOpt<"cu21", 0xB2A6, GR128, GR128>;
+ defm CU24 : SideEffectTernaryMemMemRRFcOpt<"cu24", 0xB9B1, GR128, GR128>;
+ def CU41 : SideEffectBinaryMemMemRRE<"cu41", 0xB9B2, GR128, GR128>;
+ def CU42 : SideEffectBinaryMemMemRRE<"cu42", 0xB9B3, GR128, GR128>;
+
+ let isAsmParserOnly = 1 in {
+ defm CUUTF : SideEffectTernaryMemMemRRFcOpt<"cuutf", 0xB2A6, GR128, GR128>;
+ defm CUTFU : SideEffectTernaryMemMemRRFcOpt<"cutfu", 0xB2A7, GR128, GR128>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
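+// In these instructions %r0 implicitly supplies the function code and %r1
+// the parameter-block address, which is what "Uses = [R0L, R1D]" models;
+// for example, "km %r2, %r4" ciphers the block designated by the %r4/%r5
+// pair into the area addressed by %r2.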
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in {
+ def KM : SideEffectBinaryMemMemRRE<"km", 0xB92E, GR128, GR128>;
+ def KMC : SideEffectBinaryMemMemRRE<"kmc", 0xB92F, GR128, GR128>;
+
+ def KIMD : SideEffectBinaryMemRRE<"kimd", 0xB93E, GR64, GR128>;
+ def KLMD : SideEffectBinaryMemRRE<"klmd", 0xB93F, GR64, GR128>;
+ def KMAC : SideEffectBinaryMemRRE<"kmac", 0xB91E, GR64, GR128>;
+
+ let Predicates = [FeatureMessageSecurityAssist4] in {
+ def KMF : SideEffectBinaryMemMemRRE<"kmf", 0xB92A, GR128, GR128>;
+ def KMO : SideEffectBinaryMemMemRRE<"kmo", 0xB92B, GR128, GR128>;
+ def KMCTR : SideEffectTernaryMemMemMemRRFb<"kmctr", 0xB92D,
+ GR128, GR128, GR128>;
+ def PCC : SideEffectInherentRRE<"pcc", 0xB92C>;
+ }
+
+ let Predicates = [FeatureMessageSecurityAssist5] in
+ def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>;
+ let Predicates = [FeatureMessageSecurityAssist7], isAsmParserOnly = 1 in
+ def PRNO : SideEffectBinaryMemMemRRE<"prno", 0xB93C, GR128, GR128>;
+
+ let Predicates = [FeatureMessageSecurityAssist8] in
+ def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929,
+ GR128, GR128, GR128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Guarded storage
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureGuardedStorage] in {
+ def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>;
+ def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>;
+
+ let mayLoad = 1 in
+ def LGSC : SideEffectBinaryRXY<"lgsc", 0xE34D, GR64>;
+ let mayStore = 1 in
+ def STGSC : SideEffectBinaryRXY<"stgsc", 0xE349, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
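+// For example, "cvb %r2, 0(%r3)" converts the 8-byte packed-decimal value
+// at 0(%r3) into a 32-bit signed binary integer in %r2; CVD is the inverse
+// operation.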
+defm CVB : BinaryRXPair<"cvb", 0x4F, 0xE306, null_frag, GR32, load, 4>;
+def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>;
+
+defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>;
+def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>;
+
+let mayLoad = 1, mayStore = 1 in {
+ def MVN : SideEffectBinarySSa<"mvn", 0xD1>;
+ def MVZ : SideEffectBinarySSa<"mvz", 0xD3>;
+ def MVO : SideEffectBinarySSb<"mvo", 0xF1>;
+
+ def PACK : SideEffectBinarySSb<"pack", 0xF2>;
+ def PKA : SideEffectBinarySSf<"pka", 0xE9>;
+ def PKU : SideEffectBinarySSf<"pku", 0xE1>;
+ def UNPK : SideEffectBinarySSb<"unpk", 0xF3>;
+ let Defs = [CC] in {
+ def UNPKA : SideEffectBinarySSa<"unpka", 0xEA>;
+ def UNPKU : SideEffectBinarySSa<"unpku", 0xE2>;
+ }
+}
+
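+// Packed-decimal arithmetic on storage operands. For example,
+// "ap 0(8,%r2), 0(8,%r3)" adds the 8-byte packed-decimal operand at 0(%r3)
+// to the one at 0(%r2) and sets CC from the decimal result.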
+let mayLoad = 1, mayStore = 1 in {
+ let Defs = [CC] in {
+ def AP : SideEffectBinarySSb<"ap", 0xFA>;
+ def SP : SideEffectBinarySSb<"sp", 0xFB>;
+ def ZAP : SideEffectBinarySSb<"zap", 0xF8>;
+ def SRP : SideEffectTernarySSc<"srp", 0xF0>;
+ }
+ def MP : SideEffectBinarySSb<"mp", 0xFC>;
+ def DP : SideEffectBinarySSb<"dp", 0xFD>;
+ let Defs = [CC] in {
+ def ED : SideEffectBinarySSa<"ed", 0xDE>;
+ def EDMK : SideEffectBinarySSa<"edmk", 0xDF>;
+ }
+}
+
+let Defs = [CC] in {
+ def CP : CompareSSb<"cp", 0xF9>;
+ def TP : TestRSL<"tp", 0xEBC0>;
+}
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -1699,39 +1991,32 @@ def : Pat<(ctlz GR64:$src),
let Predicates = [FeaturePopulationCount], Defs = [CC] in
def POPCNT : UnaryRRE<"popcnt", 0xB9E1, z_popcnt, GR64, GR64>;
-// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
-def : Pat<(i64 (anyext GR32:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
-
-// Extend GR32s and GR64s to GR128s.
-let usesCustomInserter = 1 in {
- def AEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
- def ZEXT128_32 : Pseudo<(outs GR128:$dst), (ins GR32:$src), []>;
- def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
-}
-
// Search a block of memory for a character.
let mayLoad = 1, Defs = [CC] in
- defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+ defm SRST : StringRRE<"srst", 0xB25E, z_search_string>;
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+ def SRSTU : SideEffectBinaryMemMemRRE<"srstu", 0xB9BE, GR64, GR64>;
-// Supervisor call.
-let hasSideEffects = 1, isCall = 1, Defs = [CC] in
- def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+// Compare until substring equal.
+let mayLoad = 1, Defs = [CC], Uses = [R0L, R1L] in
+ def CUSE : SideEffectBinaryMemMemRRE<"cuse", 0xB257, GR128, GR128>;
-// Store clock.
-let hasSideEffects = 1, Defs = [CC] in {
- def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
- def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>;
- def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>;
-}
+// Compare and form codeword.
+let mayLoad = 1, Defs = [CC, R1D, R2D, R3D], Uses = [R1D, R2D, R3D] in
+ def CFC : SideEffectAddressS<"cfc", 0xB21A, null_frag>;
-// Store facility list.
-let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
- def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+// Update tree.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R0D, R1D, R2D, R3D, R5D],
+ Uses = [R0D, R1D, R2D, R3D, R4D, R5D] in
+ def UPT : SideEffectInherentE<"upt", 0x0102>;
-// Extract CPU time.
-let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in
- def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+// Checksum.
+let mayLoad = 1, Defs = [CC] in
+ def CKSM : SideEffectBinaryMemMemRRE<"cksm", 0xB241, GR64, GR128>;
+
+// Compression call.
+let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in
+ def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>;
// Execute.
let hasSideEffects = 1 in {
@@ -1739,17 +2024,6 @@ let hasSideEffects = 1 in {
def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>;
}
-// Program return.
-let hasSideEffects = 1, Defs = [CC] in
- def PR : SideEffectInherentE<"pr", 0x0101>;
-
-// Move with key.
-let mayLoad = 1, mayStore = 1, Defs = [CC] in
- def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>;
-
-// Store real address.
-def STRAG : StoreSSE<"strag", 0xE502>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrSystem.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrSystem.td
new file mode 100644
index 0000000..0112ebf
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrSystem.td
@@ -0,0 +1,521 @@
+//==- SystemZInstrSystem.td - SystemZ system instructions -*- tblgen-*-----==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The instructions in this file implement SystemZ system-level instructions.
+// Most of these instructions are privileged or semi-privileged. They are
+// not used for code generation, but are provided for use with the assembler
+// and disassembler only.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Program-Status Word Instructions.
+//===----------------------------------------------------------------------===//
+
+// Extract PSW.
+let hasSideEffects = 1, Uses = [CC] in
+ def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>;
+
+// Load PSW (extended).
+let hasSideEffects = 1, Defs = [CC], mayLoad = 1 in {
+ def LPSW : SideEffectUnaryS<"lpsw", 0x8200, null_frag, 8>;
+ def LPSWE : SideEffectUnaryS<"lpswe", 0xB2B2, null_frag, 16>;
+}
+
+// Insert PSW key.
+let Uses = [R2L], Defs = [R2L] in
+ def IPK : SideEffectInherentS<"ipk", 0xB20B, null_frag>;
+
+// Set PSW key from address.
+let hasSideEffects = 1 in
+ def SPKA : SideEffectAddressS<"spka", 0xB20A, null_frag>;
+
+// Set system mask.
+let hasSideEffects = 1, mayLoad = 1 in
+ def SSM : SideEffectUnaryS<"ssm", 0x8000, null_frag, 1>;
+
+// Store then AND/OR system mask.
+let hasSideEffects = 1 in {
+ def STNSM : StoreSI<"stnsm", 0xAC, null_frag, imm32zx8>;
+ def STOSM : StoreSI<"stosm", 0xAD, null_frag, imm32zx8>;
+}
+
+// Insert address space control.
+let hasSideEffects = 1 in
+ def IAC : InherentRRE<"iac", 0xB224, GR32, null_frag>;
+
+// Set address space control (fast).
+let hasSideEffects = 1 in {
+ def SAC : SideEffectAddressS<"sac", 0xB219, null_frag>;
+ def SACF : SideEffectAddressS<"sacf", 0xB279, null_frag>;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Register Instructions.
+//===----------------------------------------------------------------------===//
+
+// Load control.
+def LCTL : LoadMultipleRS<"lctl", 0xB7, CR64>;
+def LCTLG : LoadMultipleRSY<"lctlg", 0xEB2F, CR64>;
+
+// Store control.
+def STCTL : StoreMultipleRS<"stctl", 0xB6, CR64>;
+def STCTG : StoreMultipleRSY<"stctg", 0xEB25, CR64>;
+
+// Extract primary ASN (and instance).
+let hasSideEffects = 1 in {
+ def EPAR : InherentRRE<"epar", 0xB226, GR32, null_frag>;
+ def EPAIR : InherentRRE<"epair", 0xB99A, GR64, null_frag>;
+}
+
+// Extract secondary ASN (and instance).
+let hasSideEffects = 1 in {
+ def ESAR : InherentRRE<"esar", 0xB227, GR32, null_frag>;
+ def ESAIR : InherentRRE<"esair", 0xB99B, GR64, null_frag>;
+}
+
+// Set secondary ASN (and instance).
+let hasSideEffects = 1 in {
+ def SSAR : SideEffectUnaryRRE<"ssar", 0xB225, GR32, null_frag>;
+ def SSAIR : SideEffectUnaryRRE<"ssair", 0xB99F, GR64, null_frag>;
+}
+
+// Extract and set extended authority.
+let hasSideEffects = 1 in
+ def ESEA : UnaryTiedRRE<"esea", 0xB99D, GR32>;
+
+//===----------------------------------------------------------------------===//
+// Prefix-Register Instructions.
+//===----------------------------------------------------------------------===//
+
+// Set prefix.
+let hasSideEffects = 1 in
+ def SPX : SideEffectUnaryS<"spx", 0xB210, null_frag, 4>;
+
+// Store prefix.
+let hasSideEffects = 1 in
+ def STPX : StoreInherentS<"stpx", 0xB211, null_frag, 4>;
+
+//===----------------------------------------------------------------------===//
+// Storage-Key and Real Memory Instructions.
+//===----------------------------------------------------------------------===//
+
+// Insert storage key extended.
+let hasSideEffects = 1 in
+ def ISKE : BinaryRRE<"iske", 0xB229, null_frag, GR32, GR64>;
+
+// Insert virtual storage key.
+let hasSideEffects = 1 in
+ def IVSK : BinaryRRE<"ivsk", 0xB223, null_frag, GR32, GR64>;
+
+// Set storage key extended.
+let hasSideEffects = 1, Defs = [CC] in
+ defm SSKE : SideEffectTernaryRRFcOpt<"sske", 0xB22B, GR32, GR64>;
+
+// Reset reference bit extended.
+let hasSideEffects = 1, Defs = [CC] in
+ def RRBE : SideEffectBinaryRRE<"rrbe", 0xB22A, GR32, GR64>;
+
+// Reset reference bits multiple.
+let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in
+ def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>;
+
+// Insert reference bits multiple.
+let Predicates = [FeatureInsertReferenceBitsMultiple], hasSideEffects = 1 in
+ def IRBM : UnaryRRE<"irbm", 0xB9AC, null_frag, GR64, GR64>;
+
+// Perform frame management function.
+let hasSideEffects = 1 in
+ def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>;
+
+// Test block.
+let hasSideEffects = 1, mayStore = 1, Uses = [R0D], Defs = [R0D, CC] in
+ def TB : SideEffectBinaryRRE<"tb", 0xB22C, GR64, GR64>;
+
+// Page in / out.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def PGIN : SideEffectBinaryRRE<"pgin", 0xB22E, GR64, GR64>;
+ def PGOUT : SideEffectBinaryRRE<"pgout", 0xB22F, GR64, GR64>;
+}
+
+//===----------------------------------------------------------------------===//
+// Dynamic-Address-Translation Instructions.
+//===----------------------------------------------------------------------===//
+
+// Invalidate page table entry.
+let hasSideEffects = 1 in
+ defm IPTE : SideEffectQuaternaryRRFaOptOpt<"ipte", 0xB221, GR64, GR32, GR32>;
+
+// Invalidate DAT table entry.
+let hasSideEffects = 1 in
+ defm IDTE : SideEffectQuaternaryRRFbOpt<"idte", 0xB98E, GR64, GR64, GR64>;
+
+// Compare and replace DAT table entry.
+let Predicates = [FeatureEnhancedDAT2], hasSideEffects = 1, Defs = [CC] in
+ defm CRDTE : SideEffectQuaternaryRRFbOpt<"crdte", 0xB98F, GR128, GR128, GR64>;
+
+// Purge TLB.
+let hasSideEffects = 1 in
+ def PTLB : SideEffectInherentS<"ptlb", 0xB20D, null_frag>;
+
+// Compare and swap and purge.
+let hasSideEffects = 1, Defs = [CC] in {
+ def CSP : CmpSwapRRE<"csp", 0xB250, GR128, GR64>;
+ def CSPG : CmpSwapRRE<"cspg", 0xB98A, GR128, GR64>;
+}
+
+// Load page-table-entry address.
+let hasSideEffects = 1, Defs = [CC] in
+ def LPTEA : TernaryRRFb<"lptea", 0xB9AA, GR64, GR64, GR64>;
+
+// Load real address.
+let hasSideEffects = 1, Defs = [CC] in {
+ defm LRA : LoadAddressRXPair<"lra", 0xB1, 0xE313, null_frag>;
+ def LRAG : LoadAddressRXY<"lrag", 0xE303, null_frag, laaddr20pair>;
+}
+
+// Store real address.
+def STRAG : StoreSSE<"strag", 0xE502>;
+
+// Load using real address.
+let mayLoad = 1 in {
+ def LURA : UnaryRRE<"lura", 0xB24B, null_frag, GR32, GR64>;
+ def LURAG : UnaryRRE<"lurag", 0xB905, null_frag, GR64, GR64>;
+}
+
+// Store using real address.
+let mayStore = 1 in {
+ def STURA : SideEffectBinaryRRE<"stura", 0xB246, GR32, GR64>;
+ def STURG : SideEffectBinaryRRE<"sturg", 0xB925, GR64, GR64>;
+}
+
+// Test protection.
+let hasSideEffects = 1, Defs = [CC] in
+ def TPROT : SideEffectBinarySSE<"tprot", 0xE501>;
+
+//===----------------------------------------------------------------------===//
+// Memory-move Instructions.
+//===----------------------------------------------------------------------===//
+
+// Move with key.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in
+ def MVCK : MemoryBinarySSd<"mvck", 0xD9, GR64>;
+
+// Move to primary / secondary.
+let mayLoad = 1, mayStore = 1, Defs = [CC] in {
+ def MVCP : MemoryBinarySSd<"mvcp", 0xDA, GR64>;
+ def MVCS : MemoryBinarySSd<"mvcs", 0xDB, GR64>;
+}
+
+// Move with source / destination key.
+let mayLoad = 1, mayStore = 1, Uses = [R0L, R1L] in {
+ def MVCSK : SideEffectBinarySSE<"mvcsk", 0xE50E>;
+ def MVCDK : SideEffectBinarySSE<"mvcdk", 0xE50F>;
+}
+
+// Move with optional specifications.
+let mayLoad = 1, mayStore = 1, Uses = [R0L] in
+ def MVCOS : SideEffectTernarySSF<"mvcos", 0xC80, GR64>;
+
+// Move page.
+let mayLoad = 1, mayStore = 1, Uses = [R0L], Defs = [CC] in
+ def MVPG : SideEffectBinaryRRE<"mvpg", 0xB254, GR64, GR64>;
+
+//===----------------------------------------------------------------------===//
+// Address-Space Instructions.
+//===----------------------------------------------------------------------===//
+
+// Load address space parameters.
+let hasSideEffects = 1, Defs = [CC] in
+ def LASP : SideEffectBinarySSE<"lasp", 0xE500>;
+
+// Purge ALB.
+let hasSideEffects = 1 in
+ def PALB : SideEffectInherentRRE<"palb", 0xB248>;
+
+// Program call.
+let hasSideEffects = 1 in
+ def PC : SideEffectAddressS<"pc", 0xB218, null_frag>;
+
+// Program return.
+let hasSideEffects = 1, Defs = [CC] in
+ def PR : SideEffectInherentE<"pr", 0x0101>;
+
+// Program transfer (with instance).
+let hasSideEffects = 1 in {
+ def PT : SideEffectBinaryRRE<"pt", 0xB228, GR32, GR64>;
+ def PTI : SideEffectBinaryRRE<"pti", 0xB99E, GR64, GR64>;
+}
+
+// Resume program.
+let hasSideEffects = 1, Defs = [CC] in
+ def RP : SideEffectAddressS<"rp", 0xB277, null_frag>;
+
+// Branch in subspace group.
+let hasSideEffects = 1 in
+ def BSG : UnaryRRE<"bsg", 0xB258, null_frag, GR64, GR64>;
+
+// Branch and set authority.
+let hasSideEffects = 1 in
+ def BSA : UnaryRRE<"bsa", 0xB25A, null_frag, GR64, GR64>;
+
+// Test access.
+let Defs = [CC] in
+ def TAR : SideEffectBinaryRRE<"tar", 0xB24C, AR32, GR32>;
+
+//===----------------------------------------------------------------------===//
+// Linkage-Stack Instructions.
+//===----------------------------------------------------------------------===//
+
+// Branch and stack.
+let hasSideEffects = 1 in
+ def BAKR : SideEffectBinaryRRE<"bakr", 0xB240, GR64, GR64>;
+
+// Extract stacked registers.
+let hasSideEffects = 1 in {
+ def EREG : SideEffectBinaryRRE<"ereg", 0xB249, GR32, GR32>;
+ def EREGG : SideEffectBinaryRRE<"eregg", 0xB90E, GR64, GR64>;
+}
+
+// Extract stacked state.
+let hasSideEffects = 1, Defs = [CC] in
+ def ESTA : UnaryRRE<"esta", 0xB24A, null_frag, GR128, GR32>;
+
+// Modify stacked state.
+let hasSideEffects = 1 in
+ def MSTA : SideEffectUnaryRRE<"msta", 0xB247, GR128, null_frag>;
+
+//===----------------------------------------------------------------------===//
+// Time-Related Instructions.
+//===----------------------------------------------------------------------===//
+
+// Perform timing facility function.
+let hasSideEffects = 1, mayLoad = 1, Uses = [R0L, R1D], Defs = [CC] in
+ def PTFF : SideEffectInherentE<"ptff", 0x0104>;
+
+// Set clock.
+let hasSideEffects = 1, Defs = [CC] in
+ def SCK : SideEffectUnaryS<"sck", 0xB204, null_frag, 8>;
+
+// Set clock programmable field.
+let hasSideEffects = 1, Uses = [R0L] in
+ def SCKPF : SideEffectInherentE<"sckpf", 0x0107>;
+
+// Set clock comparator.
+let hasSideEffects = 1 in
+ def SCKC : SideEffectUnaryS<"sckc", 0xB206, null_frag, 8>;
+
+// Set CPU timer.
+let hasSideEffects = 1 in
+ def SPT : SideEffectUnaryS<"spt", 0xB208, null_frag, 8>;
+
+// Store clock (fast / extended).
+let hasSideEffects = 1, Defs = [CC] in {
+ def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>;
+ def STCKF : StoreInherentS<"stckf", 0xB27C, null_frag, 8>;
+ def STCKE : StoreInherentS<"stcke", 0xB278, null_frag, 16>;
+}
+
+// Store clock comparator.
+let hasSideEffects = 1 in
+ def STCKC : StoreInherentS<"stckc", 0xB207, null_frag, 8>;
+
+// Store CPU timer.
+let hasSideEffects = 1 in
+ def STPT : StoreInherentS<"stpt", 0xB209, null_frag, 8>;
+
+//===----------------------------------------------------------------------===//
+// CPU-Related Instructions.
+//===----------------------------------------------------------------------===//
+
+// Store CPU address.
+let hasSideEffects = 1 in
+ def STAP : StoreInherentS<"stap", 0xB212, null_frag, 2>;
+
+// Store CPU ID.
+let hasSideEffects = 1 in
+ def STIDP : StoreInherentS<"stidp", 0xB202, null_frag, 8>;
+
+// Store system information.
+let hasSideEffects = 1, Uses = [R0L, R1L], Defs = [R0L, CC] in
+ def STSI : StoreInherentS<"stsi", 0xB27D, null_frag, 0>;
+
+// Store facility list.
+let hasSideEffects = 1 in
+ def STFL : StoreInherentS<"stfl", 0xB2B1, null_frag, 4>;
+
+// Store facility list extended.
+let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in
+ def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>;
+
+// Extract CPU attribute.
+let hasSideEffects = 1 in
+ def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>;
+
+// Extract CPU time.
+let hasSideEffects = 1, mayLoad = 1, Defs = [R0D, R1D] in
+ def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>;
+
+// Perform topology function.
+let hasSideEffects = 1 in
+ def PTF : UnaryTiedRRE<"ptf", 0xB9A2, GR64>;
+
+// Perform cryptographic key management operation.
+let Predicates = [FeatureMessageSecurityAssist3],
+ hasSideEffects = 1, Uses = [R0L, R1D] in
+ def PCKMO : SideEffectInherentRRE<"pckmo", 0xB928>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions.
+//===----------------------------------------------------------------------===//
+
+// Supervisor call.
+let hasSideEffects = 1, isCall = 1, Defs = [CC] in
+ def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>;
+
+// Monitor call.
+let hasSideEffects = 1, isCall = 1 in
+ def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>;
+
+// Diagnose.
+let hasSideEffects = 1, isCall = 1 in
+ def DIAG : SideEffectTernaryRS<"diag", 0x83, GR32, GR32>;
+
+// Trace.
+let hasSideEffects = 1, mayLoad = 1 in {
+ def TRACE : SideEffectTernaryRS<"trace", 0x99, GR32, GR32>;
+ def TRACG : SideEffectTernaryRSY<"tracg", 0xEB0F, GR64, GR64>;
+}
+
+// Trap.
+let hasSideEffects = 1 in {
+ def TRAP2 : SideEffectInherentE<"trap2", 0x01FF>;
+ def TRAP4 : SideEffectAddressS<"trap4", 0xB2FF, null_frag>;
+}
+
+// Signal processor.
+let hasSideEffects = 1, Defs = [CC] in
+ def SIGP : SideEffectTernaryRS<"sigp", 0xAE, GR64, GR64>;
+
+// Signal adapter.
+let hasSideEffects = 1, Uses = [R0D, R1D, R2D, R3D], Defs = [CC] in
+ def SIGA : SideEffectAddressS<"siga", 0xB274, null_frag>;
+
+// Start interpretive execution.
+let hasSideEffects = 1, Defs = [CC] in
+ def SIE : SideEffectUnaryS<"sie", 0xB214, null_frag, 0>;
+
+//===----------------------------------------------------------------------===//
+// CPU-Measurement Facility Instructions (SA23-2260).
+//===----------------------------------------------------------------------===//
+
+// Load program parameter.
+let hasSideEffects = 1 in
+ def LPP : SideEffectUnaryS<"lpp", 0xB280, null_frag, 8>;
+
+// Extract coprocessor-group address.
+let hasSideEffects = 1, Defs = [CC] in
+ def ECPGA : UnaryRRE<"ecpga", 0xB2ED, null_frag, GR32, GR64>;
+
+// Extract CPU counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def ECCTR : UnaryRRE<"ecctr", 0xB2E4, null_frag, GR64, GR64>;
+
+// Extract peripheral counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def EPCTR : UnaryRRE<"epctr", 0xB2E5, null_frag, GR64, GR64>;
+
+// Load CPU-counter-set controls.
+let hasSideEffects = 1, Defs = [CC] in
+ def LCCTL : SideEffectUnaryS<"lcctl", 0xB284, null_frag, 8>;
+
+// Load peripheral-counter-set controls.
+let hasSideEffects = 1, Defs = [CC] in
+ def LPCTL : SideEffectUnaryS<"lpctl", 0xB285, null_frag, 8>;
+
+// Load sampling controls.
+let hasSideEffects = 1, Defs = [CC] in
+ def LSCTL : SideEffectUnaryS<"lsctl", 0xB287, null_frag, 0>;
+
+// Query sampling information.
+let hasSideEffects = 1 in
+ def QSI : StoreInherentS<"qsi", 0xB286, null_frag, 0>;
+
+// Query counter information.
+let hasSideEffects = 1 in
+ def QCTRI : StoreInherentS<"qctri", 0xB28E, null_frag, 0>;
+
+// Set CPU counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def SCCTR : SideEffectBinaryRRE<"scctr", 0xB2E0, GR64, GR64>;
+
+// Set peripheral counter.
+let hasSideEffects = 1, Defs = [CC] in
+ def SPCTR : SideEffectBinaryRRE<"spctr", 0xB2E1, GR64, GR64>;
+
+//===----------------------------------------------------------------------===//
+// I/O Instructions (Principles of Operation, Chapter 14).
+//===----------------------------------------------------------------------===//
+
+// Clear subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def CSCH : SideEffectInherentS<"csch", 0xB230, null_frag>;
+
+// Halt subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def HSCH : SideEffectInherentS<"hsch", 0xB231, null_frag>;
+
+// Modify subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def MSCH : SideEffectUnaryS<"msch", 0xB232, null_frag, 0>;
+
+// Resume subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def RSCH : SideEffectInherentS<"rsch", 0xB238, null_frag>;
+
+// Start subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def SSCH : SideEffectUnaryS<"ssch", 0xB233, null_frag, 0>;
+
+// Store subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def STSCH : StoreInherentS<"stsch", 0xB234, null_frag, 0>;
+
+// Test subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def TSCH : StoreInherentS<"tsch", 0xB235, null_frag, 0>;
+
+// Cancel subchannel.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def XSCH : SideEffectInherentS<"xsch", 0xB276, null_frag>;
+
+// Reset channel path.
+let hasSideEffects = 1, Uses = [R1L], Defs = [CC] in
+ def RCHP : SideEffectInherentS<"rchp", 0xB23B, null_frag>;
+
+// Set channel monitor.
+let hasSideEffects = 1, mayLoad = 1, Uses = [R1L, R2D] in
+ def SCHM : SideEffectInherentS<"schm", 0xB23C, null_frag>;
+
+// Store channel path status.
+let hasSideEffects = 1 in
+ def STCPS : StoreInherentS<"stcps", 0xB23A, null_frag, 0>;
+
+// Store channel report word.
+let hasSideEffects = 1, Defs = [CC] in
+ def STCRW : StoreInherentS<"stcrw", 0xB239, null_frag, 0>;
+
+// Test pending interruption.
+let hasSideEffects = 1, Defs = [CC] in
+ def TPI : StoreInherentS<"tpi", 0xB236, null_frag, 0>;
+
+// Set address limit.
+let hasSideEffects = 1, Uses = [R1L] in
+ def SAL : SideEffectInherentS<"sal", 0xB237, null_frag>;
+
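The header of the new SystemZInstrSystem.td above notes that these system-level definitions exist only for the assembler and disassembler, not for code generation, so their natural consumer is inline assembly. The following is a hedged illustration, not part of the patch: it assumes an s390x GCC/Clang toolchain, and since STAP is privileged it will only execute in supervisor state.

    #include <cstdint>

    // Store the two-byte CPU address. This compiles with an s390x toolchain,
    // but STAP is privileged and traps if executed in problem state.
    // The "Q" constraint requests a short-displacement memory operand,
    // matching the S instruction format used by the definition above.
    uint16_t store_cpu_address() {
      uint16_t cpu = 0;
      asm volatile("stap %0" : "=Q"(cpu));
      return cpu;
    }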
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 738ea7a..c9a02d9 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -14,7 +14,7 @@
let Predicates = [FeatureVector] in {
// Register move.
def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>;
- def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>;
+ def VLR32 : UnaryAliasVRR<null_frag, v32sb, v32sb>;
def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>;
// Load GR from VR element.
@@ -56,17 +56,28 @@ def : VectorExtractSubreg<v4i32, VLGVF>;
//===----------------------------------------------------------------------===//
let Predicates = [FeatureVector] in {
- // Generate byte mask.
- def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
- def VONE : InherentVRIa<"vone", 0xE744, 0xffff>;
- def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
-
- // Generate mask.
- def VGM : BinaryVRIbGeneric<"vgm", 0xE746>;
- def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>;
- def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>;
- def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>;
- def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>;
+ let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1,
+ isReMaterializable = 1 in {
+
+ // Generate byte mask.
+ def VZERO : InherentVRIa<"vzero", 0xE744, 0>;
+ def VONE : InherentVRIa<"vone", 0xE744, 0xffff>;
+ def VGBM : UnaryVRIa<"vgbm", 0xE744, z_byte_mask, v128b, imm32zx16>;
+
+ // Generate mask.
+ def VGM : BinaryVRIbGeneric<"vgm", 0xE746>;
+ def VGMB : BinaryVRIb<"vgmb", 0xE746, z_rotate_mask, v128b, 0>;
+ def VGMH : BinaryVRIb<"vgmh", 0xE746, z_rotate_mask, v128h, 1>;
+ def VGMF : BinaryVRIb<"vgmf", 0xE746, z_rotate_mask, v128f, 2>;
+ def VGMG : BinaryVRIb<"vgmg", 0xE746, z_rotate_mask, v128g, 3>;
+
+ // Replicate immediate.
+ def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>;
+ def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>;
+ def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>;
+ def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>;
+ def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>;
+ }
// Load element immediate.
//
@@ -86,13 +97,6 @@ let Predicates = [FeatureVector] in {
def VLEIG : TernaryVRIa<"vleig", 0xE742, z_vector_insert,
v128g, v128g, imm64sx16, imm32zx1>;
}
-
- // Replicate immediate.
- def VREPI : UnaryVRIaGeneric<"vrepi", 0xE745, imm32sx16>;
- def VREPIB : UnaryVRIa<"vrepib", 0xE745, z_replicate, v128b, imm32sx16, 0>;
- def VREPIH : UnaryVRIa<"vrepih", 0xE745, z_replicate, v128h, imm32sx16, 1>;
- def VREPIF : UnaryVRIa<"vrepif", 0xE745, z_replicate, v128f, imm32sx16, 2>;
- def VREPIG : UnaryVRIa<"vrepig", 0xE745, z_replicate, v128g, imm32sx16, 3>;
}
//===----------------------------------------------------------------------===//
@@ -137,7 +141,7 @@ let Predicates = [FeatureVector] in {
// LEY and LDY offer full 20-bit displacement fields. It's often better
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
- def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>;
+ def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
// Load logical element and zero.
@@ -150,6 +154,11 @@ let Predicates = [FeatureVector] in {
(VLLEZF bdxaddr12only:$addr)>;
def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)),
(VLLEZG bdxaddr12only:$addr)>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VLLEZLF : UnaryVRX<"vllezlf", 0xE704, z_vllezli32, v128f, 4, 6>;
+ def : Pat<(v4f32 (z_vllezlf32 bdxaddr12only:$addr)),
+ (VLLEZLF bdxaddr12only:$addr)>;
+ }
// Load element.
def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>;
@@ -166,6 +175,13 @@ let Predicates = [FeatureVector] in {
def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>;
}
+let Predicates = [FeatureVectorPackedDecimal] in {
+ // Load rightmost with length. The number of loaded bytes is only known
+ // at run time.
+ def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>;
+ def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>;
+}
+
// Use replicating loads if we're inserting a single element into an
// undefined vector. This avoids a false dependency on the previous
// register contents.
@@ -215,7 +231,7 @@ let Predicates = [FeatureVector] in {
// STEY and STDY offer full 20-bit displacement fields. It's often better
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
- def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>;
+ def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12pair>;
def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>;
// Scatter element.
@@ -223,6 +239,13 @@ let Predicates = [FeatureVector] in {
def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>;
}
+let Predicates = [FeatureVectorPackedDecimal] in {
+ // Store rightmost with length. The number of stored bytes is only known
+ // at run time.
+ def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>;
+ def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>;
+}
+
//===----------------------------------------------------------------------===//
// Selects and permutes
//===----------------------------------------------------------------------===//
@@ -252,6 +275,10 @@ let Predicates = [FeatureVector] in {
// Permute doubleword immediate.
def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>;
+ // Bit Permute.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VBPERM : BinaryVRRc<"vbperm", 0xE785, int_s390_vbperm, v128g, v128b>;
+
// Replicate.
def VREP: BinaryVRIcGeneric<"vrep", 0xE74D>;
def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>;
@@ -420,6 +447,10 @@ let Predicates = [FeatureVector] in {
def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>;
def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>;
+ // Not exclusive or.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>;
+
// Exclusive or.
def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>;
@@ -563,6 +594,17 @@ let Predicates = [FeatureVector] in {
def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>;
def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>;
+ // Multiply sum logical.
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VMSL : QuaternaryVRRdGeneric<"vmsl", 0xE7B8>;
+ def VMSLG : QuaternaryVRRd<"vmslg", 0xE7B8, int_s390_vmslg,
+ v128q, v128g, v128g, v128q, 3>;
+ }
+
+ // Nand.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VNN : BinaryVRRc<"vnn", 0xE76E, null_frag, v128any, v128any>;
+
// Nor.
def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>;
def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>;
@@ -570,9 +612,19 @@ let Predicates = [FeatureVector] in {
// Or.
def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>;
+ // Or with complement.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VOC : BinaryVRRc<"voc", 0xE76F, null_frag, v128any, v128any>;
+
// Population count.
def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>;
def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VPOPCTB : UnaryVRRa<"vpopctb", 0xE750, ctpop, v128b, v128b, 0>;
+ def VPOPCTH : UnaryVRRa<"vpopcth", 0xE750, ctpop, v128h, v128h, 1>;
+ def VPOPCTF : UnaryVRRa<"vpopctf", 0xE750, ctpop, v128f, v128f, 2>;
+ def VPOPCTG : UnaryVRRa<"vpopctg", 0xE750, ctpop, v128g, v128g, 3>;
+ }
// Element rotate left logical (with vector shift amount).
def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>;
@@ -720,6 +772,14 @@ multiclass BitwiseVectorOps<ValueType type> {
(VNO VR128:$x, VR128:$y)>;
def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>;
}
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def : Pat<(type (z_vnot (xor VR128:$x, VR128:$y))),
+ (VNX VR128:$x, VR128:$y)>;
+ def : Pat<(type (z_vnot (and VR128:$x, VR128:$y))),
+ (VNN VR128:$x, VR128:$y)>;
+ def : Pat<(type (or VR128:$x, (z_vnot VR128:$y))),
+ (VOC VR128:$x, VR128:$y)>;
+ }
}
defm : BitwiseVectorOps<v16i8>;
@@ -875,6 +935,11 @@ let Predicates = [FeatureVector] in {
def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>;
def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>;
+ def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>;
+ def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>;
+ }
// Convert from fixed 64-bit.
def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>;
@@ -906,6 +971,11 @@ let Predicates = [FeatureVector] in {
def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>;
def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>;
def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>;
+ def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>;
+ def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>;
+ }
// Load FP integer.
def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>;
@@ -913,66 +983,213 @@ let Predicates = [FeatureVector] in {
def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>;
defm : VectorRounding<VFIDB, v128db>;
defm : VectorRounding<WFIDB, v64db>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>;
+ def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>;
+ def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>;
+ defm : VectorRounding<VFISB, v128sb>;
+ defm : VectorRounding<WFISB, v32sb>;
+ defm : VectorRounding<WFIXB, v128xb>;
+ }
// Load lengthened.
def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>;
- def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>;
- def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>;
+ def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>;
+ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ let isAsmParserOnly = 1 in {
+ def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>;
+ def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>;
+ def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>;
+ }
+ def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>;
+ def : Pat<(f128 (fpextend (f32 VR32:$src))),
+ (WFLLD (WLDEB VR32:$src))>;
+ }
- // Load rounded,
+ // Load rounded.
def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>;
- def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>;
- def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>;
+ def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
+ def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>;
- def : FPConversion<WLEDB, fpround, v32eb, v64db, 0, 0>;
+ def : FPConversion<WLEDB, fpround, v32sb, v64db, 0, 0>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ let isAsmParserOnly = 1 in {
+ def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>;
+ def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>;
+ def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>;
+ }
+ def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>;
+ def : FPConversion<WFLRX, fpround, v64db, v128xb, 0, 0>;
+ def : Pat<(f32 (fpround (f128 VR128:$src))),
+ (WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>;
+ }
+
+ // Maximum.
+ multiclass VectorMax<Instruction insn, TypedReg tr> {
+ def : FPMinMax<insn, fmaxnum, tr, 4>;
+ def : FPMinMax<insn, fmaxnan, tr, 1>;
+ }
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>;
+ def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb,
+ v128db, v128db, 3, 0>;
+ def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag,
+ v64db, v64db, 3, 8>;
+ def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb,
+ v128sb, v128sb, 2, 0>;
+ def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag,
+ v32sb, v32sb, 2, 8>;
+ def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag,
+ v128xb, v128xb, 4, 8>;
+ defm : VectorMax<VFMAXDB, v128db>;
+ defm : VectorMax<WFMAXDB, v64db>;
+ defm : VectorMax<VFMAXSB, v128sb>;
+ defm : VectorMax<WFMAXSB, v32sb>;
+ defm : VectorMax<WFMAXXB, v128xb>;
+ }
+
+ // Minimum.
+ multiclass VectorMin<Instruction insn, TypedReg tr> {
+ def : FPMinMax<insn, fminnum, tr, 4>;
+ def : FPMinMax<insn, fminnan, tr, 1>;
+ }
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>;
+ def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb,
+ v128db, v128db, 3, 0>;
+ def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag,
+ v64db, v64db, 3, 8>;
+ def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb,
+ v128sb, v128sb, 2, 0>;
+ def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag,
+ v32sb, v32sb, 2, 8>;
+ def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag,
+ v128xb, v128xb, 4, 8>;
+ defm : VectorMin<VFMINDB, v128db>;
+ defm : VectorMin<WFMINDB, v64db>;
+ defm : VectorMin<VFMINSB, v128sb>;
+ defm : VectorMin<WFMINSB, v32sb>;
+ defm : VectorMin<WFMINXB, v128xb>;
+ }
// Multiply.
def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>;
def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>;
+ def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>;
+ def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>;
+ }
// Multiply and add.
def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>;
def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>;
+ def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>;
+ def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>;
+ }
// Multiply and subtract.
def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>;
def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>;
def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>;
+ def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>;
+ def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>;
+ }
+
+ // Negative multiply and add.
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>;
+ def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>;
+ def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>;
+ def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>;
+ def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>;
+ def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>;
+ }
+
+ // Negative multiply and subtract.
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>;
+ def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>;
+ def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>;
+ def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>;
+ def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>;
+ def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>;
+ }
// Perform sign operation.
def VFPSO : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>;
def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>;
def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFPSOSB : BinaryVRRa<"vfpsosb", 0xE7CC, null_frag, v128sb, v128sb, 2, 0>;
+ def WFPSOSB : BinaryVRRa<"wfpsosb", 0xE7CC, null_frag, v32sb, v32sb, 2, 8>;
+ def WFPSOXB : BinaryVRRa<"wfpsoxb", 0xE7CC, null_frag, v128xb, v128xb, 4, 8>;
+ }
// Load complement.
def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>;
def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFLCSB : UnaryVRRa<"vflcsb", 0xE7CC, fneg, v128sb, v128sb, 2, 0, 0>;
+ def WFLCSB : UnaryVRRa<"wflcsb", 0xE7CC, fneg, v32sb, v32sb, 2, 8, 0>;
+ def WFLCXB : UnaryVRRa<"wflcxb", 0xE7CC, fneg, v128xb, v128xb, 4, 8, 0>;
+ }
// Load negative.
def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>;
def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFLNSB : UnaryVRRa<"vflnsb", 0xE7CC, fnabs, v128sb, v128sb, 2, 0, 1>;
+ def WFLNSB : UnaryVRRa<"wflnsb", 0xE7CC, fnabs, v32sb, v32sb, 2, 8, 1>;
+ def WFLNXB : UnaryVRRa<"wflnxb", 0xE7CC, fnabs, v128xb, v128xb, 4, 8, 1>;
+ }
// Load positive.
def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>;
def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFLPSB : UnaryVRRa<"vflpsb", 0xE7CC, fabs, v128sb, v128sb, 2, 0, 2>;
+ def WFLPSB : UnaryVRRa<"wflpsb", 0xE7CC, fabs, v32sb, v32sb, 2, 8, 2>;
+ def WFLPXB : UnaryVRRa<"wflpxb", 0xE7CC, fabs, v128xb, v128xb, 4, 8, 2>;
+ }
// Square root.
def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>;
def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>;
def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>;
+ def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>;
+ def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>;
+ }
// Subtract.
def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>;
def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>;
+ def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>;
+ def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>;
+ }
// Test data class immediate.
let Defs = [CC] in {
def VFTCI : BinaryVRIeFloatGeneric<"vftci", 0xE74A>;
def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>;
def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def VFTCISB : BinaryVRIe<"vftcisb", 0xE74A, z_vftci, v128f, v128sb, 2, 0>;
+ def WFTCISB : BinaryVRIe<"wftcisb", 0xE74A, null_frag, v32f, v32sb, 2, 8>;
+ def WFTCIXB : BinaryVRIe<"wftcixb", 0xE74A, null_frag, v128q, v128xb, 4, 8>;
+ }
}
}
@@ -985,12 +1202,20 @@ let Predicates = [FeatureVector] in {
let Defs = [CC] in {
def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>;
def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_fcmp, v32sb, 2>;
+ def WFCXB : CompareVRRa<"wfcxb", 0xE7CB, z_fcmp, v128xb, 4>;
+ }
}
// Compare and signal scalar.
let Defs = [CC] in {
def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>;
def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ def WFKSB : CompareVRRa<"wfksb", 0xE7CA, null_frag, v32sb, 2>;
+ def WFKXB : CompareVRRa<"wfkxb", 0xE7CA, null_frag, v128xb, 4>;
+ }
}
// Compare equal.
@@ -999,6 +1224,28 @@ let Predicates = [FeatureVector] in {
v128g, v128db, 3, 0>;
defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag,
v64g, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes,
+ v128f, v128sb, 2, 0>;
+ defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag,
+ v32f, v32sb, 2, 8>;
+ defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag,
+ v128q, v128xb, 4, 8>;
+ }
+
+ // Compare and signal equal.
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag,
+ v128g, v128db, 3, 4>;
+ defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag,
+ v64g, v64db, 3, 12>;
+ defm VFKESB : BinaryVRRcSPair<"vfkesb", 0xE7E8, null_frag, null_frag,
+ v128f, v128sb, 2, 4>;
+ defm WFKESB : BinaryVRRcSPair<"wfkesb", 0xE7E8, null_frag, null_frag,
+ v32f, v32sb, 2, 12>;
+ defm WFKEXB : BinaryVRRcSPair<"wfkexb", 0xE7E8, null_frag, null_frag,
+ v128q, v128xb, 4, 12>;
+ }
// Compare high.
def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>;
@@ -1006,6 +1253,28 @@ let Predicates = [FeatureVector] in {
v128g, v128db, 3, 0>;
defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag,
v64g, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs,
+ v128f, v128sb, 2, 0>;
+ defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag,
+ v32f, v32sb, 2, 8>;
+ defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag,
+ v128q, v128xb, 4, 8>;
+ }
+
+ // Compare and signal high.
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag,
+ v128g, v128db, 3, 4>;
+ defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag,
+ v64g, v64db, 3, 12>;
+ defm VFKHSB : BinaryVRRcSPair<"vfkhsb", 0xE7EB, null_frag, null_frag,
+ v128f, v128sb, 2, 4>;
+ defm WFKHSB : BinaryVRRcSPair<"wfkhsb", 0xE7EB, null_frag, null_frag,
+ v32f, v32sb, 2, 12>;
+ defm WFKHXB : BinaryVRRcSPair<"wfkhxb", 0xE7EB, null_frag, null_frag,
+ v128q, v128xb, 4, 12>;
+ }
// Compare high or equal.
def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>;
@@ -1013,6 +1282,28 @@ let Predicates = [FeatureVector] in {
v128g, v128db, 3, 0>;
defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag,
v64g, v64db, 3, 8>;
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes,
+ v128f, v128sb, 2, 0>;
+ defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag,
+ v32f, v32sb, 2, 8>;
+ defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag,
+ v128q, v128xb, 4, 8>;
+ }
+
+ // Compare and signal high or equal.
+ let Predicates = [FeatureVectorEnhancements1] in {
+ defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag,
+ v128g, v128db, 3, 4>;
+ defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag,
+ v64g, v64db, 3, 12>;
+ defm VFKHESB : BinaryVRRcSPair<"vfkhesb", 0xE7EA, null_frag, null_frag,
+ v128f, v128sb, 2, 4>;
+ defm WFKHESB : BinaryVRRcSPair<"wfkhesb", 0xE7EA, null_frag, null_frag,
+ v32f, v32sb, 2, 12>;
+ defm WFKHEXB : BinaryVRRcSPair<"wfkhexb", 0xE7EA, null_frag, null_frag,
+ v128q, v128xb, 4, 12>;
+ }
}
//===----------------------------------------------------------------------===//
@@ -1024,36 +1315,49 @@ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>;
+
+def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>;
+def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>;
//===----------------------------------------------------------------------===//
// Replicating scalars
@@ -1130,6 +1434,20 @@ let AddedComplexity = 4 in {
}
//===----------------------------------------------------------------------===//
+// Support for 128-bit floating-point values in vector registers
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVectorEnhancements1] in {
+ def : Pat<(f128 (load bdxaddr12only:$addr)),
+ (VL bdxaddr12only:$addr)>;
+ def : Pat<(store (f128 VR128:$src), bdxaddr12only:$addr),
+ (VST VR128:$src, bdxaddr12only:$addr)>;
+
+ def : Pat<(f128 fpimm0), (VZERO)>;
+ def : Pat<(f128 fpimmneg0), (WFLNXB (VZERO))>;
+}
+
+//===----------------------------------------------------------------------===//
// String instructions
//===----------------------------------------------------------------------===//
@@ -1198,3 +1516,37 @@ let Predicates = [FeatureVector] in {
defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf,
z_vstrcz_cc, v128f, v128f, 2, 2>;
}
+
+//===----------------------------------------------------------------------===//
+// Packed-decimal instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [FeatureVectorPackedDecimal] in {
+ def VLIP : BinaryVRIh<"vlip", 0xE649>;
+
+ def VPKZ : BinaryVSI<"vpkz", 0xE634, null_frag, 0>;
+ def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>;
+
+ let Defs = [CC] in {
+ def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>;
+ def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>;
+ def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>;
+ def VCVDG : TernaryVRIi<"vcvdg", 0xE65A, GR64>;
+
+ def VAP : QuaternaryVRIf<"vap", 0xE671>;
+ def VSP : QuaternaryVRIf<"vsp", 0xE673>;
+
+ def VMP : QuaternaryVRIf<"vmp", 0xE678>;
+ def VMSP : QuaternaryVRIf<"vmsp", 0xE679>;
+
+ def VDP : QuaternaryVRIf<"vdp", 0xE67A>;
+ def VRP : QuaternaryVRIf<"vrp", 0xE67B>;
+ def VSDP : QuaternaryVRIf<"vsdp", 0xE67E>;
+
+ def VSRP : QuaternaryVRIg<"vsrp", 0xE659>;
+ def VPSOP : QuaternaryVRIg<"vpsop", 0xE65B>;
+
+ def VTP : TestVRRg<"vtp", 0xE65F>;
+ def VCP : CompareVRRh<"vcp", 0xE677>;
+ }
+}
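The FeatureVectorEnhancements1 additions above (the f128 load/store patterns and the wide x-suffixed arithmetic such as WFAXB) let 128-bit floating-point values live directly in vector registers. As a hedged sketch — assuming an s390x Linux target, where long double is IEEE binary128, and a compiler built with these patterns — a quad-precision add can then lower to a short vector sequence instead of a libcall or FPR-pair code:

    // Hypothetical example, not from the patch: with a z14-level target the
    // two loads, the add, and the store can each map onto a single
    // vector-register instruction (VL, VL, WFAXB, VST) via the new patterns.
    void qadd(long double *dst, const long double *a, const long double *b) {
      *dst = *a + *b;
    }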
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
index ec8ce6e..d4cd89c 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -13,8 +13,8 @@
//
//===----------------------------------------------------------------------===//
-#include "SystemZTargetMachine.h"
#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -127,7 +127,7 @@ MachineInstr *SystemZLDCleanup::ReplaceTLSCall(MachineInstr *I,
return Copy;
}
-// Create a virtal register in *TLSBaseAddrReg, and populate it by
+// Create a virtual register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
MachineInstr *SystemZLDCleanup::SetRegister(MachineInstr *I,
unsigned *TLSBaseAddrReg) {
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
index 14ff6af..791f033 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -53,15 +53,21 @@
//
//===----------------------------------------------------------------------===//
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
#include "SystemZTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
@@ -70,72 +76,72 @@ using namespace llvm;
STATISTIC(LongBranches, "Number of long branches.");
namespace {
+
// Represents positional information about a basic block.
struct MBBInfo {
// The address that we currently assume the block has.
- uint64_t Address;
+ uint64_t Address = 0;
// The size of the block in bytes, excluding terminators.
// This value never changes.
- uint64_t Size;
+ uint64_t Size = 0;
// The minimum alignment of the block, as a log2 value.
// This value never changes.
- unsigned Alignment;
+ unsigned Alignment = 0;
// The number of terminators in this block. This value never changes.
- unsigned NumTerminators;
+ unsigned NumTerminators = 0;
- MBBInfo()
- : Address(0), Size(0), Alignment(0), NumTerminators(0) {}
+ MBBInfo() = default;
};
// Represents the state of a block terminator.
struct TerminatorInfo {
// If this terminator is a relaxable branch, this points to the branch
// instruction, otherwise it is null.
- MachineInstr *Branch;
+ MachineInstr *Branch = nullptr;
// The address that we currently assume the terminator has.
- uint64_t Address;
+ uint64_t Address = 0;
// The current size of the terminator in bytes.
- uint64_t Size;
+ uint64_t Size = 0;
// If Branch is nonnull, this is the number of the target block,
// otherwise it is unused.
- unsigned TargetBlock;
+ unsigned TargetBlock = 0;
// If Branch is nonnull, this is the length of the longest relaxed form,
// otherwise it is zero.
- unsigned ExtraRelaxSize;
+ unsigned ExtraRelaxSize = 0;
- TerminatorInfo() : Branch(nullptr), Size(0), TargetBlock(0),
- ExtraRelaxSize(0) {}
+ TerminatorInfo() = default;
};
// Used to keep track of the current position while iterating over the blocks.
struct BlockPosition {
// The address that we assume this position has.
- uint64_t Address;
+ uint64_t Address = 0;
// The number of low bits in Address that are known to be the same
// as the runtime address.
unsigned KnownBits;
- BlockPosition(unsigned InitialAlignment)
- : Address(0), KnownBits(InitialAlignment) {}
+ BlockPosition(unsigned InitialAlignment) : KnownBits(InitialAlignment) {}
};
class SystemZLongBranch : public MachineFunctionPass {
public:
static char ID;
+
SystemZLongBranch(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr) {}
+ : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "SystemZ Long Branch"; }
bool runOnMachineFunction(MachineFunction &F) override;
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -155,7 +161,7 @@ private:
void relaxBranch(TerminatorInfo &Terminator);
void relaxBranches();
- const SystemZInstrInfo *TII;
+ const SystemZInstrInfo *TII = nullptr;
MachineFunction *MF;
SmallVector<MBBInfo, 16> MBBs;
SmallVector<TerminatorInfo, 16> Terminators;
@@ -165,11 +171,8 @@ char SystemZLongBranch::ID = 0;
const uint64_t MaxBackwardRange = 0x10000;
const uint64_t MaxForwardRange = 0xfffe;
-} // end anonymous namespace
-FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
- return new SystemZLongBranch(TM);
-}
+} // end anonymous namespace
// Position describes the state immediately before Block. Update Block
// accordingly and move Position to the end of the block's non-terminator
@@ -354,13 +357,13 @@ void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
BuildMI(*MBB, MI, DL, TII->get(AddOpcode))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1))
- .addImm(-1);
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(-1);
MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
- .addImm(SystemZ::CCMASK_ICMP)
- .addImm(SystemZ::CCMASK_CMP_NE)
- .addOperand(MI->getOperand(2));
+ .addImm(SystemZ::CCMASK_ICMP)
+ .addImm(SystemZ::CCMASK_CMP_NE)
+ .add(MI->getOperand(2));
// The implicit use of CC is a killing use.
BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
MI->eraseFromParent();
@@ -373,12 +376,12 @@ void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
BuildMI(*MBB, MI, DL, TII->get(CompareOpcode))
- .addOperand(MI->getOperand(0))
- .addOperand(MI->getOperand(1));
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1));
MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
- .addImm(SystemZ::CCMASK_ICMP)
- .addOperand(MI->getOperand(2))
- .addOperand(MI->getOperand(3));
+ .addImm(SystemZ::CCMASK_ICMP)
+ .add(MI->getOperand(2))
+ .add(MI->getOperand(3));
// The implicit use of CC is a killing use.
BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
MI->eraseFromParent();
@@ -463,3 +466,7 @@ bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
relaxBranches();
return true;
}
+
+FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
+ return new SystemZLongBranch(TM);
+}
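The SystemZLongBranch changes above are mechanical (default member initializers, the MachineInstrBuilder .add() rename, moving the factory function out of the anonymous namespace), but the MaxBackwardRange/MaxForwardRange constants visible in the hunks point at the pass's core test: a 16-bit PC-relative branch reaches roughly 64KiB in either direction. Below is a self-contained sketch of that range check; it is an assumption-laden simplification that ignores the KnownBits alignment slack the real pass also tracks.

    #include <cstdint>

    const uint64_t MaxBackwardRange = 0x10000; // as in the pass above
    const uint64_t MaxForwardRange = 0xfffe;

    // Return true if a short (16-bit displacement) branch at BranchAddr may
    // fail to reach TargetAddr and so must be relaxed to the long form.
    bool mustRelax(uint64_t BranchAddr, uint64_t TargetAddr) {
      return TargetAddr >= BranchAddr
                 ? TargetAddr - BranchAddr > MaxForwardRange
                 : BranchAddr - TargetAddr > MaxBackwardRange;
    }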
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
index ab6020f..8342463 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.cpp
@@ -18,12 +18,12 @@
using namespace llvm;
-#define DEBUG_TYPE "misched"
+#define DEBUG_TYPE "machine-scheduler"
#ifndef NDEBUG
// Print the set of SUs
void SystemZPostRASchedStrategy::SUSet::
-dump(SystemZHazardRecognizer &HazardRec) {
+dump(SystemZHazardRecognizer &HazardRec) const {
dbgs() << "{";
for (auto &SU : *this) {
HazardRec.dumpSU(SU, dbgs());
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
index b919758..3dfef38 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineScheduler.h
@@ -1,4 +1,4 @@
-//==-- SystemZMachineScheduler.h - SystemZ Scheduler Interface -*- C++ -*---==//
+//==- SystemZMachineScheduler.h - SystemZ Scheduler Interface ----*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -14,10 +14,10 @@
// usage of processor resources.
//===----------------------------------------------------------------------===//
-#include "SystemZInstrInfo.h"
#include "SystemZHazardRecognizer.h"
#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include <set>
#ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
@@ -28,29 +28,29 @@ namespace llvm {
/// A MachineSchedStrategy implementation for SystemZ post RA scheduling.
class SystemZPostRASchedStrategy : public MachineSchedStrategy {
- ScheduleDAGMI *DAG;
+ ScheduleDAGMI *DAG;
/// A candidate during instruction evaluation.
struct Candidate {
- SUnit *SU;
+ SUnit *SU = nullptr;
/// The decoding cost.
- int GroupingCost;
+ int GroupingCost = 0;
/// The processor resources cost.
- int ResourcesCost;
+ int ResourcesCost = 0;
- Candidate() : SU(nullptr), GroupingCost(0), ResourcesCost(0) {}
+ Candidate() = default;
Candidate(SUnit *SU_, SystemZHazardRecognizer &HazardRec);
// Compare two candidates.
bool operator<(const Candidate &other);
// Check if this node is free of cost ("as good as any").
- bool inline noCost() {
+ bool noCost() const {
return (GroupingCost <= 0 && !ResourcesCost);
}
- };
+ };
// A sorter for the Available set that makes sure that SUs are considered
// in the best order.
@@ -72,7 +72,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
// A set of SUs with a sorter and dump method.
struct SUSet : std::set<SUnit*, SUSorter> {
#ifndef NDEBUG
- void dump(SystemZHazardRecognizer &HazardRec);
+ void dump(SystemZHazardRecognizer &HazardRec) const;
#endif
};
@@ -83,7 +83,7 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
// region.
SystemZHazardRecognizer HazardRec;
- public:
+public:
SystemZPostRASchedStrategy(const MachineSchedContext *C);
/// PostRA scheduling does not track pressure.
@@ -107,6 +107,6 @@ class SystemZPostRASchedStrategy : public MachineSchedStrategy {
void releaseBottomNode(SUnit *SU) override {};
};
-} // namespace llvm
+} // end namespace llvm
-#endif /* LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H */
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZMACHINESCHEDULER_H
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
index 7bb4fe5..7136121 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperands.td
@@ -531,6 +531,7 @@ def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">;
def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">;
def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">;
def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
+def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">;
def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">;
def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">;
def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">;
@@ -578,6 +579,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">;
def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">;
def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">;
def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">;
+def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">;
def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">;
def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">;
def bdvaddr12only : BDVMode< "64", "12">;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
index fde26ed..759a8bb 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -10,7 +10,8 @@
//===----------------------------------------------------------------------===//
// Type profiles
//===----------------------------------------------------------------------===//
-def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>;
+def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>,
+ SDTCisVT<1, i64>]>;
def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>,
SDTCisVT<1, i64>]>;
def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
@@ -35,14 +36,10 @@ def SDT_ZWrapOffset : SDTypeProfile<1, 2,
SDTCisSameAs<0, 2>,
SDTCisPtrTy<0>]>;
def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
-def SDT_ZGR128Binary32 : SDTypeProfile<1, 2,
+def SDT_ZGR128Binary : SDTypeProfile<1, 2,
[SDTCisVT<0, untyped>,
- SDTCisVT<1, untyped>,
- SDTCisVT<2, i32>]>;
-def SDT_ZGR128Binary64 : SDTypeProfile<1, 2,
- [SDTCisVT<0, untyped>,
- SDTCisVT<1, untyped>,
- SDTCisVT<2, i64>]>;
+ SDTCisInt<1>,
+ SDTCisInt<2>]>;
def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5,
[SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
@@ -184,14 +181,11 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
[SDNPInGlue]>;
def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
-def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
-def z_sdivrem32 : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>;
-def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
-def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
-def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
-
-def z_serialize : SDNode<"SystemZISD::SERIALIZE", SDTNone,
- [SDNPHasChain, SDNPMayStore]>;
+def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>;
+def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>;
+def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>;
+def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>;
+
def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone,
[SDNPHasChain, SDNPSideEffect]>;
@@ -556,6 +550,12 @@ def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(fma node:$src2, node:$src3, (fneg node:$src1))>;
+// Negative fused multiply-add and multiply-subtract.
+def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fneg (fma node:$src1, node:$src2, node:$src3))>;
+def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (fneg (fms node:$src1, node:$src2, node:$src3))>;
+
// Floating-point negative absolute.
def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
@@ -631,6 +631,19 @@ def z_vllezf64 : PatFrag<(ops node:$addr),
(scalar_to_vector (f64 (load node:$addr))),
(z_vzero))>;
+// Similarly for the high element of a zeroed vector.
+def z_vllezli32 : z_vllez<i32, load, 0>;
+def z_vllezlf32 : PatFrag<(ops node:$addr),
+ (bitconvert
+ (z_merge_high
+ (v2i64
+ (bitconvert
+ (z_merge_high
+ (v4f32 (scalar_to_vector
+ (f32 (load node:$addr)))),
+ (v4f32 (z_vzero))))),
+ (v2i64 (z_vzero))))>;
+
// Store one element of a vector.
class z_vste<ValueType scalartype, SDPatternOperator store>
: PatFrag<(ops node:$vec, node:$addr, node:$index),
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
index 16a7ed7..152521f 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -167,3 +167,10 @@ class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1,
TypedReg tr2, bits<3> suppress, bits<4> mode>
: Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))),
(insn tr2.op:$vec, suppress, mode)>;
+
+// Use INSN to perform minimum/maximum operation OPERATOR on type TR.
+// FUNCTION is the type of minimum/maximum function to perform.
+class FPMinMax<Instruction insn, SDPatternOperator operator, TypedReg tr,
+ bits<4> function>
+ : Pat<(tr.vt (operator (tr.vt tr.op:$vec1), (tr.vt tr.op:$vec2))),
+ (insn tr.op:$vec1, tr.op:$vec2, function)>;
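+
+// An illustrative use (the instruction, TypedReg, and function code shown
+// are assumptions drawn from the vector-FP definitions elsewhere in this
+// patch, not part of this file):
+//   def : FPMinMax<WFMAXDB, fmaxnum, v64db, 4>;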
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
index 1cdc094..0dca458 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZProcessors.td
@@ -33,3 +33,6 @@ def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>;
def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>;
def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>;
+def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>;
+def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 6ef8000..d14a0fb 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "SystemZInstrInfo.h"
#include "SystemZRegisterInfo.h"
+#include "SystemZInstrInfo.h"
#include "SystemZSubtarget.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index 47d2f75..52ba1a5 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -260,10 +260,10 @@ defm VF128 : SystemZRegClass<"VF128",
// All vector registers.
defm VR128 : SystemZRegClass<"VR128",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
- (add (sequence "V%u", 0, 7),
- (sequence "V%u", 16, 31),
- (sequence "V%u", 8, 15))>;
+ [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ 128, (add (sequence "V%u", 0, 7),
+ (sequence "V%u", 16, 31),
+ (sequence "V%u", 8, 15))>;
// Attaches a ValueType to a register operand, to make the instruction
// definitions easier.
@@ -272,7 +272,8 @@ class TypedReg<ValueType vtin, RegisterOperand opin> {
RegisterOperand op = opin;
}
-def v32eb : TypedReg<f32, VR32>;
+def v32f : TypedReg<i32, VR32>;
+def v32sb : TypedReg<f32, VR32>;
def v64g : TypedReg<i64, VR64>;
def v64db : TypedReg<f64, VR64>;
def v128b : TypedReg<v16i8, VR128>;
@@ -280,8 +281,9 @@ def v128h : TypedReg<v8i16, VR128>;
def v128f : TypedReg<v4i32, VR128>;
def v128g : TypedReg<v2i64, VR128>;
def v128q : TypedReg<v16i8, VR128>;
-def v128eb : TypedReg<v4f32, VR128>;
+def v128sb : TypedReg<v4f32, VR128>;
def v128db : TypedReg<v2f64, VR128>;
+def v128xb : TypedReg<f128, VR128>;
def v128any : TypedReg<untyped, VR128>;
//===----------------------------------------------------------------------===//
@@ -304,3 +306,13 @@ foreach I = 0-15 in {
defm AR32 : SystemZRegClass<"AR32", [i32], 32,
(add (sequence "A%u", 0, 15)), 0>;
+// Control registers.
+class CREG64<bits<16> num, string n> : SystemZReg<n> {
+ let HWEncoding = num;
+}
+foreach I = 0-15 in {
+ def C#I : CREG64<I, "c"#I>, DwarfRegNum<[!add(I, 32)]>;
+}
+defm CR64 : SystemZRegClass<"CR64", [i64], 64,
+ (add (sequence "C%u", 0, 15)), 0>;
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
index dbba8ab..8dba89f 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSchedule.td
@@ -56,12 +56,16 @@ def LSU_lat1 : SchedWrite;
// Floating point unit (zEC12 and earlier)
def FPU : SchedWrite;
def FPU2 : SchedWrite;
+def DFU : SchedWrite;
+def DFU2 : SchedWrite;
-// Vector sub units (z13)
+// Vector sub units (z13 and later)
def VecBF : SchedWrite;
def VecBF2 : SchedWrite;
def VecDF : SchedWrite;
def VecDF2 : SchedWrite;
+def VecDFX : SchedWrite;
+def VecDFX2 : SchedWrite;
def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit.
def VecMul : SchedWrite;
def VecStr : SchedWrite;
@@ -71,6 +75,7 @@ def VecXsPm : SchedWrite;
def VBU : SchedWrite;
+include "SystemZScheduleZ14.td"
include "SystemZScheduleZ13.td"
include "SystemZScheduleZEC12.td"
include "SystemZScheduleZ196.td"
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
index e97d61d..72543c1 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -15,7 +15,7 @@
def Z13Model : SchedMachineModel {
let UnsupportedFeatures = Arch11UnsupportedFeatures.List;
-
+
let IssueWidth = 8;
let MicroOpBufferSize = 60; // Issue queues
let LoadLatency = 1; // Optimistic load latency.
@@ -76,6 +76,8 @@ def : WriteRes<VecBF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecBF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
def : WriteRes<VecDF, [Z13_VecUnit]> { let Latency = 8; }
def : WriteRes<VecDF2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDFX, [Z13_VecUnit]> { let Latency = 1; }
+def : WriteRes<VecDFX2, [Z13_VecUnit, Z13_VecUnit]> { let Latency = 2; }
def : WriteRes<VecFPd, [Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
Z13_VecFPdUnit, Z13_VecFPdUnit, Z13_VecFPdUnit,
@@ -157,7 +159,7 @@ def : InstRW<[FXb], (instregex "CondReturn$")>;
// Select instructions
//===----------------------------------------------------------------------===//
-// Select pseudo
+// Select pseudo
def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
// CondStore pseudos
@@ -179,6 +181,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -223,7 +226,7 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
-def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Asm.*)?$")>;
+def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
@@ -268,6 +271,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -277,6 +281,9 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of ceil(5/2) FXb ops)
def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
GroupAlone], (instregex "STM(G|H|Y)?$")>;
@@ -288,6 +295,7 @@ def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -345,7 +353,10 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
def : InstRW<[FXa], (instregex "ALR(K)?$")>;
def : InstRW<[FXa], (instregex "AR(K)?$")>;
-def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>;
+def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
+def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
+def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
// Logical addition with carry
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
@@ -368,6 +379,8 @@ def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
def : InstRW<[FXa], (instregex "SLR(K)?$")>;
def : InstRW<[FXa], (instregex "SR(K)?$")>;
+def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
+def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
@@ -433,18 +446,22 @@ def : InstRW<[FXa, Lat6], (instregex "MS(R|FI)$")>;
def : InstRW<[FXa, LSU, Lat12], (instregex "MSG$")>;
def : InstRW<[FXa, Lat8], (instregex "MSGR$")>;
def : InstRW<[FXa, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXa, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXa2, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXa2, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXa, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXa, Lat5], (instregex "MHI$")>;
def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXa2, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXa2, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
-def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
-def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
+def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
@@ -456,7 +473,9 @@ def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXa], (instregex "SLA(K)?$")>;
+def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
+ (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -493,6 +512,8 @@ def : InstRW<[FXb], (instregex "CLIH$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXb], (instregex "CLR$")>;
def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
+def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
+def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;
// Compare halfword
def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
@@ -505,7 +526,7 @@ def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -516,6 +537,9 @@ def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -550,7 +574,7 @@ def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CSST$")>;
+def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;
// Perform locked operation
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
@@ -563,6 +587,51 @@ def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
+def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
+def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
+def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
+ (instregex "CVBG$")>;
+def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
+def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
+ (instregex "CVDG$")>;
+def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;
+
+def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
+def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -629,41 +698,29 @@ def : InstRW<[FXb], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXa, Lat6, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[FXa, FXa, Lat6, GroupAlone], (instregex "FLOGR$")>;
// Population count
def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
// Extend
-def : InstRW<[FXa], (instregex "AEXT128_64$")>;
-def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>;
+def : InstRW<[FXa], (instregex "AEXT128$")>;
+def : InstRW<[FXa], (instregex "ZEXT128$")>;
// String instructions
def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
-// Move with key
-def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
-
-// Extract CPU Time
-def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>;
+// Various complex instructions
+def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
+def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;
// Execute
def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
-// Program return
-def : InstRW<[FXb, Lat30], (instregex "PR$")>;
-
-// Inline assembly
-def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
- (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXa, LSU, Lat5], (instregex "STFLE$")>;
-def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
-
-// Store real address
-def : InstRW<[FXb, LSU, Lat5], (instregex "STRAG$")>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
@@ -786,7 +843,7 @@ def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
// Addition
def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
-def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXBR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;
// Subtraction
def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
@@ -801,9 +858,9 @@ def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
// Division
@@ -811,14 +868,17 @@ def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>;
-def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>;
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
@@ -832,10 +892,246 @@ def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[FXa, Lat30, GroupAlone], (instregex "SFASR$")>;
-def : InstRW<[FXa, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
+def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+// --------------------- Hexadecimal floating point ------------------------- //
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[VecBF], (instregex "LEXR$")>;
+def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSU], (instregex "LDE$")>;
+def : InstRW<[FXb], (instregex "LDER$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;
+
+// Convert from fixed
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP.
+def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
+def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[VecBF], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
+def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[VecBF], (instregex "FIER$")>;
+def : InstRW<[VecBF], (instregex "FIDR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
+def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
+def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;
+
+// Division
+def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
+def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
+def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[VecDF], (instregex "LTDTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[VecDF], (instregex "LDETR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
+def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
+def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;
+
+// Convert from / to zoned
+def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
+def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
+def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;
+
+// Convert from / to packed
+def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
+def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
+def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
+def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;
+
+// Perform floating-point operation
+def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[VecDF], (instregex "FIDTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[VecDF], (instregex "QADTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecDF], (instregex "(K|C)DTR$")>;
+def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[VecDF], (instregex "CEDTR$")>;
+def : InstRW<[VecDF], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+
+
// --------------------------------- Vector --------------------------------- //
//===----------------------------------------------------------------------===//
@@ -855,8 +1151,8 @@ def : InstRW<[VecXsPm], (instregex "VZERO$")>;
def : InstRW<[VecXsPm], (instregex "VONE$")>;
def : InstRW<[VecXsPm], (instregex "VGBM$")>;
def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
-def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
//===----------------------------------------------------------------------===//
// Vector: Loads
@@ -989,32 +1285,43 @@ def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
// Vector: Floating-point arithmetic
//===----------------------------------------------------------------------===//
-def : InstRW<[VecBF2], (instregex "VCD(G|GB|LG|LGB)$")>;
-def : InstRW<[VecBF], (instregex "WCD(GB|LGB)$")>;
+// Conversion and rounding
+def : InstRW<[VecBF2], (instregex "VCD(L)?G$")>;
+def : InstRW<[VecBF2], (instregex "VCD(L)?GB$")>;
+def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>;
def : InstRW<[VecBF2], (instregex "VC(L)?GD$")>;
-def : InstRW<[VecBF2], (instregex "VFADB$")>;
-def : InstRW<[VecBF], (instregex "WFADB$")>;
-def : InstRW<[VecBF2], (instregex "VCGDB$")>;
-def : InstRW<[VecBF], (instregex "WCGDB$")>;
-def : InstRW<[VecBF2], (instregex "VF(I|M|A|S)$")>;
-def : InstRW<[VecBF2], (instregex "VF(I|M|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WF(I|M|S)DB$")>;
-def : InstRW<[VecBF2], (instregex "VCLGDB$")>;
-def : InstRW<[VecBF], (instregex "WCLGDB$")>;
-def : InstRW<[VecXsPm], (instregex "VFL(C|N|P)DB$")>;
-def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)DB$")>;
-def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>;
-def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>;
-def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>;
-def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
-def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI(DB)?$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WFTCIDB$")>;
+def : InstRW<[VecBF2], (instregex "VC(L)?GDB$")>;
+def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>;
def : InstRW<[VecBF2], (instregex "VL(DE|ED)$")>;
def : InstRW<[VecBF2], (instregex "VL(DE|ED)B$")>;
def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[VecBF2], (instregex "VFI$")>;
+def : InstRW<[VecBF2], (instregex "VFIDB$")>;
+def : InstRW<[VecBF], (instregex "WFIDB$")>;
+
+// Sign operations
+def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>;
+
+// Test data class
+def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>;
+
+// Add / subtract
+def : InstRW<[VecBF2], (instregex "VF(A|S)$")>;
+def : InstRW<[VecBF2], (instregex "VF(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>;
+
+// Multiply / multiply-and-add/subtract
+def : InstRW<[VecBF2], (instregex "VFM$")>;
+def : InstRW<[VecBF2], (instregex "VFMDB$")>;
+def : InstRW<[VecBF], (instregex "WFMDB$")>;
+def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>;
+def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>;
-// divide / square root
+// Divide / square root
def : InstRW<[VecFPd], (instregex "VFD$")>;
def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
def : InstRW<[VecFPd], (instregex "VFSQ$")>;
@@ -1026,10 +1333,10 @@ def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)$")>;
def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)DB$")>;
-def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
def : InstRW<[VecXsPm], (instregex "WFC(E|H|HE)DB$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "VFC(E|H|HE)DBS$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "WFC(E|H|HE)DBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
//===----------------------------------------------------------------------===//
@@ -1060,5 +1367,163 @@ def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>;
+def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXa, Lat3], (instregex "IAC$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>;
+def : InstRW<[FXb, Lat30], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "ISKE$")>;
+def : InstRW<[FXb, Lat30], (instregex "IVSK$")>;
+def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>;
+def : InstRW<[FXb, Lat30], (instregex "PFMF$")>;
+def : InstRW<[FXb, Lat30], (instregex "TB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "PTLB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>;
+def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>;
+def : InstRW<[FXb, Lat30], (instregex "PR$")>;
+def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>;
+def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>;
+def : InstRW<[FXb, Lat20], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>;
+def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "PTFF$")>;
+def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>;
+def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>;
+def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>;
+def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
+ (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
+ (instregex "STCKE$")>;
+def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>;
+def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>;
+def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>;
+def : InstRW<[FXb, Lat30], (instregex "PTF$")>;
+def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
+def : InstRW<[FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[FXb, Lat30], (instregex "DIAG$")>;
+def : InstRW<[FXb], (instregex "TRAC(E|G)$")>;
+def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>;
+def : InstRW<[FXb, Lat30], (instregex "SIGP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "LPP$")>;
+def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>;
+def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>;
+def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[FXb, Lat30], (instregex "RCHP$")>;
+def : InstRW<[FXb, Lat30], (instregex "SCHM$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>;
+def : InstRW<[FXb, Lat30], (instregex "SAL$")>;
+
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
new file mode 100644
index 0000000..698eb56
--- /dev/null
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -0,0 +1,1611 @@
+//-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Z14 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Z14Model : SchedMachineModel {
+
+ let UnsupportedFeatures = Arch12UnsupportedFeatures.List;
+
+ let IssueWidth = 8;
+ let MicroOpBufferSize = 60; // Issue queues
+ let LoadLatency = 1; // Optimistic load latency.
+
+ let PostRAScheduler = 1;
+
+ // Extra cycles for a mispredicted branch.
+ let MispredictPenalty = 20;
+}
+
+let SchedModel = Z14Model in {
+
+// These definitions could be put in a subtarget common include file,
+// but it seems the include system in TableGen currently rejects
+// multiple includes of the same file.
+def : WriteRes<GroupAlone, []> {
+ let NumMicroOps = 0;
+ let BeginGroup = 1;
+ let EndGroup = 1;
+}
+def : WriteRes<BeginGroup, []> {
+ let NumMicroOps = 0;
+ let BeginGroup = 1;
+}
+def : WriteRes<EndGroup, []> {
+ let NumMicroOps = 0;
+ let EndGroup = 1;
+}
+def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;}
+def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;}
+def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;}
+def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;}
+def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;}
+def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;}
+def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;}
+def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;}
+def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;}
+def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;}
+def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;}
+def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;}
+def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;}
+def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
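+
+// Note: a Lat<N> write issues no micro-ops and occupies no unit; appended to
+// unit writes, e.g. [FXb, LSU, Lat5], it is used throughout these files to
+// raise the modeled latency of the instruction (here to 5 cycles).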
+
+// Execution units.
+def Z14_FXaUnit : ProcResource<2>;
+def Z14_FXbUnit : ProcResource<2>;
+def Z14_LSUnit : ProcResource<2>;
+def Z14_VecUnit : ProcResource<2>;
+def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ }
+def Z14_VBUnit : ProcResource<2>;
+
+// Subtarget specific definitions of scheduling resources.
+def : WriteRes<FXa, [Z14_FXaUnit]> { let Latency = 1; }
+def : WriteRes<FXa2, [Z14_FXaUnit, Z14_FXaUnit]> { let Latency = 2; }
+def : WriteRes<FXb, [Z14_FXbUnit]> { let Latency = 1; }
+def : WriteRes<LSU, [Z14_LSUnit]> { let Latency = 4; }
+def : WriteRes<VecBF, [Z14_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecBF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDF, [Z14_VecUnit]> { let Latency = 8; }
+def : WriteRes<VecDF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; }
+def : WriteRes<VecDFX, [Z14_VecUnit]> { let Latency = 1; }
+def : WriteRes<VecDFX2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 2; }
+def : WriteRes<VecFPd, [Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit,
+ Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit]>
+ { let Latency = 30; }
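+// The 30 repeated Z14_VecFPdUnit entries, combined with the unit's
+// BufferSize of 1 above, appear to model div/sqrt as unpipelined: the unit
+// stays busy for the full 30-cycle latency, blocking later divides.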
+def : WriteRes<VecMul, [Z14_VecUnit]> { let Latency = 5; }
+def : WriteRes<VecStr, [Z14_VecUnit]> { let Latency = 4; }
+def : WriteRes<VecXsPm, [Z14_VecUnit]> { let Latency = 3; }
+def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit
+
+// -------------------------- INSTRUCTIONS ---------------------------------- //
+
+// InstRW constructs have been used in order to preserve the
+// readability of the InstrInfo files.
+
+// For each instruction, as matched by a regexp, provide a list of
+// resources that it needs. These will be combined into a SchedClass.
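+//
+// For example, an integer add matching "AR(K)?$", which occupies a single
+// FXa unit for one cycle, is declared as (mirroring the Z13 file above):
+//   def : InstRW<[FXa], (instregex "AR(K)?$")>;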
+
+//===----------------------------------------------------------------------===//
+// Stack allocation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY
+
+//===----------------------------------------------------------------------===//
+// Branch instructions
+//===----------------------------------------------------------------------===//
+
+// Branch
+def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>;
+def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>;
+def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>;
+def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone],
+ (instregex "B(R)?X(H|L).*$")>;
+
+// Compare and branch
+def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>;
+def : InstRW<[FXb, FXb, Lat2, GroupAlone],
+ (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Trap instructions
+//===----------------------------------------------------------------------===//
+
+// Trap
+def : InstRW<[VBU], (instregex "(Cond)?Trap$")>;
+
+// Compare and trap
+def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>;
+def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Call and return instructions
+//===----------------------------------------------------------------------===//
+
+// Call
+def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>;
+
+// Return
+def : InstRW<[FXb, EndGroup], (instregex "Return$")>;
+def : InstRW<[FXb], (instregex "CondReturn$")>;
+
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+// Select pseudo
+def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>;
+
+// CondStore pseudos
+def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Moves
+def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>;
+
+// Move character
+def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
+
+// Pseudo -> reg move
+def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>;
+def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>;
+def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>;
+def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>;
+def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>;
+
+// Loads
+def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>;
+def : InstRW<[LSU], (instregex "LG(RL)?$")>;
+def : InstRW<[LSU], (instregex "L128$")>;
+
+def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>;
+def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>;
+
+def : InstRW<[FXa], (instregex "LG(F|H)I$")>;
+def : InstRW<[FXa], (instregex "LHI(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LR(Mux)?$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSU], (instregex "LZR(F|G)$")>;
+
+// Load and trap
+def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>;
+
+// Load and test
+def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>;
+def : InstRW<[FXa], (instregex "LT(G)?R$")>;
+
+// Stores
+def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>;
+
+// String moves
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>;
+
+//===----------------------------------------------------------------------===//
+// Conditional move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>;
+def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>;
+def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Sign extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "L(B|H|G)R$")>;
+def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>;
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>;
+def : InstRW<[FXa], (instregex "LTGFR$")>;
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>;
+
+//===----------------------------------------------------------------------===//
+// Zero extensions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>;
+def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>;
+def : InstRW<[LSU], (instregex "LLC(Mux)?$")>;
+def : InstRW<[LSU], (instregex "LLH(Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>;
+def : InstRW<[LSU], (instregex "LLHRL$")>;
+def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>;
+
+// Load and zero rightmost byte
+def : InstRW<[LSU], (instregex "LLZRGF$")>;
+
+// Load and trap
+def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
+
+//===----------------------------------------------------------------------===//
+// Truncations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Multi-register moves
+//===----------------------------------------------------------------------===//
+
+// Load multiple (estimated average of 5 ops)
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+ (instregex "LM(H|Y|G)?$")>;
+
+// Load multiple disjoint
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+
+// Store multiple (estimated average of ceil(5/2) FXb ops)
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10,
+ GroupAlone], (instregex "STM(G|H|Y)?$")>;
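+
+// Both counts above are stated estimates, not measurements: assuming five
+// registers per LM/STM on average, a load-multiple costs five LSU ops,
+// while a store-multiple costs ceil(5/2) = 3 FXb store-data ops plus two
+// LSU ops, with Lat10 as a correspondingly rounded latency.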
+
+//===----------------------------------------------------------------------===//
+// Byte swaps
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LRV(G)?R$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
+
+//===----------------------------------------------------------------------===//
+// Load address instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>;
+
+// Load the Global Offset Table address (expands to LARL)
+def : InstRW<[FXa], (instregex "GOT$")>;
+
+//===----------------------------------------------------------------------===//
+// Absolute value and negation
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "LP(G)?R$")>;
+def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>;
+def : InstRW<[FXa], (instregex "LN(R|GR)$")>;
+def : InstRW<[FXa], (instregex "LC(R|GR)$")>;
+def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Insertion
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>;
+def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>;
+def : InstRW<[FXa], (instregex "IIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "IIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "IIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "IILF(64)?$")>;
+def : InstRW<[FXa], (instregex "IILH(64)?$")>;
+def : InstRW<[FXa], (instregex "IILL(64)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Addition
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>;
+def : InstRW<[FXa], (instregex "AIH$")>;
+def : InstRW<[FXa], (instregex "AFI(Mux)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>;
+def : InstRW<[FXa], (instregex "AGFI$")>;
+def : InstRW<[FXa], (instregex "AGHI(K)?$")>;
+def : InstRW<[FXa], (instregex "AGR(K)?$")>;
+def : InstRW<[FXa], (instregex "AHI(K)?$")>;
+def : InstRW<[FXa], (instregex "AHIMux(K)?$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>;
+def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXa], (instregex "ALGHSIK$")>;
+def : InstRW<[FXa], (instregex "ALGF(I|R)$")>;
+def : InstRW<[FXa], (instregex "ALGR(K)?$")>;
+def : InstRW<[FXa], (instregex "ALR(K)?$")>;
+def : InstRW<[FXa], (instregex "AR(K)?$")>;
+def : InstRW<[FXa], (instregex "A(L)?HHHR$")>;
+def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>;
+def : InstRW<[FXa], (instregex "ALSIH(N)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>;
+
+// Logical addition with carry
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>;
+def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>;
+
+// Add with sign extension (16/32 -> 64)
+def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>;
+def : InstRW<[FXa, Lat2], (instregex "AGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// Subtraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>;
+def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>;
+def : InstRW<[FXa], (instregex "SGR(K)?$")>;
+def : InstRW<[FXa], (instregex "SLFI$")>;
+def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>;
+def : InstRW<[FXa], (instregex "SLGF(I|R)$")>;
+def : InstRW<[FXa], (instregex "SLGR(K)?$")>;
+def : InstRW<[FXa], (instregex "SLR(K)?$")>;
+def : InstRW<[FXa], (instregex "SR(K)?$")>;
+def : InstRW<[FXa], (instregex "S(L)?HHHR$")>;
+def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>;
+
+// Subtraction with borrow
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>;
+def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>;
+
+// Subtraction with sign extension (16/32 -> 64)
+def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>;
+def : InstRW<[FXa, Lat2], (instregex "SGFR$")>;
+
+//===----------------------------------------------------------------------===//
+// AND
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>;
+def : InstRW<[FXa], (instregex "NGR(K)?$")>;
+def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>;
+def : InstRW<[FXa], (instregex "NIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "NIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "NIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "NILF(64)?$")>;
+def : InstRW<[FXa], (instregex "NILH(64)?$")>;
+def : InstRW<[FXa], (instregex "NILL(64)?$")>;
+def : InstRW<[FXa], (instregex "NR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>;
+
+//===----------------------------------------------------------------------===//
+// OR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>;
+def : InstRW<[FXa], (instregex "OGR(K)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>;
+def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>;
+def : InstRW<[FXa], (instregex "OIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "OIHH(64)?$")>;
+def : InstRW<[FXa], (instregex "OIHL(64)?$")>;
+def : InstRW<[FXa], (instregex "OILF(64)?$")>;
+def : InstRW<[FXa], (instregex "OILH(64)?$")>;
+def : InstRW<[FXa], (instregex "OILL(64)?$")>;
+def : InstRW<[FXa], (instregex "OR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>;
+
+//===----------------------------------------------------------------------===//
+// XOR
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>;
+def : InstRW<[FXa], (instregex "XIFMux$")>;
+def : InstRW<[FXa], (instregex "XGR(K)?$")>;
+def : InstRW<[FXa], (instregex "XIHF(64)?$")>;
+def : InstRW<[FXa], (instregex "XILF(64)?$")>;
+def : InstRW<[FXa], (instregex "XR(K)?$")>;
+def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>;
+
+//===----------------------------------------------------------------------===//
+// Multiplication
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>;
+def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>;
+def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>;
+def : InstRW<[FXa, Lat7], (instregex "MSGR$")>;
+def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>;
+def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXa, Lat4], (instregex "MGHI$")>;
+def : InstRW<[FXa, Lat4], (instregex "MHI$")>;
+def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>;
+def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>;
+def : InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>;
+def : InstRW<[FXa, FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>;
+def : InstRW<[FXa, FXa, Lat8, GroupAlone], (instregex "MGRK$")>;
+def : InstRW<[FXa, LSU, Lat9], (instregex "MSC$")>;
+def : InstRW<[FXa, LSU, Lat11], (instregex "MSGC$")>;
+def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>;
+def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>;
+
+//===----------------------------------------------------------------------===//
+// Division and remainder
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>;
+def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>;
+def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>;
+def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>;
+def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>;
+def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Shifts
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "SLL(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SRL(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SRA(G|K)?$")>;
+def : InstRW<[FXa], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone],
+ (instregex "S(L|R)D(A|L)$")>;
+
+// Rotate
+def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>;
+
+// Rotate and insert
+def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>;
+def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>;
+def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>;
+def : InstRW<[FXa], (instregex "RISBMux$")>;
+
+// Rotate and Select
+def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>;
+
+//===----------------------------------------------------------------------===//
+// Comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>;
+def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>;
+def : InstRW<[FXb], (instregex "CG(F|H)I$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>;
+def : InstRW<[FXb], (instregex "C(G)?R$")>;
+def : InstRW<[FXb], (instregex "CIH$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>;
+def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>;
+def : InstRW<[FXb], (instregex "CLGF(I|R)$")>;
+def : InstRW<[FXb], (instregex "CLGR$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>;
+def : InstRW<[FXb], (instregex "CLIH$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>;
+def : InstRW<[FXb], (instregex "CLR$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>;
+def : InstRW<[FXb], (instregex "C(L)?HHR$")>;
+def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>;
+
+// Compare halfword
+def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
+def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>;
+def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>;
+
+// Compare with sign extension (32 -> 64)
+def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "CGFR$")>;
+
+// Compare logical character
+def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
+
+// Test under mask
+def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>;
+def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>;
+def : InstRW<[FXb], (instregex "TMHH(64)?$")>;
+def : InstRW<[FXb], (instregex "TMHL(64)?$")>;
+def : InstRW<[FXb], (instregex "TMLH(64)?$")>;
+def : InstRW<[FXb], (instregex "TMLL(64)?$")>;
+
+// Compare logical characters under mask
+def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Prefetch and execution hint
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "PFD(RL)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "BPP$")>;
+def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>;
+def : InstRW<[FXb], (instregex "NIAI$")>;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>;
+
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>;
+
+// Test and set
+def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>;
+
+// Compare and swap
+def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>;
+
+// Compare double and swap
+def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone],
+ (instregex "CDS(Y)?$")>;
+def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone],
+ (instregex "CDSG$")>;
+
+// Compare and swap and store
+def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>;
+
+// Perform locked operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
+
+// Load/store pair from/to quadword
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>;
+def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
+
+// Load pair disjoint
+def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
+def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
+def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>;
+def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>;
+def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>;
+def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>;
+
+//===----------------------------------------------------------------------===//
+// Guarded storage
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "LGG$")>;
+def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>;
+def : InstRW<[LSU, Lat30], (instregex "(L|ST)GSC$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone],
+ (instregex "CVBG$")>;
+def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
+def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone],
+ (instregex "CVDG$")>;
+def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>;
+
+def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>;
+def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Access registers
+//===----------------------------------------------------------------------===//
+
+// Extract/set/copy access register
+def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>;
+
+// Load address extended
+def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>;
+
+// Load/store access multiple (not modeled precisely)
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Program mask and addressing mode
+//===----------------------------------------------------------------------===//
+
+// Insert Program Mask
+def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>;
+
+// Set Program Mask
+def : InstRW<[LSU, EndGroup], (instregex "SPM$")>;
+
+// Branch and link
+def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>;
+
+// Test addressing mode
+def : InstRW<[FXb], (instregex "TAM$")>;
+
+// Set addressing mode
+def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>;
+
+// Branch (and save) and set mode
+def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>;
+def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>;
+
+//===----------------------------------------------------------------------===//
+// Transactional execution
+//===----------------------------------------------------------------------===//
+
+// Transaction begin
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone],
+ (instregex "TBEGIN(C|_nofloat)?$")>;
+
+// Transaction end
+def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>;
+
+// Transaction abort
+def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>;
+
+// Extract Transaction Nesting Depth
+def : InstRW<[FXa], (instregex "ETND$")>;
+
+// Nontransactional store
+def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>;
+
+//===----------------------------------------------------------------------===//
+// Processor assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, GroupAlone], (instregex "PPA$")>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous instructions
+//===----------------------------------------------------------------------===//
+
+// Find leftmost one
+def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>;
+
+// Population count
+def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>;
+
+// Extend
+def : InstRW<[FXa], (instregex "AEXT128$")>;
+def : InstRW<[FXa], (instregex "ZEXT128$")>;
+
+// String instructions
+def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
+
+// Various complex instructions
+def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
+def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>;
+
+// Execute
+def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>;
+
+//===----------------------------------------------------------------------===//
+// .insn directive instructions
+//===----------------------------------------------------------------------===//
+
+// An "empty" sched-class will be assigned instead of the "invalid sched-class".
+// getNumDecoderSlots() will then return 1 instead of 0.
+def : InstRW<[], (instregex "Insn.*")>;
+
+
+// ----------------------------- Floating point ----------------------------- //
+
+//===----------------------------------------------------------------------===//
+// FP: Select instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>;
+def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>;
+def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load zero
+def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>;
+def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>;
+
+// Load
+def : InstRW<[VecXsPm], (instregex "LER$")>;
+def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>;
+def : InstRW<[FXb, Lat3], (instregex "LGDR$")>;
+def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>;
+
+// Load and Test
+def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone],
+ (instregex "LTXBRCompare(_VecPseudo)?$")>;
+
+// Copy sign
+def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>;
+def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Load instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>;
+def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>;
+def : InstRW<[LSU], (instregex "LX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Store instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>;
+def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>;
+def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>;
+
+// Load lengthened
+def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>;
+def : InstRW<[VecBF], (instregex "LDEBR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)B$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)BR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>;
+def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>;
+def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>;
+def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>;
+def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>;
+
+// Square root
+def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>;
+def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>;
+
+// Load FP integer
+def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>;
+def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>;
+def : InstRW<[VecBF], (instregex "A(E|D)BR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>;
+
+// Subtraction
+def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>;
+def : InstRW<[VecBF], (instregex "S(E|D)BR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>;
+
+// Multiply
+def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>;
+def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>;
+
+// Multiply and add / subtract
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>;
+
+// Division
+def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>;
+def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>;
+
+// Divide to integer
+def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>;
+
+// Test Data Class
+def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>;
+def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>;
+
+//===----------------------------------------------------------------------===//
+// FP: Floating-point control register instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
+def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
+def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
+def : InstRW<[FXa, Lat30], (instregex "SFASR$")>;
+def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>;
+def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+
+// --------------------- Hexadecimal floating point ------------------------- //
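+
+// (HFP is the legacy System/360 hexadecimal floating-point format; its
+// entries largely mirror the BFP section above and reuse the same vector
+// FP units.)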
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[VecBF], (instregex "LEXR$")>;
+def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSU], (instregex "LDE$")>;
+def : InstRW<[FXb], (instregex "LDER$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;
+
+// Convert from fixed
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP
+def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
+def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[VecBF], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
+def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[VecBF], (instregex "FIER$")>;
+def : InstRW<[VecBF], (instregex "FIDR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
+def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
+def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>;
+def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>;
+
+// Division
+def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
+def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>;
+def : InstRW<[VecBF], (instregex "C(E|D)R$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[VecDF], (instregex "LTDTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[VecDF], (instregex "LDETR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
+def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
+def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>;
+
+// Convert from / to zoned
+def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
+def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
+def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;
+
+// Convert from / to packed
+def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
+def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
+def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
+def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;
+
+// Perform floating-point operation
+def : InstRW<[FXb, Lat30], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[VecDF], (instregex "FIDTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[VecDF], (instregex "QADTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecDF], (instregex "(K|C)DTR$")>;
+def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[VecDF], (instregex "CEDTR$")>;
+def : InstRW<[VecDF], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+
+
+// --------------------------------- Vector --------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// Vector: Move instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "VLR(32|64)?$")>;
+def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>;
+def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>;
+def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Immediate instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VZERO$")>;
+def : InstRW<[VecXsPm], (instregex "VONE$")>;
+def : InstRW<[VecXsPm], (instregex "VGBM$")>;
+def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Loads
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU], (instregex "VL(L|BB)?$")>;
+def : InstRW<[LSU], (instregex "VL(32|64)$")>;
+def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>;
+def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>;
+def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+ (instregex "VLM$")>;
+def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Stores
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>;
+def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>;
+def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>;
+def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone],
+ (instregex "VSTM$")>;
+def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>;
+def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Selects and permutes
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VPERM$")>;
+def : InstRW<[VecXsPm], (instregex "VPDI$")>;
+def : InstRW<[VecXsPm], (instregex "VBPERM$")>;
+def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VSEL$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Widening and narrowing
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>;
+def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>;
+def : InstRW<[VecXsPm], (instregex "VO(C)?$")>;
+def : InstRW<[VecMul], (instregex "VCKSM$")>;
+def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VX$")>;
+def : InstRW<[VecMul], (instregex "VGFM?$")>;
+def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>;
+def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>;
+def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>;
+def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>;
+def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMH(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VML(B|F)?$")>;
+def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>;
+def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>;
+def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>;
+
+def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>;
+
+def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>;
+
+def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>;
+def : InstRW<[VecXsPm], (instregex "VSLB$")>;
+def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>;
+def : InstRW<[VecXsPm], (instregex "VSR(A|L)B$")>;
+
+def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>;
+def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>;
+def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>;
+
+def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>;
+def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>;
+def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Integer comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>;
+def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VTM$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point arithmetic
+//===----------------------------------------------------------------------===//
+
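+// Naming note for this section: V-prefixed opcodes operate on all vector
+// elements, while the W-prefixed forms are their scalar counterparts that
+// act on element zero only; both normally share a unit and latency.
+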
+// Conversion and rounding
+def : InstRW<[VecBF], (instregex "VCD(L)?G$")>;
+def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>;
+def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>;
+def : InstRW<[VecBF], (instregex "VC(L)?GD$")>;
+def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>;
+def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>;
+def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>;
+def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>;
+def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>;
+def : InstRW<[VecBF], (instregex "VFL(L|R)$")>;
+def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>;
+def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>;
+def : InstRW<[VecBF2], (instregex "WFLLD$")>;
+def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>;
+def : InstRW<[VecBF2], (instregex "VFI$")>;
+def : InstRW<[VecBF], (instregex "VFIDB$")>;
+def : InstRW<[VecBF], (instregex "WFIDB$")>;
+def : InstRW<[VecBF2], (instregex "VFISB$")>;
+def : InstRW<[VecBF], (instregex "WFISB$")>;
+def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>;
+
+// Sign operations
+def : InstRW<[VecXsPm], (instregex "VFPSO$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>;
+def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>;
+def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>;
+def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>;
+
+// Minimum / maximum
+def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>;
+def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>;
+def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>;
+def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>;
+def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>;
+def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>;
+
+// Test data class
+def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>;
+def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>;
+
+// Add / subtract
+def : InstRW<[VecBF2], (instregex "VF(A|S)$")>;
+def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>;
+def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>;
+def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>;
+def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>;
+
+// Multiply / multiply-and-add/subtract
+def : InstRW<[VecBF2], (instregex "VFM$")>;
+def : InstRW<[VecBF], (instregex "VFMDB$")>;
+def : InstRW<[VecBF], (instregex "WFMDB$")>;
+def : InstRW<[VecBF2], (instregex "VFMSB$")>;
+def : InstRW<[VecBF], (instregex "WFMSB$")>;
+def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>;
+def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>;
+def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>;
+def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>;
+def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>;
+
+// Divide / square root
+def : InstRW<[VecFPd], (instregex "VFD$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>;
+def : InstRW<[VecFPd], (instregex "WFDXB$")>;
+def : InstRW<[VecFPd], (instregex "VFSQ$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>;
+def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>;
+def : InstRW<[VecFPd], (instregex "WFSQXB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point comparison
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>;
+def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>;
+def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>;
+def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>;
+def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>;
+def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>;
+def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>;
+def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Floating-point insertion and extraction
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "LEFR$")>;
+def : InstRW<[FXb, Lat4], (instregex "LFER$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: String instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecStr], (instregex "VFAE(B)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>;
+def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>;
+def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>;
+def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>;
+def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>;
+
+//===----------------------------------------------------------------------===//
+// Vector: Packed-decimal instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[VecDF, VecDF, Lat10], (instregex "VLIP$")>;
+def : InstRW<[VecDFX, LSU, GroupAlone], (instregex "VPKZ$")>;
+def : InstRW<[VecDFX, FXb, LSU, Lat12, BeginGroup], (instregex "VUPKZ$")>;
+def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>;
+def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>;
+def : InstRW<[VecDFX], (instregex "V(A|S)P$")>;
+def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>;
+def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>;
+def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>;
+def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>;
+def : InstRW<[VecDFX], (instregex "VPSOP$")>;
+def : InstRW<[VecDFX], (instregex "V(T|C)P$")>;
+
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "EPSW$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>;
+def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXa, Lat3], (instregex "IAC$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>;
+def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>;
+def : InstRW<[FXb, Lat30], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "ISKE$")>;
+def : InstRW<[FXb, Lat30], (instregex "IVSK$")>;
+def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>;
+def : InstRW<[FXb, Lat30], (instregex "IRBM$")>;
+def : InstRW<[FXb, Lat30], (instregex "PFMF$")>;
+def : InstRW<[FXb, Lat30], (instregex "TB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "PTLB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>;
+def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>;
+def : InstRW<[FXb, Lat30], (instregex "PR$")>;
+def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>;
+def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>;
+def : InstRW<[FXb, Lat20], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>;
+def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>;
+def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "PTFF$")>;
+def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>;
+def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>;
+def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>;
+def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>;
+def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone],
+ (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone],
+ (instregex "STCKE$")>;
+def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>;
+def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>;
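// (Informal gloss, not part of the patch: repeating a resource models an
// instruction cracked into that many micro-ops, so STCKE above occupies
// four LSU, two FXa and two FXb slots, issues in a group by itself, and
// delivers its result after 11 cycles.)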
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>;
+def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>;
+def : InstRW<[FXb, Lat30], (instregex "PTF$")>;
+def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "SVC$")>;
+def : InstRW<[FXb, GroupAlone], (instregex "MC$")>;
+def : InstRW<[FXb, Lat30], (instregex "DIAG$")>;
+def : InstRW<[FXb], (instregex "TRAC(E|G)$")>;
+def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>;
+def : InstRW<[FXb, Lat30], (instregex "SIGP$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb], (instregex "LPP$")>;
+def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>;
+def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>;
+def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[FXb, Lat30], (instregex "RCHP$")>;
+def : InstRW<[FXb, Lat30], (instregex "SCHM$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>;
+def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>;
+def : InstRW<[FXb, Lat30], (instregex "SAL$")>;
+
+}
+
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
index a950e54..4d986e8 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def Z196_FXUnit : ProcResource<2>;
def Z196_LSUnit : ProcResource<2>;
def Z196_FPUnit : ProcResource<1>;
+def Z196_DFUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
def : WriteRes<FXU, [Z196_FXUnit]> { let Latency = 1; }
@@ -66,6 +67,8 @@ def : WriteRes<LSU, [Z196_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [Z196_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [Z196_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [Z196_FPUnit, Z196_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [Z196_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [Z196_DFUnit, Z196_DFUnit]> { let Latency = 3; }
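// (Illustrative sketch, not part of the patch: InstRW entries bind
// instructions to these write resources by mnemonic, as the decimal
// hunk below does with, e.g.,
//   def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
// so CVB/CVBY each take one FXU, one Z196_DFUnit and one LSU slot.)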
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -152,6 +155,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -226,6 +230,7 @@ def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -235,6 +240,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -246,6 +254,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -285,7 +294,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -294,15 +303,17 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>;
+def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -325,6 +336,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
def : InstRW<[FXU], (instregex "SLR(K)?$")>;
def : InstRW<[FXU], (instregex "SR(K)?$")>;
+def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
@@ -390,16 +403,22 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -416,7 +435,9 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone],
+ (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -453,6 +474,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXU], (instregex "CLR$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
+def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>;
// Compare halfword
def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>;
@@ -465,7 +488,7 @@ def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -476,6 +499,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch
//===----------------------------------------------------------------------===//
@@ -507,7 +533,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>;
// Perform locked operation
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
@@ -520,6 +546,50 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>;
+def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>;
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
+def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone],
+ (instregex "CVDG$")>;
+def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>;
+def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+
+def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -560,40 +630,29 @@ def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>;
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
// Population count
def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
// Extend
-def : InstRW<[FXU], (instregex "AEXT128_64$")>;
-def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
+def : InstRW<[FXU], (instregex "AEXT128$")>;
+def : InstRW<[FXU], (instregex "ZEXT128$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
-// Move with key
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
-
-// Extract CPU Time
-def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Various complex instructions
+def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
+def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>;
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
-// Program return
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-
-// Inline assembly
-def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
-def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
-def : InstRW<[LSU, FXU, Lat5], (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-
-// Store real address
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
@@ -730,9 +789,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
// Division
@@ -740,14 +799,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
@@ -760,10 +822,396 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
-def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
+def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[FXU, Lat30], (instregex "SFASR$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>;
def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+// --------------------- Hexadecimal floating point ------------------------- //
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[FPU], (instregex "LEXR$")>;
+def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSU], (instregex "LDE$")>;
+def : InstRW<[FXU], (instregex "LDER$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+
+// Convert from fixed
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP.
+def : InstRW<[FPU], (instregex "THD(E)?R$")>;
+def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIER$")>;
+def : InstRW<[FPU], (instregex "FIDR$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>;
+def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>;
+def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
+def : InstRW<[FPU], (instregex "C(E|D)R$")>;
+def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
+def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>;
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>;
+def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>;
+def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+
+// Perform floating-point operation
+def : InstRW<[FXU, Lat30], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
+def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
+def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
+
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
+def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
+ (instregex "STCT(L|G)$")>;
+def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
+def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
+def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
+def : InstRW<[FXU, Lat30], (instregex "TB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>;
+def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
+def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
+def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
+def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>;
+def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
+def : InstRW<[FXU, LSU, Lat15], (instregex "STCK$")>;
+def : InstRW<[FXU, LSU, Lat12], (instregex "STCKF$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
+def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
+def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+def : InstRW<[FXU, GroupAlone], (instregex "MC$")>;
+def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>;
+def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
+def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LPP$")>;
+def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
+def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
+def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
+def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
+
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 8ab6c82..a0f2115 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -59,6 +59,7 @@ def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;}
def ZEC12_FXUnit : ProcResource<2>;
def ZEC12_LSUnit : ProcResource<2>;
def ZEC12_FPUnit : ProcResource<1>;
+def ZEC12_DFUnit : ProcResource<1>;
def ZEC12_VBUnit : ProcResource<1>;
// Subtarget specific definitions of scheduling resources.
@@ -67,6 +68,8 @@ def : WriteRes<LSU, [ZEC12_LSUnit]> { let Latency = 4; }
def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; }
def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; }
def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; }
+def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; }
+def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; }
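// (Illustrative sketch, not part of the patch: DFU2 spans
// [ZEC12_DFUnit, ZEC12_DFUnit], so an entry such as the later
//   def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>;
// keeps the single decimal unit busy for several consecutive cycles
// while the 128-bit conversion completes.)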
def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit
// -------------------------- INSTRUCTIONS ---------------------------------- //
@@ -155,6 +158,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>;
// Move character
def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>;
// Pseudo -> reg move
def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>;
@@ -236,6 +240,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>;
//===----------------------------------------------------------------------===//
// Multi-register moves
@@ -245,6 +250,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>;
def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone],
(instregex "LM(H|Y|G)?$")>;
+// Load multiple disjoint
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>;
+
// Store multiple (estimated average of 3 ops)
def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
(instregex "STM(H|Y|G)?$")>;
@@ -256,6 +264,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone],
def : InstRW<[FXU], (instregex "LRV(G)?R$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>;
//===----------------------------------------------------------------------===//
// Load address instructions
@@ -295,7 +304,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>;
// Addition
//===----------------------------------------------------------------------===//
-def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>;
def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>;
def : InstRW<[FXU], (instregex "AIH$")>;
def : InstRW<[FXU], (instregex "AFI(Mux)?$")>;
@@ -304,15 +313,17 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>;
def : InstRW<[FXU], (instregex "AGR(K)?$")>;
def : InstRW<[FXU], (instregex "AHI(K)?$")>;
def : InstRW<[FXU], (instregex "AHIMux(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>;
def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>;
def : InstRW<[FXU], (instregex "ALGHSIK$")>;
def : InstRW<[FXU], (instregex "ALGF(I|R)$")>;
def : InstRW<[FXU], (instregex "ALGR(K)?$")>;
def : InstRW<[FXU], (instregex "ALR(K)?$")>;
def : InstRW<[FXU], (instregex "AR(K)?$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>;
+def : InstRW<[FXU], (instregex "A(L)?HHHR$")>;
+def : InstRW<[FXU, Lat2], (instregex "A(L)?HHLR$")>;
+def : InstRW<[FXU], (instregex "ALSIH(N)?$")>;
+def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>;
// Logical addition with carry
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>;
@@ -335,6 +346,8 @@ def : InstRW<[FXU], (instregex "SLGF(I|R)$")>;
def : InstRW<[FXU], (instregex "SLGR(K)?$")>;
def : InstRW<[FXU], (instregex "SLR(K)?$")>;
def : InstRW<[FXU], (instregex "SR(K)?$")>;
+def : InstRW<[FXU], (instregex "S(L)?HHHR$")>;
+def : InstRW<[FXU, Lat2], (instregex "S(L)?HHLR$")>;
// Subtraction with borrow
def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>;
@@ -400,16 +413,22 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>;
def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>;
def : InstRW<[FXU, Lat8], (instregex "MSGR$")>;
def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>;
-def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
-def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
+def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>;
+def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>;
def : InstRW<[FXU, Lat5], (instregex "MGHI$")>;
def : InstRW<[FXU, Lat5], (instregex "MHI$")>;
def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>;
+def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>;
+def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>;
//===----------------------------------------------------------------------===//
// Division and remainder
//===----------------------------------------------------------------------===//
+def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
+ (instregex "D$")>;
def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
(instregex "DSG(F)?R$")>;
def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone],
@@ -426,7 +445,9 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone],
def : InstRW<[FXU], (instregex "SLL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRL(G|K)?$")>;
def : InstRW<[FXU], (instregex "SRA(G|K)?$")>;
-def : InstRW<[FXU], (instregex "SLA(K)?$")>;
+def : InstRW<[FXU], (instregex "SLA(G|K)?$")>;
+def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone],
+ (instregex "S(L|R)D(A|L)$")>;
// Rotate
def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>;
@@ -463,6 +484,8 @@ def : InstRW<[FXU], (instregex "CLIH$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>;
def : InstRW<[FXU], (instregex "CLR$")>;
def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>;
+def : InstRW<[FXU], (instregex "C(L)?HHR$")>;
+def : InstRW<[FXU, Lat2], (instregex "C(L)?HLR$")>;
// Compare halfword
def : InstRW<[FXU, LSU, Lat6], (instregex "CH(Y|RL)?$")>;
@@ -475,7 +498,7 @@ def : InstRW<[FXU, Lat2], (instregex "CGFR$")>;
// Compare logical character
def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>;
-
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>;
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>;
// Test under mask
@@ -486,6 +509,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>;
def : InstRW<[FXU], (instregex "TMLH(64)?$")>;
def : InstRW<[FXU], (instregex "TMLL(64)?$")>;
+// Compare logical characters under mask
+def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>;
+
//===----------------------------------------------------------------------===//
// Prefetch and execution hint
//===----------------------------------------------------------------------===//
@@ -519,7 +545,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone],
(instregex "CDSG$")>;
// Compare and swap and store
-def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>;
// Perform locked operation
def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>;
@@ -532,6 +558,50 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>;
def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>;
//===----------------------------------------------------------------------===//
+// Translate and convert
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>;
+def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>;
+def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>;
+def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>;
+
+//===----------------------------------------------------------------------===//
+// Message-security assist
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>;
+
+//===----------------------------------------------------------------------===//
+// Decimal arithmetic
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>;
+def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>;
+def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone],
+ (instregex "CVDG$")>;
+def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>;
+def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>;
+def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>;
+
+def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone],
+ (instregex "(A|S|ZA)P$")>;
+def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone],
+ (instregex "(M|D)P$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone],
+ (instregex "SRP$")>;
+def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>;
+def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>;
+
+//===----------------------------------------------------------------------===//
// Access registers
//===----------------------------------------------------------------------===//
@@ -598,40 +668,29 @@ def : InstRW<[FXU], (instregex "PPA$")>;
//===----------------------------------------------------------------------===//
// Find leftmost one
-def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
+def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>;
// Population count
def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>;
// Extend
-def : InstRW<[FXU], (instregex "AEXT128_64$")>;
-def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>;
+def : InstRW<[FXU], (instregex "AEXT128$")>;
+def : InstRW<[FXU], (instregex "ZEXT128$")>;
// String instructions
def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>;
+def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>;
-// Move with key
-def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>;
-
-// Extract CPU Time
-def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>;
+// Various complex instructions
+def : InstRW<[LSU, Lat30], (instregex "CFC$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>;
+def : InstRW<[LSU, Lat30], (instregex "CKSM$")>;
+def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>;
// Execute
def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>;
-// Program return
-def : InstRW<[FXU, Lat30], (instregex "PR$")>;
-
-// Inline assembly
-def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
-def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
- (instregex "STCKE$")>;
-def : InstRW<[FXU, LSU, Lat5], (instregex "STFLE$")>;
-def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
-
-// Store real address
-def : InstRW<[FXU, LSU, Lat5], (instregex "STRAG$")>;
-
//===----------------------------------------------------------------------===//
// .insn directive instructions
//===----------------------------------------------------------------------===//
@@ -768,9 +827,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>;
// Multiply and add / subtract
-def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>;
def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>;
-def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>;
def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>;
// Division
@@ -778,14 +837,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>;
def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>;
def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>;
+// Divide to integer
+def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>;
+
//===----------------------------------------------------------------------===//
// FP: Comparisons
//===----------------------------------------------------------------------===//
// Compare
-def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>;
-def : InstRW<[FPU], (instregex "C(E|D)BR$")>;
-def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>;
+def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>;
+def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>;
+def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>;
// Test Data Class
def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>;
@@ -798,10 +860,403 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>;
def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>;
def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>;
def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>;
-def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
-def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
-def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
-def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
+def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>;
+def : InstRW<[FXU, Lat30], (instregex "SFASR$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>;
+def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>;
+
+
+// --------------------- Hexadecimal floating point ------------------------- //
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[FPU], (instregex "LEXR$")>;
+def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSU], (instregex "LDE$")>;
+def : InstRW<[FXU], (instregex "LDER$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+
+// Convert from fixed
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP.
+def : InstRW<[FPU], (instregex "THD(E)?R$")>;
+def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIER$")>;
+def : InstRW<[FPU], (instregex "FIDR$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>;
+def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>;
+def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>;
+def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>;
+def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
+def : InstRW<[FPU], (instregex "C(E|D)R$")>;
+def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
+def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>;
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>;
+def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>;
+def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+
+// Convert from / to zoned
+def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>;
+def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>;
+def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>;
+
+// Perform floating-point operation
+def : InstRW<[FXU, Lat30], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>;
+def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
+def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
+def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>;
+
+
+// -------------------------------- System ---------------------------------- //
+
+//===----------------------------------------------------------------------===//
+// System: Program-Status Word Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "EPSW$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>;
+def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SSM$")>;
+def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>;
+def : InstRW<[FXU, Lat3], (instregex "IAC$")>;
+def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Control Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>;
+def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone],
+ (instregex "STCT(L|G)$")>;
+def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>;
+def : InstRW<[FXU, Lat30], (instregex "ESEA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Prefix-Register Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "SPX$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STPX$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Storage-Key and Real Memory Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "ISKE$")>;
+def : InstRW<[FXU, Lat30], (instregex "IVSK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SSKE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "RRB(E|M)$")>;
+def : InstRW<[FXU, Lat30], (instregex "PFMF$")>;
+def : InstRW<[FXU, Lat30], (instregex "TB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGIN$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PGOUT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Dynamic-Address-Translation Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "IDTE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "CRDTE(Opt)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTLB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "CSP(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LPTEA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LRA(Y|G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STRAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "LURA(G)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STUR(A|G)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Memory-move Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>;
+def : InstRW<[LSU, Lat6, Lat30, GroupAlone], (instregex "MVCSK$")>;
+def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>;
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Address-Space Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>;
+def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>;
+def : InstRW<[FXU, Lat30], (instregex "PR$")>;
+def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "RP$")>;
+def : InstRW<[FXU, Lat30], (instregex "BS(G|A)$")>;
+def : InstRW<[FXU, Lat20], (instregex "TAR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Linkage-Stack Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>;
+def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>;
+def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Time-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "PTFF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCK$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCKPF$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SCKC$")>;
+def : InstRW<[FXU, LSU, Lat20], (instregex "SPT$")>;
+def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "STCK(F)?$")>;
+def : InstRW<[LSU, LSU, LSU, LSU, FXU, FXU, Lat20, GroupAlone],
+ (instregex "STCKE$")>;
+def : InstRW<[FXU, LSU, Lat9], (instregex "STCKC$")>;
+def : InstRW<[FXU, LSU, Lat8], (instregex "STPT$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Related Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, LSU, Lat30], (instregex "STAP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STIDP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STSI$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STFL(E)?$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECAG$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "ECTG$")>;
+def : InstRW<[FXU, Lat30], (instregex "PTF$")>;
+def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>;
+
+//===----------------------------------------------------------------------===//
+// System: Miscellaneous Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "SVC$")>;
+def : InstRW<[FXU, GroupAlone], (instregex "MC$")>;
+def : InstRW<[FXU, Lat30], (instregex "DIAG$")>;
+def : InstRW<[FXU], (instregex "TRAC(E|G)$")>;
+def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>;
+def : InstRW<[FXU, Lat30], (instregex "SIGP$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>;
+
+//===----------------------------------------------------------------------===//
+// System: CPU-Measurement Facility Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU], (instregex "LPP$")>;
+def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>;
+def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>;
+def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>;
+def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>;
+
+//===----------------------------------------------------------------------===//
+// System: I/O Instructions
+//===----------------------------------------------------------------------===//
+
+def : InstRW<[FXU, Lat30], (instregex "(C|H|R|X)SCH$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>;
+def : InstRW<[FXU, Lat30], (instregex "RCHP$")>;
+def : InstRW<[FXU, Lat30], (instregex "SCHM$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "STC(PS|RW)$")>;
+def : InstRW<[FXU, LSU, Lat30], (instregex "TPI$")>;
+def : InstRW<[FXU, Lat30], (instregex "SAL$")>;
}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
index 83882fc..13ceb37 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -14,9 +14,9 @@
//===----------------------------------------------------------------------===//
#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
@@ -167,10 +167,10 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
MI.RemoveOperand(0);
MI.setDesc(TII->get(Opcode));
MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
- .addOperand(Dest)
- .addOperand(Mode)
- .addOperand(Src)
- .addOperand(Suppress);
+ .add(Dest)
+ .add(Mode)
+ .add(Src)
+ .add(Suppress);
return true;
}
return false;
@@ -200,14 +200,26 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
Changed |= shortenOn001AddCC(MI, SystemZ::ADBR);
break;
+ case SystemZ::WFASB:
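+      // WF*SB instructions operate on the low 32-bit lane of a vector
+      // register; when their operands live in FP32 registers they can be
+      // re-encoded as the shorter scalar forms (here AEBR), mirroring the
+      // existing WF*DB handling.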
+ Changed |= shortenOn001AddCC(MI, SystemZ::AEBR);
+ break;
+
case SystemZ::WFDDB:
Changed |= shortenOn001(MI, SystemZ::DDBR);
break;
+ case SystemZ::WFDSB:
+ Changed |= shortenOn001(MI, SystemZ::DEBR);
+ break;
+
case SystemZ::WFIDB:
Changed |= shortenFPConv(MI, SystemZ::FIDBRA);
break;
+ case SystemZ::WFISB:
+ Changed |= shortenFPConv(MI, SystemZ::FIEBRA);
+ break;
+
case SystemZ::WLDEB:
Changed |= shortenOn01(MI, SystemZ::LDEBR);
break;
@@ -220,30 +232,58 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
Changed |= shortenOn001(MI, SystemZ::MDBR);
break;
+ case SystemZ::WFMSB:
+ Changed |= shortenOn001(MI, SystemZ::MEEBR);
+ break;
+
case SystemZ::WFLCDB:
Changed |= shortenOn01(MI, SystemZ::LCDFR);
break;
+ case SystemZ::WFLCSB:
+ Changed |= shortenOn01(MI, SystemZ::LCDFR_32);
+ break;
+
case SystemZ::WFLNDB:
Changed |= shortenOn01(MI, SystemZ::LNDFR);
break;
+ case SystemZ::WFLNSB:
+ Changed |= shortenOn01(MI, SystemZ::LNDFR_32);
+ break;
+
case SystemZ::WFLPDB:
Changed |= shortenOn01(MI, SystemZ::LPDFR);
break;
+ case SystemZ::WFLPSB:
+ Changed |= shortenOn01(MI, SystemZ::LPDFR_32);
+ break;
+
case SystemZ::WFSQDB:
Changed |= shortenOn01(MI, SystemZ::SQDBR);
break;
+ case SystemZ::WFSQSB:
+ Changed |= shortenOn01(MI, SystemZ::SQEBR);
+ break;
+
case SystemZ::WFSDB:
Changed |= shortenOn001AddCC(MI, SystemZ::SDBR);
break;
+ case SystemZ::WFSSB:
+ Changed |= shortenOn001AddCC(MI, SystemZ::SEBR);
+ break;
+
case SystemZ::WFCDB:
Changed |= shortenOn01(MI, SystemZ::CDBR);
break;
+ case SystemZ::WFCSB:
+ Changed |= shortenOn01(MI, SystemZ::CEBR);
+ break;
+
case SystemZ::VL32:
// For z13 we prefer LDE over LE to avoid partial register dependencies.
Changed |= shortenOn0(MI, SystemZ::LDE32);
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index ce07ea3..9cd09b0 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -37,12 +37,20 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
const TargetMachine &TM)
: SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
- HasPopulationCount(false), HasFastSerialization(false),
- HasInterlockedAccess1(false), HasMiscellaneousExtensions(false),
+ HasPopulationCount(false), HasMessageSecurityAssist3(false),
+ HasMessageSecurityAssist4(false), HasResetReferenceBitsMultiple(false),
+ HasFastSerialization(false), HasInterlockedAccess1(false),
+ HasMiscellaneousExtensions(false),
HasExecutionHint(false), HasLoadAndTrap(false),
HasTransactionalExecution(false), HasProcessorAssist(false),
+ HasDFPZonedConversion(false), HasEnhancedDAT2(false),
HasVector(false), HasLoadStoreOnCond2(false),
- HasLoadAndZeroRightmostByte(false),
+ HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false),
+ HasDFPPackedConversion(false),
+ HasMiscellaneousExtensions2(false), HasGuardedStorage(false),
+ HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false),
+ HasVectorEnhancements1(false), HasVectorPackedDecimal(false),
+ HasInsertReferenceBitsMultiple(false),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), TSInfo(), FrameLowering() {}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index cdb6132..4829f73 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -19,8 +19,8 @@
#include "SystemZInstrInfo.h"
#include "SystemZRegisterInfo.h"
#include "SystemZSelectionDAGInfo.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -39,6 +39,9 @@ protected:
bool HasHighWord;
bool HasFPExtension;
bool HasPopulationCount;
+ bool HasMessageSecurityAssist3;
+ bool HasMessageSecurityAssist4;
+ bool HasResetReferenceBitsMultiple;
bool HasFastSerialization;
bool HasInterlockedAccess1;
bool HasMiscellaneousExtensions;
@@ -46,9 +49,20 @@ protected:
bool HasLoadAndTrap;
bool HasTransactionalExecution;
bool HasProcessorAssist;
+ bool HasDFPZonedConversion;
+ bool HasEnhancedDAT2;
bool HasVector;
bool HasLoadStoreOnCond2;
bool HasLoadAndZeroRightmostByte;
+ bool HasMessageSecurityAssist5;
+ bool HasDFPPackedConversion;
+ bool HasMiscellaneousExtensions2;
+ bool HasGuardedStorage;
+ bool HasMessageSecurityAssist7;
+ bool HasMessageSecurityAssist8;
+ bool HasVectorEnhancements1;
+ bool HasVectorPackedDecimal;
+ bool HasInsertReferenceBitsMultiple;
private:
Triple TargetTriple;
@@ -104,6 +118,19 @@ public:
// Return true if the target has the population-count facility.
bool hasPopulationCount() const { return HasPopulationCount; }
+ // Return true if the target has the message-security-assist
+ // extension facility 3.
+ bool hasMessageSecurityAssist3() const { return HasMessageSecurityAssist3; }
+
+ // Return true if the target has the message-security-assist
+ // extension facility 4.
+ bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; }
+
+ // Return true if the target has the reset-reference-bits-multiple facility.
+ bool hasResetReferenceBitsMultiple() const {
+ return HasResetReferenceBitsMultiple;
+ }
+
// Return true if the target has the fast-serialization facility.
bool hasFastSerialization() const { return HasFastSerialization; }
@@ -127,14 +154,54 @@ public:
// Return true if the target has the processor-assist facility.
bool hasProcessorAssist() const { return HasProcessorAssist; }
+ // Return true if the target has the DFP zoned-conversion facility.
+ bool hasDFPZonedConversion() const { return HasDFPZonedConversion; }
+
+ // Return true if the target has the enhanced-DAT facility 2.
+ bool hasEnhancedDAT2() const { return HasEnhancedDAT2; }
+
// Return true if the target has the load-and-zero-rightmost-byte facility.
bool hasLoadAndZeroRightmostByte() const {
return HasLoadAndZeroRightmostByte;
}
+ // Return true if the target has the message-security-assist
+ // extension facility 5.
+ bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; }
+
+ // Return true if the target has the DFP packed-conversion facility.
+ bool hasDFPPackedConversion() const { return HasDFPPackedConversion; }
+
// Return true if the target has the vector facility.
bool hasVector() const { return HasVector; }
+ // Return true if the target has the miscellaneous-extensions facility 2.
+ bool hasMiscellaneousExtensions2() const {
+ return HasMiscellaneousExtensions2;
+ }
+
+ // Return true if the target has the guarded-storage facility.
+ bool hasGuardedStorage() const { return HasGuardedStorage; }
+
+ // Return true if the target has the message-security-assist
+ // extension facility 7.
+ bool hasMessageSecurityAssist7() const { return HasMessageSecurityAssist7; }
+
+ // Return true if the target has the message-security-assist
+ // extension facility 8.
+ bool hasMessageSecurityAssist8() const { return HasMessageSecurityAssist8; }
+
+ // Return true if the target has the vector-enhancements facility 1.
+ bool hasVectorEnhancements1() const { return HasVectorEnhancements1; }
+
+ // Return true if the target has the vector-packed-decimal facility.
+ bool hasVectorPackedDecimal() const { return HasVectorPackedDecimal; }
+
+ // Return true if the target has the insert-reference-bits-multiple facility.
+ bool hasInsertReferenceBitsMultiple() const {
+ return HasInsertReferenceBitsMultiple;
+ }
+
  // Return true if GV can be accessed using LARL for code model CM.
bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp
index 96a9ef8..5dbd23d 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTDC.cpp
@@ -47,10 +47,10 @@
#include "SystemZ.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include <deque>
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 33fdb8f..025bf73 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -8,13 +8,24 @@
//===----------------------------------------------------------------------===//
#include "SystemZTargetMachine.h"
-#include "SystemZTargetTransformInfo.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZ.h"
#include "SystemZMachineScheduler.h"
+#include "SystemZTargetTransformInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include <string>
using namespace llvm;
@@ -48,7 +59,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) {
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
StringRef FS) {
bool VectorABI = UsesVectorABI(CPU, FS);
- std::string Ret = "";
+ std::string Ret;
// Big endian.
Ret += "E";
@@ -96,18 +107,19 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, computeDataLayout(TT, CPU, FS), TT, CPU, FS, Options,
getEffectiveRelocModel(RM), CM, OL),
- TLOF(make_unique<TargetLoweringObjectFileELF>()),
+ TLOF(llvm::make_unique<TargetLoweringObjectFileELF>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
-SystemZTargetMachine::~SystemZTargetMachine() {}
+SystemZTargetMachine::~SystemZTargetMachine() = default;
namespace {
+
/// SystemZ Code Generator Pass Configuration Options.
class SystemZPassConfig : public TargetPassConfig {
public:
- SystemZPassConfig(SystemZTargetMachine *TM, PassManagerBase &PM)
+ SystemZPassConfig(SystemZTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
SystemZTargetMachine &getSystemZTargetMachine() const {
@@ -116,7 +128,8 @@ public:
ScheduleDAGInstrs *
createPostMachineScheduler(MachineSchedContext *C) const override {
- return new ScheduleDAGMI(C, make_unique<SystemZPostRASchedStrategy>(C),
+ return new ScheduleDAGMI(C,
+ llvm::make_unique<SystemZPostRASchedStrategy>(C),
/*RemoveKillFlags=*/true);
}
@@ -126,11 +139,14 @@ public:
void addPreSched2() override;
void addPreEmitPass() override;
};
+
} // end anonymous namespace
void SystemZPassConfig::addIRPasses() {
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None) {
addPass(createSystemZTDCPass());
+ addPass(createLoopDataPrefetchPass());
+ }
TargetPassConfig::addIRPasses();
}
@@ -157,7 +173,6 @@ void SystemZPassConfig::addPreSched2() {
}
void SystemZPassConfig::addPreEmitPass() {
-
// Do instruction shortening before compare elimination because some
// vector instructions will be shortened into opcodes that compare
// elimination recognizes.
@@ -199,7 +214,7 @@ void SystemZPassConfig::addPreEmitPass() {
}
TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new SystemZPassConfig(this, PM);
+ return new SystemZPassConfig(*this, PM);
}
TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() {
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
index 69cf9bc..a10ca64 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -1,4 +1,4 @@
-//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//=- SystemZTargetMachine.h - Define TargetMachine for SystemZ ----*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -16,15 +16,18 @@
#define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
#include "SystemZSubtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
-class TargetFrameLowering;
-
class SystemZTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- SystemZSubtarget Subtarget;
+ SystemZSubtarget Subtarget;
public:
SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -34,20 +37,22 @@ public:
~SystemZTargetMachine() override;
const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
const SystemZSubtarget *getSubtargetImpl(const Function &) const override {
return &Subtarget;
}
+
// Override LLVMTargetMachine
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
bool targetSchedulesPostRAScheduling() const override { return true; };
-
};
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZTARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b10c0e0..506dc74 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -238,7 +238,7 @@ SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
+void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Find out if L contains a call, what the machine instruction count
// estimate is, and how many stores there are.
@@ -259,11 +259,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
}
}
if (isa<StoreInst>(&I)) {
- NumStores++;
Type *MemAccessTy = I.getOperand(0)->getType();
- if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
- (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
- NumStores++; // 128 bit fp/int stores get split.
+ NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
}
}
@@ -305,7 +302,7 @@ unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
return 0;
}
-unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
if (!Vector)
return 64;
if (ST->hasVector())
@@ -313,3 +310,581 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
return 0;
}
+int SystemZTTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args) {
+
+  // TODO: Return a good value for the BB vectorizer that includes the
+  // immediate loads, which we do not want to count for the loop
+  // vectorizer, since they are hopefully hoisted out of the loop. This
+  // would require a new parameter 'InLoop', but it is not clear whether
+  // constant args are common enough to motivate this.
+
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+ // Div with a constant which is a power of 2 will be converted by
+ // DAGCombiner to use shifts. With vector shift-element instructions, a
+ // vector sdiv costs about as much as a scalar one.
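+  // For example, 'sdiv <4 x i32> %x, <splat 8>' is matched below and costed
+  // at SDivCostEstimate per vector part instead of the expensive generic
+  // division fallback.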
+ const unsigned SDivCostEstimate = 4;
+ bool SDivPow2 = false;
+ bool UDivPow2 = false;
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv) &&
+ Args.size() == 2) {
+ const ConstantInt *CI = nullptr;
+ if (const Constant *C = dyn_cast<Constant>(Args[1])) {
+ if (C->getType()->isVectorTy())
+ CI = dyn_cast_or_null<const ConstantInt>(C->getSplatValue());
+ else
+ CI = dyn_cast<const ConstantInt>(C);
+ }
+ if (CI != nullptr &&
+ (CI->getValue().isPowerOf2() || (-CI->getValue()).isPowerOf2())) {
+ if (Opcode == Instruction::SDiv)
+ SDivPow2 = true;
+ else
+ UDivPow2 = true;
+ }
+ }
+
+ if (Ty->isVectorTy()) {
+    assert (ST->hasVector() &&
+            "getArithmeticInstrCost() called with vector type.");
+ unsigned VF = Ty->getVectorNumElements();
+ unsigned NumVectors = getNumberOfParts(Ty);
+
+ // These vector operations are custom handled, but are still supported
+ // with one instruction per vector, regardless of element size.
+ if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
+ Opcode == Instruction::AShr || UDivPow2) {
+ return NumVectors;
+ }
+
+ if (SDivPow2)
+ return (NumVectors * SDivCostEstimate);
+
+ // These FP operations are supported with a single vector instruction for
+ // double (base implementation assumes float generally costs 2). For
+ // FP128, the scalar cost is 1, and there is no overhead since the values
+ // are already in scalar registers.
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
+ switch (ScalarBits) {
+ case 32: {
+ // The vector enhancements facility 1 provides v4f32 instructions.
+ if (ST->hasVectorEnhancements1())
+ return NumVectors;
+        // Return the cost of multiple scalar invocations plus the cost of
+        // inserting and extracting the values.
+ unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+        // FIXME: VF 2 for these FP operations is currently just as
+        // expensive as VF 4.
+ if (VF == 2)
+ Cost *= 2;
+ return Cost;
+ }
+ case 64:
+ case 128:
+ return NumVectors;
+ default:
+ break;
+ }
+ }
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem) {
+ unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+ // FIXME: VF 2 for float is currently just as expensive as for VF 4.
+ if (VF == 2 && ScalarBits == 32)
+ Cost *= 2;
+ return Cost;
+ }
+ }
+  else { // Scalar
+ // These FP operations are supported with a dedicated instruction for
+ // float, double and fp128 (base implementation assumes float generally
+ // costs 2).
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+ return 1;
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem)
+ return LIBCALL_COST;
+
+ if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
+ return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
+
+ // Or requires one instruction, although it has custom handling for i64.
+ if (Opcode == Instruction::Or)
+ return 1;
+
+ if (Opcode == Instruction::Xor && ScalarBits == 1)
+ // 2 * ipm sequences ; xor ; shift ; compare
+ return 7;
+
+ if (UDivPow2)
+ return 1;
+ if (SDivPow2)
+ return SDivCostEstimate;
+
+ // An extra extension for narrow types is needed.
+ if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+ // sext of op(s) for narrow types
+ return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
+
+ if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+ // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
+ return (ScalarBits < 32 ? 4 : 2);
+ }
+
+  // Fall back to the default implementation.
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo, Args);
+}
+
+
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) {
+ assert (Tp->isVectorTy());
+ assert (ST->hasVector() && "getShuffleCost() called.");
+ unsigned NumVectors = getNumberOfParts(Tp);
+
+ // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+
+ // FP128 values are always in scalar registers, so there is no work
+ // involved with a shuffle, except for broadcast. In that case register
+ // moves are done with a single instruction per element.
+ if (Tp->getScalarType()->isFP128Ty())
+ return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+
+ switch (Kind) {
+ case TargetTransformInfo::SK_ExtractSubvector:
+ // ExtractSubvector Index indicates start offset.
+
+ // Extracting a subvector from first index is a noop.
+ return (Index == 0 ? 0 : NumVectors);
+
+ case TargetTransformInfo::SK_Broadcast:
+    // The loop vectorizer calls here to figure out the extra cost of
+ // broadcasting a loaded value to all elements of a vector. Since vlrep
+ // loads and replicates with a single instruction, adjust the returned
+ // value.
+ return NumVectors - 1;
+
+ default:
+
+ // SystemZ supports single instruction permutation / replication.
+ return NumVectors;
+ }
+
+ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}
+
+// Return the log2 difference of the element sizes of the two vector types.
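+// For example, <4 x i64> vs. <4 x i8>: Log2_32(64) - Log2_32(8) = 6 - 3 = 3.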
+static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
+ unsigned Bits0 = Ty0->getScalarSizeInBits();
+ unsigned Bits1 = Ty1->getScalarSizeInBits();
+
+ if (Bits1 > Bits0)
+ return (Log2_32(Bits1) - Log2_32(Bits0));
+
+ return (Log2_32(Bits0) - Log2_32(Bits1));
+}
+
+// Return the number of instructions needed to truncate SrcTy to DstTy.
+unsigned SystemZTTIImpl::
+getVectorTruncCost(Type *SrcTy, Type *DstTy) {
+ assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
+ assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
+ "Packing must reduce size of vector type.");
+ assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
+ "Packing should not change number of elements.");
+
+ // TODO: Since fp32 is expanded, the extract cost should always be 0.
+
+ unsigned NumParts = getNumberOfParts(SrcTy);
+ if (NumParts <= 2)
+ // Up to 2 vector registers can be truncated efficiently with pack or
+ // permute. The latter requires an immediate mask to be loaded, which
+ // typically gets hoisted out of a loop. TODO: return a good value for
+ // BB-VECTORIZER that includes the immediate loads, which we do not want
+ // to count for the loop vectorizer.
+ return 1;
+
+ unsigned Cost = 0;
+ unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+ unsigned VF = SrcTy->getVectorNumElements();
+ for (unsigned P = 0; P < Log2Diff; ++P) {
+ if (NumParts > 1)
+ NumParts /= 2;
+ Cost += NumParts;
+ }
+
+ // Currently, a general mix of permutes and pack instructions is output by
+ // isel, which follow the cost computation above except for this case which
+ // is one instruction less:
+ if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
+ DstTy->getScalarSizeInBits() == 8)
+ Cost--;
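+  // For example, <8 x i64> -> <8 x i8>: 4 source parts and Log2Diff = 3
+  // give 2 + 1 + 1 = 4 steps, minus one for this case = 3.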
+
+ return Cost;
+}
+
+// Return the cost of converting a vector bitmask produced by a compare
+// (SrcTy), to the type of the select or extend instruction (DstTy).
+unsigned SystemZTTIImpl::
+getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
+ assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
+ "Should only be called with vector types.");
+
+ unsigned PackCost = 0;
+ unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
+ unsigned DstScalarBits = DstTy->getScalarSizeInBits();
+ unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
+ if (SrcScalarBits > DstScalarBits)
+ // The bitmask will be truncated.
+ PackCost = getVectorTruncCost(SrcTy, DstTy);
+ else if (SrcScalarBits < DstScalarBits) {
+ unsigned DstNumParts = getNumberOfParts(DstTy);
+ // Each vector select needs its part of the bitmask unpacked.
+ PackCost = Log2Diff * DstNumParts;
+ // Extra cost for moving part of mask before unpacking.
+ PackCost += DstNumParts - 1;
+ }
+
+ return PackCost;
+}
+
+// Return the type of the compared operands. This is needed to compute the
+// cost for a Select / ZExt or SExt instruction.
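+// For example, for 'select (fcmp olt double %a, %b), ...' with VF = 2 this
+// returns <2 x double>.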
+static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
+ Type *OpTy = nullptr;
+ if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
+ OpTy = CI->getOperand(0)->getType();
+ else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
+ if (LogicI->getNumOperands() == 2)
+ if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
+ if (isa<CmpInst>(LogicI->getOperand(1)))
+ OpTy = CI0->getOperand(0)->getType();
+
+ if (OpTy != nullptr) {
+ if (VF == 1) {
+ assert (!OpTy->isVectorTy() && "Expected scalar type");
+ return OpTy;
+ }
+ // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
+    // be either scalar or already vectorized with the same or a lesser VF.
+ Type *ElTy = OpTy->getScalarType();
+ return VectorType::get(ElTy, VF);
+ }
+
+ return nullptr;
+}
+
+int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
+ unsigned DstScalarBits = Dst->getScalarSizeInBits();
+ unsigned SrcScalarBits = Src->getScalarSizeInBits();
+
+ if (Src->isVectorTy()) {
+ assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
+ assert (Dst->isVectorTy());
+ unsigned VF = Src->getVectorNumElements();
+ unsigned NumDstVectors = getNumberOfParts(Dst);
+ unsigned NumSrcVectors = getNumberOfParts(Src);
+
+ if (Opcode == Instruction::Trunc) {
+ if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
+ return 0; // Check for NOOP conversions.
+ return getVectorTruncCost(Src, Dst);
+ }
+
+ if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+ if (SrcScalarBits >= 8) {
+ // ZExt/SExt will be handled with one unpack per doubling of width.
+ unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
+
+        // For types that span multiple vector registers, some additional
+        // instructions are used to set up the unpacking.
+ unsigned NumSrcVectorOps =
+ (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
+ : (NumDstVectors / 2));
+
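+        // For example, sext <8 x i16> to <8 x i64>: two unpack steps over
+        // four destination vectors plus three setup ops = 2 * 4 + 3 = 11.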
+ return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
+ }
+ else if (SrcScalarBits == 1) {
+        // This should be an extension of a compare i1 result.
+        // If we know the widths of the compared operands, get the cost of
+        // converting the compare result to Dst; otherwise assume the same
+        // widths.
+ unsigned Cost = 0;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+ if (CmpOpTy != nullptr)
+ Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+ if (Opcode == Instruction::ZExt)
+ // One 'vn' per dst vector with an immediate mask.
+ Cost += NumDstVectors;
+ return Cost;
+ }
+ }
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
+ Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
+      // TODO: Fix the base implementation, which could simplify things a
+      // bit here (it seems to miss differentiating between scalar and
+      // vector types).
+
+ // Only 64 bit vector conversions are natively supported.
+ if (SrcScalarBits == 64 && DstScalarBits == 64)
+ return NumDstVectors;
+
+      // Return the cost of multiple scalar invocations plus the cost of
+      // inserting and extracting the values. The base implementation does
+      // not realize that float->int gets scalarized.
+ unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
+ Src->getScalarType());
+ unsigned TotCost = VF * ScalarCost;
+ bool NeedsInserts = true, NeedsExtracts = true;
+ // FP128 registers do not get inserted or extracted.
+ if (DstScalarBits == 128 &&
+ (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
+ NeedsInserts = false;
+ if (SrcScalarBits == 128 &&
+ (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
+ NeedsExtracts = false;
+
+ TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+
+ // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
+ if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
+ TotCost *= 2;
+
+ return TotCost;
+ }
+
+ if (Opcode == Instruction::FPTrunc) {
+ if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
+ return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+ else // double -> float
+ return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
+ }
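+    // For example, fptrunc <4 x double> to <4 x float> is costed as two
+    // vledb plus one vperm = 3.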
+
+ if (Opcode == Instruction::FPExt) {
+ if (SrcScalarBits == 32 && DstScalarBits == 64) {
+ // float -> double is very rare and currently unoptimized. Instead of
+ // using vldeb, which can do two at a time, all conversions are
+ // scalarized.
+ return VF * 2;
+ }
+ // -> fp128. VF * lxdb/lxeb + extraction of elements.
+ return VF + getScalarizationOverhead(Src, false, true);
+ }
+ }
+ else { // Scalar
+ assert (!Dst->isVectorTy());
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
+ return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+
+ if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+ Src->isIntegerTy(1)) {
+      // This should be an extension of a compare i1 result, which is done
+      // with ipm and a varying sequence of instructions.
+ unsigned Cost = 0;
+ if (Opcode == Instruction::SExt)
+ Cost = (DstScalarBits < 64 ? 3 : 4);
+ if (Opcode == Instruction::ZExt)
+ Cost = 3;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+ if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+        // If operands of an fp type were compared, this costs +1.
+ Cost++;
+
+ return Cost;
+ }
+ }
+
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+}
+
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
+ if (ValTy->isVectorTy()) {
+ assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
+ unsigned VF = ValTy->getVectorNumElements();
+
+ // Called with a compare instruction.
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
+ unsigned PredicateExtraCost = 0;
+ if (I != nullptr) {
+ // Some predicates cost one or two extra instructions.
+ switch (dyn_cast<CmpInst>(I)->getPredicate()) {
+ case CmpInst::Predicate::ICMP_NE:
+ case CmpInst::Predicate::ICMP_UGE:
+ case CmpInst::Predicate::ICMP_ULE:
+ case CmpInst::Predicate::ICMP_SGE:
+ case CmpInst::Predicate::ICMP_SLE:
+ PredicateExtraCost = 1;
+ break;
+ case CmpInst::Predicate::FCMP_ONE:
+ case CmpInst::Predicate::FCMP_ORD:
+ case CmpInst::Predicate::FCMP_UEQ:
+ case CmpInst::Predicate::FCMP_UNO:
+ PredicateExtraCost = 2;
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
+      // floats. FIXME: <2 x float> generates the same code as <4 x float>.
+ unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
+ unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+
+ unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
+ return Cost;
+ }
+ else { // Called with a select instruction.
+ assert (Opcode == Instruction::Select);
+
+ // We can figure out the extra cost of packing / unpacking if the
+ // instruction was passed and the compare instruction is found.
+ unsigned PackCost = 0;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+ if (CmpOpTy != nullptr)
+ PackCost =
+ getVectorBitmaskConversionCost(CmpOpTy, ValTy);
+
+ return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+ }
+ }
+ else { // Scalar
+ switch (Opcode) {
+ case Instruction::ICmp: {
+ unsigned Cost = 1;
+ if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+ Cost += 2; // extend both operands
+ return Cost;
+ }
+ case Instruction::Select:
+ if (ValTy->isFloatingPointTy())
+        // No load on condition for FP, so this costs a conditional jump.
+        return 4;
+ return 1; // Load On Condition.
+ }
+ }
+
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+}
+
+int SystemZTTIImpl::
+getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  // vlvgp will insert two GPRs into a vector register, so only count half
+  // the number of instructions.
+ if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
+ return ((Index % 2 == 0) ? 1 : 0);
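+  // For example, building <2 x i64> from two GPRs: a single vlvgp covers
+  // indices 0 and 1, so the even index pays for the instruction and the odd
+  // index is free.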
+
+ if (Opcode == Instruction::ExtractElement) {
+ int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1);
+
+ // Give a slight penalty for moving out of vector pipeline to FXU unit.
+ if (Index == 0 && Val->isIntOrIntVectorTy())
+ Cost += 1;
+
+ return Cost;
+ }
+
+ return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
+ assert(!Src->isVoidTy() && "Invalid type");
+
+ if (!Src->isVectorTy() && Opcode == Instruction::Load &&
+ I != nullptr && I->hasOneUse()) {
+ const Instruction *UserI = cast<Instruction>(*I->user_begin());
+ unsigned Bits = Src->getScalarSizeInBits();
+ bool FoldsLoad = false;
+ switch (UserI->getOpcode()) {
+ case Instruction::ICmp:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+      // This also makes sense for float operations, but it is disabled for
+      // now due to regressions.
+ // case Instruction::FCmp:
+ // case Instruction::FAdd:
+ // case Instruction::FSub:
+ // case Instruction::FMul:
+ // case Instruction::FDiv:
+ FoldsLoad = (Bits == 32 || Bits == 64);
+ break;
+ }
+
+ if (FoldsLoad) {
+ assert (UserI->getNumOperands() == 2 &&
+ "Expected to only handle binops.");
+
+      // UserI can't fold two loads, so if both operands are one-use loads,
+      // report the fold (cost 0) for only one of them.
+ for (unsigned i = 0; i < 2; ++i) {
+ if (UserI->getOperand(i) == I)
+ continue;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UserI->getOperand(i))) {
+ if (LI->hasOneUse())
+ return i == 0;
+ }
+ }
+
+ return 0;
+ }
+ }
+
+ unsigned NumOps = getNumberOfParts(Src);
+
+ if (Src->getScalarSizeInBits() == 128)
+ // 128 bit scalars are held in a pair of two 64 bit registers.
+ NumOps *= 2;
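+  // For example, a load of i128 or fp128 is costed as 2, one operation per
+  // 64-bit register of the pair.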
+
+ return NumOps;
+}
+
+int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+ assert(isa<VectorType>(VecTy) &&
+ "Expect a vector type for interleaved memory op");
+
+ unsigned WideBits = (VecTy->isPtrOrPtrVectorTy() ?
+ (64U * VecTy->getVectorNumElements()) : VecTy->getPrimitiveSizeInBits());
+ assert (WideBits > 0 && "Could not compute size of vector");
+ int NumWideParts =
+ ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
+
+ // How many source vectors are handled to produce a vectorized operand?
+ int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
+ int NumSrcParts =
+ ((NumWideParts > NumElsPerVector) ? NumElsPerVector : NumWideParts);
+
+  // A load group may have gaps.
+ unsigned NumOperands =
+ ((Opcode == Instruction::Load) ? Indices.size() : Factor);
+
+ // Each needed permute takes two vectors as input.
+ if (NumSrcParts > 1)
+ NumSrcParts--;
+ int NumPermutes = NumSrcParts * NumOperands;
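+  // For example, a two-member interleaved load of <8 x i32> using both
+  // members: 2 wide 128-bit parts + 2 permutes = cost 4 (a rough estimate;
+  // the actual lowering may differ).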
+
+ // Cost of load/store operations and the permutations needed.
+ return NumWideParts + NumPermutes;
+}
diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index f7d2d82..a0c6fa9 100644
--- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -27,6 +27,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
const SystemZSubtarget *getST() const { return ST; }
const SystemZTargetLowering *getTLI() const { return TLI; }
+ unsigned const LIBCALL_COST = 30;
+
public:
explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -43,7 +45,8 @@ public:
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
/// @}
@@ -51,8 +54,39 @@ public:
/// @{
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
-
+ unsigned getRegisterBitWidth(bool Vector) const;
+
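+  // These hooks tune the LoopDataPrefetch pass that SystemZPassConfig
+  // enables in addIRPasses().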
+ unsigned getCacheLineSize() { return 256; }
+ unsigned getPrefetchDistance() { return 2000; }
+ unsigned getMinPrefetchStride() { return 2048; }
+
+ bool prefersVectorizedAddressing() { return false; }
+ bool supportsEfficientVectorElementLoadStore() { return true; }
+ bool enableInterleavedAccessVectorization() { return true; }
+
+ int getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+ TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+ TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+ int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
+ unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
+ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace, const Instruction *I = nullptr);
+
+ int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace);
/// @}
};
diff --git a/contrib/llvm/lib/Target/Target.cpp b/contrib/llvm/lib/Target/Target.cpp
index 5d1616d..42d9262 100644
--- a/contrib/llvm/lib/Target/Target.cpp
+++ b/contrib/llvm/lib/Target/Target.cpp
@@ -14,12 +14,12 @@
#include "llvm-c/Target.h"
#include "llvm-c/Initialization.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include <cstring>
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
index 375f851..f941891 100644
--- a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -24,7 +25,6 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
@@ -44,7 +44,7 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx,
const TargetMachine &TM) {
Ctx = &ctx;
// `Initialize` can be called more than once.
- if (Mang != nullptr) delete Mang;
+ delete Mang;
Mang = new Mangler();
InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(),
TM.getCodeModel(), *Ctx);
@@ -240,6 +240,20 @@ MCSection *TargetLoweringObjectFile::SectionForGlobal(
if (GO->hasSection())
return getExplicitSectionGlobal(GO, Kind, TM);
+ if (auto *GVar = dyn_cast<GlobalVariable>(GO)) {
+ auto Attrs = GVar->getAttributes();
+ if ((Attrs.hasAttribute("bss-section") && Kind.isBSS()) ||
+ (Attrs.hasAttribute("data-section") && Kind.isData()) ||
+ (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly())) {
+ return getExplicitSectionGlobal(GO, Kind, TM);
+ }
+ }
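+  // For example, a GlobalVariable carrying "bss-section"="my_bss" (a
+  // hypothetical name) that classifies as BSS is routed through
+  // getExplicitSectionGlobal just as if it had an explicit section.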
+
+ if (auto *F = dyn_cast<Function>(GO)) {
+ if (F->hasFnAttribute("implicit-section-name"))
+ return getExplicitSectionGlobal(GO, Kind, TM);
+ }
+
// Use default section depending on the 'type' of global
return SelectSectionForGlobal(GO, Kind, TM);
}
@@ -264,10 +278,7 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
// in discardable section
// FIXME: this isn't the right predicate, should be based on the MCSection
// for the function.
- if (F.isWeakForLinker())
- return true;
-
- return false;
+ return F.isWeakForLinker();
}
/// Given a mergable constant with the specified size and relocation
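Note: the SectionForGlobal() hunk above lets front ends steer individual globals into named sections via string attributes, honored only when the attribute matches the computed section kind. A minimal sketch of how a client might set one up, assuming the GlobalVariable::addAttribute(StringRef, StringRef) API of this era of LLVM; the global and section names here are invented for illustration:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    void placeCounterInCustomBss(Module &M) {
      Type *I32 = Type::getInt32Ty(M.getContext());
      // Zero-initialized and writable, so its SectionKind is BSS and the
      // "bss-section" attribute takes effect.
      auto *GV = new GlobalVariable(M, I32, /*isConstant=*/false,
                                    GlobalValue::ExternalLinkage,
                                    Constant::getNullValue(I32), "counter");
      GV->addAttribute("bss-section", ".my_bss");
    }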
diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp
index 8a6d284..e8fe0a2 100644
--- a/contrib/llvm/lib/Target/TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/TargetMachine.cpp
@@ -74,10 +74,10 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
Options.X = DefaultOptions.X; \
} while (0)
- RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
+ RESET_OPTION(NoSignedZerosFPMath, "no-signed-zeros-fp-math");
RESET_OPTION(NoTrappingFPMath, "no-trapping-math");
StringRef Denormal =
@@ -156,8 +156,11 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
bool IsTLS = GV && GV->isThreadLocal();
bool IsAccessViaCopyRelocs =
Options.MCOptions.MCPIECopyRelocations && GV && isa<GlobalVariable>(GV);
- // Check if we can use copy relocations.
- if (!IsTLS && (RM == Reloc::Static || IsAccessViaCopyRelocs))
+ Triple::ArchType Arch = TT.getArch();
+ bool IsPPC =
+ Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::ppc64le;
+ // Check if we can use copy relocations. PowerPC has no copy relocations.
+ if (!IsTLS && !IsPPC && (RM == Reloc::Static || IsAccessViaCopyRelocs))
return true;
}
@@ -198,7 +201,7 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; }
void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; }
TargetIRAnalysis TargetMachine::getTargetIRAnalysis() {
- return TargetIRAnalysis([this](const Function &F) {
+ return TargetIRAnalysis([](const Function &F) {
return TargetTransformInfo(F.getParent()->getDataLayout());
});
}
diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp
index df12e0e..01f1493 100644
--- a/contrib/llvm/lib/Target/TargetMachineC.cpp
+++ b/contrib/llvm/lib/Target/TargetMachineC.cpp
@@ -11,13 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm-c/TargetMachine.h"
#include "llvm-c/Core.h"
#include "llvm-c/Target.h"
+#include "llvm-c/TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGenCWrappers.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
diff --git a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index b4763ca..9be11da 100644
--- a/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -15,8 +15,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
@@ -63,89 +63,8 @@ extern "C" void LLVMInitializeWebAssemblyDisassembler() {
MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
raw_ostream &OS, raw_ostream &CS) const {
- Size = 0;
- uint64_t Pos = 0;
-
- // Read the opcode.
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
-
- if (Opcode >= WebAssembly::INSTRUCTION_LIST_END)
- return MCDisassembler::Fail;
-
- MI.setOpcode(Opcode);
- const MCInstrDesc &Desc = MCII->get(Opcode);
- unsigned NumFixedOperands = Desc.NumOperands;
-
- // If it's variadic, read the number of extra operands.
- unsigned NumExtraOperands = 0;
- if (Desc.isVariadic()) {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- NumExtraOperands = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- }
-
- // Read the fixed operands. These are described by the MCInstrDesc.
- for (unsigned i = 0; i < NumFixedOperands; ++i) {
- const MCOperandInfo &Info = Desc.OpInfo[i];
- switch (Info.OperandType) {
- case MCOI::OPERAND_IMMEDIATE:
- case WebAssembly::OPERAND_LOCAL:
- case WebAssembly::OPERAND_P2ALIGN:
- case WebAssembly::OPERAND_BASIC_BLOCK: {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- MI.addOperand(MCOperand::createImm(Imm));
- break;
- }
- case MCOI::OPERAND_REGISTER: {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- MI.addOperand(MCOperand::createReg(Reg));
- break;
- }
- case WebAssembly::OPERAND_F32IMM:
- case WebAssembly::OPERAND_F64IMM: {
- // TODO: MC converts all floating point immediate operands to double.
- // This is fine for numeric values, but may cause NaNs to change bits.
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- uint64_t Bits = support::endian::read64le(Bytes.data() + Pos);
- Pos += sizeof(uint64_t);
- double Imm;
- memcpy(&Imm, &Bits, sizeof(Imm));
- MI.addOperand(MCOperand::createFPImm(Imm));
- break;
- }
- default:
- llvm_unreachable("unimplemented operand kind");
- }
- }
- // Read the extra operands.
- assert(NumExtraOperands == 0 || Desc.isVariadic());
- for (unsigned i = 0; i < NumExtraOperands; ++i) {
- if (Pos + sizeof(uint64_t) > Bytes.size())
- return MCDisassembler::Fail;
- if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) {
- // Decode extra immediate operands.
- uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
- MI.addOperand(MCOperand::createImm(Imm));
- } else {
- // Decode extra register operands.
- uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
- MI.addOperand(MCOperand::createReg(Reg));
- }
- Pos += sizeof(uint64_t);
- }
+ // TODO: Implement disassembly.
- Size = Pos;
- return MCDisassembler::Success;
+ return MCDisassembler::Fail;
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 0af13cf..f31dde0 100644
--- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -242,3 +242,17 @@ const char *llvm::WebAssembly::TypeToString(MVT Ty) {
llvm_unreachable("unsupported type");
}
}
+
+const char *llvm::WebAssembly::TypeToString(wasm::ValType Type) {
+ switch (Type) {
+ case wasm::ValType::I32:
+ return "i32";
+ case wasm::ValType::I64:
+ return "i64";
+ case wasm::ValType::F32:
+ return "f32";
+ case wasm::ValType::F64:
+ return "f64";
+ }
+ llvm_unreachable("unsupported type");
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index d11f99c..b1de84d 100644
--- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -16,6 +16,7 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H
#include "llvm/ADT/SmallVector.h"
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCInstPrinter.h"
@@ -50,6 +51,7 @@ public:
namespace WebAssembly {
const char *TypeToString(MVT Ty);
+const char *TypeToString(wasm::ValType Type);
} // end namespace WebAssembly
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 97454a8..1357cb5 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -12,6 +12,7 @@
///
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
@@ -22,20 +23,22 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
-class WebAssemblyAsmBackend final : public MCAsmBackend {
+class WebAssemblyAsmBackendELF final : public MCAsmBackend {
bool Is64Bit;
public:
- explicit WebAssemblyAsmBackend(bool Is64Bit)
+ explicit WebAssemblyAsmBackendELF(bool Is64Bit)
: MCAsmBackend(), Is64Bit(Is64Bit) {}
- ~WebAssemblyAsmBackend() override {}
+ ~WebAssemblyAsmBackendELF() override {}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsPCRel) const override;
MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
@@ -61,6 +64,98 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
+class WebAssemblyAsmBackend final : public MCAsmBackend {
+ bool Is64Bit;
+
+public:
+ explicit WebAssemblyAsmBackend(bool Is64Bit)
+ : MCAsmBackend(), Is64Bit(Is64Bit) {}
+ ~WebAssemblyAsmBackend() override {}
+
+ unsigned getNumFixupKinds() const override {
+ return WebAssembly::NumTargetFixupKinds;
+ }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const override;
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+ // No instruction requires relaxation
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+ return false;
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+
+ void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ MCInst &Res) const override {}
+
+ bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count,
+ MCObjectWriter *OW) const {
+ for (uint64_t i = 0; i < Count; ++i)
+ OW->write8(WebAssembly::Nop);
+
+ return true;
+}
+
+void WebAssemblyAsmBackendELF::applyFixup(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const {
+ const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
+ assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
+
+ unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
+ if (Value == 0)
+ return; // Doesn't change encoding.
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+
+ // For each byte of the fragment that the fixup touches, mask in the
+ // bits from the fixup value.
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+MCObjectWriter *
+WebAssemblyAsmBackendELF::createObjectWriter(raw_pwrite_stream &OS) const {
+ return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+}
+
+const MCFixupKindInfo &
+WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[WebAssembly::NumTargetFixupKinds] = {
+ // This table *must* be in the order that the fixup_* kinds are defined in
+ // WebAssemblyFixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ { "fixup_code_sleb128_i32", 0, 5*8, 0 },
+ { "fixup_code_sleb128_i64", 0, 10*8, 0 },
+ { "fixup_code_uleb128_i32", 0, 5*8, 0 },
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
MCObjectWriter *OW) const {
if (Count == 0)
@@ -72,13 +167,15 @@ bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
return true;
}
-void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void WebAssemblyAsmBackend::applyFixup(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data,
+ uint64_t Value, bool IsPCRel) const {
const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
- unsigned NumBytes = (Info.TargetSize + 7) / 8;
+ unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
if (Value == 0)
return; // Doesn't change encoding.
@@ -86,7 +183,7 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// For each byte of the fragment that the fixup touches, mask in the
// bits from the fixup value.
@@ -96,10 +193,12 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
MCObjectWriter *
WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
- return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+ return createWebAssemblyWasmObjectWriter(OS, Is64Bit);
}
} // end anonymous namespace
MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
+ if (TT.isOSBinFormatELF())
+ return new WebAssemblyAsmBackendELF(TT.isArch64Bit());
return new WebAssemblyAsmBackend(TT.isArch64Bit());
}
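Note: the fixed widths in the fixup table above (5*8 bits for the i32 kinds, 10*8 for i64 in getFixupKindInfo()) fall out of the LEB128 format: each byte carries 7 payload bits, so a full N-bit value needs ceil(N/7) bytes. A quick check, mine rather than the patch's:

    #include <cstdio>

    // ceil(Bits / 7): the most bytes a Bits-wide value needs in LEB128.
    static unsigned maxLEB128Bytes(unsigned Bits) { return (Bits + 6) / 7; }

    int main() {
      std::printf("i32 needs %u bytes\n", maxLEB128Bytes(32)); // 5
      std::printf("i64 needs %u bytes\n", maxLEB128Bytes(64)); // 10
    }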
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
new file mode 100644
index 0000000..b0af63c
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
@@ -0,0 +1,31 @@
+//=- WebAssemblyFixupKinds.h - WebAssembly Specific Fixup Entries -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYFIXUPKINDS_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace WebAssembly {
+enum Fixups {
+ fixup_code_sleb128_i32 = FirstTargetFixupKind, // 32-bit signed
+ fixup_code_sleb128_i64, // 64-bit signed
+ fixup_code_uleb128_i32, // 32-bit unsigned
+
+ fixup_code_global_index, // 32-bit unsigned
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index d8c3921..5f8c78e 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -19,10 +19,10 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-mc-asm-info"
-WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+WebAssemblyMCAsmInfoELF::~WebAssemblyMCAsmInfoELF() {}
-WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
- PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
+WebAssemblyMCAsmInfoELF::WebAssemblyMCAsmInfoELF(const Triple &T) {
+ CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
// TODO: What should MaxInstLength be?
@@ -51,3 +51,33 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
// WebAssembly's stack is never executable.
UsesNonexecutableStackSection = false;
}
+
+WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+
+WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
+ CodePointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
+
+ // TODO: What should MaxInstLength be?
+
+ UseDataRegionDirectives = true;
+
+ // Use .skip instead of .zero because .zero is confusing when used with two
+ // arguments (it doesn't actually zero things out).
+ ZeroDirective = "\t.skip\t";
+
+ Data8bitsDirective = "\t.int8\t";
+ Data16bitsDirective = "\t.int16\t";
+ Data32bitsDirective = "\t.int32\t";
+ Data64bitsDirective = "\t.int64\t";
+
+ AlignmentIsInBytes = false;
+ COMMDirectiveAlignmentIsInBytes = false;
+ LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+
+ SupportsDebugInformation = true;
+
+ // For now, WebAssembly does not support exceptions.
+ ExceptionsType = ExceptionHandling::None;
+
+ // TODO: UseIntegratedAssembler?
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
index 2dcf2cd..d954709 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -16,12 +16,19 @@
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCAsmInfoWasm.h"
namespace llvm {
class Triple;
-class WebAssemblyMCAsmInfo final : public MCAsmInfoELF {
+class WebAssemblyMCAsmInfoELF final : public MCAsmInfoELF {
+public:
+ explicit WebAssemblyMCAsmInfoELF(const Triple &T);
+ ~WebAssemblyMCAsmInfoELF() override;
+};
+
+class WebAssemblyMCAsmInfo final : public MCAsmInfoWasm {
public:
explicit WebAssemblyMCAsmInfo(const Triple &T);
~WebAssemblyMCAsmInfo() override;
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index d0e0eec..3e3b52f 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -12,6 +12,7 @@
///
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
@@ -46,7 +47,7 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
const MCSubtargetInfo &STI) const override;
public:
- explicit WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+ WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
};
} // end anonymous namespace
@@ -63,6 +64,13 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
assert(Binary < UINT8_MAX && "Multi-byte opcodes not supported yet");
OS << uint8_t(Binary);
+ // For br_table instructions, encode the size of the table. In the MCInst,
+ // there's an index operand, one operand for each table entry, and the
+ // default operand.
+ if (MI.getOpcode() == WebAssembly::BR_TABLE_I32 ||
+ MI.getOpcode() == WebAssembly::BR_TABLE_I64)
+ encodeULEB128(MI.getNumOperands() - 2, OS);
+
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
const MCOperand &MO = MI.getOperand(i);
@@ -77,6 +85,10 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
encodeSLEB128(int32_t(MO.getImm()), OS);
} else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
encodeSLEB128(int64_t(MO.getImm()), OS);
+ } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
+ llvm_unreachable("wasm globals should only be accessed symbolicly");
+ } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
+ encodeSLEB128(int64_t(MO.getImm()), OS);
} else {
encodeULEB128(uint64_t(MO.getImm()), OS);
}
@@ -102,14 +114,31 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
support::endian::Writer<support::little>(OS).write<double>(d);
}
} else if (MO.isExpr()) {
+ const MCOperandInfo &Info = Desc.OpInfo[i];
+ llvm::MCFixupKind FixupKind;
+ size_t PaddedSize;
+ if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i32);
+ PaddedSize = 5;
+ } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i64);
+ PaddedSize = 10;
+ } else if (Info.OperandType == WebAssembly::OPERAND_FUNCTION32 ||
+ Info.OperandType == WebAssembly::OPERAND_OFFSET32 ||
+ Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32);
+ PaddedSize = 5;
+ } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
+ FixupKind = MCFixupKind(WebAssembly::fixup_code_global_index);
+ PaddedSize = 5;
+ } else {
+ llvm_unreachable("unexpected symbolic operand kind");
+ }
Fixups.push_back(MCFixup::create(
OS.tell() - Start, MO.getExpr(),
- STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
- MI.getLoc()));
+ FixupKind, MI.getLoc()));
++MCNumFixups;
- encodeULEB128(STI.getTargetTriple().isArch64Bit() ? UINT64_MAX
- : uint64_t(UINT32_MAX),
- OS);
+ encodeULEB128(0, OS, PaddedSize - 1);
} else {
llvm_unreachable("unexpected operand kind");
}
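Note: two details in the encodeInstruction() changes above are easy to miss. For br_table, the emitted table size is MI.getNumOperands() - 2 because the MCInst carries one index operand and one default-label operand around the table entries. For symbolic operands, encodeULEB128(0, OS, PaddedSize - 1) reserves a fixed-width LEB128 slot (5 bytes for the 32-bit kinds, 10 for 64-bit) so applyFixup() can later OR the real value in place without resizing the stream. A sketch of that placeholder, under the assumption that padding bytes keep the continuation bit set:

    #include <cstdint>
    #include <vector>

    // Encode Value as ULEB128 padded to exactly TotalBytes
    // (assumes Value fits and TotalBytes >= 1).
    static std::vector<uint8_t> encodePaddedULEB128(uint64_t Value,
                                                    unsigned TotalBytes) {
      std::vector<uint8_t> Out;
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0 || Out.size() + 1 != TotalBytes)
          Byte |= 0x80; // continuation bit keeps the slot width fixed
        Out.push_back(Byte);
      } while (Out.size() != TotalBytes);
      return Out; // encodePaddedULEB128(0, 5) yields 80 80 80 80 00
    }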
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 3dc1ded..9580eea 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -36,6 +36,8 @@ using namespace llvm;
static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
const Triple &TT) {
+ if (TT.isOSBinFormatELF())
+ return new WebAssemblyMCAsmInfoELF(TT);
return new WebAssemblyMCAsmInfo(TT);
}
@@ -71,7 +73,7 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo & /*MRI*/,
- MCContext & /*Ctx*/) {
+ MCContext &Ctx) {
return createWebAssemblyMCCodeEmitter(MCII);
}
@@ -88,8 +90,12 @@ static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
}
static MCTargetStreamer *
-createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) {
- return new WebAssemblyTargetELFStreamer(S);
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ const Triple &TT = STI.getTargetTriple();
+ if (TT.isOSBinFormatELF())
+ return new WebAssemblyTargetELFStreamer(S);
+
+ return new WebAssemblyTargetWasmStreamer(S);
}
static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
@@ -135,12 +141,12 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() {
}
}
-WebAssembly::ValType WebAssembly::toValType(const MVT &Ty) {
+wasm::ValType WebAssembly::toValType(const MVT &Ty) {
switch (Ty.SimpleTy) {
- case MVT::i32: return WebAssembly::ValType::I32;
- case MVT::i64: return WebAssembly::ValType::I64;
- case MVT::f32: return WebAssembly::ValType::F32;
- case MVT::f64: return WebAssembly::ValType::F64;
+ case MVT::i32: return wasm::ValType::I32;
+ case MVT::i64: return wasm::ValType::I64;
+ case MVT::f32: return wasm::ValType::F32;
+ case MVT::f64: return wasm::ValType::F64;
default: llvm_unreachable("unexpected type");
}
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 8583b77..4d676c3 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/DataTypes.h"
@@ -41,12 +42,17 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
bool Is64Bit, uint8_t OSABI);
+MCObjectWriter *createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit);
+
namespace WebAssembly {
enum OperandType {
/// Basic block label in a branch construct.
OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET,
/// Local index.
OPERAND_LOCAL,
+ /// Global index.
+ OPERAND_GLOBAL,
/// 32-bit integer immediates.
OPERAND_I32IMM,
/// 64-bit integer immediates.
@@ -62,7 +68,9 @@ enum OperandType {
/// p2align immediate for load and store address alignment.
OPERAND_P2ALIGN,
/// signature immediate for block/loop.
- OPERAND_SIGNATURE
+ OPERAND_SIGNATURE,
+ /// type signature immediate for call_indirect.
+ OPERAND_TYPEINDEX,
};
} // end namespace WebAssembly
@@ -141,40 +149,25 @@ static const unsigned StoreP2AlignOperandNo = 0;
/// This is used to indicate block signatures.
enum class ExprType {
- Void = 0x40,
- I32 = 0x7f,
- I64 = 0x7e,
- F32 = 0x7d,
- F64 = 0x7c,
- I8x16 = 0x7b,
- I16x8 = 0x7a,
- I32x4 = 0x79,
- F32x4 = 0x78,
- B8x16 = 0x77,
- B16x8 = 0x76,
- B32x4 = 0x75
-};
-
-/// This is used to indicate local types.
-enum class ValType {
- I32 = 0x7f,
- I64 = 0x7e,
- F32 = 0x7d,
- F64 = 0x7c,
- I8x16 = 0x7b,
- I16x8 = 0x7a,
- I32x4 = 0x79,
- F32x4 = 0x78,
- B8x16 = 0x77,
- B16x8 = 0x76,
- B32x4 = 0x75
+ Void = -0x40,
+ I32 = -0x01,
+ I64 = -0x02,
+ F32 = -0x03,
+ F64 = -0x04,
+ I8x16 = -0x05,
+ I16x8 = -0x06,
+ I32x4 = -0x07,
+ F32x4 = -0x08,
+ B8x16 = -0x09,
+ B16x8 = -0x0a,
+ B32x4 = -0x0b
};
/// Instruction opcodes emitted via means other than CodeGen.
static const unsigned Nop = 0x01;
static const unsigned End = 0x0b;
-ValType toValType(const MVT &Ty);
+wasm::ValType toValType(const MVT &Ty);
} // end namespace WebAssembly
} // end namespace llvm
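Note: the ExprType values above flip from the raw encoding bytes to the signed values used by the wasm binary format; the emitted byte stream is unchanged because these are now written as SLEB128 (see emitValueType() later in this patch), and for values in [-64, 63] a single SLEB128 byte is just the low 7 bits. A worked check, mine rather than the patch's:

    #include <cassert>
    #include <cstdint>

    // Single-byte SLEB128 for values in [-64, 63]: low 7 bits, no
    // continuation bit.
    static uint8_t sleb128SingleByte(int8_t V) { return uint8_t(V) & 0x7f; }

    int main() {
      assert(sleb128SingleByte(-0x01) == 0x7f); // I32, formerly 0x7f
      assert(sleb128SingleByte(-0x02) == 0x7e); // I64, formerly 0x7e
      assert(sleb128SingleByte(-0x40) == 0x40); // Void, formerly 0x40
    }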
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 3cee8b2..00bf024 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -18,9 +18,11 @@
#include "WebAssemblyMCTargetDesc.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionWasm.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
using namespace llvm;
@@ -28,6 +30,10 @@ using namespace llvm;
WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S) {}
+void WebAssemblyTargetStreamer::emitValueType(wasm::ValType Type) {
+ Streamer.EmitSLEB128IntValue(int32_t(Type));
+}
+
WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
MCStreamer &S, formatted_raw_ostream &OS)
: WebAssemblyTargetStreamer(S), OS(OS) {}
@@ -35,6 +41,9 @@ WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S)
: WebAssemblyTargetStreamer(S) {}
+WebAssemblyTargetWasmStreamer::WebAssemblyTargetWasmStreamer(MCStreamer &S)
+ : WebAssemblyTargetStreamer(S) {}
+
static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
bool First = true;
for (MVT Type : Types) {
@@ -47,14 +56,28 @@ static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
OS << '\n';
}
-void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) {
- OS << "\t.param \t";
- PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitParam(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ if (!Types.empty()) {
+ OS << "\t.param \t";
+
+ // FIXME: Currently this applies to the "current" function; it may
+ // be cleaner to specify an explicit symbol as part of the directive.
+
+ PrintTypes(OS, Types);
+ }
}
-void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) {
- OS << "\t.result \t";
- PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitResult(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ if (!Types.empty()) {
+ OS << "\t.result \t";
+
+ // FIXME: Currently this applies to the "current" function; it may
+ // be cleaner to specify an explicit symbol as part of the directive.
+
+ PrintTypes(OS, Types);
+ }
}
void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
@@ -64,11 +87,36 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
}
}
+void WebAssemblyTargetAsmStreamer::emitGlobal(
+ ArrayRef<wasm::Global> Globals) {
+ if (!Globals.empty()) {
+ OS << "\t.globalvar \t";
+
+ bool First = true;
+ for (const wasm::Global &G : Globals) {
+ if (First)
+ First = false;
+ else
+ OS << ", ";
+ OS << WebAssembly::TypeToString(G.Type);
+ if (!G.InitialModule.empty())
+ OS << '=' << G.InitialModule << ':' << G.InitialName;
+ else
+ OS << '=' << G.InitialValue;
+ }
+ OS << '\n';
+ }
+}
+
+void WebAssemblyTargetAsmStreamer::emitStackPointer(uint32_t Index) {
+ OS << "\t.stack_pointer\t" << Index << '\n';
+}
+
void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
- StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
- OS << "\t.functype\t" << name;
+ MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
+ OS << "\t.functype\t" << Symbol->getName();
if (Results.empty())
OS << ", void";
else {
@@ -88,18 +136,30 @@ void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
OS << "\t.indidx \t" << *Value << '\n';
}
-void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) {
+void WebAssemblyTargetELFStreamer::emitParam(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
// Nothing to emit; params are declared as part of the function signature.
}
-void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) {
+void WebAssemblyTargetELFStreamer::emitResult(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
// Nothing to emit; results are declared as part of the function signature.
}
void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) {
Streamer.EmitULEB128IntValue(Types.size());
for (MVT Type : Types)
- Streamer.EmitIntValue(int64_t(WebAssembly::toValType(Type)), 1);
+ emitValueType(WebAssembly::toValType(Type));
+}
+
+void WebAssemblyTargetELFStreamer::emitGlobal(
+ ArrayRef<wasm::Global> Globals) {
+ llvm_unreachable(".globalvar encoding not yet implemented");
+}
+
+void WebAssemblyTargetELFStreamer::emitStackPointer(
+ uint32_t Index) {
+ llvm_unreachable(".stack_pointer encoding not yet implemented");
}
void WebAssemblyTargetELFStreamer::emitEndFunc() {
@@ -111,10 +171,110 @@ void WebAssemblyTargetELFStreamer::emitIndIdx(const MCExpr *Value) {
}
void WebAssemblyTargetELFStreamer::emitIndirectFunctionType(
- StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
+ MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
// Nothing to emit here. TODO: Re-design how linking works and re-evaluate
// whether it's necessary for .o files to declare indirect function types.
}
void WebAssemblyTargetELFStreamer::emitGlobalImport(StringRef name) {
-} \ No newline at end of file
+}
+
+void WebAssemblyTargetWasmStreamer::emitParam(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ SmallVector<wasm::ValType, 4> Params;
+ for (MVT Ty : Types)
+ Params.push_back(WebAssembly::toValType(Ty));
+
+ cast<MCSymbolWasm>(Symbol)->setParams(std::move(Params));
+}
+
+void WebAssemblyTargetWasmStreamer::emitResult(MCSymbol *Symbol,
+ ArrayRef<MVT> Types) {
+ SmallVector<wasm::ValType, 4> Returns;
+ for (MVT Ty : Types)
+ Returns.push_back(WebAssembly::toValType(Ty));
+
+ cast<MCSymbolWasm>(Symbol)->setReturns(std::move(Returns));
+}
+
+void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<MVT> Types) {
+ SmallVector<std::pair<MVT, uint32_t>, 4> Grouped;
+ for (MVT Type : Types) {
+ if (Grouped.empty() || Grouped.back().first != Type)
+ Grouped.push_back(std::make_pair(Type, 1));
+ else
+ ++Grouped.back().second;
+ }
+
+ Streamer.EmitULEB128IntValue(Grouped.size());
+ for (auto Pair : Grouped) {
+ Streamer.EmitULEB128IntValue(Pair.second);
+ emitValueType(WebAssembly::toValType(Pair.first));
+ }
+}
+
+void WebAssemblyTargetWasmStreamer::emitGlobal(
+ ArrayRef<wasm::Global> Globals) {
+ // Encode the globals used by the function into the special .global_variables
+ // section. This will later be decoded and turned into contents for the
+ // Globals Section.
+ Streamer.PushSection();
+ Streamer.SwitchSection(Streamer.getContext()
+ .getWasmSection(".global_variables", 0, 0));
+ for (const wasm::Global &G : Globals) {
+ Streamer.EmitIntValue(int32_t(G.Type), 1);
+ Streamer.EmitIntValue(G.Mutable, 1);
+ if (G.InitialModule.empty()) {
+ Streamer.EmitIntValue(0, 1); // indicate that we have an int value
+ Streamer.EmitSLEB128IntValue(0);
+ } else {
+ Streamer.EmitIntValue(1, 1); // indicate that we have a module import
+ Streamer.EmitBytes(G.InitialModule);
+ Streamer.EmitIntValue(0, 1); // nul-terminate
+ Streamer.EmitBytes(G.InitialName);
+ Streamer.EmitIntValue(0, 1); // nul-terminate
+ }
+ }
+ Streamer.PopSection();
+}
+
+void WebAssemblyTargetWasmStreamer::emitStackPointer(uint32_t Index) {
+ Streamer.PushSection();
+ Streamer.SwitchSection(Streamer.getContext()
+ .getWasmSection(".stack_pointer", 0, 0));
+ Streamer.EmitIntValue(Index, 4);
+ Streamer.PopSection();
+}
+
+void WebAssemblyTargetWasmStreamer::emitEndFunc() {
+ llvm_unreachable(".end_func is not needed for direct wasm output");
+}
+
+void WebAssemblyTargetWasmStreamer::emitIndIdx(const MCExpr *Value) {
+ llvm_unreachable(".indidx encoding not yet implemented");
+}
+
+void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
+ MCSymbol *Symbol, SmallVectorImpl<MVT> &Params,
+ SmallVectorImpl<MVT> &Results) {
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Symbol);
+ if (WasmSym->isFunction()) {
+ // Symbol already has its arguments and result set.
+ return;
+ }
+
+ SmallVector<wasm::ValType, 4> ValParams;
+ for (MVT Ty : Params)
+ ValParams.push_back(WebAssembly::toValType(Ty));
+
+ SmallVector<wasm::ValType, 1> ValResults;
+ for (MVT Ty : Results)
+ ValResults.push_back(WebAssembly::toValType(Ty));
+
+ WasmSym->setParams(std::move(ValParams));
+ WasmSym->setReturns(std::move(ValResults));
+ WasmSym->setIsFunction(true);
+}
+
+void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
+}
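Note: emitLocal() in the Wasm streamer above run-length groups consecutive same-typed locals into (count, type) pairs, which is the wasm binary format for local declarations; only adjacent locals merge, so an interleaved type starts a new run. A small illustration with invented stand-in types, not taken from the patch:

    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      // Stand-ins for MVTs: i = i32, f = f32.
      std::vector<char> Types = {'i', 'i', 'f', 'f', 'f', 'i'};
      std::vector<std::pair<char, uint32_t>> Grouped;
      for (char T : Types) {
        if (Grouped.empty() || Grouped.back().first != T)
          Grouped.push_back({T, 1});
        else
          ++Grouped.back().second;
      }
      for (auto &P : Grouped)               // prints: 2 x i, 3 x f, 1 x i
        std::printf("%u x %c\n", P.second, P.first);
    }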
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 23ac319..102d721 100644
--- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -16,12 +16,14 @@
#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYTARGETSTREAMER_H
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCStreamer.h"
namespace llvm {
class MCELFStreamer;
+class MCWasmStreamer;
/// WebAssembly-specific streamer interface, to implement support
/// WebAssembly-specific assembly directives.
@@ -30,23 +32,28 @@ public:
explicit WebAssemblyTargetStreamer(MCStreamer &S);
/// .param
- virtual void emitParam(ArrayRef<MVT> Types) = 0;
+ virtual void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
/// .result
- virtual void emitResult(ArrayRef<MVT> Types) = 0;
+ virtual void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
/// .local
virtual void emitLocal(ArrayRef<MVT> Types) = 0;
+ /// .globalvar
+ virtual void emitGlobal(ArrayRef<wasm::Global> Globals) = 0;
+ /// .stack_pointer
+ virtual void emitStackPointer(uint32_t Index) = 0;
/// .endfunc
virtual void emitEndFunc() = 0;
/// .functype
- virtual void emitIndirectFunctionType(StringRef name,
+ virtual void emitIndirectFunctionType(MCSymbol *Symbol,
SmallVectorImpl<MVT> &Params,
- SmallVectorImpl<MVT> &Results) {
- llvm_unreachable("emitIndirectFunctionType not implemented");
- }
+ SmallVectorImpl<MVT> &Results) = 0;
/// .indidx
virtual void emitIndIdx(const MCExpr *Value) = 0;
/// .import_global
virtual void emitGlobalImport(StringRef name) = 0;
+
+protected:
+ void emitValueType(wasm::ValType Type);
};
/// This part is for ascii assembly output
@@ -56,11 +63,13 @@ class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
public:
WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
- void emitParam(ArrayRef<MVT> Types) override;
- void emitResult(ArrayRef<MVT> Types) override;
+ void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
+ void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+ void emitStackPointer(uint32_t Index) override;
void emitEndFunc() override;
- void emitIndirectFunctionType(StringRef name,
+ void emitIndirectFunctionType(MCSymbol *Symbol,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) override;
void emitIndIdx(const MCExpr *Value) override;
@@ -72,11 +81,31 @@ class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
public:
explicit WebAssemblyTargetELFStreamer(MCStreamer &S);
- void emitParam(ArrayRef<MVT> Types) override;
- void emitResult(ArrayRef<MVT> Types) override;
+ void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitLocal(ArrayRef<MVT> Types) override;
+ void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+ void emitStackPointer(uint32_t Index) override;
+ void emitEndFunc() override;
+ void emitIndirectFunctionType(MCSymbol *Symbol,
+ SmallVectorImpl<MVT> &Params,
+ SmallVectorImpl<MVT> &Results) override;
+ void emitIndIdx(const MCExpr *Value) override;
+ void emitGlobalImport(StringRef name) override;
+};
+
+/// This part is for Wasm object output
+class WebAssemblyTargetWasmStreamer final : public WebAssemblyTargetStreamer {
+public:
+ explicit WebAssemblyTargetWasmStreamer(MCStreamer &S);
+
+ void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+ void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
void emitLocal(ArrayRef<MVT> Types) override;
+ void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+ void emitStackPointer(uint32_t Index) override;
void emitEndFunc() override;
- void emitIndirectFunctionType(StringRef name,
+ void emitIndirectFunctionType(MCSymbol *Symbol,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) override;
void emitIndIdx(const MCExpr *Value) override;
diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
new file mode 100644
index 0000000..9cf7782
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -0,0 +1,100 @@
+//===-- WebAssemblyWasmObjectWriter.cpp - WebAssembly Wasm Writer ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file handles Wasm-specific object emission, converting LLVM's
+/// internal fixups into the appropriate relocations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/BinaryFormat/Wasm.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class WebAssemblyWasmObjectWriter final : public MCWasmObjectTargetWriter {
+public:
+ explicit WebAssemblyWasmObjectWriter(bool Is64Bit);
+
+private:
+ unsigned getRelocType(const MCValue &Target,
+ const MCFixup &Fixup) const override;
+};
+} // end anonymous namespace
+
+WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit)
+ : MCWasmObjectTargetWriter(Is64Bit) {}
+
+// Test whether the given expression computes a function address.
+static bool IsFunctionExpr(const MCExpr *Expr) {
+ if (auto SyExp = dyn_cast<MCSymbolRefExpr>(Expr))
+ return cast<MCSymbolWasm>(SyExp->getSymbol()).isFunction();
+
+ if (auto BinOp = dyn_cast<MCBinaryExpr>(Expr))
+ return IsFunctionExpr(BinOp->getLHS()) != IsFunctionExpr(BinOp->getRHS());
+
+ if (auto UnOp = dyn_cast<MCUnaryExpr>(Expr))
+ return IsFunctionExpr(UnOp->getSubExpr());
+
+ return false;
+}
+
+static bool IsFunctionType(const MCValue &Target) {
+ const MCSymbolRefExpr *RefA = Target.getSymA();
+ return RefA && RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX;
+}
+
+unsigned
+WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
+ const MCFixup &Fixup) const {
+ // WebAssembly functions are not allocated in the data address space. To
+ // resolve a pointer to a function, we must use a special relocation type.
+ bool IsFunction = IsFunctionExpr(Fixup.getValue());
+
+ switch (unsigned(Fixup.getKind())) {
+ case WebAssembly::fixup_code_global_index:
+ return wasm::R_WEBASSEMBLY_GLOBAL_INDEX_LEB;
+ case WebAssembly::fixup_code_sleb128_i32:
+ if (IsFunction)
+ return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB;
+ return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB;
+ case WebAssembly::fixup_code_sleb128_i64:
+ llvm_unreachable("fixup_sleb128_i64 not implemented yet");
+ case WebAssembly::fixup_code_uleb128_i32:
+ if (IsFunctionType(Target))
+ return wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB;
+ if (IsFunction)
+ return wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB;
+ return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB;
+ case FK_Data_4:
+ if (IsFunction)
+ return wasm::R_WEBASSEMBLY_TABLE_INDEX_I32;
+ return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32;
+ case FK_Data_8:
+ llvm_unreachable("FK_Data_8 not implemented yet");
+ default:
+ llvm_unreachable("unimplemented fixup kind");
+ }
+}
+
+MCObjectWriter *llvm::createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit) {
+ MCWasmObjectTargetWriter *MOTW = new WebAssemblyWasmObjectWriter(Is64Bit);
+ return createWasmObjectWriter(MOTW, OS);
+}
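Note: in IsFunctionExpr() above, the `!=` over a binary expression's operands is an XOR: adding a constant offset to a function address still yields a function address, while the difference of two function addresses is a plain integer, so only the former gets a table-index relocation. A tiny check of that rule, mine rather than the patch's:

    #include <cassert>

    // Mirrors the MCBinaryExpr case in IsFunctionExpr().
    static bool isFuncExpr(bool LHSIsFunc, bool RHSIsFunc) {
      return LHSIsFunc != RHSIsFunc;
    }

    int main() {
      assert(isFuncExpr(true, false));   // func + 4      -> function address
      assert(!isFuncExpr(true, true));   // funcA - funcB -> plain integer
      assert(!isFuncExpr(false, false)); // 1 + 2         -> plain integer
    }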
diff --git a/contrib/llvm/lib/Target/WebAssembly/README.txt b/contrib/llvm/lib/Target/WebAssembly/README.txt
index 64991ad..3433b15 100644
--- a/contrib/llvm/lib/Target/WebAssembly/README.txt
+++ b/contrib/llvm/lib/Target/WebAssembly/README.txt
@@ -145,3 +145,24 @@ WebAssemblyRegStackify could be extended, or possibly rewritten, to take
advantage of the new opportunities.
//===---------------------------------------------------------------------===//
+
+Add support for mergeable sections in the Wasm writer, such as for strings and
+floating-point constants.
+
+//===---------------------------------------------------------------------===//
+
+The function @dynamic_alloca_redzone in test/CodeGen/WebAssembly/userstack.ll
+ends up with a tee_local in its prologue whose result is unused, requiring
+an extra drop:
+
+ get_global $push8=, 0
+ tee_local $push9=, 1, $pop8
+ drop $pop9
+ [...]
+
+The prologue code initially thinks it needs an FP register, but it later
+turns out to be unneeded, so one could either be more clever about not
+inserting code for an FP in the first place, or optimize away the copy
+later.
+
+//===---------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
index 8738263..e04c4db 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -46,6 +46,7 @@ FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
FunctionPass *createWebAssemblyExplicitLocals();
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
+FunctionPass *createWebAssemblyCFGSort();
FunctionPass *createWebAssemblyCFGStackify();
FunctionPass *createWebAssemblyLowerBrUnless();
FunctionPass *createWebAssemblyRegNumbering();
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 5b4b82e..211358a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -14,6 +14,7 @@
///
//===----------------------------------------------------------------------===//
+#include "WebAssemblyAsmPrinter.h"
#include "InstPrinter/WebAssemblyInstPrinter.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "MCTargetDesc/WebAssemblyTargetStreamer.h"
@@ -21,16 +22,19 @@
#include "WebAssemblyMCInstLower.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyRegisterInfo.h"
-#include "WebAssemblySubtarget.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -38,65 +42,16 @@ using namespace llvm;
#define DEBUG_TYPE "asm-printer"
-namespace {
-
-class WebAssemblyAsmPrinter final : public AsmPrinter {
- const MachineRegisterInfo *MRI;
- WebAssemblyFunctionInfo *MFI;
-
-public:
- WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {}
-
-private:
- StringRef getPassName() const override {
- return "WebAssembly Assembly Printer";
- }
-
- //===------------------------------------------------------------------===//
- // MachineFunctionPass Implementation.
- //===------------------------------------------------------------------===//
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- MRI = &MF.getRegInfo();
- MFI = MF.getInfo<WebAssemblyFunctionInfo>();
- return AsmPrinter::runOnMachineFunction(MF);
- }
-
- //===------------------------------------------------------------------===//
- // AsmPrinter Implementation.
- //===------------------------------------------------------------------===//
-
- void EmitEndOfAsmFile(Module &M) override;
- void EmitJumpTableInfo() override;
- void EmitConstantPool() override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
- void EmitInstruction(const MachineInstr *MI) override;
- const MCExpr *lowerConstant(const Constant *CV) override;
- bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
- bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS) override;
-
- MVT getRegType(unsigned RegNo) const;
- std::string regToString(const MachineOperand &MO);
- WebAssemblyTargetStreamer *getTargetStreamer();
-};
-
-} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
// Helpers.
//===----------------------------------------------------------------------===//
MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16,
MVT::v4i32, MVT::v4f32})
- if (TRC->hasType(T))
+ if (TRI->isTypeLegalForClass(*TRC, T))
return T;
DEBUG(errs() << "Unknown type for register number: " << RegNo);
llvm_unreachable("Unknown register type");
@@ -129,13 +84,16 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
SmallVector<MVT, 4> Results;
SmallVector<MVT, 4> Params;
ComputeSignatureVTs(F, TM, Params, Results);
- getTargetStreamer()->emitIndirectFunctionType(F.getName(), Params,
+ getTargetStreamer()->emitIndirectFunctionType(getSymbol(&F), Params,
Results);
}
}
for (const auto &G : M.globals()) {
if (!G.hasInitializer() && G.hasExternalLinkage()) {
+ uint16_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
+ OutStreamer->emitELFSize(getSymbol(&G),
+ MCConstantExpr::create(Size, OutContext));
}
}
}
@@ -150,8 +108,7 @@ void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
}
void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
- if (!MFI->getParams().empty())
- getTargetStreamer()->emitParam(MFI->getParams());
+ getTargetStreamer()->emitParam(CurrentFnSym, MFI->getParams());
SmallVector<MVT, 4> ResultVTs;
const Function &F(*MF->getFunction());
@@ -169,23 +126,26 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
// If the return type needs to be legalized it will get converted into
// passing a pointer.
if (ResultVTs.size() == 1)
- getTargetStreamer()->emitResult(ResultVTs);
-
- // FIXME: When ExplicitLocals is enabled by default, we won't need
- // to define the locals here (and MFI can go back to being pointer-to-const).
- for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
- unsigned WAReg = MFI->getWAReg(VReg);
- // Don't declare unused registers.
- if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
- continue;
- // Don't redeclare parameters.
- if (WAReg < MFI->getParams().size())
- continue;
- // Don't declare stackified registers.
- if (int(WAReg) < 0)
- continue;
- MFI->addLocal(getRegType(VReg));
+ getTargetStreamer()->emitResult(CurrentFnSym, ResultVTs);
+ else
+ getTargetStreamer()->emitResult(CurrentFnSym, ArrayRef<MVT>());
+
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ assert(MFI->getLocals().empty());
+ for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+ unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
+ unsigned WAReg = MFI->getWAReg(VReg);
+ // Don't declare unused registers.
+ if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
+ continue;
+ // Don't redeclare parameters.
+ if (WAReg < MFI->getParams().size())
+ continue;
+ // Don't declare stackified registers.
+ if (int(WAReg) < 0)
+ continue;
+ MFI->addLocal(getRegType(VReg));
+ }
}
getTargetStreamer()->emitLocal(MFI->getLocals());
@@ -194,7 +154,8 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
}
void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() {
- getTargetStreamer()->emitEndFunc();
+ if (TM.getTargetTriple().isOSBinFormatELF())
+ getTargetStreamer()->emitEndFunc();
}
void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
@@ -252,9 +213,10 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
- if (GV->getValueType()->isFunctionTy())
+ if (GV->getValueType()->isFunctionTy()) {
return MCSymbolRefExpr::create(
getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext);
+ }
return AsmPrinter::lowerConstant(CV);
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
new file mode 100644
index 0000000..c8917b8
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -0,0 +1,77 @@
+// WebAssemblyAsmPrinter.h - WebAssembly implementation of AsmPrinter-*- C++ -*-
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYASMPRINTER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYASMPRINTER_H
+
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCSymbol;
+class WebAssemblyFunctionInfo;
+class WebAssemblyTargetStreamer;
+class WebAssemblyMCInstLower;
+
+class LLVM_LIBRARY_VISIBILITY WebAssemblyAsmPrinter final : public AsmPrinter {
+ const WebAssemblySubtarget *Subtarget;
+ const MachineRegisterInfo *MRI;
+ WebAssemblyFunctionInfo *MFI;
+
+public:
+ explicit WebAssemblyAsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)),
+ Subtarget(nullptr), MRI(nullptr), MFI(nullptr) {}
+
+ StringRef getPassName() const override {
+ return "WebAssembly Assembly Printer";
+ }
+
+ const WebAssemblySubtarget &getSubtarget() const { return *Subtarget; }
+
+ //===------------------------------------------------------------------===//
+ // MachineFunctionPass Implementation.
+ //===------------------------------------------------------------------===//
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
+ MRI = &MF.getRegInfo();
+ MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+ return AsmPrinter::runOnMachineFunction(MF);
+ }
+
+ //===------------------------------------------------------------------===//
+ // AsmPrinter Implementation.
+ //===------------------------------------------------------------------===//
+
+ void EmitEndOfAsmFile(Module &M) override;
+ void EmitJumpTableInfo() override;
+ void EmitConstantPool() override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
+ void EmitInstruction(const MachineInstr *MI) override;
+ const MCExpr *lowerConstant(const Constant *CV) override;
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) override;
+
+ MVT getRegType(unsigned RegNo) const;
+ std::string regToString(const MachineOperand &MO);
+ WebAssemblyTargetStreamer *getTargetStreamer();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
new file mode 100644
index 0000000..7001117
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -0,0 +1,277 @@
+//===-- WebAssemblyCFGSort.cpp - CFG Sorting ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a CFG sorting pass.
+///
+/// This pass reorders the blocks in a function to put them into topological
+/// order, ignoring loop backedges, and without any loop being interrupted
+/// by a block not dominated by the loop header, with special care to keep the
+/// order as similar as possible to the original order.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-cfg-sort"
+
+namespace {
+class WebAssemblyCFGSort final : public MachineFunctionPass {
+ StringRef getPassName() const override { return "WebAssembly CFG Sort"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyCFGSort() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyCFGSort::ID = 0;
+FunctionPass *llvm::createWebAssemblyCFGSort() {
+ return new WebAssemblyCFGSort();
+}
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
+static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
+ MachineBasicBlock *Bottom = Loop->getHeader();
+ for (MachineBasicBlock *MBB : Loop->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
+
+static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+#ifndef NDEBUG
+ bool AnyBarrier = false;
+#endif
+ bool AllAnalyzable = true;
+ for (const MachineInstr &Term : MBB->terminators()) {
+#ifndef NDEBUG
+ AnyBarrier |= Term.isBarrier();
+#endif
+ AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
+ }
+ assert((AnyBarrier || AllAnalyzable) &&
+ "AnalyzeBranch needs to analyze any block with a fallthrough");
+ if (AllAnalyzable)
+ MBB->updateTerminator();
+}
+
+namespace {
+/// Sort blocks by their number.
+struct CompareBlockNumbers {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() > B->getNumber();
+ }
+};
+/// Sort blocks by their number in the opposite order.
+struct CompareBlockNumbersBackwards {
+ bool operator()(const MachineBasicBlock *A,
+ const MachineBasicBlock *B) const {
+ return A->getNumber() < B->getNumber();
+ }
+};
+/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
+/// by the loop header among the loop's blocks.
+struct Entry {
+ const MachineLoop *Loop;
+ unsigned NumBlocksLeft;
+
+ /// List of blocks not dominated by Loop's header that are deferred until
+ /// after all of Loop's blocks have been seen.
+ std::vector<MachineBasicBlock *> Deferred;
+
+ explicit Entry(const MachineLoop *L)
+ : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
+};
+} // end anonymous namespace
+
+/// Sort the blocks, taking special care to make sure that loops are not
+/// interrupted by blocks not dominated by their header.
+/// TODO: There are many opportunities for improving the heuristics here.
+/// Explore them.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT) {
+ // Prepare for a topological sort: Record the number of predecessors each
+ // block has, ignoring loop backedges.
+ MF.RenumberBlocks();
+ SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned N = MBB.pred_size();
+ if (MachineLoop *L = MLI.getLoopFor(&MBB))
+ if (L->getHeader() == &MBB)
+ for (const MachineBasicBlock *Pred : MBB.predecessors())
+ if (L->contains(Pred))
+ --N;
+ NumPredsLeft[MBB.getNumber()] = N;
+ }
+
+ // Topological sort the CFG, with additional constraints:
+ // - Between a loop header and the last block in the loop, there can be
+ // no blocks not dominated by the loop header.
+ // - It's desirable to preserve the original block order when possible.
+ // We use two ready lists: Preferred and Ready. Preferred has recently
+ // processed successors, to help preserve block sequences from the original
+ // order. Ready has the remaining ready blocks.
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbers>
+ Preferred;
+ PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+ CompareBlockNumbersBackwards>
+ Ready;
+ SmallVector<Entry, 4> Loops;
+ for (MachineBasicBlock *MBB = &MF.front();;) {
+ const MachineLoop *L = MLI.getLoopFor(MBB);
+ if (L) {
+ // If MBB is a loop header, add it to the active loop list. We can't lay out
+ // any blocks that it doesn't dominate until we see the end of the loop.
+ if (L->getHeader() == MBB)
+ Loops.push_back(Entry(L));
+ // For each active loop the block is in, decrement the count. If MBB is
+ // the last block in an active loop, take it off the list and pick up any
+ // blocks deferred because the header didn't dominate them.
+ for (Entry &E : Loops)
+ if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
+ for (auto DeferredBlock : E.Deferred)
+ Ready.push(DeferredBlock);
+ while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
+ Loops.pop_back();
+ }
+ // The main topological sort logic.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ // Ignore backedges.
+ if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
+ if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
+ continue;
+ // Decrement the predecessor count. If it's now zero, it's ready.
+ if (--NumPredsLeft[Succ->getNumber()] == 0)
+ Preferred.push(Succ);
+ }
+ // Determine the block to follow MBB. First try to find a preferred block,
+ // to preserve the original block order when possible.
+ MachineBasicBlock *Next = nullptr;
+ while (!Preferred.empty()) {
+ Next = Preferred.top();
+ Preferred.pop();
+ // If Next isn't dominated by the top active loop header, defer it until that
+ // loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ Next = nullptr;
+ continue;
+ }
+ // If Next was originally ordered before MBB, and it isn't because it was
+ // loop-rotated above the header, it's not preferred.
+ if (Next->getNumber() < MBB->getNumber() &&
+ (!L || !L->contains(Next) ||
+ L->getHeader()->getNumber() < Next->getNumber())) {
+ Ready.push(Next);
+ Next = nullptr;
+ continue;
+ }
+ break;
+ }
+ // If we didn't find a suitable block in the Preferred list, check the
+ // general Ready list.
+ if (!Next) {
+ // If there are no more blocks to process, we're done.
+ if (Ready.empty()) {
+ MaybeUpdateTerminator(MBB);
+ break;
+ }
+ for (;;) {
+ Next = Ready.top();
+ Ready.pop();
+ // If Next isn't dominated by the top active loop header, defer it until
+ // that loop is done.
+ if (!Loops.empty() &&
+ !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+ Loops.back().Deferred.push_back(Next);
+ continue;
+ }
+ break;
+ }
+ }
+ // Move the next block into place and iterate.
+ Next->moveAfter(MBB);
+ MaybeUpdateTerminator(MBB);
+ MBB = Next;
+ }
+ assert(Loops.empty() && "Active loop list not finished");
+ MF.RenumberBlocks();
+
+#ifndef NDEBUG
+ SmallSetVector<MachineLoop *, 8> OnStack;
+
+ // Insert a sentinel representing the degenerate loop that starts at the
+ // function entry block and includes the entire function as a "loop" that
+ // executes once.
+ OnStack.insert(nullptr);
+
+ for (auto &MBB : MF) {
+ assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
+
+ MachineLoop *Loop = MLI.getLoopFor(&MBB);
+ if (Loop && &MBB == Loop->getHeader()) {
+ // Loop header. The loop predecessor should be sorted above, and the other
+ // predecessors should be backedges below.
+ for (auto Pred : MBB.predecessors())
+ assert(
+ (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
+ "Loop header predecessors must be loop predecessors or backedges");
+ assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
+ } else {
+ // Not a loop header. All predecessors should be sorted above.
+ for (auto Pred : MBB.predecessors())
+ assert(Pred->getNumber() < MBB.getNumber() &&
+ "Non-loop-header predecessors should be topologically sorted");
+ assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
+ "Blocks must be nested in their loops");
+ }
+ while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+ OnStack.pop_back();
+ }
+ assert(OnStack.pop_back_val() == nullptr &&
+ "The function entry block shouldn't actually be a loop header");
+ assert(OnStack.empty() &&
+ "Control flow stack pushes and pops should be balanced.");
+#endif
+}
+
+bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** CFG Sorting **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
+ auto &MDT = getAnalysis<MachineDominatorTree>();
+ // Liveness is not tracked for VALUE_STACK physreg.
+ MF.getRegInfo().invalidateLiveness();
+
+ // Sort the blocks, with contiguous loops.
+ SortBlocks(MF, MLI, MDT);
+
+ return true;
+}
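
To make the sort above concrete outside of LLVM: it is Kahn's topological-sort
algorithm with loop backedges masked out of the predecessor counts. The sketch
below is hypothetical standalone C++, not part of the patch; plain integer IDs
stand in for MachineBasicBlock, block 0 is assumed to be the entry, and the
backedge set is assumed precomputed (in the pass, a backedge is an edge from
inside a loop to that loop's header).

#include <functional>
#include <queue>
#include <set>
#include <utility>
#include <vector>

std::vector<int> SortIgnoringBackedges(
    const std::vector<std::vector<int>> &Succs,       // Succs[B] = successors
    const std::set<std::pair<int, int>> &Backedges) { // (From, To) edges
  // Count predecessors, ignoring backedges, as the pass does.
  std::vector<int> NumPredsLeft(Succs.size(), 0);
  for (int B = 0; B != (int)Succs.size(); ++B)
    for (int S : Succs[B])
      if (!Backedges.count({B, S}))
        ++NumPredsLeft[S];
  // Prefer lower-numbered blocks, approximating the original layout.
  std::priority_queue<int, std::vector<int>, std::greater<int>> Ready;
  Ready.push(0);
  std::vector<int> Order;
  while (!Ready.empty()) {
    int B = Ready.top();
    Ready.pop();
    Order.push_back(B);
    for (int S : Succs[B])
      if (!Backedges.count({B, S}) && --NumPredsLeft[S] == 0)
        Ready.push(S);
  }
  return Order;
}

The pass layers two refinements on this skeleton: the Preferred queue, which
favors recently-enabled successors to keep runs of the original order intact,
and the Entry stack, which defers blocks not dominated by the innermost active
loop header so that loops stay contiguous.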
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 49b9754..21e0f6b 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -10,12 +10,7 @@
/// \file
/// \brief This file implements a CFG stacking pass.
///
-/// This pass reorders the blocks in a function to put them into topological
-/// order, ignoring loop backedges, and without any loop being interrupted
-/// by a block not dominated by the loop header, with special care to keep the
-/// order as similar as possible to the original order.
-///
-/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
+/// This pass inserts BLOCK and LOOP markers to mark the start of scopes, since
/// scope boundaries serve as the labels for WebAssembly's control transfers.
///
/// This is sufficient to convert arbitrary CFGs into a form that works on
@@ -23,13 +18,11 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
-#include "llvm/ADT/PriorityQueue.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -68,217 +61,6 @@ FunctionPass *llvm::createWebAssemblyCFGStackify() {
return new WebAssemblyCFGStackify();
}
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
- MachineBasicBlock *Bottom = Loop->getHeader();
- for (MachineBasicBlock *MBB : Loop->blocks())
- if (MBB->getNumber() > Bottom->getNumber())
- Bottom = MBB;
- return Bottom;
-}
-
-static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
-#ifndef NDEBUG
- bool AnyBarrier = false;
-#endif
- bool AllAnalyzable = true;
- for (const MachineInstr &Term : MBB->terminators()) {
-#ifndef NDEBUG
- AnyBarrier |= Term.isBarrier();
-#endif
- AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
- }
- assert((AnyBarrier || AllAnalyzable) &&
- "AnalyzeBranch needs to analyze any block with a fallthrough");
- if (AllAnalyzable)
- MBB->updateTerminator();
-}
-
-namespace {
-/// Sort blocks by their number.
-struct CompareBlockNumbers {
- bool operator()(const MachineBasicBlock *A,
- const MachineBasicBlock *B) const {
- return A->getNumber() > B->getNumber();
- }
-};
-/// Sort blocks by their number in the opposite order..
-struct CompareBlockNumbersBackwards {
- bool operator()(const MachineBasicBlock *A,
- const MachineBasicBlock *B) const {
- return A->getNumber() < B->getNumber();
- }
-};
-/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
-/// by the loop header among the loop's blocks.
-struct Entry {
- const MachineLoop *Loop;
- unsigned NumBlocksLeft;
-
- /// List of blocks not dominated by Loop's header that are deferred until
- /// after all of Loop's blocks have been seen.
- std::vector<MachineBasicBlock *> Deferred;
-
- explicit Entry(const MachineLoop *L)
- : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
-};
-}
-
-/// Sort the blocks, taking special care to make sure that loops are not
-/// interrupted by blocks not dominated by their header.
-/// TODO: There are many opportunities for improving the heuristics here.
-/// Explore them.
-static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
- const MachineDominatorTree &MDT) {
- // Prepare for a topological sort: Record the number of predecessors each
- // block has, ignoring loop backedges.
- MF.RenumberBlocks();
- SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
- for (MachineBasicBlock &MBB : MF) {
- unsigned N = MBB.pred_size();
- if (MachineLoop *L = MLI.getLoopFor(&MBB))
- if (L->getHeader() == &MBB)
- for (const MachineBasicBlock *Pred : MBB.predecessors())
- if (L->contains(Pred))
- --N;
- NumPredsLeft[MBB.getNumber()] = N;
- }
-
- // Topological sort the CFG, with additional constraints:
- // - Between a loop header and the last block in the loop, there can be
- // no blocks not dominated by the loop header.
- // - It's desirable to preserve the original block order when possible.
- // We use two ready lists; Preferred and Ready. Preferred has recently
- // processed sucessors, to help preserve block sequences from the original
- // order. Ready has the remaining ready blocks.
- PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
- CompareBlockNumbers>
- Preferred;
- PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
- CompareBlockNumbersBackwards>
- Ready;
- SmallVector<Entry, 4> Loops;
- for (MachineBasicBlock *MBB = &MF.front();;) {
- const MachineLoop *L = MLI.getLoopFor(MBB);
- if (L) {
- // If MBB is a loop header, add it to the active loop list. We can't put
- // any blocks that it doesn't dominate until we see the end of the loop.
- if (L->getHeader() == MBB)
- Loops.push_back(Entry(L));
- // For each active loop the block is in, decrement the count. If MBB is
- // the last block in an active loop, take it off the list and pick up any
- // blocks deferred because the header didn't dominate them.
- for (Entry &E : Loops)
- if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
- for (auto DeferredBlock : E.Deferred)
- Ready.push(DeferredBlock);
- while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
- Loops.pop_back();
- }
- // The main topological sort logic.
- for (MachineBasicBlock *Succ : MBB->successors()) {
- // Ignore backedges.
- if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
- if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
- continue;
- // Decrement the predecessor count. If it's now zero, it's ready.
- if (--NumPredsLeft[Succ->getNumber()] == 0)
- Preferred.push(Succ);
- }
- // Determine the block to follow MBB. First try to find a preferred block,
- // to preserve the original block order when possible.
- MachineBasicBlock *Next = nullptr;
- while (!Preferred.empty()) {
- Next = Preferred.top();
- Preferred.pop();
- // If X isn't dominated by the top active loop header, defer it until that
- // loop is done.
- if (!Loops.empty() &&
- !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
- Loops.back().Deferred.push_back(Next);
- Next = nullptr;
- continue;
- }
- // If Next was originally ordered before MBB, and it isn't because it was
- // loop-rotated above the header, it's not preferred.
- if (Next->getNumber() < MBB->getNumber() &&
- (!L || !L->contains(Next) ||
- L->getHeader()->getNumber() < Next->getNumber())) {
- Ready.push(Next);
- Next = nullptr;
- continue;
- }
- break;
- }
- // If we didn't find a suitable block in the Preferred list, check the
- // general Ready list.
- if (!Next) {
- // If there are no more blocks to process, we're done.
- if (Ready.empty()) {
- MaybeUpdateTerminator(MBB);
- break;
- }
- for (;;) {
- Next = Ready.top();
- Ready.pop();
- // If Next isn't dominated by the top active loop header, defer it until
- // that loop is done.
- if (!Loops.empty() &&
- !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
- Loops.back().Deferred.push_back(Next);
- continue;
- }
- break;
- }
- }
- // Move the next block into place and iterate.
- Next->moveAfter(MBB);
- MaybeUpdateTerminator(MBB);
- MBB = Next;
- }
- assert(Loops.empty() && "Active loop list not finished");
- MF.RenumberBlocks();
-
-#ifndef NDEBUG
- SmallSetVector<MachineLoop *, 8> OnStack;
-
- // Insert a sentinel representing the degenerate loop that starts at the
- // function entry block and includes the entire function as a "loop" that
- // executes once.
- OnStack.insert(nullptr);
-
- for (auto &MBB : MF) {
- assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
-
- MachineLoop *Loop = MLI.getLoopFor(&MBB);
- if (Loop && &MBB == Loop->getHeader()) {
- // Loop header. The loop predecessor should be sorted above, and the other
- // predecessors should be backedges below.
- for (auto Pred : MBB.predecessors())
- assert(
- (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
- "Loop header predecessors must be loop predecessors or backedges");
- assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
- } else {
- // Not a loop header. All predecessors should be sorted above.
- for (auto Pred : MBB.predecessors())
- assert(Pred->getNumber() < MBB.getNumber() &&
- "Non-loop-header predecessors should be topologically sorted");
- assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
- "Blocks must be nested in their loops");
- }
- while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
- OnStack.pop_back();
- }
- assert(OnStack.pop_back_val() == nullptr &&
- "The function entry block shouldn't actually be a loop header");
- assert(OnStack.empty() &&
- "Control flow stack pushes and pops should be balanced.");
-#endif
-}
-
/// Test whether Pred has any terminators explicitly branching to MBB, as
/// opposed to falling through. Note that it's possible (eg. in unoptimized
/// code) for a branch instruction to both branch to a block and fallthrough
@@ -488,6 +270,15 @@ static void FixEndsAtEndOfFunction(
}
}
+// WebAssembly functions end with an end instruction, as if the function body
+// were a block.
+static void AppendEndToFunction(
+ MachineFunction &MF,
+ const WebAssemblyInstrInfo &TII) {
+ BuildMI(MF.back(), MF.back().end(), DebugLoc(),
+ TII.get(WebAssembly::END_FUNCTION));
+}
+
/// Insert LOOP and BLOCK markers at appropriate places.
static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
const WebAssemblyInstrInfo &TII,
@@ -555,6 +346,11 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
// Fix up block/loop signatures at the end of the function to conform to
// WebAssembly's rules.
FixEndsAtEndOfFunction(MF, MFI, BlockTops, LoopTops);
+
+ // Add an end instruction at the end of the function body.
+ if (!MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF())
+ AppendEndToFunction(MF, TII);
}
bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
@@ -569,9 +365,6 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MF.getRegInfo().invalidateLiveness();
- // Sort the blocks, with contiguous loops.
- SortBlocks(MF, MLI, MDT);
-
// Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
PlaceMarkers(MF, MLI, TII, MDT, MFI);
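
With SortBlocks moved out, this pass now only places the markers. The reason
the markers matter: a WebAssembly br does not name a label, it names a relative
depth into the stack of enclosing BLOCK/LOOP scopes. A minimal sketch of that
mapping (hypothetical code; small integers stand in for scope identities):

#include <cassert>
#include <vector>

// ScopeStack holds the currently open scopes, outermost first. The branch
// immediate is 0 for the innermost enclosing scope, 1 for the next one out,
// and so on.
unsigned RelativeDepth(const std::vector<int> &ScopeStack, int Target) {
  for (unsigned I = 0; I != ScopeStack.size(); ++I)
    if (ScopeStack[ScopeStack.size() - 1 - I] == Target)
      return I;
  assert(false && "branch target is not an enclosing scope");
  return ~0u;
}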
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
index fc0a01c..b2330a2 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -22,8 +22,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -97,15 +97,28 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
MI.setDesc(Desc);
// Rewrite argument order
- auto Uses = MI.explicit_uses();
- MachineInstr::mop_iterator it = Uses.begin();
- const MachineOperand MO = *it;
+ SmallVector<MachineOperand, 8> Ops;
+
+ // Set up a placeholder for the type signature immediate.
+ Ops.push_back(MachineOperand::CreateImm(0));
// Set up the flags immediate, which currently has no defined flags
// so it's always zero.
- it->ChangeToImmediate(0);
-
- MI.addOperand(MF, MO);
+ Ops.push_back(MachineOperand::CreateImm(0));
+
+ for (const MachineOperand &MO :
+ make_range(MI.operands_begin() +
+ MI.getDesc().getNumDefs() + 1,
+ MI.operands_begin() +
+ MI.getNumExplicitOperands()))
+ Ops.push_back(MO);
+ Ops.push_back(MI.getOperand(MI.getDesc().getNumDefs()));
+
+ // Replace the instruction's operands.
+ while (MI.getNumOperands() > MI.getDesc().getNumDefs())
+ MI.RemoveOperand(MI.getNumOperands() - 1);
+ for (const MachineOperand &MO : Ops)
+ MI.addOperand(MO);
DEBUG(dbgs() << " After transform: " << MI);
Changed = true;
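
The operand shuffle above is easy to lose in the BuildMI plumbing. Reduced to
plain vectors, the rewrite is the following (a sketch; ints stand in for
MachineOperand, and the pseudo's explicit non-def operand order, callee first
and then arguments, is as in the pass):

#include <vector>

std::vector<int> RewriteCallIndirectOps(const std::vector<int> &Ops) {
  // Ops holds the explicit non-def operands: the callee, then the arguments.
  std::vector<int> New;
  New.push_back(0); // placeholder for the type signature immediate
  New.push_back(0); // flags immediate; no flags are defined yet, so zero
  New.insert(New.end(), Ops.begin() + 1, Ops.end()); // the call arguments
  New.push_back(Ops.front()); // the callee moves to the end
  return New;
}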
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 04ede7f..4124911 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -31,6 +31,14 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-explicit-locals"
+// A command-line option to disable this pass. Note that this produces output
+// which is not valid WebAssembly, though it may be more convenient when
+// writing LLVM unit tests.
+static cl::opt<bool> DisableWebAssemblyExplicitLocals(
+ "disable-wasm-explicit-locals", cl::ReallyHidden,
+ cl::desc("WebAssembly: Disable emission of get_local/set_local."),
+ cl::init(false));
+
namespace {
class WebAssemblyExplicitLocals final : public MachineFunctionPass {
StringRef getPassName() const override {
@@ -60,7 +68,25 @@ FunctionPass *llvm::createWebAssemblyExplicitLocals() {
/// if it doesn't yet have one.
static unsigned getLocalId(DenseMap<unsigned, unsigned> &Reg2Local,
unsigned &CurLocal, unsigned Reg) {
- return Reg2Local.insert(std::make_pair(Reg, CurLocal++)).first->second;
+ auto P = Reg2Local.insert(std::make_pair(Reg, CurLocal));
+ if (P.second)
+ ++CurLocal;
+ return P.first->second;
+}
+
+/// Get the appropriate drop opcode for the given register class.
+static unsigned getDropOpcode(const TargetRegisterClass *RC) {
+ if (RC == &WebAssembly::I32RegClass)
+ return WebAssembly::DROP_I32;
+ if (RC == &WebAssembly::I64RegClass)
+ return WebAssembly::DROP_I64;
+ if (RC == &WebAssembly::F32RegClass)
+ return WebAssembly::DROP_F32;
+ if (RC == &WebAssembly::F64RegClass)
+ return WebAssembly::DROP_F64;
+ if (RC == &WebAssembly::V128RegClass)
+ return WebAssembly::DROP_V128;
+ llvm_unreachable("Unexpected register class");
}
/// Get the appropriate get_local opcode for the given register class.
@@ -146,6 +172,10 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
"********** Function: "
<< MF.getName() << '\n');
+ // Disable this pass if directed to do so.
+ if (DisableWebAssemblyExplicitLocals)
+ return false;
+
// Disable this pass if we aren't doing direct wasm object emission.
if (MF.getSubtarget<WebAssemblySubtarget>()
.getTargetTriple().isOSBinFormatELF())
@@ -176,6 +206,12 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Start assigning local numbers after the last parameter.
unsigned CurLocal = MFI.getParams().size();
+ // Precompute the set of registers that are unused, so that we can insert
+ // drops to their defs.
+ BitVector UseEmpty(MRI.getNumVirtRegs());
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i)
+ UseEmpty[i] = MRI.use_empty(TargetRegisterInfo::index2VirtReg(i));
+
// Visit each instruction in the function.
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
@@ -224,15 +260,26 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
assert(MI.getDesc().getNumDefs() <= 1);
if (MI.getDesc().getNumDefs() == 1) {
unsigned OldReg = MI.getOperand(0).getReg();
- if (!MFI.isVRegStackified(OldReg) && !MRI.use_empty(OldReg)) {
- unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ if (!MFI.isVRegStackified(OldReg)) {
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
unsigned NewReg = MRI.createVirtualRegister(RC);
auto InsertPt = std::next(MachineBasicBlock::iterator(&MI));
- unsigned Opc = getSetLocalOpcode(RC);
- BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
- .addImm(LocalId)
- .addReg(NewReg);
+ if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
+ MI.eraseFromParent();
+ Changed = true;
+ continue;
+ }
+ if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) {
+ unsigned Opc = getDropOpcode(RC);
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(NewReg);
+ } else {
+ unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned Opc = getSetLocalOpcode(RC);
+ BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+ .addImm(LocalId)
+ .addReg(NewReg);
+ }
MI.getOperand(0).setReg(NewReg);
MFI.stackifyVReg(NewReg);
Changed = true;
@@ -278,13 +325,16 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
}
// Define the locals.
+ // TODO: Sort the locals for better compression.
+ MFI.setNumLocals(CurLocal - MFI.getParams().size());
for (size_t i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
auto I = Reg2Local.find(Reg);
if (I == Reg2Local.end() || I->second < MFI.getParams().size())
continue;
- MFI.addLocal(typeForRegClass(MRI.getRegClass(Reg)));
+ MFI.setLocal(I->second - MFI.getParams().size(),
+ typeForRegClass(MRI.getRegClass(Reg)));
Changed = true;
}
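
The getLocalId change fixes a subtle bug: the old one-liner evaluated
CurLocal++ before knowing whether the insertion happened, so looking up an
already-mapped register still consumed a fresh local number. The corrected
idiom, standalone (a sketch using std::map in place of DenseMap):

#include <map>

unsigned GetOrAssignLocal(std::map<unsigned, unsigned> &Reg2Local,
                          unsigned &CurLocal, unsigned Reg) {
  auto P = Reg2Local.insert({Reg, CurLocal});
  if (P.second) // only consume a local number for a genuinely new register
    ++CurLocal;
  return P.first->second;
}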
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index bc7020f..c980f4b 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -16,8 +16,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
@@ -63,12 +63,16 @@ class WebAssemblyFastISel final : public FastISel {
public:
// Innocuous defaults for our address.
Address() : Kind(RegBase), Offset(0), GV(0) { Base.Reg = 0; }
- void setKind(BaseKind K) { Kind = K; }
+ void setKind(BaseKind K) {
+ assert(!isSet() && "Can't change kind with non-zero base");
+ Kind = K;
+ }
BaseKind getKind() const { return Kind; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
void setReg(unsigned Reg) {
assert(isRegBase() && "Invalid base register access!");
+ assert(Base.Reg == 0 && "Overwriting non-zero register");
Base.Reg = Reg;
}
unsigned getReg() const {
@@ -77,6 +81,7 @@ class WebAssemblyFastISel final : public FastISel {
}
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
+ assert(Base.FI == 0 && "Overwriting non-zero frame index");
Base.FI = FI;
}
unsigned getFI() const {
@@ -91,6 +96,13 @@ class WebAssemblyFastISel final : public FastISel {
int64_t getOffset() const { return Offset; }
void setGlobalValue(const GlobalValue *G) { GV = G; }
const GlobalValue *getGlobalValue() const { return GV; }
+ bool isSet() const {
+ if (isRegBase()) {
+ return Base.Reg != 0;
+ } else {
+ return Base.FI != 0;
+ }
+ }
};
/// Keep a pointer to the WebAssemblySubtarget around so that we can make the
@@ -116,6 +128,8 @@ private:
case MVT::f32:
case MVT::f64:
return VT;
+ case MVT::f16:
+ return MVT::f32;
case MVT::v16i8:
case MVT::v8i16:
case MVT::v4i32:
@@ -295,6 +309,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
DenseMap<const AllocaInst *, int>::iterator SI =
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
+ if (Addr.isSet()) {
+ return false;
+ }
Addr.setKind(Address::FrameIndexBase);
Addr.setFI(SI->second);
return true;
@@ -339,6 +356,9 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
break;
}
}
+ if (Addr.isSet()) {
+ return false;
+ }
Addr.setReg(getRegForValue(Obj));
return Addr.getReg() != 0;
}
@@ -594,12 +614,12 @@ bool WebAssemblyFastISel::fastLowerArguments() {
unsigned i = 0;
for (auto const &Arg : F->args()) {
- const AttributeSet &Attrs = F->getAttributes();
- if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
- Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
- Attrs.hasAttribute(i+1, Attribute::Nest))
+ const AttributeList &Attrs = F->getAttributes();
+ if (Attrs.hasParamAttribute(i, Attribute::ByVal) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftError) ||
+ Attrs.hasParamAttribute(i, Attribute::InAlloca) ||
+ Attrs.hasParamAttribute(i, Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
@@ -744,19 +764,19 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
return false;
- const AttributeSet &Attrs = Call->getAttributes();
- if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
- Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
- Attrs.hasAttribute(i+1, Attribute::InAlloca) ||
- Attrs.hasAttribute(i+1, Attribute::Nest))
+ const AttributeList &Attrs = Call->getAttributes();
+ if (Attrs.hasParamAttribute(i, Attribute::ByVal) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftSelf) ||
+ Attrs.hasParamAttribute(i, Attribute::SwiftError) ||
+ Attrs.hasParamAttribute(i, Attribute::InAlloca) ||
+ Attrs.hasParamAttribute(i, Attribute::Nest))
return false;
unsigned Reg;
- if (Attrs.hasAttribute(i+1, Attribute::SExt))
+ if (Attrs.hasParamAttribute(i, Attribute::SExt))
Reg = getRegForSignedValue(V);
- else if (Attrs.hasAttribute(i+1, Attribute::ZExt))
+ else if (Attrs.hasParamAttribute(i, Attribute::ZExt))
Reg = getRegForUnsignedValue(V);
else
Reg = getRegForValue(V);
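
The new asserts and isSet() checks in the Address class all enforce one
invariant: an address may pick up at most one base, register or frame index,
and computeAddress must bail out rather than silently overwrite one. A
stripped-down model of the guard (hypothetical code, register case only):

#include <cassert>

struct Address {
  unsigned Reg = 0;
  bool isSet() const { return Reg != 0; }
  void setReg(unsigned R) {
    assert(!isSet() && "overwriting a non-zero base register");
    Reg = R;
  }
};

// Bail out on a second base candidate; the caller then falls back to
// materializing the whole address into a single register.
bool TryAddBase(Address &Addr, unsigned CandidateBase) {
  if (Addr.isSet())
    return false;
  Addr.setReg(CandidateBase);
  return true;
}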
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index adf904e..76a2ff3 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -84,7 +84,7 @@ static void FindUses(Value *V, Function &F,
// - Call with fewer arguments than needed: arguments are filled in with undef
// - Return value is not needed: drop it
// - Return value needed but not present: supply an undef
-//
+//
// For now, return nullptr without creating a wrapper if the wrapper cannot
// be generated due to incompatible types.
static Function *CreateWrapper(Function *F, FunctionType *Ty) {
@@ -148,6 +148,11 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
if (!Ty)
continue;
+ // Wasm varargs are not ABI-compatible with non-varargs. Just ignore
+ // such casts for now.
+ if (Ty->isVarArg() || F->isVarArg())
+ continue;
+
auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr));
if (Pair.second)
Pair.first->second = CreateWrapper(F, Ty);
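
For a concrete picture of the wrappers this pass builds (extra arguments
dropped, missing ones filled in with undef, as the file header describes),
here is a hand-written analogy in plain C++; the names and the use of 0 as an
undef stand-in are illustrative only:

int Callee(int A, int B) { return A + B; }

// A call site whose bitcast type passes only one argument gets a wrapper
// that supplies a dummy for the missing parameter, the way the pass
// supplies undef in IR.
int CalleeWrapper(int A) { return Callee(A, /*undef stand-in*/ 0); }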
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
index 2bbf7a2..41f315c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -26,8 +26,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/ADT/PriorityQueue.h"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index a6a2c0b..a37d613 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -24,10 +24,11 @@
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -101,25 +102,33 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
MachineBasicBlock::iterator &InsertAddr,
MachineBasicBlock::iterator &InsertStore,
const DebugLoc &DL) {
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetRegisterClass *PtrRC =
- MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
- const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF()) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterClass *PtrRC =
+ MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
- BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOStore, 4, 4);
- BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero)
- .addReg(SrcReg)
- .addMemOperand(MMO);
+ BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOStore, 4, 4);
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
+ .addImm(2) // p2align
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero)
+ .addReg(SrcReg)
+ .addMemOperand(MMO);
+ } else {
+ BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
+ .addExternalSymbol(SPSymbol)
+ .addReg(SrcReg);
+ }
}
MachineBasicBlock::iterator
@@ -151,27 +160,37 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
auto &MRI = MF.getRegInfo();
auto InsertPt = MBB.begin();
+ while (InsertPt != MBB.end() && WebAssembly::isArgument(*InsertPt))
+ ++InsertPt;
DebugLoc DL;
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned Zero = MRI.createVirtualRegister(PtrRC);
unsigned SPReg = WebAssembly::SP32;
if (StackSize)
SPReg = MRI.createVirtualRegister(PtrRC);
+
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
- .addImm(0);
- MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
- MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
- MachineMemOperand::MOLoad, 4, 4);
- // Load the SP value.
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
- .addImm(2) // p2align
- .addExternalSymbol(SPSymbol)
- .addReg(Zero) // addr
- .addMemOperand(LoadMMO);
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF()) {
+ unsigned Zero = MRI.createVirtualRegister(PtrRC);
+
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
+ .addImm(0);
+ MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
+ MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+ MachineMemOperand::MOLoad, 4, 4);
+ // Load the SP value.
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
+ .addImm(2) // p2align
+ .addExternalSymbol(SPSymbol)
+ .addReg(Zero) // addr
+ .addMemOperand(LoadMMO);
+ } else {
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
+ .addExternalSymbol(SPSymbol);
+ }
bool HasBP = hasBP(MF);
if (HasBP) {
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index a67137f..4f3ae57 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -12,12 +12,13 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyTargetMachine.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6a7f75a..8143770 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -84,8 +84,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE})
setCondCodeAction(CC, T, Expand);
// Expand floating-point library function operators.
- for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW,
- ISD::FREM, ISD::FMA})
+ for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM,
+ ISD::FMA})
setOperationAction(Op, T, Expand);
// Note supported floating-point library function operators that otherwise
// default to expand.
@@ -95,6 +95,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Support minnan and maxnan, which otherwise default to expand.
setOperationAction(ISD::FMINNAN, T, Legal);
setOperationAction(ISD::FMAXNAN, T, Legal);
+ // WebAssembly currently has no builtin f16 support.
+ setOperationAction(ISD::FP16_TO_FP, T, Expand);
+ setOperationAction(ISD::FP_TO_FP16, T, Expand);
+ setLoadExtAction(ISD::EXTLOAD, T, MVT::f16, Expand);
+ setTruncStoreAction(T, MVT::f16, Expand);
}
for (auto T : {MVT::i32, MVT::i64}) {
@@ -253,7 +258,8 @@ bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
return true;
}
-bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
+ AttributeList Attr) const {
// The current thinking is that wasm engines will perform this optimization,
// so we can save on code size.
return true;
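
Marking FP16_TO_FP and FP_TO_FP16 as Expand means f16 values are handled in
software, typically through a runtime-library conversion. As a sketch of what
that conversion must compute (not the code LLVM emits), here is a standalone
IEEE half-to-float widening covering normals, subnormals, infinities, and
NaNs:

#include <cstdint>
#include <cstring>

float HalfToFloat(uint16_t H) {
  uint32_t Sign = (uint32_t)(H >> 15) << 31;
  uint32_t Exp = (H >> 10) & 0x1f;
  uint32_t Mant = H & 0x3ff;
  uint32_t Bits;
  if (Exp == 0x1f) {
    Bits = Sign | 0x7f800000 | (Mant << 13); // infinity or NaN
  } else if (Exp != 0) {
    Bits = Sign | ((Exp - 15 + 127) << 23) | (Mant << 13); // normal
  } else if (Mant == 0) {
    Bits = Sign; // signed zero
  } else {
    // Subnormal half: shift the mantissa up until it has an implicit
    // leading one, adjusting the exponent to match.
    int E = -1;
    do {
      ++E;
      Mant <<= 1;
    } while ((Mant & 0x400) == 0);
    Bits = Sign | ((uint32_t)(127 - 15 - E) << 23) | ((Mant & 0x3ff) << 13);
  }
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}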
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 5bc7230..99d3d0d 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -58,7 +58,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
unsigned AS) const override;
bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
bool *Fast) const override;
- bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 047f4be..6b45839 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in {
// Call sequence markers. These have an immediate which represents the amount of
// stack space to allocate or free, which is used for varargs lowering.
let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt),
- [(WebAssemblycallseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2),
[(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
} // isCodeGenOnly = 1
@@ -30,13 +30,15 @@ multiclass CALL<WebAssemblyRegClass vt, string prefix> {
[(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
!strconcat(prefix, "call\t$dst, $callee"),
0x10>;
+
let isCodeGenOnly = 1 in {
def PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
[(set vt:$dst, (WebAssemblycall1 I32:$callee))],
"PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins i32imm:$flags, variable_ops),
+ def CALL_INDIRECT_#vt : I<(outs vt:$dst),
+ (ins TypeIndex:$type, i32imm:$flags, variable_ops),
[],
!strconcat(prefix, "call_indirect\t$dst"),
0x11>;
@@ -48,6 +50,7 @@ multiclass SIMD_CALL<ValueType vt, string prefix> {
(WebAssemblycall1 (i32 imm:$callee)))],
!strconcat(prefix, "call\t$dst, $callee"),
0x10>;
+
let isCodeGenOnly = 1 in {
def PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
(ins I32:$callee, variable_ops),
@@ -57,7 +60,8 @@ multiclass SIMD_CALL<ValueType vt, string prefix> {
} // isCodeGenOnly = 1
def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
- (ins i32imm:$flags, variable_ops),
+ (ins TypeIndex:$type, i32imm:$flags,
+ variable_ops),
[],
!strconcat(prefix, "call_indirect\t$dst"),
0x11>;
@@ -76,13 +80,15 @@ let Uses = [SP32, SP64], isCall = 1 in {
def CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
[(WebAssemblycall0 (i32 imm:$callee))],
"call \t$callee", 0x10>;
+
let isCodeGenOnly = 1 in {
def PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
[(WebAssemblycall0 I32:$callee)],
"PSEUDO CALL INDIRECT\t$callee">;
} // isCodeGenOnly = 1
- def CALL_INDIRECT_VOID : I<(outs), (ins i32imm:$flags, variable_ops),
+ def CALL_INDIRECT_VOID : I<(outs),
+ (ins TypeIndex:$type, i32imm:$flags, variable_ops),
[],
"call_indirect\t", 0x11>;
} // Uses = [SP32,SP64], isCall = 1
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 1146431..1297941 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -57,16 +57,21 @@ def BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
}
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
-// Placemarkers to indicate the start or end of a block or loop scope. These
-// use/clobber VALUE_STACK to prevent them from being moved into the middle of
-// an expression tree.
+// Placemarkers to indicate the start or end of a block, loop, or try scope.
+// These use/clobber VALUE_STACK to prevent them from being moved into the
+// middle of an expression tree.
let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
def BLOCK : I<(outs), (ins Signature:$sig), [], "block \t$sig", 0x02>;
def LOOP : I<(outs), (ins Signature:$sig), [], "loop \t$sig", 0x03>;
+def TRY : I<(outs), (ins Signature:$sig), [], "try \t$sig", 0x06>;
-// END_BLOCK and END_LOOP are represented with the same opcode in wasm.
+// END_BLOCK, END_LOOP, END_TRY, and END_FUNCTION are represented with the same
+// opcode in wasm.
def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>;
def END_LOOP : I<(outs), (ins), [], "end_loop", 0x0b>;
+def END_TRY : I<(outs), (ins), [], "end_try", 0x0b>;
+let isTerminator = 1, isBarrier = 1 in
+def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
multiclass RETURN<WebAssemblyRegClass vt> {
@@ -109,6 +114,20 @@ let isReturn = 1 in {
def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable", 0x00>;
+def THROW_I32 : I<(outs), (ins i32imm:$tag, I32:$obj),
+ [(int_wasm_throw imm:$tag, I32:$obj)], "throw \t$tag, $obj",
+ 0x08>;
+def THROW_I64 : I<(outs), (ins i32imm:$tag, I64:$obj),
+ [(int_wasm_throw imm:$tag, I64:$obj)], "throw \t$tag, $obj",
+ 0x08>;
+def RETHROW : I<(outs), (ins i32imm:$rel_depth), [], "rethrow \t$rel_depth",
+ 0x09>;
+
} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
} // Defs = [ARGUMENTS]
+
+// rethrow takes a relative depth as an argument, for which currently only 0 is
+// possible for C++. Once other languages need depths other than 0, depths will
+// be computed in CFGStackify.
+def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 030be08..03c9c1f 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -55,8 +55,8 @@ defm EQ : ComparisonFP<SETOEQ, "eq ", 0x5b, 0x61>;
defm NE : ComparisonFP<SETUNE, "ne ", 0x5c, 0x62>;
} // isCommutable = 1
defm LT : ComparisonFP<SETOLT, "lt ", 0x5d, 0x63>;
-defm LE : ComparisonFP<SETOLE, "le ", 0x5e, 0x64>;
-defm GT : ComparisonFP<SETOGT, "gt ", 0x5f, 0x65>;
+defm LE : ComparisonFP<SETOLE, "le ", 0x5f, 0x65>;
+defm GT : ComparisonFP<SETOGT, "gt ", 0x5e, 0x64>;
defm GE : ComparisonFP<SETOGE, "ge ", 0x60, 0x66>;
} // Defs = [ARGUMENTS]
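
The LE/GT swap above is a correctness fix against the WebAssembly binary
encoding, in which gt precedes le within each type's comparison group. For
reference, the opcode assignments from the WebAssembly MVP opcode table:

enum : unsigned {
  F32_EQ = 0x5b, F32_NE = 0x5c, F32_LT = 0x5d,
  F32_GT = 0x5e, F32_LE = 0x5f, F32_GE = 0x60,
  F64_EQ = 0x61, F64_NE = 0x62, F64_LT = 0x63,
  F64_GT = 0x64, F64_LE = 0x65, F64_GE = 0x66,
};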
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 0e2d8bb..8846952 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -183,11 +183,9 @@ unsigned WebAssemblyInstrInfo::insertBranch(MachineBasicBlock &MBB,
assert(Cond.size() == 2 && "Expected a flag and a successor block");
if (Cond[0].getImm()) {
- BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(WebAssembly::BR_IF)).addMBB(TBB).add(Cond[1]);
} else {
- BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS))
- .addMBB(TBB)
- .addOperand(Cond[1]);
+ BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)).addMBB(TBB).add(Cond[1]);
}
if (!FBB)
return 1;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index dcfd1a4..fa2146f 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -25,7 +25,8 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">,
// WebAssembly-specific DAG Node Types.
//===----------------------------------------------------------------------===//
-def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+ SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCallSeqEnd :
SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
@@ -74,6 +75,9 @@ def bb_op : Operand<OtherVT>;
let OperandType = "OPERAND_LOCAL" in
def local_op : Operand<i32>;
+let OperandType = "OPERAND_GLOBAL" in
+def global_op : Operand<i32>;
+
let OperandType = "OPERAND_I32IMM" in
def i32imm_op : Operand<i32>;
@@ -104,6 +108,9 @@ def Signature : Operand<i32> {
}
} // OperandType = "OPERAND_SIGNATURE"
+let OperandType = "OPERAND_TYPEINDEX" in
+def TypeIndex : Operand<i32>;
+
} // OperandNamespace = "WebAssembly"
//===----------------------------------------------------------------------===//
@@ -178,6 +185,18 @@ let hasSideEffects = 0 in {
def TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src), [],
"tee_local\t$res, $local, $src", 0x22>;
+ // Unused values must be dropped in some contexts.
+ def DROP_#vt : I<(outs), (ins vt:$src), [],
+ "drop\t$src", 0x1a>;
+
+ let mayLoad = 1 in
+ def GET_GLOBAL_#vt : I<(outs vt:$res), (ins global_op:$local), [],
+ "get_global\t$res, $local", 0x23>;
+
+ let mayStore = 1 in
+ def SET_GLOBAL_#vt : I<(outs), (ins global_op:$local, vt:$src), [],
+ "set_global\t$local, $src", 0x24>;
+
} // hasSideEffects = 0
}
defm : LOCAL<I32>;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index b606ebb..365b327 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -26,18 +26,18 @@
// offset for an add that needs wrapping.
def regPlusImm : PatFrag<(ops node:$addr, node:$off),
(add node:$addr, node:$off),
- [{ return N->getFlags()->hasNoUnsignedWrap(); }]>;
+ [{ return N->getFlags().hasNoUnsignedWrap(); }]>;
// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
- APInt KnownZero0, KnownOne0;
- CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
- APInt KnownZero1, KnownOne1;
- CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
- return (~KnownZero0 & ~KnownZero1) == 0;
+ KnownBits Known0;
+ CurDAG->computeKnownBits(N->getOperand(0), Known0, 0);
+ KnownBits Known1;
+ CurDAG->computeKnownBits(N->getOperand(1), Known1, 0);
+ return (~Known0.Zero & ~Known1.Zero) == 0;
}]>;
// GlobalAddresses are conceptually unsigned values, so we can also fold them
@@ -47,7 +47,7 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
def regPlusGA : PatFrag<(ops node:$addr, node:$off),
(add node:$addr, node:$off),
[{
- return N->getFlags()->hasNoUnsignedWrap();
+ return N->getFlags().hasNoUnsignedWrap();
}]>;
// We don't need a regPlusES because external symbols never have constant
@@ -673,9 +673,9 @@ def CURRENT_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
Requires<[HasAddr32]>;
// Grow memory.
-def GROW_MEMORY_I32 : I<(outs), (ins i32imm:$flags, I32:$delta),
+def GROW_MEMORY_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
[],
- "grow_memory\t$delta", 0x40>,
+ "grow_memory\t$dst, $delta", 0x40>,
Requires<[HasAddr32]>;
} // Defs = [ARGUMENTS]
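
The or_is_add update swaps the old KnownZero/KnownOne pair for the KnownBits
struct, but the underlying test is unchanged: an or may be selected as an add
when no bit position can produce a carry. On plain 32-bit masks the check
looks like this (a sketch; the known-zero masks are assumed already computed):

#include <cstdint>

// If every bit is known zero in at least one operand, the operands have no
// overlapping set bits, so a | b == a + b (no carries are possible).
bool OrIsAdd(uint32_t KnownZeroA, uint32_t KnownZeroB) {
  return (~KnownZeroA & ~KnownZeroB) == 0;
}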
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index 7ea5d05..576b71d 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -15,8 +15,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -118,7 +118,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
// delete the br_unless.
assert(Inverted);
BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF))
- .addOperand(MI->getOperand(0))
+ .add(MI->getOperand(0))
.addReg(Cond);
MBB.erase(MI);
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 72cb1cc..f0b6a3e 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -412,7 +412,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
if (CI->doesNotReturn()) {
if (auto *F = dyn_cast<Function>(CI->getCalledValue()))
F->removeFnAttr(Attribute::NoReturn);
- CI->removeAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn);
+ CI->removeAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
}
IRBuilder<> IRB(C);
@@ -435,25 +435,20 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
// Because we added the pointer to the callee as first argument, all
// argument attribute indices have to be incremented by one.
- SmallVector<AttributeSet, 8> AttributesVec;
- const AttributeSet &InvokePAL = CI->getAttributes();
- CallSite::arg_iterator AI = CI->arg_begin();
- unsigned i = 1; // Argument attribute index starts from 1
- for (unsigned e = CI->getNumArgOperands(); i <= e; ++AI, ++i) {
- if (InvokePAL.hasAttributes(i)) {
- AttrBuilder B(InvokePAL, i);
- AttributesVec.push_back(AttributeSet::get(C, i + 1, B));
- }
- }
- // Add any return attributes.
- if (InvokePAL.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getRetAttributes()));
- // Add any function attributes.
- if (InvokePAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getFnAttributes()));
+ SmallVector<AttributeSet, 8> ArgAttributes;
+ const AttributeList &InvokeAL = CI->getAttributes();
+
+ // No attributes for the callee pointer.
+ ArgAttributes.push_back(AttributeSet());
+ // Copy the argument attributes from the original
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i < e; ++i)
+ ArgAttributes.push_back(InvokeAL.getParamAttributes(i));
+
// Reconstruct the AttributesList based on the vector we constructed.
- AttributeSet NewCallPAL = AttributeSet::get(C, AttributesVec);
- NewCall->setAttributes(NewCallPAL);
+ AttributeList NewCallAL =
+ AttributeList::get(C, InvokeAL.getFnAttributes(),
+ InvokeAL.getRetAttributes(), ArgAttributes);
+ NewCall->setAttributes(NewCallAL);
CI->replaceAllUsesWith(NewCall);
@@ -624,7 +619,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::createSetThrewFunction(Module &M) {
Function *F =
Function::Create(FTy, GlobalValue::ExternalLinkage, SetThrewFName, &M);
Argument *Arg1 = &*(F->arg_begin());
- Argument *Arg2 = &*(++F->arg_begin());
+ Argument *Arg2 = &*std::next(F->arg_begin());
Arg1->setName("threw");
Arg2->setName("value");
BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
@@ -902,7 +897,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
}
}
- // Look for orphan landingpads, can occur in blocks with no predecesors
+ // Look for orphan landingpads, which can occur in blocks with no predecessors
for (BasicBlock &BB : F) {
Instruction *I = BB.getFirstNonPHI();
if (auto *LPI = dyn_cast<LandingPadInst>(I))
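
The attribute rewrite above replaces error-prone integer index arithmetic (the
old loop shifted each attribute index by one by hand) with a positional list.
Reduced to standard containers, the shift amounts to this (a sketch; strings
stand in for AttributeSet):

#include <string>
#include <vector>

std::vector<std::string> ShiftForPrependedCallee(
    const std::vector<std::string> &OrigArgAttrs) {
  std::vector<std::string> Shifted;
  Shifted.push_back(""); // no attributes for the prepended callee pointer
  for (const std::string &A : OrigArgAttrs)
    Shifted.push_back(A); // original argument i becomes argument i + 1
  return Shifted;
}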
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 022a448..8880539 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -14,7 +14,10 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyAsmPrinter.h"
#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyRuntimeLibcallSignatures.h"
+#include "WebAssemblyUtilities.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Constants.h"
@@ -22,18 +25,85 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
MCSymbol *
WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
- return Printer.getSymbol(MO.getGlobal());
+ const GlobalValue *Global = MO.getGlobal();
+ MCSymbol *Sym = Printer.getSymbol(Global);
+ if (isa<MCSymbolELF>(Sym))
+ return Sym;
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+
+ if (const auto *FuncTy = dyn_cast<FunctionType>(Global->getValueType())) {
+ const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
+ const TargetMachine &TM = MF.getTarget();
+ const Function &CurrentFunc = *MF.getFunction();
+
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+
+ wasm::ValType iPTR =
+ MF.getSubtarget<WebAssemblySubtarget>().hasAddr64() ?
+ wasm::ValType::I64 :
+ wasm::ValType::I32;
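+ // (Note: pointer-sized values are i64 on wasm64, i32 on the default wasm32.)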
+
+ SmallVector<MVT, 4> ResultMVTs;
+ ComputeLegalValueVTs(CurrentFunc, TM, FuncTy->getReturnType(), ResultMVTs);
+ // WebAssembly can't currently handle returning tuples.
+ if (ResultMVTs.size() <= 1)
+ for (MVT ResultMVT : ResultMVTs)
+ Returns.push_back(WebAssembly::toValType(ResultMVT));
+ else
+ Params.push_back(iPTR);
+
+ for (Type *Ty : FuncTy->params()) {
+ SmallVector<MVT, 4> ParamMVTs;
+ ComputeLegalValueVTs(CurrentFunc, TM, Ty, ParamMVTs);
+ for (MVT ParamMVT : ParamMVTs)
+ Params.push_back(WebAssembly::toValType(ParamMVT));
+ }
+
+ if (FuncTy->isVarArg())
+ Params.push_back(iPTR);
+
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setIsFunction(true);
+ }
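+ // Illustrative example (editorial, not part of the original change): for
+ // declare i64 @f(i32, float, ...) on wasm32, this records Returns = {I64}
+ // and Params = {I32, F32, I32 (the vararg iPTR)} on the symbol.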
+
+ return WasmSym;
}
MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
const MachineOperand &MO) const {
- return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+ const char *Name = MO.getSymbolName();
+ MCSymbol *Sym = Printer.GetExternalSymbolSymbol(Name);
+ if (isa<MCSymbolELF>(Sym))
+ return Sym;
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
+
+ // __stack_pointer is a global variable; all other external symbols used by
+ // CodeGen are functions.
+ if (strcmp(Name, "__stack_pointer") == 0)
+ return WasmSym;
+
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+ GetSignature(Subtarget, Name, Returns, Params);
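+ // E.g. (illustrative): Name == "memcpy" resolves to the MEMCPY entry of
+ // the libcall tables, yielding Returns = {iPTR} and
+ // Params = {iPTR, iPTR, iPTR}.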
+
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setIsFunction(true);
+
+ return WasmSym;
}
MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
@@ -42,6 +112,7 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
MCSymbolRefExpr::VariantKind VK =
IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION
: MCSymbolRefExpr::VK_None;
+
const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx);
if (Offset != 0) {
@@ -54,20 +125,34 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
return MCOperand::createExpr(Expr);
}
+// Return the WebAssembly type associated with the given register class.
+static wasm::ValType getType(const TargetRegisterClass *RC) {
+ if (RC == &WebAssembly::I32RegClass)
+ return wasm::ValType::I32;
+ if (RC == &WebAssembly::I64RegClass)
+ return wasm::ValType::I64;
+ if (RC == &WebAssembly::F32RegClass)
+ return wasm::ValType::F32;
+ if (RC == &WebAssembly::F64RegClass)
+ return wasm::ValType::F64;
+ llvm_unreachable("Unexpected register class");
+}
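+
+// For example (illustrative), a value in I32RegClass maps to
+// wasm::ValType::I32; Lower() below uses this to build call_indirect
+// type-index signatures from the operands' register classes.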
+
void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
+ const MCInstrDesc &Desc = MI->getDesc();
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
MCOperand MCOp;
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_MachineBasicBlock:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("MachineBasicBlock operand should have been rewritten");
case MachineOperand::MO_Register: {
// Ignore all implicit register operands.
@@ -80,6 +165,41 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
break;
}
case MachineOperand::MO_Immediate:
+ if (i < Desc.NumOperands) {
+ const MCOperandInfo &Info = Desc.OpInfo[i];
+ if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+ MCSymbol *Sym = Printer.createTempSymbol("typeindex");
+ if (!isa<MCSymbolELF>(Sym)) {
+ SmallVector<wasm::ValType, 4> Returns;
+ SmallVector<wasm::ValType, 4> Params;
+
+ const MachineRegisterInfo &MRI =
+ MI->getParent()->getParent()->getRegInfo();
+ for (const MachineOperand &MO : MI->defs())
+ Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
+ for (const MachineOperand &MO : MI->explicit_uses())
+ if (MO.isReg())
+ Params.push_back(getType(MRI.getRegClass(MO.getReg())));
+
+ // call_indirect instructions have a callee operand at the end which
+ // doesn't count as a param.
+ if (WebAssembly::isCallIndirect(*MI))
+ Params.pop_back();
+
+ MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+ WasmSym->setReturns(std::move(Returns));
+ WasmSym->setParams(std::move(Params));
+ WasmSym->setIsFunction(true);
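+ // (Illustrative note: this temporary "typeindex" symbol carries the
+ // computed signature so the object writer can emit the matching type
+ // index for this call_indirect site.)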
+
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(WasmSym,
+ MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX,
+ Ctx);
+ MCOp = MCOperand::createExpr(Expr);
+ break;
+ }
+ }
+ }
MCOp = MCOperand::createImm(MO.getImm());
break;
case MachineOperand::MO_FPImmediate: {
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index ab4ba1c..d1d2794 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -20,7 +20,7 @@
#include "llvm/Support/Compiler.h"
namespace llvm {
-class AsmPrinter;
+class WebAssemblyAsmPrinter;
class MCContext;
class MCSymbol;
class MachineInstr;
@@ -29,7 +29,7 @@ class MachineOperand;
/// This class is used to lower an MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
MCContext &Ctx;
- AsmPrinter &Printer;
+ WebAssemblyAsmPrinter &Printer;
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
@@ -37,7 +37,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
bool IsFunc) const;
public:
- WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+ WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
: Ctx(ctx), Printer(printer) {}
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
};
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 756619b..1fcbb77 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -60,6 +60,8 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
void addResult(MVT VT) { Results.push_back(VT); }
const std::vector<MVT> &getResults() const { return Results; }
+ void setNumLocals(size_t NumLocals) { Locals.resize(NumLocals, MVT::i32); }
+ void setLocal(size_t i, MVT VT) { Locals[i] = VT; }
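+ // (Illustrative: setNumLocals(N) pre-sizes the list with N i32
+ // placeholders; setLocal(i, VT) then retypes individual entries.)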
void addLocal(MVT VT) { Locals.push_back(VT); }
const std::vector<MVT> &getLocals() const { return Locals; }
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 96520aa..559165e 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -54,7 +54,7 @@ FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
void OptimizeReturned::visitCallSite(CallSite CS) {
for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i)
- if (CS.paramHasAttr(1 + i, Attribute::Returned)) {
+ if (CS.paramHasAttr(i, Attribute::Returned)) {
Instruction *Inst = CS.getInstruction();
Value *Arg = CS.getArgOperand(i);
// Ignore constants, globals, undef, etc.
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 32dde88..d2fbc5a 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -80,19 +80,31 @@ static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
return false;
if (&MBB != &MF.back())
return false;
- if (&MI != &MBB.back())
- return false;
+ if (MF.getSubtarget<WebAssemblySubtarget>()
+ .getTargetTriple().isOSBinFormatELF()) {
+ if (&MI != &MBB.back())
+ return false;
+ } else {
+ MachineBasicBlock::iterator End = MBB.end();
+ --End;
+ assert(End->getOpcode() == WebAssembly::END_FUNCTION);
+ --End;
+ if (&MI != &*End)
+ return false;
+ }
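+
+ // (Note, illustrative: in non-ELF wasm output every function body ends
+ // with an END_FUNCTION marker, so the candidate return is the
+ // second-to-last instruction rather than the last.)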
- // If the operand isn't stackified, insert a COPY to read the operand and
- // stackify it.
- MachineOperand &MO = MI.getOperand(0);
- unsigned Reg = MO.getReg();
- if (!MFI.isVRegStackified(Reg)) {
- unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
- BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
- .addReg(Reg);
- MO.setReg(NewReg);
- MFI.stackifyVReg(NewReg);
+ if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) {
+ // If the operand isn't stackified, insert a COPY to read the operand and
+ // stackify it.
+ MachineOperand &MO = MI.getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (!MFI.isVRegStackified(Reg)) {
+ unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
+ .addReg(Reg);
+ MO.setReg(NewReg);
+ MFI.stackifyVReg(NewReg);
+ }
}
// Rewrite the return.
@@ -127,7 +139,7 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
if (Name == TLI.getLibcallName(RTLIB::MEMCPY) ||
Name == TLI.getLibcallName(RTLIB::MEMMOVE) ||
Name == TLI.getLibcallName(RTLIB::MEMSET)) {
- LibFunc::Func Func;
+ LibFunc Func;
if (LibInfo.getLibFunc(Name, Func)) {
const auto &Op2 = MI.getOperand(2);
if (!Op2.isReg())
@@ -188,9 +200,9 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
WebAssembly::COPY_V128);
break;
case WebAssembly::RETURN_VOID:
- if (!DisableWebAssemblyFallthroughReturnOpt &&
- &MBB == &MF.back() && &MI == &MBB.back())
- MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN_VOID));
+ Changed |= MaybeRewriteToFallthrough(
+ MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID,
+ WebAssembly::INSTRUCTION_LIST_END);
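+ // (Note: FALLTHROUGH_RETURN_VOID carries no value operand, so the helper
+ // skips the operand-stackification step for it; see the guard above.)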
break;
}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 473dcb7..1462c49 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -19,8 +19,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 5fd4a8d..ba39b6c 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -140,8 +140,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
// Check if it's possible to reuse any of the used colors.
if (!MRI->isLiveIn(Old))
- for (int C(UsedColors.find_first()); C != -1;
- C = UsedColors.find_next(C)) {
+ for (unsigned C : UsedColors.set_bits()) {
if (MRI->getRegClass(SortedIntervals[C]->reg) != RC)
continue;
for (LiveInterval *OtherLI : Assignments[C])
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index e347082..766ab45 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -13,8 +13,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 32ee09e..ea9e3fa 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -20,8 +20,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "WebAssemblyUtilities.h"
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
@@ -152,7 +153,7 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
}
// Determine whether MI reads memory, writes memory, has side effects,
-// and/or uses the __stack_pointer value.
+// and/or uses the stack pointer value.
static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
bool &Write, bool &Effects, bool &StackPointer) {
assert(!MI.isPosition());
@@ -176,8 +177,9 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
auto PSV = MPI.V.get<const PseudoSourceValue *>();
if (const ExternalSymbolPseudoSourceValue *EPSV =
dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
- if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
+ if (StringRef(EPSV->getSymbol()) == "__stack_pointer") {
StackPointer = true;
+ }
}
}
} else if (MI.hasOrderedMemoryRef()) {
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
index 9e944df..878ffd0 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -19,8 +19,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
new file mode 100644
index 0000000..2599064
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -0,0 +1,1323 @@
+//===-- WebAssemblyRuntimeLibcallSignatures.cpp - Libcall signatures -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains signature information for runtime libcalls.
+///
+/// CodeGen uses external symbols, which it refers to by name. The WebAssembly
+/// target needs type information for all functions. This file contains a big
+/// table providing type signatures for all runtime library functions that LLVM
+/// uses.
+///
+/// This is currently a fairly heavy-handed solution.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyRuntimeLibcallSignatures.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+
+using namespace llvm;
+
+namespace {
+
+enum RuntimeLibcallSignature {
+ func,
+ f32_func_f32,
+ f32_func_f64,
+ f32_func_i32,
+ f32_func_i64,
+ f32_func_i16,
+ f64_func_f32,
+ f64_func_f64,
+ f64_func_i32,
+ f64_func_i64,
+ i32_func_f32,
+ i32_func_f64,
+ i32_func_i32,
+ i64_func_f32,
+ i64_func_f64,
+ i64_func_i64,
+ f32_func_f32_f32,
+ f32_func_f32_i32,
+ f32_func_i64_i64,
+ f64_func_f64_f64,
+ f64_func_f64_i32,
+ f64_func_i64_i64,
+ i16_func_f32,
+ i8_func_i8_i8,
+ func_f32_iPTR_iPTR,
+ func_f64_iPTR_iPTR,
+ i16_func_i16_i16,
+ i32_func_f32_f32,
+ i32_func_f64_f64,
+ i32_func_i32_i32,
+ i64_func_i64_i64,
+ i64_i64_func_f32,
+ i64_i64_func_f64,
+ i16_i16_func_i16_i16,
+ i32_i32_func_i32_i32,
+ i64_i64_func_i64_i64,
+ i64_i64_func_i64_i64_i64_i64,
+ i64_i64_i64_i64_func_i64_i64_i64_i64,
+ i64_i64_func_i64_i64_i32,
+ iPTR_func_iPTR_i32_iPTR,
+ iPTR_func_iPTR_iPTR_iPTR,
+ f32_func_f32_f32_f32,
+ f64_func_f64_f64_f64,
+ func_i64_i64_iPTR_iPTR,
+ func_iPTR_f32,
+ func_iPTR_f64,
+ func_iPTR_i32,
+ func_iPTR_i64,
+ func_iPTR_i64_i64,
+ func_iPTR_i64_i64_i64_i64,
+ func_iPTR_i64_i64_i64_i64_i64_i64,
+ i32_func_i64_i64,
+ i32_func_i64_i64_i64_i64,
+ unsupported
+};
+
+} // end anonymous namespace
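+
+// Naming convention (illustrative note): the enumerator text before "func"
+// lists the return types and the text after it lists the parameter types;
+// e.g. i64_i64_func_f32 is a libcall returning {i64, i64} and taking one f32.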
+
+static const RuntimeLibcallSignature
+RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
+// Integer
+/* SHL_I16 */ i16_func_i16_i16,
+/* SHL_I32 */ i32_func_i32_i32,
+/* SHL_I64 */ i64_func_i64_i64,
+/* SHL_I128 */ i64_i64_func_i64_i64_i32,
+/* SRL_I16 */ i16_func_i16_i16,
+/* SRL_I32 */ i32_func_i32_i32,
+/* SRL_I64 */ i64_func_i64_i64,
+/* SRL_I128 */ i64_i64_func_i64_i64_i32,
+/* SRA_I16 */ i16_func_i16_i16,
+/* SRA_I32 */ i32_func_i32_i32,
+/* SRA_I64 */ i64_func_i64_i64,
+/* SRA_I128 */ i64_i64_func_i64_i64_i32,
+/* MUL_I8 */ i8_func_i8_i8,
+/* MUL_I16 */ i16_func_i16_i16,
+/* MUL_I32 */ i32_func_i32_i32,
+/* MUL_I64 */ i64_func_i64_i64,
+/* MUL_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* MULO_I32 */ i32_func_i32_i32,
+/* MULO_I64 */ i64_func_i64_i64,
+/* MULO_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SDIV_I8 */ i8_func_i8_i8,
+/* SDIV_I16 */ i16_func_i16_i16,
+/* SDIV_I32 */ i32_func_i32_i32,
+/* SDIV_I64 */ i64_func_i64_i64,
+/* SDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* UDIV_I8 */ i8_func_i8_i8,
+/* UDIV_I16 */ i16_func_i16_i16,
+/* UDIV_I32 */ i32_func_i32_i32,
+/* UDIV_I64 */ i64_func_i64_i64,
+/* UDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SREM_I8 */ i8_func_i8_i8,
+/* SREM_I16 */ i16_func_i16_i16,
+/* SREM_I32 */ i32_func_i32_i32,
+/* SREM_I64 */ i64_func_i64_i64,
+/* SREM_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* UREM_I8 */ i8_func_i8_i8,
+/* UREM_I16 */ i16_func_i16_i16,
+/* UREM_I32 */ i32_func_i32_i32,
+/* UREM_I64 */ i64_func_i64_i64,
+/* UREM_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SDIVREM_I8 */ i8_func_i8_i8,
+/* SDIVREM_I16 */ i16_i16_func_i16_i16,
+/* SDIVREM_I32 */ i32_i32_func_i32_i32,
+/* SDIVREM_I64 */ i64_func_i64_i64,
+/* SDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
+/* UDIVREM_I8 */ i8_func_i8_i8,
+/* UDIVREM_I16 */ i16_i16_func_i16_i16,
+/* UDIVREM_I32 */ i32_i32_func_i32_i32,
+/* UDIVREM_I64 */ i64_i64_func_i64_i64,
+/* UDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
+/* NEG_I32 */ i32_func_i32,
+/* NEG_I64 */ i64_func_i64,
+
+// FLOATING POINT
+/* ADD_F32 */ f32_func_f32_f32,
+/* ADD_F64 */ f64_func_f64_f64,
+/* ADD_F80 */ unsupported,
+/* ADD_F128 */ func_iPTR_i64_i64_i64_i64,
+/* ADD_PPCF128 */ unsupported,
+/* SUB_F32 */ f32_func_f32_f32,
+/* SUB_F64 */ f64_func_f64_f64,
+/* SUB_F80 */ unsupported,
+/* SUB_F128 */ func_iPTR_i64_i64_i64_i64,
+/* SUB_PPCF128 */ unsupported,
+/* MUL_F32 */ f32_func_f32_f32,
+/* MUL_F64 */ f64_func_f64_f64,
+/* MUL_F80 */ unsupported,
+/* MUL_F128 */ func_iPTR_i64_i64_i64_i64,
+/* MUL_PPCF128 */ unsupported,
+/* DIV_F32 */ f32_func_f32_f32,
+/* DIV_F64 */ f64_func_f64_f64,
+/* DIV_F80 */ unsupported,
+/* DIV_F128 */ func_iPTR_i64_i64_i64_i64,
+/* DIV_PPCF128 */ unsupported,
+/* REM_F32 */ f32_func_f32_f32,
+/* REM_F64 */ f64_func_f64_f64,
+/* REM_F80 */ unsupported,
+/* REM_F128 */ func_iPTR_i64_i64_i64_i64,
+/* REM_PPCF128 */ unsupported,
+/* FMA_F32 */ f32_func_f32_f32_f32,
+/* FMA_F64 */ f64_func_f64_f64_f64,
+/* FMA_F80 */ unsupported,
+/* FMA_F128 */ func_iPTR_i64_i64_i64_i64_i64_i64,
+/* FMA_PPCF128 */ unsupported,
+/* POWI_F32 */ f32_func_f32_i32,
+/* POWI_F64 */ f64_func_f64_i32,
+/* POWI_F80 */ unsupported,
+/* POWI_F128 */ func_iPTR_i64_i64_i64_i64,
+/* POWI_PPCF128 */ unsupported,
+/* SQRT_F32 */ f32_func_f32,
+/* SQRT_F64 */ f64_func_f64,
+/* SQRT_F80 */ unsupported,
+/* SQRT_F128 */ func_iPTR_i64_i64,
+/* SQRT_PPCF128 */ unsupported,
+/* LOG_F32 */ f32_func_f32,
+/* LOG_F64 */ f64_func_f64,
+/* LOG_F80 */ unsupported,
+/* LOG_F128 */ func_iPTR_i64_i64,
+/* LOG_PPCF128 */ unsupported,
+/* LOG2_F32 */ f32_func_f32,
+/* LOG2_F64 */ f64_func_f64,
+/* LOG2_F80 */ unsupported,
+/* LOG2_F128 */ func_iPTR_i64_i64,
+/* LOG2_PPCF128 */ unsupported,
+/* LOG10_F32 */ f32_func_f32,
+/* LOG10_F64 */ f64_func_f64,
+/* LOG10_F80 */ unsupported,
+/* LOG10_F128 */ func_iPTR_i64_i64,
+/* LOG10_PPCF128 */ unsupported,
+/* EXP_F32 */ f32_func_f32,
+/* EXP_F64 */ f64_func_f64,
+/* EXP_F80 */ unsupported,
+/* EXP_F128 */ func_iPTR_i64_i64,
+/* EXP_PPCF128 */ unsupported,
+/* EXP2_F32 */ f32_func_f32,
+/* EXP2_F64 */ f64_func_f64,
+/* EXP2_F80 */ unsupported,
+/* EXP2_F128 */ func_iPTR_i64_i64,
+/* EXP2_PPCF128 */ unsupported,
+/* SIN_F32 */ f32_func_f32,
+/* SIN_F64 */ f64_func_f64,
+/* SIN_F80 */ unsupported,
+/* SIN_F128 */ func_iPTR_i64_i64,
+/* SIN_PPCF128 */ unsupported,
+/* COS_F32 */ f32_func_f32,
+/* COS_F64 */ f64_func_f64,
+/* COS_F80 */ unsupported,
+/* COS_F128 */ func_iPTR_i64_i64,
+/* COS_PPCF128 */ unsupported,
+/* SINCOS_F32 */ func_f32_iPTR_iPTR,
+/* SINCOS_F64 */ func_f64_iPTR_iPTR,
+/* SINCOS_F80 */ unsupported,
+/* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,
+/* SINCOS_PPCF128 */ unsupported,
+/* POW_F32 */ f32_func_f32_f32,
+/* POW_F64 */ f64_func_f64_f64,
+/* POW_F80 */ unsupported,
+/* POW_F128 */ func_iPTR_i64_i64_i64_i64,
+/* POW_PPCF128 */ unsupported,
+/* CEIL_F32 */ f32_func_f32,
+/* CEIL_F64 */ f64_func_f64,
+/* CEIL_F80 */ unsupported,
+/* CEIL_F128 */ func_iPTR_i64_i64,
+/* CEIL_PPCF128 */ unsupported,
+/* TRUNC_F32 */ f32_func_f32,
+/* TRUNC_F64 */ f64_func_f64,
+/* TRUNC_F80 */ unsupported,
+/* TRUNC_F128 */ func_iPTR_i64_i64,
+/* TRUNC_PPCF128 */ unsupported,
+/* RINT_F32 */ f32_func_f32,
+/* RINT_F64 */ f64_func_f64,
+/* RINT_F80 */ unsupported,
+/* RINT_F128 */ func_iPTR_i64_i64,
+/* RINT_PPCF128 */ unsupported,
+/* NEARBYINT_F32 */ f32_func_f32,
+/* NEARBYINT_F64 */ f64_func_f64,
+/* NEARBYINT_F80 */ unsupported,
+/* NEARBYINT_F128 */ func_iPTR_i64_i64,
+/* NEARBYINT_PPCF128 */ unsupported,
+/* ROUND_F32 */ f32_func_f32,
+/* ROUND_F64 */ f64_func_f64,
+/* ROUND_F80 */ unsupported,
+/* ROUND_F128 */ func_iPTR_i64_i64,
+/* ROUND_PPCF128 */ unsupported,
+/* FLOOR_F32 */ f32_func_f32,
+/* FLOOR_F64 */ f64_func_f64,
+/* FLOOR_F80 */ unsupported,
+/* FLOOR_F128 */ func_iPTR_i64_i64,
+/* FLOOR_PPCF128 */ unsupported,
+/* COPYSIGN_F32 */ f32_func_f32_f32,
+/* COPYSIGN_F64 */ f64_func_f64_f64,
+/* COPYSIGN_F80 */ unsupported,
+/* COPYSIGN_F128 */ func_iPTR_i64_i64_i64_i64,
+/* COPYSIGN_PPCF128 */ unsupported,
+/* FMIN_F32 */ f32_func_f32_f32,
+/* FMIN_F64 */ f64_func_f64_f64,
+/* FMIN_F80 */ unsupported,
+/* FMIN_F128 */ func_iPTR_i64_i64_i64_i64,
+/* FMIN_PPCF128 */ unsupported,
+/* FMAX_F32 */ f32_func_f32_f32,
+/* FMAX_F64 */ f64_func_f64_f64,
+/* FMAX_F80 */ unsupported,
+/* FMAX_F128 */ func_iPTR_i64_i64_i64_i64,
+/* FMAX_PPCF128 */ unsupported,
+
+// CONVERSION
+/* FPEXT_F32_PPCF128 */ unsupported,
+/* FPEXT_F64_PPCF128 */ unsupported,
+/* FPEXT_F64_F128 */ func_iPTR_f64,
+/* FPEXT_F32_F128 */ func_iPTR_f32,
+/* FPEXT_F32_F64 */ f64_func_f32,
+/* FPEXT_F16_F32 */ f32_func_i16,
+/* FPROUND_F32_F16 */ i16_func_f32,
+/* FPROUND_F64_F16 */ unsupported,
+/* FPROUND_F80_F16 */ unsupported,
+/* FPROUND_F128_F16 */ unsupported,
+/* FPROUND_PPCF128_F16 */ unsupported,
+/* FPROUND_F64_F32 */ f32_func_f64,
+/* FPROUND_F80_F32 */ unsupported,
+/* FPROUND_F128_F32 */ f32_func_i64_i64,
+/* FPROUND_PPCF128_F32 */ unsupported,
+/* FPROUND_F80_F64 */ unsupported,
+/* FPROUND_F128_F64 */ f64_func_i64_i64,
+/* FPROUND_PPCF128_F64 */ unsupported,
+/* FPTOSINT_F32_I32 */ i32_func_f32,
+/* FPTOSINT_F32_I64 */ i64_func_f32,
+/* FPTOSINT_F32_I128 */ i64_i64_func_f32,
+/* FPTOSINT_F64_I32 */ i32_func_f64,
+/* FPTOSINT_F64_I64 */ i64_func_f64,
+/* FPTOSINT_F64_I128 */ i64_i64_func_f64,
+/* FPTOSINT_F80_I32 */ unsupported,
+/* FPTOSINT_F80_I64 */ unsupported,
+/* FPTOSINT_F80_I128 */ unsupported,
+/* FPTOSINT_F128_I32 */ i32_func_i64_i64,
+/* FPTOSINT_F128_I64 */ i64_func_i64_i64,
+/* FPTOSINT_F128_I128 */ i64_i64_func_i64_i64,
+/* FPTOSINT_PPCF128_I32 */ unsupported,
+/* FPTOSINT_PPCF128_I64 */ unsupported,
+/* FPTOSINT_PPCF128_I128 */ unsupported,
+/* FPTOUINT_F32_I32 */ i32_func_f32,
+/* FPTOUINT_F32_I64 */ i64_func_f32,
+/* FPTOUINT_F32_I128 */ i64_i64_func_f32,
+/* FPTOUINT_F64_I32 */ i32_func_f64,
+/* FPTOUINT_F64_I64 */ i64_func_f64,
+/* FPTOUINT_F64_I128 */ i64_i64_func_f64,
+/* FPTOUINT_F80_I32 */ unsupported,
+/* FPTOUINT_F80_I64 */ unsupported,
+/* FPTOUINT_F80_I128 */ unsupported,
+/* FPTOUINT_F128_I32 */ i32_func_i64_i64,
+/* FPTOUINT_F128_I64 */ i64_func_i64_i64,
+/* FPTOUINT_F128_I128 */ i64_i64_func_i64_i64,
+/* FPTOUINT_PPCF128_I32 */ unsupported,
+/* FPTOUINT_PPCF128_I64 */ unsupported,
+/* FPTOUINT_PPCF128_I128 */ unsupported,
+/* SINTTOFP_I32_F32 */ f32_func_i32,
+/* SINTTOFP_I32_F64 */ f64_func_i32,
+/* SINTTOFP_I32_F80 */ unsupported,
+/* SINTTOFP_I32_F128 */ func_iPTR_i32,
+/* SINTTOFP_I32_PPCF128 */ unsupported,
+/* SINTTOFP_I64_F32 */ f32_func_i64,
+/* SINTTOFP_I64_F64 */ f64_func_i64,
+/* SINTTOFP_I64_F80 */ unsupported,
+/* SINTTOFP_I64_F128 */ func_iPTR_i64,
+/* SINTTOFP_I64_PPCF128 */ unsupported,
+/* SINTTOFP_I128_F32 */ f32_func_i64_i64,
+/* SINTTOFP_I128_F64 */ f64_func_i64_i64,
+/* SINTTOFP_I128_F80 */ unsupported,
+/* SINTTOFP_I128_F128 */ func_iPTR_i64_i64,
+/* SINTTOFP_I128_PPCF128 */ unsupported,
+/* UINTTOFP_I32_F32 */ f32_func_i32,
+/* UINTTOFP_I32_F64 */ f64_func_i32,
+/* UINTTOFP_I32_F80 */ unsupported,
+/* UINTTOFP_I32_F128 */ func_iPTR_i32,
+/* UINTTOFP_I32_PPCF128 */ unsupported,
+/* UINTTOFP_I64_F32 */ f32_func_i64,
+/* UINTTOFP_I64_F64 */ f64_func_i64,
+/* UINTTOFP_I64_F80 */ unsupported,
+/* UINTTOFP_I64_F128 */ func_iPTR_i64,
+/* UINTTOFP_I64_PPCF128 */ unsupported,
+/* UINTTOFP_I128_F32 */ f32_func_i64_i64,
+/* UINTTOFP_I128_F64 */ f64_func_i64_i64,
+/* UINTTOFP_I128_F80 */ unsupported,
+/* UINTTOFP_I128_F128 */ func_iPTR_i64_i64,
+/* UINTTOFP_I128_PPCF128 */ unsupported,
+
+// COMPARISON
+/* OEQ_F32 */ i32_func_f32_f32,
+/* OEQ_F64 */ i32_func_f64_f64,
+/* OEQ_F128 */ i32_func_i64_i64_i64_i64,
+/* OEQ_PPCF128 */ unsupported,
+/* UNE_F32 */ i32_func_f32_f32,
+/* UNE_F64 */ i32_func_f64_f64,
+/* UNE_F128 */ i32_func_i64_i64_i64_i64,
+/* UNE_PPCF128 */ unsupported,
+/* OGE_F32 */ i32_func_f32_f32,
+/* OGE_F64 */ i32_func_f64_f64,
+/* OGE_F128 */ i32_func_i64_i64_i64_i64,
+/* OGE_PPCF128 */ unsupported,
+/* OLT_F32 */ i32_func_f32_f32,
+/* OLT_F64 */ i32_func_f64_f64,
+/* OLT_F128 */ i32_func_i64_i64_i64_i64,
+/* OLT_PPCF128 */ unsupported,
+/* OLE_F32 */ i32_func_f32_f32,
+/* OLE_F64 */ i32_func_f64_f64,
+/* OLE_F128 */ i32_func_i64_i64_i64_i64,
+/* OLE_PPCF128 */ unsupported,
+/* OGT_F32 */ i32_func_f32_f32,
+/* OGT_F64 */ i32_func_f64_f64,
+/* OGT_F128 */ i32_func_i64_i64_i64_i64,
+/* OGT_PPCF128 */ unsupported,
+/* UO_F32 */ i32_func_f32_f32,
+/* UO_F64 */ i32_func_f64_f64,
+/* UO_F128 */ i32_func_i64_i64_i64_i64,
+/* UO_PPCF128 */ unsupported,
+/* O_F32 */ i32_func_f32_f32,
+/* O_F64 */ i32_func_f64_f64,
+/* O_F128 */ i32_func_i64_i64_i64_i64,
+/* O_PPCF128 */ unsupported,
+
+// MEMORY
+/* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
+
+// ELEMENT-WISE ATOMIC MEMORY
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
+
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported,
+
+// EXCEPTION HANDLING
+/* UNWIND_RESUME */ unsupported,
+
+// Note: there are two sets of atomic libcalls; see
+// <http://llvm.org/docs/Atomics.html> for more information on the
+// difference between them.
+
+// Atomic '__sync_*' libcalls.
+/* SYNC_VAL_COMPARE_AND_SWAP_1 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_2 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_4 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_8 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_16 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_1 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_2 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_4 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_8 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_16 */ unsupported,
+/* SYNC_FETCH_AND_ADD_1 */ unsupported,
+/* SYNC_FETCH_AND_ADD_2 */ unsupported,
+/* SYNC_FETCH_AND_ADD_4 */ unsupported,
+/* SYNC_FETCH_AND_ADD_8 */ unsupported,
+/* SYNC_FETCH_AND_ADD_16 */ unsupported,
+/* SYNC_FETCH_AND_SUB_1 */ unsupported,
+/* SYNC_FETCH_AND_SUB_2 */ unsupported,
+/* SYNC_FETCH_AND_SUB_4 */ unsupported,
+/* SYNC_FETCH_AND_SUB_8 */ unsupported,
+/* SYNC_FETCH_AND_SUB_16 */ unsupported,
+/* SYNC_FETCH_AND_AND_1 */ unsupported,
+/* SYNC_FETCH_AND_AND_2 */ unsupported,
+/* SYNC_FETCH_AND_AND_4 */ unsupported,
+/* SYNC_FETCH_AND_AND_8 */ unsupported,
+/* SYNC_FETCH_AND_AND_16 */ unsupported,
+/* SYNC_FETCH_AND_OR_1 */ unsupported,
+/* SYNC_FETCH_AND_OR_2 */ unsupported,
+/* SYNC_FETCH_AND_OR_4 */ unsupported,
+/* SYNC_FETCH_AND_OR_8 */ unsupported,
+/* SYNC_FETCH_AND_OR_16 */ unsupported,
+/* SYNC_FETCH_AND_XOR_1 */ unsupported,
+/* SYNC_FETCH_AND_XOR_2 */ unsupported,
+/* SYNC_FETCH_AND_XOR_4 */ unsupported,
+/* SYNC_FETCH_AND_XOR_8 */ unsupported,
+/* SYNC_FETCH_AND_XOR_16 */ unsupported,
+/* SYNC_FETCH_AND_NAND_1 */ unsupported,
+/* SYNC_FETCH_AND_NAND_2 */ unsupported,
+/* SYNC_FETCH_AND_NAND_4 */ unsupported,
+/* SYNC_FETCH_AND_NAND_8 */ unsupported,
+/* SYNC_FETCH_AND_NAND_16 */ unsupported,
+/* SYNC_FETCH_AND_MAX_1 */ unsupported,
+/* SYNC_FETCH_AND_MAX_2 */ unsupported,
+/* SYNC_FETCH_AND_MAX_4 */ unsupported,
+/* SYNC_FETCH_AND_MAX_8 */ unsupported,
+/* SYNC_FETCH_AND_MAX_16 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_1 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_2 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_4 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_8 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_16 */ unsupported,
+/* SYNC_FETCH_AND_MIN_1 */ unsupported,
+/* SYNC_FETCH_AND_MIN_2 */ unsupported,
+/* SYNC_FETCH_AND_MIN_4 */ unsupported,
+/* SYNC_FETCH_AND_MIN_8 */ unsupported,
+/* SYNC_FETCH_AND_MIN_16 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_1 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_2 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_4 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_8 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_16 */ unsupported,
+
+// Atomic '__atomic_*' libcalls.
+/* ATOMIC_LOAD */ unsupported,
+/* ATOMIC_LOAD_1 */ unsupported,
+/* ATOMIC_LOAD_2 */ unsupported,
+/* ATOMIC_LOAD_4 */ unsupported,
+/* ATOMIC_LOAD_8 */ unsupported,
+/* ATOMIC_LOAD_16 */ unsupported,
+
+/* ATOMIC_STORE */ unsupported,
+/* ATOMIC_STORE_1 */ unsupported,
+/* ATOMIC_STORE_2 */ unsupported,
+/* ATOMIC_STORE_4 */ unsupported,
+/* ATOMIC_STORE_8 */ unsupported,
+/* ATOMIC_STORE_16 */ unsupported,
+
+/* ATOMIC_EXCHANGE */ unsupported,
+/* ATOMIC_EXCHANGE_1 */ unsupported,
+/* ATOMIC_EXCHANGE_2 */ unsupported,
+/* ATOMIC_EXCHANGE_4 */ unsupported,
+/* ATOMIC_EXCHANGE_8 */ unsupported,
+/* ATOMIC_EXCHANGE_16 */ unsupported,
+
+/* ATOMIC_COMPARE_EXCHANGE */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_1 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_2 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_4 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_8 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_16 */ unsupported,
+
+/* ATOMIC_FETCH_ADD_1 */ unsupported,
+/* ATOMIC_FETCH_ADD_2 */ unsupported,
+/* ATOMIC_FETCH_ADD_4 */ unsupported,
+/* ATOMIC_FETCH_ADD_8 */ unsupported,
+/* ATOMIC_FETCH_ADD_16 */ unsupported,
+
+/* ATOMIC_FETCH_SUB_1 */ unsupported,
+/* ATOMIC_FETCH_SUB_2 */ unsupported,
+/* ATOMIC_FETCH_SUB_4 */ unsupported,
+/* ATOMIC_FETCH_SUB_8 */ unsupported,
+/* ATOMIC_FETCH_SUB_16 */ unsupported,
+
+/* ATOMIC_FETCH_AND_1 */ unsupported,
+/* ATOMIC_FETCH_AND_2 */ unsupported,
+/* ATOMIC_FETCH_AND_4 */ unsupported,
+/* ATOMIC_FETCH_AND_8 */ unsupported,
+/* ATOMIC_FETCH_AND_16 */ unsupported,
+
+/* ATOMIC_FETCH_OR_1 */ unsupported,
+/* ATOMIC_FETCH_OR_2 */ unsupported,
+/* ATOMIC_FETCH_OR_4 */ unsupported,
+/* ATOMIC_FETCH_OR_8 */ unsupported,
+/* ATOMIC_FETCH_OR_16 */ unsupported,
+
+/* ATOMIC_FETCH_XOR_1 */ unsupported,
+/* ATOMIC_FETCH_XOR_2 */ unsupported,
+/* ATOMIC_FETCH_XOR_4 */ unsupported,
+/* ATOMIC_FETCH_XOR_8 */ unsupported,
+/* ATOMIC_FETCH_XOR_16 */ unsupported,
+
+/* ATOMIC_FETCH_NAND_1 */ unsupported,
+/* ATOMIC_FETCH_NAND_2 */ unsupported,
+/* ATOMIC_FETCH_NAND_4 */ unsupported,
+/* ATOMIC_FETCH_NAND_8 */ unsupported,
+/* ATOMIC_FETCH_NAND_16 */ unsupported,
+
+// Stack Protector Fail.
+/* STACKPROTECTOR_CHECK_FAIL */ func,
+
+// Deoptimization.
+/* DEOPTIMIZE */ unsupported,
+
+};
+
+static const char *
+RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = {
+/* SHL_I16 */ "__ashlhi3",
+/* SHL_I32 */ "__ashlsi3",
+/* SHL_I64 */ "__ashldi3",
+/* SHL_I128 */ "__ashlti3",
+/* SRL_I16 */ "__lshrhi3",
+/* SRL_I32 */ "__lshrsi3",
+/* SRL_I64 */ "__lshrdi3",
+/* SRL_I128 */ "__lshrti3",
+/* SRA_I16 */ "__ashrhi3",
+/* SRA_I32 */ "__ashrsi3",
+/* SRA_I64 */ "__ashrdi3",
+/* SRA_I128 */ "__ashrti3",
+/* MUL_I8 */ "__mulqi3",
+/* MUL_I16 */ "__mulhi3",
+/* MUL_I32 */ "__mulsi3",
+/* MUL_I64 */ "__muldi3",
+/* MUL_I128 */ "__multi3",
+/* MULO_I32 */ "__mulosi4",
+/* MULO_I64 */ "__mulodi4",
+/* MULO_I128 */ "__muloti4",
+/* SDIV_I8 */ "__divqi3",
+/* SDIV_I16 */ "__divhi3",
+/* SDIV_I32 */ "__divsi3",
+/* SDIV_I64 */ "__divdi3",
+/* SDIV_I128 */ "__divti3",
+/* UDIV_I8 */ "__udivqi3",
+/* UDIV_I16 */ "__udivhi3",
+/* UDIV_I32 */ "__udivsi3",
+/* UDIV_I64 */ "__udivdi3",
+/* UDIV_I128 */ "__udivti3",
+/* SREM_I8 */ "__modqi3",
+/* SREM_I16 */ "__modhi3",
+/* SREM_I32 */ "__modsi3",
+/* SREM_I64 */ "__moddi3",
+/* SREM_I128 */ "__modti3",
+/* UREM_I8 */ "__umodqi3",
+/* UREM_I16 */ "__umodhi3",
+/* UREM_I32 */ "__umodsi3",
+/* UREM_I64 */ "__umoddi3",
+/* UREM_I128 */ "__umodti3",
+/* SDIVREM_I8 */ nullptr,
+/* SDIVREM_I16 */ nullptr,
+/* SDIVREM_I32 */ nullptr,
+/* SDIVREM_I64 */ nullptr,
+/* SDIVREM_I128 */ nullptr,
+/* UDIVREM_I8 */ nullptr,
+/* UDIVREM_I16 */ nullptr,
+/* UDIVREM_I32 */ nullptr,
+/* UDIVREM_I64 */ nullptr,
+/* UDIVREM_I128 */ nullptr,
+/* NEG_I32 */ "__negsi2",
+/* NEG_I64 */ "__negdi2",
+/* ADD_F32 */ "__addsf3",
+/* ADD_F64 */ "__adddf3",
+/* ADD_F80 */ nullptr,
+/* ADD_F128 */ "__addtf3",
+/* ADD_PPCF128 */ nullptr,
+/* SUB_F32 */ "__subsf3",
+/* SUB_F64 */ "__subdf3",
+/* SUB_F80 */ nullptr,
+/* SUB_F128 */ "__subtf3",
+/* SUB_PPCF128 */ nullptr,
+/* MUL_F32 */ "__mulsf3",
+/* MUL_F64 */ "__muldf3",
+/* MUL_F80 */ nullptr,
+/* MUL_F128 */ "__multf3",
+/* MUL_PPCF128 */ nullptr,
+/* DIV_F32 */ "__divsf3",
+/* DIV_F64 */ "__divdf3",
+/* DIV_F80 */ nullptr,
+/* DIV_F128 */ "__divtf3",
+/* DIV_PPCF128 */ nullptr,
+/* REM_F32 */ "fmodf",
+/* REM_F64 */ "fmod",
+/* REM_F80 */ nullptr,
+/* REM_F128 */ "fmodl",
+/* REM_PPCF128 */ nullptr,
+/* FMA_F32 */ "fmaf",
+/* FMA_F64 */ "fma",
+/* FMA_F80 */ nullptr,
+/* FMA_F128 */ "fmal",
+/* FMA_PPCF128 */ nullptr,
+/* POWI_F32 */ "__powisf2",
+/* POWI_F64 */ "__powidf2",
+/* POWI_F80 */ nullptr,
+/* POWI_F128 */ "__powitf2",
+/* POWI_PPCF128 */ nullptr,
+/* SQRT_F32 */ "sqrtf",
+/* SQRT_F64 */ "sqrt",
+/* SQRT_F80 */ nullptr,
+/* SQRT_F128 */ "sqrtl",
+/* SQRT_PPCF128 */ nullptr,
+/* LOG_F32 */ "logf",
+/* LOG_F64 */ "log",
+/* LOG_F80 */ nullptr,
+/* LOG_F128 */ "logl",
+/* LOG_PPCF128 */ nullptr,
+/* LOG2_F32 */ "log2f",
+/* LOG2_F64 */ "log2",
+/* LOG2_F80 */ nullptr,
+/* LOG2_F128 */ "log2l",
+/* LOG2_PPCF128 */ nullptr,
+/* LOG10_F32 */ "log10f",
+/* LOG10_F64 */ "log10",
+/* LOG10_F80 */ nullptr,
+/* LOG10_F128 */ "log10l",
+/* LOG10_PPCF128 */ nullptr,
+/* EXP_F32 */ "expf",
+/* EXP_F64 */ "exp",
+/* EXP_F80 */ nullptr,
+/* EXP_F128 */ "expl",
+/* EXP_PPCF128 */ nullptr,
+/* EXP2_F32 */ "exp2f",
+/* EXP2_F64 */ "exp2",
+/* EXP2_F80 */ nullptr,
+/* EXP2_F128 */ "exp2l",
+/* EXP2_PPCF128 */ nullptr,
+/* SIN_F32 */ "sinf",
+/* SIN_F64 */ "sin",
+/* SIN_F80 */ nullptr,
+/* SIN_F128 */ "sinl",
+/* SIN_PPCF128 */ nullptr,
+/* COS_F32 */ "cosf",
+/* COS_F64 */ "cos",
+/* COS_F80 */ nullptr,
+/* COS_F128 */ "cosl",
+/* COS_PPCF128 */ nullptr,
+/* SINCOS_F32 */ "sincosf",
+/* SINCOS_F64 */ "sincos",
+/* SINCOS_F80 */ nullptr,
+/* SINCOS_F128 */ "sincosl",
+/* SINCOS_PPCF128 */ nullptr,
+/* POW_F32 */ "powf",
+/* POW_F64 */ "pow",
+/* POW_F80 */ nullptr,
+/* POW_F128 */ "powl",
+/* POW_PPCF128 */ nullptr,
+/* CEIL_F32 */ "ceilf",
+/* CEIL_F64 */ "ceil",
+/* CEIL_F80 */ nullptr,
+/* CEIL_F128 */ "ceill",
+/* CEIL_PPCF128 */ nullptr,
+/* TRUNC_F32 */ "truncf",
+/* TRUNC_F64 */ "trunc",
+/* TRUNC_F80 */ nullptr,
+/* TRUNC_F128 */ "truncl",
+/* TRUNC_PPCF128 */ nullptr,
+/* RINT_F32 */ "rintf",
+/* RINT_F64 */ "rint",
+/* RINT_F80 */ nullptr,
+/* RINT_F128 */ "rintl",
+/* RINT_PPCF128 */ nullptr,
+/* NEARBYINT_F32 */ "nearbyintf",
+/* NEARBYINT_F64 */ "nearbyint",
+/* NEARBYINT_F80 */ nullptr,
+/* NEARBYINT_F128 */ "nearbyintl",
+/* NEARBYINT_PPCF128 */ nullptr,
+/* ROUND_F32 */ "roundf",
+/* ROUND_F64 */ "round",
+/* ROUND_F80 */ nullptr,
+/* ROUND_F128 */ "roundl",
+/* ROUND_PPCF128 */ nullptr,
+/* FLOOR_F32 */ "floorf",
+/* FLOOR_F64 */ "floor",
+/* FLOOR_F80 */ nullptr,
+/* FLOOR_F128 */ "floorl",
+/* FLOOR_PPCF128 */ nullptr,
+/* COPYSIGN_F32 */ "copysignf",
+/* COPYSIGN_F64 */ "copysign",
+/* COPYSIGN_F80 */ nullptr,
+/* COPYSIGN_F128 */ "copysignl",
+/* COPYSIGN_PPCF128 */ nullptr,
+/* FMIN_F32 */ "fminf",
+/* FMIN_F64 */ "fmin",
+/* FMIN_F80 */ nullptr,
+/* FMIN_F128 */ "fminl",
+/* FMIN_PPCF128 */ nullptr,
+/* FMAX_F32 */ "fmaxf",
+/* FMAX_F64 */ "fmax",
+/* FMAX_F80 */ nullptr,
+/* FMAX_F128 */ "fmaxl",
+/* FMAX_PPCF128 */ nullptr,
+/* FPEXT_F32_PPCF128 */ nullptr,
+/* FPEXT_F64_PPCF128 */ nullptr,
+/* FPEXT_F64_F128 */ "__extenddftf2",
+/* FPEXT_F32_F128 */ "__extendsftf2",
+/* FPEXT_F32_F64 */ "__extendsfdf2",
+/* FPEXT_F16_F32 */ "__gnu_h2f_ieee",
+/* FPROUND_F32_F16 */ "__gnu_f2h_ieee",
+/* FPROUND_F64_F16 */ nullptr,
+/* FPROUND_F80_F16 */ nullptr,
+/* FPROUND_F128_F16 */ nullptr,
+/* FPROUND_PPCF128_F16 */ nullptr,
+/* FPROUND_F64_F32 */ "__truncdfsf2",
+/* FPROUND_F80_F32 */ "__truncxfsf2",
+/* FPROUND_F128_F32 */ "__trunctfsf2",
+/* FPROUND_PPCF128_F32 */ nullptr,
+/* FPROUND_F80_F64 */ "__truncxfdf2",
+/* FPROUND_F128_F64 */ "__trunctfdf2",
+/* FPROUND_PPCF128_F64 */ nullptr,
+/* FPTOSINT_F32_I32 */ "__fixsfsi",
+/* FPTOSINT_F32_I64 */ "__fixsfdi",
+/* FPTOSINT_F32_I128 */ "__fixsfti",
+/* FPTOSINT_F64_I32 */ "__fixdfsi",
+/* FPTOSINT_F64_I64 */ "__fixdfdi",
+/* FPTOSINT_F64_I128 */ "__fixdfti",
+/* FPTOSINT_F80_I32 */ "__fixxfsi",
+/* FPTOSINT_F80_I64 */ "__fixxfdi",
+/* FPTOSINT_F80_I128 */ "__fixxfti",
+/* FPTOSINT_F128_I32 */ "__fixtfsi",
+/* FPTOSINT_F128_I64 */ "__fixtfdi",
+/* FPTOSINT_F128_I128 */ "__fixtfti",
+/* FPTOSINT_PPCF128_I32 */ nullptr,
+/* FPTOSINT_PPCF128_I64 */ nullptr,
+/* FPTOSINT_PPCF128_I128 */ nullptr,
+/* FPTOUINT_F32_I32 */ "__fixunssfsi",
+/* FPTOUINT_F32_I64 */ "__fixunssfdi",
+/* FPTOUINT_F32_I128 */ "__fixunssfti",
+/* FPTOUINT_F64_I32 */ "__fixunsdfsi",
+/* FPTOUINT_F64_I64 */ "__fixunsdfdi",
+/* FPTOUINT_F64_I128 */ "__fixunsdfti",
+/* FPTOUINT_F80_I32 */ "__fixunsxfsi",
+/* FPTOUINT_F80_I64 */ "__fixunsxfdi",
+/* FPTOUINT_F80_I128 */ "__fixunsxfti",
+/* FPTOUINT_F128_I32 */ "__fixunstfsi",
+/* FPTOUINT_F128_I64 */ "__fixunstfdi",
+/* FPTOUINT_F128_I128 */ "__fixunstfti",
+/* FPTOUINT_PPCF128_I32 */ nullptr,
+/* FPTOUINT_PPCF128_I64 */ nullptr,
+/* FPTOUINT_PPCF128_I128 */ nullptr,
+/* SINTTOFP_I32_F32 */ "__floatsisf",
+/* SINTTOFP_I32_F64 */ "__floatsidf",
+/* SINTTOFP_I32_F80 */ nullptr,
+/* SINTTOFP_I32_F128 */ "__floatsitf",
+/* SINTTOFP_I32_PPCF128 */ nullptr,
+/* SINTTOFP_I64_F32 */ "__floatdisf",
+/* SINTTOFP_I64_F64 */ "__floatdidf",
+/* SINTTOFP_I64_F80 */ nullptr,
+/* SINTTOFP_I64_F128 */ "__floatditf",
+/* SINTTOFP_I64_PPCF128 */ nullptr,
+/* SINTTOFP_I128_F32 */ "__floattisf",
+/* SINTTOFP_I128_F64 */ "__floattidf",
+/* SINTTOFP_I128_F80 */ nullptr,
+/* SINTTOFP_I128_F128 */ "__floattitf",
+/* SINTTOFP_I128_PPCF128 */ nullptr,
+/* UINTTOFP_I32_F32 */ "__floatunsisf",
+/* UINTTOFP_I32_F64 */ "__floatunsidf",
+/* UINTTOFP_I32_F80 */ nullptr,
+/* UINTTOFP_I32_F128 */ "__floatunsitf",
+/* UINTTOFP_I32_PPCF128 */ nullptr,
+/* UINTTOFP_I64_F32 */ "__floatundisf",
+/* UINTTOFP_I64_F64 */ "__floatundidf",
+/* UINTTOFP_I64_F80 */ nullptr,
+/* UINTTOFP_I64_F128 */ "__floatunditf",
+/* UINTTOFP_I64_PPCF128 */ nullptr,
+/* UINTTOFP_I128_F32 */ "__floatuntisf",
+/* UINTTOFP_I128_F64 */ "__floatuntidf",
+/* UINTTOFP_I128_F80 */ nullptr,
+/* UINTTOFP_I128_F128 */ "__floatuntitf",
+/* UINTTOFP_I128_PPCF128 */ nullptr,
+/* OEQ_F32 */ "__eqsf2",
+/* OEQ_F64 */ "__eqdf2",
+/* OEQ_F128 */ "__eqtf2",
+/* OEQ_PPCF128 */ nullptr,
+/* UNE_F32 */ "__nesf2",
+/* UNE_F64 */ "__nedf2",
+/* UNE_F128 */ "__netf2",
+/* UNE_PPCF128 */ nullptr,
+/* OGE_F32 */ "__gesf2",
+/* OGE_F64 */ "__gedf2",
+/* OGE_F128 */ "__getf2",
+/* OGE_PPCF128 */ nullptr,
+/* OLT_F32 */ "__ltsf2",
+/* OLT_F64 */ "__ltdf2",
+/* OLT_F128 */ "__lttf2",
+/* OLT_PPCF128 */ nullptr,
+/* OLE_F32 */ "__lesf2",
+/* OLE_F64 */ "__ledf2",
+/* OLE_F128 */ "__letf2",
+/* OLE_PPCF128 */ nullptr,
+/* OGT_F32 */ "__gtsf2",
+/* OGT_F64 */ "__gtdf2",
+/* OGT_F128 */ "__gttf2",
+/* OGT_PPCF128 */ nullptr,
+/* UO_F32 */ "__unordsf2",
+/* UO_F64 */ "__unorddf2",
+/* UO_F128 */ "__unordtf2",
+/* UO_PPCF128 */ nullptr,
+/* O_F32 */ "__unordsf2",
+/* O_F64 */ "__unorddf2",
+/* O_F128 */ "__unordtf2",
+/* O_PPCF128 */ nullptr,
+/* MEMCPY */ "memcpy",
+/* MEMMOVE */ "memset",
+/* MEMSET */ "memmove",
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
+/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
+/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr,
+/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr,
+/* UNWIND_RESUME */ "_Unwind_Resume",
+/* SYNC_VAL_COMPARE_AND_SWAP_1 */ "__sync_val_compare_and_swap_1",
+/* SYNC_VAL_COMPARE_AND_SWAP_2 */ "__sync_val_compare_and_swap_2",
+/* SYNC_VAL_COMPARE_AND_SWAP_4 */ "__sync_val_compare_and_swap_4",
+/* SYNC_VAL_COMPARE_AND_SWAP_8 */ "__sync_val_compare_and_swap_8",
+/* SYNC_VAL_COMPARE_AND_SWAP_16 */ "__sync_val_compare_and_swap_16",
+/* SYNC_LOCK_TEST_AND_SET_1 */ "__sync_lock_test_and_set_1",
+/* SYNC_LOCK_TEST_AND_SET_2 */ "__sync_lock_test_and_set_2",
+/* SYNC_LOCK_TEST_AND_SET_4 */ "__sync_lock_test_and_set_4",
+/* SYNC_LOCK_TEST_AND_SET_8 */ "__sync_lock_test_and_set_8",
+/* SYNC_LOCK_TEST_AND_SET_16 */ "__sync_lock_test_and_set_16",
+/* SYNC_FETCH_AND_ADD_1 */ "__sync_fetch_and_add_1",
+/* SYNC_FETCH_AND_ADD_2 */ "__sync_fetch_and_add_2",
+/* SYNC_FETCH_AND_ADD_4 */ "__sync_fetch_and_add_4",
+/* SYNC_FETCH_AND_ADD_8 */ "__sync_fetch_and_add_8",
+/* SYNC_FETCH_AND_ADD_16 */ "__sync_fetch_and_add_16",
+/* SYNC_FETCH_AND_SUB_1 */ "__sync_fetch_and_sub_1",
+/* SYNC_FETCH_AND_SUB_2 */ "__sync_fetch_and_sub_2",
+/* SYNC_FETCH_AND_SUB_4 */ "__sync_fetch_and_sub_4",
+/* SYNC_FETCH_AND_SUB_8 */ "__sync_fetch_and_sub_8",
+/* SYNC_FETCH_AND_SUB_16 */ "__sync_fetch_and_sub_16",
+/* SYNC_FETCH_AND_AND_1 */ "__sync_fetch_and_and_1",
+/* SYNC_FETCH_AND_AND_2 */ "__sync_fetch_and_and_2",
+/* SYNC_FETCH_AND_AND_4 */ "__sync_fetch_and_and_4",
+/* SYNC_FETCH_AND_AND_8 */ "__sync_fetch_and_and_8",
+/* SYNC_FETCH_AND_AND_16 */ "__sync_fetch_and_and_16",
+/* SYNC_FETCH_AND_OR_1 */ "__sync_fetch_and_or_1",
+/* SYNC_FETCH_AND_OR_2 */ "__sync_fetch_and_or_2",
+/* SYNC_FETCH_AND_OR_4 */ "__sync_fetch_and_or_4",
+/* SYNC_FETCH_AND_OR_8 */ "__sync_fetch_and_or_8",
+/* SYNC_FETCH_AND_OR_16 */ "__sync_fetch_and_or_16",
+/* SYNC_FETCH_AND_XOR_1 */ "__sync_fetch_and_xor_1",
+/* SYNC_FETCH_AND_XOR_2 */ "__sync_fetch_and_xor_2",
+/* SYNC_FETCH_AND_XOR_4 */ "__sync_fetch_and_xor_4",
+/* SYNC_FETCH_AND_XOR_8 */ "__sync_fetch_and_xor_8",
+/* SYNC_FETCH_AND_XOR_16 */ "__sync_fetch_and_xor_16",
+/* SYNC_FETCH_AND_NAND_1 */ "__sync_fetch_and_nand_1",
+/* SYNC_FETCH_AND_NAND_2 */ "__sync_fetch_and_nand_2",
+/* SYNC_FETCH_AND_NAND_4 */ "__sync_fetch_and_nand_4",
+/* SYNC_FETCH_AND_NAND_8 */ "__sync_fetch_and_nand_8",
+/* SYNC_FETCH_AND_NAND_16 */ "__sync_fetch_and_nand_16",
+/* SYNC_FETCH_AND_MAX_1 */ "__sync_fetch_and_max_1",
+/* SYNC_FETCH_AND_MAX_2 */ "__sync_fetch_and_max_2",
+/* SYNC_FETCH_AND_MAX_4 */ "__sync_fetch_and_max_4",
+/* SYNC_FETCH_AND_MAX_8 */ "__sync_fetch_and_max_8",
+/* SYNC_FETCH_AND_MAX_16 */ "__sync_fetch_and_max_16",
+/* SYNC_FETCH_AND_UMAX_1 */ "__sync_fetch_and_umax_1",
+/* SYNC_FETCH_AND_UMAX_2 */ "__sync_fetch_and_umax_2",
+/* SYNC_FETCH_AND_UMAX_4 */ "__sync_fetch_and_umax_4",
+/* SYNC_FETCH_AND_UMAX_8 */ "__sync_fetch_and_umax_8",
+/* SYNC_FETCH_AND_UMAX_16 */ "__sync_fetch_and_umax_16",
+/* SYNC_FETCH_AND_MIN_1 */ "__sync_fetch_and_min_1",
+/* SYNC_FETCH_AND_MIN_2 */ "__sync_fetch_and_min_2",
+/* SYNC_FETCH_AND_MIN_4 */ "__sync_fetch_and_min_4",
+/* SYNC_FETCH_AND_MIN_8 */ "__sync_fetch_and_min_8",
+/* SYNC_FETCH_AND_MIN_16 */ "__sync_fetch_and_min_16",
+/* SYNC_FETCH_AND_UMIN_1 */ "__sync_fetch_and_umin_1",
+/* SYNC_FETCH_AND_UMIN_2 */ "__sync_fetch_and_umin_2",
+/* SYNC_FETCH_AND_UMIN_4 */ "__sync_fetch_and_umin_4",
+/* SYNC_FETCH_AND_UMIN_8 */ "__sync_fetch_and_umin_8",
+/* SYNC_FETCH_AND_UMIN_16 */ "__sync_fetch_and_umin_16",
+
+/* ATOMIC_LOAD */ "__atomic_load",
+/* ATOMIC_LOAD_1 */ "__atomic_load_1",
+/* ATOMIC_LOAD_2 */ "__atomic_load_2",
+/* ATOMIC_LOAD_4 */ "__atomic_load_4",
+/* ATOMIC_LOAD_8 */ "__atomic_load_8",
+/* ATOMIC_LOAD_16 */ "__atomic_load_16",
+
+/* ATOMIC_STORE */ "__atomic_store",
+/* ATOMIC_STORE_1 */ "__atomic_store_1",
+/* ATOMIC_STORE_2 */ "__atomic_store_2",
+/* ATOMIC_STORE_4 */ "__atomic_store_4",
+/* ATOMIC_STORE_8 */ "__atomic_store_8",
+/* ATOMIC_STORE_16 */ "__atomic_store_16",
+
+/* ATOMIC_EXCHANGE */ "__atomic_exchange",
+/* ATOMIC_EXCHANGE_1 */ "__atomic_exchange_1",
+/* ATOMIC_EXCHANGE_2 */ "__atomic_exchange_2",
+/* ATOMIC_EXCHANGE_4 */ "__atomic_exchange_4",
+/* ATOMIC_EXCHANGE_8 */ "__atomic_exchange_8",
+/* ATOMIC_EXCHANGE_16 */ "__atomic_exchange_16",
+
+/* ATOMIC_COMPARE_EXCHANGE */ "__atomic_compare_exchange",
+/* ATOMIC_COMPARE_EXCHANGE_1 */ "__atomic_compare_exchange_1",
+/* ATOMIC_COMPARE_EXCHANGE_2 */ "__atomic_compare_exchange_2",
+/* ATOMIC_COMPARE_EXCHANGE_4 */ "__atomic_compare_exchange_4",
+/* ATOMIC_COMPARE_EXCHANGE_8 */ "__atomic_compare_exchange_8",
+/* ATOMIC_COMPARE_EXCHANGE_16 */ "__atomic_compare_exchange_16",
+
+/* ATOMIC_FETCH_ADD_1 */ "__atomic_fetch_add_1",
+/* ATOMIC_FETCH_ADD_2 */ "__atomic_fetch_add_2",
+/* ATOMIC_FETCH_ADD_4 */ "__atomic_fetch_add_4",
+/* ATOMIC_FETCH_ADD_8 */ "__atomic_fetch_add_8",
+/* ATOMIC_FETCH_ADD_16 */ "__atomic_fetch_add_16",
+/* ATOMIC_FETCH_SUB_1 */ "__atomic_fetch_sub_1",
+/* ATOMIC_FETCH_SUB_2 */ "__atomic_fetch_sub_2",
+/* ATOMIC_FETCH_SUB_4 */ "__atomic_fetch_sub_4",
+/* ATOMIC_FETCH_SUB_8 */ "__atomic_fetch_sub_8",
+/* ATOMIC_FETCH_SUB_16 */ "__atomic_fetch_sub_16",
+/* ATOMIC_FETCH_AND_1 */ "__atomic_fetch_and_1",
+/* ATOMIC_FETCH_AND_2 */ "__atomic_fetch_and_2",
+/* ATOMIC_FETCH_AND_4 */ "__atomic_fetch_and_4",
+/* ATOMIC_FETCH_AND_8 */ "__atomic_fetch_and_8",
+/* ATOMIC_FETCH_AND_16 */ "__atomic_fetch_and_16",
+/* ATOMIC_FETCH_OR_1 */ "__atomic_fetch_or_1",
+/* ATOMIC_FETCH_OR_2 */ "__atomic_fetch_or_2",
+/* ATOMIC_FETCH_OR_4 */ "__atomic_fetch_or_4",
+/* ATOMIC_FETCH_OR_8 */ "__atomic_fetch_or_8",
+/* ATOMIC_FETCH_OR_16 */ "__atomic_fetch_or_16",
+/* ATOMIC_FETCH_XOR_1 */ "__atomic_fetch_xor_1",
+/* ATOMIC_FETCH_XOR_2 */ "__atomic_fetch_xor_2",
+/* ATOMIC_FETCH_XOR_4 */ "__atomic_fetch_xor_4",
+/* ATOMIC_FETCH_XOR_8 */ "__atomic_fetch_xor_8",
+/* ATOMIC_FETCH_XOR_16 */ "__atomic_fetch_xor_16",
+/* ATOMIC_FETCH_NAND_1 */ "__atomic_fetch_nand_1",
+/* ATOMIC_FETCH_NAND_2 */ "__atomic_fetch_nand_2",
+/* ATOMIC_FETCH_NAND_4 */ "__atomic_fetch_nand_4",
+/* ATOMIC_FETCH_NAND_8 */ "__atomic_fetch_nand_8",
+/* ATOMIC_FETCH_NAND_16 */ "__atomic_fetch_nand_16",
+
+/* STACKPROTECTOR_CHECK_FAIL */ "__stack_chk_fail",
+
+/* DEOPTIMIZE */ "__llvm_deoptimize",
+};
+
+void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
+ RTLIB::Libcall LC, SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params) {
+ assert(Rets.empty());
+ assert(Params.empty());
+
+ WebAssembly::ExprType iPTR = Subtarget.hasAddr64() ?
+ WebAssembly::ExprType::I64 :
+ WebAssembly::ExprType::I32;
+
+ switch (RuntimeLibcallSignatures[LC]) {
+ case func:
+ break;
+ case f32_func_f32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f32_func_f64:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case f32_func_i32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f32_func_i64:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case f32_func_i16:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f64_func_f32:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f64_func_f64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case f64_func_i32:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f64_func_i64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i32_func_f32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i32_func_f64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i32_func_i32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i64_func_f32:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i64_func_f64:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i64_func_i64:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case f32_func_f32_f32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f32_func_f32_i32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f32_func_i64_i64:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case f64_func_f64_f64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case f64_func_f64_i32:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case f64_func_i64_i64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i16_func_f32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i8_func_i8_i8:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case func_f32_iPTR_iPTR:
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case func_f64_iPTR_iPTR:
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case i16_func_i16_i16:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i32_func_f32_f32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i32_func_f64_f64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i32_func_i32_i32:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i64_func_i64_i64:
+ Rets.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i64_i64_func_f32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case i64_i64_func_f64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case i16_i16_func_i16_i16:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I32);
+ Rets.push_back(wasm::ValType::I32);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i32_i32_func_i32_i32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I32);
+ Rets.push_back(wasm::ValType::I32);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case i64_i64_func_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i64_i64_func_i64_i64_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i64_i64_i64_i64_func_i64_i64_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i64_i64_func_i64_i64_i32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+ Rets.push_back(wasm::ValType::I64);
+ Rets.push_back(wasm::ValType::I64);
+#else
+ Params.push_back(wasm::ValType(iPTR));
+#endif
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case iPTR_func_iPTR_i32_iPTR:
+ Rets.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case iPTR_func_iPTR_iPTR_iPTR:
+ Rets.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case f32_func_f32_f32_f32:
+ Rets.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case f64_func_f64_f64_f64:
+ Rets.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case func_i64_i64_iPTR_iPTR:
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType(iPTR));
+ break;
+ case func_iPTR_f32:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::F32);
+ break;
+ case func_iPTR_f64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::F64);
+ break;
+ case func_iPTR_i32:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I32);
+ break;
+ case func_iPTR_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case func_iPTR_i64_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case func_iPTR_i64_i64_i64_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case func_iPTR_i64_i64_i64_i64_i64_i64:
+ Params.push_back(wasm::ValType(iPTR));
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i32_func_i64_i64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case i32_func_i64_i64_i64_i64:
+ Rets.push_back(wasm::ValType::I32);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ break;
+ case unsupported:
+ llvm_unreachable("unsupported runtime library signature");
+ }
+}
+
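+// A minimal usage sketch (hypothetical caller; it assumes a
+// WebAssemblySubtarget `ST` is in scope). Until wasm gains multi-value
+// returns, the two i64 halves of a 128-bit multiply libcall travel through
+// the leading pointer parameter rather than through Rets:
+//
+//   SmallVector<wasm::ValType, 4> Rets, Params;
+//   GetSignature(ST, RTLIB::MUL_I128, Rets, Params);
+//   // Rets is empty; Params is { iPTR, I64, I64, I64, I64 }.
+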
+void llvm::GetSignature(const WebAssemblySubtarget &Subtarget, const char *Name,
+ SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params) {
+ assert(strcmp(RuntimeLibcallNames[RTLIB::DEOPTIMIZE], "__llvm_deoptimize") ==
+ 0);
+
+ for (size_t i = 0, e = RTLIB::UNKNOWN_LIBCALL; i < e; ++i)
+ if (RuntimeLibcallNames[i] && strcmp(RuntimeLibcallNames[i], Name) == 0)
+ return GetSignature(Subtarget, RTLIB::Libcall(i), Rets, Params);
+
+ llvm_unreachable("unexpected runtime library name");
+}
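+
+// For example (a sketch under the same assumptions as above), lookup by name
+// resolves through RuntimeLibcallNames, so
+//   GetSignature(ST, "memcpy", Rets, Params);
+// behaves exactly like GetSignature(ST, RTLIB::MEMCPY, Rets, Params).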
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
new file mode 100644
index 0000000..1290676
--- /dev/null
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -0,0 +1,37 @@
+// CodeGen/RuntimeLibcallSignatures.h - R.T. Lib. Call Signatures -*- C++ -*--//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file provides signature information for runtime libcalls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_RUNTIME_LIBCALL_SIGNATURES_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_RUNTIME_LIBCALL_SIGNATURES_H
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+
+namespace llvm {
+
+class WebAssemblySubtarget;
+
+extern void GetSignature(const WebAssemblySubtarget &Subtarget,
+ RTLIB::Libcall LC,
+ SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params);
+
+extern void GetSignature(const WebAssemblySubtarget &Subtarget,
+ const char *Name, SmallVectorImpl<wasm::ValType> &Rets,
+ SmallVectorImpl<wasm::ValType> &Params);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
index 2441ead..b1385f4 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -12,8 +12,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
index 34ec6f2..8173364 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp
@@ -24,8 +24,8 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -154,7 +154,7 @@ static bool optimizeCall(MachineBasicBlock &MBB, MachineInstr &MI,
if (!callReturnsInput)
return false;
- LibFunc::Func Func;
+ LibFunc Func;
if (!LibInfo.getLibFunc(Name, Func))
return false;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index f5ef35a..7b05f67 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -12,9 +12,9 @@
///
//===----------------------------------------------------------------------===//
-#include "WebAssembly.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "WebAssemblyTargetMachine.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyTargetObjectFile.h"
#include "WebAssemblyTargetTransformInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -74,13 +74,25 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
: "e-m:e-p:32:32-i64:64-n32:64-S128",
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
CM, OL),
- TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
+ TLOF(TT.isOSBinFormatELF() ?
+ static_cast<TargetLoweringObjectFile*>(
+ new WebAssemblyTargetObjectFileELF()) :
+ static_cast<TargetLoweringObjectFile*>(
+ new WebAssemblyTargetObjectFile())) {
// WebAssembly type-checks instructions, but a noreturn function with a return
// type that doesn't match the context will cause a check failure. So we lower
// LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
  // 'unreachable' instruction, which is meant for that case.
this->Options.TrapUnreachable = true;
+ // WebAssembly treats each function as an independent unit. Force
+ // -ffunction-sections, effectively, so that we can emit them independently.
+ if (!TT.isOSBinFormatELF()) {
+ this->Options.FunctionSections = true;
+ this->Options.DataSections = true;
+ this->Options.UniqueSectionNames = true;
+ }
+
initAsmInfo();
// Note that we don't use setRequiresStructuredCFG(true). It disables
@@ -117,7 +129,7 @@ namespace {
/// WebAssembly Code Generator Pass Configuration Options.
class WebAssemblyPassConfig final : public TargetPassConfig {
public:
- WebAssemblyPassConfig(WebAssemblyTargetMachine *TM, PassManagerBase &PM)
+ WebAssemblyPassConfig(WebAssemblyTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
WebAssemblyTargetMachine &getWebAssemblyTargetMachine() const {
@@ -142,7 +154,7 @@ TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() {
TargetPassConfig *
WebAssemblyTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new WebAssemblyPassConfig(this, PM);
+ return new WebAssemblyPassConfig(*this, PM);
}
FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) {
@@ -161,7 +173,7 @@ void WebAssemblyPassConfig::addIRPasses() {
else
// Expand some atomic operations. WebAssemblyTargetLowering has hooks which
// control specifically what gets lowered.
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
// Fix function bitcasts, as WebAssembly requires caller and callee signatures
// to match.
@@ -260,13 +272,19 @@ void WebAssemblyPassConfig::addPreEmitPass() {
addPass(createWebAssemblyRegColoring());
}
+ // Eliminate multiple-entry loops. Do this before inserting explicit get_local
+ // and set_local operators because we create a new variable that we want
+ // converted into a local.
+ addPass(createWebAssemblyFixIrreducibleControlFlow());
+
// Insert explicit get_local and set_local operators.
addPass(createWebAssemblyExplicitLocals());
- // Eliminate multiple-entry loops.
- addPass(createWebAssemblyFixIrreducibleControlFlow());
+ // Sort the blocks of the CFG into topological order, a prerequisite for
+ // BLOCK and LOOP markers.
+ addPass(createWebAssemblyCFGSort());
- // Put the CFG in structured form; insert BLOCK and LOOP markers.
+ // Insert BLOCK and LOOP markers.
addPass(createWebAssemblyCFGStackify());
// Lower br_unless into br_if.
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
index 74e33b9..b1fd108 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -17,8 +17,14 @@
#include "WebAssemblyTargetMachine.h"
using namespace llvm;
-void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
+void WebAssemblyTargetObjectFileELF::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
InitializeELF(TM.Options.UseInitArray);
}
+
+void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileWasm::Initialize(Ctx, TM);
+ InitializeWasm();
+}
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
index 39e50c9..ace87c9 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -20,7 +20,13 @@
namespace llvm {
-class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF {
+class WebAssemblyTargetObjectFileELF final
+ : public TargetLoweringObjectFileELF {
+public:
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileWasm {
public:
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 47aadf9..b3ce4bd 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -36,7 +36,7 @@ unsigned WebAssemblyTTIImpl::getNumberOfRegisters(bool Vector) {
return Result;
}
-unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector && getST()->hasSIMD128())
return 128;
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index f658609..7b35fc9 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -55,7 +55,7 @@ public:
/// @{
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index a0049c1..e32772d 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -15,6 +15,7 @@
#include "WebAssemblyUtilities.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;
bool WebAssembly::isArgument(const MachineInstr &MI) {
@@ -69,3 +70,28 @@ bool WebAssembly::isChild(const MachineInstr &MI,
return TargetRegisterInfo::isVirtualRegister(Reg) &&
MFI.isVRegStackified(Reg);
}
+
+bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL_INDIRECT_VOID:
+ case WebAssembly::CALL_INDIRECT_I32:
+ case WebAssembly::CALL_INDIRECT_I64:
+ case WebAssembly::CALL_INDIRECT_F32:
+ case WebAssembly::CALL_INDIRECT_F64:
+ case WebAssembly::CALL_INDIRECT_v16i8:
+ case WebAssembly::CALL_INDIRECT_v8i16:
+ case WebAssembly::CALL_INDIRECT_v4i32:
+ case WebAssembly::CALL_INDIRECT_v4f32:
+ return true;
+ default:
+ return false;
+ }
+}
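+
+// For instance, an indirect call through a function pointer that returns i32
+// is selected to WebAssembly::CALL_INDIRECT_I32, so isCallIndirect reports
+// true for it, while a direct CALL_I32 reports false.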
+
+MachineBasicBlock *llvm::LoopBottom(const MachineLoop *Loop) {
+ MachineBasicBlock *Bottom = Loop->getHeader();
+ for (MachineBasicBlock *MBB : Loop->blocks())
+ if (MBB->getNumber() > Bottom->getNumber())
+ Bottom = MBB;
+ return Bottom;
+}
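+
+// Illustration (hypothetical numbering): for a discontiguous loop whose
+// blocks carry layout numbers {4, 7, 12}, LoopBottom returns the block
+// numbered 12; MachineLoop::getBottomBlock assumes the loop's blocks are
+// laid out contiguously, which need not hold here.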
diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
index eb11440..595491f 100644
--- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -18,7 +18,9 @@
namespace llvm {
+class MachineBasicBlock;
class MachineInstr;
+class MachineLoop;
class WebAssemblyFunctionInfo;
namespace WebAssembly {
@@ -27,8 +29,15 @@ bool isArgument(const MachineInstr &MI);
bool isCopy(const MachineInstr &MI);
bool isTee(const MachineInstr &MI);
bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+bool isCallIndirect(const MachineInstr &MI);
} // end namespace WebAssembly
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
+MachineBasicBlock *LoopBottom(const MachineLoop *Loop);
+
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 8dd5e8a..35a6713 100644
--- a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -1,5 +1,15 @@
# Tests which are known to fail from the GCC torture test suite.
+# Syntax: Each line has a single test to be marked as a 'known failure' (or
+# 'exclusion'). Known failures are expected to fail, and will cause an error
+# if they pass. (Known failures that do not run at all will not cause an
+# error). The format is
+# <name> <attributes> # comment
+#
+# The attributes in this case represent the different arguments passed to the
+# compiler: 'wasm-s' is for compiling to .s files, and 'wasm-o' for compiling
+# to wasm object files (.o).
+
# Computed gotos are not supported (Cannot select BlockAddress/BRIND)
20040302-1.c
20071210-1.c
@@ -23,9 +33,6 @@ built-in-setjmp.c
pr60003.c
# Error in the program / unsupported by Clang.
-scal-to-vec1.c
-scal-to-vec2.c
-scal-to-vec3.c
20000822-1.c
20010209-1.c
20010605-1.c
@@ -66,3 +73,18 @@ pr41935.c
920728-1.c
pr28865.c
widechar-2.c
+
+# crash: Running pass 'WebAssembly Explicit Locals' on function
+20020107-1.c wasm-o
+20030222-1.c wasm-o
+20071220-1.c wasm-o
+20071220-2.c wasm-o
+990130-1.c wasm-o
+pr38533.c wasm-o
+pr41239.c wasm-o
+pr43385.c wasm-o
+pr43560.c wasm-o
+pr45695.c wasm-o
+pr49279.c wasm-o
+pr49390.c wasm-o
+pr52286.c wasm-o
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index c38a7d1..f7e31de 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -1,4 +1,4 @@
-//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -8,23 +8,30 @@
//===----------------------------------------------------------------------===//
#include "X86AsmInstrumentation.h"
-#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86Operand.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
#include <algorithm>
#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <memory>
#include <vector>
// Following comment describes how assembly instrumentation works.
@@ -91,30 +98,35 @@
// register as a frame register and temporarily override the current CFA
// register.
-namespace llvm {
-namespace {
+using namespace llvm;
static cl::opt<bool> ClAsanInstrumentAssembly(
"asan-instrument-assembly",
cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
cl::init(false));
-const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
-const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+static const int64_t MinAllowedDisplacement =
+ std::numeric_limits<int32_t>::min();
+static const int64_t MaxAllowedDisplacement =
+ std::numeric_limits<int32_t>::max();
-int64_t ApplyDisplacementBounds(int64_t Displacement) {
+static int64_t ApplyDisplacementBounds(int64_t Displacement) {
return std::max(std::min(MaxAllowedDisplacement, Displacement),
MinAllowedDisplacement);
}
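
// e.g. ApplyDisplacementBounds(int64_t(1) << 40) clamps the value down to
// INT32_MAX (2147483647), and anything below INT32_MIN clamps up to
// INT32_MIN, keeping displacements encodable in 32 bits.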
-void CheckDisplacementBounds(int64_t Displacement) {
+static void CheckDisplacementBounds(int64_t Displacement) {
assert(Displacement >= MinAllowedDisplacement &&
Displacement <= MaxAllowedDisplacement);
}
-bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
+static bool IsStackReg(unsigned Reg) {
+ return Reg == X86::RSP || Reg == X86::ESP;
+}
-bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+
+namespace {
class X86AddressSanitizer : public X86AsmInstrumentation {
public:
@@ -178,7 +190,7 @@ public:
X86AddressSanitizer(const MCSubtargetInfo *&STI)
: X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
- ~X86AddressSanitizer() override {}
+ ~X86AddressSanitizer() override = default;
// X86AsmInstrumentation implementation:
void InstrumentAndEmitInstruction(const MCInst &Inst,
@@ -255,9 +267,11 @@ protected:
bool is64BitMode() const {
return STI->getFeatureBits()[X86::Mode64Bit];
}
+
bool is32BitMode() const {
return STI->getFeatureBits()[X86::Mode32Bit];
}
+
bool is16BitMode() const {
return STI->getFeatureBits()[X86::Mode16Bit];
}
@@ -498,7 +512,7 @@ public:
X86AddressSanitizer32(const MCSubtargetInfo *&STI)
: X86AddressSanitizer(STI) {}
- ~X86AddressSanitizer32() override {}
+ ~X86AddressSanitizer32() override = default;
unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
@@ -604,9 +618,9 @@ private:
EmitInstruction(
Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
(IsWrite ? "store" : "load") +
- llvm::Twine(AccessSize));
+ Twine(AccessSize));
const MCSymbolRefExpr *FnExpr =
MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
@@ -756,7 +770,7 @@ public:
X86AddressSanitizer64(const MCSubtargetInfo *&STI)
: X86AddressSanitizer(STI) {}
- ~X86AddressSanitizer64() override {}
+ ~X86AddressSanitizer64() override = default;
unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
@@ -875,15 +889,17 @@ private:
EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
RegCtx.AddressReg(64)));
}
- MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+ MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
(IsWrite ? "store" : "load") +
- llvm::Twine(AccessSize));
+ Twine(AccessSize));
const MCSymbolRefExpr *FnExpr =
MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
}
};
+} // end anonymous namespace
+
void X86AddressSanitizer64::InstrumentMemOperandSmall(
X86Operand &Op, unsigned AccessSize, bool IsWrite,
const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
@@ -1022,12 +1038,10 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
RestoreFlags(Out);
}
-} // End anonymous namespace
-
X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
- : STI(STI), InitialFrameReg(0) {}
+ : STI(STI) {}
-X86AsmInstrumentation::~X86AsmInstrumentation() {}
+X86AsmInstrumentation::~X86AsmInstrumentation() = default;
void X86AsmInstrumentation::InstrumentAndEmitInstruction(
const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
@@ -1060,8 +1074,9 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
}
X86AsmInstrumentation *
-CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
- const MCContext &Ctx, const MCSubtargetInfo *&STI) {
+llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+ const MCContext &Ctx,
+ const MCSubtargetInfo *&STI) {
Triple T(STI->getTargetTriple());
const bool hasCompilerRTSupport = T.isOSLinux();
if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
@@ -1073,5 +1088,3 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
}
return new X86AsmInstrumentation(STI);
}
-
-} // end llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 470cead..97a55cd 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -1,4 +1,4 @@
-//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,7 +11,6 @@
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
#include "llvm/ADT/SmallVector.h"
-
#include <memory>
namespace llvm {
@@ -23,7 +22,6 @@ class MCParsedAsmOperand;
class MCStreamer;
class MCSubtargetInfo;
class MCTargetOptions;
-
class X86AsmInstrumentation;
X86AsmInstrumentation *
@@ -43,7 +41,7 @@ public:
// Tries to instrument and emit instruction.
virtual void InstrumentAndEmitInstruction(
const MCInst &Inst,
- SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands,
+ SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
protected:
@@ -60,9 +58,9 @@ protected:
const MCSubtargetInfo *&STI;
- unsigned InitialFrameReg;
+ unsigned InitialFrameReg = 0;
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index e692118..c1d216c 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -49,8 +49,11 @@ static const char OpPrecedence[] = {
4, // IC_MINUS
5, // IC_MULTIPLY
5, // IC_DIVIDE
- 6, // IC_RPAREN
- 7, // IC_LPAREN
+ 5, // IC_MOD
+ 6, // IC_NOT
+ 7, // IC_NEG
+ 8, // IC_RPAREN
+ 9, // IC_LPAREN
0, // IC_IMM
0 // IC_REGISTER
};
@@ -92,17 +95,32 @@ private:
IC_MINUS,
IC_MULTIPLY,
IC_DIVIDE,
+ IC_MOD,
+ IC_NOT,
+ IC_NEG,
IC_RPAREN,
IC_LPAREN,
IC_IMM,
IC_REGISTER
};
+ enum IntelOperatorKind {
+ IOK_INVALID = 0,
+ IOK_LENGTH,
+ IOK_SIZE,
+ IOK_TYPE,
+ IOK_OFFSET
+ };
+
class InfixCalculator {
typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
SmallVector<ICToken, 4> PostfixStack;
+ bool isUnaryOperator(const InfixCalculatorTok Op) {
+ return Op == IC_NEG || Op == IC_NOT;
+ }
+
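+    // e.g. evaluating "NOT 5" pushes the immediate 5 and then IC_NOT onto
+    // the postfix stack; the evaluator below pops the lone operand and
+    // pushes ~5 = -6 back as an immediate.
+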
public:
int64_t popOperand() {
      assert (!PostfixStack.empty() && "Popped an empty stack!");
@@ -184,6 +202,22 @@ private:
ICToken Op = PostfixStack[i];
if (Op.first == IC_IMM || Op.first == IC_REGISTER) {
OperandStack.push_back(Op);
+ } else if (isUnaryOperator(Op.first)) {
+ assert (OperandStack.size() > 0 && "Too few operands.");
+ ICToken Operand = OperandStack.pop_back_val();
+ assert (Operand.first == IC_IMM &&
+ "Unary operation with a register!");
+ switch (Op.first) {
+ default:
+ report_fatal_error("Unexpected operator!");
+ break;
+ case IC_NEG:
+ OperandStack.push_back(std::make_pair(IC_IMM, -Operand.second));
+ break;
+ case IC_NOT:
+ OperandStack.push_back(std::make_pair(IC_IMM, ~Operand.second));
+ break;
+ }
} else {
assert (OperandStack.size() > 1 && "Too few operands.");
int64_t Val;
@@ -214,6 +248,12 @@ private:
Val = Op1.second / Op2.second;
OperandStack.push_back(std::make_pair(IC_IMM, Val));
break;
+ case IC_MOD:
+ assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
+ "Modulo operation with an immediate and a register!");
+ Val = Op1.second % Op2.second;
+ OperandStack.push_back(std::make_pair(IC_IMM, Val));
+ break;
case IC_OR:
assert (Op1.first == IC_IMM && Op2.first == IC_IMM &&
"Or operation with an immediate and a register!");
@@ -263,6 +303,7 @@ private:
IES_NOT,
IES_MULTIPLY,
IES_DIVIDE,
+ IES_MOD,
IES_LBRAC,
IES_RBRAC,
IES_LPAREN,
@@ -413,10 +454,16 @@ private:
default:
State = IES_ERROR;
break;
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
case IES_PLUS:
case IES_NOT:
case IES_MULTIPLY:
case IES_DIVIDE:
+ case IES_MOD:
case IES_LPAREN:
case IES_RPAREN:
case IES_LBRAC:
@@ -424,11 +471,12 @@ private:
case IES_INTEGER:
case IES_REGISTER:
State = IES_MINUS;
- // Only push the minus operator if it is not a unary operator.
- if (!(CurrState == IES_PLUS || CurrState == IES_MINUS ||
- CurrState == IES_MULTIPLY || CurrState == IES_DIVIDE ||
- CurrState == IES_LPAREN || CurrState == IES_LBRAC))
+ // push minus operator if it is not a negate operator
+ if (CurrState == IES_REGISTER || CurrState == IES_RPAREN ||
+ CurrState == IES_INTEGER || CurrState == IES_RBRAC)
IC.pushOperator(IC_MINUS);
+ else
+ IC.pushOperator(IC_NEG);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
// a scale of 1.
@@ -450,9 +498,21 @@ private:
default:
State = IES_ERROR;
break;
+ case IES_OR:
+ case IES_XOR:
+ case IES_AND:
+ case IES_LSHIFT:
+ case IES_RSHIFT:
case IES_PLUS:
+ case IES_MINUS:
case IES_NOT:
+ case IES_MULTIPLY:
+ case IES_DIVIDE:
+ case IES_MOD:
+ case IES_LPAREN:
+ case IES_LBRAC:
State = IES_NOT;
+ IC.pushOperator(IC_NOT);
break;
}
PrevState = CurrState;
@@ -517,6 +577,7 @@ private:
case IES_LSHIFT:
case IES_RSHIFT:
case IES_DIVIDE:
+ case IES_MOD:
case IES_MULTIPLY:
case IES_LPAREN:
State = IES_INTEGER;
@@ -531,26 +592,6 @@ private:
}
// Get the scale and replace the 'Register * Scale' with '0'.
IC.popOperator();
- } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
- PrevState == IES_OR || PrevState == IES_AND ||
- PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
- PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
- PrevState == IES_NOT || PrevState == IES_XOR) &&
- CurrState == IES_MINUS) {
- // Unary minus. No need to pop the minus operand because it was never
- // pushed.
- IC.pushOperand(IC_IMM, -TmpInt); // Push -Imm.
- } else if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
- PrevState == IES_OR || PrevState == IES_AND ||
- PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
- PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
- PrevState == IES_NOT || PrevState == IES_XOR) &&
- CurrState == IES_NOT) {
- // Unary not. No need to pop the not operand because it was never
- // pushed.
- IC.pushOperand(IC_IMM, ~TmpInt); // Push ~Imm.
} else {
IC.pushOperand(IC_IMM, TmpInt);
}
@@ -586,6 +627,19 @@ private:
break;
}
}
+ void onMod() {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_INTEGER:
+ case IES_RPAREN:
+ State = IES_MOD;
+ IC.pushOperator(IC_MOD);
+ break;
+ }
+ }
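+  // e.g. parsing "mov eax, 10 MOD 3" reaches onMod in state IES_INTEGER;
+  // IC_MOD is pushed and the calculator later folds the operand to 1.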
void onLBrac() {
PrevState = State;
switch (State) {
@@ -639,18 +693,8 @@ private:
case IES_RSHIFT:
case IES_MULTIPLY:
case IES_DIVIDE:
+ case IES_MOD:
case IES_LPAREN:
- // FIXME: We don't handle this type of unary minus or not, yet.
- if ((PrevState == IES_PLUS || PrevState == IES_MINUS ||
- PrevState == IES_OR || PrevState == IES_AND ||
- PrevState == IES_LSHIFT || PrevState == IES_RSHIFT ||
- PrevState == IES_MULTIPLY || PrevState == IES_DIVIDE ||
- PrevState == IES_LPAREN || PrevState == IES_LBRAC ||
- PrevState == IES_NOT || PrevState == IES_XOR) &&
- (CurrState == IES_MINUS || CurrState == IES_NOT)) {
- State = IES_ERROR;
- break;
- }
State = IES_LPAREN;
IC.pushOperator(IC_LPAREN);
break;
@@ -704,10 +748,12 @@ private:
std::unique_ptr<X86Operand> ParseIntelOperand();
std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
- std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+ unsigned IdentifyIntelOperator(StringRef Name);
+ unsigned ParseIntelOperator(unsigned OpKind);
std::unique_ptr<X86Operand>
ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+ bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
std::unique_ptr<X86Operand>
ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp,
@@ -766,11 +812,6 @@ private:
bool ParseZ(std::unique_ptr<X86Operand> &Z, const SMLoc &StartLoc);
- /// MS-compatibility:
- /// Obtain an appropriate size qualifier, when facing its absence,
- /// upon AVX512 vector/broadcast memory operand
- unsigned AdjustAVX512Mem(unsigned Size, X86Operand* UnsizedMemOpNext);
-
bool is64BitMode() const {
// FIXME: Can tablegen auto-generate this?
return getSTI().getFeatureBits()[X86::Mode64Bit];
@@ -814,6 +855,7 @@ private:
/// }
public:
+
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
const MCInstrInfo &mii, const MCTargetOptions &Options)
: MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr),
@@ -1195,27 +1237,16 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
Identifier, Info.OpDecl);
}
+
// We either have a direct symbol reference, or an offset from a symbol. The
// parser always puts the symbol on the LHS, so look there for size
// calculation purposes.
+ unsigned FrontendSize = 0;
const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
bool IsSymRef =
isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
- if (IsSymRef) {
- if (!Size) {
- Size = Info.Type * 8; // Size is in terms of bits in this context.
- if (Size)
- InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
- /*Len=*/0, Size);
- if (AllowBetterSizeMatch)
- // Handle cases where size qualifier is absent, upon an indirect symbol
- // reference - e.g. "vaddps zmm1, zmm2, [var]"
- // set Size to zero to allow matching mechansim to try and find a better
- // size qualifier than our initial guess, based on available variants of
- // the given instruction
- Size = 0;
- }
- }
+ if (IsSymRef && !Size && Info.Type)
+ FrontendSize = Info.Type * 8; // Size is in terms of bits in this context.
// When parsing inline assembly we set the base register to a non-zero value
// if we don't know the actual value at this time. This is necessary to
@@ -1223,7 +1254,7 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
BaseReg = BaseReg ? BaseReg : 1;
return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
IndexReg, Scale, Start, End, Size, Identifier,
- Info.OpDecl);
+ Info.OpDecl, FrontendSize);
}
static void
@@ -1266,10 +1297,12 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
}
}
// Remove all the ImmPrefix rewrites within the brackets.
+  // We may have some Imm rewrites as a result of operator evaluation;
+  // remove them as well.
for (AsmRewrite &AR : AsmRewrites) {
if (AR.Loc.getPointer() < StartInBrac.getPointer())
continue;
- if (AR.Kind == AOK_ImmPrefix)
+ if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm)
AR.Kind = AOK_Delete;
}
const char *SymLocPtr = SymName.data();
@@ -1286,6 +1319,32 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
}
}
+// Some binary bitwise operators have named synonyms.
+// Query a candidate string for being such a named operator
+// and, if so, invoke the appropriate handler.
+bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
+                                           IntelExprStateMachine &SM) {
+ // A named operator should be either lower or upper case, but not a mix
+ if (Name.compare(Name.lower()) && Name.compare(Name.upper()))
+ return false;
+ if (Name.equals_lower("not"))
+ SM.onNot();
+ else if (Name.equals_lower("or"))
+ SM.onOr();
+ else if (Name.equals_lower("shl"))
+ SM.onLShift();
+ else if (Name.equals_lower("shr"))
+ SM.onRShift();
+ else if (Name.equals_lower("xor"))
+ SM.onXor();
+ else if (Name.equals_lower("and"))
+ SM.onAnd();
+ else if (Name.equals_lower("mod"))
+ SM.onMod();
+ else
+ return false;
+ return true;
+}
+
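+// For instance, in the Intel-syntax expression "6 XOR 3" the token "XOR"
+// arrives as an identifier; ParseIntelNamedOperator routes it to SM.onXor()
+// and the state machine later folds the subexpression to the immediate 5.
+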
bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -1295,16 +1354,17 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
while (!Done) {
bool UpdateLocLex = true;
+ AsmToken::TokenKind TK = getLexer().getKind();
// The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
// identifier. Don't try an parse it as a register.
- if (PrevTK != AsmToken::Error && Tok.getString().startswith("."))
+ if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") &&
+ TK != AsmToken::Identifier)
break;
// If we're parsing an immediate expression, we don't expect a '['.
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
break;
- AsmToken::TokenKind TK = getLexer().getKind();
switch (TK) {
default: {
if (SM.isValidEndState()) {
@@ -1324,31 +1384,36 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
const MCExpr *Val;
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
+ UpdateLocLex = false;
if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
SM.onRegister(TmpReg);
- UpdateLocLex = false;
- break;
+ } else if (ParseIntelNamedOperator(Identifier, SM)) {
+ UpdateLocLex = true;
+ } else if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End))
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ SM.onIdentifierExpr(Val, Identifier);
+ } else if (unsigned OpKind = IdentifyIntelOperator(Identifier)) {
+ if (OpKind == IOK_OFFSET)
+        return Error(IdentLoc, "Dealing with the OFFSET operator as part of "
+                               "a compound immediate expression is not yet "
+                               "supported");
+ int64_t Val = ParseIntelOperator(OpKind);
+ if (!Val)
+ return true;
+ StringRef ErrMsg;
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ } else if (Identifier.find('.') != StringRef::npos &&
+ PrevTK == AsmToken::RBrac) {
+ return false;
} else {
- if (!isParsingInlineAsm()) {
- if (getParser().parsePrimaryExpr(Val, End))
- return Error(Tok.getLoc(), "Unexpected identifier!");
- } else {
- // This is a dot operator, not an adjacent identifier.
- if (Identifier.find('.') != StringRef::npos &&
- PrevTK == AsmToken::RBrac) {
- return false;
- } else {
- InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
- return true;
- }
- }
+ InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+ if (ParseIntelIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
+ return true;
SM.onIdentifierExpr(Val, Identifier);
- UpdateLocLex = false;
- break;
}
- return Error(Tok.getLoc(), "Unexpected identifier!");
+ break;
}
case AsmToken::Integer: {
StringRef ErrMsg;
@@ -1678,8 +1743,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
unsigned Len = DotDispStr.size();
- unsigned Val = OrigDispVal + DotDispVal;
- InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val);
+ InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal);
}
NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
@@ -1715,11 +1779,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
OffsetOfLoc, Identifier, Info.OpDecl);
}
-enum IntelOperatorKind {
- IOK_LENGTH,
- IOK_SIZE,
- IOK_TYPE
-};
+// Query a candidate string for being an Intel assembly operator
+// Report back its kind, or IOK_INVALID if it does not evaluate to a known one.
+unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) {
+  return StringSwitch<unsigned>(Name)
+      .Cases("TYPE", "type", IOK_TYPE)
+      .Cases("SIZE", "size", IOK_SIZE)
+      .Cases("LENGTH", "length", IOK_LENGTH)
+      .Cases("OFFSET", "offset", IOK_OFFSET)
+      .Default(IOK_INVALID);
+}
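+
+// Under MS-compatible semantics, for a C declaration "int arr[10]" (a
+// hypothetical example, with 32-bit int): LENGTH arr == 10, TYPE arr == 4,
+// and SIZE arr == 40, i.e. SIZE == LENGTH * TYPE.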
/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
/// returns the number of elements in an array. It returns the value 1 for
@@ -1727,7 +1796,7 @@ enum IntelOperatorKind {
/// variable. A variable's size is the product of its LENGTH and TYPE. The
/// TYPE operator returns the size of a C or C++ type or variable. If the
/// variable is an array, TYPE returns the size of a single element.
-std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+unsigned X86AsmParser::ParseIntelOperator(unsigned OpKind) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
SMLoc TypeLoc = Tok.getLoc();
@@ -1739,11 +1808,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
StringRef Identifier = Tok.getString();
if (ParseIntelIdentifier(Val, Identifier, Info,
/*Unevaluated=*/true, End))
- return nullptr;
-
- if (!Info.OpDecl)
- return ErrorOperand(Start, "unable to lookup expression");
+ return 0;
+ if (!Info.OpDecl) {
+ Error(Start, "unable to lookup expression");
+ return 0;
+ }
+
unsigned CVal = 0;
switch(OpKind) {
default: llvm_unreachable("Unexpected operand kind!");
@@ -1757,8 +1828,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
unsigned Len = End.getPointer() - TypeLoc.getPointer();
InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
- const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
- return X86Operand::CreateImm(Imm, Start, End);
+ return CVal;
}
std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
@@ -1766,18 +1836,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
- // Offset, length, type and size operators.
- if (isParsingInlineAsm()) {
- StringRef AsmTokStr = Tok.getString();
- if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+  // FIXME: Offset operator
+  // Should be handled as part of an immediate expression, like other
+  // operators. Currently it is only supported as a stand-alone operand.
+ if (isParsingInlineAsm())
+ if (IdentifyIntelOperator(Tok.getString()) == IOK_OFFSET)
return ParseIntelOffsetOfOperator();
- if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
- return ParseIntelOperator(IOK_LENGTH);
- if (AsmTokStr == "size" || AsmTokStr == "SIZE")
- return ParseIntelOperator(IOK_SIZE);
- if (AsmTokStr == "type" || AsmTokStr == "TYPE")
- return ParseIntelOperator(IOK_TYPE);
- }
bool PtrInOperand = false;
unsigned Size = getIntelMemOperandSize(Tok.getString());
@@ -2360,7 +2424,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Name == "lock" || Name == "rep" ||
Name == "repe" || Name == "repz" ||
Name == "repne" || Name == "repnz" ||
- Name == "rex64" || Name == "data16";
+ Name == "rex64" || Name == "data16" || Name == "data32";
bool CurlyAsEndOfStatement = false;
// This does the actual operand parsing. Don't parse any more if we have a
@@ -2389,8 +2453,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
break;
}
- // In MS inline asm curly braces mark the begining/end of a block, therefore
- // they should be interepreted as end of statement
+  // In MS inline asm curly braces mark the beginning/end of a block,
+  // therefore they should be interpreted as end of statement.
CurlyAsEndOfStatement =
isParsingIntelSyntax() && isParsingInlineAsm() &&
(getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
@@ -2842,23 +2906,6 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
}
-unsigned X86AsmParser::AdjustAVX512Mem(unsigned Size,
- X86Operand* UnsizedMemOpNext) {
- // Check for the existence of an AVX512 platform
- if (!getSTI().getFeatureBits()[X86::FeatureAVX512])
- return 0;
- // Allow adjusting upon a (x|y|z)mm
- if (Size == 512 || Size == 256 || Size == 128)
- return Size;
- // This is an allegadly broadcasting mem op adjustment,
- // allow some more inquiring to validate it
- if (Size == 64 || Size == 32)
- return UnsizedMemOpNext && UnsizedMemOpNext->isToken() &&
- UnsizedMemOpNext->getToken().substr(0, 4).equals("{1to") ? Size : 0;
- // Do not allow any other type of adjustments
- return 0;
-}
-
bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -2878,19 +2925,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// Find one unsized memory operand, if present.
X86Operand *UnsizedMemOp = nullptr;
- // If unsized memory operand was found - obtain following operand.
- // For use in AdjustAVX512Mem
- X86Operand *UnsizedMemOpNext = nullptr;
for (const auto &Op : Operands) {
X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
- if (UnsizedMemOp) {
- UnsizedMemOpNext = X86Op;
+ if (X86Op->isMemUnsized()) {
+ UnsizedMemOp = X86Op;
      // We have found an unqualified memory operand; break, since Intel
      // assembly allows only one memory operand per instruction.
break;
}
- if (X86Op->isMemUnsized())
- UnsizedMemOp = X86Op;
}
// Allow some instructions to have implicitly pointer-sized operands. This is
@@ -2936,7 +2978,6 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// If an unsized memory operand is present, try to match with each memory
// operand size. In Intel assembly, the size is not part of the instruction
// mnemonic.
- unsigned MatchedSize = 0;
if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) {
static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512};
for (unsigned Size : MopSizes) {
@@ -2951,17 +2992,10 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// If this returned as a missing feature failure, remember that.
if (Match.back() == Match_MissingFeature)
ErrorInfoMissingFeature = ErrorInfoIgnore;
- if (M == Match_Success)
- // MS-compatability:
- // Adjust AVX512 vector/broadcast memory operand,
- // when facing the absence of a size qualifier.
- // Match GCC behavior on respective cases.
- MatchedSize = AdjustAVX512Mem(Size, UnsizedMemOpNext);
}
// Restore the size of the unsized memory operand if we modified it.
- if (UnsizedMemOp)
- UnsizedMemOp->Mem.Size = 0;
+ UnsizedMemOp->Mem.Size = 0;
}
// If we haven't matched anything yet, this is not a basic integer or FPU
@@ -2985,20 +3019,30 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
Op.getLocRange(), MatchingInlineAsm);
}
+ unsigned NumSuccessfulMatches =
+ std::count(std::begin(Match), std::end(Match), Match_Success);
+
+ // If matching was ambiguous and we had size information from the frontend,
+  // try again with that. This handles cases like "movzx eax, m8/m16".
+ if (UnsizedMemOp && NumSuccessfulMatches > 1 &&
+ UnsizedMemOp->getMemFrontendSize()) {
+ UnsizedMemOp->Mem.Size = UnsizedMemOp->getMemFrontendSize();
+ unsigned M = MatchInstruction(
+ Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax());
+ if (M == Match_Success)
+ NumSuccessfulMatches = 1;
+
+ // Add a rewrite that encodes the size information we used from the
+ // frontend.
+ InstInfo->AsmRewrites->emplace_back(
+ AOK_SizeDirective, UnsizedMemOp->getStartLoc(),
+ /*Len=*/0, UnsizedMemOp->getMemFrontendSize());
+ }
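+  // e.g. for "movzx eax, var" where the frontend declared a 16-bit "var",
+  // both the m8 and m16 forms match above; retrying with the frontend size
+  // (16) as Mem.Size uniquely selects the 16-bit form.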
+
// If exactly one matched, then we treat that as a successful match (and the
// instruction will already have been filled in correctly, since the failing
// matches won't have modified it).
- unsigned NumSuccessfulMatches =
- std::count(std::begin(Match), std::end(Match), Match_Success);
if (NumSuccessfulMatches == 1) {
- if (MatchedSize && isParsingInlineAsm() && isParsingIntelSyntax())
- // MS compatibility -
- // Fix the rewrite according to the matched memory size
- // MS inline assembly only
- for (AsmRewrite &AR : *InstInfo->AsmRewrites)
- if ((AR.Loc.getPointer() == UnsizedMemOp->StartLoc.getPointer()) &&
- (AR.Kind == AOK_SizeDirective))
- AR.Val = MatchedSize;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the individual
// transformations can chain off each other.
@@ -3015,7 +3059,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
"multiple matches only possible with unsized memory operands");
return Error(UnsizedMemOp->getStartLoc(),
"ambiguous operand size for instruction '" + Mnemonic + "\'",
- UnsizedMemOp->getLocRange(), MatchingInlineAsm);
+ UnsizedMemOp->getLocRange());
}
// If one instruction matched with a missing feature, report this as a
@@ -3052,6 +3096,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
else if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
else if (IDVal.startswith(".att_syntax")) {
+ getParser().setParsingInlineAsm(false);
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if (Parser.getTok().getString() == "prefix")
Parser.Lex();
@@ -3064,6 +3109,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
} else if (IDVal.startswith(".intel_syntax")) {
getParser().setAssemblerDialect(1);
+ getParser().setParsingInlineAsm(true);
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if (Parser.getTok().getString() == "noprefix")
Parser.Lex();
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 9db1a84..0fba15c 100644
--- a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -1,4 +1,4 @@
-//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,12 +11,17 @@
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
#include "X86AsmParserCommon.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/ADT/STLExtras.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <memory>
namespace llvm {
@@ -57,6 +62,10 @@ struct X86Operand : public MCParsedAsmOperand {
unsigned Scale;
unsigned Size;
unsigned ModeSize;
+
+ /// If the memory operand is unsized and there are multiple instruction
+ /// matches, prefer the one with this size.
+ unsigned FrontendSize;
};
union {
@@ -74,11 +83,14 @@ struct X86Operand : public MCParsedAsmOperand {
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override { return StartLoc; }
+
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const override { return EndLoc; }
+
/// getLocRange - Get the range between the first and last token of this
/// operand.
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
/// getOffsetOfLoc - Get the location of the offset operator.
SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
@@ -128,6 +140,10 @@ struct X86Operand : public MCParsedAsmOperand {
assert(Kind == Memory && "Invalid access!");
return Mem.ModeSize;
}
+ unsigned getMemFrontendSize() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.FrontendSize;
+ }
bool isToken() const override {return Kind == Token; }
@@ -271,6 +287,9 @@ struct X86Operand : public MCParsedAsmOperand {
bool isMem256_RC256X() const {
return isMem256() && isMemIndexReg(X86::YMM0, X86::YMM31);
}
+ bool isMem256_RC512() const {
+ return isMem256() && isMemIndexReg(X86::ZMM0, X86::ZMM31);
+ }
bool isMem512_RC256X() const {
return isMem512() && isMemIndexReg(X86::YMM0, X86::YMM31);
}
@@ -419,10 +438,12 @@ struct X86Operand : public MCParsedAsmOperand {
RegNo = getGR32FromGR64(RegNo);
Inst.addOperand(MCOperand::createReg(RegNo));
}
+
void addAVX512RCOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
}
+
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
@@ -451,6 +472,7 @@ struct X86Operand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
Inst.addOperand(MCOperand::createReg(getMemSegReg()));
}
+
void addDstIdxOperands(MCInst &Inst, unsigned N) const {
assert((N == 1) && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
@@ -498,7 +520,7 @@ struct X86Operand : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand>
CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
+ void *OpDecl = nullptr, unsigned FrontendSize = 0) {
auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
@@ -507,6 +529,7 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.Scale = 1;
Res->Mem.Size = Size;
Res->Mem.ModeSize = ModeSize;
+ Res->Mem.FrontendSize = FrontendSize;
Res->SymName = SymName;
Res->OpDecl = OpDecl;
Res->AddressOf = false;
@@ -518,7 +541,7 @@ struct X86Operand : public MCParsedAsmOperand {
CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = nullptr) {
+ void *OpDecl = nullptr, unsigned FrontendSize = 0) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -534,6 +557,7 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.Scale = Scale;
Res->Mem.Size = Size;
Res->Mem.ModeSize = ModeSize;
+ Res->Mem.FrontendSize = FrontendSize;
Res->SymName = SymName;
Res->OpDecl = OpDecl;
Res->AddressOf = false;
@@ -541,6 +565,6 @@ struct X86Operand : public MCParsedAsmOperand {
}
};
-} // End of namespace llvm
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 0871888..4ce908b 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -74,8 +74,8 @@
//
//===----------------------------------------------------------------------===//
-#include "X86DisassemblerDecoder.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86DisassemblerDecoder.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCExpr.h"
@@ -368,32 +368,49 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
bool isBranch = false;
uint64_t pcrel = 0;
- if (type == TYPE_RELv) {
+ if (type == TYPE_REL) {
isBranch = true;
pcrel = insn.startLocation +
insn.immediateOffset + insn.immediateSize;
- switch (insn.displacementSize) {
+ switch (operand.encoding) {
default:
break;
- case 1:
+ case ENCODING_Iv:
+ switch (insn.displacementSize) {
+ default:
+ break;
+ case 1:
+ if(immediate & 0x80)
+ immediate |= ~(0xffull);
+ break;
+ case 2:
+ if(immediate & 0x8000)
+ immediate |= ~(0xffffull);
+ break;
+ case 4:
+ if(immediate & 0x80000000)
+ immediate |= ~(0xffffffffull);
+ break;
+ case 8:
+ break;
+ }
+ break;
+ case ENCODING_IB:
if(immediate & 0x80)
immediate |= ~(0xffull);
break;
- case 2:
+ case ENCODING_IW:
if(immediate & 0x8000)
immediate |= ~(0xffffull);
break;
- case 4:
+ case ENCODING_ID:
if(immediate & 0x80000000)
immediate |= ~(0xffffffffull);
break;
- case 8:
- break;
}
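  // e.g. an ENCODING_IB pc-relative target byte of 0xf0 sign-extends to
  // 0xfffffffffffffff0, i.e. a branch displacement of -16.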
}
// By default sign-extend all X86 immediates based on their encoding.
- else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
- type == TYPE_IMM64 || type == TYPE_IMMv) {
+ else if (type == TYPE_IMM) {
switch (operand.encoding) {
default:
break;
@@ -620,38 +637,17 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
}
switch (type) {
- case TYPE_XMM32:
- case TYPE_XMM64:
- case TYPE_XMM128:
+ case TYPE_XMM:
mcInst.addOperand(MCOperand::createReg(X86::XMM0 + (immediate >> 4)));
return;
- case TYPE_XMM256:
+ case TYPE_YMM:
mcInst.addOperand(MCOperand::createReg(X86::YMM0 + (immediate >> 4)));
return;
- case TYPE_XMM512:
+ case TYPE_ZMM:
mcInst.addOperand(MCOperand::createReg(X86::ZMM0 + (immediate >> 4)));
return;
case TYPE_BNDR:
mcInst.addOperand(MCOperand::createReg(X86::BND0 + (immediate >> 4)));
- case TYPE_REL8:
- isBranch = true;
- pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- if (immediate & 0x80)
- immediate |= ~(0xffull);
- break;
- case TYPE_REL16:
- isBranch = true;
- pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- if (immediate & 0x8000)
- immediate |= ~(0xffffull);
- break;
- case TYPE_REL32:
- case TYPE_REL64:
- isBranch = true;
- pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
- if(immediate & 0x80000000)
- immediate |= ~(0xffffffffull);
- break;
default:
// operand is 64 bits wide. Do nothing.
break;
@@ -662,8 +658,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
mcInst, Dis))
mcInst.addOperand(MCOperand::createImm(immediate));
- if (type == TYPE_MOFFS8 || type == TYPE_MOFFS16 ||
- type == TYPE_MOFFS32 || type == TYPE_MOFFS64) {
+ if (type == TYPE_MOFFS) {
MCOperand segmentReg;
segmentReg = MCOperand::createReg(segmentRegnums[insn.segmentOverride]);
mcInst.addOperand(segmentReg);
@@ -767,7 +762,27 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
Opcode == X86::VPGATHERDQYrm ||
Opcode == X86::VPGATHERQQrm ||
Opcode == X86::VPGATHERDDrm ||
- Opcode == X86::VPGATHERQDrm);
+ Opcode == X86::VPGATHERQDrm ||
+ Opcode == X86::VGATHERDPDZ128rm ||
+ Opcode == X86::VGATHERDPDZ256rm ||
+ Opcode == X86::VGATHERDPSZ128rm ||
+ Opcode == X86::VGATHERQPDZ128rm ||
+ Opcode == X86::VGATHERQPSZ128rm ||
+ Opcode == X86::VPGATHERDDZ128rm ||
+ Opcode == X86::VPGATHERDQZ128rm ||
+ Opcode == X86::VPGATHERDQZ256rm ||
+ Opcode == X86::VPGATHERQDZ128rm ||
+ Opcode == X86::VPGATHERQQZ128rm ||
+ Opcode == X86::VSCATTERDPDZ128mr ||
+ Opcode == X86::VSCATTERDPDZ256mr ||
+ Opcode == X86::VSCATTERDPSZ128mr ||
+ Opcode == X86::VSCATTERQPDZ128mr ||
+ Opcode == X86::VSCATTERQPSZ128mr ||
+ Opcode == X86::VPSCATTERDDZ128mr ||
+ Opcode == X86::VPSCATTERDQZ128mr ||
+ Opcode == X86::VPSCATTERDQZ256mr ||
+ Opcode == X86::VPSCATTERQDZ128mr ||
+ Opcode == X86::VPSCATTERQQZ128mr);
bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
Opcode == X86::VGATHERDPSYrm ||
Opcode == X86::VGATHERQPSYrm ||
@@ -775,13 +790,49 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
Opcode == X86::VPGATHERDQZrm ||
Opcode == X86::VPGATHERQQYrm ||
Opcode == X86::VPGATHERDDYrm ||
- Opcode == X86::VPGATHERQDYrm);
+ Opcode == X86::VPGATHERQDYrm ||
+ Opcode == X86::VGATHERDPSZ256rm ||
+ Opcode == X86::VGATHERQPDZ256rm ||
+ Opcode == X86::VGATHERQPSZ256rm ||
+ Opcode == X86::VPGATHERDDZ256rm ||
+ Opcode == X86::VPGATHERQQZ256rm ||
+ Opcode == X86::VPGATHERQDZ256rm ||
+ Opcode == X86::VSCATTERDPDZmr ||
+ Opcode == X86::VPSCATTERDQZmr ||
+ Opcode == X86::VSCATTERDPSZ256mr ||
+ Opcode == X86::VSCATTERQPDZ256mr ||
+ Opcode == X86::VSCATTERQPSZ256mr ||
+ Opcode == X86::VPSCATTERDDZ256mr ||
+ Opcode == X86::VPSCATTERQQZ256mr ||
+ Opcode == X86::VPSCATTERQDZ256mr ||
+ Opcode == X86::VGATHERPF0DPDm ||
+ Opcode == X86::VGATHERPF1DPDm ||
+ Opcode == X86::VSCATTERPF0DPDm ||
+ Opcode == X86::VSCATTERPF1DPDm);
bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
Opcode == X86::VGATHERDPSZrm ||
Opcode == X86::VGATHERQPSZrm ||
Opcode == X86::VPGATHERQQZrm ||
Opcode == X86::VPGATHERDDZrm ||
- Opcode == X86::VPGATHERQDZrm);
+ Opcode == X86::VPGATHERQDZrm ||
+ Opcode == X86::VSCATTERQPDZmr ||
+ Opcode == X86::VSCATTERDPSZmr ||
+ Opcode == X86::VSCATTERQPSZmr ||
+ Opcode == X86::VPSCATTERQQZmr ||
+ Opcode == X86::VPSCATTERDDZmr ||
+ Opcode == X86::VPSCATTERQDZmr ||
+ Opcode == X86::VGATHERPF0DPSm ||
+ Opcode == X86::VGATHERPF0QPDm ||
+ Opcode == X86::VGATHERPF0QPSm ||
+ Opcode == X86::VGATHERPF1DPSm ||
+ Opcode == X86::VGATHERPF1QPDm ||
+ Opcode == X86::VGATHERPF1QPSm ||
+ Opcode == X86::VSCATTERPF0DPSm ||
+ Opcode == X86::VSCATTERPF0QPDm ||
+ Opcode == X86::VSCATTERPF0QPSm ||
+ Opcode == X86::VSCATTERPF1DPSm ||
+ Opcode == X86::VSCATTERPF1QPDm ||
+ Opcode == X86::VSCATTERPF1QPSm);
if (IndexIs128 || IndexIs256 || IndexIs512) {
unsigned IndexOffset = insn.sibIndex -
(insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
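
[editor's note] Once an opcode lands in one of the gather/scatter lists above, the decoder rebases the raw SIB index into the matching vector register class. A self-contained sketch of that step (the numeric bases are placeholders for the real X86::XMM0/YMM0/ZMM0 enumerators):

    // Hypothetical register-number bases standing in for X86::XMM0/YMM0/ZMM0.
    constexpr unsigned XMM0 = 100, YMM0 = 200, ZMM0 = 300;

    // IndexOffset corresponds to insn.sibIndex - SIB_INDEX_RAX/EAX above.
    unsigned rebaseVSIBIndex(unsigned IndexOffset, bool IndexIs128,
                             bool IndexIs256) {
      if (IndexIs128) return XMM0 + IndexOffset; // xmm index register
      if (IndexIs256) return YMM0 + IndexOffset; // ymm index register
      return ZMM0 + IndexOffset;                 // zmm index register
    }
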
@@ -909,38 +960,15 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_R64:
case TYPE_Rv:
case TYPE_MM64:
- case TYPE_XMM32:
- case TYPE_XMM64:
- case TYPE_XMM128:
- case TYPE_XMM256:
- case TYPE_XMM512:
- case TYPE_VK1:
- case TYPE_VK2:
- case TYPE_VK4:
- case TYPE_VK8:
- case TYPE_VK16:
- case TYPE_VK32:
- case TYPE_VK64:
+ case TYPE_XMM:
+ case TYPE_YMM:
+ case TYPE_ZMM:
+ case TYPE_VK:
case TYPE_DEBUGREG:
case TYPE_CONTROLREG:
case TYPE_BNDR:
return translateRMRegister(mcInst, insn);
case TYPE_M:
- case TYPE_M8:
- case TYPE_M16:
- case TYPE_M32:
- case TYPE_M64:
- case TYPE_M128:
- case TYPE_M256:
- case TYPE_M512:
- case TYPE_Mv:
- case TYPE_M32FP:
- case TYPE_M64FP:
- case TYPE_M80FP:
- case TYPE_M1616:
- case TYPE_M1632:
- case TYPE_M1664:
- case TYPE_LEA:
return translateRMMemory(mcInst, insn, Dis);
}
}
@@ -992,6 +1020,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
case ENCODING_WRITEMASK:
return translateMaskRegister(mcInst, insn.writemask);
CASE_ENCODING_RM:
+ CASE_ENCODING_VSIB:
return translateRM(mcInst, operand, insn, Dis);
case ENCODING_IB:
case ENCODING_IW:
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index ab64d6f..577b7a7 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -13,10 +13,10 @@
//
//===----------------------------------------------------------------------===//
-#include <cstdarg> /* for va_*() */
-#include <cstdio> /* for vsnprintf() */
-#include <cstdlib> /* for exit() */
-#include <cstring> /* for memset() */
+#include <cstdarg> /* for va_*() */
+#include <cstdio> /* for vsnprintf() */
+#include <cstdlib> /* for exit() */
+#include <cstring> /* for memset() */
#include "X86DisassemblerDecoder.h"
@@ -650,11 +650,6 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->addressSize = (hasAdSize ? 4 : 8);
insn->displacementSize = 4;
insn->immediateSize = 4;
- } else if (insn->rexPrefix) {
- insn->registerSize = (hasOpSize ? 2 : 4);
- insn->addressSize = (hasAdSize ? 4 : 8);
- insn->displacementSize = (hasOpSize ? 2 : 4);
- insn->immediateSize = (hasOpSize ? 2 : 4);
} else {
insn->registerSize = (hasOpSize ? 2 : 4);
insn->addressSize = (hasAdSize ? 4 : 8);
@@ -1475,21 +1470,13 @@ static int readModRM(struct InternalInstruction* insn) {
return prefix##_EAX + index; \
case TYPE_R64: \
return prefix##_RAX + index; \
- case TYPE_XMM512: \
+ case TYPE_ZMM: \
return prefix##_ZMM0 + index; \
- case TYPE_XMM256: \
+ case TYPE_YMM: \
return prefix##_YMM0 + index; \
- case TYPE_XMM128: \
- case TYPE_XMM64: \
- case TYPE_XMM32: \
+ case TYPE_XMM: \
return prefix##_XMM0 + index; \
- case TYPE_VK1: \
- case TYPE_VK2: \
- case TYPE_VK4: \
- case TYPE_VK8: \
- case TYPE_VK16: \
- case TYPE_VK32: \
- case TYPE_VK64: \
+ case TYPE_VK: \
if (index > 7) \
*valid = 0; \
return prefix##_K0 + index; \
@@ -1562,6 +1549,7 @@ static int fixupReg(struct InternalInstruction *insn,
return -1;
break;
CASE_ENCODING_RM:
+ CASE_ENCODING_VSIB:
if (insn->eaBase >= insn->eaRegBase) {
insn->eaBase = (EABase)fixupRMValue(insn,
(OperandType)op->type,
@@ -1753,6 +1741,18 @@ static int readOperands(struct InternalInstruction* insn) {
case ENCODING_SI:
case ENCODING_DI:
break;
CASE_ENCODING_VSIB:
+ // VSIB can use the V2 bit, so check only the other bits.

+ if (needVVVV)
+ needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0);
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ // Apply the AVX512 compressed displacement scaling factor.
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
+ break;
case ENCODING_REG:
CASE_ENCODING_RM:
if (readModRM(insn))
@@ -1774,8 +1774,7 @@ static int readOperands(struct InternalInstruction* insn) {
}
if (readImmediate(insn, 1))
return -1;
- if (Op.type == TYPE_XMM128 ||
- Op.type == TYPE_XMM256)
+ if (Op.type == TYPE_XMM || Op.type == TYPE_YMM)
sawRegImm = 1;
break;
case ENCODING_IW:
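
[editor's note] The new VSIB operand path applies AVX-512 disp8*N compression: the scale factor is recovered from the encoding enum's distance from ENCODING_VSIB, so each *_CDn entry must sit exactly one slot past the previous one. A standalone sketch of that arithmetic, assuming the same enum spacing:

    #include <cstdint>

    // Mirror of the enum layout the decoder relies on: the distance from
    // ENCODING_VSIB is log2 of the displacement scale.
    enum Encoding { ENCODING_VSIB, ENCODING_VSIB_CD2, ENCODING_VSIB_CD4,
                    ENCODING_VSIB_CD8, ENCODING_VSIB_CD16, ENCODING_VSIB_CD32,
                    ENCODING_VSIB_CD64 };

    int64_t scaleDisp8(int64_t disp8, Encoding enc) {
      return disp8 * (1LL << (enc - ENCODING_VSIB));
    }
    // scaleDisp8(3, ENCODING_VSIB_CD8) == 24: a compressed disp8 of 3 means
    // a byte offset of 3 * 8 under disp8*N compression.
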
diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 0a835b8..e0f4399 100644
--- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -339,6 +339,15 @@ enum ModRMDecisionType {
case ENCODING_RM_CD32: \
case ENCODING_RM_CD64
+#define CASE_ENCODING_VSIB \
+ case ENCODING_VSIB: \
+ case ENCODING_VSIB_CD2: \
+ case ENCODING_VSIB_CD4: \
+ case ENCODING_VSIB_CD8: \
+ case ENCODING_VSIB_CD16: \
+ case ENCODING_VSIB_CD32: \
+ case ENCODING_VSIB_CD64
+
// Physical encodings of instruction operands.
#define ENCODINGS \
ENUM_ENTRY(ENCODING_NONE, "") \
@@ -350,6 +359,13 @@ enum ModRMDecisionType {
ENUM_ENTRY(ENCODING_RM_CD16,"R/M operand with CDisp scaling of 16") \
ENUM_ENTRY(ENCODING_RM_CD32,"R/M operand with CDisp scaling of 32") \
ENUM_ENTRY(ENCODING_RM_CD64,"R/M operand with CDisp scaling of 64") \
+ ENUM_ENTRY(ENCODING_VSIB, "VSIB operand in ModR/M byte.") \
+ ENUM_ENTRY(ENCODING_VSIB_CD2, "VSIB operand with CDisp scaling of 2") \
+ ENUM_ENTRY(ENCODING_VSIB_CD4, "VSIB operand with CDisp scaling of 4") \
+ ENUM_ENTRY(ENCODING_VSIB_CD8, "VSIB operand with CDisp scaling of 8") \
+ ENUM_ENTRY(ENCODING_VSIB_CD16,"VSIB operand with CDisp scaling of 16") \
+ ENUM_ENTRY(ENCODING_VSIB_CD32,"VSIB operand with CDisp scaling of 32") \
+ ENUM_ENTRY(ENCODING_VSIB_CD64,"VSIB operand with CDisp scaling of 64") \
ENUM_ENTRY(ENCODING_VVVV, "Register operand in VEX.vvvv byte.") \
ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.") \
ENUM_ENTRY(ENCODING_IB, "1-byte immediate") \
@@ -383,85 +399,38 @@ enum OperandEncoding {
// Semantic interpretations of instruction operands.
#define TYPES \
ENUM_ENTRY(TYPE_NONE, "") \
- ENUM_ENTRY(TYPE_REL8, "1-byte immediate address") \
- ENUM_ENTRY(TYPE_REL16, "2-byte") \
- ENUM_ENTRY(TYPE_REL32, "4-byte") \
- ENUM_ENTRY(TYPE_REL64, "8-byte") \
- ENUM_ENTRY(TYPE_PTR1616, "2+2-byte segment+offset address") \
- ENUM_ENTRY(TYPE_PTR1632, "2+4-byte") \
- ENUM_ENTRY(TYPE_PTR1664, "2+8-byte") \
+ ENUM_ENTRY(TYPE_REL, "immediate address") \
ENUM_ENTRY(TYPE_R8, "1-byte register operand") \
ENUM_ENTRY(TYPE_R16, "2-byte") \
ENUM_ENTRY(TYPE_R32, "4-byte") \
ENUM_ENTRY(TYPE_R64, "8-byte") \
- ENUM_ENTRY(TYPE_IMM8, "1-byte immediate operand") \
- ENUM_ENTRY(TYPE_IMM16, "2-byte") \
- ENUM_ENTRY(TYPE_IMM32, "4-byte") \
- ENUM_ENTRY(TYPE_IMM64, "8-byte") \
+ ENUM_ENTRY(TYPE_IMM, "immediate operand") \
ENUM_ENTRY(TYPE_IMM3, "1-byte immediate operand between 0 and 7") \
ENUM_ENTRY(TYPE_IMM5, "1-byte immediate operand between 0 and 31") \
ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \
ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \
- ENUM_ENTRY(TYPE_RM8, "1-byte register or memory operand") \
- ENUM_ENTRY(TYPE_RM16, "2-byte") \
- ENUM_ENTRY(TYPE_RM32, "4-byte") \
- ENUM_ENTRY(TYPE_RM64, "8-byte") \
ENUM_ENTRY(TYPE_M, "Memory operand") \
- ENUM_ENTRY(TYPE_M8, "1-byte") \
- ENUM_ENTRY(TYPE_M16, "2-byte") \
- ENUM_ENTRY(TYPE_M32, "4-byte") \
- ENUM_ENTRY(TYPE_M64, "8-byte") \
- ENUM_ENTRY(TYPE_LEA, "Effective address") \
- ENUM_ENTRY(TYPE_M128, "16-byte (SSE/SSE2)") \
- ENUM_ENTRY(TYPE_M256, "256-byte (AVX)") \
- ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \
- ENUM_ENTRY(TYPE_M1632, "2+4-byte") \
- ENUM_ENTRY(TYPE_M1664, "2+8-byte") \
- ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \
- ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \
- ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \
- ENUM_ENTRY(TYPE_SRCIDX64, "8-byte memory at source index") \
- ENUM_ENTRY(TYPE_DSTIDX8, "1-byte memory at destination index") \
- ENUM_ENTRY(TYPE_DSTIDX16, "2-byte memory at destination index") \
- ENUM_ENTRY(TYPE_DSTIDX32, "4-byte memory at destination index") \
- ENUM_ENTRY(TYPE_DSTIDX64, "8-byte memory at destination index") \
- ENUM_ENTRY(TYPE_MOFFS8, "1-byte memory offset (relative to segment " \
- "base)") \
- ENUM_ENTRY(TYPE_MOFFS16, "2-byte") \
- ENUM_ENTRY(TYPE_MOFFS32, "4-byte") \
- ENUM_ENTRY(TYPE_MOFFS64, "8-byte") \
- ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \
- ENUM_ENTRY(TYPE_M64FP, "64-bit") \
- ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \
+ ENUM_ENTRY(TYPE_SRCIDX, "memory at source index") \
+ ENUM_ENTRY(TYPE_DSTIDX, "memory at destination index") \
+ ENUM_ENTRY(TYPE_MOFFS, "memory offset (relative to segment base)") \
ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \
ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \
- ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \
- ENUM_ENTRY(TYPE_XMM64, "8-byte") \
- ENUM_ENTRY(TYPE_XMM128, "16-byte") \
- ENUM_ENTRY(TYPE_XMM256, "32-byte") \
- ENUM_ENTRY(TYPE_XMM512, "64-byte") \
- ENUM_ENTRY(TYPE_VK1, "1-bit") \
- ENUM_ENTRY(TYPE_VK2, "2-bit") \
- ENUM_ENTRY(TYPE_VK4, "4-bit") \
- ENUM_ENTRY(TYPE_VK8, "8-bit") \
- ENUM_ENTRY(TYPE_VK16, "16-bit") \
- ENUM_ENTRY(TYPE_VK32, "32-bit") \
- ENUM_ENTRY(TYPE_VK64, "64-bit") \
+ ENUM_ENTRY(TYPE_XMM, "16-byte") \
+ ENUM_ENTRY(TYPE_YMM, "32-byte") \
+ ENUM_ENTRY(TYPE_ZMM, "64-byte") \
+ ENUM_ENTRY(TYPE_VK, "mask register") \
ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \
ENUM_ENTRY(TYPE_DEBUGREG, "Debug register operand") \
ENUM_ENTRY(TYPE_CONTROLREG, "Control register operand") \
ENUM_ENTRY(TYPE_BNDR, "MPX bounds register") \
\
- ENUM_ENTRY(TYPE_Mv, "Memory operand of operand size") \
ENUM_ENTRY(TYPE_Rv, "Register operand of operand size") \
- ENUM_ENTRY(TYPE_IMMv, "Immediate operand of operand size") \
ENUM_ENTRY(TYPE_RELv, "Immediate address of operand size") \
ENUM_ENTRY(TYPE_DUP0, "Duplicate of operand 0") \
ENUM_ENTRY(TYPE_DUP1, "operand 1") \
ENUM_ENTRY(TYPE_DUP2, "operand 2") \
ENUM_ENTRY(TYPE_DUP3, "operand 3") \
ENUM_ENTRY(TYPE_DUP4, "operand 4") \
- ENUM_ENTRY(TYPE_M512, "512-bit FPU/MMX/XMM/MXCSR state")
#define ENUM_ENTRY(n, d) n,
enum OperandType {
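
[editor's note] The TYPES and ENCODINGS tables above are X-macros: each consumer defines ENUM_ENTRY before expanding them, as the `#define ENUM_ENTRY(n, d) n,` line does for the enum. A minimal, self-contained sketch of the same pattern with an invented COLORS table:

    // One table, expanded twice: once into an enum, once into a name array.
    #define COLORS \
      ENUM_ENTRY(COLOR_RED,   "red")   \
      ENUM_ENTRY(COLOR_GREEN, "green")

    #define ENUM_ENTRY(n, d) n,
    enum Color { COLORS COLOR_MAX };
    #undef ENUM_ENTRY

    #define ENUM_ENTRY(n, d) d,
    static const char *ColorNames[] = { COLORS };
    #undef ENUM_ENTRY
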
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 10b7e6f..4d91300 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -14,17 +14,20 @@
#include "X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86InstComments.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -61,6 +64,17 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
OS << "\tcallq\t";
printPCRelImm(MI, 0, OS);
}
+ // data16 and data32 both have the same encoding of 0x66. While data32 is
+ // valid only in 16-bit mode, data16 is valid in all other modes.
+ // The Requires clause does not appear to be fully supported, which causes
+ // 0x66 to be interpreted as "data16" by the asm printer.
+ // Thus we adjust the opcode here in order to print the right instruction.
+ else if (MI->getOpcode() == X86::DATA16_PREFIX &&
+ (STI.getFeatureBits()[X86::Mode16Bit])) {
+ MCInst Data32MI(*MI);
+ Data32MI.setOpcode(X86::DATA32_PREFIX);
+ printInstruction(&Data32MI, OS);
+ }
// Try to print any aliases first.
else if (!printAliasInstr(MI, OS))
printInstruction(MI, OS);
@@ -135,6 +149,7 @@ void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
case 3: O << "{rz-sae}"; break;
}
}
+
/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value (e.g. for jumps and calls). These
/// print slightly differently than normal immediates. For example, a $ is not
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index bbb3090..946c1c7 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -1,4 +1,4 @@
-//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=//
//
// The LLVM Compiler Infrastructure
//
@@ -137,6 +137,7 @@ public:
private:
bool HasCustomInstComment;
};
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 8594add..f5f3a4c 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -15,8 +15,8 @@
#include "X86InstComments.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "Utils/X86ShuffleDecode.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -587,6 +587,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSLLDQZ256rr:
case X86::VPSLLDQZ512rr:
Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
case X86::VPSLLDQZ128rm:
case X86::VPSLLDQZ256rm:
case X86::VPSLLDQZ512rm:
@@ -604,6 +605,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSRLDQZ256rr:
case X86::VPSRLDQZ512rr:
Src1Name = getRegName(MI->getOperand(1).getReg());
+ LLVM_FALLTHROUGH;
case X86::VPSRLDQZ128rm:
case X86::VPSRLDQZ256rm:
case X86::VPSRLDQZ512rm:
@@ -1036,7 +1038,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::EXTRQI:
if (MI->getOperand(2).isImm() &&
MI->getOperand(3).isImm())
- DecodeEXTRQIMask(MI->getOperand(2).getImm(),
+ DecodeEXTRQIMask(MVT::v16i8, MI->getOperand(2).getImm(),
MI->getOperand(3).getImm(),
ShuffleMask);
@@ -1047,7 +1049,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::INSERTQI:
if (MI->getOperand(3).isImm() &&
MI->getOperand(4).isImm())
- DecodeINSERTQIMask(MI->getOperand(3).getImm(),
+ DecodeINSERTQIMask(MVT::v16i8, MI->getOperand(3).getImm(),
MI->getOperand(4).getImm(),
ShuffleMask);
@@ -1091,6 +1093,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
DecodeSubVectorBroadcast(MVT::v8f32, MVT::v2f32, ShuffleMask);
@@ -1099,6 +1102,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
DecodeSubVectorBroadcast(MVT::v16f32, MVT::v2f32, ShuffleMask);
@@ -1189,8 +1193,6 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
OS << ']';
--i; // For loop increments element #.
}
- //MI->print(OS, 0);
- OS << "\n";
// We successfully added a comment to this instruction.
return true;
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 4443edb..d6af671 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -14,14 +14,16 @@
#include "X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-#include <cctype>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 20cd7ff..ace3118 100644
--- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -157,6 +157,6 @@ public:
}
};
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index e83ec9f..733eac7 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -10,6 +10,8 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86FixupKinds.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
@@ -22,9 +24,7 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -108,12 +108,12 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override {
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override {
unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
- assert(Fixup.getOffset() + Size <= DataSize &&
- "Invalid fixup offset!");
+ assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
// Check that upper bits are either all zeros or all ones.
// Specifically ignore overflow/underflow as long as the leakage is
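
[editor's note] The assertion rewrite above keeps the check that follows it: a fixup value must fit its field up to sign leakage. A minimal sketch of that "upper bits all zeros or all ones" test, with a helper name of our own:

    #include <cstdint>

    // A value fits an N-byte fixup if truncating it loses nothing but
    // sign bits, i.e. the discarded upper bits are all zeros or all ones.
    static bool fitsInFixup(uint64_t value, unsigned size) {
      if (size >= 8)
        return true;                               // Full width always fits.
      uint64_t upper = value >> (size * 8);        // Bits being discarded.
      uint64_t allOnes = ~uint64_t(0) >> (size * 8);
      return upper == 0 || upper == allOnes;
    }
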
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index aab5525..d8953da 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -212,7 +212,12 @@ namespace X86II {
/// the offset from beginning of section.
///
/// This is the TLS offset for the COFF/Windows TLS mechanism.
- MO_SECREL
+ MO_SECREL,
+
+ /// MO_ABS8 - On a symbol operand this indicates that the symbol is known
+ /// to be an absolute symbol in range [0,128), so we can use the @ABS8
+ /// symbol modifier.
+ MO_ABS8,
};
enum : uint64_t {
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index da69da5..4da4eeb 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -9,28 +9,32 @@
#include "MCTargetDesc/X86FixupKinds.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
namespace {
- class X86ELFObjectWriter : public MCELFObjectTargetWriter {
- public:
- X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
- ~X86ELFObjectWriter() override;
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+ ~X86ELFObjectWriter() override = default;
- protected:
- unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
- const MCFixup &Fixup, bool IsPCRel) const override;
- };
-}
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+};
+
+} // end anonymous namespace
X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
uint16_t EMachine)
@@ -40,9 +44,6 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
(EMachine != ELF::EM_386) &&
(EMachine != ELF::EM_IAMCU)) {}
-X86ELFObjectWriter::~X86ELFObjectWriter()
-{}
-
enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
static X86_64RelType getType64(unsigned Kind,
@@ -96,6 +97,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
default:
llvm_unreachable("Unimplemented");
case MCSymbolRefExpr::VK_None:
+ case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
case RT64_64:
return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
@@ -219,6 +221,7 @@ static unsigned getRelocType32(MCContext &Ctx,
default:
llvm_unreachable("Unimplemented");
case MCSymbolRefExpr::VK_None:
+ case MCSymbolRefExpr::VK_X86_ABS8:
switch (Type) {
case RT32_32:
return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
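
[editor's note] In the relocation tables above, VK_X86_ABS8 now shares the VK_None rows. A toy version of the 32-bit dispatch under the same structure (the RelType32 names mirror the real enum; the numeric values are the standard ELF i386 relocation numbers, everything else is ours):

    enum RelType32 { RT32_32, RT32_16, RT32_8 };

    // Map an operand width and PC-relativity to an ELF i386 reloc type.
    unsigned reloc32(RelType32 type, bool isPCRel) {
      switch (type) {
      case RT32_32: return isPCRel ? 2  /*R_386_PC32*/ : 1  /*R_386_32*/;
      case RT32_16: return isPCRel ? 21 /*R_386_PC16*/ : 20 /*R_386_16*/;
      case RT32_8:  return isPCRel ? 23 /*R_386_PC8*/  : 22 /*R_386_8*/;
      }
      return 0; // unreachable for valid input
    }
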
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 48a1d8f..1538a51 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -13,12 +13,12 @@
#include "X86MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
using namespace llvm;
enum AsmWriterFlavorTy {
@@ -43,7 +43,7 @@ void X86MCAsmInfoDarwin::anchor() { }
X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
bool is64Bit = T.getArch() == Triple::x86_64;
if (is64Bit)
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
AssemblerDialect = AsmWriterFlavor;
@@ -92,7 +92,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
// For ELF, x86-64 pointer size depends on the ABI.
// For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
// with the x32 ABI, pointer size remains the default 4.
- PointerSize = (is64Bit && !isX32) ? 8 : 4;
+ CodePointerSize = (is64Bit && !isX32) ? 8 : 4;
// OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
@@ -129,7 +129,7 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
if (Triple.getArch() == Triple::x86_64) {
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
- PointerSize = 8;
+ CodePointerSize = 8;
WinEHEncodingType = WinEH::EncodingType::Itanium;
} else {
// 32-bit X86 doesn't use CFI, so this isn't a real encoding type. It's just
@@ -156,7 +156,7 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
if (Triple.getArch() == Triple::x86_64) {
PrivateGlobalPrefix = ".L";
PrivateLabelPrefix = ".L";
- PointerSize = 8;
+ CodePointerSize = 8;
WinEHEncodingType = WinEH::EncodingType::Itanium;
ExceptionsType = ExceptionHandling::WinEH;
} else {
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 8045e7c..10e2bbc 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -11,35 +11,43 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
namespace {
+
class X86MCCodeEmitter : public MCCodeEmitter {
- X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
- void operator=(const X86MCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
MCContext &Ctx;
+
public:
X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
: MCII(mcii), Ctx(ctx) {
}
-
- ~X86MCCodeEmitter() override {}
+ X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
+ X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete;
+ ~X86MCCodeEmitter() override = default;
bool is64BitMode(const MCSubtargetInfo &STI) const {
return STI.getFeatureBits()[X86::Mode64Bit];
@@ -106,8 +114,7 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
int ImmOffset = 0) const;
- inline static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode,
- unsigned RM) {
+ static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
return RM | (RegOpcode << 3) | (Mod << 6);
}
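
[editor's note] For reference, ModRMByte above packs the three ModRM fields exactly per the x86 encoding: mod (2 bits), reg/opcode (3 bits), r/m (3 bits). A standalone sketch plus a worked byte:

    #include <cassert>
    #include <cstdint>

    // Compose a ModRM byte: r/m in bits 0-2, reg/opcode in bits 3-5,
    // mod in bits 6-7.
    static uint8_t modRM(unsigned mod, unsigned regOpcode, unsigned rm) {
      assert(mod < 4 && regOpcode < 8 && rm < 8);
      return uint8_t(rm | (regOpcode << 3) | (mod << 6));
    }
    // modRM(3, 0, 1) == 0xC1: register-direct (mod=3), /0, r/m = ecx/rcx.
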
@@ -149,12 +156,6 @@ public:
} // end anonymous namespace
-MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
- const MCRegisterInfo &MRI,
- MCContext &Ctx) {
- return new X86MCCodeEmitter(MCII, Ctx);
-}
-
/// isDisp8 - Return true if this signed displacement fits in an 8-bit
/// sign-extended field.
static bool isDisp8(int Value) {
@@ -1436,7 +1437,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r: {
+ case X86II::MRM6r: case X86II::MRM7r:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
@@ -1446,13 +1447,12 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
(Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
CurByte, OS);
break;
- }
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
- case X86II::MRM6m: case X86II::MRM7m: {
+ case X86II::MRM6m: case X86II::MRM7m:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
@@ -1463,7 +1463,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
Rex, CurByte, OS, Fixups, STI);
CurOp += X86::AddrNumOperands;
break;
- }
+
case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
@@ -1527,3 +1527,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
}
#endif
}
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new X86MCCodeEmitter(MCII, Ctx);
+}
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 297926d..8f2017e 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -7,9 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
@@ -19,7 +20,6 @@
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
-#include "llvm/Support/MachO.h"
using namespace llvm;
@@ -153,8 +153,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(
const MCSymbol *B_Base = Asm.getAtom(*B);
// Neither symbol can be modified.
- if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
- Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) {
+ if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) {
Asm.getContext().reportError(Fixup.getLoc(),
"unsupported relocation of modified symbol");
return;
@@ -397,7 +396,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
if (!SB->getFragment()) {
Asm.getContext().reportError(
Fixup.getLoc(),
- "symbol '" + B->getSymbol().getName() +
+ "symbol '" + SB->getName() +
"' can not be undefined in a subtraction expression");
return false;
}
@@ -409,7 +408,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer,
// pedantic compatibility with 'as'.
Type = A->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF
: (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
- Value2 = Writer->getSymbolAddress(B->getSymbol(), Layout);
+ Value2 = Writer->getSymbolAddress(*SB, Layout);
FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
}
@@ -469,8 +468,8 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
const MCFixup &Fixup,
MCValue Target,
uint64_t &FixedValue) {
- assert(Target.getSymA()->getKind() == MCSymbolRefExpr::VK_TLVP &&
- !is64Bit() &&
+ const MCSymbolRefExpr *SymA = Target.getSymA();
+ assert(SymA->getKind() == MCSymbolRefExpr::VK_TLVP && !is64Bit() &&
"Should only be called with a 32-bit TLVP relocation!");
unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind());
@@ -481,15 +480,14 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
// subtraction from the picbase. For 32-bit pic the addend is the difference
// between the picbase and the next address. For 32-bit static the addend is
// zero.
- if (Target.getSymB()) {
+ if (auto *SymB = Target.getSymB()) {
// If this is a subtraction then we're pcrel.
uint32_t FixupAddress =
Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
IsPCRel = 1;
- FixedValue =
- FixupAddress -
- Writer->getSymbolAddress(Target.getSymB()->getSymbol(), Layout) +
- Target.getConstant();
+ FixedValue = FixupAddress -
+ Writer->getSymbolAddress(SymB->getSymbol(), Layout) +
+ Target.getConstant();
FixedValue += 1ULL << Log2Size;
} else {
FixedValue = 0;
@@ -500,8 +498,7 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer,
MRE.r_word0 = Value;
MRE.r_word1 =
(IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28);
- Writer->addRelocation(&Target.getSymA()->getSymbol(), Fragment->getParent(),
- MRE);
+ Writer->addRelocation(&SymA->getSymbol(), Fragment->getParent(), MRE);
}
void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
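
[editor's note] For the 32-bit pic TLVP case above, the addend works out to the distance from the picbase symbol to the address just past the fixup field. A small worked sketch with invented addresses and a helper name of our own:

    #include <cstdint>

    // FixedValue = FixupAddress - addr(picbase) + constant + (1 << Log2Size),
    // where the final term skips over the fixup field itself.
    int64_t tlvpFixedValue(uint64_t fixupAddress, uint64_t picbaseAddress,
                           int64_t constant, unsigned log2Size) {
      return int64_t(fixupAddress - picbaseAddress) + constant +
             (1LL << log2Size);
    }
    // tlvpFixedValue(0x1010, 0x1000, 0, 2) == 0x14: 0x10 apart, plus the
    // 4-byte fixup field.
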
diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 33376b6..807f7a6 100644
--- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -9,41 +9,47 @@
#include "MCTargetDesc/X86FixupKinds.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
-#include "llvm/Support/COFF.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
-namespace llvm {
- class MCObjectWriter;
-}
-
namespace {
- class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
- public:
- X86WinCOFFObjectWriter(bool Is64Bit);
- ~X86WinCOFFObjectWriter() override;
- unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
- bool IsCrossSection,
- const MCAsmBackend &MAB) const override;
- };
-}
+class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+ X86WinCOFFObjectWriter(bool Is64Bit);
+ ~X86WinCOFFObjectWriter() override = default;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+};
+
+} // end anonymous namespace
X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
: MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
: COFF::IMAGE_FILE_MACHINE_I386) {}
-X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
-
-unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
+unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
+ const MCValue &Target,
const MCFixup &Fixup,
bool IsCrossSection,
const MCAsmBackend &MAB) const {
- unsigned FixupKind = IsCrossSection ? FK_PCRel_4 : Fixup.getKind();
+ unsigned FixupKind = Fixup.getKind();
+ if (IsCrossSection) {
+ if (FixupKind != FK_Data_4 && FixupKind != llvm::X86::reloc_signed_4byte) {
+ Ctx.reportError(Fixup.getLoc(), "Cannot represent this expression");
+ return COFF::IMAGE_REL_AMD64_ADDR32;
+ }
+ FixupKind = FK_PCRel_4;
+ }
MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 1be5aec..8a0fbfb 100644
--- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -452,15 +452,20 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) {
Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
}
-void DecodeEXTRQIMask(int Len, int Idx,
+void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask) {
+ assert(VT.is128BitVector() && "Expected 128-bit vector");
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned HalfElts = NumElts / 2;
+
// Only the bottom 6 bits are valid for each immediate.
Len &= 0x3F;
Idx &= 0x3F;
// We can only decode this bit extraction instruction as a shuffle if both the
- // length and index work with whole bytes.
- if (0 != (Len % 8) || 0 != (Idx % 8))
+ // length and index work with whole elements.
+ if (0 != (Len % EltSize) || 0 != (Idx % EltSize))
return;
// A length of zero is equivalent to a bit length of 64.
@@ -469,33 +474,38 @@ void DecodeEXTRQIMask(int Len, int Idx,
// If the length + index exceeds the bottom 64 bits the result is undefined.
if ((Len + Idx) > 64) {
- ShuffleMask.append(16, SM_SentinelUndef);
+ ShuffleMask.append(NumElts, SM_SentinelUndef);
return;
}
- // Convert index and index to work with bytes.
- Len /= 8;
- Idx /= 8;
+ // Convert length and index to work with elements.
+ Len /= EltSize;
+ Idx /= EltSize;
- // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes
- // of the lower 64-bits. The upper 64-bits are undefined.
+ // EXTRQ: Extract Len elements starting from Idx. Zero pad the remaining
+ // elements of the lower 64-bits. The upper 64-bits are undefined.
for (int i = 0; i != Len; ++i)
ShuffleMask.push_back(i + Idx);
- for (int i = Len; i != 8; ++i)
+ for (int i = Len; i != (int)HalfElts; ++i)
ShuffleMask.push_back(SM_SentinelZero);
- for (int i = 8; i != 16; ++i)
+ for (int i = HalfElts; i != (int)NumElts; ++i)
ShuffleMask.push_back(SM_SentinelUndef);
}
-void DecodeINSERTQIMask(int Len, int Idx,
+void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask) {
+ assert(VT.is128BitVector() && "Expected 128-bit vector");
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSize = VT.getScalarSizeInBits();
+ unsigned HalfElts = NumElts / 2;
+
// Only the bottom 6 bits are valid for each immediate.
Len &= 0x3F;
Idx &= 0x3F;
// We can only decode this bit insertion instruction as a shuffle if both the
- // length and index work with whole bytes.
- if (0 != (Len % 8) || 0 != (Idx % 8))
+ // length and index work with whole elements.
+ if (0 != (Len % EltSize) || 0 != (Idx % EltSize))
return;
// A length of zero is equivalent to a bit length of 64.
@@ -504,24 +514,24 @@ void DecodeINSERTQIMask(int Len, int Idx,
// If the length + index exceeds the bottom 64 bits the result is undefined.
if ((Len + Idx) > 64) {
- ShuffleMask.append(16, SM_SentinelUndef);
+ ShuffleMask.append(NumElts, SM_SentinelUndef);
return;
}
- // Convert index and index to work with bytes.
- Len /= 8;
- Idx /= 8;
+ // Convert length and index to work with elements.
+ Len /= EltSize;
+ Idx /= EltSize;
- // INSERTQ: Extract lowest Len bytes from lower half of second source and
- // insert over first source starting at Idx byte. The upper 64-bits are
+ // INSERTQ: Extract lowest Len elements from lower half of second source and
+ // insert over first source starting at Idx element. The upper 64-bits are
// undefined.
for (int i = 0; i != Idx; ++i)
ShuffleMask.push_back(i);
for (int i = 0; i != Len; ++i)
- ShuffleMask.push_back(i + 16);
- for (int i = Idx + Len; i != 8; ++i)
+ ShuffleMask.push_back(i + NumElts);
+ for (int i = Idx + Len; i != (int)HalfElts; ++i)
ShuffleMask.push_back(i);
- for (int i = 8; i != 16; ++i)
+ for (int i = HalfElts; i != (int)NumElts; ++i)
ShuffleMask.push_back(SM_SentinelUndef);
}
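
[editor's note] As a concrete check of the element-based rewrite above, consider DecodeEXTRQIMask on a v16i8 operand with an (invented) immediate length of 16 bits and index of 8 bits. EltSize is 8, so Len becomes 2 elements and Idx becomes 1; the sentinel constants below are stand-ins for SM_SentinelZero/SM_SentinelUndef:

    #include <vector>

    constexpr int Z = -2, U = -1; // stand-ins for the LLVM sentinels

    // EXTRQ with Len=16 bits, Idx=8 bits on v16i8: two bytes are extracted
    // from offset 1, the rest of the low qword is zeroed, and the high
    // qword is undefined.
    std::vector<int> extrqMaskExample() {
      return {1, 2, Z, Z, Z, Z, Z, Z, U, U, U, U, U, U, U, U};
    }
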
diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
index 17619d0..251c9f7 100644
--- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -134,12 +134,12 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask);
void DecodeScalarMoveMask(MVT VT, bool IsLoad,
SmallVectorImpl<int> &ShuffleMask);
-/// Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask.
-void DecodeEXTRQIMask(int Len, int Idx,
+/// Decode a SSE4A EXTRQ instruction as a shuffle mask.
+void DecodeEXTRQIMask(MVT VT, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
-/// Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask.
-void DecodeINSERTQIMask(int Len, int Idx,
+/// Decode a SSE4A INSERTQ instruction as a shuffle mask.
+void DecodeINSERTQIMask(MVT VT, int Len, int Idx,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
index 2cb80a4..91201d1 100644
--- a/contrib/llvm/lib/Target/X86/X86.h
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -21,7 +21,10 @@ namespace llvm {
class FunctionPass;
class ImmutablePass;
+class InstructionSelector;
class PassRegistry;
+class X86RegisterBankInfo;
+class X86Subtarget;
class X86TargetMachine;
/// This pass converts a legalized DAG into a X86-specific DAG, ready for
@@ -80,6 +83,9 @@ FunctionPass *createX86WinEHStatePass();
/// the MachineInstr to MC.
FunctionPass *createX86ExpandPseudoPass();
+/// This pass converts X86 cmov instructions into branch when profitable.
+FunctionPass *createX86CmovConverterPass();
+
/// Return a Machine IR pass that selectively replaces
/// certain byte and word instructions by equivalent 32 bit instructions,
/// in order to eliminate partial register usage, false dependences on
@@ -92,6 +98,10 @@ void initializeFixupBWInstPassPass(PassRegistry &);
/// encoding when possible in order to reduce code size.
FunctionPass *createX86EvexToVexInsts();
+InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
+ X86Subtarget &,
+ X86RegisterBankInfo &);
+
void initializeEvexToVexInstPassPass(PassRegistry &);
} // End llvm namespace
diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td
index 83a23d4..54eabea 100644
--- a/contrib/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm/lib/Target/X86/X86.td
@@ -127,6 +127,9 @@ def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
"Enable AVX-512 Conflict Detection Instructions",
[FeatureAVX512]>;
+def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
+ "true", "Enable AVX-512 Population Count Instructions",
+ [FeatureAVX512]>;
def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
"Enable AVX-512 PreFetch Instructions",
[FeatureAVX512]>;
@@ -170,6 +173,8 @@ def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
[FeatureSSE2]>;
def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
"Enable TBM instructions">;
+def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true",
+ "Enable LWP instructions">;
def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
@@ -187,8 +192,6 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
"Support RTM instructions">;
-def FeatureHLE : SubtargetFeature<"hle", "HasHLE", "true",
- "Support HLE">;
def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
"Support ADX instructions">;
def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
@@ -202,6 +205,8 @@ def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
"Support LAHF and SAHF instructions">;
def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
"Enable MONITORX/MWAITX timer functionality">;
+def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true",
+ "Enable Cache Line Zero">;
def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true",
"Support MPX instructions">;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
@@ -215,18 +220,10 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
"PadShortFunctions", "true",
"Pad short functions">;
-def FeatureINVPCID : SubtargetFeature<"invpcid", "HasInvPCId", "true",
- "Invalidate Process-Context Identifier">;
-def FeatureVMFUNC : SubtargetFeature<"vmfunc", "HasVMFUNC", "true",
- "VM Functions">;
-def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true",
- "Supervisor Mode Access Protection">;
def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true",
"Enable Software Guard Extensions">;
def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
"Flush A Cache Line Optimized">;
-def FeaturePCOMMIT : SubtargetFeature<"pcommit", "HasPCOMMIT", "true",
- "Enable Persistent Commit">;
def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
"Cache Line Write Back">;
// TODO: This feature ought to be renamed.
@@ -241,16 +238,19 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
"LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+ "LEA instruction with 3 ops or certain registers is slow">;
def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
"INC and DEC instructions are slower than ADD and SUB">;
def FeatureSoftFloat
: SubtargetFeature<"soft-float", "UseSoftFloat", "true",
"Use software floating point features.">;
-// On at least some AMD processors, there is no performance hazard to writing
-// only the lower parts of a YMM register without clearing the upper part.
-def FeatureFastPartialYMMWrite
- : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
- "true", "Partial writes to YMM registers are fast">;
+// On some X86 processors, there is no performance hazard to writing only the
+// lower parts of a YMM or ZMM register without clearing the upper part.
+def FeatureFastPartialYMMorZMMWrite
+ : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
+ "HasFastPartialYMMorZMMWrite",
+ "true", "Partial writes to YMM/ZMM registers are fast">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
@@ -271,6 +271,25 @@ def FeatureFastLZCNT
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
+
+// Sandy Bridge and newer processors can use SHLD with the same source on both
+// inputs to implement rotate to avoid the partial flag update of the normal
+// rotate instructions.
+def FeatureFastSHLDRotate
+ : SubtargetFeature<
+ "fast-shld-rotate", "HasFastSHLDRotate", "true",
+ "SHLD can be used as a faster rotate">;
+
+// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
+// "string operations"). See "REP String Enhancement" in the Intel Software
+// Development Manual. This feature essentially means that REP MOVSB will copy
+// using the largest available size instead of copying bytes one by one, making
+// it at least as fast as REP MOVS{W,D,Q}.
+def FeatureERMSB
+ : SubtargetFeature<
+ "ermsb", "HasERMSB", "true",
+ "REP MOVS/STOS are fast">;
+
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
@@ -281,6 +300,8 @@ def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
"Intel Atom processors">;
def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
"Intel Silvermont processors">;
+def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
+ "Intel Goldmont processors">;
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
@@ -411,6 +432,35 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
def : SilvermontProc<"silvermont">;
def : SilvermontProc<"slm">; // Legacy alias.
+class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
+ ProcIntelGLM,
+ FeatureX87,
+ FeatureMMX,
+ FeatureSSE42,
+ FeatureFXSR,
+ FeatureCMPXCHG16B,
+ FeatureMOVBE,
+ FeaturePOPCNT,
+ FeaturePCLMUL,
+ FeatureAES,
+ FeaturePRFCHW,
+ FeatureCallRegIndirect,
+ FeatureSlowLEA,
+ FeatureSlowIncDec,
+ FeatureSlowBTMem,
+ FeatureLAHFSAHF,
+ FeatureMPX,
+ FeatureSHA,
+ FeatureRDRAND,
+ FeatureRDSEED,
+ FeatureXSAVE,
+ FeatureXSAVEOPT,
+ FeatureXSAVEC,
+ FeatureXSAVES,
+ FeatureCLFLUSHOPT
+]>;
+def : GoldmontProc<"goldmont">;
+
// "Arrandale" along with corei3 and corei5
class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureX87,
@@ -466,7 +516,9 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
- FeatureFastScalarFSQRT
+ FeatureSlow3OpsLEA,
+ FeatureFastScalarFSQRT,
+ FeatureFastSHLDRotate
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -495,13 +547,10 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureAVX2,
FeatureBMI,
FeatureBMI2,
+ FeatureERMSB,
FeatureFMA,
FeatureLZCNT,
FeatureMOVBE,
- FeatureINVPCID,
- FeatureVMFUNC,
- FeatureRTM,
- FeatureHLE,
FeatureSlowIncDec
]>;
@@ -512,8 +561,7 @@ def : HaswellProc<"core-avx2">; // Legacy alias.
def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
FeatureADX,
- FeatureRDSEED,
- FeatureSMAP
+ FeatureRDSEED
]>;
class BroadwellProc<string Name> : ProcModel<Name, HaswellModel,
BDWFeatures.Value, []>;
@@ -521,6 +569,7 @@ def : BroadwellProc<"broadwell">;
def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureMPX,
+ FeatureRTM,
FeatureXSAVEC,
FeatureXSAVES,
FeatureSGX,
@@ -547,7 +596,8 @@ class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA
+ FeatureFMA,
+ FeatureFastPartialYMMorZMMWrite
]>;
def : KnightsLandingProc<"knl">;
@@ -558,7 +608,6 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
FeatureBWI,
FeatureVLX,
FeaturePKU,
- FeaturePCOMMIT,
FeatureCLWB
]>;
@@ -662,7 +711,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureLAHFSAHF,
- FeatureFastPartialYMMWrite
+ FeatureFastPartialYMMorZMMWrite
]>;
// Bulldozer
@@ -681,6 +730,7 @@ def : Proc<"bdver1", [
FeatureLZCNT,
FeaturePOPCNT,
FeatureXSAVE,
+ FeatureLWP,
FeatureSlowSHLD,
FeatureLAHFSAHF
]>;
@@ -703,6 +753,7 @@ def : Proc<"bdver2", [
FeatureXSAVE,
FeatureBMI,
FeatureTBM,
+ FeatureLWP,
FeatureFMA,
FeatureSlowSHLD,
FeatureLAHFSAHF
@@ -727,6 +778,7 @@ def : Proc<"bdver3", [
FeatureXSAVE,
FeatureBMI,
FeatureTBM,
+ FeatureLWP,
FeatureFMA,
FeatureXSAVEOPT,
FeatureSlowSHLD,
@@ -753,6 +805,7 @@ def : Proc<"bdver4", [
FeatureBMI,
FeatureBMI2,
FeatureTBM,
+ FeatureLWP,
FeatureFMA,
FeatureXSAVEOPT,
FeatureSlowSHLD,
@@ -761,16 +814,15 @@ def : Proc<"bdver4", [
FeatureMWAITX
]>;
-// TODO: The scheduler model falls to BTVER2 model.
-// The znver1 model has to be put in place.
-// Zen
-def: ProcessorModel<"znver1", BtVer2Model, [
+// Znver1
+def: ProcessorModel<"znver1", Znver1Model, [
FeatureADX,
FeatureAES,
FeatureAVX2,
FeatureBMI,
FeatureBMI2,
FeatureCLFLUSHOPT,
+ FeatureCLZERO,
FeatureCMPXCHG16B,
FeatureF16C,
FeatureFMA,
@@ -788,7 +840,6 @@ def: ProcessorModel<"znver1", BtVer2Model, [
FeatureRDRAND,
FeatureRDSEED,
FeatureSHA,
- FeatureSMAP,
FeatureSSE4A,
FeatureSlowSHLD,
FeatureX87,
@@ -824,6 +875,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel,
//===----------------------------------------------------------------------===//
include "X86RegisterInfo.td"
+include "X86RegisterBanks.td"
//===----------------------------------------------------------------------===//
// Instruction Descriptions
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
index e1825ca..dc15aea 100644
--- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
+#include "llvm/BinaryFormat/COFF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineValueType.h"
@@ -34,7 +35,6 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/COFF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
index 6798253..d7c3b74 100644
--- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -81,7 +81,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerSTACKMAP(const MachineInstr &MI);
void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
- void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerFAULTING_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
@@ -91,6 +91,9 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
X86MCInstLower &MCIL);
void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+ void LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+
+ void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
// Helper function that emits the XRay sleds we've collected for a particular
// function.
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index 844c66d..3355ae8 100644
--- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -17,22 +17,35 @@
//
//===----------------------------------------------------------------------===//
-#include <algorithm>
-
-#include "X86.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86FrameLowering.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -44,6 +57,7 @@ static cl::opt<bool>
cl::init(false), cl::Hidden);
namespace {
+
class X86CallFrameOptimization : public MachineFunctionPass {
public:
X86CallFrameOptimization() : MachineFunctionPass(ID) {}
@@ -53,30 +67,28 @@ public:
private:
// Information we know about a particular call site
struct CallContext {
- CallContext()
- : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
- MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+ CallContext() : FrameSetup(nullptr), MovVector(4, nullptr) {}
// Iterator referring to the frame setup instruction
MachineBasicBlock::iterator FrameSetup;
// Actual call instruction
- MachineInstr *Call;
+ MachineInstr *Call = nullptr;
// A copy of the stack pointer
- MachineInstr *SPCopy;
+ MachineInstr *SPCopy = nullptr;
// The total displacement of all passed parameters
- int64_t ExpectedDist;
+ int64_t ExpectedDist = 0;
// The sequence of movs used to pass the parameters
SmallVector<MachineInstr *, 4> MovVector;
// True if this call site has no stack parameters
- bool NoStackParams;
+ bool NoStackParams = false;
// True if this call site can use push instructions
- bool UsePush;
+ bool UsePush = false;
};
typedef SmallVector<CallContext, 8> ContextVector;
@@ -102,7 +114,7 @@ private:
StringRef getPassName() const override { return "X86 Optimize Call Frame"; }
- const TargetInstrInfo *TII;
+ const X86InstrInfo *TII;
const X86FrameLowering *TFL;
const X86Subtarget *STI;
MachineRegisterInfo *MRI;
@@ -112,11 +124,8 @@ private:
};
char X86CallFrameOptimization::ID = 0;
-} // end anonymous namespace
-FunctionPass *llvm::createX86CallFrameOptimization() {
- return new X86CallFrameOptimization();
-}
+} // end anonymous namespace
// This checks whether the transformation is legal.
// Also returns false in cases where it's potentially legal, but
@@ -327,7 +336,6 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// transformation.
const X86RegisterInfo &RegInfo =
*static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
- unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
// We expect to enter this at the beginning of a call sequence
assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
@@ -336,8 +344,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
// How much do we adjust the stack? This puts an upper bound on
// the number of parameters actually passed on it.
- unsigned int MaxAdjust =
- FrameSetup->getOperand(0).getImm() >> Log2SlotSize;
+ unsigned int MaxAdjust = TII->getFrameSize(*FrameSetup) >> Log2SlotSize;
// A zero adjustment means no stack parameters
if (!MaxAdjust) {
@@ -430,7 +437,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
return;
Context.Call = &*I;
- if ((++I)->getOpcode() != FrameDestroyOpcode)
+ if ((++I)->getOpcode() != TII->getCallFrameDestroyOpcode())
return;
// Now, go through the vector, and see that we don't have any gaps,
@@ -460,7 +467,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// PEI will end up finalizing the handling of this.
MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
MachineBasicBlock &MBB = *(FrameSetup->getParent());
- FrameSetup->getOperand(1).setImm(Context.ExpectedDist);
+ TII->setFrameAdjustment(*FrameSetup, Context.ExpectedDist);
DebugLoc DL = FrameSetup->getDebugLoc();
bool Is64Bit = STI->is64Bit();
@@ -487,11 +494,10 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
if (isInt<8>(Val))
PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
}
- Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
- .addOperand(PushOp);
+ Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
break;
case X86::MOV32mr:
- case X86::MOV64mr:
+ case X86::MOV64mr: {
unsigned int Reg = PushOp.getReg();
// If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
@@ -501,9 +507,9 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
- .addReg(UndefReg)
- .addOperand(PushOp)
- .addImm(X86::sub_32bit);
+ .addReg(UndefReg)
+ .add(PushOp)
+ .addImm(X86::sub_32bit);
}
// If PUSHrmm is not slow on this target, try to fold the source of the
@@ -530,6 +536,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
}
break;
}
+ }
// For debugging, when using SP-based CFA, we need to adjust the CFA
// offset after each push.
@@ -589,3 +596,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
return &DefMI;
}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+ return new X86CallFrameOptimization();
+}
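The pass rewrites the SP-relative parameter stores recorded in MovVector into a push sequence, e.g. (illustrative only):

    // Before (SP-relative stores):      // After (pushes, emitted so the
    //   movl $1, (%esp)                 // lowest slot is pushed last):
    //   movl $2, 4(%esp)                //   pushl $2
    //   calll f                         //   pushl $1
    //                                   //   calll f

A minimal stand-alone sketch of the "no gaps" precondition the collectCallInfo() loop enforces - every slot from 0 up to the expected displacement, in slot-size units, must have a recorded store. Names here are illustrative, not the pass's API:

    #include <vector>

    struct ParamStore { /* stands in for a MOV32mr/MOV64mr MachineInstr */ };

    // Mirrors the scan over MovVector, which is indexed by
    // (stack offset >> Log2SlotSize): any missing slot disables the rewrite.
    bool canUsePush(const std::vector<const ParamStore *> &MovVector,
                    unsigned ExpectedSlots) {
      for (unsigned I = 0; I != ExpectedSlots; ++I)
        if (!MovVector[I])
          return false; // Gap: slot written by an unsupported pattern.
      return true;
    }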
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
index 5ae4962..99aeec6 100644
--- a/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -14,12 +14,21 @@
//===----------------------------------------------------------------------===//
#include "X86CallLowering.h"
+#include "X86CallingConv.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
+#include "X86TargetMachine.h"
+
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
+#include "X86GenCallingConv.inc"
+
#ifndef LLVM_BUILD_GLOBAL_ISEL
#error "This shouldn't be built without GISel"
#endif
@@ -27,20 +36,195 @@ using namespace llvm;
X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
: CallLowering(&TLI) {}
+bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL,
+ MachineRegisterInfo &MRI,
+ SplitArgTy PerformArgSplit) const {
+
+ const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
+ LLVMContext &Context = OrigArg.Ty->getContext();
+
+ SmallVector<EVT, 4> SplitVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+
+ if (SplitVTs.size() != 1) {
+ // TODO: support struct/array split
+ return false;
+ }
+
+ EVT VT = SplitVTs[0];
+ unsigned NumParts = TLI.getNumRegisters(Context, VT);
+
+ if (NumParts == 1) {
+ // Replace the original type (pointer -> GPR).
+ SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context),
+ OrigArg.Flags, OrigArg.IsFixed);
+ return true;
+ }
+
+ SmallVector<unsigned, 8> SplitRegs;
+
+ EVT PartVT = TLI.getRegisterType(Context, VT);
+ Type *PartTy = PartVT.getTypeForEVT(Context);
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ ArgInfo Info =
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
+ PartTy, OrigArg.Flags};
+ SplitArgs.push_back(Info);
+ SplitRegs.push_back(Info.Reg);
+ }
+
+ PerformArgSplit(SplitRegs);
+ return true;
+}
+
+namespace {
+struct FuncReturnHandler : public CallLowering::ValueHandler {
+ FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ llvm_unreachable("Don't know how to get a stack address yet");
+ }
+
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ CCValAssign &VA) override {
+ MIB.addUse(PhysReg, RegState::Implicit);
+ unsigned ExtReg = extendRegister(ValVReg, VA);
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ }
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ llvm_unreachable("Don't know how to assign a value to an address yet");
+ }
+
+ MachineInstrBuilder &MIB;
+};
+} // End anonymous namespace.
+
bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
- // TODO: handle functions returning non-void values.
- if (Val)
- return false;
- MIRBuilder.buildInstr(X86::RET).addImm(0);
+ assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
+
+ if (VReg) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto &DL = MF.getDataLayout();
+ const Function &F = *MF.getFunction();
+ ArgInfo OrigArg{VReg, Val->getType()};
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+ [&](ArrayRef<unsigned> Regs) {
+ MIRBuilder.buildUnmerge(Regs, VReg);
+ }))
+ return false;
+
+ FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+ }
+
+ MIRBuilder.insertInstr(MIB);
return true;
}
+namespace {
+struct FormalArgHandler : public CallLowering::ValueHandler {
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn, const DataLayout &DL)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), DL(DL) {}
+
+ unsigned getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ int FI = MFI.CreateFixedObject(Size, Offset, true);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(
+ LLT::pointer(0, DL.getPointerSizeInBits(0)));
+ MIRBuilder.buildFrameIndex(AddrReg, FI);
+ return AddrReg;
+ }
+
+ void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+ 0);
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ }
+
+ void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+ CCValAssign &VA) override {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ }
+
+ const DataLayout &DL;
+};
+} // namespace
+
bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
- // TODO: handle functions with one or more arguments.
- return F.arg_empty();
+ if (F.arg_empty())
+ return true;
+
+ // TODO: handle variadic functions
+ if (F.isVarArg())
+ return false;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto DL = MF.getDataLayout();
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ unsigned Idx = 0;
+ for (auto &Arg : F.args()) {
+
+ // TODO: handle non-simple cases.
+ if (Arg.hasAttribute(Attribute::ByVal) ||
+ Arg.hasAttribute(Attribute::InReg) ||
+ Arg.hasAttribute(Attribute::StructRet) ||
+ Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest))
+ return false;
+
+ ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
+ if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+ [&](ArrayRef<unsigned> Regs) {
+ MIRBuilder.buildMerge(VRegs[Idx], Regs);
+ }))
+ return false;
+ Idx++;
+ }
+
+ MachineBasicBlock &MBB = MIRBuilder.getMBB();
+ if (!MBB.empty())
+ MIRBuilder.setInstr(*MBB.begin());
+
+ FormalArgHandler Handler(MIRBuilder, MRI, CC_X86, DL);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+
+ // Move back to the end of the basic block.
+ MIRBuilder.setMBB(MBB);
+
+ return true;
}
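For a first-class value that needs more than one register, splitToValueTypes() creates one part vreg per register and hands them to the PerformArgSplit callback - buildUnmerge in lowerReturn(), buildMerge in lowerFormalArguments(). A value-level sketch of what the unmerge produces for an i64 returned on a 32-bit target, assuming the usual low-part-first ordering (illustrative, not the GlobalISel API):

    #include <cstdint>
    #include <utility>

    // Models G_UNMERGE_VALUES of an i64 into two i32 parts: the first part
    // register receives the low half, the second the high half.
    std::pair<uint32_t, uint32_t> unmergeI64(uint64_t Val) {
      return {static_cast<uint32_t>(Val),        // part 0 (low)
              static_cast<uint32_t>(Val >> 32)}; // part 1 (high)
    }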
diff --git a/contrib/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm/lib/Target/X86/X86CallLowering.h
index f2672f0..6a5dabf 100644
--- a/contrib/llvm/lib/Target/X86/X86CallLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86CallLowering.h
@@ -34,6 +34,15 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
+
+private:
+ /// A function of this type is used to perform the value-split action.
+ typedef std::function<void(ArrayRef<unsigned>)> SplitArgTy;
+
+ bool splitToValueTypes(const ArgInfo &OrigArgInfo,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ SplitArgTy SplitArg) const;
};
-} // End of namespace llvm;
+} // namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
index c96e76b..59dde98 100644
--- a/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -1,208 +1,208 @@
-//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of custom routines for the X86
-// Calling Convention that aren't done by tablegen.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/X86MCTargetDesc.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/IR/CallingConv.h"
-
-namespace llvm {
-
-bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- // List of GPR registers that are available to store values in regcall
- // calling convention.
- static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
- X86::ESI};
-
- // The vector will save all the available registers for allocation.
- SmallVector<unsigned, 5> AvailableRegs;
-
- // searching for the available registers.
- for (auto Reg : RegList) {
- if (!State.isAllocated(Reg))
- AvailableRegs.push_back(Reg);
- }
-
- const size_t RequiredGprsUponSplit = 2;
- if (AvailableRegs.size() < RequiredGprsUponSplit)
- return false; // Not enough free registers - continue the search.
-
- // Allocating the available registers.
- for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
-
- // Marking the register as located.
- unsigned Reg = State.AllocateReg(AvailableRegs[I]);
-
- // Since we previously made sure that 2 registers are available
- // we expect that a real register number will be returned.
- assert(Reg && "Expecting a register will be available");
-
- // Assign the value to the allocated register
- State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- }
-
- // Successful in allocating regsiters - stop scanning next rules.
- return true;
-}
-
-static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
- if (ValVT.is512BitVector()) {
- static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
- X86::ZMM3, X86::ZMM4, X86::ZMM5};
- return makeArrayRef(std::begin(RegListZMM), std::end(RegListZMM));
- }
-
- if (ValVT.is256BitVector()) {
- static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
- X86::YMM3, X86::YMM4, X86::YMM5};
- return makeArrayRef(std::begin(RegListYMM), std::end(RegListYMM));
- }
-
- static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
- X86::XMM3, X86::XMM4, X86::XMM5};
- return makeArrayRef(std::begin(RegListXMM), std::end(RegListXMM));
-}
-
-static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
- static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
- return makeArrayRef(std::begin(RegListGPR), std::end(RegListGPR));
-}
-
-static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
- MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
-
- ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
- bool Is64bit = static_cast<const X86Subtarget &>(
- State.getMachineFunction().getSubtarget())
- .is64Bit();
-
- for (auto Reg : RegList) {
- // If the register is not marked as allocated - assign to it.
- if (!State.isAllocated(Reg)) {
- unsigned AssigedReg = State.AllocateReg(Reg);
- assert(AssigedReg == Reg && "Expecting a valid register allocation");
- State.addLoc(
- CCValAssign::getReg(ValNo, ValVT, AssigedReg, LocVT, LocInfo));
- return true;
- }
- // If the register is marked as shadow allocated - assign to it.
- if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return true;
- }
- }
-
- llvm_unreachable("Clang should ensure that hva marked vectors will have "
- "an available register.");
- return false;
-}
-
-bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- // On the second pass, go through the HVAs only.
- if (ArgFlags.isSecArgPass()) {
- if (ArgFlags.isHva())
- return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
- ArgFlags, State);
- return true;
- }
-
- // Process only vector types as defined by vectorcall spec:
- // "A vector type is either a floating-point type, for example,
- // a float or double, or an SIMD vector type, for example, __m128 or __m256".
- if (!(ValVT.isFloatingPoint() ||
- (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
- // If R9 was already assigned it means that we are after the fourth element
- // and because this is not an HVA / Vector type, we need to allocate
- // shadow XMM register.
- if (State.isAllocated(X86::R9)) {
- // Assign shadow XMM register.
- (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
- }
-
- return false;
- }
-
- if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
- // Assign shadow GPR register.
- (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
-
- // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
- if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
- // In Vectorcall Calling convention, additional shadow stack can be
- // created on top of the basic 32 bytes of win64.
- // It can happen if the fifth or sixth argument is vector type or HVA.
- // At that case for each argument a shadow stack of 8 bytes is allocated.
- if (Reg == X86::XMM4 || Reg == X86::XMM5)
- State.AllocateStack(8, 8);
-
- if (!ArgFlags.isHva()) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return true; // Allocated a register - Stop the search.
- }
- }
- }
-
- // If this is an HVA - Stop the search,
- // otherwise continue the search.
- return ArgFlags.isHva();
-}
-
-bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags, CCState &State) {
- // On the second pass, go through the HVAs only.
- if (ArgFlags.isSecArgPass()) {
- if (ArgFlags.isHva())
- return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
- ArgFlags, State);
- return true;
- }
-
- // Process only vector types as defined by vectorcall spec:
- // "A vector type is either a floating point type, for example,
- // a float or double, or an SIMD vector type, for example, __m128 or __m256".
- if (!(ValVT.isFloatingPoint() ||
- (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
- return false;
- }
-
- if (ArgFlags.isHva())
- return true; // If this is an HVA - Stop the search.
-
- // Assign XMM register.
- if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return true;
- }
-
- // In case we did not find an available XMM register for a vector -
- // pass it indirectly.
- // It is similar to CCPassIndirect, with the addition of inreg.
- if (!ValVT.isFloatingPoint()) {
- LocVT = MVT::i32;
- LocInfo = CCValAssign::Indirect;
- ArgFlags.setInReg();
- }
-
- return false; // No register was assigned - Continue the search.
-}
-
-} // End llvm namespace
+//=== X86CallingConv.cpp - X86 Custom Calling Convention Impl -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of custom routines for the X86
+// Calling Convention that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // List of GPR registers that are available to store values in regcall
+ // calling convention.
+ static const MCPhysReg RegList[] = {X86::EAX, X86::ECX, X86::EDX, X86::EDI,
+ X86::ESI};
+
+ // The vector will save all the available registers for allocation.
+ SmallVector<unsigned, 5> AvailableRegs;
+
+ // searching for the available registers.
+ for (auto Reg : RegList) {
+ if (!State.isAllocated(Reg))
+ AvailableRegs.push_back(Reg);
+ }
+
+ const size_t RequiredGprsUponSplit = 2;
+ if (AvailableRegs.size() < RequiredGprsUponSplit)
+ return false; // Not enough free registers - continue the search.
+
+ // Allocating the available registers.
+ for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
+
+ // Marking the register as located.
+ unsigned Reg = State.AllocateReg(AvailableRegs[I]);
+
+ // Since we previously made sure that 2 registers are available
+ // we expect that a real register number will be returned.
+ assert(Reg && "Expecting a register will be available");
+
+ // Assign the value to the allocated register
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ }
+
+ // Successfully allocated the registers - stop scanning further rules.
+ return true;
+}
+
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+ if (ValVT.is512BitVector()) {
+ static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+ X86::ZMM3, X86::ZMM4, X86::ZMM5};
+ return makeArrayRef(std::begin(RegListZMM), std::end(RegListZMM));
+ }
+
+ if (ValVT.is256BitVector()) {
+ static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+ X86::YMM3, X86::YMM4, X86::YMM5};
+ return makeArrayRef(std::begin(RegListYMM), std::end(RegListYMM));
+ }
+
+ static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5};
+ return makeArrayRef(std::begin(RegListXMM), std::end(RegListXMM));
+}
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+ static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+ return makeArrayRef(std::begin(RegListGPR), std::end(RegListGPR));
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+ MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags,
+ CCState &State) {
+
+ ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+ bool Is64bit = static_cast<const X86Subtarget &>(
+ State.getMachineFunction().getSubtarget())
+ .is64Bit();
+
+ for (auto Reg : RegList) {
+ // If the register is not marked as allocated - assign to it.
+ if (!State.isAllocated(Reg)) {
+ unsigned AssignedReg = State.AllocateReg(Reg);
+ assert(AssignedReg == Reg && "Expecting a valid register allocation");
+ State.addLoc(
+ CCValAssign::getReg(ValNo, ValVT, AssignedReg, LocVT, LocInfo));
+ return true;
+ }
+ // If the register is marked as shadow allocated - assign to it.
+ if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ }
+
+ llvm_unreachable("Clang should ensure that hva marked vectors will have "
+ "an available register.");
+ return false;
+}
+
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating-point type, for example,
+ // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ // If R9 was already assigned it means that we are after the fourth element
+ // and because this is not an HVA / Vector type, we need to allocate
+ // shadow XMM register.
+ if (State.isAllocated(X86::R9)) {
+ // Assign shadow XMM register.
+ (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+ }
+
+ return false;
+ }
+
+ if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+ // Assign shadow GPR register.
+ (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+ // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ // In the Vectorcall calling convention, additional shadow stack can be
+ // created on top of the basic 32 bytes of Win64.
+ // This can happen if the fifth or sixth argument is a vector type or HVA.
+ // In that case, a shadow stack of 8 bytes is allocated for each argument.
+ if (Reg == X86::XMM4 || Reg == X86::XMM5)
+ State.AllocateStack(8, 8);
+
+ if (!ArgFlags.isHva()) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true; // Allocated a register - Stop the search.
+ }
+ }
+ }
+
+ // If this is an HVA - Stop the search,
+ // otherwise continue the search.
+ return ArgFlags.isHva();
+}
+
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // On the second pass, go through the HVAs only.
+ if (ArgFlags.isSecArgPass()) {
+ if (ArgFlags.isHva())
+ return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+ ArgFlags, State);
+ return true;
+ }
+
+ // Process only vector types as defined by vectorcall spec:
+ // "A vector type is either a floating point type, for example,
+ // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+ if (!(ValVT.isFloatingPoint() ||
+ (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+ return false;
+ }
+
+ if (ArgFlags.isHva())
+ return true; // If this is an HVA - Stop the search.
+
+ // Assign XMM register.
+ if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+
+ // In case we did not find an available XMM register for a vector -
+ // pass it indirectly.
+ // It is similar to CCPassIndirect, with the addition of inreg.
+ if (!ValVT.isFloatingPoint()) {
+ LocVT = MVT::i32;
+ LocInfo = CCValAssign::Indirect;
+ ArgFlags.setInReg();
+ }
+
+ return false; // No register was assigned - Continue the search.
+}
+
+} // End llvm namespace
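CC_X86_32_RegCall_Assign2Regs above only fires when a split 64-bit value can claim two GPRs from the regcall pool at once; otherwise it returns false so the next tablegen rule can try. A toy model of that all-or-nothing pairing (the pool and the rule follow the code above; everything else is illustrative):

    #include <vector>

    enum Reg { EAX, ECX, EDX, EDI, ESI, NumRegs };

    // Returns true and marks two free registers as allocated, or leaves the
    // state untouched and returns false so the search can continue.
    bool assignTwoRegs(bool (&Allocated)[NumRegs], Reg &Lo, Reg &Hi) {
      std::vector<Reg> Avail;
      for (int R = EAX; R != NumRegs; ++R)
        if (!Allocated[R])
          Avail.push_back(static_cast<Reg>(R));
      if (Avail.size() < 2)
        return false; // Not enough free registers - continue the search.
      Lo = Avail[0];
      Hi = Avail[1];
      Allocated[Lo] = Allocated[Hi] = true;
      return true;
    }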
diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td
index cf7bc98..2646198 100644
--- a/contrib/llvm/lib/Target/X86/X86CallingConv.td
+++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td
@@ -73,8 +73,8 @@ def CC_#NAME : CallingConv<[
CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// Promote v8i1/v16i1/v32i1 arguments to i32.
CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>,
@@ -146,8 +146,8 @@ def CC_#NAME : CallingConv<[
]>;
def RetCC_#NAME : CallingConv<[
- // Promote i1, v8i1 arguments to i8.
- CCIfType<[i1, v8i1], CCPromoteToType<i8>>,
+ // Promote i1, v1i1, v8i1 arguments to i8.
+ CCIfType<[i1, v1i1, v8i1], CCPromoteToType<i8>>,
// Promote v16i1 arguments to i16.
CCIfType<[v16i1], CCPromoteToType<i16>>,
@@ -207,6 +207,7 @@ def RetCC_X86Common : CallingConv<[
//
// For code that doesn't care about the ABI, we allow returning more than two
// integer values in registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
CCIfType<[i1], CCPromoteToType<i8>>,
CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>,
CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
@@ -375,6 +376,7 @@ def RetCC_X86_64_Swift : CallingConv<[
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
// For integers, ECX, R8D can be used as extra return registers.
+ CCIfType<[v1i1], CCPromoteToType<i8>>,
CCIfType<[i1], CCPromoteToType<i8>>,
CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>,
CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>,
@@ -446,7 +448,7 @@ def RetCC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>,
// Handle explicit CC selection
- CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
// Handle Vectorcall CC
@@ -485,8 +487,8 @@ def CC_X86_64_C : CallingConv<[
// Handles byval parameters.
CCIfByVal<CCPassByVal<8, 8>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>,
@@ -584,8 +586,8 @@ def CC_X86_Win64_C : CallingConv<[
// FIXME: Handle byval stuff.
// FIXME: Handle varargs.
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
@@ -649,7 +651,15 @@ def CC_X86_64_GHC : CallingConv<[
// Pass in STG registers: F1, F2, F3, F4, D1, D2
CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCIfSubtarget<"hasSSE1()",
- CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
+ CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>,
+ // AVX
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ CCIfSubtarget<"hasAVX()",
+ CCAssignToReg<[YMM1, YMM2, YMM3, YMM4, YMM5, YMM6]>>>,
+ // AVX-512
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+ CCIfSubtarget<"hasAVX512()",
+ CCAssignToReg<[ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6]>>>
]>;
def CC_X86_64_HiPE : CallingConv<[
@@ -796,8 +806,8 @@ def CC_X86_32_Common : CallingConv<[
]>;
def CC_X86_32_C : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in ECX.
CCIfNest<CCAssignToReg<[ECX]>>,
@@ -816,8 +826,8 @@ def CC_X86_32_MCU : CallingConv<[
// puts arguments in registers.
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// If the call is not a vararg call, some arguments may be passed
// in integer registers.
@@ -828,8 +838,8 @@ def CC_X86_32_MCU : CallingConv<[
]>;
def CC_X86_32_FastCall : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
@@ -858,15 +868,15 @@ def CC_X86_32_ThisCall_Common : CallingConv<[
]>;
def CC_X86_32_ThisCall_Mingw : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
CCDelegateTo<CC_X86_32_ThisCall_Common>
]>;
def CC_X86_32_ThisCall_Win : CallingConv<[
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// Pass sret arguments indirectly through stack.
CCIfSRet<CCAssignToStack<4, 4>>,
@@ -885,8 +895,8 @@ def CC_X86_32_FastCC : CallingConv<[
// puts arguments in registers.
CCIfByVal<CCPassByVal<4, 4>>,
- // Promote i1/i8/i16 arguments to i32.
- CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+ // Promote i1/i8/i16/v1i1 arguments to i32.
+ CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>,
// The 'nest' parameter, if any, is passed in EAX.
CCIfNest<CCAssignToReg<[EAX]>>,
@@ -994,7 +1004,7 @@ def CC_X86_64 : CallingConv<[
CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
- CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+ CCIfCC<"CallingConv::Win64", CCDelegateTo<CC_X86_Win64_C>>,
CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
@@ -1074,6 +1084,8 @@ def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs,
(sequence "K%u", 0, 7))>;
def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>;
+def CSR_64_AllRegs_NoSSE : CalleeSavedRegs<(add RAX, RBX, RCX, RDX, RSI, RDI, R8, R9,
+ R10, R11, R12, R13, R14, R15, RBP)>;
def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
(sequence "YMM%u", 0, 15)),
(sequence "XMM%u", 0, 15))>;
diff --git a/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp
new file mode 100644
index 0000000..bfc83443
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -0,0 +1,611 @@
+//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a pass that converts X86 cmov instructions into
+/// branches when profitable. This pass is conservative; i.e., it applies the
+/// transformation if and only if it can guarantee a gain with high confidence.
+///
+/// Thus, the optimization applies under the following conditions:
+/// 1. Consider as candidates only CMOVs in innermost loops, assuming that
+/// most hotspots are represented by these loops.
+/// 2. Given a group of CMOV instructions that use the same EFLAGS def
+/// instruction:
+/// a. Consider them as candidates only if they all have the same condition
+/// code or the opposite one, to prevent generating more than one
+/// conditional jump per EFLAGS def instruction.
+/// b. Consider them as candidates only if all are profitable to be
+/// converted, assuming that one bad conversion may cause a degradation.
+/// 3. Apply the conversion only to loops that are found profitable and only
+/// to the CMOV candidates that were found profitable.
+/// a. A loop is considered profitable only if the conversion will reduce
+/// its depth cost by some threshold.
+/// b. A CMOV is considered profitable if the cost of its condition is
+/// higher than the average cost of its true-value and false-value by
+/// 25% of the branch-misprediction penalty; this ensures no degradation
+/// even with a 25% branch misprediction rate.
+///
+/// Note: This pass is assumed to run on SSA machine code.
+//===----------------------------------------------------------------------===//
+//
+// External interfaces:
+// FunctionPass *llvm::createX86CmovConverterPass();
+// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
+//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cmov-converter"
+
+STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
+STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
+STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
+STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
+
+namespace {
+// This internal switch can be used to turn off the cmov/branch optimization.
+static cl::opt<bool>
+ EnableCmovConverter("x86-cmov-converter",
+ cl::desc("Enable the X86 cmov-to-branch optimization."),
+ cl::init(true), cl::Hidden);
+
+/// Converts X86 cmov instructions into branches when profitable.
+class X86CmovConverterPass : public MachineFunctionPass {
+public:
+ X86CmovConverterPass() : MachineFunctionPass(ID) {}
+ ~X86CmovConverterPass() {}
+
+ StringRef getPassName() const override { return "X86 cmov Conversion"; }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ /// Pass identification, replacement for typeid.
+ static char ID;
+
+ const MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ TargetSchedModel TSchedModel;
+
+ /// List of consecutive CMOV instructions.
+ typedef SmallVector<MachineInstr *, 2> CmovGroup;
+ typedef SmallVector<CmovGroup, 2> CmovGroups;
+
+ /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
+ /// CmovInstGroups accordingly.
+ ///
+ /// \param CurrLoop Loop being processed.
+ /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
+ /// \returns true iff it found any CMOV-group-candidate.
+ bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups);
+
+ /// Check whether it is profitable to transform each CMOV-group-candidate
+ /// into a branch. Remove all groups that are not profitable from
+ /// \p CmovInstGroups.
+ ///
+ /// \param CurrLoop Loop being processed.
+ /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
+ /// \returns true iff any CMOV-group-candidate remains.
+ bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop,
+ CmovGroups &CmovInstGroups);
+
+ /// Convert the given list of consecutive CMOV instructions into a branch.
+ ///
+ /// \param Group Consecutive CMOV instructions to be converted into a branch.
+ void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
+};
+
+char X86CmovConverterPass::ID = 0;
+
+void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+}
+
+bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+ if (!EnableCmovConverter)
+ return false;
+
+ DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << "**********\n");
+
+ bool Changed = false;
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ MRI = &MF.getRegInfo();
+ TII = STI.getInstrInfo();
+ TSchedModel.init(STI.getSchedModel(), &STI, TII);
+
+ //===--------------------------------------------------------------------===//
+ // Algorithm
+ // ---------
+ // For each inner most loop
+ // collectCmovCandidates() {
+ // Find all CMOV-group-candidates.
+ // }
+ //
+ // checkForProfitableCmovCandidates() {
+ // * Calculate both loop-depth and optimized-loop-depth.
+ // * Use these depth to check for loop transformation profitability.
+ // * Check for CMOV-group-candidate transformation profitability.
+ // }
+ //
+ // For each profitable CMOV-group-candidate
+ // convertCmovInstsToBranches() {
+ // * Create FalseBB, SinkBB, Conditional branch to SinkBB.
+ // * Replace each CMOV instruction with a PHI instruction in SinkBB.
+ // }
+ //
+ // Note: For more details, see each function description.
+ //===--------------------------------------------------------------------===//
+ for (MachineBasicBlock &MBB : MF) {
+ MachineLoop *CurrLoop = MLI.getLoopFor(&MBB);
+
+ // Optimize only inner most loops.
+ if (!CurrLoop || CurrLoop->getHeader() != &MBB ||
+ !CurrLoop->getSubLoops().empty())
+ continue;
+
+ // List of consecutive CMOV instructions to be processed.
+ CmovGroups CmovInstGroups;
+
+ if (!collectCmovCandidates(CurrLoop, CmovInstGroups))
+ continue;
+
+ if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups))
+ continue;
+
+ Changed = true;
+ for (auto &Group : CmovInstGroups)
+ convertCmovInstsToBranches(Group);
+ }
+ return Changed;
+}
+
+bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop,
+ CmovGroups &CmovInstGroups) {
+ //===--------------------------------------------------------------------===//
+ // Collect all CMOV-group-candidates and add them into CmovInstGroups.
+ //
+ // CMOV-group:
+ // CMOV instructions, in the same MBB, that use the same EFLAGS def
+ // instruction.
+ //
+ // CMOV-group-candidate:
+ // CMOV-group where all the CMOV instructions are
+ // 1. consecutive.
+ // 2. have same condition code or opposite one.
+ // 3. have only operand registers (X86::CMOVrr).
+ //===--------------------------------------------------------------------===//
+ // List of possible improvement (TODO's):
+ // --------------------------------------
+ // TODO: Add support for X86::CMOVrm instructions.
+ // TODO: Add support for X86::SETcc instructions.
+ // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
+ //===--------------------------------------------------------------------===//
+
+ // Current processed CMOV-Group.
+ CmovGroup Group;
+ for (auto *MBB : CurrLoop->getBlocks()) {
+ Group.clear();
+ // Condition code of the first CMOV instruction in the currently processed
+ // range, and its opposite condition code.
+ X86::CondCode FirstCC, FirstOppCC;
+ // Indicator of a non-CMOVrr instruction in the currently processed range.
+ bool FoundNonCMOVInst = false;
+ // Indicator that the currently processed CMOV-group should be skipped.
+ bool SkipGroup = false;
+
+ for (auto &I : *MBB) {
+ X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
+ // Check if we found a X86::CMOVrr instruction.
+ if (CC != X86::COND_INVALID && !I.mayLoad()) {
+ if (Group.empty()) {
+ // We found first CMOV in the range, reset flags.
+ FirstCC = CC;
+ FirstOppCC = X86::GetOppositeBranchCondition(CC);
+ FoundNonCMOVInst = false;
+ SkipGroup = false;
+ }
+ Group.push_back(&I);
+ // Check if this is a non-consecutive CMOV instruction or if it has a
+ // different condition code than FirstCC or FirstOppCC.
+ if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
+ // Set the SkipGroup indicator to skip the currently processed CMOV-Group.
+ SkipGroup = true;
+ continue;
+ }
+ // If Group is empty, keep looking for first CMOV in the range.
+ if (Group.empty())
+ continue;
+
+ // We found a non X86::CMOVrr instruction.
+ FoundNonCMOVInst = true;
+ // Check if this instruction defines EFLAGS, to determine the end of the
+ // processed range, as there are no more instructions using the current
+ // EFLAGS def.
+ if (I.definesRegister(X86::EFLAGS)) {
+ // Check if current processed CMOV-group should not be skipped and add
+ // it as a CMOV-group-candidate.
+ if (!SkipGroup)
+ CmovInstGroups.push_back(Group);
+ else
+ ++NumOfSkippedCmovGroups;
+ Group.clear();
+ }
+ }
+ // The end of the basic block is considered the end of the range. Check if
+ // the current CMOV-group should not be skipped, and add it as a
+ // CMOV-group-candidate.
+ if (Group.empty())
+ continue;
+ if (!SkipGroup)
+ CmovInstGroups.push_back(Group);
+ else
+ ++NumOfSkippedCmovGroups;
+ }
+
+ NumOfCmovGroupCandidate += CmovInstGroups.size();
+ return !CmovInstGroups.empty();
+}
+
+/// \returns Depth of a CMOV instruction as if it were converted into a branch.
+/// \param TrueOpDepth depth cost of CMOV true value operand.
+/// \param FalseOpDepth depth cost of CMOV false value operand.
+static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
+ //===--------------------------------------------------------------------===//
+ // With no info about branch weight, we assume 50% for each value operand.
+ // Thus, depth of optimized CMOV instruction is the rounded up average of
+ // its True-Operand-Value-Depth and False-Operand-Value-Depth.
+ //===--------------------------------------------------------------------===//
+ return (TrueOpDepth + FalseOpDepth + 1) / 2;
+}
+
+bool X86CmovConverterPass::checkForProfitableCmovCandidates(
+ MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) {
+ struct DepthInfo {
+ /// Depth of original loop.
+ unsigned Depth;
+ /// Depth of optimized loop.
+ unsigned OptDepth;
+ };
+ /// Number of loop iterations for which to calculate instruction depth.
+ static const unsigned LoopIterations = 2;
+ DenseMap<MachineInstr *, DepthInfo> DepthMap;
+ DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
+ enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
+ /// For each register type maps the register to its last def instruction.
+ DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
+ /// Maps register operand to its def instruction, which can be nullptr if it
+ /// is unknown (e.g., operand is defined outside the loop).
+ DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;
+
+ // Set depth of unknown instruction (i.e., nullptr) to zero.
+ DepthMap[nullptr] = {0, 0};
+
+ SmallPtrSet<MachineInstr *, 4> CmovInstructions;
+ for (auto &Group : CmovInstGroups)
+ CmovInstructions.insert(Group.begin(), Group.end());
+
+ //===--------------------------------------------------------------------===//
+ // Step 1: Calculate instruction depth and loop depth.
+ // Optimized-Loop:
+ // loop with CMOV-group-candidates converted into branches.
+ //
+ // Instruction-Depth:
+ // instruction latency + max operand depth.
+ // * For CMOV instruction in optimized loop the depth is calculated as:
+ // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth)
+ // TODO: Find a better way to estimate the latency of the branch instruction
+ // rather than using the CMOV latency.
+ //
+ // Loop-Depth:
+ // max instruction depth of all instructions in the loop.
+ // Note: instruction with max depth represents the critical-path in the loop.
+ //
+ // Loop-Depth[i]:
+ // Loop-Depth calculated for first `i` iterations.
+ // Note: it is enough to calculate depth for up to two iterations.
+ //
+ // Depth-Diff[i]:
+ // Number of cycles saved in the first `i` iterations by optimizing the loop.
+ //===--------------------------------------------------------------------===//
+ for (unsigned I = 0; I < LoopIterations; ++I) {
+ DepthInfo &MaxDepth = LoopDepth[I];
+ for (auto *MBB : CurrLoop->getBlocks()) {
+ // Clear physical registers Def map.
+ RegDefMaps[PhyRegType].clear();
+ for (MachineInstr &MI : *MBB) {
+ unsigned MIDepth = 0;
+ unsigned MIDepthOpt = 0;
+ bool IsCMOV = CmovInstructions.count(&MI);
+ for (auto &MO : MI.uses()) {
+ // Checks for "isUse()" as "uses()" returns also implicit definitions.
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
+ if (MachineInstr *DefMI = RDM.lookup(Reg)) {
+ OperandToDefMap[&MO] = DefMI;
+ DepthInfo Info = DepthMap.lookup(DefMI);
+ MIDepth = std::max(MIDepth, Info.Depth);
+ if (!IsCMOV)
+ MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
+ }
+ }
+
+ if (IsCMOV)
+ MIDepthOpt = getDepthOfOptCmov(
+ DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
+ DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);
+
+ // Iterates over all operands to handle implicit definitions as well.
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
+ }
+
+ unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+ DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
+ MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
+ MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
+ }
+ }
+ }
+
+ unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
+ LoopDepth[1].Depth - LoopDepth[1].OptDepth};
+
+ //===--------------------------------------------------------------------===//
+ // Step 2: Check whether the loop is worth optimizing.
+ // Worth-Optimize-Loop:
+ // case 1: Diff[1] == Diff[0]
+ // Critical-path is iteration independent - there is no dependency
+ // of critical-path instructions on critical-path instructions of
+ // previous iteration.
+ // Thus, it is enough to check the gain percentage of the 1st iteration -
+ // to be conservative, the optimized loop needs to have a depth that is
+ // 12.5% fewer cycles than the original loop, per iteration.
+ //
+ // case 2: Diff[1] > Diff[0]
+ // Critical-path is iteration dependent - there is dependency of
+ // critical-path instructions on critical-path instructions of
+ // previous iteration.
+ // Thus, it is required to check the gradient of the gain - the
+ // change in Depth-Diff compared to the change in Loop-Depth between
+ // 1st and 2nd iterations.
+ // To be conservative, the gradient needs to be at least 50%.
+ //
+ // If loop is not worth optimizing, remove all CMOV-group-candidates.
+ //===--------------------------------------------------------------------===//
+ bool WorthOptLoop = false;
+ if (Diff[1] == Diff[0])
+ WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
+ else if (Diff[1] > Diff[0])
+ WorthOptLoop =
+ (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth);
+
+ if (!WorthOptLoop)
+ return false;
+
+ ++NumOfLoopCandidate;
+
+ //===--------------------------------------------------------------------===//
+ // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing.
+ // Worth-Optimize-Group:
+ // Iff it is worthwhile to optimize all CMOV instructions in the group.
+ //
+ // Worth-Optimize-CMOV:
+ // Predicted branch is faster than CMOV by the difference between depth of
+ // condition operand and depth of taken (predicted) value operand.
+ // To be conservative, the gain of such a CMOV transformation should cover
+ // at least 25% of the branch-misprediction penalty.
+ //===--------------------------------------------------------------------===//
+ unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
+ CmovGroups TempGroups;
+ std::swap(TempGroups, CmovInstGroups);
+ for (auto &Group : TempGroups) {
+ bool WorthOpGroup = true;
+ for (auto *MI : Group) {
+ // Avoid a CMOV instruction whose value is used as a pointer to load from.
+ // This is another conservative check, to avoid converting a CMOV
+ // instruction used in tree-search-like algorithms, where the branch is
+ // unpredictable.
+ auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
+ if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
+ unsigned Op = UIs.begin()->getOpcode();
+ if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
+ WorthOpGroup = false;
+ break;
+ }
+ }
+
+ unsigned CondCost =
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
+ unsigned ValCost = getDepthOfOptCmov(
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
+ DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
+ if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
+ WorthOpGroup = false;
+ break;
+ }
+ }
+
+ if (WorthOpGroup)
+ CmovInstGroups.push_back(Group);
+ }
+
+ return !CmovInstGroups.empty();
+}
+
+static bool checkEFLAGSLive(MachineInstr *MI) {
+ if (MI->killsRegister(X86::EFLAGS))
+ return false;
+
+ // The EFLAGS operand of MI might be missing a kill marker.
+ // Figure out whether EFLAGS should be live after the MI instruction.
+ MachineBasicBlock *BB = MI->getParent();
+ MachineBasicBlock::iterator ItrMI = MI;
+
+ // Scan forward through BB for a use/def of EFLAGS.
+ for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
+ if (I->readsRegister(X86::EFLAGS))
+ return true;
+ if (I->definesRegister(X86::EFLAGS))
+ return false;
+ }
+
+ // We hit the end of the block, check whether EFLAGS is live into a successor.
+ for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
+ if ((*I)->isLiveIn(X86::EFLAGS))
+ return true;
+ }
+
+ return false;
+}
+
+void X86CmovConverterPass::convertCmovInstsToBranches(
+ SmallVectorImpl<MachineInstr *> &Group) const {
+ assert(!Group.empty() && "No CMOV instructions to convert");
+ ++NumOfOptimizedCmovGroups;
+
+ // To convert a CMOVcc instruction, we actually have to insert the diamond
+ // control-flow pattern. The incoming instruction knows the destination vreg
+ // to set, the condition code register to branch on, the true/false values to
+ // select between, and a branch opcode to use.
+
+ // Before
+ // -----
+ // MBB:
+ // cond = cmp ...
+ // v1 = CMOVge t1, f1, cond
+ // v2 = CMOVlt t2, f2, cond
+ // v3 = CMOVge v1, f3, cond
+ //
+ // After
+ // -----
+ // MBB:
+ // cond = cmp ...
+ // jge %SinkMBB
+ //
+ // FalseMBB:
+ // jmp %SinkMBB
+ //
+ // SinkMBB:
+ // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
+ // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
+ // ; true-value with false-value
+ // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
+ // ; previous Phi instruction result
+
+ MachineInstr &MI = *Group.front();
+ MachineInstr *LastCMOV = Group.back();
+ DebugLoc DL = MI.getDebugLoc();
+ X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction::iterator It = ++MBB->getIterator();
+ MachineFunction *F = MBB->getParent();
+ const BasicBlock *BB = MBB->getBasicBlock();
+
+ MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
+ F->insert(It, FalseMBB);
+ F->insert(It, SinkMBB);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ if (checkEFLAGSLive(LastCMOV)) {
+ FalseMBB->addLiveIn(X86::EFLAGS);
+ SinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of BB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Add the false and sink blocks as its successors.
+ MBB->addSuccessor(FalseMBB);
+ MBB->addSuccessor(SinkMBB);
+
+ // Create the conditional branch instruction.
+ BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);
+
+ // Add the sink block to the false block successors.
+ FalseMBB->addSuccessor(SinkMBB);
+
+ MachineInstrBuilder MIB;
+ MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
+ MachineBasicBlock::iterator MIItEnd =
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from each earlier PHI's
+ // destination register to the registers that went into the PHI.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned Op1Reg = MIIt->getOperand(1).getReg();
+ unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If the CMOV we are processing has the opposite condition from the jump
+ // we generated, then we have to swap the operands for the PHI that is
+ // going to be generated.
+ if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ auto Op1Itr = RegRewriteTable.find(Op1Reg);
+ if (Op1Itr != RegRewriteTable.end())
+ Op1Reg = Op1Itr->second.first;
+
+ auto Op2Itr = RegRewriteTable.find(Op2Reg);
+ if (Op2Itr != RegRewriteTable.end())
+ Op2Reg = Op2Itr->second.second;
+
+ // SinkMBB:
+ // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
+ // ...
+ MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg)
+ .addMBB(FalseMBB)
+ .addReg(Op2Reg)
+ .addMBB(MBB);
+ (void)MIB;
+ DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
+ DEBUG(dbgs() << "\tTo: "; MIB->dump());
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ // Now remove the CMOV(s).
+ MBB->erase(MIItBegin, MIItEnd);
+}
+
+} // End anonymous namespace.
+
+FunctionPass *llvm::createX86CmovConverterPass() {
+ return new X86CmovConverterPass();
+}
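
A note on the PHI construction loop above: later PHIs must not consume the
results of earlier PHIs, so RegRewriteTable maps each PHI's destination
register to the pair of CMOV inputs that produced it (false-edge input first,
true-edge input second). A minimal standalone sketch of that remapping, with
std::unordered_map standing in for llvm::DenseMap and made-up register
numbers:

    #include <cstdio>
    #include <unordered_map>
    #include <utility>

    using Reg = unsigned;

    // Maps a PHI's destination register to the (false-input, true-input)
    // pair that fed it, mirroring RegRewriteTable in the pass.
    static std::unordered_map<Reg, std::pair<Reg, Reg>> RewriteTable;

    // If R names an earlier PHI result, substitute the original CMOV input
    // for the corresponding edge; otherwise leave it alone.
    static Reg remapFalseEdge(Reg R) {
      auto It = RewriteTable.find(R);
      return It == RewriteTable.end() ? R : It->second.first;
    }
    static Reg remapTrueEdge(Reg R) {
      auto It = RewriteTable.find(R);
      return It == RewriteTable.end() ? R : It->second.second;
    }

    int main() {
      Reg t1 = 1, f1 = 2, f3 = 3, v1 = 10; // illustrative numbers only
      RewriteTable[v1] = {f1, t1};         // recorded after emitting v1's PHI
      // v3 = CMOVge v1, f3: the true edge must use t1, not v1's PHI result,
      // matching the %v3 line in the comment block at the top of the pass.
      std::printf("v3 = phi [%u, FalseMBB], [%u, MBB]\n",
                  remapFalseEdge(f3), remapTrueEdge(v1));
      return 0;
    }
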
diff --git a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
index bdd1ab5..6472bbb 100755
--- a/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/contrib/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -20,16 +20,30 @@
//===---------------------------------------------------------------------===//
#include "InstPrinter/X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
-#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
-#include "X86InstrTablesInfo.h"
-#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
+// Including the generated EVEX2VEX tables.
+struct X86EvexToVexCompressTableEntry {
+ uint16_t EvexOpcode;
+ uint16_t VexOpcode;
+};
+#include "X86GenEVEX2VEXTables.inc"
+
#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
@@ -56,8 +70,6 @@ class EvexToVexInstPass : public MachineFunctionPass {
public:
static char ID;
- StringRef getPassName() const override { return EVEX2VEX_DESC; }
-
EvexToVexInstPass() : MachineFunctionPass(ID) {
initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
@@ -72,6 +84,8 @@ public:
}
}
+ StringRef getPassName() const override { return EVEX2VEX_DESC; }
+
/// Loop over all of the basic blocks, replacing EVEX instructions
/// by equivalent VEX instructions when possible for reducing code size.
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -88,13 +102,8 @@ private:
};
char EvexToVexInstPass::ID = 0;
-}
-INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
-
-FunctionPass *llvm::createX86EvexToVexInsts() {
- return new EvexToVexInstPass();
-}
+} // end anonymous namespace
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
@@ -125,7 +134,6 @@ void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
// For EVEX instructions that can be encoded using VEX encoding
// replace them by the VEX encoding in order to reduce size.
bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
-
// VEX format.
// # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1
// [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM]
@@ -211,3 +219,9 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
return true;
}
+
+INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+
+FunctionPass *llvm::createX86EvexToVexInsts() {
+ return new EvexToVexInstPass();
+}
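
For readers tracking the include shuffle above: the TableGen-generated .inc
file supplies arrays of X86EvexToVexCompressTableEntry records, which
AddTableEntry loads into lookup tables that CompressEvexToVexImpl consults per
instruction. A self-contained sketch of that shape, with an invented stand-in
array and opcode values in place of the generated data:

    #include <cstdint>
    #include <cstdio>
    #include <unordered_map>

    struct X86EvexToVexCompressTableEntry {
      uint16_t EvexOpcode;
      uint16_t VexOpcode;
    };

    // Stand-in for a TableGen-emitted array; the real data comes from
    // X86GenEVEX2VEXTables.inc and these opcode values are invented.
    static const X86EvexToVexCompressTableEntry Table[] = {
        {0x1234, 0x0042},
        {0x2345, 0x0043},
    };

    int main() {
      // AddTableEntry, conceptually: build an opcode -> opcode map once.
      std::unordered_map<uint16_t, uint16_t> EvexToVex;
      for (const auto &E : Table)
        EvexToVex[E.EvexOpcode] = E.VexOpcode;

      // CompressEvexToVexImpl's happy path: look up and rewrite the opcode.
      uint16_t Opc = 0x1234;
      auto It = EvexToVex.find(Opc);
      if (It != EvexToVex.end())
        std::printf("rewrite opcode 0x%x -> 0x%x\n", Opc, It->second);
      return 0;
    }
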
diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 985acf9..5dfd95f 100644
--- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -77,9 +77,11 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
default:
return false;
case X86::TCRETURNdi:
+ case X86::TCRETURNdicc:
case X86::TCRETURNri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
+ case X86::TCRETURNdi64cc:
case X86::TCRETURNri64:
case X86::TCRETURNmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
@@ -97,6 +99,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
Offset = StackAdj - MaxTCDelta;
assert(Offset >= 0 && "Offset should never be negative");
+ if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) {
+ assert(Offset == 0 && "Conditional tail call cannot adjust the stack.");
+ }
+
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
@@ -105,12 +111,22 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Jump to label or value in register.
bool IsWin64 = STI->isTargetWin64();
- if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) {
+ if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
+ Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
unsigned Op;
switch (Opcode) {
case X86::TCRETURNdi:
Op = X86::TAILJMPd;
break;
+ case X86::TCRETURNdicc:
+ Op = X86::TAILJMPd_CC;
+ break;
+ case X86::TCRETURNdi64cc:
+ assert(!MBB.getParent()->hasWinCFI() &&
+ "Conditional tail calls confuse "
+ "the Win64 unwinder.");
+ Op = X86::TAILJMPd64_CC;
+ break;
default:
// Note: Win64 uses REX prefixes for indirect jumps out of functions, but
// not direct ones.
@@ -126,13 +142,17 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.addExternalSymbol(JumpTarget.getSymbolName(),
JumpTarget.getTargetFlags());
}
+ if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) {
+ MIB.addImm(MBBI->getOperand(2).getImm());
+ }
+
} else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
unsigned Op = (Opcode == X86::TCRETURNmi)
? X86::TAILJMPm
: (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != 5; ++i)
- MIB.addOperand(MBBI->getOperand(i));
+ MIB.add(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
BuildMI(MBB, MBBI, DL,
TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
@@ -195,7 +215,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
}
for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
- MIB.addOperand(MBBI->getOperand(I));
+ MIB.add(MBBI->getOperand(I));
MBB.erase(MBBI);
return true;
}
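
For orientation on the new TCRETURNdicc/TCRETURNdi64cc cases: a conditional
tail call folds a separate conditional branch plus tail jump into a single
conditional jump, which is only sound when no stack adjustment is needed,
hence the new Offset == 0 assert. Roughly, assuming a GE condition (the MI
syntax here is an illustration, not verbatim output):

    TCRETURNdi64cc @callee, 0, ge    ; pseudo instruction before expansion
    jge callee                       ; TAILJMPd64_CC after expansion

The condition-code immediate is carried across by the
MIB.addImm(MBBI->getOperand(2).getImm()) call in the hunk above.
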
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
index c890fdd..527e5d5 100644
--- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -180,44 +180,6 @@ private:
} // end anonymous namespace.
-static std::pair<X86::CondCode, bool>
-getX86ConditionCode(CmpInst::Predicate Predicate) {
- X86::CondCode CC = X86::COND_INVALID;
- bool NeedSwap = false;
- switch (Predicate) {
- default: break;
- // Floating-point Predicates
- case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
- case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
- case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
- case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
- case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
- case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
- case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
- case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
- case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
- case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
-
- // Integer Predicates
- case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
- case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
- case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
- case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
- case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
- case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
- case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
- case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
- case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
- case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
- }
-
- return std::make_pair(CC, NeedSwap);
-}
-
static std::pair<unsigned, bool>
getX86SSEConditionCode(CmpInst::Predicate Predicate) {
unsigned CC;
@@ -367,6 +329,10 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
+ // TODO: Support this properly.
+ if (Subtarget->hasAVX512())
+ return false;
+ LLVM_FALLTHROUGH;
case MVT::i8:
Opc = X86::MOV8rm;
RC = &X86::GR8RegClass;
@@ -448,6 +414,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
+ else if (IsNonTemporal && Alignment >= 16)
+ return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
@@ -458,6 +426,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = X86::VMOVNTDQAYrm;
+ else if (IsNonTemporal && Alignment >= 16)
+ return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
@@ -471,6 +441,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
Opc = X86::VMOVNTDQAYrm;
+ else if (IsNonTemporal && Alignment >= 16)
+ return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else
@@ -524,6 +496,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
X86AddressMode &AM,
MachineMemOperand *MMO, bool Aligned) {
+ bool HasSSE1 = Subtarget->hasSSE1();
bool HasSSE2 = Subtarget->hasSSE2();
bool HasSSE4A = Subtarget->hasSSE4A();
bool HasAVX = Subtarget->hasAVX();
@@ -537,6 +510,16 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
case MVT::f80: // No f80 support yet.
default: return false;
case MVT::i1: {
+ // In case ValReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(ValReg) == &X86::VK1RegClass) {
+ unsigned KValReg = ValReg;
+ ValReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ValReg)
+ .addReg(KValReg);
+ ValReg = fastEmitInst_extractsubreg(MVT::i8, ValReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -574,6 +557,9 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
} else
Opc = X86::ST_Fp64m;
break;
+ case MVT::x86mmx:
+ Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
+ break;
case MVT::v4f32:
if (Aligned) {
if (IsNonTemporal)
@@ -1201,7 +1187,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CC != CallingConv::X86_StdCall &&
CC != CallingConv::X86_ThisCall &&
CC != CallingConv::X86_64_SysV &&
- CC != CallingConv::X86_64_Win64)
+ CC != CallingConv::Win64)
return false;
// Don't handle popping bytes if they don't fit the ret's immediate.
@@ -1268,6 +1254,16 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (SrcVT == MVT::i1) {
if (Outs[0].Flags.isSExt())
return false;
+ // In case SrcReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(SrcReg) == &X86::VK1RegClass) {
+ unsigned KSrcReg = SrcReg;
+ SrcReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg)
+ .addReg(KSrcReg);
+ SrcReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
}
@@ -1531,7 +1527,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
X86::CondCode CC;
bool SwapArgs;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
unsigned Opc = X86::getSETFromCond(CC);
@@ -1559,6 +1555,17 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
// Handle zero-extension from i1 to i8, which is common.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
+ // In case ResultReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(ResultReg) == &X86::VK1RegClass) {
+ unsigned KResultReg = ResultReg;
+ ResultReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(KResultReg);
+ ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
+
// Set the high bits to zero.
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
@@ -1658,7 +1665,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
bool SwapArgs;
unsigned BranchOpc;
- std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
BranchOpc = X86::GetCondBranchFromCond(CC);
@@ -1740,10 +1747,12 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
unsigned KOpReg = OpReg;
- OpReg = createResultReg(&X86::GR8RegClass);
+ OpReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), OpReg)
.addReg(KOpReg);
+ OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true,
+ X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(OpReg)
@@ -2029,7 +2038,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
}
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
const Value *CmpLHS = CI->getOperand(0);
@@ -2084,10 +2093,12 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
unsigned KCondReg = CondReg;
- CondReg = createResultReg(&X86::GR8RegClass);
+ CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+ X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill))
@@ -2106,7 +2117,8 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
if (!LHSReg || !RHSReg)
return false;
- unsigned Opc = X86::getCMovFromCond(CC, RC->getSize());
+ const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
+ unsigned Opc = X86::getCMovFromCond(CC, TRI.getRegSizeInBits(*RC)/8);
unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill,
LHSReg, LHSIsKill);
updateValueMap(I, ResultReg);
@@ -2275,7 +2287,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const auto *CI = dyn_cast<CmpInst>(Cond);
if (CI && (CI->getParent() == I->getParent())) {
bool NeedSwap;
- std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+ std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
if (CC > X86::LAST_VALID_COND)
return false;
@@ -2297,10 +2309,12 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
// In case OpReg is a K register, COPY to a GPR
if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
unsigned KCondReg = CondReg;
- CondReg = createResultReg(&X86::GR8RegClass);
+ CondReg = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), CondReg)
.addReg(KCondReg, getKillRegState(CondIsKill));
+ CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+ X86::sub_8bit);
}
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
.addReg(CondReg, getKillRegState(CondIsKill))
@@ -2423,12 +2437,22 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
if (OpReg == 0)
return false;
+  unsigned ImplicitDefReg = 0;
+ if (Subtarget->hasAVX()) {
+ ImplicitDefReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+  }
+
unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
+
if (Subtarget->hasAVX())
- MIB.addReg(OpReg);
+ MIB.addReg(ImplicitDefReg);
+
MIB.addReg(OpReg);
updateValueMap(I, ResultReg);
return true;
@@ -2461,7 +2485,8 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
EVT DstVT = TLI.getValueType(DL, I->getType());
// This code only handles truncation to byte.
- if (DstVT != MVT::i8 && DstVT != MVT::i1)
+ // TODO: Support truncate to i1 with AVX512.
+ if (DstVT != MVT::i8 && (DstVT != MVT::i1 || Subtarget->hasAVX512()))
return false;
if (!TLI.isTypeLegal(SrcVT))
return false;
@@ -3014,19 +3039,19 @@ bool X86FastISel::fastLowerArguments() {
if (!Subtarget->is64Bit())
return false;
+ if (Subtarget->useSoftFloat())
+ return false;
+
// Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
unsigned GPRCnt = 0;
unsigned FPRCnt = 0;
- unsigned Idx = 0;
for (auto const &Arg : F->args()) {
- // The first argument is at index 1.
- ++Idx;
- if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
- F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
- F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
- F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ if (Arg.hasAttribute(Attribute::ByVal) ||
+ Arg.hasAttribute(Attribute::InReg) ||
+ Arg.hasAttribute(Attribute::StructRet) ||
+ Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
@@ -3105,8 +3130,8 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
return 0;
if (CS)
- if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) ||
- CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU())
+ if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) ||
+ CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
return 0;
return 4;
@@ -3127,6 +3152,15 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CC);
+ const CallInst *CI =
+ CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
+ const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
+
+ // Functions with no_caller_saved_registers that need special handling.
+  // Functions with no_caller_saved_registers need special handling; bail out.
+ (CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
+ return false;
+
// Handle only C, fastcc, and webkit_js calling conventions for now.
switch (CC) {
default: return false;
@@ -3137,7 +3171,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_ThisCall:
- case CallingConv::X86_64_Win64:
+ case CallingConv::Win64:
case CallingConv::X86_64_SysV:
break;
}
@@ -3230,7 +3264,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes).addImm(0);
+ .addImm(NumBytes).addImm(0).addImm(0);
// Walk the register/memloc assignments, inserting copies/loads.
const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
@@ -3266,6 +3300,16 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Handle zero-extension from i1 to i8, which is common.
if (ArgVT == MVT::i1) {
+ // In case SrcReg is a K register, COPY to a GPR
+ if (MRI.getRegClass(ArgReg) == &X86::VK1RegClass) {
+ unsigned KArgReg = ArgReg;
+ ArgReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ArgReg)
+ .addReg(KArgReg);
+ ArgReg = fastEmitInst_extractsubreg(MVT::i8, ArgReg, /*Kill=*/true,
+ X86::sub_8bit);
+ }
// Set the high bits to zero.
ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
ArgVT = MVT::i8;
@@ -3463,6 +3507,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
unsigned CopyReg = ResultReg + i;
+ unsigned SrcReg = VA.getLocReg();
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
@@ -3470,9 +3515,19 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
report_fatal_error("SSE register return with SSE disabled");
}
+ // If the return value is an i1 and AVX-512 is enabled, we need
+ // to do a fixup to make the copy legal.
+ if (CopyVT == MVT::i1 && SrcReg == X86::AL && Subtarget->hasAVX512()) {
+ // Need to copy to a GR32 first.
+ // TODO: MOVZX isn't great here. We don't care about the upper bits.
+ SrcReg = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(X86::MOVZX32rr8), SrcReg).addReg(X86::AL);
+ }
+
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
- if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+ if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
isScalarFPTypeInSSEReg(VA.getValVT())) {
CopyVT = MVT::f80;
CopyReg = createResultReg(&X86::RFP80RegClass);
@@ -3480,7 +3535,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Copy out the result.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+ TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
InRegs.push_back(VA.getLocReg());
// Round the f80 to the right size, which also moves it to the appropriate
@@ -3622,7 +3677,12 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
unsigned Opc = 0;
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
- case MVT::i1: VT = MVT::i8; LLVM_FALLTHROUGH;
+ case MVT::i1:
+ // TODO: Support this properly.
+ if (Subtarget->hasAVX512())
+ return 0;
+ VT = MVT::i8;
+ LLVM_FALLTHROUGH;
case MVT::i8: Opc = X86::MOV8ri; break;
case MVT::i16: Opc = X86::MOV16ri; break;
case MVT::i32: Opc = X86::MOV32ri; break;
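
The AVX-512 fixups repeated throughout this file share one shape: an i1 value
held in a VK1 mask register cannot be copied directly to an 8-bit GPR, so each
site copies to a 32-bit GPR (a legal mask-to-GPR width) and then extracts the
low byte. In rough pseudo-MIR, purely as an illustration:

    %gpr32 = COPY %k0                          ; VK1 -> GR32
    %gpr8 = EXTRACT_SUBREG %gpr32, sub_8bit    ; feed i1 consumers an i8
    ; ...followed by the usual i1 zero-extend (AND with 1, or MOVZX)

X86FastEmitStore, X86SelectRet, X86SelectZExt, X86SelectBranch, the two select
lowerings, and fastLowerCall above are all instances of this pattern.
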
diff --git a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 8bde4bf..95c6f2a 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -22,7 +22,7 @@
/// instructions and register-to-register moves. It would
/// seem like cmov(s) would also be affected, but because of the way cmov is
/// really implemented by most machines as reading both the destination and
-/// and source regsters, and then "merging" the two based on a condition,
+/// source registers, and then "merging" the two based on a condition,
/// it really already should be considered as having a true dependence on the
/// destination register as well.
///
@@ -95,10 +95,9 @@ class FixupBWInstPass : public MachineFunctionPass {
// Change the MachineInstr \p MI into an equivalent 32-bit instruction if
// possible. Return the replacement instruction if OK, return nullptr
- // otherwise. Set WasCandidate to true or false depending on whether the
- // MI was a candidate for this sort of transformation.
- MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB,
- bool &WasCandidate) const;
+ // otherwise.
+ MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const;
+
public:
static char ID;
@@ -226,7 +225,7 @@ MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
unsigned NumArgs = MI->getNumOperands();
for (unsigned i = 1; i < NumArgs; ++i)
- MIB.addOperand(MI->getOperand(i));
+ MIB.add(MI->getOperand(i));
MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
@@ -264,17 +263,13 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
// Drop imp-defs/uses that would be redundant with the new def/use.
for (auto &Op : MI->implicit_operands())
if (Op.getReg() != (Op.isDef() ? NewDestReg : NewSrcReg))
- MIB.addOperand(Op);
+ MIB.add(Op);
return MIB;
}
-MachineInstr *FixupBWInstPass::tryReplaceInstr(
- MachineInstr *MI, MachineBasicBlock &MBB,
- bool &WasCandidate) const {
- MachineInstr *NewMI = nullptr;
- WasCandidate = false;
-
+MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
+ MachineBasicBlock &MBB) const {
// See if this is an instruction of the type we are currently looking for.
switch (MI->getOpcode()) {
@@ -282,12 +277,9 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
// Only replace 8 bit loads with the zero extending versions if
// in an innermost loop and not optimizing for size. This takes
// an extra byte to encode, and provides limited performance upside.
- if (MachineLoop *ML = MLI->getLoopFor(&MBB)) {
- if (ML->begin() == ML->end() && !OptForSize) {
- NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI);
- WasCandidate = true;
- }
- }
+ if (MachineLoop *ML = MLI->getLoopFor(&MBB))
+ if (ML->begin() == ML->end() && !OptForSize)
+ return tryReplaceLoad(X86::MOVZX32rm8, MI);
break;
case X86::MOV16rm:
@@ -295,9 +287,7 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
// Code size is the same, and there is sometimes a perf advantage
// from eliminating a false dependence on the upper portion of
// the register.
- NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI);
- WasCandidate = true;
- break;
+ return tryReplaceLoad(X86::MOVZX32rm16, MI);
case X86::MOV8rr:
case X86::MOV16rr:
@@ -305,16 +295,14 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
// Code size is either less (16) or equal (8), and there is sometimes a
// perf advantage from eliminating a false dependence on the upper portion
// of the register.
- NewMI = tryReplaceCopy(MI);
- WasCandidate = true;
- break;
+ return tryReplaceCopy(MI);
default:
// nothing to do here.
break;
}
- return NewMI;
+ return nullptr;
}
void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
@@ -338,18 +326,11 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
// We run after PEI, so we need to AddPristinesAndCSRs.
LiveRegs.addLiveOuts(MBB);
- bool WasCandidate = false;
-
for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
MachineInstr *MI = &*I;
- MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate);
-
- // Add this to replacements if it was a candidate, even if NewMI is
- // nullptr. We will revisit that in a bit.
- if (WasCandidate) {
+ if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB))
MIReplacements.push_back(std::make_pair(MI, NewMI));
- }
// We're done with this instruction, update liveness for the next one.
LiveRegs.stepBackward(*MI);
@@ -359,9 +340,7 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
MachineInstr *MI = MIReplacements.back().first;
MachineInstr *NewMI = MIReplacements.back().second;
MIReplacements.pop_back();
- if (NewMI) {
- MBB.insert(MI, NewMI);
- MBB.erase(MI);
- }
+ MBB.insert(MI, NewMI);
+ MBB.erase(MI);
}
}
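
Since tryReplaceInstr is now a plain dispatch, the opcode mapping it
implements can be stated separately. A minimal standalone sketch (the real
pass routes through tryReplaceLoad/tryReplaceCopy, which carry the operand and
memory-operand plumbing; this only models the decision):

    #include <cstdio>

    enum Opcode { MOV8rm, MOV16rm, MOV8rr, MOV16rr,
                  MOVZX32rm8, MOVZX32rm16, MOV32rr, NONE };

    // Widen byte/word ops to 32-bit forms that zero the upper bits and so
    // break false dependences on the old partial-register contents.
    static Opcode widened(Opcode Op, bool InnermostLoop, bool OptForSize) {
      switch (Op) {
      case MOV8rm: // costs an extra byte, so gate on hot inner loops
        return (InnermostLoop && !OptForSize) ? MOVZX32rm8 : NONE;
      case MOV16rm:
        return MOVZX32rm16;
      case MOV8rr:
      case MOV16rr: // tryReplaceCopy rewrites these via super-registers
        return MOV32rr;
      default:
        return NONE;
      }
    }

    int main() {
      std::printf("%d\n", widened(MOV16rm, false, false) == MOVZX32rm16);
      return 0;
    }
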
diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 1209591..9f649da 100644
--- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
STATISTIC(NumLEAs, "Number of LEA instructions created");
namespace {
class FixupLEAPass : public MachineFunctionPass {
enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
- static char ID;
+
/// \brief Loop over all of the instructions in the basic block
/// replacing applicable instructions with LEA instructions,
/// where appropriate.
bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
- StringRef getPassName() const override { return "X86 LEA Fixup"; }
/// \brief Given a machine register, look for the instruction
/// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
void processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI);
+
+ /// \brief Given a LEA instruction which is unprofitable
+ /// on SNB+ try to replace it with other instructions.
+ /// According to Intel's Optimization Reference Manual:
+ /// " For LEA instructions with three source operands and some specific
+ /// situations, instruction latency has increased to 3 cycles, and must
+ /// dispatch via port 1:
+ /// - LEA that has all three source operands: base, index, and offset
+ /// - LEA that uses base and index registers where the base is EBP, RBP,
+ /// or R13
+ /// - LEA that uses RIP relative addressing mode
+ /// - LEA that uses 16-bit addressing mode "
+ /// This function currently handles the first 2 cases only.
+ MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI);
+
/// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
/// and convert them to INC or DEC respectively.
bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
MachineBasicBlock::iterator &MBBI) const;
public:
- FixupLEAPass() : MachineFunctionPass(ID) {}
+ static char ID;
+
+ StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+ FixupLEAPass() : MachineFunctionPass(ID) {
+ initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+ }
/// \brief Loop over all of the basic blocks,
/// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
bool OptIncDec;
bool OptLEA;
};
-char FixupLEAPass::ID = 0;
}
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
MachineInstr *
FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
MachineBasicBlock::iterator &MBBI) const {
@@ -120,8 +151,8 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
BuildMI(*MF, MI.getDebugLoc(),
TII->get(MI.getOpcode() == X86::MOV32rr ? X86::LEA32r
: X86::LEA64r))
- .addOperand(Dest)
- .addOperand(Src)
+ .add(Dest)
+ .add(Src)
.addImm(1)
.addReg(0)
.addImm(0)
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
- OptLEA = ST.LEAusesAG() || ST.slowLEA();
+ OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
if (!OptLEA && !OptIncDec)
return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
return MachineBasicBlock::iterator();
}
-static inline bool isLEA(const int opcode) {
- return opcode == X86::LEA16r || opcode == X86::LEA32r ||
- opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) {
+ return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) {
+ return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) {
+ return Op.isReg() && Op.getReg() != X86::NoRegister;
+}
+
+/// hasInefficientLEABaseReg - LEA that uses base and index registers
+/// where the base is EBP, RBP, or R13.
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+ const MachineOperand &Index) {
+ return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+ isRegOperand(Index);
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+ return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+// LEA instruction that has all three operands: offset, base and index
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+ const MachineOperand &Index,
+ const MachineOperand &Offset) {
+ return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) {
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return X86::ADD16rr;
+ case X86::LEA32r:
+ return X86::ADD32rr;
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return X86::ADD64rr;
+ }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+ bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+ switch (LEAOpcode) {
+ default:
+ llvm_unreachable("Unexpected LEA instruction");
+ case X86::LEA16r:
+ return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+ case X86::LEA64r:
+ return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+ }
}
/// isLEASimpleIncOrDec - Does this LEA have one these forms:
@@ -287,8 +373,8 @@ bool FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I,
MachineInstr *NewMI =
BuildMI(*MFI, I, MI.getDebugLoc(), TII->get(NewOpcode))
- .addOperand(MI.getOperand(0))
- .addOperand(MI.getOperand(1));
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1));
MFI->erase(I);
I = static_cast<MachineBasicBlock::iterator>(NewMI);
return true;
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
MachineFunction::iterator MFI) {
MachineInstr &MI = *I;
- const int opcode = MI.getOpcode();
- if (!isLEA(opcode))
+ const int Opcode = MI.getOpcode();
+ if (!isLEA(Opcode))
return;
if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
!TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
return;
if (MI.getOperand(2).getImm() > 1)
return;
- int addrr_opcode, addri_opcode;
- switch (opcode) {
- default:
- llvm_unreachable("Unexpected LEA instruction");
- case X86::LEA16r:
- addrr_opcode = X86::ADD16rr;
- addri_opcode = X86::ADD16ri;
- break;
- case X86::LEA32r:
- addrr_opcode = X86::ADD32rr;
- addri_opcode = X86::ADD32ri;
- break;
- case X86::LEA64_32r:
- case X86::LEA64r:
- addrr_opcode = X86::ADD64rr;
- addri_opcode = X86::ADD64ri32;
- break;
- }
DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
DEBUG(dbgs() << "FixLEA: Replaced by: ";);
MachineInstr *NewMI = nullptr;
- const MachineOperand &Dst = MI.getOperand(0);
// Make ADD instruction for two registers writing to LEA's destination
if (SrcR1 != 0 && SrcR2 != 0) {
- const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
- .addOperand(Dst)
- .addOperand(Src1)
- .addOperand(Src2);
- MFI->insert(I, NewMI);
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+ const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+ NewMI =
+ BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
DEBUG(NewMI->dump(););
}
// Make ADD instruction for immediate
if (MI.getOperand(4).getImm() != 0) {
+ const MCInstrDesc &ADDri =
+ TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
- NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
- .addOperand(Dst)
- .addOperand(SrcR)
+ NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
+ .add(SrcR)
.addImm(MI.getOperand(4).getImm());
- MFI->insert(I, NewMI);
DEBUG(NewMI->dump(););
}
if (NewMI) {
MFI->erase(I);
- I = static_cast<MachineBasicBlock::iterator>(NewMI);
+ I = NewMI;
+ }
+}
+
+MachineInstr *
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
+ MachineFunction::iterator MFI) {
+ const int LEAOpcode = MI.getOpcode();
+ if (!isLEA(LEAOpcode))
+ return nullptr;
+
+ const MachineOperand &Dst = MI.getOperand(0);
+ const MachineOperand &Base = MI.getOperand(1);
+ const MachineOperand &Scale = MI.getOperand(2);
+ const MachineOperand &Index = MI.getOperand(3);
+ const MachineOperand &Offset = MI.getOperand(4);
+ const MachineOperand &Segment = MI.getOperand(5);
+
+ if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+ hasInefficientLEABaseReg(Base, Index)) ||
+ !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+ Segment.getReg() != X86::NoRegister)
+ return nullptr;
+
+ unsigned int DstR = Dst.getReg();
+ unsigned int BaseR = Base.getReg();
+ unsigned int IndexR = Index.getReg();
+ unsigned SSDstR =
+ (LEAOpcode == X86::LEA64_32r) ? getX86SubSuperRegister(DstR, 64) : DstR;
+ bool IsScale1 = Scale.getImm() == 1;
+ bool IsInefficientBase = isInefficientLEAReg(BaseR);
+ bool IsInefficientIndex = isInefficientLEAReg(IndexR);
+
+ // Skip these cases since it takes more than 2 instructions
+ // to replace the LEA instruction.
+ if (IsInefficientBase && SSDstR == BaseR && !IsScale1)
+ return nullptr;
+ if (LEAOpcode == X86::LEA64_32r && IsInefficientBase &&
+ (IsInefficientIndex || !IsScale1))
+ return nullptr;
+
+ const DebugLoc DL = MI.getDebugLoc();
+ const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode));
+ const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset));
+
+ DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump(););
+ DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+
+ // First try to replace LEA with one or two (for the 3-op LEA case)
+ // add instructions:
+  // 1. lea (%base,%index,1), %base  => add %index,%base
+  // 2. lea (%base,%index,1), %index => add %base,%index
+ if (IsScale1 && (DstR == BaseR || DstR == IndexR)) {
+ const MachineOperand &Src = DstR == BaseR ? Index : Base;
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // If the base is inefficient try switching the index and base operands,
+ // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction:
+ // lea offset(%base,%index,scale),%dst =>
+ // lea (%base,%index,scale); add offset,%dst
+ if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) {
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .add(IsInefficientBase ? Index : Base)
+ .add(Scale)
+ .add(IsInefficientBase ? Base : Index)
+ .addImm(0)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+ // Create ADD instruction for the Offset in case of 3-Ops LEA.
+ if (hasLEAOffset(Offset)) {
+ NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+ DEBUG(NewMI->dump(););
+ }
+ return NewMI;
+ }
+ // Handle the rest of the cases with inefficient base register:
+ assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+ assert(IsInefficientBase && "efficient base should be handled already!");
+
+ // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+ if (IsScale1 && !hasLEAOffset(Offset)) {
+ TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+ DEBUG(MI.getPrevNode()->dump(););
+
+ MachineInstr *NewMI =
+ BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
+ // lea offset(%base,%index,scale), %dst =>
+ // lea offset( ,%index,scale), %dst; add %base,%dst
+ MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+ .add(Dst)
+ .addReg(0)
+ .add(Scale)
+ .add(Index)
+ .add(Offset)
+ .add(Segment);
+ DEBUG(NewMI->dump(););
+
+ NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+ DEBUG(NewMI->dump(););
+ return NewMI;
}
bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
if (OptLEA) {
if (MF.getSubtarget<X86Subtarget>().isSLM())
processInstructionForSLM(I, MFI);
- else
- processInstruction(I, MFI);
+    else {
+ if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+ if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+ MFI->erase(I);
+ I = NewMI;
+ }
+ } else
+ processInstruction(I, MFI);
+ }
}
}
return false;
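
To make the slow-3-op-LEA splitting concrete, here is one worked case under
the rules above: a three-operand LEA whose base is RBP (an inefficient base)
but whose index is efficient and whose scale is 1 takes the operand-swap path,
and its displacement is then peeled into an ADD. Illustrative assembly with
made-up registers:

    lea 0x10(%rbp,%rax,1), %rbx     ; 3-op LEA, slow on SNB+
    =>
    lea (%rax,%rbp,1), %rbx         ; base and index swapped, offset dropped
    add $0x10, %rbx                 ; ADD64ri8 here, per getADDriFromLEA

The point is the shape of the rewrite, not the exact register choices.
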
diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
index a5489b9..5582526 100644
--- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -123,18 +123,26 @@ namespace {
EdgeBundles *Bundles;
// Return a bitmask of FP registers in block's live-in list.
- static unsigned calcLiveInMask(MachineBasicBlock *MBB) {
+ static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) {
unsigned Mask = 0;
- for (const auto &LI : MBB->liveins()) {
- if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6)
- continue;
- Mask |= 1 << (LI.PhysReg - X86::FP0);
+ for (MachineBasicBlock::livein_iterator I = MBB->livein_begin();
+ I != MBB->livein_end(); ) {
+ MCPhysReg Reg = I->PhysReg;
+ static_assert(X86::FP6 - X86::FP0 == 6, "sequential regnums");
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ Mask |= 1 << (Reg - X86::FP0);
+ if (RemoveFPs) {
+ I = MBB->removeLiveIn(I);
+ continue;
+ }
+ }
+ ++I;
}
return Mask;
}
// Partition all the CFG edges into LiveBundles.
- void bundleCFG(MachineFunction &MF);
+ void bundleCFGRecomputeKillFlags(MachineFunction &MF);
MachineBasicBlock *MBB; // Current basic block
@@ -327,7 +335,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget().getInstrInfo();
// Prepare cross-MBB liveness.
- bundleCFG(MF);
+ bundleCFGRecomputeKillFlags(MF);
StackTop = 0;
@@ -375,13 +383,15 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
/// registers live-out from a block is identical to the live-in set of all
/// successors. This is not enforced by the normal live-in lists since
/// registers may be implicitly defined, or not used by all successors.
-void FPS::bundleCFG(MachineFunction &MF) {
+void FPS::bundleCFGRecomputeKillFlags(MachineFunction &MF) {
assert(LiveBundles.empty() && "Stale data in LiveBundles");
LiveBundles.resize(Bundles->getNumBundles());
// Gather the actual live-in masks for all MBBs.
for (MachineBasicBlock &MBB : MF) {
- const unsigned Mask = calcLiveInMask(&MBB);
+ setKillFlags(MBB);
+
+ const unsigned Mask = calcLiveInMask(&MBB, false);
if (!Mask)
continue;
// Update MBB ingoing bundle mask.
@@ -396,7 +406,6 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
bool Changed = false;
MBB = &BB;
- setKillFlags(BB);
setupBlockStack();
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
@@ -453,6 +462,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
unsigned Reg = DeadRegs[i];
// Check if Reg is live on the stack. An inline-asm register operand that
// is in the clobber list and marked dead might not be live on the stack.
+ static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers");
if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) {
DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
freeStackSlotAfter(I, Reg-X86::FP0);
@@ -506,7 +516,6 @@ void FPS::setupBlockStack() {
// Push the fixed live-in registers.
for (unsigned i = Bundle.FixCount; i > 0; --i) {
- MBB->addLiveIn(X86::ST0+i-1);
DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
<< unsigned(Bundle.FixStack[i-1]) << '\n');
pushReg(Bundle.FixStack[i-1]);
@@ -515,7 +524,8 @@ void FPS::setupBlockStack() {
// Kill off unwanted live-ins. This can happen with a critical edge.
// FIXME: We could keep these live registers around as zombies. They may need
// to be revived at the end of a short block. It might save a few instrs.
- adjustLiveRegs(calcLiveInMask(MBB), MBB->begin());
+ unsigned Mask = calcLiveInMask(MBB, /*RemoveFPs=*/true);
+ adjustLiveRegs(Mask, MBB->begin());
DEBUG(MBB->dump());
}
@@ -1655,8 +1665,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) {
}
void FPS::setKillFlags(MachineBasicBlock &MBB) const {
- const TargetRegisterInfo *TRI =
- MBB.getParent()->getSubtarget().getRegisterInfo();
+ const TargetRegisterInfo &TRI =
+ *MBB.getParent()->getSubtarget().getRegisterInfo();
LivePhysRegs LPR(TRI);
LPR.addLiveOuts(MBB);
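
The calcLiveInMask bitmask convention deserves a spelled-out example: FP0
through FP6 land in bits 0 through 6, which is why the new static_assert
insists the register numbers are sequential. A standalone model with stand-in
register values:

    #include <cstdio>
    #include <vector>

    // Stand-ins for X86::FP0..FP6; the real values are sequential MCPhysRegs,
    // which is exactly what the static_asserts added by this patch verify.
    constexpr unsigned FP0 = 100;
    constexpr unsigned FP6 = 106;

    // Mirror of calcLiveInMask: bit N set means FPN is live in, and with
    // RemoveFPs the FP registers are dropped from the live-in list.
    unsigned calcLiveInMask(std::vector<unsigned> &LiveIns, bool RemoveFPs) {
      unsigned Mask = 0;
      for (auto I = LiveIns.begin(); I != LiveIns.end();) {
        unsigned Reg = *I;
        if (Reg >= FP0 && Reg <= FP6) {
          Mask |= 1u << (Reg - FP0);
          if (RemoveFPs) {
            I = LiveIns.erase(I);
            continue;
          }
        }
        ++I;
      }
      return Mask;
    }

    int main() {
      std::vector<unsigned> LiveIns{FP0, FP0 + 3, 42}; // FP0, FP3, non-FP reg
      std::printf("mask = 0x%x\n", calcLiveInMask(LiveIns, true)); // 0x9
      std::printf("remaining live-ins = %zu\n", LiveIns.size());   // 1
      return 0;
    }
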
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
index cd69044..f294e81 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -29,8 +29,8 @@
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetOptions.h"
#include <cstdlib>
using namespace llvm;
@@ -252,40 +252,76 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
int64_t NumBytes, bool InEpilogue) const {
bool isSub = NumBytes < 0;
uint64_t Offset = isSub ? -NumBytes : NumBytes;
+ MachineInstr::MIFlag Flag =
+ isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
uint64_t Chunk = (1LL << 31) - 1;
DebugLoc DL = MBB.findDebugLoc(MBBI);
- while (Offset) {
- if (Offset > Chunk) {
- // Rather than emit a long series of instructions for large offsets,
- // load the offset into a register and do one sub/add
- unsigned Reg = 0;
+ if (Offset > Chunk) {
+ // Rather than emit a long series of instructions for large offsets,
+ // load the offset into a register and do one sub/add
+ unsigned Reg = 0;
+ unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
- if (isSub && !isEAXLiveIn(MBB))
- Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+ if (isSub && !isEAXLiveIn(MBB))
+ Reg = Rax;
+ else
+ Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+ unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+ unsigned AddSubRROpc =
+ isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
+ if (Reg) {
+ BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+ .addImm(Offset)
+ .setMIFlag(Flag);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
+ .addReg(StackPtr)
+ .addReg(Reg);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ return;
+ } else if (Offset > 8 * Chunk) {
+ // If we would need more than 8 add or sub instructions (a >16GB stack
+ // frame), it's worth spilling RAX to materialize this immediate.
+ // pushq %rax
+ // movabsq +-$Offset+-SlotSize, %rax
+ // addq %rsp, %rax
+ // xchg %rax, (%rsp)
+ // movq (%rsp), %rsp
+ assert(Is64Bit && "can't have 32-bit 16GB stack frame");
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+ .addReg(Rax, RegState::Kill)
+ .setMIFlag(Flag);
+ // Subtract is not commutative, so negate the offset and always use add.
+ // Subtract 8 less and add 8 more to account for the PUSH we just did.
+ if (isSub)
+ Offset = -(Offset - SlotSize);
else
- Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
-
- if (Reg) {
- unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
- BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
- .addImm(Offset);
- Opc = isSub
- ? getSUBrrOpcode(Is64Bit)
- : getADDrrOpcode(Is64Bit);
- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr)
- .addReg(Reg);
- MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
- Offset = 0;
- continue;
- }
+ Offset = Offset + SlotSize;
+ BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+ .addImm(Offset)
+ .setMIFlag(Flag);
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
+ .addReg(Rax)
+ .addReg(StackPtr);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+ // Exchange the new SP in RAX with the top of the stack.
+ addRegOffset(
+ BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
+ StackPtr, false, 0);
+ // Load new SP from the top of the stack into RSP.
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
+ StackPtr, false, 0);
+ return;
}
+ }
+ while (Offset) {
uint64_t ThisVal = std::min(Offset, Chunk);
- if (ThisVal == (Is64Bit ? 8 : 4)) {
- // Use push / pop instead.
+ if (ThisVal == SlotSize) {
+ // Use push / pop for slot sized adjustments as a size optimization. We
+ // need to find a dead register when using pop.
unsigned Reg = isSub
? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
: findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -293,23 +329,16 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
unsigned Opc = isSub
? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
: (Is64Bit ? X86::POP64r : X86::POP32r);
- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
- .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
- if (isSub)
- MI->setFlag(MachineInstr::FrameSetup);
- else
- MI->setFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc))
+ .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
+ .setMIFlag(Flag);
Offset -= ThisVal;
continue;
}
}
- MachineInstrBuilder MI = BuildStackAdjustment(
- MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
- if (isSub)
- MI.setMIFlag(MachineInstr::FrameSetup);
- else
- MI.setMIFlag(MachineInstr::FrameDestroy);
+ BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
+ .setMIFlag(Flag);
Offset -= ThisVal;
}
@@ -719,17 +748,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
else
CallOp = X86::CALLpcrel32;
- const char *Symbol;
- if (Is64Bit) {
- if (STI.isTargetCygMing()) {
- Symbol = "___chkstk_ms";
- } else {
- Symbol = "__chkstk";
- }
- } else if (STI.isTargetCygMing())
- Symbol = "_alloca";
- else
- Symbol = "_chkstk";
+ StringRef Symbol = STI.getTargetLowering()->getStackProbeSymbolName(MF);
MachineInstrBuilder CI;
MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI);
@@ -740,10 +759,11 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
// For the large code model, we have to call through a register. Use R11,
// as it is scratch in all supported calling conventions.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::R11)
- .addExternalSymbol(Symbol);
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addReg(X86::R11);
} else {
- CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp)).addExternalSymbol(Symbol);
+ CI = BuildMI(MBB, MBBI, DL, TII.get(CallOp))
+ .addExternalSymbol(MF.createExternalSymbolName(Symbol));
}
unsigned AX = Is64Bit ? X86::RAX : X86::EAX;
@@ -754,13 +774,16 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
.addReg(SP, RegState::Define | RegState::Implicit)
.addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
- if (Is64Bit) {
+ if (STI.isTargetWin64() || !STI.isOSWindows()) {
+ // MSVC x32's _chkstk and cygwin/mingw's _alloca adjust %esp themselves.
// MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
- // themselves. It also does not clobber %rax so we can reuse it when
+    // themselves. They also do not clobber %rax so we can reuse it when
// adjusting %rsp.
- BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), X86::RSP)
- .addReg(X86::RSP)
- .addReg(X86::RAX);
+ // All other platforms do not specify a particular ABI for the stack probe
+ // function, so we arbitrarily define it to not adjust %esp/%rsp itself.
+ BuildMI(MBB, MBBI, DL, TII.get(getSUBrrOpcode(Is64Bit)), SP)
+ .addReg(SP)
+ .addReg(AX);
}
if (InProlog) {
@@ -949,7 +972,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->setCalleeSavedFrameSize(
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
- bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
+ bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
// The default stack probe size is 4096 if the function has no stackprobesize
// attribute.
@@ -959,6 +982,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
.getValueAsString()
.getAsInteger(0, StackProbeSize);
+ // Re-align the stack on 64-bit if the x86-interrupt calling convention is
+ // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
+ // stack alignment.
+ if (Fn->getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
+ Fn->arg_size() == 2) {
+ StackSize += 8;
+ MFI.setStackSize(StackSize);
+ emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false);
+ }
+
// If this is x86-64 and the Red Zone is not disabled, if we are a leaf
// function, and use up to 128 bytes of stack space, don't have a frame
// pointer, calls, or dynamic alloca then we do not need to adjust the
@@ -968,6 +1001,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
!TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
+ !UseStackProbe && // No stack probes.
!IsWin64CC && // Win64 has no Red Zone
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
@@ -1023,6 +1057,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}
if (HasFP) {
+ assert(MF.getRegInfo().isReserved(MachineFramePtr) && "FP reserved");
+
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
// If required, include space for extra hidden slot for stashing base pointer.
@@ -1085,13 +1121,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
nullptr, DwarfFramePtr));
}
}
-
- // Mark the FramePtr as live-in in every block. Don't do this again for
- // funclet prologues.
- if (!IsFunclet) {
- for (MachineBasicBlock &EveryMBB : MF)
- EveryMBB.addLiveIn(MachineFramePtr);
- }
} else {
assert(!IsFunclet && "funclets without FPs not yet implemented");
NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
@@ -1158,6 +1187,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+ assert(!X86FI->getUsesRedZone() &&
+ "The Red Zone is not accounted for in stack probes");
+
// Check whether EAX is livein for this block.
bool isEAXAlive = isEAXLiveIn(MBB);
@@ -1659,21 +1691,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-// NOTE: this only has a subset of the full frame index logic. In
-// particular, the FI < 0 and AfterFPPop logic is handled in
-// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly
-// (probably?) it should be moved into here.
int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool IsFixed = MFI.isFixedObjectIndex(FI);
// We can't calculate offset from frame pointer if the stack is realigned,
// so enforce usage of stack/base pointer. The base pointer is used when we
// have dynamic allocas in addition to dynamic realignment.
if (TRI->hasBasePointer(MF))
- FrameReg = TRI->getBaseRegister();
+ FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getBaseRegister();
else if (TRI->needsStackRealignment(MF))
- FrameReg = TRI->getStackRegister();
+ FrameReg = IsFixed ? TRI->getFramePtr() : TRI->getStackRegister();
else
FrameReg = TRI->getFrameRegister(MF);
@@ -1747,6 +1776,14 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset + FPDelta;
}
+int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
+ int FI, unsigned &FrameReg,
+ int Adjustment) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ FrameReg = TRI->getStackRegister();
+ return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment;
+}
+
int
X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
int FI, unsigned &FrameReg,
@@ -1803,9 +1840,6 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
assert(MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta() >= 0 &&
"we don't handle this case!");
- // Fill in FrameReg output argument.
- FrameReg = TRI->getStackRegister();
-
// This is how the math works out:
//
// %rsp grows (i.e. gets lower) left to right. Each box below is
@@ -1830,12 +1864,8 @@ X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
// (C - E) == (C - A) - (B - A) + (B - E)
// { Using [1], [2] and [3] above }
// == getObjectOffset - LocalAreaOffset + StackSize
- //
-
- // Get the Offset from the StackPointer
- int Offset = MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
- return Offset + StackSize;
+ return getFrameIndexReferenceSP(MF, FI, FrameReg, StackSize);
}
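
The identity derived in the comment above can be checked standalone; the
numbers below are made up, but the algebra is exactly what the new
getFrameIndexReferenceSP implements.

    #include <cassert>

    // SP-relative offset == getObjectOffset - LocalAreaOffset + StackSize
    static int offsetFromSP(int ObjectOffset, int LocalAreaOffset,
                            int StackSize) {
      return ObjectOffset - LocalAreaOffset + StackSize;
    }

    int main() {
      // A slot 24 bytes below the incoming SP in a 64-byte frame ends up
      // 40 bytes above the outgoing SP.
      assert(offsetFromSP(-24, 0, 64) == 40);
    }
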
bool X86FrameLowering::assignCalleeSavedSpillSlots(
@@ -1887,14 +1917,15 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
continue;
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ unsigned Size = TRI->getSpillSize(*RC);
+ unsigned Align = TRI->getSpillAlignment(*RC);
// ensure alignment
- SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment();
+ SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
// spill into slot
- SpillSlotOffset -= RC->getSize();
- int SlotIndex =
- MFI.CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
+ SpillSlotOffset -= Size;
+ int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
- MFI.ensureMaxAlignment(RC->getAlignment());
+ MFI.ensureMaxAlignment(Align);
}
return true;
@@ -2587,8 +2618,8 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned Opcode = I->getOpcode();
bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
DebugLoc DL = I->getDebugLoc();
- uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
- uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
+ uint64_t Amount = !reserveCallFrame ? TII.getFrameSize(*I) : 0;
+ uint64_t InternalAmt = (isDestroy || Amount) ? TII.getFrameAdjustment(*I) : 0;
I = MBB.erase(I);
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
@@ -2952,6 +2983,10 @@ unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF)
void X86FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
+ // Mark the function as not having WinCFI. We will set it back to true in
+ // emitPrologue if it gets called and emits CFI.
+ MF.setHasWinCFI(false);
+
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
const Function *Fn = MF.getFunction();
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
index e1b04d6..7d214ca 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -20,6 +20,7 @@ namespace llvm {
class MachineInstrBuilder;
class MCCFIInstruction;
+class X86InstrInfo;
class X86Subtarget;
class X86RegisterInfo;
@@ -30,7 +31,7 @@ public:
// Cached subtarget predicates.
const X86Subtarget &STI;
- const TargetInstrInfo &TII;
+ const X86InstrInfo &TII;
const X86RegisterInfo *TRI;
unsigned SlotSize;
@@ -99,6 +100,8 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const override;
+ int getFrameIndexReferenceSP(const MachineFunction &MF,
+ int FI, unsigned &SPReg, int Adjustment) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
unsigned &FrameReg,
bool IgnoreSPUpdates) const override;
diff --git a/contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def b/contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def
new file mode 100644
index 0000000..06be142
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -0,0 +1,104 @@
+//===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by X86RegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+RegisterBankInfo::PartialMapping X86GenRegisterBankInfo::PartMappings[]{
+ /* StartIdx, Length, RegBank */
+ // GPR value
+ {0, 8, X86::GPRRegBank}, // :0
+ {0, 16, X86::GPRRegBank}, // :1
+ {0, 32, X86::GPRRegBank}, // :2
+ {0, 64, X86::GPRRegBank}, // :3
+ // FR32/64 , xmm registers
+ {0, 32, X86::VECRRegBank}, // :4
+ {0, 64, X86::VECRRegBank}, // :5
+ // VR128/256/512
+ {0, 128, X86::VECRRegBank}, // :6
+ {0, 256, X86::VECRRegBank}, // :7
+ {0, 512, X86::VECRRegBank}, // :8
+};
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum PartialMappingIdx {
+ PMI_None = -1,
+ PMI_GPR8,
+ PMI_GPR16,
+ PMI_GPR32,
+ PMI_GPR64,
+ PMI_FP32,
+ PMI_FP64,
+ PMI_VEC128,
+ PMI_VEC256,
+ PMI_VEC512
+};
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#define INSTR_3OP(INFO) INFO, INFO, INFO,
+#define BREAKDOWN(INDEX, NUM) \
+ { &X86GenRegisterBankInfo::PartMappings[INDEX], NUM }
+// ValueMappings.
+RegisterBankInfo::ValueMapping X86GenRegisterBankInfo::ValMappings[]{
+ /* BreakDown, NumBreakDowns */
+ // 3-operand instructions (all binary operations should end up with one of
+ // those mappings).
+ INSTR_3OP(BREAKDOWN(PMI_GPR8, 1)) // 0: GPR_8
+ INSTR_3OP(BREAKDOWN(PMI_GPR16, 1)) // 3: GPR_16
+ INSTR_3OP(BREAKDOWN(PMI_GPR32, 1)) // 6: GPR_32
+ INSTR_3OP(BREAKDOWN(PMI_GPR64, 1)) // 9: GPR_64
+ INSTR_3OP(BREAKDOWN(PMI_FP32, 1)) // 12: Fp32
+ INSTR_3OP(BREAKDOWN(PMI_FP64, 1)) // 15: Fp64
+ INSTR_3OP(BREAKDOWN(PMI_VEC128, 1)) // 18: Vec128
+ INSTR_3OP(BREAKDOWN(PMI_VEC256, 1)) // 21: Vec256
+ INSTR_3OP(BREAKDOWN(PMI_VEC512, 1)) // 24: Vec512
+};
+#undef INSTR_3OP
+#undef BREAKDOWN
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum ValueMappingIdx {
+ VMI_None = -1,
+ VMI_3OpsGpr8Idx = PMI_GPR8 * 3,
+ VMI_3OpsGpr16Idx = PMI_GPR16 * 3,
+ VMI_3OpsGpr32Idx = PMI_GPR32 * 3,
+ VMI_3OpsGpr64Idx = PMI_GPR64 * 3,
+ VMI_3OpsFp32Idx = PMI_FP32 * 3,
+ VMI_3OpsFp64Idx = PMI_FP64 * 3,
+ VMI_3OpsVec128Idx = PMI_VEC128 * 3,
+ VMI_3OpsVec256Idx = PMI_VEC256 * 3,
+ VMI_3OpsVec512Idx = PMI_VEC512 * 3,
+};
+#undef GET_TARGET_REGBANK_INFO_CLASS
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#undef GET_TARGET_REGBANK_INFO_IMPL
+const RegisterBankInfo::ValueMapping *
+X86GenRegisterBankInfo::getValueMapping(PartialMappingIdx Idx,
+ unsigned NumOperands) {
+
+ // We can use VMI_3Ops Mapping for all the cases.
+ if (NumOperands <= 3 && (Idx >= PMI_GPR8 && Idx <= PMI_VEC512))
+ return &ValMappings[(unsigned)Idx * 3];
+
+ llvm_unreachable("Unsupported PartialMappingIdx.");
+}
+
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
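
A standalone restatement (illustrative; the enum is truncated) of the indexing
contract getValueMapping relies on: each partial-mapping index owns three
consecutive ValueMapping slots, one per operand of a 3-operand instruction.

    enum PartialMappingIdx { PMI_GPR8, PMI_GPR16, PMI_GPR32, PMI_GPR64 };

    constexpr unsigned valueMappingIndex(PartialMappingIdx Idx) {
      return static_cast<unsigned>(Idx) * 3; // first of the three slots
    }

    static_assert(valueMappingIndex(PMI_GPR32) == 6,
                  "matches the '// 6: GPR_32' row in ValMappings above");
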
diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8ab4c06..8f24f98 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -31,6 +31,7 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -188,7 +189,6 @@ namespace {
private:
void Select(SDNode *N) override;
- bool tryGather(SDNode *N, unsigned Opc);
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -204,6 +204,11 @@ namespace {
bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ template <class GatherScatterSDNode>
+ bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -384,6 +389,16 @@ namespace {
bool ComplexPatternFuncMutatesDAG() const override {
return true;
}
+
+ bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
+
+ /// Returns whether this is a relocatable immediate in the range
+ /// [-2^Width .. 2^Width-1].
+ template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
+ if (auto *CN = dyn_cast<ConstantSDNode>(N))
+ return isInt<Width>(CN->getSExtValue());
+ return isSExtAbsoluteSymbolRef(Width, N);
+ }
};
}
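
The constant path of isSExtRelocImm above defers to llvm::isInt<Width>; a
minimal sketch of that predicate's contract, assuming an LLVM tree on the
include path:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    int main() {
      // isInt<8> accepts exactly what a sign-extended 8-bit field encodes.
      assert(llvm::isInt<8>(127) && llvm::isInt<8>(-128));
      assert(!llvm::isInt<8>(128) && !llvm::isInt<8>(-129));
    }
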
@@ -408,8 +423,7 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
case X86ISD::XOR:
case X86ISD::OR:
case ISD::ADD:
- case ISD::ADDC:
- case ISD::ADDE:
+ case ISD::ADDCARRY:
case ISD::AND:
case ISD::OR:
case ISD::XOR: {
@@ -709,7 +723,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
// For more information see http://people.redhat.com/drepper/tls.pdf
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
- Subtarget->isTargetGlibc())
+ (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
+ Subtarget->isTargetFuchsia()))
switch (N->getPointerInfo().getAddrSpace()) {
case 256:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -1040,7 +1055,10 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
// Scale the leading zero count down based on the actual size of the value.
// Also scale it down based on the size of the shift.
- MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
+ unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
+ if (MaskLZ < ScaleDown)
+ return true;
+ MaskLZ -= ScaleDown;
// The final check is to ensure that any masked out high bits of X are
// already known to be zero. Otherwise, the mask has a semantic impact
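
The new early-out above exists because MaskLZ is unsigned: a scale-down larger
than the counted leading zeros would wrap around rather than go negative. A
standalone illustration with made-up counts:

    #include <cassert>

    int main() {
      unsigned MaskLZ = 3, ScaleDown = 10;
      assert(MaskLZ < ScaleDown);            // the patched guard bails out here
      unsigned Wrapped = MaskLZ - ScaleDown; // otherwise wraps to 4294967289
      assert(Wrapped > (1u << 30));          // a nonsense "leading zero" count
    }
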
@@ -1060,9 +1078,9 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
}
APInt MaskedHighBits =
APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
- APInt KnownZero, KnownOne;
- DAG.computeKnownBits(X, KnownZero, KnownOne);
- if (MaskedHighBits != KnownZero) return true;
+ KnownBits Known;
+ DAG.computeKnownBits(X, Known);
+ if (MaskedHighBits != Known.Zero) return true;
// We've identified a pattern that can be transformed into a single shift
// and an addressing mode. Make it so.
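
A minimal sketch of the KnownBits struct this hunk migrates to (assuming an
LLVM tree on the include path): the known-zero and known-one masks now travel
together instead of as two APInt out-parameters.

    #include "llvm/Support/KnownBits.h"
    #include <cassert>

    int main() {
      llvm::KnownBits Known(8);  // an 8-bit value, nothing known yet
      Known.Zero.setHighBits(4); // record that the top nibble is proven zero
      assert(Known.Zero == 0xF0 && Known.One == 0);
    }
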
@@ -1166,8 +1184,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
break;
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
unsigned Val = CN->getZExtValue();
// Note that we handle x<<1 as (,x,2) rather than (x,x) here so
// that the base operand remains free for further matching. If
@@ -1175,15 +1192,14 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// in MatchAddress turns (,x,2) into (x,x), which is cheaper.
if (Val == 1 || Val == 2 || Val == 3) {
AM.Scale = 1 << Val;
- SDValue ShVal = N.getNode()->getOperand(0);
+ SDValue ShVal = N.getOperand(0);
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (CurDAG->isBaseWithConstantOffset(ShVal)) {
- AM.IndexReg = ShVal.getNode()->getOperand(0);
- ConstantSDNode *AddVal =
- cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+ AM.IndexReg = ShVal.getOperand(0);
+ ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!foldOffsetIntoAddress(Disp, AM))
return false;
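
The fold above rewrites ((X + C) << V) as index = X, scale = 1 << V,
disp = C << V; a quick standalone check of the distributive identity it
depends on, with arbitrary values:

    #include <cassert>
    #include <cstdint>

    int main() {
      int64_t X = 0x1000, C = 5;
      unsigned V = 2; // i.e. scale 4
      assert(((X + C) << V) == ((X << V) + (C << V)));
    }
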
@@ -1233,28 +1249,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
if (AM.BaseType == X86ISelAddressMode::RegBase &&
AM.Base_Reg.getNode() == nullptr &&
AM.IndexReg.getNode() == nullptr) {
- if (ConstantSDNode
- *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
CN->getZExtValue() == 9) {
AM.Scale = unsigned(CN->getZExtValue())-1;
- SDValue MulVal = N.getNode()->getOperand(0);
+ SDValue MulVal = N.getOperand(0);
SDValue Reg;
// Okay, we know that we have a scale by now. However, if the scaled
// value is an add of something and a constant, we can fold the
// constant into the disp field here.
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
- isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
- Reg = MulVal.getNode()->getOperand(0);
+ isa<ConstantSDNode>(MulVal.getOperand(1))) {
+ Reg = MulVal.getOperand(0);
ConstantSDNode *AddVal =
- cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+ cast<ConstantSDNode>(MulVal.getOperand(1));
uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
if (foldOffsetIntoAddress(Disp, AM))
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
} else {
- Reg = N.getNode()->getOperand(0);
+ Reg = N.getOperand(0);
}
AM.IndexReg = AM.Base_Reg = Reg;
@@ -1277,7 +1292,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
// Test if the LHS of the sub can be folded.
X86ISelAddressMode Backup = AM;
- if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+ if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
AM = Backup;
break;
}
@@ -1288,7 +1303,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
}
int Cost = 0;
- SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+ SDValue RHS = Handle.getValue().getOperand(1);
// If the RHS involves a register with multiple uses, this
// transformation incurs an extra mov, due to the neg instruction
// clobbering its operand.
@@ -1297,12 +1312,13 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
(RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
- RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+ RHS.getOperand(0).getValueType() == MVT::i32))
++Cost;
// If the base is a register with multiple uses, this
// transformation may save a mov.
- if ((AM.BaseType == X86ISelAddressMode::RegBase &&
- AM.Base_Reg.getNode() &&
+ // FIXME: Don't rely on DELETED_NODEs.
+ if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() &&
+ AM.Base_Reg->getOpcode() != ISD::DELETED_NODE &&
!AM.Base_Reg.getNode()->hasOneUse()) ||
AM.BaseType == X86ISelAddressMode::FrameIndexBase)
--Cost;
@@ -1325,8 +1341,8 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
AM.Scale = 1;
// Insert the new nodes into the topological ordering.
- insertDAGNode(*CurDAG, N, Zero);
- insertDAGNode(*CurDAG, N, Neg);
+ insertDAGNode(*CurDAG, Handle.getValue(), Zero);
+ insertDAGNode(*CurDAG, Handle.getValue(), Neg);
return false;
}
@@ -1407,13 +1423,10 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
return false;
}
-bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
-
- MaskedGatherScatterSDNode *Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent);
- if (!Mgs)
- return false;
+template <class GatherScatterSDNode>
+bool X86DAGToDAGISel::selectAddrOfGatherScatterNode(
+ GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
@@ -1445,6 +1458,18 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent))
+ return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>(
+ Mgs, N, Base, Scale, Index, Disp, Segment);
+ if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent))
+ return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>(
+ X86Gather, N, Base, Scale, Index, Disp, Segment);
+ return false;
+}
+
/// Returns true if it is able to pattern match an addressing mode.
/// It returns the operands which make up the maximal addressing mode it can
/// match by reference.
@@ -1789,6 +1814,21 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
}
+bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
+ if (N->getOpcode() == ISD::TRUNCATE)
+ N = N->getOperand(0).getNode();
+ if (N->getOpcode() != X86ISD::Wrapper)
+ return false;
+
+ auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
+ if (!GA)
+ return false;
+
+ Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+ return CR && CR->getSignedMin().sge(-1ull << Width) &&
+ CR->getSignedMax().slt(1ull << Width);
+}
+
/// Test whether the given X86ISD::CMP node has any uses which require the SF
/// or OF bits to be accurate.
static bool hasNoSignedComparisonUses(SDNode *N) {
@@ -1905,6 +1945,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
ChainCheck = true;
+ // Drop Load, but keep its chain. No cycle check necessary.
+ ChainOps.push_back(Load.getOperand(0));
continue;
}
@@ -1954,39 +1996,6 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
llvm_unreachable("unrecognized size for LdVT");
}
-/// Customized ISel for GATHER operations.
-bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
- // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
- SDValue Chain = Node->getOperand(0);
- SDValue VSrc = Node->getOperand(2);
- SDValue Base = Node->getOperand(3);
- SDValue VIdx = Node->getOperand(4);
- SDValue VMask = Node->getOperand(5);
- ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
- if (!Scale)
- return false;
-
- SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
- MVT::Other);
-
- SDLoc DL(Node);
-
- // Memory Operands: Base, Scale, Index, Disp, Segment
- SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i32);
- const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
- Disp, Segment, VMask, Chain};
- SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
- // Node has 2 outputs: VDst and MVT::Other.
- // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
- // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
- // of ResNode.
- ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
- ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
- CurDAG->RemoveDeadNode(Node);
- return true;
-}
-
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opc, MOpc;
@@ -2024,55 +2033,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: break;
- case Intrinsic::x86_avx2_gather_d_pd:
- case Intrinsic::x86_avx2_gather_d_pd_256:
- case Intrinsic::x86_avx2_gather_q_pd:
- case Intrinsic::x86_avx2_gather_q_pd_256:
- case Intrinsic::x86_avx2_gather_d_ps:
- case Intrinsic::x86_avx2_gather_d_ps_256:
- case Intrinsic::x86_avx2_gather_q_ps:
- case Intrinsic::x86_avx2_gather_q_ps_256:
- case Intrinsic::x86_avx2_gather_d_q:
- case Intrinsic::x86_avx2_gather_d_q_256:
- case Intrinsic::x86_avx2_gather_q_q:
- case Intrinsic::x86_avx2_gather_q_q_256:
- case Intrinsic::x86_avx2_gather_d_d:
- case Intrinsic::x86_avx2_gather_d_d_256:
- case Intrinsic::x86_avx2_gather_q_d:
- case Intrinsic::x86_avx2_gather_q_d_256: {
- if (!Subtarget->hasAVX2())
- break;
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
- case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
- case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
- case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
- case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
- case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
- case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
- case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
- case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
- case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
- case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
- case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
- case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
- case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
- case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
- case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
- case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
- }
- if (tryGather(Node, Opc))
- return;
- break;
- }
- }
- break;
- }
case X86ISD::GlobalBaseReg:
ReplaceNode(Node, getGlobalBaseReg());
return;
@@ -2576,7 +2536,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
// For example, convert "testl %eax, $8" to "testb %al, $8"
@@ -2584,7 +2544,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
(!(C->getZExtValue() & 0x80) ||
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// On x86-32, only the ABCD registers have 8-bit subregisters.
if (!Subtarget->is64Bit()) {
@@ -2620,7 +2580,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Shift the immediate right by 8 bits.
SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
dl, MVT::i8);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Put the value in an ABCD register.
const TargetRegisterClass *TRC;
@@ -2657,7 +2617,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i16);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 16-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
@@ -2680,7 +2640,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
hasNoSignedComparisonUses(Node))) {
SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
MVT::i32);
- SDValue Reg = N0.getNode()->getOperand(0);
+ SDValue Reg = N0.getOperand(0);
// Extract the 32-bit subregister.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index de6c9a6..957b46c40 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
@@ -52,7 +53,9 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
@@ -70,6 +73,30 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
"rather than promotion."),
cl::Hidden);
+static cl::opt<int> ExperimentalPrefLoopAlignment(
+ "x86-experimental-pref-loop-alignment", cl::init(4),
+ cl::desc("Sets the preferable loop alignment for experiments "
+ "(the last x86-experimental-pref-loop-alignment bits"
+ " of the loop header PC will be 0)."),
+ cl::Hidden);
+
+static cl::opt<bool> MulConstantOptimization(
+ "mul-constant-optimization", cl::init(true),
+ cl::desc("Replace 'mul x, Const' with more effective instructions like "
+ "SHIFT, LEA, etc."),
+ cl::Hidden);
+
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
+ const char *Msg) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ DAG.getContext()->diagnose(
+ DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
+}
+
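
A hypothetical call-site shape for the helper (illustrative only; DAG, dl, and
the guarding condition belong to whichever lowering routine uses it):

    // Instead of report_fatal_error:
    //   if (!Subtarget.hasSSE1() && VT == MVT::f32)
    //     errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
    //   ...then keep lowering with a best-effort value rather than crash.
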
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
const X86Subtarget &STI)
: TargetLowering(TM), Subtarget(STI) {
@@ -290,16 +317,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UREM, VT, Expand);
}
- for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
- if (VT == MVT::i64 && !Subtarget.is64Bit())
- continue;
- // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
- setOperationAction(ISD::ADDC, VT, Custom);
- setOperationAction(ISD::ADDE, VT, Custom);
- setOperationAction(ISD::SUBC, VT, Custom);
- setOperationAction(ISD::SUBE, VT, Custom);
- }
-
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
@@ -401,8 +418,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
continue;
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::SETCCE, VT, Custom);
}
+
+ // Custom action for SELECT MMX and expand action for SELECT_CC MMX
+ setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
+
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
// SjLj exception handling but a light-weight setjmp/longjmp replacement to
@@ -427,7 +448,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ExternalSymbol , VT, Custom);
setOperationAction(ISD::BlockAddress , VT, Custom);
}
- // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+
+ // 64-bit shl, sra, srl (iff 32-bit x86)
for (auto VT : { MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
continue;
@@ -648,7 +670,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
@@ -775,29 +796,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
- setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
- setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
- setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
-
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
- setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
- setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
- setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
- setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
-
- setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
- setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
- setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
- setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
- // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -872,22 +882,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
- for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- }
-
- // In the customized shift lowering, the legal cases in AVX2 will be
- // recognized.
- for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
+ // In the customized shift lowering, the legal v4i32/v2i64 cases
+ // in AVX2 will be recognized.
+ for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+ setOperationAction(ISD::ABS, MVT::v16i8, Legal);
+ setOperationAction(ISD::ABS, MVT::v8i16, Legal);
+ setOperationAction(ISD::ABS, MVT::v4i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
@@ -922,6 +929,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// SSE41 brings specific instructions for doing vector sign extend even in
// cases where we don't have SRA.
+ for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
+ }
+
for (MVT VT : MVT::integer_vector_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
@@ -929,19 +941,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
-
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
+ for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+ setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
+ }
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
@@ -1005,36 +1012,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
- for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
+ // In the customized shift lowering, the legal v8i32/v4i64 cases
+ // in AVX2 will be recognized.
+ for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
}
- setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
- setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
- setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
- setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
-
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
+ for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
+ setOperationAction(ISD::ANY_EXTEND, VT, Custom);
+ }
+
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
+ setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
@@ -1065,6 +1067,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+ setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1081,27 +1084,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
- setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
-
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
- }
-
- // In the customized shift lowering, the legal cases in AVX2 will be
- // recognized.
- for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
+ for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
+ setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
+ }
}
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
@@ -1126,7 +1116,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
}
@@ -1149,7 +1139,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
- addRegisterClass(MVT::i1, &X86::VK1RegClass);
+ addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
@@ -1164,16 +1154,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
}
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
- setOperationAction(ISD::SETCC, MVT::i1, Custom);
- setOperationAction(ISD::SETCCE, MVT::i1, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::XOR, MVT::i1, Legal);
- setOperationAction(ISD::OR, MVT::i1, Legal);
- setOperationAction(ISD::AND, MVT::i1, Legal);
- setOperationAction(ISD::SUB, MVT::i1, Custom);
- setOperationAction(ISD::ADD, MVT::i1, Custom);
- setOperationAction(ISD::MUL, MVT::i1, Custom);
for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
@@ -1242,27 +1222,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
- setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
- setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
- setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
- if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+ if (Subtarget.hasDQI()) {
+ for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
+ setOperationAction(ISD::SINT_TO_FP, VT, Legal);
+ setOperationAction(ISD::UINT_TO_FP, VT, Legal);
+ setOperationAction(ISD::FP_TO_SINT, VT, Legal);
+ setOperationAction(ISD::FP_TO_UINT, VT, Legal);
+ }
if (Subtarget.hasVLX()) {
// Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
@@ -1296,8 +1265,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
}
- setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
@@ -1310,11 +1277,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
}
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
@@ -1330,48 +1297,54 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
-
- setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
- setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
- setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
-
- setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
- setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
- setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
- setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
- setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
- setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
- setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
- setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
-
- setOperationAction(ISD::ADD, MVT::v8i1, Expand);
- setOperationAction(ISD::ADD, MVT::v16i1, Expand);
- setOperationAction(ISD::SUB, MVT::v8i1, Expand);
- setOperationAction(ISD::SUB, MVT::v16i1, Expand);
- setOperationAction(ISD::MUL, MVT::v8i1, Expand);
- setOperationAction(ISD::MUL, MVT::v16i1, Expand);
setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ setOperationAction(ISD::ABS, MVT::v4i64, Legal);
+ setOperationAction(ISD::ABS, MVT::v2i64, Legal);
+
+ for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ }
+
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::CTTZ, VT, Custom);
+ }
+
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
+ MVT::v8i64}) {
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
}
// Need to promote to 64-bit even though we have 32-bit masked instructions
@@ -1382,33 +1355,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
if (Subtarget.hasCDI()) {
- setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
-
- setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
- setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
- setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
- setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
-
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
-
- if (Subtarget.hasVLX()) {
- setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
- setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
- } else {
- setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
- setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
- setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
- setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
+ // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
+ MVT::v4i64, MVT::v8i64}) {
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
}
-
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
} // Subtarget.hasCDI()
if (Subtarget.hasDQI()) {
@@ -1418,6 +1370,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
+ if (Subtarget.hasVPOPCNTDQ()) {
+ // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
+ // version of popcntd/q.
+ for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
+ MVT::v4i32, MVT::v2i64})
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+
// Custom lower several nodes.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
@@ -1428,7 +1388,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Custom under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v8f32, MVT::v4f64 })
+ MVT::v8f32, MVT::v4f64, MVT::v1i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
MVT::v16i1, MVT::v32i1, MVT::v64i1 })
@@ -1438,10 +1398,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Legal);
@@ -1460,12 +1420,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- setOperationAction(ISD::ADD, MVT::v32i1, Expand);
- setOperationAction(ISD::ADD, MVT::v64i1, Expand);
- setOperationAction(ISD::SUB, MVT::v32i1, Expand);
- setOperationAction(ISD::SUB, MVT::v64i1, Expand);
- setOperationAction(ISD::MUL, MVT::v32i1, Expand);
- setOperationAction(ISD::MUL, MVT::v64i1, Expand);
+ setOperationAction(ISD::ADD, MVT::v32i1, Custom);
+ setOperationAction(ISD::ADD, MVT::v64i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i1, Custom);
+ setOperationAction(ISD::SUB, MVT::v64i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v32i1, Custom);
+ setOperationAction(ISD::MUL, MVT::v64i1, Custom);
setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
@@ -1479,8 +1439,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
@@ -1502,8 +1462,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
- setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
@@ -1515,15 +1473,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
- setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
- setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
- setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
- setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
- setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
- setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
- setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
- setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
-
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
@@ -1545,7 +1494,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
@@ -1553,6 +1503,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
@@ -1574,9 +1528,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
- setOperationAction(ISD::ADD, VT, Expand);
- setOperationAction(ISD::SUB, VT, Expand);
- setOperationAction(ISD::MUL, VT, Expand);
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::TRUNCATE, VT, Custom);
@@ -1626,6 +1580,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::USUBO, VT, Custom);
setOperationAction(ISD::SMULO, VT, Custom);
setOperationAction(ISD::UMULO, VT, Custom);
+
+ // Support carry-in as a value rather than glue.
+ setOperationAction(ISD::ADDCARRY, VT, Custom);
+ setOperationAction(ISD::SUBCARRY, VT, Custom);
+ setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
if (!Subtarget.is64Bit()) {
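
What the new ADDCARRY/SUBCARRY nodes model, sketched standalone: a multi-word
add where the carry flows between limbs as an ordinary value rather than as
implicit glue.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t ALo = ~0ull, AHi = 1, BLo = 1, BHi = 2;
      uint64_t SumLo = ALo + BLo;         // low limb
      uint64_t Carry = SumLo < ALo;       // carry-out materialized as a value
      uint64_t SumHi = AHi + BHi + Carry; // carry-in consumed as a value
      assert(SumLo == 0 && Carry == 1 && SumHi == 4);
    }
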
@@ -1671,6 +1630,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
@@ -1696,6 +1656,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+ setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -1712,7 +1674,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;
- setPrefLoopAlignment(4); // 2^4 bytes.
+
+ // TODO: These control memcmp expansion in CGP and could be raised higher, but
+ // that needs to be benchmarked and balanced with the potential use of vector
+ // load/store types (PR33329, PR33914).
+ MaxLoadsPerMemcmp = 2;
+ MaxLoadsPerMemcmpOptSize = 2;
+
+ // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
+ setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
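
Note the option holds log2 of the alignment, matching the setPrefLoopAlignment
call above: the default of 4 requests 16-byte-aligned loop headers. A trivial
standalone check (the PC value is hypothetical):

    #include <cassert>

    int main() {
      int PrefLoopAlignmentLog2 = 4; // the default above
      unsigned AlignBytes = 1u << PrefLoopAlignmentLog2;
      assert(AlignBytes == 16);
      assert((0x401230u & (AlignBytes - 1)) == 0); // low 4 PC bits clear
    }
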
@@ -1742,7 +1712,7 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext& Context,
EVT VT) const {
if (!VT.isVector())
- return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
+ return MVT::i8;
if (VT.isSimple()) {
MVT VVT = VT.getSimpleVT();
@@ -1933,6 +1903,34 @@ bool X86TargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
+void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const {
+
+ // Only relabel X86-32 for C / Stdcall CCs.
+ if (Subtarget.is64Bit())
+ return;
+ if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
+ return;
+ unsigned ParamRegs = 0;
+ if (auto *M = MF->getFunction()->getParent())
+ ParamRegs = M->getNumberRegisterParameters();
+
+ // Mark the first N integer/pointer arguments as being passed in registers.
+ for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
+ Type *T = Args[Idx].Ty;
+ if (T->isPointerTy() || T->isIntegerTy())
+ if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
+ unsigned numRegs = 1;
+ if (MF->getDataLayout().getTypeAllocSize(T) > 4)
+ numRegs = 2;
+ if (ParamRegs < numRegs)
+ return;
+ ParamRegs -= numRegs;
+ Args[Idx].IsInReg = true;
+ }
+ }
+}
+
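
A standalone restatement of the budget logic above (simplified: every argument
is assumed to be an integer or pointer of the given byte size): an argument of
at most 4 bytes costs one register, a larger one costs two, until the regparm
budget runs out.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    static std::vector<bool> markInReg(const std::vector<unsigned> &ArgBytes,
                                       unsigned ParamRegs) {
      std::vector<bool> InReg(ArgBytes.size(), false);
      for (std::size_t I = 0; I < ArgBytes.size(); ++I) {
        unsigned NumRegs = ArgBytes[I] > 4 ? 2 : 1;
        if (ParamRegs < NumRegs)
          break; // budget exhausted; the real code returns here
        ParamRegs -= NumRegs;
        InReg[I] = true;
      }
      return InReg;
    }

    int main() {
      // A regparm budget of 3 and args (i32, i64, i32): the first two fit.
      auto R = markInReg({4, 8, 4}, 3);
      assert(R[0] && R[1] && !R[2]);
    }
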
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
@@ -2001,21 +1999,37 @@ unsigned X86TargetLowering::getAddressSpace() const {
return 256;
}
-Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
- // glibc has a special slot for the stack guard in tcbhead_t, use it instead
- // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
- if (!Subtarget.isTargetGlibc())
- return TargetLowering::getIRStackGuard(IRB);
-
- // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x14 on i386
- unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
- unsigned AddressSpace = getAddressSpace();
+static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
+ return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
+ (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+}
+
+static Constant* SegmentOffset(IRBuilder<> &IRB,
+ unsigned Offset, unsigned AddressSpace) {
return ConstantExpr::getIntToPtr(
ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard in
+ // tcbhead_t; use it instead of the usual global variable (see
+ // sysdeps/{i386,x86_64}/nptl/tls.h)
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ return SegmentOffset(IRB, 0x10, getAddressSpace());
+ } else {
+      // %fs:0x28, unless we're using a Kernel code model, in which case
+      // it's %gs:0x28; %gs:0x14 on i386.
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
+ }
+
+ return TargetLowering::getIRStackGuard(IRB);
+}
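// For reference, a sketch of the IR this returns on x86-64 glibc (address
// space 257 conventionally selects %fs here, 256 selects %gs):
//   inttoptr (i32 40 to i8* addrspace(257)*)   ; i.e. %fs:0x28
// so the stack protector loads the guard straight from TLS with no global.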
+
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
// MSVC CRT provides functionalities for stack protection.
if (Subtarget.getTargetTriple().isOSMSVCRT()) {
@@ -2027,13 +2041,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
auto *SecurityCheckCookie = cast<Function>(
M.getOrInsertFunction("__security_check_cookie",
Type::getVoidTy(M.getContext()),
- Type::getInt8PtrTy(M.getContext()), nullptr));
+ Type::getInt8PtrTy(M.getContext())));
SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
return;
}
- // glibc has a special slot for the stack guard.
- if (Subtarget.isTargetGlibc())
+ // glibc, bionic, and Fuchsia have a special slot for the stack guard.
+ if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
return;
TargetLowering::insertSSPDeclarations(M);
}
@@ -2056,21 +2070,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (Subtarget.getTargetTriple().isOSContiki())
return getDefaultSafeStackPointerLocation(IRB, false);
- if (!Subtarget.isTargetAndroid())
- return TargetLowering::getSafeStackPointerLocation(IRB);
-
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- unsigned AddressSpace, Offset;
+ if (Subtarget.isTargetAndroid()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case it's
+    // %gs:0x48; %gs:0x24 on i386.
+ unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+ return SegmentOffset(IRB, Offset, getAddressSpace());
+ }
- // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
- // %gs:0x24 on i386
- Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
- AddressSpace = getAddressSpace();
- return ConstantExpr::getIntToPtr(
- ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+ // Fuchsia is similar.
+ if (Subtarget.isTargetFuchsia()) {
+ // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ return SegmentOffset(IRB, 0x18, getAddressSpace());
+ }
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
}
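// The same SegmentOffset shape applies here: e.g. on Android x86-64 the
// unsafe stack pointer lives at %fs:0x48 (TLS_SLOT_SAFESTACK), yielding
// approximately
//   inttoptr (i32 72 to i8* addrspace(257)*)   ; 0x48
// with the 257/256 address-space mapping assumed as above.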
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -2160,6 +2176,12 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ // In some cases we need to disable registers from the default CSR list.
+ // For example, when they are used for argument passing.
+ bool ShouldDisableCalleeSavedRegister =
+ CallConv == CallingConv::X86_RegCall ||
+ MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
+
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
@@ -2179,6 +2201,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
assert(VA.isRegLoc() && "Can only return in registers!");
+
+ // Add the register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
+
SDValue ValToCopy = OutVals[OutsIndex];
EVT ValVT = ValToCopy.getValueType();
@@ -2203,15 +2230,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// or SSE or MMX vectors.
if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (ValVT == MVT::f64 &&
+ (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
+ // Likewise we can't return F64 values with SSE1 only. gcc does so, but
+ // llvm-gcc has never done it right and no one has noticed, so this
+ // should be OK for now.
+ errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
- if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
- report_fatal_error("SSE2 register return with SSE2 disabled");
// Returns in ST0/ST1 are handled specially: these are pushed as operands to
// the RET instruction and handled by the FP Stackifier.
@@ -2253,6 +2282,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(2 == RegsToPass.size() &&
"Expecting two registers after Pass64BitArgInRegs");
+
+ // Add the second register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
@@ -2309,6 +2342,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// RAX/EAX now acts like a return value.
RetOps.push_back(
DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+
+ // Add the returned register to the CalleeSaveDisableRegs list.
+ if (ShouldDisableCalleeSavedRegister)
+ MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
}
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -2444,7 +2481,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
// Convert the i32 type into v32i1 type
Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
- // Concantenate the two values together
+ // Concatenate the two values together
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
@@ -2456,6 +2493,9 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SelectionDAG &DAG) {
SDValue ValReturned = ValArg;
+ if (ValVT == MVT::v1i1)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
+
if (ValVT == MVT::v64i1) {
// In 32 bit machine, this case is handled by getv64i1Argument
assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
@@ -2478,7 +2518,6 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
}
-
return DAG.getBitcast(ValVT, ValReturned);
}
@@ -2488,8 +2527,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
SDValue X86TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
- SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
bool Is64Bit = Subtarget.is64Bit();
@@ -2503,10 +2544,19 @@ SDValue X86TargetLowering::LowerCallResult(
CCValAssign &VA = RVLocs[I];
EVT CopyVT = VA.getLocVT();
+ // In some calling conventions we need to remove the used registers
+ // from the register mask.
+ if (RegMask) {
+ for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+ }
+
// If this is x86-64, and we disabled SSE, we can't return FP values
if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
- report_fatal_error("SSE register return with SSE disabled");
+ errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -2624,7 +2674,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
// C calling conventions:
case CallingConv::C:
- case CallingConv::X86_64_Win64:
+ case CallingConv::Win64:
case CallingConv::X86_64_SysV:
// Callee pop conventions:
case CallingConv::X86_ThisCall:
@@ -2643,13 +2693,13 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}
-bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
auto Attr =
CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
if (!CI->isTailCall() || Attr.getValueAsString() == "true")
return false;
- CallSite CS(CI);
+ ImmutableCallSite CS(CI);
CallingConv::ID CalleeCC = CS.getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
@@ -2669,6 +2719,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
EVT ValVT;
+ MVT PtrVT = getPointerTy(DAG.getDataLayout());
// If value is passed by pointer we have address passed instead of the value
// itself. No need to extend if the mask value and location share the same
@@ -2686,13 +2737,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
// taken by a return address.
int Offset = 0;
if (CallConv == CallingConv::X86_INTR) {
- const X86Subtarget& Subtarget =
- static_cast<const X86Subtarget&>(DAG.getSubtarget());
// X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in a regular call.
    // The offset of the last argument needs to be set to -4/-8 bytes, while
    // the offset of the first argument (out of two) should be set to 0 bytes.
Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+ if (Subtarget.is64Bit() && Ins.size() == 2) {
+ // The stack pointer needs to be realigned for 64 bit handlers with error
+ // code, so the argument offset changes by 8 bytes.
+ Offset += 8;
+ }
}
// FIXME: For now, all byval parameter objects are marked mutable. This can be
@@ -2707,30 +2761,74 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
if (CallConv == CallingConv::X86_INTR) {
MFI.setObjectOffset(FI, Offset);
}
- return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- } else {
- int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
- VA.getLocMemOffset(), isImmutable);
-
- // Set SExt or ZExt flag.
- if (VA.getLocInfo() == CCValAssign::ZExt) {
- MFI.setObjectZExt(FI, true);
- } else if (VA.getLocInfo() == CCValAssign::SExt) {
- MFI.setObjectSExt(FI, true);
+ return DAG.getFrameIndex(FI, PtrVT);
+ }
+
+ // This is an argument in memory. We might be able to perform copy elision.
+ if (Flags.isCopyElisionCandidate()) {
+ EVT ArgVT = Ins[i].ArgVT;
+ SDValue PartAddr;
+ if (Ins[i].PartOffset == 0) {
+ // If this is a one-part value or the first part of a multi-part value,
+ // create a stack object for the entire argument value type and return a
+ // load from our portion of it. This assumes that if the first part of an
+ // argument is in memory, the rest will also be in memory.
+ int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
+ /*Immutable=*/false);
+ PartAddr = DAG.getFrameIndex(FI, PtrVT);
+ return DAG.getLoad(
+ ValVT, dl, Chain, PartAddr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ } else {
+ // This is not the first piece of an argument in memory. See if there is
+ // already a fixed stack object including this offset. If so, assume it
+ // was created by the PartOffset == 0 branch above and create a load from
+ // the appropriate offset into it.
+ int64_t PartBegin = VA.getLocMemOffset();
+ int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+ int FI = MFI.getObjectIndexBegin();
+ for (; MFI.isFixedObjectIndex(FI); ++FI) {
+ int64_t ObjBegin = MFI.getObjectOffset(FI);
+ int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+ if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+ break;
+ }
+ if (MFI.isFixedObjectIndex(FI)) {
+ SDValue Addr =
+ DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+ DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+ return DAG.getLoad(
+ ValVT, dl, Chain, Addr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+ Ins[i].PartOffset));
+ }
}
+ }
- // Adjust SP offset of interrupt parameter.
- if (CallConv == CallingConv::X86_INTR) {
- MFI.setObjectOffset(FI, Offset);
- }
+ int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+ VA.getLocMemOffset(), isImmutable);
- SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- SDValue Val = DAG.getLoad(
- ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
- return ExtendedInMem ?
- DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+ // Set SExt or ZExt flag.
+ if (VA.getLocInfo() == CCValAssign::ZExt) {
+ MFI.setObjectZExt(FI, true);
+ } else if (VA.getLocInfo() == CCValAssign::SExt) {
+ MFI.setObjectSExt(FI, true);
}
+
+ // Adjust SP offset of interrupt parameter.
+ if (CallConv == CallingConv::X86_INTR) {
+ MFI.setObjectOffset(FI, Offset);
+ }
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getLoad(
+ ValVT, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ return ExtendedInMem
+ ? (VA.getValVT().isVector()
+ ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
+ : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
+ : Val;
}
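// A worked example of the copy-elision path above (layout assumed): an i64
// argument passed on the 32-bit stack arrives as two i32 parts. The part
// with PartOffset == 0 creates one 8-byte fixed object covering the whole
// i64 and loads its low half; the PartOffset == 4 part then finds that
// same fixed object in the scan and loads from offset 4 into it, so no
// separate argument copy is emitted.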
// FIXME: Get this from tablegen.
@@ -2781,12 +2879,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
+#ifndef NDEBUG
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
[](const CCValAssign &A, const CCValAssign &B) -> bool {
return A.getValNo() < B.getValNo();
});
}
+#endif
SDValue X86TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -2836,8 +2936,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
// The next loop assumes that the locations are in the same order of the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
SDValue ArgValue;
for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -2853,7 +2953,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
"Currently the only custom case is when we split v64i1 to 2 regs");
// v64i1 values, in regcall calling convention, that are
- // compiled to 32 bit arch, are splited up into two registers.
+ // compiled to 32 bit arch, are split up into two registers.
ArgValue =
getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
@@ -2878,7 +2978,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
else if (RegVT == MVT::x86mmx)
RC = &X86::VR64RegClass;
- else if (RegVT == MVT::i1)
+ else if (RegVT == MVT::v1i1)
RC = &X86::VK1RegClass;
else if (RegVT == MVT::v8i1)
RC = &X86::VK8RegClass;
@@ -3107,8 +3207,9 @@ SDValue X86TargetLowering::LowerFormalArguments(
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
- // X86 interrupts must pop the error code if present
- FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+ // X86 interrupts must pop the error code (and the alignment padding) if
+ // present.
+ FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
@@ -3146,6 +3247,13 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
}
+ if (CallConv == CallingConv::X86_RegCall ||
+ Fn->hasFnAttribute("no_caller_saved_registers")) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
+ MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
+ }
+
return Chain;
}
@@ -3232,6 +3340,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
+ const CallInst *CI =
+ CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
+ const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
+ bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
+ (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3333,8 +3446,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
if (!IsSibcall)
- Chain = DAG.getCALLSEQ_START(
- Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
+ NumBytes - NumBytesToPush, dl);
SDValue RetAddrFrIdx;
// Load return address for tail calls.
@@ -3348,8 +3461,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The next loop assumes that the locations are in the same order of the
// input arguments.
- if (!isSortedByValueNo(ArgLocs))
- llvm_unreachable("Argument Location list must be sorted before lowering");
+ assert(isSortedByValueNo(ArgLocs) &&
+ "Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization arguments are handle later.
@@ -3517,7 +3630,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (VA.isRegLoc()) {
if (VA.needsCustom()) {
assert((CallConv == CallingConv::X86_RegCall) &&
- "Expecting custome case only in regcall calling convention");
+ "Expecting custom case only in regcall calling convention");
// This means that we are in special case where one argument was
// passed through two register locations - Skip the next location
++I;
@@ -3644,7 +3757,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
RegsToPass[i].second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
- const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
+ // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
+ // set X86_INTR calling convention because it has the same CSR mask
+ // (same preserved registers).
+ const uint32_t *Mask = RegInfo->getCallPreservedMask(
+ MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
// If this is an invoke in a 32-bit function using a funclet-based
@@ -3662,7 +3779,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Mask = RegInfo->getNoPreservedMask();
}
- Ops.push_back(DAG.getRegisterMask(Mask));
+ // Define a new register mask from the existing mask.
+ uint32_t *RegMask = nullptr;
+
+ // In some calling conventions we need to remove the used physical registers
+ // from the reg mask.
+ if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+ // Allocate a new Reg Mask and copy Mask.
+ RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+ memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
+
+    // Make sure all subregisters of the argument registers are reset
+ // in the RegMask.
+ for (auto const &RegPair : RegsToPass)
+ for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+
+ // Create the RegMask Operand according to our updated mask.
+ Ops.push_back(DAG.getRegisterMask(RegMask));
+ } else {
+ // Create the RegMask Operand according to the static mask.
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
if (InFlag.getNode())
Ops.push_back(InFlag);
@@ -3715,8 +3857,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Handle result values, copying them out of physregs into vregs that we
// return.
- return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
- Ins, dl, DAG, InVals);
+ return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+ InVals, RegMask);
}
//===----------------------------------------------------------------------===//
@@ -3847,6 +3989,13 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
if (Offset != MFI.getObjectOffset(FI))
return false;
+ // If this is not byval, check that the argument stack object is immutable.
+ // inalloca and argument copy elision can create mutable argument stack
+ // objects. Byval objects can be mutated, but a byval call intends to pass the
+ // mutated memory.
+ if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
+ return false;
+
if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
// If the argument location is wider than the argument type, check that any
// extension flags match.
@@ -4088,6 +4237,8 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -4132,6 +4283,7 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
return true;
// 'Faux' Target Shuffles.
case ISD::AND:
+ case X86ISD::ANDNP:
return true;
}
}
@@ -4448,6 +4600,11 @@ bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ return true;
+}
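// Returning true here keeps an and-with-immediate glued to its
// compare-against-zero, which x86 folds into a single TEST; e.g. (assumed
// SysV lowering)
//   if ((x & 8) == 0) ...
// becomes
//   testb $8, %dil
//   je    ...
// rather than first materializing the AND result in a register.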
+
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (!Subtarget.hasBMI())
return false;
@@ -4460,6 +4617,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
return true;
}
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+ MVT VT = MVT::getIntegerVT(NumBits);
+ if (isTypeLegal(VT))
+ return VT;
+
+ // PMOVMSKB can handle this.
+ if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+ return MVT::v16i8;
+
+ // VPMOVMSKB can handle this.
+ if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+ return MVT::v32i8;
+
+ // TODO: Allow 64-bit type for 32-bit target.
+ // TODO: 512-bit types should be allowed, but make sure that those
+ // cases are handled in combineVectorSizedSetCCEquality().
+
+ return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
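// A sketch of the 128-bit case this enables (illustrative SSE2 asm):
//   movdqu   (%rdi), %xmm0
//   movdqu   (%rsi), %xmm1
//   pcmpeqb  %xmm1, %xmm0
//   pmovmskb %xmm0, %eax
//   cmpl     $0xffff, %eax     ; all 16 byte-compares true?
//   sete     %al
// i.e. one vector equality test instead of a chain of scalar loads.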
+
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
return ((Val == SM_SentinelUndef) || (Val == CmpVal));
@@ -4555,28 +4732,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
SmallVectorImpl<int> &WidenedMask) {
WidenedMask.assign(Mask.size() / 2, 0);
for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+ int M0 = Mask[i];
+ int M1 = Mask[i + 1];
+
// If both elements are undef, its trivial.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+ if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
WidenedMask[i / 2] = SM_SentinelUndef;
continue;
}
// Check for an undef mask and a mask value properly aligned to fit with
// a pair of values. If we find such a case, use the non-undef mask's value.
- if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
- Mask[i + 1] % 2 == 1) {
- WidenedMask[i / 2] = Mask[i + 1] / 2;
+ if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
+ WidenedMask[i / 2] = M1 / 2;
continue;
}
- if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
// When zeroing, we need to spread the zeroing across both lanes to widen.
- if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
- if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
- (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+ if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
+ if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
+ (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
WidenedMask[i / 2] = SM_SentinelZero;
continue;
}
@@ -4585,9 +4764,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
// Finally check if the two mask values are adjacent and aligned with
// a pair.
- if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
- Mask[i] + 1 == Mask[i + 1]) {
- WidenedMask[i / 2] = Mask[i] / 2;
+ if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
+ WidenedMask[i / 2] = M0 / 2;
continue;
}
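// Worked examples for the cases above (sentinels: undef = -1, zero = -2):
//   <0, 1, 6, 7>    widens to <0, 3>   - adjacent, even-aligned pairs
//   <-1, 3, -2, -1> widens to <1, -2>  - undef pairs with 3; zero spreads
//   <1, 2, ...>     fails - element 1 is odd-aligned within its pair.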
@@ -4608,7 +4786,7 @@ static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
SmallVectorImpl<int> &ScaledMask) {
assert(0 < Scale && "Unexpected scaling factor");
int NumElts = Mask.size();
- ScaledMask.assign(NumElts * Scale, -1);
+ ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
for (int i = 0; i != NumElts; ++i) {
int M = Mask[i];
@@ -4634,14 +4812,10 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
return false;
// The index should be aligned on a vecWidth-bit boundary.
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
-
+ uint64_t Index = N->getConstantOperandVal(1);
MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getScalarSizeInBits();
- bool Result = (Index * ElSize) % vecWidth == 0;
-
- return Result;
+ return (Index * ElSize) % vecWidth == 0;
}
/// Return true if the specified INSERT_SUBVECTOR
@@ -4651,15 +4825,12 @@ static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
return false;
- // The index should be aligned on a vecWidth-bit boundary.
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+ // The index should be aligned on a vecWidth-bit boundary.
+ uint64_t Index = N->getConstantOperandVal(2);
MVT VT = N->getSimpleValueType(0);
unsigned ElSize = VT.getScalarSizeInBits();
- bool Result = (Index * ElSize) % vecWidth == 0;
-
- return Result;
+ return (Index * ElSize) % vecWidth == 0;
}
bool X86::isVINSERT128Index(SDNode *N) {
@@ -4683,13 +4854,9 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
"Illegal extract subvector for VEXTRACT");
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
-
+ uint64_t Index = N->getConstantOperandVal(1);
MVT VecVT = N->getOperand(0).getSimpleValueType();
- MVT ElVT = VecVT.getVectorElementType();
-
- unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
return Index / NumElemsPerChunk;
}
@@ -4698,13 +4865,9 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
"Illegal insert subvector for VINSERT");
- uint64_t Index =
- cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
-
+ uint64_t Index = N->getConstantOperandVal(2);
MVT VecVT = N->getSimpleValueType(0);
- MVT ElVT = VecVT.getVectorElementType();
-
- unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
+ unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
return Index / NumElemsPerChunk;
}
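// Worked example: inserting a v4i32 subvector into a v8i32 at element
// index 4 with vecWidth = 128 gives NumElemsPerChunk = 128 / 32 = 4, so
// the VINSERT immediate is 4 / 4 = 1, i.e. the upper 128-bit lane.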
@@ -4737,9 +4900,9 @@ bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
}
-// Build a vector of constants
+// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
-// Spilt 64-bit constants in the 32-bit mode.
+// Split 64-bit constants in the 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
const SDLoc &dl, bool IsMask = false) {
@@ -4770,9 +4933,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
return ConstsNode;
}
-static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
- assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+ assert(Bits.size() == Undefs.getBitWidth() &&
+ "Unequal constant and undef arrays");
SmallVector<SDValue, 32> Ops;
bool Split = false;
@@ -4844,10 +5008,6 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
VT.getVectorNumElements()/Factor);
- // Extract from UNDEF is UNDEF.
- if (Vec.isUndef())
- return DAG.getUNDEF(ResultVT);
-
// Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
@@ -4858,8 +5018,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
- makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+ return DAG.getBuildVector(
+ ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
@@ -4918,50 +5078,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG, const SDLoc &dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
-
- // For insertion into the zero index (low half) of a 256-bit vector, it is
- // more efficient to generate a blend with immediate instead of an insert*128.
- // We are still creating an INSERT_SUBVECTOR below with an undef node to
- // extend the subvector to the size of the result vector. Make sure that
- // we are not recursing on that node by checking for undef here.
- if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
- !Result.isUndef()) {
- EVT ResultVT = Result.getValueType();
- SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
- SDValue Undef = DAG.getUNDEF(ResultVT);
- SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
- Vec, ZeroIndex);
-
- // The blend instruction, and therefore its mask, depend on the data type.
- MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
- if (ScalarType.isFloatingPoint()) {
- // Choose either vblendps (float) or vblendpd (double).
- unsigned ScalarSize = ScalarType.getSizeInBits();
- assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
- unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
- SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
- return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
- }
-
- const X86Subtarget &Subtarget =
- static_cast<const X86Subtarget &>(DAG.getSubtarget());
-
- // AVX2 is needed for 256-bit integer blend support.
- // Integers must be cast to 32-bit because there is only vpblendd;
- // vpblendw can't be used for this because it has a handicapped mask.
-
- // If we don't have AVX2, then cast to float. Using a wrong domain blend
- // is still more efficient than using the wrong domain vinsertf128 that
- // will be created by InsertSubVector().
- MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
- SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
- Result = DAG.getBitcast(CastVT, Result);
- Vec256 = DAG.getBitcast(CastVT, Vec256);
- Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
- return DAG.getBitcast(ResultVT, Vec256);
- }
-
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
@@ -4971,6 +5087,20 @@ static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
+// Return true if the instruction zeroes the unused upper part of the
+// destination and accepts a mask.
+static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case X86ISD::PCMPEQM:
+ case X86ISD::PCMPGTM:
+ case X86ISD::CMPM:
+ case X86ISD::CMPMU:
+ return true;
+ }
+}
+
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -5003,6 +5133,22 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// 3. Subvector should be inserted in the middle (for example v2i1
// to v16i1, index 2)
+  // If this node widens - by concatenating zeroes - the type of the result
+  // of a node whose instruction zeroes all upper (irrelevant) bits of the
+  // output register, mark this node as legal so it can be replaced with the
+  // v8i1 version of the previous instruction during instruction selection.
+  // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a
+  // k-register while zeroing the remaining upper 60 bits of that register.
+  // If the result of such an instruction is inserted into an all-zeros
+  // vector, the INSERT_SUBVECTOR can safely be removed (in instruction
+  // selection) because the compare instruction already zeroed the rest of
+  // the register.
+ if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
+ (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
+ (SubVec.getOpcode() == ISD::AND &&
+ (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
+ isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
+ return Op;
+
// extend to natively supported kshift
MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
MVT WideOpVT = OpVT;
@@ -5023,7 +5169,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (Vec.isUndef()) {
if (IdxVal != 0) {
SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ ShiftBits);
}
return ExtractSubVec(WideSubVec);
}
@@ -5032,9 +5179,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
return ExtractSubVec(Vec);
}
@@ -5043,8 +5190,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
@@ -5056,12 +5203,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
- WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
+ WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
}
@@ -5094,26 +5241,38 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
}
/// Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
-/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
- SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
"Expected a 128/256/512-bit vector type");
APInt Ones = APInt::getAllOnesValue(32);
unsigned NumElts = VT.getSizeInBits() / 32;
- SDValue Vec;
- if (!Subtarget.hasInt256() && NumElts == 8) {
- Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
- Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
- } else {
- Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
- }
+ SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
return DAG.getBitcast(VT, Vec);
}
+static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
+ SelectionDAG &DAG) {
+ EVT InVT = In.getValueType();
+ assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
+
+ if (VT.is128BitVector() && InVT.is128BitVector())
+ return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
+ : DAG.getZeroExtendVectorInReg(In, DL, VT);
+
+ // For 256-bit vectors, we only need the lower (128-bit) input half.
+ // For 512-bit vectors, we only need the lower input half or quarter.
+ if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
+ int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
+ In = extractSubVector(In, 0, DAG, DL,
+ std::max(128, (int)VT.getSizeInBits() / Scale));
+ }
+
+ return DAG.getNode(Opc, DL, VT, In);
+}
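// Worked example: extending a 256-bit v32i8 input to a 512-bit v16i32
// result has Scale = 32 / 8 = 4, so the code above keeps
// max(128, 512 / 4) = 128 bits - the low v16i8 subvector - before emitting
// the extend, since the extend never reads the upper input bytes anyway.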
+
/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
bool Unary) {
@@ -5199,9 +5358,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
- SmallBitVector &UndefElts,
- SmallVectorImpl<APInt> &EltBits) {
- assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+ APInt &UndefElts,
+ SmallVectorImpl<APInt> &EltBits,
+ bool AllowWholeUndefs = true,
+ bool AllowPartialUndefs = true) {
assert(EltBits.empty() && "Expected an empty EltBits vector");
Op = peekThroughBitcasts(Op);
@@ -5211,174 +5371,173 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
unsigned NumElts = SizeInBits / EltSizeInBits;
- // Extract all the undef/constant element data and pack into single bitsets.
- APInt UndefBits(SizeInBits, 0);
- APInt MaskBits(SizeInBits, 0);
+ // Bitcast a source array of element bits to the target size.
+ auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
+ unsigned NumSrcElts = UndefSrcElts.getBitWidth();
+ unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
+ assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
+ "Constant bit sizes don't match");
+
+ // Don't split if we don't allow undef bits.
+ bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
+ if (UndefSrcElts.getBoolValue() && !AllowUndefs)
+ return false;
+
+ // If we're already the right size, don't bother bitcasting.
+ if (NumSrcElts == NumElts) {
+ UndefElts = UndefSrcElts;
+ EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
+ return true;
+ }
+
+ // Extract all the undef/constant element data and pack into single bitsets.
+ APInt UndefBits(SizeInBits, 0);
+ APInt MaskBits(SizeInBits, 0);
- // Split the undef/constant single bitset data into the target elements.
- auto SplitBitData = [&]() {
- UndefElts = SmallBitVector(NumElts, false);
+ for (unsigned i = 0; i != NumSrcElts; ++i) {
+ unsigned BitOffset = i * SrcEltSizeInBits;
+ if (UndefSrcElts[i])
+ UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+ MaskBits.insertBits(SrcEltBits[i], BitOffset);
+ }
+
+ // Split the undef/constant single bitset data into the target elements.
+ UndefElts = APInt(NumElts, 0);
EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
for (unsigned i = 0; i != NumElts; ++i) {
- APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
- UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+ unsigned BitOffset = i * EltSizeInBits;
+ APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
- // Only treat an element as UNDEF if all bits are UNDEF, otherwise
- // treat it as zero.
+ // Only treat an element as UNDEF if all bits are UNDEF.
if (UndefEltBits.isAllOnesValue()) {
- UndefElts[i] = true;
+ if (!AllowWholeUndefs)
+ return false;
+ UndefElts.setBit(i);
continue;
}
- APInt Bits = MaskBits.lshr(i * EltSizeInBits);
- Bits = Bits.zextOrTrunc(EltSizeInBits);
+ // If only some bits are UNDEF then treat them as zero (or bail if not
+ // supported).
+ if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
+ return false;
+
+ APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
EltBits[i] = Bits.getZExtValue();
}
return true;
};
- auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
- APInt &Undefs) {
+ // Collect constant bits and insert into mask/undef bit masks.
+ auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
+ unsigned UndefBitIndex) {
if (!Cst)
return false;
- unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
if (isa<UndefValue>(Cst)) {
- Mask = APInt::getNullValue(SizeInBits);
- Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+ Undefs.setBit(UndefBitIndex);
return true;
}
if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
- Mask = CInt->getValue().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask = CInt->getValue();
return true;
}
if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
- Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
- Undefs = APInt::getNullValue(SizeInBits);
+ Mask = CFP->getValueAPF().bitcastToAPInt();
return true;
}
return false;
};
+ // Extract constant bits from build vector.
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ const SDValue &Src = Op.getOperand(i);
+ if (Src.isUndef()) {
+ UndefSrcElts.setBit(i);
+ continue;
+ }
+ auto *Cst = cast<ConstantSDNode>(Src);
+ SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+ }
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
// Extract constant bits from constant pool vector.
if (auto *Cst = getTargetConstantFromNode(Op)) {
Type *CstTy = Cst->getType();
if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
return false;
- unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
- for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
- APInt Bits, Undefs;
- if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+ unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
+ unsigned NumSrcElts = CstTy->getVectorNumElements();
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
+ UndefSrcElts, i))
return false;
- MaskBits |= Bits.shl(i * CstEltSizeInBits);
- UndefBits |= Undefs.shl(i * CstEltSizeInBits);
- }
- return SplitBitData();
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
// Extract constant bits from a broadcasted constant pool scalar.
if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+ EltSizeInBits <= VT.getScalarSizeInBits()) {
if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- APInt Bits, Undefs;
- if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
- unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
- unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
- for (unsigned i = 0; i != NumBroadcastElts; ++i) {
- MaskBits |= Bits.shl(i * NumBroadcastBits);
- UndefBits |= Undefs.shl(i * NumBroadcastBits);
- }
- return SplitBitData();
+ unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
+ if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
+ if (UndefSrcElts[0])
+ UndefSrcElts.setBits(0, NumSrcElts);
+ SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
+ return CastBitData(UndefSrcElts, SrcEltBits);
}
}
}
+ // Extract a rematerialized scalar constant insertion.
+ if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
+ Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
+ APInt UndefSrcElts(NumSrcElts, 0);
+ SmallVector<APInt, 64> SrcEltBits;
+ auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
+ SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
+ SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
return false;
}
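// A worked example of the recast above (values assumed): requesting 32-bit
// elements from the v2i64 constant <i64 0x0000000100000002, i64 undef>
// gives EltBits = { 2, 1, 0, 0 } with UndefElts = 0b1100; the whole-undef
// i64 becomes two whole-undef i32 elements, which is only accepted when
// AllowWholeUndefs is true.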
-// TODO: Merge more of this with getTargetConstantBitsFromNode.
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
unsigned MaskEltSizeInBits,
SmallVectorImpl<uint64_t> &RawMask) {
- MaskNode = peekThroughBitcasts(MaskNode);
-
- MVT VT = MaskNode.getSimpleValueType();
- assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
- unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
-
- // Split an APInt element into MaskEltSizeInBits sized pieces and
- // insert into the shuffle mask.
- auto SplitElementToMask = [&](APInt Element) {
- // Note that this is x86 and so always little endian: the low byte is
- // the first byte of the mask.
- int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- for (int i = 0; i < Split; ++i) {
- APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
- Element = Element.lshr(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- };
-
- if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
- if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
- return false;
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
- const APInt &MaskElement = CN->getAPIntValue();
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
- RawMask.push_back(RawElt.getZExtValue());
- }
- }
- return false;
- }
-
- if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
- MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
- SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
- if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
- if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
- RawMask.push_back(CN->getZExtValue());
- RawMask.append(NumMaskElts - 1, 0);
- return true;
- }
-
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
- SplitElementToMask(CN->getAPIntValue());
- RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
- return true;
- }
- }
- return false;
- }
-
- if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
- return false;
-
- // We can always decode if the buildvector is all zero constants,
- // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
- if (all_of(MaskNode->ops(), X86::isZeroNode)) {
- RawMask.append(NumMaskElts, 0);
- return true;
- }
-
- // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
- if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+
+ // Extract the raw target constant bits.
+ // FIXME: We currently don't support UNDEF bits or mask entries.
+ if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
+ EltBits, /* AllowWholeUndefs */ false,
+ /* AllowPartialUndefs */ false))
return false;
- for (SDValue Op : MaskNode->ops()) {
- if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
- SplitElementToMask(CN->getAPIntValue());
- else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
- SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
- else
- return false;
- }
+ // Insert the extracted elements into the mask.
+ for (APInt Elt : EltBits)
+ RawMask.push_back(Elt.getZExtValue());
return true;
}
@@ -5405,6 +5564,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::BLENDI:
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5416,6 +5576,24 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
+ case X86ISD::EXTRQI:
+ if (isa<ConstantSDNode>(N->getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ int BitLen = N->getConstantOperandVal(1);
+ int BitIdx = N->getConstantOperandVal(2);
+ DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
+ IsUnary = true;
+ }
+ break;
+ case X86ISD::INSERTQI:
+ if (isa<ConstantSDNode>(N->getOperand(2)) &&
+ isa<ConstantSDNode>(N->getOperand(3))) {
+ int BitLen = N->getConstantOperandVal(2);
+ int BitIdx = N->getConstantOperandVal(3);
+ DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ }
+ break;
case X86ISD::UNPCKH:
DecodeUNPCKHMask(VT, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -5473,8 +5651,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
IsUnary = true;
break;
case X86ISD::VBROADCAST: {
- // We only decode broadcasts of same-sized vectors at the moment.
- if (N->getOperand(0).getValueType() == VT) {
+ SDValue N0 = N->getOperand(0);
+ // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+ // add the pre-extracted value to the Ops vector.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getOperand(0).getValueType() == VT &&
+ N0.getConstantOperandVal(1) == 0)
+ Ops.push_back(N0.getOperand(0));
+
+ // We only decode broadcasts of same-sized vectors, unless the broadcast
+    // came from an extract of the original-width vector. If we found one, we
+    // pushed it onto the Ops vector above.
+ if (N0.getValueType() == VT || !Ops.empty()) {
DecodeVectorBroadcast(VT, Mask);
IsUnary = true;
break;
@@ -5669,6 +5857,19 @@ static bool setTargetShuffleZeroElements(SDValue N,
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
+ assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+ "Illegal split of shuffle value type");
+ unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
+
+ // Extract known constant input data.
+ APInt UndefSrcElts[2];
+ SmallVector<APInt, 32> SrcEltBits[2];
+ bool IsSrcConstant[2] = {
+ getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
+ SrcEltBits[0], true, false),
+ getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
+ SrcEltBits[1], true, false)};
+
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
int M = Mask[i];
@@ -5677,6 +5878,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
// Determine shuffle input and normalize the mask.
+ unsigned SrcIdx = M / Size;
SDValue V = M < Size ? V1 : V2;
M %= Size;
@@ -5686,39 +5888,27 @@ static bool setTargetShuffleZeroElements(SDValue N,
continue;
}
- // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
- if (V.getOpcode() != ISD::BUILD_VECTOR)
- continue;
-
- // If the BUILD_VECTOR has fewer elements then the (larger) source
- // element must be UNDEF/ZERO.
- // TODO: Is it worth testing the individual bits of a constant?
- if ((Size % V.getNumOperands()) == 0) {
- int Scale = Size / V->getNumOperands();
- SDValue Op = V.getOperand(M / Scale);
- if (Op.isUndef())
+ // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
+ // TODO: We currently only set UNDEF for integer types - floats use the same
+ // registers as vectors and many of the scalar folded loads rely on the
+ // SCALAR_TO_VECTOR pattern.
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ (Size % V.getValueType().getVectorNumElements()) == 0) {
+ int Scale = Size / V.getValueType().getVectorNumElements();
+ int Idx = M / Scale;
+ if (Idx != 0 && !VT.isFloatingPoint())
Mask[i] = SM_SentinelUndef;
- else if (X86::isZeroNode(Op))
+ else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
Mask[i] = SM_SentinelZero;
continue;
}
- // If the BUILD_VECTOR has more elements then all the (smaller) source
- // elements must be all UNDEF or all ZERO.
- if ((V.getNumOperands() % Size) == 0) {
- int Scale = V->getNumOperands() / Size;
- bool AllUndef = true;
- bool AllZero = true;
- for (int j = 0; j < Scale; ++j) {
- SDValue Op = V.getOperand((M * Scale) + j);
- AllUndef &= Op.isUndef();
- AllZero &= X86::isZeroNode(Op);
- }
- if (AllUndef)
+ // Attempt to extract from the source's constant bits.
+ if (IsSrcConstant[SrcIdx]) {
+ if (UndefSrcElts[SrcIdx][M])
Mask[i] = SM_SentinelUndef;
- else if (AllZero)
+ else if (SrcEltBits[SrcIdx][M] == 0)
Mask[i] = SM_SentinelZero;
- continue;
}
}
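
A hedged sketch of the constant-bits classification above, using a plain
uint64_t in place of getTargetConstantBitsFromNode/APInt and assuming the whole
source constant fits in 64 bits: split the constant into EltSizeInBits chunks
and treat a lane as zeroable when its chunk is all zero:

    #include <cstdint>
    #include <vector>

    // Mark lane i zeroable when bits [i*EltSize, (i+1)*EltSize) are all zero.
    std::vector<bool> zeroableFromConstantSketch(uint64_t SrcBits,
                                                 unsigned EltSizeInBits,
                                                 unsigned NumElts) {
      uint64_t EltMask =
          EltSizeInBits >= 64 ? ~0ull : ((1ull << EltSizeInBits) - 1);
      std::vector<bool> Zeroable(NumElts, false);
      for (unsigned i = 0; i != NumElts; ++i)
        Zeroable[i] = ((SrcBits >> (i * EltSizeInBits)) & EltMask) == 0;
      return Zeroable;
    }
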
@@ -5731,7 +5921,8 @@ static bool setTargetShuffleZeroElements(SDValue N,
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
- SmallVectorImpl<SDValue> &Ops) {
+ SmallVectorImpl<SDValue> &Ops,
+ SelectionDAG &DAG) {
Mask.clear();
Ops.clear();
@@ -5744,11 +5935,16 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
unsigned Opcode = N.getOpcode();
switch (Opcode) {
- case ISD::AND: {
+ case ISD::AND:
+ case X86ISD::ANDNP: {
// Attempt to decode as a per-byte mask.
- SmallBitVector UndefElts;
+ APInt UndefElts;
SmallVector<APInt, 32> EltBits;
- if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ bool IsAndN = (X86ISD::ANDNP == Opcode);
+ uint64_t ZeroMask = IsAndN ? 255 : 0;
+ if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
return false;
for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
if (UndefElts[i]) {
@@ -5758,9 +5954,93 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
uint64_t ByteBits = EltBits[i].getZExtValue();
if (ByteBits != 0 && ByteBits != 255)
return false;
- Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
+ Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
+ }
+ Ops.push_back(IsAndN ? N1 : N0);
+ return true;
+ }
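
A standalone sketch of the per-byte AND/ANDNP decode above, with raw bytes
standing in for the constant operand: 0xFF keeps lane i, 0x00 zeroes it, and
any other byte defeats the match. The ZeroMask flip to 255 models ANDNP's
inverted constant; SM_SentinelZeroSketch is a stand-in for SM_SentinelZero:

    #include <cstdint>
    #include <vector>

    bool decodeByteMaskSketch(const std::vector<uint8_t> &MaskBytes, bool IsAndN,
                              std::vector<int> &ShuffleMask) {
      const int SM_SentinelZeroSketch = -2;
      uint8_t ZeroByte = IsAndN ? 0xFF : 0x00;
      for (int i = 0, e = (int)MaskBytes.size(); i != e; ++i) {
        uint8_t B = MaskBytes[i];
        if (B != 0x00 && B != 0xFF)
          return false; // not a pure per-byte mask
        ShuffleMask.push_back(B == ZeroByte ? SM_SentinelZeroSketch : i);
      }
      return true;
    }
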
+ case ISD::SCALAR_TO_VECTOR: {
+ // Match against a scalar_to_vector of an extract from a vector;
+ // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
+ SDValue N0 = N.getOperand(0);
+ SDValue SrcExtract;
+
+ if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ N0.getOperand(0).getValueType() == VT) {
+ SrcExtract = N0;
+ } else if (N0.getOpcode() == ISD::AssertZext &&
+ N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
+ cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
+ SrcExtract = N0.getOperand(0);
+ assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
+ } else if (N0.getOpcode() == ISD::AssertZext &&
+ N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
+ cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
+ SrcExtract = N0.getOperand(0);
+ assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
+ }
+
+ if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
+ return false;
+
+ SDValue SrcVec = SrcExtract.getOperand(0);
+ EVT SrcVT = SrcVec.getValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
+
+ unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+ if (NumSrcElts <= SrcIdx)
+ return false;
+
+ Ops.push_back(SrcVec);
+ Mask.push_back(SrcIdx);
+ Mask.append(NumZeros, SM_SentinelZero);
+ Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
+ return true;
+ }
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: {
+ SDValue InVec = N.getOperand(0);
+ SDValue InScl = N.getOperand(1);
+ uint64_t InIdx = N.getConstantOperandVal(2);
+ assert(InIdx < NumElts && "Illegal insertion index");
+
+ // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
+ if (X86::isZeroNode(InScl)) {
+ Ops.push_back(InVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
+ return true;
}
+
+ // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
+ // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
+ unsigned ExOp =
+ (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
+ if (InScl.getOpcode() != ISD::AssertZext ||
+ InScl.getOperand(0).getOpcode() != ExOp)
+ return false;
+
+ SDValue ExVec = InScl.getOperand(0).getOperand(0);
+ uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
+ assert(ExIdx < NumElts && "Illegal extraction index");
+ Ops.push_back(InVec);
+ Ops.push_back(ExVec);
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
+ return true;
+ }
+ case X86ISD::PACKSS: {
+ // If we know input saturation won't happen we can treat this
+ // as a truncation shuffle.
+ if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
+ DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
+ return false;
+
Ops.push_back(N.getOperand(0));
+ Ops.push_back(N.getOperand(1));
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i * 2);
return true;
}
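
A sketch of the PACKSS-as-truncation mask built just above: when neither input
can saturate, packing two inputs keeps every other sub-element of their
concatenation, i.e. Mask[i] = 2*i over the NumElts output lanes:

    #include <vector>

    std::vector<int> packssTruncMaskSketch(unsigned NumElts) {
      std::vector<int> Mask;
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(2 * i); // take the low half of each wide element
      return Mask;
    }
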
case X86ISD::VSHLI:
@@ -5795,6 +6075,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
}
return true;
}
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VZEXT: {
// TODO - add support for VPMOVZX with smaller input vector types.
SDValue Src = N.getOperand(0);
@@ -5810,36 +6091,39 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return false;
}
+/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
+static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask) {
+ int MaskWidth = Mask.size();
+ SmallVector<SDValue, 16> UsedInputs;
+ for (int i = 0, e = Inputs.size(); i < e; ++i) {
+ int lo = UsedInputs.size() * MaskWidth;
+ int hi = lo + MaskWidth;
+ if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
+ UsedInputs.push_back(Inputs[i]);
+ continue;
+ }
+ for (int &M : Mask)
+ if (lo <= M)
+ M -= MaskWidth;
+ }
+ Inputs = UsedInputs;
+}
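
The same input-pruning logic as resolveTargetShuffleInputsAndMask above, as a
self-contained sketch over std::vector: inputs no mask lane references are
dropped, and indices into every later input are rebased down by one mask width:

    #include <algorithm>
    #include <vector>

    void dropUnusedInputsSketch(std::vector<int> &Inputs /* input ids */,
                                std::vector<int> &Mask) {
      int MaskWidth = (int)Mask.size();
      std::vector<int> UsedInputs;
      for (int i = 0, e = (int)Inputs.size(); i != e; ++i) {
        int lo = (int)UsedInputs.size() * MaskWidth, hi = lo + MaskWidth;
        if (std::any_of(Mask.begin(), Mask.end(),
                        [lo, hi](int M) { return lo <= M && M < hi; })) {
          UsedInputs.push_back(Inputs[i]); // this input is still referenced
          continue;
        }
        for (int &M : Mask) // input i dropped; shift later indices down
          if (lo <= M)
            M -= MaskWidth;
      }
      Inputs = UsedInputs;
    }
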
+
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
-/// Op0/Op1 inputs accordingly.
+/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
- SmallVectorImpl<int> &Mask) {
- SmallVector<SDValue, 2> Ops;
- if (!setTargetShuffleZeroElements(Op, Mask, Ops))
- if (!getFauxShuffleMask(Op, Mask, Ops))
+static bool resolveTargetShuffleInputs(SDValue Op,
+ SmallVectorImpl<SDValue> &Inputs,
+ SmallVectorImpl<int> &Mask,
+ SelectionDAG &DAG) {
+ if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
+ if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
return false;
- int NumElts = Mask.size();
- bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
- return 0 <= Idx && Idx < NumElts;
- });
- bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
-
- Op0 = Op0InUse ? Ops[0] : SDValue();
- Op1 = Op1InUse ? Ops[1] : SDValue();
-
- // We're only using Op1 - commute the mask and inputs.
- if (!Op0InUse && Op1InUse) {
- for (int &M : Mask)
- if (NumElts <= M)
- M -= NumElts;
- Op0 = Op1;
- Op1 = SDValue();
- }
-
+ resolveTargetShuffleInputsAndMask(Inputs, Mask);
return true;
}
@@ -5914,11 +6198,10 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
- unsigned NumNonZero, unsigned NumZero,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (NumNonZero > 8)
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
@@ -5928,18 +6211,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
// SSE4.1 - use PINSRB to insert each byte directly.
if (Subtarget.hasSSE41()) {
for (unsigned i = 0; i < 16; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index, then insert into a zero vector to break any register
+ // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v16i8);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v16i8, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v16i8, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
@@ -5958,24 +6249,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
}
if ((i & 1) != 0) {
+ // FIXME: Investigate extending to i32 instead of just i16.
+ // FIXME: Investigate combining the first 4 bytes as an i32 instead.
SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
if (LastIsNonZero) {
- LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
- MVT::i16, Op.getOperand(i-1));
+ LastElt =
+ DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
}
if (ThisIsNonZero) {
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
- ThisElt, DAG.getConstant(8, dl, MVT::i8));
+ ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
+ DAG.getConstant(8, dl, MVT::i8));
if (LastIsNonZero)
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
} else
ThisElt = LastElt;
- if (ThisElt.getNode())
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i/2, dl));
+ if (ThisElt) {
+ if (1 == i) {
+ V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
+ : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ } else {
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+ DAG.getIntPtrConstant(i / 2, dl));
+ }
+ }
}
}
@@ -5986,27 +6288,34 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
- if (NumNonZero > 4)
+ const X86Subtarget &Subtarget) {
+ if (NumNonZero > 4 && !Subtarget.hasSSE41())
return SDValue();
SDLoc dl(Op);
SDValue V;
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
- bool isNonZero = (NonZeros & (1 << i)) != 0;
- if (isNonZero) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (IsNonZero) {
+ // If the build vector contains zeros or our first insertion is not the
+ // first index, then insert into a zero vector to break any register
+ // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
if (First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
- else
- V = DAG.getUNDEF(MVT::v8i16);
First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
+ }
}
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
- MVT::v8i16, V, Op.getOperand(i),
- DAG.getIntPtrConstant(i, dl));
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
}
@@ -6015,8 +6324,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- const TargetLowering &TLI) {
+ const X86Subtarget &Subtarget) {
// Find all zeroable elements.
std::bitset<4> Zeroable;
for (int i=0; i < 4; ++i) {
@@ -6064,7 +6372,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
Elt = Op->getOperand(EltIdx);
// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
- EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
+ EltMaskIdx = Elt.getConstantOperandVal(1);
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
break;
Mask[EltIdx] = EltIdx;
@@ -6095,8 +6403,7 @@ static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
SDValue SrcVector = Current->getOperand(0);
if (!V1.getNode())
V1 = SrcVector;
- CanFold = SrcVector == V1 &&
- cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
+ CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
}
if (!CanFold)
@@ -6212,7 +6519,8 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
- SDLoc &DL, SelectionDAG &DAG,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
bool isAfterLegalize) {
unsigned NumElems = Elts.size();
@@ -6288,16 +6596,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
-
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain =
- DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(NewLd.getNode(), 1));
- }
-
+ DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
return NewLd;
};
@@ -6317,6 +6616,12 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
return SDValue();
+ // Don't create 256-bit non-temporal aligned loads without AVX2 as these
+ // will lower to regular temporal loads and use the cache.
+ if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
+ VT.is256BitVector() && !Subtarget.hasInt256())
+ return SDValue();
+
if (IsConsecutiveLoad)
return CreateLoad(VT, LDBase);
@@ -6356,19 +6661,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);
-
- // Make sure the newly-created LOAD is in the same position as LDBase in
- // terms of dependency. We create a TokenFactor for LDBase and ResNode,
- // and update uses of LDBase's output chain to use the TokenFactor.
- if (LDBase->hasAnyUseOfValue(1)) {
- SDValue NewChain =
- DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
- SDValue(ResNode.getNode(), 1));
- }
-
+ DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
@@ -6376,22 +6669,22 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
return SDValue();
}
-static Constant *getConstantVector(MVT VT, APInt SplatValue,
+static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
unsigned SplatBitSize, LLVMContext &C) {
unsigned ScalarSize = VT.getScalarSizeInBits();
unsigned NumElm = SplatBitSize / ScalarSize;
SmallVector<Constant *, 32> ConstantVec;
for (unsigned i = 0; i < NumElm; i++) {
- APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+ APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
Constant *Const;
if (VT.isFloatingPoint()) {
- assert((ScalarSize == 32 || ScalarSize == 64) &&
- "Unsupported floating point scalar size");
- if (ScalarSize == 32)
- Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
- else
- Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+ if (ScalarSize == 32) {
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
+ } else {
+ assert(ScalarSize == 64 && "Unsupported floating point scalar size");
+ Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
+ }
} else
Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
ConstantVec.push_back(Const);
@@ -6409,18 +6702,16 @@ static bool isUseOfShuffle(SDNode *N) {
return false;
}
-/// Attempt to use the vbroadcast instruction to generate a splat value for the
-/// following cases:
-/// 1. A splat BUILD_VECTOR which uses:
-/// a. A single scalar load, or a constant.
-/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
-/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
-/// a scalar load, or a constant.
+/// Attempt to use the vbroadcast instruction to generate a splat value
+/// from a splat BUILD_VECTOR which uses:
+/// a. A single scalar load, or a constant.
+/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
-static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
// VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE
// instructions, but there's less potential gain for only 128-bit vectors.
@@ -6479,11 +6770,13 @@ static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget
// AVX has support for 32 and 64-bit broadcast for floats only.
// No 64-bit integer in a 32-bit subtarget.
MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
- Constant *C = SplatBitSize == 32
- ? ConstantFP::get(Type::getFloatTy(*Ctx),
- SplatValue.bitsToFloat())
- : ConstantFP::get(Type::getDoubleTy(*Ctx),
- SplatValue.bitsToDouble());
+ // Lower the splat via APFloat directly, to avoid any conversion.
+ Constant *C =
+ SplatBitSize == 32
+ ? ConstantFP::get(*Ctx,
+ APFloat(APFloat::IEEEsingle(), SplatValue))
+ : ConstantFP::get(*Ctx,
+ APFloat(APFloat::IEEEdouble(), SplatValue));
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
@@ -6664,6 +6957,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
// Quit if non-constant index.
if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
@@ -6694,11 +6988,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
- for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
- unsigned Idx = InsertIndices[i];
+
+ for (unsigned Idx : InsertIndices)
NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
DAG.getIntPtrConstant(Idx, DL));
- }
return NV;
}
@@ -6711,7 +7004,7 @@ static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
SDValue In = Op.getOperand(idx);
if (!In.isUndef())
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
}
SDLoc dl(Op);
MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
@@ -6754,7 +7047,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(In))
NonConstIdx.push_back(idx);
else {
- Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
+ Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
HasConstElts = true;
}
if (SplatIdx < 0)
@@ -6765,9 +7058,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
if (IsSplat)
- return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+ return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
+ DAG.getConstant(1, dl, VT),
+ DAG.getConstant(0, dl, VT));
// insert elements one by one
SDValue DstVec;
@@ -7347,7 +7640,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
(VT == MVT::v8i32 && Subtarget.hasInt256()))
return Op;
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
}
return SDValue();
@@ -7373,7 +7666,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
- if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
+ if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
return BitOp;
@@ -7418,7 +7711,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// a constant pool load than it is to do a movd + shuffle.
if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
(!IsAllConstants || Idx == 0)) {
- if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+ if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
MVT VecVT = MVT::v4i32;
@@ -7523,7 +7816,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
+ if (SDValue LD =
+ EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
return LD;
}
@@ -7561,17 +7855,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16)
if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
if (EVTBits == 16 && NumElems == 8)
if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
- DAG, Subtarget, *this))
+ DAG, Subtarget))
return V;
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
if (EVTBits == 32 && NumElems == 4)
- if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+ if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
return V;
// If element VT is == 32 bits, turn it into a number of shuffles.
@@ -7647,24 +7941,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
// Next, we iteratively mix elements, e.g. for v4f32:
- // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
- // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
- // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
- unsigned EltStride = NumElems >> 1;
- while (EltStride != 0) {
- for (unsigned i = 0; i < EltStride; ++i) {
- // If Ops[i+EltStride] is undef and this is the first round of mixing,
- // then it is safe to just drop this shuffle: V[i] is already in the
- // right place, the one element (since it's the first round) being
- // inserted as undef can be dropped. This isn't safe for successive
- // rounds because they will permute elements within both vectors.
- if (Ops[i+EltStride].isUndef() &&
- EltStride == NumElems/2)
- continue;
-
- Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
- }
- EltStride >>= 1;
+ // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
+ // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
+ // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
+ for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
+ // Generate scaled UNPCKL shuffle mask.
+ SmallVector<int, 16> Mask;
+ for (unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(i);
+ for (unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(NumElems + i);
+ Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
+
+ for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
+ Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2 * i], Ops[(2 * i) + 1], Mask);
}
return Ops[0];
}
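
A sketch of the scaled-UNPCKL mask generated by the loop above, with -1
standing in for SM_SentinelUndef. For v4f32 it reproduces the Step 1/Step 2
comment: Scale 1 gives <0,4,-1,-1> (unpcklps), Scale 2 gives <0,1,4,5>
(unpcklpd):

    #include <vector>

    std::vector<int> scaledUnpacklMaskSketch(unsigned Scale, unsigned NumElems) {
      std::vector<int> Mask;
      for (unsigned i = 0; i != Scale; ++i)
        Mask.push_back(i); // low elements of the first input
      for (unsigned i = 0; i != Scale; ++i)
        Mask.push_back(NumElems + i); // low elements of the second input
      Mask.resize(NumElems, -1); // remaining lanes are undef
      return Mask;
    }
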
@@ -7699,6 +7989,60 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
+// Return true if all the operands of the given CONCAT_VECTORS node are zeros
+// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
+static bool isExpandWithZeros(const SDValue &Op) {
+ assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
+ "Expand with zeros only possible in CONCAT_VECTORS nodes!");
+
+ for (unsigned i = 1; i < Op.getNumOperands(); i++)
+ if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
+ return false;
+
+ return true;
+}
+
+// Returns the source node if the given node is a type promotion (by
+// concatenating i1 zeros) of the result of a node that already zeros all
+// upper bits of a k-register, or SDValue() otherwise.
+static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
+ unsigned Opc = Op.getOpcode();
+
+ assert(Opc == ISD::CONCAT_VECTORS &&
+ Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected node to check for type promotion!");
+
+ // As long as we are concatenating zeros to the upper part of a previous
+ // node's result, climb up the tree until a node with a different opcode is
+ // encountered.
+ while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
+ if (Opc == ISD::INSERT_SUBVECTOR) {
+ if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
+ Op.getConstantOperandVal(2) == 0)
+ Op = Op.getOperand(1);
+ else
+ return SDValue();
+ } else { // Opc == ISD::CONCAT_VECTORS
+ if (isExpandWithZeros(Op))
+ Op = Op.getOperand(0);
+ else
+ return SDValue();
+ }
+ Opc = Op.getOpcode();
+ }
+
+ // Check if the first inserted node zeroes the upper bits, or an 'and' result
+ // of a node that zeros the upper bits (its masked version).
+ if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
+ (Op.getOpcode() == ISD::AND &&
+ (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
+ isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
+ return Op;
+ }
+
+ return SDValue();
+}
+
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
const X86Subtarget &Subtarget,
SelectionDAG & DAG) {
@@ -7709,6 +8053,17 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
assert(isPowerOf2_32(NumOfOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
+ // If this node promotes - by concatenating zeroes - the type of the result
+ // of a node with an instruction that zeroes all upper (irrelevant) bits of
+ // the output register, mark it as legal and catch the pattern in instruction
+ // selection to avoid emitting extra instructions (for zeroing upper bits).
+ if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
+ SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
+ SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
+ ZeroC);
+ }
+
SDValue Undef = DAG.getUNDEF(ResVT);
if (NumOfOperands > 2) {
// Specialize the cases when all, or all but one, of the operands are undef.
@@ -7767,7 +8122,7 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
if (V1.isUndef())
- V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
if (IsZeroV1)
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
@@ -7849,7 +8204,7 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
ArrayRef<int> Mask,
SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
RepeatedMask.assign(LaneSize, -1);
int Size = Mask.size();
for (int i = 0; i < Size; ++i) {
@@ -7956,7 +8311,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
ExpectedBV->getOperand(ExpectedMask[i] % Size))
return false;
}
-}
+ }
return true;
}
@@ -7986,6 +8341,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
return true;
}
+// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
+// mask.
+static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
+ const APInt &Zeroable) {
+ int NumElts = Mask.size();
+ assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
+
+ SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
+ TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
+ }
+ return TargetMask;
+}
+
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+ if (VT != MVT::v8i32 && VT != MVT::v8f32)
+ return false;
+
+ SmallVector<int, 8> Unpcklwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+ /* Unary = */ false);
+ SmallVector<int, 8> Unpckhwd;
+ createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+ /* Unary = */ false);
+ bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
+ isTargetShuffleEquivalent(Mask, Unpckhwd));
+ return IsUnpackwdMask;
+}
+
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -8009,7 +8399,7 @@ static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
return Imm;
}
-static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
+static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
SelectionDAG &DAG) {
return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
@@ -8022,9 +8412,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
-static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- SmallBitVector Zeroable(Mask.size(), false);
+static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2) {
+ APInt Zeroable(Mask.size(), 0);
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
@@ -8039,7 +8429,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int M = Mask[i];
// Handle the easy cases.
if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable[i] = true;
+ Zeroable.setBit(i);
continue;
}
@@ -8057,17 +8447,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
int Scale = Size / V->getNumOperands();
SDValue Op = V.getOperand(M / Scale);
if (Op.isUndef() || X86::isZeroNode(Op))
- Zeroable[i] = true;
+ Zeroable.setBit(i);
else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
APInt Val = Cst->getAPIntValue();
- Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
APInt Val = Cst->getValueAPF().bitcastToAPInt();
- Val = Val.lshr((M % Scale) * ScalarSizeInBits);
+ Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
Val = Val.getLoBits(ScalarSizeInBits);
- Zeroable[i] = (Val == 0);
+ if (Val == 0)
+ Zeroable.setBit(i);
}
continue;
}
@@ -8081,7 +8473,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
SDValue Op = V.getOperand((M * Scale) + j);
AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
}
- Zeroable[i] = AllZeroable;
+ if (AllZeroable)
+ Zeroable.setBit(i);
continue;
}
}
@@ -8096,19 +8489,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
//
// The function looks for a sub-mask in which the nonzero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
-static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
- ArrayRef<int> Mask,const EVT &VectorType,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable,
+ ArrayRef<int> Mask, const EVT &VectorType,
bool &IsZeroSideLeft) {
int NextElement = -1;
// Check if the Mask's nonzero elements are in increasing order.
- for (int i = 0, e = Zeroable.size(); i < e; i++) {
+ for (int i = 0, e = Mask.size(); i < e; i++) {
// Checks if the mask's zero elements are built from only zeros.
- if (Mask[i] == -1)
+ assert(Mask[i] >= -1 && "Out of bound mask element!");
+ if (Mask[i] < 0)
return false;
if (Zeroable[i])
continue;
// Find the lowest non-zero element.
- if (NextElement == -1) {
+ if (NextElement < 0) {
NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
IsZeroSideLeft = NextElement != 0;
}
@@ -8124,7 +8518,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8179,19 +8573,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
const SDLoc &dl);
-// Function convertBitVectorToUnsigned - The function gets SmallBitVector
-// as argument and convert him to unsigned.
-// The output of the function is not(zeroable)
-static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
- unsigned convertBit = 0;
- for (int i = 0, e = Zeroable.size(); i < e; i++)
- convertBit |= !(Zeroable[i]) << i;
- return convertBit;
-}
-
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask, SDValue &V1,
SDValue &V2, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8199,7 +8583,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
IsLeftZeroSide))
return SDValue();
- unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+ unsigned VEXPANDMask = (~Zeroable).getZExtValue();
MVT IntegerType =
MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
@@ -8210,9 +8594,94 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
Subtarget, DAG, DL);
SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
- return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
- DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
- ZeroVector);
+ return DAG.getSelect(DL, VT, VMask,
+ DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+ ZeroVector);
+}
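
A worked example of the (~Zeroable).getZExtValue() step above with plain bits:
the expand instruction's write-mask is exactly the set of lanes NOT forced to
zero, so complementing the zeroable bits (within the lane count) yields the
VEXPAND mask:

    #include <cassert>
    #include <cstdint>

    void expandMaskExampleSketch() {
      unsigned NumElts = 8;
      uint64_t Zeroable = 0x25; // 0b00100101: lanes forced to zero
      uint64_t LaneMask = (1ull << NumElts) - 1;
      uint64_t ExpandMask = ~Zeroable & LaneMask; // lanes taken from the source
      assert(ExpandMask == 0xDA); // 0b11011010
    }
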
+
+static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+
+ bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
+ for (int i = 0; i != NumElts; i += 2) {
+ int M1 = TargetMask[i + 0];
+ int M2 = TargetMask[i + 1];
+ Undef1 &= (SM_SentinelUndef == M1);
+ Undef2 &= (SM_SentinelUndef == M2);
+ Zero1 &= isUndefOrZero(M1);
+ Zero2 &= isUndefOrZero(M2);
+ }
+ assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
+ "Zeroable shuffle detected");
+
+ // Attempt to match the target mask against the unpack lo/hi mask patterns.
+ SmallVector<int, 64> Unpckl, Unpckh;
+ createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+ V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+ return true;
+ }
+
+ // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
+ if (IsUnary && (Zero1 || Zero2)) {
+ // Don't bother if we can blend instead.
+ if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
+ isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
+ return false;
+
+ bool MatchLo = true, MatchHi = true;
+ for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
+ int M = TargetMask[i];
+
+ // Ignore if the input is known to be zero or the index is undef.
+ if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
+ (M == SM_SentinelUndef))
+ continue;
+
+ MatchLo &= (M == Unpckl[i]);
+ MatchHi &= (M == Unpckh[i]);
+ }
+
+ if (MatchLo || MatchHi) {
+ UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+ return true;
+ }
+ }
+
+ // If a binary shuffle, commute and try again.
+ if (!IsUnary) {
+ ShuffleVectorSDNode::commuteMask(Unpckl);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+ UnpackOpcode = X86ISD::UNPCKL;
+ std::swap(V1, V2);
+ return true;
+ }
+
+ ShuffleVectorSDNode::commuteMask(Unpckh);
+ if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+ UnpackOpcode = X86ISD::UNPCKH;
+ std::swap(V1, V2);
+ return true;
+ }
+ }
+
+ return false;
}
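
The lo/hi unpack masks this matcher compares against, sketched standalone and
ignoring the per-128-bit-lane structure that createUnpackShuffleMask handles
for wider vectors: UNPCKL interleaves the low halves of the two inputs, UNPCKH
the high halves (e.g. NumElts = 4, binary: lo -> <0,4,1,5>, hi -> <2,6,3,7>):

    #include <vector>

    std::vector<int> unpackMaskSketch(unsigned NumElts, bool Lo, bool Unary) {
      std::vector<int> Mask;
      unsigned Base = Lo ? 0 : NumElts / 2;
      for (unsigned i = 0; i != NumElts / 2; ++i) {
        Mask.push_back(Base + i);                         // from V1
        Mask.push_back(Base + i + (Unary ? 0 : NumElts)); // from V2 (or V1)
      }
      return Mask;
    }
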
// X86 has dedicated unpack instructions that can handle specific blend
@@ -8248,13 +8717,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(!VT.isFloatingPoint() && "Floating point types are not supported");
MVT EltVT = VT.getVectorElementType();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
SDValue V;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -8286,10 +8754,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
SelectionDAG &DAG) {
assert(VT.isInteger() && "Only supports integer vector types!");
MVT EltVT = VT.getVectorElementType();
- int NumEltBits = EltVT.getSizeInBits();
SDValue Zero = DAG.getConstant(0, DL, EltVT);
- SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
- EltVT);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
SmallVector<SDValue, 16> MaskOps;
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
@@ -8307,51 +8773,81 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
-/// \brief Try to emit a blend instruction for a shuffle.
-///
-/// This doesn't do any checks for the availability of instructions for blending
-/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
-/// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Original,
- const SmallBitVector &Zeroable,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
- SmallVector<int, 8> Mask(Original.begin(), Original.end());
- bool ForceV1Zero = false, ForceV2Zero = false;
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
+static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> TargetMask,
+ bool &ForceV1Zero, bool &ForceV2Zero,
+ uint64_t &BlendMask) {
+ bool V1IsZeroOrUndef =
+ V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZeroOrUndef =
+ V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
+
+ BlendMask = 0;
+ ForceV1Zero = false, ForceV2Zero = false;
+ assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
// Attempt to generate the binary blend mask. If an input is zero then
// we can use any lane.
// TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
- unsigned BlendMask = 0;
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- if (M < 0)
+ for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
+ int M = TargetMask[i];
+ if (M == SM_SentinelUndef)
continue;
if (M == i)
continue;
if (M == i + Size) {
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
continue;
}
- if (Zeroable[i]) {
- if (V1IsZero) {
+ if (M == SM_SentinelZero) {
+ if (V1IsZeroOrUndef) {
ForceV1Zero = true;
- Mask[i] = i;
+ TargetMask[i] = i;
continue;
}
- if (V2IsZero) {
+ if (V2IsZeroOrUndef) {
ForceV2Zero = true;
- BlendMask |= 1u << i;
- Mask[i] = i + Size;
+ BlendMask |= 1ull << i;
+ TargetMask[i] = i + Size;
continue;
}
}
- return SDValue(); // Shuffled input!
+ return false;
}
+ return true;
+}
+
+uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
+ uint64_t ScaledMask = 0;
+ for (int i = 0; i != Size; ++i)
+ if (BlendMask & (1ull << i))
+ ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
+ return ScaledMask;
+}
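
A worked example of scaleVectorShuffleBlendMask above: widening a blend from
4 lanes to 8 (Scale = 2) replicates each selected bit Scale times:

    #include <cassert>
    #include <cstdint>

    void blendScaleExampleSketch() {
      uint64_t BlendMask = 0x5; // 0b0101 over 4 lanes
      int Size = 4, Scale = 2;
      uint64_t ScaledMask = 0;
      for (int i = 0; i != Size; ++i)
        if (BlendMask & (1ull << i))
          ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
      assert(ScaledMask == 0x33); // 0b00110011 over 8 lanes
    }
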
+
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Original,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
+
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
+ BlendMask))
+ return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
if (ForceV1Zero)
@@ -8359,15 +8855,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (ForceV2Zero)
V2 = getZeroVector(VT, Subtarget, DAG, DL);
- auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
- unsigned ScaledMask = 0;
- for (int i = 0; i != Size; ++i)
- if (BlendMask & (1u << i))
- for (int j = 0; j != Scale; ++j)
- ScaledMask |= 1u << (i * Scale + j);
- return ScaledMask;
- };
-
switch (VT.SimpleTy) {
case MVT::v2f64:
case MVT::v4f32:
@@ -8387,7 +8874,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2()) {
// Scale the blend by the number of 32-bit dwords per element.
int Scale = VT.getScalarSizeInBits() / 32;
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
@@ -8400,7 +8887,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
// For integer shuffles we need to expand the mask and cast the inputs to
// v8i16s prior to blending.
int Scale = 8 / VT.getVectorNumElements();
- BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+ BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
V1 = DAG.getBitcast(MVT::v8i16, V1);
V2 = DAG.getBitcast(MVT::v8i16, V2);
return DAG.getBitcast(VT,
@@ -8417,7 +8904,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
BlendMask = 0;
for (int i = 0; i < 8; ++i)
if (RepeatedMask[i] >= 8)
- BlendMask |= 1u << i;
+ BlendMask |= 1ull << i;
return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
DAG.getConstant(BlendMask, DL, MVT::i8));
}
@@ -8428,6 +8915,13 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
"256-bit byte-blends require AVX2 support!");
+ if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
+ }
+
// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
if (SDValue Masked =
lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
@@ -8462,10 +8956,21 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
V1 = DAG.getBitcast(BlendVT, V1);
V2 = DAG.getBitcast(BlendVT, V2);
return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
- DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
+ VT,
+ DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
+ V1, V2));
+ }
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v8i64:
+ case MVT::v16i32:
+ case MVT::v32i16:
+ case MVT::v64i8: {
+ MVT IntegerType =
+ MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+ SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+ return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
-
default:
llvm_unreachable("Not a supported integer vector type!");
}
@@ -8503,7 +9008,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
-/// \brief Generic routine to decompose a shuffle and blend into indepndent
+/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
@@ -8757,7 +9262,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
unsigned ScalarSizeInBits,
ArrayRef<int> Mask, int MaskOffset,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget) {
int Size = Mask.size();
unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -8819,7 +9324,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Size = Mask.size();
@@ -8852,132 +9357,145 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getBitcast(VT, V);
}
-/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
- SelectionDAG &DAG) {
+// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+// Remainder of lower half result is zero and upper half is all undef.
+static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+ assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
- return SDValue();
+ return false;
- // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
- // Remainder of lower half result is zero and upper half is all undef.
- auto LowerAsEXTRQ = [&]() {
- // Determine the extraction length from the part of the
- // lower half that isn't zeroable.
- int Len = HalfSize;
- for (; Len > 0; --Len)
- if (!Zeroable[Len - 1])
- break;
- assert(Len > 0 && "Zeroable shuffle mask");
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len > 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
- // Attempt to match first Len sequential elements from the lower half.
- SDValue Src;
- int Idx = -1;
- for (int i = 0; i != Len; ++i) {
- int M = Mask[i];
- if (M < 0)
- continue;
- SDValue &V = (M < Size ? V1 : V2);
- M = M % Size;
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
- // The extracted elements must start at a valid index and all mask
- // elements must be in the lower half.
- if (i > M || M >= HalfSize)
- return SDValue();
+ // The extracted elements must start at a valid index and all mask
+ // elements must be in the lower half.
+ if (i > M || M >= HalfSize)
+ return false;
- if (Idx < 0 || (Src == V && Idx == (M - i))) {
- Src = V;
- Idx = M - i;
- continue;
- }
- return SDValue();
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
}
+ return false;
+ }
- if (Idx < 0)
- return SDValue();
+ if (!Src || Idx < 0)
+ return false;
- assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
- int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
- int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
- return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
- };
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Src;
+ return true;
+}
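
A sketch of the EXTRQI immediates computed above: EXTRQ encodes a bit length
and a bit index, each in 6 bits, hence the "& 0x3f" masking of the element
counts scaled by the element size (e.g. extracting 3 x i16 from index 1 gives
BitLen = 48, BitIdx = 16):

    #include <cstdint>

    void extrqImmSketch(int Len, int Idx, unsigned ScalarSizeInBits,
                        uint64_t &BitLen, uint64_t &BitIdx) {
      BitLen = (uint64_t)(Len * ScalarSizeInBits) & 0x3f;
      BitIdx = (uint64_t)(Idx * ScalarSizeInBits) & 0x3f;
    }
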
+
+// INSERTQ: Extract lowest Len elements from lower half of second source and
+// insert over first source, starting at Idx.
+// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx) {
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
- if (SDValue ExtrQ = LowerAsEXTRQ())
- return ExtrQ;
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return false;
+
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
- // INSERTQ: Extract lowest Len elements from lower half of second source and
- // insert over first source, starting at Idx.
- // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
- auto LowerAsInsertQ = [&]() {
- for (int Idx = 0; Idx != HalfSize; ++Idx) {
- SDValue Base;
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
- // Attempt to match first source from mask before insertion point.
- if (isUndefInRange(Mask, 0, Idx)) {
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
+
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
+
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
- } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
- } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
Base = V2;
} else {
continue;
}
- // Extend the extraction length looking to match both the insertion of
- // the second source and the remaining elements of the first.
- for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
- SDValue Insert;
- int Len = Hi - Idx;
-
- // Match insertion.
- if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
- Insert = V1;
- } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
- Insert = V2;
- } else {
- continue;
- }
-
- // Match the remaining elements of the lower half.
- if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
- /* EMPTY */
- } else if ((!Base || (Base == V1)) &&
- isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
- Base = V1;
- } else if ((!Base || (Base == V2)) &&
- isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
- Size + Hi)) {
- Base = V2;
- } else {
- continue;
- }
-
- // We may not have a base (first source) - this can safely be undefined.
- if (!Base)
- Base = DAG.getUNDEF(VT);
-
- int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
- int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
- return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
- }
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Base;
+ V2 = Insert;
+ return true;
}
+ }
- return SDValue();
- };
+ return false;
+}
- if (SDValue InsertQ = LowerAsInsertQ())
- return InsertQ;
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SelectionDAG &DAG) {
+ uint64_t BitLen, BitIdx;
+ if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+
+ if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
+ V2 ? V2 : DAG.getUNDEF(VT),
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
@@ -8987,7 +9505,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
-/// begin and can start from an offseted element index in the input; to
+/// begin and can start from an offsetted element index in the input; to
/// avoid excess shuffling, the offset must either be in the bottom lane
/// or at the start of a higher lane. All extended elements must be from
/// the same lane.
@@ -9027,21 +9545,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
// Found a valid zext mask! Try various lowering strategies based on the
// input type and available ISA extensions.
if (Subtarget.hasSSE41()) {
- // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+ // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
// PUNPCK will catch this in a later shuffle match.
if (Offset && Scale == 2 && VT.is128BitVector())
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
InputV = ShuffleOffset(InputV);
-
- // For 256-bit vectors, we only need the lower (128-bit) input half.
- // For 512-bit vectors, we only need the lower input half or quarter.
- if (VT.getSizeInBits() > 128)
- InputV = extractSubVector(InputV, 0, DAG, DL,
- std::max(128, (int)VT.getSizeInBits() / Scale));
-
- InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
+ InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}
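The new getExtendInVec helper folds in the subvector shrinking that the
deleted lines did by hand. A minimal sketch of what it is assumed to do
(mirroring the removed code; extractSubVector as used elsewhere in this file):

    static SDValue getExtendInVecSketch(unsigned Opc, const SDLoc &DL, MVT ExtVT,
                                        SDValue In, SelectionDAG &DAG) {
      // Only the low half/quarter of a 256/512-bit input feeds the extension.
      int Scale = ExtVT.getScalarSizeInBits() /
                  In.getSimpleValueType().getScalarSizeInBits();
      if (In.getValueSizeInBits() > 128)
        In = extractSubVector(In, 0, DAG, DL,
                              std::max(128, (int)In.getValueSizeInBits() / Scale));
      return DAG.getNode(Opc, DL, ExtVT, In);
    }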
@@ -9158,7 +9669,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
int Bits = VT.getSizeInBits();
int NumLanes = Bits / 128;
@@ -9314,7 +9825,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+ const APInt &Zeroable, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
@@ -9469,7 +9980,6 @@ static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
-/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
@@ -9582,17 +10092,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
-
- // Make sure the newly-created LOAD is in the same position as Ld in
- // terms of dependency. We create a TokenFactor for Ld and V,
- // and update uses of Ld's output chain to use the TokenFactor.
- if (Ld->hasAnyUseOfValue(1)) {
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(Ld, 1), SDValue(V.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
- DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
- SDValue(V.getNode(), 1));
- }
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
} else if (!BroadcastFromReg) {
// We can't broadcast from a vector register.
return SDValue();
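The makeEquivalentMemoryOrdering call replaces the chain bookkeeping deleted
above; a sketch of what it is assumed to encapsulate (taken from the removed
lines), making prior users of Ld's chain also order against the new load:

    if (Ld->hasAnyUseOfValue(1)) {
      SDValue OldChain = SDValue(Ld, 1);
      SDValue NewChain = SDValue(V.getNode(), 1);
      // Join the chains, then repair the TokenFactor's own operand that the
      // replace-all-uses step just rewrote.
      SDValue Token =
          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OldChain, NewChain);
      DAG.ReplaceAllUsesOfValueWith(OldChain, Token);
      DAG.UpdateNodeOperands(Token.getNode(), OldChain, NewChain);
    }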
@@ -9612,7 +10112,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
if (((BroadcastIdx * EltSize) % 128) != 0)
return SDValue();
- MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+ // The shuffle input might have been a bitcast we looked through; look at
+ // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
+ // later bitcast it to BroadcastVT.
+ MVT SrcVT = V.getSimpleValueType();
+ assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ "Unexpected vector element size");
+ assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
+ "Unexpected vector size");
+
+ MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
DAG.getIntPtrConstant(BroadcastIdx, DL));
}
@@ -9642,6 +10151,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
}
+ // We only support broadcasting from 128-bit vectors to minimize the
+ // number of patterns we need to deal with in isel. So extract down to
+ // 128-bits.
+ if (SrcVT.getSizeInBits() > 128)
+ V = extract128BitVector(V, 0, DAG, DL);
+
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
@@ -9653,7 +10168,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
unsigned &InsertPSMask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
ArrayRef<int> Mask,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
@@ -9742,7 +10257,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SelectionDAG &DAG) {
assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
@@ -9877,7 +10392,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -9959,7 +10474,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10178,7 +10693,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10261,7 +10776,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -10353,7 +10868,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We implement this with SHUFPS because it can blend from two vectors.
// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
- // up the inputs, bypassing domain shift penalties that we would encur if we
+ // up the inputs, bypassing domain shift penalties that we would incur if we
// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// relevant.
SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
@@ -10384,18 +10899,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
- assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
+ assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
MutableArrayRef<int> LoMask = Mask.slice(0, 4);
MutableArrayRef<int> HiMask = Mask.slice(4, 4);
SmallVector<int, 4> LoInputs;
- std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0; });
+ copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 0; });
+ copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
int NumLToL =
@@ -10530,9 +11043,10 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
"We need to be changing the number of flipped inputs!");
int PSHUFHalfMask[] = {0, 1, 2, 3};
std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
- V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
- MVT::v8i16, V,
- getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
+ V = DAG.getNode(
+ FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
+ MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
+ getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
for (int &M : Mask)
if (M >= 0 && M == FixIdx)
@@ -10574,7 +11088,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
};
if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
- else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+ if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
// At this point there are at most two inputs to the low and high halves from
@@ -10830,7 +11344,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+ const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
bool &V2InUse) {
SDValue V1Mask[16];
SDValue V2Mask[16];
@@ -10891,7 +11405,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11075,7 +11589,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11132,14 +11646,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (!canWidenViaDuplication(Mask))
return SDValue();
SmallVector<int, 4> LoInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
- [](int M) { return M >= 0 && M < 8; });
+ copy_if(Mask, std::back_inserter(LoInputs),
+ [](int M) { return M >= 0 && M < 8; });
std::sort(LoInputs.begin(), LoInputs.end());
LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
LoInputs.end());
SmallVector<int, 4> HiInputs;
- std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
- [](int M) { return M >= 8; });
+ copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
std::sort(HiInputs.begin(), HiInputs.end());
HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
HiInputs.end());
@@ -11193,7 +11706,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
PostDupI16Shuffle[i / 2] = MappedMask;
else
assert(PostDupI16Shuffle[i / 2] == MappedMask &&
- "Conflicting entrties in the original shuffle!");
+ "Conflicting entries in the original shuffle!");
}
return DAG.getBitcast(
MVT::v16i8,
@@ -11365,7 +11878,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
switch (VT.SimpleTy) {
@@ -11621,7 +12134,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SmallVector<int, 4> WidenedMask;
@@ -11647,18 +12160,22 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
+ // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
- MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() / 2);
- SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
- DAG.getIntPtrConstant(0, DL));
- SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
+ // this will likely become vinsertf128 which can't fold a 256-bit memop.
+ if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
}
}
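A sketch of the load-folding concern behind the new guard (an assumption about
the eventually selected code, Intel syntax, shown for the mask <0,1,0,1>):

    // split form: the load must pass through a register first
    //   vmovaps     xmm1, xmmword ptr [rdi]
    //   vinsertf128 ymm0, ymm1, xmm1, 1
    // vperm2f128 form: the full 256-bit memory operand folds directly
    //   vperm2f128  ymm0, ymm0, ymmword ptr [rdi], 0x22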
@@ -12091,7 +12608,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
unsigned &ShuffleImm,
ArrayRef<int> Mask) {
int NumElts = VT.getVectorNumElements();
- assert(VT.getScalarType() == MVT::f64 &&
+ assert(VT.getScalarSizeInBits() == 64 &&
(NumElts == 2 || NumElts == 4 || NumElts == 8) &&
"Unexpected data type for VSHUFPD");
@@ -12127,6 +12644,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
+ assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
+ "Unexpected data type for VSHUFPD");
+
unsigned Immediate = 0;
if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
@@ -12153,7 +12673,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12250,7 +12770,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12338,7 +12858,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
/// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12414,6 +12934,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+ // For non-AVX512 targets, if the mask is an in-lane 16-bit unpack pattern,
+ // try to split, since the split form uses vpunpcklwd/vpunpckhwd and is more
+ // efficient than vblend.
+ if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+ if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
+ Mask, DAG))
+ return V;
+
// If we have AVX2 then we always want to lower with a blend because at v8 we
// can fully permute the elements.
if (Subtarget.hasAVX2())
@@ -12429,7 +12957,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v8i32 shuffling.
static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12445,6 +12973,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
return ZExt;
+ // For non-AVX512 targets, if the mask is an in-lane 16-bit unpack pattern,
+ // try to split, since the split form uses vpunpcklwd/vpunpckhwd and is more
+ // efficient than vblend.
+ if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+ !Subtarget.hasAVX512())
+ if (SDValue V =
+ lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
+ return V;
+
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
@@ -12533,7 +13070,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v16i16 shuffling.
static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12619,7 +13156,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v32i8 shuffling.
static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12692,7 +13229,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
// If we have a single input to the zero element, insert that into V1 if we
@@ -12844,7 +13381,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12891,12 +13428,16 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
-static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12925,6 +13466,10 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
return Unpck;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
@@ -12938,7 +13483,7 @@ static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -12994,12 +13539,16 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13062,12 +13611,15 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
V1, V2, DAG, Subtarget))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13109,12 +13661,16 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
}
}
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
}
/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
SDValue V1, SDValue V2,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -13159,6 +13715,10 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
return V;
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Blend;
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -13170,7 +13730,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// together based on the available instructions.
static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
MVT VT, SDValue V1, SDValue V2,
- const SmallBitVector &Zeroable,
+ const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert(Subtarget.hasAVX512() &&
@@ -13251,7 +13811,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (ISD::isBuildVectorAllZeros(V1.getNode()))
V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V1.getNode()))
- V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V1 = getOnesVector(ExtVT, DAG, DL);
else
V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
@@ -13260,7 +13820,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
else if (ISD::isBuildVectorAllZeros(V2.getNode()))
V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
else if (ISD::isBuildVectorAllOnes(V2.getNode()))
- V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+ V2 = getOnesVector(ExtVT, DAG, DL);
else
V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
@@ -13392,8 +13952,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
- if (Zeroable.all())
+ APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+ if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
// Try to collapse shuffles into using a vector type with fewer elements but
@@ -13474,6 +14034,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
+ // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+ // with patterns on the mask registers on AVX-512.
+ if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+ return Op;
+
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
@@ -13483,10 +14048,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
+ // into an i1 condition so that we can use the mask-based 512-bit blend
+ // instructions.
+ if (VT.getSizeInBits() == 512) {
+ SDValue Cond = Op.getOperand(0);
+ // The vNi1 condition case should be handled above as it can be trivially
+ // lowered.
+ assert(Cond.getValueType().getScalarSizeInBits() ==
+ VT.getScalarSizeInBits() &&
+ "Should have a size-matched integer condition!");
+ // Build a mask by testing the condition against itself (tests for zero).
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+ SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+ // Now return a new VSELECT using the mask.
+ return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+ }
+
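A scalar model of the mask built here (a sketch): X86ISD::TESTM is assumed to
AND its operands and set each i1 lane when the result is nonzero, so testing
Cond against itself yields an "element is nonzero" mask:

    #include <cstdint>
    static bool testmLane(uint64_t CondElt) {
      return (CondElt & CondElt) != 0; // reduces to CondElt != 0
    }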
// Only some types will be legal on some subtargets. If we can emit a legal
// VSELECT-matching blend, return Op; but if we need to expand, return
// a null value.
- switch (Op.getSimpleValueType().SimpleTy) {
+ switch (VT.SimpleTy) {
default:
// Most of the vector types have blends past SSE4.1.
return Op;
@@ -13564,15 +14149,18 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
SDValue Idx = Op.getOperand(1);
MVT EltVT = Op.getSimpleValueType();
- assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
"Unexpected vector type in ExtractBitFromMaskVector");
// A variable index can't be handled in mask registers;
- // extend vector to VR512
+ // extend vector to VR512/128
if (!isa<ConstantSDNode>(Idx)) {
- MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
- SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
+ // than extending to 128/256-bit.
+ unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtVT.getVectorElementType(), Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
@@ -13590,12 +14178,12 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
}
unsigned MaxSift = VecVT.getVectorNumElements() - 1;
if (MaxSift - IdxVal)
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(MaxSift, dl, MVT::i8));
- return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
- DAG.getIntPtrConstant(0, dl));
+ return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
+ DAG.getIntPtrConstant(0, dl));
}
SDValue
@@ -13606,28 +14194,40 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
- if (Op.getSimpleValueType() == MVT::i1)
+ if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG);
if (!isa<ConstantSDNode>(Idx)) {
- if (VecVT.is512BitVector() ||
- (VecVT.is256BitVector() && Subtarget.hasInt256() &&
- VecVT.getScalarSizeInBits() == 32)) {
-
- MVT MaskEltVT =
- MVT::getIntegerVT(VecVT.getScalarSizeInBits());
- MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
- MaskEltVT.getSizeInBits());
+ // It's more profitable to go through memory (1 cycle throughput)
+ // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
+ // The IACA tool was used to get the performance estimates
+ // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
+ //
+ // Example: extractelement <16 x i8> %a, i32 %i
+ //
+ // Block Throughput: 3.00 Cycles
+ // Throughput Bottleneck: Port5
+ //
+ // | Num Of | Ports pressure in cycles | |
+ // | Uops | 0 - DV | 5 | 6 | 7 | |
+ // ---------------------------------------------
+ // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
+ // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
+ // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
+ // Total Num Of Uops: 4
+ //
+ //
+ // Block Throughput: 1.00 Cycles
+ // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+ //
+ // | | Ports pressure in cycles | |
+ // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
+ // ---------------------------------------------------------
+ // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+ // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
+ // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
+ // Total Num Of Uops: 4
- Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
- auto PtrVT = getPointerTy(DAG.getDataLayout());
- SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
- getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
- DAG.getConstant(0, dl, PtrVT));
- SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
- DAG.getConstant(0, dl, PtrVT));
- }
return SDValue();
}
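What the SDValue() fallback amounts to after legalization (a sketch of the
spill-to-stack sequence that the second throughput table measures; the helper
name is hypothetical):

    #include <cstdint>
    #include <immintrin.h>
    static uint8_t extractVarByte(__m128i V, unsigned Idx) {
      alignas(16) uint8_t Buf[16];
      _mm_store_si128(reinterpret_cast<__m128i *>(Buf), V); // vmovaps to stack
      return Buf[Idx & 15];                                 // one byte load
    }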
@@ -13675,7 +14275,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
return Res;
- // TODO: handle v16i8.
+ // TODO: We only extract a single element from v16i8; we can probably afford
+ // to be more aggressive here before using the default approach of spilling
+ // to the stack.
+ if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+ // Extract either the lowest i32 or any i16, and extract the sub-byte.
+ int DWordIdx = IdxVal / 4;
+ if (DWordIdx == 0) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec),
+ DAG.getIntPtrConstant(DWordIdx, dl));
+ int ShiftVal = (IdxVal % 4) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i32));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
+
+ int WordIdx = IdxVal / 2;
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+ DAG.getBitcast(MVT::v8i16, Vec),
+ DAG.getIntPtrConstant(WordIdx, dl));
+ int ShiftVal = (IdxVal % 2) * 8;
+ if (ShiftVal != 0)
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+ DAG.getConstant(ShiftVal, dl, MVT::i16));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ }
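A scalar model of the constant-index path above (a sketch assuming
little-endian lane layout): read the containing i32 or i16 lane, shift the
byte down, then truncate:

    #include <cstdint>
    #include <cstring>
    static uint8_t extractByteModel(const uint8_t (&Vec)[16], unsigned IdxVal) {
      if (IdxVal / 4 == 0) { // byte lives in dword 0
        uint32_t DWord;
        std::memcpy(&DWord, &Vec[0], 4);             // extractelt v4i32, 0
        return uint8_t(DWord >> ((IdxVal % 4) * 8)); // SRL + TRUNCATE
      }
      uint16_t Word; // otherwise extract the containing word
      std::memcpy(&Word, &Vec[(IdxVal / 2) * 2], 2); // extractelt v8i16, Idx/2
      return uint8_t(Word >> ((IdxVal % 2) * 8));    // SRL + TRUNCATE
    }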
if (VT.getSizeInBits() == 32) {
if (IdxVal == 0)
@@ -13734,31 +14360,35 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if(Vec.isUndef()) {
if (IdxVal)
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return EltInVec;
}
- // Insertion of one bit into first or last position
- // can be done with two SHIFTs + OR.
+ // Insertion of one bit into first position
if (IdxVal == 0 ) {
- // EltInVec already at correct index and other bits are 0.
+ // Clean top bits of vector.
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
+ EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
// Clean the first bit in source vector.
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
+ // Insertion of one bit into last position
if (IdxVal == NumElems -1) {
// Move the bit to the last position inside the vector.
- EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+ EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Clean the last bit in the source vector.
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
DAG.getConstant(1, dl, MVT::i8));
- Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
DAG.getConstant(1 , dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
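A scalar model of the two KSHIFT patterns above, with the k register modeled
as the low 16 bits of an integer (a sketch for a 16-element mask):

    #include <cstdint>
    static uint16_t insertMaskBit(uint16_t Vec, uint16_t EltInVec,
                                  unsigned IdxVal) {
      const unsigned N = 16; // mask width in elements
      if (IdxVal == 0) {
        // KSHIFTL then KSHIFTR keeps only bit 0 of the element.
        EltInVec = uint16_t(uint16_t(EltInVec << (N - 1)) >> (N - 1));
        Vec = uint16_t(uint16_t(Vec >> 1) << 1); // clear bit 0 of the source
        return uint16_t(Vec | EltInVec);
      }
      if (IdxVal == N - 1) {
        EltInVec = uint16_t(EltInVec << IdxVal); // move the bit to the top
        Vec = uint16_t(uint16_t(Vec << 1) >> 1); // clear the top source bit
        return uint16_t(Vec | EltInVec);
      }
      return Vec; // other indices take a different path in the lowering
    }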
@@ -13790,17 +14420,20 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
auto *N2C = cast<ConstantSDNode>(N2);
unsigned IdxVal = N2C->getZExtValue();
- // If we are clearing out a element, we do this more efficiently with a
- // blend shuffle than a costly integer insertion.
- // TODO: would other rematerializable values (e.g. allbits) benefit as well?
- // TODO: pre-SSE41 targets will tend to use bit masking - this could still
- // be beneficial if we are inserting several zeros and can combine the masks.
- if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
- SmallVector<int, 8> ClearMask;
+ bool IsZeroElt = X86::isZeroNode(N1);
+ bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+ // If we are inserting an element, see if we can do this more efficiently
+ // with a blend shuffle against a rematerializable vector rather than a
+ // costly integer insertion.
+ if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
+ 16 <= EltVT.getSizeInBits()) {
+ SmallVector<int, 8> BlendMask;
for (unsigned i = 0; i != NumElts; ++i)
- ClearMask.push_back(i == IdxVal ? i + NumElts : i);
- SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+ BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+ SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+ : DAG.getConstant(-1, dl, VT);
+ return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
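A worked example of the mask built above (a sketch): for VT = v8i16 and
IdxVal = 3, lane 3 gets mask value 3 + 8 = 11, i.e. it is taken from the
constant vector:

    // BlendMask = {0, 1, 2, 11, 4, 5, 6, 7}
    // -> shuffle(N0, CstVector): lanes 0-2 and 4-7 come from N0, lane 3 from
    //    the all-zeros (or all-ones) vector, avoiding a GPR round-trip.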
// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
@@ -13837,25 +14470,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
- if (Subtarget.hasSSE41()) {
- if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
- unsigned Opc;
- if (VT == MVT::v8i16) {
- Opc = X86ISD::PINSRW;
- } else {
- assert(VT == MVT::v16i8);
- Opc = X86ISD::PINSRB;
- }
-
- // Transform it so it match pinsr{b,w} which expects a GR32 as its second
- // argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
+ // argument. SSE41 is required for pinsrb.
+ if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ unsigned Opc;
+ if (VT == MVT::v8i16) {
+ assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
+ Opc = X86ISD::PINSRW;
+ } else {
+ assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
+ assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
+ Opc = X86ISD::PINSRB;
}
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getIntPtrConstant(IdxVal, dl);
+ return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+ }
+
+ if (Subtarget.hasSSE41()) {
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
@@ -13885,36 +14520,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
}
- if (EltVT == MVT::i32 || EltVT == MVT::i64) {
- // PINSR* works with constant index.
+ // PINSR* works with constant index.
+ if (EltVT == MVT::i32 || EltVT == MVT::i64)
return Op;
- }
}
- if (EltVT == MVT::i8)
- return SDValue();
-
- if (EltVT.getSizeInBits() == 16) {
- // Transform it so it match pinsrw which expects a 16-bit value in a GR32
- // as its second argument.
- if (N1.getValueType() != MVT::i32)
- N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
- if (N2.getValueType() != MVT::i32)
- N2 = DAG.getIntPtrConstant(IdxVal, dl);
- return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
- }
return SDValue();
}
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
+ // It's always cheaper to replace an xor+movd with xorps, and it simplifies
+ // further combines.
+ if (X86::isZeroNode(Op.getOperand(0)))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
- unsigned SizeFactor = OpVT.getSizeInBits()/128;
+ unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
@@ -13923,9 +14551,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+
+ // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
+ if (OpVT == MVT::v4i32)
+ return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
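The zero shortcut added at the top of this function trades a GPR round-trip
for a single register clear; a sketch of the difference (an assumption about
the selected code, Intel syntax):

    // without the shortcut:
    //   xor   eax, eax
    //   movd  xmm0, eax
    // with getZeroVector:
    //   xorps xmm0, xmm0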
@@ -13943,24 +14575,33 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
MVT ResVT = Op.getSimpleValueType();
+ // When v1i1 is legal, scalarizing a vselect with a vXi1 Cond would result
+ // in: v1i1 = extract_subvector(vXi1, idx).
+ // Lower these into an extract_vector_elt, which is already selectable.
+ if (ResVT == MVT::v1i1) {
+ assert(Subtarget.hasAVX512() &&
+ "Boolean EXTRACT_SUBVECTOR requires AVX512");
+
+ MVT EltVT = ResVT.getVectorElementType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT LegalVT =
+ (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
+ }
+
assert((In.getSimpleValueType().is256BitVector() ||
In.getSimpleValueType().is512BitVector()) &&
"Can only extract from 256-bit or 512-bit vectors");
- if (ResVT.is128BitVector())
- return extract128BitVector(In, IdxVal, DAG, dl);
- if (ResVT.is256BitVector())
- return extract256BitVector(In, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
-}
+ // If the input is a buildvector, just emit a smaller one.
+ unsigned ElemsPerChunk = ResVT.getVectorNumElements();
+ if (In.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getBuildVector(
+ ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
-static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
- for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
- if (llvm::all_of(ValidUsers,
- [&I](SDValue V) { return V.getNode() != *I; }))
- return false;
- return true;
+ // Everything else is legal.
+ return Op;
}
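A sketch of the buildvector case: extracting a subvector of a BUILD_VECTOR
just re-emits the operand slice, for example:

    // v4i32 = extract_subvector (v8i32 build_vector a,b,c,d,e,f,g,h), 4
    //   --> v4i32 build_vector e, f, g, h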
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
@@ -13968,83 +14609,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
-
- SDLoc dl(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue SubVec = Op.getOperand(1);
- SDValue Idx = Op.getOperand(2);
-
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- MVT OpVT = Op.getSimpleValueType();
- MVT SubVecVT = SubVec.getSimpleValueType();
-
- if (OpVT.getVectorElementType() == MVT::i1)
- return insert1BitVector(Op, DAG, Subtarget);
-
- assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
- "Can only insert into 256-bit or 512-bit vectors");
-
- // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
- // load:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr + 16), Elts/2)
- // --> load32 addr
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr + 32), Elts/2)
- // --> load64 addr
- // or a 16-byte or 32-byte broadcast:
- // (insert_subvector (insert_subvector undef, (load16 addr), 0),
- // (load16 addr), Elts/2)
- // --> X86SubVBroadcast(load16 addr)
- // or:
- // (insert_subvector (insert_subvector undef, (load32 addr), 0),
- // (load32 addr), Elts/2)
- // --> X86SubVBroadcast(load32 addr)
- if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
- Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
- OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
- auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
- if (Idx2 && Idx2->getZExtValue() == 0) {
- SDValue SubVec2 = Vec.getOperand(1);
- // If needed, look through bitcasts to get to the load.
- if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
- bool Fast;
- unsigned Alignment = FirstLd->getAlignment();
- unsigned AS = FirstLd->getAddressSpace();
- const X86TargetLowering *TLI = Subtarget.getTargetLowering();
- if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
- OpVT, AS, Alignment, &Fast) && Fast) {
- SDValue Ops[] = {SubVec2, SubVec};
- if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
- return Ld;
- }
- }
- // If lower/upper loads are the same and the only users of the load, then
- // lower to a VBROADCASTF128/VBROADCASTI128/etc.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
- if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
- areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
- }
- }
- // If this is subv_broadcast insert into both halves, use a larger
- // subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
- return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
- SubVec.getOperand(0));
- }
- }
- }
+ assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
- if (SubVecVT.is128BitVector())
- return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- if (SubVecVT.is256BitVector())
- return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
- llvm_unreachable("Unimplemented!");
+ return insert1BitVector(Op, DAG, Subtarget);
}
// Returns the appropriate wrapper opcode for a global reference.
@@ -14062,7 +14629,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
}
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
@@ -14417,7 +14984,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// location.
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
SDValue Args[] = { Chain, Offset };
Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
@@ -14438,7 +15005,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget.isTargetWindowsItanium() ||
Subtarget.isTargetWindowsGNU()) {
// Just use the implicit TLS architecture
- // Need to generate someting similar to:
+ // Need to generate something similar to:
// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
// ; from TEB
// mov ecx, dword [rel _tls_index]: Load index (from C runtime)
@@ -15040,8 +15607,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
- Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
@@ -15313,7 +15879,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
SDValue Zero =
DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
- SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+ SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
if (VT == ExtVT)
return SelectedVal;
return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
@@ -15489,32 +16055,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// word to byte only under BWI
if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
return DAG.getNode(X86ISD::VTRUNC, DL, VT,
- DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+ getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
- // Truncate with PACKSS if we are truncating a vector comparison result.
- // TODO: We should be able to support other operations as long as we
- // we are saturating+packing zero/all bits only.
- auto IsPackableComparison = [](SDValue V) {
- unsigned Opcode = V.getOpcode();
- return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
- Opcode == X86ISD::CMPP);
- };
-
- if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
- all_of(In->ops(), IsPackableComparison))) {
+ // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
+ if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
return V;
- }
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
In = DAG.getBitcast(MVT::v8i32, In);
- In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
- ShufMask);
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
DAG.getIntPtrConstant(0, DL));
}
@@ -15530,30 +16085,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
- // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
if (Subtarget.hasInt256()) {
In = DAG.getBitcast(MVT::v32i8, In);
- SmallVector<SDValue,32> pshufbMask;
- for (unsigned i = 0; i < 2; ++i) {
- pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
- for (unsigned j = 0; j < 8; ++j)
- pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
- }
- SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
- In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+ // The PSHUFB mask:
+ static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ 16, 17, 20, 21, 24, 25, 28, 29,
+ -1, -1, -1, -1, -1, -1, -1, -1 };
+ In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
In = DAG.getBitcast(MVT::v4i64, In);
- static const int ShufMask[] = {0, 2, -1, -1};
- In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
- ShufMask);
+ static const int ShufMask2[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
DAG.getIntPtrConstant(0, DL));
return DAG.getBitcast(VT, In);
@@ -15572,9 +16117,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
-1, -1, -1, -1, -1, -1, -1, -1};
- SDValue Undef = DAG.getUNDEF(MVT::v16i8);
- OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
@@ -15598,17 +16142,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
MaskVec[i] = i * 2;
- SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
- DAG.getUNDEF(NVT), MaskVec);
+ In = DAG.getBitcast(NVT, In);
+ SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
DAG.getIntPtrConstant(0, DL));
}
-SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
-
MVT VT = Op.getSimpleValueType();
if (VT.isVector()) {
@@ -15616,8 +16157,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
SDValue Src = Op.getOperand(0);
SDLoc dl(Op);
if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
- return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
- dl, VT,
+ return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32)));
}
@@ -15701,7 +16241,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
unsigned EltBits = EltVT.getSizeInBits();
// For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
APInt MaskElt =
- IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
+ IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
const fltSemantics &Sem =
EltVT == MVT::f64 ? APFloat::IEEEdouble() :
(IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
@@ -15764,9 +16304,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
// The mask constants are automatically splatted for vector types.
unsigned EltSizeInBits = VT.getScalarSizeInBits();
SDValue SignMask = DAG.getConstantFP(
- APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+ APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
SDValue MagMask = DAG.getConstantFP(
- APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
+ APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
// First, clear all bits but the sign bit from the second operand (sign).
if (IsFakeVector)
@@ -15891,7 +16431,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
- // If more than one full vectors are evaluated, OR them first before PTEST.
+ // If more than one full vector is evaluated, OR them first before PTEST.
for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
// Each iteration will OR 2 nodes and append the result until there is only
// 1 node left, i.e. the final OR'd value of all vectors.
@@ -15900,8 +16440,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
}
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
- VecIns.back(), VecIns.back());
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief return true if \c Op has a use that doesn't just read flags.
@@ -15970,11 +16509,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
- case ISD::SHL: {
- const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
- if (BinNode->Flags.hasNoSignedWrap())
+ case ISD::SHL:
+ if (Op.getNode()->getFlags().hasNoSignedWrap())
break;
- }
+ LLVM_FALLTHROUGH;
default:
NeedOF = true;
break;
@@ -16366,7 +16904,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
}
/// If we have at least two divisions that use the same divisor, convert to
-/// multplication by a reciprocal. This may need to be adjusted for a given
+/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
@@ -16432,9 +16970,9 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
unsigned BitWidth = Op0.getValueSizeInBits();
unsigned AndBitWidth = And.getValueSizeInBits();
if (BitWidth > AndBitWidth) {
- APInt Zeros, Ones;
- DAG.computeKnownBits(Op0, Zeros, Ones);
- if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
+ KnownBits Known;
+ DAG.computeKnownBits(Op0, Known);
+ if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
return SDValue();
}
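The KnownBits rewrite above relies on a simple equivalence (an assumption
about the two APIs): counting the leading ones of the known-zero mask is the
minimum number of leading zero bits of the value itself.

    // Known.Zero.countLeadingOnes() == Known.countMinLeadingZeros()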
LHS = Op1;
@@ -16682,7 +17220,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
MVT VT = Op.getSimpleValueType();
- ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
@@ -16709,18 +17247,18 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
// available.
SDValue Cmp;
- unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+ unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
if (SSECC == 8) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
- if (SetCCOpcode == ISD::SETUEQ) {
+ if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
static_cast<unsigned>(ISD::OR);
} else {
- assert(SetCCOpcode == ISD::SETONE);
+ assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
@@ -16767,7 +17305,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// 2. The original operand type has been promoted to a 256-bit vector.
//
// Note that condition 2. only applies for AVX targets.
- SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
+ SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
return DAG.getZExtOrTrunc(NewOp, dl, VT);
}
@@ -16807,7 +17345,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
// Translate compare code to XOP PCOM compare mode.
unsigned CmpMode = 0;
- switch (SetCCOpcode) {
+ switch (Cond) {
default: llvm_unreachable("Unexpected SETCC condition");
case ISD::SETULT:
case ISD::SETLT: CmpMode = 0x00; break;
@@ -16822,60 +17360,55 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
// Are we comparing unsigned or signed integers?
- unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
- ? X86ISD::VPCOMU : X86ISD::VPCOM;
+ unsigned Opc =
+ ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
return DAG.getNode(Opc, dl, VT, Op0, Op1,
DAG.getConstant(CmpMode, dl, MVT::i8));
}
- // We are handling one of the integer comparisons here. Since SSE only has
+ // We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
- unsigned Opc;
- bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
- bool Subus = false;
-
- switch (SetCCOpcode) {
- default: llvm_unreachable("Unexpected SETCC condition");
- case ISD::SETNE: Invert = true;
- case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
- case ISD::SETLT: Swap = true;
- case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
- case ISD::SETGE: Swap = true;
- case ISD::SETLE: Opc = X86ISD::PCMPGT;
- Invert = true; break;
- case ISD::SETULT: Swap = true;
- case ISD::SETUGT: Opc = X86ISD::PCMPGT;
- FlipSigns = true; break;
- case ISD::SETUGE: Swap = true;
- case ISD::SETULE: Opc = X86ISD::PCMPGT;
- FlipSigns = true; Invert = true; break;
- }
+ unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+ : X86ISD::PCMPGT;
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+ Cond == ISD::SETGE || Cond == ISD::SETUGE;
+ bool Invert = Cond == ISD::SETNE ||
+ (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+
+  // If both operands are known non-negative, then an unsigned compare is the
+  // same as a signed compare and there's no need to flip sign bits.
+ // TODO: We could check for more general simplifications here since we're
+ // computing known bits.
+ bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
+ !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
- bool hasMinMax =
- (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
- || (Subtarget.hasSSE2() && (VET == MVT::i8));
-
- if (hasMinMax) {
- switch (SetCCOpcode) {
+ bool HasMinMax =
+ (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
+ (Subtarget.hasSSE2() && (VET == MVT::i8));
+ bool MinMax = false;
+ if (HasMinMax) {
+ switch (Cond) {
default: break;
case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
}
- if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
+ if (MinMax)
+ Swap = Invert = FlipSigns = false;
}
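// A scalar sketch of the min/max special case above: a u<= b holds exactly
// when umin(a, b) == a, so SETULE becomes a UMIN followed by an equality
// compare (SETUGE uses UMAX the same way).
static bool uleViaUMin(unsigned char a, unsigned char b) {
  unsigned char m = a < b ? a : b; // umin
  return m == a;                   // pcmpeq against the original operand
}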
- bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
- if (!MinMax && hasSubus) {
+ bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
+ bool Subus = false;
+ if (!MinMax && HasSubus) {
// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
// Op0 u<= Op1:
// t = psubus Op0, Op1
// pcmpeq t, <0..0>
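// A scalar sketch of the PSUBUS trick from the comment above: an unsigned
// saturating subtract yields zero exactly when Op0 u<= Op1.
static bool uleViaPsubus(unsigned char a, unsigned char b) {
  unsigned char t = a > b ? (unsigned char)(a - b) : 0; // psubus saturates at 0
  return t == 0;                                        // pcmpeq t, 0
}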
- switch (SetCCOpcode) {
+ switch (Cond) {
default: break;
case ISD::SETULT: {
// If the comparison is against a constant we can turn this into a
@@ -16977,10 +17510,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// bits of the inputs before performing those operations.
if (FlipSigns) {
MVT EltVT = VT.getVectorElementType();
- SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
+ SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
VT);
- Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
- Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
}
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
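// A scalar sketch of the sign-bit flip performed above: XOR'ing both
// operands with the sign mask converts an unsigned comparison into the
// signed one that PCMPGT implements.
static bool ugtViaSignedGt(unsigned a, unsigned b) {
  int sa = (int)(a ^ 0x80000000u);
  int sb = (int)(b ^ 0x80000000u);
  return sa > sb; // equal to a > b as unsigned
}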
@@ -17005,8 +17538,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
- assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
- && "SetCC type must be 8-bit or 1-bit integer");
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDLoc dl(Op);
@@ -17070,19 +17602,24 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SetCC;
}
-SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
- assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
- assert(Carry.getOpcode() != ISD::CARRY_FALSE);
+ // Recreate the carry if needed.
+ EVT CarryVT = Carry.getValueType();
+ APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
+ Carry, DAG.getConstant(NegOne, DL, CarryVT));
+
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
- SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
+ SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
if (Op.getSimpleValueType() == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
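// A scalar sketch of the carry-recreation step above: adding all-ones to an
// n-bit value overflows (sets CF) exactly when the value is non-zero, which
// rebuilds the carry flag that the SBB then consumes.
static bool carryFromValue(unsigned char C) {
  unsigned Wide = (unsigned)C + 0xFFu; // C plus all-ones, computed widely
  return (Wide >> 8) != 0;             // the bit that would land in CF
}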
@@ -17140,7 +17677,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (SSECC != 8) {
if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
DL, VT, Cmp, Op1, Op2);
@@ -17176,7 +17713,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
VCmp = DAG.getBitcast(VCmpVT, VCmp);
- SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+ SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
VSel, DAG.getIntPtrConstant(0, DL));
@@ -17188,9 +17725,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
- if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
- Subtarget.hasAVX512())
- return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
+ if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+ SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+ }
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
@@ -17204,9 +17742,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
Op2Scalar = Op2.getOperand(0);
if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
- Op1Scalar.getValueType(),
- Cond, Op1Scalar, Op2Scalar);
+ SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
+ Op1Scalar, Op2Scalar);
if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
return DAG.getBitcast(VT, newSelect);
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
@@ -17221,8 +17758,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
- Cond, Op1, Op2);
+ SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
@@ -17241,33 +17777,33 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
+  // (select (and (x, 0x1) == 0), y, (z ^ y)) -> (-(and (x, 0x1)) & z) ^ y
+  // (select (and (x, 0x1) == 0), y, (z | y)) -> (-(and (x, 0x1)) & z) | y
if (Cond.getOpcode() == X86ISD::SETCC &&
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
-
- unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+ unsigned CondCode =
+ cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
-
SDValue CmpOp0 = Cmp.getOperand(0);
+
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
// (select (x == 0), 0, -1) -> neg & sbb
if (isNullConstant(Y) &&
- (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
- SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
- DAG.getConstant(0, DL,
- CmpOp0.getValueType()),
- CmpOp0);
- SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- SDValue(Neg.getNode(), 1));
- return Res;
- }
+ (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
+ SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
+ SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ return Res;
+ }
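// A scalar sketch of the neg/sbb sequence above: negating x sets the carry
// flag exactly when x != 0, and SETCC_CARRY (sbb reg, reg) then broadcasts
// that flag to every bit, producing 0 or -1.
static int negSbbAllOnes(unsigned X) {
  int CF = (X != 0); // neg X sets CF iff X != 0
  return -CF;        // sbb r, r yields 0 - CF: 0 or all-ones
}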
Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
@@ -17283,6 +17819,43 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!isNullConstant(Op2))
Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
return Res;
+ } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
+ Cmp.getOperand(0).getOpcode() == ISD::AND &&
+ isOneConstant(Cmp.getOperand(0).getOperand(1))) {
+ SDValue CmpOp0 = Cmp.getOperand(0);
+ SDValue Src1, Src2;
+      // True if Op2 is an XOR or OR operator and one of its operands
+      // equals Op1:
+      // (a, a op b) || (b, a op b)
+ auto isOrXorPattern = [&]() {
+ if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
+ (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
+ Src1 =
+ Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
+ Src2 = Op1;
+ return true;
+ }
+ return false;
+ };
+
+ if (isOrXorPattern()) {
+ SDValue Neg;
+ unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
+        // We need a mask of all zeros or ones with the same size as the
+        // other operands.
+ if (CmpSz > VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
+ else if (CmpSz < VT.getSizeInBits())
+ Neg = DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
+ DAG.getConstant(1, DL, VT));
+ else
+ Neg = CmpOp0;
+ SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
+ Neg); // -(and (x, 0x1))
+ SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
+ return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
+ }
}
}
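// A scalar sketch of the mask trick implemented above, shown for the XOR
// flavor: negating the 0/1 condition bit produces an all-zeros or all-ones
// mask, so the select collapses into pure bit arithmetic.
static unsigned selectXorViaMask(unsigned X, unsigned Y, unsigned Z) {
  unsigned C = X & 1u;    // and (x, 0x1)
  unsigned Mask = 0u - C; // -(and (x, 0x1)): 0 or all-ones
  return (Mask & Z) ^ Y;  // C == 0 ? Y : Z ^ Y
}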
@@ -17423,17 +17996,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
// SKX processor
if ((InVTElt == MVT::i1) &&
- (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
-
- ((Subtarget.hasBWI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() <= 16)) ||
+ (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
- ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
- VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+ ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
- ((Subtarget.hasDQI() && VT.is512BitVector() &&
- VTElt.getSizeInBits() >= 32))))
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
unsigned NumElts = VT.getVectorNumElements();
@@ -17441,8 +18007,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
if (VT.is512BitVector() && InVTElt != MVT::i1 &&
(NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
- return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
- return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+ return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
+ return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
}
if (InVTElt != MVT::i1)
@@ -17454,12 +18020,12 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
SDValue V;
if (Subtarget.hasDQI()) {
- V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+ V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
assert(!VT.is512BitVector() && "Unexpected vector type");
} else {
- SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+ SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
- V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
if (ExtVT == VT)
return V;
}
@@ -17506,11 +18072,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
- // SSE41 targets can use the pmovsx* instructions directly.
- unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
- X86ISD::VSEXT : X86ISD::VZEXT;
- if (Subtarget.hasSSE41())
+  // SSE41 targets can use the pmovsx* instructions directly for 128-bit
+  // results, so those are legal and shouldn't occur here. AVX2/AVX512
+  // pmovsx* instructions still need to be handled here for 256/512-bit
+  // results.
+ if (Subtarget.hasInt256()) {
+ assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+ unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+ X86ISD::VSEXT : X86ISD::VZEXT;
return DAG.getNode(ExtOpc, dl, VT, In);
+ }
// We should only get here for sign extend.
assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
@@ -17595,8 +18165,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
- OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
- OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+ OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
+ OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
@@ -17674,7 +18244,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
MVT VT = Op.getValueType().getSimpleVT();
unsigned NumElts = VT.getVectorNumElements();
- if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+ if ((Subtarget.hasBWI() && NumElts >= 32) ||
+ (Subtarget.hasDQI() && NumElts < 16) ||
NumElts == 16) {
// Load and extend - everything is legal
if (NumElts < 8) {
@@ -17703,7 +18274,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
if (NumElts <= 8) {
// A subset, assume that we have only AVX-512F
- unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+ unsigned NumBitsToLoad = 8;
MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
Ld->getBasePtr(),
@@ -17911,7 +18482,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
if (Ext == ISD::SEXTLOAD) {
// If we have SSE4.1, we can directly emit a VSEXT node.
if (Subtarget.hasSSE41()) {
- SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
return Sext;
}
@@ -18243,8 +18814,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
+ bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
- SplitStack;
+ SplitStack || EmitStackProbe;
SDLoc dl(Op);
// Get the inputs.
@@ -18256,7 +18828,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
// Chain the dynamic stack allocation so that it doesn't modify the stack
// pointer when other instructions are using the stack.
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
bool Is64Bit = Subtarget.is64Bit();
MVT SPTy = getPointerTy(DAG.getDataLayout());
@@ -18469,6 +19041,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
SelectionDAG &DAG) {
MVT ElementType = VT.getVectorElementType();
+  // Bitcast the source vector to the output type; this is mainly necessary
+  // for vXi8/vXi64 shifts.
+ if (VT != SrcOp.getSimpleValueType())
+ SrcOp = DAG.getBitcast(VT, SrcOp);
+
// Fold this packed shift into its first operand if ShiftAmt is 0.
if (ShiftAmt == 0)
return SrcOp;
@@ -18485,9 +19062,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
&& "Unknown target vector shift-by-constant node");
// Fold this packed vector shift into a build vector if SrcOp is a
- // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
- if (VT == SrcOp.getSimpleValueType() &&
- ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+ // vector of Constants or UNDEFs.
+ if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
SmallVector<SDValue, 8> Elts;
unsigned NumElts = SrcOp->getNumOperands();
ConstantSDNode *ND;
@@ -18578,11 +19154,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
ShAmt = ShAmt.getOperand(0);
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else if (Subtarget.hasSSE41() &&
ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
- ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+ ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else {
SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
@@ -18692,8 +19268,8 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
-/// The mask is coming as MVT::i8 and it should be truncated
-/// to MVT::i1 while lowering masking intrinsics.
+/// The mask comes in as MVT::i8 and should be transformed
+/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
@@ -18701,19 +19277,19 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (isAllOnesConstant(Mask))
- return Op;
+
+ if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+ if (MaskConst->getZExtValue() & 0x1)
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- // The mask should be of type MVT::i1
- SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
+ SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
if (Op.getOpcode() == X86ISD::FSETCCM ||
Op.getOpcode() == X86ISD::FSETCCM_RND)
return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
- if (Op.getOpcode() == X86ISD::VFPCLASS ||
- Op.getOpcode() == X86ISD::VFPCLASSS)
+ if (Op.getOpcode() == X86ISD::VFPCLASSS)
return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
if (PreservedSrc.isUndef())
@@ -18762,7 +19338,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
// registration, or the .set_setframe offset.
MCSymbol *OffsetSym =
MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
SDValue ParentFrameOffset =
DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -18853,6 +19429,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src2 = Op.getOperand(2);
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, VT, Src1, Src2, Rnd),
+ Mask, passThru, Subtarget, DAG);
+ }
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
Mask, passThru, Subtarget, DAG);
}
@@ -19142,10 +19726,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Src1 = Op.getOperand(1);
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
- SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
+ SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
+ DAG.getIntPtrConstant(0, dl));
}
case CMP_MASK:
case CMP_MASK_CC: {
@@ -19205,18 +19790,18 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
if (IntrData->Opc1 != 0) {
SDValue Rnd = Op.getOperand(5);
if (!isRoundModeCurDirection(Rnd))
- Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
+ Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
}
 // Default rounding mode.
 if (!Cmp.getNode())
- Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
+ Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
DAG.getTargetConstant(0, dl,
MVT::i1),
Subtarget, DAG);
-
- return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
+ DAG.getIntPtrConstant(0, dl));
}
case COMI: { // Comparison intrinsics
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
@@ -19264,13 +19849,13 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue FCmp;
if (isRoundModeCurDirection(Sae))
- FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8));
+ FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8));
else
- FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
- DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
- return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
+ FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
+ DAG.getConstant(CondVal, dl, MVT::i8), Sae);
+ return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
+ DAG.getIntPtrConstant(0, dl));
}
case VSHIFT:
return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
@@ -19306,6 +19891,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src2, Src1);
return DAG.getBitcast(VT, Res);
}
+ case MASK_BINOP: {
+ MVT VT = Op.getSimpleValueType();
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
+ return DAG.getBitcast(VT, Res);
+ }
case FIXUPIMMS:
case FIXUPIMMS_MASKZ:
case FIXUPIMM:
@@ -19347,12 +19941,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
- case CONVERT_MASK_TO_VEC: {
- SDValue Mask = Op.getOperand(1);
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
- SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
- }
case BRCST_SUBVEC_TO_VEC: {
SDValue Src = Op.getOperand(1);
SDValue Passthru = Op.getOperand(2);
@@ -19478,6 +20066,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ case Intrinsic::x86_avx512_knot_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kandn_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ // Invert LHS for the not.
+ LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
+ case Intrinsic::x86_avx512_kxnor_w: {
+ SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+ SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+ SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+ // Invert result for the not.
+ Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
+ DAG.getConstant(1, dl, MVT::v16i1));
+ return DAG.getBitcast(MVT::i16, Res);
+ }
+
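// Scalar sketches of the three mask-intrinsic lowerings above, viewing the
// v16i1 mask as an i16; the XOR against all-ones is the "not" in each case.
static unsigned short knotW(unsigned short A) {
  return (unsigned short)(A ^ 0xFFFFu);
}
static unsigned short kandnW(unsigned short A, unsigned short B) {
  return (unsigned short)((A ^ 0xFFFFu) & B); // not(A) and B
}
static unsigned short kxnorW(unsigned short A, unsigned short B) {
  return (unsigned short)((A ^ B) ^ 0xFFFFu); // not(A xor B)
}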
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
@@ -19569,7 +20184,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Op1 = Op.getOperand(1);
auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
- GlobalValue::getRealLinkageName(Fn->getName()));
+ GlobalValue::dropLLVMManglingEscape(Fn->getName()));
// Generate a simple absolute symbol reference. This intrinsic is only
// supported on 32-bit Windows, which isn't PIC.
@@ -19603,12 +20218,40 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
}
}
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ EVT MaskVT = Mask.getValueType();
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
+
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
- auto *C = cast<ConstantSDNode>(ScaleOp);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
MVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
@@ -19617,7 +20260,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- if (Src.isUndef())
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExecutionDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -19630,7 +20276,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
- auto *C = cast<ConstantSDNode>(ScaleOp);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -19649,14 +20298,16 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue ScaleOp, SDValue Chain,
const X86Subtarget &Subtarget) {
SDLoc dl(Op);
- auto *C = cast<ConstantSDNode>(ScaleOp);
+ auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
+ // Scale must be constant.
+ if (!C)
+ return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- //SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
return SDValue(Res, 0);
@@ -19884,16 +20535,17 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
+ const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
- if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
+ switch (IntNo) {
+ case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
- if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
+ case llvm::Intrinsic::x86_seh_ehguard:
return MarkEHGuard(Op, DAG);
- if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
- IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
- IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
- IntNo == llvm::Intrinsic::x86_flags_write_u64) {
+ case llvm::Intrinsic::x86_flags_read_u32:
+ case llvm::Intrinsic::x86_flags_read_u64:
+ case llvm::Intrinsic::x86_flags_write_u32:
+ case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
@@ -19902,6 +20554,20 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
// during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue();
}
+ case Intrinsic::x86_lwpins32:
+ case Intrinsic::x86_lwpins64: {
+ SDLoc dl(Op);
+ SDValue Chain = Op->getOperand(0);
+ SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+ SDValue LwpIns =
+ DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
+ Op->getOperand(3), Op->getOperand(4));
+ SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
+ SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ LwpIns.getValue(1));
+ }
+ }
return SDValue();
}
@@ -19928,6 +20594,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
+ case GATHER_AVX2: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain, Subtarget);
+ }
case GATHER: {
 // gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
@@ -19953,8 +20629,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
- assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
- unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+ assert((HintVal == 2 || HintVal == 3) &&
+ "Wrong prefetch hint in intrinsic: should be 2 or 3");
+ unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
SDValue Chain = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
SDValue Index = Op.getOperand(3);
@@ -19994,8 +20671,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
}
// ADC/ADCX/SBB
case ADX: {
- SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
- SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
+ SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+ SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
DAG.getConstant(-1, dl, MVT::i8));
SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
@@ -20368,7 +21045,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// Check that ECX wasn't needed by an 'inreg' parameter.
FunctionType *FTy = Func->getFunctionType();
- const AttributeSet &Attrs = Func->getAttributes();
+ const AttributeList &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
unsigned InRegCount = 0;
@@ -20503,54 +21180,62 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
+// Split a unary integer op into two half-sized ops.
+static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+
+ // Extract the Lo/Hi vectors
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
+ SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
+ SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
+
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
+}
+
+// Decompose 256-bit ops into smaller 128-bit ops.
+static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().is256BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return LowerVectorIntUnary(Op, DAG);
+}
+
+// Decompose 512-bit ops into smaller 256-bit ops.
+static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType().is512BitVector() &&
+ Op.getSimpleValueType().isInteger() &&
+ "Only handle AVX 512-bit vector integer operation");
+ return LowerVectorIntUnary(Op, DAG);
+}
+
 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
-// 1. i32/i64 128/256-bit vector (native support require VLX) are expended
-// to 512-bit vector.
-// 2. i8/i16 vector implemented using dword LZCNT vector instruction
-// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
-// split the vector, perform operation on it's Lo a Hi part and
-// concatenate the results.
-static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
+// i8/i16 vectors are implemented using the dword LZCNT vector instruction
+// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
+// split the vector, perform the operation on its Lo and Hi parts and
+// concatenate the results.
+static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
- if (EltVT == MVT::i64 || EltVT == MVT::i32) {
- // Extend to 512 bit vector.
- assert((VT.is256BitVector() || VT.is128BitVector()) &&
- "Unsupported value type for operation");
-
- MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
- SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
- DAG.getUNDEF(NewVT),
- Op.getOperand(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
-
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
- DAG.getIntPtrConstant(0, dl));
- }
-
assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
"Unsupported element type");
- if (16 < NumElems) {
- // Split vector, it's Lo and Hi parts will be handled in next iteration.
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
- Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
- }
+  // Split the vector; its Lo and Hi parts will be handled in the next
+  // iteration.
+ if (16 < NumElems)
+ return LowerVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
-
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
"Unsupported value type for operation");
@@ -20637,23 +21322,17 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
- SDValue Op0 = Op.getOperand(0);
- if (Subtarget.hasAVX512())
- return LowerVectorCTLZ_AVX512(Op, DAG);
+ if (Subtarget.hasCDI())
+ return LowerVectorCTLZ_AVX512CDI(Op, DAG);
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- unsigned NumElems = VT.getVectorNumElements();
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntUnary(Op, DAG);
- // Extract each 128-bit vector, perform ctlz and concat the result.
- SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
- SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
- DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
- }
+ // Decompose 512-bit ops into smaller 256-bit ops.
+ if (VT.is512BitVector() && !Subtarget.hasBWI())
+ return Lower512IntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
@@ -20802,9 +21481,10 @@ static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
-static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
+static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT.getScalarType() == MVT::i1)
+ return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
@@ -20812,14 +21492,11 @@ static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
return Lower256IntArith(Op, DAG);
}
-static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() == MVT::i1)
- return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1));
+static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return Lower256IntArith(Op, DAG);
+ return Lower256IntUnary(Op, DAG);
}
static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
@@ -20834,7 +21511,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
- if (VT == MVT::i1)
+ if (VT.getScalarType() == MVT::i1)
return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -20874,8 +21551,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and sign extend to i16
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
- BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+ ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
+ BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -20894,8 +21571,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
- BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+ AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
+ BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21056,8 +21733,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
}
- SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
- SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+ SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
+ SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
DAG.getConstant(8, dl, MVT::v16i16));
@@ -21073,8 +21750,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
if (Subtarget.hasSSE41()) {
- ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
- BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+ ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
+ BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
} else {
const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
-1, 4, -1, 5, -1, 6, -1, 7};
@@ -21093,8 +21770,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
-1, -1, -1, -1, -1, -1, -1, -1};
AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
- AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
- BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+ AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
+ BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
} else {
const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
-1, 12, -1, 13, -1, 14, -1, 15};
@@ -21148,8 +21825,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
MachinePointerInfo(), /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
}
@@ -21157,11 +21834,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
getPointerTy(DAG.getDataLayout()));
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain)
- .setCallee(getLibcallCallingConv(LC),
- static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
- Callee, std::move(Args))
- .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+ CLI.setDebugLoc(dl)
+ .setChain(InChain)
+ .setLibCallee(
+ getLibcallCallingConv(LC),
+ static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+ std::move(Args))
+ .setInRegister()
+ .setSExtResult(isSigned)
+ .setZExtResult(!isSigned);
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
return DAG.getBitcast(VT, CallInfo.first);
@@ -21269,15 +21950,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() < 16)
return false;
- if (VT.is512BitVector() &&
+ if (VT.is512BitVector() && Subtarget.hasAVX512() &&
(VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
return true;
- bool LShift = VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.hasInt256());
+ bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256());
- bool AShift = LShift && (Subtarget.hasVLX() ||
- (VT != MVT::v2i64 && VT != MVT::v4i64));
+ bool AShift = LShift && (Subtarget.hasAVX512() ||
+ (VT != MVT::v2i64 && VT != MVT::v4i64));
return (Opcode == ISD::SRA) ? AShift : LShift;
}
@@ -21301,7 +21982,7 @@ static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
return false;
- if (VT.is512BitVector() || Subtarget.hasVLX())
+ if (Subtarget.hasAVX512())
return true;
bool LShift = VT.is128BitVector() || VT.is256BitVector();
@@ -21324,6 +22005,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
SDValue Ex = DAG.getBitcast(ExVT, R);
+ // ashr(R, 63) === cmp_slt(R, 0)
+ if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+ assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+ "Unsupported PCMPGT op");
+ return DAG.getNode(X86ISD::PCMPGT, dl, VT,
+ getZeroVector(VT, Subtarget, DAG, dl), R);
+ }
+
if (ShiftAmt >= 32) {
// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
SDValue Upper =
@@ -21360,8 +22049,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
// i64 SRA needs to be performed as partial shifts.
- if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
- Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
+ if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
+ (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
+ Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 ||
@@ -21422,10 +22112,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
// Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
(VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
(Subtarget.hasAVX512() && VT == MVT::v8i64))) {
+    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
+ unsigned SubVectorScale = 1;
+ if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ SubVectorScale =
+ Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
+ Amt = Amt.getOperand(0);
+ }
+
// Peek through any splat that was introduced for i64 shift vectorization.
int SplatIndex = -1;
if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
@@ -21442,7 +22141,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
- VT.getVectorNumElements();
+ (SubVectorScale * VT.getVectorNumElements());
unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
uint64_t ShiftAmt = 0;
unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
@@ -21610,11 +22309,11 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
// i64 vector arithmetic shift can be emulated with the transform:
- // M = lshr(SIGN_BIT, Amt)
+ // M = lshr(SIGN_MASK, Amt)
// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
Op.getOpcode() == ISD::SRA) {
- SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
+ SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
R = DAG.getNode(ISD::XOR, dl, VT, R, M);
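// A scalar sketch of the identity above (assumes Amt < 64): after the
// logical shift, the value's sign bit sits at position 63 - Amt, and the
// XOR/SUB pair sign-extends it back, reproducing the arithmetic shift.
static long long ashrEmulated(long long R, unsigned Amt) {
  unsigned long long M = (1ull << 63) >> Amt;          // lshr(SIGN_MASK, Amt)
  unsigned long long L = (unsigned long long)R >> Amt; // lshr(R, Amt)
  return (long long)((L ^ M) - M);                     // sub(xor(L, M), M)
}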
@@ -21816,23 +22515,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
// On SSE41 targets we make use of the fact that VSELECT lowers
// to PBLENDVB which selects bytes based just on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT,
- DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
- return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
+ return DAG.getSelect(dl, SelVT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
@@ -21954,15 +22651,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
// its OR(AND(V0,C),AND(V1,~C)) lowering.
SDValue C =
DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
- return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
+ return DAG.getSelect(dl, VT, C, V0, V1);
};
// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
@@ -22017,10 +22713,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
SDLoc DL(Op);
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
+ unsigned Opcode = Op.getOpcode();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+
+ if (Subtarget.hasAVX512()) {
+ // Attempt to rotate by immediate.
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
+ if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
+ return EltBits[0] == V;
+ })) {
+ unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+ uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
+ return DAG.getNode(Op, DL, VT, R,
+ DAG.getConstant(RotateAmt, DL, MVT::i8));
+ }
+ }
+
+ // Else, fall-back on VPROLV/VPRORV.
+ return Op;
+ }
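// A scalar sketch of the rotate that the VROTLI immediate encodes, with the
// same urem reduction of the amount as above.
static unsigned rotl32(unsigned X, unsigned Amt) {
  Amt %= 32; // EltBits[0].urem(EltSizeInBits)
  if (Amt == 0)
    return X;
  return (X << Amt) | (X >> (32u - Amt));
}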
assert(VT.isVector() && "Custom lowering only for vector rotates!");
assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
- assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
+ assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
// +ve/-ve Amt = rotate left/right.
@@ -22035,7 +22752,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
- assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
+ assert(RotateAmt < EltSizeInBits && "Rotation out of range");
return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
@@ -22062,10 +22779,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
 // An add of one will be selected as an INC. Note that INC doesn't
// set CF, so we can't do this for UADDO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::INC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::INC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::ADD;
Cond = X86::COND_O;
break;
@@ -22077,10 +22794,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// A subtract of one will be selected as a DEC. Note that DEC doesn't
// set CF, so we can't do this for USUBO.
if (isOneConstant(RHS)) {
- BaseOp = X86ISD::DEC;
- Cond = X86::COND_O;
- break;
- }
+ BaseOp = X86ISD::DEC;
+ Cond = X86::COND_O;
+ break;
+ }
BaseOp = X86ISD::SUB;
Cond = X86::COND_O;
break;
@@ -22146,7 +22863,7 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
 // FIXME: On 32-bit x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
- auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
+ auto PTy = cast<PointerType>(LI->getPointerOperandType());
return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
: AtomicExpansionKind::None;
}
@@ -22202,7 +22919,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- auto SynchScope = AI->getSynchScope();
+ auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
@@ -22224,7 +22941,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
- if (SynchScope == SingleThread)
+ if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
@@ -22243,7 +22960,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
AI->getType()->getPrimitiveSizeInBits());
- Loaded->setAtomic(Order, SynchScope);
+ Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
@@ -22254,13 +22971,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
- FenceScope == CrossThread) {
+ FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
@@ -22470,7 +23187,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
 // index into an in-register pre-computed pop count table. We then split up the
// input vector in two new ones: (1) a vector with only the shifted-right
// higher nibbles for each byte and (2) a vector with the lower nibbles (and
- // masked out higher ones) for each byte. PSHUB is used separately with both
+ // masked out higher ones) for each byte. PSHUFB is used separately with both
// to index the in-register table. Next, both are added and the result is an
// i8 vector where each element contains the pop count for the input byte.
//
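
A scalar sketch of the per-byte LUT scheme this comment describes (helper names are illustrative): each byte is split into its two nibbles, and both index a 16-entry pop count table, which is exactly what PSHUFB does sixteen lanes at a time.

    #include <cstdint>

    static const uint8_t PopLUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                       1, 2, 2, 3, 2, 3, 3, 4};

    uint8_t popcount8(uint8_t B) {
      // Low nibble and shifted-right high nibble each index the table.
      return PopLUT[B & 0xF] + PopLUT[B >> 4];
    }
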
@@ -22588,35 +23305,33 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
+ // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
+ if (Subtarget.hasVPOPCNTDQ()) {
+ if (VT == MVT::v8i16) {
+ Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
+ }
+ if (VT == MVT::v16i8 || VT == MVT::v16i16) {
+ Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
+ }
+ }
+
if (!Subtarget.hasSSSE3()) {
// We can't use the fast LUT approach, so fall back on vectorized bitmath.
assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
}
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract each 128-bit vector, compute pop count and concat the result.
- SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
- SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
- LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
- }
-
- if (VT.is512BitVector() && !Subtarget.hasBWI()) {
- unsigned NumElems = VT.getVectorNumElements();
-
- // Extract each 256-bit vector, compute pop count and concat the result.
- SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
- SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntUnary(Op, DAG);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
- LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
- }
+ // Decompose 512-bit ops into smaller 256-bit ops.
+ if (VT.is512BitVector() && !Subtarget.hasBWI())
+ return Lower512IntUnary(Op, DAG);
return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
@@ -22643,20 +23358,12 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
DAG.getIntPtrConstant(0, DL));
}
- MVT SVT = VT.getVectorElementType();
int NumElts = VT.getVectorNumElements();
int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
// Decompose 256-bit ops into smaller 128-bit ops.
- if (VT.is256BitVector()) {
- SDValue Lo = extract128BitVector(In, 0, DAG, DL);
- SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
-
- MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
- DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
- DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
- }
+ if (VT.is256BitVector())
+ return Lower256IntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
@@ -22697,14 +23404,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
"Only byte vector BITREVERSE supported");
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
- if (VT.is256BitVector() && !Subtarget.hasInt256()) {
- MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
- SDValue Lo = extract128BitVector(In, 0, DAG, DL);
- SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
- Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
- Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
- }
+ if (VT.is256BitVector() && !Subtarget.hasInt256())
+ return Lower256IntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
@@ -22824,30 +23525,33 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
return Op;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getNode()->getSimpleValueType(0);
+static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
+ SDNode *N = Op.getNode();
+ MVT VT = N->getSimpleValueType(0);
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDLoc DL(N);
- unsigned Opc;
- bool ExtraOp = false;
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Invalid code");
- case ISD::ADDC: Opc = X86ISD::ADD; break;
- case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
- case ISD::SUBC: Opc = X86ISD::SUB; break;
- case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
- }
+ // Set the carry flag.
+ SDValue Carry = Op.getOperand(2);
+ EVT CarryVT = Carry.getValueType();
+ APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
+ Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
+ Carry, DAG.getConstant(NegOne, DL, CarryVT));
+
+ unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
+ SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
+ Op.getOperand(1), Carry.getValue(1));
+
+ SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
+ if (N->getValueType(1) == MVT::i1)
+ SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
- if (!ExtraOp)
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
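
The CF-materialization trick used above, restated as scalar arithmetic (a sketch; names are illustrative): adding all-ones to the boolean carry-in wraps exactly when the carry-in is nonzero, so the addition's carry-out reproduces the incoming carry for the ADC/SBB that follows.

    #include <cstdint>

    bool carryOut(uint32_t CarryIn) {
      const uint32_t AllOnes = 0xFFFFFFFFu; // the getAllOnesValue constant
      uint32_t Sum = CarryIn + AllOnes;     // wraps iff CarryIn != 0
      return Sum < AllOnes;                 // unsigned wrap test == CF
    }
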
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
@@ -22867,8 +23571,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
bool isF64 = ArgVT == MVT::f64;
@@ -22880,13 +23584,13 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
- Type *RetTy = isF64
- ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
- : (Type*)VectorType::get(ArgTy, 4);
+ Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
+ : (Type *)VectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -22923,8 +23627,6 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
"Unexpected request for vector widening");
- EVT EltVT = NVT.getVectorElementType();
-
SDLoc dl(InOp);
if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
InOp.getNumOperands() == 2) {
@@ -22942,6 +23644,8 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
for (unsigned i = 0; i < InNumElts; ++i)
Ops.push_back(InOp.getOperand(i));
+ EVT EltVT = InOp.getOperand(0).getValueType();
+
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
@@ -23086,7 +23790,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23142,7 +23846,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
// Mask element has to be i1.
MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
- "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+ "We handle 4x32, 4x64 and 2x64 vectors only in this case");
MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
@@ -23202,7 +23906,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
- // The pass-thru value
+ // The pass-through value
MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
Src0 = ExtendToType(Src0, NewVT, DAG);
@@ -23216,6 +23920,57 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue RetOps[] = {Exract, NewGather.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
+ if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
+ // There is a special case when the return type v2i32 is illegal and
+ // the type legalizer extended it to v2i64. Without this conversion we end up
+ // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
+ // In order to avoid this situation, we'll build an X86 specific Gather node
+ // with index v2i64 and value type v4i32.
+ assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
+ "Unexpected type in masked gather");
+ Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
+ DAG.getBitcast(MVT::v4i32, Src0),
+ DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
+ // The mask should match the destination type. Extending the mask with
+ // zeroes is not necessary since the instruction itself reads only two
+ // values from memory.
+ Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
+ NewGather.getValue(0), DAG);
+ SDValue RetOps[] = { Sext, NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
+ if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
+ // This transformation is for optimization only.
+ // The type legalizer extended the mask and index to 4-element vectors
+ // to match the requirements of the common gather node: the same vector
+ // width for index and value. The X86 gather node allows a width mismatch
+ // so that a more optimal instruction can be selected in the end.
+ assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
+ "Unexpected type in masked gather");
+ if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
+ Index.getOpcode() == ISD::CONCAT_VECTORS &&
+ Index.getOperand(1).isUndef()) {
+ Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
+ Index = Index.getOperand(0);
+ } else
+ return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+
+ SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
+ return DAG.getMergeValues(RetOps, dl);
+ }
return Op;
}
@@ -23284,7 +24039,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
- case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
@@ -23303,7 +24058,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
+ case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
case ISD::FABS:
@@ -23311,7 +24066,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::SETCCE: return LowerSETCCE(Op, DAG);
+ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
@@ -23344,7 +24099,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
case ISD::UMUL_LOHI:
case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
- case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
+ case ISD::ROTL:
+ case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
@@ -23356,16 +24112,15 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::UMULO: return LowerXALUO(Op, DAG);
case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
- case ISD::ADD: return LowerADD(Op, DAG);
- case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::ADDCARRY:
+ case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
+ case ISD::ADD:
+ case ISD::SUB: return LowerADD_SUB(Op, DAG);
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN: return LowerMINMAX(Op, DAG);
+ case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
@@ -23768,7 +24523,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
case X86ISD::PINSRB: return "X86ISD::PINSRB";
case X86ISD::PINSRW: return "X86ISD::PINSRW";
- case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
case X86ISD::ANDNP: return "X86ISD::ANDNP";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
@@ -23779,16 +24533,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::ABS: return "X86ISD::ABS";
case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMAXS: return "X86ISD::FMAXS";
case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
+ case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMINS: return "X86ISD::FMINS";
case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
+ case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
+ case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
@@ -23827,7 +24584,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VINSERT: return "X86ISD::VINSERT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
@@ -23876,6 +24632,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::TESTNM: return "X86ISD::TESTNM";
case X86ISD::KORTEST: return "X86ISD::KORTEST";
case X86ISD::KTEST: return "X86ISD::KTEST";
+ case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
+ case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
case X86ISD::PACKSS: return "X86ISD::PACKSS";
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
@@ -23976,9 +24734,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
+ case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
+ case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
+ case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
+ case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
@@ -24012,6 +24774,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
+ case X86ISD::LWPINS: return "X86ISD::LWPINS";
+ case X86ISD::MGATHER: return "X86ISD::MGATHER";
}
return nullptr;
}
@@ -24236,16 +25000,22 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
// xbegin sinkMBB
//
// mainMBB:
- // eax = -1
+ // s0 = -1
+ //
+ // fallBB:
+ // eax = # XABORT_DEF
+ // s1 = eax
//
// sinkMBB:
- // v = eax
+ // v = phi(s0/mainBB, s1/fallBB)
MachineBasicBlock *thisMBB = MBB;
MachineFunction *MF = MBB->getParent();
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(I, mainMBB);
+ MF->insert(I, fallMBB);
MF->insert(I, sinkMBB);
// Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -24253,25 +25023,40 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned fallDstReg = MRI.createVirtualRegister(RC);
+
// thisMBB:
- // xbegin sinkMBB
+ // xbegin fallMBB
// # fallthrough to mainMBB
- // # abortion to sinkMBB
- BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ // # abort to fallMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
thisMBB->addSuccessor(mainMBB);
- thisMBB->addSuccessor(sinkMBB);
+ thisMBB->addSuccessor(fallMBB);
// mainMBB:
- // EAX = -1
- BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ // mainDstReg := -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
+ BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
mainMBB->addSuccessor(sinkMBB);
- // sinkMBB:
- // EAX is live into the sinkMBB
- sinkMBB->addLiveIn(X86::EAX);
- BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
- MI.getOperand(0).getReg())
+ // fallMBB:
+ // ; pseudo instruction to model hardware's definition from XABORT
+ // EAX := XABORT_DEF
+ // fallDstReg := EAX
+ BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
+ BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
.addReg(X86::EAX);
+ fallMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(fallDstReg).addMBB(fallMBB);
MI.eraseFromParent();
return sinkMBB;
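
At the source level the new CFG mirrors the RTM intrinsics (illustrative usage; needs -mrtm): _xbegin() returns _XBEGIN_STARTED, the -1 written in mainMBB, when the transaction starts, and otherwise the abort status the hardware leaves in EAX, which fallMBB now models explicitly; the function result is the sinkMBB PHI of the two paths.

    #include <immintrin.h>

    unsigned tryTransaction() {
      unsigned Status = _xbegin();     // ~0u on start, abort code otherwise
      if (Status == _XBEGIN_STARTED) {
        // ... transactional region ...
        _xend();
      }
      return Status;                   // the value the PHI merges
    }
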
@@ -24302,7 +25087,7 @@ static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24338,7 +25123,7 @@ static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
for (unsigned i = 1; i < NumArgs; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
- MIB.addOperand(Op);
+ MIB.add(Op);
}
if (MI.hasOneMemOperand())
MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -24398,7 +25183,7 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
unsigned ValOps = X86::AddrNumOperands;
BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
@@ -24413,6 +25198,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
return BB;
}
+static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
+ const X86Subtarget &Subtarget) {
+ DebugLoc dl = MI->getDebugLoc();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ // Address into RAX/EAX
+ unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+ unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(MI->getOperand(i));
+
+ // The instruction doesn't actually take any operands though.
+ BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
+
+ MI->eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
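
The CLZERO pseudo expanded by emitClzero above corresponds to the AMD intrinsic below (illustrative usage; needs -mclzero): the pointer is placed in RAX/EAX by the LEA that was built, and the instruction itself encodes no explicit operands.

    #include <x86intrin.h>

    void zeroCacheLine(void *P) {
      _mm_clzero(P);  // zeroes the cache line containing P
    }
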
@@ -24536,12 +25341,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the offset value into a register
OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Check if there is enough room left to pull this argument.
BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
@@ -24561,12 +25366,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Read the reg_save_area address.
unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 16)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 16)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// Zero-extend the offset
unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
@@ -24588,13 +25393,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store it back into the va_list.
BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, UseFPOffset ? 4 : 0)
- .addOperand(Segment)
- .addReg(NextOffsetReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, UseFPOffset ? 4 : 0)
+ .add(Segment)
+ .addReg(NextOffsetReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// Jump to endMBB
BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
@@ -24608,12 +25413,12 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Load the overflow_area address into a register.
unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we need to align it, do so. Otherwise, just copy the address
// to OverflowDestReg.
@@ -24644,13 +25449,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// Store the new overflow address.
BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
- .addOperand(Base)
- .addOperand(Scale)
- .addOperand(Index)
- .addDisp(Disp, 8)
- .addOperand(Segment)
- .addReg(NextAddrReg)
- .setMemRefs(MMOBegin, MMOEnd);
+ .add(Base)
+ .add(Scale)
+ .add(Index)
+ .addDisp(Disp, 8)
+ .add(Segment)
+ .addReg(NextAddrReg)
+ .setMemRefs(MMOBegin, MMOEnd);
// If we branched, emit the PHI to the front of endMBB.
if (offsetMBB) {
@@ -24867,7 +25672,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
//
// (CMOV (CMOV F, T, cc1), T, cc2)
//
- // to two successives branches. For that, we look for another CMOV as the
+ // to two successive branches. For that, we look for another CMOV as the
// following instruction.
//
// Without this, we would add a PHI between the two jumps, which ends up
@@ -25123,12 +25928,12 @@ X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
// instruction using the same address operands.
if (Operand.isReg())
Operand.setIsKill(false);
- MIB.addOperand(Operand);
+ MIB.add(Operand);
}
MachineInstr *FOpMI = MIB;
MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
@@ -25331,7 +26136,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
// Emit CALLSEQ_START right before the instruction.
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
MachineInstrBuilder CallseqStart =
- BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
+ BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
// Emit CALLSEQ_END right after the instruction.
@@ -25414,6 +26219,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
MachineRegisterInfo &MRI = MF->getRegInfo();
const BasicBlock *BB = MBB->getBasicBlock();
@@ -25430,7 +26236,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
DstReg = MI.getOperand(CurOp++).getReg();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
- assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
+ (void)TRI;
unsigned mainDstReg = MRI.createVirtualRegister(RC);
unsigned restoreDstReg = MRI.createVirtualRegister(RC);
@@ -25508,7 +26315,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(MemOpndSlot + i));
+ MIB.add(MI.getOperand(MemOpndSlot + i));
}
if (!UseImmLabel)
MIB.addReg(LabelReg);
@@ -25591,7 +26398,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
// Reload FP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload IP
MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
@@ -25599,7 +26406,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), LabelOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP
@@ -25608,7 +26415,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
if (i == X86::AddrDisp)
MIB.addDisp(MI.getOperand(i), SPOffset);
else
- MIB.addOperand(MI.getOperand(i));
+ MIB.add(MI.getOperand(i));
}
MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump
@@ -25625,7 +26432,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
MachineFunction *MF = MBB->getParent();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
MVT PVT = getPointerTy(MF->getDataLayout());
assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
@@ -25644,8 +26451,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
VR = MRI->createVirtualRegister(TRC);
Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
- /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
-
if (Subtarget.is64Bit())
BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
.addReg(X86::RIP)
@@ -25655,7 +26460,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
.addReg(0);
else
BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
- .addReg(0) /* XII->getGlobalBaseReg(MF) */
+ .addReg(0) /* TII->getGlobalBaseReg(MF) */
.addImm(1)
.addReg(0)
.addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
@@ -25677,7 +26482,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
int FI = MFI.getFunctionContextIndex();
// Get a mapping of the call site numbers to all of the landing pads they're
@@ -25749,9 +26554,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
- const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
- const X86RegisterInfo &RI = XII->getRegisterInfo();
-
+ const X86RegisterInfo &RI = TII->getRegisterInfo();
// Add a register mask with no preserved registers. This results in all
// registers being marked as clobbered.
if (RI.hasBasePointer(*MF)) {
@@ -25799,8 +26602,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
// N.B. the order the invoke BBs are processed in doesn't matter here.
SmallVector<MachineBasicBlock *, 64> MBBLPads;
- const MCPhysReg *SavedRegs =
- Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+ const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
for (MachineBasicBlock *MBB : InvokeBBs) {
// Remove the landing pad successor from the invoke block and replace it
// with the new dispatch block.
@@ -26033,6 +26835,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
case X86::MONITORX:
return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
+
+ // Cache line zero
+ case X86::CLZERO:
+ return emitClzero(&MI, BB, Subtarget);
+
// PKU feature
case X86::WRPKRU:
return emitWRPKRU(MI, BB, Subtarget);
@@ -26068,6 +26875,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ // Do nothing here; this is handled in the XRay instrumentation pass.
+ return BB;
+
case X86::LCMPXCHG8B: {
const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
@@ -26135,12 +26946,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
//===----------------------------------------------------------------------===//
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
Opc == ISD::INTRINSIC_WO_CHAIN ||
Opc == ISD::INTRINSIC_W_CHAIN ||
@@ -26148,7 +26960,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
"Should use MaskedValueIsZero if you don't know whether Op"
" is a target node!");
- KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ Known.resetAll();
switch (Opc) {
default: break;
case X86ISD::ADD:
@@ -26167,44 +26979,101 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
LLVM_FALLTHROUGH;
case X86ISD::SETCC:
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ Known.Zero.setBitsFrom(1);
break;
case X86ISD::MOVMSK: {
unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+ Known.Zero.setBitsFrom(NumLoBits);
+ break;
+ }
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
+ Known.setAllZero();
+ break;
+ }
+
+ DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+ unsigned ShAmt = ShiftImm->getZExtValue();
+ if (Opc == X86ISD::VSHLI) {
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+ // Low bits are known zero.
+ Known.Zero.setLowBits(ShAmt);
+ } else {
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+ // High bits are known zero.
+ Known.Zero.setHighBits(ShAmt);
+ }
+ }
break;
}
case X86ISD::VZEXT: {
SDValue N0 = Op.getOperand(0);
- unsigned NumElts = Op.getValueType().getVectorNumElements();
- unsigned InNumElts = N0.getValueType().getVectorNumElements();
- unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
-
- KnownZero = KnownOne = APInt(InBitWidth, 0);
- APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
- DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
- KnownOne = KnownOne.zext(BitWidth);
- KnownZero = KnownZero.zext(BitWidth);
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ EVT SrcVT = N0.getValueType();
+ unsigned InNumElts = SrcVT.getVectorNumElements();
+ unsigned InBitWidth = SrcVT.getScalarSizeInBits();
+ assert(InNumElts >= NumElts && "Illegal VZEXT input");
+
+ Known = KnownBits(InBitWidth);
+ APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
+ DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
+ Known = Known.zext(BitWidth);
+ Known.Zero.setBitsFrom(InBitWidth);
break;
}
}
}
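
A scalar sketch of the new VSHLI/VSRLI rules (the 8-bit struct is illustrative): both known-bit masks shift together, and the vacated positions become known zero.

    #include <cstdint>

    struct Known8 { uint8_t Zero, One; }; // set bit => known 0 / known 1

    Known8 knownShl(Known8 K, unsigned ShAmt) {   // assumes ShAmt < 8
      K.Zero = uint8_t((K.Zero << ShAmt) | ((1u << ShAmt) - 1)); // low bits zero
      K.One  = uint8_t(K.One << ShAmt);
      return K;
    }

    Known8 knownLshr(Known8 K, unsigned ShAmt) {  // assumes ShAmt < 8
      K.Zero = uint8_t((K.Zero >> ShAmt) | ~(0xFFu >> ShAmt));   // high bits zero
      K.One  = uint8_t(K.One >> ShAmt);
      return K;
    }
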
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
- SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
- // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
- if (Op.getOpcode() == X86ISD::SETCC_CARRY)
- return Op.getScalarValueSizeInBits();
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned VTBits = Op.getScalarValueSizeInBits();
+ unsigned Opcode = Op.getOpcode();
+ switch (Opcode) {
+ case X86ISD::SETCC_CARRY:
+ // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+ return VTBits;
- if (Op.getOpcode() == X86ISD::VSEXT) {
- EVT VT = Op.getValueType();
- EVT SrcVT = Op.getOperand(0).getValueType();
- unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
- Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+ case X86ISD::VSEXT: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
}
+ case X86ISD::VSHLI: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ if (ShiftVal.uge(VTBits))
+ return VTBits; // Shifted all bits out --> zero.
+ if (ShiftVal.uge(Tmp))
+ return 1; // Shifted all sign bits out --> unknown.
+ return Tmp - ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::VSRAI: {
+ SDValue Src = Op.getOperand(0);
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ ShiftVal += Tmp;
+ return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
+ }
+
+ case X86ISD::PCMPGT:
+ case X86ISD::PCMPEQ:
+ case X86ISD::CMPP:
+ case X86ISD::VPCOM:
+ case X86ISD::VPCOMU:
+ // Vector compares return zero/all-bits result values.
+ return VTBits;
+ }
+
// Fallback case.
return 1;
}
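
The arithmetic behind the two shift cases, as a standalone sketch (names are illustrative): an arithmetic right shift replicates the sign bit, so the known count grows by the shift amount, clamped to the element width, while a left shift consumes sign-bit copies instead.

    #include <algorithm>

    // VSRAI: every shifted-in bit is a copy of the sign bit.
    unsigned signBitsAfterAshr(unsigned Known, unsigned ShAmt, unsigned VTBits) {
      return std::min(Known + ShAmt, VTBits);
    }

    // VSHLI: shifting left discards sign-bit copies; shifting everything out
    // yields zero (all bits are "sign" bits), losing all known copies leaves 1.
    unsigned signBitsAfterShl(unsigned Known, unsigned ShAmt, unsigned VTBits) {
      if (ShAmt >= VTBits) return VTBits; // value becomes zero
      return ShAmt >= Known ? 1 : Known - ShAmt;
    }
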
@@ -26228,24 +27097,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
unsigned NumMaskElts = Mask.size();
unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
- // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
- if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
- isUndefOrEqual(Mask[0], 0) &&
- isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
- Shuffle = X86ISD::VZEXT_MOVL;
- SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
- return true;
- }
-
- // Match against a VZEXT instruction.
- // TODO: Add 256/512-bit vector support.
- if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+ // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+ // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
unsigned MaxScale = 64 / MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
bool Match = true;
@@ -26255,19 +27117,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
}
if (Match) {
- SrcVT = MaskVT;
+ unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
+ SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
+ if (SrcVT != MaskVT)
+ V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
- Shuffle = X86ISD::VZEXT;
+ Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
+ : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
return true;
}
}
}
+ // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+ if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+ isUndefOrEqual(Mask[0], 0) &&
+ isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+ Shuffle = X86ISD::VZEXT_MOVL;
+ SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+ return true;
+ }
+
// Check if we have SSE3 which will let us use MOVDDUP etc. The
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
- if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+ if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
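
Intrinsic-level view of the {0, 0} mask matched just above (illustrative): SSE3 MOVDDUP duplicates the low f64 lane and, unlike UNPCKLPD, can fold an unaligned memory operand.

    #include <pmmintrin.h>

    __m128d dupLow(__m128d V) {
      return _mm_movedup_pd(V);  // result = { V[0], V[0] }
    }
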
@@ -26285,7 +27160,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is256BitVector() && FloatDomain) {
+ if (MaskVT.is256BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
@@ -26304,7 +27179,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
}
- if (MaskVT.is512BitVector() && FloatDomain) {
+ if (MaskVT.is512BitVector() && AllowFloatDomain) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -26343,47 +27218,78 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
+ const APInt &Zeroable,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ unsigned InputSizeInBits = MaskVT.getSizeInBits();
+ unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
+ MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
- bool ContainsZeros = false;
- SmallBitVector Zeroable(NumMaskElts, false);
- for (unsigned i = 0; i != NumMaskElts; ++i) {
- int M = Mask[i];
- Zeroable[i] = isUndefOrZero(M);
- ContainsZeros |= (M == SM_SentinelZero);
- }
+ bool ContainsZeros =
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
- // Attempt to match against byte/bit shifts.
- // FIXME: Add 512-bit support.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskVT.getScalarSizeInBits(), Mask,
- 0, Zeroable, Subtarget);
- if (0 < ShiftAmt) {
- PermuteImm = (unsigned)ShiftAmt;
+ // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
+ if (!ContainsZeros && MaskScalarSizeInBits == 64) {
+ // Check for lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+ // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+ if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
+ PermuteImm = getV4X86ShuffleImm(Mask);
+ return true;
+ }
+ if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
+ PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+ return true;
+ }
+ }
+ } else if (AllowFloatDomain && Subtarget.hasAVX()) {
+ // VPERMILPD can permute with a non-repeating shuffle.
+ Shuffle = X86ISD::VPERMILPI;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+ PermuteImm = 0;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+ PermuteImm |= (M & 1) << i;
+ }
return true;
}
}
- // Ensure we don't contain any zero elements.
- if (ContainsZeros)
- return false;
-
- assert(llvm::all_of(Mask, [&](int M) {
- return SM_SentinelUndef <= M && M < (int)NumMaskElts;
- }) && "Expected unary shuffle");
-
- unsigned InputSizeInBits = MaskVT.getSizeInBits();
- unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
- MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
+ // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
+ // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+ // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+ if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
+ !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ // Narrow the repeated mask to create 32-bit element permutes.
+ SmallVector<int, 4> WordMask = RepeatedMask;
+ if (MaskScalarSizeInBits == 64)
+ scaleShuffleMask(2, RepeatedMask, WordMask);
+
+ Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
+ ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+ PermuteImm = getV4X86ShuffleImm(WordMask);
+ return true;
+ }
+ }
- // Handle PSHUFLW/PSHUFHW repeated patterns.
- if (MaskScalarSizeInBits == 16) {
+ // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
@@ -26411,110 +27317,59 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
-
- return false;
}
- return false;
}
- // We only support permutation of 32/64 bit elements after this.
- if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
- return false;
-
- // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
- // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
- if (FloatDomain && !Subtarget.hasAVX())
- return false;
-
- // Pre-AVX2 we must use float shuffles on 256-bit vectors.
- if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
- FloatDomain = true;
-
- // Check for lane crossing permutes.
- if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
- // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
- if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
- PermuteImm = getV4X86ShuffleImm(Mask);
+ // Attempt to match against byte/bit shifts.
+ // FIXME: Add 512-bit support.
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+ MaskScalarSizeInBits, Mask,
+ 0, Zeroable, Subtarget);
+ if (0 < ShiftAmt) {
+ PermuteImm = (unsigned)ShiftAmt;
return true;
}
- if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
- SmallVector<int, 4> RepeatedMask;
- if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
- PermuteImm = getV4X86ShuffleImm(RepeatedMask);
- return true;
- }
- }
- return false;
- }
-
- // VPERMILPD can permute with a non-repeating shuffle.
- if (FloatDomain && MaskScalarSizeInBits == 64) {
- Shuffle = X86ISD::VPERMILPI;
- ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
- PermuteImm = 0;
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
- PermuteImm |= (M & 1) << i;
- }
- return true;
}
- // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
- SmallVector<int, 4> RepeatedMask;
- if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
- return false;
-
- // Narrow the repeated mask for 32-bit element permutes.
- SmallVector<int, 4> WordMask = RepeatedMask;
- if (MaskScalarSizeInBits == 64)
- scaleShuffleMask(2, RepeatedMask, WordMask);
-
- Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
- ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
- PermuteImm = getV4X86ShuffleImm(WordMask);
- return true;
+ return false;
}
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain, SDValue &V1, SDValue &V2,
+ bool AllowFloatDomain, bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
if (MaskVT.is128BitVector()) {
- if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
- if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
ShuffleVT = MaskVT;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
- (FloatDomain || !Subtarget.hasSSE41())) {
+ (AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
ShuffleVT = MaskVT;
return true;
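
Intrinsic equivalents of the unary {0, 0} and {1, 1} masks matched above (illustrative): with both operands tied to the same register, MOVLHPS duplicates the low 64 bits and MOVHLPS the high 64 bits.

    #include <xmmintrin.h>

    __m128 dupLowHalf(__m128 V)  { return _mm_movelh_ps(V, V); } // {V0,V1,V0,V1}
    __m128 dupHighHalf(__m128 V) { return _mm_movehl_ps(V, V); } // {V2,V3,V2,V3}
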
@@ -26527,57 +27382,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
- MVT LegalVT = MaskVT;
- if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
- LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
-
- SmallVector<int, 64> Unpckl, Unpckh;
- if (IsUnary) {
- createUnpackShuffleMask(MaskVT, Unpckl, true, true);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, true);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- V2 = V1;
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
- } else {
- createUnpackShuffleMask(MaskVT, Unpckl, true, false);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- createUnpackShuffleMask(MaskVT, Unpckh, false, false);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckl);
- if (isTargetShuffleEquivalent(Mask, Unpckl)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKL;
- ShuffleVT = LegalVT;
- return true;
- }
-
- ShuffleVectorSDNode::commuteMask(Unpckh);
- if (isTargetShuffleEquivalent(Mask, Unpckh)) {
- std::swap(V1, V2);
- Shuffle = X86ISD::UNPCKH;
- ShuffleVT = LegalVT;
- return true;
- }
+ if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
+ DAG, Subtarget)) {
+ ShuffleVT = MaskVT;
+ if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
+ ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+ return true;
}
}
@@ -26585,17 +27395,20 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
- bool FloatDomain,
- SDValue &V1, SDValue &V2,
- SDLoc &DL, SelectionDAG &DAG,
+ const APInt &Zeroable,
+ bool AllowFloatDomain,
+ bool AllowIntDomain,
+ SDValue &V1, SDValue &V2, SDLoc &DL,
+ SelectionDAG &DAG,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT,
unsigned &PermuteImm) {
unsigned NumMaskElts = Mask.size();
+ unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// Attempt to match against PALIGNR byte rotate.
- if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
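
The PALIGNR pattern being matched here, in intrinsic form (illustrative): the two sources are treated as one 32-byte value from which a byte-aligned window is extracted.

    #include <tmmintrin.h>

    __m128i rotateWindow(__m128i Hi, __m128i Lo) {
      // Concatenate Hi:Lo and shift the 256-bit value right by 4 bytes.
      return _mm_alignr_epi8(Hi, Lo, 4);
    }
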
@@ -26606,77 +27419,69 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to X86ISD::BLENDI.
- if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
- (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
- // Determine a type compatible with X86ISD::BLENDI.
- // TODO - add 16i16 support (requires lane duplication).
- MVT BlendVT = MaskVT;
- if (Subtarget.hasAVX2()) {
- if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v8i32;
- else if (BlendVT == MVT::v2i64)
- BlendVT = MVT::v4i32;
- } else {
- if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
- BlendVT = MVT::v8i16;
- else if (BlendVT == MVT::v4i64)
- BlendVT = MVT::v4f64;
- else if (BlendVT == MVT::v8i32)
- BlendVT = MVT::v8f32;
- }
-
- unsigned BlendSize = BlendVT.getVectorNumElements();
- unsigned MaskRatio = BlendSize / NumMaskElts;
-
- // Can we blend with zero?
- if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
- /*Low*/ 0) &&
- NumMaskElts <= BlendVT.getVectorNumElements()) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if (Mask[i / MaskRatio] < 0)
- PermuteImm |= 1u << i;
-
- V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
- Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
- return true;
- }
-
- // Attempt to match as a binary blend.
- if (NumMaskElts <= BlendVT.getVectorNumElements()) {
- bool MatchBlend = true;
- for (int i = 0; i != (int)NumMaskElts; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- else if (M == SM_SentinelZero)
- MatchBlend = false;
- else if ((M != i) && (M != (i + (int)NumMaskElts)))
- MatchBlend = false;
- }
+ if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+ (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+ (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
+ uint64_t BlendMask = 0;
+ bool ForceV1Zero = false, ForceV2Zero = false;
+ SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
+ if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
+ BlendMask)) {
+ if (MaskVT == MVT::v16i16) {
+ // We can only use v16i16 PBLENDW if the lanes are repeated.
+ SmallVector<int, 8> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+ RepeatedMask)) {
+ assert(RepeatedMask.size() == 8 &&
+ "Repeated mask size doesn't match!");
+ PermuteImm = 0;
+ for (int i = 0; i < 8; ++i)
+ if (RepeatedMask[i] >= 8)
+ PermuteImm |= 1 << i;
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ Shuffle = X86ISD::BLENDI;
+ ShuffleVT = MaskVT;
+ return true;
+ }
+ } else {
+ // Determine a type compatible with X86ISD::BLENDI.
+ ShuffleVT = MaskVT;
+ if (Subtarget.hasAVX2()) {
+ if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v8i32;
+ else if (ShuffleVT == MVT::v2i64)
+ ShuffleVT = MVT::v4i32;
+ } else {
+ if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+ ShuffleVT = MVT::v8i16;
+ else if (ShuffleVT == MVT::v4i64)
+ ShuffleVT = MVT::v4f64;
+ else if (ShuffleVT == MVT::v8i32)
+ ShuffleVT = MVT::v8f32;
+ }
- if (MatchBlend) {
- PermuteImm = 0;
- for (unsigned i = 0; i != BlendSize; ++i)
- if ((int)NumMaskElts <= Mask[i / MaskRatio])
- PermuteImm |= 1u << i;
+ if (!ShuffleVT.isFloatingPoint()) {
+ int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+ BlendMask =
+ scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+ ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+ }
+ V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+ V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+ PermuteImm = (unsigned)BlendMask;
Shuffle = X86ISD::BLENDI;
- ShuffleVT = BlendVT;
return true;
}
}
}
// Attempt to combine to INSERTPS.
- if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
- SmallBitVector Zeroable(4, false);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (Mask[i] < 0)
- Zeroable[i] = true;
-
- if (Zeroable.any() &&
+ if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ MaskVT.is128BitVector()) {
+ if (Zeroable.getBoolValue() &&
matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
@@ -26685,22 +27490,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to combine to SHUFPD.
- if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
- (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
return true;
}
}
// Attempt to combine to SHUFPS.
- if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
- (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
- (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+ if (AllowFloatDomain && EltSizeInBits == 32 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
SmallVector<int, 4> RepeatedMask;
if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ // Match each half of the repeated mask, to determine if it's just
+ // referencing one of the vectors, is zeroable or is entirely undef.
auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
int M0 = RepeatedMask[Offset];
int M1 = RepeatedMask[Offset + 1];
@@ -26732,7 +27541,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
V1 = Lo;
V2 = Hi;
Shuffle = X86ISD::SHUFP;
- ShuffleVT = MaskVT;
+ ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
PermuteImm = getV4X86ShuffleImm(ShufMask);
return true;
}
@@ -26764,7 +27573,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// here, we're not going to remove the operands we find.
bool UnaryShuffle = (Inputs.size() == 1);
SDValue V1 = peekThroughBitcasts(Inputs[0]);
- SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
+ SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
+ : peekThroughBitcasts(Inputs[1]));
MVT VT1 = V1.getSimpleValueType();
MVT VT2 = V2.getSimpleValueType();
@@ -26853,6 +27663,18 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
MVT ShuffleSrcVT, ShuffleVT;
unsigned Shuffle, PermuteImm;
+ // Which shuffle domains are permitted?
+ // Permit domain crossing at higher combine depths.
+ bool AllowFloatDomain = FloatDomain || (Depth > 3);
+ bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
+ (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
+
+ // Determine zeroable mask elements.
+ APInt Zeroable(NumMaskElts, 0);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (isUndefOrZero(Mask[i]))
+ Zeroable.setBit(i);
+
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
// directly if we don't shuffle the lower element and we shuffle the upper
@@ -26869,8 +27691,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
- ShuffleSrcVT, ShuffleVT)) {
+ if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26884,8 +27707,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
- Shuffle, ShuffleVT, PermuteImm)) {
+ if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, Subtarget, Shuffle,
+ ShuffleVT, PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26901,8 +27725,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
- Shuffle, ShuffleVT, UnaryShuffle)) {
+ if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+ V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
+ UnaryShuffle)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26918,8 +27743,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
- DAG, Subtarget, Shuffle, ShuffleVT,
+ if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, V1, V2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return false; // Nothing to do!
@@ -26937,6 +27763,45 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
+ // Typically from here on, we need an integer version of MaskVT.
+ MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
+
+ // Annoyingly, SSE4A instructions don't map into the above match helpers.
+ if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
+ uint64_t BitLen, BitIdx;
+ if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
+ Zeroable)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(IntMaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(IntMaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(IntMaskVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
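// Illustrative sketch, not part of this change: a scalar model of what
// EXTRQI does to the low 64 bits of its operand, assuming the usual SSE4A
// semantics (extract BitLen bits starting at BitIdx, zero-fill upward).
#include <cstdint>
static uint64_t extrqiModel(uint64_t Src, unsigned BitLen, unsigned BitIdx) {
  uint64_t FieldMask = (BitLen == 64) ? ~0ULL : ((1ULL << BitLen) - 1);
  return (Src >> BitIdx) & FieldMask;
}
// INSERTQI is the matching insert-side form: it replaces BitLen bits of the
// destination at BitIdx with the low BitLen bits of the second source.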
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
@@ -26957,9 +27822,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -26988,9 +27851,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -27015,9 +27876,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
V1 = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(V1.getNode());
@@ -27039,12 +27898,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
- SmallBitVector UndefElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
if (M == SM_SentinelZero)
@@ -27076,8 +27935,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
- MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
- SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+ SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -27100,8 +27958,6 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
- MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
- MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
@@ -27121,7 +27977,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
DCI.AddToWorklist(V2.getNode());
- SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
+ SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPerm2MaskOp.getNode());
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
@@ -27228,8 +28084,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
// Extract constant bits from each source op.
bool OneUseConstantOp = false;
- SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
- SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+ SmallVector<APInt, 16> UndefEltsOps(NumOps);
+ SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
for (unsigned i = 0; i != NumOps; ++i) {
SDValue SrcOp = Ops[i];
OneUseConstantOp |= SrcOp.hasOneUse();
@@ -27245,18 +28101,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
return false;
// Shuffle the constant bits according to the mask.
- SmallBitVector UndefElts(NumMaskElts, false);
- SmallBitVector ZeroElts(NumMaskElts, false);
- SmallBitVector ConstantElts(NumMaskElts, false);
+ APInt UndefElts(NumMaskElts, 0);
+ APInt ZeroElts(NumMaskElts, 0);
+ APInt ConstantElts(NumMaskElts, 0);
SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
APInt::getNullValue(MaskSizeInBits));
for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
if (M == SM_SentinelUndef) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
} else if (M == SM_SentinelZero) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
assert(0 <= M && M < (int)(NumMaskElts * NumOps));
@@ -27266,21 +28122,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
if (SrcUndefElts[SrcMaskIdx]) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
continue;
}
auto &SrcEltBits = RawBitsOps[SrcOpIdx];
APInt &Bits = SrcEltBits[SrcMaskIdx];
if (!Bits) {
- ZeroElts[i] = true;
+ ZeroElts.setBit(i);
continue;
}
- ConstantElts[i] = true;
+ ConstantElts.setBit(i);
ConstantBitData[i] = Bits;
}
- assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+ assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
// Create the constant data.
MVT MaskSVT;
@@ -27330,6 +28186,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
int SrcOpIndex, SDValue Root,
ArrayRef<int> RootMask,
+ ArrayRef<const SDNode*> SrcNodes,
int Depth, bool HasVariableMask,
SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -27353,13 +28210,17 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
- SDValue Input0, Input1;
- SmallVector<int, 16> OpMask;
- if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
+ SmallVector<int, 64> OpMask;
+ SmallVector<SDValue, 2> OpInputs;
+ if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
return false;
+ assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
+ SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
+ SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
+
// Add the inputs to the Ops list, avoiding duplicates.
- SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+ SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
int InputIdx0 = -1, InputIdx1 = -1;
for (int i = 0, e = Ops.size(); i < e; ++i) {
@@ -27385,52 +28246,70 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
OpMask.size() % RootMask.size() == 0) ||
OpMask.size() == RootMask.size()) &&
"The smaller number of elements must divide the larger.");
- int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
- int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
- int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
- assert(((RootRatio == 1 && OpRatio == 1) ||
- (RootRatio == 1) != (OpRatio == 1)) &&
+
+ // This function can be performance-critical, so we rely on the power-of-2
+ // knowledge that we have about the mask sizes to replace div/rem ops with
+ // bit-masks and shifts.
+ assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
+
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
+ unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
+ assert((RootRatio == 1 || OpRatio == 1) &&
"Must not have a ratio for both incoming and op masks!");
- SmallVector<int, 16> Mask;
- Mask.reserve(MaskWidth);
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
+
+ SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
// Merge this shuffle operation's mask into our accumulated mask. Note that
// this shuffle's mask will be the first applied to the input, followed by the
// root mask to get us all the way to the root value arrangement. The reason
// for this order is that we are recursing up the operation chain.
- for (int i = 0; i < MaskWidth; ++i) {
- int RootIdx = i / RootRatio;
+ for (unsigned i = 0; i < MaskWidth; ++i) {
+ unsigned RootIdx = i >> RootRatioLog2;
if (RootMask[RootIdx] < 0) {
// This is a zero or undef lane, we're done.
- Mask.push_back(RootMask[RootIdx]);
+ Mask[i] = RootMask[RootIdx];
continue;
}
- int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
+ unsigned RootMaskedIdx =
+ RootRatio == 1
+ ? RootMask[RootIdx]
+ : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
// Just insert the scaled root mask value if it references an input other
// than the SrcOp we're currently inserting.
if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
- Mask.push_back(RootMaskedIdx);
+ Mask[i] = RootMaskedIdx;
continue;
}
- RootMaskedIdx %= MaskWidth;
-
- int OpIdx = RootMaskedIdx / OpRatio;
+ RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
if (OpMask[OpIdx] < 0) {
// The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
- Mask.push_back(OpMask[OpIdx]);
+ Mask[i] = OpMask[OpIdx];
continue;
}
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
- int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
- OpMaskedIdx %= MaskWidth;
+ unsigned OpMaskedIdx =
+ OpRatio == 1
+ ? OpMask[OpIdx]
+ : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
if (OpMask[OpIdx] < (int)OpMask.size()) {
assert(0 <= InputIdx0 && "Unknown target shuffle input");
OpMaskedIdx += InputIdx0 * MaskWidth;
@@ -27439,7 +28318,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
OpMaskedIdx += InputIdx1 * MaskWidth;
}
- Mask.push_back(OpMaskedIdx);
+ Mask[i] = OpMaskedIdx;
}
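// Illustrative sketch, not part of this change: the shift/mask rewrites in
// the loop above rely on two identities that hold for any power-of-2 R:
//   i / R == i >> log2(R)      and      i % R == i & (R - 1)
#include <cassert>
static void powerOfTwoIdentitySketch() {
  const unsigned R = 4, RLog2 = 2; // countTrailingZeros(4) == 2
  for (unsigned i = 0; i != 64; ++i) {
    assert(i / R == i >> RLog2);
    assert(i % R == (i & (R - 1)));
  }
}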
// Handle the all undef/zero cases early.
@@ -27457,28 +28336,25 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
}
// Remove unused shuffle source ops.
- SmallVector<SDValue, 8> UsedOps;
- for (int i = 0, e = Ops.size(); i < e; ++i) {
- int lo = UsedOps.size() * MaskWidth;
- int hi = lo + MaskWidth;
- if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
- UsedOps.push_back(Ops[i]);
- continue;
- }
- for (int &M : Mask)
- if (lo <= M)
- M -= MaskWidth;
- }
- assert(!UsedOps.empty() && "Shuffle with no inputs detected");
- Ops = UsedOps;
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ assert(!Ops.empty() && "Shuffle with no inputs detected");
HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
- // See if we can recurse into each shuffle source op (if it's a target shuffle).
+ // Update the list of shuffle nodes that have been combined so far.
+ SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
+ SrcNodes.end());
+ CombinedNodes.push_back(Op.getNode());
+
+ // See if we can recurse into each shuffle source op (if it's a target
+ // shuffle). The source op should only be combined if it either has a
+ // single use (i.e. current Op) or all its users have already been combined.
for (int i = 0, e = Ops.size(); i < e; ++i)
- if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
- if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
- HasVariableMask, DAG, DCI, Subtarget))
+ if (Ops[i].getNode()->hasOneUse() ||
+ SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+ if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+ Depth + 1, HasVariableMask, DAG, DCI,
+ Subtarget))
return true;
// Attempt to constant fold all of the constant source ops.
@@ -27495,7 +28371,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
// elements, and shrink them to the half-width mask. It does this in a loop
// so it will reduce the size of the mask to the minimal width mask which
// performs an equivalent shuffle.
- SmallVector<int, 16> WidenedMask;
+ SmallVector<int, 64> WidenedMask;
while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
Mask = std::move(WidenedMask);
}
@@ -27561,8 +28437,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
- SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG) {
assert(N.getOpcode() == X86ISD::PSHUFD &&
"Called with something other than an x86 128-bit half shuffle!");
SDLoc DL(N);
@@ -27842,19 +28717,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
case X86ISD::MOVSD:
case X86ISD::MOVSS: {
- bool isFloat = VT.isFloatingPoint();
SDValue V0 = peekThroughBitcasts(N->getOperand(0));
SDValue V1 = peekThroughBitcasts(N->getOperand(1));
- bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
- bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
- assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+ if (isZero0 && isZero1)
+ return SDValue();
// We often lower to MOVSD/MOVSS from integer as well as native float
// types; remove unnecessary domain-crossing bitcasts if we can to make it
// easier to combine shuffles later on. We've already accounted for the
// domain switching cost when we decided to lower with it.
+ bool isFloat = VT.isFloatingPoint();
+ bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+ bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
@@ -28025,7 +28901,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
break;
case X86ISD::PSHUFD:
- if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+ if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
return NewN;
break;
@@ -28173,12 +29049,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
-
- // Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
- return SDValue();
-
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
@@ -28249,11 +29120,19 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
// consecutive, non-overlapping, and in the right order.
SmallVector<SDValue, 16> Elts;
- for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
- Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ Elts.push_back(Elt);
+ continue;
+ }
+ Elts.clear();
+ break;
+ }
- if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
- return LD;
+ if (Elts.size() == VT.getVectorNumElements())
+ if (SDValue LD =
+ EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
+ return LD;
// For AVX2, we sometimes want to combine
// (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -28276,7 +29155,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// a particular chain.
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -28303,18 +29182,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EVT OriginalVT = InVec.getValueType();
- if (InVec.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
- EVT BCVT = InVec.getOperand(0).getValueType();
- if (!BCVT.isVector() ||
- BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
- return SDValue();
- InVec = InVec.getOperand(0);
- }
+ // Peek through bitcasts, don't duplicate a load with other uses.
+ InVec = peekThroughOneUseBitcasts(InVec);
EVT CurrentVT = InVec.getValueType();
+ if (!CurrentVT.isVector() ||
+ CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+ return SDValue();
if (!isTargetShuffle(InVec.getOpcode()))
return SDValue();
@@ -28389,23 +29263,151 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
EltNo);
}
+// Try to match patterns such as
+// (i16 bitcast (v16i1 x))
+// ->
+// (i16 movmsk (v16i8 sext (v16i1 x)))
+// before the illegal vector is scalarized on subtargets that don't have legal
+// vxi1 types.
+static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
+ const X86Subtarget &Subtarget) {
+ EVT VT = BitCast.getValueType();
+ SDValue N0 = BitCast.getOperand(0);
+ EVT VecVT = N0->getValueType(0);
+
+ if (!VT.isScalarInteger() || !VecVT.isSimple())
+ return SDValue();
+
+ // With AVX512 vxi1 types are legal and we prefer using k-regs.
+ // MOVMSK is supported in SSE2 or later.
+ if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
+ return SDValue();
+
+ // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
+ // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
+ // v8i16 and v16i16.
+ // For these two cases, we can shuffle the upper element bytes to a
+ // consecutive sequence at the start of the vector and treat the results as
+ // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
+ // for v16i16 this is not the case, because the shuffle is expensive, so we
+ // avoid sign-extending to this type entirely.
+ // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
+ // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
+ MVT SExtVT;
+ MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+ switch (VecVT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v2i1:
+ SExtVT = MVT::v2i64;
+ FPCastVT = MVT::v2f64;
+ break;
+ case MVT::v4i1:
+ SExtVT = MVT::v4i32;
+ FPCastVT = MVT::v4f32;
+ // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
+ // sign-extend to a 256-bit operation to avoid truncation.
+ if (N0->getOpcode() == ISD::SETCC &&
+ N0->getOperand(0)->getValueType(0).is256BitVector() &&
+ Subtarget.hasInt256()) {
+ SExtVT = MVT::v4i64;
+ FPCastVT = MVT::v4f64;
+ }
+ break;
+ case MVT::v8i1:
+ SExtVT = MVT::v8i16;
+ // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
+ // sign-extend to a 256-bit operation to match the compare.
+ // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
+ // 256-bit because the shuffle is cheaper than sign extending the result of
+ // the compare.
+ if (N0->getOpcode() == ISD::SETCC &&
+ N0->getOperand(0)->getValueType(0).is256BitVector() &&
+ Subtarget.hasInt256()) {
+ SExtVT = MVT::v8i32;
+ FPCastVT = MVT::v8f32;
+ }
+ break;
+ case MVT::v16i1:
+ SExtVT = MVT::v16i8;
+ // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
+ // it is not profitable to sign-extend to 256-bit because this will
+ // require an extra cross-lane shuffle which is more expensive than
+ // truncating the result of the compare to 128-bits.
+ break;
+ case MVT::v32i1:
+ // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
+ if (!Subtarget.hasInt256())
+ return SDValue();
+ SExtVT = MVT::v32i8;
+ break;
+ }
+
+ SDLoc DL(BitCast);
+ SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
+ if (SExtVT == MVT::v8i16) {
+ V = DAG.getBitcast(MVT::v16i8, V);
+ V = DAG.getVectorShuffle(
+ MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
+ {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+ } else
+ assert(SExtVT.getScalarType() != MVT::i16 &&
+ "Vectors of i16 must be shuffled");
+ if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ V = DAG.getBitcast(FPCastVT, V);
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+ return DAG.getZExtOrTrunc(V, DL, VT);
+}
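// Illustrative sketch, not part of this change: a scalar model of MOVMSK on
// v16i8 (it collects the sign bit of each byte), which is why a
// sign-extended vXi1 vector round-trips exactly through it.
#include <cstdint>
static unsigned movmskV16i8Model(const int8_t Lanes[16]) {
  unsigned Mask = 0;
  for (unsigned i = 0; i != 16; ++i)
    Mask |= unsigned((uint8_t)Lanes[i] >> 7) << i;
  return Mask;
}
// After 'sext v16i1 -> v16i8' every lane is 0x00 or 0xFF, so the collected
// mask equals the original i16 bit pattern.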
+
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ // Try to match patterns such as
+ // (i16 bitcast (v16i1 x))
+ // ->
+ // (i16 movmsk (v16i8 sext (v16i1 x)))
+ // before the setcc result is scalarized on subtargets that don't have legal
+ // vxi1 types.
+ if (DCI.isBeforeLegalize())
+ if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
+ return V;
+ // Since MMX types are special and don't usually play with other vector types,
+ // it's better to handle them early to be sure we emit efficient code by
+ // avoiding store-load conversions.
- // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
- // special and don't usually play with other vector types, it's better to
- // handle them early to be sure we emit efficient code by avoiding
- // store-load conversions.
+ // Detect bitcasts between i32 to x86mmx low word.
if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
- N0.getValueType() == MVT::v2i32 &&
- isNullConstant(N0.getOperand(1))) {
+ SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
SDValue N00 = N0->getOperand(0);
if (N00.getValueType() == MVT::i32)
return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
}
+ // Detect bitcasts between element or subvector extraction to x86mmx.
+ if (VT == MVT::x86mmx &&
+ (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+ isNullConstant(N0.getOperand(1))) {
+ SDValue N00 = N0->getOperand(0);
+ if (N00.getValueType().is128BitVector())
+ return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+ DAG.getBitcast(MVT::v2i64, N00));
+ }
+
+ // Detect bitcasts from FP_TO_SINT to x86mmx.
+ if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
+ N0.getOpcode() == ISD::FP_TO_SINT) {
+ SDLoc DL(N0);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+ DAG.getBitcast(MVT::v2i64, Res));
+ }
+
// Convert a bitcasted integer logic operation that has one bitcasted
// floating-point operand into a floating-point logic operation. This may
// create a load of a constant, but that is cheaper than materializing the
@@ -28511,12 +29513,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOpcode() != ISD::SETCC)
return false;
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
- if (CC != ISD::SETGT)
+ if (CC != ISD::SETGT && CC != ISD::SETLT)
return false;
SDValue SelectOp1 = Select->getOperand(1);
SDValue SelectOp2 = Select->getOperand(2);
+ // The following instructions assume SelectOp1 is the subtraction operand
+ // and SelectOp2 is the negation operand.
+ // In the case of SETLT this is the other way around.
+ if (CC == ISD::SETLT)
+ std::swap(SelectOp1, SelectOp2);
+
// The second operand of the select should be the negation of the first
// operand, which is implemented as 0 - SelectOp1.
if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -28529,8 +29537,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
if (SetCC.getOperand(0) != SelectOp1)
return false;
- // The second operand of the comparison can be either -1 or 0.
- if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+ // In the SETLT case, the second operand of the comparison can be either 1 or 0.
+ APInt SplatVal;
+ if ((CC == ISD::SETLT) &&
+ !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal,
+ /*AllowShrink*/false) &&
+ SplatVal.isOneValue()) ||
+ (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+ return false;
+
+ // In the SETGT case, the second operand of the comparison can be either -1 or 0.
+ if ((CC == ISD::SETGT) &&
+ !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
return false;
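// Illustrative sketch, not part of this change: the two select forms this
// helper now accepts, written out as scalar code.
static int absModelSetGT(int D) { return D > -1 ? D : -D; } // SETGT vs -1/0
static int absModelSetLT(int D) { return D < 1 ? -D : D; }  // SETLT vs 1/0
// Both compute |D|, which is why SETLT is handled by swapping the select
// operands before running the existing SETGT-shaped checks.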
@@ -28576,17 +29594,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE2 or with AVX512VL (which uses predicate registers).
+ if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ unsigned BitWidth = ExtractVT.getSizeInBits();
+ if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+ ExtractVT != MVT::i8)
+ return SDValue();
+
+ // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+ for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+ SDValue Match = matchBinOpReduction(Extract, Op);
+ if (!Match)
+ continue;
+
+ // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+ // which we can't support here for now.
+ if (Match.getScalarValueSizeInBits() != BitWidth)
+ continue;
+
+ // We require AVX2 for PMOVMSKB for v16i16/v32i8.
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 &&
+ ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+ return SDValue();
+
+ // Don't bother performing this for 2-element vectors.
+ if (Match.getValueType().getVectorNumElements() <= 2)
+ return SDValue();
+
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
+
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ APInt CompareBits;
+ ISD::CondCode CondCode;
+ if (Op == ISD::OR) {
+ // any_of -> MOVMSK != 0
+ CompareBits = APInt::getNullValue(32);
+ CondCode = ISD::CondCode::SETNE;
+ } else {
+ // all_of -> MOVMSK == ((1 << NumElts) - 1)
+ CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CondCode = ISD::CondCode::SETEQ;
+ }
+
+ // Perform the select as i32/i64 and then truncate to avoid partial register
+ // stalls.
+ unsigned ResWidth = std::max(BitWidth, 32u);
+ EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
+ SDLoc DL(Extract);
+ SDValue Zero = DAG.getConstant(0, DL, ResVT);
+ SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
+ SDValue Res = DAG.getBitcast(MaskVT, Match);
+ Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+ Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+ Ones, Zero, CondCode);
+ return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ }
+
+ return SDValue();
+}
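// Illustrative sketch, not part of this change: scalar reference for the
// two reductions matched above, given the MOVMSK result.
static bool anyOfModel(unsigned MovMsk) {
  return MovMsk != 0; // OR-reduction is true iff some sign bit was set.
}
static bool allOfModel(unsigned MovMsk, unsigned NumElts) {
  return MovMsk == ((1U << NumElts) - 1); // AND-reduction: all bits set.
}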
+
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is appropriate
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
+ // Verify the type we're extracting from is an integer type wider than i16.
EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+ if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
return SDValue();
unsigned RegSize = 128;
@@ -28595,15 +29688,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
else if (Subtarget.hasAVX2())
RegSize = 256;
- // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+ // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
// TODO: We should be able to handle larger vectors by splitting them before
// feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
+ if (RegSize / VT.getVectorNumElements() < 8)
return SDValue();
// Match shuffle + add pyramid.
SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+ // The operand is expected to be zero-extended from i8
+ // (verified in detectZextAbsDiff).
+ // In order to convert to i64 and above, an additional any/zero/sign
+ // extend is expected.
+ // The zero extend from 32 bits has no mathematical effect on the result.
+ // The sign extend is also effectively a zero extend
+ // (it extends the sign bit, which is zero),
+ // so it is correct to skip the sign/zero extend instruction.
+ if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
+ Root = Root.getOperand(0);
+
// If there was a match, we want Root to be a select that is the root of an
// abs-diff pattern.
if (!Root || (Root.getOpcode() != ISD::VSELECT))
@@ -28614,7 +29720,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!detectZextAbsDiff(Root, Zext0, Zext1))
return SDValue();
- // Create the SAD instruction
+ // Create the SAD instruction.
SDLoc DL(Extract);
SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
@@ -28636,13 +29742,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- // Return the lowest i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+ MVT Type = Extract->getSimpleValueType(0);
+ unsigned TypeSizeInBits = Type.getSizeInBits();
+ // Return the lowest TypeSizeInBits bits.
+ MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
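// Illustrative sketch, not part of this change: a scalar model of one
// 8-byte group of PSADBW (sum of absolute differences, zero-extended into
// the wide result lane).
#include <cstdint>
static uint64_t psadbwGroupModel(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (unsigned i = 0; i != 8; ++i)
    Sum += (A[i] > B[i]) ? unsigned(A[i] - B[i]) : unsigned(B[i] - A[i]);
  return Sum;
}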
+// Attempt to peek through a target shuffle and extract the scalar from the
+// source.
+static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+
+ // Don't attempt this for boolean mask vectors or unknown extraction indices.
+ if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Resolve the target shuffle inputs and mask.
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
+ return SDValue();
+
+ // Attempt to narrow/widen the shuffle mask to the correct size.
+ if (Mask.size() != NumSrcElts) {
+ if ((NumSrcElts % Mask.size()) == 0) {
+ SmallVector<int, 16> ScaledMask;
+ int Scale = NumSrcElts / Mask.size();
+ scaleShuffleMask(Scale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
+ } else if ((Mask.size() % NumSrcElts) == 0) {
+ SmallVector<int, 16> WidenedMask;
+ while (Mask.size() > NumSrcElts &&
+ canWidenShuffleElements(Mask, WidenedMask))
+ Mask = std::move(WidenedMask);
+ // TODO - investigate support for wider shuffle masks with known upper
+ // undef/zero elements for implicit zero-extension.
+ }
+ }
+
+ // Check if narrowing/widening failed.
+ if (Mask.size() != NumSrcElts)
+ return SDValue();
+
+ int SrcIdx = Mask[N->getConstantOperandVal(1)];
+ SDLoc dl(N);
+
+ // If the shuffle source element is undef/zero then we can just accept it.
+ if (SrcIdx == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+
+ if (SrcIdx == SM_SentinelZero)
+ return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
+ : DAG.getConstant(0, dl, VT);
+
+ SDValue SrcOp = Ops[SrcIdx / Mask.size()];
+ SrcOp = DAG.getBitcast(SrcVT, SrcOp);
+ SrcIdx = SrcIdx % Mask.size();
+
+ // We can only extract other elements from 128-bit vectors and in certain
+ // circumstances, depending on SSE-level.
+ // TODO: Investigate using extract_subvector for larger vectors.
+ // TODO: Investigate float/double extraction if it will be just stored.
+ if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
+ ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
+ assert(SrcSVT == VT && "Unexpected extraction type");
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ }
+
+ if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
+ assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
+ "Unexpected extraction type");
+ unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+ SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
+ DAG.getIntPtrConstant(SrcIdx, dl));
+ SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
+ DAG.getValueType(SrcSVT));
+ return DAG.getZExtOrTrunc(Assert, dl, VT);
+ }
+
+ return SDValue();
+}
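// Illustrative sketch, not part of this change: the assumed behaviour of the
// mask widening performed by scaleShuffleMask above. Scaling by Scale
// repeats each entry with per-sub-element offsets; sentinels (< 0) are
// replicated as-is.
static void scaleShuffleMaskSketch(int Scale, ArrayRef<int> Mask,
                                   SmallVectorImpl<int> &ScaledMask) {
  ScaledMask.clear();
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      ScaledMask.push_back(M < 0 ? M : Scale * M + i);
}
// E.g. <1,0> scaled by 2 becomes <2,3,0,1>.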
+
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
@@ -28653,14 +29849,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ return NewOp;
+
SDValue InputVector = N->getOperand(0);
+ SDValue EltIdx = N->getOperand(1);
+
+ EVT SrcVT = InputVector.getValueType();
+ EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
+
+ // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
+ if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+ VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
+ SDValue MMXSrc = InputVector.getOperand(0);
+
+ // The bitcast source is a direct mmx result.
+ if (MMXSrc.getValueType() == MVT::x86mmx)
+ return DAG.getBitcast(VT, InputVector);
+ }
+
// Detect mmx to i32 conversion through a v2i32 elt extract.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
- N->getValueType(0) == MVT::i32 &&
- InputVector.getValueType() == MVT::v2i32 &&
- isa<ConstantSDNode>(N->getOperand(1)) &&
- N->getConstantOperandVal(1) == 0) {
+ VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);
// The bitcast source is a direct mmx result.
@@ -28668,15 +29879,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
}
- EVT VT = N->getValueType(0);
-
- if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
- InputVector.getOpcode() == ISD::BITCAST &&
+ if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
+ isa<ConstantSDNode>(EltIdx) &&
isa<ConstantSDNode>(InputVector.getOperand(0))) {
- uint64_t ExtractedElt =
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- uint64_t InputValue =
- cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+ uint64_t ExtractedElt = N->getConstantOperandVal(1);
+ uint64_t InputValue = InputVector.getConstantOperandVal(0);
uint64_t Res = (InputValue >> ExtractedElt) & 1;
return DAG.getConstant(Res, dl, MVT::i1);
}
@@ -28687,9 +29894,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
+ // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
+ if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+ return Cmp;
+
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
- if (InputVector.getValueType() != MVT::v4i32)
+ if (SrcVT != MVT::v4i32)
return SDValue();
// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
@@ -28717,9 +29928,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Record which element was extracted.
- ExtractedElements |=
- 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
-
+ ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
Uses.push_back(Extract);
}
@@ -28752,11 +29961,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
} else {
// Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+ SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
MachinePointerInfo());
- EVT ElementType = InputVector.getValueType().getVectorElementType();
+ EVT ElementType = SrcVT.getVectorElementType();
unsigned EltSize = ElementType.getSizeInBits() / 8;
// Replace each use (extract) with a load of the appropriate element.
@@ -28779,8 +29988,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
UE = Uses.end(); UI != UE; ++UI) {
SDNode *Extract = *UI;
- SDValue Idx = Extract->getOperand(1);
- uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t IdxVal = Extract->getConstantOperandVal(1);
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
}
@@ -28788,6 +29996,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// TODO - merge with combineExtractVectorElt once it can handle the implicit
+// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
+// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+// combineBasicSADPattern.
+static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
+}
+
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
@@ -28812,12 +30030,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
// This situation only applies to avx512.
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
- //Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
- DL, CondVT));
- //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
+ DAG.getAllOnesConstant(DL, CondVT));
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
}
// To use the condition operand as a bitwise mask, it must have elements that
@@ -28920,18 +30137,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(ShAmt, DL, MVT::i8));
}
- // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
- if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
- return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
- SDValue(FalseC, 0));
- }
-
// Optimize cases that will turn into an LEA instruction. This requires
// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
@@ -28939,7 +30144,7 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
if (N->getValueType(0) == MVT::i32)
Diff = (unsigned)Diff;
- bool isFastMultiplier = false;
+ bool IsFastMultiplier = false;
if (Diff < 10) {
switch ((unsigned char)Diff) {
default:
@@ -28951,12 +30156,12 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
case 5: // result = lea base(cond, cond*4)
case 8: // result = lea base( , cond*8)
case 9: // result = lea base(cond, cond*8)
- isFastMultiplier = true;
+ IsFastMultiplier = true;
break;
}
}
- if (isFastMultiplier) {
+ if (IsFastMultiplier) {
APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
if (NeedsCondInvert) // Invert the condition if needed.
Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
@@ -29049,7 +30254,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29063,7 +30268,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op1.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0, Op1,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
return true;
}
case ISD::EXTRACT_SUBVECTOR: {
@@ -29072,7 +30277,7 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return false;
MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
// Only change element size, not type.
- if (VT.isInteger() != OpEltVT.isInteger())
+ if (EltVT.isInteger() != OpEltVT.isInteger())
return false;
uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
@@ -29084,7 +30289,23 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
DCI.AddToWorklist(Op0.getNode());
DCI.CombineTo(OrigOp.getNode(),
DAG.getNode(Opcode, DL, VT, Op0,
- DAG.getConstant(Imm, DL, MVT::i8)));
+ DAG.getIntPtrConstant(Imm, DL)));
+ return true;
+ }
+ case X86ISD::SUBV_BROADCAST: {
+ unsigned EltSize = EltVT.getSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ // Only change element size, not type.
+ if (VT.isInteger() != Op.getSimpleValueType().isInteger())
+ return false;
+ SDValue Op0 = Op.getOperand(0);
+ MVT Op0VT = MVT::getVectorVT(EltVT,
+ Op0.getSimpleValueType().getSizeInBits() / EltSize);
+ Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
+ DCI.AddToWorklist(Op0.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0));
return true;
}
}
@@ -29147,6 +30368,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
@@ -29177,6 +30399,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
@@ -29211,6 +30434,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOGT:
case ISD::SETGT:
case ISD::SETGE:
@@ -29239,6 +30463,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle both negative zeros and NaNs
// incorrectly, but we can swap the operands to fix both.
std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
case ISD::SETOLT:
case ISD::SETLT:
case ISD::SETLE:
@@ -29296,7 +30521,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
Cond.getOperand(0), Cond.getOperand(1), NewCC);
- return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+ return DAG.getSelect(DL, VT, Cond, LHS, RHS);
}
}
}
@@ -29355,7 +30580,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// x s< 0 ? x^C : 0 --> subus x, C
if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
- OpRHSConst->getAPIntValue().isSignBit())
+ OpRHSConst->getAPIntValue().isSignMask())
// Note that we have to rebuild the RHS constant here to ensure we
// don't rely on particular values of undef lanes.
return DAG.getNode(
@@ -29370,8 +30595,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// If this is a *dynamic* select (non-constant condition) and we can match
// this node with one of the variable blend instructions, restructure the
- // condition so that the blends can use the high bit of each element and use
- // SimplifyDemandedBits to simplify the condition operand.
+ // condition so that blends can use the high (sign) bit of each element and
+ // use SimplifyDemandedBits to simplify the condition operand.
if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
!DCI.isBeforeLegalize() &&
!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
@@ -29404,51 +30629,49 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Byte blends are only available in AVX2
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
return SDValue();
+ // There are no 512-bit blend instructions that use sign bits.
+ if (VT.is512BitVector())
+ return SDValue();
assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
- APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
-
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
- DCI.isBeforeLegalizeOps());
- if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
- TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
- TLO)) {
- // If we changed the computation somewhere in the DAG, this change
- // will affect all users of Cond.
- // Make sure it is fine and update all the nodes so that we do not
- // use the generic VSELECT anymore. Otherwise, we may perform
- // wrong optimizations as we messed up with the actual expectation
+ APInt DemandedMask(APInt::getSignMask(BitWidth));
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
+ TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Make sure it is fine and update all the nodes
+ // so that we do not use the generic VSELECT anymore. Otherwise, we may
+ // perform wrong optimizations as we messed with the actual expectation
// for the vector boolean values.
if (Cond != TLO.Old) {
- // Check all uses of that condition operand to check whether it will be
- // consumed by non-BLEND instructions, which may depend on all bits are
- // set properly.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- if (I->getOpcode() != ISD::VSELECT)
- // TODO: Add other opcodes eventually lowered into BLEND.
+ // Check all uses of the condition operand to check whether it will be
+ // consumed by non-BLEND instructions. Those may require that all bits
+ // are set properly.
+ for (SDNode *U : Cond->uses()) {
+ // TODO: Add other opcodes eventually lowered into BLEND.
+ if (U->getOpcode() != ISD::VSELECT)
return SDValue();
+ }
- // Update all the users of the condition, before committing the change,
- // so that the VSELECT optimizations that expect the correct vector
- // boolean value will not be triggered.
- for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
- I != E; ++I)
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(*I, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
- Cond, I->getOperand(1), I->getOperand(2)));
+ // Update all users of the condition before committing the change, so
+ // that the VSELECT optimizations that expect the correct vector boolean
+ // value will not be triggered.
+ for (SDNode *U : Cond->uses()) {
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
+ U->getValueType(0), Cond, U->getOperand(1),
+ U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ }
DCI.CommitTargetLoweringOpt(TLO);
return SDValue();
}
- // At this point, only Cond is changed. Change the condition
- // just for N to keep the opportunity to optimize all other
- // users their own way.
- DAG.ReplaceAllUsesOfValueWith(
- SDValue(N, 0),
- DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
- TLO.New, N->getOperand(1), N->getOperand(2)));
+ // Only Cond (rather than other nodes in the computation chain) was
+ // changed. Change the condition just for N so that all other users still
+ // get the opportunity to be optimized their own way.
+ SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
return SDValue();
}
}
@@ -29456,7 +30679,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Look for vselects with LHS/RHS being bitcasted from an operation that
// can be executed on another type. Push the bitcast to the inputs of
// the operation. This exposes opportunities for using masking instructions.
- if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
+ if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
CondVT.getVectorElementType() == MVT::i1) {
if (combineBitcastForMaskedOp(LHS, DAG, DCI))
return SDValue(N, 0);
@@ -29464,6 +30687,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
+ // Custom action for SELECT MMX
+ if (VT == MVT::x86mmx) {
+ LHS = DAG.getBitcast(MVT::i64, LHS);
+ RHS = DAG.getBitcast(MVT::i64, RHS);
+ SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, newSelect);
+ }
+
return SDValue();
}
@@ -29711,11 +30942,40 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
return true;
}
+// When legalizing carry, we create carries via add X, -1.
+// If that comes from an actual carry via setcc, we use the
+// carry directly.
+static SDValue combineCarryThroughADD(SDValue EFLAGS) {
+ if (EFLAGS.getOpcode() == X86ISD::ADD) {
+ if (isAllOnesConstant(EFLAGS.getOperand(1))) {
+ SDValue Carry = EFLAGS.getOperand(0);
+ while (Carry.getOpcode() == ISD::TRUNCATE ||
+ Carry.getOpcode() == ISD::ZERO_EXTEND ||
+ Carry.getOpcode() == ISD::SIGN_EXTEND ||
+ Carry.getOpcode() == ISD::ANY_EXTEND ||
+ (Carry.getOpcode() == ISD::AND &&
+ isOneConstant(Carry.getOperand(1))))
+ Carry = Carry.getOperand(0);
+ if (Carry.getOpcode() == X86ISD::SETCC ||
+ Carry.getOpcode() == X86ISD::SETCC_CARRY) {
+ if (Carry.getConstantOperandVal(0) == X86::COND_B)
+ return Carry.getOperand(1);
+ }
+ }
+ }
+
+ return SDValue();
+}
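+// Illustrative trace (editorial note, not part of the patch): legalization may
+// produce
+//   carry = (X86ISD::SETCC COND_B, flags); wide = zext carry;
+//   flags' = X86ISD::ADD wide, -1   // sets CF iff carry was 1
+// Peeling the zext (and an optional 'and x, 1') back to the SETCC lets us
+// return 'flags' and consume the original carry directly.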
+
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
SelectionDAG &DAG) {
+ if (CC == X86::COND_B)
+ if (SDValue Flags = combineCarryThroughADD(EFLAGS))
+ return Flags;
+
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
return combineSetCCAtomicArith(EFLAGS, CC, DAG);
@@ -30141,6 +31401,77 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
}
}
+static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
+ EVT VT, SDLoc DL) {
+
+ auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
+ SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(Mult, DL, VT));
+ Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+ DAG.getConstant(Shift, DL, MVT::i8));
+ Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+ N->getOperand(0));
+ return Result;
+ };
+
+ auto combineMulMulAddOrSub = [&](bool isAdd) {
+ SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+ DAG.getConstant(9, DL, VT));
+ Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+ Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
+ N->getOperand(0));
+ return Result;
+ };
+
+ switch (MulAmt) {
+ default:
+ break;
+ case 11:
+ // mul x, 11 => add ((shl (mul x, 5), 1), x)
+ return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
+ case 21:
+ // mul x, 21 => add ((shl (mul x, 5), 2), x)
+ return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+ case 22:
+ // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
+ case 19:
+ // mul x, 19 => sub ((shl (mul x, 5), 2), x)
+ return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
+ case 13:
+ // mul x, 13 => add ((shl (mul x, 3), 2), x)
+ return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
+ case 23:
+ // mul x, 23 => sub ((shl (mul x, 3), 3), x)
+ return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
+ case 14:
+ // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
+ case 26:
+ // mul x, 26 => sub ((mul (mul x, 9), 3), x)
+ return combineMulMulAddOrSub(/*isAdd*/ false);
+ case 28:
+ // mul x, 28 => add ((mul (mul x, 9), 3), x)
+ return combineMulMulAddOrSub(/*isAdd*/ true);
+ case 29:
+ // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
+ return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+ combineMulMulAddOrSub(/*isAdd*/ true));
+ case 30:
+ // mul x, 30 => sub (sub ((shl x, 5), x), x)
+ return DAG.getNode(
+ ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(5, DL, MVT::i8)),
+ N->getOperand(0)),
+ N->getOperand(0));
+ }
+ return SDValue();
+}
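+// Editorial worked check (not from the patch): each decomposition above can be
+// verified with plain arithmetic, e.g.
+//   11*x == ((x*5) << 1) + x        // 10x + x
+//   23*x == ((x*3) << 3) - x        // 24x - x
+//   29*x == (((x*9)*3) + x) + x     // 27x + x + x
+//   30*x == ((x << 5) - x) - x      // 32x - x - x
+// The MUL_IMM factors 3, 5 and 9 each map to a single LEA.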
+
/// Optimize a single multiply with constant into two operations in order to
/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -30150,6 +31481,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalize() && VT.isVector())
return reduceVMULWidth(N, DAG, Subtarget);
+ if (!MulConstantOptimization)
+ return SDValue();
// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
@@ -30205,25 +31538,41 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
else
NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
DAG.getConstant(MulAmt2, DL, VT));
- }
+ } else if (!Subtarget.slowLEA())
+ NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
if (!NewMul) {
- assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
- && "Both cases that could cause potential overflows should have "
- "already been handled.");
- if (isPowerOf2_64(MulAmt - 1))
- // (mul x, 2^N + 1) => (add (shl x, N), x)
- NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt - 1), DL,
- MVT::i8)));
-
- else if (isPowerOf2_64(MulAmt + 1))
- // (mul x, 2^N - 1) => (sub (shl x, N), x)
- NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
- N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmt + 1),
- DL, MVT::i8)), N->getOperand(0));
+ assert(MulAmt != 0 &&
+ MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
+ "Both cases that could cause potential overflows should have "
+ "already been handled.");
+ int64_t SignMulAmt = C->getSExtValue();
+ if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
+ (SignMulAmt != -INT64_MAX)) {
+ int NumSign = SignMulAmt > 0 ? 1 : -1;
+ bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
+ bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
+ if (IsPowerOf2_64PlusOne) {
+ // (mul x, 2^N + 1) => (add (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::ADD, DL, VT, N->getOperand(0),
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
+ MVT::i8)));
+ } else if (IsPowerOf2_64MinusOne) {
+ // (mul x, 2^N - 1) => (sub (shl x, N), x)
+ NewMul = DAG.getNode(
+ ISD::SUB, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+ DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
+ MVT::i8)),
+ N->getOperand(0));
+ }
+ // To negate, subtract the number from zero
+ if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
+ NewMul =
+ DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
+ }
}
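+  // Editorial worked example (not from the patch): for a multiply by -15,
+  // SignMulAmt == -15 and NumSign == -1, so NumSign * SignMulAmt + 1 == 16 is
+  // a power of two. NewMul = (x << 4) - x == 15x, and the final subtract from
+  // zero yields -15x.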
if (NewMul)
@@ -30246,8 +31595,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
N0.getOperand(1).getOpcode() == ISD::Constant) {
SDValue N00 = N0.getOperand(0);
APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- const APInt &ShAmt = N1C->getAPIntValue();
- Mask = Mask.shl(ShAmt);
+ Mask <<= N1C->getAPIntValue();
bool MaskOK = false;
// We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
@@ -30396,42 +31744,95 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
- "Unexpected opcode");
+static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
+ X86ISD::VSRLI == Opcode) &&
+ "Unexpected shift opcode");
+ bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-
- // This fails for mask register (vXi1) shifts.
- if ((NumBitsPerElt % 8) != 0)
- return SDValue();
+ assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
+ "Unexpected value type");
// Out of range logical bit shifts are guaranteed to be zero.
- APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
- if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
- return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // Out of range arithmetic bit shifts splat the sign bit.
+ APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
+ if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
+ if (LogicalShift)
+ return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ else
+ ShiftVal = NumBitsPerElt - 1;
+ }
// Shift N0 by zero -> N0.
if (!ShiftVal)
- return N->getOperand(0);
+ return N0;
// Shift zero -> zero.
- if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
+ // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
+ // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
+ // TODO - support other sra opcodes as needed.
+ if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
+ N0.getOpcode() == X86ISD::VSRAI)
+ return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
+
// We can decode 'whole byte' logical bit shifts as shuffles.
- if ((ShiftVal.getZExtValue() % 8) == 0) {
+ if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
}
+ // Constant Folding.
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (N->isOnlyUserOf(N0.getNode()) &&
+ getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
+ assert(EltBits.size() == VT.getVectorNumElements() &&
+ "Unexpected shift value type");
+ unsigned ShiftImm = ShiftVal.getZExtValue();
+ for (APInt &Elt : EltBits) {
+ if (X86ISD::VSHLI == Opcode)
+ Elt <<= ShiftImm;
+ else if (X86ISD::VSRAI == Opcode)
+ Elt.ashrInPlace(ShiftImm);
+ else
+ Elt.lshrInPlace(ShiftImm);
+ }
+ return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
+ }
+
+ return SDValue();
+}
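+// Editorial examples (not from the patch) of the rules above, assuming v4i32:
+//   VSRLI X, 40          -> zero vector (logical shift out of range)
+//   VSRAI X, 40          -> treated as VSRAI X, 31 (clamped, splats sign bit)
+//   VSRAI <-16,...>, 2   -> <-4,...>    (constant folding via ashrInPlace)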
+
+static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ assert(
+ ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
+ (N->getOpcode() == X86ISD::PINSRW &&
+ N->getValueType(0) == MVT::v8i16)) &&
+ "Unexpected vector insertion");
+
+ // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget);
return SDValue();
}
@@ -30495,13 +31896,11 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
// See X86ATTInstPrinter.cpp:printSSECC().
unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
if (Subtarget.hasAVX512()) {
- SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
- CMP01,
- DAG.getConstant(x86cc, DL, MVT::i8));
- if (N->getValueType(0) != MVT::i1)
- return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
- FSetCC);
- return FSetCC;
+ SDValue FSetCC =
+ DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
+ DAG.getConstant(x86cc, DL, MVT::i8));
+ return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
+ FSetCC, DAG.getIntPtrConstant(0, DL));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
@@ -30550,33 +31949,15 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
return SDValue();
- // Canonicalize XOR to the left.
- if (N1.getOpcode() == ISD::XOR)
- std::swap(N0, N1);
-
- if (N0.getOpcode() != ISD::XOR)
- return SDValue();
+ if (N0.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
- SDValue N00 = N0->getOperand(0);
- SDValue N01 = N0->getOperand(1);
-
- N01 = peekThroughBitcasts(N01);
-
- // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
- // insert_subvector building a 256-bit AllOnes vector.
- if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
- if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
- return SDValue();
+ if (N1.getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
- SDValue V1 = N01->getOperand(0);
- SDValue V2 = N01->getOperand(1);
- if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
- !V1.getOperand(0).isUndef() ||
- !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
- !ISD::isBuildVectorAllOnes(V2.getNode()))
- return SDValue();
- }
- return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
+ return SDValue();
}
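+// Editorial note (not from the patch): the simplified fold above is just
+// AND(XOR(X, -1), Y) -> ANDNP(X, Y), i.e. PANDN's (~X) & Y in one
+// instruction. For example, with X = 0x00FF00FF... and Y = 0x0F0F0F0F...,
+// ANDNP(X, Y) = 0x0F000F00...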
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
@@ -30696,38 +32077,35 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
-/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
-/// eliminate loading the vector constant mask value. This relies on the fact
-/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
-static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
+/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
+/// with a shift-right to eliminate loading the vector constant mask value.
+static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
- // TODO: Use AssertSext to mark any nodes that have the property of producing
- // all-ones or all-zeros. Then check for that node rather than particular
- // opcodes.
- if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+ if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
return SDValue();
- // The existence of the PCMP node guarantees that we have the required SSE2 or
- // AVX2 for a shift of this vector type, but there is no vector shift by
- // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
- // masked compare nodes, so they should not make it here.
- EVT VT0 = Op0.getValueType();
- EVT VT1 = Op1.getValueType();
- unsigned EltBitWidth = VT0.getScalarSizeInBits();
- if (VT0 != VT1 || EltBitWidth == 8)
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal,
+ /*AllowShrink*/false) ||
+ !SplatVal.isMask())
return SDValue();
- assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+ if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
+ return SDValue();
- APInt SplatVal;
- if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+ unsigned EltBitWidth = VT0.getScalarSizeInBits();
+ if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
return SDValue();
SDLoc DL(N);
- SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+ unsigned ShiftVal = SplatVal.countTrailingOnes();
+ SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
return DAG.getBitcast(N->getValueType(0), Shift);
}
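+// Editorial worked example (not from the patch): if Op0 is a PCMPGT result
+// (every v4i32 element is all-ones or all-zeros, so ComputeNumSignBits == 32)
+// and Op1 is a splat of the mask 0xFF (8 trailing ones), the AND becomes
+// VSRLI Op0, 24: all-ones elements become 0xFF and zero elements stay 0.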
@@ -30747,7 +32125,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
- if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+ if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
EVT VT = N->getValueType(0);
@@ -30760,7 +32138,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
/*Depth*/ 1, /*HasVarMask*/ false, DAG,
DCI, Subtarget))
return SDValue(); // This routine will use CombineTo to replace N.
@@ -30802,20 +32180,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR);
+ assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+ if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (VT.is256BitVector() && Subtarget.hasInt256())))
return SDValue();
- assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
- // Canonicalize pandn to RHS
- if (N0.getOpcode() == X86ISD::ANDNP)
+ // Canonicalize AND to LHS.
+ if (N1.getOpcode() == ISD::AND)
std::swap(N0, N1);
+ // TODO: Attempt to match against AND(XOR(-1,X),Y) as well; waiting for the
+ // ANDNP combine allows other combines to happen that prevent matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
return SDValue();
@@ -30837,21 +32217,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
Y = peekThroughBitcasts(Y);
EVT MaskVT = Mask.getValueType();
-
- // Validate that the Mask operand is a vector sra node.
- // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
- // there is no psrai.b
unsigned EltBits = MaskVT.getScalarSizeInBits();
- unsigned SraAmt = ~0;
- if (Mask.getOpcode() == ISD::SRA) {
- if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
- if (auto *AmtConst = AmtBV->getConstantSplatNode())
- SraAmt = AmtConst->getZExtValue();
- } else if (Mask.getOpcode() == X86ISD::VSRAI) {
- SDValue SraC = Mask.getOperand(1);
- SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
- }
- if ((SraAmt + 1) != EltBits)
+
+ // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
return SDValue();
SDLoc DL(N);
@@ -30872,7 +32241,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
// (add (xor X, M), (and M, 1))
// And further to:
// (sub (xor X, M), M)
- if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
+ if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
+ DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
auto IsNegV = [](SDNode *N, SDValue V) {
return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
@@ -30884,7 +32254,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
V = Y;
if (V) {
- assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
SDValue SubOp2 = Mask;
@@ -30901,8 +32270,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (V == Y)
std::swap(SubOp1, SubOp2);
- return DAG.getBitcast(VT,
- DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
}
@@ -30915,7 +32284,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
X = DAG.getBitcast(BlendVT, X);
Y = DAG.getBitcast(BlendVT, Y);
Mask = DAG.getBitcast(BlendVT, Mask);
- Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
+ Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
return DAG.getBitcast(VT, Mask);
}
@@ -30969,7 +32338,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
N->getOperand(1).getOpcode() == X86ISD::CMP &&
- N->getOperand(1).getConstantOperandVal(1) == 0 &&
+ isNullConstant(N->getOperand(1).getOperand(1)) &&
N->getOperand(1).getValueType().bitsGE(MVT::i32);
};
@@ -31272,6 +32641,75 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}
+/// Check if truncation with saturation from type \p SrcVT to \p DstVT
+/// is valid for the given \p Subtarget.
+static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasAVX512())
+ return false;
+
+ // FIXME: Scalar type may be supported if we move it to vector register.
+ if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
+ return false;
+
+ EVT SrcElVT = SrcVT.getScalarType();
+ EVT DstElVT = DstVT.getScalarType();
+ if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
+ return false;
+ if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
+ return false;
+ if (SrcVT.is512BitVector() || Subtarget.hasVLX())
+ return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
+ return false;
+}
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT) {
+ if (In.getOpcode() != ISD::UMIN)
+ return SDValue();
+
+ // Saturation with truncation. We truncate from InVT to VT.
+ assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
+ "Unexpected types for truncate operation");
+
+ APInt C;
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C,
+ /*AllowShrink*/false)) {
+ // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
+ // the element size of the destination type.
+ return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
+ SDValue();
+ }
+ return SDValue();
+}
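+// Editorial example (not from the patch): for a v8i32 -> v8i16 truncate,
+//   trunc (umin X, <65535 x 8>) : v8i32 -> v8i16
+// matches because 65535 is a mask of 16 bits; the function returns X, and on
+// a suitable AVX512 target (VLX for this 256-bit source) the callers below
+// emit a single X86ISD::VTRUNCUS, i.e. VPMOVUSDW.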
+
+/// Detect a pattern of truncation with saturation:
+/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+/// The types should allow use of the VPMOVUS* instructions on AVX512.
+/// Return the source value to be truncated or SDValue() if the pattern was not
+/// matched.
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+ const X86Subtarget &Subtarget) {
+ if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return SDValue();
+ return detectUSatPattern(In, VT);
+}
+
+static SDValue
+combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
+ return SDValue();
+ if (auto USatVal = detectUSatPattern(In, VT))
+ if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
+ return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ return SDValue();
+}
+
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replace this operation with the efficient
/// X86ISD::AVG instruction.
@@ -31327,8 +32765,8 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
if (!BV || !BV->isConstant())
return false;
- for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
+ for (SDValue Op : V->ops()) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return false;
uint64_t Val = C->getZExtValue();
@@ -31403,15 +32841,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// For chips with slow 32-byte unaligned loads, break the 32-byte operation
- // into two 16-byte operations.
+ // into two 16-byte operations. Also split non-temporal aligned loads on
+ // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
unsigned AddressSpace = Ld->getAddressSpace();
unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
- TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
- AddressSpace, Alignment, &Fast) && !Fast) {
+ ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
+ (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
+ AddressSpace, Alignment, &Fast) && !Fast))) {
unsigned NumElems = RegVT.getVectorNumElements();
if (NumElems < 2)
return SDValue();
@@ -31664,7 +33104,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
Mld->getBasePtr(), NewMask, WideSrc0,
Mld->getMemoryVT(), Mld->getMemOperand(),
ISD::NON_EXTLOAD);
- SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+ SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
@@ -31838,6 +33278,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
+ if (SDValue Val =
+ detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
+ return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
+ dl, Val, St->getBasePtr(),
+ St->getMemoryVT(), St->getMemOperand(), DAG);
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElems = VT.getVectorNumElements();
assert(StVT != VT && "Cannot truncate to the same type");
@@ -31975,7 +33421,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
- SDValue NewChain = NewLd.getValue(1);
+ // Make sure the new load is placed in the same chain order.
+ SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
if (TokenFactorIndex >= 0) {
Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
@@ -31996,11 +33443,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
Ld->getPointerInfo().getWithOffset(4),
MinAlign(Ld->getAlignment(), 4),
Ld->getMemOperand()->getFlags());
+ // Make sure the new loads are placed in the same chain order.
+ SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
+ NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
- SDValue NewChain = LoLd.getValue(1);
if (TokenFactorIndex >= 0) {
- Ops.push_back(LoLd);
- Ops.push_back(HiLd);
+ Ops.push_back(NewChain);
NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
}
@@ -32198,13 +33646,30 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
- auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
- // TODO: Add extra cases where we can truncate both inputs for the
- // cost of one (or none).
- // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
+ unsigned TruncSizeInBits = VT.getScalarSizeInBits();
+
+ // Repeated operand, so we are only trading one output truncation for
+ // one input truncation.
if (Op0 == Op1)
return true;
+ // See if either operand has been extended from a smaller/equal size to
+ // the truncation size, allowing a truncation to combine with the extend.
+ unsigned Opcode0 = Op0.getOpcode();
+ if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
+ Opcode0 == ISD::ZERO_EXTEND) &&
+ Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ unsigned Opcode1 = Op1.getOpcode();
+ if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
+ Opcode1 == ISD::ZERO_EXTEND) &&
+ Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+ return true;
+
+ // See if either operand is a single-use constant which can be constant
+ // folded.
SDValue BC0 = peekThroughOneUseBitcasts(Op0);
SDValue BC1 = peekThroughOneUseBitcasts(Op1);
return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
@@ -32236,7 +33701,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -32252,7 +33717,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&
- IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ IsRepeatedOpOrFreeTruncation(Op0, Op1))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -32458,6 +33923,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
+ // Try to combine truncation with unsigned saturation.
+ if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
+ return Val;
+
// The bitcast source is a direct mmx result.
// Detect bitcasts between i32 to x86mmx
if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
@@ -32494,8 +33963,8 @@ static SDValue isFNEG(SDNode *N) {
SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
unsigned EltBits = Op1.getScalarValueSizeInBits();
- auto isSignBitValue = [&](const ConstantFP *C) {
- return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
+ auto isSignMask = [&](const ConstantFP *C) {
+ return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
};
// There is more than one way to represent the same constant on
@@ -32506,21 +33975,21 @@ static SDValue isFNEG(SDNode *N) {
// We check all variants here.
if (Op1.getOpcode() == X86ISD::VBROADCAST) {
if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
- if (isSignBitValue(cast<ConstantFP>(C)))
+ if (isSignMask(cast<ConstantFP>(C)))
return Op0;
} else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
- if (isSignBitValue(CN->getConstantFPValue()))
+ if (isSignMask(CN->getConstantFPValue()))
return Op0;
} else if (auto *C = getTargetConstantFromNode(Op1)) {
if (C->getType()->isVectorTy()) {
if (auto *SplatV = C->getSplatValue())
- if (isSignBitValue(cast<ConstantFP>(SplatV)))
+ if (isSignMask(cast<ConstantFP>(SplatV)))
return Op0;
} else if (auto *FPConst = dyn_cast<ConstantFP>(C))
- if (isSignBitValue(FPConst))
+ if (isSignMask(FPConst))
return Op0;
}
return SDValue();
@@ -32545,7 +34014,7 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
// use of a constant by performing (-0 - A*B) instead.
// FIXME: Check rounding control flags as well once it becomes available.
if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
- Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
+ Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
Arg.getOperand(1), Zero);
@@ -32800,8 +34269,35 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
// are NaN, the NaN value of Op1 is the result.
- auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
- return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
+ return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
+}
+
+/// Do target-specific dag combines on X86ISD::ANDNP nodes.
+static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ // ANDNP(0, x) -> x
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return N->getOperand(1);
+
+ // ANDNP(x, 0) -> 0
+ if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
+ return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
+
+ EVT VT = N->getValueType(0);
+
+ // Attempt to recursively combine a bitmask ANDNP with shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ SmallVector<int, 1> NonceMask; // Just a placeholder.
+ NonceMask.push_back(0);
+ if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+ /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+ DCI, Subtarget))
+ return SDValue(); // This routine will use CombineTo to replace N.
+ }
+
+ return SDValue();
}
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
@@ -32811,12 +34307,12 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
if (Op1.hasOneUse()) {
unsigned BitWidth = Op1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
- TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+ if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
+ TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
DCI.CommitTargetLoweringOpt(TLO);
}
return SDValue();
@@ -32878,8 +34374,8 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
return SDValue();
bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
- bool NSW = Add->getFlags()->hasNoSignedWrap();
- bool NUW = Add->getFlags()->hasNoUnsignedWrap();
+ bool NSW = Add->getFlags().hasNoSignedWrap();
+ bool NUW = Add->getFlags().hasNoUnsignedWrap();
// We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
// into the 'zext'
@@ -32919,7 +34415,7 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
SDNodeFlags Flags;
Flags.setNoSignedWrap(NSW);
Flags.setNoUnsignedWrap(NUW);
- return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
+ return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
@@ -33065,13 +34561,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (!DCI.isBeforeLegalizeOps()) {
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue AllOnes =
- DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
- return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
+ SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
+ return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
}
return SDValue();
}
+ if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
+ isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
+ // Inverting and sign-extending a boolean is the same as zero-extending and
+ // then subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
+ // efficiently lowered with an LEA or a DEC. This is the same as:
+ // select Bool, 0, -1.
+ // sext (xor Bool, -1) --> sub (zext Bool), 1
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
+ }
+
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
@@ -33212,8 +34717,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Optimize x == -y --> x+y == 0
-/// x != -y --> x+y != 0
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+ assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+ // We're looking for an oversized integer equality comparison, but ignore a
+ // comparison with zero because that gets special treatment in EmitTest().
+ SDValue X = SetCC->getOperand(0);
+ SDValue Y = SetCC->getOperand(1);
+ EVT OpVT = X.getValueType();
+ unsigned OpSize = OpVT.getSizeInBits();
+ if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
+ return SDValue();
+
+ // TODO: Use PXOR + PTEST for SSE4.1 or later?
+ // TODO: Add support for AVX-512.
+ EVT VT = SetCC->getValueType(0);
+ SDLoc DL(SetCC);
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX2())) {
+ EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
+ SDValue VecX = DAG.getBitcast(VecVT, X);
+ SDValue VecY = DAG.getBitcast(VecVT, Y);
+
+ // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
+ // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
+ // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
+ // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
+ // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+ SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
+ SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
+ SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
+ MVT::i32);
+ return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
+ }
+
+ return SDValue();
+}
+
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -33222,21 +34766,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDLoc DL(N);
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
- if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
- LHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+ EVT OpVT = LHS.getValueType();
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
+ if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+ LHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
- if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
- if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
- SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
- RHS.getOperand(1));
- return DAG.getSetCC(DL, N->getValueType(0), addV,
- DAG.getConstant(0, DL, addV.getValueType()), CC);
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+ RHS.hasOneUse()) {
+ SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
+ return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}
+ if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
+ return V;
+ }
+
if (VT.getScalarType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
bool IsSEXT0 =
@@ -33293,56 +34843,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Helper function of performSETCCCombine. It is to materialize "setb reg"
-// as "sbb reg,reg", since it can be extended without zext and produces
-// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
- SelectionDAG &DAG, MVT VT) {
- if (VT == MVT::i8)
- return DAG.getNode(ISD::AND, DL, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS),
- DAG.getConstant(1, DL, VT));
- assert (VT == MVT::i1 && "Unexpected type for SECCC node");
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(X86::COND_B, DL, MVT::i8),
- EFLAGS));
-}
-
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
- if (CC == X86::COND_A) {
- // Try to convert COND_A into COND_B in an attempt to facilitate
- // materializing "setb reg".
- //
- // Do not flip "e > c", where "c" is a constant, because Cmp instruction
- // cannot take an immediate as its first operand.
- //
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
- EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
- return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
- }
- }
-
- // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
- // a zext and produces an all-ones bit which is more useful than 0/1 in some
- // cases.
- if (CC == X86::COND_B)
- return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
-
// Try to simplify the EFLAGS and condition code operands.
if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
return getSETCC(CC, Flags, DL, DAG);
@@ -33352,7 +34859,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
SDValue EFLAGS = N->getOperand(3);
@@ -33512,6 +35018,18 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
+ N->getOperand(0), N->getOperand(1),
+ Flags);
+ }
+
+ return SDValue();
+}
+
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -33535,48 +35053,237 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return DCI.CombineTo(N, Res1, CarryOut);
}
+ if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
+ MVT VT = N->getSimpleValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
+ N->getOperand(0), N->getOperand(1),
+ Flags);
+ }
+
return SDValue();
}
-/// fold (add Y, (sete X, 0)) -> adc 0, Y
-/// (add Y, (setne X, 0)) -> sbb -1, Y
-/// (sub (sete X, 0), Y) -> sbb 0, Y
-/// (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
+/// which is more useful than 0/1 in some cases.
+static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
SDLoc DL(N);
+ // "Condition code B" is also known as "the carry flag" (CF).
+ SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
+ SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
+ MVT VT = N->getSimpleValueType(0);
+ if (VT == MVT::i8)
+ return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
- // Look through ZExts.
- SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
- if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
- return SDValue();
+ assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
+}
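+// Editorial note (not from the patch): SETCC_CARRY lowers to "sbb reg, reg",
+// computing reg - reg - CF = CF ? -1 : 0. The all-ones result extends for
+// free; for an i8 0/1 result we mask with 1 above.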
+
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+ bool IsSub = N->getOpcode() == ISD::SUB;
+ SDValue X = N->getOperand(0);
+ SDValue Y = N->getOperand(1);
+
+ // If this is an add, canonicalize a zext operand to the RHS.
+ // TODO: Incomplete? What if both sides are zexts?
+ if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
+ Y.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(X, Y);
- SDValue SetCC = Ext.getOperand(0);
- if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+ // Look through a one-use zext.
+ bool PeekedThroughZext = false;
+ if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
+ Y = Y.getOperand(0);
+ PeekedThroughZext = true;
+ }
+
+ // If this is an add, canonicalize a setcc operand to the RHS.
+ // TODO: Incomplete? What if both sides are setcc?
+ // TODO: Should we allow peeking through a zext of the other operand?
+ if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
+ Y.getOpcode() != X86ISD::SETCC)
+ std::swap(X, Y);
+
+ if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
return SDValue();
- X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ auto *ConstantX = dyn_cast<ConstantSDNode>(X);
+ if (ConstantX) {
+ if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
+ // This is a complicated way to get -1 or 0 from the carry flag:
+ // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ Y.getOperand(1));
+ }
+
+ if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
+ SDValue EFLAGS = Y->getOperand(1);
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ // Swap the operands of a SUB, and we have the same pattern as above.
+ // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
+ // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ NewEFLAGS);
+ }
+ }
+ }
+
+ if (CC == X86::COND_B) {
+ // X + SETB Z --> X + (mask SBB Z, Z)
+ // X - SETB Z --> X - (mask SBB Z, Z)
+ // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
+ SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+
+ if (CC == X86::COND_A) {
+ SDValue EFLAGS = Y->getOperand(1);
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
+ if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+ SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+ return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+ }
+ }
+
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
- SDValue Cmp = SetCC.getOperand(1);
+ SDValue Cmp = Y.getOperand(1);
if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
!X86::isZeroNode(Cmp.getOperand(1)) ||
!Cmp.getOperand(0).getValueType().isInteger())
return SDValue();
- SDValue CmpOp0 = Cmp.getOperand(0);
- SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
- DAG.getConstant(1, DL, CmpOp0.getValueType()));
+ SDValue Z = Cmp.getOperand(0);
+ EVT ZVT = Z.getValueType();
+
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ if (ConstantX) {
+ // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
+ // fake operands:
+ // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
+ // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
+ if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
+ SDValue Zero = DAG.getConstant(0, DL, ZVT);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1));
+ }
+
+ // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
+ // with fake operands:
+ // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
+ // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
+ if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
+ (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ }
+ }
+
+ // (cmp Z, 1) sets the carry flag if Z is 0.
+ SDValue One = DAG.getConstant(1, DL, ZVT);
+ SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+
+ // Add the flags type for ADC/SBB nodes.
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
- SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+ // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
+ // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
- NewCmp);
- return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
- DL, OtherVal.getValueType(), OtherVal,
- DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
+ DAG.getConstant(-1ULL, DL, VT), Cmp1);
+
+ // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
+ // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
+ DAG.getConstant(0, DL, VT), Cmp1);
+}
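+// Editorial worked check (not from the patch) of the final COND_NE case:
+// (cmp Z, 1) sets CF iff Z == 0, and (sbb X, -1) computes X - (-1) - CF =
+// X + 1 - CF. So Z == 0 yields X and Z != 0 yields X + 1, which is exactly
+// X + (Z != 0).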
+
+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue MulOp = N->getOperand(0);
+ SDValue Phi = N->getOperand(1);
+
+ if (MulOp.getOpcode() != ISD::MUL)
+ std::swap(MulOp, Phi);
+ if (MulOp.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ ShrinkMode Mode;
+ if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ unsigned RegSize = 128;
+ if (Subtarget.hasBWI())
+ RegSize = 512;
+ else if (Subtarget.hasAVX2())
+ RegSize = 256;
+ unsigned VectorSize = VT.getVectorNumElements() * 16;
+ // If the vector size is less than 128, or greater than the supported RegSize,
+ // do not use PMADD.
+ if (VectorSize < 128 || VectorSize > RegSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements());
+ EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ VT.getVectorNumElements() / 2);
+
+ // Shrink the operands of mul.
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
+ // Madd vector size is half of the original vector size
+ SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+ // Fill the rest of the output with 0
+ SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+ return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
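+// Editorial trace (not from the patch), assuming an AVX2 target and a v16i32
+// reduction add of products of sign-extended i16 values: VectorSize is
+// 16 * 16 == 256 <= RegSize, the mul operands are truncated to v16i16,
+// VPMADDWD produces a v8i32, and the concat with zero restores v16i32 before
+// adding the phi.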
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
@@ -33650,12 +35357,41 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
+/// Convert vector increment or decrement to sub/add with an all-ones constant:
+/// add X, <1, 1...> --> sub X, <-1, -1...>
+/// sub X, <1, 1...> --> add X, <-1, -1...>
+/// The all-ones vector constant can be materialized using a pcmpeq instruction
+/// that is commonly recognized as an idiom (has no register dependency), so
+/// that's better/smaller than loading a splat 1 constant.
+static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
+ assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ "Unexpected opcode for increment/decrement transform");
+
+ // Pseudo-legality check: getOnesVector() expects one of these types, so bail
+ // out and wait for legalization if we have an unsupported vector length.
+ EVT VT = N->getValueType(0);
+ if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+ return SDValue();
+
+ SDNode *N1 = N->getOperand(1).getNode();
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(N1, SplatVal, /*AllowShrink*/false) ||
+ !SplatVal.isOneValue())
+ return SDValue();
+
+ SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
+}
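+// Editorial example (not from the patch):
+//   add v4i32 X, <1,1,1,1>  -->  sub v4i32 X, <-1,-1,-1,-1>
+// where the all-ones operand is typically materialized as "pcmpeqd xmm, xmm",
+// avoiding a constant-pool load of the splat-1 vector.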
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
- if (Flags->hasVectorReduction()) {
+ const SDNodeFlags Flags = N->getFlags();
+ if (Flags.hasVectorReduction()) {
if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
return Sad;
+ if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+ return MAdd;
}
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
@@ -33667,7 +35403,10 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, true))
return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
@@ -33700,36 +35439,47 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
isHorizontalBinOp(Op0, Op1, false))
return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
- return OptimizeConditionalInDecrement(N, DAG);
+ if (SDValue V = combineIncDecVector(N, DAG))
+ return V;
+
+ return combineAddOrSubToADCOrSBB(N, DAG);
}
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
SDLoc DL(N);
unsigned Opcode = N->getOpcode();
MVT VT = N->getSimpleValueType(0);
MVT SVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
SDValue Op = N->getOperand(0);
MVT OpVT = Op.getSimpleValueType();
MVT OpEltVT = OpVT.getVectorElementType();
- unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+ unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
+ unsigned InputBits = OpEltSizeInBits * NumElts;
// Perform any constant folding.
// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
- if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
- unsigned NumDstElts = VT.getVectorNumElements();
- SmallBitVector Undefs(NumDstElts, false);
- SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
- for (unsigned i = 0; i != NumDstElts; ++i) {
- SDValue OpElt = Op.getOperand(i);
- if (OpElt.getOpcode() == ISD::UNDEF) {
- Undefs[i] = true;
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
+ APInt Undefs(NumElts, 0);
+ SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
+ bool IsZEXT =
+ (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (UndefElts[i]) {
+ Undefs.setBit(i);
continue;
}
- APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
- Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
- : Cst.sextOrTrunc(SVT.getSizeInBits());
+ Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
+ : EltBits[i].sextOrTrunc(EltSizeInBits);
}
return getConstVector(Vals, Undefs, VT, DAG, DL);
}
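On a single element, the fold above reduces to ordinary zero- vs. sign-extension of a constant. A standalone C++ illustration (not LLVM code):

#include <cstdint>
#include <cstdio>

int main() {
  // Extending the i8 constant -3 (bit pattern 0xFD) to i32:
  // VZEXT / ZERO_EXTEND_VECTOR_INREG zero-fill the new high bits, while
  // VSEXT / SIGN_EXTEND_VECTOR_INREG replicate the sign bit.
  int8_t c = -3;
  uint32_t zext = (uint32_t)(uint8_t)c; // 0x000000FD
  int32_t sext = (int32_t)c;            // 0xFFFFFFFD
  printf("zext: 0x%08X  sext: 0x%08X\n", zext, (uint32_t)sext);
  return 0;
}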
@@ -33829,7 +35579,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
if (N->getOperand(0) == N->getOperand(1)) {
if (N->getOpcode() == X86ISD::PCMPEQ)
- return getOnesVector(VT, Subtarget, DAG, DL);
+ return getOnesVector(VT, DAG, DL);
if (N->getOpcode() == X86ISD::PCMPGT)
return getZeroVector(VT, Subtarget, DAG, DL);
}
@@ -33837,6 +35587,99 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
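Both self-compare folds are identities: x == x is always true and x > x is always false, so the compare collapses to a constant mask. A standalone check over one signed 8-bit lane (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  // PCMPEQ x,x yields all-ones per lane; PCMPGT x,x yields all-zeroes.
  for (int i = -128; i < 128; ++i) {
    int8_t x = (int8_t)i;
    uint8_t eq_mask = (x == x) ? 0xFF : 0x00;
    uint8_t gt_mask = (x > x) ? 0xFF : 0x00;
    assert(eq_mask == 0xFF && gt_mask == 0x00);
  }
  return 0;
}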
+static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ MVT OpVT = N->getSimpleValueType(0);
+ MVT SubVecVT = SubVec.getSimpleValueType();
+
+ // If this is an insert of an extract, combine to a shuffle. Don't do this
+ // if the insert or extract can be represented with a subvector operation.
+ if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ SubVec.getOperand(0).getSimpleValueType() == OpVT &&
+ (IdxVal != 0 || !Vec.isUndef())) {
+ int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
+ if (ExtIdxVal != 0) {
+ int VecNumElts = OpVT.getVectorNumElements();
+ int SubVecNumElts = SubVecVT.getVectorNumElements();
+ SmallVector<int, 64> Mask(VecNumElts);
+ // First create an identity shuffle mask.
+ for (int i = 0; i != VecNumElts; ++i)
+ Mask[i] = i;
+ // Now insert the extracted portion.
+ for (int i = 0; i != SubVecNumElts; ++i)
+ Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
+
+ return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+ }
+ }
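For a concrete feel of the mask built above, take (invented example values) OpVT = v8i32, SubVecVT = v4i32, IdxVal = 4, ExtIdxVal = 4. A standalone C++ sketch reproducing the two loops:

#include <cstdio>
#include <vector>

int main() {
  // Shuffle lanes 0..VecNumElts-1 select from the first operand; lanes
  // VecNumElts and up select from the second operand, hence the
  // +VecNumElts offset on the redirected lanes.
  int VecNumElts = 8, SubVecNumElts = 4, IdxVal = 4, ExtIdxVal = 4;
  std::vector<int> Mask(VecNumElts);
  for (int i = 0; i != VecNumElts; ++i)
    Mask[i] = i; // identity over the first operand
  for (int i = 0; i != SubVecNumElts; ++i)
    Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; // lanes from the extract
  for (int m : Mask)
    printf("%d ", m); // prints: 0 1 2 3 12 13 14 15
  printf("\n");
  return 0;
}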
+
+ // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+ // load:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr + 16), Elts/2)
+ // --> load32 addr
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr + 32), Elts/2)
+ // --> load64 addr
+ // or a 16-byte or 32-byte broadcast:
+ // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+ // (load16 addr), Elts/2)
+ // --> X86SubVBroadcast(load16 addr)
+ // or:
+ // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+ // (load32 addr), Elts/2)
+ // --> X86SubVBroadcast(load32 addr)
+ if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+ Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
+ auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+ if (Idx2 && Idx2->getZExtValue() == 0) {
+ SDValue SubVec2 = Vec.getOperand(1);
+ // If needed, look through bitcasts to get to the load.
+ if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
+ bool Fast;
+ unsigned Alignment = FirstLd->getAlignment();
+ unsigned AS = FirstLd->getAddressSpace();
+ const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+ if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+ OpVT, AS, Alignment, &Fast) && Fast) {
+ SDValue Ops[] = {SubVec2, SubVec};
+ if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
+ Subtarget, false))
+ return Ld;
+ }
+ }
+ // If the lower and upper loads are the same, and they are the load's only
+ // users, lower to a VBROADCASTF128/VBROADCASTI128/etc.
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+ if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+ SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+ }
+ }
+ // If this is a subv_broadcast inserted into both halves, use a single
+ // larger subv_broadcast.
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+ return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+ SubVec.getOperand(0));
+ }
+ }
+ }
+
+ return SDValue();
+}
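Semantically, the load fold rests on the observation below (standalone C++, illustrative only): two adjacent half-width loads placed into the low and high halves equal one wide load, provided the wide access is legal (which the combine checks via allowsMemoryAccess) and nothing writes those bytes in between.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint8_t mem[32];
  for (int i = 0; i < 32; ++i)
    mem[i] = (uint8_t)i;

  uint8_t wide[32], halves[32];
  memcpy(wide, mem, 32);             // load32 addr
  memcpy(halves, mem, 16);           // load16 addr      -> low half
  memcpy(halves + 16, mem + 16, 16); // load16 addr + 16 -> high half
  assert(memcmp(wide, halves, 32) == 0);
  return 0;
}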
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -33845,13 +35688,19 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+ case X86ISD::PEXTRW:
+ case X86ISD::PEXTRB:
+ return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_SUBVECTOR:
+ return combineInsertSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
- case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
+ case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
case ISD::ADD: return combineAdd(N, DAG, Subtarget);
case ISD::SUB: return combineSub(N, DAG, Subtarget);
+ case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL:
@@ -33870,6 +35719,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
+ case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
@@ -33884,14 +35734,22 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
- case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
- case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
+ case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
+ case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
case X86ISD::VSHLI:
- case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
case X86ISD::VSEXT:
case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -33969,14 +35827,21 @@ bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
-bool X86TargetLowering::hasCopyImplyingStackAdjustment(
- MachineFunction *MF) const {
- const MachineRegisterInfo &MRI = MF->getRegInfo();
-
+static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
return any_of(MRI.reg_instructions(X86::EFLAGS),
[](const MachineInstr &RI) { return RI.isCopy(); });
}
+void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
+ if (hasCopyImplyingStackAdjustment(MF)) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setHasCopyImplyingStackAdjustment(true);
+ }
+
+ TargetLoweringBase::finalizeLowering(MF);
+}
+
/// This method queries the target whether it is beneficial for the dag combiner to
/// promote the specified node. If true, it should return the desired promotion
/// type by reference.
@@ -34217,6 +36082,7 @@ TargetLowering::ConstraintWeight
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+ LLVM_FALLTHROUGH;
case 'R':
case 'q':
case 'Q':
@@ -34568,6 +36434,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
+ LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
if (VT == MVT::i32 || VT == MVT::f32)
@@ -34737,7 +36604,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
// turn into {ax},{dx}.
// MVT::Other is used to specify clobber names.
- if (Res.second->hasType(VT) || VT == MVT::Other)
+ if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
return Res; // Correct type already, nothing to do.
// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
@@ -34775,11 +36642,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
Res.second = &X86::FR32RegClass;
else if (VT == MVT::f64 || VT == MVT::i64)
Res.second = &X86::FR64RegClass;
- else if (X86::VR128RegClass.hasType(VT))
+ else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
Res.second = &X86::VR128RegClass;
- else if (X86::VR256RegClass.hasType(VT))
+ else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
Res.second = &X86::VR256RegClass;
- else if (X86::VR512RegClass.hasType(VT))
+ else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
Res.second = &X86::VR512RegClass;
else {
// Type mismatch and not a clobber: Return an error;
@@ -34819,7 +36686,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
-bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on x86 is expensive. However, when aggressively optimizing
// for code size, we prefer to use a div instruction, as it is usually smaller
// than the alternative sequence.
@@ -34827,8 +36694,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
- Attribute::MinSize);
+ bool OptSize =
+ Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -34884,3 +36751,22 @@ void X86TargetLowering::insertCopiesSplitCSR(
bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+
+/// Returns the name of the symbol used to emit stack probes, or the empty
+/// string if not applicable.
+StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+ // If the function specifically requests stack probes, emit them.
+ if (MF.getFunction()->hasFnAttribute("probe-stack"))
+ return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
+
+ // Generally, if we aren't on Windows, the platform ABI does not include
+ // support for stack probes, so don't emit them.
+ if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
+ return "";
+
+ // We need a stack probe to conform to the Windows ABI. Choose the right
+ // symbol.
+ if (Subtarget.is64Bit())
+ return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
+ return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
+}
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
index 37f9353..dbbc2bb 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h
@@ -149,8 +149,7 @@ namespace llvm {
WrapperRIP,
/// Copies a 64-bit value from the low word of an XMM vector
- /// to an MMX vector. If you think this is too close to the previous
- /// mnemonic, so do I; blame Intel.
+ /// to an MMX vector.
MOVDQ2Q,
/// Copies a 32-bit value from the low word of a MMX
@@ -179,7 +178,7 @@ namespace llvm {
/// Insert the lower 16-bits of a 32-bit value to a vector,
/// corresponds to X86::PINSRW.
- PINSRW, MMX_PINSRW,
+ PINSRW,
/// Shuffle 16 8-bit values within a vector.
PSHUFB,
@@ -195,21 +194,21 @@ namespace llvm {
/// Blend where the selector is an immediate.
BLENDI,
- /// Blend where the condition has been shrunk.
- /// This is used to emphasize that the condition mask is
- /// no more valid for generic VSELECT optimizations.
+ /// Dynamic (non-constant condition) vector blend where only the sign bits
+ /// of the condition elements are used. This is used to enforce that the
+ /// condition mask is not valid for generic VSELECT optimizations.
SHRUNKBLEND,
/// Combined add and sub on an FP vector.
ADDSUB,
// FP vector ops with rounding mode.
- FADD_RND,
- FSUB_RND,
- FMUL_RND,
- FDIV_RND,
- FMAX_RND,
- FMIN_RND,
+ FADD_RND, FADDS_RND,
+ FSUB_RND, FSUBS_RND,
+ FMUL_RND, FMULS_RND,
+ FDIV_RND, FDIVS_RND,
+ FMAX_RND, FMAXS_RND,
+ FMIN_RND, FMINS_RND,
FSQRT_RND, FSQRTS_RND,
// FP vector get exponent.
@@ -239,9 +238,6 @@ namespace llvm {
FHADD,
FHSUB,
- // Integer absolute value
- ABS,
-
// Detect Conflicts Within a Vector
CONFLICT,
@@ -251,6 +247,9 @@ namespace llvm {
/// Commutative FMIN and FMAX.
FMAXC, FMINC,
+ /// Scalar intrinsic floating point max and min.
+ FMAXS, FMINS,
+
/// Floating point reciprocal-sqrt and reciprocal approximation.
/// Note that these typically require refinement
/// in order to obtain suitable precision.
@@ -320,6 +319,9 @@ namespace llvm {
// Vector shift elements by immediate
VSHLI, VSRLI, VSRAI,
+ // Shifts of mask registers.
+ KSHIFTL, KSHIFTR,
+
// Bit rotate by immediate
VROTLI, VROTRI,
@@ -443,8 +445,7 @@ namespace llvm {
// Broadcast subvector to vector.
SUBV_BROADCAST,
- // Insert/Extract vector element.
- VINSERT,
+ // Extract vector element.
VEXTRACT,
/// SSE4A Extraction and Insertion.
@@ -558,6 +559,9 @@ namespace llvm {
// Conversions between float and half-float.
CVTPS2PH, CVTPH2PS,
+ // LWP insert record.
+ LWPINS,
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
@@ -611,7 +615,10 @@ namespace llvm {
// Vector truncating store with unsigned/signed saturation
VTRUNCSTOREUS, VTRUNCSTORES,
// Vector truncating masked store with unsigned/signed saturation
- VMTRUNCSTOREUS, VMTRUNCSTORES
+ VMTRUNCSTOREUS, VMTRUNCSTORES,
+
+ // X86 specific gather
+ MGATHER
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
@@ -686,6 +693,9 @@ namespace llvm {
unsigned getJumpTableEncoding() const override;
bool useSoftFloat() const override;
+ void markLibCallAttributes(MachineFunction *MF, unsigned CC,
+ ArgListTy &Args) const override;
+
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i8;
}
@@ -757,6 +767,19 @@ namespace llvm {
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ // Return true if it is profitable to combine a BUILD_VECTOR to a TRUNCATE
+ // for the given operand and result types.
+ // Example of such a combine:
+ // v4i32 build_vector((extract_elt V, 0),
+ // (extract_elt V, 2),
+ // (extract_elt V, 4),
+ // (extract_elt V, 6))
+ // -->
+ // v4i32 truncate (bitcast V to v4i64)
+ bool isDesirableToCombineBuildVectorToTruncate() const override {
+ return true;
+ }
+
/// Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
@@ -769,10 +792,6 @@ namespace llvm {
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
- /// Return true if the MachineFunction contains a COPY which would imply
- /// HasOpaqueSPAdjustment.
- bool hasCopyImplyingStackAdjustment(MachineFunction *MF) const override;
-
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -806,8 +825,17 @@ namespace llvm {
return false;
}
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
bool hasAndNotCompare(SDValue Y) const override;
+ bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+ return VT.isScalarInteger();
+ }
+
+ /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+ MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -815,13 +843,14 @@ namespace llvm {
/// Determine which of the bits specified in Mask are known to be either
/// zero or one and return them in the KnownZero/KnownOne bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
/// Determine the number of bits in the operation that are sign bits.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const override;
@@ -984,6 +1013,10 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool convertSelectOfConstantsToMath() const override {
+ return true;
+ }
+
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
@@ -1035,10 +1068,12 @@ namespace llvm {
/// \brief Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
- bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
bool supportSwiftError() const override;
+ StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
+
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
/// \brief Lower interleaved load(s) into target specific
@@ -1047,6 +1082,15 @@ namespace llvm {
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
+
+ /// \brief Lower interleaved store(s) into target specific
+ /// instructions/intrinsics.
+ bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ unsigned Factor) const override;
+
+
+ void finalizeLowering(MachineFunction &MF) const override;
+
protected:
std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -1076,7 +1120,8 @@ namespace llvm {
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ SmallVectorImpl<SDValue> &InVals,
+ uint32_t *RegMask) const;
SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::InputArg> &ArgInfo,
const SDLoc &dl, SelectionDAG &DAG,
@@ -1138,12 +1183,11 @@ namespace llvm {
SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) const;
+ SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
@@ -1188,7 +1232,7 @@ namespace llvm {
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
- bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
ISD::NodeType ExtendKind) const override;
@@ -1377,6 +1421,19 @@ namespace llvm {
}
};
+ // X86 specific Gather node.
+ class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode {
+ public:
+ X86MaskedGatherSDNode(unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO)
+ {}
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER;
+ }
+ };
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
diff --git a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
index ba1aede..08b501f 100644
--- a/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
+++ b/contrib/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -38,7 +38,9 @@ multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
}
-multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, bit Commutable = 0,
+ string Ver = ""> {
+ let isCommutable = Commutable in
def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
@@ -63,25 +65,25 @@ multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
(bitconvert (load_mmx addr:$src))))]>;
}
-defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", 1>;
defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">;
defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">;
-defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd">;
-defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", 1>;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", 1>;
defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">;
defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">;
defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">;
-defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", 1>;
defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">;
defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
-defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub">;
-defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", 1>;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", 1>;
defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">;
-defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", 1>;
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
@@ -98,6 +100,6 @@ def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
// "3DNowA" instructions
defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
-defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
-defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", 0, "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", 0, "a">;
defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
index 230d170..0ae960e 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -31,15 +31,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM");
// The mask VT.
- ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
- "v" # NumElts # "i1"));
-
- // The GPR register class that can hold the write mask. Use GR8 for fewer
- // than 8 elements. Use shift-right and equal to work around the lack of
- // !lt in tablegen.
- RegisterClass MRC =
- !cast<RegisterClass>("GR" #
- !if (!eq (!srl(NumElts, 3), 0), 8, NumElts));
+ ValueType KVT = !cast<ValueType>("v" # NumElts # "i1");
// Suffix used in the instruction mnemonic.
string Suffix = suffix;
@@ -69,6 +61,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
// The corresponding memory operand, e.g. i512mem for VR512.
X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+ // FP scalar memory operand for intrinsics - ssmem/sdmem.
+ Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"),
+ !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
// Load patterns
// Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
@@ -89,6 +84,12 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
+ ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
+ !cast<ComplexPattern>("sse_load_f32"),
+ !if (!eq (EltTypeName, "f64"),
+ !cast<ComplexPattern>("sse_load_f64"),
+ ?));
+
// The corresponding float type, e.g. v16f32 for v16i32
// Note: For EltSize < 32, FloatVT is illegal and TableGen
// fails to compile, so we choose FloatVT = VT
@@ -184,6 +185,20 @@ def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info,
def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info,
v2f64x_info>;
+class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm,
+ ValueType _vt> {
+ RegisterClass KRC = _krc;
+ RegisterClass KRCWM = _krcwm;
+ ValueType KVT = _vt;
+}
+
+def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
+def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
+def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
+def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
+def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
+def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
+
// This multiclass generates the masking variants from the non-masking
// variant. It only provides the assembly pieces for the masking variants.
// It assumes custom ISel patterns for masking which can be provided as
@@ -207,7 +222,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
Pattern, itin>;
// Prefer over VMOV*rrk Pat<>
- let AddedComplexity = 20, isCommutable = IsKCommutable in
+ let isCommutable = IsKCommutable in
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
@@ -219,7 +234,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
// Zero mask does not add any restrictions to commute operands transformation.
// So, it is Ok to use IsCommutable instead of IsKCommutable.
- let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+ let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -250,6 +265,23 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
MaskingConstraint, NoItinerary, IsCommutable,
IsKCommutable>;
+// Similar to AVX512_maskable_common, but with scalar types.
+multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ SDNode Select = vselect,
+ string MaskingConstraint = "",
+ InstrItinClass itin = NoItinerary,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [], [], [],
+ MaskingConstraint, NoItinerary, IsCommutable,
+ IsKCommutable>;
+
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
// preserved vector elements come from a new dummy input operand tied to $dst.
@@ -460,7 +492,7 @@ def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in {
def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
[(set VR128X:$dst, (v4i32 immAllZerosV))]>;
def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
@@ -470,7 +502,7 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in {
def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
[(set FR32X:$dst, fp32imm0)]>;
def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
@@ -484,7 +516,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To
PatFrag vinsert_insert> {
let ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
- (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
+ (ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
@@ -492,7 +524,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
- (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
+ (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
@@ -625,14 +657,14 @@ multiclass vextract_for_size<int Opcode,
// vextract_extract), we are interested only in patterns without a mask;
// the intrinsic patterns are generated below.
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
- (ins From.RC:$src1, i32u8imm:$idx),
+ (ins From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
[(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
(iPTR imm)))]>,
AVX512AIi8Base, EVEX;
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
- (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$idx),
+ (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst|$dst, $src1, $idx}",
[(store (To.VT (vextract_extract:$idx
@@ -642,7 +674,7 @@ multiclass vextract_for_size<int Opcode,
let mayStore = 1, hasSideEffects = 0 in
def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, To.KRCWM:$mask,
- From.RC:$src1, i32u8imm:$idx),
+ From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst {${mask}}|"
"$dst {${mask}}, $src1, $idx}",
@@ -846,32 +878,20 @@ def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
// broadcast with a scalar argument.
multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
-
- let isCodeGenOnly = 1 in {
- def r_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
- (ins SrcInfo.FRC:$src), OpcodeStr#"\t{$src, $dst|$dst, $src}",
- [(set DestInfo.RC:$dst, (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)))]>,
- Requires<[HasAVX512]>, T8PD, EVEX;
-
- let Constraints = "$src0 = $dst" in
- def rk_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
- (ins DestInfo.RC:$src0, DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
- OpcodeStr#"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}",
- [(set DestInfo.RC:$dst,
- (vselect DestInfo.KRCWM:$mask,
- (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- DestInfo.RC:$src0))]>,
- Requires<[HasAVX512]>, T8PD, EVEX, EVEX_K;
-
- def rkz_s : I< opc, MRMSrcReg, (outs DestInfo.RC:$dst),
- (ins DestInfo.KRCWM:$mask, SrcInfo.FRC:$src),
- OpcodeStr#"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
- [(set DestInfo.RC:$dst,
- (vselect DestInfo.KRCWM:$mask,
- (DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- DestInfo.ImmAllZerosV))]>,
- Requires<[HasAVX512]>, T8PD, EVEX, EVEX_KZ;
- } // let isCodeGenOnly = 1 in
+ def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#r)
+ (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#rk)
+ DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
+ (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
+ def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#DestInfo.ZSuffix#rkz)
+ DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
}
multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
@@ -892,7 +912,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
(SrcInfo.VT (scalar_to_vector
(SrcInfo.ScalarLdFrag addr:$src))))),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
- let AddedComplexity = 20 in
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast
(SrcInfo.VT (scalar_to_vector
@@ -900,7 +919,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
DestInfo.RC:$src0)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
- let AddedComplexity = 30 in
def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
(X86VBroadcast
(SrcInfo.VT (scalar_to_vector
@@ -951,39 +969,73 @@ def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
(VBROADCASTSDZm addr:$src)>;
multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
+ SDPatternOperator OpNode,
RegisterClass SrcRC> {
+ let ExeDomain = _.ExeDomain in
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins SrcRC:$src),
"vpbroadcast"##_.Suffix, "$src", "$src",
- (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX;
+ (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX;
+}
+
+multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name,
+ X86VectorVTInfo _, SDPatternOperator OpNode,
+ RegisterClass SrcRC, SubRegIndex Subreg> {
+ let ExeDomain = _.ExeDomain in
+ defm r : AVX512_maskable_custom<opc, MRMSrcReg,
+ (outs _.RC:$dst), (ins GR32:$src),
+ !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
+ !con((ins _.KRCWM:$mask), (ins GR32:$src)),
+ "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
+ "$src0 = $dst">, T8PD, EVEX;
+
+ def : Pat <(_.VT (OpNode SrcRC:$src)),
+ (!cast<Instruction>(Name#r)
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+ def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0),
+ (!cast<Instruction>(Name#rk) _.RC:$src0, _.KRCWM:$mask,
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+
+ def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV),
+ (!cast<Instruction>(Name#rkz) _.KRCWM:$mask,
+ (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+}
+
+multiclass avx512_int_broadcastbw_reg_vl<bits<8> opc, string Name,
+ AVX512VLVectorVTInfo _, SDPatternOperator OpNode,
+ RegisterClass SrcRC, SubRegIndex Subreg, Predicate prd> {
+ let Predicates = [prd] in
+ defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, _.info512, OpNode, SrcRC,
+ Subreg>, EVEX_V512;
+ let Predicates = [prd, HasVLX] in {
+ defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, _.info256, OpNode,
+ SrcRC, Subreg>, EVEX_V256;
+ defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, _.info128, OpNode,
+ SrcRC, Subreg>, EVEX_V128;
+ }
}
multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+ SDPatternOperator OpNode,
RegisterClass SrcRC, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512;
+ defm Z : avx512_int_broadcast_reg<opc, _.info512, OpNode, SrcRC>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256;
- defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128;
+ defm Z256 : avx512_int_broadcast_reg<opc, _.info256, OpNode, SrcRC>, EVEX_V256;
+ defm Z128 : avx512_int_broadcast_reg<opc, _.info128, OpNode, SrcRC>, EVEX_V128;
}
}
-let isCodeGenOnly = 1 in {
-defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR8,
- HasBWI>;
-defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR16,
- HasBWI>;
-}
-let isAsmParserOnly = 1 in {
- defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
- GR32, HasBWI>;
- defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
- GR32, HasBWI>;
-}
-defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
- HasAVX512>;
-defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
- HasAVX512>, VEX_W;
+defm VPBROADCASTBr : avx512_int_broadcastbw_reg_vl<0x7A, "VPBROADCASTBr",
+ avx512vl_i8_info, X86VBroadcast, GR8, sub_8bit, HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcastbw_reg_vl<0x7B, "VPBROADCASTWr",
+ avx512vl_i16_info, X86VBroadcast, GR16, sub_16bit,
+ HasBWI>;
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
+ X86VBroadcast, GR32, HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
+ X86VBroadcast, GR64, HasAVX512>, VEX_W;
def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
(VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
@@ -1035,7 +1087,18 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
AVX5128IBase, EVEX;
}
+let Predicates = [HasAVX512] in {
+ // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
+ def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQZm addr:$src)>;
+}
+
let Predicates = [HasVLX, HasBWI] in {
+ // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQZ128m addr:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQZ256m addr:$src)>;
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -1075,18 +1138,12 @@ def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
- (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8f32 VR256X:$src), 1)>;
def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
(VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4f64 VR256X:$src), 1)>;
def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4i64 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
- (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8i32 VR256X:$src), 1)>;
def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v16i16 VR256X:$src), 1)>;
@@ -1098,46 +1155,6 @@ def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
- (VINSERTF64x4Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-
-def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
}
let Predicates = [HasVLX] in {
@@ -1209,25 +1226,6 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VINSERTF64x4Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-
def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
(VBROADCASTF64X4rm addr:$src)>;
def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
@@ -1265,25 +1263,6 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
(VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8i32 VR256X:$src), 1)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VINSERTF32x8Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VINSERTI32x8Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
@@ -1310,6 +1289,13 @@ defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2",
avx512vl_f32_info, avx512vl_f64_info>;
+let Predicates = [HasVLX] in {
+def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))),
+ (VBROADCASTSSZ256r (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm))>;
+def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))),
+ (VBROADCASTSDZ256r (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>;
+}
+
def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))),
(VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))),
@@ -1604,13 +1590,13 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc)>, EVEX_4V;
+ let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+ (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
@@ -1629,6 +1615,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
+ let mayLoad = 1 in
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
@@ -1667,8 +1654,10 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
}
let Predicates = [HasAVX512] in {
+ let ExeDomain = SSEPackedSingle in
defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
AVX512XSIi8Base;
+ let ExeDomain = SSEPackedDouble in
defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
AVX512XDIi8Base, VEX_W;
}
@@ -1687,6 +1676,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2)))))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ let isCommutable = IsCommutable in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
@@ -1790,17 +1780,217 @@ defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (COPY_TO_REGCLASS (VPCMPGTDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
-def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (COPY_TO_REGCLASS (VPCMPEQDZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
-}
+multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask,
+ _.RC:$src1, _.RC:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))))))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+}
+}
+
+multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds>
+ : avx512_icmp_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)))))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2),
+ NewInf.KRC)>;
+}
+}
+
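These widening patterns rely on a simple bit-level fact: inserting an n-bit compare mask at index 0 of an all-zero wider mask is a zero-extension, so the original compare instruction can be reused with only a register-class copy. A standalone C++ model (illustrative only; the value is invented):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t k8 = 0xA6;               // result of an 8-lane compare, as a k-reg
  uint16_t widened = (uint16_t)k8; // insert_subvector into a zeroed v16i1
  assert((widened & 0xFF) == k8);  // low bits preserved
  assert((widened >> 8) == 0);     // high bits zero
  return 0;
}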
+// VPCMPEQB - i8
+defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPEQW - i16
+defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQWZ", [HasBWI]>;
+
+// VPCMPEQD - i32
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQDZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQDZ", [HasAVX512]>;
+
+// VPCMPEQQ - i64
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm,
+ "VPCMPEQQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm,
+ "VPCMPEQQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpeqm,
+ "VPCMPEQQZ", [HasAVX512]>;
+
+// VPCMPGTB - i8
+defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPGTW - i16
+defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTWZ", [HasBWI]>;
+
+// VPCMPGTD - i32
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTDZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTDZ", [HasAVX512]>;
+
+// VPCMPGTQ - i64
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpgtm,
+ "VPCMPGTQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpgtm,
+ "VPCMPGTQZ", [HasAVX512]>;
+defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpgtm,
+ "VPCMPGTQZ", [HasAVX512]>;
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
X86VectorVTInfo _> {
@@ -1820,6 +2010,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc))],
IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
AVX512ICC:$cc),
@@ -1962,6 +2153,237 @@ defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
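+// The following patterns lower a compare-with-cc whose narrow k-mask result
+// is zero-inserted into a wider mask type: the compare already clears the
+// unused mask bits, so the widening reduces to retagging the result to the
+// wider register class (register, folded-load, and masked forms).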
+multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and _.KRCWM:$mask,
+ (OpNode (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask,
+ _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (_.VT (bitconvert
+ (_.LdFrag addr:$src2))),
+ imm:$cc)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask,
+ _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+}
+
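+// As above, plus broadcast-from-memory (rmib/rmibk) variants for the element
+// widths that support embedded broadcast.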
+multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ SDNode OpNode, string InstrStr,
+ list<Predicate> Preds>
+ : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmib) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (and (_.KVT _.KRCWM:$mask),
+ (_.KVT (OpNode (_.VT _.RC:$src1),
+ (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmibk) _.KRCWM:$mask,
+ _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+}
+
+// VPCMPB - i8
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpm,
+ "VPCMPBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpm,
+ "VPCMPBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpm,
+ "VPCMPBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPW - i16
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpm,
+ "VPCMPWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpm,
+ "VPCMPWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpm,
+ "VPCMPWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpm,
+ "VPCMPWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpm,
+ "VPCMPWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpm,
+ "VPCMPWZ", [HasBWI]>;
+
+// VPCMPD - i32
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpm,
+ "VPCMPDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpm,
+ "VPCMPDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpm,
+ "VPCMPDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpm,
+ "VPCMPDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpm,
+ "VPCMPDZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpm,
+ "VPCMPDZ", [HasAVX512]>;
+
+// VPCMPQ - i64
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpm,
+ "VPCMPQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpm,
+ "VPCMPQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpm,
+ "VPCMPQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpm,
+ "VPCMPQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpm,
+ "VPCMPQZ", [HasAVX512]>;
+
+// VPCMPUB - i8
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpmu,
+ "VPCMPUBZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpmu,
+ "VPCMPUBZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpmu,
+ "VPCMPUBZ256", [HasBWI, HasVLX]>;
+
+// VPCMPUW - i16
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpmu,
+ "VPCMPUWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpmu,
+ "VPCMPUWZ128", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpmu,
+ "VPCMPUWZ128", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpmu,
+ "VPCMPUWZ256", [HasBWI, HasVLX]>;
+defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpmu,
+ "VPCMPUWZ256", [HasBWI, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpmu,
+ "VPCMPUWZ", [HasBWI]>;
+
+// VPCMPUD - i32
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpmu,
+ "VPCMPUDZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpmu,
+ "VPCMPUDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpmu,
+ "VPCMPUDZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpmu,
+ "VPCMPUDZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpmu,
+ "VPCMPUDZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpmu,
+ "VPCMPUDZ", [HasAVX512]>;
+
+// VPCMPUQ - i64
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpmu,
+ "VPCMPUQZ128", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpmu,
+ "VPCMPUQZ256", [HasAVX512, HasVLX]>;
+
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpmu,
+ "VPCMPUQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpmu,
+ "VPCMPUQZ", [HasAVX512]>;
+defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpmu,
+ "VPCMPUQZ", [HasAVX512]>;
+
multiclass avx512_vcmp_common<X86VectorVTInfo _> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
@@ -2052,21 +2474,108 @@ defm VCMPPD : avx512_vcmp<avx512vl_f64_info>,
defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (VCMPPSZrri
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
-def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (VPCMPDZrri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
-def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
- (COPY_TO_REGCLASS (VPCMPUDZrri
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc), VK8)>;
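+// FP analogue of the integer lowering above: a VCMPPS/VCMPPD result
+// zero-inserted into a wider mask is selected directly, with register,
+// folded-load, and broadcast forms.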
+multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ string InstrStr, list<Predicate> Preds> {
+let Predicates = Preds in {
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpm (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpm (_.VT _.RC:$src1),
+ (_.VT (bitconvert (_.LdFrag addr:$src2))),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpm (_.VT _.RC:$src1),
+ (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ imm:$cc)),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbi) _.RC:$src1,
+ addr:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+}
+
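+// SAE variant: additionally match X86cmpmRnd with FROUND_NO_EXC onto the
+// {sae} (rrib) form of the 512-bit compare.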
+multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
+ string InstrStr, list<Predicate> Preds>
+ : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> {
+
+let Predicates = Preds in
+ def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
+ (_.KVT (X86cmpmRnd (_.VT _.RC:$src1),
+ (_.VT _.RC:$src2),
+ imm:$cc,
+ (i32 FROUND_NO_EXC))),
+ (i64 0)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1,
+ _.RC:$src2,
+ imm:$cc),
+ NewInf.KRC)>;
+}
+
+// VCMPPS - f32
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v8i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v16i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v32i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v64i1_info, "VCMPPSZ128",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v16i1_info, "VCMPPSZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v32i1_info, "VCMPPSZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v64i1_info, "VCMPPSZ256",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v32i1_info, "VCMPPSZ",
+ [HasAVX512]>;
+defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v64i1_info, "VCMPPSZ",
+ [HasAVX512]>;
+
+// VCMPPD - f64
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v4i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v8i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v16i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v32i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v64i1_info, "VCMPPDZ128",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v8i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v16i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v32i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v64i1_info, "VCMPPDZ256",
+ [HasAVX512, HasVLX]>;
+
+defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v16i1_info, "VCMPPDZ",
+ [HasAVX512]>;
+defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v32i1_info, "VCMPPDZ",
+ [HasAVX512]>;
+defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v64i1_info, "VCMPPDZ",
+ [HasAVX512]>;
// ----------------------------------------------------------------
// FPClass
@@ -2087,22 +2596,20 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2))))], NoItinerary>, EVEX_K;
- let AddedComplexity = 20 in {
- def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set _.KRC:$dst,
- (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)))], NoItinerary>;
- def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
- "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.KRC:$dst,
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))))], NoItinerary>, EVEX_K;
- }
+ (i32 imm:$src2)))], NoItinerary>;
+ def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+ (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ OpcodeStr##_.Suffix##
+ "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [(set _.KRC:$dst,(or _.KRCWM:$mask,
+ (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 imm:$src2))))], NoItinerary>, EVEX_K;
}
}
@@ -2242,28 +2749,26 @@ let Predicates = [HasBWI] in {
// GR from/to mask register
def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
- (COPY_TO_REGCLASS GR16:$src, VK16)>;
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
- (COPY_TO_REGCLASS VK16:$src, GR16)>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
- (COPY_TO_REGCLASS GR8:$src, VK8)>;
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
- (COPY_TO_REGCLASS VK8:$src, GR8)>;
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>;
def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
(KMOVWrk VK16:$src)>;
def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
- (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>;
+ (COPY_TO_REGCLASS VK16:$src, GR32)>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
- (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>;
+ (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>;
def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
(KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
- (i32 (INSERT_SUBREG (IMPLICIT_DEF),
- (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>;
+ (COPY_TO_REGCLASS VK8:$src, GR32)>;
def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
(COPY_TO_REGCLASS GR32:$src, VK32)>;
@@ -2296,20 +2801,20 @@ let Predicates = [HasDQI] in {
let Predicates = [HasAVX512, NoDQI] in {
def : Pat<(store VK1:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(store VK2:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(store VK4:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(store VK8:$src, addr:$dst),
(MOV8mr addr:$dst,
- (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
- sub_8bit))>;
+ (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
+ sub_8bit)))>;
def : Pat<(v8i1 (load addr:$src)),
(COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
@@ -2322,7 +2827,7 @@ let Predicates = [HasAVX512, NoDQI] in {
let Predicates = [HasAVX512] in {
def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst),
(KMOVWmk addr:$dst, VK16:$src)>;
- def : Pat<(i1 (load addr:$src)),
+ def : Pat<(v1i1 (load addr:$src)),
(COPY_TO_REGCLASS (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), VK1)>;
def : Pat<(v16i1 (bitconvert (i16 (load addr:$src)))),
(KMOVWkm addr:$src)>;
@@ -2339,81 +2844,45 @@ let Predicates = [HasBWI] in {
}
let Predicates = [HasAVX512] in {
- def : Pat<(i1 (trunc (i64 GR64:$src))),
- (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
- (i32 1))), VK1)>;
-
- def : Pat<(i1 (trunc (i32 GR32:$src))),
- (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
-
- def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
- (COPY_TO_REGCLASS GR32:$src, VK1)>;
-
- def : Pat<(i1 (trunc (i8 GR8:$src))),
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR8:$src, sub_8bit), (i32 1))),
- VK1)>;
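+  // GPR <-> mask-register copies: scalar_to_vector of a GR32/GR8 and
+  // X86Vextract of element 0 reduce to plain register-class copies, with a
+  // subregister insert/extract for the 8-bit cases.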
+ multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> {
+ def : Pat<(maskVT (scalar_to_vector GR32:$src)),
+ (COPY_TO_REGCLASS GR32:$src, maskRC)>;
- def : Pat<(i1 (trunc (i16 GR16:$src))),
- (COPY_TO_REGCLASS
- (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
- GR16:$src, sub_16bit), (i32 1))),
- VK1)>;
+ def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS maskRC:$src, GR32)>;
- def : Pat<(i32 (zext VK1:$src)),
- (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+ def : Pat<(maskVT (scalar_to_vector GR8:$src)),
+ (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
- def : Pat<(i32 (anyext VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, GR32)>;
+ def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))),
+ (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>;
- def : Pat<(i8 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri8 (KMOVWrk
- (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
-
- def : Pat<(i8 (anyext VK1:$src)),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
+ def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))),
+ (COPY_TO_REGCLASS maskRC:$src, GR32)>;
+ }
- def : Pat<(i64 (zext VK1:$src)),
- (AND64ri8 (SUBREG_TO_REG (i64 0),
- (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+ defm : operation_gpr_mask_copy_lowering<VK1, v1i1>;
+ defm : operation_gpr_mask_copy_lowering<VK2, v2i1>;
+ defm : operation_gpr_mask_copy_lowering<VK4, v4i1>;
+ defm : operation_gpr_mask_copy_lowering<VK8, v8i1>;
+ defm : operation_gpr_mask_copy_lowering<VK16, v16i1>;
+ defm : operation_gpr_mask_copy_lowering<VK32, v32i1>;
+ defm : operation_gpr_mask_copy_lowering<VK64, v64i1>;
- def : Pat<(i64 (anyext VK1:$src)),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_32bit)>;
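+  // A kshiftl/kshiftr pair by 15 isolates bit 0 of the inserted GR8, so
+  // materialize it directly as an AND with 1 followed by a KMOVW.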
+  def : Pat<(X86kshiftr (X86kshiftl (v1i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK1)>;
+  def : Pat<(X86kshiftr (X86kshiftl (v16i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK16)>;
+  def : Pat<(X86kshiftr (X86kshiftl (v8i1 (scalar_to_vector GR8:$src)), (i8 15)), (i8 15)),
+ (COPY_TO_REGCLASS
+ (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+ GR8:$src, sub_8bit), (i32 1))), VK8)>;
- def : Pat<(i16 (zext VK1:$src)),
- (EXTRACT_SUBREG
- (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
- sub_16bit)>;
-
- def : Pat<(i16 (anyext VK1:$src)),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
-}
-def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK16)>;
-def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK8)>;
-def : Pat<(v4i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK4)>;
-def : Pat<(v2i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK2)>;
-def : Pat<(v32i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK32)>;
-def : Pat<(v64i1 (scalar_to_vector VK1:$src)),
- (COPY_TO_REGCLASS VK1:$src, VK64)>;
-
-def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
-def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
-
-def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), (COPY_TO_REGCLASS VK64:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), (COPY_TO_REGCLASS VK32:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK16:$src, (iPTR 0))), (COPY_TO_REGCLASS VK16:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK4:$src, (iPTR 0))), (COPY_TO_REGCLASS VK4:$src, VK1)>;
-def : Pat<(i1 (X86Vextract VK2:$src, (iPTR 0))), (COPY_TO_REGCLASS VK2:$src, VK1)>;
+}
// Mask unary operation
// - KNOT
@@ -2440,15 +2909,6 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
-multiclass avx512_mask_unop_int<string IntName, string InstName> {
- let Predicates = [HasAVX512] in
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
- (i16 GR16:$src)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
- (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
-}
-defm : avx512_mask_unop_int<"knot", "KNOT">;
-
// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
let Predicates = [HasAVX512, NoDQI] in
def : Pat<(vnot VK8:$src),
@@ -2497,21 +2957,6 @@ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
-multiclass avx512_mask_binop_int<string IntName, string InstName> {
- let Predicates = [HasAVX512] in
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
- (i16 GR16:$src1), (i16 GR16:$src2)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
- (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
- (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
-}
-
-defm : avx512_mask_binop_int<"kand", "KAND">;
-defm : avx512_mask_binop_int<"kandn", "KANDN">;
-defm : avx512_mask_binop_int<"kor", "KOR">;
-defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
-defm : avx512_mask_binop_int<"kxor", "KXOR">;
-
multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
Instruction Inst> {
// With AVX512F, 8-bit mask is promoted to 16-bit mask,
@@ -2613,8 +3058,71 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
}
}
-defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>;
-defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>;
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>;
+
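+// Without VLX, lower 256-bit integer compares by widening the operands into
+// 512-bit registers (upper lanes undef); the zero-extended forms use a
+// KSHIFTL/KSHIFTR pair to clear the mask bits the undef upper lanes may have
+// produced.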
+multiclass avx512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> {
+def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrr)
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr)
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (i8 8)), (i8 8))>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (and VK8:$mask,
+ (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk)
+ (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (i8 8)), (i8 8))>;
+}
+
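+// The same widening trick for the compare-with-immediate-cc forms,
+// parameterized by AVX512VLVectorVTInfo so it covers both the FP and integer
+// variants below.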
+multiclass avx512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
+ AVX512VLVectorVTInfo _> {
+def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrri)
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri)
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc),
+ (i8 8)), (i8 8))>;
+
+def : Pat<(insert_subvector (v16i1 immAllZerosV),
+ (v8i1 (and VK8:$mask,
+ (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))),
+ (i64 0)),
+ (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc),
+ (i8 8)), (i8 8))>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+  defm : avx512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD">;
+  defm : avx512_icmp_packed_no_vlx_lowering<X86pcmpeqm, "VPCMPEQD">;
+
+  defm : avx512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", avx512vl_f32_info>;
+  defm : avx512_icmp_packed_cc_no_vlx_lowering<X86cmpm, "VPCMPD", avx512vl_i32_info>;
+  defm : avx512_icmp_packed_cc_no_vlx_lowering<X86cmpmu, "VPCMPUD", avx512vl_i32_info>;
+}
// Mask setting all 0s or 1s
multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
@@ -2625,7 +3133,6 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
}
multiclass avx512_mask_setop_w<PatFrag Val> {
- defm B : avx512_mask_setop<VK8, v8i1, Val>;
defm W : avx512_mask_setop<VK16, v16i1, Val>;
defm D : avx512_mask_setop<VK32, v32i1, Val>;
defm Q : avx512_mask_setop<VK64, v64i1, Val>;
@@ -2639,12 +3146,11 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>;
def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>;
+ def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>;
def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>;
def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>;
def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>;
- def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
- def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
- def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+ def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
}
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
@@ -2656,6 +3162,12 @@ multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subV
def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
(VT (COPY_TO_REGCLASS subRC:$src, RC))>;
}
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>;
+defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>;
defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>;
@@ -2695,12 +3207,12 @@ def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
// Patterns for kmask shift
multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
- def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))),
+ def : Pat<(VT (X86kshiftl RC:$src, (i8 imm:$imm))),
(VT (COPY_TO_REGCLASS
(KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16),
(I8Imm $imm)),
RC))>;
- def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))),
+ def : Pat<(VT (X86kshiftr RC:$src, (i8 imm:$imm))),
(VT (COPY_TO_REGCLASS
(KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16),
(I8Imm $imm)),
@@ -2738,7 +3250,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
[(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
_.ExeDomain>, EVEX;
- let Constraints = "$src0 = $dst" in {
+ let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -2809,22 +3321,22 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
}
multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- PatFrag st_frag, PatFrag mstore> {
+ PatFrag st_frag, PatFrag mstore, string Name> {
let hasSideEffects = 0 in {
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
- [], _.ExeDomain>, EVEX;
+ [], _.ExeDomain>, EVEX, FoldGenData<Name#rr>;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
"${dst} {${mask}}, $src}",
- [], _.ExeDomain>, EVEX, EVEX_K;
+ [], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>;
def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
- [], _.ExeDomain>, EVEX, EVEX_KZ;
+ [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>;
}
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
@@ -2842,80 +3354,92 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ string Name> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
- masked_store_unaligned>, EVEX_V512;
+ masked_store_unaligned, Name#Z>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
- masked_store_unaligned>, EVEX_V256;
+ masked_store_unaligned, Name#Z256>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
- masked_store_unaligned>, EVEX_V128;
+ masked_store_unaligned, Name#Z128>, EVEX_V128;
}
}
multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo _, Predicate prd> {
+ AVX512VLVectorVTInfo _, Predicate prd,
+ string Name> {
let Predicates = [prd] in
defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
- masked_store_aligned512>, EVEX_V512;
+ masked_store_aligned512, Name#Z>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256,
- masked_store_aligned256>, EVEX_V256;
+ masked_store_aligned256, Name#Z256>, EVEX_V256;
defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
- masked_store_aligned128>, EVEX_V128;
+ masked_store_aligned128, Name#Z128>, EVEX_V128;
}
}
defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info,
HasAVX512>,
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info,
- HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
+ HasAVX512, "VMOVAPS">,
+ PS, EVEX_CD8<32, CD8VF>;
defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
HasAVX512>,
avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info,
- HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+ HasAVX512, "VMOVAPD">,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
null_frag>,
- avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>,
+ avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
+ "VMOVUPS">,
PS, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
null_frag>,
- avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
+ avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
+ "VMOVUPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
- HasAVX512>, PD, EVEX_CD8<32, CD8VF>;
+ HasAVX512, "VMOVDQA32">,
+ PD, EVEX_CD8<32, CD8VF>;
defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
HasAVX512>,
avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info,
- HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
+ HasAVX512, "VMOVDQA64">,
+ PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
- avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
- HasBWI>, XD, EVEX_CD8<8, CD8VF>;
+ avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
+ HasBWI, "VMOVDQU8">,
+ XD, EVEX_CD8<8, CD8VF>;
defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
- HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>;
+ HasBWI, "VMOVDQU16">,
+ XD, VEX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
- HasAVX512>, XS, EVEX_CD8<32, CD8VF>;
+ HasAVX512, "VMOVDQU32">,
+ XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
- HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
+ HasAVX512, "VMOVDQU64">,
+ XS, VEX_W, EVEX_CD8<64, CD8VF>;
// Special instructions to help with spilling when we don't have VLX. We need
// to load or store from a ZMM register instead. These are converted in
@@ -3095,8 +3619,8 @@ let Predicates = [HasVLX] in {
def : Pat<(alignedstore256 (v4f64 (extract_subvector
(v8f64 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore (v8f32 (extract_subvector
- (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
+ def : Pat<(alignedstore256 (v8f32 (extract_subvector
+ (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
(VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
def : Pat<(alignedstore256 (v4i64 (extract_subvector
(v8i64 VR512:$src), (iPTR 0))), addr:$dst),
@@ -3160,6 +3684,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src)
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64X:$src))],
@@ -3272,20 +3800,22 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
(scalar_to_vector _.FRC:$src2))))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))),
_.ImmAllZerosV)))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (OpNode _.RC:$src1,
+ (scalar_to_vector _.FRC:$src2))),
(_.VT _.RC:$src0))))],
_.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
let canFoldAsLoad = 1, isReMaterializable = 1 in
@@ -3329,27 +3859,24 @@ multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode,
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT _.FRC:$src2))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
(COPY_TO_REGCLASS _.FRC:$src2, _.RC),
(COPY_TO_REGCLASS GR32:$mask, VK1WM),
- (_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ (_.VT _.RC:$src0), _.FRC:$src1),
_.RC)>;
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
- (_.EltVT (X86selects (i1 (trunc GR32:$mask)),
+ (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT ZeroFP))))))),
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
(COPY_TO_REGCLASS GR32:$mask, VK1WM),
- (_.VT _.RC:$src0),
- (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+ (_.VT _.RC:$src0), _.FRC:$src1),
_.RC)>;
-
}
multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -3359,10 +3886,27 @@ def : Pat<(masked_store addr:$dst, Mask,
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT _.info128.RC:$src),
- (i64 0))),
- (i64 0)))),
+ (iPTR 0))),
+ (iPTR 0)))),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
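+// Variant of the masked scalar-store lowering for masks held in a narrow GPR:
+// the mask is widened into a 32-bit register with INSERT_SUBREG before being
+// copied to VK1WM.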
+multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+def : Pat<(masked_store addr:$dst, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0))),
+ (iPTR 0)))),
(!cast<Instruction>(InstrStr#mrk) addr:$dst,
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
(COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
}
@@ -3374,9 +3918,9 @@ def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (masked_load addr:$srcAddr, Mask,
(_.info512.VT (bitconvert
(v16i32 immAllZerosV))))),
- (i64 0))),
+ (iPTR 0))),
(!cast<Instruction>(InstrStr#rmkz)
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
addr:$srcAddr)>;
def : Pat<(_.info128.VT (extract_subvector
@@ -3384,11 +3928,39 @@ def : Pat<(_.info128.VT (extract_subvector
(_.info512.VT (insert_subvector undef,
(_.info256.VT (insert_subvector undef,
(_.info128.VT (X86vzmovl _.info128.RC:$src)),
- (i64 0))),
- (i64 0))))),
- (i64 0))),
+ (iPTR 0))),
+ (iPTR 0))))),
+ (iPTR 0))),
(!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
- (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
+ (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
+ addr:$srcAddr)>;
+
+}
+
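+// Matching variant for masked scalar loads with a narrow-GPR mask, again
+// widening the mask via INSERT_SUBREG before the copy to VK1WM.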
+multiclass avx512_load_scalar_lowering_subreg<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask, RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (bitconvert
+ (v16i32 immAllZerosV))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask,
+ (_.info512.VT (insert_subvector undef,
+ (_.info256.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
addr:$srcAddr)>;
}
@@ -3398,72 +3970,107 @@ defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
- (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
- (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
- (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
- (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
- VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
(COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
- VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
- (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+ (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
(COPY_TO_REGCLASS VR128X:$src, FR32X))>;
-let hasSideEffects = 0 in
-defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
- (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
- "vmovss.s", "$src2, $src1", "$src1, $src2", []>,
- XS, EVEX_4V, VEX_LIG;
-
-let hasSideEffects = 0 in
-defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
- (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
- "vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
- XD, EVEX_4V, VEX_LIG, VEX_W;
+let hasSideEffects = 0 in {
+ def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, FR32X:$src2),
+ "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], NoItinerary>, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrr">;
+
+let Constraints = "$src0 = $dst" in
+ def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask,
+ VR128X:$src1, FR32X:$src2),
+ "vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrk">;
+
+ def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2),
+ "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrkz">;
+
+ def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, FR64X:$src2),
+ "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W,
+ FoldGenData<"VMOVSDZrr">;
+
+let Constraints = "$src0 = $dst" in
+ def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask,
+ VR128X:$src1, FR64X:$src2),
+ "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
+ "$dst {${mask}}, $src1, $src2}",
+ [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrk">;
+
+ def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
+ (ins f64x_info.KRCWM:$mask, VR128X:$src1,
+ FR64X:$src2),
+ "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
+ "$dst {${mask}} {z}, $src1, $src2}",
+ [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrkz">;
+}
let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128X then do a
// MOVS{S,D} to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
- (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
- (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
- (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
- (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>;
}
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (V_SET0)),
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (V_SET0)),
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4f32 (V_SET0)),
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (v4i32 (V_SET0)),
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), sub_xmm)>;
let AddedComplexity = 20 in {
@@ -3525,11 +4132,11 @@ let Predicates = [HasAVX512] in {
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
(v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+ (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
FR32X:$src)), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
(v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+ (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
FR64X:$src)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
@@ -3538,18 +4145,18 @@ let Predicates = [HasAVX512] in {
// Move low f64 and clear high bits.
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (V_SET0)),
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
(SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (V_SET0)),
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
// Extract and store.
@@ -3582,10 +4189,6 @@ let Predicates = [HasAVX512] in {
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
(VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
- def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
- def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
@@ -3635,6 +4238,8 @@ let Predicates = [HasAVX512] in {
}
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -3669,42 +4274,26 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}
-
-def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//
let SchedRW = [WriteLoad] in {
def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
- SSEPackedInt>, EVEX, T8PD, EVEX_V512,
+ [], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
(ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))],
- SSEPackedInt>, EVEX, T8PD, EVEX_V256,
+ [], SSEPackedInt>, EVEX, T8PD, EVEX_V256,
EVEX_CD8<64, CD8VF>;
def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))],
- SSEPackedInt>, EVEX, T8PD, EVEX_V128,
+ [], SSEPackedInt>, EVEX, T8PD, EVEX_V128,
EVEX_CD8<64, CD8VF>;
}
}
@@ -4150,8 +4739,7 @@ let Predicates = [HasDQI, NoVLX] in {
//===----------------------------------------------------------------------===//
multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
- bit IsCommutable = 0> {
+ X86VectorVTInfo _, bit IsCommutable = 0> {
defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -4159,7 +4747,7 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(bitconvert (_.VT _.RC:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
_.RC:$src2)))),
- itins.rr, IsCommutable>,
+ IIC_SSE_BIT_P_RR, IsCommutable>,
AVX512BIBase, EVEX_4V;
defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4169,14 +4757,13 @@ multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(bitconvert (_.LdFrag addr:$src2)))),
(_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))))),
- itins.rm>,
+ IIC_SSE_BIT_P_RM>,
AVX512BIBase, EVEX_4V;
}
multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, OpndItins itins,
- bit IsCommutable = 0> :
- avx512_logic_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> {
+ X86VectorVTInfo _, bit IsCommutable = 0> :
+ avx512_logic_rm<opc, OpcodeStr, OpNode, _, IsCommutable> {
defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -4189,58 +4776,48 @@ multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(bitconvert
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))))),
- itins.rm>,
+ IIC_SSE_BIT_P_RM>,
AVX512BIBase, EVEX_4V, EVEX_B;
}
multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, OpndItins itins,
- Predicate prd, bit IsCommutable = 0> {
- let Predicates = [prd] in
- defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins,
+ AVX512VLVectorVTInfo VTInfo,
+ bit IsCommutable = 0> {
+ let Predicates = [HasAVX512] in
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
IsCommutable>, EVEX_V512;
- let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins,
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
IsCommutable>, EVEX_V256;
- defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins,
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
IsCommutable>, EVEX_V128;
}
}
multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
- itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>;
+ IsCommutable>, EVEX_CD8<32, CD8VF>;
}
multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
- itins, prd, IsCommutable>,
- VEX_W, EVEX_CD8<64, CD8VF>;
+ IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, OpndItins itins, Predicate prd,
- bit IsCommutable = 0> {
- defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd,
- IsCommutable>;
-
- defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd,
- IsCommutable>;
+ SDNode OpNode, bit IsCommutable = 0> {
+ defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, IsCommutable>;
+ defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, IsCommutable>;
}
-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and,
- SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or,
- SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor,
- SSE_INTALU_ITINS_P, HasAVX512, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp,
- SSE_INTALU_ITINS_P, HasAVX512, 0>;
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp>;
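// With the itinerary and predicate parameters gone (IIC_SSE_BIT_P_RR/RM and
// HasAVX512 are now hardcoded in the multiclasses above), each logic op is a
// one-line instantiation. The VPAND line, for example, expands through the
// D/Q multiclasses and the Z/Z256/Z128 widths into records such as
// VPANDDZrr, VPANDDZrm and VPANDDZrmb.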
//===----------------------------------------------------------------------===//
// AVX-512 FP arithmetic
@@ -4252,16 +4829,16 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT)),
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2,
+ (i32 FROUND_CURRENT))),
itins.rr>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (VecNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT)),
+ (_.VT (VecNode _.RC:$src1,
+ _.ScalarIntMemCPat:$src2,
+ (i32 FROUND_CURRENT))),
itins.rm>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -4291,13 +4868,43 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
EVEX_B, EVEX_RC;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode VecNode, OpndItins itins, bit IsCommutable> {
- let ExeDomain = _.ExeDomain in
+ SDNode OpNode, SDNode VecNode, SDNode SaeNode,
+ OpndItins itins, bit IsCommutable> {
+ let ExeDomain = _.ExeDomain in {
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1, _.RC:$src2)),
+ itins.rr>;
+
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (_.VT (VecNode _.RC:$src1,
+ _.ScalarIntMemCPat:$src2)),
+ itins.rm>;
+
+ let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+ itins.rr> {
+ let isCommutable = IsCommutable;
+ }
+ def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ }
+
defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 FROUND_NO_EXC))>, EVEX_B;
+ }
}
multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4316,31 +4923,29 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode VecNode,
+ SDNode VecNode, SDNode SaeNode,
SizeItins itins, bit IsCommutable> {
- defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
- itins.s, IsCommutable>,
- avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode,
- itins.s, IsCommutable>,
+ defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
+ VecNode, SaeNode, itins.s, IsCommutable>,
XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
- defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
- itins.d, IsCommutable>,
- avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode,
- itins.d, IsCommutable>,
+ defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
+ VecNode, SaeNode, itins.d, IsCommutable>,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>;
-defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>;
-defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>;
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, SSE_ALU_ITINS_S, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, SSE_MUL_ITINS_S, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, SSE_ALU_ITINS_S, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, SSE_DIV_ITINS_S, 0>;
+defm VMIN : avx512_binop_s_sae <0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+ SSE_ALU_ITINS_S, 0>;
+defm VMAX : avx512_binop_s_sae <0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+ SSE_ALU_ITINS_S, 0>;
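// Note on the split above: add/mul/sub/div keep a single rounding node via
// avx512_binop_s_round, while min/max now thread three nodes through
// avx512_binop_s_sae: the plain scalar node (X86fmin) for the FRC forms, the
// intrinsic vector node (X86fmins) for rr/rm_Int, and the {sae} node
// (X86fminRnds) for rrb. MIN/MAX support exception suppression but no
// embedded rounding control, so there is no RC variant for them.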
// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
// X86fminc and X86fmaxc instead of X86fmin and X86fmax.
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
- let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4598,6 +5203,7 @@ let Predicates = [HasVLX,HasDQI] in {
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
@@ -4613,10 +5219,12 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
EVEX_4V, EVEX_B;
+ }
}
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
@@ -4627,6 +5235,7 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1,
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT))>;
+ }
}
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
@@ -4899,6 +5508,33 @@ defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
+// Use the 512-bit VPSRA/VPSRAI version to implement v2i64/v4i64 when VLX is unavailable (NoVLX).
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ VR128X:$src2)), sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ VR128X:$src2)), sub_xmm)>;
+
+ def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ imm:$src2)), sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPSRAQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ imm:$src2)), sub_xmm)>;
+}
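// The widening trick is safe because VPSRAQ operates per element: the narrow
// source lands in the low lanes of an undef ZMM, the 512-bit shift runs on
// all lanes, and only the low lanes are read back. A sketch of what the
// v2i64 register-register pattern above produces:
//   %wide = INSERT_SUBREG (v8i64 IMPLICIT_DEF), %src1, sub_xmm
//   %res  = VPSRAQZrr %wide, %src2
//   ...   = EXTRACT_SUBREG %res, sub_xmm  // upper lanes hold garbage, unused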
+
//===-------------------------------------------------------------------===//
// Variable Bit Shifts
//===-------------------------------------------------------------------===//
@@ -4932,6 +5568,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}
+
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
@@ -4955,12 +5592,13 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
}
// Use the 512-bit version to implement the 128/256-bit forms when VLX is unavailable (NoVLX).
-multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
- let Predicates = [HasBWI, NoVLX] in {
+multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
+ SDNode OpNode, list<Predicate> p> {
+ let Predicates = p in {
def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1),
(_.info256.VT _.info256.RC:$src2))),
(EXTRACT_SUBREG
- (!cast<Instruction>(NAME#"WZrr")
+ (!cast<Instruction>(OpcodeStr#"Zrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
sub_ymm)>;
@@ -4968,13 +5606,12 @@ multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> {
def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1),
(_.info128.VT _.info128.RC:$src2))),
(EXTRACT_SUBREG
- (!cast<Instruction>(NAME#"WZrr")
+ (!cast<Instruction>(OpcodeStr#"Zrr")
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
(INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
sub_xmm)>;
}
}
-
multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
@@ -4990,19 +5627,22 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
}
defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
- avx512_var_shift_w<0x12, "vpsllvw", shl>,
- avx512_var_shift_w_lowering<avx512vl_i16_info, shl>;
+ avx512_var_shift_w<0x12, "vpsllvw", shl>;
defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
- avx512_var_shift_w<0x11, "vpsravw", sra>,
- avx512_var_shift_w_lowering<avx512vl_i16_info, sra>;
+ avx512_var_shift_w<0x11, "vpsravw", sra>;
defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl>,
- avx512_var_shift_w_lowering<avx512vl_i16_info, srl>;
+ avx512_var_shift_w<0x10, "vpsrlvw", srl>;
+
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>;
+
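// The old avx512_var_shift_w_lowering hardcoded the NAME#"WZrr" instruction
// and the [HasBWI, NoVLX] predicates. The generalized
// avx512_var_shift_lowering takes the instruction stem and the predicate
// list explicitly, so the same widening now also covers VPSRAVQ, which only
// needs [HasAVX512, NoVLX], alongside the three HasBWI word shifts.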
+// Special handling for the VPSRAV intrinsics.
multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
list<Predicate> p> {
@@ -5013,7 +5653,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
(!cast<Instruction>(InstrStr#_.ZSuffix##rm)
_.RC:$src1, addr:$src2)>;
- let AddedComplexity = 20 in {
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
@@ -5023,8 +5662,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
_.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
_.KRC:$mask, _.RC:$src1, addr:$src2)>;
- }
- let AddedComplexity = 30 in {
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
@@ -5034,7 +5671,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
_.RC:$src1, addr:$src2)>;
- }
}
}
@@ -5046,14 +5682,12 @@ multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
(X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
_.RC:$src1, addr:$src2)>;
- let AddedComplexity = 20 in
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src2))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
_.KRC:$mask, _.RC:$src1, addr:$src2)>;
- let AddedComplexity = 30 in
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(X86vsrav _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src2))),
@@ -5073,6 +5707,109 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
+
+// Use the 512-bit VPROLV/VPROL versions to implement v2i64/v4i64 and v4i32/v8i32 rotates when VLX is unavailable (NoVLX).
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ sub_ymm)>;
+
+ def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ sub_xmm)>;
+ def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ imm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPROLQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ imm:$src2)), sub_ymm)>;
+
+ def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ imm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPROLDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ imm:$src2)), sub_ymm)>;
+}
+
+// Use the 512-bit VPRORV/VPROR versions to implement v2i64/v4i64 and v4i32/v8i32 rotates when VLX is unavailable (NoVLX).
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORVQZrr
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ sub_ymm)>;
+
+ def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ sub_xmm)>;
+ def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORVDZrr
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ imm:$src2)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v8i64
+ (VPRORQZri
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ imm:$src2)), sub_ymm)>;
+
+ def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+ imm:$src2)), sub_xmm)>;
+ def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))),
+ (EXTRACT_SUBREG (v16i32
+ (VPRORDZri
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ imm:$src2)), sub_ymm)>;
+}
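// The rotate lowerings above reuse the widen-through-ZMM trick from the
// shifts: rotl/rotr select the variable VPROLV*/VPRORV* forms and
// X86vrotli/X86vrotri select the immediate VPROL*/VPROR* forms, always via
// the 512-bit instruction when VLX is unavailable.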
+
//===-------------------------------------------------------------------===//
// 1-src variable permutation VPERMW/D/Q
//===-------------------------------------------------------------------===//
@@ -5251,6 +5988,7 @@ let Predicates = [HasAVX512] in {
//===----------------------------------------------------------------------===//
multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, f64mem:$src2),
!strconcat(OpcodeStr,
@@ -5599,7 +6337,7 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
"$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
+ (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5625,13 +6363,13 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
-
+ let ExeDomain = _.ExeDomain in {
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
                // Operands for the intrinsic are in 123 order to preserve passthru
                // semantics.
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
- (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
+ _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -5641,8 +6379,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
(_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds3 _.RC:$src2,
- (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
_.RC:$src1, (i32 FROUND_CURRENT))),
(_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
@@ -5653,8 +6390,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds1 _.RC:$src1,
- (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+ (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
_.RC:$src2, (i32 FROUND_CURRENT))),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
(i32 imm:$rc))),
@@ -5662,6 +6398,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
_.FRC:$src2))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
+ }
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
@@ -5692,6 +6429,7 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
let Constraints = "$src1 = $dst" in {
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -5711,6 +6449,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode _.RC:$src1,
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
AVX512FMA3Base, EVEX_B;
+ }
}
} // Constraints = "$src1 = $dst"
@@ -5878,10 +6617,10 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
[(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
EVEX, VEX_LIG, EVEX_B, EVEX_RC;
- def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+ def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
- (SrcVT.VT (scalar_to_vector (SrcVT.ScalarLdFrag addr:$src))),
+ (SrcVT.VT SrcVT.ScalarIntMemCPat:$src),
(i32 FROUND_CURRENT)))]>,
EVEX, VEX_LIG;
} // Predicates = [HasAVX512]
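// The rm form above now takes _.IntScalarMemOp and matches through
// _.ScalarIntMemCPat instead of wrapping a plain scalar load in
// scalar_to_vector. The underlying complex pattern can also match the low
// element of a wider vector load, so more loads fold into the intrinsic
// memory forms.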
@@ -5918,20 +6657,20 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
(VCVTSS2SIZrr VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))),
- (VCVTSS2SIZrm addr:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)),
+ (VCVTSS2SIZrm sse_load_f32:$src)>;
def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
(VCVTSS2SI64Zrr VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))),
- (VCVTSS2SI64Zrm addr:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)),
+ (VCVTSS2SI64Zrm sse_load_f32:$src)>;
def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
(VCVTSD2SIZrr VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))),
- (VCVTSD2SIZrm addr:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)),
+ (VCVTSD2SIZrm sse_load_f64:$src)>;
def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
(VCVTSD2SI64Zrr VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))),
- (VCVTSD2SI64Zrm addr:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)),
+ (VCVTSD2SI64Zrm sse_load_f64:$src)>;
} // HasAVX512
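// sse_load_f32/sse_load_f64 are ComplexPatterns, so binding them directly as
// sse_load_f32:$src (rather than wrapping addr:$src) lets the matcher carry
// the full set of memory operands the complex pattern selects. The
// truncating cvtt* patterns below follow the same scheme, spelling the
// result operands as ssmem/sdmem explicitly.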
let Predicates = [HasAVX512] in {
@@ -6018,7 +6757,7 @@ let Predicates = [HasAVX512] in {
EVEX,VEX_LIG , EVEX_B;
let mayLoad = 1, hasSideEffects = 0 in
def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
- (ins _SrcRC.MemOp:$src),
+ (ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[]>, EVEX, VEX_LIG;
@@ -6055,47 +6794,58 @@ defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
(VCVTTSS2SIZrr_Int VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))),
- (VCVTTSS2SIZrm_Int addr:$src)>;
+ def : Pat<(i32 (int_x86_sse_cvttss2si sse_load_f32:$src)),
+ (VCVTTSS2SIZrm_Int ssmem:$src)>;
def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
(VCVTTSS2SI64Zrr_Int VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))),
- (VCVTTSS2SI64Zrm_Int addr:$src)>;
+ def : Pat<(i64 (int_x86_sse_cvttss2si64 sse_load_f32:$src)),
+ (VCVTTSS2SI64Zrm_Int ssmem:$src)>;
def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
(VCVTTSD2SIZrr_Int VR128X:$src)>;
- def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))),
- (VCVTTSD2SIZrm_Int addr:$src)>;
+ def : Pat<(i32 (int_x86_sse2_cvttsd2si sse_load_f64:$src)),
+ (VCVTTSD2SIZrm_Int sdmem:$src)>;
def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
(VCVTTSD2SI64Zrr_Int VR128X:$src)>;
- def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))),
- (VCVTTSD2SI64Zrm_Int addr:$src)>;
+ def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)),
+ (VCVTTSD2SI64Zrm_Int sdmem:$src)>;
} // HasAVX512
//===----------------------------------------------------------------------===//
// AVX-512 Convert form float to double and back
//===----------------------------------------------------------------------===//
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode> {
- defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
(i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
- defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr,
+ defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT (scalar_to_vector
- (_Src.ScalarLdFrag addr:$src2))),
+ (_Src.VT _Src.ScalarIntMemCPat:$src2),
(i32 FROUND_CURRENT)))>,
EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _Src.FRC:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ let mayLoad = 1 in
+ def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ }
}
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
- defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
@@ -6107,7 +6857,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
// Scalar Conversion with rounding control (RC)
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd> {
- defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
@@ -6140,39 +6890,36 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
X86fpextRnd,f32x_info, f64x_info >;
def : Pat<(f64 (fpextend FR32X:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
- (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+ (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
- (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (extloadf32 addr:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+ (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512, OptForSize]>;
def : Pat<(f64 (extloadf32 addr:$src)),
- (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
- (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+ (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
Requires<[HasAVX512, OptForSpeed]>;
def : Pat<(f32 (fpround FR64X:$src)),
- (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
- (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+ (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector
(f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
- (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>,
+ (VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector
(f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
- (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>,
+ (VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
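// The scalar converts are split above into _Int forms that stay in VR128X
// (used by the X86Movss/X86Movsd merge patterns, which must preserve the
// upper elements) and isCodeGenOnly rr/rm forms on FR32X/FR64X. The FRC
// forms let the plain fpextend/fpround patterns avoid the earlier
// COPY_TO_REGCLASS round trips through VR128X.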
//===----------------------------------------------------------------------===//
@@ -6808,7 +7555,7 @@ let Predicates = [HasAVX512] in {
let Predicates = [HasVLX] in {
defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
}
}
@@ -6917,7 +7664,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
- let AddedComplexity = 20 , Predicates = [HasAVX512] in {
+ let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -6942,6 +7689,7 @@ defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
@@ -6955,6 +7703,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
EVEX, T8PD, EVEX_B;
+ }
}
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -6986,7 +7735,7 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode> {
-
+ let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -7005,6 +7754,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
(i32 FROUND_CURRENT))>;
+ }
}
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -7024,7 +7774,7 @@ defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode> {
-
+ let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
@@ -7041,9 +7791,11 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
(i32 FROUND_CURRENT))>, EVEX_B;
+ }
}
multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode> {
+ let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
@@ -7084,6 +7836,7 @@ defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
SDNode OpNodeRnd, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
(_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
@@ -7092,6 +7845,7 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _>{
+ let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.FloatVT (OpNode _.RC:$src))>, EVEX;
@@ -7106,6 +7860,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
EVEX, EVEX_B;
+ }
}
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
@@ -7143,7 +7898,7 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
-
+ let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -7176,6 +7931,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
}
+ }
def : Pat<(_.EltVT (OpNode _.FRC:$src)),
(!cast<Instruction>(NAME#SUFF#Zr)
@@ -7238,13 +7994,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x1))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>;
def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x2))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x3))), _.FRC)>;
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>;
def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
(_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
@@ -7254,13 +8010,13 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x1))), _.FRC)>;
+ addr:$src, (i32 0x9))), _.FRC)>;
def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x2))), _.FRC)>;
+ addr:$src, (i32 0xa))), _.FRC)>;
def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x3))), _.FRC)>;
+ addr:$src, (i32 0xb))), _.FRC)>;
def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
(_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
addr:$src, (i32 0x4))), _.FRC)>;
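// The immediates follow the ROUNDSS/VRNDSCALE encoding: imm8[1:0] is the
// rounding mode (01 down, 10 up, 11 toward zero), imm8[2] selects MXCSR.RC
// instead, imm8[3] suppresses the precision (inexact) exception, and
// imm8[7:4] is the rndscale scale factor M, zero here. ffloor/fceil/ftrunc
// must not raise FP exceptions, hence 0x9/0xA/0xB rather than 0x1/0x2/0x3;
// frint keeps 0x4 since it may raise inexact.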
@@ -7480,11 +8236,11 @@ multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasBWI] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info,
- v16i8x_info, i64mem, LdFrag, OpNode>,
+ v16i8x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info,
@@ -7499,11 +8255,11 @@ multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
- v16i8x_info, i32mem, LdFrag, OpNode>,
+ v16i8x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
@@ -7518,11 +8274,11 @@ multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
- v16i8x_info, i16mem, LdFrag, OpNode>,
+ v16i8x_info, i16mem, LdFrag, InVecNode>,
EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7537,11 +8293,11 @@ multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
- v8i16x_info, i64mem, LdFrag, OpNode>,
+ v8i16x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
@@ -7556,11 +8312,11 @@ multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
- v8i16x_info, i32mem, LdFrag, OpNode>,
+ v8i16x_info, i32mem, LdFrag, InVecNode>,
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7575,12 +8331,12 @@ multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
}
multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode,
+ SDPatternOperator OpNode, SDPatternOperator InVecNode,
string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
let Predicates = [HasVLX, HasAVX512] in {
defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
- v4i32x_info, i64mem, LdFrag, OpNode>,
+ v4i32x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7594,19 +8350,19 @@ multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
}
}
-defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">;
-defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">;
-defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">;
-defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">;
-defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">;
-defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">;
+defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">;
+defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">;
+defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">;
+defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">;
+defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">;
+defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">;
-defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">;
-defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">;
-defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">;
-defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">;
-defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">;
-defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
+defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">;
+defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">;
+defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">;
+defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">;
+defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">;
+defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">;
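// Each extend multiclass now takes two nodes: OpNode (X86vzext/X86vsext) for
// the full-width register forms and InVecNode (zext_invec/sext_invec) for
// the 128-bit forms, whose result consumes only the low subvector of the
// v16i8/v8i16/v4i32 source. Keying the narrow and load-folding patterns on
// the *_invec nodes matches how the DAG represents extends of a partial
// vector; the AVX512_pmovx_patterns block below makes the same change.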
// EXTLOAD patterns, implemented using vpmovzx.
multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
@@ -7649,69 +8405,69 @@ let Predicates = [HasAVX512] in {
defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
}
-multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy,
- SDNode ExtOp, PatFrag ExtLoad16> {
+multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
+ SDNode InVecOp, PatFrag ExtLoad16> {
// 128-bit patterns
let Predicates = [HasVLX, HasBWI] in {
- def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
- def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
// 256-bit patterns
@@ -7790,8 +8546,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, string ExtTy,
}
}
-defm : AVX512_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
-defm : AVX512_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>;
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
@@ -7832,7 +8588,7 @@ multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
mgatherv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz512mem,
+ defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256xmem,
mgatherv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
@@ -7842,7 +8598,7 @@ let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mgatherv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mgatherv2i64>, EVEX_V128;
+ vx64xmem, X86mgatherv2i64>, EVEX_V128;
}
}
@@ -7889,7 +8645,7 @@ multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
mscatterv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz512mem,
+ defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256xmem,
mscatterv8i64>, EVEX_V512;
let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
@@ -7922,7 +8678,7 @@ defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7934,7 +8690,7 @@ defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7946,7 +8702,7 @@ defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7958,7 +8714,7 @@ defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps
VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
- VK8WM, vz512mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+ VK8WM, vz256xmem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
VK8WM, vy512mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
@@ -7982,6 +8738,17 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
[(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
}
+// Use the 512-bit version to implement the 128/256-bit forms when VLX is unavailable (NoVLX).
+multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info,
+ X86VectorVTInfo _> {
+
+ def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))),
+ (X86Info.VT (EXTRACT_SUBREG
+ (_.VT (!cast<Instruction>(NAME#"Zrr")
+ (_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))),
+ X86Info.SubRegIdx))>;
+}
+
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
string OpcodeStr, Predicate prd> {
let Predicates = [prd] in
@@ -7991,20 +8758,17 @@ let Predicates = [prd] in
defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
}
-}
+let Predicates = [prd, NoVLX] in {
+ defm Z256_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info256,VTInfo.info512>;
+ defm Z128_Alt : avx512_convert_mask_to_vector_lowering<VTInfo.info128,VTInfo.info512>;
+ }
-multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
- defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr,
- HasBWI>;
- defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr,
- HasBWI>, VEX_W;
- defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
- HasDQI>;
- defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
- HasDQI>, VEX_W;
}
-defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
+defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>;
+defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W;
+defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>;
+defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W;
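// The avx512_convert_mask_to_vector wrapper is flattened into explicit
// defms, presumably so that NAME inside cvt_mask_by_elt_width is the full
// instruction stem (e.g. VPMOVM2B) and the !cast<Instruction>(NAME#"Zrr")
// lookup in the NoVLX fallback resolves. The fallback widens the mask: copy
// the narrow mask register into the 512-bit mask class, sign-extend with the
// 512-bit VPMOVM2 instruction, then EXTRACT_SUBREG the low 128/256 bits.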
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
@@ -8319,6 +9083,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, {sae}, $src2, $src1",
@@ -8435,26 +9200,26 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
}
let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (ffloor VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
def : Pat<(v16f32 (fnearbyint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
def : Pat<(v16f32 (fceil VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
def : Pat<(v16f32 (frint VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
def : Pat<(v16f32 (ftrunc VR512:$src)),
- (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPSZrri VR512:$src, (i32 0xB))>;
def : Pat<(v8f64 (ffloor VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
def : Pat<(v8f64 (fnearbyint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
def : Pat<(v8f64 (fceil VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
def : Pat<(v8f64 (frint VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
def : Pat<(v8f64 (ftrunc VR512:$src)),
- (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>;
+ (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
}
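// Same immediate fix as the scalar rndscale patterns: the packed
// ffloor/fceil/ftrunc immediates gain bit 3 (0x9/0xA/0xB) to suppress the
// inexact exception, while frint (0x4) and fnearbyint (0xC) keep their
// encodings.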
defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
@@ -8466,6 +9231,39 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+let Predicates = [HasAVX512] in {
+// Provide a fallback in case the load node used in the broadcast patterns
+// above has additional users, which would prevent those patterns from being
+// selected.
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+}
+
multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
AVX512AIi8Base, EVEX_4V;
@@ -8503,6 +9301,7 @@ defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
@@ -8513,6 +9312,7 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1", "$src1",
(_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+ }
}
multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -8577,65 +9377,20 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
HasBWI>;
}
-defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
-
-def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
- VR128X:$src))>;
-def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>;
-def avx512_v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>;
-def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
- VR256X:$src))>;
-def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>;
-def avx512_v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>;
-
-let Predicates = [HasBWI, HasVLX] in {
- def : Pat<(xor
- (bc_v2i64 (avx512_v16i1sextv16i8)),
- (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))),
- (VPABSBZ128rr VR128X:$src)>;
- def : Pat<(xor
- (bc_v2i64 (avx512_v8i1sextv8i16)),
- (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))),
- (VPABSWZ128rr VR128X:$src)>;
- def : Pat<(xor
- (bc_v4i64 (avx512_v32i1sextv32i8)),
- (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))),
- (VPABSBZ256rr VR256X:$src)>;
- def : Pat<(xor
- (bc_v4i64 (avx512_v16i1sextv16i16)),
- (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))),
- (VPABSWZ256rr VR256X:$src)>;
-}
-let Predicates = [HasAVX512, HasVLX] in {
- def : Pat<(xor
- (bc_v2i64 (avx512_v4i1sextv4i32)),
- (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))),
- (VPABSDZ128rr VR128X:$src)>;
- def : Pat<(xor
- (bc_v4i64 (avx512_v8i1sextv8i32)),
- (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))),
- (VPABSDZ256rr VR256X:$src)>;
-}
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>;
-let Predicates = [HasAVX512] in {
-def : Pat<(xor
- (bc_v8i64 (v16i1sextv16i32)),
- (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
- (VPABSDZrr VR512:$src)>;
-def : Pat<(xor
- (bc_v8i64 (v8i1sextv8i64)),
- (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
- (VPABSQZrr VR512:$src)>;
-}
-let Predicates = [HasBWI] in {
-def : Pat<(xor
- (bc_v8i64 (v64i1sextv64i8)),
- (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))),
- (VPABSBZrr VR512:$src)>;
-def : Pat<(xor
- (bc_v8i64 (v32i1sextv32i16)),
- (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))),
- (VPABSWZrr VR512:$src)>;
+// VPABS: use the 512-bit version to implement the 128/256-bit forms when
+// VLX is not available (NoVLX).
+let Predicates = [HasAVX512, NoVLX] in {
+ def : Pat<(v4i64 (abs VR256X:$src)),
+ (EXTRACT_SUBREG
+ (VPABSQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
+ sub_ymm)>;
+ def : Pat<(v2i64 (abs VR128X:$src)),
+ (EXTRACT_SUBREG
+ (VPABSQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
+ sub_xmm)>;
}
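
The VPABS patterns above widen the operation instead of scalarizing it. A conceptual sketch in intrinsics, assuming AVX-512F (an illustration of the lowering, not the codegen path itself):

    #include <immintrin.h>

    // Widen the 256-bit value into a zmm register (upper half undefined), run
    // the 512-bit VPABSQ, then take back the low 256 bits. The casts correspond
    // roughly to the INSERT_SUBREG/EXTRACT_SUBREG operations in the patterns.
    static __m256i abs_v4i64_no_vlx(__m256i V) {
      __m512i Wide = _mm512_castsi256_si512(V);
      __m512i Abs  = _mm512_abs_epi64(Wide); // VPABSQZrr on the full register
      return _mm512_castsi512_si256(Abs);
    }
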
multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
@@ -8646,6 +9401,66 @@ multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
+// VPLZCNT: use the 512-bit version to implement the 128/256-bit forms when
+// VLX is not available (NoVLX).
+let Predicates = [HasCDI, NoVLX] in {
+ def : Pat<(v4i64 (ctlz VR256X:$src)),
+ (EXTRACT_SUBREG
+ (VPLZCNTQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
+ sub_ymm)>;
+ def : Pat<(v2i64 (ctlz VR128X:$src)),
+ (EXTRACT_SUBREG
+ (VPLZCNTQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
+ sub_xmm)>;
+
+ def : Pat<(v8i32 (ctlz VR256X:$src)),
+ (EXTRACT_SUBREG
+ (VPLZCNTDZrr
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
+ sub_ymm)>;
+ def : Pat<(v4i32 (ctlz VR128X:$src)),
+ (EXTRACT_SUBREG
+ (VPLZCNTDZrr
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
+ sub_xmm)>;
+}
+
+//===---------------------------------------------------------------------===//
+// Counts number of ones - VPOPCNTD and VPOPCNTQ
+//===---------------------------------------------------------------------===//
+
+multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> {
+ let Predicates = [HasVPOPCNTDQ] in
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512;
+}
+
+// Use the 512-bit version to implement the 128/256-bit forms.
+multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd] in {
+ def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info256.RC:$src1,
+ _.info256.SubRegIdx)),
+ _.info256.SubRegIdx)>;
+
+ def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(NAME # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info128.RC:$src1,
+ _.info128.SubRegIdx)),
+ _.info128.SubRegIdx)>;
+ }
+}
+
+defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>,
+ avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
+defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>,
+ avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W;
+
//===---------------------------------------------------------------------===//
// Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
@@ -8663,6 +9478,7 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
@@ -8671,6 +9487,7 @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode (_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src)))))>,
EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+ }
}
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -8791,7 +9608,7 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX, TAPD;
+ EVEX, TAPD, FoldGenData<NAME#rr>;
defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
}
@@ -8947,6 +9764,68 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
HasBWI>, EVEX_4V;
+// Transforms to swizzle an immediate to enable better matching when the
+// memory operand isn't in the right place.
+def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 1/4 and 3/6.
+ uint8_t NewImm = Imm & 0xa5;
+ if (Imm & 0x02) NewImm |= 0x10;
+ if (Imm & 0x10) NewImm |= 0x02;
+ if (Imm & 0x08) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by swapping operand 0 and operand 1.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 2/4 and 3/5.
+ uint8_t NewImm = Imm & 0xc3;
+ if (Imm & 0x04) NewImm |= 0x10;
+ if (Imm & 0x10) NewImm |= 0x04;
+ if (Imm & 0x08) NewImm |= 0x20;
+ if (Imm & 0x20) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
+ uint8_t Imm = N->getZExtValue();
+ // Swap bits 1/2 and 5/6.
+ uint8_t NewImm = Imm & 0x99;
+ if (Imm & 0x02) NewImm |= 0x04;
+ if (Imm & 0x04) NewImm |= 0x02;
+ if (Imm & 0x20) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x20;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by moving operand 0 to the end.
+ uint8_t Imm = N->getZExtValue();
+ // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5
+ uint8_t NewImm = Imm & 0x81;
+ if (Imm & 0x02) NewImm |= 0x04;
+ if (Imm & 0x04) NewImm |= 0x10;
+ if (Imm & 0x08) NewImm |= 0x40;
+ if (Imm & 0x10) NewImm |= 0x02;
+ if (Imm & 0x20) NewImm |= 0x08;
+ if (Imm & 0x40) NewImm |= 0x20;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
+ // Convert a VPTERNLOG immediate by moving operand 2 to the beginning.
+ uint8_t Imm = N->getZExtValue();
+ // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3
+ uint8_t NewImm = Imm & 0x81;
+ if (Imm & 0x02) NewImm |= 0x10;
+ if (Imm & 0x04) NewImm |= 0x02;
+ if (Imm & 0x08) NewImm |= 0x20;
+ if (Imm & 0x10) NewImm |= 0x04;
+ if (Imm & 0x20) NewImm |= 0x40;
+ if (Imm & 0x40) NewImm |= 0x08;
+ return getI8Imm(NewImm, SDLoc(N));
+}]>;
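
These transforms all follow from viewing the immediate as a 3-input truth table: the result bit is imm[(a << 2) | (b << 1) | c] for operand bits a, b, c, so permuting the operands permutes the table's index bits. A standalone sketch that reproduces VPTERNLOG321_imm8 (the helper name is ours, not LLVM's):

    #include <cassert>
    #include <cstdint>

    // Swap operands 0 and 2 of a ternary-logic truth table: NewImm, evaluated
    // with index bits (a,b,c), must equal Imm evaluated with (c,b,a).
    static uint8_t ternlogSwap02(uint8_t Imm) {
      uint8_t NewImm = 0;
      for (int Idx = 0; Idx < 8; ++Idx) {
        int A = (Idx >> 2) & 1, B = (Idx >> 1) & 1, C = Idx & 1;
        int OldIdx = (C << 2) | (B << 1) | A; // index with a/c swapped
        if (Imm & (1 << OldIdx))
          NewImm |= uint8_t(1 << Idx);
      }
      return NewImm; // same effect as swapping bits 1/4 and 3/6 above
    }

    int main() {
      // 0xCA encodes "a ? b : c"; swapping operands 0 and 2 yields "c ? b : a",
      // whose table is 0xD8, matching the VPTERNLOG321_imm8 bit moves.
      assert(ternlogSwap02(0xCA) == 0xD8);
      return 0;
    }
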
+
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
@@ -8975,6 +9854,141 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(i8 imm:$src4)), 1, 0>, EVEX_B,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
}// Constraints = "$src1 = $dst"
+
+ // Additional patterns for matching passthru operand in other positions.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+
+ // Additional patterns for matching loads in other positions.
+ def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (OpNode _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching zero masking with loads in other
+ // positions.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching masked loads with different
+ // operand orders.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1,
+ (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (bitconvert (_.LdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
+
+ // Additional patterns for matching broadcasts in other positions.
+ def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (OpNode _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4))),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+ addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching zero masking with broadcasts in other
+ // positions.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+ _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 imm:$src4))>;
+
+ // Additional patterns for matching masked broadcasts with different
+ // operand orders.
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2, _.RC:$src1,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ (i8 imm:$src4)), _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode _.RC:$src2,
+ (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src1, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+ def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+ _.RC:$src1)),
+ (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
}
multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
index bfd21c0..e38bbc9 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -964,10 +964,10 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
} // isConvertibleToThreeAddress
} // isCommutable
- def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
@@ -989,10 +989,12 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
}
} // Constraints = "$src1 = $dst"
- def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
- def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
- def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
- def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+ let mayLoad = 1, mayStore = 1 in {
+ def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+ }
// NOTE: These are order specific; we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
@@ -1047,10 +1049,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
} // isConvertibleToThreeAddress
} // isCommutable
- def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
@@ -1127,10 +1129,10 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
}
} // isCommutable
- def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
- def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
- def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
- def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>;
def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
index ba970bc..dcce7b9 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -147,7 +147,7 @@ addOffset(const MachineInstrBuilder &MIB, int Offset) {
static inline const MachineInstrBuilder &
addOffset(const MachineInstrBuilder &MIB, const MachineOperand& Offset) {
- return MIB.addImm(1).addReg(0).addOperand(Offset).addReg(0);
+ return MIB.addImm(1).addReg(0).add(Offset).addReg(0);
}
/// addRegOffset - This function is used to add a memory reference of the form
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index c73c950..b85abfb 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -110,3 +110,9 @@ defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal
defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal
defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
+// SALC is an undocumented instruction. Information about it can be found at
+// http://www.rcollins.org/secrets/opcodes/SALC.html
+// Set AL if carry.
+let Uses = [EFLAGS], Defs = [AL] in {
+ def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>;
+}
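
For reference, the instruction's entire behavior fits in one line; a behavioral sketch (hypothetical helper, not LLVM code):

    #include <cstdint>

    // SALC sets AL from the carry flag: AL := CF ? 0xFF : 0x00.
    inline uint8_t salc(bool CF) { return CF ? 0xFF : 0x00; }
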
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
index 3c27eb8..d003d02 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[NotLP64]>;
@@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[NotLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1),
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
+ (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
[]>,
Requires<[IsLP64]>;
@@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
[(X86callseq_end timm:$amt1, timm:$amt2)]>,
Requires<[IsLP64]>;
}
-def : Pat<(X86callseq_start timm:$amt1),
- (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+ (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
// x86-64 va_start lowering magic.
@@ -259,20 +261,20 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
// Alias instruction mapping movr0 to xor.
// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, AddedComplexity = 20 in
+ isPseudo = 1, AddedComplexity = 10 in
def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
[(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
// Other widths can also make use of the 32-bit xor, which may have a smaller
// encoding and avoid partial register updates.
+let AddedComplexity = 10 in {
def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
-def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
- let AddedComplexity = 20;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
}
let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
- AddedComplexity = 15 in {
+ AddedComplexity = 10 in {
// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
@@ -287,7 +289,7 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
}
-let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5 in {
// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
// FIXME: Add itinerary class and Schedule.
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
@@ -772,11 +774,11 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
// the pseudo. The argument feeding EBX is ebx_input.
//
// The additional argument, $ebx_save, is a temporary register used to
-// save the value of RBX accross the actual instruction.
+// save the value of RBX across the actual instruction.
//
// To make sure the register assigned to $ebx_save does not interfere with
// the definition of the actual instruction, we use a definition $dst which
-// is tied to $rbx_save. That way, the live-range of $rbx_save spans accross
+// is tied to $rbx_save. That way, the live-range of $rbx_save spans across
// the instruction and we are sure we will have a valid register to restore
// the value of RBX.
let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
@@ -1271,11 +1273,11 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
- APInt KnownZero0, KnownOne0;
- CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
- APInt KnownZero1, KnownOne1;
- CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
- return (~KnownZero0 & ~KnownZero1) == 0;
+ KnownBits Known0;
+ CurDAG->computeKnownBits(N->getOperand(0), Known0, 0);
+ KnownBits Known1;
+ CurDAG->computeKnownBits(N->getOperand(1), Known1, 0);
+ return (~Known0.Zero & ~Known1.Zero) == 0;
}]>;
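
The rewritten predicate keeps the same invariant as before: if every bit position is known zero in at least one operand, the OR can never generate a carry, so it is equivalent to an ADD (which is why the fragment lets ORs be selected as ADDs). A minimal sketch with plain masks standing in for the KnownBits::Zero fields:

    #include <cassert>
    #include <cstdint>

    // Masks of bits known to be zero in each operand, as computeKnownBits
    // would report them; OR == ADD when no bit can be set in both operands.
    static bool orIsAdd(uint32_t KnownZero0, uint32_t KnownZero1) {
      return (~KnownZero0 & ~KnownZero1) == 0;
    }

    int main() {
      // A 16-byte-aligned value OR'd with an offset below 16: disjoint bits.
      assert(orIsAdd(0x0000000Fu, 0xFFFFFFF0u));
      // Two aligned values: their high bits may overlap, so OR may carry.
      assert(!orIsAdd(0x0000000Fu, 0x0000000Fu));
      return 0;
    }
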
@@ -1743,6 +1745,12 @@ def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
+// sub reg, relocImm
+def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
+def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td
index 2f260c4..4ea223e 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrControl.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td
@@ -264,6 +264,21 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
"jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
}
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [ESP, EFLAGS] in {
+ def TCRETURNdicc : PseudoI<(outs),
+ (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
+ (ins i32imm_pcrel:$dst, i32imm:$cond),
+ "",
+ [], IIC_JMP_REL>;
+}
+
//===----------------------------------------------------------------------===//
// Call Instructions...
@@ -325,3 +340,19 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
}
}
+
+// Conditional tail calls are similar to the above, but they are branches
+// rather than barriers, and they use EFLAGS.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
+ isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
+ let Uses = [RSP, EFLAGS] in {
+ def TCRETURNdi64cc : PseudoI<(outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$offset,
+ i32imm:$cond), []>;
+
+ // This gets substituted to a conditional jump instruction in MC lowering.
+ def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
+ (ins i64i32imm_pcrel:$dst, i32imm:$cond),
+ "",
+ [], IIC_JMP_REL>;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
index 4b19f80..3a3cdc9 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td
@@ -191,13 +191,15 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop> {
- defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
- x86memop, RC>;
- defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpNode>;
- defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC>;
+ X86MemOperand x86memop> {
+ let Predicates = [HasFMA, NoAVX512] in {
+ defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+ x86memop, RC>;
+ defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+ x86memop, RC, OpNode>;
+ defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+ x86memop, RC>;
+ }
}
// The FMA 213 form is created for lowering of scalar FMA intrinsics
@@ -295,7 +297,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_LIG;
+ VEX_LIG, FoldGenData<NAME#rr>;
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
@@ -319,6 +321,12 @@ let isCodeGenOnly = 1 in {
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+let hasSideEffects = 0 in
+ def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, VEX_LIG, FoldGenData<NAME#rr_Int>;
} // isCodeGenOnly = 1
}
@@ -370,12 +378,13 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ FoldGenData<NAME#rr>;
def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_L;
+ VEX_L, FoldGenData<NAME#Yrr>;
} // isCodeGenOnly = 1
}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index db83497..00ef65c 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -16,11 +16,14 @@
#include "X86InstrInfo.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Threading.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
/// This flag is used in the method llvm::call_once() used below to make the
/// initialization of the map 'OpcodeToGroup' thread safe.
-LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+static llvm::once_flag InitGroupsOnceFlag;
static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
index 025cee3..e356816 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -1,4 +1,4 @@
-//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -18,9 +18,11 @@
#include "X86.h"
#include "llvm/ADT/DenseMap.h"
#include <cassert>
+#include <cstdint>
#include <set>
namespace llvm {
+
/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
/// or 6 register and memory opcodes. Also, each group has an attributes field
@@ -201,7 +203,7 @@ public:
static X86InstrFMA3Info *getX86InstrFMA3Info();
/// Constructor. Just creates an object of the class.
- X86InstrFMA3Info() {}
+ X86InstrFMA3Info() = default;
/// Destructor. Deallocates the memory used for FMA3 Groups.
~X86InstrFMA3Info() {
@@ -310,6 +312,7 @@ public:
return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
}
};
-} // namespace llvm
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
index 10f3839..11b1d07 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -631,6 +631,9 @@ let Defs = [FPSW] in
def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
"ffree\t$reg", IIC_FFREE>;
+def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg),
+ "ffreep\t$reg", IIC_FFREE>;
+
// Clear exceptions
let Defs = [FPSW] in
@@ -665,15 +668,16 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
let Predicates = [HasFXSR] in {
def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
- IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+ IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>,
+ TB;
def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
- IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+ IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
} // Predicates = [FeatureFXSR]
} // SchedRW
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
index 610756a..bfcbf71 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td
@@ -199,7 +199,8 @@ class TAPS : TA { Prefix OpPrefix = PS; }
class TAPD : TA { Prefix OpPrefix = PD; }
class TAXD : TA { Prefix OpPrefix = XD; }
class VEX { Encoding OpEnc = EncVEX; }
-class VEX_W { bit hasVEX_WPrefix = 1; }
+class VEX_W { bits<2> VEX_WPrefix = 1; }
+class VEX_WIG { bits<2> VEX_WPrefix = 2; }
class VEX_4V : VEX { bit hasVEX_4V = 1; }
class VEX_L { bit hasVEX_L = 1; }
class VEX_LIG { bit ignoresVEX_L = 1; }
@@ -224,6 +225,12 @@ class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class XOP { Encoding OpEnc = EncXOP; }
class XOP_4V : XOP { bit hasVEX_4V = 1; }
+// Specify the alternative register-form instruction to replace the current
+// instruction in case it is picked during generation of the memory folding
+// tables.
+class FoldGenData<string _RegisterForm> {
+ string FoldGenRegForm = _RegisterForm;
+}
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr,
InstrItinClass itin,
@@ -270,7 +277,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasREPPrefix = 0; // Does this inst have a REP prefix?
Encoding OpEnc = EncNormal; // Encoding used by this instruction
bits<2> OpEncBits = OpEnc.Value;
- bit hasVEX_WPrefix = 0; // Does this inst set the VEX_W field?
+ bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field?
bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit
@@ -303,6 +310,10 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
CD8_EltSize,
!srl(VectSize, CD8_Form{1-0}))), 0);
+  // Used by the memory folding table generation (TableGen backend) to point
+  // to an alternative instruction to replace the current one in case it was
+  // picked during generation.
+ string FoldGenRegForm = ?;
+
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
let TSFlags{8-7} = OpSizeBits;
@@ -317,7 +328,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{28-27} = ExeDomain.Value;
let TSFlags{30-29} = OpEncBits;
let TSFlags{38-31} = Opcode;
- let TSFlags{39} = hasVEX_WPrefix;
+  // Currently there is no need for a second bit in TSFlags; W-ignore is
+  // equivalent to 0.
+ let TSFlags{39} = VEX_WPrefix{0};
let TSFlags{40} = hasVEX_4V;
let TSFlags{41} = hasVEX_L;
let TSFlags{42} = hasEVEX_K;
@@ -453,7 +465,7 @@ class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
Domain d = GenericDomain>
: I<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
- !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+ !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
!if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
!if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
!if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index c5689d7..8b5bbf2 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -27,21 +27,19 @@ def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
//===----------------------------------------------------------------------===//
def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
-def load_mvmmx : PatFrag<(ops node:$ptr),
- (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
//===----------------------------------------------------------------------===//
// SSE specific DAG Nodes.
//===----------------------------------------------------------------------===//
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
- SDTCisFP<1>, SDTCisVT<3, i8>,
- SDTCisVec<1>]>;
-def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
- SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fmins : SDNode<"X86ISD::FMINS", SDTFPBinOp>;
+def X86fmaxs : SDNode<"X86ISD::FMAXS", SDTFPBinOp>;
// Commutative and Associative FMIN and FMAX.
def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
@@ -200,6 +198,15 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+def X86kshiftl : SDNode<"X86ISD::KSHIFTL",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+def X86kshiftr : SDNode<"X86ISD::KSHIFTR",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i8>]>>;
+
def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>;
def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>;
@@ -230,10 +237,11 @@ def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
SDTCisSameAs<0,2>,
SDTCisSameSizeAs<0,3>,
SDTCisSameNumEltsAs<0, 3>,
+ SDTCisFP<0>, SDTCisInt<3>,
SDTCisVT<4, i8>]>>;
def X86vpperm : SDNode<"X86ISD::VPPERM",
SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
+ SDTCisSameAs<0,2>, SDTCisSameAs<0, 3>]>>;
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
@@ -266,7 +274,7 @@ def X86select : SDNode<"X86ISD::SELECT",
SDTCisSameNumEltsAs<0, 1>]>>;
def X86selects : SDNode<"X86ISD::SELECTS",
- SDTypeProfile<1, 3, [SDTCisVT<1, i1>,
+ SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<2, 3>]>>;
@@ -300,13 +308,17 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameSizeAs<0,2>,
- SDTCisSameNumEltsAs<0,2>]>;
+ SDTCisSameNumEltsAs<0,2>,
+ SDTCisFP<0>, SDTCisInt<2>]>;
def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
-def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>]>;
def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
SDTCisInt<3>,
@@ -314,8 +326,10 @@ def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
SDTCisSameNumEltsAs<0, 3>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>]>;
-def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>,
+ SDTCisVT<3, i32>]>;
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
@@ -324,9 +338,9 @@ def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
-def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
- SDTCisVT<4, i8>]>;
+def SDTTernlog : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>, SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>, SDTCisVT<4, i8>]>;
def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
@@ -334,16 +348,13 @@ def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc.
def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc.
SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
-def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
- SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
- SDTCisVT<4, i32>]>;
+ SDTCisFP<0>, SDTCisVT<4, i32>]>;
def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
-def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -367,17 +378,28 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
-def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
+ SDTCisVec<1>, SDTCisInt<1>,
SDTCisSameSizeAs<0,1>,
- SDTCisSameAs<1,2>]>;
+ SDTCisSameAs<1,2>,
+ SDTCisOpSmallerThanOp<0, 1>]>;
def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
-def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
-def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack, [SDNPCommutative]>;
+def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, i8>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>>;
+def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD",
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i32>,
+ SDTCVecEltisVT<1, i16>,
+ SDTCisSameSizeAs<0,1>,
+ SDTCisSameAs<1,2>]>,
+ [SDNPCommutative]>;
def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
@@ -414,12 +436,12 @@ def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>;
def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
- SDTCisVec<1>, SDTCisFP<1>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+ SDTCisFP<1>,
SDTCisSameNumEltsAs<0,1>,
SDTCisVT<2, i32>]>, []>;
def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS",
- SDTypeProfile<1, 2, [SDTCisVT<0, i1>,
+ SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>,
SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>;
def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
@@ -428,11 +450,8 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3,
- [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
- SDTCisPtrTy<3>]>, []>;
def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
- [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
+ [SDTCisVec<1>,
SDTCisPtrTy<2>]>, []>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
@@ -440,24 +459,30 @@ def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>;
+def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>;
def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>;
+def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>;
def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>;
+def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>;
def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>;
-def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>;
+def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>;
+def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>;
+def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
+def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>;
def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>;
def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>;
-def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>;
def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>;
def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
-def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
-def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
-def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
-def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
-def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
-def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
+def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFPTernaryOp>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp>;
def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>;
def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>;
@@ -478,8 +503,10 @@ def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>;
def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>;
def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>;
-def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTFma>;
-def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTFma>;
+def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma>;
+def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma>;
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
@@ -614,22 +641,37 @@ def sdmem : Operand<v2f64> {
// SSE pattern fragments
//===----------------------------------------------------------------------===//
+// Vector load wrappers to prevent folding of non-temporal aligned loads on
+// targets that support them.
+def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() ||
+ cast<LoadSDNode>(N)->getAlignment() < 16;
+}]>;
+def vec256load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return !Subtarget->hasAVX2() || !cast<LoadSDNode>(N)->isNonTemporal() ||
+ cast<LoadSDNode>(N)->getAlignment() < 32;
+}]>;
+def vec512load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return !Subtarget->hasAVX512() || !cast<LoadSDNode>(N)->isNonTemporal() ||
+ cast<LoadSDNode>(N)->getAlignment() < 64;
+}]>;
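
The three fragments share one idea: leave a load unfolded whenever the target could instead select a non-temporal vector load. A restatement of the 128-bit guard (hypothetical helper name, not LLVM code):

    // Fold the load into the consuming instruction only if the target could
    // not use a non-temporal load instead (MOVNTDQA requires SSE4.1 and
    // 16-byte alignment). The 256/512-bit variants check AVX2/AVX-512 and
    // 32/64-byte alignment in the same way.
    static bool canFold128(bool HasSSE41, bool NonTemporal, unsigned Align) {
      return !HasSSE41 || !NonTemporal || Align < 16;
    }
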
+
// 128-bit load pattern fragments
// NOTE: all 128-bit integer vector loads are promoted to v2i64
-def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
-def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
-def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vec128load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vec128load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vec128load node:$ptr))>;
// 256-bit load pattern fragments
// NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
-def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
-def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vec256load node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vec256load node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vec256load node:$ptr))>;
// 512-bit load pattern fragments
-def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
-def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
-def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vec512load node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vec512load node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vec512load node:$ptr))>;
// 128-/256-/512-bit extload pattern fragments
def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -695,15 +737,15 @@ def alignedloadv8f64 : PatFrag<(ops node:$ptr),
def alignedloadv8i64 : PatFrag<(ops node:$ptr),
(v8i64 (alignedload512 node:$ptr))>;
-// Like 'load', but uses special alignment checks suitable for use in
+// Like 'vec128load', but uses special alignment checks suitable for use in
// memory operands in most SSE instructions, which are required to
// be naturally aligned on some targets but not on others. If the subtarget
// allows unaligned accesses, match any load, though this may require
// setting a feature bit in the processor (on startup, for example).
// Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return Subtarget->hasSSEUnalignedMem()
- || cast<LoadSDNode>(N)->getAlignment() >= 16;
+def memop : PatFrag<(ops node:$ptr), (vec128load node:$ptr), [{
+ return Subtarget->hasSSEUnalignedMem() ||
+ cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;
// 128-bit memop pattern fragments
@@ -712,16 +754,6 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-// These are needed to match a scalar memop that is used in a vector-only
-// math instruction such as the FP logical ops: andps, andnps, orps, xorps.
-// The memory operand is required to be a 128-bit load, so it must be converted
-// from a vector to a scalar.
-def memopfsf32_128 : PatFrag<(ops node:$ptr),
- (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>;
-def memopfsf64_128 : PatFrag<(ops node:$ptr),
- (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>;
-
-
// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
// 16-byte boundary.
// FIXME: 8 byte alignment for mmx reads is not required
@@ -731,6 +763,9 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
+def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_gather node:$src1, node:$src2, node:$src3) , [{
if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
@@ -754,6 +789,15 @@ def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
Mgt->getBasePtr().getValueType() == MVT::v2i64);
return false;
}]>;
+def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
+ return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
+ Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
+ (Mgt->getMemoryVT() == MVT::v2i32 ||
+ Mgt->getMemoryVT() == MVT::v2f32);
+ return false;
+}]>;
def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_gather node:$src1, node:$src2, node:$src3) , [{
if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
index 627b612..34d4816 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -414,17 +414,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
- { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
- { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
- { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
{ X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
+ { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE },
+ { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE },
{ X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
{ X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
{ X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
@@ -816,6 +821,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHLQrr, X86::VPSHLQmr, 0 },
{ X86::VPSHLWrr, X86::VPSHLWmr, 0 },
+ // LWP foldable instructions
+ { X86::LWPINS32rri, X86::LWPINS32rmi, 0 },
+ { X86::LWPINS64rri, X86::LWPINS64rmi, 0 },
+ { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 },
+ { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 },
+
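+ // Opcode suffixes follow the usual X86 scheme: "rri" is the
+ // register-register-immediate form and "rmi" the variant with one operand
+ // folded to memory; every row in these tables pairs such forms.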
// BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
{ X86::BEXTR32rr, X86::BEXTR32rm, 0 },
{ X86::BEXTR64rr, X86::BEXTR64rm, 0 },
@@ -867,11 +878,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable instructions
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
- { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
- { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
+ { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
{ X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
{ X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
@@ -883,12 +893,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
{ X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
+ { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
+ { X86::VPABSBZrr, X86::VPABSBZrm, 0 },
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
+ { X86::VPABSWZrr, X86::VPABSWZrm, 0 },
+ { X86::VPCONFLICTDZrr, X86::VPCONFLICTDZrm, 0 },
+ { X86::VPCONFLICTQZrr, X86::VPCONFLICTQZrm, 0 },
{ X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
{ X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
{ X86::VPERMQZri, X86::VPERMQZmi, 0 },
+ { X86::VPLZCNTDZrr, X86::VPLZCNTDZrm, 0 },
+ { X86::VPLZCNTQZrr, X86::VPLZCNTQZrm, 0 },
{ X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
{ X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
@@ -901,15 +918,26 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
{ X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
{ X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+ { X86::VPOPCNTDZrr, X86::VPOPCNTDZrm, 0 },
+ { X86::VPOPCNTQZrr, X86::VPOPCNTQZrm, 0 },
{ X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
{ X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
{ X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
+ { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 },
+ { X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
+ { X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
+ { X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
+ { X86::VPSRADZri, X86::VPSRADZmi, 0 },
+ { X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
+ { X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
+ { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 },
+ { X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
+ { X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
+ { X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
// AVX-512 foldable instructions (256-bit versions)
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
@@ -920,10 +948,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
+ { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 },
+ { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 },
+ { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 },
+ { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 },
+ { X86::VPCONFLICTDZ256rr, X86::VPCONFLICTDZ256rm, 0 },
+ { X86::VPCONFLICTQZ256rr, X86::VPCONFLICTQZ256rm, 0 },
{ X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
{ X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
{ X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
{ X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
+ { X86::VPLZCNTDZ256rr, X86::VPLZCNTDZ256rm, 0 },
+ { X86::VPLZCNTQZ256rr, X86::VPLZCNTQZ256rm, 0 },
{ X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
@@ -939,10 +975,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
{ X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
{ X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
+ { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
+ { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
+ { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 },
+ { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 },
+ { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 },
+ { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 },
+ { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
+ { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
+ { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
+ { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 },
+ { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
@@ -953,8 +999,16 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
+ { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 },
+ { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 },
+ { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 },
+ { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 },
+ { X86::VPCONFLICTDZ128rr, X86::VPCONFLICTDZ128rm, 0 },
+ { X86::VPCONFLICTQZ128rr, X86::VPCONFLICTQZ128rm, 0 },
{ X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
{ X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
+ { X86::VPLZCNTDZ128rr, X86::VPLZCNTDZ128rm, 0 },
+ { X86::VPLZCNTQZ128rr, X86::VPLZCNTQZ128rm, 0 },
{ X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
@@ -970,6 +1024,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
{ X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
{ X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
+ { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
+ { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
+ { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 },
+ { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 },
+ { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 },
+ { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 },
+ { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
+ { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
+ { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
+ { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 },
+ { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 },
// F16C foldable instructions
{ X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
@@ -1170,18 +1235,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PINSRWrri, X86::PINSRWrmi, 0 },
{ X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
- { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
- { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
{ X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
{ X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
{ X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
- { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
- { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
- { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
- { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
{ X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
{ X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
@@ -1340,8 +1405,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PMULHRWrr, X86::PMULHRWrm, 0 },
// AVX 128-bit versions of foldable instructions
- { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
- { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE },
{ X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
{ X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
@@ -1350,8 +1413,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
- { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
- { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
@@ -1458,18 +1519,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
{ X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
- { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
- { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMINSBrr, X86::VPMINSBrm, 0 },
{ X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
{ X86::VPMINUDrr, X86::VPMINUDrm, 0 },
{ X86::VPMINUWrr, X86::VPMINUWrm, 0 },
- { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
- { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
- { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
- { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMULDQrr, X86::VPMULDQrm, 0 },
{ X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
{ X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
@@ -1626,18 +1687,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
{ X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
{ X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
{ X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
{ X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
- { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
- { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
{ X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
{ X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
{ X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
- { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
- { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
- { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
- { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
{ X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
{ X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
@@ -1732,7 +1793,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
- { X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
{ X86::VPCOMBri, X86::VPCOMBmi, 0 },
{ X86::VPCOMDri, X86::VPCOMDmi, 0 },
{ X86::VPCOMQri, X86::VPCOMQmi, 0 },
@@ -1742,9 +1803,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
{ X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
- { X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
- { X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 },
{ X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
{ X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
{ X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
@@ -1800,8 +1861,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
{ X86::VANDPDZrr, X86::VANDPDZrm, 0 },
{ X86::VANDPSZrr, X86::VANDPSZrm, 0 },
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
{ X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
{ X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
{ X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
@@ -1842,6 +1901,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
{ X86::VMINSSZrr, X86::VMINSSZrm, 0 },
{ X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
+ { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE },
{ X86::VMULPDZrr, X86::VMULPDZrm, 0 },
{ X86::VMULPSZrr, X86::VMULPSZrm, 0 },
{ X86::VMULSDZrr, X86::VMULSDZrm, 0 },
@@ -1850,6 +1910,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
{ X86::VORPDZrr, X86::VORPDZrm, 0 },
{ X86::VORPSZrr, X86::VORPSZrm, 0 },
+ { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 },
+ { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 },
+ { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 },
+ { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 },
{ X86::VPADDBZrr, X86::VPADDBZrm, 0 },
{ X86::VPADDDZrr, X86::VPADDDZrm, 0 },
{ X86::VPADDQZrr, X86::VPADDQZrm, 0 },
@@ -1863,6 +1927,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
{ X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
{ X86::VPANDQZrr, X86::VPANDQZrm, 0 },
+ { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 },
+ { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 },
{ X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
{ X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
{ X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
@@ -1887,26 +1953,55 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
{ X86::VPERMQZrr, X86::VPERMQZrm, 0 },
{ X86::VPERMWZrr, X86::VPERMWZrm, 0 },
+ { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 },
+ { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 },
+ { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 },
+ { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 },
{ X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
{ X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
+ { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
+ { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 },
+ { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 },
{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
{ X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
+ { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 },
+ { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 },
{ X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
{ X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
+ { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 },
+ { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 },
{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
+ { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 },
{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
+ { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 },
+ { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 },
+ { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VPORDZrr, X86::VPORDZrm, 0 },
{ X86::VPORQZrr, X86::VPORQZrm, 0 },
+ { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 },
{ X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
+ { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
+ { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
+ { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 },
+ { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 },
+ { X86::VPSRADZrr, X86::VPSRADZrm, 0 },
+ { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 },
{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
+ { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 },
+ { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 },
+ { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 },
+ { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 },
+ { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 },
{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
+ { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 },
+ { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 },
{ X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
{ X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
@@ -1957,9 +2052,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
{ X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
{ X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
{ X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
{ X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
{ X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
@@ -1996,6 +2088,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
{ X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
{ X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
+ { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 },
+ { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 },
+ { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 },
+ { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 },
+ { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 },
+ { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 },
+ { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 },
+ { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 },
{ X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
{ X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
{ X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
@@ -2022,6 +2122,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
{ X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
{ X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
+ { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 },
+ { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 },
+ { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 },
+ { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 },
{ X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
{ X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
{ X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
@@ -2070,12 +2174,92 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
{ X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
{ X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
+ { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 },
+ { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 },
+ { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 },
+ { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 },
+ { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 },
+ { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 },
+ { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 },
+ { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 },
+ { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 },
+ { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 },
+ { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 },
+ { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 },
+ { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 },
+ { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 },
+ { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 },
+ { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 },
+ { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 },
+ { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 },
+ { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 },
+ { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 },
+ { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 },
+ { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 },
+ { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 },
+ { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 },
+ { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 },
+ { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 },
+ { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 },
+ { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 },
+ { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 },
+ { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 },
+ { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 },
+ { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 },
+ { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 },
+ { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 },
+ { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 },
+ { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 },
+ { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 },
+ { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 },
+ { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 },
+ { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 },
+ { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 },
+ { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 },
{ X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
{ X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
{ X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
{ X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+ { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 },
+ { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 },
{ X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
{ X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
+ { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 },
+ { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 },
+ { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 },
+ { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 },
+ { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 },
+ { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 },
+ { X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 },
+ { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 },
+ { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 },
+ { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 },
+ { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 },
+ { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 },
+ { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 },
+ { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 },
+ { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 },
+ { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 },
+ { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 },
+ { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 },
+ { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 },
+ { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 },
+ { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 },
+ { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 },
+ { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 },
+ { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 },
+ { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 },
+ { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 },
+ { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 },
+ { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 },
+ { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 },
+ { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 },
+ { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 },
+ { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 },
+ { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 },
+ { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 },
+ { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 },
+ { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 },
{ X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
{ X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
{ X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
@@ -2112,6 +2296,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
{ X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
{ X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
+ { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 },
+ { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 },
+ { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 },
+ { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 },
{ X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
{ X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
{ X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
@@ -2130,10 +2318,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
// AVX-512 masked foldable instructions
+ { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
+ { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 },
+ { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 },
+ { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 },
+ { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 },
+ { X86::VPCONFLICTDZrrkz, X86::VPCONFLICTDZrmkz, 0 },
+ { X86::VPCONFLICTQZrrkz, X86::VPCONFLICTQZrmkz, 0 },
{ X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
{ X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
{ X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
{ X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
+ { X86::VPLZCNTDZrrkz, X86::VPLZCNTDZrmkz, 0 },
+ { X86::VPLZCNTQZrrkz, X86::VPLZCNTQZrmkz, 0 },
{ X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
{ X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
@@ -2146,15 +2344,36 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
{ X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
{ X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+ { X86::VPOPCNTDZrrkz, X86::VPOPCNTDZrmkz, 0 },
+ { X86::VPOPCNTQZrrkz, X86::VPOPCNTQZrmkz, 0 },
{ X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
{ X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
{ X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
+ { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 },
+ { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 },
+ { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 },
+ { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 },
+ { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 },
+ { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 },
+ { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 },
+ { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 },
+ { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 },
// AVX-512VL 256-bit masked foldable instructions
+ { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
+ { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 },
+ { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 },
+ { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 },
+ { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 },
+ { X86::VPCONFLICTDZ256rrkz, X86::VPCONFLICTDZ256rmkz, 0 },
+ { X86::VPCONFLICTQZ256rrkz, X86::VPCONFLICTQZ256rmkz, 0 },
{ X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
{ X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
{ X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
{ X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
+ { X86::VPLZCNTDZ256rrkz, X86::VPLZCNTDZ256rmkz, 0 },
+ { X86::VPLZCNTQZ256rrkz, X86::VPLZCNTQZ256rmkz, 0 },
{ X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
@@ -2170,10 +2389,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
{ X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
{ X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
+ { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 },
+ { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 },
+ { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 },
+ { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 },
+ { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 },
+ { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 },
+ { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 },
+ { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 },
+ { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 },
// AVX-512VL 128-bit masked foldable instructions
+ { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
+ { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 },
+ { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 },
+ { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 },
+ { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 },
+ { X86::VPCONFLICTDZ128rrkz, X86::VPCONFLICTDZ128rmkz, 0 },
+ { X86::VPCONFLICTQZ128rrkz, X86::VPCONFLICTQZ128rmkz, 0 },
{ X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
{ X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
+ { X86::VPLZCNTDZ128rrkz, X86::VPLZCNTDZ128rmkz, 0 },
+ { X86::VPLZCNTQZ128rrkz, X86::VPLZCNTQZ128rmkz, 0 },
{ X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
@@ -2189,6 +2426,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
{ X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
{ X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
+ { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 },
+ { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 },
+ { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 },
+ { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 },
+ { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 },
+ { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 },
+ { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 },
+ { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 },
+ { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 },
// AES foldable instructions
{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
@@ -2262,23 +2508,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
- { X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 },
+ { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
- { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
+ { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
- { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
+ { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
{ X86::VPPERMrrr, X86::VPPERMrrm, 0 },
// AVX-512 instructions with 3 source operands.
- { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
- { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
- { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
- { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
- { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
{ X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
{ X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
@@ -2329,6 +2566,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 masked instructions
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
{ X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+ { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
{ X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
{ X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
@@ -2337,6 +2576,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+ { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
{ X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
{ X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
@@ -2349,14 +2590,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
{ X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
+ { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 },
+ { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 },
{ X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
{ X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
{ X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
+ { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 },
+ { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 },
{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+ { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
{ X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
+ { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 },
+ { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 },
+ { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 },
+ { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 },
{ X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
{ X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
{ X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
@@ -2370,6 +2621,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
{ X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
{ X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
+ { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 },
+ { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
{ X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
{ X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
@@ -2380,9 +2633,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
{ X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
+ { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 },
+ { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 },
+ { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 },
+ { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 },
+ { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 },
+ { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 },
+ { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 },
+ { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 },
+ { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 },
+ { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 },
+ { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 },
+ { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 },
+ { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 },
+ { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 },
+ { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 },
+ { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 },
+ { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 },
+ { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 },
+ { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 },
+ { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 },
+ { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 },
{ X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
{ X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
{ X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
+ { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 },
+ { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 },
+ { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 },
+ { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 },
+ { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 },
+ { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 },
+ { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 },
+ { X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 },
+ { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 },
+ { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 },
+ { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 },
+ { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 },
+ { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 },
+ { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 },
+ { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 },
+ { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 },
+ { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 },
+ { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 },
{ X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
{ X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
{ X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
@@ -2401,8 +2693,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
{ X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
{ X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
+ { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 },
+ { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 },
{ X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
{ X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+ { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
{ X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
{ X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
{ X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
@@ -2437,6 +2733,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
{ X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
{ X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+ { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 },
+ { X86::VPACKSSWBZ256rrkz, X86::VPACKSSWBZ256rmkz, 0 },
+ { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 },
+ { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 },
{ X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
{ X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
{ X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
@@ -2450,6 +2750,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
{ X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
{ X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
+ { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 },
+ { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
{ X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
@@ -2460,9 +2762,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
{ X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
+ { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 },
+ { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 },
+ { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 },
+ { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 },
+ { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 },
+ { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 },
+ { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 },
+ { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 },
+ { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 },
+ { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 },
+ { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 },
+ { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 },
+ { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 },
+ { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 },
+ { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 },
+ { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 },
+ { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 },
+ { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 },
+ { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 },
+ { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 },
+ { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 },
{ X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
{ X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
{ X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
+ { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 },
+ { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 },
+ { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 },
+ { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 },
+ { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 },
+ { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 },
+ { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 },
+ { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 },
+ { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 },
+ { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 },
+ { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 },
+ { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 },
+ { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 },
+ { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 },
+ { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 },
+ { X86::VPSRLVQZ256rrkz, X86::VPSRLVQZ256rmkz, 0 },
+ { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 },
+ { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 },
{ X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
{ X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
{ X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
@@ -2481,6 +2822,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
{ X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
{ X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
+ { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 },
+ { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 },
{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
{ X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
{ X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
@@ -2513,6 +2856,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
{ X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
{ X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+ { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 },
+ { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 },
+ { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 },
+ { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 },
{ X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
{ X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
{ X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
@@ -2526,15 +2873,56 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
{ X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
{ X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
+ { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 },
+ { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
{ X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
{ X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
{ X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
+ { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 },
+ { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 },
+ { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 },
+ { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 },
+ { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 },
+ { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 },
+ { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 },
+ { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 },
+ { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 },
+ { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 },
+ { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 },
+ { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 },
+ { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 },
+ { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 },
+ { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 },
+ { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 },
+ { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 },
+ { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 },
+ { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 },
+ { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 },
+ { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 },
{ X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
{ X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
{ X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
+ { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 },
+ { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 },
+ { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 },
+ { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 },
+ { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 },
+ { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 },
+ { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 },
+ { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 },
+ { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 },
+ { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 },
+ { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 },
+ { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 },
+ { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 },
+ { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 },
+ { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 },
+ { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 },
+ { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 },
+ { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 },
{ X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
{ X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
{ X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
@@ -2553,6 +2941,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
{ X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
{ X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
+ { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 },
+ { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 },
{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
{ X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
{ X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
@@ -2563,10 +2953,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
// AVX-512 masked foldable instructions
+ { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
+ { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 },
+ { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 },
+ { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 },
+ { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 },
+ { X86::VPCONFLICTDZrrk, X86::VPCONFLICTDZrmk, 0 },
+ { X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
{ X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
{ X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
{ X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
{ X86::VPERMQZrik, X86::VPERMQZmik, 0 },
+ { X86::VPLZCNTDZrrk, X86::VPLZCNTDZrmk, 0 },
+ { X86::VPLZCNTQZrrk, X86::VPLZCNTQZrmk, 0 },
{ X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
{ X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
@@ -2579,15 +2979,36 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
{ X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
{ X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+ { X86::VPOPCNTDZrrk, X86::VPOPCNTDZrmk, 0 },
+ { X86::VPOPCNTQZrrk, X86::VPOPCNTQZrmk, 0 },
{ X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
{ X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
{ X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
+ { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 },
+ { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 },
+ { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 },
+ { X86::VPSRADZrik, X86::VPSRADZmik, 0 },
+ { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 },
+ { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 },
+ { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 },
+ { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 },
+ { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 },
// AVX-512VL 256-bit masked foldable instructions
+ { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
+ { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 },
+ { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 },
+ { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 },
+ { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 },
+ { X86::VPCONFLICTDZ256rrk, X86::VPCONFLICTDZ256rmk, 0 },
+ { X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
{ X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
{ X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
{ X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
+ { X86::VPLZCNTDZ256rrk, X86::VPLZCNTDZ256rmk, 0 },
+ { X86::VPLZCNTQZ256rrk, X86::VPLZCNTQZ256rmk, 0 },
{ X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
@@ -2603,10 +3024,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
{ X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
{ X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
+ { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 },
+ { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 },
+ { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 },
+ { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 },
+ { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 },
+ { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 },
+ { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 },
+ { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 },
+ { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 },
// AVX-512VL 128-bit masked foldable instructions
+ { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
+ { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 },
+ { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 },
+ { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 },
+ { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 },
+ { X86::VPCONFLICTDZ128rrk, X86::VPCONFLICTDZ128rmk, 0 },
+ { X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
{ X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
+ { X86::VPLZCNTDZ128rrk, X86::VPLZCNTDZ128rmk, 0 },
+ { X86::VPLZCNTQZ128rrk, X86::VPLZCNTQZ128rmk, 0 },
{ X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
@@ -2622,6 +3061,73 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
{ X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
{ X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
+ { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 },
+ { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 },
+ { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 },
+ { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 },
+ { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 },
+ { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 },
+ { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 },
+ { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 },
+ { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 },
+
+ // AVX-512 masked compare instructions
+ { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
+ { X86::VCMPPSZ128rrik, X86::VCMPPSZ128rmik, 0 },
+ { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
+ { X86::VCMPPSZ256rrik, X86::VCMPPSZ256rmik, 0 },
+ { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
+ { X86::VCMPPSZrrik, X86::VCMPPSZrmik, 0 },
+ { X86::VCMPSDZrr_Intk, X86::VCMPSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VCMPSSZrr_Intk, X86::VCMPSSZrm_Intk, TB_NO_REVERSE },
+ { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
+ { X86::VPCMPBZ256rrik, X86::VPCMPBZ256rmik, 0 },
+ { X86::VPCMPBZrrik, X86::VPCMPBZrmik, 0 },
+ { X86::VPCMPDZ128rrik, X86::VPCMPDZ128rmik, 0 },
+ { X86::VPCMPDZ256rrik, X86::VPCMPDZ256rmik, 0 },
+ { X86::VPCMPDZrrik, X86::VPCMPDZrmik, 0 },
+ { X86::VPCMPEQBZ128rrk, X86::VPCMPEQBZ128rmk, 0 },
+ { X86::VPCMPEQBZ256rrk, X86::VPCMPEQBZ256rmk, 0 },
+ { X86::VPCMPEQBZrrk, X86::VPCMPEQBZrmk, 0 },
+ { X86::VPCMPEQDZ128rrk, X86::VPCMPEQDZ128rmk, 0 },
+ { X86::VPCMPEQDZ256rrk, X86::VPCMPEQDZ256rmk, 0 },
+ { X86::VPCMPEQDZrrk, X86::VPCMPEQDZrmk, 0 },
+ { X86::VPCMPEQQZ128rrk, X86::VPCMPEQQZ128rmk, 0 },
+ { X86::VPCMPEQQZ256rrk, X86::VPCMPEQQZ256rmk, 0 },
+ { X86::VPCMPEQQZrrk, X86::VPCMPEQQZrmk, 0 },
+ { X86::VPCMPEQWZ128rrk, X86::VPCMPEQWZ128rmk, 0 },
+ { X86::VPCMPEQWZ256rrk, X86::VPCMPEQWZ256rmk, 0 },
+ { X86::VPCMPEQWZrrk, X86::VPCMPEQWZrmk, 0 },
+ { X86::VPCMPGTBZ128rrk, X86::VPCMPGTBZ128rmk, 0 },
+ { X86::VPCMPGTBZ256rrk, X86::VPCMPGTBZ256rmk, 0 },
+ { X86::VPCMPGTBZrrk, X86::VPCMPGTBZrmk, 0 },
+ { X86::VPCMPGTDZ128rrk, X86::VPCMPGTDZ128rmk, 0 },
+ { X86::VPCMPGTDZ256rrk, X86::VPCMPGTDZ256rmk, 0 },
+ { X86::VPCMPGTDZrrk, X86::VPCMPGTDZrmk, 0 },
+ { X86::VPCMPGTQZ128rrk, X86::VPCMPGTQZ128rmk, 0 },
+ { X86::VPCMPGTQZ256rrk, X86::VPCMPGTQZ256rmk, 0 },
+ { X86::VPCMPGTQZrrk, X86::VPCMPGTQZrmk, 0 },
+ { X86::VPCMPGTWZ128rrk, X86::VPCMPGTWZ128rmk, 0 },
+ { X86::VPCMPGTWZ256rrk, X86::VPCMPGTWZ256rmk, 0 },
+ { X86::VPCMPGTWZrrk, X86::VPCMPGTWZrmk, 0 },
+ { X86::VPCMPQZ128rrik, X86::VPCMPQZ128rmik, 0 },
+ { X86::VPCMPQZ256rrik, X86::VPCMPQZ256rmik, 0 },
+ { X86::VPCMPQZrrik, X86::VPCMPQZrmik, 0 },
+ { X86::VPCMPUBZ128rrik, X86::VPCMPUBZ128rmik, 0 },
+ { X86::VPCMPUBZ256rrik, X86::VPCMPUBZ256rmik, 0 },
+ { X86::VPCMPUBZrrik, X86::VPCMPUBZrmik, 0 },
+ { X86::VPCMPUDZ128rrik, X86::VPCMPUDZ128rmik, 0 },
+ { X86::VPCMPUDZ256rrik, X86::VPCMPUDZ256rmik, 0 },
+ { X86::VPCMPUDZrrik, X86::VPCMPUDZrmik, 0 },
+ { X86::VPCMPUQZ128rrik, X86::VPCMPUQZ128rmik, 0 },
+ { X86::VPCMPUQZ256rrik, X86::VPCMPUQZ256rmik, 0 },
+ { X86::VPCMPUQZrrik, X86::VPCMPUQZrmik, 0 },
+ { X86::VPCMPUWZ128rrik, X86::VPCMPUWZ128rmik, 0 },
+ { X86::VPCMPUWZ256rrik, X86::VPCMPUWZ256rmik, 0 },
+ { X86::VPCMPUWZrrik, X86::VPCMPUWZrmik, 0 },
+ { X86::VPCMPWZ128rrik, X86::VPCMPWZ128rmik, 0 },
+ { X86::VPCMPWZ256rrik, X86::VPCMPWZ256rmik, 0 },
+ { X86::VPCMPWZrrik, X86::VPCMPWZrmik, 0 },
};
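// Table semantics: each entry pairs a register-form opcode with its
// memory-folded form; the flags field adds constraints such as TB_ALIGN_*
// (minimum alignment of the folded operand), TB_FOLDED_STORE (the fold is a
// store rather than a load), and TB_NO_REVERSE (the memory form must not be
// unfolded back to registers). The loop below registers each entry,
// presumably into the 3-operand (TB_INDEX_3) fold map.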
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
@@ -2651,6 +3157,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable masked instructions
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
{ X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+ { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
{ X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
{ X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
{ X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
@@ -2659,6 +3167,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+ { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
{ X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
{ X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
{ X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
@@ -2671,14 +3181,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
+ { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 },
+ { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 },
{ X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
{ X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
{ X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
+ { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 },
+ { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 },
{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+ { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
{ X86::VORPDZrrk, X86::VORPDZrmk, 0 },
{ X86::VORPSZrrk, X86::VORPSZrmk, 0 },
+ { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 },
+ { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 },
+ { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 },
+ { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 },
{ X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
{ X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
{ X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
@@ -2692,6 +3212,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
{ X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
{ X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
+ { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 },
+ { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 },
{ X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
{ X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
{ X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
@@ -2714,9 +3236,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
+ { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
+ { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 },
+ { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 },
+ { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 },
+ { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 },
+ { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 },
+ { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 },
+ { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 },
+ { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 },
+ { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 },
+ { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 },
+ { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 },
+ { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 },
+ { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 },
+ { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 },
+ { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 },
+ { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 },
+ { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 },
+ { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 },
+ { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 },
+ { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 },
{ X86::VPORDZrrk, X86::VPORDZrmk, 0 },
{ X86::VPORQZrrk, X86::VPORQZrmk, 0 },
{ X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
+ { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 },
+ { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 },
+ { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 },
+ { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 },
+ { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 },
+ { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 },
+ { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 },
+ { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 },
+ { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 },
+ { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 },
+ { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 },
+ { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 },
+ { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 },
+ { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 },
+ { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 },
+ { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 },
+ { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 },
+ { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 },
{ X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
{ X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
{ X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
@@ -2736,8 +3297,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
{ X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
{ X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
+ { X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 },
+ { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 },
{ X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
{ X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+ { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
+ { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
{ X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
{ X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
{ X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
@@ -2772,6 +3337,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
{ X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
{ X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+ { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 },
+ { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 },
+ { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 },
+ { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 },
{ X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
{ X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
{ X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
@@ -2785,6 +3354,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
{ X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
+ { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 },
+ { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 },
{ X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
{ X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
{ X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
@@ -2807,9 +3378,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
+ { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
+ { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 },
+ { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 },
+ { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 },
+ { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 },
+ { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 },
+ { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 },
+ { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 },
+ { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 },
+ { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 },
+ { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 },
+ { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 },
+ { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 },
+ { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 },
+ { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 },
+ { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 },
+ { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 },
+ { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 },
+ { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 },
+ { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 },
+ { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 },
{ X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
{ X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
{ X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
+ { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 },
+ { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 },
+ { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 },
+ { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 },
+ { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 },
+ { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 },
+ { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 },
+ { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 },
+ { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 },
+ { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 },
+ { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 },
+ { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 },
+ { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 },
+ { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 },
+ { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 },
+ { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 },
+ { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 },
+ { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 },
{ X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
{ X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
{ X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
@@ -2830,6 +3440,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
{ X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
{ X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
+ { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 },
+ { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 },
{ X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
{ X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
{ X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
@@ -2862,6 +3474,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
{ X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
{ X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+ { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 },
+ { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 },
+ { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 },
+ { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 },
{ X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
{ X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
{ X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
@@ -2875,6 +3491,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
+ { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 },
+ { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 },
{ X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
{ X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
{ X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
@@ -2893,9 +3511,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
+ { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
+ { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 },
+ { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 },
+ { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 },
+ { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 },
+ { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 },
+ { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 },
+ { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 },
+ { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 },
+ { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 },
+ { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 },
+ { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 },
+ { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 },
+ { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 },
+ { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 },
+ { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 },
+ { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 },
+ { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 },
+ { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 },
+ { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 },
+ { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 },
{ X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
{ X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
{ X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
+ { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 },
+ { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 },
+ { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 },
+ { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 },
+ { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 },
+ { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 },
+ { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 },
+ { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 },
+ { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 },
+ { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 },
+ { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 },
+ { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 },
+ { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 },
+ { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 },
+ { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 },
+ { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 },
+ { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 },
+ { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 },
{ X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
{ X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
{ X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
@@ -2916,6 +3573,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
{ X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
{ X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
+ { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 },
+ { X86::VSHUFPSZ128rrik, X86::VSHUFPSZ128rmik, 0 },
{ X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
{ X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
{ X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
@@ -3026,6 +3685,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
    // It's not always legal to reference the low 8-bit subregister of the
    // larger register in 32-bit mode.
return false;
+ LLVM_FALLTHROUGH;
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
@@ -3063,18 +3723,13 @@ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
- if (MI.getOpcode() == getCallFrameSetupOpcode() ||
- MI.getOpcode() == getCallFrameDestroyOpcode()) {
+ if (isFrameInstr(MI)) {
unsigned StackAlign = TFI->getStackAlignment();
- int SPAdj =
- (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign;
-
- SPAdj -= MI.getOperand(1).getImm();
-
- if (MI.getOpcode() == getCallFrameSetupOpcode())
- return SPAdj;
- else
- return -SPAdj;
+ int SPAdj = alignTo(getFrameSize(MI), StackAlign);
+ SPAdj -= getFrameAdjustment(MI);
+ if (!isFrameSetup(MI))
+ SPAdj = -SPAdj;
+ return SPAdj;
}
// To know whether a call adjusts the stack, we need information
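
The rewritten getSPAdjust above folds the open-coded round-up into alignTo and routes everything through the new isFrameInstr/isFrameSetup/getFrameSize/getFrameAdjustment helpers. A minimal model of the arithmetic being preserved, assuming the same meaning as the deleted lines (frame size rounded up to the stack alignment, minus any folded adjustment, negated on the destroy side):

    // Hedged model of the computation above; alignTo rounds up to a multiple.
    #include <cstdint>
    static int64_t spAdjustForFrameInstr(int64_t FrameSize, int64_t FrameAdj,
                                         int64_t StackAlign, bool IsSetup) {
      int64_t SPAdj = (FrameSize + StackAlign - 1) / StackAlign * StackAlign;
      SPAdj -= FrameAdj;               // e.g. alignTo(20, 16) - 0 == 32
      return IsSetup ? SPAdj : -SPAdj; // setup grows, destroy shrinks
    }
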
@@ -3569,7 +4224,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
const DebugLoc &DL = Orig.getDebugLoc();
BuildMI(MBB, I, DL, get(X86::MOV32ri))
- .addOperand(Orig.getOperand(0))
+ .add(Orig.getOperand(0))
.addImm(Value);
} else {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
@@ -3654,10 +4309,10 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
- MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
- get(TargetOpcode::COPY))
- .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
- .addOperand(Src);
+ MachineInstr *Copy =
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
+ .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+ .add(Src);
// Which is obviously going to be dead after we're done with it.
isKill = true;
@@ -3823,10 +4478,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
- .addOperand(Src)
+ .add(Src)
.addImm(0)
.addReg(0);
break;
@@ -3848,14 +4503,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstrBuilder MIB =
BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
.addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
.addImm(0)
.addReg(0);
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = MIB;
break;
@@ -3869,10 +4524,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
+ .add(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
- .addOperand(Src)
+ .add(Src)
.addImm(0)
.addReg(0);
break;
@@ -3891,11 +4546,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstrBuilder MIB =
BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(SrcReg,
getKillRegState(isKill) | getUndefRegState(isUndef));
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = addOffset(MIB, 1);
break;
@@ -3905,10 +4560,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
- .addOperand(Src),
- 1);
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), 1);
break;
case X86::DEC64r:
case X86::DEC32r: {
@@ -3924,11 +4577,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(SrcReg, getUndefRegState(isUndef) |
getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = addOffset(MIB, -1);
@@ -3939,10 +4592,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
- .addOperand(Src),
- -1);
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src), -1);
break;
case X86::ADD64rr:
case X86::ADD64rr_DB:
@@ -3970,12 +4621,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
return nullptr;
- MachineInstrBuilder MIB =
- BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest);
+ MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
if (ImplicitOp2.getReg() != 0)
- MIB.addOperand(ImplicitOp2);
+ MIB.add(ImplicitOp2);
NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
@@ -3995,9 +4645,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Src2 = MI.getOperand(2).getReg();
bool isKill2 = MI.getOperand(2).isKill();
- NewMI = addRegReg(
- BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest),
- Src.getReg(), Src.isKill(), Src2, isKill2);
+ NewMI = addRegReg(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
// Preserve undefness of the operands.
bool isUndef = MI.getOperand(1).isUndef();
@@ -4014,10 +4663,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD64ri32_DB:
case X86::ADD64ri8_DB:
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
- .addOperand(Dest)
- .addOperand(Src),
- MI.getOperand(2));
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
+ MI.getOperand(2));
break;
case X86::ADD32ri:
case X86::ADD32ri8:
@@ -4034,11 +4682,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
- .addOperand(Dest)
+ .add(Dest)
.addReg(SrcReg, getUndefRegState(isUndef) |
getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
- MIB.addOperand(ImplicitOp);
+ MIB.add(ImplicitOp);
NewMI = addOffset(MIB, MI.getOperand(2));
break;
@@ -4051,12 +4699,136 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
- .addOperand(Dest)
- .addOperand(Src),
- MI.getOperand(2));
+ NewMI = addOffset(
+ BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).add(Dest).add(Src),
+ MI.getOperand(2));
+ break;
+
+ case X86::VMOVDQU8Z128rmk:
+ case X86::VMOVDQU8Z256rmk:
+ case X86::VMOVDQU8Zrmk:
+ case X86::VMOVDQU16Z128rmk:
+ case X86::VMOVDQU16Z256rmk:
+ case X86::VMOVDQU16Zrmk:
+ case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
+ case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
+ case X86::VMOVDQU32Zrmk: case X86::VMOVDQA32Zrmk:
+ case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
+ case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
+ case X86::VMOVDQU64Zrmk: case X86::VMOVDQA64Zrmk:
+ case X86::VMOVUPDZ128rmk: case X86::VMOVAPDZ128rmk:
+ case X86::VMOVUPDZ256rmk: case X86::VMOVAPDZ256rmk:
+ case X86::VMOVUPDZrmk: case X86::VMOVAPDZrmk:
+ case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
+ case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
+ case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ }
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5))
+ .add(MI.getOperand(6))
+ .add(MI.getOperand(7));
+ break;
+ }
+ case X86::VMOVDQU8Z128rrk:
+ case X86::VMOVDQU8Z256rrk:
+ case X86::VMOVDQU8Zrrk:
+ case X86::VMOVDQU16Z128rrk:
+ case X86::VMOVDQU16Z256rrk:
+ case X86::VMOVDQU16Zrrk:
+ case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
+ case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
+ case X86::VMOVDQU32Zrrk: case X86::VMOVDQA32Zrrk:
+ case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
+ case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
+ case X86::VMOVDQU64Zrrk: case X86::VMOVDQA64Zrrk:
+ case X86::VMOVUPDZ128rrk: case X86::VMOVAPDZ128rrk:
+ case X86::VMOVUPDZ256rrk: case X86::VMOVAPDZ256rrk:
+ case X86::VMOVUPDZrrk: case X86::VMOVAPDZrrk:
+ case X86::VMOVUPSZ128rrk: case X86::VMOVAPSZ128rrk:
+ case X86::VMOVUPSZ256rrk: case X86::VMOVAPSZ256rrk:
+ case X86::VMOVUPSZrrk: case X86::VMOVAPSZrrk: {
+ unsigned Opc;
+ switch (MIOpc) {
+ default: llvm_unreachable("Unreachable!");
+ case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break;
+ case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break;
+ case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break;
+ case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break;
+ case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break;
+ case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break;
+ case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break;
+ case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break;
+ case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break;
+ case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break;
+ case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break;
+ case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break;
+ case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break;
+ case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break;
+ case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break;
+ case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break;
+ case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break;
+ case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break;
+ }
+
+ NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
+ .add(Dest)
+ .add(MI.getOperand(2))
+ .add(Src)
+ .add(MI.getOperand(3));
break;
}
+ }
if (!NewMI) return nullptr;
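
The two new case clusters above teach convertToThreeAddress to rewrite merge-masked VMOV* copies (the *rmk/*rrk forms) into the matching VBLENDM/VPBLENDM instructions: the move's tied pass-through operand becomes an ordinary blend input, so the destination is no longer constrained to equal it. A scalar model of the equivalence, one lane at a time (names illustrative, not from the source):

    // Merge-masked move: the destination doubles as the pass-through (tied).
    static unsigned maskedMove(bool M, unsigned DstOld, unsigned Src) {
      return M ? Src : DstOld;
    }
    // Blend: the pass-through is just another input, so Dst is unconstrained.
    static unsigned blendm(bool M, unsigned PassThru, unsigned Src) {
      return M ? Src : PassThru;
    }

Untying the operands hands the register allocator a genuine three-address form and can save a copy when the pass-through value stays live.
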
@@ -4337,6 +5109,18 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
+ case X86::PFSUBrr:
+ case X86::PFSUBRrr: {
+ // PFSUB x, y: x = x - y
+ // PFSUBR x, y: x = y - x
+ unsigned Opc =
+ (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
+ auto &WorkingMI = cloneIfNew(MI);
+ WorkingMI.setDesc(get(Opc));
+ return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+ OpIdx1, OpIdx2);
+ }
case X86::BLENDPDrri:
case X86::BLENDPSrri:
case X86::PBLENDWrri:
@@ -4446,20 +5230,32 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
return nullptr;
}
}
- case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
- case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
- case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
- case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
- case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
- case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
- case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
- case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
- case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
- case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
- case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
- case X86::VPCMPWZrri: case X86::VPCMPUWZrri: {
+ case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
+ case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
+ case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
+ case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
+ case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
+ case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
+ case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
+ case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
+ case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
+ case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
+ case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
+ case X86::VPCMPWZrri: case X86::VPCMPUWZrri:
+ case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
+ case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
+ case X86::VPCMPBZrrik: case X86::VPCMPUBZrrik:
+ case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
+ case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
+ case X86::VPCMPDZrrik: case X86::VPCMPUDZrrik:
+ case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
+ case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
+ case X86::VPCMPQZrrik: case X86::VPCMPUQZrrik:
+ case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
+ case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
+ case X86::VPCMPWZrrik: case X86::VPCMPUWZrrik: {
// Flip comparison mode immediate (if necessary).
- unsigned Imm = MI.getOperand(3).getImm() & 0x7;
+ unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
switch (Imm) {
default: llvm_unreachable("Unreachable!");
case 0x01: Imm = 0x06; break; // LT -> NLE
@@ -4473,7 +5269,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
break;
}
auto &WorkingMI = cloneIfNew(MI);
- WorkingMI.getOperand(3).setImm(Imm);
+ WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
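
Commuting a VPCMP reverses the comparison, so the 3-bit predicate immediate must be remapped; the newly handled masked (*rrik) forms carry a mask operand ahead of the immediate, which is why the code above now indexes the immediate as the last operand rather than operand 3. Only the LT case is visible in this excerpt; assuming the usual AVX-512 VPCMP predicate encoding, the full flip (which the elided cases presumably mirror) would be:

    // Hedged sketch: "a PRED b" becomes "b PRED a" after the operand swap.
    static unsigned flipVPCMPPredicate(unsigned Imm) {
      switch (Imm & 0x7) {
      case 0x01: return 0x06;      // LT  -> NLE (b >  a)
      case 0x02: return 0x05;      // LE  -> NLT (b >= a)
      case 0x05: return 0x02;      // NLT -> LE
      case 0x06: return 0x01;      // NLE -> LT
      default:   return Imm & 0x7; // EQ/NE/FALSE/TRUE are symmetric
      }
    }
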
@@ -4606,18 +5402,30 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
- case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
- case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
- case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
- case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
- case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
- case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrik:
+ case X86::VPTERNLOGDZ128rrik:
+ case X86::VPTERNLOGDZ256rrik:
+ case X86::VPTERNLOGQZrrik:
+ case X86::VPTERNLOGQZ128rrik:
+ case X86::VPTERNLOGQZ256rrik:
case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
- case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+ case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ case X86::VPTERNLOGDZ128rmbi:
+ case X86::VPTERNLOGDZ256rmbi:
+ case X86::VPTERNLOGDZrmbi:
+ case X86::VPTERNLOGQZ128rmbi:
+ case X86::VPTERNLOGQZ256rmbi:
+ case X86::VPTERNLOGQZrmbi:
+ case X86::VPTERNLOGDZ128rmbikz:
+ case X86::VPTERNLOGDZ256rmbikz:
+ case X86::VPTERNLOGDZrmbikz:
+ case X86::VPTERNLOGQZ128rmbikz:
+ case X86::VPTERNLOGQZ256rmbikz:
+ case X86::VPTERNLOGQZrmbikz: {
auto &WorkingMI = cloneIfNew(MI);
if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
return nullptr;
@@ -4798,18 +5606,30 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
- case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
- case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
- case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
- case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
- case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
- case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
+ case X86::VPTERNLOGDZrrik:
+ case X86::VPTERNLOGDZ128rrik:
+ case X86::VPTERNLOGDZ256rrik:
+ case X86::VPTERNLOGQZrrik:
+ case X86::VPTERNLOGQZ128rrik:
+ case X86::VPTERNLOGQZ256rrik:
case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+ case X86::VPTERNLOGDZ128rmbi:
+ case X86::VPTERNLOGDZ256rmbi:
+ case X86::VPTERNLOGDZrmbi:
+ case X86::VPTERNLOGQZ128rmbi:
+ case X86::VPTERNLOGQZ256rmbi:
+ case X86::VPTERNLOGQZrmbi:
+ case X86::VPTERNLOGDZ128rmbikz:
+ case X86::VPTERNLOGDZ256rmbikz:
+ case X86::VPTERNLOGDZrmbikz:
+ case X86::VPTERNLOGQZ128rmbikz:
+ case X86::VPTERNLOGQZ256rmbikz:
+ case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
default:
const X86InstrFMA3Group *FMA3Group =
@@ -5022,6 +5842,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
}
}
+std::pair<X86::CondCode, bool>
+X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
+ X86::CondCode CC = X86::COND_INVALID;
+ bool NeedSwap = false;
+ switch (Predicate) {
+ default: break;
+ // Floating-point Predicates
+ case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+ case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+ case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+ case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+ case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+ case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+ case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
+ case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+ // Integer Predicates
+ case CmpInst::ICMP_EQ: CC = X86::COND_E; break;
+ case CmpInst::ICMP_NE: CC = X86::COND_NE; break;
+ case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+ case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+ case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+ case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+ case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+ case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+ case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+ case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+ }
+
+ return std::make_pair(CC, NeedSwap);
+}
+
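
The new X86::getX86ConditionCode helper centralizes the CmpInst-predicate to condition-code mapping; NeedSwap tells the caller to commute the compare's operands first, since the ordered less-than family is expressed through "above"-style codes on swapped operands. A minimal usage sketch (caller shape assumed; std::tie needs <tuple>):

    X86::CondCode CC;
    bool NeedSwap;
    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CmpInst::FCMP_OLT);
    // CC == X86::COND_A and NeedSwap == true: emit the compare with its
    // operands swapped, then test "above".
    if (CC == X86::COND_INVALID)
      return false; // FCMP_OEQ/FCMP_UNE would need two branches; bail out.
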
/// Return a set opcode for the given condition and
/// whether it has memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
@@ -5108,6 +5966,95 @@ bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
return !isPredicated(MI);
}
+bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case X86::TCRETURNdi:
+ case X86::TCRETURNri:
+ case X86::TCRETURNmi:
+ case X86::TCRETURNdi64:
+ case X86::TCRETURNri64:
+ case X86::TCRETURNmi64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool X86InstrInfo::canMakeTailCallConditional(
+ SmallVectorImpl<MachineOperand> &BranchCond,
+ const MachineInstr &TailCall) const {
+ if (TailCall.getOpcode() != X86::TCRETURNdi &&
+ TailCall.getOpcode() != X86::TCRETURNdi64) {
+ // Only direct calls can be done with a conditional branch.
+ return false;
+ }
+
+ const MachineFunction *MF = TailCall.getParent()->getParent();
+ if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
+ // Conditional tail calls confuse the Win64 unwinder.
+ return false;
+ }
+
+ assert(BranchCond.size() == 1);
+ if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
+ // Can't make a conditional tail call with this condition.
+ return false;
+ }
+
+ const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+ if (X86FI->getTCReturnAddrDelta() != 0 ||
+ TailCall.getOperand(1).getImm() != 0) {
+ // A conditional tail call cannot do any stack adjustment.
+ return false;
+ }
+
+ return true;
+}
+
+void X86InstrInfo::replaceBranchWithTailCall(
+ MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
+ const MachineInstr &TailCall) const {
+ assert(canMakeTailCallConditional(BranchCond, TailCall));
+
+ MachineBasicBlock::iterator I = MBB.end();
+ while (I != MBB.begin()) {
+ --I;
+ if (I->isDebugValue())
+ continue;
+    assert(I->isBranch() && "Can't find the branch to replace!");
+
+ X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
+ assert(BranchCond.size() == 1);
+ if (CC != BranchCond[0].getImm())
+ continue;
+
+ break;
+ }
+
+ unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
+ : X86::TCRETURNdi64cc;
+
+ auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
+ MIB->addOperand(TailCall.getOperand(0)); // Destination.
+ MIB.addImm(0); // Stack offset (not used).
+ MIB->addOperand(BranchCond[0]); // Condition.
+ MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
+
+ // Add implicit uses and defs of all live regs potentially clobbered by the
+ // call. This way they still appear live across the call.
+ LivePhysRegs LiveRegs(getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers;
+ LiveRegs.stepForward(*MIB, Clobbers);
+ for (const auto &C : Clobbers) {
+ MIB.addReg(C.first, RegState::Implicit);
+ MIB.addReg(C.first, RegState::Implicit | RegState::Define);
+ }
+
+ I->eraseFromParent();
+}
+
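
The imp-use/imp-def pairs added at the end of replaceBranchWithTailCall deserve a note: a conditional tail call may fall through, so every register the call could clobber must still look live across it. Adding both an implicit use (live if the branch is not taken) and an implicit def (clobbered if it is) keeps the verifier and later liveness queries honest. The shape of the replacement, with illustrative operand values:

    // before:  JNE_1 <target>;  ...;  TCRETURNdi @f, 0, <regmask>
    // after:   TCRETURNdicc @f, 0, <COND_NE>, <regmask>,
    //          implicit %eax, implicit-def %eax, ...  // live-across regs
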
+// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may
// not be a fallthrough MBB now due to layout changes). Return nullptr if the
// fallthrough MBB cannot be identified.
@@ -5494,9 +6441,11 @@ void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
assert(Cond.size() == 1 && "Invalid Cond array");
unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
- MRI.getRegClass(DstReg)->getSize(),
+ TRI.getRegSizeInBits(RC) / 8,
false /*HasMemoryOperand*/);
BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
}
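
The insertSelect change above is one instance of a pattern repeated throughout this revision: TargetRegisterClass::getSize(), which answered in bytes, is replaced by TargetRegisterInfo::getRegSizeInBits(), so each byte-oriented consumer divides by 8. The recurring idiom, extracted:

    const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
    const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
    unsigned SizeInBytes = TRI.getRegSizeInBits(RC) / 8; // was RC.getSize()
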
@@ -5514,8 +6463,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
// SrcReg(MaskReg) -> DestReg(GR64)
// SrcReg(MaskReg) -> DestReg(GR32)
- // SrcReg(MaskReg) -> DestReg(GR16)
- // SrcReg(MaskReg) -> DestReg(GR8)
  // All KMASK RegClasses hold the same k registers, so we can test against any of them.
if (X86::VK16RegClass.contains(SrcReg)) {
@@ -5525,20 +6472,10 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
}
if (X86::GR32RegClass.contains(DestReg))
return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
- if (X86::GR16RegClass.contains(DestReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return X86::KMOVWrk;
- }
- if (X86::GR8RegClass.contains(DestReg)) {
- DestReg = getX86SubSuperRegister(DestReg, 32);
- return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
- }
}
// SrcReg(GR64) -> DestReg(MaskReg)
// SrcReg(GR32) -> DestReg(MaskReg)
- // SrcReg(GR16) -> DestReg(MaskReg)
- // SrcReg(GR8) -> DestReg(MaskReg)
  // All KMASK RegClasses hold the same k registers, so we can test against any of them.
if (X86::VK16RegClass.contains(DestReg)) {
@@ -5548,14 +6485,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
}
if (X86::GR32RegClass.contains(SrcReg))
return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
- if (X86::GR16RegClass.contains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return X86::KMOVWkr;
- }
- if (X86::GR8RegClass.contains(SrcReg)) {
- SrcReg = getX86SubSuperRegister(SrcReg, 32);
- return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr;
- }
}
@@ -5729,9 +6658,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// first frame index.
// See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
- const TargetRegisterInfo *TRI = &getRegisterInfo();
+ const TargetRegisterInfo &TRI = getRegisterInfo();
MachineBasicBlock::LivenessQueryResult LQR =
- MBB.computeRegisterLiveness(TRI, AX, MI);
+ MBB.computeRegisterLiveness(&TRI, AX, MI);
// We do not want to save and restore AX if we do not have to.
    // Moreover, if we did so while AX is dead, we would need to set
// an undef flag on the use of AX, otherwise the verifier will
@@ -5748,7 +6677,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
    // AX contains the topmost register in the aliasing hierarchy.
// It may not be live, but one of its aliases may be.
- for (MCRegAliasIterator AI(AX, TRI, true);
+ for (MCRegAliasIterator AI(AX, &TRI, true);
AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
: MachineBasicBlock::LQR_Dead;
@@ -5787,7 +6716,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
bool HasAVX512 = STI.hasAVX512();
bool HasVLX = STI.hasVLX();
- switch (RC->getSize()) {
+ switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
default:
llvm_unreachable("Unknown spill size");
case 1:
@@ -5833,28 +6762,36 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
return load ? X86::LD_Fp80m : X86::ST_FpP80m;
case 16: {
- assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
- // If stack is realigned we can use aligned stores.
- if (isStackAligned)
- return load ?
- (HasVLX ? X86::VMOVAPSZ128rm :
- HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
- HasAVX ? X86::VMOVAPSrm :
- X86::MOVAPSrm):
- (HasVLX ? X86::VMOVAPSZ128mr :
- HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
- HasAVX ? X86::VMOVAPSmr :
- X86::MOVAPSmr);
- else
- return load ?
- (HasVLX ? X86::VMOVUPSZ128rm :
- HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
- HasAVX ? X86::VMOVUPSrm :
- X86::MOVUPSrm):
- (HasVLX ? X86::VMOVUPSZ128mr :
- HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
- HasAVX ? X86::VMOVUPSmr :
- X86::MOVUPSmr);
+ if (X86::VR128XRegClass.hasSubClassEq(RC)) {
+ // If stack is realigned we can use aligned stores.
+ if (isStackAligned)
+ return load ?
+ (HasVLX ? X86::VMOVAPSZ128rm :
+ HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVAPSrm :
+ X86::MOVAPSrm):
+ (HasVLX ? X86::VMOVAPSZ128mr :
+ HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVAPSmr :
+ X86::MOVAPSmr);
+ else
+ return load ?
+ (HasVLX ? X86::VMOVUPSZ128rm :
+ HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
+ HasAVX ? X86::VMOVUPSrm :
+ X86::MOVUPSrm):
+ (HasVLX ? X86::VMOVUPSZ128mr :
+ HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
+ HasAVX ? X86::VMOVUPSmr :
+ X86::MOVUPSmr);
+ }
+ if (X86::BNDRRegClass.hasSubClassEq(RC)) {
+ if (STI.is64Bit())
+ return load ? X86::BNDMOVRM64rm : X86::BNDMOVMR64mr;
+ else
+ return load ? X86::BNDMOVRM32rm : X86::BNDMOVMR32mr;
+ }
+ llvm_unreachable("Unknown 16-byte regclass");
}
case 32:
assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
@@ -5939,9 +6876,9 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
- assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() &&
+ assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
"Stack slot too small for store");
- unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
@@ -5958,14 +6895,15 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
+ MIB.add(Addr[i]);
MIB.addReg(SrcReg, getKillRegState(isKill));
(*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
@@ -5978,7 +6916,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
- unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
@@ -5993,14 +6931,15 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
- unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
- MIB.addOperand(Addr[i]);
+ MIB.add(Addr[i]);
(*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
}
@@ -6017,12 +6956,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP8ri:
- if (!MI.getOperand(1).isImm())
- return false;
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
- CmpMask = ~0;
- CmpValue = MI.getOperand(1).getImm();
+ if (MI.getOperand(1).isImm()) {
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(1).getImm();
+ } else {
+ CmpMask = CmpValue = 0;
+ }
return true;
// A SUB can be used to perform comparison.
case X86::SUB64rm:
@@ -6031,7 +6972,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB8rm:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
- CmpMask = ~0;
+ CmpMask = 0;
CmpValue = 0;
return true;
case X86::SUB64rr:
@@ -6040,7 +6981,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB8rr:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
- CmpMask = ~0;
+ CmpMask = 0;
CmpValue = 0;
return true;
case X86::SUB64ri32:
@@ -6050,12 +6991,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
- if (!MI.getOperand(2).isImm())
- return false;
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
- CmpMask = ~0;
- CmpValue = MI.getOperand(2).getImm();
+ if (MI.getOperand(2).isImm()) {
+ CmpMask = ~0;
+ CmpValue = MI.getOperand(2).getImm();
+ } else {
+ CmpMask = CmpValue = 0;
+ }
return true;
case X86::CMP64rr:
case X86::CMP32rr:
@@ -6063,7 +7006,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case X86::CMP8rr:
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = MI.getOperand(1).getReg();
- CmpMask = ~0;
+ CmpMask = 0;
CmpValue = 0;
return true;
case X86::TEST8rr:
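
With this hunk, analyzeCompare no longer rejects CMP/SUB forms whose second operand is not an immediate; it reports them with CmpMask = CmpValue = 0, reserving CmpMask = ~0 for a genuinely captured immediate. Downstream code must therefore test the mask rather than the value when looking for a compare against zero, exactly as the optimizeCompareInstr hunk further down does. The contract, restated:

    // CmpMask == ~0: CmpValue holds the compare's immediate (it may be 0).
    // CmpMask ==  0: no immediate is available; CmpValue means nothing.
    bool IsCmpZero = (CmpMask != 0 && CmpValue == 0); // true compare-with-zero
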
@@ -6089,8 +7032,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
 /// SrcReg, SrcReg2: register operands for FlagI.
 /// ImmMask, ImmValue: mask and immediate for FlagI if it takes an immediate.
inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
- unsigned SrcReg2, int ImmValue,
- MachineInstr &OI) {
+ unsigned SrcReg2, int ImmMask,
+ int ImmValue, MachineInstr &OI) {
if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
(FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
(FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
@@ -6101,7 +7044,8 @@ inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
OI.getOperand(2).getReg() == SrcReg)))
return true;
- if (((FlagI.getOpcode() == X86::CMP64ri32 &&
+ if (ImmMask != 0 &&
+ ((FlagI.getOpcode() == X86::CMP64ri32 &&
OI.getOpcode() == X86::SUB64ri32) ||
(FlagI.getOpcode() == X86::CMP64ri8 &&
OI.getOpcode() == X86::SUB64ri8) ||
@@ -6288,7 +7232,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
// If we are comparing against zero, check whether we can use MI to update
// EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
- bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+ bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
return false;
@@ -6338,8 +7282,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
for (; RI != RE; ++RI) {
MachineInstr &Instr = *RI;
// Check whether CmpInstr can be made redundant by the current instruction.
- if (!IsCmpZero &&
- isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+ if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
+ CmpValue, Instr)) {
Sub = &Instr;
break;
}
@@ -6447,7 +7391,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
else {
unsigned DstReg = Instr.getOperand(0).getReg();
- NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ NewOpc = getCMovFromCond(NewCC, TRI->getRegSizeInBits(*DstRC)/8,
HasMemoryOperand);
}
@@ -6764,18 +7709,44 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
assert(HasAVX && "AVX not supported");
return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
case X86::AVX512_128_SET0:
- return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
- case X86::AVX512_256_SET0:
- return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
+ case X86::AVX512_FsFLD0SS:
+ case X86::AVX512_FsFLD0SD: {
+ bool HasVLX = Subtarget.hasVLX();
+ unsigned SrcReg = MIB->getOperand(0).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
+ return Expand2AddrUndef(MIB,
+ get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
+ // Extended register without VLX. Use a larger XOR.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(SrcReg);
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ }
+ case X86::AVX512_256_SET0: {
+ bool HasVLX = Subtarget.hasVLX();
+ unsigned SrcReg = MIB->getOperand(0).getReg();
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
+ return Expand2AddrUndef(MIB,
+ get(HasVLX ? X86::VPXORDZ256rr : X86::VXORPSYrr));
+ // Extended register without VLX. Use a larger XOR.
+ SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
+ MIB->getOperand(0).setReg(SrcReg);
+ return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+ }
case X86::AVX512_512_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
- case X86::AVX512_FsFLD0SS:
- case X86::AVX512_FsFLD0SD:
- return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+ case X86::AVX1_SETALLONES: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+ MIB->setDesc(get(X86::VCMPPSYrri));
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+ return true;
+ }
case X86::AVX512_512_SETALLONES: {
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
@@ -6838,11 +7809,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// registers, since it is not usable as a write mask.
// FIXME: A more advanced approach would be to choose the best input mask
// register based on context.
- case X86::KSET0B:
case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
- case X86::KSET1B:
case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
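
Two expansion subtleties above are worth spelling out. First, zeroing xmm16-xmm31 or ymm16-ymm31 without AVX512VL: no VEX encoding can name those registers and there is no EVEX 128/256-bit XOR without VLX, so the pseudo widens to the containing ZMM register and issues a 512-bit VPXORD, which zeroes the narrow view as well. Second, AVX1_SETALLONES materializes all-ones on AVX1-only targets via VCMPPSY with the "true" predicate (immediate 0xf), whose result is all-1 bits regardless of the undef inputs. The widening step in isolation (names taken from the hunk):

    if (!HasVLX && TRI->getEncodingValue(SrcReg) >= 16) {
      // No VEX/EVEX-128 zeroing idiom can reach %xmm16+; clear the whole
      // 512-bit super-register instead.
      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
                                        &X86::VR512RegClass);
      MIB->getOperand(0).setReg(SrcReg);
      return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
    }
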
@@ -6860,7 +7829,7 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
if (NumAddrOps < 4) {
    // FrameIndex only - add an immediate offset (whether it's zero or not).
for (unsigned i = 0; i != NumAddrOps; ++i)
- MIB.addOperand(MOs[i]);
+ MIB.add(MOs[i]);
addOffset(MIB, PtrOffset);
} else {
// General Memory Addressing - we need to add any offset to an existing
@@ -6871,7 +7840,7 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
if (i == 3 && PtrOffset != 0) {
MIB.addDisp(MO, PtrOffset);
} else {
- MIB.addOperand(MO);
+ MIB.add(MO);
}
}
}
@@ -6893,11 +7862,11 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
unsigned NumOps = MI.getDesc().getNumOperands() - 2;
for (unsigned i = 0; i != NumOps; ++i) {
MachineOperand &MO = MI.getOperand(i + 2);
- MIB.addOperand(MO);
+ MIB.add(MO);
}
for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
- MIB.addOperand(MO);
+ MIB.add(MO);
}
MachineBasicBlock *MBB = InsertPt->getParent();
@@ -6922,7 +7891,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
assert(MO.isReg() && "Expected to fold into reg operand!");
addOperands(MIB, MOs, PtrOffset);
} else {
- MIB.addOperand(MO);
+ MIB.add(MO);
}
}
@@ -6958,7 +7927,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
unsigned DstIdx = (Imm >> 4) & 3;
unsigned SrcIdx = (Imm >> 6) & 3;
- unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if (Size <= RCSize && 4 <= Align) {
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
@@ -6980,7 +7951,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
// To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
// TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
if (OpNum == 2) {
- unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if (Size <= RCSize && 8 <= Align) {
unsigned NewOpCode =
(MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
@@ -7069,7 +8042,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
return nullptr;
bool NarrowToMOV32rm = false;
if (Size) {
- unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
+ &RI, MF);
+ unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if (Size < RCSize) {
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
@@ -7226,7 +8202,7 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
return false;
}
-/// Inform the ExeDepsFix pass how many idle
+/// Inform the ExecutionDepsFix pass how many idle
/// instructions we would like before a partial register update.
unsigned X86InstrInfo::getPartialRegUpdateClearance(
const MachineInstr &MI, unsigned OpNum,
@@ -7344,11 +8320,15 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
case X86::VCVTUSI642SDZrrb_Int:
case X86::VCVTUSI642SDZrm_Int:
case X86::VCVTSD2SSZrr:
- case X86::VCVTSD2SSZrrb:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrrb_Int:
case X86::VCVTSD2SSZrm:
+ case X86::VCVTSD2SSZrm_Int:
case X86::VCVTSS2SDZrr:
- case X86::VCVTSS2SDZrrb:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrrb_Int:
case X86::VCVTSS2SDZrm:
+ case X86::VCVTSS2SDZrm_Int:
case X86::VRNDSCALESDr:
case X86::VRNDSCALESDrb:
case X86::VRNDSCALESDm:
@@ -7375,8 +8355,8 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
return false;
}
-/// Inform the ExeDepsFix pass how many idle instructions we would like before
-/// certain undef register reads.
+/// Inform the ExecutionDepsFix pass how many idle instructions we would like
+/// before certain undef register reads.
///
/// This catches the VCVTSI2SD family of instructions:
///
@@ -7506,11 +8486,13 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
const MachineFunction &MF) {
unsigned Opc = LoadMI.getOpcode();
unsigned UserOpc = UserMI.getOpcode();
- unsigned RegSize =
- MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC =
+ MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
+ unsigned RegSize = TRI.getRegSizeInBits(*RC);
if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
- RegSize > 4) {
+ RegSize > 32) {
    // These instructions only load 32 bits, so we can't fold them if the
    // destination register is wider than 32 bits (4 bytes) and the user
    // instruction isn't scalar (SS).
@@ -7522,6 +8504,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
+ case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+ case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
+ case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
+ case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
+ case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
+ case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
@@ -7536,6 +8524,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
+ case X86::VFMADD132SSZr_Intk: case X86::VFNMADD132SSZr_Intk:
+ case X86::VFMADD213SSZr_Intk: case X86::VFNMADD213SSZr_Intk:
+ case X86::VFMADD231SSZr_Intk: case X86::VFNMADD231SSZr_Intk:
+ case X86::VFMSUB132SSZr_Intk: case X86::VFNMSUB132SSZr_Intk:
+ case X86::VFMSUB213SSZr_Intk: case X86::VFNMSUB213SSZr_Intk:
+ case X86::VFMSUB231SSZr_Intk: case X86::VFNMSUB231SSZr_Intk:
+ case X86::VFMADD132SSZr_Intkz: case X86::VFNMADD132SSZr_Intkz:
+ case X86::VFMADD213SSZr_Intkz: case X86::VFNMADD213SSZr_Intkz:
+ case X86::VFMADD231SSZr_Intkz: case X86::VFNMADD231SSZr_Intkz:
+ case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz:
+ case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz:
+ case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz:
return false;
default:
return true;
@@ -7543,7 +8543,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
}
if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
- RegSize > 8) {
+ RegSize > 64) {
    // These instructions only load 64 bits, so we can't fold them if the
    // destination register is wider than 64 bits (8 bytes) and the user
    // instruction isn't scalar (SD).
@@ -7555,6 +8555,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
+ case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+ case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
+ case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
+ case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
+ case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
+ case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
@@ -7569,6 +8575,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
+ case X86::VFMADD132SDZr_Intk: case X86::VFNMADD132SDZr_Intk:
+ case X86::VFMADD213SDZr_Intk: case X86::VFNMADD213SDZr_Intk:
+ case X86::VFMADD231SDZr_Intk: case X86::VFNMADD231SDZr_Intk:
+ case X86::VFMSUB132SDZr_Intk: case X86::VFNMSUB132SDZr_Intk:
+ case X86::VFMSUB213SDZr_Intk: case X86::VFNMSUB213SDZr_Intk:
+ case X86::VFMSUB231SDZr_Intk: case X86::VFNMSUB231SDZr_Intk:
+ case X86::VFMADD132SDZr_Intkz: case X86::VFNMADD132SDZr_Intkz:
+ case X86::VFMADD213SDZr_Intkz: case X86::VFNMADD213SDZr_Intkz:
+ case X86::VFMADD231SDZr_Intkz: case X86::VFNMADD231SDZr_Intkz:
+ case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz:
+ case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz:
+ case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz:
return false;
default:
return true;
@@ -7617,6 +8635,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Alignment = 64;
break;
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
Alignment = 32;
@@ -7662,6 +8681,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
+ case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
@@ -7703,13 +8723,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
- Opc == X86::AVX512_256_SET0)
+ Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
- Opc == X86::AVX512_512_SETALLONES);
+ Opc == X86::AVX512_512_SETALLONES ||
+ Opc == X86::AVX1_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
@@ -7800,11 +8821,11 @@ bool X86InstrInfo::unfoldMemoryOperand(
if (FoldedStore)
MIB.addReg(Reg, RegState::Define);
for (MachineOperand &BeforeOp : BeforeOps)
- MIB.addOperand(BeforeOp);
+ MIB.add(BeforeOp);
if (FoldedLoad)
MIB.addReg(Reg);
for (MachineOperand &AfterOp : AfterOps)
- MIB.addOperand(AfterOp);
+ MIB.add(AfterOp);
for (MachineOperand &ImpOp : ImpOps) {
MIB.addReg(ImpOp.getReg(),
getDefRegState(ImpOp.isDef()) |
@@ -7870,6 +8891,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
bool FoldedStore = I->second.second & TB_FOLDED_STORE;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
unsigned NumDefs = MCID.NumDefs;
std::vector<SDValue> AddrOps;
@@ -7892,7 +8914,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
// Emit the load instruction.
SDNode *Load = nullptr;
if (FoldedLoad) {
- EVT VT = *RC->vt_begin();
+ EVT VT = *TRI.legalclasstypes_begin(*RC);
std::pair<MachineInstr::mmo_iterator,
MachineInstr::mmo_iterator> MMOs =
MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
@@ -7904,7 +8926,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
@@ -7920,7 +8942,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
const TargetRegisterClass *DstRC = nullptr;
if (MCID.getNumDefs() > 0) {
DstRC = getRegClass(MCID, 0, &RI, MF);
- VTs.push_back(*DstRC->vt_begin());
+ VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
}
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
EVT VT = N->getValueType(i);
@@ -7949,7 +8971,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
- unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
+ unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
SDNode *Store =
@@ -8143,28 +9165,29 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
break;
}
- // Check if chain operands and base addresses match.
- if (Load1->getOperand(0) != Load2->getOperand(0) ||
- Load1->getOperand(5) != Load2->getOperand(5))
+ // Lambda to check whether both loads have the same value at an operand index.
+ auto HasSameOp = [&](int I) {
+ return Load1->getOperand(I) == Load2->getOperand(I);
+ };
+
+ // All address operands except the displacement should match.
+ if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
+ !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
return false;
- // Segment operands should match as well.
- if (Load1->getOperand(4) != Load2->getOperand(4))
+
+ // The chain operand must be the same.
+ if (!HasSameOp(5))
return false;
- // Scale should be 1, Index should be Reg0.
- if (Load1->getOperand(1) == Load2->getOperand(1) &&
- Load1->getOperand(2) == Load2->getOperand(2)) {
- if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
- return false;
- // Now let's examine the displacements.
- if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
- isa<ConstantSDNode>(Load2->getOperand(3))) {
- Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
- Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
- return true;
- }
- }
- return false;
+ // Now check whether the displacements are constants.
+ auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
+ auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
+ if (!Disp1 || !Disp2)
+ return false;
+
+ Offset1 = Disp1->getSExtValue();
+ Offset2 = Disp2->getSExtValue();
+ return true;
}
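The rewrite above replaces magic operand numbers with the named X86::Addr* indices. A toy C++ sketch of the HasSameOp comparison (illustrative only; plain ints stand in for SDValues, and it assumes the address parts occupy operand slots 0 through 4 with the chain in slot 5):

    #include <array>
    #include <cassert>

    int main() {
      // Assumed operand layout: base, scale, index, disp, segment, chain.
      std::array<int, 6> Load1 = {7, 1, 0, 8, 0, 42};
      std::array<int, 6> Load2 = {7, 1, 0, 16, 0, 42};
      auto HasSameOp = [&](int I) { return Load1[I] == Load2[I]; };
      // Everything except the displacement (slot 3) matches, so the loads
      // share a base pointer and differ only in their offsets 8 and 16.
      assert(HasSameOp(0) && HasSameOp(1) && HasSameOp(2) && HasSameOp(4) &&
             HasSameOp(5) && !HasSameOp(3));
      return 0;
    }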
bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
@@ -8215,165 +9238,6 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
return true;
}
-bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
- const MachineInstr &Second) const {
- // Check if this processor supports macro-fusion. Since this is a minor
- // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
- // proxy for SandyBridge+.
- if (!Subtarget.hasAVX())
- return false;
-
- enum {
- FuseTest,
- FuseCmp,
- FuseInc
- } FuseKind;
-
- switch (Second.getOpcode()) {
- default:
- return false;
- case X86::JE_1:
- case X86::JNE_1:
- case X86::JL_1:
- case X86::JLE_1:
- case X86::JG_1:
- case X86::JGE_1:
- FuseKind = FuseInc;
- break;
- case X86::JB_1:
- case X86::JBE_1:
- case X86::JA_1:
- case X86::JAE_1:
- FuseKind = FuseCmp;
- break;
- case X86::JS_1:
- case X86::JNS_1:
- case X86::JP_1:
- case X86::JNP_1:
- case X86::JO_1:
- case X86::JNO_1:
- FuseKind = FuseTest;
- break;
- }
- switch (First.getOpcode()) {
- default:
- return false;
- case X86::TEST8rr:
- case X86::TEST16rr:
- case X86::TEST32rr:
- case X86::TEST64rr:
- case X86::TEST8ri:
- case X86::TEST16ri:
- case X86::TEST32ri:
- case X86::TEST32i32:
- case X86::TEST64i32:
- case X86::TEST64ri32:
- case X86::TEST8rm:
- case X86::TEST16rm:
- case X86::TEST32rm:
- case X86::TEST64rm:
- case X86::TEST8ri_NOREX:
- case X86::AND16i16:
- case X86::AND16ri:
- case X86::AND16ri8:
- case X86::AND16rm:
- case X86::AND16rr:
- case X86::AND32i32:
- case X86::AND32ri:
- case X86::AND32ri8:
- case X86::AND32rm:
- case X86::AND32rr:
- case X86::AND64i32:
- case X86::AND64ri32:
- case X86::AND64ri8:
- case X86::AND64rm:
- case X86::AND64rr:
- case X86::AND8i8:
- case X86::AND8ri:
- case X86::AND8rm:
- case X86::AND8rr:
- return true;
- case X86::CMP16i16:
- case X86::CMP16ri:
- case X86::CMP16ri8:
- case X86::CMP16rm:
- case X86::CMP16rr:
- case X86::CMP32i32:
- case X86::CMP32ri:
- case X86::CMP32ri8:
- case X86::CMP32rm:
- case X86::CMP32rr:
- case X86::CMP64i32:
- case X86::CMP64ri32:
- case X86::CMP64ri8:
- case X86::CMP64rm:
- case X86::CMP64rr:
- case X86::CMP8i8:
- case X86::CMP8ri:
- case X86::CMP8rm:
- case X86::CMP8rr:
- case X86::ADD16i16:
- case X86::ADD16ri:
- case X86::ADD16ri8:
- case X86::ADD16ri8_DB:
- case X86::ADD16ri_DB:
- case X86::ADD16rm:
- case X86::ADD16rr:
- case X86::ADD16rr_DB:
- case X86::ADD32i32:
- case X86::ADD32ri:
- case X86::ADD32ri8:
- case X86::ADD32ri8_DB:
- case X86::ADD32ri_DB:
- case X86::ADD32rm:
- case X86::ADD32rr:
- case X86::ADD32rr_DB:
- case X86::ADD64i32:
- case X86::ADD64ri32:
- case X86::ADD64ri32_DB:
- case X86::ADD64ri8:
- case X86::ADD64ri8_DB:
- case X86::ADD64rm:
- case X86::ADD64rr:
- case X86::ADD64rr_DB:
- case X86::ADD8i8:
- case X86::ADD8mi:
- case X86::ADD8mr:
- case X86::ADD8ri:
- case X86::ADD8rm:
- case X86::ADD8rr:
- case X86::SUB16i16:
- case X86::SUB16ri:
- case X86::SUB16ri8:
- case X86::SUB16rm:
- case X86::SUB16rr:
- case X86::SUB32i32:
- case X86::SUB32ri:
- case X86::SUB32ri8:
- case X86::SUB32rm:
- case X86::SUB32rr:
- case X86::SUB64i32:
- case X86::SUB64ri32:
- case X86::SUB64ri8:
- case X86::SUB64rm:
- case X86::SUB64rr:
- case X86::SUB8i8:
- case X86::SUB8ri:
- case X86::SUB8rm:
- case X86::SUB8rr:
- return FuseKind == FuseCmp || FuseKind == FuseInc;
- case X86::INC16r:
- case X86::INC32r:
- case X86::INC64r:
- case X86::INC8r:
- case X86::DEC16r:
- case X86::DEC32r:
- case X86::DEC64r:
- case X86::DEC8r:
- return FuseKind == FuseInc;
- }
-}
-
bool X86InstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
@@ -8424,6 +9288,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
{ X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
{ X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
+ { X86::MOVSDmr, X86::MOVSDmr, X86::MOVPQI2QImr },
{ X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
{ X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
{ X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
@@ -8443,6 +9308,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
{ X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
{ X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
+ { X86::VMOVSDmr, X86::VMOVSDmr, X86::VMOVPQI2QImr },
{ X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
{ X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
{ X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
@@ -8465,7 +9331,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
// AVX512 support
{ X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
{ X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
- { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+ { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
{ X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
{ X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
{ X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
@@ -8493,10 +9359,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
{ X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
{ X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
- { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
- { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
- { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
- { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
{ X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
{ X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
{ X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
@@ -8508,6 +9370,14 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
};
+static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
+ //PackedSingle PackedDouble PackedInt
+ { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+ { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+ { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
+ { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
+};
+
static const uint16_t ReplaceableInstrsAVX512[][4] = {
// Two integer columns for 64-bit and 32-bit elements.
//PackedSingle PackedDouble PackedInt PackedInt
@@ -8769,16 +9639,25 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+ } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
+ // Insert/extract instructions should only affect the domain if AVX2
+ // is enabled.
+ if (!Subtarget.hasAVX2())
+ return std::make_pair(0, 0);
+ validDomains = 0xe;
} else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
validDomains = 0xe;
- } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
- validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
- } else if (const uint16_t *table = lookupAVX512(opcode, domain,
+ } else if (Subtarget.hasDQI() && lookupAVX512(opcode, domain,
+ ReplaceableInstrsAVX512DQ)) {
+ validDomains = 0xe;
+ } else if (Subtarget.hasDQI()) {
+ if (const uint16_t *table = lookupAVX512(opcode, domain,
ReplaceableInstrsAVX512DQMasked)) {
- if (domain == 1 || (domain == 3 && table[3] == opcode))
- validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
- else
- validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
+ if (domain == 1 || (domain == 3 && table[3] == opcode))
+ validDomains = 0xa;
+ else
+ validDomains = 0xc;
+ }
}
}
return std::make_pair(domain, validDomains);
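For reference on the masks above: each bit i of validDomains marks execution domain i as legal, with 1 = PackedSingle, 2 = PackedDouble and 3 = PackedInt. A small standalone check (illustrative only, not from the patch):

    #include <cassert>

    int main() {
      unsigned AllThree = 0xe;   // 0b1110: single, double and int forms
      unsigned FpOnly = 0x6;     // 0b0110: no integer form without AVX2
      unsigned SingleInt = 0xa;  // 0b1010: single and int only
      unsigned DoubleInt = 0xc;  // 0b1100: double and int only
      unsigned PackedInt = 3;
      assert((AllThree >> PackedInt) & 1);
      assert(!((FpOnly >> PackedInt) & 1));
      assert(((SingleInt & DoubleInt) >> PackedInt) & 1);
      return 0;
    }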
@@ -8794,6 +9673,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
"256-bit vector operations only available in AVX2");
table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
}
+ if (!table) { // try the insert/extract table
+ assert(Subtarget.hasAVX2() &&
+ "256-bit insert/extract only available in AVX2");
+ table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
+ }
if (!table) { // try the AVX512 table
assert(Subtarget.hasAVX512() && "Requires AVX-512");
table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
@@ -8820,7 +9704,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
}
/// Return the noop instruction to use for a noop.
-void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+void X86InstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(X86::NOOP);
}
@@ -9457,28 +10341,6 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
-bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
- switch (Inst.getOpcode()) {
- case X86::TCRETURNdi:
- case X86::TCRETURNmi:
- case X86::TCRETURNri:
- case X86::TCRETURNdi64:
- case X86::TCRETURNmi64:
- case X86::TCRETURNri64:
- case X86::TAILJMPd:
- case X86::TAILJMPm:
- case X86::TAILJMPr:
- case X86::TAILJMPd64:
- case X86::TAILJMPm64:
- case X86::TAILJMPr64:
- case X86::TAILJMPm64_REX:
- case X86::TAILJMPr64_REX:
- return true;
- default:
- return false;
- }
-}
-
namespace {
/// Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32.
@@ -9626,7 +10488,7 @@ namespace {
return Copy;
}
- // Create a virtal register in *TLSBaseAddrReg, and populate it by
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
MachineFunction *MF = I.getParent()->getParent();
@@ -9665,3 +10527,122 @@ namespace {
char LDTLSCleanup::ID = 0;
FunctionPass*
llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
+
+unsigned X86InstrInfo::getOutliningBenefit(size_t SequenceSize,
+ size_t Occurrences,
+ bool CanBeTailCall) const {
+ unsigned NotOutlinedSize = SequenceSize * Occurrences;
+ unsigned OutlinedSize;
+
+ // Is it a tail call?
+ if (CanBeTailCall) {
+ // If yes, we don't have to include a return instruction; it's already in
+ // our sequence. So we have one occurrence of the sequence + #Occurrences
+ // calls.
+ OutlinedSize = SequenceSize + Occurrences;
+ } else {
+ // If not, add one for the return instruction.
+ OutlinedSize = (SequenceSize + 1) + Occurrences;
+ }
+
+ // Return the number of instructions saved by outlining this sequence.
+ return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0;
+}
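A worked example of the size accounting in getOutliningBenefit (the numbers are assumed for illustration): a 10-instruction sequence occurring 3 times costs 30 instructions left inline, 14 outlined as a normal call (body plus a return plus 3 calls) and 13 outlined as a tail call, since the return is already part of the sequence.

    #include <cassert>

    int main() {
      unsigned SequenceSize = 10, Occurrences = 3;
      unsigned NotOutlined = SequenceSize * Occurrences;         // 30
      unsigned OutlinedCall = (SequenceSize + 1) + Occurrences;  // 14
      unsigned OutlinedTail = SequenceSize + Occurrences;        // 13
      assert(NotOutlined - OutlinedCall == 16);  // saved as a normal call
      assert(NotOutlined - OutlinedTail == 17);  // saved as a tail call
      return 0;
    }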
+
+bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+ return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+X86GenInstrInfo::MachineOutlinerInstrType
+X86InstrInfo::getOutliningType(MachineInstr &MI) const {
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugValue() || MI.isIndirectDebugValue())
+ return MachineOutlinerInstrType::Invisible;
+
+ // Is this a tail call? If yes, we can outline as a tail call.
+ if (isTailCall(MI))
+ return MachineOutlinerInstrType::Legal;
+
+ // Is this the terminator of a basic block?
+ if (MI.isTerminator() || MI.isReturn()) {
+
+ // Does its parent have any successors in its MachineFunction?
+ if (MI.getParent()->succ_empty())
+ return MachineOutlinerInstrType::Legal;
+
+ // It does, so we can't tail call it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+ // Don't outline anything that modifies or reads from the stack pointer.
+ //
+ // FIXME: There are instructions which are being manually built without
+ // explicit uses/defs so we also have to check the MCInstrDesc. We should be
+ // able to remove the extra checks once those are fixed up. For example,
+ // sometimes we might get something like %RAX<def> = POP64r 1. This won't be
+ // caught by modifiesRegister or readsRegister even though the instruction
+ // really ought to be formed so that modifiesRegister/readsRegister would
+ // catch it.
+ if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
+ MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
+ MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Outlined calls change the instruction pointer, so don't read from it.
+ if (MI.readsRegister(X86::RIP, &RI) ||
+ MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
+ MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Positions can't safely be outlined.
+ if (MI.isPosition())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Make sure none of the operands of this instruction do anything tricky.
+ for (const MachineOperand &MOP : MI.operands())
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return MachineOutlinerInstrType::Illegal;
+
+ return MachineOutlinerInstrType::Legal;
+}
+
+void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+
+ // If we're a tail call, we already have a return, so don't do anything.
+ if (IsTailCall)
+ return;
+
+ // We're a normal call, so our sequence doesn't have a return instruction.
+ // Add it in.
+ MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RETQ));
+ MBB.insert(MBB.end(), retq);
+}
+
+void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {}
+
+MachineBasicBlock::iterator
+X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+ // Is it a tail call?
+ if (IsTailCall) {
+ // Yes, just insert a JMP.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(X86::JMP_1))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ } else {
+ // No, insert a call.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ }
+
+ return It;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
index acfdef4..e648760 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -64,6 +64,10 @@ enum CondCode {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+/// \brief Return a pair of the condition code for the given predicate and
+/// whether the instruction operands should be swapped to match it.
+std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
+
/// \brief Return a set opcode for the given condition and whether it has
/// a memory operand.
unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
@@ -182,6 +186,25 @@ public:
///
const X86RegisterInfo &getRegisterInfo() const { return RI; }
+ /// Returns the stack pointer adjustment that happens inside the frame
+ /// setup..destroy sequence (e.g. by pushes, or inside the callee).
+ int64_t getFrameAdjustment(const MachineInstr &I) const {
+ assert(isFrameInstr(I));
+ if (isFrameSetup(I))
+ return I.getOperand(2).getImm();
+ return I.getOperand(1).getImm();
+ }
+
+ /// Sets the stack pointer adjustment made inside the frame created by this
+ /// instruction.
+ void setFrameAdjustment(MachineInstr &I, int64_t V) const {
+ assert(isFrameInstr(I));
+ if (isFrameSetup(I))
+ I.getOperand(2).setImm(V);
+ else
+ I.getOperand(1).setImm(V);
+ }
+
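A toy model of the accessor pair above (illustrative only; FakeInstr is a made-up stand-in for MachineInstr): frame-setup instructions carry the adjustment in operand 2, frame-destroy instructions in operand 1.

    #include <cassert>
    #include <vector>

    struct FakeInstr {
      bool IsSetup;
      std::vector<long long> Imms;  // stand-ins for immediate operands
    };

    long long getFrameAdjustment(const FakeInstr &I) {
      return I.Imms[I.IsSetup ? 2 : 1];
    }

    int main() {
      FakeInstr Setup{true, {0, 0, 16}};  // setup: adjustment in operand 2
      FakeInstr Destroy{false, {0, 16}};  // destroy: adjustment in operand 1
      assert(getFrameAdjustment(Setup) == 16);
      assert(getFrameAdjustment(Destroy) == 16);
      return 0;
    }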
/// getSPAdjust - This returns the stack pointer adjustment made by
/// this instruction. For x86, we need to handle more complex call
/// sequences involving PUSHes.
@@ -316,6 +339,13 @@ public:
// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
+ bool isUnconditionalTailCall(const MachineInstr &MI) const override;
+ bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+ void replaceBranchWithTailCall(MachineBasicBlock &MBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ const MachineInstr &TailCall) const override;
+
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
@@ -436,10 +466,7 @@ public:
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(const MachineInstr &First,
- const MachineInstr &Second) const override;
-
- void getNoopForMachoTarget(MCInst &NopInst) const override;
+ void getNoop(MCInst &NopInst) const override;
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
@@ -539,8 +566,28 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
- bool isTailCall(const MachineInstr &Inst) const override;
+ unsigned getOutliningBenefit(size_t SequenceSize,
+ size_t Occurrences,
+ bool CanBeTailCall) const override;
+
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+
+ llvm::X86GenInstrInfo::MachineOutlinerInstrType
+ getOutliningType(MachineInstr &MI) const override;
+
+ void insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+
+ void insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
protected:
/// Commutes the operands in the given instruction by changing the operands
/// order and/or changing the instruction's opcode and/or the immediate value
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
index 3803671..fab70e9 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td
@@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
-def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
+ SDTCisVT<1, i32>]>;
def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
@@ -283,6 +284,11 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def X86lwpins : SDNode<"X86ISD::LWPINS",
+ SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>;
+
//===----------------------------------------------------------------------===//
// X86 Operand Definitions.
//
@@ -318,6 +324,7 @@ let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; }
def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; }
def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
+ def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; }
def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
}
@@ -374,9 +381,10 @@ def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>;
def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>;
def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>;
def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>;
-def vy128xmem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256XOperand>;
+def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>;
def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>;
def vy512mem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>;
+def vz256xmem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>;
def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>;
// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead
@@ -805,6 +813,8 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
def HasCDI : Predicate<"Subtarget->hasCDI()">,
AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">;
+def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">,
+ AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">;
def HasPFI : Predicate<"Subtarget->hasPFI()">,
AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">;
def HasERI : Predicate<"Subtarget->hasERI()">,
@@ -831,10 +841,10 @@ def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
-def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasTBM : Predicate<"Subtarget->hasTBM()">;
+def HasLWP : Predicate<"Subtarget->hasLWP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
def HasF16C : Predicate<"Subtarget->hasF16C()">;
@@ -848,8 +858,6 @@ def HasVBMI : Predicate<"Subtarget->hasVBMI()">,
def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
-def HasHLE : Predicate<"Subtarget->hasHLE()">;
-def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
@@ -857,9 +865,11 @@ def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
+def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasMPX : Predicate<"Subtarget->hasMPX()">;
+def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
@@ -876,7 +886,9 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">,
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
- "Subtarget->getFrameLowering()->hasFP(*MF)">;
+ "Subtarget->getFrameLowering()->hasFP(*MF)"> {
+ let RecomputePerFunction = 1;
+}
def IsPS4 : Predicate<"Subtarget->isTargetPS4()">;
def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@@ -886,15 +898,24 @@ def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
"TM.getCodeModel() == CodeModel::Kernel">;
def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
-def OptForSize : Predicate<"OptForSize">;
-def OptForMinSize : Predicate<"OptForMinSize">;
-def OptForSpeed : Predicate<"!OptForSize">;
+
+// We could compute these on a per-module basis, but doing so requires accessing
+// the Function object through the <Target>Subtarget and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def OptForSize : Predicate<"MF->getFunction()->optForSize()">;
+ def OptForMinSize : Predicate<"MF->getFunction()->optForMinSize()">;
+ def OptForSpeed : Predicate<"!MF->getFunction()->optForSize()">;
+}
+
def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
+def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//
@@ -931,6 +952,15 @@ def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
+// FIXME: Ideally we would just replace the above i*immSExt* matchers with
+// relocImm-based matchers, but then FastISel would be unable to use them.
+def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
+ return isSExtRelocImm<8>(N);
+}]>;
+def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
+ return isSExtRelocImm<32>(N);
+}]>;
+
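The isSExtRelocImm<N> predicate used above presumably requires that the 64-bit immediate survive an N-bit sign extension (plus relocation checks not modeled here). A standalone C++ sketch of that range test, an assumption rather than the patch's actual helper:

    #include <cassert>
    #include <cstdint>

    template <unsigned N> bool fitsSExt(int64_t Imm) {
      int64_t Lo = -(INT64_C(1) << (N - 1));
      int64_t Hi = (INT64_C(1) << (N - 1)) - 1;
      return Imm >= Lo && Imm <= Hi;
    }

    int main() {
      assert(fitsSExt<8>(-128) && !fitsSExt<8>(128));
      assert(fitsSExt<32>(INT64_C(-2147483648)));
      assert(!fitsSExt<32>(INT64_C(2147483648)));
      return 0;
    }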
// If we have multiple users of an immediate, it's much smaller to reuse
// the register, rather than encode the immediate in every instruction.
// This has the risk of increasing register pressure from stretched live
@@ -971,6 +1001,13 @@ def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
+def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
// unsigned field.
def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
@@ -1106,13 +1143,15 @@ def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
IIC_POP_REG>, OpSize16;
-def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
- IIC_POP_MEM>, OpSize16;
def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>;
+} // mayLoad, SchedRW
+let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in {
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
+ IIC_POP_MEM>, OpSize16;
def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>;
-} // mayLoad, SchedRW
+} // mayStore, mayLoad, WriteRMW
let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[],
@@ -1194,9 +1233,10 @@ def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>;
+} // mayLoad, SchedRW
+let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in
def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
IIC_POP_MEM>, OpSize32, Requires<[In64BitMode]>;
-} // mayLoad, SchedRW
let mayStore = 1, SchedRW = [WriteStore] in {
def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [],
IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>;
@@ -1398,11 +1438,14 @@ def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
// Longer forms that use a ModR/M byte. Needed for disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ FoldGenData<"MOV8ri">;
def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ FoldGenData<"MOV16ri">;
def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ FoldGenData<"MOV32ri">;
}
} // SchedRW
@@ -1525,13 +1568,17 @@ def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
SchedRW = [WriteMove] in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
- "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ FoldGenData<"MOV8rr">;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16,
+ FoldGenData<"MOV16rr">;
def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32;
+ "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32,
+ FoldGenData<"MOV32rr">;
def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
+ "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>,
+ FoldGenData<"MOV64rr">;
}
let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
@@ -1965,7 +2012,12 @@ def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>,
+ Requires<[Not16BitMode]>;
+
+// Data instruction prefix
+def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", []>,
+ Requires<[In16BitMode]>;
// Repeat string operation instruction prefixes
// These uses the DF flag in the EFLAGS register to inc or dec ECX
@@ -2079,6 +2131,7 @@ def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
Requires<[Not64BitMode]>;
+let mayStore = 1 in
def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
Requires<[Not64BitMode]>;
@@ -2315,6 +2368,38 @@ let Predicates = [HasBMI2] in {
def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
(BZHI64rm addr:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+ // x & (-1 >> (32 - y))
+ def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x & (-1 >> (64 - y))
+ def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+
+ // x << (32 - y) >> (32 - y)
+ def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rr GR32:$src, GR32:$lz)>;
+ def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))),
+ (i8 (trunc (sub 32, GR32:$lz)))),
+ (BZHI32rm addr:$src, GR32:$lz)>;
+
+ // x << (64 - y) >> (64 - y)
+ def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rr GR64:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
+ def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))),
+ (i8 (trunc (sub 64, GR32:$lz)))),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
} // HasBMI2
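The BZHI patterns added above rely on the identity that, for unsigned x and 0 < y < 32, both "x & (-1 >> (32 - y))" and "(x << (32 - y)) >> (32 - y)" zero the bits of x at positions y and above, which is exactly what BZHI computes. A standalone C++ check of the identity (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Reference semantics of BZHI32: clear all bits of x at positions >= y.
    uint32_t bzhi32(uint32_t x, uint32_t y) {
      if (y >= 32)
        return x;
      return x & ((UINT32_C(1) << y) - 1);
    }

    int main() {
      uint32_t x = 0xdeadbeef, y = 12;  // any 0 < y < 32 works here
      assert((x & (UINT32_C(0xffffffff) >> (32 - y))) == bzhi32(x, y));
      assert(((x << (32 - y)) >> (32 - y)) == bzhi32(x, y));
      return 0;
    }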
let Predicates = [HasBMI] in {
@@ -2414,6 +2499,59 @@ defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
} // HasTBM, EFLAGS
//===----------------------------------------------------------------------===//
+// Lightweight Profiling Instructions
+
+let Predicates = [HasLWP] in {
+
+def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src",
+ [(int_x86_llwpcb GR32:$src)], IIC_LWP>,
+ XOP, XOP9, Requires<[Not64BitMode]>;
+def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst",
+ [(set GR32:$dst, (int_x86_slwpcb))], IIC_LWP>,
+ XOP, XOP9, Requires<[Not64BitMode]>;
+
+def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src",
+ [(int_x86_llwpcb GR64:$src)], IIC_LWP>,
+ XOP, XOP9, VEX_W, Requires<[In64BitMode]>;
+def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
+ [(set GR64:$dst, (int_x86_slwpcb))], IIC_LWP>,
+ XOP, XOP9, VEX_W, Requires<[In64BitMode]>;
+
+multiclass lwpins_intr<RegisterClass RC> {
+ def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
+ "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>,
+ XOP_4V, XOPA;
+ let mayLoad = 1 in
+ def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
+ "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>,
+ XOP_4V, XOPA;
+}
+
+let Defs = [EFLAGS] in {
+ defm LWPINS32 : lwpins_intr<GR32>;
+ defm LWPINS64 : lwpins_intr<GR64>, VEX_W;
+} // EFLAGS
+
+multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
+ def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
+ "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(Int RC:$src0, GR32:$src1, imm:$cntl)], IIC_LWP>,
+ XOP_4V, XOPA;
+ let mayLoad = 1 in
+ def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
+ "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
+ [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)], IIC_LWP>,
+ XOP_4V, XOPA;
+}
+
+defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>;
+defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W;
+
+} // HasLWP
+
+//===----------------------------------------------------------------------===//
// MONITORX/MWAITX Instructions
//
let SchedRW = [ WriteSystem ] in {
@@ -2429,7 +2567,7 @@ let SchedRW = [ WriteSystem ] in {
}
let Uses = [ ECX, EAX, EBX ] in {
- def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
+ def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx",
[(int_x86_mwaitx ECX, EAX, EBX)], IIC_SSE_MWAITX>,
TB, Requires<[ HasMWAITX ]>;
}
@@ -2448,8 +2586,19 @@ def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
-let Uses = [EAX] in
-def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+let SchedRW = [WriteSystem] in {
+ let Uses = [EAX] in
+ def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>,
+ TB, Requires<[HasCLZERO]>;
+
+ let usesCustomInserter = 1 in {
+ def CLZERO : PseudoI<(outs), (ins i32mem:$src1),
+ [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>;
+ }
+} // SchedRW
+
+def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
@@ -2522,10 +2671,10 @@ let Predicates = [HasTBM] in {
// Memory Instructions
//
+let Predicates = [HasCLFLUSHOPT] in
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
-def PCOMMIT : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD;
//===----------------------------------------------------------------------===//
@@ -2977,7 +3126,7 @@ def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32me
def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
-def : InstAlias<"movq\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
+def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
// Match 'movq GR64, MMX' as an alias for movd.
def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
index 0bb1068..2c04772 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td
@@ -248,7 +248,8 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst,
(MMX_X86movd2w (x86mmx VR64:$src)))],
- IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
+ IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>,
+ FoldGenData<"MMX_MOVD64rr">;
let isBitcast = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
@@ -277,7 +278,7 @@ def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
"movq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
+ IIC_MMX_MOVQ_RR>, FoldGenData<"MMX_MOVQ64rr">;
}
} // SchedRW
@@ -294,6 +295,7 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
[(set VR64:$dst, (load_mmx addr:$src))],
IIC_MMX_MOVQ_RM>;
} // SchedRW
+
let SchedRW = [WriteStore] in
def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -378,7 +380,6 @@ defm MMX_PHADD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
MMX_PHADDSUBW>;
-
// -- Subtraction
defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
MMX_INTALU_ITINS>;
@@ -479,13 +480,6 @@ defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
MMX_SHIFT_ITINS>;
-def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRLWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRLDrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRLQrm VR64:$src1, addr:$src2)>;
-
defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
MMX_SHIFT_ITINS>;
@@ -496,13 +490,6 @@ defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
MMX_SHIFT_ITINS>;
-def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSLLWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSLLDrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSLLQrm VR64:$src1, addr:$src2)>;
-
defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
MMX_SHIFT_ITINS>;
@@ -510,11 +497,6 @@ defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
MMX_SHIFT_ITINS>;
-def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRAWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)),
- (MMX_PSRADrm VR64:$src1, addr:$src2)>;
-
// Comparison Instructions
defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
MMX_INTALU_ITINS>;
@@ -576,9 +558,6 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
imm:$src2))],
IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
-
-
-
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
@@ -639,7 +618,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
[(set GR32orGR64:$dst,
(int_x86_mmx_pmovmskb VR64:$src))]>;
-
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
[SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
@@ -670,6 +648,16 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
(MMX_MOVQ2FR64rr VR64:$src)>;
def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
(MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (int_x86_sse2_cvtps2dq VR128:$src))))),
+ (MMX_CVTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
+ (MMX_CVTTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+ (MMX_CVTPD2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+ (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+ (MMX_CVTTPD2PIirr VR128:$src)>;
}
-
-
diff --git a/contrib/llvm/lib/Target/X86/X86InstrMPX.td b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
index 309f601..104ba2a 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrMPX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrMPX.td
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
+let mayLoad = 1 in {
def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, Not64BitMode]>;
@@ -21,16 +22,19 @@ multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
Requires<[HasMPX, In64BitMode]>;
}
+}
defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
+let mayLoad = 1 in {
def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, In64BitMode]>;
+}
def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
Requires<[HasMPX, Not64BitMode]>;
@@ -45,16 +49,18 @@ defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX]>;
+let mayLoad = 1 in {
def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, Not64BitMode]>;
def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, In64BitMode]>;
-
+}
def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX]>;
+let mayStore = 1 in {
def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
"bndmov\t{$src, $dst|$dst, $src}", []>, PD,
Requires<[HasMPX, Not64BitMode]>;
@@ -65,6 +71,8 @@ def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
"bndstx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
+}
+let mayLoad = 1 in
def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
"bndldx\t{$src, $dst|$dst, $src}", []>, PS,
Requires<[HasMPX]>;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
index 1812d01..650e4fc 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td
@@ -259,8 +259,8 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
- SDPatternOperator Int, RegisterClass RC,
- string asm, Operand memopr,
+ SDPatternOperator OpNode, RegisterClass RC,
+ ValueType VT, string asm, Operand memopr,
ComplexPattern mem_cpat, Domain d,
OpndItins itins, bit Is2Addr = 1> {
let isCodeGenOnly = 1, hasSideEffects = 0 in {
@@ -268,14 +268,14 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in {
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], itins.rm, d>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -446,9 +446,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
- [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
+ [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
+ [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
}
//===----------------------------------------------------------------------===//
@@ -461,12 +461,12 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [NoVLX], SchedRW = [WriteZero] in {
+ isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
-let Predicates = [NoVLX] in
+let Predicates = [NoAVX512] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
@@ -475,7 +475,7 @@ def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, Predicates = [HasAVX, NoVLX], SchedRW = [WriteZero] in {
+ isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllZerosV))]>;
}
@@ -486,12 +486,15 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteZero] in {
def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
[(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ let Predicates = [HasAVX1Only, OptForMinSize] in {
+ def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
+ }
let Predicates = [HasAVX2] in
def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
-
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
@@ -504,7 +507,8 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string base_opc,
- string asm_opr, Domain d = GenericDomain> {
+ string asm_opr, Domain d = GenericDomain,
+ string Name> {
let isCommutable = 1 in
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
@@ -518,25 +522,27 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src1, RC:$src2),
!strconcat(base_opc, asm_opr),
- [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>;
+ [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>,
+ FoldGenData<Name#rr>;
}
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string OpcodeStr,
- Domain d = GenericDomain> {
+ Domain d = GenericDomain, string Name> {
// AVX
defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
- VEX_4V, VEX_LIG;
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
+ "V"#Name>,
+ VEX_4V, VEX_LIG, VEX_WIG;
def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
- VEX, VEX_LIG, Sched<[WriteStore]>;
+ VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
- "\t{$src2, $dst|$dst, $src2}", d>;
+ "\t{$src2, $dst|$dst, $src2}", d, Name>;
}
def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
@@ -552,7 +558,7 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))],
- IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+ IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>, VEX_WIG;
def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))],
@@ -560,9 +566,9 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
}
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
- SSEPackedSingle>, XS;
+ SSEPackedSingle, "MOVSS">, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
- SSEPackedDouble>, XD;
+ SSEPackedDouble, "MOVSD">, XD;
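// The Name string threaded through sse12_move/sse12_move_rr above exists so
// the reversed-encoding rr_REV def can carry FoldGenData<Name#rr>, pointing
// the memory-fold table generator at its canonical twin. A standalone sketch
// with hypothetical names (not part of this patch):
def EXMOVrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                     (ins VR128:$src1, FR32:$src2),
                     "exmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                     IIC_SSE_MOV_S_RR>, FoldGenData<"EXMOVrr">;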
let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
@@ -644,10 +650,6 @@ let Predicates = [UseAVX] in {
(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
// 256-bit variants
def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
@@ -738,10 +740,6 @@ let Predicates = [UseSSE2] in {
(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  // FIXME: Instead of an X86Movlps there should be an X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because

@@ -786,29 +784,29 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX;
+ PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX;
+ PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX;
+ PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX;
+ PD, VEX, VEX_WIG;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- PS, VEX, VEX_L;
+ PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- PD, VEX, VEX_L;
+ PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- PS, VEX, VEX_L;
+ PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
- PD, VEX, VEX_L;
+ PD, VEX, VEX_L, VEX_WIG;
}
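// VEX_WIG, added throughout this block, tags encodings whose VEX.W bit the
// hardware ignores, so either W value is acceptable (useful, for instance,
// when compressing EVEX encodings back to VEX). A hedged sketch of its use,
// with an invented mnemonic:
defm EXMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                                 "exmovaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                                 PS, VEX, VEX_WIG;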
let Predicates = [UseSSE1] in {
@@ -832,35 +830,35 @@ let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v4f32 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v2f64 VR128:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
} // SchedRW
// For disassembler
@@ -869,35 +867,43 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVAPSrr">;
def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVAPDrr">;
def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVUPSrr">;
def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
+ FoldGenData<"VMOVUPDrr">;
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVAPSYrr">;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVAPDYrr">;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVUPSYrr">;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVUPDYrr">;
}
// Aliases to help the assembler pick two-byte VEX encodings by swapping the
@@ -943,36 +949,22 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
SchedRW = [WriteFShuffle] in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
+ IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">;
def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
+ IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">;
def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>;
+ IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">;
def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>;
+ IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">;
}
-// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX, NoVLX] in {
- // 128-bit load/store
- def : Pat<(alignedloadv2i64 addr:$src),
- (VMOVAPSrm addr:$src)>;
- def : Pat<(loadv2i64 addr:$src),
- (VMOVUPSrm addr:$src)>;
-
- def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v2i64 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v4i32 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
-
- // 256-bit load/store
+  // 256-bit loads and stores need to use the floating point load/store
+  // instructions in case we don't have AVX2. Execution domain fixing will
+  // convert them to integer if AVX2 is available and changing the domain is
+  // beneficial.
def : Pat<(alignedloadv4i64 addr:$src),
(VMOVAPSYrm addr:$src)>;
def : Pat<(loadv4i64 addr:$src),
@@ -981,10 +973,18 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v4i64 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v8i32 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
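// A restated sketch of the pattern shape used above: integer vectors are
// deliberately routed through the FP move so the later execution-domain fix
// pass can flip them to vmovdqa/vmovdqu when AVX2 is available:
def : Pat<(store (v8i32 VR256:$val), addr:$p),
          (VMOVUPSYmr addr:$p, VR256:$val)>;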
// Special patterns for storing subvector extracts of the lower 128 bits.
// It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
@@ -994,18 +994,6 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(alignedstore (v4f32 (extract_subvector
(v8f32 VR256:$src), (iPTR 0))), addr:$dst),
(VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
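// The same EXTRACT_SUBREG trick, restated in isolation for the unaligned
// case (this exact pattern also appears below): the low half of a YMM is
// stored with a plain XMM move rather than the longer VEXTRACTF128mr:
def : Pat<(store (v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
                 addr:$dst),
          (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src, sub_xmm)))>;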
- def : Pat<(alignedstore (v2i64 (extract_subvector
- (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4i32 (extract_subvector
- (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v8i16 (extract_subvector
- (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v16i8 (extract_subvector
- (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
def : Pat<(store (v2f64 (extract_subvector
(v4f64 VR256:$src), (iPTR 0))), addr:$dst),
@@ -1013,40 +1001,6 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(store (v4f32 (extract_subvector
(v8f32 VR256:$src), (iPTR 0))), addr:$dst),
(VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v2i64 (extract_subvector
- (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v4i32 (extract_subvector
- (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v8i16 (extract_subvector
- (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v16i8 (extract_subvector
- (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- // 128-bit load/store
- def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v8i16 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v16i8 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
-
- // 256-bit load/store
- def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v16i16 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v32i8 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
@@ -1107,7 +1061,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
let Predicates = [UseAVX] in
defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- itin>, VEX_4V;
+ itin>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
@@ -1126,12 +1080,12 @@ def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
(iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX;
+ IIC_SSE_MOV_LH>, VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)],
- IIC_SSE_MOV_LH>, VEX;
+ IIC_SSE_MOV_LH>, VEX, VEX_WIG;
}// UseAVX
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movlps\t{$src, $dst|$dst, $src}",
@@ -1238,12 +1192,12 @@ def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
[(store (f64 (extractelt
(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
(bc_v2f64 (v4f32 VR128:$src))),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhpd\t{$src, $dst|$dst, $src}",
[(store (f64 (extractelt
(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
} // UseAVX
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movhps\t{$src, $dst|$dst, $src}",
@@ -1343,14 +1297,14 @@ let AddedComplexity = 20, Predicates = [UseAVX] in {
[(set VR128:$dst,
(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>;
+ VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_LH>,
- VEX_4V, Sched<[WriteFShuffle]>;
+ VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
@@ -1725,11 +1679,11 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, Requires<[HasAVX, NoVLX]>;
+ PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
+ PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
@@ -1777,20 +1731,21 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
// Convert scalar double to scalar single
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
- (ins FR64:$src1, FR64:$src2),
+ (ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>;
+ Sched<[WriteCvtF2F]>, VEX_WIG;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
- (ins FR64:$src1, f64mem:$src2),
+ (ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
}
-def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+def : Pat<(f32 (fpround FR64:$src)),
+ (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1810,15 +1765,15 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
@@ -1842,30 +1797,30 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
// SSE2 instructions with XS prefix
let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
- (ins FR32:$src1, FR32:$src2),
+ (ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RR>,
XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>;
+ Sched<[WriteCvtF2F]>, VEX_WIG;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
- (ins FR32:$src1, f32mem:$src2),
+ (ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[], IIC_SSE_CVT_Scalar_RM>,
XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
}
def : Pat<(f64 (fpextend FR32:$src)),
- (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[UseAVX, OptForSize]>;
def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
Requires<[UseAVX, OptForSpeed]>;
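// Two idioms recur in the conversion patterns above, restated here as a
// self-contained sketch: COPY_TO_REGCLASS retypes a value between register
// classes without emitting a real move, and IMPLICIT_DEF supplies a
// don't-care value for the tied pass-through operand:
def : Pat<(f64 (fpextend FR32:$x)),
          (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$x, FR64), FR32:$x)>;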
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
@@ -1895,15 +1850,15 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG,
+ Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1999,22 +1954,22 @@ def : Pat<(v4f32 (X86Movss
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
@@ -2035,7 +1990,7 @@ def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
- VEX, Sched<[WriteCvtF2I]>;
+ VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
@@ -2044,7 +1999,7 @@ def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
- Sched<[WriteCvtF2ILd]>;
+ Sched<[WriteCvtF2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
@@ -2053,12 +2008,12 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
@@ -2083,23 +2038,23 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
- IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (fp_to_sint (v8f32 VR256:$src))))],
- IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
- Sched<[WriteCvtF2ILd]>;
+ Sched<[WriteCvtF2ILd]>, VEX_WIG;
}
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -2118,7 +2073,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
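// Sketch of the disambiguation the comment describes (this mirrors the
// aliases defined nearby): with a memory source the operand width is
// invisible, so an explicit x suffix pins the 128-bit form:
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;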
@@ -2132,7 +2087,7 @@ def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
@@ -2142,12 +2097,12 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (v4f64 VR256:$src))))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
}
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
@@ -2193,19 +2148,19 @@ let Predicates = [HasAVX, NoVLX] in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
- IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
@@ -2225,30 +2180,30 @@ let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, Sched<[WriteCvtI2FLd]>;
+ (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))]>,
+ VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
- VEX, Sched<[WriteCvtI2F]>;
+ VEX, Sched<[WriteCvtI2F]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
+ VEX, VEX_L, Sched<[WriteCvtI2FLd]>, VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtI2F]>;
+ VEX, VEX_L, Sched<[WriteCvtI2F]>, VEX_WIG;
}
let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
+ (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
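// X86vzload models a load of only the low 64 bits zero-extended into the
// vector, so the pattern above claims no wider memory read than cvtdq2pd
// actually performs. The same shape as a stand-alone pattern:
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$p))))),
          (CVTDQ2PDrm addr:$p)>;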
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2276,7 +2231,7 @@ let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
- IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
// XMM only
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
@@ -2285,7 +2240,7 @@ let Predicates = [HasAVX, NoVLX] in
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
(VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
@@ -2294,11 +2249,11 @@ let Predicates = [HasAVX, NoVLX] in {
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (fpround VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+ IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
}
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
(VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
@@ -2368,21 +2323,25 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
}
}
+let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
+ SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
- XD, VEX_4V, VEX_LIG;
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
"cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
i8immZExt3>, XS;
+ let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
@@ -2398,6 +2357,7 @@ multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
VR128:$src, immLeaf:$cc))],
itins.rr>,
Sched<[itins.Sched]>;
+let mayLoad = 1 in
def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
@@ -2408,18 +2368,22 @@ multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
let isCodeGenOnly = 1 in {
// Aliases to match intrinsics which expect XMM operand(s).
+ let ExeDomain = SSEPackedSingle in
defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
XS, VEX_4V;
+ let ExeDomain = SSEPackedDouble in
defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
+ let ExeDomain = SSEPackedSingle in
defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+ let ExeDomain = SSEPackedDouble in
defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
@@ -2437,6 +2401,7 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
+let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
@@ -2454,6 +2419,7 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
IIC_SSE_COMIS_RR>,
Sched<[WriteFAdd]>;
+let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
@@ -2464,26 +2430,26 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss">, PS, VEX, VEX_LIG;
+ "ucomiss">, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd">, PD, VEX, VEX_LIG;
+ "ucomisd">, PD, VEX, VEX_LIG, VEX_WIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss">, PS, VEX, VEX_LIG;
+ "comiss">, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd">, PD, VEX, VEX_LIG;
+ "comisd">, PD, VEX, VEX_LIG, VEX_WIG;
}
let isCodeGenOnly = 1 in {
defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss">, PS, VEX;
+ sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG;
defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd">, PD, VEX;
+ sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG;
defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss">, PS, VEX;
+ sse_load_f32, "comiss">, PS, VEX, VEX_WIG;
defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd">, PD, VEX;
+ sse_load_f64, "comisd">, PD, VEX, VEX_WIG;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS;
@@ -2512,18 +2478,19 @@ let Defs = [EFLAGS] in {
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, Intrinsic Int, string asm,
+ Operand CC, ValueType VT, string asm,
string asm_alt, Domain d, ImmLeaf immLeaf,
PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
+ [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))],
itins.rr, d>,
Sched<[WriteFAdd]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
+ [(set RC:$dst,
+ (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), immLeaf:$cc)))],
itins.rm, d>,
Sched<[WriteFAddLd, ReadAfterLd]>;
@@ -2540,67 +2507,33 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
}
}
-defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
-defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+ SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V, VEX_WIG;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
-defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+ SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V, VEX_WIG;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
-defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
- defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+ defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
- defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+ defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
"cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
}
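// Matching the X86cmpp node directly inside sse12_cmp_packed makes the
// hand-written patterns that previously followed here redundant; what the
// multiclass now generates is equivalent to:
def : Pat<(v4f32 (X86cmpp VR128:$a, VR128:$b, imm:$cc)),
          (CMPPSrri VR128:$a, VR128:$b, imm:$cc)>;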
-let Predicates = [HasAVX] in {
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
- (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
- (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
- (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
- (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-
-def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
- (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
- (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
- (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
- (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
-}
-
-let Predicates = [UseSSE1] in {
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
- (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
- (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-}
-
-let Predicates = [UseSSE2] in {
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
- (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
- (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-}
-
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
@@ -2624,16 +2557,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
let Predicates = [HasAVX, NoVLX] in {
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f32, SSEPackedSingle>, PS, VEX_4V;
+ loadv4f32, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv2f64, SSEPackedDouble>, PD, VEX_4V;
+ loadv2f64, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
+ loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2715,29 +2648,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V;
+ SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V;
+ SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V;
+ SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V;
+ SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_L;
+ SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_L;
+ SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_L;
+ SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
@@ -2789,13 +2722,13 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
let Predicates = [HasAVX] in {
defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
- SSEPackedSingle>, PS, VEX;
+ SSEPackedSingle>, PS, VEX, VEX_WIG;
defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
- SSEPackedDouble>, PD, VEX;
+ SSEPackedDouble>, PD, VEX, VEX_WIG;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
- SSEPackedSingle>, PS, VEX, VEX_L;
+ SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
- SSEPackedDouble>, PD, VEX, VEX_L;
+ SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
@@ -2839,7 +2772,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
OpndItins itins, bit IsCommutable = 0, Predicate prd> {
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
- VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+ VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2848,7 +2781,7 @@ let Constraints = "$src1 = $dst" in
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
OpVT256, VR256, loadv4i64, i256mem, itins,
- IsCommutable, 0>, VEX_4V, VEX_L;
+ IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}
// These are ordered here for pattern ordering requirements with the fp versions
@@ -2876,7 +2809,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
(bc_v4i64 (v8f32 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
+ (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
@@ -2884,14 +2817,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(bc_v4i64 (v4f64 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
(loadv4i64 addr:$src2)))], 0>,
- PD, VEX_4V, VEX_L;
+ PD, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
(bc_v2i64 (v4f32 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
+ (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
@@ -2899,7 +2832,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
(bc_v2i64 (v2f64 VR128:$src2))))],
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
(loadv2i64 addr:$src2)))], 0>,
- PD, VEX_4V;
+ PD, VEX_4V, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -3065,17 +2998,17 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
VR128, v2f64, f128mem, loadv2f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_WIG;
defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
OpNode, VR256, v8f32, f256mem, loadv8f32,
- SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
+ SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
OpNode, VR256, v4f64, f256mem, loadv4f64,
- SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+ SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -3092,10 +3025,10 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SizeItins itins> {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
- XS, VEX_4V, VEX_LIG;
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
- XD, VEX_4V, VEX_LIG;
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
@@ -3108,21 +3041,20 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
- SDPatternOperator IntSS,
- SDPatternOperator IntSD,
+ SDPatternOperator OpNode,
SizeItins itins> {
- defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
- SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
- defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
- SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
+ SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
- defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+ defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, itins.s>, XS;
- defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+ defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
SSEPackedDouble, itins.d>, XD;
}
@@ -3131,29 +3063,23 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
- SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
- SSE_MUL_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
- SSE_ALU_ITINS_S>;
+                   basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SSE_ALU_ITINS_S>;
defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
- SSE_DIV_ITINS_S>;
+                   basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SSE_DIV_ITINS_S>;
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
- int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SSE_ALU_ITINS_S>;
defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
- int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
+ basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SSE_ALU_ITINS_S>;
}
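// X86fmins/X86fmaxs are target DAG nodes with x86's NaN-propagating scalar
// min/max semantics (the second source wins on NaNs and signed-zero ties),
// letting intrinsic and generic IR forms share one selection path. A hedged
// sketch of the mapping they enable (instruction name assumed from the
// multiclass naming scheme):
def : Pat<(v4f32 (X86fmins VR128:$a, VR128:$b)),
          (MINSSrr_Int VR128:$a, VR128:$b)>;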
let isCodeGenOnly = 1 in {
@@ -3400,7 +3326,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
Sched<[itins.Sched.Folded, ReadAfterLd]>,
Requires<[target, OptForSize]>;
- let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+ let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
@@ -3444,7 +3370,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
- let isCodeGenOnly = 1 in {
+ let isCodeGenOnly = 1, ExeDomain = d in {
def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3465,7 +3391,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// which has a clobber before the rcp, vs.
// vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
- // the partial register store, either in ExeDepFix or with smarter RA.
+ // the partial register store, either in ExecutionDepsFix or with smarter RA.
let Predicates = [UseAVX] in {
def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
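// The IMPLICIT_DEF in the pattern above accepts an undefined upper half
// rather than folding the load, sidestepping the partial-register stall the
// TODO mentions. Restated for the square-root case (instruction name assumed
// from the V"#NAME#Suffix naming in this multiclass):
def : Pat<(f32 (fsqrt FR32:$x)),
          (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$x)>;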
@@ -3495,22 +3421,22 @@ let Predicates = prds in {
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>;
+ itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
}
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3531,22 +3457,22 @@ let Predicates = [HasAVX] in {
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
- itins.rr>, VEX, Sched<[itins.Sched]>;
+ itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
- itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
- itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
- itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+ itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
}
def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3567,7 +3493,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
f32mem,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
+ SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3579,7 +3505,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
f64mem,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, "SD">,
- XD, VEX_4V, VEX_LIG;
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
}
// Square root.
@@ -3647,41 +3573,41 @@ def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2f64 VR128:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_WIG;
let ExeDomain = SSEPackedInt in
def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
- (ins f128mem:$dst, VR128:$src),
+ (ins i128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2i64 VR128:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_WIG;
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f32 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L;
+ IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L;
+ IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
- (ins f256mem:$dst, VR256:$src),
+ (ins i256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4i64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_L;
+ IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
}
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
@@ -3771,8 +3697,7 @@ let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
- "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
- OBXS, Requires<[HasSSE2]>;
+ "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS;
}
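// For illustration: OBXS supplies the F3 (XS) prefix, so opcode 0x90 encodes
// as "F3 90", literally "rep; nop", which pre-SSE2 CPUs execute as a plain
// NOP; that is why the HasSSE2 predicate can be dropped here.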
let SchedRW = [WriteFence] in {
@@ -3797,20 +3722,18 @@ def : Pat<(X86MFence), (MFENCE)>;
//===----------------------------------------------------------------------===//
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>, VEX_WIG;
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>, VEX_WIG;
-let Predicates = [UseSSE1] in {
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
- "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
- IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+ "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+ IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
- "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
- IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
-}
+ "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+ IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -3821,16 +3744,16 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
let hasSideEffects = 0, SchedRW = [WriteMove] in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX;
+ VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX;
+ VEX, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
}
// For Disassembler
@@ -3839,54 +3762,60 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>,
- VEX;
+ VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>,
- VEX;
+ VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
+ FoldGenData<"VMOVDQUYrr">;
}
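// Our reading of the new FoldGenData annotation: it ties each reversed (_REV)
// encoding back to its canonical register form, e.g. FoldGenData<"VMOVDQArr">,
// so TableGen's memory-folding tables can treat the two encodings as one
// instruction.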
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
hasSideEffects = 0, SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX,NoVLX] in
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX;
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (alignedloadv2i64 addr:$src))],
+ IIC_SSE_MOVA_P_RM>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX, VEX_L;
-let Predicates = [HasAVX] in {
- def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX;
- def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX, VEX_L;
-}
+ VEX, VEX_L, VEX_WIG;
+let Predicates = [HasAVX,NoVLX] in
+def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2i64 addr:$src))],
+ IIC_SSE_MOVU_P_RM>, XS, VEX, VEX_WIG;
+def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+ XS, VEX, VEX_L, VEX_WIG;
}
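// For illustration (IR and registers hypothetical): with the load patterns
// added above, an aligned vector load such as
//   %v = load <2 x i64>, <2 x i64>* %p, align 16
// selects directly to "vmovdqa (%rdi), %xmm0" on AVX targets without
// AVX-512VL, rather than only via load folding.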
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+let Predicates = [HasAVX,NoVLX] in
def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src),
- "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX;
+ "movdqa\t{$src, $dst|$dst, $src}",
+ [(alignedstore (v2i64 VR128:$src), addr:$dst)],
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX, VEX_L;
-let Predicates = [HasAVX] in {
+ VEX, VEX_L, VEX_WIG;
+let Predicates = [HasAVX,NoVLX] in
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
- "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX;
+ "vmovdqu\t{$src, $dst|$dst, $src}",
+ [(store (v2i64 VR128:$src), addr:$dst)], IIC_SSE_MOVU_P_MR>,
+ XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX, VEX_L;
-}
+ XS, VEX, VEX_L, VEX_WIG;
}
let SchedRW = [WriteMove] in {
@@ -3903,11 +3832,12 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>;
+ IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">;
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>,
+ FoldGenData<"MOVDQUrr">;
}
} // SchedRW
@@ -3949,6 +3879,50 @@ def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
(VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
+let Predicates = [HasAVX, NoVLX] in {
+ // Additional patterns for other integer sizes.
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
+
+ // Special patterns for storing subvector extracts of the lower 128 bits.
+ // It's cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr.
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
+
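+// For illustration (registers hypothetical): a store of the low half of a
+// 256-bit register, which would otherwise be "vextractf128 $0, %ymm0, (%rdi)",
+// is emitted as the plain "vmovdqa %xmm0, (%rdi)", since the low 128 bits of
+// %ymm0 already alias %xmm0.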
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//
@@ -4037,12 +4011,12 @@ defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
- loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
+ loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
VR256, loadv4i64, i256mem, SSE_PMADD,
- 0>, VEX_4V, VEX_L;
+ 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
memopv2i64, i128mem, SSE_PMADD>;
@@ -4050,11 +4024,11 @@ defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
@@ -4062,11 +4036,11 @@ defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
let Predicates = [HasAVX, NoVLX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
+ SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
@@ -4113,11 +4087,11 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
let Predicates = [HasAVX, prd] in
defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR128, DstVT128, SrcVT,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
OpNode, OpNode2, VR256, DstVT256, SrcVT,
- loadv2i64, 0>, VEX_4V, VEX_L;
+ loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
VR128, DstVT128, SrcVT, memopv2i64>;
@@ -4138,10 +4112,10 @@ multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR128, v16i8, 0>, VEX_4V;
+ VR128, v16i8, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
- VR256, v32i8, 0>, VEX_4V, VEX_L;
+ VR256, v32i8, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
}
@@ -4202,7 +4176,7 @@ let Predicates = [HasAVX, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
+ IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
@@ -4210,7 +4184,7 @@ let Predicates = [HasAVX, prd] in {
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
(i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteShuffleLd]>, VEX_WIG;
}
let Predicates = [HasAVX2, prd] in {
@@ -4220,7 +4194,7 @@ let Predicates = [HasAVX2, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
+ IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>, VEX_WIG;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
@@ -4228,7 +4202,7 @@ let Predicates = [HasAVX2, prd] in {
[(set VR256:$dst,
(vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
(i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
- Sched<[WriteShuffleLd]>;
+ Sched<[WriteShuffleLd]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
@@ -4257,20 +4231,6 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
NoVLX_Or_NoBWI>, XD;
-let Predicates = [HasAVX] in {
- def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
- (VPSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (VPSHUFDri VR128:$src1, imm:$imm)>;
-}
-
-let Predicates = [UseSSE2] in {
- def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
- (PSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (PSHUFDri VR128:$src1, imm:$imm)>;
-}
-
//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//
@@ -4364,24 +4324,24 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
VEX_4V, VEX_L;
}
@@ -4443,44 +4403,44 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
- loadv2i64, 0>, VEX_4V;
+ loadv2i64, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -4565,14 +4525,14 @@ def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR128:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
- IIC_SSE_MOVMSK>, VEX;
+ IIC_SSE_MOVMSK>, VEX, VEX_WIG;
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
}
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
@@ -4593,13 +4553,13 @@ def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
- IIC_SSE_MASKMOV>, VEX;
+ IIC_SSE_MASKMOV>, VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
(ins VR128:$src, VR128:$mask),
"maskmovdqu\t{$mask, $src|$src, $mask}",
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
- IIC_SSE_MASKMOV>, VEX;
+ IIC_SSE_MASKMOV>, VEX, VEX_WIG;
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
@@ -4659,17 +4619,17 @@ def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))],
IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
} // ExeDomain = SSEPackedInt
@@ -4725,19 +4685,6 @@ def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
(iPTR 0))), addr:$dst)],
IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt
-
-def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
- (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
@@ -4751,20 +4698,20 @@ def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
VEX;
def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>;
} //SchedRW
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
+def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+ "movq\t{$src, $dst|$dst, $src}",
[], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt
@@ -4791,7 +4738,7 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "movq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64:$src))],
IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
@@ -4837,6 +4784,8 @@ let Predicates = [UseAVX] in {
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
let AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -4866,6 +4815,8 @@ let Predicates = [UseSSE2] in {
(MOV64toPQIrr GR64:$src)>;
}
let AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+ (MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -4877,12 +4828,12 @@ let Predicates = [UseSSE2] in {
}
}
-// These are the correct encodings of the instructions so that we know how to
-// read correct assembly, even though we continue to emit the wrong ones for
-// compatibility with Darwin's buggy assembler.
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
+// "movq" due to a macOS parsing limitation. To keep parsing that old assembly,
+// we add these aliases.
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
(MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
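// For illustration: with these aliases the assembler still accepts the legacy
//   movd %rax, %xmm0
// spelling and maps it to MOV64toPQIrr, the instruction now printed as
// "movq %rax, %xmm0"; the trailing 0 keeps the aliases out of the printer.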
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
@@ -4903,7 +4854,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
- VEX, Requires<[UseAVX]>;
+ VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -4920,7 +4871,7 @@ def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (extractelt (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)],
- IIC_SSE_MOVDQ>, VEX;
+ IIC_SSE_MOVDQ>, VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (extractelt (v2i64 VR128:$src),
@@ -4932,7 +4883,7 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
SchedRW = [WriteVecLogic] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
- "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+ "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
}
@@ -4978,7 +4929,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
IIC_SSE_MOVQ_RR>,
- XS, VEX, Requires<[UseAVX]>;
+ XS, VEX, Requires<[UseAVX]>, VEX_WIG;
let AddedComplexity = 15 in
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
@@ -5016,13 +4967,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
let Predicates = [HasAVX, NoVLX] in {
defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v4f32, VR128, loadv4f32, f128mem>, VEX;
+ v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v4f32, VR128, loadv4f32, f128mem>, VEX;
+ v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+ v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
memopv4f32, f128mem>;
@@ -5090,8 +5041,8 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
}
let Predicates = [HasAVX, NoVLX] in {
- defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
- defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+ defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX, VEX_WIG;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L, VEX_WIG;
}
defm MOVDDUP : sse3_replicate_dfp<"movddup">;
@@ -5108,16 +5059,6 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDDUPYrr VR256:$src)>;
}
-let Predicates = [HasAVX] in {
- def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (bc_v2f64
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-}
-
let Predicates = [HasAVX, NoVLX] in
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
@@ -5128,13 +5069,6 @@ def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
let Predicates = [UseSSE3] in {
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
(MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
- (MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (bc_v2f64
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (MOVDDUPrm addr:$src)>;
}
//===---------------------------------------------------------------------===//
@@ -5145,11 +5079,11 @@ let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX] in {
def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX, VEX_WIG;
def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
- VEX, VEX_L;
+ VEX, VEX_L, VEX_WIG;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
@@ -5183,15 +5117,15 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
- f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
+ f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, VEX_WIG;
defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
- f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
+ f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
- f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
+ f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, VEX_WIG;
defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
- f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
+ f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
@@ -5248,14 +5182,14 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
- Sched<[WriteFAdd]>;
+ Sched<[WriteFHAdd]>;
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
@@ -5265,36 +5199,36 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
- Sched<[WriteFAdd]>;
+ Sched<[WriteFHAdd]>;
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- IIC_SSE_HADDSUB_RM>, Sched<[WriteFAddLd, ReadAfterLd]>;
+ IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- X86fhadd, loadv4f32, 0>, VEX_4V;
+ X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- X86fhsub, loadv4f32, 0>, VEX_4V;
+ X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
+ X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
+ X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
- X86fhadd, loadv2f64, 0>, VEX_4V;
+ X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG;
defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
- X86fhsub, loadv2f64, 0>, VEX_4V;
+ X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG;
defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
+ X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
+ X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
}
}
@@ -5352,90 +5286,30 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
Sched<[WriteVecALULd]>;
}
-// Helper fragments to match sext vXi1 to vXiY.
-def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
- VR128:$src))>;
-def v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
-def v4i1sextv4i32 : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
-def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
- VR256:$src))>;
-def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
-def v8i1sextv8i32 : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
-
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
- defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
- defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
-}
-
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- def : Pat<(xor
- (bc_v2i64 (v16i1sextv16i8)),
- (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (VPABSBrr VR128:$src)>;
- def : Pat<(xor
- (bc_v2i64 (v8i1sextv8i16)),
- (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (VPABSWrr VR128:$src)>;
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, loadv2i64>, VEX, VEX_WIG;
}
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(xor
- (bc_v2i64 (v4i1sextv4i32)),
- (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (VPABSDrr VR128:$src)>;
-}
-
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
- defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
-}
-let Predicates = [HasAVX2, NoVLX] in {
- defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
-}
-
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- def : Pat<(xor
- (bc_v4i64 (v32i1sextv32i8)),
- (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
- (VPABSBYrr VR256:$src)>;
- def : Pat<(xor
- (bc_v4i64 (v16i1sextv16i16)),
- (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
- (VPABSWYrr VR256:$src)>;
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs>, VEX, VEX_L, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs>, VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
- def : Pat<(xor
- (bc_v4i64 (v8i1sextv8i32)),
- (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
- (VPABSDYrr VR256:$src)>;
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs>, VEX, VEX_L, VEX_WIG;
}
-defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
-defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
-defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;
-
-let Predicates = [UseSSSE3] in {
- def : Pat<(xor
- (bc_v2i64 (v16i1sextv16i8)),
- (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
- (PABSBrr VR128:$src)>;
- def : Pat<(xor
- (bc_v2i64 (v8i1sextv8i16)),
- (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
- (PABSWrr VR128:$src)>;
- def : Pat<(xor
- (bc_v2i64 (v4i1sextv4i32)),
- (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
- (PABSDrr VR128:$src)>;
-}
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>;
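+
+// For illustration: the removed xor/add patterns matched the classic absolute
+// value idiom, roughly s = x >> (bits-1); abs = (x + s) ^ s, by hand; with a
+// dedicated "abs" node that idiom is canonicalized earlier, so the
+// instruction patterns above stay simple.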
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//
-let Sched = WriteVecALU in {
+let Sched = WritePHAdd in {
def SSE_PHADDSUBD : OpndItins<
IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM
>;
@@ -5527,45 +5401,45 @@ let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
VR128, loadv2i64, i128mem,
- SSE_PSHUFB, 0>, VEX_4V;
+ SSE_PSHUFB, 0>, VEX_4V, VEX_WIG;
defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
v16i8, VR128, loadv2i64, i128mem,
- SSE_PMADD, 0>, VEX_4V;
+ SSE_PMADD, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
VR128, loadv2i64, i128mem,
- SSE_PMULHRSW, 0>, VEX_4V;
+ SSE_PMULHRSW, 0>, VEX_4V, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBD, 0>, VEX_4V;
+ SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG;
defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
loadv2i64, i128mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
loadv2i64, i128mem,
SSE_PHADDSUBD, 0>, VEX_4V;
defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
int_x86_ssse3_psign_b_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
int_x86_ssse3_psign_w_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
int_x86_ssse3_psign_d_128,
- SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+ SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
int_x86_ssse3_phadd_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
int_x86_ssse3_phsub_sw_128,
- SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+ SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
}
@@ -5573,42 +5447,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
VR256, loadv4i64, i256mem,
- SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+ SSE_PSHUFB, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
v32i8, VR256, loadv4i64, i256mem,
- SSE_PMADD, 0>, VEX_4V, VEX_L;
+ SSE_PMADD, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
+ SSE_PMULHRSW, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
VR256, loadv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
loadv4i64, i256mem,
SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
int_x86_avx2_phadd_sw,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
int_x86_avx2_phsub_sw,
- WriteVecALU>, VEX_4V, VEX_L;
+ WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
}
}
@@ -5686,9 +5560,9 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX] in
- defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+ defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2] in
- defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+ defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGNR : ssse3_palignr<"palignr">;
@@ -5779,10 +5653,10 @@ multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
let Predicates = [HasAVX, prd] in
defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
- VR128, VR128, AVXItins>, VEX;
+ VR128, VR128, AVXItins>, VEX, VEX_WIG;
let Predicates = [HasAVX2, prd] in
defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
- VR256, VR128, AVX2Itins>, VEX, VEX_L;
+ VR256, VR128, AVX2Itins>, VEX, VEX_L, VEX_WIG;
}
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
@@ -6010,12 +5884,12 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
}
}
-defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
-defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec, extloadi32i16>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>;
let Predicates = [UseSSE41] in {
- defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
- defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
+ defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>;
+ defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>;
}
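// For illustration: with these fragments a node such as
//   (v8i16 (sext_invec (v16i8 VR128:$src)))
// selects to "pmovsxbw", sign-extending the low eight bytes of $src to words;
// zext_invec covers the matching pmovzx forms.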
//===----------------------------------------------------------------------===//
@@ -6054,7 +5928,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, Sched<[WriteShuffle]>;
+ []>, Sched<[WriteShuffle]>, FoldGenData<NAME#ri>;
let hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteShuffleLd, WriteRMW] in
@@ -6103,20 +5977,20 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set GR64:$dst,
(extractelt (v2i64 VR128:$src1), imm:$src2))]>,
- Sched<[WriteShuffle]>, REX_W;
+ Sched<[WriteShuffle]>;
let SchedRW = [WriteShuffleLd, WriteRMW] in
def mr : SS4AIi8<opc, MRMDestMem, (outs),
(ins i64mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(store (extractelt (v2i64 VR128:$src1), imm:$src2),
- addr:$dst)]>, REX_W;
+ addr:$dst)]>;
}
let Predicates = [HasAVX, NoDQI] in
defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
-defm PEXTRQ : SS41I_extract64<0x16, "pextrq">;
+defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
@@ -6140,7 +6014,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
- defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+ defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
}
@@ -6268,7 +6142,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
let ExeDomain = SSEPackedSingle in {
let Predicates = [UseAVX] in
- defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+ defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
@@ -6461,14 +6335,14 @@ let Predicates = [HasAVX] in {
defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
loadv4f32, loadv2f64,
int_x86_sse41_round_ps,
- int_x86_sse41_round_pd>, VEX;
+ int_x86_sse41_round_pd>, VEX, VEX_WIG;
defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
loadv8f32, loadv4f64,
int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX, VEX_L;
+ int_x86_avx_round_pd_256>, VEX, VEX_L, VEX_WIG;
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
- int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG, VEX_WIG;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
@@ -6606,20 +6480,20 @@ let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX;
+ Sched<[WriteVecLogic]>, VEX, VEX_WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_WIG;
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
- Sched<[WriteVecLogic]>, VEX, VEX_L;
+ Sched<[WriteVecLogic]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
- Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
+ Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L, VEX_WIG;
}
let Defs = [EFLAGS] in {
@@ -6722,7 +6596,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
int_x86_sse41_phminposuw, loadv2i64,
- WriteVecIMul>, VEX;
+ WriteVecIMul>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
int_x86_sse41_phminposuw, memopv2i64,
WriteVecIMul>;
@@ -6778,65 +6652,65 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
let Predicates = [HasAVX, NoVLX] in {
defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
VR128, loadv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
VR256, loadv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6861,22 +6735,23 @@ let Constraints = "$src1 = $dst" in {
SSE_INTMUL_ITINS_P, 1>;
}
-let Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX] in
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
- VEX_4V;
+ VEX_4V, VEX_WIG;
+let Predicates = [HasAVX] in
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V;
-}
-let Predicates = [HasAVX2] in {
+ VEX_4V, VEX_WIG;
+
+let Predicates = [HasAVX2, NoVLX] in
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
- VEX_4V, VEX_L;
+ VEX_4V, VEX_L, VEX_WIG;
+let Predicates = [HasAVX2] in
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
- VEX_4V, VEX_L;
-}
+ VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
@@ -6945,52 +6820,52 @@ let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
}
let ExeDomain = SSEPackedSingle in {
defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
VR128, loadv4f32, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
VR256, loadv8f32, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
VR128, loadv2f64, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
VR256, loadv4f64, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V;
+ SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
VR128, loadv2f64, f128mem, 0,
- SSE_DPPS_ITINS>, VEX_4V;
+ SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
VR256, loadv8f32, i256mem, 0,
- SSE_DPPS_ITINS>, VEX_4V, VEX_L;
+ SSE_DPPS_ITINS>, VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+ DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -7020,6 +6895,19 @@ let Constraints = "$src1 = $dst" in {
SSE_DPPD_ITINS>;
}
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with an immediate instead of an
+// insert*128.
+let Predicates = [HasAVX] in {
+def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
+ (VBLENDPDYrri VR256:$src1,
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0x3)>;
+def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+}
+
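+// For illustration (register assignment hypothetical): an
+//   insert_subvector %ymm1, %xmm2, 0
+// becomes "vblendpd $0x3, %ymm2, %ymm1, %ymm0", where immediate 0x3 takes the
+// two low f64 lanes from the operand holding the inserted value.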
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
@@ -7165,14 +7053,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $dst|$dst, $src2}"),
+ "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
itins.rr>, Sched<[itins.Sched]>;
def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
- "\t{$src2, $dst|$dst, $src2}"),
+ "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
[(set VR128:$dst,
(IntId VR128:$src1,
(bitconvert (mem_frag addr:$src2)), XMM0))],
@@ -7193,18 +7081,18 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
DEFAULT_ITINS_VARBLENDSCHED>;
// Aliases with the implicit xmm0 argument
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+ (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+ (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+ (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
let Predicates = [UseSSE41] in {
def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
@@ -7228,17 +7116,14 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
- VEX;
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
- "vmovntdqa\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
- VEX, VEX_L;
+ "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
+ VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
- "movntdqa\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+ "movntdqa\t{$src, $dst|$dst, $src}", []>;
} // SchedRW
let Predicates = [HasAVX2, NoVLX] in {
@@ -7277,33 +7162,37 @@ let Predicates = [UseSSE41] in {
/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit Is2Addr = 1> {
+ X86MemOperand x86memop, OpndItins itins,
+ bit Is2Addr = 1> {
def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>;
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, Sched<[itins.Sched]>;
def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>;
+ (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in
defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
- loadv2i64, i128mem, 0>, VEX_4V;
+ loadv2i64, i128mem, SSE_INTALU_ITINS_P, 0>,
+ VEX_4V, VEX_WIG;
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
- loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ loadv4i64, i256mem, SSE_INTALU_ITINS_P, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
- memopv2i64, i128mem>;
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
@@ -7323,7 +7212,7 @@ multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
let Defs = [EFLAGS], usesCustomInserter = 1 in {
defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
- Requires<[HasAVX]>;
+ Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
Requires<[UseSSE42]>;
}
@@ -7397,7 +7286,7 @@ multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
let Defs = [EFLAGS], usesCustomInserter = 1 in {
defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
- Requires<[HasAVX]>;
+ Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
Requires<[UseSSE42]>;
}
@@ -7515,14 +7404,18 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
bit UsesXMM0 = 0> {
def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !if(UsesXMM0,
+ !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+ !if(UsesXMM0,
+ !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+ !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
@@ -7557,10 +7450,10 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
}
// Aliases with explicit %xmm0
-def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
-def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
- (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+ (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+ (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
//===----------------------------------------------------------------------===//
// AES-NI Instructions
@@ -7588,13 +7481,13 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, HasAES] in {
defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
- int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
- int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
- int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V, VEX_WIG;
defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
- int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
+ int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -7615,12 +7508,12 @@ let Predicates = [HasAVX, HasAES] in {
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst,
(int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
- VEX;
+ VEX, VEX_WIG;
def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1),
"vaesimc\t{$src1, $dst|$dst, $src1}",
[(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
- Sched<[WriteAESIMCLd]>, VEX;
+ Sched<[WriteAESIMCLd]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1),
@@ -7640,13 +7533,13 @@ let Predicates = [HasAVX, HasAES] in {
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
- Sched<[WriteAESKeyGen]>, VEX;
+ Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
"vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
- Sched<[WriteAESKeyGenLd]>, VEX;
+ Sched<[WriteAESKeyGenLd]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
@@ -7672,14 +7565,14 @@ def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128:$dst,
(int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
- Sched<[WriteCLMul]>;
+ Sched<[WriteCLMul]>, VEX_WIG;
def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
(loadv2i64 addr:$src2), imm:$src3))]>,
- Sched<[WriteCLMulLd, ReadAfterLd]>;
+ Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG;
// Carry-less Multiplication instructions
let Constraints = "$src1 = $dst" in {
@@ -7879,6 +7772,13 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
}
+// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
+// with a YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
+
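(Illustration, not part of the patch: a minimal C++ sketch, assuming
<immintrin.h>, of the all-ones trick encoded above -- comparing a zeroed
register with itself under the TRUE predicate, immediate 0xf, sets every
lane to all ones.)

    #include <immintrin.h>

    // Hypothetical helper for illustration only.
    __m256i all_ones_avx1(void) {
      __m256 z = _mm256_setzero_ps();          // the vxorps "fake input"
      return _mm256_castps_si256(
          _mm256_cmp_ps(z, z, _CMP_TRUE_UQ));  // vcmpps with immediate 0xf
    }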
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
PatFrag memop_frag> {
def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
@@ -8029,41 +7929,6 @@ let ExeDomain = SSEPackedDouble in {
loadv4i64, v4f64, v4i64>, VEX_L;
}
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
- (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
-def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
- (VPERMILPSYrm VR256:$src1, addr:$src2)>;
-def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
- (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
-def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
- (VPERMILPDYrm VR256:$src1, addr:$src2)>;
-
-def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
- (VPERMILPSYri VR256:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
- (VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
- (i8 imm:$imm))),
- (VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
- (VPERMILPDYmi addr:$src1, imm:$imm)>;
-
-def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
- (VPERMILPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
- (VPERMILPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
- (VPERMILPDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
- (VPERMILPDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
- (VPERMILPDri VR128:$src1, imm:$imm)>;
-def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
- (VPERMILPDmi addr:$src1, imm:$imm)>;
-}
-
//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//
@@ -8118,15 +7983,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
+// Note: these instructions do not affect YMM16-YMM31.
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
// Zero All YMM registers
def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
+ [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG;
// Zero Upper bits of YMM registers
def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
+ [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>, VEX_WIG;
}
//===----------------------------------------------------------------------===//
@@ -8235,6 +8101,46 @@ defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
VR256, loadv4i64, i256mem>, VEX_L;
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+let Predicates = [HasAVX2] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+ (VPBLENDDYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
+}
+
//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
@@ -8282,6 +8188,11 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
v2i64, v4i64, NoVLX>;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+ // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQrm addr:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+ (VPBROADCASTQYrm addr:$src)>;
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -8296,7 +8207,7 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
(VPBROADCASTWYrm addr:$src)>;
}
-let Predicates = [HasAVX2] in {
+let Predicates = [HasAVX2, NoVLX] in {
// Provide aliases for broadcast from the same register class that
// automatically does the extract.
def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
@@ -8343,18 +8254,13 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
}
let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
- (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+ (VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
- (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
- def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
-
- // The patterns for VPBROADCASTD are not needed because they would match
- // the exact same thing as VBROADCASTSS patterns.
-
+ (VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
- (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
- // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
+ (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+ (VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
}
// AVX1 broadcast patterns
@@ -8377,15 +8283,15 @@ let Predicates = [HasAVX, NoVLX] in {
let Predicates = [HasAVX1Only] in {
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
- (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+ (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
(VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+ (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+ (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
- (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
- (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+ (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm),
+ (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
(VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
@@ -8399,7 +8305,7 @@ let Predicates = [HasAVX1Only] in {
(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
- (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+ (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>;
}
//===----------------------------------------------------------------------===//
@@ -8407,7 +8313,8 @@ let Predicates = [HasAVX1Only] in {
//
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- ValueType OpVT, X86FoldableSchedWrite Sched> {
+ ValueType OpVT, X86FoldableSchedWrite Sched,
+ X86MemOperand memOp> {
let Predicates = [HasAVX2, NoVLX] in {
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2),
@@ -8417,7 +8324,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
(OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
Sched<[Sched]>, VEX_4V, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, i256mem:$src2),
+ (ins VR256:$src1, memOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
@@ -8427,12 +8334,15 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
}
}
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256,
+ i256mem>;
let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256,
+ f256mem>;
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
- ValueType OpVT, X86FoldableSchedWrite Sched> {
+ ValueType OpVT, X86FoldableSchedWrite Sched,
+ X86MemOperand memOp> {
let Predicates = [HasAVX2, NoVLX] in {
def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, u8imm:$src2),
@@ -8442,7 +8352,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
(OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
Sched<[Sched]>, VEX, VEX_L;
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
- (ins i256mem:$src1, u8imm:$src2),
+ (ins memOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
@@ -8453,10 +8363,10 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
}
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
- WriteShuffle256>, VEX_W;
+ WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
- WriteFShuffle256>, VEX_W;
+ WriteFShuffle256, f256mem>, VEX_W;
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
index e2be735..0efb383 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -340,75 +340,71 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
let hasSideEffects = 0 in {
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
+
+let Uses = [CL, EFLAGS] in {
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [CL, EFLAGS]
+
+let Uses = [EFLAGS] in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
"rcl{b}\t$dst", [], IIC_SR>;
def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
"rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
- "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-
def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
"rcl{w}\t$dst", [], IIC_SR>, OpSize16;
def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
"rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
-let Uses = [CL] in
-def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
- "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
-
def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
"rcl{l}\t$dst", [], IIC_SR>, OpSize32;
def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
"rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
-let Uses = [CL] in
-def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
- "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
-
-
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
"rcl{q}\t$dst", [], IIC_SR>;
def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
"rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
- "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [EFLAGS]
+let Uses = [CL, EFLAGS] in {
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
+ "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
+ "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
+ "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
+def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [CL, EFLAGS]
+let Uses = [EFLAGS] in {
def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
"rcr{b}\t$dst", [], IIC_SR>;
def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt),
"rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
- "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-
def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
"rcr{w}\t$dst", [], IIC_SR>, OpSize16;
def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt),
"rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16;
-let Uses = [CL] in
-def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
- "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize16;
-
def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
"rcr{l}\t$dst", [], IIC_SR>, OpSize32;
def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt),
"rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
-let Uses = [CL] in
-def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
- "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
-
def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
"rcr{q}\t$dst", [], IIC_SR>;
def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
-let Uses = [CL] in
-def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
- "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+} // Uses = [EFLAGS]
} // Constraints = "$src1 = $dst"
-let SchedRW = [WriteShiftLd, WriteRMW] in {
+let SchedRW = [WriteShiftLd, WriteRMW], mayStore = 1 in {
+let Uses = [EFLAGS] in {
def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst),
"rcl{b}\t$dst", [], IIC_SR>;
def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt),
@@ -442,8 +438,9 @@ def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t$dst", [], IIC_SR>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
"rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+} // Uses = [EFLAGS]
-let Uses = [CL] in {
+let Uses = [CL, EFLAGS] in {
def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
"rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
@@ -461,7 +458,7 @@ def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
"rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
-}
+} // Uses = [CL, EFLAGS]
} // SchedRW
} // hasSideEffects = 0
@@ -665,19 +662,19 @@ def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)],
IIC_SR>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)],
IIC_SR>, OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)],
IIC_SR>, OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+ [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)],
IIC_SR>;
} // SchedRW
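(Illustration, not part of the patch: a minimal C++ sketch of the identity
behind the rewritten by-one patterns -- a rotate right by 1 equals a rotate
left by width - 1, so the same instruction can be matched from either form.)

    #include <cstdint>

    static uint32_t rotr1(uint32_t x)  { return (x >> 1) | (x << 31); }
    static uint32_t rotl31(uint32_t x) { return (x << 31) | (x >> 1); }
    // rotr1(x) == rotl31(x) for every x, matching the ROR*m1 patterns above.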
@@ -849,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]
+// Sandy Bridge and newer Intel processors support faster rotates using
+// SHLD to avoid a partial flag update on the normal rotate instructions.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+}
+
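(Illustration, not part of the patch: a minimal C++ sketch of why SHLD with
both sources set to the same register performs a rotate, which is what the
HasFastSHLDRotate patterns above rely on.)

    #include <cstdint>

    // shld dst, src, n keeps the high 32 bits of the 64-bit value dst:src
    // shifted left by n; with dst == src that is exactly a left rotate.
    static uint32_t shld32(uint32_t hi, uint32_t lo, unsigned n) {
      return n == 0 ? hi : (hi << n) | (lo >> (32 - n));
    }
    // shld32(x, x, n) == rotl(x, n) for 0 <= n < 32.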
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
// Convert a ROTL shamt to a ROTR shamt on a 32-bit integer.
return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
index 9265d64..2e5350c 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td
@@ -173,27 +173,28 @@ def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>, OpSize32;
def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
-
+let mayStore = 1 in {
def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
-
+}
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize32;
def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
-
+let mayLoad = 1 in {
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
"mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+}
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -202,6 +203,7 @@ def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
let SchedRW = [WriteSystem] in {
def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB;
+let mayLoad = 1 in
def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
OpSize16;
@@ -210,6 +212,7 @@ def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
OpSize16;
// The i16mem operand in LAR32rm and the GR32 operand in LAR32rr are not typos.
+let mayLoad = 1 in
def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
"lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB,
OpSize32;
@@ -217,23 +220,27 @@ def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB,
OpSize32;
// The i16mem operand in LAR64rm and the GR32 operand in LAR64rr are not typos.
+let mayLoad = 1 in
def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
"lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB;
def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
"lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB;
+let mayLoad = 1 in
def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
OpSize16;
def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lsl{w}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
OpSize16;
+let mayLoad = 1 in
def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB,
OpSize32;
def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB,
OpSize32;
+let mayLoad = 1 in
def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB;
def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
@@ -248,11 +255,13 @@ def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins),
"str{l}\t$dst", [], IIC_STR>, TB, OpSize32;
def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins),
"str{q}\t$dst", [], IIC_STR>, TB;
+let mayStore = 1 in
def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst),
"str{w}\t$dst", [], IIC_STR>, TB;
def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
"ltr{w}\t$src", [], IIC_LTR>, TB;
+let mayLoad = 1 in
def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
"ltr{w}\t$src", [], IIC_LTR>, TB;
@@ -377,12 +386,14 @@ def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
"verr\t$seg", [], IIC_VERR>, TB;
-def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
- "verr\t$seg", [], IIC_VERR>, TB;
def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
"verw\t$seg", [], IIC_VERW_MEM>, TB;
+let mayLoad = 1 in {
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+ "verr\t$seg", [], IIC_VERR>, TB;
def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
"verw\t$seg", [], IIC_VERW_REG>, TB;
+}
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -403,6 +414,7 @@ def SIDT64m : I<0x01, MRM1m, (outs), (ins opaque80mem:$dst),
"sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
"sldt{w}\t$dst", [], IIC_SLDT>, TB, OpSize16;
+let mayStore = 1 in
def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst),
"sldt{w}\t$dst", [], IIC_SLDT>, TB;
def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
@@ -412,6 +424,7 @@ def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins),
// extension.
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
+let mayStore = 1 in
def SLDT64m : RI<0x00, MRM0m, (outs), (ins i16mem:$dst),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
@@ -429,6 +442,7 @@ def LIDT64m : I<0x01, MRM3m, (outs), (ins opaque80mem:$src),
"lidt{q}\t$src", [], IIC_LIDT>, TB, Requires<[In64BitMode]>;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
"lldt{w}\t$src", [], IIC_LLDT_REG>, TB;
+let mayLoad = 1 in
def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
"lldt{w}\t$src", [], IIC_LLDT_MEM>, TB;
} // SchedRW
@@ -459,6 +473,7 @@ def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst),
def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
"lmsw{w}\t$src", [], IIC_LMSW_MEM>, TB;
+let mayLoad = 1 in
def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
"lmsw{w}\t$src", [], IIC_LMSW_REG>, TB;
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
index 7267d75..61aac58 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrTSX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td
@@ -25,9 +25,14 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
- "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>;
+ "xbegin\t$dst", []>, OpSize16;
def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
- "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>;
+ "xbegin\t$dst", []>, OpSize32;
+}
+
+// Pseudo instruction to fake the definition of EAX on the fallback code path.
+let isPseudo = 1, Defs = [EAX] in {
+def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
}
def XEND : I<0x01, MRM_D5, (outs), (ins),
@@ -35,7 +40,7 @@ def XEND : I<0x01, MRM_D5, (outs), (ins),
let Defs = [EFLAGS] in
def XTEST : I<0x01, MRM_D6, (outs), (ins),
- "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+ "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>;
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
@@ -44,7 +49,7 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
// HLE prefixes
let isAsmParserOnly = 1 in {
-def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
-def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>;
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>;
}
diff --git a/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h b/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
deleted file mode 100755
index 415a891..0000000
--- a/contrib/llvm/lib/Target/X86/X86InstrTablesInfo.h
+++ /dev/null
@@ -1,1162 +0,0 @@
-//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains related X86 Instruction Information Tables.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
-#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
-
-using namespace llvm;
-
-struct X86EvexToVexCompressTableEntry {
- uint16_t EvexOpcode;
- uint16_t VexOpcode;
-};
-
-
-
-// X86 EVEX encoded instructions that have a VEX 128 encoding
-// (table format: <EVEX opcode, VEX-128 opcode>).
-static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = {
- // EVEX scalar with corresponding VEX.
- { X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm },
- { X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr },
- { X86::Int_VCOMISSZrm , X86::Int_VCOMISSrm },
- { X86::Int_VCOMISSZrr , X86::Int_VCOMISSrr },
- { X86::Int_VUCOMISDZrm , X86::Int_VUCOMISDrm },
- { X86::Int_VUCOMISDZrr , X86::Int_VUCOMISDrr },
- { X86::Int_VUCOMISSZrm , X86::Int_VUCOMISSrm },
- { X86::Int_VUCOMISSZrr , X86::Int_VUCOMISSrr },
- { X86::VADDSDZrm , X86::VADDSDrm },
- { X86::VADDSDZrm_Int , X86::VADDSDrm_Int },
- { X86::VADDSDZrr , X86::VADDSDrr },
- { X86::VADDSDZrr_Int , X86::VADDSDrr_Int },
- { X86::VADDSSZrm , X86::VADDSSrm },
- { X86::VADDSSZrm_Int , X86::VADDSSrm_Int },
- { X86::VADDSSZrr , X86::VADDSSrr },
- { X86::VADDSSZrr_Int , X86::VADDSSrr_Int },
- { X86::VCOMISDZrm , X86::VCOMISDrm },
- { X86::VCOMISDZrr , X86::VCOMISDrr },
- { X86::VCOMISSZrm , X86::VCOMISSrm },
- { X86::VCOMISSZrr , X86::VCOMISSrr },
- { X86::VCVTSD2SI64Zrm , X86::VCVTSD2SI64rm },
- { X86::VCVTSD2SI64Zrr , X86::VCVTSD2SI64rr },
- { X86::VCVTSD2SIZrm , X86::VCVTSD2SIrm },
- { X86::VCVTSD2SIZrr , X86::VCVTSD2SIrr },
- { X86::VCVTSD2SSZrm , X86::VCVTSD2SSrm },
- { X86::VCVTSD2SSZrr , X86::VCVTSD2SSrr },
- { X86::VCVTSI2SDZrm , X86::VCVTSI2SDrm },
- { X86::VCVTSI2SDZrm_Int , X86::Int_VCVTSI2SDrm },
- { X86::VCVTSI2SDZrr , X86::VCVTSI2SDrr },
- { X86::VCVTSI2SDZrr_Int , X86::Int_VCVTSI2SDrr },
- { X86::VCVTSI2SSZrm , X86::VCVTSI2SSrm },
- { X86::VCVTSI2SSZrm_Int , X86::Int_VCVTSI2SSrm },
- { X86::VCVTSI2SSZrr , X86::VCVTSI2SSrr },
- { X86::VCVTSI2SSZrr_Int , X86::Int_VCVTSI2SSrr },
- { X86::VCVTSS2SDZrm , X86::VCVTSS2SDrm },
- { X86::VCVTSS2SDZrr , X86::VCVTSS2SDrr },
- { X86::VCVTSS2SI64Zrm , X86::VCVTSS2SI64rm },
- { X86::VCVTSS2SI64Zrr , X86::VCVTSS2SI64rr },
- { X86::VCVTSS2SIZrm , X86::VCVTSS2SIrm },
- { X86::VCVTSS2SIZrr , X86::VCVTSS2SIrr },
- { X86::VCVTTSD2SI64Zrm , X86::VCVTTSD2SI64rm },
- { X86::VCVTTSD2SI64Zrm_Int , X86::Int_VCVTTSD2SI64rm },
- { X86::VCVTTSD2SI64Zrr , X86::VCVTTSD2SI64rr },
- { X86::VCVTTSD2SI64Zrr_Int , X86::Int_VCVTTSD2SI64rr },
- { X86::VCVTTSD2SIZrm , X86::VCVTTSD2SIrm },
- { X86::VCVTTSD2SIZrm_Int , X86::Int_VCVTTSD2SIrm },
- { X86::VCVTTSD2SIZrr , X86::VCVTTSD2SIrr },
- { X86::VCVTTSD2SIZrr_Int , X86::Int_VCVTTSD2SIrr },
- { X86::VCVTTSS2SI64Zrm , X86::VCVTTSS2SI64rm },
- { X86::VCVTTSS2SI64Zrm_Int , X86::Int_VCVTTSS2SI64rm },
- { X86::VCVTTSS2SI64Zrr , X86::VCVTTSS2SI64rr },
- { X86::VCVTTSS2SI64Zrr_Int , X86::Int_VCVTTSS2SI64rr },
- { X86::VCVTTSS2SIZrm , X86::VCVTTSS2SIrm },
- { X86::VCVTTSS2SIZrm_Int , X86::Int_VCVTTSS2SIrm },
- { X86::VCVTTSS2SIZrr , X86::VCVTTSS2SIrr },
- { X86::VCVTTSS2SIZrr_Int , X86::Int_VCVTTSS2SIrr },
- { X86::VDIVSDZrm , X86::VDIVSDrm },
- { X86::VDIVSDZrm_Int , X86::VDIVSDrm_Int },
- { X86::VDIVSDZrr , X86::VDIVSDrr },
- { X86::VDIVSDZrr_Int , X86::VDIVSDrr_Int },
- { X86::VDIVSSZrm , X86::VDIVSSrm },
- { X86::VDIVSSZrm_Int , X86::VDIVSSrm_Int },
- { X86::VDIVSSZrr , X86::VDIVSSrr },
- { X86::VDIVSSZrr_Int , X86::VDIVSSrr_Int },
- { X86::VFMADD132SDZm , X86::VFMADD132SDm },
- { X86::VFMADD132SDZm_Int , X86::VFMADD132SDm_Int },
- { X86::VFMADD132SDZr , X86::VFMADD132SDr },
- { X86::VFMADD132SDZr_Int , X86::VFMADD132SDr_Int },
- { X86::VFMADD132SSZm , X86::VFMADD132SSm },
- { X86::VFMADD132SSZm_Int , X86::VFMADD132SSm_Int },
- { X86::VFMADD132SSZr , X86::VFMADD132SSr },
- { X86::VFMADD132SSZr_Int , X86::VFMADD132SSr_Int },
- { X86::VFMADD213SDZm , X86::VFMADD213SDm },
- { X86::VFMADD213SDZm_Int , X86::VFMADD213SDm_Int },
- { X86::VFMADD213SDZr , X86::VFMADD213SDr },
- { X86::VFMADD213SDZr_Int , X86::VFMADD213SDr_Int },
- { X86::VFMADD213SSZm , X86::VFMADD213SSm },
- { X86::VFMADD213SSZm_Int , X86::VFMADD213SSm_Int },
- { X86::VFMADD213SSZr , X86::VFMADD213SSr },
- { X86::VFMADD213SSZr_Int , X86::VFMADD213SSr_Int },
- { X86::VFMADD231SDZm , X86::VFMADD231SDm },
- { X86::VFMADD231SDZm_Int , X86::VFMADD231SDm_Int },
- { X86::VFMADD231SDZr , X86::VFMADD231SDr },
- { X86::VFMADD231SDZr_Int , X86::VFMADD231SDr_Int },
- { X86::VFMADD231SSZm , X86::VFMADD231SSm },
- { X86::VFMADD231SSZm_Int , X86::VFMADD231SSm_Int },
- { X86::VFMADD231SSZr , X86::VFMADD231SSr },
- { X86::VFMADD231SSZr_Int , X86::VFMADD231SSr_Int },
- { X86::VFMSUB132SDZm , X86::VFMSUB132SDm },
- { X86::VFMSUB132SDZm_Int , X86::VFMSUB132SDm_Int },
- { X86::VFMSUB132SDZr , X86::VFMSUB132SDr },
- { X86::VFMSUB132SDZr_Int , X86::VFMSUB132SDr_Int },
- { X86::VFMSUB132SSZm , X86::VFMSUB132SSm },
- { X86::VFMSUB132SSZm_Int , X86::VFMSUB132SSm_Int },
- { X86::VFMSUB132SSZr , X86::VFMSUB132SSr },
- { X86::VFMSUB132SSZr_Int , X86::VFMSUB132SSr_Int },
- { X86::VFMSUB213SDZm , X86::VFMSUB213SDm },
- { X86::VFMSUB213SDZm_Int , X86::VFMSUB213SDm_Int },
- { X86::VFMSUB213SDZr , X86::VFMSUB213SDr },
- { X86::VFMSUB213SDZr_Int , X86::VFMSUB213SDr_Int },
- { X86::VFMSUB213SSZm , X86::VFMSUB213SSm },
- { X86::VFMSUB213SSZm_Int , X86::VFMSUB213SSm_Int },
- { X86::VFMSUB213SSZr , X86::VFMSUB213SSr },
- { X86::VFMSUB213SSZr_Int , X86::VFMSUB213SSr_Int },
- { X86::VFMSUB231SDZm , X86::VFMSUB231SDm },
- { X86::VFMSUB231SDZm_Int , X86::VFMSUB231SDm_Int },
- { X86::VFMSUB231SDZr , X86::VFMSUB231SDr },
- { X86::VFMSUB231SDZr_Int , X86::VFMSUB231SDr_Int },
- { X86::VFMSUB231SSZm , X86::VFMSUB231SSm },
- { X86::VFMSUB231SSZm_Int , X86::VFMSUB231SSm_Int },
- { X86::VFMSUB231SSZr , X86::VFMSUB231SSr },
- { X86::VFMSUB231SSZr_Int , X86::VFMSUB231SSr_Int },
- { X86::VFNMADD132SDZm , X86::VFNMADD132SDm },
- { X86::VFNMADD132SDZm_Int , X86::VFNMADD132SDm_Int },
- { X86::VFNMADD132SDZr , X86::VFNMADD132SDr },
- { X86::VFNMADD132SDZr_Int , X86::VFNMADD132SDr_Int },
- { X86::VFNMADD132SSZm , X86::VFNMADD132SSm },
- { X86::VFNMADD132SSZm_Int , X86::VFNMADD132SSm_Int },
- { X86::VFNMADD132SSZr , X86::VFNMADD132SSr },
- { X86::VFNMADD132SSZr_Int , X86::VFNMADD132SSr_Int },
- { X86::VFNMADD213SDZm , X86::VFNMADD213SDm },
- { X86::VFNMADD213SDZm_Int , X86::VFNMADD213SDm_Int },
- { X86::VFNMADD213SDZr , X86::VFNMADD213SDr },
- { X86::VFNMADD213SDZr_Int , X86::VFNMADD213SDr_Int },
- { X86::VFNMADD213SSZm , X86::VFNMADD213SSm },
- { X86::VFNMADD213SSZm_Int , X86::VFNMADD213SSm_Int },
- { X86::VFNMADD213SSZr , X86::VFNMADD213SSr },
- { X86::VFNMADD213SSZr_Int , X86::VFNMADD213SSr_Int },
- { X86::VFNMADD231SDZm , X86::VFNMADD231SDm },
- { X86::VFNMADD231SDZm_Int , X86::VFNMADD231SDm_Int },
- { X86::VFNMADD231SDZr , X86::VFNMADD231SDr },
- { X86::VFNMADD231SDZr_Int , X86::VFNMADD231SDr_Int },
- { X86::VFNMADD231SSZm , X86::VFNMADD231SSm },
- { X86::VFNMADD231SSZm_Int , X86::VFNMADD231SSm_Int },
- { X86::VFNMADD231SSZr , X86::VFNMADD231SSr },
- { X86::VFNMADD231SSZr_Int , X86::VFNMADD231SSr_Int },
- { X86::VFNMSUB132SDZm , X86::VFNMSUB132SDm },
- { X86::VFNMSUB132SDZm_Int , X86::VFNMSUB132SDm_Int },
- { X86::VFNMSUB132SDZr , X86::VFNMSUB132SDr },
- { X86::VFNMSUB132SDZr_Int , X86::VFNMSUB132SDr_Int },
- { X86::VFNMSUB132SSZm , X86::VFNMSUB132SSm },
- { X86::VFNMSUB132SSZm_Int , X86::VFNMSUB132SSm_Int },
- { X86::VFNMSUB132SSZr , X86::VFNMSUB132SSr },
- { X86::VFNMSUB132SSZr_Int , X86::VFNMSUB132SSr_Int },
- { X86::VFNMSUB213SDZm , X86::VFNMSUB213SDm },
- { X86::VFNMSUB213SDZm_Int , X86::VFNMSUB213SDm_Int },
- { X86::VFNMSUB213SDZr , X86::VFNMSUB213SDr },
- { X86::VFNMSUB213SDZr_Int , X86::VFNMSUB213SDr_Int },
- { X86::VFNMSUB213SSZm , X86::VFNMSUB213SSm },
- { X86::VFNMSUB213SSZm_Int , X86::VFNMSUB213SSm_Int },
- { X86::VFNMSUB213SSZr , X86::VFNMSUB213SSr },
- { X86::VFNMSUB213SSZr_Int , X86::VFNMSUB213SSr_Int },
- { X86::VFNMSUB231SDZm , X86::VFNMSUB231SDm },
- { X86::VFNMSUB231SDZm_Int , X86::VFNMSUB231SDm_Int },
- { X86::VFNMSUB231SDZr , X86::VFNMSUB231SDr },
- { X86::VFNMSUB231SDZr_Int , X86::VFNMSUB231SDr_Int },
- { X86::VFNMSUB231SSZm , X86::VFNMSUB231SSm },
- { X86::VFNMSUB231SSZm_Int , X86::VFNMSUB231SSm_Int },
- { X86::VFNMSUB231SSZr , X86::VFNMSUB231SSr },
- { X86::VFNMSUB231SSZr_Int , X86::VFNMSUB231SSr_Int },
- { X86::VMAXCSDZrm , X86::VMAXCSDrm },
- { X86::VMAXCSDZrr , X86::VMAXCSDrr },
- { X86::VMAXCSSZrm , X86::VMAXCSSrm },
- { X86::VMAXCSSZrr , X86::VMAXCSSrr },
- { X86::VMAXSDZrm , X86::VMAXSDrm },
- { X86::VMAXSDZrm_Int , X86::VMAXSDrm_Int },
- { X86::VMAXSDZrr , X86::VMAXSDrr },
- { X86::VMAXSDZrr_Int , X86::VMAXSDrr_Int },
- { X86::VMAXSSZrm , X86::VMAXSSrm },
- { X86::VMAXSSZrm_Int , X86::VMAXSSrm_Int },
- { X86::VMAXSSZrr , X86::VMAXSSrr },
- { X86::VMAXSSZrr_Int , X86::VMAXSSrr_Int },
- { X86::VMINCSDZrm , X86::VMINCSDrm },
- { X86::VMINCSDZrr , X86::VMINCSDrr },
- { X86::VMINCSSZrm , X86::VMINCSSrm },
- { X86::VMINCSSZrr , X86::VMINCSSrr },
- { X86::VMINSDZrm , X86::VMINSDrm },
- { X86::VMINSDZrm_Int , X86::VMINSDrm_Int },
- { X86::VMINSDZrr , X86::VMINSDrr },
- { X86::VMINSDZrr_Int , X86::VMINSDrr_Int },
- { X86::VMINSSZrm , X86::VMINSSrm },
- { X86::VMINSSZrm_Int , X86::VMINSSrm_Int },
- { X86::VMINSSZrr , X86::VMINSSrr },
- { X86::VMINSSZrr_Int , X86::VMINSSrr_Int },
- { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
- { X86::VMOVDI2SSZrm , X86::VMOVDI2SSrm },
- { X86::VMOVDI2SSZrr , X86::VMOVDI2SSrr },
- { X86::VMOVSDZmr , X86::VMOVSDmr },
- { X86::VMOVSDZrm , X86::VMOVSDrm },
- { X86::VMOVSDZrr , X86::VMOVSDrr },
- { X86::VMOVSSZmr , X86::VMOVSSmr },
- { X86::VMOVSSZrm , X86::VMOVSSrm },
- { X86::VMOVSSZrr , X86::VMOVSSrr },
- { X86::VMOVSSZrr_REV , X86::VMOVSSrr_REV },
- { X86::VMULSDZrm , X86::VMULSDrm },
- { X86::VMULSDZrm_Int , X86::VMULSDrm_Int },
- { X86::VMULSDZrr , X86::VMULSDrr },
- { X86::VMULSDZrr_Int , X86::VMULSDrr_Int },
- { X86::VMULSSZrm , X86::VMULSSrm },
- { X86::VMULSSZrm_Int , X86::VMULSSrm_Int },
- { X86::VMULSSZrr , X86::VMULSSrr },
- { X86::VMULSSZrr_Int , X86::VMULSSrr_Int },
- { X86::VSQRTSDZm , X86::VSQRTSDm },
- { X86::VSQRTSDZm_Int , X86::VSQRTSDm_Int },
- { X86::VSQRTSDZr , X86::VSQRTSDr },
- { X86::VSQRTSDZr_Int , X86::VSQRTSDr_Int },
- { X86::VSQRTSSZm , X86::VSQRTSSm },
- { X86::VSQRTSSZm_Int , X86::VSQRTSSm_Int },
- { X86::VSQRTSSZr , X86::VSQRTSSr },
- { X86::VSQRTSSZr_Int , X86::VSQRTSSr_Int },
- { X86::VSUBSDZrm , X86::VSUBSDrm },
- { X86::VSUBSDZrm_Int , X86::VSUBSDrm_Int },
- { X86::VSUBSDZrr , X86::VSUBSDrr },
- { X86::VSUBSDZrr_Int , X86::VSUBSDrr_Int },
- { X86::VSUBSSZrm , X86::VSUBSSrm },
- { X86::VSUBSSZrm_Int , X86::VSUBSSrm_Int },
- { X86::VSUBSSZrr , X86::VSUBSSrr },
- { X86::VSUBSSZrr_Int , X86::VSUBSSrr_Int },
- { X86::VUCOMISDZrm , X86::VUCOMISDrm },
- { X86::VUCOMISDZrr , X86::VUCOMISDrr },
- { X86::VUCOMISSZrm , X86::VUCOMISSrm },
- { X86::VUCOMISSZrr , X86::VUCOMISSrr },
-
- { X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr },
- { X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
- { X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm },
- { X86::VMOVDI2PDIZrr , X86::VMOVDI2PDIrr },
- { X86::VMOVLHPSZrr , X86::VMOVLHPSrr },
- { X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
- { X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr },
- { X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr },
- { X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr },
- { X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr },
- { X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm },
- { X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr },
-
- { X86::VPEXTRBZmr , X86::VPEXTRBmr },
- { X86::VPEXTRBZrr , X86::VPEXTRBrr },
- { X86::VPEXTRDZmr , X86::VPEXTRDmr },
- { X86::VPEXTRDZrr , X86::VPEXTRDrr },
- { X86::VPEXTRQZmr , X86::VPEXTRQmr },
- { X86::VPEXTRQZrr , X86::VPEXTRQrr },
- { X86::VPEXTRWZmr , X86::VPEXTRWmr },
- { X86::VPEXTRWZrr , X86::VPEXTRWri },
-
- { X86::VPINSRBZrm , X86::VPINSRBrm },
- { X86::VPINSRBZrr , X86::VPINSRBrr },
- { X86::VPINSRDZrm , X86::VPINSRDrm },
- { X86::VPINSRDZrr , X86::VPINSRDrr },
- { X86::VPINSRQZrm , X86::VPINSRQrm },
- { X86::VPINSRQZrr , X86::VPINSRQrr },
- { X86::VPINSRWZrm , X86::VPINSRWrmi },
- { X86::VPINSRWZrr , X86::VPINSRWrri },
-
- // EVEX 128 with corresponding VEX.
- { X86::VADDPDZ128rm , X86::VADDPDrm },
- { X86::VADDPDZ128rr , X86::VADDPDrr },
- { X86::VADDPSZ128rm , X86::VADDPSrm },
- { X86::VADDPSZ128rr , X86::VADDPSrr },
- { X86::VANDNPDZ128rm , X86::VANDNPDrm },
- { X86::VANDNPDZ128rr , X86::VANDNPDrr },
- { X86::VANDNPSZ128rm , X86::VANDNPSrm },
- { X86::VANDNPSZ128rr , X86::VANDNPSrr },
- { X86::VANDPDZ128rm , X86::VANDPDrm },
- { X86::VANDPDZ128rr , X86::VANDPDrr },
- { X86::VANDPSZ128rm , X86::VANDPSrm },
- { X86::VANDPSZ128rr , X86::VANDPSrr },
- { X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm },
- { X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr },
- { X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr },
- { X86::VCVTDQ2PDZ128rm , X86::VCVTDQ2PDrm },
- { X86::VCVTDQ2PDZ128rr , X86::VCVTDQ2PDrr },
- { X86::VCVTDQ2PSZ128rm , X86::VCVTDQ2PSrm },
- { X86::VCVTDQ2PSZ128rr , X86::VCVTDQ2PSrr },
- { X86::VCVTPD2DQZ128rm , X86::VCVTPD2DQrm },
- { X86::VCVTPD2DQZ128rr , X86::VCVTPD2DQrr },
- { X86::VCVTPD2PSZ128rm , X86::VCVTPD2PSrm },
- { X86::VCVTPD2PSZ128rr , X86::VCVTPD2PSrr },
- { X86::VCVTPH2PSZ128rm , X86::VCVTPH2PSrm },
- { X86::VCVTPH2PSZ128rr , X86::VCVTPH2PSrr },
- { X86::VCVTPS2DQZ128rm , X86::VCVTPS2DQrm },
- { X86::VCVTPS2DQZ128rr , X86::VCVTPS2DQrr },
- { X86::VCVTPS2PDZ128rm , X86::VCVTPS2PDrm },
- { X86::VCVTPS2PDZ128rr , X86::VCVTPS2PDrr },
- { X86::VCVTPS2PHZ128mr , X86::VCVTPS2PHmr },
- { X86::VCVTPS2PHZ128rr , X86::VCVTPS2PHrr },
- { X86::VCVTTPD2DQZ128rm , X86::VCVTTPD2DQrm },
- { X86::VCVTTPD2DQZ128rr , X86::VCVTTPD2DQrr },
- { X86::VCVTTPS2DQZ128rm , X86::VCVTTPS2DQrm },
- { X86::VCVTTPS2DQZ128rr , X86::VCVTTPS2DQrr },
- { X86::VDIVPDZ128rm , X86::VDIVPDrm },
- { X86::VDIVPDZ128rr , X86::VDIVPDrr },
- { X86::VDIVPSZ128rm , X86::VDIVPSrm },
- { X86::VDIVPSZ128rr , X86::VDIVPSrr },
- { X86::VFMADD132PDZ128m , X86::VFMADD132PDm },
- { X86::VFMADD132PDZ128r , X86::VFMADD132PDr },
- { X86::VFMADD132PSZ128m , X86::VFMADD132PSm },
- { X86::VFMADD132PSZ128r , X86::VFMADD132PSr },
- { X86::VFMADD213PDZ128m , X86::VFMADD213PDm },
- { X86::VFMADD213PDZ128r , X86::VFMADD213PDr },
- { X86::VFMADD213PSZ128m , X86::VFMADD213PSm },
- { X86::VFMADD213PSZ128r , X86::VFMADD213PSr },
- { X86::VFMADD231PDZ128m , X86::VFMADD231PDm },
- { X86::VFMADD231PDZ128r , X86::VFMADD231PDr },
- { X86::VFMADD231PSZ128m , X86::VFMADD231PSm },
- { X86::VFMADD231PSZ128r , X86::VFMADD231PSr },
- { X86::VFMADDSUB132PDZ128m , X86::VFMADDSUB132PDm },
- { X86::VFMADDSUB132PDZ128r , X86::VFMADDSUB132PDr },
- { X86::VFMADDSUB132PSZ128m , X86::VFMADDSUB132PSm },
- { X86::VFMADDSUB132PSZ128r , X86::VFMADDSUB132PSr },
- { X86::VFMADDSUB213PDZ128m , X86::VFMADDSUB213PDm },
- { X86::VFMADDSUB213PDZ128r , X86::VFMADDSUB213PDr },
- { X86::VFMADDSUB213PSZ128m , X86::VFMADDSUB213PSm },
- { X86::VFMADDSUB213PSZ128r , X86::VFMADDSUB213PSr },
- { X86::VFMADDSUB231PDZ128m , X86::VFMADDSUB231PDm },
- { X86::VFMADDSUB231PDZ128r , X86::VFMADDSUB231PDr },
- { X86::VFMADDSUB231PSZ128m , X86::VFMADDSUB231PSm },
- { X86::VFMADDSUB231PSZ128r , X86::VFMADDSUB231PSr },
- { X86::VFMSUB132PDZ128m , X86::VFMSUB132PDm },
- { X86::VFMSUB132PDZ128r , X86::VFMSUB132PDr },
- { X86::VFMSUB132PSZ128m , X86::VFMSUB132PSm },
- { X86::VFMSUB132PSZ128r , X86::VFMSUB132PSr },
- { X86::VFMSUB213PDZ128m , X86::VFMSUB213PDm },
- { X86::VFMSUB213PDZ128r , X86::VFMSUB213PDr },
- { X86::VFMSUB213PSZ128m , X86::VFMSUB213PSm },
- { X86::VFMSUB213PSZ128r , X86::VFMSUB213PSr },
- { X86::VFMSUB231PDZ128m , X86::VFMSUB231PDm },
- { X86::VFMSUB231PDZ128r , X86::VFMSUB231PDr },
- { X86::VFMSUB231PSZ128m , X86::VFMSUB231PSm },
- { X86::VFMSUB231PSZ128r , X86::VFMSUB231PSr },
- { X86::VFMSUBADD132PDZ128m , X86::VFMSUBADD132PDm },
- { X86::VFMSUBADD132PDZ128r , X86::VFMSUBADD132PDr },
- { X86::VFMSUBADD132PSZ128m , X86::VFMSUBADD132PSm },
- { X86::VFMSUBADD132PSZ128r , X86::VFMSUBADD132PSr },
- { X86::VFMSUBADD213PDZ128m , X86::VFMSUBADD213PDm },
- { X86::VFMSUBADD213PDZ128r , X86::VFMSUBADD213PDr },
- { X86::VFMSUBADD213PSZ128m , X86::VFMSUBADD213PSm },
- { X86::VFMSUBADD213PSZ128r , X86::VFMSUBADD213PSr },
- { X86::VFMSUBADD231PDZ128m , X86::VFMSUBADD231PDm },
- { X86::VFMSUBADD231PDZ128r , X86::VFMSUBADD231PDr },
- { X86::VFMSUBADD231PSZ128m , X86::VFMSUBADD231PSm },
- { X86::VFMSUBADD231PSZ128r , X86::VFMSUBADD231PSr },
- { X86::VFNMADD132PDZ128m , X86::VFNMADD132PDm },
- { X86::VFNMADD132PDZ128r , X86::VFNMADD132PDr },
- { X86::VFNMADD132PSZ128m , X86::VFNMADD132PSm },
- { X86::VFNMADD132PSZ128r , X86::VFNMADD132PSr },
- { X86::VFNMADD213PDZ128m , X86::VFNMADD213PDm },
- { X86::VFNMADD213PDZ128r , X86::VFNMADD213PDr },
- { X86::VFNMADD213PSZ128m , X86::VFNMADD213PSm },
- { X86::VFNMADD213PSZ128r , X86::VFNMADD213PSr },
- { X86::VFNMADD231PDZ128m , X86::VFNMADD231PDm },
- { X86::VFNMADD231PDZ128r , X86::VFNMADD231PDr },
- { X86::VFNMADD231PSZ128m , X86::VFNMADD231PSm },
- { X86::VFNMADD231PSZ128r , X86::VFNMADD231PSr },
- { X86::VFNMSUB132PDZ128m , X86::VFNMSUB132PDm },
- { X86::VFNMSUB132PDZ128r , X86::VFNMSUB132PDr },
- { X86::VFNMSUB132PSZ128m , X86::VFNMSUB132PSm },
- { X86::VFNMSUB132PSZ128r , X86::VFNMSUB132PSr },
- { X86::VFNMSUB213PDZ128m , X86::VFNMSUB213PDm },
- { X86::VFNMSUB213PDZ128r , X86::VFNMSUB213PDr },
- { X86::VFNMSUB213PSZ128m , X86::VFNMSUB213PSm },
- { X86::VFNMSUB213PSZ128r , X86::VFNMSUB213PSr },
- { X86::VFNMSUB231PDZ128m , X86::VFNMSUB231PDm },
- { X86::VFNMSUB231PDZ128r , X86::VFNMSUB231PDr },
- { X86::VFNMSUB231PSZ128m , X86::VFNMSUB231PSm },
- { X86::VFNMSUB231PSZ128r , X86::VFNMSUB231PSr },
- { X86::VMAXCPDZ128rm , X86::VMAXCPDrm },
- { X86::VMAXCPDZ128rr , X86::VMAXCPDrr },
- { X86::VMAXCPSZ128rm , X86::VMAXCPSrm },
- { X86::VMAXCPSZ128rr , X86::VMAXCPSrr },
- { X86::VMAXPDZ128rm , X86::VMAXPDrm },
- { X86::VMAXPDZ128rr , X86::VMAXPDrr },
- { X86::VMAXPSZ128rm , X86::VMAXPSrm },
- { X86::VMAXPSZ128rr , X86::VMAXPSrr },
- { X86::VMINCPDZ128rm , X86::VMINCPDrm },
- { X86::VMINCPDZ128rr , X86::VMINCPDrr },
- { X86::VMINCPSZ128rm , X86::VMINCPSrm },
- { X86::VMINCPSZ128rr , X86::VMINCPSrr },
- { X86::VMINPDZ128rm , X86::VMINPDrm },
- { X86::VMINPDZ128rr , X86::VMINPDrr },
- { X86::VMINPSZ128rm , X86::VMINPSrm },
- { X86::VMINPSZ128rr , X86::VMINPSrr },
- { X86::VMOVAPDZ128mr , X86::VMOVAPDmr },
- { X86::VMOVAPDZ128rm , X86::VMOVAPDrm },
- { X86::VMOVAPDZ128rr , X86::VMOVAPDrr },
- { X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV },
- { X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
- { X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
- { X86::VMOVAPSZ128rr , X86::VMOVAPSrr },
- { X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV },
- { X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm },
- { X86::VMOVDDUPZ128rr , X86::VMOVDDUPrr },
- { X86::VMOVDQA32Z128mr , X86::VMOVDQAmr },
- { X86::VMOVDQA32Z128rm , X86::VMOVDQArm },
- { X86::VMOVDQA32Z128rr , X86::VMOVDQArr },
- { X86::VMOVDQA32Z128rr_REV , X86::VMOVDQArr_REV },
- { X86::VMOVDQA64Z128mr , X86::VMOVDQAmr },
- { X86::VMOVDQA64Z128rm , X86::VMOVDQArm },
- { X86::VMOVDQA64Z128rr , X86::VMOVDQArr },
- { X86::VMOVDQA64Z128rr_REV , X86::VMOVDQArr_REV },
- { X86::VMOVDQU16Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU16Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU16Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU16Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVDQU32Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU32Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU32Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU32Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVDQU64Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU64Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU64Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU64Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVDQU8Z128mr , X86::VMOVDQUmr },
- { X86::VMOVDQU8Z128rm , X86::VMOVDQUrm },
- { X86::VMOVDQU8Z128rr , X86::VMOVDQUrr },
- { X86::VMOVDQU8Z128rr_REV , X86::VMOVDQUrr_REV },
- { X86::VMOVHPDZ128mr , X86::VMOVHPDmr },
- { X86::VMOVHPDZ128rm , X86::VMOVHPDrm },
- { X86::VMOVHPSZ128mr , X86::VMOVHPSmr },
- { X86::VMOVHPSZ128rm , X86::VMOVHPSrm },
- { X86::VMOVLPDZ128mr , X86::VMOVLPDmr },
- { X86::VMOVLPDZ128rm , X86::VMOVLPDrm },
- { X86::VMOVLPSZ128mr , X86::VMOVLPSmr },
- { X86::VMOVLPSZ128rm , X86::VMOVLPSrm },
- { X86::VMOVNTDQAZ128rm , X86::VMOVNTDQArm },
- { X86::VMOVNTDQZ128mr , X86::VMOVNTDQmr },
- { X86::VMOVNTPDZ128mr , X86::VMOVNTPDmr },
- { X86::VMOVNTPSZ128mr , X86::VMOVNTPSmr },
- { X86::VMOVSHDUPZ128rm , X86::VMOVSHDUPrm },
- { X86::VMOVSHDUPZ128rr , X86::VMOVSHDUPrr },
- { X86::VMOVSLDUPZ128rm , X86::VMOVSLDUPrm },
- { X86::VMOVSLDUPZ128rr , X86::VMOVSLDUPrr },
- { X86::VMOVUPDZ128mr , X86::VMOVUPDmr },
- { X86::VMOVUPDZ128rm , X86::VMOVUPDrm },
- { X86::VMOVUPDZ128rr , X86::VMOVUPDrr },
- { X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV },
- { X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
- { X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
- { X86::VMOVUPSZ128rr , X86::VMOVUPSrr },
- { X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV },
- { X86::VMULPDZ128rm , X86::VMULPDrm },
- { X86::VMULPDZ128rr , X86::VMULPDrr },
- { X86::VMULPSZ128rm , X86::VMULPSrm },
- { X86::VMULPSZ128rr , X86::VMULPSrr },
- { X86::VORPDZ128rm , X86::VORPDrm },
- { X86::VORPDZ128rr , X86::VORPDrr },
- { X86::VORPSZ128rm , X86::VORPSrm },
- { X86::VORPSZ128rr , X86::VORPSrr },
- { X86::VPABSBZ128rm , X86::VPABSBrm },
- { X86::VPABSBZ128rr , X86::VPABSBrr },
- { X86::VPABSDZ128rm , X86::VPABSDrm },
- { X86::VPABSDZ128rr , X86::VPABSDrr },
- { X86::VPABSWZ128rm , X86::VPABSWrm },
- { X86::VPABSWZ128rr , X86::VPABSWrr },
- { X86::VPACKSSDWZ128rm , X86::VPACKSSDWrm },
- { X86::VPACKSSDWZ128rr , X86::VPACKSSDWrr },
- { X86::VPACKSSWBZ128rm , X86::VPACKSSWBrm },
- { X86::VPACKSSWBZ128rr , X86::VPACKSSWBrr },
- { X86::VPACKUSDWZ128rm , X86::VPACKUSDWrm },
- { X86::VPACKUSDWZ128rr , X86::VPACKUSDWrr },
- { X86::VPACKUSWBZ128rm , X86::VPACKUSWBrm },
- { X86::VPACKUSWBZ128rr , X86::VPACKUSWBrr },
- { X86::VPADDBZ128rm , X86::VPADDBrm },
- { X86::VPADDBZ128rr , X86::VPADDBrr },
- { X86::VPADDDZ128rm , X86::VPADDDrm },
- { X86::VPADDDZ128rr , X86::VPADDDrr },
- { X86::VPADDQZ128rm , X86::VPADDQrm },
- { X86::VPADDQZ128rr , X86::VPADDQrr },
- { X86::VPADDSBZ128rm , X86::VPADDSBrm },
- { X86::VPADDSBZ128rr , X86::VPADDSBrr },
- { X86::VPADDSWZ128rm , X86::VPADDSWrm },
- { X86::VPADDSWZ128rr , X86::VPADDSWrr },
- { X86::VPADDUSBZ128rm , X86::VPADDUSBrm },
- { X86::VPADDUSBZ128rr , X86::VPADDUSBrr },
- { X86::VPADDUSWZ128rm , X86::VPADDUSWrm },
- { X86::VPADDUSWZ128rr , X86::VPADDUSWrr },
- { X86::VPADDWZ128rm , X86::VPADDWrm },
- { X86::VPADDWZ128rr , X86::VPADDWrr },
- { X86::VPALIGNRZ128rmi , X86::VPALIGNRrmi },
- { X86::VPALIGNRZ128rri , X86::VPALIGNRrri },
- { X86::VPANDDZ128rm , X86::VPANDrm },
- { X86::VPANDDZ128rr , X86::VPANDrr },
- { X86::VPANDQZ128rm , X86::VPANDrm },
- { X86::VPANDQZ128rr , X86::VPANDrr },
- { X86::VPAVGBZ128rm , X86::VPAVGBrm },
- { X86::VPAVGBZ128rr , X86::VPAVGBrr },
- { X86::VPAVGWZ128rm , X86::VPAVGWrm },
- { X86::VPAVGWZ128rr , X86::VPAVGWrr },
- { X86::VPBROADCASTBZ128m , X86::VPBROADCASTBrm },
- { X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr },
- { X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm },
- { X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr },
- { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
- { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
- { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
- { X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr },
- { X86::VPERMILPDZ128mi , X86::VPERMILPDmi },
- { X86::VPERMILPDZ128ri , X86::VPERMILPDri },
- { X86::VPERMILPDZ128rm , X86::VPERMILPDrm },
- { X86::VPERMILPDZ128rr , X86::VPERMILPDrr },
- { X86::VPERMILPSZ128mi , X86::VPERMILPSmi },
- { X86::VPERMILPSZ128ri , X86::VPERMILPSri },
- { X86::VPERMILPSZ128rm , X86::VPERMILPSrm },
- { X86::VPERMILPSZ128rr , X86::VPERMILPSrr },
- { X86::VPMADDUBSWZ128rm , X86::VPMADDUBSWrm },
- { X86::VPMADDUBSWZ128rr , X86::VPMADDUBSWrr },
- { X86::VPMADDWDZ128rm , X86::VPMADDWDrm },
- { X86::VPMADDWDZ128rr , X86::VPMADDWDrr },
- { X86::VPMAXSBZ128rm , X86::VPMAXSBrm },
- { X86::VPMAXSBZ128rr , X86::VPMAXSBrr },
- { X86::VPMAXSDZ128rm , X86::VPMAXSDrm },
- { X86::VPMAXSDZ128rr , X86::VPMAXSDrr },
- { X86::VPMAXSWZ128rm , X86::VPMAXSWrm },
- { X86::VPMAXSWZ128rr , X86::VPMAXSWrr },
- { X86::VPMAXUBZ128rm , X86::VPMAXUBrm },
- { X86::VPMAXUBZ128rr , X86::VPMAXUBrr },
- { X86::VPMAXUDZ128rm , X86::VPMAXUDrm },
- { X86::VPMAXUDZ128rr , X86::VPMAXUDrr },
- { X86::VPMAXUWZ128rm , X86::VPMAXUWrm },
- { X86::VPMAXUWZ128rr , X86::VPMAXUWrr },
- { X86::VPMINSBZ128rm , X86::VPMINSBrm },
- { X86::VPMINSBZ128rr , X86::VPMINSBrr },
- { X86::VPMINSDZ128rm , X86::VPMINSDrm },
- { X86::VPMINSDZ128rr , X86::VPMINSDrr },
- { X86::VPMINSWZ128rm , X86::VPMINSWrm },
- { X86::VPMINSWZ128rr , X86::VPMINSWrr },
- { X86::VPMINUBZ128rm , X86::VPMINUBrm },
- { X86::VPMINUBZ128rr , X86::VPMINUBrr },
- { X86::VPMINUDZ128rm , X86::VPMINUDrm },
- { X86::VPMINUDZ128rr , X86::VPMINUDrr },
- { X86::VPMINUWZ128rm , X86::VPMINUWrm },
- { X86::VPMINUWZ128rr , X86::VPMINUWrr },
- { X86::VPMOVSXBDZ128rm , X86::VPMOVSXBDrm },
- { X86::VPMOVSXBDZ128rr , X86::VPMOVSXBDrr },
- { X86::VPMOVSXBQZ128rm , X86::VPMOVSXBQrm },
- { X86::VPMOVSXBQZ128rr , X86::VPMOVSXBQrr },
- { X86::VPMOVSXBWZ128rm , X86::VPMOVSXBWrm },
- { X86::VPMOVSXBWZ128rr , X86::VPMOVSXBWrr },
- { X86::VPMOVSXDQZ128rm , X86::VPMOVSXDQrm },
- { X86::VPMOVSXDQZ128rr , X86::VPMOVSXDQrr },
- { X86::VPMOVSXWDZ128rm , X86::VPMOVSXWDrm },
- { X86::VPMOVSXWDZ128rr , X86::VPMOVSXWDrr },
- { X86::VPMOVSXWQZ128rm , X86::VPMOVSXWQrm },
- { X86::VPMOVSXWQZ128rr , X86::VPMOVSXWQrr },
- { X86::VPMOVZXBDZ128rm , X86::VPMOVZXBDrm },
- { X86::VPMOVZXBDZ128rr , X86::VPMOVZXBDrr },
- { X86::VPMOVZXBQZ128rm , X86::VPMOVZXBQrm },
- { X86::VPMOVZXBQZ128rr , X86::VPMOVZXBQrr },
- { X86::VPMOVZXBWZ128rm , X86::VPMOVZXBWrm },
- { X86::VPMOVZXBWZ128rr , X86::VPMOVZXBWrr },
- { X86::VPMOVZXDQZ128rm , X86::VPMOVZXDQrm },
- { X86::VPMOVZXDQZ128rr , X86::VPMOVZXDQrr },
- { X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm },
- { X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr },
- { X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm },
- { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
- { X86::VPMULDQZ128rm , X86::VPMULDQrm },
- { X86::VPMULDQZ128rr , X86::VPMULDQrr },
- { X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm },
- { X86::VPMULHRSWZ128rr , X86::VPMULHRSWrr },
- { X86::VPMULHUWZ128rm , X86::VPMULHUWrm },
- { X86::VPMULHUWZ128rr , X86::VPMULHUWrr },
- { X86::VPMULHWZ128rm , X86::VPMULHWrm },
- { X86::VPMULHWZ128rr , X86::VPMULHWrr },
- { X86::VPMULLDZ128rm , X86::VPMULLDrm },
- { X86::VPMULLDZ128rr , X86::VPMULLDrr },
- { X86::VPMULLWZ128rm , X86::VPMULLWrm },
- { X86::VPMULLWZ128rr , X86::VPMULLWrr },
- { X86::VPMULUDQZ128rm , X86::VPMULUDQrm },
- { X86::VPMULUDQZ128rr , X86::VPMULUDQrr },
- { X86::VPORDZ128rm , X86::VPORrm },
- { X86::VPORDZ128rr , X86::VPORrr },
- { X86::VPORQZ128rm , X86::VPORrm },
- { X86::VPORQZ128rr , X86::VPORrr },
- { X86::VPSADBWZ128rm , X86::VPSADBWrm },
- { X86::VPSADBWZ128rr , X86::VPSADBWrr },
- { X86::VPSHUFBZ128rm , X86::VPSHUFBrm },
- { X86::VPSHUFBZ128rr , X86::VPSHUFBrr },
- { X86::VPSHUFDZ128mi , X86::VPSHUFDmi },
- { X86::VPSHUFDZ128ri , X86::VPSHUFDri },
- { X86::VPSHUFHWZ128mi , X86::VPSHUFHWmi },
- { X86::VPSHUFHWZ128ri , X86::VPSHUFHWri },
- { X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi },
- { X86::VPSHUFLWZ128ri , X86::VPSHUFLWri },
- { X86::VPSLLDQZ128rr , X86::VPSLLDQri },
- { X86::VPSLLDZ128ri , X86::VPSLLDri },
- { X86::VPSLLDZ128rm , X86::VPSLLDrm },
- { X86::VPSLLDZ128rr , X86::VPSLLDrr },
- { X86::VPSLLQZ128ri , X86::VPSLLQri },
- { X86::VPSLLQZ128rm , X86::VPSLLQrm },
- { X86::VPSLLQZ128rr , X86::VPSLLQrr },
- { X86::VPSLLVDZ128rm , X86::VPSLLVDrm },
- { X86::VPSLLVDZ128rr , X86::VPSLLVDrr },
- { X86::VPSLLVQZ128rm , X86::VPSLLVQrm },
- { X86::VPSLLVQZ128rr , X86::VPSLLVQrr },
- { X86::VPSLLWZ128ri , X86::VPSLLWri },
- { X86::VPSLLWZ128rm , X86::VPSLLWrm },
- { X86::VPSLLWZ128rr , X86::VPSLLWrr },
- { X86::VPSRADZ128ri , X86::VPSRADri },
- { X86::VPSRADZ128rm , X86::VPSRADrm },
- { X86::VPSRADZ128rr , X86::VPSRADrr },
- { X86::VPSRAVDZ128rm , X86::VPSRAVDrm },
- { X86::VPSRAVDZ128rr , X86::VPSRAVDrr },
- { X86::VPSRAWZ128ri , X86::VPSRAWri },
- { X86::VPSRAWZ128rm , X86::VPSRAWrm },
- { X86::VPSRAWZ128rr , X86::VPSRAWrr },
- { X86::VPSRLDQZ128rr , X86::VPSRLDQri },
- { X86::VPSRLDZ128ri , X86::VPSRLDri },
- { X86::VPSRLDZ128rm , X86::VPSRLDrm },
- { X86::VPSRLDZ128rr , X86::VPSRLDrr },
- { X86::VPSRLQZ128ri , X86::VPSRLQri },
- { X86::VPSRLQZ128rm , X86::VPSRLQrm },
- { X86::VPSRLQZ128rr , X86::VPSRLQrr },
- { X86::VPSRLVDZ128rm , X86::VPSRLVDrm },
- { X86::VPSRLVDZ128rr , X86::VPSRLVDrr },
- { X86::VPSRLVQZ128rm , X86::VPSRLVQrm },
- { X86::VPSRLVQZ128rr , X86::VPSRLVQrr },
- { X86::VPSRLWZ128ri , X86::VPSRLWri },
- { X86::VPSRLWZ128rm , X86::VPSRLWrm },
- { X86::VPSRLWZ128rr , X86::VPSRLWrr },
- { X86::VPSUBBZ128rm , X86::VPSUBBrm },
- { X86::VPSUBBZ128rr , X86::VPSUBBrr },
- { X86::VPSUBDZ128rm , X86::VPSUBDrm },
- { X86::VPSUBDZ128rr , X86::VPSUBDrr },
- { X86::VPSUBQZ128rm , X86::VPSUBQrm },
- { X86::VPSUBQZ128rr , X86::VPSUBQrr },
- { X86::VPSUBSBZ128rm , X86::VPSUBSBrm },
- { X86::VPSUBSBZ128rr , X86::VPSUBSBrr },
- { X86::VPSUBSWZ128rm , X86::VPSUBSWrm },
- { X86::VPSUBSWZ128rr , X86::VPSUBSWrr },
- { X86::VPSUBUSBZ128rm , X86::VPSUBUSBrm },
- { X86::VPSUBUSBZ128rr , X86::VPSUBUSBrr },
- { X86::VPSUBUSWZ128rm , X86::VPSUBUSWrm },
- { X86::VPSUBUSWZ128rr , X86::VPSUBUSWrr },
- { X86::VPSUBWZ128rm , X86::VPSUBWrm },
- { X86::VPSUBWZ128rr , X86::VPSUBWrr },
- { X86::VPUNPCKHBWZ128rm , X86::VPUNPCKHBWrm },
- { X86::VPUNPCKHBWZ128rr , X86::VPUNPCKHBWrr },
- { X86::VPUNPCKHDQZ128rm , X86::VPUNPCKHDQrm },
- { X86::VPUNPCKHDQZ128rr , X86::VPUNPCKHDQrr },
- { X86::VPUNPCKHQDQZ128rm , X86::VPUNPCKHQDQrm },
- { X86::VPUNPCKHQDQZ128rr , X86::VPUNPCKHQDQrr },
- { X86::VPUNPCKHWDZ128rm , X86::VPUNPCKHWDrm },
- { X86::VPUNPCKHWDZ128rr , X86::VPUNPCKHWDrr },
- { X86::VPUNPCKLBWZ128rm , X86::VPUNPCKLBWrm },
- { X86::VPUNPCKLBWZ128rr , X86::VPUNPCKLBWrr },
- { X86::VPUNPCKLDQZ128rm , X86::VPUNPCKLDQrm },
- { X86::VPUNPCKLDQZ128rr , X86::VPUNPCKLDQrr },
- { X86::VPUNPCKLQDQZ128rm , X86::VPUNPCKLQDQrm },
- { X86::VPUNPCKLQDQZ128rr , X86::VPUNPCKLQDQrr },
- { X86::VPUNPCKLWDZ128rm , X86::VPUNPCKLWDrm },
- { X86::VPUNPCKLWDZ128rr , X86::VPUNPCKLWDrr },
- { X86::VPXORDZ128rm , X86::VPXORrm },
- { X86::VPXORDZ128rr , X86::VPXORrr },
- { X86::VPXORQZ128rm , X86::VPXORrm },
- { X86::VPXORQZ128rr , X86::VPXORrr },
- { X86::VSHUFPDZ128rmi , X86::VSHUFPDrmi },
- { X86::VSHUFPDZ128rri , X86::VSHUFPDrri },
- { X86::VSHUFPSZ128rmi , X86::VSHUFPSrmi },
- { X86::VSHUFPSZ128rri , X86::VSHUFPSrri },
- { X86::VSQRTPDZ128m , X86::VSQRTPDm },
- { X86::VSQRTPDZ128r , X86::VSQRTPDr },
- { X86::VSQRTPSZ128m , X86::VSQRTPSm },
- { X86::VSQRTPSZ128r , X86::VSQRTPSr },
- { X86::VSUBPDZ128rm , X86::VSUBPDrm },
- { X86::VSUBPDZ128rr , X86::VSUBPDrr },
- { X86::VSUBPSZ128rm , X86::VSUBPSrm },
- { X86::VSUBPSZ128rr , X86::VSUBPSrr },
- { X86::VUNPCKHPDZ128rm , X86::VUNPCKHPDrm },
- { X86::VUNPCKHPDZ128rr , X86::VUNPCKHPDrr },
- { X86::VUNPCKHPSZ128rm , X86::VUNPCKHPSrm },
- { X86::VUNPCKHPSZ128rr , X86::VUNPCKHPSrr },
- { X86::VUNPCKLPDZ128rm , X86::VUNPCKLPDrm },
- { X86::VUNPCKLPDZ128rr , X86::VUNPCKLPDrr },
- { X86::VUNPCKLPSZ128rm , X86::VUNPCKLPSrm },
- { X86::VUNPCKLPSZ128rr , X86::VUNPCKLPSrr },
- { X86::VXORPDZ128rm , X86::VXORPDrm },
- { X86::VXORPDZ128rr , X86::VXORPDrr },
- { X86::VXORPSZ128rm , X86::VXORPSrm },
- { X86::VXORPSZ128rr , X86::VXORPSrr },
-};
-
-
-// X86 EVEX encoded instructions that have a VEX 256 encoding
-// (table format: <EVEX opcode, VEX-256 opcode>).
- static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = {
- { X86::VADDPDZ256rm , X86::VADDPDYrm },
- { X86::VADDPDZ256rr , X86::VADDPDYrr },
- { X86::VADDPSZ256rm , X86::VADDPSYrm },
- { X86::VADDPSZ256rr , X86::VADDPSYrr },
- { X86::VANDNPDZ256rm , X86::VANDNPDYrm },
- { X86::VANDNPDZ256rr , X86::VANDNPDYrr },
- { X86::VANDNPSZ256rm , X86::VANDNPSYrm },
- { X86::VANDNPSZ256rr , X86::VANDNPSYrr },
- { X86::VANDPDZ256rm , X86::VANDPDYrm },
- { X86::VANDPDZ256rr , X86::VANDPDYrr },
- { X86::VANDPSZ256rm , X86::VANDPSYrm },
- { X86::VANDPSZ256rr , X86::VANDPSYrr },
- { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
- { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
- { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
- { X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm },
- { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
- { X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr },
- { X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm },
- { X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr },
- { X86::VCVTDQ2PSZ256rm , X86::VCVTDQ2PSYrm },
- { X86::VCVTDQ2PSZ256rr , X86::VCVTDQ2PSYrr },
- { X86::VCVTPD2DQZ256rm , X86::VCVTPD2DQYrm },
- { X86::VCVTPD2DQZ256rr , X86::VCVTPD2DQYrr },
- { X86::VCVTPD2PSZ256rm , X86::VCVTPD2PSYrm },
- { X86::VCVTPD2PSZ256rr , X86::VCVTPD2PSYrr },
- { X86::VCVTPH2PSZ256rm , X86::VCVTPH2PSYrm },
- { X86::VCVTPH2PSZ256rr , X86::VCVTPH2PSYrr },
- { X86::VCVTPS2DQZ256rm , X86::VCVTPS2DQYrm },
- { X86::VCVTPS2DQZ256rr , X86::VCVTPS2DQYrr },
- { X86::VCVTPS2PDZ256rm , X86::VCVTPS2PDYrm },
- { X86::VCVTPS2PDZ256rr , X86::VCVTPS2PDYrr },
- { X86::VCVTPS2PHZ256mr , X86::VCVTPS2PHYmr },
- { X86::VCVTPS2PHZ256rr , X86::VCVTPS2PHYrr },
- { X86::VCVTTPD2DQZ256rm , X86::VCVTTPD2DQYrm },
- { X86::VCVTTPD2DQZ256rr , X86::VCVTTPD2DQYrr },
- { X86::VCVTTPS2DQZ256rm , X86::VCVTTPS2DQYrm },
- { X86::VCVTTPS2DQZ256rr , X86::VCVTTPS2DQYrr },
- { X86::VDIVPDZ256rm , X86::VDIVPDYrm },
- { X86::VDIVPDZ256rr , X86::VDIVPDYrr },
- { X86::VDIVPSZ256rm , X86::VDIVPSYrm },
- { X86::VDIVPSZ256rr , X86::VDIVPSYrr },
- { X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr },
- { X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr },
- { X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr },
- { X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr },
- { X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr },
- { X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr },
- { X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr },
- { X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr },
- { X86::VFMADD132PDZ256m , X86::VFMADD132PDYm },
- { X86::VFMADD132PDZ256r , X86::VFMADD132PDYr },
- { X86::VFMADD132PSZ256m , X86::VFMADD132PSYm },
- { X86::VFMADD132PSZ256r , X86::VFMADD132PSYr },
- { X86::VFMADD213PDZ256m , X86::VFMADD213PDYm },
- { X86::VFMADD213PDZ256r , X86::VFMADD213PDYr },
- { X86::VFMADD213PSZ256m , X86::VFMADD213PSYm },
- { X86::VFMADD213PSZ256r , X86::VFMADD213PSYr },
- { X86::VFMADD231PDZ256m , X86::VFMADD231PDYm },
- { X86::VFMADD231PDZ256r , X86::VFMADD231PDYr },
- { X86::VFMADD231PSZ256m , X86::VFMADD231PSYm },
- { X86::VFMADD231PSZ256r , X86::VFMADD231PSYr },
- { X86::VFMADDSUB132PDZ256m , X86::VFMADDSUB132PDYm },
- { X86::VFMADDSUB132PDZ256r , X86::VFMADDSUB132PDYr },
- { X86::VFMADDSUB132PSZ256m , X86::VFMADDSUB132PSYm },
- { X86::VFMADDSUB132PSZ256r , X86::VFMADDSUB132PSYr },
- { X86::VFMADDSUB213PDZ256m , X86::VFMADDSUB213PDYm },
- { X86::VFMADDSUB213PDZ256r , X86::VFMADDSUB213PDYr },
- { X86::VFMADDSUB213PSZ256m , X86::VFMADDSUB213PSYm },
- { X86::VFMADDSUB213PSZ256r , X86::VFMADDSUB213PSYr },
- { X86::VFMADDSUB231PDZ256m , X86::VFMADDSUB231PDYm },
- { X86::VFMADDSUB231PDZ256r , X86::VFMADDSUB231PDYr },
- { X86::VFMADDSUB231PSZ256m , X86::VFMADDSUB231PSYm },
- { X86::VFMADDSUB231PSZ256r , X86::VFMADDSUB231PSYr },
- { X86::VFMSUB132PDZ256m , X86::VFMSUB132PDYm },
- { X86::VFMSUB132PDZ256r , X86::VFMSUB132PDYr },
- { X86::VFMSUB132PSZ256m , X86::VFMSUB132PSYm },
- { X86::VFMSUB132PSZ256r , X86::VFMSUB132PSYr },
- { X86::VFMSUB213PDZ256m , X86::VFMSUB213PDYm },
- { X86::VFMSUB213PDZ256r , X86::VFMSUB213PDYr },
- { X86::VFMSUB213PSZ256m , X86::VFMSUB213PSYm },
- { X86::VFMSUB213PSZ256r , X86::VFMSUB213PSYr },
- { X86::VFMSUB231PDZ256m , X86::VFMSUB231PDYm },
- { X86::VFMSUB231PDZ256r , X86::VFMSUB231PDYr },
- { X86::VFMSUB231PSZ256m , X86::VFMSUB231PSYm },
- { X86::VFMSUB231PSZ256r , X86::VFMSUB231PSYr },
- { X86::VFMSUBADD132PDZ256m , X86::VFMSUBADD132PDYm },
- { X86::VFMSUBADD132PDZ256r , X86::VFMSUBADD132PDYr },
- { X86::VFMSUBADD132PSZ256m , X86::VFMSUBADD132PSYm },
- { X86::VFMSUBADD132PSZ256r , X86::VFMSUBADD132PSYr },
- { X86::VFMSUBADD213PDZ256m , X86::VFMSUBADD213PDYm },
- { X86::VFMSUBADD213PDZ256r , X86::VFMSUBADD213PDYr },
- { X86::VFMSUBADD213PSZ256m , X86::VFMSUBADD213PSYm },
- { X86::VFMSUBADD213PSZ256r , X86::VFMSUBADD213PSYr },
- { X86::VFMSUBADD231PDZ256m , X86::VFMSUBADD231PDYm },
- { X86::VFMSUBADD231PDZ256r , X86::VFMSUBADD231PDYr },
- { X86::VFMSUBADD231PSZ256m , X86::VFMSUBADD231PSYm },
- { X86::VFMSUBADD231PSZ256r , X86::VFMSUBADD231PSYr },
- { X86::VFNMADD132PDZ256m , X86::VFNMADD132PDYm },
- { X86::VFNMADD132PDZ256r , X86::VFNMADD132PDYr },
- { X86::VFNMADD132PSZ256m , X86::VFNMADD132PSYm },
- { X86::VFNMADD132PSZ256r , X86::VFNMADD132PSYr },
- { X86::VFNMADD213PDZ256m , X86::VFNMADD213PDYm },
- { X86::VFNMADD213PDZ256r , X86::VFNMADD213PDYr },
- { X86::VFNMADD213PSZ256m , X86::VFNMADD213PSYm },
- { X86::VFNMADD213PSZ256r , X86::VFNMADD213PSYr },
- { X86::VFNMADD231PDZ256m , X86::VFNMADD231PDYm },
- { X86::VFNMADD231PDZ256r , X86::VFNMADD231PDYr },
- { X86::VFNMADD231PSZ256m , X86::VFNMADD231PSYm },
- { X86::VFNMADD231PSZ256r , X86::VFNMADD231PSYr },
- { X86::VFNMSUB132PDZ256m , X86::VFNMSUB132PDYm },
- { X86::VFNMSUB132PDZ256r , X86::VFNMSUB132PDYr },
- { X86::VFNMSUB132PSZ256m , X86::VFNMSUB132PSYm },
- { X86::VFNMSUB132PSZ256r , X86::VFNMSUB132PSYr },
- { X86::VFNMSUB213PDZ256m , X86::VFNMSUB213PDYm },
- { X86::VFNMSUB213PDZ256r , X86::VFNMSUB213PDYr },
- { X86::VFNMSUB213PSZ256m , X86::VFNMSUB213PSYm },
- { X86::VFNMSUB213PSZ256r , X86::VFNMSUB213PSYr },
- { X86::VFNMSUB231PDZ256m , X86::VFNMSUB231PDYm },
- { X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr },
- { X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm },
- { X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr },
- { X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm },
- { X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm },
- { X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr },
- { X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr },
- { X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm },
- { X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm },
- { X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr },
- { X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr },
- { X86::VMAXCPDZ256rm , X86::VMAXCPDYrm },
- { X86::VMAXCPDZ256rr , X86::VMAXCPDYrr },
- { X86::VMAXCPSZ256rm , X86::VMAXCPSYrm },
- { X86::VMAXCPSZ256rr , X86::VMAXCPSYrr },
- { X86::VMAXPDZ256rm , X86::VMAXPDYrm },
- { X86::VMAXPDZ256rr , X86::VMAXPDYrr },
- { X86::VMAXPSZ256rm , X86::VMAXPSYrm },
- { X86::VMAXPSZ256rr , X86::VMAXPSYrr },
- { X86::VMINCPDZ256rm , X86::VMINCPDYrm },
- { X86::VMINCPDZ256rr , X86::VMINCPDYrr },
- { X86::VMINCPSZ256rm , X86::VMINCPSYrm },
- { X86::VMINCPSZ256rr , X86::VMINCPSYrr },
- { X86::VMINPDZ256rm , X86::VMINPDYrm },
- { X86::VMINPDZ256rr , X86::VMINPDYrr },
- { X86::VMINPSZ256rm , X86::VMINPSYrm },
- { X86::VMINPSZ256rr , X86::VMINPSYrr },
- { X86::VMOVAPDZ256mr , X86::VMOVAPDYmr },
- { X86::VMOVAPDZ256rm , X86::VMOVAPDYrm },
- { X86::VMOVAPDZ256rr , X86::VMOVAPDYrr },
- { X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV },
- { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
- { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
- { X86::VMOVAPSZ256rr , X86::VMOVAPSYrr },
- { X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV },
- { X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm },
- { X86::VMOVDDUPZ256rr , X86::VMOVDDUPYrr },
- { X86::VMOVDQA32Z256mr , X86::VMOVDQAYmr },
- { X86::VMOVDQA32Z256rm , X86::VMOVDQAYrm },
- { X86::VMOVDQA32Z256rr , X86::VMOVDQAYrr },
- { X86::VMOVDQA32Z256rr_REV , X86::VMOVDQAYrr_REV },
- { X86::VMOVDQA64Z256mr , X86::VMOVDQAYmr },
- { X86::VMOVDQA64Z256rm , X86::VMOVDQAYrm },
- { X86::VMOVDQA64Z256rr , X86::VMOVDQAYrr },
- { X86::VMOVDQA64Z256rr_REV , X86::VMOVDQAYrr_REV },
- { X86::VMOVDQU16Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU16Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU16Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU16Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVDQU32Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU32Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU32Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU32Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVDQU64Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU64Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU64Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU64Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVDQU8Z256mr , X86::VMOVDQUYmr },
- { X86::VMOVDQU8Z256rm , X86::VMOVDQUYrm },
- { X86::VMOVDQU8Z256rr , X86::VMOVDQUYrr },
- { X86::VMOVDQU8Z256rr_REV , X86::VMOVDQUYrr_REV },
- { X86::VMOVNTDQAZ256rm , X86::VMOVNTDQAYrm },
- { X86::VMOVNTDQZ256mr , X86::VMOVNTDQYmr },
- { X86::VMOVNTPDZ256mr , X86::VMOVNTPDYmr },
- { X86::VMOVNTPSZ256mr , X86::VMOVNTPSYmr },
- { X86::VMOVSHDUPZ256rm , X86::VMOVSHDUPYrm },
- { X86::VMOVSHDUPZ256rr , X86::VMOVSHDUPYrr },
- { X86::VMOVSLDUPZ256rm , X86::VMOVSLDUPYrm },
- { X86::VMOVSLDUPZ256rr , X86::VMOVSLDUPYrr },
- { X86::VMOVUPDZ256mr , X86::VMOVUPDYmr },
- { X86::VMOVUPDZ256rm , X86::VMOVUPDYrm },
- { X86::VMOVUPDZ256rr , X86::VMOVUPDYrr },
- { X86::VMOVUPDZ256rr_REV , X86::VMOVUPDYrr_REV },
- { X86::VMOVUPSZ256mr , X86::VMOVUPSYmr },
- { X86::VMOVUPSZ256rm , X86::VMOVUPSYrm },
- { X86::VMOVUPSZ256rr , X86::VMOVUPSYrr },
- { X86::VMOVUPSZ256rr_REV , X86::VMOVUPSYrr_REV },
- { X86::VMULPDZ256rm , X86::VMULPDYrm },
- { X86::VMULPDZ256rr , X86::VMULPDYrr },
- { X86::VMULPSZ256rm , X86::VMULPSYrm },
- { X86::VMULPSZ256rr , X86::VMULPSYrr },
- { X86::VORPDZ256rm , X86::VORPDYrm },
- { X86::VORPDZ256rr , X86::VORPDYrr },
- { X86::VORPSZ256rm , X86::VORPSYrm },
- { X86::VORPSZ256rr , X86::VORPSYrr },
- { X86::VPABSBZ256rm , X86::VPABSBYrm },
- { X86::VPABSBZ256rr , X86::VPABSBYrr },
- { X86::VPABSDZ256rm , X86::VPABSDYrm },
- { X86::VPABSDZ256rr , X86::VPABSDYrr },
- { X86::VPABSWZ256rm , X86::VPABSWYrm },
- { X86::VPABSWZ256rr , X86::VPABSWYrr },
- { X86::VPACKSSDWZ256rm , X86::VPACKSSDWYrm },
- { X86::VPACKSSDWZ256rr , X86::VPACKSSDWYrr },
- { X86::VPACKSSWBZ256rm , X86::VPACKSSWBYrm },
- { X86::VPACKSSWBZ256rr , X86::VPACKSSWBYrr },
- { X86::VPACKUSDWZ256rm , X86::VPACKUSDWYrm },
- { X86::VPACKUSDWZ256rr , X86::VPACKUSDWYrr },
- { X86::VPACKUSWBZ256rm , X86::VPACKUSWBYrm },
- { X86::VPACKUSWBZ256rr , X86::VPACKUSWBYrr },
- { X86::VPADDBZ256rm , X86::VPADDBYrm },
- { X86::VPADDBZ256rr , X86::VPADDBYrr },
- { X86::VPADDDZ256rm , X86::VPADDDYrm },
- { X86::VPADDDZ256rr , X86::VPADDDYrr },
- { X86::VPADDQZ256rm , X86::VPADDQYrm },
- { X86::VPADDQZ256rr , X86::VPADDQYrr },
- { X86::VPADDSBZ256rm , X86::VPADDSBYrm },
- { X86::VPADDSBZ256rr , X86::VPADDSBYrr },
- { X86::VPADDSWZ256rm , X86::VPADDSWYrm },
- { X86::VPADDSWZ256rr , X86::VPADDSWYrr },
- { X86::VPADDUSBZ256rm , X86::VPADDUSBYrm },
- { X86::VPADDUSBZ256rr , X86::VPADDUSBYrr },
- { X86::VPADDUSWZ256rm , X86::VPADDUSWYrm },
- { X86::VPADDUSWZ256rr , X86::VPADDUSWYrr },
- { X86::VPADDWZ256rm , X86::VPADDWYrm },
- { X86::VPADDWZ256rr , X86::VPADDWYrr },
- { X86::VPALIGNRZ256rmi , X86::VPALIGNRYrmi },
- { X86::VPALIGNRZ256rri , X86::VPALIGNRYrri },
- { X86::VPANDDZ256rm , X86::VPANDYrm },
- { X86::VPANDDZ256rr , X86::VPANDYrr },
- { X86::VPANDQZ256rm , X86::VPANDYrm },
- { X86::VPANDQZ256rr , X86::VPANDYrr },
- { X86::VPAVGBZ256rm , X86::VPAVGBYrm },
- { X86::VPAVGBZ256rr , X86::VPAVGBYrr },
- { X86::VPAVGWZ256rm , X86::VPAVGWYrm },
- { X86::VPAVGWZ256rr , X86::VPAVGWYrr },
- { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
- { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
- { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
- { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
- { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
- { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
- { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
- { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
- { X86::VPERMDZ256rm , X86::VPERMDYrm },
- { X86::VPERMDZ256rr , X86::VPERMDYrr },
- { X86::VPERMILPDZ256mi , X86::VPERMILPDYmi },
- { X86::VPERMILPDZ256ri , X86::VPERMILPDYri },
- { X86::VPERMILPDZ256rm , X86::VPERMILPDYrm },
- { X86::VPERMILPDZ256rr , X86::VPERMILPDYrr },
- { X86::VPERMILPSZ256mi , X86::VPERMILPSYmi },
- { X86::VPERMILPSZ256ri , X86::VPERMILPSYri },
- { X86::VPERMILPSZ256rm , X86::VPERMILPSYrm },
- { X86::VPERMILPSZ256rr , X86::VPERMILPSYrr },
- { X86::VPERMPDZ256mi , X86::VPERMPDYmi },
- { X86::VPERMPDZ256ri , X86::VPERMPDYri },
- { X86::VPERMPSZ256rm , X86::VPERMPSYrm },
- { X86::VPERMPSZ256rr , X86::VPERMPSYrr },
- { X86::VPERMQZ256mi , X86::VPERMQYmi },
- { X86::VPERMQZ256ri , X86::VPERMQYri },
- { X86::VPMADDUBSWZ256rm , X86::VPMADDUBSWYrm },
- { X86::VPMADDUBSWZ256rr , X86::VPMADDUBSWYrr },
- { X86::VPMADDWDZ256rm , X86::VPMADDWDYrm },
- { X86::VPMADDWDZ256rr , X86::VPMADDWDYrr },
- { X86::VPMAXSBZ256rm , X86::VPMAXSBYrm },
- { X86::VPMAXSBZ256rr , X86::VPMAXSBYrr },
- { X86::VPMAXSDZ256rm , X86::VPMAXSDYrm },
- { X86::VPMAXSDZ256rr , X86::VPMAXSDYrr },
- { X86::VPMAXSWZ256rm , X86::VPMAXSWYrm },
- { X86::VPMAXSWZ256rr , X86::VPMAXSWYrr },
- { X86::VPMAXUBZ256rm , X86::VPMAXUBYrm },
- { X86::VPMAXUBZ256rr , X86::VPMAXUBYrr },
- { X86::VPMAXUDZ256rm , X86::VPMAXUDYrm },
- { X86::VPMAXUDZ256rr , X86::VPMAXUDYrr },
- { X86::VPMAXUWZ256rm , X86::VPMAXUWYrm },
- { X86::VPMAXUWZ256rr , X86::VPMAXUWYrr },
- { X86::VPMINSBZ256rm , X86::VPMINSBYrm },
- { X86::VPMINSBZ256rr , X86::VPMINSBYrr },
- { X86::VPMINSDZ256rm , X86::VPMINSDYrm },
- { X86::VPMINSDZ256rr , X86::VPMINSDYrr },
- { X86::VPMINSWZ256rm , X86::VPMINSWYrm },
- { X86::VPMINSWZ256rr , X86::VPMINSWYrr },
- { X86::VPMINUBZ256rm , X86::VPMINUBYrm },
- { X86::VPMINUBZ256rr , X86::VPMINUBYrr },
- { X86::VPMINUDZ256rm , X86::VPMINUDYrm },
- { X86::VPMINUDZ256rr , X86::VPMINUDYrr },
- { X86::VPMINUWZ256rm , X86::VPMINUWYrm },
- { X86::VPMINUWZ256rr , X86::VPMINUWYrr },
- { X86::VPMOVSXBDZ256rm , X86::VPMOVSXBDYrm },
- { X86::VPMOVSXBDZ256rr , X86::VPMOVSXBDYrr },
- { X86::VPMOVSXBQZ256rm , X86::VPMOVSXBQYrm },
- { X86::VPMOVSXBQZ256rr , X86::VPMOVSXBQYrr },
- { X86::VPMOVSXBWZ256rm , X86::VPMOVSXBWYrm },
- { X86::VPMOVSXBWZ256rr , X86::VPMOVSXBWYrr },
- { X86::VPMOVSXDQZ256rm , X86::VPMOVSXDQYrm },
- { X86::VPMOVSXDQZ256rr , X86::VPMOVSXDQYrr },
- { X86::VPMOVSXWDZ256rm , X86::VPMOVSXWDYrm },
- { X86::VPMOVSXWDZ256rr , X86::VPMOVSXWDYrr },
- { X86::VPMOVSXWQZ256rm , X86::VPMOVSXWQYrm },
- { X86::VPMOVSXWQZ256rr , X86::VPMOVSXWQYrr },
- { X86::VPMOVZXBDZ256rm , X86::VPMOVZXBDYrm },
- { X86::VPMOVZXBDZ256rr , X86::VPMOVZXBDYrr },
- { X86::VPMOVZXBQZ256rm , X86::VPMOVZXBQYrm },
- { X86::VPMOVZXBQZ256rr , X86::VPMOVZXBQYrr },
- { X86::VPMOVZXBWZ256rm , X86::VPMOVZXBWYrm },
- { X86::VPMOVZXBWZ256rr , X86::VPMOVZXBWYrr },
- { X86::VPMOVZXDQZ256rm , X86::VPMOVZXDQYrm },
- { X86::VPMOVZXDQZ256rr , X86::VPMOVZXDQYrr },
- { X86::VPMOVZXWDZ256rm , X86::VPMOVZXWDYrm },
- { X86::VPMOVZXWDZ256rr , X86::VPMOVZXWDYrr },
- { X86::VPMOVZXWQZ256rm , X86::VPMOVZXWQYrm },
- { X86::VPMOVZXWQZ256rr , X86::VPMOVZXWQYrr },
- { X86::VPMULDQZ256rm , X86::VPMULDQYrm },
- { X86::VPMULDQZ256rr , X86::VPMULDQYrr },
- { X86::VPMULHRSWZ256rm , X86::VPMULHRSWYrm },
- { X86::VPMULHRSWZ256rr , X86::VPMULHRSWYrr },
- { X86::VPMULHUWZ256rm , X86::VPMULHUWYrm },
- { X86::VPMULHUWZ256rr , X86::VPMULHUWYrr },
- { X86::VPMULHWZ256rm , X86::VPMULHWYrm },
- { X86::VPMULHWZ256rr , X86::VPMULHWYrr },
- { X86::VPMULLDZ256rm , X86::VPMULLDYrm },
- { X86::VPMULLDZ256rr , X86::VPMULLDYrr },
- { X86::VPMULLWZ256rm , X86::VPMULLWYrm },
- { X86::VPMULLWZ256rr , X86::VPMULLWYrr },
- { X86::VPMULUDQZ256rm , X86::VPMULUDQYrm },
- { X86::VPMULUDQZ256rr , X86::VPMULUDQYrr },
- { X86::VPORDZ256rm , X86::VPORYrm },
- { X86::VPORDZ256rr , X86::VPORYrr },
- { X86::VPORQZ256rm , X86::VPORYrm },
- { X86::VPORQZ256rr , X86::VPORYrr },
- { X86::VPSADBWZ256rm , X86::VPSADBWYrm },
- { X86::VPSADBWZ256rr , X86::VPSADBWYrr },
- { X86::VPSHUFBZ256rm , X86::VPSHUFBYrm },
- { X86::VPSHUFBZ256rr , X86::VPSHUFBYrr },
- { X86::VPSHUFDZ256mi , X86::VPSHUFDYmi },
- { X86::VPSHUFDZ256ri , X86::VPSHUFDYri },
- { X86::VPSHUFHWZ256mi , X86::VPSHUFHWYmi },
- { X86::VPSHUFHWZ256ri , X86::VPSHUFHWYri },
- { X86::VPSHUFLWZ256mi , X86::VPSHUFLWYmi },
- { X86::VPSHUFLWZ256ri , X86::VPSHUFLWYri },
- { X86::VPSLLDQZ256rr , X86::VPSLLDQYri },
- { X86::VPSLLDZ256ri , X86::VPSLLDYri },
- { X86::VPSLLDZ256rm , X86::VPSLLDYrm },
- { X86::VPSLLDZ256rr , X86::VPSLLDYrr },
- { X86::VPSLLQZ256ri , X86::VPSLLQYri },
- { X86::VPSLLQZ256rm , X86::VPSLLQYrm },
- { X86::VPSLLQZ256rr , X86::VPSLLQYrr },
- { X86::VPSLLVDZ256rm , X86::VPSLLVDYrm },
- { X86::VPSLLVDZ256rr , X86::VPSLLVDYrr },
- { X86::VPSLLVQZ256rm , X86::VPSLLVQYrm },
- { X86::VPSLLVQZ256rr , X86::VPSLLVQYrr },
- { X86::VPSLLWZ256ri , X86::VPSLLWYri },
- { X86::VPSLLWZ256rm , X86::VPSLLWYrm },
- { X86::VPSLLWZ256rr , X86::VPSLLWYrr },
- { X86::VPSRADZ256ri , X86::VPSRADYri },
- { X86::VPSRADZ256rm , X86::VPSRADYrm },
- { X86::VPSRADZ256rr , X86::VPSRADYrr },
- { X86::VPSRAVDZ256rm , X86::VPSRAVDYrm },
- { X86::VPSRAVDZ256rr , X86::VPSRAVDYrr },
- { X86::VPSRAWZ256ri , X86::VPSRAWYri },
- { X86::VPSRAWZ256rm , X86::VPSRAWYrm },
- { X86::VPSRAWZ256rr , X86::VPSRAWYrr },
- { X86::VPSRLDQZ256rr , X86::VPSRLDQYri },
- { X86::VPSRLDZ256ri , X86::VPSRLDYri },
- { X86::VPSRLDZ256rm , X86::VPSRLDYrm },
- { X86::VPSRLDZ256rr , X86::VPSRLDYrr },
- { X86::VPSRLQZ256ri , X86::VPSRLQYri },
- { X86::VPSRLQZ256rm , X86::VPSRLQYrm },
- { X86::VPSRLQZ256rr , X86::VPSRLQYrr },
- { X86::VPSRLVDZ256rm , X86::VPSRLVDYrm },
- { X86::VPSRLVDZ256rr , X86::VPSRLVDYrr },
- { X86::VPSRLVQZ256rm , X86::VPSRLVQYrm },
- { X86::VPSRLVQZ256rr , X86::VPSRLVQYrr },
- { X86::VPSRLWZ256ri , X86::VPSRLWYri },
- { X86::VPSRLWZ256rm , X86::VPSRLWYrm },
- { X86::VPSRLWZ256rr , X86::VPSRLWYrr },
- { X86::VPSUBBZ256rm , X86::VPSUBBYrm },
- { X86::VPSUBBZ256rr , X86::VPSUBBYrr },
- { X86::VPSUBDZ256rm , X86::VPSUBDYrm },
- { X86::VPSUBDZ256rr , X86::VPSUBDYrr },
- { X86::VPSUBQZ256rm , X86::VPSUBQYrm },
- { X86::VPSUBQZ256rr , X86::VPSUBQYrr },
- { X86::VPSUBSBZ256rm , X86::VPSUBSBYrm },
- { X86::VPSUBSBZ256rr , X86::VPSUBSBYrr },
- { X86::VPSUBSWZ256rm , X86::VPSUBSWYrm },
- { X86::VPSUBSWZ256rr , X86::VPSUBSWYrr },
- { X86::VPSUBUSBZ256rm , X86::VPSUBUSBYrm },
- { X86::VPSUBUSBZ256rr , X86::VPSUBUSBYrr },
- { X86::VPSUBUSWZ256rm , X86::VPSUBUSWYrm },
- { X86::VPSUBUSWZ256rr , X86::VPSUBUSWYrr },
- { X86::VPSUBWZ256rm , X86::VPSUBWYrm },
- { X86::VPSUBWZ256rr , X86::VPSUBWYrr },
- { X86::VPUNPCKHBWZ256rm , X86::VPUNPCKHBWYrm },
- { X86::VPUNPCKHBWZ256rr , X86::VPUNPCKHBWYrr },
- { X86::VPUNPCKHDQZ256rm , X86::VPUNPCKHDQYrm },
- { X86::VPUNPCKHDQZ256rr , X86::VPUNPCKHDQYrr },
- { X86::VPUNPCKHQDQZ256rm , X86::VPUNPCKHQDQYrm },
- { X86::VPUNPCKHQDQZ256rr , X86::VPUNPCKHQDQYrr },
- { X86::VPUNPCKHWDZ256rm , X86::VPUNPCKHWDYrm },
- { X86::VPUNPCKHWDZ256rr , X86::VPUNPCKHWDYrr },
- { X86::VPUNPCKLBWZ256rm , X86::VPUNPCKLBWYrm },
- { X86::VPUNPCKLBWZ256rr , X86::VPUNPCKLBWYrr },
- { X86::VPUNPCKLDQZ256rm , X86::VPUNPCKLDQYrm },
- { X86::VPUNPCKLDQZ256rr , X86::VPUNPCKLDQYrr },
- { X86::VPUNPCKLQDQZ256rm , X86::VPUNPCKLQDQYrm },
- { X86::VPUNPCKLQDQZ256rr , X86::VPUNPCKLQDQYrr },
- { X86::VPUNPCKLWDZ256rm , X86::VPUNPCKLWDYrm },
- { X86::VPUNPCKLWDZ256rr , X86::VPUNPCKLWDYrr },
- { X86::VPXORDZ256rm , X86::VPXORYrm },
- { X86::VPXORDZ256rr , X86::VPXORYrr },
- { X86::VPXORQZ256rm , X86::VPXORYrm },
- { X86::VPXORQZ256rr , X86::VPXORYrr },
- { X86::VSHUFPDZ256rmi , X86::VSHUFPDYrmi },
- { X86::VSHUFPDZ256rri , X86::VSHUFPDYrri },
- { X86::VSHUFPSZ256rmi , X86::VSHUFPSYrmi },
- { X86::VSHUFPSZ256rri , X86::VSHUFPSYrri },
- { X86::VSQRTPDZ256m , X86::VSQRTPDYm },
- { X86::VSQRTPDZ256r , X86::VSQRTPDYr },
- { X86::VSQRTPSZ256m , X86::VSQRTPSYm },
- { X86::VSQRTPSZ256r , X86::VSQRTPSYr },
- { X86::VSUBPDZ256rm , X86::VSUBPDYrm },
- { X86::VSUBPDZ256rr , X86::VSUBPDYrr },
- { X86::VSUBPSZ256rm , X86::VSUBPSYrm },
- { X86::VSUBPSZ256rr , X86::VSUBPSYrr },
- { X86::VUNPCKHPDZ256rm , X86::VUNPCKHPDYrm },
- { X86::VUNPCKHPDZ256rr , X86::VUNPCKHPDYrr },
- { X86::VUNPCKHPSZ256rm , X86::VUNPCKHPSYrm },
- { X86::VUNPCKHPSZ256rr , X86::VUNPCKHPSYrr },
- { X86::VUNPCKLPDZ256rm , X86::VUNPCKLPDYrm },
- { X86::VUNPCKLPDZ256rr , X86::VUNPCKLPDYrr },
- { X86::VUNPCKLPSZ256rm , X86::VUNPCKLPSYrm },
- { X86::VUNPCKLPSZ256rr , X86::VUNPCKLPSYrr },
- { X86::VXORPDZ256rm , X86::VXORPDYrm },
- { X86::VXORPDZ256rr , X86::VXORPDYrr },
- { X86::VXORPSZ256rm , X86::VXORPSYrm },
- { X86::VXORPSZ256rr , X86::VXORPSYrr },
-};
-
-#endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
index 2ea27a9..315a69e 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrVMX.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrVMX.td
@@ -43,22 +43,26 @@ def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmptrld\t$vmcs", []>, PS;
def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
"vmptrst\t$vmcs", []>, TB;
-def VMREAD64rm : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
"vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
-def VMREAD32rm : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
"vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
-def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+let mayStore = 1 in {
+def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+}
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
-def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+let mayLoad = 1 in {
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+}
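+// Editorial note (not part of this change): with empty ([]) patterns TableGen
+// cannot infer memory effects, so the memory forms above declare mayStore /
+// mayLoad explicitly; the VMREAD memory forms were also renamed (e.g.
+// VMREAD64rm -> VMREAD64mr) to match the register-to-memory naming convention.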
// 0F 01 C4
def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
index 2b296e1..5dde2d0 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td
@@ -111,7 +111,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>,
- XOP_4V, VEX_W, Sched<[WriteVarVecShift]>;
+ XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedInt in {
@@ -183,6 +183,27 @@ let ExeDomain = SSEPackedInt in {
defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
}
+// IFMA patterns - for cases where we can safely ignore the overflow bits from
+// the multiply or easily match with existing intrinsics.
+let Predicates = [HasXOP] in {
+ def : Pat<(v8i16 (add (mul (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
+ (v8i16 VR128:$src3))),
+ (VPMACSWWrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ (v4i32 VR128:$src3))),
+ (VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v2i64 (add (X86pmuldq (X86PShufd (v4i32 VR128:$src1), (i8 -11)),
+ (X86PShufd (v4i32 VR128:$src2), (i8 -11))),
+ (v2i64 VR128:$src3))),
+ (VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v2i64 (add (X86pmuldq (v4i32 VR128:$src1), (v4i32 VR128:$src2)),
+ (v2i64 VR128:$src3))),
+ (VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+ def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)),
+ (v4i32 VR128:$src3))),
+ (VPMADCSWDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+}
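+// Editorial sketch (semantics inferred from the pattern above, not stated in
+// the original change): per 16-bit lane, the first pattern relies on wrapping
+// IR mul/add matching vpmacsww, roughly:
+//   uint16_t lane = (uint16_t)(a * b) + c;  // keep only the low 16 product bits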
+
// Instruction where second source can be memory, third must be imm8
multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> {
let isCommutable = 1 in
@@ -261,7 +282,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W;
+ []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
@@ -269,159 +290,87 @@ let ExeDomain = SSEPackedInt in {
}
// Instruction where either second or third source can be memory
-multiclass xop4op_int<bits<8> opc, string OpcodeStr,
- Intrinsic Int128, Intrinsic Int256> {
- // 128-bit Instruction
- def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT> {
+ def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
- XOP_4V;
- def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+ [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+ (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V;
+ def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int128 VR128:$src1, VR128:$src2,
- (bitconvert (loadv2i64 addr:$src3))))]>,
+ [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
+ (X86andnp (load addr:$src3), RC:$src2))))]>,
XOP_4V, VEX_W;
- def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+ def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- VR128:$src3))]>,
+ [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+ (X86andnp RC:$src3, (load addr:$src2)))))]>,
XOP_4V;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W;
-
- // 256-bit Instruction
- def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
- XOP_4V, VEX_L;
- def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, i256mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2,
- (bitconvert (loadv4i64 addr:$src3))))]>,
- XOP_4V, VEX_W, VEX_L;
- def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
- VR256:$src3))]>,
- XOP_4V, VEX_L;
- // For disassembler
- let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, VEX_L;
+ []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
- defm VPCMOV : xop4op_int<0xA2, "vpcmov",
- int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
+ defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>;
+ defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L;
}
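+// Editorial note: vpcmov is a per-bit select, so the multiclass above now
+// carries the generic pattern directly:
+//   dst = (src3 & src1) | (~src3 & src2)
+// which subsumes the standalone HasXOP patterns removed just below.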
-let Predicates = [HasXOP] in {
- def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, VR128:$src2))),
- (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
-
- def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, VR256:$src2))),
- (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-}
-
-multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType vt128, ValueType vt256,
- ValueType id128, ValueType id256,
- PatFrag ld_128, PatFrag ld_256> {
- def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand intmemop, X86MemOperand fpmemop,
+ ValueType VT, PatFrag FPLdFrag,
+ PatFrag IntLdFrag> {
+ def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- (id128 VR128:$src3), (i8 imm:$src4))))]>;
- def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>;
+ def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- (id128 (bitconvert (loadv2i64 addr:$src3))),
- (i8 imm:$src4))))]>,
- VEX_W;
- def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, RC:$src2,
+ (bitconvert (IntLdFrag addr:$src3)),
+ (i8 imm:$src4))))]>, VEX_W;
+ def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1),
- (vt128 (bitconvert (ld_128 addr:$src2))),
- (id128 VR128:$src3), (i8 imm:$src4))))]>;
+ [(set RC:$dst,
+ (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
+ RC:$src3, (i8 imm:$src4))))]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W;
-
- def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
+ def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR256:$dst,
- (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
- (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
- def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR256:$dst,
- (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
- (id256 (bitconvert (loadv4i64 addr:$src3))),
- (i8 imm:$src4))))]>, VEX_W, VEX_L;
- def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- [(set VR256:$dst,
- (vt256 (OpNode (vt256 VR256:$src1),
- (vt256 (bitconvert (ld_256 addr:$src2))),
- (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
- // For disassembler
- let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
- !strconcat(OpcodeStr,
- "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W, VEX_L;
+ []>, VEX_W, FoldGenData<NAME#rr>;
}
-let ExeDomain = SSEPackedDouble in
- defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
- v2i64, v4i64, loadv2f64, loadv4f64>;
+let ExeDomain = SSEPackedDouble in {
+ defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem,
+ v2f64, loadv2f64, loadv2i64>;
+ defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem,
+ v4f64, loadv4f64, loadv4i64>, VEX_L;
+}
-let ExeDomain = SSEPackedSingle in
- defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
- v4i32, v8i32, loadv4f32, loadv8f32>;
+let ExeDomain = SSEPackedSingle in {
+ defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
+ v4f32, loadv4f32, loadv2i64>;
+ defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
+ v8f32, loadv8f32, loadv4i64>, VEX_L;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp b/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp
new file mode 100644
index 0000000..859d328
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -0,0 +1,1066 @@
+//===- X86InstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86RegisterBankInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "X86-isel"
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class X86InstructionSelector : public InstructionSelector {
+public:
+ X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI,
+ const X86RegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+
+private:
+ /// tblgen-erated 'select' implementation, used as the initial selector for
+ /// the patterns that don't require complex C++.
+ bool selectImpl(MachineInstr &I) const;
+
+ // TODO: remove once this is supported by TableGen-erated instruction selection.
+ unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
+ uint64_t Alignment) const;
+
+ bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+
+ // Emit an insert-subreg instruction and place it before MachineInstr &I.
+ bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+ // Emit an extract-subreg instruction and place it before MachineInstr &I.
+ bool emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
+ MachineRegisterInfo &MRI, MachineFunction &MF) const;
+
+ const TargetRegisterClass *getRegClass(LLT Ty, const RegisterBank &RB) const;
+ const TargetRegisterClass *getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const;
+
+ const X86TargetMachine &TM;
+ const X86Subtarget &STI;
+ const X86InstrInfo &TII;
+ const X86RegisterInfo &TRI;
+ const X86RegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM,
+ const X86Subtarget &STI,
+ const X86RegisterBankInfo &RBI)
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+// FIXME: This should be target-independent, inferred from the types declared
+// for each class in the bank.
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, const RegisterBank &RB) const {
+ if (RB.getID() == X86::GPRRegBankID) {
+ if (Ty.getSizeInBits() <= 8)
+ return &X86::GR8RegClass;
+ if (Ty.getSizeInBits() == 16)
+ return &X86::GR16RegClass;
+ if (Ty.getSizeInBits() == 32)
+ return &X86::GR32RegClass;
+ if (Ty.getSizeInBits() == 64)
+ return &X86::GR64RegClass;
+ }
+ if (RB.getID() == X86::VECRRegBankID) {
+ if (Ty.getSizeInBits() == 32)
+ return STI.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
+ if (Ty.getSizeInBits() == 64)
+ return STI.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
+ if (Ty.getSizeInBits() == 128)
+ return STI.hasAVX512() ? &X86::VR128XRegClass : &X86::VR128RegClass;
+ if (Ty.getSizeInBits() == 256)
+ return STI.hasAVX512() ? &X86::VR256XRegClass : &X86::VR256RegClass;
+ if (Ty.getSizeInBits() == 512)
+ return &X86::VR512RegClass;
+ }
+
+ llvm_unreachable("Unknown RegBank!");
+}
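+// Editorial sketch (hypothetical bank references, not part of this patch):
+// typical resolutions of the mapping above:
+//   getRegClass(LLT::scalar(32), GPRBank);  // -> &X86::GR32RegClass
+//   getRegClass(LLT::scalar(64), VECRBank); // -> &X86::FR64XRegClass with
+//                                           //    AVX-512, &X86::FR64RegClass
+//                                           //    otherwise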
+
+const TargetRegisterClass *
+X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg,
+ MachineRegisterInfo &MRI) const {
+ const RegisterBank &RegBank = *RBI.getRegBank(Reg, MRI, TRI);
+ return getRegClass(Ty, RegBank);
+}
+
+// Set X86 Opcode and constrain DestReg.
+bool X86InstructionSelector::selectCopy(MachineInstr &I,
+ MachineRegisterInfo &MRI) const {
+
+ unsigned DstReg = I.getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ assert(I.isCopy() && "Generic operators do not allow physical registers");
+ return true;
+ }
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+
+ assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
+ "No phys reg on generic operators");
+ assert((DstSize == SrcSize ||
+ // Copies are a means to set up initial types; the number of
+ // bits may not exactly match.
+ (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+ DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
+ "Copy with different width?!");
+
+ const TargetRegisterClass *RC = nullptr;
+
+ switch (RegBank.getID()) {
+ case X86::GPRRegBankID:
+ assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values.");
+ RC = getRegClass(MRI.getType(DstReg), RegBank);
+
+ // Narrow a wider physical source register to its matching subregister.
+ if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+ if (RC == &X86::GR32RegClass)
+ I.getOperand(1).setSubReg(X86::sub_32bit);
+ else if (RC == &X86::GR16RegClass)
+ I.getOperand(1).setSubReg(X86::sub_16bit);
+ else if (RC == &X86::GR8RegClass)
+ I.getOperand(1).setSubReg(X86::sub_8bit);
+
+ I.getOperand(1).substPhysReg(SrcReg, TRI);
+ }
+ break;
+ case X86::VECRRegBankID:
+ RC = getRegClass(MRI.getType(DstReg), RegBank);
+ break;
+ default:
+ llvm_unreachable("Unknown RegBank!");
+ }
+
+ // No need to constrain SrcReg. It will get constrained when
+ // we hit another of its uses or defs.
+ // Copies do not have constraints.
+ const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg);
+ if (!OldRC || !RC->hasSubClassEq(OldRC)) {
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ }
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
+
+bool X86InstructionSelector::select(MachineInstr &I) const {
+ assert(I.getParent() && "Instruction should be in a basic block!");
+ assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+ MachineBasicBlock &MBB = *I.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Opcode = I.getOpcode();
+ if (!isPreISelGenericOpcode(Opcode)) {
+ // Certain non-generic instructions also need some special handling.
+
+ if (I.isCopy())
+ return selectCopy(I, MRI);
+
+ // TODO: handle more cases - LOAD_STACK_GUARD, PHI
+ return true;
+ }
+
+ assert(I.getNumOperands() == I.getNumExplicitOperands() &&
+ "Generic instruction has unexpected implicit operands\n");
+
+ if (selectImpl(I))
+ return true;
+
+ DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
+
+ // TODO: This should be implemented by tblgen.
+ if (selectLoadStoreOp(I, MRI, MF))
+ return true;
+ if (selectFrameIndexOrGep(I, MRI, MF))
+ return true;
+ if (selectGlobalValue(I, MRI, MF))
+ return true;
+ if (selectConstant(I, MRI, MF))
+ return true;
+ if (selectTrunc(I, MRI, MF))
+ return true;
+ if (selectZext(I, MRI, MF))
+ return true;
+ if (selectCmp(I, MRI, MF))
+ return true;
+ if (selectUadde(I, MRI, MF))
+ return true;
+ if (selectUnmergeValues(I, MRI, MF))
+ return true;
+ if (selectMergeValues(I, MRI, MF))
+ return true;
+ if (selectExtract(I, MRI, MF))
+ return true;
+ if (selectInsert(I, MRI, MF))
+ return true;
+
+ return false;
+}
+
+unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
+ unsigned Opc,
+ uint64_t Alignment) const {
+ bool Isload = (Opc == TargetOpcode::G_LOAD);
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (Ty == LLT::scalar(8)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV8rm : X86::MOV8mr;
+ } else if (Ty == LLT::scalar(16)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV16rm : X86::MOV16mr;
+ } else if (Ty == LLT::scalar(32) || Ty == LLT::pointer(0, 32)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV32rm : X86::MOV32mr;
+ if (X86::VECRRegBankID == RB.getID())
+ return Isload ? (HasAVX512 ? X86::VMOVSSZrm
+ : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm)
+ : (HasAVX512 ? X86::VMOVSSZmr
+ : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
+ } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
+ if (X86::GPRRegBankID == RB.getID())
+ return Isload ? X86::MOV64rm : X86::MOV64mr;
+ if (X86::VECRRegBankID == RB.getID())
+ return Isload ? (HasAVX512 ? X86::VMOVSDZrm
+ : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm)
+ : (HasAVX512 ? X86::VMOVSDZmr
+ : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
+ if (Alignment >= 16)
+ return Isload ? (HasVLX ? X86::VMOVAPSZ128rm
+ : HasAVX512
+ ? X86::VMOVAPSZ128rm_NOVLX
+ : HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm)
+ : (HasVLX ? X86::VMOVAPSZ128mr
+ : HasAVX512
+ ? X86::VMOVAPSZ128mr_NOVLX
+ : HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
+ else
+ return Isload ? (HasVLX ? X86::VMOVUPSZ128rm
+ : HasAVX512
+ ? X86::VMOVUPSZ128rm_NOVLX
+ : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm)
+ : (HasVLX ? X86::VMOVUPSZ128mr
+ : HasAVX512
+ ? X86::VMOVUPSZ128mr_NOVLX
+ : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 256) {
+ if (Alignment >= 32)
+ return Isload ? (HasVLX ? X86::VMOVAPSZ256rm
+ : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
+ : X86::VMOVAPSYrm)
+ : (HasVLX ? X86::VMOVAPSZ256mr
+ : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
+ : X86::VMOVAPSYmr);
+ else
+ return IsLoad ? (HasVLX ? X86::VMOVUPSZ256rm
+ : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
+ : X86::VMOVUPSYrm)
+ : (HasVLX ? X86::VMOVUPSZ256mr
+ : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
+ : X86::VMOVUPSYmr);
+ } else if (Ty.isVector() && Ty.getSizeInBits() == 512) {
+ if (Alignment >= 64)
+ return IsLoad ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+ else
+ return IsLoad ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+ }
+ return Opc;
+}
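// A usage sketch for the mapping above, assuming an AVX-capable subtarget
// without AVX512; GprBank and VecBank stand in for register banks obtained
// from RBI and are not defined in this patch:
LLT S32 = LLT::scalar(32);
LLT V4S32 = LLT::vector(4, 32);
// A 32-bit GPR load selects the plain MOV form.
assert(getLoadStoreOp(S32, GprBank, TargetOpcode::G_LOAD, 4) == X86::MOV32rm);
// An aligned 128-bit vector store picks the aligned AVX move...
assert(getLoadStoreOp(V4S32, VecBank, TargetOpcode::G_STORE, 16) ==
       X86::VMOVAPSmr);
// ...while an under-aligned load falls back to the unaligned form.
assert(getLoadStoreOp(V4S32, VecBank, TargetOpcode::G_LOAD, 8) ==
       X86::VMOVUPSrm);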
+
+// Fill in an address from the given instruction.
+void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI,
+ X86AddressMode &AM) {
+
+ assert(I.getOperand(0).isReg() && "unsupported operand.");
+ assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
+ "unsupported type.");
+
+ if (I.getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ if (isInt<32>(Imm)) { // Check for displacement overflow.
+ AM.Disp = static_cast<int32_t>(Imm);
+ AM.Base.Reg = I.getOperand(1).getReg();
+ return;
+ }
+ }
+ } else if (I.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ AM.Base.FrameIndex = I.getOperand(1).getIndex();
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ return;
+ }
+
+ // Default behavior.
+ AM.Base.Reg = I.getOperand(0).getReg();
+ return;
+}
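// In MIR terms, the three cases the helper distinguishes look roughly like
// this (a sketch; virtual register names are invented for illustration):
//   %p = G_GEP %base, %c        where %c = G_CONSTANT i64 16
//     -> AM.Base.Reg = %base, AM.Disp = 16      (offset fits in 32 bits)
//   %f = G_FRAME_INDEX %stack.0
//     -> AM.BaseType = FrameIndexBase, AM.Base.FrameIndex = %stack.0
//   any other pointer producer
//     -> AM.Base.Reg = the def register itself  (default case)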
+
+bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ unsigned Opc = I.getOpcode();
+
+ if (Opc != TargetOpcode::G_STORE && Opc != TargetOpcode::G_LOAD)
+ return false;
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+
+ auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
+ unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
+ if (NewOpc == Opc)
+ return false;
+
+ X86AddressMode AM;
+ X86SelectAddress(*MRI.getVRegDef(I.getOperand(1).getReg()), MRI, AM);
+
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+ if (Opc == TargetOpcode::G_LOAD) {
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+ } else {
+ // G_STORE (VAL, Addr); the X86 store instruction takes (Addr, VAL).
+ I.RemoveOperand(1);
+ I.RemoveOperand(0);
+ addFullAddress(MIB, AM).addUse(DefReg);
+ }
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+static unsigned getLeaOP(LLT Ty, const X86Subtarget &STI) {
+ if (Ty == LLT::pointer(0, 64))
+ return X86::LEA64r;
+ else if (Ty == LLT::pointer(0, 32))
+ return STI.isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
+ else
+ llvm_unreachable("Can't get LEA opcode. Unsupported type.");
+}
+
+bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ unsigned Opc = I.getOpcode();
+
+ if (Opc != TargetOpcode::G_FRAME_INDEX && Opc != TargetOpcode::G_GEP)
+ return false;
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+
+ // Use LEA to compute both frame-index and GEP addresses.
+ unsigned NewOpc = getLeaOP(Ty, STI);
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+
+ if (Opc == TargetOpcode::G_FRAME_INDEX) {
+ addOffset(MIB, 0);
+ } else {
+ MachineOperand &InxOp = I.getOperand(2);
+ I.addOperand(InxOp); // set IndexReg
+ InxOp.ChangeToImmediate(1); // set Scale
+ MIB.addImm(0).addReg(0);
+ }
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ unsigned Opc = I.getOpcode();
+
+ if (Opc != TargetOpcode::G_GLOBAL_VALUE)
+ return false;
+
+ auto GV = I.getOperand(1).getGlobal();
+ if (GV->isThreadLocal()) {
+ return false; // TODO: we don't support TLS yet.
+ }
+
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return false;
+
+ X86AddressMode AM;
+ AM.GV = GV;
+ AM.GVOpFlags = STI.classifyGlobalReference(GV);
+
+ // TODO: The ABI requires an extra load; not supported yet.
+ if (isGlobalStubReference(AM.GVOpFlags))
+ return false;
+
+ // TODO: This reference is relative to the PIC base; not supported yet.
+ if (isGlobalRelativeToPICBase(AM.GVOpFlags))
+ return false;
+
+ if (STI.isPICStyleRIPRel()) {
+ // Use rip-relative addressing.
+ assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+ AM.Base.Reg = X86::RIP;
+ }
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+ unsigned NewOpc = getLeaOP(Ty, STI);
+
+ I.setDesc(TII.get(NewOpc));
+ MachineInstrBuilder MIB(MF, I);
+
+ I.RemoveOperand(1);
+ addFullAddress(MIB, AM);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::selectConstant(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_CONSTANT)
+ return false;
+
+ const unsigned DefReg = I.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DefReg);
+
+ if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID)
+ return false;
+
+ uint64_t Val = 0;
+ if (I.getOperand(1).isCImm()) {
+ Val = I.getOperand(1).getCImm()->getZExtValue();
+ I.getOperand(1).ChangeToImmediate(Val);
+ } else if (I.getOperand(1).isImm()) {
+ Val = I.getOperand(1).getImm();
+ } else
+ llvm_unreachable("Unsupported operand type.");
+
+ unsigned NewOpc;
+ switch (Ty.getSizeInBits()) {
+ case 8:
+ NewOpc = X86::MOV8ri;
+ break;
+ case 16:
+ NewOpc = X86::MOV16ri;
+ break;
+ case 32:
+ NewOpc = X86::MOV32ri;
+ break;
+ case 64: {
+ // TODO: if isUInt<32>(Val), X86::MOV32ri can be used instead.
+ if (isInt<32>(Val))
+ NewOpc = X86::MOV64ri32;
+ else
+ NewOpc = X86::MOV64ri;
+ break;
+ }
+ default:
+ llvm_unreachable("Can't select G_CONSTANT, unsupported type.");
+ }
+
+ I.setDesc(TII.get(NewOpc));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
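// The 64-bit case keys on whether the immediate survives sign-extension
// from 32 bits. A self-contained sketch of that check; the helper name is
// invented here:
#include "llvm/Support/MathExtras.h"
#include <cstdint>

static const char *mov64FormFor(uint64_t Val) {
  // MOV64ri32 sign-extends a 32-bit immediate; MOV64ri carries all 64 bits.
  return llvm::isInt<32>(Val) ? "MOV64ri32" : "MOV64ri";
}
// mov64FormFor(UINT64_C(0xFFFFFFFFFFFFFFFF)) -> "MOV64ri32"  (value is -1)
// mov64FormFor(UINT64_C(0x100000000))        -> "MOV64ri"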
+
+bool X86InstructionSelector::selectTrunc(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_TRUNC)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+
+ if (DstRB.getID() != SrcRB.getID()) {
+ DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+ return false;
+ }
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
+ if (!DstRC)
+ return false;
+
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+ if (!SrcRC)
+ return false;
+
+ unsigned SubIdx;
+ if (DstRC == SrcRC) {
+ // Nothing to be done
+ SubIdx = X86::NoSubRegister;
+ } else if (DstRC == &X86::GR32RegClass) {
+ SubIdx = X86::sub_32bit;
+ } else if (DstRC == &X86::GR16RegClass) {
+ SubIdx = X86::sub_16bit;
+ } else if (DstRC == &X86::GR8RegClass) {
+ SubIdx = X86::sub_8bit;
+ } else {
+ return false;
+ }
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ I.getOperand(1).setSubReg(SubIdx);
+
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+}
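// For example, an s32-to-s8 truncation on the GPR bank ends up as a
// subregister copy (a sketch):
//   Before: %1:gpr(s8) = G_TRUNC %0:gpr(s32)
//   After:  %1:gr8 = COPY %0.sub_8bit
// with %0 constrained to a GR32 subclass that actually has sub_8bit.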
+
+bool X86InstructionSelector::selectZext(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ZEXT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ if (SrcTy != LLT::scalar(1))
+ return false;
+
+ unsigned AndOpc;
+ if (DstTy == LLT::scalar(8))
+ AndOpc = X86::AND8ri;
+ else if (DstTy == LLT::scalar(16))
+ AndOpc = X86::AND16ri8;
+ else if (DstTy == LLT::scalar(32))
+ AndOpc = X86::AND32ri8;
+ else if (DstTy == LLT::scalar(64))
+ AndOpc = X86::AND64ri8;
+ else
+ return false;
+
+ unsigned DefReg = SrcReg;
+ if (DstTy != LLT::scalar(8)) {
+ DefReg = MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI));
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG), DefReg)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+ }
+
+ MachineInstr &AndInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg)
+ .addReg(DefReg)
+ .addImm(1);
+
+ constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
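// So a G_ZEXT of an s1 into s32 becomes a widen-then-mask pair; a sketch of
// the emitted MIR:
//   %t:gr32 = SUBREG_TO_REG 0, %src, %subreg.sub_8bit
//   %dst:gr32 = AND32ri8 %t, 1, implicit-def $eflags
// The boolean lives in the low bit, so masking with 1 completes the zext.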
+
+bool X86InstructionSelector::selectCmp(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_ICMP)
+ return false;
+
+ X86::CondCode CC;
+ bool SwapArgs;
+ std::tie(CC, SwapArgs) = X86::getX86ConditionCode(
+ (CmpInst::Predicate)I.getOperand(1).getPredicate());
+ unsigned OpSet = X86::getSETFromCond(CC);
+
+ unsigned LHS = I.getOperand(2).getReg();
+ unsigned RHS = I.getOperand(3).getReg();
+
+ if (SwapArgs)
+ std::swap(LHS, RHS);
+
+ unsigned OpCmp;
+ LLT Ty = MRI.getType(LHS);
+
+ switch (Ty.getSizeInBits()) {
+ default:
+ return false;
+ case 8:
+ OpCmp = X86::CMP8rr;
+ break;
+ case 16:
+ OpCmp = X86::CMP16rr;
+ break;
+ case 32:
+ OpCmp = X86::CMP32rr;
+ break;
+ case 64:
+ OpCmp = X86::CMP64rr;
+ break;
+ }
+
+ MachineInstr &CmpInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp))
+ .addReg(LHS)
+ .addReg(RHS);
+
+ MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(OpSet), I.getOperand(0).getReg());
+
+ constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI);
+ constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
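// A G_ICMP thus splits into a flag-producing compare plus a SETcc; e.g. for
// a 32-bit unsigned less-than (a sketch):
//   CMP32rr %lhs, %rhs, implicit-def $eflags
//   %dst:gr8 = SETBr implicit $eflags      ; CC = "below" for ICMP_ULT
// SwapArgs handles predicates that need the operands reversed first.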
+
+bool X86InstructionSelector::selectUadde(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_UADDE)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned CarryOutReg = I.getOperand(1).getReg();
+ const unsigned Op0Reg = I.getOperand(2).getReg();
+ const unsigned Op1Reg = I.getOperand(3).getReg();
+ unsigned CarryInReg = I.getOperand(4).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+
+ if (DstTy != LLT::scalar(32))
+ return false;
+
+ // Find the CarryIn def instruction.
+ MachineInstr *Def = MRI.getVRegDef(CarryInReg);
+ while (Def->getOpcode() == TargetOpcode::G_TRUNC) {
+ CarryInReg = Def->getOperand(1).getReg();
+ Def = MRI.getVRegDef(CarryInReg);
+ }
+
+ unsigned Opcode;
+ if (Def->getOpcode() == TargetOpcode::G_UADDE) {
+ // Carry set by a previous ADD.
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), X86::EFLAGS)
+ .addReg(CarryInReg);
+
+ if (!RBI.constrainGenericRegister(CarryInReg, X86::GR32RegClass, MRI))
+ return false;
+
+ Opcode = X86::ADC32rr;
+ } else if (auto Val = getConstantVRegVal(CarryInReg, MRI)) {
+ // Carry is a constant; only 0 is supported.
+ if (*Val != 0)
+ return false;
+
+ Opcode = X86::ADD32rr;
+ } else
+ return false;
+
+ MachineInstr &AddInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
+ .addReg(Op0Reg)
+ .addReg(Op1Reg);
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), CarryOutReg)
+ .addReg(X86::EFLAGS);
+
+ if (!constrainSelectedInstRegOperands(AddInst, TII, TRI, RBI) ||
+ !RBI.constrainGenericRegister(CarryOutReg, X86::GR32RegClass, MRI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
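// Chained 32-bit adds therefore lower to an ADD that defines EFLAGS followed
// by ADCs that consume them; one link of the chain, sketched:
//   %lo, %c0 = G_UADDE %a0, %b0, 0      ->  %lo = ADD32rr %a0, %b0
//   %hi, %c1 = G_UADDE %a1, %b1, %c0    ->  $eflags = COPY %c0
//                                           %hi = ADC32rr %a1, %b1
// Only a constant-zero or a previous G_UADDE carry-in is supported.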
+
+bool X86InstructionSelector::selectExtract(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_EXTRACT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ int64_t Index = I.getOperand(2).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ // For now, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % DstTy.getSizeInBits() != 0)
+ return false; // Not a subvector extract.
+
+ if (Index == 0) {
+ // Replace by extract subreg copy.
+ if (!emitExtractSubreg(DstReg, SrcReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (SrcTy.getSizeInBits() == 256 && DstTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VEXTRACTF128rr));
+ else
+ return false;
+ } else if (SrcTy.getSizeInBits() == 512 && HasAVX512) {
+ if (DstTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VEXTRACTF32x4Zrr));
+ else if (DstTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VEXTRACTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VEXTRACT immediate.
+ Index = Index / DstTy.getSizeInBits();
+ I.getOperand(2).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() > DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (DstTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (DstTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubIdx);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY), DstReg)
+ .addReg(SrcReg, 0, SubIdx);
+
+ return true;
+}
+
+bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
+ MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SubIdx = X86::NoSubRegister;
+
+ // TODO: support scalar types
+ if (!DstTy.isVector() || !SrcTy.isVector())
+ return false;
+
+ assert(SrcTy.getSizeInBits() < DstTy.getSizeInBits() &&
+ "Incorrect Src/Dst register size");
+
+ if (SrcTy.getSizeInBits() == 128)
+ SubIdx = X86::sub_xmm;
+ else if (SrcTy.getSizeInBits() == 256)
+ SubIdx = X86::sub_ymm;
+ else
+ return false;
+
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcReg, MRI);
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain INSERT_SUBREG\n");
+ return false;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::COPY))
+ .addReg(DstReg, RegState::DefineNoRead, SubIdx)
+ .addReg(SrcReg);
+
+ return true;
+}
+
+bool X86InstructionSelector::selectInsert(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+
+ if (I.getOpcode() != TargetOpcode::G_INSERT)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned InsertReg = I.getOperand(2).getReg();
+ int64_t Index = I.getOperand(3).getImm();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT InsertRegTy = MRI.getType(InsertReg);
+
+ // For now, handle vector types only.
+ if (!DstTy.isVector())
+ return false;
+
+ if (Index % InsertRegTy.getSizeInBits() != 0)
+ return false; // Not a subvector insert.
+
+ if (Index == 0 && MRI.getVRegDef(SrcReg)->isImplicitDef()) {
+ // Replace by subreg copy.
+ if (!emitInsertSubreg(DstReg, InsertReg, I, MRI, MF))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ bool HasAVX = STI.hasAVX();
+ bool HasAVX512 = STI.hasAVX512();
+ bool HasVLX = STI.hasVLX();
+
+ if (DstTy.getSizeInBits() == 256 && InsertRegTy.getSizeInBits() == 128) {
+ if (HasVLX)
+ I.setDesc(TII.get(X86::VINSERTF32x4Z256rr));
+ else if (HasAVX)
+ I.setDesc(TII.get(X86::VINSERTF128rr));
+ else
+ return false;
+ } else if (DstTy.getSizeInBits() == 512 && HasAVX512) {
+ if (InsertRegTy.getSizeInBits() == 128)
+ I.setDesc(TII.get(X86::VINSERTF32x4Zrr));
+ else if (InsertRegTy.getSizeInBits() == 256)
+ I.setDesc(TII.get(X86::VINSERTF64x4Zrr));
+ else
+ return false;
+ } else
+ return false;
+
+ // Convert to X86 VINSERT immediate.
+ Index = Index / InsertRegTy.getSizeInBits();
+
+ I.getOperand(3).setImm(Index);
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
+ return false;
+
+ // Split into a series of G_EXTRACTs.
+ unsigned NumDefs = I.getNumOperands() - 1;
+ unsigned SrcReg = I.getOperand(NumDefs).getReg();
+ unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
+
+ for (unsigned Idx = 0; Idx < NumDefs; ++Idx) {
+
+ MachineInstr &ExtrInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::G_EXTRACT), I.getOperand(Idx).getReg())
+ .addReg(SrcReg)
+ .addImm(Idx * DefSize);
+
+ if (!select(ExtrInst))
+ return false;
+ }
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectMergeValues(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ if (I.getOpcode() != TargetOpcode::G_MERGE_VALUES)
+ return false;
+
+ // Split into a chain of G_INSERTs.
+ unsigned DstReg = I.getOperand(0).getReg();
+ unsigned SrcReg0 = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg0);
+ unsigned SrcSize = SrcTy.getSizeInBits();
+
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ // For the first source, use emitInsertSubreg.
+ unsigned DefReg = MRI.createGenericVirtualRegister(DstTy);
+ MRI.setRegBank(DefReg, RegBank);
+ if (!emitInsertSubreg(DefReg, I.getOperand(1).getReg(), I, MRI, MF))
+ return false;
+
+ for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) {
+
+ unsigned Tmp = MRI.createGenericVirtualRegister(DstTy);
+ MRI.setRegBank(Tmp, RegBank);
+
+ MachineInstr &InsertInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::G_INSERT), Tmp)
+ .addReg(DefReg)
+ .addReg(I.getOperand(Idx).getReg())
+ .addImm((Idx - 1) * SrcSize);
+
+ DefReg = Tmp;
+
+ if (!select(InsertInst))
+ return false;
+ }
+
+ MachineInstr &CopyInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(DefReg);
+
+ if (!select(CopyInst))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
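// Merging two 128-bit halves into a 256-bit value thus becomes a subregister
// insert for the low half plus a G_INSERT for the high one (a sketch):
//   %m:vecr(<8 x s32>) = G_MERGE_VALUES %lo(<4 x s32>), %hi(<4 x s32>)
//     -> %t0 = subregister insert of %lo (sub_xmm)
//        %t1 = G_INSERT %t0, %hi, 128   ; re-selected, e.g. VINSERTF128rr
//        %m  = COPY %t1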
+
+InstructionSelector *
+llvm::createX86InstructionSelector(const X86TargetMachine &TM,
+ X86Subtarget &Subtarget,
+ X86RegisterBankInfo &RBI) {
+ return new X86InstructionSelector(TM, Subtarget, RBI);
+}
diff --git a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index d9edf46..f0ed4bc 100644
--- a/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -16,9 +16,11 @@
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
+#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
+namespace {
/// \brief This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
@@ -27,7 +29,6 @@ using namespace llvm;
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
-
class X86InterleavedAccessGroup {
/// \brief Reference to the wide-load instruction of an interleaved access
/// group.
@@ -50,9 +51,8 @@ class X86InterleavedAccessGroup {
IRBuilder<> &Builder;
/// \brief Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
- /// sub vectors of type \p T. Returns true and the sub-vectors in
- /// \p DecomposedVectors if it decomposes the Inst, returns false otherwise.
- bool decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
+ void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);
/// \brief Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -80,8 +80,7 @@ public:
/// target information \p STarget.
explicit X86InterleavedAccessGroup(Instruction *I,
ArrayRef<ShuffleVectorInst *> Shuffs,
- ArrayRef<unsigned> Ind,
- const unsigned F,
+ ArrayRef<unsigned> Ind, const unsigned F,
const X86Subtarget &STarget,
IRBuilder<> &B)
: Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
@@ -95,54 +94,68 @@ public:
/// instructions/intrinsics.
bool lowerIntoOptimizedSequence();
};
+} // end anonymous namespace
bool X86InterleavedAccessGroup::isSupported() const {
VectorType *ShuffleVecTy = Shuffles[0]->getType();
uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
- if (DL.getTypeSizeInBits(Inst->getType()) < Factor * ShuffleVecSize)
- return false;
+ // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
+ uint64_t ExpectedShuffleVecSize;
+ if (isa<LoadInst>(Inst))
+ ExpectedShuffleVecSize = 256;
+ else
+ ExpectedShuffleVecSize = 1024;
- // Currently, lowering is supported for 64 bits on AVX.
- if (!Subtarget.hasAVX() || ShuffleVecSize != 256 ||
+ if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
return false;
return true;
}
-bool X86InterleavedAccessGroup::decompose(
+void X86InterleavedAccessGroup::decompose(
Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {
+
+ assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
+ "Expected Load or Shuffle");
+
Type *VecTy = VecInst->getType();
(void)VecTy;
assert(VecTy->isVectorTy() &&
DL.getTypeSizeInBits(VecTy) >=
DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
"Invalid Inst-size!!!");
- assert(VecTy->getVectorElementType() == SubVecTy->getVectorElementType() &&
- "Element type mismatched!!!");
- if (!isa<LoadInst>(VecInst))
- return false;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
+ Value *Op0 = SVI->getOperand(0);
+ Value *Op1 = SVI->getOperand(1);
+
+ // Generate N (= NumSubVectors) shuffles of type T (= SubVecTy).
+ for (unsigned i = 0; i < NumSubVectors; ++i)
+ DecomposedVectors.push_back(
+ cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Indices[i],
+ SubVecTy->getVectorNumElements(), 0))));
+ return;
+ }
+ // Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);
Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
-
Value *VecBasePtr =
Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
- // Generate N loads of T type
+ // Generate N loads of T type.
for (unsigned i = 0; i < NumSubVectors; i++) {
- // TODO: Support inbounds GEP
+ // TODO: Support inbounds GEP.
Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
Builder.CreateAlignedLoad(NewBasePtr, LI->getAlignment());
DecomposedVectors.push_back(NewLoad);
}
-
- return true;
}
void X86InterleavedAccessGroup::transpose_4x4(
@@ -180,21 +193,46 @@ void X86InterleavedAccessGroup::transpose_4x4(
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
SmallVector<Instruction *, 4> DecomposedVectors;
- VectorType *VecTy = Shuffles[0]->getType();
- // Try to generate target-sized register(/instruction).
- if (!decompose(Inst, Factor, VecTy, DecomposedVectors))
- return false;
-
SmallVector<Value *, 4> TransposedVectors;
- // Perform matrix-transposition in order to compute interleaved
- // results by generating some sort of (optimized) target-specific
- // instructions.
+ VectorType *ShuffleTy = Shuffles[0]->getType();
+
+ if (isa<LoadInst>(Inst)) {
+ // Try to generate target-sized register(/instruction).
+ decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+
+ // Perform matrix-transposition in order to compute interleaved
+ // results by generating some sort of (optimized) target-specific
+ // instructions.
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+
+ // Now replace the unoptimized-interleaved-vectors with the
+ // transposed-interleaved vectors.
+ for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
+ Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+
+ return true;
+ }
+
+ Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
+ unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
+
+ // Lower the interleaved stores:
+ // 1. Decompose the interleaved wide shuffle into individual shuffle
+ // vectors.
+ decompose(Shuffles[0], Factor,
+ VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);
+
+ // 2. Transpose the interleaved-vectors into vectors of contiguous
+ // elements.
transpose_4x4(DecomposedVectors, TransposedVectors);
- // Now replace the unoptimized-interleaved-vectors with the
- // transposed-interleaved vectors.
- for (unsigned i = 0; i < Shuffles.size(); i++)
- Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
+ // 3. Concatenate the contiguous-vectors back into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, TransposedVectors);
+
+ // 4. Generate a store instruction for the wide vector.
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
+ SI->getAlignment());
return true;
}
@@ -219,3 +257,29 @@ bool X86TargetLowering::lowerInterleavedLoad(
return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
+
+bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
+ ShuffleVectorInst *SVI,
+ unsigned Factor) const {
+ assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+ "Invalid interleave factor");
+
+ assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
+ "Invalid interleaved store");
+
+ // Holds the indices of SVI that correspond to the starting index of each
+ // interleaved shuffle.
+ SmallVector<unsigned, 4> Indices;
+ auto Mask = SVI->getShuffleMask();
+ for (unsigned i = 0; i < Factor; i++)
+ Indices.push_back(Mask[i]);
+
+ ArrayRef<ShuffleVectorInst *> Shuffles = makeArrayRef(SVI);
+
+ // Create an interleaved access group.
+ IRBuilder<> Builder(SI);
+ X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
+ Builder);
+
+ return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
+}
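// End to end, a factor-4 interleaved store of i64 elements is matched from
// IR shaped like the following (a sketch; value names are invented):
//   %wide = shufflevector <16 x i64> %v0, <16 x i64> %v1,
//           <16 x i32> <0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15>
//   store <16 x i64> %wide, <16 x i64>* %ptr
// Indices becomes {0, 4, 8, 12}; decompose() splits %wide into four
// <4 x i64> shuffles, transpose_4x4() regroups them into contiguous runs,
// and a single wide store of their concatenation replaces the original.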
diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 63a02af..6b1add8 100644
--- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+ FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
};
struct IntrinsicData {
@@ -67,6 +67,23 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),
+
X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
@@ -325,6 +342,8 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
@@ -351,9 +370,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, ISD::ABS, 0),
X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
@@ -421,18 +440,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
- X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -455,18 +462,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+ X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
+ X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
-
+ X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADD_RND, 0),
+ X86ISD::FADDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FADD_RND, 0),
+ X86ISD::FADDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
@@ -501,12 +510,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
@@ -515,16 +518,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
X86ISD::CMPM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
- X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
@@ -720,9 +717,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIV_RND, 0),
+ X86ISD::FDIVS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FDIV_RND, 0),
+ X86ISD::FDIVS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
@@ -795,74 +792,42 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VGETMANTS, 0),
X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
X86ISD::VGETMANTS, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
- ISD::CTLZ, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
- X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMAX_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMAX_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMAXS, X86ISD::FMAXS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMAXS, X86ISD::FMAXS_RND),
X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
X86ISD::FMIN_RND),
- X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMIN_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMIN_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMINS, X86ISD::FMINS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
+ X86ISD::FMINS, X86ISD::FMINS_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
X86ISD::FMUL_RND),
X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMUL_RND, 0),
+ X86ISD::FMULS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FMUL_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+ X86ISD::FMULS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
@@ -1191,21 +1156,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
X86ISD::FSUB_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUB_RND, 0),
+ X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::FSUB_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
- X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
@@ -1486,6 +1439,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+ X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
@@ -1613,6 +1570,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1620,7 +1578,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse_max_ss, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
@@ -1631,6 +1591,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1643,7 +1604,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
+ X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
+ X86_INTRINSIC_DATA(sse2_min_sd, INTR_TYPE_2OP, X86ISD::FMINS, 0),
X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
@@ -1696,9 +1659,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, X86ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, ISD::ABS, 0),
+ X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, ISD::ABS, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
diff --git a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp
new file mode 100644
index 0000000..744ba21
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -0,0 +1,351 @@
+//===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86LegalizerInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+using namespace TargetOpcode;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
+ const X86TargetMachine &TM)
+ : Subtarget(STI), TM(TM) {
+
+ setLegalizerInfo32bit();
+ setLegalizerInfo64bit();
+ setLegalizerInfoSSE1();
+ setLegalizerInfoSSE2();
+ setLegalizerInfoSSE41();
+ setLegalizerInfoAVX();
+ setLegalizerInfoAVX2();
+ setLegalizerInfoAVX512();
+ setLegalizerInfoAVX512DQ();
+ setLegalizerInfoAVX512BW();
+
+ computeTables();
+}
+
+void X86LegalizerInfo::setLegalizerInfo32bit() {
+
+ if (Subtarget.is64Bit())
+ return;
+
+ const LLT p0 = LLT::pointer(0, 32);
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ for (auto Ty : {s8, s16, s32})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned Op : {G_UADDE}) {
+ setAction({Op, s32}, Legal);
+ setAction({Op, 1, s1}, Legal);
+ }
+
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({MemOp, Ty}, Legal);
+
+ setAction({MemOp, s1}, WidenScalar);
+ // And everything's fine in addrspace 0.
+ setAction({MemOp, 1, p0}, Legal);
+ }
+
+ // Pointer-handling
+ setAction({G_FRAME_INDEX, p0}, Legal);
+ setAction({G_GLOBAL_VALUE, p0}, Legal);
+
+ setAction({G_GEP, p0}, Legal);
+ setAction({G_GEP, 1, s32}, Legal);
+
+ for (auto Ty : {s1, s8, s16})
+ setAction({G_GEP, 1, Ty}, WidenScalar);
+
+ // Constants
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+
+ setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
+ setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar);
+
+ // Extensions
+ for (auto Ty : {s8, s16, s32}) {
+ setAction({G_ZEXT, Ty}, Legal);
+ setAction({G_SEXT, Ty}, Legal);
+ }
+
+ for (auto Ty : {s1, s8, s16}) {
+ setAction({G_ZEXT, 1, Ty}, Legal);
+ setAction({G_SEXT, 1, Ty}, Legal);
+ }
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
+}
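// Read as queries, a few entries of the 32-bit table above (a sketch):
//   {G_ADD, s32}      -> Legal        (covered by the BinOp loop)
//   {G_LOAD, s1}      -> WidenScalar  (booleans widen before memory ops)
//   {G_CONSTANT, s64} -> NarrowScalar (split into two 32-bit constants)
//   {G_GEP, 1, s16}   -> WidenScalar  (narrow offsets widen to s32)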
+
+void X86LegalizerInfo::setLegalizerInfo64bit() {
+
+ if (!Subtarget.is64Bit())
+ return;
+
+ const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8);
+ const LLT s1 = LLT::scalar(1);
+ const LLT s8 = LLT::scalar(8);
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ for (auto Ty : {s8, s16, s32, s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({MemOp, Ty}, Legal);
+
+ setAction({MemOp, s1}, WidenScalar);
+ // And everything's fine in addrspace 0.
+ setAction({MemOp, 1, p0}, Legal);
+ }
+
+ // Pointer-handling
+ setAction({G_FRAME_INDEX, p0}, Legal);
+ setAction({G_GLOBAL_VALUE, p0}, Legal);
+
+ setAction({G_GEP, p0}, Legal);
+ setAction({G_GEP, 1, s32}, Legal);
+ setAction({G_GEP, 1, s64}, Legal);
+
+ for (auto Ty : {s1, s8, s16})
+ setAction({G_GEP, 1, Ty}, WidenScalar);
+
+ // Constants
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+
+ setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
+
+ // Extensions
+ for (auto Ty : {s8, s16, s32, s64}) {
+ setAction({G_ZEXT, Ty}, Legal);
+ setAction({G_SEXT, Ty}, Legal);
+ }
+
+ for (auto Ty : {s1, s8, s16, s32}) {
+ setAction({G_ZEXT, 1, Ty}, Legal);
+ setAction({G_SEXT, 1, Ty}, Legal);
+ }
+
+ // Comparison
+ setAction({G_ICMP, s1}, Legal);
+
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({G_ICMP, 1, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE1() {
+ if (!Subtarget.hasSSE1())
+ return;
+
+ const LLT s32 = LLT::scalar(32);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (auto Ty : {s32, v4s32})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v4s32, v2s64})
+ setAction({MemOp, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE2() {
+ if (!Subtarget.hasSSE2())
+ return;
+
+ const LLT s64 = LLT::scalar(64);
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (auto Ty : {s64, v2s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64})
+ setAction({BinOp, Ty}, Legal);
+
+ setAction({G_MUL, v8s16}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE41() {
+ if (!Subtarget.hasSSE41())
+ return;
+
+ const LLT v4s32 = LLT::vector(4, 32);
+
+ setAction({G_MUL, v4s32}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX() {
+ if (!Subtarget.hasAVX())
+ return;
+
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v8s32, v4s64})
+ setAction({MemOp, Ty}, Legal);
+
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) {
+ setAction({G_INSERT, Ty}, Legal);
+ setAction({G_EXTRACT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {v16s8, v8s16, v4s32, v2s64}) {
+ setAction({G_INSERT, 1, Ty}, Legal);
+ setAction({G_EXTRACT, Ty}, Legal);
+ }
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX2() {
+ if (!Subtarget.hasAVX2())
+ return;
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
+ setAction({BinOp, Ty}, Legal);
+
+ for (auto Ty : {v16s16, v8s32})
+ setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512() {
+ if (!Subtarget.hasAVX512())
+ return;
+
+ const LLT v16s8 = LLT::vector(16, 8);
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v4s32 = LLT::vector(4, 32);
+ const LLT v2s64 = LLT::vector(2, 64);
+
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
+ const LLT v16s32 = LLT::vector(16, 32);
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v16s32, v8s64})
+ setAction({BinOp, Ty}, Legal);
+
+ setAction({G_MUL, v16s32}, Legal);
+
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ for (auto Ty : {v16s32, v8s64})
+ setAction({MemOp, Ty}, Legal);
+
+ for (auto Ty : {v64s8, v32s16, v16s32, v8s64}) {
+ setAction({G_INSERT, Ty}, Legal);
+ setAction({G_EXTRACT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {v32s8, v16s16, v8s32, v4s64, v16s8, v8s16, v4s32, v2s64}) {
+ setAction({G_INSERT, 1, Ty}, Legal);
+ setAction({G_EXTRACT, Ty}, Legal);
+ }
+
+ /************ VLX *******************/
+ if (!Subtarget.hasVLX())
+ return;
+
+ for (auto Ty : {v4s32, v8s32})
+ setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512DQ() {
+ if (!(Subtarget.hasAVX512() && Subtarget.hasDQI()))
+ return;
+
+ const LLT v8s64 = LLT::vector(8, 64);
+
+ setAction({G_MUL, v8s64}, Legal);
+
+ /************ VLX *******************/
+ if (!Subtarget.hasVLX())
+ return;
+
+ const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v4s64 = LLT::vector(4, 64);
+
+ for (auto Ty : {v2s64, v4s64})
+ setAction({G_MUL, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoAVX512BW() {
+ if (!(Subtarget.hasAVX512() && Subtarget.hasBWI()))
+ return;
+
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
+
+ for (unsigned BinOp : {G_ADD, G_SUB})
+ for (auto Ty : {v64s8, v32s16})
+ setAction({BinOp, Ty}, Legal);
+
+ setAction({G_MUL, v32s16}, Legal);
+
+ /************ VLX *******************/
+ if (!Subtarget.hasVLX())
+ return;
+
+ const LLT v8s16 = LLT::vector(8, 16);
+ const LLT v16s16 = LLT::vector(16, 16);
+
+ for (auto Ty : {v8s16, v16s16})
+ setAction({G_MUL, Ty}, Legal);
+}
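The per-feature methods above all follow one shape: bail out when the subtarget lacks the extension, then mark the (opcode, type) pairs that extension makes natively selectable as Legal. A sketch, not part of this patch, of how the constructor would chain them in feature order (assuming LegalizerInfo's computeTables() finalizer):

X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
                                   const X86TargetMachine &TM)
    : Subtarget(STI), TM(TM) {
  // Start from the baseline and let each wider ISA extension refine it.
  setLegalizerInfo32bit();
  setLegalizerInfo64bit();
  setLegalizerInfoSSE1();
  setLegalizerInfoSSE2();
  setLegalizerInfoSSE41();
  setLegalizerInfoAVX();
  setLegalizerInfoAVX2();
  setLegalizerInfoAVX512();
  setLegalizerInfoAVX512DQ();
  setLegalizerInfoAVX512BW();
  computeTables(); // assumed: LegalizerInfo requires this after setAction calls
}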
diff --git a/contrib/llvm/lib/Target/X86/X86LegalizerInfo.h b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.h
new file mode 100644
index 0000000..135950a
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86LegalizerInfo.h
@@ -0,0 +1,49 @@
+//===- X86LegalizerInfo.h ----------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class X86Subtarget;
+class X86TargetMachine;
+
+/// This class provides the legalization rules for the X86 target.
+class X86LegalizerInfo : public LegalizerInfo {
+private:
+ /// Keep a reference to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget &Subtarget;
+ const X86TargetMachine &TM;
+
+public:
+ X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
+
+private:
+ void setLegalizerInfo32bit();
+ void setLegalizerInfo64bit();
+ void setLegalizerInfoSSE1();
+ void setLegalizerInfoSSE2();
+ void setLegalizerInfoSSE41();
+ void setLegalizerInfoAVX();
+ void setLegalizerInfoAVX2();
+ void setLegalizerInfoAVX512();
+ void setLegalizerInfoAVX512DQ();
+ void setLegalizerInfoAVX512BW();
+};
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
index feeb2fd..fd2837b 100644
--- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -12,20 +12,21 @@
//
//===----------------------------------------------------------------------===//
-#include "X86AsmPrinter.h"
-#include "X86RegisterInfo.h"
-#include "X86ShuffleDecodeConstantPool.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "InstPrinter/X86InstComments.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "Utils/X86ShuffleDecode.h"
+#include "X86AsmPrinter.h"
+#include "X86RegisterInfo.h"
+#include "X86ShuffleDecodeConstantPool.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
@@ -38,13 +39,12 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
@@ -102,7 +102,7 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), EnablePrintSchedInfo);
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
@@ -215,6 +215,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
+ case X86II::MO_ABS8: RefKind = MCSymbolRefExpr::VK_X86_ABS8; break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, Ctx);
@@ -357,7 +358,7 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
const MachineOperand &MO) const {
switch (MO.getType()) {
default:
- MI->dump();
+ MI->print(errs());
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
@@ -498,11 +499,16 @@ ReSimplify:
break;
}
- // TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction.
+  // TAILJMPd, TAILJMPd64, TAILJMPd_CC, TAILJMPd64_CC - Lower to the correct
+  // jump instruction.
{ unsigned Opcode;
case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
case X86::TAILJMPd:
case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPd64_CC:
+ Opcode = X86::GetCondBranchFromCond(
+ static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
+ goto SetTailJmpOpcode;
SetTailJmpOpcode:
MCOperand Saved = OutMI.getOperand(0);
@@ -888,30 +894,47 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
SM.recordStatepoint(MI);
}
-void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
- X86MCInstLower &MCIL) {
- // FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>
+void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
+ X86MCInstLower &MCIL) {
+  // FAULTING_OP <def>, <faulting type>, <MBB handler>,
+  //             <opcode>, <operands>
- unsigned LoadDefRegister = MI.getOperand(0).getReg();
- MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol();
- unsigned LoadOpcode = MI.getOperand(2).getImm();
- unsigned LoadOperandsBeginIdx = 3;
+ unsigned DefRegister = FaultingMI.getOperand(0).getReg();
+ FaultMaps::FaultKind FK =
+ static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
+ MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
+ unsigned Opcode = FaultingMI.getOperand(3).getImm();
+ unsigned OperandsBeginIdx = 4;
- FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel);
+ assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
+ FM.recordFaultingOp(FK, HandlerLabel);
- MCInst LoadMI;
- LoadMI.setOpcode(LoadOpcode);
+ MCInst MI;
+ MI.setOpcode(Opcode);
- if (LoadDefRegister != X86::NoRegister)
- LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
+ if (DefRegister != X86::NoRegister)
+ MI.addOperand(MCOperand::createReg(DefRegister));
- for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
- E = MI.operands_end();
+ for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
+ E = FaultingMI.operands_end();
I != E; ++I)
- if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I))
- LoadMI.addOperand(MaybeOperand.getValue());
+ if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
+ MI.addOperand(MaybeOperand.getValue());
+
+ OutStreamer->EmitInstruction(MI, getSubtargetInfo());
+}
- OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
+void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+ bool Is64Bits = Subtarget->is64Bit();
+ MCContext &Ctx = OutStreamer->getContext();
+ MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
+ const MCSymbolRefExpr *Op =
+ MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_None, Ctx);
+
+ EmitAndCountInstruction(
+ MCInstBuilder(Is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
+ .addExpr(Op));
}
void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
@@ -1017,6 +1040,83 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
getSubtargetInfo());
}
+void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
+ X86MCInstLower &MCIL) {
+  assert(Subtarget->is64Bit() && "XRay custom events only supported on X86-64");
+
+ // We want to emit the following pattern, which follows the x86 calling
+ // convention to prepare for the trampoline call to be patched in.
+ //
+ // <args placement according SysV64 calling convention>
+ // .p2align 1, ...
+ // .Lxray_event_sled_N:
+ // jmp +N // jump across the call instruction
+ // callq __xray_CustomEvent // force relocation to symbol
+ // <args cleanup, jump to here>
+ //
+ // The relative jump needs to jump forward 24 bytes:
+ // 10 (args) + 5 (nops) + 9 (cleanup)
+ //
+ // After patching, it would look something like:
+ //
+ // nopw (2-byte nop)
+  //   callq __xray_CustomEvent  // already lowered
+ //
+ // ---
+ // First we emit the label and the jump.
+ auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
+ OutStreamer->AddComment("# XRay Custom Event Log");
+ OutStreamer->EmitCodeAlignment(2);
+ OutStreamer->EmitLabel(CurSled);
+
+ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
+ // an operand (computed as an offset from the jmp instruction).
+  // FIXME: Find another less hacky way to force the relative jump.
+ OutStreamer->EmitBytes("\xeb\x14");
+
+  // The default C calling convention will place two arguments into %rdi and
+  // %rsi -- so we only work with those.
+ unsigned UsedRegs[] = {X86::RDI, X86::RSI, X86::RAX};
+
+ // Because we will use %rax, we preserve that across the call.
+ EmitAndCountInstruction(MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
+
+ // Then we put the operands in the %rdi and %rsi registers.
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I)
+ if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
+ if (Op->isImm())
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
+ .addReg(UsedRegs[I])
+ .addImm(Op->getImm()));
+ else if (Op->isReg()) {
+ if (Op->getReg() != UsedRegs[I])
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
+ .addReg(UsedRegs[I])
+ .addReg(Op->getReg()));
+ else
+ EmitNops(*OutStreamer, 3, Subtarget->is64Bit(), getSubtargetInfo());
+ }
+ }
+
+ // We emit a hard dependency on the __xray_CustomEvent symbol, which is the
+ // name of the trampoline to be implemented by the XRay runtime. We put this
+ // explicitly in the %rax register.
+ auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
+ MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
+ .addReg(X86::RAX)
+ .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
+
+ // Emit the call instruction.
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(X86::RAX));
+
+ // Restore caller-saved and used registers.
+ OutStreamer->AddComment("xray custom event end.");
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(X86::RAX));
+
+ recordSled(CurSled, MI, SledKind::CUSTOM_EVENT);
+}
+
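On the source side this machinery is reached through Clang's XRay custom event builtin; a minimal user-level sketch, assuming the __xray_customevent builtin and -fxray-instrument (neither is part of this patch):

// example.cpp -- build with something like: clang++ -fxray-instrument example.cpp
#include <cstring>

[[clang::xray_always_instrument]] void handleRequest() {
  const char Msg[] = "request-start";
  // Becomes TargetOpcode::PATCHABLE_EVENT_CALL, which
  // LowerPATCHABLE_EVENT_CALL above turns into the jmp + call sled.
  __xray_customevent(Msg, std::strlen(Msg));
}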
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// We want to emit the following pattern:
@@ -1232,6 +1332,32 @@ static std::string getShuffleComment(const MachineInstr *MI,
return Comment;
}
+static void printConstant(const Constant *COp, raw_ostream &CS) {
+ if (isa<UndefValue>(COp)) {
+ CS << "u";
+ } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
+ if (CI->getBitWidth() <= 64) {
+ CS << CI->getZExtValue();
+ } else {
+ // print multi-word constant as (w0,w1)
+ const auto &Val = CI->getValue();
+ CS << "(";
+ for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
+ if (i > 0)
+ CS << ",";
+ CS << Val.getRawData()[i];
+ }
+ CS << ")";
+ }
+ } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
+ SmallString<32> Str;
+ CF->getValueAPF().toString(Str);
+ CS << Str;
+ } else {
+ CS << "?";
+ }
+}
+
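For reference, printConstant() renders the cases it handles as follows (inputs and outputs are illustrative):

//   undef element                 -> "u"
//   ConstantInt i32 42            -> "42"
//   ConstantInt wider than 64 bit -> "(w0,w1,...)", least-significant word
//                                    first, per APInt::getRawData() order
//   ConstantFP 0.5                -> "0.5" (via APFloat::toString)
//   anything else                 -> "?"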
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
@@ -1276,9 +1402,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::TAILJMPr:
case X86::TAILJMPm:
case X86::TAILJMPd:
+ case X86::TAILJMPd_CC:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
case X86::TAILJMPd64:
+ case X86::TAILJMPd64_CC:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
// Lower these as normal, but add some comments.
@@ -1367,8 +1495,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::STATEPOINT:
return LowerSTATEPOINT(*MI, MCInstLowering);
- case TargetOpcode::FAULTING_LOAD_OP:
- return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
+ case TargetOpcode::FAULTING_OP:
+ return LowerFAULTING_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::FENTRY_CALL:
+ return LowerFENTRY_CALL(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_OP:
return LowerPATCHABLE_OP(*MI, MCInstLowering);
@@ -1387,6 +1518,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::PATCHABLE_TAIL_CALL:
return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
case X86::MORESTACK_RET:
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
@@ -1501,7 +1635,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
@@ -1572,15 +1707,16 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
case X86::VPERMIL2PDrm:
case X86::VPERMIL2PSrm:
- case X86::VPERMIL2PDrmY:
- case X86::VPERMIL2PSrmY: {
+ case X86::VPERMIL2PDYrm:
+ case X86::VPERMIL2PSYrm: {
if (!OutStreamer->isVerboseAsm())
break;
assert(MI->getNumOperands() >= 8 &&
@@ -1593,8 +1729,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
- case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
+ case X86::VPERMIL2PSrm: case X86::VPERMIL2PSYrm: ElSize = 32; break;
+ case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
}
const MachineOperand &MaskOp = MI->getOperand(6);
@@ -1602,7 +1738,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
@@ -1618,7 +1755,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask),
+ !EnablePrintSchedInfo);
}
break;
}
@@ -1654,66 +1792,159 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
// For loads from a constant pool to a vector register, print the constant
// loaded.
CASE_ALL_MOV_RM()
+ case X86::VBROADCASTF128:
+ case X86::VBROADCASTI128:
+ case X86::VBROADCASTF32X4Z256rm:
+ case X86::VBROADCASTF32X4rm:
+ case X86::VBROADCASTF32X8rm:
+ case X86::VBROADCASTF64X2Z128rm:
+ case X86::VBROADCASTF64X2rm:
+ case X86::VBROADCASTF64X4rm:
+ case X86::VBROADCASTI32X4Z256rm:
+ case X86::VBROADCASTI32X4rm:
+ case X86::VBROADCASTI32X8rm:
+ case X86::VBROADCASTI64X2Z128rm:
+ case X86::VBROADCASTI64X2rm:
+ case X86::VBROADCASTI64X4rm:
if (!OutStreamer->isVerboseAsm())
break;
if (MI->getNumOperands() <= 4)
break;
if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ int NumLanes = 1;
+ // Override NumLanes for the broadcast instructions.
+ switch (MI->getOpcode()) {
+ case X86::VBROADCASTF128: NumLanes = 2; break;
+ case X86::VBROADCASTI128: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTF32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTF32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTF64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTF64X4rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break;
+ case X86::VBROADCASTI32X4rm: NumLanes = 4; break;
+ case X86::VBROADCASTI32X8rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break;
+ case X86::VBROADCASTI64X2rm: NumLanes = 4; break;
+ case X86::VBROADCASTI64X4rm: NumLanes = 2; break;
+ }
+
std::string Comment;
raw_string_ostream CS(Comment);
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
CS << "[";
- for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
- if (i != 0)
- CS << ",";
- if (CDS->getElementType()->isIntegerTy())
- CS << CDS->getElementAsInteger(i);
- else if (CDS->getElementType()->isFloatTy())
- CS << CDS->getElementAsFloat(i);
- else if (CDS->getElementType()->isDoubleTy())
- CS << CDS->getElementAsDouble(i);
- else
- CS << "?";
+ for (int l = 0; l != NumLanes; ++l) {
+ for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
+ if (i != 0 || l != 0)
+ CS << ",";
+ if (CDS->getElementType()->isIntegerTy())
+ CS << CDS->getElementAsInteger(i);
+ else if (CDS->getElementType()->isFloatTy())
+ CS << CDS->getElementAsFloat(i);
+ else if (CDS->getElementType()->isDoubleTy())
+ CS << CDS->getElementAsDouble(i);
+ else
+ CS << "?";
+ }
}
CS << "]";
- OutStreamer->AddComment(CS.str());
+ OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
- for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
- if (i != 0)
- CS << ",";
- Constant *COp = CV->getOperand(i);
- if (isa<UndefValue>(COp)) {
- CS << "u";
- } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
- if (CI->getBitWidth() <= 64) {
- CS << CI->getZExtValue();
- } else {
- // print multi-word constant as (w0,w1)
- const auto &Val = CI->getValue();
- CS << "(";
- for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
- if (i > 0)
- CS << ",";
- CS << Val.getRawData()[i];
- }
- CS << ")";
- }
- } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
- SmallString<32> Str;
- CF->getValueAPF().toString(Str);
- CS << Str;
- } else {
- CS << "?";
+ for (int l = 0; l != NumLanes; ++l) {
+ for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
+ if (i != 0 || l != 0)
+ CS << ",";
+ printConstant(CV->getOperand(i), CS);
}
}
CS << ">";
- OutStreamer->AddComment(CS.str());
+ OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
}
}
break;
+ case X86::VBROADCASTSSrm:
+ case X86::VBROADCASTSSYrm:
+ case X86::VBROADCASTSSZ128m:
+ case X86::VBROADCASTSSZ256m:
+ case X86::VBROADCASTSSZm:
+ case X86::VBROADCASTSDYrm:
+ case X86::VBROADCASTSDZ256m:
+ case X86::VBROADCASTSDZm:
+ case X86::VPBROADCASTBrm:
+ case X86::VPBROADCASTBYrm:
+ case X86::VPBROADCASTBZ128m:
+ case X86::VPBROADCASTBZ256m:
+ case X86::VPBROADCASTBZm:
+ case X86::VPBROADCASTDrm:
+ case X86::VPBROADCASTDYrm:
+ case X86::VPBROADCASTDZ128m:
+ case X86::VPBROADCASTDZ256m:
+ case X86::VPBROADCASTDZm:
+ case X86::VPBROADCASTQrm:
+ case X86::VPBROADCASTQYrm:
+ case X86::VPBROADCASTQZ128m:
+ case X86::VPBROADCASTQZ256m:
+ case X86::VPBROADCASTQZm:
+ case X86::VPBROADCASTWrm:
+ case X86::VPBROADCASTWYrm:
+ case X86::VPBROADCASTWZ128m:
+ case X86::VPBROADCASTWZ256m:
+ case X86::VPBROADCASTWZm:
+ if (!OutStreamer->isVerboseAsm())
+ break;
+ if (MI->getNumOperands() <= 4)
+ break;
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ int NumElts;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::VBROADCASTSSrm: NumElts = 4; break;
+ case X86::VBROADCASTSSYrm: NumElts = 8; break;
+ case X86::VBROADCASTSSZ128m: NumElts = 4; break;
+ case X86::VBROADCASTSSZ256m: NumElts = 8; break;
+ case X86::VBROADCASTSSZm: NumElts = 16; break;
+ case X86::VBROADCASTSDYrm: NumElts = 4; break;
+ case X86::VBROADCASTSDZ256m: NumElts = 4; break;
+ case X86::VBROADCASTSDZm: NumElts = 8; break;
+ case X86::VPBROADCASTBrm: NumElts = 16; break;
+ case X86::VPBROADCASTBYrm: NumElts = 32; break;
+ case X86::VPBROADCASTBZ128m: NumElts = 16; break;
+ case X86::VPBROADCASTBZ256m: NumElts = 32; break;
+ case X86::VPBROADCASTBZm: NumElts = 64; break;
+ case X86::VPBROADCASTDrm: NumElts = 4; break;
+ case X86::VPBROADCASTDYrm: NumElts = 8; break;
+ case X86::VPBROADCASTDZ128m: NumElts = 4; break;
+ case X86::VPBROADCASTDZ256m: NumElts = 8; break;
+ case X86::VPBROADCASTDZm: NumElts = 16; break;
+ case X86::VPBROADCASTQrm: NumElts = 2; break;
+ case X86::VPBROADCASTQYrm: NumElts = 4; break;
+ case X86::VPBROADCASTQZ128m: NumElts = 2; break;
+ case X86::VPBROADCASTQZ256m: NumElts = 4; break;
+ case X86::VPBROADCASTQZm: NumElts = 8; break;
+ case X86::VPBROADCASTWrm: NumElts = 8; break;
+ case X86::VPBROADCASTWYrm: NumElts = 16; break;
+ case X86::VPBROADCASTWZ128m: NumElts = 8; break;
+ case X86::VPBROADCASTWZ256m: NumElts = 16; break;
+ case X86::VPBROADCASTWZm: NumElts = 32; break;
+ }
+
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+ CS << "[";
+ for (int i = 0; i != NumElts; ++i) {
+ if (i != 0)
+ CS << ",";
+ printConstant(C, CS);
+ }
+ CS << "]";
+ OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo);
+ }
}
MCInst TmpInst;
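With the NumLanes and NumElts overrides, broadcast loads get their constant repeated across the whole destination, so verbose-asm comments come out roughly like this (registers and constants are illustrative):

  vbroadcastss .LCPI0_0(%rip), %xmm0    # xmm0 = [0.5,0.5,0.5,0.5]
  vbroadcastf128 .LCPI0_1(%rip), %ymm1  # ymm1 = [1,2,3,4,1,2,3,4]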
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
index c9e636f..3fcb642 100644
--- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -9,6 +9,7 @@
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -20,11 +21,8 @@ void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
MF->getSubtarget().getRegisterInfo());
unsigned SlotSize = RegInfo->getSlotSize();
- for (const MCPhysReg *CSR =
- RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
- unsigned Reg = *CSR;
- ++CSR)
- {
+ for (const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs();
+ unsigned Reg = *CSR; ++CSR) {
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
RestoreBasePointerOffset -= SlotSize;
}
diff --git a/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp b/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp
new file mode 100644
index 0000000..8fdf106
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -0,0 +1,202 @@
+//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the X86 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MacroFusion.h"
+
+using namespace llvm;
+
+/// \brief Check if the instruction pair, FirstMI and SecondMI, should be
+/// fused together. When FirstMI is unspecified, check whether SecondMI may
+/// be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI);
+ // Check if this processor supports macro-fusion. Since this is a minor
+ // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+ // proxy for SandyBridge+.
+ if (!ST.hasAVX())
+ return false;
+
+ enum {
+ FuseTest,
+ FuseCmp,
+ FuseInc
+ } FuseKind;
+
+ unsigned FirstOpcode = FirstMI
+ ? FirstMI->getOpcode()
+ : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ switch (SecondOpcode) {
+ default:
+ return false;
+ case X86::JE_1:
+ case X86::JNE_1:
+ case X86::JL_1:
+ case X86::JLE_1:
+ case X86::JG_1:
+ case X86::JGE_1:
+ FuseKind = FuseInc;
+ break;
+ case X86::JB_1:
+ case X86::JBE_1:
+ case X86::JA_1:
+ case X86::JAE_1:
+ FuseKind = FuseCmp;
+ break;
+ case X86::JS_1:
+ case X86::JNS_1:
+ case X86::JP_1:
+ case X86::JNP_1:
+ case X86::JO_1:
+ case X86::JNO_1:
+ FuseKind = FuseTest;
+ break;
+ }
+
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case X86::TEST8rr:
+ case X86::TEST16rr:
+ case X86::TEST32rr:
+ case X86::TEST64rr:
+ case X86::TEST8ri:
+ case X86::TEST16ri:
+ case X86::TEST32ri:
+ case X86::TEST32i32:
+ case X86::TEST64i32:
+ case X86::TEST64ri32:
+ case X86::TEST8rm:
+ case X86::TEST16rm:
+ case X86::TEST32rm:
+ case X86::TEST64rm:
+ case X86::TEST8ri_NOREX:
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ return true;
+ case X86::CMP16i16:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP32i32:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP64i32:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP8i8:
+ case X86::CMP8ri:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16ri8_DB:
+ case X86::ADD16ri_DB:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_DB:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32ri8_DB:
+ case X86::ADD32ri_DB:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_DB:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri32_DB:
+ case X86::ADD64ri8:
+ case X86::ADD64ri8_DB:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_DB:
+ case X86::ADD8i8:
+ case X86::ADD8mi:
+ case X86::ADD8mr:
+ case X86::ADD8ri:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ return FuseKind == FuseCmp || FuseKind == FuseInc;
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64r:
+ case X86::INC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FuseKind == FuseInc;
+ case X86::INSTRUCTION_LIST_END:
+ return true;
+ }
+}
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation() {
+ return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
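Reading the two switches together gives the fusion matrix. A few illustrative pairs, derived from the cases above:

//   test %eax, %eax ; js  .L  -> fused     (TEST*/AND* fuse with every kind)
//   cmp  $1, %eax   ; jne .L  -> fused     (CMP*/ADD*/SUB* fuse with the Cmp
//                                           and Inc kinds)
//   inc  %eax       ; jae .L  -> not fused (INC*/DEC* fuse only with FuseInc,
//                                           and JAE selects FuseCmp)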
diff --git a/contrib/llvm/lib/Target/X86/X86MacroFusion.h b/contrib/llvm/lib/Target/X86/X86MacroFusion.h
new file mode 100644
index 0000000..13fa2d7
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86MacroFusion.h
@@ -0,0 +1,25 @@
+//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the X86 definition of the DAG scheduling mutation
+/// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createX86MacroFusionDAGMutation());
+/// to X86PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation();
+
+} // end namespace llvm
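A sketch of the hookup the note above asks for; the surrounding X86PassConfig method is paraphrased, not part of this patch:

ScheduleDAGInstrs *
X86PassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createX86MacroFusionDAGMutation()); // enable macro fusion
  return DAG;
}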
diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index e144700..e6756b9 100644
--- a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -27,6 +27,8 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -262,6 +264,12 @@ private:
/// \brief Removes redundant address calculations.
bool removeRedundantAddrCalc(MemOpMap &LEAs);
+ /// Replace debug value MI with a new debug value instruction using register
+ /// VReg with an appropriate offset and DIExpression to incorporate the
+ /// address displacement AddrDispShift. Return new debug value instruction.
+ MachineInstr *replaceDebugValue(MachineInstr &MI, unsigned VReg,
+ int64_t AddrDispShift);
+
/// \brief Removes LEAs which calculate similar addresses.
bool removeRedundantLEAs(MemOpMap &LEAs);
@@ -389,9 +397,6 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
assert(isLEA(First) && isLEA(Last) &&
"The function works only with LEA instructions");
- // Get new address displacement.
- AddrDispShift = getAddrDispShift(Last, 1, First, 1);
-
// Make sure that LEA def registers belong to the same class. There may be
// instructions (like MOV8mr_NOREX) which allow a limited set of registers to
// be used as their operands, so we must be sure that replacing one LEA
@@ -400,10 +405,13 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
MRI->getRegClass(Last.getOperand(0).getReg()))
return false;
+ // Get new address displacement.
+ AddrDispShift = getAddrDispShift(Last, 1, First, 1);
+
// Loop over all uses of the Last LEA to check that its def register is
// used only as address base for memory accesses. If so, it can be
// replaced, otherwise - no.
- for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) {
+ for (auto &MO : MRI->use_nodbg_operands(Last.getOperand(0).getReg())) {
MachineInstr &MI = *MO.getParent();
// Get the number of the first memory operand.
@@ -532,6 +540,25 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
return Changed;
}
+MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
+ unsigned VReg,
+ int64_t AddrDispShift) {
+ DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression());
+
+ if (AddrDispShift != 0)
+ Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, AddrDispShift,
+ DIExpression::WithStackValue);
+
+ // Replace DBG_VALUE instruction with modified version.
+ MachineBasicBlock *MBB = MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ bool IsIndirect = MI.isIndirectDebugValue();
+ int64_t Offset = IsIndirect ? MI.getOperand(1).getImm() : 0;
+ const MDNode *Var = MI.getDebugVariable();
+ return BuildMI(*MBB, MBB->erase(&MI), DL, TII->get(TargetOpcode::DBG_VALUE),
+ IsIndirect, VReg, Offset, Var, Expr);
+}
+
// Try to find similar LEAs in the list and replace one with another.
bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
bool Changed = false;
@@ -563,12 +590,21 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
// Loop over all uses of the Last LEA and update their operands. Note
// that the correctness of this has already been checked in the
// isReplaceable function.
- for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()),
- UE = MRI->use_end();
+ unsigned FirstVReg = First.getOperand(0).getReg();
+ unsigned LastVReg = Last.getOperand(0).getReg();
+ for (auto UI = MRI->use_begin(LastVReg), UE = MRI->use_end();
UI != UE;) {
MachineOperand &MO = *UI++;
MachineInstr &MI = *MO.getParent();
+ if (MI.isDebugValue()) {
+ // Replace DBG_VALUE instruction with modified version using the
+ // register from the replacing LEA and the address displacement
+ // between the LEA instructions.
+ replaceDebugValue(MI, FirstVReg, AddrDispShift);
+ continue;
+ }
+
// Get the number of the first memory operand.
const MCInstrDesc &Desc = MI.getDesc();
int MemOpNo =
@@ -576,7 +612,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
X86II::getOperandBias(Desc);
// Update address base.
- MO.setReg(First.getOperand(0).getReg());
+ MO.setReg(FirstVReg);
// Update address disp.
MachineOperand &Op = MI.getOperand(MemOpNo + X86::AddrDisp);
@@ -587,14 +623,14 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
}
// Since we can possibly extend register lifetime, clear kill flags.
- MRI->clearKillFlags(First.getOperand(0).getReg());
+ MRI->clearKillFlags(FirstVReg);
++NumRedundantLEAs;
DEBUG(dbgs() << "OptimizeLEAs: Remove redundant LEA: "; Last.dump(););
// By this moment, all of the Last LEA's uses must be replaced. So we
// can freely remove it.
- assert(MRI->use_empty(Last.getOperand(0).getReg()) &&
+ assert(MRI->use_empty(LastVReg) &&
"The LEA's def register must have no uses");
Last.eraseFromParent();
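To picture replaceDebugValue() with concrete operands (the vregs and displacement below are assumed, and the exact DWARF opcodes depend on how DIExpression::prepend encodes the offset):

//   %vreg1 = LEA64r %rsp, 1, %noreg, 16, %noreg  ; First (kept)
//   %vreg2 = LEA64r %rsp, 1, %noreg, 24, %noreg  ; Last (removed), disp = +8
//   DBG_VALUE %vreg2, !DIExpression()
// is rebuilt by replaceDebugValue(MI, /*VReg=*/%vreg1, /*AddrDispShift=*/8) as
//   DBG_VALUE %vreg1, !DIExpression(<+8>, DW_OP_stack_value)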
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
new file mode 100644
index 0000000..efd3df2
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -0,0 +1,245 @@
+//===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterBankInfo.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "X86GenRegisterBank.inc"
+
+using namespace llvm;
+// This file will be TableGen'ed at some point.
+#define GET_TARGET_REGBANK_INFO_IMPL
+#include "X86GenRegisterBankInfo.def"
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI)
+ : X86GenRegisterBankInfo() {
+
+  // Validate the RegBank initialization.
+  const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID);
+  (void)RBGPR;
+  assert(&X86::GPRRegBank == &RBGPR && "Incorrect RegBanks initialization.");
+
+ // The GPR register bank is fully defined by all the registers in
+ // GR64 + its subclasses.
+ assert(RBGPR.covers(*TRI.getRegClass(X86::GR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+}
+
+const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass(
+ const TargetRegisterClass &RC) const {
+
+ if (X86::GR8RegClass.hasSubClassEq(&RC) ||
+ X86::GR16RegClass.hasSubClassEq(&RC) ||
+ X86::GR32RegClass.hasSubClassEq(&RC) ||
+ X86::GR64RegClass.hasSubClassEq(&RC))
+ return getRegBank(X86::GPRRegBankID);
+
+ if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
+ X86::FR64XRegClass.hasSubClassEq(&RC) ||
+ X86::VR128XRegClass.hasSubClassEq(&RC) ||
+ X86::VR256XRegClass.hasSubClassEq(&RC) ||
+ X86::VR512RegClass.hasSubClassEq(&RC))
+ return getRegBank(X86::VECRRegBankID);
+
+ llvm_unreachable("Unsupported register kind yet.");
+}
+
+X86GenRegisterBankInfo::PartialMappingIdx
+X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
+ if ((Ty.isScalar() && !isFP) || Ty.isPointer()) {
+ switch (Ty.getSizeInBits()) {
+ case 1:
+ case 8:
+ return PMI_GPR8;
+ case 16:
+ return PMI_GPR16;
+ case 32:
+ return PMI_GPR32;
+ case 64:
+ return PMI_GPR64;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ } else if (Ty.isScalar()) {
+ switch (Ty.getSizeInBits()) {
+ case 32:
+ return PMI_FP32;
+ case 64:
+ return PMI_FP64;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ } else {
+ switch (Ty.getSizeInBits()) {
+ case 128:
+ return PMI_VEC128;
+ case 256:
+ return PMI_VEC256;
+ case 512:
+ return PMI_VEC512;
+ default:
+ llvm_unreachable("Unsupported register size.");
+ }
+ }
+
+ return PMI_None;
+}
+
+void X86RegisterBankInfo::getInstrPartialMappingIdxs(
+ const MachineInstr &MI, const MachineRegisterInfo &MRI, const bool isFP,
+ SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx) {
+
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ auto &MO = MI.getOperand(Idx);
+ if (!MO.isReg())
+ OpRegBankIdx[Idx] = PMI_None;
+ else
+ OpRegBankIdx[Idx] = getPartialMappingIdx(MRI.getType(MO.getReg()), isFP);
+ }
+}
+
+bool X86RegisterBankInfo::getInstrValueMapping(
+ const MachineInstr &MI,
+ const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+ SmallVectorImpl<const ValueMapping *> &OpdsMapping) {
+
+ unsigned NumOperands = MI.getNumOperands();
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ if (!MI.getOperand(Idx).isReg())
+ continue;
+
+ auto Mapping = getValueMapping(OpRegBankIdx[Idx], 1);
+ if (!Mapping->isValid())
+ return false;
+
+ OpdsMapping[Idx] = Mapping;
+ }
+ return true;
+}
+
+const RegisterBankInfo::InstructionMapping &
+X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI,
+ bool isFP) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned NumOperands = MI.getNumOperands();
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+ if (NumOperands != 3 || (Ty != MRI.getType(MI.getOperand(1).getReg())) ||
+ (Ty != MRI.getType(MI.getOperand(2).getReg())))
+ llvm_unreachable("Unsupported operand mapping yet.");
+
+ auto Mapping = getValueMapping(getPartialMappingIdx(Ty, isFP), 3);
+ return getInstructionMapping(DefaultMappingID, 1, Mapping, NumOperands);
+}
+
+const RegisterBankInfo::InstructionMapping &
+X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto Opc = MI.getOpcode();
+
+ // Try the default logic for non-generic instructions that are either copies
+ // or already have some operands assigned to banks.
+ if (!isPreISelGenericOpcode(Opc)) {
+ const InstructionMapping &Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ }
+
+ switch (Opc) {
+ case TargetOpcode::G_ADD:
+ case TargetOpcode::G_SUB:
+ return getSameOperandsMapping(MI, false);
+ case TargetOpcode::G_FADD:
+ case TargetOpcode::G_FSUB:
+ case TargetOpcode::G_FMUL:
+ case TargetOpcode::G_FDIV:
+ return getSameOperandsMapping(MI, true);
+ default:
+ break;
+ }
+
+ unsigned NumOperands = MI.getNumOperands();
+
+ // Track the bank of each register, use NotFP mapping (all scalars in GPRs)
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
+
+ // Finally construct the computed mapping.
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+ if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+ return getInvalidInstructionMapping();
+
+ return getInstructionMapping(DefaultMappingID, /* Cost */ 1,
+ getOperandsMapping(OpdsMapping), NumOperands);
+}
+
+void X86RegisterBankInfo::applyMappingImpl(
+ const OperandsMapper &OpdMapper) const {
+ return applyDefaultMapping(OpdMapper);
+}
+
+RegisterBankInfo::InstructionMappings
+X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
+
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_STORE: {
+    // Try to map 32/64-bit loads and stores to PMI_FP32/PMI_FP64.
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ if (Size != 32 && Size != 64)
+ break;
+
+ unsigned NumOperands = MI.getNumOperands();
+
+ // Track the bank of each register, use FP mapping (all scalars in VEC)
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
+
+ // Finally construct the computed mapping.
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+ if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+ break;
+
+ const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
+ /*ID*/ 1, /*Cost*/ 1, getOperandsMapping(OpdsMapping), NumOperands);
+ InstructionMappings AltMappings;
+ AltMappings.push_back(&Mapping);
+ return AltMappings;
+ }
+ default:
+ break;
+ }
+ return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
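A few sample results of getPartialMappingIdx(), derived from its switch (the LLT constructors match those used in X86LegalizerInfo.cpp):

//   LLT::scalar(32),     isFP = false -> PMI_GPR32
//   LLT::scalar(32),     isFP = true  -> PMI_FP32
//   LLT::pointer(0, 64), any isFP     -> PMI_GPR64 (pointers always go to GPR)
//   LLT::vector(4, 32)                -> PMI_VEC128 (128 bits total)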
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h
new file mode 100644
index 0000000..e227880
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterBankInfo.h
@@ -0,0 +1,82 @@
+//===- X86RegisterBankInfo.h -------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "X86GenRegisterBank.inc"
+
+namespace llvm {
+
+class LLT;
+
+class X86GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "X86GenRegisterBank.inc"
+#define GET_TARGET_REGBANK_INFO_CLASS
+#include "X86GenRegisterBankInfo.def"
+
+ static RegisterBankInfo::PartialMapping PartMappings[];
+ static RegisterBankInfo::ValueMapping ValMappings[];
+
+ static PartialMappingIdx getPartialMappingIdx(const LLT &Ty, bool isFP);
+ static const RegisterBankInfo::ValueMapping *
+ getValueMapping(PartialMappingIdx Idx, unsigned NumOperands);
+};
+
+class TargetRegisterInfo;
+
+/// This class provides the information for the target register banks.
+class X86RegisterBankInfo final : public X86GenRegisterBankInfo {
+private:
+ /// Get an instruction mapping.
+ /// \return An InstructionMappings with a statically allocated
+ /// OperandsMapping.
+ const InstructionMapping &getSameOperandsMapping(const MachineInstr &MI,
+ bool isFP) const;
+
+ /// Track the bank of each instruction operand(register)
+ static void
+ getInstrPartialMappingIdxs(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI, const bool isFP,
+ SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx);
+
+ /// Construct the instruction ValueMapping from PartialMappingIdxs
+ /// \return true if mapping succeeded.
+ static bool
+ getInstrValueMapping(const MachineInstr &MI,
+ const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+ SmallVectorImpl<const ValueMapping *> &OpdsMapping);
+
+public:
+ X86RegisterBankInfo(const TargetRegisterInfo &TRI);
+
+ const RegisterBank &
+ getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+ InstructionMappings
+ getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+ /// See RegisterBankInfo::applyMapping.
+ void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
+};
+
+} // namespace llvm
+#endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterBanks.td b/contrib/llvm/lib/Target/X86/X86RegisterBanks.td
new file mode 100644
index 0000000..6d17cd5
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86RegisterBanks.td
@@ -0,0 +1,17 @@
+//=- X86RegisterBanks.td - Describe the X86 Banks ---------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: RAX, RCX,...
+def GPRRegBank : RegisterBank<"GPR", [GR64]>;
+
+/// Floating Point/Vector Registers
+def VECRRegBank : RegisterBank<"VECR", [VR512]>;
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 65f438f..343da25 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -80,7 +80,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
bool
X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // ExeDepsFixer and PostRAScheduler require liveness.
+ // ExecutionDepsFixer and PostRAScheduler require liveness.
return true;
}
@@ -137,25 +137,29 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
case X86::FR32RegClassID:
case X86::FR64RegClassID:
// If AVX-512 isn't supported we should only inflate to these classes.
- if (!Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+ if (!Subtarget.hasAVX512() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
return Super;
break;
case X86::VR128RegClassID:
case X86::VR256RegClassID:
// If VLX isn't supported we should only inflate to these classes.
- if (!Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+ if (!Subtarget.hasVLX() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
return Super;
break;
case X86::VR128XRegClassID:
case X86::VR256XRegClassID:
    // If VLX isn't supported we shouldn't inflate to these classes.
- if (Subtarget.hasVLX() && Super->getSize() == RC->getSize())
+ if (Subtarget.hasVLX() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
return Super;
break;
case X86::FR32XRegClassID:
case X86::FR64XRegClassID:
    // If AVX-512 isn't supported we shouldn't inflate to these classes.
- if (Subtarget.hasAVX512() && Super->getSize() == RC->getSize())
+ if (Subtarget.hasAVX512() &&
+ getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
return Super;
break;
case X86::GR8RegClassID:
@@ -168,7 +172,7 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
case X86::VR512RegClassID:
// Don't return a super-class that would shrink the spill size.
// That can happen with the vector and float classes.
- if (Super->getSize() == RC->getSize())
+ if (getRegSizeInBits(*Super) == getRegSizeInBits(*RC))
return Super;
}
Super = *I++;
@@ -220,7 +224,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
const TargetRegisterClass *
X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
const Function *F = MF.getFunction();
- if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64))
+ if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64))
return &X86::GR64_TCW64RegClass;
else if (Is64Bit)
return &X86::GR64_TCRegClass;
@@ -272,7 +276,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
bool HasAVX512 = Subtarget.hasAVX512();
bool CallsEHReturn = MF->callsEHReturn();
- switch (MF->getFunction()->getCallingConv()) {
+ CallingConv::ID CC = MF->getFunction()->getCallingConv();
+
+ // If attribute NoCallerSavedRegisters exists then we set X86_INTR calling
+ // convention because it has the CSR list.
+ if (MF->getFunction()->hasFnAttribute("no_caller_saved_registers"))
+ CC = CallingConv::X86_INTR;
+
+ switch (CC) {
case CallingConv::GHC:
case CallingConv::HiPE:
return CSR_NoRegs_SaveList;
@@ -309,21 +320,21 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
case CallingConv::X86_RegCall:
if (Is64Bit) {
if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_SaveList :
+ return (HasSSE ? CSR_Win64_RegCall_SaveList :
CSR_Win64_RegCall_NoSSE_SaveList);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_SaveList :
+ return (HasSSE ? CSR_SysV64_RegCall_SaveList :
CSR_SysV64_RegCall_NoSSE_SaveList);
}
} else {
- return (HasSSE ? CSR_32_RegCall_SaveList :
+ return (HasSSE ? CSR_32_RegCall_SaveList :
CSR_32_RegCall_NoSSE_SaveList);
}
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_SaveList;
break;
- case CallingConv::X86_64_Win64:
+ case CallingConv::Win64:
if (!HasSSE)
return CSR_Win64_NoSSE_SaveList;
return CSR_Win64_SaveList;
@@ -337,7 +348,9 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_AllRegs_AVX512_SaveList;
if (HasAVX)
return CSR_64_AllRegs_AVX_SaveList;
- return CSR_64_AllRegs_SaveList;
+ if (HasSSE)
+ return CSR_64_AllRegs_SaveList;
+ return CSR_64_AllRegs_NoSSE_SaveList;
} else {
if (HasAVX512)
return CSR_32_AllRegs_AVX512_SaveList;
@@ -422,22 +435,22 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_HHVM_RegMask;
case CallingConv::X86_RegCall:
if (Is64Bit) {
- if (IsWin64) {
- return (HasSSE ? CSR_Win64_RegCall_RegMask :
+ if (IsWin64) {
+ return (HasSSE ? CSR_Win64_RegCall_RegMask :
CSR_Win64_RegCall_NoSSE_RegMask);
} else {
- return (HasSSE ? CSR_SysV64_RegCall_RegMask :
+ return (HasSSE ? CSR_SysV64_RegCall_RegMask :
CSR_SysV64_RegCall_NoSSE_RegMask);
}
} else {
- return (HasSSE ? CSR_32_RegCall_RegMask :
+ return (HasSSE ? CSR_32_RegCall_RegMask :
CSR_32_RegCall_NoSSE_RegMask);
}
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_RegMask;
break;
- case CallingConv::X86_64_Win64:
+ case CallingConv::Win64:
return CSR_Win64_RegMask;
case CallingConv::X86_64_SysV:
return CSR_64_RegMask;
@@ -447,7 +460,9 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_AllRegs_AVX512_RegMask;
if (HasAVX)
return CSR_64_AllRegs_AVX_RegMask;
- return CSR_64_AllRegs_RegMask;
+ if (HasSSE)
+ return CSR_64_AllRegs_RegMask;
+ return CSR_64_AllRegs_NoSSE_RegMask;
} else {
if (HasAVX512)
return CSR_32_AllRegs_AVX512_RegMask;
@@ -665,32 +680,28 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineFunction &MF = *MI.getParent()->getParent();
const X86FrameLowering *TFI = getFrameLowering(MF);
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- unsigned BasePtr;
- unsigned Opc = MI.getOpcode();
- bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm ||
- Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64;
-
- if (hasBasePointer(MF))
- BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
- else if (needsStackRealignment(MF))
- BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
- else if (AfterFPPop)
- BasePtr = StackPtr;
- else
- BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
+ // Determine base register and offset.
+ int FIOffset;
+ unsigned BasePtr;
+ if (MI.isReturn()) {
+ assert((!needsStackRealignment(MF) ||
+ MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
+ "Return instruction can only reference SP relative frame objects");
+ FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0);
+ } else {
+ FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr);
+ }
// LOCAL_ESCAPE uses a single offset, with no register. It only works in the
// simple FP case, and doesn't work with stack realignment. On 32-bit, the
// offset is from the traditional base pointer location. On 64-bit, the
// offset is from the SP at the end of the prologue, not the FP location. This
// matches the behavior of llvm.frameaddress.
- unsigned IgnoredFrameReg;
+ unsigned Opc = MI.getOpcode();
if (Opc == TargetOpcode::LOCAL_ESCAPE) {
MachineOperand &FI = MI.getOperand(FIOperandNum);
- int Offset;
- Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
- FI.ChangeToImmediate(Offset);
+ FI.ChangeToImmediate(FIOffset);
return;
}
@@ -706,15 +717,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// FrameIndex with base register. Add an offset to the offset.
MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false);
- // Now add the frame object offset to the offset from EBP.
- int FIOffset;
- if (AfterFPPop) {
- // Tail call jmp happens after FP is popped.
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- FIOffset = MFI.getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
- } else
- FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
-
if (BasePtr == StackPtr)
FIOffset += SPAdj;
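The return special case replaces the old AfterFPPop logic: tail-call jumps such as TCRETURNmi64 execute after the frame pointer has been popped, so their frame indices can only be rewritten SP-relative. A compressed before/after of the decision, paraphrasing the code above:

// before: AfterFPPop = (Opc == X86::TAILJMPm64 || ... ||
//                       Opc == X86::TCRETURNmi64);
//         BasePtr    = AfterFPPop ? StackPtr
//                                 : (TFI->hasFP(MF) ? FramePtr : StackPtr);
// after:  FIOffset   = MI.isReturn()
//                  ? TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0)
//                  : TFI->getFrameIndexReference(MF, FrameIndex, BasePtr);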
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
index 58fa31e..25958f0 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -133,6 +133,11 @@ public:
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const;
unsigned getStackRegister() const { return StackPtr; }
unsigned getBaseRegister() const { return BasePtr; }
+  /// Returns the physical register used as the frame pointer.
+  /// This always returns the frame pointer register, unlike
+  /// getFrameRegister(), which may return the "base pointer" in situations
+  /// involving a stack, frame and base pointer.
+ unsigned getFramePtr() const { return FramePtr; }
  // FIXME: Move to FrameInfo
unsigned getSlotSize() const { return SlotSize; }
};
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
index ad02a94..3a61a72 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -189,22 +189,22 @@ def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
-def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
-def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
-def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
-def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
-def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
-def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
-def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
-def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
-def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
-def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
-def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
-def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
-def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
-def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
-def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
-def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[67, -2, -2]>;
+def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[68, -2, -2]>;
+def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[69, -2, -2]>;
+def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[70, -2, -2]>;
+def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[71, -2, -2]>;
+def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[72, -2, -2]>;
+def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[73, -2, -2]>;
+def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[74, -2, -2]>;
+def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[75, -2, -2]>;
+def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[76, -2, -2]>;
+def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[77, -2, -2]>;
+def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[78, -2, -2]>;
+def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[79, -2, -2]>;
+def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>;
+def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>;
+def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>;
} // CostPerUse
@@ -511,7 +511,7 @@ def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
// Mask registers
-def VK1 : RegisterClass<"X86", [i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
+def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;}
def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;}
def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;}
@@ -519,7 +519,7 @@ def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;}
def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;}
def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;}
-def VK1WM : RegisterClass<"X86", [i1], 16, (sub VK1, K0)> {let Size = 16;}
+def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;}
def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;}
def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;}
def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;}
diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
index 677e824..03c8ccb 100644
--- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -1488,6 +1488,39 @@ def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
//-- Arithmetic instructions --//
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2, 1];
+}
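As a sanity check on these numbers (illustrative arithmetic only; the 4-cycle gap is this model's folded-load cost, not a general rule):

// HADDPS xmm, xmm   -> WriteFHAdd:   latency 5
// HADDPS xmm, m128  -> WriteFHAddLd: latency 9 = 5 + 4 (folded load)
static_assert(5 + 4 == 9, "memory form = register form + load latency");
// PHADD gets 3 (reg) vs 6 (mem): the tables hold measured values, so the
// load increment is not always exactly 4.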
+
// PHADD|PHSUB (S) W/D.
// v <- v,v.
def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
index eca65c2..b8ec588 100644
--- a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -157,6 +157,31 @@ def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
let ResourceCycles = [1, 1, 1, 1];
}
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [SBPort1]> {
+ let Latency = 3;
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [SBPort15]>;
+
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
+ let Latency = 5;
+ let ResourceCycles = [1, 1];
+}
+
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
def : WriteRes<WritePCmpIStrM, [SBPort015]> {
diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td
index 35257f8..d831a79 100644
--- a/contrib/llvm/lib/Target/X86/X86Schedule.td
+++ b/contrib/llvm/lib/Target/X86/X86Schedule.td
@@ -77,6 +77,10 @@ defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
// FMA Scheduling helper class.
class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+// Horizontal Add/Sub (float and integer)
+defm WriteFHAdd : X86SchedWritePair;
+defm WritePHAdd : X86SchedWritePair;
+
// Vector integer operations.
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
@@ -366,6 +370,7 @@ def IIC_SSE_MWAIT : InstrItinClass;
def IIC_SSE_MONITOR : InstrItinClass;
def IIC_SSE_MWAITX : InstrItinClass;
def IIC_SSE_MONITORX : InstrItinClass;
+def IIC_SSE_CLZERO : InstrItinClass;
def IIC_SSE_PREFETCH : InstrItinClass;
def IIC_SSE_PAUSE : InstrItinClass;
@@ -496,6 +501,7 @@ def IIC_IN_RI : InstrItinClass;
def IIC_OUT_RR : InstrItinClass;
def IIC_OUT_IR : InstrItinClass;
def IIC_INS : InstrItinClass;
+def IIC_LWP : InstrItinClass;
def IIC_MOV_REG_DR : InstrItinClass;
def IIC_MOV_DR_REG : InstrItinClass;
def IIC_MOV_REG_CR : InstrItinClass;
@@ -657,5 +663,6 @@ include "X86ScheduleAtom.td"
include "X86SchedSandyBridge.td"
include "X86SchedHaswell.td"
include "X86ScheduleSLM.td"
+include "X86ScheduleZnver1.td"
include "X86ScheduleBtVer2.td"
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index ce1ece3..9dcc968 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -320,6 +320,38 @@ def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> {
}
////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def : WriteRes<WriteFHAdd, [JFPU0]> {
+ let Latency = 3;
+}
+
+def : WriteRes<WriteFHAddLd, [JLAGU, JFPU0]> {
+ let Latency = 8;
+}
+
+def : WriteRes<WritePHAdd, [JFPU01]> {
+ let ResourceCycles = [1];
+}
+def : WriteRes<WritePHAddLd, [JLAGU, JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1];
+}
+
+def WriteFHAddY: SchedWriteRes<[JFPU0]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteFHAddY], (instregex "VH(ADD|SUB)P(S|D)Yrr")>;
+
+def WriteFHAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteFHAddYLd], (instregex "VH(ADD|SUB)P(S|D)Yrm")>;
+
+////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////
@@ -337,5 +369,98 @@ def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; }
def : WriteRes<WriteFence, [JSAGU]>;
def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// SSE4A instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def WriteEXTRQ: SchedWriteRes<[JFPU01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>;
+
+def WriteINSERTQ: SchedWriteRes<[JFPU01]> {
+ let Latency = 2;
+ let ResourceCycles = [4];
+}
+def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>;
+
+////////////////////////////////////////////////////////////////////////////////
+// AVX instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def WriteFAddY: SchedWriteRes<[JFPU0]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteFAddY], (instregex "VADD(SUB)?P(S|D)Yrr", "VSUBP(S|D)Yrr")>;
+
+def WriteFAddYLd: SchedWriteRes<[JLAGU, JFPU0]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteFAddYLd, ReadAfterLd], (instregex "VADD(SUB)?P(S|D)Yrm", "VSUBP(S|D)Yrm")>;
+
+def WriteFDivY: SchedWriteRes<[JFPU1]> {
+ let Latency = 38;
+ let ResourceCycles = [38];
+}
+def : InstRW<[WriteFDivY], (instregex "VDIVP(D|S)Yrr")>;
+
+def WriteFDivYLd: SchedWriteRes<[JLAGU, JFPU1]> {
+ let Latency = 43;
+ let ResourceCycles = [1, 38];
+}
+def : InstRW<[WriteFDivYLd, ReadAfterLd], (instregex "VDIVP(S|D)Yrm")>;
+
+def WriteVMULYPD: SchedWriteRes<[JFPU1]> {
+ let Latency = 4;
+ let ResourceCycles = [4];
+}
+def : InstRW<[WriteVMULYPD], (instregex "VMULPDYrr")>;
+
+def WriteVMULYPDLd: SchedWriteRes<[JLAGU, JFPU1]> {
+ let Latency = 9;
+ let ResourceCycles = [1, 4];
+}
+def : InstRW<[WriteVMULYPDLd, ReadAfterLd], (instregex "VMULPDYrm")>;
+
+def WriteVMULYPS: SchedWriteRes<[JFPU1]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteVMULYPS], (instregex "VMULPSYrr", "VRCPPSYr", "VRSQRTPSYr")>;
+
+def WriteVMULYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteVMULYPSLd, ReadAfterLd], (instregex "VMULPSYrm", "VRCPPSYm", "VRSQRTPSYm")>;
+
+def WriteVSQRTYPD: SchedWriteRes<[JFPU1]> {
+ let Latency = 54;
+ let ResourceCycles = [54];
+}
+def : InstRW<[WriteVSQRTYPD], (instregex "VSQRTPDYr")>;
+
+def WriteVSQRTYPDLd: SchedWriteRes<[JLAGU, JFPU1]> {
+ let Latency = 59;
+ let ResourceCycles = [1, 54];
+}
+def : InstRW<[WriteVSQRTYPDLd], (instregex "VSQRTPDYm")>;
+
+def WriteVSQRTYPS: SchedWriteRes<[JFPU1]> {
+ let Latency = 42;
+ let ResourceCycles = [42];
+}
+def : InstRW<[WriteVSQRTYPS], (instregex "VSQRTPSYr")>;
+
+def WriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
+ let Latency = 47;
+ let ResourceCycles = [1, 42];
+}
+def : InstRW<[WriteVSQRTYPSLd], (instregex "VSQRTPSYm")>;
+
} // SchedModel
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
index f95d4fa..03ed2db 100644
--- a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -137,6 +137,33 @@ defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+// HADD, HSUB PS/PD
+
+def : WriteRes<WriteFHAdd, [FPC_RSV01]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+}
+
+def : WriteRes<WriteFHAddLd, [FPC_RSV01, MEC_RSV]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+def : WriteRes<WritePHAdd, [FPC_RSV01]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+}
+
+def : WriteRes<WritePHAddLd, [FPC_RSV01, MEC_RSV]> {
+ let Latency = 4;
+ let ResourceCycles = [1, 1];
+}
+
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
new file mode 100644
index 0000000..d5b4cfe
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -0,0 +1,223 @@
+//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver1 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Znver1Model : SchedMachineModel {
+ // Zen can decode 4 instructions per cycle.
+ let IssueWidth = 4;
+ // Based on the reorder buffer, we define MicroOpBufferSize.
+ let MicroOpBufferSize = 192;
+ let LoadLatency = 4;
+ let MispredictPenalty = 17;
+ let HighLatency = 25;
+ let PostRAScheduler = 1;
+
+ // FIXME: The model is incomplete: we have not yet covered all
+ // instructions, so mark the model as such.
+ let CompleteModel = 0;
+}
+
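A back-of-envelope reading of the knobs above (editor's arithmetic, not a claim from the patch):

// With IssueWidth = 4 and a 192-entry micro-op buffer (the reorder buffer),
// the window spans 192 / 4 = 48 cycles of issue - comfortably more than the
// 17-cycle mispredict penalty or the 4-cycle load latency the model assumes.
static_assert(192 / 4 == 48, "issue window in cycles");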
+let SchedModel = Znver1Model in {
+
+// Zen can issue micro-ops to 10 different units in one cycle.
+// These are
+// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3)
+// * Two AGU units (ZAGU0, ZAGU1)
+// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3)
+// The AGUs feed the load/store queues at two loads and one store per cycle.
+
+// Four ALU units are defined below
+def ZnALU0 : ProcResource<1>;
+def ZnALU1 : ProcResource<1>;
+def ZnALU2 : ProcResource<1>;
+def ZnALU3 : ProcResource<1>;
+
+// Two AGU units are defined below
+def ZnAGU0 : ProcResource<1>;
+def ZnAGU1 : ProcResource<1>;
+
+// Four FPU units are defined below
+def ZnFPU0 : ProcResource<1>;
+def ZnFPU1 : ProcResource<1>;
+def ZnFPU2 : ProcResource<1>;
+def ZnFPU3 : ProcResource<1>;
+
+// FPU grouping
+def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>;
+def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
+def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
+def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
+def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>;
+def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>;
+def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>;
+def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>;
+
+// Below are the groupings of the units.
+// Micro-ops that can be issued to multiple units are handled this way.
+
+// ALU grouping
+// ZnALU03 - 0,3 grouping
+def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>;
+
+// 56 Entry (14x4 entries) Int Scheduler
+def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> {
+ let BufferSize=56;
+}
+
+// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations
+// but are relevant for some instructions
+def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> {
+ let BufferSize=28;
+}
+
+// Integer Multiplication issued on ALU1.
+def ZnMultiplier : ProcResource<1>;
+
+// Integer division issued on ALU2.
+def ZnDivider : ProcResource<1>;
+
+// The 4-cycle load-to-use latency is captured here.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// (a folded load is an instruction that loads and does some operation)
+// Ex: ADDPD xmm, [mem] -> This instruction has two micro-ops:
+//  a. load and
+//  b. addpd
+// Instructions with folded loads are usually micro-fused, so they only
+// appear as two micro-ops.
+// This multiclass is for folded loads for integer units.
+multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // The register variant takes a single cycle on the execution port.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // The memory variant also uses a cycle on ZnAGU and
+ // adds 4 cycles to the latency.
+ def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
+ let Latency = !add(Lat, 4);
+ }
+}
+
+// This multiclass is for folded loads for floating point units.
+multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // The register variant takes a single cycle on the execution port.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // The memory variant also uses a cycle on ZnAGU and
+ // adds 7 cycles to the latency.
+ def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
+ let Latency = !add(Lat, 7);
+ }
+}
+
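What the two multiclasses expand to, using the defm lines further down as inputs (illustrative):

// ZnWriteResPair<WriteALU, ZnALU, 1>      -> WriteALU:   1 cycle on ZnALU
//                                            WriteALULd: 1 + 4 = 5 cycles
// ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3> -> WriteFAdd:   3 cycles on ZnFPU0
//                                            WriteFAddLd: 3 + 7 = 10 cycles
constexpr int foldedIntLatency(int Lat) { return Lat + 4; }
constexpr int foldedFpuLatency(int Lat) { return Lat + 7; }
static_assert(foldedIntLatency(1) == 5 && foldedFpuLatency(3) == 10, "");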
+// WriteRMW is set for instructions with a memory write
+// operation in codegen.
+def : WriteRes<WriteRMW, [ZnAGU]>;
+
+def : WriteRes<WriteStore, [ZnAGU]>;
+def : WriteRes<WriteMove, [ZnALU]>;
+def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
+
+def : WriteRes<WriteZero, []>;
+def : WriteRes<WriteLEA, [ZnALU]>;
+defm : ZnWriteResPair<WriteALU, ZnALU, 1>;
+defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
+defm : ZnWriteResPair<WriteJump, ZnALU, 1>;
+
+// IDIV
+def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> {
+ let Latency = 41;
+ let ResourceCycles = [1, 41];
+}
+
+def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> {
+ let Latency = 45;
+ let ResourceCycles = [1, 4, 41];
+}
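One way to read the ResourceCycles above (illustrative):

// ResourceCycles = [1, 41] keeps ZnDivider occupied for the whole divide:
// the unit is effectively not pipelined. The folded-load form then pays the
// usual 4 extra cycles:
static_assert(41 + 4 == 45, "WriteIDivLd = WriteIDiv + load latency");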
+
+// IMUL
+def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
+ let Latency = 4;
+}
+def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> {
+ let Latency = 4;
+}
+
+def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+
+// Floating point operations
+defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
+defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
+defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
+defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>;
+defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>;
+defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>;
+defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>;
+defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
+defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
+defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
+defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
+defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
+defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
+defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
+
+// Vector integer operations which use the FPU units
+defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
+defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
+defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
+defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>;
+defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>;
+defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>;
+defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>;
+defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>;
+
+// Vector Shift Operations
+defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>;
+
+// AES Instructions.
+defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>;
+defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>;
+defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>;
+
+def : WriteRes<WriteFence, [ZnAGU]>;
+def : WriteRes<WriteNop, []>;
+
+// The following instructions are microcoded; we give them a long
+// latency (100) so as to block the entire pipeline.
+defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>;
+
+// Microcoded Instructions
+let Latency = 100 in {
+ def : WriteRes<WriteMicrocoded, []>;
+ def : WriteRes<WriteSystem, []>;
+ def : WriteRes<WriteMPSAD, []>;
+ def : WriteRes<WriteMPSADLd, []>;
+ def : WriteRes<WriteCLMul, []>;
+ def : WriteRes<WriteCLMulLd, []>;
+ def : WriteRes<WritePCmpIStrM, []>;
+ def : WriteRes<WritePCmpIStrMLd, []>;
+ def : WriteRes<WritePCmpEStrI, []>;
+ def : WriteRes<WritePCmpEStrILd, []>;
+ def : WriteRes<WritePCmpEStrM, []>;
+ def : WriteRes<WritePCmpEStrMLd, []>;
+ def : WriteRes<WritePCmpIStrI, []>;
+ def : WriteRes<WritePCmpIStrILd, []>;
+ }
+}
diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index f031a28..c67aa04 100644
--- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -11,11 +11,11 @@
//
//===----------------------------------------------------------------------===//
-#include "X86InstrInfo.h"
+#include "X86SelectionDAGInfo.h"
#include "X86ISelLowering.h"
+#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
-#include "X86SelectionDAGInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Target/TargetLowering.h"
@@ -44,8 +44,26 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
return false;
}
+namespace {
+
+// Represents a cover of a buffer of Size bytes with Count() blocks of type AVT
+// (of size UBytes() bytes), as well as how many bytes remain (BytesLeft() is
+// always smaller than the block size).
+struct RepMovsRepeats {
+ RepMovsRepeats(uint64_t Size) : Size(Size) {}
+
+ uint64_t Count() const { return Size / UBytes(); }
+ uint64_t BytesLeft() const { return Size % UBytes(); }
+ uint64_t UBytes() const { return AVT.getSizeInBits() / 8; }
+
+ const uint64_t Size;
+ MVT AVT = MVT::i8;
+};
+
+} // namespace
+
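A standalone sketch of the arithmetic this struct packages, with a fixed 8-byte block standing in for AVT = MVT::i64 (an assumption made for the example):

#include <cassert>
#include <cstdint>

struct Repeats { // mirrors RepMovsRepeats
  explicit Repeats(uint64_t S) : Size(S) {}
  uint64_t UBytes() const { return 8; } // block size, i64 assumed
  uint64_t Count() const { return Size / UBytes(); }
  uint64_t BytesLeft() const { return Size % UBytes(); }
  const uint64_t Size;
};

int main() {
  Repeats R(19);              // copy 19 bytes via REP MOVSQ
  assert(R.Count() == 2);     // two 8-byte iterations
  assert(R.BytesLeft() == 3); // 3-byte tail handled separately
  return 0;
}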
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
- SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
SDValue Size, unsigned Align, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -69,10 +87,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
if ((Align & 3) != 0 || !ConstantSize ||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
// Check to see if there is a specialized entry-point for memory zeroing.
- ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
- if (const char *bzeroEntry = V &&
- V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
+ if (const char *bzeroEntry = ValC &&
+ ValC->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
@@ -85,10 +103,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
- .setDiscardResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr),
+ std::move(Args))
+ .setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
@@ -102,9 +122,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SDValue InFlag;
EVT AVT;
SDValue Count;
- ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
unsigned BytesLeft = 0;
- bool TwoRepStos = false;
if (ValC) {
unsigned ValReg;
uint64_t Val = ValC->getZExtValue() & 255;
@@ -146,7 +165,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
} else {
AVT = MVT::i8;
Count = DAG.getIntPtrConstant(SizeVal, dl);
- Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+ Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InFlag);
InFlag = Chain.getValue(1);
}
@@ -161,20 +180,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
- if (TwoRepStos) {
- InFlag = Chain.getValue(1);
- Count = Size;
- EVT CVT = Count.getValueType();
- SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
- DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl,
- CVT));
- Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
- Left, InFlag);
- InFlag = Chain.getValue(1);
- Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
- Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
- } else if (BytesLeft) {
+ if (BytesLeft) {
// Handle the last 1 - 7 bytes.
unsigned Offset = SizeVal - BytesLeft;
EVT AddrVT = Dst.getValueType();
@@ -183,7 +189,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
Chain = DAG.getMemset(Chain, dl,
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
DAG.getConstant(Offset, dl, AddrVT)),
- Src,
+ Val,
DAG.getConstant(BytesLeft, dl, SizeVT),
Align, isVolatile, false,
DstPtrInfo.getWithOffset(Offset));
@@ -204,8 +210,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
DAG.getMachineFunction().getSubtarget<X86Subtarget>();
if (!ConstantSize)
return SDValue();
- uint64_t SizeVal = ConstantSize->getZExtValue();
- if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+ RepMovsRepeats Repeats(ConstantSize->getZExtValue());
+ if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold())
return SDValue();
/// If not DWORD aligned, it is more efficient to call the library. However
@@ -226,26 +232,31 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
if (isBaseRegConflictPossible(DAG, ClobberSet))
return SDValue();
- MVT AVT;
- if (Align & 1)
- AVT = MVT::i8;
- else if (Align & 2)
- AVT = MVT::i16;
- else if (Align & 4)
- // DWORD aligned
- AVT = MVT::i32;
- else
- // QWORD aligned
- AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-
- unsigned UBytes = AVT.getSizeInBits() / 8;
- unsigned CountVal = SizeVal / UBytes;
- SDValue Count = DAG.getIntPtrConstant(CountVal, dl);
- unsigned BytesLeft = SizeVal % UBytes;
+ // If the target has enhanced REPMOVSB, then it's at least as fast to use
+ // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
+ // BytesLeft.
+ if (!Subtarget.hasERMSB() && !(Align & 1)) {
+ if (Align & 2)
+ // WORD aligned
+ Repeats.AVT = MVT::i16;
+ else if (Align & 4)
+ // DWORD aligned
+ Repeats.AVT = MVT::i32;
+ else
+ // QWORD aligned
+ Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+
+ if (Repeats.BytesLeft() > 0 &&
+ DAG.getMachineFunction().getFunction()->optForMinSize()) {
+ // When aggressively optimizing for size, avoid generating the code to
+ // handle BytesLeft.
+ Repeats.AVT = MVT::i8;
+ }
+ }
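The same dispatch restated as a standalone helper (sketch only; WidthVT stands in for MVT, and HasTail for the Repeats.BytesLeft() > 0 test the real code performs after picking the width):

enum class WidthVT { i8, i16, i32, i64 };

WidthVT pickAVT(unsigned Align, bool Is64Bit, bool HasERMSB,
                bool OptMinSize, bool HasTail) {
  // ERMSB (or a merely byte-aligned buffer) keeps plain REP MOVSB.
  if (HasERMSB || (Align & 1))
    return WidthVT::i8;
  WidthVT AVT = (Align & 2) ? WidthVT::i16
              : (Align & 4) ? WidthVT::i32
              : Is64Bit     ? WidthVT::i64
                            : WidthVT::i32;
  // At minsize, a leftover tail is not worth the extra code: fall back to i8.
  if (OptMinSize && HasTail)
    return WidthVT::i8;
  return AVT;
}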
SDValue InFlag;
Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
- Count, InFlag);
+ DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag);
InFlag = Chain.getValue(1);
Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
Dst, InFlag);
@@ -255,14 +266,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
InFlag = Chain.getValue(1);
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+ SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag };
SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
SmallVector<SDValue, 4> Results;
Results.push_back(RepMovs);
- if (BytesLeft) {
+ if (Repeats.BytesLeft()) {
// Handle the last 1 - 7 bytes.
- unsigned Offset = SizeVal - BytesLeft;
+ unsigned Offset = Repeats.Size - Repeats.BytesLeft();
EVT DstVT = Dst.getValueType();
EVT SrcVT = Src.getValueType();
EVT SizeVT = Size.getValueType();
@@ -273,7 +284,8 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
DAG.getNode(ISD::ADD, dl, SrcVT, Src,
DAG.getConstant(Offset, dl,
SrcVT)),
- DAG.getConstant(BytesLeft, dl, SizeVT),
+ DAG.getConstant(Repeats.BytesLeft(), dl,
+ SizeVT),
Align, isVolatile, AlwaysInline, false,
DstPtrInfo.getWithOffset(Offset),
SrcPtrInfo.getWithOffset(Offset)));
diff --git a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 1111552..2cebb76 100644
--- a/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -14,7 +14,7 @@
#include "X86ShuffleDecodeConstantPool.h"
#include "Utils/X86ShuffleDecode.h"
-#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
@@ -25,7 +25,7 @@
namespace llvm {
static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
- SmallBitVector &UndefElts,
+ APInt &UndefElts,
SmallVectorImpl<uint64_t> &RawMask) {
// It is not an error for shuffle masks to not be a vector of
// MaskEltSizeInBits because the constant pool uniques constants by their
@@ -49,6 +49,33 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
unsigned NumCstElts = CstTy->getVectorNumElements();
+ assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
+ "Unaligned shuffle mask size");
+
+ unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
+ UndefElts = APInt(NumMaskElts, 0);
+ RawMask.resize(NumMaskElts, 0);
+
+ // Fast path - if the constants match the mask size then copy direct.
+ if (MaskEltSizeInBits == CstEltSizeInBits) {
+ assert(NumCstElts == NumMaskElts && "Unaligned shuffle mask size");
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ Constant *COp = C->getAggregateElement(i);
+ if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+ return false;
+
+ if (isa<UndefValue>(COp)) {
+ UndefElts.setBit(i);
+ RawMask[i] = 0;
+ continue;
+ }
+
+ auto *Elt = cast<ConstantInt>(COp);
+ RawMask[i] = Elt->getValue().getZExtValue();
+ }
+ return true;
+ }
+
// Extract all the undef/constant element data and pack into single bitsets.
APInt UndefBits(CstSizeInBits, 0);
APInt MaskBits(CstSizeInBits, 0);
@@ -57,39 +84,30 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
return false;
+ unsigned BitOffset = i * CstEltSizeInBits;
+
if (isa<UndefValue>(COp)) {
- APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits);
- UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+ UndefBits.setBits(BitOffset, BitOffset + CstEltSizeInBits);
continue;
}
- APInt EltBits = cast<ConstantInt>(COp)->getValue();
- EltBits = EltBits.zextOrTrunc(CstSizeInBits);
- MaskBits |= EltBits.shl(i * CstEltSizeInBits);
+ MaskBits.insertBits(cast<ConstantInt>(COp)->getValue(), BitOffset);
}
// Now extract the undef/constant bit data into the raw shuffle masks.
- assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
- "Unaligned shuffle mask size");
-
- unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
- UndefElts = SmallBitVector(NumMaskElts, false);
- RawMask.resize(NumMaskElts, 0);
-
for (unsigned i = 0; i != NumMaskElts; ++i) {
- APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits);
- EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits);
+ unsigned BitOffset = i * MaskEltSizeInBits;
+ APInt EltUndef = UndefBits.extractBits(MaskEltSizeInBits, BitOffset);
// Only treat the element as UNDEF if all bits are UNDEF, otherwise
// treat it as zero.
if (EltUndef.isAllOnesValue()) {
- UndefElts[i] = true;
+ UndefElts.setBit(i);
RawMask[i] = 0;
continue;
}
- APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits);
- EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits);
+ APInt EltBits = MaskBits.extractBits(MaskEltSizeInBits, BitOffset);
RawMask[i] = EltBits.getZExtValue();
}
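The slow path's insertBits/extractBits round trip, shown with plain integers (illustrative; uint64_t stands in for APInt, and the sizes are picked to fit it):

#include <cstdint>
#include <vector>

// Pack four 16-bit constant elements into one bit buffer, then re-slice it
// as eight 8-bit mask elements.
std::vector<uint8_t> resliceTo8(const std::vector<uint16_t> &Cst) {
  uint64_t Bits = 0;
  for (size_t i = 0; i != Cst.size(); ++i)
    Bits |= uint64_t(Cst[i]) << (16 * i);  // MaskBits.insertBits(Elt, Off)
  std::vector<uint8_t> Mask(Cst.size() * 2);
  for (size_t i = 0; i != Mask.size(); ++i)
    Mask[i] = uint8_t(Bits >> (8 * i));    // MaskBits.extractBits(8, Off)
  return Mask;
}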
@@ -104,8 +122,8 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
"Unexpected vector size.");
// The shuffle mask requires a byte vector.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 32> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 64> RawMask;
if (!extractConstantMask(C, 8, UndefElts, RawMask))
return;
@@ -145,8 +163,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 8> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 16> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
@@ -180,7 +198,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
+ APInt UndefElts;
SmallVector<uint64_t, 8> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
@@ -231,8 +249,8 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
"Unexpected vector size.");
// The shuffle mask requires a byte vector.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 32> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 16> RawMask;
if (!extractConstantMask(C, 8, UndefElts, RawMask))
return;
@@ -286,8 +304,8 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
"Unexpected vector element size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 8> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 64> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
@@ -314,8 +332,8 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
"Unexpected vector element size.");
// The shuffle mask requires elements the same size as the target.
- SmallBitVector UndefElts;
- SmallVector<uint64_t, 8> RawMask;
+ APInt UndefElts;
+ SmallVector<uint64_t, 64> RawMask;
if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
return;
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
index 586bb7b..24845bea 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -11,19 +11,36 @@
//
//===----------------------------------------------------------------------===//
+#include "X86.h"
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "X86CallLowering.h"
+#include "X86LegalizerInfo.h"
+#include "X86RegisterBankInfo.h"
+#endif
#include "X86Subtarget.h"
-#include "X86InstrInfo.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86TargetMachine.h"
+#include "llvm/ADT/Triple.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#endif
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Host.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <string>
#if defined(_MSC_VER)
#include <intrin.h>
@@ -93,8 +110,17 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
return X86II::MO_NO_FLAG;
// Absolute symbols can be referenced directly.
- if (GV && GV->isAbsoluteSymbolRef())
- return X86II::MO_NO_FLAG;
+ if (GV) {
+ if (Optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) {
+ // See if we can use the 8-bit immediate form. Note that some instructions
+ // will sign extend the immediate operand, so to be conservative we only
+ // accept the range [0,128).
+ if (CR->getUnsignedMax().ult(128))
+ return X86II::MO_ABS8;
+ else
+ return X86II::MO_NO_FLAG;
+ }
+ }
if (TM.shouldAssumeDSOLocal(M, GV))
return classifyLocalReference(GV);
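Why the cutoff is 128 rather than 256 (worked example; assumes the usual two's-complement sign extension):

#include <cstdint>

constexpr int64_t signExtend8(uint8_t Imm) { return int8_t(Imm); }
static_assert(signExtend8(100) == 100, "values in [0,128) survive both ways");
static_assert(signExtend8(200) == -56, "values >= 128 flip under imm8 users");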
@@ -126,12 +152,18 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
return X86II::MO_NO_FLAG;
assert(!isTargetCOFF());
+ const Function *F = dyn_cast_or_null<Function>(GV);
- if (isTargetELF())
+ if (isTargetELF()) {
+ if (is64Bit() && F && (CallingConv::X86_RegCall == F->getCallingConv()))
+ // According to the psABI, the PLT stub clobbers XMM8-XMM15.
+ // In the RegCall calling convention those registers are used for passing
+ // parameters, so we need to prevent lazy binding in RegCall.
+ return X86II::MO_GOTPCREL;
return X86II::MO_PLT;
+ }
if (is64Bit()) {
- auto *F = dyn_cast_or_null<Function>(GV);
if (F && F->hasFnAttribute(Attribute::NonLazyBind))
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
@@ -195,7 +227,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
FullFS = "+sahf";
}
-
// Parse features string and set the CPU.
ParseSubtargetFeatures(CPUName, FullFS);
@@ -253,6 +284,7 @@ void X86Subtarget::initializeEnvironment() {
HasFMA4 = false;
HasXOP = false;
HasTBM = false;
+ HasLWP = false;
HasMOVBE = false;
HasRDRAND = false;
HasF16C = false;
@@ -263,11 +295,11 @@ void X86Subtarget::initializeEnvironment() {
HasVBMI = false;
HasIFMA = false;
HasRTM = false;
- HasHLE = false;
HasERI = false;
HasCDI = false;
HasPFI = false;
HasDQI = false;
+ HasVPOPCNTDQ = false;
HasBWI = false;
HasVLX = false;
HasADX = false;
@@ -277,7 +309,11 @@ void X86Subtarget::initializeEnvironment() {
HasRDSEED = false;
HasLAHFSAHF = false;
HasMWAITX = false;
+ HasCLZERO = false;
HasMPX = false;
+ HasSGX = false;
+ HasCLFLUSHOPT = false;
+ HasCLWB = false;
IsBTMemSlow = false;
IsPMULLDSlow = false;
IsSHLDSlow = false;
@@ -286,16 +322,19 @@ void X86Subtarget::initializeEnvironment() {
HasSSEUnalignedMem = false;
HasCmpxchg16b = false;
UseLeaForSP = false;
- HasFastPartialYMMWrite = false;
+ HasFastPartialYMMorZMMWrite = false;
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
+ HasFastSHLDRotate = false;
+ HasERMSB = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
CallRegIndirect = false;
LEAUsesAG = false;
SlowLEA = false;
+ Slow3OpsLEA = false;
SlowIncDec = false;
stackAlignment = 4;
// FIXME: this is a known good value for Yonah. How about others?
@@ -310,6 +349,35 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
return *this;
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct X86GISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
unsigned StackAlignOverride)
@@ -321,8 +389,8 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
TargetTriple.getEnvironment() != Triple::CODE16),
In16BitMode(TargetTriple.getArch() == Triple::x86 &&
TargetTriple.getEnvironment() == Triple::CODE16),
- TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
- TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
+ InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
+ FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
setPICStyle(PICStyles::None);
@@ -334,6 +402,19 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
setPICStyle(PICStyles::StubPIC);
else if (isTargetELF())
setPICStyle(PICStyles::GOT);
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ X86GISelActualAccessor *GISel = new X86GISelActualAccessor();
+
+ GISel->CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
+ GISel->Legalizer.reset(new X86LegalizerInfo(*this, TM));
+
+ auto *RBI = new X86RegisterBankInfo(*getRegisterInfo());
+ GISel->RegBankInfo.reset(RBI);
+ GISel->InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI));
+#endif
+ setGISelAccessor(*GISel);
}
const CallLowering *X86Subtarget::getCallLowering() const {
@@ -359,4 +440,3 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
bool X86Subtarget::enableEarlyIfConversion() const {
return hasCMov() && X86EarlyIfConv;
}
-
diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h
index d80dc4a..427a000 100644
--- a/contrib/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h
@@ -18,33 +18,36 @@
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include <string>
+#include <memory>
#define GET_SUBTARGETINFO_HEADER
#include "X86GenSubtargetInfo.inc"
namespace llvm {
+
class GlobalValue;
-class StringRef;
-class TargetMachine;
/// The X86 backend supports a number of different styles of PIC.
///
namespace PICStyles {
+
enum Style {
StubPIC, // Used on i386-darwin in pic mode.
GOT, // Used on 32-bit ELF when in pic mode.
RIPRel, // Used on X86-64 when in pic mode.
None // Set when not in pic mode.
};
-}
-class X86Subtarget final : public X86GenSubtargetInfo {
+} // end namespace PICStyles
+class X86Subtarget final : public X86GenSubtargetInfo {
protected:
enum X86SSEEnum {
NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
@@ -55,7 +58,7 @@ protected:
};
enum X86ProcFamilyEnum {
- Others, IntelAtom, IntelSLM
+ Others, IntelAtom, IntelSLM, IntelGLM
};
/// X86 processor family: Intel Atom, and others
@@ -96,10 +99,13 @@ protected:
/// Target has XSAVE instructions
bool HasXSAVE;
+
/// Target has XSAVEOPT instructions
bool HasXSAVEOPT;
+
/// Target has XSAVEC instructions
bool HasXSAVEC;
+
/// Target has XSAVES instructions
bool HasXSAVES;
@@ -118,6 +124,9 @@ protected:
/// Target has TBM instructions.
bool HasTBM;
+ /// Target has LWP instructions
+ bool HasLWP;
+
/// True if the processor has the MOVBE instruction.
bool HasMOVBE;
@@ -148,9 +157,6 @@ protected:
/// Processor has RTM instructions.
bool HasRTM;
- /// Processor has HLE.
- bool HasHLE;
-
/// Processor has ADX instructions.
bool HasADX;
@@ -169,6 +175,9 @@ protected:
/// Processor has MONITORX/MWAITX instructions.
bool HasMWAITX;
+ /// Processor has Cache Line Zero instruction
+ bool HasCLZERO;
+
/// Processor has Prefetch with intent to Write instruction
bool HasPFPREFETCHWT1;
@@ -201,8 +210,8 @@ protected:
bool UseLeaForSP;
/// True if there is no performance penalty to writing only the lower parts
- /// of a YMM register without clearing the upper part.
- bool HasFastPartialYMMWrite;
+ /// of a YMM or ZMM register without clearing the upper part.
+ bool HasFastPartialYMMorZMMWrite;
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
@@ -223,6 +232,12 @@ protected:
/// True if LZCNT instruction is fast.
bool HasFastLZCNT;
+ /// True if SHLD based rotate is fast.
+ bool HasFastSHLDRotate;
+
+ /// True if the processor has enhanced REP MOVSB/STOSB.
+ bool HasERMSB;
+
/// True if short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@@ -238,6 +253,11 @@ protected:
/// True if the LEA instruction with certain arguments is slow
bool SlowLEA;
+ /// True if the LEA instruction has all three source operands: base, index,
+ /// and offset or if the LEA instruction uses base and index registers where
+ /// the base is EBP, RBP, or R13
+ bool Slow3OpsLEA;
+
/// True if INC and DEC instructions are slow when writing to flags
bool SlowIncDec;
@@ -250,6 +270,9 @@ protected:
/// Processor has AVX-512 Conflict Detection Instructions
bool HasCDI;
+ /// Processor has AVX-512 population count Instructions
+ bool HasVPOPCNTDQ;
+
/// Processor has AVX-512 Doubleword and Quadword instructions
bool HasDQI;
@@ -265,24 +288,12 @@ protected:
/// Processor supports MPX - Memory Protection Extensions
bool HasMPX;
- /// Processor supports Invalidate Process-Context Identifier
- bool HasInvPCId;
-
- /// Processor has VM Functions
- bool HasVMFUNC;
-
- /// Processor has Supervisor Mode Access Protection
- bool HasSMAP;
-
/// Processor has Software Guard Extensions
bool HasSGX;
/// Processor supports Flush Cache Line instruction
bool HasCLFLUSHOPT;
- /// Processor has Persistent Commit feature
- bool HasPCOMMIT;
-
/// Processor supports Cache Line Write Back instruction
bool HasCLWB;
@@ -307,8 +318,8 @@ protected:
/// This is used to avoid ifndefs spreading around while GISel is
/// an optional library.
std::unique_ptr<GISelAccessor> GISel;
-private:
+private:
/// Override the stack alignment.
unsigned StackAlignOverride;
@@ -341,13 +352,17 @@ public:
const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
+
const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
const X86FrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
+
const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
+
const X86RegisterInfo *getRegisterInfo() const override {
return &getInstrInfo()->getRegisterInfo();
}
@@ -370,12 +385,14 @@ public:
const InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
+
private:
/// Initialize the full set of dependencies so we can use an initializer
/// list for X86Subtarget.
X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void initializeEnvironment();
void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
public:
/// Is this x86_64? (disregarding specific ABI / programming model)
bool is64Bit() const {
@@ -432,11 +449,12 @@ public:
bool hasPCLMUL() const { return HasPCLMUL; }
// Prefer FMA4 to FMA - it's better for commutation/memory folding and
// has equal or better performance on all supported targets.
- bool hasFMA() const { return HasFMA && !HasFMA4; }
+ bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; }
bool hasFMA4() const { return HasFMA4; }
- bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
+ bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
bool hasTBM() const { return HasTBM; }
+ bool hasLWP() const { return HasLWP; }
bool hasMOVBE() const { return HasMOVBE; }
bool hasRDRAND() const { return HasRDRAND; }
bool hasF16C() const { return HasF16C; }
@@ -447,13 +465,13 @@ public:
bool hasVBMI() const { return HasVBMI; }
bool hasIFMA() const { return HasIFMA; }
bool hasRTM() const { return HasRTM; }
- bool hasHLE() const { return HasHLE; }
bool hasADX() const { return HasADX; }
bool hasSHA() const { return HasSHA; }
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool hasMWAITX() const { return HasMWAITX; }
+ bool hasCLZERO() const { return HasCLZERO; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isPMULLDSlow() const { return IsPMULLDSlow; }
@@ -462,18 +480,24 @@ public:
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
- bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+ bool hasFastPartialYMMorZMMWrite() const {
+ return HasFastPartialYMMorZMMWrite;
+ }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
+ bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+ bool hasERMSB() const { return HasERMSB; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
bool callRegIndirect() const { return CallRegIndirect; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
+ bool slow3OpsLEA() const { return Slow3OpsLEA; }
bool slowIncDec() const { return SlowIncDec; }
bool hasCDI() const { return HasCDI; }
+ bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; }
bool hasPFI() const { return HasPFI; }
bool hasERI() const { return HasERI; }
bool hasDQI() const { return HasDQI; }
@@ -481,8 +505,9 @@ public:
bool hasVLX() const { return HasVLX; }
bool hasPKU() const { return HasPKU; }
bool hasMPX() const { return HasMPX; }
+ bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
- virtual bool isXRaySupported() const override { return is64Bit(); }
+ bool isXRaySupported() const override { return is64Bit(); }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
@@ -513,6 +538,7 @@ public:
bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
+ bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
bool isTargetWindowsMSVC() const {
return TargetTriple.isWindowsMSVCEnvironment();
@@ -571,7 +597,7 @@ public:
case CallingConv::Intel_OCL_BI:
return isTargetWin64();
// This convention allows using the Win64 convention on other targets.
- case CallingConv::X86_64_Win64:
+ case CallingConv::Win64:
return true;
// This convention allows using the SysV convention on Windows targets.
case CallingConv::X86_64_SysV:
@@ -616,6 +642,9 @@ public:
/// Enable the MachineScheduler pass for all X86 subtargets.
bool enableMachineScheduler() const override { return true; }
+ // TODO: Update the regression tests and return true.
+ bool supportPrintSchedInfo() const override { return false; }
+
bool enableEarlyIfConversion() const override;
/// Return the instruction itineraries based on the subtarget selection.
@@ -628,6 +657,6 @@ public:
}
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
index aa5cfc6..08c2cda 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -11,22 +11,43 @@
//
//===----------------------------------------------------------------------===//
-#include "X86TargetMachine.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86CallLowering.h"
+#include "X86LegalizerInfo.h"
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "X86TargetTransformInfo.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
+#include <memory>
+#include <string>
+
using namespace llvm;
static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
@@ -34,8 +55,12 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::init(true), cl::Hidden);
namespace llvm {
+
void initializeWinEHStatePassPass(PassRegistry &);
-}
+void initializeFixupLEAPassPass(PassRegistry &);
+void initializeX86ExecutionDepsFixPass(PassRegistry &);
+
+} // end namespace llvm
extern "C" void LLVMInitializeX86Target() {
// Register the target.
@@ -47,27 +72,31 @@ extern "C" void LLVMInitializeX86Target() {
initializeWinEHStatePassPass(PR);
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
+ initializeFixupLEAPassPass(PR);
+ initializeX86ExecutionDepsFixPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO()) {
if (TT.getArch() == Triple::x86_64)
- return make_unique<X86_64MachoTargetObjectFile>();
- return make_unique<TargetLoweringObjectFileMachO>();
+ return llvm::make_unique<X86_64MachoTargetObjectFile>();
+ return llvm::make_unique<TargetLoweringObjectFileMachO>();
}
if (TT.isOSFreeBSD())
- return make_unique<X86FreeBSDTargetObjectFile>();
- if (TT.isOSLinux() || TT.isOSNaCl())
- return make_unique<X86LinuxNaClTargetObjectFile>();
+ return llvm::make_unique<X86FreeBSDTargetObjectFile>();
+ if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
+ return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
+ if (TT.isOSSolaris())
+ return llvm::make_unique<X86SolarisTargetObjectFile>();
if (TT.isOSFuchsia())
- return make_unique<X86FuchsiaTargetObjectFile>();
+ return llvm::make_unique<X86FuchsiaTargetObjectFile>();
if (TT.isOSBinFormatELF())
- return make_unique<X86ELFTargetObjectFile>();
+ return llvm::make_unique<X86ELFTargetObjectFile>();
if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
- return make_unique<X86WindowsTargetObjectFile>();
+ return llvm::make_unique<X86WindowsTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
- return make_unique<TargetLoweringObjectFileCOFF>();
+ return llvm::make_unique<TargetLoweringObjectFileCOFF>();
llvm_unreachable("unknown subtarget type");
}
@@ -177,31 +206,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
initAsmInfo();
}
-X86TargetMachine::~X86TargetMachine() {}
+X86TargetMachine::~X86TargetMachine() = default;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-struct X86GISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CL;
- X86GISelActualAccessor(CallLowering* CL): CL(CL) {}
- const CallLowering *getCallLowering() const override {
- return CL.get();
- }
- const InstructionSelector *getInstructionSelector() const override {
- //TODO: Implement
- return nullptr;
- }
- const LegalizerInfo *getLegalizerInfo() const override {
- //TODO: Implement
- return nullptr;
- }
- const RegisterBankInfo *getRegBankInfo() const override {
- //TODO: Implement
- return nullptr;
- }
-};
-} // End anonymous namespace.
-#endif
const X86Subtarget *
X86TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -241,13 +247,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
Options.StackAlignmentOverride);
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- X86GISelActualAccessor *GISel = new X86GISelActualAccessor(
- new X86CallLowering(*I->getTargetLowering()));
-#endif
- I->setGISelAccessor(*GISel);
}
return I.get();
}
@@ -270,16 +269,16 @@ TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
});
}
-
//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
namespace {
+
/// X86 Code Generator Pass Configuration Options.
class X86PassConfig : public TargetPassConfig {
public:
- X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
+ X86PassConfig(X86TargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
X86TargetMachine &getX86TargetMachine() const {
@@ -289,7 +288,7 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
- DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+ DAG->addMutation(createX86MacroFusionDAGMutation());
return DAG;
}
@@ -301,26 +300,40 @@ public:
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
#endif
-bool addILPOpts() override;
+ bool addILPOpts() override;
bool addPreISel() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
void addPreSched2() override;
};
-} // namespace
+
+class X86ExecutionDepsFix : public ExecutionDepsFix {
+public:
+ static char ID;
+ X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {}
+ StringRef getPassName() const override {
+ return "X86 Execution Dependency Fix";
+ }
+};
+char X86ExecutionDepsFix::ID;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix",
+ "X86 Execution Dependency Fix", false, false)
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new X86PassConfig(this, PM);
+ return new X86PassConfig(*this, PM);
}
void X86PassConfig::addIRPasses() {
- addPass(createAtomicExpandPass(&getX86TargetMachine()));
+ addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
}
bool X86PassConfig::addInstSelector() {
@@ -343,17 +356,17 @@ bool X86PassConfig::addIRTranslator() {
}
bool X86PassConfig::addLegalizeMachineIR() {
- //TODO: Implement
+ addPass(new Legalizer());
return false;
}
bool X86PassConfig::addRegBankSelect() {
- //TODO: Implement
+ addPass(new RegBankSelect());
return false;
}
bool X86PassConfig::addGlobalInstructionSelect() {
- //TODO: Implement
+ addPass(new InstructionSelect());
return false;
}
#endif
@@ -362,6 +375,7 @@ bool X86PassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);
if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);
+ addPass(createX86CmovConverterPass());
return true;
}
@@ -375,6 +389,7 @@ bool X86PassConfig::addPreISel() {
void X86PassConfig::addPreRegAlloc() {
if (getOptLevel() != CodeGenOpt::None) {
+ addPass(&LiveRangeShrinkID);
addPass(createX86FixupSetCC());
addPass(createX86OptimizeLEAs());
addPass(createX86CallFrameOptimization());
@@ -391,7 +406,7 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
void X86PassConfig::addPreEmitPass() {
if (getOptLevel() != CodeGenOpt::None)
- addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass));
+ addPass(new X86ExecutionDepsFix());
if (UseVZeroUpper)
addPass(createX86IssueVZeroUpperPass());
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
index d756d07..c162079 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h
@@ -13,14 +13,20 @@
#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
-#include "X86InstrInfo.h"
+
#include "X86Subtarget.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
class StringRef;
+class X86Subtarget;
+class X86RegisterBankInfo;
class X86TargetMachine final : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
@@ -32,17 +38,26 @@ public:
Optional<Reloc::Model> RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
~X86TargetMachine() override;
+
const X86Subtarget *getSubtargetImpl(const Function &F) const override;
+ // The no-argument getSubtargetImpl, while it exists on some targets, is
+ // deprecated and should not be used.
+ const X86Subtarget *getSubtargetImpl() const = delete;
TargetIRAnalysis getTargetIRAnalysis() override;
// Set up the pass pipeline.
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ bool isMachineVerifierClean() const override {
+ return false;
+ }
};
-} // End llvm namespace
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
index 7f70829..8627c06 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -9,6 +9,8 @@
#include "X86TargetObjectFile.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCContext.h"
@@ -16,8 +18,6 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/COFF.h"
-#include "llvm/Support/Dwarf.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
@@ -86,6 +86,12 @@ X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
InitializeELF(TM.Options.UseInitArray);
}
+void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx,
+ const TargetMachine &TM) {
+ TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
+}
+
const MCExpr *X86WindowsTargetObjectFile::lowerRelativeReference(
const GlobalValue *LHS, const GlobalValue *RHS,
const TargetMachine &TM) const {
diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
index 39d2e84..f6aa570 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -66,6 +66,11 @@ namespace llvm {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
};
+ /// \brief This implementation is used for Solaris on x86/x86-64.
+ class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile {
+ void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+ };
+
/// \brief This implementation is used for Windows targets on x86 and x86-64.
class X86WindowsTargetObjectFile : public TargetLoweringObjectFileCOFF {
const MCExpr *
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 5715d82..c9924f2 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
return 8;
}
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasAVX512())
return 512;
@@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
return 32;
}
+unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
+ return getRegisterBitWidth(true);
+}
+
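The new hook simply forwards to the vector register width, so the load/store vectorizer works in 128-, 256-, or 512-bit chunks depending on the ISA level. A condensed sketch of the dispatch it inherits from getRegisterBitWidth above, with assumed feature flags standing in for the real subtarget queries (illustrative only):

// Rough shape of the vector branch of getRegisterBitWidth: the widest
// legal vector register for the enabled feature set.
static unsigned vecRegBits(bool HasAVX512, bool HasAVX, bool HasSSE1) {
  if (HasAVX512) return 512;
  if (HasAVX)    return 256;
  if (HasSSE1)   return 128;
  return 0; // no vector registers available
}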
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
// If the loop will not be vectorized, don't interleave the loop.
// Let the regular unroller unroll the loop, which saves the overflow
@@ -114,7 +118,7 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
+ unsigned Opcode, Type *Ty,
TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
@@ -138,10 +142,15 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::v2f64, 69 }, // divpd
{ ISD::FADD, MVT::v2f64, 2 }, // addpd
{ ISD::FSUB, MVT::v2f64, 2 }, // subpd
- // v2i64/v4i64 mul is custom lowered as a series of long
- // multiplies(3), shifts(3) and adds(2).
- // slm muldq version throughput is 2
- { ISD::MUL, MVT::v2i64, 11 },
+ // v2i64/v4i64 mul is custom lowered as a series of
+ // long multiplies(3), shifts(3) and adds(2).
+ // slm muldq version throughput is 2 and addq throughput is 4,
+ // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
+ // 2X4 (addq throughput) = 17
+ { ISD::MUL, MVT::v2i64, 17 },
+ // slm addq/subq throughput is 4
+ { ISD::ADD, MVT::v2i64, 4 },
+ { ISD::SUB, MVT::v2i64, 4 },
};
if (ST->isSLM()) {
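The 17 in the new MUL entry follows directly from the throughputs quoted in the comment above: three long multiplies at 2, three shifts at 1, and two adds at 4. A compile-time restatement of that arithmetic (purely illustrative, not part of the diff):

// 3 long multiplies @ 2 + 3 shifts @ 1 + 2 adds @ 4 = 17 (SLM v2i64 MUL).
static_assert(3 * 2 + 3 * 1 + 2 * 4 == 17, "SLM v2i64 MUL cost breakdown");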
@@ -207,6 +216,10 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry AVX512UniformConstCostTable[] = {
+ { ISD::SRA, MVT::v2i64, 1 },
+ { ISD::SRA, MVT::v4i64, 1 },
+ { ISD::SRA, MVT::v8i64, 1 },
+
{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
};
@@ -239,35 +252,38 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
- { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
- { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
-
- { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
- { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
- { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
-
- { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
- { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
- { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
- { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
- { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence
- { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
- { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence
- { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
+ { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
+ { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
+
+ { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
+ { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
+ { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
+ { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
+ { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
+ { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+ { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
+ { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2()) {
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
- return LT.first * 30;
+ return LT.first * 32;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
- if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
+ // XOP has faster vXi8 shifts.
+ if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) ||
+ !ST->hasXOP())
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
}
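A pattern worth naming: every "+2" in the table above (and in the 256-bit entries that follow) models type legalization splitting a 256-bit vector op into two 128-bit halves plus a fixed extract/insert overhead. A minimal sketch of that accounting, with hypothetical names (not part of the diff):

// Doubled 128-bit cost plus a constant split/merge penalty, matching
// entries such as { ISD::SHL, MVT::v32i8, 4+2 }  // 2*(psllw + pand) + split.
static int splitCost256(int Cost128) {
  const int SplitOverhead = 2; // extract + insert of the upper 128 bits
  return 2 * Cost128 + SplitOverhead;
}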
static const CostTblEntry AVX2UniformCostTable[] = {
@@ -319,6 +335,14 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
{ ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
@@ -347,8 +371,12 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i32, 1 },
+
{ ISD::SHL, MVT::v8i64, 1 },
{ ISD::SRL, MVT::v8i64, 1 },
+
+ { ISD::SRA, MVT::v2i64, 1 },
+ { ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
@@ -410,18 +438,18 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRL, MVT::v2i64, 2 },
{ ISD::SRA, MVT::v2i64, 2 },
// 256bit shifts require splitting if AVX2 didn't catch them above.
- { ISD::SHL, MVT::v32i8, 2 },
- { ISD::SRL, MVT::v32i8, 4 },
- { ISD::SRA, MVT::v32i8, 4 },
- { ISD::SHL, MVT::v16i16, 2 },
- { ISD::SRL, MVT::v16i16, 4 },
- { ISD::SRA, MVT::v16i16, 4 },
- { ISD::SHL, MVT::v8i32, 2 },
- { ISD::SRL, MVT::v8i32, 4 },
- { ISD::SRA, MVT::v8i32, 4 },
- { ISD::SHL, MVT::v4i64, 2 },
- { ISD::SRL, MVT::v4i64, 4 },
- { ISD::SRA, MVT::v4i64, 4 },
+ { ISD::SHL, MVT::v32i8, 2+2 },
+ { ISD::SRL, MVT::v32i8, 4+2 },
+ { ISD::SRA, MVT::v32i8, 4+2 },
+ { ISD::SHL, MVT::v16i16, 2+2 },
+ { ISD::SRL, MVT::v16i16, 4+2 },
+ { ISD::SRA, MVT::v16i16, 4+2 },
+ { ISD::SHL, MVT::v8i32, 2+2 },
+ { ISD::SRL, MVT::v8i32, 4+2 },
+ { ISD::SRA, MVT::v8i32, 4+2 },
+ { ISD::SHL, MVT::v4i64, 2+2 },
+ { ISD::SRL, MVT::v4i64, 4+2 },
+ { ISD::SRA, MVT::v4i64, 4+2 },
};
// Look for XOP lowering tricks.
@@ -431,23 +459,28 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v4i64, 2 }, // psllq.
-
- { ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v4i64, 2 }, // psrlq.
-
- { ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v8i32, 2 }, // psrad.
- { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
- { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
+ { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
+ { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
+ { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
+
+ { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
+ { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
+ { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
+
+ { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
+ { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
+ { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
+ { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
};
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
+
+ // Handle AVX2 uniform v4i64 ISD::SRA; it's not worth a table.
+ if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
+ return LT.first * 4; // 2*psrad + shuffle.
+
if (const auto *Entry =
CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
@@ -561,28 +594,28 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
-
- { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend.
-
- { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend.
-
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
+
+ { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+ { ISD::MUL, MVT::v4i32, 1 } // pmulld
};
if (ST->hasSSE41())
@@ -592,34 +625,33 @@ int X86TTIImpl::getArithmeticInstrCost(
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul.
- { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence.
-
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
-
- { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v8i16, 1 }, // pmullw
- { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
- { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
-
- { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+ { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
+ { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
+
+ { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
+ { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+
+ { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
+ { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular
@@ -804,7 +836,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
{ TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
- { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
+ { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
+
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
+ // + vpblendvb
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb
+ // + vpblendvb
};
if (ST->hasAVX2())
@@ -861,7 +900,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por
+
+ { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb
};
if (ST->hasSSSE3())
@@ -886,7 +928,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
{ TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
- { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
+
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd
};
if (ST->hasSSE2())
@@ -906,7 +951,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
}
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -1272,7 +1318,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
return BaseT::getCastInstrCost(Opcode, Dst, Src);
}
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1338,17 +1385,62 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
+unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
+
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF) {
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed) {
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
// CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll
// CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll
// CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll
+ static const CostTblEntry AVX512CDCostTbl[] = {
+ { ISD::CTLZ, MVT::v8i64, 1 },
+ { ISD::CTLZ, MVT::v16i32, 1 },
+ { ISD::CTLZ, MVT::v32i16, 8 },
+ { ISD::CTLZ, MVT::v64i8, 20 },
+ { ISD::CTLZ, MVT::v4i64, 1 },
+ { ISD::CTLZ, MVT::v8i32, 1 },
+ { ISD::CTLZ, MVT::v16i16, 4 },
+ { ISD::CTLZ, MVT::v32i8, 10 },
+ { ISD::CTLZ, MVT::v2i64, 1 },
+ { ISD::CTLZ, MVT::v4i32, 1 },
+ { ISD::CTLZ, MVT::v8i16, 4 },
+ { ISD::CTLZ, MVT::v16i8, 4 },
+ };
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::BITREVERSE, MVT::v8i64, 5 },
+ { ISD::BITREVERSE, MVT::v16i32, 5 },
+ { ISD::BITREVERSE, MVT::v32i16, 5 },
+ { ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::CTLZ, MVT::v8i64, 23 },
+ { ISD::CTLZ, MVT::v16i32, 22 },
+ { ISD::CTLZ, MVT::v32i16, 18 },
+ { ISD::CTLZ, MVT::v64i8, 17 },
+ { ISD::CTPOP, MVT::v8i64, 7 },
+ { ISD::CTPOP, MVT::v16i32, 11 },
+ { ISD::CTPOP, MVT::v32i16, 9 },
+ { ISD::CTPOP, MVT::v64i8, 6 },
+ { ISD::CTTZ, MVT::v8i64, 10 },
+ { ISD::CTTZ, MVT::v16i32, 14 },
+ { ISD::CTTZ, MVT::v32i16, 12 },
+ { ISD::CTTZ, MVT::v64i8, 9 },
+ };
+ static const CostTblEntry AVX512CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v8i64, 36 },
+ { ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::CTLZ, MVT::v8i64, 29 },
+ { ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTPOP, MVT::v8i64, 16 },
+ { ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTTZ, MVT::v8i64, 20 },
+ { ISD::CTTZ, MVT::v16i32, 28 },
+ };
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
{ ISD::BITREVERSE, MVT::v8i32, 4 },
@@ -1391,25 +1483,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};
static const CostTblEntry AVX1CostTbl[] = {
- { ISD::BITREVERSE, MVT::v4i64, 10 },
- { ISD::BITREVERSE, MVT::v8i32, 10 },
- { ISD::BITREVERSE, MVT::v16i16, 10 },
- { ISD::BITREVERSE, MVT::v32i8, 10 },
+ { ISD::BITREVERSE, MVT::v4i64, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v8i32, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v16i16, 12 }, // 2 x 128-bit Op + extract/insert
+ { ISD::BITREVERSE, MVT::v32i8, 12 }, // 2 x 128-bit Op + extract/insert
{ ISD::BSWAP, MVT::v4i64, 4 },
{ ISD::BSWAP, MVT::v8i32, 4 },
{ ISD::BSWAP, MVT::v16i16, 4 },
- { ISD::CTLZ, MVT::v4i64, 46 },
- { ISD::CTLZ, MVT::v8i32, 36 },
- { ISD::CTLZ, MVT::v16i16, 28 },
- { ISD::CTLZ, MVT::v32i8, 18 },
- { ISD::CTPOP, MVT::v4i64, 14 },
- { ISD::CTPOP, MVT::v8i32, 22 },
- { ISD::CTPOP, MVT::v16i16, 18 },
- { ISD::CTPOP, MVT::v32i8, 12 },
- { ISD::CTTZ, MVT::v4i64, 20 },
- { ISD::CTTZ, MVT::v8i32, 28 },
- { ISD::CTTZ, MVT::v16i16, 24 },
- { ISD::CTTZ, MVT::v32i8, 18 },
+ { ISD::CTLZ, MVT::v4i64, 48 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v8i32, 38 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v16i16, 30 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTLZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v4i64, 16 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v8i32, 24 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v16i16, 20 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTPOP, MVT::v32i8, 14 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v4i64, 22 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v8i32, 30 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v16i16, 26 }, // 2 x 128-bit Op + extract/insert
+ { ISD::CTTZ, MVT::v32i8, 20 }, // 2 x 128-bit Op + extract/insert
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -1418,8 +1510,8 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v4f64, 43 }, // SNB from http://www.agner.org/
};
static const CostTblEntry SSE42CostTbl[] = {
- { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
- { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::f32, 18 }, // Nehalem from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSSE3CostTbl[] = {
{ ISD::BITREVERSE, MVT::v2i64, 5 },
@@ -1443,6 +1535,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTTZ, MVT::v16i8, 9 }
};
static const CostTblEntry SSE2CostTbl[] = {
+ { ISD::BITREVERSE, MVT::v2i64, 29 },
+ { ISD::BITREVERSE, MVT::v4i32, 27 },
+ { ISD::BITREVERSE, MVT::v8i16, 27 },
+ { ISD::BITREVERSE, MVT::v16i8, 20 },
{ ISD::BSWAP, MVT::v2i64, 7 },
{ ISD::BSWAP, MVT::v4i32, 7 },
{ ISD::BSWAP, MVT::v8i16, 7 },
@@ -1462,8 +1558,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSE1CostTbl[] = {
- { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
- { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
+ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+ };
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::BITREVERSE, MVT::i64, 14 }
+ };
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::BITREVERSE, MVT::i32, 14 },
+ { ISD::BITREVERSE, MVT::i16, 14 },
+ { ISD::BITREVERSE, MVT::i8, 11 }
};
unsigned ISD = ISD::DELETED_NODE;
@@ -1495,6 +1599,18 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->hasCDI())
+ if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -1523,12 +1639,19 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
}
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF) {
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+ return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
}
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -1562,22 +1685,8 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
-int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
- assert (Ty->isVectorTy() && "Can only scalarize vectors");
- int Cost = 0;
-
- for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
- if (Insert)
- Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
- if (Extract)
- Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
- }
-
- return Cost;
-}
-
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace) {
+ unsigned AddressSpace, const Instruction *I) {
// Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
@@ -2128,11 +2237,125 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
+bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+ // TODO: We can increase these based on available vector ops.
+ MaxLoadSize = ST->is64Bit() ? 8 : 4;
+ return true;
+}
+
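In effect this lets the memcmp expansion turn small fixed-size memcmp calls into a few wide loads and compares instead of a libc call, using 8-byte loads on 64-bit targets and 4-byte loads on 32-bit ones. A hedged sketch of the code shape this enables for a 16-byte equality test (illustrative C++, not the actual expansion):

#include <cstdint>
#include <cstring>

// Equality of two 16-byte buffers via two 8-byte loads each,
// mirroring MaxLoadSize == 8 on 64-bit subtargets.
static bool equal16(const unsigned char *A, const unsigned char *B) {
  uint64_t A0, B0, A1, B1;
  std::memcpy(&A0, A, 8);      // memcpy models a (possibly unaligned) load
  std::memcpy(&B0, B, 8);
  std::memcpy(&A1, A + 8, 8);
  std::memcpy(&B1, B + 8, 8);
  return ((A0 ^ B0) | (A1 ^ B1)) == 0;
}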
bool X86TTIImpl::enableInterleavedAccessVectorization() {
// TODO: We expect this to be beneficial regardless of arch,
// but there are currently some unexplained performance artifacts on Atom.
// As a temporary solution, disable on Atom.
- return !(ST->isAtom() || ST->isSLM());
+ return !(ST->isAtom());
+}
+
+// Get a cost estimate for interleaved load/store operations for AVX2.
+// \p Factor is the interleaved-access factor (stride) - number of
+// (interleaved) elements in the group.
+// \p Indices contains the indices for a strided load: when the
+// interleaved load has gaps they indicate which elements are used.
+// If Indices is empty (or if the number of indices is equal to the size
+// of the interleaved-access as given in \p Factor) the access has no gaps.
+//
+// As opposed to AVX-512, AVX2 does not have generic shuffles that allow
+// computing the cost using a generic formula as a function of generic
+// shuffles. We therefore use a lookup table instead, filled according to
+// the instruction sequences that codegen currently generates.
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor,
+ ArrayRef<unsigned> Indices,
+ unsigned Alignment,
+ unsigned AddressSpace) {
+
+ // We currently support only fully-interleaved groups, with no gaps.
+ // TODO: Support also strided loads (interleaved-groups with gaps).
+ if (Indices.size() && Indices.size() != Factor)
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // VecTy for interleave memop is <VF*Factor x Elt>.
+ // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
+ // VecTy = <12 x i32>.
+ MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
+
+ // This function can be called with VecTy=<6xi128>, Factor=3, in which case
+ // the VF=2, while v2i128 is an unsupported MVT vector type
+ // (see MachineValueType.h::getVectorVT()).
+ if (!LegalVT.isVector())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ Type *ScalarTy = VecTy->getVectorElementType();
+
+ // Calculate the number of memory operations (NumOfMemOps) required
+ // to load/store the VecTy.
+ unsigned VecTySize = DL.getTypeStoreSize(VecTy);
+ unsigned LegalVTSize = LegalVT.getStoreSize();
+ unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
+
+ // Get the cost of one memory operation.
+ Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
+ LegalVT.getVectorNumElements());
+ unsigned MemOpCost =
+ getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+
+ VectorType *VT = VectorType::get(ScalarTy, VF);
+ EVT ETy = TLI->getValueType(DL, VT);
+ if (!ETy.isSimple())
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
+ // TODO: Complete for other data-types and strides.
+ // Each combination of Stride, ElementTy and VF results in a different
+ // sequence; the cost tables are therefore accessed with:
+ // Factor (stride) and VectorType=VFxElemType.
+ // The cost accounts only for the shuffle sequence;
+ // the cost of the loads/stores is accounted for separately.
+ //
+ static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
+ { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
+ { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
+ { 3, MVT::v16i8, 18 }, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
+
+ { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
+ { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
+ { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
+ { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
+ { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ };
+
+ static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
+ { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
+ { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
+ { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+
+ { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
+ { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
+ { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)
+ };
+
+ if (Opcode == Instruction::Load) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ } else {
+ assert(Opcode == Instruction::Store &&
+ "Expected Store Instruction at this point");
+ if (const auto *Entry =
+ CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ }
+
+ return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
}
// Get a cost estimate for interleaved load/store operations and strided loads.
@@ -2243,6 +2466,10 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
+ if (ST->hasAVX2())
+ return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace);
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
index ecaaf95..ad0a0a2 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -33,8 +33,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
const X86Subtarget *ST;
const X86TargetLowering *TLI;
- int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
const X86Subtarget *getST() const { return ST; }
const X86TargetLowering *getTLI() const { return TLI; }
@@ -53,7 +51,8 @@ public:
/// @{
unsigned getNumberOfRegisters(bool Vector);
- unsigned getRegisterBitWidth(bool Vector);
+ unsigned getRegisterBitWidth(bool Vector) const;
+ unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
@@ -63,11 +62,13 @@ public:
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>());
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
@@ -75,10 +76,14 @@ public:
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
const SCEV *Ptr);
+ unsigned getAtomicMemIntrinsicMaxElementSize() const;
+
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF);
+ ArrayRef<Type *> Tys, FastMathFlags FMF,
+ unsigned ScalarizationCostPassed = UINT_MAX);
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF);
+ ArrayRef<Value *> Args, FastMathFlags FMF,
+ unsigned VF = 1);
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
@@ -88,6 +93,9 @@ public:
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace);
+ int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
+ unsigned Factor, ArrayRef<unsigned> Indices,
+ unsigned Alignment, unsigned AddressSpace);
int getIntImmCost(int64_t);
@@ -102,7 +110,7 @@ public:
bool isLegalMaskedScatter(Type *DataType);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
-
+ bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 9766b84..d17dfac 100644
--- a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -56,11 +56,11 @@ namespace {
// Core algorithm state:
// BlockState - Each block is either:
- // - PASS_THROUGH: There are neither YMM dirtying instructions nor
+ // - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
// vzeroupper instructions in this block.
// - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
- // block that will ensure that YMM is clean on exit.
- // - EXITS_DIRTY: An instruction in the block dirties YMM and no
+ // block that will ensure that YMM/ZMM is clean on exit.
+ // - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
// subsequent vzeroupper in the block clears it.
//
// AddedToDirtySuccessors - This flag is raised when a block is added to the
@@ -97,6 +97,7 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
return new VZeroUpperInserter();
}
+#ifndef NDEBUG
const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
switch (ST) {
case PASS_THROUGH: return "Pass-through";
@@ -105,52 +106,56 @@ const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
}
llvm_unreachable("Invalid block exit state.");
}
+#endif
-static bool isYmmReg(unsigned Reg) {
- return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
+/// Thus, there is no need to check for Y/ZMM16 and above.
+static bool isYmmOrZmmReg(unsigned Reg) {
+ return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
+ (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
}
-static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
E = MRI.livein_end(); I != E; ++I)
- if (isYmmReg(I->first))
+ if (isYmmOrZmmReg(I->first))
return true;
return false;
}
-static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
if (!MO.clobbersPhysReg(reg))
return false;
}
+ for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
return true;
}
-static bool hasYmmReg(MachineInstr &MI) {
+static bool hasYmmOrZmmReg(MachineInstr &MI) {
for (const MachineOperand &MO : MI.operands()) {
- if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+ if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
return true;
if (!MO.isReg())
continue;
if (MO.isDebug())
continue;
- if (isYmmReg(MO.getReg()))
+ if (isYmmOrZmmReg(MO.getReg()))
return true;
}
return false;
}
-/// Check if any YMM register will be clobbered by this instruction.
-static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+/// Check if given call instruction has a RegMask operand.
+static bool callHasRegMask(MachineInstr &MI) {
assert(MI.isCall() && "Can only be called on call instructions.");
for (const MachineOperand &MO : MI.operands()) {
- if (!MO.isRegMask())
- continue;
- for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
- if (MO.clobbersPhysReg(reg))
- return true;
- }
+ if (MO.isRegMask())
+ return true;
}
return false;
}
@@ -175,17 +180,20 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
/// Loop over all of the instructions in the basic block, inserting vzeroupper
/// instructions before function calls.
void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
-
// Start by assuming that the block is PASS_THROUGH which implies no unguarded
// calls.
BlockExitState CurState = PASS_THROUGH;
BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
for (MachineInstr &MI : MBB) {
+ bool IsCall = MI.isCall();
+ bool IsReturn = MI.isReturn();
+ bool IsControlFlow = IsCall || IsReturn;
+
// No need for vzeroupper before iret in interrupt handler function,
- // epilogue will restore YMM registers if needed.
- bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
- bool IsControlFlow = MI.isCall() || MI.isReturn();
+ // epilogue will restore YMM/ZMM registers if needed.
+ if (IsX86INTR && IsReturn)
+ continue;
// An existing VZERO* instruction resets the state.
if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
@@ -194,30 +202,30 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
}
// Shortcut: don't need to check regular instructions in dirty state.
- if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
+ if (!IsControlFlow && CurState == EXITS_DIRTY)
continue;
- if (hasYmmReg(MI)) {
- // We found a ymm-using instruction; this could be an AVX instruction,
- // or it could be control flow.
+ if (hasYmmOrZmmReg(MI)) {
+ // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
+ // instruction, or it could be control flow.
CurState = EXITS_DIRTY;
continue;
}
// Check for control-flow out of the current function (which might
// indirectly execute SSE instructions).
- if (!IsControlFlow || IsReturnFromX86INTR)
+ if (!IsControlFlow)
continue;
- // If the call won't clobber any YMM register, skip it as well. It usually
- // happens on helper function calls (such as '_chkstk', '_ftol2') where
- // standard calling convention is not used (RegMask is not used to mark
- // register clobbered and register usage (def/imp-def/use) is well-defined
- // and explicitly specified.
- if (MI.isCall() && !callClobbersAnyYmmReg(MI))
+ // If the call has no RegMask, skip it as well. This usually happens on
+ // helper function calls (such as '_chkstk', '_ftol2') where the standard
+ // calling convention is not used: RegMask is not used to mark registers
+ // clobbered, and register usage (def/imp-def/use) is well-defined and
+ // explicitly specified.
+ if (IsCall && !callHasRegMask(MI))
continue;
- // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+ // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
// registers. In addition, the processor changes back to Clean state, after
// which execution of SSE instructions or AVX instructions has no transition
// penalty. Add the VZEROUPPER instruction before any function call/return
@@ -226,7 +234,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
// predecessor block.
if (CurState == EXITS_DIRTY) {
// After the inserted VZEROUPPER the state becomes clean again, but
- // other YMM may appear before other subsequent calls or even before
+ // other YMM/ZMM uses may appear before subsequent calls or even before
// the end of the BB.
insertVZeroUpper(MI, MBB);
CurState = EXITS_CLEAN;
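The scan above is the transfer function of a small forward dataflow problem: each block's exit state is computed instruction by instruction, and EXITS_DIRTY blocks later push vzeroupper insertions into their successors until a fixpoint is reached. A condensed sketch of the per-instruction transition, reusing the enum names from this file (illustrative only, not the pass itself):

enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };

// One transition step: VZEROALL/VZEROUPPER resets to clean, a YMM/ZMM
// use dirties the state, anything else leaves it unchanged.
static BlockExitState step(BlockExitState S, bool IsVZero, bool UsesYmmOrZmm) {
  if (IsVZero)
    return EXITS_CLEAN;
  if (UsesYmmOrZmm)
    return EXITS_DIRTY;
  return S;
}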
@@ -257,30 +265,32 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
+ if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
EverMadeChange = false;
IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
- bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
-
- // Fast check: if the function doesn't use any ymm registers, we don't need
- // to insert any VZEROUPPER instructions. This is constant-time, so it is
- // cheap in the common case of no ymm use.
- bool YMMUsed = FnHasLiveInYmm;
- if (!YMMUsed) {
- const TargetRegisterClass *RC = &X86::VR256RegClass;
- for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
- i++) {
- if (!MRI.reg_nodbg_empty(*i)) {
- YMMUsed = true;
- break;
+ bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
+
+ // Fast check: if the function doesn't use any ymm/zmm registers, we don't
+ // need to insert any VZEROUPPER instructions. This is constant-time, so it
+ // is cheap in the common case of no ymm/zmm use.
+ bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
+ const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
+ for (auto *RC : RCs) {
+ if (!YmmOrZmmUsed) {
+ for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+ i++) {
+ if (!MRI.reg_nodbg_empty(*i)) {
+ YmmOrZmmUsed = true;
+ break;
+ }
}
}
}
- if (!YMMUsed) {
+ if (!YmmOrZmmUsed) {
return false;
}
@@ -294,9 +304,9 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
processBasicBlock(MBB);
- // If any YMM regs are live-in to this function, add the entry block to the
- // DirtySuccessors list
- if (FnHasLiveInYmm)
+ // If any YMM/ZMM regs are live-in to this function, add the entry block to
+ // the DirtySuccessors list
+ if (FnHasLiveInYmmOrZmm)
addDirtySuccessor(MF.front());
// Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
index bc14630..0c3b343 100644
--- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
/*isVarArg=*/false);
Function *Trampoline =
Function::Create(TrampolineTy, GlobalValue::InternalLinkage,
- Twine("__ehhandler$") + GlobalValue::getRealLinkageName(
+ Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape(
ParentFunc->getName()),
TheModule);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
@@ -412,7 +412,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
// Can't use musttail due to prototype mismatch, but we can use tail.
Call->setTailCall(true);
// Set inreg so we pass it in EAX.
- Call->addAttribute(1, Attribute::InReg);
+ Call->addParamAttr(0, Attribute::InReg);
Builder.CreateRet(Call);
return Trampoline;
}
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
index 500c84d..b03c185 100644
--- a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -12,13 +12,15 @@
//===----------------------------------------------------------------------===//
#include "XCoreInstPrinter.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index dc513f7..8a7efe2 100644
--- a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -15,6 +15,8 @@
#ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
#define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -32,12 +34,14 @@ public:
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
+
private:
void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O);
void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
};
+
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index c5859b7..dd27e7c 100644
--- a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -11,16 +11,20 @@
//
//===----------------------------------------------------------------------===//
-#include "XCoreMCTargetDesc.h"
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
#include "InstPrinter/XCoreInstPrinter.h"
-#include "XCoreMCAsmInfo.h"
+#include "MCTargetDesc/XCoreMCAsmInfo.h"
#include "XCoreTargetStreamer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -79,20 +83,25 @@ static MCInstPrinter *createXCoreMCInstPrinter(const Triple &T,
}
XCoreTargetStreamer::XCoreTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
-XCoreTargetStreamer::~XCoreTargetStreamer() {}
+
+XCoreTargetStreamer::~XCoreTargetStreamer() = default;
namespace {
class XCoreTargetAsmStreamer : public XCoreTargetStreamer {
formatted_raw_ostream &OS;
+
public:
XCoreTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+
void emitCCTopData(StringRef Name) override;
void emitCCTopFunction(StringRef Name) override;
void emitCCBottomData(StringRef Name) override;
void emitCCBottomFunction(StringRef Name) override;
};
+} // end anonymous namespace
+
XCoreTargetAsmStreamer::XCoreTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
: XCoreTargetStreamer(S), OS(OS) {}
@@ -112,7 +121,6 @@ void XCoreTargetAsmStreamer::emitCCBottomData(StringRef Name) {
void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
OS << "\t.cc_bottom " << Name << ".function\n";
}
-}
static MCTargetStreamer *createTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
diff --git a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index ac0f3fe..1dc384f 100644
--- a/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/contrib/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -14,13 +14,13 @@
#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
-#include "llvm/Support/DataTypes.h"
-
namespace llvm {
+
class Target;
+
Target &getTheXCoreTarget();
-} // End llvm namespace
+} // end namespace llvm
// Defines symbolic names for XCore registers. This defines a mapping from
// register name to register number.
@@ -36,4 +36,4 @@ Target &getTheXCoreTarget();
#define GET_SUBTARGETINFO_ENUM
#include "XCoreGenSubtargetInfo.inc"
-#endif
+#endif // LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index b35aa0b..8f7c8a8 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "XCore.h"
#include "InstPrinter/XCoreInstPrinter.h"
+#include "XCore.h"
#include "XCoreInstrInfo.h"
#include "XCoreMCInstLower.h"
#include "XCoreSubtarget.h"
diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index e0e2e03..7846120 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -238,7 +238,7 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
report_fatal_error("emitPrologue unsupported alignment: "
+ Twine(MFI.getMaxAlignment()));
- const AttributeSet &PAL = MF.getFunction()->getAttributes();
+ const AttributeList &PAL = MF.getFunction()->getAttributes();
if (PAL.hasAttrSomewhere(Attribute::Nest))
BuildMI(MBB, MBBI, dl, TII.get(XCore::LDWSP_ru6), XCore::R11).addImm(0);
// FIX: Needs addMemOperand() but can't use getFixedStack() or getStack().
@@ -575,18 +575,17 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const {
assert(RS && "requiresRegisterScavenging failed");
MachineFrameInfo &MFI = MF.getFrameInfo();
- const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ const TargetRegisterClass &RC = XCore::GRRegsRegClass;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
// Reserve slots close to SP or frame pointer for Scavenging spills.
// When using SP for small frames, we don't need any scratch registers.
// When using SP for large frames, we may need 2 scratch registers.
// When using FP, for large or small frames, we may need 1 scratch register.
+ unsigned Size = TRI.getSpillSize(RC);
+ unsigned Align = TRI.getSpillAlignment(RC);
if (XFI->isLargeFrame(MF) || hasFP(MF))
- RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
- RC->getAlignment(),
- false));
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
if (XFI->isLargeFrame(MF) && !hasFP(MF))
- RS->addScavengingFrameIndex(MFI.CreateStackObject(RC->getSize(),
- RC->getAlignment(),
- false));
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
}
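
For reference, the frame-lowering hunk above (and the XCoreMachineFunctionInfo.cpp hunks further down) all migrate off the removed TargetRegisterClass size accessors. A minimal sketch of the new idiom, with the helper name invented for illustration:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;

// Hypothetical helper: create a stack slot sized for a register class using
// the subtarget's TargetRegisterInfo instead of RC->getSize()/getAlignment().
static int createScratchSlot(MachineFunction &MF,
                             const TargetRegisterClass &RC) {
  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.CreateStackObject(TRI.getSpillSize(RC), TRI.getSpillAlignment(RC),
                               /*isSpillSlot=*/false);
}
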
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index 9244d59..1da189c 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -34,6 +34,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -406,9 +407,9 @@ SDValue XCoreTargetLowering::lowerLoadWordFromAlignedBasePlusOffset(
static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
{
- APInt KnownZero, KnownOne;
- DAG.computeKnownBits(Value, KnownZero, KnownOne);
- return KnownZero.countTrailingOnes() >= 2;
+ KnownBits Known;
+ DAG.computeKnownBits(Value, Known);
+ return Known.countMinTrailingZeros() >= 2;
}
SDValue XCoreTargetLowering::
@@ -483,7 +484,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
CallingConv::C, IntPtrTy,
DAG.getExternalSymbol("__misaligned_load",
getPointerTy(DAG.getDataLayout())),
@@ -1130,8 +1131,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo(
unsigned NumBytes = RetCCInfo.getNextStackOffset();
auto PtrVT = getPointerTy(DAG.getDataLayout());
- Chain = DAG.getCALLSEQ_START(Chain,
- DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
SmallVector<SDValue, 12> MemOpChains;
@@ -1601,13 +1601,12 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
if (OutVal.hasOneUse()) {
unsigned BitWidth = OutVal.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(OutVal, DemandedMask) ||
- TLI.SimplifyDemandedBits(OutVal, DemandedMask, KnownZero, KnownOne,
- TLO))
+ if (TLI.ShrinkDemandedConstant(OutVal, DemandedMask, TLO) ||
+ TLI.SimplifyDemandedBits(OutVal, DemandedMask, Known, TLO))
DCI.CommitTargetLoweringOpt(TLO);
}
break;
@@ -1618,13 +1617,12 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
if (Time.hasOneUse()) {
unsigned BitWidth = Time.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLO.ShrinkDemandedConstant(Time, DemandedMask) ||
- TLI.SimplifyDemandedBits(Time, DemandedMask, KnownZero, KnownOne,
- TLO))
+ if (TLI.ShrinkDemandedConstant(Time, DemandedMask, TLO) ||
+ TLI.SimplifyDemandedBits(Time, DemandedMask, Known, TLO))
DCI.CommitTargetLoweringOpt(TLO);
}
break;
@@ -1655,11 +1653,11 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
// fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the
// low bit set
if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
- APInt KnownZero, KnownOne;
+ KnownBits Known;
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
VT.getSizeInBits() - 1);
- DAG.computeKnownBits(N2, KnownZero, KnownOne);
- if ((KnownZero & Mask) == Mask) {
+ DAG.computeKnownBits(N2, Known);
+ if ((Known.Zero & Mask) == Mask) {
SDValue Carry = DAG.getConstant(0, dl, VT);
SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2);
SDValue Ops[] = { Result, Carry };
@@ -1678,11 +1676,11 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
// fold (lsub 0, 0, x) -> x, -x iff x has only the low bit set
if (N0C && N0C->isNullValue() && N1C && N1C->isNullValue()) {
- APInt KnownZero, KnownOne;
+ KnownBits Known;
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
VT.getSizeInBits() - 1);
- DAG.computeKnownBits(N2, KnownZero, KnownOne);
- if ((KnownZero & Mask) == Mask) {
+ DAG.computeKnownBits(N2, Known);
+ if ((Known.Zero & Mask) == Mask) {
SDValue Borrow = N2;
SDValue Result = DAG.getNode(ISD::SUB, dl, VT,
DAG.getConstant(0, dl, VT), N2);
@@ -1694,11 +1692,11 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
// fold (lsub x, 0, y) -> 0, sub x, y iff borrow is unused and y has only the
// low bit set
if (N1C && N1C->isNullValue() && N->hasNUsesOfValue(0, 1)) {
- APInt KnownZero, KnownOne;
+ KnownBits Known;
APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
VT.getSizeInBits() - 1);
- DAG.computeKnownBits(N2, KnownZero, KnownOne);
- if ((KnownZero & Mask) == Mask) {
+ DAG.computeKnownBits(N2, Known);
+ if ((Known.Zero & Mask) == Mask) {
SDValue Borrow = DAG.getConstant(0, dl, VT);
SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2);
SDValue Ops[] = { Result, Borrow };
@@ -1822,19 +1820,19 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
}
void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth) const {
- KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
+ Known.resetAll();
switch (Op.getOpcode()) {
default: break;
case XCoreISD::LADD:
case XCoreISD::LSUB:
if (Op.getResNo() == 1) {
// Top bits of carry / borrow are clear.
- KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
- KnownZero.getBitWidth() - 1);
+ Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
+ Known.getBitWidth() - 1);
}
break;
case ISD::INTRINSIC_W_CHAIN:
@@ -1843,24 +1841,24 @@ void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
switch (IntNo) {
case Intrinsic::xcore_getts:
// High bits are known to be zero.
- KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
- KnownZero.getBitWidth() - 16);
+ Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
+ Known.getBitWidth() - 16);
break;
case Intrinsic::xcore_int:
case Intrinsic::xcore_inct:
// High bits are known to be zero.
- KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
- KnownZero.getBitWidth() - 8);
+ Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
+ Known.getBitWidth() - 8);
break;
case Intrinsic::xcore_testct:
// Result is either 0 or 1.
- KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
- KnownZero.getBitWidth() - 1);
+ Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
+ Known.getBitWidth() - 1);
break;
case Intrinsic::xcore_testwct:
// Result is in the range 0 - 4.
- KnownZero = APInt::getHighBitsSet(KnownZero.getBitWidth(),
- KnownZero.getBitWidth() - 3);
+ Known.Zero = APInt::getHighBitsSet(Known.getBitWidth(),
+ Known.getBitWidth() - 3);
break;
}
}
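
A self-contained sketch of the KnownBits API that replaces the KnownZero/KnownOne APInt pair throughout this file; the predicate names are made up for illustration:

#include "llvm/Support/KnownBits.h"
using namespace llvm;

// A value is word-aligned when at least its two low bits are known zero;
// countMinTrailingZeros() subsumes the old KnownZero.countTrailingOnes() test.
static bool isWordAlignedSketch(const KnownBits &Known) {
  return Known.countMinTrailingZeros() >= 2;
}

// Known.Zero and Known.One are still plain APInts, so mask tests like the
// LADD/LSUB combines above keep working unchanged.
static bool highBitsKnownZero(const KnownBits &Known) {
  APInt Mask = APInt::getHighBitsSet(Known.getBitWidth(),
                                     Known.getBitWidth() - 1);
  return (Known.Zero & Mask) == Mask;
}
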
diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
index 41813bb..452d5b0 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -200,8 +200,8 @@ namespace llvm {
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
+ const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
index f1d52d5..b87ba65 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
+++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.td
@@ -73,9 +73,10 @@ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp,
[SDNPHasChain, SDNPMayLoad]>;
// These are target-independent nodes, but have target-specific formats.
-def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>,
- SDTCisVT<1, i32> ]>;
+ SDTCisVT<1, i32> ]>;
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
@@ -323,9 +324,9 @@ class F2R_np<bits<6> opc, string OpcStr> :
//===----------------------------------------------------------------------===//
let Defs = [SP], Uses = [SP] in {
-def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
- "# ADJCALLSTACKDOWN $amt",
- [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt, i32imm:$amt2),
+ "# ADJCALLSTACKDOWN $amt, $amt2",
+ [(callseq_start timm:$amt, timm:$amt2)]>;
def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
"# ADJCALLSTACKUP $amt1",
[(callseq_end timm:$amt1, timm:$amt2)]>;
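
The TableGen change mirrors the getCALLSEQ_START change in LowerCCCCallTo above: CALLSEQ_START now carries two immediates. A hedged sketch of the C++ call-site side, with the wrapper name invented:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// The new overload takes the pushed-byte count and the callee-popped byte
// count directly; XCore pops nothing, so the second immediate is 0.
static SDValue startCallSequence(SelectionDAG &DAG, SDValue Chain,
                                 unsigned NumBytes, const SDLoc &dl) {
  return DAG.getCALLSEQ_START(Chain, NumBytes, /*OutSize=*/0, dl);
}
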
diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 5cc51cd..87532d1 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -128,11 +128,11 @@ createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
do {
- SmallVector<WeakVH,8> WUsers(CE->user_begin(), CE->user_end());
+ SmallVector<WeakTrackingVH, 8> WUsers(CE->user_begin(), CE->user_end());
std::sort(WUsers.begin(), WUsers.end());
WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
while (!WUsers.empty())
- if (WeakVH WU = WUsers.pop_back_val()) {
+ if (WeakTrackingVH WU = WUsers.pop_back_val()) {
if (PHINode *PN = dyn_cast<PHINode>(WU)) {
for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
if (PN->getIncomingValue(I) == CE) {
@@ -159,12 +159,12 @@ static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
}
static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
- SmallVector<WeakVH,8> WUsers;
+ SmallVector<WeakTrackingVH, 8> WUsers;
for (User *U : GV->users())
if (!isa<Instruction>(U))
- WUsers.push_back(WeakVH(U));
+ WUsers.push_back(WeakTrackingVH(U));
while (!WUsers.empty())
- if (WeakVH WU = WUsers.pop_back_val()) {
+ if (WeakTrackingVH WU = WUsers.pop_back_val()) {
ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
if (!CE || !replaceConstantExprOp(CE, P))
return false;
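
WeakTrackingVH is the new name for the old WeakVH semantics: the handle follows replaceAllUsesWith and nulls out when the value is deleted, which is why both loops above re-test each popped handle. A minimal sketch, with the helper name assumed:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/ValueHandle.h"
using namespace llvm;

// Drain a worklist of tracking handles, skipping values that were deleted
// while earlier entries were being processed.
static void forEachLiveUser(SmallVectorImpl<WeakTrackingVH> &WUsers,
                            function_ref<void(Value *)> Visit) {
  while (!WUsers.empty())
    if (WeakTrackingVH WU = WUsers.pop_back_val())
      Visit(WU); // non-null: still live (possibly RAUW'd to a new value)
}
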
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
index e91536c..75af0e9 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -10,6 +10,7 @@
#include "XCoreMachineFunctionInfo.h"
#include "XCoreInstrInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
@@ -35,13 +36,15 @@ int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) {
if (LRSpillSlotSet) {
return LRSpillSlot;
}
- const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ const TargetRegisterClass &RC = XCore::GRRegsRegClass;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
if (! MF.getFunction()->isVarArg()) {
// A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
- LRSpillSlot = MFI.CreateFixedObject(RC->getSize(), 0, true);
+ LRSpillSlot = MFI.CreateFixedObject(TRI.getSpillSize(RC), 0, true);
} else {
- LRSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+ LRSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC),
+ TRI.getSpillAlignment(RC), true);
}
LRSpillSlotSet = true;
return LRSpillSlot;
@@ -51,9 +54,11 @@ int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) {
if (FPSpillSlotSet) {
return FPSpillSlot;
}
- const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ const TargetRegisterClass &RC = XCore::GRRegsRegClass;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
- FPSpillSlot = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+ FPSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC),
+ TRI.getSpillAlignment(RC), true);
FPSpillSlotSet = true;
return FPSpillSlot;
}
@@ -62,10 +67,13 @@ const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) {
if (EHSpillSlotSet) {
return EHSpillSlot;
}
- const TargetRegisterClass *RC = &XCore::GRRegsRegClass;
+ const TargetRegisterClass &RC = XCore::GRRegsRegClass;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
- EHSpillSlot[0] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
- EHSpillSlot[1] = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), true);
+ unsigned Size = TRI.getSpillSize(RC);
+ unsigned Align = TRI.getSpillAlignment(RC);
+ EHSpillSlot[0] = MFI.CreateStackObject(Size, Align, true);
+ EHSpillSlot[1] = MFI.CreateStackObject(Size, Align, true);
EHSpillSlotSet = true;
return EHSpillSlot;
}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
index cdcc52f..cf469ec 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -14,50 +14,39 @@
#ifndef LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include <cassert>
+#include <utility>
#include <vector>
namespace llvm {
-// Forward declarations
-class Function;
-
/// XCoreFunctionInfo - This class is derived from MachineFunction and contains
/// private XCore target-specific information for each MachineFunction.
class XCoreFunctionInfo : public MachineFunctionInfo {
- virtual void anchor();
- bool LRSpillSlotSet;
+ bool LRSpillSlotSet = false;
int LRSpillSlot;
- bool FPSpillSlotSet;
+ bool FPSpillSlotSet = false;
int FPSpillSlot;
- bool EHSpillSlotSet;
+ bool EHSpillSlotSet = false;
int EHSpillSlot[2];
unsigned ReturnStackOffset;
- bool ReturnStackOffsetSet;
- int VarArgsFrameIndex;
- mutable int CachedEStackSize;
+ bool ReturnStackOffsetSet = false;
+ int VarArgsFrameIndex = 0;
+ mutable int CachedEStackSize = -1;
std::vector<std::pair<MachineBasicBlock::iterator, CalleeSavedInfo>>
SpillLabels;
+ virtual void anchor();
+
public:
- XCoreFunctionInfo() :
- LRSpillSlotSet(false),
- FPSpillSlotSet(false),
- EHSpillSlotSet(false),
- ReturnStackOffsetSet(false),
- VarArgsFrameIndex(0),
- CachedEStackSize(-1) {}
+ XCoreFunctionInfo() = default;
- explicit XCoreFunctionInfo(MachineFunction &MF) :
- LRSpillSlotSet(false),
- FPSpillSlotSet(false),
- EHSpillSlotSet(false),
- ReturnStackOffsetSet(false),
- VarArgsFrameIndex(0),
- CachedEStackSize(-1) {}
+ explicit XCoreFunctionInfo(MachineFunction &MF) {}
- ~XCoreFunctionInfo() {}
+ ~XCoreFunctionInfo() override = default;
void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
@@ -101,6 +90,7 @@ public:
return SpillLabels;
}
};
-} // End llvm namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_XCORE_XCOREMACHINEFUNCTIONINFO_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index c03b0af..646309e 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -35,11 +35,11 @@ SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(Chain)
- .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
- Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol("__memcpy_4",
- TLI.getPointerTy(DAG.getDataLayout())),
- std::move(Args))
+ .setLibCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+ Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(
+ "__memcpy_4", TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args))
.setDiscardResult();
std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index bf3138f..cb23399 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -11,14 +11,18 @@
//===----------------------------------------------------------------------===//
#include "XCoreTargetMachine.h"
+#include "MCTargetDesc/XCoreMCTargetDesc.h"
+#include "XCore.h"
#include "XCoreTargetObjectFile.h"
#include "XCoreTargetTransformInfo.h"
-#include "XCore.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/TargetRegistry.h"
+
using namespace llvm;
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
@@ -38,18 +42,19 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(
T, "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32-f64:32-a:0:32-n32",
TT, CPU, FS, Options, getEffectiveRelocModel(RM), CM, OL),
- TLOF(make_unique<XCoreTargetObjectFile>()),
+ TLOF(llvm::make_unique<XCoreTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
initAsmInfo();
}
-XCoreTargetMachine::~XCoreTargetMachine() {}
+XCoreTargetMachine::~XCoreTargetMachine() = default;
namespace {
+
/// XCore Code Generator Pass Configuration Options.
class XCorePassConfig : public TargetPassConfig {
public:
- XCorePassConfig(XCoreTargetMachine *TM, PassManagerBase &PM)
+ XCorePassConfig(XCoreTargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {}
XCoreTargetMachine &getXCoreTargetMachine() const {
@@ -61,14 +66,15 @@ public:
bool addInstSelector() override;
void addPreEmitPass() override;
};
-} // namespace
+
+} // end anonymous namespace
TargetPassConfig *XCoreTargetMachine::createPassConfig(PassManagerBase &PM) {
- return new XCorePassConfig(this, PM);
+ return new XCorePassConfig(*this, PM);
}
void XCorePassConfig::addIRPasses() {
- addPass(createAtomicExpandPass(&getXCoreTargetMachine()));
+ addPass(createAtomicExpandPass());
TargetPassConfig::addIRPasses();
}
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
index 4bd25bc..a047b3c 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.h
@@ -15,13 +15,19 @@
#define LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
#include "XCoreSubtarget.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <memory>
namespace llvm {
class XCoreTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
XCoreSubtarget Subtarget;
+
public:
XCoreTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -38,6 +44,7 @@ public:
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
TargetIRAnalysis getTargetIRAnalysis() override;
+
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
@@ -45,4 +52,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_XCORE_XCORETARGETMACHINE_H
diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
index ad8693f..c60a262 100644
--- a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -9,10 +9,10 @@
#include "XCoreTargetObjectFile.h"
#include "XCoreSubtarget.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Testing/Support/Error.cpp b/contrib/llvm/lib/Testing/Support/Error.cpp
new file mode 100644
index 0000000..ce0da44
--- /dev/null
+++ b/contrib/llvm/lib/Testing/Support/Error.cpp
@@ -0,0 +1,22 @@
+//===- llvm/Testing/Support/Error.cpp -------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Testing/Support/Error.h"
+
+#include "llvm/ADT/StringRef.h"
+
+using namespace llvm;
+
+llvm::detail::ErrorHolder llvm::detail::TakeError(llvm::Error Err) {
+ bool Succeeded = !static_cast<bool>(Err);
+ std::string Message;
+ if (!Succeeded)
+ Message = toString(std::move(Err));
+ return {Succeeded, Message};
+}
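
TakeError backs the gtest matchers declared in the companion header llvm/Testing/Support/Error.h. A hypothetical test using them, assuming the Succeeded/Failed matchers and EXPECT_THAT_ERROR macro exported by that header:

#include "llvm/Support/Error.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
using namespace llvm;

TEST(ErrorMatchersExample, SucceededAndFailed) {
  EXPECT_THAT_ERROR(Error::success(), Succeeded());
  EXPECT_THAT_ERROR(
      make_error<StringError>("boom", inconvertibleErrorCode()), Failed());
}
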
diff --git a/contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
new file mode 100644
index 0000000..4820b9f
--- /dev/null
+++ b/contrib/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -0,0 +1,183 @@
+//===- DlltoolDriver.cpp - dlltool.exe-compatible driver ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines an interface to a dlltool.exe-compatible driver.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"
+#include "llvm/Object/ArchiveWriter.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/COFFImportFile.h"
+#include "llvm/Object/COFFModuleDefinition.h"
+#include "llvm/Option/Arg.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
+#include "llvm/Support/Path.h"
+
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::object;
+using namespace llvm::COFF;
+
+namespace {
+
+enum {
+ OPT_INVALID = 0,
+#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID,
+#include "Options.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE;
+#include "Options.inc"
+#undef PREFIX
+
+static const llvm::opt::OptTable::Info infoTable[] = {
+#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X7, X8, X9, X10, X11, X12) \
+ {X1, X2, X10, X11, OPT_##ID, llvm::opt::Option::KIND##Class, \
+ X9, X8, OPT_##GROUP, OPT_##ALIAS, X7, X12},
+#include "Options.inc"
+#undef OPTION
+};
+
+class DllOptTable : public llvm::opt::OptTable {
+public:
+ DllOptTable() : OptTable(infoTable, false) {}
+};
+
+} // namespace
+
+std::vector<std::unique_ptr<MemoryBuffer>> OwningMBs;
+
+// Opens a file. Path has to be resolved already.
+// Newly created memory buffers are owned by this driver.
+Optional<MemoryBufferRef> openFile(StringRef Path) {
+ ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> MB = MemoryBuffer::getFile(Path);
+
+ if (std::error_code EC = MB.getError()) {
+ llvm::errs() << "fail openFile: " << EC.message() << "\n";
+ return None;
+ }
+
+ MemoryBufferRef MBRef = MB.get()->getMemBufferRef();
+ OwningMBs.push_back(std::move(MB.get())); // take ownership
+ return MBRef;
+}
+
+static MachineTypes getEmulation(StringRef S) {
+ return StringSwitch<MachineTypes>(S)
+ .Case("i386", IMAGE_FILE_MACHINE_I386)
+ .Case("i386:x86-64", IMAGE_FILE_MACHINE_AMD64)
+ .Case("arm", IMAGE_FILE_MACHINE_ARMNT)
+ .Default(IMAGE_FILE_MACHINE_UNKNOWN);
+}
+
+static std::string getImplibPath(std::string Path) {
+ SmallString<128> Out = StringRef("lib");
+ Out.append(Path);
+ sys::path::replace_extension(Out, ".a");
+ return Out.str();
+}
+
+int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
+ DllOptTable Table;
+ unsigned MissingIndex;
+ unsigned MissingCount;
+ llvm::opt::InputArgList Args =
+ Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount);
+ if (MissingCount) {
+ llvm::errs() << Args.getArgString(MissingIndex) << ": missing argument\n";
+ return 1;
+ }
+
+ // Handle when no input or output is specified
+ if (Args.hasArgNoClaim(OPT_INPUT) ||
+ (!Args.hasArgNoClaim(OPT_d) && !Args.hasArgNoClaim(OPT_l))) {
+ Table.PrintHelp(outs(), ArgsArr[0], "dlltool", false);
+ llvm::outs() << "\nTARGETS: i386, i386:x86-64, arm\n";
+ return 1;
+ }
+
+ if (!Args.hasArgNoClaim(OPT_m) && Args.hasArgNoClaim(OPT_d)) {
+ llvm::errs() << "error: no target machine specified\n"
+ << "supported targets: i386, i386:x86-64, arm\n";
+ return 1;
+ }
+
+ for (auto *Arg : Args.filtered(OPT_UNKNOWN))
+ llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n";
+
+ if (!Args.hasArg(OPT_d)) {
+ llvm::errs() << "no definition file specified\n";
+ return 1;
+ }
+
+ Optional<MemoryBufferRef> MB = openFile(Args.getLastArg(OPT_d)->getValue());
+ if (!MB)
+ return 1;
+
+ if (!MB->getBufferSize()) {
+ llvm::errs() << "definition file empty\n";
+ return 1;
+ }
+
+ COFF::MachineTypes Machine = IMAGE_FILE_MACHINE_UNKNOWN;
+ if (auto *Arg = Args.getLastArg(OPT_m))
+ Machine = getEmulation(Arg->getValue());
+
+ if (Machine == IMAGE_FILE_MACHINE_UNKNOWN) {
+ llvm::errs() << "unknown target\n";
+ return 1;
+ }
+
+ Expected<COFFModuleDefinition> Def =
+ parseCOFFModuleDefinition(*MB, Machine, true);
+
+ if (!Def) {
+ llvm::errs() << "error parsing definition\n"
+ << errorToErrorCode(Def.takeError()).message();
+ return 1;
+ }
+
+ // Do this after the parser because parseCOFFModuleDefinition sets OutputFile.
+ if (auto *Arg = Args.getLastArg(OPT_D))
+ Def->OutputFile = Arg->getValue();
+
+ if (Def->OutputFile.empty()) {
+ llvm::errs() << "no output file specified\n";
+ return 1;
+ }
+
+ std::string Path = Args.getLastArgValue(OPT_l);
+ if (Path.empty())
+ Path = getImplibPath(Def->OutputFile);
+
+ if (Machine == IMAGE_FILE_MACHINE_I386 && Args.getLastArg(OPT_k)) {
+ for (COFFShortExport& E : Def->Exports) {
+ if (E.isWeak() || (!E.Name.empty() && E.Name[0] == '?'))
+ continue;
+ E.SymbolName = E.Name;
+ // Trim off the trailing decoration. Symbols will always have a
+ // starting prefix here (either _ for cdecl/stdcall, @ for fastcall
+ // or ? for C++ functions). (Vectorcall functions also will end up having
+ // a prefix here, even if they shouldn't.)
+ E.Name = E.Name.substr(0, E.Name.find('@', 1));
+ // By making sure E.SymbolName != E.Name for decorated symbols,
+ // writeImportLibrary writes these symbols with the type
+ // IMPORT_NAME_UNDECORATE.
+ }
+ }
+
+ if (writeImportLibrary(Def->OutputFile, Path, Def->Exports, Machine, true))
+ return 1;
+ return 0;
+}
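
A tiny front end is enough to exercise this driver through the entry point declared in llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h; the tool invocation in the comment is just an example:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ToolDrivers/llvm-dlltool/DlltoolDriver.h"

// e.g.: llvm-dlltool -m i386:x86-64 -d foo.def -l libfoo.a
int main(int argc, const char **argv) {
  return llvm::dlltoolDriverMain(llvm::makeArrayRef(argv, argc));
}
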
diff --git a/contrib/llvm/lib/ToolDrivers/llvm-dlltool/Options.td b/contrib/llvm/lib/ToolDrivers/llvm-dlltool/Options.td
new file mode 100644
index 0000000..e78182a
--- /dev/null
+++ b/contrib/llvm/lib/ToolDrivers/llvm-dlltool/Options.td
@@ -0,0 +1,26 @@
+include "llvm/Option/OptParser.td"
+
+def m: JoinedOrSeparate<["-"], "m">, HelpText<"Set target machine">;
+def m_long : JoinedOrSeparate<["--"], "machine">, Alias<m>;
+
+def l: JoinedOrSeparate<["-"], "l">, HelpText<"Generate an import lib">;
+def l_long : JoinedOrSeparate<["--"], "output-lib">, Alias<l>;
+
+def D: JoinedOrSeparate<["-"], "D">, HelpText<"Specify the input DLL Name">;
+def D_long : JoinedOrSeparate<["--"], "dllname">, Alias<D>;
+
+def d: JoinedOrSeparate<["-"], "d">, HelpText<"Input .def File">;
+def d_long : JoinedOrSeparate<["--"], "input-def">, Alias<d>;
+
+def k: Flag<["-"], "k">, HelpText<"Kill @n Symbol from export">;
+def k_alias: Flag<["--"], "kill-at">, Alias<k>;
+
+//==============================================================================
+// The flags below do nothing. They are defined only for dlltool compatibility.
+//==============================================================================
+
+def S: JoinedOrSeparate<["-"], "S">, HelpText<"Assembler">;
+def S_alias: JoinedOrSeparate<["--"], "as">, Alias<S>;
+
+def f: JoinedOrSeparate<["-"], "f">, HelpText<"Assembler Flags">;
+def f_alias: JoinedOrSeparate<["--"], "as-flags">, Alias<f>;
diff --git a/contrib/llvm/lib/LibDriver/LibDriver.cpp b/contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index bcdec4f..f304b9c 100644
--- a/contrib/llvm/lib/LibDriver/LibDriver.cpp
+++ b/contrib/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -12,16 +12,17 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/LibDriver/LibDriver.h"
+#include "llvm/ToolDrivers/llvm-lib/LibDriver.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Option/Option.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/StringSaver.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
+#include "llvm/Support/StringSaver.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -30,7 +31,7 @@ namespace {
enum {
OPT_INVALID = 0,
-#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID,
+#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11, _12) OPT_##ID,
#include "Options.inc"
#undef OPTION
};
@@ -40,11 +41,9 @@ enum {
#undef PREFIX
static const llvm::opt::OptTable::Info infoTable[] = {
-#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \
- { \
- X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \
- OPT_##GROUP, OPT_##ALIAS, X6 \
- },
+#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10, X11) \
+ {X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, \
+ X8, X7, OPT_##GROUP, OPT_##ALIAS, X6, X11},
#include "Options.inc"
#undef OPTION
};
@@ -121,7 +120,7 @@ int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) {
for (auto *Arg : Args.filtered(OPT_UNKNOWN))
llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n";
- if (Args.filtered_begin(OPT_INPUT) == Args.filtered_end()) {
+ if (!Args.hasArgNoClaim(OPT_INPUT)) {
// No input files. To match lib.exe, silently do nothing.
return 0;
}
@@ -143,11 +142,10 @@ int llvm::libDriverMain(llvm::ArrayRef<const char*> ArgsArr) {
});
return 1;
}
- sys::fs::file_magic Magic =
- sys::fs::identify_magic(MOrErr->Buf->getBuffer());
- if (Magic != sys::fs::file_magic::coff_object &&
- Magic != sys::fs::file_magic::bitcode &&
- Magic != sys::fs::file_magic::windows_resource) {
+ llvm::file_magic Magic = llvm::identify_magic(MOrErr->Buf->getBuffer());
+ if (Magic != llvm::file_magic::coff_object &&
+ Magic != llvm::file_magic::bitcode &&
+ Magic != llvm::file_magic::windows_resource) {
llvm::errs() << Arg->getValue()
<< ": not a COFF object, bitcode or resource file\n";
return 1;
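
The file-magic API moved from llvm::sys::fs to the top-level llvm namespace in llvm/BinaryFormat/Magic.h. A sketch of the new spelling in isolation, with the function name invented:

#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Magic.h"
using namespace llvm;

// Accept the same three input kinds the driver checks for above.
static bool isAcceptedLibInput(StringRef Buffer) {
  file_magic Magic = identify_magic(Buffer);
  return Magic == file_magic::coff_object || Magic == file_magic::bitcode ||
         Magic == file_magic::windows_resource;
}
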
diff --git a/contrib/llvm/lib/LibDriver/Options.td b/contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td
index 5a56ef7..5a56ef7 100644
--- a/contrib/llvm/lib/LibDriver/Options.td
+++ b/contrib/llvm/lib/ToolDrivers/llvm-lib/Options.td
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
index a97db6f..3598766 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
@@ -101,7 +101,9 @@ namespace {
struct CoroCleanup : FunctionPass {
static char ID; // Pass identification, replacement for typeid
- CoroCleanup() : FunctionPass(ID) {}
+ CoroCleanup() : FunctionPass(ID) {
+ initializeCoroCleanupPass(*PassRegistry::getPassRegistry());
+ }
std::unique_ptr<Lowerer> L;
@@ -124,6 +126,7 @@ struct CoroCleanup : FunctionPass {
if (!L)
AU.setPreservesAll();
}
+ StringRef getPassName() const override { return "Coroutine Cleanup"; }
};
}
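
The constructor change here (and in CoroEarly and CoroElide below) follows the usual legacy-pass pattern: the INITIALIZE_PASS macro emits an initialize hook, and calling it from the constructor registers the pass even when it is created directly. A generic sketch with made-up names:

#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
using namespace llvm;

namespace llvm {
// Normally declared in InitializePasses.h; emitted by INITIALIZE_PASS below.
void initializeExamplePassPass(PassRegistry &);
} // end namespace llvm

namespace {
struct ExamplePass : FunctionPass {
  static char ID;
  ExamplePass() : FunctionPass(ID) {
    // Self-register so constructing the pass is enough for the PassRegistry
    // to know about it, mirroring the CoroCleanup change above.
    initializeExamplePassPass(*PassRegistry::getPassRegistry());
  }
  StringRef getPassName() const override { return "Example Pass"; }
  bool runOnFunction(Function &) override { return false; }
};
} // end anonymous namespace

char ExamplePass::ID = 0;
INITIALIZE_PASS(ExamplePass, "example-pass", "Example Pass", false, false)
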
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index e8bb0ca..ba05896 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -183,7 +183,9 @@ namespace {
struct CoroEarly : public FunctionPass {
static char ID; // Pass identification, replacement for typeid.
- CoroEarly() : FunctionPass(ID) {}
+ CoroEarly() : FunctionPass(ID) {
+ initializeCoroEarlyPass(*PassRegistry::getPassRegistry());
+ }
std::unique_ptr<Lowerer> L;
@@ -208,6 +210,9 @@ struct CoroEarly : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
}
+ StringRef getPassName() const override {
+ return "Lower early coroutine intrinsics";
+ }
};
}
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index 99974d8..42fd6d7 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -92,7 +92,7 @@ static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
// Given a resume function @f.resume(%f.frame* %frame), returns %f.frame type.
static Type *getFrameType(Function *Resume) {
- auto *ArgType = Resume->getArgumentList().front().getType();
+ auto *ArgType = Resume->arg_begin()->getType();
return cast<PointerType>(ArgType)->getElementType();
}
@@ -127,7 +127,8 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
// is spilled into the coroutine frame and recreate the alignment information
// here. Possibly we will need to do a mini SROA here and break the coroutine
// frame into individual AllocaInst recreating the original alignment.
- auto *Frame = new AllocaInst(FrameTy, "", InsertPt);
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ auto *Frame = new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "", InsertPt);
auto *FrameVoidPtr =
new BitCastInst(Frame, Type::getInt8PtrTy(C), "vFrame", InsertPt);
@@ -257,7 +258,9 @@ static bool replaceDevirtTrigger(Function &F) {
namespace {
struct CoroElide : FunctionPass {
static char ID;
- CoroElide() : FunctionPass(ID) {}
+ CoroElide() : FunctionPass(ID) {
+ initializeCoroElidePass(*PassRegistry::getPassRegistry());
+ }
std::unique_ptr<Lowerer> L;
@@ -300,6 +303,7 @@ struct CoroElide : FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
}
+ StringRef getPassName() const override { return "Coroutine Elision"; }
};
}
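
The extra AllocaInst constructor argument above comes from the data layout's alloca address space. Sketched in isolation, with the helper name made up:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// AllocaInst now takes the address space explicitly rather than assuming 0.
static AllocaInst *createFrameAlloca(Function &F, Type *FrameTy,
                                     Instruction *InsertPt) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  return new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "frame", InsertPt);
}
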
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index bb28558a..85e9003 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -133,6 +133,7 @@ struct SuspendCrossingInfo {
};
} // end anonymous namespace
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
BitVector const &BV) const {
dbgs() << Label << ":";
@@ -151,6 +152,7 @@ LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
}
dbgs() << "\n";
}
+#endif
SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
: Mapping(F) {
@@ -175,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
// consume. Note, that crossing coro.save also requires a spill, as any code
// between coro.save and coro.suspend may resume the coroutine and all of the
// state needs to be saved by that time.
- auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) {
+ auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
BasicBlock *SuspendBlock = BarrierInst->getParent();
auto &B = getBlockData(SuspendBlock);
B.Suspend = true;
@@ -345,6 +347,27 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
return FrameTy;
}
+// We need to make room to insert a spill after initial PHIs, but before
+// the catchswitch instruction. Placing it before violates the requirement
+// that a catchswitch, like all other EHPads, must be the first non-PHI in a
+// block.
+//
+// Split away catchswitch into a separate block and insert in its place:
+//
+// cleanuppad <InsertPt> cleanupret.
+//
+// The cleanupret instruction will act as an insert point for the spill.
+static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) {
+ BasicBlock *CurrentBlock = CatchSwitch->getParent();
+ BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch);
+ CurrentBlock->getTerminator()->eraseFromParent();
+
+ auto *CleanupPad =
+ CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock);
+ auto *CleanupRet =
+ CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock);
+ return CleanupRet;
+}
+
// Replace all alloca and SSA values that are accessed across suspend points
// with GetElementPointer from coroutine frame + loads and stores. Create an
// AllocaSpillBB that will become the new entry block for the resume parts of
@@ -420,15 +443,34 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
report_fatal_error("Coroutines cannot handle non static allocas yet");
} else {
// Otherwise, create a store instruction storing the value into the
- // coroutine frame. For, argument, we will place the store instruction
- // right after the coroutine frame pointer instruction, i.e. bitcase of
- // coro.begin from i8* to %f.frame*. For all other values, the spill is
- // placed immediately after the definition.
- Builder.SetInsertPoint(
- isa<Argument>(CurrentValue)
- ? FramePtr->getNextNode()
- : dyn_cast<Instruction>(E.def())->getNextNode());
+ // coroutine frame.
+
+ Instruction *InsertPt = nullptr;
+ if (isa<Argument>(CurrentValue)) {
+ // For arguments, we will place the store instruction right after
+ // the coroutine frame pointer instruction, i.e. bitcast of
+ // coro.begin from i8* to %f.frame*.
+ InsertPt = FramePtr->getNextNode();
+ } else if (auto *II = dyn_cast<InvokeInst>(CurrentValue)) {
+ // If we are spilling the result of the invoke instruction, split the
+ // normal edge and insert the spill in the new block.
+ auto NewBB = SplitEdge(II->getParent(), II->getNormalDest());
+ InsertPt = NewBB->getTerminator();
+      } else if (isa<PHINode>(CurrentValue)) {
+        // Skip over the PHINodes and EH pad instructions.
+ BasicBlock *DefBlock = cast<Instruction>(E.def())->getParent();
+ if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator()))
+ InsertPt = splitBeforeCatchSwitch(CSI);
+ else
+ InsertPt = &*DefBlock->getFirstInsertionPt();
+ } else {
+ // For all other values, the spill is placed immediately after
+ // the definition.
+ assert(!isa<TerminatorInst>(E.def()) && "unexpected terminator");
+ InsertPt = cast<Instruction>(E.def())->getNextNode();
+ }
+ Builder.SetInsertPoint(InsertPt);
auto *G = Builder.CreateConstInBoundsGEP2_32(
FrameTy, FramePtr, 0, Index,
CurrentValue->getName() + Twine(".spill.addr"));
@@ -477,6 +519,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
return FramePtr;
}
+// Sets the unwind edge of an instruction to a particular successor.
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) {
+ if (auto *II = dyn_cast<InvokeInst>(TI))
+ II->setUnwindDest(Succ);
+ else if (auto *CS = dyn_cast<CatchSwitchInst>(TI))
+ CS->setUnwindDest(Succ);
+ else if (auto *CR = dyn_cast<CleanupReturnInst>(TI))
+ CR->setUnwindDest(Succ);
+ else
+ llvm_unreachable("unexpected terminator instruction");
+}
+
+// Replaces all uses of OldPred with the NewPred block in all PHINodes in a
+// block.
+static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ PHINode *LandingPadReplacement) {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // We manually update the LandingPadReplacement PHINode and it is the last
+ // PHI Node. So, if we find it, we are done.
+ if (LandingPadReplacement == PN)
+ break;
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+  // happens because the BB list of PHI nodes is usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != OldPred)
+ BBIdx = PN->getBasicBlockIndex(OldPred);
+
+ assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!");
+ PN->setIncomingBlock(BBIdx, NewPred);
+ }
+}
+
+// Uses SplitEdge unless the successor block is an EHPad, in which case do EH
+// specific handling.
+static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
+ LandingPadInst *OriginalPad,
+ PHINode *LandingPadReplacement) {
+ auto *PadInst = Succ->getFirstNonPHI();
+ if (!LandingPadReplacement && !PadInst->isEHPad())
+ return SplitEdge(BB, Succ);
+
+ auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ);
+ setUnwindEdgeTo(BB->getTerminator(), NewBB);
+ updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement);
+
+ if (LandingPadReplacement) {
+ auto *NewLP = OriginalPad->clone();
+ auto *Terminator = BranchInst::Create(Succ, NewBB);
+ NewLP->insertBefore(Terminator);
+ LandingPadReplacement->addIncoming(NewLP, NewBB);
+ return NewBB;
+ }
+ Value *ParentPad = nullptr;
+ if (auto *FuncletPad = dyn_cast<FuncletPadInst>(PadInst))
+ ParentPad = FuncletPad->getParentPad();
+ else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(PadInst))
+ ParentPad = CatchSwitch->getParentPad();
+ else
+ llvm_unreachable("handling for other EHPads not implemented yet");
+
+ auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB);
+ CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB);
+ return NewBB;
+}
+
static void rewritePHIs(BasicBlock &BB) {
// For every incoming edge we will create a block holding all
// incoming values in a single PHI node.
@@ -499,9 +613,22 @@ static void rewritePHIs(BasicBlock &BB) {
// TODO: Simplify PHINodes in the basic block to remove duplicate
// predecessors.
+ LandingPadInst *LandingPad = nullptr;
+ PHINode *ReplPHI = nullptr;
+ if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
+ // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
+ // We replace the original landing pad with a PHINode that will collect the
+ // results from all of them.
+ ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+ ReplPHI->takeName(LandingPad);
+ LandingPad->replaceAllUsesWith(ReplPHI);
+ // We will erase the original landing pad at the end of this function after
+ // ehAwareSplitEdge cloned it in the transition blocks.
+ }
+
SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
for (BasicBlock *Pred : Preds) {
- auto *IncomingBB = SplitEdge(Pred, &BB);
+ auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
auto *PN = cast<PHINode>(&BB.front());
do {
@@ -513,7 +640,14 @@ static void rewritePHIs(BasicBlock &BB) {
InputV->addIncoming(V, Pred);
PN->setIncomingValue(Index, InputV);
PN = dyn_cast<PHINode>(PN->getNextNode());
- } while (PN);
+ } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+ // the landing pad.
+ }
+
+ if (LandingPad) {
+    // Calls to the ehAwareSplitEdge function cloned the original landing
+    // pad, so we no longer need it.
+ LandingPad->eraseFromParent();
}
}
@@ -665,9 +799,9 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
splitAround(CSI, "CoroSuspend");
}
- // Put fallthrough CoroEnd into its own block. Note: Shape::buildFrom places
- // the fallthrough coro.end as the first element of CoroEnds array.
- splitAround(Shape.CoroEnds.front(), "CoroEnd");
+ // Put CoroEnds into their own blocks.
+ for (CoroEndInst *CE : Shape.CoroEnds)
+ splitAround(CE, "CoroEnd");
// Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will
// never have its definition separated from the PHI by the suspend point.
@@ -679,21 +813,25 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
IRBuilder<> Builder(F.getContext());
SpillInfo Spills;
- // See if there are materializable instructions across suspend points.
- for (Instruction &I : instructions(F))
- if (materializable(I))
- for (User *U : I.users())
- if (Checker.isDefinitionAcrossSuspend(I, U))
- Spills.emplace_back(&I, U);
-
- // Rewrite materializable instructions to be materialized at the use point.
- std::sort(Spills.begin(), Spills.end());
- DEBUG(dump("Materializations", Spills));
- rewriteMaterializableInstructions(Builder, Spills);
+ for (int Repeat = 0; Repeat < 4; ++Repeat) {
+ // See if there are materializable instructions across suspend points.
+ for (Instruction &I : instructions(F))
+ if (materializable(I))
+ for (User *U : I.users())
+ if (Checker.isDefinitionAcrossSuspend(I, U))
+ Spills.emplace_back(&I, U);
+
+ if (Spills.empty())
+ break;
+
+ // Rewrite materializable instructions to be materialized at the use point.
+ DEBUG(dump("Materializations", Spills));
+ rewriteMaterializableInstructions(Builder, Spills);
+ Spills.clear();
+ }
// Collect the spills for arguments and other not-materializable values.
- Spills.clear();
- for (Argument &A : F.getArgumentList())
+ for (Argument &A : F.args())
for (User *U : A.users())
if (Checker.isDefinitionAcrossSuspend(A, U))
Spills.emplace_back(&A, U);
@@ -714,12 +852,9 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
if (I.getType()->isTokenTy())
report_fatal_error(
"token definition is separated from the use by a suspend point");
- assert(!materializable(I) &&
- "rewriteMaterializable did not do its job");
Spills.emplace_back(&I, U);
}
}
- std::sort(Spills.begin(), Spills.end());
DEBUG(dump("Spills", Spills));
moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
Shape.FrameTy = buildFrameType(F, Shape, Spills);
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h b/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h
index e03cef4..9a8cc5a 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -23,6 +23,9 @@
// the Coroutine library.
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H
+#define LLVM_LIB_TRANSFORMS_COROUTINES_COROINSTR_H
+
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -55,10 +58,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_subfn_addr;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -67,10 +70,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroAllocInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_alloc;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -172,10 +175,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_id;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -184,10 +187,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_frame;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -200,10 +203,10 @@ public:
Value *getFrame() const { return getArgOperand(FrameArg); }
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_free;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -218,10 +221,10 @@ public:
Value *getMem() const { return getArgOperand(MemArg); }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_begin;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -230,10 +233,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroSaveInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_save;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -251,10 +254,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_promise;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -276,10 +279,10 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_suspend;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -288,10 +291,10 @@ public:
class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst {
public:
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_size;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
@@ -307,12 +310,14 @@ public:
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *I) {
+ static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::coro_end;
}
- static inline bool classof(const Value *V) {
+ static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};
} // End namespace llvm.
+
+#endif
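
All of the hunks above apply the same modernization: classof is only ever called through the isa/cast/dyn_cast templates, so the inline keyword on an in-class definition was redundant. The pattern, reduced to a hypothetical intrinsic wrapper:

#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// Made-up wrapper showing the two classof hooks that dyn_cast relies on.
class ExampleCoroInst : public IntrinsicInst {
public:
  static bool classof(const IntrinsicInst *I) {
    return I->getIntrinsicID() == Intrinsic::coro_save;
  }
  static bool classof(const Value *V) {
    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
  }
};

// Usage: isa/dyn_cast consult ExampleCoroInst::classof(const Value *).
static bool isCoroSave(Value *V) { return isa<ExampleCoroInst>(V); }
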
diff --git a/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 7a3f4f6..173dc05 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Transforms/Scalar.h"
@@ -144,6 +145,33 @@ static void replaceFallthroughCoroEnd(IntrinsicInst *End,
BB->getTerminator()->eraseFromParent();
}
+// In Resumers, we replace unwind coro.end with True to force the immediate
+// unwind to caller.
+static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) {
+ if (Shape.CoroEnds.empty())
+ return;
+
+ LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+ auto *True = ConstantInt::getTrue(Context);
+ for (CoroEndInst *CE : Shape.CoroEnds) {
+ if (!CE->isUnwind())
+ continue;
+
+ auto *NewCE = cast<IntrinsicInst>(VMap[CE]);
+
+ // If coro.end has an associated bundle, add cleanupret instruction.
+ if (auto Bundle = NewCE->getOperandBundle(LLVMContext::OB_funclet)) {
+ Value *FromPad = Bundle->Inputs[0];
+ auto *CleanupRet = CleanupReturnInst::Create(FromPad, nullptr, NewCE);
+ NewCE->getParent()->splitBasicBlock(NewCE);
+ CleanupRet->getParent()->getTerminator()->eraseFromParent();
+ }
+
+ NewCE->replaceAllUsesWith(True);
+ NewCE->eraseFromParent();
+ }
+}
+
// Rewrite final suspend point handling. We do not use suspend index to
// represent the final suspend point. Instead we zero-out ResumeFnAddr in the
// coroutine frame, since it is undefined behavior to resume a coroutine
@@ -157,9 +185,9 @@ static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr,
coro::Shape &Shape, SwitchInst *Switch,
bool IsDestroy) {
assert(Shape.HasFinalSuspend);
- auto FinalCase = --Switch->case_end();
- BasicBlock *ResumeBB = FinalCase.getCaseSuccessor();
- Switch->removeCase(FinalCase);
+ auto FinalCaseIt = std::prev(Switch->case_end());
+ BasicBlock *ResumeBB = FinalCaseIt->getCaseSuccessor();
+ Switch->removeCase(FinalCaseIt);
if (IsDestroy) {
BasicBlock *OldSwitchBB = Switch->getParent();
auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch");
@@ -188,26 +216,18 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
Function *NewF =
Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage,
F.getName() + Suffix, M);
- NewF->addAttribute(1, Attribute::NonNull);
- NewF->addAttribute(1, Attribute::NoAlias);
+ NewF->addParamAttr(0, Attribute::NonNull);
+ NewF->addParamAttr(0, Attribute::NoAlias);
ValueToValueMapTy VMap;
// Replace all args with undefs. The buildCoroutineFrame algorithm has already
// rewritten access to the args that occurs after suspend points with loads
// and stores to/from the coroutine frame.
- for (Argument &A : F.getArgumentList())
+ for (Argument &A : F.args())
VMap[&A] = UndefValue::get(A.getType());
SmallVector<ReturnInst *, 4> Returns;
- if (DISubprogram *SP = F.getSubprogram()) {
- // If we have debug info, add mapping for the metadata nodes that should not
- // be cloned by CloneFunctionInfo.
- auto &MD = VMap.MD();
- MD[SP->getUnit()].reset(SP->getUnit());
- MD[SP->getType()].reset(SP->getType());
- MD[SP->getFile()].reset(SP->getFile());
- }
CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns);
// Remove old returns.
@@ -216,10 +236,8 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
// Remove old return attributes.
NewF->removeAttributes(
- AttributeSet::ReturnIndex,
- AttributeSet::get(
- NewF->getContext(), AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewF->getReturnType())));
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewF->getReturnType()));
// Make AllocaSpillBlock the new entry block.
auto *SwitchBB = cast<BasicBlock>(VMap[ResumeEntry]);
@@ -236,7 +254,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
IRBuilder<> Builder(&NewF->getEntryBlock().front());
// Remap frame pointer.
- Argument *NewFramePtr = &NewF->getArgumentList().front();
+ Argument *NewFramePtr = &*NewF->arg_begin();
Value *OldFramePtr = cast<Value>(VMap[Shape.FramePtr]);
NewFramePtr->takeName(OldFramePtr);
OldFramePtr->replaceAllUsesWith(NewFramePtr);
@@ -270,9 +288,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
// Remove coro.end intrinsics.
replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap);
- // FIXME: coming in upcoming patches:
- // replaceUnwindCoroEnds(Shape.CoroEnds, VMap);
-
+ replaceUnwindCoroEnds(Shape, VMap);
// Eliminate coro.free from the clones, replacing it with 'null' in cleanup,
// to suppress deallocation code.
coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]),
@@ -284,8 +300,16 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
}
static void removeCoroEnds(coro::Shape &Shape) {
- for (CoroEndInst *CE : Shape.CoroEnds)
+ if (Shape.CoroEnds.empty())
+ return;
+
+ LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+ auto *False = ConstantInt::getFalse(Context);
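+ // Lower each coro.end to the constant 'false' so remaining users stay
+ // valid once the intrinsic is erased.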
+
+ for (CoroEndInst *CE : Shape.CoroEnds) {
+ CE->replaceAllUsesWith(False);
CE->eraseFromParent();
+ }
}
static void replaceFrameSize(coro::Shape &Shape) {
@@ -477,12 +501,87 @@ static void simplifySuspendPoints(coro::Shape &Shape) {
S.resize(N);
}
+static SmallPtrSet<BasicBlock *, 4> getCoroBeginPredBlocks(CoroBeginInst *CB) {
+ // Collect all blocks that we need to look for instructions to relocate.
+ SmallPtrSet<BasicBlock *, 4> RelocBlocks;
+ SmallVector<BasicBlock *, 4> Work;
+ Work.push_back(CB->getParent());
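+ // Worklist walk over the reverse CFG; visitation order is irrelevant
+ // since only the set of predecessor blocks matters.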
+
+ do {
+ BasicBlock *Current = Work.pop_back_val();
+ for (BasicBlock *BB : predecessors(Current))
+ if (RelocBlocks.count(BB) == 0) {
+ RelocBlocks.insert(BB);
+ Work.push_back(BB);
+ }
+ } while (!Work.empty());
+ return RelocBlocks;
+}
+
+static SmallPtrSet<Instruction *, 8>
+getNotRelocatableInstructions(CoroBeginInst *CoroBegin,
+ SmallPtrSetImpl<BasicBlock *> &RelocBlocks) {
+ SmallPtrSet<Instruction *, 8> DoNotRelocate;
+ // Collect all instructions that we should not relocate
+ SmallVector<Instruction *, 8> Work;
+
+ // Start with CoroBegin and terminators of all preceding blocks.
+ Work.push_back(CoroBegin);
+ BasicBlock *CoroBeginBB = CoroBegin->getParent();
+ for (BasicBlock *BB : RelocBlocks)
+ if (BB != CoroBeginBB)
+ Work.push_back(BB->getTerminator());
+
+ // For every instruction in the Work list, place its operands in the
+ // DoNotRelocate set.
+ do {
+ Instruction *Current = Work.pop_back_val();
+ DoNotRelocate.insert(Current);
+ for (Value *U : Current->operands()) {
+ auto *I = dyn_cast<Instruction>(U);
+ if (!I)
+ continue;
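+ // Allocas are never relocated (relocateInstructionBefore skips them), so
+ // they need not be added to the DoNotRelocate set.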
+ if (isa<AllocaInst>(U))
+ continue;
+ if (DoNotRelocate.count(I) == 0) {
+ Work.push_back(I);
+ DoNotRelocate.insert(I);
+ }
+ }
+ } while (!Work.empty());
+ return DoNotRelocate;
+}
+
+static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) {
+ // Analyze which non-alloca instructions are needed for allocation and
+ // relocate the rest to after coro.begin. We need to do this because the
+ // values those instructions define may be spilled into coroutine frame
+ // memory, which only becomes available after the coro.begin intrinsic.
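+ //
+ // A hypothetical sketch of the effect: given entry-block IR such as
+ //   %v = call i32 @compute()      ; not needed to allocate the frame
+ //   %hdl = call i8* @llvm.coro.begin(token %id, i8* %mem)
+ // the call defining %v is moved to just after coro.begin, so that if %v
+ // is later spilled into the coroutine frame, the frame already exists.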
+
+ auto BlockSet = getCoroBeginPredBlocks(CoroBegin);
+ auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet);
+
+ Instruction *InsertPt = CoroBegin->getNextNode();
+ BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well.
+ for (auto B = BB.begin(), E = BB.end(); B != E;) {
+ Instruction &I = *B++;
+ if (isa<AllocaInst>(&I))
+ continue;
+ if (&I == CoroBegin)
+ break;
+ if (DoNotRelocateSet.count(&I))
+ continue;
+ I.moveBefore(InsertPt);
+ }
+}
+
static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) {
coro::Shape Shape(F);
if (!Shape.CoroBegin)
return;
simplifySuspendPoints(Shape);
+ relocateInstructionBefore(Shape.CoroBegin, F);
buildCoroutineFrame(F, Shape);
replaceFrameSize(Shape);
@@ -582,7 +681,9 @@ namespace {
struct CoroSplit : public CallGraphSCCPass {
static char ID; // Pass identification, replacement for typeid
- CoroSplit() : CallGraphSCCPass(ID) {}
+ CoroSplit() : CallGraphSCCPass(ID) {
+ initializeCoroSplitPass(*PassRegistry::getPassRegistry());
+ }
bool Run = false;
@@ -628,6 +729,7 @@ struct CoroSplit : public CallGraphSCCPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
CallGraphSCCPass::getAnalysisUsage(AU);
}
+ StringRef getPassName() const override { return "Coroutine Splitting"; }
};
}
diff --git a/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 877ec34..44e1f9b 100644
--- a/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/contrib/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -218,6 +218,8 @@ void coro::Shape::buildFrom(Function &F) {
size_t FinalSuspendIndex = 0;
clear(*this);
SmallVector<CoroFrameInst *, 8> CoroFrames;
+ SmallVector<CoroSaveInst *, 2> UnusedCoroSaves;
+
for (Instruction &I : instructions(F)) {
if (auto II = dyn_cast<IntrinsicInst>(&I)) {
switch (II->getIntrinsicID()) {
@@ -229,6 +231,12 @@ void coro::Shape::buildFrom(Function &F) {
case Intrinsic::coro_frame:
CoroFrames.push_back(cast<CoroFrameInst>(II));
break;
+ case Intrinsic::coro_save:
+ // After optimizations, the coro_suspends using this coro_save might have
+ // been removed; remember orphaned coro_saves so we can remove them later.
+ if (II->use_empty())
+ UnusedCoroSaves.push_back(cast<CoroSaveInst>(II));
+ break;
case Intrinsic::coro_suspend:
CoroSuspends.push_back(cast<CoroSuspendInst>(II));
if (CoroSuspends.back()->isFinal()) {
@@ -245,9 +253,9 @@ void coro::Shape::buildFrom(Function &F) {
if (CoroBegin)
report_fatal_error(
"coroutine should have exactly one defining @llvm.coro.begin");
- CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
- CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
- CB->removeAttribute(AttributeSet::FunctionIndex,
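+ // AttributeSet was split into AttributeList (the whole list for a
+ // function or call) and per-index AttributeSets; these positions now
+ // belong to AttributeList.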
+ CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ CB->removeAttribute(AttributeList::FunctionIndex,
Attribute::NoDuplicate);
CoroBegin = CB;
}
@@ -311,4 +319,8 @@ void coro::Shape::buildFrom(Function &F) {
if (HasFinalSuspend &&
FinalSuspendIndex != CoroSuspends.size() - 1)
std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back());
+
+ // Remove orphaned coro.saves.
+ for (CoroSaveInst *CoroSave : UnusedCoroSaves)
+ CoroSave->eraseFromParent();
}
diff --git a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 65b7bad..72bae20 100644
--- a/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -29,8 +29,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -38,6 +39,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
@@ -51,323 +53,404 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include <set>
using namespace llvm;
#define DEBUG_TYPE "argpromotion"
-STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted");
+STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
-STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted");
-STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated");
+STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
-namespace {
- /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
- ///
- struct ArgPromotion : public CallGraphSCCPass {
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
+/// A vector used to hold the indices of a single GEP instruction
+typedef std::vector<uint64_t> IndicesVector;
- bool runOnSCC(CallGraphSCC &SCC) override;
- static char ID; // Pass identification, replacement for typeid
- explicit ArgPromotion(unsigned maxElements = 3)
- : CallGraphSCCPass(ID), maxElements(maxElements) {
- initializeArgPromotionPass(*PassRegistry::getPassRegistry());
- }
+/// doPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function. At this point, we know that it is
+/// safe to do so.
+static Function *
+doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
+ Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ ReplaceCallSite) {
- private:
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has modified arguments.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type *> Params;
- using llvm::Pass::doInitialization;
- bool doInitialization(CallGraph &CG) override;
- /// The maximum number of elements to expand, or 0 for unlimited.
- unsigned maxElements;
- };
-}
+ typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;
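+ // Each entry pairs a source element type with the index list of a GEP;
+ // an empty index list denotes a direct load of the argument.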
-/// A vector used to hold the indices of a single GEP instruction
-typedef std::vector<uint64_t> IndicesVector;
+ // ScalarizedElements - If we are promoting a pointer that has elements
+ // accessed out of it, keep track of which elements are accessed so that we
+ // can add one argument for each.
+ //
+ // Arguments that are directly loaded will have a zero element value here, to
+ // handle cases where there are both a direct load and GEP accesses.
+ //
+ std::map<Argument *, ScalarizeTable> ScalarizedElements;
-static CallGraphNode *
-PromoteArguments(CallGraphNode *CGN, CallGraph &CG,
- function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements);
-static bool isDenselyPacked(Type *type, const DataLayout &DL);
-static bool canPaddingBeAccessed(Argument *Arg);
-static bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, AAResults &AAR,
- unsigned MaxElements);
-static CallGraphNode *
-DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG);
+ // OriginalLoads - Keep track of a representative load instruction from the
+ // original function so that we can tell the alias analysis implementation
+ // what the new GEP/Load instructions we are inserting look like.
+ // We need to keep the original loads for each argument and the elements
+ // of the argument that are accessed.
+ std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
-char ArgPromotion::ID = 0;
-INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false, false)
+ // Attribute - Keep track of the parameter attributes for the arguments
+ // that we are *not* promoting. For the ones that we do promote, the parameter
+ // attributes are lost
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ AttributeList PAL = F->getAttributes();
-Pass *llvm::createArgumentPromotionPass(unsigned maxElements) {
- return new ArgPromotion(maxElements);
-}
+ // First, determine the new argument list
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgNo) {
+ if (ByValArgsToTransform.count(&*I)) {
+ // Simple byval argument? Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Params.insert(Params.end(), STy->element_begin(), STy->element_end());
+ ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
+ AttributeSet());
+ ++NumByValArgsPromoted;
+ } else if (!ArgsToPromote.count(&*I)) {
+ // Unchanged argument
+ Params.push_back(I->getType());
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
+ } else if (I->use_empty()) {
+ // Dead argument (which are always marked as promotable)
+ ++NumArgumentsDead;
-static bool runImpl(CallGraphSCC &SCC, CallGraph &CG,
- function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements) {
- bool Changed = false, LocalChange;
+ // There may be remaining metadata uses of the argument for things like
+ // llvm.dbg.value. Replace them with undef.
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ } else {
+ // Okay, this is being promoted. This means that the only uses are loads
+ // or GEPs which are only used by loads
- do { // Iterate until we stop promoting from this SCC.
- LocalChange = false;
- // Attempt to promote arguments from all functions in this SCC.
- for (CallGraphNode *OldNode : SCC) {
- if (CallGraphNode *NewNode =
- PromoteArguments(OldNode, CG, AARGetter, MaxElements)) {
- LocalChange = true;
- SCC.ReplaceNode(OldNode, NewNode);
+ // In this table, we will track which indices are loaded from the argument
+ // (where direct loads are tracked as no indices).
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ for (User *U : I->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ Type *SrcTy;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ SrcTy = L->getType();
+ else
+ SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
+ IndicesVector Indices;
+ Indices.reserve(UI->getNumOperands() - 1);
+ // Since loads will only have a single operand, and GEPs only a single
+ // non-index operand, this will record direct loads without any indices,
+ // and gep+loads with the GEP indices.
+ for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
+ II != IE; ++II)
+ Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Indices.size() == 1 && Indices.front() == 0)
+ Indices.clear();
+ ArgIndices.insert(std::make_pair(SrcTy, Indices));
+ LoadInst *OrigLoad;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ OrigLoad = L;
+ else
+ // Take any load; we will use it only to update alias analysis.
+ OrigLoad = cast<LoadInst>(UI->user_back());
+ OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
}
- }
- Changed |= LocalChange; // Remember that we changed something.
- } while (LocalChange);
-
- return Changed;
-}
-bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
+ // Add a parameter to the function for each element passed in.
+ for (const auto &ArgIndex : ArgIndices) {
+ // not allowed to dereference ->begin() if size() is 0
+ Params.push_back(GetElementPtrInst::getIndexedType(
+ cast<PointerType>(I->getType()->getScalarType())->getElementType(),
+ ArgIndex.second));
+ ArgAttrVec.push_back(AttributeSet());
+ assert(Params.back());
+ }
- // Get the callgraph information that we need to update to reflect our
- // changes.
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
+ ++NumArgumentsPromoted;
+ else
+ ++NumAggregatesPromoted;
+ }
+ }
- // We compute dedicated AA results for each function in the SCC as needed. We
- // use a lambda referencing external objects so that they live long enough to
- // be queried, but we re-use them each time.
- Optional<BasicAAResult> BAR;
- Optional<AAResults> AAR;
- auto AARGetter = [&](Function &F) -> AAResults & {
- BAR.emplace(createLegacyPMBasicAAResult(*this, F));
- AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
- return *AAR;
- };
-
- return runImpl(SCC, CG, AARGetter, maxElements);
-}
+ Type *RetTy = FTy->getReturnType();
-/// \brief Checks if a type could have padding bytes.
-static bool isDenselyPacked(Type *type, const DataLayout &DL) {
+ // Construct the new function type using the new arguments.
+ FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
- // There is no size information, so be conservative.
- if (!type->isSized())
- return false;
+ // Create the new function body and insert it into the module.
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
+ NF->copyAttributesFrom(F);
- // If the alloc size is not equal to the storage size, then there are padding
- // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
- if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
- return false;
+ // Patch the pointer to LLVM function in debug info descriptor.
+ NF->setSubprogram(F->getSubprogram());
+ F->setSubprogram(nullptr);
- if (!isa<CompositeType>(type))
- return true;
+ DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
+ << "From: " << *F);
- // For homogenous sequential types, check for padding within members.
- if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
- return isDenselyPacked(seqTy->getElementType(), DL);
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrVec));
+ ArgAttrVec.clear();
- // Check for padding within and between elements of a struct.
- StructType *StructTy = cast<StructType>(type);
- const StructLayout *Layout = DL.getStructLayout(StructTy);
- uint64_t StartPos = 0;
- for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
- Type *ElTy = StructTy->getElementType(i);
- if (!isDenselyPacked(ElTy, DL))
- return false;
- if (StartPos != Layout->getElementOffsetInBits(i))
- return false;
- StartPos += DL.getTypeAllocSizeInBits(ElTy);
- }
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
- return true;
-}
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in the loaded pointers.
+ //
+ SmallVector<Value *, 16> Args;
+ while (!F->use_empty()) {
+ CallSite CS(F->user_back());
+ assert(CS.getCalledFunction() == F);
+ Instruction *Call = CS.getInstruction();
+ const AttributeList &CallPAL = CS.getAttributes();
-/// \brief Checks if the padding bytes of an argument could be accessed.
-static bool canPaddingBeAccessed(Argument *arg) {
+ // Loop over the operands, inserting GEP and loads in the caller as
+ // appropriate.
+ CallSite::arg_iterator AI = CS.arg_begin();
+ ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++AI, ++ArgNo)
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ Args.push_back(*AI); // Unmodified argument
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ } else if (ByValArgsToTransform.count(&*I)) {
+ // Emit a GEP and load for each element of the struct.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {
+ ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);
+ // TODO: Tell AA about the new values?
+ Args.push_back(new LoadInst(Idx, Idx->getName() + ".val", Call));
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ } else if (!I->use_empty()) {
+ // Non-dead argument: insert GEPs and loads as appropriate.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ // Store the Value* version of the indices in here, but declare it now
+ // for reuse.
+ std::vector<Value *> Ops;
+ for (const auto &ArgIndex : ArgIndices) {
+ Value *V = *AI;
+ LoadInst *OrigLoad =
+ OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
+ if (!ArgIndex.second.empty()) {
+ Ops.reserve(ArgIndex.second.size());
+ Type *ElTy = V->getType();
+ for (auto II : ArgIndex.second) {
+ // Use i32 to index structs, and i64 for others (pointers/arrays).
+ // This satisfies GEP constraints.
+ Type *IdxTy =
+ (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
+ : Type::getInt64Ty(F->getContext()));
+ Ops.push_back(ConstantInt::get(IdxTy, II));
+ // Keep track of the type we're currently indexing.
+ if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
+ ElTy = ElPTy->getElementType();
+ else
+ ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
+ }
+ // And create a GEP to extract those indices.
+ V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
+ V->getName() + ".idx", Call);
+ Ops.clear();
+ }
+ // Since we're replacing a load, make sure we take the alignment
+ // of the previous load.
+ LoadInst *newLoad = new LoadInst(V, V->getName() + ".val", Call);
+ newLoad->setAlignment(OrigLoad->getAlignment());
+ // Transfer the AA info too.
+ AAMDNodes AAInfo;
+ OrigLoad->getAAMetadata(AAInfo);
+ newLoad->setAAMetadata(AAInfo);
- assert(arg->hasByValAttr());
+ Args.push_back(newLoad);
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ }
- // Track all the pointers to the argument to make sure they are not captured.
- SmallPtrSet<Value *, 16> PtrValues;
- PtrValues.insert(arg);
+ // Push any varargs arguments on the list.
+ for (; AI != CS.arg_end(); ++AI, ++ArgNo) {
+ Args.push_back(*AI);
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ }
- // Track all of the stores.
- SmallVector<StoreInst *, 16> Stores;
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CS.getOperandBundlesAsDefs(OpBundles);
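+ // Carry any operand bundles over to the recreated call site below.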
- // Scan through the uses recursively to make sure the pointer is always used
- // sanely.
- SmallVector<Value *, 16> WorkList;
- WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end());
- while (!WorkList.empty()) {
- Value *V = WorkList.back();
- WorkList.pop_back();
- if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
- if (PtrValues.insert(V).second)
- WorkList.insert(WorkList.end(), V->user_begin(), V->user_end());
- } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
- Stores.push_back(Store);
- } else if (!isa<LoadInst>(V)) {
- return true;
+ CallSite NewCS;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call);
+ } else {
+ auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", Call);
+ NewCall->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ NewCS = NewCall;
}
- }
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(
+ AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
+ CallPAL.getRetAttributes(), ArgAttrVec));
+ NewCS->setDebugLoc(Call->getDebugLoc());
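+ // Preserve the total profile weight recorded on the original call.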
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
+ Args.clear();
+ ArgAttrVec.clear();
-// Check to make sure the pointers aren't captured
- for (StoreInst *Store : Stores)
- if (PtrValues.count(Store->getValueOperand()))
- return true;
+ // Update the callgraph to know that the callsite has been transformed.
+ if (ReplaceCallSite)
+ (*ReplaceCallSite)(CS, NewCS);
- return false;
-}
+ if (!Call->use_empty()) {
+ Call->replaceAllUsesWith(NewCS.getInstruction());
+ NewCS->takeName(Call);
+ }
-/// PromoteArguments - This method checks the specified function to see if there
-/// are any promotable arguments and if it is safe to promote the function (for
-/// example, all callers are direct). If safe to promote some arguments, it
-/// calls the DoPromotion method.
-///
-static CallGraphNode *
-PromoteArguments(CallGraphNode *CGN, CallGraph &CG,
- function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements) {
- Function *F = CGN->getFunction();
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ Call->eraseFromParent();
+ }
- // Make sure that it is local to this module.
- if (!F || !F->hasLocalLinkage()) return nullptr;
+ const DataLayout &DL = F->getParent()->getDataLayout();
- // Don't promote arguments for variadic functions. Adding, removing, or
- // changing non-pack parameters can change the classification of pack
- // parameters. Frontends encode that classification at the call site in the
- // IR, while in the callee the classification is determined dynamically based
- // on the number of registers consumed so far.
- if (F->isVarArg()) return nullptr;
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
- // First check: see if there are any pointer arguments! If not, quick exit.
- SmallVector<Argument*, 16> PointerArgs;
- for (Argument &I : F->args())
- if (I.getType()->isPointerTy())
- PointerArgs.push_back(&I);
- if (PointerArgs.empty()) return nullptr;
+ // Loop over the argument list, transferring uses of the old arguments over
+ // to the new arguments, transferring the names over as well.
+ //
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin();
+ I != E; ++I) {
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ // If this is an unmodified argument, move the name and users over to the
+ // new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ continue;
+ }
- // Second check: make sure that all callers are direct callers. We can't
- // transform functions that have indirect callers. Also see if the function
- // is self-recursive.
- bool isSelfRecursive = false;
- for (Use &U : F->uses()) {
- CallSite CS(U.getUser());
- // Must be a direct call.
- if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) return nullptr;
-
- if (CS.getInstruction()->getParent()->getParent() == F)
- isSelfRecursive = true;
- }
-
- const DataLayout &DL = F->getParent()->getDataLayout();
+ if (ByValArgsToTransform.count(&*I)) {
+ // In the callee, we create an alloca, and store each of the new incoming
+ // arguments into the alloca.
+ Instruction *InsertPt = &NF->begin()->front();
- AAResults &AAR = AARGetter(*F);
+ // Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
+ I->getParamAlignment(), "", InsertPt);
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
+ nullptr};
- // Check to see which arguments are promotable. If an argument is promotable,
- // add it to ArgsToPromote.
- SmallPtrSet<Argument*, 8> ArgsToPromote;
- SmallPtrSet<Argument*, 8> ByValArgsToTransform;
- for (Argument *PtrArg : PointerArgs) {
- Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
+ InsertPt);
+ I2->setName(I->getName() + "." + Twine(i));
+ new StoreInst(&*I2++, Idx, InsertPt);
+ }
- // Replace sret attribute with noalias. This reduces register pressure by
- // avoiding a register copy.
- if (PtrArg->hasStructRetAttr()) {
- unsigned ArgNo = PtrArg->getArgNo();
- F->setAttributes(
- F->getAttributes()
- .removeAttribute(F->getContext(), ArgNo + 1, Attribute::StructRet)
- .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias));
- for (Use &U : F->uses()) {
- CallSite CS(U.getUser());
- CS.setAttributes(
- CS.getAttributes()
- .removeAttribute(F->getContext(), ArgNo + 1,
- Attribute::StructRet)
- .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias));
+ // Anything that used the arg should now use the alloca.
+ I->replaceAllUsesWith(TheAlloca);
+ TheAlloca->takeName(&*I);
+
+ // If the alloca is used in a call, we must clear the tail flag since
+ // the callee now uses an alloca from the caller.
+ for (User *U : TheAlloca->users()) {
+ CallInst *Call = dyn_cast<CallInst>(U);
+ if (!Call)
+ continue;
+ Call->setTailCall(false);
}
+ continue;
}
- // If this is a byval argument, and if the aggregate type is small, just
- // pass the elements, which is always safe, if the passed value is densely
- // packed or if we can prove the padding bytes are never accessed. This does
- // not apply to inalloca.
- bool isSafeToPromote =
- PtrArg->hasByValAttr() &&
- (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
- if (isSafeToPromote) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
- DEBUG(dbgs() << "argpromotion disable promoting argument '"
- << PtrArg->getName() << "' because it would require adding more"
- << " than " << MaxElements << " arguments to the function.\n");
- continue;
- }
-
- // If all the elements are single-value types, we can promote it.
- bool AllSimple = true;
- for (const auto *EltTy : STy->elements()) {
- if (!EltTy->isSingleValueType()) {
- AllSimple = false;
- break;
- }
+ if (I->use_empty())
+ continue;
+
+ // Otherwise, if we promoted this argument, then all users are load
+ // instructions (or GEPs with only load users), and all loads should be
+ // using the new argument that we added.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+
+ while (!I->use_empty()) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
+ assert(ArgIndices.begin()->second.empty() &&
+ "Load element should sort to front!");
+ I2->setName(I->getName() + ".val");
+ LI->replaceAllUsesWith(&*I2);
+ LI->eraseFromParent();
+ DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n");
+ } else {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
+ IndicesVector Operands;
+ Operands.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Operands.size() == 1 && Operands.front() == 0)
+ Operands.clear();
+
+ Function::arg_iterator TheArg = I2;
+ for (ScalarizeTable::iterator It = ArgIndices.begin();
+ It->second != Operands; ++It, ++TheArg) {
+ assert(It != ArgIndices.end() && "GEP not handled??");
}
- // Safe to transform, don't even bother trying to "promote" it.
- // Passing the elements as a scalar will allow sroa to hack on
- // the new alloca we introduce.
- if (AllSimple) {
- ByValArgsToTransform.insert(PtrArg);
- continue;
+ std::string NewName = I->getName();
+ for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
+ NewName += "." + utostr(Operands[i]);
}
- }
- }
+ NewName += ".val";
+ TheArg->setName(NewName);
- // If the argument is a recursive type and we're in a recursive
- // function, we could end up infinitely peeling the function argument.
- if (isSelfRecursive) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- bool RecursiveType = false;
- for (const auto *EltTy : STy->elements()) {
- if (EltTy == PtrArg->getType()) {
- RecursiveType = true;
- break;
- }
+ DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n");
+
+ // All of the uses must be load instructions. Replace them all with
+ // the argument specified by ArgNo.
+ while (!GEP->use_empty()) {
+ LoadInst *L = cast<LoadInst>(GEP->user_back());
+ L->replaceAllUsesWith(&*TheArg);
+ L->eraseFromParent();
}
- if (RecursiveType)
- continue;
+ GEP->eraseFromParent();
}
}
-
- // Otherwise, see if we can promote the pointer to its value.
- if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
- MaxElements))
- ArgsToPromote.insert(PtrArg);
- }
- // No promotable pointer arguments.
- if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
- return nullptr;
+ // Increment I2 past all of the arguments added for this promoted pointer.
+ std::advance(I2, ArgIndices.size());
+ }
- return DoPromotion(F, ArgsToPromote, ByValArgsToTransform, CG);
+ return NF;
}
/// allCallersPassInValidPointerForArgument - Return true if we can prove that
/// all callers pass in a valid pointer for the specified function argument.
-static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
+static bool allCallersPassInValidPointerForArgument(Argument *Arg) {
Function *Callee = Arg->getParent();
const DataLayout &DL = Callee->getParent()->getDataLayout();
@@ -390,26 +473,25 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
/// elements in Prefix are the same as the corresponding elements in Longer.
///
/// This means it also returns true when Prefix and Longer are equal!
-static bool IsPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
+static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
if (Prefix.size() > Longer.size())
return false;
return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
}
-
/// Checks if Indices, or a prefix of Indices, is in Set.
-static bool PrefixIn(const IndicesVector &Indices,
+static bool prefixIn(const IndicesVector &Indices,
std::set<IndicesVector> &Set) {
- std::set<IndicesVector>::iterator Low;
- Low = Set.upper_bound(Indices);
- if (Low != Set.begin())
- Low--;
- // Low is now the last element smaller than or equal to Indices. This means
- // it points to a prefix of Indices (possibly Indices itself), if such
- // prefix exists.
- //
- // This load is safe if any prefix of its operands is safe to load.
- return Low != Set.end() && IsPrefix(*Low, Indices);
+ std::set<IndicesVector>::iterator Low;
+ Low = Set.upper_bound(Indices);
+ if (Low != Set.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to Indices. This means
+ // it points to a prefix of Indices (possibly Indices itself), if such
+ // prefix exists.
+ //
+ // This load is safe if any prefix of its operands is safe to load.
+ return Low != Set.end() && isPrefix(*Low, Indices);
}
/// Mark the given indices (ToMark) as safe in the given set of indices
@@ -417,7 +499,7 @@ static bool PrefixIn(const IndicesVector &Indices,
/// is already a prefix of Indices in Safe, Indices are implicitly marked safe
/// already. Furthermore, any indices that Indices is itself a prefix of are
/// removed from Safe (since they are implicitly safe because of Indices now).
-static void MarkIndicesSafe(const IndicesVector &ToMark,
+static void markIndicesSafe(const IndicesVector &ToMark,
std::set<IndicesVector> &Safe) {
std::set<IndicesVector>::iterator Low;
Low = Safe.upper_bound(ToMark);
@@ -428,7 +510,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark,
// means it points to a prefix of Indices (possibly Indices itself), if
// such prefix exists.
if (Low != Safe.end()) {
- if (IsPrefix(*Low, ToMark))
+ if (isPrefix(*Low, ToMark))
// If there is already a prefix of these indices (or exactly these
// indices) marked as safe, don't bother adding these indices.
return;
@@ -441,7 +523,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark,
++Low;
// If ToMark is a prefix of longer index list(s), remove those
std::set<IndicesVector>::iterator End = Safe.end();
- while (Low != End && IsPrefix(ToMark, *Low)) {
+ while (Low != End && isPrefix(ToMark, *Low)) {
std::set<IndicesVector>::iterator Remove = Low;
++Low;
Safe.erase(Remove);
@@ -486,7 +568,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
GEPIndicesSet ToPromote;
// If the pointer is always valid, any load with first index 0 is valid.
- if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg))
+ if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg))
SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
// First, iterate the entry block and mark loads of (geps of) arguments as
@@ -512,25 +594,26 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
return false;
// Indices checked out, mark them as safe
- MarkIndicesSafe(Indices, SafeToUnconditionallyLoad);
+ markIndicesSafe(Indices, SafeToUnconditionallyLoad);
Indices.clear();
}
} else if (V == Arg) {
// Direct loads are equivalent to a GEP with a single 0 index.
- MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+ markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
}
}
// Now, iterate all uses of the argument to see if there are any uses that are
// not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
- SmallVector<LoadInst*, 16> Loads;
+ SmallVector<LoadInst *, 16> Loads;
IndicesVector Operands;
for (Use &U : Arg->uses()) {
User *UR = U.getUser();
Operands.clear();
if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
// Don't hack volatile/atomic loads
- if (!LI->isSimple()) return false;
+ if (!LI->isSimple())
+ return false;
Loads.push_back(LI);
// Direct loads are equivalent to a GEP with a zero index and then a load.
Operands.push_back(0);
@@ -547,30 +630,31 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
}
// Ensure that all of the indices are constants.
- for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end();
- i != e; ++i)
+ for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
+ ++i)
if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
Operands.push_back(C->getSExtValue());
else
- return false; // Not a constant operand GEP!
+ return false; // Not a constant operand GEP!
// Ensure that the only users of the GEP are load instructions.
for (User *GEPU : GEP->users())
if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
// Don't hack volatile/atomic loads
- if (!LI->isSimple()) return false;
+ if (!LI->isSimple())
+ return false;
Loads.push_back(LI);
} else {
// Other uses than load?
return false;
}
} else {
- return false; // Not a load or a GEP.
+ return false; // Not a load or a GEP.
}
// Now, see if it is safe to promote this load / loads of this GEP. Loading
// is safe if Operands, or a prefix of Operands, is marked as safe.
- if (!PrefixIn(Operands, SafeToUnconditionallyLoad))
+ if (!prefixIn(Operands, SafeToUnconditionallyLoad))
return false;
// See if we are already promoting a load with these indices. If not, check
@@ -579,8 +663,10 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
if (ToPromote.find(Operands) == ToPromote.end()) {
if (MaxElements > 0 && ToPromote.size() == MaxElements) {
DEBUG(dbgs() << "argpromotion not promoting argument '"
- << Arg->getName() << "' because it would require adding more "
- << "than " << MaxElements << " arguments to the function.\n");
+ << Arg->getName()
+ << "' because it would require adding more "
+ << "than " << MaxElements
+ << " arguments to the function.\n");
// We limit aggregate promotion to only promoting up to a fixed number
// of elements of the aggregate.
return false;
@@ -589,7 +675,8 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
}
}
- if (Loads.empty()) return true; // No users, this is a dead argument.
+ if (Loads.empty())
+ return true; // No users, this is a dead argument.
// Okay, now we know that the argument is only used by load instructions and
// it is safe to unconditionally perform all of them. Use alias analysis to
@@ -598,7 +685,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
// Because there could be several/many load instructions, remember which
// blocks we know to be transparent to the load.
- df_iterator_default_set<BasicBlock*, 16> TranspBlocks;
+ df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
for (LoadInst *Load : Loads) {
// Check to see if the load is invalidated from the start of the block to
@@ -607,7 +694,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
MemoryLocation Loc = MemoryLocation::get(Load);
if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
- return false; // Pointer is invalidated!
+ return false; // Pointer is invalidated!
// Now check every path from the entry block to the load for transparency.
// To do this, we perform a depth first search on the inverse CFG from the
@@ -625,416 +712,347 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
return true;
}
-/// DoPromotion - This method actually performs the promotion of the specified
-/// arguments, and returns the new function. At this point, we know that it's
-/// safe to do so.
-static CallGraphNode *
-DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG) {
+/// \brief Checks if a type could have padding bytes.
+static bool isDenselyPacked(Type *type, const DataLayout &DL) {
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but has modified arguments.
- FunctionType *FTy = F->getFunctionType();
- std::vector<Type*> Params;
+ // There is no size information, so be conservative.
+ if (!type->isSized())
+ return false;
- typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;
+ // If the alloc size is not equal to the storage size, then there are padding
+ // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
+ if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
+ return false;
- // ScalarizedElements - If we are promoting a pointer that has elements
- // accessed out of it, keep track of which elements are accessed so that we
- // can add one argument for each.
- //
- // Arguments that are directly loaded will have a zero element value here, to
- // handle cases where there are both a direct load and GEP accesses.
- //
- std::map<Argument*, ScalarizeTable> ScalarizedElements;
+ if (!isa<CompositeType>(type))
+ return true;
- // OriginalLoads - Keep track of a representative load instruction from the
- // original function so that we can tell the alias analysis implementation
- // what the new GEP/Load instructions we are inserting look like.
- // We need to keep the original loads for each argument and the elements
- // of the argument that are accessed.
- std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads;
+ // For homogeneous sequential types, check for padding within members.
+ if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
- // Attribute - Keep track of the parameter attributes for the arguments
- // that we are *not* promoting. For the ones that we do promote, the parameter
- // attributes are lost
- SmallVector<AttributeSet, 8> AttributesVec;
- const AttributeSet &PAL = F->getAttributes();
+ // Check for padding within and between elements of a struct.
+ StructType *StructTy = cast<StructType>(type);
+ const StructLayout *Layout = DL.getStructLayout(StructTy);
+ uint64_t StartPos = 0;
+ for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
+ Type *ElTy = StructTy->getElementType(i);
+ if (!isDenselyPacked(ElTy, DL))
+ return false;
+ if (StartPos != Layout->getElementOffsetInBits(i))
+ return false;
+ StartPos += DL.getTypeAllocSizeInBits(ElTy);
+ }
- // Add any return attributes.
- if (PAL.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(F->getContext(),
- PAL.getRetAttributes()));
+ return true;
+}
- // First, determine the new argument list
- unsigned ArgIndex = 1;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++ArgIndex) {
- if (ByValArgsToTransform.count(&*I)) {
- // Simple byval argument? Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
- Params.insert(Params.end(), STy->element_begin(), STy->element_end());
- ++NumByValArgsPromoted;
- } else if (!ArgsToPromote.count(&*I)) {
- // Unchanged argument
- Params.push_back(I->getType());
- AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
- if (attrs.hasAttributes(ArgIndex)) {
- AttrBuilder B(attrs, ArgIndex);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Params.size(), B));
- }
- } else if (I->use_empty()) {
- // Dead argument (which are always marked as promotable)
- ++NumArgumentsDead;
- } else {
- // Okay, this is being promoted. This means that the only uses are loads
- // or GEPs which are only used by loads
+/// \brief Checks if the padding bytes of an argument could be accessed.
+static bool canPaddingBeAccessed(Argument *arg) {
- // In this table, we will track which indices are loaded from the argument
- // (where direct loads are tracked as no indices).
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
- for (User *U : I->users()) {
- Instruction *UI = cast<Instruction>(U);
- Type *SrcTy;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- SrcTy = L->getType();
- else
- SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
- IndicesVector Indices;
- Indices.reserve(UI->getNumOperands() - 1);
- // Since loads will only have a single operand, and GEPs only a single
- // non-index operand, this will record direct loads without any indices,
- // and gep+loads with the GEP indices.
- for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
- II != IE; ++II)
- Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
- // GEPs with a single 0 index can be merged with direct loads
- if (Indices.size() == 1 && Indices.front() == 0)
- Indices.clear();
- ArgIndices.insert(std::make_pair(SrcTy, Indices));
- LoadInst *OrigLoad;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- OrigLoad = L;
- else
- // Take any load, we will use it only to update Alias Analysis
- OrigLoad = cast<LoadInst>(UI->user_back());
- OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
- }
+ assert(arg->hasByValAttr());
- // Add a parameter to the function for each element passed in.
- for (const auto &ArgIndex : ArgIndices) {
- // not allowed to dereference ->begin() if size() is 0
- Params.push_back(GetElementPtrInst::getIndexedType(
- cast<PointerType>(I->getType()->getScalarType())->getElementType(),
- ArgIndex.second));
- assert(Params.back());
- }
+ // Track all the pointers to the argument to make sure they are not captured.
+ SmallPtrSet<Value *, 16> PtrValues;
+ PtrValues.insert(arg);
- if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
- ++NumArgumentsPromoted;
- else
- ++NumAggregatesPromoted;
+ // Track all of the stores.
+ SmallVector<StoreInst *, 16> Stores;
+
+ // Scan through the uses recursively to make sure the pointer is always used
+ // sanely.
+ SmallVector<Value *, 16> WorkList;
+ WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end());
+ while (!WorkList.empty()) {
+ Value *V = WorkList.back();
+ WorkList.pop_back();
+ if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
+ if (PtrValues.insert(V).second)
+ WorkList.insert(WorkList.end(), V->user_begin(), V->user_end());
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+ Stores.push_back(Store);
+ } else if (!isa<LoadInst>(V)) {
+ return true;
}
}
- // Add any function attributes.
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(FTy->getContext(),
- PAL.getFnAttributes()));
+ // Check to make sure the pointers aren't captured
+ for (StoreInst *Store : Stores)
+ if (PtrValues.count(Store->getValueOperand()))
+ return true;
- Type *RetTy = FTy->getReturnType();
+ return false;
+}
- // Construct the new function type using the new arguments.
- FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+/// promoteArguments - This method checks the specified function to see if
+/// there are any promotable arguments and if it is safe to promote the
+/// function (for example, all callers are direct). If it is safe to promote
+/// some arguments, it calls the doPromotion method.
+///
+static Function *
+promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements,
+ Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ ReplaceCallSite) {
+ // Make sure that it is local to this module.
+ if (!F->hasLocalLinkage())
+ return nullptr;
- // Create the new function body and insert it into the module.
- Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
- NF->copyAttributesFrom(F);
+ // Don't promote arguments for variadic functions. Adding, removing, or
+ // changing non-pack parameters can change the classification of pack
+ // parameters. Frontends encode that classification at the call site in the
+ // IR, while in the callee the classification is determined dynamically based
+ // on the number of registers consumed so far.
+ if (F->isVarArg())
+ return nullptr;
- // Patch the pointer to LLVM function in debug info descriptor.
- NF->setSubprogram(F->getSubprogram());
- F->setSubprogram(nullptr);
+ // First check: see if there are any pointer arguments! If not, quick exit.
+ SmallVector<Argument *, 16> PointerArgs;
+ for (Argument &I : F->args())
+ if (I.getType()->isPointerTy())
+ PointerArgs.push_back(&I);
+ if (PointerArgs.empty())
+ return nullptr;
- DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
- << "From: " << *F);
-
- // Recompute the parameter attributes list based on the new arguments for
- // the function.
- NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));
- AttributesVec.clear();
+ // Second check: make sure that all callers are direct callers. We can't
+ // transform functions that have indirect callers. Also see if the function
+ // is self-recursive.
+ bool isSelfRecursive = false;
+ for (Use &U : F->uses()) {
+ CallSite CS(U.getUser());
+ // Must be a direct call.
+ if (CS.getInstruction() == nullptr || !CS.isCallee(&U))
+ return nullptr;
- F->getParent()->getFunctionList().insert(F->getIterator(), NF);
- NF->takeName(F);
+ if (CS.getInstruction()->getParent()->getParent() == F)
+ isSelfRecursive = true;
+ }
- // Get a new callgraph node for NF.
- CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
+ const DataLayout &DL = F->getParent()->getDataLayout();
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in the loaded pointers.
- //
- SmallVector<Value*, 16> Args;
- while (!F->use_empty()) {
- CallSite CS(F->user_back());
- assert(CS.getCalledFunction() == F);
- Instruction *Call = CS.getInstruction();
- const AttributeSet &CallPAL = CS.getAttributes();
+ AAResults &AAR = AARGetter(*F);
- // Add any return attributes.
- if (CallPAL.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(F->getContext(),
- CallPAL.getRetAttributes()));
+ // Check to see which arguments are promotable. If an argument is promotable,
+ // add it to ArgsToPromote.
+ SmallPtrSet<Argument *, 8> ArgsToPromote;
+ SmallPtrSet<Argument *, 8> ByValArgsToTransform;
+ for (Argument *PtrArg : PointerArgs) {
+ Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
- // Loop over the operands, inserting GEP and loads in the caller as
- // appropriate.
- CallSite::arg_iterator AI = CS.arg_begin();
- ArgIndex = 1;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++AI, ++ArgIndex)
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- Args.push_back(*AI); // Unmodified argument
+ // Replace sret attribute with noalias. This reduces register pressure by
+ // avoiding a register copy.
+ if (PtrArg->hasStructRetAttr()) {
+ unsigned ArgNo = PtrArg->getArgNo();
+ F->removeParamAttr(ArgNo, Attribute::StructRet);
+ F->addParamAttr(ArgNo, Attribute::NoAlias);
+ for (Use &U : F->uses()) {
+ CallSite CS(U.getUser());
+ CS.removeParamAttr(ArgNo, Attribute::StructRet);
+ CS.addParamAttr(ArgNo, Attribute::NoAlias);
+ }
+ }
- if (CallPAL.hasAttributes(ArgIndex)) {
- AttrBuilder B(CallPAL, ArgIndex);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
- }
- } else if (ByValArgsToTransform.count(&*I)) {
- // Emit a GEP and load for each element of the struct.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {
- ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr };
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(
- STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);
- // TODO: Tell AA about the new values?
- Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call));
+ // If this is a byval argument, and if the aggregate type is small, just
+ // pass the elements, which is always safe if the passed value is densely
+ // packed or if we can prove the padding bytes are never accessed. This
+ // does not apply to inalloca.
+ bool isSafeToPromote =
+ PtrArg->hasByValAttr() &&
+ (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
+ if (isSafeToPromote) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
+ DEBUG(dbgs() << "argpromotion disable promoting argument '"
+ << PtrArg->getName()
+ << "' because it would require adding more"
+ << " than " << MaxElements
+ << " arguments to the function.\n");
+ continue;
}
- } else if (!I->use_empty()) {
- // Non-dead argument: insert GEPs and loads as appropriate.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
- // Store the Value* version of the indices in here, but declare it now
- // for reuse.
- std::vector<Value*> Ops;
- for (const auto &ArgIndex : ArgIndices) {
- Value *V = *AI;
- LoadInst *OrigLoad =
- OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
- if (!ArgIndex.second.empty()) {
- Ops.reserve(ArgIndex.second.size());
- Type *ElTy = V->getType();
- for (unsigned long II : ArgIndex.second) {
- // Use i32 to index structs, and i64 for others (pointers/arrays).
- // This satisfies GEP constraints.
- Type *IdxTy = (ElTy->isStructTy() ?
- Type::getInt32Ty(F->getContext()) :
- Type::getInt64Ty(F->getContext()));
- Ops.push_back(ConstantInt::get(IdxTy, II));
- // Keep track of the type we're currently indexing.
- if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
- ElTy = ElPTy->getElementType();
- else
- ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
- }
- // And create a GEP to extract those indices.
- V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
- V->getName() + ".idx", Call);
- Ops.clear();
+
+ // If all the elements are single-value types, we can promote it.
+ bool AllSimple = true;
+ for (const auto *EltTy : STy->elements()) {
+ if (!EltTy->isSingleValueType()) {
+ AllSimple = false;
+ break;
}
- // Since we're replacing a load make sure we take the alignment
- // of the previous load.
- LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call);
- newLoad->setAlignment(OrigLoad->getAlignment());
- // Transfer the AA info too.
- AAMDNodes AAInfo;
- OrigLoad->getAAMetadata(AAInfo);
- newLoad->setAAMetadata(AAInfo);
+ }
- Args.push_back(newLoad);
+ // Safe to transform, don't even bother trying to "promote" it.
+ // Passing the elements as a scalar will allow sroa to hack on
+ // the new alloca we introduce.
+ if (AllSimple) {
+ ByValArgsToTransform.insert(PtrArg);
+ continue;
}
}
+ }
- // Push any varargs arguments on the list.
- for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
- Args.push_back(*AI);
- if (CallPAL.hasAttributes(ArgIndex)) {
- AttrBuilder B(CallPAL, ArgIndex);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ // If the argument is a recursive type and we're in a recursive
+ // function, we could end up infinitely peeling the function argument.
+ if (isSelfRecursive) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ bool RecursiveType = false;
+ for (const auto *EltTy : STy->elements()) {
+ if (EltTy == PtrArg->getType()) {
+ RecursiveType = true;
+ break;
+ }
+ }
+ if (RecursiveType)
+ continue;
}
}
- // Add any function attributes.
- if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(Call->getContext(),
- CallPAL.getFnAttributes()));
+ // Otherwise, see if we can promote the pointer to its value.
+ if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
+ MaxElements))
+ ArgsToPromote.insert(PtrArg);
+ }
- SmallVector<OperandBundleDef, 1> OpBundles;
- CS.getOperandBundlesAsDefs(OpBundles);
+ // No promotable pointer arguments.
+ if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
+ return nullptr;
- Instruction *New;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call);
- cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(),
- AttributesVec));
- } else {
- New = CallInst::Create(NF, Args, OpBundles, "", Call);
- cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(),
- AttributesVec));
- cast<CallInst>(New)->setTailCallKind(
- cast<CallInst>(Call)->getTailCallKind());
- }
- New->setDebugLoc(Call->getDebugLoc());
- Args.clear();
- AttributesVec.clear();
+ return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+}
- // Update the callgraph to know that the callsite has been transformed.
- CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];
- CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN);
+PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ bool Changed = false, LocalChange;
- if (!Call->use_empty()) {
- Call->replaceAllUsesWith(New);
- New->takeName(Call);
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+
+ for (LazyCallGraph::Node &N : C) {
+ Function &OldF = N.getFunction();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ // FIXME: This lambda must only be used with this function. We should
+ // skip the lambda and just get the AA results directly.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ assert(&F == &OldF && "Called with an unexpected function!");
+ return FAM.getResult<AAManager>(F);
+ };
+
+ Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None);
+ if (!NewF)
+ continue;
+ LocalChange = true;
+
+ // Directly substitute the functions in the call graph. Note that this
+ // requires the old function to be completely dead and completely
+ // replaced by the new function. It does no call graph updates; it merely
+ // swaps out the particular function mapped to a particular node in the
+ // graph.
+ C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
+ OldF.eraseFromParent();
}
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- Call->eraseFromParent();
- }
+ Changed |= LocalChange;
+ } while (LocalChange);
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+ if (!Changed)
+ return PreservedAnalyses::all();
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments, also transferring over the names as well.
- //
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
- I2 = NF->arg_begin(); I != E; ++I) {
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- // If this is an unmodified argument, move the name and users over to the
- // new version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- ++I2;
- continue;
- }
-
- if (ByValArgsToTransform.count(&*I)) {
- // In the callee, we create an alloca, and store each of the new incoming
- // arguments into the alloca.
- Instruction *InsertPt = &NF->begin()->front();
+ return PreservedAnalyses::none();
+}
- // Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- Value *TheAlloca = new AllocaInst(AgTy, nullptr, "", InsertPt);
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {
- ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr };
+namespace {
+/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+///
+struct ArgPromotion : public CallGraphSCCPass {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(
- AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
- InsertPt);
- I2->setName(I->getName()+"."+Twine(i));
- new StoreInst(&*I2++, Idx, InsertPt);
- }
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ static char ID; // Pass identification, replacement for typeid
+ explicit ArgPromotion(unsigned MaxElements = 3)
+ : CallGraphSCCPass(ID), MaxElements(MaxElements) {
+ initializeArgPromotionPass(*PassRegistry::getPassRegistry());
+ }
- // Anything that used the arg should now use the alloca.
- I->replaceAllUsesWith(TheAlloca);
- TheAlloca->takeName(&*I);
+private:
+ using llvm::Pass::doInitialization;
+ bool doInitialization(CallGraph &CG) override;
+ /// The maximum number of elements to expand, or 0 for unlimited.
+ unsigned MaxElements;
+};
+}
- // If the alloca is used in a call, we must clear the tail flag since
- // the callee now uses an alloca from the caller.
- for (User *U : TheAlloca->users()) {
- CallInst *Call = dyn_cast<CallInst>(U);
- if (!Call)
- continue;
- Call->setTailCall(false);
- }
- continue;
- }
+char ArgPromotion::ID = 0;
+INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false, false)
- if (I->use_empty())
- continue;
+Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
+ return new ArgPromotion(MaxElements);
+}
- // Otherwise, if we promoted this argument, then all users are load
- // instructions (or GEPs with only load users), and all loads should be
- // using the new argument that we added.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
- while (!I->use_empty()) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
- assert(ArgIndices.begin()->second.empty() &&
- "Load element should sort to front!");
- I2->setName(I->getName()+".val");
- LI->replaceAllUsesWith(&*I2);
- LI->eraseFromParent();
- DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
- << "' in function '" << F->getName() << "'\n");
- } else {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
- IndicesVector Operands;
- Operands.reserve(GEP->getNumIndices());
- for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
- II != IE; ++II)
- Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // Get the callgraph information that we need to update to reflect our
+ // changes.
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- // GEPs with a single 0 index can be merged with direct loads
- if (Operands.size() == 1 && Operands.front() == 0)
- Operands.clear();
+ LegacyAARGetter AARGetter(*this);
- Function::arg_iterator TheArg = I2;
- for (ScalarizeTable::iterator It = ArgIndices.begin();
- It->second != Operands; ++It, ++TheArg) {
- assert(It != ArgIndices.end() && "GEP not handled??");
- }
+ bool Changed = false, LocalChange;
- std::string NewName = I->getName();
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
- NewName += "." + utostr(Operands[i]);
- }
- NewName += ".val";
- TheArg->setName(NewName);
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+ // Attempt to promote arguments from all functions in this SCC.
+ for (CallGraphNode *OldNode : SCC) {
+ Function *OldF = OldNode->getFunction();
+ if (!OldF)
+ continue;
+
+ auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) {
+ Function *Caller = OldCS.getInstruction()->getParent()->getParent();
+ CallGraphNode *NewCalleeNode =
+ CG.getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = CG[Caller];
+ CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
+ };
+
+ if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
+ {ReplaceCallSite})) {
+ LocalChange = true;
- DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
- << "' of function '" << NF->getName() << "'\n");
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
+ NewNode->stealCalledFunctionsFrom(OldNode);
+ if (OldNode->getNumReferences() == 0)
+ delete CG.removeFunctionFromModule(OldNode);
+ else
+ OldF->setLinkage(Function::ExternalLinkage);
- // All of the uses must be load instructions. Replace them all with
- // the argument specified by ArgNo.
- while (!GEP->use_empty()) {
- LoadInst *L = cast<LoadInst>(GEP->user_back());
- L->replaceAllUsesWith(&*TheArg);
- L->eraseFromParent();
- }
- GEP->eraseFromParent();
+ // And update the SCC we're iterating as well.
+ SCC.ReplaceNode(OldNode, NewNode);
}
}
+ // Remember that we changed something.
+ Changed |= LocalChange;
+ } while (LocalChange);
- // Increment I2 past all of the arguments added for this promoted pointer.
- std::advance(I2, ArgIndices.size());
- }
-
- NF_CGN->stealCalledFunctionsFrom(CG[F]);
-
- // Now that the old function is dead, delete it. If there is a dangling
- // reference to the CallgraphNode, just leave the dead function around for
- // someone else to nuke.
- CallGraphNode *CGN = CG[F];
- if (CGN->getNumReferences() == 0)
- delete CG.removeFunctionFromModule(CGN);
- else
- F->setLinkage(Function::ExternalLinkage);
-
- return NF_CGN;
+ return Changed;
}
bool ArgPromotion::doInitialization(CallGraph &CG) {
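The ArgumentPromotion hunks above gate byval expansion on three conditions: the aggregate is densely packed (or its padding provably unread), every element is a single-value type, and the element count respects MaxElements. A stand-alone sketch of that gate, assuming hypothetical Field descriptors in place of LLVM's StructType and DataLayout queries:

    #include <cstddef>
    #include <vector>

    // Hypothetical stand-in for StructType/DataLayout information.
    struct Field { size_t Size; bool SingleValue; };

    // Mirrors the promotion gate above: all elements simple, no padding,
    // and not more pieces than the MaxElements budget (0 = unlimited).
    bool mayExpandByVal(const std::vector<Field> &Fields, size_t AllocSize,
                        size_t MaxElements) {
      size_t Sum = 0;
      for (const Field &F : Fields) {
        if (!F.SingleValue)
          return false;          // aggregate element: leave it for SROA
        Sum += F.Size;
      }
      if (Sum != AllocSize)
        return false;            // padding might be accessed via the pointer
      return MaxElements == 0 || Fields.size() <= MaxElements;
    }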
diff --git a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index d75ed20..62b5a9c 100644
--- a/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -60,6 +60,23 @@ static bool IsBetterCanonical(const GlobalVariable &A,
return A.hasGlobalUnnamedAddr();
}
+static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ GV->getAllMetadata(MDs);
+ for (const auto &V : MDs)
+ if (V.first != LLVMContext::MD_dbg)
+ return true;
+ return false;
+}
+
+static void copyDebugLocMetadata(const GlobalVariable *From,
+ GlobalVariable *To) {
+ SmallVector<DIGlobalVariableExpression *, 1> MDs;
+ From->getDebugInfo(MDs);
+ for (auto MD : MDs)
+ To->addDebugInfo(MD);
+}
+
static unsigned getAlignment(GlobalVariable *GV) {
unsigned Align = GV->getAlignment();
if (Align)
@@ -113,6 +130,10 @@ static bool mergeConstants(Module &M) {
if (GV->isWeakForLinker())
continue;
+ // Don't touch globals with metadata other than !dbg.
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
Constant *Init = GV->getInitializer();
// Check to see if the initializer is already known.
@@ -155,6 +176,9 @@ static bool mergeConstants(Module &M) {
if (!Slot->hasGlobalUnnamedAddr() && !GV->hasGlobalUnnamedAddr())
continue;
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
if (!GV->hasGlobalUnnamedAddr())
Slot->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
@@ -178,6 +202,8 @@ static bool mergeConstants(Module &M) {
getAlignment(Replacements[i].second)));
}
+ copyDebugLocMetadata(Replacements[i].first, Replacements[i].second);
+
// Eliminate any uses of the dead global.
Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
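The ConstantMerge hunks above add two metadata rules: a global carrying any metadata other than !dbg is never merged, and when a merge happens the !dbg attachments of the replaced global move to the surviving one before replaceAllUsesWith. A self-contained sketch over hypothetical (kind, node) pairs:

    #include <utility>
    #include <vector>

    // Hypothetical metadata kinds; only the debug kind is merge-safe.
    enum MDKind : unsigned { MD_dbg = 0 /* all other kinds are > 0 */ };
    using MDPair = std::pair<unsigned, int>; // (kind, attachment id)

    bool hasNonDebugMD(const std::vector<MDPair> &MDs) {
      for (const MDPair &MD : MDs)
        if (MD.first != MD_dbg)
          return true;           // unknown semantics: refuse to merge
      return false;
    }

    // Copy debug attachments so the surviving canonical global describes
    // both source-level variables after the merge.
    void copyDebugMD(const std::vector<MDPair> &From,
                     std::vector<MDPair> &To) {
      for (const MDPair &MD : From)
        if (MD.first == MD_dbg)
          To.push_back(MD);
    }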
diff --git a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
index ba2e60d..d94aa5d 100644
--- a/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -95,11 +95,25 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
}
}
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto Func : CfiFunctionsMD->operands()) {
+ assert(Func->getNumOperands() >= 2);
+ for (unsigned I = 2; I < Func->getNumOperands(); ++I)
+ if (ConstantInt *TypeId =
+ extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+ }
+
LLVMContext &Ctx = M.getContext();
Constant *C = M.getOrInsertFunction(
"__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
- Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx), nullptr);
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
Function *F = dyn_cast<Function>(C);
+ // Take over the existing function. The frontend emits a weak stub so that the
+ // linker knows about the symbol; this pass replaces the function body.
+ F->deleteBody();
F->setAlignment(4096);
auto args = F->arg_begin();
Value &CallSiteTypeId = *(args++);
@@ -117,7 +131,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
IRBuilder<> IRBFail(TrapBB);
Constant *CFICheckFailFn = M.getOrInsertFunction(
"__cfi_check_fail", Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx),
- Type::getInt8PtrTy(Ctx), nullptr);
+ Type::getInt8PtrTy(Ctx));
IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
IRBFail.CreateBr(ExitBB);
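The CrossDSOCFI hunk above additionally harvests type ids from the cfi.functions named metadata, where each operand describes one function and its type ids start at operand index 2. A stand-alone sketch of that scan over a hypothetical nested-list encoding:

    #include <cstdint>
    #include <optional>
    #include <set>
    #include <vector>

    // Hypothetical shape of one cfi.functions entry: [name, kind, typeid...],
    // where extracting a numeric type id may fail (hence optional).
    using FuncEntry = std::vector<std::optional<uint64_t>>;

    std::set<uint64_t> collectTypeIds(const std::vector<FuncEntry> &Funcs) {
      std::set<uint64_t> TypeIds;
      for (const FuncEntry &Func : Funcs)
        for (size_t I = 2; I < Func.size(); ++I) // skip two leading slots
          if (Func[I])
            TypeIds.insert(*Func[I]);
      return TypeIds;
    }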
diff --git a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 1a5ed46..8e26849 100644
--- a/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -166,41 +166,40 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
// Drop any attributes that were on the vararg arguments.
- AttributeSet PAL = CS.getAttributes();
- if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) {
- SmallVector<AttributeSet, 8> AttributesVec;
- for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i)
- AttributesVec.push_back(PAL.getSlotAttributes(i));
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(Fn.getContext(),
- PAL.getFnAttributes()));
- PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
+ AttributeList PAL = CS.getAttributes();
+ if (!PAL.isEmpty()) {
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
+ ArgAttrs.push_back(PAL.getParamAttributes(ArgNo));
+ PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrs);
}
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
- Instruction *New;
+ CallSite NewCS;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call);
- cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(PAL);
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call);
} else {
- New = CallInst::Create(NF, Args, OpBundles, "", Call);
- cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(PAL);
- cast<CallInst>(New)->setTailCallKind(
- cast<CallInst>(Call)->getTailCallKind());
+ NewCS = CallInst::Create(NF, Args, OpBundles, "", Call);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
}
- New->setDebugLoc(Call->getDebugLoc());
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(PAL);
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
Args.clear();
if (!Call->use_empty())
- Call->replaceAllUsesWith(New);
+ Call->replaceAllUsesWith(NewCS.getInstruction());
- New->takeName(Call);
+ NewCS->takeName(Call);
// Finally, remove the old call from the program, reducing the use-count of
// F.
@@ -681,8 +680,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
bool HasLiveReturnedArg = false;
// Set up to build a new list of parameter attributes.
- SmallVector<AttributeSet, 8> AttributesVec;
- const AttributeSet &PAL = F->getAttributes();
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ const AttributeList &PAL = F->getAttributes();
// Remember which arguments are still alive.
SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
@@ -696,16 +695,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
if (LiveValues.erase(Arg)) {
Params.push_back(I->getType());
ArgAlive[i] = true;
-
- // Get the original parameter attributes (skipping the first one, that is
- // for the return value.
- if (PAL.hasAttributes(i + 1)) {
- AttrBuilder B(PAL, i + 1);
- if (B.contains(Attribute::Returned))
- HasLiveReturnedArg = true;
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Params.size(), B));
- }
+ ArgAttrVec.push_back(PAL.getParamAttributes(i));
+ HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned);
} else {
++NumArgumentsEliminated;
DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i
@@ -779,30 +770,24 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
assert(NRetTy && "No new return type found?");
// The existing function return attributes.
- AttributeSet RAttrs = PAL.getRetAttributes();
+ AttrBuilder RAttrs(PAL.getRetAttributes());
// Remove any incompatible attributes, but only if we removed all return
// values. Otherwise, ensure that we don't have any conflicting attributes
// here. Currently, this should not be possible, but special handling might be
// required when new return value attributes are added.
if (NRetTy->isVoidTy())
- RAttrs = RAttrs.removeAttributes(NRetTy->getContext(),
- AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NRetTy));
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
else
- assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
- overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
+ assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
"Return attributes no longer compatible?");
- if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));
-
- if (PAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(F->getContext(),
- PAL.getFnAttributes()));
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
// Reconstruct the AttributesList based on the vector we constructed.
- AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec);
+ assert(ArgAttrVec.size() == Params.size());
+ AttributeList NewPAL = AttributeList::get(
+ F->getContext(), PAL.getFnAttributes(), RetAttrs, ArgAttrVec);
// Create the new function type based on the recomputed parameters.
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -829,18 +814,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
CallSite CS(F->user_back());
Instruction *Call = CS.getInstruction();
- AttributesVec.clear();
- const AttributeSet &CallPAL = CS.getAttributes();
-
- // The call return attributes.
- AttributeSet RAttrs = CallPAL.getRetAttributes();
+ ArgAttrVec.clear();
+ const AttributeList &CallPAL = CS.getAttributes();
- // Adjust in case the function was changed to return void.
- RAttrs = RAttrs.removeAttributes(NRetTy->getContext(),
- AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NF->getReturnType()));
- if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
- AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs));
+ // Adjust the call return attributes in case the function was changed to
+ // return void.
+ AttrBuilder RAttrs(CallPAL.getRetAttributes());
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
// Declare these outside of the loops, so we can reuse them for the second
// loop, which loops the varargs.
@@ -852,57 +833,55 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
if (ArgAlive[i]) {
Args.push_back(*I);
// Get original parameter attributes, but skip return attributes.
- if (CallPAL.hasAttributes(i + 1)) {
- AttrBuilder B(CallPAL, i + 1);
+ AttributeSet Attrs = CallPAL.getParamAttributes(i);
+ if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
// If the return type has changed, then get rid of 'returned' on the
// call site. The alternative is to make all 'returned' attributes on
// call sites keep the return value alive just like 'returned'
- // attributes on function declaration but it's less clearly a win
- // and this is not an expected case anyway
- if (NRetTy != RetTy && B.contains(Attribute::Returned))
- B.removeAttribute(Attribute::Returned);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+ // attributes on function declaration but it's less clearly a win and
+ // this is not an expected case anyway
+ ArgAttrVec.push_back(AttributeSet::get(
+ F->getContext(),
+ AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+ } else {
+ // Otherwise, use the original attributes.
+ ArgAttrVec.push_back(Attrs);
}
}
// Push any varargs arguments on the list. Don't forget their attributes.
for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
Args.push_back(*I);
- if (CallPAL.hasAttributes(i + 1)) {
- AttrBuilder B(CallPAL, i + 1);
- AttributesVec.
- push_back(AttributeSet::get(F->getContext(), Args.size(), B));
- }
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(i));
}
- if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
- AttributesVec.push_back(AttributeSet::get(Call->getContext(),
- CallPAL.getFnAttributes()));
-
// Reconstruct the AttributesList based on the vector we constructed.
- AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
+ assert(ArgAttrVec.size() == Args.size());
+ AttributeList NewCallPAL = AttributeList::get(
+ F->getContext(), CallPAL.getFnAttributes(), RetAttrs, ArgAttrVec);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
- Instruction *New;
+ CallSite NewCS;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call->getParent());
- cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(NewCallPAL);
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", Call->getParent());
} else {
- New = CallInst::Create(NF, Args, OpBundles, "", Call);
- cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(NewCallPAL);
- cast<CallInst>(New)->setTailCallKind(
- cast<CallInst>(Call)->getTailCallKind());
+ NewCS = CallInst::Create(NF, Args, OpBundles, "", Call);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
}
- New->setDebugLoc(Call->getDebugLoc());
-
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(NewCallPAL);
+ NewCS->setDebugLoc(Call->getDebugLoc());
+ uint64_t W;
+ if (Call->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
Args.clear();
+ ArgAttrVec.clear();
+ Instruction *New = NewCS.getInstruction();
if (!Call->use_empty()) {
if (New->getType() == Call->getType()) {
// Return type not changed? Just replace users then.
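A recurring pattern in the DeadArgumentElimination hunks above is rebuilding an AttributeList from its three parts (function attributes, return attributes, and one set per surviving parameter), keeping the per-argument vector in lockstep with the argument list. A minimal stand-in with strings for attribute sets:

    #include <cassert>
    #include <string>
    #include <vector>

    // Hypothetical flattening of AttributeList: function- and return-level
    // sets plus exactly one attribute set per formal parameter.
    struct AttrList {
      std::vector<std::string> Fn, Ret;
      std::vector<std::vector<std::string>> Args;
    };

    // Keep Fn/Ret intact; copy only the sets of arguments that stayed
    // alive, mirroring the ArgAttrVec/ArgAlive bookkeeping above.
    AttrList dropDeadArgAttrs(const AttrList &PAL,
                              const std::vector<bool> &Alive) {
      assert(PAL.Args.size() == Alive.size());
      AttrList New{PAL.Fn, PAL.Ret, {}};
      for (size_t I = 0; I != Alive.size(); ++I)
        if (Alive[I])
          New.Args.push_back(PAL.Args[I]);
      return New;
    }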
diff --git a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
index 98c4b17..ecff88c 100644
--- a/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -17,9 +17,9 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Pass.h"
using namespace llvm;
#define DEBUG_TYPE "elim-avail-extern"
diff --git a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
index 479fd18..d1147f7 100644
--- a/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -11,13 +11,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
#include <algorithm>
using namespace llvm;
@@ -53,18 +53,18 @@ static void makeVisible(GlobalValue &GV, bool Delete) {
}
namespace {
- /// @brief A pass to extract specific functions and their dependencies.
+ /// @brief A pass to extract specific global values and their dependencies.
class GVExtractorPass : public ModulePass {
SetVector<GlobalValue *> Named;
bool deleteStuff;
public:
static char ID; // Pass identification, replacement for typeid
- /// FunctionExtractorPass - If deleteFn is true, this pass deletes as the
- /// specified function. Otherwise, it deletes as much of the module as
- /// possible, except for the function specified.
- ///
- explicit GVExtractorPass(std::vector<GlobalValue*>& GVs, bool deleteS = true)
+ /// If deleteS is true, this pass deletes the specified global values.
+ /// Otherwise, it deletes as much of the module as possible, except for the
+ /// global values specified.
+ explicit GVExtractorPass(std::vector<GlobalValue*> &GVs,
+ bool deleteS = true)
: ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {}
bool runOnModule(Module &M) override {
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 402a665..813a4b6 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -14,7 +14,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
@@ -34,7 +33,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
using namespace llvm;
#define DEBUG_TYPE "functionattrs"
@@ -49,31 +48,35 @@ STATISTIC(NumNoAlias, "Number of function returns marked noalias");
STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
-namespace {
-typedef SmallSetVector<Function *, 8> SCCNodeSet;
-}
+// FIXME: This is disabled by default to avoid exposing security vulnerabilities
+// in C/C++ code compiled by clang:
+// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html
+static cl::opt<bool> EnableNonnullArgPropagation(
+ "enable-nonnull-arg-prop", cl::Hidden,
+ cl::desc("Try to propagate nonnull argument attributes from callsites to "
+ "caller functions."));
namespace {
-/// The three kinds of memory access relevant to 'readonly' and
-/// 'readnone' attributes.
-enum MemoryAccessKind {
- MAK_ReadNone = 0,
- MAK_ReadOnly = 1,
- MAK_MayWrite = 2
-};
+typedef SmallSetVector<Function *, 8> SCCNodeSet;
}
-static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
+/// Returns the memory access attribute for function F using AAR for AA results,
+/// where SCCNodes is the current SCC.
+///
+/// If ThisBody is true, this function may examine the function body and will
+/// return a result pertaining to this copy of the function. If it is false, the
+/// result will be based only on AA results for the function declaration; it
+/// will be assumed that some other (perhaps less optimized) version of the
+/// function may be selected at link time.
+static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
+ AAResults &AAR,
const SCCNodeSet &SCCNodes) {
FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
if (MRB == FMRB_DoesNotAccessMemory)
// Already perfect!
return MAK_ReadNone;
- // Non-exact function definitions may not be selected at link time, and an
- // alternative version that writes to memory may be selected. See the comment
- // on GlobalValue::isDefinitionExact for more details.
- if (!F.hasExactDefinition()) {
+ if (!ThisBody) {
if (AliasAnalysis::onlyReadsMemory(MRB))
return MAK_ReadOnly;
@@ -172,9 +175,14 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
}
+MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
+ AAResults &AAR) {
+ return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
+}
+
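The ThisBody split above separates two questions: what this exact body does, and what any link-time replacement might do. When the SCC-wide readonly/readnone deduction runs, one writer poisons the whole component; a toy version of that meet over a three-point lattice:

    #include <algorithm>
    #include <vector>

    // Mirrors the MAK_ReadNone/MAK_ReadOnly/MAK_MayWrite ordering above.
    enum MemAccess { ReadNone = 0, ReadOnly = 1, MayWrite = 2 };

    // The SCC-wide access kind is the worst over its members: a single
    // writer forbids readonly/readnone for the entire component.
    MemAccess sccMemoryAccess(const std::vector<MemAccess> &Members) {
      MemAccess Worst = ReadNone;
      for (MemAccess M : Members)
        Worst = std::max(Worst, M);
      return Worst;
    }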
/// Deduce readonly/readnone attributes for the SCC.
template <typename AARGetterT>
-static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
+static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
// Check if any of the functions in the SCC read or write memory. If they
// write memory then they can't be marked readnone or readonly.
bool ReadsMemory = false;
@@ -182,7 +190,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
// Call the callable parameter to look up AA results for this function.
AAResults &AAR = AARGetter(*F);
- switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) {
+ // Non-exact function definitions may not be selected at link time, and an
+ // alternative version that writes to memory may be selected. See the
+ // comment on GlobalValue::isDefinitionExact for more details.
+ switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
+ AAR, SCCNodes)) {
case MAK_MayWrite:
return false;
case MAK_ReadOnly:
@@ -209,15 +221,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
MadeChange = true;
// Clear out any existing attributes.
- AttrBuilder B;
- B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
- F->removeAttributes(
- AttributeSet::FunctionIndex,
- AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B));
+ F->removeFnAttr(Attribute::ReadOnly);
+ F->removeFnAttr(Attribute::ReadNone);
// Add in the new attribute.
- F->addAttribute(AttributeSet::FunctionIndex,
- ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
+ F->addFnAttr(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
if (ReadsMemory)
++NumReadOnly;
@@ -482,9 +490,6 @@ determinePointerReadAttrs(Argument *A,
static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
bool Changed = false;
- AttrBuilder B;
- B.addAttribute(Attribute::Returned);
-
// Check each function in turn, determining if an argument is always returned.
for (Function *F : SCCNodes) {
// We can infer and propagate function attributes only when we know that the
@@ -522,7 +527,7 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
if (Value *RetArg = FindRetArg()) {
auto *A = cast<Argument>(RetArg);
- A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(Attribute::Returned);
++NumReturned;
Changed = true;
}
@@ -531,15 +536,55 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
return Changed;
}
+/// If a callsite has arguments that are also arguments to the parent function,
+/// try to propagate attributes from the callsite's arguments to the parent's
+/// arguments. This may be important because inlining can cause information loss
+/// when attribute knowledge disappears with the inlined call.
+static bool addArgumentAttrsFromCallsites(Function &F) {
+ if (!EnableNonnullArgPropagation)
+ return false;
+
+ bool Changed = false;
+
+ // For an argument attribute to transfer from a callsite to the parent, the
+ // call must be guaranteed to execute every time the parent is called.
+ // Conservatively, just check for calls in the entry block that are guaranteed
+ // to execute.
+ // TODO: This could be enhanced by testing if the callsite post-dominates the
+ // entry block or by doing simple forward walks or backward walks to the
+ // callsite.
+ BasicBlock &Entry = F.getEntryBlock();
+ for (Instruction &I : Entry) {
+ if (auto CS = CallSite(&I)) {
+ if (auto *CalledFunc = CS.getCalledFunction()) {
+ for (auto &CSArg : CalledFunc->args()) {
+ if (!CSArg.hasNonNullAttr())
+ continue;
+
+ // If the non-null callsite argument operand is an argument to 'F'
+ // (the caller) and the call is guaranteed to execute, then the value
+ // must be non-null throughout 'F'.
+ auto *FArg = dyn_cast<Argument>(CS.getArgOperand(CSArg.getArgNo()));
+ if (FArg && !FArg->hasNonNullAttr()) {
+ FArg->addAttr(Attribute::NonNull);
+ Changed = true;
+ }
+ }
+ }
+ }
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ }
+
+ return Changed;
+}
+
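addArgumentAttrsFromCallsites above leans on one guarantee: an entry-block instruction executes on every call to the function, but only up to the first instruction that might not transfer execution to its successor. A stand-alone sketch of that early-exit scan with a hypothetical instruction model:

    #include <vector>

    // Hypothetical model: is this a call site, and is control guaranteed
    // to reach the next instruction (no throw, trap, or infinite loop)?
    struct Inst { bool IsCallSite; bool TransfersToSuccessor; };

    // Visit only call sites guaranteed to execute whenever the function
    // does; stop at the first instruction without that guarantee.
    template <typename Visitor>
    void scanGuaranteedEntryCalls(const std::vector<Inst> &Entry,
                                  Visitor Visit) {
      for (const Inst &I : Entry) {
        if (I.IsCallSite)
          Visit(I);
        if (!I.TransfersToSuccessor)
          break;
      }
    }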
/// Deduce nocapture attributes for the SCC.
static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
bool Changed = false;
ArgumentGraph AG;
- AttrBuilder B;
- B.addAttribute(Attribute::NoCapture);
-
// Check each function in turn, determining which pointer arguments are not
// captured.
for (Function *F : SCCNodes) {
@@ -549,6 +594,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (!F->hasExactDefinition())
continue;
+ Changed |= addArgumentAttrsFromCallsites(*F);
+
// Functions that are readonly (or readnone) and nounwind and don't return
// a value can't capture arguments. Don't analyze them.
if (F->onlyReadsMemory() && F->doesNotThrow() &&
@@ -556,7 +603,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
++A) {
if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
- A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(Attribute::NoCapture);
++NumNoCapture;
Changed = true;
}
@@ -575,8 +622,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (!Tracker.Captured) {
if (Tracker.Uses.empty()) {
// If it's trivially not captured, mark it nocapture now.
- A->addAttr(
- AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(Attribute::NoCapture);
++NumNoCapture;
Changed = true;
} else {
@@ -602,9 +648,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
Self.insert(&*A);
Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
if (R != Attribute::None) {
- AttrBuilder B;
- B.addAttribute(R);
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(R);
Changed = true;
R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
}
@@ -629,7 +673,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
if (ArgumentSCC[0]->Uses.size() == 1 &&
ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
Argument *A = ArgumentSCC[0]->Definition;
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(Attribute::NoCapture);
++NumNoCapture;
Changed = true;
}
@@ -671,7 +715,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->addAttr(Attribute::NoCapture);
++NumNoCapture;
Changed = true;
}
@@ -702,14 +746,12 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
}
if (ReadAttr != Attribute::None) {
- AttrBuilder B, R;
- B.addAttribute(ReadAttr);
- R.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
// Clear out existing readonly/readnone attributes
- A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, R));
- A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+ A->removeAttr(Attribute::ReadOnly);
+ A->removeAttr(Attribute::ReadNone);
+ A->addAttr(ReadAttr);
ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
Changed = true;
}
@@ -769,7 +811,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
case Instruction::Call:
case Instruction::Invoke: {
CallSite CS(RVI);
- if (CS.paramHasAttr(0, Attribute::NoAlias))
+ if (CS.hasRetAttr(Attribute::NoAlias))
break;
if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
break;
@@ -792,7 +834,7 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
// pointers.
for (Function *F : SCCNodes) {
// Already noalias.
- if (F->doesNotAlias(0))
+ if (F->returnDoesNotAlias())
continue;
// We can infer and propagate function attributes only when we know that the
@@ -812,10 +854,11 @@ static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
bool MadeChange = false;
for (Function *F : SCCNodes) {
- if (F->doesNotAlias(0) || !F->getReturnType()->isPointerTy())
+ if (F->returnDoesNotAlias() ||
+ !F->getReturnType()->isPointerTy())
continue;
- F->setDoesNotAlias(0);
+ F->setReturnDoesNotAlias();
++NumNoAlias;
MadeChange = true;
}
@@ -905,7 +948,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
// pointers.
for (Function *F : SCCNodes) {
// Already nonnull.
- if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::NonNull))
continue;
@@ -926,7 +969,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
// Mark the function eagerly since we may discover a function
// which prevents us from speculating about the entire SCC
DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
}
@@ -939,13 +982,13 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
if (SCCReturnsNonNull) {
for (Function *F : SCCNodes) {
- if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
Attribute::NonNull) ||
!F->getReturnType()->isPointerTy())
continue;
DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNullReturn;
MadeChange = true;
}
@@ -1144,6 +1187,10 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
SCCNodes.insert(F);
}
+ // Skip it if the SCC only contains optnone functions.
+ if (SCCNodes.empty())
+ return Changed;
+
Changed |= addArgumentReturnedAttrs(SCCNodes);
Changed |= addReadAttrs(SCCNodes, AARGetter);
Changed |= addArgumentAttrs(SCCNodes);
@@ -1163,19 +1210,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
if (skipSCC(SCC))
return false;
-
- // We compute dedicated AA results for each function in the SCC as needed. We
- // use a lambda referencing external objects so that they live long enough to
- // be queried, but we re-use them each time.
- Optional<BasicAAResult> BAR;
- Optional<AAResults> AAR;
- auto AARGetter = [&](Function &F) -> AAResults & {
- BAR.emplace(createLegacyPMBasicAAResult(*this, F));
- AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
- return *AAR;
- };
-
- return runImpl(SCC, AARGetter);
+ return runImpl(SCC, LegacyAARGetter(*this));
}
namespace {
@@ -1275,16 +1310,9 @@ PreservedAnalyses
ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
auto &CG = AM.getResult<CallGraphAnalysis>(M);
- bool Changed = deduceFunctionAttributeInRPO(M, CG);
-
- // CallGraphAnalysis holds AssertingVH and must be invalidated eagerly so
- // that other passes don't delete stuff from under it.
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<CallGraphAnalysis>(M);
-
- if (!Changed)
+ if (!deduceFunctionAttributeInRPO(M, CG))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
PA.preserve<CallGraphAnalysis>();
return PA;
diff --git a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 6b32f6c..233a36d 100644
--- a/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -25,7 +26,6 @@
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Object/IRObjectFile.h"
-#include "llvm/Object/ModuleSummaryIndexObjectFile.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/SourceMgr.h"
@@ -64,6 +64,12 @@ static cl::opt<float> ImportHotMultiplier(
"import-hot-multiplier", cl::init(3.0), cl::Hidden, cl::value_desc("x"),
cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
+static cl::opt<float> ImportCriticalMultiplier(
+ "import-critical-multiplier", cl::init(100.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc(
+ "Multiply the `import-instr-limit` threshold for critical callsites"));
+
// FIXME: This multiplier was not really tuned up.
static cl::opt<float> ImportColdMultiplier(
"import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"),
@@ -75,12 +81,6 @@ static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
cl::desc("Compute dead symbols"));
-// Temporary allows the function import pass to disable always linking
-// referenced discardable symbols.
-static cl::opt<bool>
- DontForceImportReferencedDiscardableSymbols("disable-force-link-odr",
- cl::init(false), cl::Hidden);
-
static cl::opt<bool> EnableImportMetadata(
"enable-import-metadata", cl::init(
#if !defined(NDEBUG)
@@ -123,8 +123,8 @@ namespace {
/// - [insert you fancy metric here]
static const GlobalValueSummary *
selectCallee(const ModuleSummaryIndex &Index,
- const GlobalValueSummaryList &CalleeSummaryList,
- unsigned Threshold) {
+ ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
+ unsigned Threshold, StringRef CallerModulePath) {
auto It = llvm::find_if(
CalleeSummaryList,
[&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
@@ -145,6 +145,21 @@ selectCallee(const ModuleSummaryIndex &Index,
auto *Summary = cast<FunctionSummary>(GVSummary);
+ // If this is a local function, make sure we import the copy
+ // in the caller's module. The only time a local function can
+ // share an entry in the index is if there is a local with the same name
+ // in another module that had the same source file name (in a different
+ // directory), where each was compiled in its own directory so there
+ // was no distinguishing path.
+ // However, do the import from another module if there is only one
+ // entry in the list - in that case this must be a reference due
+ // to indirect call profile data, since a function pointer can point to
+ // a local in another module.
+ if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
+ CalleeSummaryList.size() > 1 &&
+ Summary->modulePath() != CallerModulePath)
+ return false;
+
if (Summary->instCount() > Threshold)
return false;
@@ -159,17 +174,6 @@ selectCallee(const ModuleSummaryIndex &Index,
return cast<GlobalValueSummary>(It->get());
}
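The local-linkage guard above can be summarized as a predicate: an internal-linkage callee is only imported from the caller's own module, unless its summary list has exactly one entry (which the comment attributes to indirect call profile data). A stand-in with strings for module paths:

    #include <cstddef>
    #include <string>

    bool mayImportLocal(bool IsLocalLinkage, size_t SummaryListSize,
                        const std::string &CalleeModule,
                        const std::string &CallerModule) {
      if (!IsLocalLinkage)
        return true;   // non-local: any copy is interchangeable
      return SummaryListSize == 1 || CalleeModule == CallerModule;
    }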
-/// Return the summary for the function \p GUID that fits the \p Threshold, or
-/// null if there's no match.
-static const GlobalValueSummary *selectCallee(GlobalValue::GUID GUID,
- unsigned Threshold,
- const ModuleSummaryIndex &Index) {
- auto CalleeSummaryList = Index.findGlobalValueSummaryList(GUID);
- if (CalleeSummaryList == Index.end())
- return nullptr; // This function does not have a summary
- return selectCallee(Index, CalleeSummaryList->second, Threshold);
-}
-
using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */,
GlobalValue::GUID>;
@@ -183,10 +187,23 @@ static void computeImportForFunction(
FunctionImporter::ImportMapTy &ImportList,
StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
for (auto &Edge : Summary.calls()) {
- auto GUID = Edge.first.getGUID();
- DEBUG(dbgs() << " edge -> " << GUID << " Threshold:" << Threshold << "\n");
+ ValueInfo VI = Edge.first;
+ DEBUG(dbgs() << " edge -> " << VI.getGUID() << " Threshold:" << Threshold
+ << "\n");
+
+ if (VI.getSummaryList().empty()) {
+ // For SamplePGO, the indirect call targets for local functions will
+ // have their original names annotated in the profile. We try to find the
+ // corresponding PGOFuncName as the GUID.
+ auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
+ if (GUID == 0)
+ continue;
+ VI = Index.getValueInfo(GUID);
+ if (!VI)
+ continue;
+ }
- if (DefinedGVSummaries.count(GUID)) {
+ if (DefinedGVSummaries.count(VI.getGUID())) {
DEBUG(dbgs() << "ignored! Target already in destination module.\n");
continue;
}
@@ -196,13 +213,16 @@ static void computeImportForFunction(
return ImportHotMultiplier;
if (Hotness == CalleeInfo::HotnessType::Cold)
return ImportColdMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Critical)
+ return ImportCriticalMultiplier;
return 1.0;
};
const auto NewThreshold =
Threshold * GetBonusMultiplier(Edge.second.Hotness);
- auto *CalleeSummary = selectCallee(GUID, NewThreshold, Index);
+ auto *CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
+ Summary.modulePath());
if (!CalleeSummary) {
DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n");
continue;
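The hotness handling above scales the import-instr-limit threshold per call edge, with the defaults from this diff: 3.0 for hot, 0 for cold, and 100.0 for critical callsites. A toy mirror of GetBonusMultiplier:

    // Hypothetical mirror of GetBonusMultiplier using the cl::opt defaults
    // above; a non-annotated edge keeps the base threshold.
    enum class Hotness { None, Hot, Cold, Critical };

    double bonusMultiplier(Hotness H) {
      switch (H) {
      case Hotness::Hot:      return 3.0;
      case Hotness::Cold:     return 0.0;  // effectively disables the import
      case Hotness::Critical: return 100.0;
      default:                return 1.0;
      }
    }

    // E.g. with a hypothetical base threshold of 100 instructions, a
    // critical callee of up to 100 * 100.0 = 10000 instructions qualifies.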
@@ -234,7 +254,7 @@ static void computeImportForFunction(
const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
auto ExportModulePath = ResolvedCalleeSummary->modulePath();
- auto &ProcessedThreshold = ImportList[ExportModulePath][GUID];
+ auto &ProcessedThreshold = ImportList[ExportModulePath][VI.getGUID()];
/// Since the traversal of the call graph is DFS, we can revisit a function
/// a second time with a higher threshold. In this case, it is added back to
/// the worklist with the new threshold.
@@ -250,7 +270,7 @@ static void computeImportForFunction(
// Make exports in the source module.
if (ExportLists) {
auto &ExportList = (*ExportLists)[ExportModulePath];
- ExportList.insert(GUID);
+ ExportList.insert(VI.getGUID());
if (!PreviouslyImported) {
// This is the first time this function was exported from its source
// module, so mark all functions and globals it references as exported
@@ -270,7 +290,7 @@ static void computeImportForFunction(
}
// Insert the newly imported function to the worklist.
- Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, GUID);
+ Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold, VI.getGUID());
}
}
@@ -280,8 +300,7 @@ static void computeImportForFunction(
static void ComputeImportForModule(
const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr,
- const DenseSet<GlobalValue::GUID> *DeadSymbols = nullptr) {
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
// Worklist contains the list of functions imported in this module, for which
// we will analyse the callees and may import further down the callgraph.
SmallVector<EdgeInfo, 128> Worklist;
@@ -289,7 +308,7 @@ static void ComputeImportForModule(
// Populate the worklist with the import for the functions in the current
// module
for (auto &GVSummary : DefinedGVSummaries) {
- if (DeadSymbols && DeadSymbols->count(GVSummary.first)) {
+ if (!Index.isGlobalValueLive(GVSummary.second)) {
DEBUG(dbgs() << "Ignores Dead GUID: " << GVSummary.first << "\n");
continue;
}
@@ -332,15 +351,14 @@ void llvm::ComputeCrossModuleImport(
const ModuleSummaryIndex &Index,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists,
- const DenseSet<GlobalValue::GUID> *DeadSymbols) {
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
// For each module that has functions defined, compute the import/export lists.
for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
auto &ImportList = ImportLists[DefinedGVSummaries.first()];
DEBUG(dbgs() << "Computing import for Module '"
<< DefinedGVSummaries.first() << "'\n");
ComputeImportForModule(DefinedGVSummaries.second, Index, ImportList,
- &ExportLists, DeadSymbols);
+ &ExportLists);
}
// When computing imports we added all GUIDs referenced by anything
@@ -402,84 +420,71 @@ void llvm::ComputeCrossModuleImportForModule(
#endif
}
-DenseSet<GlobalValue::GUID> llvm::computeDeadSymbols(
- const ModuleSummaryIndex &Index,
+void llvm::computeDeadSymbols(
+ ModuleSummaryIndex &Index,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+ assert(!Index.withGlobalValueDeadStripping());
if (!ComputeDead)
- return DenseSet<GlobalValue::GUID>();
+ return;
if (GUIDPreservedSymbols.empty())
// Don't do anything when nothing is live, this is friendly with tests.
- return DenseSet<GlobalValue::GUID>();
- DenseSet<GlobalValue::GUID> LiveSymbols = GUIDPreservedSymbols;
- SmallVector<GlobalValue::GUID, 128> Worklist;
- Worklist.reserve(LiveSymbols.size() * 2);
- for (auto GUID : LiveSymbols) {
- DEBUG(dbgs() << "Live root: " << GUID << "\n");
- Worklist.push_back(GUID);
- }
- // Add values flagged in the index as live roots to the worklist.
- for (const auto &Entry : Index) {
- bool IsLiveRoot = llvm::any_of(
- Entry.second,
- [&](const std::unique_ptr<llvm::GlobalValueSummary> &Summary) {
- return Summary->liveRoot();
- });
- if (!IsLiveRoot)
+ return;
+ unsigned LiveSymbols = 0;
+ SmallVector<ValueInfo, 128> Worklist;
+ Worklist.reserve(GUIDPreservedSymbols.size() * 2);
+ for (auto GUID : GUIDPreservedSymbols) {
+ ValueInfo VI = Index.getValueInfo(GUID);
+ if (!VI)
continue;
- DEBUG(dbgs() << "Live root (summary): " << Entry.first << "\n");
- Worklist.push_back(Entry.first);
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
}
- while (!Worklist.empty()) {
- auto GUID = Worklist.pop_back_val();
- auto It = Index.findGlobalValueSummaryList(GUID);
- if (It == Index.end()) {
- DEBUG(dbgs() << "Not in index: " << GUID << "\n");
- continue;
- }
-
- // FIXME: we should only make the prevailing copy live here
- for (auto &Summary : It->second) {
- for (auto Ref : Summary->refs()) {
- auto RefGUID = Ref.getGUID();
- if (LiveSymbols.insert(RefGUID).second) {
- DEBUG(dbgs() << "Marking live (ref): " << RefGUID << "\n");
- Worklist.push_back(RefGUID);
- }
- }
- if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) {
- for (auto Call : FS->calls()) {
- auto CallGUID = Call.first.getGUID();
- if (LiveSymbols.insert(CallGUID).second) {
- DEBUG(dbgs() << "Marking live (call): " << CallGUID << "\n");
- Worklist.push_back(CallGUID);
- }
- }
+ // Add values flagged in the index as live roots to the worklist.
+ for (const auto &Entry : Index)
+ for (auto &S : Entry.second.SummaryList)
+ if (S->isLive()) {
+ DEBUG(dbgs() << "Live root: " << Entry.first << "\n");
+ Worklist.push_back(ValueInfo(&Entry));
+ ++LiveSymbols;
+ break;
}
+
+ // Make value live and add it to the worklist if it was not live before.
+ // FIXME: we should only make the prevailing copy live here
+ auto visit = [&](ValueInfo VI) {
+ for (auto &S : VI.getSummaryList())
+ if (S->isLive())
+ return;
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ ++LiveSymbols;
+ Worklist.push_back(VI);
+ };
+
+ while (!Worklist.empty()) {
+ auto VI = Worklist.pop_back_val();
+ for (auto &Summary : VI.getSummaryList()) {
+ for (auto Ref : Summary->refs())
+ visit(Ref);
+ if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
+ for (auto Call : FS->calls())
+ visit(Call.first);
if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
auto AliaseeGUID = AS->getAliasee().getOriginalName();
- if (LiveSymbols.insert(AliaseeGUID).second) {
- DEBUG(dbgs() << "Marking live (alias): " << AliaseeGUID << "\n");
- Worklist.push_back(AliaseeGUID);
- }
+ ValueInfo AliaseeVI = Index.getValueInfo(AliaseeGUID);
+ if (AliaseeVI)
+ visit(AliaseeVI);
}
}
}
- DenseSet<GlobalValue::GUID> DeadSymbols;
- DeadSymbols.reserve(
- std::min(Index.size(), Index.size() - LiveSymbols.size()));
- for (auto &Entry : Index) {
- auto GUID = Entry.first;
- if (!LiveSymbols.count(GUID)) {
- DEBUG(dbgs() << "Marking dead: " << GUID << "\n");
- DeadSymbols.insert(GUID);
- }
- }
- DEBUG(dbgs() << LiveSymbols.size() << " symbols Live, and "
- << DeadSymbols.size() << " symbols Dead \n");
- NumDeadSymbols += DeadSymbols.size();
- NumLiveSymbols += LiveSymbols.size();
- return DeadSymbols;
+ Index.setWithGlobalValueDeadStripping();
+
+ unsigned DeadSymbols = Index.size() - LiveSymbols;
+ DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
+ << " symbols Dead \n");
+ NumDeadSymbols += DeadSymbols;
+ NumLiveSymbols += LiveSymbols;
}
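The rewritten computeDeadSymbols above is a mark phase: preserved GUIDs and index-flagged roots seed a worklist, and liveness propagates along refs, call edges, and aliasee links to a fixed point; whatever stays unmarked is dead. A minimal version over an integer-indexed graph:

    #include <vector>

    // Hypothetical summary graph; refs, calls, and aliasees are collapsed
    // into one outgoing edge list per node.
    struct Node { std::vector<int> Refs; bool Live = false; };

    void markLive(std::vector<Node> &Index, std::vector<int> Worklist) {
      for (int N : Worklist)
        Index[N].Live = true;                 // seed the root set
      while (!Worklist.empty()) {
        int N = Worklist.back();
        Worklist.pop_back();
        for (int R : Index[N].Refs)
          if (!Index[R].Live) {               // visit each node at most once
            Index[R].Live = true;
            Worklist.push_back(R);
          }
      }
    }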
/// Compute the set of summaries needed for a ThinLTO backend compilation of
@@ -522,9 +527,24 @@ llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename,
/// Fixup WeakForLinker linkages in \p TheModule based on summary analysis.
void llvm::thinLTOResolveWeakForLinkerModule(
Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
+ auto ConvertToDeclaration = [](GlobalValue &GV) {
+ DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName() << "\n");
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ F->clearMetadata();
+ } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ V->setInitializer(nullptr);
+ V->setLinkage(GlobalValue::ExternalLinkage);
+ V->clearMetadata();
+ } else
+ // For now we don't resolve or drop aliases. Once we do we'll
+ // need to add support here for creating either a function or
+ // variable declaration, and return the new GlobalValue* for
+ // the caller to use.
+ llvm_unreachable("Expected function or variable");
+ };
+
auto updateLinkage = [&](GlobalValue &GV) {
- if (!GlobalValue::isWeakForLinker(GV.getLinkage()))
- return;
// See if the global summary analysis computed a new resolved linkage.
const auto &GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end())
@@ -532,18 +552,40 @@ void llvm::thinLTOResolveWeakForLinkerModule(
auto NewLinkage = GS->second->linkage();
if (NewLinkage == GV.getLinkage())
return;
- DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
- << GV.getLinkage() << " to " << NewLinkage << "\n");
- GV.setLinkage(NewLinkage);
- // Remove functions converted to available_externally from comdats,
+
+ // Switch the linkage to weakany if asked for, e.g. we do this for
+ // linker redefined symbols (via --wrap or --defsym).
+ // We record that the visibility should be changed here in `addThinLTO`
+ // as we need access to the resolution vectors for each input file in
+ // order to find which symbols have been redefined.
+ // We may consider reorganizing this code and moving the linkage recording
+ // somewhere else, e.g. in thinLTOResolveWeakForLinkerInIndex.
+ if (NewLinkage == GlobalValue::WeakAnyLinkage) {
+ GV.setLinkage(NewLinkage);
+ return;
+ }
+
+ if (!GlobalValue::isWeakForLinker(GV.getLinkage()))
+ return;
+ // Check for a non-prevailing def that has interposable linkage
+ // (e.g. non-odr weak or linkonce). In that case we can't simply
+ // convert to available_externally, since it would lose the
+ // interposable property and possibly get inlined. Simply drop
+ // the definition in that case.
+ if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
+ GlobalValue::isInterposableLinkage(GV.getLinkage()))
+ ConvertToDeclaration(GV);
+ else {
+ DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from "
+ << GV.getLinkage() << " to " << NewLinkage << "\n");
+ GV.setLinkage(NewLinkage);
+ }
+ // Remove declarations from comdats, including available_externally
// as this is a declaration for the linker, and will be dropped eventually.
// It is illegal for comdats to contain declarations.
auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
- if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
- assert(GO->hasAvailableExternallyLinkage() &&
- "Expected comdat on definition (possibly available external)");
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat())
GO->setComdat(nullptr);
- }
};
// Process functions and globals now
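
Taken together, the updated updateLinkage implements a small decision table. The following standalone C++ sketch models that rule; the enum and helpers are illustrative stand-ins, not LLVM's types:

// Simplified linkage kinds, for illustration only.
enum class Linkage { WeakAny, WeakODR, LinkOnce, LinkOnceODR,
                     AvailableExternally, External };

// Non-ODR weak/linkonce definitions may be interposed at link time.
bool isInterposable(Linkage L) {
  return L == Linkage::WeakAny || L == Linkage::LinkOnce;
}

bool isWeakForLinker(Linkage L) {
  return L == Linkage::WeakAny || L == Linkage::WeakODR ||
         L == Linkage::LinkOnce || L == Linkage::LinkOnceODR;
}

enum class Action { Keep, SetLinkage, DropDefinition };

// Mirrors the patch: a linker-redefined symbol takes weak-any and keeps its
// body; an interposable definition resolved to available_externally must be
// dropped outright, since available_externally bodies may be inlined and
// would lose the interposable property; everything else adopts the resolved
// linkage.
Action resolveWeakForLinker(Linkage Current, Linkage Resolved) {
  if (Resolved == Current)
    return Action::Keep;
  if (Resolved == Linkage::WeakAny)
    return Action::SetLinkage;
  if (!isWeakForLinker(Current))
    return Action::Keep;
  if (Resolved == Linkage::AvailableExternally && isInterposable(Current))
    return Action::DropDefinition; // ConvertToDeclaration in the patch
  return Action::SetLinkage;
}
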
@@ -562,7 +604,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// the current module.
StringSet<> AsmUndefinedRefs;
ModuleSymbolTable::CollectAsmSymbols(
- Triple(TheModule.getTargetTriple()), TheModule.getModuleInlineAsm(),
+ TheModule,
[&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
if (Flags & object::BasicSymbolRef::SF_Undefined)
AsmUndefinedRefs.insert(Name);
@@ -576,8 +618,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
return true;
// Lookup the linkage recorded in the summaries during global analysis.
- const auto &GS = DefinedGlobals.find(GV.getGUID());
- GlobalValue::LinkageTypes Linkage;
+ auto GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end()) {
// Must have been promoted (possibly conservatively). Find original
// name so that we can access the correct summary and see if it can
@@ -589,7 +630,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage,
TheModule.getSourceFileName());
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
if (GS == DefinedGlobals.end()) {
// Also check the original non-promoted non-globalized name. In some
// cases a preempted weak value is linked in as a local copy because
@@ -597,15 +638,11 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// In that case, since it was originally not a local value, it was
// recorded in the index using the original name.
// FIXME: This may not be needed once PR27866 is fixed.
- const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
assert(GS != DefinedGlobals.end());
- Linkage = GS->second->linkage();
- } else {
- Linkage = GS->second->linkage();
}
- } else
- Linkage = GS->second->linkage();
- return !GlobalValue::isLocalLinkage(Linkage);
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
};
// FIXME: See if we can just internalize directly here via linkage changes
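
The three-step GUID fallback above is the subtle part of this change. A hedged standalone model, where std::map and std::hash stand in for the summary index and LLVM's GUID hashing:

#include <cstdint>
#include <functional>
#include <map>
#include <optional>
#include <string>

using GUID = std::uint64_t;
enum class Linkage { Internal, External };

// Stand-ins for GlobalValue::getGUID / getGlobalIdentifier (illustrative).
GUID guidOf(const std::string &Name) { return std::hash<std::string>{}(Name); }
GUID guidOfLocal(const std::string &Name, const std::string &SrcFile) {
  return guidOf(SrcFile + ":" + Name);
}

// Mirrors the lookup chain: the current (possibly promoted) name first, then
// the original name hashed as a module-local identifier, then the bare
// original name (see the PR27866 FIXME above).
std::optional<Linkage> findSummaryLinkage(
    const std::map<GUID, Linkage> &Summaries, const std::string &CurName,
    const std::string &OrigName, const std::string &SrcFile) {
  for (GUID G : {guidOf(CurName), guidOfLocal(OrigName, SrcFile),
                 guidOf(OrigName)})
    if (auto It = Summaries.find(G); It != Summaries.end())
      return It->second;
  return std::nullopt;
}
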
@@ -617,14 +654,12 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// index.
//
Expected<bool> FunctionImporter::importFunctions(
- Module &DestModule, const FunctionImporter::ImportMapTy &ImportList,
- bool ForceImportReferencedDiscardableSymbols) {
+ Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
DEBUG(dbgs() << "Starting import for Module "
<< DestModule.getModuleIdentifier() << "\n");
unsigned ImportedCount = 0;
- // Linker that will be used for importing function
- Linker TheLinker(DestModule);
+ IRMover Mover(DestModule);
// Do the actual import of functions now, one Module at a time
std::set<StringRef> ModuleNameOrderedList;
for (auto &FunctionsToImportPerModule : ImportList) {
@@ -648,7 +683,7 @@ Expected<bool> FunctionImporter::importFunctions(
auto &ImportGUIDs = FunctionsToImportPerModule->second;
// Find the globals to import
- DenseSet<const GlobalValue *> GlobalsToImport;
+ SetVector<GlobalValue *> GlobalsToImport;
for (Function &F : *SrcModule) {
if (!F.hasName())
continue;
@@ -687,6 +722,13 @@ Expected<bool> FunctionImporter::importFunctions(
}
}
for (GlobalAlias &GA : SrcModule->aliases()) {
+ // FIXME: This should eventually be controlled entirely by the summary.
+ if (FunctionImportGlobalProcessing::doImportAsDefinition(
+ &GA, &GlobalsToImport)) {
+ GlobalsToImport.insert(&GA);
+ continue;
+ }
+
if (!GA.hasName())
continue;
auto GUID = GA.getGUID();
@@ -731,12 +773,9 @@ Expected<bool> FunctionImporter::importFunctions(
<< " from " << SrcModule->getSourceFileName() << "\n";
}
- // Instruct the linker that the client will take care of linkonce resolution
- unsigned Flags = Linker::Flags::None;
- if (!ForceImportReferencedDiscardableSymbols)
- Flags |= Linker::Flags::DontForceLinkLinkonceODR;
-
- if (TheLinker.linkInModule(std::move(SrcModule), Flags, &GlobalsToImport))
+ if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(),
+ [](GlobalValue &, IRMover::ValueAdder) {},
+ /*IsPerformingImport=*/true))
report_fatal_error("Function Import: link error");
ImportedCount += GlobalsToImport.size();
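
In isolation, the switch from Linker to IRMover boils down to the call below. This is a hedged sketch using only the calls visible in the patch; pickGlobalsToImport is a hypothetical helper standing in for the GUID-driven selection loop above:

#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Module.h"
#include "llvm/Linker/IRMover.h"
#include "llvm/Support/ErrorHandling.h"
#include <memory>
using namespace llvm;

SetVector<GlobalValue *> pickGlobalsToImport(Module &SrcModule); // hypothetical

void importOneModule(Module &DestModule, std::unique_ptr<Module> SrcModule) {
  IRMover Mover(DestModule);
  SetVector<GlobalValue *> GlobalsToImport = pickGlobalsToImport(*SrcModule);
  // Unlike Linker::linkInModule, IRMover::move takes the exact value set and
  // leaves linkonce resolution entirely to the client, which is why the
  // ForceImportReferencedDiscardableSymbols flag disappears.
  if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(),
                 [](GlobalValue &, IRMover::ValueAdder) {},
                 /*IsPerformingImport=*/true))
    report_fatal_error("Function Import: link error");
}
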
@@ -778,7 +817,7 @@ static bool doImportingForModule(Module &M) {
// is only enabled when testing importing via the 'opt' tool, which does
// not do the ThinLink that would normally determine what values to promote.
for (auto &I : *Index) {
- for (auto &S : I.second) {
+ for (auto &S : I.second.SummaryList) {
if (GlobalValue::isLocalLinkage(S->linkage()))
S->setLinkage(GlobalValue::ExternalLinkage);
}
@@ -796,8 +835,7 @@ static bool doImportingForModule(Module &M) {
return loadFile(Identifier, M.getContext());
};
FunctionImporter Importer(*Index, ModuleLoader);
- Expected<bool> Result = Importer.importFunctions(
- M, ImportList, !DontForceImportReferencedDiscardableSymbols);
+ Expected<bool> Result = Importer.importFunctions(M, ImportList);
// FIXME: Probably need to propagate Errors through the pass manager.
if (!Result) {
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 7a04de3..c91e8b4 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -25,7 +25,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CtorUtils.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include <unordered_map>
+
using namespace llvm;
#define DEBUG_TYPE "globaldce"
@@ -50,7 +50,14 @@ namespace {
if (skipModule(M))
return false;
+ // We need a minimally functional dummy module analysis manager. It needs
+ // to at least know about the possibility of proxying a function analysis
+ // manager.
+ FunctionAnalysisManager DummyFAM;
ModuleAnalysisManager DummyMAM;
+ DummyMAM.registerPass(
+ [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); });
+
auto PA = Impl.run(M, DummyMAM);
return !PA.areAllPreserved();
}
@@ -78,9 +85,67 @@ static bool isEmptyFunction(Function *F) {
return RI.getReturnValue() == nullptr;
}
-PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
+/// Compute the set of GlobalValues that depend on V.
+/// The recursion stops as soon as a GlobalValue is encountered.
+void GlobalDCEPass::ComputeDependencies(Value *V,
+ SmallPtrSetImpl<GlobalValue *> &Deps) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Function *Parent = I->getParent()->getParent();
+ Deps.insert(Parent);
+ } else if (auto *GV = dyn_cast<GlobalValue>(V)) {
+ Deps.insert(GV);
+ } else if (auto *CE = dyn_cast<Constant>(V)) {
+ // Avoid walking the whole tree of a big ConstantExpr multiple times.
+ auto Where = ConstantDependenciesCache.find(CE);
+ if (Where != ConstantDependenciesCache.end()) {
+ auto const &K = Where->second;
+ Deps.insert(K.begin(), K.end());
+ } else {
+ SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE];
+ for (User *CEUser : CE->users())
+ ComputeDependencies(CEUser, LocalDeps);
+ Deps.insert(LocalDeps.begin(), LocalDeps.end());
+ }
+ }
+}
+
+void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
+ SmallPtrSet<GlobalValue *, 8> Deps;
+ for (User *User : GV.users())
+ ComputeDependencies(User, Deps);
+ Deps.erase(&GV); // Remove self-reference.
+ for (GlobalValue *GVU : Deps) {
+ GVDependencies.insert(std::make_pair(GVU, &GV));
+ }
+}
+
+/// Mark the given GlobalValue as live.
+void GlobalDCEPass::MarkLive(GlobalValue &GV,
+ SmallVectorImpl<GlobalValue *> *Updates) {
+ auto const Ret = AliveGlobals.insert(&GV);
+ if (!Ret.second)
+ return;
+
+ if (Updates)
+ Updates->push_back(&GV);
+ if (Comdat *C = GV.getComdat()) {
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C)))
+ MarkLive(*CM.second, Updates); // Recursion depth is only two because only
+ // globals in the same comdat are visited.
+ }
+}
+
+PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
bool Changed = false;
+ // The algorithm first computes the set L of global variables that are
+ // trivially live. Then it walks the initialization of these variables to
+ // compute the globals used to initialize them, which effectively builds a
+ // directed graph where nodes are global variables, and an edge from A to B
+ // means B is used to initialize A. Finally, it propagates the liveness
+ // information through the graph starting from the nodes in L. Nodes not
+ // marked as alive are discarded.
+
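
The comment above describes a plain worklist fixpoint. A standalone C++ model of the propagation, where nodes are ints and an edge A -> B records that A uses B (matching GVDependencies):

#include <set>
#include <unordered_map>
#include <vector>

using DepGraph = std::unordered_multimap<int, int>; // user -> used global

std::set<int> propagateLiveness(const DepGraph &Deps,
                                std::vector<int> Worklist) {
  std::set<int> Alive(Worklist.begin(), Worklist.end()); // trivially live L
  while (!Worklist.empty()) {
    int GV = Worklist.back();
    Worklist.pop_back();
    auto Range = Deps.equal_range(GV);
    for (auto It = Range.first; It != Range.second; ++It)
      if (Alive.insert(It->second).second) // newly live: keep propagating
        Worklist.push_back(It->second);
  }
  return Alive; // everything not in here is dead and can be dropped
}
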
// Remove empty functions from the global ctors list.
Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
@@ -103,21 +168,39 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
// initializer.
if (!GO.isDeclaration() && !GO.hasAvailableExternallyLinkage())
if (!GO.isDiscardableIfUnused())
- GlobalIsNeeded(&GO);
+ MarkLive(GO);
+
+ UpdateGVDependencies(GO);
}
+ // Compute direct dependencies of aliases.
for (GlobalAlias &GA : M.aliases()) {
Changed |= RemoveUnusedGlobalValue(GA);
// Externally visible aliases are needed.
if (!GA.isDiscardableIfUnused())
- GlobalIsNeeded(&GA);
+ MarkLive(GA);
+
+ UpdateGVDependencies(GA);
}
+ // Compute direct dependencies of ifuncs.
for (GlobalIFunc &GIF : M.ifuncs()) {
Changed |= RemoveUnusedGlobalValue(GIF);
// Externally visible ifuncs are needed.
if (!GIF.isDiscardableIfUnused())
- GlobalIsNeeded(&GIF);
+ MarkLive(GIF);
+
+ UpdateGVDependencies(GIF);
+ }
+
+ // Propagate liveness from collected Global Values through the computed
+ // dependencies.
+ SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(),
+ AliveGlobals.end()};
+ while (!NewLiveGVs.empty()) {
+ GlobalValue *LGV = NewLiveGVs.pop_back_val();
+ for (auto &&GVD : make_range(GVDependencies.equal_range(LGV)))
+ MarkLive(*GVD.second, &NewLiveGVs);
}
// Now that all globals which are needed are in the AliveGlobals set, we loop
@@ -154,7 +237,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
GA.setAliasee(nullptr);
}
- // The third pass drops targets of ifuncs which are dead...
+ // The fourth pass drops targets of ifuncs which are dead...
std::vector<GlobalIFunc*> DeadIFuncs;
for (GlobalIFunc &GIF : M.ifuncs())
if (!AliveGlobals.count(&GIF)) {
@@ -188,7 +271,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
// Make sure that all memory is released
AliveGlobals.clear();
- SeenConstants.clear();
+ ConstantDependenciesCache.clear();
+ GVDependencies.clear();
ComdatMembers.clear();
if (Changed)
@@ -196,60 +280,6 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) {
return PreservedAnalyses::all();
}
-/// GlobalIsNeeded - the specific global value as needed, and
-/// recursively mark anything that it uses as also needed.
-void GlobalDCEPass::GlobalIsNeeded(GlobalValue *G) {
- // If the global is already in the set, no need to reprocess it.
- if (!AliveGlobals.insert(G).second)
- return;
-
- if (Comdat *C = G->getComdat()) {
- for (auto &&CM : make_range(ComdatMembers.equal_range(C)))
- GlobalIsNeeded(CM.second);
- }
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) {
- // If this is a global variable, we must make sure to add any global values
- // referenced by the initializer to the alive set.
- if (GV->hasInitializer())
- MarkUsedGlobalsAsNeeded(GV->getInitializer());
- } else if (GlobalIndirectSymbol *GIS = dyn_cast<GlobalIndirectSymbol>(G)) {
- // The target of a global alias or ifunc is needed.
- MarkUsedGlobalsAsNeeded(GIS->getIndirectSymbol());
- } else {
- // Otherwise this must be a function object. We have to scan the body of
- // the function looking for constants and global values which are used as
- // operands. Any operands of these types must be processed to ensure that
- // any globals used will be marked as needed.
- Function *F = cast<Function>(G);
-
- for (Use &U : F->operands())
- MarkUsedGlobalsAsNeeded(cast<Constant>(U.get()));
-
- for (BasicBlock &BB : *F)
- for (Instruction &I : BB)
- for (Use &U : I.operands())
- if (GlobalValue *GV = dyn_cast<GlobalValue>(U))
- GlobalIsNeeded(GV);
- else if (Constant *C = dyn_cast<Constant>(U))
- MarkUsedGlobalsAsNeeded(C);
- }
-}
-
-void GlobalDCEPass::MarkUsedGlobalsAsNeeded(Constant *C) {
- if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
- return GlobalIsNeeded(GV);
-
- // Loop over all of the operands of the constant, adding any globals they
- // use to the list of needed globals.
- for (Use &U : C->operands()) {
- // If we've already processed this constant there's no need to do it again.
- Constant *Op = dyn_cast<Constant>(U);
- if (Op && SeenConstants.insert(Op).second)
- MarkUsedGlobalsAsNeeded(Op);
- }
-}
-
// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
// GlobalValue, looking for the constant pointer ref that may be pointing to it.
// If found, check to see if the constant pointer ref is safe to destroy, and if
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 5b0d5e3..93eab68 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -239,7 +239,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
// we delete a constant array, we may also be holding a pointer to one of its
// elements (or an element of one of its elements if we're dealing with an
// array of arrays) in the worklist.
- SmallVector<WeakVH, 8> WorkList(V->user_begin(), V->user_end());
+ SmallVector<WeakTrackingVH, 8> WorkList(V->user_begin(), V->user_end());
while (!WorkList.empty()) {
Value *UV = WorkList.pop_back_val();
if (!UV)
@@ -837,7 +837,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
// The global is initialized when the store to it occurs.
new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0,
- SI->getOrdering(), SI->getSynchScope(), SI);
+ SI->getOrdering(), SI->getSyncScopeID(), SI);
SI->eraseFromParent();
continue;
}
@@ -854,7 +854,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
// Replace the cmp X, 0 with a use of the bool value.
// Sink the load to where the compare was, if atomic rules allow us to.
Value *LV = new LoadInst(InitBool, InitBool->getName()+".val", false, 0,
- LI->getOrdering(), LI->getSynchScope(),
+ LI->getOrdering(), LI->getSyncScopeID(),
LI->isUnordered() ? (Instruction*)ICI : LI);
InitBoolUsed = true;
switch (ICI->getPredicate()) {
@@ -1605,7 +1605,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
assert(LI->getOperand(0) == GV && "Not a copy!");
// Insert a new load, to preserve the saved value.
StoreVal = new LoadInst(NewGV, LI->getName()+".b", false, 0,
- LI->getOrdering(), LI->getSynchScope(), LI);
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
} else {
assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
"This is not a form that we understand!");
@@ -1614,12 +1614,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
}
}
new StoreInst(StoreVal, NewGV, false, 0,
- SI->getOrdering(), SI->getSynchScope(), SI);
+ SI->getOrdering(), SI->getSyncScopeID(), SI);
} else {
// Change the load into a load of bool then a select.
LoadInst *LI = cast<LoadInst>(UI);
LoadInst *NLI = new LoadInst(NewGV, LI->getName()+".b", false, 0,
- LI->getOrdering(), LI->getSynchScope(), LI);
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
Value *NSI;
if (IsOneZero)
NSI = new ZExtInst(NLI, LI->getType(), "", LI);
@@ -1792,7 +1792,9 @@ static void makeAllConstantUsesInstructions(Constant *C) {
NewU->insertBefore(UI);
UI->replaceUsesOfWith(U, NewU);
}
- U->dropAllReferences();
+ // We've replaced all the uses, so destroy the constant. (destroyConstant
+ // will update value handles and metadata.)
+ U->destroyConstant();
}
}
@@ -1819,12 +1821,14 @@ static bool processInternalGlobal(
GS.AccessingFunction->doesNotRecurse() &&
isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
LookupDomTree)) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+
DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
->getEntryBlock().begin());
Type *ElemTy = GV->getValueType();
// FIXME: Pass Global's alignment when globals have alignment
- AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr,
+ AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
GV->getName(), &FirstI);
if (!isa<UndefValue>(GV->getInitializer()))
new StoreInst(GV->getInitializer(), Alloca, &FirstI);
@@ -1977,16 +1981,11 @@ static void ChangeCalleesToFastCall(Function *F) {
}
}
-static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) {
- for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
- unsigned Index = Attrs.getSlotIndex(i);
- if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest))
- continue;
-
- // There can be only one.
- return Attrs.removeAttribute(C, Index, Attribute::Nest);
- }
-
+static AttributeList StripNest(LLVMContext &C, AttributeList Attrs) {
+ // There can be at most one attribute set with a nest attribute.
+ unsigned NestIndex;
+ if (Attrs.hasAttrSomewhere(Attribute::Nest, &NestIndex))
+ return Attrs.removeAttribute(C, NestIndex, Attribute::Nest);
return Attrs;
}
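
A hedged usage sketch of the rewritten helper, mirroring how this file's RemoveNestAttribute applies it to a function and all of its call sites:

#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static void removeNestAttributes(Function *F) {
  F->setAttributes(StripNest(F->getContext(), F->getAttributes()));
  for (User *U : F->users())
    if (auto CS = CallSite(U)) // skip non-call users such as block addresses
      CS.setAttributes(StripNest(F->getContext(), CS.getAttributes()));
}
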
@@ -2027,6 +2026,24 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
continue;
}
+ // LLVM's definition of dominance allows instructions that are cyclic
+ // in unreachable blocks, e.g.:
+ // %pat = select i1 %condition, @global, i16* %pat
+ // because any instruction dominates an instruction in a block that's
+ // not reachable from entry.
+ // So, remove unreachable blocks from the function, because a) there's
+ // no point in analyzing them and b) GlobalOpt should otherwise grow
+ // some more complicated logic to break these cycles.
+ // Removing unreachable blocks might invalidate the dominator so we
+ // recalculate it.
+ if (!F->isDeclaration()) {
+ if (removeUnreachableBlocks(*F)) {
+ auto &DT = LookupDomTree(*F);
+ DT.recalculate(*F);
+ Changed = true;
+ }
+ }
+
Changed |= processGlobal(*F, TLI, LookupDomTree);
if (!F->hasLocalLinkage())
@@ -2387,7 +2404,7 @@ OptimizeGlobalAliases(Module &M,
}
static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
- LibFunc::Func F = LibFunc::cxa_atexit;
+ LibFunc F = LibFunc_cxa_atexit;
if (!TLI->has(F))
return nullptr;
@@ -2396,7 +2413,7 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
return nullptr;
// Make sure that the function has the correct prototype.
- if (!TLI->getLibFunc(*Fn, F) || F != LibFunc::cxa_atexit)
+ if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
return nullptr;
return Fn;
diff --git a/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp
index bbbd096..e47d881 100644
--- a/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -14,7 +14,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Constants.h"
@@ -23,6 +22,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
#include <set>
@@ -85,7 +85,16 @@ bool splitGlobal(GlobalVariable &GV) {
uint64_t ByteOffset = cast<ConstantInt>(
cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
->getZExtValue();
- if (ByteOffset < SplitBegin || ByteOffset >= SplitEnd)
+ // Type metadata may be attached one byte after the end of the vtable, for
+ // classes without virtual methods in the Itanium ABI. As far as we know, it
+ // is never attached to the first byte of a vtable. Subtract one to get the right
+ // slice.
+ // This is making an assumption that vtable groups are the only kinds of
+ // global variables that !type metadata can be attached to, and that they
+ // are either Itanium ABI vtable groups or contain a single vtable (i.e.
+ // Microsoft ABI vtables).
+ uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
+ if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
continue;
SplitGV->addMetadata(
LLVMContext::MD_type,
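
The offset adjustment above is an easy place for an off-by-one; isolated, it is just the following (a standalone illustration, not the pass's API):

#include <cstdint>

// !type metadata at offset 0 describes the first byte, while any other offset
// is taken to sit one byte past the end of the vtable it describes, so
// subtract one before the slice test.
bool typeMetadataInSlice(std::uint64_t ByteOffset, std::uint64_t SplitBegin,
                         std::uint64_t SplitEnd) {
  std::uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
  return AttachedTo >= SplitBegin && AttachedTo < SplitEnd;
}
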
diff --git a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
index 916135e..f79b610 100644
--- a/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -15,7 +15,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -24,6 +23,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
using namespace llvm;
#define DEBUG_TYPE "ipconstprop"
@@ -136,7 +136,13 @@ static bool PropagateConstantReturn(Function &F) {
// For more details, see GlobalValue::mayBeDerefined.
if (!F.isDefinitionExact())
return false;
-
+
+ // Don't touch naked functions. They may contain asm returning a
+ // value we don't see, so we may end up interprocedurally propagating
+ // the return value incorrectly.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return false;
+
// Check to see if this function returns a constant.
SmallVector<Value *,4> RetVals;
StructType *STy = dyn_cast<StructType>(F.getReturnType());
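
For illustration, a function of the kind this guard protects against (Clang on x86-64; the name is made up): the returned value exists only inside the asm, so an IR-level scan of ret instructions sees nothing to propagate, or worse, the wrong thing.

// Illustrative only: "returns" 42 purely via inline asm.
__attribute__((naked)) int always42(void) {
  __asm__ volatile("movl $42, %eax\n\tret");
}
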
diff --git a/contrib/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
index 89518f3..5bb305c 100644
--- a/contrib/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/IPO.cpp
@@ -13,10 +13,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/IPO.h"
-#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
diff --git a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
index 2ef299d..15d7515 100644
--- a/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -8,8 +8,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
diff --git a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
index 1770445..50e7cc8 100644
--- a/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -48,7 +48,7 @@ public:
}
explicit SimpleInliner(InlineParams Params)
- : LegacyInlinerBase(ID), Params(Params) {
+ : LegacyInlinerBase(ID), Params(std::move(Params)) {
initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
}
@@ -61,7 +61,8 @@ public:
[&](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
- return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI);
+ return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache,
+ /*GetBFI=*/None, PSI);
}
bool runOnSCC(CallGraphSCC &SCC) override;
@@ -92,8 +93,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
}
Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
- unsigned SizeOptLevel) {
- return new SimpleInliner(llvm::getInlineParams(OptLevel, SizeOptLevel));
+ unsigned SizeOptLevel,
+ bool DisableInlineHotCallSite) {
+ auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
+ if (DisableInlineHotCallSite)
+ Param.HotCallSiteThreshold = 0;
+ return new SimpleInliner(Param);
}
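
A hedged usage sketch of the new overload; the wrapper function is hypothetical:

#include "llvm/Transforms/IPO.h"

llvm::Pass *makeNoHotCallSiteInliner() {
  // HotCallSiteThreshold = 0 makes every hot call site fail the threshold.
  return llvm::createFunctionInliningPass(/*OptLevel=*/2, /*SizeOptLevel=*/0,
                                          /*DisableInlineHotCallSite=*/true);
}
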
Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
diff --git a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
index 3f4731c..317770d 100644
--- a/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -19,6 +19,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -260,8 +261,8 @@ static bool InlineCallIfPossible(
/// Return true if inlining of CS can block the caller from being inlined,
/// where inlining the caller is proved more beneficial. \p IC is the
/// estimated inline cost associated with callsite \p CS.
-/// \p TotalAltCost will be set to the estimated cost of inlining the caller
-/// if \p CS is suppressed for inlining.
+/// \p TotalSecondaryCost will be set to the estimated cost of inlining the
+/// caller if \p CS is suppressed for inlining.
static bool
shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
int &TotalSecondaryCost,
@@ -288,7 +289,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
// treating them as truly abstract units etc.
TotalSecondaryCost = 0;
// The candidate cost to be imposed upon the current function.
- int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
+ int CandidateCost = IC.getCost() - 1;
// This bool tracks what happens if we do NOT inline C into B.
bool callerWillBeRemoved = Caller->hasLocalLinkage();
// This bool tracks what happens if we DO inline C into B.
@@ -325,7 +326,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
// one is set very low by getInlineCost, in anticipation that Caller will
// be removed entirely. We did not account for this above unless there
// is only one caller of Caller.
- if (callerWillBeRemoved && !Caller->use_empty())
+ if (callerWillBeRemoved && !Caller->hasOneUse())
TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus;
if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost())
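
A loose standalone model of this deferral test, illustrative only (it drops the last-call-to-static bonus and the exactness checks): inlining is deferred when every outer call site of Caller would itself inline Caller and the summed secondary cost stays below the direct cost.

#include <vector>

bool deferInlining(int DirectCost, int CallerSize,
                   const std::vector<bool> &OuterSiteWouldInlineCaller) {
  int CandidateCost = DirectCost - 1; // the call instruction itself goes away
  int TotalSecondaryCost = 0;
  bool AllOuterSitesInline = !OuterSiteWouldInlineCaller.empty();
  for (bool WouldInline : OuterSiteWouldInlineCaller) {
    if (!WouldInline)
      AllOuterSitesInline = false;
    else
      TotalSecondaryCost += CallerSize + CandidateCost;
  }
  return AllOuterSitesInline && TotalSecondaryCost < DirectCost;
}
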
@@ -342,6 +343,7 @@ static bool shouldInline(CallSite CS,
InlineCost IC = GetInlineCost(CS);
Instruction *Call = CS.getInstruction();
Function *Callee = CS.getCalledFunction();
+ Function *Caller = CS.getCaller();
if (IC.isAlways()) {
DEBUG(dbgs() << " Inlining: cost=always"
@@ -355,19 +357,20 @@ static bool shouldInline(CallSite CS,
if (IC.isNever()) {
DEBUG(dbgs() << " NOT Inlining: cost=never"
<< ", Call: " << *CS.getInstruction() << "\n");
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee)
- << " should never be inlined (cost=never)");
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller)
+ << " because it should never be inlined (cost=never)");
return false;
}
- Function *Caller = CS.getCaller();
if (!IC) {
DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost()
<< ", thres=" << (IC.getCostDelta() + IC.getCost())
<< ", Call: " << *CS.getInstruction() << "\n");
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " too costly to inline (cost="
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
<< NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
return false;
@@ -378,8 +381,8 @@ static bool shouldInline(CallSite CS,
DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
<< " Cost = " << IC.getCost()
<< ", outer Cost = " << TotalSecondaryCost << '\n');
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE,
- "IncreaseCostInOtherContexts", Call)
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
+ Call)
<< "Not inlining. Cost of inlining " << NV("Callee", Callee)
<< " increases the cost of inlining " << NV("Caller", Caller)
<< " in other contexts");
@@ -499,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+ InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -516,52 +519,54 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
Function *Caller = CS.getCaller();
Function *Callee = CS.getCalledFunction();
- // If this call site is dead and it is to a readonly function, we should
- // just delete the call instead of trying to inline it, regardless of
- // size. This happens because IPSCCP propagates the result out of the
- // call and then we're left with the dead call.
- if (isInstructionTriviallyDead(CS.getInstruction(), &TLI)) {
- DEBUG(dbgs() << " -> Deleting dead call: " << *CS.getInstruction()
- << "\n");
- // Update the call graph by deleting the edge from Callee to Caller.
- CG[Caller]->removeCallEdgeFor(CS);
- CS.getInstruction()->eraseFromParent();
- ++NumCallsDeleted;
- } else {
- // We can only inline direct calls to non-declarations.
- if (!Callee || Callee->isDeclaration())
- continue;
+ // We can only inline direct calls to non-declarations.
+ if (!Callee || Callee->isDeclaration())
+ continue;
+ Instruction *Instr = CS.getInstruction();
+
+ bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI);
+
+ int InlineHistoryID;
+ if (!IsTriviallyDead) {
// If this call site was obtained by inlining another function, verify
// that the include path for the function did not include the callee
// itself. If so, we'd be recursively inlining the same function,
// which would provide the same callsites, which would cause us to
// infinitely inline.
- int InlineHistoryID = CallSites[CSi].second;
+ InlineHistoryID = CallSites[CSi].second;
if (InlineHistoryID != -1 &&
InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory))
continue;
+ }
+ // FIXME for new PM: because of the old PM we currently generate ORE and
+ // in turn BFI on demand. With the new PM, the ORE dependency should
+ // just become a regular analysis dependency.
+ OptimizationRemarkEmitter ORE(Caller);
+
+ // If the policy determines that we should not inline this call site,
+ // skip it and move on.
+ if (!shouldInline(CS, GetInlineCost, ORE))
+ continue;
+
+ // If this call site is dead and it is to a readonly function, we should
+ // just delete the call instead of trying to inline it, regardless of
+ // size. This happens because IPSCCP propagates the result out of the
+ // call and then we're left with the dead call.
+ if (IsTriviallyDead) {
+ DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
+ // Update the call graph by deleting the edge from Callee to Caller.
+ CG[Caller]->removeCallEdgeFor(CS);
+ Instr->eraseFromParent();
+ ++NumCallsDeleted;
+ } else {
// Get DebugLoc to report. CS will be invalid after Inliner.
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ DebugLoc DLoc = Instr->getDebugLoc();
BasicBlock *Block = CS.getParent();
- // FIXME for new PM: because of the old PM we currently generate ORE and
- // in turn BFI on demand. With the new PM, the ORE dependency should
- // just become a regular analysis dependency.
- OptimizationRemarkEmitter ORE(Caller);
-
- // If the policy determines that we should inline this function,
- // try to do so.
- using namespace ore;
- if (!shouldInline(CS, GetInlineCost, ORE)) {
- ORE.emit(
- OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", Caller));
- continue;
- }
// Attempt to inline the function.
+ using namespace ore;
if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
InlineHistoryID, InsertLifetime, AARGetter,
ImportedFunctionsStats)) {
@@ -638,22 +643,12 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
ACT = &getAnalysis<AssumptionCacheTracker>();
PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- // We compute dedicated AA results for each function in the SCC as needed. We
- // use a lambda referencing external objects so that they live long enough to
- // be queried, but we re-use them each time.
- Optional<BasicAAResult> BAR;
- Optional<AAResults> AAR;
- auto AARGetter = [&](Function &F) -> AAResults & {
- BAR.emplace(createLegacyPMBasicAAResult(*this, F));
- AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
- return *AAR;
- };
auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime,
[this](CallSite CS) { return getInlineCost(CS); },
- AARGetter, ImportedFunctionsStats);
+ LegacyAARGetter(*this), ImportedFunctionsStats);
}
/// Remove now-dead linkonce functions at the end of
@@ -750,9 +745,6 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
CGSCCAnalysisManager &AM, LazyCallGraph &CG,
CGSCCUpdateResult &UR) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
- .getManager();
const ModuleAnalysisManager &MAM =
AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager();
bool Changed = false;
@@ -761,35 +753,52 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
Module &M = *InitialC.begin()->getFunction().getParent();
ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
-
- // Setup the data structure used to plumb customization into the
- // `InlineFunction` routine.
- InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache);
+ // We use a single common worklist for calls across the entire SCC. We
+ // process these in-order and append new calls introduced during inlining to
+ // the end.
+ //
+ // Note that this particular order of processing is actually critical to
+ // avoid very bad behaviors. Consider *highly connected* call graphs where
+ // each function contains a small amount of code and a couple of calls to
+ // other functions. Because the LLVM inliner is fundamentally a bottom-up
+ // inliner, it can handle gracefully the fact that these all appear to be
+ // reasonable inlining candidates as it will flatten things until they become
+ // too big to inline, and then move on and flatten another batch.
+ //
+ // However, when processing call edges *within* an SCC we cannot rely on this
+ // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+ // functions we can end up incrementally inlining N calls into each of
+ // N functions because each incremental inlining decision looks good and we
+ // don't have a topological ordering to prevent explosions.
+ //
+ // To compensate for this, we don't process transitive edges made immediate
+ // by inlining until we've done one pass of inlining across the entire SCC.
+ // Large, highly connected SCCs still lead to some amount of code bloat in
+ // this model, but it is uniformly spread across all the functions in the SCC
+ // and eventually they all become too large to inline, rather than
+ // incrementally making a single function grow in a superlinear fashion.
+ SmallVector<std::pair<CallSite, int>, 16> Calls;
- auto GetInlineCost = [&](CallSite CS) {
- Function &Callee = *CS.getCalledFunction();
- auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
- return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI);
- };
+ // Populate the initial list of calls in this SCC.
+ for (auto &N : InitialC) {
+ // We want to generally process call sites top-down in order for
+ // simplifications stemming from replacing the call with the returned value
+ // after inlining to be visible to subsequent inlining decisions.
+ // FIXME: Using the instruction sequence is a really bad way to do this.
+ // Instead we should do an actual RPO walk of the function body.
+ for (Instruction &I : instructions(N.getFunction()))
+ if (auto CS = CallSite(&I))
+ if (Function *Callee = CS.getCalledFunction())
+ if (!Callee->isDeclaration())
+ Calls.push_back({CS, -1});
+ }
+ if (Calls.empty())
+ return PreservedAnalyses::all();
- // We use a worklist of nodes to process so that we can handle if the SCC
- // structure changes and some nodes are no longer part of the current SCC. We
- // also need to use an updatable pointer for the SCC as a consequence.
- SmallVector<LazyCallGraph::Node *, 16> Nodes;
- for (auto &N : InitialC)
- Nodes.push_back(&N);
+ // Capture updatable variables for the current SCC and RefSCC.
auto *C = &InitialC;
auto *RC = &C->getOuterRefSCC();
- // We also use a secondary worklist of call sites within a particular node to
- // allow quickly continuing to inline through newly inlined call sites where
- // possible.
- SmallVector<std::pair<CallSite, int>, 16> Calls;
-
// When inlining a callee produces new call sites, we want to keep track of
// the fact that they were inlined from the callee. This allows us to avoid
// infinite inlining in some obscure cases. To represent this, we use an
@@ -805,34 +814,58 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// defer deleting these to make it easier to handle the call graph updates.
SmallVector<Function *, 4> DeadFunctions;
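
A standalone model of the traversal that the comment block above and the loop below implement: one indexed, never-popped worklist for the whole SCC, with consecutive calls from the same caller handled as a batch.

#include <cstddef>
#include <vector>

struct Call { int Caller; int Callee; };

void processSCC(std::vector<Call> &Calls) {
  for (std::size_t i = 0; i < Calls.size(); ++i) { // size re-read: may grow
    int Caller = Calls[i].Caller;
    for (; i < Calls.size() && Calls[i].Caller == Caller; ++i) {
      // Decide whether to inline Calls[i]; inlining may push_back new calls,
      // which land at the end and are reached only after everything that was
      // already queued.
    }
    --i; // re-align with the outer increment before the next caller's batch
  }
}
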
- do {
- auto &N = *Nodes.pop_back_val();
+ // Loop forward over all of the calls. Note that we cannot cache the size as
+ // inlining can introduce new calls that need to be processed.
+ for (int i = 0; i < (int)Calls.size(); ++i) {
+ // We expect the calls to typically be batched with sequences of calls that
+ // have the same caller, so we first set up some shared infrastructure for
+ // this caller. We also do any pruning we can at this layer on the caller
+ // alone.
+ Function &F = *Calls[i].first.getCaller();
+ LazyCallGraph::Node &N = *CG.lookup(F);
if (CG.lookupSCC(N) != C)
continue;
- Function &F = N.getFunction();
if (F.hasFnAttribute(Attribute::OptimizeNone))
continue;
+ DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
+
+ // Get a FunctionAnalysisManager via a proxy for this particular node. We
+ // do this each time we visit a node as the SCC may have changed and as
+ // we're going to mutate this particular function we want to make sure the
+ // proxy is in place to forward any invalidation events. We can, however,
+ // use the manager we get here to look up results for functions other than
+ // this node, because those functions aren't going to be mutated by this
+ // pass.
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG)
+ .getManager();
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ auto GetInlineCost = [&](CallSite CS) {
+ Function &Callee = *CS.getCalledFunction();
+ auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
+ return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI},
+ PSI);
+ };
+
// Get the remarks emission analysis for the caller.
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- // We want to generally process call sites top-down in order for
- // simplifications stemming from replacing the call with the returned value
- // after inlining to be visible to subsequent inlining decisions. So we
- // walk the function backwards and then process the back of the vector.
- // FIXME: Using reverse is a really bad way to do this. Instead we should
- // do an actual PO walk of the function body.
- for (Instruction &I : reverse(instructions(F)))
- if (auto CS = CallSite(&I))
- if (Function *Callee = CS.getCalledFunction())
- if (!Callee->isDeclaration())
- Calls.push_back({CS, -1});
-
+ // Now process as many calls as we have within this caller in the sequence.
+ // We bail out as soon as the caller has to change so we can update the
+ // call graph and prepare the context of that new caller.
bool DidInline = false;
- while (!Calls.empty()) {
+ for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) {
int InlineHistoryID;
CallSite CS;
- std::tie(CS, InlineHistoryID) = Calls.pop_back_val();
+ std::tie(CS, InlineHistoryID) = Calls[i];
Function &Callee = *CS.getCalledFunction();
if (InlineHistoryID != -1 &&
@@ -843,6 +876,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
if (!shouldInline(CS, GetInlineCost, ORE))
continue;
+ // Setup the data structure used to plumb customization into the
+ // `InlineFunction` routine.
+ InlineFunctionInfo IFI(
+ /*cg=*/nullptr, &GetAssumptionCache, PSI,
+ &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
+ &FAM.getResult<BlockFrequencyAnalysis>(Callee));
+
if (!InlineFunction(CS, IFI))
continue;
DidInline = true;
@@ -869,7 +909,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// To check this we also need to nuke any dead constant uses (perhaps
// made dead by this operation on other functions).
Callee.removeDeadConstantUsers();
- if (Callee.use_empty()) {
+ if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
+ Calls.erase(
+ std::remove_if(Calls.begin() + i + 1, Calls.end(),
+ [&Callee](const std::pair<CallSite, int> &Call) {
+ return Call.first.getCaller() == &Callee;
+ }),
+ Calls.end());
// Clear the body and queue the function itself for deletion when we
// finish inlining and call graph updates.
// Note that after this point, it is an error to do anything other
@@ -882,6 +928,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
}
}
+ // Back the call index up by one to put us in a good position to go around
+ // the outer loop.
+ --i;
+
if (!DidInline)
continue;
Changed = true;
@@ -896,8 +946,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// below.
for (Function *InlinedCallee : InlinedCallees) {
LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee);
- for (LazyCallGraph::Edge &E : CalleeN)
- RC->insertTrivialRefEdge(N, *E.getNode());
+ for (LazyCallGraph::Edge &E : *CalleeN)
+ RC->insertTrivialRefEdge(N, E.getNode());
}
InlinedCallees.clear();
@@ -908,8 +958,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// re-use the exact same logic for updating the call graph to reflect the
// change..
C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
+ DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
RC = &C->getOuterRefSCC();
- } while (!Nodes.empty());
+ }
// Now that we've finished inlining all of the calls across this SCC, delete
// all of the trivially dead functions, updating the call graph and the CGSCC
@@ -920,8 +971,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// sets.
for (Function *DeadF : DeadFunctions) {
// Get the necessary information out of the call graph and nuke the
- // function there.
+ // function there. Also, clear out any cached analyses.
auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG)
+ .getManager();
+ FAM.clear(*DeadF);
+ AM.clear(DeadC);
auto &DeadRC = DeadC.getOuterRefSCC();
CG.removeDeadFunction(*DeadF);
@@ -933,5 +989,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// And delete the actual function from the module.
M.getFunctionList().erase(DeadF);
}
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // Even if we change the IR, we update the core CGSCC data structures and so
+ // can preserve the proxy to the function analysis manager.
+ PreservedAnalyses PA;
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ return PA;
}
diff --git a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
index f898c3b..c74b0a3 100644
--- a/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -14,7 +14,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/IR/Dominators.h"
@@ -22,6 +21,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
diff --git a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index deb7e81..693df5e 100644
--- a/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -42,8 +43,6 @@
using namespace llvm;
using namespace lowertypetests;
-using SummaryAction = LowerTypeTestsSummaryAction;
-
#define DEBUG_TYPE "lowertypetests"
STATISTIC(ByteArraySizeBits, "Byte array size in bits");
@@ -57,13 +56,13 @@ static cl::opt<bool> AvoidReuse(
cl::desc("Try to avoid reuse of byte array addresses using aliases"),
cl::Hidden, cl::init(true));
-static cl::opt<SummaryAction> ClSummaryAction(
+static cl::opt<PassSummaryAction> ClSummaryAction(
"lowertypetests-summary-action",
cl::desc("What to do with the summary when running this pass"),
- cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"),
- clEnumValN(SummaryAction::Import, "import",
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
"Import typeid resolutions from summary and globals"),
- clEnumValN(SummaryAction::Export, "export",
+ clEnumValN(PassSummaryAction::Export, "export",
"Export typeid resolutions to summary and globals")),
cl::Hidden);
@@ -208,17 +207,26 @@ struct ByteArrayInfo {
class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
GlobalObject *GO;
size_t NTypes;
+ // For functions: true if this is a definition (either in the merged module or
+ // in one of the ThinLTO modules).
+ bool IsDefinition;
+ // For functions: true if this function is either defined or used in a ThinLTO
+ // module and its jump table entry needs to be exported to ThinLTO backends.
+ bool IsExported;
friend TrailingObjects;
size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
public:
static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+ bool IsDefinition, bool IsExported,
ArrayRef<MDNode *> Types) {
auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
GTM->GO = GO;
GTM->NTypes = Types.size();
+ GTM->IsDefinition = IsDefinition;
+ GTM->IsExported = IsExported;
std::uninitialized_copy(Types.begin(), Types.end(),
GTM->getTrailingObjects<MDNode *>());
return GTM;
@@ -226,6 +234,12 @@ public:
GlobalObject *getGlobal() const {
return GO;
}
+ bool isDefinition() const {
+ return IsDefinition;
+ }
+ bool isExported() const {
+ return IsExported;
+ }
ArrayRef<MDNode *> types() const {
return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
}
@@ -234,10 +248,9 @@ public:
class LowerTypeTestsModule {
Module &M;
- SummaryAction Action;
- ModuleSummaryIndex *Summary;
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
- bool LinkerSubsectionsViaSymbols;
Triple::ArchType Arch;
Triple::OSType OS;
Triple::ObjectFormatType ObjectFormat;
@@ -253,15 +266,21 @@ class LowerTypeTestsModule {
// Indirect function call index assignment counter for WebAssembly
uint64_t IndirectIndex = 1;
- // Mapping from type identifiers to the call sites that test them.
- DenseMap<Metadata *, std::vector<CallInst *>> TypeTestCallSites;
+ // Mapping from type identifiers to the call sites that test them, as well as
+ // whether the type identifier needs to be exported to ThinLTO backends as
+ // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId).
+ struct TypeIdUserInfo {
+ std::vector<CallInst *> CallSites;
+ bool IsExported = false;
+ };
+ DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers;
/// This structure describes how to lower type tests for a particular type
/// identifier. It is either built directly from the global analysis (during
/// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
/// identifier summaries and external symbol references (in ThinLTO backends).
struct TypeIdLowering {
- TypeTestResolution::Kind TheKind;
+ TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
/// All except Unsat: the start address within the combined global.
Constant *OffsetedGlobal;
@@ -274,9 +293,6 @@ class LowerTypeTestsModule {
/// covering members of this type identifier as a multiple of 2^AlignLog2.
Constant *SizeM1;
- /// ByteArray, Inline, AllOnes: range of SizeM1 expressed as a bit width.
- unsigned SizeM1BitWidth;
-
/// ByteArray: the byte array to test the address against.
Constant *TheByteArray;
@@ -291,6 +307,11 @@ class LowerTypeTestsModule {
Function *WeakInitializerFn = nullptr;
+ void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
+ TypeIdLowering importTypeId(StringRef TypeId);
+ void importTypeTest(CallInst *CI);
+ void importFunction(Function *F, bool isDefinition);
+
BitSetInfo
buildBitSet(Metadata *TypeId,
const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
@@ -327,8 +348,8 @@ class LowerTypeTestsModule {
void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
public:
- LowerTypeTestsModule(Module &M, SummaryAction Action,
- ModuleSummaryIndex *Summary);
+ LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary);
bool lower();
// Lower the module using the action and summary passed as command line
@@ -341,15 +362,17 @@ struct LowerTypeTests : public ModulePass {
bool UseCommandLine = false;
- SummaryAction Action;
- ModuleSummaryIndex *Summary;
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
- LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary)
- : ModulePass(ID), Action(Action), Summary(Summary) {
+ LowerTypeTests(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
@@ -358,7 +381,7 @@ struct LowerTypeTests : public ModulePass {
return false;
if (UseCommandLine)
return LowerTypeTestsModule::runForTesting(M);
- return LowerTypeTestsModule(M, Action, Summary).lower();
+ return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
}
};
@@ -368,9 +391,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
false)
char LowerTypeTests::ID = 0;
-ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action,
- ModuleSummaryIndex *Summary) {
- return new LowerTypeTests(Action, Summary);
+ModulePass *
+llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new LowerTypeTests(ExportSummary, ImportSummary);
}
/// Build a bit set for TypeId using the object layouts in
@@ -467,13 +491,9 @@ void LowerTypeTestsModule::allocateByteArrays() {
// Create an alias instead of RAUW'ing the gep directly. On x86 this ensures
// that the pc-relative displacement is folded into the lea instead of the
// test instruction getting another displacement.
- if (LinkerSubsectionsViaSymbols) {
- BAI->ByteArray->replaceAllUsesWith(GEP);
- } else {
- GlobalAlias *Alias = GlobalAlias::create(
- Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
- BAI->ByteArray->replaceAllUsesWith(Alias);
- }
+ GlobalAlias *Alias = GlobalAlias::create(
+ Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
+ BAI->ByteArray->replaceAllUsesWith(Alias);
BAI->ByteArray->eraseFromParent();
}
@@ -494,10 +514,11 @@ Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
} else {
Constant *ByteArray = TIL.TheByteArray;
- if (!LinkerSubsectionsViaSymbols && AvoidReuse) {
+ if (AvoidReuse && !ImportSummary) {
// Each use of the byte array uses a different alias. This makes the
// backend less likely to reuse previously computed byte array addresses,
// improving the security of the CFI mechanism based on this pass.
+ // This won't work when importing because TheByteArray is external.
ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
"bits_use", ByteArray, &M);
}
@@ -593,15 +614,31 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
IntPtrTy));
Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
- Constant *BitSizeConst = ConstantExpr::getZExt(TIL.SizeM1, IntPtrTy);
- Value *OffsetInRange = B.CreateICmpULE(BitOffset, BitSizeConst);
+ Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1);
// If the bit set is all ones, testing against it is unnecessary.
if (TIL.TheKind == TypeTestResolution::AllOnes)
return OffsetInRange;
- TerminatorInst *Term = SplitBlockAndInsertIfThen(OffsetInRange, CI, false);
- IRBuilder<> ThenB(Term);
+ // See if the intrinsic is used in the following common pattern:
+ // br(llvm.type.test(...), thenbb, elsebb)
+ // where nothing happens between the type test and the br.
+ // If so, create slightly simpler IR.
+ if (CI->hasOneUse())
+ if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin()))
+ if (CI->getNextNode() == Br) {
+ BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator());
+ BasicBlock *Else = Br->getSuccessor(1);
+ BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange);
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ Br->getMetadata(LLVMContext::MD_prof));
+ ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
+
+ IRBuilder<> ThenB(CI);
+ return createBitSetTest(ThenB, TIL, BitOffset);
+ }
+
+ IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false));
// Now that we know that the offset is in range and aligned, load the
// appropriate bit from the bitset.
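
The guard for this fast path, in isolation; a hedged sketch using the same API calls as the patch:

#include "llvm/IR/Instructions.h"

static bool feedsBranchDirectly(const llvm::CallInst *CI) {
  if (!CI->hasOneUse())
    return false;
  auto *Br = llvm::dyn_cast<llvm::BranchInst>(*CI->user_begin());
  return Br && CI->getNextNode() == Br; // nothing between test and branch
}
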
@@ -672,21 +709,174 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
ConstantInt::get(Int32Ty, I * 2)};
Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
- if (LinkerSubsectionsViaSymbols) {
- GV->replaceAllUsesWith(CombinedGlobalElemPtr);
- } else {
- assert(GV->getType()->getAddressSpace() == 0);
- GlobalAlias *GAlias = GlobalAlias::create(NewTy->getElementType(I * 2), 0,
- GV->getLinkage(), "",
- CombinedGlobalElemPtr, &M);
- GAlias->setVisibility(GV->getVisibility());
- GAlias->takeName(GV);
- GV->replaceAllUsesWith(GAlias);
- }
+ assert(GV->getType()->getAddressSpace() == 0);
+ GlobalAlias *GAlias =
+ GlobalAlias::create(NewTy->getElementType(I * 2), 0, GV->getLinkage(),
+ "", CombinedGlobalElemPtr, &M);
+ GAlias->setVisibility(GV->getVisibility());
+ GAlias->takeName(GV);
+ GV->replaceAllUsesWith(GAlias);
GV->eraseFromParent();
}
}
+/// Export the given type identifier so that ThinLTO backends may import it.
+/// Type identifiers are exported by adding coarse-grained information about how
+/// to test the type identifier to the summary, and creating symbols in the
+/// object file (aliases and absolute symbols) containing fine-grained
+/// information about the type identifier.
+void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
+ const TypeIdLowering &TIL) {
+ TypeTestResolution &TTRes =
+ ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
+ TTRes.TheKind = TIL.TheKind;
+
+ auto ExportGlobal = [&](StringRef Name, Constant *C) {
+ GlobalAlias *GA =
+ GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ "__typeid_" + TypeId + "_" + Name, C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ ExportGlobal("global_addr", TIL.OffsetedGlobal);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ ExportGlobal("align", ConstantExpr::getIntToPtr(TIL.AlignLog2, Int8PtrTy));
+ ExportGlobal("size_m1", ConstantExpr::getIntToPtr(TIL.SizeM1, Int8PtrTy));
+
+ uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6;
+ else
+ TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32;
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ ExportGlobal("byte_array", TIL.TheByteArray);
+ ExportGlobal("bit_mask", TIL.BitMask);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ ExportGlobal("inline_bits",
+ ConstantExpr::getIntToPtr(TIL.InlineBits, Int8PtrTy));
+}
+
+LowerTypeTestsModule::TypeIdLowering
+LowerTypeTestsModule::importTypeId(StringRef TypeId) {
+ const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
+ if (!TidSummary)
+ return {}; // Unsat: no globals match this type id.
+ const TypeTestResolution &TTRes = TidSummary->TTRes;
+
+ TypeIdLowering TIL;
+ TIL.TheKind = TTRes.TheKind;
+
+ auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) {
+ Constant *C =
+ M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+ return C;
+
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else if (AbsWidth)
+ SetAbsRange(0, 1ull << AbsWidth);
+ return C;
+ };
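+ // For example (a sketch): ImportGlobal("align", 8) on a 64-bit target
+ // would tag a newly created declaration with
+ //   !absolute_symbol !{i64 0, i64 256}
+ // i.e. the half-open range [0, 1 << 8).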
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ TIL.OffsetedGlobal = ImportGlobal("global_addr", 0);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ TIL.AlignLog2 = ConstantExpr::getPtrToInt(ImportGlobal("align", 8), Int8Ty);
+ TIL.SizeM1 = ConstantExpr::getPtrToInt(
+ ImportGlobal("size_m1", TTRes.SizeM1BitWidth), IntPtrTy);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ TIL.TheByteArray = ImportGlobal("byte_array", 0);
+ TIL.BitMask = ImportGlobal("bit_mask", 8);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TIL.InlineBits = ConstantExpr::getPtrToInt(
+ ImportGlobal("inline_bits", 1 << TTRes.SizeM1BitWidth),
+ TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
+
+ return TIL;
+}
+
+void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+
+ auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
+ if (!TypeIdStr)
+ report_fatal_error(
+ "Second argument of llvm.type.test must be a metadata string");
+
+ TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
+ Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+}
+
+// ThinLTO backend: the function F has a jump table entry; update this module
+// accordingly. isDefinition describes the type of the jump table entry.
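+// For a function 'f' (an illustrative name): when isDefinition is true, the
+// local body is renamed to 'f.cfi' and a fresh declaration named 'f',
+// resolved via the jump table, takes its place; when false, uses are pointed
+// at the hidden 'f.cfi_jt' alias exported by the full LTO phase (see
+// buildBitSetsFromFunctionsNative).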
+void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ // Declaration of a local function - nothing to do.
+ if (F->isDeclarationForLinker() && isDefinition)
+ return;
+
+ GlobalValue::VisibilityTypes Visibility = F->getVisibility();
+ std::string Name = F->getName();
+ Function *FDecl;
+
+ if (F->isDeclarationForLinker() && !isDefinition) {
+ // Declaration of an external function.
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ Name + ".cfi_jt", &M);
+ FDecl->setVisibility(GlobalValue::HiddenVisibility);
+ } else if (isDefinition) {
+ F->setName(Name + ".cfi");
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ F->setVisibility(GlobalValue::HiddenVisibility);
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ Name, &M);
+ FDecl->setVisibility(Visibility);
+ } else {
+ // Function definition without type metadata, where some other translation
+ // unit contained a declaration with type metadata. This normally happens
+ // during mixed CFI + non-CFI compilation. We do nothing with the function
+ // so that it is treated the same way as a function defined outside of the
+ // LTO unit.
+ return;
+ }
+
+ if (F->isWeakForLinker())
+ replaceWeakDeclarationWithJumpTablePtr(F, FDecl);
+ else
+ F->replaceAllUsesWith(FDecl);
+}
+
void LowerTypeTestsModule::lowerTypeTestCalls(
ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
@@ -708,16 +898,12 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
+ TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1);
if (BSI.isAllOnes()) {
TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
: TypeTestResolution::AllOnes;
- TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32;
- TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty,
- BSI.BitSize - 1);
} else if (BSI.BitSize <= 64) {
TIL.TheKind = TypeTestResolution::Inline;
- TIL.SizeM1BitWidth = (BSI.BitSize <= 32) ? 5 : 6;
- TIL.SizeM1 = ConstantInt::get(Int8Ty, BSI.BitSize - 1);
uint64_t InlineBits = 0;
for (auto Bit : BSI.Bits)
InlineBits |= uint64_t(1) << Bit;
@@ -728,17 +914,19 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
(BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
} else {
TIL.TheKind = TypeTestResolution::ByteArray;
- TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32;
- TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty,
- BSI.BitSize - 1);
++NumByteArraysCreated;
ByteArrayInfo *BAI = createByteArray(BSI);
TIL.TheByteArray = BAI->ByteArray;
TIL.BitMask = BAI->MaskGlobal;
}
+ TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
+
+ if (TIUI.IsExported)
+ exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
+
// Lower each call to llvm.type.test for this type identifier.
- for (CallInst *CI : TypeTestCallSites[TypeId]) {
+ for (CallInst *CI : TIUI.CallSites) {
++NumTypeTestCallsLowered;
Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
CI->replaceAllUsesWith(Lowered);
@@ -757,9 +945,9 @@ void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
report_fatal_error(
"A member of a type identifier may not have an explicit section");
- if (isa<GlobalVariable>(GO) && GO->isDeclarationForLinker())
- report_fatal_error(
- "A global var member of a type identifier must be a definition");
+ // FIXME: We previously checked that global var member of a type identifier
+ // must be a definition, but the IR linker may leave type metadata on
+ // declarations. We should restore this check after fixing PR31759.
auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
if (!OffsetConstMD)
@@ -1012,7 +1200,6 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
// arithmetic that we normally use for globals.
// FIXME: find a better way to represent the jumptable in the IR.
-
assert(!Functions.empty());
// Build a simple layout based on the regular layout of jump tables.
@@ -1036,6 +1223,7 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
// references to the original functions with references to the aliases.
for (unsigned I = 0; I != Functions.size(); ++I) {
Function *F = cast<Function>(Functions[I]->getGlobal());
+ bool IsDefinition = Functions[I]->isDefinition();
Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
ConstantExpr::getInBoundsGetElementPtr(
@@ -1043,8 +1231,18 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
ConstantInt::get(IntPtrTy, I)}),
F->getType());
- if (LinkerSubsectionsViaSymbols || F->isDeclarationForLinker()) {
-
+ if (Functions[I]->isExported()) {
+ if (IsDefinition) {
+ ExportSummary->cfiFunctionDefs().insert(F->getName());
+ } else {
+ GlobalAlias *JtAlias = GlobalAlias::create(
+ F->getValueType(), 0, GlobalValue::ExternalLinkage,
+ F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+ JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+ ExportSummary->cfiFunctionDecls().insert(F->getName());
+ }
+ }
+ if (!IsDefinition) {
if (F->isWeakForLinker())
replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr);
else
@@ -1052,9 +1250,8 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
} else {
assert(F->getType()->getAddressSpace() == 0);
- GlobalAlias *FAlias = GlobalAlias::create(F->getValueType(), 0,
- F->getLinkage(), "",
- CombinedGlobalElemPtr, &M);
+ GlobalAlias *FAlias = GlobalAlias::create(
+ F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M);
FAlias->setVisibility(F->getVisibility());
FAlias->takeName(F);
if (FAlias->hasName())
@@ -1173,15 +1370,12 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
}
/// Lower all type tests in this module.
-LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action,
- ModuleSummaryIndex *Summary)
- : M(M), Action(Action), Summary(Summary) {
- // FIXME: Use these fields.
- (void)this->Action;
- (void)this->Summary;
-
+LowerTypeTestsModule::LowerTypeTestsModule(
+ Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary) {
+ assert(!(ExportSummary && ImportSummary));
Triple TargetTriple(M.getTargetTriple());
- LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
Arch = TargetTriple.getArch();
OS = TargetTriple.getOS();
ObjectFormat = TargetTriple.getObjectFormat();
@@ -1203,7 +1397,11 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
ExitOnErr(errorCodeToError(In.error()));
}
- bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower();
+ bool Changed =
+ LowerTypeTestsModule(
+ M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ .lower();
if (!ClWriteSummary.empty()) {
ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
@@ -1222,9 +1420,40 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
bool LowerTypeTestsModule::lower() {
Function *TypeTestFunc =
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- if (!TypeTestFunc || TypeTestFunc->use_empty())
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary &&
+ !ImportSummary)
return false;
+ if (ImportSummary) {
+ if (TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ importTypeTest(CI);
+ }
+ }
+
+ SmallVector<Function *, 8> Defs;
+ SmallVector<Function *, 8> Decls;
+ for (auto &F : M) {
+ // CFI functions are either external, or promoted. A local function may
+ // have the same name, but it's not the one we are looking for.
+ if (F.hasLocalLinkage())
+ continue;
+ if (ImportSummary->cfiFunctionDefs().count(F.getName()))
+ Defs.push_back(&F);
+ else if (ImportSummary->cfiFunctionDecls().count(F.getName()))
+ Decls.push_back(&F);
+ }
+
+ for (auto F : Defs)
+ importFunction(F, /*isDefinition*/ true);
+ for (auto F : Decls)
+ importFunction(F, /*isDefinition*/ false);
+
+ return true;
+ }
+
// Equivalence class set containing type identifiers and the globals that
// reference them. This is used to partition the set of type identifiers in
// the module into disjoint sets.
@@ -1247,13 +1476,76 @@ bool LowerTypeTestsModule::lower() {
llvm::DenseMap<Metadata *, TIInfo> TypeIdInfo;
unsigned I = 0;
SmallVector<MDNode *, 2> Types;
+
+ struct ExportedFunctionInfo {
+ CfiFunctionLinkage Linkage;
+ MDNode *FuncMD; // {name, linkage, type[, type...]}
+ };
+ DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
+ if (ExportSummary) {
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto FuncMD : CfiFunctionsMD->operands()) {
+ assert(FuncMD->getNumOperands() >= 2);
+ StringRef FunctionName =
+ cast<MDString>(FuncMD->getOperand(0))->getString();
+ if (!ExportSummary->isGUIDLive(GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(FunctionName))))
+ continue;
+ CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
+ cast<ConstantAsMetadata>(FuncMD->getOperand(1))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
+ if (!P.second && P.first->second.Linkage != CFL_Definition)
+ P.first->second = {Linkage, FuncMD};
+ }
+
+ for (const auto &P : ExportedFunctions) {
+ StringRef FunctionName = P.first;
+ CfiFunctionLinkage Linkage = P.second.Linkage;
+ MDNode *FuncMD = P.second.FuncMD;
+ Function *F = M.getFunction(FunctionName);
+ if (!F)
+ F = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalVariable::ExternalLinkage, FunctionName, &M);
+
+ if (Linkage == CFL_Definition)
+ F->eraseMetadata(LLVMContext::MD_type);
+
+ if (F->isDeclaration()) {
+ if (Linkage == CFL_WeakDeclaration)
+ F->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+ SmallVector<MDNode *, 2> Types;
+ for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
+ F->addMetadata(LLVMContext::MD_type,
+ *cast<MDNode>(FuncMD->getOperand(I).get()));
+ }
+ }
+ }
+ }
+
for (GlobalObject &GO : M.global_objects()) {
+ if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
+ continue;
+
Types.clear();
GO.getMetadata(LLVMContext::MD_type, Types);
if (Types.empty())
continue;
- auto *GTM = GlobalTypeMember::create(Alloc, &GO, Types);
+ bool IsDefinition = !GO.isDeclarationForLinker();
+ bool IsExported = false;
+ if (isa<Function>(GO) && ExportedFunctions.count(GO.getName())) {
+ IsDefinition |= ExportedFunctions[GO.getName()].Linkage == CFL_Definition;
+ IsExported = true;
+ }
+
+ auto *GTM =
+ GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types);
for (MDNode *Type : Types) {
verifyTypeMDNode(&GO, Type);
auto &Info = TypeIdInfo[cast<MDNode>(Type)->getOperand(1)];
@@ -1262,33 +1554,56 @@ bool LowerTypeTestsModule::lower() {
}
}
- for (const Use &U : TypeTestFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
+ auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & {
+ // Add the call site to the list of call sites for this type identifier. We
+ // also use TypeIdUsers to keep track of whether we have seen this type
+ // identifier before. If we have, we don't need to re-add the referenced
+ // globals to the equivalence class.
+ auto Ins = TypeIdUsers.insert({TypeId, {}});
+ if (Ins.second) {
+ // Add the type identifier to the equivalence class.
+ GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId);
+ GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+
+ // Add the referenced globals to the type identifier's equivalence class.
+ for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals)
+ CurSet = GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
+ }
+
+ return Ins.first->second;
+ };
- auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
- if (!BitSetMDVal)
- report_fatal_error("Second argument of llvm.type.test must be metadata");
- auto BitSet = BitSetMDVal->getMetadata();
+ if (TypeTestFunc) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
- // Add the call site to the list of call sites for this type identifier. We
- // also use TypeTestCallSites to keep track of whether we have seen this
- // type identifier before. If we have, we don't need to re-add the
- // referenced globals to the equivalence class.
- std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator, bool>
- Ins = TypeTestCallSites.insert(
- std::make_pair(BitSet, std::vector<CallInst *>()));
- Ins.first->second.push_back(CI);
- if (!Ins.second)
- continue;
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+ auto TypeId = TypeIdMDVal->getMetadata();
+ AddTypeIdUse(TypeId).CallSites.push_back(CI);
+ }
+ }
- // Add the type identifier to the equivalence class.
- GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet);
- GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdInfo) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
- // Add the referenced globals to the type identifier's equivalence class.
- for (GlobalTypeMember *GTM : TypeIdInfo[BitSet].RefGlobals)
- CurSet = GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS || !ExportSummary->isGlobalValueLive(FS))
+ continue;
+ for (GlobalValue::GUID G : FS->type_tests())
+ for (Metadata *MD : MetadataByGUID[G])
+ AddTypeIdUse(MD).IsExported = true;
+ }
+ }
}
if (GlobalClasses.empty())
@@ -1349,8 +1664,9 @@ bool LowerTypeTestsModule::lower() {
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
ModuleAnalysisManager &AM) {
- bool Changed =
- LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower();
+ bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr,
+ /*ImportSummary=*/nullptr)
+ .lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index e0bb0eb..0e478ba 100644
--- a/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -96,8 +96,10 @@
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
@@ -127,6 +129,26 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck(
"'0' disables this check. Works only with '-debug' key."),
cl::init(0), cl::Hidden);
+// Under option -mergefunc-preserve-debug-info we:
+// - Do not create a new function for a thunk.
+// - Retain the debug info for a thunk's parameters (and associated
+// instructions for the debug info) from the entry block.
+// Note: -debug will display the algorithm at work.
+// - Create debug-info for the call (to the shared implementation) made by
+// a thunk and its return value.
+// - Erase the rest of the function, retaining the (minimally sized) entry
+// block to create a thunk.
+// - Preserve a thunk's call site to point to the thunk even when both occur
+// within the same translation unit, to aid debuggability. Note that this
+// behaviour differs from the underlying -mergefunc implementation which
+// modifies the thunk's call site to point to the shared implementation
+// when both occur within the same translation unit.
+static cl::opt<bool>
+ MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden,
+ cl::init(false),
+ cl::desc("Preserve debug info in thunk when mergefunc "
+ "transformations are made."));
+
namespace {
class FunctionNode {
@@ -185,11 +207,13 @@ private:
/// A work queue of functions that may have been modified and should be
/// analyzed again.
- std::vector<WeakVH> Deferred;
+ std::vector<WeakTrackingVH> Deferred;
/// Checks the rules of the order relation introduced among the function set.
/// Returns true if the sanity check passes, and false if it fails.
- bool doSanityCheck(std::vector<WeakVH> &Worklist);
+#ifndef NDEBUG
+ bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
+#endif
/// Insert a ComparableFunction into the FnTree, or merge it away if it's
/// equal to one that's already present.
@@ -215,8 +239,21 @@ private:
/// Replace G with a thunk or an alias to F. Deletes G.
void writeThunkOrAlias(Function *F, Function *G);
- /// Replace G with a simple tail call to bitcast(F). Also replace direct uses
- /// of G with bitcast(F). Deletes G.
+ /// Fill PDIUnrelatedWL with instructions from the entry block that are
+ /// unrelated to parameter-related debug info.
+ void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
+ std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Erase the rest of the CFG (i.e. everything except the entry block).
+ void eraseTail(Function *G);
+
+ /// Erase from the entry block the instructions in PDIUnrelatedWL, as they
+ /// are unrelated to the parameter debug info.
+ void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Replace G with a simple tail call to bitcast(F). Also (unless
+ /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F) and
+ /// delete G.
void writeThunk(Function *F, Function *G);
/// Replace G with an alias to F. Deletes G.
@@ -248,7 +285,8 @@ ModulePass *llvm::createMergeFunctionsPass() {
return new MergeFunctions();
}
-bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
+#ifndef NDEBUG
+bool MergeFunctions::doSanityCheck(std::vector<WeakTrackingVH> &Worklist) {
if (const unsigned Max = NumFunctionsForSanityCheck) {
unsigned TripleNumber = 0;
bool Valid = true;
@@ -256,10 +294,12 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
unsigned i = 0;
- for (std::vector<WeakVH>::iterator I = Worklist.begin(), E = Worklist.end();
+ for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
+ E = Worklist.end();
I != E && i < Max; ++I, ++i) {
unsigned j = i;
- for (std::vector<WeakVH>::iterator J = I; J != E && j < Max; ++J, ++j) {
+ for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
+ ++J, ++j) {
Function *F1 = cast<Function>(*I);
Function *F2 = cast<Function>(*J);
int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare();
@@ -269,8 +309,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
if (Res1 != -Res2) {
dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
<< "\n";
- F1->dump();
- F2->dump();
+ dbgs() << *F1 << '\n' << *F2 << '\n';
Valid = false;
}
@@ -278,7 +317,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
continue;
unsigned k = j;
- for (std::vector<WeakVH>::iterator K = J; K != E && k < Max;
+ for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
++k, ++K, ++TripleNumber) {
if (K == J)
continue;
@@ -305,9 +344,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
<< TripleNumber << "\n";
dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
<< Res4 << "\n";
- F1->dump();
- F2->dump();
- F3->dump();
+ dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n';
Valid = false;
}
}
@@ -319,6 +356,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) {
}
return true;
}
+#endif
bool MergeFunctions::runOnModule(Module &M) {
if (skipModule(M))
@@ -349,12 +387,12 @@ bool MergeFunctions::runOnModule(Module &M) {
// consider merging it. Otherwise it is dropped and never considered again.
if ((I != S && std::prev(I)->first == I->first) ||
(std::next(I) != IE && std::next(I)->first == I->first) ) {
- Deferred.push_back(WeakVH(I->second));
+ Deferred.push_back(WeakTrackingVH(I->second));
}
}
do {
- std::vector<WeakVH> Worklist;
+ std::vector<WeakTrackingVH> Worklist;
Deferred.swap(Worklist);
DEBUG(doSanityCheck(Worklist));
@@ -363,7 +401,7 @@ bool MergeFunctions::runOnModule(Module &M) {
DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
// Insert functions and merge them.
- for (WeakVH &I : Worklist) {
+ for (WeakTrackingVH &I : Worklist) {
if (!I)
continue;
Function *F = cast<Function>(I);
@@ -400,19 +438,15 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
// Transferring other attributes may help other optimizations, but that
// should be done uniformly and not in this ad-hoc way.
auto &Context = New->getContext();
- auto NewFuncAttrs = New->getAttributes();
- auto CallSiteAttrs = CS.getAttributes();
-
- CallSiteAttrs = CallSiteAttrs.addAttributes(
- Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes());
-
- for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) {
- AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx);
- if (Attrs.getNumSlots())
- CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs);
- }
-
- CS.setAttributes(CallSiteAttrs);
+ auto NewPAL = New->getAttributes();
+ SmallVector<AttributeSet, 4> NewArgAttrs;
+ for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++)
+ NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx));
+ // Don't transfer attributes from the function to the callee. Function
+ // attributes typically aren't relevant to the calling convention or ABI.
+ CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(),
+ NewPAL.getRetAttributes(),
+ NewArgAttrs));
remove(CS.getInstruction()->getParent()->getParent());
U->set(BitcastNew);
@@ -461,51 +495,242 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
return Builder.CreateBitCast(V, DestTy);
}
-// Replace G with a simple tail call to bitcast(F). Also replace direct uses
-// of G with bitcast(F). Deletes G.
+// Erase from the entry block the instructions in PDIUnrelatedWL, as they
+// are unrelated to the parameter debug info.
+void MergeFunctions::eraseInstsUnrelatedToPDI(
+ std::vector<Instruction *> &PDIUnrelatedWL) {
+
+ DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in "
+ "entry block) unrelated to parameter debug info from entry "
+ "block: {\n");
+ while (!PDIUnrelatedWL.empty()) {
+ Instruction *I = PDIUnrelatedWL.back();
+ DEBUG(dbgs() << " Deleting Instruction: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ I->eraseFromParent();
+ PDIUnrelatedWL.pop_back();
+ }
+ DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
+ "debug info from entry block. \n");
+}
+
+// Reduce G to its entry block.
+void MergeFunctions::eraseTail(Function *G) {
+
+ std::vector<BasicBlock *> WorklistBB;
+ for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
+ BBI != BBE; ++BBI) {
+ BBI->dropAllReferences();
+ WorklistBB.push_back(&*BBI);
+ }
+ while (!WorklistBB.empty()) {
+ BasicBlock *BB = WorklistBB.back();
+ BB->eraseFromParent();
+ WorklistBB.pop_back();
+ }
+}
+
+// We are interested in the following instructions from the entry block as being
+// related to parameter debug info:
+// - @llvm.dbg.declare
+// - stores from the incoming parameters to locations on the stack-frame
+// - allocas that create these locations on the stack-frame
+// - @llvm.dbg.value
+// - the entry block's terminator
+// The rest are unrelated to debug info for the parameters; fill up
+// PDIUnrelatedWL with such instructions.
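+//
+// For example, for a parameter 'a', an entry block might contain (an
+// illustrative sketch; names and metadata ids are hypothetical):
+//   %a.addr = alloca i32
+//   store i32 %a, i32* %a.addr
+//   call void @llvm.dbg.declare(metadata i32* %a.addr, metadata !12,
+//                               metadata !13)
+// and all three instructions would be treated as PDI-related.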
+void MergeFunctions::filterInstsUnrelatedToPDI(
+ BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
+
+ std::set<Instruction *> PDIRelated;
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
+ BI != BIE; ++BI) {
+ if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
+ DEBUG(dbgs() << " Deciding: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DVI->getVariable();
+ if (DILocVar->isParameter()) {
+ DEBUG(dbgs() << " Include (parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
+ DEBUG(dbgs() << " Deciding: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DDI->getVariable();
+ if (DILocVar->isParameter()) {
+ DEBUG(dbgs() << " Parameter: ");
+ DEBUG(DILocVar->print(dbgs()));
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ if (AI) {
+ DEBUG(dbgs() << " Processing alloca users: ");
+ DEBUG(dbgs() << "\n");
+ for (User *U : AI->users()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (Value *Arg = SI->getValueOperand()) {
+ if (dyn_cast<Argument>(Arg)) {
+ DEBUG(dbgs() << " Include: ");
+ DEBUG(AI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(AI);
+ DEBUG(dbgs() << " Include (parameter): ");
+ DEBUG(SI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(SI);
+ DEBUG(dbgs() << " Include: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(SI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ DEBUG(dbgs() << " Defer: ");
+ DEBUG(U->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ DEBUG(dbgs() << " Delete (alloca NULL): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else {
+ DEBUG(dbgs() << " Delete (!parameter): ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ } else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) {
+ DEBUG(dbgs() << " Will Include Terminator: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ DEBUG(dbgs() << " Defer: ");
+ DEBUG(BI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ DEBUG(dbgs()
+ << " Report parameter debug info related/related instructions: {\n");
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
+ BI != BE; ++BI) {
+
+ Instruction *I = &*BI;
+ if (PDIRelated.find(I) == PDIRelated.end()) {
+ DEBUG(dbgs() << " !PDIRelated: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ PDIUnrelatedWL.push_back(I);
+ } else {
+ DEBUG(dbgs() << " PDIRelated: ");
+ DEBUG(I->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ }
+ }
+ DEBUG(dbgs() << " }\n");
+}
+
+// Replace G with a simple tail call to bitcast(F). Also (unless
+// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F) and
+// delete G. Under MergeFunctionsPDI, we use G itself for creating
+// the thunk as we preserve the debug info (and associated instructions)
+// from G's entry block pertaining to G's incoming arguments which are
+// passed on as corresponding arguments in the call that G makes to F.
+// For better debuggability, under MergeFunctionsPDI, we do not modify G's
+// call sites to point to F even when within the same translation unit.
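+//
+// The thunk left behind is a single-block function of roughly this shape
+// (an illustrative sketch):
+//   define i32 @g(i32 %x) {
+//     %r = tail call i32 @f(i32 %x)
+//     ret i32 %r
+//   }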
void MergeFunctions::writeThunk(Function *F, Function *G) {
- if (!G->isInterposable()) {
- // Redirect direct callers of G to F.
+ if (!G->isInterposable() && !MergeFunctionsPDI) {
+ // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
+ // above).
replaceDirectCallers(G, F);
}
// If G was internal then we may have replaced all uses of G with F. If so,
- // stop here and delete G. There's no need for a thunk.
- if (G->hasLocalLinkage() && G->use_empty()) {
+ // stop here and delete G. There's no need for a thunk. (See note on
+ // MergeFunctionsPDI above).
+ if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) {
G->eraseFromParent();
return;
}
- Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
- G->getParent());
- BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG);
- IRBuilder<> Builder(BB);
+ BasicBlock *GEntryBlock = nullptr;
+ std::vector<Instruction *> PDIUnrelatedWL;
+ BasicBlock *BB = nullptr;
+ Function *NewG = nullptr;
+ if (MergeFunctionsPDI) {
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
+ "function as thunk; retain original: "
+ << G->getName() << "()\n");
+ GEntryBlock = &G->getEntryBlock();
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
+ "debug info for "
+ << G->getName() << "() {\n");
+ filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
+ GEntryBlock->getTerminator()->eraseFromParent();
+ BB = GEntryBlock;
+ } else {
+ NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "",
+ G->getParent());
+ BB = BasicBlock::Create(F->getContext(), "", NewG);
+ }
+ IRBuilder<> Builder(BB);
+ Function *H = MergeFunctionsPDI ? G : NewG;
SmallVector<Value *, 16> Args;
unsigned i = 0;
FunctionType *FFTy = F->getFunctionType();
- for (Argument & AI : NewG->args()) {
+ for (Argument & AI : H->args()) {
Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
++i;
}
CallInst *CI = Builder.CreateCall(F, Args);
+ ReturnInst *RI = nullptr;
CI->setTailCall();
CI->setCallingConv(F->getCallingConv());
CI->setAttributes(F->getAttributes());
- if (NewG->getReturnType()->isVoidTy()) {
- Builder.CreateRetVoid();
+ if (H->getReturnType()->isVoidTy()) {
+ RI = Builder.CreateRetVoid();
} else {
- Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType()));
+ RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType()));
}
- NewG->copyAttributesFrom(G);
- NewG->takeName(G);
- removeUsers(G);
- G->replaceAllUsesWith(NewG);
- G->eraseFromParent();
+ if (MergeFunctionsPDI) {
+ DISubprogram *DIS = G->getSubprogram();
+ if (DIS) {
+ DebugLoc CIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS);
+ DebugLoc RIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS);
+ CI->setDebugLoc(CIDbgLoc);
+ RI->setDebugLoc(RIDbgLoc);
+ } else {
+ DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
+ << G->getName() << "()\n");
+ }
+ eraseTail(G);
+ eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
+ DEBUG(dbgs() << "} // End of parameter related debug info filtering for: "
+ << G->getName() << "()\n");
+ } else {
+ NewG->copyAttributesFrom(G);
+ NewG->takeName(G);
+ removeUsers(G);
+ G->replaceAllUsesWith(NewG);
+ G->eraseFromParent();
+ }
- DEBUG(dbgs() << "writeThunk: " << NewG->getName() << '\n');
+ DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
++NumThunksWritten;
}
diff --git a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 7ef3fc1..8840435 100644
--- a/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,8 +16,15 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -27,19 +34,177 @@
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;
-#define DEBUG_TYPE "partialinlining"
-
-STATISTIC(NumPartialInlined, "Number of functions partially inlined");
+#define DEBUG_TYPE "partial-inlining"
+
+STATISTIC(NumPartialInlined,
+ "Number of callsites functions partially inlined into.");
+
+// Command line option to disable partial-inlining. The default is false:
+static cl::opt<bool>
+ DisablePartialInlining("disable-partial-inlining", cl::init(false),
+ cl::Hidden, cl::desc("Disable partial ininling"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
+
+static cl::opt<unsigned> MaxNumInlineBlocks(
+ "max-num-inline-blocks", cl::init(5), cl::Hidden,
+ cl::desc("Max Number of Blocks To be Partially Inlined"));
+
+// Command line option to set the maximum number of partial inlining allowed
+// for the module. The default value of -1 means no limit.
+static cl::opt<int> MaxNumPartialInlining(
+ "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of partial inlining. The default is unlimited"));
+
+// Used only when PGO or user-annotated branch data is absent. It is the
+// minimum value used to weight the outline region. If BFI produces a
+// larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
+static cl::opt<unsigned> ExtraOutliningPenalty(
+ "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
+ cl::desc("A debug option to add additional penalty to the computed one."));
namespace {
+
+struct FunctionOutliningInfo {
+ FunctionOutliningInfo()
+ : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
+ ReturnBlockPreds() {}
+ // Returns the number of blocks to be inlined including all blocks
+ // in Entries and one return block.
+ unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
+
+ // A set of blocks including the function entry that guard
+ // the region to be outlined.
+ SmallVector<BasicBlock *, 4> Entries;
+ // The return block that is not included in the outlined region.
+ BasicBlock *ReturnBlock;
+ // The dominating block of the region to be outlined.
+ BasicBlock *NonReturnBlock;
+ // The set of blocks in Entries that are predecessors of ReturnBlock.
+ SmallVector<BasicBlock *, 4> ReturnBlockPreds;
+};
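+
+// A FunctionOutliningInfo candidate has roughly this shape (an illustrative
+// sketch):
+//
+//      Entries (entry block plus guarding blocks)
+//        |                 \
+//   NonReturnBlock      ReturnBlock
+//   (dominates the       (kept, together with the
+//    outlined region)     Entries, in the inlined part)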
+
struct PartialInlinerImpl {
- PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(IFI) {}
+ PartialInlinerImpl(
+ std::function<AssumptionCache &(Function &)> *GetAC,
+ std::function<TargetTransformInfo &(Function &)> *GTTI,
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
+ ProfileSummaryInfo *ProfSI)
+ : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
bool run(Module &M);
Function *unswitchFunction(Function *F);
+ // This class speculatively clones the function to be partially inlined.
+ // At the end of partial inlining, the remaining callsites to the cloned
+ // function that are not partially inlined will be fixed up to reference
+ // the original function, and the cloned function will be erased.
+ struct FunctionCloner {
+ FunctionCloner(Function *F, FunctionOutliningInfo *OI);
+ ~FunctionCloner();
+
+ // Prepare for function outlining: making sure there is only
+ // one incoming edge from the extracted/outlined region to
+ // the return block.
+ void NormalizeReturnBlock();
+
+ // Do function outlining:
+ Function *doFunctionOutlining();
+
+ Function *OrigFunc = nullptr;
+ Function *ClonedFunc = nullptr;
+ Function *OutlinedFunc = nullptr;
+ BasicBlock *OutliningCallBB = nullptr;
+ // ClonedFunc is inlined in one of its callers after function
+ // outlining.
+ bool IsFunctionInlined = false;
+ // The cost of the region to be outlined.
+ int OutlinedRegionCost = 0;
+ std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+ };
+
private:
- InlineFunctionInfo IFI;
+ int NumPartialInlining = 0;
+ std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
+ std::function<TargetTransformInfo &(Function &)> *GetTTI;
+ Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
+ ProfileSummaryInfo *PSI;
+
+ // Return the frequency of the OutliningBB relative to F's entry point.
+ // The result is no larger than 1 and is represented as a BranchProbability.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
+ BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
+
+ // Return true if the callee of CS should be partially inlined with
+ // profit.
+ bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
+ BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE);
+
+ // Try to inline DuplicateFunction (cloned from F with a call to
+ // the OutlinedFunction) into its callers. Return true
+ // if there is any successful inlining.
+ bool tryPartialInline(FunctionCloner &Cloner);
+
+ // Compute the mapping from each use site of DuplicateFunction to the enclosing
+ // BB's profile count.
+ void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &SiteCountMap);
+
+ bool IsLimitReached() {
+ return (MaxNumPartialInlining != -1 &&
+ NumPartialInlining >= MaxNumPartialInlining);
+ }
+
+ static CallSite getCallSite(User *U) {
+ CallSite CS;
+ if (CallInst *CI = dyn_cast<CallInst>(U))
+ CS = CallSite(CI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+ CS = CallSite(II);
+ else
+ llvm_unreachable("All uses must be calls");
+ return CS;
+ }
+
+ static CallSite getOneCallSiteTo(Function *F) {
+ User *User = *F->user_begin();
+ return getCallSite(User);
+ }
+
+ std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+ CallSite CS = getOneCallSiteTo(F);
+ DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+ BasicBlock *Block = CS.getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the non-weighted runtime cost for making the call
+ // to the outlined function, including the additional setup cost in the
+ // outlined function itself;
+ // - The second value is the estimated size of the new call sequence in
+ // basic block Cloner.OutliningCallBB;
+ std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+ // approximate both the size and runtime cost. (Note that the current
+ // inline cost analysis does not clearly distinguish the two either.)
+ static int computeBBInlineCost(BasicBlock *BB);
+
+ std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
};
+
struct PartialInlinerLegacyPass : public ModulePass {
static char ID; // Pass identification, replacement for typeid
PartialInlinerLegacyPass() : ModulePass(ID) {
@@ -48,124 +213,713 @@ struct PartialInlinerLegacyPass : public ModulePass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
}
bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
+ TargetTransformInfoWrapperPass *TTIWP =
+ &getAnalysis<TargetTransformInfoWrapperPass>();
+ ProfileSummaryInfo *PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
std::function<AssumptionCache &(Function &)> GetAssumptionCache =
[&ACT](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
- InlineFunctionInfo IFI(nullptr, &GetAssumptionCache);
- return PartialInlinerImpl(IFI).run(M);
+
+ std::function<TargetTransformInfo &(Function &)> GetTTI =
+ [&TTIWP](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ };
+
+ return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
}
};
}
-Function *PartialInlinerImpl::unswitchFunction(Function *F) {
- // First, verify that this function is an unswitching candidate...
+std::unique_ptr<FunctionOutliningInfo>
+PartialInlinerImpl::computeOutliningInfo(Function *F) {
BasicBlock *EntryBlock = &F->front();
BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
if (!BR || BR->isUnconditional())
- return nullptr;
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Returns true if Succ is BB's successor
+ auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
+ return is_contained(successors(BB), Succ);
+ };
+
+ auto SuccSize = [](BasicBlock *BB) {
+ return std::distance(succ_begin(BB), succ_end(BB));
+ };
+
+ auto IsReturnBlock = [](BasicBlock *BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ return isa<ReturnInst>(TI);
+ };
+
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsReturnBlock(Succ1))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsReturnBlock(Succ2))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
+
+ // Detect a triangular shape:
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsSuccessor(Succ1, Succ2))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsSuccessor(Succ2, Succ1))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
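+ // The two shapes matched while walking down the entry chain are, as an
+ // illustrative sketch:
+ //
+ //    CurrEntry              CurrEntry
+ //     /     \                /     \
+ //  Return   NonReturn    OtherSucc  CommSucc
+ //  (candidate found)         \_________^
+ //                        (triangle; the walk continues at OtherSucc)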
+
+ std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
+ llvm::make_unique<FunctionOutliningInfo>();
+
+ BasicBlock *CurrEntry = EntryBlock;
+ bool CandidateFound = false;
+ do {
+ // The number of blocks to be inlined has already reached
+ // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
+ // disables partial inlining for the function.
+ if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
+ break;
+
+ if (SuccSize(CurrEntry) != 2)
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(CurrEntry);
+ BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+
+ if (ReturnBlock) {
+ OutliningInfo->Entries.push_back(CurrEntry);
+ OutliningInfo->ReturnBlock = ReturnBlock;
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ CandidateFound = true;
+ break;
+ }
+
+ BasicBlock *CommSucc;
+ BasicBlock *OtherSucc;
+ std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
+
+ if (!CommSucc)
+ break;
- BasicBlock *ReturnBlock = nullptr;
- BasicBlock *NonReturnBlock = nullptr;
- unsigned ReturnCount = 0;
- for (BasicBlock *BB : successors(EntryBlock)) {
- if (isa<ReturnInst>(BB->getTerminator())) {
- ReturnBlock = BB;
- ReturnCount++;
- } else
- NonReturnBlock = BB;
+ OutliningInfo->Entries.push_back(CurrEntry);
+ CurrEntry = OtherSucc;
+
+ } while (true);
+
+ if (!CandidateFound)
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Sanity-check the entries: there should not
+ // be any successors (outside the entry set) other than
+ // {ReturnBlock, NonReturnBlock}.
+ assert(OutliningInfo->Entries[0] == &F->front() &&
+ "Function Entry must be the first in Entries vector");
+ DenseSet<BasicBlock *> Entries;
+ for (BasicBlock *E : OutliningInfo->Entries)
+ Entries.insert(E);
+
+ // Returns true if BB has a predecessor that is not
+ // in the Entries set.
+ auto HasNonEntryPred = [Entries](BasicBlock *BB) {
+ for (auto Pred : predecessors(BB)) {
+ if (!Entries.count(Pred))
+ return true;
+ }
+ return false;
+ };
+ auto CheckAndNormalizeCandidate =
+ [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
+ for (BasicBlock *E : OutliningInfo->Entries) {
+ for (auto Succ : successors(E)) {
+ if (Entries.count(Succ))
+ continue;
+ if (Succ == OutliningInfo->ReturnBlock)
+ OutliningInfo->ReturnBlockPreds.push_back(E);
+ else if (Succ != OutliningInfo->NonReturnBlock)
+ return false;
+ }
+ // There should not be any outside incoming edges either:
+ if (HasNonEntryPred(E))
+ return false;
+ }
+ return true;
+ };
+
+ if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Now grow the candidate's inlining region further by peeling off
+ // dominating blocks from the outlining region:
+ while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
+ BasicBlock *Cand = OutliningInfo->NonReturnBlock;
+ if (SuccSize(Cand) != 2)
+ break;
+
+ if (HasNonEntryPred(Cand))
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(Cand);
+ BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+ if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
+ break;
+
+ if (NonReturnBlock->getSinglePredecessor() != Cand)
+ break;
+
+ // Now grow and update OutliningInfo:
+ OutliningInfo->Entries.push_back(Cand);
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ OutliningInfo->ReturnBlockPreds.push_back(Cand);
+ Entries.insert(Cand);
}
- if (ReturnCount != 1)
- return nullptr;
+ return OutliningInfo;
+}
+
+// Check if there is PGO data or user-annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+ if (F->getEntryCount())
+ return true;
+ // Now check if any of the entry blocks has MD_prof data:
+ for (auto *E : OI->Entries) {
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
+BranchProbability
+PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
+
+ auto EntryFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
+ auto OutliningCallFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);
+
+ auto OutlineRegionRelFreq =
+ BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
+ EntryFreq.getFrequency());
+
+ if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
+ return OutlineRegionRelFreq;
+
+ // When profile data is not available, we need to be conservative in
+ // estimating the overall savings. Static branch prediction can usually
+ // guess the branch direction right (taken/non-taken), but the guessed
+ // branch probability is usually not biased enough. In cases where the
+ // outlined region is predicted to be likely, its probability needs
+ // to be made higher (more biased) to not under-estimate the cost of
+ // function outlining. On the other hand, if the outlined region
+ // is predicted to be less likely, the predicted probability is usually
+ // higher than the actual one. For instance, the actual probability of the
+ // less likely target may be only 5%, while the guessed probability can be
+ // 40%. In the latter case, there is no need for further adjustment.
+ // FIXME: add an option for this.
+ if (OutlineRegionRelFreq < BranchProbability(45, 100))
+ return OutlineRegionRelFreq;
+
+ OutlineRegionRelFreq = std::max(
+ OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
+ OptimizationRemarkEmitter &ORE) {
+
+ using namespace ore;
+ if (SkipCostAnalysis)
+ return true;
+
+ Instruction *Call = CS.getInstruction();
+ Function *Callee = CS.getCalledFunction();
+ assert(Callee == Cloner.ClonedFunc);
+
+ Function *Caller = CS.getCaller();
+ auto &CalleeTTI = (*GetTTI)(*Callee);
+ InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
+ *GetAssumptionCache, GetBFI, PSI);
+
+ if (IC.isAlways()) {
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
+ << NV("Callee", Cloner.OrigFunc)
+ << " should always be fully inlined, not partially");
+ return false;
+ }
+
+ if (IC.isNever()) {
+ ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller)
+ << " because it should never be inlined (cost=never)");
+ return false;
+ }
+
+ if (!IC) {
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " because too costly to inline (cost="
+ << NV("Cost", IC.getCost()) << ", threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+ return false;
+ }
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CS, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ // If the weighted savings are smaller than the weighted cost, return false.
+ if (NormWeightedSavings < WeightedOutliningRcost) {
+ ORE.emit(
+ OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+ << " of making the outlined call is too high");
+
+ return false;
+ }
+
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
+ << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
+ << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
+ << " (threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
+ return true;
+}
+
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as the runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Alloca:
+ continue;
+ case Instruction::GetElementPtr:
+ if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+ continue;
+ default:
+ break;
+ }
+
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+ }
+
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(CI), DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ InlineCost += getCallsiteCost(CallSite(II), DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
+std::tuple<int, int>
+PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
+
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
+
+ // Now compute the cost of the extracted/outlined function itself:
+ int OutlinedFunctionCost = 0;
+ for (BasicBlock &BB : *Cloner.OutlinedFunc) {
+ OutlinedFunctionCost += computeBBInlineCost(&BB);
+ }
+
+ assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+ // The code extractor introduces a new root and exit stub blocks with
+ // additional unconditional branches. Those branches will be eliminated
+ // later with bb layout. The cost should be adjusted accordingly:
+ OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
+
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost +
+ (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+ ExtraOutliningPenalty;
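+ // For example (a sketch): with OutliningFuncCallCost = 25, an adjusted
+ // OutlinedFunctionCost = 120 and OutlinedRegionCost = 100, the runtime
+ // overhead is 25 + (120 - 100) = 45, plus any ExtraOutliningPenalty.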
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
+}
+
+// Create the callsite-to-profile-count map, which is used to update the
+// original function's entry count after the function is partially inlined
+// into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
+ DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> TempBFI;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
+ CurrentCallerBFI = TempBFI.get();
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(*GetBFI)(*Caller);
+ }
+ };
+
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+ Function *Caller = CS.getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CS.getInstruction()->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+}
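The lambda above recomputes the expensive BlockFrequencyInfo only when the caller changes between consecutive call sites, so the caching pays off when the user list groups call sites by caller. A sketch of that caching pattern, with a hypothetical Analysis type in place of BFI:

    #include <cstdio>
    #include <memory>
    #include <string>
    #include <vector>

    // Stand-in for a per-caller analysis; names are illustrative, not LLVM's.
    struct Analysis { std::string Owner; };

    static std::unique_ptr<Analysis> computeAnalysis(const std::string &Caller) {
      std::printf("computing analysis for %s\n", Caller.c_str()); // expensive
      return std::make_unique<Analysis>(Analysis{Caller});
    }

    int main() {
      // Call sites listed with their callers, grouped as the pass expects.
      std::vector<std::string> CallSiteCallers = {"foo", "foo", "bar", "bar"};

      std::string CurrentCaller;
      std::unique_ptr<Analysis> Cached;
      for (const std::string &Caller : CallSiteCallers) {
        if (Caller != CurrentCaller) { // recompute only when the caller changes
          CurrentCaller = Caller;
          Cached = computeAnalysis(Caller);
        }
        std::printf("call site in %s uses cached analysis\n",
                    Cached->Owner.c_str());
      }
      return 0;
    }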
+
+PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
+ FunctionOutliningInfo *OI)
+ : OrigFunc(F) {
+ ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
// Clone the function, so that we can hack away on it.
ValueToValueMapTy VMap;
- Function *DuplicateFunction = CloneFunction(F, VMap);
- DuplicateFunction->setLinkage(GlobalValue::InternalLinkage);
- BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[EntryBlock]);
- BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[ReturnBlock]);
- BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[NonReturnBlock]);
+ ClonedFunc = CloneFunction(F, VMap);
+ ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
+ for (BasicBlock *BB : OI->Entries) {
+ ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+ }
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
+ BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+ ClonedOI->ReturnBlockPreds.push_back(NewE);
+ }
// Go ahead and update all uses to the duplicate, so that we can just
// use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(DuplicateFunction);
+ F->replaceAllUsesWith(ClonedFunc);
+}
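Cloning records an old-to-new mapping in VMap so that blocks saved in the pre-clone FunctionOutliningInfo can be translated into the clone, which is exactly what the constructor above does with OI->ReturnBlock and friends. A minimal sketch of the pattern, with plain pointers in place of IR values:

    #include <cstdio>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct Block { std::string Name; };

    int main() {
      std::vector<Block *> Original = {new Block{"entry"}, new Block{"ret"}};
      Block *SavedReturnBlock = Original[1]; // recorded before cloning

      std::unordered_map<Block *, Block *> VMap; // plays the role of
                                                 // ValueToValueMapTy
      std::vector<Block *> Clone;
      for (Block *B : Original) {
        Block *NewB = new Block{B->Name + ".clone"};
        VMap[B] = NewB;
        Clone.push_back(NewB);
      }

      // Translate the saved pointer into clone-space:
      Block *ClonedReturnBlock = VMap[SavedReturnBlock];
      std::printf("%s\n", ClonedReturnBlock->Name.c_str()); // "ret.clone"

      for (Block *B : Original) delete B;
      for (Block *B : Clone) delete B;
      return 0;
    }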
+
+void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
+
+ auto getFirstPHI = [](BasicBlock *BB) {
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
// Special hackery is needed with PHI nodes that have inputs from more than
// one extracted block. For simplicity, just split the PHIs into a two-level
// sequence of PHIs, some of which will go in the extracted region, and some
// of which will go outside.
- BasicBlock *PreReturn = NewReturnBlock;
- NewReturnBlock = NewReturnBlock->splitBasicBlock(
- NewReturnBlock->getFirstNonPHI()->getIterator());
+ BasicBlock *PreReturn = ClonedOI->ReturnBlock;
+ // Only split the block when necessary:
+ PHINode *FirstPhi = getFirstPHI(PreReturn);
+ unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+ if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+ return;
+
+ auto IsTrivialPhi = [](PHINode *PN) -> Value * {
+ Value *CommonValue = PN->getIncomingValue(0);
+ if (all_of(PN->incoming_values(),
+ [&](Value *V) { return V == CommonValue; }))
+ return CommonValue;
+ return nullptr;
+ };
+
+ ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+ ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
BasicBlock::iterator I = PreReturn->begin();
- Instruction *Ins = &NewReturnBlock->front();
+ Instruction *Ins = &ClonedOI->ReturnBlock->front();
+ SmallVector<Instruction *, 4> DeadPhis;
while (I != PreReturn->end()) {
PHINode *OldPhi = dyn_cast<PHINode>(I);
if (!OldPhi)
break;
- PHINode *RetPhi = PHINode::Create(OldPhi->getType(), 2, "", Ins);
+ PHINode *RetPhi =
+ PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
OldPhi->replaceAllUsesWith(RetPhi);
- Ins = NewReturnBlock->getFirstNonPHI();
+ Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
RetPhi->addIncoming(&*I, PreReturn);
- RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewEntryBlock),
- NewEntryBlock);
- OldPhi->removeIncomingValue(NewEntryBlock);
+ for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+ RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+ OldPhi->removeIncomingValue(E);
+ }
+ // After splitting the incoming values, the old phi may become trivial.
+ // Keeping a trivial phi can introduce a definition inside the outlined
+ // region that is live-out, causing unnecessary overhead (loads, stores,
+ // argument passing, etc.).
+ if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+ OldPhi->replaceAllUsesWith(OldPhiVal);
+ DeadPhis.push_back(OldPhi);
+ }
++I;
- }
- NewEntryBlock->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
+ }
+ for (auto *DP : DeadPhis)
+ DP->eraseFromParent();
+
+ for (auto E : ClonedOI->ReturnBlockPreds) {
+ E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
+ }
+}
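The IsTrivialPhi check above reduces to: a phi whose incoming values all agree can be replaced by that common value. In isolation, with ints standing in for llvm::Value pointers:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Returns the common value if all incoming values agree, else nullptr.
    static const int *trivialPhiValue(const std::vector<int> &Incoming) {
      if (Incoming.empty())
        return nullptr;
      const int &Common = Incoming.front();
      bool AllSame = std::all_of(Incoming.begin(), Incoming.end(),
                                 [&](int V) { return V == Common; });
      return AllSame ? &Common : nullptr;
    }

    int main() {
      std::vector<int> A = {7, 7, 7}, B = {7, 8, 7};
      std::printf("A trivial: %d, B trivial: %d\n",
                  trivialPhiValue(A) != nullptr, trivialPhiValue(B) != nullptr);
      return 0; // prints: A trivial: 1, B trivial: 0
    }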
+
+Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
+ // Returns true if the block is to be partially inlined into the caller
+ // (i.e., not to be extracted to the out-of-line function).
+ auto ToBeInlined = [&, this](BasicBlock *BB) {
+ return BB == ClonedOI->ReturnBlock ||
+ (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
+ ClonedOI->Entries.end());
+ };
// Gather up the blocks that we're going to extract.
std::vector<BasicBlock *> ToExtract;
- ToExtract.push_back(NewNonReturnBlock);
- for (BasicBlock &BB : *DuplicateFunction)
- if (&BB != NewEntryBlock && &BB != NewReturnBlock &&
- &BB != NewNonReturnBlock)
+ ToExtract.push_back(ClonedOI->NonReturnBlock);
+ OutlinedRegionCost +=
+ PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
+ for (BasicBlock &BB : *ClonedFunc)
+ if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
ToExtract.push_back(&BB);
+ // FIXME: the code extractor may hoist/sink more code
+ // into the outlined function, which may make the outlining
+ // overhead (the difference between the outlined function cost
+ // and OutlinedRegionCost) look larger.
+ OutlinedRegionCost += computeBBInlineCost(&BB);
+ }
// The CodeExtractor needs a dominator tree.
DominatorTree DT;
- DT.recalculate(*DuplicateFunction);
+ DT.recalculate(*ClonedFunc);
// Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*DuplicateFunction, LI);
- BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
// Extract the body of the if.
- Function *ExtractedFunction =
- CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
- .extractCodeRegion();
+ OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI)
+ .extractCodeRegion();
+
+ if (OutlinedFunc) {
+ OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
+ .getInstruction()
+ ->getParent();
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ }
- // Inline the top-level if test into all callers.
- std::vector<User *> Users(DuplicateFunction->user_begin(),
- DuplicateFunction->user_end());
- for (User *User : Users)
- if (CallInst *CI = dyn_cast<CallInst>(User))
- InlineFunction(CI, IFI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
- InlineFunction(II, IFI);
+ return OutlinedFunc;
+}
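The partitioning logic above keeps the entry region and the return block inline and extracts everything else. The same decision, sketched over strings instead of basic blocks:

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Function = {"entry", "cold1", "cold2", "ret"};
      std::vector<std::string> Entries = {"entry"};
      std::string ReturnBlock = "ret";

      // Blocks that stay inline: the entry region and the return block.
      auto ToBeInlined = [&](const std::string &BB) {
        return BB == ReturnBlock ||
               std::find(Entries.begin(), Entries.end(), BB) != Entries.end();
      };

      std::vector<std::string> ToExtract;
      for (const std::string &BB : Function)
        if (!ToBeInlined(BB))
          ToExtract.push_back(BB);

      for (const std::string &BB : ToExtract)
        std::printf("extract %s\n", BB.c_str()); // cold1, cold2
      return 0;
    }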
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
// Ditch the duplicate, since we're done with it, and rewrite all remaining
// users (function pointers, etc.) back to the original function.
- DuplicateFunction->replaceAllUsesWith(F);
- DuplicateFunction->eraseFromParent();
+ ClonedFunc->replaceAllUsesWith(OrigFunc);
+ ClonedFunc->eraseFromParent();
+ if (!IsFunctionInlined) {
+ // Remove the function that was speculatively created if there is no
+ // reference to it.
+ if (OutlinedFunc)
+ OutlinedFunc->eraseFromParent();
+ }
+}
- ++NumPartialInlined;
+Function *PartialInlinerImpl::unswitchFunction(Function *F) {
+
+ if (F->hasAddressTaken())
+ return nullptr;
+
+ // Let the inliner handle it.
+ if (F->hasFnAttribute(Attribute::AlwaysInline))
+ return nullptr;
+
+ if (F->hasFnAttribute(Attribute::NoInline))
+ return nullptr;
+
+ if (PSI->isFunctionEntryCold(F))
+ return nullptr;
+
+ if (F->user_begin() == F->user_end())
+ return nullptr;
+
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- return ExtractedFunction;
+ if (!OI)
+ return nullptr;
+
+ FunctionCloner Cloner(F, OI.get());
+ Cloner.NormalizeReturnBlock();
+ Function *OutlinedFunction = Cloner.doFunctionOutlining();
+
+ bool AnyInline = tryPartialInline(Cloner);
+
+ if (AnyInline)
+ return OutlinedFunction;
+
+ return nullptr;
+}
+
+bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
+ int NonWeightedRcost;
+ int SizeCost;
+
+ if (Cloner.OutlinedFunc == nullptr)
+ return false;
+
+ std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
+
+ auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+ auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
+
+ // If the call sequence to the outlined function is larger than the
+ // original outlined region, outlining does not increase the chances of
+ // inlining the function (the inliner uses the size increase to model
+ // the cost of inlining a callee).
+ if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
+ OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
+ DebugLoc DLoc;
+ BasicBlock *Block;
+ std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", Cloner.OrigFunc)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")");
+ return false;
+ }
+
+ assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
+ "F's users should all be replaced!");
+
+ std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
+ Cloner.ClonedFunc->user_end());
+
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ if (Cloner.OrigFunc->getEntryCount())
+ computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
+
+ auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
+ uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+
+ bool AnyInline = false;
+ for (User *User : Users) {
+ CallSite CS = getCallSite(User);
+
+ if (IsLimitReached())
+ continue;
+
+ OptimizationRemarkEmitter ORE(CS.getCaller());
+
+ if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
+ continue;
+
+ ORE.emit(
+ OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+ << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
+ << ore::NV("Caller", CS.getCaller()));
+
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
+ InlineFunction(CS, IFI);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
+ NumPartialInlining++;
+ // Update the stats
+ NumPartialInlined++;
+ }
+
+ if (AnyInline) {
+ Cloner.IsFunctionInlined = true;
+ if (CalleeEntryCount)
+ Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
+ }
+
+ return AnyInline;
}
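tryPartialInline weighs the runtime overhead by how often the outlining call block runs relative to the function entry (the WeightedRcost above). A sketch of that fixed-point scaling, with made-up frequencies and BranchProbability's 2^31 denominator assumed:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t EntryFreq = 1000;        // frequency of the entry block
      uint64_t CallBBFreq = 50;         // frequency of the outlining call block
      uint64_t NonWeightedRcost = 30;   // unweighted runtime overhead

      // Relative frequency as a fixed-point fraction with a 2^31 scale:
      uint64_t Scale = (CallBBFreq << 31) / EntryFreq;
      uint64_t WeightedRcost = (NonWeightedRcost * Scale) >> 31;

      // 30 scaled by 5%, truncated to 1:
      std::printf("weighted runtime cost = %llu\n",
                  (unsigned long long)WeightedRcost);
      return 0;
    }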
bool PartialInlinerImpl::run(Module &M) {
+ if (DisablePartialInlining)
+ return false;
+
std::vector<Function *> Worklist;
Worklist.reserve(M.size());
for (Function &F : M)
@@ -203,6 +957,8 @@ char PartialInlinerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
"Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
"Partial Inliner", false, false)
@@ -213,12 +969,25 @@ ModulePass *llvm::createPartialInliningPass() {
PreservedAnalyses PartialInlinerPass::run(Module &M,
ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
std::function<AssumptionCache &(Function &)> GetAssumptionCache =
[&FAM](Function &F) -> AssumptionCache & {
return FAM.getResult<AssumptionAnalysis>(F);
};
- InlineFunctionInfo IFI(nullptr, &GetAssumptionCache);
- if (PartialInlinerImpl(IFI).run(M))
+
+ std::function<BlockFrequencyInfo &(Function &)> GetBFI =
+ [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ std::function<TargetTransformInfo &(Function &)> GetTTI =
+ [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}
diff --git a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 941efb2..0b319f6 100644
--- a/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -38,21 +38,22 @@
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
static cl::opt<bool>
-RunLoopVectorization("vectorize-loops", cl::Hidden,
- cl::desc("Run the Loop vectorization passes"));
+ RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore, cl::desc("Run Partial inlining pass"));
static cl::opt<bool>
-RunSLPVectorization("vectorize-slp", cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
+ RunLoopVectorization("vectorize-loops", cl::Hidden,
+ cl::desc("Run the Loop vectorization passes"));
static cl::opt<bool>
-RunBBVectorization("vectorize-slp-aggressive", cl::Hidden,
- cl::desc("Run the BB vectorization passes"));
+RunSLPVectorization("vectorize-slp", cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
static cl::opt<bool>
UseGVNAfterVectorization("use-gvn-after-vectorization",
@@ -67,10 +68,6 @@ static cl::opt<bool>
RunLoopRerolling("reroll-loops", cl::Hidden,
cl::desc("Run the loop rerolling pass"));
-static cl::opt<bool> RunLoadCombine("combine-loads", cl::init(false),
- cl::Hidden,
- cl::desc("Run the load combining pass"));
-
static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
cl::desc("Run the NewGVN pass"));
@@ -93,10 +90,6 @@ static cl::opt<CFLAAType>
clEnumValN(CFLAAType::Both, "both",
"Enable both variants of CFL-AA")));
-static cl::opt<bool>
-EnableMLSM("mlsm", cl::init(true), cl::Hidden,
- cl::desc("Enable motion of merged load and store"));
-
static cl::opt<bool> EnableLoopInterchange(
"enable-loopinterchange", cl::init(false), cl::Hidden,
cl::desc("Enable the new, experimental LoopInterchange Pass"));
@@ -140,15 +133,28 @@ static cl::opt<int> PreInlineThreshold(
cl::desc("Control the amount of inlining in pre-instrumentation inliner "
"(default = 75)"));
+static cl::opt<bool> EnableEarlyCSEMemSSA(
+ "enable-earlycse-memssa", cl::init(true), cl::Hidden,
+ cl::desc("Enable the EarlyCSE w/ MemorySSA pass (default = on)"));
+
static cl::opt<bool> EnableGVNHoist(
"enable-gvn-hoist", cl::init(false), cl::Hidden,
- cl::desc("Enable the GVN hoisting pass"));
+ cl::desc("Enable the GVN hoisting pass (default = off)"));
static cl::opt<bool>
DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
cl::Hidden,
cl::desc("Disable shrink-wrap library calls"));
+static cl::opt<bool>
+ EnableSimpleLoopUnswitch("enable-simple-loop-unswitch", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable the simple loop unswitch pass."));
+
+static cl::opt<bool> EnableGVNSink(
+ "enable-gvn-sink", cl::init(false), cl::Hidden,
+ cl::desc("Enable the GVN sinking pass (default = off)"));
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@@ -156,11 +162,9 @@ PassManagerBuilder::PassManagerBuilder() {
Inliner = nullptr;
DisableUnitAtATime = false;
DisableUnrollLoops = false;
- BBVectorize = RunBBVectorization;
SLPVectorize = RunSLPVectorization;
LoopVectorize = RunLoopVectorization;
RerollLoops = RunLoopRerolling;
- LoadCombine = RunLoadCombine;
NewGVN = RunNewGVN;
DisableGVNLoadPRE = false;
VerifyInput = false;
@@ -172,6 +176,7 @@ PassManagerBuilder::PassManagerBuilder() {
PGOInstrUse = RunPGOInstrUse;
PrepareForThinLTO = EnablePrepareForThinLTO;
PerformThinLTO = false;
+ DivergentTarget = false;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -183,6 +188,13 @@ PassManagerBuilder::~PassManagerBuilder() {
static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy,
PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions;
+/// Check if GlobalExtensions is constructed and not empty.
+/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger
+/// the construction of the object.
+static bool GlobalExtensionsNotEmpty() {
+ return GlobalExtensions.isConstructed() && !GlobalExtensions->empty();
+}
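The helper above exists because a managed static constructs its object lazily, so asking it 'empty()' directly would trigger the construction the check is trying to avoid. A sketch of the pitfall with an illustrative Lazy wrapper (not LLVM's ManagedStatic):

    #include <cstdio>
    #include <vector>

    template <typename T> class Lazy {
      T *Obj = nullptr;

    public:
      bool isConstructed() const { return Obj != nullptr; }
      T &get() {
        if (!Obj)
          Obj = new T(); // construction happens on first access
        return *Obj;
      }
      ~Lazy() { delete Obj; }
    };

    static Lazy<std::vector<int>> GlobalExtensions;

    static bool globalExtensionsNotEmpty() {
      // Check construction first so an empty query never allocates:
      return GlobalExtensions.isConstructed() && !GlobalExtensions.get().empty();
    }

    int main() {
      std::printf("non-empty: %d\n", globalExtensionsNotEmpty()); // 0, no alloc
      GlobalExtensions.get().push_back(42);
      std::printf("non-empty: %d\n", globalExtensionsNotEmpty()); // 1
      return 0;
    }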
+
void PassManagerBuilder::addGlobalExtension(
PassManagerBuilder::ExtensionPointTy Ty,
PassManagerBuilder::ExtensionFn Fn) {
@@ -195,9 +207,12 @@ void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
legacy::PassManagerBase &PM) const {
- for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i)
- if ((*GlobalExtensions)[i].first == ETy)
- (*GlobalExtensions)[i].second(*this, PM);
+ if (GlobalExtensionsNotEmpty()) {
+ for (auto &Ext : *GlobalExtensions) {
+ if (Ext.first == ETy)
+ Ext.second(*this, PM);
+ }
+ }
for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
if (Extensions[i].first == ETy)
Extensions[i].second(*this, PM);
@@ -248,18 +263,17 @@ void PassManagerBuilder::populateFunctionPassManager(
FPM.add(createCFGSimplificationPass());
FPM.add(createSROAPass());
FPM.add(createEarlyCSEPass());
- if(EnableGVNHoist)
- FPM.add(createGVNHoistPass());
FPM.add(createLowerExpectIntrinsicPass());
}
// Do PGO instrumentation generation or use pass as the option specified.
void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) {
- if (!EnablePGOInstrGen && PGOInstrUse.empty())
+ if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty())
return;
// Perform the preinline and cleanup passes for O1 and above.
// And avoid doing them if optimizing for size.
- if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner) {
+ if (OptLevel > 0 && SizeLevel == 0 && !DisablePreInliner &&
+ PGOSampleUse.empty()) {
// Create preinline pass. We construct an InlineParams object and specify
// the threshold here to avoid the command line options of the regular
// inliner to influence pre-inlining. The only fields of InlineParams we
@@ -283,17 +297,32 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM) {
InstrProfOptions Options;
if (!PGOInstrGen.empty())
Options.InstrProfileOutput = PGOInstrGen;
+ Options.DoCounterPromotion = true;
+ MPM.add(createLoopRotatePass());
MPM.add(createInstrProfilingLegacyPass(Options));
}
if (!PGOInstrUse.empty())
MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse));
+ // Indirect call promotion that promotes intra-module targets only.
+ // For ThinLTO this is done earlier due to interactions with globalopt
+ // for imported functions. We don't run this at -O0.
+ if (OptLevel > 0)
+ MPM.add(
+ createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
}
void PassManagerBuilder::addFunctionSimplificationPasses(
legacy::PassManagerBase &MPM) {
// Start of function pass.
// Break up aggregate allocas, using SSAUpdater.
MPM.add(createSROAPass());
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ MPM.add(createEarlyCSEPass(EnableEarlyCSEMemSSA)); // Catch trivial redundancies
+ if (EnableGVNHoist)
+ MPM.add(createGVNHoistPass());
+ if (EnableGVNSink) {
+ MPM.add(createGVNSinkPass());
+ MPM.add(createCFGSimplificationPass());
+ }
+
// Speculative execution if the target has divergent branches; otherwise nop.
MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
MPM.add(createJumpThreadingPass()); // Thread jumps.
@@ -305,29 +334,37 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createLibCallsShrinkWrapPass());
addExtensionsToPM(EP_Peephole, MPM);
+ // Optimize memory intrinsic calls based on the profiled size information.
+ if (SizeLevel == 0)
+ MPM.add(createPGOMemOPSizeOptLegacyPass());
+
MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
MPM.add(createReassociatePass()); // Reassociate expressions
// Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
MPM.add(createLICMPass()); // Hoist loop invariants
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+ if (EnableSimpleLoopUnswitch)
+ MPM.add(createSimpleLoopUnswitchLegacyPass());
+ else
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
+ addExtensionsToPM(EP_LateLoopOptimizations, MPM);
MPM.add(createLoopDeletionPass()); // Delete dead loops
+
if (EnableLoopInterchange) {
MPM.add(createLoopInterchangePass()); // Interchange loops
MPM.add(createCFGSimplificationPass());
}
if (!DisableUnrollLoops)
- MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops
+ MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
if (OptLevel > 1) {
- if (EnableMLSM)
- MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
+ MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
MPM.add(NewGVN ? createNewGVNPass()
: createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
}
@@ -352,29 +389,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
if (RerollLoops)
MPM.add(createLoopRerollPass());
- if (!RunSLPAfterLoopVectorization) {
- if (SLPVectorize)
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
- if (BBVectorize) {
- MPM.add(createBBVectorizePass());
- addInstructionCombiningPass(MPM);
- addExtensionsToPM(EP_Peephole, MPM);
- if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(NewGVN
- ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
- else
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
-
- // BBVectorize may have significantly shortened a loop body; unroll again.
- if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass());
- }
- }
-
- if (LoadCombine)
- MPM.add(createLoadCombinePass());
+ if (!RunSLPAfterLoopVectorization && SLPVectorize)
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
@@ -409,14 +425,17 @@ void PassManagerBuilder::populateModulePassManager(
// builds. The function merging pass is
if (MergeFunctions)
MPM.add(createMergeFunctionsPass());
- else if (!GlobalExtensions->empty() || !Extensions.empty())
+ else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
MPM.add(createBarrierNoopPass());
+ addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
+
+ // Rename anon globals to be able to export them in the summary.
+ // This has to be done after we add the extensions to the pass manager,
+ // as there could be passes (e.g. Address sanitizer) which introduce
+ // new unnamed globals.
if (PrepareForThinLTO)
- // Rename anon globals to be able to export them in the summary.
MPM.add(createNameAnonGlobalPass());
-
- addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
return;
}
@@ -434,7 +453,16 @@ void PassManagerBuilder::populateModulePassManager(
// earlier in the pass pipeline, here before globalopt. Otherwise imported
// available_externally functions look unreferenced and are removed.
if (PerformThinLTO)
- MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true));
+ MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
+ !PGOSampleUse.empty()));
+
+ // For SamplePGO in the ThinLTO compile phase, we do not want to unroll
+ // loops, as doing so would change the CFG too much and make the second
+ // profile annotation in the backend more difficult.
+ bool PrepareForThinLTOUsingPGOSampleProfile =
+ PrepareForThinLTO && !PGOSampleUse.empty();
+ if (PrepareForThinLTOUsingPGOSampleProfile)
+ DisableUnrollLoops = true;
if (!DisableUnitAtATime) {
// Infer attributes about declarations if possible.
@@ -454,15 +482,13 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
}
- if (!PerformThinLTO) {
- /// PGO instrumentation is added during the compile phase for ThinLTO, do
- /// not run it a second time
+ // For SamplePGO in the ThinLTO compile phase, we do not want to do
+ // indirect call promotion, as doing so would change the CFG too much and
+ // make the second profile annotation in the backend more difficult.
+ // PGO instrumentation is added during the compile phase for ThinLTO, so
+ // do not run it a second time.
+ if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile)
addPGOInstrPasses(MPM);
- // Indirect call promotion that promotes intra-module targets only.
- // For ThinLTO this is done earlier due to interactions with globalopt
- // for imported functions.
- MPM.add(createPGOIndirectCallPromotionLegacyPass());
- }
if (EnableNonLTOGlobalsModRef)
// We add a module alias analysis pass here. In part due to bugs in the
@@ -489,6 +515,8 @@ void PassManagerBuilder::populateModulePassManager(
// pass manager that we are specifically trying to avoid. To prevent this
// we must insert a no-op module pass to reset the pass manager.
MPM.add(createBarrierNoopPass());
+ if (RunPartialInlining)
+ MPM.add(createPartialInliningPass());
if (!DisableUnitAtATime && OptLevel > 1 && !PrepareForLTO &&
!PrepareForThinLTO)
@@ -589,42 +617,24 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createCorrelatedValuePropagationPass());
addInstructionCombiningPass(MPM);
MPM.add(createLICMPass());
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
addInstructionCombiningPass(MPM);
}
- if (RunSLPAfterLoopVectorization) {
- if (SLPVectorize) {
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
- if (OptLevel > 1 && ExtraVectorizerPasses) {
- MPM.add(createEarlyCSEPass());
- }
- }
-
- if (BBVectorize) {
- MPM.add(createBBVectorizePass());
- addInstructionCombiningPass(MPM);
- addExtensionsToPM(EP_Peephole, MPM);
- if (OptLevel > 1 && UseGVNAfterVectorization)
- MPM.add(NewGVN
- ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
- else
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
-
- // BBVectorize may have significantly shortened a loop body; unroll again.
- if (!DisableUnrollLoops)
- MPM.add(createLoopUnrollPass());
+ if (RunSLPAfterLoopVectorization && SLPVectorize) {
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ MPM.add(createEarlyCSEPass());
}
}
addExtensionsToPM(EP_Peephole, MPM);
- MPM.add(createCFGSimplificationPass());
+ MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables
addInstructionCombiningPass(MPM);
if (!DisableUnrollLoops) {
- MPM.add(createLoopUnrollPass()); // Unroll small loops
+ MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
 // LoopUnroll may generate some redundancy to clean up.
addInstructionCombiningPass(MPM);
@@ -662,6 +672,11 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createLoopSinkPass());
// Get rid of LCSSA nodes.
MPM.add(createInstructionSimplifierPass());
+
+ // LoopSink (and other loop passes since the last simplifyCFG) might have
+ // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
+ MPM.add(createCFGSimplificationPass());
+
addExtensionsToPM(EP_OptimizerLast, MPM);
}
@@ -684,7 +699,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// left by the earlier promotion pass that promotes intra-module targets.
// This two-step promotion is to save the compile time. For LTO, it should
// produce the same result as if we only do promotion here.
- PM.add(createPGOIndirectCallPromotionLegacyPass(true));
+ PM.add(
+ createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
// Propagate constants at call sites into the functions they call. This
// opens opportunities for globalopt (and inlining) by substituting function
@@ -703,7 +719,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createGlobalSplitPass());
// Apply whole-program devirtualization and virtual constant propagation.
- PM.add(createWholeProgramDevirtPass());
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
// That's all we need at opt level 1.
if (OptLevel == 1)
@@ -759,8 +775,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
PM.add(createLICMPass()); // Hoist loop invariants.
- if (EnableMLSM)
- PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
+ PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
PM.add(NewGVN ? createNewGVNPass()
: createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
PM.add(createMemCpyOptPass()); // Remove dead memcpys.
@@ -775,11 +790,11 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createLoopInterchangePass());
if (!DisableUnrollLoops)
- PM.add(createSimpleLoopUnrollPass()); // Unroll small loops
+ PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops
PM.add(createLoopVectorizePass(true, LoopVectorize));
// The vectorizer may have significantly shortened a loop body; unroll again.
if (!DisableUnrollLoops)
- PM.add(createLoopUnrollPass());
+ PM.add(createLoopUnrollPass(OptLevel));
// Now that we've optimized loops (in particular loop induction variables),
// we may have exposed more scalar opportunities. Run parts of the scalar
@@ -799,9 +814,6 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// alignments.
PM.add(createAlignmentFromAssumptionsPass());
- if (LoadCombine)
- PM.add(createLoadCombinePass());
-
// Cleanup and simplify the code after the scalar optimizations.
addInstructionCombiningPass(PM);
addExtensionsToPM(EP_Peephole, PM);
@@ -833,6 +845,23 @@ void PassManagerBuilder::populateThinLTOPassManager(
if (VerifyInput)
PM.add(createVerifierPass());
+ if (ImportSummary) {
+ // These passes import type identifier resolutions for whole-program
+ // devirtualization and CFI. They must run early because other passes may
+ // disturb the specific instruction patterns that these passes look for,
+ // creating dependencies on resolutions that may not appear in the summary.
+ //
+ // For example, GVN may transform the pattern assume(type.test) appearing in
+ // two basic blocks into assume(phi(type.test, type.test)), which would
+ // transform a dependency on a WPD resolution into a dependency on a type
+ // identifier resolution for CFI.
+ //
+ // Also, WPD has access to more precise information than ICP and can
+ // devirtualize more effectively, so it should operate on the IR first.
+ PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
+ PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
+ }
+
populateModulePassManager(PM);
if (VerifyOutput)
@@ -849,6 +878,12 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
if (OptLevel != 0)
addLTOOptimizationPasses(PM);
+ else {
+ // The whole-program-devirt pass needs to run at -O0 because only it knows
+ // about the llvm.type.checked.load intrinsic: it needs to both lower the
+ // intrinsic itself and handle it in the summary.
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
+ }
// Create a function that performs CFI checks for cross-DSO calls with targets
// in the current module.
@@ -857,8 +892,7 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
// Lower type metadata and the type.test intrinsic. This pass supports Clang's
// control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
// link time if CFI is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None,
- /*Summary=*/nullptr));
+ PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
if (OptLevel != 0)
addLateLTOOptimizationPasses(PM);
diff --git a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
index d9acb9b..3fd5984 100644
--- a/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -14,10 +14,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/EHPersonalities.h"
@@ -28,6 +26,8 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 6a43f8d..6baada2 100644
--- a/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -42,7 +43,9 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProfReader.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -50,6 +53,7 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cctype>
@@ -159,21 +163,26 @@ protected:
ErrorOr<uint64_t> getInstWeight(const Instruction &I);
ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
+ std::vector<const FunctionSamples *>
+ findIndirectCallFunctionSamples(const Instruction &I) const;
const FunctionSamples *findFunctionSamples(const Instruction &I) const;
- bool inlineHotFunctions(Function &F);
+ bool inlineHotFunctions(Function &F,
+ DenseSet<GlobalValue::GUID> &ImportGUIDs);
void printEdgeWeight(raw_ostream &OS, Edge E);
void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
bool computeBlockWeights(Function &F);
void findEquivalenceClasses(Function &F);
+ template <bool IsPostDom>
void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
- DominatorTreeBase<BasicBlock> *DomTree);
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
+
void propagateWeights(Function &F);
uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
void buildEdges(Function &F);
bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
void computeDominanceAndLoopInfo(Function &F);
- unsigned getOffset(unsigned L, unsigned H) const;
+ unsigned getOffset(const DILocation *DIL) const;
void clearFunctionData();
/// \brief Map basic blocks to their computed weights.
@@ -202,9 +211,15 @@ protected:
/// the same number of times.
EquivalenceClassMap EquivalenceClass;
+ /// Map from function name to Function *. Used to find the function from
+ /// the function name. If the function name contains a suffix, an
+ /// additional entry is added to map the stripped name to the function,
+ /// provided the mapping is one-to-one.
+ StringMap<Function *> SymbolMap;
+
/// \brief Dominance, post-dominance and loop information.
std::unique_ptr<DominatorTree> DT;
- std::unique_ptr<DominatorTreeBase<BasicBlock>> PDT;
+ std::unique_ptr<PostDomTreeBase<BasicBlock>> PDT;
std::unique_ptr<LoopInfo> LI;
AssumptionCacheTracker *ACT;
@@ -326,11 +341,12 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const {
// If there are inlined callsites in this function, count the samples found
// in the respective bodies. However, do not bother counting callees with 0
// total samples, these are callees that were never invoked at runtime.
- for (const auto &I : FS->getCallsiteSamples()) {
- const FunctionSamples *CalleeSamples = &I.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countUsedRecords(CalleeSamples);
- }
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Count += countUsedRecords(CalleeSamples);
+ }
return Count;
}
@@ -343,11 +359,12 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const {
unsigned Count = FS->getBodySamples().size();
// Only count records in hot callsites.
- for (const auto &I : FS->getCallsiteSamples()) {
- const FunctionSamples *CalleeSamples = &I.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Count += countBodyRecords(CalleeSamples);
- }
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Count += countBodyRecords(CalleeSamples);
+ }
return Count;
}
@@ -362,11 +379,12 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const {
Total += I.second.getSamples();
// Only count samples in hot callsites.
- for (const auto &I : FS->getCallsiteSamples()) {
- const FunctionSamples *CalleeSamples = &I.second;
- if (callsiteIsHot(FS, CalleeSamples))
- Total += countBodySamples(CalleeSamples);
- }
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (callsiteIsHot(FS, CalleeSamples))
+ Total += countBodySamples(CalleeSamples);
+ }
return Total;
}
@@ -398,15 +416,11 @@ void SampleProfileLoader::clearFunctionData() {
CoverageTracker.clear();
}
-/// \brief Returns the offset of lineno \p L to head_lineno \p H
-///
-/// \param L Lineno
-/// \param H Header lineno of the function
-///
-/// \returns offset to the header lineno. 16 bits are used to represent offset.
+/// Returns the line offset to the start line of the subprogram.
/// We assume that a single function will not exceed 65535 LOC.
-unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const {
- return (L - H) & 0xffff;
+unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
+ return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
+ 0xffff;
}
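The new getOffset reduces to a subtraction and a 16-bit mask, relying on the stated assumption that a function stays under 65535 lines. In isolation:

    #include <cstdint>
    #include <cstdio>

    // Offset of an instruction's line from its subprogram's start line,
    // masked to 16 bits.
    static uint32_t lineOffset(uint32_t InstLine, uint32_t SubprogramLine) {
      return (InstLine - SubprogramLine) & 0xffff;
    }

    int main() {
      std::printf("%u\n", lineOffset(120, 100)); // 20
      std::printf("%u\n", lineOffset(100, 100)); // 0
      return 0;
    }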
/// \brief Print the weight of edge \p E on stream \p OS.
@@ -451,8 +465,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
/// \param Inst Instruction to query.
///
/// \returns the weight of \p Inst.
-ErrorOr<uint64_t>
-SampleProfileLoader::getInstWeight(const Instruction &Inst) {
+ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
const DebugLoc &DLoc = Inst.getDebugLoc();
if (!DLoc)
return std::error_code();
@@ -470,19 +483,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) {
// If a call/invoke instruction is inlined in profile, but not inlined here,
// it means that the inlined callsite has no sample, thus the call
// instruction should have 0 count.
- bool IsCall = isa<CallInst>(Inst) || isa<InvokeInst>(Inst);
- if (IsCall && findCalleeFunctionSamples(Inst))
+ if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+ findCalleeFunctionSamples(Inst))
return 0;
const DILocation *DIL = DLoc;
- unsigned Lineno = DLoc.getLine();
- unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine();
-
- uint32_t LineOffset = getOffset(Lineno, HeaderLineno);
- uint32_t Discriminator = DIL->getDiscriminator();
- ErrorOr<uint64_t> R = IsCall
- ? FS->findCallSamplesAt(LineOffset, Discriminator)
- : FS->findSamplesAt(LineOffset, Discriminator);
+ uint32_t LineOffset = getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+ ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
if (R) {
bool FirstMark =
CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
@@ -491,13 +499,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) {
LLVMContext &Ctx = F->getContext();
emitOptimizationRemark(
Ctx, DEBUG_TYPE, *F, DLoc,
- Twine("Applied ") + Twine(*R) + " samples from profile (offset: " +
- Twine(LineOffset) +
+ Twine("Applied ") + Twine(*R) +
+ " samples from profile (offset: " + Twine(LineOffset) +
((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")");
}
- DEBUG(dbgs() << " " << Lineno << "." << DIL->getDiscriminator() << ":"
- << Inst << " (line offset: " << Lineno - HeaderLineno << "."
- << DIL->getDiscriminator() << " - weight: " << R.get()
+ DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
<< ")\n");
}
return R;
@@ -511,8 +520,7 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) {
/// \param BB The basic block to query.
///
/// \returns the weight for \p BB.
-ErrorOr<uint64_t>
-SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
+ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
uint64_t Max = 0;
bool HasWeight = false;
for (auto &I : BB->getInstList()) {
@@ -565,16 +573,49 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
if (!DIL) {
return nullptr;
}
- DISubprogram *SP = DIL->getScope()->getSubprogram();
- if (!SP)
- return nullptr;
+
+ StringRef CalleeName;
+ if (const CallInst *CI = dyn_cast<CallInst>(&Inst))
+ if (Function *Callee = CI->getCalledFunction())
+ CalleeName = Callee->getName();
const FunctionSamples *FS = findFunctionSamples(Inst);
if (FS == nullptr)
return nullptr;
- return FS->findFunctionSamplesAt(LineLocation(
- getOffset(DIL->getLine(), SP->getLine()), DIL->getDiscriminator()));
+ return FS->findFunctionSamplesAt(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName);
+}
+
+/// Returns a vector of FunctionSamples that are the indirect call targets
+/// of \p Inst. The vector is sorted by the total number of samples.
+std::vector<const FunctionSamples *>
+SampleProfileLoader::findIndirectCallFunctionSamples(
+ const Instruction &Inst) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ std::vector<const FunctionSamples *> R;
+
+ if (!DIL) {
+ return R;
+ }
+
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return R;
+
+ if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) {
+ if (M->size() == 0)
+ return R;
+ for (const auto &NameFS : *M) {
+ R.push_back(&NameFS.second);
+ }
+ std::sort(R.begin(), R.end(),
+ [](const FunctionSamples *L, const FunctionSamples *R) {
+ return L->getTotalSamples() > R->getTotalSamples();
+ });
+ }
+ return R;
}
/// \brief Get the FunctionSamples for an instruction.
@@ -588,23 +629,23 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
/// \returns the FunctionSamples pointer to the inlined instance.
const FunctionSamples *
SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
- SmallVector<LineLocation, 10> S;
+ SmallVector<std::pair<LineLocation, StringRef>, 10> S;
const DILocation *DIL = Inst.getDebugLoc();
- if (!DIL) {
+ if (!DIL)
return Samples;
- }
+
+ const DILocation *PrevDIL = DIL;
for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
- DISubprogram *SP = DIL->getScope()->getSubprogram();
- if (!SP)
- return nullptr;
- S.push_back(LineLocation(getOffset(DIL->getLine(), SP->getLine()),
- DIL->getDiscriminator()));
+ S.push_back(std::make_pair(
+ LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()),
+ PrevDIL->getScope()->getSubprogram()->getLinkageName()));
+ PrevDIL = DIL;
}
if (S.size() == 0)
return Samples;
const FunctionSamples *FS = Samples;
for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) {
- FS = FS->findFunctionSamplesAt(S[i]);
+ FS = FS->findFunctionSamplesAt(S[i].first, S[i].second);
}
return FS;
}
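findFunctionSamples collects the inline stack innermost-first, then walks the nested profile outermost-first (hence the reverse iteration above). A toy version with a map-of-maps in place of FunctionSamples:

    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // Toy stand-in for FunctionSamples: samples plus nested inlined callsites
    // keyed by (line offset, callee name).
    struct Samples {
      int Total = 0;
      std::map<std::pair<unsigned, std::string>, Samples> Callsites;
    };

    int main() {
      Samples Outer;
      Outer.Total = 100;
      Outer.Callsites[{3, "callee"}].Total = 40; // callee inlined at offset 3

      // Inline stack for an instruction inside the inlined callee,
      // innermost frame first (as getInlinedAt() yields it):
      std::vector<std::pair<unsigned, std::string>> Stack = {{3, "callee"}};

      const Samples *FS = &Outer;
      for (auto It = Stack.rbegin(); It != Stack.rend() && FS; ++It) {
        auto Found = FS->Callsites.find(*It);
        FS = (Found == FS->Callsites.end()) ? nullptr : &Found->second;
      }
      std::printf("samples: %d\n", FS ? FS->Total : -1); // 40
      return 0;
    }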
@@ -614,14 +655,17 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
/// Iteratively traverse all callsites of the function \p F, and find if
/// the corresponding inlined instance exists and is hot in profile. If
 /// it is hot enough, inline the callsites and add new callsites of the
-/// callee into the caller.
-///
-/// TODO: investigate the possibility of not invoking InlineFunction directly.
+/// callee into the caller. If the call is an indirect call, first promote
+/// it to a direct call. Each indirect call is limited to a single target.
///
/// \param F function to perform iterative inlining.
+/// \param ImportGUIDs a set to be updated to include all GUIDs that come
+/// from a different module but were inlined in the profiled binary.
///
 /// \returns True if any inlining happened.
-bool SampleProfileLoader::inlineHotFunctions(Function &F) {
+bool SampleProfileLoader::inlineHotFunctions(
+ Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) {
+ DenseSet<Instruction *> PromotedInsns;
bool Changed = false;
LLVMContext &Ctx = F.getContext();
std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
@@ -635,7 +679,7 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) {
for (auto &I : BB.getInstList()) {
const FunctionSamples *FS = nullptr;
if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
- (FS = findCalleeFunctionSamples(I))) {
+ !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
Candidates.push_back(&I);
if (callsiteIsHot(Samples, FS))
Hot = true;
@@ -647,18 +691,55 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) {
}
for (auto I : CIS) {
InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);
- CallSite CS(I);
- Function *CalledFunction = CS.getCalledFunction();
- if (!CalledFunction || !CalledFunction->getSubprogram())
+ Function *CalledFunction = CallSite(I).getCalledFunction();
+ // Do not inline recursive calls.
+ if (CalledFunction == &F)
continue;
+ Instruction *DI = I;
+ if (!CalledFunction && !PromotedInsns.count(I) &&
+ CallSite(I).isIndirectCall())
+ for (const auto *FS : findIndirectCallFunctionSamples(*I)) {
+ auto CalleeFunctionName = FS->getName();
+ // If it is a recursive call, we do not inline it, as that could bloat
+ // the code exponentially. There is a way to handle this better, e.g.,
+ // clone the caller first, and inline the cloned caller if it is
+ // recursive. As LLVM does not inline recursive calls, we will simply
+ // ignore it instead of handling it explicitly.
+ if (CalleeFunctionName == F.getName())
+ continue;
+ const char *Reason = "Callee function not available";
+ auto R = SymbolMap.find(CalleeFunctionName);
+ if (R == SymbolMap.end())
+ continue;
+ CalledFunction = R->getValue();
+ if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) {
+ // The indirect target was promoted and inlined in the profile; as a
+ // result, we do not have profile info for the branch probability.
+ // We set the probability to 80% taken to indicate that the static
+ // call is likely taken.
+ DI = dyn_cast<Instruction>(
+ promoteIndirectCall(I, CalledFunction, 80, 100, false)
+ ->stripPointerCasts());
+ PromotedInsns.insert(I);
+ } else {
+ DEBUG(dbgs() << "\nFailed to promote indirect call to "
+ << CalleeFunctionName << " because " << Reason
+ << "\n");
+ continue;
+ }
+ }
+ if (!CalledFunction || !CalledFunction->getSubprogram()) {
+ findCalleeFunctionSamples(*I)->findImportedFunctions(
+ ImportGUIDs, F.getParent(),
+ Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
+ continue;
+ }
DebugLoc DLoc = I->getDebugLoc();
- uint64_t NumSamples = findCalleeFunctionSamples(*I)->getTotalSamples();
- if (InlineFunction(CS, IFI)) {
+ if (InlineFunction(CallSite(DI), IFI)) {
LocalChanged = true;
emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc,
Twine("inlined hot callee '") +
- CalledFunction->getName() + "' with " +
- Twine(NumSamples) + " samples into '" +
+ CalledFunction->getName() + "' into '" +
F.getName() + "'");
}
}
@@ -694,9 +775,10 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) {
/// \param DomTree Opposite dominator tree. If \p Descendants is filled
/// with blocks from \p BB1's dominator tree, then
/// this is the post-dominator tree, and vice versa.
+template <bool IsPostDom>
void SampleProfileLoader::findEquivalencesFor(
BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
- DominatorTreeBase<BasicBlock> *DomTree) {
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
const BasicBlock *EC = EquivalenceClass[BB1];
uint64_t Weight = BlockWeights[EC];
for (const auto *BB2 : Descendants) {
@@ -994,6 +1076,26 @@ void SampleProfileLoader::buildEdges(Function &F) {
}
}
+/// Sorts the CallTargetMap \p M by count in descending order and stores the
+/// sorted result in \p Sorted. Returns the total count.
+static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted,
+ const SampleRecord::CallTargetMap &M) {
+ Sorted.clear();
+ uint64_t Sum = 0;
+ for (auto I = M.begin(); I != M.end(); ++I) {
+ Sum += I->getValue();
+ Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()});
+ }
+ std::sort(Sorted.begin(), Sorted.end(),
+ [](const InstrProfValueData &L, const InstrProfValueData &R) {
+ if (L.Count == R.Count)
+ return L.Value > R.Value;
+ else
+ return L.Count > R.Count;
+ });
+ return Sum;
+}
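SortCallTargets orders targets by descending count, breaking ties on the target id so the order is deterministic. The same comparator over a local stand-in for InstrProfValueData:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Local toy type mirroring the shape of InstrProfValueData.
    struct ValueData { uint64_t Value; uint64_t Count; };

    int main() {
      std::vector<ValueData> Targets = {{111, 10}, {222, 30}, {333, 10}};
      uint64_t Sum = 0;
      for (const ValueData &T : Targets)
        Sum += T.Count;

      std::sort(Targets.begin(), Targets.end(),
                [](const ValueData &L, const ValueData &R) {
                  if (L.Count == R.Count)
                    return L.Value > R.Value; // deterministic tie-break
                  return L.Count > R.Count;
                });

      for (const ValueData &T : Targets) // 222:30, 333:10, 111:10
        std::printf("%llu: %llu\n", (unsigned long long)T.Value,
                    (unsigned long long)T.Count);
      std::printf("total = %llu\n", (unsigned long long)Sum); // 50
      return 0;
    }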
+
/// \brief Propagate weights into edges
///
/// The following rules are applied to every block BB in the CFG:
@@ -1015,10 +1117,6 @@ void SampleProfileLoader::propagateWeights(Function &F) {
bool Changed = true;
unsigned I = 0;
- // Add an entry count to the function using the samples gathered
- // at the function entry.
- F.setEntryCount(Samples->getHeadSamples() + 1);
-
// If BB weight is larger than its corresponding loop's header BB weight,
// use the BB weight to replace the loop header BB weight.
for (auto &BI : F) {
@@ -1071,13 +1169,32 @@ void SampleProfileLoader::propagateWeights(Function &F) {
if (BlockWeights[BB]) {
for (auto &I : BB->getInstList()) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (!dyn_cast<IntrinsicInst>(&I)) {
- SmallVector<uint32_t, 1> Weights;
- Weights.push_back(BlockWeights[BB]);
- CI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(Weights));
- }
+ if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
+ continue;
+ CallSite CS(&I);
+ if (!CS.getCalledFunction()) {
+ const DebugLoc &DLoc = I.getDebugLoc();
+ if (!DLoc)
+ continue;
+ const DILocation *DIL = DLoc;
+ uint32_t LineOffset = getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+
+ const FunctionSamples *FS = findFunctionSamples(I);
+ if (!FS)
+ continue;
+ auto T = FS->findCallTargetMapAt(LineOffset, Discriminator);
+ if (!T || T.get().size() == 0)
+ continue;
+ SmallVector<InstrProfValueData, 2> SortedCallTargets;
+ uint64_t Sum = SortCallTargets(SortedCallTargets, T.get());
+ annotateValueSite(*I.getParent()->getParent()->getParent(), I,
+ SortedCallTargets, Sum, IPVK_IndirectCallTarget,
+ SortedCallTargets.size());
+ } else if (!dyn_cast<IntrinsicInst>(&I)) {
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(BlockWeights[BB]);
+ I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
}
}
}
@@ -1087,8 +1204,11 @@ void SampleProfileLoader::propagateWeights(Function &F) {
if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
continue;
+ DebugLoc BranchLoc = TI->getDebugLoc();
DEBUG(dbgs() << "\nGetting weights for branch at line "
- << TI->getDebugLoc().getLine() << ".\n");
+ << ((BranchLoc) ? Twine(BranchLoc.getLine())
+ : Twine("<UNKNOWN LOCATION>"))
+ << ".\n");
SmallVector<uint32_t, 4> Weights;
uint32_t MaxWeight = 0;
DebugLoc MaxDestLoc;
@@ -1115,13 +1235,16 @@ void SampleProfileLoader::propagateWeights(Function &F) {
}
}
+ uint64_t TempWeight;
// Only set weights if there is at least one non-zero weight.
// In any other case, let the analyzer set weights.
- if (MaxWeight > 0) {
+ // Do not set weights if the weights are present. In ThinLTO, the profile
+ // annotation is done twice. If the first annotation already set the
+ // weights, the second pass does not need to set it.
+ if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
TI->setMetadata(llvm::LLVMContext::MD_prof,
MDB.createBranchWeights(Weights));
- DebugLoc BranchLoc = TI->getDebugLoc();
emitOptimizationRemark(
Ctx, DEBUG_TYPE, F, MaxDestLoc,
Twine("most popular destination for conditional branches at ") +
@@ -1163,7 +1286,7 @@ void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
DT.reset(new DominatorTree);
DT->recalculate(F);
- PDT.reset(new DominatorTreeBase<BasicBlock>(true));
+ PDT.reset(new PostDomTreeBase<BasicBlock>());
PDT->recalculate(F);
LI.reset(new LoopInfo);
@@ -1228,12 +1351,19 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
<< ": " << getFunctionLoc(F) << "\n");
- Changed |= inlineHotFunctions(F);
+ DenseSet<GlobalValue::GUID> ImportGUIDs;
+ Changed |= inlineHotFunctions(F, ImportGUIDs);
// Compute basic block weights.
Changed |= computeBlockWeights(F);
if (Changed) {
+ // Add an entry count to the function using the samples gathered at the
+ // function entry. Also set the GUIDs that come from a different
+ // module but were inlined in the profiled binary. This aims at making
+ // the IR match the profiled binary before annotation.
+ F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs);
+
// Compute dominance and loop info needed for propagation.
computeDominanceAndLoopInfo(F);
@@ -1309,6 +1439,26 @@ bool SampleProfileLoader::runOnModule(Module &M) {
for (const auto &I : Reader->getProfiles())
TotalCollectedSamples += I.second.getTotalSamples();
+ // Populate the symbol map.
+ for (const auto &N_F : M.getValueSymbolTable()) {
+ std::string OrigName = N_F.getKey();
+ Function *F = dyn_cast<Function>(N_F.getValue());
+ if (F == nullptr)
+ continue;
+ SymbolMap[OrigName] = F;
+ auto pos = OrigName.find('.');
+ if (pos != std::string::npos) {
+ std::string NewName = OrigName.substr(0, pos);
+ auto r = SymbolMap.insert(std::make_pair(NewName, F));
+ // Failing to insert means there is already an entry in SymbolMap;
+ // thus multiple functions are mapped to the same stripped name.
+ // In the case of such a name conflict, set the value to nullptr
+ // to avoid confusion.
+ if (!r.second)
+ r.first->second = nullptr;
+ }
+ }
+
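The symbol-map population above also indexes functions under their suffix-stripped names, poisoning an entry with nullptr when two functions collide on the same stripped name. A sketch of that logic (the suffixed names are illustrative):

    #include <cstdio>
    #include <string>
    #include <unordered_map>

    struct Function { const char *Tag; };

    int main() {
      Function A{"A"}, B{"B"};
      std::unordered_map<std::string, Function *> SymbolMap;

      auto addFunction = [&](const std::string &OrigName, Function *F) {
        SymbolMap[OrigName] = F;
        auto Pos = OrigName.find('.');
        if (Pos != std::string::npos) {
          auto R = SymbolMap.insert({OrigName.substr(0, Pos), F});
          if (!R.second)               // stripped name already taken:
            R.first->second = nullptr; // conflict, refuse to resolve it
        }
      };

      addFunction("foo.llvm.1234", &A);
      addFunction("foo.part.2", &B); // both strip to "foo"

      Function *F = SymbolMap["foo"];
      std::printf("foo resolves to: %s\n", F ? F->Tag : "conflict"); // conflict
      return 0;
    }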
bool retval = false;
for (auto &F : M)
if (!F.isDeclaration()) {
@@ -1329,7 +1479,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
bool SampleProfileLoader::runOnFunction(Function &F) {
F.setEntryCount(0);
Samples = Reader->getSamplesFor(F);
- if (!Samples->empty())
+ if (Samples && !Samples->empty())
return emitAnnotations(F);
return false;
}
@@ -1337,7 +1487,8 @@ bool SampleProfileLoader::runOnFunction(Function &F) {
PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
ModuleAnalysisManager &AM) {
- SampleProfileLoader SampleLoader(SampleProfileFile);
+ SampleProfileLoader SampleLoader(
+ ProfileFileName.empty() ? SampleProfileFile : ProfileFileName);
SampleLoader.doInitialization(M);
diff --git a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
index 8f6f161..de1b51e 100644
--- a/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -20,7 +20,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
@@ -30,6 +29,7 @@
#include "llvm/IR/TypeFinder.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -323,6 +323,14 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
LiveGVs.insert(GVE);
}
+ std::set<DICompileUnit *> LiveCUs;
+ // Any CU referenced from a subprogram is live.
+ for (DISubprogram *SP : F.subprograms()) {
+ if (SP->getUnit())
+ LiveCUs.insert(SP->getUnit());
+ }
+
+ bool HasDeadCUs = false;
for (DICompileUnit *DIC : F.compile_units()) {
// Create our live global variable list.
bool GlobalVariableChange = false;
@@ -341,6 +349,11 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
GlobalVariableChange = true;
}
+ if (!LiveGlobalVariables.empty())
+ LiveCUs.insert(DIC);
+ else if (!LiveCUs.count(DIC))
+ HasDeadCUs = true;
+
// If we found dead global variables, replace the current global
// variable list with our new live global variable list.
if (GlobalVariableChange) {
@@ -352,5 +365,16 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
LiveGlobalVariables.clear();
}
+ if (HasDeadCUs) {
+    // Rebuild the llvm.dbg.cu named metadata, keeping only the live CUs.
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+ NMD->clearOperands();
+      for (DICompileUnit *CU : LiveCUs)
+        NMD->addOperand(CU);
+ Changed = true;
+ }
+
return Changed;
}
diff --git a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 3680cfc..8ef6bb6 100644
--- a/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -6,99 +6,67 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This pass prepares a module containing type metadata for ThinLTO by splitting
-// it into regular and thin LTO parts if possible, and writing both parts to
-// a multi-module bitcode file. Modules that do not contain type metadata are
-// written unmodified as a single module.
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
namespace {
-// Produce a unique identifier for this module by taking the MD5 sum of the
-// names of the module's strong external symbols. This identifier is
-// normally guaranteed to be unique, or the program would fail to link due to
-// multiply defined symbols.
-//
-// If the module has no strong external symbols (such a module may still have a
-// semantic effect if it performs global initialization), we cannot produce a
-// unique identifier for this module, so we return the empty string, which
-// causes the entire module to be written as a regular LTO module.
-std::string getModuleId(Module *M) {
- MD5 Md5;
- bool ExportsSymbols = false;
- auto AddGlobal = [&](GlobalValue &GV) {
- if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
- !GV.hasExternalLinkage())
- return;
- ExportsSymbols = true;
- Md5.update(GV.getName());
- Md5.update(ArrayRef<uint8_t>{0});
- };
-
- for (auto &F : *M)
- AddGlobal(F);
- for (auto &GV : M->globals())
- AddGlobal(GV);
- for (auto &GA : M->aliases())
- AddGlobal(GA);
- for (auto &IF : M->ifuncs())
- AddGlobal(IF);
-
- if (!ExportsSymbols)
- return "";
-
- MD5::MD5Result R;
- Md5.final(R);
-
- SmallString<32> Str;
- MD5::stringifyResult(R, Str);
- return ("$" + Str).str();
-}
-
// Promote each local-linkage entity defined by ExportM and used by ImportM by
// changing visibility and appending the given ModuleId.
-void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
- auto PromoteInternal = [&](GlobalValue &ExportGV) {
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
+ SetVector<GlobalValue *> &PromoteExtra) {
+ DenseMap<const Comdat *, Comdat *> RenamedComdats;
+ for (auto &ExportGV : ExportM.global_values()) {
if (!ExportGV.hasLocalLinkage())
- return;
+ continue;
+
+ auto Name = ExportGV.getName();
+ GlobalValue *ImportGV = ImportM.getNamedValue(Name);
+ if ((!ImportGV || ImportGV->use_empty()) && !PromoteExtra.count(&ExportGV))
+ continue;
- GlobalValue *ImportGV = ImportM.getNamedValue(ExportGV.getName());
- if (!ImportGV || ImportGV->use_empty())
- return;
+ std::string NewName = (Name + ModuleId).str();
- std::string NewName = (ExportGV.getName() + ModuleId).str();
+ if (const auto *C = ExportGV.getComdat())
+ if (C->getName() == Name)
+ RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
ExportGV.setName(NewName);
ExportGV.setLinkage(GlobalValue::ExternalLinkage);
ExportGV.setVisibility(GlobalValue::HiddenVisibility);
- ImportGV->setName(NewName);
- ImportGV->setVisibility(GlobalValue::HiddenVisibility);
- };
+ if (ImportGV) {
+ ImportGV->setName(NewName);
+ ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+ }
- for (auto &F : ExportM)
- PromoteInternal(F);
- for (auto &GV : ExportM.globals())
- PromoteInternal(GV);
- for (auto &GA : ExportM.aliases())
- PromoteInternal(GA);
- for (auto &IF : ExportM.ifuncs())
- PromoteInternal(IF);
+ if (!RenamedComdats.empty())
+ for (auto &GO : ExportM.global_objects())
+ if (auto *C = GO.getComdat()) {
+ auto Replacement = RenamedComdats.find(C);
+ if (Replacement != RenamedComdats.end())
+ GO.setComdat(Replacement->second);
+ }
}
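Concretely, assuming a hypothetical module ID of "$aabbcc", a local-linkage symbol foo that is used across the split becomes

    foo  ->  foo$aabbcc   (external linkage, hidden visibility)

and a comdat named foo is renamed the same way, so every member of the group stays attached to the renamed comdat.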
// Promote all internal (i.e. distinct) type ids used by the module by replacing
@@ -194,24 +162,7 @@ void simplifyExternals(Module &M) {
}
void filterModule(
- Module *M, std::function<bool(const GlobalValue *)> ShouldKeepDefinition) {
- for (Function &F : *M) {
- if (ShouldKeepDefinition(&F))
- continue;
-
- F.deleteBody();
- F.clearMetadata();
- }
-
- for (GlobalVariable &GV : M->globals()) {
- if (ShouldKeepDefinition(&GV))
- continue;
-
- GV.setInitializer(nullptr);
- GV.setLinkage(GlobalValue::ExternalLinkage);
- GV.clearMetadata();
- }
-
+ Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
I != E;) {
GlobalAlias *GA = &*I++;
@@ -219,65 +170,227 @@ void filterModule(
continue;
GlobalObject *GO;
- if (I->getValueType()->isFunctionTy())
+ if (GA->getValueType()->isFunctionTy())
GO = Function::Create(cast<FunctionType>(GA->getValueType()),
GlobalValue::ExternalLinkage, "", M);
else
GO = new GlobalVariable(
*M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+ nullptr, "", nullptr,
GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
GO->takeName(GA);
GA->replaceAllUsesWith(GO);
GA->eraseFromParent();
}
+
+ for (Function &F : *M) {
+ if (ShouldKeepDefinition(&F))
+ continue;
+
+ F.deleteBody();
+ F.setComdat(nullptr);
+ F.clearMetadata();
+ }
+
+ for (GlobalVariable &GV : M->globals()) {
+ if (ShouldKeepDefinition(&GV))
+ continue;
+
+ GV.setInitializer(nullptr);
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ GV.setComdat(nullptr);
+ GV.clearMetadata();
+ }
+}
+
+void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
+ if (auto *F = dyn_cast<Function>(C))
+ return Fn(F);
+ if (isa<GlobalValue>(C))
+ return;
+ for (Value *Op : C->operands())
+ forEachVirtualFunction(cast<Constant>(Op), Fn);
}
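Why the recursion: vtable initializers are nested constant aggregates whose function pointers are typically wrapped in cast ConstantExprs, so the helper must walk operands rather than scan only the top level. A short usage sketch (hypothetical variable names):

    // Collect every virtual function referenced by a vtable's initializer,
    // looking through ConstantStruct/ConstantArray nesting and bitcasts.
    SmallVector<Function *, 4> VFns;
    forEachVirtualFunction(VTableGV->getInitializer(),
                           [&](Function *F) { VFns.push_back(F); });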
// If it's possible to split M into regular and thin LTO parts, do so and write
// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
// regular LTO bitcode file to OS.
-void splitAndWriteThinLTOBitcode(raw_ostream &OS, Module &M) {
- std::string ModuleId = getModuleId(&M);
+void splitAndWriteThinLTOBitcode(
+ raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter, Module &M) {
+ std::string ModuleId = getUniqueModuleId(&M);
if (ModuleId.empty()) {
// We couldn't generate a module ID for this module, just write it out as a
// regular LTO module.
WriteBitcodeToFile(&M, OS);
+ if (ThinLinkOS)
+ // We don't have a ThinLTO part, but still write the module to the
+ // ThinLinkOS if requested so that the expected output file is produced.
+ WriteBitcodeToFile(&M, *ThinLinkOS);
return;
}
promoteTypeIds(M, ModuleId);
- auto IsInMergedM = [&](const GlobalValue *GV) {
- auto *GVar = dyn_cast<GlobalVariable>(GV->getBaseObject());
- if (!GVar)
- return false;
-
+ // Returns whether a global has attached type metadata. Such globals may
+ // participate in CFI or whole-program devirtualization, so they need to
+ // appear in the merged module instead of the thin LTO module.
+ auto HasTypeMetadata = [&](const GlobalObject *GO) {
SmallVector<MDNode *, 1> MDs;
- GVar->getMetadata(LLVMContext::MD_type, MDs);
+ GO->getMetadata(LLVMContext::MD_type, MDs);
return !MDs.empty();
};
+ // Collect the set of virtual functions that are eligible for virtual constant
+ // propagation. Each eligible function must not access memory, must return
+ // an integer of width <=64 bits, must take at least one argument, must not
+ // use its first argument (assumed to be "this") and all arguments other than
+ // the first one must be of <=64 bit integer type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
+ std::set<const Function *> EligibleVirtualFns;
+ // If any member of a comdat lives in MergedM, put all members of that
+ // comdat in MergedM to keep the comdat together.
+ DenseSet<const Comdat *> MergedMComdats;
+ for (GlobalVariable &GV : M.globals())
+ if (HasTypeMetadata(&GV)) {
+ if (const auto *C = GV.getComdat())
+ MergedMComdats.insert(C);
+ forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
+ auto *RT = dyn_cast<IntegerType>(F->getReturnType());
+ if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
+ !F->arg_begin()->use_empty())
+ return;
+ for (auto &Arg : make_range(std::next(F->arg_begin()), F->arg_end())) {
+ auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
+ if (!ArgT || ArgT->getBitWidth() > 64)
+ return;
+ }
+ if (!F->isDeclaration() &&
+ computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
+ EligibleVirtualFns.insert(F);
+ });
+ }
+
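A hypothetical source-level illustration of these eligibility rules:

    // Eligible: defined, readnone, 'this' unused, <=64-bit integer
    // argument and return types.
    struct Shape {
      virtual int kindTag(int Bias) { return 7 + Bias; }
    };
    // Ineligible: reads memory through 'this'.
    struct Circle : Shape {
      int R;
      int kindTag(int Bias) override { return R + Bias; }
    };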
ValueToValueMapTy VMap;
- std::unique_ptr<Module> MergedM(CloneModule(&M, VMap, IsInMergedM));
+ std::unique_ptr<Module> MergedM(
+ CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool {
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return true;
+ if (auto *F = dyn_cast<Function>(GV))
+ return EligibleVirtualFns.count(F);
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ return HasTypeMetadata(GVar);
+ return false;
+ }));
+ StripDebugInfo(*MergedM);
+
+ for (Function &F : *MergedM)
+ if (!F.isDeclaration()) {
+ // Reset the linkage of all functions eligible for virtual constant
+ // propagation. The canonical definitions live in the thin LTO module so
+ // that they can be imported.
+ F.setLinkage(GlobalValue::AvailableExternallyLinkage);
+ F.setComdat(nullptr);
+ }
- filterModule(&M, [&](const GlobalValue *GV) { return !IsInMergedM(GV); });
+ SetVector<GlobalValue *> CfiFunctions;
+ for (auto &F : M)
+ if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
+ CfiFunctions.insert(&F);
+
+ // Remove all globals with type metadata, globals with comdats that live in
+ // MergedM, and aliases pointing to such globals from the thin LTO module.
+ filterModule(&M, [&](const GlobalValue *GV) {
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ if (HasTypeMetadata(GVar))
+ return false;
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return false;
+ return true;
+ });
+
+ promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
+ promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+
+ SmallVector<MDNode *, 8> CfiFunctionMDs;
+ for (auto V : CfiFunctions) {
+ Function &F = *cast<Function>(V);
+ SmallVector<MDNode *, 2> Types;
+ F.getMetadata(LLVMContext::MD_type, Types);
+
+ auto &Ctx = MergedM->getContext();
+ SmallVector<Metadata *, 4> Elts;
+ Elts.push_back(MDString::get(Ctx, F.getName()));
+ CfiFunctionLinkage Linkage;
+ if (!F.isDeclarationForLinker())
+ Linkage = CFL_Definition;
+ else if (F.isWeakForLinker())
+ Linkage = CFL_WeakDeclaration;
+ else
+ Linkage = CFL_Declaration;
+ Elts.push_back(ConstantAsMetadata::get(
+ llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
+ for (auto Type : Types)
+ Elts.push_back(Type);
+ CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
+ }
- promoteInternals(*MergedM, M, ModuleId);
- promoteInternals(M, *MergedM, ModuleId);
+  if (!CfiFunctionMDs.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
+ for (auto MD : CfiFunctionMDs)
+ NMD->addOperand(MD);
+ }
simplifyExternals(*MergedM);
- SmallVector<char, 0> Buffer;
- BitcodeWriter W(Buffer);
-
// FIXME: Try to re-use BSI and PFI from the original module here.
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
- W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
- /*GenerateHash=*/true);
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
- W.writeModule(MergedM.get());
+ // Mark the merged module as requiring full LTO. We still want an index for
+ // it though, so that it can participate in summary-based dead stripping.
+ MergedM->addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex MergedMIndex =
+ buildModuleSummaryIndex(*MergedM, nullptr, &PSI);
+ SmallVector<char, 0> Buffer;
+
+ BitcodeWriter W(Buffer);
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/true, &ModHash);
+ W.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
+ &MergedMIndex);
+ W.writeSymtab();
+ W.writeStrtab();
OS << Buffer;
+
+ // If a minimized bitcode module was requested for the thin link,
+ // strip the debug info (the merged module was already stripped above)
+ // and write it to the given OS.
+ if (ThinLinkOS) {
+ Buffer.clear();
+ BitcodeWriter W2(Buffer);
+ StripDebugInfo(M);
+ W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/false, &ModHash);
+ W2.writeModule(MergedM.get(), /*ShouldPreserveUseListOrder=*/false,
+ &MergedMIndex);
+ W2.writeSymtab();
+ W2.writeStrtab();
+ *ThinLinkOS << Buffer;
+ }
}
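A hedged sketch of driving this writer through the legacy pass manager with both output streams (file names and error handling are illustrative only):

    std::error_code EC;
    raw_fd_ostream FullOS("m.bc", EC, sys::fs::F_None);
    raw_fd_ostream ThinLinkOS("m.thinlink.bc", EC, sys::fs::F_None);
    legacy::PassManager PM;
    PM.add(createWriteThinLTOBitcodePass(FullOS, &ThinLinkOS));
    PM.run(M);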
// Returns whether this module needs to be split because it uses type metadata.
@@ -292,28 +405,45 @@ bool requiresSplit(Module &M) {
return false;
}
-void writeThinLTOBitcode(raw_ostream &OS, Module &M,
- const ModuleSummaryIndex *Index) {
+void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter,
+ Module &M, const ModuleSummaryIndex *Index) {
// See if this module has any type metadata. If so, we need to split it.
if (requiresSplit(M))
- return splitAndWriteThinLTOBitcode(OS, M);
+ return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
// Otherwise we can just write it out as a regular module.
+
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
- /*GenerateHash=*/true);
+ /*GenerateHash=*/true, &ModHash);
+ // If a minimized bitcode module was requested for the thin link,
+ // strip the debug info and write it to the given OS.
+ if (ThinLinkOS) {
+ StripDebugInfo(M);
+ WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+ Index,
+ /*GenerateHash=*/false, &ModHash);
+ }
}
class WriteThinLTOBitcode : public ModulePass {
raw_ostream &OS; // raw_ostream to print on
+ // The output stream on which to emit a minimized module for use
+ // just in the thin link, if requested.
+ raw_ostream *ThinLinkOS;
public:
static char ID; // Pass identification, replacement for typeid
- WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) {
+ WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
}
- explicit WriteThinLTOBitcode(raw_ostream &o)
- : ModulePass(ID), OS(o) {
+ explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
+ : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
}
@@ -322,12 +452,14 @@ public:
bool runOnModule(Module &M) override {
const ModuleSummaryIndex *Index =
&(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
- writeThinLTOBitcode(OS, M, Index);
+ writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
return true;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
+ AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ModuleSummaryIndexWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
} // anonymous namespace
@@ -335,10 +467,25 @@ public:
char WriteThinLTOBitcode::ID = 0;
INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
"Write ThinLTO Bitcode", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
"Write ThinLTO Bitcode", false, true)
-ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str) {
- return new WriteThinLTOBitcode(Str);
+ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
+ raw_ostream *ThinLinkOS) {
+ return new WriteThinLTOBitcode(Str, ThinLinkOS);
+}
+
+PreservedAnalyses
+llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ writeThinLTOBitcode(OS, ThinLinkOS,
+ [&FAM](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ },
+ M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
+ return PreservedAnalyses::all();
}
diff --git a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 844cc0f..00769cd 100644
--- a/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/contrib/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -25,6 +25,20 @@
// returns 0, or a single vtable's function returns 1, replace each virtual
// call with a comparison of the vptr against that vtable's address.
//
+// This pass is intended to be used during the regular and thin LTO pipelines.
+// During regular LTO, the pass determines the best optimization for each
+// virtual call and applies the resolutions directly to virtual calls that are
+// eligible for virtual call optimization (i.e. calls that use either of the
+// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During
+// ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over a single merged module
+// that contains all vtables with !type metadata that participate in the link.
+// The pass computes a resolution for each virtual call and stores it in the
+// type identifier summary.
+// - Import phase: this is run during the thin backends over the individual
+//   modules. The pass applies the resolutions previously computed during the
+//   export phase to each eligible virtual call.
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
@@ -32,9 +46,11 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/iterator_range.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
@@ -54,12 +70,16 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/Utils/Evaluator.h"
#include <algorithm>
#include <cstddef>
@@ -72,6 +92,26 @@ using namespace wholeprogramdevirt;
#define DEBUG_TYPE "wholeprogramdevirt"
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "wholeprogramdevirt-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "wholeprogramdevirt-read-summary",
+ cl::desc("Read summary from given YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "wholeprogramdevirt-write-summary",
+ cl::desc("Write summary to given YAML file after running pass"),
+ cl::Hidden);
+
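These flags exist so that each phase can be exercised in isolation with opt; a plausible export-phase invocation (file names illustrative) is:

    opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export \
        -wholeprogramdevirt-write-summary=summary.yaml merged.bc -o devirt.bc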
// Find the minimum offset that we may store a value of size Size bits at. If
// IsAfter is set, look for an offset before the object, otherwise look for an
// offset after the object.
@@ -259,15 +299,92 @@ struct VirtualCallSite {
}
};
+// Call site information collected for a specific VTableSlot and possibly a list
+// of constant integer arguments. The grouping by arguments is handled by the
+// VTableSlotInfo class.
+struct CallSiteInfo {
+ /// The set of call sites for this slot. Used during regular LTO and the
+ /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
+ /// call sites that appear in the merged module itself); in each of these
+ /// cases we are directly operating on the call sites at the IR level.
+ std::vector<VirtualCallSite> CallSites;
+
+ // These fields are used during the export phase of ThinLTO and reflect
+ // information collected from function summaries.
+
+ /// Whether any function summary contains an llvm.assume(llvm.type.test) for
+ /// this slot.
+  bool SummaryHasTypeTestAssumeUsers = false;
+
+ /// CFI-specific: a vector containing the list of function summaries that use
+ /// the llvm.type.checked.load intrinsic and therefore will require
+ /// resolutions for llvm.type.test in order to implement CFI checks if
+ /// devirtualization was unsuccessful. If devirtualization was successful, the
+ /// pass will clear this vector by calling markDevirt(). If at the end of the
+ /// pass the vector is non-empty, we will need to add a use of llvm.type.test
+ /// to each of the function summaries in the vector.
+ std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+
+ bool isExported() const {
+ return SummaryHasTypeTestAssumeUsers ||
+ !SummaryTypeCheckedLoadUsers.empty();
+ }
+
+ /// As explained in the comment for SummaryTypeCheckedLoadUsers.
+ void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); }
+};
+
+// Call site information collected for a specific VTableSlot.
+struct VTableSlotInfo {
+ // The set of call sites which do not have all constant integer arguments
+ // (excluding "this").
+ CallSiteInfo CSInfo;
+
+ // The set of call sites with all constant integer arguments (excluding
+ // "this"), grouped by argument list.
+ std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
+
+ void addCallSite(Value *VTable, CallSite CS, unsigned *NumUnsafeUses);
+
+private:
+ CallSiteInfo &findCallSiteInfo(CallSite CS);
+};
+
+CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
+ std::vector<uint64_t> Args;
+ auto *CI = dyn_cast<IntegerType>(CS.getType());
+ if (!CI || CI->getBitWidth() > 64 || CS.arg_empty())
+ return CSInfo;
+ for (auto &&Arg : make_range(CS.arg_begin() + 1, CS.arg_end())) {
+ auto *CI = dyn_cast<ConstantInt>(Arg);
+ if (!CI || CI->getBitWidth() > 64)
+ return CSInfo;
+ Args.push_back(CI->getZExtValue());
+ }
+ return ConstCSInfo[Args];
+}
+
+void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS,
+ unsigned *NumUnsafeUses) {
+ findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses});
+}
+
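How findCallSiteInfo buckets calls, for a hypothetical virtual function int A::f(int, int):

    p->f(1, 2); // all non-'this' args are constant ints -> ConstCSInfo[{1, 2}]
    p->f(x, 2); // non-constant argument                  -> CSInfo
    q->g();     // non-integer (e.g. void) return type    -> CSInfo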
struct DevirtModule {
Module &M;
+ function_ref<AAResults &(Function &)> AARGetter;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
IntegerType *Int8Ty;
PointerType *Int8PtrTy;
IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
bool RemarksEnabled;
- MapVector<VTableSlot, std::vector<VirtualCallSite>> CallSlots;
+ MapVector<VTableSlot, VTableSlotInfo> CallSlots;
// This map keeps track of the number of "unsafe" uses of a loaded function
// pointer. The key is the associated llvm.type.test intrinsic call generated
@@ -279,11 +396,18 @@ struct DevirtModule {
// true.
std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
- DevirtModule(Module &M)
- : M(M), Int8Ty(Type::getInt8Ty(M.getContext())),
+ DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())),
Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
Int32Ty(Type::getInt32Ty(M.getContext())),
- RemarksEnabled(areRemarksEnabled()) {}
+ Int64Ty(Type::getInt64Ty(M.getContext())),
+ IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
+ RemarksEnabled(areRemarksEnabled()) {
+ assert(!(ExportSummary && ImportSummary));
+ }
bool areRemarksEnabled();
@@ -298,57 +422,169 @@ struct DevirtModule {
tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
const std::set<TypeMemberInfo> &TypeMemberInfos,
uint64_t ByteOffset);
+
+ void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
+ bool &IsExported);
bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites);
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res);
+
bool tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<ConstantInt *> Args);
- bool tryUniformRetValOpt(IntegerType *RetType,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites);
+ ArrayRef<uint64_t> Args);
+
+ void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal);
+ bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res);
+
+ // Returns the global symbol name that is used to export information about the
+ // given vtable slot and list of arguments.
+ std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name);
+
+ // This function is called during the export phase to create a symbol
+ // definition containing information about the given vtable slot and list of
+ // arguments.
+ void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+ Constant *C);
+
+ // This function is called during the import phase to create a reference to
+ // the symbol definition created during the export phase.
+ Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, unsigned AbsWidth = 0);
+
+ void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
+ Constant *UniqueMemberAddr);
bool tryUniqueRetValOpt(unsigned BitWidth,
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites);
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args);
+
+ void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit);
bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<VirtualCallSite> CallSites);
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
void rebuildGlobal(VTableBits &B);
+ // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
+ void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
+
+ // If we were able to eliminate all unsafe uses for a type checked load,
+ // eliminate the associated type tests by replacing them with true.
+ void removeRedundantTypeTests();
+
bool run();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M,
+ function_ref<AAResults &(Function &)> AARGetter);
};
struct WholeProgramDevirt : public ModulePass {
static char ID;
- WholeProgramDevirt() : ModulePass(ID) {
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
+ if (UseCommandLine)
+ return DevirtModule::runForTesting(M, LegacyAARGetter(*this));
+ return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary)
+ .run();
+ }
- return DevirtModule(M).run();
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
} // end anonymous namespace
-INITIALIZE_PASS(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
+INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
char WholeProgramDevirt::ID = 0;
-ModulePass *llvm::createWholeProgramDevirtPass() {
- return new WholeProgramDevirt;
+ModulePass *
+llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new WholeProgramDevirt(ExportSummary, ImportSummary);
}
PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
- ModuleAnalysisManager &) {
- if (!DevirtModule(M).run())
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+ if (!DevirtModule(M, AARGetter, nullptr, nullptr).run())
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
+bool DevirtModule::runForTesting(
+ Module &M, function_ref<AAResults &(Function &)> AARGetter) {
+ ModuleSummaryIndex Summary;
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed =
+ DevirtModule(
+ M, AARGetter,
+ ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ .run();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr(
+ "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
+
+ return Changed;
+}
+
void DevirtModule::buildTypeIdentifierMap(
std::vector<VTableBits> &Bits,
DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
@@ -443,9 +679,31 @@ bool DevirtModule::tryFindVirtualCallTargets(
return !TargetsForSlot.empty();
}
+void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
+ Constant *TheFn, bool &IsExported) {
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ if (RemarksEnabled)
+ VCallSite.emitRemark("single-impl", TheFn->getName());
+ VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
+ TheFn, VCallSite.CS.getCalledValue()->getType()));
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ if (CSInfo.isExported()) {
+ IsExported = true;
+ CSInfo.markDevirt();
+ }
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
+
bool DevirtModule::trySingleImplDevirt(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites) {
+ VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) {
// See if the program contains a single implementation of this virtual
// function.
Function *TheFn = TargetsForSlot[0].Fn;
@@ -453,39 +711,51 @@ bool DevirtModule::trySingleImplDevirt(
if (TheFn != Target.Fn)
return false;
+ // If so, update each call site to call that implementation directly.
if (RemarksEnabled)
TargetsForSlot[0].WasDevirt = true;
- // If so, update each call site to call that implementation directly.
- for (auto &&VCallSite : CallSites) {
- if (RemarksEnabled)
- VCallSite.emitRemark("single-impl", TheFn->getName());
- VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
- TheFn, VCallSite.CS.getCalledValue()->getType()));
- // This use is no longer unsafe.
- if (VCallSite.NumUnsafeUses)
- --*VCallSite.NumUnsafeUses;
+
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, TheFn, IsExported);
+ if (!IsExported)
+ return false;
+
+ // If the only implementation has local linkage, we must promote to external
+ // to make it visible to thin LTO objects. We can only get here during the
+ // ThinLTO export phase.
+ if (TheFn->hasLocalLinkage()) {
+ TheFn->setLinkage(GlobalValue::ExternalLinkage);
+ TheFn->setVisibility(GlobalValue::HiddenVisibility);
+ TheFn->setName(TheFn->getName() + "$merged");
}
+
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+ Res->SingleImplName = TheFn->getName();
+
return true;
}
bool DevirtModule::tryEvaluateFunctionsWithArgs(
MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<ConstantInt *> Args) {
+ ArrayRef<uint64_t> Args) {
// Evaluate each function and store the result in each target's RetVal
// field.
for (VirtualCallTarget &Target : TargetsForSlot) {
if (Target.Fn->arg_size() != Args.size() + 1)
return false;
- for (unsigned I = 0; I != Args.size(); ++I)
- if (Target.Fn->getFunctionType()->getParamType(I + 1) !=
- Args[I]->getType())
- return false;
Evaluator Eval(M.getDataLayout(), nullptr);
SmallVector<Constant *, 2> EvalArgs;
EvalArgs.push_back(
Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
- EvalArgs.insert(EvalArgs.end(), Args.begin(), Args.end());
+ for (unsigned I = 0; I != Args.size(); ++I) {
+ auto *ArgTy = dyn_cast<IntegerType>(
+ Target.Fn->getFunctionType()->getParamType(I + 1));
+ if (!ArgTy)
+ return false;
+ EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
+ }
+
Constant *RetVal;
if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
!isa<ConstantInt>(RetVal))
@@ -495,9 +765,18 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs(
return true;
}
+void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal) {
+ for (auto Call : CSInfo.CallSites)
+ Call.replaceAndErase(
+ "uniform-ret-val", FnName, RemarksEnabled,
+ ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal));
+ CSInfo.markDevirt();
+}
+
bool DevirtModule::tryUniformRetValOpt(
- IntegerType *RetType, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites) {
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res) {
// Uniform return value optimization. If all functions return the same
// constant, replace all calls with that constant.
uint64_t TheRetVal = TargetsForSlot[0].RetVal;
@@ -505,19 +784,77 @@ bool DevirtModule::tryUniformRetValOpt(
if (Target.RetVal != TheRetVal)
return false;
- auto TheRetValConst = ConstantInt::get(RetType, TheRetVal);
- for (auto Call : CallSites)
- Call.replaceAndErase("uniform-ret-val", TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, TheRetValConst);
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
+ Res->Info = TheRetVal;
+ }
+
+ applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
if (RemarksEnabled)
for (auto &&Target : TargetsForSlot)
Target.WasDevirt = true;
return true;
}
+std::string DevirtModule::getGlobalName(VTableSlot Slot,
+ ArrayRef<uint64_t> Args,
+ StringRef Name) {
+ std::string FullName = "__typeid_";
+ raw_string_ostream OS(FullName);
+ OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
+ for (uint64_t Arg : Args)
+ OS << '_' << Arg;
+ OS << '_' << Name;
+ return OS.str();
+}
+
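For example, a slot with type id _ZTS1A at byte offset 0, constant argument list {1}, and name "byte" yields

    __typeid__ZTS1A_0_1_byte

(the "__typeid_" prefix followed by the underscore-joined components).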
+void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, Constant *C) {
+ GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ getGlobalName(Slot, Args, Name), C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+}
+
+Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, unsigned AbsWidth) {
+ Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+ return C;
+
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else if (AbsWidth)
+ SetAbsRange(0, 1ull << AbsWidth);
+ return GV;
+}
+
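The !absolute_symbol range follows the usual half-open [Lo, Hi) convention, with the all-ones pair denoting the full set. For instance, assuming a 64-bit IntPtrTy (the "bit"/"byte" widths match the import code further down; "x" is hypothetical):

    importGlobal(Slot, Args, "bit", 8);   // range [0, 2^8)
    importGlobal(Slot, Args, "byte", 32); // range [0, 2^32)
    importGlobal(Slot, Args, "x", 64);    // {-1, -1}: full set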
+void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ bool IsOne,
+ Constant *UniqueMemberAddr) {
+ for (auto &&Call : CSInfo.CallSites) {
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ Call.VTable, UniqueMemberAddr);
+ Cmp = B.CreateZExt(Cmp, Call.CS->getType());
+ Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp);
+ }
+ CSInfo.markDevirt();
+}
+
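A hypothetical before/after for the unique-return-value case, where only one implementation returns true:

    bool r = p->isFoo();
    // becomes a vptr comparison against the unique member's address:
    bool r = (VTable == UniqueMemberAddr); // zext'd to the call's return type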
bool DevirtModule::tryUniqueRetValOpt(
unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- MutableArrayRef<VirtualCallSite> CallSites) {
+ CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args) {
// IsOne controls whether we look for a 0 or a 1.
auto tryUniqueRetValOptFor = [&](bool IsOne) {
const TypeMemberInfo *UniqueMember = nullptr;
@@ -533,16 +870,23 @@ bool DevirtModule::tryUniqueRetValOpt(
// checked for a uniform return value in tryUniformRetValOpt.
assert(UniqueMember);
- // Replace each call with the comparison.
- for (auto &&Call : CallSites) {
- IRBuilder<> B(Call.CS.getInstruction());
- Value *OneAddr = B.CreateBitCast(UniqueMember->Bits->GV, Int8PtrTy);
- OneAddr = B.CreateConstGEP1_64(OneAddr, UniqueMember->Offset);
- Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- Call.VTable, OneAddr);
- Call.replaceAndErase("unique-ret-val", TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, Cmp);
+ Constant *UniqueMemberAddr =
+ ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy);
+ UniqueMemberAddr = ConstantExpr::getGetElementPtr(
+ Int8Ty, UniqueMemberAddr,
+ ConstantInt::get(Int64Ty, UniqueMember->Offset));
+
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
+ Res->Info = IsOne;
+
+ exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
}
+
+ // Replace each call with the comparison.
+ applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
+ UniqueMemberAddr);
+
// Update devirtualization statistics for targets.
if (RemarksEnabled)
for (auto &&Target : TargetsForSlot)
@@ -560,9 +904,30 @@ bool DevirtModule::tryUniqueRetValOpt(
return false;
}
+void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit) {
+ for (auto Call : CSInfo.CallSites) {
+ auto *RetType = cast<IntegerType>(Call.CS.getType());
+ IRBuilder<> B(Call.CS.getInstruction());
+ Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte);
+ if (RetType->getBitWidth() == 1) {
+ Value *Bits = B.CreateLoad(Addr);
+ Value *BitsAndBit = B.CreateAnd(Bits, Bit);
+ auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
+ Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
+ IsBitSet);
+ } else {
+ Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
+ Value *Val = B.CreateLoad(RetType, ValAddr);
+ Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val);
+ }
+ }
+ CSInfo.markDevirt();
+}
+
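A hypothetical before/after for the two load shapes generated here (Byte and Bit are the offsets computed by the caller):

    int  K = p->kind();  // =>  K = *(int32_t *)(VTable + Byte);
    bool E = p->empty(); // =>  E = (*(uint8_t *)(VTable + Byte) & Bit) != 0;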
bool DevirtModule::tryVirtualConstProp(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<VirtualCallSite> CallSites) {
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
// This only works if the function returns an integer.
auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
if (!RetType)
@@ -571,55 +936,38 @@ bool DevirtModule::tryVirtualConstProp(
if (BitWidth > 64)
return false;
- // Make sure that each function does not access memory, takes at least one
- // argument, does not use its first argument (which we assume is 'this'),
- // and has the same return type.
+ // Make sure that each function is defined, does not access memory, takes at
+ // least one argument, does not use its first argument (which we assume is
+ // 'this'), and has the same return type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
for (VirtualCallTarget &Target : TargetsForSlot) {
- if (!Target.Fn->doesNotAccessMemory() || Target.Fn->arg_empty() ||
- !Target.Fn->arg_begin()->use_empty() ||
+ if (Target.Fn->isDeclaration() ||
+ computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
+ MAK_ReadNone ||
+ Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
Target.Fn->getReturnType() != RetType)
return false;
}
- // Group call sites by the list of constant arguments they pass.
- // The comparator ensures deterministic ordering.
- struct ByAPIntValue {
- bool operator()(const std::vector<ConstantInt *> &A,
- const std::vector<ConstantInt *> &B) const {
- return std::lexicographical_compare(
- A.begin(), A.end(), B.begin(), B.end(),
- [](ConstantInt *AI, ConstantInt *BI) {
- return AI->getValue().ult(BI->getValue());
- });
- }
- };
- std::map<std::vector<ConstantInt *>, std::vector<VirtualCallSite>,
- ByAPIntValue>
- VCallSitesByConstantArg;
- for (auto &&VCallSite : CallSites) {
- std::vector<ConstantInt *> Args;
- if (VCallSite.CS.getType() != RetType)
- continue;
- for (auto &&Arg :
- make_range(VCallSite.CS.arg_begin() + 1, VCallSite.CS.arg_end())) {
- if (!isa<ConstantInt>(Arg))
- break;
- Args.push_back(cast<ConstantInt>(&Arg));
- }
- if (Args.size() + 1 != VCallSite.CS.arg_size())
- continue;
-
- VCallSitesByConstantArg[Args].push_back(VCallSite);
- }
-
- for (auto &&CSByConstantArg : VCallSitesByConstantArg) {
+ for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
continue;
- if (tryUniformRetValOpt(RetType, TargetsForSlot, CSByConstantArg.second))
+ WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
+ if (Res)
+ ResByArg = &Res->ResByArg[CSByConstantArg.first];
+
+ if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
continue;
- if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second))
+ if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
+ ResByArg, Slot, CSByConstantArg.first))
continue;
// Find an allocation offset in bits in all vtables associated with the
@@ -659,26 +1007,20 @@ bool DevirtModule::tryVirtualConstProp(
for (auto &&Target : TargetsForSlot)
Target.WasDevirt = true;
- // Rewrite each call to a load from OffsetByte/OffsetBit.
- for (auto Call : CSByConstantArg.second) {
- IRBuilder<> B(Call.CS.getInstruction());
- Value *Addr = B.CreateConstGEP1_64(Call.VTable, OffsetByte);
- if (BitWidth == 1) {
- Value *Bits = B.CreateLoad(Addr);
- Value *Bit = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
- Value *BitsAndBit = B.CreateAnd(Bits, Bit);
- auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
- Call.replaceAndErase("virtual-const-prop-1-bit",
- TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, IsBitSet);
- } else {
- Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
- Value *Val = B.CreateLoad(RetType, ValAddr);
- Call.replaceAndErase("virtual-const-prop",
- TargetsForSlot[0].Fn->getName(),
- RemarksEnabled, Val);
- }
+ Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
+ Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
+
+ if (CSByConstantArg.second.isExported()) {
+ ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
+ exportGlobal(Slot, CSByConstantArg.first, "byte",
+ ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy));
+ exportGlobal(Slot, CSByConstantArg.first, "bit",
+ ConstantExpr::getIntToPtr(BitConst, Int8PtrTy));
}
+
+ // Rewrite each call to a load from OffsetByte/OffsetBit.
+ applyVirtualConstProp(CSByConstantArg.second,
+ TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
}
return true;
}
@@ -733,7 +1075,11 @@ bool DevirtModule::areRemarksEnabled() {
if (FL.empty())
return false;
const Function &Fn = FL.front();
- auto DI = OptimizationRemark(DEBUG_TYPE, Fn, DebugLoc(), "");
+
+ const auto &BBL = Fn.getBasicBlockList();
+ if (BBL.empty())
+ return false;
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
return DI.isEnabled();
}
@@ -766,8 +1112,8 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc,
Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
if (SeenPtrs.insert(Ptr).second) {
for (DevirtCallSite Call : DevirtCalls) {
- CallSlots[{TypeId, Call.Offset}].push_back(
- {CI->getArgOperand(0), Call.CS, nullptr});
+ CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0),
+ Call.CS, nullptr);
}
}
}
@@ -853,14 +1199,79 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
if (HasNonCallUses)
++NumUnsafeUses;
for (DevirtCallSite Call : DevirtCalls) {
- CallSlots[{TypeId, Call.Offset}].push_back(
- {Ptr, Call.CS, &NumUnsafeUses});
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS,
+ &NumUnsafeUses);
}
CI->eraseFromParent();
}
}
+void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString());
+ if (!TidSummary)
+ return;
+ auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
+ if (ResI == TidSummary->WPDRes.end())
+ return;
+ const WholeProgramDevirtResolution &Res = ResI->second;
+
+ if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
+ // The type of the function in the declaration is irrelevant because every
+ // call site will cast it to the correct type.
+ auto *SingleImpl = M.getOrInsertFunction(
+ Res.SingleImplName, Type::getVoidTy(M.getContext()));
+
+ // This is the import phase so we should not be exporting anything.
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
+ assert(!IsExported);
+ }
+
+ for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
+ auto I = Res.ResByArg.find(CSByConstantArg.first);
+ if (I == Res.ResByArg.end())
+ continue;
+ auto &ResByArg = I->second;
+ // FIXME: We should figure out what to do about the "function name" argument
+ // to the apply* functions, as the function names are unavailable during the
+ // importing phase. For now we just pass the empty string. This does not
+ // impact correctness because the function names are just used for remarks.
+ switch (ResByArg.TheKind) {
+ case WholeProgramDevirtResolution::ByArg::UniformRetVal:
+ applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
+ break;
+ case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
+ Constant *UniqueMemberAddr =
+ importGlobal(Slot, CSByConstantArg.first, "unique_member");
+ applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
+ UniqueMemberAddr);
+ break;
+ }
+ case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
+ Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32);
+ Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty);
+ Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8);
+ Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty);
+      applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
+      break;
+    }
+ default:
+ break;
+ }
+ }
+}
+
+void DevirtModule::removeRedundantTypeTests() {
+ auto True = ConstantInt::getTrue(M.getContext());
+ for (auto &&U : NumUnsafeUsesForTypeTest) {
+ if (U.second == 0) {
+ U.first->replaceAllUsesWith(True);
+ U.first->eraseFromParent();
+ }
+ }
+}
+
bool DevirtModule::run() {
Function *TypeTestFunc =
M.getFunction(Intrinsic::getName(Intrinsic::type_test));
@@ -868,7 +1279,11 @@ bool DevirtModule::run() {
M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
- if ((!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
+ // Normally if there are no users of the devirtualization intrinsics in the
+ // module, this pass has nothing to do. But if we are exporting, we also need
+ // to handle any users that appear only in the function summaries.
+ if (!ExportSummary &&
+ (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
AssumeFunc->use_empty()) &&
(!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
return false;
@@ -879,6 +1294,17 @@ bool DevirtModule::run() {
if (TypeCheckedLoadFunc)
scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
+ if (ImportSummary) {
+ for (auto &S : CallSlots)
+ importResolution(S.first, S.second);
+
+ removeRedundantTypeTests();
+
+ // The rest of the code is only necessary when exporting or during regular
+ // LTO, so we are done.
+ return true;
+ }
+
// Rebuild type metadata into a map for easy lookup.
std::vector<VTableBits> Bits;
DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
@@ -886,6 +1312,53 @@ bool DevirtModule::run() {
if (TypeIdMap.empty())
return true;
+ // Collect information from summary about which calls to try to devirtualize.
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdMap) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers =
+ true;
+ }
+ }
+ for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}]
+ .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_test_assume_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .SummaryHasTypeTestAssumeUsers = true;
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_checked_load_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .SummaryTypeCheckedLoadUsers.push_back(FS);
+ }
+ }
+ }
+ }
+ }
+
// For each (type, offset) pair:
bool DidVirtualConstProp = false;
std::map<std::string, Function*> DevirtTargets;
@@ -894,19 +1367,39 @@ bool DevirtModule::run() {
// function implementation at offset S.first.ByteOffset, and add to
// TargetsForSlot.
std::vector<VirtualCallTarget> TargetsForSlot;
- if (!tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
- S.first.ByteOffset))
- continue;
-
- if (!trySingleImplDevirt(TargetsForSlot, S.second) &&
- tryVirtualConstProp(TargetsForSlot, S.second))
+ if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
+ S.first.ByteOffset)) {
+ WholeProgramDevirtResolution *Res = nullptr;
+ if (ExportSummary && isa<MDString>(S.first.TypeID))
+ Res = &ExportSummary
+ ->getOrInsertTypeIdSummary(
+ cast<MDString>(S.first.TypeID)->getString())
+ .WPDRes[S.first.ByteOffset];
+
+ if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) &&
+ tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first))
DidVirtualConstProp = true;
- // Collect functions devirtualized at least for one call site for stats.
- if (RemarksEnabled)
- for (const auto &T : TargetsForSlot)
- if (T.WasDevirt)
- DevirtTargets[T.Fn->getName()] = T.Fn;
+ // Collect functions devirtualized at least for one call site for stats.
+ if (RemarksEnabled)
+ for (const auto &T : TargetsForSlot)
+ if (T.WasDevirt)
+ DevirtTargets[T.Fn->getName()] = T.Fn;
+ }
+
+ // CFI-specific: if we are exporting and any llvm.type.checked.load
+ // intrinsics were *not* devirtualized, we need to add the resulting
+ // llvm.type.test intrinsics to the function summaries so that the
+ // LowerTypeTests pass will export them.
+ if (ExportSummary && isa<MDString>(S.first.TypeID)) {
+ auto GUID =
+ GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
+ for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ for (auto &CCS : S.second.ConstCSInfo)
+ for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ }
}
if (RemarksEnabled) {
@@ -914,23 +1407,12 @@ bool DevirtModule::run() {
for (const auto &DT : DevirtTargets) {
Function *F = DT.second;
DISubprogram *SP = F->getSubprogram();
- DebugLoc DL = SP ? DebugLoc::get(SP->getScopeLine(), 0, SP) : DebugLoc();
- emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, DL,
+ emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP,
Twine("devirtualized ") + F->getName());
}
}
- // If we were able to eliminate all unsafe uses for a type checked load,
- // eliminate the type test by replacing it with true.
- if (TypeCheckedLoadFunc) {
- auto True = ConstantInt::getTrue(M.getContext());
- for (auto &&U : NumUnsafeUsesForTypeTest) {
- if (U.second == 0) {
- U.first->replaceAllUsesWith(True);
- U.first->eraseFromParent();
- }
- }
- }
+ removeRedundantTypeTests();
// Rebuild each global we touched as part of virtual constant propagation to
// include the before and after bytes.
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 2d34c1c..809471c 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace PatternMatch;
@@ -163,7 +164,7 @@ namespace {
///
class FAddCombine {
public:
- FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(nullptr) {}
+ FAddCombine(InstCombiner::BuilderTy &B) : Builder(B), Instr(nullptr) {}
Value *simplify(Instruction *FAdd);
private:
@@ -186,7 +187,7 @@ namespace {
Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
- InstCombiner::BuilderTy *Builder;
+ InstCombiner::BuilderTy &Builder;
Instruction *Instr;
    // Debugging stuff is clustered here.
@@ -734,7 +735,7 @@ Value *FAddCombine::createNaryFAdd
}
Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder->CreateFSub(Opnd0, Opnd1);
+ Value *V = Builder.CreateFSub(Opnd0, Opnd1);
if (Instruction *I = dyn_cast<Instruction>(V))
createInstPostProc(I);
return V;
@@ -749,21 +750,21 @@ Value *FAddCombine::createFNeg(Value *V) {
}
Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder->CreateFAdd(Opnd0, Opnd1);
+ Value *V = Builder.CreateFAdd(Opnd0, Opnd1);
if (Instruction *I = dyn_cast<Instruction>(V))
createInstPostProc(I);
return V;
}
Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder->CreateFMul(Opnd0, Opnd1);
+ Value *V = Builder.CreateFMul(Opnd0, Opnd1);
if (Instruction *I = dyn_cast<Instruction>(V))
createInstPostProc(I);
return V;
}
Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder->CreateFDiv(Opnd0, Opnd1);
+ Value *V = Builder.CreateFDiv(Opnd0, Opnd1);
if (Instruction *I = dyn_cast<Instruction>(V))
createInstPostProc(I);
return V;
@@ -794,6 +795,11 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
if (Opnd->isConstant())
continue;
+ // The constant check above is really for a few special constant
+ // coefficients.
+ if (isa<UndefValue>(Opnd->getSymVal()))
+ continue;
+
const FAddendCoef &CE = Opnd->getCoef();
if (CE.isMinusOne() || CE.isMinusTwo())
NegOpndNum++;
@@ -841,108 +847,28 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
}
-// If one of the operands only has one non-zero bit, and if the other
-// operand has a known-zero bit in a more significant place than it (not
-// including the sign bit) the ripple may go up to and fill the zero, but
-// won't change the sign. For example, (X & ~4) + 1.
-static bool checkRippleForAdd(const APInt &Op0KnownZero,
- const APInt &Op1KnownZero) {
- APInt Op1MaybeOne = ~Op1KnownZero;
- // Make sure that one of the operand has at most one bit set to 1.
- if (Op1MaybeOne.countPopulation() != 1)
- return false;
-
- // Find the most significant known 0 other than the sign bit.
- int BitWidth = Op0KnownZero.getBitWidth();
- APInt Op0KnownZeroTemp(Op0KnownZero);
- Op0KnownZeroTemp.clearBit(BitWidth - 1);
- int Op0ZeroPosition = BitWidth - Op0KnownZeroTemp.countLeadingZeros() - 1;
-
- int Op1OnePosition = BitWidth - Op1MaybeOne.countLeadingZeros() - 1;
- assert(Op1OnePosition >= 0);
-
- // This also covers the case of no known zero, since in that case
- // Op0ZeroPosition is -1.
- return Op0ZeroPosition >= Op1OnePosition;
-}
-
-/// Return true if we can prove that:
-/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS))
-/// This basically requires proving that the add in the original type would not
-/// overflow to change the sign bit or have a carry out.
-bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
- Instruction &CxtI) {
- // There are different heuristics we can use for this. Here are some simple
- // ones.
-
- // If LHS and RHS each have at least two sign bits, the addition will look
- // like
- //
- // XX..... +
- // YY.....
- //
- // If the carry into the most significant position is 0, X and Y can't both
- // be 1 and therefore the carry out of the addition is also 0.
- //
- // If the carry into the most significant position is 1, X and Y can't both
- // be 0 and therefore the carry out of the addition is also 1.
- //
- // Since the carry into the most significant position is always equal to
- // the carry out of the addition, there is no signed overflow.
- if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
- ComputeNumSignBits(RHS, 0, &CxtI) > 1)
- return true;
-
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
- APInt LHSKnownZero(BitWidth, 0);
- APInt LHSKnownOne(BitWidth, 0);
- computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI);
-
- APInt RHSKnownZero(BitWidth, 0);
- APInt RHSKnownOne(BitWidth, 0);
- computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
-
- // Addition of two 2's compliment numbers having opposite signs will never
- // overflow.
- if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) ||
- (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1]))
- return true;
-
- // Check if carry bit of addition will not cause overflow.
- if (checkRippleForAdd(LHSKnownZero, RHSKnownZero))
- return true;
- if (checkRippleForAdd(RHSKnownZero, LHSKnownZero))
- return true;
-
- return false;
-}
-
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nsw LHS, RHS)
/// This basically requires proving that the add in the original type would not
/// overflow to change the sign bit or have a carry out.
/// TODO: Handle this for Vectors.
-bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
- Instruction &CxtI) {
+bool InstCombiner::willNotOverflowSignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
// If LHS and RHS each have at least two sign bits, the subtraction
// cannot overflow.
if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 &&
ComputeNumSignBits(RHS, 0, &CxtI) > 1)
return true;
- unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
- APInt LHSKnownZero(BitWidth, 0);
- APInt LHSKnownOne(BitWidth, 0);
- computeKnownBits(LHS, LHSKnownZero, LHSKnownOne, 0, &CxtI);
+ KnownBits LHSKnown = computeKnownBits(LHS, 0, &CxtI);
- APInt RHSKnownZero(BitWidth, 0);
- APInt RHSKnownOne(BitWidth, 0);
- computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, 0, &CxtI);
- // Subtraction of two 2's compliment numbers having identical signs will
+ // Subtraction of two 2's complement numbers having identical signs will
// never overflow.
- if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) ||
- (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1]))
+ if ((LHSKnown.isNegative() && RHSKnown.isNegative()) ||
+ (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()))
return true;
// TODO: implement logic similar to checkRippleForAdd
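The two-sign-bit argument above can be checked exhaustively on a narrow type: if both i8 operands lie in [-64, 63] (at least two sign bits each), the difference lies in [-127, 127] and cannot overflow. A minimal standalone C++ check of that claim (illustrative, separate from the pass):

#include <cassert>
#include <cstdint>

int main() {
  // Any i8 pair with two or more known sign bits cannot overflow on sub.
  for (int LHS = -64; LHS <= 63; ++LHS)
    for (int RHS = -64; RHS <= 63; ++RHS) {
      int Exact = LHS - RHS;                                  // true result
      std::int8_t Narrow = static_cast<std::int8_t>(Exact);   // i8 result
      assert(Exact == Narrow && "i8 subtraction stayed in range");
    }
  return 0;
}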
@@ -951,16 +877,13 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
/// \brief Return true if we can prove that:
/// (sub LHS, RHS) === (sub nuw LHS, RHS)
-bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
- Instruction &CxtI) {
+bool InstCombiner::willNotOverflowUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
// If the LHS is negative and the RHS is non-negative, no unsigned wrap.
- bool LHSKnownNonNegative, LHSKnownNegative;
- bool RHSKnownNonNegative, RHSKnownNegative;
- ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0,
- &CxtI);
- ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0,
- &CxtI);
- if (LHSKnownNegative && RHSKnownNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNegative() && RHSKnown.isNonNegative())
return true;
return false;
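In unsigned terms this rule is immediate: a "negative" LHS has its sign bit set and is therefore at least 2^31, while a "non-negative" RHS is below 2^31, so LHS u>= RHS and the subtraction cannot wrap. A quick standalone C++ illustration with assumed example values:

#include <cassert>
#include <cstdint>

int main() {
  const std::uint32_t SignBit = 0x80000000u;
  std::uint32_t LHS = SignBit | 0x1234u; // sign bit known set ("negative")
  std::uint32_t RHS = 0x7FFFFFFFu;       // sign bit known clear ("non-negative")
  assert(LHS >= RHS);                    // hence no unsigned wrap on LHS - RHS
  assert(LHS - RHS == 0x1235u);
  return 0;
}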
@@ -972,7 +895,7 @@ bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS,
// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
static Value *checkForNegativeOperand(BinaryOperator &I,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
// This function creates 2 instructions to replace ADD, we need at least one
@@ -996,13 +919,13 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
// X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
// ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
- Value *NewAnd = Builder->CreateAnd(Z, *C1);
- return Builder->CreateSub(RHS, NewAnd, "sub");
+ Value *NewAnd = Builder.CreateAnd(Z, *C1);
+ return Builder.CreateSub(RHS, NewAnd, "sub");
} else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
// X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
// ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
- Value *NewOr = Builder->CreateOr(Z, ~(*C1));
- return Builder->CreateSub(RHS, NewOr, "sub");
+ Value *NewOr = Builder.CreateOr(Z, ~(*C1));
+ return Builder.CreateSub(RHS, NewOr, "sub");
}
}
}
@@ -1021,12 +944,73 @@ static Value *checkForNegativeOperand(BinaryOperator &I,
if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
if (C1->countTrailingZeros() == 0)
if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
- Value *NewOr = Builder->CreateOr(Z, ~(*C2));
- return Builder->CreateSub(RHS, NewOr, "sub");
+ Value *NewOr = Builder.CreateOr(Z, ~(*C2));
+ return Builder.CreateSub(RHS, NewOr, "sub");
}
return nullptr;
}
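The rewrites above rest on two bit identities: AND(Z, C) ^ C == C & ~Z, and -x == ~x + 1 in two's complement. The first named equivalence from the comment, ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C)), can be brute-forced over 8-bit values in standalone C++ (a sanity check, not the pass's code):

#include <cassert>

int main() {
  for (unsigned Z = 0; Z < 256; ++Z)
    for (unsigned C = 0; C < 256; ++C) {
      unsigned Lhs = (((Z & C) ^ C) + 1) & 0xFF;       // ADD(XOR(AND(Z,C),C),1)
      unsigned Rhs = (0u - (Z | (~C & 0xFFu))) & 0xFF; // NEG(OR(Z, ~C)), mod 2^8
      assert(Lhs == Rhs);
    }
  return 0;
}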
+static Instruction *foldAddWithConstant(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ Value *X;
+ const APInt *C2;
+ Type *Ty = Add.getType();
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
+ // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C))
+ // FIXME: This should check hasOneUse to not increase the instruction count?
+ if (C->isNegative() &&
+ match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) &&
+ C->sge(-C2->sext(C->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ if (C->isOneValue() && Op0->hasOneUse()) {
+ // add (sext i1 X), 1 --> zext (not X)
+ // TODO: The smallest IR representation is (select X, 0, 1), and that would
+ // not require the one-use check. But we need to remove a transform in
+ // visitSelect and make sure that IR value tracking for select is equal or
+ // better than for these ops.
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
+ return new ZExtInst(Builder.CreateNot(X), Ty);
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+ }
+
+ return nullptr;
+}
+
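Each fold in foldAddWithConstant is an identity of wrapped two's complement arithmetic. A standalone C++ spot check of three of them with illustrative values (the signmask add, adding 1 to a sign-extended bool, and the "convoluted sext" pattern):

#include <cassert>
#include <cstdint>

int main() {
  // X + signmask --> X ^ signmask: a wrapping add of the sign bit flips it.
  std::uint32_t X = 0xDEADBEEFu, SignMask = 0x80000000u;
  assert(X + SignMask == (X ^ SignMask));

  // add (sext i1 B), 1 --> zext (not B): sext(true) = -1, sext(false) = 0.
  for (bool B : {false, true})
    assert((B ? -1 : 0) + 1 == static_cast<int>(!B));

  // add(zext(xor i16 X16, -32768), -32768) --> sext X16.
  std::int16_t X16 = -12345;
  std::uint16_t U = static_cast<std::uint16_t>(X16) ^ 0x8000u; // flip sign bit
  std::uint32_t Z = U;                                         // zext to i32
  std::int32_t Folded = static_cast<std::int32_t>(Z + 0xFFFF8000u); // add -32768
  assert(Folded == X16); // identical to sign-extending the original i16
  return 0;
}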
Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
bool Changed = SimplifyAssociativeOrCommutative(I);
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
@@ -1034,51 +1018,21 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
+ if (Value *V =
+ SimplifyAddInst(LHS, RHS, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
- const APInt *Val;
- if (match(RHS, m_APInt(Val))) {
- // X + (signbit) --> X ^ signbit
- if (Val->isSignBit())
- return BinaryOperator::CreateXor(LHS, RHS);
-
- // Is this add the last step in a convoluted sext?
- Value *X;
- const APInt *C;
- if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
- C->isMinSignedValue() &&
- C->sext(LHS->getType()->getScalarSizeInBits()) == *Val) {
- // add(zext(xor i16 X, -32768), -32768) --> sext X
- return CastInst::Create(Instruction::SExt, X, LHS->getType());
- }
-
- if (Val->isNegative() &&
- match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
- Val->sge(-C->sext(Val->getBitWidth()))) {
- // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
- return CastInst::Create(
- Instruction::ZExt,
- Builder->CreateNUWAdd(
- X, Constant::getIntegerValue(X->getType(),
- *C + Val->trunc(C->getBitWidth()))),
- I.getType());
- }
- }
+ if (Instruction *X = foldAddWithConstant(I, Builder))
+ return X;
- // FIXME: Use the match above instead of dyn_cast to allow these transforms
- // for splat vectors.
+ // FIXME: This should be moved into the above helper function to allow these
+ // transforms for splat vectors.
if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
- // See if SimplifyDemandedBits can simplify this. This handles stuff like
- // (X & 254)+1 -> (X&254)|1
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
// zext(bool) + C -> bool ? C + 1 : C
if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
if (ZI->getSrcTy()->isIntegerTy(1))
@@ -1106,34 +1060,31 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (ExtendAmt) {
Constant *ShAmt = ConstantInt::get(I.getType(), ExtendAmt);
- Value *NewShl = Builder->CreateShl(XorLHS, ShAmt, "sext");
+ Value *NewShl = Builder.CreateShl(XorLHS, ShAmt, "sext");
return BinaryOperator::CreateAShr(NewShl, ShAmt);
}
    // If this is an xor that was canonicalized from a sub, turn it back into
// a sub and fuse this add with it.
if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) {
- IntegerType *IT = cast<IntegerType>(I.getType());
- APInt LHSKnownOne(IT->getBitWidth(), 0);
- APInt LHSKnownZero(IT->getBitWidth(), 0);
- computeKnownBits(XorLHS, LHSKnownZero, LHSKnownOne, 0, &I);
- if ((XorRHS->getValue() | LHSKnownZero).isAllOnesValue())
+ KnownBits LHSKnown = computeKnownBits(XorLHS, 0, &I);
+ if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue())
return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI),
XorLHS);
}
- // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C,
- // transform them into (X + (signbit ^ C))
- if (XorRHS->getValue().isSignBit())
+ // (X + signmask) + C could have gotten canonicalized to (X^signmask) + C,
+ // transform them into (X + (signmask ^ C))
+ if (XorRHS->getValue().isSignMask())
return BinaryOperator::CreateAdd(XorLHS,
ConstantExpr::getXor(XorRHS, CI));
}
}
- if (isa<Constant>(RHS) && isa<PHINode>(LHS))
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (isa<Constant>(RHS))
+ if (Instruction *NV = foldOpWithConstantIntoOperand(I))
return NV;
- if (I.getType()->getScalarType()->isIntegerTy(1))
+ if (I.getType()->isIntOrIntVectorTy(1))
return BinaryOperator::CreateXor(LHS, RHS);
// X + X --> X << 1
@@ -1150,7 +1101,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (Value *LHSV = dyn_castNegVal(LHS)) {
if (!isa<Constant>(RHS))
if (Value *RHSV = dyn_castNegVal(RHS)) {
- Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum");
+ Value *NewAdd = Builder.CreateAdd(LHSV, RHSV, "sum");
return BinaryOperator::CreateNeg(NewAdd);
}
@@ -1197,15 +1148,10 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
if (AddRHSHighBits == AddRHSHighBitsAnd) {
// Okay, the xform is safe. Insert the new add pronto.
- Value *NewAdd = Builder->CreateAdd(X, CRHS, LHS->getName());
+ Value *NewAdd = Builder.CreateAdd(X, CRHS, LHS->getName());
return BinaryOperator::CreateAnd(NewAdd, C2);
}
}
-
- // Try to fold constant add into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
}
// add (select X 0 (sub n A)) A --> select X A n
@@ -1242,10 +1188,10 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (ConstantExpr::getSExt(CI, I.getType()) == RHSC &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
+ willNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd =
- Builder->CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv");
+ Builder.CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv");
return new SExtInst(NewAdd, I.getType());
}
}
@@ -1253,16 +1199,16 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// (add (sext x), (sext y)) --> (sext (add int x, y))
if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
- // Only do this if x/y have the same type, if at last one of them has a
+ // Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of sexts), and if the
// integer add will not overflow.
if (LHSConv->getOperand(0)->getType() ==
RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0),
+ willNotOverflowSignedAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0), I)) {
// Insert the new integer add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+ Value *NewAdd = Builder.CreateNSWAdd(LHSConv->getOperand(0),
RHSConv->getOperand(0), "addconv");
return new SExtInst(NewAdd, I.getType());
}
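The guard matters because (add (sext x), (sext y)) and (sext (add x, y)) agree exactly when the narrow add does not overflow, which is what willNotOverflowSignedAdd proves. A small standalone C++ illustration with i16 -> i32 and assumed example values:

#include <cassert>
#include <cstdint>

int main() {
  std::int16_t X = 1000, Y = -250; // the i16 add cannot overflow here
  std::int32_t Wide = std::int32_t(X) + std::int32_t(Y);   // add of the sexts
  std::int32_t Narrow = std::int32_t(std::int16_t(X + Y)); // sext of the add
  assert(Wide == Narrow);

  // With overflow the two forms diverge, so the transform must be guarded:
  std::int16_t A = 30000, B = 30000;
  assert(std::int32_t(A) + std::int32_t(B) !=
         std::int32_t(static_cast<std::int16_t>(A + B)));
  return 0;
}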
@@ -1278,11 +1224,10 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == RHSC &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) {
// Insert the new, smaller add.
Value *NewAdd =
- Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
+ Builder.CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv");
return new ZExtInst(NewAdd, I.getType());
}
}
@@ -1290,17 +1235,16 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// (add (zext x), (zext y)) --> (zext (add int x, y))
if (auto *RHSConv = dyn_cast<ZExtInst>(RHS)) {
- // Only do this if x/y have the same type, if at last one of them has a
+ // Only do this if x/y have the same type, if at least one of them has a
// single use (so we don't increase the number of zexts), and if the
// integer add will not overflow.
if (LHSConv->getOperand(0)->getType() ==
RHSConv->getOperand(0)->getType() &&
(LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- computeOverflowForUnsignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedAdd(LHSConv->getOperand(0),
+ RHSConv->getOperand(0), I)) {
// Insert the new integer add.
- Value *NewAdd = Builder->CreateNUWAdd(
+ Value *NewAdd = Builder.CreateNUWAdd(
LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv");
return new ZExtInst(NewAdd, I.getType());
}
@@ -1311,13 +1255,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
{
Value *A = nullptr, *B = nullptr;
if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
- (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(LHS, m_And(m_Specific(B), m_Specific(A)))))
+ match(LHS, m_c_And(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
- (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(RHS, m_And(m_Specific(B), m_Specific(A)))))
+ match(RHS, m_c_And(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
}
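Both folds follow from the classic decomposition A + B == (A ^ B) + 2*(A & B) == (A | B) + (A & B). A brute-force standalone C++ check over all 8-bit pairs:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      // (add (xor A, B), (and A, B)) --> (or A, B), exactly (no carry out).
      assert(((A ^ B) + (A & B)) == (A | B));
      // (add (or A, B), (and A, B)) --> (add A, B), modulo 2^8.
      assert((((A | B) + (A & B)) & 0xFF) == ((A + B) & 0xFF));
    }
  return 0;
}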
@@ -1325,8 +1267,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
{
Value *A = nullptr, *B = nullptr;
if (match(RHS, m_Or(m_Value(A), m_Value(B))) &&
- (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(LHS, m_And(m_Specific(B), m_Specific(A))))) {
+ match(LHS, m_c_And(m_Specific(A), m_Specific(B)))) {
auto *New = BinaryOperator::CreateAdd(A, B);
New->setHasNoSignedWrap(I.hasNoSignedWrap());
New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
@@ -1334,8 +1275,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
if (match(LHS, m_Or(m_Value(A), m_Value(B))) &&
- (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
- match(RHS, m_And(m_Specific(B), m_Specific(A))))) {
+ match(RHS, m_c_And(m_Specific(A), m_Specific(B)))) {
auto *New = BinaryOperator::CreateAdd(A, B);
New->setHasNoSignedWrap(I.hasNoSignedWrap());
New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
@@ -1343,16 +1283,14 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
}
}
- // TODO(jingyue): Consider WillNotOverflowSignedAdd and
- // WillNotOverflowUnsignedAdd to reduce the number of invocations of
+ // TODO(jingyue): Consider willNotOverflowSignedAdd and
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
// computeKnownBits.
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) {
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedAdd(LHS, RHS, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
@@ -1367,8 +1305,8 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V =
- SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (isa<Constant>(RHS))
@@ -1394,38 +1332,57 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
// Check for (fadd double (sitofp x), y), see if we can merge this into an
// integer add followed by a promotion.
if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+ Value *LHSIntVal = LHSConv->getOperand(0);
+ Type *FPType = LHSConv->getType();
+
+    // TODO: This check is overly conservative. In many cases known bits
+    // analysis can tell us that the result of the addition has fewer
+    // significant bits than the integer type can hold.
+ auto IsValidPromotion = [](Type *FTy, Type *ITy) {
+ Type *FScalarTy = FTy->getScalarType();
+ Type *IScalarTy = ITy->getScalarType();
+
+ // Do we have enough bits in the significand to represent the result of
+ // the integer addition?
+ unsigned MaxRepresentableBits =
+ APFloat::semanticsPrecision(FScalarTy->getFltSemantics());
+ return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits;
+ };
+
// (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
// ... if the constant fits in the integer value. This is useful for things
    // like (double)(x & 1234) + 4.0 -> (double)((x & 1234)+4) which no longer
// requires a constant pool load, and generally allows the add to be better
// instcombined.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
- Constant *CI =
- ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
- if (LHSConv->hasOneUse() &&
- ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
- // Insert the new integer add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
- CI, "addconv");
- return new SIToFPInst(NewAdd, I.getType());
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ Constant *CI =
+ ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
+ if (LHSConv->hasOneUse() &&
+ ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+ willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
}
- }
// (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
- // Only do this if x/y have the same type, if at last one of them has a
- // single use (so we don't increase the number of int->fp conversions),
- // and if the integer add will not overflow.
- if (LHSConv->getOperand(0)->getType() ==
- RHSConv->getOperand(0)->getType() &&
- (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- WillNotOverflowSignedAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0), I)) {
- // Insert the new integer add.
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
- RHSConv->getOperand(0),"addconv");
- return new SIToFPInst(NewAdd, I.getType());
+ Value *RHSIntVal = RHSConv->getOperand(0);
+ // It's enough to check LHS types only because we require int types to
+ // be the same for this transform.
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of int->fp conversions),
+ // and if the integer add will not overflow.
+ if (LHSIntVal->getType() == RHSIntVal->getType() &&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
}
}
}
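IsValidPromotion works because a double's 53-bit significand represents every i32 exactly, so the integer add and the floating-point add agree whenever the integer add does not overflow. A standalone C++ illustration of both facts:

#include <cassert>

int main() {
  // (fadd (sitofp x), 4.0) == (sitofp (add x, 4)) when the i32 add is safe.
  int X = 123456789;
  assert(double(X) + 4.0 == double(X + 4));

  // Every i32 round-trips through double exactly (53-bit significand >= 32
  // bits), which is what the semanticsPrecision comparison above verifies.
  int Big = 0x7FFFFFFF;
  assert(static_cast<long long>(static_cast<double>(Big)) == Big);
  return 0;
}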
@@ -1521,14 +1478,14 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
// pointer, subtract it from the offset we have.
if (GEP2) {
Value *Offset = EmitGEPOffset(GEP2);
- Result = Builder->CreateSub(Result, Offset);
+ Result = Builder.CreateSub(Result, Offset);
}
// If we have p - gep(p, ...) then we have to negate the result.
if (Swapped)
- Result = Builder->CreateNeg(Result, "diff.neg");
+ Result = Builder.CreateNeg(Result, "diff.neg");
- return Builder->CreateIntCast(Result, Ty, true);
+ return Builder.CreateIntCast(Result, Ty, true);
}
Instruction *InstCombiner::visitSub(BinaryOperator &I) {
@@ -1537,8 +1494,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
+ if (Value *V =
+ SimplifySubInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// (A*B)-(A*C) -> A*(B-C) etc
@@ -1562,7 +1520,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return Res;
}
- if (I.getType()->isIntegerTy(1))
+ if (I.getType()->isIntOrIntVectorTy(1))
return BinaryOperator::CreateXor(Op0, Op1);
// Replace (-1 - A) with (~A).
@@ -1580,22 +1538,24 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
+ // Try to fold constant sub into PHI values.
+ if (PHINode *PN = dyn_cast<PHINode>(Op1))
+ if (Instruction *R = foldOpIntoPhi(I, PN))
+ return R;
+
// C-(X+C2) --> (C-C2)-X
Constant *C2;
if (match(Op1, m_Add(m_Value(X), m_Constant(C2))))
return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
// Fold (sub 0, (zext bool to B)) --> (sext bool to B)
if (C->isNullValue() && match(Op1, m_ZExt(m_Value(X))))
- if (X->getType()->getScalarType()->isIntegerTy(1))
+ if (X->getType()->isIntOrIntVectorTy(1))
return CastInst::CreateSExtOrBitCast(X, Op1->getType());
// Fold (sub 0, (sext bool to B)) --> (zext bool to B)
if (C->isNullValue() && match(Op1, m_SExt(m_Value(X))))
- if (X->getType()->getScalarType()->isIntegerTy(1))
+ if (X->getType()->isIntOrIntVectorTy(1))
return CastInst::CreateZExtOrBitCast(X, Op1->getType());
}
@@ -1605,7 +1565,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// -(X >>u 31) -> (X >>s 31)
// -(X >>s 31) -> (X >>u 31)
- if (*Op0C == 0) {
+ if (Op0C->isNullValue()) {
Value *X;
const APInt *ShAmt;
if (match(Op1, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
@@ -1622,11 +1582,9 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
      // Turn this into an xor if LHS is 2^n-1 and the remaining bits are known
// zero.
- if ((*Op0C + 1).isPowerOf2()) {
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(&I, KnownZero, KnownOne, 0, &I);
- if ((*Op0C | KnownZero).isAllOnesValue())
+ if (Op0C->isMask()) {
+ KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
+ if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
return BinaryOperator::CreateXor(Op1, Op0);
}
}
@@ -1634,8 +1592,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
{
Value *Y;
// X-(X+Y) == -Y X-(Y+X) == -Y
- if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) ||
- match(Op1, m_Add(m_Value(Y), m_Specific(Op0))))
+ if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
return BinaryOperator::CreateNeg(Y);
// (X-Y)-X == -Y
@@ -1645,38 +1602,34 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// (sub (or A, B) (xor A, B)) --> (and A, B)
{
- Value *A = nullptr, *B = nullptr;
+ Value *A, *B;
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
- (match(Op0, m_Or(m_Specific(A), m_Specific(B))) ||
- match(Op0, m_Or(m_Specific(B), m_Specific(A)))))
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
}
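This fold and the related ones just below ((X | Y) - X and X - (X & Y)) are all instances of the decomposition A | B == (A ^ B) + (A & B). A brute-force standalone C++ verification over 8-bit values (each minuend dominates its subtrahend, so no wrap occurs):

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      unsigned NotA = ~A & 0xFF, NotB = ~B & 0xFF;
      assert(((A | B) - (A ^ B)) == (A & B)); // sub(or, xor) --> and
      assert(((A | B) - A) == (NotA & B));    // (X | Y) - X --> ~X & Y
      assert((A - (A & B)) == (A & NotB));    // X - (X & Y) --> X & ~Y
    }
  return 0;
}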
- if (Op0->hasOneUse()) {
- Value *Y = nullptr;
+ {
+ Value *Y;
// ((X | Y) - X) --> (~X & Y)
- if (match(Op0, m_Or(m_Value(Y), m_Specific(Op1))) ||
- match(Op0, m_Or(m_Specific(Op1), m_Value(Y))))
+ if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
return BinaryOperator::CreateAnd(
- Y, Builder->CreateNot(Op1, Op1->getName() + ".not"));
+ Y, Builder.CreateNot(Op1, Op1->getName() + ".not"));
}
if (Op1->hasOneUse()) {
Value *X = nullptr, *Y = nullptr, *Z = nullptr;
Constant *C = nullptr;
- Constant *CI = nullptr;
// (X - (Y - Z)) --> (X + (Z - Y)).
if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
return BinaryOperator::CreateAdd(Op0,
- Builder->CreateSub(Z, Y, Op1->getName()));
+ Builder.CreateSub(Z, Y, Op1->getName()));
// (X - (X & Y)) --> (X & ~Y)
//
- if (match(Op1, m_And(m_Value(Y), m_Specific(Op0))) ||
- match(Op1, m_And(m_Specific(Op0), m_Value(Y))))
+ if (match(Op1, m_c_And(m_Value(Y), m_Specific(Op0))))
return BinaryOperator::CreateAnd(Op0,
- Builder->CreateNot(Y, Y->getName() + ".not"));
+ Builder.CreateNot(Y, Y->getName() + ".not"));
// 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow.
if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero()) &&
@@ -1693,7 +1646,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// 'nuw' is dropped in favor of the canonical form.
if (match(Op1, m_SExt(m_Value(Y))) &&
Y->getType()->getScalarSizeInBits() == 1) {
- Value *Zext = Builder->CreateZExt(Y, I.getType());
+ Value *Zext = Builder.CreateZExt(Y, I.getType());
BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Zext);
Add->setHasNoSignedWrap(I.hasNoSignedWrap());
return Add;
@@ -1702,15 +1655,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
// X - A*-B -> X + A*B
// X - -A*B -> X + A*B
Value *A, *B;
- if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) ||
- match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B))))
- return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B));
+ Constant *CI;
+ if (match(Op1, m_c_Mul(m_Value(A), m_Neg(m_Value(B)))))
+ return BinaryOperator::CreateAdd(Op0, Builder.CreateMul(A, B));
// X - A*CI -> X + A*-CI
- // X - CI*A -> X + A*-CI
- if (match(Op1, m_Mul(m_Value(A), m_Constant(CI))) ||
- match(Op1, m_Mul(m_Constant(CI), m_Value(A)))) {
- Value *NewMul = Builder->CreateMul(A, ConstantExpr::getNeg(CI));
+    // No need to handle commuted multiply because multiply handling will
+    // ensure the constant will be moved to the right hand side.
+ if (match(Op1, m_Mul(m_Value(A), m_Constant(CI)))) {
+ Value *NewMul = Builder.CreateMul(A, ConstantExpr::getNeg(CI));
return BinaryOperator::CreateAdd(Op0, NewMul);
}
}
@@ -1730,11 +1683,11 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return replaceInstUsesWith(I, Res);
bool Changed = false;
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedSub(Op0, Op1, I)) {
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() && WillNotOverflowUnsignedSub(Op0, Op1, I)) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
@@ -1748,8 +1701,8 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V =
- SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// fsub nsz 0, X ==> fsub nsz -0.0, X
@@ -1774,14 +1727,14 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
}
if (FPTruncInst *FPTI = dyn_cast<FPTruncInst>(Op1)) {
if (Value *V = dyn_castFNegVal(FPTI->getOperand(0))) {
- Value *NewTrunc = Builder->CreateFPTrunc(V, I.getType());
+ Value *NewTrunc = Builder.CreateFPTrunc(V, I.getType());
Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewTrunc);
NewI->copyFastMathFlags(&I);
return NewI;
}
} else if (FPExtInst *FPEI = dyn_cast<FPExtInst>(Op1)) {
if (Value *V = dyn_castFNegVal(FPEI->getOperand(0))) {
- Value *NewExt = Builder->CreateFPExt(V, I.getType());
+ Value *NewExt = Builder.CreateFPExt(V, I.getType());
Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewExt);
NewI->copyFastMathFlags(&I);
return NewI;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index da5384a..fdc9c37 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -23,21 +23,6 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
-static inline Value *dyn_castNotVal(Value *V) {
- // If this is not(not(x)) don't return that this is a not: we want the two
- // not's to be folded first.
- if (BinaryOperator::isNot(V)) {
- Value *Operand = BinaryOperator::getNotArgument(V);
- if (!IsFreeToInvert(Operand, Operand->hasOneUse()))
- return Operand;
- }
-
- // Constants can be considered to be not'ed values...
- if (ConstantInt *C = dyn_cast<ConstantInt>(V))
- return ConstantInt::get(C->getType(), ~C->getValue());
- return nullptr;
-}
-
/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
/// a four bit mask.
static unsigned getFCmpCode(FCmpInst::Predicate CC) {
@@ -69,17 +54,17 @@ static unsigned getFCmpCode(FCmpInst::Predicate CC) {
/// instruction. The sign is passed in to determine which kind of predicate to
/// use in the new icmp instruction.
static Value *getNewICmpValue(bool Sign, unsigned Code, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
ICmpInst::Predicate NewPred;
if (Value *NewConstant = getICmpValue(Sign, Code, LHS, RHS, NewPred))
return NewConstant;
- return Builder->CreateICmp(NewPred, LHS, RHS);
+ return Builder.CreateICmp(NewPred, LHS, RHS);
}
/// This is the complement of getFCmpCode, which turns an opcode and two
/// operands into either a FCmp instruction, or a true/false constant.
static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
const auto Pred = static_cast<FCmpInst::Predicate>(Code);
assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
"Unexpected FCmp predicate!");
@@ -87,59 +72,50 @@ static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
if (Pred == FCmpInst::FCMP_TRUE)
return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
- return Builder->CreateFCmp(Pred, LHS, RHS);
+ return Builder.CreateFCmp(Pred, LHS, RHS);
}
-/// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) to BSWAP(BITWISE_OP(A, B))
+/// \brief Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
+/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
/// \param I Binary operator to transform.
/// \return Pointer to node that must replace the original binary operator, or
/// null pointer if no transformation was made.
-Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {
- IntegerType *ITy = dyn_cast<IntegerType>(I.getType());
-
- // Can't do vectors.
- if (I.getType()->isVectorTy())
- return nullptr;
-
- // Can only do bitwise ops.
- if (!I.isBitwiseLogicOp())
- return nullptr;
+static Value *SimplifyBSwap(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bswap simplifying");
Value *OldLHS = I.getOperand(0);
Value *OldRHS = I.getOperand(1);
- ConstantInt *ConstLHS = dyn_cast<ConstantInt>(OldLHS);
- ConstantInt *ConstRHS = dyn_cast<ConstantInt>(OldRHS);
- IntrinsicInst *IntrLHS = dyn_cast<IntrinsicInst>(OldLHS);
- IntrinsicInst *IntrRHS = dyn_cast<IntrinsicInst>(OldRHS);
- bool IsBswapLHS = (IntrLHS && IntrLHS->getIntrinsicID() == Intrinsic::bswap);
- bool IsBswapRHS = (IntrRHS && IntrRHS->getIntrinsicID() == Intrinsic::bswap);
-
- if (!IsBswapLHS && !IsBswapRHS)
- return nullptr;
-
- if (!IsBswapLHS && !ConstLHS)
- return nullptr;
- if (!IsBswapRHS && !ConstRHS)
+ Value *NewLHS;
+ if (!match(OldLHS, m_BSwap(m_Value(NewLHS))))
return nullptr;
- /// OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
- /// OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
- Value *NewLHS = IsBswapLHS ? IntrLHS->getOperand(0) :
- Builder->getInt(ConstLHS->getValue().byteSwap());
+ Value *NewRHS;
+ const APInt *C;
- Value *NewRHS = IsBswapRHS ? IntrRHS->getOperand(0) :
- Builder->getInt(ConstRHS->getValue().byteSwap());
+ if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) {
+ // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
+ if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse())
+ return nullptr;
+ // NewRHS initialized by the matcher.
+ } else if (match(OldRHS, m_APInt(C))) {
+ // OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
+ if (!OldLHS->hasOneUse())
+ return nullptr;
+ NewRHS = ConstantInt::get(I.getType(), C->byteSwap());
+ } else
+ return nullptr;
- Value *BinOp = Builder->CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
- Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, ITy);
- return Builder->CreateCall(F, BinOp);
+ Value *BinOp = Builder.CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
+ Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap,
+ I.getType());
+ return Builder.CreateCall(F, BinOp);
}
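The transform is sound because bitwise logic commutes with any fixed bit permutation, byte swapping included: OP(bswap(x), bswap(y)) == bswap(OP(x, y)), and a constant operand can absorb the swap. A standalone C++ check with a hand-rolled 32-bit byte swap (compilers also offer intrinsics such as __builtin_bswap32):

#include <cassert>
#include <cstdint>

static std::uint32_t bswap32(std::uint32_t V) {
  return (V << 24) | ((V & 0xFF00u) << 8) | ((V >> 8) & 0xFF00u) | (V >> 24);
}

int main() {
  std::uint32_t X = 0x12345678u, Y = 0x0F0F00FFu;
  assert((bswap32(X) & bswap32(Y)) == bswap32(X & Y));
  assert((bswap32(X) | bswap32(Y)) == bswap32(X | Y));
  assert((bswap32(X) ^ bswap32(Y)) == bswap32(X ^ Y));
  // OP( BSWAP(x), CONSTANT ) --> BSWAP( OP(x, BSWAP(CONSTANT) ) ):
  std::uint32_t C = 0x000000FFu;
  assert((bswap32(X) & C) == bswap32(X & bswap32(C)));
  return 0;
}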
/// This handles expressions of the form ((val OP C1) & C2), where
-/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'. Op is
-/// guaranteed to be a binary operator.
-Instruction *InstCombiner::OptAndOp(Instruction *Op,
+/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.
+Instruction *InstCombiner::OptAndOp(BinaryOperator *Op,
ConstantInt *OpRHS,
ConstantInt *AndRHS,
BinaryOperator &TheAnd) {
@@ -149,30 +125,24 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
Together = ConstantExpr::getAnd(AndRHS, OpRHS);
switch (Op->getOpcode()) {
+ default: break;
case Instruction::Xor:
if (Op->hasOneUse()) {
// (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
- Value *And = Builder->CreateAnd(X, AndRHS);
+ Value *And = Builder.CreateAnd(X, AndRHS);
And->takeName(Op);
return BinaryOperator::CreateXor(And, Together);
}
break;
case Instruction::Or:
if (Op->hasOneUse()){
- if (Together != OpRHS) {
- // (X | C1) & C2 --> (X | (C1&C2)) & C2
- Value *Or = Builder->CreateOr(X, Together);
- Or->takeName(Op);
- return BinaryOperator::CreateAnd(Or, AndRHS);
- }
-
ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together);
if (TogetherCI && !TogetherCI->isZero()){
// (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1
// NOTE: This reduces the number of bits set in the & mask, which
// can expose opportunities for store narrowing.
Together = ConstantExpr::getXor(AndRHS, Together);
- Value *And = Builder->CreateAnd(X, Together);
+ Value *And = Builder.CreateAnd(X, Together);
And->takeName(Op);
return BinaryOperator::CreateOr(And, OpRHS);
}
@@ -194,17 +164,17 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
const APInt& AddRHS = OpRHS->getValue();
// Check to see if any bits below the one bit set in AndRHSV are set.
- if ((AddRHS & (AndRHSV-1)) == 0) {
+ if ((AddRHS & (AndRHSV - 1)).isNullValue()) {
      // If not, the only thing that can affect the output of the AND is
// the bit specified by AndRHSV. If that bit is set, the effect of
// the XOR is to toggle the bit. If it is clear, then the ADD has
// no effect.
- if ((AddRHS & AndRHSV) == 0) { // Bit is not set, noop
+ if ((AddRHS & AndRHSV).isNullValue()) { // Bit is not set, noop
TheAnd.setOperand(0, X);
return &TheAnd;
} else {
// Pull the XOR out of the AND.
- Value *NewAnd = Builder->CreateAnd(X, AndRHS);
+ Value *NewAnd = Builder.CreateAnd(X, AndRHS);
NewAnd->takeName(Op);
return BinaryOperator::CreateXor(NewAnd, AndRHS);
}
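The Xor case above is AND distributing over XOR, (X ^ C1) & C2 == (X & C2) ^ (C1 & C2); the Add case relies on the related fact that, when no lower bit can carry into the tested bit, addition acts like XOR on that bit. A brute-force standalone C++ check of the distributive identity over 8-bit values:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C1 = 0; C1 < 256; ++C1)
      for (unsigned C2 = 0; C2 < 256; C2 += 17) // sampled masks keep this fast
        assert(((X ^ C1) & C2) == ((X & C2) ^ (C1 & C2)));
  return 0;
}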
@@ -220,7 +190,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
uint32_t BitWidth = AndRHS->getType()->getBitWidth();
uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal));
- ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShlMask);
+ ConstantInt *CI = Builder.getInt(AndRHS->getValue() & ShlMask);
if (CI->getValue() == ShlMask)
// Masking out bits that the shift already masks.
@@ -240,7 +210,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
uint32_t BitWidth = AndRHS->getType()->getBitWidth();
uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
- ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShrMask);
+ ConstantInt *CI = Builder.getInt(AndRHS->getValue() & ShrMask);
if (CI->getValue() == ShrMask)
// Masking out bits that the shift already masks.
@@ -260,12 +230,12 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
uint32_t BitWidth = AndRHS->getType()->getBitWidth();
uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
- Constant *C = Builder->getInt(AndRHS->getValue() & ShrMask);
+ Constant *C = Builder.getInt(AndRHS->getValue() & ShrMask);
if (C == AndRHS) { // Masking out bits shifted in.
// (Val ashr C1) & C2 -> (Val lshr C1) & C2
// Make the argument unsigned.
Value *ShVal = Op->getOperand(0);
- ShVal = Builder->CreateLShr(ShVal, OpRHS, Op->getName());
+ ShVal = Builder.CreateLShr(ShVal, OpRHS, Op->getName());
return BinaryOperator::CreateAnd(ShVal, AndRHS, TheAnd.getName());
}
}
@@ -291,189 +261,102 @@ Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) {
Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred;
- return Builder->CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
+ return Builder.CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
}
// V >= Lo && V < Hi --> V - Lo u< Hi - Lo
// V < Lo || V >= Hi --> V - Lo u>= Hi - Lo
Value *VMinusLo =
- Builder->CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
+ Builder.CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo);
- return Builder->CreateICmp(Pred, VMinusLo, HiMinusLo);
+ return Builder.CreateICmp(Pred, VMinusLo, HiMinusLo);
}
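The rewrite above collapses a two-compare range check into one unsigned compare: for Lo <= Hi, V >= Lo && V < Hi holds iff (V - Lo) u< (Hi - Lo), since the subtraction wraps any V below Lo up to a large unsigned value. A standalone C++ check (unsigned semantics assumed, as in the code above):

#include <cassert>
#include <cstdint>

static bool twoCompares(std::uint32_t V, std::uint32_t Lo, std::uint32_t Hi) {
  return V >= Lo && V < Hi;
}

static bool oneCompare(std::uint32_t V, std::uint32_t Lo, std::uint32_t Hi) {
  return (V - Lo) < (Hi - Lo); // single unsigned comparison
}

int main() {
  for (std::uint32_t V = 0; V < 300; ++V)
    assert(twoCompares(V, 50, 200) == oneCompare(V, 50, 200));
  return 0;
}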
-/// Returns true iff Val consists of one contiguous run of 1s with any number
-/// of 0s on either side. The 1s are allowed to wrap from LSB to MSB,
-/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs. 0x0F0F0000 is
-/// not, since all 1s are not contiguous.
-static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
- const APInt& V = Val->getValue();
- uint32_t BitWidth = Val->getType()->getBitWidth();
- if (!APIntOps::isShiftedMask(BitWidth, V)) return false;
-
- // look for the first zero bit after the run of ones
- MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
- // look for the first non-zero bit
- ME = V.getActiveBits();
- return true;
-}
-
-/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines
-/// whether the operator is a sub. If we can fold one of the following xforms:
+/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
+/// that can be simplified.
+/// One of A and B is considered the mask. The other is the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum contains
+/// only "Mask", then both A and B can be considered masks. If A is the mask,
+/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
+/// If both A and C are constants, this proof is also easy.
+/// For the following explanations, we assume that A is the mask.
///
-/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
-/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
-/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
+/// bits of A are set in B.
+/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
///
-/// return (A +/- B).
+/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
+/// bits of A are cleared in B.
+/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
+///
+/// "Mixed" declares that (A & B) == C and C might or might not contain any
+/// number of one bits and zero bits.
+/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed
+///
+/// "Not" means that in above descriptions "==" should be replaced by "!=".
+/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
///
-Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
- ConstantInt *Mask, bool isSub,
- Instruction &I) {
- Instruction *LHSI = dyn_cast<Instruction>(LHS);
- if (!LHSI || LHSI->getNumOperands() != 2 ||
- !isa<ConstantInt>(LHSI->getOperand(1))) return nullptr;
-
- ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1));
-
- switch (LHSI->getOpcode()) {
- default: return nullptr;
- case Instruction::And:
- if (ConstantExpr::getAnd(N, Mask) == Mask) {
- // If the AndRHS is a power of two minus one (0+1+), this is simple.
- if ((Mask->getValue().countLeadingZeros() +
- Mask->getValue().countPopulation()) ==
- Mask->getValue().getBitWidth())
- break;
-
- // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+
- // part, we don't need any explicit masks to take them out of A. If that
- // is all N is, ignore it.
- uint32_t MB = 0, ME = 0;
- if (isRunOfOnes(Mask, MB, ME)) { // begin/end bit of run, inclusive
- uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
- APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
- if (MaskedValueIsZero(RHS, Mask, 0, &I))
- break;
- }
- }
- return nullptr;
- case Instruction::Or:
- case Instruction::Xor:
- // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
- if ((Mask->getValue().countLeadingZeros() +
- Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
- && ConstantExpr::getAnd(N, Mask)->isNullValue())
- break;
- return nullptr;
- }
-
- if (isSub)
- return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold");
- return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold");
-}
-
-/// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C)
-/// One of A and B is considered the mask, the other the value. This is
-/// described as the "AMask" or "BMask" part of the enum. If the enum
-/// contains only "Mask", then both A and B can be considered masks.
-/// If A is the mask, then it was proven, that (A & C) == C. This
-/// is trivial if C == A, or C == 0. If both A and C are constants, this
-/// proof is also easy.
-/// For the following explanations we assume that A is the mask.
-/// The part "AllOnes" declares, that the comparison is true only
-/// if (A & B) == A, or all bits of A are set in B.
-/// Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes
-/// The part "AllZeroes" declares, that the comparison is true only
-/// if (A & B) == 0, or all bits of A are cleared in B.
-/// Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes
-/// The part "Mixed" declares, that (A & B) == C and C might or might not
-/// contain any number of one bits and zero bits.
-/// Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed
-/// The Part "Not" means, that in above descriptions "==" should be replaced
-/// by "!=".
-/// Example: (icmp ne (A & 3), 3) -> FoldMskICmp_AMask_NotAllOnes
/// If the mask A contains a single bit, then the following is equivalent:
/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
enum MaskedICmpType {
- FoldMskICmp_AMask_AllOnes = 1,
- FoldMskICmp_AMask_NotAllOnes = 2,
- FoldMskICmp_BMask_AllOnes = 4,
- FoldMskICmp_BMask_NotAllOnes = 8,
- FoldMskICmp_Mask_AllZeroes = 16,
- FoldMskICmp_Mask_NotAllZeroes = 32,
- FoldMskICmp_AMask_Mixed = 64,
- FoldMskICmp_AMask_NotMixed = 128,
- FoldMskICmp_BMask_Mixed = 256,
- FoldMskICmp_BMask_NotMixed = 512
+ AMask_AllOnes = 1,
+ AMask_NotAllOnes = 2,
+ BMask_AllOnes = 4,
+ BMask_NotAllOnes = 8,
+ Mask_AllZeros = 16,
+ Mask_NotAllZeros = 32,
+ AMask_Mixed = 64,
+ AMask_NotMixed = 128,
+ BMask_Mixed = 256,
+ BMask_NotMixed = 512
};
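Concretely, with the constant 3 as the mask, the three "eq" examples in the comment pin down the low two bits of the other operand. A standalone C++ restatement of what each form asserts:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 8; ++X) {
    bool AllOnes = (X & 3u) == 3u;  // both mask bits set in X
    bool AllZeros = (X & 3u) == 0u; // both mask bits clear in X
    bool Mixed = (X & 3u) == 1u;    // bit 0 set, bit 1 clear in X
    assert(AllOnes == ((X & 1u) && (X & 2u)));
    assert(AllZeros == (!(X & 1u) && !(X & 2u)));
    assert(Mixed == ((X & 1u) && !(X & 2u)));
  }
  return 0;
}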
-/// Return the set of pattern classes (from MaskedICmpType)
-/// that (icmp SCC (A & B), C) satisfies.
-static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
- ICmpInst::Predicate SCC)
-{
+/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C)
+/// satisfies.
+static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
+ ICmpInst::Predicate Pred) {
ConstantInt *ACst = dyn_cast<ConstantInt>(A);
ConstantInt *BCst = dyn_cast<ConstantInt>(B);
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
- bool icmp_eq = (SCC == ICmpInst::ICMP_EQ);
- bool icmp_abit = (ACst && !ACst->isZero() &&
- ACst->getValue().isPowerOf2());
- bool icmp_bbit = (BCst && !BCst->isZero() &&
- BCst->getValue().isPowerOf2());
- unsigned result = 0;
+ bool IsEq = (Pred == ICmpInst::ICMP_EQ);
+ bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
+ bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
+ unsigned MaskVal = 0;
if (CCst && CCst->isZero()) {
// if C is zero, then both A and B qualify as mask
- result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes |
- FoldMskICmp_AMask_Mixed |
- FoldMskICmp_BMask_Mixed)
- : (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_AMask_NotMixed |
- FoldMskICmp_BMask_NotMixed));
- if (icmp_abit)
- result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes |
- FoldMskICmp_AMask_NotMixed)
- : (FoldMskICmp_AMask_AllOnes |
- FoldMskICmp_AMask_Mixed));
- if (icmp_bbit)
- result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes |
- FoldMskICmp_BMask_NotMixed)
- : (FoldMskICmp_BMask_AllOnes |
- FoldMskICmp_BMask_Mixed));
- return result;
+ MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
+ : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
+ : (AMask_AllOnes | AMask_Mixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
+ : (BMask_AllOnes | BMask_Mixed));
+ return MaskVal;
}
+
if (A == C) {
- result |= (icmp_eq ? (FoldMskICmp_AMask_AllOnes |
- FoldMskICmp_AMask_Mixed)
- : (FoldMskICmp_AMask_NotAllOnes |
- FoldMskICmp_AMask_NotMixed));
- if (icmp_abit)
- result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_AMask_NotMixed)
- : (FoldMskICmp_Mask_AllZeroes |
- FoldMskICmp_AMask_Mixed));
- } else if (ACst && CCst &&
- ConstantExpr::getAnd(ACst, CCst) == CCst) {
- result |= (icmp_eq ? FoldMskICmp_AMask_Mixed
- : FoldMskICmp_AMask_NotMixed);
+ MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
+ : (AMask_NotAllOnes | AMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
+ : (Mask_AllZeros | AMask_Mixed));
+ } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
+ MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
}
+
if (B == C) {
- result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes |
- FoldMskICmp_BMask_Mixed)
- : (FoldMskICmp_BMask_NotAllOnes |
- FoldMskICmp_BMask_NotMixed));
- if (icmp_bbit)
- result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
- FoldMskICmp_BMask_NotMixed)
- : (FoldMskICmp_Mask_AllZeroes |
- FoldMskICmp_BMask_Mixed));
- } else if (BCst && CCst &&
- ConstantExpr::getAnd(BCst, CCst) == CCst) {
- result |= (icmp_eq ? FoldMskICmp_BMask_Mixed
- : FoldMskICmp_BMask_NotMixed);
- }
- return result;
+ MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
+ : (BMask_NotAllOnes | BMask_NotMixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
+ : (Mask_AllZeros | BMask_Mixed));
+ } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
+ MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
+ }
+
+ return MaskVal;
}
/// Convert an analysis of a masked ICmp into its equivalent if all boolean
@@ -482,32 +365,30 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
/// involves swapping those bits over.
static unsigned conjugateICmpMask(unsigned Mask) {
unsigned NewMask;
- NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes |
- FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed |
- FoldMskICmp_BMask_Mixed))
+ NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
+ AMask_Mixed | BMask_Mixed))
<< 1;
- NewMask |=
- (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes |
- FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed |
- FoldMskICmp_BMask_NotMixed))
- >> 1;
+ NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
+ AMask_NotMixed | BMask_NotMixed))
+ >> 1;
return NewMask;
}
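The enum values are deliberately laid out in adjacent pairs (each plain form at an even bit position, its "Not" form one bit higher), so conjugation is just two masked shifts. A standalone C++ re-implementation of the same trick against the constants above:

#include <cassert>

enum MaskedICmpType {
  AMask_AllOnes = 1,   AMask_NotAllOnes = 2,
  BMask_AllOnes = 4,   BMask_NotAllOnes = 8,
  Mask_AllZeros = 16,  Mask_NotAllZeros = 32,
  AMask_Mixed = 64,    AMask_NotMixed = 128,
  BMask_Mixed = 256,   BMask_NotMixed = 512
};

static unsigned conjugate(unsigned Mask) {
  const unsigned Pos = AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
                       AMask_Mixed | BMask_Mixed; // the non-"Not" forms
  return ((Mask & Pos) << 1) | ((Mask & (Pos << 1)) >> 1);
}

int main() {
  assert(conjugate(AMask_AllOnes) == AMask_NotAllOnes);
  assert(conjugate(AMask_NotAllOnes) == AMask_AllOnes);
  assert(conjugate(Mask_AllZeros | BMask_Mixed) ==
         (Mask_NotAllZeros | BMask_NotMixed));
  return 0;
}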
-/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
-/// Return the set of pattern classes (from MaskedICmpType)
-/// that both LHS and RHS satisfy.
-static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
- Value*& B, Value*& C,
- Value*& D, Value*& E,
- ICmpInst *LHS, ICmpInst *RHS,
- ICmpInst::Predicate &LHSCC,
- ICmpInst::Predicate &RHSCC) {
- if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return 0;
+/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
+/// Return the set of pattern classes (from MaskedICmpType) that both LHS and
+/// RHS satisfy.
+static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+ Value *&D, Value *&E, ICmpInst *LHS,
+ ICmpInst *RHS,
+ ICmpInst::Predicate &PredL,
+ ICmpInst::Predicate &PredR) {
+ if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType())
+ return 0;
// vectors are not (yet?) supported
- if (LHS->getOperand(0)->getType()->isVectorTy()) return 0;
+ if (LHS->getOperand(0)->getType()->isVectorTy())
+ return 0;
// Here comes the tricky part:
// LHS might be of the form L11 & L12 == X, X == L21 & L22,
@@ -517,9 +398,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
// above.
Value *L1 = LHS->getOperand(0);
Value *L2 = LHS->getOperand(1);
- Value *L11,*L12,*L21,*L22;
+ Value *L11, *L12, *L21, *L22;
// Check whether the icmp can be decomposed into a bit test.
- if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) {
+ if (decomposeBitTestICmp(LHS, PredL, L11, L12, L2)) {
L21 = L22 = L1 = nullptr;
} else {
// Look for ANDs in the LHS icmp.
@@ -543,22 +424,26 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
}
// Bail if LHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(LHSCC))
+ if (!ICmpInst::isEquality(PredL))
return 0;
Value *R1 = RHS->getOperand(0);
Value *R2 = RHS->getOperand(1);
- Value *R11,*R12;
- bool ok = false;
- if (decomposeBitTestICmp(RHS, RHSCC, R11, R12, R2)) {
+ Value *R11, *R12;
+ bool Ok = false;
+ if (decomposeBitTestICmp(RHS, PredR, R11, R12, R2)) {
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11; D = R12;
+ A = R11;
+ D = R12;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12; D = R11;
+ A = R12;
+ D = R11;
} else {
return 0;
}
- E = R2; R1 = nullptr; ok = true;
+ E = R2;
+ R1 = nullptr;
+ Ok = true;
} else if (R1->getType()->isIntegerTy()) {
if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
// As before, model no mask as a trivial mask if it'll let us do an
@@ -568,60 +453,78 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
}
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11; D = R12; E = R2; ok = true;
+ A = R11;
+ D = R12;
+ E = R2;
+ Ok = true;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12; D = R11; E = R2; ok = true;
+ A = R12;
+ D = R11;
+ E = R2;
+ Ok = true;
}
}
// Bail if RHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(RHSCC))
+ if (!ICmpInst::isEquality(PredR))
return 0;
// Look for ANDs on the right side of the RHS icmp.
- if (!ok && R2->getType()->isIntegerTy()) {
+ if (!Ok && R2->getType()->isIntegerTy()) {
if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
R11 = R2;
R12 = Constant::getAllOnesValue(R2->getType());
}
if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11; D = R12; E = R1; ok = true;
+ A = R11;
+ D = R12;
+ E = R1;
+ Ok = true;
} else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12; D = R11; E = R1; ok = true;
+ A = R12;
+ D = R11;
+ E = R1;
+ Ok = true;
} else {
return 0;
}
}
- if (!ok)
+ if (!Ok)
return 0;
if (L11 == A) {
- B = L12; C = L2;
+ B = L12;
+ C = L2;
} else if (L12 == A) {
- B = L11; C = L2;
+ B = L11;
+ C = L2;
} else if (L21 == A) {
- B = L22; C = L1;
+ B = L22;
+ C = L1;
} else if (L22 == A) {
- B = L21; C = L1;
+ B = L21;
+ C = L1;
}
- unsigned LeftType = getTypeOfMaskedICmp(A, B, C, LHSCC);
- unsigned RightType = getTypeOfMaskedICmp(A, D, E, RHSCC);
+ unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
+ unsigned RightType = getMaskedICmpType(A, D, E, PredR);
return LeftType & RightType;
}
/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
/// into a single (icmp(A & X) ==/!= Y).
static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
- llvm::InstCombiner::BuilderTy *Builder) {
+ llvm::InstCombiner::BuilderTy &Builder) {
Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
- ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
- unsigned Mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
- LHSCC, RHSCC);
- if (Mask == 0) return nullptr;
- assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
- "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ unsigned Mask =
+ getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
+ if (Mask == 0)
+ return nullptr;
+
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
// In full generality:
// (icmp (A & B) Op C) | (icmp (A & D) Op E)
@@ -642,41 +545,43 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
Mask = conjugateICmpMask(Mask);
}
- if (Mask & FoldMskICmp_Mask_AllZeroes) {
+ if (Mask & Mask_AllZeros) {
// (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
// -> (icmp eq (A & (B|D)), 0)
- Value *NewOr = Builder->CreateOr(B, D);
- Value *NewAnd = Builder->CreateAnd(A, NewOr);
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
// We can't use C as zero because we might actually handle
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
// with B and D having a single bit set.
Value *Zero = Constant::getNullValue(A->getType());
- return Builder->CreateICmp(NewCC, NewAnd, Zero);
+ return Builder.CreateICmp(NewCC, NewAnd, Zero);
}
- if (Mask & FoldMskICmp_BMask_AllOnes) {
+ if (Mask & BMask_AllOnes) {
// (icmp eq (A & B), B) & (icmp eq (A & D), D)
// -> (icmp eq (A & (B|D)), (B|D))
- Value *NewOr = Builder->CreateOr(B, D);
- Value *NewAnd = Builder->CreateAnd(A, NewOr);
- return Builder->CreateICmp(NewCC, NewAnd, NewOr);
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr);
}
- if (Mask & FoldMskICmp_AMask_AllOnes) {
+ if (Mask & AMask_AllOnes) {
// (icmp eq (A & B), A) & (icmp eq (A & D), A)
// -> (icmp eq (A & (B&D)), A)
- Value *NewAnd1 = Builder->CreateAnd(B, D);
- Value *NewAnd2 = Builder->CreateAnd(A, NewAnd1);
- return Builder->CreateICmp(NewCC, NewAnd2, A);
+ Value *NewAnd1 = Builder.CreateAnd(B, D);
+ Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1);
+ return Builder.CreateICmp(NewCC, NewAnd2, A);
}
// Remaining cases assume at least that B and D are constant, and depend on
// their actual values. This isn't strictly necessary, just a "handle the
// easy cases for now" decision.
ConstantInt *BCst = dyn_cast<ConstantInt>(B);
- if (!BCst) return nullptr;
+ if (!BCst)
+ return nullptr;
ConstantInt *DCst = dyn_cast<ConstantInt>(D);
- if (!DCst) return nullptr;
+ if (!DCst)
+ return nullptr;
- if (Mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+ if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
// (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
// -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
@@ -689,7 +594,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
else if (NewMask == DCst->getValue())
return RHS;
}
- if (Mask & FoldMskICmp_AMask_NotAllOnes) {
+
+ if (Mask & AMask_NotAllOnes) {
// (icmp ne (A & B), B) & (icmp ne (A & D), D)
// -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
// Only valid if one of the masks is a superset of the other (check "B|D" is
@@ -701,7 +607,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
else if (NewMask == DCst->getValue())
return RHS;
}
- if (Mask & FoldMskICmp_BMask_Mixed) {
+
+ if (Mask & BMask_Mixed) {
// (icmp eq (A & B), C) & (icmp eq (A & D), E)
// We already know that B & C == C && D & E == E.
// If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
@@ -713,23 +620,28 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
// (icmp ne (A & B), B) & (icmp eq (A & D), D)
// with B and D having a single bit set.
ConstantInt *CCst = dyn_cast<ConstantInt>(C);
- if (!CCst) return nullptr;
+ if (!CCst)
+ return nullptr;
ConstantInt *ECst = dyn_cast<ConstantInt>(E);
- if (!ECst) return nullptr;
- if (LHSCC != NewCC)
+ if (!ECst)
+ return nullptr;
+ if (PredL != NewCC)
CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
- if (RHSCC != NewCC)
+ if (PredR != NewCC)
ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
// If there is a conflict, we should actually return a false for the
// whole construct.
if (((BCst->getValue() & DCst->getValue()) &
- (CCst->getValue() ^ ECst->getValue())) != 0)
+ (CCst->getValue() ^ ECst->getValue())).getBoolValue())
return ConstantInt::get(LHS->getType(), !IsAnd);
- Value *NewOr1 = Builder->CreateOr(B, D);
+
+ Value *NewOr1 = Builder.CreateOr(B, D);
Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
- Value *NewAnd = Builder->CreateAnd(A, NewOr1);
- return Builder->CreateICmp(NewCC, NewAnd, NewOr2);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr1);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr2);
}
+
return nullptr;
}
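
The three unconditional rewrites above (Mask_AllZeros, BMask_AllOnes, AMask_AllOnes) are plain set identities on mask bits. A quick exhaustive check over small unsigned values (standalone C++, not part of the patch; uint32_t stands in for the IR integers):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t A = 0; A < 64; ++A)
        for (uint32_t B = 0; B < 64; ++B)
          for (uint32_t D = 0; D < 64; ++D) {
            // Mask_AllZeros: A misses B and misses D iff it misses B|D.
            assert((((A & B) == 0) && ((A & D) == 0)) ==
                   ((A & (B | D)) == 0));
            // BMask_AllOnes: A covers B and covers D iff it covers B|D.
            assert((((A & B) == B) && ((A & D) == D)) ==
                   ((A & (B | D)) == (B | D)));
            // AMask_AllOnes: A is inside B and inside D iff inside B&D.
            assert((((A & B) == A) && ((A & D) == A)) ==
                   ((A & (B & D)) == A));
          }
    }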
@@ -778,23 +690,123 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
}
// This simplification is only valid if the upper range is not negative.
- bool IsNegative, IsNotNegative;
- ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1);
- if (!IsNotNegative)
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
return nullptr;
if (Inverted)
NewPred = ICmpInst::getInversePredicate(NewPred);
- return Builder->CreateICmp(NewPred, Input, RangeEnd);
+ return Builder.CreateICmp(NewPred, Input, RangeEnd);
+}
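
The head of simplifyRangeCheck is outside this hunk; assuming the canonical (icmp sgt X, -1) & (icmp slt X, End) shape it matches, the two signed compares collapse into one unsigned compare exactly when End is non-negative, which is what the KnownBits query guards. A standalone i8 check (not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int End = 0; End <= 127; ++End)  // End known non-negative
        for (int I = -128; I <= 127; ++I) {
          int8_t X = (int8_t)I;
          bool SignedPair = (X > -1) && (X < (int8_t)End);
          bool UnsignedOne = (uint8_t)X < (uint8_t)End;
          assert(SignedPair == UnsignedOne);
        }
    }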
+
+static Value *
+foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X = LHS->getOperand(0);
+ if (X != RHS->getOperand(0))
+ return nullptr;
+
+ const APInt *C1, *C2;
+ if (!match(LHS->getOperand(1), m_APInt(C1)) ||
+ !match(RHS->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // We only handle (X != C1 && X != C2) and (X == C1 || X == C2).
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // The larger unsigned constant goes on the right.
+ if (C1->ugt(*C2))
+ std::swap(C1, C2);
+
+ APInt Xor = *C1 ^ *C2;
+ if (Xor.isPowerOf2()) {
+ // If C1 and C2 differ by only one bit, then set that bit in X and
+ // compare against the larger constant:
+ // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2
+ // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2
+ // We choose an 'or' with a Pow2 constant rather than the inverse mask with
+ // 'and' because that may lead to smaller codegen from a smaller constant.
+ Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor));
+ return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
+ }
+
+ // Special case: get the ordering right when the values wrap around zero.
+ // I.e., we assumed the constants were unsigned when swapping earlier.
+ if (C1->isNullValue() && C2->isAllOnesValue())
+ std::swap(C1, C2);
+
+ if (*C1 == *C2 - 1) {
+ // (X == 13 || X == 14) --> X - 13 <=u 1
+ // (X != 13 && X != 14) --> X - 13 >u 1
+ // An 'add' is the canonical IR form, so favor that over a 'sub'.
+ Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
+ return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
+ }
+
+ return nullptr;
+}
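
Both rewrites in the new helper are easy to check exhaustively on i8. A standalone sketch (not part of the patch; 13/14 and 0x20/0x24 are arbitrary constants satisfying the C1 == C2 - 1 and power-of-2-xor preconditions):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t C1 = 13, C2 = 14;     // adjacent: C1 == C2 - 1
      const uint8_t P1 = 0x20, P2 = 0x24; // P1 ^ P2 is a power of 2
      for (unsigned I = 0; I < 256; ++I) {
        uint8_t X = (uint8_t)I;
        // (X == C1 || X == C2) --> X - C1 <=u 1
        assert(((X == C1) || (X == C2)) == ((uint8_t)(X - C1) <= 1));
        // (X == P1 || X == P2) --> (X | (P1 ^ P2)) == P2 (larger constant)
        assert(((X == P1) || (X == P2)) == ((uint8_t)(X | (P1 ^ P2)) == P2));
      }
    }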
+
+// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ Instruction &CxtI) {
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // TODO: support vector splats
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ if (!LHSC || !RHSC || !LHSC->isZero() || !RHSC->isZero())
+ return nullptr;
+
+ Value *A, *B, *C, *D;
+ if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
+ match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
+ if (A == D || B == D)
+ std::swap(C, D);
+ if (B == C)
+ std::swap(A, B);
+
+ if (A == C &&
+ isKnownToBeAPowerOfTwo(B, false, 0, &CxtI) &&
+ isKnownToBeAPowerOfTwo(D, false, 0, &CxtI)) {
+ Value *Mask = Builder.CreateOr(B, D);
+ Value *Masked = Builder.CreateAnd(A, Mask);
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ return Builder.CreateICmp(NewPred, Masked, Mask);
+ }
+ }
+
+ return nullptr;
}
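
With K1 and K2 as one-bit masks, both folds in the header comment are simple bit identities. A standalone check (not part of the patch; 1<<3 and 1<<5 are arbitrary single-bit constants):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t K1 = 1u << 3, K2 = 1u << 5; // one-bit masks
      for (uint32_t A = 0; A < 256; ++A) {
        // (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1|K2)) == (K1|K2)
        assert((((A & K1) != 0) && ((A & K2) != 0)) ==
               ((A & (K1 | K2)) == (K1 | K2)));
        // (iszero(A & K1) | iszero(A & K2)) -> (A & (K1|K2)) != (K1|K2)
        assert((((A & K1) == 0) || ((A & K2) == 0)) ==
               ((A & (K1 | K2)) != (K1 | K2)));
      }
    }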
/// Fold (icmp)&(icmp) if possible.
-Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
- ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+ Instruction &CxtI) {
+ // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+ // if K1 and K2 are one-bit masks.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI))
+ return V;
+
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
// (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
- if (PredicatesFoldable(LHSCC, RHSCC)) {
+ if (PredicatesFoldable(PredL, PredR)) {
if (LHS->getOperand(0) == RHS->getOperand(1) &&
LHS->getOperand(1) == RHS->getOperand(0))
LHS->swapOperands();
@@ -819,86 +831,91 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
return V;
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
+ return V;
+
// This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
- Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
- ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
- ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
- if (!LHSCst || !RHSCst) return nullptr;
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+ if (!LHSC || !RHSC)
+ return nullptr;
- if (LHSCst == RHSCst && LHSCC == RHSCC) {
+ if (LHSC == RHSC && PredL == PredR) {
// (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
// where C is a power of 2 or
// (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
- if ((LHSCC == ICmpInst::ICMP_ULT && LHSCst->getValue().isPowerOf2()) ||
- (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero())) {
- Value *NewOr = Builder->CreateOr(Val, Val2);
- return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+ if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
+ (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
+ Value *NewOr = Builder.CreateOr(LHS0, RHS0);
+ return Builder.CreateICmp(PredL, NewOr, LHSC);
}
}
// (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
// where CMAX is the all ones value for the truncated type,
// iff the lower bits of C2 and CA are zero.
- if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC &&
- LHS->hasOneUse() && RHS->hasOneUse()) {
+ if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
+ RHS->hasOneUse()) {
Value *V;
- ConstantInt *AndCst, *SmallCst = nullptr, *BigCst = nullptr;
+ ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
// (trunc x) == C1 & (and x, CA) == C2
// (and x, CA) == C2 & (trunc x) == C1
- if (match(Val2, m_Trunc(m_Value(V))) &&
- match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
- SmallCst = RHSCst;
- BigCst = LHSCst;
- } else if (match(Val, m_Trunc(m_Value(V))) &&
- match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
- SmallCst = LHSCst;
- BigCst = RHSCst;
+ if (match(RHS0, m_Trunc(m_Value(V))) &&
+ match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = RHSC;
+ BigC = LHSC;
+ } else if (match(LHS0, m_Trunc(m_Value(V))) &&
+ match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = LHSC;
+ BigC = RHSC;
}
- if (SmallCst && BigCst) {
- unsigned BigBitSize = BigCst->getType()->getBitWidth();
- unsigned SmallBitSize = SmallCst->getType()->getBitWidth();
+ if (SmallC && BigC) {
+ unsigned BigBitSize = BigC->getType()->getBitWidth();
+ unsigned SmallBitSize = SmallC->getType()->getBitWidth();
// Check that the low bits are zero.
APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
- if ((Low & AndCst->getValue()) == 0 && (Low & BigCst->getValue()) == 0) {
- Value *NewAnd = Builder->CreateAnd(V, Low | AndCst->getValue());
- APInt N = SmallCst->getValue().zext(BigBitSize) | BigCst->getValue();
- Value *NewVal = ConstantInt::get(AndCst->getType()->getContext(), N);
- return Builder->CreateICmp(LHSCC, NewAnd, NewVal);
+ if ((Low & AndC->getValue()).isNullValue() &&
+ (Low & BigC->getValue()).isNullValue()) {
+ Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue());
+ APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
+ Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
+ return Builder.CreateICmp(PredL, NewAnd, NewVal);
}
}
}
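
A concrete instance of the trunc/and fold above, with i8 truncated from i16: CA and C2 both have zero low bits, so the two compares merge into one mask-and-compare on the wide value. Standalone check (not part of the patch; the constants are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint16_t CA = 0x0F00, C2 = 0x0300; // low 8 bits of both are zero
      const uint8_t C1 = 0x5A;
      for (uint32_t I = 0; I <= 0xFFFF; ++I) {
        uint16_t X = (uint16_t)I;
        bool Orig = ((uint8_t)X == C1) && ((uint16_t)(X & CA) == C2);
        // CMAX for the i8 truncation is 0xFF.
        bool Folded = (uint16_t)(X & (CA | 0x00FF)) == (uint16_t)(C2 | C1);
        assert(Orig == Folded);
      }
    }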
// From here on, we only handle:
// (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
- if (Val != Val2) return nullptr;
+ if (LHS0 != RHS0)
+ return nullptr;
- // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
- if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
- RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
- LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
- RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
return nullptr;
// We can't fold (ugt x, C) & (sgt x, C2).
- if (!PredicatesFoldable(LHSCC, RHSCC))
+ if (!PredicatesFoldable(PredL, PredR))
return nullptr;
// Ensure that the larger constant is on the RHS.
bool ShouldSwap;
- if (CmpInst::isSigned(LHSCC) ||
- (ICmpInst::isEquality(LHSCC) &&
- CmpInst::isSigned(RHSCC)))
- ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
else
- ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
if (ShouldSwap) {
std::swap(LHS, RHS);
- std::swap(LHSCst, RHSCst);
- std::swap(LHSCC, RHSCC);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
}
// At this point, we know we have two icmp instructions
@@ -907,113 +924,55 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
// icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
// (from the icmp folding check above), that the two constants
// are not equal and that the larger constant is on the RHS
- assert(LHSCst != RHSCst && "Compares not folded above?");
+ assert(LHSC != RHSC && "Compares not folded above?");
- switch (LHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE: // (X == 13 & X != 15) -> X == 13
- case ICmpInst::ICMP_ULT: // (X == 13 & X < 15) -> X == 13
- case ICmpInst::ICMP_SLT: // (X == 13 & X < 15) -> X == 13
- return LHS;
- }
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_NE:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_ULT:
- if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
- return Builder->CreateICmpULT(Val, LHSCst);
- if (LHSCst->isNullValue()) // (X != 0 & X u< 14) -> X-1 u< 13
- return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+ if (LHSC == SubOne(RHSC)) // (X != 13 & X u< 14) -> X < 13
+ return Builder.CreateICmpULT(LHS0, LHSC);
+ if (LHSC->isZero()) // (X != 0 & X u< 14) -> X-1 u< 13
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
false, true);
- break; // (X != 13 & X u< 15) -> no change
+ break; // (X != 13 & X u< 15) -> no change
case ICmpInst::ICMP_SLT:
- if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
- return Builder->CreateICmpSLT(Val, LHSCst);
- break; // (X != 13 & X s< 15) -> no change
- case ICmpInst::ICMP_EQ: // (X != 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15
- case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
- return RHS;
+ if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13
+ return Builder.CreateICmpSLT(LHS0, LHSC);
+ break; // (X != 13 & X s< 15) -> no change
case ICmpInst::ICMP_NE:
- // Special case to get the ordering right when the values wrap around
- // zero.
- if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue())
- std::swap(LHSCst, RHSCst);
- if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
- Constant *AddCST = ConstantExpr::getNeg(LHSCst);
- Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
- return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1),
- Val->getName()+".cmp");
- }
- break; // (X != 13 & X != 15) -> no change
- }
- break;
- case ICmpInst::ICMP_ULT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 & X == 15) -> false
- case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
- case ICmpInst::ICMP_SGT: // (X u< 13 & X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u< 13 & X != 15) -> X u< 13
- case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13
- return LHS;
- case ICmpInst::ICMP_SLT: // (X u< 13 & X s< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_SLT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_UGT: // (X s< 13 & X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s< 13 & X != 15) -> X < 13
- case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13
- return LHS;
- case ICmpInst::ICMP_ULT: // (X s< 13 & X u< 15) -> no change
+ // Potential folds for this case should already be handled.
break;
}
break;
case ICmpInst::ICMP_UGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u> 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15
- return RHS;
- case ICmpInst::ICMP_SGT: // (X u> 13 & X s> 15) -> no change
- break;
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_NE:
- if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
- return Builder->CreateICmp(LHSCC, Val, RHSCst);
- break; // (X u> 13 & X != 15) -> no change
- case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
- return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+ if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ break; // (X u> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
false, true);
- case ICmpInst::ICMP_SLT: // (X u> 13 & X s< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_SGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s> 13 & X == 15) -> X == 15
- case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15
- return RHS;
- case ICmpInst::ICMP_UGT: // (X s> 13 & X u> 15) -> no change
- break;
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_NE:
- if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
- return Builder->CreateICmp(LHSCC, Val, RHSCst);
- break; // (X s> 13 & X != 15) -> no change
- case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
- return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
- true, true);
- case ICmpInst::ICMP_ULT: // (X s> 13 & X u< 15) -> no change
- break;
+ if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ break; // (X s> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
+ true);
}
break;
}
@@ -1023,7 +982,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
/// Optimize (fcmp)&(fcmp). NOTE: Unlike the rest of instcombine, this returns
/// a Value which should already be inserted into the function.
-Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+Value *InstCombiner::foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
@@ -1058,15 +1017,15 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
// If either of the constants are nans, then the whole thing returns
// false.
if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
- return Builder->getFalse();
- return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
+ return Builder.getFalse();
+ return Builder.CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
}
// Handle vector zeros. This occurs because the canonical form of
// "fcmp ord x,x" is "fcmp ord x, 0".
if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
isa<ConstantAggregateZero>(RHS->getOperand(1)))
- return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
+ return Builder.CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
return nullptr;
}
@@ -1077,26 +1036,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
/// (~A & ~B) == (~(A | B))
/// (~A | ~B) == (~(A & B))
static Instruction *matchDeMorgansLaws(BinaryOperator &I,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
auto Opcode = I.getOpcode();
assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
"Trying to match De Morgan's Laws with something other than and/or");
+
// Flip the logic operation.
- if (Opcode == Instruction::And)
- Opcode = Instruction::Or;
- else
- Opcode = Instruction::And;
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- // TODO: Use pattern matchers instead of dyn_cast.
- if (Value *Op0NotVal = dyn_castNotVal(Op0))
- if (Value *Op1NotVal = dyn_castNotVal(Op1))
- if (Op0->hasOneUse() && Op1->hasOneUse()) {
- Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal,
- I.getName() + ".demorgan");
- return BinaryOperator::CreateNot(LogicOp);
- }
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ !IsFreeToInvert(A, A->hasOneUse()) &&
+ !IsFreeToInvert(B, B->hasOneUse())) {
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
return nullptr;
}
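
The two laws being matched, checked exhaustively on i8 (standalone C++, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned I = 0; I < 256; ++I)
        for (unsigned J = 0; J < 256; ++J) {
          uint8_t A = (uint8_t)I, B = (uint8_t)J;
          assert((uint8_t)(~A & ~B) == (uint8_t)~(A | B));
          assert((uint8_t)(~A | ~B) == (uint8_t)~(A & B));
        }
    }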
@@ -1125,7 +1080,7 @@ bool InstCombiner::shouldOptimizeCast(CastInst *CI) {
/// Fold {and,or,xor} (cast X), C.
static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
Constant *C;
if (!match(Logic.getOperand(1), m_Constant(C)))
return nullptr;
@@ -1134,26 +1089,17 @@ static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
Type *DestTy = Logic.getType();
Type *SrcTy = Cast->getSrcTy();
- // If the first operand is bitcast, move the logic operation ahead of the
- // bitcast (do the logic operation in the original type). This can eliminate
- // bitcasts and allow combines that would otherwise be impeded by the bitcast.
+ // Move the logic operation ahead of a zext if the constant is unchanged in
+ // the smaller source type. Performing the logic in a smaller type may provide
+ // more information to later folds, and the smaller logic instruction may be
+ // cheaper (particularly in the case of vectors).
Value *X;
- if (match(Cast, m_BitCast(m_Value(X)))) {
- Value *NewConstant = ConstantExpr::getBitCast(C, SrcTy);
- Value *NewOp = Builder->CreateBinOp(LogicOpc, X, NewConstant);
- return CastInst::CreateBitOrPointerCast(NewOp, DestTy);
- }
-
- // Similarly, move the logic operation ahead of a zext if the constant is
- // unchanged in the smaller source type. Performing the logic in a smaller
- // type may provide more information to later folds, and the smaller logic
- // instruction may be cheaper (particularly in the case of vectors).
if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy);
if (ZextTruncC == C) {
// LogicOpc (zext X), C --> zext (LogicOpc X, C)
- Value *NewOp = Builder->CreateBinOp(LogicOpc, X, TruncC);
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
return new ZExtInst(NewOp, DestTy);
}
}
@@ -1196,7 +1142,7 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
// fold logic(cast(A), cast(B)) -> cast(logic(A, B))
if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) {
- Value *NewOp = Builder->CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
I.getName());
return CastInst::Create(CastOpcode, NewOp, DestTy);
}
@@ -1210,8 +1156,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
if (ICmp0 && ICmp1) {
- Value *Res = LogicOpc == Instruction::And ? FoldAndOfICmps(ICmp0, ICmp1)
- : FoldOrOfICmps(ICmp0, ICmp1, &I);
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
+ : foldOrOfICmps(ICmp0, ICmp1, I);
if (Res)
return CastInst::Create(CastOpcode, Res, DestTy);
return nullptr;
@@ -1222,8 +1168,8 @@ Instruction *InstCombiner::foldCastedBitwiseLogic(BinaryOperator &I) {
FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
if (FCmp0 && FCmp1) {
- Value *Res = LogicOpc == Instruction::And ? FoldAndOfFCmps(FCmp0, FCmp1)
- : FoldOrOfFCmps(FCmp0, FCmp1);
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfFCmps(FCmp0, FCmp1)
+ : foldOrOfFCmps(FCmp0, FCmp1);
if (Res)
return CastInst::Create(CastOpcode, Res, DestTy);
return nullptr;
@@ -1242,15 +1188,14 @@ static Instruction *foldBoolSextMaskToSelect(BinaryOperator &I) {
// Fold (and (sext bool to A), B) --> (select bool, B, 0)
Value *X = nullptr;
- if (match(Op0, m_SExt(m_Value(X))) &&
- X->getType()->getScalarType()->isIntegerTy(1)) {
+ if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
Value *Zero = Constant::getNullValue(Op1->getType());
return SelectInst::Create(X, Op1, Zero);
}
// Fold (and ~(sext bool to A), B) --> (select bool, 0, B)
if (match(Op0, m_Not(m_SExt(m_Value(X)))) &&
- X->getType()->getScalarType()->isIntegerTy(1)) {
+ X->getType()->isIntOrIntVectorTy(1)) {
Value *Zero = Constant::getNullValue(Op0->getType());
return SelectInst::Create(X, Zero, Op1);
}
@@ -1258,6 +1203,58 @@ static Instruction *foldBoolSextMaskToSelect(BinaryOperator &I) {
return nullptr;
}
+static Instruction *foldAndToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::And);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'or' is Op0.
+ // (A | B) & ~(A & B) --> A ^ B
+ // (A | B) & ~(B & A) --> A ^ B
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A | ~B) & (~A | B) --> ~(A ^ B)
+ // (A | ~B) & (B | ~A) --> ~(A ^ B)
+ // (~B | A) & (~A | B) --> ~(A ^ B)
+ // (~B | A) & (B | ~A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
+
+static Instruction *foldOrToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Or);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'and' is Op0.
+ // (A & B) | ~(A | B) --> ~(A ^ B)
+ // (A & B) | ~(B | A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ // (A & ~B) | (~A & B) --> A ^ B
+ // (A & ~B) | (B & ~A) --> A ^ B
+ // (~B & A) | (~A & B) --> A ^ B
+ // (~B & A) | (B & ~A) --> A ^ B
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))
+ return BinaryOperator::CreateXor(A, B);
+
+ return nullptr;
+}
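
All four and/or-to-xor shapes handled by foldAndToXor and foldOrToXor reduce to two bit identities plus commutativity. An exhaustive i8 check (standalone, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned I = 0; I < 256; ++I)
        for (unsigned J = 0; J < 256; ++J) {
          uint8_t A = (uint8_t)I, B = (uint8_t)J;
          // foldAndToXor shapes.
          assert((uint8_t)((A | B) & ~(A & B)) == (uint8_t)(A ^ B));
          assert((uint8_t)((A | ~B) & (~A | B)) == (uint8_t)~(A ^ B));
          // foldOrToXor shapes.
          assert((uint8_t)((A & B) | ~(A | B)) == (uint8_t)~(A ^ B));
          assert((uint8_t)((A & ~B) | (~A & B)) == (uint8_t)(A ^ B));
        }
    }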
+
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
@@ -1268,11 +1265,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyAndInst(Op0, Op1, DL, &TLI, &DT, &AC))
- return replaceInstUsesWith(I, V);
-
- // (A|B)&(A|C) -> A|(B&C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
+ if (Value *V = SimplifyAndInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// See if we can simplify any instructions used by the instruction whose sole
@@ -1280,9 +1273,27 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
if (SimplifyDemandedInstructionBits(I))
return &I;
- if (Value *V = SimplifyBSwap(I))
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldAndToXor(I, Builder))
+ return Xor;
+
+ // (A|B)&(A|C) -> A|(B&C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (match(Op1, m_One())) {
+ // (1 << x) & 1 --> zext(x == 0)
+ // (1 >> x) & 1 --> zext(x == 0)
+ Value *X;
+ if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X))))) {
+ Value *IsZero = Builder.CreateICmpEQ(X, ConstantInt::get(I.getType(), 0));
+ return new ZExtInst(IsZero, I.getType());
+ }
+ }
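
The new (1 << x) & 1 fold: the low bit of a shifted 1 survives only for a shift of zero, in both shift directions. A standalone check over in-range shift amounts (not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X = 0; X < 32; ++X) { // keep shift amounts in range
        assert((((1u << X) & 1u) != 0) == (X == 0));
        assert((((1u >> X) & 1u) != 0) == (X == 0));
      }
    }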
+
if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
const APInt &AndRHSMask = AndRHS->getValue();
@@ -1300,65 +1311,47 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
APInt NotAndRHS(~AndRHSMask);
if (MaskedValueIsZero(Op0LHS, NotAndRHS, 0, &I)) {
// Not masking anything out for the LHS, move to RHS.
- Value *NewRHS = Builder->CreateAnd(Op0RHS, AndRHS,
- Op0RHS->getName()+".masked");
+ Value *NewRHS = Builder.CreateAnd(Op0RHS, AndRHS,
+ Op0RHS->getName() + ".masked");
return BinaryOperator::Create(Op0I->getOpcode(), Op0LHS, NewRHS);
}
if (!isa<Constant>(Op0RHS) &&
MaskedValueIsZero(Op0RHS, NotAndRHS, 0, &I)) {
// Not masking anything out for the RHS, move to LHS.
- Value *NewLHS = Builder->CreateAnd(Op0LHS, AndRHS,
- Op0LHS->getName()+".masked");
+ Value *NewLHS = Builder.CreateAnd(Op0LHS, AndRHS,
+ Op0LHS->getName() + ".masked");
return BinaryOperator::Create(Op0I->getOpcode(), NewLHS, Op0RHS);
}
break;
}
- case Instruction::Add:
- // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
- // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
- // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
- if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
- return BinaryOperator::CreateAnd(V, AndRHS);
- if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
- return BinaryOperator::CreateAnd(V, AndRHS); // Add commutes
- break;
+ }
+ // ((C1 OP zext(X)) & C2) -> zext((C1 OP X) & C2) if C2 fits in the bitwidth
+ // of X and OP behaves well when given trunc(C1) and X.
+ switch (Op0I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Xor:
+ case Instruction::Or:
+ case Instruction::Mul:
+ case Instruction::Add:
case Instruction::Sub:
- // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS.
- // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
- // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
- if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
- return BinaryOperator::CreateAnd(V, AndRHS);
-
- // -x & 1 -> x & 1
- if (AndRHSMask == 1 && match(Op0LHS, m_Zero()))
- return BinaryOperator::CreateAnd(Op0RHS, AndRHS);
-
- // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS
- // has 1's for all bits that the subtraction with A might affect.
- if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) {
- uint32_t BitWidth = AndRHSMask.getBitWidth();
- uint32_t Zeros = AndRHSMask.countLeadingZeros();
- APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros);
-
- if (MaskedValueIsZero(Op0LHS, Mask, 0, &I)) {
- Value *NewNeg = Builder->CreateNeg(Op0RHS);
- return BinaryOperator::CreateAnd(NewNeg, AndRHS);
+ Value *X;
+ ConstantInt *C1;
+ if (match(Op0I, m_c_BinOp(m_ZExt(m_Value(X)), m_ConstantInt(C1)))) {
+ if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
+ auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
+ Value *BinOp;
+ if (isa<ZExtInst>(Op0LHS))
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
+ else
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
+ auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
+ auto *And = Builder.CreateAnd(BinOp, TruncC2);
+ return new ZExtInst(And, I.getType());
}
}
- break;
-
- case Instruction::Shl:
- case Instruction::LShr:
- // (1 << x) & 1 --> zext(x == 0)
- // (1 >> x) & 1 --> zext(x == 0)
- if (AndRHSMask == 1 && Op0LHS == AndRHS) {
- Value *NewICmp =
- Builder->CreateICmpEQ(Op0RHS, Constant::getNullValue(I.getType()));
- return new ZExtInst(NewICmp, I.getType());
- }
- break;
}
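
A concrete instance of the narrowing fold above with OP = add and an i8 payload inside an i32: because C2 fits in i8 and add's low bits depend only on low bits, the operation can be done narrow and zext'ed. Standalone check (not part of the patch; 0x1234 and 0xFF are illustrative constants):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C1 = 0x1234, C2 = 0xFF; // C2 fits in the i8 bitwidth
      for (unsigned I = 0; I < 256; ++I) {
        uint8_t X = (uint8_t)I;
        uint32_t Wide = (C1 + (uint32_t)X) & C2; // (C1 + zext(X)) & C2
        uint32_t Narrow =                        // zext((trunc(C1) + X) & C2)
            (uint32_t)(uint8_t)(((uint8_t)C1 + X) & (uint8_t)C2);
        assert(Wide == Narrow);
      }
    }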
if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
@@ -1375,34 +1368,23 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
// into : and (trunc X to T), trunc(YC) & C2
// This will fold the two constants together, which may allow
// other simplifications.
- Value *NewCast = Builder->CreateTrunc(X, I.getType(), "and.shrunk");
+ Value *NewCast = Builder.CreateTrunc(X, I.getType(), "and.shrunk");
Constant *C3 = ConstantExpr::getTrunc(YC, I.getType());
C3 = ConstantExpr::getAnd(C3, AndRHS);
return BinaryOperator::CreateAnd(NewCast, C3);
}
}
+ }
+ if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- }
if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
return DeMorgan;
{
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- // (A|B) & ~(A&B) -> A^B
- if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_And(m_Value(C), m_Value(D)))) &&
- ((A == C && B == D) || (A == D && B == C)))
- return BinaryOperator::CreateXor(A, B);
-
- // ~(A&B) & (A|B) -> A^B
- if (match(Op1, m_Or(m_Value(A), m_Value(B))) &&
- match(Op0, m_Not(m_And(m_Value(C), m_Value(D)))) &&
- ((A == C && B == D) || (A == D && B == C)))
- return BinaryOperator::CreateXor(A, B);
-
+ Value *A = nullptr, *B = nullptr, *C = nullptr;
// A&(A^B) => A & ~B
{
Value *tmpOp0 = Op0;
@@ -1424,38 +1406,35 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
// an endless loop. By checking that A is non-constant we ensure that
// we will never get to the loop.
if (A == tmpOp0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B
- return BinaryOperator::CreateAnd(A, Builder->CreateNot(B));
+ return BinaryOperator::CreateAnd(A, Builder.CreateNot(B));
}
}
- // (A&((~A)|B)) -> A&B
- if (match(Op0, m_Or(m_Not(m_Specific(Op1)), m_Value(A))) ||
- match(Op0, m_Or(m_Value(A), m_Not(m_Specific(Op1)))))
- return BinaryOperator::CreateAnd(A, Op1);
- if (match(Op1, m_Or(m_Not(m_Specific(Op0)), m_Value(A))) ||
- match(Op1, m_Or(m_Value(A), m_Not(m_Specific(Op0)))))
- return BinaryOperator::CreateAnd(A, Op0);
-
// (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
- if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse())
- return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(C));
+ if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C));
// ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C
if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
- if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse())
- return BinaryOperator::CreateAnd(Op1, Builder->CreateNot(C));
+ if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
// (A | B) & ((~A) ^ B) -> (A & B)
- if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_Xor(m_Not(m_Specific(A)), m_Specific(B))))
+ // (A | B) & (B ^ (~A)) -> (A & B)
+ // (B | A) & ((~A) ^ B) -> (A & B)
+ // (B | A) & (B ^ (~A)) -> (A & B)
+ if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
// ((~A) ^ B) & (A | B) -> (A & B)
// ((~A) ^ B) & (B | A) -> (A & B)
- if (match(Op0, m_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ // (B ^ (~A)) & (A | B) -> (A & B)
+ // (B ^ (~A)) & (B | A) -> (A & B)
+ if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
}
@@ -1464,7 +1443,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
- if (Value *Res = FoldAndOfICmps(LHS, RHS))
+ if (Value *Res = foldAndOfICmps(LHS, RHS, I))
return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -1472,26 +1451,26 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
Value *X, *Y;
if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldAndOfICmps(LHS, Cmp))
- return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldAndOfICmps(LHS, Cmp))
- return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldAndOfICmps(Cmp, RHS))
- return replaceInstUsesWith(I, Builder->CreateAnd(Res, Y));
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldAndOfICmps(Cmp, RHS))
- return replaceInstUsesWith(I, Builder->CreateAnd(Res, X));
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
}
}
// If and'ing two fcmp, try combine them into one.
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = FoldAndOfFCmps(LHS, RHS))
+ if (Value *Res = foldAndOfFCmps(LHS, RHS))
return replaceInstUsesWith(I, Res);
if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
@@ -1566,16 +1545,19 @@ static Value *getSelectCondition(Value *A, Value *B,
InstCombiner::BuilderTy &Builder) {
// If these are scalars or vectors of i1, A can be used directly.
Type *Ty = A->getType();
- if (match(A, m_Not(m_Specific(B))) && Ty->getScalarType()->isIntegerTy(1))
+ if (match(A, m_Not(m_Specific(B))) && Ty->isIntOrIntVectorTy(1))
return A;
// If A and B are sign-extended, look through the sexts to find the booleans.
Value *Cond;
+ Value *NotB;
if (match(A, m_SExt(m_Value(Cond))) &&
- Cond->getType()->getScalarType()->isIntegerTy(1) &&
- match(B, m_CombineOr(m_Not(m_SExt(m_Specific(Cond))),
- m_SExt(m_Not(m_Specific(Cond))))))
- return Cond;
+ Cond->getType()->isIntOrIntVectorTy(1) &&
+ match(B, m_OneUse(m_Not(m_Value(NotB))))) {
+ NotB = peekThroughBitcast(NotB, true);
+ if (match(NotB, m_SExt(m_Specific(Cond))))
+ return Cond;
+ }
// All scalar (and most vector) possibilities should be handled now.
// Try more matches that only apply to non-splat constant vectors.
@@ -1592,7 +1574,7 @@ static Value *getSelectCondition(Value *A, Value *B,
// operand, see if the constants are inverse bitmasks.
if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AC)))) &&
match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BC)))) &&
- Cond->getType()->getScalarType()->isIntegerTy(1) &&
+ Cond->getType()->isIntOrIntVectorTy(1) &&
areInverseVectorBitmasks(AC, BC)) {
AC = ConstantExpr::getTrunc(AC, CmpInst::makeCmpResultType(Ty));
return Builder.CreateXor(Cond, AC);
@@ -1607,12 +1589,8 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
// The potential condition of the select may be bitcasted. In that case, look
// through its bitcast and the corresponding bitcast of the 'not' condition.
Type *OrigType = A->getType();
- Value *SrcA, *SrcB;
- if (match(A, m_OneUse(m_BitCast(m_Value(SrcA)))) &&
- match(B, m_OneUse(m_BitCast(m_Value(SrcB))))) {
- A = SrcA;
- B = SrcB;
- }
+ A = peekThroughBitcast(A, true);
+ B = peekThroughBitcast(B, true);
if (Value *Cond = getSelectCondition(A, B, Builder)) {
// ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
@@ -1628,46 +1606,17 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
}
/// Fold (icmp)|(icmp) if possible.
-Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
- Instruction *CxtI) {
- ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
-
+Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
+ Instruction &CxtI) {
// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
// if K1 and K2 are one-bit masks.
- ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
- ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
-
- if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() &&
- RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
-
- BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
- BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
- if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
- LAnd->getOpcode() == Instruction::And &&
- RAnd->getOpcode() == Instruction::And) {
-
- Value *Mask = nullptr;
- Value *Masked = nullptr;
- if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, &AC, CxtI,
- &DT) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, &AC, CxtI,
- &DT)) {
- Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
- Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
- } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
- isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, &AC,
- CxtI, &DT) &&
- isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, &AC,
- CxtI, &DT)) {
- Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
- Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
- }
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI))
+ return V;
- if (Masked)
- return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
- }
- }
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+ ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+ ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
// Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
// --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
@@ -1680,52 +1629,52 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit mask.
// This implies all values in the two ranges differ by exactly one bit.
- if ((LHSCC == ICmpInst::ICMP_ULT || LHSCC == ICmpInst::ICMP_ULE) &&
- LHSCC == RHSCC && LHSCst && RHSCst && LHS->hasOneUse() &&
- RHS->hasOneUse() && LHSCst->getType() == RHSCst->getType() &&
- LHSCst->getValue() == (RHSCst->getValue())) {
+ if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
+ PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
+ LHSC->getType() == RHSC->getType() &&
+ LHSC->getValue() == RHSC->getValue()) {
Value *LAdd = LHS->getOperand(0);
Value *RAdd = RHS->getOperand(0);
Value *LAddOpnd, *RAddOpnd;
- ConstantInt *LAddCst, *RAddCst;
- if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddCst))) &&
- match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddCst))) &&
- LAddCst->getValue().ugt(LHSCst->getValue()) &&
- RAddCst->getValue().ugt(LHSCst->getValue())) {
-
- APInt DiffCst = LAddCst->getValue() ^ RAddCst->getValue();
- if (LAddOpnd == RAddOpnd && DiffCst.isPowerOf2()) {
- ConstantInt *MaxAddCst = nullptr;
- if (LAddCst->getValue().ult(RAddCst->getValue()))
- MaxAddCst = RAddCst;
+ ConstantInt *LAddC, *RAddC;
+ if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddC))) &&
+ match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddC))) &&
+ LAddC->getValue().ugt(LHSC->getValue()) &&
+ RAddC->getValue().ugt(LHSC->getValue())) {
+
+ APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+ if (LAddOpnd == RAddOpnd && DiffC.isPowerOf2()) {
+ ConstantInt *MaxAddC = nullptr;
+ if (LAddC->getValue().ult(RAddC->getValue()))
+ MaxAddC = RAddC;
else
- MaxAddCst = LAddCst;
+ MaxAddC = LAddC;
- APInt RRangeLow = -RAddCst->getValue();
- APInt RRangeHigh = RRangeLow + LHSCst->getValue();
- APInt LRangeLow = -LAddCst->getValue();
- APInt LRangeHigh = LRangeLow + LHSCst->getValue();
+ APInt RRangeLow = -RAddC->getValue();
+ APInt RRangeHigh = RRangeLow + LHSC->getValue();
+ APInt LRangeLow = -LAddC->getValue();
+ APInt LRangeHigh = LRangeLow + LHSC->getValue();
APInt LowRangeDiff = RRangeLow ^ LRangeLow;
APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
: RRangeLow - LRangeLow;
if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
- RangeDiff.ugt(LHSCst->getValue())) {
- Value *MaskCst = ConstantInt::get(LAddCst->getType(), ~DiffCst);
+ RangeDiff.ugt(LHSC->getValue())) {
+ Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
- Value *NewAnd = Builder->CreateAnd(LAddOpnd, MaskCst);
- Value *NewAdd = Builder->CreateAdd(NewAnd, MaxAddCst);
- return (Builder->CreateICmp(LHS->getPredicate(), NewAdd, LHSCst));
+ Value *NewAnd = Builder.CreateAnd(LAddOpnd, MaskC);
+ Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC);
+ return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC);
}
}
}
}
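
The add-range fold above carries several preconditions (C1 ^ C2 a power of 2, both constants u> C3, and the two target ranges differing in exactly that one bit). One constant triple that satisfies them, checked exhaustively on i8 (standalone sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t C1 = 0x10, C2 = 0x30, C3 = 8; // C1 ^ C2 == 0x20
      for (unsigned I = 0; I < 256; ++I) {
        uint8_t A = (uint8_t)I;
        bool Orig = ((uint8_t)(A + C1) < C3) || ((uint8_t)(A + C2) < C3);
        uint8_t Max = C1 > C2 ? C1 : C2;
        // (A & ~(C1 ^ C2)) + max(C1, C2) <u C3
        bool Folded = (uint8_t)((A & (uint8_t)~(C1 ^ C2)) + Max) < C3;
        assert(Orig == Folded);
      }
    }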
// (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
- if (PredicatesFoldable(LHSCC, RHSCC)) {
+ if (PredicatesFoldable(PredL, PredR)) {
if (LHS->getOperand(0) == RHS->getOperand(1) &&
LHS->getOperand(1) == RHS->getOperand(0))
LHS->swapOperands();
@@ -1743,31 +1692,31 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
return V;
- Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
if (LHS->hasOneUse() || RHS->hasOneUse()) {
// (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
// (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
Value *A = nullptr, *B = nullptr;
- if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) {
- B = Val;
- if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1))
- A = Val2;
- else if (RHSCC == ICmpInst::ICMP_UGT && Val == Val2)
+ if (PredL == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero()) {
+ B = LHS0;
+ if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS->getOperand(1))
+ A = RHS0;
+ else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
A = RHS->getOperand(1);
}
// (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
// (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
- else if (RHSCC == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
- B = Val2;
- if (LHSCC == ICmpInst::ICMP_ULT && Val2 == LHS->getOperand(1))
- A = Val;
- else if (LHSCC == ICmpInst::ICMP_UGT && Val2 == Val)
+ else if (PredR == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
+ B = RHS0;
+ if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS->getOperand(1))
+ A = LHS0;
+ else if (PredL == ICmpInst::ICMP_UGT && LHS0 == RHS0)
A = LHS->getOperand(1);
}
if (A && B)
- return Builder->CreateICmp(
+ return Builder.CreateICmp(
ICmpInst::ICMP_UGE,
- Builder->CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A);
+ Builder.CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A);
}
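
The (B == 0) | (A u< B) rewrite leans on unsigned wraparound: B - 1 is the unsigned maximum when B is 0. An exhaustive i8 check (standalone, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned I = 0; I < 256; ++I)
        for (unsigned J = 0; J < 256; ++J) {
          uint8_t A = (uint8_t)I, B = (uint8_t)J;
          // B == 0 makes B - 1 wrap, so the compare is always true then.
          assert(((B == 0) || (A < B)) == ((uint8_t)(B - 1) >= A));
        }
    }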
// E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
@@ -1778,54 +1727,58 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
return V;
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder))
+ return V;
+
// This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
- if (!LHSCst || !RHSCst) return nullptr;
+ if (!LHSC || !RHSC)
+ return nullptr;
- if (LHSCst == RHSCst && LHSCC == RHSCC) {
+ if (LHSC == RHSC && PredL == PredR) {
// (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
- if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) {
- Value *NewOr = Builder->CreateOr(Val, Val2);
- return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+ if (PredL == ICmpInst::ICMP_NE && LHSC->isZero()) {
+ Value *NewOr = Builder.CreateOr(LHS0, RHS0);
+ return Builder.CreateICmp(PredL, NewOr, LHSC);
}
}
// (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
// iff C2 + CA == C1.
- if (LHSCC == ICmpInst::ICMP_ULT && RHSCC == ICmpInst::ICMP_EQ) {
- ConstantInt *AddCst;
- if (match(Val, m_Add(m_Specific(Val2), m_ConstantInt(AddCst))))
- if (RHSCst->getValue() + AddCst->getValue() == LHSCst->getValue())
- return Builder->CreateICmpULE(Val, LHSCst);
+ if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
+ ConstantInt *AddC;
+ if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
+ if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
+ return Builder.CreateICmpULE(LHS0, LHSC);
}
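
The C2 + CA == C1 precondition makes X == C2 exactly the boundary case X + CA == C1, so the ult widens to ule. A standalone i8 check (not part of the patch; CA = 5 and C2 = 100 are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint8_t CA = 5, C2 = 100;
      const uint8_t C1 = (uint8_t)(C2 + CA); // precondition: C2 + CA == C1
      for (unsigned I = 0; I < 256; ++I) {
        uint8_t X = (uint8_t)I;
        assert((((uint8_t)(X + CA) < C1) || (X == C2)) ==
               ((uint8_t)(X + CA) <= C1));
      }
    }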
// From here on, we only handle:
// (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
- if (Val != Val2) return nullptr;
+ if (LHS0 != RHS0)
+ return nullptr;
- // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
- if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
- RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
- LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
- RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
return nullptr;
// We can't fold (ugt x, C) | (sgt x, C2).
- if (!PredicatesFoldable(LHSCC, RHSCC))
+ if (!PredicatesFoldable(PredL, PredR))
return nullptr;
// Ensure that the larger constant is on the RHS.
bool ShouldSwap;
- if (CmpInst::isSigned(LHSCC) ||
- (ICmpInst::isEquality(LHSCC) &&
- CmpInst::isSigned(RHSCC)))
- ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
else
- ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
if (ShouldSwap) {
std::swap(LHS, RHS);
- std::swap(LHSCst, RHSCst);
- std::swap(LHSCC, RHSCC);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
}
// At this point, we know we have two icmp instructions
@@ -1834,127 +1787,45 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
// icmp folding check above), that the two constants are not
// equal.
- assert(LHSCst != RHSCst && "Compares not folded above?");
+ assert(LHSC != RHSC && "Compares not folded above?");
- switch (LHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_EQ:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
case ICmpInst::ICMP_EQ:
- if (LHS->getOperand(0) == RHS->getOperand(0)) {
- // if LHSCst and RHSCst differ only by one bit:
- // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2
- assert(LHSCst->getValue().ule(LHSCst->getValue()));
-
- APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();
- if (Xor.isPowerOf2()) {
- Value *Cst = Builder->getInt(Xor);
- Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst);
- return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst);
- }
- }
-
- if (LHSCst == SubOne(RHSCst)) {
- // (X == 13 | X == 14) -> X-13 <u 2
- Constant *AddCST = ConstantExpr::getNeg(LHSCst);
- Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
- AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst);
- return Builder->CreateICmpULT(Add, AddCST);
- }
-
- break; // (X == 13 | X == 15) -> no change
- case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
- case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
+ // Potential folds for this case should already be handled.
+ break;
+ case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
+ case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
break;
- case ICmpInst::ICMP_NE: // (X == 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15
- case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15
- return RHS;
}
break;
- case ICmpInst::ICMP_NE:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X != 13 | X == 15) -> X != 13
- case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13
- case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13
- return LHS;
- case ICmpInst::ICMP_NE: // (X != 13 | X != 15) -> true
- case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true
- case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true
- return Builder->getTrue();
- }
case ICmpInst::ICMP_ULT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
break;
- case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
- // If RHSCst is [us]MAXINT, it is always false. Not handling
- // this can cause overflow.
- if (RHSCst->isMaxValue(false))
- return LHS;
- return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
+ case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+ assert(!RHSC->isMaxValue(false) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
false, false);
- case ICmpInst::ICMP_SGT: // (X u< 13 | X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u< 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15
- return RHS;
- case ICmpInst::ICMP_SLT: // (X u< 13 | X s< 15) -> no change
- break;
}
break;
case ICmpInst::ICMP_SLT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
- break;
- case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
- // If RHSCst is [us]MAXINT, it is always false. Not handling
- // this can cause overflow.
- if (RHSCst->isMaxValue(true))
- return LHS;
- return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
- true, false);
- case ICmpInst::ICMP_UGT: // (X s< 13 | X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s< 13 | X != 15) -> X != 15
- case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15
- return RHS;
- case ICmpInst::ICMP_ULT: // (X s< 13 | X u< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_UGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u> 13 | X == 15) -> X u> 13
- case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13
- return LHS;
- case ICmpInst::ICMP_SGT: // (X u> 13 | X s> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X u> 13 | X != 15) -> true
- case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true
- return Builder->getTrue();
- case ICmpInst::ICMP_SLT: // (X u> 13 | X s< 15) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_SGT:
- switch (RHSCC) {
- default: llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X s> 13 | X == 15) -> X > 13
- case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13
- return LHS;
- case ICmpInst::ICMP_UGT: // (X s> 13 | X u> 15) -> no change
- break;
- case ICmpInst::ICMP_NE: // (X s> 13 | X != 15) -> true
- case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true
- return Builder->getTrue();
- case ICmpInst::ICMP_ULT: // (X s> 13 | X u< 15) -> no change
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
break;
+ case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
+ assert(!RHSC->isMaxValue(true) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
+ false);
}
break;
}
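// Worked range-test example: (X u< 13 | X u> 15) asks for X outside [13, 15],
// and insertRangeTest emits the single compare (X - 13) u> 2; the subtraction
// wraps every X u< 13 to a value above 2 as well.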
@@ -1963,7 +1834,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
/// Optimize (fcmp)|(fcmp). NOTE: Unlike the rest of instcombine, this returns
/// a Value which should already be inserted into the function.
-Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
+Value *InstCombiner::foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
Value *Op0LHS = LHS->getOperand(0), *Op0RHS = LHS->getOperand(1);
Value *Op1LHS = RHS->getOperand(0), *Op1RHS = RHS->getOperand(1);
FCmpInst::Predicate Op0CC = LHS->getPredicate(), Op1CC = RHS->getPredicate();
@@ -1993,18 +1864,18 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
// If either of the constants are nans, then the whole thing returns
// true.
if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
- return Builder->getTrue();
+ return Builder.getTrue();
// Otherwise, no need to compare the two constants, compare the
// rest.
- return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
+ return Builder.CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
}
// Handle vector zeros. This occurs because the canonical form of
// "fcmp uno x,x" is "fcmp uno x, 0".
if (isa<ConstantAggregateZero>(LHS->getOperand(1)) &&
isa<ConstantAggregateZero>(RHS->getOperand(1)))
- return Builder->CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
+ return Builder.CreateFCmpUNO(LHS->getOperand(0), RHS->getOperand(0));
return nullptr;
}
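// Note that "fcmp uno x, C" with a non-NaN constant C is simply an isnan(x)
// test, so the 'or' of two such tests matches the single "fcmp uno x, y",
// which is true iff either operand is NaN.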
@@ -2021,8 +1892,9 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
/// (A & C1) | B
///
/// when the XOR of the two constants is "all ones" (-1).
-Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,
- Value *A, Value *B, Value *C) {
+static Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C,
+ InstCombiner::BuilderTy &Builder) {
ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
if (!CI1) return nullptr;
@@ -2034,7 +1906,7 @@ Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,
if (!Xor.isAllOnesValue()) return nullptr;
if (V1 == A || V1 == B) {
- Value *NewOp = Builder->CreateAnd((V1 == A) ? B : A, CI1);
+ Value *NewOp = Builder.CreateAnd((V1 == A) ? B : A, CI1);
return BinaryOperator::CreateOr(NewOp, V1);
}
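// Illustrative instance with C1 == 1 and C2 == -2 (1 ^ -2 == -1):
//   ((A | B) & 1) | (B & -2)  -->  (A & 1) | B
// The low bit comes from A | B and every higher bit comes from B alone.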
@@ -2043,15 +1915,16 @@ Instruction *InstCombiner::FoldOrWithConstants(BinaryOperator &I, Value *Op,
/// \brief This helper function folds:
///
-/// ((A | B) & C1) ^ (B & C2)
+/// ((A ^ B) & C1) | (B & C2)
///
/// into:
///
/// (A & C1) ^ B
///
/// when the XOR of the two constants is "all ones" (-1).
-Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op,
- Value *A, Value *B, Value *C) {
+static Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op,
+ Value *A, Value *B, Value *C,
+ InstCombiner::BuilderTy &Builder) {
ConstantInt *CI1 = dyn_cast<ConstantInt>(C);
if (!CI1)
return nullptr;
@@ -2066,7 +1939,7 @@ Instruction *InstCombiner::FoldXorWithConstants(BinaryOperator &I, Value *Op,
return nullptr;
if (V1 == A || V1 == B) {
- Value *NewOp = Builder->CreateAnd(V1 == A ? B : A, CI1);
+ Value *NewOp = Builder.CreateAnd(V1 == A ? B : A, CI1);
return BinaryOperator::CreateXor(NewOp, V1);
}
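// Illustrative instance with C1 == 1 and C2 == -2:
//   ((A ^ B) & 1) | (B & -2)  -->  (A & 1) ^ B
// The masks are complementary, so or'ing the two disjoint halves is the same
// as xor'ing (A & 1) into all of B.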
@@ -2083,11 +1956,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyOrInst(Op0, Op1, DL, &TLI, &DT, &AC))
- return replaceInstUsesWith(I, V);
-
- // (A&B)|(A&C) -> A&(B|C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
+ if (Value *V = SimplifyOrInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// See if we can simplify any instructions used by the instruction whose sole
@@ -2095,92 +1964,58 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (SimplifyDemandedInstructionBits(I))
return &I;
- if (Value *V = SimplifyBSwap(I))
- return replaceInstUsesWith(I, V);
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldOrToXor(I, Builder))
+ return Xor;
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
- ConstantInt *C1 = nullptr; Value *X = nullptr;
- // (X & C1) | C2 --> (X | C2) & (C1|C2)
- // iff (C1 & C2) == 0.
- if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) &&
- (RHS->getValue() & C1->getValue()) != 0 &&
- Op0->hasOneUse()) {
- Value *Or = Builder->CreateOr(X, RHS);
- Or->takeName(Op0);
- return BinaryOperator::CreateAnd(Or,
- Builder->getInt(RHS->getValue() | C1->getValue()));
- }
+ // (A&B)|(A&C) -> A&(B|C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
- // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
- if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
- Op0->hasOneUse()) {
- Value *Or = Builder->CreateOr(X, RHS);
- Or->takeName(Op0);
- return BinaryOperator::CreateXor(Or,
- Builder->getInt(C1->getValue() & ~RHS->getValue()));
- }
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+ if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- }
// Given an OR instruction, check to see if this is a bswap.
if (Instruction *BSwap = MatchBSwap(I))
return BSwap;
- Value *A = nullptr, *B = nullptr;
- ConstantInt *C1 = nullptr, *C2 = nullptr;
-
- // (X^C)|Y -> (X|Y)^C iff Y&C == 0
- if (Op0->hasOneUse() &&
- match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
- MaskedValueIsZero(Op1, C1->getValue(), 0, &I)) {
- Value *NOr = Builder->CreateOr(A, Op1);
- NOr->takeName(Op0);
- return BinaryOperator::CreateXor(NOr, C1);
- }
+ {
+ Value *A;
+ const APInt *C;
+ // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+ if (match(Op0, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+ MaskedValueIsZero(Op1, *C, 0, &I)) {
+ Value *NOr = Builder.CreateOr(A, Op1);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr,
+ ConstantInt::get(NOr->getType(), *C));
+ }
- // Y|(X^C) -> (X|Y)^C iff Y&C == 0
- if (Op1->hasOneUse() &&
- match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
- MaskedValueIsZero(Op0, C1->getValue(), 0, &I)) {
- Value *NOr = Builder->CreateOr(A, Op0);
- NOr->takeName(Op0);
- return BinaryOperator::CreateXor(NOr, C1);
+ // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+ if (match(Op1, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+ MaskedValueIsZero(Op0, *C, 0, &I)) {
+ Value *NOr = Builder.CreateOr(A, Op0);
+ NOr->takeName(Op0);
+ return BinaryOperator::CreateXor(NOr,
+ ConstantInt::get(NOr->getType(), *C));
+ }
}
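// Bitwise justification: Y & C == 0 means every bit set in C is clear in Y,
// so flipping those bits before or after the 'or' is equivalent; e.g. with
// C == 8 and bit 3 of Y known zero, (X ^ 8) | Y == (X | Y) ^ 8.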
- // ((~A & B) | A) -> (A | B)
- if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_Specific(A)))
- return BinaryOperator::CreateOr(A, B);
-
- // ((A & B) | ~A) -> (~A | B)
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_Specific(A))))
- return BinaryOperator::CreateOr(Builder->CreateNot(A), B);
-
- // (A & ~B) | (A ^ B) -> (A ^ B)
- // (~B & A) | (A ^ B) -> (A ^ B)
- if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_Xor(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
-
- // Commute the 'or' operands.
- // (A ^ B) | (A & ~B) -> (A ^ B)
- // (A ^ B) | (~B & A) -> (A ^ B)
- if (match(Op1, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op0, m_Xor(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
+ Value *A, *B;
// (A & C)|(B & D)
Value *C = nullptr, *D = nullptr;
if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
match(Op1, m_And(m_Value(B), m_Value(D)))) {
Value *V1 = nullptr, *V2 = nullptr;
- C1 = dyn_cast<ConstantInt>(C);
- C2 = dyn_cast<ConstantInt>(D);
+ ConstantInt *C1 = dyn_cast<ConstantInt>(C);
+ ConstantInt *C2 = dyn_cast<ConstantInt>(D);
if (C1 && C2) { // (A & C1)|(B & C2)
- if ((C1->getValue() & C2->getValue()) == 0) {
+ if ((C1->getValue() & C2->getValue()).isNullValue()) {
// ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
// iff (C1&C2) == 0 and (N&~C1) == 0
if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
@@ -2189,7 +2024,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
(V2 == B &&
MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V)
return BinaryOperator::CreateAnd(A,
- Builder->getInt(C1->getValue()|C2->getValue()));
+ Builder.getInt(C1->getValue()|C2->getValue()));
// Or commutes, try both ways.
if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
((V1 == A &&
@@ -2197,18 +2032,18 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
(V2 == A &&
MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V)
return BinaryOperator::CreateAnd(B,
- Builder->getInt(C1->getValue()|C2->getValue()));
+ Builder.getInt(C1->getValue()|C2->getValue()));
// ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
// iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
ConstantInt *C3 = nullptr, *C4 = nullptr;
if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
- (C3->getValue() & ~C1->getValue()) == 0 &&
+ (C3->getValue() & ~C1->getValue()).isNullValue() &&
match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
- (C4->getValue() & ~C2->getValue()) == 0) {
- V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
+ (C4->getValue() & ~C2->getValue()).isNullValue()) {
+ V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
return BinaryOperator::CreateAnd(V2,
- Builder->getInt(C1->getValue()|C2->getValue()));
+ Builder.getInt(C1->getValue()|C2->getValue()));
}
}
}
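// Bitfield-merge example for an i8 V: with C1 == 0x0F, C2 == 0xF0, C3 == 0x03,
// C4 == 0x30, ((V|0x03) & 0x0F) | ((V|0x30) & 0xF0) becomes (V | 0x33) & 0xFF,
// one 'or' and one 'and' instead of two of each.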
@@ -2218,82 +2053,59 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
// 'or' that it is replacing.
if (Op0->hasOneUse() || Op1->hasOneUse()) {
// (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
- if (Value *V = matchSelectFromAndOr(A, C, B, D, *Builder))
+ if (Value *V = matchSelectFromAndOr(A, C, B, D, Builder))
return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(A, C, D, B, *Builder))
+ if (Value *V = matchSelectFromAndOr(A, C, D, B, Builder))
return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(C, A, B, D, *Builder))
+ if (Value *V = matchSelectFromAndOr(C, A, B, D, Builder))
return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(C, A, D, B, *Builder))
+ if (Value *V = matchSelectFromAndOr(C, A, D, B, Builder))
return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(B, D, A, C, *Builder))
+ if (Value *V = matchSelectFromAndOr(B, D, A, C, Builder))
return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(B, D, C, A, *Builder))
+ if (Value *V = matchSelectFromAndOr(B, D, C, A, Builder))
return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(D, B, A, C, *Builder))
+ if (Value *V = matchSelectFromAndOr(D, B, A, C, Builder))
return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(D, B, C, A, *Builder))
+ if (Value *V = matchSelectFromAndOr(D, B, C, A, Builder))
return replaceInstUsesWith(I, V);
}
- // ((A&~B)|(~A&B)) -> A^B
- if ((match(C, m_Not(m_Specific(D))) &&
- match(B, m_Not(m_Specific(A)))))
- return BinaryOperator::CreateXor(A, D);
- // ((~B&A)|(~A&B)) -> A^B
- if ((match(A, m_Not(m_Specific(D))) &&
- match(B, m_Not(m_Specific(C)))))
- return BinaryOperator::CreateXor(C, D);
- // ((A&~B)|(B&~A)) -> A^B
- if ((match(C, m_Not(m_Specific(B))) &&
- match(D, m_Not(m_Specific(A)))))
- return BinaryOperator::CreateXor(A, B);
- // ((~B&A)|(B&~A)) -> A^B
- if ((match(A, m_Not(m_Specific(B))) &&
- match(D, m_Not(m_Specific(C)))))
- return BinaryOperator::CreateXor(C, B);
-
// ((A|B)&1)|(B&-2) -> (A&1) | B
- if (match(A, m_Or(m_Value(V1), m_Specific(B))) ||
- match(A, m_Or(m_Specific(B), m_Value(V1)))) {
- Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C);
- if (Ret) return Ret;
+ if (match(A, m_c_Or(m_Value(V1), m_Specific(B)))) {
+ if (Instruction *Ret = FoldOrWithConstants(I, Op1, V1, B, C, Builder))
+ return Ret;
}
// (B&-2)|((A|B)&1) -> (A&1) | B
- if (match(B, m_Or(m_Specific(A), m_Value(V1))) ||
- match(B, m_Or(m_Value(V1), m_Specific(A)))) {
- Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D);
- if (Ret) return Ret;
+ if (match(B, m_c_Or(m_Specific(A), m_Value(V1)))) {
+ if (Instruction *Ret = FoldOrWithConstants(I, Op0, A, V1, D, Builder))
+ return Ret;
}
// ((A^B)&1)|(B&-2) -> (A&1) ^ B
- if (match(A, m_Xor(m_Value(V1), m_Specific(B))) ||
- match(A, m_Xor(m_Specific(B), m_Value(V1)))) {
- Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C);
- if (Ret) return Ret;
+ if (match(A, m_c_Xor(m_Value(V1), m_Specific(B)))) {
+ if (Instruction *Ret = FoldXorWithConstants(I, Op1, V1, B, C, Builder))
+ return Ret;
}
// (B&-2)|((A^B)&1) -> (A&1) ^ B
- if (match(B, m_Xor(m_Specific(A), m_Value(V1))) ||
- match(B, m_Xor(m_Value(V1), m_Specific(A)))) {
- Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D);
- if (Ret) return Ret;
+ if (match(B, m_c_Xor(m_Specific(A), m_Value(V1)))) {
+ if (Instruction *Ret = FoldXorWithConstants(I, Op0, A, V1, D, Builder))
+ return Ret;
}
}
// (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
- if (Op1->hasOneUse() || cast<BinaryOperator>(Op1)->hasOneUse())
- return BinaryOperator::CreateOr(Op0, C);
+ return BinaryOperator::CreateOr(Op0, C);
// ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
- if (Op0->hasOneUse() || cast<BinaryOperator>(Op0)->hasOneUse())
- return BinaryOperator::CreateOr(Op1, C);
+ return BinaryOperator::CreateOr(Op1, C);
// ((B | C) & A) | B -> B | (A & C)
if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
- return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C));
+ return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
return DeMorgan;
@@ -2317,11 +2129,11 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return BinaryOperator::CreateOr(A, B);
if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
- Value *Not = Builder->CreateNot(B, B->getName()+".not");
+ Value *Not = Builder.CreateNot(B, B->getName() + ".not");
return BinaryOperator::CreateOr(Not, Op0);
}
if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) {
- Value *Not = Builder->CreateNot(A, A->getName()+".not");
+ Value *Not = Builder.CreateNot(A, A->getName() + ".not");
return BinaryOperator::CreateOr(Not, Op0);
}
}
@@ -2335,21 +2147,10 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
B->getOpcode() == Instruction::Xor)) {
Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
B->getOperand(0);
- Value *Not = Builder->CreateNot(NotOp, NotOp->getName()+".not");
+ Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not");
return BinaryOperator::CreateOr(Not, Op0);
}
- // (A & B) | (~A ^ B) -> (~A ^ B)
- // (A & B) | (B ^ ~A) -> (~A ^ B)
- // (B & A) | (~A ^ B) -> (~A ^ B)
- // (B & A) | (B ^ ~A) -> (~A ^ B)
- // The match order is important: match the xor first because the 'not'
- // operation defines 'A'. We do not need to match the xor as Op0 because the
- // xor was canonicalized to Op1 above.
- if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op0, m_c_And(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateXor(Builder->CreateNot(A), B);
-
if (SwappedForXor)
std::swap(Op0, Op1);
@@ -2357,7 +2158,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
- if (Value *Res = FoldOrOfICmps(LHS, RHS, &I))
+ if (Value *Res = foldOrOfICmps(LHS, RHS, I))
return replaceInstUsesWith(I, Res);
// TODO: Make this recursive; it's a little tricky because an arbitrary
@@ -2365,26 +2166,26 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
Value *X, *Y;
if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
- return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldOrOfICmps(LHS, Cmp, &I))
- return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
}
if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
- return replaceInstUsesWith(I, Builder->CreateOr(Res, Y));
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = FoldOrOfICmps(Cmp, RHS, &I))
- return replaceInstUsesWith(I, Builder->CreateOr(Res, X));
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
}
}
// (fcmp uno x, c) | (fcmp uno y, c) -> (fcmp uno x, y)
if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = FoldOrOfFCmps(LHS, RHS))
+ if (Value *Res = foldOrOfFCmps(LHS, RHS))
return replaceInstUsesWith(I, Res);
if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
@@ -2392,10 +2193,10 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
// or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->getScalarType()->isIntegerTy(1))
+ A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->getScalarType()->isIntegerTy(1))
+ A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
// Note: If we've gotten to the point of visiting the outer OR, then the
@@ -2403,9 +2204,10 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
// be simplified by a later pass either, so we try swapping the inner/outer
// ORs in the hopes that we'll be able to simplify it this way.
// (X|C) | V --> (X|V) | C
+ ConstantInt *C1;
if (Op0->hasOneUse() && !isa<ConstantInt>(Op1) &&
match(Op0, m_Or(m_Value(A), m_ConstantInt(C1)))) {
- Value *Inner = Builder->CreateOr(A, Op1);
+ Value *Inner = Builder.CreateOr(A, Op1);
Inner->takeName(Op0);
return BinaryOperator::CreateOr(Inner, C1);
}
@@ -2418,8 +2220,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Op0->hasOneUse() && Op1->hasOneUse() &&
match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
- Value *orTrue = Builder->CreateOr(A, C);
- Value *orFalse = Builder->CreateOr(B, D);
+ Value *orTrue = Builder.CreateOr(A, C);
+ Value *orFalse = Builder.CreateOr(B, D);
return SelectInst::Create(X, orTrue, orFalse);
}
}
@@ -2427,6 +2229,116 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
return Changed ? &I : nullptr;
}
+/// A ^ B can be specified using other logic ops in a variety of patterns. We
+/// can fold these early and efficiently by morphing an existing instruction.
+static Instruction *foldXorToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Xor);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // There are 4 commuted variants for each of the basic patterns.
+
+ // (A & B) ^ (A | B) -> A ^ B
+ // (A & B) ^ (B | A) -> A ^ B
+ // (A | B) ^ (A & B) -> A ^ B
+ // (A | B) ^ (B & A) -> A ^ B
+ if ((match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B)))) ||
+ (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Specific(B))))) {
+ I.setOperand(0, A);
+ I.setOperand(1, B);
+ return &I;
+ }
+
+ // (A | ~B) ^ (~A | B) -> A ^ B
+ // (~B | A) ^ (~A | B) -> A ^ B
+ // (~A | B) ^ (A | ~B) -> A ^ B
+ // (B | ~A) ^ (A | ~B) -> A ^ B
+ if ((match(Op0, m_Or(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_Or(m_Not(m_Specific(A)), m_Specific(B)))) ||
+ (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Not(m_Specific(B)))))) {
+ I.setOperand(0, A);
+ I.setOperand(1, B);
+ return &I;
+ }
+
+ // (A & ~B) ^ (~A & B) -> A ^ B
+ // (~B & A) ^ (~A & B) -> A ^ B
+ // (~A & B) ^ (A & ~B) -> A ^ B
+ // (B & ~A) ^ (A & ~B) -> A ^ B
+ if ((match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B)))) ||
+ (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Not(m_Specific(B)))))) {
+ I.setOperand(0, A);
+ I.setOperand(1, B);
+ return &I;
+ }
+
+ // For the remaining cases we need to get rid of one of the operands.
+ if (!Op0->hasOneUse() && !Op1->hasOneUse())
+ return nullptr;
+
+ // (A | B) ^ ~(A & B) -> ~(A ^ B)
+ // (A | B) ^ ~(B & A) -> ~(A ^ B)
+ // (A & B) ^ ~(A | B) -> ~(A ^ B)
+ // (A & B) ^ ~(B | A) -> ~(A ^ B)
+ // Complexity sorting ensures the not will be on the right side.
+ if ((match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) ||
+ (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
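+
+// Per-bit sanity check for the first pattern: over inputs (a,b) = 00,01,10,11,
+// (a & b) ^ (a | b) yields 0,1,1,0, which is exactly the xor truth table.
+// Morphing the existing instruction via setOperand() avoids creating a new one.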
+
+Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+ if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
+ bool isSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(isSigned, Code, Op0, Op1, Builder);
+ }
+ }
+
+ // Instead of trying to imitate the folds for and/or, decompose this 'xor'
+ // into those logic ops. That is, try to turn this into an and-of-icmps
+ // because we have many folds for that pattern.
+ //
+ // This is based on a truth table definition of xor:
+ // X ^ Y --> (X | Y) & !(X & Y)
+ if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
+ // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y).
+ // TODO: If OrICmp is false, the whole thing is false (InstSimplify?).
+ if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
+ // TODO: Independently handle cases where the 'and' side is a constant.
+ if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) {
+ // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS
+ RHS->setPredicate(RHS->getInversePredicate());
+ return Builder.CreateAnd(LHS, RHS);
+ }
+ if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) {
+ // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS
+ LHS->setPredicate(LHS->getInversePredicate());
+ return Builder.CreateAnd(LHS, RHS);
+ }
+ }
+ }
+
+ return nullptr;
+}
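+
+// Example of the decomposition: for LHS == (X u< 10) and RHS == (X u< 20),
+// SimplifyBinOp gives OrICmp == RHS and AndICmp == LHS, so the second branch
+// inverts LHS and returns (X u>= 10) & (X u< 20), precisely the xor of the
+// two range tests.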
+
// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
// here. We should standardize that construct where it is needed or choose some
// other way to ensure that commutated variants of patterns are not missed.
@@ -2437,9 +2349,12 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyXorInst(Op0, Op1, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyXorInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
+ if (Instruction *NewXor = foldXorToXor(I, Builder))
+ return NewXor;
+
// (A&B)^(A&C) -> A&(B^C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -2449,68 +2364,85 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (SimplifyDemandedInstructionBits(I))
return &I;
- if (Value *V = SimplifyBSwap(I))
+ if (Value *V = SimplifyBSwap(I, Builder))
return replaceInstUsesWith(I, V);
- // Is this a ~ operation?
- if (Value *NotOp = dyn_castNotVal(&I)) {
- if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) {
- if (Op0I->getOpcode() == Instruction::And ||
- Op0I->getOpcode() == Instruction::Or) {
- // ~(~X & Y) --> (X | ~Y) - De Morgan's Law
- // ~(~X | Y) === (X & ~Y) - De Morgan's Law
- if (dyn_castNotVal(Op0I->getOperand(1)))
- Op0I->swapOperands();
- if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) {
- Value *NotY =
- Builder->CreateNot(Op0I->getOperand(1),
- Op0I->getOperand(1)->getName()+".not");
- if (Op0I->getOpcode() == Instruction::And)
- return BinaryOperator::CreateOr(Op0NotVal, NotY);
- return BinaryOperator::CreateAnd(Op0NotVal, NotY);
- }
+ // Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
+ Value *X, *Y;
+
+ // The matched and/or must have one use, so that eliminating it keeps these
+ // transforms from increasing the instruction count.
+ // ~(~X & Y) --> (X | ~Y)
+ // ~(Y & ~X) --> (X | ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateOr(X, NotY);
+ }
+ // ~(~X | Y) --> (X & ~Y)
+ // ~(Y | ~X) --> (X & ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateAnd(X, NotY);
+ }
+
+ // Is this a 'not' (~) fed by a binary operator?
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
+ // Apply DeMorgan's Law when inverts are free:
+ // ~(X & Y) --> (~X | ~Y)
+ // ~(X | Y) --> (~X & ~Y)
+ if (IsFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ IsFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
+ return BinaryOperator::CreateOr(NotX, NotY);
+ return BinaryOperator::CreateAnd(NotX, NotY);
+ }
+ }
- // ~(X & Y) --> (~X | ~Y) - De Morgan's Law
- // ~(X | Y) === (~X & ~Y) - De Morgan's Law
- if (IsFreeToInvert(Op0I->getOperand(0),
- Op0I->getOperand(0)->hasOneUse()) &&
- IsFreeToInvert(Op0I->getOperand(1),
- Op0I->getOperand(1)->hasOneUse())) {
- Value *NotX =
- Builder->CreateNot(Op0I->getOperand(0), "notlhs");
- Value *NotY =
- Builder->CreateNot(Op0I->getOperand(1), "notrhs");
- if (Op0I->getOpcode() == Instruction::And)
- return BinaryOperator::CreateOr(NotX, NotY);
- return BinaryOperator::CreateAnd(NotX, NotY);
- }
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
- } else if (Op0I->getOpcode() == Instruction::AShr) {
- // ~(~X >>s Y) --> (X >>s Y)
- if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0)))
- return BinaryOperator::CreateAShr(Op0NotVal, Op0I->getOperand(1));
- }
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+ const APInt *C;
+ if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) {
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateLShr(NotC, Y);
+ }
+
+ if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) {
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ Constant *NotC = ConstantInt::get(I.getType(), ~(*C));
+ return BinaryOperator::CreateAShr(NotC, Y);
}
}
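// Sign-bit example in i8: C == 0xF0 is negative, C >>s 2 == 0xFC, and
// ~0xFC == 0x03; likewise ~C == 0x0F and 0x0F >>u 2 == 0x03. The 'not' folds
// into the constant while the shift flips between ashr and lshr.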
- if (Constant *RHS = dyn_cast<Constant>(Op1)) {
- if (RHS->isAllOnesValue() && Op0->hasOneUse())
- // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
- if (CmpInst *CI = dyn_cast<CmpInst>(Op0))
- return CmpInst::Create(CI->getOpcode(),
- CI->getInversePredicate(),
- CI->getOperand(0), CI->getOperand(1));
+ // not (cmp A, B) = !cmp A, B
+ CmpInst::Predicate Pred;
+ if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
+ cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
+ return replaceInstUsesWith(I, Op0);
}
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+ if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1)) {
// fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
if (CI->hasOneUse() && Op0C->hasOneUse()) {
Instruction::CastOps Opcode = Op0C->getOpcode();
if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
- (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(),
- Op0C->getDestTy()))) {
+ (RHSC == ConstantExpr::getCast(Opcode, Builder.getTrue(),
+ Op0C->getDestTy()))) {
CI->setPredicate(CI->getInversePredicate());
return CastInst::Create(Opcode, CI, Op0C->getType());
}
@@ -2520,26 +2452,23 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
// ~(c-X) == X-c-1 == X+(-c-1)
- if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
+ if (Op0I->getOpcode() == Instruction::Sub && RHSC->isMinusOne())
if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
- Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
- ConstantInt::get(I.getType(), 1));
- return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
+ return BinaryOperator::CreateAdd(Op0I->getOperand(1),
+ SubOne(NegOp0I0C));
}
if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
if (Op0I->getOpcode() == Instruction::Add) {
// ~(X-c) --> (-c-1)-X
- if (RHS->isAllOnesValue()) {
+ if (RHSC->isMinusOne()) {
Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
- return BinaryOperator::CreateSub(
- ConstantExpr::getSub(NegOp0CI,
- ConstantInt::get(I.getType(), 1)),
- Op0I->getOperand(0));
- } else if (RHS->getValue().isSignBit()) {
- // (X + C) ^ signbit -> (X + C + signbit)
- Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue());
+ return BinaryOperator::CreateSub(SubOne(NegOp0CI),
+ Op0I->getOperand(0));
+ } else if (RHSC->getValue().isSignMask()) {
+ // (X + C) ^ signmask -> (X + C + signmask)
+ Constant *C = Builder.getInt(RHSC->getValue() + Op0CI->getValue());
return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
}
@@ -2547,10 +2476,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
// (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue(),
0, &I)) {
- Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+ Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHSC);
// Anything in both C1 and C2 is known to be zero, remove it from
// NewRHS.
- Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS);
+ Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHSC);
NewRHS = ConstantExpr::getAnd(NewRHS,
ConstantExpr::getNot(CommonBits));
Worklist.Add(Op0I);
@@ -2568,11 +2497,11 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
E1->getOpcode() == Instruction::Xor &&
(C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) {
// fold (C1 >> C2) ^ C3
- ConstantInt *C2 = Op0CI, *C3 = RHS;
+ ConstantInt *C2 = Op0CI, *C3 = RHSC;
APInt FoldConst = C1->getValue().lshr(C2->getValue());
FoldConst ^= C3->getValue();
// Prepare the two operands.
- Value *Opnd0 = Builder->CreateLShr(E1->getOperand(0), C2);
+ Value *Opnd0 = Builder.CreateLShr(E1->getOperand(0), C2);
Opnd0->takeName(Op0I);
cast<Instruction>(Opnd0)->setDebugLoc(I.getDebugLoc());
Value *FoldVal = ConstantInt::get(Opnd0->getType(), FoldConst);
@@ -2582,27 +2511,26 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
}
+ }
+ if (isa<Constant>(Op1))
if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
return FoldedLogic;
- }
- BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
- if (Op1I) {
+ {
Value *A, *B;
- if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
- if (A == Op0) { // B^(B|A) == (A|B)^B
- Op1I->swapOperands();
- I.swapOperands();
- std::swap(Op0, Op1);
- } else if (B == Op0) { // B^(A|B) == (A|B)^B
+ if (match(Op1, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
+ if (A == Op0) { // A^(A|B) == A^(B|A)
+ cast<BinaryOperator>(Op1)->swapOperands();
+ std::swap(A, B);
+ }
+ if (B == Op0) { // A^(B|A) == (B|A)^A
I.swapOperands(); // Simplified below.
std::swap(Op0, Op1);
}
- } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) &&
- Op1I->hasOneUse()){
+ } else if (match(Op1, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
if (A == Op0) { // A^(A&B) -> A^(B&A)
- Op1I->swapOperands();
+ cast<BinaryOperator>(Op1)->swapOperands();
std::swap(A, B);
}
if (B == Op0) { // A^(B&A) -> (B&A)^A
@@ -2612,89 +2540,53 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
}
}
- BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
- if (Op0I) {
+ {
Value *A, *B;
- if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
- Op0I->hasOneUse()) {
+ if (match(Op0, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
if (A == Op1) // (B|A)^B == (A|B)^B
std::swap(A, B);
if (B == Op1) // (A|B)^B == A & ~B
- return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1));
- } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
- Op0I->hasOneUse()){
+ return BinaryOperator::CreateAnd(A, Builder.CreateNot(Op1));
+ } else if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
if (A == Op1) // (A&B)^A -> (B&A)^A
std::swap(A, B);
+ const APInt *C;
if (B == Op1 && // (B&A)^A == ~B & A
- !isa<ConstantInt>(Op1)) { // Canonical form is (B&C)^C
- return BinaryOperator::CreateAnd(Builder->CreateNot(A), Op1);
+ !match(Op1, m_APInt(C))) { // Canonical form is (B&C)^C
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), Op1);
}
}
}
- if (Op0I && Op1I) {
+ {
Value *A, *B, *C, *D;
- // (A & B)^(A | B) -> A ^ B
- if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
- match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
- if ((A == C && B == D) || (A == D && B == C))
- return BinaryOperator::CreateXor(A, B);
- }
- // (A | B)^(A & B) -> A ^ B
- if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1I, m_And(m_Value(C), m_Value(D)))) {
- if ((A == C && B == D) || (A == D && B == C))
- return BinaryOperator::CreateXor(A, B);
- }
- // (A | ~B) ^ (~A | B) -> A ^ B
- // (~B | A) ^ (~A | B) -> A ^ B
- if (match(Op0I, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1I, m_Or(m_Not(m_Specific(A)), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
-
- // (~A | B) ^ (A | ~B) -> A ^ B
- if (match(Op0I, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1I, m_Or(m_Specific(A), m_Not(m_Specific(B))))) {
- return BinaryOperator::CreateXor(A, B);
- }
- // (A & ~B) ^ (~A & B) -> A ^ B
- // (~B & A) ^ (~A & B) -> A ^ B
- if (match(Op0I, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1I, m_And(m_Not(m_Specific(A)), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
-
- // (~A & B) ^ (A & ~B) -> A ^ B
- if (match(Op0I, m_And(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1I, m_And(m_Specific(A), m_Not(m_Specific(B))))) {
- return BinaryOperator::CreateXor(A, B);
- }
// (A ^ C)^(A | B) -> ((~A) & B) ^ C
- if (match(Op0I, m_Xor(m_Value(D), m_Value(C))) &&
- match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+ if (match(Op0, m_Xor(m_Value(D), m_Value(C))) &&
+ match(Op1, m_Or(m_Value(A), m_Value(B)))) {
if (D == A)
return BinaryOperator::CreateXor(
- Builder->CreateAnd(Builder->CreateNot(A), B), C);
+ Builder.CreateAnd(Builder.CreateNot(A), B), C);
if (D == B)
return BinaryOperator::CreateXor(
- Builder->CreateAnd(Builder->CreateNot(B), A), C);
+ Builder.CreateAnd(Builder.CreateNot(B), A), C);
}
// (A | B)^(A ^ C) -> ((~A) & B) ^ C
- if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1I, m_Xor(m_Value(D), m_Value(C)))) {
+ if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Xor(m_Value(D), m_Value(C)))) {
if (D == A)
return BinaryOperator::CreateXor(
- Builder->CreateAnd(Builder->CreateNot(A), B), C);
+ Builder.CreateAnd(Builder.CreateNot(A), B), C);
if (D == B)
return BinaryOperator::CreateXor(
- Builder->CreateAnd(Builder->CreateNot(B), A), C);
+ Builder.CreateAnd(Builder.CreateNot(B), A), C);
}
// (A & B) ^ (A ^ B) -> (A | B)
- if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
- match(Op1I, m_Xor(m_Specific(A), m_Specific(B))))
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
// (A ^ B) ^ (A & B) -> (A | B)
- if (match(Op0I, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op1I, m_And(m_Specific(A), m_Specific(B))))
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateOr(A, B);
}
@@ -2703,25 +2595,12 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
Value *A, *B;
if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
match(Op1, m_Not(m_Specific(A))))
- return BinaryOperator::CreateNot(Builder->CreateAnd(A, B));
-
- // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
- if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
- if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
- if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
- if (LHS->getOperand(0) == RHS->getOperand(1) &&
- LHS->getOperand(1) == RHS->getOperand(0))
- LHS->swapOperands();
- if (LHS->getOperand(0) == RHS->getOperand(0) &&
- LHS->getOperand(1) == RHS->getOperand(1)) {
- Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
- unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
- bool isSigned = LHS->isSigned() || RHS->isSigned();
- return replaceInstUsesWith(I,
- getNewICmpValue(isSigned, Code, Op0, Op1,
- Builder));
- }
- }
+ return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
+
+ if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+ if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+ if (Value *V = foldXorOfICmps(LHS, RHS))
+ return replaceInstUsesWith(I, V);
if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
return CastedXor;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 2ef82ba..391c430 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -16,9 +16,9 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
@@ -44,6 +44,7 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
@@ -60,6 +61,12 @@ using namespace PatternMatch;
STATISTIC(NumSimplified, "Number of library calls simplified");
+static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
+ "unfold-element-atomic-memcpy-max-elements",
+ cl::init(16),
+ cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
+ "allowed to unfold"));
+
/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
static Type *getPromotedType(Type *Ty) {
@@ -70,27 +77,6 @@ static Type *getPromotedType(Type *Ty) {
return Ty;
}
-/// Given an aggregate type which ultimately holds a single scalar element,
-/// like {{{type}}} or [1 x type], return type.
-static Type *reduceToSingleValueType(Type *T) {
- while (!T->isSingleValueType()) {
- if (StructType *STy = dyn_cast<StructType>(T)) {
- if (STy->getNumElements() == 1)
- T = STy->getElementType(0);
- else
- break;
- } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) {
- if (ATy->getNumElements() == 1)
- T = ATy->getElementType();
- else
- break;
- } else
- break;
- }
-
- return T;
-}
-
/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
@@ -108,6 +94,83 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
return ConstantVector::get(BoolVec);
}
+Instruction *InstCombiner::SimplifyElementUnorderedAtomicMemCpy(
+ ElementUnorderedAtomicMemCpyInst *AMI) {
+ // Try to unfold this intrinsic into a sequence of explicit atomic loads and
+ // stores.
+ // First, check that the number of elements is a compile-time constant.
+ auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
+ if (!LengthCI)
+ return nullptr;
+
+ // Check that there are not too many elements.
+ uint64_t LengthInBytes = LengthCI->getZExtValue();
+ uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
+ uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
+ if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
+ return nullptr;
+
+ // Only expand if there are elements to copy.
+ if (NumElements > 0) {
+ // Don't unfold into integer types that are illegal for the target.
+ uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
+ if (!getDataLayout().isLegalInteger(ElementSizeInBits))
+ return nullptr;
+
+ // Cast source and destination to the correct type. Intrinsic input
+ // arguments are usually represented as i8*. Operands are often explicitly
+ // cast to i8*, and we could simply strip those casts instead of inserting
+ // new ones, but it's easier to rely on other InstCombine rules, which
+ // cover the trivial cases anyway.
+ Value *Src = AMI->getRawSource();
+ Value *Dst = AMI->getRawDest();
+ Type *ElementPointerType =
+ Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
+ Src->getType()->getPointerAddressSpace());
+
+ Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
+ "memcpy_unfold.src_casted");
+ Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
+ "memcpy_unfold.dst_casted");
+
+ for (uint64_t i = 0; i < NumElements; ++i) {
+ // Get current element addresses
+ ConstantInt *ElementIdxCI =
+ ConstantInt::get(AMI->getContext(), APInt(64, i));
+ Value *SrcElementAddr =
+ Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+ Value *DstElementAddr =
+ Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+ // Load from the source. Transfer alignment information and mark load as
+ // unordered atomic.
+ LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+ Load->setOrdering(AtomicOrdering::Unordered);
+ // We know the alignment of the first element. The verifier also guarantees
+ // that the element size is less than or equal to the first element's
+ // alignment and that both values are powers of two, so all subsequent
+ // accesses are at least element-size aligned.
+ // TODO: We can infer better alignment but there is no evidence that this
+ // will matter.
+ Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
+ : ElementSizeInBytes);
+ Load->setDebugLoc(AMI->getDebugLoc());
+
+ // Store loaded value via unordered atomic store.
+ StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
+ Store->setOrdering(AtomicOrdering::Unordered);
+ Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
+ : ElementSizeInBytes);
+ Store->setDebugLoc(AMI->getDebugLoc());
+ }
+ }
+
+ // Set the number of elements of the copy to 0; the intrinsic will be
+ // deleted on the next iteration.
+ AMI->setLength(Constant::getNullValue(LengthCI->getType()));
+ return AMI;
+}
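+
+// Sketch of the result, assuming a two-element i32 copy: the intrinsic is
+// replaced by two unordered "load atomic i32" / "store atomic i32" pairs at
+// GEP'd element addresses, and the zero-length intrinsic left behind is dead
+// and removed on the next InstCombine iteration.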
+
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@@ -144,41 +207,19 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
- // Memcpy forces the use of i8* for the source and destination. That means
- // that if you're using memcpy to move one double around, you'll get a cast
- // from double* to i8*. We'd much rather use a double load+store rather than
- // an i64 load+store, here because this improves the odds that the source or
- // dest address will be promotable. See if we can find a better type than the
- // integer datatype.
- Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
+ // If the memcpy has metadata describing the members, see if we can get the
+ // TBAA tag describing our copy.
MDNode *CopyMD = nullptr;
- if (StrippedDest != MI->getArgOperand(0)) {
- Type *SrcETy = cast<PointerType>(StrippedDest->getType())
- ->getElementType();
- if (SrcETy->isSized() && DL.getTypeStoreSize(SrcETy) == Size) {
- // The SrcETy might be something like {{{double}}} or [1 x double]. Rip
- // down through these levels if so.
- SrcETy = reduceToSingleValueType(SrcETy);
-
- if (SrcETy->isSingleValueType()) {
- NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp);
- NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp);
-
- // If the memcpy has metadata describing the members, see if we can
- // get the TBAA tag describing our copy.
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
- if (M->getNumOperands() == 3 && M->getOperand(0) &&
- mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
- mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() &&
- M->getOperand(1) &&
- mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
- mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
- Size &&
- M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
- CopyMD = cast<MDNode>(M->getOperand(2));
- }
- }
- }
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (M->getNumOperands() == 3 && M->getOperand(0) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
+ M->getOperand(1) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+ Size &&
+ M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
+ CopyMD = cast<MDNode>(M->getOperand(2));
}
// If the memcpy/memmove provides better alignment info than we can
@@ -186,9 +227,9 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
SrcAlign = std::max(SrcAlign, CopyAlign);
DstAlign = std::max(DstAlign, CopyAlign);
- Value *Src = Builder->CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
- Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
- LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile());
+ Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
+ Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
+ LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
L->setAlignment(SrcAlign);
if (CopyMD)
L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
@@ -197,7 +238,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
if (LoopMemParallelMD)
L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile());
+ StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
S->setAlignment(DstAlign);
if (CopyMD)
S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
@@ -233,15 +274,15 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
Value *Dest = MI->getDest();
unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
- Dest = Builder->CreateBitCast(Dest, NewDstPtrTy);
+ Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
// Alignment 0 is identity for alignment 1 for memset, but not store.
if (Alignment == 0) Alignment = 1;
// Extract the fill value and store.
uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
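// The multiply replicates the fill byte into every byte of a 64-bit lane,
// e.g. 0xAB * 0x0101010101010101 == 0xABABABABABABABAB; ConstantInt::get
// then truncates that pattern to the actual store width ITy.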
- StoreInst *S = Builder->CreateStore(ConstantInt::get(ITy, Fill), Dest,
- MI->isVolatile());
+ StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
+ MI->isVolatile());
S->setAlignment(Alignment);
// Set the size of the copy to 0; it will be deleted on the next iteration.
@@ -343,7 +384,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
for (unsigned i = 0; i != NumSubElts; ++i) {
unsigned SubEltIdx = (NumSubElts - 1) - i;
auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
- Count = Count.shl(BitWidth);
+ Count <<= BitWidth;
Count |= SubElt->getValue().zextOrTrunc(64);
}
}
@@ -357,7 +398,7 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
unsigned BitWidth = SVT->getPrimitiveSizeInBits();
// If shift-by-zero then just return the original value.
- if (Count == 0)
+ if (Count.isNullValue())
return Vec;
// Handle cases when Shift >= BitWidth.
@@ -510,8 +551,131 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
return Builder.CreateAShr(Vec, ShiftVec);
}
-static Value *simplifyX86movmsk(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
+static Value *simplifyX86muldq(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+ assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
+ Arg1->getType()->getScalarSizeInBits() == 32 &&
+ ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
+
+ // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
+ if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
+ return ConstantAggregateZero::get(ResTy);
+
+ // Constant folding.
+ // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
+ // vXi64 sext(shuffle<0,2,..>(Arg1))))
+ // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
+ // vXi64 zext(shuffle<0,2,..>(Arg1))))
+ if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+ return nullptr;
+
+ unsigned NumElts = ResTy->getVectorNumElements();
+ assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
+ Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
+ "Unexpected muldq/muludq types");
+
+ unsigned IntrinsicID = II.getIntrinsicID();
+ bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
+ Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
+ Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
+
+ SmallVector<unsigned, 16> ShuffleMask;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShuffleMask.push_back(i * 2);
+
+ auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
+ auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
+
+ if (IsSigned) {
+ LHS = Builder.CreateSExt(LHS, ResTy);
+ RHS = Builder.CreateSExt(RHS, ResTy);
+ } else {
+ LHS = Builder.CreateZExt(LHS, ResTy);
+ RHS = Builder.CreateZExt(RHS, ResTy);
+ }
+
+ return Builder.CreateMul(LHS, RHS);
+}
+
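// A scalar model of the PMULDQ/PMULUDQ semantics implemented above: each
// 64-bit result lane multiplies the even (low) 32-bit elements of the inputs
// after sign or zero extension. Plain C++ sketch, not the LLVM API.
#include <cstdint>

uint64_t pmuludqLane(uint32_t A, uint32_t B) {
  return uint64_t(A) * uint64_t(B); // zero-extend, then 64-bit multiply
}

int64_t pmuldqLane(int32_t A, int32_t B) {
  return int64_t(A) * int64_t(B);   // sign-extend, then 64-bit multiply
}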
+static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
+ Value *Arg0 = II.getArgOperand(0);
+ Value *Arg1 = II.getArgOperand(1);
+ Type *ResTy = II.getType();
+
+ // Fast path: if both arguments are undef, the result is undef.
+ if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
+ return UndefValue::get(ResTy);
+
+ Type *ArgTy = Arg0->getType();
+ unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
+ unsigned NumDstElts = ResTy->getVectorNumElements();
+ unsigned NumSrcElts = ArgTy->getVectorNumElements();
+ assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");
+
+ unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+ unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
+ assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
+ "Unexpected packing types");
+
+ // Constant folding.
+ auto *Cst0 = dyn_cast<Constant>(Arg0);
+ auto *Cst1 = dyn_cast<Constant>(Arg1);
+ if (!Cst0 || !Cst1)
+ return nullptr;
+
+ SmallVector<Constant *, 32> Vals;
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
+ unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
+ auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
+ auto *COp = Cst->getAggregateElement(SrcIdx);
+ if (COp && isa<UndefValue>(COp)) {
+ Vals.push_back(UndefValue::get(ResTy->getScalarType()));
+ continue;
+ }
+
+ auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
+ if (!CInt)
+ return nullptr;
+
+ APInt Val = CInt->getValue();
+ assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
+ "Unexpected constant bitwidth");
+
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ if (Val.isSignedIntN(DstScalarSizeInBits))
+ Val = Val.trunc(DstScalarSizeInBits);
+ else if (Val.isNegative())
+ Val = APInt::getSignedMinValue(DstScalarSizeInBits);
+ else
+ Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ if (Val.isIntN(DstScalarSizeInBits))
+ Val = Val.trunc(DstScalarSizeInBits);
+ else if (Val.isNegative())
+ Val = APInt::getNullValue(DstScalarSizeInBits);
+ else
+ Val = APInt::getAllOnesValue(DstScalarSizeInBits);
+ }
+
+ Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
+ }
+ }
+
+ return ConstantVector::get(Vals);
+}
+
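// The per-element saturation performed above, modeled for a 32-bit to 16-bit
// pack (PACKSSDW/PACKUSDW). A minimal standalone C++ sketch; function names
// are made up for illustration.
#include <cstdint>

int16_t packssElt(int32_t V) {
  // Signed saturation: clamp to [INT16_MIN, INT16_MAX], then truncate.
  if (V < INT16_MIN) return INT16_MIN;
  if (V > INT16_MAX) return INT16_MAX;
  return int16_t(V);
}

uint16_t packusElt(int32_t V) {
  // Unsigned saturation of a signed input: clamp to [0, UINT16_MAX].
  if (V < 0) return 0;
  if (V > UINT16_MAX) return UINT16_MAX;
  return uint16_t(V);
}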
+static Value *simplifyX86movmsk(const IntrinsicInst &II) {
Value *Arg = II.getArgOperand(0);
Type *ResTy = II.getType();
Type *ArgTy = Arg->getType();
@@ -679,7 +843,8 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
// Length bits.
if (CI0) {
APInt Elt = CI0->getValue();
- Elt = Elt.lshr(Index).zextOrTrunc(Length);
+ Elt.lshrInPlace(Index);
+ Elt = Elt.zextOrTrunc(Length);
return LowConstantHighUndef(Elt.getZExtValue());
}
@@ -693,7 +858,7 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
}
// Constant Fold - extraction from zero is always {zero, undef}.
- if (CI0 && CI0->equalsInt(0))
+ if (CI0 && CI0->isZero())
return LowConstantHighUndef(0);
return nullptr;
@@ -876,7 +1041,7 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
// The PD variants use bit 1 to select the per-lane element index, so
// shift down to convert to a generic shuffle mask index.
if (IsPD)
- Index = Index.lshr(1);
+ Index.lshrInPlace(1);
// The _256 variants are a bit trickier since the mask bits always index
// into the corresponding 128-bit half. In order to convert to a generic
@@ -1211,42 +1376,78 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
II.getIntrinsicID() == Intrinsic::ctlz) &&
"Expected cttz or ctlz intrinsic");
Value *Op0 = II.getArgOperand(0);
- // FIXME: Try to simplify vectors of integers.
- auto *IT = dyn_cast<IntegerType>(Op0->getType());
- if (!IT)
- return nullptr;
- unsigned BitWidth = IT->getBitWidth();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- IC.computeKnownBits(Op0, KnownZero, KnownOne, 0, &II);
+ KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
// Create a mask for bits above (ctlz) or below (cttz) the first known one.
bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
- unsigned NumMaskBits = IsTZ ? KnownOne.countTrailingZeros()
- : KnownOne.countLeadingZeros();
- APInt Mask = IsTZ ? APInt::getLowBitsSet(BitWidth, NumMaskBits)
- : APInt::getHighBitsSet(BitWidth, NumMaskBits);
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
// If all bits above (ctlz) or below (cttz) the first known one are known
// zero, this value is constant.
// FIXME: This should be in InstSimplify because we're replacing an
// instruction with a constant.
- if ((Mask & KnownZero) == Mask) {
- auto *C = ConstantInt::get(IT, APInt(BitWidth, NumMaskBits));
+ if (PossibleZeros == DefiniteZeros) {
+ auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
return IC.replaceInstUsesWith(II, C);
}
// If the input to cttz/ctlz is known to be non-zero,
// then change the 'ZeroIsUndef' parameter to 'true'
// because we know the zero behavior can't affect the result.
- if (KnownOne != 0 || isKnownNonZero(Op0, IC.getDataLayout())) {
+ if (!Known.One.isNullValue() ||
+ isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
+ &IC.getDominatorTree())) {
if (!match(II.getArgOperand(1), m_One())) {
- II.setOperand(1, IC.Builder->getTrue());
+ II.setOperand(1, IC.Builder.getTrue());
return &II;
}
}
+ // Add range metadata since known bits can't completely reflect what we know.
+ // TODO: Handle splat vectors.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
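// How the cttz bounds above arise from known bits, modeled on 8-bit values in
// plain C++: low bits known to be zero give the minimum count, and the lowest
// bit known to be one caps the maximum count. Illustrative sketch only.
#include <cstdint>

unsigned minTrailingZeros(uint8_t KnownZero) {
  unsigned N = 0;                        // count low bits proven zero
  while (N < 8 && (KnownZero & (1u << N))) ++N;
  return N;
}

unsigned maxTrailingZeros(uint8_t KnownOne) {
  unsigned N = 0;                        // bits below the lowest proven one
  while (N < 8 && !(KnownOne & (1u << N))) ++N;
  return N;
}
// When the two bounds agree, cttz folds to that constant, as in the code above.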
+static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
+ assert(II.getIntrinsicID() == Intrinsic::ctpop &&
+ "Expected ctpop intrinsic");
+ Value *Op0 = II.getArgOperand(0);
+ // FIXME: Try to simplify vectors of integers.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (!IT)
+ return nullptr;
+
+ unsigned BitWidth = IT->getBitWidth();
+ KnownBits Known(BitWidth);
+ IC.computeKnownBits(Op0, Known, 0, &II);
+
+ unsigned MinCount = Known.countMinPopulation();
+ unsigned MaxCount = Known.countMaxPopulation();
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
return nullptr;
}
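// The ctpop range above in scalar form: the population count is at least the
// number of known-one bits and at most the bit width minus the number of
// known-zero bits. An 8-bit plain C++ model for illustration.
#include <cstdint>

unsigned popcount8(uint8_t V) {
  unsigned N = 0;
  for (; V; V &= V - 1) ++N;             // clear the lowest set bit each step
  return N;
}

unsigned minPopulation(uint8_t KnownOne) { return popcount8(KnownOne); }
unsigned maxPopulation(uint8_t KnownZero) { return 8 - popcount8(KnownZero); }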
@@ -1274,7 +1475,7 @@ static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
// the LLVM intrinsic definition for the pointer argument.
unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
- Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
// Second, convert the x86 XMM integer vector mask to a vector of bools based
// on each element's most significant bit (the sign bit).
@@ -1282,7 +1483,7 @@ static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
// The pass-through vector for an x86 masked load is a zero vector.
CallInst *NewMaskedLoad =
- IC.Builder->CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
+ IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
return IC.replaceInstUsesWith(II, NewMaskedLoad);
}
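// The mask conversion this transform relies on, in scalar form: an x86 vector
// mask element enables its lane iff the most significant (sign) bit is set,
// which is exactly a signed "< 0" test. Plain C++ illustration.
#include <cstdint>

bool laneEnabled(int32_t MaskElt) {
  return MaskElt < 0; // true iff the sign bit of the element is set
}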
@@ -1317,19 +1518,40 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
// the LLVM intrinsic definition for the pointer argument.
unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
- Value *PtrCast = IC.Builder->CreateBitCast(Ptr, VecPtrTy, "castvec");
+ Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
// Second, convert the x86 XMM integer vector mask to a vector of bools based
// on each element's most significant bit (the sign bit).
Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
- IC.Builder->CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
+ IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
// 'Replace uses' doesn't work for stores. Erase the original masked store.
IC.eraseInstFromFunction(II);
return true;
}
+// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
+//
+// A single NaN input is folded to minnum, so we rely on that folding for
+// handling NaNs.
+static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
+ const APFloat &Src2) {
+ APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
+
+ APFloat::cmpResult Cmp0 = Max3.compare(Src0);
+ assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp0 == APFloat::cmpEqual)
+ return maxnum(Src1, Src2);
+
+ APFloat::cmpResult Cmp1 = Max3.compare(Src1);
+ assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp1 == APFloat::cmpEqual)
+ return maxnum(Src0, Src2);
+
+ return maxnum(Src0, Src1);
+}
+
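// A quick scalar cross-check of the median logic above, using the common
// min/max network formulation, which agrees with "drop the max, then take the
// max of the rest" for finite inputs. Standalone C++ sketch, NaNs excluded.
#include <algorithm>
#include <cassert>

double fmed3Ref(double A, double B, double C) {
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}

void fmed3Example() {
  assert(fmed3Ref(3.0, 1.0, 2.0) == 2.0);
  assert(fmed3Ref(1.0, 2.0, 3.0) == 2.0);
}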
// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -1373,6 +1595,254 @@ static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
return false;
}
+// Convert NVVM intrinsics to target-generic LLVM code where possible.
+static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
+ // Each NVVM intrinsic we can simplify can be replaced with one of:
+ //
+ // * an LLVM intrinsic,
+ // * an LLVM cast operation,
+ // * an LLVM binary operation, or
+ // * ad-hoc LLVM IR for the particular operation.
+
+ // Some transformations are only valid when the module's
+ // flush-denormals-to-zero (ftz) setting is true/false, whereas other
+ // transformations are valid regardless of the module's ftz setting.
+ enum FtzRequirementTy {
+ FTZ_Any, // Any ftz setting is ok.
+ FTZ_MustBeOn, // Transformation is valid only if ftz is on.
+ FTZ_MustBeOff, // Transformation is valid only if ftz is off.
+ };
+ // Classes of NVVM intrinsics that can't be replaced one-to-one with a
+ // target-generic intrinsic, cast op, or binary op but that we can nonetheless
+ // simplify.
+ enum SpecialCase {
+ SPC_Reciprocal,
+ };
+
+ // SimplifyAction is a poor-man's variant (plus an additional flag) that
+ // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
+ struct SimplifyAction {
+ // Invariant: At most one of these Optionals has a value.
+ Optional<Intrinsic::ID> IID;
+ Optional<Instruction::CastOps> CastOp;
+ Optional<Instruction::BinaryOps> BinaryOp;
+ Optional<SpecialCase> Special;
+
+ FtzRequirementTy FtzRequirement = FTZ_Any;
+
+ SimplifyAction() = default;
+
+ SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
+ : IID(IID), FtzRequirement(FtzReq) {}
+
+ // Cast operations don't have anything to do with FTZ, so we skip that
+ // argument.
+ SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
+
+ SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
+ : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
+
+ SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
+ : Special(Special), FtzRequirement(FtzReq) {}
+ };
+
+ // Try to generate a SimplifyAction describing how to replace our
+ // IntrinsicInstr with target-generic LLVM IR.
+ const SimplifyAction Action = [II]() -> SimplifyAction {
+ switch (II->getIntrinsicID()) {
+
+ // NVVM intrinsics that map directly to LLVM intrinsics.
+ case Intrinsic::nvvm_ceil_d:
+ return {Intrinsic::ceil, FTZ_Any};
+ case Intrinsic::nvvm_ceil_f:
+ return {Intrinsic::ceil, FTZ_MustBeOff};
+ case Intrinsic::nvvm_ceil_ftz_f:
+ return {Intrinsic::ceil, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fabs_d:
+ return {Intrinsic::fabs, FTZ_Any};
+ case Intrinsic::nvvm_fabs_f:
+ return {Intrinsic::fabs, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fabs_ftz_f:
+ return {Intrinsic::fabs, FTZ_MustBeOn};
+ case Intrinsic::nvvm_floor_d:
+ return {Intrinsic::floor, FTZ_Any};
+ case Intrinsic::nvvm_floor_f:
+ return {Intrinsic::floor, FTZ_MustBeOff};
+ case Intrinsic::nvvm_floor_ftz_f:
+ return {Intrinsic::floor, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fma_rn_d:
+ return {Intrinsic::fma, FTZ_Any};
+ case Intrinsic::nvvm_fma_rn_f:
+ return {Intrinsic::fma, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fma_rn_ftz_f:
+ return {Intrinsic::fma, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmax_d:
+ return {Intrinsic::maxnum, FTZ_Any};
+ case Intrinsic::nvvm_fmax_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmax_ftz_f:
+ return {Intrinsic::maxnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_fmin_d:
+ return {Intrinsic::minnum, FTZ_Any};
+ case Intrinsic::nvvm_fmin_f:
+ return {Intrinsic::minnum, FTZ_MustBeOff};
+ case Intrinsic::nvvm_fmin_ftz_f:
+ return {Intrinsic::minnum, FTZ_MustBeOn};
+ case Intrinsic::nvvm_round_d:
+ return {Intrinsic::round, FTZ_Any};
+ case Intrinsic::nvvm_round_f:
+ return {Intrinsic::round, FTZ_MustBeOff};
+ case Intrinsic::nvvm_round_ftz_f:
+ return {Intrinsic::round, FTZ_MustBeOn};
+ case Intrinsic::nvvm_sqrt_rn_d:
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_f:
+ // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
+ // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
+ // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
+ // the versions with explicit ftz-ness.
+ return {Intrinsic::sqrt, FTZ_Any};
+ case Intrinsic::nvvm_sqrt_rn_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOff};
+ case Intrinsic::nvvm_sqrt_rn_ftz_f:
+ return {Intrinsic::sqrt, FTZ_MustBeOn};
+ case Intrinsic::nvvm_trunc_d:
+ return {Intrinsic::trunc, FTZ_Any};
+ case Intrinsic::nvvm_trunc_f:
+ return {Intrinsic::trunc, FTZ_MustBeOff};
+ case Intrinsic::nvvm_trunc_ftz_f:
+ return {Intrinsic::trunc, FTZ_MustBeOn};
+
+ // NVVM intrinsics that map to LLVM cast operations.
+ //
+ // Note that llvm's target-generic conversion operators correspond to the rz
+ // (round to zero) versions of the nvvm conversion intrinsics, even though
+ // almost everything else here uses the rn (round to nearest even) nvvm ops.
+ case Intrinsic::nvvm_d2i_rz:
+ case Intrinsic::nvvm_f2i_rz:
+ case Intrinsic::nvvm_d2ll_rz:
+ case Intrinsic::nvvm_f2ll_rz:
+ return {Instruction::FPToSI};
+ case Intrinsic::nvvm_d2ui_rz:
+ case Intrinsic::nvvm_f2ui_rz:
+ case Intrinsic::nvvm_d2ull_rz:
+ case Intrinsic::nvvm_f2ull_rz:
+ return {Instruction::FPToUI};
+ case Intrinsic::nvvm_i2d_rz:
+ case Intrinsic::nvvm_i2f_rz:
+ case Intrinsic::nvvm_ll2d_rz:
+ case Intrinsic::nvvm_ll2f_rz:
+ return {Instruction::SIToFP};
+ case Intrinsic::nvvm_ui2d_rz:
+ case Intrinsic::nvvm_ui2f_rz:
+ case Intrinsic::nvvm_ull2d_rz:
+ case Intrinsic::nvvm_ull2f_rz:
+ return {Instruction::UIToFP};
+
+ // NVVM intrinsics that map to LLVM binary ops.
+ case Intrinsic::nvvm_add_rn_d:
+ return {Instruction::FAdd, FTZ_Any};
+ case Intrinsic::nvvm_add_rn_f:
+ return {Instruction::FAdd, FTZ_MustBeOff};
+ case Intrinsic::nvvm_add_rn_ftz_f:
+ return {Instruction::FAdd, FTZ_MustBeOn};
+ case Intrinsic::nvvm_mul_rn_d:
+ return {Instruction::FMul, FTZ_Any};
+ case Intrinsic::nvvm_mul_rn_f:
+ return {Instruction::FMul, FTZ_MustBeOff};
+ case Intrinsic::nvvm_mul_rn_ftz_f:
+ return {Instruction::FMul, FTZ_MustBeOn};
+ case Intrinsic::nvvm_div_rn_d:
+ return {Instruction::FDiv, FTZ_Any};
+ case Intrinsic::nvvm_div_rn_f:
+ return {Instruction::FDiv, FTZ_MustBeOff};
+ case Intrinsic::nvvm_div_rn_ftz_f:
+ return {Instruction::FDiv, FTZ_MustBeOn};
+
+ // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
+ // need special handling.
+ //
+ // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
+ // as well.
+ case Intrinsic::nvvm_rcp_rn_d:
+ return {SPC_Reciprocal, FTZ_Any};
+ case Intrinsic::nvvm_rcp_rn_f:
+ return {SPC_Reciprocal, FTZ_MustBeOff};
+ case Intrinsic::nvvm_rcp_rn_ftz_f:
+ return {SPC_Reciprocal, FTZ_MustBeOn};
+
+ // We do not currently simplify intrinsics that give an approximate answer.
+ // These include:
+ //
+ // - nvvm_cos_approx_{f,ftz_f}
+ // - nvvm_ex2_approx_{d,f,ftz_f}
+ // - nvvm_lg2_approx_{d,f,ftz_f}
+ // - nvvm_sin_approx_{f,ftz_f}
+ // - nvvm_sqrt_approx_{f,ftz_f}
+ // - nvvm_rsqrt_approx_{d,f,ftz_f}
+ // - nvvm_div_approx_{ftz_d,ftz_f,f}
+ // - nvvm_rcp_approx_ftz_d
+ //
+ // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
+ // means that fastmath is enabled in the intrinsic. Unfortunately only
+ // binary operators (currently) have a fastmath bit in SelectionDAG, so this
+ // information gets lost and we can't select on it.
+ //
+ // TODO: div and rcp are lowered to a binary op, so those we could in theory
+ // lower to "fast fdiv".
+
+ default:
+ return {};
+ }
+ }();
+
+ // If Action.FtzRequirement is not satisfied by the function's ftz state, we
+ // can bail out now. (Notice that in the case that IID is not an NVVM
+ // intrinsic, we don't have to look up any attributes, as FtzRequirement
+ // will be FTZ_Any.)
+ if (Action.FtzRequirement != FTZ_Any) {
+ bool FtzEnabled =
+ II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
+ "true";
+
+ if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
+ return nullptr;
+ }
+
+ // Simplify to target-generic intrinsic.
+ if (Action.IID) {
+ SmallVector<Value *, 4> Args(II->arg_operands());
+ // All the target-generic intrinsics currently of interest to us have one
+ // type argument, equal to that of the nvvm intrinsic's argument.
+ Type *Tys[] = {II->getArgOperand(0)->getType()};
+ return CallInst::Create(
+ Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
+ }
+
+ // Simplify to target-generic binary op.
+ if (Action.BinaryOp)
+ return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
+ II->getArgOperand(1), II->getName());
+
+ // Simplify to target-generic cast op.
+ if (Action.CastOp)
+ return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
+ II->getName());
+
+ // All that's left are the special cases.
+ if (!Action.Special)
+ return nullptr;
+
+ switch (*Action.Special) {
+ case SPC_Reciprocal:
+ // Simplify reciprocal.
+ return BinaryOperator::Create(
+ Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
+ II->getArgOperand(0), II->getName());
+ }
+ llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
+}
+
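// The dispatch pattern above in miniature: a struct of optionals in which at
// most one member is engaged, filled in by a switch and consumed by a chain
// of "if engaged" checks. A standalone C++17 sketch with made-up enum values,
// only to illustrate the SimplifyAction shape.
#include <optional>

enum class BinOp { FAdd, FMul };
enum class Special { Reciprocal };

struct Action {
  std::optional<BinOp> BinaryOp;
  std::optional<Special> SpecialCase;
};

Action classify(int IntrinsicID) {
  switch (IntrinsicID) {
  case 0: return {BinOp::FAdd, std::nullopt};
  case 1: return {std::nullopt, Special::Reciprocal};
  default: return {}; // nothing to do; both optionals stay empty
  }
}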
Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
return nullptr;
@@ -1388,8 +1858,8 @@ Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
/// lifting.
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
auto Args = CI.arg_operands();
- if (Value *V = SimplifyCall(CI.getCalledValue(), Args.begin(), Args.end(), DL,
- &TLI, &DT, &AC))
+ if (Value *V = SimplifyCall(&CI, CI.getCalledValue(), Args.begin(),
+ Args.end(), SQ.getWithInstruction(&CI)))
return replaceInstUsesWith(CI, V);
if (isFreeCall(&CI, &TLI))
@@ -1462,6 +1932,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Changed) return II;
}
+ if (auto *AMI = dyn_cast<ElementUnorderedAtomicMemCpyInst>(II)) {
+ if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
+ if (C->isNullValue())
+ return eraseInstFromFunction(*AMI);
+
+ if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
+ return I;
+ }
+
+ if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
+ return I;
+
auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
unsigned DemandedWidth) {
APInt UndefElts(Width, 0);
@@ -1481,16 +1963,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *IIOperand = II->getArgOperand(0);
Value *X = nullptr;
+ // TODO should this be in InstSimplify?
// bswap(bswap(x)) -> x
if (match(IIOperand, m_BSwap(m_Value(X))))
- return replaceInstUsesWith(CI, X);
+ return replaceInstUsesWith(CI, X);
// bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
unsigned C = X->getType()->getPrimitiveSizeInBits() -
IIOperand->getType()->getPrimitiveSizeInBits();
Value *CV = ConstantInt::get(X->getType(), C);
- Value *V = Builder->CreateLShr(X, CV);
+ Value *V = Builder.CreateLShr(X, CV);
return new TruncInst(V, IIOperand->getType());
}
break;
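// The bswap(trunc(bswap(x))) fold above, checked in plain C++ for the
// 32-to-16-bit case using the GCC/Clang __builtin_bswap builtins: swapping,
// truncating, and swapping again is the same as taking the high half.
#include <cassert>
#include <cstdint>

void bswapTruncExample() {
  uint32_t X = 0x11223344;
  uint16_t Swapped = __builtin_bswap16(uint16_t(__builtin_bswap32(X)));
  uint16_t HighHalf = uint16_t(X >> 16); // lshr by (32 - 16), then truncate
  assert(Swapped == HighHalf);
}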
@@ -1500,14 +1983,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *IIOperand = II->getArgOperand(0);
Value *X = nullptr;
+ // TODO should this be in InstSimplify?
// bitreverse(bitreverse(x)) -> x
- if (match(IIOperand, m_Intrinsic<Intrinsic::bitreverse>(m_Value(X))))
+ if (match(IIOperand, m_BitReverse(m_Value(X))))
return replaceInstUsesWith(CI, X);
break;
}
case Intrinsic::masked_load:
- if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, *Builder))
+ if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, Builder))
return replaceInstUsesWith(CI, SimplifiedMaskedOp);
break;
case Intrinsic::masked_store:
@@ -1526,7 +2010,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Power->isOne())
return replaceInstUsesWith(CI, II->getArgOperand(0));
// powi(x, -1) -> 1/x
- if (Power->isAllOnesValue())
+ if (Power->isMinusOne())
return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
II->getArgOperand(0));
}
@@ -1538,6 +2022,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return I;
break;
+ case Intrinsic::ctpop:
+ if (auto *I = foldCtpop(*II, *this))
+ return I;
+ break;
+
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
case Intrinsic::umul_with_overflow:
@@ -1581,8 +2070,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
}
- case Intrinsic::fma:
case Intrinsic::fmuladd: {
+ // Canonicalize fast fmuladd to the separate fmul + fadd.
+ if (II->hasUnsafeAlgebra()) {
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(II->getFastMathFlags());
+ Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
+ II->getArgOperand(1));
+ Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
+ Add->takeName(II);
+ return replaceInstUsesWith(*II, Add);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::fma: {
Value *Src0 = II->getArgOperand(0);
Value *Src1 = II->getArgOperand(1);
@@ -1626,11 +2128,31 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Constant *LHS, *RHS;
if (match(II->getArgOperand(0),
m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
- CallInst *Call0 = Builder->CreateCall(II->getCalledFunction(), {LHS});
- CallInst *Call1 = Builder->CreateCall(II->getCalledFunction(), {RHS});
+ CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS});
+ CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS});
return SelectInst::Create(Cond, Call0, Call1);
}
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::round:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::trunc: {
+ Value *ExtSrc;
+ if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
+ II->getArgOperand(0)->hasOneUse()) {
+ // op (fpext x) -> fpext (op x), for fabs and the rounding intrinsics above
+ Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
+ { ExtSrc->getType() });
+ CallInst *NewFabs = Builder.CreateCall(F, ExtSrc);
+ NewFabs->copyFastMathFlags(II);
+ NewFabs->takeName(II);
+ return new FPExtInst(NewFabs, II->getType());
+ }
+
break;
}
case Intrinsic::cos:
@@ -1652,7 +2174,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Turn PPC lvx -> load if the pointer is known aligned.
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
&DT) >= 16) {
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
return new LoadInst(Ptr);
}
@@ -1660,8 +2182,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_vsx_lxvw4x:
case Intrinsic::ppc_vsx_lxvd2x: {
// Turn PPC VSX loads into normal loads.
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
- PointerType::getUnqual(II->getType()));
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
+ PointerType::getUnqual(II->getType()));
return new LoadInst(Ptr, Twine(""), false, 1);
}
case Intrinsic::ppc_altivec_stvx:
@@ -1671,7 +2193,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
&DT) >= 16) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
return new StoreInst(II->getArgOperand(0), Ptr);
}
break;
@@ -1679,18 +2201,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_vsx_stxvd2x: {
// Turn PPC VSX stores into normal stores.
Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
}
case Intrinsic::ppc_qpx_qvlfs:
// Turn PPC QPX qvlfs -> load if the pointer is known aligned.
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
&DT) >= 16) {
- Type *VTy = VectorType::get(Builder->getFloatTy(),
+ Type *VTy = VectorType::get(Builder.getFloatTy(),
II->getType()->getVectorNumElements());
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(VTy));
- Value *Load = Builder->CreateLoad(Ptr);
+ Value *Load = Builder.CreateLoad(Ptr);
return new FPExtInst(Load, II->getType());
}
break;
@@ -1698,7 +2220,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Turn PPC QPX qvlfd -> load if the pointer is known aligned.
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
&DT) >= 32) {
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0),
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
return new LoadInst(Ptr);
}
@@ -1707,11 +2229,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Turn PPC QPX qvstfs -> store if the pointer is known aligned.
if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
&DT) >= 16) {
- Type *VTy = VectorType::get(Builder->getFloatTy(),
+ Type *VTy = VectorType::get(Builder.getFloatTy(),
II->getArgOperand(0)->getType()->getVectorNumElements());
- Value *TOp = Builder->CreateFPTrunc(II->getArgOperand(0), VTy);
+ Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy);
Type *OpPtrTy = PointerType::getUnqual(VTy);
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
return new StoreInst(TOp, Ptr);
}
break;
@@ -1721,7 +2243,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
&DT) >= 32) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
- Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
+ Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
return new StoreInst(II->getArgOperand(0), Ptr);
}
break;
@@ -1750,15 +2272,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
SmallVector<uint32_t, 8> SubVecMask;
for (unsigned i = 0; i != RetWidth; ++i)
SubVecMask.push_back((int)i);
- VectorHalfAsShorts = Builder->CreateShuffleVector(
+ VectorHalfAsShorts = Builder.CreateShuffleVector(
Arg, UndefValue::get(ArgType), SubVecMask);
}
auto VectorHalfType =
VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
auto VectorHalfs =
- Builder->CreateBitCast(VectorHalfAsShorts, VectorHalfType);
- auto VectorFloats = Builder->CreateFPExt(VectorHalfs, RetType);
+ Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType);
+ auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType);
return replaceInstUsesWith(*II, VectorFloats);
}
@@ -1812,7 +2334,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx_movmsk_pd_256:
case Intrinsic::x86_avx_movmsk_ps_256:
case Intrinsic::x86_avx2_pmovmskb: {
- if (Value *V = simplifyX86movmsk(*II, *Builder))
+ if (Value *V = simplifyX86movmsk(*II))
return replaceInstUsesWith(*II, V);
break;
}
@@ -1863,6 +2385,37 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
break;
}
+ case Intrinsic::x86_avx512_mask_cmp_pd_128:
+ case Intrinsic::x86_avx512_mask_cmp_pd_256:
+ case Intrinsic::x86_avx512_mask_cmp_pd_512:
+ case Intrinsic::x86_avx512_mask_cmp_ps_128:
+ case Intrinsic::x86_avx512_mask_cmp_ps_256:
+ case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+ // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ bool Arg0IsZero = match(Arg0, m_Zero());
+ if (Arg0IsZero)
+ std::swap(Arg0, Arg1);
+ Value *A, *B;
+ // This fold requires only the NINF (no +/- infinities) flag, since
+ // inf minus inf is NaN.
+ // NSZ (no signed zeros) is not needed because zeros of any sign
+ // compare equal for both compares.
+ // NNAN is not needed because NaNs compare the same for both compares.
+ // The compare intrinsic relies on the above assumptions and therefore
+ // doesn't require additional flags.
+ if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
+ match(Arg1, m_Zero()) &&
+ cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
+ if (Arg0IsZero)
+ std::swap(A, B);
+ II->setArgOperand(0, A);
+ II->setArgOperand(1, B);
+ return II;
+ }
+ break;
+ }
case Intrinsic::x86_avx512_mask_add_ps_512:
case Intrinsic::x86_avx512_mask_div_ps_512:
@@ -1884,25 +2437,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
default: llvm_unreachable("Case stmts out of sync!");
case Intrinsic::x86_avx512_mask_add_ps_512:
case Intrinsic::x86_avx512_mask_add_pd_512:
- V = Builder->CreateFAdd(Arg0, Arg1);
+ V = Builder.CreateFAdd(Arg0, Arg1);
break;
case Intrinsic::x86_avx512_mask_sub_ps_512:
case Intrinsic::x86_avx512_mask_sub_pd_512:
- V = Builder->CreateFSub(Arg0, Arg1);
+ V = Builder.CreateFSub(Arg0, Arg1);
break;
case Intrinsic::x86_avx512_mask_mul_ps_512:
case Intrinsic::x86_avx512_mask_mul_pd_512:
- V = Builder->CreateFMul(Arg0, Arg1);
+ V = Builder.CreateFMul(Arg0, Arg1);
break;
case Intrinsic::x86_avx512_mask_div_ps_512:
case Intrinsic::x86_avx512_mask_div_pd_512:
- V = Builder->CreateFDiv(Arg0, Arg1);
+ V = Builder.CreateFDiv(Arg0, Arg1);
break;
}
// Create a select for the masking.
V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- *Builder);
+ Builder);
return replaceInstUsesWith(*II, V);
}
}
@@ -1923,27 +2476,27 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Extract the element as scalars.
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
- Value *LHS = Builder->CreateExtractElement(Arg0, (uint64_t)0);
- Value *RHS = Builder->CreateExtractElement(Arg1, (uint64_t)0);
+ Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0);
+ Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0);
Value *V;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Case stmts out of sync!");
case Intrinsic::x86_avx512_mask_add_ss_round:
case Intrinsic::x86_avx512_mask_add_sd_round:
- V = Builder->CreateFAdd(LHS, RHS);
+ V = Builder.CreateFAdd(LHS, RHS);
break;
case Intrinsic::x86_avx512_mask_sub_ss_round:
case Intrinsic::x86_avx512_mask_sub_sd_round:
- V = Builder->CreateFSub(LHS, RHS);
+ V = Builder.CreateFSub(LHS, RHS);
break;
case Intrinsic::x86_avx512_mask_mul_ss_round:
case Intrinsic::x86_avx512_mask_mul_sd_round:
- V = Builder->CreateFMul(LHS, RHS);
+ V = Builder.CreateFMul(LHS, RHS);
break;
case Intrinsic::x86_avx512_mask_div_ss_round:
case Intrinsic::x86_avx512_mask_div_sd_round:
- V = Builder->CreateFDiv(LHS, RHS);
+ V = Builder.CreateFDiv(LHS, RHS);
break;
}
@@ -1953,18 +2506,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// We don't need a select if we know the mask bit is a 1.
if (!C || !C->getValue()[0]) {
// Cast the mask to an i1 vector and then extract the lowest element.
- auto *MaskTy = VectorType::get(Builder->getInt1Ty(),
+ auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
cast<IntegerType>(Mask->getType())->getBitWidth());
- Mask = Builder->CreateBitCast(Mask, MaskTy);
- Mask = Builder->CreateExtractElement(Mask, (uint64_t)0);
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+ Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
// Extract the lowest element from the passthru operand.
- Value *Passthru = Builder->CreateExtractElement(II->getArgOperand(2),
+ Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2),
(uint64_t)0);
- V = Builder->CreateSelect(Mask, V, Passthru);
+ V = Builder.CreateSelect(Mask, V, Passthru);
}
// Insert the result back into the original argument 0.
- V = Builder->CreateInsertElement(Arg0, V, (uint64_t)0);
+ V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
return replaceInstUsesWith(*II, V);
}
@@ -2045,7 +2598,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx512_pslli_d_512:
case Intrinsic::x86_avx512_pslli_q_512:
case Intrinsic::x86_avx512_pslli_w_512:
- if (Value *V = simplifyX86immShift(*II, *Builder))
+ if (Value *V = simplifyX86immShift(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -2076,7 +2629,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx512_psll_d_512:
case Intrinsic::x86_avx512_psll_q_512:
case Intrinsic::x86_avx512_psll_w_512: {
- if (Value *V = simplifyX86immShift(*II, *Builder))
+ if (Value *V = simplifyX86immShift(*II, Builder))
return replaceInstUsesWith(*II, V);
// SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
@@ -2120,7 +2673,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx512_psrlv_w_128:
case Intrinsic::x86_avx512_psrlv_w_256:
case Intrinsic::x86_avx512_psrlv_w_512:
- if (Value *V = simplifyX86varShift(*II, *Builder))
+ if (Value *V = simplifyX86varShift(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -2130,6 +2683,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_pmulu_dq:
case Intrinsic::x86_avx512_pmul_dq_512:
case Intrinsic::x86_avx512_pmulu_dq_512: {
+ if (Value *V = simplifyX86muldq(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+
unsigned VWidth = II->getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
APInt DemandedElts = APInt::getAllOnesValue(VWidth);
@@ -2141,8 +2697,66 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ if (Value *V = simplifyX86pack(*II, true))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512:
+ if (Value *V = simplifyX86pack(*II, false))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::x86_pclmulqdq: {
+ if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
+ unsigned Imm = C->getZExtValue();
+
+ bool MadeChange = false;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ APInt DemandedElts(VWidth, 0);
+
+ APInt UndefElts1(VWidth, 0);
+ DemandedElts = (Imm & 0x01) ? 2 : 1;
+ if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
+ UndefElts1)) {
+ II->setArgOperand(0, V);
+ MadeChange = true;
+ }
+
+ APInt UndefElts2(VWidth, 0);
+ DemandedElts = (Imm & 0x10) ? 2 : 1;
+ if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
+ UndefElts2)) {
+ II->setArgOperand(1, V);
+ MadeChange = true;
+ }
+
+ // If either demanded input element is undef, the result is zero.
+ if (UndefElts1[(Imm & 0x01) ? 1 : 0] ||
+ UndefElts2[(Imm & 0x10) ? 1 : 0])
+ return replaceInstUsesWith(*II,
+ ConstantAggregateZero::get(II->getType()));
+
+ if (MadeChange)
+ return II;
+ }
+ break;
+ }
+
case Intrinsic::x86_sse41_insertps:
- if (Value *V = simplifyX86insertps(*II, *Builder))
+ if (Value *V = simplifyX86insertps(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -2165,7 +2779,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
: nullptr;
// Attempt to simplify to a constant, shuffle vector or EXTRQI call.
- if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder))
+ if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
return replaceInstUsesWith(*II, V);
// EXTRQ only uses the lowest 64-bits of the first 128-bit vector
@@ -2197,7 +2811,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2));
// Attempt to simplify to a constant or shuffle vector.
- if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, *Builder))
+ if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
return replaceInstUsesWith(*II, V);
// EXTRQI only uses the lowest 64-bits of the first 128-bit vector
@@ -2229,7 +2843,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
const APInt &V11 = CI11->getValue();
APInt Len = V11.zextOrTrunc(6);
APInt Idx = V11.lshr(8).zextOrTrunc(6);
- if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder))
+ if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
return replaceInstUsesWith(*II, V);
}
@@ -2262,7 +2876,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (CILength && CIIndex) {
APInt Len = CILength->getValue().zextOrTrunc(6);
APInt Idx = CIIndex->getValue().zextOrTrunc(6);
- if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, *Builder))
+ if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
return replaceInstUsesWith(*II, V);
}
@@ -2316,7 +2930,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_ssse3_pshuf_b_128:
case Intrinsic::x86_avx2_pshuf_b:
case Intrinsic::x86_avx512_pshuf_b_512:
- if (Value *V = simplifyX86pshufb(*II, *Builder))
+ if (Value *V = simplifyX86pshufb(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -2326,13 +2940,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx_vpermilvar_pd:
case Intrinsic::x86_avx_vpermilvar_pd_256:
case Intrinsic::x86_avx512_vpermilvar_pd_512:
- if (Value *V = simplifyX86vpermilvar(*II, *Builder))
+ if (Value *V = simplifyX86vpermilvar(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
- if (Value *V = simplifyX86vpermv(*II, *Builder))
+ if (Value *V = simplifyX86vpermv(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -2350,10 +2964,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx512_mask_permvar_sf_512:
case Intrinsic::x86_avx512_mask_permvar_si_256:
case Intrinsic::x86_avx512_mask_permvar_si_512:
- if (Value *V = simplifyX86vpermv(*II, *Builder)) {
+ if (Value *V = simplifyX86vpermv(*II, Builder)) {
// We simplified the permuting, now create a select for the masking.
V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- *Builder);
+ Builder);
return replaceInstUsesWith(*II, V);
}
break;
@@ -2362,7 +2976,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx_vperm2f128_ps_256:
case Intrinsic::x86_avx_vperm2f128_si_256:
case Intrinsic::x86_avx2_vperm2i128:
- if (Value *V = simplifyX86vperm2(*II, *Builder))
+ if (Value *V = simplifyX86vperm2(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
@@ -2395,7 +3009,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_xop_vpcomd:
case Intrinsic::x86_xop_vpcomq:
case Intrinsic::x86_xop_vpcomw:
- if (Value *V = simplifyX86vpcom(*II, *Builder, true))
+ if (Value *V = simplifyX86vpcom(*II, Builder, true))
return replaceInstUsesWith(*II, V);
break;
@@ -2403,7 +3017,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_xop_vpcomud:
case Intrinsic::x86_xop_vpcomuq:
case Intrinsic::x86_xop_vpcomuw:
- if (Value *V = simplifyX86vpcom(*II, *Builder, false))
+ if (Value *V = simplifyX86vpcom(*II, Builder, false))
return replaceInstUsesWith(*II, V);
break;
@@ -2430,10 +3044,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (AllEltsOk) {
// Cast the input vectors to byte vectors.
- Value *Op0 = Builder->CreateBitCast(II->getArgOperand(0),
- Mask->getType());
- Value *Op1 = Builder->CreateBitCast(II->getArgOperand(1),
- Mask->getType());
+ Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0),
+ Mask->getType());
+ Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1),
+ Mask->getType());
Value *Result = UndefValue::get(Op0->getType());
// Only extract each element once.
@@ -2453,13 +3067,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
ExtractedElts[Idx] =
- Builder->CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
- Builder->getInt32(Idx&15));
+ Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
+ Builder.getInt32(Idx&15));
}
// Insert this value into the result vector.
- Result = Builder->CreateInsertElement(Result, ExtractedElts[Idx],
- Builder->getInt32(i));
+ Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx],
+ Builder.getInt32(i));
}
return CastInst::Create(Instruction::BitCast, Result, CI.getType());
}
@@ -2531,9 +3145,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
-
case Intrinsic::amdgcn_rcp: {
- if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) {
+ Value *Src = II->getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, Src);
+
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
const APFloat &ArgVal = C->getValueAPF();
APFloat Val(ArgVal.getSemantics(), 1.0);
APFloat::opStatus Status = Val.divide(ArgVal,
@@ -2546,6 +3165,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_rsq: {
+ Value *Src = II->getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(CI, Src);
+ break;
+ }
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp: {
Value *Src = II->getArgOperand(0);
@@ -2611,7 +3238,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Mask == (S_NAN | Q_NAN)) {
// Equivalent of isnan. Replace with standard fcmp.
- Value *FCmp = Builder->CreateFCmpUNO(Src0, Src0);
+ Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0);
FCmp->takeName(II);
return replaceInstUsesWith(*II, FCmp);
}
@@ -2623,7 +3250,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Clamp mask to used bits
if ((Mask & FullMask) != Mask) {
- CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(),
+ CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(),
{ Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) }
);
@@ -2650,6 +3277,289 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
}
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ const fltSemantics &HalfSem
+ = II->getType()->getScalarType()->getFltSemantics();
+ bool LosesInfo;
+ APFloat Val0 = C0->getValueAPF();
+ APFloat Val1 = C1->getValueAPF();
+ Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+ Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+
+ Constant *Folded = ConstantVector::get({
+ ConstantFP::get(II->getContext(), Val0),
+ ConstantFP::get(II->getContext(), Val1) });
+ return replaceInstUsesWith(*II, Folded);
+ }
+ }
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
+ case Intrinsic::amdgcn_ubfe:
+ case Intrinsic::amdgcn_sbfe: {
+ // Decompose simple cases into standard shifts.
+ Value *Src = II->getArgOperand(0);
+ if (isa<UndefValue>(Src))
+ return replaceInstUsesWith(*II, Src);
+
+ unsigned Width;
+ Type *Ty = II->getType();
+ unsigned IntSize = Ty->getIntegerBitWidth();
+
+ ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
+ if (CWidth) {
+ Width = CWidth->getZExtValue();
+ if ((Width & (IntSize - 1)) == 0)
+ return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
+
+ if (Width >= IntSize) {
+ // Hardware ignores high bits, so remove those.
+ II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
+ Width & (IntSize - 1)));
+ return II;
+ }
+ }
+
+ unsigned Offset;
+ ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
+ if (COffset) {
+ Offset = COffset->getZExtValue();
+ if (Offset >= IntSize) {
+ II->setArgOperand(1, ConstantInt::get(COffset->getType(),
+ Offset & (IntSize - 1)));
+ return II;
+ }
+ }
+
+ bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe;
+
+ // TODO: Also emit sub if only width is constant.
+ if (!CWidth && COffset && Offset == 0) {
+ Constant *KSize = ConstantInt::get(COffset->getType(), IntSize);
+ Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2));
+ ShiftVal = Builder.CreateZExt(ShiftVal, II->getType());
+
+ Value *Shl = Builder.CreateShl(Src, ShiftVal);
+ Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal)
+ : Builder.CreateLShr(Shl, ShiftVal);
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
+
+ if (!CWidth || !COffset)
+ break;
+
+ // TODO: This allows folding to undef when the hardware has specific
+ // behavior?
+ if (Offset + Width < IntSize) {
+ Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
+ Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
+ : Builder.CreateLShr(Shl, IntSize - Width);
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
+
+ Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
+ : Builder.CreateLShr(Src, Offset);
+
+ RightShift->takeName(II);
+ return replaceInstUsesWith(*II, RightShift);
+ }
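// The shift decomposition above as scalar C++: extract Width bits starting at
// Offset by shifting the field to the top and then shifting back down,
// logically for ubfe and arithmetically for sbfe. Illustrative sketch,
// assuming 0 < Width and Offset + Width <= 32.
#include <cstdint>

uint32_t ubfe32(uint32_t Src, unsigned Offset, unsigned Width) {
  return (Src << (32 - Offset - Width)) >> (32 - Width);
}

int32_t sbfe32(int32_t Src, unsigned Offset, unsigned Width) {
  int32_t Shl = int32_t(uint32_t(Src) << (32 - Offset - Width));
  return Shl >> (32 - Width); // arithmetic shift on mainstream compilers
}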
+ case Intrinsic::amdgcn_exp:
+ case Intrinsic::amdgcn_exp_compr: {
+ ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1));
+ if (!En) // Illegal.
+ break;
+
+ unsigned EnBits = En->getZExtValue();
+ if (EnBits == 0xf)
+ break; // All inputs enabled.
+
+ bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
+ bool Changed = false;
+ for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
+ if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
+ (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
+ Value *Src = II->getArgOperand(I + 2);
+ if (!isa<UndefValue>(Src)) {
+ II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
+ Changed = true;
+ }
+ }
+ }
+
+ if (Changed)
+ return II;
+
+ break;
+ }
+ case Intrinsic::amdgcn_fmed3: {
+ // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
+ // for the shader.
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ Value *Src2 = II->getArgOperand(2);
+
+ bool Swap = false;
+ // Canonicalize constants to RHS operands.
+ //
+ // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
+ std::swap(Src1, Src2);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (Swap) {
+ II->setArgOperand(0, Src0);
+ II->setArgOperand(1, Src1);
+ II->setArgOperand(2, Src2);
+ return II;
+ }
+
+ if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+ CallInst *NewCall = Builder.CreateMinNum(Src0, Src1);
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
+ APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
+ C2->getValueAPF());
+ return replaceInstUsesWith(*II,
+ ConstantFP::get(Builder.getContext(), Result));
+ }
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp: {
+ const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2));
+ if (!CC)
+ break;
+
+ // Guard against invalid arguments.
+ int64_t CCVal = CC->getZExtValue();
+ bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp;
+ if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
+ CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
+ (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
+ CCVal > CmpInst::LAST_FCMP_PREDICATE)))
+ break;
+
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
+ if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
+ Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
+ if (CCmp->isNullValue()) {
+ return replaceInstUsesWith(
+ *II, ConstantExpr::getSExt(CCmp, II->getType()));
+ }
+
+ // The result of V_ICMP/V_FCMP assembly instructions (which this
+ // intrinsic exposes) is one bit per thread, masked with the EXEC
+ // register (which contains the bitmask of live threads). So a
+ // comparison that always returns true is the same as a read of the
+ // EXEC register.
+ Value *NewF = Intrinsic::getDeclaration(
+ II->getModule(), Intrinsic::read_register, II->getType());
+ Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")};
+ MDNode *MD = MDNode::get(II->getContext(), MDArgs);
+ Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)};
+ CallInst *NewCall = Builder.CreateCall(NewF, Args);
+ NewCall->addAttribute(AttributeList::FunctionIndex,
+ Attribute::Convergent);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ // Canonicalize constants to RHS.
+ CmpInst::Predicate SwapPred
+ = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
+ II->setArgOperand(0, Src1);
+ II->setArgOperand(1, Src0);
+ II->setArgOperand(2, ConstantInt::get(CC->getType(),
+ static_cast<int>(SwapPred)));
+ return II;
+ }
+
+ if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
+ break;
+
+ // Canonicalize compare eq with true value to compare != 0
+ // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
+ // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
+ // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
+ // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
+ Value *ExtSrc;
+ if (CCVal == CmpInst::ICMP_EQ &&
+ ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
+ (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
+ ExtSrc->getType()->isIntegerTy(1)) {
+ II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
+ II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
+ return II;
+ }
+
+ CmpInst::Predicate SrcPred;
+ Value *SrcLHS;
+ Value *SrcRHS;
+
+ // Fold an eq/ne comparison with 0 of a compare result into the predicate of
+ // the intrinsic. The typical use is a wave vote function in the library,
+ // which is fed a user-code condition compared with 0. Fold in the
+ // redundant compare.
+
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
+ // -> llvm.amdgcn.[if]cmp(a, b, pred)
+ //
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
+ // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
+ if (match(Src1, m_Zero()) &&
+ match(Src0,
+ m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
+ if (CCVal == CmpInst::ICMP_EQ)
+ SrcPred = CmpInst::getInversePredicate(SrcPred);
+
+ Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
+ Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;
+
+ Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID,
+ SrcLHS->getType());
+ Value *Args[] = { SrcLHS, SrcRHS,
+ ConstantInt::get(CC->getType(), SrcPred) };
+ CallInst *NewCall = Builder.CreateCall(NewF, Args);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ break;
+ }
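// Scalar intuition for the zext/sext compare canonicalization above: for a
// 1-bit source, "zext x == 1" and "sext x == -1" are each equivalent to
// "!= 0" after the extension. Plain C++ check, illustrative only.
#include <cassert>
#include <cstdint>

void extCompareExample() {
  for (int I = 0; I < 2; ++I) {
    bool X = (I != 0);
    uint32_t Z = X ? 1u : 0u;  // zext i1 -> i32
    int32_t S = X ? -1 : 0;    // sext i1 -> i32
    assert((Z == 1u) == (Z != 0u));
    assert((S == -1) == (S != 0));
  }
}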
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
@@ -2720,16 +3630,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// the InstCombineIRInserter object.
Value *AssumeIntrinsic = II->getCalledValue(), *A, *B;
if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
- Builder->CreateCall(AssumeIntrinsic, A, II->getName());
- Builder->CreateCall(AssumeIntrinsic, B, II->getName());
+ Builder.CreateCall(AssumeIntrinsic, A, II->getName());
+ Builder.CreateCall(AssumeIntrinsic, B, II->getName());
return eraseInstFromFunction(*II);
}
// assume(!(a || b)) -> assume(!a); assume(!b);
if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
- Builder->CreateCall(AssumeIntrinsic, Builder->CreateNot(A),
- II->getName());
- Builder->CreateCall(AssumeIntrinsic, Builder->CreateNot(B),
- II->getName());
+ Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(A), II->getName());
+ Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(B), II->getName());
return eraseInstFromFunction(*II);
}
@@ -2751,9 +3659,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// If there is a dominating assume with the same condition as this one,
// then this one is redundant, and should be removed.
- APInt KnownZero(1, 0), KnownOne(1, 0);
- computeKnownBits(IIOperand, KnownZero, KnownOne, 0, II);
- if (KnownOne.isAllOnesValue())
+ KnownBits Known(1);
+ computeKnownBits(IIOperand, Known, 0, II);
+ if (Known.isAllOnes())
return eraseInstFromFunction(*II);
// Update the cache of affected values for this assumption (we might be
@@ -2790,7 +3698,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// isKnownNonNull -> nonnull attribute
if (isKnownNonNullAt(DerivedPtr, II, &DT))
- II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+ II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
}
// TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
@@ -2799,11 +3707,38 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
break;
}
- }
+ case Intrinsic::experimental_guard: {
+ // Is this guard followed by another guard?
+ Instruction *NextInst = II->getNextNode();
+ Value *NextCond = nullptr;
+ if (match(NextInst,
+ m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
+ Value *CurrCond = II->getArgOperand(0);
+
+ // Remove a guard that is immediately preceded by an identical guard.
+ if (CurrCond == NextCond)
+ return eraseInstFromFunction(*NextInst);
+
+ // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+ II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
+ return eraseInstFromFunction(*NextInst);
+ }
+ break;
+ }
+ }
return visitCallSite(II);
}
+// Fence instruction simplification
+Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
+ // Remove identical consecutive fences.
+ if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode()))
+ if (FI.isIdenticalTo(NFI))
+ return eraseInstFromFunction(FI);
+ return nullptr;
+}
+
// InvokeInst simplification
//
Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
@@ -2945,24 +3880,24 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
// Mark any parameters that are known to be non-null with the nonnull
// attribute. This is helpful for inlining calls to functions with null
// checks on their arguments.
- SmallVector<unsigned, 4> Indices;
+ SmallVector<unsigned, 4> ArgNos;
unsigned ArgNo = 0;
for (Value *V : CS.args()) {
if (V->getType()->isPointerTy() &&
- !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
isKnownNonNullAt(V, CS.getInstruction(), &DT))
- Indices.push_back(ArgNo + 1);
+ ArgNos.push_back(ArgNo);
ArgNo++;
}
assert(ArgNo == CS.arg_size() && "sanity check");
- if (!Indices.empty()) {
- AttributeSet AS = CS.getAttributes();
+ if (!ArgNos.empty()) {
+ AttributeList AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
- AS = AS.addAttribute(Ctx, Indices,
- Attribute::get(Ctx, Attribute::NonNull));
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
CS.setAttributes(AS);
Changed = true;
}
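  // e.g. (sketch, with a hypothetical callee @f): if %q is known non-null at
  // this call site,
  //   call void @f(i8* %p, i8* %q)
  // -->
  //   call void @f(i8* %p, i8* nonnull %q)
  // which lets null checks inside @f fold away after inlining.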
@@ -3081,7 +4016,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
return false;
Instruction *Caller = CS.getInstruction();
- const AttributeSet &CallerPAL = CS.getAttributes();
+ const AttributeList &CallerPAL = CS.getAttributes();
// Okay, this is a cast from a function to a different type. Unless doing so
// would cause a type conversion of one of our arguments, change this call to
@@ -3108,7 +4043,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
return false; // Attribute not compatible with transformed value.
}
@@ -3149,8 +4084,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
return false; // Cannot transform this parameter value.
- if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
- overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
+ if (AttrBuilder(CallerPAL.getParamAttributes(i))
+ .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
return false; // Attribute not compatible with transformed value.
if (CS.isInAllocaArgument(i))
@@ -3158,9 +4093,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// If the parameter is passed as a byval argument, then we have to have a
// sized type and the sized type has to have the same size as the old type.
- if (ParamTy != ActTy &&
- CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
- Attribute::ByVal)) {
+ if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
if (!ParamPTy || !ParamPTy->getElementType()->isSized())
return false;
@@ -3195,62 +4128,49 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
- !CallerPAL.isEmpty())
+ !CallerPAL.isEmpty()) {
// In this case we have more arguments than the new function type, but we
// won't be dropping them. Check that these extra arguments have attributes
// that are compatible with being a vararg call argument.
- for (unsigned i = CallerPAL.getNumSlots(); i; --i) {
- unsigned Index = CallerPAL.getSlotIndex(i - 1);
- if (Index <= FT->getNumParams())
- break;
-
- // Check if it has an attribute that's incompatible with varargs.
- AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1);
- if (PAttrs.hasAttribute(Index, Attribute::StructRet))
- return false;
- }
-
+ unsigned SRetIdx;
+ if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
+ SRetIdx > FT->getNumParams())
+ return false;
+ }
// Okay, we decided that this is a safe thing to do: go ahead and start
// inserting cast instructions as necessary.
- std::vector<Value*> Args;
+ SmallVector<Value *, 8> Args;
+ SmallVector<AttributeSet, 8> ArgAttrs;
Args.reserve(NumActualArgs);
- SmallVector<AttributeSet, 8> attrVec;
- attrVec.reserve(NumCommonArgs);
+ ArgAttrs.reserve(NumActualArgs);
// Get any return attributes.
- AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
// If the return value is not being used, the type may not be compatible
// with the existing attributes. Wipe out any problematic attributes.
RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
- // Add the new return attributes.
- if (RAttrs.hasAttributes())
- attrVec.push_back(AttributeSet::get(Caller->getContext(),
- AttributeSet::ReturnIndex, RAttrs));
-
AI = CS.arg_begin();
for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
Type *ParamTy = FT->getParamType(i);
- if ((*AI)->getType() == ParamTy) {
- Args.push_back(*AI);
- } else {
- Args.push_back(Builder->CreateBitOrPointerCast(*AI, ParamTy));
- }
+ Value *NewArg = *AI;
+ if ((*AI)->getType() != ParamTy)
+ NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
+ Args.push_back(NewArg);
// Add any parameter attributes.
- AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
- if (PAttrs.hasAttributes())
- attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1,
- PAttrs));
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
}
// If the function takes more arguments than the call was taking, add them
// now.
- for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i)
+ for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+ ArgAttrs.push_back(AttributeSet());
+ }
// If we are removing arguments to the function, emit an obnoxious warning.
if (FT->getNumParams() < NumActualArgs) {
@@ -3259,54 +4179,56 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// Add all of the arguments in their promoted form to the arg list.
for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
Type *PTy = getPromotedType((*AI)->getType());
+ Value *NewArg = *AI;
if (PTy != (*AI)->getType()) {
// Must promote to pass through va_arg area!
Instruction::CastOps opcode =
CastInst::getCastOpcode(*AI, false, PTy, false);
- Args.push_back(Builder->CreateCast(opcode, *AI, PTy));
- } else {
- Args.push_back(*AI);
+ NewArg = Builder.CreateCast(opcode, *AI, PTy);
}
+ Args.push_back(NewArg);
// Add any parameter attributes.
- AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
- if (PAttrs.hasAttributes())
- attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1,
- PAttrs));
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
}
}
}
AttributeSet FnAttrs = CallerPAL.getFnAttributes();
- if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex))
- attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs));
if (NewRetTy->isVoidTy())
Caller->setName(""); // Void type should not have a name.
- const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),
- attrVec);
+ assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
+ "missing argument attributes");
+ LLVMContext &Ctx = Callee->getContext();
+ AttributeList NewCallerPAL = AttributeList::get(
+ Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
- Instruction *NC;
+ CallSite NewCS;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
- NC = Builder->CreateInvoke(Callee, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles);
- NC->takeName(II);
- cast<InvokeInst>(NC)->setCallingConv(II->getCallingConv());
- cast<InvokeInst>(NC)->setAttributes(NewCallerPAL);
+ NewCS = Builder.CreateInvoke(Callee, II->getNormalDest(),
+ II->getUnwindDest(), Args, OpBundles);
} else {
- CallInst *CI = cast<CallInst>(Caller);
- NC = Builder->CreateCall(Callee, Args, OpBundles);
- NC->takeName(CI);
- cast<CallInst>(NC)->setTailCallKind(CI->getTailCallKind());
- cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
- cast<CallInst>(NC)->setAttributes(NewCallerPAL);
+ NewCS = Builder.CreateCall(Callee, Args, OpBundles);
+ cast<CallInst>(NewCS.getInstruction())
+ ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind());
}
+ NewCS->takeName(Caller);
+ NewCS.setCallingConv(CS.getCallingConv());
+ NewCS.setAttributes(NewCallerPAL);
+
+ // Preserve the weight metadata for the new call instruction. The metadata
+ // is used by SamplePGO to check the callsite's hotness.
+ uint64_t W;
+ if (Caller->extractProfTotalWeight(W))
+ NewCS->setProfWeight(W);
// Insert a cast of the return type as necessary.
+ Instruction *NC = NewCS.getInstruction();
Value *NV = NC;
if (OldRetTy != NV->getType() && !Caller->use_empty()) {
if (!NV->getType()->isVoidTy()) {
@@ -3351,7 +4273,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
Value *Callee = CS.getCalledValue();
PointerType *PTy = cast<PointerType>(Callee->getType());
FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
- const AttributeSet &Attrs = CS.getAttributes();
+ AttributeList Attrs = CS.getAttributes();
// If the call already has the 'nest' attribute somewhere then give up -
// otherwise 'nest' would occur twice after splicing in the chain.
@@ -3364,50 +4286,46 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType());
- const AttributeSet &NestAttrs = NestF->getAttributes();
+ AttributeList NestAttrs = NestF->getAttributes();
if (!NestAttrs.isEmpty()) {
- unsigned NestIdx = 1;
+ unsigned NestArgNo = 0;
Type *NestTy = nullptr;
AttributeSet NestAttr;
// Look for a parameter marked with the 'nest' attribute.
for (FunctionType::param_iterator I = NestFTy->param_begin(),
- E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
- if (NestAttrs.hasAttribute(NestIdx, Attribute::Nest)) {
+ E = NestFTy->param_end();
+ I != E; ++NestArgNo, ++I) {
+ AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
+ if (AS.hasAttribute(Attribute::Nest)) {
// Record the parameter type and any other attributes.
NestTy = *I;
- NestAttr = NestAttrs.getParamAttributes(NestIdx);
+ NestAttr = AS;
break;
}
+ }
if (NestTy) {
Instruction *Caller = CS.getInstruction();
std::vector<Value*> NewArgs;
+ std::vector<AttributeSet> NewArgAttrs;
NewArgs.reserve(CS.arg_size() + 1);
-
- SmallVector<AttributeSet, 8> NewAttrs;
- NewAttrs.reserve(Attrs.getNumSlots() + 1);
+ NewArgAttrs.reserve(CS.arg_size());
// Insert the nest argument into the call argument list, which may
// mean appending it. Likewise for attributes.
- // Add any result attributes.
- if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
- NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
- Attrs.getRetAttributes()));
-
{
- unsigned Idx = 1;
+ unsigned ArgNo = 0;
CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
do {
- if (Idx == NestIdx) {
+ if (ArgNo == NestArgNo) {
// Add the chain argument and attributes.
Value *NestVal = Tramp->getArgOperand(2);
if (NestVal->getType() != NestTy)
- NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
+ NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
NewArgs.push_back(NestVal);
- NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
- NestAttr));
+ NewArgAttrs.push_back(NestAttr);
}
if (I == E)
@@ -3415,23 +4333,13 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Add the original argument and attributes.
NewArgs.push_back(*I);
- AttributeSet Attr = Attrs.getParamAttributes(Idx);
- if (Attr.hasAttributes(Idx)) {
- AttrBuilder B(Attr, Idx);
- NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
- Idx + (Idx >= NestIdx), B));
- }
+ NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
- ++Idx;
+ ++ArgNo;
++I;
} while (true);
}
- // Add any function attributes.
- if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
- NewAttrs.push_back(AttributeSet::get(FTy->getContext(),
- Attrs.getFnAttributes()));
-
// The trampoline may have been bitcast to a bogus type (FTy).
// Handle this by synthesizing a new function type, equal to FTy
// with the chain parameter inserted.
@@ -3442,12 +4350,12 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Insert the chain's type into the list of parameter types, which may
// mean appending it.
{
- unsigned Idx = 1;
+ unsigned ArgNo = 0;
FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end();
do {
- if (Idx == NestIdx)
+ if (ArgNo == NestArgNo)
// Add the chain's type.
NewTypes.push_back(NestTy);
@@ -3457,7 +4365,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Add the original type.
NewTypes.push_back(*I);
- ++Idx;
+ ++ArgNo;
++I;
} while (true);
}
@@ -3470,8 +4378,9 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
NestF->getType() == PointerType::getUnqual(NewFTy) ?
NestF : ConstantExpr::getBitCast(NestF,
PointerType::getUnqual(NewFTy));
- const AttributeSet &NewPAL =
- AttributeSet::get(FTy->getContext(), NewAttrs);
+ AttributeList NewPAL =
+ AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs);
SmallVector<OperandBundleDef, 1> OpBundles;
CS.getOperandBundlesAsDefs(OpBundles);
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index e74b590..dfdfd3e 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -14,9 +14,10 @@
#include "InstCombineInternal.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace PatternMatch;
@@ -83,7 +84,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
PointerType *PTy = cast<PointerType>(CI.getType());
- BuilderTy AllocaBuilder(*Builder);
+ BuilderTy AllocaBuilder(Builder);
AllocaBuilder.SetInsertPoint(&AI);
// Get the type really allocated and the type casted to.
@@ -274,12 +275,12 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
return NV;
// If we are casting a PHI, then fold the cast into the PHI.
- if (isa<PHINode>(Src)) {
+ if (auto *PN = dyn_cast<PHINode>(Src)) {
// Don't do this if it would create a PHI node with an illegal type from a
// legal type.
if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
- ShouldChangeType(CI.getType(), Src->getType()))
- if (Instruction *NV = FoldOpIntoPhi(CI))
+ shouldChangeType(CI.getType(), Src->getType()))
+ if (Instruction *NV = foldOpIntoPhi(CI, PN))
return NV;
}
@@ -405,8 +406,7 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
/// --->
/// extractelement <4 x i32> %X, 1
-static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC,
- const DataLayout &DL) {
+static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
Value *TruncOp = Trunc.getOperand(0);
Type *DestType = Trunc.getType();
if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
@@ -433,21 +433,21 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC,
unsigned NumVecElts = VecWidth / DestWidth;
if (VecType->getElementType() != DestType) {
VecType = VectorType::get(DestType, NumVecElts);
- VecInput = IC.Builder->CreateBitCast(VecInput, VecType, "bc");
+ VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
}
unsigned Elt = ShiftAmount / DestWidth;
- if (DL.isBigEndian())
+ if (IC.getDataLayout().isBigEndian())
Elt = NumVecElts - 1 - Elt;
- return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt));
+ return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt));
}
/// Try to narrow the width of bitwise logic instructions with constants.
Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
Type *SrcTy = Trunc.getSrcTy();
Type *DestTy = Trunc.getType();
- if (isa<IntegerType>(SrcTy) && !ShouldChangeType(SrcTy, DestTy))
+ if (isa<IntegerType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
return nullptr;
BinaryOperator *LogicOp;
@@ -459,10 +459,60 @@ Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
// trunc (logic X, C) --> logic (trunc X, C')
Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
- Value *NarrowOp0 = Builder->CreateTrunc(LogicOp->getOperand(0), DestTy);
+ Value *NarrowOp0 = Builder.CreateTrunc(LogicOp->getOperand(0), DestTy);
return BinaryOperator::Create(LogicOp->getOpcode(), NarrowOp0, NarrowC);
}
+/// Try to narrow the width of a splat shuffle. This could be generalized to any
+/// shuffle with a constant operand, but we limit the transform to avoid
+/// creating a shuffle type that targets may not be able to lower effectively.
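+/// A representative instance (sketch):
+///   %s = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+///   %t = trunc <4 x i32> %s to <4 x i16>
+/// -->
+///   %nt = trunc <4 x i32> %x to <4 x i16>
+///   %s = shufflevector <4 x i16> %nt, <4 x i16> undef, <4 x i32> zeroinitializer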
+static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
+ if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
+ Shuf->getMask()->getSplatValue() &&
+ Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
+ Constant *NarrowUndef = UndefValue::get(Trunc.getType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getMask());
+ }
+
+ return nullptr;
+}
+
+/// Try to narrow the width of an insert element. This could be generalized for
+/// any vector constant, but we limit the transform to insertion into undef to
+/// avoid potential backend problems from unsupported insertion widths. This
+/// could also be extended to handle the case of inserting a scalar constant
+/// into a vector variable.
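+/// A representative instance (sketch):
+///   %i = insertelement <4 x i32> undef, i32 %x, i32 0
+///   %t = trunc <4 x i32> %i to <4 x i16>
+/// -->
+///   %nx = trunc i32 %x to i16
+///   %t = insertelement <4 x i16> undef, i16 %nx, i32 0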
+static Instruction *shrinkInsertElt(CastInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::CastOps Opcode = Trunc.getOpcode();
+ assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
+ "Unexpected instruction for shrinking");
+
+ auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
+ if (!InsElt || !InsElt->hasOneUse())
+ return nullptr;
+
+ Type *DestTy = Trunc.getType();
+ Type *DestScalarTy = DestTy->getScalarType();
+ Value *VecOp = InsElt->getOperand(0);
+ Value *ScalarOp = InsElt->getOperand(1);
+ Value *Index = InsElt->getOperand(2);
+
+ if (isa<UndefValue>(VecOp)) {
+ // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+ // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+ UndefValue *NarrowUndef = UndefValue::get(DestTy);
+ Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
+ return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
@@ -488,7 +538,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// type. Only do this if the dest type is a simple type, don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
- if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateTruncated(Src, DestTy, *this, &CI)) {
// If this cast is a truncate, evaluating in a different type always
@@ -503,11 +553,14 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
if (DestTy->getScalarSizeInBits() == 1) {
Constant *One = ConstantInt::get(SrcTy, 1);
- Src = Builder->CreateAnd(Src, One);
+ Src = Builder.CreateAnd(Src, One);
Value *Zero = Constant::getNullValue(Src->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
+ // FIXME: Maybe combine the next two transforms to handle the no cast case
+ // more efficiently. Support vector types. Clean up code by using m_OneUse.
+
// Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
Value *A = nullptr; ConstantInt *Cst = nullptr;
if (Src->hasOneUse() &&
@@ -526,36 +579,54 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// Since we're doing an lshr and a zero extend, and know that the shift
// amount is smaller than ASize, it is always safe to do the shift in A's
// type, then zero extend or truncate to the result.
- Value *Shift = Builder->CreateLShr(A, Cst->getZExtValue());
+ Value *Shift = Builder.CreateLShr(A, Cst->getZExtValue());
Shift->takeName(Src);
return CastInst::CreateIntegerCast(Shift, DestTy, false);
}
+ // FIXME: We should canonicalize to zext/trunc and remove this transform.
// Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type
// conversion.
// It works because bits coming from sign extension have the same value as
// the sign bit of the original value; performing ashr instead of lshr
// generates bits of the same value as the sign bit.
if (Src->hasOneUse() &&
- match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) &&
- cast<Instruction>(Src)->getOperand(0)->hasOneUse()) {
+ match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
+ Value *SExt = cast<Instruction>(Src)->getOperand(0);
+ const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
+ const unsigned CISize = CI.getType()->getPrimitiveSizeInBits();
+ const unsigned MaxAmt = SExtSize - std::max(CISize, ASize);
+ unsigned ShiftAmt = Cst->getZExtValue();
+
// This optimization can only be performed when zero bits generated by
// the original lshr aren't pulled into the value after truncation, so we
- // can only shift by values smaller than the size of destination type (in
- // bits).
- if (Cst->getValue().ult(ASize)) {
- Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue());
- Shift->takeName(Src);
- return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
+ // can only shift by values no larger than the number of extension bits.
+ // FIXME: Instead of bailing when the shift is too large, use and to clear
+ // the extra bits.
+ if (ShiftAmt <= MaxAmt) {
+ if (CISize == ASize)
+ return BinaryOperator::CreateAShr(A, ConstantInt::get(CI.getType(),
+ std::min(ShiftAmt, ASize - 1)));
+ if (SExt->hasOneUse()) {
+ Value *Shift = Builder.CreateAShr(A, std::min(ShiftAmt, ASize - 1));
+ Shift->takeName(Src);
+ return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
+ }
}
}
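  // A worked instance of the fold above: for i8 %x,
  //   %s = sext i8 %x to i32
  //   %l = lshr i32 %s, 8
  //   %t = trunc i32 %l to i8
  // --> ashr i8 %x, 7 (the shift amount is clamped to ASize - 1).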
if (Instruction *I = shrinkBitwiseLogic(CI))
return I;
+ if (Instruction *I = shrinkSplatShuffle(CI, Builder))
+ return I;
+
+ if (Instruction *I = shrinkInsertElt(CI, Builder))
+ return I;
+
if (Src->hasOneUse() && isa<IntegerType>(SrcTy) &&
- ShouldChangeType(SrcTy, DestTy)) {
+ shouldChangeType(SrcTy, DestTy)) {
// Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
// dest type is native and cst < dest size.
if (match(Src, m_Shl(m_Value(A), m_ConstantInt(Cst))) &&
@@ -564,7 +635,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// FoldShiftByConstant and is the extend in reg pattern.
const unsigned DestSize = DestTy->getScalarSizeInBits();
if (Cst->getValue().ult(DestSize)) {
- Value *NewTrunc = Builder->CreateTrunc(A, DestTy, A->getName() + ".tr");
+ Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
return BinaryOperator::Create(
Instruction::Shl, NewTrunc,
@@ -573,7 +644,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
}
}
- if (Instruction *I = foldVecTruncToExtElt(CI, *this, DL))
+ if (Instruction *I = foldVecTruncToExtElt(CI, *this))
return I;
return nullptr;
@@ -589,20 +660,20 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
// zext (x <s 0) to i32 --> x>>u31 true if signbit set.
// zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
- if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
+ if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV.isNullValue()) ||
(ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV.isAllOnesValue())) {
if (!DoTransform) return ICI;
Value *In = ICI->getOperand(0);
Value *Sh = ConstantInt::get(In->getType(),
In->getType()->getScalarSizeInBits() - 1);
- In = Builder->CreateLShr(In, Sh, In->getName() + ".lobit");
+ In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit");
if (In->getType() != CI.getType())
- In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/);
+ In = Builder.CreateIntCast(In, CI.getType(), false /*ZExt*/);
if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder->CreateXor(In, One, In->getName() + ".not");
+ In = Builder.CreateXor(In, One, In->getName() + ".not");
}
return replaceInstUsesWith(CI, In);
@@ -616,20 +687,18 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
// zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
// zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
// zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
- if ((Op1CV == 0 || Op1CV.isPowerOf2()) &&
+ if ((Op1CV.isNullValue() || Op1CV.isPowerOf2()) &&
// This only works for EQ and NE
ICI->isEquality()) {
// If Op1C is some other power of two, convert:
- uint32_t BitWidth = Op1C->getType()->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(ICI->getOperand(0), KnownZero, KnownOne, 0, &CI);
+ KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI);
- APInt KnownZeroMask(~KnownZero);
+ APInt KnownZeroMask(~Known.Zero);
if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
if (!DoTransform) return ICI;
bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
- if (Op1CV != 0 && (Op1CV != KnownZeroMask)) {
+ if (!Op1CV.isNullValue() && (Op1CV != KnownZeroMask)) {
// (X&4) == 2 --> false
// (X&4) != 2 --> true
Constant *Res = ConstantInt::get(Type::getInt1Ty(CI.getContext()),
@@ -643,19 +712,19 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
if (ShAmt) {
// Perform a logical shr by shiftamt.
// Insert the shift to put the result in the low bit.
- In = Builder->CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
- In->getName() + ".lobit");
+ In = Builder.CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
+ In->getName() + ".lobit");
}
- if ((Op1CV != 0) == isNE) { // Toggle the low bit.
+ if (!Op1CV.isNullValue() == isNE) { // Toggle the low bit.
Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder->CreateXor(In, One);
+ In = Builder.CreateXor(In, One);
}
if (CI.getType() == In->getType())
return replaceInstUsesWith(CI, In);
- Value *IntCast = Builder->CreateIntCast(In, CI.getType(), false);
+ Value *IntCast = Builder.CreateIntCast(In, CI.getType(), false);
return replaceInstUsesWith(CI, IntCast);
}
}
@@ -666,34 +735,31 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
// may lead to additional simplifications.
if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) {
if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) {
- uint32_t BitWidth = ITy->getBitWidth();
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
- APInt KnownZeroLHS(BitWidth, 0), KnownOneLHS(BitWidth, 0);
- APInt KnownZeroRHS(BitWidth, 0), KnownOneRHS(BitWidth, 0);
- computeKnownBits(LHS, KnownZeroLHS, KnownOneLHS, 0, &CI);
- computeKnownBits(RHS, KnownZeroRHS, KnownOneRHS, 0, &CI);
+ KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI);
+ KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI);
- if (KnownZeroLHS == KnownZeroRHS && KnownOneLHS == KnownOneRHS) {
- APInt KnownBits = KnownZeroLHS | KnownOneLHS;
+ if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) {
+ APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
APInt UnknownBit = ~KnownBits;
if (UnknownBit.countPopulation() == 1) {
if (!DoTransform) return ICI;
- Value *Result = Builder->CreateXor(LHS, RHS);
+ Value *Result = Builder.CreateXor(LHS, RHS);
// Mask off any bits that are set and won't be shifted away.
- if (KnownOneLHS.uge(UnknownBit))
- Result = Builder->CreateAnd(Result,
+ if (KnownLHS.One.uge(UnknownBit))
+ Result = Builder.CreateAnd(Result,
ConstantInt::get(ITy, UnknownBit));
// Shift the bit we're testing down to the lsb.
- Result = Builder->CreateLShr(
+ Result = Builder.CreateLShr(
Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
- Result = Builder->CreateXor(Result, ConstantInt::get(ITy, 1));
+ Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1));
Result->takeName(ICI);
return replaceInstUsesWith(CI, Result);
}
@@ -838,11 +904,6 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
if (Instruction *Result = commonCastTransforms(CI))
return Result;
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(CI))
- return &CI;
-
Value *Src = CI.getOperand(0);
Type *SrcTy = Src->getType(), *DestTy = CI.getType();
@@ -851,10 +912,10 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
// expression tree to something weird like i93 unless the source is also
// strange.
unsigned BitsToClear;
- if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
- assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
- "Unreasonable BitsToClear");
+ assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
+ "Can't clear more bits than in SrcTy");
// Okay, we can transform this! Insert the new expression now.
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -898,7 +959,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
if (SrcSize < DstSize) {
APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
- Value *And = Builder->CreateAnd(A, AndConst, CSrc->getName()+".mask");
+ Value *And = Builder.CreateAnd(A, AndConst, CSrc->getName() + ".mask");
return new ZExtInst(And, CI.getType());
}
@@ -908,7 +969,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
AndValue));
}
if (SrcSize > DstSize) {
- Value *Trunc = Builder->CreateTrunc(A, CI.getType());
+ Value *Trunc = Builder.CreateTrunc(A, CI.getType());
APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
return BinaryOperator::CreateAnd(Trunc,
ConstantInt::get(Trunc->getType(),
@@ -930,8 +991,8 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
(transformZExtICmp(LHS, CI, false) ||
transformZExtICmp(RHS, CI, false))) {
// zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
- Value *LCast = Builder->CreateZExt(LHS, CI.getType(), LHS->getName());
- Value *RCast = Builder->CreateZExt(RHS, CI.getType(), RHS->getName());
+ Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName());
+ Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName());
BinaryOperator *Or = BinaryOperator::Create(Instruction::Or, LCast, RCast);
// Perform the elimination.
@@ -958,7 +1019,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) &&
X->getType() == CI.getType()) {
Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
- return BinaryOperator::CreateXor(Builder->CreateAnd(X, ZC), ZC);
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC);
}
return nullptr;
@@ -981,12 +1042,12 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
Value *Sh = ConstantInt::get(Op0->getType(),
Op0->getType()->getScalarSizeInBits()-1);
- Value *In = Builder->CreateAShr(Op0, Sh, Op0->getName()+".lobit");
+ Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
if (In->getType() != CI.getType())
- In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/);
+ In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
if (Pred == ICmpInst::ICMP_SGT)
- In = Builder->CreateNot(In, In->getName()+".not");
+ In = Builder.CreateNot(In, In->getName() + ".not");
return replaceInstUsesWith(CI, In);
}
}
@@ -997,11 +1058,9 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
// the icmp and sext into bitwise/integer operations.
if (ICI->hasOneUse() &&
ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
- unsigned BitWidth = Op1C->getType()->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(Op0, KnownZero, KnownOne, 0, &CI);
+ KnownBits Known = computeKnownBits(Op0, 0, &CI);
- APInt KnownZeroMask(~KnownZero);
+ APInt KnownZeroMask(~Known.Zero);
if (KnownZeroMask.isPowerOf2()) {
Value *In = ICI->getOperand(0);
@@ -1019,26 +1078,26 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
// Perform a right shift to place the desired bit in the LSB.
if (ShiftAmt)
- In = Builder->CreateLShr(In,
- ConstantInt::get(In->getType(), ShiftAmt));
+ In = Builder.CreateLShr(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
// At this point "In" is either 1 or 0. Subtract 1 to turn
// {1, 0} -> {0, -1}.
- In = Builder->CreateAdd(In,
- ConstantInt::getAllOnesValue(In->getType()),
- "sext");
+ In = Builder.CreateAdd(In,
+ ConstantInt::getAllOnesValue(In->getType()),
+ "sext");
} else {
// sext ((x & 2^n) != 0) -> (x << bitwidth-n) a>> bitwidth-1
// sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
// Perform a left shift to place the desired bit in the MSB.
if (ShiftAmt)
- In = Builder->CreateShl(In,
- ConstantInt::get(In->getType(), ShiftAmt));
+ In = Builder.CreateShl(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
// Distribute the bit over the whole bit width.
- In = Builder->CreateAShr(In, ConstantInt::get(In->getType(),
- BitWidth - 1), "sext");
+ In = Builder.CreateAShr(In, ConstantInt::get(In->getType(),
+ KnownZeroMask.getBitWidth() - 1), "sext");
}
if (CI.getType() == In->getType())
@@ -1124,20 +1183,14 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
if (Instruction *I = commonCastTransforms(CI))
return I;
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(CI))
- return &CI;
-
Value *Src = CI.getOperand(0);
Type *SrcTy = Src->getType(), *DestTy = CI.getType();
// If we know that the value being extended is positive, we can use a zext
// instead.
- bool KnownZero, KnownOne;
- ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI);
- if (KnownZero) {
- Value *ZExt = Builder->CreateZExt(Src, DestTy);
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative()) {
+ Value *ZExt = Builder.CreateZExt(Src, DestTy);
return replaceInstUsesWith(CI, ZExt);
}
@@ -1145,7 +1198,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// type. Only do this if the dest type is a simple type, don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
- if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
canEvaluateSExtd(Src, DestTy)) {
// Okay, we can transform this! Insert the new expression now.
DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1163,22 +1216,20 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// We need to emit a shl + ashr to do the sign extend.
Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
- return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"),
+ return BinaryOperator::CreateAShr(Builder.CreateShl(Res, ShAmt, "sext"),
ShAmt);
}
- // If this input is a trunc from our destination, then turn sext(trunc(x))
+ // If the input is a trunc from the destination type, then turn sext(trunc(x))
// into shifts.
- if (TruncInst *TI = dyn_cast<TruncInst>(Src))
- if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) {
- uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
- uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
- // We need to emit a shl + ashr to do the sign extend.
- Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
- Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext");
- return BinaryOperator::CreateAShr(Res, ShAmt);
- }
+ Value *X;
+ if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
+ // sext(trunc(X)) --> ashr(shl(X, C), C)
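+ //   e.g. sext(trunc i32 %x to i8) to i32 -->
+ //     %shl = shl i32 %x, 24
+ //     %res = ashr i32 %shl, 24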
+ unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+ unsigned DestBitSize = DestTy->getScalarSizeInBits();
+ Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShAmt), ShAmt);
+ }
if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
return transformSExtICmp(ICI, CI);
@@ -1206,7 +1257,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
- A = Builder->CreateShl(A, ShAmtV, CI.getName());
+ A = Builder.CreateShl(A, ShAmtV, CI.getName());
return BinaryOperator::CreateAShr(A, ShAmtV);
}
@@ -1225,17 +1276,15 @@ static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
return nullptr;
}
-/// If this is a floating-point extension instruction, look
-/// through it until we get the source value.
+/// Look through floating-point extensions until we get the source value.
static Value *lookThroughFPExtensions(Value *V) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (I->getOpcode() == Instruction::FPExt)
- return lookThroughFPExtensions(I->getOperand(0));
+ while (auto *FPExt = dyn_cast<FPExtInst>(V))
+ V = FPExt->getOperand(0);
// If this value is a constant, return the constant in the smallest FP type
// that can accurately represent it. This allows us to turn
// (float)((double)X+2.0) into x+2.0f.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+ if (auto *CFP = dyn_cast<ConstantFP>(V)) {
if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
return V; // No constant folding of this.
// See if the value can be truncated to half and then reextended.
@@ -1297,9 +1346,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// case of interest here is (float)((double)float + float)).
if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType());
+ LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType());
+ RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
Instruction *RI =
BinaryOperator::Create(OpI->getOpcode(), LHSOrig, RHSOrig);
RI->copyFastMathFlags(OpI);
@@ -1314,9 +1363,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// in the destination format if it can represent both sources.
if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType());
+ LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType());
+ RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
Instruction *RI =
BinaryOperator::CreateFMul(LHSOrig, RHSOrig);
RI->copyFastMathFlags(OpI);
@@ -1332,9 +1381,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// TODO: Tighten bound via rigorous analysis of the unbalanced case.
if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
if (LHSOrig->getType() != CI.getType())
- LHSOrig = Builder->CreateFPExt(LHSOrig, CI.getType());
+ LHSOrig = Builder.CreateFPExt(LHSOrig, CI.getType());
if (RHSOrig->getType() != CI.getType())
- RHSOrig = Builder->CreateFPExt(RHSOrig, CI.getType());
+ RHSOrig = Builder.CreateFPExt(RHSOrig, CI.getType());
Instruction *RI =
BinaryOperator::CreateFDiv(LHSOrig, RHSOrig);
RI->copyFastMathFlags(OpI);
@@ -1349,11 +1398,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
if (SrcWidth == OpWidth)
break;
if (LHSWidth < SrcWidth)
- LHSOrig = Builder->CreateFPExt(LHSOrig, RHSOrig->getType());
+ LHSOrig = Builder.CreateFPExt(LHSOrig, RHSOrig->getType());
else if (RHSWidth <= SrcWidth)
- RHSOrig = Builder->CreateFPExt(RHSOrig, LHSOrig->getType());
+ RHSOrig = Builder.CreateFPExt(RHSOrig, LHSOrig->getType());
if (LHSOrig != OpI->getOperand(0) || RHSOrig != OpI->getOperand(1)) {
- Value *ExactResult = Builder->CreateFRem(LHSOrig, RHSOrig);
+ Value *ExactResult = Builder.CreateFRem(LHSOrig, RHSOrig);
if (Instruction *RI = dyn_cast<Instruction>(ExactResult))
RI->copyFastMathFlags(OpI);
return CastInst::CreateFPCast(ExactResult, CI.getType());
@@ -1362,8 +1411,8 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
// (fptrunc (fneg x)) -> (fneg (fptrunc x))
if (BinaryOperator::isFNeg(OpI)) {
- Value *InnerTrunc = Builder->CreateFPTrunc(OpI->getOperand(1),
- CI.getType());
+ Value *InnerTrunc = Builder.CreateFPTrunc(OpI->getOperand(1),
+ CI.getType());
Instruction *RI = BinaryOperator::CreateFNeg(InnerTrunc);
RI->copyFastMathFlags(OpI);
return RI;
@@ -1382,34 +1431,57 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
(isa<ConstantFP>(SI->getOperand(1)) ||
isa<ConstantFP>(SI->getOperand(2))) &&
matchSelectPattern(SI, LHS, RHS).Flavor == SPF_UNKNOWN) {
- Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1),
- CI.getType());
- Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2),
- CI.getType());
+ Value *LHSTrunc = Builder.CreateFPTrunc(SI->getOperand(1), CI.getType());
+ Value *RHSTrunc = Builder.CreateFPTrunc(SI->getOperand(2), CI.getType());
return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc);
}
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
if (II) {
switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::fabs: {
- // (fptrunc (fabs x)) -> (fabs (fptrunc x))
- Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),
- CI.getType());
- Type *IntrinsicType[] = { CI.getType() };
- Function *Overload = Intrinsic::getDeclaration(
- CI.getModule(), II->getIntrinsicID(), IntrinsicType);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
-
- Value *Args[] = { InnerTrunc };
- return CallInst::Create(Overload, Args, OpBundles, II->getName());
+ default: break;
+ case Intrinsic::fabs:
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::nearbyint:
+ case Intrinsic::trunc: {
+ Value *Src = II->getArgOperand(0);
+ if (!Src->hasOneUse())
+ break;
+
+ // Except for fabs, this transformation requires the input of the unary FP
+ // operation to be itself an fpext from the type to which we're
+ // truncating.
+ if (II->getIntrinsicID() != Intrinsic::fabs) {
+ FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
+ if (!FPExtSrc || FPExtSrc->getOperand(0)->getType() != CI.getType())
+ break;
}
+
+ // Do unary FP operation on smaller type.
+ // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+ Value *InnerTrunc = Builder.CreateFPTrunc(Src, CI.getType());
+ Type *IntrinsicType[] = { CI.getType() };
+ Function *Overload = Intrinsic::getDeclaration(
+ CI.getModule(), II->getIntrinsicID(), IntrinsicType);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+
+ Value *Args[] = { InnerTrunc };
+ CallInst *NewCI = CallInst::Create(Overload, Args,
+ OpBundles, II->getName());
+ NewCI->copyFastMathFlags(II);
+ return NewCI;
+ }
}
}
+ if (Instruction *I = shrinkInsertElt(CI, Builder))
+ return I;
+
return nullptr;
}
@@ -1502,7 +1574,7 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
- Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+ Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty);
return new IntToPtrInst(P, CI.getType());
}
@@ -1524,7 +1596,7 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
// GEP into CI would undo canonicalizing addrspacecast with different
// pointer types, causing infinite loops.
(!isa<AddrSpaceCastInst>(CI) ||
- GEP->getType() == GEP->getPointerOperand()->getType())) {
+ GEP->getType() == GEP->getPointerOperandType())) {
// Changing the cast operand is usually not a good idea but it is safe
// here because the pointer operand is being replaced with another
// pointer operand so the opcode doesn't need to change.
@@ -1552,7 +1624,7 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
if (Ty->isVectorTy()) // Handle vectors of pointers.
PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements());
- Value *P = Builder->CreatePtrToInt(CI.getOperand(0), PtrTy);
+ Value *P = Builder.CreatePtrToInt(CI.getOperand(0), PtrTy);
return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
}
@@ -1578,7 +1650,7 @@ static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy,
return nullptr;
SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
- InVal = IC.Builder->CreateBitCast(InVal, SrcTy);
+ InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
}
// Now that the element types match, get the shuffle mask and RHS of the
@@ -1758,8 +1830,8 @@ static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
if (!Elements[i]) continue; // Unset element.
- Result = IC.Builder->CreateInsertElement(Result, Elements[i],
- IC.Builder->getInt32(i));
+ Result = IC.Builder.CreateInsertElement(Result, Elements[i],
+ IC.Builder.getInt32(i));
}
return Result;
@@ -1770,8 +1842,7 @@ static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
/// vectors better than bitcasts of scalars because vector registers are
/// usually not type-specific like scalar integer or scalar floating-point.
static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
- InstCombiner &IC,
- const DataLayout &DL) {
+ InstCombiner &IC) {
// TODO: Create and use a pattern matcher for ExtractElementInst.
auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
if (!ExtElt || !ExtElt->hasOneUse())
@@ -1785,8 +1856,8 @@ static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements();
auto *NewVecType = VectorType::get(DestType, NumElts);
- auto *NewBC = IC.Builder->CreateBitCast(ExtElt->getVectorOperand(),
- NewVecType, "bc");
+ auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
+ NewVecType, "bc");
return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
}
@@ -1795,7 +1866,7 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
InstCombiner::BuilderTy &Builder) {
Type *DestTy = BitCast.getType();
BinaryOperator *BO;
- if (!DestTy->getScalarType()->isIntegerTy() ||
+ if (!DestTy->isIntOrIntVectorTy() ||
!match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
!BO->isBitwiseLogicOp())
return nullptr;
@@ -1821,6 +1892,18 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);
}
+ // Canonicalize vector bitcasts to come before vector bitwise logic with a
+ // constant. This eases recognition of special constants for later ops.
+ // Example:
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ Constant *C;
+ if (match(BO->getOperand(1), m_Constant(C))) {
+ // bitcast (logic X, C) --> logic (bitcast X, C')
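+ // e.g. (sketch):
+ //   %a = and <4 x i8> %x, <i8 15, i8 15, i8 15, i8 15>
+ //   %b = bitcast <4 x i8> %a to i32
+ // -->
+ //   %b = bitcast <4 x i8> %x to i32
+ //   %a = and i32 %b, 252645135 ; 0x0F0F0F0F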
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ Value *CastedC = ConstantExpr::getBitCast(C, DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC);
+ }
+
return nullptr;
}
@@ -1946,8 +2029,8 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
// For each old PHI node, create a corresponding new PHI node with a type A.
SmallDenseMap<PHINode *, PHINode *> NewPNodes;
for (auto *OldPN : OldPhiNodes) {
- Builder->SetInsertPoint(OldPN);
- PHINode *NewPN = Builder->CreatePHI(DestTy, OldPN->getNumOperands());
+ Builder.SetInsertPoint(OldPN);
+ PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
NewPNodes[OldPN] = NewPN;
}
@@ -1960,8 +2043,8 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
if (auto *C = dyn_cast<Constant>(V)) {
NewV = ConstantExpr::getBitCast(C, DestTy);
} else if (auto *LI = dyn_cast<LoadInst>(V)) {
- Builder->SetInsertPoint(LI->getNextNode());
- NewV = Builder->CreateBitCast(LI, DestTy);
+ Builder.SetInsertPoint(LI->getNextNode());
+ NewV = Builder.CreateBitCast(LI, DestTy);
Worklist.Add(LI);
} else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
NewV = BCI->getOperand(0);
@@ -1977,9 +2060,9 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
for (User *U : PN->users()) {
auto *SI = dyn_cast<StoreInst>(U);
if (SI && SI->isSimple() && SI->getOperand(0) == PN) {
- Builder->SetInsertPoint(SI);
+ Builder.SetInsertPoint(SI);
auto *NewBC =
- cast<BitCastInst>(Builder->CreateBitCast(NewPNodes[PN], SrcTy));
+ cast<BitCastInst>(Builder.CreateBitCast(NewPNodes[PN], SrcTy));
SI->setOperand(0, NewBC);
Worklist.Add(SI);
assert(hasStoreUsersOnly(*NewBC));
@@ -2034,14 +2117,14 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// If we found a path from the src to dest, create the getelementptr now.
if (SrcElTy == DstElTy) {
- SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder->getInt32(0));
+ SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
return GetElementPtrInst::CreateInBounds(Src, Idxs);
}
}
if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {
- Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType());
+ Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
// FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast)
@@ -2074,7 +2157,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// scalar-scalar cast.
if (!DestTy->isVectorTy()) {
Value *Elem =
- Builder->CreateExtractElement(Src,
+ Builder.CreateExtractElement(Src,
Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
return CastInst::Create(Instruction::BitCast, Elem, DestTy);
}
@@ -2103,8 +2186,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
Tmp->getOperand(0)->getType() == DestTy) ||
((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) &&
Tmp->getOperand(0)->getType() == DestTy)) {
- Value *LHS = Builder->CreateBitCast(SVI->getOperand(0), DestTy);
- Value *RHS = Builder->CreateBitCast(SVI->getOperand(1), DestTy);
+ Value *LHS = Builder.CreateBitCast(SVI->getOperand(0), DestTy);
+ Value *RHS = Builder.CreateBitCast(SVI->getOperand(1), DestTy);
// Return a new shuffle vector. Use the same element IDs, as we
// know the vector types match #elts.
return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
@@ -2117,13 +2200,13 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
if (Instruction *I = optimizeBitCastFromPhi(CI, PN))
return I;
- if (Instruction *I = canonicalizeBitCastExtElt(CI, *this, DL))
+ if (Instruction *I = canonicalizeBitCastExtElt(CI, *this))
return I;
- if (Instruction *I = foldBitCastBitwiseLogic(CI, *Builder))
+ if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder))
return I;
- if (Instruction *I = foldBitCastSelect(CI, *Builder))
+ if (Instruction *I = foldBitCastSelect(CI, Builder))
return I;
if (SrcTy->isPointerTy())
@@ -2147,7 +2230,7 @@ Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
MidTy = VectorType::get(MidTy, VT->getNumElements());
}
- Value *NewBitCast = Builder->CreateBitCast(Src, MidTy);
+ Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
return new AddrSpaceCastInst(NewBitCast, CI.getType());
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 428f94b..a8faaec 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -26,6 +26,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace PatternMatch;
@@ -111,10 +112,10 @@ static bool subWithOverflow(Constant *&Result, Constant *In1,
/// Given an icmp instruction, return true if any use of this comparison is a
/// branch.
-static bool isBranchOnSignBitCheck(ICmpInst &I, bool isSignBit) {
+static bool hasBranchUse(ICmpInst &I) {
for (auto *U : I.users())
if (isa<BranchInst>(U))
- return isSignBit;
+ return true;
return false;
}
@@ -126,7 +127,7 @@ static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
switch (Pred) {
case ICmpInst::ICMP_SLT: // True if LHS s< 0
TrueIfSigned = true;
- return RHS == 0;
+ return RHS.isNullValue();
case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1
TrueIfSigned = true;
return RHS.isAllOnesValue();
@@ -140,7 +141,7 @@ static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS,
case ICmpInst::ICMP_UGE:
// True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
TrueIfSigned = true;
- return RHS.isSignBit();
+ return RHS.isSignMask();
default:
return false;
}
@@ -154,10 +155,10 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
if (!ICmpInst::isSigned(Pred))
return false;
- if (C == 0)
+ if (C.isNullValue())
return ICmpInst::isRelational(Pred);
- if (C == 1) {
+ if (C.isOneValue()) {
if (Pred == ICmpInst::ICMP_SLT) {
Pred = ICmpInst::ICMP_SLE;
return true;
@@ -175,42 +176,40 @@ static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
/// Given a signed integer type and a set of known zero and one bits, compute
/// the maximum and minimum values that could have the specified known zero and
/// known one bits, returning them in Min/Max.
-static void computeSignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
- const APInt &KnownOne,
+/// TODO: Move to method on KnownBits struct?
+static void computeSignedMinMaxValuesFromKnownBits(const KnownBits &Known,
APInt &Min, APInt &Max) {
- assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
- KnownZero.getBitWidth() == Min.getBitWidth() &&
- KnownZero.getBitWidth() == Max.getBitWidth() &&
+ assert(Known.getBitWidth() == Min.getBitWidth() &&
+ Known.getBitWidth() == Max.getBitWidth() &&
"KnownZero, KnownOne and Min, Max must have equal bitwidth.");
- APInt UnknownBits = ~(KnownZero|KnownOne);
+ APInt UnknownBits = ~(Known.Zero|Known.One);
// The minimum value is when all unknown bits are zeros, EXCEPT for the sign
// bit if it is unknown.
- Min = KnownOne;
- Max = KnownOne|UnknownBits;
+ Min = Known.One;
+ Max = Known.One|UnknownBits;
if (UnknownBits.isNegative()) { // Sign bit is unknown
- Min.setBit(Min.getBitWidth()-1);
- Max.clearBit(Max.getBitWidth()-1);
+ Min.setSignBit();
+ Max.clearSignBit();
}
}
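// A worked instance of the signed case above: with 4-bit Known.Zero = 0100
// and Known.One = 0001, the unknown bits are 1010, so Min = 0001 and
// Max = 1011; the unknown sign bit then gives Min = 1001 (-7) and
// Max = 0011 (3).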
/// Given an unsigned integer type and a set of known zero and one bits, compute
/// the maximum and minimum values that could have the specified known zero and
/// known one bits, returning them in Min/Max.
-static void computeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
- const APInt &KnownOne,
+/// TODO: Move to method on KnownBits struct?
+static void computeUnsignedMinMaxValuesFromKnownBits(const KnownBits &Known,
APInt &Min, APInt &Max) {
- assert(KnownZero.getBitWidth() == KnownOne.getBitWidth() &&
- KnownZero.getBitWidth() == Min.getBitWidth() &&
- KnownZero.getBitWidth() == Max.getBitWidth() &&
+ assert(Known.getBitWidth() == Min.getBitWidth() &&
+ Known.getBitWidth() == Max.getBitWidth() &&
"Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
- APInt UnknownBits = ~(KnownZero|KnownOne);
+ APInt UnknownBits = ~(Known.Zero|Known.One);
// The minimum value is when the unknown bits are all zeros.
- Min = KnownOne;
+ Min = Known.One;
// The maximum value is when the unknown bits are all ones.
- Max = KnownOne|UnknownBits;
+ Max = Known.One|UnknownBits;
}
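As a sanity check of the min/max computation, here is a minimal standalone C++ sketch; it uses 8-bit integers in place of APInt/KnownBits, and the particular known bits are hypothetical:

#include <cassert>
#include <cstdint>

int main() {
  // Suppose bit 0 is known one and bit 7 (the sign bit) is known zero.
  uint8_t KnownOne = 0x01, KnownZero = 0x80;
  uint8_t UnknownBits = ~(KnownZero | KnownOne); // 0x7e

  // Unsigned: min = all unknowns zero, max = all unknowns one.
  uint8_t UMin = KnownOne;               // 0x01
  uint8_t UMax = KnownOne | UnknownBits; // 0x7f

  // Signed: the same, except an unknown sign bit is set in the minimum and
  // cleared in the maximum (not the case here, since bit 7 is known zero).
  int8_t SMin = (int8_t)UMin, SMax = (int8_t)UMax;
  if (UnknownBits & 0x80) {
    SMin = (int8_t)(UMin | 0x80);
    SMax = (int8_t)(UMax & 0x7f);
  }
  assert(UMin == 0x01 && UMax == 0x7f && SMin == 1 && SMax == 127);
  return 0;
}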
/// This is called when we see this pattern:
@@ -230,7 +229,9 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
return nullptr;
uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
- if (ArrayElementCount > 1024) return nullptr; // Don't blow up on huge arrays.
+ // Don't blow up on huge arrays.
+ if (ArrayElementCount > MaxArraySizeForCombine)
+ return nullptr;
// There are many forms of this optimization we can handle; for now, just do
// the simple index into a single-dimensional array.
@@ -391,7 +392,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
- Idx = Builder->CreateTrunc(Idx, IntPtrTy);
+ Idx = Builder.CreateTrunc(Idx, IntPtrTy);
}
// If the comparison is only true for one or two elements, emit direct
@@ -399,7 +400,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
if (SecondTrueElement != Overdefined) {
// None true -> false.
if (FirstTrueElement == Undefined)
- return replaceInstUsesWith(ICI, Builder->getFalse());
+ return replaceInstUsesWith(ICI, Builder.getFalse());
Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
@@ -408,9 +409,9 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
// True for two elements -> 'i == 47 | i == 72'.
- Value *C1 = Builder->CreateICmpEQ(Idx, FirstTrueIdx);
+ Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
- Value *C2 = Builder->CreateICmpEQ(Idx, SecondTrueIdx);
+ Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
return BinaryOperator::CreateOr(C1, C2);
}
@@ -419,7 +420,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
if (SecondFalseElement != Overdefined) {
// None false -> true.
if (FirstFalseElement == Undefined)
- return replaceInstUsesWith(ICI, Builder->getTrue());
+ return replaceInstUsesWith(ICI, Builder.getTrue());
Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
@@ -428,9 +429,9 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
// False for two elements -> 'i != 47 & i != 72'.
- Value *C1 = Builder->CreateICmpNE(Idx, FirstFalseIdx);
+ Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
- Value *C2 = Builder->CreateICmpNE(Idx, SecondFalseIdx);
+ Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
return BinaryOperator::CreateAnd(C1, C2);
}
@@ -442,7 +443,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
// Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
if (FirstTrueElement) {
Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
- Idx = Builder->CreateAdd(Idx, Offs);
+ Idx = Builder.CreateAdd(Idx, Offs);
}
Value *End = ConstantInt::get(Idx->getType(),
@@ -456,7 +457,7 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
// Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
if (FirstFalseElement) {
Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
- Idx = Builder->CreateAdd(Idx, Offs);
+ Idx = Builder.CreateAdd(Idx, Offs);
}
Value *End = ConstantInt::get(Idx->getType(),
@@ -480,9 +481,9 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
if (Ty) {
- Value *V = Builder->CreateIntCast(Idx, Ty, false);
- V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
- V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
+ Value *V = Builder.CreateIntCast(Idx, Ty, false);
+ V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+ V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
}
}
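The magic-bitvector emission above can be illustrated with a small standalone sketch: precompute one bit per array element recording whether the comparison holds, then answer any indexed compare with a shift and mask. The array contents and predicate below are hypothetical:

#include <cassert>
#include <cstdint>

int main() {
  int Init[8] = {3, 47, 9, 47, 0, 1, 2, 5}; // stand-in for the global array
  uint8_t Magic = 0;
  for (int i = 0; i < 8; ++i)
    if (Init[i] == 47)              // the compare being folded, per element
      Magic |= (uint8_t)1 << i;     // bit i <=> compare true at index i

  for (int Idx = 0; Idx < 8; ++Idx) {
    bool Direct = (Init[Idx] == 47);
    bool ViaMagic = ((Magic >> Idx) & 1) != 0; // the emitted lshr+and form
    assert(Direct == ViaMagic);
  }
  return 0;
}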
@@ -565,7 +566,7 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
// we don't need to bother extending: the extension won't affect where the
// computation crosses zero.
if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
- VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
+ VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
}
return VariableIdx;
}
@@ -587,10 +588,10 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
// Okay, we can do this evaluation. Start by converting the index to intptr.
if (VariableIdx->getType() != IntPtrTy)
- VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
+ VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
true /*Signed*/);
Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
- return IC.Builder->CreateAdd(VariableIdx, OffsetVal, "offset");
+ return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
}
/// Returns true if we can rewrite Start as a GEP with pointer Base
@@ -980,13 +981,13 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
if (LHSIndexTy != RHSIndexTy) {
if (LHSIndexTy->getPrimitiveSizeInBits() <
RHSIndexTy->getPrimitiveSizeInBits()) {
- ROffset = Builder->CreateTrunc(ROffset, LHSIndexTy);
+ ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
} else
- LOffset = Builder->CreateTrunc(LOffset, RHSIndexTy);
+ LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
}
- Value *Cmp = Builder->CreateICmp(ICmpInst::getSignedPredicate(Cond),
- LOffset, ROffset);
+ Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
+ LOffset, ROffset);
return replaceInstUsesWith(I, Cmp);
}
@@ -1025,7 +1026,7 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
if (NumDifferences == 0) // SAME GEP?
return replaceInstUsesWith(I, // No comparison is needed here.
- Builder->getInt1(ICmpInst::isTrueWhenEqual(Cond)));
+ Builder.getInt1(ICmpInst::isTrueWhenEqual(Cond)));
else if (NumDifferences == 1 && GEPsInBounds) {
Value *LHSV = GEPLHS->getOperand(DiffOperand);
@@ -1173,7 +1174,7 @@ Instruction *InstCombiner::foldICmpAddOpConst(Instruction &ICI,
// (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128
assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
- Constant *C = Builder->getInt(CI->getValue()-1);
+ Constant *C = Builder.getInt(CI->getValue() - 1);
return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
}
@@ -1192,7 +1193,7 @@ Instruction *InstCombiner::foldICmpShrConstConst(ICmpInst &I, Value *A,
};
// Don't bother doing any work for cases which InstSimplify handles.
- if (AP2 == 0)
+ if (AP2.isNullValue())
return nullptr;
bool IsAShr = isa<AShrOperator>(I.getOperand(0));
@@ -1251,7 +1252,7 @@ Instruction *InstCombiner::foldICmpShlConstConst(ICmpInst &I, Value *A,
};
// Don't bother doing any work for cases which InstSimplify handles.
- if (AP2 == 0)
+ if (AP2.isNullValue())
return nullptr;
unsigned AP2TrailingZeros = AP2.countTrailingZeros();
@@ -1346,17 +1347,17 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
Value *F = Intrinsic::getDeclaration(I.getModule(),
Intrinsic::sadd_with_overflow, NewType);
- InstCombiner::BuilderTy *Builder = IC.Builder;
+ InstCombiner::BuilderTy &Builder = IC.Builder;
// Put the new code above the original add, in case there are any uses of the
// add between the add and the compare.
- Builder->SetInsertPoint(OrigAdd);
+ Builder.SetInsertPoint(OrigAdd);
- Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName() + ".trunc");
- Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName() + ".trunc");
- CallInst *Call = Builder->CreateCall(F, {TruncA, TruncB}, "sadd");
- Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result");
- Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType());
+ Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc");
+ Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc");
+ CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd");
+ Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result");
+ Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType());
// The inner add was the result of the narrow add, zero extended to the
// wider type. Replace it with the result computed by the intrinsic.
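The identity the transform relies on can be spot-checked in plain C++. Assuming the motivating case, an i32 add of two values that fit in i16, the pattern ((A + B) + 32768) u> 65535 holds exactly when the narrowed 16-bit signed add overflows (the widths and constants here are assumptions for illustration, not taken from the patch):

#include <cassert>
#include <cstdint>

static bool widePattern(int32_t A, int32_t B) {
  return (uint32_t)(A + B + 32768) > 65535u;
}
static bool narrowAddOverflows(int16_t A, int16_t B) {
  int32_t S = (int32_t)A + (int32_t)B;
  return S < INT16_MIN || S > INT16_MAX;
}

int main() {
  const int16_t Vals[] = {-32768, -32767, -1, 0, 1, 1234, 32766, 32767};
  for (int16_t A : Vals)
    for (int16_t B : Vals)
      assert(widePattern(A, B) == narrowAddOverflows(A, B));
  return 0;
}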
@@ -1398,12 +1399,12 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
}
// (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
- if (*C == 0 && Pred == ICmpInst::ICMP_SGT) {
+ if (C->isNullValue() && Pred == ICmpInst::ICMP_SGT) {
SelectPatternResult SPR = matchSelectPattern(X, A, B);
if (SPR.Flavor == SPF_SMIN) {
- if (isKnownPositive(A, DL))
+ if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
return new ICmpInst(Pred, B, Cmp.getOperand(1));
- if (isKnownPositive(B, DL))
+ if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
return new ICmpInst(Pred, A, Cmp.getOperand(1));
}
}
@@ -1433,9 +1434,9 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
ConstantRange Intersection = DominatingCR.intersectWith(CR);
ConstantRange Difference = DominatingCR.difference(CR);
if (Intersection.isEmptySet())
- return replaceInstUsesWith(Cmp, Builder->getFalse());
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
if (Difference.isEmptySet())
- return replaceInstUsesWith(Cmp, Builder->getTrue());
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
// If this is a normal comparison, it demands all bits. If it is a sign
// bit comparison, it only demands the sign bit.
@@ -1447,12 +1448,13 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
// of a test and branch. So we avoid canonicalizing in such situations
// because a test-and-branch instruction has better branch displacement
// than a compare-and-branch instruction.
- if (!isBranchOnSignBitCheck(Cmp, IsSignBit) && !Cmp.isEquality()) {
- if (auto *AI = Intersection.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder->getInt(*AI));
- if (auto *AD = Difference.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_NE, X, Builder->getInt(*AD));
- }
+ if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
+ return nullptr;
+
+ if (auto *AI = Intersection.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*AI));
+ if (auto *AD = Difference.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*AD));
}
return nullptr;
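A concrete instance of the dominating-condition rewrite, with hypothetical ranges: if the branch condition X u< 6 dominates, then X u> 4 has the single-element range intersection {5} and the compare becomes X == 5. Checked by brute force:

#include <cassert>

int main() {
  for (unsigned X = 0; X <= 0xff; ++X)
    if (X < 6)                      // the dominating branch is taken
      assert((X > 4) == (X == 5));  // intersection [5,6) has one element
  return 0;
}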
@@ -1464,7 +1466,7 @@ Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
const APInt *C) {
ICmpInst::Predicate Pred = Cmp.getPredicate();
Value *X = Trunc->getOperand(0);
- if (*C == 1 && C->getBitWidth() > 1) {
+ if (C->isOneValue() && C->getBitWidth() > 1) {
// icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
Value *V = nullptr;
if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
@@ -1477,14 +1479,13 @@ Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp,
// of the high bits truncated out of x are known.
unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
SrcBits = X->getType()->getScalarSizeInBits();
- APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
- computeKnownBits(X, KnownZero, KnownOne, 0, &Cmp);
+ KnownBits Known = computeKnownBits(X, 0, &Cmp);
// If all the high bits are known, we can do this xform.
- if ((KnownZero | KnownOne).countLeadingOnes() >= SrcBits - DstBits) {
+ if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) {
// Pull in the high bits from known-ones set.
APInt NewRHS = C->zext(SrcBits);
- NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
+ NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
}
}
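A standalone sketch of the equality case of this trunc fold, with a hypothetical i16 value whose high byte is known to be 0x12:

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t HighBits = 0x1200; // Known.One in the truncated-out bits
  const uint8_t C = 0x34;
  uint16_t NewRHS = (uint16_t)C | HighBits; // zext(C) with high bits pulled in

  for (uint16_t Low = 0; Low < 256; ++Low) {
    uint16_t X = HighBits | Low;    // any X consistent with the known bits
    assert(((uint8_t)X == C) == (X == NewRHS)); // trunc cmp == wide cmp
  }
  return 0;
}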
@@ -1505,7 +1506,7 @@ Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
// If this is a comparison that tests the signbit (X < 0) or (x > -1),
// fold the xor.
ICmpInst::Predicate Pred = Cmp.getPredicate();
- if ((Pred == ICmpInst::ICMP_SLT && *C == 0) ||
+ if ((Pred == ICmpInst::ICMP_SLT && C->isNullValue()) ||
(Pred == ICmpInst::ICMP_SGT && C->isAllOnesValue())) {
// If the sign bit of the XorCst is not set, there is no change to
@@ -1530,14 +1531,14 @@ Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
}
if (Xor->hasOneUse()) {
- // (icmp u/s (xor X SignBit), C) -> (icmp s/u X, (xor C SignBit))
- if (!Cmp.isEquality() && XorC->isSignBit()) {
+ // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
+ if (!Cmp.isEquality() && XorC->isSignMask()) {
Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate()
: Cmp.getSignedPredicate();
return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), *C ^ *XorC));
}
- // (icmp u/s (xor X ~SignBit), C) -> (icmp s/u X, (xor C ~SignBit))
+ // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
if (!Cmp.isEquality() && XorC->isMaxSignedValue()) {
Pred = Cmp.isSigned() ? Cmp.getUnsignedPredicate()
: Cmp.getSignedPredicate();
@@ -1623,15 +1624,15 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
// Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
// preferable because it allows the C2 << Y expression to be hoisted out of a
// loop if Y is invariant and X is not.
- if (Shift->hasOneUse() && *C1 == 0 && Cmp.isEquality() &&
+ if (Shift->hasOneUse() && C1->isNullValue() && Cmp.isEquality() &&
!Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
// Compute C2 << Y.
Value *NewShift =
- IsShl ? Builder->CreateLShr(And->getOperand(1), Shift->getOperand(1))
- : Builder->CreateShl(And->getOperand(1), Shift->getOperand(1));
+ IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
+ : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));
// Compute X & (C2 << Y).
- Value *NewAnd = Builder->CreateAnd(Shift->getOperand(0), NewShift);
+ Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
Cmp.setOperand(0, NewAnd);
return &Cmp;
}
@@ -1663,13 +1664,13 @@ Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
(Cmp.isEquality() || (!C1->isNegative() && !C2->isNegative()))) {
// TODO: Is this a good transform for vectors? Wider types may reduce
// throughput. Should this transform be limited (even for scalars) by using
- // ShouldChangeType()?
+ // shouldChangeType()?
if (!Cmp.getType()->isVectorTy()) {
Type *WideType = W->getType();
unsigned WideScalarBits = WideType->getScalarSizeInBits();
Constant *ZextC1 = ConstantInt::get(WideType, C1->zext(WideScalarBits));
Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
- Value *NewAnd = Builder->CreateAnd(W, ZextC2, And->getName());
+ Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
}
}
@@ -1681,7 +1682,8 @@ Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
// (icmp pred (and A, (or (shl 1, B), 1)), 0)
//
// iff pred isn't signed
- if (!Cmp.isSigned() && *C1 == 0 && match(And->getOperand(1), m_One())) {
+ if (!Cmp.isSigned() && C1->isNullValue() &&
+ match(And->getOperand(1), m_One())) {
Constant *One = cast<Constant>(And->getOperand(1));
Value *Or = And->getOperand(0);
Value *A, *B, *LShr;
@@ -1702,12 +1704,12 @@ Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
} else {
if (UsesRemoved >= 3)
- NewOr = Builder->CreateOr(Builder->CreateShl(One, B, LShr->getName(),
- /*HasNUW=*/true),
- One, Or->getName());
+ NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
+ /*HasNUW=*/true),
+ One, Or->getName());
}
if (NewOr) {
- Value *NewAnd = Builder->CreateAnd(A, NewOr, And->getName());
+ Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
Cmp.setOperand(0, NewAnd);
return &Cmp;
}
@@ -1764,13 +1766,13 @@ Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
// (X & C2) != 0 -> (trunc X) < 0
// iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
const APInt *C2;
- if (And->hasOneUse() && *C == 0 && match(Y, m_APInt(C2))) {
+ if (And->hasOneUse() && C->isNullValue() && match(Y, m_APInt(C2))) {
int32_t ExactLogBase2 = C2->exactLogBase2();
if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
if (And->getType()->isVectorTy())
NTy = VectorType::get(NTy, And->getType()->getVectorNumElements());
- Value *Trunc = Builder->CreateTrunc(X, NTy);
+ Value *Trunc = Builder.CreateTrunc(X, NTy);
auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
: CmpInst::ICMP_SLT;
return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
@@ -1784,7 +1786,7 @@ Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
const APInt *C) {
ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (*C == 1) {
+ if (C->isOneValue()) {
// icmp slt signum(V) 1 --> icmp slt V, 1
Value *V = nullptr;
if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
@@ -1792,7 +1794,16 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
ConstantInt::get(V->getType(), 1));
}
- if (!Cmp.isEquality() || *C != 0 || !Or->hasOneUse())
+ // X | C == C --> X <=u C
+ // X | C != C --> X >u C
+ // iff C+1 is a power of 2 (C is a bitmask of the low bits)
+ if (Cmp.isEquality() && Cmp.getOperand(1) == Or->getOperand(1) &&
+ (*C + 1).isPowerOf2()) {
+ Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ return new ICmpInst(Pred, Or->getOperand(0), Or->getOperand(1));
+ }
+
+ if (!Cmp.isEquality() || !C->isNullValue() || !Or->hasOneUse())
return nullptr;
Value *P, *Q;
@@ -1800,12 +1811,24 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
// Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
// -> and (icmp eq P, null), (icmp eq Q, null).
Value *CmpP =
- Builder->CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
+ Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
Value *CmpQ =
- Builder->CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
- auto LogicOpc = Pred == ICmpInst::Predicate::ICMP_EQ ? Instruction::And
- : Instruction::Or;
- return BinaryOperator::Create(LogicOpc, CmpP, CmpQ);
+ Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
+ auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+ return BinaryOperator::Create(BOpc, CmpP, CmpQ);
+ }
+
+ // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
+ // a shorter form that has more potential to be folded even further.
+ Value *X1, *X2, *X3, *X4;
+ if (match(Or->getOperand(0), m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
+ match(Or->getOperand(1), m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
+ // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
+ // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
+ Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
+ Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
+ auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+ return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
}
return nullptr;
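The X | C == C fold added near the top of this function can be verified exhaustively at a small width; C = 0x0f below is an arbitrary low-bit mask (C + 1 is a power of 2):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C = 0x0f;
  for (unsigned X = 0; X <= 0xff; ++X) {
    bool OrEq = ((uint8_t)(X | C)) == C;
    assert(OrEq == (X <= C));  // X | C == C  <=>  X <=u C
    assert(!OrEq == (X > C));  // X | C != C  <=>  X >u C
  }
  return 0;
}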
@@ -1914,61 +1937,89 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
ICmpInst::Predicate Pred = Cmp.getPredicate();
Value *X = Shl->getOperand(0);
- if (Cmp.isEquality()) {
- // If the shift is NUW, then it is just shifting out zeros, no need for an
- // AND.
- Constant *LShrC = ConstantInt::get(Shl->getType(), C->lshr(*ShiftAmt));
- if (Shl->hasNoUnsignedWrap())
- return new ICmpInst(Pred, X, LShrC);
-
- // If the shift is NSW and we compare to 0, then it is just shifting out
- // sign bits, no need for an AND either.
- if (Shl->hasNoSignedWrap() && *C == 0)
- return new ICmpInst(Pred, X, LShrC);
-
- if (Shl->hasOneUse()) {
- // Otherwise, strength reduce the shift into an and.
- Constant *Mask = ConstantInt::get(Shl->getType(),
- APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
-
- Value *And = Builder->CreateAnd(X, Mask, Shl->getName() + ".mask");
- return new ICmpInst(Pred, And, LShrC);
+ Type *ShType = Shl->getType();
+
+ // NSW guarantees that we are only shifting out sign bits from the high bits,
+ // so we can ASHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoSignedWrap()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
+ APInt ShiftedC = C->ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
+ // This is the same code as the SGT case, but assert the pre-condition
+ // that is needed for this to work with equality predicates.
+ assert(C->ashr(*ShiftAmt).shl(*ShiftAmt) == *C &&
+ "Compare known true or false was not folded");
+ APInt ShiftedC = C->ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_SLT) {
+ // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
+ // (X << S) <=s C is equiv to X <=s (C >> S) for all C
+ // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
+ // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
+ assert(!C->isMinSignedValue() && "Unexpected icmp slt");
+ APInt ShiftedC = (*C - 1).ashr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ // If this is a signed comparison to 0 and the shift is sign preserving,
+ // use the shift LHS operand instead; isSignTest may change 'Pred', so only
+ // do that if we're sure to not continue on in this function.
+ if (isSignTest(Pred, *C))
+ return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
+ }
+
+ // NUW guarantees that we are only shifting out zero bits from the high bits,
+ // so we can LSHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoUnsignedWrap()) {
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
+ APInt ShiftedC = C->lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) {
+ // This is the same code as the UGT case, but assert the pre-condition
+ // that is needed for this to work with equality predicates.
+ assert(C->lshr(*ShiftAmt).shl(*ShiftAmt) == *C &&
+ "Compare known true or false was not folded");
+ APInt ShiftedC = C->lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_ULT) {
+ // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
+ // (X << S) <=u C is equiv to X <=u (C >> S) for all C
+ // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
+ // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
+ assert(C->ugt(0) && "ult 0 should have been eliminated");
+ APInt ShiftedC = (*C - 1).lshr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
}
}
- // If this is a signed comparison to 0 and the shift is sign preserving,
- // use the shift LHS operand instead; isSignTest may change 'Pred', so only
- // do that if we're sure to not continue on in this function.
- if (Shl->hasNoSignedWrap() && isSignTest(Pred, *C))
- return new ICmpInst(Pred, X, Constant::getNullValue(X->getType()));
+ if (Cmp.isEquality() && Shl->hasOneUse()) {
+ // Strength-reduce the shift into an 'and'.
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
+ Constant *LShrC = ConstantInt::get(ShType, C->lshr(*ShiftAmt));
+ return new ICmpInst(Pred, And, LShrC);
+ }
// Otherwise, if this is a comparison of the sign bit, simplify to and/test.
bool TrueIfSigned = false;
if (Shl->hasOneUse() && isSignBitCheck(Pred, *C, TrueIfSigned)) {
// (X << 31) <s 0 --> (X & 1) != 0
Constant *Mask = ConstantInt::get(
- X->getType(),
+ ShType,
APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
- Value *And = Builder->CreateAnd(X, Mask, Shl->getName() + ".mask");
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
- And, Constant::getNullValue(And->getType()));
- }
-
- // When the shift is nuw and pred is >u or <=u, comparison only really happens
- // in the pre-shifted bits. Since InstSimplify canonicalizes <=u into <u, the
- // <=u case can be further converted to match <u (see below).
- if (Shl->hasNoUnsignedWrap() &&
- (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT)) {
- // Derivation for the ult case:
- // (X << S) <=u C is equiv to X <=u (C >> S) for all C
- // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
- // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
- assert((Pred != ICmpInst::ICMP_ULT || C->ugt(0)) &&
- "Encountered `ult 0` that should have been eliminated by "
- "InstSimplify.");
- APInt ShiftedC = Pred == ICmpInst::ICMP_ULT ? (*C - 1).lshr(*ShiftAmt) + 1
- : C->lshr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), ShiftedC));
+ And, Constant::getNullValue(ShType));
}
// Transform (icmp pred iM (shl iM %v, N), C)
@@ -1981,11 +2032,11 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt &&
DL.isLegalInteger(TypeBits - Amt)) {
Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
- if (X->getType()->isVectorTy())
- TruncTy = VectorType::get(TruncTy, X->getType()->getVectorNumElements());
+ if (ShType->isVectorTy())
+ TruncTy = VectorType::get(TruncTy, ShType->getVectorNumElements());
Constant *NewC =
ConstantInt::get(TruncTy, C->ashr(*ShiftAmt).trunc(TypeBits - Amt));
- return new ICmpInst(Pred, Builder->CreateTrunc(X, TruncTy), NewC);
+ return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
}
return nullptr;
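A brute-force check of the nuw/ult branch of the rewrite above, using 8-bit values and an arbitrary shift amount of 3:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned S = 3;
  for (unsigned C = 1; C <= 0xff; ++C) {  // C >u 0, per the assert above
    unsigned ShiftedC = ((C - 1) >> S) + 1;
    for (unsigned X = 0; X <= 0xff; ++X) {
      if ((X << S) > 0xff)
        continue;                   // the shift would not be 'nuw'
      assert(((uint8_t)(X << S) < C) == (X < ShiftedC));
    }
  }
  return 0;
}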
@@ -1999,7 +2050,8 @@ Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
// icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
Value *X = Shr->getOperand(0);
CmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && *C == 0)
+ if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
+ C->isNullValue())
return new ICmpInst(Pred, X, Cmp.getOperand(1));
const APInt *ShiftVal;
@@ -2036,8 +2088,8 @@ Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
Constant *DivCst = ConstantInt::get(
Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal));
- Value *Tmp = IsAShr ? Builder->CreateSDiv(X, DivCst, "", Shr->isExact())
- : Builder->CreateUDiv(X, DivCst, "", Shr->isExact());
+ Value *Tmp = IsAShr ? Builder.CreateSDiv(X, DivCst, "", Shr->isExact())
+ : Builder.CreateUDiv(X, DivCst, "", Shr->isExact());
Cmp.setOperand(0, Tmp);
@@ -2075,7 +2127,7 @@ Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp,
// Otherwise strength reduce the shift into an 'and'.
APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
Constant *Mask = ConstantInt::get(Shr->getType(), Val);
- Value *And = Builder->CreateAnd(X, Mask, Shr->getName() + ".mask");
+ Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
return new ICmpInst(Pred, And, ShiftedCmpRHS);
}
@@ -2090,7 +2142,7 @@ Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
if (!match(UDiv->getOperand(0), m_APInt(C2)))
return nullptr;
- assert(C2 != 0 && "udiv 0, X should have been simplified already.");
+ assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
// (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
Value *Y = UDiv->getOperand(1);
@@ -2103,7 +2155,7 @@ Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp,
// (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
- assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
+ assert(*C != 0 && "icmp ult X, 0 should have been simplified already.");
return new ICmpInst(ICmpInst::ICMP_UGT, Y,
ConstantInt::get(Y->getType(), C2->udiv(*C)));
}
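Both udiv folds reduce to elementary integer-division inequalities; the first one can be checked exhaustively at 8 bits:

#include <cassert>
#include <cstdint>

int main() {
  // (C2 u/ Y) u> C  <=>  Y u<= C2 / (C + 1)
  for (unsigned C2 = 0; C2 <= 0xff; ++C2)
    for (unsigned C = 0; C <= 0xfe; ++C)   // C == all-ones folds elsewhere
      for (unsigned Y = 1; Y <= 0xff; ++Y) // Y == 0 is UB / already folded
        assert((C2 / Y > C) == (Y <= C2 / (C + 1)));
  return 0;
}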
@@ -2141,7 +2193,8 @@ Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
// INT_MIN will also fail if the divisor is 1. Although folds of all these
// division-by-constant cases should be present, we can not assert that they
// have happened before we reach this icmp instruction.
- if (*C2 == 0 || *C2 == 1 || (DivIsSigned && C2->isAllOnesValue()))
+ if (C2->isNullValue() || C2->isOneValue() ||
+ (DivIsSigned && C2->isAllOnesValue()))
return nullptr;
// TODO: We could do all of the computations below using APInt.
@@ -2187,7 +2240,7 @@ Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
}
} else if (C2->isStrictlyPositive()) { // Divisor is > 0.
- if (*C == 0) { // (X / pos) op 0
+ if (C->isNullValue()) { // (X / pos) op 0
// Can't overflow. e.g. X/2 op 0 --> [-1, 2)
LoBound = ConstantExpr::getNeg(SubOne(RangeSize));
HiBound = RangeSize;
@@ -2208,7 +2261,7 @@ Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
} else if (C2->isNegative()) { // Divisor is < 0.
if (Div->isExact())
RangeSize = ConstantExpr::getNeg(RangeSize);
- if (*C == 0) { // (X / neg) op 0
+ if (C->isNullValue()) { // (X / neg) op 0
// e.g. X/-5 op 0 --> [-4, 5)
LoBound = AddOne(RangeSize);
HiBound = ConstantExpr::getNeg(RangeSize);
@@ -2238,7 +2291,7 @@ Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
default: llvm_unreachable("Unhandled icmp opcode!");
case ICmpInst::ICMP_EQ:
if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(Cmp, Builder->getFalse());
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
if (HiOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
ICmpInst::ICMP_UGE, X, LoBound);
@@ -2250,7 +2303,7 @@ Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
HiBound->getUniqueInteger(), DivIsSigned, true));
case ICmpInst::ICMP_NE:
if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(Cmp, Builder->getTrue());
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
if (HiOverflow)
return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
ICmpInst::ICMP_ULT, X, LoBound);
@@ -2264,16 +2317,16 @@ Instruction *InstCombiner::foldICmpDivConstant(ICmpInst &Cmp,
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_SLT:
if (LoOverflow == +1) // Low bound is greater than input range.
- return replaceInstUsesWith(Cmp, Builder->getTrue());
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
if (LoOverflow == -1) // Low bound is less than input range.
- return replaceInstUsesWith(Cmp, Builder->getFalse());
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
return new ICmpInst(Pred, X, LoBound);
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_SGT:
if (HiOverflow == +1) // High bound greater than input range.
- return replaceInstUsesWith(Cmp, Builder->getFalse());
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
if (HiOverflow == -1) // High bound less than input range.
- return replaceInstUsesWith(Cmp, Builder->getTrue());
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
if (Pred == ICmpInst::ICMP_UGT)
return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
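The positive-divisor case of the range logic, checked by brute force; the divisor 5 is hypothetical, and C++ integer division truncates toward zero, matching sdiv:

#include <cassert>

int main() {
  // (X /s 5) == 0  <=>  X in [-(5-1), 5), i.e. LoBound = -4, HiBound = 5.
  for (int X = -128; X <= 127; ++X)
    assert((X / 5 == 0) == (X >= -4 && X < 5));
  return 0;
}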
@@ -2300,15 +2353,15 @@ Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
// (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
- if (Pred == ICmpInst::ICMP_SGT && *C == 0)
+ if (Pred == ICmpInst::ICMP_SGT && C->isNullValue())
return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
// (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
- if (Pred == ICmpInst::ICMP_SLT && *C == 0)
+ if (Pred == ICmpInst::ICMP_SLT && C->isNullValue())
return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
// (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
- if (Pred == ICmpInst::ICMP_SLT && *C == 1)
+ if (Pred == ICmpInst::ICMP_SLT && C->isOneValue())
return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
}
@@ -2320,12 +2373,12 @@ Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp,
// iff (C2 & (C - 1)) == C - 1 and C is a power of 2
if (Pred == ICmpInst::ICMP_ULT && C->isPowerOf2() &&
(*C2 & (*C - 1)) == (*C - 1))
- return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateOr(Y, *C - 1), X);
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, *C - 1), X);
// C2 - Y >u C -> (Y | C) != C2
// iff C2 & C == C and C + 1 is a power of 2
if (Pred == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() && (*C2 & *C) == *C)
- return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateOr(Y, *C), X);
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, *C), X);
return nullptr;
}
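A brute-force check of the second sub fold, with hypothetical constants C = 0x07 and C2 = 0xa7 chosen so that C2 & C == C and C + 1 is a power of 2:

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t C = 0x07, C2 = 0xa7;
  for (unsigned Y = 0; Y <= 0xff; ++Y)  // (C2 - Y) u> C  <=>  (Y | C) != C2
    assert(((uint8_t)(C2 - Y) > C) == ((uint8_t)(Y | C) != C2));
  return 0;
}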
@@ -2342,14 +2395,30 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
// Fold icmp pred (add X, C2), C.
Value *X = Add->getOperand(0);
Type *Ty = Add->getType();
- auto CR =
- ConstantRange::makeExactICmpRegion(Cmp.getPredicate(), *C).subtract(*C2);
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the add does not wrap, we can always adjust the compare by subtracting
+ // the constants. Equality comparisons are handled elsewhere. SGE/SLE are
+ // canonicalized to SGT/SLT.
+ if (Add->hasNoSignedWrap() &&
+ (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) {
+ bool Overflow;
+ APInt NewC = C->ssub_ov(*C2, Overflow);
+ // If the subtraction overflows, the compare must be known true or false.
+ // TODO: Can we assert there is no overflow because InstSimplify always
+ // handles those cases?
+ if (!Overflow)
+ // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
+ return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
+ }
+
+ auto CR = ConstantRange::makeExactICmpRegion(Pred, *C).subtract(*C2);
const APInt &Upper = CR.getUpper();
const APInt &Lower = CR.getLower();
if (Cmp.isSigned()) {
- if (Lower.isSignBit())
+ if (Lower.isSignMask())
return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
- if (Upper.isSignBit())
+ if (Upper.isSignMask())
return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
} else {
if (Lower.isMinValue())
@@ -2364,22 +2433,91 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
// X+C <u C2 -> (X & -C2) == C
// iff C & (C2-1) == 0
// C2 is a power of 2
- if (Cmp.getPredicate() == ICmpInst::ICMP_ULT && C->isPowerOf2() &&
- (*C2 & (*C - 1)) == 0)
- return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateAnd(X, -(*C)),
+ if (Pred == ICmpInst::ICMP_ULT && C->isPowerOf2() && (*C2 & (*C - 1)) == 0)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -(*C)),
ConstantExpr::getNeg(cast<Constant>(Y)));
// X+C >u C2 -> (X & ~C2) != C
// iff C & C2 == 0
// C2+1 is a power of 2
- if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() &&
- (*C2 & *C) == 0)
- return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateAnd(X, ~(*C)),
+ if (Pred == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() && (*C2 & *C) == 0)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~(*C)),
ConstantExpr::getNeg(cast<Constant>(Y)));
return nullptr;
}
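The new nsw path can be checked exhaustively at 8 bits, skipping exactly the cases the code skips (a wrapping add, or an overflowing constant subtraction):

#include <cassert>

int main() {
  // icmp sgt (add nsw X, C2), C  -->  icmp sgt X, (C - C2)
  for (int C2 = -128; C2 <= 127; ++C2)
    for (int C = -128; C <= 127; ++C) {
      int NewC = C - C2;
      if (NewC < -128 || NewC > 127)
        continue;                 // ssub_ov overflowed; the fold is skipped
      for (int X = -128; X <= 127; ++X) {
        int Sum = X + C2;
        if (Sum < -128 || Sum > 127)
          continue;               // the add would not be 'nsw'
        assert((Sum > C) == (X > NewC));
      }
    }
  return 0;
}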
+bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
+ Value *&RHS, ConstantInt *&Less,
+ ConstantInt *&Equal,
+ ConstantInt *&Greater) {
+ // TODO: Generalize this to work with other comparison idioms or ensure
+ // they get canonicalized into this form.
+
+ // select i1 (a == b), i32 Equal,
+ //        i32 (select i1 (a < b), i32 Less, i32 Greater),
+ // where Equal, Less and Greater are placeholders for any three constants.
+ ICmpInst::Predicate PredA, PredB;
+ if (match(SI->getTrueValue(), m_ConstantInt(Equal)) &&
+ match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) &&
+ PredA == ICmpInst::ICMP_EQ &&
+ match(SI->getFalseValue(),
+ m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)),
+ m_ConstantInt(Less), m_ConstantInt(Greater))) &&
+ PredB == ICmpInst::ICMP_SLT) {
+ return true;
+ }
+ return false;
+}
+
+Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp,
+ Instruction *Select,
+ ConstantInt *C) {
+
+ assert(C && "Cmp RHS should be a constant int!");
+ // If we're testing a constant value against the result of a three-way
+ // comparison, the result can be expressed directly in terms of the
+ // original values being compared. Note: We could possibly be more
+ // aggressive here and remove the hasOneUse test. The original select is
+ // likely to simplify or sink when we remove a test of the result.
+ Value *OrigLHS, *OrigRHS;
+ ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
+ if (Cmp.hasOneUse() &&
+ matchThreeWayIntCompare(cast<SelectInst>(Select), OrigLHS, OrigRHS,
+ C1LessThan, C2Equal, C3GreaterThan)) {
+ assert(C1LessThan && C2Equal && C3GreaterThan);
+
+ bool TrueWhenLessThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
+ ->isAllOnesValue();
+ bool TrueWhenEqual =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
+ ->isAllOnesValue();
+ bool TrueWhenGreaterThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
+ ->isAllOnesValue();
+
+ // This generates the new instruction that will replace the original Cmp
+ // instruction. Instead of enumerating the various combinations when
+ // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
+ // false, we rely on chaining of ORs and future passes of InstCombine to
+ // simplify the OR further (e.g. a s< b || a == b becomes a s<= b).
+
+ // When none of the three constants satisfies the predicate for the RHS (C),
+ // the entire original Cmp can be simplified to false.
+ Value *Cond = Builder.getFalse();
+ if (TrueWhenLessThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT, OrigLHS, OrigRHS));
+ if (TrueWhenEqual)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ, OrigLHS, OrigRHS));
+ if (TrueWhenGreaterThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT, OrigLHS, OrigRHS));
+
+ return replaceInstUsesWith(Cmp, Cond);
+ }
+ return nullptr;
+}
+
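The decomposition implemented above can be illustrated with a small standalone sketch. For the hypothetical test cmp3(a, b) s< 0 against the usual (-1, 0, 1) constants, only the less-than constant satisfies the predicate, so the whole expression collapses to a < b:

#include <cassert>

static int cmp3(int A, int B) { return A == B ? 0 : (A < B ? -1 : 1); }

int main() {
  for (int A = -3; A <= 3; ++A)
    for (int B = -3; B <= 3; ++B)
      assert((cmp3(A, B) < 0) == (A < B)); // TrueWhenLessThan only
  return 0;
}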
/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
/// where X is some kind of instruction.
Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
@@ -2439,11 +2577,28 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) {
return I;
}
+ // Match against CmpInst LHS being instructions other than binary operators.
Instruction *LHSI;
- if (match(Cmp.getOperand(0), m_Instruction(LHSI)) &&
- LHSI->getOpcode() == Instruction::Trunc)
- if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C))
- return I;
+ if (match(Cmp.getOperand(0), m_Instruction(LHSI))) {
+ switch (LHSI->getOpcode()) {
+ case Instruction::Select: {
+ // For now, we only support constant integers while folding the
+ // icmp (select) pattern. We can extend this to support vectors of integers
+ // similar to the cases handled by binary ops above.
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
+ if (Instruction *I = foldICmpSelectConstant(Cmp, LHSI, ConstRHS))
+ return I;
+ break;
+ }
+ case Instruction::Trunc:
+ if (Instruction *I = foldICmpTruncConstant(Cmp, LHSI, C))
+ return I;
+ break;
+ default:
+ break;
+ }
+ }
if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, C))
return I;
@@ -2469,10 +2624,10 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
switch (BO->getOpcode()) {
case Instruction::SRem:
// If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
- if (*C == 0 && BO->hasOneUse()) {
+ if (C->isNullValue() && BO->hasOneUse()) {
const APInt *BOC;
if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
- Value *NewRem = Builder->CreateURem(BOp0, BOp1, BO->getName());
+ Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
return new ICmpInst(Pred, NewRem,
Constant::getNullValue(BO->getType()));
}
@@ -2486,7 +2641,7 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
Constant *SubC = ConstantExpr::getSub(RHS, cast<Constant>(BOp1));
return new ICmpInst(Pred, BOp0, SubC);
}
- } else if (*C == 0) {
+ } else if (C->isNullValue()) {
// Replace ((add A, B) != 0) with (A != -B) if A or B is
// efficiently invertible, or if the add has just this one use.
if (Value *NegVal = dyn_castNegVal(BOp1))
@@ -2494,7 +2649,7 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
if (Value *NegVal = dyn_castNegVal(BOp0))
return new ICmpInst(Pred, NegVal, BOp1);
if (BO->hasOneUse()) {
- Value *Neg = Builder->CreateNeg(BOp1);
+ Value *Neg = Builder.CreateNeg(BOp1);
Neg->takeName(BO);
return new ICmpInst(Pred, BOp0, Neg);
}
@@ -2507,7 +2662,7 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
// For the xor case, we can xor two constants together, eliminating
// the explicit xor.
return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
- } else if (*C == 0) {
+ } else if (C->isNullValue()) {
// Replace ((xor A, B) != 0) with (A != B)
return new ICmpInst(Pred, BOp0, BOp1);
}
@@ -2520,7 +2675,7 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
// Replace ((sub BOC, B) != C) with (B != BOC-C).
Constant *SubC = ConstantExpr::getSub(cast<Constant>(BOp0), RHS);
return new ICmpInst(Pred, BOp1, SubC);
- } else if (*C == 0) {
+ } else if (C->isNullValue()) {
// Replace ((sub A, B) != 0) with (A != B).
return new ICmpInst(Pred, BOp0, BOp1);
}
@@ -2533,7 +2688,7 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
// Replace (X | C) == -1 with (X & ~C) == ~C.
// This removes the -1 constant.
Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
- Value *And = Builder->CreateAnd(BOp0, NotBOC);
+ Value *And = Builder.CreateAnd(BOp0, NotBOC);
return new ICmpInst(Pred, And, NotBOC);
}
break;
@@ -2551,14 +2706,14 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
break;
// Replace (and X, (1 << size(X)-1) != 0) with x s< 0
- if (BOC->isSignBit()) {
+ if (BOC->isSignMask()) {
Constant *Zero = Constant::getNullValue(BOp0->getType());
auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
return new ICmpInst(NewPred, BOp0, Zero);
}
// ((X & ~7) == 0) --> X < 8
- if (*C == 0 && (~(*BOC) + 1).isPowerOf2()) {
+ if (C->isNullValue() && (~(*BOC) + 1).isPowerOf2()) {
Constant *NegBOC = ConstantExpr::getNeg(cast<Constant>(BOp1));
auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
return new ICmpInst(NewPred, BOp0, NegBOC);
@@ -2567,9 +2722,9 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
break;
}
case Instruction::Mul:
- if (*C == 0 && BO->hasNoSignedWrap()) {
+ if (C->isNullValue() && BO->hasNoSignedWrap()) {
const APInt *BOC;
- if (match(BOp1, m_APInt(BOC)) && *BOC != 0) {
+ if (match(BOp1, m_APInt(BOC)) && !BOC->isNullValue()) {
// The trivial case (mul X, 0) is handled by InstSimplify.
// General case : (mul X, C) != 0 iff X != 0
// (mul X, C) == 0 iff X == 0
@@ -2578,7 +2733,7 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
}
break;
case Instruction::UDiv:
- if (*C == 0) {
+ if (C->isNullValue()) {
// (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
return new ICmpInst(NewPred, BOp1, BOp0);
@@ -2597,32 +2752,35 @@ Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
if (!II || !Cmp.isEquality())
return nullptr;
- // Handle icmp {eq|ne} <intrinsic>, intcst.
+ // Handle icmp {eq|ne} <intrinsic>, Constant.
+ Type *Ty = II->getType();
switch (II->getIntrinsicID()) {
case Intrinsic::bswap:
Worklist.Add(II);
Cmp.setOperand(0, II->getArgOperand(0));
- Cmp.setOperand(1, Builder->getInt(C->byteSwap()));
+ Cmp.setOperand(1, ConstantInt::get(Ty, C->byteSwap()));
return &Cmp;
+
case Intrinsic::ctlz:
case Intrinsic::cttz:
// ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
if (*C == C->getBitWidth()) {
Worklist.Add(II);
Cmp.setOperand(0, II->getArgOperand(0));
- Cmp.setOperand(1, ConstantInt::getNullValue(II->getType()));
+ Cmp.setOperand(1, ConstantInt::getNullValue(Ty));
return &Cmp;
}
break;
+
case Intrinsic::ctpop: {
// popcount(A) == 0 -> A == 0 and likewise for !=
// popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
- bool IsZero = *C == 0;
+ bool IsZero = C->isNullValue();
if (IsZero || *C == C->getBitWidth()) {
Worklist.Add(II);
Cmp.setOperand(0, II->getArgOperand(0));
- auto *NewOp = IsZero ? Constant::getNullValue(II->getType())
- : Constant::getAllOnesValue(II->getType());
+ auto *NewOp =
+ IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty);
Cmp.setOperand(1, NewOp);
return &Cmp;
}
@@ -2631,6 +2789,7 @@ Instruction *InstCombiner::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
default:
break;
}
+
return nullptr;
}
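The ctpop identities above are easy to confirm at 8 bits; the sketch below leans on the GCC/Clang __builtin_popcount intrinsic:

#include <cassert>

int main() {
  for (unsigned A = 0; A <= 0xff; ++A) {
    int Pop = __builtin_popcount(A);
    assert((Pop == 0) == (A == 0));    // popcount(A) == 0  <=>  A == 0
    assert((Pop == 8) == (A == 0xff)); // popcount(A) == bw <=>  A == -1
  }
  return 0;
}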
@@ -2656,7 +2815,7 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
// block. If in the same block, we're encouraging jump threading. If
// not, we are just pessimizing the code by making an i1 phi.
if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
return NV;
break;
case Instruction::Select: {
@@ -2698,11 +2857,11 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
}
if (Transform) {
if (!Op1)
- Op1 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
- I.getName());
+ Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
+ I.getName());
if (!Op2)
- Op2 = Builder->CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
- I.getName());
+ Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
+ I.getName());
return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
}
break;
@@ -2733,6 +2892,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) {
}
/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -2742,7 +2904,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (!BO0 && !BO1)
return nullptr;
- CmpInst::Predicate Pred = I.getPredicate();
+ const CmpInst::Predicate Pred = I.getPredicate();
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
@@ -2767,12 +2929,6 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
D = BO1->getOperand(1);
}
- // icmp (X+cst) < 0 --> X < -cst
- if (NoOp0WrapProblem && ICmpInst::isSigned(Pred) && match(Op1, m_Zero()))
- if (ConstantInt *RHSC = dyn_cast_or_null<ConstantInt>(B))
- if (!RHSC->isMinValue(/*isSigned=*/true))
- return new ICmpInst(Pred, A, ConstantExpr::getNeg(RHSC));
-
// icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
return new ICmpInst(Pred, A == Op1 ? B : A,
@@ -2847,6 +3003,31 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
+ // TODO: The subtraction-related identities shown below also hold, but
+ // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
+ // wouldn't happen even if they were implemented.
+ //
+ // icmp ult (X - 1), Y -> icmp ule X, Y
+ // icmp uge (X - 1), Y -> icmp ugt X, Y
+ // icmp ugt X, (Y - 1) -> icmp uge X, Y
+ // icmp ule X, (Y - 1) -> icmp ult X, Y
+
+ // icmp ule (X + 1), Y -> icmp ult X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
+
+ // icmp ugt (X + 1), Y -> icmp uge X, Y
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
+
+ // icmp uge X, (Y + 1) -> icmp ugt X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
+
+ // icmp ult X, (Y + 1) -> icmp ule X, Y
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
+
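One of the four new identities, checked exhaustively at 8 bits (the others are symmetric):

#include <cassert>

int main() {
  // icmp ule (X + 1), Y  <=>  icmp ult X, Y, given the add does not wrap
  for (unsigned X = 0; X <= 0xfe; ++X)  // X == 0xff would wrap (no 'nuw')
    for (unsigned Y = 0; Y <= 0xff; ++Y)
      assert(((X + 1) <= Y) == (X < Y));
  return 0;
}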
// if C1 has greater magnitude than C2:
// icmp (X + C1), (Y + C2) -> icmp (X + C3), Y
// s.t. C3 = C1 - C2
@@ -2864,12 +3045,12 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
APInt AP1Abs = C1->getValue().abs();
APInt AP2Abs = C2->getValue().abs();
if (AP1Abs.uge(AP2Abs)) {
- ConstantInt *C3 = Builder->getInt(AP1 - AP2);
- Value *NewAdd = Builder->CreateNSWAdd(A, C3);
+ ConstantInt *C3 = Builder.getInt(AP1 - AP2);
+ Value *NewAdd = Builder.CreateNSWAdd(A, C3);
return new ICmpInst(Pred, NewAdd, C);
} else {
- ConstantInt *C3 = Builder->getInt(AP2 - AP1);
- Value *NewAdd = Builder->CreateNSWAdd(C, C3);
+ ConstantInt *C3 = Builder.getInt(AP2 - AP1);
+ Value *NewAdd = Builder.CreateNSWAdd(C, C3);
return new ICmpInst(Pred, A, NewAdd);
}
}
@@ -2956,56 +3137,69 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
break;
case Instruction::Add:
case Instruction::Sub:
- case Instruction::Xor:
+ case Instruction::Xor: {
if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
- // icmp u/s (a ^ signbit), (b ^ signbit) --> icmp s/u a, b
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
- if (CI->getValue().isSignBit()) {
- ICmpInst::Predicate Pred =
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C))) {
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ if (C->isSignMask()) {
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
- if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) {
- ICmpInst::Predicate Pred =
+ // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
+ if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
+ ICmpInst::Predicate NewPred =
I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate();
- Pred = I.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
}
}
break;
- case Instruction::Mul:
+ }
+ case Instruction::Mul: {
if (!I.isEquality())
break;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO0->getOperand(1))) {
- // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask
- // Mask = -1 >> count-trailing-zeros(Cst).
- if (!CI->isZero() && !CI->isOne()) {
- const APInt &AP = CI->getValue();
- ConstantInt *Mask = ConstantInt::get(
- I.getContext(),
- APInt::getLowBitsSet(AP.getBitWidth(),
- AP.getBitWidth() - AP.countTrailingZeros()));
- Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask);
- Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(I.getPredicate(), And1, And2);
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
+ !C->isOneValue()) {
+ // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
+ // Mask = -1 >> count-trailing-zeros(C).
+ if (unsigned TZs = C->countTrailingZeros()) {
+ Constant *Mask = ConstantInt::get(
+ BO0->getType(),
+ APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
+ Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
+ Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
+ return new ICmpInst(Pred, And1, And2);
}
+ // If there are no trailing zeros in the multiplier, just eliminate
+ // the multiplies (no masking is needed):
+ // icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
break;
+ }
case Instruction::UDiv:
case Instruction::LShr:
- if (I.isSigned())
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
break;
- LLVM_FALLTHROUGH;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::AShr:
if (!BO0->isExact() || !BO1->isExact())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
case Instruction::Shl: {
bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
@@ -3013,8 +3207,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
break;
if (!NSW && I.isSigned())
break;
- return new ICmpInst(I.getPredicate(), BO0->getOperand(0),
- BO1->getOperand(0));
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
}
}
}
@@ -3022,10 +3215,9 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
if (BO0) {
// Transform A & (L - 1) `ult` L --> L != 0
auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
- auto BitwiseAnd =
- m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value()));
+ auto BitwiseAnd = m_c_And(m_Value(), LSubOne);
- if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) {
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
auto *Zero = Constant::getNullValue(BO0->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
}
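The mul-equality rewrite added earlier in foldICmpBinOp can also be confirmed by brute force; C = 12 has two trailing zeros, so the 8-bit mask is 0xff >> 2 = 0x3f:

#include <cassert>
#include <cstdint>

int main() {
  // icmp eq (X * 12), (Y * 12)  <=>  icmp eq (X & 0x3f), (Y & 0x3f)
  for (unsigned X = 0; X <= 0xff; ++X)
    for (unsigned Y = 0; Y <= 0xff; ++Y)
      assert(((uint8_t)(X * 12) == (uint8_t)(Y * 12)) ==
             ((X & 0x3f) == (Y & 0x3f)));
  return 0;
}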
@@ -3126,12 +3318,12 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
return nullptr;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ const CmpInst::Predicate Pred = I.getPredicate();
Value *A, *B, *C, *D;
if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
Value *OtherVal = A == Op1 ? B : A;
- return new ICmpInst(I.getPredicate(), OtherVal,
- Constant::getNullValue(A->getType()));
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
}
if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
@@ -3139,28 +3331,27 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
ConstantInt *C1, *C2;
if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
Op1->hasOneUse()) {
- Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue());
- Value *Xor = Builder->CreateXor(C, NC);
- return new ICmpInst(I.getPredicate(), A, Xor);
+ Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
+ Value *Xor = Builder.CreateXor(C, NC);
+ return new ICmpInst(Pred, A, Xor);
}
// A^B == A^D -> B == D
if (A == C)
- return new ICmpInst(I.getPredicate(), B, D);
+ return new ICmpInst(Pred, B, D);
if (A == D)
- return new ICmpInst(I.getPredicate(), B, C);
+ return new ICmpInst(Pred, B, C);
if (B == C)
- return new ICmpInst(I.getPredicate(), A, D);
+ return new ICmpInst(Pred, A, D);
if (B == D)
- return new ICmpInst(I.getPredicate(), A, C);
+ return new ICmpInst(Pred, A, C);
}
}
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
// A == (A^B) -> B == 0
Value *OtherVal = A == Op0 ? B : A;
- return new ICmpInst(I.getPredicate(), OtherVal,
- Constant::getNullValue(A->getType()));
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
}
// (X&Z) == (Y&Z) -> (X^Y) & Z == 0
@@ -3187,8 +3378,8 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
}
if (X) { // Build (X^Y) & Z
- Op1 = Builder->CreateXor(X, Y);
- Op1 = Builder->CreateAnd(Op1, Z);
+ Op1 = Builder.CreateXor(X, Y);
+ Op1 = Builder.CreateAnd(Op1, Z);
I.setOperand(0, Op1);
I.setOperand(1, Constant::getNullValue(Op1->getType()));
return &I;
@@ -3205,8 +3396,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
APInt Pow2 = Cst1->getValue() + 1;
if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
- return new ICmpInst(I.getPredicate(), A,
- Builder->CreateTrunc(B, A->getType()));
+ return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
}
// (A >> C) == (B >> C) --> (A^B) u< (1 << C)
@@ -3218,12 +3408,11 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
unsigned TypeBits = Cst1->getBitWidth();
unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
if (ShAmt < TypeBits && ShAmt != 0) {
- ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
- ? ICmpInst::ICMP_UGE
- : ICmpInst::ICMP_ULT;
- Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+ ICmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
- return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
+ return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
}
}
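
Aside (a hedged sketch, not from the patch): the shift-equality fold just above holds because two values have equal high bits exactly when their XOR is confined to the low bits. An 8-bit brute-force confirmation:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Check: (A >>u C) == (B >>u C)  <=>  (A ^ B) u< (1 << C).
    for (unsigned C = 1; C < 8; ++C)
      for (unsigned A = 0; A <= 0xFF; ++A)
        for (unsigned B = 0; B <= 0xFF; ++B)
          assert(((A >> C) == (B >> C)) == ((A ^ B) < (1u << C)));
    return 0;
  }
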
@@ -3233,12 +3422,11 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
unsigned TypeBits = Cst1->getBitWidth();
unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
if (ShAmt < TypeBits && ShAmt != 0) {
- Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
- Value *And = Builder->CreateAnd(Xor, Builder->getInt(AndVal),
+ Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
I.getName() + ".mask");
- return new ICmpInst(I.getPredicate(), And,
- Constant::getNullValue(Cst1->getType()));
+ return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
}
}
@@ -3261,11 +3449,20 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
APInt CmpV = Cst1->getValue().zext(ASize);
CmpV <<= ShAmt;
- Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV));
- return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV));
+ Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
+ return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
}
}
+ // If both operands are byte-swapped or bit-reversed, just compare the
+ // original values.
+ // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
+ // and handle more intrinsics.
+ if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
+ (match(Op0, m_BitReverse(m_Value(A))) &&
+ match(Op1, m_BitReverse(m_Value(B)))))
+ return new ICmpInst(Pred, A, B);
+
return nullptr;
}
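
Aside (illustration, not part of the patch): the new bswap/bitreverse fold is justified because both intrinsics are bijections, so equality of the transformed values coincides with equality of the originals. A minimal C++ analogue, where bswap16 is a hypothetical stand-in for llvm.bswap.i16:

  #include <cassert>
  #include <cstdint>

  // Hypothetical helper mirroring llvm.bswap.i16 semantics.
  static uint16_t bswap16(uint16_t X) {
    return (uint16_t)((X >> 8) | (X << 8));
  }

  int main() {
    // bswap is a bijection, so bswap(A) == bswap(B) exactly when A == B.
    // Sampled sweep to keep the run short.
    for (uint32_t A = 0; A <= 0xFFFF; A += 0x101)
      for (uint32_t B = 0; B <= 0xFFFF; B += 0xFF)
        assert((bswap16((uint16_t)A) == bswap16((uint16_t)B)) == (A == B));
    return 0;
  }
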
@@ -3290,7 +3487,7 @@ Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
RHSOp = RHSC->getOperand(0);
// If the pointer types don't match, insert a bitcast.
if (LHSCIOp->getType() != RHSOp->getType())
- RHSOp = Builder->CreateBitCast(RHSOp, LHSCIOp->getType());
+ RHSOp = Builder.CreateBitCast(RHSOp, LHSCIOp->getType());
}
} else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
@@ -3374,7 +3571,7 @@ Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) {
// We're performing an unsigned comp with a sign extended value.
// This is true if the input is >= 0. [aka >s -1]
Constant *NegOne = Constant::getAllOnesValue(SrcTy);
- Value *Result = Builder->CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName());
+ Value *Result = Builder.CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName());
// Finally, return the value computed.
if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
@@ -3402,7 +3599,7 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
// may be pointing to the compare. We want to insert the new instructions
// before the add in case there are uses of the add between the add and the
// compare.
- Builder->SetInsertPoint(&OrigI);
+ Builder.SetInsertPoint(&OrigI);
switch (OCF) {
case OCF_INVALID:
@@ -3411,11 +3608,11 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
case OCF_UNSIGNED_ADD: {
OverflowResult OR = computeOverflowForUnsignedAdd(LHS, RHS, &OrigI);
if (OR == OverflowResult::NeverOverflows)
- return SetResult(Builder->CreateNUWAdd(LHS, RHS), Builder->getFalse(),
+ return SetResult(Builder.CreateNUWAdd(LHS, RHS), Builder.getFalse(),
true);
if (OR == OverflowResult::AlwaysOverflows)
- return SetResult(Builder->CreateAdd(LHS, RHS), Builder->getTrue(), true);
+ return SetResult(Builder.CreateAdd(LHS, RHS), Builder.getTrue(), true);
// Fall through uadd into sadd
LLVM_FALLTHROUGH;
@@ -3423,13 +3620,13 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
case OCF_SIGNED_ADD: {
// X + 0 -> {X, false}
if (match(RHS, m_Zero()))
- return SetResult(LHS, Builder->getFalse(), false);
+ return SetResult(LHS, Builder.getFalse(), false);
// We can strength reduce this signed add into a regular add if we can prove
// that it will never overflow.
if (OCF == OCF_SIGNED_ADD)
- if (WillNotOverflowSignedAdd(LHS, RHS, OrigI))
- return SetResult(Builder->CreateNSWAdd(LHS, RHS), Builder->getFalse(),
+ if (willNotOverflowSignedAdd(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNSWAdd(LHS, RHS), Builder.getFalse(),
true);
break;
}
@@ -3438,15 +3635,15 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
case OCF_SIGNED_SUB: {
// X - 0 -> {X, false}
if (match(RHS, m_Zero()))
- return SetResult(LHS, Builder->getFalse(), false);
+ return SetResult(LHS, Builder.getFalse(), false);
if (OCF == OCF_SIGNED_SUB) {
- if (WillNotOverflowSignedSub(LHS, RHS, OrigI))
- return SetResult(Builder->CreateNSWSub(LHS, RHS), Builder->getFalse(),
+ if (willNotOverflowSignedSub(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNSWSub(LHS, RHS), Builder.getFalse(),
true);
} else {
- if (WillNotOverflowUnsignedSub(LHS, RHS, OrigI))
- return SetResult(Builder->CreateNUWSub(LHS, RHS), Builder->getFalse(),
+ if (willNotOverflowUnsignedSub(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNUWSub(LHS, RHS), Builder.getFalse(),
true);
}
break;
@@ -3455,28 +3652,28 @@ bool InstCombiner::OptimizeOverflowCheck(OverflowCheckFlavor OCF, Value *LHS,
case OCF_UNSIGNED_MUL: {
OverflowResult OR = computeOverflowForUnsignedMul(LHS, RHS, &OrigI);
if (OR == OverflowResult::NeverOverflows)
- return SetResult(Builder->CreateNUWMul(LHS, RHS), Builder->getFalse(),
+ return SetResult(Builder.CreateNUWMul(LHS, RHS), Builder.getFalse(),
true);
if (OR == OverflowResult::AlwaysOverflows)
- return SetResult(Builder->CreateMul(LHS, RHS), Builder->getTrue(), true);
+ return SetResult(Builder.CreateMul(LHS, RHS), Builder.getTrue(), true);
LLVM_FALLTHROUGH;
}
case OCF_SIGNED_MUL:
// X * undef -> undef
if (isa<UndefValue>(RHS))
- return SetResult(RHS, UndefValue::get(Builder->getInt1Ty()), false);
+ return SetResult(RHS, UndefValue::get(Builder.getInt1Ty()), false);
// X * 0 -> {0, false}
if (match(RHS, m_Zero()))
- return SetResult(RHS, Builder->getFalse(), false);
+ return SetResult(RHS, Builder.getFalse(), false);
// X * 1 -> {X, false}
if (match(RHS, m_One()))
- return SetResult(LHS, Builder->getFalse(), false);
+ return SetResult(LHS, Builder.getFalse(), false);
if (OCF == OCF_SIGNED_MUL)
- if (WillNotOverflowSignedMul(LHS, RHS, OrigI))
- return SetResult(Builder->CreateNSWMul(LHS, RHS), Builder->getFalse(),
+ if (willNotOverflowSignedMul(LHS, RHS, OrigI))
+ return SetResult(Builder.CreateNSWMul(LHS, RHS), Builder.getFalse(),
true);
break;
}
@@ -3552,6 +3749,11 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
const APInt &CVal = CI->getValue();
if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
return nullptr;
+ } else {
+      // In this case the operand of the binary operation may be defined in
+      // another block, and performing the replacement could break the
+      // dominance relation.
+ return nullptr;
}
} else {
// Other uses prohibit this transformation.
@@ -3641,25 +3843,25 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
return nullptr;
}
- InstCombiner::BuilderTy *Builder = IC.Builder;
- Builder->SetInsertPoint(MulInstr);
+ InstCombiner::BuilderTy &Builder = IC.Builder;
+ Builder.SetInsertPoint(MulInstr);
// Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
Value *MulA = A, *MulB = B;
if (WidthA < MulWidth)
- MulA = Builder->CreateZExt(A, MulType);
+ MulA = Builder.CreateZExt(A, MulType);
if (WidthB < MulWidth)
- MulB = Builder->CreateZExt(B, MulType);
+ MulB = Builder.CreateZExt(B, MulType);
Value *F = Intrinsic::getDeclaration(I.getModule(),
Intrinsic::umul_with_overflow, MulType);
- CallInst *Call = Builder->CreateCall(F, {MulA, MulB}, "umul");
+ CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
IC.Worklist.Add(MulInstr);
  // If there are uses of the mul result other than the comparison, we know
  // that they are truncation or binary AND. Change them to use the result of
  // mul.with.overflow and properly adjust the mask/size.
if (MulVal->hasNUsesOrMore(2)) {
- Value *Mul = Builder->CreateExtractValue(Call, 0, "umul.value");
+ Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
for (User *U : MulVal->users()) {
if (U == &I || U == OtherVal)
continue;
@@ -3673,9 +3875,9 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
// Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
APInt ShortMask = CI->getValue().trunc(MulWidth);
- Value *ShortAnd = Builder->CreateAnd(Mul, ShortMask);
+ Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
Instruction *Zext =
- cast<Instruction>(Builder->CreateZExt(ShortAnd, BO->getType()));
+ cast<Instruction>(Builder.CreateZExt(ShortAnd, BO->getType()));
IC.Worklist.Add(Zext);
IC.replaceInstUsesWith(*BO, Zext);
} else {
@@ -3712,7 +3914,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
llvm_unreachable("Unexpected predicate");
}
if (Inverse) {
- Value *Res = Builder->CreateExtractValue(Call, 1);
+ Value *Res = Builder.CreateExtractValue(Call, 1);
return BinaryOperator::CreateNot(Res);
}
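
Aside (a hedged sketch, not part of the patch): the idiom being rewritten computes a widened product and tests the high half; llvm.umul.with.overflow expresses the same check directly. A scalar C++ analogue, assuming a GCC/Clang-style overflow builtin as the reference oracle:

  #include <cassert>
  #include <cstdint>

  // The source pattern: widen, multiply, then test the high half.
  static bool overflowsViaZExt(uint32_t A, uint32_t B) {
    uint64_t Wide = (uint64_t)A * (uint64_t)B; // mul(zext A, zext B)
    return (Wide >> 32) != 0;                  // compare against the high bits
  }

  int main() {
    const uint32_t Samples[] = {0, 1, 2, 0xFFFFu, 0x10000u, 0xFFFFFFFFu};
    for (uint32_t A : Samples)
      for (uint32_t B : Samples) {
        uint32_t Prod;
        // __builtin_mul_overflow plays the role of umul.with.overflow here.
        assert(overflowsViaZExt(A, B) == __builtin_mul_overflow(A, B, &Prod));
      }
    return 0;
  }
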
@@ -3725,7 +3927,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth,
bool isSignCheck) {
if (isSignCheck)
- return APInt::getSignBit(BitWidth);
+ return APInt::getSignMask(BitWidth);
ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1));
if (!CI) return APInt::getAllOnesValue(BitWidth);
@@ -3738,16 +3940,14 @@ static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth,
// greater than the RHS must differ in a bit higher than these due to carry.
case ICmpInst::ICMP_UGT: {
unsigned trailingOnes = RHS.countTrailingOnes();
- APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingOnes);
- return ~lowBitsSet;
+ return APInt::getBitsSetFrom(BitWidth, trailingOnes);
}
// Similarly, for a ULT comparison, we don't care about the trailing zeros.
// Any value less than the RHS must differ in a higher bit because of carries.
case ICmpInst::ICMP_ULT: {
unsigned trailingZeros = RHS.countTrailingZeros();
- APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingZeros);
- return ~lowBitsSet;
+ return APInt::getBitsSetFrom(BitWidth, trailingZeros);
}
default:
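
Aside (illustrative, not from the patch): the UGT case above says the trailing ones of the RHS are not demanded from the LHS, since any value greater than the RHS must differ in a higher bit. An exhaustive 8-bit check, assuming C = 0x27 as a sample constant:

  #include <cassert>
  #include <cstdint>

  int main() {
    // For X u> C, flipping only the low (non-demanded) bits of X can never
    // change the result.
    const unsigned C = 0x27;  // 0b00100111, three trailing ones
    const unsigned Low = 0x7; // mask of the non-demanded bits
    for (unsigned X = 0; X <= 0xFF; ++X)
      assert(((X > C) == ((X | Low) > C)) &&
             ((X > C) == ((X & ~Low) > C)));
    return 0;
  }
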
@@ -3887,7 +4087,7 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
- // The check for the unique predecessor is not the best that can be
+ // The check for the single predecessor is not the best that can be
// done. But it protects efficiently against cases like when SI's
  // home block has two successors, Succ and Succ1, and Succ1 is a predecessor
  // of Succ. Then SI can't be replaced by SIOpd because the use that gets
@@ -3895,8 +4095,10 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
// guarantees that the path all uses of SI (outside SI's parent) are on
// is disjoint from all other paths out of SI. But that information
// is more expensive to compute, and the trade-off here is in favor
- // of compile-time.
- if (Succ->getUniquePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
+  // of compile-time. Note also that we check for a single predecessor and
+  // not just uniqueness; this handles the situation when Succ and Succ1
+  // point to the same basic block.
+ if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
NumSel++;
SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
return true;
@@ -3929,16 +4131,16 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
IsSignBit = isSignBitCheck(Pred, *CmpC, UnusedBit);
}
- APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
- APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
+ KnownBits Op0Known(BitWidth);
+ KnownBits Op1Known(BitWidth);
- if (SimplifyDemandedBits(I.getOperandUse(0),
+ if (SimplifyDemandedBits(&I, 0,
getDemandedBitsLHSMask(I, BitWidth, IsSignBit),
- Op0KnownZero, Op0KnownOne, 0))
+ Op0Known, 0))
return &I;
- if (SimplifyDemandedBits(I.getOperandUse(1), APInt::getAllOnesValue(BitWidth),
- Op1KnownZero, Op1KnownOne, 0))
+ if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
+ Op1Known, 0))
return &I;
// Given the known and unknown bits, compute a range that the LHS could be
@@ -3947,15 +4149,11 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
if (I.isSigned()) {
- computeSignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne, Op0Min,
- Op0Max);
- computeSignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne, Op1Min,
- Op1Max);
+ computeSignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
+ computeSignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
} else {
- computeUnsignedMinMaxValuesFromKnownBits(Op0KnownZero, Op0KnownOne, Op0Min,
- Op0Max);
- computeUnsignedMinMaxValuesFromKnownBits(Op1KnownZero, Op1KnownOne, Op1Min,
- Op1Max);
+ computeUnsignedMinMaxValuesFromKnownBits(Op0Known, Op0Min, Op0Max);
+ computeUnsignedMinMaxValuesFromKnownBits(Op1Known, Op1Min, Op1Max);
}
// If Min and Max are known to be the same, then SimplifyDemandedBits
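
Aside (a toy sketch, not the LLVM API): with the new KnownBits representation, the unsigned range falls straight out of the Zero/One masks, which is what computeUnsignedMinMaxValuesFromKnownBits computes. KnownBits8 below is a hypothetical 8-bit stand-in:

  #include <cassert>
  #include <cstdint>

  // Toy 8-bit stand-in for KnownBits: Zero/One are the known-0/known-1 masks.
  struct KnownBits8 { uint8_t Zero, One; };

  static uint8_t uMin(KnownBits8 K) { return K.One; }            // unknowns -> 0
  static uint8_t uMax(KnownBits8 K) { return (uint8_t)~K.Zero; } // unknowns -> 1

  int main() {
    KnownBits8 K{0xF0, 0x01}; // high nibble known zero, bit 0 known one
    for (unsigned X = 0; X <= 0xFF; ++X) {
      if ((X & K.Zero) != 0 || (X & K.One) != K.One)
        continue; // X contradicts the known bits
      assert(uMin(K) <= X && X <= uMax(K));
    }
    return 0;
  }
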
@@ -3982,8 +4180,8 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
// If all bits are known zero except for one, then we know at most one bit
// is set. If the comparison is against zero, then this is a check to see if
// *that* bit is set.
- APInt Op0KnownZeroInverted = ~Op0KnownZero;
- if (~Op1KnownZero == 0) {
+ APInt Op0KnownZeroInverted = ~Op0Known.Zero;
+ if (Op1Known.isZero()) {
// If the LHS is an AND with the same constant, look through it.
Value *LHS = nullptr;
const APInt *LHSC;
@@ -4013,7 +4211,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
// Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
const APInt *CI;
- if (Op0KnownZeroInverted == 1 &&
+ if (Op0KnownZeroInverted.isOneValue() &&
match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
// ((8 >>u X) & 1) == 0 -> X != 3
// ((8 >>u X) & 1) != 0 -> X == 3
@@ -4071,7 +4269,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
if (Op1Max == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Builder->getInt(CI->getValue() - 1));
+ Builder.getInt(CI->getValue() - 1));
}
break;
case ICmpInst::ICMP_SGT:
@@ -4085,7 +4283,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
if (Op1Min == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Builder->getInt(CI->getValue() + 1));
+ Builder.getInt(CI->getValue() + 1));
}
break;
case ICmpInst::ICMP_SGE:
@@ -4121,8 +4319,8 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
// Turn a signed comparison into an unsigned one if both operands are known to
// have the same sign.
if (I.isSigned() &&
- ((Op0KnownZero.isNegative() && Op1KnownZero.isNegative()) ||
- (Op0KnownOne.isNegative() && Op1KnownOne.isNegative())))
+ ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
+ (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
return nullptr;
@@ -4186,6 +4384,80 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne));
}
+/// Integer compare with boolean values can always be turned into bitwise ops.
+static Instruction *canonicalizeICmpBool(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *A = I.getOperand(0), *B = I.getOperand(1);
+ assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");
+
+ // A boolean compared to true/false can be simplified to Op0/true/false in
+ // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
+ // Cases not handled by InstSimplify are always 'not' of Op0.
+ if (match(B, m_Zero())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_EQ: // A == 0 -> !A
+ case CmpInst::ICMP_ULE: // A <=u 0 -> !A
+ case CmpInst::ICMP_SGE: // A >=s 0 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ } else if (match(B, m_One())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_NE: // A != 1 -> !A
+ case CmpInst::ICMP_ULT: // A <u 1 -> !A
+ case CmpInst::ICMP_SGT: // A >s -1 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ }
+
+ switch (I.getPredicate()) {
+ default:
+ llvm_unreachable("Invalid icmp instruction!");
+ case ICmpInst::ICMP_EQ:
+ // icmp eq i1 A, B -> ~(A ^ B)
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ case ICmpInst::ICMP_NE:
+ // icmp ne i1 A, B -> A ^ B
+ return BinaryOperator::CreateXor(A, B);
+
+ case ICmpInst::ICMP_UGT:
+ // icmp ugt -> icmp ult
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULT:
+ // icmp ult i1 A, B -> ~A & B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGT:
+ // icmp sgt -> icmp slt
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLT:
+ // icmp slt i1 A, B -> A & ~B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
+
+ case ICmpInst::ICMP_UGE:
+ // icmp uge -> icmp ule
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULE:
+ // icmp ule i1 A, B -> ~A | B
+ return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGE:
+ // icmp sge -> icmp sle
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLE:
+ // icmp sle i1 A, B -> A | ~B
+ return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
+ }
+}
+
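
Aside (illustrative, not part of the patch): the bitwise rewrites in canonicalizeICmpBool can be verified exhaustively over the four i1 input combinations. Note that signed i1 interprets true as -1:

  #include <cassert>

  int main() {
    for (int A = 0; A <= 1; ++A)
      for (int B = 0; B <= 1; ++B) {
        bool a = A != 0, b = B != 0;
        assert((a == b) == !(a ^ b)); // eq  -> ~(A ^ B)
        assert((a != b) == (a ^ b));  // ne  -> A ^ B
        assert((A < B) == (!a && b)); // ult -> ~A & B
        // Signed i1: true is -1, so "A <s B" means A is true and B is false.
        int SA = a ? -1 : 0, SB = b ? -1 : 0;
        assert((SA < SB) == (a && !b)); // slt -> A & ~B
      }
    return 0;
  }
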
Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
bool Changed = false;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -4202,8 +4474,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Changed = true;
}
- if (Value *V =
- SimplifyICmpInst(I.getPredicate(), Op0, Op1, DL, &TLI, &DT, &AC, &I))
+ if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1,
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// comparing -val or val with non-zero is the same as just comparing val
@@ -4223,49 +4495,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
}
- Type *Ty = Op0->getType();
-
- // icmp's with boolean values can always be turned into bitwise operations
- if (Ty->getScalarType()->isIntegerTy(1)) {
- switch (I.getPredicate()) {
- default: llvm_unreachable("Invalid icmp instruction!");
- case ICmpInst::ICMP_EQ: { // icmp eq i1 A, B -> ~(A^B)
- Value *Xor = Builder->CreateXor(Op0, Op1, I.getName() + "tmp");
- return BinaryOperator::CreateNot(Xor);
- }
- case ICmpInst::ICMP_NE: // icmp ne i1 A, B -> A^B
- return BinaryOperator::CreateXor(Op0, Op1);
-
- case ICmpInst::ICMP_UGT:
- std::swap(Op0, Op1); // Change icmp ugt -> icmp ult
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULT:{ // icmp ult i1 A, B -> ~A & B
- Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
- return BinaryOperator::CreateAnd(Not, Op1);
- }
- case ICmpInst::ICMP_SGT:
- std::swap(Op0, Op1); // Change icmp sgt -> icmp slt
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLT: { // icmp slt i1 A, B -> A & ~B
- Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
- return BinaryOperator::CreateAnd(Not, Op0);
- }
- case ICmpInst::ICMP_UGE:
- std::swap(Op0, Op1); // Change icmp uge -> icmp ule
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULE: { // icmp ule i1 A, B -> ~A | B
- Value *Not = Builder->CreateNot(Op0, I.getName() + "tmp");
- return BinaryOperator::CreateOr(Not, Op1);
- }
- case ICmpInst::ICMP_SGE:
- std::swap(Op0, Op1); // Change icmp sge -> icmp sle
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLE: { // icmp sle i1 A, B -> A | ~B
- Value *Not = Builder->CreateNot(Op1, I.getName() + "tmp");
- return BinaryOperator::CreateOr(Not, Op0);
- }
- }
- }
+ if (Op0->getType()->isIntOrIntVectorTy(1))
+ if (Instruction *Res = canonicalizeICmpBool(I, Builder))
+ return Res;
if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
return NewICmp;
@@ -4357,7 +4589,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
Op1 = ConstantExpr::getBitCast(Op1C, Op0->getType());
} else {
// Otherwise, cast the RHS right before the icmp
- Op1 = Builder->CreateBitCast(Op1, Op0->getType());
+ Op1 = Builder.CreateBitCast(Op1, Op0->getType());
}
}
return new ICmpInst(I.getPredicate(), Op0, Op1);
@@ -4389,18 +4621,20 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// if A is a power of 2.
if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
match(Op1, m_Zero()) &&
- isKnownToBeAPowerOfTwo(A, DL, false, 0, &AC, &I, &DT) && I.isEquality())
- return new ICmpInst(I.getInversePredicate(),
- Builder->CreateAnd(A, B),
+ isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
+ return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
Op1);
- // ~x < ~y --> y < x
- // ~x < cst --> ~cst < x
+ // ~X < ~Y --> Y < X
+ // ~X < C --> X > ~C
if (match(Op0, m_Not(m_Value(A)))) {
if (match(Op1, m_Not(m_Value(B))))
return new ICmpInst(I.getPredicate(), B, A);
- if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1))
- return new ICmpInst(I.getPredicate(), ConstantExpr::getNot(RHSC), A);
+
+ const APInt *C;
+ if (match(Op1, m_APInt(C)))
+ return new ICmpInst(I.getSwappedPredicate(), A,
+ ConstantInt::get(Op1->getType(), ~(*C)));
}
Instruction *AddI = nullptr;
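
Aside (a hedged sketch): the rewritten comment and fold above rely on bitwise-not being an order-reversing bijection, so the predicate swaps when the not moves to the other side. An 8-bit brute-force confirmation:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Exhaustive 8-bit check: ~X u< C  <=>  X u> ~C.
    for (unsigned X = 0; X <= 0xFF; ++X)
      for (unsigned C = 0; C <= 0xFF; ++C) {
        uint8_t NotX = (uint8_t)~X, NotC = (uint8_t)~C;
        assert((NotX < (uint8_t)C) == ((uint8_t)X > NotC));
      }
    return 0;
  }
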
@@ -4488,10 +4722,10 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) {
if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getFalse());
assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
- return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder.getTrue());
}
}
@@ -4557,9 +4791,9 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
Pred = ICmpInst::ICMP_NE;
break;
case FCmpInst::FCMP_ORD:
- return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder.getTrue());
case FCmpInst::FCMP_UNO:
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getFalse());
}
// Now we know that the APFloat is a normal number, zero or inf.
@@ -4577,8 +4811,8 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
Pred == ICmpInst::ICMP_SLE)
- return replaceInstUsesWith(I, Builder->getTrue());
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
}
} else {
// If the RHS value is > UnsignedMax, fold the comparison. This handles
@@ -4589,8 +4823,8 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
Pred == ICmpInst::ICMP_ULE)
- return replaceInstUsesWith(I, Builder->getTrue());
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
}
}
@@ -4602,8 +4836,8 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
Pred == ICmpInst::ICMP_SGE)
- return replaceInstUsesWith(I, Builder->getTrue());
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
}
} else {
// See if the RHS value is < UnsignedMin.
@@ -4613,8 +4847,8 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
Pred == ICmpInst::ICMP_UGE)
- return replaceInstUsesWith(I, Builder->getTrue());
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
}
}
@@ -4636,14 +4870,14 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
switch (Pred) {
default: llvm_unreachable("Unexpected integer comparison!");
case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
- return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder.getTrue());
case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getFalse());
case ICmpInst::ICMP_ULE:
// (float)int <= 4.4 --> int <= 4
// (float)int <= -4.4 --> false
if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getFalse());
break;
case ICmpInst::ICMP_SLE:
// (float)int <= 4.4 --> int <= 4
@@ -4655,7 +4889,7 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
// (float)int < -4.4 --> false
// (float)int < 4.4 --> int <= 4
if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder->getFalse());
+ return replaceInstUsesWith(I, Builder.getFalse());
Pred = ICmpInst::ICMP_ULE;
break;
case ICmpInst::ICMP_SLT:
@@ -4668,7 +4902,7 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
// (float)int > 4.4 --> int > 4
// (float)int > -4.4 --> true
if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder.getTrue());
break;
case ICmpInst::ICMP_SGT:
// (float)int > 4.4 --> int > 4
@@ -4680,7 +4914,7 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
// (float)int >= -4.4 --> true
// (float)int >= 4.4 --> int > 4
if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder->getTrue());
+ return replaceInstUsesWith(I, Builder.getTrue());
Pred = ICmpInst::ICMP_UGT;
break;
case ICmpInst::ICMP_SGE:
@@ -4711,8 +4945,9 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1,
- I.getFastMathFlags(), DL, &TLI, &DT, &AC, &I))
+ if (Value *V =
+ SimplifyFCmpInst(I.getPredicate(), Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// Simplify 'fcmp pred X, X'
@@ -4801,7 +5036,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
// block. If in the same block, we're encouraging jump threading. If
// not, we are just pessimizing the code by making an i1 phi.
if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
return NV;
break;
case Instruction::SIToFP:
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 2847ce8..c38a498 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -17,6 +17,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -27,7 +28,9 @@
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "instcombine"
@@ -40,27 +43,68 @@ class DbgDeclareInst;
class MemIntrinsic;
class MemSetInst;
-/// \brief Assign a complexity or rank value to LLVM Values.
+/// Assign a complexity or rank value to LLVM Values. This is used to reduce
+/// the amount of pattern matching needed for compares and commutative
+/// instructions. For example, if we have:
+/// icmp ugt X, Constant
+/// or
+/// xor (add X, Constant), cast Z
+///
+/// We do not have to consider the commuted variants of these patterns because
+/// canonicalization based on complexity guarantees the above ordering.
///
/// This routine maps IR values to various complexity ranks:
/// 0 -> undef
/// 1 -> Constants
/// 2 -> Other non-instructions
/// 3 -> Arguments
-/// 3 -> Unary operations
-/// 4 -> Other instructions
+/// 4 -> Cast and (f)neg/not instructions
+/// 5 -> Other instructions
static inline unsigned getComplexity(Value *V) {
if (isa<Instruction>(V)) {
- if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) ||
- BinaryOperator::isNot(V))
- return 3;
- return 4;
+ if (isa<CastInst>(V) || BinaryOperator::isNeg(V) ||
+ BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V))
+ return 4;
+ return 5;
}
if (isa<Argument>(V))
return 3;
return isa<Constant>(V) ? (isa<UndefValue>(V) ? 0 : 1) : 2;
}
+/// Predicate canonicalization reduces the number of patterns that need to be
+/// matched by other transforms. For example, we may swap the operands of a
+/// conditional branch or select to create a compare with a canonical (inverted)
+/// predicate which is then more likely to be matched with other values.
+static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) {
+ switch (Pred) {
+ case CmpInst::ICMP_NE:
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_SLE:
+ case CmpInst::ICMP_UGE:
+ case CmpInst::ICMP_SGE:
+ // TODO: There are 16 FCMP predicates. Should others be (not) canonical?
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_OGE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+/// Return the source operand of a potentially bitcasted value while optionally
+/// checking if it has one use. If there is no bitcast or the one use check is
+/// not met, return the input value itself.
+static inline Value *peekThroughBitcast(Value *V, bool OneUseOnly = false) {
+ if (auto *BitCast = dyn_cast<BitCastInst>(V))
+ if (!OneUseOnly || BitCast->hasOneUse())
+ return BitCast->getOperand(0);
+
+  // V is not a bitcast, or V has more than one use and OneUseOnly is true.
+ return V;
+}
+
/// \brief Add one to a Constant
static inline Constant *AddOne(Constant *C) {
return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1));
@@ -85,11 +129,10 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) {
return true;
// A vector of constant integers can be inverted easily.
- Constant *CV;
- if (V->getType()->isVectorTy() && match(V, PatternMatch::m_Constant(CV))) {
+ if (V->getType()->isVectorTy() && isa<Constant>(V)) {
unsigned NumElts = V->getType()->getVectorNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = CV->getAggregateElement(i);
+ Constant *Elt = cast<Constant>(V)->getAggregateElement(i);
if (!Elt)
return false;
@@ -167,7 +210,7 @@ public:
/// \brief An IRBuilder that automatically inserts new instructions into the
/// worklist.
typedef IRBuilder<TargetFolder, IRBuilderCallbackInserter> BuilderTy;
- BuilderTy *Builder;
+ BuilderTy &Builder;
private:
// Mode in which we are running the combiner.
@@ -182,7 +225,7 @@ private:
TargetLibraryInfo &TLI;
DominatorTree &DT;
const DataLayout &DL;
-
+ const SimplifyQuery SQ;
// Optional analyses. When non-null, these can both be used to do better
// combining and will be updated to reflect any changes.
LoopInfo *LI;
@@ -190,13 +233,13 @@ private:
bool MadeIRChange;
public:
- InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder,
+ InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA,
- AssumptionCache &AC, TargetLibraryInfo &TLI,
- DominatorTree &DT, const DataLayout &DL, LoopInfo *LI)
+ AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
+ const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
- DL(DL), LI(LI), MadeIRChange(false) {}
+ DL(DL), SQ(DL, &TLI, &DT, &AC), LI(LI), MadeIRChange(false) {}
/// \brief Run the combiner over the entire worklist until it is empty.
///
@@ -241,15 +284,7 @@ public:
Instruction *visitSDiv(BinaryOperator &I);
Instruction *visitFDiv(BinaryOperator &I);
Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
- Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
- Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
Instruction *visitAnd(BinaryOperator &I);
- Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);
- Value *FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
- Instruction *FoldOrWithConstants(BinaryOperator &I, Value *Op, Value *A,
- Value *B, Value *C);
- Instruction *FoldXorWithConstants(BinaryOperator &I, Value *Op, Value *A,
- Value *B, Value *C);
Instruction *visitOr(BinaryOperator &I);
Instruction *visitXor(BinaryOperator &I);
Instruction *visitShl(BinaryOperator &I);
@@ -289,6 +324,7 @@ public:
Instruction *visitLoadInst(LoadInst &LI);
Instruction *visitStoreInst(StoreInst &SI);
Instruction *visitBranchInst(BranchInst &BI);
+ Instruction *visitFenceInst(FenceInst &FI);
Instruction *visitSwitchInst(SwitchInst &SI);
Instruction *visitReturnInst(ReturnInst &RI);
Instruction *visitInsertValueInst(InsertValueInst &IV);
@@ -313,9 +349,14 @@ public:
bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
const unsigned SIOpd);
+  /// Try to replace instruction \p I with value \p V, where the two are
+  /// pointers in different address spaces.
+  /// \return true if successful.
+ bool replacePointer(Instruction &I, Value *V);
+
private:
- bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
- bool ShouldChangeType(Type *From, Type *To) const;
+ bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
+ bool shouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
@@ -370,10 +411,27 @@ private:
bool DoTransform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
- bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI);
- bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
- bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI);
- bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI);
+ bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
+ bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
+ bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const;
+ bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+  }
Value *EmitGEPOffset(User *GEP);
Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
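
Aside (illustration only): each new willNotOverflow* wrapper reduces to a single computeOverflowFor* query against value tracking. Semantically, the question is whether the wide result fits the narrow type; a scalar analogue for signed add, assuming a GCC/Clang-style builtin as the reference oracle:

  #include <cassert>
  #include <cstdint>

  // Scalar analogue of the question willNotOverflowSignedAdd answers (the
  // real code consults value tracking, not widened arithmetic).
  static bool addFitsInInt32(int32_t L, int32_t R) {
    int64_t Wide = (int64_t)L + (int64_t)R;
    return Wide >= INT32_MIN && Wide <= INT32_MAX;
  }

  int main() {
    const int32_t Samples[] = {INT32_MIN, -2, -1, 0, 1, 2, INT32_MAX};
    for (int32_t L : Samples)
      for (int32_t R : Samples) {
        int32_t Out;
        assert(addFitsInInt32(L, R) == !__builtin_add_overflow(L, R, &Out));
      }
    return 0;
  }
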
@@ -394,6 +452,14 @@ private:
Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
const CastInst *CI2);
+ Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
+ Value *foldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+ Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
+ Value *foldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
+ Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+
+ Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd, Instruction &CxtI);
public:
/// \brief Inserts an instruction \p New before instruction \p Old
///
@@ -456,8 +522,9 @@ public:
/// methods should return the value returned by this function.
Instruction *eraseInstFromFunction(Instruction &I) {
DEBUG(dbgs() << "IC: ERASE " << I << '\n');
-
assert(I.use_empty() && "Cannot erase instruction that is used!");
+ salvageDebugInfo(I);
+
// Make sure that we reprocess all operands now that we reduced their
// use counts.
if (I.getNumOperands() < 8) {
@@ -471,33 +538,47 @@ public:
return nullptr; // Don't do anything with FI
}
- void computeKnownBits(Value *V, APInt &KnownZero, APInt &KnownOne,
- unsigned Depth, Instruction *CxtI) const {
- return llvm::computeKnownBits(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
- &DT);
+ void computeKnownBits(const Value *V, KnownBits &Known,
+ unsigned Depth, const Instruction *CxtI) const {
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+ KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false,
+ unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) {
+ return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT);
}
- bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0,
- Instruction *CxtI = nullptr) const {
+ bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
}
- unsigned ComputeNumSignBits(Value *Op, unsigned Depth = 0,
- Instruction *CxtI = nullptr) const {
+ unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
}
- void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- unsigned Depth = 0, Instruction *CxtI = nullptr) const {
- return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI,
- &DT);
- }
- OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS,
- const Instruction *CxtI) {
+ OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
}
- OverflowResult computeOverflowForUnsignedAdd(Value *LHS, Value *RHS,
- const Instruction *CxtI) {
+ OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
}
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ /// Maximum size of array considered when transforming.
+ uint64_t MaxArraySizeForCombine;
private:
/// \brief Performs a few simplifications for operators which are associative
@@ -513,18 +594,39 @@ private:
/// value, or null if it didn't simplify.
Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+  /// This tries to simplify binary operations by factorizing out common terms
+  /// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
+ Value *tryFactorization(BinaryOperator &, Instruction::BinaryOps, Value *,
+ Value *, Value *, Value *);
+
+ /// Match a select chain which produces one of three values based on whether
+ /// the LHS is less than, equal to, or greater than RHS respectively.
+  /// Return true if we matched a three-way compare idiom. The LHS, RHS, Less,
+  /// Equal, and Greater values are saved during matching and returned to
+  /// the caller.
+ bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS,
+ ConstantInt *&Less, ConstantInt *&Equal,
+ ConstantInt *&Greater);
+
/// \brief Attempts to replace V with a simpler value based on the demanded
/// bits.
- Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
- APInt &KnownOne, unsigned Depth,
- Instruction *CxtI);
- bool SimplifyDemandedBits(Use &U, const APInt &DemandedMask, APInt &KnownZero,
- APInt &KnownOne, unsigned Depth = 0);
+ Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
+ bool SimplifyDemandedBits(Instruction *I, unsigned Op,
+ const APInt &DemandedMask, KnownBits &Known,
+ unsigned Depth = 0);
+ /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
+ /// bits. It also tries to handle simplifications that can be done based on
+ /// DemandedMask, but without modifying the Instruction.
+ Value *SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
/// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
- Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
- const APInt &DemandedMask, APInt &KnownZero,
- APInt &KnownOne);
+ Value *simplifyShrShlDemandedBits(
+ Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
+ const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
/// \brief Tries to simplify operands to an integer instruction based on its
/// demanded bits.
@@ -534,13 +636,12 @@ private:
APInt &UndefElts, unsigned Depth = 0);
Value *SimplifyVectorOp(BinaryOperator &Inst);
- Value *SimplifyBSwap(BinaryOperator &Inst);
/// Given a binary operator, cast instruction, or select which has a PHI node
/// as operand #0, see if we can fold the instruction into the PHI (which is
/// only possible if all operands to the PHI are constants).
- Instruction *FoldOpIntoPhi(Instruction &I);
+ Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
/// Given an instruction with a select as one operand and a constant as the
/// other operand, try to fold the binary operator into the select arguments.
@@ -549,7 +650,7 @@ private:
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
/// This is a convenience wrapper function for the above two functions.
- Instruction *foldOpWithConstantIntoOperand(Instruction &I);
+ Instruction *foldOpWithConstantIntoOperand(BinaryOperator &I);
/// \brief Try to rotate an operation below a PHI node, using PHI nodes for
/// its operands.
@@ -583,6 +684,8 @@ private:
Instruction *foldICmpBinOp(ICmpInst &Cmp);
Instruction *foldICmpEquality(ICmpInst &Cmp);
+ Instruction *foldICmpSelectConstant(ICmpInst &Cmp, Instruction *Select,
+ ConstantInt *C);
Instruction *foldICmpTruncConstant(ICmpInst &Cmp, Instruction *Trunc,
const APInt *C);
Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
@@ -628,16 +731,17 @@ private:
SelectPatternFlavor SPF2, Value *C);
Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
- Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+ Instruction *OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS,
ConstantInt *AndRHS, BinaryOperator &TheAnd);
- Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
- bool isSub, Instruction &I);
Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
bool isSigned, bool Inside);
Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+ Instruction *
+ SimplifyElementUnorderedAtomicMemCpy(ElementUnorderedAtomicMemCpyInst *AMI);
Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
Instruction *SimplifyMemSet(MemSetInst *MI);
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 49e516e..4510365 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -12,13 +12,15 @@
//===----------------------------------------------------------------------===//
#include "InstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -167,6 +169,18 @@ isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
return nullptr;
}
+/// Returns true if V is dereferenceable for the size of the alloca.
+static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
+ const DataLayout &DL) {
+ if (AI->isArrayAllocation())
+ return false;
+ uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
+ if (!AllocaSize)
+ return false;
+ return isDereferenceableAndAlignedPointer(V, AI->getAlignment(),
+ APInt(64, AllocaSize), DL);
+}
+
static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
// Check for array size of 1 (scalar allocation).
if (!AI.isArrayAllocation()) {
@@ -175,7 +189,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
return nullptr;
// Canonicalize it.
- Value *V = IC.Builder->getInt32(1);
+ Value *V = IC.Builder.getInt32(1);
AI.setOperand(0, V);
return &AI;
}
@@ -183,7 +197,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
// Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
- AllocaInst *New = IC.Builder->CreateAlloca(NewTy, nullptr, AI.getName());
+ AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
New->setAlignment(AI.getAlignment());
// Scan to the end of the allocation instructions, to skip over a block of
@@ -215,7 +229,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
// any casting is exposed early.
Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
if (AI.getArraySize()->getType() != IntPtrTy) {
- Value *V = IC.Builder->CreateIntCast(AI.getArraySize(), IntPtrTy, false);
+ Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
AI.setOperand(0, V);
return &AI;
}
@@ -223,6 +237,107 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
return nullptr;
}
+namespace {
+// If I and V are pointers in different address spaces, it is not allowed to
+// use replaceAllUsesWith since I and V have different types. A
+// non-target-specific transformation should not use addrspacecast on V
+// either, since the two address spaces may be disjoint depending on the
+// target.
+//
+// This class chases down the uses of the old pointer until it reaches the
+// load instructions, then replaces the old pointer in those loads with the
+// new pointer. If it encounters a bitcast or GEP while chasing, it creates a
+// new bitcast or GEP based on the new pointer and uses it in the load
+// instruction.
+class PointerReplacer {
+public:
+ PointerReplacer(InstCombiner &IC) : IC(IC) {}
+ void replacePointer(Instruction &I, Value *V);
+
+private:
+ void findLoadAndReplace(Instruction &I);
+ void replace(Instruction *I);
+ Value *getReplacement(Value *I);
+
+ SmallVector<Instruction *, 4> Path;
+ MapVector<Value *, Value *> WorkMap;
+ InstCombiner &IC;
+};
+} // end anonymous namespace
+
+void PointerReplacer::findLoadAndReplace(Instruction &I) {
+ for (auto U : I.users()) {
+ auto *Inst = dyn_cast<Instruction>(&*U);
+ if (!Inst)
+ return;
+ DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
+ if (isa<LoadInst>(Inst)) {
+ for (auto P : Path)
+ replace(P);
+ replace(Inst);
+ } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
+ Path.push_back(Inst);
+ findLoadAndReplace(*Inst);
+ Path.pop_back();
+ } else {
+ return;
+ }
+ }
+}
+
+Value *PointerReplacer::getReplacement(Value *V) {
+ auto Loc = WorkMap.find(V);
+ if (Loc != WorkMap.end())
+ return Loc->second;
+ return nullptr;
+}
+
+void PointerReplacer::replace(Instruction *I) {
+ if (getReplacement(I))
+ return;
+
+ if (auto *LT = dyn_cast<LoadInst>(I)) {
+ auto *V = getReplacement(LT->getPointerOperand());
+ assert(V && "Operand not replaced");
+ auto *NewI = new LoadInst(V);
+ NewI->takeName(LT);
+ IC.InsertNewInstWith(NewI, *LT);
+ IC.replaceInstUsesWith(*LT, NewI);
+ WorkMap[LT] = NewI;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ auto *V = getReplacement(GEP->getPointerOperand());
+ assert(V && "Operand not replaced");
+ SmallVector<Value *, 8> Indices;
+ Indices.append(GEP->idx_begin(), GEP->idx_end());
+ auto *NewI = GetElementPtrInst::Create(
+ V->getType()->getPointerElementType(), V, Indices);
+ IC.InsertNewInstWith(NewI, *GEP);
+ NewI->takeName(GEP);
+ WorkMap[GEP] = NewI;
+ } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
+ auto *V = getReplacement(BC->getOperand(0));
+ assert(V && "Operand not replaced");
+ auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
+ V->getType()->getPointerAddressSpace());
+ auto *NewI = new BitCastInst(V, NewT);
+ IC.InsertNewInstWith(NewI, *BC);
+ NewI->takeName(BC);
+ WorkMap[BC] = NewI;
+ } else {
+ llvm_unreachable("should never reach here");
+ }
+}
+
+void PointerReplacer::replacePointer(Instruction &I, Value *V) {
+#ifndef NDEBUG
+ auto *PT = cast<PointerType>(I.getType());
+ auto *NT = cast<PointerType>(V->getType());
+ assert(PT != NT && PT->getElementType() == NT->getElementType() &&
+ "Invalid usage");
+#endif
+ WorkMap[&I] = V;
+ findLoadAndReplace(I);
+}
+
Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
if (auto *I = simplifyAllocaArraySize(*this, AI))
return I;
@@ -287,18 +402,29 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
unsigned SourceAlign = getOrEnforceKnownAlignment(
Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
- if (AI.getAlignment() <= SourceAlign) {
+ if (AI.getAlignment() <= SourceAlign &&
+ isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
eraseInstFromFunction(*ToDelete[i]);
Constant *TheSrc = cast<Constant>(Copy->getSource());
- Constant *Cast
- = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
- Instruction *NewI = replaceInstUsesWith(AI, Cast);
- eraseInstFromFunction(*Copy);
- ++NumGlobalCopies;
- return NewI;
+ auto *SrcTy = TheSrc->getType();
+ auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
+ SrcTy->getPointerAddressSpace());
+ Constant *Cast =
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
+ if (AI.getType()->getPointerAddressSpace() ==
+ SrcTy->getPointerAddressSpace()) {
+ Instruction *NewI = replaceInstUsesWith(AI, Cast);
+ eraseInstFromFunction(*Copy);
+ ++NumGlobalCopies;
+ return NewI;
+ } else {
+ PointerReplacer PtrReplacer(*this);
+ PtrReplacer.replacePointer(AI, Cast);
+ ++NumGlobalCopies;
+ }
}
}
}
@@ -332,10 +458,10 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
LI.getAllMetadata(MD);
- LoadInst *NewLoad = IC.Builder->CreateAlignedLoad(
- IC.Builder->CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
+ LoadInst *NewLoad = IC.Builder.CreateAlignedLoad(
+ IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)),
LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix);
- NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
MDBuilder MDB(NewLoad->getContext());
for (const auto &MDPair : MD) {
unsigned ID = MDPair.first;
@@ -363,21 +489,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
break;
case LLVMContext::MD_nonnull:
- // This only directly applies if the new type is also a pointer.
- if (NewTy->isPointerTy()) {
- NewLoad->setMetadata(ID, N);
- break;
- }
- // If it's integral now, translate it to !range metadata.
- if (NewTy->isIntegerTy()) {
- auto *ITy = cast<IntegerType>(NewTy);
- auto *NullInt = ConstantExpr::getPtrToInt(
- ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
- auto *NonNullInt =
- ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
- NewLoad->setMetadata(LLVMContext::MD_range,
- MDB.createRange(NonNullInt, NullInt));
- }
+ copyNonnullMetadata(LI, N, *NewLoad);
break;
case LLVMContext::MD_align:
case LLVMContext::MD_dereferenceable:
@@ -387,17 +499,7 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT
NewLoad->setMetadata(ID, N);
break;
case LLVMContext::MD_range:
- // FIXME: It would be nice to propagate this in some way, but the type
- // conversions make it hard.
-
- // If it's a pointer now and the range does not contain 0, make it !nonnull.
- if (NewTy->isPointerTy()) {
- unsigned BitWidth = IC.getDataLayout().getTypeSizeInBits(NewTy);
- if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
- MDNode *NN = MDNode::get(LI.getContext(), None);
- NewLoad->setMetadata(LLVMContext::MD_nonnull, NN);
- }
- }
+ copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad);
break;
}
}
@@ -416,10 +518,10 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value
SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
SI.getAllMetadata(MD);
- StoreInst *NewStore = IC.Builder->CreateAlignedStore(
- V, IC.Builder->CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
+ StoreInst *NewStore = IC.Builder.CreateAlignedStore(
+ V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
SI.getAlignment(), SI.isVolatile());
- NewStore->setAtomic(SI.getOrdering(), SI.getSynchScope());
+ NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
for (const auto &MDPair : MD) {
unsigned ID = MDPair.first;
MDNode *N = MDPair.second;
@@ -511,7 +613,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
// Replace all the stores with stores of the newly loaded value.
for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) {
auto *SI = cast<StoreInst>(*UI++);
- IC.Builder->SetInsertPoint(SI);
+ IC.Builder.SetInsertPoint(SI);
combineStoreToNewValue(IC, *SI, NewLoad);
IC.eraseInstFromFunction(*SI);
}
@@ -559,7 +661,10 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
if (NumElements == 1) {
LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U),
".unpack");
- return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
UndefValue::get(T), NewLoad, 0, Name));
}
@@ -584,11 +689,15 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
- Name + ".elt");
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ Name + ".elt");
auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
- auto *L = IC.Builder->CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack");
- V = IC.Builder->CreateInsertValue(V, L, i);
+ auto *L = IC.Builder.CreateAlignedLoad(Ptr, EltAlign, Name + ".unpack");
+ // Propagate AA metadata. It'll still be valid on the narrowed load.
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
}
V->setName(Name);
@@ -600,7 +709,10 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
auto NumElements = AT->getNumElements();
if (NumElements == 1) {
LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack");
- return IC.replaceInstUsesWith(LI, IC.Builder->CreateInsertValue(
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
UndefValue::get(T), NewLoad, 0, Name));
}
@@ -608,7 +720,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily; it may need a little bit of
// tuning.
- if (NumElements > 1024)
+ if (NumElements > IC.MaxArraySizeForCombine)
return nullptr;
const DataLayout &DL = IC.getDataLayout();
@@ -628,11 +740,14 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr = IC.Builder->CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
- Name + ".elt");
- auto *L = IC.Builder->CreateAlignedLoad(Ptr, MinAlign(Align, Offset),
- Name + ".unpack");
- V = IC.Builder->CreateInsertValue(V, L, i);
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto *L = IC.Builder.CreateAlignedLoad(Ptr, MinAlign(Align, Offset),
+ Name + ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
Offset += EltSize;
}
@@ -772,10 +887,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI,
// first non-zero index.
auto IsAllNonNegative = [&]() {
for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
- bool KnownNonNegative, KnownNegative;
- IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative,
- KnownNegative, 0, MemI);
- if (KnownNonNegative)
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
continue;
return false;
}
@@ -818,6 +931,18 @@ static Instruction *replaceGEPIdxWithZero(InstCombiner &IC, Value *Ptr,
return nullptr;
}
+static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+ const Value *GEPI0 = GEPI->getOperand(0);
+ if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0)
+ return true;
+ }
+ if (isa<UndefValue>(Op) ||
+ (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0))
+ return true;
+ return false;
+}
+
Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
Value *Op = LI.getOperand(0);
@@ -857,8 +982,8 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI);
return replaceInstUsesWith(
- LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
- LI.getName() + ".cast"));
+ LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+ LI.getName() + ".cast"));
}
// None of the following transforms are legal for volatile/ordered atomic
@@ -866,29 +991,16 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
if (!LI.isUnordered()) return nullptr;
// load(gep null, ...) -> unreachable
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
- const Value *GEPI0 = GEPI->getOperand(0);
- // TODO: Consider a target hook for valid address spaces for this xform.
- if (isa<ConstantPointerNull>(GEPI0) && GEPI->getPointerAddressSpace() == 0){
- // Insert a new store to null instruction before the load to indicate
- // that this code is not reachable. We do this instead of inserting
- // an unreachable instruction directly because we cannot modify the
- // CFG.
- new StoreInst(UndefValue::get(LI.getType()),
- Constant::getNullValue(Op->getType()), &LI);
- return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
- }
- }
-
// load null/undef -> unreachable
- // TODO: Consider a target hook for valid address spaces for this xform.
- if (isa<UndefValue>(Op) ||
- (isa<ConstantPointerNull>(Op) && LI.getPointerAddressSpace() == 0)) {
- // Insert a new store to null instruction before the load to indicate that
- // this code is not reachable. We do this instead of inserting an
- // unreachable instruction directly because we cannot modify the CFG.
- new StoreInst(UndefValue::get(LI.getType()),
- Constant::getNullValue(Op->getType()), &LI);
+  // TODO: Consider a target hook for valid address spaces for these xforms.
+ if (canSimplifyNullLoadOrGEP(LI, Op)) {
+ // Insert a new store to null instruction before the load to indicate
+ // that this code is not reachable. We do this instead of inserting
+ // an unreachable instruction directly because we cannot modify the
+ // CFG.
+ StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
+ Constant::getNullValue(Op->getType()), &LI);
+ SI->setDebugLoc(LI.getDebugLoc());
return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
}
@@ -908,15 +1020,15 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
unsigned Align = LI.getAlignment();
if (isSafeToLoadUnconditionally(SI->getOperand(1), Align, DL, SI) &&
isSafeToLoadUnconditionally(SI->getOperand(2), Align, DL, SI)) {
- LoadInst *V1 = Builder->CreateLoad(SI->getOperand(1),
- SI->getOperand(1)->getName()+".val");
- LoadInst *V2 = Builder->CreateLoad(SI->getOperand(2),
- SI->getOperand(2)->getName()+".val");
+ LoadInst *V1 = Builder.CreateLoad(SI->getOperand(1),
+ SI->getOperand(1)->getName()+".val");
+ LoadInst *V2 = Builder.CreateLoad(SI->getOperand(2),
+ SI->getOperand(2)->getName()+".val");
assert(LI.isUnordered() && "implied by above");
V1->setAlignment(Align);
- V1->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
V2->setAlignment(Align);
- V2->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
return SelectInst::Create(SI->getCondition(), V1, V2);
}
@@ -1061,7 +1173,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
// If the struct only has one element, we unpack.
unsigned Count = ST->getNumElements();
if (Count == 1) {
- V = IC.Builder->CreateExtractValue(V, 0);
+ V = IC.Builder.CreateExtractValue(V, 0);
combineStoreToNewValue(IC, SI, V);
return true;
}
@@ -1090,11 +1202,14 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr = IC.Builder->CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
- AddrName);
- auto *Val = IC.Builder->CreateExtractValue(V, i, EltName);
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
- IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
+ llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
}
return true;
@@ -1104,7 +1219,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
// If the array only has one element, we unpack.
auto NumElements = AT->getNumElements();
if (NumElements == 1) {
- V = IC.Builder->CreateExtractValue(V, 0);
+ V = IC.Builder.CreateExtractValue(V, 0);
combineStoreToNewValue(IC, SI, V);
return true;
}
@@ -1113,7 +1228,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily; it may need a little bit of
// tuning.
- if (NumElements > 1024)
+ if (NumElements > IC.MaxArraySizeForCombine)
return false;
const DataLayout &DL = IC.getDataLayout();
@@ -1137,11 +1252,14 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
Zero,
ConstantInt::get(IdxType, i),
};
- auto *Ptr = IC.Builder->CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
- AddrName);
- auto *Val = IC.Builder->CreateExtractValue(V, i, EltName);
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
auto EltAlign = MinAlign(Align, Offset);
- IC.Builder->CreateAlignedStore(Val, Ptr, EltAlign);
+ Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
Offset += EltSize;
}
@@ -1268,8 +1386,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
break;
}
- // Don't skip over loads or things that can modify memory.
- if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
+ // Don't skip over loads, throws or things that can modify memory.
+ if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
break;
}
@@ -1392,8 +1510,8 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
}
// If we find something that may be using or overwriting the stored
// value, or if we run out of instructions, we can't do the xform.
- if (BBI->mayReadFromMemory() || BBI->mayWriteToMemory() ||
- BBI == OtherBB->begin())
+ if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
+ BBI->mayWriteToMemory() || BBI == OtherBB->begin())
return false;
}
@@ -1402,7 +1520,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
// StoreBB.
for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
// FIXME: This should really be AA driven.
- if (I->mayReadFromMemory() || I->mayWriteToMemory())
+ if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
return false;
}
}
@@ -1423,9 +1541,11 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
SI.isVolatile(),
SI.getAlignment(),
SI.getOrdering(),
- SI.getSynchScope());
+ SI.getSyncScopeID());
InsertNewInstBefore(NewSI, *BBI);
- NewSI->setDebugLoc(OtherStore->getDebugLoc());
+ // The debug locations of the original instructions might differ; merge them.
+ NewSI->setDebugLoc(DILocation::getMergedLocation(SI.getDebugLoc(),
+ OtherStore->getDebugLoc()));
// If the two stores had AA tags, merge them.
AAMDNodes AATags;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 45a19fb..e3a5022 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -39,17 +39,15 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
Value *A = nullptr, *B = nullptr, *One = nullptr;
if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
match(One, m_One())) {
- A = IC.Builder->CreateSub(A, B);
- return IC.Builder->CreateShl(One, A);
+ A = IC.Builder.CreateSub(A, B);
+ return IC.Builder.CreateShl(One, A);
}
// (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
// inexact. Similarly for <<.
BinaryOperator *I = dyn_cast<BinaryOperator>(V);
if (I && I->isLogicalShift() &&
- isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0,
- &IC.getAssumptionCache(), &CxtI,
- &IC.getDominatorTree())) {
+ IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) {
// We know that this is an exact/nuw shift and that the input is a
// non-zero context as well.
if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
@@ -132,8 +130,9 @@ static Constant *getLogBase2Vector(ConstantDataVector *CV) {
/// \brief Return true if we can prove that:
/// (mul LHS, RHS) === (mul nsw LHS, RHS)
-bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
- Instruction &CxtI) {
+bool InstCombiner::willNotOverflowSignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction &CxtI) const {
// Multiplying n * m significant bits yields a result of n + m significant
// bits. If the total number of significant bits does not exceed the
// result bit width (minus 1), there is no overflow.
@@ -162,11 +161,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS,
// product is exactly the minimum negative number.
// E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000
// For simplicity we just check if at least one side is not negative.
- bool LHSNonNegative, LHSNegative;
- bool RHSNonNegative, RHSNegative;
- ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI);
- ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI);
- if (LHSNonNegative || RHSNonNegative)
+ KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI);
+ KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI);
+ if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative())
return true;
}
return false;
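// A quick plain-C check (illustrative only, not from this diff) of the corner
// case cited above: 0xff00 and 0xff80 carry 8 and 9 sign bits (17 total for
// an i16 multiply), yet the product is one past INT16_MAX, so nsw cannot be
// inferred from the sign-bit count alone when both sides may be negative.
#include <assert.h>
#include <stdint.h>
int main(void) {
  int16_t a = (int16_t)0xff00;               // -256
  int16_t b = (int16_t)0xff80;               // -128
  int32_t wide = (int32_t)a * (int32_t)b;    // multiply without truncation
  assert(wide == 32768 && wide > INT16_MAX); // 0x8000 does not fit in i16
  return 0;
}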
@@ -179,7 +176,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyMulInst(Op0, Op1, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyMulInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyUsingDistributiveLaws(I))
@@ -230,8 +227,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (I.hasNoUnsignedWrap())
Shl->setHasNoUnsignedWrap();
if (I.hasNoSignedWrap()) {
- uint64_t V;
- if (match(NewCst, m_ConstantInt(V)) && V != Width - 1)
+ const APInt *V;
+ if (match(NewCst, m_APInt(V)) && *V != Width - 1)
Shl->setHasNoSignedWrap();
}
@@ -253,9 +250,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
ConstantInt *C1;
Value *Sub = nullptr;
if (match(Op0, m_Sub(m_Value(Y), m_Value(X))))
- Sub = Builder->CreateSub(X, Y, "suba");
+ Sub = Builder.CreateSub(X, Y, "suba");
else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1))))
- Sub = Builder->CreateSub(Builder->CreateNeg(C1), Y, "subc");
+ Sub = Builder.CreateSub(Builder.CreateNeg(C1), Y, "subc");
if (Sub)
return
BinaryOperator::CreateMul(Sub,
@@ -275,11 +272,11 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Value *X;
Constant *C1;
if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
- Value *Mul = Builder->CreateMul(C1, Op1);
+ Value *Mul = Builder.CreateMul(C1, Op1);
// Only go forward with the transform if C1*CI simplifies to a tidier
// constant.
if (!match(Mul, m_Mul(m_Value(), m_Value())))
- return BinaryOperator::CreateAdd(Builder->CreateMul(X, Op1), Mul);
+ return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
}
}
}
@@ -298,44 +295,38 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
// (X / Y) * Y = X - (X % Y)
// (X / Y) * -Y = (X % Y) - X
{
- Value *Op1C = Op1;
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
- if (!BO ||
- (BO->getOpcode() != Instruction::UDiv &&
- BO->getOpcode() != Instruction::SDiv)) {
- Op1C = Op0;
- BO = dyn_cast<BinaryOperator>(Op1);
+ Value *Y = Op1;
+ BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
+ if (!Div || (Div->getOpcode() != Instruction::UDiv &&
+ Div->getOpcode() != Instruction::SDiv)) {
+ Y = Op0;
+ Div = dyn_cast<BinaryOperator>(Op1);
}
- Value *Neg = dyn_castNegVal(Op1C);
- if (BO && BO->hasOneUse() &&
- (BO->getOperand(1) == Op1C || BO->getOperand(1) == Neg) &&
- (BO->getOpcode() == Instruction::UDiv ||
- BO->getOpcode() == Instruction::SDiv)) {
- Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+ Value *Neg = dyn_castNegVal(Y);
+ if (Div && Div->hasOneUse() &&
+ (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
+ (Div->getOpcode() == Instruction::UDiv ||
+ Div->getOpcode() == Instruction::SDiv)) {
+ Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
// If the division is exact, X % Y is zero, so we end up with X or -X.
- if (PossiblyExactOperator *SDiv = dyn_cast<PossiblyExactOperator>(BO))
- if (SDiv->isExact()) {
- if (Op1BO == Op1C)
- return replaceInstUsesWith(I, Op0BO);
- return BinaryOperator::CreateNeg(Op0BO);
- }
-
- Value *Rem;
- if (BO->getOpcode() == Instruction::UDiv)
- Rem = Builder->CreateURem(Op0BO, Op1BO);
- else
- Rem = Builder->CreateSRem(Op0BO, Op1BO);
- Rem->takeName(BO);
+ if (Div->isExact()) {
+ if (DivOp1 == Y)
+ return replaceInstUsesWith(I, X);
+ return BinaryOperator::CreateNeg(X);
+ }
- if (Op1BO == Op1C)
- return BinaryOperator::CreateSub(Op0BO, Rem);
- return BinaryOperator::CreateSub(Rem, Op0BO);
+ auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
+ : Instruction::SRem;
+ Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1);
+ if (DivOp1 == Y)
+ return BinaryOperator::CreateSub(X, Rem);
+ return BinaryOperator::CreateSub(Rem, X);
}
}
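// A quick plain-C check (illustrative only, not from this diff) of the
// identity driving the fold above: for any y != 0,
// (x / y) * y == x - (x % y); in the exact-division case the remainder is
// zero and the result collapses to x (or -x for the negated divisor).
#include <assert.h>
int main(void) {
  for (unsigned x = 0; x < 64; ++x)
    for (unsigned y = 1; y < 8; ++y)
      assert((x / y) * y == x - (x % y));
  return 0;
}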
/// i1 mul -> i1 and.
- if (I.getType()->getScalarType()->isIntegerTy(1))
+ if (I.getType()->isIntOrIntVectorTy(1))
return BinaryOperator::CreateAnd(Op0, Op1);
// X*(1 << Y) --> X << Y
@@ -377,7 +368,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
if (BoolCast) {
- Value *V = Builder->CreateSub(Constant::getNullValue(I.getType()),
+ Value *V = Builder.CreateSub(Constant::getNullValue(I.getType()),
BoolCast);
return BinaryOperator::CreateAnd(V, OtherOp);
}
@@ -392,10 +383,10 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
if (ConstantExpr::getSExt(CI, I.getType()) == Op1C &&
- WillNotOverflowSignedMul(Op0Conv->getOperand(0), CI, I)) {
+ willNotOverflowSignedMul(Op0Conv->getOperand(0), CI, I)) {
// Insert the new, smaller mul.
Value *NewMul =
- Builder->CreateNSWMul(Op0Conv->getOperand(0), CI, "mulconv");
+ Builder.CreateNSWMul(Op0Conv->getOperand(0), CI, "mulconv");
return new SExtInst(NewMul, I.getType());
}
}
@@ -409,10 +400,10 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Op0Conv->getOperand(0)->getType() ==
Op1Conv->getOperand(0)->getType() &&
(Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
- WillNotOverflowSignedMul(Op0Conv->getOperand(0),
+ willNotOverflowSignedMul(Op0Conv->getOperand(0),
Op1Conv->getOperand(0), I)) {
// Insert the new integer mul.
- Value *NewMul = Builder->CreateNSWMul(
+ Value *NewMul = Builder.CreateNSWMul(
Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
return new SExtInst(NewMul, I.getType());
}
@@ -428,11 +419,10 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
Constant *CI =
ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType());
if (ConstantExpr::getZExt(CI, I.getType()) == Op1C &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) ==
- OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) {
// Insert the new, smaller mul.
Value *NewMul =
- Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
+ Builder.CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv");
return new ZExtInst(NewMul, I.getType());
}
}
@@ -446,25 +436,22 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (Op0Conv->getOperand(0)->getType() ==
Op1Conv->getOperand(0)->getType() &&
(Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) &&
- computeOverflowForUnsignedMul(Op0Conv->getOperand(0),
- Op1Conv->getOperand(0),
- &I) == OverflowResult::NeverOverflows) {
+ willNotOverflowUnsignedMul(Op0Conv->getOperand(0),
+ Op1Conv->getOperand(0), I)) {
// Insert the new integer mul.
- Value *NewMul = Builder->CreateNUWMul(
+ Value *NewMul = Builder.CreateNUWMul(
Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv");
return new ZExtInst(NewMul, I.getType());
}
}
}
- if (!I.hasNoSignedWrap() && WillNotOverflowSignedMul(Op0, Op1, I)) {
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoSignedWrap(true);
}
- if (!I.hasNoUnsignedWrap() &&
- computeOverflowForUnsignedMul(Op0, Op1, &I) ==
- OverflowResult::NeverOverflows) {
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
Changed = true;
I.setHasNoUnsignedWrap(true);
}
@@ -612,8 +599,8 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
if (isa<Constant>(Op0))
std::swap(Op0, Op1);
- if (Value *V =
- SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
bool AllowReassociate = I.hasUnsafeAlgebra();
@@ -711,11 +698,11 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
}
// If the pattern is detected, emit the alternate sequence.
if (OpX && OpY) {
- BuilderTy::FastMathFlagGuard Guard(*Builder);
- Builder->setFastMathFlags(Log2->getFastMathFlags());
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(Log2->getFastMathFlags());
Log2->setArgOperand(0, OpY);
- Value *FMulVal = Builder->CreateFMul(OpX, Log2);
- Value *FSub = Builder->CreateFSub(FMulVal, OpX);
+ Value *FMulVal = Builder.CreateFMul(OpX, Log2);
+ Value *FSub = Builder.CreateFSub(FMulVal, OpX);
FSub->takeName(&I);
return replaceInstUsesWith(I, FSub);
}
@@ -727,23 +714,23 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
for (int i = 0; i < 2; i++) {
bool IgnoreZeroSign = I.hasNoSignedZeros();
if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
- BuilderTy::FastMathFlagGuard Guard(*Builder);
- Builder->setFastMathFlags(I.getFastMathFlags());
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
// -X * -Y => X*Y
if (N1) {
- Value *FMul = Builder->CreateFMul(N0, N1);
+ Value *FMul = Builder.CreateFMul(N0, N1);
FMul->takeName(&I);
return replaceInstUsesWith(I, FMul);
}
if (Opnd0->hasOneUse()) {
// -X * Y => -(X*Y) (Promote negation as high as possible)
- Value *T = Builder->CreateFMul(N0, Opnd1);
- Value *Neg = Builder->CreateFNeg(T);
+ Value *T = Builder.CreateFMul(N0, Opnd1);
+ Value *Neg = Builder.CreateFNeg(T);
Neg->takeName(&I);
return replaceInstUsesWith(I, Neg);
}
@@ -768,10 +755,10 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Y = Opnd0_0;
if (Y) {
- BuilderTy::FastMathFlagGuard Guard(*Builder);
- Builder->setFastMathFlags(I.getFastMathFlags());
- Value *T = Builder->CreateFMul(Opnd1, Opnd1);
- Value *R = Builder->CreateFMul(T, Y);
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ Value *T = Builder.CreateFMul(Opnd1, Opnd1);
+ Value *R = Builder.CreateFMul(T, Y);
R->takeName(&I);
return replaceInstUsesWith(I, R);
}
@@ -837,7 +824,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
*I = SI->getOperand(NonNullOperand);
Worklist.Add(&*BBI);
} else if (*I == SelectCond) {
- *I = Builder->getInt1(NonNullOperand == 1);
+ *I = Builder.getInt1(NonNullOperand == 1);
Worklist.Add(&*BBI);
}
}
@@ -944,28 +931,25 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
}
}
- if (*C2 != 0) // avoid X udiv 0
+ if (!C2->isNullValue()) // avoid X udiv 0
if (Instruction *FoldedDiv = foldOpWithConstantIntoOperand(I))
return FoldedDiv;
}
}
- if (ConstantInt *One = dyn_cast<ConstantInt>(Op0)) {
- if (One->isOne() && !I.getType()->isIntegerTy(1)) {
- bool isSigned = I.getOpcode() == Instruction::SDiv;
- if (isSigned) {
- // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
- // result is one, if Op1 is -1 then the result is minus one, otherwise
- // it's zero.
- Value *Inc = Builder->CreateAdd(Op1, One);
- Value *Cmp = Builder->CreateICmpULT(
- Inc, ConstantInt::get(I.getType(), 3));
- return SelectInst::Create(Cmp, Op1, ConstantInt::get(I.getType(), 0));
- } else {
- // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
- // result is one, otherwise it's zero.
- return new ZExtInst(Builder->CreateICmpEQ(Op1, One), I.getType());
- }
+ if (match(Op0, m_One())) {
+ assert(!I.getType()->isIntOrIntVectorTy(1) && "i1 divide not removed?");
+ if (I.getOpcode() == Instruction::SDiv) {
+ // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
+ // result is one, if Op1 is -1 then the result is minus one, otherwise
+ // it's zero.
+ Value *Inc = Builder.CreateAdd(Op1, Op0);
+ Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(I.getType(), 3));
+ return SelectInst::Create(Cmp, Op1, ConstantInt::get(I.getType(), 0));
+ } else {
+ // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
+ // result is one, otherwise it's zero.
+ return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), I.getType());
}
}
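// A quick plain-C check (illustrative only; the helper name is made up) of
// the unsigned-compare trick above: (x + 1) <u 3 holds exactly for
// x in {-1, 0, 1}, and 1 sdiv x is x for x in {-1, 1}, else 0 (x == 0 is UB).
#include <assert.h>
#include <stdint.h>
static int32_t sdiv_one_by(int32_t x) {
  uint32_t inc = (uint32_t)x + 1u;  // Inc = add Op1, Op0 (Op0 == 1)
  return inc < 3u ? x : 0;          // select (icmp ult Inc, 3), Op1, 0
}
int main(void) {
  for (int32_t x = -4; x <= 4; ++x)
    if (x != 0)
      assert(sdiv_one_by(x) == 1 / x);
  return 0;
}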
@@ -1040,7 +1024,7 @@ static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
// X udiv C, where C >= signbit
static Instruction *foldUDivNegCst(Value *Op0, Value *Op1,
const BinaryOperator &I, InstCombiner &IC) {
- Value *ICI = IC.Builder->CreateICmpULT(Op0, cast<ConstantInt>(Op1));
+ Value *ICI = IC.Builder.CreateICmpULT(Op0, cast<ConstantInt>(Op1));
return SelectInst::Create(ICI, Constant::getNullValue(I.getType()),
ConstantInt::get(I.getType(), 1));
@@ -1059,10 +1043,9 @@ static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
if (!match(ShiftLeft, m_Shl(m_APInt(CI), m_Value(N))))
llvm_unreachable("match should never fail here!");
if (*CI != 1)
- N = IC.Builder->CreateAdd(N,
- ConstantInt::get(N->getType(), CI->logBase2()));
+ N = IC.Builder.CreateAdd(N, ConstantInt::get(N->getType(), CI->logBase2()));
if (Op1 != ShiftLeft)
- N = IC.Builder->CreateZExt(N, Op1->getType());
+ N = IC.Builder.CreateZExt(N, Op1->getType());
BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
if (I.isExact())
LShr->setIsExact();
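// Quick plain-C checks (illustrative only, not from this diff) of the two
// udiv folds above: a divisor with the sign bit set gives a 0-or-1 quotient
// (a compare), and dividing by (C1 << N) with C1 = 2^k is a shift by N + k.
#include <assert.h>
#include <stdint.h>
int main(void) {
  uint32_t x = 0x90000000u, c = 0x80000000u;
  assert(x / c == (x < c ? 0u : 1u));
  for (uint32_t n = 0; n < 8; ++n)
    assert(x / (4u << n) == x >> (n + 2));  // log2(4) == 2
  return 0;
}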
@@ -1118,7 +1101,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyUDivInst(Op0, Op1, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyUDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// Handle the integer div common cases
@@ -1148,7 +1131,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
return new ZExtInst(
- Builder->CreateUDiv(ZOp0->getOperand(0), ZOp1, "div", I.isExact()),
+ Builder.CreateUDiv(ZOp0->getOperand(0), ZOp1, "div", I.isExact()),
I.getType());
// (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
@@ -1191,7 +1174,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySDivInst(Op0, Op1, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifySDivInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// Handle the integer div common cases
@@ -1223,7 +1206,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
Constant *NarrowDivisor =
ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
- Value *NarrowOp = Builder->CreateSDiv(Op0Src, NarrowDivisor);
+ Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor);
return new SExtInst(NarrowOp, Op0->getType());
}
}
@@ -1231,7 +1214,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
if (Constant *RHS = dyn_cast<Constant>(Op1)) {
// X/INT_MIN -> X == INT_MIN
if (RHS->isMinSignedValue())
- return new ZExtInst(Builder->CreateICmpEQ(Op0, Op1), I.getType());
+ return new ZExtInst(Builder.CreateICmpEQ(Op0, Op1), I.getType());
// -X/C --> X/-C provided the negation doesn't overflow.
Value *X;
@@ -1244,25 +1227,23 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
// If the sign bits of both operands are zero (i.e. we can prove they are
// unsigned inputs), turn this into a udiv.
- if (I.getType()->isIntegerTy()) {
- APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
- if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
- if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
- // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
- auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
- BO->setIsExact(I.isExact());
- return BO;
- }
+ APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
+ if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
+ // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
- if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) {
- // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
- // Safe because the only negative value (1 << Y) can take on is
- // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
- // the sign bit set.
- auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
- BO->setIsExact(I.isExact());
- return BO;
- }
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
+ // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
+ // Safe because the only negative value (1 << Y) can take on is
+ // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
+ // the sign bit set.
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
}
}
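// A quick plain-C check (illustrative only, not from this diff): once both
// sign bits are known zero, sdiv and udiv agree, which is all the rewrite
// above relies on.
#include <assert.h>
#include <stdint.h>
int main(void) {
  int32_t x = 0x40000123, y = 0x1000;  // sign bits clear on both sides
  assert(x / y == (int32_t)((uint32_t)x / (uint32_t)y));
  return 0;
}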
@@ -1306,7 +1287,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyFDivInst(Op0, Op1, I.getFastMathFlags(),
- DL, &TLI, &DT, &AC))
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (isa<Constant>(Op0))
@@ -1396,7 +1377,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
// (X/Y) / Z => X / (Y*Z)
//
if (!isa<Constant>(Y) || !isa<Constant>(Op1)) {
- NewInst = Builder->CreateFMul(Y, Op1);
+ NewInst = Builder.CreateFMul(Y, Op1);
if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
FastMathFlags Flags = I.getFastMathFlags();
Flags &= cast<Instruction>(Op0)->getFastMathFlags();
@@ -1408,7 +1389,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
// Z / (X/Y) => Z*Y / X
//
if (!isa<Constant>(Y) || !isa<Constant>(Op0)) {
- NewInst = Builder->CreateFMul(Op0, Y);
+ NewInst = Builder.CreateFMul(Op0, Y);
if (Instruction *RI = dyn_cast<Instruction>(NewInst)) {
FastMathFlags Flags = I.getFastMathFlags();
Flags &= cast<Instruction>(Op1)->getFastMathFlags();
@@ -1461,16 +1442,16 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
if (Instruction *R = FoldOpIntoSelect(I, SI))
return R;
- } else if (isa<PHINode>(Op0I)) {
+ } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
using namespace llvm::PatternMatch;
const APInt *Op1Int;
if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
(I.getOpcode() == Instruction::URem ||
!Op1Int->isMinSignedValue())) {
- // FoldOpIntoPhi will speculate instructions to the end of the PHI's
+ // foldOpIntoPhi will speculate instructions to the end of the PHI's
// predecessor blocks, so do this only if we know the srem or urem
// will not fault.
- if (Instruction *NV = FoldOpIntoPhi(I))
+ if (Instruction *NV = foldOpIntoPhi(I, PN))
return NV;
}
}
@@ -1490,7 +1471,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyURemInst(Op0, Op1, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyURemInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Instruction *common = commonIRemTransforms(I))
@@ -1499,28 +1480,28 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
// (zext A) urem (zext B) --> zext (A urem B)
if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
- return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1),
+ return new ZExtInst(Builder.CreateURem(ZOp0->getOperand(0), ZOp1),
I.getType());
// X urem Y -> X and Y-1, where Y is a power of 2,
- if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) {
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
Constant *N1 = Constant::getAllOnesValue(I.getType());
- Value *Add = Builder->CreateAdd(Op1, N1);
+ Value *Add = Builder.CreateAdd(Op1, N1);
return BinaryOperator::CreateAnd(Op0, Add);
}
// 1 urem X -> zext(X != 1)
if (match(Op0, m_One())) {
- Value *Cmp = Builder->CreateICmpNE(Op1, Op0);
- Value *Ext = Builder->CreateZExt(Cmp, I.getType());
+ Value *Cmp = Builder.CreateICmpNE(Op1, Op0);
+ Value *Ext = Builder.CreateZExt(Cmp, I.getType());
return replaceInstUsesWith(I, Ext);
}
// X urem C -> X < C ? X : X - C, where C >= signbit.
const APInt *DivisorC;
if (match(Op1, m_APInt(DivisorC)) && DivisorC->isNegative()) {
- Value *Cmp = Builder->CreateICmpULT(Op0, Op1);
- Value *Sub = Builder->CreateSub(Op0, Op1);
+ Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
+ Value *Sub = Builder.CreateSub(Op0, Op1);
return SelectInst::Create(Cmp, Op0, Sub);
}
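// Quick plain-C checks (illustrative only, not from this diff) of the three
// urem rewrites in this hunk.
#include <assert.h>
#include <stdint.h>
int main(void) {
  uint32_t x = 0xdeadbeefu;
  assert(x % 8u == (x & 7u));              // X urem Y -> X & (Y-1), Y = 2^k
  for (uint32_t y = 1; y < 5; ++y)
    assert(1u % y == (uint32_t)(y != 1));  // 1 urem X -> zext(X != 1)
  uint32_t c = 0x80000001u;                // C >= signbit: quotient is 0 or 1
  assert(x % c == (x < c ? x : x - c));
  return 0;
}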
@@ -1533,7 +1514,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifySRemInst(Op0, Op1, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifySRemInst(Op0, Op1, SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// Handle the integer rem common cases
@@ -1552,13 +1533,11 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
// If the sign bits of both operands are zero (i.e. we can prove they are
// unsigned inputs), turn this into a urem.
- if (I.getType()->isIntegerTy()) {
- APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
- if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
- MaskedValueIsZero(Op0, Mask, 0, &I)) {
- // X srem Y -> X urem Y, iff X and Y don't have sign bit set
- return BinaryOperator::CreateURem(Op0, Op1, I.getName());
- }
+ APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
+ if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
+ MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+ return BinaryOperator::CreateURem(Op0, Op1, I.getName());
}
// If it's a constant vector, flip any negative values positive.
@@ -1609,7 +1588,7 @@ Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
return replaceInstUsesWith(I, V);
if (Value *V = SimplifyFRemInst(Op0, Op1, I.getFastMathFlags(),
- DL, &TLI, &DT, &AC))
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
// Handle cases involving: rem X, (select Cond, Y, Z)
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 4cbffe9..0011412 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -16,9 +16,9 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/DebugInfo.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -457,8 +457,8 @@ Instruction *InstCombiner::FoldPHIArgZextsIntoPHI(PHINode &Phi) {
}
// The more common cases of a phi with no constant operands or just one
- // variable operand are handled by FoldPHIArgOpIntoPHI() and FoldOpIntoPhi()
- // respectively. FoldOpIntoPhi() wants to do the opposite transform that is
+ // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi()
+ // respectively. foldOpIntoPhi() wants to do the opposite transform that is
// performed here. It tries to replicate a cast in the phi operand's basic
// block to expose other folding opportunities. Thus, InstCombine will loop
// infinitely without this check.
@@ -507,7 +507,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
// Be careful about transforming integer PHIs. We don't want to pessimize
// the code by turning an i32 into an i1293.
if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
- if (!ShouldChangeType(PN.getType(), CastSrcTy))
+ if (!shouldChangeType(PN.getType(), CastSrcTy))
return nullptr;
}
} else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
@@ -636,10 +636,10 @@ static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
/// Return an existing non-zero constant if this phi node has one, otherwise
/// return constant 1.
static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
- assert(isa<IntegerType>(PN.getType()) && "Expect only intger type phi");
+ assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
for (Value *V : PN.operands())
if (auto *ConstVA = dyn_cast<ConstantInt>(V))
- if (!ConstVA->isZeroValue())
+ if (!ConstVA->isZero())
return ConstVA;
return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
}
@@ -836,12 +836,12 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
}
// Otherwise, do an extract in the predecessor.
- Builder->SetInsertPoint(Pred->getTerminator());
+ Builder.SetInsertPoint(Pred->getTerminator());
Value *Res = InVal;
if (Offset)
- Res = Builder->CreateLShr(Res, ConstantInt::get(InVal->getType(),
+ Res = Builder.CreateLShr(Res, ConstantInt::get(InVal->getType(),
Offset), "extract");
- Res = Builder->CreateTrunc(Res, Ty, "extract.t");
+ Res = Builder.CreateTrunc(Res, Ty, "extract.t");
PredVal = Res;
EltPHI->addIncoming(Res, Pred);
@@ -880,7 +880,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
// PHINode simplification
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
- if (Value *V = SimplifyInstruction(&PN, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
return replaceInstUsesWith(PN, V);
if (Instruction *Result = FoldPHIArgZextsIntoPHI(PN))
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 3664484..4eebe82 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace PatternMatch;
@@ -60,12 +61,12 @@ static CmpInst::Predicate getCmpPredicateForMinMax(SelectPatternFlavor SPF,
}
}
-static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy *Builder,
+static Value *generateMinMaxSelectPattern(InstCombiner::BuilderTy &Builder,
SelectPatternFlavor SPF, Value *A,
Value *B) {
CmpInst::Predicate Pred = getCmpPredicateForMinMax(SPF);
assert(CmpInst::isIntPredicate(Pred));
- return Builder->CreateSelect(Builder->CreateICmp(Pred, A, B), A, B);
+ return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
}
/// We want to turn code that looks like this:
@@ -120,6 +121,16 @@ static Constant *getSelectFoldableConstant(Instruction *I) {
/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI) {
+ // Don't break up min/max patterns. The hasOneUse checks below prevent that
+ // for most cases, but vector min/max with bitcasts can be transformed. If the
+ // one-use restrictions are eased for other patterns, we still don't want to
+ // obfuscate min/max.
+ if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
+ match(&SI, m_SMax(m_Value(), m_Value())) ||
+ match(&SI, m_UMin(m_Value(), m_Value())) ||
+ match(&SI, m_UMax(m_Value(), m_Value()))))
+ return nullptr;
+
// If this is a cast from the same type, merge.
if (TI->getNumOperands() == 1 && TI->isCast()) {
Type *FIOpndTy = FI->getOperand(0)->getType();
@@ -156,8 +167,8 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
// Fold this by inserting a select from the input values.
Value *NewSI =
- Builder->CreateSelect(SI.getCondition(), TI->getOperand(0),
- FI->getOperand(0), SI.getName() + ".v", &SI);
+ Builder.CreateSelect(SI.getCondition(), TI->getOperand(0),
+ FI->getOperand(0), SI.getName() + ".v", &SI);
return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
TI->getType());
}
@@ -200,8 +211,8 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
}
// If we reach here, they do have operations in common.
- Value *NewSI = Builder->CreateSelect(SI.getCondition(), OtherOpT, OtherOpF,
- SI.getName() + ".v", &SI);
+ Value *NewSI = Builder.CreateSelect(SI.getCondition(), OtherOpT, OtherOpF,
+ SI.getName() + ".v", &SI);
Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
return BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
@@ -216,8 +227,8 @@ static bool isSelect01(Constant *C1, Constant *C2) {
return false;
if (!C1I->isZero() && !C2I->isZero()) // One side must be zero.
return false;
- return C1I->isOne() || C1I->isAllOnesValue() ||
- C2I->isOne() || C2I->isAllOnesValue();
+ return C1I->isOne() || C1I->isMinusOne() ||
+ C2I->isOne() || C2I->isMinusOne();
}
/// Try to fold the select into one of the operands to allow further
@@ -243,7 +254,7 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
// Avoid creating select between 2 constants unless it's selecting
// between 0, 1 and -1.
if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
- Value *NewSel = Builder->CreateSelect(SI.getCondition(), OOp, C);
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C);
NewSel->takeName(TVI);
BinaryOperator *TVI_BO = cast<BinaryOperator>(TVI);
BinaryOperator *BO = BinaryOperator::Create(TVI_BO->getOpcode(),
@@ -273,7 +284,7 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
// Avoid creating select between 2 constants unless it's selecting
// between 0, 1 and -1.
if (!isa<Constant>(OOp) || isSelect01(C, cast<Constant>(OOp))) {
- Value *NewSel = Builder->CreateSelect(SI.getCondition(), C, OOp);
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp);
NewSel->takeName(FVI);
BinaryOperator *FVI_BO = cast<BinaryOperator>(FVI);
BinaryOperator *BO = BinaryOperator::Create(FVI_BO->getOpcode(),
@@ -292,7 +303,7 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
/// We want to turn:
/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
/// into:
-/// (or (shl (and X, C1), C3), y)
+/// (or (shl (and X, C1), C3), Y)
/// iff:
/// C1 and C2 are both powers of 2
/// where:
@@ -304,21 +315,46 @@ Instruction *InstCombiner::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
/// 3. The magnitude of C2 and C1 are flipped
static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
Value *FalseVal,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
- if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy())
+ if (!IC || !SI.getType()->isIntegerTy())
return nullptr;
Value *CmpLHS = IC->getOperand(0);
Value *CmpRHS = IC->getOperand(1);
- if (!match(CmpRHS, m_Zero()))
- return nullptr;
+ Value *V;
+ unsigned C1Log;
+ bool IsEqualZero;
+ bool NeedAnd = false;
+ if (IC->isEquality()) {
+ if (!match(CmpRHS, m_Zero()))
+ return nullptr;
+
+ const APInt *C1;
+ if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
+ return nullptr;
+
+ V = CmpLHS;
+ C1Log = C1->logBase2();
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ;
+ } else if (IC->getPredicate() == ICmpInst::ICMP_SLT ||
+ IC->getPredicate() == ICmpInst::ICMP_SGT) {
+ // We also need to recognize (icmp slt (trunc (X)), 0) and
+ // (icmp sgt (trunc (X)), -1).
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT;
+ if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) ||
+ (!IsEqualZero && !match(CmpRHS, m_Zero())))
+ return nullptr;
+
+ if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V)))))
+ return nullptr;
- Value *X;
- const APInt *C1;
- if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1))))
+ C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1;
+ NeedAnd = true;
+ } else {
return nullptr;
+ }
const APInt *C2;
bool OrOnTrueVal = false;
@@ -329,26 +365,40 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
if (!OrOnFalseVal && !OrOnTrueVal)
return nullptr;
- Value *V = CmpLHS;
Value *Y = OrOnFalseVal ? TrueVal : FalseVal;
- unsigned C1Log = C1->logBase2();
unsigned C2Log = C2->logBase2();
+
+ bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal);
+ bool NeedShift = C1Log != C2Log;
+ bool NeedZExtTrunc = Y->getType()->getIntegerBitWidth() !=
+ V->getType()->getIntegerBitWidth();
+
+ // Make sure we don't create more instructions than we save.
+ Value *Or = OrOnFalseVal ? FalseVal : TrueVal;
+ if ((NeedShift + NeedXor + NeedZExtTrunc) >
+ (IC->hasOneUse() + Or->hasOneUse()))
+ return nullptr;
+
+ if (NeedAnd) {
+ // Insert the AND instruction on the input to the truncate.
+ APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log);
+ V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), C1));
+ }
+
if (C2Log > C1Log) {
- V = Builder->CreateZExtOrTrunc(V, Y->getType());
- V = Builder->CreateShl(V, C2Log - C1Log);
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+ V = Builder.CreateShl(V, C2Log - C1Log);
} else if (C1Log > C2Log) {
- V = Builder->CreateLShr(V, C1Log - C2Log);
- V = Builder->CreateZExtOrTrunc(V, Y->getType());
+ V = Builder.CreateLShr(V, C1Log - C2Log);
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
} else
- V = Builder->CreateZExtOrTrunc(V, Y->getType());
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
- ICmpInst::Predicate Pred = IC->getPredicate();
- if ((Pred == ICmpInst::ICMP_NE && OrOnFalseVal) ||
- (Pred == ICmpInst::ICMP_EQ && OrOnTrueVal))
- V = Builder->CreateXor(V, *C2);
+ if (NeedXor)
+ V = Builder.CreateXor(V, *C2);
- return Builder->CreateOr(V, Y);
+ return Builder.CreateOr(V, Y);
}
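// A worked instance of the fold above in plain C (illustrative only; the
// constants are made up): with C1 = 1, C2 = 4 (so the shift is 2) and this
// predicate/operand polarity, no xor is required.
#include <assert.h>
#include <stdint.h>
int main(void) {
  for (uint32_t x = 0; x < 8; ++x)
    for (uint32_t y = 0; y < 8; ++y) {
      uint32_t sel = ((x & 1u) == 0) ? y : (y | 4u);
      assert(sel == (((x & 1u) << 2) | y));
    }
  return 0;
}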
/// Attempt to fold a cttz/ctlz followed by an icmp plus select into a single
@@ -364,7 +414,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
/// into:
/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
- InstCombiner::BuilderTy *Builder) {
+ InstCombiner::BuilderTy &Builder) {
ICmpInst::Predicate Pred = ICI->getPredicate();
Value *CmpLHS = ICI->getOperand(0);
Value *CmpRHS = ICI->getOperand(1);
@@ -395,7 +445,6 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
if (match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) ||
match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS)))) {
IntrinsicInst *II = cast<IntrinsicInst>(Count);
- IRBuilder<> Builder(II);
// Explicitly clear the 'undef_on_zero' flag.
IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone());
Type *Ty = NewI->getArgOperand(1)->getType();
@@ -500,18 +549,16 @@ static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
return true;
}
-/// If this is an integer min/max where the select's 'true' operand is a
-/// constant, canonicalize that constant to the 'false' operand:
-/// select (icmp Pred X, C), C, X --> select (icmp Pred' X, C), X, C
+/// If this is an integer min/max (icmp + select) with a constant operand,
+/// create the canonical icmp for the min/max operation and canonicalize the
+/// constant to the 'false' operand of the select:
+/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
+/// Note: if C1 != C2, this will change the icmp constant to the existing
+/// constant operand of the select.
static Instruction *
canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
InstCombiner::BuilderTy &Builder) {
- // TODO: We should also canonicalize min/max when the select has a different
- // constant value than the cmp constant, but we need to fix the backend first.
- if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)) ||
- !isa<Constant>(Sel.getTrueValue()) ||
- isa<Constant>(Sel.getFalseValue()) ||
- Cmp.getOperand(1) != Sel.getTrueValue())
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
return nullptr;
// Canonicalize the compare predicate based on whether we have min or max.
@@ -526,22 +573,31 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
default: return nullptr;
}
- // Canonicalize the constant to the right side.
- if (isa<Constant>(LHS))
- std::swap(LHS, RHS);
+ // Is this already canonical?
+ if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
+ Cmp.getPredicate() == NewPred)
+ return nullptr;
+
+ // Create the canonical compare and plug it into the select.
+ Sel.setCondition(Builder.CreateICmp(NewPred, LHS, RHS));
- Value *NewCmp = Builder.CreateICmp(NewPred, LHS, RHS);
- SelectInst *NewSel = SelectInst::Create(NewCmp, LHS, RHS, "", nullptr, &Sel);
+ // If the select operands did not change, we're done.
+ if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
+ return &Sel;
- // We swapped the select operands, so swap the metadata too.
- NewSel->swapProfMetadata();
- return NewSel;
+ // If we are swapping the select operands, swap the metadata too.
+ assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
+ "Unexpected results from matchSelectPattern");
+ Sel.setTrueValue(LHS);
+ Sel.setFalseValue(RHS);
+ Sel.swapProfMetadata();
+ return &Sel;
}
/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
ICmpInst *ICI) {
- if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *Builder))
+ if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder))
return NewSel;
bool Changed = adjustMinMax(SI, *ICI);
@@ -561,23 +617,23 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
if (TrueVal->getType() == Ty) {
if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) {
ConstantInt *C1 = nullptr, *C2 = nullptr;
- if (Pred == ICmpInst::ICMP_SGT && Cmp->isAllOnesValue()) {
+ if (Pred == ICmpInst::ICMP_SGT && Cmp->isMinusOne()) {
C1 = dyn_cast<ConstantInt>(TrueVal);
C2 = dyn_cast<ConstantInt>(FalseVal);
- } else if (Pred == ICmpInst::ICMP_SLT && Cmp->isNullValue()) {
+ } else if (Pred == ICmpInst::ICMP_SLT && Cmp->isZero()) {
C1 = dyn_cast<ConstantInt>(FalseVal);
C2 = dyn_cast<ConstantInt>(TrueVal);
}
if (C1 && C2) {
// This shift results in either -1 or 0.
- Value *AShr = Builder->CreateAShr(CmpLHS, Ty->getBitWidth()-1);
+ Value *AShr = Builder.CreateAShr(CmpLHS, Ty->getBitWidth() - 1);
// Check if we can express the operation with a single or.
- if (C2->isAllOnesValue())
- return replaceInstUsesWith(SI, Builder->CreateOr(AShr, C1));
+ if (C2->isMinusOne())
+ return replaceInstUsesWith(SI, Builder.CreateOr(AShr, C1));
- Value *And = Builder->CreateAnd(AShr, C2->getValue()-C1->getValue());
- return replaceInstUsesWith(SI, Builder->CreateAdd(And, C1));
+ Value *And = Builder.CreateAnd(AShr, C2->getValue() - C1->getValue());
+ return replaceInstUsesWith(SI, Builder.CreateAdd(And, C1));
}
}
}
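// The ashr trick above in plain C (illustrative only; constants and the
// helper name are made up). x >> 31 is 0 for non-negative x and all-ones for
// negative x (arithmetic shift, as in IR's ashr; implementation-defined in
// ISO C), so (x > -1 ? C1 : C2) == ((x >> 31) & (C2 - C1)) + C1.
#include <assert.h>
#include <stdint.h>
static int32_t ashr_select(int32_t x, int32_t c1, int32_t c2) {
  int32_t ashr = x >> 31;              // -1 or 0
  return (ashr & (c2 - c1)) + c1;
}
int main(void) {
  assert(ashr_select(42, 7, 9) == 7);
  assert(ashr_select(-5, 7, 9) == 9);
  return 0;
}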
@@ -602,7 +658,7 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
{
unsigned BitWidth =
DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
- APInt MinSignedValue = APInt::getSignBit(BitWidth);
+ APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
Value *X;
const APInt *Y, *C;
bool TrueWhenUnset;
@@ -628,19 +684,19 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
// (X & Y) == 0 ? X : X ^ Y --> X & ~Y
if (TrueWhenUnset && TrueVal == X &&
match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder->CreateAnd(X, ~(*Y));
+ V = Builder.CreateAnd(X, ~(*Y));
// (X & Y) != 0 ? X ^ Y : X --> X & ~Y
else if (!TrueWhenUnset && FalseVal == X &&
match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder->CreateAnd(X, ~(*Y));
+ V = Builder.CreateAnd(X, ~(*Y));
// (X & Y) == 0 ? X ^ Y : X --> X | Y
else if (TrueWhenUnset && FalseVal == X &&
match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder->CreateOr(X, *Y);
+ V = Builder.CreateOr(X, *Y);
// (X & Y) != 0 ? X : X ^ Y --> X | Y
else if (!TrueWhenUnset && TrueVal == X &&
match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder->CreateOr(X, *Y);
+ V = Builder.CreateOr(X, *Y);
if (V)
return replaceInstUsesWith(SI, V);
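// Quick plain-C checks (illustrative only, not from this diff) of the four
// select-of-xor identities above, assuming Y has a single bit set (the
// power-of-2 case these patterns target); here Y = 0x10.
#include <assert.h>
#include <stdint.h>
int main(void) {
  const uint32_t y = 0x10;
  for (uint32_t x = 0; x < 64; ++x) {
    assert(((x & y) == 0 ? x : x ^ y) == (x & ~y));
    assert(((x & y) != 0 ? x ^ y : x) == (x & ~y));
    assert(((x & y) == 0 ? x ^ y : x) == (x | y));
    assert(((x & y) != 0 ? x : x ^ y) == (x | y));
  }
  return 0;
}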
@@ -753,8 +809,8 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
(SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
SelectInst *SI = cast<SelectInst>(Inner);
Value *NewSI =
- Builder->CreateSelect(SI->getCondition(), SI->getFalseValue(),
- SI->getTrueValue(), SI->getName(), SI);
+ Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(),
+ SI->getTrueValue(), SI->getName(), SI);
return replaceInstUsesWith(Outer, NewSI);
}
@@ -786,19 +842,21 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
// This transform is performance neutral if we can elide at least one xor from
// the set of three operands, since we'll be tacking on an xor at the very
// end.
- if (IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
+ if (SelectPatternResult::isMinOrMax(SPF1) &&
+ SelectPatternResult::isMinOrMax(SPF2) &&
+ IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
if (!NotA)
- NotA = Builder->CreateNot(A);
+ NotA = Builder.CreateNot(A);
if (!NotB)
- NotB = Builder->CreateNot(B);
+ NotB = Builder.CreateNot(B);
if (!NotC)
- NotC = Builder->CreateNot(C);
+ NotC = Builder.CreateNot(C);
Value *NewInner = generateMinMaxSelectPattern(
Builder, getInverseMinMaxSelectPattern(SPF1), NotA, NotB);
- Value *NewOuter = Builder->CreateNot(generateMinMaxSelectPattern(
+ Value *NewOuter = Builder.CreateNot(generateMinMaxSelectPattern(
Builder, getInverseMinMaxSelectPattern(SPF2), NewInner, NotC));
return replaceInstUsesWith(Outer, NewOuter);
}
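// A quick plain-C check (illustrative only; helper names are made up) of the
// min/max duality the rewrite above depends on: bitwise-not reverses order,
// so max(a, b) == ~min(~a, ~b) and vice versa, letting a single outer not
// cover the inverted inner pattern.
#include <assert.h>
#include <stdint.h>
static int32_t smin(int32_t a, int32_t b) { return a < b ? a : b; }
static int32_t smax(int32_t a, int32_t b) { return a > b ? a : b; }
int main(void) {
  int32_t vals[] = {-7, -1, 0, 3, 42};
  for (int i = 0; i < 5; ++i)
    for (int j = 0; j < 5; ++j) {
      assert(smax(vals[i], vals[j]) == ~smin(~vals[i], ~vals[j]));
      assert(smin(vals[i], vals[j]) == ~smax(~vals[i], ~vals[j]));
    }
  return 0;
}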
@@ -810,9 +868,9 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
/// icmp instruction with zero, and we have an 'and' with the non-constant value
/// and a power of two we can turn the select into a shift on the result of the
/// 'and'.
-static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
- ConstantInt *FalseVal,
- InstCombiner::BuilderTy *Builder) {
+static Value *foldSelectICmpAnd(const SelectInst &SI, APInt TrueVal,
+ APInt FalseVal,
+ InstCombiner::BuilderTy &Builder) {
const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy())
return nullptr;
@@ -828,56 +886,53 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
// If both select arms are non-zero see if we have a select of the form
// 'x ? 2^n + C : C'. Then we can offset both arms by C, use the logic
// for 'x ? 2^n : 0' and fix the thing up at the end.
- ConstantInt *Offset = nullptr;
- if (!TrueVal->isZero() && !FalseVal->isZero()) {
- if ((TrueVal->getValue() - FalseVal->getValue()).isPowerOf2())
+ APInt Offset(TrueVal.getBitWidth(), 0);
+ if (!TrueVal.isNullValue() && !FalseVal.isNullValue()) {
+ if ((TrueVal - FalseVal).isPowerOf2())
Offset = FalseVal;
- else if ((FalseVal->getValue() - TrueVal->getValue()).isPowerOf2())
+ else if ((FalseVal - TrueVal).isPowerOf2())
Offset = TrueVal;
else
return nullptr;
// Adjust TrueVal and FalseVal to the offset.
- TrueVal = ConstantInt::get(Builder->getContext(),
- TrueVal->getValue() - Offset->getValue());
- FalseVal = ConstantInt::get(Builder->getContext(),
- FalseVal->getValue() - Offset->getValue());
+ TrueVal -= Offset;
+ FalseVal -= Offset;
}
// Make sure the mask in the 'and' and one of the select arms is a power of 2.
if (!AndRHS->getValue().isPowerOf2() ||
- (!TrueVal->getValue().isPowerOf2() &&
- !FalseVal->getValue().isPowerOf2()))
+ (!TrueVal.isPowerOf2() && !FalseVal.isPowerOf2()))
return nullptr;
// Determine which shift is needed to transform result of the 'and' into the
// desired result.
- ConstantInt *ValC = !TrueVal->isZero() ? TrueVal : FalseVal;
- unsigned ValZeros = ValC->getValue().logBase2();
+ const APInt &ValC = !TrueVal.isNullValue() ? TrueVal : FalseVal;
+ unsigned ValZeros = ValC.logBase2();
unsigned AndZeros = AndRHS->getValue().logBase2();
// If types don't match we can still convert the select by introducing a zext
// or a trunc of the 'and'. The trunc case requires that all of the truncated
// bits are zero; we can figure that out by looking at the 'and' mask.
- if (AndZeros >= ValC->getBitWidth())
+ if (AndZeros >= ValC.getBitWidth())
return nullptr;
- Value *V = Builder->CreateZExtOrTrunc(LHS, SI.getType());
+ Value *V = Builder.CreateZExtOrTrunc(LHS, SI.getType());
if (ValZeros > AndZeros)
- V = Builder->CreateShl(V, ValZeros - AndZeros);
+ V = Builder.CreateShl(V, ValZeros - AndZeros);
else if (ValZeros < AndZeros)
- V = Builder->CreateLShr(V, AndZeros - ValZeros);
+ V = Builder.CreateLShr(V, AndZeros - ValZeros);
// Okay, now we know that everything is set up; we just don't know whether we
// have an icmp_ne or icmp_eq and whether the true or false value is the zero.
- bool ShouldNotVal = !TrueVal->isZero();
+ bool ShouldNotVal = !TrueVal.isNullValue();
ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
if (ShouldNotVal)
- V = Builder->CreateXor(V, ValC);
+ V = Builder.CreateXor(V, ValC);
// Apply an offset if needed.
- if (Offset)
- V = Builder->CreateAdd(V, Offset);
+ if (!Offset.isNullValue())
+ V = Builder.CreateAdd(V, ConstantInt::get(V->getType(), Offset));
return V;
}
@@ -966,7 +1021,7 @@ Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
// TODO: Handle larger types? That requires adjusting FoldOpIntoSelect too.
Value *X = ExtInst->getOperand(0);
Type *SmallType = X->getType();
- if (!SmallType->getScalarType()->isIntegerTy(1))
+ if (!SmallType->isIntOrIntVectorTy(1))
return nullptr;
Constant *C;
@@ -987,7 +1042,7 @@ Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
// select Cond, (ext X), C --> ext(select Cond, X, C')
// select Cond, C, (ext X) --> ext(select Cond, C', X)
- Value *NewSel = Builder->CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
+ Value *NewSel = Builder.CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType);
}
@@ -1035,8 +1090,10 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
// If the select condition element is false, choose from the 2nd vector.
Mask.push_back(ConstantInt::get(Int32Ty, i + NumElts));
} else if (isa<UndefValue>(Elt)) {
- // If the select condition element is undef, the shuffle mask is undef.
- Mask.push_back(UndefValue::get(Int32Ty));
+ // Undef in a select condition (choose one of the operands) does not mean
+ // the same thing as undef in a shuffle mask (any value is acceptable), so
+ // give up.
+ return nullptr;
} else {
// Bail out on a constant expression.
return nullptr;
@@ -1100,14 +1157,31 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *FalseVal = SI.getFalseValue();
Type *SelType = SI.getType();
- if (Value *V =
- SimplifySelectInst(CondVal, TrueVal, FalseVal, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
+ SQ.getWithInstruction(&SI)))
return replaceInstUsesWith(SI, V);
if (Instruction *I = canonicalizeSelectToShuffle(SI))
return I;
- if (SelType->getScalarType()->isIntegerTy(1) &&
+ // Canonicalize a one-use integer compare with a non-canonical predicate by
+ // inverting the predicate and swapping the select operands. This matches a
+ // compare canonicalization for conditional branches.
+ // TODO: Should we do the same for FP compares?
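+ // E.g. (illustrative, assuming the non-strict predicates are the
+ // non-canonical ones):
+ //   select (icmp sle X, Y), A, B --> select (icmp sgt X, Y), B, A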
+ CmpInst::Predicate Pred;
+ if (match(CondVal, m_OneUse(m_ICmp(Pred, m_Value(), m_Value()))) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap true/false values and condition.
+ CmpInst *Cond = cast<CmpInst>(CondVal);
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ SI.setOperand(1, FalseVal);
+ SI.setOperand(2, TrueVal);
+ SI.swapProfMetadata();
+ Worklist.Add(Cond);
+ return &SI;
+ }
+
+ if (SelType->isIntOrIntVectorTy(1) &&
TrueVal->getType() == CondVal->getType()) {
if (match(TrueVal, m_One())) {
// Change: A = select B, true, C --> A = or B, C
@@ -1115,7 +1189,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
if (match(TrueVal, m_Zero())) {
// Change: A = select B, false, C --> A = and !B, C
- Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return BinaryOperator::CreateAnd(NotCond, FalseVal);
}
if (match(FalseVal, m_Zero())) {
@@ -1124,7 +1198,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
if (match(FalseVal, m_One())) {
// Change: A = select B, C, true --> A = or !B, C
- Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return BinaryOperator::CreateOr(NotCond, TrueVal);
}
@@ -1149,7 +1223,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0>
// because that may need 3 instructions to splat the condition value:
// extend, insertelement, shufflevector.
- if (CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
+ if (SelType->isIntOrIntVectorTy() &&
+ CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
// select C, 1, 0 -> zext C to int
if (match(TrueVal, m_One()) && match(FalseVal, m_Zero()))
return new ZExtInst(CondVal, SelType);
@@ -1160,20 +1235,21 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// select C, 0, 1 -> zext !C to int
if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) {
- Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return new ZExtInst(NotCond, SelType);
}
// select C, 0, -1 -> sext !C to int
if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) {
- Value *NotCond = Builder->CreateNot(CondVal, "not." + CondVal->getName());
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return new SExtInst(NotCond, SelType);
}
}
if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal))
- if (Value *V = foldSelectICmpAnd(SI, TrueValC, FalseValC, Builder))
+ if (Value *V = foldSelectICmpAnd(SI, TrueValC->getValue(),
+ FalseValC->getValue(), Builder))
return replaceInstUsesWith(SI, V);
// See if we are selecting two values based on a comparison of the two values.
@@ -1211,10 +1287,10 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
FCmpInst::Predicate InvPred = FCI->getInversePredicate();
- IRBuilder<>::FastMathFlagGuard FMFG(*Builder);
- Builder->setFastMathFlags(FCI->getFastMathFlags());
- Value *NewCond = Builder->CreateFCmp(InvPred, TrueVal, FalseVal,
- FCI->getName() + ".inv");
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(FCI->getFastMathFlags());
+ Value *NewCond = Builder.CreateFCmp(InvPred, TrueVal, FalseVal,
+ FCI->getName() + ".inv");
return SelectInst::Create(NewCond, FalseVal, TrueVal,
SI.getName() + ".p");
@@ -1254,10 +1330,10 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// (X ugt Y) ? Y : X -> (X ole Y) ? X : Y
if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
FCmpInst::Predicate InvPred = FCI->getInversePredicate();
- IRBuilder<>::FastMathFlagGuard FMFG(*Builder);
- Builder->setFastMathFlags(FCI->getFastMathFlags());
- Value *NewCond = Builder->CreateFCmp(InvPred, FalseVal, TrueVal,
- FCI->getName() + ".inv");
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(FCI->getFastMathFlags());
+ Value *NewCond = Builder.CreateFCmp(InvPred, FalseVal, TrueVal,
+ FCI->getName() + ".inv");
return SelectInst::Create(NewCond, FalseVal, TrueVal,
SI.getName() + ".p");
@@ -1273,7 +1349,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
return Result;
- if (Instruction *Add = foldAddSubSelect(SI, *Builder))
+ if (Instruction *Add = foldAddSubSelect(SI, Builder))
return Add;
// Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
@@ -1304,16 +1380,16 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *Cmp;
if (CmpInst::isIntPredicate(Pred)) {
- Cmp = Builder->CreateICmp(Pred, LHS, RHS);
+ Cmp = Builder.CreateICmp(Pred, LHS, RHS);
} else {
- IRBuilder<>::FastMathFlagGuard FMFG(*Builder);
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags();
- Builder->setFastMathFlags(FMF);
- Cmp = Builder->CreateFCmp(Pred, LHS, RHS);
+ Builder.setFastMathFlags(FMF);
+ Cmp = Builder.CreateFCmp(Pred, LHS, RHS);
}
- Value *NewSI = Builder->CreateCast(
- CastOp, Builder->CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI),
+ Value *NewSI = Builder.CreateCast(
+ CastOp, Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI),
SelType);
return replaceInstUsesWith(SI, NewSI);
}
@@ -1348,13 +1424,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
(SI.hasOneUse() && match(*SI.user_begin(), m_Not(m_Value())));
if (NumberOfNots >= 2) {
- Value *NewLHS = Builder->CreateNot(LHS);
- Value *NewRHS = Builder->CreateNot(RHS);
- Value *NewCmp = SPF == SPF_SMAX
- ? Builder->CreateICmpSLT(NewLHS, NewRHS)
- : Builder->CreateICmpULT(NewLHS, NewRHS);
+ Value *NewLHS = Builder.CreateNot(LHS);
+ Value *NewRHS = Builder.CreateNot(RHS);
+ Value *NewCmp = SPF == SPF_SMAX ? Builder.CreateICmpSLT(NewLHS, NewRHS)
+ : Builder.CreateICmpULT(NewLHS, NewRHS);
Value *NewSI =
- Builder->CreateNot(Builder->CreateSelect(NewCmp, NewLHS, NewRHS));
+ Builder.CreateNot(Builder.CreateSelect(NewCmp, NewLHS, NewRHS));
return replaceInstUsesWith(SI, NewSI);
}
}
@@ -1364,11 +1439,11 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
// See if we can fold the select into a phi node if the condition is a select.
- if (isa<PHINode>(SI.getCondition()))
+ if (auto *PN = dyn_cast<PHINode>(SI.getCondition()))
// The true/false values have to be live in the PHI predecessors' blocks.
if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
- if (Instruction *NV = FoldOpIntoPhi(SI))
+ if (Instruction *NV = foldOpIntoPhi(SI, PN))
return NV;
if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
@@ -1384,7 +1459,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
// We choose this as normal form to enable folding on the And and shortening
// paths for the values (this helps GetUnderlyingObjects() for example).
if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
- Value *And = Builder->CreateAnd(CondVal, TrueSI->getCondition());
+ Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition());
SI.setOperand(0, And);
SI.setOperand(1, TrueSI->getTrueValue());
return &SI;
@@ -1402,7 +1477,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
// select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
- Value *Or = Builder->CreateOr(CondVal, FalseSI->getCondition());
+ Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition());
SI.setOperand(0, Or);
SI.setOperand(2, FalseSI->getFalseValue());
return &SI;
@@ -1450,7 +1525,21 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
}
}
- if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, *Builder))
+ // If we can compute the condition, there's no need for a select.
+ // Like the above fold, we are attempting to reduce compile-time cost by
+ // putting this fold here with limitations rather than in InstSimplify.
+ // The motivation for this call into value tracking is to take advantage of
+ // the assumption cache, so make sure that is populated.
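+ // E.g. (illustrative): a dominating llvm.assume that implies CondVal is
+ // true makes Known.One 1 below, so the select collapses to TrueVal.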
+ if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
+ KnownBits Known(1);
+ computeKnownBits(CondVal, Known, 0, &SI);
+ if (Known.One.isOneValue())
+ return replaceInstUsesWith(SI, TrueVal);
+ if (Known.Zero.isOneValue())
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+
+ if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, Builder))
return BitCastSel;
return nullptr;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 4ff9b64..7ed141c 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -22,8 +22,8 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
- assert(I.getOperand(1)->getType() == I.getOperand(0)->getType());
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ assert(Op0->getType() == Op1->getType());
// See if we can fold away this shift.
if (SimplifyDemandedInstructionBits(I))
@@ -44,9 +44,10 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
Value *A;
Constant *C;
if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C))))
- if (isKnownNonNegative(A, DL) && isKnownNonNegative(C, DL))
+ if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) &&
+ isKnownNonNegative(C, DL, 0, &AC, &I, &DT))
return BinaryOperator::Create(
- I.getOpcode(), Builder->CreateBinOp(I.getOpcode(), Op0, C), A);
+ I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A);
// X shift (A srem B) -> X shift (A and B-1) iff B is a power of 2.
// Because shifts by negative values (which could occur if A were negative)
@@ -55,8 +56,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Power2(B)))) {
// FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
// demand the sign bit (and many others) here??
- Value *Rem = Builder->CreateAnd(A, ConstantInt::get(I.getType(), *B-1),
- Op1->getName());
+ Value *Rem = Builder.CreateAnd(A, ConstantInt::get(I.getType(), *B - 1),
+ Op1->getName());
I.setOperand(1, Rem);
return &I;
}
@@ -65,63 +66,60 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
}
/// Return true if we can simplify two logical (either left or right) shifts
-/// that have constant shift amounts.
-static bool canEvaluateShiftedShift(unsigned FirstShiftAmt,
- bool IsFirstShiftLeft,
- Instruction *SecondShift, InstCombiner &IC,
+/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
+static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
+ Instruction *InnerShift, InstCombiner &IC,
Instruction *CxtI) {
- assert(SecondShift->isLogicalShift() && "Unexpected instruction type");
+ assert(InnerShift->isLogicalShift() && "Unexpected instruction type");
- // We need constant shifts.
- auto *SecondShiftConst = dyn_cast<ConstantInt>(SecondShift->getOperand(1));
- if (!SecondShiftConst)
+ // We need constant scalar or constant splat shifts.
+ const APInt *InnerShiftConst;
+ if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
return false;
- unsigned SecondShiftAmt = SecondShiftConst->getZExtValue();
- bool IsSecondShiftLeft = SecondShift->getOpcode() == Instruction::Shl;
-
- // We can always fold shl(c1) + shl(c2) -> shl(c1+c2).
- // We can always fold lshr(c1) + lshr(c2) -> lshr(c1+c2).
- if (IsFirstShiftLeft == IsSecondShiftLeft)
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ if (IsInnerShl == IsOuterShl)
return true;
- // We can always fold lshr(c) + shl(c) -> and(c2).
- // We can always fold shl(c) + lshr(c) -> and(c2).
- if (FirstShiftAmt == SecondShiftAmt)
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ unsigned InnerShAmt = InnerShiftConst->getZExtValue();
+ if (InnerShAmt == OuterShAmt)
return true;
- unsigned TypeWidth = SecondShift->getType()->getScalarSizeInBits();
-
// If the inner shift amount is bigger than the outer shift amount, we can fold:
- // lshr(c1) + shl(c2) -> shl(c3) + and(c4) or
- // shl(c1) + lshr(c2) -> lshr(c3) + and(c4),
+ // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
+ // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
// but it isn't profitable unless we know the and'd out bits are already zero.
- // Also check that the 2nd shift is valid (less than the type width) or we'll
- // crash trying to produce the bit mask for the 'and'.
- if (SecondShiftAmt > FirstShiftAmt && SecondShiftAmt < TypeWidth) {
- unsigned MaskShift = IsSecondShiftLeft ? TypeWidth - SecondShiftAmt
- : SecondShiftAmt - FirstShiftAmt;
- APInt Mask = APInt::getLowBitsSet(TypeWidth, FirstShiftAmt) << MaskShift;
- if (IC.MaskedValueIsZero(SecondShift->getOperand(0), Mask, 0, CxtI))
+ // Also, check that the inner shift is valid (less than the type width) or
+ // we'll crash trying to produce the bit mask for the 'and'.
+ unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
+ if (InnerShAmt > OuterShAmt && InnerShAmt < TypeWidth) {
+ unsigned MaskShift =
+ IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
+ APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
+ if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI))
return true;
}
return false;
}
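// A worked instance of the final case (illustrative i8 values):
//   lshr (shl X, 5), 2 --> shl X, 3
// requires bits 3 and 4 of X (mask 0x18) to be known zero; otherwise an 'and'
// with 0x38 would be needed, and the fold is not considered profitable.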
-/// See if we can compute the specified value, but shifted
-/// logically to the left or right by some number of bits. This should return
-/// true if the expression can be computed for the same cost as the current
-/// expression tree. This is used to eliminate extraneous shifting from things
-/// like:
+/// See if we can compute the specified value, but shifted logically to the left
+/// or right by some number of bits. This should return true if the expression
+/// can be computed for the same cost as the current expression tree. This is
+/// used to eliminate extraneous shifting from things like:
/// %C = shl i128 %A, 64
/// %D = shl i128 %B, 96
/// %E = or i128 %C, %D
/// %F = lshr i128 %E, 64
-/// where the client will ask if E can be computed shifted right by 64-bits. If
-/// this succeeds, the GetShiftedValue function will be called to produce the
-/// value.
-static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
+/// where the client will ask if E can be computed shifted right by 64 bits. If
+/// this succeeds, getShiftedValue() will be called to produce the value.
+static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
InstCombiner &IC, Instruction *CxtI) {
// We can always evaluate constants shifted.
if (isa<Constant>(V))
@@ -165,8 +163,8 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
case Instruction::Or:
case Instruction::Xor:
// Bitwise operators can all be evaluated in shifted form.
- return CanEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
- CanEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
+ return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
+ canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
case Instruction::Shl:
case Instruction::LShr:
@@ -176,8 +174,8 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
SelectInst *SI = cast<SelectInst>(I);
Value *TrueVal = SI->getTrueValue();
Value *FalseVal = SI->getFalseValue();
- return CanEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
- CanEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
+ return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
+ canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
}
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
@@ -185,23 +183,86 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
// instructions with a single use.
PHINode *PN = cast<PHINode>(I);
for (Value *IncValue : PN->incoming_values())
- if (!CanEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
+ if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
return false;
return true;
}
}
}
-/// When CanEvaluateShifted returned true for an expression,
-/// this value inserts the new computation that produces the shifted value.
-static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+/// Fold OuterShift (InnerShift X, C1), C2.
+/// See canEvaluateShiftedShift() for the constraints on these instructions.
+static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
+ bool IsOuterShl,
+ InstCombiner::BuilderTy &Builder) {
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ Type *ShType = InnerShift->getType();
+ unsigned TypeWidth = ShType->getScalarSizeInBits();
+
+ // We only accept shifts-by-a-constant in canEvaluateShifted().
+ const APInt *C1;
+ match(InnerShift->getOperand(1), m_APInt(C1));
+ unsigned InnerShAmt = C1->getZExtValue();
+
+ // Change the shift amount and clear the appropriate IR flags.
+ auto NewInnerShift = [&](unsigned ShAmt) {
+ InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt));
+ if (IsInnerShl) {
+ InnerShift->setHasNoUnsignedWrap(false);
+ InnerShift->setHasNoSignedWrap(false);
+ } else {
+ InnerShift->setIsExact(false);
+ }
+ return InnerShift;
+ };
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ if (IsInnerShl == IsOuterShl) {
+ // If this is an oversized composite shift, then unsigned shifts get 0.
+ if (InnerShAmt + OuterShAmt >= TypeWidth)
+ return Constant::getNullValue(ShType);
+
+ return NewInnerShift(InnerShAmt + OuterShAmt);
+ }
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ if (InnerShAmt == OuterShAmt) {
+ APInt Mask = IsInnerShl
+ ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt)
+ : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt);
+ Value *And = Builder.CreateAnd(InnerShift->getOperand(0),
+ ConstantInt::get(ShType, Mask));
+ if (auto *AndI = dyn_cast<Instruction>(And)) {
+ AndI->moveBefore(InnerShift);
+ AndI->takeName(InnerShift);
+ }
+ return And;
+ }
+
+ assert(InnerShAmt > OuterShAmt &&
+ "Unexpected opposite direction logical shift pair");
+
+ // In general, we would need an 'and' for this transform, but
+ // canEvaluateShiftedShift() guarantees that the masked-off bits are not used.
+ // lshr (shl X, C1), C2 --> shl X, C1 - C2
+ // shl (lshr X, C1), C2 --> lshr X, C1 - C2
+ return NewInnerShift(InnerShAmt - OuterShAmt);
+}
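+
+// Illustrative check of the equal-amount case above (values invented for this
+// note): for "shl (lshr i8 X, 4), 4", foldShiftedShift() returns "and X, 0xF0",
+// clearing the low 4 bits exactly as the shift pair would.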
+
+/// When canEvaluateShifted() returns true for an expression, this function
+/// inserts the new computation that produces the shifted value.
+static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
InstCombiner &IC, const DataLayout &DL) {
// We can always evaluate constants shifted.
if (Constant *C = dyn_cast<Constant>(V)) {
if (isLeftShift)
- V = IC.Builder->CreateShl(C, NumBits);
+ V = IC.Builder.CreateShl(C, NumBits);
else
- V = IC.Builder->CreateLShr(C, NumBits);
+ V = IC.Builder.CreateLShr(C, NumBits);
// If we got a constantexpr back, try to simplify it with DataLayout info.
if (auto *C = dyn_cast<Constant>(V))
if (auto *FoldedC =
@@ -220,100 +281,21 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
case Instruction::Xor:
// Bitwise operators can all be evaluated in shifted form.
I->setOperand(
- 0, GetShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
+ 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
I->setOperand(
- 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
return I;
- case Instruction::Shl: {
- BinaryOperator *BO = cast<BinaryOperator>(I);
- unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
-
- // We only accept shifts-by-a-constant in CanEvaluateShifted.
- ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
-
- // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
- if (isLeftShift) {
- // If this is oversized composite shift, then unsigned shifts get 0.
- unsigned NewShAmt = NumBits+CI->getZExtValue();
- if (NewShAmt >= TypeWidth)
- return Constant::getNullValue(I->getType());
-
- BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
- BO->setHasNoUnsignedWrap(false);
- BO->setHasNoSignedWrap(false);
- return I;
- }
-
- // We turn shl(c)+lshr(c) -> and(c2) if the input doesn't already have
- // zeros.
- if (CI->getValue() == NumBits) {
- APInt Mask(APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits));
- V = IC.Builder->CreateAnd(BO->getOperand(0),
- ConstantInt::get(BO->getContext(), Mask));
- if (Instruction *VI = dyn_cast<Instruction>(V)) {
- VI->moveBefore(BO);
- VI->takeName(BO);
- }
- return V;
- }
-
- // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
- // the and won't be needed.
- assert(CI->getZExtValue() > NumBits);
- BO->setOperand(1, ConstantInt::get(BO->getType(),
- CI->getZExtValue() - NumBits));
- BO->setHasNoUnsignedWrap(false);
- BO->setHasNoSignedWrap(false);
- return BO;
- }
- // FIXME: This is almost identical to the SHL case. Refactor both cases into
- // a helper function.
- case Instruction::LShr: {
- BinaryOperator *BO = cast<BinaryOperator>(I);
- unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
- // We only accept shifts-by-a-constant in CanEvaluateShifted.
- ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
-
- // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
- if (!isLeftShift) {
- // If this is oversized composite shift, then unsigned shifts get 0.
- unsigned NewShAmt = NumBits+CI->getZExtValue();
- if (NewShAmt >= TypeWidth)
- return Constant::getNullValue(BO->getType());
-
- BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
- BO->setIsExact(false);
- return I;
- }
-
- // We turn lshr(c)+shl(c) -> and(c2) if the input doesn't already have
- // zeros.
- if (CI->getValue() == NumBits) {
- APInt Mask(APInt::getHighBitsSet(TypeWidth, TypeWidth - NumBits));
- V = IC.Builder->CreateAnd(I->getOperand(0),
- ConstantInt::get(BO->getContext(), Mask));
- if (Instruction *VI = dyn_cast<Instruction>(V)) {
- VI->moveBefore(I);
- VI->takeName(I);
- }
- return V;
- }
-
- // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that
- // the and won't be needed.
- assert(CI->getZExtValue() > NumBits);
- BO->setOperand(1, ConstantInt::get(BO->getType(),
- CI->getZExtValue() - NumBits));
- BO->setIsExact(false);
- return BO;
- }
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift,
+ IC.Builder);
case Instruction::Select:
I->setOperand(
- 1, GetShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
I->setOperand(
- 2, GetShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
+ 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
return I;
case Instruction::PHI: {
// We can change a phi if we can change all operands. Note that we never
@@ -321,215 +303,39 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
// instructions with a single use.
PHINode *PN = cast<PHINode>(I);
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- PN->setIncomingValue(i, GetShiftedValue(PN->getIncomingValue(i), NumBits,
+ PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits,
isLeftShift, IC, DL));
return PN;
}
}
}
-/// Try to fold (X << C1) << C2, where the shifts are some combination of
-/// shl/ashr/lshr.
-static Instruction *
-foldShiftByConstOfShiftByConst(BinaryOperator &I, ConstantInt *COp1,
- InstCombiner::BuilderTy *Builder) {
- Value *Op0 = I.getOperand(0);
- uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
-
- // Find out if this is a shift of a shift by a constant.
- BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
- if (ShiftOp && !ShiftOp->isShift())
- ShiftOp = nullptr;
-
- if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
-
- // This is a constant shift of a constant shift. Be careful about hiding
- // shl instructions behind bit masks. They are used to represent multiplies
- // by a constant, and it is important that simple arithmetic expressions
- // are still recognizable by scalar evolution.
- //
- // The transforms applied to shl are very similar to the transforms applied
- // to mul by constant. We can be more aggressive about optimizing right
- // shifts.
- //
- // Combinations of right and left shifts will still be optimized in
- // DAGCombine where scalar evolution no longer applies.
-
- ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
- uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
- uint32_t ShiftAmt2 = COp1->getLimitedValue(TypeBits);
- assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
- if (ShiftAmt1 == 0)
- return nullptr; // Will be simplified in the future.
- Value *X = ShiftOp->getOperand(0);
-
- IntegerType *Ty = cast<IntegerType>(I.getType());
-
- // Check for (X << c1) << c2 and (X >> c1) >> c2
- if (I.getOpcode() == ShiftOp->getOpcode()) {
- uint32_t AmtSum = ShiftAmt1 + ShiftAmt2; // Fold into one big shift.
- // If this is an oversized composite shift, then unsigned shifts become
- // zero (handled in InstSimplify) and ashr saturates.
- if (AmtSum >= TypeBits) {
- if (I.getOpcode() != Instruction::AShr)
- return nullptr;
- AmtSum = TypeBits - 1; // Saturate to 31 for i32 ashr.
- }
-
- return BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, AmtSum));
- }
-
- if (ShiftAmt1 == ShiftAmt2) {
- // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
- return BinaryOperator::CreateAnd(
- X, ConstantInt::get(I.getContext(), Mask));
- }
- } else if (ShiftAmt1 < ShiftAmt2) {
- uint32_t ShiftDiff = ShiftAmt2 - ShiftAmt1;
-
- // (X >>?,exact C1) << C2 --> X << (C2-C1)
- // The inexact version is deferred to DAGCombine so we don't hide shl
- // behind a bit mask.
- if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl && ShiftOp->isExact()) {
- assert(ShiftOp->getOpcode() == Instruction::LShr ||
- ShiftOp->getOpcode() == Instruction::AShr);
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShl =
- BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
- NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
- return NewShl;
- }
-
- // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2)
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- // (X <<nuw C1) >>u C2 --> X >>u (C2-C1)
- if (ShiftOp->hasNoUnsignedWrap()) {
- BinaryOperator *NewLShr =
- BinaryOperator::Create(Instruction::LShr, X, ShiftDiffCst);
- NewLShr->setIsExact(I.isExact());
- return NewLShr;
- }
- Value *Shift = Builder->CreateLShr(X, ShiftDiffCst);
-
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(
- Shift, ConstantInt::get(I.getContext(), Mask));
- }
-
- // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- if (I.getOpcode() == Instruction::AShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- if (ShiftOp->hasNoSignedWrap()) {
- // (X <<nsw C1) >>s C2 --> X >>s (C2-C1)
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewAShr =
- BinaryOperator::Create(Instruction::AShr, X, ShiftDiffCst);
- NewAShr->setIsExact(I.isExact());
- return NewAShr;
- }
- }
- } else {
- assert(ShiftAmt2 < ShiftAmt1);
- uint32_t ShiftDiff = ShiftAmt1 - ShiftAmt2;
-
- // (X >>?exact C1) << C2 --> X >>?exact (C1-C2)
- // The inexact version is deferred to DAGCombine so we don't hide shl
- // behind a bit mask.
- if (I.getOpcode() == Instruction::Shl &&
- ShiftOp->getOpcode() != Instruction::Shl && ShiftOp->isExact()) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShr =
- BinaryOperator::Create(ShiftOp->getOpcode(), X, ShiftDiffCst);
- NewShr->setIsExact(true);
- return NewShr;
- }
-
- // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2)
- if (I.getOpcode() == Instruction::LShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- if (ShiftOp->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X <<nuw (C1-C2)
- BinaryOperator *NewShl =
- BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
- NewShl->setHasNoUnsignedWrap(true);
- return NewShl;
- }
- Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
-
- APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
- return BinaryOperator::CreateAnd(
- Shift, ConstantInt::get(I.getContext(), Mask));
- }
-
- // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- if (I.getOpcode() == Instruction::AShr &&
- ShiftOp->getOpcode() == Instruction::Shl) {
- if (ShiftOp->hasNoSignedWrap()) {
- // (X <<nsw C1) >>s C2 --> X <<nsw (C1-C2)
- ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff);
- BinaryOperator *NewShl =
- BinaryOperator::Create(Instruction::Shl, X, ShiftDiffCst);
- NewShl->setHasNoSignedWrap(true);
- return NewShl;
- }
- }
- }
- }
-
- return nullptr;
-}
-
Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I) {
bool isLeftShift = I.getOpcode() == Instruction::Shl;
- ConstantInt *COp1 = nullptr;
- if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(Op1))
- COp1 = dyn_cast_or_null<ConstantInt>(CV->getSplatValue());
- else if (ConstantVector *CV = dyn_cast<ConstantVector>(Op1))
- COp1 = dyn_cast_or_null<ConstantInt>(CV->getSplatValue());
- else
- COp1 = dyn_cast<ConstantInt>(Op1);
-
- if (!COp1)
+ const APInt *Op1C;
+ if (!match(Op1, m_APInt(Op1C)))
return nullptr;
// See if we can propagate this shift into the input, this covers the trivial
// cast of lshr(shl(x,c1),c2) as well as other more complex cases.
if (I.getOpcode() != Instruction::AShr &&
- CanEvaluateShifted(Op0, COp1->getZExtValue(), isLeftShift, *this, &I)) {
+ canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
DEBUG(dbgs() << "ICE: getShiftedValue propagating shift through expression"
" to eliminate shift:\n IN: " << *Op0 << "\n SH: " << I <<"\n");
return replaceInstUsesWith(
- I, GetShiftedValue(Op0, COp1->getZExtValue(), isLeftShift, *this, DL));
+ I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
}
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
- uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
+ unsigned TypeBits = Op0->getType()->getScalarSizeInBits();
- assert(!COp1->uge(TypeBits) &&
+ assert(!Op1C->uge(TypeBits) &&
"Shift over the type width should have been removed already");
- // ((X*C1) << C2) == (X * (C1 << C2))
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
- if (BO->getOpcode() == Instruction::Mul && isLeftShift)
- if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
- return BinaryOperator::CreateMul(BO->getOperand(0),
- ConstantExpr::getShl(BOOp, Op1));
-
if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I))
return FoldedShift;
@@ -544,9 +350,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
isa<ConstantInt>(TrOp->getOperand(1))) {
// Okay, we'll do this xform. Make the shift of a shift.
- Constant *ShAmt = ConstantExpr::getZExt(COp1, TrOp->getType());
+ Constant *ShAmt =
+ ConstantExpr::getZExt(cast<Constant>(Op1), TrOp->getType());
// (shift2 (shift1 & 0x00FF), c2)
- Value *NSh = Builder->CreateBinOp(I.getOpcode(), TrOp, ShAmt,I.getName());
+ Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName());
// For logical shifts, the truncation has the effect of making the high
// part of the register be zeros. Emulate this by inserting an AND to
@@ -561,16 +368,16 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
// shift. We know that it is a logical shift by a constant, so adjust the
// mask as appropriate.
if (I.getOpcode() == Instruction::Shl)
- MaskV <<= COp1->getZExtValue();
+ MaskV <<= Op1C->getZExtValue();
else {
assert(I.getOpcode() == Instruction::LShr && "Unknown logical shift");
- MaskV = MaskV.lshr(COp1->getZExtValue());
+ MaskV.lshrInPlace(Op1C->getZExtValue());
}
// shift1 & 0x00FF
- Value *And = Builder->CreateAnd(NSh,
- ConstantInt::get(I.getContext(), MaskV),
- TI->getName());
+ Value *And = Builder.CreateAnd(NSh,
+ ConstantInt::get(I.getContext(), MaskV),
+ TI->getName());
// Return the value truncated to the interesting size.
return new TruncInst(And, I.getType());
@@ -594,11 +401,11 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
m_Specific(Op1)))) {
Value *YS = // (Y << C)
- Builder->CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
+ Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
// (X + (Y << C))
- Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), YS, V1,
- Op0BO->getOperand(1)->getName());
- uint32_t Op1Val = COp1->getLimitedValue(TypeBits);
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1,
+ Op0BO->getOperand(1)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(I.getContext(), Bits);
@@ -614,11 +421,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
m_ConstantInt(CC)))) {
Value *YS = // (Y << C)
- Builder->CreateShl(Op0BO->getOperand(0), Op1,
- Op0BO->getName());
+ Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
// X & (CC << C)
- Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
- V1->getName()+".mask");
+ Value *XM = Builder.CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+ V1->getName()+".mask");
return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
}
LLVM_FALLTHROUGH;
@@ -630,11 +436,11 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
m_Specific(Op1)))) {
Value *YS = // (Y << C)
- Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+ Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
// (X + (Y << C))
- Value *X = Builder->CreateBinOp(Op0BO->getOpcode(), V1, YS,
- Op0BO->getOperand(0)->getName());
- uint32_t Op1Val = COp1->getLimitedValue(TypeBits);
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS,
+ Op0BO->getOperand(0)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(I.getContext(), Bits);
@@ -649,10 +455,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
m_And(m_OneUse(m_Shr(m_Value(V1), m_Value(V2))),
m_ConstantInt(CC))) && V2 == Op1) {
Value *YS = // (Y << C)
- Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+ Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
// X & (CC << C)
- Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
- V1->getName()+".mask");
+ Value *XM = Builder.CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
+ V1->getName()+".mask");
return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
}
@@ -695,7 +501,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
Value *NewShift =
- Builder->CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
+ Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
NewShift->takeName(Op0BO);
return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
@@ -705,9 +511,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
}
}
- if (Instruction *Folded = foldShiftByConstOfShiftByConst(I, COp1, Builder))
- return Folded;
-
return nullptr;
}
@@ -715,59 +518,97 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (Value *V =
- SimplifyShlInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(),
- I.hasNoUnsignedWrap(), DL, &TLI, &DT, &AC))
+ SimplifyShlInst(Op0, Op1, I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Instruction *V = commonShiftTransforms(I))
return V;
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
- unsigned ShAmt = Op1C->getZExtValue();
-
- // Turn:
- // %zext = zext i32 %V to i64
- // %res = shl i64 %V, 8
- //
- // Into:
- // %shl = shl i32 %V, 8
- // %res = zext i32 %shl to i64
- //
- // This is only valid if %V would have zeros shifted out.
- if (auto *ZI = dyn_cast<ZExtInst>(I.getOperand(0))) {
- unsigned SrcBitWidth = ZI->getSrcTy()->getScalarSizeInBits();
- if (ShAmt < SrcBitWidth &&
- MaskedValueIsZero(ZI->getOperand(0),
- APInt::getHighBitsSet(SrcBitWidth, ShAmt), 0, &I)) {
- auto *Shl = Builder->CreateShl(ZI->getOperand(0), ShAmt);
- return new ZExtInst(Shl, I.getType());
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = I.getType()->getScalarSizeInBits();
+ Type *Ty = I.getType();
+
+ // shl (zext X), ShAmt --> zext (shl X, ShAmt)
+ // This is only valid if X would have zeros shifted out.
+ Value *X;
+ if (match(Op0, m_ZExt(m_Value(X)))) {
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt < SrcWidth &&
+ MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
+ return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty);
+ }
+
+ // (X >>u C) << C --> X & (-1 << C)
+ if (match(Op0, m_LShr(m_Value(X), m_Specific(Op1)))) {
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ // Be careful about hiding shl instructions behind bit masks. They are used
+ // to represent multiplies by a constant, and it is important that simple
+ // arithmetic expressions are still recognizable by scalar evolution.
+ // The inexact versions are deferred to DAGCombine, so we don't hide shl
+ // behind a bit mask.
+ const APInt *ShOp1;
+ if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1))))) {
+ unsigned ShrAmt = ShOp1->getZExtValue();
+ if (ShrAmt < ShAmt) {
+ // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return NewShl;
+ }
+ if (ShrAmt > ShAmt) {
+ // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
+ auto *NewShr = BinaryOperator::Create(
+ cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
+ NewShr->setIsExact(true);
+ return NewShr;
}
}
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X << C1) << C2 --> X << (C1 + C2)
+ return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
+ }
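+ // E.g. (illustrative): shl (shl i8 X, 2), 3 --> shl i8 X, 5.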
+
// If the shifted-out value is known-zero, then this is a NUW shift.
if (!I.hasNoUnsignedWrap() &&
- MaskedValueIsZero(I.getOperand(0),
- APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt), 0,
- &I)) {
+ MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
I.setHasNoUnsignedWrap();
return &I;
}
- // If the shifted out value is all signbits, this is a NSW shift.
- if (!I.hasNoSignedWrap() &&
- ComputeNumSignBits(I.getOperand(0), 0, &I) > ShAmt) {
+ // If the shifted-out value is all signbits, then this is a NSW shift.
+ if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
I.setHasNoSignedWrap();
return &I;
}
}
- // (C1 << A) << C2 -> (C1 << C2) << A
- Constant *C1, *C2;
- Value *A;
- if (match(I.getOperand(0), m_OneUse(m_Shl(m_Constant(C1), m_Value(A)))) &&
- match(I.getOperand(1), m_Constant(C2)))
- return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A);
+ Constant *C1;
+ if (match(Op1, m_Constant(C1))) {
+ Constant *C2;
+ Value *X;
+ // (C2 << X) << C1 --> (C2 << C1) << X
+ if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
+ return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
+
+ // (X * C2) << C1 --> X * (C2 << C1)
+ if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
+ }
return nullptr;
}
@@ -776,43 +617,109 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- DL, &TLI, &DT, &AC))
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V =
+ SimplifyLShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
return R;
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- unsigned ShAmt = Op1C->getZExtValue();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op0)) {
- unsigned BitWidth = Op0->getType()->getScalarSizeInBits();
+ Type *Ty = I.getType();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ auto *II = dyn_cast<IntrinsicInst>(Op0);
+ if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
+ (II->getIntrinsicID() == Intrinsic::ctlz ||
+ II->getIntrinsicID() == Intrinsic::cttz ||
+ II->getIntrinsicID() == Intrinsic::ctpop)) {
// ctlz.i32(x)>>5 --> zext(x == 0)
// cttz.i32(x)>>5 --> zext(x == 0)
// ctpop.i32(x)>>5 --> zext(x == -1)
- if ((II->getIntrinsicID() == Intrinsic::ctlz ||
- II->getIntrinsicID() == Intrinsic::cttz ||
- II->getIntrinsicID() == Intrinsic::ctpop) &&
- isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt) {
- bool isCtPop = II->getIntrinsicID() == Intrinsic::ctpop;
- Constant *RHS = ConstantInt::getSigned(Op0->getType(), isCtPop ? -1:0);
- Value *Cmp = Builder->CreateICmpEQ(II->getArgOperand(0), RHS);
- return new ZExtInst(Cmp, II->getType());
+ bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop;
+ Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? -1 : 0);
+ Value *Cmp = Builder.CreateICmpEQ(II->getArgOperand(0), RHS);
+ return new ZExtInst(Cmp, Ty);
+ }
+
+ Value *X;
+ const APInt *ShOp1;
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
+ auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
+ NewLShr->setIsExact(I.isExact());
+ return NewLShr;
+ }
+ // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
+ Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
+ }
+ if (ShlAmt > ShAmt) {
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(true);
+ return NewShl;
+ }
+ // (X << C1) >>u C2 --> (X << (C1 - C2)) & (-1 >> C2)
+ Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ }
+ assert(ShlAmt == ShAmt);
+ // (X << C) >>u C --> X & (-1 >>u C)
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
+ // Are we moving the sign bit to the low bit and widening with high zeros?
+ unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt == BitWidth - 1) {
+ // lshr (sext i1 X to iN), N-1 --> zext X to iN
+ if (SrcTyBitWidth == 1)
+ return new ZExtInst(X, Ty);
+
+ // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
+ if (Op0->hasOneUse()) {
+ Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1);
+ return new ZExtInst(NewLShr, Ty);
+ }
+ }
+
+ // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
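+ // E.g. (illustrative): lshr (sext i8 X to i32), 24
+ //                  --> zext (ashr i8 X, 7) to i32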
+ if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
+ // The new shift amount can't be more than the width of the narrow source type.
+ unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
+ Value *AShr = Builder.CreateAShr(X, NewShAmt);
+ return new ZExtInst(AShr, Ty);
}
}
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
+ return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
+ }
+
// If the shifted-out value is known-zero, then this is an exact shift.
if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(Op1C->getBitWidth(), ShAmt),
- 0, &I)){
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
I.setIsExact();
return &I;
}
}
-
return nullptr;
}
@@ -820,48 +727,67 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
if (Value *V = SimplifyVectorOp(I))
return replaceInstUsesWith(I, V);
- if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- DL, &TLI, &DT, &AC))
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V =
+ SimplifyAShrInst(Op0, Op1, I.isExact(), SQ.getWithInstruction(&I)))
return replaceInstUsesWith(I, V);
if (Instruction *R = commonShiftTransforms(I))
return R;
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- unsigned ShAmt = Op1C->getZExtValue();
+ Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
- // If the input is a SHL by the same constant (ashr (shl X, C), C), then we
- // have a sign-extend idiom.
+ // If the shift amount equals the difference in width of the destination
+ // and source scalar types:
+ // ashr (shl (zext X), C), C --> sext X
Value *X;
- if (match(Op0, m_Shl(m_Value(X), m_Specific(Op1)))) {
- // If the input is an extension from the shifted amount value, e.g.
- // %x = zext i8 %A to i32
- // %y = shl i32 %x, 24
- // %z = ashr %y, 24
- // then turn this into "z = sext i8 A to i32".
- if (ZExtInst *ZI = dyn_cast<ZExtInst>(X)) {
- uint32_t SrcBits = ZI->getOperand(0)->getType()->getScalarSizeInBits();
- uint32_t DestBits = ZI->getType()->getScalarSizeInBits();
- if (Op1C->getZExtValue() == DestBits-SrcBits)
- return new SExtInst(ZI->getOperand(0), ZI->getType());
+ if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) &&
+ ShAmt == BitWidth - X->getType()->getScalarSizeInBits())
+ return new SExtInst(X, Ty);
+
+ // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
+ // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+ const APInt *ShOp1;
+ if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff);
+ NewAShr->setIsExact(I.isExact());
+ return NewAShr;
}
+ if (ShlAmt > ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff);
+ NewShl->setHasNoSignedWrap(true);
+ return NewShl;
+ }
+ }
+
+ if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized arithmetic shifts replicate the sign bit.
+ AmtSum = std::min(AmtSum, BitWidth - 1);
+ // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
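+ // E.g. (illustrative): ashr (ashr i8 X, 4), 6 --> ashr i8 X, 7.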
+ return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
}
// If the shifted-out value is known-zero, then this is an exact shift.
if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(Op1C->getBitWidth(), ShAmt),
- 0, &I)) {
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
I.setIsExact();
return &I;
}
}
// See if we can turn a signed shr into an unsigned shr.
- if (MaskedValueIsZero(Op0,
- APInt::getSignBit(I.getType()->getScalarSizeInBits()),
- 0, &I))
+ if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I))
return BinaryOperator::CreateLShr(Op0, Op1);
return nullptr;
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8b930bd..a20f474 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -26,22 +27,22 @@ using namespace llvm::PatternMatch;
/// constant integer. If so, check to see if there are any bits set in the
/// constant that are not demanded. If so, shrink the constant and return true.
static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
- APInt Demanded) {
+ const APInt &Demanded) {
assert(I && "No instruction?");
assert(OpNo < I->getNumOperands() && "Operand index too large");
- // If the operand is not a constant integer, nothing to do.
- ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
- if (!OpC) return false;
+ // The operand must be a constant integer or splat integer.
+ Value *Op = I->getOperand(OpNo);
+ const APInt *C;
+ if (!match(Op, m_APInt(C)))
+ return false;
// If there are no bits set that aren't demanded, nothing to do.
- Demanded = Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
- if ((~Demanded & OpC->getValue()) == 0)
+ if (C->isSubsetOf(Demanded))
return false;
// This instruction is producing bits that are not demanded. Shrink the RHS.
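// E.g. (illustrative values): with Demanded = 0x0F, a constant operand of
// 0xFF has set bits outside the demanded mask and is shrunk to 0x0F below.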
- Demanded &= OpC->getValue();
- I->setOperand(OpNo, ConstantInt::get(OpC->getType(), Demanded));
+ I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
return true;
}
@@ -52,10 +53,10 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
/// the instruction has any properties that allow us to simplify its operands.
bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+ KnownBits Known(BitWidth);
APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
- Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, KnownZero, KnownOne,
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
0, &Inst);
if (!V) return false;
if (V == &Inst) return true;
@@ -66,12 +67,13 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
/// This form of SimplifyDemandedBits simplifies the specified instruction
/// operand if possible, updating it in place. It returns true if it made any
/// change and false otherwise.
-bool InstCombiner::SimplifyDemandedBits(Use &U, const APInt &DemandedMask,
- APInt &KnownZero, APInt &KnownOne,
+bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
+ const APInt &DemandedMask,
+ KnownBits &Known,
unsigned Depth) {
- auto *UserI = dyn_cast<Instruction>(U.getUser());
- Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero,
- KnownOne, Depth, UserI);
+ Use &U = I->getOperandUse(OpNo);
+ Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
+ Depth, I);
if (!NewVal) return false;
U = NewVal;
return true;
@@ -85,15 +87,16 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, const APInt &DemandedMask,
/// with a constant or one of its operands. In such cases, this function does
/// the replacement and returns true. In all other cases, it returns false after
/// analyzing the expression and setting the bits that are known to be one in the
-/// expression. KnownZero contains all the bits that are known to be zero in the
-/// expression. These are provided to potentially allow the caller (which might
-/// recursively be SimplifyDemandedBits itself) to simplify the expression.
-/// KnownOne and KnownZero always follow the invariant that:
-/// KnownOne & KnownZero == 0.
-/// That is, a bit can't be both 1 and 0. Note that the bits in KnownOne and
-/// KnownZero may only be accurate for those bits set in DemandedMask. Note also
-/// that the bitwidth of V, DemandedMask, KnownZero and KnownOne must all be the
-/// same.
+/// expression. Known.Zero contains all the bits that are known to be zero in
+/// the expression. These are provided to potentially allow the caller (which
+/// might recursively be SimplifyDemandedBits itself) to simplify the
+/// expression.
+/// Known.One and Known.Zero always follow the invariant that:
+/// Known.One & Known.Zero == 0.
+/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
+/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
+/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
+/// be the same.
///
/// This returns null if it did not change anything and it permits no
/// simplification. This returns V itself if it did some simplification of V's
@@ -101,8 +104,7 @@ bool InstCombiner::SimplifyDemandedBits(Use &U, const APInt &DemandedMask,
/// some other non-null value if it found out that V is equal to another value
/// in the context where the specified bits are demanded, but not for all users.
Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
- APInt &KnownZero, APInt &KnownOne,
- unsigned Depth,
+ KnownBits &Known, unsigned Depth,
Instruction *CxtI) {
assert(V != nullptr && "Null pointer of Value???");
assert(Depth <= 6 && "Limit Search Depth");
@@ -110,246 +112,144 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Type *VTy = V->getType();
assert(
(!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
- KnownZero.getBitWidth() == BitWidth &&
- KnownOne.getBitWidth() == BitWidth &&
- "Value *V, DemandedMask, KnownZero and KnownOne "
- "must have same BitWidth");
- if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
- // We know all of the bits for a constant!
- KnownOne = CI->getValue() & DemandedMask;
- KnownZero = ~KnownOne & DemandedMask;
- return nullptr;
- }
- if (isa<ConstantPointerNull>(V)) {
- // We know all of the bits for a constant!
- KnownOne.clearAllBits();
- KnownZero = DemandedMask;
+ Known.getBitWidth() == BitWidth &&
+ "Value *V, DemandedMask and Known must have same BitWidth");
+
+ if (isa<Constant>(V)) {
+ computeKnownBits(V, Known, Depth, CxtI);
return nullptr;
}
- KnownZero.clearAllBits();
- KnownOne.clearAllBits();
- if (DemandedMask == 0) { // Not demanding any bits from V.
- if (isa<UndefValue>(V))
- return nullptr;
+ Known.resetAll();
+ if (DemandedMask.isNullValue()) // Not demanding any bits from V.
return UndefValue::get(VTy);
- }
if (Depth == 6) // Limit search depth.
return nullptr;
- APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- APInt RHSKnownZero(BitWidth, 0), RHSKnownOne(BitWidth, 0);
-
Instruction *I = dyn_cast<Instruction>(V);
if (!I) {
- computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI);
+ computeKnownBits(V, Known, Depth, CxtI);
return nullptr; // Only analyze instructions.
}
// If there are multiple uses of this value and we aren't at the root, then
// we can't do any simplifications of the operands, because DemandedMask
// only reflects the bits demanded by *one* of the users.
- if (Depth != 0 && !I->hasOneUse()) {
- // Despite the fact that we can't simplify this instruction in all User's
- // context, we can at least compute the knownzero/knownone bits, and we can
- // do simplifications that apply to *just* the one user if we know that
- // this instruction has a simpler value in that context.
- if (I->getOpcode() == Instruction::And) {
- // If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
- CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- CxtI);
-
- // If all of the demanded bits are known 1 on one side, return the other.
- // These bits cannot contribute to the result of the 'and' in this
- // context.
- if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
- (DemandedMask & ~LHSKnownZero))
- return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
- (DemandedMask & ~RHSKnownZero))
- return I->getOperand(1);
-
- // If all of the demanded bits in the inputs are known zeros, return zero.
- if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
- return Constant::getNullValue(VTy);
-
- } else if (I->getOpcode() == Instruction::Or) {
- // We can simplify (X|Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- // If either the LHS or the RHS are One, the result is One.
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
- CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- CxtI);
-
- // If all of the demanded bits are known zero on one side, return the
- // other. These bits cannot contribute to the result of the 'or' in this
- // context.
- if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
- (DemandedMask & ~LHSKnownOne))
- return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
- (DemandedMask & ~RHSKnownOne))
- return I->getOperand(1);
-
- // If all of the potentially set bits on one side are known to be set on
- // the other side, just use the 'other' side.
- if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
- (DemandedMask & (~RHSKnownZero)))
- return I->getOperand(0);
- if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
- (DemandedMask & (~LHSKnownZero)))
- return I->getOperand(1);
- } else if (I->getOpcode() == Instruction::Xor) {
- // We can simplify (X^Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- computeKnownBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth + 1,
- CxtI);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- CxtI);
-
- // If all of the demanded bits are known zero on one side, return the
- // other.
- if ((DemandedMask & RHSKnownZero) == DemandedMask)
- return I->getOperand(0);
- if ((DemandedMask & LHSKnownZero) == DemandedMask)
- return I->getOperand(1);
- }
+ if (Depth != 0 && !I->hasOneUse())
+ return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
- // Compute the KnownZero/KnownOne bits to simplify things downstream.
- computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI);
- return nullptr;
- }
+ KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
// If this is the root being simplified, allow it to have multiple uses,
// just set the DemandedMask to all bits so that we can try to simplify the
// operands. This allows visitTruncInst (for example) to simplify the
// operand of a trunc without duplicating all the logic below.
if (Depth == 0 && !V->hasOneUse())
- DemandedMask = APInt::getAllOnesValue(BitWidth);
+ DemandedMask.setAllBits();
switch (I->getOpcode()) {
default:
- computeKnownBits(I, KnownZero, KnownOne, Depth, CxtI);
+ computeKnownBits(I, Known, Depth, CxtI);
break;
- case Instruction::And:
+ case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
- LHSKnownZero, LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
+ Depth + 1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero;
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ APInt IKnownOne = RHSKnown.One & LHSKnown.One;
// If the client is only demanding bits that we know, return the known
// constant.
- if ((DemandedMask & ((RHSKnownZero | LHSKnownZero)|
- (RHSKnownOne & LHSKnownOne))) == DemandedMask)
- return Constant::getIntegerValue(VTy, RHSKnownOne & LHSKnownOne);
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(VTy, IKnownOne);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
- if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) ==
- (DemandedMask & ~LHSKnownZero))
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) ==
- (DemandedMask & ~RHSKnownZero))
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
return I->getOperand(1);
- // If all of the demanded bits in the inputs are known zeros, return zero.
- if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
- return Constant::getNullValue(VTy);
-
// If the RHS is a constant, see if we can simplify it.
- if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
+ if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
return I;
- // Output known-1 bits are only known if set in both the LHS & RHS.
- KnownOne = RHSKnownOne & LHSKnownOne;
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- KnownZero = RHSKnownZero | LHSKnownZero;
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
break;
- case Instruction::Or:
+ }
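
Editorial sketch (not from the patch) of the "all demanded bits known" early exit that each case above now spells as DemandedMask.isSubsetOf(Zero|One); the concrete values are made up for the demo.

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    int main() {
      unsigned BitWidth = 8;
      llvm::APInt DemandedMask(BitWidth, 0x0F);
      llvm::APInt KnownZero(BitWidth, 0x33), KnownOne(BitWidth, 0x0C);
      // 0x33 | 0x0C = 0x3F covers 0x0F, so every demanded bit is known...
      assert(DemandedMask.isSubsetOf(KnownZero | KnownOne));
      // ...and the value can be replaced by its known-one bits, 0x0C.
      assert((KnownOne & DemandedMask) == 0x0C);
      return 0;
    }
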
+ case Instruction::Or: {
// If either the LHS or the RHS are One, the result is One.
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
- LHSKnownZero, LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
+ Depth + 1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ APInt IKnownOne = RHSKnown.One | LHSKnown.One;
// If the client is only demanding bits that we know, return the known
// constant.
- if ((DemandedMask & ((RHSKnownZero & LHSKnownZero)|
- (RHSKnownOne | LHSKnownOne))) == DemandedMask)
- return Constant::getIntegerValue(VTy, RHSKnownOne | LHSKnownOne);
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(VTy, IKnownOne);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
- if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) ==
- (DemandedMask & ~LHSKnownOne))
- return I->getOperand(0);
- if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) ==
- (DemandedMask & ~RHSKnownOne))
- return I->getOperand(1);
-
- // If all of the potentially set bits on one side are known to be set on
- // the other side, just use the 'other' side.
- if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) ==
- (DemandedMask & (~RHSKnownZero)))
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
return I->getOperand(0);
- if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) ==
- (DemandedMask & (~LHSKnownZero)))
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
return I->getOperand(1);
// If the RHS is a constant, see if we can simplify it.
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- KnownZero = RHSKnownZero & LHSKnownZero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- KnownOne = RHSKnownOne | LHSKnownOne;
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
break;
+ }
case Instruction::Xor: {
- if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, LHSKnownZero,
- LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
// Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt IKnownZero = (RHSKnownZero & LHSKnownZero) |
- (RHSKnownOne & LHSKnownOne);
+ APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
+ (RHSKnown.One & LHSKnown.One);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
- APInt IKnownOne = (RHSKnownZero & LHSKnownOne) |
- (RHSKnownOne & LHSKnownZero);
+ APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
+ (RHSKnown.One & LHSKnown.Zero);
// If the client is only demanding bits that we know, return the known
// constant.
- if ((DemandedMask & (IKnownZero|IKnownOne)) == DemandedMask)
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
return Constant::getIntegerValue(VTy, IKnownOne);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
- if ((DemandedMask & RHSKnownZero) == DemandedMask)
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
return I->getOperand(0);
- if ((DemandedMask & LHSKnownZero) == DemandedMask)
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
// If all of the demanded bits are known to be zero on one side or the
// other, turn this into an *inclusive* or.
// e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
- if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) {
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
Instruction *Or =
BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
I->getName());
@@ -360,14 +260,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// bits on that side are also known to be set on the other side, turn this
// into an AND, as we know the bits will be cleared.
// e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
- if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) {
- // all known
- if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
- Constant *AndC = Constant::getIntegerValue(VTy,
- ~RHSKnownOne & DemandedMask);
- Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
- return InsertNewInstWith(And, *I);
- }
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
+ RHSKnown.One.isSubsetOf(LHSKnown.One)) {
+ Constant *AndC = Constant::getIntegerValue(VTy,
+ ~RHSKnown.One & DemandedMask);
+ Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+ return InsertNewInstWith(And, *I);
}
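
A plain-integer sketch of the xor-to-and rewrite above (illustrative only, values invented): when every set bit of C2 is also forced on by the inner 'or' (C1 & C2 == C2), xoring by C2 can only clear bits, which is exactly an 'and' with ~C2.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t C1 = 0xF0, C2 = 0x30; // precondition: C1 & C2 == C2
      assert((C1 & C2) == C2);
      uint32_t Tests[] = {0, 0x5A, 0xFF, 0x1234};
      for (uint32_t X : Tests) {
        uint32_t Or = X | C1;
        // The two forms agree for any X once the precondition holds.
        assert((Or ^ C2) == (Or & ~C2));
      }
      return 0;
    }
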
// If the RHS is a constant, see if we can simplify it.
@@ -383,10 +281,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
isa<ConstantInt>(I->getOperand(1)) &&
isa<ConstantInt>(LHSInst->getOperand(1)) &&
- (LHSKnownOne & RHSKnownOne & DemandedMask) != 0) {
+ (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
ConstantInt *AndRHS = cast<ConstantInt>(LHSInst->getOperand(1));
ConstantInt *XorRHS = cast<ConstantInt>(I->getOperand(1));
- APInt NewMask = ~(LHSKnownOne & RHSKnownOne & DemandedMask);
+ APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
Constant *AndC =
ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
@@ -400,9 +298,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
}
// Output known-0 bits are known if clear or set in both the LHS & RHS.
- KnownZero= (RHSKnownZero & LHSKnownZero) | (RHSKnownOne & LHSKnownOne);
+ Known.Zero = std::move(IKnownZero);
// Output known-1 are known to be set if set in only one of the LHS, RHS.
- KnownOne = (RHSKnownZero & LHSKnownOne) | (RHSKnownOne & LHSKnownZero);
+ Known.One = std::move(IKnownOne);
break;
}
case Instruction::Select:
@@ -412,13 +310,11 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
return nullptr;
- if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,
- RHSKnownOne, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero,
- LHSKnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
return I;
- assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
- assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
@@ -426,21 +322,22 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return I;
// Only known if known in both the LHS and RHS.
- KnownOne = RHSKnownOne & LHSKnownOne;
- KnownZero = RHSKnownZero & LHSKnownZero;
+ Known.One = RHSKnown.One & LHSKnown.One;
+ Known.Zero = RHSKnown.Zero & LHSKnown.Zero;
break;
+ case Instruction::ZExt:
case Instruction::Trunc: {
- unsigned truncBf = I->getOperand(0)->getType()->getScalarSizeInBits();
- DemandedMask = DemandedMask.zext(truncBf);
- KnownZero = KnownZero.zext(truncBf);
- KnownOne = KnownOne.zext(truncBf);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
- KnownOne, Depth + 1))
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+ APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
return I;
- DemandedMask = DemandedMask.trunc(BitWidth);
- KnownZero = KnownZero.trunc(BitWidth);
- KnownOne = KnownOne.trunc(BitWidth);
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+ Known = Known.zextOrTrunc(BitWidth);
+ // Any top bits are known to be zero.
+ if (BitWidth > SrcBitWidth)
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
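
Hedged sketch of the merged ZExt/Trunc bookkeeping above, using the same KnownBits calls the patch introduces; the i8-to-i32 widths and bit values are assumptions for the demo.

    #include "llvm/Support/KnownBits.h"
    #include <cassert>

    int main() {
      llvm::KnownBits InputKnown(8);          // bits known at the i8 source
      InputKnown.One = llvm::APInt(8, 0x80);
      InputKnown.Zero = llvm::APInt(8, 0x7F);
      llvm::KnownBits Known = InputKnown.zextOrTrunc(32);
      Known.Zero.setBitsFrom(8);              // zext: bits 8..31 known zero
      assert(Known.One == 0x80 && !Known.hasConflict());
      return 0;
    }
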
case Instruction::BitCast:
@@ -460,65 +357,38 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Don't touch a vector-to-scalar bitcast.
return nullptr;
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
- KnownOne, Depth + 1))
- return I;
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
- break;
- case Instruction::ZExt: {
- // Compute the bits in the result that are not present in the input.
- unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits();
-
- DemandedMask = DemandedMask.trunc(SrcBitWidth);
- KnownZero = KnownZero.trunc(SrcBitWidth);
- KnownOne = KnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
return I;
- DemandedMask = DemandedMask.zext(BitWidth);
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
- // The top bits are known to be zero.
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
- }
case Instruction::SExt: {
// Compute the bits in the result that are not present in the input.
- unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits();
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
- APInt InputDemandedBits = DemandedMask &
- APInt::getLowBitsSet(BitWidth, SrcBitWidth);
+ APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
- APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
// If any of the sign extended bits are demanded, we know that the sign
// bit is demanded.
- if ((NewBits & DemandedMask) != 0)
+ if (DemandedMask.getActiveBits() > SrcBitWidth)
InputDemandedBits.setBit(SrcBitWidth-1);
- InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth);
- KnownZero = KnownZero.trunc(SrcBitWidth);
- KnownOne = KnownOne.trunc(SrcBitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits, KnownZero,
- KnownOne, Depth + 1))
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
return I;
- InputDemandedBits = InputDemandedBits.zext(BitWidth);
- KnownZero = KnownZero.zext(BitWidth);
- KnownOne = KnownOne.zext(BitWidth);
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
-
- // If the sign bit of the input is known set or clear, then we know the
- // top bits of the result.
// If the input sign bit is known zero, or if the NewBits are not demanded
// convert this into a zero extension.
- if (KnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) {
- // Convert to ZExt cast
+ if (InputKnown.isNonNegative() ||
+ DemandedMask.getActiveBits() <= SrcBitWidth) {
+ // Convert to ZExt cast.
CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
return InsertNewInstWith(NewCast, *I);
- } else if (KnownOne[SrcBitWidth-1]) { // Input sign bit known set
- KnownOne |= NewBits;
- }
+ }
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ Known = InputKnown.sext(BitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
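
Sketch (not from the patch) of the sext-to-zext test above: if the source sign bit is known zero, sign extension adds only zero bits, so InputKnown.isNonNegative() licenses the ZExt; the i16/i32 widths are invented.

    #include "llvm/Support/KnownBits.h"
    #include <cassert>

    int main() {
      llvm::KnownBits InputKnown(16);
      InputKnown.Zero.setSignBit();        // the i16 input's sign bit is 0
      assert(InputKnown.isNonNegative());  // so the SExt may become a ZExt
      llvm::KnownBits Wide = InputKnown.sext(32);
      assert(Wide.Zero.isSignBitSet());    // the replicated top bits are zeros
      return 0;
    }
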
case Instruction::Add:
@@ -530,11 +400,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Right fill the mask of bits for this ADD/SUB to demand the most
// significant bit and all those below it.
APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth + 1) ||
+ if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
- SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
- LHSKnownZero, LHSKnownOne, Depth + 1)) {
+ SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
// Disable the nsw and nuw flags here: We can no longer guarantee that
// we won't wrap after simplification. Removing the nsw/nuw flags is
// legal here because the top bit is not demanded.
@@ -543,24 +412,33 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
BinOP.setHasNoUnsignedWrap(false);
return I;
}
+
+ // If we are known to be adding/subtracting zeros to every bit below
+ // the highest demanded bit, we just return the other side.
+ if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ // We can't do this with the LHS for subtraction, unless we are only
+ // demanding the LSB.
+ if ((I->getOpcode() == Instruction::Add ||
+ DemandedFromOps.isOneValue()) &&
+ DemandedFromOps.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
}
// Otherwise just hand the add/sub off to computeKnownBits to fill in
// the known zeros and ones.
- computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI);
+ computeKnownBits(V, Known, Depth, CxtI);
break;
}
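
Illustrative sketch of the new add/sub shortcut: if every bit at and below the highest demanded bit is known zero on one side, no carries can reach the demanded bits, so the other side is the whole answer (widths and masks are assumed for the demo).

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    int main() {
      unsigned BitWidth = 32;
      // Demand the low 8 bits; the RHS's low 12 bits are known zero.
      llvm::APInt DemandedFromOps = llvm::APInt::getLowBitsSet(BitWidth, 8);
      llvm::APInt RHSKnownZero = llvm::APInt::getLowBitsSet(BitWidth, 12);
      // Then 'add %x, %rhs' may be replaced by %x for this user.
      assert(DemandedFromOps.isSubsetOf(RHSKnownZero));
      return 0;
    }
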
- case Instruction::Shl:
- if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
- {
- Value *VarX; ConstantInt *C1;
- if (match(I->getOperand(0), m_Shr(m_Value(VarX), m_ConstantInt(C1)))) {
- Instruction *Shr = cast<Instruction>(I->getOperand(0));
- Value *R = SimplifyShrShlDemandedBits(Shr, I, DemandedMask,
- KnownZero, KnownOne);
- if (R)
- return R;
- }
+ case Instruction::Shl: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ const APInt *ShrAmt;
+ if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt)))) {
+ Instruction *Shr = cast<Instruction>(I->getOperand(0));
+ if (Value *R = simplifyShrShlDemandedBits(
+ Shr, *ShrAmt, I, *SA, DemandedMask, Known))
+ return R;
}
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
@@ -569,24 +447,24 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If the shift is NUW/NSW, then it does demand the high bits.
ShlOperator *IOp = cast<ShlOperator>(I);
if (IOp->hasNoSignedWrap())
- DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
+ DemandedMaskIn.setHighBits(ShiftAmt+1);
else if (IOp->hasNoUnsignedWrap())
- DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+ DemandedMaskIn.setHighBits(ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
- KnownZero <<= ShiftAmt;
- KnownOne <<= ShiftAmt;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShiftAmt;
+ Known.One <<= ShiftAmt;
// low bits known zero.
if (ShiftAmt)
- KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ Known.Zero.setLowBits(ShiftAmt);
}
break;
- case Instruction::LShr:
- // For a logical shift right
- if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ }
+ case Instruction::LShr: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
// Unsigned shift right.
@@ -595,27 +473,24 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<LShrOperator>(I)->isExact())
- DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ DemandedMaskIn.setLowBits(ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
- KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
- KnownOne = APIntOps::lshr(KnownOne, ShiftAmt);
- if (ShiftAmt) {
- // Compute the new bits that are at the top now.
- APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
- KnownZero |= HighBits; // high bits known zero.
- }
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
+ if (ShiftAmt)
+ Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
}
break;
- case Instruction::AShr:
+ }
+ case Instruction::AShr: {
// If this is an arithmetic shift right and only the low-bit is set, we can
// always convert this into a logical shr, even if the shift amount is
// variable. The low bit of the shift cannot be an input sign bit unless
// the shift amount is >= the size of the datatype, which is undefined.
- if (DemandedMask == 1) {
+ if (DemandedMask.isOneValue()) {
// Perform the logical shift right.
Instruction *NewVal = BinaryOperator::CreateLShr(
I->getOperand(0), I->getOperand(1), I->getName());
@@ -624,57 +499,58 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If the sign bit is the only bit demanded by this ashr, then there is no
// need to do it, the shift doesn't change the high bit.
- if (DemandedMask.isSignBit())
+ if (DemandedMask.isSignMask())
return I->getOperand(0);
- if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
// Signed shift right.
APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
- // If any of the "high bits" are demanded, we should set the sign bit as
+ // If any of the high bits are demanded, we should set the sign bit as
// demanded.
if (DemandedMask.countLeadingZeros() <= ShiftAmt)
- DemandedMaskIn.setBit(BitWidth-1);
+ DemandedMaskIn.setSignBit();
// If the shift is exact, then it does demand the low bits (and knows that
// they are zero).
if (cast<AShrOperator>(I)->isExact())
- DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+ DemandedMaskIn.setLowBits(ShiftAmt);
- if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
- KnownOne, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
// Compute the new bits that are at the top now.
APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
- KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
- KnownOne = APIntOps::lshr(KnownOne, ShiftAmt);
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
// Handle the sign bits.
- APInt SignBit(APInt::getSignBit(BitWidth));
+ APInt SignMask(APInt::getSignMask(BitWidth));
// Adjust to where it is now in the mask.
- SignBit = APIntOps::lshr(SignBit, ShiftAmt);
+ SignMask.lshrInPlace(ShiftAmt);
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
- if (BitWidth <= ShiftAmt || KnownZero[BitWidth-ShiftAmt-1] ||
- (HighBits & ~DemandedMask) == HighBits) {
- // Perform the logical shift right.
- BinaryOperator *NewVal = BinaryOperator::CreateLShr(I->getOperand(0),
- SA, I->getName());
- NewVal->setIsExact(cast<BinaryOperator>(I)->isExact());
- return InsertNewInstWith(NewVal, *I);
- } else if ((KnownOne & SignBit) != 0) { // New bits are known one.
- KnownOne |= HighBits;
+ if (BitWidth <= ShiftAmt || Known.Zero[BitWidth-ShiftAmt-1] ||
+ !DemandedMask.intersects(HighBits)) {
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
+ I->getOperand(1));
+ LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
+ return InsertNewInstWith(LShr, *I);
+ } else if (Known.One.intersects(SignMask)) { // New bits are known one.
+ Known.One |= HighBits;
}
}
break;
+ }
case Instruction::SRem:
if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
// X % -1 demands all the bits because we don't want to introduce
// INT_MIN % -1 (== undef) by accident.
- if (Rem->isAllOnesValue())
+ if (Rem->isMinusOne())
break;
APInt RA = Rem->getValue().abs();
if (RA.isPowerOf2()) {
@@ -682,53 +558,47 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return I->getOperand(0);
APInt LowBits = RA - 1;
- APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), Mask2, LHSKnownZero,
- LHSKnownOne, Depth + 1))
+ APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
+ if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
return I;
// The low bits of LHS are unchanged by the srem.
- KnownZero = LHSKnownZero & LowBits;
- KnownOne = LHSKnownOne & LowBits;
+ Known.Zero = LHSKnown.Zero & LowBits;
+ Known.One = LHSKnown.One & LowBits;
// If LHS is non-negative or has all low bits zero, then the upper bits
// are all zero.
- if (LHSKnownZero[BitWidth-1] || ((LHSKnownZero & LowBits) == LowBits))
- KnownZero |= ~LowBits;
+ if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
+ Known.Zero |= ~LowBits;
// If LHS is negative and not all low bits are zero, then the upper bits
// are all one.
- if (LHSKnownOne[BitWidth-1] && ((LHSKnownOne & LowBits) != 0))
- KnownOne |= ~LowBits;
+ if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
+ Known.One |= ~LowBits;
- assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
}
}
// The sign bit is the LHS's sign bit, except when the result of the
// remainder is zero.
- if (DemandedMask.isNegative() && KnownZero.isNonNegative()) {
- APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
- computeKnownBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth + 1,
- CxtI);
+ if (DemandedMask.isSignBitSet()) {
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
// If it's known zero, our sign bit is also zero.
- if (LHSKnownZero.isNegative())
- KnownZero.setBit(KnownZero.getBitWidth() - 1);
+ if (LHSKnown.isNonNegative())
+ Known.makeNonNegative();
}
break;
case Instruction::URem: {
- APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
+ KnownBits Known2(BitWidth);
APInt AllOnes = APInt::getAllOnesValue(BitWidth);
- if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes, KnownZero2,
- KnownOne2, Depth + 1) ||
- SimplifyDemandedBits(I->getOperandUse(1), AllOnes, KnownZero2,
- KnownOne2, Depth + 1))
+ if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
return I;
- unsigned Leaders = KnownZero2.countLeadingOnes();
- Leaders = std::max(Leaders,
- KnownZero2.countLeadingOnes());
- KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+ unsigned Leaders = Known2.countMinLeadingZeros();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
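
Sketch of the urem leading-zero reasoning above (values invented for the demo): an unsigned remainder is never wider than its operands, so their common leading zero bits survive into the result.

    #include "llvm/Support/KnownBits.h"
    #include <cassert>

    int main() {
      llvm::KnownBits Known2(32);
      Known2.Zero.setHighBits(16);  // both urem operands fit in 16 bits
      unsigned Leaders = Known2.countMinLeadingZeros();
      assert(Leaders == 16);        // the result keeps those zero bits
      return 0;
    }
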
case Instruction::Call:
@@ -788,29 +658,156 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// If we don't need any of the low bits, then return zero; we know that
// DemandedMask is non-zero already.
APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
- if (DemandedElts == 0)
+ if (DemandedElts.isNullValue())
return ConstantInt::getNullValue(VTy);
// We know that the upper bits are set to zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - ArgWidth);
+ Known.Zero.setBitsFrom(ArgWidth);
return nullptr;
}
case Intrinsic::x86_sse42_crc32_64_64:
- KnownZero = APInt::getHighBitsSet(64, 32);
+ Known.Zero.setBitsFrom(32);
return nullptr;
}
}
- computeKnownBits(V, KnownZero, KnownOne, Depth, CxtI);
+ computeKnownBits(V, Known, Depth, CxtI);
break;
}
// If the client is only demanding bits that we know, return the known
// constant.
- if ((DemandedMask & (KnownZero|KnownOne)) == DemandedMask)
- return Constant::getIntegerValue(VTy, KnownOne);
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+ return nullptr;
+}
+
+/// Helper routine of SimplifyDemandedUseBits. It computes Known
+/// bits. It also tries to handle simplifications that can be done based on
+/// DemandedMask, but without modifying the Instruction.
+Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ KnownBits &Known,
+ unsigned Depth,
+ Instruction *CxtI) {
+ unsigned BitWidth = DemandedMask.getBitWidth();
+ Type *ITy = I->getType();
+
+ KnownBits LHSKnown(BitWidth);
+ KnownBits RHSKnown(BitWidth);
+
+ // Despite the fact that we can't simplify this instruction in all User's
+ // context, we can at least compute the known bits, and we can
+ // do simplifications that apply to *just* the one user if we know that
+ // this instruction has a simpler value in that context.
+ switch (I->getOpcode()) {
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero;
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ APInt IKnownOne = RHSKnown.One & LHSKnown.One;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return I->getOperand(1);
+
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Or: {
+ // We can simplify (X|Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ // If either the LHS or the RHS are One, the result is One.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ APInt IKnownOne = RHSKnown.One | LHSKnown.One;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return I->getOperand(1);
+
+ Known.Zero = std::move(IKnownZero);
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ case Instruction::Xor: {
+ // We can simplify (X^Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
+ (RHSKnown.One & LHSKnown.One);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
+ (RHSKnown.One & LHSKnown.Zero);
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
+ return Constant::getIntegerValue(ITy, IKnownOne);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ Known.Zero = std::move(IKnownZero);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ Known.One = std::move(IKnownOne);
+ break;
+ }
+ default:
+ // Compute the Known bits to simplify things downstream.
+ computeKnownBits(I, Known, Depth, CxtI);
+
+ // If this user is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ break;
+ }
+
return nullptr;
}
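
Editorial sketch of the multi-use 'and' rule above: when every demanded bit is either known zero on the LHS or known one on the RHS, the mask is a no-op for this user and operand 0 can stand in for the whole 'and' (the constants here are hypothetical).

    #include "llvm/ADT/APInt.h"
    #include <cassert>

    int main() {
      unsigned BitWidth = 8;
      llvm::APInt DemandedMask(BitWidth, 0x0F);
      llvm::APInt LHSKnownZero(BitWidth, 0);   // nothing known about the LHS
      llvm::APInt RHSKnownOne(BitWidth, 0x0F); // RHS acts like 'and %x, 0x0F'
      // All demanded bits pass through the mask unchanged.
      assert(DemandedMask.isSubsetOf(LHSKnownZero | RHSKnownOne));
      return 0;
    }
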
+
/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
@@ -828,29 +825,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
///
/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
/// not successful.
-Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
- Instruction *Shl,
- const APInt &DemandedMask,
- APInt &KnownZero,
- APInt &KnownOne) {
-
- const APInt &ShlOp1 = cast<ConstantInt>(Shl->getOperand(1))->getValue();
- const APInt &ShrOp1 = cast<ConstantInt>(Shr->getOperand(1))->getValue();
+Value *
+InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
+ Instruction *Shl, const APInt &ShlOp1,
+ const APInt &DemandedMask,
+ KnownBits &Known) {
if (!ShlOp1 || !ShrOp1)
- return nullptr; // Noop.
+ return nullptr; // No-op.
Value *VarX = Shr->getOperand(0);
Type *Ty = VarX->getType();
- unsigned BitWidth = Ty->getIntegerBitWidth();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
return nullptr; // Undef.
unsigned ShlAmt = ShlOp1.getZExtValue();
unsigned ShrAmt = ShrOp1.getZExtValue();
- KnownOne.clearAllBits();
- KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
- KnownZero &= DemandedMask;
+ Known.One.clearAllBits();
+ Known.Zero.setLowBits(ShlAmt - 1);
+ Known.Zero &= DemandedMask;
APInt BitMask1(APInt::getAllOnesValue(BitWidth));
APInt BitMask2(APInt::getAllOnesValue(BitWidth));
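
A plain-integer sketch of the shr/shl fold documented above (C1 = 8 and C2 = 4 are invented): for C1 >= C2, ((X >> C1) << C2) and X >> (C1 - C2) agree on every bit the shl can still produce, which is what lets the pair collapse to one shift.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0xDEADBEEF;
      uint32_t E1 = (X >> 8) << 4;     // the original shift pair
      uint32_t E2 = X >> (8 - 4);      // the single replacement shift
      uint32_t Demanded = 0x0FFFFFF0;  // bits [27:4], where both agree
      assert((E1 & Demanded) == (E2 & Demanded));
      return 0;
    }
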
@@ -916,7 +910,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
return nullptr;
}
- if (DemandedElts == 0) { // If nothing is demanded, provide undef.
+ if (DemandedElts.isNullValue()) { // If nothing is demanded, provide undef.
UndefElts = EltMask;
return UndefValue::get(V->getType());
}
@@ -1472,14 +1466,213 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
break;
}
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packusdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx512_packssdw_512:
+ case Intrinsic::x86_avx512_packsswb_512:
+ case Intrinsic::x86_avx512_packusdw_512:
+ case Intrinsic::x86_avx512_packuswb_512: {
+ auto *Ty0 = II->getArgOperand(0)->getType();
+ unsigned InnerVWidth = Ty0->getVectorNumElements();
+ assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+ unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+ unsigned VWidthPerLane = VWidth / NumLanes;
+ unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+ // Per lane, pack the elements of the first input and then the second.
+ // e.g.
+ // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+ // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+ for (int OpNum = 0; OpNum != 2; ++OpNum) {
+ APInt OpDemandedElts(InnerVWidth, 0);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ unsigned LaneIdx = Lane * VWidthPerLane;
+ for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+ unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+ if (DemandedElts[Idx])
+ OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+ }
+ }
+
+ // Demand elements from the operand.
+ auto *Op = II->getArgOperand(OpNum);
+ APInt OpUndefElts(InnerVWidth, 0);
+ TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
+ Depth + 1);
+ if (TmpV) {
+ II->setArgOperand(OpNum, TmpV);
+ MadeChange = true;
+ }
+
+ // Pack the operand's UNDEF elements, one lane at a time.
+ OpUndefElts = OpUndefElts.zext(VWidth);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+ LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+ LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
+ UndefElts |= LaneElts;
+ }
+ }
+ break;
+ }
+
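
Editorial sketch of the pack-lane index arithmetic above for the v32i8 = PACK(v16i16, v16i16) case; the loop reproduces the Idx/OpElt mapping from the patch and spot-checks one element.

    #include <cassert>

    int main() {
      // v32i8 PACK(v16i16 X, v16i16 Y): two 128-bit lanes of 16 result bytes.
      const unsigned NumLanes = 2, VWidthPerLane = 16, InnerVWidthPerLane = 8;
      for (unsigned OpNum = 0; OpNum != 2; ++OpNum)
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
          for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
            unsigned Idx = Lane * VWidthPerLane + Elt + InnerVWidthPerLane * OpNum;
            unsigned OpElt = Lane * InnerVWidthPerLane + Elt;
            if (Idx == 24)  // e.g. result byte 24 comes from Y's element 8
              assert(OpNum == 1 && OpElt == 8);
          }
      return 0;
    }
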
+ // PSHUFB
+ case Intrinsic::x86_ssse3_pshuf_b_128:
+ case Intrinsic::x86_avx2_pshuf_b:
+ case Intrinsic::x86_avx512_pshuf_b_512:
+ // PERMILVAR
+ case Intrinsic::x86_avx_vpermilvar_ps:
+ case Intrinsic::x86_avx_vpermilvar_ps_256:
+ case Intrinsic::x86_avx512_vpermilvar_ps_512:
+ case Intrinsic::x86_avx_vpermilvar_pd:
+ case Intrinsic::x86_avx_vpermilvar_pd_256:
+ case Intrinsic::x86_avx512_vpermilvar_pd_512:
+ // PERMV
+ case Intrinsic::x86_avx2_permd:
+ case Intrinsic::x86_avx2_permps: {
+ Value *Op1 = II->getArgOperand(1);
+ TmpV = SimplifyDemandedVectorElts(Op1, DemandedElts, UndefElts,
+ Depth + 1);
+ if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+ break;
+ }
+
// SSE4A instructions leave the upper 64-bits of the 128-bit result
// in an undefined state.
case Intrinsic::x86_sse4a_extrq:
case Intrinsic::x86_sse4a_extrqi:
case Intrinsic::x86_sse4a_insertq:
case Intrinsic::x86_sse4a_insertqi:
- UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2);
+ UndefElts.setHighBits(VWidth / 2);
break;
+ case Intrinsic::amdgcn_buffer_load:
+ case Intrinsic::amdgcn_buffer_load_format:
+ case Intrinsic::amdgcn_image_sample:
+ case Intrinsic::amdgcn_image_sample_cl:
+ case Intrinsic::amdgcn_image_sample_d:
+ case Intrinsic::amdgcn_image_sample_d_cl:
+ case Intrinsic::amdgcn_image_sample_l:
+ case Intrinsic::amdgcn_image_sample_b:
+ case Intrinsic::amdgcn_image_sample_b_cl:
+ case Intrinsic::amdgcn_image_sample_lz:
+ case Intrinsic::amdgcn_image_sample_cd:
+ case Intrinsic::amdgcn_image_sample_cd_cl:
+
+ case Intrinsic::amdgcn_image_sample_c:
+ case Intrinsic::amdgcn_image_sample_c_cl:
+ case Intrinsic::amdgcn_image_sample_c_d:
+ case Intrinsic::amdgcn_image_sample_c_d_cl:
+ case Intrinsic::amdgcn_image_sample_c_l:
+ case Intrinsic::amdgcn_image_sample_c_b:
+ case Intrinsic::amdgcn_image_sample_c_b_cl:
+ case Intrinsic::amdgcn_image_sample_c_lz:
+ case Intrinsic::amdgcn_image_sample_c_cd:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl:
+
+ case Intrinsic::amdgcn_image_sample_o:
+ case Intrinsic::amdgcn_image_sample_cl_o:
+ case Intrinsic::amdgcn_image_sample_d_o:
+ case Intrinsic::amdgcn_image_sample_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_l_o:
+ case Intrinsic::amdgcn_image_sample_b_o:
+ case Intrinsic::amdgcn_image_sample_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_lz_o:
+ case Intrinsic::amdgcn_image_sample_cd_o:
+ case Intrinsic::amdgcn_image_sample_cd_cl_o:
+
+ case Intrinsic::amdgcn_image_sample_c_o:
+ case Intrinsic::amdgcn_image_sample_c_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_d_o:
+ case Intrinsic::amdgcn_image_sample_c_d_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_l_o:
+ case Intrinsic::amdgcn_image_sample_c_b_o:
+ case Intrinsic::amdgcn_image_sample_c_b_cl_o:
+ case Intrinsic::amdgcn_image_sample_c_lz_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_o:
+ case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+
+ case Intrinsic::amdgcn_image_getlod: {
+ if (VWidth == 1 || !DemandedElts.isMask())
+ return nullptr;
+
+ // TODO: Handle 3 vectors when supported in code gen.
+ unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
+ if (NewNumElts == VWidth)
+ return nullptr;
+
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *EltTy = V->getType()->getVectorElementType();
+
+ Type *NewTy = (NewNumElts == 1) ? EltTy :
+ VectorType::get(EltTy, NewNumElts);
+
+ auto IID = II->getIntrinsicID();
+
+ bool IsBuffer = IID == Intrinsic::amdgcn_buffer_load ||
+ IID == Intrinsic::amdgcn_buffer_load_format;
+
+ Function *NewIntrin = IsBuffer ?
+ Intrinsic::getDeclaration(M, IID, NewTy) :
+ // Samplers have 3 mangled types.
+ Intrinsic::getDeclaration(M, IID,
+ { NewTy, II->getArgOperand(0)->getType(),
+ II->getArgOperand(1)->getType()});
+
+ SmallVector<Value *, 5> Args;
+ for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+ Args.push_back(II->getArgOperand(I));
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(II);
+
+ CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
+ NewCall->takeName(II);
+ NewCall->copyMetadata(*II);
+
+ if (!IsBuffer) {
+ ConstantInt *DMask = dyn_cast<ConstantInt>(NewCall->getArgOperand(3));
+ if (DMask) {
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+ unsigned PopCnt = 0;
+ unsigned NewDMask = 0;
+ for (unsigned I = 0; I < 4; ++I) {
+ const unsigned Bit = 1 << I;
+ if (!!(DMaskVal & Bit)) {
+ if (++PopCnt > NewNumElts)
+ break;
+
+ NewDMask |= Bit;
+ }
+ }
+
+ NewCall->setArgOperand(3, ConstantInt::get(DMask->getType(), NewDMask));
+ }
+ }
+
+
+ if (NewNumElts == 1) {
+ return Builder.CreateInsertElement(UndefValue::get(V->getType()),
+ NewCall, static_cast<uint64_t>(0));
+ }
+
+ SmallVector<uint32_t, 8> EltMask;
+ for (unsigned I = 0; I < VWidth; ++I)
+ EltMask.push_back(I);
+
+ Value *Shuffle = Builder.CreateShuffleVector(
+ NewCall, UndefValue::get(NewTy), EltMask);
+
+ MadeChange = true;
+ return Shuffle;
+ }
}
break;
}
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index b2477f6..dd71a31 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -144,8 +144,9 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
}
Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
- if (Value *V = SimplifyExtractElementInst(
- EI.getVectorOperand(), EI.getIndexOperand(), DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyExtractElementInst(EI.getVectorOperand(),
+ EI.getIndexOperand(),
+ SQ.getWithInstruction(&EI)))
return replaceInstUsesWith(EI, V);
// If vector val is constant with all elements the same, replace EI with
@@ -203,11 +204,11 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
if (I->hasOneUse() &&
cheapToScalarize(BO, isa<ConstantInt>(EI.getOperand(1)))) {
Value *newEI0 =
- Builder->CreateExtractElement(BO->getOperand(0), EI.getOperand(1),
- EI.getName()+".lhs");
+ Builder.CreateExtractElement(BO->getOperand(0), EI.getOperand(1),
+ EI.getName()+".lhs");
Value *newEI1 =
- Builder->CreateExtractElement(BO->getOperand(1), EI.getOperand(1),
- EI.getName()+".rhs");
+ Builder.CreateExtractElement(BO->getOperand(1), EI.getOperand(1),
+ EI.getName()+".rhs");
return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(),
newEI0, newEI1, BO);
}
@@ -249,8 +250,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
// Bitcasts can change the number of vector elements, and they cost
// nothing.
if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
- Value *EE = Builder->CreateExtractElement(CI->getOperand(0),
- EI.getIndexOperand());
+ Value *EE = Builder.CreateExtractElement(CI->getOperand(0),
+ EI.getIndexOperand());
Worklist.AddValue(EE);
return CastInst::Create(CI->getOpcode(), EE, EI.getType());
}
@@ -268,20 +269,20 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
Value *Cond = SI->getCondition();
if (Cond->getType()->isVectorTy()) {
- Cond = Builder->CreateExtractElement(Cond,
- EI.getIndexOperand(),
- Cond->getName() + ".elt");
+ Cond = Builder.CreateExtractElement(Cond,
+ EI.getIndexOperand(),
+ Cond->getName() + ".elt");
}
Value *V1Elem
- = Builder->CreateExtractElement(TrueVal,
- EI.getIndexOperand(),
- TrueVal->getName() + ".elt");
+ = Builder.CreateExtractElement(TrueVal,
+ EI.getIndexOperand(),
+ TrueVal->getName() + ".elt");
Value *V2Elem
- = Builder->CreateExtractElement(FalseVal,
- EI.getIndexOperand(),
- FalseVal->getName() + ".elt");
+ = Builder.CreateExtractElement(FalseVal,
+ EI.getIndexOperand(),
+ FalseVal->getName() + ".elt");
return SelectInst::Create(Cond,
V1Elem,
V2Elem,
@@ -440,7 +441,7 @@ static void replaceExtractElements(InsertElementInst *InsElt,
if (!OldExt || OldExt->getParent() != WideVec->getParent())
continue;
auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
- NewExt->insertAfter(WideVec);
+ NewExt->insertAfter(OldExt);
IC.replaceInstUsesWith(*OldExt, NewExt);
}
}
@@ -645,6 +646,36 @@ static Instruction *foldInsSequenceIntoBroadcast(InsertElementInst &InsElt) {
return new ShuffleVectorInst(InsertFirst, UndefValue::get(VT), ZeroMask);
}
+/// If we have an insertelement instruction feeding into another insertelement
+/// and the 2nd is inserting a constant into the vector, canonicalize that
+/// constant insertion before the insertion of a variable:
+///
+/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
+/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
+///
+/// This has the potential of eliminating the 2nd insertelement instruction
+/// via constant folding of the scalar constant into a vector constant.
+static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
+ InstCombiner::BuilderTy &Builder) {
+ auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
+ if (!InsElt1 || !InsElt1->hasOneUse())
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *ScalarC;
+ ConstantInt *IdxC1, *IdxC2;
+ if (match(InsElt1->getOperand(0), m_Value(X)) &&
+ match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
+ match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
+ match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
+ match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
+ Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
+ return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
+ }
+
+ return nullptr;
+}
+
/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
/// --> shufflevector X, CVec', Mask'
static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
@@ -806,6 +837,9 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
return Shuf;
+ if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
+ return NewInsElt;
+
// Turn a sequence of inserts that broadcasts a scalar into a single
// insert + shufflevector.
if (Instruction *Broadcast = foldInsSequenceIntoBroadcast(IE))
@@ -986,9 +1020,9 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
SmallVector<Constant *, 16> MaskValues;
for (int i = 0, e = Mask.size(); i != e; ++i) {
if (Mask[i] == -1)
- MaskValues.push_back(UndefValue::get(Builder->getInt32Ty()));
+ MaskValues.push_back(UndefValue::get(Builder.getInt32Ty()));
else
- MaskValues.push_back(Builder->getInt32(Mask[i]));
+ MaskValues.push_back(Builder.getInt32(Mask[i]));
}
return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
ConstantVector::get(MaskValues));
@@ -1061,7 +1095,7 @@ InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask);
return InsertElementInst::Create(V, I->getOperand(1),
- Builder->getInt32(Index), "", I);
+ Builder.getInt32(Index), "", I);
}
}
llvm_unreachable("failed to reorder elements of vector instruction!");
@@ -1107,12 +1141,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
SmallVector<int, 16> Mask = SVI.getShuffleMask();
Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
- bool MadeChange = false;
-
- // Undefined shuffle mask -> undefined value.
- if (isa<UndefValue>(SVI.getOperand(2)))
- return replaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+ if (auto *V = SimplifyShuffleVectorInst(
+ LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI)))
+ return replaceInstUsesWith(SVI, V);
+ bool MadeChange = false;
unsigned VWidth = SVI.getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
@@ -1209,7 +1242,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (isShuffleExtractingFromLHS(SVI, Mask)) {
Value *V = LHS;
unsigned MaskElems = Mask.size();
- unsigned BegIdx = Mask.front();
VectorType *SrcTy = cast<VectorType>(V->getType());
unsigned VecBitWidth = SrcTy->getBitWidth();
unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
@@ -1223,6 +1255,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// Only visit bitcasts that weren't previously handled.
BCs.push_back(BC);
for (BitCastInst *BC : BCs) {
+ unsigned BegIdx = Mask.front();
Type *TgtTy = BC->getDestTy();
unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
if (!TgtElemBitWidth)
@@ -1242,9 +1275,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
UndefValue::get(Int32Ty));
for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
- V = Builder->CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(ShuffleMask),
- SVI.getName() + ".extract");
+ V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
+ ConstantVector::get(ShuffleMask),
+ SVI.getName() + ".extract");
BegIdx = 0;
}
unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
@@ -1254,10 +1287,10 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
auto *NewBC =
BCAlreadyExists
? NewBCs[CastSrcTy]
- : Builder->CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+ : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
if (!BCAlreadyExists)
NewBCs[CastSrcTy] = NewBC;
- auto *Ext = Builder->CreateExtractElement(
+ auto *Ext = Builder.CreateExtractElement(
NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
// The shufflevector isn't being replaced: the bitcast that used it
// is. InstCombine will visit the newly-created instructions.
diff --git a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 27fc34d..c776656 100644
--- a/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -33,7 +33,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "InstCombineInternal.h"
#include "llvm-c/Initialization.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -60,7 +59,9 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
@@ -82,18 +83,24 @@ static cl::opt<bool>
EnableExpensiveCombines("expensive-combines",
cl::desc("Enable expensive instruction combines"));
+static cl::opt<unsigned>
+MaxArraySize("instcombine-maxarray-size", cl::init(1024),
+ cl::desc("Maximum array size considered when doing a combine"));
+
Value *InstCombiner::EmitGEPOffset(User *GEP) {
- return llvm::EmitGEPOffset(Builder, DL, GEP);
+ return llvm::EmitGEPOffset(&Builder, DL, GEP);
}
/// Return true if it is desirable to convert an integer computation from a
/// given bit width to a new bit width.
-/// We don't want to convert from a legal to an illegal type for example or from
-/// a smaller to a larger illegal type.
-bool InstCombiner::ShouldChangeType(unsigned FromWidth,
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. A width of '1' is always treated as a legal type
+/// because i1 is a fundamental type in IR, and there are many specialized
+/// optimizations for i1 types.
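+/// For example (informal sketch): on a target whose only legal integer types
+/// are i32 and i64, converting i16 to i17 (a larger illegal type) is rejected,
+/// while conversions among i1, i32, and i64 remain acceptable.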
+bool InstCombiner::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
- bool FromLegal = DL.isLegalInteger(FromWidth);
- bool ToLegal = DL.isLegalInteger(ToWidth);
+ bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
+ bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
// If this is a legal integer from type, and the result would be an illegal
// type, don't do the transformation.
@@ -109,14 +116,16 @@ bool InstCombiner::ShouldChangeType(unsigned FromWidth,
}
/// Return true if it is desirable to convert a computation from 'From' to 'To'.
-/// We don't want to convert from a legal to an illegal type for example or from
-/// a smaller to a larger illegal type.
-bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. i1 is always treated as a legal type because it is
+/// a fundamental type in IR, and there are many specialized optimizations for
+/// i1 types.
+bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
assert(From->isIntegerTy() && To->isIntegerTy());
unsigned FromWidth = From->getPrimitiveSizeInBits();
unsigned ToWidth = To->getPrimitiveSizeInBits();
- return ShouldChangeType(FromWidth, ToWidth);
+ return shouldChangeType(FromWidth, ToWidth);
}
// Return true, if No Signed Wrap should be maintained for I.
@@ -140,9 +149,9 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
bool Overflow = false;
if (Opcode == Instruction::Add)
- BVal->sadd_ov(*CVal, Overflow);
+ (void)BVal->sadd_ov(*CVal, Overflow);
else
- BVal->ssub_ov(*CVal, Overflow);
+ (void)BVal->ssub_ov(*CVal, Overflow);
return !Overflow;
}
@@ -247,7 +256,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
Value *C = I.getOperand(1);
// Does "B op C" simplify?
- if (Value *V = SimplifyBinOp(Opcode, B, C, DL)) {
+ if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "A op V".
I.setOperand(0, A);
I.setOperand(1, V);
@@ -276,7 +285,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
Value *C = Op1->getOperand(1);
// Does "A op B" simplify?
- if (Value *V = SimplifyBinOp(Opcode, A, B, DL)) {
+ if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "V op C".
I.setOperand(0, V);
I.setOperand(1, C);
@@ -304,7 +313,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
Value *C = I.getOperand(1);
// Does "C op A" simplify?
- if (Value *V = SimplifyBinOp(Opcode, C, A, DL)) {
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "V op B".
I.setOperand(0, V);
I.setOperand(1, B);
@@ -324,7 +333,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
Value *C = Op1->getOperand(1);
// Does "C op A" simplify?
- if (Value *V = SimplifyBinOp(Opcode, C, A, DL)) {
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "B op V".
I.setOperand(0, B);
I.setOperand(1, V);
@@ -447,16 +456,11 @@ static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
/// This function returns identity value for given opcode, which can be used to
/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
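/// (Informal note: the identity is 1 for 'mul', 0 for 'add', 'or', and 'xor',
/// and all-ones for 'and'; getBinOpIdentity returns these uniformly.)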
-static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) {
+static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
if (isa<Constant>(V))
return nullptr;
- if (OpCode == Instruction::Mul)
- return ConstantInt::get(V->getType(), 1);
-
- // TODO: We can handle other cases e.g. Instruction::And, Instruction::Or etc.
-
- return nullptr;
+ return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
}
/// This function factors binary ops which can be combined using distributive
@@ -468,8 +472,7 @@ static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) {
static Instruction::BinaryOps
getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
BinaryOperator *Op, Value *&LHS, Value *&RHS) {
- if (!Op)
- return Instruction::BinaryOpsEnd;
+ assert(Op && "Expected a binary operator");
LHS = Op->getOperand(0);
RHS = Op->getOperand(1);
@@ -495,15 +498,10 @@ getBinOpsForFactorization(Instruction::BinaryOps TopLevelOpcode,
/// This tries to simplify binary operations by factorizing out common terms
/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
-static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
- const DataLayout &DL, BinaryOperator &I,
- Instruction::BinaryOps InnerOpcode, Value *A,
- Value *B, Value *C, Value *D) {
-
- // If any of A, B, C, D are null, we can not factor I, return early.
- // Checking A and C should be enough.
- if (!A || !C || !B || !D)
- return nullptr;
+Value *InstCombiner::tryFactorization(BinaryOperator &I,
+ Instruction::BinaryOps InnerOpcode,
+ Value *A, Value *B, Value *C, Value *D) {
+ assert(A && B && C && D && "All values must be provided");
Value *V = nullptr;
Value *SimplifiedInst = nullptr;
@@ -522,13 +520,13 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
std::swap(C, D);
// Consider forming "A op' (B op D)".
// If "B op D" simplifies then it can be formed with no cost.
- V = SimplifyBinOp(TopLevelOpcode, B, D, DL);
+ V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
// If "B op D" doesn't simplify then only go on if both of the existing
// operations "A op' B" and "C op' D" will be zapped as no longer used.
if (!V && LHS->hasOneUse() && RHS->hasOneUse())
- V = Builder->CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
+ V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
if (V) {
- SimplifiedInst = Builder->CreateBinOp(InnerOpcode, A, V);
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
}
}
@@ -541,14 +539,14 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
std::swap(C, D);
// Consider forming "(A op C) op' B".
// If "A op C" simplifies then it can be formed with no cost.
- V = SimplifyBinOp(TopLevelOpcode, A, C, DL);
+ V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
// If "A op C" doesn't simplify then only go on if both of the existing
// operations "A op' B" and "C op' D" will be zapped as no longer used.
if (!V && LHS->hasOneUse() && RHS->hasOneUse())
- V = Builder->CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
+ V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
if (V) {
- SimplifiedInst = Builder->CreateBinOp(InnerOpcode, V, B);
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
}
}
@@ -564,13 +562,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
if (isa<OverflowingBinaryOperator>(&I))
HasNSW = I.hasNoSignedWrap();
- if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS))
- if (isa<OverflowingBinaryOperator>(Op0))
- HasNSW &= Op0->hasNoSignedWrap();
+ if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS))
+ HasNSW &= LOBO->hasNoSignedWrap();
- if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS))
- if (isa<OverflowingBinaryOperator>(Op1))
- HasNSW &= Op1->hasNoSignedWrap();
+ if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS))
+ HasNSW &= ROBO->hasNoSignedWrap();
// We can propagate 'nsw' if we know that
// %Y = mul nsw i16 %X, C
@@ -599,31 +595,39 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
- // Factorization.
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- auto TopLevelOpcode = I.getOpcode();
- auto LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
- auto RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
-
- // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
- // a common term.
- if (LHSOpcode == RHSOpcode) {
- if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, C, D))
- return V;
- }
-
- // The instruction has the form "(A op' B) op (C)". Try to factorize common
- // term.
- if (Value *V = tryFactorization(Builder, DL, I, LHSOpcode, A, B, RHS,
- getIdentityValue(LHSOpcode, RHS)))
- return V;
+ {
+ // Factorization.
+ Value *A, *B, *C, *D;
+ Instruction::BinaryOps LHSOpcode, RHSOpcode;
+ if (Op0)
+ LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
+ if (Op1)
+ RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
+
+ // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
+ // a common term.
+ if (Op0 && Op1 && LHSOpcode == RHSOpcode)
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
+ return V;
+
+ // The instruction has the form "(A op' B) op (C)". Try to factorize common
+ // term.
+ if (Op0)
+ if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
+ if (Value *V =
+ tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
+ return V;
- // The instruction has the form "(B) op (C op' D)". Try to factorize common
- // term.
- if (Value *V = tryFactorization(Builder, DL, I, RHSOpcode, LHS,
- getIdentityValue(RHSOpcode, LHS), C, D))
- return V;
+ // The instruction has the form "(B) op (C op' D)". Try to factorize common
+ // term.
+ if (Op1)
+ if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
+ if (Value *V =
+ tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
+ return V;
+ }
// Expansion.
if (Op0 && RightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
@@ -632,23 +636,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+ Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
+ Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQ.getWithInstruction(&I));
+
// Do "A op C" and "B op C" both simplify?
- if (Value *L = SimplifyBinOp(TopLevelOpcode, A, C, DL))
- if (Value *R = SimplifyBinOp(TopLevelOpcode, B, C, DL)) {
- // They do! Return "L op' R".
- ++NumExpand;
- // If "L op' R" equals "A op' B" then "L op' R" is just the LHS.
- if ((L == A && R == B) ||
- (Instruction::isCommutative(InnerOpcode) && L == B && R == A))
- return Op0;
- // Otherwise return "L op' R" if it simplifies.
- if (Value *V = SimplifyBinOp(InnerOpcode, L, R, DL))
- return V;
- // Otherwise, create a new instruction.
- C = Builder->CreateBinOp(InnerOpcode, L, R);
- C->takeName(&I);
- return C;
- }
+ if (L && R) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ C = Builder.CreateBinOp(InnerOpcode, L, R);
+ C->takeName(&I);
+ return C;
+ }
+
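+ // An illustrative sketch for the two identity checks below, taking
+ // TopLevelOpcode 'or' over InnerOpcode 'and': (A & B) | C expands to
+ // (A | C) & (B | C); if "A | C" simplifies to -1 (the identity for
+ // 'and'), the whole expression reduces to "B | C".
+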
+ // Does "A op C" simplify to the identity value for the inner opcode?
+ if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+ // They do! Return "B op C".
+ ++NumExpand;
+ C = Builder.CreateBinOp(TopLevelOpcode, B, C);
+ C->takeName(&I);
+ return C;
+ }
+
+ // Does "B op C" simplify to the identity value for the inner opcode?
+ if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+ // They do! Return "A op C".
+ ++NumExpand;
+ C = Builder.CreateBinOp(TopLevelOpcode, A, C);
+ C->takeName(&I);
+ return C;
+ }
}
if (Op1 && LeftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
@@ -657,23 +673,35 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
+ Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQ.getWithInstruction(&I));
+ Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
+
// Do "A op B" and "A op C" both simplify?
- if (Value *L = SimplifyBinOp(TopLevelOpcode, A, B, DL))
- if (Value *R = SimplifyBinOp(TopLevelOpcode, A, C, DL)) {
- // They do! Return "L op' R".
- ++NumExpand;
- // If "L op' R" equals "B op' C" then "L op' R" is just the RHS.
- if ((L == B && R == C) ||
- (Instruction::isCommutative(InnerOpcode) && L == C && R == B))
- return Op1;
- // Otherwise return "L op' R" if it simplifies.
- if (Value *V = SimplifyBinOp(InnerOpcode, L, R, DL))
- return V;
- // Otherwise, create a new instruction.
- A = Builder->CreateBinOp(InnerOpcode, L, R);
- A->takeName(&I);
- return A;
- }
+ if (L && R) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ A = Builder.CreateBinOp(InnerOpcode, L, R);
+ A->takeName(&I);
+ return A;
+ }
+
+ // Does "A op B" simplify to the identity value for the inner opcode?
+ if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+ // They do! Return "A op C".
+ ++NumExpand;
+ A = Builder.CreateBinOp(TopLevelOpcode, A, C);
+ A->takeName(&I);
+ return A;
+ }
+
+ // Does "A op C" simplify to the identity value for the inner opcode?
+ if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+ // They do! Return "A op B".
+ ++NumExpand;
+ A = Builder.CreateBinOp(TopLevelOpcode, A, B);
+ A->takeName(&I);
+ return A;
+ }
}
// (op (select (a, c, b)), (select (a, d, b))) -> (select (a, (op c, d), 0))
@@ -682,19 +710,21 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
if (auto *SI1 = dyn_cast<SelectInst>(RHS)) {
if (SI0->getCondition() == SI1->getCondition()) {
Value *SI = nullptr;
- if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(),
- SI1->getFalseValue(), DL, &TLI, &DT, &AC))
- SI = Builder->CreateSelect(SI0->getCondition(),
- Builder->CreateBinOp(TopLevelOpcode,
- SI0->getTrueValue(),
- SI1->getTrueValue()),
- V);
- if (Value *V = SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(),
- SI1->getTrueValue(), DL, &TLI, &DT, &AC))
- SI = Builder->CreateSelect(
+ if (Value *V =
+ SimplifyBinOp(TopLevelOpcode, SI0->getFalseValue(),
+ SI1->getFalseValue(), SQ.getWithInstruction(&I)))
+ SI = Builder.CreateSelect(SI0->getCondition(),
+ Builder.CreateBinOp(TopLevelOpcode,
+ SI0->getTrueValue(),
+ SI1->getTrueValue()),
+ V);
+ if (Value *V =
+ SimplifyBinOp(TopLevelOpcode, SI0->getTrueValue(),
+ SI1->getTrueValue(), SQ.getWithInstruction(&I)))
+ SI = Builder.CreateSelect(
SI0->getCondition(), V,
- Builder->CreateBinOp(TopLevelOpcode, SI0->getFalseValue(),
- SI1->getFalseValue()));
+ Builder.CreateBinOp(TopLevelOpcode, SI0->getFalseValue(),
+ SI1->getFalseValue()));
if (SI) {
SI->takeName(&I);
return SI;
@@ -720,6 +750,21 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const {
if (C->getType()->getElementType()->isIntegerTy())
return ConstantExpr::getNeg(C);
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+ for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+ Constant *Elt = CV->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ if (!isa<ConstantInt>(Elt))
+ return nullptr;
+ }
+ return ConstantExpr::getNeg(CV);
+ }
+
return nullptr;
}
@@ -741,9 +786,9 @@ Value *InstCombiner::dyn_castFNegVal(Value *V, bool IgnoreZeroSign) const {
}
static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
- InstCombiner *IC) {
+ InstCombiner::BuilderTy &Builder) {
if (auto *Cast = dyn_cast<CastInst>(&I))
- return IC->Builder->CreateCast(Cast->getOpcode(), SO, I.getType());
+ return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
assert(I.isBinaryOp() && "Unexpected opcode for select folding");
@@ -762,8 +807,8 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
std::swap(Op0, Op1);
auto *BO = cast<BinaryOperator>(&I);
- Value *RI = IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1,
- SO->getName() + ".op");
+ Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
+ SO->getName() + ".op");
auto *FPInst = dyn_cast<Instruction>(RI);
if (FPInst && isa<FPMathOperator>(FPInst))
FPInst->copyFastMathFlags(BO);
@@ -781,7 +826,7 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
return nullptr;
// Bool selects with constant operands can be folded to logical ops.
- if (SI->getType()->getScalarType()->isIntegerTy(1))
+ if (SI->getType()->isIntOrIntVectorTy(1))
return nullptr;
// If it's a bitcast involving vectors, make sure it has the same number of
@@ -815,13 +860,34 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
}
}
- Value *NewTV = foldOperationIntoSelectOperand(Op, TV, this);
- Value *NewFV = foldOperationIntoSelectOperand(Op, FV, this);
+ Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
+ Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
}
-Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
- PHINode *PN = cast<PHINode>(I.getOperand(0));
+static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
+ InstCombiner::BuilderTy &Builder) {
+ bool ConstIsRHS = isa<Constant>(I->getOperand(1));
+ Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
+
+ if (auto *InC = dyn_cast<Constant>(InV)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I->getOpcode(), InC, C);
+ return ConstantExpr::get(I->getOpcode(), C, InC);
+ }
+
+ Value *Op0 = InV, *Op1 = C;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phitmp");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(I);
+ return RI;
+}
+
+Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
unsigned NumPHIValues = PN->getNumIncomingValues();
if (NumPHIValues == 0)
return nullptr;
@@ -885,7 +951,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
// If we are going to have to insert a new computation, do so right before the
// predecessor's terminator.
if (NonConstBB)
- Builder->SetInsertPoint(NonConstBB->getTerminator());
+ Builder.SetInsertPoint(NonConstBB->getTerminator());
// Next, add all of the operands to the PHI.
if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
@@ -902,11 +968,25 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
// Beware of ConstantExpr: it may eventually evaluate to getNullValue,
// even if currently isNullValue gives false.
Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
- if (InC && !isa<ConstantExpr>(InC))
+ // For vector constants we cannot use isNullValue to choose between
+ // FalseVInPred and TrueVInPred: a vector with a mix of zero and
+ // nonzero elements is not "null", so we would incorrectly fold InC
+ // to TrueVInPred.
+ if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC))
InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
- else
- InV = Builder->CreateSelect(PN->getIncomingValue(i),
- TrueVInPred, FalseVInPred, "phitmp");
+ else {
+ // Generate the select in the same block as PN's current incoming block.
+ // Note: ThisBB need not be the NonConstBB, because vector constants
+ // (which are constants by definition) are also handled here.
+ // FIXME: This can increase the amount of IR generated: we may emit a
+ // select for each vector-constant phi operand, since those cannot be
+ // folded to TrueVInPred or FalseVInPred the way a ConstantInt can. For
+ // non-vector phis, this transformation was always profitable because
+ // the select was generated exactly once, in the NonConstBB.
+ Builder.SetInsertPoint(ThisBB->getTerminator());
+ InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
+ FalseVInPred, "phitmp");
+ }
NewPN->addIncoming(InV, ThisBB);
}
} else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
@@ -916,22 +996,17 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
else if (isa<ICmpInst>(CI))
- InV = Builder->CreateICmp(CI->getPredicate(), PN->getIncomingValue(i),
- C, "phitmp");
+ InV = Builder.CreateICmp(CI->getPredicate(), PN->getIncomingValue(i),
+ C, "phitmp");
else
- InV = Builder->CreateFCmp(CI->getPredicate(), PN->getIncomingValue(i),
- C, "phitmp");
+ InV = Builder.CreateFCmp(CI->getPredicate(), PN->getIncomingValue(i),
+ C, "phitmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
- } else if (I.getNumOperands() == 2) {
- Constant *C = cast<Constant>(I.getOperand(1));
+ } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV = nullptr;
- if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
- InV = ConstantExpr::get(I.getOpcode(), InC, C);
- else
- InV = Builder->CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
- PN->getIncomingValue(i), C, "phitmp");
+ Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
+ Builder);
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
} else {
@@ -942,8 +1017,8 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
else
- InV = Builder->CreateCast(CI->getOpcode(),
- PN->getIncomingValue(i), I.getType(), "phitmp");
+ InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
+ I.getType(), "phitmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
}
@@ -957,14 +1032,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
return replaceInstUsesWith(I, NewPN);
}
-Instruction *InstCombiner::foldOpWithConstantIntoOperand(Instruction &I) {
+Instruction *InstCombiner::foldOpWithConstantIntoOperand(BinaryOperator &I) {
assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type");
if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
return NewSel;
- } else if (isa<PHINode>(I.getOperand(0))) {
- if (Instruction *NewPhi = FoldOpIntoPhi(I))
+ } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
+ if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
return NewPhi;
}
return nullptr;
@@ -1289,8 +1364,8 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
/// \brief Creates a node of a binary operation with the same attributes as
/// the specified one, but with the given operands.
static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy *B) {
- Value *BO = B->CreateBinOp(Inst.getOpcode(), LHS, RHS);
+ InstCombiner::BuilderTy &B) {
+ Value *BO = B.CreateBinOp(Inst.getOpcode(), LHS, RHS);
// If LHS and RHS are constant, BO won't be a binary operator.
if (BinaryOperator *NewBO = dyn_cast<BinaryOperator>(BO))
NewBO->copyIRFlags(&Inst);
@@ -1315,22 +1390,19 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth);
assert(cast<VectorType>(RHS->getType())->getNumElements() == VWidth);
- // If both arguments of binary operation are shuffles, which use the same
- // mask and shuffle within a single vector, it is worthwhile to move the
- // shuffle after binary operation:
+ // If both arguments of the binary operation are shuffles that use the same
+ // mask and shuffle within a single vector, move the shuffle after the binop:
// Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m)
- if (isa<ShuffleVectorInst>(LHS) && isa<ShuffleVectorInst>(RHS)) {
- ShuffleVectorInst *LShuf = cast<ShuffleVectorInst>(LHS);
- ShuffleVectorInst *RShuf = cast<ShuffleVectorInst>(RHS);
- if (isa<UndefValue>(LShuf->getOperand(1)) &&
- isa<UndefValue>(RShuf->getOperand(1)) &&
- LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType() &&
- LShuf->getMask() == RShuf->getMask()) {
- Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
- RShuf->getOperand(0), Builder);
- return Builder->CreateShuffleVector(NewBO,
- UndefValue::get(NewBO->getType()), LShuf->getMask());
- }
+ auto *LShuf = dyn_cast<ShuffleVectorInst>(LHS);
+ auto *RShuf = dyn_cast<ShuffleVectorInst>(RHS);
+ if (LShuf && RShuf && LShuf->getMask() == RShuf->getMask() &&
+ isa<UndefValue>(LShuf->getOperand(1)) &&
+ isa<UndefValue>(RShuf->getOperand(1)) &&
+ LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType()) {
+ Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
+ RShuf->getOperand(0), Builder);
+ return Builder.CreateShuffleVector(
+ NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
}
// If one argument is a shuffle within one vector, the other is a constant,
@@ -1368,7 +1440,7 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
Value *NewLHS = isa<Constant>(LHS) ? C2 : Shuffle->getOperand(0);
Value *NewRHS = isa<Constant>(LHS) ? Shuffle->getOperand(0) : C2;
Value *NewBO = CreateBinOpAsGiven(Inst, NewLHS, NewRHS, Builder);
- return Builder->CreateShuffleVector(NewBO,
+ return Builder.CreateShuffleVector(NewBO,
UndefValue::get(Inst.getType()), Shuffle->getMask());
}
}
@@ -1379,8 +1451,8 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
- if (Value *V =
- SimplifyGEPInst(GEP.getSourceElementType(), Ops, DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyGEPInst(GEP.getSourceElementType(), Ops,
+ SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
Value *PtrOp = GEP.getOperand(0);
@@ -1416,7 +1488,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// If we are using a wider index than needed for this platform, shrink
// it to what we need. If narrower, sign-extend it to what we need.
// This explicit cast can make subsequent optimizations more obvious.
- *I = Builder->CreateIntCast(*I, NewIndexType, true);
+ *I = Builder.CreateIntCast(*I, NewIndexType, true);
MadeChange = true;
}
}
@@ -1510,10 +1582,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// set that index.
PHINode *NewPN;
{
- IRBuilderBase::InsertPointGuard Guard(*Builder);
- Builder->SetInsertPoint(PN);
- NewPN = Builder->CreatePHI(Op1->getOperand(DI)->getType(),
- PN->getNumOperands());
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(PN);
+ NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
+ PN->getNumOperands());
}
for (auto &I : PN->operands())
@@ -1559,27 +1631,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Replace: gep (gep %P, long B), long A, ...
// With: T = long A+B; gep %P, T, ...
//
- Value *Sum;
Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
Value *GO1 = GEP.getOperand(1);
- if (SO1 == Constant::getNullValue(SO1->getType())) {
- Sum = GO1;
- } else if (GO1 == Constant::getNullValue(GO1->getType())) {
- Sum = SO1;
- } else {
- // If they aren't the same type, then the input hasn't been processed
- // by the loop above yet (which canonicalizes sequential index types to
- // intptr_t). Just avoid transforming this until the input has been
- // normalized.
- if (SO1->getType() != GO1->getType())
- return nullptr;
- // Only do the combine when GO1 and SO1 are both constants. Only in
- // this case, we are sure the cost after the merge is never more than
- // that before the merge.
- if (!isa<Constant>(GO1) || !isa<Constant>(SO1))
- return nullptr;
- Sum = Builder->CreateAdd(SO1, GO1, PtrOp->getName()+".sum");
- }
+
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
+
+ Value *Sum =
+ SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
+ return nullptr;
// Update the GEP in place if possible.
if (Src->getNumOperands() == 2) {
@@ -1638,8 +1705,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// pointer arithmetic.
if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
Operator *Index = cast<Operator>(V);
- Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType());
- Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1));
+ Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
+ Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
}
// Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
@@ -1654,14 +1721,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
- // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
- Value *StrippedPtr = PtrOp->stripPointerCasts();
- PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType());
-
// We do not handle pointer-vector geps here.
- if (!StrippedPtrTy)
+ if (GEP.getType()->isVectorTy())
return nullptr;
+ // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
+ Value *StrippedPtr = PtrOp->stripPointerCasts();
+ PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
+
if (StrippedPtr != PtrOp) {
bool HasZeroPointerIndex = false;
if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
@@ -1692,7 +1759,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// ->
// %0 = GEP i8 addrspace(1)* X, ...
// addrspacecast i8 addrspace(1)* %0 to i8*
- return new AddrSpaceCastInst(Builder->Insert(Res), GEP.getType());
+ return new AddrSpaceCastInst(Builder.Insert(Res), GEP.getType());
}
if (ArrayType *XATy =
@@ -1720,10 +1787,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// addrspacecast i8 addrspace(1)* %0 to i8*
SmallVector<Value*, 8> Idx(GEP.idx_begin(), GEP.idx_end());
Value *NewGEP = GEP.isInBounds()
- ? Builder->CreateInBoundsGEP(
+ ? Builder.CreateInBoundsGEP(
nullptr, StrippedPtr, Idx, GEP.getName())
- : Builder->CreateGEP(nullptr, StrippedPtr, Idx,
- GEP.getName());
+ : Builder.CreateGEP(nullptr, StrippedPtr, Idx,
+ GEP.getName());
return new AddrSpaceCastInst(NewGEP, GEP.getType());
}
}
@@ -1741,9 +1808,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
Value *NewGEP =
GEP.isInBounds()
- ? Builder->CreateInBoundsGEP(nullptr, StrippedPtr, Idx,
- GEP.getName())
- : Builder->CreateGEP(nullptr, StrippedPtr, Idx, GEP.getName());
+ ? Builder.CreateInBoundsGEP(nullptr, StrippedPtr, Idx,
+ GEP.getName())
+ : Builder.CreateGEP(nullptr, StrippedPtr, Idx, GEP.getName());
// V and GEP are both pointer types --> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
@@ -1776,10 +1843,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// GEP may not be "inbounds".
Value *NewGEP =
GEP.isInBounds() && NSW
- ? Builder->CreateInBoundsGEP(nullptr, StrippedPtr, NewIdx,
- GEP.getName())
- : Builder->CreateGEP(nullptr, StrippedPtr, NewIdx,
- GEP.getName());
+ ? Builder.CreateInBoundsGEP(nullptr, StrippedPtr, NewIdx,
+ GEP.getName())
+ : Builder.CreateGEP(nullptr, StrippedPtr, NewIdx,
+ GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
@@ -1818,10 +1885,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
NewIdx};
Value *NewGEP = GEP.isInBounds() && NSW
- ? Builder->CreateInBoundsGEP(
+ ? Builder.CreateInBoundsGEP(
SrcElTy, StrippedPtr, Off, GEP.getName())
- : Builder->CreateGEP(SrcElTy, StrippedPtr, Off,
- GEP.getName());
+ : Builder.CreateGEP(SrcElTy, StrippedPtr, Off,
+ GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
GEP.getType());
@@ -1885,8 +1952,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
Value *NGEP =
GEP.isInBounds()
- ? Builder->CreateInBoundsGEP(nullptr, Operand, NewIndices)
- : Builder->CreateGEP(nullptr, Operand, NewIndices);
+ ? Builder.CreateInBoundsGEP(nullptr, Operand, NewIndices)
+ : Builder.CreateGEP(nullptr, Operand, NewIndices);
if (NGEP->getType() == GEP.getType())
return replaceInstUsesWith(GEP, NGEP);
@@ -1935,9 +2002,9 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
return isAllocLikeFn(V, TLI) && V != AI;
}
-static bool
-isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
- const TargetLibraryInfo *TLI) {
+static bool isAllocSiteRemovable(Instruction *AI,
+ SmallVectorImpl<WeakTrackingVH> &Users,
+ const TargetLibraryInfo *TLI) {
SmallVector<Instruction*, 4> Worklist;
Worklist.push_back(AI);
@@ -1950,6 +2017,7 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
// Give up the moment we see something we can't handle.
return false;
+ case Instruction::AddrSpaceCast:
case Instruction::BitCast:
case Instruction::GetElementPtr:
Users.emplace_back(I);
@@ -2021,7 +2089,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
// If we have a malloc call which is only used in any amount of comparisons
// to null and free calls, delete the calls and replace the comparisons with
// true or false as appropriate.
- SmallVector<WeakVH, 64> Users;
+ SmallVector<WeakTrackingVH, 64> Users;
if (isAllocSiteRemovable(&MI, Users, &TLI)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
// Lower all @llvm.objectsize calls first because they may
@@ -2051,7 +2119,8 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
replaceInstUsesWith(*C,
ConstantInt::get(Type::getInt1Ty(C->getContext()),
C->isFalseWhenEqual()));
- } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+ } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<AddrSpaceCastInst>(I)) {
replaceInstUsesWith(*I, UndefValue::get(I->getType()));
}
eraseInstFromFunction(*I);
@@ -2133,8 +2202,8 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
// free undef -> unreachable.
if (isa<UndefValue>(Op)) {
// Insert a new store to null because we cannot modify the CFG here.
- Builder->CreateStore(ConstantInt::getTrue(FI.getContext()),
- UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
+ Builder.CreateStore(ConstantInt::getTrue(FI.getContext()),
+ UndefValue::get(Type::getInt1PtrTy(FI.getContext())));
return eraseInstFromFunction(FI);
}
@@ -2167,11 +2236,9 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
// There might be assume intrinsics dominating this return that completely
// determine the value. If so, constant fold it.
- unsigned BitWidth = VTy->getPrimitiveSizeInBits();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(ResultOp, KnownZero, KnownOne, 0, &RI);
- if ((KnownZero|KnownOne).isAllOnesValue())
- RI.setOperand(0, Constant::getIntegerValue(VTy, KnownOne));
+ KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
+ if (Known.isConstant())
+ RI.setOperand(0, Constant::getIntegerValue(VTy, Known.getConstant()));
return nullptr;
}
@@ -2198,37 +2265,18 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
return &BI;
}
- // Canonicalize fcmp_one -> fcmp_oeq
- FCmpInst::Predicate FPred; Value *Y;
- if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
- if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
- FPred == FCmpInst::FCMP_OGE) {
- FCmpInst *Cond = cast<FCmpInst>(BI.getCondition());
- Cond->setPredicate(FCmpInst::getInversePredicate(FPred));
-
- // Swap Destinations and condition.
- BI.swapSuccessors();
- Worklist.Add(Cond);
- return &BI;
- }
-
- // Canonicalize icmp_ne -> icmp_eq
- ICmpInst::Predicate IPred;
- if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
- TrueDest, FalseDest)) &&
- BI.getCondition()->hasOneUse())
- if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE ||
- IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
- IPred == ICmpInst::ICMP_SGE) {
- ICmpInst *Cond = cast<ICmpInst>(BI.getCondition());
- Cond->setPredicate(ICmpInst::getInversePredicate(IPred));
- // Swap Destinations and condition.
- BI.swapSuccessors();
- Worklist.Add(Cond);
- return &BI;
- }
+ // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq.
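+ // E.g. (sketch): "br (icmp ne X, Y), T, F" becomes "br (icmp eq X, Y), F, T";
+ // 'ne' is not canonical, so the predicate is inverted and the successors
+ // are swapped.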
+ CmpInst::Predicate Pred;
+ if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), TrueDest,
+ FalseDest)) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap destinations and condition.
+ CmpInst *Cond = cast<CmpInst>(BI.getCondition());
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ BI.swapSuccessors();
+ Worklist.Add(Cond);
+ return &BI;
+ }
return nullptr;
}
@@ -2239,21 +2287,19 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
ConstantInt *AddRHS;
if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
// Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
- for (SwitchInst::CaseIt CaseIter : SI.cases()) {
- Constant *NewCase = ConstantExpr::getSub(CaseIter.getCaseValue(), AddRHS);
+ for (auto Case : SI.cases()) {
+ Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
assert(isa<ConstantInt>(NewCase) &&
"Result of expression should be constant");
- CaseIter.setValue(cast<ConstantInt>(NewCase));
+ Case.setValue(cast<ConstantInt>(NewCase));
}
SI.setCondition(Op0);
return &SI;
}
- unsigned BitWidth = cast<IntegerType>(Cond->getType())->getBitWidth();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(Cond, KnownZero, KnownOne, 0, &SI);
- unsigned LeadingKnownZeros = KnownZero.countLeadingOnes();
- unsigned LeadingKnownOnes = KnownOne.countLeadingOnes();
+ KnownBits Known = computeKnownBits(Cond, 0, &SI);
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
// Compute the number of leading bits we can ignore.
// TODO: A better way to determine this would use ComputeNumSignBits().
@@ -2264,20 +2310,20 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
}
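// For example, if the top 24 bits of an i32 condition (and of every case
// value) are known to be zero, NewWidth becomes 8 and the switch is shrunk
// to operate on an i8 truncation of the condition.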
- unsigned NewWidth = BitWidth - std::max(LeadingKnownZeros, LeadingKnownOnes);
+ unsigned NewWidth =
+ Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
// Shrink the condition operand if the new type is smaller than the old type.
// This may produce a non-standard type for the switch, but that's ok because
// the backend should extend back to a legal type for the target.
- if (NewWidth > 0 && NewWidth < BitWidth) {
+ if (NewWidth > 0 && NewWidth < Known.getBitWidth()) {
IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
- Builder->SetInsertPoint(&SI);
- Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc");
+ Builder.SetInsertPoint(&SI);
+ Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
SI.setCondition(NewCond);
- for (SwitchInst::CaseIt CaseIter : SI.cases()) {
- APInt TruncatedCase = CaseIter.getCaseValue()->getValue().trunc(NewWidth);
- CaseIter.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
+ for (auto Case : SI.cases()) {
+ APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
+ Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
}
return &SI;
}
@@ -2291,8 +2337,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
if (!EV.hasIndices())
return replaceInstUsesWith(EV, Agg);
- if (Value *V =
- SimplifyExtractValueInst(Agg, EV.getIndices(), DL, &TLI, &DT, &AC))
+ if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
+ SQ.getWithInstruction(&EV)))
return replaceInstUsesWith(EV, V);
if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
@@ -2329,8 +2375,8 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
// %E = insertvalue { i32 } %X, i32 42, 0
// by switching the order of the insert and extract (though the
// insertvalue should be left in, since it may have other uses).
- Value *NewEV = Builder->CreateExtractValue(IV->getAggregateOperand(),
- EV.getIndices());
+ Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
+ EV.getIndices());
return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
makeArrayRef(insi, inse));
}
@@ -2405,19 +2451,25 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
// extractvalue has integer indices, getelementptr has Value*s. Convert.
SmallVector<Value*, 4> Indices;
// Prefix an i32 0 since we need the first element.
- Indices.push_back(Builder->getInt32(0));
+ Indices.push_back(Builder.getInt32(0));
for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
I != E; ++I)
- Indices.push_back(Builder->getInt32(*I));
+ Indices.push_back(Builder.getInt32(*I));
// We need to insert these at the location of the old load, not at that of
// the extractvalue.
- Builder->SetInsertPoint(L);
- Value *GEP = Builder->CreateInBoundsGEP(L->getType(),
- L->getPointerOperand(), Indices);
+ Builder.SetInsertPoint(L);
+ Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
+ L->getPointerOperand(), Indices);
+ Instruction *NL = Builder.CreateLoad(GEP);
+ // Whatever aliasing information we had for the original load must also
+ // hold for the smaller load, so propagate the annotations.
+ AAMDNodes Nodes;
+ L->getAAMetadata(Nodes);
+ NL->setAAMetadata(Nodes);
// Returning the load directly will cause the main loop to insert it in
// the wrong spot, so use replaceInstUsesWith().
- return replaceInstUsesWith(EV, Builder->CreateLoad(GEP));
+ return replaceInstUsesWith(EV, NL);
}
// We could simplify extracts from other values. Note that nested extracts may
// already be simplified implicitly by the above: extract (extract (insert) )
@@ -2849,12 +2901,9 @@ bool InstCombiner::run() {
// a value even when the operands are not all constants.
Type *Ty = I->getType();
if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) {
- unsigned BitWidth = Ty->getScalarSizeInBits();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(I, KnownZero, KnownOne, /*Depth*/0, I);
- if ((KnownZero | KnownOne).isAllOnesValue()) {
- Constant *C = ConstantInt::get(Ty, KnownOne);
+ KnownBits Known = computeKnownBits(I, /*Depth*/0, I);
+ if (Known.isConstant()) {
+ Constant *C = ConstantInt::get(Ty, Known.getConstant());
DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C <<
" from: " << *I << '\n');
@@ -2909,8 +2958,8 @@ bool InstCombiner::run() {
}
// Now that we have an instruction, try combining it to simplify it.
- Builder->SetInsertPoint(I);
- Builder->SetCurrentDebugLocation(I->getDebugLoc());
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
#ifndef NDEBUG
std::string OrigI;
@@ -2934,8 +2983,8 @@ bool InstCombiner::run() {
Result->takeName(I);
// Push the new instruction and any users onto the worklist.
- Worklist.Add(Result);
Worklist.AddUsersToWorkList(*Result);
+ Worklist.Add(Result);
// Insert the new instruction into the basic block...
BasicBlock *InstParent = I->getParent();
@@ -2958,8 +3007,8 @@ bool InstCombiner::run() {
if (isInstructionTriviallyDead(I, &TLI)) {
eraseInstFromFunction(*I);
} else {
- Worklist.Add(I);
Worklist.AddUsersToWorkList(*I);
+ Worklist.Add(I);
}
}
MadeIRChange = true;
@@ -3005,6 +3054,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
++NumDeadInst;
DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
Inst->eraseFromParent();
+ MadeIRChange = true;
continue;
}
@@ -3018,16 +3068,16 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
++NumConstProp;
if (isInstructionTriviallyDead(Inst, TLI))
Inst->eraseFromParent();
+ MadeIRChange = true;
continue;
}
// See if we can constant fold its operands.
- for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e;
- ++i) {
- if (!isa<ConstantVector>(i) && !isa<ConstantExpr>(i))
+ for (Use &U : Inst->operands()) {
+ if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
continue;
- auto *C = cast<Constant>(i);
+ auto *C = cast<Constant>(U);
Constant *&FoldRes = FoldedConstants[C];
if (!FoldRes)
FoldRes = ConstantFoldConstant(C, DL, TLI);
@@ -3035,12 +3085,18 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
FoldRes = C;
if (FoldRes != C) {
- *i = FoldRes;
+ DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ << "\n Old = " << *C
+ << "\n New = " << *FoldRes << '\n');
+ U = FoldRes;
MadeIRChange = true;
}
}
- InstrsForInstCombineWorklist.push_back(Inst);
+ // Skip processing debug intrinsics in InstCombine. Processing these call
+ // instructions consumes a non-trivial amount of time and provides no value
+ // for the optimizer.
+ if (!isa<DbgInfoIntrinsic>(Inst))
+ InstrsForInstCombineWorklist.push_back(Inst);
}
// Recursively visit successors. If this is a branch or switch on a
@@ -3055,17 +3111,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
- // See if this is an explicit destination.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i)
- if (i.getCaseValue() == Cond) {
- BasicBlock *ReachableBB = i.getCaseSuccessor();
- Worklist.push_back(ReachableBB);
- continue;
- }
-
- // Otherwise it is the default destination.
- Worklist.push_back(SI->getDefaultDest());
+ Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
continue;
}
}
@@ -3139,7 +3185,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
// Lower dbg.declare intrinsics; otherwise their value may be clobbered
// by the instcombiner.
- bool DbgDeclaresChanged = LowerDbgDeclare(F);
+ bool MadeIRChange = LowerDbgDeclare(F);
// Iterate while there is work to do.
int Iteration = 0;
@@ -3148,17 +3194,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
<< F.getName() << "\n");
- bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
- InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
+ InstCombiner IC(Worklist, Builder, F.optForMinSize(), ExpensiveCombines,
AA, AC, TLI, DT, DL, LI);
- Changed |= IC.run();
+ IC.MaxArraySizeForCombine = MaxArraySize;
- if (!Changed)
+ if (!IC.run())
break;
}
- return DbgDeclaresChanged || Iteration > 1;
+ return MadeIRChange || Iteration > 1;
}
PreservedAnalyses InstCombinePass::run(Function &F,
@@ -3176,9 +3222,10 @@ PreservedAnalyses InstCombinePass::run(Function &F,
return PreservedAnalyses::all();
// Mark all the analyses that instcombine updates as preserved.
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index f5e9e7d..f8d2552 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -22,9 +22,11 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
@@ -43,6 +45,7 @@
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
+#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
@@ -80,6 +83,7 @@ static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
// The shadow memory space is dynamically allocated.
static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
@@ -100,6 +104,10 @@ static const char *const kAsanRegisterImageGlobalsName =
"__asan_register_image_globals";
static const char *const kAsanUnregisterImageGlobalsName =
"__asan_unregister_image_globals";
+static const char *const kAsanRegisterElfGlobalsName =
+ "__asan_register_elf_globals";
+static const char *const kAsanUnregisterElfGlobalsName =
+ "__asan_unregister_elf_globals";
static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
static const char *const kAsanInitName = "__asan_init";
@@ -119,8 +127,11 @@ static const char *const kAsanPoisonStackMemoryName =
"__asan_poison_stack_memory";
static const char *const kAsanUnpoisonStackMemoryName =
"__asan_unpoison_stack_memory";
+
+// The ASan version script has an __asan_* wildcard. The triple underscore
+// prevents a linker (gold) warning about attempting to export a local symbol.
static const char *const kAsanGlobalsRegisteredFlagName =
- "__asan_globals_registered";
+ "___asan_globals_registered";
static const char *const kAsanOptionDetectUseAfterReturn =
"__asan_option_detect_stack_use_after_return";
@@ -184,6 +195,11 @@ static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
cl::desc("Check stack-use-after-return"),
cl::Hidden, cl::init(true));
+static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
+ cl::desc("Create redzones for byval "
+ "arguments (extra copy "
+ "required)"), cl::Hidden,
+ cl::init(true));
static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
cl::desc("Check stack-use-after-scope"),
cl::Hidden, cl::init(false));
@@ -264,11 +280,17 @@ static cl::opt<bool>
cl::Hidden, cl::init(false));
static cl::opt<bool>
- ClUseMachOGlobalsSection("asan-globals-live-support",
- cl::desc("Use linker features to support dead "
- "code stripping of globals "
- "(Mach-O only)"),
- cl::Hidden, cl::init(true));
+ ClUseGlobalsGC("asan-globals-live-support",
+ cl::desc("Use linker features to support dead "
+ "code stripping of globals"),
+ cl::Hidden, cl::init(true));
+
+// This is on by default even though there is a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool>
+ ClWithComdat("asan-with-comdat",
+ cl::desc("Place ASan constructors in comdat sections"),
+ cl::Hidden, cl::init(true));
// Debug flags.
static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
@@ -380,6 +402,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
bool IsAndroid = TargetTriple.isAndroid();
bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
bool IsFreeBSD = TargetTriple.isOSFreeBSD();
+ bool IsPS4CPU = TargetTriple.isPS4CPU();
bool IsLinux = TargetTriple.isOSLinux();
bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
TargetTriple.getArch() == llvm::Triple::ppc64le;
@@ -392,6 +415,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
TargetTriple.getArch() == llvm::Triple::mips64el;
bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64;
bool IsWindows = TargetTriple.isOSWindows();
+ bool IsFuchsia = TargetTriple.isOSFuchsia();
ShadowMapping Mapping;
@@ -412,12 +436,18 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
else
Mapping.Offset = kDefaultShadowOffset32;
} else { // LongSize == 64
- if (IsPPC64)
+ // Fuchsia is always PIE, which means that the beginning of the address
+ // space is always available.
+ if (IsFuchsia)
+ Mapping.Offset = 0;
+ else if (IsPPC64)
Mapping.Offset = kPPC64_ShadowOffset64;
else if (IsSystemZ)
Mapping.Offset = kSystemZ_ShadowOffset64;
else if (IsFreeBSD)
Mapping.Offset = kFreeBSD_ShadowOffset64;
+ else if (IsPS4CPU)
+ Mapping.Offset = kPS4CPU_ShadowOffset64;
else if (IsLinux && IsX86_64) {
if (IsKasan)
Mapping.Offset = kLinuxKasan_ShadowOffset64;
@@ -456,9 +486,9 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
// offset is not necessarily 1/8-th of the address space. On SystemZ,
// we could OR the constant in a single instruction, but it's more
// efficient to load it once and use indexed addressing.
- Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ
- && !(Mapping.Offset & (Mapping.Offset - 1))
- && Mapping.Offset != kDynamicShadowSentinel;
+ Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
+ !(Mapping.Offset & (Mapping.Offset - 1)) &&
+ Mapping.Offset != kDynamicShadowSentinel;
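+
+ // Informal recap: shadow addresses are computed as
+ //   Shadow = (Mem >> Mapping.Scale) + Mapping.Offset
+ // (e.g. classic Linux/x86-64 uses Scale 3 and Offset 0x7fff8000); when
+ // OrShadowOffset is set, the offset is OR'ed in rather than added.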
return Mapping;
}
@@ -567,8 +597,6 @@ struct AddressSanitizer : public FunctionPass {
Type *IntptrTy;
ShadowMapping Mapping;
DominatorTree *DT;
- Function *AsanCtorFunction = nullptr;
- Function *AsanInitFunction = nullptr;
Function *AsanHandleNoReturnFunc;
Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
// This array is indexed by AccessIsWrite, Experiment and log2(AccessSize).
@@ -587,22 +615,36 @@ struct AddressSanitizer : public FunctionPass {
};
class AddressSanitizerModule : public ModulePass {
- public:
+public:
explicit AddressSanitizerModule(bool CompileKernel = false,
- bool Recover = false)
+ bool Recover = false,
+ bool UseGlobalsGC = true)
: ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan),
- Recover(Recover || ClRecover) {}
+ Recover(Recover || ClRecover),
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+ // and both suffer from gold PR19002 for which UseGlobalsGC constructor
+ // argument is designed as a workaround. Therefore, disable both
+ // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
+ // do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
bool runOnModule(Module &M) override;
- static char ID; // Pass identification, replacement for typeid
+ static char ID; // Pass identification, replacement for typeid
StringRef getPassName() const override { return "AddressSanitizerModule"; }
private:
void initializeCallbacks(Module &M);
- bool InstrumentGlobals(IRBuilder<> &IRB, Module &M);
+ bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
ArrayRef<GlobalVariable *> ExtendedGlobals,
ArrayRef<Constant *> MetadataInitializers);
+ void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId);
void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
ArrayRef<GlobalVariable *> ExtendedGlobals,
ArrayRef<Constant *> MetadataInitializers);
@@ -613,7 +655,8 @@ private:
GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer,
StringRef OriginalName);
- void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata);
+ void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata,
+ StringRef InternalSuffix);
IRBuilder<> CreateAsanModuleDtor(Module &M);
bool ShouldInstrumentGlobal(GlobalVariable *G);
@@ -628,6 +671,8 @@ private:
GlobalsMetadata GlobalsMD;
bool CompileKernel;
bool Recover;
+ bool UseGlobalsGC;
+ bool UseCtorComdat;
Type *IntptrTy;
LLVMContext *C;
Triple TargetTriple;
@@ -638,6 +683,11 @@ private:
Function *AsanUnregisterGlobals;
Function *AsanRegisterImageGlobals;
Function *AsanUnregisterImageGlobals;
+ Function *AsanRegisterElfGlobals;
+ Function *AsanUnregisterElfGlobals;
+
+ Function *AsanCtorFunction = nullptr;
+ Function *AsanDtorFunction = nullptr;
};
// Stack poisoning does not play well with exception handling.
@@ -705,6 +755,10 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
bool runOnFunction() {
if (!ClStack) return false;
+
+ if (ClRedzoneByvalArgs && Mapping.Offset != kDynamicShadowSentinel)
+ copyArgsPassedByValToAllocas();
+
// Collect alloca, ret, lifetime instructions etc.
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
@@ -721,6 +775,11 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
return true;
}
+ // Arguments marked with the "byval" attribute are implicitly copied without
+ // using an alloca instruction. To produce redzones for those arguments, we
+ // copy them a second time into memory allocated with an alloca instruction.
+ void copyArgsPassedByValToAllocas();
+
// Finds all Alloca instructions, puts poisoned red zones around all of
// them, and then unpoisons everything before the function returns.
@@ -906,9 +965,10 @@ INITIALIZE_PASS(
"ModulePass",
false, false)
ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel,
- bool Recover) {
+ bool Recover,
+ bool UseGlobalsGC) {
assert(!CompileKernel || Recover);
- return new AddressSanitizerModule(CompileKernel, Recover);
+ return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC);
}
static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -1187,7 +1247,7 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
// dyn_cast as we might get UndefValue
if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
- if (Masked->isNullValue())
+ if (Masked->isZero())
// Mask is constant false, so no instrumentation needed.
continue;
// If we have a true or undef value, fall through to doInstrumentAddress
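
In outline, the constant-mask handling above reduces to the following per-lane loop (a sketch with hypothetical helpers, not the pass's actual code):

    // Lanes whose mask bit is constant false are provably dead and skipped;
    // true or undef lanes (and any lane of a non-constant mask) get a check.
    for (unsigned Idx = 0; Idx < NumLanes; ++Idx) {
      if (MaskIsConstantVector && LaneIsConstantFalse(Idx))
        continue;
      instrumentLaneAddress(Base, Idx);  // hypothetical per-lane check
    }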
@@ -1421,8 +1481,13 @@ void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit,
void AddressSanitizerModule::createInitializerPoisonCalls(
Module &M, GlobalValue *ModuleName) {
GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return;
+
+ ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!CA)
+ return;
- ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
for (Use &OP : CA->operands()) {
if (isa<ConstantAggregateZero>(OP)) continue;
ConstantStruct *CS = cast<ConstantStruct>(OP);
@@ -1530,9 +1595,6 @@ bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
// binary in order to allow the linker to properly dead strip. This is only
// supported on recent versions of ld64.
bool AddressSanitizerModule::ShouldUseMachOGlobalsSection() const {
- if (!ClUseMachOGlobalsSection)
- return false;
-
if (!TargetTriple.isOSBinFormatMachO())
return false;
@@ -1561,38 +1623,48 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
// Declare our poisoning and unpoisoning functions.
AsanPoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+ kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy));
AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
AsanUnpoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanUnpoisonGlobalsName, IRB.getVoidTy(), nullptr));
+ kAsanUnpoisonGlobalsName, IRB.getVoidTy()));
AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
// Declare functions that register/unregister globals.
AsanRegisterGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy));
AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
AsanUnregisterGlobals = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ IntptrTy, IntptrTy));
AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
// Declare the functions that find globals in a shared object and then invoke
// the (un)register function on them.
AsanRegisterImageGlobals =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+ kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
AsanRegisterImageGlobals->setLinkage(Function::ExternalLinkage);
AsanUnregisterImageGlobals =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+ kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
AsanUnregisterImageGlobals->setLinkage(Function::ExternalLinkage);
+
+ AsanRegisterElfGlobals = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanRegisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy));
+ AsanRegisterElfGlobals->setLinkage(Function::ExternalLinkage);
+
+ AsanUnregisterElfGlobals = checkSanitizerInterfaceFunction(
+ M.getOrInsertFunction(kAsanUnregisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy));
+ AsanUnregisterElfGlobals->setLinkage(Function::ExternalLinkage);
}
// Put the metadata and the instrumented global in the same group. This ensures
// that the metadata is discarded if the instrumented global is discarded.
void AddressSanitizerModule::SetComdatForGlobalMetadata(
- GlobalVariable *G, GlobalVariable *Metadata) {
+ GlobalVariable *G, GlobalVariable *Metadata, StringRef InternalSuffix) {
Module &M = *G->getParent();
Comdat *C = G->getComdat();
if (!C) {
@@ -1602,7 +1674,15 @@ void AddressSanitizerModule::SetComdatForGlobalMetadata(
assert(G->hasLocalLinkage());
G->setName(Twine(kAsanGenPrefix) + "_anon_global");
}
- C = M.getOrInsertComdat(G->getName());
+
+ if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
+ std::string Name = G->getName();
+ Name += InternalSuffix;
+ C = M.getOrInsertComdat(Name);
+ } else {
+ C = M.getOrInsertComdat(G->getName());
+ }
+
// Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF.
if (TargetTriple.isOSBinFormatCOFF())
C->setSelectionKind(Comdat::NoDuplicates);
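
A condensed usage sketch of the grouping performed here (G and Metadata as in the surrounding code; the selection-kind tweak applies on COFF only):

    // Keyed by the global's name so every TU defining G picks the same group.
    Comdat *C = M.getOrInsertComdat(G->getName());
    G->setComdat(C);         // instrumented global
    Metadata->setComdat(C);  // its descriptor lives and dies with it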
@@ -1618,21 +1698,21 @@ void AddressSanitizerModule::SetComdatForGlobalMetadata(
GlobalVariable *
AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
StringRef OriginalName) {
- GlobalVariable *Metadata =
- new GlobalVariable(M, Initializer->getType(), false,
- GlobalVariable::InternalLinkage, Initializer,
- Twine("__asan_global_") +
- GlobalValue::getRealLinkageName(OriginalName));
+ auto Linkage = TargetTriple.isOSBinFormatMachO()
+ ? GlobalVariable::InternalLinkage
+ : GlobalVariable::PrivateLinkage;
+ GlobalVariable *Metadata = new GlobalVariable(
+ M, Initializer->getType(), false, Linkage, Initializer,
+ Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
Metadata->setSection(getGlobalMetadataSection());
return Metadata;
}
IRBuilder<> AddressSanitizerModule::CreateAsanModuleDtor(Module &M) {
- Function *AsanDtorFunction =
+ AsanDtorFunction =
Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
- appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
return IRBuilder<>(ReturnInst::Create(*C, AsanDtorBB));
}
@@ -1657,10 +1737,69 @@ void AddressSanitizerModule::InstrumentGlobalsCOFF(
"global metadata will not be padded appropriately");
Metadata->setAlignment(SizeOfGlobalStruct);
- SetComdatForGlobalMetadata(G, Metadata);
+ SetComdatForGlobalMetadata(G, Metadata, "");
}
}
+void AddressSanitizerModule::InstrumentGlobalsELF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+ SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, MetadataInitializers[i], G->getName());
+ MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
+ Metadata->setMetadata(LLVMContext::MD_associated, MD);
+ MetadataGlobals[i] = Metadata;
+
+ SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId);
+ }
+
+ // Update llvm.compiler.used, adding the new metadata globals. This is
+ // needed so that during LTO these variables stay alive.
+ if (!MetadataGlobals.empty())
+ appendToCompilerUsed(M, MetadataGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // Common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
+ // Create start and stop symbols.
+ GlobalVariable *StartELFMetadata = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
+ "__start_" + getGlobalMetadataSection());
+ StartELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
+ GlobalVariable *StopELFMetadata = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
+ "__stop_" + getGlobalMetadataSection());
+ StopELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
+
+ // Create a call to register the globals with the runtime.
+ IRB.CreateCall(AsanRegisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+}
+
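
ELF linkers synthesize __start_<section> and __stop_<section> symbols for every section whose name is a valid C identifier, which is what the two externals above rely on. A hedged runtime-side sketch (struct layout and helper names are assumptions; the real implementation lives in compiler-rt):

    #include <cstdint>

    struct asan_global { uintptr_t beg, size; /* ...more fields... */ };
    extern "C" {
    extern asan_global __start_asan_globals[];  // first descriptor
    extern asan_global __stop_asan_globals[];   // one past the last
    }
    void registerGlobal(const asan_global *G);  // hypothetical

    void registerAllGlobals() {
      for (asan_global *G = __start_asan_globals;
           G != __stop_asan_globals; ++G)
        registerGlobal(G);
    }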
void AddressSanitizerModule::InstrumentGlobalsMachO(
IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
ArrayRef<Constant *> MetadataInitializers) {
@@ -1669,7 +1808,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, use a structure which binds the liveness of
// the global variable to the metadata struct. Keep the list of "Liveness" GV
// created to be added to llvm.compiler.used
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr);
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
@@ -1680,9 +1819,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO(
// On recent Mach-O platforms, we emit the global metadata in a way that
// allows the linker to properly strip dead globals.
- auto LivenessBinder = ConstantStruct::get(
- LivenessTy, Initializer->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr);
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
GlobalVariable *Liveness = new GlobalVariable(
M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
Twine("__asan_binder_") + G->getName());
@@ -1748,7 +1887,10 @@ void AddressSanitizerModule::InstrumentGlobalsWithMetadataArray(
// This function replaces all global variables with new variables that have
// trailing redzones. It also creates a function that poisons
// redzones and inserts this function into llvm.global_ctors.
-bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
+// Sets *CtorComdat to true if the global registration code emitted into the
+// asan constructor is comdat-compatible.
+bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
+ bool *CtorComdat) {
+ *CtorComdat = false;
GlobalsMD.init(M);
SmallVector<GlobalVariable *, 16> GlobalsToChange;
@@ -1758,7 +1900,10 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
}
size_t n = GlobalsToChange.size();
- if (n == 0) return false;
+ if (n == 0) {
+ *CtorComdat = true;
+ return false;
+ }
auto &DL = M.getDataLayout();
@@ -1774,7 +1919,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy =
StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, IntptrTy, nullptr);
+ IntptrTy, IntptrTy, IntptrTy);
SmallVector<GlobalVariable *, 16> NewGlobals(n);
SmallVector<Constant *, 16> Initializers(n);
@@ -1810,10 +1955,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
- StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr);
- Constant *NewInitializer =
- ConstantStruct::get(NewTy, G->getInitializer(),
- Constant::getNullValue(RightRedZoneTy), nullptr);
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
// Create a new global variable with enough space for a redzone.
GlobalValue::LinkageTypes Linkage = G->getLinkage();
@@ -1862,7 +2006,8 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
GlobalValue *InstrumentedGlobal = NewGlobal;
bool CanUsePrivateAliases =
- TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO();
+ TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
+ TargetTriple.isOSBinFormatWasm();
if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) {
// Create local alias for NewGlobal to avoid crash on ODR between
// instrumented and non-instrumented libraries.
@@ -1893,7 +2038,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
ConstantExpr::getPointerCast(Name, IntptrTy),
ConstantExpr::getPointerCast(ModuleName, IntptrTy),
ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr);
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
@@ -1902,9 +2047,16 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
Initializers[i] = Initializer;
}
- if (TargetTriple.isOSBinFormatCOFF()) {
+ std::string ELFUniqueModuleId =
+ (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
+ : "";
+
+ if (!ELFUniqueModuleId.empty()) {
+ InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
+ *CtorComdat = true;
+ } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
- } else if (ShouldUseMachOGlobalsSection()) {
+ } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
} else {
InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
@@ -1926,14 +2078,39 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
initializeCallbacks(M);
- bool Changed = false;
+ if (CompileKernel)
+ return false;
+
+ // Create a module constructor. A destructor is created lazily because not
+ // all platforms and not all modules need it.
+ std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{}, kAsanVersionCheckName);
+ bool CtorComdat = true;
+ bool Changed = false;
// TODO(glider): temporarily disabled globals instrumentation for KASan.
- if (ClGlobals && !CompileKernel) {
- Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
- assert(CtorFunc);
- IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
- Changed |= InstrumentGlobals(IRB, M);
+ if (ClGlobals) {
+ IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
+ Changed |= InstrumentGlobals(IRB, M, &CtorComdat);
+ }
+
+ // Put the constructor and destructor in comdat if both
+ // (1) global instrumentation is not TU-specific, and
+ // (2) the target is ELF.
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
+ appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,
+ AsanCtorFunction);
+ if (AsanDtorFunction) {
+ AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
+ appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority,
+ AsanDtorFunction);
+ }
+ } else {
+ appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
+ if (AsanDtorFunction)
+ appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
}
return Changed;
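
The extra fourth argument passed to appendToGlobalCtors/appendToGlobalDtors above is the "associated data" of the llvm.global_ctors entry: if the comdat holding that function is discarded, the entry is discarded with it. Condensed (a sketch; priority 1 is an assumption matching kAsanCtorAndDtorPriority on typical targets):

    if (PutCtorInComdat) {  // ELF and registration code is comdat-compatible
      Ctor->setComdat(M.getOrInsertComdat(Ctor->getName()));
      appendToGlobalCtors(M, Ctor, /*Priority=*/1, /*Data=*/Ctor);
    } else {
      appendToGlobalCtors(M, Ctor, /*Priority=*/1);  // one ctor per TU
    }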
@@ -1949,49 +2126,60 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
const std::string ExpStr = Exp ? "exp_" : "";
const std::string SuffixStr = CompileKernel ? "N" : "_n";
const std::string EndingStr = Recover ? "_noabort" : "";
- Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr;
- AsanErrorCallbackSized[AccessIsWrite][Exp] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr + EndingStr,
- IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
- AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
- IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
- AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
- IRB.getVoidTy(), IntptrTy, ExpType, nullptr));
- AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
- IRB.getVoidTy(), IntptrTy, ExpType, nullptr));
+
+ SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+ SmallVector<Type *, 2> Args1{1, IntptrTy}; // one element, value IntptrTy
+ if (Exp) {
+ Type *ExpType = Type::getInt32Ty(*C);
+ Args2.push_back(ExpType);
+ Args1.push_back(ExpType);
}
- }
+ AsanErrorCallbackSized[AccessIsWrite][Exp] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr +
+ EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+ AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
+ AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false)));
+
+ AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false)));
+ }
+ }
}
const std::string MemIntrinCallbackPrefix =
CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
AsanMemmove = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
AsanMemcpy = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
MemIntrinCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
AsanMemset = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
MemIntrinCallbackPrefix + "memset", IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy));
AsanHandleNoReturnFunc = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), nullptr));
+ M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy()));
AsanPtrCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy));
AsanPtrSubFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy));
// We insert an empty inline asm after __asan_report* to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
@@ -2001,7 +2189,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
// virtual
bool AddressSanitizer::doInitialization(Module &M) {
// Initialize the private fields. No one has accessed them before.
-
GlobalsMD.init(M);
C = &(M.getContext());
@@ -2009,13 +2196,6 @@ bool AddressSanitizer::doInitialization(Module &M) {
IntptrTy = Type::getIntNTy(*C, LongSize);
TargetTriple = Triple(M.getTargetTriple());
- if (!CompileKernel) {
- std::tie(AsanCtorFunction, AsanInitFunction) =
- createSanitizerCtorAndInitFunctions(
- M, kAsanModuleCtorName, kAsanInitName,
- /*InitArgTypes=*/{}, /*InitArgs=*/{}, kAsanVersionCheckName);
- appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
- }
Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
return true;
}
@@ -2034,6 +2214,8 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
// We cannot just ignore these methods, because they may call other
// instrumented functions.
if (F.getName().find(" load]") != std::string::npos) {
+ Function *AsanInitFunction =
+ declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
IRBuilder<> IRB(&F.front(), F.front().begin());
IRB.CreateCall(AsanInitFunction, {});
return true;
@@ -2081,7 +2263,6 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
}
bool AddressSanitizer::runOnFunction(Function &F) {
- if (&F == AsanCtorFunction) return false;
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
if (F.getName().startswith("__asan_")) return false;
@@ -2175,8 +2356,9 @@ bool AddressSanitizer::runOnFunction(Function &F) {
(ClInstrumentationWithCallsThreshold >= 0 &&
ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
const DataLayout &DL = F.getParent()->getDataLayout();
- ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(),
- /*RoundToAlign=*/true);
+ ObjectSizeOpts ObjSizeOpts;
+ ObjSizeOpts.RoundToAlign = true;
+ ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
// Instrument.
int NumInstrumented = 0;
@@ -2234,18 +2416,18 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
std::string Suffix = itostr(i);
AsanStackMallocFunc[i] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
- IntptrTy, nullptr));
+ IntptrTy));
AsanStackFreeFunc[i] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
- IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ IRB.getVoidTy(), IntptrTy, IntptrTy));
}
if (ASan.UseAfterScope) {
AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ IntptrTy, IntptrTy));
AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, nullptr));
+ IntptrTy, IntptrTy));
}
for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
@@ -2254,14 +2436,14 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
Name << std::setw(2) << std::setfill('0') << std::hex << Val;
AsanSetShadowFunc[Val] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy));
}
AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
AsanAllocasUnpoisonFunc =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+ kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
}
void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
@@ -2363,6 +2545,28 @@ static int StackMallocSizeClass(uint64_t LocalStackSize) {
llvm_unreachable("impossible LocalStackSize");
}
+void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
+ BasicBlock &FirstBB = *F.begin();
+ IRBuilder<> IRB(&FirstBB, FirstBB.getFirstInsertionPt());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (Argument &Arg : F.args()) {
+ if (Arg.hasByValAttr()) {
+ Type *Ty = Arg.getType()->getPointerElementType();
+ unsigned Align = Arg.getParamAlignment();
+ if (Align == 0) Align = DL.getABITypeAlignment(Ty);
+
+ const std::string &Name = Arg.hasName() ? Arg.getName().str() :
+ "Arg" + llvm::to_string(Arg.getArgNo());
+ AllocaInst *AI = IRB.CreateAlloca(Ty, nullptr, Twine(Name) + ".byval");
+ AI->setAlignment(Align);
+ Arg.replaceAllUsesWith(AI);
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+ IRB.CreateMemCpy(AI, &Arg, AllocSize, Align);
+ }
+ }
+}
+
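
In source terms, the kind of bug this enables ASan to catch (a sketch; whether an aggregate is lowered with the byval attribute is ABI-dependent):

    struct Big { int a[10]; };

    int readElement(Big b, int i) {  // 'b' typically becomes a byval copy
      return b.a[i];                 // i == 10 now hits the alloca's redzone
    }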
PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
Value *ValueIfTrue,
Instruction *ThenTerm,
@@ -2566,7 +2770,7 @@ void FunctionStackPoisoner::processStaticAllocas() {
Value *NewAllocaPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
AI->getType());
- replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, /*Deref=*/true);
+ replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, DIExpression::NoDeref);
AI->replaceAllUsesWith(NewAllocaPtr);
}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index d4c8369..a193efe 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetFolder.h"
@@ -25,6 +24,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
using namespace llvm;
#define DEBUG_TYPE "bounds-checking"
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
index 3802f9f..16e2e6b 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
+++ b/contrib/llvm/lib/Transforms/Instrumentation/CFGMST.h
@@ -12,6 +12,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
@@ -24,10 +27,10 @@
#include <utility>
#include <vector>
-namespace llvm {
-
#define DEBUG_TYPE "cfgmst"
+namespace llvm {
+
/// \brief A union-find based Minimum Spanning Tree for CFG
///
/// Implements a Union-find algorithm to compute Minimum Spanning Tree
@@ -220,5 +223,8 @@ public:
}
};
-#undef DEBUG_TYPE // "cfgmst"
} // end namespace llvm
+
+#undef DEBUG_TYPE // "cfgmst"
+
+#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index b34d5b8..ddc975c 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -44,15 +44,14 @@
/// For more information, please refer to the design document:
/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstVisitor.h"
@@ -63,6 +62,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
@@ -254,7 +254,7 @@ class DataFlowSanitizer : public ModulePass {
MDNode *ColdCallWeights;
DFSanABIList ABIList;
DenseMap<Value *, Function *> UnwrappedFnMap;
- AttributeSet ReadOnlyNoneAttrs;
+ AttrBuilder ReadOnlyNoneAttrs;
bool DFSanRuntimeShadowMask;
Value *getShadowAddress(Value *Addr, Instruction *Pos);
@@ -331,6 +331,10 @@ class DFSanVisitor : public InstVisitor<DFSanVisitor> {
DFSanFunction &DFSF;
DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+ const DataLayout &getDataLayout() const {
+ return DFSF.F->getParent()->getDataLayout();
+ }
+
void visitOperandShadowInst(Instruction &I);
void visitBinaryOperator(BinaryOperator &BO);
@@ -384,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
ArgTypes.push_back(ShadowPtrTy);
Type *RetType = T->getReturnType();
if (!RetType->isVoidTy())
- RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr);
+ RetType = StructType::get(RetType, ShadowTy);
return FunctionType::get(RetType, ArgTypes, T->isVarArg());
}
@@ -472,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
GetArgTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ArgTLSTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ArgTLSTy), false)));
}
if (GetRetvalTLSPtr) {
RetvalTLS = nullptr;
GetRetvalTLS = ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
PointerType::getUnqual(
- FunctionType::get(PointerType::getUnqual(ShadowTy),
- (Type *)nullptr)));
+ FunctionType::get(PointerType::getUnqual(ShadowTy), false)));
}
ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
@@ -539,16 +541,13 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
F->getParent());
NewF->copyAttributesFrom(F);
NewF->removeAttributes(
- AttributeSet::ReturnIndex,
- AttributeSet::get(F->getContext(), AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
if (F->isVarArg()) {
- NewF->removeAttributes(
- AttributeSet::FunctionIndex,
- AttributeSet().addAttribute(*Ctx, AttributeSet::FunctionIndex,
- "split-stack"));
+ NewF->removeAttributes(AttributeList::FunctionIndex,
+ AttrBuilder().addAttribute("split-stack"));
CallInst::Create(DFSanVarargWrapperFn,
IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
BB);
@@ -580,8 +579,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
Function::arg_iterator AI = F->arg_begin(); ++AI;
for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
Args.push_back(&*AI);
- CallInst *CI =
- CallInst::Create(&F->getArgumentList().front(), Args, "", BB);
+ CallInst *CI = CallInst::Create(&*F->arg_begin(), Args, "", BB);
ReturnInst *RI;
if (FT->getReturnType()->isVoidTy())
RI = ReturnInst::Create(*Ctx, BB);
@@ -595,7 +593,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
DFSanVisitor(DFSF).visitCallInst(*CI);
if (!FT->getReturnType()->isVoidTy())
new StoreInst(DFSF.getShadow(RI->getReturnValue()),
- &F->getArgumentList().back(), RI);
+ &*std::prev(F->arg_end()), RI);
}
return C;
@@ -622,33 +620,33 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
- F->addAttribute(1, Attribute::ZExt);
- F->addAttribute(2, Attribute::ZExt);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ F->addParamAttr(0, Attribute::ZExt);
+ F->addParamAttr(1, Attribute::ZExt);
}
DFSanCheckedUnionFn = Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy);
if (Function *F = dyn_cast<Function>(DFSanCheckedUnionFn)) {
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
- F->addAttribute(1, Attribute::ZExt);
- F->addAttribute(2, Attribute::ZExt);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ F->addParamAttr(0, Attribute::ZExt);
+ F->addParamAttr(1, Attribute::ZExt);
}
DFSanUnionLoadFn =
Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
- F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
- F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+ F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
}
DFSanUnimplementedFn =
Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
DFSanSetLabelFn =
Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy);
if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) {
- F->addAttribute(1, Attribute::ZExt);
+ F->addParamAttr(0, Attribute::ZExt);
}
DFSanNonzeroLabelFn =
Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
@@ -694,9 +692,8 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
}
}
- AttrBuilder B;
- B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
- ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B);
+ ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
// First, change the ABI of every function in the module. ABI-listed
// functions keep their original ABI and get a wrapper function.
@@ -717,9 +714,8 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
NewF->copyAttributesFrom(&F);
NewF->removeAttributes(
- AttributeSet::ReturnIndex,
- AttributeSet::get(NewF->getContext(), AttributeSet::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
for (Function::arg_iterator FArg = F.arg_begin(),
NewFArg = NewF->arg_begin(),
FArgEnd = F.arg_end();
@@ -758,7 +754,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
&F, std::string("dfsw$") + std::string(F.getName()),
GlobalValue::LinkOnceODRLinkage, NewFT);
if (getInstrumentedABI() == IA_TLS)
- NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs);
+ NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
Value *WrappedFnCst =
ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
@@ -906,7 +902,7 @@ Value *DFSanFunction::getShadow(Value *V) {
break;
}
case DataFlowSanitizer::IA_Args: {
- unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2;
+ unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
Function::arg_iterator i = F->arg_begin();
while (ArgIdx--)
++i;
@@ -983,9 +979,9 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
IRBuilder<> IRB(Pos);
if (AvoidNewBlocks) {
CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2});
- Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
- Call->addAttribute(1, Attribute::ZExt);
- Call->addAttribute(2, Attribute::ZExt);
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
CCS.Block = Pos->getParent();
CCS.Shadow = Call;
@@ -996,9 +992,9 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
IRBuilder<> ThenIRB(BI);
CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2});
- Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
- Call->addAttribute(1, Attribute::ZExt);
- Call->addAttribute(2, Attribute::ZExt);
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
BasicBlock *Tail = BI->getSuccessor(0);
PHINode *Phi = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
@@ -1099,7 +1095,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
CallInst *FallbackCall = FallbackIRB.CreateCall(
DFS.DFSanUnionLoadFn,
{ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
// Compare each of the shadows stored in the loaded 64 bits to each other,
// by computing (WideShadow rotl ShadowWidth) == WideShadow.
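
Why the rotate trick works, as a standalone check (dfsan labels are 16 bits, so four fit in one 64-bit load): if every lane holds the same label, rotating by one lane width maps each lane onto an equal one; any differing lane breaks the equality.

    #include <cstdint>

    bool allShadowLanesEqual(uint64_t WideShadow) {
      const unsigned ShadowWidth = 16;  // bits per dfsan label
      uint64_t Rotated = (WideShadow << ShadowWidth) |
                         (WideShadow >> (64 - ShadowWidth));
      return Rotated == WideShadow;
    }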
@@ -1156,7 +1152,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
IRBuilder<> IRB(Pos);
CallInst *FallbackCall = IRB.CreateCall(
DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
return FallbackCall;
}
@@ -1446,7 +1442,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
// Custom functions returning non-void will write to the return label.
if (!FT->getReturnType()->isVoidTy()) {
- CustomFn->removeAttributes(AttributeSet::FunctionIndex,
+ CustomFn->removeAttributes(AttributeList::FunctionIndex,
DFSF.DFS.ReadOnlyNoneAttrs);
}
}
@@ -1474,6 +1470,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
}
i = CS.arg_begin();
+ const unsigned ShadowArgStart = Args.size();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(DFSF.getShadow(*i));
@@ -1481,7 +1478,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy,
CS.arg_size() - FT->getNumParams());
auto *LabelVAAlloca = new AllocaInst(
- LabelVATy, "labelva", &DFSF.F->getEntryBlock().front());
+ LabelVATy, getDataLayout().getAllocaAddrSpace(),
+ "labelva", &DFSF.F->getEntryBlock().front());
for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {
auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
@@ -1494,8 +1492,9 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
if (!FT->getReturnType()->isVoidTy()) {
if (!DFSF.LabelReturnAlloca) {
DFSF.LabelReturnAlloca =
- new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn",
- &DFSF.F->getEntryBlock().front());
+ new AllocaInst(DFSF.DFS.ShadowTy,
+ getDataLayout().getAllocaAddrSpace(),
+ "labelreturn", &DFSF.F->getEntryBlock().front());
}
Args.push_back(DFSF.LabelReturnAlloca);
}
@@ -1507,6 +1506,15 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
CustomCI->setCallingConv(CI->getCallingConv());
CustomCI->setAttributes(CI->getAttributes());
+ // Update the parameter attributes of the custom call instruction to
+ // zero extend the shadow parameters. This is required for targets
+ // which consider ShadowTy an illegal type.
+ for (unsigned n = 0; n < FT->getNumParams(); n++) {
+ const unsigned ArgNo = ShadowArgStart + n;
+ if (CustomCI->getArgOperand(ArgNo)->getType() == DFSF.DFS.ShadowTy)
+ CustomCI->addParamAttr(ArgNo, Attribute::ZExt);
+ }
+
if (!FT->getReturnType()->isVoidTy()) {
LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca);
DFSF.setShadow(CustomCI, LabelLoad);
@@ -1574,7 +1582,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
AllocaInst *VarArgShadow =
- new AllocaInst(VarArgArrayTy, "", &DFSF.F->getEntryBlock().front());
+ new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
+ "", &DFSF.F->getEntryBlock().front());
Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
for (unsigned n = 0; i != e; ++i, ++n) {
IRB.CreateStore(
@@ -1593,7 +1602,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
}
NewCS.setCallingConv(CS.getCallingConv());
NewCS.setAttributes(CS.getAttributes().removeAttributes(
- *DFSF.DFS.Ctx, AttributeSet::ReturnIndex,
+ *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType())));
if (Next) {
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 05eba6c..6864d29 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -18,7 +18,6 @@
// The rest is handled by the run-time library.
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -32,6 +31,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -267,35 +267,35 @@ void EfficiencySanitizer::initializeCallbacks(Module &M) {
SmallString<32> AlignedLoadName("__esan_aligned_load" + ByteSizeStr);
EsanAlignedLoad[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> AlignedStoreName("__esan_aligned_store" + ByteSizeStr);
EsanAlignedStore[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> UnalignedLoadName("__esan_unaligned_load" + ByteSizeStr);
EsanUnalignedLoad[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> UnalignedStoreName("__esan_unaligned_store" + ByteSizeStr);
EsanUnalignedStore[Idx] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
}
EsanUnalignedLoadN = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("__esan_unaligned_loadN", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
EsanUnalignedStoreN = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("__esan_unaligned_storeN", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemmoveFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemcpyFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemsetFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy, nullptr));
+ IRB.getInt32Ty(), IntptrTy));
}
bool EfficiencySanitizer::shouldIgnoreStructType(StructType *StructTy) {
@@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u64 *ArrayCounter;
// };
auto *StructInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
- Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr);
+ StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
+ Int8PtrPtrTy, Int64PtrTy, Int64PtrTy);
auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
// This structure should be kept consistent with the CacheFragInfo struct
// in the runtime library.
@@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
// u32 NumStructs;
// StructInfo *Structs;
// };
- auto *CacheFragInfoTy =
- StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr);
+ auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy);
std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
unsigned NumStructs = 0;
@@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
getArrayCounterIdx(StructTy));
- Initializers.push_back(
- ConstantStruct::get(
- StructInfoTy,
- ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
- ConstantInt::get(Int32Ty,
- DL.getStructLayout(StructTy)->getSizeInBytes()),
- ConstantInt::get(Int32Ty, StructTy->getNumElements()),
- Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Offset, Int32PtrTy),
- Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
- ConstantExpr::getPointerCast(Size, Int32PtrTy),
- TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) :
- ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- FieldCounterIdx),
- ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
- ArrayCounterIdx),
- nullptr));
+ Initializers.push_back(ConstantStruct::get(
+ StructInfoTy,
+ ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
+ ConstantInt::get(Int32Ty,
+ DL.getStructLayout(StructTy)->getSizeInBytes()),
+ ConstantInt::get(Int32Ty, StructTy->getNumElements()),
+ Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Offset, Int32PtrTy),
+ Size == nullptr ? ConstantPointerNull::get(Int32PtrTy)
+ : ConstantExpr::getPointerCast(Size, Int32PtrTy),
+ TypeName == nullptr
+ ? ConstantPointerNull::get(Int8PtrPtrTy)
+ : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ FieldCounterIdx),
+ ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
+ ArrayCounterIdx)));
}
// Structs.
Constant *StructInfo;
@@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
auto *CacheFragInfoGV = new GlobalVariable(
M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
- ConstantStruct::get(CacheFragInfoTy,
- UnitName,
- ConstantInt::get(Int32Ty, NumStructs),
- StructInfo,
- nullptr));
+ ConstantStruct::get(CacheFragInfoTy, UnitName,
+ ConstantInt::get(Int32Ty, NumStructs), StructInfo));
return CacheFragInfoGV;
}
@@ -533,7 +528,7 @@ void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
IRBuilder<> IRB_Dtor(EsanDtorFunction->getEntryBlock().getTerminator());
Function *EsanExit = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(EsanExitName, IRB_Dtor.getVoidTy(),
- Int8PtrTy, nullptr));
+ Int8PtrTy));
EsanExit->setLinkage(Function::ExternalLinkage);
IRB_Dtor.CreateCall(EsanExit, {ToolInfoArg});
appendToGlobalDtors(M, EsanDtorFunction, EsanCtorAndDtorPriority);
@@ -757,7 +752,7 @@ bool EfficiencySanitizer::instrumentGetElementPtr(Instruction *I, Module &M) {
return false;
}
Type *SourceTy = GepInst->getSourceElementType();
- StructType *StructTy;
+ StructType *StructTy = nullptr;
ConstantInt *Idx;
// Check if GEP calculates address from a struct array.
if (isa<StructType>(SourceTy)) {
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 1ba13bd..4089d81 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -1,4 +1,4 @@
-//===-- IndirectCallPromotion.cpp - Promote indirect calls to direct calls ===//
+//===-- IndirectCallPromotion.cpp - Optimizations based on value profiling ===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,6 +17,8 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/Analysis/IndirectCallSiteVisitor.h"
#include "llvm/IR/BasicBlock.h"
@@ -40,6 +42,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/PGOInstrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -65,13 +68,13 @@ static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
// For debug use only.
static cl::opt<unsigned>
ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of promotions for this compilaiton"));
+ cl::desc("Max number of promotions for this compilation"));
// If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped.
// For debug use only.
static cl::opt<unsigned>
ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Skip Callsite up to this number for this compilaiton"));
+ cl::desc("Skip Callsite up to this number for this compilation"));
// Set if the pass is called in LTO optimization. The difference for LTO mode
// is the pass won't prefix the source module name to the internal linkage
@@ -80,6 +83,12 @@ static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
cl::desc("Run indirect-call promotion in LTO "
"mode"));
+// Set if the pass is called in SamplePGO mode. The difference for SamplePGO
+// mode is that it will add prof metadata to the created direct call.
+static cl::opt<bool>
+ ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in SamplePGO mode"));
+
// If the option is set to true, only call instructions will be considered for
// transformation -- invoke instructions will be ignored.
static cl::opt<bool>
@@ -105,8 +114,8 @@ class PGOIndirectCallPromotionLegacyPass : public ModulePass {
public:
static char ID;
- PGOIndirectCallPromotionLegacyPass(bool InLTO = false)
- : ModulePass(ID), InLTO(InLTO) {
+ PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
+ : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
initializePGOIndirectCallPromotionLegacyPassPass(
*PassRegistry::getPassRegistry());
}
@@ -119,6 +128,10 @@ private:
// If this pass is called in LTO, we need special handling of the PGOFuncName
// for the static variables due to LTO's internalization.
bool InLTO;
+
+ // If this pass is called in SamplePGO, we need to add the prof metadata to
+ // the promoted direct call.
+ bool SamplePGO;
};
} // end anonymous namespace
@@ -128,8 +141,9 @@ INITIALIZE_PASS(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
"direct calls.",
false, false)
-ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO) {
- return new PGOIndirectCallPromotionLegacyPass(InLTO);
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
+ bool SamplePGO) {
+ return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
}
namespace {
@@ -144,17 +158,11 @@ private:
// defines.
InstrProfSymtab *Symtab;
- enum TargetStatus {
- OK, // Should be able to promote.
- NotAvailableInModule, // Cannot find the target in current module.
- ReturnTypeMismatch, // Return type mismatch b/w target and indirect-call.
- NumArgsMismatch, // Number of arguments does not match.
- ArgTypeMismatch // Type mismatch in the arguments (cannot bitcast).
- };
+ bool SamplePGO;
// Test if we can legally promote this direct-call of Target.
- TargetStatus isPromotionLegal(Instruction *Inst, uint64_t Target,
- Function *&F);
+ bool isPromotionLegal(Instruction *Inst, uint64_t Target, Function *&F,
+ const char **Reason = nullptr);
// A struct that records the direct target and it's call count.
struct PromotionCandidate {
@@ -172,91 +180,77 @@ private:
Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
uint64_t TotalCount, uint32_t NumCandidates);
- // Main function that transforms Inst (either a indirect-call instruction, or
- // an invoke instruction , to a conditional call to F. This is like:
- // if (Inst.CalledValue == F)
- // F(...);
- // else
- // Inst(...);
- // end
- // TotalCount is the profile count value that the instruction executes.
- // Count is the profile count value that F is the target function.
- // These two values are being used to update the branch weight.
- void promote(Instruction *Inst, Function *F, uint64_t Count,
- uint64_t TotalCount);
-
// Promote a list of targets for one indirect-call callsite. Return
// the number of promotions.
uint32_t tryToPromote(Instruction *Inst,
const std::vector<PromotionCandidate> &Candidates,
uint64_t &TotalCount);
- static const char *StatusToString(const TargetStatus S) {
- switch (S) {
- case OK:
- return "OK to promote";
- case NotAvailableInModule:
- return "Cannot find the target";
- case ReturnTypeMismatch:
- return "Return type mismatch";
- case NumArgsMismatch:
- return "The number of arguments mismatch";
- case ArgTypeMismatch:
- return "Argument Type mismatch";
- }
- llvm_unreachable("Should not reach here");
- }
-
// Noncopyable
ICallPromotionFunc(const ICallPromotionFunc &other) = delete;
ICallPromotionFunc &operator=(const ICallPromotionFunc &other) = delete;
public:
- ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab)
- : F(Func), M(Modu), Symtab(Symtab) {
- }
+ ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
+ bool SamplePGO)
+ : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO) {}
bool processFunction();
};
} // end anonymous namespace
-ICallPromotionFunc::TargetStatus
-ICallPromotionFunc::isPromotionLegal(Instruction *Inst, uint64_t Target,
- Function *&TargetFunction) {
- Function *DirectCallee = Symtab->getFunction(Target);
- if (DirectCallee == nullptr)
- return NotAvailableInModule;
+bool llvm::isLegalToPromote(Instruction *Inst, Function *F,
+ const char **Reason) {
// Check the return type.
Type *CallRetType = Inst->getType();
if (!CallRetType->isVoidTy()) {
- Type *FuncRetType = DirectCallee->getReturnType();
+ Type *FuncRetType = F->getReturnType();
if (FuncRetType != CallRetType &&
- !CastInst::isBitCastable(FuncRetType, CallRetType))
- return ReturnTypeMismatch;
+ !CastInst::isBitCastable(FuncRetType, CallRetType)) {
+ if (Reason)
+ *Reason = "Return type mismatch";
+ return false;
+ }
}
// Check if the arguments are compatible with the parameters
- FunctionType *DirectCalleeType = DirectCallee->getFunctionType();
+ FunctionType *DirectCalleeType = F->getFunctionType();
unsigned ParamNum = DirectCalleeType->getFunctionNumParams();
CallSite CS(Inst);
unsigned ArgNum = CS.arg_size();
- if (ParamNum != ArgNum && !DirectCalleeType->isVarArg())
- return NumArgsMismatch;
+ if (ParamNum != ArgNum && !DirectCalleeType->isVarArg()) {
+ if (Reason)
+ *Reason = "The number of arguments mismatch";
+ return false;
+ }
for (unsigned I = 0; I < ParamNum; ++I) {
Type *PTy = DirectCalleeType->getFunctionParamType(I);
Type *ATy = CS.getArgument(I)->getType();
if (PTy == ATy)
continue;
- if (!CastInst::castIsValid(Instruction::BitCast, CS.getArgument(I), PTy))
- return ArgTypeMismatch;
+ if (!CastInst::castIsValid(Instruction::BitCast, CS.getArgument(I), PTy)) {
+ if (Reason)
+ *Reason = "Argument type mismatch";
+ return false;
+ }
}
DEBUG(dbgs() << " #" << NumOfPGOICallPromotion << " Promote the icall to "
- << Symtab->getFuncName(Target) << "\n");
- TargetFunction = DirectCallee;
- return OK;
+ << F->getName() << "\n");
+ return true;
+}
+
+bool ICallPromotionFunc::isPromotionLegal(Instruction *Inst, uint64_t Target,
+ Function *&TargetFunction,
+ const char **Reason) {
+ TargetFunction = Symtab->getFunction(Target);
+ if (TargetFunction == nullptr) {
+ *Reason = "Cannot find the target";
+ return false;
+ }
+ return isLegalToPromote(Inst, TargetFunction, Reason);
}
// Indirect-call promotion heuristic. The direct targets are sorted based on
@@ -296,10 +290,9 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
break;
}
Function *TargetFunction = nullptr;
- TargetStatus Status = isPromotionLegal(Inst, Target, TargetFunction);
- if (Status != OK) {
+ const char *Reason = nullptr;
+ if (!isPromotionLegal(Inst, Target, TargetFunction, &Reason)) {
StringRef TargetFuncName = Symtab->getFuncName(Target);
- const char *Reason = StatusToString(Status);
DEBUG(dbgs() << " Not promote: " << Reason << "\n");
emitOptimizationRemarkMissed(
F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
@@ -532,8 +525,14 @@ static void insertCallRetPHI(Instruction *Inst, Instruction *CallResult,
// Ret = phi(Ret1, Ret2);
// It adds type casts for the args that do not match the parameters and the
// return value. Branch weights metadata is also updated.
-void ICallPromotionFunc::promote(Instruction *Inst, Function *DirectCallee,
- uint64_t Count, uint64_t TotalCount) {
+// If \p AttachProfToDirectCall is true, prof metadata is attached to the
+// new direct call to contain \p Count. This is used by the SamplePGO inliner
+// to check callsite hotness.
+// Returns the promoted direct call instruction.
+Instruction *llvm::promoteIndirectCall(Instruction *Inst,
+ Function *DirectCallee, uint64_t Count,
+ uint64_t TotalCount,
+ bool AttachProfToDirectCall) {
assert(DirectCallee != nullptr);
BasicBlock *BB = Inst->getParent();
// Just to suppress the non-debug build warning.
@@ -548,6 +547,14 @@ void ICallPromotionFunc::promote(Instruction *Inst, Function *DirectCallee,
Instruction *NewInst =
createDirectCallInst(Inst, DirectCallee, DirectCallBB, MergeBB);
+ if (AttachProfToDirectCall) {
+ SmallVector<uint32_t, 1> Weights;
+ Weights.push_back(Count);
+ MDBuilder MDB(NewInst->getContext());
+ dyn_cast<Instruction>(NewInst->stripPointerCasts())
+ ->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ }
+
// Move Inst from MergeBB to IndirectCallBB.
Inst->removeFromParent();
IndirectCallBB->getInstList().insert(IndirectCallBB->getFirstInsertionPt(),
@@ -576,9 +583,10 @@ void ICallPromotionFunc::promote(Instruction *Inst, Function *DirectCallee,
DEBUG(dbgs() << *BB << *DirectCallBB << *IndirectCallBB << *MergeBB << "\n");
emitOptimizationRemark(
- F.getContext(), "pgo-icall-prom", F, Inst->getDebugLoc(),
+ BB->getContext(), "pgo-icall-prom", *BB->getParent(), Inst->getDebugLoc(),
Twine("Promote indirect call to ") + DirectCallee->getName() +
" with count " + Twine(Count) + " out of " + Twine(TotalCount));
+ return NewInst;
}
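The rewritten promote path guards a direct call with a pointer-equality test and keeps the original indirect call as the fallback. A minimal standalone C++ sketch of the runtime shape the transformed callsite takes (invented names; not the IR-level rewrite itself):

#include <cstdio>
using FnPtr = int (*)(int);
static int hot_callee(int x) { return x + 1; }  // profiled hot target
static int cold_callee(int x) { return x - 1; }
// Promoted form of "return fp(arg);".
static int call_site(FnPtr fp, int arg) {
  if (fp == &hot_callee)      // guard: compare against the candidate
    return hot_callee(arg);   // direct call, now visible to the inliner
  return fp(arg);             // fallback keeps the original semantics
}
int main() {
  std::printf("%d %d\n", call_site(hot_callee, 1),
              call_site(cold_callee, 1)); // 2 0
}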
// Promote indirect-call to conditional direct-call for one callsite.
@@ -589,7 +597,7 @@ uint32_t ICallPromotionFunc::tryToPromote(
for (auto &C : Candidates) {
uint64_t Count = C.Count;
- promote(Inst, C.TargetFunction, Count, TotalCount);
+ promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount, SamplePGO);
assert(TotalCount >= Count);
TotalCount -= Count;
NumOfPGOICallPromotion++;
@@ -630,18 +638,23 @@ bool ICallPromotionFunc::processFunction() {
}
// A wrapper function that does the actual work.
-static bool promoteIndirectCalls(Module &M, bool InLTO) {
+static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) {
if (DisableICP)
return false;
InstrProfSymtab Symtab;
- Symtab.create(M, InLTO);
+ if (Error E = Symtab.create(M, InLTO)) {
+ std::string SymtabFailure = toString(std::move(E));
+ DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
+ (void)SymtabFailure;
+ return false;
+ }
bool Changed = false;
for (auto &F : M) {
if (F.isDeclaration())
continue;
if (F.hasFnAttribute(Attribute::OptimizeNone))
continue;
- ICallPromotionFunc ICallPromotion(F, &M, &Symtab);
+ ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO);
bool FuncChanged = ICallPromotion.processFunction();
if (ICPDUMPAFTER && FuncChanged) {
DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
@@ -658,11 +671,14 @@ static bool promoteIndirectCalls(Module &M, bool InLTO) {
bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
// The command-line option takes priority for InLTO.
- return promoteIndirectCalls(M, InLTO | ICPLTOMode);
+ return promoteIndirectCalls(M, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode);
}
-PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, ModuleAnalysisManager &AM) {
- if (!promoteIndirectCalls(M, InLTO | ICPLTOMode))
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!promoteIndirectCalls(M, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index adea7e7..db8fa89 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -14,18 +14,63 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/InstrProfiling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
using namespace llvm;
#define DEBUG_TYPE "instrprof"
+// The start and end values of the precise value profile range for memory
+// intrinsic sizes.
+cl::opt<std::string> MemOPSizeRange(
+ "memop-size-range",
+ cl::desc("Set the range of size in memory intrinsic calls to be profiled "
+ "precisely, in a format of <start_val>:<end_val>"),
+ cl::init(""));
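The pass later splits this string into its two bounds via getMemOPSizeRangeFromOption. A minimal sketch of parsing the "<start_val>:<end_val>" format, assuming well-formed input (parseSizeRange is a hypothetical stand-in, not the real helper):

#include <cstdint>
#include <cstdio>
#include <string>
static void parseSizeRange(const std::string &Opt, int64_t &Start,
                           int64_t &End) {
  const auto Colon = Opt.find(':');         // assumes "start:end" shape
  Start = std::stoll(Opt.substr(0, Colon));
  End = std::stoll(Opt.substr(Colon + 1));
}
int main() {
  int64_t S = 0, E = 0;
  parseSizeRange("0:8", S, E);
  std::printf("%lld:%lld\n", static_cast<long long>(S),
              static_cast<long long>(E)); // 0:8
}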
+
+// The value that is considered to be a large value in a memory intrinsic.
+cl::opt<unsigned> MemOPSizeLarge(
+ "memop-size-large",
+ cl::desc("Set large value thresthold in memory intrinsic size profiling. "
+ "Value of 0 disables the large value profiling."),
+ cl::init(8192));
+
namespace {
cl::opt<bool> DoNameCompression("enable-name-compression",
@@ -41,6 +86,7 @@ cl::opt<bool> ValueProfileStaticAlloc(
"vp-static-alloc",
cl::desc("Do static counter allocation for value profiler"),
cl::init(true));
+
cl::opt<double> NumCountersPerValueSite(
"vp-counters-per-site",
cl::desc("The average number of profile counters allocated "
@@ -51,14 +97,56 @@ cl::opt<double> NumCountersPerValueSite(
// is usually smaller than 2.
cl::init(1.0));
+cl::opt<bool> AtomicCounterUpdatePromoted(
+ "atomic-counter-update-promoted", cl::ZeroOrMore,
+ cl::desc("Do counter update using atomic fetch add "
+ " for promoted counters only"),
+ cl::init(false));
+
+// If the option is not specified, the default behavior about whether
+// counter promotion is done depends on how the instrumentation lowering
+// pipeline is set up, i.e., the default value of true for this option
+// does not mean the promotion will be done by default. Explicitly
+// setting this option can override the default behavior.
+cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
+ cl::desc("Do counter register promotion"),
+ cl::init(false));
+cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
+ cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
+ cl::desc("Max number counter promotions per loop to avoid"
+ " increasing register pressure too much"));
+
+// A debug option
+cl::opt<int>
+ MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
+ cl::desc("Max number of allowed counter promotions"));
+
+cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
+ cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
+ cl::desc("The max number of exiting blocks of a loop to allow "
+ " speculative counter promotion"));
+
+cl::opt<bool> SpeculativeCounterPromotionToLoop(
+ cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
+ cl::desc("When the option is false, if the target block is in a loop, "
+ "the promotion will be disallowed unless the promoted counter "
+ " update can be further/iteratively promoted into an acyclic "
+ " region."));
+
+cl::opt<bool> IterativeCounterPromotion(
+ cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
+ cl::desc("Allow counter promotion across the whole loop nest."));
+
class InstrProfilingLegacyPass : public ModulePass {
InstrProfiling InstrProf;
public:
static char ID;
- InstrProfilingLegacyPass() : ModulePass(ID), InstrProf() {}
+
+ InstrProfilingLegacyPass() : ModulePass(ID) {}
InstrProfilingLegacyPass(const InstrProfOptions &Options)
: ModulePass(ID), InstrProf(Options) {}
+
StringRef getPassName() const override {
return "Frontend instrumentation-based coverage lowering";
}
@@ -73,7 +161,184 @@ public:
}
};
-} // anonymous namespace
+///
+/// A helper class to promote one counter RMW operation in the loop
+/// into a register update.
+///
+/// The RMW update for the counter will be sunk out of the loop after
+/// the transformation.
+///
+class PGOCounterPromoterHelper : public LoadAndStorePromoter {
+public:
+ PGOCounterPromoterHelper(
+ Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
+ BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<Instruction *> InsertPts,
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ LoopInfo &LI)
+ : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
+ InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
+ assert(isa<LoadInst>(L));
+ assert(isa<StoreInst>(S));
+ SSA.AddAvailableValue(PH, Init);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() const override {
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+ Instruction *InsertPos = InsertPts[i];
+ // Get LiveIn value into the ExitBlock. If there are multiple
+ // predecessors, the value is defined by a PHI node in this
+ // block.
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
+ IRBuilder<> Builder(InsertPos);
+ if (AtomicCounterUpdatePromoted)
+ // Atomic updates currently can only be promoted across the current
+ // loop, not the whole loop nest.
+ Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
+ AtomicOrdering::SequentiallyConsistent);
+ else {
+ LoadInst *OldVal = Builder.CreateLoad(Addr, "pgocount.promoted");
+ auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
+ auto *NewStore = Builder.CreateStore(NewVal, Addr);
+
+ // Now update the parent loop's candidate list:
+ if (IterativeCounterPromotion) {
+ auto *TargetLoop = LI.getLoopFor(ExitBlock);
+ if (TargetLoop)
+ LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
+ }
+ }
+ }
+ }
+
+private:
+ Instruction *Store;
+ ArrayRef<BasicBlock *> ExitBlocks;
+ ArrayRef<Instruction *> InsertPts;
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ LoopInfo &LI;
+};
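Behaviorally, the helper turns a per-iteration load/add/store on the counter slot into a register accumulation that is flushed once in each exit block. A standalone C++ sketch of the before/after semantics (not the SSAUpdater-based rewrite itself):

#include <cstdio>
static long long Counter = 0;      // stands in for one profile counter slot
static void unpromoted(int N) {
  for (int I = 0; I < N; ++I)
    Counter += 1;                  // RMW on memory every iteration
}
static void promoted(int N) {
  long long Live = 0;              // counter lives in a register in the loop
  for (int I = 0; I < N; ++I)
    Live += 1;
  Counter += Live;                 // single update sunk into the exit block
}
int main() {
  unpromoted(3);
  promoted(3);
  std::printf("%lld\n", Counter);  // 6 either way
}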
+
+/// A helper class to do register promotion for all profile counter
+/// updates in a loop.
+///
+class PGOCounterPromoter {
+public:
+ PGOCounterPromoter(
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ Loop &CurLoop, LoopInfo &LI)
+ : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
+ LI(LI) {
+
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ SmallPtrSet<BasicBlock *, 8> BlockSet;
+ L.getExitBlocks(LoopExitBlocks);
+
+ for (BasicBlock *ExitBlock : LoopExitBlocks) {
+ if (BlockSet.insert(ExitBlock).second) {
+ ExitBlocks.push_back(ExitBlock);
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ }
+ }
+ }
+
+ bool run(int64_t *NumPromoted) {
+ unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
+ if (MaxProm == 0)
+ return false;
+
+ unsigned Promoted = 0;
+ for (auto &Cand : LoopToCandidates[&L]) {
+
+ SmallVector<PHINode *, 4> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
+
+ PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
+ L.getLoopPreheader(), ExitBlocks,
+ InsertPts, LoopToCandidates, LI);
+ Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
+ Promoted++;
+ if (Promoted >= MaxProm)
+ break;
+
+ (*NumPromoted)++;
+ if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
+ break;
+ }
+
+ DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
+ << L.getLoopDepth() << ")\n");
+ return Promoted != 0;
+ }
+
+private:
+ bool allowSpeculativeCounterPromotion(Loop *LP) {
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ L.getExitingBlocks(ExitingBlocks);
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return true;
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return false;
+ return true;
+ }
+
+ // Returns the max number of Counter Promotions for LP.
+ unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
+ // We can't insert into a catchswitch.
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ LP->getExitBlocks(LoopExitBlocks);
+ if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ }))
+ return 0;
+
+ if (!LP->hasDedicatedExits())
+ return 0;
+
+ BasicBlock *PH = LP->getLoopPreheader();
+ if (!PH)
+ return 0;
+
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ LP->getExitingBlocks(ExitingBlocks);
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return MaxNumOfPromotionsPerLoop;
+
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return 0;
+
+ // Whether the target block is in a loop does not matter:
+ if (SpeculativeCounterPromotionToLoop)
+ return MaxNumOfPromotionsPerLoop;
+
+ // Now check the target block:
+ unsigned MaxProm = MaxNumOfPromotionsPerLoop;
+ for (auto *TargetBlock : LoopExitBlocks) {
+ auto *TargetLoop = LI.getLoopFor(TargetBlock);
+ if (!TargetLoop)
+ continue;
+ unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
+ unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
+ MaxProm =
+ std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
+ PendingCandsInTarget);
+ }
+ return MaxProm;
+ }
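The min/max expression above can be read as "how much of the target loop's own budget is still free once its pending candidates are accounted for", with std::max guarding against unsigned underflow. A standalone check with made-up numbers:

#include <algorithm>
#include <cstdio>
int main() {
  unsigned MaxProm = 20;              // MaxNumOfPromotionsPerLoop
  unsigned MaxPromForTarget = 20;     // recursive budget of the exit's loop
  unsigned PendingCandsInTarget = 15; // candidates already queued there
  MaxProm = std::min(MaxProm,
                     std::max(MaxPromForTarget, PendingCandsInTarget) -
                         PendingCandsInTarget);
  std::printf("%u\n", MaxProm);       // 5 promotions may still target it
}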
+
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ SmallVector<Instruction *, 8> InsertPts;
+ Loop &L;
+ LoopInfo &LI;
+};
+
+} // end anonymous namespace
PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
@@ -97,35 +362,70 @@ llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options) {
return new InstrProfilingLegacyPass(Options);
}
-bool InstrProfiling::isMachO() const {
- return Triple(M->getTargetTriple()).isOSBinFormatMachO();
+static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
+ InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
+ if (Inc)
+ return Inc;
+ return dyn_cast<InstrProfIncrementInst>(Instr);
}
-/// Get the section name for the counter variables.
-StringRef InstrProfiling::getCountersSection() const {
- return getInstrProfCountersSectionName(isMachO());
-}
+bool InstrProfiling::lowerIntrinsics(Function *F) {
+ bool MadeChange = false;
+ PromotionCandidates.clear();
+ for (BasicBlock &BB : *F) {
+ for (auto I = BB.begin(), E = BB.end(); I != E;) {
+ auto Instr = I++;
+ InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
+ if (Inc) {
+ lowerIncrement(Inc);
+ MadeChange = true;
+ } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
+ lowerValueProfileInst(Ind);
+ MadeChange = true;
+ }
+ }
+ }
-/// Get the section name for the name variables.
-StringRef InstrProfiling::getNameSection() const {
- return getInstrProfNameSectionName(isMachO());
-}
+ if (!MadeChange)
+ return false;
-/// Get the section name for the profile data variables.
-StringRef InstrProfiling::getDataSection() const {
- return getInstrProfDataSectionName(isMachO());
+ promoteCounterLoadStores(F);
+ return true;
}
-/// Get the section name for the coverage mapping data.
-StringRef InstrProfiling::getCoverageSection() const {
- return getInstrProfCoverageSectionName(isMachO());
+bool InstrProfiling::isCounterPromotionEnabled() const {
+ if (DoCounterPromotion.getNumOccurrences() > 0)
+ return DoCounterPromotion;
+
+ return Options.DoCounterPromotion;
}
-static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
- InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
- if (Inc)
- return Inc;
- return dyn_cast<InstrProfIncrementInst>(Instr);
+void InstrProfiling::promoteCounterLoadStores(Function *F) {
+ if (!isCounterPromotionEnabled())
+ return;
+
+ DominatorTree DT(*F);
+ LoopInfo LI(DT);
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates;
+
+ for (const auto &LoadStore : PromotionCandidates) {
+ auto *CounterLoad = LoadStore.first;
+ auto *CounterStore = LoadStore.second;
+ BasicBlock *BB = CounterLoad->getParent();
+ Loop *ParentLoop = LI.getLoopFor(BB);
+ if (!ParentLoop)
+ continue;
+ LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore);
+ }
+
+ SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder();
+
+ // Do a post-order traversal of the loops so that counter updates can be
+ // iteratively hoisted outside the loop nest.
+ for (auto *Loop : llvm::reverse(Loops)) {
+ PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI);
+ Promoter.run(&TotalCountersPromoted);
+ }
}
bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
@@ -137,6 +437,9 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
NamesSize = 0;
ProfileDataMap.clear();
UsedVars.clear();
+ getMemOPSizeRangeFromOption(MemOPSizeRange, MemOPSizeRangeStart,
+ MemOPSizeRangeLast);
+ TT = Triple(M.getTargetTriple());
// We did not know how many value sites there would be inside
// the instrumented function. This is counting the number of instrumented
@@ -157,18 +460,7 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
}
for (Function &F : M)
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(), E = BB.end(); I != E;) {
- auto Instr = I++;
- InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
- if (Inc) {
- lowerIncrement(Inc);
- MadeChange = true;
- } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
- lowerValueProfileInst(Ind);
- MadeChange = true;
- }
- }
+ MadeChange |= lowerIntrinsics(&F);
if (GlobalVariable *CoverageNamesVar =
M.getNamedGlobal(getCoverageUnusedNamesVarName())) {
@@ -189,26 +481,42 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
}
static Constant *getOrInsertValueProfilingCall(Module &M,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ bool IsRange = false) {
LLVMContext &Ctx = M.getContext();
auto *ReturnTy = Type::getVoidTy(M.getContext());
- Type *ParamTypes[] = {
+
+ Constant *Res;
+ if (!IsRange) {
+ Type *ParamTypes[] = {
#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
#include "llvm/ProfileData/InstrProfData.inc"
- };
- auto *ValueProfilingCallTy =
- FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
- Constant *Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
- ValueProfilingCallTy);
+ };
+ auto *ValueProfilingCallTy =
+ FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
+ Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
+ ValueProfilingCallTy);
+ } else {
+ Type *RangeParamTypes[] = {
+#define VALUE_RANGE_PROF 1
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
+#undef VALUE_RANGE_PROF
+ };
+ auto *ValueRangeProfilingCallTy =
+ FunctionType::get(ReturnTy, makeArrayRef(RangeParamTypes), false);
+ Res = M.getOrInsertFunction(getInstrProfValueRangeProfFuncName(),
+ ValueRangeProfilingCallTy);
+ }
+
if (Function *FunRes = dyn_cast<Function>(Res)) {
if (auto AK = TLI.getExtAttrForI32Param(false))
- FunRes->addAttribute(3, AK);
+ FunRes->addParamAttr(2, AK);
}
return Res;
}
void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
-
GlobalVariable *Name = Ind->getName();
uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
uint64_t Index = Ind->getIndex()->getZExtValue();
@@ -222,7 +530,6 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
}
void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
-
GlobalVariable *Name = Ind->getName();
auto It = ProfileDataMap.find(Name);
assert(It != ProfileDataMap.end() && It->second.DataVar &&
@@ -235,13 +542,27 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
Index += It->second.NumValueSites[Kind];
IRBuilder<> Builder(Ind);
- Value *Args[3] = {Ind->getTargetValue(),
- Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
- Builder.getInt32(Index)};
- CallInst *Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI),
- Args);
+ bool IsRange = (Ind->getValueKind()->getZExtValue() ==
+ llvm::InstrProfValueKind::IPVK_MemOPSize);
+ CallInst *Call = nullptr;
+ if (!IsRange) {
+ Value *Args[3] = {Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index)};
+ Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args);
+ } else {
+ Value *Args[6] = {
+ Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index),
+ Builder.getInt64(MemOPSizeRangeStart),
+ Builder.getInt64(MemOPSizeRangeLast),
+ Builder.getInt64(MemOPSizeLarge == 0 ? INT64_MIN : MemOPSizeLarge)};
+ Call =
+ Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true), Args);
+ }
if (auto AK = TLI->getExtAttrForI32Param(false))
- Call->addAttribute(3, AK);
+ Call->addParamAttr(2, AK);
Ind->replaceAllUsesWith(Call);
Ind->eraseFromParent();
}
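One plausible reading of the three extra operands: sizes inside [start, last] each get a precise bucket, sizes at or above the large-value threshold share a single bucket, and everything else is lumped together (the real accounting lives behind getInstrProfValueRangeProfFuncName() in the profile runtime). A standalone sketch of that bucketing:

#include <cstdint>
#include <cstdio>
static int64_t bucket(int64_t V, int64_t Start, int64_t Last, int64_t Large) {
  if (V >= Start && V <= Last)
    return V;                       // precise bucket per exact size
  if (Large > 0 && V >= Large)
    return Large;                   // shared "large value" bucket
  return Last + 1;                  // catch-all for mid-range sizes
}
int main() {
  std::printf("%lld %lld %lld\n",
              (long long)bucket(4, 0, 8, 8192),        // 4: precise
              (long long)bucket(100, 0, 8, 8192),      // 9: catch-all
              (long long)bucket(1 << 20, 0, 8, 8192)); // 8192: large
}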
@@ -252,14 +573,16 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
IRBuilder<> Builder(Inc);
uint64_t Index = Inc->getIndex()->getZExtValue();
Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters, 0, Index);
- Value *Count = Builder.CreateLoad(Addr, "pgocount");
- Count = Builder.CreateAdd(Count, Inc->getStep());
- Inc->replaceAllUsesWith(Builder.CreateStore(Count, Addr));
+ Value *Load = Builder.CreateLoad(Addr, "pgocount");
+ auto *Count = Builder.CreateAdd(Load, Inc->getStep());
+ auto *Store = Builder.CreateStore(Count, Addr);
+ Inc->replaceAllUsesWith(Store);
+ if (isCounterPromotionEnabled())
+ PromotionCandidates.emplace_back(cast<Instruction>(Load), Store);
Inc->eraseFromParent();
}
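The lowered increment is just a load, an add of the step, and a store back to the counter slot; with promotion enabled the (load, store) pair is also remembered so promoteCounterLoadStores can rewrite it later. A standalone sketch of the equivalent scalar code:

#include <cstdint>
#include <cstdio>
static uint64_t Counters[4];           // stands in for a __profc_ array
static void lowerIncrementSketch(unsigned Index, uint64_t Step) {
  uint64_t Load = Counters[Index];     // Builder.CreateLoad
  uint64_t Count = Load + Step;        // Builder.CreateAdd
  Counters[Index] = Count;             // Builder.CreateStore
}
int main() {
  lowerIncrementSketch(0, 1);
  lowerIncrementSketch(0, 1);
  std::printf("%llu\n", (unsigned long long)Counters[0]); // 2
}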
void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
-
ConstantArray *Names =
cast<ConstantArray>(CoverageNamesVar->getInitializer());
for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
@@ -270,7 +593,9 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
Name->setLinkage(GlobalValue::PrivateLinkage);
ReferencedNames.push_back(Name);
+ NC->dropAllReferences();
}
+ CoverageNamesVar->eraseFromParent();
}
/// Get the name of a profiling variable for a particular function.
@@ -291,14 +616,24 @@ static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
static inline bool shouldRecordFunctionAddr(Function *F) {
// Check the linkage
+ bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
- !F->hasAvailableExternallyLinkage())
+ !HasAvailableExternallyLinkage)
return true;
+
+ // A function marked 'alwaysinline' with available_externally linkage can't
+ // have its address taken. Doing so would create an undefined external ref to
+ // the function, which would fail to link.
+ if (HasAvailableExternallyLinkage &&
+ F->hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
// Prohibit function address recording if the function is both internal and
// COMDAT. This avoids the profile data variable referencing internal symbols
// in COMDAT.
if (F->hasLocalLinkage() && F->hasComdat())
return false;
+
// Check uses of this function for other than direct calls or invokes to it.
// Inline virtual functions have linkOnceODR linkage. When a key method
// exists, the vtable will only be emitted in the TU where the key method
@@ -367,7 +702,8 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
Constant::getNullValue(CounterTy),
getVarName(Inc, getInstrProfCountersVarPrefix()));
CounterPtr->setVisibility(NamePtr->getVisibility());
- CounterPtr->setSection(getCountersSection());
+ CounterPtr->setSection(
+ getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat()));
CounterPtr->setAlignment(8);
CounterPtr->setComdat(ProfileVarsComdat);
@@ -376,7 +712,6 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
// the current function.
Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(*M)) {
-
uint64_t NS = 0;
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
NS += PD.NumValueSites[Kind];
@@ -388,11 +723,12 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
Constant::getNullValue(ValuesTy),
getVarName(Inc, getInstrProfValuesVarPrefix()));
ValuesVar->setVisibility(NamePtr->getVisibility());
- ValuesVar->setSection(getInstrProfValuesSectionName(isMachO()));
+ ValuesVar->setSection(
+ getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
ValuesVar->setAlignment(8);
ValuesVar->setComdat(ProfileVarsComdat);
ValuesPtrExpr =
- ConstantExpr::getBitCast(ValuesVar, llvm::Type::getInt8PtrTy(Ctx));
+ ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
}
}
@@ -421,7 +757,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
ConstantStruct::get(DataTy, DataVals),
getVarName(Inc, getInstrProfDataVarPrefix()));
Data->setVisibility(NamePtr->getVisibility());
- Data->setSection(getDataSection());
+ Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT);
Data->setComdat(ProfileVarsComdat);
@@ -481,9 +817,10 @@ void InstrProfiling::emitVNodes() {
ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
auto *VNodesVar = new GlobalVariable(
- *M, VNodesTy, false, llvm::GlobalValue::PrivateLinkage,
+ *M, VNodesTy, false, GlobalValue::PrivateLinkage,
Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
- VNodesVar->setSection(getInstrProfVNodesSectionName(isMachO()));
+ VNodesVar->setSection(
+ getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat()));
UsedVars.push_back(VNodesVar);
}
@@ -496,18 +833,22 @@ void InstrProfiling::emitNameData() {
std::string CompressedNameStr;
if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
DoNameCompression)) {
- llvm::report_fatal_error(toString(std::move(E)), false);
+ report_fatal_error(toString(std::move(E)), false);
}
auto &Ctx = M->getContext();
- auto *NamesVal = llvm::ConstantDataArray::getString(
+ auto *NamesVal = ConstantDataArray::getString(
Ctx, StringRef(CompressedNameStr), false);
- NamesVar = new llvm::GlobalVariable(*M, NamesVal->getType(), true,
- llvm::GlobalValue::PrivateLinkage,
- NamesVal, getInstrProfNamesVarName());
+ NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
+ GlobalValue::PrivateLinkage, NamesVal,
+ getInstrProfNamesVarName());
NamesSize = CompressedNameStr.size();
- NamesVar->setSection(getNameSection());
+ NamesVar->setSection(
+ getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
UsedVars.push_back(NamesVar);
+
+ for (auto *NamePtr : ReferencedNames)
+ NamePtr->eraseFromParent();
}
void InstrProfiling::emitRegistration() {
@@ -550,7 +891,6 @@ void InstrProfiling::emitRegistration() {
}
void InstrProfiling::emitRuntimeHook() {
-
// We expect the linker to be invoked with -u<hook_var> flag for linux,
// for which case there is no need to emit the user function.
if (Triple(M->getTargetTriple()).isOSLinux())
@@ -600,7 +940,6 @@ void InstrProfiling::emitInitialization() {
GlobalVariable *ProfileNameVar = new GlobalVariable(
*M, ProfileNameConst->getType(), true, GlobalValue::WeakAnyLinkage,
ProfileNameConst, INSTR_PROF_QUOTE(INSTR_PROF_PROFILE_NAME_VAR));
- Triple TT(M->getTargetTriple());
if (TT.supportsCOMDAT()) {
ProfileNameVar->setLinkage(GlobalValue::ExternalLinkage);
ProfileNameVar->setComdat(M->getOrInsertComdat(
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
index 2963d08..7bb62d2 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -63,6 +63,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializePGOInstrumentationGenLegacyPassPass(Registry);
initializePGOInstrumentationUseLegacyPassPass(Registry);
initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+ initializePGOMemOPSizeOptLegacyPassPass(Registry);
initializeInstrProfilingLegacyPassPass(Registry);
initializeMemorySanitizerPass(Registry);
initializeThreadSanitizerPass(Registry);
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
index 363539b..4eb758c 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
-#define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/IR/BasicBlock.h"
@@ -108,4 +108,4 @@ namespace llvm {
} // End llvm namespace
-#endif
+#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_MAXIMUMSPANNINGTREE_H
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index fafb0fc..b7c6271 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -425,7 +425,7 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
// which is not yet implemented.
StringRef WarningFnName = Recover ? "__msan_warning"
: "__msan_warning_noreturn";
- WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), nullptr);
+ WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
AccessSizeIndex++) {
@@ -433,31 +433,31 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
- IRB.getInt32Ty(), nullptr);
+ IRB.getInt32Ty());
FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
- IRB.getInt8PtrTy(), IRB.getInt32Ty(), nullptr);
+ IRB.getInt8PtrTy(), IRB.getInt32Ty());
}
MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
"__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
- IRB.getInt8PtrTy(), IntptrTy, nullptr);
+ IRB.getInt8PtrTy(), IntptrTy);
MsanPoisonStackFn =
M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr);
+ IRB.getInt8PtrTy(), IntptrTy);
MsanChainOriginFn = M.getOrInsertFunction(
- "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty(), nullptr);
+ "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
MemmoveFn = M.getOrInsertFunction(
"__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr);
+ IRB.getInt8PtrTy(), IntptrTy);
MemcpyFn = M.getOrInsertFunction(
"__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IntptrTy, nullptr);
+ IntptrTy);
MemsetFn = M.getOrInsertFunction(
"__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
- IntptrTy, nullptr);
+ IntptrTy);
// Create globals.
RetvalTLS = new GlobalVariable(
@@ -1037,15 +1037,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
OriginMap[V] = Origin;
}
+ Constant *getCleanShadow(Type *OrigTy) {
+ Type *ShadowTy = getShadowTy(OrigTy);
+ if (!ShadowTy)
+ return nullptr;
+ return Constant::getNullValue(ShadowTy);
+ }
+
/// \brief Create a clean shadow value for a given value.
///
/// Clean shadow (all zeroes) means all bits of the value are defined
/// (initialized).
Constant *getCleanShadow(Value *V) {
- Type *ShadowTy = getShadowTy(V);
- if (!ShadowTy)
- return nullptr;
- return Constant::getNullValue(ShadowTy);
+ return getCleanShadow(V->getType());
}
/// \brief Create a dirty shadow of a given shadow type.
@@ -1572,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
bool Signed = false) {
Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
return IRB.CreateIntCast(V, dstTy, Signed);
if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
return IRB.CreateIntCast(V, dstTy, Signed);
- size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
- size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
Value *V2 =
IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
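The early exit above collapses a wide shadow to a single clean/poisoned bit with an integer compare against the all-zero clean shadow; a plain truncation would keep only bit 0 and drop poison carried in higher bits. A standalone illustration:

#include <cstdint>
#include <cstdio>
int main() {
  uint32_t Shadow = 0x00000100;           // one poisoned bit, not bit 0
  bool ByTrunc = (Shadow & 1u) != 0;      // what a trunc to i1 would keep
  bool ByCmp = Shadow != 0;               // what the ICmpNE computes
  std::printf("%d %d\n", ByTrunc, ByCmp); // 0 1
}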
@@ -1942,7 +1949,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
- // FIXME: use ClStoreCleanOrigin
// FIXME: factor out common code from materializeStores
if (MS.TrackOrigins)
IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB, 1));
@@ -2081,6 +2087,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
switch (I.getNumArgOperands()) {
case 3:
assert(isa<ConstantInt>(I.getArgOperand(2)) && "Invalid rounding mode");
+ LLVM_FALLTHROUGH;
case 2:
CopyOp = I.getArgOperand(0);
ConvertOp = I.getArgOperand(1);
@@ -2325,11 +2332,49 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ void handleStmxcsr(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ Value *ShadowPtr = getShadowPtr(Addr, Ty, IRB);
+
+ IRB.CreateStore(getCleanShadow(Ty),
+ IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+ }
+
+ void handleLdmxcsr(IntrinsicInst &I) {
+ if (!InsertChecks) return;
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ unsigned Alignment = 1;
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ Value *Shadow = IRB.CreateAlignedLoad(getShadowPtr(Addr, Ty, IRB),
+ Alignment, "_ldmxcsr");
+ Value *Origin = MS.TrackOrigins
+ ? IRB.CreateLoad(getOriginPtr(Addr, IRB, Alignment))
+ : getCleanOrigin();
+ insertShadowCheck(Shadow, Origin, &I);
+ }
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case llvm::Intrinsic::bswap:
handleBswap(I);
break;
+ case llvm::Intrinsic::x86_sse_stmxcsr:
+ handleStmxcsr(I);
+ break;
+ case llvm::Intrinsic::x86_sse_ldmxcsr:
+ handleLdmxcsr(I);
+ break;
case llvm::Intrinsic::x86_avx512_vcvtsd2usi64:
case llvm::Intrinsic::x86_avx512_vcvtsd2usi32:
case llvm::Intrinsic::x86_avx512_vcvtss2usi64:
@@ -2566,10 +2611,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone);
- Func->removeAttributes(AttributeSet::FunctionIndex,
- AttributeSet::get(Func->getContext(),
- AttributeSet::FunctionIndex,
- B));
+ Func->removeAttributes(AttributeList::FunctionIndex, B);
}
maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
@@ -2597,12 +2639,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
" Shadow: " << *ArgShadow << "\n");
bool ArgIsInitialized = false;
const DataLayout &DL = F.getParent()->getDataLayout();
- if (CS.paramHasAttr(i + 1, Attribute::ByVal)) {
+ if (CS.paramHasAttr(i, Attribute::ByVal)) {
assert(A->getType()->isPointerTy() &&
"ByVal argument is not a pointer!");
Size = DL.getTypeAllocSize(A->getType()->getPointerElementType());
if (ArgOffset + Size > kParamTLSSize) break;
- unsigned ParamAlignment = CS.getParamAlignment(i + 1);
+ unsigned ParamAlignment = CS.getParamAlignment(i);
unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment);
Store = IRB.CreateMemCpy(ArgShadowBase,
getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB),
@@ -2690,7 +2732,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
} else {
Value *Shadow = getShadow(RetVal);
IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
- // FIXME: make it conditional if ClStoreCleanOrigin==0
if (MS.TrackOrigins)
IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
}
@@ -2717,15 +2758,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getCleanOrigin());
IRBuilder<> IRB(I.getNextNode());
const DataLayout &DL = F.getParent()->getDataLayout();
- uint64_t Size = DL.getTypeAllocSize(I.getAllocatedType());
+ uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
+ Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
+ if (I.isArrayAllocation())
+ Len = IRB.CreateMul(Len, I.getArraySize());
if (PoisonStack && ClPoisonStackWithCall) {
IRB.CreateCall(MS.MsanPoisonStackFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
- ConstantInt::get(MS.IntptrTy, Size)});
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
} else {
Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
- IRB.CreateMemSet(ShadowBase, PoisonValue, Size, I.getAlignment());
+ IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment());
}
if (PoisonStack && MS.TrackOrigins) {
@@ -2742,8 +2785,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
StackDescription.str());
IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
- ConstantInt::get(MS.IntptrTy, Size),
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
IRB.CreatePointerCast(&F, MS.IntptrTy)});
}
@@ -2876,8 +2918,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (ClDumpStrictInstructions)
dumpInst(I);
DEBUG(dbgs() << "DEFAULT: " << I << "\n");
- for (size_t i = 0, n = I.getNumOperands(); i < n; i++)
- insertShadowCheck(I.getOperand(i), &I);
+ for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
+ Value *Operand = I.getOperand(i);
+ if (Operand->getType()->isSized())
+ insertShadowCheck(Operand, &I);
+ }
setShadow(&I, getCleanShadow(&I));
setOrigin(&I, getCleanOrigin());
}
@@ -2935,7 +2980,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
Value *A = *ArgIt;
unsigned ArgNo = CS.getArgumentNo(ArgIt);
bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
- bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal);
+ bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
if (IsByVal) {
// ByVal arguments always go to the overflow area.
// Fixed arguments passed through the overflow area will be stepped
@@ -2994,7 +3039,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
}
void visitVAStartInst(VAStartInst &I) override {
- if (F.getCallingConv() == CallingConv::X86_64_Win64)
+ if (F.getCallingConv() == CallingConv::Win64)
return;
IRBuilder<> IRB(&I);
VAStartInstrumentationList.push_back(&I);
@@ -3008,7 +3053,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
}
void visitVACopyInst(VACopyInst &I) override {
- if (F.getCallingConv() == CallingConv::X86_64_Win64)
+ if (F.getCallingConv() == CallingConv::Win64)
return;
IRBuilder<> IRB(&I);
Value *VAListTag = I.getArgOperand(0);
@@ -3456,12 +3501,12 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Value *A = *ArgIt;
unsigned ArgNo = CS.getArgumentNo(ArgIt);
bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
- bool IsByVal = CS.paramHasAttr(ArgNo + 1, Attribute::ByVal);
+ bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
if (IsByVal) {
assert(A->getType()->isPointerTy());
Type *RealTy = A->getType()->getPointerElementType();
uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
- uint64_t ArgAlign = CS.getParamAlignment(ArgNo + 1);
+ uint64_t ArgAlign = CS.getParamAlignment(ArgNo);
if (ArgAlign < 8)
ArgAlign = 8;
VAArgOffset = alignTo(VAArgOffset, ArgAlign);
@@ -3618,9 +3663,7 @@ bool MemorySanitizer::runOnFunction(Function &F) {
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone);
- F.removeAttributes(AttributeSet::FunctionIndex,
- AttributeSet::get(F.getContext(),
- AttributeSet::FunctionIndex, B));
+ F.removeAttributes(AttributeList::FunctionIndex, B);
return Visitor.runOnFunction();
}
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 04f9a64..8e4bfc0 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -58,8 +58,10 @@
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/IndirectCallSiteVisitor.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -71,7 +73,9 @@
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/ProfileData/ProfileCommon.h"
#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/JamCRC.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -87,6 +91,7 @@ using namespace llvm;
STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
STATISTIC(NumOfPGOSelectInsts, "Number of select instruction instrumented.");
+STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
STATISTIC(NumOfPGOEdge, "Number of edges.");
STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
@@ -116,6 +121,13 @@ static cl::opt<unsigned> MaxNumAnnotations(
cl::desc("Max number of annotations for a single indirect "
"call callsite"));
+// Command line option to set the maximum number of value annotations
+// to write to the metadata for a single memop intrinsic.
+static cl::opt<unsigned> MaxNumMemOPAnnotations(
+ "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of preicise value annotations for a single memop"
+ "intrinsic"));
+
// Command line option to control appending FunctionHash to the name of a COMDAT
// function. This is to avoid the hash mismatch caused by the preinliner.
static cl::opt<bool> DoComdatRenaming(
@@ -125,26 +137,102 @@ static cl::opt<bool> DoComdatRenaming(
// Command line option to enable/disable the warning about missing profile
// information.
-static cl::opt<bool> PGOWarnMissing("pgo-warn-missing-function",
- cl::init(false),
- cl::Hidden);
+static cl::opt<bool>
+ PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "warnings about missing profile data for "
+ "functions."));
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data.
-static cl::opt<bool> NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false),
- cl::Hidden);
+static cl::opt<bool>
+ NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on "
+ "warnings about profile cfg mismatch."));
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data for Comdat functions, which often turns out to be false
// positive due to the pre-instrumentation inline.
-static cl::opt<bool> NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat",
- cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
+ cl::Hidden,
+ cl::desc("The option is used to turn on/off "
+ "warnings about hash mismatch for comdat "
+ "functions."));
// Command line option to enable/disable select instruction instrumentation.
-static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true),
- cl::Hidden);
+static cl::opt<bool>
+ PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off SELECT "
+ "instruction instrumentation. "));
+
+// Command line option to turn on CFG dot dump of raw profile counts
+static cl::opt<bool>
+ PGOViewRawCounts("pgo-view-raw-counts", cl::init(false), cl::Hidden,
+ cl::desc("A boolean option to show CFG dag "
+ "with raw profile counts from "
+ "profile data. See also option "
+ "-pgo-view-counts. To limit graph "
+ "display to only one function, use "
+ "filtering option -view-bfi-func-name."));
+
+// Command line option to enable/disable memop intrinsic call.size profiling.
+static cl::opt<bool>
+ PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "memory intrinsic size profiling."));
+
+// Emit branch probability as optimization remarks.
+static cl::opt<bool>
+ EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
+ cl::desc("When this option is on, the annotated "
+ "branch probability will be emitted as "
+ " optimization remarks: -Rpass-analysis="
+ "pgo-instr-use"));
+
+// Command line option to turn on CFG dot dump after profile annotation.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
+extern cl::opt<bool> PGOViewCounts;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
namespace {
+// Return a string describing the branch condition that can be
+// used in static branch probability heuristics:
+std::string getBranchCondString(Instruction *TI) {
+ BranchInst *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ return std::string();
+
+ Value *Cond = BI->getCondition();
+ ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+ if (!CI)
+ return std::string();
+
+ std::string result;
+ raw_string_ostream OS(result);
+ OS << CmpInst::getPredicateName(CI->getPredicate()) << "_";
+ CI->getOperand(0)->getType()->print(OS, true);
+
+ Value *RHS = CI->getOperand(1);
+ ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
+ if (CV) {
+ if (CV->isZero())
+ OS << "_Zero";
+ else if (CV->isOne())
+ OS << "_One";
+ else if (CV->isMinusOne())
+ OS << "_MinusOne";
+ else
+ OS << "_Const";
+ }
+ OS.flush();
+ return result;
+}
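For a branch such as if (x == 0) on an i32 value, the function above produces a string of the shape "eq_i32_Zero". A standalone sketch of the same formatting, assuming the predicate name and type print as LLVM renders them:

#include <cstdio>
#include <string>
int main() {
  std::string Pred = "eq";                      // CmpInst::getPredicateName
  std::string Ty = "i32";                       // Type::print output
  std::string Cond = Pred + "_" + Ty + "_Zero"; // RHS constant was 0
  std::printf("%s\n", Cond.c_str());            // eq_i32_Zero
}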
+
/// The select instruction visitor plays three roles specified
/// by the mode. In \c VM_counting mode, it simply counts the number of
/// select instructions. In \c VM_instrument mode, it inserts code to count
@@ -167,6 +255,7 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
SelectInstVisitor(Function &Func) : F(Func) {}
void countSelects(Function &Func) {
+ NSIs = 0;
Mode = VM_counting;
visit(Func);
}
@@ -196,9 +285,54 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
void annotateOneSelectInst(SelectInst &SI);
// Visit \p SI instruction and perform tasks according to visit mode.
void visitSelectInst(SelectInst &SI);
+ // Return the number of select instructions. This needs to be called after
+ // countSelects().
unsigned getNumOfSelectInsts() const { return NSIs; }
};
+/// Instruction Visitor class to visit memory intrinsic calls.
+struct MemIntrinsicVisitor : public InstVisitor<MemIntrinsicVisitor> {
+ Function &F;
+ unsigned NMemIs = 0; // Number of memIntrinsics instrumented.
+ VisitMode Mode = VM_counting; // Visiting mode.
+ unsigned CurCtrId = 0; // Current counter index.
+ unsigned TotalNumCtrs = 0; // Total number of counters
+ GlobalVariable *FuncNameVar = nullptr;
+ uint64_t FuncHash = 0;
+ PGOUseFunc *UseFunc = nullptr;
+ std::vector<Instruction *> Candidates;
+
+ MemIntrinsicVisitor(Function &Func) : F(Func) {}
+
+ void countMemIntrinsics(Function &Func) {
+ NMemIs = 0;
+ Mode = VM_counting;
+ visit(Func);
+ }
+
+ void instrumentMemIntrinsics(Function &Func, unsigned TotalNC,
+ GlobalVariable *FNV, uint64_t FHash) {
+ Mode = VM_instrument;
+ TotalNumCtrs = TotalNC;
+ FuncHash = FHash;
+ FuncNameVar = FNV;
+ visit(Func);
+ }
+
+ std::vector<Instruction *> findMemIntrinsics(Function &Func) {
+ Candidates.clear();
+ Mode = VM_annotate;
+ visit(Func);
+ return Candidates;
+ }
+
+ // Instrument one memory intrinsic call instruction.
+ void instrumentOneMemIntrinsic(MemIntrinsic &MI);
+ // Visit \p MI instruction and perform tasks according to visit mode.
+ void visitMemIntrinsic(MemIntrinsic &SI);
+ unsigned getNumOfMemIntrinsics() const { return NMemIs; }
+};
+
class PGOInstrumentationGenLegacyPass : public ModulePass {
public:
static char ID;
@@ -316,8 +450,9 @@ private:
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
public:
- std::vector<Instruction *> IndirectCallSites;
+ std::vector<std::vector<Instruction *>> ValueSites;
SelectInstVisitor SIVisitor;
+ MemIntrinsicVisitor MIVisitor;
std::string FuncName;
GlobalVariable *FuncNameVar;
// CFG hash value for this function.
@@ -347,13 +482,16 @@ public:
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
BlockFrequencyInfo *BFI = nullptr)
- : F(Func), ComdatMembers(ComdatMembers), SIVisitor(Func), FunctionHash(0),
- MST(F, BPI, BFI) {
+ : F(Func), ComdatMembers(ComdatMembers), ValueSites(IPVK_Last + 1),
+ SIVisitor(Func), MIVisitor(Func), FunctionHash(0), MST(F, BPI, BFI) {
// This should be done before CFG hash computation.
SIVisitor.countSelects(Func);
+ MIVisitor.countMemIntrinsics(Func);
NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
- IndirectCallSites = findIndirectCallSites(Func);
+ NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics();
+ ValueSites[IPVK_IndirectCallTarget] = findIndirectCallSites(Func);
+ ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func);
FuncName = getPGOFuncName(F);
computeCFGHash();
@@ -405,7 +543,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
}
JC.update(Indexes);
FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
- (uint64_t)IndirectCallSites.size() << 48 |
+ (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
(uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
}
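The hash packs three small counts into the upper bytes of a 64-bit value on top of the CRC of the edge indexes. A standalone check of the layout with made-up counts:

#include <cstdint>
#include <cstdio>
int main() {
  uint64_t NumSelects = 2, NumIndirectCalls = 3, NumEdges = 17;
  uint64_t CRC = 0xDEADBEEF;          // stands in for JamCRC over Indexes
  uint64_t FunctionHash = NumSelects << 56 | NumIndirectCalls << 48 |
                          NumEdges << 32 | CRC;
  std::printf("%016llx\n", (unsigned long long)FunctionHash);
  // 02030011deadbeef: selects | icalls | edges | crc
}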
@@ -552,7 +690,7 @@ static void instrumentOneFunc(
return;
unsigned NumIndirectCallSites = 0;
- for (auto &I : FuncInfo.IndirectCallSites) {
+ for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) {
CallSite CS(I);
Value *Callee = CS.getCalledValue();
DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
@@ -565,10 +703,14 @@ static void instrumentOneFunc(
{llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
Builder.getInt64(FuncInfo.FunctionHash),
Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()),
- Builder.getInt32(llvm::InstrProfValueKind::IPVK_IndirectCallTarget),
+ Builder.getInt32(IPVK_IndirectCallTarget),
Builder.getInt32(NumIndirectCallSites++)});
}
NumOfPGOICall += NumIndirectCallSites;
+
+ // Now instrument memop intrinsic calls.
+ FuncInfo.MIVisitor.instrumentMemIntrinsics(
+ F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash);
}
// This class represents a CFG edge in profile use compilation.
@@ -653,8 +795,11 @@ public:
// Set the branch weights based on the count values.
void setBranchWeights();
- // Annotate the indirect call sites.
- void annotateIndirectCallSites();
+ // Annotate the value profile call sites for all value kinds.
+ void annotateValueSites();
+
+ // Annotate the value profile call sites for one value kind.
+ void annotateValueSites(uint32_t Kind);
// The hotness of the function from the profile count.
enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
@@ -677,6 +822,8 @@ public:
return FuncInfo.findBBInfo(BB);
}
+ Function &getFunc() const { return F; }
+
private:
Function &F;
Module *M;
@@ -761,7 +908,7 @@ void PGOUseFunc::setInstrumentedCounts(
NewEdge1.InMST = true;
getBBInfo(InstrBB).setBBInfoCount(CountValue);
}
- ProfileCountSize = CountFromProfile.size();
+ ProfileCountSize = CountFromProfile.size();
CountPosition = I;
}
@@ -932,21 +1079,6 @@ void PGOUseFunc::populateCounters() {
DEBUG(FuncInfo.dumpInfo("after reading profile."));
}
-static void setProfMetadata(Module *M, Instruction *TI,
- ArrayRef<uint64_t> EdgeCounts, uint64_t MaxCount) {
- MDBuilder MDB(M->getContext());
- assert(MaxCount > 0 && "Bad max count");
- uint64_t Scale = calculateCountScale(MaxCount);
- SmallVector<unsigned, 4> Weights;
- for (const auto &ECI : EdgeCounts)
- Weights.push_back(scaleBranchCount(ECI, Scale));
-
- DEBUG(dbgs() << "Weight is: ";
- for (const auto &W : Weights) { dbgs() << W << " "; }
- dbgs() << "\n";);
- TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
-}
-
// Assign the scaled count values to the BB with multiple out edges.
void PGOUseFunc::setBranchWeights() {
// Generate MD_prof metadata for every branch instruction.
@@ -990,8 +1122,8 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
Builder.CreateCall(
Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
{llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncHash),
- Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
+ Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
+ Builder.getInt32(*CurCtrIdx), Step});
++(*CurCtrIdx);
}
@@ -1020,9 +1152,9 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
if (SI.getCondition()->getType()->isVectorTy())
return;
- NSIs++;
switch (Mode) {
case VM_counting:
+ NSIs++;
return;
case VM_instrument:
instrumentOneSelectInst(SI);
@@ -1035,35 +1167,79 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
llvm_unreachable("Unknown visiting mode");
}
-// Traverse all the indirect callsites and annotate the instructions.
-void PGOUseFunc::annotateIndirectCallSites() {
+void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(&MI);
+ Type *Int64Ty = Builder.getInt64Ty();
+ Type *I8PtrTy = Builder.getInt8PtrTy();
+ Value *Length = MI.getLength();
+ assert(!isa<ConstantInt>(Length));
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+ {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty),
+ Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)});
+ ++CurCtrId;
+}
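The call emitted here is the generic value-profiling intrinsic, which in textual IR reads roughly "call void @llvm.instrprof.value.profile(i8* <FuncNameVar>, i64 <FuncHash>, i64 %len, i32 <kind>, i32 <CurCtrId>)", where %len is the dynamic length zero-extended or truncated to i64 and <kind> is IPVK_MemOPSize's enumerator value. The profiling runtime then aggregates the observed length values per call site, which the use phase reads back in annotateValueSites below.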
+
+void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) {
+ if (!PGOInstrMemOP)
+ return;
+ Value *Length = MI.getLength();
+ // Do not instrument constant-length calls.
+ if (isa<ConstantInt>(Length))
+ return;
+
+ switch (Mode) {
+ case VM_counting:
+ NMemIs++;
+ return;
+ case VM_instrument:
+ instrumentOneMemIntrinsic(MI);
+ return;
+ case VM_annotate:
+ Candidates.push_back(&MI);
+ return;
+ }
+ llvm_unreachable("Unknown visiting mode");
+}
+
+// Traverse all value sites and annotate the instructions for all value kinds.
+void PGOUseFunc::annotateValueSites() {
if (DisableValueProfiling)
return;
// Create the PGOFuncName meta data.
createPGOFuncNameMetadata(F, FuncInfo.FuncName);
- unsigned IndirectCallSiteIndex = 0;
- auto &IndirectCallSites = FuncInfo.IndirectCallSites;
- unsigned NumValueSites =
- ProfileRecord.getNumValueSites(IPVK_IndirectCallTarget);
- if (NumValueSites != IndirectCallSites.size()) {
- std::string Msg =
- std::string("Inconsistent number of indirect call sites: ") +
- F.getName().str();
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ annotateValueSites(Kind);
+}
+
+// Annotate the instructions for a specific value kind.
+void PGOUseFunc::annotateValueSites(uint32_t Kind) {
+ unsigned ValueSiteIndex = 0;
+ auto &ValueSites = FuncInfo.ValueSites[Kind];
+ unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+ if (NumValueSites != ValueSites.size()) {
auto &Ctx = M->getContext();
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ M->getName().data(),
+ Twine("Inconsistent number of value sites for kind = ") + Twine(Kind) +
+ " in " + F.getName().str(),
+ DS_Warning));
return;
}
- for (auto &I : IndirectCallSites) {
- DEBUG(dbgs() << "Read one indirect call instrumentation: Index="
- << IndirectCallSiteIndex << " out of " << NumValueSites
- << "\n");
- annotateValueSite(*M, *I, ProfileRecord, IPVK_IndirectCallTarget,
- IndirectCallSiteIndex, MaxNumAnnotations);
- IndirectCallSiteIndex++;
+ for (auto &I : ValueSites) {
+ DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+ << "): Index = " << ValueSiteIndex << " out of "
+ << NumValueSites << "\n");
+ annotateValueSite(*M, *I, ProfileRecord,
+ static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+ Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
+ : MaxNumAnnotations);
+ ValueSiteIndex++;
}
}
} // end anonymous namespace
@@ -1196,12 +1372,29 @@ static bool annotateAllFunctions(
continue;
Func.populateCounters();
Func.setBranchWeights();
- Func.annotateIndirectCallSites();
+ Func.annotateValueSites();
PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
if (FreqAttr == PGOUseFunc::FFA_Cold)
ColdFunctions.push_back(&F);
else if (FreqAttr == PGOUseFunc::FFA_Hot)
HotFunctions.push_back(&F);
+ if (PGOViewCounts && (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ LoopInfo LI{DominatorTree(F)};
+ std::unique_ptr<BranchProbabilityInfo> NewBPI =
+ llvm::make_unique<BranchProbabilityInfo>(F, LI);
+ std::unique_ptr<BlockFrequencyInfo> NewBFI =
+ llvm::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
+
+ NewBFI->view();
+ }
+ if (PGOViewRawCounts && (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ if (ViewBlockFreqFuncName.empty())
+ WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ else
+ ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ }
}
M.setProfileSummary(PGOReader->getSummary().getMD(M.getContext()));
// Set function hotness attribute from the profile.
@@ -1257,3 +1450,113 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
return annotateAllFunctions(M, ProfileFileName, LookupBPI, LookupBFI);
}
+
+namespace llvm {
+void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
+ uint64_t MaxCount) {
+ MDBuilder MDB(M->getContext());
+ assert(MaxCount > 0 && "Bad max count");
+ uint64_t Scale = calculateCountScale(MaxCount);
+ SmallVector<unsigned, 4> Weights;
+ for (const auto &ECI : EdgeCounts)
+ Weights.push_back(scaleBranchCount(ECI, Scale));
+
+ DEBUG(dbgs() << "Weight is: ";
+ for (const auto &W : Weights) { dbgs() << W << " "; }
+ dbgs() << "\n";);
+ TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ if (EmitBranchProbability) {
+ std::string BrCondStr = getBranchCondString(TI);
+ if (BrCondStr.empty())
+ return;
+
+ unsigned WSum =
+ std::accumulate(Weights.begin(), Weights.end(), 0U,
+ [](unsigned w1, unsigned w2) { return w1 + w2; });
+ uint64_t TotalCount =
+ std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), uint64_t{0},
+ [](uint64_t c1, uint64_t c2) { return c1 + c2; });
+ BranchProbability BP(Weights[0], WSum);
+ std::string BranchProbStr;
+ raw_string_ostream OS(BranchProbStr);
+ OS << BP;
+ OS << " (total count : " << TotalCount << ")";
+ OS.flush();
+ Function *F = TI->getParent()->getParent();
+ emitOptimizationRemarkAnalysis(
+ F->getContext(), "pgo-use-annot", *F, TI->getDebugLoc(),
+ Twine(BrCondStr) +
+ " is true with probability : " + Twine(BranchProbStr));
+ }
+}
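The scaling exists because branch weights are 32-bit while edge counts are 64-bit. A standalone sketch with hypothetical stand-ins for calculateCountScale and scaleBranchCount (the real helpers live elsewhere in the tree; the point is that scaling preserves the ratio between edges):

  #include <cstdint>
  #include <cstdio>
  #include <vector>
  static uint64_t countScale(uint64_t MaxCount) { // hypothetical stand-in
    return MaxCount < UINT32_MAX ? 1 : MaxCount / UINT32_MAX + 1;
  }
  static unsigned scaleCount(uint64_t Count, uint64_t Scale) {
    return static_cast<unsigned>(Count / Scale);
  }
  int main() {
    std::vector<uint64_t> EdgeCounts = {8000000000ULL, 2000000000ULL};
    uint64_t Scale = countScale(8000000000ULL); // max of the counts
    for (uint64_t C : EdgeCounts)
      std::printf("%u\n", scaleCount(C, Scale)); // the 4:1 ratio survives
    return 0;
  }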
+
+template <> struct GraphTraits<PGOUseFunc *> {
+ typedef const BasicBlock *NodeRef;
+ typedef succ_const_iterator ChildIteratorType;
+ typedef pointer_iterator<Function::const_iterator> nodes_iterator;
+
+ static NodeRef getEntryNode(const PGOUseFunc *G) {
+ return &G->getFunc().front();
+ }
+ static ChildIteratorType child_begin(const NodeRef N) {
+ return succ_begin(N);
+ }
+ static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
+ static nodes_iterator nodes_begin(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().begin());
+ }
+ static nodes_iterator nodes_end(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().end());
+ }
+};
+
+static std::string getSimpleNodeName(const BasicBlock *Node) {
+ if (!Node->getName().empty())
+ return Node->getName();
+
+ std::string SimpleNodeName;
+ raw_string_ostream OS(SimpleNodeName);
+ Node->printAsOperand(OS, false);
+ return OS.str();
+}
+
+template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const PGOUseFunc *G) {
+ return G->getFunc().getName();
+ }
+
+ std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+
+ OS << getSimpleNodeName(Node) << ":\\l";
+ UseBBInfo *BI = Graph->findBBInfo(Node);
+ OS << "Count : ";
+ if (BI && BI->CountValid)
+ OS << BI->CountValue << "\\l";
+ else
+ OS << "Unknown\\l";
+
+ if (!PGOInstrSelect)
+ return Result;
+
+ for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
+ auto *I = &*BI;
+ if (!isa<SelectInst>(I))
+ continue;
+ // Display scaled counts for SELECT instruction:
+ OS << "SELECT : { T = ";
+ uint64_t TC, FC;
+ bool HasProf = I->extractProfMetadata(TC, FC);
+ if (!HasProf)
+ OS << "Unknown, F = Unknown }\\l";
+ else
+ OS << TC << ", F = " << FC << " }\\l";
+ }
+ return Result;
+ }
+};
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
new file mode 100644
index 0000000..0bc9ddf
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -0,0 +1,419 @@
+//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
+//
+//===----------------------------------------------------------------------===//
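A source-level sketch of the rewrite this pass performs (illustrative only: the pass works on IR, and the guarded size below is a hypothetical "hottest" value from the profile):

  #include <cstddef>
  #include <cstring>
  void copy(char *dst, const char *src, std::size_t n) {
    if (n == 8)                  // hottest size from the value profile
      std::memcpy(dst, src, 8);  // constant size: expandable inline later
    else
      std::memcpy(dst, src, n);  // default case keeps the original call
  }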
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/PassSupport.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+ cl::init(1000),
+ cl::desc("The minimum count to optimize memory "
+ "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default
+// is false. This is for debugging purposes.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+ cl::Hidden, cl::desc("Disable the memop size optimization"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("The percentage threshold for the "
+ "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+ MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("The max version for the optimized memory "
+ " intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+ MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+ cl::desc("Scale the memop size counts using the basic "
+ " block count value"));
+
+// This option sets the range of precise profile memop sizes.
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes.
+extern cl::opt<unsigned> MemOPSizeLarge;
+
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+ initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+ return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+ MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
+ : Func(Func), BFI(BFI), Changed(false) {
+ ValueDataArray =
+ llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+ // Get the MemOPSize range information from the MemOPSizeRange option.
+ getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+ PreciseRangeLast);
+ }
+ bool isChanged() const { return Changed; }
+ void perform() {
+ WorkList.clear();
+ visit(Func);
+
+ for (auto &MI : WorkList) {
+ ++NumOfPGOMemOPAnnotate;
+ if (perform(MI)) {
+ Changed = true;
+ ++NumOfPGOMemOPOpt;
+ DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
+ << "is Transformed.\n");
+ }
+ }
+ }
+
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+ // Do not perform the optimization on constant-length calls.
+ if (isa<ConstantInt>(Length))
+ return;
+ WorkList.push_back(&MI);
+ }
+
+private:
+ Function &Func;
+ BlockFrequencyInfo &BFI;
+ bool Changed;
+ std::vector<MemIntrinsic *> WorkList;
+ // Start of the precise range.
+ int64_t PreciseRangeStart;
+ // Last value of the precise range.
+ int64_t PreciseRangeLast;
+ // The space to read the profile annotation.
+ std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+ bool perform(MemIntrinsic *MI);
+
+ // This kind shows which group the value falls in. For PreciseValue, we have
+ // the profile count for that value. LargeGroup groups the values that are in
+ // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
+ enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
+
+ MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
+ if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
+ return LargeGroup;
+ if (Value == PreciseRangeLast + 1)
+ return NonLargeGroup;
+ return PreciseValue;
+ }
+};
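To make the grouping concrete, here is a standalone classifier with hypothetical parameters, a precise range of [0, 8] and a large-size sentinel of 4096 (the real values come from the MemOPSizeRange and MemOPSizeLarge options above):

  #include <cstdint>
  enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
  static MemOPSizeKind classify(int64_t V) {
    const int64_t PreciseRangeLast = 8, SizeLarge = 4096;
    if (V == SizeLarge)            // all sizes >= 4096 were folded into 4096
      return LargeGroup;
    if (V == PreciseRangeLast + 1) // 9 stands for the non-large remainder
      return NonLargeGroup;
    return PreciseValue;           // 0..8 each carry an exact count
  }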
+
+static const char *getMIName(const MemIntrinsic *MI) {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ return "memcpy";
+ case Intrinsic::memmove:
+ return "memmove";
+ case Intrinsic::memset:
+ return "memset";
+ default:
+ return "unknown";
+ }
+}
+
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+ assert(Count <= TotalCount);
+ if (Count < MemOPCountThreshold)
+ return false;
+ if (Count < TotalCount * MemOPPercentThreshold / 100)
+ return false;
+ return true;
+}
+
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+ uint64_t Denom) {
+ if (!MemOPScaleCount)
+ return Count;
+ bool Overflowed;
+ uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
+ return ScaleCount / Denom;
+}
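Worked numbers for the two thresholds above, using the option defaults of 1000 and 40% (purely illustrative):

  #include <cassert>
  #include <cstdint>
  int main() {
    const uint64_t CountThreshold = 1000, PercentThreshold = 40;
    auto profitable = [&](uint64_t Count, uint64_t Total) {
      return Count >= CountThreshold &&
             Count >= Total * PercentThreshold / 100;
    };
    assert(profitable(4500, 10000));  // 45% of the total: passes both tests
    assert(!profitable(3000, 10000)); // 30% < 40%: fails the percent test
    return 0;
  }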
+
+bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
+ assert(MI);
+ if (MI->getIntrinsicID() == Intrinsic::memmove)
+ return false;
+
+ uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+ uint64_t TotalCount;
+ if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+ ValueDataArray.get(), NumVals, TotalCount))
+ return false;
+
+ uint64_t ActualCount = TotalCount;
+ uint64_t SavedTotalCount = TotalCount;
+ if (MemOPScaleCount) {
+ auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+ if (!BBEdgeCount)
+ return false;
+ ActualCount = *BBEdgeCount;
+ }
+
+ ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+ DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
+ << "\n");
+ DEBUG(
+ for (auto &VD
+ : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
+
+ if (ActualCount < MemOPCountThreshold)
+ return false;
+ // Skip if the total value profiled count is 0, in which case we can't
+ // scale up the counts properly (and there is no profitable transformation).
+ if (TotalCount == 0)
+ return false;
+
+ TotalCount = ActualCount;
+ if (MemOPScaleCount)
+ DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
+
+ // Keep track of the count of the default case:
+ uint64_t RemainCount = TotalCount;
+ uint64_t SavedRemainCount = SavedTotalCount;
+ SmallVector<uint64_t, 16> SizeIds;
+ SmallVector<uint64_t, 16> CaseCounts;
+ uint64_t MaxCount = 0;
+ unsigned Version = 0;
+ // Default case is in the front -- save the slot here.
+ CaseCounts.push_back(0);
+ for (auto &VD : VDs) {
+ int64_t V = VD.Value;
+ uint64_t C = VD.Count;
+ if (MemOPScaleCount)
+ C = getScaledCount(C, ActualCount, SavedTotalCount);
+
+ // Only care about precise values here.
+ if (getMemOPSizeKind(V) != PreciseValue)
+ continue;
+
+ // The value data is sorted by count in descending order; break at the
+ // first unprofitable value.
+ if (!isProfitable(C, RemainCount))
+ break;
+
+ SizeIds.push_back(V);
+ CaseCounts.push_back(C);
+ if (C > MaxCount)
+ MaxCount = C;
+
+ assert(RemainCount >= C);
+ RemainCount -= C;
+ assert(SavedRemainCount >= VD.Count);
+ SavedRemainCount -= VD.Count;
+
+ if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+ break;
+ }
+
+ if (Version == 0)
+ return false;
+
+ CaseCounts[0] = RemainCount;
+ if (RemainCount > MaxCount)
+ MaxCount = RemainCount;
+
+ uint64_t SumForOpt = TotalCount - RemainCount;
+
+ DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions (covering " << SumForOpt << " out of "
+ << TotalCount << ")\n");
+
+ // mem_op(..., size)
+ // ==>
+ // switch (size) {
+ // case s1:
+ // mem_op(..., s1);
+ // goto merge_bb;
+ // case s2:
+ // mem_op(..., s2);
+ // goto merge_bb;
+ // ...
+ // default:
+ // mem_op(..., size);
+ // goto merge_bb;
+ // }
+ // merge_bb:
+
+ BasicBlock *BB = MI->getParent();
+ DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ DEBUG(dbgs() << *BB << "\n");
+ auto OrigBBFreq = BFI.getBlockFreq(BB);
+
+ BasicBlock *DefaultBB = SplitBlock(BB, MI);
+ BasicBlock::iterator It(*MI);
+ ++It;
+ assert(It != DefaultBB->end());
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
+ MergeBB->setName("MemOP.Merge");
+ BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
+ DefaultBB->setName("MemOP.Default");
+
+ auto &Ctx = Func.getContext();
+ IRBuilder<> IRB(BB);
+ BB->getTerminator()->eraseFromParent();
+ Value *SizeVar = MI->getLength();
+ SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+
+ // Clear the value profile data.
+ MI->setMetadata(LLVMContext::MD_prof, nullptr);
+ // If all values were promoted, we do not need the MD_prof metadata.
+ if (SavedRemainCount > 0 || Version != NumVals)
+ // Otherwise annotate the remaining (un-promoted) value records back.
+ annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
+ SavedRemainCount, IPVK_MemOPSize, NumVals);
+
+ DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+
+ for (uint64_t SizeId : SizeIds) {
+ ConstantInt *CaseSizeId = ConstantInt::get(Type::getInt64Ty(Ctx), SizeId);
+ BasicBlock *CaseBB = BasicBlock::Create(
+ Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+ Instruction *NewInst = MI->clone();
+ // Fix the length argument.
+ cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
+ CaseBB->getInstList().push_back(NewInst);
+ IRBuilder<> IRBCase(CaseBB);
+ IRBCase.CreateBr(MergeBB);
+ SI->addCase(CaseSizeId, CaseBB);
+ DEBUG(dbgs() << *CaseBB << "\n");
+ }
+ setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+ DEBUG(dbgs() << *BB << "\n");
+ DEBUG(dbgs() << *DefaultBB << "\n");
+ DEBUG(dbgs() << *MergeBB << "\n");
+
+ emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
+ MI->getDebugLoc(),
+ Twine("optimize ") + getMIName(MI) + " with count " +
+ Twine(SumForOpt) + " out of " + Twine(TotalCount) +
+ " for " + Twine(Version) + " versions");
+
+ return true;
+}
+} // namespace
+
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
+ if (DisableMemOPOPT)
+ return false;
+
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ MemOPSizeOpt MemOPSizeOpt(F, BFI);
+ MemOPSizeOpt.perform();
+ return MemOPSizeOpt.isChanged();
+}
+
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ return PGOMemOPSizeOptImpl(F, BFI);
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 5b4b1fb..06fe075 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -7,24 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// Coverage instrumentation that works with AddressSanitizer
-// and potentially with other Sanitizers.
-//
-// We create a Guard variable with the same linkage
-// as the function and inject this code into the entry block (SCK_Function)
-// or all blocks (SCK_BB):
-// if (Guard < 0) {
-// __sanitizer_cov(&Guard);
-// }
-// The accesses to Guard are atomic. The rest of the logic is
-// in __sanitizer_cov (it's fine to call it more than once).
-//
-// With SCK_Edge we also split critical edges this effectively
-// instrumenting all edges.
-//
-// This coverage implementation provides very limited data:
-// it only tells if a given function (block) was ever executed. No counters.
-// But for many use cases this is what we need and the added slowdown small.
+// Coverage instrumentation done at the LLVM IR level; works with Sanitizers.
//
//===----------------------------------------------------------------------===//
@@ -56,16 +39,8 @@ using namespace llvm;
#define DEBUG_TYPE "sancov"
-static const char *const SanCovModuleInitName = "__sanitizer_cov_module_init";
-static const char *const SanCovName = "__sanitizer_cov";
-static const char *const SanCovWithCheckName = "__sanitizer_cov_with_check";
-static const char *const SanCovIndirCallName = "__sanitizer_cov_indir_call16";
static const char *const SanCovTracePCIndirName =
"__sanitizer_cov_trace_pc_indir";
-static const char *const SanCovTraceEnterName =
- "__sanitizer_cov_trace_func_enter";
-static const char *const SanCovTraceBBName =
- "__sanitizer_cov_trace_basic_block";
static const char *const SanCovTracePCName = "__sanitizer_cov_trace_pc";
static const char *const SanCovTraceCmp1 = "__sanitizer_cov_trace_cmp1";
static const char *const SanCovTraceCmp2 = "__sanitizer_cov_trace_cmp2";
@@ -78,39 +53,34 @@ static const char *const SanCovTraceSwitchName = "__sanitizer_cov_trace_switch";
static const char *const SanCovModuleCtorName = "sancov.module_ctor";
static const uint64_t SanCtorAndDtorPriority = 2;
-static const char *const SanCovTracePCGuardSection = "__sancov_guards";
static const char *const SanCovTracePCGuardName =
"__sanitizer_cov_trace_pc_guard";
static const char *const SanCovTracePCGuardInitName =
"__sanitizer_cov_trace_pc_guard_init";
+static const char *const SanCov8bitCountersInitName =
+ "__sanitizer_cov_8bit_counters_init";
+
+static const char *const SanCovGuardsSectionName = "sancov_guards";
+static const char *const SanCovCountersSectionName = "sancov_cntrs";
static cl::opt<int> ClCoverageLevel(
"sanitizer-coverage-level",
cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
- "3: all blocks and critical edges, "
- "4: above plus indirect calls"),
+ "3: all blocks and critical edges"),
cl::Hidden, cl::init(0));
-static cl::opt<unsigned> ClCoverageBlockThreshold(
- "sanitizer-coverage-block-threshold",
- cl::desc("Use a callback with a guard check inside it if there are"
- " more than this number of blocks."),
- cl::Hidden, cl::init(500));
-
-static cl::opt<bool>
- ClExperimentalTracing("sanitizer-coverage-experimental-tracing",
- cl::desc("Experimental basic-block tracing: insert "
- "callbacks at every basic block"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClExperimentalTracePC("sanitizer-coverage-trace-pc",
- cl::desc("Experimental pc tracing"),
- cl::Hidden, cl::init(false));
+static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
+ cl::desc("Experimental pc tracing"), cl::Hidden,
+ cl::init(false));
static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
cl::desc("pc tracing with a guard"),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters",
+ cl::desc("increments 8-bit counter for every edge"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool>
ClCMPTracing("sanitizer-coverage-trace-compares",
cl::desc("Tracing of CMP and similar instructions"),
@@ -129,16 +99,6 @@ static cl::opt<bool>
cl::desc("Reduce the number of instrumented blocks"),
cl::Hidden, cl::init(true));
-// Experimental 8-bit counters used as an additional search heuristic during
-// coverage-guided fuzzing.
-// The counters are not thread-friendly:
-// - contention on these counters may cause significant slowdown;
-// - the counter updates are racy and the results may be inaccurate.
-// They are also inaccurate due to 8-bit integer overflow.
-static cl::opt<bool> ClUse8bitCounters("sanitizer-coverage-8bit-counters",
- cl::desc("Experimental 8-bit counters"),
- cl::Hidden, cl::init(false));
-
namespace {
SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) {
@@ -169,13 +129,15 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel);
Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType);
Options.IndirectCalls |= CLOpts.IndirectCalls;
- Options.TraceBB |= ClExperimentalTracing;
Options.TraceCmp |= ClCMPTracing;
Options.TraceDiv |= ClDIVTracing;
Options.TraceGep |= ClGEPTracing;
- Options.Use8bitCounters |= ClUse8bitCounters;
- Options.TracePC |= ClExperimentalTracePC;
+ Options.TracePC |= ClTracePC;
Options.TracePCGuard |= ClTracePCGuard;
+ Options.Inline8bitCounters |= ClInline8bitCounters;
+ if (!Options.TracePCGuard && !Options.TracePC && !Options.Inline8bitCounters)
+ Options.TracePCGuard = true; // TracePCGuard is default.
+ Options.NoPrune |= !ClPruneBlocks;
return Options;
}
@@ -207,90 +169,128 @@ private:
void InjectTraceForSwitch(Function &F,
ArrayRef<Instruction *> SwitchTraceTargets);
bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks);
- void CreateFunctionGuardArray(size_t NumGuards, Function &F);
- void SetNoSanitizeMetadata(Instruction *I);
- void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx,
- bool UseCalls);
- unsigned NumberOfInstrumentedBlocks() {
- return SanCovFunction->getNumUses() +
- SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() +
- SanCovTraceEnter->getNumUses();
+ GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements,
+ Function &F, Type *Ty,
+ const char *Section);
+ void CreateFunctionLocalArrays(size_t NumGuards, Function &F);
+ void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx);
+ void CreateInitCallForSection(Module &M, const char *InitFunctionName,
+ Type *Ty, const std::string &Section);
+
+ void SetNoSanitizeMetadata(Instruction *I) {
+ I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
+ MDNode::get(*C, None));
}
- Function *SanCovFunction;
- Function *SanCovWithCheckFunction;
- Function *SanCovIndirCallFunction, *SanCovTracePCIndir;
- Function *SanCovTraceEnter, *SanCovTraceBB, *SanCovTracePC, *SanCovTracePCGuard;
+
+ std::string getSectionName(const std::string &Section) const;
+ std::string getSectionStart(const std::string &Section) const;
+ std::string getSectionEnd(const std::string &Section) const;
+ Function *SanCovTracePCIndir;
+ Function *SanCovTracePC, *SanCovTracePCGuard;
Function *SanCovTraceCmpFunction[4];
Function *SanCovTraceDivFunction[2];
Function *SanCovTraceGepFunction;
Function *SanCovTraceSwitchFunction;
InlineAsm *EmptyAsm;
- Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy;
+ Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
+ *Int8Ty, *Int8PtrTy;
Module *CurModule;
+ Triple TargetTriple;
LLVMContext *C;
const DataLayout *DL;
- GlobalVariable *GuardArray;
GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
- GlobalVariable *EightBitCounterArray;
- bool HasSancovGuardsSection;
+ GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
SanitizerCoverageOptions Options;
};
} // namespace
+void SanitizerCoverageModule::CreateInitCallForSection(
+ Module &M, const char *InitFunctionName, Type *Ty,
+ const std::string &Section) {
+ IRBuilder<> IRB(M.getContext());
+ Function *CtorFunc;
+ GlobalVariable *SecStart =
+ new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr,
+ getSectionStart(Section));
+ SecStart->setVisibility(GlobalValue::HiddenVisibility);
+ GlobalVariable *SecEnd =
+ new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+ nullptr, getSectionEnd(Section));
+ SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+
+ std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, SanCovModuleCtorName, InitFunctionName, {Ty, Ty},
+ {IRB.CreatePointerCast(SecStart, Ty), IRB.CreatePointerCast(SecEnd, Ty)});
+
+ if (TargetTriple.supportsCOMDAT()) {
+ // Use comdat to dedup CtorFunc.
+ CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName));
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+ } else {
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
+}
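The runtime half of this handshake relies on the linker-synthesized bounds of the section. Sketched in plain C++ for the ELF naming used below (the function matches the sanitizer runtime's public trace-pc-guard interface, but treat this as an illustrative sketch, not the runtime itself):

  #include <cstdint>
  extern "C" {
  // ELF linkers synthesize these for a section named __sancov_guards.
  extern uint32_t __start___sancov_guards[];
  extern uint32_t __stop___sancov_guards[];
  void __sanitizer_cov_trace_pc_guard_init(uint32_t *start, uint32_t *stop);
  }
  // The module constructor emitted above effectively performs:
  static void sancov_module_ctor() {
    __sanitizer_cov_trace_pc_guard_init(__start___sancov_guards,
                                        __stop___sancov_guards);
  }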
+
bool SanitizerCoverageModule::runOnModule(Module &M) {
if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
return false;
C = &(M.getContext());
DL = &M.getDataLayout();
CurModule = &M;
- HasSancovGuardsSection = false;
+ TargetTriple = Triple(M.getTargetTriple());
+ FunctionGuardArray = nullptr;
+ Function8bitCounterArray = nullptr;
IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
IntptrPtrTy = PointerType::getUnqual(IntptrTy);
Type *VoidTy = Type::getVoidTy(*C);
IRBuilder<> IRB(*C);
- Type *Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+ Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
Int64Ty = IRB.getInt64Ty();
Int32Ty = IRB.getInt32Ty();
+ Int8Ty = IRB.getInt8Ty();
- SanCovFunction = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy, nullptr));
- SanCovWithCheckFunction = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy, nullptr));
SanCovTracePCIndir = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy, nullptr));
- SanCovIndirCallFunction =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
+ M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy));
SanCovTraceCmpFunction[0] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty(), nullptr));
+ SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty()));
SanCovTraceCmpFunction[1] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovTraceCmp2, VoidTy, IRB.getInt16Ty(),
- IRB.getInt16Ty(), nullptr));
+ IRB.getInt16Ty()));
SanCovTraceCmpFunction[2] = checkSanitizerInterfaceFunction(
M.getOrInsertFunction(SanCovTraceCmp4, VoidTy, IRB.getInt32Ty(),
- IRB.getInt32Ty(), nullptr));
+ IRB.getInt32Ty()));
SanCovTraceCmpFunction[3] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty, nullptr));
+ SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty));
SanCovTraceDivFunction[0] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceDiv4, VoidTy, IRB.getInt32Ty(), nullptr));
+ SanCovTraceDiv4, VoidTy, IRB.getInt32Ty()));
SanCovTraceDivFunction[1] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceDiv8, VoidTy, Int64Ty, nullptr));
+ SanCovTraceDiv8, VoidTy, Int64Ty));
SanCovTraceGepFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceGep, VoidTy, IntptrTy, nullptr));
+ SanCovTraceGep, VoidTy, IntptrTy));
SanCovTraceSwitchFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy, nullptr));
+ SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy));
+ // Make sure smaller parameters are zero-extended to i64 as required by the
+ // x86_64 ABI.
+ if (TargetTriple.getArch() == Triple::x86_64) {
+ for (int i = 0; i < 3; i++) {
+ SanCovTraceCmpFunction[i]->addParamAttr(0, Attribute::ZExt);
+ SanCovTraceCmpFunction[i]->addParamAttr(1, Attribute::ZExt);
+ }
+ SanCovTraceDivFunction[0]->addParamAttr(0, Attribute::ZExt);
+ }
+
// We insert an empty inline asm after cov callbacks to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
@@ -298,102 +298,19 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
/*hasSideEffects=*/true);
SanCovTracePC = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTracePCName, VoidTy, nullptr));
+ M.getOrInsertFunction(SanCovTracePCName, VoidTy));
SanCovTracePCGuard = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- SanCovTracePCGuardName, VoidTy, Int32PtrTy, nullptr));
- SanCovTraceEnter = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTraceEnterName, VoidTy, Int32PtrTy, nullptr));
- SanCovTraceBB = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(SanCovTraceBBName, VoidTy, Int32PtrTy, nullptr));
-
- // At this point we create a dummy array of guards because we don't
- // know how many elements we will need.
- Type *Int32Ty = IRB.getInt32Ty();
- Type *Int8Ty = IRB.getInt8Ty();
-
- if (!Options.TracePCGuard)
- GuardArray =
- new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
- nullptr, "__sancov_gen_cov_tmp");
- if (Options.Use8bitCounters)
- EightBitCounterArray =
- new GlobalVariable(M, Int8Ty, false, GlobalVariable::ExternalLinkage,
- nullptr, "__sancov_gen_cov_tmp");
+ SanCovTracePCGuardName, VoidTy, Int32PtrTy));
for (auto &F : M)
runOnFunction(F);
- auto N = NumberOfInstrumentedBlocks();
-
- GlobalVariable *RealGuardArray = nullptr;
- if (!Options.TracePCGuard) {
- // Now we know how many elements we need. Create an array of guards
- // with one extra element at the beginning for the size.
- Type *Int32ArrayNTy = ArrayType::get(Int32Ty, N + 1);
- RealGuardArray = new GlobalVariable(
- M, Int32ArrayNTy, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(Int32ArrayNTy), "__sancov_gen_cov");
-
- // Replace the dummy array with the real one.
- GuardArray->replaceAllUsesWith(
- IRB.CreatePointerCast(RealGuardArray, Int32PtrTy));
- GuardArray->eraseFromParent();
- }
-
- GlobalVariable *RealEightBitCounterArray;
- if (Options.Use8bitCounters) {
- // Make sure the array is 16-aligned.
- static const int CounterAlignment = 16;
- Type *Int8ArrayNTy = ArrayType::get(Int8Ty, alignTo(N, CounterAlignment));
- RealEightBitCounterArray = new GlobalVariable(
- M, Int8ArrayNTy, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(Int8ArrayNTy), "__sancov_gen_cov_counter");
- RealEightBitCounterArray->setAlignment(CounterAlignment);
- EightBitCounterArray->replaceAllUsesWith(
- IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy));
- EightBitCounterArray->eraseFromParent();
- }
-
- // Create variable for module (compilation unit) name
- Constant *ModNameStrConst =
- ConstantDataArray::getString(M.getContext(), M.getName(), true);
- GlobalVariable *ModuleName = new GlobalVariable(
- M, ModNameStrConst->getType(), true, GlobalValue::PrivateLinkage,
- ModNameStrConst, "__sancov_gen_modname");
- if (Options.TracePCGuard) {
- if (HasSancovGuardsSection) {
- Function *CtorFunc;
- std::string SectionName(SanCovTracePCGuardSection);
- GlobalVariable *Bounds[2];
- const char *Prefix[2] = {"__start_", "__stop_"};
- for (int i = 0; i < 2; i++) {
- Bounds[i] = new GlobalVariable(M, Int32PtrTy, false,
- GlobalVariable::ExternalLinkage, nullptr,
- Prefix[i] + SectionName);
- Bounds[i]->setVisibility(GlobalValue::HiddenVisibility);
- }
- std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
- M, SanCovModuleCtorName, SanCovTracePCGuardInitName,
- {Int32PtrTy, Int32PtrTy},
- {IRB.CreatePointerCast(Bounds[0], Int32PtrTy),
- IRB.CreatePointerCast(Bounds[1], Int32PtrTy)});
-
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
- }
- } else if (!Options.TracePC) {
- Function *CtorFunc;
- std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
- M, SanCovModuleCtorName, SanCovModuleInitName,
- {Int32PtrTy, IntptrTy, Int8PtrTy, Int8PtrTy},
- {IRB.CreatePointerCast(RealGuardArray, Int32PtrTy),
- ConstantInt::get(IntptrTy, N),
- Options.Use8bitCounters
- ? IRB.CreatePointerCast(RealEightBitCounterArray, Int8PtrTy)
- : Constant::getNullValue(Int8PtrTy),
- IRB.CreatePointerCast(ModuleName, Int8PtrTy)});
-
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
- }
+ if (FunctionGuardArray)
+ CreateInitCallForSection(M, SanCovTracePCGuardInitName, Int32PtrTy,
+ SanCovGuardsSectionName);
+ if (Function8bitCounterArray)
+ CreateInitCallForSection(M, SanCov8bitCountersInitName, Int8PtrTy,
+ SanCovCountersSectionName);
return true;
}
@@ -425,8 +342,10 @@ static bool isFullPostDominator(const BasicBlock *BB,
return true;
}
-static bool shouldInstrumentBlock(const Function& F, const BasicBlock *BB, const DominatorTree *DT,
- const PostDominatorTree *PDT) {
+static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
+ const DominatorTree *DT,
+ const PostDominatorTree *PDT,
+ const SanitizerCoverageOptions &Options) {
// Don't insert coverage for unreachable blocks: we will never call
// __sanitizer_cov() for them, so counting them in
// NumberOfInstrumentedBlocks() might complicate calculation of code coverage
@@ -435,10 +354,18 @@ static bool shouldInstrumentBlock(const Function& F, const BasicBlock *BB, const
if (isa<UnreachableInst>(BB->getTerminator()))
return false;
- if (!ClPruneBlocks || &F.getEntryBlock() == BB)
+ // Don't insert coverage into blocks without a valid insertion point
+ // (catchswitch blocks).
+ if (BB->getFirstInsertionPt() == BB->end())
+ return false;
+
+ if (Options.NoPrune || &F.getEntryBlock() == BB)
return true;
- return !(isFullDominator(BB, DT) || isFullPostDominator(BB, PDT));
+ // Do not instrument full dominators, or full post-dominators with multiple
+ // predecessors.
+ return !isFullDominator(BB, DT)
+ && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
}
bool SanitizerCoverageModule::runOnFunction(Function &F) {
@@ -474,7 +401,7 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
&getAnalysis<PostDominatorTreeWrapperPass>(F).getPostDomTree();
for (auto &BB : F) {
- if (shouldInstrumentBlock(F, &BB, DT, PDT))
+ if (shouldInstrumentBlock(F, &BB, DT, PDT, Options))
BlocksToInstrument.push_back(&BB);
for (auto &Inst : BB) {
if (Options.IndirectCalls) {
@@ -507,17 +434,26 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) {
InjectTraceForGep(F, GepTraceTargets);
return true;
}
-void SanitizerCoverageModule::CreateFunctionGuardArray(size_t NumGuards,
- Function &F) {
- if (!Options.TracePCGuard) return;
- HasSancovGuardsSection = true;
- ArrayType *ArrayOfInt32Ty = ArrayType::get(Int32Ty, NumGuards);
- FunctionGuardArray = new GlobalVariable(
- *CurModule, ArrayOfInt32Ty, false, GlobalVariable::PrivateLinkage,
- Constant::getNullValue(ArrayOfInt32Ty), "__sancov_gen_");
+
+GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection(
+ size_t NumElements, Function &F, Type *Ty, const char *Section) {
+ ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
+ auto Array = new GlobalVariable(
+ *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
+ Constant::getNullValue(ArrayTy), "__sancov_gen_");
if (auto Comdat = F.getComdat())
- FunctionGuardArray->setComdat(Comdat);
- FunctionGuardArray->setSection(SanCovTracePCGuardSection);
+ Array->setComdat(Comdat);
+ Array->setSection(getSectionName(Section));
+ return Array;
+}
+void SanitizerCoverageModule::CreateFunctionLocalArrays(size_t NumGuards,
+ Function &F) {
+ if (Options.TracePCGuard)
+ FunctionGuardArray = CreateFunctionLocalArrayInSection(
+ NumGuards, F, Int32Ty, SanCovGuardsSectionName);
+ if (Options.Inline8bitCounters)
+ Function8bitCounterArray = CreateFunctionLocalArrayInSection(
+ NumGuards, F, Int8Ty, SanCovCountersSectionName);
}
bool SanitizerCoverageModule::InjectCoverage(Function &F,
@@ -527,14 +463,13 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F,
case SanitizerCoverageOptions::SCK_None:
return false;
case SanitizerCoverageOptions::SCK_Function:
- CreateFunctionGuardArray(1, F);
- InjectCoverageAtBlock(F, F.getEntryBlock(), 0, false);
+ CreateFunctionLocalArrays(1, F);
+ InjectCoverageAtBlock(F, F.getEntryBlock(), 0);
return true;
default: {
- bool UseCalls = ClCoverageBlockThreshold < AllBlocks.size();
- CreateFunctionGuardArray(AllBlocks.size(), F);
+ CreateFunctionLocalArrays(AllBlocks.size(), F);
for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
- InjectCoverageAtBlock(F, *AllBlocks[i], i, UseCalls);
+ InjectCoverageAtBlock(F, *AllBlocks[i], i);
return true;
}
}
@@ -551,26 +486,14 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls(
Function &F, ArrayRef<Instruction *> IndirCalls) {
if (IndirCalls.empty())
return;
- const int CacheSize = 16;
- const int CacheAlignment = 64; // Align for better performance.
- Type *Ty = ArrayType::get(IntptrTy, CacheSize);
+ assert(Options.TracePC || Options.TracePCGuard || Options.Inline8bitCounters);
for (auto I : IndirCalls) {
IRBuilder<> IRB(I);
CallSite CS(I);
Value *Callee = CS.getCalledValue();
if (isa<InlineAsm>(Callee))
continue;
- GlobalVariable *CalleeCache = new GlobalVariable(
- *F.getParent(), Ty, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(Ty), "__sancov_gen_callee_cache");
- CalleeCache->setAlignment(CacheAlignment);
- if (Options.TracePC || Options.TracePCGuard)
- IRB.CreateCall(SanCovTracePCIndir,
- IRB.CreatePointerCast(Callee, IntptrTy));
- else
- IRB.CreateCall(SanCovIndirCallFunction,
- {IRB.CreatePointerCast(Callee, IntptrTy),
- IRB.CreatePointerCast(CalleeCache, IntptrTy)});
+ IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy));
}
}
@@ -670,13 +593,8 @@ void SanitizerCoverageModule::InjectTraceForCmp(
}
}
-void SanitizerCoverageModule::SetNoSanitizeMetadata(Instruction *I) {
- I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
- MDNode::get(*C, None));
-}
-
void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
- size_t Idx, bool UseCalls) {
+ size_t Idx) {
BasicBlock::iterator IP = BB.getFirstInsertionPt();
bool IsEntryBB = &BB == &F.getEntryBlock();
DebugLoc EntryLoc;
@@ -696,65 +614,51 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
if (Options.TracePC) {
IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC.
IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
- } else if (Options.TracePCGuard) {
+ }
+ if (Options.TracePCGuard) {
auto GuardPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
ConstantInt::get(IntptrTy, Idx * 4)),
Int32PtrTy);
- if (!UseCalls) {
- auto GuardLoad = IRB.CreateLoad(GuardPtr);
- GuardLoad->setAtomic(AtomicOrdering::Monotonic);
- GuardLoad->setAlignment(8);
- SetNoSanitizeMetadata(GuardLoad); // Don't instrument with e.g. asan.
- auto Cmp = IRB.CreateICmpNE(
- GuardLoad, Constant::getNullValue(GuardLoad->getType()));
- auto Ins = SplitBlockAndInsertIfThen(
- Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
- IRB.SetInsertPoint(Ins);
- IRB.SetCurrentDebugLocation(EntryLoc);
- }
IRB.CreateCall(SanCovTracePCGuard, GuardPtr);
IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
- } else {
- Value *GuardP = IRB.CreateAdd(
- IRB.CreatePointerCast(GuardArray, IntptrTy),
- ConstantInt::get(IntptrTy, (1 + NumberOfInstrumentedBlocks()) * 4));
- GuardP = IRB.CreateIntToPtr(GuardP, Int32PtrTy);
- if (Options.TraceBB) {
- IRB.CreateCall(IsEntryBB ? SanCovTraceEnter : SanCovTraceBB, GuardP);
- } else if (UseCalls) {
- IRB.CreateCall(SanCovWithCheckFunction, GuardP);
- } else {
- LoadInst *Load = IRB.CreateLoad(GuardP);
- Load->setAtomic(AtomicOrdering::Monotonic);
- Load->setAlignment(4);
- SetNoSanitizeMetadata(Load);
- Value *Cmp =
- IRB.CreateICmpSGE(Constant::getNullValue(Load->getType()), Load);
- Instruction *Ins = SplitBlockAndInsertIfThen(
- Cmp, &*IP, false, MDBuilder(*C).createBranchWeights(1, 100000));
- IRB.SetInsertPoint(Ins);
- IRB.SetCurrentDebugLocation(EntryLoc);
- // __sanitizer_cov gets the PC of the instruction using GET_CALLER_PC.
- IRB.CreateCall(SanCovFunction, GuardP);
- IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
- }
}
-
- if (Options.Use8bitCounters) {
- IRB.SetInsertPoint(&*IP);
- Value *P = IRB.CreateAdd(
- IRB.CreatePointerCast(EightBitCounterArray, IntptrTy),
- ConstantInt::get(IntptrTy, NumberOfInstrumentedBlocks() - 1));
- P = IRB.CreateIntToPtr(P, IRB.getInt8PtrTy());
- LoadInst *LI = IRB.CreateLoad(P);
- Value *Inc = IRB.CreateAdd(LI, ConstantInt::get(IRB.getInt8Ty(), 1));
- StoreInst *SI = IRB.CreateStore(Inc, P);
- SetNoSanitizeMetadata(LI);
- SetNoSanitizeMetadata(SI);
+ if (Options.Inline8bitCounters) {
+ auto CounterPtr = IRB.CreateGEP(
+ Function8bitCounterArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(CounterPtr);
+ auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
+ auto Store = IRB.CreateStore(Inc, CounterPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
}
}
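In C terms, the Inline8bitCounters path amounts to the following per-block increment (a hypothetical sketch; the pass emits the equivalent IR directly, and the name of the array is illustrative). The plain, non-atomic update is deliberate: it is cheap, and occasional lost or wrapped increments are acceptable for coverage feedback.

  #include <cstddef>
  #include <cstdint>
  extern uint8_t __sancov_gen_counters[]; // one byte per instrumented block
  static inline void count_block(std::size_t Idx) {
    __sancov_gen_counters[Idx]++; // racy and wrapping by design, no atomics
  }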
+std::string
+SanitizerCoverageModule::getSectionName(const std::string &Section) const {
+ if (TargetTriple.getObjectFormat() == Triple::COFF)
+ return ".SCOV$M";
+ if (TargetTriple.isOSBinFormatMachO())
+ return "__DATA,__" + Section;
+ return "__" + Section;
+}
+
+std::string
+SanitizerCoverageModule::getSectionStart(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$start$__DATA$__" + Section;
+ return "__start___" + Section;
+}
+
+std::string
+SanitizerCoverageModule::getSectionEnd(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$end$__DATA$__" + Section;
+ return "__stop___" + Section;
+}
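For Section == "sancov_guards", these helpers yield: on ELF, a section named "__sancov_guards" with the linker-synthesized bounds "__start___sancov_guards" and "__stop___sancov_guards"; on Mach-O, the section "__DATA,__sancov_guards" with the "\1section$start$__DATA$__sancov_guards" and matching "$end$" bound symbols; on COFF, the fixed ".SCOV$M" section name, while the bound symbols fall back to the ELF-style names.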
+
char SanitizerCoverageModule::ID = 0;
INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov",
"SanitizerCoverage: TODO."
diff --git a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 52035c7..ec69044 100644
--- a/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/contrib/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -19,7 +19,6 @@
// The rest is handled by the run-time library.
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -42,6 +41,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/EscapeEnumerator.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -155,17 +155,18 @@ FunctionPass *llvm::createThreadSanitizerPass() {
void ThreadSanitizer::initializeCallbacks(Module &M) {
IRBuilder<> IRB(M.getContext());
- AttributeSet Attr;
- Attr = Attr.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::NoUnwind);
+ AttributeList Attr;
+ Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
// Initialize the callbacks.
TsanFuncEntry = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
TsanFuncExit = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy(), nullptr));
+ M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy()));
TsanIgnoreBegin = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy(), nullptr));
+ "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy()));
TsanIgnoreEnd = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_ignore_thread_end", Attr, IRB.getVoidTy(), nullptr));
+ "__tsan_ignore_thread_end", Attr, IRB.getVoidTy()));
OrdTy = IRB.getInt32Ty();
for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
const unsigned ByteSize = 1U << i;
@@ -174,31 +175,31 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
std::string BitSizeStr = utostr(BitSize);
SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
TsanUnalignedRead[i] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
TsanUnalignedWrite[i] =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
TsanAtomicLoad[i] = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy, nullptr));
+ M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy));
SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr));
+ AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy));
for (int op = AtomicRMWInst::FIRST_BINOP;
op <= AtomicRMWInst::LAST_BINOP; ++op) {
@@ -222,33 +223,33 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
continue;
SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
TsanAtomicRMW[op][i] = checkSanitizerInterfaceFunction(
- M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy, nullptr));
+ M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy));
}
SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
"_compare_exchange_val");
TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr));
+ AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy));
}
TsanVptrUpdate = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), nullptr));
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy()));
TsanVptrLoad = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+ "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
TsanAtomicThreadFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
+ "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy));
TsanAtomicSignalFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
+ "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy));
MemmoveFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemcpyFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy, nullptr));
+ IRB.getInt8PtrTy(), IntptrTy));
MemsetFn = checkSanitizerInterfaceFunction(
M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy, nullptr));
+ IRB.getInt32Ty(), IntptrTy));
}
bool ThreadSanitizer::doInitialization(Module &M) {
@@ -271,7 +272,7 @@ static bool isVtableAccess(Instruction *I) {
// Do not instrument known races/"benign races" that come from compiler
// instrumentation. The user has no way of suppressing them.
-static bool shouldInstrumentReadWriteFromAddress(Value *Addr) {
+static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
// Peel off GEPs and BitCasts.
Addr = Addr->stripInBoundsOffsets();
@@ -279,8 +280,9 @@ static bool shouldInstrumentReadWriteFromAddress(Value *Addr) {
if (GV->hasSection()) {
StringRef SectionName = GV->getSection();
// Check if the global is in the PGO counters section.
- if (SectionName.endswith(getInstrProfCountersSectionName(
- /*AddSegment=*/false)))
+ auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+ if (SectionName.endswith(
+ getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
return false;
}
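
For illustration, a minimal standalone sketch of the object-format-aware check introduced above (the helper name is hypothetical; the getInstrProfSectionName call matches the LLVM 5.0-era API used in this patch):

    #include "llvm/ADT/Triple.h"
    #include "llvm/IR/GlobalVariable.h"
    #include "llvm/IR/Module.h"
    #include "llvm/ProfileData/InstrProf.h"

    using namespace llvm;

    // Hypothetical helper: true if GV lives in the PGO counters section.
    static bool isInPGOCountersSection(const GlobalVariable &GV) {
      if (!GV.hasSection())
        return false;
      // The counters section name depends on the object format (ELF vs.
      // Mach-O vs. COFF), so it must be derived from the module's triple.
      auto OF = Triple(GV.getParent()->getTargetTriple()).getObjectFormat();
      return GV.getSection().endswith(
          getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false));
    }
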
@@ -342,13 +344,13 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
for (Instruction *I : reverse(Local)) {
if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
Value *Addr = Store->getPointerOperand();
- if (!shouldInstrumentReadWriteFromAddress(Addr))
+ if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
continue;
WriteTargets.insert(Addr);
} else {
LoadInst *Load = cast<LoadInst>(I);
Value *Addr = Load->getPointerOperand();
- if (!shouldInstrumentReadWriteFromAddress(Addr))
+ if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
continue;
if (WriteTargets.count(Addr)) {
// We will write to this temp, so no reason to analyze the read.
@@ -377,10 +379,11 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
}
static bool isAtomic(Instruction *I) {
+ // TODO: Ask TTI whether synchronization scope is between threads.
if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isAtomic() && LI->getSynchScope() == CrossThread;
+ return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isAtomic() && SI->getSynchScope() == CrossThread;
+ return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
if (isa<AtomicRMWInst>(I))
return true;
if (isa<AtomicCmpXchgInst>(I))
@@ -674,7 +677,7 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
I->eraseFromParent();
} else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
- Function *F = FI->getSynchScope() == SingleThread ?
+ Function *F = FI->getSyncScopeID() == SyncScope::SingleThread ?
TsanAtomicSignalFence : TsanAtomicThreadFence;
CallInst *C = CallInst::Create(F, Args);
ReplaceInstWithInst(I, C);
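
The getSynchScope()/CrossThread pair is replaced by getSyncScopeID()/SyncScope::SingleThread throughout. A minimal sketch of the resulting predicate, assuming LLVM 5.0-era headers (the function name is ours, not the pass's):

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"

    using namespace llvm;

    static bool isCrossThreadAtomic(const Instruction *I) {
      // Anything not explicitly single-threaded may be observed by other
      // threads and therefore needs TSan instrumentation.
      if (auto *LI = dyn_cast<LoadInst>(I))
        return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
      if (auto *SI = dyn_cast<StoreInst>(I))
        return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
      return isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I);
    }

Fences keep their own path: the hunk above routes them to __tsan_atomic_signal_fence when the scope is SingleThread and to __tsan_atomic_thread_fence otherwise.
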
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index c748272..cb3b575 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -127,9 +127,8 @@ private:
LLVMContext &C = TheModule->getContext();
Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttributeSet Attr =
- AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ AttributeList Attr = AttributeList().addAttribute(
+ C, AttributeList::FunctionIndex, Attribute::NoUnwind);
FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
/*isVarArg=*/false);
return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
@@ -144,10 +143,10 @@ private:
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
Type *Params[] = { I8X };
FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttributeSet Attr = AttributeSet();
+ AttributeList Attr = AttributeList();
if (NoUnwind)
- Attr = Attr.addAttribute(C, AttributeSet::FunctionIndex,
+ Attr = Attr.addAttribute(C, AttributeList::FunctionIndex,
Attribute::NoUnwind);
return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
@@ -162,10 +161,9 @@ private:
Type *I8XX = PointerType::getUnqual(I8X);
Type *Params[] = { I8XX, I8X };
- AttributeSet Attr =
- AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
- Attr = Attr.addAttribute(C, 1, Attribute::NoCapture);
+ AttributeList Attr = AttributeList().addAttribute(
+ C, AttributeList::FunctionIndex, Attribute::NoUnwind);
+ Attr = Attr.addParamAttribute(C, 0, Attribute::NoCapture);
FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
/*isVarArg=*/false);
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
index ef075bd..9c5cf6f 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -8,8 +8,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
-#include <vector>
#include <algorithm>
+#include <vector>
namespace llvm {
/// \brief An associative container with fast insertion-order (deterministic)
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 9d78e5a..4648050 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -20,8 +20,8 @@
///
//===----------------------------------------------------------------------===//
-#include "ObjCARC.h"
#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
#include "ProvenanceAnalysis.h"
#include "llvm/IR/CFG.h"
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
index f02b75f..cd9b3d9 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARC.h
@@ -69,6 +69,19 @@ static inline void EraseInstruction(Instruction *CI) {
RecursivelyDeleteTriviallyDeadInstructions(OldArg);
}
+/// If Inst is a RetainRV and its operand is a call or invoke, return the
+/// operand. Otherwise return null.
+static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
+ ARCInstKind Class) {
+ if (Class != ARCInstKind::RetainRV)
+ return nullptr;
+
+ const auto *Opnd = Inst.getOperand(0)->stripPointerCasts();
+ if (const auto *C = dyn_cast<CallInst>(Opnd))
+ return C;
+ return dyn_cast<InvokeInst>(Opnd);
+}
+
} // end namespace objcarc
} // end namespace llvm
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 23c1f59..e70e759 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -26,9 +26,9 @@
// TODO: ObjCARCContract could insert PHI nodes when uses aren't
// dominated by single calls.
-#include "ObjCARC.h"
#include "ARCRuntimeEntryPoints.h"
#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
#include "ProvenanceAnalysis.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Dominators.h"
@@ -394,6 +394,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong << "\n");
+ if (&*Iter == Retain) ++Iter;
if (&*Iter == Store) ++Iter;
Store->eraseFromParent();
Release->eraseFromParent();
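
The added line guards the caller's cursor against a second erased instruction. A generic illustration of the idiom (standard-library types, hypothetical function name):

    #include <list>

    // Advance the caller's cursor past an element that is about to be erased,
    // so the cursor is never invalidated by the erase itself.
    template <typename T>
    static void eraseKeepingCursor(std::list<T> &L,
                                   typename std::list<T>::iterator &Cursor,
                                   typename std::list<T>::iterator Doomed) {
      if (Cursor == Doomed)
        ++Cursor; // step over the element before it disappears
      L.erase(Doomed);
    }
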
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 136d54a..8f3a33f 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -24,10 +24,10 @@
///
//===----------------------------------------------------------------------===//
-#include "ObjCARC.h"
#include "ARCRuntimeEntryPoints.h"
#include "BlotMapVector.h"
#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
#include "ProvenanceAnalysis.h"
#include "PtrState.h"
#include "llvm/ADT/DenseMap.h"
@@ -85,41 +85,6 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
return nullptr;
}
-/// This is a wrapper around getUnderlyingObjCPtr along the lines of
-/// GetUnderlyingObjects except that it returns early when it sees the first
-/// alloca.
-static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V,
- const DataLayout &DL) {
- SmallPtrSet<const Value *, 4> Visited;
- SmallVector<const Value *, 4> Worklist;
- Worklist.push_back(V);
- do {
- const Value *P = Worklist.pop_back_val();
- P = GetUnderlyingObjCPtr(P, DL);
-
- if (isa<AllocaInst>(P))
- return true;
-
- if (!Visited.insert(P).second)
- continue;
-
- if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) {
- Worklist.push_back(SI->getTrueValue());
- Worklist.push_back(SI->getFalseValue());
- continue;
- }
-
- if (const PHINode *PN = dyn_cast<const PHINode>(P)) {
- for (Value *IncValue : PN->incoming_values())
- Worklist.push_back(IncValue);
- continue;
- }
- } while (!Worklist.empty());
-
- return false;
-}
-
-
/// @}
///
/// \defgroup ARCOpt ARC Optimization.
@@ -481,9 +446,6 @@ namespace {
/// MDKind identifiers.
ARCMDKindCache MDKindCache;
- // This is used to track if a pointer is stored into an alloca.
- DenseSet<const Value *> MultiOwnersSet;
-
/// A flag indicating whether this optimization pass should run.
bool Run;
@@ -524,8 +486,7 @@ namespace {
PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M,
- SmallVectorImpl<Instruction *> &NewRetains,
- SmallVectorImpl<Instruction *> &NewReleases,
+ Instruction * Retain,
SmallVectorImpl<Instruction *> &DeadInsts,
RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
Value *Arg, bool KnownSafe,
@@ -1155,29 +1116,6 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
case ARCInstKind::None:
// These are irrelevant.
return NestingDetected;
- case ARCInstKind::User:
- // If we have a store into an alloca of a pointer we are tracking, the
- // pointer has multiple owners implying that we must be more conservative.
- //
- // This comes up in the context of a pointer being ``KnownSafe''. In the
- // presence of a block being initialized, the frontend will emit the
- // objc_retain on the original pointer and the release on the pointer loaded
- // from the alloca. The optimizer will through the provenance analysis
- // realize that the two are related, but since we only require KnownSafe in
- // one direction, will match the inner retain on the original pointer with
- // the guard release on the original pointer. This is fixed by ensuring that
- // in the presence of allocas we only unconditionally remove pointers if
- // both our retain and our release are KnownSafe.
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand(), DL)) {
- auto I = MyStates.findPtrBottomUpState(
- GetRCIdentityRoot(SI->getValueOperand()));
- if (I != MyStates.bottom_up_ptr_end())
- MultiOwnersSet.insert(I->first);
- }
- }
- break;
default:
break;
}
@@ -1540,8 +1478,7 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M,
- SmallVectorImpl<Instruction *> &NewRetains,
- SmallVectorImpl<Instruction *> &NewReleases,
+ Instruction *Retain,
SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
bool &AnyPairsCompletelyEliminated) {
@@ -1549,7 +1486,6 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
// is already incremented, we can similarly ignore possible decrements unless
// we are dealing with a retainable object with multiple provenance sources.
bool KnownSafeTD = true, KnownSafeBU = true;
- bool MultipleOwners = false;
bool CFGHazardAfflicted = false;
// Connect the dots between the top-down-collected RetainsToMove and
@@ -1561,14 +1497,13 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
unsigned OldCount = 0;
unsigned NewCount = 0;
bool FirstRelease = true;
- for (;;) {
+ for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
+ SmallVector<Instruction *, 4> NewReleases;
for (Instruction *NewRetain : NewRetains) {
auto It = Retains.find(NewRetain);
assert(It != Retains.end());
const RRInfo &NewRetainRRI = It->second;
KnownSafeTD &= NewRetainRRI.KnownSafe;
- MultipleOwners =
- MultipleOwners || MultiOwnersSet.count(GetArgRCIdentityRoot(NewRetain));
for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
auto Jt = Releases.find(NewRetainRelease);
if (Jt == Releases.end())
@@ -1691,7 +1626,6 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
}
}
}
- NewReleases.clear();
if (NewRetains.empty()) break;
}
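
This refactor replaces outer, manually cleared worklists with loop-scoped ones: NewRetains is declared in the for-init statement and NewReleases inside the body, so each iteration starts clean. A minimal stand-in for the idiom (the pairing logic is elided):

    #include <vector>

    static void processWorklists(int Seed) {
      // Declared in the for-init statement: lives exactly as long as the loop.
      for (std::vector<int> Work{Seed};;) {
        std::vector<int> Next; // fresh each iteration; no manual clear()
        for (int V : Work)
          if (V > 1)
            Next.push_back(V / 2); // stand-in for the retain/release pairing
        Work.swap(Next);
        if (Work.empty())
          break;
      }
    }
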
@@ -1745,10 +1679,6 @@ bool ObjCARCOpt::PerformCodePlacement(
DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
bool AnyPairsCompletelyEliminated = false;
- RRInfo RetainsToMove;
- RRInfo ReleasesToMove;
- SmallVector<Instruction *, 4> NewRetains;
- SmallVector<Instruction *, 4> NewReleases;
SmallVector<Instruction *, 8> DeadInsts;
// Visit each retain.
@@ -1780,9 +1710,10 @@ bool ObjCARCOpt::PerformCodePlacement(
// Connect the dots between the top-down-collected RetainsToMove and
// bottom-up-collected ReleasesToMove to form sets of related calls.
- NewRetains.push_back(Retain);
+ RRInfo RetainsToMove, ReleasesToMove;
+
bool PerformMoveCalls = PairUpRetainsAndReleases(
- BBStates, Retains, Releases, M, NewRetains, NewReleases, DeadInsts,
+ BBStates, Retains, Releases, M, Retain, DeadInsts,
RetainsToMove, ReleasesToMove, Arg, KnownSafe,
AnyPairsCompletelyEliminated);
@@ -1792,12 +1723,6 @@ bool ObjCARCOpt::PerformCodePlacement(
MoveCalls(Arg, RetainsToMove, ReleasesToMove,
Retains, Releases, DeadInsts, M);
}
-
- // Clean up state for next retain.
- NewReleases.clear();
- NewRetains.clear();
- RetainsToMove.clear();
- ReleasesToMove.clear();
}
// Now that we're done moving everything, we can delete the newly dead
@@ -1987,9 +1912,6 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
Releases,
F.getParent());
- // Cleanup.
- MultiOwnersSet.clear();
-
return AnyPairsCompletelyEliminated && NestingDetected;
}
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
index 9ffdfb4..62fc52f 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -22,8 +22,8 @@
///
//===----------------------------------------------------------------------===//
-#include "ObjCARC.h"
#include "ProvenanceAnalysis.h"
+#include "ObjCARC.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
index c274e81..870a5f6 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
@@ -8,13 +8,13 @@
//===----------------------------------------------------------------------===//
#include "ProvenanceAnalysis.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
index a5afc8a..d13e941 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
+++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.cpp
@@ -244,6 +244,18 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
const Value *Ptr,
ProvenanceAnalysis &PA,
ARCInstKind Class) {
+ auto SetSeqAndInsertReverseInsertPt = [&](Sequence NewSeq){
+ assert(!HasReverseInsertPts());
+ SetSeq(NewSeq);
+ // If this is an invoke instruction, we're scanning it as part of
+ // one of its successor blocks, since we can't insert code after it
+ // in its own block, and we don't want to split critical edges.
+ if (isa<InvokeInst>(Inst))
+ InsertReverseInsertPt(&*BB->getFirstInsertionPt());
+ else
+ InsertReverseInsertPt(&*++Inst->getIterator());
+ };
+
// Check for possible direct uses.
switch (GetSeq()) {
case S_Release:
@@ -251,26 +263,18 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
if (CanUse(Inst, Ptr, PA, Class)) {
DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; " << *Ptr
<< "\n");
- assert(!HasReverseInsertPts());
- // If this is an invoke instruction, we're scanning it as part of
- // one of its successor blocks, since we can't insert code after it
- // in its own block, and we don't want to split critical edges.
- if (isa<InvokeInst>(Inst))
- InsertReverseInsertPt(&*BB->getFirstInsertionPt());
- else
- InsertReverseInsertPt(&*++Inst->getIterator());
- SetSeq(S_Use);
+ SetSeqAndInsertReverseInsertPt(S_Use);
} else if (Seq == S_Release && IsUser(Class)) {
DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq() << "; "
<< *Ptr << "\n");
// Non-movable releases depend on any possible objc pointer use.
- SetSeq(S_Stop);
- assert(!HasReverseInsertPts());
- // As above; handle invoke specially.
- if (isa<InvokeInst>(Inst))
- InsertReverseInsertPt(&*BB->getFirstInsertionPt());
- else
- InsertReverseInsertPt(&*++Inst->getIterator());
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ } else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
+ if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
+ DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ }
}
break;
case S_Stop:
@@ -351,8 +355,10 @@ bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
const Value *Ptr,
ProvenanceAnalysis &PA,
ARCInstKind Class) {
- // Check for possible releases.
- if (!CanAlterRefCount(Inst, Ptr, PA, Class))
+ // Check for possible releases. Treat clang.arc.use as a releasing instruction
+ // to prevent sinking a retain past it.
+ if (!CanAlterRefCount(Inst, Ptr, PA, Class) &&
+ Class != ARCInstKind::IntrinsicUser)
return false;
DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; " << *Ptr
diff --git a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
index 9749e44..87298fa 100644
--- a/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
+++ b/contrib/llvm/lib/Transforms/ObjCARC/PtrState.h
@@ -21,8 +21,8 @@
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace objcarc {
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
index adc903c..5b467dc 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed");
STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
-// This is a tempoary option until we change the interface
-// to this pass based on optimization level.
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
cl::init(true), cl::Hidden);
@@ -110,7 +110,7 @@ class AggressiveDeadCodeElimination {
/// The set of blocks which we have determined whose control
/// dependence sources must be live and which have not had
- /// those dependences analyized.
+ /// those dependences analyzed.
SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
/// Set up auxiliary data structures for Instructions and BasicBlocks and
@@ -145,7 +145,7 @@ class AggressiveDeadCodeElimination {
/// was removed.
bool removeDeadInstructions();
- /// Identify connected sections of the control flow grap which have
+ /// Identify connected sections of the control flow graph which have
/// dead terminators and rewrite the control flow graph to remove them.
void updateDeadRegions();
@@ -234,7 +234,7 @@ void AggressiveDeadCodeElimination::initialize() {
return Iter != end() && Iter->second;
}
} State;
-
+
State.reserve(F.size());
// Iterate over blocks in depth-first pre-order and
// treat all edges to a block already seen as loop back edges
@@ -262,25 +262,6 @@ void AggressiveDeadCodeElimination::initialize() {
continue;
auto *BB = BBInfo.BB;
if (!PDT.getNode(BB)) {
- markLive(BBInfo.Terminator);
- continue;
- }
- for (auto *Succ : successors(BB))
- if (!PDT.getNode(Succ)) {
- markLive(BBInfo.Terminator);
- break;
- }
- }
-
- // Mark blocks live if there is no path from the block to the
- // return of the function or a successor for which this is true.
- // This protects IDFCalculator which cannot handle such blocks.
- for (auto &BBInfoPair : BlockInfo) {
- auto &BBInfo = BBInfoPair.second;
- if (BBInfo.terminatorIsLive())
- continue;
- auto *BB = BBInfo.BB;
- if (!PDT.getNode(BB)) {
DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName()
<< '\n';);
markLive(BBInfo.Terminator);
@@ -579,7 +560,7 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
PreferredSucc = Info;
}
assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
- "Failed to find safe successor for dead branc");
+ "Failed to find safe successor for dead branch");
bool First = true;
for (auto *Succ : successors(BB)) {
if (!First || Succ != PreferredSucc->BB)
@@ -594,13 +575,13 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// reverse top-sort order
void AggressiveDeadCodeElimination::computeReversePostOrder() {
-
- // This provides a post-order numbering of the reverse conrtol flow graph
+
+ // This provides a post-order numbering of the reverse control flow graph
// Note that it is incomplete in the presence of infinite loops but we don't
// need to number blocks which don't reach the end of the function since
// all branches in those blocks are forced live.
-
- // For each block without successors, extend the DFS from the bloack
+
+ // For each block without successors, extend the DFS from the block
// backward through the graph
SmallPtrSet<BasicBlock*, 16> Visited;
unsigned PostOrder = 0;
@@ -644,8 +625,8 @@ PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination())
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index c1df317..99480f1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -19,12 +19,11 @@
#define AA_NAME "alignment-from-assumptions"
#define DEBUG_TYPE AA_NAME
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -35,6 +34,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
STATISTIC(NumLoadAlignChanged,
@@ -438,19 +438,13 @@ AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- bool Changed = runImpl(F, AC, &SE, &DT);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, &SE, &DT))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<ScalarEvolutionAnalysis>();
PA.preserve<GlobalsAA>();
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
index 251b387..2e56186 100644
--- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
@@ -35,6 +36,46 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed (unused)");
STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+/// If an instruction is trivialized (dead), then the chain of users of that
+/// instruction may need to be cleared of assumptions that can no longer be
+/// guaranteed correct.
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
+ assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?");
+
+ // Initialize the worklist with eligible direct users.
+ SmallVector<Instruction *, 16> WorkList;
+ for (User *JU : I->users()) {
+ // If all bits of a user are demanded, then we know that nothing below that
+ // in the def-use chain needs to be changed.
+ auto *J = dyn_cast<Instruction>(JU);
+ if (J && !DB.getDemandedBits(J).isAllOnesValue())
+ WorkList.push_back(J);
+ }
+
+ // DFS through subsequent users while tracking visits to avoid cycles.
+ SmallPtrSet<Instruction *, 16> Visited;
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ // NSW, NUW, and exact are based on operands that might have changed.
+ J->dropPoisonGeneratingFlags();
+
+ // We do not have to worry about llvm.assume or range metadata:
+ // 1. llvm.assume demands its operand, so trivializing can't change it.
+ // 2. range metadata only applies to memory accesses which demand all bits.
+
+ Visited.insert(J);
+
+ for (User *KU : J->users()) {
+ // If all bits of a user are demanded, then we know that nothing below
+ // that in the def-use chain needs to be changed.
+ auto *K = dyn_cast<Instruction>(KU);
+ if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue())
+ WorkList.push_back(K);
+ }
+ }
+}
+
static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
SmallVector<Instruction*, 128> Worklist;
bool Changed = false;
@@ -51,6 +92,9 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
// replacing all uses with something else. Then, if they don't need to
// remain live (because they have side effects, etc.) we can remove them.
DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+
+ clearAssumptionsOfUsers(&I, DB);
+
// FIXME: In theory we could substitute undef here instead of zero.
// This should be reconsidered once we settle on the semantics of
// undef, poison, etc.
@@ -80,8 +124,8 @@ PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!bitTrackingDCE(F, DB))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
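
Several passes in this commit switch from the "FIXME: preserve the CFG" placeholder to the new CFGAnalyses preserved-set. A sketch of the idiom under the new pass manager (MyDCEPass is hypothetical):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/PassManager.h"

    using namespace llvm;

    struct MyDCEPass : PassInfoMixin<MyDCEPass> {
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
        bool Changed = false; // stand-in for the actual transformation
        if (!Changed)
          return PreservedAnalyses::all();
        PreservedAnalyses PA;
        PA.preserveSet<CFGAnalyses>(); // instructions changed, blocks did not
        return PA;
      }
    };
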
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 3826251..122c931 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -38,11 +38,13 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <tuple>
using namespace llvm;
@@ -53,6 +55,12 @@ using namespace consthoist;
STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
STATISTIC(NumConstantsRebased, "Number of constants rebased");
+static cl::opt<bool> ConstHoistWithBlockFrequency(
+ "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to reduce the "
+ "chance to execute const materialization more frequently than "
+ "without hoisting."));
+
namespace {
/// \brief The constant hoisting pass.
class ConstantHoistingLegacyPass : public FunctionPass {
@@ -68,6 +76,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ if (ConstHoistWithBlockFrequency)
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
@@ -82,6 +92,7 @@ private:
char ConstantHoistingLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
"Constant Hoisting", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
@@ -99,9 +110,13 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
- bool MadeChange = Impl.runImpl(
- Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(), Fn.getEntryBlock());
+ bool MadeChange =
+ Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ ConstHoistWithBlockFrequency
+ ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
+ : nullptr,
+ Fn.getEntryBlock());
if (MadeChange) {
DEBUG(dbgs() << "********** Function after Constant Hoisting: "
@@ -136,37 +151,163 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
if (Idx != ~0U && isa<PHINode>(Inst))
return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
- BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock();
- return IDom->getTerminator();
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
+}
+
+/// \brief Given \p BBs as input, find another set of BBs which collectively
+/// dominates \p BBs and has the minimal sum of frequencies. Return the BB
+/// set found in \p BBs.
+static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
+ BasicBlock *Entry,
+ SmallPtrSet<BasicBlock *, 8> &BBs) {
+ assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
+ // Nodes on the current path to the root.
+ SmallPtrSet<BasicBlock *, 8> Path;
+ // Candidates includes any block 'BB' in set 'BBs' that is not strictly
+ // dominated by any other blocks in set 'BBs', and all nodes in the path
+ // in the dominator tree from Entry to 'BB'.
+ SmallPtrSet<BasicBlock *, 16> Candidates;
+ for (auto BB : BBs) {
+ Path.clear();
+ // Walk up the dominator tree until Entry or another BB in BBs
+ // is reached. Insert the nodes visited along the way into Path.
+ BasicBlock *Node = BB;
+ // The "Path" is a candidate path to be added into Candidates set.
+ bool isCandidate = false;
+ do {
+ Path.insert(Node);
+ if (Node == Entry || Candidates.count(Node)) {
+ isCandidate = true;
+ break;
+ }
+ assert(DT.getNode(Node)->getIDom() &&
+ "Entry doens't dominate current Node");
+ Node = DT.getNode(Node)->getIDom()->getBlock();
+ } while (!BBs.count(Node));
+
+ // If isCandidate is false, Node is another Block in BBs dominating
+ // current 'BB'. Drop the nodes on the Path.
+ if (!isCandidate)
+ continue;
+
+ // Add nodes on the Path into Candidates.
+ Candidates.insert(Path.begin(), Path.end());
+ }
+
+ // Sort the nodes in Candidates in top-down order and save the nodes
+ // in Orders.
+ unsigned Idx = 0;
+ SmallVector<BasicBlock *, 16> Orders;
+ Orders.push_back(Entry);
+ while (Idx != Orders.size()) {
+ BasicBlock *Node = Orders[Idx++];
+ for (auto ChildDomNode : DT.getNode(Node)->getChildren()) {
+ if (Candidates.count(ChildDomNode->getBlock()))
+ Orders.push_back(ChildDomNode->getBlock());
+ }
+ }
+
+ // Visit Orders in bottom-up order.
+ typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>
+ InsertPtsCostPair;
+ // InsertPtsMap is a map from a BB to the best insertion points for the
+ // subtree of BB (subtree not including the BB itself).
+ DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
+ InsertPtsMap.reserve(Orders.size() + 1);
+ for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
+ BasicBlock *Node = *RIt;
+ bool NodeInBBs = BBs.count(Node);
+ SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first;
+ BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
+
+ // Return the optimal insert points in BBs.
+ if (Node == Entry) {
+ BBs.clear();
+ if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
+ BBs.insert(Entry);
+ else
+ BBs.insert(InsertPts.begin(), InsertPts.end());
+ break;
+ }
+
+ BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
+ // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
+ // will update its parent's ParentInsertPts and ParentPtsFreq.
+ SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first;
+ BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
+ // Choose to insert in Node or in subtree of Node.
+ // Don't hoist to EHPad because we may not find a proper place to insert
+ // in EHPad.
+ // If the total frequency of InsertPts is the same as the frequency of the
+ // target Node, and InsertPts contains more than one node, choose hoisting
+ // to reduce code size.
+ if (NodeInBBs ||
+ (!Node->isEHPad() &&
+ (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
+ ParentInsertPts.insert(Node);
+ ParentPtsFreq += BFI.getBlockFreq(Node);
+ } else {
+ ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
+ ParentPtsFreq += InsertPtsFreq;
+ }
+ }
}
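
The core of the bottom-up walk is a cost comparison: a subtree's insertion points are collapsed into their dominator only when that does not raise the expected execution frequency, with ties broken toward fewer points. A reduced sketch over hypothetical types:

    #include <cstdint>
    #include <vector>

    struct DomNode {
      uint64_t Freq;                    // block frequency of this block
      std::vector<DomNode *> InsertPts; // best points found in the subtree
      uint64_t InsertPtsFreq = 0;       // sum of their frequencies
    };

    // Collapse the subtree's insertion points into N itself when hoisting is
    // no more expensive; frequency ties with multiple points favor hoisting,
    // since one materialization is smaller code.
    static void hoistIntoNodeIfCheaper(DomNode &N) {
      if (N.InsertPtsFreq > N.Freq ||
          (N.InsertPtsFreq == N.Freq && N.InsertPts.size() > 1)) {
        N.InsertPts.assign(1, &N);
        N.InsertPtsFreq = N.Freq;
      }
    }
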
/// \brief Find an insertion point that dominates all uses.
-Instruction *ConstantHoistingPass::findConstantInsertionPoint(
+SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint(
const ConstantInfo &ConstInfo) const {
assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
// Collect all basic blocks.
SmallPtrSet<BasicBlock *, 8> BBs;
+ SmallPtrSet<Instruction *, 8> InsertPts;
for (auto const &RCI : ConstInfo.RebasedConstants)
for (auto const &U : RCI.Uses)
BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
- if (BBs.count(Entry))
- return &Entry->front();
+ if (BBs.count(Entry)) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+
+ if (BFI) {
+ findBestInsertionSet(*DT, *BFI, Entry, BBs);
+ for (auto BB : BBs) {
+ BasicBlock::iterator InsertPt = BB->begin();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ ;
+ InsertPts.insert(&*InsertPt);
+ }
+ return InsertPts;
+ }
while (BBs.size() >= 2) {
BasicBlock *BB, *BB1, *BB2;
BB1 = *BBs.begin();
BB2 = *std::next(BBs.begin());
BB = DT->findNearestCommonDominator(BB1, BB2);
- if (BB == Entry)
- return &Entry->front();
+ if (BB == Entry) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
BBs.erase(BB1);
BBs.erase(BB2);
BBs.insert(BB);
}
assert((BBs.size() == 1) && "Expected only one element.");
Instruction &FirstInst = (*BBs.begin())->front();
- return findMatInsertPt(&FirstInst);
+ InsertPts.insert(findMatInsertPt(&FirstInst));
+ return InsertPts;
}
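
When no BFI is available, the pass falls back to the old behavior, now expressed with a set: repeatedly fold two blocks into their nearest common dominator until one remains. A sketch of that reduction (the helper name is ours):

    #include <iterator>

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/Dominators.h"

    using namespace llvm;

    static BasicBlock *
    reduceToCommonDominator(DominatorTree &DT,
                            SmallPtrSetImpl<BasicBlock *> &BBs) {
      while (BBs.size() >= 2) {
        BasicBlock *B1 = *BBs.begin();
        BasicBlock *B2 = *std::next(BBs.begin());
        BasicBlock *Dom = DT.findNearestCommonDominator(B1, B2);
        BBs.erase(B1);
        BBs.erase(B2);
        BBs.insert(Dom); // Dom may equal B1 or B2; the set deduplicates
      }
      return *BBs.begin();
    }
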
@@ -210,68 +351,65 @@ void ConstantHoistingPass::collectConstantCandidates(
}
}
-/// \brief Scan the instruction for expensive integer constants and record them
-/// in the constant candidate vector.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst) {
- // Skip all cast instructions. They are visited indirectly later on.
- if (Inst->isCast())
- return;
-
- // Can't handle inline asm. Skip it.
- if (auto Call = dyn_cast<CallInst>(Inst))
- if (isa<InlineAsm>(Call->getCalledValue()))
- return;
- // Switch cases must remain constant, and if the value being tested is
- // constant the entire thing should disappear.
- if (isa<SwitchInst>(Inst))
- return;
+/// \brief Check the operand for instruction Inst at index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
- // Static allocas (constant size in the entry block) are handled by
- // prologue/epilogue insertion so they're free anyway. We definitely don't
- // want to make them non-constant.
- auto AI = dyn_cast<AllocaInst>(Inst);
- if (AI && AI->isStaticAlloca())
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
return;
+ }
- // Scan all operands.
- for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
- Value *Opnd = Inst->getOperand(Idx);
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ // Only visit cast instructions, which have been skipped. All other
+ // instructions should have already been visited.
+ if (!CastInst->isCast())
+ return;
- // Visit constant integers.
- if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
+ return;
}
+ }
- // Visit cast instructions that have constant integers.
- if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
- // Only visit cast instructions, which have been skipped. All other
- // instructions should have already been visited.
- if (!CastInst->isCast())
- continue;
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ return;
- if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the cast instruction.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
- }
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
}
+ }
+}
- // Visit constant expressions that have constant integers.
- if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- // Only visit constant cast expressions.
- if (!ConstExpr->isCast())
- continue;
- if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the constant expression.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- continue;
- }
+/// \brief Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst) {
+ // Skip all cast instructions. They are visited indirectly later on.
+ if (Inst->isCast())
+ return;
+
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ // The cost of materializing the constants (defined in
+ // `TargetTransformInfo::getIntImmCost`) for instructions which only take
+ // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So
+ // it's safe for us to collect constant candidates from all IntrinsicInsts.
+ if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
}
} // end of for all operands
}
@@ -289,8 +427,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
// bit widths (APInt Operator- does not like that). If the value cannot be
// represented in uint64 we return an "empty" APInt. This is then interpreted
// as the value is not in range.
-static llvm::Optional<APInt> calculateOffsetDiff(APInt V1, APInt V2)
-{
+static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1,
+ const APInt &V2) {
llvm::Optional<APInt> Res = None;
unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
V1.getBitWidth() : V2.getBitWidth();
@@ -549,29 +687,54 @@ bool ConstantHoistingPass::emitBaseConstants() {
bool MadeChange = false;
for (auto const &ConstInfo : ConstantVec) {
// Hoist and hide the base constant behind a bitcast.
- Instruction *IP = findConstantInsertionPoint(ConstInfo);
- IntegerType *Ty = ConstInfo.BaseConstant->getType();
- Instruction *Base =
- new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
- DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB "
- << IP->getParent()->getName() << '\n' << *Base << '\n');
- NumConstantsHoisted++;
+ SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo);
+ assert(!IPSet.empty() && "IPSet is empty");
+
+ unsigned UsesNum = 0;
+ unsigned ReBasesNum = 0;
+ for (Instruction *IP : IPSet) {
+ IntegerType *Ty = ConstInfo.BaseConstant->getType();
+ Instruction *Base =
+ new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP);
+ DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
+
+ // Emit materialization code for all rebased constants.
+ unsigned Uses = 0;
+ for (auto const &RCI : ConstInfo.RebasedConstants) {
+ for (auto const &U : RCI.Uses) {
+ Uses++;
+ BasicBlock *OrigMatInsertBB =
+ findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
+ // If Base constant is to be inserted in multiple places,
+ // generate rebase for U using the Base dominating U.
+ if (IPSet.size() == 1 ||
+ DT->dominates(Base->getParent(), OrigMatInsertBB)) {
+ emitBaseConstants(Base, RCI.Offset, U);
+ ReBasesNum++;
+ }
+ }
+ }
+ UsesNum = Uses;
- // Emit materialization code for all rebased constants.
- for (auto const &RCI : ConstInfo.RebasedConstants) {
- NumConstantsRebased++;
- for (auto const &U : RCI.Uses)
- emitBaseConstants(Base, RCI.Offset, U);
+ // Use the same debug location as the last user of the constant.
+ assert(!Base->use_empty() && "The use list is empty!?");
+ assert(isa<Instruction>(Base->user_back()) &&
+ "All uses should be instructions.");
+ Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
}
+ (void)UsesNum;
+ (void)ReBasesNum;
+ // Expect all uses are rebased after rebase is done.
+ assert(UsesNum == ReBasesNum && "Not all uses are rebased");
+
+ NumConstantsHoisted++;
- // Use the same debug location as the last user of the constant.
- assert(!Base->use_empty() && "The use list is empty!?");
- assert(isa<Instruction>(Base->user_back()) &&
- "All uses should be instructions.");
- Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc());
+ // Base constant is also included in ConstInfo.RebasedConstants, so
+ // deduct 1 from ConstInfo.RebasedConstants.size().
+ NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1;
- // Correct for base constant, which we counted above too.
- NumConstantsRebased--;
MadeChange = true;
}
return MadeChange;
@@ -587,9 +750,11 @@ void ConstantHoistingPass::deleteDeadCastInst() const {
/// \brief Optimize expensive integer constants in the given function.
bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
- DominatorTree &DT, BasicBlock &Entry) {
+ DominatorTree &DT, BlockFrequencyInfo *BFI,
+ BasicBlock &Entry) {
this->TTI = &TTI;
this->DT = &DT;
+ this->BFI = BFI;
this->Entry = &Entry;
// Collect all constant candidates.
collectConstantCandidates(Fn);
@@ -620,9 +785,13 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- if (!runImpl(F, TTI, DT, F.getEntryBlock()))
+ auto BFI = ConstHoistWithBlockFrequency
+ ? &AM.getResult<BlockFrequencyAnalysis>(F)
+ : nullptr;
+ if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock()))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
index 9e98219..4fa2789 100644
--- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp
@@ -18,15 +18,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <set>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 84f9373..2815778 100644
--- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyValueInfo.h"
@@ -26,6 +26,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -95,7 +96,8 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
return true;
}
-static bool processPHI(PHINode *P, LazyValueInfo *LVI) {
+static bool processPHI(PHINode *P, LazyValueInfo *LVI,
+ const SimplifyQuery &SQ) {
bool Changed = false;
BasicBlock *BB = P->getParent();
@@ -149,9 +151,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI) {
Changed = true;
}
- // FIXME: Provide TLI, DT, AT to SimplifyInstruction.
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (Value *V = SimplifyInstruction(P, DL)) {
+ if (Value *V = SimplifyInstruction(P, SQ)) {
P->replaceAllUsesWith(V);
P->eraseFromParent();
Changed = true;
@@ -232,12 +232,10 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
if (PB == PE) return false;
- // Analyse each switch case in turn. This is done in reverse order so that
- // removing a case doesn't cause trouble for the iteration.
+ // Analyse each switch case in turn.
bool Changed = false;
- for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
- ) {
- ConstantInt *Case = CI.getCaseValue();
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
// Check to see if the switch condition is equal to/not equal to the case
// value on every incoming edge, equal/not equal being the same each time.
@@ -270,8 +268,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
if (State == LazyValueInfo::False) {
// This case never fires - remove it.
- CI.getCaseSuccessor()->removePredecessor(BB);
- SI->removeCase(CI); // Does not invalidate the iterator.
+ CI->getCaseSuccessor()->removePredecessor(BB);
+ CI = SI->removeCase(CI);
+ CE = SI->case_end();
// The condition can be modified by removePredecessor's PHI simplification
// logic.
@@ -279,7 +278,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
++NumDeadCases;
Changed = true;
- } else if (State == LazyValueInfo::True) {
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
// This case always fires. Arrange for the switch to be turned into an
// unconditional branch by replacing the switch condition with the case
// value.
@@ -288,6 +289,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
Changed = true;
break;
}
+
+ // Increment the case iterator since we didn't delete it.
+ ++CI;
}
if (Changed)
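
The rewritten loop relies on removeCase returning the iterator that follows the removed case, the usual erase-returns-next pattern. A self-contained illustration with a standard container (the "never fires" test is a stand-in):

    #include <vector>

    static void pruneDeadCases(std::vector<int> &Cases) {
      for (auto It = Cases.begin(), End = Cases.end(); It != End;) {
        if (*It < 0) {            // stand-in for "this case never fires"
          It = Cases.erase(It);   // erase returns the next valid iterator
          End = Cases.end();      // refresh the cached end iterator
        } else {
          ++It;                   // advance only when nothing was erased
        }
      }
    }
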
@@ -300,7 +304,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
/// Infer nonnull attributes for the arguments at the specified callsite.
static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
- SmallVector<unsigned, 4> Indices;
+ SmallVector<unsigned, 4> ArgNos;
unsigned ArgNo = 0;
for (Value *V : CS.args()) {
@@ -308,23 +312,24 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
// Try to mark pointer typed parameters as non-null. We skip the
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
- if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) &&
+ if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
!isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
CS.getInstruction()) == LazyValueInfo::False)
- Indices.push_back(ArgNo + 1);
+ ArgNos.push_back(ArgNo);
ArgNo++;
}
assert(ArgNo == CS.arg_size() && "sanity check");
- if (Indices.empty())
+ if (ArgNos.empty())
return false;
- AttributeSet AS = CS.getAttributes();
+ AttributeList AS = CS.getAttributes();
LLVMContext &Ctx = CS.getInstruction()->getContext();
- AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
CS.setAttributes(AS);
return true;
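
AttributeList addresses parameter attributes with 0-based argument numbers via addParamAttribute, replacing the old AttributeSet convention of hand-offsetting indices by one. A sketch matching the API used above (the helper name is ours):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/CallSite.h"

    using namespace llvm;

    static void markArgsNonNull(CallSite CS, ArrayRef<unsigned> ArgNos) {
      LLVMContext &Ctx = CS.getInstruction()->getContext();
      AttributeList AL = CS.getAttributes();
      // ArgNos are plain 0-based argument indices; no +1 offset needed.
      AL = AL.addParamAttribute(Ctx, ArgNos,
                                Attribute::get(Ctx, Attribute::NonNull));
      CS.setAttributes(AL);
    }
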
@@ -437,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
bool Changed = false;
if (!NUW) {
- ConstantRange NUWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoUnsignedWrap);
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoUnsignedWrap);
if (!NUWRange.isEmptySet()) {
bool NewNUW = NUWRange.contains(LazyRRange());
AddOp->setHasNoUnsignedWrap(NewNUW);
@@ -447,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
}
}
if (!NSW) {
- ConstantRange NSWRange =
- LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
- OBO::NoSignedWrap);
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ BinaryOperator::Add, LRange, OBO::NoSignedWrap);
if (!NSWRange.isEmptySet()) {
bool NewNSW = NSWRange.contains(LazyRRange());
AddOp->setHasNoSignedWrap(NewNSW);
@@ -483,9 +486,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
ConstantInt::getFalse(C->getContext());
}
-static bool runImpl(Function &F, LazyValueInfo *LVI) {
+static bool runImpl(Function &F, LazyValueInfo *LVI, const SimplifyQuery &SQ) {
bool FnChanged = false;
-
// Visiting in a pre-order depth-first traversal causes us to simplify early
// blocks before querying later blocks (which require us to analyze early
// blocks). Eagerly simplifying shallow blocks means there is strictly less
@@ -500,7 +502,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) {
BBChanged |= processSelect(cast<SelectInst>(II), LVI);
break;
case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI);
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ);
break;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -548,7 +550,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) {
BBChanged = true;
}
}
- };
+ }
FnChanged |= BBChanged;
}
@@ -561,18 +563,14 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) {
return false;
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- return runImpl(F, LVI);
+ return runImpl(F, LVI, getBestSimplifyQuery(*this, F));
}
PreservedAnalyses
CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
- bool Changed = runImpl(F, LVI);
-
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F));
if (!Changed)
return PreservedAnalyses::all();
diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
index cc2a3cf..fa4806e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -19,10 +19,10 @@
#include "llvm/Transforms/Scalar/DCE.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -124,9 +124,12 @@ static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
}
PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
+ if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 4d4c3ba..1ec38e5 100644
--- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -135,13 +135,13 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) {
if (auto CS = CallSite(I)) {
if (Function *F = CS.getCalledFunction()) {
StringRef FnName = F->getName();
- if (TLI.has(LibFunc::strcpy) && FnName == TLI.getName(LibFunc::strcpy))
+ if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
return true;
- if (TLI.has(LibFunc::strncpy) && FnName == TLI.getName(LibFunc::strncpy))
+ if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
return true;
- if (TLI.has(LibFunc::strcat) && FnName == TLI.getName(LibFunc::strcat))
+ if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
return true;
- if (TLI.has(LibFunc::strncat) && FnName == TLI.getName(LibFunc::strncat))
+ if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
return true;
}
}
@@ -287,19 +287,14 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
}
namespace {
-enum OverwriteResult {
- OverwriteBegin,
- OverwriteComplete,
- OverwriteEnd,
- OverwriteUnknown
-};
+enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown };
}
-/// Return 'OverwriteComplete' if a store to the 'Later' location completely
-/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
-/// the 'Earlier' location is completely overwritten by 'Later',
-/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten
-/// by 'Later', or 'OverwriteUnknown' if nothing can be determined.
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later', or
+/// 'OW_Unknown' if nothing can be determined.
static OverwriteResult isOverwrite(const MemoryLocation &Later,
const MemoryLocation &Earlier,
const DataLayout &DL,
@@ -310,7 +305,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we don't know the sizes of either access, then we can't do a comparison.
if (Later.Size == MemoryLocation::UnknownSize ||
Earlier.Size == MemoryLocation::UnknownSize)
- return OverwriteUnknown;
+ return OW_Unknown;
const Value *P1 = Earlier.Ptr->stripPointerCasts();
const Value *P2 = Later.Ptr->stripPointerCasts();
@@ -320,7 +315,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (P1 == P2) {
// Make sure that the Later size is >= the Earlier size.
if (Later.Size >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
}
// Check to see if the later store is to the entire object (either a global,
@@ -332,13 +327,13 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
if (UO1 != UO2)
- return OverwriteUnknown;
+ return OW_Unknown;
// If the "Later" store is to a recognizable object, get its size.
uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
if (ObjectSize != MemoryLocation::UnknownSize)
if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
- return OverwriteComplete;
+ return OW_Complete;
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
@@ -350,7 +345,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
// If the base pointers still differ, we have two completely different stores.
if (BP1 != BP2)
- return OverwriteUnknown;
+ return OW_Unknown;
// The later store completely overlaps the earlier store if:
//
@@ -370,7 +365,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (EarlierOff >= LaterOff &&
Later.Size >= Earlier.Size &&
uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
- return OverwriteComplete;
+ return OW_Complete;
// We may now overlap, although the overlap is not complete. There might also
// be other incomplete overlaps, and together, they might cover the complete
@@ -428,7 +423,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
") Composite Later [" <<
ILI->second << ", " << ILI->first << ")\n");
++NumCompletePartials;
- return OverwriteComplete;
+ return OW_Complete;
}
}
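Everything isOverwrite() decides above reduces to interval arithmetic on (offset, size) pairs once both pointers share a base: the later store kills the earlier one (OW_Complete) exactly when the earlier byte range is contained in the later one. A self-contained sketch of that containment test, using a hypothetical helper rather than the pass's own API:

#include <cstdint>

static bool laterCoversEarlier(int64_t EarlierOff, uint64_t EarlierSize,
                               int64_t LaterOff, uint64_t LaterSize) {
  // Containment of [EarlierOff, EarlierOff+EarlierSize) within
  // [LaterOff, LaterOff+LaterSize); the subtraction is non-negative
  // because EarlierOff >= LaterOff is checked first.
  return EarlierOff >= LaterOff &&
         uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize;
}

// Example: a 4-byte store at offset 4 is dead once a later 16-byte store
// at offset 0 of the same base arrives: 4 >= 0 and (4 - 0) + 4 <= 16.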
@@ -443,7 +438,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
if (!EnablePartialOverwriteTracking &&
(LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
- return OverwriteEnd;
+ return OW_End;
// Finally, we also need to check if the later store overwrites the beginning
// of the earlier store.
@@ -458,11 +453,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
(LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
assert(int64_t(LaterOff + Later.Size) <
int64_t(EarlierOff + Earlier.Size) &&
- "Expect to be handled as OverwriteComplete");
- return OverwriteBegin;
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
}
// Otherwise, they don't completely overlap.
- return OverwriteUnknown;
+ return OW_Unknown;
}
/// If 'Inst' might be a self read (i.e. a noop copy of a
@@ -551,7 +546,7 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI,
Instruction *I = &*BI;
if (I->mayWriteToMemory() && I != SecondI) {
auto Res = AA->getModRefInfo(I, MemLoc);
- if (Res != MRI_NoModRef)
+ if (Res & MRI_Mod)
return false;
}
}
@@ -909,7 +904,7 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
- "Should have been handled as OverwriteComplete");
+ "Should have been handled as OW_Complete");
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
LaterSize, false)) {
IntervalMap.erase(OII);
@@ -1105,7 +1100,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
OverwriteResult OR =
isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
DepWrite, IOL);
- if (OR == OverwriteComplete) {
+ if (OR == OW_Complete) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
@@ -1117,15 +1112,15 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// We erased DepWrite; start over.
InstDep = MD->getDependency(Inst);
continue;
- } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) ||
- ((OR == OverwriteBegin &&
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
isShortenableAtTheBeginning(DepWrite)))) {
assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
"shortening when partial-overwrite "
"tracking is enabled");
int64_t EarlierSize = DepLoc.Size;
int64_t LaterSize = Loc.Size;
- bool IsOverwriteEnd = (OR == OverwriteEnd);
+ bool IsOverwriteEnd = (OR == OW_End);
MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
InstWriteOffset, LaterSize, IsOverwriteEnd);
}
@@ -1186,8 +1181,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!eliminateDeadStores(F, AA, MD, DT, TLI))
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
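One behavioral fix above is easy to miss: memoryIsNotModifiedBetween() used to give up on any instruction whose mod/ref result was not MRI_NoModRef, i.e. on harmless reads too. ModRefInfo is a bitmask in this era of LLVM (MRI_NoModRef = 0, MRI_Ref = 1, MRI_Mod = 2, MRI_ModRef = 3), so testing the Mod bit alone suffices. A short sketch of the distinction, with a hypothetical helper name:

static bool mayWriteLoc(ModRefInfo Res) {
  return Res & MRI_Mod;   // only modification of MemLoc matters here
}

// mayWriteLoc(MRI_Ref) is false, so a read-only instruction between the two
// memory operations no longer blocks the transform, whereas the old test
// `Res != MRI_NoModRef` rejected reads and writes alike.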
diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 16e08ee..c5c9b2c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -15,10 +15,13 @@
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -32,7 +35,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
#include <deque>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -252,7 +254,9 @@ public:
const TargetTransformInfo &TTI;
DominatorTree &DT;
AssumptionCache &AC;
+ const SimplifyQuery SQ;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
typedef RecyclingAllocator<
BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -313,9 +317,12 @@ public:
unsigned CurrentGeneration;
/// \brief Set up the EarlyCSE runner for a particular function.
- EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
- DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
- : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {}
+ EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, DominatorTree &DT,
+ AssumptionCache &AC, MemorySSA *MSSA)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
+ }
bool run();
@@ -388,7 +395,7 @@ private:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
: IsTargetMemInst(false), Inst(Inst) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
- if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
+ if (TTI.getTgtMemIntrinsic(II, Info))
IsTargetMemInst = true;
}
bool isLoad() const {
@@ -400,17 +407,14 @@ private:
return isa<StoreInst>(Inst);
}
bool isAtomic() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.Ordering != AtomicOrdering::NotAtomic;
return Inst->isAtomic();
}
bool isUnordered() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return true;
- }
+ if (IsTargetMemInst)
+ return Info.isUnordered();
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isUnordered();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -421,10 +425,9 @@ private:
}
bool isVolatile() const {
- if (IsTargetMemInst) {
- assert(Info.IsSimple && "need to refine IsSimple in TTI");
- return false;
- }
+ if (IsTargetMemInst)
+ return Info.IsVolatile;
+
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
return LI->isVolatile();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -504,7 +507,7 @@ private:
if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) {
// Optimize MemoryPhi nodes that may become redundant by having all the
// same input values once MA is removed.
- SmallVector<MemoryPhi *, 4> PhisToCheck;
+ SmallSetVector<MemoryPhi *, 4> PhisToCheck;
SmallVector<MemoryAccess *, 8> WorkQueue;
WorkQueue.push_back(MA);
// Process MemoryPhi nodes in FIFO order using an ever-growing vector since
@@ -515,9 +518,9 @@ private:
for (auto *U : WI->users())
if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
- PhisToCheck.push_back(MP);
+ PhisToCheck.insert(MP);
- MSSA->removeMemoryAccess(WI);
+ MSSAUpdater->removeMemoryAccess(WI);
for (MemoryPhi *MP : PhisToCheck) {
MemoryAccess *FirstIn = MP->getIncomingValue(0);
@@ -559,13 +562,27 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
if (!MSSA)
return false;
+ // If MemorySSA has determined that one of EarlierInst or LaterInst does not
+ // read/write memory, then we can safely return true here.
+ // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
+ // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
+ // by also checking the MemorySSA MemoryAccess on the instruction. Initial
+ // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
+ // with the default optimization pipeline.
+ auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+ if (!EarlierMA)
+ return true;
+ auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
+ if (!LaterMA)
+ return true;
+
// Since we know LaterDef dominates LaterInst and EarlierInst dominates
// LaterInst, if LaterDef dominates EarlierInst then it can't occur between
// EarlierInst and LaterInst and neither can any other write that potentially
// clobbers LaterInst.
MemoryAccess *LaterDef =
MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
- return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst));
+ return MSSA->dominates(LaterDef, EarlierMA);
}
bool EarlyCSE::processNode(DomTreeNode *Node) {
@@ -587,27 +604,28 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// which reaches this block where the condition might hold a different
// value. Since we're adding this to the scoped hash table (like any other
// def), it will have been popped if we encounter a future merge block.
- if (BasicBlock *Pred = BB->getSinglePredecessor())
- if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
- if (BI->isConditional())
- if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
- if (SimpleValue::canHandle(CondInst)) {
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
- ConstantInt::getTrue(BB->getContext()) :
- ConstantInt::getFalse(BB->getContext());
- AvailableValues.insert(CondInst, ConditionalConstant);
- DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << CondInst->getName() << "' as " << *ConditionalConstant
- << " in " << BB->getName() << "\n");
- // Replace all dominated uses with the known value.
- if (unsigned Count =
- replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
- BasicBlockEdge(Pred, BB))) {
- Changed = true;
- NumCSECVP = NumCSECVP + Count;
- }
- }
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst)) {
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ AvailableValues.insert(CondInst, TorF);
+ DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << CondInst->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(
+ CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
+ Changed = true;
+ NumCSECVP += Count;
+ }
+ }
+ }
+ }
/// LastStore - Keep track of the last non-volatile store that we saw... for
/// as long as there is no instruction that reads memory. If we see a store
@@ -615,8 +633,6 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
/// stores which can occur in bitfield code among other things.
Instruction *LastStore = nullptr;
- const DataLayout &DL = BB->getModule()->getDataLayout();
-
// See if any instructions in the block can be eliminated. If so, do it. If
// not, add them to AvailableValues.
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
@@ -634,10 +650,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Skip assume intrinsics, they don't really have side effects (although
// they're marked as such to ensure preservation of control dependencies),
- // and this pass will not disturb any of the assumption's control
- // dependencies.
+ // and this pass will not bother with its removal. However, we should mark
+ // its condition as true for all dominated blocks.
if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
- DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
+ if (CondI && SimpleValue::canHandle(CondI)) {
+ DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n');
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ } else
+ DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
continue;
}
@@ -657,10 +679,25 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
if (auto *CondI =
dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
- // The condition we're on guarding here is true for all dominated
- // locations.
- if (SimpleValue::canHandle(CondI))
+ if (SimpleValue::canHandle(CondI)) {
+ // Do we already know the actual value of this condition?
+ if (auto *KnownCond = AvailableValues.lookup(CondI)) {
+ // Is the condition known to be true?
+ if (isa<ConstantInt>(KnownCond) &&
+ cast<ConstantInt>(KnownCond)->isOne()) {
+ DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n');
+ removeMSSA(Inst);
+ Inst->eraseFromParent();
+ Changed = true;
+ continue;
+ } else
+ // Use the known value if it wasn't true.
+ cast<CallInst>(Inst)->setArgOperand(0, KnownCond);
+ }
+ // The condition we're guarding on here is true for all dominated
+ // locations.
AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ }
}
// Guard intrinsics read all memory, but don't write any memory.
@@ -672,7 +709,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
- if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
+ if (Value *V = SimplifyInstruction(Inst, SQ)) {
DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n');
bool Killed = false;
if (!Inst->use_empty()) {
@@ -761,12 +798,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
- // If this instruction may read from memory, forget LastStore.
- // Load/store intrinsics will indicate both a read and a write to
- // memory. The target may override this (e.g. so that a store intrinsic
- // does not read from memory, and thus will be treated the same as a
- // regular store for commoning purposes).
- if (Inst->mayReadFromMemory() &&
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
!(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
@@ -962,15 +1000,13 @@ PreservedAnalyses EarlyCSEPass::run(Function &F,
auto *MSSA =
UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
- EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
if (!CSE.run())
return PreservedAnalyses::all();
- // CSE preserves the dominator tree because it doesn't mutate the CFG.
- // FIXME: Bundle this with other CFG-preservation.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
if (UseMemorySSA)
PA.preserve<MemorySSAAnalysis>();
@@ -1008,7 +1044,7 @@ public:
auto *MSSA =
UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
- EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
return CSE.run();
}
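A theme running through the EarlyCSE changes is the SimplifyQuery bundle: the pass now builds one query in its constructor (SQ(DL, &TLI, &DT, &AC)) and hands that single object to every SimplifyInstruction call, instead of re-threading four pointers each time. A minimal sketch of the call shape, assuming F, TLI, DT and AC are already in scope:

#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/InstIterator.h"
using namespace llvm;

const SimplifyQuery SQ(F.getParent()->getDataLayout(), &TLI, &DT, &AC);
for (Instruction &I : instructions(F))
  if (Value *V = SimplifyInstruction(&I, SQ))
    I.replaceAllUsesWith(V); // sketch only: the pass also erases the
                             // instruction and updates MemorySSA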
diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
index 185cdbd..063df77 100644
--- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -11,10 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/CFG.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
index 545036d..b105ece 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -137,13 +137,13 @@ void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) {
}
// Helper - mark I as having been traversed, having range R.
-ConstantRange Float2IntPass::seen(Instruction *I, ConstantRange R) {
+void Float2IntPass::seen(Instruction *I, ConstantRange R) {
DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
- if (SeenInsts.find(I) != SeenInsts.end())
- SeenInsts.find(I)->second = R;
+ auto IT = SeenInsts.find(I);
+ if (IT != SeenInsts.end())
+ IT->second = std::move(R);
else
- SeenInsts.insert(std::make_pair(I, R));
- return R;
+ SeenInsts.insert(std::make_pair(I, std::move(R)));
}
// Helper - get a range representing a poison value.
@@ -516,11 +516,10 @@ FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) {
if (!runImpl(F))
return PreservedAnalyses::all();
- else {
- // FIXME: This should also 'preserve the CFG'.
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
- }
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
}
} // End namespace llvm
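The Float2Int::seen() rewrite above is the classic find-or-insert pattern: one map lookup instead of two, with the ConstantRange moved rather than copied in both arms, and the now-unused return value dropped. The same pattern in a generic, hypothetical helper:

#include <map>
#include <utility>

template <typename MapT, typename KeyT, typename ValT>
void upsertMove(MapT &M, const KeyT &Key, ValT R) {
  auto It = M.find(Key);                         // single lookup
  if (It != M.end())
    It->second = std::move(R);                   // update in place
  else
    M.insert(std::make_pair(Key, std::move(R))); // first sighting of Key
}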
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
index 0137378..ea28705 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -36,7 +36,6 @@
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -51,9 +50,12 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+
#include <vector>
using namespace llvm;
using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
using namespace PatternMatch;
#define DEBUG_TYPE "gvn"
@@ -595,11 +597,12 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
return PA;
}
-LLVM_DUMP_METHOD
-void GVN::dump(DenseMap<uint32_t, Value*>& d) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
E = d.end(); I != E; ++I) {
@@ -608,6 +611,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
}
errs() << "}\n";
}
+#endif
/// Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
@@ -690,442 +694,6 @@ SpeculationFailure:
}
-/// Return true if CoerceAvailableValueToLoadType will succeed.
-static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
- Type *LoadTy,
- const DataLayout &DL) {
- // If the loaded or stored value is an first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
- StoredVal->getType()->isStructTy() ||
- StoredVal->getType()->isArrayTy())
- return false;
-
- // The store has to be at least as big as the load.
- if (DL.getTypeSizeInBits(StoredVal->getType()) <
- DL.getTypeSizeInBits(LoadTy))
- return false;
-
- return true;
-}
-
-/// If we saw a store of a value to memory, and
-/// then a load from a must-aliased pointer of a different type, try to coerce
-/// the stored value. LoadedTy is the type of the load we want to replace.
-/// IRB is IRBuilder used to insert new instructions.
-///
-/// If we can't do it, return null.
-static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
- IRBuilder<> &IRB,
- const DataLayout &DL) {
- assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
- "precondition violation - materialization can't fail");
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- // If this is already the right type, just return it.
- Type *StoredValTy = StoredVal->getType();
-
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
-
- // If the store and reload are the same size, we can always reuse it.
- if (StoredValSize == LoadedValSize) {
- // Pointer to Pointer -> use bitcast.
- if (StoredValTy->getScalarType()->isPointerTy() &&
- LoadedTy->getScalarType()->isPointerTy()) {
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy);
- } else {
- // Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->getScalarType()->isPointerTy())
- TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
-
- if (StoredValTy != TypeToCastTo)
- StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
-
- // Cast to pointer if the load needs a pointer type.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
- }
-
- // If the loaded value is smaller than the available value, then we can
- // extract out a piece from it. If the available value is too small, then we
- // can't do anything.
- assert(StoredValSize >= LoadedValSize &&
- "CanCoerceMustAliasedValueToLoad fail");
-
- // Convert source pointers to integers, which can be manipulated.
- if (StoredValTy->getScalarType()->isPointerTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- // Convert vectors and fp to integer, which can be manipulated.
- if (!StoredValTy->isIntegerTy()) {
- StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
- StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
- }
-
- // If this is a big-endian system, we need to shift the value down to the low
- // bits so that a truncate will work.
- if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
- DL.getTypeStoreSizeInBits(LoadedTy);
- StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp");
- }
-
- // Truncate the integer to the right size now.
- Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
- StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
-
- if (LoadedTy != NewIntTy) {
- // If the result is a pointer, inttoptr.
- if (LoadedTy->getScalarType()->isPointerTy())
- StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
- else
- // Otherwise, bitcast.
- StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
- }
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
-
- return StoredVal;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering memory write (store,
-/// memset, memcpy, memmove). This means that the write *may* provide bits used
-/// by the load but we can't be sure because the pointers don't mustalias.
-///
-/// Check this case to see if there is anything more we can do before we give
-/// up. This returns -1 if we have to give up, or a byte number in the stored
-/// value of the piece that feeds the load.
-static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
- Value *WritePtr,
- uint64_t WriteSizeInBits,
- const DataLayout &DL) {
- // If the loaded or stored value is a first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy())
- return -1;
-
- int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase =
- GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
- if (StoreBase != LoadBase)
- return -1;
-
- // If the load and store are to the exact same address, they should have been
- // a must alias. AA must have gotten confused.
- // FIXME: Study to see if/when this happens. One case is forwarding a memset
- // to a load from the base of the memset.
-
- // If the load and store don't overlap at all, the store doesn't provide
- // anything to the load. In this case, they really don't alias at all, AA
- // must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
-
- if ((WriteSizeInBits & 7) | (LoadSize & 7))
- return -1;
- uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
- LoadSize /= 8;
-
-
- bool isAAFailure = false;
- if (StoreOffset < LoadOffset)
- isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
- else
- isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
-
- if (isAAFailure)
- return -1;
-
- // If the Load isn't completely contained within the stored bits, we don't
- // have all the bits to feed it. We could do something crazy in the future
- // (issue a smaller load then merge the bits in) but this seems unlikely to be
- // valuable.
- if (StoreOffset > LoadOffset ||
- StoreOffset+StoreSize < LoadOffset+LoadSize)
- return -1;
-
- // Okay, we can do this transformation. Return the number of bytes into the
- // store that the load is.
- return LoadOffset-StoreOffset;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.
-static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
- StoreInst *DepSI) {
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepSI->getValueOperand()->getType()->isStructTy() ||
- DepSI->getValueOperand()->getType()->isArrayTy())
- return -1;
-
- const DataLayout &DL = DepSI->getModule()->getDataLayout();
- Value *StorePtr = DepSI->getPointerOperand();
- uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- StorePtr, StoreSize, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being clobbered by another load. See if
-/// the other load can feed into the second load.
-static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
- LoadInst *DepLI, const DataLayout &DL){
- // Cannot handle reading from store of first-class aggregate yet.
- if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
- return -1;
-
- Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
- int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
- if (R != -1) return R;
-
- // If we have a load/load clobber an DepLI can be widened to cover this load,
- // then we should widen it!
- int64_t LoadOffs = 0;
- const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
-
- unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
- LoadBase, LoadOffs, LoadSize, DepLI);
- if (Size == 0) return -1;
-
- // Check non-obvious conditions enforced by MDA which we rely on for being
- // able to materialize this potentially available value
- assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
- assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
-
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
-}
-
-
-
-static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
- MemIntrinsic *MI,
- const DataLayout &DL) {
- // If the mem operation is a non-constant size, we can't handle it.
- ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
- if (!SizeCst) return -1;
- uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
-
- // If this is memset, we just need to see if the offset is valid in the size
- // of the memset..
- if (MI->getIntrinsicID() == Intrinsic::memset)
- return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
-
- // If we have a memcpy/memmove, the only case we can handle is if this is a
- // copy from constant memory. In that case, we can read directly from the
- // constant memory.
- MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
- Constant *Src = dyn_cast<Constant>(MTI->getSource());
- if (!Src) return -1;
-
- GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
- if (!GV || !GV->isConstant()) return -1;
-
- // See if the access is within the bounds of the transfer.
- int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
- MI->getDest(), MemSizeInBits, DL);
- if (Offset == -1)
- return Offset;
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
- return Offset;
- return -1;
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store. This means
-/// that the store provides bits used by the load but we the pointers don't
-/// mustalias. Check this case to see if there is anything more we can do
-/// before we give up.
-static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
- Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL){
- LLVMContext &Ctx = SrcVal->getType()->getContext();
-
- uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
-
- IRBuilder<> Builder(InsertPt);
-
- // Compute which bits of the stored value are being used by the load. Convert
- // to an integer type to start with.
- if (SrcVal->getType()->getScalarType()->isPointerTy())
- SrcVal = Builder.CreatePtrToInt(SrcVal,
- DL.getIntPtrType(SrcVal->getType()));
- if (!SrcVal->getType()->isIntegerTy())
- SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
-
- // Shift the bits to the least significant depending on endianness.
- unsigned ShiftAmt;
- if (DL.isLittleEndian())
- ShiftAmt = Offset*8;
- else
- ShiftAmt = (StoreSize-LoadSize-Offset)*8;
-
- if (ShiftAmt)
- SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
-
- if (LoadSize != StoreSize)
- SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
-
- return CoerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering load. This means
-/// that the load *may* provide bits used by the load but we can't be sure
-/// because the pointers don't mustalias. Check this case to see if there is
-/// anything more we can do before we give up.
-static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- GVN &gvn) {
- const DataLayout &DL = SrcVal->getModule()->getDataLayout();
- // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
- // widen SrcVal out to a larger load.
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
- if (Offset+LoadSize > SrcValStoreSize) {
- assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
- assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
- // If we have a load/load clobber an DepLI can be widened to cover this
- // load, then we should widen it to the next power of 2 size big enough!
- unsigned NewLoadSize = Offset+LoadSize;
- if (!isPowerOf2_32(NewLoadSize))
- NewLoadSize = NextPowerOf2(NewLoadSize);
-
- Value *PtrVal = SrcVal->getPointerOperand();
-
- // Insert the new load after the old load. This ensures that subsequent
- // memdep queries will find the new load. We can't easily remove the old
- // load completely because it is already in the value numbering table.
- IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- Type *DestPTy =
- IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
- DestPTy = PointerType::get(DestPTy,
- PtrVal->getType()->getPointerAddressSpace());
- Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
- PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
- LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
- NewLoad->takeName(SrcVal);
- NewLoad->setAlignment(SrcVal->getAlignment());
-
- DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
- // Replace uses of the original load with the wider load. On a big endian
- // system, we need to shift down to get the relevant bits.
- Value *RV = NewLoad;
- if (DL.isBigEndian())
- RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
- RV = Builder.CreateTrunc(RV, SrcVal->getType());
- SrcVal->replaceAllUsesWith(RV);
-
- // We would like to use gvn.markInstructionForDeletion here, but we can't
- // because the load is already memoized into the leader map table that GVN
- // tracks. It is potentially possible to remove the load from the table,
- // but then there all of the operations based on it would need to be
- // rehashed. Just leave the dead load around.
- gvn.getMemDep().removeInstruction(SrcVal);
- SrcVal = NewLoad;
- }
-
- return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering mem intrinsic.
-static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- const DataLayout &DL){
- LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
-
- IRBuilder<> Builder(InsertPt);
-
- // We know that this method is only called when the mem transfer fully
- // provides the bits for the load.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
- // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
- // independently of what the offset is.
- Value *Val = MSI->getValue();
- if (LoadSize != 1)
- Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
-
- Value *OneElt = Val;
-
- // Splat the value out to the right number of bits.
- for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
- // If we can double the number of bytes set, do it.
- if (NumBytesSet*2 <= LoadSize) {
- Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
- Val = Builder.CreateOr(Val, ShVal);
- NumBytesSet <<= 1;
- continue;
- }
-
- // Otherwise insert one byte at a time.
- Value *ShVal = Builder.CreateShl(Val, 1*8);
- Val = Builder.CreateOr(OneElt, ShVal);
- ++NumBytesSet;
- }
-
- return CoerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
- }
-
- // Otherwise, this is a memcpy/memmove from a constant global.
- MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
- Constant *Src = cast<Constant>(MTI->getSource());
- unsigned AS = Src->getType()->getPointerAddressSpace();
-
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
-}
/// Given a set of loads specified by ValuesPerBlock,
@@ -1171,7 +739,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
- Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " "
<< *getSimpleValue() << '\n'
@@ -1182,14 +750,20 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
if (Load->getType() == LoadTy && Offset == 0) {
Res = Load;
} else {
- Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn);
-
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " "
<< *getCoercedLoadValue() << '\n'
- << *Res << '\n' << "\n\n\n");
+ << *Res << '\n'
+ << "\n\n\n");
}
} else if (isMemIntrinValue()) {
- Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
InsertPt, DL);
DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
<< " " << *getMemIntrinValue() << '\n'
@@ -1258,7 +832,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
if (Offset != -1) {
Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
return true;
@@ -1276,7 +850,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Can't forward from non-atomic to atomic without violating memory model.
if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
int Offset =
- AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
if (Offset != -1) {
Res = AvailableValue::getLoad(DepLI, Offset);
@@ -1289,7 +863,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// forward a value on from it.
if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
if (Address && !LI->isAtomic()) {
- int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
DepMI, DL);
if (Offset != -1) {
Res = AvailableValue::getMI(DepMI, Offset);
@@ -1334,7 +908,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// different types if we have to. If the stored value is larger or equal to
// the loaded value, we can reuse it.
if (S->getValueOperand()->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+ !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
LI->getType(), DL))
return false;
@@ -1351,7 +925,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// If the stored value is larger or equal to the loaded value, we can reuse
// it.
if (LD->getType() != LI->getType() &&
- !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
return false;
// Can't forward from non-atomic to atomic without violating memory model.
@@ -1592,8 +1166,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
auto *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre",
LI->isVolatile(), LI->getAlignment(),
- LI->getOrdering(), LI->getSynchScope(),
+ LI->getOrdering(), LI->getSyncScopeID(),
UnavailablePred->getTerminator());
+ NewLoad->setDebugLoc(LI->getDebugLoc());
// Transfer the old load's AA tags to the new load.
AAMDNodes Tags;
@@ -1628,7 +1203,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
V->takeName(LI);
if (Instruction *I = dyn_cast<Instruction>(V))
I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->getScalarType()->isPointerTy())
+ if (V->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
@@ -1713,9 +1288,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// If instruction I has debug info, then we should not update it.
// Also, if I has a null DebugLoc, then it is still potentially incorrect
// to propagate LI's DebugLoc because LI may not post-dominate I.
- if (LI->getDebugLoc() && ValuesPerBlock.size() != 1)
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->getScalarType()->isPointerTy())
+ if (V->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
++NumGVNLoad;
@@ -1795,7 +1370,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
// Patch the replacement so that it is not more restrictive than the value
// being replaced.
- // Note that if 'I' is a load being replaced by some operation,
+ // Note that if 'I' is a load being replaced by some operation,
// for example, by an arithmetic operation, then andIRFlags()
// would just erase all math flags from the original arithmetic
// operation, which is clearly not wanted and not needed.
@@ -1869,7 +1444,7 @@ bool GVN::processLoad(LoadInst *L) {
reportLoadElim(L, AvailableValue, ORE);
// Tell MDA to rexamine the reused pointer since we might have more
// information after forwarding it.
- if (MD && AvailableValue->getType()->getScalarType()->isPointerTy())
+ if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(AvailableValue);
return true;
}
@@ -2024,7 +1599,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
// RHS neither 'true' nor 'false' - bail out.
continue;
// Whether RHS equals 'true'. Otherwise it equals 'false'.
- bool isKnownTrue = CI->isAllOnesValue();
+ bool isKnownTrue = CI->isMinusOne();
bool isKnownFalse = !isKnownTrue;
// If "A && B" is known true then both A and B are known true. If "A || B"
@@ -2113,7 +1688,7 @@ bool GVN::processInstruction(Instruction *I) {
// example if it determines that %y is equal to %x then the instruction
// "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
const DataLayout &DL = I->getModule()->getDataLayout();
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+ if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
bool Changed = false;
if (!I->use_empty()) {
I->replaceAllUsesWith(V);
@@ -2124,7 +1699,7 @@ bool GVN::processInstruction(Instruction *I) {
Changed = true;
}
if (Changed) {
- if (MD && V->getType()->getScalarType()->isPointerTy())
+ if (MD && V->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(V);
++NumGVNSimpl;
return true;
@@ -2187,11 +1762,11 @@ bool GVN::processInstruction(Instruction *I) {
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
- BasicBlock *Dst = i.getCaseSuccessor();
+ BasicBlock *Dst = i->getCaseSuccessor();
// If there is only a single edge, propagate the case value into it.
if (SwitchEdges.lookup(Dst) == 1) {
BasicBlockEdge E(Parent, Dst);
- Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
}
}
return Changed;
@@ -2235,7 +1810,7 @@ bool GVN::processInstruction(Instruction *I) {
// Remove it!
patchAndReplaceAllUsesWith(I, Repl);
- if (MD && Repl->getType()->getScalarType()->isPointerTy())
+ if (MD && Repl->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(Repl);
markInstructionForDeletion(I);
return true;
@@ -2483,7 +2058,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) {
// If we failed insertion, make sure we remove the instruction.
DEBUG(verifyRemoved(PREInstr));
- delete PREInstr;
+ PREInstr->deleteValue();
return false;
}
}
@@ -2509,7 +2084,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
addToLeaderTable(ValNo, Phi, CurrentBlock);
Phi->setDebugLoc(CurInst->getDebugLoc());
CurInst->replaceAllUsesWith(Phi);
- if (MD && Phi->getType()->getScalarType()->isPointerTy())
+ if (MD && Phi->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(Phi);
VN.erase(CurInst);
removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
@@ -2581,21 +2156,12 @@ bool GVN::iterateOnFunction(Function &F) {
// Top-down walk of the dominator tree
bool Changed = false;
- // Save the blocks this function have before transformation begins. GVN may
- // split critical edge, and hence may invalidate the RPO/DT iterator.
- //
- std::vector<BasicBlock *> BBVect;
- BBVect.reserve(256);
// Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
ReversePostOrderTraversal<Function *> RPOT(&F);
- for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
- RE = RPOT.end();
- RI != RE; ++RI)
- BBVect.push_back(*RI);
-
- for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
- I != E; I++)
- Changed |= processBlock(*I);
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
return Changed;
}
@@ -2783,6 +2349,7 @@ public:
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
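The large deletion in GVN.cpp is a move, not a loss: the load-coercion helpers now live in llvm/Transforms/Utils/VNCoercion.h (hence the new include and `using namespace llvm::VNCoercion` at the top of the file), presumably so GVN and NewGVN can share one implementation, and the entry points were renamed to lowerCamelCase. A hedged sketch of the new call shape, reusing the variable names from the AnalyzeLoadAvailability hunks above:

#include "llvm/Transforms/Utils/VNCoercion.h"
using namespace llvm::VNCoercion;

int Offset =
    analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
if (Offset != -1 &&
    canCoerceMustAliasedValueToLoad(DepSI->getValueOperand(),
                                    LI->getType(), DL))
  // Materialize the forwarded bits at InsertPt (shift/trunc/bitcast).
  Res = getStoreValueForLoad(DepSI->getValueOperand(), Offset,
                             LI->getType(), InsertPt, DL);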
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index f8e1d2e..29de792 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -17,16 +17,40 @@
// is disabled in the following cases.
// 1. Scalars across calls.
// 2. geps when corresponding load/store cannot be hoisted.
+//
+// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores
+// in this case because it works on two instructions at a time.
+// entry:
+// switch i32 %c1, label %exit1 [
+// i32 0, label %sw0
+// i32 1, label %sw1
+// ]
+//
+// sw0:
+// store i32 1, i32* @G
+// br label %exit
+//
+// sw1:
+// store i32 1, i32* @G
+// br label %exit
+//
+// exit1:
+// store i32 1, i32* @G
+// ret void
+// exit:
+// ret void
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
using namespace llvm;
@@ -60,7 +84,7 @@ static cl::opt<int>
cl::desc("Maximum length of dependent chains to hoist "
"(default = 10, unlimited = -1)"));
-namespace {
+namespace llvm {
// Provides a sorting function based on the execution order of two instructions.
struct SortByDFSIn {
@@ -72,13 +96,6 @@ public:
// Returns true when A executes before B.
bool operator()(const Instruction *A, const Instruction *B) const {
- // FIXME: libc++ has a std::sort() algorithm that will call the compare
- // function on the same element. Once PR20837 is fixed and some more years
- // pass by and all the buildbots have moved to a corrected std::sort(),
- // enable the following assert:
- //
- // assert(A != B);
-
const BasicBlock *BA = A->getParent();
const BasicBlock *BB = B->getParent();
unsigned ADFS, BDFS;
@@ -202,6 +219,7 @@ public:
GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
MemorySSA *MSSA)
: DT(DT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)),
HoistingGeps(false),
HoistedCtr(0)
{ }
@@ -249,9 +267,11 @@ private:
AliasAnalysis *AA;
MemoryDependenceResults *MD;
MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
const bool HoistingGeps;
DenseMap<const Value *, unsigned> DFSNumber;
BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock*> HoistBarrier;
int HoistedCtr;
enum InsKind { Unknown, Scalar, Load, Store };
@@ -307,8 +327,8 @@ private:
continue;
}
- // Check for end of function, calls that do not return, etc.
- if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator()))
+ // We reached a leaf basic block => not all paths have this instruction.
+ if (!BB->getTerminator()->getNumSuccessors())
return false;
// When reaching the back-edge of a loop, there may be a path through the
@@ -360,7 +380,7 @@ private:
ReachedNewPt = true;
}
}
- if (defClobbersUseOrDef(Def, MU, *AA))
+ if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
return true;
}
@@ -387,7 +407,8 @@ private:
// executed between the execution of NewBB and OldBB. Hoisting an expression
// from OldBB into NewBB has to be safe on all execution paths.
for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
- if (*I == NewBB) {
+ const BasicBlock *BB = *I;
+ if (BB == NewBB) {
// Stop traversal when reaching HoistPt.
I.skipChildren();
continue;
@@ -398,11 +419,17 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // Nothing after a hoist barrier is ever selected for hoisting, so an
+ // instruction selected within a barrier block precedes that barrier;
+ // only a barrier in an intermediate block (not OldBB itself) blocks us.
+ if ((BB != OldBB) && HoistBarrier.count(BB))
return true;
// Check that we do not move a store past loads.
- if (hasMemoryUse(NewPt, Def, *I))
+ if (hasMemoryUse(NewPt, Def, BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -419,17 +446,18 @@ private:
// Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
// return true when the counter NBBsOnAllPaths reaches 0, except when it is
// initialized to -1 which is unlimited.
- bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB,
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
int &NBBsOnAllPaths) {
- assert(DT->dominates(HoistPt, BB) && "Invalid path");
+ assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
// Walk all basic blocks reachable in depth-first iteration on
// the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
// blocks that may be executed between the execution of NewHoistPt and
// BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
// on all execution paths.
- for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) {
- if (*I == HoistPt) {
+ for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+ const BasicBlock *BB = *I;
+ if (BB == HoistPt) {
// Stop traversal when reaching NewHoistPt.
I.skipChildren();
continue;
@@ -440,7 +468,13 @@ private:
return true;
// Impossible to hoist with exceptions on the path.
- if (hasEH(*I))
+ if (hasEH(BB))
+ return true;
+
+ // Nothing after a hoist barrier is ever selected for hoisting, so an
+ // instruction selected within a barrier block precedes that barrier;
+ // only a barrier in an intermediate block (not SrcBB itself) blocks us.
+ if ((BB != SrcBB) && HoistBarrier.count(BB))
return true;
// -1 is unlimited number of blocks on all paths.
@@ -626,6 +660,8 @@ private:
// Compute the insertion point and the list of expressions to be hoisted.
SmallVecInsn InstructionsToHoist;
for (auto I : V)
+ // No need to check for hoist barriers here: if I->getParent() contains
+ // a barrier, then I precedes that barrier.
if (!hasEH(I->getParent()))
InstructionsToHoist.push_back(I);
@@ -809,9 +845,9 @@ private:
// legal when the ld/st is not moved past its current definition.
MemoryAccess *Def = OldMemAcc->getDefiningAccess();
NewMemAcc =
- MSSA->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
+ MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
OldMemAcc->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMemAcc);
+ MSSAUpdater->removeMemoryAccess(OldMemAcc);
}
}
@@ -850,7 +886,7 @@ private:
// Update the uses of the old MSSA access with NewMemAcc.
MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
OldMA->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(OldMA);
+ MSSAUpdater->removeMemoryAccess(OldMA);
}
Repl->andIRFlags(I);
@@ -872,7 +908,7 @@ private:
auto In = Phi->incoming_values();
if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
Phi->replaceAllUsesWith(NewMemAcc);
- MSSA->removeMemoryAccess(Phi);
+ MSSAUpdater->removeMemoryAccess(Phi);
}
}
}
@@ -896,6 +932,12 @@ private:
for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
int InstructionNb = 0;
for (Instruction &I1 : *BB) {
+ // If I1 cannot guarantee progress, subsequent instructions
+ // in BB cannot be hoisted anyway.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+ HoistBarrier.insert(BB);
+ break;
+ }
// Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
// deeper may increase the register pressure and compilation time.
if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
@@ -969,6 +1011,7 @@ public:
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
}
};
} // namespace
@@ -985,6 +1028,7 @@ PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<GlobalsAA>();
return PA;
}
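GVNHoist, like EarlyCSE earlier in this patch, now owns a MemorySSAUpdater and routes every MemorySSA mutation through it instead of calling mutators on MSSA directly; this matches MemorySSA's move from Transforms/Utils into Analysis, where the analysis itself stays read-only. A short sketch of the ownership pattern, using names from the hunks above:

std::unique_ptr<MemorySSAUpdater> MSSAUpdater =
    make_unique<MemorySSAUpdater>(MSSA);    // llvm::make_unique (pre-C++14)

MemoryAccess *NewMemAcc = MSSAUpdater->createMemoryAccessInBB(
    Repl, Def, HoistPt, MemorySSA::End);    // was a method on MemorySSA
OldMemAcc->replaceAllUsesWith(NewMemAcc);
MSSAUpdater->removeMemoryAccess(OldMemAcc); // was: MSSA->removeMemoryAccess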
diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
new file mode 100644
index 0000000..5fd2dfc
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -0,0 +1,883 @@
+//===- GVNSink.cpp - sink expressions into successors -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file GVNSink.cpp
+/// This pass attempts to sink instructions into successors, reducing static
+/// instruction count and enabling if-conversion.
+///
+/// We use a variant of global value numbering to decide what can be sunk.
+/// Consider:
+///
+/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
+/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
+/// \ /
+/// [ %e = phi i32 %a2, %c2 ]
+/// [ add i32 %e, 4 ]
+///
+///
+/// GVN would number %a1 and %c1 differently because they compute different
+/// results - the VN of an instruction is a function of its opcode and the
+/// transitive closure of its operands. This is the key property for hoisting
+/// and CSE.
+///
+/// What we want when sinking, however, is a numbering that is a function of
+/// the *uses* of an instruction, which allows us to answer the question "if I
+/// replace %a1 with %c1, will it contribute in an equivalent way to all
+/// successive instructions?". The PostValueTable class in GVN provides this
+/// mapping.
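+///
+/// For the diagram above, a use-based numbering might proceed as follows
+/// (an illustrative sketch, not the exact numbers the pass assigns):
+/// VN(%a2) == VN(%c2) because each is used only by the common phi %e, and
+/// VN(%a1) == VN(%c1) because each is used only by an xor-with-1 whose
+/// number already matches; both pairs therefore become candidates for
+/// sinking.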
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <unordered_set>
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-sink"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace llvm {
+namespace GVNExpression {
+
+LLVM_DUMP_METHOD void Expression::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
+} // namespace GVNExpression
+} // namespace llvm
+
+namespace {
+
+static bool isMemoryInst(const Instruction *I) {
+ return isa<LoadInst>(I) || isa<StoreInst>(I) ||
+ (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
+ (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
+}
+
+/// Iterates through instructions in a set of blocks in reverse order from the
+/// first non-terminator. For example (assume all blocks have size n):
+/// LockstepReverseIterator I([B1, B2, B3]);
+/// *I-- = [B1[n], B2[n], B3[n]];
+/// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+/// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+/// ...
+///
+/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
+/// to determine which blocks are still going and the order they appear in the
+/// list returned by operator*.
+class LockstepReverseIterator {
+ ArrayRef<BasicBlock *> Blocks;
+ SmallPtrSet<BasicBlock *, 4> ActiveBlocks;
+ SmallVector<Instruction *, 4> Insts;
+ bool Fail;
+
+public:
+ LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ ActiveBlocks.clear();
+ for (BasicBlock *BB : Blocks)
+ ActiveBlocks.insert(BB);
+ Insts.clear();
+ for (BasicBlock *BB : Blocks) {
+ if (BB->size() <= 1) {
+ // Block wasn't big enough - only contained a terminator.
+ ActiveBlocks.erase(BB);
+ continue;
+ }
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ }
+ if (Insts.empty())
+ Fail = true;
+ }
+
+ bool isValid() const { return !Fail; }
+ ArrayRef<Instruction *> operator*() const { return Insts; }
+ SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
+
+ void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) {
+ for (auto II = Insts.begin(); II != Insts.end();) {
+ if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) ==
+ Blocks.end()) {
+ ActiveBlocks.erase((*II)->getParent());
+ II = Insts.erase(II);
+ } else {
+ ++II;
+ }
+ }
+ }
+
+ void operator--() {
+ if (Fail)
+ return;
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *Inst : Insts) {
+ if (Inst == &Inst->getParent()->front())
+ ActiveBlocks.erase(Inst->getParent());
+ else
+ NewInsts.push_back(Inst->getPrevNode());
+ }
+ if (NewInsts.empty()) {
+ Fail = true;
+ return;
+ }
+ Insts = NewInsts;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+
+/// Candidate solution for sinking. There may be different ways to
+/// sink instructions, differing in the number of instructions sunk,
+/// the number of predecessors sunk from and the number of PHIs
+/// required.
+struct SinkingInstructionCandidate {
+ unsigned NumBlocks;
+ unsigned NumInstructions;
+ unsigned NumPHIs;
+ unsigned NumMemoryInsts;
+ int Cost = -1;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+ void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
+ unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
+ unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
+ // PHIs are expensive, so make sure they're worth it.
+ Cost = (NumInstructions * (NumBlocks - 1)) -
+ (NumExtraPHIs * NumExtraPHIs) - SplitEdgeCost;
+ }
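+
+ // Illustrative example (assumed values): sinking NumInstructions = 3 from
+ // NumBlocks = 2 predecessors with one extra PHI and no split edge gives
+ // Cost = 3 * (2 - 1) - 1 * 1 - 0 = 2.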
+ bool operator>(const SinkingInstructionCandidate &Other) const {
+ return Cost > Other.Cost;
+ }
+};
+
+#ifndef NDEBUG
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const SinkingInstructionCandidate &C) {
+ OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
+ << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
+ return OS;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+
+/// Describes a PHI node that may or may not exist. These track the PHIs
+/// that would need to be created if we sank a sequence of instructions. The
+/// class provides a hash function for efficient equality comparisons.
+class ModelledPHI {
+ SmallVector<Value *, 4> Values;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+public:
+ ModelledPHI() {}
+ ModelledPHI(const PHINode *PN) {
+ for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
+ Blocks.push_back(PN->getIncomingBlock(I));
+ std::sort(Blocks.begin(), Blocks.end());
+
+ // This assumes the PHI is already well-formed and there aren't conflicting
+ // incoming values for the same block.
+ for (auto *B : Blocks)
+ Values.push_back(PN->getIncomingValueForBlock(B));
+ }
+ /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI
+ /// without the same ID.
+ /// \note This is specifically for DenseMapInfo - do not use this!
+ static ModelledPHI createDummy(size_t ID) {
+ ModelledPHI M;
+ M.Values.push_back(reinterpret_cast<Value*>(ID));
+ return M;
+ }
+
+ /// Create a PHI from an array of incoming values and incoming blocks.
+ template <typename VArray, typename BArray>
+ ModelledPHI(const VArray &V, const BArray &B) {
+ std::copy(V.begin(), V.end(), std::back_inserter(Values));
+ std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+ }
+
+ /// Create a PHI from [I[OpNum] for I in Insts].
+ template <typename BArray>
+ ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+ std::copy(B.begin(), B.end(), std::back_inserter(Blocks));
+ for (auto *I : Insts)
+ Values.push_back(I->getOperand(OpNum));
+ }
+
+ /// Restrict the PHI's contents down to only \c NewBlocks.
+ /// \c NewBlocks must be a subset of \c this->Blocks.
+ void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) {
+ auto BI = Blocks.begin();
+ auto VI = Values.begin();
+ while (BI != Blocks.end()) {
+ assert(VI != Values.end());
+ if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) ==
+ NewBlocks.end()) {
+ BI = Blocks.erase(BI);
+ VI = Values.erase(VI);
+ } else {
+ ++BI;
+ ++VI;
+ }
+ }
+ assert(Blocks.size() == NewBlocks.size());
+ }
+
+ ArrayRef<Value *> getValues() const { return Values; }
+
+ bool areAllIncomingValuesSame() const {
+ return all_of(Values, [&](Value *V) { return V == Values[0]; });
+ }
+ bool areAllIncomingValuesSameType() const {
+ return all_of(
+ Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
+ }
+ bool areAnyIncomingValuesConstant() const {
+ return any_of(Values, [&](Value *V) { return isa<Constant>(V); });
+ }
+ // Hash functor
+ unsigned hash() const {
+ return (unsigned)hash_combine_range(Values.begin(), Values.end());
+ }
+ bool operator==(const ModelledPHI &Other) const {
+ return Values == Other.Values && Blocks == Other.Blocks;
+ }
+};
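+// Illustrative sketch (hypothetical values %v1, %v2 and blocks P1, P2): if
+// sinking the producers of %v1 and %v2 into their common successor requires
+// the phi
+// %m = phi [ %v1, P1 ], [ %v2, P2 ]
+// then ModelledPHI({%v1, %v2}, {P1, P2}) represents it without creating IR.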
+
+template <typename ModelledPHI> struct DenseMapInfo {
+ static inline ModelledPHI &getEmptyKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(0);
+ return Dummy;
+ }
+ static inline ModelledPHI &getTombstoneKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(1);
+ return Dummy;
+ }
+ static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
+ static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
+ return LHS == RHS;
+ }
+};
+
+typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet;
+
+//===----------------------------------------------------------------------===//
+// ValueTable
+//===----------------------------------------------------------------------===//
+// This is a value number table where the value number is a function of the
+// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
+// that the program would be equivalent if we replaced A with PHI(A, B).
+//===----------------------------------------------------------------------===//
+
+/// A GVN expression describing how an instruction is used. The operands
+/// field of BasicExpression is used to store uses, not operands.
+///
+/// This class also contains fields for discriminators used when determining
+/// equivalence of instructions with sideeffects.
+class InstructionUseExpr : public GVNExpression::BasicExpression {
+ unsigned MemoryUseOrder = -1;
+ bool Volatile = false;
+
+public:
+ InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
+ BumpPtrAllocator &A)
+ : GVNExpression::BasicExpression(I->getNumUses()) {
+ allocateOperands(R, A);
+ setOpcode(I->getOpcode());
+ setType(I->getType());
+
+ for (auto &U : I->uses())
+ op_push_back(U.getUser());
+ std::sort(op_begin(), op_end());
+ }
+ void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
+ void setVolatile(bool V) { Volatile = V; }
+
+ virtual hash_code getHashValue() const {
+ return hash_combine(GVNExpression::BasicExpression::getHashValue(),
+ MemoryUseOrder, Volatile);
+ }
+
+ template <typename Function> hash_code getHashValue(Function MapFn) {
+ hash_code H =
+ hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile);
+ for (auto *V : operands())
+ H = hash_combine(H, MapFn(V));
+ return H;
+ }
+};
+
+class ValueTable {
+ DenseMap<Value *, uint32_t> ValueNumbering;
+ DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+ DenseMap<size_t, uint32_t> HashNumbering;
+ BumpPtrAllocator Allocator;
+ ArrayRecycler<Value *> Recycler;
+ uint32_t nextValueNumber;
+
+ /// Create an expression for I based on its opcode and its uses. If I
+ /// touches or reads memory, the expression is also based upon its memory
+ /// order - see \c getMemoryUseOrder().
+ InstructionUseExpr *createExpr(Instruction *I) {
+ InstructionUseExpr *E =
+ new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
+ if (isMemoryInst(I))
+ E->setMemoryUseOrder(getMemoryUseOrder(I));
+
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Predicate = C->getPredicate();
+ E->setOpcode((C->getOpcode() << 8) | Predicate);
+ }
+ return E;
+ }
+
+ /// Helper to compute the value number for a memory instruction
+ /// (LoadInst/StoreInst), including checking the memory ordering and
+ /// volatility.
+ template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
+ if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
+ return nullptr;
+ InstructionUseExpr *E = createExpr(I);
+ E->setVolatile(I->isVolatile());
+ return E;
+ }
+
+public:
+ /// Returns the value number for the specified value, assigning
+ /// it a new number if it did not have one before.
+ uint32_t lookupOrAdd(Value *V) {
+ auto VI = ValueNumbering.find(V);
+ if (VI != ValueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ InstructionUseExpr *exp = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ exp = createMemoryExpr(cast<LoadInst>(I));
+ break;
+ case Instruction::Store:
+ exp = createMemoryExpr(cast<StoreInst>(I));
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ default:
+ break;
+ }
+
+ if (!exp) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = ExpressionNumbering[exp];
+ if (!e) {
+ hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
+ auto I = HashNumbering.find(H);
+ if (I != HashNumbering.end()) {
+ e = I->second;
+ } else {
+ e = nextValueNumber++;
+ HashNumbering[H] = e;
+ ExpressionNumbering[exp] = e;
+ }
+ }
+ ValueNumbering[V] = e;
+ return e;
+ }
+
+ /// Returns the value number of the specified value. Fails if the value has
+ /// not yet been numbered.
+ uint32_t lookup(Value *V) const {
+ auto VI = ValueNumbering.find(V);
+ assert(VI != ValueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+
+ /// Removes all value numberings and resets the value table.
+ void clear() {
+ ValueNumbering.clear();
+ ExpressionNumbering.clear();
+ HashNumbering.clear();
+ Recycler.clear(Allocator);
+ nextValueNumber = 1;
+ }
+
+ ValueTable() : nextValueNumber(1) {}
+
+ /// \c Inst uses or touches memory. Return an ID describing the memory state
+ /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
+ /// the exact same memory operations happen after I1 and I2.
+ ///
+ /// This is a very hard problem in general, so we use domain-specific
+ /// knowledge that we only ever check for equivalence between blocks sharing
+ /// a single common immediate successor, and that when determining if I1 ==
+ /// I2 we will have already determined that next(I1) == next(I2). This
+ /// inductive property allows us to simply return the value number of the next
+ /// instruction that defines memory.
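+ ///
+ /// Illustrative sketch (pseudo-IR, hypothetical numbering):
+ /// %x = load i32, i32* %p
+ /// %y = load i32, i32* %r ; reads only, skipped
+ /// store i32 %v, i32* %q ; first memory-defining instruction, VN = 7
+ /// getMemoryUseOrder(%x) would return 7, or 0 if no memory-defining
+ /// instruction followed %x in its block.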
+ uint32_t getMemoryUseOrder(Instruction *Inst) {
+ auto *BB = Inst->getParent();
+ for (auto I = std::next(Inst->getIterator()), E = BB->end();
+ I != E && !I->isTerminator(); ++I) {
+ if (!isMemoryInst(&*I))
+ continue;
+ if (isa<LoadInst>(&*I))
+ continue;
+ CallInst *CI = dyn_cast<CallInst>(&*I);
+ if (CI && CI->onlyReadsMemory())
+ continue;
+ InvokeInst *II = dyn_cast<InvokeInst>(&*I);
+ if (II && II->onlyReadsMemory())
+ continue;
+ return lookupOrAdd(&*I);
+ }
+ return 0;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+
+class GVNSink {
+public:
+ GVNSink() : VN() {}
+ bool run(Function &F) {
+ DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n");
+
+ unsigned NumSunk = 0;
+ ReversePostOrderTraversal<Function*> RPOT(&F);
+ for (auto *N : RPOT)
+ NumSunk += sinkBB(N);
+
+ return NumSunk > 0;
+ }
+
+private:
+ ValueTable VN;
+
+ bool isInstructionBlacklisted(Instruction *I) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return true;
+ return false;
+ }
+
+ /// The main heuristic function. Analyze the set of instructions pointed to by
+ /// LRI and return a candidate solution if these instructions can be sunk, or
+ /// None otherwise.
+ Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
+
+ /// Create a ModelledPHI for each PHI in BB, adding to PHIs.
+ void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
+ SmallPtrSetImpl<Value *> &PHIContents) {
+ for (auto &I : *BB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ return;
+
+ auto MPHI = ModelledPHI(PN);
+ PHIs.insert(MPHI);
+ for (auto *V : MPHI.getValues())
+ PHIContents.insert(V);
+ }
+ }
+
+ /// The main instruction sinking driver. Set up state and try and sink
+ /// instructions into BBEnd from its predecessors.
+ unsigned sinkBB(BasicBlock *BBEnd);
+
+ /// Perform the actual mechanics of sinking an instruction from Blocks into
+ /// BBEnd, which is their only successor.
+ void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
+
+ /// Remove PHIs that all have the same incoming value.
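+ /// For example (sketch): %p = phi [ %v, P1 ], [ %v, P2 ] is replaced by %v,
+ /// while a self-referential %q = phi [ %q, P1 ], [ %q, P2 ] is replaced by
+ /// undef.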
+ void foldPointlessPHINodes(BasicBlock *BB) {
+ auto I = BB->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(I++)) {
+ if (!all_of(PN->incoming_values(),
+ [&](const Value *V) { return V == PN->getIncomingValue(0); }))
+ continue;
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+ }
+};
+
+Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
+ auto Insts = *LRI;
+ DEBUG(dbgs() << " -- Analyzing instruction set: [\n";
+ for (auto *I : Insts)
+ I->dump();
+ dbgs() << " ]\n";);
+
+ DenseMap<uint32_t, unsigned> VNums;
+ for (auto *I : Insts) {
+ uint32_t N = VN.lookupOrAdd(I);
+ DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n");
+ if (N == ~0U)
+ return None;
+ VNums[N]++;
+ }
+ unsigned VNumToSink =
+ std::max_element(VNums.begin(), VNums.end(),
+ [](const std::pair<uint32_t, unsigned> &I,
+ const std::pair<uint32_t, unsigned> &J) {
+ return I.second < J.second;
+ })
+ ->first;
+
+ if (VNums[VNumToSink] == 1)
+ // Can't sink anything!
+ return None;
+
+ // Now restrict the number of incoming blocks down to only those with
+ // VNumToSink.
+ auto &ActivePreds = LRI.getActiveBlocks();
+ unsigned InitialActivePredSize = ActivePreds.size();
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *I : Insts) {
+ if (VN.lookup(I) != VNumToSink)
+ ActivePreds.erase(I->getParent());
+ else
+ NewInsts.push_back(I);
+ }
+ for (auto *I : NewInsts)
+ if (isInstructionBlacklisted(I))
+ return None;
+
+ // If we've restricted the incoming blocks, restrict all needed PHIs also
+ // to that set.
+ bool RecomputePHIContents = false;
+ if (ActivePreds.size() != InitialActivePredSize) {
+ ModelledPHISet NewNeededPHIs;
+ for (auto P : NeededPHIs) {
+ P.restrictToBlocks(ActivePreds);
+ NewNeededPHIs.insert(P);
+ }
+ NeededPHIs = NewNeededPHIs;
+ LRI.restrictToBlocks(ActivePreds);
+ RecomputePHIContents = true;
+ }
+
+ // The sunk instruction's results.
+ ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+ // Does sinking this instruction render previous PHIs redundant?
+ if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) {
+ NeededPHIs.erase(NewPHI);
+ RecomputePHIContents = true;
+ }
+
+ if (RecomputePHIContents) {
+ // The needed PHIs have changed, so recompute the set of all needed
+ // values.
+ PHIContents.clear();
+ for (auto &PHI : NeededPHIs)
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ // Is this instruction required by a later PHI that doesn't match this PHI?
+ // If so, we can't sink this instruction.
+ for (auto *V : NewPHI.getValues())
+ if (PHIContents.count(V))
+ // V exists in this PHI, but the whole PHI is different to NewPHI
+ // (else it would have been removed earlier). We cannot continue
+ // because this isn't representable.
+ return None;
+
+ // Which operands need PHIs?
+ // FIXME: If any of these fail, we should partition up the candidates to
+ // try and continue making progress.
+ Instruction *I0 = NewInsts[0];
+ for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+ ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+ if (PHI.areAllIncomingValuesSame())
+ continue;
+ if (!canReplaceOperandWithVariable(I0, OpNum))
+ // We can't create a PHI from this instruction!
+ return None;
+ if (NeededPHIs.count(PHI))
+ continue;
+ if (!PHI.areAllIncomingValuesSameType())
+ return None;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
+ PHI.areAnyIncomingValuesConstant())
+ return None;
+
+ NeededPHIs.reserve(NeededPHIs.size());
+ NeededPHIs.insert(PHI);
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ if (isMemoryInst(NewInsts[0]))
+ ++MemoryInstNum;
+
+ SinkingInstructionCandidate Cand;
+ Cand.NumInstructions = ++InstNum;
+ Cand.NumMemoryInsts = MemoryInstNum;
+ Cand.NumBlocks = ActivePreds.size();
+ Cand.NumPHIs = NeededPHIs.size();
+ for (auto *C : ActivePreds)
+ Cand.Blocks.push_back(C);
+
+ return Cand;
+}
+
+unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
+ DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ SmallVector<BasicBlock *, 4> Preds;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+ Preds.push_back(B);
+ else
+ return 0;
+ }
+ if (Preds.size() < 2)
+ return 0;
+ std::sort(Preds.begin(), Preds.end());
+
+ unsigned NumOrigPreds = Preds.size();
+ // We can only sink instructions through unconditional branches.
+ for (auto I = Preds.begin(); I != Preds.end();) {
+ if ((*I)->getTerminator()->getNumSuccessors() != 1)
+ I = Preds.erase(I);
+ else
+ ++I;
+ }
+
+ LockstepReverseIterator LRI(Preds);
+ SmallVector<SinkingInstructionCandidate, 4> Candidates;
+ unsigned InstNum = 0, MemoryInstNum = 0;
+ ModelledPHISet NeededPHIs;
+ SmallPtrSet<Value *, 4> PHIContents;
+ analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+ unsigned NumOrigPHIs = NeededPHIs.size();
+
+ while (LRI.isValid()) {
+ auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+ NeededPHIs, PHIContents);
+ if (!Cand)
+ break;
+ Cand->calculateCost(NumOrigPHIs, Preds.size());
+ Candidates.emplace_back(*Cand);
+ --LRI;
+ }
+
+ std::stable_sort(
+ Candidates.begin(), Candidates.end(),
+ [](const SinkingInstructionCandidate &A,
+ const SinkingInstructionCandidate &B) { return A > B; });
+ DEBUG(dbgs() << " -- Sinking candidates:\n";
+ for (auto &C : Candidates)
+ dbgs() << " " << C << "\n";);
+
+ // Pick the top candidate, as long as its cost is positive!
+ if (Candidates.empty() || Candidates.front().Cost <= 0)
+ return 0;
+ auto C = Candidates.front();
+
+ DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ BasicBlock *InsertBB = BBEnd;
+ if (C.Blocks.size() < NumOrigPreds) {
+ DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs());
+ dbgs() << "\n");
+ InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+ if (!InsertBB) {
+ DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ // Edge couldn't be split.
+ return 0;
+ }
+ }
+
+ for (unsigned I = 0; I < C.NumInstructions; ++I)
+ sinkLastInstruction(C.Blocks, InsertBB);
+
+ return C.NumInstructions;
+}
+
+void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
+ BasicBlock *BBEnd) {
+ SmallVector<Instruction *, 4> Insts;
+ for (BasicBlock *BB : Blocks)
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ Instruction *I0 = Insts.front();
+
+ SmallVector<Value *, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags.
+ for (auto *I : Insts)
+ if (I != I0) {
+ combineMetadataForCSE(I0, I);
+ I0->andIRFlags(I);
+ }
+
+ for (auto *I : Insts)
+ if (I != I0)
+ I->replaceAllUsesWith(I0);
+ foldPointlessPHINodes(BBEnd);
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ NumRemoved += Insts.size() - 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Pass machinery / boilerplate
+
+class GVNSinkLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNSinkLegacyPass() : FunctionPass(ID) {
+ initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ GVNSink G;
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
+ GVNSink G;
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNSinkLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+
+FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index b05ef00..fb7c6e1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -40,7 +40,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/GuardWidening.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -50,7 +49,9 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -536,10 +537,8 @@ bool GuardWideningImpl::parseRangeChecks(
Changed = true;
} else if (match(Check.getBase(),
m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
- unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits();
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(OpLHS, KnownZero, KnownOne, DL);
- if ((OpRHS->getValue() & KnownZero) == OpRHS->getValue()) {
+ KnownBits Known = computeKnownBits(OpLHS, DL);
+ if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
Check.setBase(OpLHS);
APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
Check.setOffset(ConstantInt::get(Ctx, NewOffset));
@@ -568,8 +567,7 @@ bool GuardWideningImpl::combineRangeChecks(
return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
};
- std::copy_if(Checks.begin(), Checks.end(),
- std::back_inserter(CurrentChecks), IsCurrentCheck);
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
assert(CurrentChecks.size() != 0 && "We know we have at least one!");
@@ -613,16 +611,16 @@ bool GuardWideningImpl::combineRangeChecks(
// We have a series of f+1 checks as:
//
// I+k_0 u< L ... Chk_0
- // I_k_1 u< L ... Chk_1
+ // I+k_1 u< L ... Chk_1
// ...
- // I_k_f u< L ... Chk_(f+1)
+ // I+k_f u< L ... Chk_f
//
- // with forall i in [0,f): k_f-k_i u< k_f-k_0 ... Precond_0
+ // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
// k_f-k_0 u< INT_MIN+k_f ... Precond_1
// k_f != k_0 ... Precond_2
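+ //
+ // For instance (an illustrative sketch): with k_i = i and f = 3, Chk_0 is
+ // I u< L and Chk_3 is I+3 u< L; since Precond_1 rules out unsigned
+ // wraparound, the interior checks I+1 u< L and I+2 u< L follow.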
//
// Claim:
- // Chk_0 AND Chk_(f+1) implies all the other checks
+ // Chk_0 AND Chk_f implies all the other checks
//
// Informal proof sketch:
//
@@ -658,8 +656,12 @@ PreservedAnalyses GuardWideningPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- bool Changed = GuardWideningImpl(DT, PDT, LI).run();
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+ if (!GuardWideningImpl(DT, PDT, LI).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 1752fb7..1078296 100644
--- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -86,6 +86,10 @@ static cl::opt<bool> UsePostIncrementRanges(
cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
cl::init(true));
+static cl::opt<bool>
+DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
+ cl::desc("Disable Linear Function Test Replace optimization"));
+
namespace {
struct RewritePhi;
@@ -97,7 +101,7 @@ class IndVarSimplify {
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
- SmallVector<WeakVH, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
bool Changed = false;
bool isValidRewrite(Value *FromVal, Value *ToVal);
@@ -231,8 +235,9 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
bool isExact = false;
// See if we can convert this to an int64_t
uint64_t UIntVal;
- if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
- &isExact) != APFloat::opOK || !isExact)
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
return false;
IntVal = UIntVal;
return true;
@@ -414,8 +419,8 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
Compare->getName());
// In the following deletions, PN may become dead and may be deleted.
- // Use a WeakVH to observe whether this happens.
- WeakVH WeakPH = PN;
+ // Use a WeakTrackingVH to observe whether this happens.
+ WeakTrackingVH WeakPH = PN;
// Delete the old floating point exit comparison. The branch starts using the
// new comparison.
@@ -450,7 +455,7 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
//
BasicBlock *Header = L->getHeader();
- SmallVector<WeakVH, 8> PHIs;
+ SmallVector<WeakTrackingVH, 8> PHIs;
for (BasicBlock::iterator I = Header->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I)
PHIs.push_back(PN);
@@ -900,13 +905,13 @@ class WidenIV {
PHINode *WidePhi;
Instruction *WideInc;
const SCEV *WideIncExpr;
- SmallVectorImpl<WeakVH> &DeadInsts;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
SmallPtrSet<Instruction *,16> Widened;
SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
enum ExtendKind { ZeroExtended, SignExtended, Unknown };
- // A map tracking the kind of extension used to widen each narrow IV
+ // A map tracking the kind of extension used to widen each narrow IV
// and narrow IV user.
// Key: pointer to a narrow IV or IV user.
// Value: the kind of extension used to widen this Instruction.
@@ -940,20 +945,13 @@ class WidenIV {
}
public:
- WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
- ScalarEvolution *SEv, DominatorTree *DTree,
- SmallVectorImpl<WeakVH> &DI, bool HasGuards) :
- OrigPhi(WI.NarrowIV),
- WideType(WI.WidestNativeType),
- LI(LInfo),
- L(LI->getLoopFor(OrigPhi->getParent())),
- SE(SEv),
- DT(DTree),
- HasGuards(HasGuards),
- WidePhi(nullptr),
- WideInc(nullptr),
- WideIncExpr(nullptr),
- DeadInsts(DI) {
+ WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv,
+ DominatorTree *DTree, SmallVectorImpl<WeakTrackingVH> &DI,
+ bool HasGuards)
+ : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo),
+ L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree),
+ HasGuards(HasGuards), WidePhi(nullptr), WideInc(nullptr),
+ WideIncExpr(nullptr), DeadInsts(DI) {
assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
}
@@ -1608,7 +1606,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
return;
CmpInst::Predicate P =
- TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
+ TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
auto CmpConstrainedLHSRange =
@@ -1634,7 +1632,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
UpdateRangeFromGuards(NarrowUser);
BasicBlock *NarrowUserBB = NarrowUser->getParent();
- // If NarrowUserBB is statically unreachable asking dominator queries may
+ // If NarrowUserBB is statically unreachable, asking dominator queries may
// yield surprising results. (e.g. the block may not have a dom tree node)
if (!DT->isReachableFromEntry(NarrowUserBB))
return;
@@ -1829,6 +1827,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
// An IV counter must preserve its type.
if (IncI->getNumOperands() == 2)
break;
+ LLVM_FALLTHROUGH;
default:
return nullptr;
}
@@ -2152,6 +2151,8 @@ linearFunctionTestReplace(Loop *L,
Value *CmpIndVar = IndVar;
const SCEV *IVCount = BackedgeTakenCount;
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+
// If the exiting block is the same as the backedge block, we prefer to
// compare against the post-incremented value, otherwise we must compare
// against the preincremented value.
@@ -2376,6 +2377,7 @@ bool IndVarSimplify::run(Loop *L) {
// Loop::getCanonicalInductionVariable only supports loops with preheaders,
// and we're in trouble if we can't find the induction variable even when
// we've manually inserted one.
+ // - LFTR relies on having a single backedge.
if (!L->isLoopSimplifyForm())
return false;
@@ -2415,7 +2417,8 @@ bool IndVarSimplify::run(Loop *L) {
// If we have a trip count expression, rewrite the loop's exit condition
// using it. We can currently only handle loops with a single exit.
- if (canExpandBackedgeTakenCount(L, SE, Rewriter) && needsLFTR(L, DT)) {
+ if (!DisableLFTR && canExpandBackedgeTakenCount(L, SE, Rewriter) &&
+ needsLFTR(L, DT)) {
PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT);
if (IndVar) {
// Check preconditions for proper SCEVExpander operation. SCEV does not
@@ -2492,8 +2495,9 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!IVS.run(&L))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 8e81541..99b4458 100644
--- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -59,8 +59,8 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
@@ -446,6 +446,15 @@ struct LoopStructure {
BasicBlock *LatchExit;
unsigned LatchBrExitIdx;
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext)
+ // ... body ...
+
Value *IndVarNext;
Value *IndVarStart;
Value *LoopExitAt;
@@ -789,9 +798,32 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
+ const SCEV *StartNext = IndVarNext->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
ConstantInt *One = ConstantInt::get(IndVarTy, 1);
// TODO: generalize the predicates here to also match their unsigned variants.
if (IsIncreasing) {
+ bool DecreasedRightValueByOne = false;
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (++i != len) { while (++i < len) {
+ // ... ---> ...
+ // } }
+ Pred = ICmpInst::ICMP_SLT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeSMin(SE, RightSCEV)) {
+ // while (true) { while (true) {
+ // if (++i == len) ---> if (++i > len - 1)
+ // break; break;
+ // ... ...
+ // } }
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
+
bool FoundExpectedPred =
(Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) ||
(Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0);
@@ -809,11 +841,48 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
- IRBuilder<> B(Preheader->getTerminator());
- RightValue = B.CreateAdd(RightValue, One);
- }
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SLT, IndVarStart,
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+ // We need to increase the right value unless we have already decreased
+ // it virtually when we replaced EQ with SGT.
+ if (!DecreasedRightValueByOne) {
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateAdd(RightValue, One);
+ }
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by upper limit";
+ return None;
+ }
+ assert(!DecreasedRightValueByOne &&
+ "Right value can be decreased only for LatchBrExitIdx == 0!");
+ }
} else {
+ bool IncreasedRightValueByOne = false;
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (--i != len) { while (--i > len) {
+ // ... ---> ...
+ // } }
+ Pred = ICmpInst::ICMP_SGT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 &&
+ !CanBeSMax(SE, RightSCEV)) {
+ // while (true) { while (true) {
+ // if (--i == len) ---> if (--i < len + 1)
+ // break; break;
+ // ... ...
+ // } }
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
+
bool FoundExpectedPred =
(Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
(Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0);
@@ -831,15 +900,30 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
return None;
}
- IRBuilder<> B(Preheader->getTerminator());
- RightValue = B.CreateSub(RightValue, One);
+ if (!SE.isLoopEntryGuardedByCond(
+ &L, CmpInst::ICMP_SGT, IndVarStart,
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+
+ // We need to decrease the right value unless we have already increased
+ // it virtually when we replaced EQ with SLT.
+ if (!IncreasedRightValueByOne) {
+ IRBuilder<> B(Preheader->getTerminator());
+ RightValue = B.CreateSub(RightValue, One);
+ }
+ } else {
+ if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
+ RightSCEV)) {
+ FailureReason = "Induction variable start not bounded by lower limit";
+ return None;
+ }
+ assert(!IncreasedRightValueByOne &&
+ "Right value can be increased only for LatchBrExitIdx == 0!");
}
}
- const SCEV *StartNext = IndVarNext->getStart();
- const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
- const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
-
BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
assert(SE.getLoopDisposition(LatchCount, &L) ==
@@ -883,20 +967,23 @@ LoopConstrainer::calculateSubRanges() const {
// I think we can be more aggressive here and make this nuw / nsw if the
// addition that feeds into the icmp for the latch's terminating branch is nuw
// / nsw. In any case, a wrapping 2's complement addition is safe.
- ConstantInt *One = ConstantInt::get(Ty, 1);
const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart);
const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt);
bool Increasing = MainLoopStructure.IndVarIncreasing;
- // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the
- // range of values the induction variable takes.
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
+ // [Smallest, GreatestSeen] is the range of values the induction variable
+ // takes.
- const SCEV *Smallest = nullptr, *Greatest = nullptr;
+ const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
+ const SCEV *One = SE.getOne(Ty);
if (Increasing) {
Smallest = Start;
Greatest = End;
+ // No overflow, because the range [Smallest, GreatestSeen] is not empty.
+ GreatestSeen = SE.getMinusSCEV(End, One);
} else {
// These two computations may sign-overflow. Here is why that is okay:
//
@@ -914,8 +1001,9 @@ LoopConstrainer::calculateSubRanges() const {
// will be an empty range. Returning an empty range is always safe.
//
- Smallest = SE.getAddExpr(End, SE.getSCEV(One));
- Greatest = SE.getAddExpr(Start, SE.getSCEV(One));
+ Smallest = SE.getAddExpr(End, One);
+ Greatest = SE.getAddExpr(Start, One);
+ GreatestSeen = Start;
}
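+
+ // Illustrative example (assumed bounds): when increasing with Start = 0 and
+ // End = 10, the IV takes values in [0, 10), so GreatestSeen = 9; when
+ // decreasing with Start = 10 and End = 0, it takes values in [1, 11), i.e.
+ // Smallest = 1 and GreatestSeen = Start = 10.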
auto Clamp = [this, Smallest, Greatest](const SCEV *S) {
@@ -930,7 +1018,7 @@ LoopConstrainer::calculateSubRanges() const {
Result.LowLimit = Clamp(Range.getBegin());
bool ProvablyNoPostLoop =
- SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd());
+ SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd());
if (!ProvablyNoPostLoop)
Result.HighLimit = Clamp(Range.getEnd());
@@ -1194,7 +1282,12 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
ValueToValueMapTy &VM) {
- Loop &New = LPM.addLoop(Parent);
+ Loop &New = *new Loop();
+ if (Parent)
+ Parent->addChildLoop(&New);
+ else
+ LI.addTopLevelLoop(&New);
+ LPM.addLoop(New);
// Add all of the blocks in Original to the new loop.
for (auto *BB : Original->blocks())
@@ -1332,28 +1425,35 @@ bool LoopConstrainer::run() {
DT.recalculate(F);
+ // We need to first add all the pre and post loop blocks into the loop
+ // structures (as part of createClonedLoopStructure), and then update the
+ // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
+ // LI when LoopSimplifyForm is generated.
+ Loop *PreL = nullptr, *PostL = nullptr;
if (!PreLoop.Blocks.empty()) {
- auto *L = createClonedLoopStructure(
+ PreL = createClonedLoopStructure(
&OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map);
- formLCSSARecursively(*L, DT, &LI, &SE);
- simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
- // Pre loops are slow paths, we do not need to perform any loop
- // optimizations on them.
- DisableAllLoopOptsOnLoop(*L);
}
if (!PostLoop.Blocks.empty()) {
- auto *L = createClonedLoopStructure(
+ PostL = createClonedLoopStructure(
&OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map);
+ }
+
+ // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
+ auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
formLCSSARecursively(*L, DT, &LI, &SE);
simplifyLoop(L, &DT, &LI, &SE, nullptr, true);
- // Post loops are slow paths, we do not need to perform any loop
+ // Pre/post loops are slow paths, we do not need to perform any loop
// optimizations on them.
- DisableAllLoopOptsOnLoop(*L);
- }
-
- formLCSSARecursively(OriginalLoop, DT, &LI, &SE);
- simplifyLoop(&OriginalLoop, &DT, &LI, &SE, nullptr, true);
+ if (!IsOriginalLoop)
+ DisableAllLoopOptsOnLoop(*L);
+ };
+ if (PreL)
+ CanonicalizeLoop(PreL, false);
+ if (PostL)
+ CanonicalizeLoop(PostL, false);
+ CanonicalizeLoop(&OriginalLoop, true);
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
new file mode 100644
index 0000000..89b28f0
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -0,0 +1,969 @@
+//===- InferAddressSpaces.cpp - Infer specific address spaces --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
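+// For example (sketch): join(uninitialized, AS) = AS for any AS,
+// join(addrspace(3), addrspace(3)) = addrspace(3), and
+// join(addrspace(3), addrspace(1)) = generic, the bottom of the lattice.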
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#define DEBUG_TYPE "infer-address-spaces"
+
+using namespace llvm;
+
+namespace {
+static const unsigned UninitializedAddressSpace = ~0u;
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+
+/// \brief Infers specific address spaces for flat address expressions.
+class InferAddressSpaces : public FunctionPass {
+ /// Target-specific address space whose uses should be replaced if
+ /// possible.
+ unsigned FlatAddrSpace;
+
+public:
+ static char ID;
+
+ InferAddressSpaces() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
+ // Changes the flat address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all flat expressions in the use-def graph of function F.
+ bool
+ rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace,
+ Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(
+ IntrinsicInst *II,
+ std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+};
+} // end anonymous namespace
+
+char InferAddressSpaces::ID = 0;
+
+namespace llvm {
+void initializeInferAddressSpacesPass(PassRegistry &);
+}
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V) {
+ if (!isa<Operator>(V))
+ return false;
+
+ switch (cast<Operator>(V).getOpcode()) {
+ case Instruction::PHI:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+ const Operator &Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
+// TODO: Move logic to TTI?
+bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:{
+ const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile || !IsVolatile->isZero())
+ return false;
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+// TODO: Move logic to TTI?
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+ IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited.
+void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ assert(V->getType()->isPointerTy());
+
+ // Generic addressing expressions may be hidden in nested constant
+ // expressions.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ // TODO: Look in non-address parts, like icmp operands.
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.push_back(std::make_pair(CE, false));
+
+ return;
+ }
+
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (Visited.insert(V).second) {
+ PostorderStack.push_back(std::make_pair(V, false));
+
+ Operator *Op = cast<Operator>(V);
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+ }
+ }
+ }
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<WeakTrackingVH>
+InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value *, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
+ Visited);
+ };
+
+ // Look at operations that may be interesting to accelerate by moving to a
+ // known address space. We mainly aim at loads and stores, but pure
+ // addressing calculations may also be faster.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (!GEP->getType()->isVectorTy())
+ PushPtrOperand(GEP->getPointerOperand());
+ } else if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
+ } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (!ASC->getType()->isVectorTy())
+ PushPtrOperand(ASC->getPointerOperand());
+ }
+ }
+
+ std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ Value *TopVal = PostorderStack.back().first;
+ // If the operands of the expression on the top are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().second) {
+ if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ Postorder.push_back(TopVal);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().second = true;
+ for (Value *PtrOperand : getPointerOperands(*TopVal)) {
+ appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
+ Visited);
+ }
+ }
+ return Postorder;
+}
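+
+// A sketch of the traversal above on hypothetical IR: given
+//   %b = bitcast float* %p to i32*
+//   %g = getelementptr i32, i32* %b, i64 1
+//   store i32 0, i32* %g
+// the store pushes %g; the first visit of %g pushes its operand %b; %b has no
+// flat pointer operands, so it is emitted first, yielding the postorder
+// [%b, %g], in which operands always precede their users.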
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+
+ Type *NewPtrTy =
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (Constant *C = dyn_cast<Constant>(Operand))
+ return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(NewPtrTy);
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that the caller can fix them
+// later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+static Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
+ SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ case Instruction::Select: {
+ assert(I->getType()->isPointerTy());
+ return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+ NewPointerOperands[2], "", nullptr, I);
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
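+
+// An example (hypothetical IR) of why the undef placeholder is needed: in
+//   loop:
+//     %p = phi float* [ %init, %entry ], [ %g, %loop ]
+//     %g = getelementptr float, float* %p, i64 1
+// cloning %p into a new address space requires the clone of %g, which has not
+// been created yet. The incoming value is therefore temporarily undef, and
+// the use is recorded in UndefUsesToFix for the caller to patch once %g has
+// been cloned.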
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::BitCast) {
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+ return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+ return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::Select) {
+ Constant *Src0 = CE->getOperand(1);
+ Constant *Src1 = CE->getOperand(2);
+ if (Src0->getType()->getPointerAddressSpace() ==
+ Src1->getType()->getPointerAddressSpace()) {
+
+ return ConstantExpr::getSelect(
+ CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+ ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+ }
+ }
+
+ // Computes the operands of the new constant expression.
+ bool IsNew = false;
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ IsNew = true;
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ } else {
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+ }
+
+ // If !IsNew, we will replace the Value with itself. However, replaced values
+ // are assumed to be wrapped in an addrspacecast later, so drop it now.
+ if (!IsNew)
+ return nullptr;
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every flat address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+ // All values in Postorder are flat address expressions.
+ assert(isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace);
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
+unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
+ unsigned AS2) const {
+ if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
+ return FlatAddrSpace;
+
+ if (AS1 == UninitializedAddressSpace)
+ return AS2;
+ if (AS2 == UninitializedAddressSpace)
+ return AS1;
+
+ // The join of two different specific address spaces is flat.
+ return (AS1 == AS2) ? AS1 : FlatAddrSpace;
+}
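+
+// For instance, under a hypothetical numbering where flat is 4 and the
+// specific spaces include 1 (global) and 3 (local):
+//   join(uninitialized, 3) == 3 and join(3, 3) == 3,
+//   join(1, 3) == 4 (flat), because the specific spaces disagree,
+//   join(4, AS) == 4 (flat) for any AS, flat being the bottom of the lattice.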
+
+bool InferAddressSpaces::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ FlatAddrSpace = TTI.getFlatAddressSpace();
+ if (FlatAddrSpace == UninitializedAddressSpace)
+ return false;
+
+ // Collects all flat address expressions in postorder.
+ std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+ // Changes the address spaces of the flat address expressions that are
+ // inferred to point to a specific address space.
+ return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F);
+}
+
+// Constants need to be tracked through RAUW to handle cases with nested
+// constant expressions, so wrap values in WeakTrackingVH.
+void InferAddressSpaces::inferAddressSpaces(
+ ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = UninitializedAddressSpace;
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+
+ // Tries to update the address space of V according to the
+ // address spaces of its operands.
+ DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+ // If the address space was updated, add the users of V to the worklist,
+ // because their address spaces may need to be updated as well.
+ DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of flat address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as flat (the
+ // bottom element in the lattice).
+ if (Pos->second == FlatAddrSpace)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
+
+Optional<unsigned> InferAddressSpaces::updateAddressSpace(
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = UninitializedAddressSpace;
+
+ const Operator &Op = cast<Operator>(V);
+ if (Op.getOpcode() == Instruction::Select) {
+ Value *Src0 = Op.getOperand(1);
+ Value *Src1 = Op.getOperand(2);
+
+ auto I = InferredAddrSpace.find(Src0);
+ unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+ I->second : Src0->getType()->getPointerAddressSpace();
+
+ auto J = InferredAddrSpace.find(Src1);
+ unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+ J->second : Src1->getType()->getPointerAddressSpace();
+
+ auto *C0 = dyn_cast<Constant>(Src0);
+ auto *C1 = dyn_cast<Constant>(Src1);
+
+ // If one of the inputs is a constant, we may be able to do a constant
+ // addrspacecast of it. Defer inferring the address space until the input
+ // address space is known.
+ if ((C1 && Src0AS == UninitializedAddressSpace) ||
+ (C0 && Src1AS == UninitializedAddressSpace))
+ return None;
+
+ if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+ NewAS = Src1AS;
+ else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+ NewAS = Src0AS;
+ else
+ NewAS = joinAddressSpaces(Src0AS, Src1AS);
+ } else {
+ for (Value *PtrOperand : getPointerOperands(V)) {
+ auto I = InferredAddrSpace.find(PtrOperand);
+ unsigned OperandAS = I != InferredAddrSpace.end() ?
+ I->second : PtrOperand->getType()->getPointerAddressSpace();
+
+ // join(flat, *) = flat. So we can break if NewAS is already flat.
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != FlatAddrSpace);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
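+
+// A worked example (hypothetical): a getelementptr whose sole pointer operand
+// has been inferred as addrspace(3) is itself inferred as addrspace(3), while
+// a select whose operands were inferred as addrspace(1) and addrspace(3)
+// joins to flat and is therefore not rewritten.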
+
+/// Returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value.
+static bool isSimplePointerUseValidToReplace(Use &U) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile();
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile();
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile();
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ !CmpX->isVolatile();
+ }
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(),
+ MSI->getLength(), MSI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
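+
+// A sketch of the rewrite (illustrative IR; operand names are hypothetical):
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 %n, i32 4, i1 false)
+// with %d inferred as addrspace(1) is rebuilt by the IRBuilder as
+//   call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %d1, i8* %s, i64 %n, i32 4, i1 false)
+// with the TBAA/alias.scope/noalias metadata reattached, after which the old
+// call is erased.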
+
+// Returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to cast it
+ // off.
+ if (Op->getOpcode() == Instruction::AddrSpaceCast)
+ return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+ if (Op->getOpcode() == Instruction::IntToPtr &&
+ Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ return true;
+ }
+
+ return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+ Value::use_iterator End) {
+ User *CurUser = I->getUser();
+ ++I;
+
+ while (I != End && I->getUser() == CurUser)
+ ++I;
+
+ return I;
+}
+
+bool InferAddressSpaces::rewriteWithNewAddressSpaces(
+ ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use *UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ SmallVector<Instruction *, 16> DeadInstructions;
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (const WeakTrackingVH &WVH : Postorder) {
+ assert(WVH && "value was unexpectedly deleted");
+ Value *V = WVH;
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ DEBUG(dbgs() << "Replacing the uses of " << *V
+ << "\n with\n " << *NewV << '\n');
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ C->getType());
+ if (C != Replace) {
+ DEBUG(dbgs() << "Inserting replacement const cast: "
+ << Replace << ": " << *Replace << '\n');
+ C->replaceAllUsesWith(Replace);
+ V = Replace;
+ }
+ }
+
+ Value::use_iterator I, E, Next;
+ for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+ Use &U = *I;
+
+ // Some users may see the same pointer operand in multiple operands. Skip
+ // to the next instruction.
+ I = skipToNextUser(I, E);
+
+ if (isSimplePointerUseValidToReplace(U)) {
+ // If V is used as the pointer operand of a compatible memory operation,
+ // sets the pointer operand to NewV. This replacement does not change
+ // the element type, so the resultant load/store is still valid.
+ U.set(NewV);
+ continue;
+ }
+
+ User *CurUser = U.getUser();
+ // Handle more complex cases like intrinsics that need to be remangled.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+ if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+ continue;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+ if (rewriteIntrinsicOperands(II, V, NewV))
+ continue;
+ }
+
+ if (isa<Instruction>(CurUser)) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+ // If we can infer that both pointers are in the same addrspace,
+ // transform e.g.
+ // %cmp = icmp eq float* %p, %q
+ // into
+ // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ int SrcIdx = U.getOperandNo();
+ int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+ Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+ if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+ if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+ Cmp->setOperand(OtherIdx, OtherNewV);
+ Cmp->setOperand(SrcIdx, NewV);
+ continue;
+ }
+ }
+
+ // Even if the type mismatches, we can cast the constant.
+ if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+ if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+ Cmp->setOperand(SrcIdx, NewV);
+ Cmp->setOperand(OtherIdx,
+ ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+ continue;
+ }
+ }
+ }
+
+ if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ if (ASC->getDestAddressSpace() == NewAS) {
+ ASC->replaceAllUsesWith(NewV);
+ DeadInstructions.push_back(ASC);
+ continue;
+ }
+ }
+
+ // Otherwise, replaces the use with flat(NewV).
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ BasicBlock::iterator InsertPos = std::next(I->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+
+ if (V->use_empty()) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ DeadInstructions.push_back(I);
+ }
+ }
+
+ for (Instruction *I : DeadInstructions)
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+
+ return true;
+}
+
+FunctionPass *llvm::createInferAddressSpacesPass() {
+ return new InferAddressSpaces();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 1870c3d..dc9143b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -12,29 +12,33 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/JumpThreading.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
@@ -60,6 +64,11 @@ ImplicationSearchThreshold(
"condition to use to thread over a weaker condition"),
cl::init(3), cl::Hidden);
+static cl::opt<bool> PrintLVIAfterJumpThreading(
+ "print-lvi-after-jump-threading",
+ cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
+ cl::Hidden);
+
namespace {
/// This pass performs 'jump threading', which looks at blocks that have
/// multiple predecessors and multiple successors. If one or more of the
@@ -89,8 +98,10 @@ namespace {
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (PrintLVIAfterJumpThreading)
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LazyValueInfoWrapperPass>();
- AU.addPreserved<LazyValueInfoWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
@@ -104,6 +115,7 @@ INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
"Jump Threading", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(JumpThreading, "jump-threading",
"Jump Threading", false, false)
@@ -121,16 +133,24 @@ bool JumpThreading::runOnFunction(Function &F) {
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
if (HasProfileData) {
LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI));
+ BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI),
- std::move(BPI));
+
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
+ if (PrintLVIAfterJumpThreading) {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ dbgs());
+ }
+ return Changed;
}
PreservedAnalyses JumpThreadingPass::run(Function &F,
@@ -138,20 +158,19 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
std::unique_ptr<BlockFrequencyInfo> BFI;
std::unique_ptr<BranchProbabilityInfo> BPI;
bool HasProfileData = F.getEntryCount().hasValue();
if (HasProfileData) {
LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI));
+ BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
}
- bool Changed =
- runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI));
- // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<LazyValueAnalysis>(F);
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
+ std::move(BPI));
if (!Changed)
return PreservedAnalyses::all();
@@ -161,18 +180,23 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
}
bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- LazyValueInfo *LVI_, bool HasProfileData_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ bool HasProfileData_,
std::unique_ptr<BlockFrequencyInfo> BFI_,
std::unique_ptr<BranchProbabilityInfo> BPI_) {
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
TLI = TLI_;
LVI = LVI_;
+ AA = AA_;
BFI.reset();
BPI.reset();
// When profile data is available, we need to update edge weights after
// successful jump threading, which requires both BPI and BFI being available.
HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
if (HasProfileData) {
BPI = std::move(BPI_);
BFI = std::move(BFI_);
@@ -219,33 +243,22 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
// Can't thread an unconditional jump, but if the block is "almost
// empty", we can replace uses of it with uses of the successor and make
// this dead.
- // We should not eliminate the loop header either, because eliminating
- // a loop header might later prevent LoopSimplify from transforming nested
- // loops into simplified form.
+ // We should not eliminate the loop header or latch either, because
+ // eliminating a loop header or latch might later prevent LoopSimplify
+ // from transforming nested loops into simplified form. We will rely on
+ // later passes in the backend to clean up empty blocks.
if (BI && BI->isUnconditional() &&
BB != &BB->getParent()->getEntryBlock() &&
// If the terminator is the only non-phi instruction, try to nuke it.
- BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) {
- // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
- // block, we have to make sure it isn't in the LoopHeaders set. We
- // reinsert afterward if needed.
- bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
- BasicBlock *Succ = BI->getSuccessor(0);
-
+ BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) &&
+ !LoopHeaders.count(BI->getSuccessor(0))) {
// FIXME: It is always conservatively correct to drop the info
// for a block even if it doesn't get erased. This isn't totally
// awesome, but it allows us to use AssertingVH to prevent nasty
// dangling pointer issues within LazyValueInfo.
LVI->eraseBlock(BB);
- if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+ if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
Changed = true;
- // If we deleted BB and BB was the header of a loop, then the
- // successor is now the header of the loop.
- BB = Succ;
- }
-
- if (ErasedFromLoopHeaders)
- LoopHeaders.insert(BB);
}
}
EverChanged |= Changed;
@@ -255,10 +268,42 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
return EverChanged;
}
-/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it. Stop scanning the block when passing the threshold.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+// Replace uses of Cond with ToVal when safe to do so. If all uses are
+// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
+// because we may incorrectly replace uses when guards/assumes are uses of
+// `Cond` and we used the guards/assumes to reason about the `Cond` value
+// at the end of the block. RAUW unconditionally replaces all uses,
+// including the guards/assumes themselves and the uses before the
+// guard/assume.
+static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) {
+ assert(Cond->getType() == ToVal->getType());
+ auto *BB = Cond->getParent();
+ // We can unconditionally replace all uses in non-local blocks (i.e. uses
+ // strictly dominated by BB), since LVI information is true from the
+ // terminator of BB.
+ replaceNonLocalUsesWith(Cond, ToVal);
+ for (Instruction &I : reverse(*BB)) {
+ // Reached the Cond whose uses we are trying to replace, so there are no
+ // more uses.
+ if (&I == Cond)
+ break;
+ // We only replace uses in instructions that are guaranteed to reach the end
+ // of BB, where we know Cond is ToVal.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ I.replaceUsesOfWith(Cond, ToVal);
+ }
+ if (Cond->use_empty() && !Cond->mayHaveSideEffects())
+ Cond->eraseFromParent();
+}
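+
+// A sketch of the hazard this avoids (hypothetical IR):
+//   %c = icmp eq i32 %x, 0
+//   call void(i1, ...) @llvm.experimental.guard(i1 %c) [ "deopt"() ]
+//   br i1 %c, label %t, label %f
+// LVI knows %c is true at the end of the block only *because* of the guard;
+// blindly RAUWing %c with true would also rewrite the guard's own condition
+// and neuter the deopt check. Walking backwards from the terminator and
+// stopping at the first instruction that may not transfer execution to its
+// successor (such as the guard) replaces only the provably safe suffix of
+// uses.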
+
+/// Return the cost of duplicating a piece of this block, from the first
+/// non-phi instruction up to the StopAt instruction, to thread across it.
+/// Stop scanning the block when exceeding the threshold. If duplication is
+/// impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
/// Ignore PHI nodes, these will be flattened when duplication happens.
BasicBlock::const_iterator I(BB->getFirstNonPHI());
@@ -266,15 +311,17 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// branch, so they shouldn't count against the duplication cost.
unsigned Bonus = 0;
- const TerminatorInst *BBTerm = BB->getTerminator();
- // Threading through a switch statement is particularly profitable. If this
- // block ends in a switch, decrease its cost to make it more likely to happen.
- if (isa<SwitchInst>(BBTerm))
- Bonus = 6;
-
- // The same holds for indirect branches, but slightly more so.
- if (isa<IndirectBrInst>(BBTerm))
- Bonus = 8;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
// Bump the threshold up so the early exit from the loop doesn't skip the
// terminator-based Size adjustment at the end.
@@ -283,7 +330,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
// Sum up the cost of each instruction until we get to the terminator. Don't
// include the terminator because the copy won't include it.
unsigned Size = 0;
- for (; !isa<TerminatorInst>(I); ++I) {
+ for (; &*I != StopAt; ++I) {
// Stop scanning the block if we've reached the threshold.
if (Size > Threshold)
@@ -544,7 +591,12 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// Handle compare with phi operand, where the PHI is defined in this block.
if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
assert(Preference == WantInteger && "Compares only produce integers");
- PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0));
+ Type *CmpType = Cmp->getType();
+ Value *CmpLHS = Cmp->getOperand(0);
+ Value *CmpRHS = Cmp->getOperand(1);
+ CmpInst::Predicate Pred = Cmp->getPredicate();
+
+ PHINode *PN = dyn_cast<PHINode>(CmpLHS);
if (PN && PN->getParent() == BB) {
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
@@ -552,15 +604,15 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
Value *LHS = PN->getIncomingValue(i);
- Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB);
+ Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB);
- Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL);
+ Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
if (!Res) {
if (!isa<Constant>(RHS))
continue;
LazyValueInfo::Tristate
- ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS,
+ ResT = LVI->getPredicateOnEdge(Pred, LHS,
cast<Constant>(RHS), PredBB, BB,
CxtI ? CxtI : Cmp);
if (ResT == LazyValueInfo::Unknown)
@@ -577,44 +629,81 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors(
// If comparing a live-in value against a constant, see if we know the
// live-in value on any predecessors.
- if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) {
- if (!isa<Instruction>(Cmp->getOperand(0)) ||
- cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) {
- Constant *RHSCst = cast<Constant>(Cmp->getOperand(1));
+ if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
+ Constant *CmpConst = cast<Constant>(CmpRHS);
+ if (!isa<Instruction>(CmpLHS) ||
+ cast<Instruction>(CmpLHS)->getParent() != BB) {
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
LazyValueInfo::Tristate Res =
- LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0),
- RHSCst, P, BB, CxtI ? CxtI : Cmp);
+ LVI->getPredicateOnEdge(Pred, CmpLHS,
+ CmpConst, P, BB, CxtI ? CxtI : Cmp);
if (Res == LazyValueInfo::Unknown)
continue;
- Constant *ResC = ConstantInt::get(Cmp->getType(), Res);
+ Constant *ResC = ConstantInt::get(CmpType, Res);
Result.push_back(std::make_pair(ResC, P));
}
return !Result.empty();
}
+ // InstCombine can fold some forms of constant range checks into
+ // (icmp (add (x, C1)), C2). See if we have such a thing with
+ // x as a live-in.
+ {
+ using namespace PatternMatch;
+ Value *AddLHS;
+ ConstantInt *AddConst;
+ if (isa<ConstantInt>(CmpConst) &&
+ match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
+ if (!isa<Instruction>(AddLHS) ||
+ cast<Instruction>(AddLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a ConstantRange in
+ // a predecessor, use that information to try to thread this
+ // block.
+ ConstantRange CR = LVI->getConstantRangeOnEdge(
+ AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
+ // Propagate the range through the addition.
+ CR = CR.add(AddConst->getValue());
+
+ // Get the range where the compare returns true.
+ ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
+ Pred, cast<ConstantInt>(CmpConst)->getValue());
+
+ Constant *ResC;
+ if (CmpRange.contains(CR))
+ ResC = ConstantInt::getTrue(CmpType);
+ else if (CmpRange.inverse().contains(CR))
+ ResC = ConstantInt::getFalse(CmpType);
+ else
+ continue;
+
+ Result.push_back(std::make_pair(ResC, P));
+ }
+
+ return !Result.empty();
+ }
+ }
+ }
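+ // A worked example (hypothetical): if LVI proves %x is in [0, 10) on the
+ // edge from P to BB, and BB tests icmp ult (add %x, 5), 20, the range
+ // propagated through the add is [5, 15), which is contained in the true
+ // region [0, 20), so the compare folds to true for predecessor P.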
+
// Try to find a constant value for the LHS of a comparison,
// and evaluate it statically if we can.
- if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) {
- PredValueInfoTy LHSVals;
- ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
- WantInteger, CxtI);
-
- for (const auto &LHSVal : LHSVals) {
- Constant *V = LHSVal.first;
- Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(),
- V, CmpConst);
- if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.push_back(std::make_pair(KC, LHSVal.second));
- }
+ PredValueInfoTy LHSVals;
+ ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals,
+ WantInteger, CxtI);
- return !Result.empty();
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.push_back(std::make_pair(KC, LHSVal.second));
}
+
+ return !Result.empty();
}
}
@@ -722,6 +811,37 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
LVI->eraseBlock(SinglePred);
MergeBasicBlockIntoOnlyPred(BB);
+ // Now that BB is merged into SinglePred (i.e. SinglePred code followed by
+ // BB code within one basic block `BB`), we need to invalidate the LVI
+ // information associated with BB, because the LVI information need not be
+ // true for all of BB after the merge. For example,
+ // Before the merge, LVI info and code is as follows:
+ // SinglePred: <LVI info1 for %p val>
+ // %y = use of %p
+ // call @exit() // need not transfer execution to successor.
+ // assume(%p) // from this point on %p is true
+ // br label %BB
+ // BB: <LVI info2 for %p val, i.e. %p is true>
+ // %x = use of %p
+ // br label exit
+ //
+ // Note that this LVI info for blocks BB and SinglePred is correct for %p
+ // (info2 and info1 respectively). After the merge and the deletion of the
+ // LVI info1 for SinglePred, we have the following code:
+ // BB: <LVI info2 for %p val>
+ // %y = use of %p
+ // call @exit()
+ // assume(%p)
+ // %x = use of %p <-- LVI info2 is correct from here onwards.
+ // br label exit
+ // LVI info2 for BB is incorrect at the beginning of BB.
+
+ // Invalidate LVI information for BB if the LVI is not provably true for
+ // all of BB.
+ if (any_of(*BB, [](Instruction &I) {
+ return !isGuaranteedToTransferExecutionToSuccessor(&I);
+ }))
+ LVI->eraseBlock(BB);
return true;
}
}
@@ -729,6 +849,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (TryToUnfoldSelectInCurrBB(BB))
return true;
+ // Check if we can propagate guards to predecessors.
+ if (HasGuards && ProcessGuards(BB))
+ return true;
+
// What kind of constant we're looking for.
ConstantPreference Preference = WantInteger;
@@ -804,7 +928,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
return false;
}
-
if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
// If we're branching on a conditional, LVI might be able to determine
// it's value at the branch instruction. We only handle comparisons
@@ -812,7 +935,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// TODO: This should be extended to handle switches as well.
BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
- if (CondBr && CondConst && CondBr->isConditional()) {
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch into
+ // an unconditional one, because it is no longer interesting as far as
+ // jump threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
@@ -824,21 +952,27 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
CondBr->eraseFromParent();
if (CondCmp->use_empty())
CondCmp->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes that have `Cond` as a use. This
+ // is because we use the guards/assumes to reason about the `Cond` value
+ // at the end of the block, but RAUW unconditionally replaces all uses,
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
else if (CondCmp->getParent() == BB) {
- // If the fact we just learned is true for all uses of the
- // condition, replace it with a constant value
auto *CI = Ret == LazyValueInfo::True ?
ConstantInt::getTrue(CondCmp->getType()) :
ConstantInt::getFalse(CondCmp->getType());
- CondCmp->replaceAllUsesWith(CI);
- CondCmp->eraseFromParent();
+ ReplaceFoldableUses(CondCmp, CI);
}
return true;
}
- }
- if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
- return true;
+ // We did not manage to simplify this branch, try to see whether
+ // CondCmp depends on a known phi-select pattern.
+ if (TryToUnfoldSelect(CondCmp, BB))
+ return true;
+ }
}
// Check for some cases that are worth simplifying. Right now we want to look
@@ -857,7 +991,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (SimplifyPartiallyRedundantLoad(LI))
return true;
-
// Handle a variety of cases where we are branching on something derived from
// a PHI node in the current block. If we can prove that any predecessors
// compute a predictable value based on a PHI node, thread those predecessors.
@@ -871,7 +1004,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return ProcessBranchOnPHI(PN);
-
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
@@ -920,6 +1052,14 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
return false;
}
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
/// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
/// load instruction, eliminate it by replacing it with a PHI node. This is an
/// important optimization that encourages jump threading, and needs to be run
@@ -942,18 +1082,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
Value *LoadedPtr = LI->getOperand(0);
- // If the loaded operand is defined in the LoadBB, it can't be available.
- // TODO: Could do simple PHI translation, that would be fun :)
- if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
- if (PtrOp->getParent() == LoadBB)
- return false;
+ // If the loaded operand is defined in the LoadBB and it's not a phi,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
// Scan a few instructions up from the load, to see if it is obviously live at
// the entry to its block.
BasicBlock::iterator BBIt(LI);
bool IsLoadCSE;
- if (Value *AvailableVal =
- FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan, nullptr, &IsLoadCSE)) {
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
// If the value of the load is locally available within the block, just use
// it. This frequently occurs for reg2mem'd allocas.
@@ -997,12 +1136,34 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (!PredsScanned.insert(PredBB).second)
continue;
- // Scan the predecessor to see if the value is available in the pred.
BBIt = PredBB->end();
- Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt,
- DefMaxInstsToScan,
- nullptr,
- &IsLoadCSE);
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE loads that are volatile or anything stronger than
+ // unordered; that should have been checked when we entered the function.
+ assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for an available load/store to the pointer in the predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+ AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
if (!PredAvailable) {
OneUnavailablePred = PredBB;
continue;
@@ -1062,10 +1223,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
if (UnavailablePred) {
assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
"Can't handle critical edge here!");
- LoadInst *NewVal =
- new LoadInst(LoadedPtr, LI->getName() + ".pr", false,
- LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(),
- UnavailablePred->getTerminator());
+ LoadInst *NewVal = new LoadInst(
+ LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
+ LI->getSyncScopeID(), UnavailablePred->getTerminator());
NewVal->setDebugLoc(LI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
@@ -1210,37 +1371,53 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
BasicBlock *OnlyDest = nullptr;
BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+ Constant *OnlyVal = nullptr;
+ Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
+ unsigned PredWithKnownDest = 0;
for (const auto &PredValue : PredValues) {
BasicBlock *Pred = PredValue.second;
if (!SeenPreds.insert(Pred).second)
continue; // Duplicate predecessor entry.
- // If the predecessor ends with an indirect goto, we can't change its
- // destination.
- if (isa<IndirectBrInst>(Pred->getTerminator()))
- continue;
-
Constant *Val = PredValue.first;
BasicBlock *DestBB;
if (isa<UndefValue>(Val))
DestBB = nullptr;
- else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+ else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
- else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
} else {
assert(isa<IndirectBrInst>(BB->getTerminator())
&& "Unexpected terminator");
+ assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
DestBB = cast<BlockAddress>(Val)->getBasicBlock();
}
// If we have exactly one destination, remember it for efficiency below.
- if (PredToDestList.empty())
+ if (PredToDestList.empty()) {
OnlyDest = DestBB;
- else if (OnlyDest != DestBB)
- OnlyDest = MultipleDestSentinel;
+ OnlyVal = Val;
+ } else {
+ if (OnlyDest != DestBB)
+ OnlyDest = MultipleDestSentinel;
+ // It is possible we have the same destination but a different value, e.g.
+ // the default case in a switch instruction.
+ if (Val != OnlyVal)
+ OnlyVal = MultipleVal;
+ }
+
+ // We know where this predecessor is going.
+ ++PredWithKnownDest;
+
+ // If the predecessor ends with an indirect goto, we can't change its
+ // destination.
+ if (isa<IndirectBrInst>(Pred->getTerminator()))
+ continue;
PredToDestList.push_back(std::make_pair(Pred, DestBB));
}
@@ -1249,6 +1426,45 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
if (PredToDestList.empty())
return false;
+ // If all the predecessors go to a single known successor, we want to fold,
+ // not thread. By doing so, we do not need to duplicate the current block
+ // and do not miss opportunities in case we don't/can't duplicate.
+ if (OnlyDest && OnlyDest != MultipleDestSentinel) {
+ if (PredWithKnownDest ==
+ (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+ bool SeenFirstBranchToOnlyDest = false;
+ for (BasicBlock *SuccBB : successors(BB)) {
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest)
+ SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
+ else
+ SuccBB->removePredecessor(BB, true); // This is an unreachable successor.
+ }
+
+ // Finally update the terminator.
+ TerminatorInst *Term = BB->getTerminator();
+ BranchInst::Create(OnlyDest, Term);
+ Term->eraseFromParent();
+
+ // If the condition is now dead due to the removal of the old terminator,
+ // erase it.
+ if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
+ if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
+ CondInst->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes that have `Cond` as a use. This
+ // is because we use the guards/assumes to reason about the `Cond` value
+ // at the end of the block, but RAUW unconditionally replaces all uses,
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (OnlyVal && OnlyVal != MultipleVal &&
+ CondInst->getParent() == BB)
+ ReplaceFoldableUses(CondInst, OnlyVal);
+ }
+ return true;
+ }
+ }
+
// Determine which is the most common successor. If we have many inputs and
// this block is a switch, we want to start by threading the batch that goes
// to the most popular destination first. If we only know about one
@@ -1468,7 +1684,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
return false;
}
- unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (JumpThreadCost > BBDupThreshold) {
DEBUG(dbgs() << " Not threading BB '" << BB->getName()
<< "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1756,7 +1973,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
return false;
}
- unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
if (DuplicationCost > BBDupThreshold) {
DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
<< "' - Cost is too high: " << DuplicationCost << "\n");
@@ -1811,11 +2029,12 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
// If this instruction can be simplified after the operands are updated,
// just use the simplified value instead. This frequently happens due to
// phi translation.
- if (Value *IV =
- SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
+ if (Value *IV = SimplifyInstruction(
+ New,
+ {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
ValueMapping[&*BI] = IV;
if (!New->mayHaveSideEffects()) {
- delete New;
+ New->deleteValue();
New = nullptr;
}
} else {
@@ -1888,10 +2107,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
/// TryToUnfoldSelect - Look for blocks of the form
/// bb1:
/// %a = select
-/// br bb
+/// br bb2
///
/// bb2:
-/// %p = phi [%a, %bb] ...
+/// %p = phi [%a, %bb1] ...
/// %c = icmp %p
/// br i1 %c
///
@@ -1963,11 +2182,19 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
return false;
}
-/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form
+/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
+/// same BB in the form
/// bb:
/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
-/// %s = select p, trueval, falseval
+/// %s = select %p, trueval, falseval
///
+/// or
+///
+/// bb:
+/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
+/// %c = cmp %p, 0
+/// %s = select %c, trueval, falseval
+///
/// And expand the select into a branch structure. This later enables
/// jump-threading over bb in this pass.
///
@@ -1981,43 +2208,180 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
if (LoopHeaders.count(BB))
return false;
- // Look for a Phi/Select pair in the same basic block. The Phi feeds the
- // condition of the Select and at least one of the incoming values is a
- // constant.
for (BasicBlock::iterator BI = BB->begin();
PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
- unsigned NumPHIValues = PN->getNumIncomingValues();
- if (NumPHIValues == 0 || !PN->hasOneUse())
+ // Look for a Phi having at least one constant incoming value.
+ if (llvm::all_of(PN->incoming_values(),
+ [](Value *V) { return !isa<ConstantInt>(V); }))
continue;
- SelectInst *SI = dyn_cast<SelectInst>(PN->user_back());
- if (!SI || SI->getParent() != BB)
- continue;
+ auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
+ // Check if SI is in BB and use V as condition.
+ if (SI->getParent() != BB)
+ return false;
+ Value *Cond = SI->getCondition();
+ return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
+ };
- Value *Cond = SI->getCondition();
- if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1))
+ SelectInst *SI = nullptr;
+ for (Use &U : PN->uses()) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+ // Look for an ICmp in BB that compares PN with a constant and is the
+ // condition of a Select.
+ if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
+ isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
+ if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
+ if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
+ SI = SelectI;
+ break;
+ }
+ } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
+ // Look for a Select in BB that uses PN as its condition.
+ if (isUnfoldCandidate(SelectI, U.get())) {
+ SI = SelectI;
+ break;
+ }
+ }
+ }
+
+ if (!SI)
continue;
+ // Expand the select.
+ TerminatorInst *Term =
+ SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
+ PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+ NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+ NewPN->addIncoming(SI->getFalseValue(), BB);
+ SI->replaceAllUsesWith(NewPN);
+ SI->eraseFromParent();
+ return true;
+ }
+ return false;
+}
- bool HasConst = false;
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- if (PN->getIncomingBlock(i) == BB)
- return false;
- if (isa<ConstantInt>(PN->getIncomingValue(i)))
- HasConst = true;
- }
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+ using namespace PatternMatch;
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
- if (HasConst) {
- // Expand the select.
- TerminatorInst *Term =
- SplitBlockAndInsertIfThen(SI->getCondition(), SI, false);
- PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
- NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
- NewPN->addIncoming(SI->getFalseValue(), BB);
- SI->replaceAllUsesWith(NewPN);
- SI->eraseFromParent();
- return true;
+ // Try to thread one of the guards of the block.
+ // TODO: Look deeper than the immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+ if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, in case the diamond's condition implies the
+/// guard's condition.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl =
+ isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where the implication is not proved.
+ GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, GuardedBlock, AfterGuard, GuardedMapping);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+ Guard, UnguardedMapping);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
}
+ Inst->eraseFromParent();
}
-
- return false;
+ return true;
}
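
For context, the guard threading above boils down to two isImpliedCondition()
queries over the dominating branch's condition. A minimal sketch of that
selection logic (pickSafeSuccessor is a hypothetical helper written for
illustration; it is not part of this patch):

    // Sketch: choose which successor of the dominating branch makes the
    // guard redundant, mirroring the TrueDestIsSafe/FalseDestIsSafe logic.
    #include "llvm/Analysis/ValueTracking.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static BasicBlock *pickSafeSuccessor(BranchInst *BI, Value *GuardCond,
                                         const DataLayout &DL) {
      // BranchCond => GuardCond: the guard always passes on the true edge.
      if (auto Impl = isImpliedCondition(BI->getCondition(), GuardCond, DL))
        if (*Impl)
          return BI->getSuccessor(0);
      // !BranchCond => GuardCond: the guard always passes on the false edge.
      if (auto Impl = isImpliedCondition(BI->getCondition(), GuardCond, DL,
                                         /*InvertAPred=*/true))
        if (*Impl)
          return BI->getSuccessor(1);
      return nullptr; // Neither edge proves the guard; do not thread it.
    }
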
diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
index f51d11c..37b9c4b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -77,10 +77,16 @@ STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+/// Memory promotion is enabled by default.
static cl::opt<bool>
- DisablePromotion("disable-licm-promotion", cl::Hidden,
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
cl::desc("Disable memory promotion in LICM pass"));
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
const LoopSafetyInfo *SafetyInfo);
@@ -201,9 +207,9 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
return PreservedAnalyses::all();
- // FIXME: There is no setPreservesCFG in the new PM. When that becomes
- // available, it should be used here.
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LegacyLICMPass::ID = 0;
@@ -425,6 +431,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
continue;
}
+ // Attempt to remove floating point division out of the loop by converting
+ // it to a reciprocal multiplication.
+ if (I.getOpcode() == Instruction::FDiv &&
+ CurLoop->isLoopInvariant(I.getOperand(1)) &&
+ I.hasAllowReciprocal()) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product = BinaryOperator::CreateFMul(I.getOperand(0),
+ ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ I.eraseFromParent();
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+ Changed = true;
+ continue;
+ }
+
// Try hoisting the instruction out to the preheader. We can only do this
// if all of the operands of the instruction are loop invariant and if it
// is safe to hoist the instruction.
@@ -461,7 +490,10 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow;
// Iterate over loop instructions and compute safety info.
- for (Loop::block_iterator BB = CurLoop->block_begin(),
+ // Skip header as it has been computed and stored in HeaderMayThrow.
+ // The first block in loopinfo.Blocks is guaranteed to be the header.
+ assert(Header == *CurLoop->getBlocks().begin() && "First block must be header");
+ for (Loop::block_iterator BB = std::next(CurLoop->block_begin()),
BBE = CurLoop->block_end();
(BB != BBE) && !SafetyInfo->MayThrow; ++BB)
for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
@@ -477,6 +509,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
}
+// Return true if LI is invariant within the scope of the loop. LI is
+// invariant if CurLoop is dominated by an invariant.start representing the
+// same memory location and size as the memory location LI loads from, and
+// the invariant.start has no uses.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ const uint32_t LocSizeInBits = DL.getTypeSizeInBits(
+ cast<PointerType>(Addr->getType())->getElementType());
+
+ // If the type is i8 addrspace(x)*, we know this is the type of the
+ // llvm.invariant.start operand.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (the invariant.start
+ // operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+ // Avoid traversing a long chain of bitcasts.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all uses of the load operand value, to see if invariant.start is
+ // one of the uses, and whether it dominates the load instruction.
+ for (auto *U : Addr->users()) {
+ // Bail out if the load operand has too many users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ // If there are escaping uses of the invariant.start intrinsic, the load
+ // may be non-invariant.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ !II->use_empty())
+ continue;
+ unsigned InvariantSizeInBits =
+ cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8;
+ // Confirm the invariant.start location size contains the load operand size
+ // in bits. Also, the invariant.start should dominate the load, and we
+ // should not hoist the load out of a loop that contains this dominating
+ // invariant.start.
+ if (LocSizeInBits <= InvariantSizeInBits &&
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo,
@@ -493,6 +578,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (LI->getMetadata(LLVMContext::MD_invariant_load))
return true;
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
// Don't hoist loads which have may-aliased stores in loop.
uint64_t Size = 0;
if (LI->getType()->isSized())
@@ -782,7 +871,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
<< "\n");
ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
- << "hosting " << ore::NV("Inst", &I));
+ << "hoisting " << ore::NV("Inst", &I));
// Metadata can be dependent on conditions we are hoisting above.
// Conservatively strip all metadata on the instruction unless we were
@@ -852,6 +941,7 @@ class LoopPromoter : public LoadAndStorePromoter {
LoopInfo &LI;
DebugLoc DL;
int Alignment;
+ bool UnorderedAtomic;
AAMDNodes AATags;
Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
@@ -875,10 +965,11 @@ public:
SmallVectorImpl<BasicBlock *> &LEB,
SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
- const AAMDNodes &AATags)
+ bool UnorderedAtomic, const AAMDNodes &AATags)
: LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
- LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {}
+ LI(li), DL(std::move(dl)), Alignment(alignment),
+ UnorderedAtomic(UnorderedAtomic), AATags(AATags) {}
bool isInstInList(Instruction *I,
const SmallVectorImpl<Instruction *> &) const override {
@@ -902,6 +993,8 @@ public:
Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
Instruction *InsertPos = LoopInsertPts[i];
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
if (AATags)
@@ -992,18 +1085,41 @@ bool llvm::promoteLoopAccessesToScalars(
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
unsigned Alignment = 1;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
AAMDNodes AATags;
const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+ // Do we know this object does not escape?
+ bool IsKnownNonEscapingObject = false;
if (SafetyInfo->MayThrow) {
// If a loop can throw, we have to insert a store along each unwind edge.
// That said, we can't actually make the unwind edge explicit. Therefore,
// we have to prove that the store is dead along the unwind edge.
//
- // Currently, this code just special-cases alloca instructions.
- if (!isa<AllocaInst>(GetUnderlyingObject(SomePtr, MDL)))
- return false;
+ // If the underlying object is neither an alloca nor a pointer that does
+ // not escape, then we can not effectively prove that the store is dead
+ // along the unwind edge, i.e. the caller of this function could have
+ // ways to access the pointed-to object.
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ // If this is a base pointer we do not understand, simply bail.
+ // We only handle alloca and return value from alloc-like fn right now.
+ if (!isa<AllocaInst>(Object)) {
+ if (!isAllocLikeFn(Object, TLI))
+ return false;
+ // If this is an alloc-like function, there are more constraints we
+ // need to verify; more specifically, we must make sure that the
+ // pointer can not escape.
+ //
+ // NOTE: PointerMayBeCaptured is not enough as the pointer may have
+ // escaped even though it is not captured by the enclosing function.
+ // Standard allocation functions like malloc, calloc, and operator new
+ // return values which can be assumed not to have previously escaped.
+ if (PointerMayBeCaptured(Object, true, true))
+ return false;
+ IsKnownNonEscapingObject = true;
+ }
}
// Check that all of the pointers in the alias set have the same type. We
@@ -1029,8 +1145,11 @@ bool llvm::promoteLoopAccessesToScalars(
// it.
if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
assert(!Load->isVolatile() && "AST broken");
- if (!Load->isSimple())
+ if (!Load->isUnordered())
return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
if (!DereferenceableInPH)
DereferenceableInPH = isSafeToExecuteUnconditionally(
@@ -1041,9 +1160,12 @@ bool llvm::promoteLoopAccessesToScalars(
if (UI->getOperand(1) != ASIV)
continue;
assert(!Store->isVolatile() && "AST broken");
- if (!Store->isSimple())
+ if (!Store->isUnordered())
return false;
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
// If the store is guaranteed to execute, both properties are satisfied.
// We may want to check if a store is guaranteed to execute even if we
// already know that promotion is safe, since it may have higher
@@ -1096,6 +1218,12 @@ bool llvm::promoteLoopAccessesToScalars(
}
}
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+ // might not be able to lower the result. We can't downgrade since that
+ // would violate the memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
// If we couldn't prove we can hoist the load, bail.
if (!DereferenceableInPH)
@@ -1106,10 +1234,15 @@ bool llvm::promoteLoopAccessesToScalars(
// stores along paths which originally didn't have them without violating the
// memory model.
if (!SafeToInsertStore) {
- Value *Object = GetUnderlyingObject(SomePtr, MDL);
- SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ // If this is a known non-escaping object, it is safe to insert the stores.
+ if (IsKnownNonEscapingObject)
+ SafeToInsertStore = true;
+ else {
+ Value *Object = GetUnderlyingObject(SomePtr, MDL);
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
!PointerMayBeCaptured(Object, true, true);
+ }
}
// If we've still failed to prove we can sink the store, give up.
@@ -1134,12 +1267,15 @@ bool llvm::promoteLoopAccessesToScalars(
SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
- InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
+ InsertPts, PIC, *CurAST, *LI, DL, Alignment,
+ SawUnorderedAtomic, AATags);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
LoadInst *PreheaderLoad = new LoadInst(
SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
PreheaderLoad->setAlignment(Alignment);
PreheaderLoad->setDebugLoc(DL);
if (AATags)
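
The FDiv hunk in hoistRegion() is reciprocal strength reduction: a
loop-invariant divisor becomes one hoisted divide plus a multiply per
iteration. A source-level sketch of the intended effect (illustrative names;
requires the allow-reciprocal fast-math flag on the division):

    // Before: a[i] = a[i] / d executes one fdiv per iteration.
    // After:  the reciprocal is computed once and hoisted; the loop body
    //         uses an fmul, which is typically much cheaper than an fdiv.
    void scale(float *a, int n, float d) {
      float r = 1.0f / d;          // hoisted reciprocal (the new FDiv)
      for (int i = 0; i < n; ++i)
        a[i] = a[i] * r;           // FMul replaces the per-iteration FDiv
    }
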
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
deleted file mode 100644
index 389f1c5..0000000
--- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This transformation combines adjacent loads.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "load-combine"
-
-STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining");
-STATISTIC(NumLoadsCombined, "Number of loads combined");
-
-#define LDCOMBINE_NAME "Combine Adjacent Loads"
-
-namespace {
-struct PointerOffsetPair {
- Value *Pointer;
- APInt Offset;
-};
-
-struct LoadPOPPair {
- LoadInst *Load;
- PointerOffsetPair POP;
- /// \brief The new load needs to be created before the first load in IR order.
- unsigned InsertOrder;
-};
-
-class LoadCombine : public BasicBlockPass {
- LLVMContext *C;
- AliasAnalysis *AA;
-
-public:
- LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) {
- initializeLoadCombinePass(*PassRegistry::getPassRegistry());
- }
-
- using llvm::Pass::doInitialization;
- bool doInitialization(Function &) override;
- bool runOnBasicBlock(BasicBlock &BB) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
- StringRef getPassName() const override { return LDCOMBINE_NAME; }
- static char ID;
-
- typedef IRBuilder<TargetFolder> BuilderTy;
-
-private:
- BuilderTy *Builder;
-
- PointerOffsetPair getPointerOffsetPair(LoadInst &);
- bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &);
- bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &);
- bool combineLoads(SmallVectorImpl<LoadPOPPair> &);
-};
-}
-
-bool LoadCombine::doInitialization(Function &F) {
- DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n");
- C = &F.getContext();
- return true;
-}
-
-PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) {
- auto &DL = LI.getModule()->getDataLayout();
-
- PointerOffsetPair POP;
- POP.Pointer = LI.getPointerOperand();
- unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace());
- POP.Offset = APInt(BitWidth, 0);
-
- while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) {
- APInt LastOffset = POP.Offset;
- if (!GEP->accumulateConstantOffset(DL, POP.Offset)) {
- // Can't handle GEPs with variable indices.
- POP.Offset = LastOffset;
- return POP;
- }
- POP.Pointer = GEP->getPointerOperand();
- } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) {
- POP.Pointer = BC->getOperand(0);
- }
- }
- return POP;
-}
-
-bool LoadCombine::combineLoads(
- DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) {
- bool Combined = false;
- for (auto &Loads : LoadMap) {
- if (Loads.second.size() < 2)
- continue;
- std::sort(Loads.second.begin(), Loads.second.end(),
- [](const LoadPOPPair &A, const LoadPOPPair &B) {
- return A.POP.Offset.slt(B.POP.Offset);
- });
- if (aggregateLoads(Loads.second))
- Combined = true;
- }
- return Combined;
-}
-
-/// \brief Try to aggregate loads from a sorted list of loads to be combined.
-///
-/// It is guaranteed that no writes occur between any of the loads. All loads
-/// have the same base pointer. There are at least two loads.
-bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
- assert(Loads.size() >= 2 && "Insufficient loads!");
- LoadInst *BaseLoad = nullptr;
- SmallVector<LoadPOPPair, 8> AggregateLoads;
- bool Combined = false;
- bool ValidPrevOffset = false;
- APInt PrevOffset;
- uint64_t PrevSize = 0;
- for (auto &L : Loads) {
- if (ValidPrevOffset == false) {
- BaseLoad = L.Load;
- PrevOffset = L.POP.Offset;
- PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
- L.Load->getType());
- AggregateLoads.push_back(L);
- ValidPrevOffset = true;
- continue;
- }
- if (L.Load->getAlignment() > BaseLoad->getAlignment())
- continue;
- APInt PrevEnd = PrevOffset + PrevSize;
- if (L.POP.Offset.sgt(PrevEnd)) {
- // No other load will be combinable
- if (combineLoads(AggregateLoads))
- Combined = true;
- AggregateLoads.clear();
- ValidPrevOffset = false;
- continue;
- }
- if (L.POP.Offset != PrevEnd)
- // This load is offset less than the size of the last load.
- // FIXME: We may want to handle this case.
- continue;
- PrevOffset = L.POP.Offset;
- PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize(
- L.Load->getType());
- AggregateLoads.push_back(L);
- }
- if (combineLoads(AggregateLoads))
- Combined = true;
- return Combined;
-}
-
-/// \brief Given a list of combinable load. Combine the maximum number of them.
-bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) {
- // Remove loads from the end while the size is not a power of 2.
- unsigned TotalSize = 0;
- for (const auto &L : Loads)
- TotalSize += L.Load->getType()->getPrimitiveSizeInBits();
- while (TotalSize != 0 && !isPowerOf2_32(TotalSize))
- TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits();
- if (Loads.size() < 2)
- return false;
-
- DEBUG({
- dbgs() << "***** Combining Loads ******\n";
- for (const auto &L : Loads) {
- dbgs() << L.POP.Offset << ": " << *L.Load << "\n";
- }
- });
-
- // Find first load. This is where we put the new load.
- LoadPOPPair FirstLP;
- FirstLP.InsertOrder = -1u;
- for (const auto &L : Loads)
- if (L.InsertOrder < FirstLP.InsertOrder)
- FirstLP = L;
-
- unsigned AddressSpace =
- FirstLP.POP.Pointer->getType()->getPointerAddressSpace();
-
- Builder->SetInsertPoint(FirstLP.Load);
- Value *Ptr = Builder->CreateConstGEP1_64(
- Builder->CreatePointerCast(Loads[0].POP.Pointer,
- Builder->getInt8PtrTy(AddressSpace)),
- Loads[0].POP.Offset.getSExtValue());
- LoadInst *NewLoad = new LoadInst(
- Builder->CreatePointerCast(
- Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize),
- Ptr->getType()->getPointerAddressSpace())),
- Twine(Loads[0].Load->getName()) + ".combined", false,
- Loads[0].Load->getAlignment(), FirstLP.Load);
-
- for (const auto &L : Loads) {
- Builder->SetInsertPoint(L.Load);
- Value *V = Builder->CreateExtractInteger(
- L.Load->getModule()->getDataLayout(), NewLoad,
- cast<IntegerType>(L.Load->getType()),
- (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract");
- L.Load->replaceAllUsesWith(V);
- }
-
- NumLoadsCombined = NumLoadsCombined + Loads.size();
- return true;
-}
-
-bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
- if (skipBasicBlock(BB))
- return false;
-
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-
- IRBuilder<TargetFolder> TheBuilder(
- BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
- Builder = &TheBuilder;
-
- DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap;
- AliasSetTracker AST(*AA);
-
- bool Combined = false;
- unsigned Index = 0;
- for (auto &I : BB) {
- if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
- if (combineLoads(LoadMap))
- Combined = true;
- LoadMap.clear();
- AST.clear();
- continue;
- }
- LoadInst *LI = dyn_cast<LoadInst>(&I);
- if (!LI)
- continue;
- ++NumLoadsAnalyzed;
- if (!LI->isSimple() || !LI->getType()->isIntegerTy())
- continue;
- auto POP = getPointerOffsetPair(*LI);
- if (!POP.Pointer)
- continue;
- LoadMap[POP.Pointer].push_back({LI, std::move(POP), Index++});
- AST.add(LI);
- }
- if (combineLoads(LoadMap))
- Combined = true;
- return Combined;
-}
-
-char LoadCombine::ID = 0;
-
-BasicBlockPass *llvm::createLoadCombinePass() {
- return new LoadCombine();
-}
-
-INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index cca75a3..ac4dd44 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -29,32 +30,45 @@ using namespace llvm;
STATISTIC(NumDeleted, "Number of loops deleted");
-/// isLoopDead - Determined if a loop is dead. This assumes that we've already
-/// checked for unique exit and exiting blocks, and that the code is in LCSSA
-/// form.
-bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
- SmallVectorImpl<BasicBlock *> &exitingBlocks,
- SmallVectorImpl<BasicBlock *> &exitBlocks,
- bool &Changed, BasicBlock *Preheader) {
- BasicBlock *exitBlock = exitBlocks[0];
-
+/// This function deletes dead loops. The caller of this function needs to
+/// guarantee that the loop is in fact dead. Here we handle two kinds of dead
+/// loops. The first kind (\p isLoopDead) is where only invariant values from
+/// within the loop are used outside of it. The second kind (\p
+/// isLoopNeverExecuted) is where the loop is provably never executed. We can
+/// always remove never-executed loops since they will not cause any difference
+/// to program behavior.
+///
+/// This also updates the relevant analysis information in \p DT, \p SE, and \p
+/// LI. It also updates the loop PM if an updater struct is provided.
+// TODO: This function will be used by loop-simplifyCFG as well. So, move this
+// to LoopUtils.cpp
+static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr);
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
// Make sure that all PHI entries coming from the loop are loop invariant.
// Because the code is in LCSSA form, any values used outside of the loop
// must pass through a PHI in the exit block, meaning that this check is
// sufficient to guarantee that no loop-variant values are used outside
// of the loop.
- BasicBlock::iterator BI = exitBlock->begin();
+ BasicBlock::iterator BI = ExitBlock->begin();
bool AllEntriesInvariant = true;
bool AllOutgoingValuesSame = true;
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]);
+ Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
// Make sure all exiting blocks produce the same incoming value for the exit
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value should
// be used.
AllOutgoingValuesSame =
- all_of(makeArrayRef(exitingBlocks).slice(1), [&](BasicBlock *BB) {
+ all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
return incoming == P->getIncomingValueForBlock(BB);
});
@@ -78,95 +92,187 @@ bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE,
// Make sure that no instructions in the block have potential side-effects.
// This includes instructions that could write to memory, and loads that are
- // marked volatile. This could be made more aggressive by using aliasing
- // information to identify readonly and readnone calls.
- for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
- LI != LE; ++LI) {
- for (Instruction &I : **LI) {
- if (I.mayHaveSideEffects())
- return false;
- }
- }
-
+ // marked volatile.
+ for (auto &I : L->blocks())
+ if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return false;
return true;
}
-/// Remove dead loops, by which we mean loops that do not impact the observable
-/// behavior of the program other than finite running time. Note we do ensure
-/// that this never remove a loop that might be infinite, as doing so could
-/// change the halting/non-halting nature of a program. NOTE: This entire
-/// process relies pretty heavily on LoopSimplify and LCSSA in order to make
-/// various safety checks work.
-bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
- LoopInfo &loopInfo) {
- assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+/// This function returns true if there is no viable path from the
+/// entry block to the header of \p L. Right now, it only does
+/// a local search to save compile time.
+static bool isLoopNeverExecuted(Loop *L) {
+ using namespace PatternMatch;
- // We can only remove the loop if there is a preheader that we can
- // branch from after removing it.
- BasicBlock *preheader = L->getLoopPreheader();
- if (!preheader)
- return false;
+ auto *Preheader = L->getLoopPreheader();
+ // TODO: We can relax this constraint, since we just need a loop
+ // predecessor.
+ assert(Preheader && "Needs preheader!");
- // If LoopSimplify form is not available, stay out of trouble.
- if (!L->hasDedicatedExits())
+ if (Preheader == &Preheader->getParent()->getEntryBlock())
return false;
+ // All predecessors of the preheader should have a constant conditional
+ // branch, with the loop's preheader as not-taken.
+ for (auto *Pred: predecessors(Preheader)) {
+ BasicBlock *Taken, *NotTaken;
+ ConstantInt *Cond;
+ if (!match(Pred->getTerminator(),
+ m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
+ return false;
+ if (!Cond->getZExtValue())
+ std::swap(Taken, NotTaken);
+ if (Taken == Preheader)
+ return false;
+ }
+ assert(!pred_empty(Preheader) &&
+ "Preheader should have predecessors at this point!");
+ // All the predecessors have the loop preheader as not-taken target.
+ return true;
+}
+/// Remove a loop if it is dead.
+///
+/// A loop is considered dead if it does not impact the observable behavior of
+/// the program other than finite running time. This never removes a loop that
+/// might be infinite (unless it is never executed), as doing so could change
+/// the halting/non-halting nature of a program.
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater = nullptr) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+
+ // We can only remove the loop if there is a preheader that we can branch from
+ // after removing it. Also, if LoopSimplify form is not available, stay out
+ // of trouble.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader || !L->hasDedicatedExits()) {
+ DEBUG(dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
+ return false;
+ }
// We can't remove loops that contain subloops. If the subloops were dead,
// they would already have been removed in earlier executions of this pass.
- if (L->begin() != L->end())
+ if (L->begin() != L->end()) {
+ DEBUG(dbgs() << "Loop contains subloops.\n");
return false;
+ }
- SmallVector<BasicBlock *, 4> exitingBlocks;
- L->getExitingBlocks(exitingBlocks);
- SmallVector<BasicBlock *, 4> exitBlocks;
- L->getUniqueExitBlocks(exitBlocks);
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+
+ if (ExitBlock && isLoopNeverExecuted(L)) {
+ DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+ // Set incoming value to undef for phi nodes in the exit block.
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ for (unsigned i = 0; i < P->getNumIncomingValues(); i++)
+ P->setIncomingValue(i, UndefValue::get(P->getType()));
+ BI++;
+ }
+ deleteDeadLoop(L, DT, SE, LI, Updater);
+ ++NumDeleted;
+ return true;
+ }
+
+ // The remaining checks below are for a loop being dead because all statements
+ // in the loop are invariant.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
// We require that the loop only have a single exit block. Otherwise, we'd
// be in the situation of needing to be able to solve statically which exit
// block will be branched to, or trying to preserve the branching logic in
// a loop invariant manner.
- if (exitBlocks.size() != 1)
+ if (!ExitBlock) {
+ DEBUG(dbgs() << "Deletion requires single exit block\n");
return false;
-
+ }
// Finally, we have to check that the loop really is dead.
bool Changed = false;
- if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader))
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
+ DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
return Changed;
+ }
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
const SCEV *S = SE.getMaxBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(S))
+ if (isa<SCEVCouldNotCompute>(S)) {
+ DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n");
return Changed;
+ }
+
+ DEBUG(dbgs() << "Loop is invariant, delete it!");
+ deleteDeadLoop(L, DT, SE, LI, Updater);
+ ++NumDeleted;
+
+ return true;
+}
+
+static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
+ LoopInfo &LI, LPMUpdater *Updater) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+ auto *Preheader = L->getLoopPreheader();
+ assert(Preheader && "Preheader should exist!");
// Now that we know the removal is safe, remove the loop by changing the
// branch from the preheader to go to the single exit block.
- BasicBlock *exitBlock = exitBlocks[0];
-
+ //
// Because we're deleting a large chunk of code at once, the sequence in which
- // we remove things is very important to avoid invalidation issues. Don't
- // mess with this unless you have good reason and know what you're doing.
+ // we remove things is very important to avoid invalidation issues.
+
+ // If we have an LPM updater, tell it about the loop being removed.
+ if (Updater)
+ Updater->markLoopAsDeleted(*L);
// Tell ScalarEvolution that the loop is deleted. Do this before
// deleting the loop so that ScalarEvolution can look at the loop
// to determine what it needs to clean up.
SE.forgetLoop(L);
- // Connect the preheader directly to the exit block.
- TerminatorInst *TI = preheader->getTerminator();
- TI->replaceUsesOfWith(L->getHeader(), exitBlock);
+ auto *ExitBlock = L->getUniqueExitBlock();
+ assert(ExitBlock && "Should have a unique exit block!");
- // Rewrite phis in the exit block to get their inputs from
- // the preheader instead of the exiting block.
- BasicBlock *exitingBlock = exitingBlocks[0];
- BasicBlock::iterator BI = exitBlock->begin();
+ assert(L->hasDedicatedExits() && "Loop should have dedicated exits!");
+
+ // Connect the preheader directly to the exit block.
+ // Even when the loop is never executed, we cannot remove the edge from the
+ // source block to the exit block. Consider the case where the unexecuted loop
+ // branches back to an outer loop. If we deleted the loop and removed the edge
+ // coming to this inner loop, this will break the outer loop structure (by
+ // deleting the backedge of the outer loop). If the outer loop is indeed a
+ // non-loop, it will be deleted in a future iteration of the loop deletion
+ // pass.
+ Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock);
+
+ // Rewrite phis in the exit block to get their inputs from the Preheader
+ // instead of the exiting block.
+ BasicBlock::iterator BI = ExitBlock->begin();
while (PHINode *P = dyn_cast<PHINode>(BI)) {
- int j = P->getBasicBlockIndex(exitingBlock);
- assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
- P->setIncomingBlock(j, preheader);
- for (unsigned i = 1; i < exitingBlocks.size(); ++i)
- P->removeIncomingValue(exitingBlocks[i]);
+ // Set the zero'th element of Phi to be from the preheader and remove all
+ // other incoming values. Given the loop has dedicated exits, all other
+ // incoming values must be from the exiting blocks.
+ int PredIndex = 0;
+ P->setIncomingBlock(PredIndex, Preheader);
+ // Removes all incoming values from all other exiting blocks (including
+ // duplicate values from an exiting block).
+ // Nuke all entries except the zero'th entry which is the preheader entry.
+ // NOTE! We need to remove Incoming Values in the reverse order as done
+ // below, to keep the indices valid for deletion (removeIncomingValues
+ // updates getNumIncomingValues and shifts all values down into the operand
+ // being deleted).
+ for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i)
+ P->removeIncomingValue(e-i, false);
+
+ assert((P->getNumIncomingValues() == 1 &&
+ P->getIncomingBlock(PredIndex) == Preheader) &&
+ "Should have exactly one value and that's from the preheader!");
++BI;
}
@@ -175,11 +281,11 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallVector<DomTreeNode*, 8> ChildNodes;
for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
LI != LE; ++LI) {
- // Move all of the block's children to be children of the preheader, which
+ // Move all of the block's children to be children of the Preheader, which
// allows us to remove the domtree entry for the block.
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
for (DomTreeNode *ChildNode : ChildNodes) {
- DT.changeImmediateDominator(ChildNode, DT[preheader]);
+ DT.changeImmediateDominator(ChildNode, DT[Preheader]);
}
ChildNodes.clear();
@@ -204,22 +310,19 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
SmallPtrSet<BasicBlock *, 8> blocks;
blocks.insert(L->block_begin(), L->block_end());
for (BasicBlock *BB : blocks)
- loopInfo.removeBlock(BB);
+ LI.removeBlock(BB);
// The last step is to update LoopInfo now that we've eliminated this loop.
- loopInfo.markAsRemoved(L);
- Changed = true;
-
- ++NumDeleted;
-
- return Changed;
+ LI.markAsRemoved(L);
}
PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI);
- if (!Changed)
+ LPMUpdater &Updater) {
+
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L.dump());
+ if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
@@ -254,11 +357,11 @@ Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
if (skipLoop(L))
return false;
-
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LoopDeletionPass Impl;
- return Impl.runImpl(L, DT, SE, loopInfo);
+ DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ DEBUG(L->dump());
+ return deleteLoopIfDead(L, DT, SE, LI);
}
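
A source-level example of a loop the new isLoopNeverExecuted() path can
delete, assuming the constant branch survives to this pass (illustrative
only):

    // Every predecessor of the loop preheader branches away from it on a
    // constant condition, so the loop provably never runs and can be
    // deleted even though its trip count cannot be computed.
    void clear_if_never(int *p, int n) {
      if (0) {                        // constant-false guard
        for (int i = 0; i < n; ++i)   // never executed
          p[i] = 0;
      }
    }
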
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 19716b2..3624bba 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -812,29 +812,29 @@ private:
const RuntimePointerChecking *RtPtrChecking) {
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
return Checks;
}
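
The LoopDistribute change is mechanical: it adopts LLVM's range-based copy_if
wrapper from STLExtras.h, which forwards to std::copy_if over the whole
container. A minimal sketch of the idiom:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"
    #include <iterator>

    // The begin()/end() pair disappears at the call site; the predicate
    // is unchanged.
    static llvm::SmallVector<int, 8>
    keepEven(const llvm::SmallVector<int, 8> &In) {
      llvm::SmallVector<int, 8> Out;
      llvm::copy_if(In, std::back_inserter(Out),
                    [](int V) { return (V & 1) == 0; });
      return Out;
    }
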
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 5fec51c..4a6a35c 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -110,6 +110,16 @@ private:
bool HasMemset;
bool HasMemsetPattern;
bool HasMemcpy;
+ /// Return code for isLegalStore()
+ enum LegalStoreKind {
+ None = 0,
+ Memset,
+ MemsetPattern,
+ Memcpy,
+ UnorderedAtomicMemcpy,
+ DontUse // Dummy retval never to be used. Allows catching errors in retval
+ // handling.
+ };
/// \name Countable Loop Idiom Handling
/// @{
@@ -119,8 +129,7 @@ private:
SmallVectorImpl<BasicBlock *> &ExitBlocks);
void collectStores(BasicBlock *BB);
- bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern,
- bool &ForMemcpy);
+ LegalStoreKind isLegalStore(StoreInst *SI);
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
bool ForMemset);
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
@@ -144,6 +153,10 @@ private:
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertCTLZ();
+ void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var, const DebugLoc DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
/// @}
};
@@ -236,9 +249,9 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
ApplyCodeSizeHeuristics =
L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs;
- HasMemset = TLI->has(LibFunc::memset);
- HasMemsetPattern = TLI->has(LibFunc::memset_pattern16);
- HasMemcpy = TLI->has(LibFunc::memcpy);
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
if (HasMemset || HasMemsetPattern || HasMemcpy)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
@@ -339,15 +352,24 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}
-bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
- bool &ForMemsetPattern, bool &ForMemcpy) {
+LoopIdiomRecognize::LegalStoreKind
+LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+
// Don't touch volatile stores.
- if (!SI->isSimple())
- return false;
+ if (SI->isVolatile())
+ return LegalStoreKind::None;
+ // We only want simple or unordered-atomic stores.
+ if (!SI->isUnordered())
+ return LegalStoreKind::None;
+
+ // Don't convert stores of non-integral pointer types to memsets (which
+ // store integers).
+ if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
+ return LegalStoreKind::None;
// Avoid merging nontemporal stores.
if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return false;
+ return LegalStoreKind::None;
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
@@ -355,7 +377,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
// Reject stores that are so large that they overflow an unsigned.
uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
- return false;
+ return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided store. If we have something else, it's a
@@ -363,11 +385,11 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
const SCEVAddRecExpr *StoreEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
- return false;
+ return LegalStoreKind::None;
// Check to see if we have a constant stride.
if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
- return false;
+ return LegalStoreKind::None;
// See if the store can be turned into a memset.
@@ -378,22 +400,23 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
Value *SplatValue = isBytewiseValue(StoredVal);
Constant *PatternValue = nullptr;
+ // Note: memset and memset_pattern on unordered-atomic are not yet supported
+ bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
+
// If we're allowed to form a memset, and the stored value would be
// acceptable for memset, use it.
- if (HasMemset && SplatValue &&
+ if (!UnorderedAtomic && HasMemset && SplatValue &&
// Verify that the stored value is loop invariant. If not, we can't
// promote the memset.
CurLoop->isLoopInvariant(SplatValue)) {
// It looks like we can use SplatValue.
- ForMemset = true;
- return true;
- } else if (HasMemsetPattern &&
+ return LegalStoreKind::Memset;
+ } else if (!UnorderedAtomic && HasMemsetPattern &&
// Don't create memset_pattern16s with address spaces.
StorePtr->getType()->getPointerAddressSpace() == 0 &&
(PatternValue = getMemSetPatternValue(StoredVal, DL))) {
// It looks like we can use PatternValue!
- ForMemsetPattern = true;
- return true;
+ return LegalStoreKind::MemsetPattern;
}
// Otherwise, see if the store can be turned into a memcpy.
@@ -403,12 +426,17 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
APInt Stride = getStoreStride(StoreEv);
unsigned StoreSize = getStoreSizeInBytes(SI, DL);
if (StoreSize != Stride && StoreSize != -Stride)
- return false;
+ return LegalStoreKind::None;
// The store must be feeding a non-volatile load.
LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
- if (!LI || !LI->isSimple())
- return false;
+
+ // Only allow non-volatile loads
+ if (!LI || LI->isVolatile())
+ return LegalStoreKind::None;
+ // Only allow simple or unordered-atomic loads
+ if (!LI->isUnordered())
+ return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided load. If we have something else, it's a
@@ -416,18 +444,19 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset,
const SCEVAddRecExpr *LoadEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
- return false;
+ return LegalStoreKind::None;
// The store and load must share the same stride.
if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
- return false;
+ return LegalStoreKind::None;
// Success. This store can be converted into a memcpy.
- ForMemcpy = true;
- return true;
+ UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
+ return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
+ : LegalStoreKind::Memcpy;
}
// This store can't be transformed into a memset/memcpy.
- return false;
+ return LegalStoreKind::None;
}
void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
@@ -439,24 +468,29 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
if (!SI)
continue;
- bool ForMemset = false;
- bool ForMemsetPattern = false;
- bool ForMemcpy = false;
// Make sure this is a strided store with a constant stride.
- if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy))
- continue;
-
- // Save the store locations.
- if (ForMemset) {
+ switch (isLegalStore(SI)) {
+ case LegalStoreKind::None:
+ // Nothing to do
+ break;
+ case LegalStoreKind::Memset: {
// Find the base pointer.
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
StoreRefsForMemset[Ptr].push_back(SI);
- } else if (ForMemsetPattern) {
+ } break;
+ case LegalStoreKind::MemsetPattern: {
// Find the base pointer.
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
StoreRefsForMemsetPattern[Ptr].push_back(SI);
- } else if (ForMemcpy)
+ } break;
+ case LegalStoreKind::Memcpy:
+ case LegalStoreKind::UnorderedAtomicMemcpy:
StoreRefsForMemcpy.push_back(SI);
+ break;
+ default:
+ assert(false && "unhandled return value");
+ break;
+ }
}
}
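
The isLegalStore() refactor above replaces three by-reference bool
out-parameters with a single enum return that collectStores() can switch
over. A stripped-down sketch of the shape (names shortened; not the actual
pass code):

    // Each store is classified exactly once; covering every enumerator in
    // the switch lets the compiler warn about a forgotten bucket, which the
    // old tri-bool interface could not do.
    enum class StoreKind { None, Memset, MemsetPattern, Memcpy, AtomicMemcpy };

    static void collectOne(StoreKind K, unsigned Counts[4]) {
      switch (K) {
      case StoreKind::None:                       break;
      case StoreKind::Memset:        ++Counts[0]; break;
      case StoreKind::MemsetPattern: ++Counts[1]; break;
      case StoreKind::Memcpy:        ++Counts[2]; break;
      case StoreKind::AtomicMemcpy:  ++Counts[3]; break;
      }
    }
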
@@ -494,7 +528,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(
Instruction *Inst = &*I++;
// Look for memset instructions, which may be optimized to a larger memset.
if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
- WeakVH InstPtr(&*I);
+ WeakTrackingVH InstPtr(&*I);
if (!processLoopMemSet(MSI, BECount))
continue;
MadeChange = true;
@@ -778,6 +812,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
if (NegStride)
Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(Start, *SE))
+ return false;
+
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
// this into a memset in the loop preheader now if we want. However, this
// would be unsafe to do if there is anything else in the loop that may read
@@ -809,6 +848,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
SCEV::FlagNUW);
}
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(NumBytesS, *SE))
+ return false;
+
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
@@ -823,7 +867,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Module *M = TheStore->getModule();
Value *MSP =
M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+ Int8PtrTy, Int8PtrTy, IntPtr);
inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
@@ -851,10 +895,10 @@ bool LoopIdiomRecognize::processLoopStridedStore(
/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
-/// for (i) A[i] = B[i];
+/// for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const SCEV *BECount) {
- assert(SI->isSimple() && "Expected only non-volatile stores.");
+ assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
Value *StorePtr = SI->getPointerOperand();
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
@@ -864,7 +908,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// The store must be fed by a non-volatile load.
LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
- assert(LI->isSimple() && "Expected only non-volatile stores.");
+ assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided load. If we have something else, it's a
@@ -938,6 +982,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
const SCEV *NumBytesS =
SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW);
+
if (StoreSize != 1)
NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
SCEV::FlagNUW);
@@ -945,9 +990,37 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
- CallInst *NewCall =
- Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
- std::min(SI->getAlignment(), LI->getAlignment()));
+ unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
+ CallInst *NewCall = nullptr;
+ // Check whether to generate an unordered atomic memcpy:
+ // If the load or store is atomic, then it must necessarily be unordered
+ // by previous checks.
+ if (!SI->isAtomic() && !LI->isAtomic())
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align);
+ else {
+ // We cannot allow unaligned ops for unordered load/store, so reject
+ // anything where the alignment isn't at least the element size.
+ if (Align < StoreSize)
+ return false;
+
+ // If the element-wise unordered atomic memcpy is not lowered into explicit
+ // loads/stores later, then it will be lowered into an element-size-specific
+ // lib call. If the lib call doesn't exist for our store size, then we
+ // shouldn't generate the memcpy.
+ if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
+ return false;
+
+ NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
+ StoreBasePtr, LoadBasePtr, NumBytes, StoreSize);
+
+ // Propagate alignment info onto the pointer args. Note that unordered
+ // atomic loads/stores are *required* by the spec to have an alignment
+ // but non-atomic loads/stores need not.
+ NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(),
+ SI->getAlignment()));
+ NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(),
+ LI->getAlignment()));
+ }
NewCall->setDebugLoc(SI->getDebugLoc());
DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
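
The branch structure above encodes three requirements before an element-wise unordered atomic memcpy may be formed. A compact restatement of that decision as a sketch, with plain integers standing in for the IR queries; classifyMemCpy and its parameters are illustrative, not LLVM API:

    enum class CpyKind { Plain, ElementAtomic, None };

    // Non-atomic accesses take the ordinary memcpy path; atomic ones
    // additionally need element-size alignment and an element-size lib call
    // (cf. TTI->getAtomicMemIntrinsicMaxElementSize() in the hunk above).
    CpyKind classifyMemCpy(bool StoreAtomic, bool LoadAtomic, unsigned Align,
                           unsigned StoreSize, unsigned MaxElementSize) {
      if (!StoreAtomic && !LoadAtomic)
        return CpyKind::Plain;
      if (Align < StoreSize)          // unordered atomic ops must be aligned
        return CpyKind::None;
      if (StoreSize > MaxElementSize) // no lib call for this element size
        return CpyKind::None;
      return CpyKind::ElementAtomic;
    }
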
@@ -979,7 +1052,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
}
bool LoopIdiomRecognize::runOnNoncountableLoop() {
- return recognizePopcount();
+ return recognizePopcount() || recognizeAndInsertCTLZ();
}
/// Check if the given conditional branch is based on the comparison between
@@ -1007,6 +1080,17 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) {
return nullptr;
}
+// Check if the recurrence variable `VarX` is in the right form to create
+// the idiom. Returns the value coerced to a PHINode if so.
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
+ BasicBlock *LoopEntry) {
+ auto *PhiX = dyn_cast<PHINode>(VarX);
+ if (PhiX && PhiX->getParent() == LoopEntry &&
+ (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
+ return PhiX;
+ return nullptr;
+}
+
/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
@@ -1076,19 +1160,15 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
if (!Dec ||
!((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
(SubInst->getOpcode() == Instruction::Add &&
- Dec->isAllOnesValue()))) {
+ Dec->isMinusOne()))) {
return false;
}
}
// step 3: Check the recurrence of variable X
- {
- PhiX = dyn_cast<PHINode>(VarX1);
- if (!PhiX ||
- (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
- return false;
- }
- }
+ PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
+ if (!PhiX)
+ return false;
// step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
{
@@ -1104,8 +1184,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
if (!Inc || !Inc->isOne())
continue;
- PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
- if (!Phi || Phi->getParent() != LoopEntry)
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
continue;
// Check if the result of the instruction is live out of the loop.
@@ -1144,6 +1224,169 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
return true;
}
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ),
+/// or nullptr if there is no such instruction.
+/// 2) \p CntPhi is set to the corresponding phi node,
+/// or nullptr if there is no such node.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ PhiX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in the desired form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1"
+ if (!DefX || DefX->getOpcode() != Instruction::AShr)
+ return false;
+ ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+ if (!Shft || !Shft->isOne())
+ return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+ if (!PhiX)
+ return false;
+
+ // step 4: Find the instruction which counts the iterations: cnt.next = cnt + 1
+ // TODO: We can skip this step. If the loop trip count is known (CTLZ),
+ // then all uses of "cnt.next" could be optimized to the trip count
+ // plus "cnt0". Currently it is not optimized.
+ // This step could be used to detect the POPCNT idiom:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
+
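
The correctness of the rewrite this detection enables rests on a small identity: for non-zero X, the do-while idiom above executes exactly BitWidth - CTLZ(X) times. A quick standalone check of that identity, assuming 32-bit unsigned values and a GCC-style __builtin_clz (an assumption of this sketch, not LLVM API; the pass matches the ashr form, which agrees with a logical shift for the non-negative values exercised here):

    #include <cassert>

    // Iteration count of the matched idiom: X is shifted right until it is 0.
    unsigned tripCountBySim(unsigned X) {
      unsigned Cnt = 0;
      do {
        ++Cnt;
        X >>= 1;
      } while (X != 0);
      return Cnt;
    }

    int main() {
      for (unsigned X = 1; X < (1u << 20); ++X)
        assert(tripCountBySim(X) == 32u - __builtin_clz(X));
    }
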
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one, with a CTLZ-computed trip count.
+/// Returns true if CTLZ was inserted as the new trip count; otherwise false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Instruction *CntInst, *DefX;
+ PHINode *CntPhi, *PhiX;
+ if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+ // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X is
+ // 0. If we cannot guarantee X != 0, we need to check for this when the
+ // intrinsic is expanded.
+ bool ZeroCheck = false;
+ // It is safe to assume the preheader exists, as that was checked in the
+ // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ Value *InitX = PhiX->getIncomingValueForBlock(PH);
+ // If we check X != 0 before entering the loop we don't need a zero
+ // check in the CTLZ intrinsic, but only if CntPhi is not used outside the
+ // loop (if it is used, we count CTLZ(X >> 1)).
+ if (!IsCntPhiUsedOutsideLoop)
+ if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+ if (BranchInst *PreCondBr =
+ dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+ if (matchCondition(PreCondBr, PH) == InitX)
+ ZeroCheck = true;
+ }
+
+ // Check if the CTLZ intrinsic is profitable. Assume it is always profitable
+ // if we can delete the loop, i.e. the loop consists of only these 6
+ // instructions:
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ IRBuilder<> Builder(PH->getTerminator());
+ SmallVector<const Value *, 2> Ops =
+ {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+ ArrayRef<const Value *> Args(Ops);
+ if (CurLoop->getHeader()->size() != 6 &&
+ TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+ TargetTransformInfo::TCC_Basic)
+ return false;
+
+ const DebugLoc DL = DefX->getDebugLoc();
+ transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
/// Recognizes a population count idiom in a non-countable loop.
///
/// If detected, transforms the relevant code to issue the popcount intrinsic
@@ -1207,6 +1450,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
return CI;
}
+static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+/// Transform the following loop:
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX,
+ const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ instruction at the end of the preheader block
+ // Count = BitWidth - CTLZ(InitX);
+ // If there are uses of CntPhi create:
+ // CountPrev = BitWidth - CTLZ(InitX >> 1);
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+ Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext;
+
+ if (IsCntPhiUsedOutsideLoop)
+ InitXNext = Builder.CreateAShr(InitX,
+ ConstantInt::get(InitX->getType(), 1));
+ else
+ InitXNext = InitX;
+ CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck);
+ Count = Builder.CreateSub(
+ ConstantInt::get(CTLZ->getType(),
+ CTLZ->getType()->getIntegerBitWidth()),
+ CTLZ);
+ if (IsCntPhiUsedOutsideLoop) {
+ CountPrev = Count;
+ Count = Builder.CreateAdd(
+ CountPrev,
+ ConstantInt::get(CountPrev->getType(), 1));
+ }
+ if (IsCntPhiUsedOutsideLoop)
+ NewCount = Builder.CreateZExtOrTrunc(CountPrev,
+ cast<IntegerType>(CntInst->getType()));
+ else
+ NewCount = Builder.CreateZExtOrTrunc(Count,
+ cast<IntegerType>(CntInst->getType()));
+
+ // If the CTLZ counter's initial value is not zero, insert an Add instruction.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero())
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with NewCount -- the trip count value computed
+ // from the inserted CTLZ intrinsic.
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // Step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
+
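
One subtlety in the doc comment above is the CntPhi-used-outside-the-loop case: on exit, CntPhi lags CntInst by one, which the transform models as CTLZ(InitX >> 1) rather than by subtracting after the fact. A standalone check of that relationship, under the same 32-bit/__builtin_clz assumptions as the earlier sketch (X starts at 2 because __builtin_clz(0) is undefined in C, which is exactly the situation the ZeroCheck flag handles at the IR level):

    #include <cassert>

    int main() {
      for (unsigned X = 2; X < (1u << 20); ++X) {
        unsigned Phi = 0, Inst = 0, V = X;
        do {
          Phi = Inst; // value of CntPhi on entry to the iteration
          ++Inst;     // cnt.next = cnt + 1
          V >>= 1;    // x.next = x >> 1
        } while (V != 0);
        unsigned CountPrev = 32u - __builtin_clz(X >> 1);
        assert(Phi == CountPrev && Inst == CountPrev + 1);
      }
    }
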
void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 69102d1..af09556 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -77,7 +77,7 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI,
// Don't bother simplifying unused instructions.
if (!I->use_empty()) {
- Value *V = SimplifyInstruction(I, DL, TLI, DT, AC);
+ Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC});
if (V && LI->replacementPreservesLCSSAForm(I, V)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
@@ -189,7 +189,9 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI))
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
char LoopInstSimplifyLegacyPass::ID = 0;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index e9f84ed..2e0d8e0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -39,7 +40,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+
using namespace llvm;
#define DEBUG_TYPE "loop-interchange"
@@ -323,9 +324,10 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
class LoopInterchangeLegality {
public:
LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA)
+ LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA,
+ OptimizationRemarkEmitter *ORE)
: OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
- PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {}
+ PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {}
/// Check if the loops can be interchanged.
bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -353,6 +355,8 @@ private:
LoopInfo *LI;
DominatorTree *DT;
bool PreserveLCSSA;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
bool InnerLoopHasReduction;
};
@@ -361,8 +365,9 @@ private:
/// loop.
class LoopInterchangeProfitability {
public:
- LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {}
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
/// Check if the loop interchange is profitable.
bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
@@ -376,6 +381,8 @@ private:
/// Scev analysis.
ScalarEvolution *SE;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
};
/// LoopInterchangeTransform interchanges the loop.
@@ -422,6 +429,9 @@ struct LoopInterchange : public FunctionPass {
DependenceInfo *DI;
DominatorTree *DT;
bool PreserveLCSSA;
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
LoopInterchange()
: FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) {
initializeLoopInterchangePass(*PassRegistry::getPassRegistry());
@@ -435,6 +445,7 @@ struct LoopInterchange : public FunctionPass {
AU.addRequired<DependenceAnalysisWrapperPass>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
bool runOnFunction(Function &F) override {
@@ -446,6 +457,7 @@ struct LoopInterchange : public FunctionPass {
DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI();
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
// Build up a worklist of loop pairs to analyze.
@@ -575,18 +587,23 @@ struct LoopInterchange : public FunctionPass {
Loop *OuterLoop = LoopList[OuterLoopId];
LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT,
- PreserveLCSSA);
+ PreserveLCSSA, ORE);
if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n");
return false;
}
DEBUG(dbgs() << "Loops are legal to interchange\n");
- LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE);
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
DEBUG(dbgs() << "Interchanging loops not profitable\n");
return false;
}
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Loop interchanged with enclosing loop.");
+
LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT,
LoopNestExit, LIL.hasInnerLoopReduction());
LIT.transform();
@@ -757,13 +774,28 @@ bool LoopInterchangeLegality::currentLimitations() {
PHINode *InnerInductionVar;
SmallVector<PHINode *, 8> Inductions;
SmallVector<PHINode *, 8> Reductions;
- if (!findInductionAndReductions(InnerLoop, Inductions, Reductions))
+ if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedPHIInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with induction or reduction PHI nodes can be"
+ " interchange currently.");
return true;
+ }
// TODO: Currently we handle only loops with 1 induction variable.
if (Inductions.size() != 1) {
DEBUG(dbgs() << "We currently only support loops with 1 induction variable."
<< "Failed to interchange due to current limitation\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "MultiInductionInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with 1 induction variable can be "
+ "interchanged currently.");
return true;
}
if (Reductions.size() > 0)
@@ -771,32 +803,80 @@ bool LoopInterchangeLegality::currentLimitations() {
InnerInductionVar = Inductions.pop_back_val();
Reductions.clear();
- if (!findInductionAndReductions(OuterLoop, Inductions, Reductions))
+ if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) {
+ DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with induction or reduction PHI nodes can be"
+ " interchanged currently.");
return true;
+ }
// Outer loop cannot have reduction because then loops will not be tightly
// nested.
- if (!Reductions.empty())
+ if (!Reductions.empty()) {
+ DEBUG(dbgs() << "Outer loops with reductions are not supported "
+ << "currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "ReductionsOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Outer loops with reductions cannot be interchangeed "
+ "currently.");
return true;
+ }
// TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1)
+ if (Inductions.size() != 1) {
+ DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+ << "supported currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "MultiIndutionOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with 1 induction variable can be "
+ "interchanged currently.");
return true;
+ }
// TODO: Triangular loops are not handled for now.
if (!isLoopStructureUnderstood(InnerInductionVar)) {
DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedStructureInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Inner loop structure not understood currently.");
return true;
}
// TODO: We only handle LCSSA PHI's corresponding to reduction for now.
BasicBlock *LoopExitBlock =
getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true))
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoLCSSAPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with LCSSA PHIs can be interchange "
+ "currently.");
return true;
+ }
LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader);
- if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false))
+ if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) {
+ DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoLCSSAPHIOuterInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with LCSSA PHIs can be interchange "
+ "currently.");
return true;
+ }
// TODO: Current limitation: Since we split the inner loop latch at the point
// where the induction variable is incremented (induction.next), we cannot have
@@ -816,8 +896,16 @@ bool LoopInterchangeLegality::currentLimitations() {
InnerIndexVarInc =
dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
- if (!InnerIndexVarInc)
+ if (!InnerIndexVarInc) {
+ DEBUG(dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoIncrementInInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "The inner loop does not increment the induction variable.");
return true;
+ }
// Since we split the inner loop latch on this induction variable, make sure
// we do not have any instruction between the induction variable and branch
@@ -827,19 +915,35 @@ bool LoopInterchangeLegality::currentLimitations() {
for (const Instruction &I : reverse(*InnerLoopLatch)) {
if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I))
continue;
+
// We found an instruction. If this is not the induction variable then it is
// not safe to split this loop latch.
- if (!I.isIdenticalTo(InnerIndexVarInc))
+ if (!I.isIdenticalTo(InnerIndexVarInc)) {
+ DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnsupportedInsBetweenInduction",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Found unsupported instruction between induction variable "
+ "increment and branch.");
return true;
+ }
FoundInduction = true;
break;
}
// The loop latch ended and we didn't find the induction variable; return as a
// current limitation.
- if (!FoundInduction)
+ if (!FoundInduction) {
+ DEBUG(dbgs() << "Did not find the induction variable.\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NoIndutionVariable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Did not find the induction variable.");
return true;
-
+ }
return false;
}
@@ -851,6 +955,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
<< " and OuterLoopId = " << OuterLoopId
<< " due to dependence\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "Dependence",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops due to dependences.");
return false;
}
@@ -886,6 +995,12 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
// Check if the loops are tightly nested.
if (!tightlyNested(OuterLoop, InnerLoop)) {
DEBUG(dbgs() << "Loops not tightly nested\n");
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "NotTightlyNested",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops because they are not tightly "
+ "nested.");
return false;
}
@@ -981,9 +1096,18 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
// It is not profitable as per current cache profitability model. But check if
// we can move this loop outside to improve parallelism.
- bool ImprovesPar =
- isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
- return ImprovesPar;
+ if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
+ return true;
+
+ ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE,
+ "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Interchanging loops is too costly (cost="
+ << ore::NV("Cost", Cost) << ", threshold="
+ << ore::NV("Threshold", LoopInterchangeCostThreshold) <<
+ ") and it does not improve parallelism.");
+ return false;
}
void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
@@ -1267,6 +1391,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopInterchange, "loop-interchange",
"Interchanges loops for cache reuse", false, false)
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 8fb5801..20b37c4 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -20,13 +20,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -45,9 +46,9 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include <forward_list>
-#include <cassert>
#include <algorithm>
+#include <cassert>
+#include <forward_list>
#include <set>
#include <tuple>
#include <utility>
@@ -196,8 +197,7 @@ public:
continue;
// Only propagate the value if they are of the same type.
- if (Store->getPointerOperand()->getType() !=
- Load->getPointerOperand()->getType())
+ if (Store->getPointerOperandType() != Load->getPointerOperandType())
continue;
Candidates.emplace_front(Load, Store);
@@ -373,15 +373,15 @@ public:
const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (auto PtrIdx1 : Check.first->Members)
- for (auto PtrIdx2 : Check.second->Members)
- if (needsChecking(PtrIdx1, PtrIdx2,
- PtrsWrittenOnFwdingPath, CandLoadPtrs))
- return true;
- return false;
- });
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
@@ -558,6 +558,32 @@ private:
PredicatedScalarEvolution PSE;
};
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
/// \brief The pass. Most of the work is delegated to the per-loop
/// LoadEliminationForLoop class.
class LoopLoadElimination : public FunctionPass {
@@ -570,32 +596,14 @@ public:
if (skipFunction(F))
return false;
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
- if (L->empty())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist) {
- const LoopAccessInfo &LAI = LAA->getInfo(L);
- // The actual work is performed by LoadEliminationForLoop.
- LoadEliminationForLoop LEL(L, LI, LAI, DT);
- Changed |= LEL.processLoop();
- }
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
// Process each loop nest in the function.
- return Changed;
+ return eliminateLoadsAcrossLoops(
+ F, LI, DT,
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -631,4 +639,28 @@ FunctionPass *createLoopLoadEliminationPass() {
return new LoopLoadElimination();
}
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
+ F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & {
+ LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI};
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
+
} // end namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index 028f4bb..10f6fcd 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -42,6 +42,13 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
break;
}
+#ifndef NDEBUG
+ // Verify the loop structure and LCSSA form before visiting the loop.
+ L.verifyLoop();
+ assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) &&
+ "Loops must remain in LCSSA form!");
+#endif
+
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
AM.invalidate(L, PassPA);
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
new file mode 100644
index 0000000..9b12ba1
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -0,0 +1,330 @@
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop-variant range checks into
+// loop-invariant ones by widening checks across loop iterations. For example,
+// it will
+// convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop on this condition, which basically
+// predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+using namespace llvm;
+
+namespace {
+class LoopPredication {
+ /// Represents an induction variable check:
+ /// icmp Pred, <induction variable>, <loop invariant limit>
+ struct LoopICmp {
+ ICmpInst::Predicate Pred;
+ const SCEVAddRecExpr *IV;
+ const SCEV *Limit;
+ LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
+ const SCEV *Limit)
+ : Pred(Pred), IV(IV), Limit(Limit) {}
+ LoopICmp() {}
+ };
+
+ ScalarEvolution *SE;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+
+ Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS,
+ Instruction *InsertAt);
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ IRBuilder<> &Builder);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+
+public:
+ LoopPredication(ScalarEvolution *SE) : SE(SE) {}
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopPredication LP(SE);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ LoopPredication LP(&AR.SE);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+Optional<LoopPredication::LoopICmp>
+LoopPredication::parseLoopICmp(ICmpInst *ICI) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+ // Canonicalize so that RHS is the loop-invariant bound and LHS is the
+ // loop-computable IV.
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!AR || AR->getLoop() != L)
+ return None;
+
+ return LoopICmp(Pred, AR, RHSS);
+}
+
+Value *LoopPredication::expandCheck(SCEVExpander &Expander,
+ IRBuilder<> &Builder,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS, Instruction *InsertAt) {
+ Type *Ty = LHS->getType();
+ assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+ Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt);
+ Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt);
+ return Builder.CreateICmp(Pred, LHSV, RHSV);
+}
+
+/// If ICI can be widened to a loop-invariant condition, emits the widened
+/// condition in the loop preheader and returns it; otherwise returns None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ IRBuilder<> &Builder) {
+ DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ DEBUG(ICI->dump());
+
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = RangeCheck->Pred;
+ const SCEVAddRecExpr *IndexAR = RangeCheck->IV;
+ const SCEV *RHSS = RangeCheck->Limit;
+
+ auto CanExpand = [this](const SCEV *S) {
+ return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE);
+ };
+ if (!CanExpand(RHSS))
+ return None;
+
+ DEBUG(dbgs() << "IndexAR: ");
+ DEBUG(IndexAR->dump());
+
+ bool IsIncreasing = false;
+ if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing))
+ return None;
+
+ // If the predicate is increasing, the condition can change from false to
+ // true as the loop progresses; in that case take the value on the first
+ // iteration for the widened check. Otherwise the condition can change from
+ // true to false as the loop progresses, so take the value on the last
+ // iteration.
+ const SCEV *NewLHSS = IsIncreasing
+ ? IndexAR->getStart()
+ : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
+ if (NewLHSS == IndexAR) {
+ DEBUG(dbgs() << "Can't compute NewLHSS!\n");
+ return None;
+ }
+
+ DEBUG(dbgs() << "NewLHSS: ");
+ DEBUG(NewLHSS->dump());
+
+ if (!CanExpand(NewLHSS))
+ return None;
+
+ DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
+
+ Instruction *InsertAt = Preheader->getTerminator();
+ return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt);
+}
+
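
The monotonicity argument above is easiest to see on the counted loop from the file header: with i = {0,+,1} running n iterations, a guard "i < len" only ever flips from true to false, so checking it on the last iteration (i = n - 1) covers every iteration. A brute-force confirmation of that equivalence, as a sketch of the reasoning rather than of the pass's code:

    #include <cassert>

    // True iff the guard "i < len" would pass on every one of n >= 1 iterations.
    bool allIterationsPass(unsigned n, unsigned len) {
      for (unsigned i = 0; i < n; ++i)
        if (!(i < len))
          return false;
      return true;
    }

    int main() {
      for (unsigned n = 1; n < 64; ++n)
        for (unsigned len = 0; len < 64; ++len)
          assert(allIterationsPass(n, len) == (n - 1 < len));
    }
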
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ DEBUG(dbgs() << "Processing guard:\n");
+ DEBUG(Guard->dump());
+
+ IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator()));
+
+ // The guard condition is expected to be in form of:
+ // cond1 && cond2 && cond3 ...
+ // Iterate over the subconditions looking for icmp conditions which can be
+ // widened across loop iterations. While widening these conditions, remember
+ // the resulting list of subconditions in the Checks vector.
+ SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0));
+ SmallPtrSet<Value *, 4> Visited;
+
+ SmallVector<Value *, 4> Checks;
+
+ unsigned NumWidened = 0;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Builder)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+ } while (Worklist.size() != 0);
+
+ if (NumWidened == 0)
+ return false;
+
+ // Emit the new guard condition
+ Builder.SetInsertPoint(Guard);
+ Value *LastCheck = nullptr;
+ for (auto *Check : Checks)
+ if (!LastCheck)
+ LastCheck = Check;
+ else
+ LastCheck = Builder.CreateAnd(LastCheck, Check);
+ Guard->setOperand(0, LastCheck);
+
+ DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
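
The worklist in widenGuardConditions flattens the guard's and-tree into leaf conditions before attempting to widen each one; the Visited set handles DAG-shaped conditions where one subcondition feeds several ands. The traversal in isolation, on a toy tree type (Cond and flattenAnds are illustrative stand-ins, not LLVM types):

    #include <set>
    #include <string>
    #include <vector>

    struct Cond {                       // a leaf comparison, or an And of two Conds
      std::string Leaf;                 // non-empty only for leaves
      const Cond *LHS = nullptr, *RHS = nullptr;
    };

    std::vector<std::string> flattenAnds(const Cond *Root) {
      std::vector<std::string> Leaves;
      std::vector<const Cond *> Worklist{Root};
      std::set<const Cond *> Visited;   // subconditions may be shared (a DAG)
      while (!Worklist.empty()) {
        const Cond *C = Worklist.back();
        Worklist.pop_back();
        if (!Visited.insert(C).second)
          continue;
        if (C->LHS) {                   // And node: descend into both operands
          Worklist.push_back(C->LHS);
          Worklist.push_back(C->RHS);
          continue;
        }
        Leaves.push_back(C->Leaf);      // leaf: would be a widening candidate
      }
      return Leaves;
    }
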
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ DEBUG(dbgs() << "Analyzing ");
+ DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+ // There is nothing to do if the module doesn't use guards
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+
+ if (Guards.empty())
+ return false;
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index 86058fe..fc0216e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -11,10 +11,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -31,6 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -557,7 +557,7 @@ bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
Instruction *UUser = dyn_cast<Instruction>(UU);
// Skip SExt if we are extending an nsw value
// TODO: Allow ZExt too
- if (BO->hasNoSignedWrap() && UUser && UUser->getNumUses() == 1 &&
+ if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
isa<SExtInst>(UUser))
UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
if (!isCompareUsedByBranch(UUser))
@@ -852,7 +852,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
for (auto &KV : Roots) {
if (KV.first == 0)
continue;
- if (KV.second->getNumUses() != NumBaseUses) {
+ if (!KV.second->hasNUses(NumBaseUses)) {
DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
<< "#Base=" << NumBaseUses << ", #Root=" <<
KV.second->getNumUses() << "\n");
@@ -867,7 +867,7 @@ void LoopReroll::DAGRootTracker::
findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
// Does the user look like it could be part of a root set?
// All its users must be simple arithmetic ops.
- if (I->getNumUses() > IL_MaxRerollIterations)
+ if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
return;
if (I != IV && findRootsBase(I, SubsumedInsts))
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index cc83069..3506ac3 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -58,13 +58,14 @@ class LoopRotate {
AssumptionCache *AC;
DominatorTree *DT;
ScalarEvolution *SE;
+ const SimplifyQuery &SQ;
public:
LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE)
- : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) {
- }
+ DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ)
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ SQ(SQ) {}
bool processLoop(Loop *L);
private:
@@ -79,7 +80,8 @@ private:
/// to merge the two values. Do this now.
static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap) {
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
// Remove PHI node entries that are no longer live.
BasicBlock::iterator I, E = OrigHeader->end();
for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
@@ -87,7 +89,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// Now fix up users of the instructions in OrigHeader, inserting PHI nodes
// as necessary.
- SSAUpdater SSA;
+ SSAUpdater SSA(InsertedPHIs);
for (I = OrigHeader->begin(); I != E; ++I) {
Value *OrigHeaderVal = &*I;
@@ -174,6 +176,38 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
}
}
+/// Propagate dbg.value intrinsics through the newly inserted Phis.
+static void insertDebugValues(BasicBlock *OrigHeader,
+ SmallVectorImpl<PHINode*> &InsertedPHIs) {
+ ValueToValueMapTy DbgValueMap;
+
+ // Map existing PHI nodes to their dbg.values.
+ for (auto &I : *OrigHeader) {
+ if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = OrigHeader->getContext();
+ for (auto PHI : InsertedPHIs) {
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ NewDbgII->setOperand(0, PhiMAV);
+ BasicBlock *Parent = PHI->getParent();
+ NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
+ }
+ }
+ }
+}
+
/// Rotate loop LP. Return true if the loop is rotated.
///
/// \param SimplifiedLatch is true if the latch was just folded into the final
@@ -278,8 +312,6 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
// For the rest of the instructions, either hoist to the OrigPreheader if
// possible or create a clone in the OldPreHeader if not.
TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
@@ -309,14 +341,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// With the operands remapped, see if the instruction constant folds or is
// otherwise simplifyable. This commonly occurs because the entry from PHI
// nodes allows icmps and other instructions to fold.
- // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
- Value *V = SimplifyInstruction(C, DL);
+ Value *V = SimplifyInstruction(C, SQ);
if (V && LI->replacementPreservesLCSSAForm(C, V)) {
// If so, then delete the temporary instruction and stick the folded value
// in the map.
ValueMap[Inst] = V;
if (!C->mayHaveSideEffects()) {
- delete C;
+ C->deleteValue();
C = nullptr;
}
} else {
@@ -347,9 +378,18 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// remove the corresponding incoming values from the PHI nodes in OrigHeader.
LoopEntryBranch->eraseFromParent();
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
// If there were any uses of instructions in the duplicated block outside the
// loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to each new phi that uses a value which
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValues(OrigHeader, InsertedPHIs);
// NewHeader is now the header of the loop.
L->moveToHeader(NewHeader);
@@ -445,10 +485,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
DomTreeNode *Node = HeaderChildren[I];
BasicBlock *BB = Node->getBlock();
- pred_iterator PI = pred_begin(BB);
- BasicBlock *NearestDom = *PI;
- for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
- NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+ BasicBlock *NearestDom = nullptr;
+ for (BasicBlock *Pred : predecessors(BB)) {
+ // Consider only reachable basic blocks.
+ if (!DT->getNode(Pred))
+ continue;
+
+ if (!NearestDom) {
+ NearestDom = Pred;
+ continue;
+ }
+
+ NearestDom = DT->findNearestCommonDominator(NearestDom, Pred);
+ assert(NearestDom && "No NearestCommonDominator found");
+ }
+
+ assert(NearestDom && "Nearest dominator not found");
// Remember if this changes the DomTree.
if (Node->getIDom()->getBlock() != NearestDom) {
@@ -629,11 +681,15 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
- LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE);
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
+ LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
+ SQ);
bool Changed = LR.processLoop(&L);
if (!Changed)
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
@@ -671,7 +727,8 @@ public:
auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
auto *SE = SEWP ? &SEWP->getSE() : nullptr;
- LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE);
+ const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
+ LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ);
return LR.processLoop(L);
}
};
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 1606121..35c05e8 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -40,7 +40,7 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) {
bool Changed = false;
// Copy blocks into a temporary array to avoid iterator invalidation issues
// as we remove them.
- SmallVector<WeakVH, 16> Blocks(L.blocks());
+ SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
for (auto &Block : Blocks) {
// Attempt to merge blocks in the trivial case. Don't modify blocks which
@@ -69,6 +69,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
LPMUpdater &) {
if (!simplifyLoopCFG(L, AR.DT, AR.LI))
return PreservedAnalyses::all();
+
return getLoopPassPreservedAnalyses();
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
index f3f4152..c9d55b4 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp
@@ -1,4 +1,4 @@
-//===-- LoopSink.cpp - Loop Sink Pass ------------------------===//
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -28,8 +28,10 @@
// InsertBBs = UseBBs - DomBBs + BB
// For BB in InsertBBs:
// Insert I at BB's beginning
+//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
@@ -297,6 +299,42 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
return Changed;
}
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
+ // We want to do a postorder walk over the loops. Since loops form a tree,
+ // this is equivalent to a reversed preorder walk, and preorder is easy to
+ // compute without recursion. Since we reverse the preorder, we will visit
+ // siblings in reverse program order. This isn't expected to matter at all,
+ // but it is more consistent with sinking algorithms, which generally work
+ // bottom-up.
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
+ // Note that we don't pass SCEV here because it is only used to invalidate
+ // loops in SCEV, and we don't preserve (or request) SCEV at all, making
+ // that unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ /*ScalarEvolution*/ nullptr);
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
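The pop-from-the-back traversal above deserves a tiny standalone illustration. A minimal sketch, using a hypothetical Node type rather than the real llvm::Loop API, showing that draining a preorder vector from the back visits every child before its parent:

    #include <iostream>
    #include <vector>

    struct Node {
      char Name;
      std::vector<Node *> Children;
    };

    // Collect nodes in preorder: parent first, then children left-to-right.
    static void preorder(Node *N, std::vector<Node *> &Out) {
      Out.push_back(N);
      for (Node *C : N->Children)
        preorder(C, Out);
    }

    int main() {
      Node D{'D', {}}, C{'C', {}}, B{'B', {&C, &D}}, A{'A', {&B}};
      std::vector<Node *> Order;
      preorder(&A, Order);
      // Popping from the back prints D C B A: every child appears before
      // its parent, i.e. a postorder walk with siblings reversed.
      while (!Order.empty()) {
        std::cout << Order.back()->Name << ' ';
        Order.pop_back();
      }
      std::cout << '\n';
    }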
+
namespace {
struct LegacyLoopSinkPass : public LoopPass {
static char ID;
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 194587a..3638da1 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,24 @@ static cl::opt<bool> EnablePhiElim(
"enable-lsr-phielim", cl::Hidden, cl::init(true),
cl::desc("Enable LSR phi elimination"));
+// This flag adds instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Add instruction count to the LSR cost model"));
+
+// Flag to choose how to narrow a complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using the"
+ " expected number of registers"));
+
+// Flag to narrow the search space by filtering out non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+ "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+ " with the same ScaledReg and Scale"));
+
#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
@@ -181,10 +199,11 @@ void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
-LLVM_DUMP_METHOD
-void RegSortData::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -295,9 +314,13 @@ struct Formula {
/// canonical representation of a formula is
/// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
/// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+ /// 3. The reg containing the recurrent expr related to the current loop in
+ /// the formula should be put in the ScaledReg.
/// #1 enforces that the scaled register is always used when at least two
/// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
/// #2 enforces that 1 * reg is reg.
+ /// #3 ensures that regs invariant with respect to the current loop can be
+ /// combined together in LSR codegen.
 /// This invariant can be temporarily broken while building a formula.
/// However, every formula inserted into the LSRInstance must be in canonical
/// form.
@@ -318,12 +341,14 @@ struct Formula {
void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
- bool isCanonical() const;
+ bool isCanonical(const Loop &L) const;
- void canonicalize();
+ void canonicalize(const Loop &L);
bool unscale();
+ bool hasZeroEnd() const;
+
size_t getNumRegs() const;
Type *getType() const;
@@ -410,16 +435,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
BaseRegs.push_back(Sum);
HasBaseReg = true;
}
- canonicalize();
+ canonicalize(*L);
}
 /// \brief Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
-bool Formula::isCanonical() const {
- if (ScaledReg)
- return Scale != 1 || !BaseRegs.empty();
- return BaseRegs.size() <= 1;
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+ // If ScaledReg is not a recurrent expr, or it is one whose loop is not the
+ // current loop, while BaseRegs contains a recurrent expr reg related to the
+ // current loop, the formula is not canonical: canonicalize() would swap
+ // that reg into ScaledReg.
+ auto I =
+ find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ return I == BaseRegs.end();
}
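Rules 1-3 are easiest to see on a toy model in which a register merely records whether it is an addrec on the current loop. The sketch below uses a hypothetical ToyFormula, not the real class, and mirrors the three checks:

    #include <cassert>
    #include <vector>

    // Toy register: true means "addrec on the current loop".
    using ToyReg = bool;

    struct ToyFormula {
      const ToyReg *ScaledReg = nullptr;
      int Scale = 0;
      std::vector<ToyReg> BaseRegs;
    };

    static bool isCanonicalToy(const ToyFormula &F) {
      if (!F.ScaledReg)
        return F.BaseRegs.size() <= 1;      // rule 1
      if (F.Scale != 1)
        return true;
      if (F.BaseRegs.empty())
        return false;                       // rule 2: 1*reg must be plain reg
      if (*F.ScaledReg)
        return true;                        // rule 3 already holds
      // Rule 3: no current-loop addrec may remain in BaseRegs while
      // ScaledReg is loop-invariant.
      for (ToyReg R : F.BaseRegs)
        if (R)
          return false;
      return true;
    }

    int main() {
      ToyReg Inv = false, Rec = true;
      ToyFormula F;
      F.ScaledReg = &Inv; // loop-invariant reg scaled by 1 ...
      F.Scale = 1;
      F.BaseRegs = {Rec}; // ... while a current-loop addrec sits in BaseRegs
      assert(!isCanonicalToy(F)); // rule 3 violated: canonicalize() swaps them
    }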
/// \brief Helper method to morph a formula into its canonical representation.
@@ -428,21 +472,33 @@ bool Formula::isCanonical() const {
/// field. Otherwise, we would have to do special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize() {
- if (isCanonical())
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
return;
// So far we did not need this case. This is easy to implement but it is
 // useless to maintain dead code. Besides, it could hurt compile time.
assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
// Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
- ScaledReg = BaseRegs.back();
- BaseRegs.pop_back();
- Scale = 1;
- size_t BaseRegsSize = BaseRegs.size();
- size_t Try = 0;
- // If ScaledReg is an invariant, try to find a variant expression.
- while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
- std::swap(ScaledReg, BaseRegs[Try++]);
+ if (!ScaledReg) {
+ ScaledReg = BaseRegs.back();
+ BaseRegs.pop_back();
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+ // BaseRegs containing the recurrent expr related to Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
+ auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+ [&](const SCEV *S) {
+ return isa<const SCEVAddRecExpr>(S) &&
+ (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+ });
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
}
/// \brief Get rid of the scale in the formula.
@@ -458,6 +514,14 @@ bool Formula::unscale() {
return true;
}
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
/// Return the total number of register operands used by this formula. This does
/// not include register uses implied by non-constant addrec strides.
size_t Formula::getNumRegs() const {
@@ -534,10 +598,11 @@ void Formula::print(raw_ostream &OS) const {
}
}
-LLVM_DUMP_METHOD
-void Formula::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Formula::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Return true if the given addrec can be sign-extended without changing its
/// value.
@@ -711,7 +776,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
bool isAddress = isa<LoadInst>(Inst);
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(1) == OperandVal)
+ if (SI->getPointerOperand() == OperandVal)
isAddress = true;
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
// Addressing modes can also be folded into prefetches and a variety
@@ -723,6 +788,12 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
isAddress = true;
break;
}
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
}
return isAddress;
}
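The practical effect of the two new branches is that the pointer operands of atomics are now treated like load/store addresses, so LSR can fold their address computation into addressing modes. A hedged example of the kind of source loop that benefits (plain std::atomic; the atomicrmw it lowers to is the case handled above):

    #include <atomic>
    #include <cstddef>

    // Each iteration computes &Counts[I] and performs an atomicrmw add on it.
    // With atomics recognized as address uses, LSR can strength-reduce the
    // per-iteration address computation into a simple pointer increment,
    // exactly as it already did for plain loads and stores.
    void bumpAll(std::atomic<long> *Counts, std::size_t N) {
      for (std::size_t I = 0; I != N; ++I)
        Counts[I].fetch_add(1, std::memory_order_relaxed);
    }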
@@ -735,6 +806,10 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
AccessTy.AddrSpace = SI->getPointerAddressSpace();
} else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
}
// All pointers have the same requirements, so canonicalize them to an
@@ -832,7 +907,7 @@ static bool isHighCostExpansion(const SCEV *S,
/// If any of the instructions in the specified set are trivially dead, delete
/// them and see if this makes any of their operands subsequently dead.
static bool
-DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
+DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
bool Changed = false;
while (!DeadInsts.empty()) {
@@ -875,44 +950,44 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
const LSRUse &LU, const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F);
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
namespace {
/// This class is used to measure and compare candidate formulae.
class Cost {
- /// TODO: Some of these could be merged. Also, a lexical ordering
- /// isn't always optimal.
- unsigned NumRegs;
- unsigned AddRecCost;
- unsigned NumIVMuls;
- unsigned NumBaseAdds;
- unsigned ImmCost;
- unsigned SetupCost;
- unsigned ScaleCost;
+ TargetTransformInfo::LSRCost C;
public:
- Cost()
- : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
- SetupCost(0), ScaleCost(0) {}
+ Cost() {
+ C.Insns = 0;
+ C.NumRegs = 0;
+ C.AddRecCost = 0;
+ C.NumIVMuls = 0;
+ C.NumBaseAdds = 0;
+ C.ImmCost = 0;
+ C.SetupCost = 0;
+ C.ScaleCost = 0;
+ }
- bool operator<(const Cost &Other) const;
+ bool isLess(Cost &Other, const TargetTransformInfo &TTI);
void Lose();
#ifndef NDEBUG
// Once any of the metrics loses, they must all remain losers.
bool isValid() {
- return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
- | ImmCost | SetupCost | ScaleCost) != ~0u)
- || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
- & ImmCost & SetupCost & ScaleCost) == ~0u);
+ return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
+ | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+ || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
+ & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
}
#endif
bool isLoser() {
assert(isValid() && "invalid cost");
- return NumRegs == ~0u;
+ return C.NumRegs == ~0u;
}
void RateFormula(const TargetTransformInfo &TTI,
@@ -1067,7 +1142,8 @@ public:
}
bool HasFormulaWithSameRegs(const Formula &F) const;
- bool InsertFormula(const Formula &F);
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
void DeleteFormula(Formula &F);
void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
@@ -1083,20 +1159,26 @@ void Cost::RateRegister(const SCEV *Reg,
const Loop *L,
ScalarEvolution &SE, DominatorTree &DT) {
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
- // If this is an addrec for another loop, don't second-guess its addrec phi
- // nodes. LSR isn't currently smart enough to reason about more than one
- // loop at a time. LSR has already run on inner loops, will not run on outer
- // loops, and cannot be expected to change sibling loops.
+ // If this is an addrec for another loop, it should be an invariant
+ // with respect to L, since L is the innermost loop (at least for now,
+ // LSR only handles innermost loops).
if (AR->getLoop() != L) {
 // If the AddRec exists, consider its register free and leave it alone.
if (isExistingPhi(AR, SE))
return;
- // Otherwise, do not consider this formula at all.
- Lose();
+ // It is bad to allow LSR for the current loop to add induction variables
+ // for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
+ ++C.NumRegs;
return;
}
- AddRecCost += 1; /// TODO: This should be a function of the stride.
+ C.AddRecCost += 1; /// TODO: This should be a function of the stride.
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
@@ -1108,7 +1190,7 @@ void Cost::RateRegister(const SCEV *Reg,
}
}
}
- ++NumRegs;
+ ++C.NumRegs;
// Rough heuristic; favor registers which don't require extra setup
// instructions in the preheader.
@@ -1117,9 +1199,9 @@ void Cost::RateRegister(const SCEV *Reg,
!(isa<SCEVAddRecExpr>(Reg) &&
(isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
- ++SetupCost;
+ ++C.SetupCost;
- NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+ C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
SE.hasComputableLoopEvolution(Reg, L);
}
@@ -1150,8 +1232,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
ScalarEvolution &SE, DominatorTree &DT,
const LSRUse &LU,
SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
// Tally up the registers.
+ unsigned PrevAddRecCost = C.AddRecCost;
+ unsigned PrevNumRegs = C.NumRegs;
+ unsigned PrevNumBaseAdds = C.NumBaseAdds;
if (const SCEV *ScaledReg = F.ScaledReg) {
if (VisitedRegs.count(ScaledReg)) {
Lose();
@@ -1176,73 +1261,113 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
if (NumBaseParts > 1)
// Do not count the base and a possible second register if the target
 // allows folding 2 registers.
- NumBaseAdds +=
+ C.NumBaseAdds +=
NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
- NumBaseAdds += (F.UnfoldedOffset != 0);
+ C.NumBaseAdds += (F.UnfoldedOffset != 0);
// Accumulate non-free scaling amounts.
- ScaleCost += getScalingFactorCost(TTI, LU, F);
+ C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
// Tally up the non-zero immediates.
for (const LSRFixup &Fixup : LU.Fixups) {
int64_t O = Fixup.Offset;
int64_t Offset = (uint64_t)O + F.BaseOffset;
if (F.BaseGV)
- ImmCost += 64; // Handle symbolic values conservatively.
+ C.ImmCost += 64; // Handle symbolic values conservatively.
// TODO: This should probably be the pointer size.
else if (Offset != 0)
- ImmCost += APInt(64, Offset, true).getMinSignedBits();
+ C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
// Check with target if this offset with this instruction is
// specifically not supported.
if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
- NumBaseAdds++;
+ C.NumBaseAdds++;
+ }
+
+ // If we don't count instruction cost, exit here.
+ if (!InsnsCost) {
+ assert(isValid() && "invalid cost");
+ return;
+ }
+
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+ // an additional instruction (at least a fill).
+ unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+ if (C.NumRegs > TTIRegNum) {
+ // If the cost already exceeded TTIRegNum, then only newly added
+ // registers can add new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ C.Insns += (C.NumRegs - PrevNumRegs);
+ else
+ C.Insns += (C.NumRegs - TTIRegNum);
}
+
+ // If an ICmpZero formula does not end in 0, it cannot be replaced by
+ // just an add or sub. We'll need to compare the final result of the
+ // AddRec, which means we'll need an additional instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+ C.Insns++;
+ // Each new AddRec adds 1 instruction to the calculation.
+ C.Insns += (C.AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
assert(isValid() && "invalid cost");
}
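The spill-pressure estimate above can be sanity-checked in isolation. A minimal sketch of the increment rule, with the register budget (TTI.getNumberOfRegisters(false) - 1 in the pass) passed in as a plain number:

    #include <cassert>

    // Instructions charged when the register count grows from PrevRegs to
    // NewRegs against a budget of BudgetRegs: only registers beyond the
    // budget cost an extra instruction (at least a fill).
    static unsigned extraInsns(unsigned PrevRegs, unsigned NewRegs,
                               unsigned BudgetRegs) {
      if (NewRegs <= BudgetRegs)
        return 0;
      return PrevRegs > BudgetRegs ? NewRegs - PrevRegs : NewRegs - BudgetRegs;
    }

    int main() {
      assert(extraInsns(5, 7, 8) == 0);  // still within budget
      assert(extraInsns(6, 10, 8) == 2); // crossed the budget: 10 - 8
      assert(extraInsns(9, 11, 8) == 2); // already over it: only the delta
    }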
/// Set this cost to a losing value.
void Cost::Lose() {
- NumRegs = ~0u;
- AddRecCost = ~0u;
- NumIVMuls = ~0u;
- NumBaseAdds = ~0u;
- ImmCost = ~0u;
- SetupCost = ~0u;
- ScaleCost = ~0u;
+ C.Insns = ~0u;
+ C.NumRegs = ~0u;
+ C.AddRecCost = ~0u;
+ C.NumIVMuls = ~0u;
+ C.NumBaseAdds = ~0u;
+ C.ImmCost = ~0u;
+ C.SetupCost = ~0u;
+ C.ScaleCost = ~0u;
}
/// Choose the lower cost.
-bool Cost::operator<(const Cost &Other) const {
- return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
- ImmCost, SetupCost) <
- std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
- Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
- Other.SetupCost);
+bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) {
+ if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
+ C.Insns != Other.C.Insns)
+ return C.Insns < Other.C.Insns;
+ return TTI.isLSRCostLess(C, Other.C);
}
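When -lsr-insns-cost is set explicitly, instruction count takes absolute priority; otherwise the comparison is delegated to the target through TTI.isLSRCostLess. The sketch below is a plausible lexicographic fallback, illustrative only, since the real default lives in TargetTransformInfo and a target may weigh the fields differently:

    #include <tuple>

    struct ToyLSRCost {
      unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
          ImmCost, SetupCost;
    };

    // Illustrative lexicographic order: register count dominates, then
    // addrec complexity, and so on down the tuple.
    static bool isLessToy(const ToyLSRCost &A, const ToyLSRCost &B) {
      return std::tie(A.NumRegs, A.AddRecCost, A.NumIVMuls, A.NumBaseAdds,
                      A.ScaleCost, A.ImmCost, A.SetupCost) <
             std::tie(B.NumRegs, B.AddRecCost, B.NumIVMuls, B.NumBaseAdds,
                      B.ScaleCost, B.ImmCost, B.SetupCost);
    }

    int main() {
      ToyLSRCost A{0, 2, 1, 0, 0, 0, 0, 0}, B{0, 3, 0, 0, 0, 0, 0, 0};
      return isLessToy(A, B) ? 0 : 1; // A wins: fewer registers
    }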
void Cost::print(raw_ostream &OS) const {
- OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
- if (AddRecCost != 0)
- OS << ", with addrec cost " << AddRecCost;
- if (NumIVMuls != 0)
- OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
- if (NumBaseAdds != 0)
- OS << ", plus " << NumBaseAdds << " base add"
- << (NumBaseAdds == 1 ? "" : "s");
- if (ScaleCost != 0)
- OS << ", plus " << ScaleCost << " scale cost";
- if (ImmCost != 0)
- OS << ", plus " << ImmCost << " imm cost";
- if (SetupCost != 0)
- OS << ", plus " << SetupCost << " setup cost";
+ if (InsnsCost)
+ OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+ OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+ if (C.AddRecCost != 0)
+ OS << ", with addrec cost " << C.AddRecCost;
+ if (C.NumIVMuls != 0)
+ OS << ", plus " << C.NumIVMuls << " IV mul"
+ << (C.NumIVMuls == 1 ? "" : "s");
+ if (C.NumBaseAdds != 0)
+ OS << ", plus " << C.NumBaseAdds << " base add"
+ << (C.NumBaseAdds == 1 ? "" : "s");
+ if (C.ScaleCost != 0)
+ OS << ", plus " << C.ScaleCost << " scale cost";
+ if (C.ImmCost != 0)
+ OS << ", plus " << C.ImmCost << " imm cost";
+ if (C.SetupCost != 0)
+ OS << ", plus " << C.SetupCost << " setup cost";
}
-LLVM_DUMP_METHOD
-void Cost::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void Cost::dump() const {
print(errs()); errs() << '\n';
}
+#endif
LSRFixup::LSRFixup()
: UserInst(nullptr), OperandValToReplace(nullptr),
@@ -1285,10 +1410,11 @@ void LSRFixup::print(raw_ostream &OS) const {
OS << ", Offset=" << Offset;
}
-LLVM_DUMP_METHOD
-void LSRFixup::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Test whether this use as a formula which has the same registers as the given
/// formula.
@@ -1300,10 +1426,19 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
return Uniquifier.count(Key);
}
+/// Return the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
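On the Use1 example from the expectation comment later in this file, reg(a) appears in two of the three formulae, so its not-selected probability is (3 - 2) / 3 = 1/3. A self-contained sketch of the same arithmetic, with registers reduced to plain integers:

    #include <cassert>
    #include <vector>

    // A formula is modeled as the set of register ids it references; the
    // not-selected probability of a register is the fraction of the use's
    // formulae that do not reference it.
    static float notSelected(const std::vector<std::vector<int>> &Formulae,
                             int Reg) {
      unsigned FNum = 0;
      for (const auto &F : Formulae)
        for (int R : F)
          if (R == Reg) {
            ++FNum;
            break;
          }
      return float(Formulae.size() - FNum) / Formulae.size();
    }

    int main() {
      // Use1: register ids 0=a, 1={0,+,1}, 2={-1,+,1}, 3={a,+,1}.
      std::vector<std::vector<int>> Use1 = {{0, 1}, {0, 2}, {3}};
      assert(notSelected(Use1, 0) == 1.0f / 3.0f); // reg(a): in 2 of 3
      assert(notSelected(Use1, 1) == 2.0f / 3.0f); // reg({0,+,1}): in 1 of 3
    }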
+
/// If the given formula has not yet been inserted, add it to the list, and
/// return true. Return false otherwise. The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F) {
- assert(F.isCanonical() && "Invalid canonical representation");
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
if (!Formulae.empty() && RigidFormula)
return false;
@@ -1391,10 +1526,11 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", widest fixup type: " << *WidestFixupType;
}
-LLVM_DUMP_METHOD
-void LSRUse::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
+#endif
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, MemAccessTy AccessTy,
@@ -1472,7 +1608,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, MemAccessTy AccessTy,
- const Formula &F) {
+ const Formula &F, const Loop &L) {
// For the purpose of isAMCompletelyFolded either having a canonical formula
// or a scale not equal to zero is correct.
 // Problems may arise from non-canonical formulae having a scale == 0.
@@ -1480,7 +1616,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
// However, when we generate the scaled formulae, we first check that the
// scaling factor is profitable before computing the actual ScaledReg for
 // compile time's sake.
- assert((F.isCanonical() || F.Scale != 0));
+ assert((F.isCanonical(L) || F.Scale != 0));
return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}
@@ -1515,14 +1651,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
}
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F) {
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
if (!F.Scale)
return 0;
// If the use is not completely folded in that instruction, we will have to
// pay an extra cost only for scale != 1.
if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F))
+ LU.AccessTy, F, L))
return F.Scale != 1;
switch (LU.Kind) {
@@ -1718,7 +1855,7 @@ class LSRInstance {
void FinalizeChain(IVChain &Chain);
void CollectChains();
void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts);
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts);
void CollectInterestingTypesAndFactors();
void CollectFixupsAndInitialFormulae();
@@ -1772,6 +1909,8 @@ class LSRInstance {
void NarrowSearchSpaceByDetectingSupersets();
void NarrowSearchSpaceByCollapsingUnrolledCode();
void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
@@ -1792,19 +1931,15 @@ class LSRInstance {
const LSRUse &LU,
SCEVExpander &Rewriter) const;
- Value *Expand(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F,
- BasicBlock::iterator IP,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const;
+ Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ BasicBlock::iterator IP, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
- const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const;
- void Rewrite(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
public:
@@ -2191,7 +2326,7 @@ LSRInstance::OptimizeLoopTermCond() {
dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
const ConstantInt *C = D->getValue();
// Stride of one or negative one can have reuse with non-addresses.
- if (C->isOne() || C->isAllOnesValue())
+ if (C->isOne() || C->isMinusOne())
goto decline_post_inc;
// Avoid weird situations.
if (C->getValue().getMinSignedBits() >= 64 ||
@@ -2492,7 +2627,12 @@ static Value *getWideOperand(Value *Oper) {
static bool isCompatibleIVType(Value *LVal, Value *RVal) {
Type *LType = LVal->getType();
Type *RType = RVal->getType();
- return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces mean (possibly)
+ // different pointer implementations,
+ // e.g. i16 vs i32, so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
}
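The extra clause simply refuses to mix pointers from different address spaces, whose in-register representations may differ (e.g. 16-bit vs. 32-bit pointers on some targets). A toy version of the predicate, using a hypothetical TypeDesc instead of llvm::Type:

    #include <cassert>

    struct TypeDesc {
      bool IsPointer;
      unsigned AddrSpace; // only meaningful when IsPointer is true
      int Id;             // identity, standing in for type equality
    };

    static bool isCompatibleIVTypeToy(const TypeDesc &L, const TypeDesc &R) {
      if (L.Id == R.Id)
        return true; // identical types are always compatible
      return L.IsPointer && R.IsPointer && L.AddrSpace == R.AddrSpace;
    }

    int main() {
      TypeDesc P0{true, 0, 100}, Q0{true, 0, 101}, P1{true, 1, 102};
      assert(isCompatibleIVTypeToy(P0, Q0));  // both addrspace(0)
      assert(!isCompatibleIVTypeToy(P0, P1)); // addrspace mismatch: rejected
    }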
/// Return an approximation of this SCEV expression's "base", or NULL for any
@@ -2881,7 +3021,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
/// Generate an add or subtract for each IVInc in a chain to materialize the IV
/// user's operand from the previous IV user's operand.
void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) {
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
// Find the new IVOperand for the head of the chain. It may have been replaced
// by LSR.
const IVInc &Head = Chain.Incs[0];
@@ -2989,8 +3129,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
User::op_iterator UseI =
find(UserInst->operands(), U.getOperandValToReplace());
assert(UseI != UserInst->op_end() && "cannot find IV operand");
- if (IVIncSet.count(UseI))
+ if (IVIncSet.count(UseI)) {
+ DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
continue;
+ }
LSRUse::KindType Kind = LSRUse::Basic;
MemAccessTy AccessTy;
@@ -3025,8 +3167,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
// S is normalized, so normalize N before folding it into S
// to keep the result normalized.
- N = TransformForPostIncUse(Normalize, N, CI, nullptr,
- TmpPostIncLoops, SE, DT);
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
Kind = LSRUse::ICmpZero;
S = SE.getMinusSCEV(N, S);
}
@@ -3108,7 +3249,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
// Do not insert formula that we will not be able to expand.
assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
"Formula is illegal");
- if (!LU.InsertFormula(F))
+
+ if (!LU.InsertFormula(F, *L))
return false;
CountRegisters(F, LUIdx);
@@ -3347,7 +3489,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
F.BaseRegs.push_back(*J);
// We may have changed the number of register in base regs, adjust the
// formula accordingly.
- F.canonicalize();
+ F.canonicalize(*L);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
@@ -3359,7 +3501,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
/// Split out subexpressions from adds and the bases of addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
Formula Base, unsigned Depth) {
- assert(Base.isCanonical() && "Input must be in the canonical form");
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
// Arbitrarily cap recursion to protect compile time.
if (Depth >= 3)
return;
@@ -3400,7 +3542,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// rather than proceed with zero in a register.
if (!Sum->isZero()) {
F.BaseRegs.push_back(Sum);
- F.canonicalize();
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
@@ -3457,7 +3599,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
F.ScaledReg = nullptr;
} else
F.deleteBaseReg(F.BaseRegs[Idx]);
- F.canonicalize();
+ F.canonicalize(*L);
} else if (IsScaledReg)
F.ScaledReg = NewG;
else
@@ -3620,10 +3762,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (LU.Kind == LSRUse::ICmpZero &&
!Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
continue;
- // For each addrec base reg, apply the scale, if possible.
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- if (const SCEVAddRecExpr *AR =
- dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+ // For each addrec base reg, if its loop is the current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
if (FactorS->isZero())
continue;
@@ -3637,11 +3779,17 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
// The canonical representation of 1*reg is reg, which is already in
 // Base. In that case, do not try to insert the formula; it will be
 // rejected anyway.
- if (F.Scale == 1 && F.BaseRegs.empty())
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate a
+ // non-canonical Formula whose ScaledReg's loop is not L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
+ }
}
}
@@ -3668,6 +3816,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
continue;
+ F.canonicalize(*L);
(void)InsertFormula(LU, LUIdx, F);
}
}
@@ -3697,10 +3846,11 @@ void WorkItem::print(raw_ostream &OS) const {
<< " , add offset " << Imm;
}
-LLVM_DUMP_METHOD
-void WorkItem::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// Look for registers which are a constant distance apart and try to form reuse
/// opportunities between them.
@@ -3764,8 +3914,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// Compute the difference between the two.
int64_t Imm = (uint64_t)JImm - M->first;
- for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
- LUIdx = UsedByIndices.find_next(LUIdx))
+ for (unsigned LUIdx : UsedByIndices.set_bits())
// Make a memo of this use, offset, and register tuple.
if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
@@ -3821,7 +3970,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
continue;
// OK, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
} else {
// Use the immediate in a base register.
@@ -3853,7 +4002,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
goto skip_formula;
// Ok, looks good.
- NewF.canonicalize();
+ NewF.canonicalize(*this->L);
(void)InsertFormula(LU, LUIdx, NewF);
break;
skip_formula:;
@@ -3967,7 +4116,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Cost CostBest;
Regs.clear();
CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
- if (CostF < CostBest)
+ if (CostF.isLess(CostBest, TTI))
std::swap(F, Best);
DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
dbgs() << "\n"
@@ -4165,6 +4314,242 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
}
}
+/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
+/// pick the best one and delete the others.
+/// This narrowing heuristic keeps as many formulae with different
+/// Scale and ScaledReg pairs as possible while narrowing the search space.
+/// The benefit is that it is more likely to find a better solution
+/// in a formulae set with more Scale and ScaledReg variations than in one
+/// that shares a single Scale and ScaledReg. The picking-winner-reg
+/// heuristic will often keep the formulae with the same Scale and ScaledReg
+/// and filter out the others, and we want to avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
+
+ // Map the "Scale * ScaledReg" pair to the best formula of the current LSRUse.
+ typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy;
+ BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
+
+ // Return true if Formula FA is better than Formula FB.
+ auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+ // First we will try to choose the Formula with fewer new registers.
+ // For a register used by the current Formula, the more the register is
+ // shared among LSRUses, the less we increase the register number
+ // counter of the formula.
+ size_t FARegNum = 0;
+ for (const SCEV *Reg : FA.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FARegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ size_t FBRegNum = 0;
+ for (const SCEV *Reg : FB.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FBRegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ if (FARegNum != FBRegNum)
+ return FARegNum < FBRegNum;
+
+ // If the new register numbers are the same, choose the Formula with
+ // the lower Cost.
+ Cost CostFA, CostFB;
+ Regs.clear();
+ CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU);
+ Regs.clear();
+ CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU);
+ return CostFA.isLess(CostFB, TTI);
+ };
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (!F.ScaledReg)
+ continue;
+ auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+ if (IsBetterThan(F, Best))
+ std::swap(F, Best);
+ DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
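The IsBetterThan lambda above prices a base register lower the more LSRUses already share it: each register contributes NumUses - UsedByIndices.count() + 1 to the formula's tally. A toy computation with assumed numbers:

    #include <cassert>

    // Weight of one base register: a register shared by every use costs 1;
    // a register private to this use costs NumUses.
    static unsigned regWeight(unsigned NumUses, unsigned UsedByCount) {
      return NumUses - UsedByCount + 1;
    }

    int main() {
      const unsigned NumUses = 4;
      unsigned FA = regWeight(NumUses, 4); // reg shared by all 4 uses -> 1
      unsigned FB = regWeight(NumUses, 1); // reg private to this use  -> 4
      assert(FA < FB); // FA is preferred: it introduces less new state
    }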
+
+/// Delete formulas with a high expected number of registers.
+/// Assuming we don't know the value of each formula (we have already
+/// deleted all the inefficient ones), compute the probability of not
+/// being selected for each register.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the mathematical expectation of the register count for each
+/// formula. Note that for each use we exclude the probability of not being
+/// selected for that use. For example, for Use1 the probability for reg(a)
+/// would be just 1 * 1 (excluding the probability 1/3 of not being selected
+/// for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+
+ // Set of Regs which will certainly be used in the final solution, i.e.
+ // used in each formula of a solution (in the example above this is reg(c)).
+ // We can skip them in the calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Map each register to its probability of not being selected.
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+ // Delete formulas whose expected number of registers is high.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // If there is nothing to delete, continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+ // This is a temporary solution to test performance. Float should be
+ // replaced with a rounding-independent type (based on integers) to avoid
+ // different results for different target builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, the regs become unique.
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ DEBUG(dbgs() << "After pre-selection:\n";
+ print_uses(dbgs()));
+}
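A toy run of the expectation arithmetic from the comment above, for Use1's first formula reg(a) + reg({0,+,1}); the probabilities are the assumed values from the table:

    #include <cstdio>

    int main() {
      // Not-selected probabilities taken from the table in the comment.
      float PA_use1 = 1.0f / 3.0f;             // reg(a) within Use1
      float PA_global = PA_use1 * 1.0f * 1.0f; // product across Use1..Use3
      float PI_use1 = 2.0f / 3.0f;             // reg({0,+,1}) within Use1
      float PI_global = PI_use1 * (2.0f / 3.0f) * 0.5f; // = 2/9

      // Expected register count of reg(a) + reg({0,+,1}) for Use1, dividing
      // out Use1's own not-selected factor for each register:
      float Expectation = PA_global / PA_use1 + PI_global / PI_use1;
      std::printf("%f\n", Expectation); // 1 + 1/3, matching the comment
      return 0;
    }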
+
+
/// Pick a register which seems likely to be profitable, and then in any use
/// which has any reference to that register, delete all formulae which do not
/// reference that register.
@@ -4237,7 +4622,12 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
NarrowSearchSpaceByDetectingSupersets();
NarrowSearchSpaceByCollapsingUnrolledCode();
NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- NarrowSearchSpaceByPickingWinnerRegs();
+ if (FilterSameScaledReg)
+ NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
}
/// This is the recursive solver.
@@ -4294,7 +4684,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
NewCost = CurCost;
NewRegs = CurRegs;
NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
- if (NewCost < SolutionCost) {
+ if (NewCost.isLess(SolutionCost, TTI)) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
@@ -4476,12 +4866,10 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding").
-Value *LSRInstance::Expand(const LSRUse &LU,
- const LSRFixup &LF,
- const Formula &F,
- BasicBlock::iterator IP,
+Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, BasicBlock::iterator IP,
SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const {
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
if (LU.RigidFormula)
return LF.OperandValToReplace;
@@ -4515,11 +4903,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
assert(!Reg->isZero() && "Zero allocated in a base register!");
// If we're expanding for a post-inc user, make the post-inc adjustment.
- PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- Reg = TransformForPostIncUse(Denormalize, Reg,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
-
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
}
@@ -4530,9 +4914,7 @@ Value *LSRInstance::Expand(const LSRUse &LU,
// If we're expanding for a post-inc user, make the post-inc adjustment.
PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
- LF.UserInst, LF.OperandValToReplace,
- Loops, SE, DT);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
if (LU.Kind == LSRUse::ICmpZero) {
// Expand ScaleReg as if it was part of the base regs.
@@ -4662,12 +5044,9 @@ Value *LSRInstance::Expand(const LSRUse &LU,
/// Helper for Rewrite. PHI nodes are special because the use of their operands
/// effectively happens in their predecessor blocks, so the expression may need
/// to be expanded in multiple places.
-void LSRInstance::RewriteForPHI(PHINode *PN,
- const LSRUse &LU,
- const LSRFixup &LF,
- const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const {
+void LSRInstance::RewriteForPHI(
+ PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
DenseMap<BasicBlock *, Value *> Inserted;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
@@ -4739,11 +5118,9 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
/// Emit instructions for the leading candidate expression for this LSRUse (this
/// is called "expanding"), and update the UserInst to reference the newly
/// expanded value.
-void LSRInstance::Rewrite(const LSRUse &LU,
- const LSRFixup &LF,
- const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakVH> &DeadInsts) const {
+void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
// First, find an insertion point that dominates UserInst. For PHI nodes,
// find the nearest block which dominates all the relevant uses.
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
@@ -4781,7 +5158,7 @@ void LSRInstance::ImplementSolution(
const SmallVectorImpl<const Formula *> &Solution) {
// Keep track of instructions we may have made dead, so that
// we can remove them after we are done working.
- SmallVector<WeakVH, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(),
"lsr");
@@ -4975,10 +5352,11 @@ void LSRInstance::print(raw_ostream &OS) const {
print_uses(OS);
}
-LLVM_DUMP_METHOD
-void LSRInstance::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -5030,7 +5408,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// Remove any extra phis created by processing inner loops.
Changed |= DeleteDeadPHIs(L->getHeader());
if (EnablePhiElim && L->isLoopSimplifyForm()) {
- SmallVector<WeakVH, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
SCEVExpander Rewriter(SE, DL, "lsr");
#ifndef NDEBUG
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index c7f9122..530a684 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -44,7 +44,11 @@ using namespace llvm;
static cl::opt<unsigned>
UnrollThreshold("unroll-threshold", cl::Hidden,
- cl::desc("The baseline cost threshold for loop unrolling"));
+ cl::desc("The cost threshold for loop unrolling"));
+
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
"unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
@@ -106,10 +110,19 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
"aggressively unrolled."));
static cl::opt<bool>
- UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
+ UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
cl::desc("Allows loops to be peeled when the dynamic "
"trip count is known to be low."));
+// This option isn't ever intended to be enabled; it serves to allow
+// experiments to check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
@@ -118,16 +131,17 @@ static const unsigned NoThreshold = UINT_MAX;
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
- Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
- Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
- Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
- UP.Threshold = 150;
+ UP.Threshold = OptLevel > 2 ? 300 : 150;
UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = 0;
- UP.PartialThreshold = UP.Threshold;
+ UP.PartialThreshold = 150;
UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
UP.PeelCount = 0;
@@ -141,10 +155,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
UP.AllowExpensiveTripCount = false;
UP.Force = false;
UP.UpperBound = false;
- UP.AllowPeeling = false;
+ UP.AllowPeeling = true;
// Override with any target specific settings
- TTI.getUnrollingPreferences(L, UP);
+ TTI.getUnrollingPreferences(L, SE, UP);
// Apply size attributes
if (L->getHeader()->getParent()->optForSize()) {
@@ -153,10 +167,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
}
// Apply any user values specified by cl::opt
- if (UnrollThreshold.getNumOccurrences() > 0) {
+ if (UnrollThreshold.getNumOccurrences() > 0)
UP.Threshold = UnrollThreshold;
- UP.PartialThreshold = UnrollThreshold;
- }
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
if (UnrollMaxCount.getNumOccurrences() > 0)
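The preference plumbing in this function is a layered override chain: built-in defaults, then the TTI hook, then size attributes, then explicit cl::opt flags, with the per-call user parameters applied last (outside this hunk). A toy version of the pattern; only the default and flag layers are modeled faithfully:

    #include <cassert>

    struct ToyPrefs {
      unsigned Threshold;
      unsigned PartialThreshold;
    };

    // Later layers win, and a layer only writes the fields it actually sets.
    static ToyPrefs gather(int OptLevel, bool OptForSize,
                           const unsigned *FlagThreshold) {
      ToyPrefs P;
      P.Threshold = OptLevel > 2 ? 300 : 150; // layer 1: defaults
      P.PartialThreshold = 150;
      // Layer 2, the TTI override, would mutate P here.
      if (OptForSize)    // layer 3: size attributes (really OptSizeThreshold)
        P.Threshold = 0;
      if (FlagThreshold) // layer 4: explicit cl::opt flag
        P.Threshold = *FlagThreshold;
      return P;
    }

    int main() {
      unsigned Flag = 42;
      assert(gather(3, false, nullptr).Threshold == 300); // -O3 default
      assert(gather(2, true, &Flag).Threshold == 42);     // flag wins
    }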
@@ -495,7 +509,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
KnownSucc = SI->getSuccessor(0);
else if (ConstantInt *SimpleCondVal =
dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
}
}
if (KnownSucc) {
@@ -685,7 +699,7 @@ static uint64_t getUnrolledLoopSize(
// Calculates unroll count and writes it to UP.Count.
static bool computeUnrollCount(
Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
+ ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
// Check for explicit Count.
@@ -756,7 +770,7 @@ static bool computeUnrollCount(
// helps to remove a significant number of instructions.
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, *SE, TTI,
+ L, FullUnrollTripCount, DT, SE, TTI,
UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
@@ -770,7 +784,15 @@ static bool computeUnrollCount(
}
}
- // 4rd priority is partial unrolling.
+ // 4th priority is loop peeling
+ computePeelCount(L, LoopSize, UP, TripCount);
+ if (UP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
 // Try partial unrolling only when the TripCount can be statically calculated.
if (TripCount) {
UP.Partial |= ExplicitUnroll;
@@ -814,6 +836,8 @@ static bool computeUnrollCount(
} else {
UP.Count = TripCount;
}
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
UP.Count != TripCount)
ORE->emit(
@@ -833,14 +857,6 @@ static bool computeUnrollCount(
<< "Unable to fully unroll loop as directed by unroll(full) pragma "
"because loop has a runtime trip count.");
- // 5th priority is loop peeling
- computePeelCount(L, LoopSize, UP);
- if (UP.PeelCount) {
- UP.Runtime = false;
- UP.Count = 1;
- return ExplicitUnroll;
- }
-
// 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
if (HasRuntimeUnrollDisablePragma(L)) {
@@ -912,9 +928,9 @@ static bool computeUnrollCount(
}
static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE, const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
- bool PreserveLCSSA,
+ bool PreserveLCSSA, int OptLevel,
Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold,
Optional<bool> ProvidedAllowPartial,
@@ -934,8 +950,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
bool NotDuplicatable;
bool Convergent;
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound);
+ L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound);
// Exit early if unrolling is disabled.
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
return false;
@@ -963,8 +979,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
ExitingBlock = L->getExitingBlock();
if (ExitingBlock) {
- TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
- TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+ TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
}
// If the loop contains a convergent operation, the prelude we'd add
@@ -986,8 +1002,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// count.
bool MaxOrZero = false;
if (!TripCount) {
- MaxTripCount = SE->getSmallConstantMaxTripCount(L);
- MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
+ MaxTripCount = SE.getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
// We can unroll by the upper bound amount if it's generally allowed or if
// we know that the loop is executed either the upper bound or zero times.
// (MaxOrZero unrolling keeps only the first loop test, so the number of
@@ -1016,7 +1032,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// Unroll the loop.
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
- TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE,
+ TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE,
PreserveLCSSA))
return false;
@@ -1034,16 +1050,17 @@ namespace {
class LoopUnroll : public LoopPass {
public:
static char ID; // Pass ID, replacement for typeid
- LoopUnroll(Optional<unsigned> Threshold = None,
+ LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
Optional<unsigned> Count = None,
Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
Optional<bool> UpperBound = None)
- : LoopPass(ID), ProvidedCount(std::move(Count)),
+ : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
+ int OptLevel;
Optional<unsigned> ProvidedCount;
Optional<unsigned> ProvidedThreshold;
Optional<bool> ProvidedAllowPartial;
@@ -1058,7 +1075,7 @@ public:
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -1068,7 +1085,7 @@ public:
OptimizationRemarkEmitter ORE(&F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
- return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
+ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel,
ProvidedCount, ProvidedThreshold,
ProvidedAllowPartial, ProvidedRuntime,
ProvidedUpperBound);
@@ -1094,26 +1111,27 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
- int Runtime, int UpperBound) {
+Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
+ int AllowPartial, int Runtime,
+ int UpperBound) {
// TODO: It would make more sense for this function to take the optionals
// directly, but that's dangerous since it would silently break out of tree
// callers.
- return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold),
- Count == -1 ? None : Optional<unsigned>(Count),
- AllowPartial == -1 ? None
- : Optional<bool>(AllowPartial),
- Runtime == -1 ? None : Optional<bool>(Runtime),
- UpperBound == -1 ? None : Optional<bool>(UpperBound));
+ return new LoopUnroll(
+ OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound));
}
-Pass *llvm::createSimpleLoopUnrollPass() {
- return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
+ return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0);
}
PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+ LPMUpdater &Updater) {
const auto &FAM =
AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
Function *F = L.getHeader()->getParent();
@@ -1124,12 +1142,84 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not "
"cached at a higher level");
- bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
- /*PreserveLCSSA*/ true, ProvidedCount,
- ProvidedThreshold, ProvidedAllowPartial,
- ProvidedRuntime, ProvidedUpperBound);
-
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ // The API here is quite complex to call, but there are only two interesting
+ // states we support: partial and full (or "simple") unrolling. However, to
+ // enable these we actually pass "None" in for the optionals rather than
+ // providing an explicit choice.
+ Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
+ if (!AllowPartialUnrolling)
+ AllowPartialParam = RuntimeParam = UpperBoundParam = false;
+ bool Changed = tryToUnrollLoop(
+ &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+ /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
if (!Changed)
return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Partial unrolling clones child loops within the current loop. If it
+ // uses a remainder, then it can also create any number of sibling loops.
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ // - Loop peeling can directly introduce new sibling loops by peeling one
+ // iteration.
+ //
+ // When a new loop appears as a sibling loop, either from peeling an
+ // iteration or fully unrolling, its nesting structure has fundamentally
+ // changed and we want to revisit it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
+ return OldLoops.count(SibLoop) != 0;
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops. This is a highly speculative mode
+ // anyway, so look for any simplifications that arose from partial
+ // unrolling or from peeling off iterations.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
return getLoopPassPreservedAnalyses();
}
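The sibling-diff idiom this hunk introduces — snapshot the sibling list into OldLoops, unroll, then erase every survivor so only newly created loops remain to be queued — is worth seeing in isolation. A minimal sketch with STL containers standing in for LLVM's Loop lists (findNewSiblings and the integer loop ids are illustrative, not LLVM API):

#include <algorithm>
#include <cassert>
#include <set>
#include <vector>

// Snapshot the sibling list before the transform, then keep only the
// entries of the new list that were not already present.
std::vector<int> findNewSiblings(const std::vector<int> &Before,
                                 std::vector<int> After) {
  std::set<int> Old(Before.begin(), Before.end());
  After.erase(std::remove_if(After.begin(), After.end(),
                             [&](int L) { return Old.count(L) != 0; }),
              After.end());
  return After;
}

int main() {
  // {1, 2} existed before unrolling; {3, 4} stand in for e.g. a remainder
  // loop and a peeled-off copy, so only they get queued for a fresh visit.
  assert(findNewSiblings({1, 2}, {1, 2, 3, 4}) == (std::vector<int>{3, 4}));
}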
diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 76fe918..d0c96fa 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
// This pass transforms loops that contain branches on loop-invariant conditions
-// to have multiple loops. For example, it turns the left into the right code:
+// to multiple loops. For example, it turns the left into the right code:
//
// for (...) if (lic)
// A for (...)
@@ -26,32 +26,34 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Support/BranchProbability.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -77,19 +79,6 @@ static cl::opt<unsigned>
Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
cl::init(100), cl::Hidden);
-static cl::opt<bool>
-LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency",
- cl::init(false), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to access PGO "
- "heuristics to minimize code growth in cold regions."));
-
-static cl::opt<unsigned>
-ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden,
- cl::desc("Coldness threshold in percentage. The loop header frequency "
- "(relative to the entry frequency) is compared with this "
- "threshold to determine if non-trivial unswitching should be "
- "enabled."));
-
namespace {
class LUAnalysisCache {
@@ -174,13 +163,6 @@ namespace {
LUAnalysisCache BranchesInfo;
- bool EnabledPGO;
-
- // BFI and ColdEntryFreq are only used when PGO and
- // LoopUnswitchWithBlockFrequency are enabled.
- BlockFrequencyInfo BFI;
- BlockFrequency ColdEntryFreq;
-
bool OptimizeForSize;
bool redoLoop;
@@ -199,12 +181,14 @@ namespace {
// NewBlocks contained cloned copy of basic blocks from LoopBlocks.
std::vector<BasicBlock*> NewBlocks;
+ bool hasBranchDivergence;
+
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopUnswitch(bool Os = false) :
+ explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
- loopPreheader(nullptr) {
+ loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
}
@@ -217,6 +201,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (hasBranchDivergence)
+ AU.addRequired<DivergenceAnalysis>();
getLoopAnalysisUsage(AU);
}
@@ -255,6 +241,11 @@ namespace {
TerminatorInst *TI);
void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+
+ /// Given that Invariant is not equal to Val, simplify instructions
+ /// in the loop.
+ Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
};
}
@@ -381,16 +372,35 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
-Pass *llvm::createLoopUnswitchPass(bool Os) {
- return new LoopUnswitch(Os);
+Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
+ return new LoopUnswitch(Os, hasBranchDivergence);
}
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant. Otherwise, return null.
+///
+/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we cannot reliably find a value which will
+/// simplify the operator chain. If the chain is AND-only or OR-only, we can
+/// use 0 or ~0 to simplify the chain.
+///
+/// NOTE: In case of a partial LIV and a mixed operator chain, we may be able
+/// to simplify the condition itself to a loop variant condition, but at the
+/// cost of creating an entirely new loop.
static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
DenseMap<Value *, Value *> &Cache) {
auto CacheIt = Cache.find(Cond);
if (CacheIt != Cache.end())
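The chain lattice declared above is a simple join: None absorbs the first opcode, a matching opcode keeps the state, and any mismatch collapses to Mixed, which is absorbing. A standalone restatement mirroring the switch added in the next hunk (not LLVM code; Opcode is a stand-in for the two opcodes the walk cares about):

#include <cassert>

enum OperatorChain { OC_OpChainNone, OC_OpChainOr, OC_OpChainAnd, OC_OpChainMixed };
enum Opcode { OpAnd, OpOr };

OperatorChain joinChain(OperatorChain Parent, Opcode Op) {
  switch (Parent) {
  case OC_OpChainNone:  return Op == OpAnd ? OC_OpChainAnd : OC_OpChainOr;
  case OC_OpChainAnd:   return Op == OpAnd ? OC_OpChainAnd : OC_OpChainMixed;
  case OC_OpChainOr:    return Op == OpOr  ? OC_OpChainOr  : OC_OpChainMixed;
  case OC_OpChainMixed: return OC_OpChainMixed;
  }
  return OC_OpChainMixed; // not reached; placates compilers
}

int main() {
  // (a & b) & c stays an AND-only chain; one OR poisons it to Mixed, and
  // no later opcode can recover the chain.
  OperatorChain C = joinChain(joinChain(OC_OpChainNone, OpAnd), OpAnd);
  assert(C == OC_OpChainAnd);
  assert(joinChain(joinChain(C, OpOr), OpAnd) == OC_OpChainMixed);
}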
@@ -414,21 +424,53 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return Cond;
}
+ // Walk up the operator chain to find partial invariant conditions.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
if (BO->getOpcode() == Instruction::And ||
BO->getOpcode() == Instruction::Or) {
- // If either the left or right side is invariant, we can unswitch on this,
- // which will cause the branch to go away in one loop and the condition to
- // simplify in the other one.
- if (Value *LHS =
- FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) {
- Cache[Cond] = LHS;
- return LHS;
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
}
- if (Value *RHS =
- FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) {
- Cache[Cond] = RHS;
- return RHS;
+
+ // If we reach a Mixed state, we do not want to keep walking up, as we cannot
+ // reliably find a value that will simplify the chain. With this check, we
+ // will return null at the first sight of a mixed chain, and the caller will
+ // either backtrack to find a partial LIV in the other operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
}
}
@@ -436,9 +478,21 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return nullptr;
}
-static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
+ Loop *L,
+ bool &Changed) {
DenseMap<Value *, Value *> Cache;
- return FindLIVLoopCondition(Cond, L, Changed, Cache);
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+
+ // In case we do find a LIV, it cannot have been obtained by walking up a
+ // mixed operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
}
bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
@@ -457,19 +511,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
if (SanitizeMemory)
computeLoopSafetyInfo(&SafetyInfo, L);
- EnabledPGO = F->getEntryCount().hasValue();
-
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- BranchProbabilityInfo BPI(*F, *LI);
- BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
-
- // Use BranchProbability to compute a minimum frequency based on
- // function entry baseline frequency. Loops with headers below this
- // frequency are considered as cold.
- const BranchProbability ColdProb(ColdnessThreshold, 100);
- ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
- }
-
bool Changed = false;
do {
assert(currentLoop->isLCSSAForm(*DT));
@@ -581,19 +622,9 @@ bool LoopUnswitch::processCurrentLoop() {
loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
return false;
- if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
- // Compute the weighted frequency of the hottest block in the
- // loop (loopHeader in this case since inner loops should be
- // processed before outer loop). If it is less than ColdFrequency,
- // we should not unswitch.
- BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader);
- if (LoopEntryFreq < ColdEntryFreq)
- return false;
- }
-
for (IntrinsicInst *Guard : Guards) {
Value *LoopCond =
- FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed);
+ FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
// NB! Unswitching (if successful) could have erased some of the
@@ -634,7 +665,7 @@ bool LoopUnswitch::processCurrentLoop() {
// See if this, or some part of it, is loop invariant. If so, we can
// unswitch on it if we desire.
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond &&
UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
++NumBranches;
@@ -642,24 +673,48 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ FindLIVLoopCondition(SC, currentLoop, Changed);
+
unsigned NumCases = SI->getNumCases();
if (LoopCond && NumCases) {
// Find a value to unswitch on:
// FIXME: this should choose the most expensive case!
// FIXME: scan for a case with a non-critical edge?
Constant *UnswitchVal = nullptr;
-
- // Do not process same value again and again.
- // At this point we have some cases already unswitched and
- // some not yet unswitched. Let's find the first not yet unswitched one.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- Constant *UnswitchValCandidate = i.getCaseValue();
- if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
- UnswitchVal = UnswitchValCandidate;
- break;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+ // If the chain has only ANDs and the switch has a case value of 0,
+ // dropping a 0 into the chain will unswitch out the 0 case value.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+ // If the chain has only ORs and the switch has a case value of ~0,
+ // dropping a ~0 into the chain will unswitch out the ~0 case value.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+ // Do not process same value again and again.
+ // At this point we have some cases already unswitched and
+ // some not yet unswitched. Let's find the first not yet unswitched one.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
}
}
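Why 0 and ~0 are the right values to drop in: with the invariant substituted, an AND-only chain is forced to 0 and an OR-only chain is forced to all-ones, so the corresponding switch case is fully decided. A sketch with concrete integers (values are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  uint8_t v = 0xAC; // stands in for the loop-varying part of the condition
  // AND-only chain: substituting 0 for the invariant forces the whole
  // chain to 0, so the switch's 0 case is decided and can be unswitched out.
  uint8_t inv = 0;
  assert((v & inv & 0xF0) == 0);
  // OR-only chain: substituting ~0 forces the chain to all-ones, deciding
  // the ~0 (here 0xFF) case the same way.
  inv = 0xFF;
  assert((v | inv | 0x0F) == 0xFF);
}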
@@ -668,6 +723,11 @@ bool LoopUnswitch::processCurrentLoop() {
if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
++NumSwitches;
+ // In case of a full LIV, UnswitchVal is the value we unswitched out.
+ // In case of a partial LIV, we only unswitch when it is an AND-chain
+ // or an OR-chain. In both cases the switch input value simplifies to
+ // UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
return true;
}
}
@@ -678,7 +738,7 @@ bool LoopUnswitch::processCurrentLoop() {
BBI != E; ++BBI)
if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
if (LoopCond && UnswitchIfProfitable(LoopCond,
ConstantInt::getTrue(Context))) {
++NumSelects;
@@ -753,6 +813,15 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
<< ". Cost too high.\n");
return false;
}
+ if (hasBranchDivergence &&
+ getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+ DEBUG(dbgs() << "NOT unswitching loop %"
+ << currentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
+ return false;
+ }
UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
return true;
@@ -762,7 +831,12 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
/// mapping the blocks with the specified map.
static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
LoopInfo *LI, LPPassManager *LPM) {
- Loop &New = LPM->addLoop(PL);
+ Loop &New = *new Loop();
+ if (PL)
+ PL->addChildLoop(&New);
+ else
+ LI->addTopLevelLoop(&New);
+ LPM->addLoop(New);
// Add all of the blocks in L to the new loop.
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
@@ -899,7 +973,6 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (I.mayHaveSideEffects())
return false;
- // FIXME: add check for constant foldable switch instructions.
if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
if (BI->isUnconditional()) {
CurrentBB = BI->getSuccessor(0);
@@ -911,7 +984,16 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// Found a trivial condition candidate: non-foldable conditional branch.
break;
}
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // At this point, any constant-foldable instructions should probably
+ // have been folded already.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
} else {
+ // We do not understand these terminator instructions.
break;
}
@@ -929,7 +1011,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
return false;
Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -960,7 +1042,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
// If this isn't switching on an invariant condition, we can't unswitch it.
Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
- currentLoop, Changed);
+ currentLoop, Changed).first;
// Unswitch only if the trivial condition itself is an LIV (not
// partial LIV which could occur in and/or)
@@ -973,13 +1055,12 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// this.
// Note that we can't trivially unswitch on the default case or
// on already unswitched cases.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
+ for (auto Case : SI->cases()) {
BasicBlock *LoopExitCandidate;
- if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
- i.getCaseSuccessor()))) {
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
// Okay, we found a trivial case, remember the value that is trivial.
- ConstantInt *CaseVal = i.getCaseValue();
+ ConstantInt *CaseVal = Case.getCaseValue();
// Check that it was not unswitched before, since already unswitched
// trivial values look trivial too.
@@ -998,6 +1079,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
++NumSwitches;
return true;
}
@@ -1152,11 +1236,12 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
LoopProcessWorklist.push_back(NewLoop);
redoLoop = true;
- // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody
+ // Keep a WeakTrackingVH holding onto LIC. If the first call to
+ // RewriteLoopBody
// deletes the instruction (for example by simplifying a PHI that feeds into
// the condition that we're unswitching on), we don't rewrite the second
// iteration.
- WeakVH LICHandle(LIC);
+ WeakTrackingVH LICHandle(LIC);
// Now we rewrite the original code to know that the condition is true and the
// new code to know that the condition is false.
@@ -1183,7 +1268,7 @@ static void RemoveFromWorklist(Instruction *I,
static void ReplaceUsesOfWith(Instruction *I, Value *V,
std::vector<Instruction*> &Worklist,
Loop *L, LPPassManager *LPM) {
- DEBUG(dbgs() << "Replace with '" << *V << "': " << *I);
+ DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1196,7 +1281,8 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
LPM->deleteSimpleAnalysisValue(I, L);
RemoveFromWorklist(I, Worklist);
I->replaceAllUsesWith(V);
- I->eraseFromParent();
+ if (!I->mayHaveSideEffects())
+ I->eraseFromParent();
++NumSimplify;
}
@@ -1253,18 +1339,38 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
if (!UI || !L->contains(UI))
continue;
- Worklist.push_back(UI);
+ // At this point, we know LIC is definitely not Val. Try to use some simple
+ // logic to simplify the user w.r.t. the context.
+ if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+ // This in-loop instruction has been simplified w.r.t. its context,
+ // i.e. LIC != Val, make sure we propagate its replacement value to
+ // all its users.
+ //
+ // We cannot delete UI, the LIC user, yet, because that would invalidate
+ // the LIC->users() iterator. However, we can make this instruction
+ // dead by replacing all of its uses, and push it onto the worklist so that
+ // it can be properly deleted and its operands simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
- // TODO: We could do other simplifications, for example, turning
- // 'icmp eq LIC, Val' -> false.
+ // This is a LIC user, push it into the worklist so that SimplifyCode can
+ // attempt to simplify it.
+ Worklist.push_back(UI);
// If we know that LIC is not Val, use this info to simplify code.
SwitchInst *SI = dyn_cast<SwitchInst>(UI);
if (!SI || !isa<ConstantInt>(Val)) continue;
- SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+ // NOTE: if a case value for the switch is unswitched out, we record it
+ // after the unswitch finishes. We cannot record it here as the switch
+ // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
// Default case is live for multiple values.
- if (DeadCase == SI->case_default()) continue;
+ if (DeadCase == *SI->case_default())
+ continue;
// Found a dead case value. Don't remove PHI nodes in the
// successor if they become single-entry, those PHI nodes may
@@ -1274,8 +1380,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
BasicBlock *SISucc = DeadCase.getCaseSuccessor();
BasicBlock *Latch = L->getLoopLatch();
- BranchesInfo.setUnswitched(SI, Val);
-
if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
// If the DeadCase successor dominates the loop latch, then the
// transformation isn't safe since it will delete the sole predecessor edge
@@ -1334,7 +1438,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// Simple DCE.
if (isInstructionTriviallyDead(I)) {
- DEBUG(dbgs() << "Remove dead instruction '" << *I);
+ DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
// Add uses to the worklist, which may be dead now.
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
@@ -1397,3 +1501,27 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
}
}
}
+
+/// Simple simplifications we can do given the information that Cond is
+/// definitely not equal to Val.
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq cond, val -> false
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
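SimplifyInstructionWithNotEqual exploits exactly one fact. A hedged restatement in plain C++ (foldCmpKnownNotEqual is a made-up name, and ints stand in for SSA value identities):

#include <cassert>
#include <optional>

// Given the known fact Invariant != Val, an equality compare of exactly
// those two operands folds to a constant:
//   (Invariant == Val) -> false,  (Invariant != Val) -> true.
std::optional<bool> foldCmpKnownNotEqual(bool IsEq, int LHS, int RHS,
                                         int Invariant, int Val) {
  bool Matches = (LHS == Invariant && RHS == Val) ||
                 (LHS == Val && RHS == Invariant);
  if (!Matches)
    return std::nullopt; // the fact says nothing about this compare
  return !IsEq;
}

int main() {
  // In the loop body where we know lic (id 1) != 7, 'lic == 7' folds false.
  assert(foldCmpKnownNotEqual(/*IsEq=*/true, 1, 7, /*Invariant=*/1,
                              /*Val=*/7) == false);
}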
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
index 08e60b1..6f77c5b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -155,8 +155,7 @@ public:
}
bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
+ // Don't skip optnone functions; atomics still need to be lowered.
FunctionAnalysisManager DummyFAM;
auto PA = Impl.run(F, DummyFAM);
return !PA.areAllPreserved();
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 52975ef..46f8a35 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -14,6 +14,7 @@
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -67,11 +68,11 @@ static bool handleSwitchExpect(SwitchInst &SI) {
if (!ExpectedValue)
return false;
- SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
unsigned n = SI.getNumCases(); // +1 for default case.
SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
- if (Case == SI.case_default())
+ if (Case == *SI.case_default())
Weights[0] = LikelyBranchWeight;
else
Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
@@ -83,6 +84,151 @@ static bool handleSwitchExpect(SwitchInst &SI) {
return true;
}
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If the operand of the phi has a constant value and it 'contradicts'
+/// the expected value of the phi def, then the corresponding incoming
+/// edge of the phi is unlikely to be taken. Using that information,
+/// the branch probability info for the originating branch can be inferred.
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
+ if (!ExpectedValue)
+ return;
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+ // Walk backward up the chain of instructions that
+ // have 'copy' semantics by 'stripping' the copies
+ // until a PHI node or an instruction of unknown kind
+ // is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = dyn_cast<PHINode>(V);
+
+ // Get the first dominating conditional branch of the operand
+ // i's incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+ // Now walk through all Phi operands to find those with values
+ // conflicting with the expected phi output value. Any such operand
+ // indicates the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+ // Not an interesting case -- we cannot infer anything useful when the
+ // operand value matches the expected phi output.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+ // Returns true if the operand which comes from OpndIncomingBB
+ // comes from the outgoing edge of BI that leads to the Succ block.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight));
+ }
+}
+
// Handle both BranchInst and SelectInst.
template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
@@ -98,10 +244,18 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
CallInst *CI;
ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
+ CmpInst::Predicate Predicate;
+ ConstantInt *CmpConstOperand = nullptr;
if (!CmpI) {
CI = dyn_cast<CallInst>(BSI.getCondition());
+ Predicate = CmpInst::ICMP_NE;
} else {
- if (CmpI->getPredicate() != CmpInst::ICMP_NE)
+ Predicate = CmpI->getPredicate();
+ if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
+ return false;
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+ if (!CmpConstOperand)
return false;
CI = dyn_cast<CallInst>(CmpI->getOperand(0));
}
@@ -109,6 +263,13 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
if (!CI)
return false;
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
Function *Fn = CI->getCalledFunction();
if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
return false;
@@ -121,9 +282,8 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
MDBuilder MDB(CI->getContext());
MDNode *Node;
- // If expect value is equal to 1 it means that we are more likely to take
- // branch 0, in other case more likely is branch 1.
- if (ExpectedValue->isOne())
+ if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
+ (Predicate == CmpInst::ICMP_EQ))
Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
else
Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
@@ -173,6 +333,10 @@ static bool lowerExpectIntrinsic(Function &F) {
Function *Fn = CI->getCalledFunction();
if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+ // Before erasing the llvm.expect, walk backward to find
+ // phi that define llvm.expect's first arg, and
+ // infer branch probability:
+ handlePhiDef(CI);
Value *Exp = CI->getArgOperand(0);
CI->replaceAllUsesWith(Exp);
CI->eraseFromParent();
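Taken together, the handleBrSelExpect changes let expectations flow through explicit comparisons rather than only the bare NE-against-zero pattern. A small source-level illustration of the intended weighting (assuming a Clang/GCC-style __builtin_expect; the function names are made up):

// The rule: the true edge is likely exactly when (expected value ==
// compared-to value) agrees with the predicate being EQ.
long take_this_branch(long x) {
  if (__builtin_expect(x, 5) == 5) // expectation and compare agree: likely
    return 1;
  return 0;
}

long skip_this_branch(long x) {
  if (__builtin_expect(x, 0) == 5) // expected 0, compared to 5: unlikely
    return 1;
  return 0;
}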
diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index 4f41371..070114a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -17,10 +17,10 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 1b59014..7896396 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -13,19 +13,48 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "memcpyopt"
@@ -119,6 +148,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
return true;
}
+namespace {
/// Represents a range of memset'd bytes with the ByteVal value.
/// This allows us to analyze stores like:
@@ -130,7 +160,6 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
/// the first store, we make a range [1, 2). The second store extends the range
/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
/// two ranges into [0, 3) which is memset'able.
-namespace {
struct MemsetRange {
// Start/End - A semi range that describes the span that this range covers.
// The range is closed at the start and open at the end: [Start, End).
@@ -148,7 +177,8 @@ struct MemsetRange {
bool isProfitableToUseMemset(const DataLayout &DL) const;
};
-} // end anon namespace
+
+} // end anonymous namespace
bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
// If we found more than 4 stores to merge or 16 bytes, use memset.
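The range-joining behaviour described in the MemsetRange comment above ([1,2), then [0,2), then joining into a memset'able [0,3)) is ordinary half-open interval merging. A minimal sketch with plain pairs (not the MemsetRanges class itself; re-sorting keeps the sketch short at the cost of efficiency):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using Range = std::pair<int64_t, int64_t>; // [Start, End)

// Insert [S, E) and coalesce anything it touches or overlaps.
void addRange(std::vector<Range> &Rs, int64_t S, int64_t E) {
  Rs.push_back({S, E});
  std::sort(Rs.begin(), Rs.end());
  std::vector<Range> Out{Rs.front()};
  for (size_t I = 1; I < Rs.size(); ++I) {
    if (Rs[I].first <= Out.back().second) // adjacent or overlapping
      Out.back().second = std::max(Out.back().second, Rs[I].second);
    else
      Out.push_back(Rs[I]);
  }
  Rs = Out;
}

int main() {
  std::vector<Range> Rs;
  addRange(Rs, 1, 2); // [1,2)
  addRange(Rs, 0, 1); // extends to [0,2)
  addRange(Rs, 2, 3); // joins into [0,3), which is memset'able
  assert(Rs.size() == 1 && Rs[0] == Range(0, 3));
}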
@@ -192,13 +222,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
return TheStores.size() > NumPointerStores+NumByteStores;
}
-
namespace {
+
class MemsetRanges {
/// A sorted list of the memset ranges.
SmallVector<MemsetRange, 8> Ranges;
typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
const DataLayout &DL;
+
public:
MemsetRanges(const DataLayout &DL) : DL(DL) {}
@@ -231,8 +262,7 @@ public:
};
-} // end anon namespace
-
+} // end anonymous namespace
/// Add a new store to the MemsetRanges data structure. This adds a
/// new range for the specified store at the specified offset, merging into
@@ -299,48 +329,36 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
//===----------------------------------------------------------------------===//
namespace {
- class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
- public:
- static char ID; // Pass identification, replacement for typeid
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
- }
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
- // Helper functions
- bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
- bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
- bool processMemCpy(MemCpyInst *M);
- bool processMemMove(MemMoveInst *M);
- bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
- uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
- bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
- bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
- bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
- bool processByValArgument(CallSite CS, unsigned ArgNo);
- Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
- Value *ByteVal);
-
- bool iterateOnFunction(Function &F);
- };
+public:
+ static char ID; // Pass identification, replacement for typeid
- char MemCpyOptLegacyPass::ID = 0;
-}
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // This transformation requires dominator postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+};
+
+char MemCpyOptLegacyPass::ID = 0;
+
+} // end anonymous namespace
/// The public interface to this file...
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
@@ -523,14 +541,15 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
if (Args.erase(C))
NeedLift = true;
else if (MayAlias) {
- NeedLift = any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
+ NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) {
return AA.getModRefInfo(C, ML);
});
if (!NeedLift)
- NeedLift = any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
- return AA.getModRefInfo(C, CS);
- });
+ NeedLift =
+ llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) {
+ return AA.getModRefInfo(C, CS);
+ });
}
if (!NeedLift)
@@ -567,7 +586,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P,
}
// We made it, we need to lift
- for (auto *I : reverse(ToLift)) {
+ for (auto *I : llvm::reverse(ToLift)) {
DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
I->moveBefore(P);
}
@@ -761,7 +780,6 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
return false;
}
-
/// Takes a memcpy and a call that it depends on,
/// and checks for the possibility of a call slot optimization by having
/// the call write its result directly into the destination of the memcpy.
@@ -914,6 +932,17 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
if (MR != MRI_NoModRef)
return false;
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned i = 0; i < CS.arg_size(); ++i)
+ if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ CS.getArgument(i)->getType()->getPointerAddressSpace())
+ return false;
+
// All the checks have passed, so do the transformation.
bool changedArgument = false;
for (unsigned i = 0; i < CS.arg_size(); ++i)
@@ -1240,7 +1269,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
AliasAnalysis &AA = LookupAliasAnalysis();
- if (!TLI->has(LibFunc::memmove))
+ if (!TLI->has(LibFunc_memmove))
return false;
// See if the pointers alias.
@@ -1294,7 +1323,7 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
// Get the alignment of the byval. If the call doesn't specify the alignment,
// then it is some target specific value that we can't know.
- unsigned ByValAlign = CS.getParamAlignment(ArgNo+1);
+ unsigned ByValAlign = CS.getParamAlignment(ArgNo);
if (ByValAlign == 0) return false;
// If it is greater than the memcpy, then we check to see if we can force the
@@ -1306,6 +1335,11 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
CS.getInstruction(), &AC, &DT) < ByValAlign)
return false;
+ // The address space of the memcpy source must match the byval argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
// Verify that the copied-from memory doesn't change in between the memcpy and
// the byval call.
// memcpy(a <- b)
@@ -1375,7 +1409,6 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
}
PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
-
auto &MD = AM.getResult<MemoryDependenceAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
@@ -1393,7 +1426,9 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
LookupAssumptionCache, LookupDomTree);
if (!MadeChange)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
@@ -1414,10 +1449,10 @@ bool MemCpyOptPass::runImpl(
// If we don't have at least memset and memcpy, there is little point in doing
// anything here. These are required by a freestanding implementation, so if
// even they are disabled, there is no point in trying hard.
- if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy))
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
return false;
- while (1) {
+ while (true) {
if (!iterateOnFunction(F))
break;
MadeChange = true;
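The two new address-space guards in performCallSlotOptzn and processByValArgument exist because a pointer in one address space cannot, in general, be reinterpreted in another, and the pass must not invent addrspacecasts. A hedged C-level picture of the call slot rewrite they protect (producer is a hypothetical callee, not part of the pass):

#include <cstring>

void producer(char *out); // hypothetical callee that fills 'out'

// Before: the call writes a temporary, then memcpy copies it to 'dest'.
void beforeOpt(char *dest) {
  char tmp[64];
  producer(tmp);
  std::memcpy(dest, tmp, 64);
}

// After call slot optimization: the call writes 'dest' directly and the
// memcpy disappears. This is only sound when 'dest' is as valid a target
// as 'tmp' was — big enough, not clobbered in between, and (per the new
// checks) in the same address space.
void afterOpt(char *dest) {
  producer(dest);
}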
diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 6a64c6b..6727cf0 100644
--- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -19,6 +19,8 @@
// thinks it safe to do so. This optimization helps with e.g. hiding load
// latencies, triggering if-conversion, and reducing static code size.
//
+// NOTE: This code no longer performs load hoisting; it is subsumed by GVNHoist.
+//
//===----------------------------------------------------------------------===//
//
//
@@ -87,7 +89,6 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -118,16 +119,6 @@ private:
void removeInstruction(Instruction *Inst);
BasicBlock *getDiamondTail(BasicBlock *BB);
bool isDiamondHead(BasicBlock *BB);
- // Routines for hoisting loads
- bool isLoadHoistBarrierInRange(const Instruction &Start,
- const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally);
- LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI);
- void hoistInstruction(BasicBlock *BB, Instruction *HoistCand,
- Instruction *ElseInst);
- bool isSafeToHoist(Instruction *I) const;
- bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst);
- bool mergeLoads(BasicBlock *BB);
// Routines for sinking stores
StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
@@ -188,169 +179,6 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
return true;
}
-///
-/// \brief True when instruction is a hoist barrier for a load
-///
-/// Whenever an instruction could possibly modify the value
-/// being loaded or protect against the load from happening
-/// it is considered a hoist barrier.
-///
-bool MergedLoadStoreMotion::isLoadHoistBarrierInRange(
- const Instruction &Start, const Instruction &End, LoadInst *LI,
- bool SafeToLoadUnconditionally) {
- if (!SafeToLoadUnconditionally)
- for (const Instruction &Inst :
- make_range(Start.getIterator(), End.getIterator()))
- if (!isGuaranteedToTransferExecutionToSuccessor(&Inst))
- return true;
- MemoryLocation Loc = MemoryLocation::get(LI);
- return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod);
-}
-
-///
-/// \brief Decide if a load can be hoisted
-///
-/// When there is a load in \p BB to the same address as \p LI
-/// and it can be hoisted from \p BB, return that load.
-/// Otherwise return Null.
-///
-LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1,
- LoadInst *Load0) {
- BasicBlock *BB0 = Load0->getParent();
- BasicBlock *Head = BB0->getSinglePredecessor();
- bool SafeToLoadUnconditionally = isSafeToLoadUnconditionally(
- Load0->getPointerOperand(), Load0->getAlignment(),
- Load0->getModule()->getDataLayout(),
- /*ScanFrom=*/Head->getTerminator());
- for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE;
- ++BBI) {
- Instruction *Inst = &*BBI;
-
- // Only merge and hoist loads when their result in used only in BB
- auto *Load1 = dyn_cast<LoadInst>(Inst);
- if (!Load1 || Inst->isUsedOutsideOfBlock(BB1))
- continue;
-
- MemoryLocation Loc0 = MemoryLocation::get(Load0);
- MemoryLocation Loc1 = MemoryLocation::get(Load1);
- if (Load0->isSameOperationAs(Load1) && AA->isMustAlias(Loc0, Loc1) &&
- !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1,
- SafeToLoadUnconditionally) &&
- !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0,
- SafeToLoadUnconditionally)) {
- return Load1;
- }
- }
- return nullptr;
-}
-
-///
-/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into
-/// \p BB
-///
-/// BB is the head of a diamond
-///
-void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB,
- Instruction *HoistCand,
- Instruction *ElseInst) {
- DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n");
- // Hoist the instruction.
- assert(HoistCand->getParent() != BB);
-
- // Intersect optional metadata.
- HoistCand->andIRFlags(ElseInst);
- HoistCand->dropUnknownNonDebugMetadata();
-
- // Prepend point for instruction insert
- Instruction *HoistPt = BB->getTerminator();
-
- // Merged instruction
- Instruction *HoistedInst = HoistCand->clone();
-
- // Hoist instruction.
- HoistedInst->insertBefore(HoistPt);
-
- HoistCand->replaceAllUsesWith(HoistedInst);
- removeInstruction(HoistCand);
- // Replace the else block instruction.
- ElseInst->replaceAllUsesWith(HoistedInst);
- removeInstruction(ElseInst);
-}
-
-///
-/// \brief Return true if no operand of \p I is defined in I's parent block
-///
-bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const {
- BasicBlock *Parent = I->getParent();
- for (Use &U : I->operands())
- if (auto *Instr = dyn_cast<Instruction>(&U))
- if (Instr->getParent() == Parent)
- return false;
- return true;
-}
-
-///
-/// \brief Merge two equivalent loads and GEPs and hoist into diamond head
-///
-bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0,
- LoadInst *L1) {
- // Only one definition?
- auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand());
- if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) &&
- A0->hasOneUse() && (A0->getParent() == L0->getParent()) &&
- A1->hasOneUse() && (A1->getParent() == L1->getParent()) &&
- isa<GetElementPtrInst>(A0)) {
- DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n");
- hoistInstruction(BB, A0, A1);
- hoistInstruction(BB, L0, L1);
- return true;
- }
- return false;
-}
-
-///
-/// \brief Try to hoist two loads to same address into diamond header
-///
-/// Starting from a diamond head block, iterate over the instructions in one
-/// successor block and try to match a load in the second successor.
-///
-bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) {
- bool MergedLoads = false;
- assert(isDiamondHead(BB));
- BranchInst *BI = cast<BranchInst>(BB->getTerminator());
- BasicBlock *Succ0 = BI->getSuccessor(0);
- BasicBlock *Succ1 = BI->getSuccessor(1);
- // #Instructions in Succ1 for Compile Time Control
- int Size1 = Succ1->size();
- int NLoads = 0;
- for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end();
- BBI != BBE;) {
- Instruction *I = &*BBI;
- ++BBI;
-
- // Don't move non-simple (atomic, volatile) loads.
- auto *L0 = dyn_cast<LoadInst>(I);
- if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0))
- continue;
-
- ++NLoads;
- if (NLoads * Size1 >= MagicCompileTimeControl)
- break;
- if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) {
- bool Res = hoistLoad(BB, L0, L1);
- MergedLoads |= Res;
- // Don't attempt to hoist above loads that had not been hoisted.
- if (!Res)
- break;
- }
- }
- return MergedLoads;
-}
///
/// \brief True when instruction is a sink barrier for a store
@@ -410,7 +238,7 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
&BB->front());
NewPN->addIncoming(Opd1, S0->getParent());
NewPN->addIncoming(Opd2, S1->getParent());
- if (MD && NewPN->getType()->getScalarType()->isPointerTy())
+ if (MD && NewPN->getType()->isPtrOrPtrVectorTy())
MD->invalidateCachedPointerInfo(NewPN);
return NewPN;
}
@@ -534,7 +362,6 @@ bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD,
// Hoist equivalent loads and sink stores
// outside diamonds when possible
if (isDiamondHead(BB)) {
- Changed |= mergeLoads(BB);
Changed |= mergeStores(getDiamondTail(BB));
}
}
@@ -596,8 +423,8 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!Impl.run(F, MD, AA))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
PA.preserve<MemoryDependenceAnalysis>();
return PA;
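What the retained half of the pass (mergeStores/getPHIOperand) achieves, in source terms: two stores of different values to the same address in the two arms of a diamond are sunk into the tail as one store of a merged value. A hand-written before/after illustration (not compiler output):

// Before sinking: each diamond arm stores to the same location.
int beforeSink(bool c, int *p, int a, int b) {
  if (c)
    *p = a;   // store in the 'then' arm
  else
    *p = b;   // store in the 'else' arm
  return *p;
}

// After sinking: a single store of the merged value in the tail. The
// ternary stands in for the PHI node the pass creates; the load-hoisting
// half of the pass is gone, subsumed by GVNHoist.
int afterSink(bool c, int *p, int a, int b) {
  int v = c ? a : b;
  *p = v;
  return v;
}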
diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index 0a3bf7b..d0bfe36 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -156,20 +156,12 @@ PreservedAnalyses NaryReassociatePass::run(Function &F,
auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- bool Changed = runImpl(F, AC, DT, SE, TLI, TTI);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
-
- if (!Changed)
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<TargetLibraryAnalysis>();
return PA;
}
@@ -219,7 +211,8 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
Changed = true;
SE->forgetValue(&*I);
I->replaceAllUsesWith(NewI);
- // If SeenExprs constains I's WeakVH, that entry will be replaced with
+ // If SeenExprs contains I's WeakTrackingVH, that entry will be
+ // replaced with
// nullptr.
RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI);
I = NewI->getIterator();
@@ -227,7 +220,7 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
// Add the rewritten instruction to SeenExprs; the original instruction
// is deleted.
const SCEV *NewSCEV = SE->getSCEV(&*I);
- SeenExprs[NewSCEV].push_back(WeakVH(&*I));
+ SeenExprs[NewSCEV].push_back(WeakTrackingVH(&*I));
// Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
// is equivalent to I. However, ScalarEvolution::getSCEV may
// weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose
@@ -247,7 +240,7 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
//
// This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll.
if (NewSCEV != OldSCEV)
- SeenExprs[OldSCEV].push_back(WeakVH(&*I));
+ SeenExprs[OldSCEV].push_back(WeakTrackingVH(&*I));
}
}
}
@@ -502,7 +495,8 @@ NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
// future instruction either. Therefore, we pop it out of the stack. This
// optimization makes the algorithm O(n).
while (!Candidates.empty()) {
- // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed
+ // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
+ // removed
// during rewriting.
if (Value *Candidate = Candidates.back()) {
Instruction *CandidateInstruction = cast<Instruction>(Candidate);
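A sketch of the dominator-stack idiom described above: candidates are weak handles, so instructions deleted during rewriting show up as null and are popped permanently rather than revisited, keeping the scan amortized O(n). Here std::weak_ptr stands in for WeakTrackingVH (an analogy only, not the LLVM type):

#include <cassert>
#include <memory>
#include <vector>

using Handle = std::weak_ptr<int>; // stands in for WeakTrackingVH

// Pop dead (expired) candidates; return the first live one, if any.
std::shared_ptr<int> findLiveCandidate(std::vector<Handle> &Stack) {
  while (!Stack.empty()) {
    if (auto Live = Stack.back().lock())
      return Live;      // live candidate: reuse it
    Stack.pop_back();   // deleted during rewriting: drop it for good
  }
  return nullptr;
}

int main() {
  auto A = std::make_shared<int>(1);
  auto B = std::make_shared<int>(2);
  std::vector<Handle> Stack{A, B};
  B.reset(); // plays the role of RecursivelyDeleteTriviallyDeadInstructions
  auto Live = findLiveCandidate(Stack);
  assert(Live && *Live == 1 && Stack.size() == 1);
}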
diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 57e6e3d..9d01856 100644
--- a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -17,6 +17,37 @@
/// "A Sparse Algorithm for Predicated Global Value Numbering" from
/// Karthik Gargi.
///
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number; it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The vast majority of complexity and code
+/// in this file is devoted to tracking what value numbers could change for what
+/// instructions when various things happen. The rest of the algorithm is
+/// devoted to performing symbolic evaluation, forward propagation, and
+/// simplification of operations based on the value numbers deduced so far.
+///
+/// In order to make the GVN mostly-complete, we use a technique derived from
+/// "Detection of Redundant Expressions: A Complete and Polynomial-time
+/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
+/// based GVN algorithms is related to their inability to detect equivalence
+/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
+/// We resolve this issue by generating the equivalent "phi of ops" form for
+/// each op of phis we see, in a way that only takes polynomial time to resolve.
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
//===----------------------------------------------------------------------===//
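
The reverse mapping described above (value number -> operations) is what makes the algorithm sparse. Below is a standalone sketch of the idea, using toy types rather than the pass's real congruence-class machinery; names here are illustrative only.

    #include <map>
    #include <set>
    #include <vector>

    using Op = int;                       // stand-in for an instruction id
    std::map<Op, int> ValueNumber;        // operation -> value number
    std::map<int, std::set<Op>> Members;  // value number -> operations (reverse map)
    std::map<Op, std::set<Op>> Users;     // def-use edges

    void changeValueNumber(Op O, int NewVN, std::vector<Op> &Worklist) {
      Members[ValueNumber[O]].erase(O);   // leave the old class
      ValueNumber[O] = NewVN;
      Members[NewVN].insert(O);           // join the new one
      for (Op U : Users[O])               // only touch affected operations,
        Worklist.push_back(U);            // instead of re-running the whole RPO
    }
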
#include "llvm/Transforms/Scalar/NewGVN.h"
@@ -30,7 +61,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -40,13 +70,10 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -55,24 +82,25 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <numeric>
#include <unordered_map>
#include <utility>
#include <vector>
using namespace llvm;
using namespace PatternMatch;
using namespace llvm::GVNExpression;
-
+using namespace llvm::VNCoercion;
#define DEBUG_TYPE "newgvn"
STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
@@ -85,8 +113,19 @@ STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
STATISTIC(NumGVNAvoidedSortedLeaderChanges,
"Number of avoided sorted leader changes");
-STATISTIC(NumGVNNotMostDominatingLeader,
- "Number of times a member dominated it's new classes' leader");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
+STATISTIC(NumGVNPHIOfOpsEliminations,
+ "Number of things eliminated using PHI of ops");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered")
+DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
+ "Controls which instructions we create phi of ops for")
+// Currently store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
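
The two DEBUG_COUNTER declarations above let a developer bisect miscompiles by gating individual value-numbering and phi-of-ops decisions. A sketch of how such a counter is typically consulted and driven from the opt command line; the exact -debug-counter option syntax is an assumption based on LLVM's DebugCounter support, not part of this diff.

    // Inside the pass, the counter guards the transformation site.
    if (!DebugCounter::shouldExecute(VNCounter))
      return; // skip value numbering this instruction while bisecting

    // From the command line, something like:
    //   opt -newgvn -debug-counter=newgvn-vn-skip=100,newgvn-vn-count=1 ...
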
//===----------------------------------------------------------------------===//
// GVN Pass
@@ -105,6 +144,79 @@ PHIExpression::~PHIExpression() = default;
}
}
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions, not generic values (arguments, etc).
+struct TarjanSCC {
+
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+ // See if we really were the root of a component, by seeing if we still have
+ // our DFSNumber. If we do, we are the root of the component, and we have
+ // completed a component. If we do not, we are not the root of a component,
+ // and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+ // Store the components as a vector of ptr sets, because we need the topo order
+ // of SCC's, but not individual member order
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
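
A small usage sketch for the finder above, driven from a phi node as the comment suggests; the surrounding driver code is hypothetical.

    // Resolve the SCC a phi participates in, using the struct above.
    TarjanSCC SCCFinder;
    void processPhi(const PHINode *PN) {
      SCCFinder.Start(PN); // walks the use-def chain once, memoized via Root
      const SmallPtrSetImpl<const Value *> &Comp = SCCFinder.getComponentFor(PN);
      // A cycle through PN shows up as a component with more than one member.
      bool InCycle = Comp.size() > 1;
      (void)InCycle;
    }
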
// Congruence classes represent the set of expressions/instructions
// that are all the same *during some scope in the function*.
// That is, because of the way we perform equality propagation, and
@@ -115,46 +227,166 @@ PHIExpression::~PHIExpression() = default;
// For any Value in the Member set, it is valid to replace any dominated member
// with that Value.
//
-// Every congruence class has a leader, and the leader is used to
-// symbolize instructions in a canonical way (IE every operand of an
-// instruction that is a member of the same congruence class will
-// always be replaced with leader during symbolization).
-// To simplify symbolization, we keep the leader as a constant if class can be
-// proved to be a constant value.
-// Otherwise, the leader is a randomly chosen member of the value set, it does
-// not matter which one is chosen.
-// Each congruence class also has a defining expression,
-// though the expression may be null. If it exists, it can be used for forward
-// propagation and reassociation of values.
-//
-struct CongruenceClass {
- using MemberSet = SmallPtrSet<Value *, 4>;
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at
+// least one of the memorydefs to a new memory congruence class, which in
+// turn affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number). So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency. While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+ unsigned getID() const { return ID; }
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+ void resetNextLeader() { NextLeader = {nullptr, ~0U}; }
+
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+ if (LeaderPair.second < NextLeader.second)
+ NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
+ // Return true if two congruence classes are equivalent to each other. This
+ // means that every field but the ID number and the dead field is equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+ if (!Other)
+ return false;
+ if (this == Other)
+ return true;
+
+ if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+ std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+ Other->RepMemoryAccess))
+ return false;
+ if (DefiningExpr != Other->DefiningExpr)
+ if (!DefiningExpr || !Other->DefiningExpr ||
+ *DefiningExpr != *Other->DefiningExpr)
+ return false;
+ // We need some ordered set
+ std::set<Value *> AMembers(Members.begin(), Members.end());
+ std::set<Value *> BMembers(Members.begin(), Members.end());
+ return AMembers == BMembers;
+ }
+
+private:
unsigned ID;
// Representative leader.
Value *RepLeader = nullptr;
+ // The most dominating leader after our current leader, because the member set
+ // is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
// Defining Expression.
const Expression *DefiningExpr = nullptr;
// Actual members of this class.
MemberSet Members;
-
- // True if this class has no members left. This is mainly used for assertion
- // purposes, and for skipping empty classes.
- bool Dead = false;
-
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
// Number of stores in this congruence class.
// This is used so we can detect store equivalence changes properly.
int StoreCount = 0;
-
- // The most dominating leader after our current leader, because the member set
- // is not sorted and is expensive to keep sorted all the time.
- std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
-
- explicit CongruenceClass(unsigned ID) : ID(ID) {}
- CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
- : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
};
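
A short sketch of the accounting a caller of the class above performs when a store joins a congruence class; the helper name is hypothetical, and the real pass's extra bookkeeping (touched sets, leader changes) is omitted.

    // Hypothetical helper: value membership and store bookkeeping together.
    void addStoreToClass(StoreInst *SI, CongruenceClass *CC, unsigned DFSNum) {
      CC->insert(SI);                              // value member set
      CC->incStoreCount();                         // keeps definesNoMemory() honest
      CC->addPossibleNextLeader({SI, DFSNum});     // cheapest candidate by DFS
      if (!CC->getStoredValue())
        CC->setStoredValue(SI->getValueOperand()); // representative stored value
    }
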
namespace llvm {
+struct ExactEqualsExpression {
+ const Expression &E;
+ explicit ExactEqualsExpression(const Expression &E) : E(E) {}
+ hash_code getComputedHash() const { return E.getComputedHash(); }
+ bool operator==(const Expression &Other) const {
+ return E.exactlyEquals(Other);
+ }
+};
+
template <> struct DenseMapInfo<const Expression *> {
static const Expression *getEmptyKey() {
auto Val = static_cast<uintptr_t>(-1);
@@ -166,51 +398,144 @@ template <> struct DenseMapInfo<const Expression *> {
Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
return reinterpret_cast<const Expression *>(Val);
}
- static unsigned getHashValue(const Expression *V) {
- return static_cast<unsigned>(V->getHashValue());
+ static unsigned getHashValue(const Expression *E) {
+ return E->getComputedHash();
}
+ static unsigned getHashValue(const ExactEqualsExpression &E) {
+ return E.getComputedHash();
+ }
+ static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
+ if (RHS == getTombstoneKey() || RHS == getEmptyKey())
+ return false;
+ return LHS == *RHS;
+ }
+
static bool isEqual(const Expression *LHS, const Expression *RHS) {
if (LHS == RHS)
return true;
if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
LHS == getEmptyKey() || RHS == getEmptyKey())
return false;
+ // Compare hashes before equality. This is *not* what the hashtable does,
+ // since it is computing it modulo the number of buckets, whereas we are
+ // using the full hash keyspace. Since the hashes are precomputed, this
+ // check is *much* faster than equality.
+ if (LHS->getComputedHash() != RHS->getComputedHash())
+ return false;
return *LHS == *RHS;
}
};
} // end namespace llvm
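
The isEqual overloads above lean on a full-width precomputed hash to reject unequal expressions before the expensive deep comparison. A standalone sketch of the same trick:

    #include <cstdint>
    #include <string>

    // Each object carries a hash computed once at construction.
    struct Expr {
      std::string Body;
      uint64_t Hash;
    };

    bool equal(const Expr &A, const Expr &B) {
      if (A.Hash != B.Hash)    // full hash keyspace, unlike bucket masking
        return false;          // almost all unequal pairs stop here
      return A.Body == B.Body; // rare: equal hashes, verify for real
    }
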
-class NewGVN : public FunctionPass {
+namespace {
+class NewGVN {
+ Function &F;
DominatorTree *DT;
- const DataLayout *DL;
const TargetLibraryInfo *TLI;
- AssumptionCache *AC;
AliasAnalysis *AA;
MemorySSA *MSSA;
MemorySSAWalker *MSSAWalker;
- BumpPtrAllocator ExpressionAllocator;
- ArrayRecycler<Value *> ArgRecycler;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
+ const SimplifyQuery SQ;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
// Congruence class info.
- CongruenceClass *InitialClass;
+
+ // This class is called INITIAL in the paper. It is the class everything
+ // startsout in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass;
std::vector<CongruenceClass *> CongruenceClasses;
unsigned NextCongruenceNum;
// Value Mappings.
DenseMap<Value *, CongruenceClass *> ValueToClass;
DenseMap<Value *, const Expression *> ValueToExpression;
+ // Value PHI handling, used to make equivalence between phi(op, op) and
+ // op(phi, phi).
+ // These mappings just store various data that would normally be part of the
+ // IR.
+ DenseSet<const Instruction *> PHINodeUses;
+ // Map a temporary instruction we created to a parent block.
+ DenseMap<const Value *, BasicBlock *> TempToBlock;
+ // Map between the temporary phis we created and the real instructions they
+ // are known equivalent to.
+ DenseMap<const Value *, PHINode *> RealToTemp;
+ // In order to know when we should re-process instructions that have
+ // phi-of-ops, we track the set of expressions that they needed as
+ // leaders. When we discover new leaders for those expressions, we process the
+ // associated phi-of-op instructions again in case they have changed. The
+ // other way they may change is if they had leaders, and those leaders
+ // disappear. However, at the point they have leaders, there are uses of the
+ // relevant operands in the created phi node, and so they will get reprocessed
+ // through the normal user marking we perform.
+ mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
+ DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
+ ExpressionToPhiOfOps;
+ // Map from basic block to the temporary operations we created
+ DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs;
+ // Map from temporary operation to MemoryAccess.
+ DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+ // Set of all temporary instructions we created.
+ DenseSet<Instruction *> AllTempInstructions;
+
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
+ // Uses the same reasoning as PredicateToUsers. When we skip MemoryAccesses
+ // for stores, we can no longer rely solely on the def-use chains of MemorySSA.
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
// A table storing which memorydefs/phis represent a memory state provably
// equivalent to another memory state.
// We could use the congruence class machinery, but the MemoryAccess's are
// abstract memory states, so they can only ever be equivalent to each other,
// and not to constants, etc.
- DenseMap<const MemoryAccess *, MemoryAccess *> MemoryAccessEquiv;
-
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+ // when they change class, but when they change state. Note: We can do the
+ // same thing for phis, and avoid having phi expressions if we wanted. We
+ // should eventually unify in one direction or the other, so this is a little
+ // bit of an experiment in which approach turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
+ mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
// Expression to class mapping.
using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
ExpressionClassMap ExpressionToClass;
+ // We keep a single DeadExpression that represents all currently-dead values.
+ // Dead expressions that we can prove will stay dead are marked with DFS
+ // number zero. However, it's possible in the case of phi nodes
+ // for us to assume/prove all arguments are dead during fixpointing.
+ // We use DeadExpression for that case.
+ DeadExpression *SingletonDeadExpression = nullptr;
+
// Which values have changed as a result of leader changes.
SmallPtrSet<Value *, 8> LeaderChanges;
@@ -231,8 +556,6 @@ class NewGVN : public FunctionPass {
BitVector TouchedInstructions;
DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
- DenseMap<const DomTreeNode *, std::pair<unsigned, unsigned>>
- DominatedInstRange;
#ifndef NDEBUG
// Debugging for how many times each block and instruction got processed.
@@ -240,56 +563,47 @@ class NewGVN : public FunctionPass {
#endif
// DFS info.
- DenseMap<const BasicBlock *, std::pair<int, int>> DFSDomMap;
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
DenseMap<const Value *, unsigned> InstrDFS;
+
+ // This contains the mapping from DFS numbers to instructions.
SmallVector<Value *, 32> DFSToInstr;
// Deletion info.
SmallPtrSet<Instruction *, 8> InstructionsToErase;
public:
- static char ID; // Pass identification, replacement for typeid.
- NewGVN() : FunctionPass(ID) {
- initializeNewGVNPass(*PassRegistry::getPassRegistry());
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+ PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) {
}
-
- bool runOnFunction(Function &F) override;
- bool runGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA);
+ bool runGVN();
private:
- // This transformation requires dominator postdominator info.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
-
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
// Expression handling.
- const Expression *createExpression(Instruction *, const BasicBlock *);
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
- const BasicBlock *);
- PHIExpression *createPHIExpression(Instruction *);
- const VariableExpression *createVariableExpression(Value *);
- const ConstantExpression *createConstantExpression(Constant *);
- const Expression *createVariableOrConstant(Value *V, const BasicBlock *B);
- const UnknownExpression *createUnknownExpression(Instruction *);
- const StoreExpression *createStoreExpression(StoreInst *, MemoryAccess *,
- const BasicBlock *);
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *,
+ Value *) const;
+ PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge,
+ bool &OriginalOpsConstant) const;
+ const DeadExpression *createDeadExpression() const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *) const;
LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- MemoryAccess *, const BasicBlock *);
-
- const CallExpression *createCallExpression(CallInst *, MemoryAccess *,
- const BasicBlock *);
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
const AggregateValueExpression *
- createAggregateValueExpression(Instruction *, const BasicBlock *);
- bool setBasicExpressionInfo(Instruction *, BasicExpression *,
- const BasicBlock *);
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
// Congruence class handling.
CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -298,13 +612,28 @@ private:
return result;
}
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+ auto *CC = createCongruenceClass(nullptr, nullptr);
+ CC->setMemoryLeader(MA);
+ return CC;
+ }
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+ auto *CC = getMemoryClass(MA);
+ if (CC->getMemoryLeader() != MA)
+ CC = createMemoryClass(MA);
+ return CC;
+ }
+
CongruenceClass *createSingletonCongruenceClass(Value *Member) {
CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
- CClass->Members.insert(Member);
+ CClass->insert(Member);
ValueToClass[Member] = CClass;
return CClass;
}
void initializeCongruenceClasses(Function &F);
+ const Expression *makePossiblePhiOfOps(Instruction *,
+ SmallPtrSetImpl<Value *> &);
+ void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
// Value number an Instruction or MemoryPhi.
void valueNumberMemoryPhi(MemoryPhi *);
@@ -312,78 +641,128 @@ private:
// Symbolic evaluation.
const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *);
- const Expression *performSymbolicEvaluation(Value *, const BasicBlock *);
- const Expression *performSymbolicLoadEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicStoreEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicCallEvaluation(Instruction *,
- const BasicBlock *);
- const Expression *performSymbolicPHIEvaluation(Instruction *,
- const BasicBlock *);
- bool setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To);
- const Expression *performSymbolicAggrValueEvaluation(Instruction *,
- const BasicBlock *);
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *,
+ SmallPtrSetImpl<Value *> &) const;
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ const Expression *performSymbolicPHIEvaluation(Instruction *) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
// Congruence finding.
- // Templated to allow them to work both on BB's and BB-edges.
- template <class T>
- Value *lookupOperandLeader(Value *, const User *, const T &) const;
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
void performCongruenceFinding(Instruction *, const Expression *);
- void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
- CongruenceClass *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTOP(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
// Reachability handling.
void updateReachableEdge(BasicBlock *, BasicBlock *);
void processOutgoingEdges(TerminatorInst *, BasicBlock *);
- bool isOnlyReachableViaThisEdge(const BasicBlockEdge &) const;
- Value *findConditionEquivalence(Value *, BasicBlock *) const;
- MemoryAccess *lookupMemoryAccessEquiv(MemoryAccess *) const;
+ Value *findConditionEquivalence(Value *) const;
// Elimination.
struct ValueDFS;
- void convertDenseToDFSOrdered(CongruenceClass::MemberSet &,
- SmallVectorImpl<ValueDFS> &);
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
bool eliminateInstructions(Function &);
void replaceInstruction(Instruction *, Value *);
void markInstructionForDeletion(Instruction *);
void deleteInstructionsInBlock(BasicBlock *);
+ Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const;
// New instruction creation.
void handleNewInstruction(Instruction *){};
// Various instruction touch utilities
+ template <typename Map, typename KeyType, typename Func>
+ void for_each_found(Map &, const KeyType &, Func);
+ template <typename Map, typename KeyType>
+ void touchAndErase(Map &, const KeyType &);
void markUsersTouched(Value *);
- void markMemoryUsersTouched(MemoryAccess *);
- void markLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void markPhiOfOpsChanged(const Expression *E);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
+ void addAdditionalUsers(Value *To, Value *User) const;
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
// Utilities.
void cleanupTables();
std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
- void updateProcessedCount(Value *V);
+ void updateProcessedCount(const Value *V);
void verifyMemoryCongruency() const;
- bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
-};
-
-char NewGVN::ID = 0;
+ void verifyIterationSettled(Function &F);
+ void verifyStoreExpressions() const;
+ bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
+ const MemoryAccess *, const MemoryAccess *) const;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E) const;
+ MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
+ MemoryAccess *getDefiningAccess(const MemoryAccess *) const;
+ MemoryPhi *getMemoryAccess(const BasicBlock *) const;
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
-// createGVNPass - The public interface to this file.
-FunctionPass *llvm::createNewGVNPass() { return new NewGVN(); }
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a value so it can be used with Use's, which will
+ // auto-convert to Value's but not to MemoryAccess's.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+ assert(isa<MemoryAccess>(MA) &&
+ "This should not be used with instructions");
+ return isa<MemoryUseOrDef>(MA)
+ ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+ : InstrDFS.lookup(MA);
+ }
+ bool isCycleFree(const Instruction *) const;
+ bool isBackedge(BasicBlock *From, BasicBlock *To) const;
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ std::pair<int, int> StartingVNCounter;
+};
+} // end anonymous namespace
template <typename T>
static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
- if ((!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) ||
- !LHS.BasicExpression::equals(RHS)) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
return false;
- } else if (const auto *L = dyn_cast<LoadExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != L->getDefiningAccess())
- return false;
- } else if (const auto *S = dyn_cast<StoreExpression>(&RHS)) {
- if (LHS.getDefiningAccess() != S->getDefiningAccess())
- return false;
- }
- return true;
+ return LHS.MemoryExpression::equals(RHS);
}
bool LoadExpression::equals(const Expression &Other) const {
@@ -391,7 +770,22 @@ bool LoadExpression::equals(const Expression &Other) const {
}
bool StoreExpression::equals(const Expression &Other) const {
- return equalsLoadStoreHelper(*this, Other);
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
+}
+
+// Determine if the edge From->To is a backedge
+bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
+ if (From == To)
+ return true;
+ auto *FromDTN = DT->getNode(From);
+ auto *ToDTN = DT->getNode(To);
+ return RPOOrdering.lookup(FromDTN) >= RPOOrdering.lookup(ToDTN);
}
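
A worked example of the RPO test above, with assumed numbering:

    // For a loop A -> B -> C -> B, RPO over the dominator tree might assign
    // A = 0, B = 1, C = 2. Then:
    //   isBackedge(A, B): 0 >= 1 is false -> forward edge
    //   isBackedge(C, B): 2 >= 1 is true  -> backedge (the loop latch)
    //   isBackedge(B, B): From == To      -> trivially a backedge
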
#ifndef NDEBUG
@@ -400,17 +794,45 @@ static std::string getBlockName(const BasicBlock *B) {
}
#endif
-INITIALIZE_PASS_BEGIN(NewGVN, "newgvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
+// Get a MemoryAccess for an instruction, fake or real.
+MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
+ auto *Result = MSSA->getMemoryAccess(I);
+ return Result ? Result : TempToMemory.lookup(I);
+}
-PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
- BasicBlock *PHIBlock = I->getParent();
+// Get a MemoryPhi for a basic block. These are all real.
+MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
+ return MSSA->getMemoryAccess(BB);
+}
+
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto *Parent = I->getParent();
+ if (Parent)
+ return Parent;
+ Parent = TempToBlock.lookup(V);
+ assert(Parent && "Every fake instruction should have a block");
+ return Parent;
+ }
+
+ auto *MP = dyn_cast<MemoryPhi>(V);
+ assert(MP && "Should have been an instruction or a MemoryPhi");
+ return MP->getBlock();
+}
+
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) const {
+ assert(isa<BasicExpression>(E));
+ auto *BE = cast<BasicExpression>(E);
+ const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+}
+PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge,
+ bool &OriginalOpsConstant) const {
+ BasicBlock *PHIBlock = getBlockForValue(I);
auto *PN = cast<PHINode>(I);
auto *E =
new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
@@ -419,28 +841,47 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
E->setType(I->getType());
E->setOpcode(I->getOpcode());
- auto ReachablePhiArg = [&](const Use &U) {
- return ReachableBlocks.count(PN->getIncomingBlock(U));
- };
-
- // Filter out unreachable operands
- auto Filtered = make_filter_range(PN->operands(), ReachablePhiArg);
-
+ // NewGVN assumes the operands of a PHI node are in a consistent order across
+ // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix
+ // this in LLVM at some point, we don't want GVN to find wrong congruences.
+ // Therefore, here we sort uses in predecessor order.
+ // We're sorting the values by pointer. In theory this might be a cause of
+ // non-determinism, but here we don't rely on the ordering for anything
+ // significant, e.g. we don't create new instructions based on it, so we're
+ // fine.
+ SmallVector<const Use *, 4> PHIOperands;
+ for (const Use &U : PN->operands())
+ PHIOperands.push_back(&U);
+ std::sort(PHIOperands.begin(), PHIOperands.end(),
+ [&](const Use *U1, const Use *U2) {
+ return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2);
+ });
+
+ // Filter out unreachable phi operands.
+ auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) {
+ if (*U == PN)
+ return false;
+ if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}))
+ return false;
+ // Things in TOPClass are equivalent to everything.
+ if (ValueToClass.lookup(*U) == TOPClass)
+ return false;
+ return lookupOperandLeader(*U) != PN;
+ });
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const Use &U) -> Value * {
- // Don't try to transform self-defined phis.
- if (U == PN)
- return PN;
- const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
- return lookupOperandLeader(U, I, BBE);
+ [&](const Use *U) -> Value * {
+ auto *BB = PN->getIncomingBlock(*U);
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ OriginalOpsConstant =
+ OriginalOpsConstant && isa<Constant>(*U);
+ return lookupOperandLeader(*U);
});
return E;
}
// Set basic expression info (Arguments, type, opcode) for Expression
// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
- const BasicBlock *B) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
bool AllConstant = true;
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
E->setType(GEP->getSourceElementType());
@@ -452,8 +893,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
// Transform the operand array into an operand leader array, and keep track of
// whether all members are constant.
std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
- auto Operand = lookupOperandLeader(O, I, B);
- AllConstant &= isa<Constant>(Operand);
+ auto Operand = lookupOperandLeader(O);
+ AllConstant = AllConstant && isa<Constant>(Operand);
return Operand;
});
@@ -461,8 +902,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
}
const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2,
- const BasicBlock *B) {
+ Value *Arg1,
+ Value *Arg2) const {
auto *E = new (ExpressionAllocator) BasicExpression(2);
E->setType(T);
@@ -473,14 +914,13 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// of their operands get the same value number by sorting the operand value
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
- if (Arg1 > Arg2)
+ if (shouldSwapOperands(Arg1, Arg2))
std::swap(Arg1, Arg2);
}
- E->op_push_back(lookupOperandLeader(Arg1, nullptr, B));
- E->op_push_back(lookupOperandLeader(Arg2, nullptr, B));
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
- Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), *DL, TLI,
- DT, AC);
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V))
return SimplifiedE;
return E;
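
shouldSwapOperands, declared earlier in the class, replaces the old raw pointer comparison so that commutative canonicalization is deterministic across runs. Below is a sketch of a rank-based swap rule under an assumed ordering (constants lowest, then arguments by position, then instructions by DFS number); the pass's actual getRank is its own detail and may differ.

    // Assumed ranking: constants < arguments < instructions (by DFS number).
    unsigned getRankSketch(const Value *V,
                           const DenseMap<const Value *, unsigned> &InstrDFS,
                           unsigned NumFuncArgs) {
      if (isa<Constant>(V))
        return 0;
      if (auto *A = dyn_cast<Argument>(V))
        return 1 + A->getArgNo();
      return 1 + NumFuncArgs + InstrDFS.lookup(V);
    }

    // Lower rank goes first, so x+y and y+x build the same expression.
    bool shouldSwapSketch(const Value *A, const Value *B,
                          const DenseMap<const Value *, unsigned> &InstrDFS,
                          unsigned NumFuncArgs) {
      return getRankSketch(A, InstrDFS, NumFuncArgs) >
             getRankSketch(B, InstrDFS, NumFuncArgs);
    }
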
@@ -492,7 +932,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
// TODO: Once finished, this should not take an Instruction, we only
// use it for printing.
const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I, Value *V) {
+ Instruction *I,
+ Value *V) const {
if (!V)
return nullptr;
if (auto *C = dyn_cast<Constant>(V)) {
@@ -502,40 +943,40 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
NumGVNOpsSimplified++;
assert(isa<BasicExpression>(E) &&
"We should always have had a basic expression here");
-
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createConstantExpression(C);
} else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
<< " variable " << *V << "\n");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
+ deleteExpression(E);
return createVariableExpression(V);
}
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && CC->DefiningExpr) {
+ if (CC && CC->getDefiningExpr()) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+
if (I)
DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *V << "\n");
+ << " expression " << *CC->getDefiningExpr() << "\n");
NumGVNOpsSimplified++;
- assert(isa<BasicExpression>(E) &&
- "We should always have had a basic expression here");
- cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- return CC->DefiningExpr;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
}
return nullptr;
}
-const Expression *NewGVN::createExpression(Instruction *I,
- const BasicBlock *B) {
-
+const Expression *NewGVN::createExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
- bool AllConstant = setBasicExpressionInfo(I, E, B);
+ bool AllConstant = setBasicExpressionInfo(I, E);
if (I->isCommutative()) {
// Ensure that commutative instructions that only differ by a permutation
@@ -543,7 +984,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
// numbers. Since all commutative instructions have two operands it is more
// efficient to sort by hand rather than using, say, std::sort.
assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
- if (E->getOperand(0) > E->getOperand(1))
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
E->swapOperands(0, 1);
}
@@ -559,48 +1000,43 @@ const Expression *NewGVN::createExpression(Instruction *I,
// Sort the operand value numbers so x<y and y>x get the same value
// number.
CmpInst::Predicate Predicate = CI->getPredicate();
- if (E->getOperand(0) > E->getOperand(1)) {
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
E->swapOperands(0, 1);
Predicate = CmpInst::getSwappedPredicate(Predicate);
}
E->setOpcode((CI->getOpcode() << 8) | Predicate);
// TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
- // TODO: Since we noop bitcasts, we may need to check types before
- // simplifying, so that we don't end up simplifying based on a wrong
- // type assumption. We should clean this up so we can use constants of the
- // wrong type
-
assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
"Wrong types on cmp instruction");
- if ((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
- E->getOperand(1)->getType() == I->getOperand(1)->getType())) {
- Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V =
+ SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
} else if (isa<SelectInst>(I)) {
if (isa<Constant>(E->getOperand(0)) ||
- (E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
- E->getOperand(2)->getType() == I->getOperand(2)->getType())) {
+ E->getOperand(0) == E->getOperand(1)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
- E->getOperand(2), *DL, TLI, DT, AC);
+ E->getOperand(2), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
} else if (I->isBinaryOp()) {
- Value *V = SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1),
- *DL, TLI, DT, AC);
+ Value *V =
+ SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (auto *BI = dyn_cast<BitCastInst>(I)) {
- Value *V = SimplifyInstruction(BI, *DL, TLI, DT, AC);
+ Value *V =
+ SimplifyCastInst(BI->getOpcode(), BI->getOperand(0), BI->getType(), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (isa<GetElementPtrInst>(I)) {
- Value *V = SimplifyGEPInst(E->getType(),
- ArrayRef<Value *>(E->op_begin(), E->op_end()),
- *DL, TLI, DT, AC);
+ Value *V = SimplifyGEPInst(
+ E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
} else if (AllConstant) {
@@ -615,7 +1051,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
for (Value *Arg : E->operands())
C.emplace_back(cast<Constant>(Arg));
- if (Value *V = ConstantFoldInstOperands(I, C, *DL, TLI))
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
return SimplifiedE;
}
@@ -623,18 +1059,18 @@ const Expression *NewGVN::createExpression(Instruction *I,
}
const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
+NewGVN::createAggregateValueExpression(Instruction *I) const {
if (auto *II = dyn_cast<InsertValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
- setBasicExpressionInfo(I, E, B);
+ setBasicExpressionInfo(I, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
return E;
} else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *E = new (ExpressionAllocator)
AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
- setBasicExpressionInfo(EI, E, B);
+ setBasicExpressionInfo(EI, E);
E->allocateIntOperands(ExpressionAllocator);
std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
return E;
@@ -642,67 +1078,120 @@ NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
llvm_unreachable("Unhandled type of aggregate value operation");
}
-const VariableExpression *NewGVN::createVariableExpression(Value *V) {
+const DeadExpression *NewGVN::createDeadExpression() const {
+ // DeadExpression has no arguments and all DeadExpression's are the same,
+ // so we only need one of them.
+ return SingletonDeadExpression;
+}
+
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
auto *E = new (ExpressionAllocator) VariableExpression(V);
E->setOpcode(V->getValueID());
return E;
}
-const Expression *NewGVN::createVariableOrConstant(Value *V,
- const BasicBlock *B) {
- auto Leader = lookupOperandLeader(V, nullptr, B);
- if (auto *C = dyn_cast<Constant>(Leader))
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
+ if (auto *C = dyn_cast<Constant>(V))
return createConstantExpression(C);
- return createVariableExpression(Leader);
+ return createVariableExpression(V);
}
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
auto *E = new (ExpressionAllocator) ConstantExpression(C);
E->setOpcode(C->getValueID());
return E;
}
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
auto *E = new (ExpressionAllocator) UnknownExpression(I);
E->setOpcode(I->getOpcode());
return E;
}
-const CallExpression *NewGVN::createCallExpression(CallInst *CI,
- MemoryAccess *HV,
- const BasicBlock *B) {
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
// FIXME: Add operand bundles for calls.
auto *E =
- new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, HV);
- setBasicExpressionInfo(CI, E, B);
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
return E;
}
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+ // The most likely candidates for dominance are the leader and the next leader.
+ // The leader or next leader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
// See if we have a congruence class and leader for this operand, and if so,
// return it. Otherwise, return the operand itself.
-template <class T>
-Value *NewGVN::lookupOperandLeader(Value *V, const User *U, const T &B) const {
+Value *NewGVN::lookupOperandLeader(Value *V) const {
CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC && (CC != InitialClass))
- return CC->RepLeader;
+ if (CC) {
+ // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
return V;
}
-MemoryAccess *NewGVN::lookupMemoryAccessEquiv(MemoryAccess *MA) const {
- MemoryAccess *Result = MemoryAccessEquiv.lookup(MA);
- return Result ? Result : MA;
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
}
LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
- LoadInst *LI, MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E = new (ExpressionAllocator) LoadExpression(1, LI, DA);
+ LoadInst *LI,
+ const MemoryAccess *MA) const {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(LoadType);
// Give store and loads same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(PointerOp, LI, B));
+ E->op_push_back(PointerOp);
if (LI)
E->setAlignment(LI->getAlignment());
@@ -712,17 +1201,17 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
return E;
}
-const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
- MemoryAccess *DA,
- const BasicBlock *B) {
- auto *E =
- new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, DA);
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(SI->getValueOperand()->getType());
  // Give stores and loads the same opcode so they value number together.
E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(SI->getPointerOperand(), SI, B));
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
// TODO: Value number heap versions. We may be able to discover
  // things alias analysis can't on its own (IE that a store and a
@@ -730,44 +1219,136 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
-// Utility function to check whether the congruence class has a member other
-// than the given instruction.
-bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
- // Either it has more than one store, in which case it must contain something
- // other than us (because it's indexed by value), or if it only has one store
- // right now, that member should not be us.
- return CC->StoreCount > 1 || CC->Members.count(I) == 0;
-}
-
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
- MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
- // See if we are defined by a previous store expression, it already has a
- // value, and it's the same value as our current store. FIXME: Right now, we
- // only do this for simple stores, we should expand to cover memcpys, etc.
+ auto *StoreAccess = getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
if (SI->isSimple()) {
- // Get the expression, if any, for the RHS of the MemoryDef.
- MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
- cast<MemoryDef>(StoreAccess)->getDefiningAccess());
- const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
- CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
- // Basically, check if the congruence class the store is in is defined by a
- // store that isn't us, and has the same value. MemorySSA takes care of
- // ensuring the store has the same memory state as us already.
- if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
- CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
- hasMemberOtherThanUs(CC, I))
- return createStoreExpression(SI, StoreRHS, B);
+ // See if we are defined by a previous store expression that already has a
+ // value, and that value is the same as our current store. FIXME: Right now we
+ // only do this for simple stores; we should expand to cover memcpys, etc.
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
+ // We really want to check whether the expression we matched was a store. No
+ // easy way to do that. However, we can check that the class we found has a
+ // store, which, assuming the value numbering state is not corrupt, is
+ // sufficient, because our store must also be equivalent to that store's
+ // expression for the two to be in the same class.
+ if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
+ return LastStore;
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later; see test32 in
+ // transforms/DeadStoreElimination/simple.ll).
+ if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ LastStore->getOperand(0)) &&
+ (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return LastStore;
+ deleteExpression(LastStore);
+ }
+
+ // If the store is not equivalent to anything, value number it as a store that
+ // produces a unique memory state (instead of using its MemoryUse, we use
+ // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
+}
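+// A hedged IR example of the store-of-load case above (assumed, not from the
+// patch's tests): the store writes back the value just loaded from the same
+// pointer under the same memory state, so it value-numbers with the earlier
+// state rather than producing a new one:
+//   %v = load i32, i32* %p
+//   store i32 %v, i32* %p   ; same pointer leader and same StoreRHS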
+
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) const {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ // Also don't need to coerce if they are the same type; we will just
+ // propagate.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
+ << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+
+ } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+      // If the dependent load's leader is a constant, we can coerce the
+      // constant to the value this load would see.
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
+ << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+
+ } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading from a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load occurs right after a lifetime begin,
+ // then the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero-initializes memory),
+ // then the loaded value is zero.
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
}
- return createStoreExpression(SI, StoreAccess, B);
+ return nullptr;
}
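+// Hedged IR examples of the dependence cases above (assumed, not from tests):
+//   %a = alloca i32
+//   %v = load i32, i32* %a      ; fresh allocation, no stores -> i32 undef
+//
+//   %c = call i8* @calloc(i64 1, i64 4)
+//   %q = bitcast i8* %c to i32*
+//   %z = load i32, i32* %q      ; calloc zero-fills -> i32 0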
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
auto *LI = cast<LoadInst>(I);
// We can eliminate in favor of non-simple loads, but we won't be able to
@@ -775,12 +1356,13 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
if (!LI->isSimple())
return nullptr;
- Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand(), I, B);
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
// Load of undef is undef.
if (isa<UndefValue>(LoadAddressLeader))
return createConstantExpression(UndefValue::get(LI->getType()));
-
- MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(I);
+ MemoryAccess *OriginalAccess = getMemoryAccess(I);
+ MemoryAccess *DefiningAccess =
+ MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
@@ -788,88 +1370,263 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
// If the defining instruction is not reachable, replace with undef.
if (!ReachableBlocks.count(DefiningInst->getParent()))
return createConstantExpression(UndefValue::get(LI->getType()));
+      // This will handle stores and memory insts. We only do this if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
}
}
- const Expression *E =
- createLoadExpression(LI->getType(), LI->getPointerOperand(), LI,
- lookupMemoryAccessEquiv(DefiningAccess), B);
+ const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader,
+ LI, DefiningAccess);
return E;
}
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
+ auto *PWC = dyn_cast<PredicateWithCondition>(PI);
+ if (!PWC)
+ return nullptr;
+
+ auto *CopyOf = I->getOperand(0);
+ auto *Cond = PWC->Condition;
+
+ // If this is a copy of the condition, it must be either true or false
+ // depending on the predicate info type and edge.
+ if (CopyOf == Cond) {
+ // We should not need to add predicate users because the predicate info is
+ // already a use of this operand.
+ if (isa<PredicateAssume>(PI))
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ if (PBranch->TrueEdge)
+ return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+ return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
+ }
+ if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
+ return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
+ }
+
+ // Not a copy of the condition, so see what the predicates tell us about this
+ // value. First, though, we check to make sure the value is actually a copy
+ // of one of the condition operands. It's possible, in certain cases, for it
+ // to be a copy of a predicateinfo copy. In particular, if two branch
+ // operations use the same condition, and one branch dominates the other, we
+ // will end up with a copy of a copy. This is currently a small deficiency in
+ // predicateinfo. What will end up happening here is that we will value
+ // number both copies the same anyway.
+
+ // Everything below relies on the condition being a comparison.
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!Cmp)
+ return nullptr;
+
+ if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
+ DEBUG(dbgs() << "Copy is not of any condition operands!\n");
+ return nullptr;
+ }
+ Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
+ Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
+ bool SwappedOps = false;
+ // Sort the ops
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
+ SwappedOps = true;
+ }
+ CmpInst::Predicate Predicate =
+ SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
+
+ if (isa<PredicateAssume>(PI)) {
+ // If the comparison is true when the operands are equal, then we know the
+ // operands are equal, because assumes must always be true.
+ if (CmpInst::isTrueWhenEqual(Predicate)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createVariableOrConstant(FirstOp);
+ }
+ }
+ if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+    // If we are *not* a copy of the comparison, we may be equal to the other
+ // operand when the predicate implies something about equality of
+ // operations. In particular, if the comparison is true/false when the
+ // operands are equal, and we are on the right edge, we know this operation
+ // is equal to something.
+ if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createVariableOrConstant(FirstOp);
+ }
+ // Handle the special case of floating point.
+ if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
+ (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
+ isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
+ addPredicateUsers(PI, I);
+ addAdditionalUsers(Cmp->getOperand(0), I);
+ return createConstantExpression(cast<Constant>(FirstOp));
+ }
+ }
+ return nullptr;
+}
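+// A hedged illustration of the branch case above (assumed IR, eliding the
+// ssa.copy renames predicateinfo inserts for the cmp operands):
+//   %cmp = icmp eq i32 %x, 7
+//   br i1 %cmp, label %T, label %F
+// T:
+//   %x.0 = call i32 @llvm.ssa.copy.i32(i32 %x)
+// On the true edge of an eq comparison, the copy %x.0 value-numbers to the
+// other operand's leader, here the constant 7.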
+
// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I,
- const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
auto *CI = cast<CallInst>(I);
- if (AA->doesNotAccessMemory(CI))
- return createCallExpression(CI, nullptr, B);
- if (AA->onlyReadsMemory(CI)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    // Intrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
- return createCallExpression(CI, lookupMemoryAccessEquiv(DefiningAccess), B);
+ return createCallExpression(CI, DefiningAccess);
}
return nullptr;
}
-// Update the memory access equivalence table to say that From is equal to To,
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+ auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From is equal to To,
// and return true if this is different from what already existed in the table.
-bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
- DEBUG(dbgs() << "Setting " << *From << " equivalent to ");
- if (!To)
- DEBUG(dbgs() << "itself");
- else
- DEBUG(dbgs() << *To);
- DEBUG(dbgs() << "\n");
- auto LookupResult = MemoryAccessEquiv.find(From);
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ DEBUG(dbgs() << "Setting " << *From);
+ DEBUG(dbgs() << " equivalent to congruence class ");
+ DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
+ DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+
+ auto LookupResult = MemoryAccessToClass.find(From);
bool Changed = false;
// If it's already in the table, see if the value changed.
- if (LookupResult != MemoryAccessEquiv.end()) {
- if (To && LookupResult->second != To) {
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+        // This may have killed the class if it had no non-memory members.
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
// It wasn't equivalent before, and now it is.
- LookupResult->second = To;
- Changed = true;
- } else if (!To) {
- // It used to be equivalent to something, and now it's not.
- MemoryAccessEquiv.erase(LookupResult);
+ LookupResult->second = NewClass;
Changed = true;
}
- } else {
- assert(!To &&
- "Memory equivalence should never change from nothing to something");
}
return Changed;
}
+
+// Determine if an instruction is cycle-free. That means the values in the
+// instruction don't depend on any expressions that can change value as a result
+// of the instruction. For example, a non-cycle free instruction would be v =
+// phi(0, v+1).
+bool NewGVN::isCycleFree(const Instruction *I) const {
+ // In order to compute cycle-freeness, we do SCC finding on the instruction,
+ // and see what kind of SCC it ends up in. If it is a singleton, it is
+ // cycle-free. If it is not in a singleton, it is only cycle free if the
+ // other members are all phi nodes (as they do not compute anything, they are
+ // copies).
+ auto ICS = InstCycleState.lookup(I);
+ if (ICS == ICS_Unknown) {
+ SCCFinder.Start(I);
+ auto &SCC = SCCFinder.getComponentFor(I);
+    // It's cycle-free if its size is 1 or the SCC is *only* phi nodes.
+ if (SCC.size() == 1)
+ InstCycleState.insert({I, ICS_CycleFree});
+ else {
+ bool AllPhis =
+ llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); });
+ ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
+ for (auto *Member : SCC)
+ if (auto *MemberPhi = dyn_cast<PHINode>(Member))
+ InstCycleState.insert({MemberPhi, ICS});
+ }
+ }
+ if (ICS == ICS_Cycle)
+ return false;
+ return true;
+}
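+// Hedged examples (assumed IR): %v below is *not* cycle-free, because its SCC
+// {%v, %v.next} contains a non-phi instruction:
+//   %v = phi i32 [ 0, %entry ], [ %v.next, %loop ]
+//   %v.next = add i32 %v, 1
+// Two mutually dependent phis, by contrast, form a phi-only SCC and are
+// treated as cycle-free.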
+
// Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
- const BasicBlock *B) {
- auto *E = cast<PHIExpression>(createPHIExpression(I));
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+  // AllConstant tracks whether all the *original* phi operands were constant.
+  // This is really shorthand for "this phi cannot cycle due to forward
+  // propagation", as any change in value of the phi is guaranteed not to later
+  // change the value of the phi. IE it can't be v = phi(undef, v+1).
+ bool AllConstant = true;
+ auto *E =
+ cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant));
// We match the semantics of SimplifyPhiNode from InstructionSimplify here.
-
- // See if all arguaments are the same.
+ // See if all arguments are the same.
// We track if any were undef because they need special handling.
bool HasUndef = false;
- auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) {
- if (Arg == I)
- return false;
+ auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
if (isa<UndefValue>(Arg)) {
HasUndef = true;
return false;
}
return true;
});
- // If we are left with no operands, it's undef
+ // If we are left with no operands, it's dead.
if (Filtered.begin() == Filtered.end()) {
- DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
- << "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- return createConstantExpression(UndefValue::get(I->getType()));
+    // If it has undef at this point, it means there are no non-undef arguments,
+ // and thus, the value of the phi node must be undef.
+ if (HasUndef) {
+ DEBUG(dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
+ return createConstantExpression(UndefValue::get(I->getType()));
+ }
+
+ DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ deleteExpression(E);
+ return createDeadExpression();
}
+ unsigned NumOps = 0;
Value *AllSameValue = *(Filtered.begin());
++Filtered.begin();
// Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
- return V == AllSameValue;
+ if (llvm::all_of(Filtered, [&](Value *Arg) {
+ ++NumOps;
+ return Arg == AllSameValue;
})) {
// In LLVM's non-standard representation of phi nodes, it's possible to have
// phi nodes with cycles (IE dependent on other phis that are .... dependent
@@ -881,27 +1638,38 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
// We also special case undef, so that if we have an undef, we can't use the
// common value unless it dominates the phi block.
if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, or all operands are ignored but the undef, it also must be
+ // cycle free.
+ if (!AllConstant && HasBackedge && NumOps > 0 &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
+ return E;
+
// Only have to check for instructions
if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
- if (!DT->dominates(AllSameInst, I))
+ if (!someEquivalentDominates(AllSameInst, I))
return E;
}
-
+ // Can't simplify to something that comes later in the iteration.
+ // Otherwise, when and if it changes congruence class, we will never catch
+ // up. We will always be a class behind it.
+ if (isa<Instruction>(AllSameValue) &&
+ InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
+ return E;
NumGVNPhisAllSame++;
DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
<< "\n");
- E->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
- if (auto *C = dyn_cast<Constant>(AllSameValue))
- return createConstantExpression(C);
- return createVariableExpression(AllSameValue);
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
}
return E;
}
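+// A hedged illustration of the all-same-value case above (assumed IR):
+//   %p = phi i32 [ %a, %bb1 ], [ %a, %bb2 ], [ undef, %bb3 ]
+// With the undef filtered out, every remaining operand is %a, so %p may
+// value-number to %a, but only if some equivalent of %a dominates %p and the
+// cycle-free conditions above are met.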
const Expression *
-NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
- const BasicBlock *B) {
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -931,19 +1699,140 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
// expression.
assert(II->getNumArgOperands() == 2 &&
"Expect two args for recognised intrinsics.");
- return createBinaryExpression(Opcode, EI->getType(),
- II->getArgOperand(0),
- II->getArgOperand(1), B);
+ return createBinaryExpression(
+ Opcode, EI->getType(), II->getArgOperand(0), II->getArgOperand(1));
}
}
}
- return createAggregateValueExpression(I, B);
+ return createAggregateValueExpression(I);
+}
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
+ auto *CI = dyn_cast<CmpInst>(I);
+ // See if our operands are equal to those of a previous predicate, and if so,
+ // if it implies true or false.
+ auto Op0 = lookupOperandLeader(CI->getOperand(0));
+ auto Op1 = lookupOperandLeader(CI->getOperand(1));
+ auto OurPredicate = CI->getPredicate();
+ if (shouldSwapOperands(Op0, Op1)) {
+ std::swap(Op0, Op1);
+ OurPredicate = CI->getSwappedPredicate();
+ }
+
+ // Avoid processing the same info twice
+ const PredicateBase *LastPredInfo = nullptr;
+ // See if we know something about the comparison itself, like it is the target
+ // of an assume.
+ auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+ if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+ if (Op0 == Op1) {
+ // This condition does not depend on predicates, no need to add users
+ if (CI->isTrueWhenEqual())
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+ else if (CI->isFalseWhenEqual())
+ return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+ }
+
+ // NOTE: Because we are comparing both operands here and below, and using
+  // previous comparisons, we rely on the fact that predicateinfo knows to mark
+ // comparisons that use renamed operands as users of the earlier comparisons.
+ // It is *not* enough to just mark predicateinfo renamed operands as users of
+ // the earlier comparisons, because the *other* operand may have changed in a
+ // previous iteration.
+ // Example:
+ // icmp slt %a, %b
+ // %b.0 = ssa.copy(%b)
+ // false branch:
+ // icmp slt %c, %b.0
+
+  // %c and %a may start out equal, and thus, the code below will say the second
+  // icmp is false. %c may become equal to something else, and in that case the
+  // second icmp *must* be reexamined, but would not be if only the renamed
+  // operands are considered users of the icmp.
+
+  // *Currently* we only check one level of comparisons back, and only mark one
+  // level back as touched when changes happen. If you modify this code to look
+  // back farther through comparisons, you *must* mark the appropriate
+  // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
+  // we know something just from the operands themselves.
+
+ // See if our operands have predicate info, so that we may be able to derive
+ // something from a previous comparison.
+ for (const auto &Op : CI->operands()) {
+ auto *PI = PredInfo->getPredicateInfoFor(Op);
+ if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+ if (PI == LastPredInfo)
+ continue;
+ LastPredInfo = PI;
+
+ // TODO: Along the false edge, we may know more things too, like icmp of
+ // same operands is false.
+ // TODO: We only handle actual comparison conditions below, not and/or.
+ auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+ if (!BranchCond)
+ continue;
+ auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+ auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+ auto BranchPredicate = BranchCond->getPredicate();
+ if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+ std::swap(BranchOp0, BranchOp1);
+ BranchPredicate = BranchCond->getSwappedPredicate();
+ }
+ if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+ if (PBranch->TrueEdge) {
+ // If we know the previous predicate is true and we are in the true
+ // edge then we may be implied true or false.
+ if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+
+ if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ }
+
+ } else {
+ // Just handle the ne and eq cases, where if we have the same
+ // operands, we may know something.
+ if (BranchPredicate == OurPredicate) {
+ addPredicateUsers(PI, I);
+          // Same predicate, same ops, we know it was false, so this is false.
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ } else if (BranchPredicate ==
+ CmpInst::getInversePredicate(OurPredicate)) {
+ addPredicateUsers(PI, I);
+ // Inverse predicate, we know the other was false, so this is true.
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+ }
+ }
+ }
+ }
+  // createExpression will take care of simplifyCmpInst.
+ return createExpression(I);
+}
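+// A hedged illustration of the implication logic above (assumed IR, eliding
+// the predicateinfo ssa.copy renames of %a and %b in %T):
+//   %c1 = icmp slt i32 %a, %b
+//   br i1 %c1, label %T, label %F
+// T:                              ; true edge: %a < %b holds here
+//   %c2 = icmp sle i32 %a, %b     ; implied true  -> value-numbers to i1 true
+//   %c3 = icmp eq i32 %a, %b      ; implied false -> value-numbers to i1 false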
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+// TODO: Separate cost from availability
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
}
// Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V,
- const BasicBlock *B) {
+const Expression *
+NewGVN::performSymbolicEvaluation(Value *V,
+ SmallPtrSetImpl<Value *> &Visited) const {
const Expression *E = nullptr;
if (auto *C = dyn_cast<Constant>(V))
E = createConstantExpression(C);
@@ -957,24 +1846,27 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
switch (I->getOpcode()) {
case Instruction::ExtractValue:
case Instruction::InsertValue:
- E = performSymbolicAggrValueEvaluation(I, B);
+ E = performSymbolicAggrValueEvaluation(I);
break;
case Instruction::PHI:
- E = performSymbolicPHIEvaluation(I, B);
+ E = performSymbolicPHIEvaluation(I);
break;
case Instruction::Call:
- E = performSymbolicCallEvaluation(I, B);
+ E = performSymbolicCallEvaluation(I);
break;
case Instruction::Store:
- E = performSymbolicStoreEvaluation(I, B);
+ E = performSymbolicStoreEvaluation(I);
break;
case Instruction::Load:
- E = performSymbolicLoadEvaluation(I, B);
+ E = performSymbolicLoadEvaluation(I);
break;
case Instruction::BitCast: {
- E = createExpression(I, B);
+ E = createExpression(I);
+ } break;
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ E = performSymbolicCmpEvaluation(I);
} break;
-
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -993,8 +1885,6 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -1011,7 +1901,7 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::InsertElement:
case Instruction::ShuffleVector:
case Instruction::GetElementPtr:
- E = createExpression(I, B);
+ E = createExpression(I);
break;
default:
return nullptr;
@@ -1020,147 +1910,308 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
return E;
}
-// There is an edge from 'Src' to 'Dst'. Return true if every path from
-// the entry block to 'Dst' passes via this edge. In particular 'Dst'
-// must not be reachable via another edge from 'Src'.
-bool NewGVN::isOnlyReachableViaThisEdge(const BasicBlockEdge &E) const {
+// Look up a container in a map, and then call a function for each thing in the
+// found container.
+template <typename Map, typename KeyType, typename Func>
+void NewGVN::for_each_found(Map &M, const KeyType &Key, Func F) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end())
+ for (typename Map::mapped_type::value_type Mapped : Result->second)
+ F(Mapped);
+}
+
+// Look up a container of values/instructions in a map, and touch all the
+// instructions in the container. Then erase the value from the map.
+template <typename Map, typename KeyType>
+void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end()) {
+ for (const typename Map::mapped_type::value_type Mapped : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(Mapped));
+ M.erase(Result);
+ }
+}
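+// Usage sketch (hedged): when a value V changes congruence class, everything
+// recorded as an extra user of V must be revisited, e.g.
+//   touchAndErase(AdditionalUsers, V);
+// sets the TouchedInstructions bit for each recorded user and then drops the
+// entry, so the user set is rebuilt on the next evaluation (see
+// markUsersTouched below).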
- // While in theory it is interesting to consider the case in which Dst has
- // more than one predecessor, because Dst might be part of a loop which is
- // only reachable from Src, in practice it is pointless since at the time
- // GVN runs all such loops have preheaders, which means that Dst will have
- // been changed to have only one predecessor, namely Src.
- const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
- const BasicBlock *Src = E.getStart();
- assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
- (void)Src;
- return Pred != nullptr;
+void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
+ if (isa<Instruction>(To))
+ AdditionalUsers[To].insert(User);
}
void NewGVN::markUsersTouched(Value *V) {
// Now mark the users as touched.
for (auto *User : V->users()) {
assert(isa<Instruction>(User) && "Use of value not within an instruction?");
- TouchedInstructions.set(InstrDFS[User]);
+ TouchedInstructions.set(InstrToDFSNum(User));
}
+ touchAndErase(AdditionalUsers, V);
}
-void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
- for (auto U : MA->users()) {
- if (auto *MUD = dyn_cast<MemoryUseOrDef>(U))
- TouchedInstructions.set(InstrDFS[MUD->getMemoryInst()]);
- else
- TouchedInstructions.set(InstrDFS[U]);
- }
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
+ DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+ TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+ if (isa<MemoryUse>(MA))
+ return;
+ for (auto U : MA->users())
+ TouchedInstructions.set(MemoryToDFSNum(U));
+ touchAndErase(MemoryToUsers, MA);
+}
+
+// Add I to the set of users of a given predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
+ // Don't add temporary instructions to the user lists.
+ if (AllTempInstructions.count(I))
+ return;
+
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PBranch->Condition].insert(I);
+ else if (auto *PAssume = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ touchAndErase(PredicateToUsers, I);
+}
+
+// Mark users affected by a memory leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->memory())
+ markMemoryDefTouched(M);
}
// Touch the instructions that need to be updated after a congruence class has a
// leader change, and mark changed values.
-void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : CC->Members) {
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrDFS[I]);
+ TouchedInstructions.set(InstrToDFSNum(I));
LeaderChanges.insert(M);
}
}
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+ std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : R) {
+ auto DFSNum = InstrToDFSNum(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ return MinDFS.first;
+}
+
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+  // TODO: If this ends up too slow, we can maintain a next memory leader like
+  // we do for regular leaders.
+  // Make sure there will be a leader to find.
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
+ if (CC->getStoreCount() > 0) {
+ if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+ return getMemoryAccess(NL);
+ // Find the store with the minimum DFS number.
+ auto *V = getMinDFSOfRange<Value>(make_filter_range(
+ *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+ return getMemoryAccess(cast<StoreInst>(V));
+ }
+ assert(CC->getStoreCount() == 0);
+
+ // Given our assertion, hitting this part must mean
+ // !OldClass->memory_empty()
+ if (CC->memory_size() == 1)
+ return *CC->memory_begin();
+ return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away. This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the TOP class because everything either gets out of it or is
+ // unreachable.
+
+ if (CC->size() == 1 || CC == TOPClass) {
+ return *(CC->begin());
+ } else if (CC->getNextLeader().first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ return CC->getNextLeader().first;
+ } else {
+ ++NumGVNSortedLeaderChanges;
+    // NOTE: If this ends up too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or use SparseBitVector or ....
+ return getMinDFSOfRange<Value>(*CC);
+ }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc. for the move.
+//
+// The invariants of this function are:
+//
+// - I must be moving to NewClass from OldClass
+// - The StoreCount of OldClass and NewClass is expected to have been updated
+//   for I already if it is a store.
+// - The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+  // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
+ MemoryAccessToClass.lookup(InstMA)) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched if we didn't just create a singleton
+ DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (!OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
+
// Move a value, currently in OldClass, to be part of NewClass
-// Update OldClass for the move (including changing leaders, etc)
-void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
CongruenceClass *OldClass,
CongruenceClass *NewClass) {
- DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
- << "\n");
-
- if (I == OldClass->NextLeader.first)
- OldClass->NextLeader = {nullptr, ~0U};
-
- // It's possible, though unlikely, for us to discover equivalences such
- // that the current leader does not dominate the old one.
- // This statistic tracks how often this happens.
- // We assert on phi nodes when this happens, currently, for debugging, because
- // we want to make sure we name phi node cycles properly.
- if (isa<Instruction>(NewClass->RepLeader) && NewClass->RepLeader &&
- I != NewClass->RepLeader &&
- DT->properlyDominates(
- I->getParent(),
- cast<Instruction>(NewClass->RepLeader)->getParent())) {
- ++NumGVNNotMostDominatingLeader;
- assert(!isa<PHINode>(I) &&
- "New class for instruction should not be dominated by instruction");
- }
-
- if (NewClass->RepLeader != I) {
- auto DFSNum = InstrDFS.lookup(I);
- if (DFSNum < NewClass->NextLeader.second)
- NewClass->NextLeader = {I, DFSNum};
- }
-
- OldClass->Members.erase(I);
- NewClass->Members.insert(I);
- if (isa<StoreInst>(I)) {
- --OldClass->StoreCount;
- assert(OldClass->StoreCount >= 0);
- ++NewClass->StoreCount;
- assert(NewClass->StoreCount > 0);
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+    // as the leader.
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ NewClass->setStoredValue(SE->getStoredValue());
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from " << *NewClass->getLeader()
+ << " to " << *SI << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
}
+  // True if there are no memory instructions left in a class that had memory
+  // instructions before.
+  // If it's not a memory use, set the MemoryAccess equivalence.
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
ValueToClass[I] = NewClass;
// See if we destroyed the class or need to swap leaders.
- if (OldClass->Members.empty() && OldClass != InitialClass) {
- if (OldClass->DefiningExpr) {
- OldClass->Dead = true;
- DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
<< " from table\n");
- ExpressionToClass.erase(OldClass->DefiningExpr);
+ // We erase it as an exact expression to make sure we don't just erase an
+ // equivalent one.
+ auto Iter = ExpressionToClass.find_as(
+ ExactEqualsExpression(*OldClass->getDefiningExpr()));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+#ifdef EXPENSIVE_CHECKS
+ assert(
+ (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
+ "We erased the expression we just inserted, which should not happen");
+#endif
}
- } else if (OldClass->RepLeader == I) {
+ } else if (OldClass->getLeader() == I) {
// When the leader changes, the value numbering of
// everything may change due to symbolization changes, so we need to
// reprocess.
- DEBUG(dbgs() << "Leader change!\n");
+ DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
+ << "\n");
++NumGVNLeaderChanges;
- // We don't need to sort members if there is only 1, and we don't care about
- // sorting the initial class because everything either gets out of it or is
- // unreachable.
- if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
- OldClass->RepLeader = *(OldClass->Members.begin());
- } else if (OldClass->NextLeader.first) {
- ++NumGVNAvoidedSortedLeaderChanges;
- OldClass->RepLeader = OldClass->NextLeader.first;
- OldClass->NextLeader = {nullptr, ~0U};
- } else {
- ++NumGVNSortedLeaderChanges;
- // TODO: If this ends up to slow, we can maintain a dual structure for
- // member testing/insertion, or keep things mostly sorted, and sort only
- // here, or ....
- std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
- for (const auto X : OldClass->Members) {
- auto DFSNum = InstrDFS.lookup(X);
- if (DFSNum < MinDFS.second)
- MinDFS = {X, DFSNum};
- }
- OldClass->RepLeader = MinDFS.first;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
}
- markLeaderChangeTouched(OldClass);
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
}
}
+// For a given expression, mark the phi of ops instructions that could have
+// changed as a result.
+void NewGVN::markPhiOfOpsChanged(const Expression *E) {
+ touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E));
+}
+
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
- ValueToExpression[I] = E;
// This is guaranteed to return something, since it will at least find
- // INITIAL.
+ // TOP.
- CongruenceClass *IClass = ValueToClass[I];
+ CongruenceClass *IClass = ValueToClass.lookup(I);
assert(IClass && "Should have found a IClass");
// Dead classes should have been eliminated from the mapping.
- assert(!IClass->Dead && "Found a dead class");
+ assert(!IClass->isDead() && "Found a dead class");
- CongruenceClass *EClass;
+ CongruenceClass *EClass = nullptr;
if (const auto *VE = dyn_cast<VariableExpression>(E)) {
- EClass = ValueToClass[VE->getVariableValue()];
- } else {
+ EClass = ValueToClass.lookup(VE->getVariableValue());
+ } else if (isa<DeadExpression>(E)) {
+ EClass = TOPClass;
+ }
+ if (!EClass) {
auto lookupResult = ExpressionToClass.insert({E, nullptr});
// If it's not in the value table, create a new congruence class.
@@ -1171,80 +2222,73 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
// Constants and variables should always be made the leader.
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- NewClass->RepLeader = CE->getConstantValue();
+ NewClass->setLeader(CE->getConstantValue());
} else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
StoreInst *SI = SE->getStoreInst();
- NewClass->RepLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(SE->getStoredValue());
+ // The RepMemoryAccess field will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
} else {
- NewClass->RepLeader = I;
+ NewClass->setLeader(I);
}
assert(!isa<VariableExpression>(E) &&
"VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at " << NewClass->ID
- << " and leader " << *(NewClass->RepLeader) << "\n");
- DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
+ << " using expression " << *E << " at " << NewClass->getID()
+ << " and leader " << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
+ DEBUG(dbgs() << "\n");
} else {
EClass = lookupResult.first->second;
if (isa<ConstantExpression>(E))
- assert(isa<Constant>(EClass->RepLeader) &&
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
"Any class with a constant expression should have a "
"constant leader");
assert(EClass && "Somehow don't have an eclass");
- assert(!EClass->Dead && "We accidentally looked up a dead class");
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
}
}
bool ClassChanged = IClass != EClass;
bool LeaderChanged = LeaderChanges.erase(I);
if (ClassChanged || LeaderChanged) {
- DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
+ DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
<< "\n");
+ if (ClassChanged) {
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
+ markPhiOfOpsChanged(E);
+ }
- if (ClassChanged)
- moveValueToNewCongruenceClass(I, IClass, EClass);
markUsersTouched(I);
- if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
- // If this is a MemoryDef, we need to update the equivalence table. If
- // we determined the expression is congruent to a different memory
- // state, use that different memory state. If we determined it didn't,
- // we update that as well. Right now, we only support store
- // expressions.
- if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
- EClass->Members.size() != 1) {
- auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
- setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
- } else {
- setMemoryAccessEquivTo(MA, nullptr);
- }
+ if (MemoryAccess *MA = getMemoryAccess(I))
markMemoryUsersTouched(MA);
- }
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
- // There is, sadly, one complicating thing for stores. Stores do not
- // produce values, only consume them. However, in order to make loads and
- // stores value number the same, we ignore the value operand of the store.
- // But the value operand will still be the leader of our class, and thus, it
- // may change. Because the store is a use, the store will get reprocessed,
- // but nothing will change about it, and so nothing above will catch it
- // (since the class will not change). In order to make sure everything ends
- // up okay, we need to recheck the leader of the class. Since stores of
- // different values value number differently due to different memorydefs, we
- // are guaranteed the leader is always the same between stores in the same
- // class.
- DEBUG(dbgs() << "Checking store leader\n");
- auto ProperLeader =
- lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
- if (EClass->RepLeader != ProperLeader) {
- DEBUG(dbgs() << "Store leader changed, fixing\n");
- EClass->RepLeader = ProperLeader;
- markLeaderChangeTouched(EClass);
- markMemoryUsersTouched(MSSA->getMemoryAccess(SI));
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
+ }
+ // If we changed the class of the store, we want to ensure nothing finds the
+  // old store expression. In particular, loads do not compare against the
+  // stored value, so they will find old store expressions (and associated class
+ // mappings) if we leave them in the table.
+ if (ClassChanged && isa<StoreInst>(I)) {
+ auto *OldE = ValueToExpression.lookup(I);
+ // It could just be that the old class died. We don't want to erase it if we
+ // just moved classes.
+ if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
+ // Erase this as an exact expression to ensure we don't erase expressions
+ // equivalent to it.
+ auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
}
}
+ ValueToExpression[I] = E;
}
// Process the fact that Edge (from, to) is reachable, including marking
@@ -1266,25 +2310,26 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
// impact predicates. Otherwise, only mark the phi nodes as touched, as
// they are the only thing that depend on new edges. Anything using their
// values will get propagated to if necessary.
- if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To))
- TouchedInstructions.set(InstrDFS[MemPhi]);
+ if (MemoryAccess *MemPhi = getMemoryAccess(To))
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
auto BI = To->begin();
while (isa<PHINode>(BI)) {
- TouchedInstructions.set(InstrDFS[&*BI]);
+ TouchedInstructions.set(InstrToDFSNum(&*BI));
++BI;
}
+ for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) {
+ TouchedInstructions.set(InstrToDFSNum(I));
+ });
}
}
}
// Given a predicate condition (from a switch, cmp, or whatever) and a block,
// see if we know some constant value for it already.
-Value *NewGVN::findConditionEquivalence(Value *Cond, BasicBlock *B) const {
- auto Result = lookupOperandLeader(Cond, nullptr, B);
- if (isa<Constant>(Result))
- return Result;
- return nullptr;
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
+ return isa<Constant>(Result) ? Result : nullptr;
}
// Process the outgoing edges of a block for reachability.
@@ -1293,10 +2338,10 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
BranchInst *BR;
if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
Value *Cond = BR->getCondition();
- Value *CondEvaluated = findConditionEquivalence(Cond, B);
+ Value *CondEvaluated = findConditionEquivalence(Cond);
if (!CondEvaluated) {
if (auto *I = dyn_cast<Instruction>(Cond)) {
- const Expression *E = createExpression(I, B);
+ const Expression *E = createExpression(I);
if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
CondEvaluated = CE->getConstantValue();
}
@@ -1329,13 +2374,13 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
Value *SwitchCond = SI->getCondition();
- Value *CondEvaluated = findConditionEquivalence(SwitchCond, B);
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
// See if we were able to turn this switch statement into a constant.
if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
auto *CondVal = cast<ConstantInt>(CondEvaluated);
// We should be able to get case value for this.
- auto CaseVal = SI->findCaseValue(CondVal);
- if (CaseVal.getCaseSuccessor() == SI->getDefaultDest()) {
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
// We proved the value is outside of the range of the case.
// We can't do anything other than mark the default dest as reachable,
// and go home.
@@ -1343,7 +2388,7 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
return;
}
// Now get where it goes and mark it reachable.
- BasicBlock *TargetBlock = CaseVal.getCaseSuccessor();
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
updateReachableEdge(B, TargetBlock);
} else {
for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
@@ -1361,45 +2406,215 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
}
// This also may be a memory defining terminator, in which case, set it
- // equivalent to nothing.
- if (MemoryAccess *MA = MSSA->getMemoryAccess(TI))
- setMemoryAccessEquivTo(MA, nullptr);
+ // equivalent only to itself.
+ //
+ auto *MA = getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
}
}
-// The algorithm initially places the values of the routine in the INITIAL
-// congruence
-// class. The leader of INITIAL is the undetermined value `TOP`.
-// When the algorithm has finished, values still in INITIAL are unreachable.
+void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
+ Instruction *ExistingValue) {
+ InstrDFS[Op] = InstrToDFSNum(ExistingValue);
+ AllTempInstructions.insert(Op);
+ PHIOfOpsPHIs[BB].push_back(Op);
+ TempToBlock[Op] = BB;
+ RealToTemp[ExistingValue] = Op;
+}
+
+static bool okayForPHIOfOps(const Instruction *I) {
+ return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
+ isa<LoadInst>(I);
+}
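+// A hedged before/after sketch of the phi-of-ops transform (assumed IR):
+//   %p = phi i32 [ %a, %L ], [ %b, %R ]
+//   %x = add i32 %p, 1                 ; op of phis
+// becomes, when leaders exist for the translated ops in each predecessor,
+//   %x.phiofops = phi i32 [ %a.plus1, %L ], [ %b.plus1, %R ]
+// where %a.plus1 and %b.plus1 are the hypothetical leaders found for
+// "add i32 %a, 1" and "add i32 %b, 1".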
+
+// When we see an instruction that is an op of phis, generate the equivalent phi
+// of ops form.
+const Expression *
+NewGVN::makePossiblePhiOfOps(Instruction *I,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!okayForPHIOfOps(I))
+ return nullptr;
+
+ if (!Visited.insert(I).second)
+ return nullptr;
+ // For now, we require the instruction be cycle free because we don't
+ // *always* create a phi of ops for instructions that could be done as phi
+  // of ops; we only do it if we think it is useful. If we did do it all the
+ // time, we could remove the cycle free check.
+ if (!isCycleFree(I))
+ return nullptr;
+
+ unsigned IDFSNum = InstrToDFSNum(I);
+ SmallPtrSet<const Value *, 8> ProcessedPHIs;
+ // TODO: We don't do phi translation on memory accesses because it's
+ // complicated. For a load, we'd need to be able to simulate a new memoryuse,
+ // which we don't have a good way of doing ATM.
+ auto *MemAccess = getMemoryAccess(I);
+ // If the memory operation is defined by a memory operation in this block
+ // that isn't a MemoryPhi, transforming the pointer backwards through a
+ // scalar phi can't help, as it would still be killed by that memory
+ // operation.
+ if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
+ MemAccess->getDefiningAccess()->getBlock() == I->getParent())
+ return nullptr;
+
+ // Convert op of phis to phi of ops
+ for (auto &Op : I->operands()) {
+ // TODO: We can't handle expressions that must be recursively translated
+ // IE
+ // a = phi (b, c)
+ // f = use a
+ // g = f + phi of something
+ // To properly make a phi of ops for g, we'd have to properly translate and
+ // use the instruction for f. We should add this by splitting out the
+ // instruction creation we do below.
+ if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op)))
+ return nullptr;
+ if (!isa<PHINode>(Op))
+ continue;
+ auto *OpPHI = cast<PHINode>(Op);
+ // No point in doing this for one-operand phis.
+ if (OpPHI->getNumOperands() == 1)
+ continue;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ for (auto PredBB : OpPHI->blocks()) {
+ Value *FoundVal = nullptr;
+ // We could just skip unreachable edges entirely, but it's tricky to do
+ // while also rewriting the existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it, and see if we
+ // have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+
+ for (auto &Op : ValueOp->operands()) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ // When this operand changes, it could change whether there is a
+ // leader for us or not.
+ addAdditionalUsers(Op, I);
+ }
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(ValueOp);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops for.
+ InstrDFS.insert({ValueOp, IDFSNum});
+ const Expression *E = performSymbolicEvaluation(ValueOp, Visited);
+ InstrDFS.erase(ValueOp);
+ AllTempInstructions.erase(ValueOp);
+ // Erase the TempToMemory entry before deleting the instruction, so we
+ // never use the dangling pointer as a map key.
+ if (MemAccess)
+ TempToMemory.erase(ValueOp);
+ ValueOp->deleteValue();
+ if (!E)
+ return nullptr;
+ FoundVal = findPhiOfOpsLeader(E, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(I);
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ } else {
+ DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ }
+
+ Ops.push_back({FoundVal, PredBB});
+ DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands());
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : Ops)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ unsigned int i = 0;
+ for (auto PHIOp : Ops) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+
+ DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+ return performSymbolicEvaluation(ValuePHI, Visited);
+ }
+ return nullptr;
+}
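
Conceptually, makePossiblePhiOfOps rewrites `x = op(phi(a, b))` as `phi(op(a), op(b))`, but only when each translated operation already has a leader in the corresponding predecessor; otherwise it bails. A minimal standalone sketch of that translate-and-look-up flow, using toy string-keyed tables rather than LLVM's expression machinery (all names hypothetical):

    #include <iostream>
    #include <map>
    #include <string>

    // Toy phi: incoming value per predecessor block.
    using Phi = std::map<std::string, std::string>;

    int main() {
      // Source: %p = phi [%a, pred1], [%b, pred2];  %x = add %p, 1
      Phi P = {{"pred1", "%a"}, {"pred2", "%b"}};
      // Leaders the value table already knows for the translated ops.
      std::map<std::string, std::string> Leader = {{"add %a, 1", "%t1"},
                                                   {"add %b, 1", "%t2"}};
      Phi PhiOfOps;
      for (const auto &In : P) {
        // Phi-translate the operand, then require an existing leader;
        // if any incoming edge lacks one, the transformation is abandoned.
        auto It = Leader.find("add " + In.second + ", 1");
        if (It == Leader.end()) {
          std::cout << "no leader for " << In.first << ", bailing\n";
          return 0;
        }
        PhiOfOps[In.first] = It->second;
      }
      // Result: %x = phi [%t1, pred1], [%t2, pred2]
      for (const auto &In : PhiOfOps)
        std::cout << "incoming from " << In.first << ": " << In.second << "\n";
    }
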
+
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
void NewGVN::initializeCongruenceClasses(Function &F) {
- // FIXME now i can't remember why this is 2
- NextCongruenceNum = 2;
- // Initialize all other instructions to be in INITIAL class.
- CongruenceClass::MemberSet InitialValues;
- InitialClass = createCongruenceClass(nullptr, nullptr);
- for (auto &B : F) {
- if (auto *MP = MSSA->getMemoryAccess(&B))
- MemoryAccessEquiv.insert({MP, MSSA->getLiveOnEntryDef()});
-
- for (auto &I : B) {
- InitialValues.insert(&I);
- ValueToClass[&I] = InitialClass;
- // All memory accesses are equivalent to live on entry to start. They must
- // be initialized to something so that initial changes are noticed. For
- // the maximal answer, we initialize them all to be the same as
- // liveOnEntry. Note that to save time, we only initialize the
- // MemoryDef's for stores and all MemoryPhis to be equal. Right now, no
- // other expression can generate a memory equivalence. If we start
- // handling memcpy/etc, we can expand this.
- if (isa<StoreInst>(&I)) {
- MemoryAccessEquiv.insert(
- {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()});
- ++InitialClass->StoreCount;
- assert(InitialClass->StoreCount > 0);
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is top if we want to know if
+ // it is equivalent to everything. Otherwise, what this really signifies is
+ // that the access reaches all the way back to the beginning of the
+ // function.
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class.
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
+ for (auto DTN : nodes(DT)) {
+ BasicBlock *BB = DTN->getBlock();
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
+
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
}
+ for (auto &I : *BB) {
+ // TODO: Move to helper
+ if (isa<PHINode>(&I))
+ for (auto *U : I.users())
+ if (auto *UInst = dyn_cast<Instruction>(U))
+ if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
+ PHINodeUses.insert(UInst);
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
}
}
- InitialClass->Members.swap(InitialValues);
// Initialize arguments to be in their own unique congruence classes
for (auto &FA : F.args())
@@ -1408,45 +2623,79 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
void NewGVN::cleanupTables() {
for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->ID << " has "
- << CongruenceClasses[i]->Members.size() << " members\n");
+ DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size() << " members\n");
// Make sure we delete the congruence class (probably worth switching to
 // a unique_ptr at some point).
delete CongruenceClasses[i];
CongruenceClasses[i] = nullptr;
}
+ // Destroy the value expressions
+ SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
+ AllTempInstructions.end());
+ AllTempInstructions.clear();
+
+ // We have to drop all references for everything first, so there are no uses
+ // left as we delete them.
+ for (auto *I : TempInst) {
+ I->dropAllReferences();
+ }
+
+ while (!TempInst.empty()) {
+ auto *I = TempInst.back();
+ TempInst.pop_back();
+ I->deleteValue();
+ }
+
ValueToClass.clear();
ArgRecycler.clear(ExpressionAllocator);
ExpressionAllocator.Reset();
CongruenceClasses.clear();
ExpressionToClass.clear();
ValueToExpression.clear();
+ RealToTemp.clear();
+ AdditionalUsers.clear();
+ ExpressionToPhiOfOps.clear();
+ TempToBlock.clear();
+ TempToMemory.clear();
+ PHIOfOpsPHIs.clear();
ReachableBlocks.clear();
ReachableEdges.clear();
#ifndef NDEBUG
ProcessedCount.clear();
#endif
- DFSDomMap.clear();
InstrDFS.clear();
InstructionsToErase.clear();
-
DFSToInstr.clear();
BlockInstRange.clear();
TouchedInstructions.clear();
- DominatedInstRange.clear();
- MemoryAccessEquiv.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
}
+// Assign a local DFS number mapping to instructions, and leave space for
+// value PHIs.
std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
unsigned Start) {
unsigned End = Start;
- if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(B)) {
+ if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
InstrDFS[MemPhi] = End++;
DFSToInstr.emplace_back(MemPhi);
}
+ // Then the real block goes next.
for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
InstrDFS[&I] = End++;
DFSToInstr.emplace_back(&I);
}
@@ -1457,12 +2706,12 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
return std::make_pair(Start, End);
}
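
The numbering scheme above reserves DFS number 0 as a sentinel meaning "trivially dead, never value-number this again", which is why the table starts at 1. A minimal standalone sketch of that convention (toy instruction strings, not LLVM types):

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      // Instructions in block order; 'true' marks ones known trivially dead.
      std::vector<std::pair<std::string, bool>> Block = {
          {"a = add x, y", false}, {"dead = mul a, 0", true}, {"ret a", false}};
      std::map<std::string, unsigned> InstrDFS;
      std::vector<std::string> DFSToInstr = {"<sentinel>"}; // slot 0 reserved
      unsigned End = 1;
      for (const auto &[Inst, Dead] : Block) {
        if (Dead) {
          InstrDFS[Inst] = 0; // 0 means "skip during value numbering"
          continue;
        }
        InstrDFS[Inst] = End++;
        DFSToInstr.push_back(Inst);
      }
      for (const auto &[Inst, Num] : InstrDFS)
        std::cout << Num << " <- " << Inst << "\n";
    }
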
-void NewGVN::updateProcessedCount(Value *V) {
+void NewGVN::updateProcessedCount(const Value *V) {
#ifndef NDEBUG
if (ProcessedCount.count(V) == 0) {
ProcessedCount.insert({V, 1});
} else {
- ProcessedCount[V] += 1;
+ ++ProcessedCount[V];
assert(ProcessedCount[V] < 100 &&
"Seem to have processed the same Value a lot");
}
@@ -1471,27 +2720,35 @@ void NewGVN::updateProcessedCount(Value *V) {
// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// If all the arguments are the same, the MemoryPhi has the same value as the
- // argument.
- // Filter out unreachable blocks from our operands.
+ // argument. Filter out unreachable blocks and self phis from our operands.
+ // TODO: We could do cycle-checking on the memory phis to allow valueizing for
+ // self-phi checking.
+ const BasicBlock *PHIBlock = MP->getBlock();
auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
+ return cast<MemoryAccess>(U) != MP &&
+ !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
});
-
- assert(Filtered.begin() != Filtered.end() &&
- "We should not be processing a MemoryPhi in a completely "
- "unreachable block");
+ // If all that is left is nothing, our MemoryPhi is undef; we keep it in
+ // TOPClass. Note: the only case where this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
// Transform the remaining operands into operand leaders.
// FIXME: mapped_iterator should have a range version.
auto LookupFunc = [&](const Use &U) {
- return lookupMemoryAccessEquiv(cast<MemoryAccess>(U));
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
};
auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
// and now check if all the elements are equal.
// Sadly, we can't use std::equals since these are random access iterators.
- MemoryAccess *AllSameValue = *MappedBegin;
+ const auto *AllSameValue = *MappedBegin;
++MappedBegin;
bool AllEqual = std::all_of(
MappedBegin, MappedEnd,
@@ -1501,8 +2758,18 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
else
DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
-
- if (setMemoryAccessEquivTo(MP, AllEqual ? AllSameValue : nullptr))
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
markMemoryUsersTouched(MP);
}
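
The MemoryPhi logic above is a filter-then-compare: drop self operands and operands on unreachable edges, then check whether everything left is the same access. A minimal standalone sketch of that decision (toy structs, not MemorySSA):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      struct Incoming { std::string Access; bool EdgeReachable; bool SelfCycle; };
      std::vector<Incoming> Ops = {{"store1", true, false},
                                   {"phi", true, true},       // self operand
                                   {"store1", false, false}}; // dead edge
      std::vector<std::string> Filtered;
      for (const auto &Op : Ops)
        if (Op.EdgeReachable && !Op.SelfCycle)
          Filtered.push_back(Op.Access);
      if (Filtered.empty()) {
        std::cout << "phi is undef (only self/unreachable operands)\n";
      } else if (std::all_of(
                     Filtered.begin(), Filtered.end(),
                     [&](const std::string &A) { return A == Filtered[0]; })) {
        std::cout << "phi value-numbers to " << Filtered[0] << "\n";
      } else {
        std::cout << "phi is its own leader\n";
      }
    }
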
@@ -1510,13 +2777,23 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
// congruence finding, and updating mappings.
void NewGVN::valueNumberInstruction(Instruction *I) {
DEBUG(dbgs() << "Processing instruction " << *I << "\n");
- if (isInstructionTriviallyDead(I, TLI)) {
- DEBUG(dbgs() << "Skipping unused instruction\n");
- markInstructionForDeletion(I);
- return;
- }
if (!I->isTerminator()) {
- const auto *Symbolized = performSymbolicEvaluation(I, I->getParent());
+ const Expression *Symbolized = nullptr;
+ SmallPtrSet<Value *, 2> Visited;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I, Visited);
+ // Make a phi of ops if necessary
+ if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
+ !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
+ auto *PHIE = makePossiblePhiOfOps(I, Visited);
+ if (PHIE)
+ Symbolized = PHIE;
+ }
+
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
// If we couldn't come up with a symbolic expression, use the unknown
// expression
if (Symbolized == nullptr)
@@ -1524,7 +2801,8 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
performCongruenceFinding(I, Symbolized);
} else {
// Handle terminators that return values. All of them produce values we
- // don't currently understand.
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
if (!I->getType()->isVoidTy()) {
auto *Symbolized = createUnknownExpression(I);
performCongruenceFinding(I, Symbolized);
@@ -1535,76 +2813,126 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
// Check if there is a path, using single or equal argument phi nodes, from
// First to Second.
-bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
- const MemoryAccess *Second) const {
+bool NewGVN::singleReachablePHIPath(
+ SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
+ const MemoryAccess *Second) const {
if (First == Second)
return true;
-
- if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) {
- auto *DefAccess = FirstDef->getDefiningAccess();
- return singleReachablePHIPath(DefAccess, Second);
- } else {
- auto *MP = cast<MemoryPhi>(First);
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(MP->getIncomingBlock(U));
- };
- auto FilteredPhiArgs =
- make_filter_range(MP->operands(), ReachableOperandPred);
- SmallVector<const Value *, 32> OperandList;
- std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
- std::back_inserter(OperandList));
- bool Okay = OperandList.size() == 1;
- if (!Okay)
- Okay = std::equal(OperandList.begin(), OperandList.end(),
- OperandList.begin());
- if (Okay)
- return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+ if (MSSA->isLiveOnEntryDef(First))
return false;
+
+ // This is not perfect, but as we're just verifying here, we can live with
+ // the loss of precision. The real solution would be to do strongly
+ // connected component finding in this routine, and it's probably not worth
+ // the complexity for the time being. So, we just keep a set of visited
+ // MemoryAccesses and return true when we hit a cycle.
+ if (Visited.count(First))
+ return true;
+ Visited.insert(First);
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
}
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(OperandList));
+ bool Okay = OperandList.size() == 1;
+ if (!Okay)
+ Okay =
+ std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
+ if (Okay)
+ return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
+ Second);
+ return false;
}
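
The visited-set trick above trades precision for termination: on a cycle the walk simply answers "reachable". A minimal standalone sketch of the same guard over a toy defining-access map (all names hypothetical):

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>

    // Follow defining accesses from First toward Second; the visited set makes
    // the walk terminate (optimistically returning true) if the chain cycles.
    bool reaches(const std::map<std::string, std::string> &DefiningAccess,
                 std::set<std::string> &Visited, const std::string &First,
                 const std::string &Second) {
      if (First == Second)
        return true;
      if (!Visited.insert(First).second)
        return true; // hit a cycle; accept, losing a little precision
      auto It = DefiningAccess.find(First);
      if (It == DefiningAccess.end())
        return false; // reached live-on-entry
      return reaches(DefiningAccess, Visited, It->second, Second);
    }

    int main() {
      std::map<std::string, std::string> Def = {
          {"use1", "def2"}, {"def2", "def3"}, {"def3", "def2"}}; // def2<->def3
      std::set<std::string> Visited;
      std::cout << reaches(Def, Visited, "use1", "def9") << "\n"; // 1 via guard
    }
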
// Verify that the memory equivalence table makes sense relative to the
// congruence classes. Note that this checking is not perfect, and is currently
-// subject to very rare false negatives. It is only useful for testing/debugging.
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
void NewGVN::verifyMemoryCongruency() const {
- // Anything equivalent in the memory access table should be in the same
+#ifndef NDEBUG
+ // Verify that the memory table equivalence and memory member set match
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a leader should have a "
+ "representative stored value");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should have a "
+ "representative access");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
// congruence class.
// Filter out the unreachable and trivially dead entries, because they may
// never have been updated if the instructions were not processed.
auto ReachableAccessPred =
- [&](const std::pair<const MemoryAccess *, MemoryAccess *> Pair) {
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
bool Result = ReachableBlocks.count(Pair.first->getBlock());
- if (!Result)
+ if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
+ MemoryToDFSNum(Pair.first) == 0)
return false;
if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.get())) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
return true;
};
- auto Filtered = make_filter_range(MemoryAccessEquiv, ReachableAccessPred);
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
for (auto KV : Filtered) {
- assert(KV.first != KV.second &&
- "We added a useless equivalence to the memory equivalence table");
- // Unreachable instructions may not have changed because we never process
- // them.
- if (!ReachableBlocks.count(KV.first->getBlock()))
- continue;
if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
- auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
- if (FirstMUD && SecondMUD)
- assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
- ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
- ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
- "The instructions for these memory operations should have "
- "been in the same congruence class or reachable through"
- "a single argument phi");
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
+ if (FirstMUD && SecondMUD) {
+ SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
+ assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
+ }
} else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
-
// We can only sanely verify that MemoryDefs in the operand list all have
// the same class.
auto ReachableOperandPred = [&](const Use &U) {
- return ReachableBlocks.count(FirstMP->getIncomingBlock(U)) &&
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
isa<MemoryDef>(U);
};
@@ -1622,35 +2950,179 @@ void NewGVN::verifyMemoryCongruency() const {
"All MemoryPhi arguments should be in the same class");
}
}
+#endif
+}
+
+// Verify that the sparse propagation we did actually found the maximal fixpoint.
+// We do this by storing the value to class mapping, touching all instructions,
+// and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the set
+ // that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
+// Verify that for each store expression in the expression to class mapping,
+// only the latest one appears.
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+ // This is the only use of this, and it's not worth defining a complicated
+ // densemapinfo hash/equality function for it.
+ std::set<
+ std::pair<const Value *,
+ std::tuple<const Value *, const CongruenceClass *, Value *>>>
+ StoreExpressionSet;
+ for (const auto &KV : ExpressionToClass) {
+ if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+ // Make sure a version that will conflict with loads is not already there
+ auto Res = StoreExpressionSet.insert(
+ {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
+ SE->getStoredValue())});
+ bool Okay = Res.second;
+ // It's okay to have the same expression already in there if it is
+ // identical in nature.
+ // This can happen when the leader of the stored value changes over time.
+ if (!Okay)
+ Okay = (std::get<1>(Res.first->second) == KV.second) &&
+ (lookupOperandLeader(std::get<2>(Res.first->second)) ==
+ lookupOperandLeader(SE->getStoredValue()));
+ assert(Okay && "Stored expression conflict exists in expression table");
+ auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+ assert(ValueExpr && ValueExpr->equals(*SE) &&
+ "StoreExpression in ExpressionToClass is not latest "
+ "StoreExpression for value");
+ }
+ }
+#endif
+}
+
+// This is the main value numbering loop; it iterates over the initial touched
+// instruction set, propagating value numbers and marking things touched, until
+// the set of touched instructions is completely empty.
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts.
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (unsigned InstrNum : TouchedInstructions.set_bits()) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ const BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
}
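
The loop above is a classic sparse fixpoint: one bit per instruction, reset before processing so that self-touching re-queues work, and users marked touched whenever a value changes. A minimal standalone sketch with a toy one-directional dependency chain (not LLVM's SparseBitVector):

    #include <iostream>
    #include <vector>

    int main() {
      // Toy fixpoint: each slot's value depends on its left neighbor; touching
      // a slot re-evaluates it and, on change, touches its dependent.
      std::vector<int> Value = {3, 0, 0, 0};
      std::vector<bool> Touched = {true, true, true, true};
      unsigned Iterations = 0;
      auto Any = [&] {
        for (bool B : Touched)
          if (B)
            return true;
        return false;
      };
      while (Any()) {
        ++Iterations;
        for (size_t I = 0; I < Touched.size(); ++I) {
          if (!Touched[I])
            continue;
          Touched[I] = false; // reset before processing may re-touch us
          int New = (I == 0) ? Value[0] : Value[I - 1];
          if (New != Value[I]) {
            Value[I] = New;
            if (I + 1 < Touched.size())
              Touched[I + 1] = true; // mark users touched
          }
        }
      }
      std::cout << "settled after " << Iterations << " sweep(s)\n";
    }
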
// This is the main transformation entry point.
-bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
- TargetLibraryInfo *_TLI, AliasAnalysis *_AA,
- MemorySSA *_MSSA) {
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
bool Changed = false;
- DT = _DT;
- AC = _AC;
- TLI = _TLI;
- AA = _AA;
- MSSA = _MSSA;
- DL = &F.getParent()->getDataLayout();
+ NumFuncArgs = F.arg_size();
MSSAWalker = MSSA->getWalker();
+ SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
// Count number of instructions for sizing of hash tables, and come
// up with a global dfs numbering for instructions.
unsigned ICount = 1;
// Add an empty instruction to account for the fact that we start at 1
DFSToInstr.emplace_back(nullptr);
- // Note: We want RPO traversal of the blocks, which is not quite the same as
- // dominator tree order, particularly with regard whether backedges get
- // visited first or second, given a block with multiple successors.
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+ // same as dominator tree order, particularly with regard to whether
+ // backedges get visited first or second, given a block with multiple
+ // successors.
// If we visit in the wrong order, we will end up performing N times as many
// iterations.
 // The dominator tree does guarantee that, for a given dom tree node, its
 // parent must occur before it in the RPO ordering. Thus, we only need to sort
// the siblings.
- DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
ReversePostOrderTraversal<Function *> RPOT(&F);
unsigned Counter = 0;
for (auto &B : RPOT) {
@@ -1663,33 +3135,21 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
auto *Node = DT->getNode(B);
if (Node->getChildren().size() > 1)
std::sort(Node->begin(), Node->end(),
- [&RPOOrdering](const DomTreeNode *A, const DomTreeNode *B) {
+ [&](const DomTreeNode *A, const DomTreeNode *B) {
return RPOOrdering[A] < RPOOrdering[B];
});
}
// Now a standard depth first ordering of the domtree is equivalent to RPO.
- auto DFI = df_begin(DT->getRootNode());
- for (auto DFE = df_end(DT->getRootNode()); DFI != DFE; ++DFI) {
- BasicBlock *B = DFI->getBlock();
+ for (auto DTN : depth_first(DT->getRootNode())) {
+ BasicBlock *B = DTN->getBlock();
const auto &BlockRange = assignDFSNumbers(B, ICount);
BlockInstRange.insert({B, BlockRange});
ICount += BlockRange.second - BlockRange.first;
}
-
- // Handle forward unreachable blocks and figure out which blocks
- // have single preds.
- for (auto &B : F) {
- // Assign numbers to unreachable blocks.
- if (!DFI.nodeVisited(DT->getNode(&B))) {
- const auto &BlockRange = assignDFSNumbers(&B, ICount);
- BlockInstRange.insert({&B, BlockRange});
- ICount += BlockRange.second - BlockRange.first;
- }
- }
+ initializeCongruenceClasses(F);
TouchedInstructions.resize(ICount);
- DominatedInstRange.reserve(F.size());
// Ensure we don't end up resizing the expressionToClass map, as
// that can be quite expensive. At most, we have one expression per
// instruction.
@@ -1698,65 +3158,15 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
// Initialize the touched instructions to include the entry block.
const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
TouchedInstructions.set(InstRange.first, InstRange.second);
+ DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
ReachableBlocks.insert(&F.getEntryBlock());
- initializeCongruenceClasses(F);
-
- unsigned int Iterations = 0;
- // We start out in the entry block.
- BasicBlock *LastBlock = &F.getEntryBlock();
- while (TouchedInstructions.any()) {
- ++Iterations;
- // Walk through all the instructions in all the blocks in RPO.
- for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
- InstrNum = TouchedInstructions.find_next(InstrNum)) {
- assert(InstrNum != 0 && "Bit 0 should never be set, something touched an "
- "instruction not in the lookup table");
- Value *V = DFSToInstr[InstrNum];
- BasicBlock *CurrBlock = nullptr;
-
- if (auto *I = dyn_cast<Instruction>(V))
- CurrBlock = I->getParent();
- else if (auto *MP = dyn_cast<MemoryPhi>(V))
- CurrBlock = MP->getBlock();
- else
- llvm_unreachable("DFSToInstr gave us an unknown type of instruction");
-
- // If we hit a new block, do reachability processing.
- if (CurrBlock != LastBlock) {
- LastBlock = CurrBlock;
- bool BlockReachable = ReachableBlocks.count(CurrBlock);
- const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
-
- // If it's not reachable, erase any touched instructions and move on.
- if (!BlockReachable) {
- TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
- continue;
- }
- updateProcessedCount(CurrBlock);
- }
-
- if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
- valueNumberMemoryPhi(MP);
- } else if (auto *I = dyn_cast<Instruction>(V)) {
- valueNumberInstruction(I);
- } else {
- llvm_unreachable("Should have been a MemoryPhi or Instruction");
- }
- updateProcessedCount(V);
- // Reset after processing (because we may mark ourselves as touched when
- // we propagate equalities).
- TouchedInstructions.reset(InstrNum);
- }
- }
- NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
-#ifndef NDEBUG
+ iterateTouchedInstructions();
verifyMemoryCongruency();
-#endif
+ verifyIterationSettled(F);
+ verifyStoreExpressions();
+
Changed |= eliminateInstructions(F);
// Delete all instructions marked for deletion.
@@ -1764,7 +3174,8 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
if (!ToErase->use_empty())
ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
- ToErase->eraseFromParent();
+ if (ToErase->getParent())
+ ToErase->eraseFromParent();
}
// Delete all unreachable blocks.
@@ -1783,59 +3194,15 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
return Changed;
}
-bool NewGVN::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- return runGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<MemorySSAWrapperPass>().getMSSA());
-}
-
-PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
- NewGVN Impl;
-
- // Apparently the order in which we get these results matter for
- // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
- // the same order here, just in case.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed = Impl.runGVN(F, &DT, &AC, &TLI, &AA, &MSSA);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-// Return true if V is a value that will always be available (IE can
-// be placed anywhere) in the function. We don't do globals here
-// because they are often worse to put in place.
-// TODO: Separate cost from availability
-static bool alwaysAvailable(Value *V) {
- return isa<Constant>(V) || isa<Argument>(V);
-}
-
-// Get the basic block from an instruction/value.
-static BasicBlock *getBlockForValue(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V))
- return I->getParent();
- return nullptr;
-}
-
struct NewGVN::ValueDFS {
int DFSIn = 0;
int DFSOut = 0;
int LocalNum = 0;
- // Only one of these will be set.
- Value *Val = nullptr;
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
Use *U = nullptr;
-
bool operator<(const ValueDFS &Other) const {
// It's not enough that any given field be less than - we have sets
// of fields that need to be evaluated together to give a proper ordering.
@@ -1875,89 +3242,163 @@ struct NewGVN::ValueDFS {
// but .val and .u.
// It does not matter what order we replace these operands in.
// You will always end up with the same IR, and this is guaranteed.
- return std::tie(DFSIn, DFSOut, LocalNum, Val, U) <
- std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Val,
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
Other.U);
}
};
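
The (DFSIn, DFSOut) pair encodes dominance as interval containment, which is what makes sorted-order elimination with a scope stack work. A minimal standalone sketch of that stack discipline (toy items and names, all hypothetical):

    #include <iostream>
    #include <string>
    #include <vector>

    // A dominates B iff A.In <= B.In && B.Out <= A.Out. Walking members in
    // sorted order with a stack of open scopes yields a dominating leader
    // for each use.
    int main() {
      struct Item { int In, Out; std::string Name; };
      std::vector<Item> Sorted = {{1, 8, "def a"}, {2, 5, "use1"}, {6, 7, "use2"}};
      std::vector<Item> Stack;
      for (const auto &I : Sorted) {
        // Pop scopes that do not contain the current item.
        while (!Stack.empty() &&
               !(Stack.back().In <= I.In && I.Out <= Stack.back().Out))
          Stack.pop_back();
        if (I.Name.rfind("def", 0) == 0) {
          Stack.push_back(I); // a definition opens a new scope
        } else if (!Stack.empty()) {
          std::cout << I.Name << " replaced by leader '" << Stack.back().Name
                    << "'\n";
        }
      }
    }
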
-void NewGVN::convertDenseToDFSOrdered(
- CongruenceClass::MemberSet &Dense,
- SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+// This function converts the set of members for a congruence class from
+// values to sets of defs and uses with associated DFS info. The total number
+// of reachable uses for each value is stored in UseCounts, and instructions
+// that seem dead (have no non-dead uses) are stored in ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
for (auto D : Dense) {
// First add the value.
BasicBlock *BB = getBlockForValue(D);
// Constants are handled prior to ever calling this function, so
// we should only be left with instructions as members.
assert(BB && "Should have figured out a basic block for value");
- ValueDFS VD;
-
- std::pair<int, int> DFSPair = DFSDomMap[BB];
- assert(DFSPair.first != -1 && DFSPair.second != -1 && "Invalid DFS Pair");
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.Val = D;
- // If it's an instruction, use the real local dfs number.
- if (auto *I = dyn_cast<Instruction>(D))
- VD.LocalNum = InstrDFS[I];
- else
- llvm_unreachable("Should have been an instruction");
-
- DFSOrderedSet.emplace_back(VD);
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand if it's always
+ // available, otherwise the value operand itself. TODO: We could do
+ // dominance checks to find a dominating leader, but it's not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ Instruction *Def = cast<Instruction>(D);
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.push_back(VDDef);
+ // If there is a phi node equivalent, add it
+ if (auto *PN = RealToTemp.lookup(Def)) {
+ auto *PHIE =
+ dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
+ if (PHIE) {
+ VDDef.Def.setInt(false);
+ VDDef.Def.setPointer(PN);
+ VDDef.LocalNum = 0;
+ DFSOrderedSet.push_back(VDDef);
+ }
+ }
- // Now add the users.
- for (auto &U : D->uses()) {
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- ValueDFS VD;
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
// Put the phi node uses in the incoming block.
BasicBlock *IBlock;
if (auto *P = dyn_cast<PHINode>(I)) {
IBlock = P->getIncomingBlock(U);
// Make phi node users appear last in the incoming block
// they are from.
- VD.LocalNum = InstrDFS.size() + 1;
+ VDUse.LocalNum = InstrDFS.size() + 1;
} else {
- IBlock = I->getParent();
- VD.LocalNum = InstrDFS[I];
+ IBlock = getBlockForValue(I);
+ VDUse.LocalNum = InstrToDFSNum(I);
}
- std::pair<int, int> DFSPair = DFSDomMap[IBlock];
- VD.DFSIn = DFSPair.first;
- VD.DFSOut = DFSPair.second;
- VD.U = &U;
- DFSOrderedSet.emplace_back(VD);
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
}
}
+
+ // If there are no uses, it's probably dead (but it may have side-effects,
+ // so it is not definitely dead). Otherwise, store the number of uses so we
+ // can track whether it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
}
}
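
UseCounts and ProbablyDead interact as a simple reference count: eliminating a use decrements the count, and hitting zero nominates the def for deletion (a later use of it as a leader can resurrect it). A minimal standalone sketch of that bookkeeping (toy value names):

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>

    int main() {
      std::map<std::string, unsigned> UseCounts = {{"a", 2}};
      std::set<std::string> ProbablyDead = {"b"}; // had no reachable uses
      // Eliminating a use of 'a' decrements its count; reaching zero makes it
      // a deletion candidate, while gaining a use would resurrect it.
      auto DropUse = [&](const std::string &V) {
        if (--UseCounts[V] == 0)
          ProbablyDead.insert(V);
      };
      DropUse("a");
      DropUse("a");
      for (const auto &V : ProbablyDead)
        std::cout << V << " is probably dead\n";
    }
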
-static void patchReplacementInstruction(Instruction *I, Value *Repl) {
- // Patch the replacement so that it is not more restrictive than the value
- // being replaced.
- auto *Op = dyn_cast<BinaryOperator>(I);
- auto *ReplOp = dyn_cast<BinaryOperator>(Repl);
+// This function converts the set of members for a congruence class from
+// values to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
- if (Op && ReplOp)
- ReplOp->andIRFlags(Op);
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
- if (auto *ReplInst = dyn_cast<Instruction>(Repl)) {
- // FIXME: If both the original and replacement value are part of the
- // same control-flow region (meaning that the execution of one
- // guarentees the executation of the other), then we can combine the
- // noalias scopes here and do better than the general conservative
- // answer used in combineMetadata().
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
- // In general, GVN unifies expressions over different control-flow
- // regions, and so we need a conservative combination of the noalias
- // scopes.
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group};
- combineMetadata(ReplInst, I, KnownIDs);
+ LoadsAndStores.emplace_back(VD);
}
}
+static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group};
+ combineMetadata(ReplInst, I, KnownIDs);
+}
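
The andIRFlags call above exists because poison-generating flags must be intersected: the replacement may only promise what both the original and the leader promised. A minimal standalone sketch with flags modeled as plain bits (enum values hypothetical, not LLVM's):

    #include <cstdio>

    enum Flags : unsigned { NSW = 1u << 0, NUW = 1u << 1, Exact = 1u << 2 };

    int main() {
      unsigned Original = NSW;          // the value being replaced
      unsigned Replacement = NSW | NUW; // its congruent leader
      // Keep only guarantees both values share (analogous to andIRFlags());
      // a union would claim guarantees the original never made.
      Replacement &= Original;
      std::printf("merged flags: nsw=%d nuw=%d\n", (Replacement & NSW) != 0,
                  (Replacement & NUW) != 0);
    }
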
+
static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
patchReplacementInstruction(I, Repl);
I->replaceAllUsesWith(Repl);
@@ -1967,10 +3408,6 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumGVNBlocksDeleted;
- // Check to see if there are non-terminating instructions to delete.
- if (isa<TerminatorInst>(BB->begin()))
- return;
-
// Delete the instructions backwards, as it has a reduced likelihood of having
// to update as many def-use and use-def chains. Start after the terminator.
auto StartPoint = BB->rbegin();
@@ -1987,6 +3424,11 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
Inst.eraseFromParent();
++NumGVNInstrDeleted;
}
+ // Now insert something that simplifycfg will turn into an unreachable.
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
}
void NewGVN::markInstructionForDeletion(Instruction *I) {
@@ -2042,6 +3484,37 @@ private:
};
}
+// Given a value and a basic block we are trying to see if it is available in,
+// see if the value has a leader available in that block.
+Value *NewGVN::findPhiOfOpsLeader(const Expression *E,
+ const BasicBlock *BB) const {
+ // It would already be constant if we could make it constant
+ if (auto *CE = dyn_cast<ConstantExpression>(E))
+ return CE->getConstantValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return VE->getVariableValue();
+
+ auto *CC = ExpressionToClass.lookup(E);
+ if (!CC)
+ return nullptr;
+ if (alwaysAvailable(CC->getLeader()))
+ return CC->getLeader();
+
+ for (auto Member : *CC) {
+ auto *MemberInst = dyn_cast<Instruction>(Member);
+ // Anything that isn't an instruction is always available.
+ if (!MemberInst)
+ return Member;
+ // If we are looking for something in the same block as the member, it must
+ // be a leader because this function is looking for operands for a phi node.
+ if (MemberInst->getParent() == BB ||
+ DT->dominates(MemberInst->getParent(), BB)) {
+ return Member;
+ }
+ }
+ return nullptr;
+}
+
bool NewGVN::eliminateInstructions(Function &F) {
// This is a non-standard eliminator. The normal way to eliminate is
// to walk the dominator tree in order, keeping track of available
@@ -2072,73 +3545,91 @@ bool NewGVN::eliminateInstructions(Function &F) {
// DFS numbers are updated, we compute some ourselves.
DT->updateDFSNumbers();
- for (auto &B : F) {
- if (!ReachableBlocks.count(&B)) {
- for (const auto S : successors(&B)) {
- for (auto II = S->begin(); isa<PHINode>(II); ++II) {
- auto &Phi = cast<PHINode>(*II);
- DEBUG(dbgs() << "Replacing incoming value of " << *II << " for block "
- << getBlockName(&B)
- << " with undef due to it being unreachable\n");
- for (auto &Operand : Phi.incoming_values())
- if (Phi.getIncomingBlock(Operand) == &B)
- Operand.set(UndefValue::get(Phi.getType()));
- }
+ // Go through all of our phi nodes, and kill the arguments associated with
+ // unreachable edges.
+ auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI.incoming_values())
+ if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) {
+ DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block "
+ << getBlockName(PHI.getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
+ Operand.set(UndefValue::get(PHI.getType()));
}
+ };
+ SmallPtrSet<BasicBlock *, 8> BlocksWithPhis;
+ for (auto &B : F)
+ if ((!B.empty() && isa<PHINode>(*B.begin())) ||
+ (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end()))
+ BlocksWithPhis.insert(&B);
+ DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
+ for (auto KV : ReachableEdges)
+ ReachablePredCount[KV.getEnd()]++;
+ for (auto *BB : BlocksWithPhis)
+ // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in
+ // the block and subtract the pred count, but it's more complicated.
+ if (ReachablePredCount.lookup(BB) !=
+ unsigned(std::distance(pred_begin(BB), pred_end(BB)))) {
+ for (auto II = BB->begin(); isa<PHINode>(II); ++II) {
+ auto &PHI = cast<PHINode>(*II);
+ ReplaceUnreachablePHIArgs(PHI, BB);
+ }
+ for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) {
+ ReplaceUnreachablePHIArgs(*PHI, BB);
+ });
}
- DomTreeNode *Node = DT->getNode(&B);
- if (Node)
- DFSDomMap[&B] = {Node->getDFSNumIn(), Node->getDFSNumOut()};
- }
- for (CongruenceClass *CC : CongruenceClasses) {
- // FIXME: We should eventually be able to replace everything still
- // in the initial class with undef, as they should be unreachable.
- // Right now, initial still contains some things we skip value
- // numbering of (UNREACHABLE's, for example).
- if (CC == InitialClass || CC->Dead)
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (auto *CC : reverse(CongruenceClasses)) {
+ DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n");
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
continue;
- assert(CC->RepLeader && "We should have had a leader");
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+ for (auto M : *CC) {
+ auto *VTE = ValueToExpression.lookup(M);
+ if (VTE && isa<DeadExpression>(VTE))
+ markInstructionForDeletion(cast<Instruction>(M));
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+ }
+ continue;
+ }
+ assert(CC->getLeader() && "We should have had a leader");
// If this is a leader that is always available, and it's a
// constant or has no equivalences, just replace everything with
// it. We then update the congruence class with whatever members
// are left.
- if (alwaysAvailable(CC->RepLeader)) {
- SmallPtrSet<Value *, 4> MembersLeft;
- for (auto M : CC->Members) {
-
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
Value *Member = M;
-
// Void things have no uses we can replace.
- if (Member == CC->RepLeader || Member->getType()->isVoidTy()) {
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
MembersLeft.insert(Member);
continue;
}
-
- DEBUG(dbgs() << "Found replacement " << *(CC->RepLeader) << " for "
- << *Member << "\n");
- // Due to equality propagation, these may not always be
- // instructions, they may be real values. We don't really
- // care about trying to replace the non-instructions.
- if (auto *I = dyn_cast<Instruction>(Member)) {
- assert(CC->RepLeader != I &&
- "About to accidentally remove our leader");
- replaceInstruction(I, CC->RepLeader);
- AnythingReplaced = true;
-
- continue;
- } else {
- MembersLeft.insert(I);
- }
+ DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
+ << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
}
- CC->Members.swap(MembersLeft);
-
+ CC->swap(MembersLeft);
} else {
- DEBUG(dbgs() << "Eliminating in congruence class " << CC->ID << "\n");
// If this is a singleton, we can skip it.
- if (CC->Members.size() != 1) {
-
+ if (CC->size() != 1 || RealToTemp.lookup(Leader)) {
// This is a stack because equality replacement/etc may place
// constants in the middle of the member list, and we want to use
// those constant values in preference to the current leader, over
@@ -2147,23 +3638,34 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Convert the members to DFS ordered sets and then merge them.
SmallVector<ValueDFS, 8> DFSOrderedSet;
- convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
// Sort the whole thing.
std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
-
for (auto &VD : DFSOrderedSet) {
int MemberDFSIn = VD.DFSIn;
int MemberDFSOut = VD.DFSOut;
- Value *Member = VD.Val;
- Use *MemberUse = VD.U;
-
- if (Member) {
- // We ignore void things because we can't get a value from them.
- // FIXME: We could actually use this to kill dead stores that are
- // dominated by equivalent earlier stores.
- if (Member->getType()->isVoidTy())
- continue;
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
+ auto *DefInst = dyn_cast_or_null<Instruction>(Def);
+ if (DefInst && AllTempInstructions.count(DefInst)) {
+ auto *PN = cast<PHINode>(DefInst);
+
+ // If this is a value phi and that's the expression we used, insert it
+ // into the program and remove it from the temporary instruction list.
+ AllTempInstructions.erase(PN);
+ auto *DefBlock = getBlockForValue(Def);
+ DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
+ PN->insertBefore(&DefBlock->front());
+ Def = PN;
+ NumGVNPHIOfOpsEliminations++;
}
if (EliminationStack.empty()) {
@@ -2189,69 +3691,251 @@ bool NewGVN::eliminateInstructions(Function &F) {
// start using, we also push.
// Otherwise, we walk along, processing members who are
// dominated by this scope, and eliminate them.
- bool ShouldPush =
- Member && (EliminationStack.empty() || isa<Constant>(Member));
+ bool ShouldPush = Def && EliminationStack.empty();
bool OutOfScope =
!EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
if (OutOfScope || ShouldPush) {
// Sync to our current scope.
EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- ShouldPush |= Member && EliminationStack.empty();
+ bool ShouldPush = Def && EliminationStack.empty();
if (ShouldPush) {
- EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Defs; we only want to eliminate on their uses, but mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+ // guarantees that any side-effects that would have occurred (i.e.
+ // throwing, etc.) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, which may not be dominated by anything, and thus, are only
+ // easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
}
}
// If we get to this point, and the stack is empty we must have a use
- // with nothing we can use to eliminate it, just skip it.
+ // with nothing we can use to eliminate this use, so just skip it.
if (EliminationStack.empty())
continue;
- // Skip the Value's, we only want to eliminate on their uses.
- if (Member)
- continue;
- Value *Result = EliminationStack.back();
+ Value *DominatingLeader = EliminationStack.back();
+
+ auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
+ if (II && II->getIntrinsicID() == Intrinsic::ssa_copy)
+ DominatingLeader = II->getOperand(0);
// Don't replace our existing users with ourselves.
- if (MemberUse->get() == Result)
+ if (U->get() == DominatingLeader)
continue;
-
- DEBUG(dbgs() << "Found replacement " << *Result << " for "
- << *MemberUse->get() << " in " << *(MemberUse->getUser())
- << "\n");
+ DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
// If we replaced something in an instruction, handle the patching of
- // metadata.
- if (auto *ReplacedInst = dyn_cast<Instruction>(MemberUse->get()))
- patchReplacementInstruction(ReplacedInst, Result);
-
- assert(isa<Instruction>(MemberUse->getUser()));
- MemberUse->set(Result);
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ if (LeaderUseCount == 0 && II)
+ ProbablyDead.insert(II);
+ ++LeaderUseCount;
AnythingReplaced = true;
}
}
}
+ // At this point, anything still in the ProbablyDead set is actually dead
+ // if it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
// Cleanup the congruence class.
- SmallPtrSet<Value *, 4> MembersLeft;
- for (Value *Member : CC->Members) {
- if (Member->getType()->isVoidTy()) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
MembersLeft.insert(Member);
- continue;
- }
-
- if (auto *MemberInst = dyn_cast<Instruction>(Member)) {
- if (isInstructionTriviallyDead(MemberInst)) {
- // TODO: Don't mark loads of undefs.
- markInstructionForDeletion(MemberInst);
- continue;
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
}
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+ // Member is dominated by Leader, and thus dead.
+ DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
}
- MembersLeft.insert(Member);
}
- CC->Members.swap(MembersLeft);
}
-
return AnythingReplaced;
}
+
+// This function provides a global ranking of operations so that we can place them
+// in a canonical order. Note that rank alone is not necessarily enough for a
+// complete ordering, as constants all have the same rank. However, generally,
+// we will simplify an operation with all constants so that it doesn't matter
+// what order they appear in.
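+// Editor's sketch of the resulting order (values hypothetical): in a
+// function with two arguments, getRank(i32 7) == 0, getRank(undef) == 1,
+// getRank(constantexpr) == 2, getRank(%arg0) == 3, getRank(%arg1) == 4, and
+// an instruction with DFS number N gets 4 + 2 + N.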
+unsigned int NewGVN::getRank(const Value *V) const {
+ // Prefer constants to undef to anything else
+ // Undef is a constant, have to check it first.
+ // Prefer smaller constants to constantexprs
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+ // Need to shift the instruction DFS by number of arguments + 3 to account for
+ // the constant and argument ranking above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+// This function decides whether two values feeding a commutative operation
+// should have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we order by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
+
+namespace {
+class NewGVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN();
+}
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+char NewGVNLegacyPass::ID = 0;
+
+// createNewGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+ // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 1a7ddc9..1bfecea 100644
--- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -66,7 +66,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Add attribute "readnone" so that backend can use a native sqrt instruction
// for this call. Insert a FP compare instruction and a conditional branch
// at the end of CurrBB.
- Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
CurrBB.getTerminator()->eraseFromParent();
Builder.SetInsertPoint(&CurrBB);
Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
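// Editor's sketch of the rewritten IR (labels hypothetical):
//   %res = call double @sqrt(double %x)       ; fast path, marked readnone
//   %ok  = fcmp oeq double %res, %res         ; false only when %res is NaN
//   br i1 %ok, label %cont, label %call.slow  ; slow path re-calls sqrt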
@@ -98,14 +98,14 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
// Skip if function either has local linkage or is not a known library
// function.
- LibFunc::Func LibFunc;
+ LibFunc LF;
if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
- !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+ !TLI->getLibFunc(CalledFunc->getName(), LF))
continue;
- switch (LibFunc) {
- case LibFunc::sqrtf:
- case LibFunc::sqrt:
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
if (TTI->haveFastSqrt(Call->getType()) &&
optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
break;
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 65c814d..e235e5eb 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -106,11 +107,12 @@ XorOpnd::XorOpnd(Value *V) {
I->getOpcode() == Instruction::And)) {
Value *V0 = I->getOperand(0);
Value *V1 = I->getOperand(1);
- if (isa<ConstantInt>(V0))
+ const APInt *C;
+ if (match(V0, PatternMatch::m_APInt(C)))
std::swap(V0, V1);
- if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) {
- ConstPart = C->getValue();
+ if (match(V1, PatternMatch::m_APInt(C))) {
+ ConstPart = *C;
SymbolicPart = V0;
isOr = (I->getOpcode() == Instruction::Or);
return;
@@ -119,7 +121,7 @@ XorOpnd::XorOpnd(Value *V) {
// view the operand as "V | 0"
SymbolicPart = V;
- ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth());
+ ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
isOr = true;
}
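// Editor's illustration (hypothetical values): "or i32 %x, 12" decomposes
// into SymbolicPart = %x, ConstPart = 12, isOr = true; a bare %x is viewed
// as "%x | 0" with ConstPart = 0.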
@@ -955,8 +957,8 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
/// Scan backwards and forwards among values with the same rank as element i
/// to see if X exists. If X does not exist, return i. This is useful when
/// scanning for 'x' when we see '-x' because they both get the same rank.
-static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
- Value *X) {
+static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
+ unsigned i, Value *X) {
unsigned XRank = Ops[i].Rank;
unsigned e = Ops.size();
for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
@@ -982,7 +984,7 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i,
/// Emit a tree of add instructions, summing Ops together
/// and returning the result. Insert the tree before I.
static Value *EmitAddTreeOfValues(Instruction *I,
- SmallVectorImpl<WeakVH> &Ops){
+ SmallVectorImpl<WeakTrackingVH> &Ops) {
if (Ops.size() == 1) return Ops.back();
Value *V1 = Ops.back();
@@ -1069,8 +1071,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
///
/// Ops is the top-level list of add operands we're trying to factor.
static void FindSingleUseMultiplyFactors(Value *V,
- SmallVectorImpl<Value*> &Factors,
- const SmallVectorImpl<ValueEntry> &Ops) {
+ SmallVectorImpl<Value*> &Factors) {
BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
if (!BO) {
Factors.push_back(V);
@@ -1078,8 +1079,8 @@ static void FindSingleUseMultiplyFactors(Value *V,
}
// Otherwise, add the LHS and RHS to the list of factors.
- FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
- FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
}
/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
@@ -1135,20 +1136,19 @@ static Value *OptimizeAndOrXor(unsigned Opcode,
/// instruction. There are two special cases: 1) if the constant operand is 0,
/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
/// be returned.
-static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
const APInt &ConstOpnd) {
- if (ConstOpnd != 0) {
- if (!ConstOpnd.isAllOnesValue()) {
- LLVMContext &Ctx = Opnd->getType()->getContext();
- Instruction *I;
- I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd),
- "and.ra", InsertBefore);
- I->setDebugLoc(InsertBefore->getDebugLoc());
- return I;
- }
+ if (ConstOpnd.isNullValue())
+ return nullptr;
+
+ if (ConstOpnd.isAllOnesValue())
return Opnd;
- }
- return nullptr;
+
+ Instruction *I = BinaryOperator::CreateAnd(
+ Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
+ InsertBefore);
+ I->setDebugLoc(InsertBefore->getDebugLoc());
+ return I;
}
// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
@@ -1164,24 +1164,24 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
// = ((x | c1) ^ c1) ^ (c1 ^ c2)
// = (x & ~c1) ^ (c1 ^ c2)
// It is useful only when c1 == c2.
- if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) {
- if (!Opnd1->getValue()->hasOneUse())
- return false;
+ if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
+ return false;
- const APInt &C1 = Opnd1->getConstPart();
- if (C1 != ConstOpnd)
- return false;
+ if (!Opnd1->getValue()->hasOneUse())
+ return false;
- Value *X = Opnd1->getSymbolicPart();
- Res = createAndInstr(I, X, ~C1);
- // ConstOpnd was C2, now C1 ^ C2.
- ConstOpnd ^= C1;
+ const APInt &C1 = Opnd1->getConstPart();
+ if (C1 != ConstOpnd)
+ return false;
- if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
- RedoInsts.insert(T);
- return true;
- }
- return false;
+ Value *X = Opnd1->getSymbolicPart();
+ Res = createAndInstr(I, X, ~C1);
+ // ConstOpnd was C2, now C1 ^ C2.
+ ConstOpnd ^= C1;
+
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ return true;
}
@@ -1222,8 +1222,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt C3((~C1) ^ C2);
// Do not increase code size!
- if (C3 != 0 && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
@@ -1239,8 +1239,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
APInt C3 = C1 ^ C2;
// Do not increase code size
- if (C3 != 0 && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd != 0 ? 1 : 2;
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
if (NewInstNum > DeadInstNum)
return false;
}
@@ -1280,17 +1280,20 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
SmallVector<XorOpnd, 8> Opnds;
SmallVector<XorOpnd*, 8> OpndPtrs;
Type *Ty = Ops[0].Op->getType();
- APInt ConstOpnd(Ty->getIntegerBitWidth(), 0);
+ APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
// Step 1: Convert ValueEntry to XorOpnd
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
Value *V = Ops[i].Op;
- if (!isa<ConstantInt>(V)) {
+ const APInt *C;
+ // TODO: Support non-splat vectors.
+ if (match(V, PatternMatch::m_APInt(C))) {
+ ConstOpnd ^= *C;
+ } else {
XorOpnd O(V);
O.setSymbolicRank(getRank(O.getSymbolicPart()));
Opnds.push_back(O);
- } else
- ConstOpnd ^= cast<ConstantInt>(V)->getValue();
+ }
}
// NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
@@ -1328,7 +1331,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
Value *CV;
// Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
- if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ if (!ConstOpnd.isNullValue() &&
+ CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
Changed = true;
if (CV)
*CurrOpnd = XorOpnd(CV);
@@ -1370,17 +1374,17 @@ Value *ReassociatePass::OptimizeXor(Instruction *I,
ValueEntry VE(getRank(O.getValue()), O.getValue());
Ops.push_back(VE);
}
- if (ConstOpnd != 0) {
- Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd);
+ if (!ConstOpnd.isNullValue()) {
+ Value *C = ConstantInt::get(Ty, ConstOpnd);
ValueEntry VE(getRank(C), C);
Ops.push_back(VE);
}
- int Sz = Ops.size();
+ unsigned Sz = Ops.size();
if (Sz == 1)
return Ops.back().Op;
- else if (Sz == 0) {
- assert(ConstOpnd == 0);
- return ConstantInt::get(Ty->getContext(), ConstOpnd);
+ if (Sz == 0) {
+ assert(ConstOpnd.isNullValue());
+ return ConstantInt::get(Ty, ConstOpnd);
}
}
@@ -1499,7 +1503,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
// Compute all of the factors of this added value.
SmallVector<Value*, 8> Factors;
- FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+ FindSingleUseMultiplyFactors(BOp, Factors);
assert(Factors.size() > 1 && "Bad linearize!");
// Add one to FactorOccurrences for each unique factor in this op.
@@ -1560,7 +1564,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
: BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
- SmallVector<WeakVH, 4> NewMulOps;
+ SmallVector<WeakTrackingVH, 4> NewMulOps;
for (unsigned i = 0; i != Ops.size(); ++i) {
// Only try to remove factors from expressions we're allowed to.
BinaryOperator *BOp =
@@ -1583,7 +1587,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
}
// No need for extra uses anymore.
- delete DummyInst;
+ DummyInst->deleteValue();
unsigned NumAddedValues = NewMulOps.size();
Value *V = EmitAddTreeOfValues(I, NewMulOps);
@@ -1628,8 +1632,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
///
/// \returns Whether any factors have a power greater than one.
-bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
- SmallVectorImpl<Factor> &Factors) {
+static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
// FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
// Compute the sum of powers of simplifiable factors.
unsigned FactorPowerSum = 0;
@@ -1890,6 +1894,8 @@ void ReassociatePass::EraseInst(Instruction *I) {
Op = Op->user_back();
RedoInsts.insert(Op);
}
+
+ MadeChange = true;
}
// Canonicalize expressions of the following form:
@@ -1923,7 +1929,7 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
// User must be a binary operator with one or more uses.
Instruction *User = I->user_back();
- if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1))
+ if (!isa<BinaryOperator>(User) || User->use_empty())
return nullptr;
unsigned UserOpcode = User->getOpcode();
@@ -1935,6 +1941,12 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) {
if (!User->isCommutative() && User->getOperand(1) != I)
return nullptr;
+ // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
+ // resulting subtract will be broken up later. This can get us into an
+ // infinite loop during reassociation.
+ if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User))
+ return nullptr;
+
// Change the sign of the constant.
APFloat Val = CF->getValueAPF();
Val.changeSign();
@@ -2000,11 +2012,6 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
if (I->isCommutative())
canonicalizeOperands(I);
- // TODO: We should optimize vector Xor instructions, but they are
- // currently unsupported.
- if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor)
- return;
-
// Don't optimize floating point instructions that don't have unsafe algebra.
if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra())
return;
@@ -2147,7 +2154,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
if (I->getOpcode() == Instruction::Mul &&
cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
isa<ConstantInt>(Ops.back().Op) &&
- cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+ cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
ValueEntry Tmp = Ops.pop_back_val();
Ops.insert(Ops.begin(), Tmp);
} else if (I->getOpcode() == Instruction::FMul &&
@@ -2236,8 +2243,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
ValueRankMap.clear();
if (MadeChange) {
- // FIXME: This should also 'preserve the CFG'.
- auto PA = PreservedAnalyses();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
index 615029d..9629568 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -16,7 +16,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -25,6 +24,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 1de7420..f19d453 100644
--- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -7,20 +7,19 @@
//
//===----------------------------------------------------------------------===//
//
-// Rewrite an existing set of gc.statepoints such that they make potential
-// relocations performed by the garbage collector explicit in the IR.
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
//
//===----------------------------------------------------------------------===//
-#include "llvm/Pass.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Dominators.h"
@@ -28,15 +27,16 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -89,10 +89,10 @@ struct RewriteStatepointsForGC : public ModulePass {
Changed |= runOnFunction(F);
if (Changed) {
- // stripNonValidAttributes asserts that shouldRewriteStatepointsIn
+ // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn
// returns true for at least one function in the module. Since at least
// one function changed, we know that the precondition is satisfied.
- stripNonValidAttributes(M);
+ stripNonValidAttributesAndMetadata(M);
}
return Changed;
@@ -105,20 +105,24 @@ struct RewriteStatepointsForGC : public ModulePass {
AU.addRequired<TargetTransformInfoWrapperPass>();
}
- /// The IR fed into RewriteStatepointsForGC may have had attributes implying
- /// dereferenceability that are no longer valid/correct after
- /// RewriteStatepointsForGC has run. This is because semantically, after
+ /// The IR fed into RewriteStatepointsForGC may have had attributes and
+ /// metadata implying dereferenceability that are no longer valid/correct after
+ /// RewriteStatepointsForGC has run. This is because semantically, after
/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
- /// heap. stripNonValidAttributes (conservatively) restores correctness
- /// by erasing all attributes in the module that externally imply
- /// dereferenceability.
- /// Similar reasoning also applies to the noalias attributes. gc.statepoint
- /// can touch the entire heap including noalias objects.
- void stripNonValidAttributes(Module &M);
-
- // Helpers for stripNonValidAttributes
- void stripNonValidAttributesFromBody(Function &F);
+ /// heap. stripNonValidAttributesAndMetadata (conservatively) restores
+ /// correctness by erasing all attributes in the module that externally imply
+ /// dereferenceability. Similar reasoning also applies to the noalias
+ /// attributes and metadata. gc.statepoint can touch the entire heap including
+ /// noalias objects.
+ void stripNonValidAttributesAndMetadata(Module &M);
+
+ // Helpers for stripNonValidAttributesAndMetadata
+ void stripNonValidAttributesAndMetadataFromBody(Function &F);
void stripNonValidAttributesFromPrototype(Function &F);
+ // Certain metadata on instructions are invalid after running RS4GC.
+ // Optimizations that run after RS4GC can incorrectly use this metadata to
+ // optimize functions. We drop such metadata on the instruction.
+ void stripInvalidMetadataFromInstruction(Instruction &I);
};
} // namespace
@@ -365,6 +369,11 @@ findBaseDefiningValueOfVector(Value *I) {
// for particular shufflevector patterns.
return BaseDefiningValueResult(I, false);
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
// A PHI or Select is a base defining value. The outer findBasePointer
// algorithm is responsible for constructing a base value for this BDV.
assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -634,7 +643,7 @@ static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
// Values of type BDVState form a lattice, and this function implements the meet
// operation.
-static BDVState meetBDVState(BDVState LHS, BDVState RHS) {
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
BDVState Result = meetBDVStateImpl(LHS, RHS);
assert(Result == meetBDVStateImpl(RHS, LHS) &&
"Math is wrong: meet does not commute!");
@@ -1123,39 +1132,23 @@ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
// Create new attribute set containing only attributes which can be transferred
// from original call to the safepoint.
-static AttributeSet legalizeCallAttributes(AttributeSet AS) {
- AttributeSet Ret;
-
- for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
- unsigned Index = AS.getSlotIndex(Slot);
-
- if (Index == AttributeSet::ReturnIndex ||
- Index == AttributeSet::FunctionIndex) {
-
- for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {
-
- // Do not allow certain attributes - just skip them
- // Safepoint can not be read only or read none.
- if (Attr.hasAttribute(Attribute::ReadNone) ||
- Attr.hasAttribute(Attribute::ReadOnly))
- continue;
-
- // These attributes control the generation of the gc.statepoint call /
- // invoke itself; and once the gc.statepoint is in place, they're of no
- // use.
- if (isStatepointDirectiveAttr(Attr))
- continue;
-
- Ret = Ret.addAttributes(
- AS.getContext(), Index,
- AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));
- }
- }
-
- // Just skip parameter attributes for now
- }
-
- return Ret;
+static AttributeList legalizeCallAttributes(AttributeList AL) {
+ if (AL.isEmpty())
+ return AL;
+
+ // Remove the readonly, readnone, and statepoint function attributes.
+ AttrBuilder FnAttrs = AL.getFnAttributes();
+ FnAttrs.removeAttribute(Attribute::ReadNone);
+ FnAttrs.removeAttribute(Attribute::ReadOnly);
+ for (Attribute A : AL.getFnAttributes()) {
+ if (isStatepointDirectiveAttr(A))
+ FnAttrs.remove(A);
+ }
+
+ // Just skip parameter and return attributes for now
+ LLVMContext &Ctx = AL.getContext();
+ return AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ AttributeSet::get(Ctx, FnAttrs));
}
/// Helper function to place all gc relocates necessary for the given
@@ -1299,12 +1292,11 @@ static StringRef getDeoptLowering(CallSite CS) {
const char *DeoptLowering = "deopt-lowering";
if (CS.hasFnAttr(DeoptLowering)) {
// FIXME: CallSite has a *really* confusing interface around attributes
- // with values.
- const AttributeSet &CSAS = CS.getAttributes();
- if (CSAS.hasAttribute(AttributeSet::FunctionIndex,
- DeoptLowering))
- return CSAS.getAttribute(AttributeSet::FunctionIndex,
- DeoptLowering).getValueAsString();
+ // with values.
+ const AttributeList &CSAS = CS.getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
Function *F = CS.getCalledFunction();
assert(F && F->hasFnAttribute(DeoptLowering));
return F->getFnAttribute(DeoptLowering).getValueAsString();
@@ -1388,7 +1380,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
// Create the statepoint given all the arguments
Instruction *Token = nullptr;
- AttributeSet ReturnAttrs;
if (CS.isCall()) {
CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
CallInst *Call = Builder.CreateGCStatepointCall(
@@ -1399,12 +1390,10 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
Call->setCallingConv(ToReplace->getCallingConv());
// Currently we will fail on parameter attributes and on certain
- // function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
- // In case if we can handle this set of attributes - set up function attrs
- // directly on statepoint and return attrs later for gc_result intrinsic.
- Call->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ // function attributes. If we can handle this set of attributes, set up
+ // function attrs directly on the statepoint and return attrs later for the
+ // gc_result intrinsic.
+ Call->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
Token = Call;
@@ -1427,12 +1416,10 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
Invoke->setCallingConv(ToReplace->getCallingConv());
// Currently we will fail on parameter attributes and on certain
- // function attributes.
- AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
- // In case if we can handle this set of attributes - set up function attrs
- // directly on statepoint and return attrs later for gc_result intrinsic.
- Invoke->setAttributes(NewAttrs.getFnAttributes());
- ReturnAttrs = NewAttrs.getRetAttributes();
+ // function attributes. If we can handle this set of attributes, set up
+ // function attrs directly on the statepoint and return attrs later for the
+ // gc_result intrinsic.
+ Invoke->setAttributes(legalizeCallAttributes(ToReplace->getAttributes()));
Token = Invoke;
@@ -1478,7 +1465,9 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
StringRef Name =
CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
- GCResult->setAttributes(CS.getAttributes().getRetAttributes());
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ CS.getAttributes().getRetAttributes()));
// We cannot RAUW or delete CS.getInstruction() because it could be in the
// live set of some other safepoint, in which case that safepoint's
@@ -1615,8 +1604,10 @@ static void relocationViaAlloca(
// Emit alloca for "LiveValue" and record it in "allocaMap" and
// "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
auto emitAllocaFor = [&](Value *LiveValue) {
- AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
F.getEntryBlock().getFirstNonPHI());
AllocaMap[LiveValue] = Alloca;
PromotableAllocas.push_back(Alloca);
@@ -1873,7 +1864,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
"non noop cast is found during rematerialization");
Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
@@ -1963,7 +1954,7 @@ static void rematerializeLiveValues(CallSite CS,
// to identify the newly generated AlternateRootPhi (.base version of phi)
// and RootOfChain (the original phi node itself) are the same, so that we
// can rematerialize the gep and casts. This is a workaround for the
- // deficieny in the findBasePointer algorithm.
+ // deficiency in the findBasePointer algorithm.
if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
continue;
// Now that the phi nodes are proved to be the same, assert that
@@ -2003,7 +1994,7 @@ static void rematerializeLiveValues(CallSite CS,
Instruction *LastClonedValue = nullptr;
Instruction *LastValue = nullptr;
for (Instruction *Instr: ChainToBase) {
- // Only GEP's and casts are suported as we need to be careful to not
+ // Only GEP's and casts are supported as we need to be careful to not
// introduce any new uses of pointers not in the liveset.
// Note that it's fine to introduce new uses of pointers which were
// otherwise not used after this statepoint.
@@ -2107,9 +2098,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// live in the IR. We'll remove all of these when done.
SmallVector<CallInst *, 64> Holders;
- // Insert a dummy call with all of the arguments to the vm_state we'll need
- // for the actual safepoint insertion. This ensures reference arguments in
- // the deopt argument list are considered live through the safepoint (and
+ // Insert a dummy call with all of the deopt operands we'll need for the
+ // actual safepoint insertion as arguments. This ensures reference operands
+ // in the deopt argument list are considered live through the safepoint (and
// thus makes sure they get relocated.)
for (CallSite CS : ToUpdate) {
SmallVector<Value *, 64> DeoptValues;
@@ -2299,12 +2290,11 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
if (AH.getDereferenceableOrNullBytes(Index))
R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
AH.getDereferenceableOrNullBytes(Index)));
- if (AH.doesNotAlias(Index))
+ if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
R.addAttribute(Attribute::NoAlias);
if (!R.empty())
- AH.setAttributes(AH.getAttributes().removeAttributes(
- Ctx, Index, AttributeSet::get(Ctx, Index, R)));
+ AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
}
void
@@ -2313,19 +2303,51 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
for (Argument &A : F.args())
if (isa<PointerType>(A.getType()))
- RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);
+ RemoveNonValidAttrAtIndex(Ctx, F,
+ A.getArgNo() + AttributeList::FirstArgIndex);
if (isa<PointerType>(F.getReturnType()))
- RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
+}
+
+void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) {
+
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return;
+ // These are the metadata kinds that are still valid on loads and stores
+ // after RS4GC.
+ // The metadata implying dereferenceability and noalias are (conservatively)
+ // dropped. This is because semantically, after RewriteStatepointsForGC runs,
+ // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
+ // touch the entire heap including noalias objects. Note: The reasoning is
+ // the same as for stripping the analogous dereferenceability and noalias
+ // attributes.
+ // We also drop the invariant.load metadata on the load because that metadata
+ // implies the address operand to the load points to memory that is never
+ // changed once it became dereferenceable. This is no longer true after RS4GC.
+ // Similar reasoning applies to invariant.group metadata, which applies to
+ // loads within a group.
+ unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_nontemporal,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_type};
+
+ // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
+
}
-void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) {
if (F.empty())
return;
LLVMContext &Ctx = F.getContext();
MDBuilder Builder(Ctx);
+
for (Instruction &I : instructions(F)) {
if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) {
assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!");
@@ -2346,12 +2368,14 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
}
+ stripInvalidMetadataFromInstruction(I);
+
if (CallSite CS = CallSite(&I)) {
for (int i = 0, e = CS.arg_size(); i != e; i++)
if (isa<PointerType>(CS.getArgument(i)->getType()))
- RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
+ RemoveNonValidAttrAtIndex(Ctx, CS, i + AttributeList::FirstArgIndex);
if (isa<PointerType>(CS.getType()))
- RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
+ RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
}
}
}
@@ -2370,7 +2394,7 @@ static bool shouldRewriteStatepointsIn(Function &F) {
return false;
}
-void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
+void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) {
#ifndef NDEBUG
assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!");
#endif
@@ -2379,7 +2403,7 @@ void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) {
stripNonValidAttributesFromPrototype(F);
for (Function &F : M)
- stripNonValidAttributesFromBody(F);
+ stripNonValidAttributesAndMetadataFromBody(F);
}
bool RewriteStatepointsForGC::runOnFunction(Function &F) {
diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
index ede381c..4822cf7 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -140,6 +140,14 @@ public:
return nullptr;
}
+ /// getBlockAddress - If this is a constant with a BlockAddress value, return
+ /// it, otherwise return null.
+ BlockAddress *getBlockAddress() const {
+ if (isConstant())
+ return dyn_cast<BlockAddress>(getConstant());
+ return nullptr;
+ }
+
void markForcedConstant(Constant *V) {
assert(isUnknown() && "Can't force a defined value!");
Val.setInt(forcedconstant);
@@ -306,20 +314,14 @@ public:
return MRVFunctionsTracked;
}
- void markOverdefined(Value *V) {
- assert(!V->getType()->isStructTy() &&
- "structs should use markAnythingOverdefined");
- markOverdefined(ValueState[V], V);
- }
-
- /// markAnythingOverdefined - Mark the specified value overdefined. This
+ /// markOverdefined - Mark the specified value overdefined. This
/// works with both scalars and structs.
- void markAnythingOverdefined(Value *V) {
+ void markOverdefined(Value *V) {
if (auto *STy = dyn_cast<StructType>(V->getType()))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
markOverdefined(getStructValueState(V, i), V);
else
- markOverdefined(V);
+ markOverdefined(ValueState[V], V);
}
// isStructLatticeConstant - Return true if all the lattice values
@@ -513,12 +515,8 @@ private:
void visitCmpInst(CmpInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
- void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
- void visitFuncletPadInst(FuncletPadInst &FPI) {
- markAnythingOverdefined(&FPI);
- }
void visitCatchSwitchInst(CatchSwitchInst &CPI) {
- markAnythingOverdefined(&CPI);
+ markOverdefined(&CPI);
visitTerminatorInst(CPI);
}
@@ -537,17 +535,11 @@ private:
void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
- void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
- markAnythingOverdefined(&I);
- }
- void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
- void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
- void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); }
-
void visitInstruction(Instruction &I) {
- // If a new instruction is added to LLVM that we don't handle.
+ // All the instructions we don't do any special handling for just
+ // go to overdefined.
DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
- markAnythingOverdefined(&I); // Just in case
+ markOverdefined(&I);
}
};
@@ -602,14 +594,36 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
return;
}
- Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true;
+ Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
return;
}
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(&TI)) {
- // Just mark all destinations executable!
- Succs.assign(TI.getNumSuccessors(), true);
+ // If this is an indirect branch whose address is a blockaddress, we mark
+ // only the target block as executable.
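+ // Editor's illustration (hypothetical IR): given
+ //   indirectbr i8* blockaddress(@f, %target), [label %target, label %other]
+ // only the %target successor is marked feasible once the address lattice
+ // value resolves to that blockaddress.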
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+ if (!Addr) { // Overdefined or unknown address?
+ // All destinations are executable!
+ if (!IBRValue.isUnknown())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock* T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function ?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+ // have undefined behavior. It's OK to assume no successor is executable.
return;
}
@@ -659,13 +673,21 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
if (!CI)
return !SCValue.isUnknown();
- return SI->findCaseValue(CI).getCaseSuccessor() == To;
+ return SI->findCaseValue(CI)->getCaseSuccessor() == To;
}
- // Just mark all destinations executable!
- // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
- if (isa<IndirectBrInst>(TI))
- return true;
+ // If this is an indirect branch on a blockaddress, only the edge to the
+ // target block is feasible.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ LatticeVal IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = IBRValue.getBlockAddress();
+
+ if (!Addr)
+ return !IBRValue.isUnknown();
+
+ // At this point, the indirectbr is branching on a blockaddress.
+ return Addr->getBasicBlock() == To;
+ }
DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
llvm_unreachable("SCCP: Don't know how to handle this terminator!");
@@ -693,7 +715,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// If this PN returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (PN.getType()->isStructTy())
- return markAnythingOverdefined(&PN);
+ return markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
return; // Quick exit
@@ -803,7 +825,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
// If this returns a struct, mark all elements over defined, we don't track
// structs in structs.
if (EVI.getType()->isStructTy())
- return markAnythingOverdefined(&EVI);
+ return markOverdefined(&EVI);
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
@@ -828,7 +850,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
// If this has more than one index, we can't handle it, drive all results to
// undef.
if (IVI.getNumIndices() != 1)
- return markAnythingOverdefined(&IVI);
+ return markOverdefined(&IVI);
Value *Aggr = IVI.getAggregateOperand();
unsigned Idx = *IVI.idx_begin();
@@ -857,7 +879,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// If this select returns a struct, just mark the result overdefined.
// TODO: We could do a lot better than this if code actually uses this.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal CondValue = getValueState(I.getCondition());
if (CondValue.isUnknown())
@@ -910,9 +932,16 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Otherwise, one of our operands is overdefined. Try to produce something
// better than overdefined with some tricks.
-
- // If this is an AND or OR with 0 or -1, it doesn't matter that the other
- // operand is overdefined.
+ // If this is 0 / Y, it doesn't matter that the second operand is
+ // overdefined, and we can replace it with zero.
+ if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
+ if (V1State.isConstant() && V1State.getConstant()->isNullValue())
+ return markConstant(IV, &I, V1State.getConstant());
+
+ // If this is:
+ // -> AND/MUL with 0
+ // -> OR with -1
+ // it doesn't matter that the other operand is overdefined.
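+ // e.g., "and i32 %overdef, 0" is 0 and "or i32 %overdef, -1" is -1
+ // regardless of %overdef (editor's note; %overdef is hypothetical).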
if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
I.getOpcode() == Instruction::Or) {
LatticeVal *NonOverdefVal = nullptr;
@@ -934,7 +963,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
} else {
// X or -1 = -1
if (ConstantInt *CI = NonOverdefVal->getConstantInt())
- if (CI->isAllOnesValue())
+ if (CI->isMinusOne())
return markConstant(IV, &I, NonOverdefVal->getConstant());
}
}
@@ -1021,7 +1050,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
void SCCPSolver::visitLoadInst(LoadInst &I) {
// If this load is of a struct, just mark the result overdefined.
if (I.getType()->isStructTy())
- return markAnythingOverdefined(&I);
+ return markOverdefined(&I);
LatticeVal PtrVal = getValueState(I.getOperand(0));
if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
@@ -1078,7 +1107,7 @@ CallOverdefined:
// Otherwise, if we have a single return value case, and if the function is
// a declaration, maybe we can constant fold it.
if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
- canConstantFoldCallTo(F)) {
+ canConstantFoldCallTo(CS, F)) {
SmallVector<Constant*, 8> Operands;
for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
@@ -1098,7 +1127,7 @@ CallOverdefined:
// If we can constant fold this, mark the result of the call as a
// constant.
- if (Constant *C = ConstantFoldCall(F, Operands, TLI)) {
+ if (Constant *C = ConstantFoldCall(CS, F, Operands, TLI)) {
// call -> undef.
if (isa<UndefValue>(C))
return;
@@ -1107,7 +1136,7 @@ CallOverdefined:
}
// Otherwise, we don't know anything about this call, mark it overdefined.
- return markAnythingOverdefined(I);
+ return markOverdefined(I);
}
// If this is a local function that doesn't have its address taken, mark its
@@ -1483,6 +1512,31 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
}
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+ // Indirect branch with no successors? It's OK to assume it branches
+ // to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknown())
+ continue;
+
+ // If the input to SCCP is actually a branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
+ return true;
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Handle this by forcing the branch's input
+ // value to the block address of the first successor.
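+ // Editor's illustration (hypothetical IR): for "indirectbr i8* %p, [...]"
+ // where %p is still unknown, %p is forced to the blockaddress of the
+ // first listed successor, and that edge becomes executable on the next
+ // solver iteration.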
+ markForcedConstant(IBR->getAddress(),
+ BlockAddress::get(IBR->getSuccessor(0)));
+ return true;
+ }
+
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
continue;
@@ -1490,12 +1544,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// If the input to SCCP is actually switch on undef, fix the undef to
// the first constant.
if (isa<UndefValue>(SI->getCondition())) {
- SI->setCondition(SI->case_begin().getCaseValue());
- markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor());
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
return true;
}
- markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+ markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
return true;
}
}
@@ -1545,7 +1599,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
// Mark all arguments to the function as being overdefined.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
// Solve for constants.
bool ResolvedUndefs = true;
@@ -1715,8 +1769,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// arguments and return value aggressively, and can assume it is not called
// unless we see evidence to the contrary.
if (F.hasLocalLinkage()) {
- if (AddressIsTaken(&F))
+ if (F.hasAddressTaken()) {
AddressTakenFunctions.insert(&F);
+ }
else {
Solver.AddArgumentTrackedFunction(&F);
continue;
@@ -1728,14 +1783,15 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
// Assume nothing about the incoming arguments.
for (Argument &AI : F.args())
- Solver.markAnythingOverdefined(&AI);
+ Solver.markOverdefined(&AI);
}
// Loop over global variables. We inform the solver about any internal global
// variables that do not have their 'addresses taken'. If they don't have
// their addresses taken, we can propagate constants through them.
for (GlobalVariable &G : M.globals())
- if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G))
+ if (!G.isConstant() && G.hasLocalLinkage() &&
+ G.hasDefinitiveInitializer() && !AddressIsTaken(&G))
Solver.TrackValueOfGlobalVariable(&G);
// Solve for constants.
@@ -1760,15 +1816,11 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (F.isDeclaration())
continue;
- if (Solver.isBlockExecutable(&F.front())) {
+ if (Solver.isBlockExecutable(&F.front()))
for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
- ++AI) {
- if (AI->use_empty())
- continue;
- if (tryToReplaceWithConstant(Solver, &*AI))
+ ++AI)
+ if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI))
++IPNumArgsElimed;
- }
- }
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
if (!Solver.isBlockExecutable(&*BB)) {
@@ -1817,32 +1869,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
if (!I) continue;
bool Folded = ConstantFoldTerminator(I->getParent());
- if (!Folded) {
- // The constant folder may not have been able to fold the terminator
- // if this is a branch or switch on undef. Fold it manually as a
- // branch to the first successor.
-#ifndef NDEBUG
- if (auto *BI = dyn_cast<BranchInst>(I)) {
- assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
- "Branch should be foldable!");
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
- } else {
- llvm_unreachable("Didn't fold away reference to block!");
- }
-#endif
-
- // Make this an uncond branch to the first successor.
- TerminatorInst *TI = I->getParent()->getTerminator();
- BranchInst::Create(TI->getSuccessor(0), TI);
-
- // Remove entries in successor phi nodes to remove edges.
- for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
- TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
- // Remove the old terminator.
- TI->eraseFromParent();
- }
+ assert(Folded &&
+ "Expect TermInst on constantint or blockaddress to be folded");
+ (void) Folded;
}
// Finally, delete the basic block.
diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
index bfcb155..b9cee5b 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -25,6 +25,7 @@
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -325,7 +326,7 @@ private:
/// partition.
uint64_t BeginOffset, EndOffset;
- /// \brief The start end end iterators of this partition.
+ /// \brief The start and end iterators of this partition.
iterator SI, SJ;
/// \brief A collection of split slice tails overlapping the partition.
@@ -1251,7 +1252,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
if (!LI || !LI->isSimple())
return false;
- // Both operands to the select need to be dereferencable, either
+ // Both operands to the select need to be dereferenceable, either
// absolutely (e.g. allocas) or at this point because we can see other
// accesses to it.
if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI))
@@ -1636,8 +1637,17 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
cast<PointerType>(OldTy)->getPointerAddressSpace();
}
- if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
- return true;
+
+ // We can convert integers to integral pointers, but not to non-integral
+ // pointers.
+ if (OldTy->isIntegerTy())
+ return !DL.isNonIntegralPointerType(NewTy);
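+
+ // Editor's illustration (assumes a datalayout declaring addrspace(4)
+ // non-integral, e.g. "ni:4"): i64 -> i8* remains convertible, while
+ // i64 -> i8 addrspace(4)* is rejected, as is the reverse direction.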
+
+ // We can convert integral pointers to integers, but non-integral pointers
+ // need to remain pointers.
+ if (!DL.isNonIntegralPointerType(OldTy))
+ return NewTy->isIntegerTy();
+
return false;
}
@@ -1663,8 +1673,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
// See if we need inttoptr for this type pair. A cast involving both scalars
// and vectors requires and additional bitcast.
- if (OldTy->getScalarType()->isIntegerTy() &&
- NewTy->getScalarType()->isPointerTy()) {
+ if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
// Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
if (OldTy->isVectorTy() && !NewTy->isVectorTy())
return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
@@ -1680,8 +1689,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
// See if we need ptrtoint for this type pair. A cast involving both scalars
// and vectors requires and additional bitcast.
- if (OldTy->getScalarType()->isPointerTy() &&
- NewTy->getScalarType()->isIntegerTy()) {
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
// Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
if (OldTy->isVectorTy() && !NewTy->isVectorTy())
return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
@@ -1825,6 +1833,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Rank the remaining candidate vector types. This is easy because we know
// they're all integer vectors. We sort by ascending number of elements.
auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
"Cannot have vector types of different sizes!");
assert(RHSTy->getElementType()->isIntegerTy() &&
@@ -2185,8 +2194,8 @@ class llvm::sroa::AllocaSliceRewriter
Instruction *OldPtr;
// Track post-rewrite users which are PHI nodes and Selects.
- SmallPtrSetImpl<PHINode *> &PHIUsers;
- SmallPtrSetImpl<SelectInst *> &SelectUsers;
+ SmallSetVector<PHINode *, 8> &PHIUsers;
+ SmallSetVector<SelectInst *, 8> &SelectUsers;
// Utility IR builder, whose name prefix is setup for each visited use, and
// the insertion point is set to point to the user.
@@ -2198,8 +2207,8 @@ public:
uint64_t NewAllocaBeginOffset,
uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
VectorType *PromotableVecTy,
- SmallPtrSetImpl<PHINode *> &PHIUsers,
- SmallPtrSetImpl<SelectInst *> &SelectUsers)
+ SmallSetVector<PHINode *, 8> &PHIUsers,
+ SmallSetVector<SelectInst *, 8> &SelectUsers)
: DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
NewAllocaBeginOffset(NewAllocaBeginOffset),
NewAllocaEndOffset(NewAllocaEndOffset),
@@ -2294,7 +2303,8 @@ private:
#endif
return getAdjustedPtr(IRB, DL, &NewAI,
- APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+ APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
#ifndef NDEBUG
Twine(OldName) + "."
#else
@@ -2369,6 +2379,8 @@ private:
Value *OldOp = LI.getOperand(0);
assert(OldOp == OldPtr);
+ unsigned AS = LI.getPointerAddressSpace();
+
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();
const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
@@ -2386,7 +2398,22 @@ private:
LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
LI.isVolatile(), LI.getName());
if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+ // on the new load. This is true in some cases even when the loads are of
+ // different types, for example by mapping !nonnull metadata to !range
+ // metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+ // integer type becomes wider or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
V = NewLI;
// If this is an integer load past the end of the slice (which means the
@@ -2401,12 +2428,12 @@ private:
"endian_shift");
}
} else {
- Type *LTy = TargetTy->getPointerTo();
+ Type *LTy = TargetTy->getPointerTo(AS);
LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
getSliceAlign(TargetTy),
LI.isVolatile(), LI.getName());
if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
V = NewLI;
IsPtrAdjusted = true;
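
To make the !nonnull-to-!range mapping from the FIXME above concrete: in addrspace(0) the null pointer converts to the integer 0, so "nonnull" corresponds to the wrapped half-open range [1, 0), i.e. "nonzero". A hedged sketch of attaching that metadata by hand (the load variable is hypothetical):

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/MDBuilder.h"
    using namespace llvm;

    // NewIntLoad: a hypothetical i64 load standing in for a pointer load
    // that carried !nonnull. The wrapped range [1, 0) excludes only 0.
    static void markNonzero(LoadInst *NewIntLoad) {
      MDBuilder MDB(NewIntLoad->getContext());
      MDNode *Nonzero = MDB.createRange(APInt(64, 1), APInt(64, 0));
      NewIntLoad->setMetadata(LLVMContext::MD_range, Nonzero);
    }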
@@ -2429,12 +2456,12 @@ private:
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder =
- new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
Placeholder->replaceAllUsesWith(&LI);
- delete Placeholder;
+ Placeholder->deleteValue();
} else {
LI.replaceAllUsesWith(V);
}
@@ -2542,13 +2569,14 @@ private:
NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
SI.isVolatile());
} else {
- Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
SI.isVolatile());
}
NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access);
if (SI.isVolatile())
- NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope());
+ NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
@@ -3561,10 +3589,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
LoadInst *PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
@@ -3616,10 +3645,12 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
auto *PartPtrTy =
PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+ auto AS = SI->getPointerAddressSpace();
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access);
DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
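
The recurring APInt change in these hunks is about width: the offset integer must be as wide as a pointer in the pointer's own address space, which need not match address space 0. A minimal sketch (helper name illustrative):

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/DataLayout.h"
    using namespace llvm;

    // With e.g. "p1:32:32" in the datalayout, addrspace(1) offsets are
    // 32-bit even when default (addrspace 0) pointers are 64-bit.
    static APInt makeOffset(const DataLayout &DL, Type *PtrTy, uint64_t Off) {
      unsigned AS = PtrTy->getPointerAddressSpace();
      return APInt(DL.getPointerSizeInBits(AS), Off);
    }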
@@ -3688,7 +3719,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
int Idx = 0, Size = Offsets.Splits.size();
for (;;) {
auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
// Either lookup a split load or create one.
LoadInst *PLoad;
@@ -3696,20 +3728,23 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
PLoad = (*SplitLoads)[Idx];
} else {
IRB.SetInsertPoint(LI);
+ auto AS = LI->getPointerAddressSpace();
PLoad = IRB.CreateAlignedLoad(
getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, LoadBasePtr->getName() + "."),
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false,
LI->getName());
}
// And store this partition.
IRB.SetInsertPoint(SI);
+ auto AS = SI->getPointerAddressSpace();
StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getPointerSizeInBits(), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getPointerSizeInBits(AS), PartOffset),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false);
// Now build a new slice for the alloca.
@@ -3857,7 +3892,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (Alignment <= DL.getABITypeAlignment(SliceTy))
Alignment = 0;
NewAI = new AllocaInst(
- SliceTy, nullptr, Alignment,
+ SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
++NumNewAllocas;
}
@@ -3871,8 +3906,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// fact scheduled for promotion.
unsigned PPWOldSize = PostPromotionWorklist.size();
unsigned NumUses = 0;
- SmallPtrSet<PHINode *, 8> PHIUsers;
- SmallPtrSet<SelectInst *, 8> SelectUsers;
+ SmallSetVector<PHINode *, 8> PHIUsers;
+ SmallSetVector<SelectInst *, 8> SelectUsers;
AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
P.endOffset(), IsIntegerPromotable, VecTy,
@@ -3888,24 +3923,20 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
}
NumAllocaPartitionUses += NumUses;
- MaxUsesPerAllocaPartition =
- std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
// Now that we've processed all the slices in the new partition, check if any
// PHIs or Selects would block promotion.
- for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(),
- E = PHIUsers.end();
- I != E; ++I)
- if (!isSafePHIToSpeculate(**I)) {
+ for (PHINode *PHI : PHIUsers)
+ if (!isSafePHIToSpeculate(*PHI)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
break;
}
- for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(),
- E = SelectUsers.end();
- I != E; ++I)
- if (!isSafeSelectToSpeculate(**I)) {
+
+ for (SelectInst *Sel : SelectUsers)
+ if (!isSafeSelectToSpeculate(*Sel)) {
Promotable = false;
PHIUsers.clear();
SelectUsers.clear();
@@ -4009,8 +4040,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
}
NumAllocaPartitions += NumPartitions;
- MaxPartitionsPerAlloca =
- std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
// Migrate debug information from the old alloca to the new alloca(s)
// and the individual partitions.
@@ -4184,7 +4214,7 @@ bool SROA::promoteAllocas(Function &F) {
NumPromoted += PromotableAllocas.size();
DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
PromotableAllocas.clear();
return true;
}
@@ -4234,9 +4264,8 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
if (!Changed)
return PreservedAnalyses::all();
- // FIXME: Even when promoting allocas we should preserve some abstract set of
- // CFG-specific analyses.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
return PA;
}
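
preserveSet<CFGAnalyses>() is the new-pass-manager way to say "values may change, the CFG does not"; a hedged sketch of the common return shape (free-standing helper, illustrative only):

    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    // Analyses that only depend on the CFG (dominators, loop info, ...)
    // stay valid; everything else is invalidated once changes were made.
    static PreservedAnalyses cfgOnlyPreserved(bool Changed) {
      if (!Changed)
        return PreservedAnalyses::all();
      PreservedAnalyses PA;
      PA.preserveSet<CFGAnalyses>();
      return PA;
    }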
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
index afe7483..ce6f93e 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -20,11 +20,12 @@
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
using namespace llvm;
@@ -43,13 +44,15 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeDSELegacyPassPass(Registry);
initializeGuardWideningLegacyPassPass(Registry);
initializeGVNLegacyPassPass(Registry);
- initializeNewGVNPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
initializeEarlyCSELegacyPassPass(Registry);
initializeEarlyCSEMemSSALegacyPassPass(Registry);
initializeGVNHoistLegacyPassPass(Registry);
+ initializeGVNSinkLegacyPassPass(Registry);
initializeFlattenCFGPassPass(Registry);
initializeInductiveRangeCheckEliminationPass(Registry);
initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
@@ -58,6 +61,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopAccessLegacyAnalysisPass(Registry);
initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
initializeLoopRotateLegacyPassPass(Registry);
initializeLoopStrengthReducePass(Registry);
initializeLoopRerollPass(Registry);
@@ -79,13 +83,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeIPSCCPLegacyPassPass(Registry);
initializeSROALegacyPassPass(Registry);
initializeCFGSimplifyPassPass(Registry);
+ initializeLateCFGSimplifyPassPass(Registry);
initializeStructurizeCFGPass(Registry);
+ initializeSimpleLoopUnswitchLegacyPassPass(Registry);
initializeSinkingLegacyPassPass(Registry);
initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPPass(Registry);
initializeSpeculativeExecutionLegacyPassPass(Registry);
initializeStraightLineStrengthReducePass(Registry);
- initializeLoadCombinePass(Registry);
initializePlaceBackedgeSafepointsImplPass(Registry);
initializePlaceSafepointsPass(Registry);
initializeFloat2IntLegacyPassPass(Registry);
@@ -115,6 +120,10 @@ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
}
+void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLateCFGSimplificationPass());
+}
+
void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createDeadStoreEliminationPass());
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 39969e2..d11855f 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -14,12 +14,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -520,12 +520,25 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
unsigned NumElems = VT->getNumElements();
unsigned NumIndices = GEPI.getNumIndices();
- Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+ // The base pointer might be scalar even if it's a vector GEP. In those cases,
+ // splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
SmallVector<Scatterer, 8> Ops;
Ops.resize(NumIndices);
- for (unsigned I = 0; I < NumIndices; ++I)
- Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1));
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In those cases,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
ValueVector Res;
Res.resize(NumElems);
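
The fix generalizes to any scalar operand feeding a vector GEP: broadcast it first so per-lane scattering always sees vectors. A hedged sketch of the broadcast step (helper name illustrative):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // CreateVectorSplat emits an insertelement into undef plus a
    // zero-mask shufflevector: one copy of Op per lane.
    static Value *splatIfScalar(IRBuilder<> &B, Value *Op, unsigned NumElems) {
      if (Op->getType()->isVectorTy())
        return Op;
      return B.CreateVectorSplat(NumElems, Op);
    }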
diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 4d59453..84675f4 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -156,27 +156,27 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -1138,7 +1138,7 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
// Add I to DominatingExprs if it's an add/sub that can't sign overflow.
if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
- if (isKnownNotFullPoison(I)) {
+ if (programUndefinedIfFullPoison(I)) {
const SCEV *Key =
SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
DominatingExprs[Key].push_back(I);
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
new file mode 100644
index 0000000..aaab585
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -0,0 +1,808 @@
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+#define DEBUG_TYPE "simple-loop-unswitch"
+
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+
+static void replaceLoopUsesWithConstant(Loop &L, Value &LIC,
+ Constant &Replacement) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // Replace uses of LIC in the loop with the given constant.
+ for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI || !L.contains(UserI))
+ continue;
+
+ // Replace this use within the loop body.
+ *U = &Replacement;
+ }
+}
+
+/// Update the dominator tree after removing one exiting predecessor of a loop
+/// exit block.
+static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L,
+ DominatorTree &DT) {
+ assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) &&
+ "Cannot have empty predecessors of the loop exit block if we split "
+ "off a block to unswitch!");
+
+ BasicBlock *IDom = *pred_begin(LoopExitBB);
+ // Walk all of the other predecessors finding the nearest common dominator
+ // until all predecessors are covered or we reach the loop header. The loop
+ // header necessarily dominates all loop exit blocks in loop simplified form
+ // so we can early-exit the moment we hit that block.
+ for (auto PI = std::next(pred_begin(LoopExitBB)), PE = pred_end(LoopExitBB);
+ PI != PE && IDom != L.getHeader(); ++PI)
+ IDom = DT.findNearestCommonDominator(IDom, *PI);
+
+ DT.changeImmediateDominator(LoopExitBB, IDom);
+}
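+
The predecessor walk above is a fold over findNearestCommonDominator; a hedged standalone sketch of the same reduction, without the loop-header early exit:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/Dominators.h"
    using namespace llvm;

    // Reduce a non-empty set of blocks to the nearest block dominating all
    // of them -- the new immediate dominator of the loop exit.
    static BasicBlock *commonDominator(DominatorTree &DT,
                                       ArrayRef<BasicBlock *> BBs) {
      BasicBlock *IDom = BBs.front();
      for (BasicBlock *BB : BBs.drop_front())
        IDom = DT.findNearestCommonDominator(IDom, BB);
      return IDom;
    }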
+
+/// Update the dominator tree after unswitching a particular former exit block.
+///
+/// This handles the full update of the dominator tree after hoisting a block
+/// that previously was an exit block (or split off of an exit block) up to be
+/// reached from the new immediate dominator of the preheader.
+///
+/// The common case is simple -- we just move the unswitched block to have an
+/// immediate dominator of the old preheader. But in complex cases, there may
+/// be other blocks reachable from the unswitched block that are immediately
+/// dominated by some node between the unswitched one and the old preheader.
+/// All of these also need to be hoisted in the dominator tree. We also want to
+/// minimize queries to the dominator tree because each step of this
+/// invalidates any DFS numbers that would make queries fast.
+static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH,
+ DominatorTree &DT) {
+ DomTreeNode *OldPHNode = DT[OldPH];
+ DomTreeNode *UnswitchedNode = DT[UnswitchedBB];
+ // If the dominator tree has already been updated for this unswitched node,
+ // we're done. This makes it easier to use this routine if there are multiple
+ // paths to the same unswitched destination.
+ if (UnswitchedNode->getIDom() == OldPHNode)
+ return;
+
+ // First collect the domtree nodes that we are hoisting over. These are the
+ // set of nodes which may have children that need to be hoisted as well.
+ SmallPtrSet<DomTreeNode *, 4> DomChain;
+ for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode;
+ IDom = IDom->getIDom())
+ DomChain.insert(IDom);
+
+ // The unswitched block ends up immediately dominated by the old preheader --
+ // regardless of whether it is the loop exit block or split off of the loop
+ // exit block.
+ DT.changeImmediateDominator(UnswitchedNode, OldPHNode);
+
+ // For everything that moves up the dominator tree, we need to examine the
+ // dominator frontier to see if it additionally should move up the dominator
+ // tree. This lambda appends the dominator frontier for a node on the
+ // worklist.
+ //
+ // Note that we don't currently use the IDFCalculator here for two reasons:
+ // 1) It computes dominator tree levels for the entire function on each run
+ // of 'compute'. While this isn't terrible, given that we expect to update
+ // relatively small subtrees of the domtree, it isn't necessarily the right
+ // tradeoff.
+ // 2) The interface doesn't fit this usage well. It doesn't operate in an
+ // append-only fashion, and it builds several sets that we don't need.
+ //
+ // FIXME: Neither of these issues are a big deal and could be addressed with
+ // some amount of refactoring of IDFCalculator. That would allow us to share
+ // the core logic here (which is solving the same core problem).
+ SmallSetVector<BasicBlock *, 4> Worklist;
+ SmallVector<DomTreeNode *, 4> DomNodes;
+ SmallPtrSet<BasicBlock *, 4> DomSet;
+ auto AppendDomFrontier = [&](DomTreeNode *Node) {
+ assert(DomNodes.empty() && "Must start with no dominator nodes.");
+ assert(DomSet.empty() && "Must start with an empty dominator set.");
+
+ // First flatten this subtree into sequence of nodes by doing a pre-order
+ // walk.
+ DomNodes.push_back(Node);
+ // We intentionally re-evaluate the size as each node can add new children.
+ // Because this is a tree walk, this cannot add any duplicates.
+ for (int i = 0; i < (int)DomNodes.size(); ++i)
+ DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end());
+
+ // Now create a set of the basic blocks so we can quickly test for
+ // dominated successors. We could in theory use the DFS numbers of the
+ // dominator tree for this, but we want this to remain predictably fast
+ // even while we mutate the dominator tree in ways that would invalidate
+ // the DFS numbering.
+ for (DomTreeNode *InnerN : DomNodes)
+ DomSet.insert(InnerN->getBlock());
+
+ // Now re-walk the nodes, appending every successor of every node that isn't
+ // in the set. Note that we don't append the node itself, even though, if it
+ // appears as a successor, it does not strictly dominate itself and would
+ // thus be part of the dominance frontier. The reason we don't append it is that
+ // the node passed in came *from* the worklist and so it has already been
+ // processed.
+ for (DomTreeNode *InnerN : DomNodes)
+ for (BasicBlock *SuccBB : successors(InnerN->getBlock()))
+ if (!DomSet.count(SuccBB))
+ Worklist.insert(SuccBB);
+
+ DomNodes.clear();
+ DomSet.clear();
+ };
+
+ // Append the initial dom frontier nodes.
+ AppendDomFrontier(UnswitchedNode);
+
+ // Walk the worklist. We grow the list in the loop and so must recompute size.
+ for (int i = 0; i < (int)Worklist.size(); ++i) {
+ auto *BB = Worklist[i];
+
+ DomTreeNode *Node = DT[BB];
+ assert(!DomChain.count(Node) &&
+ "Cannot be dominated by a block you can reach!");
+
+ // If this block had an immediate dominator somewhere in the chain
+ // we hoisted over, then its position in the domtree needs to move as it is
+ // reachable from a node hoisted over this chain.
+ if (!DomChain.count(Node->getIDom()))
+ continue;
+
+ DT.changeImmediateDominator(Node, OldPHNode);
+
+ // Now add this node's dominator frontier to the worklist as well.
+ AppendDomFrontier(Node);
+ }
+}
+
+/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
+/// incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (Instruction &I : UnswitchedBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but expect to typically only have one operand here.
+ for (auto i : seq<int>(0, PN->getNumOperands())) {
+ assert(PN->getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN->setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ break;
+
+ auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2,
+ PN->getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this weird loop manually so that we
+ // create the same number of new incoming edges in the new PHI as we expect
+ // each case-based edge to be included in the unswitched switch in some
+ // cases.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN->getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN->removeIncomingValue(i);
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, &ExitBB);
+ }
+}
+
+/// Unswitch a trivial branch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the branch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and one of the successors is a loop exit. This
+/// allows us to unswitch without duplicating the loop, making it trivial.
+///
+/// If this routine fails to unswitch the branch it returns false.
+///
+/// If the branch can be unswitched, this routine splits the preheader and
+/// hoists the branch above that split. Preserves loop simplified form
+/// (splitting the exit block as necessary). It simplifies the branch within
+/// the loop to an unconditional branch but doesn't remove it entirely. Further
+/// cleanup can be done with some simplify-cfg like pass.
+static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
+ LoopInfo &LI) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+
+ Value *LoopCond = BI.getCondition();
+
+ // Need a trivial loop condition to unswitch.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext());
+ ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext());
+ int LoopExitSuccIdx = 0;
+ auto *LoopExitBB = BI.getSuccessor(0);
+ if (!ExitBlockSet.count(LoopExitBB)) {
+ std::swap(CondVal, Replacement);
+ LoopExitSuccIdx = 1;
+ LoopExitBB = BI.getSuccessor(1);
+ if (!ExitBlockSet.count(LoopExitBB))
+ return false;
+ }
+ auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
+ assert(L.contains(ContinueBB) &&
+ "Cannot have both successors exit and still be in the loop!");
+
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal
+ << " == " << LoopCond << "\n");
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we are
+ // unswitching. We need to split this if there are other loop predecessors.
+ // Because the loop is in simplified form, *any* other predecessor is enough.
+ BasicBlock *UnswitchedBB;
+ if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) {
+ (void)PredBB;
+ assert(PredBB == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
+ UnswitchedBB = LoopExitBB;
+ } else {
+ UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI);
+ }
+
+ // Now splice the branch into the old preheader so that it gates reaching the
+ // new preheader, and re-point its successors.
+ OldPH->getInstList().splice(std::prev(OldPH->end()),
+ BI.getParent()->getInstList(), BI);
+ OldPH->getTerminator()->eraseFromParent();
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH);
+
+ // Now we need to update the dominator tree.
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ // But if we split something off of the loop exit block then we also removed
+ // one of the predecessors for the loop exit block and may need to update its
+ // idom.
+ if (UnswitchedBB != LoopExitBB)
+ updateLoopExitIDom(LoopExitBB, L, DT);
+
+ // Since this is an i1 condition we can also trivially replace uses of it
+ // within the loop with a constant.
+ replaceLoopUsesWithConstant(L, *LoopCond, *Replacement);
+
+ ++NumTrivial;
+ ++NumBranches;
+ return true;
+}
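+
In source-level terms, a trivial unswitch moves a loop-invariant exit test from inside the loop to the old preheader; a hedged C++ analogue of the before/after shapes (invariant() and body() are hypothetical stand-ins):

    static bool invariant() { return true; } // hypothetical invariant test
    static void body() {}                    // hypothetical loop body

    // Before: the test re-runs on every iteration.
    static void beforeUnswitch() {
      for (;;) {
        if (invariant())
          break;  // the exiting successor
        body();
      }
    }

    // After: the hoisted test runs once; the in-loop branch became
    // unconditional and uses of the condition were replaced by a constant.
    static void afterUnswitch() {
      if (!invariant())
        for (;;)
          body();
    }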
+
+/// Unswitch a trivial switch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the switch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and that at least one of the successors is a loop
+/// exit. This allows us to unswitch without duplicating the loop, making it
+/// trivial.
+///
+/// If this routine fails to unswitch the switch it returns false.
+///
+/// If the switch can be unswitched, this routine splits the preheader and
+/// copies the switch above that split. If the default case is one of the
+/// exiting cases, it copies the non-exiting cases and points them at the new
+/// preheader. If the default case is not exiting, it copies the exiting cases
+/// and points the default at the preheader. It preserves loop simplified form
+/// (splitting the exit blocks as necessary). It simplifies the switch within
+/// the loop by removing now-dead cases. If the default case is one of those
+/// unswitched, it replaces its destination with a new basic block containing
+/// only unreachable. Such basic blocks, while technically loop exits, are not
+/// considered for unswitching so this is a stable transform and the same
+/// switch will not be revisited. If after unswitching there is only a single
+/// in-loop successor, the switch is further simplified to an unconditional
+/// branch. Still more cleanup can be done with some simplify-cfg like pass.
+static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
+ LoopInfo &LI) {
+ DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ Value *LoopCond = SI.getCondition();
+
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ auto *ParentBB = SI.getParent();
+
+ // FIXME: We should compute this once at the start and update it!
+ SmallVector<BasicBlock *, 16> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+
+ SmallVector<int, 4> ExitCaseIndices;
+ for (auto Case : SI.cases()) {
+ auto *SuccBB = Case.getCaseSuccessor();
+ if (ExitBlockSet.count(SuccBB) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
+ ExitCaseIndices.push_back(Case.getCaseIndex());
+ }
+ BasicBlock *DefaultExitBB = nullptr;
+ if (ExitBlockSet.count(SI.getDefaultDest()) &&
+ areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
+ !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator()))
+ DefaultExitBB = SI.getDefaultDest();
+ else if (ExitCaseIndices.empty())
+ return false;
+
+ DEBUG(dbgs() << " unswitching trivial cases...\n");
+
+ SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases;
+ ExitCases.reserve(ExitCaseIndices.size());
+ // We walk the case indices backwards so that we remove the last case first
+ // and don't disrupt the earlier indices.
+ for (unsigned Index : reverse(ExitCaseIndices)) {
+ auto CaseI = SI.case_begin() + Index;
+ // Save the value of this case.
+ ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()});
+ // Delete the unswitched cases.
+ SI.removeCase(CaseI);
+ }
+
+ // Check if after this all of the remaining cases point at the same
+ // successor.
+ BasicBlock *CommonSuccBB = nullptr;
+ if (SI.getNumCases() > 0 &&
+ std::all_of(std::next(SI.case_begin()), SI.case_end(),
+ [&SI](const SwitchInst::CaseHandle &Case) {
+ return Case.getCaseSuccessor() ==
+ SI.case_begin()->getCaseSuccessor();
+ }))
+ CommonSuccBB = SI.case_begin()->getCaseSuccessor();
+
+ if (DefaultExitBB) {
+ // We can't remove the default edge so replace it with an edge to either
+ // the single common remaining successor (if we have one) or an unreachable
+ // block.
+ if (CommonSuccBB) {
+ SI.setDefaultDest(CommonSuccBB);
+ } else {
+ BasicBlock *UnreachableBB = BasicBlock::Create(
+ ParentBB->getContext(),
+ Twine(ParentBB->getName()) + ".unreachable_default",
+ ParentBB->getParent());
+ new UnreachableInst(ParentBB->getContext(), UnreachableBB);
+ SI.setDefaultDest(UnreachableBB);
+ DT.addNewBlock(UnreachableBB, ParentBB);
+ }
+ } else {
+ // If we're not unswitching the default, we need it to match any cases to
+ // have a common successor or if we have no cases it is the common
+ // successor.
+ if (SI.getNumCases() == 0)
+ CommonSuccBB = SI.getDefaultDest();
+ else if (SI.getDefaultDest() != CommonSuccBB)
+ CommonSuccBB = nullptr;
+ }
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the switch.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI);
+ OldPH->getTerminator()->eraseFromParent();
+
+ // Now add the unswitched switch.
+ auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
+
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
+ SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
+ // Handle the default exit if necessary.
+ // FIXME: It'd be great if we could merge this with the loop below but LLVM's
+ // ranges aren't quite powerful enough yet.
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(DefaultExitBB, L, DT);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
+ }
+ // Note that we must use a reference in the for loop so that we update the
+ // container.
+ for (auto &CasePair : reverse(ExitCases)) {
+ // Grab a reference to the exit block in the pair so that we can update it.
+ BasicBlock *ExitBB = CasePair.second;
+
+ // If this case is the last edge into the exit block, we can simply reuse it
+ // as it will no longer be a loop exit. No mapping necessary.
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
+ continue;
+ }
+
+ // Otherwise we need to split the exit block so that we retain an exit
+ // block from the loop and a target for the unswitched condition.
+ BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
+ if (!SplitExitBB) {
+ // If this is the first time we see this, do the split and remember it.
+ SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH);
+ updateLoopExitIDom(ExitBB, L, DT);
+ }
+ // Update the case pair to point to the split block.
+ CasePair.second = SplitExitBB;
+ }
+
+ // Now add the unswitched cases. We do this in reverse order as we built them
+ // in reverse order.
+ for (auto CasePair : reverse(ExitCases)) {
+ ConstantInt *CaseVal = CasePair.first;
+ BasicBlock *UnswitchedBB = CasePair.second;
+
+ NewSI->addCase(CaseVal, UnswitchedBB);
+ updateDTAfterUnswitch(UnswitchedBB, OldPH, DT);
+ }
+
+ // If the default was unswitched, re-point it and add explicit cases for
+ // entering the loop.
+ if (DefaultExitBB) {
+ NewSI->setDefaultDest(DefaultExitBB);
+ updateDTAfterUnswitch(DefaultExitBB, OldPH, DT);
+
+ // We removed all the exit cases, so we just copy the cases to the
+ // unswitched switch.
+ for (auto Case : SI.cases())
+ NewSI->addCase(Case.getCaseValue(), NewPH);
+ }
+
+ // If we ended up with a common successor for every path through the switch
+ // after unswitching, rewrite it to an unconditional branch to make it easy
+ // to recognize. Otherwise we potentially have to recognize the default case
+ // pointing at unreachable and other complexity.
+ if (CommonSuccBB) {
+ BasicBlock *BB = SI.getParent();
+ SI.eraseFromParent();
+ BranchInst::Create(CommonSuccBB, BB);
+ }
+
+ DT.verifyDomTree();
+ ++NumTrivial;
+ ++NumSwitches;
+ return true;
+}
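+
One detail above worth isolating: a switch's default edge cannot be removed, only re-pointed, so an unswitched default is sent to a fresh block holding only unreachable. A hedged sketch of that step (helper name illustrative):

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Retire the default edge by pointing it at a new unreachable-only
    // block in the same function.
    static void pointDefaultAtUnreachable(SwitchInst &SI) {
      LLVMContext &Ctx = SI.getContext();
      BasicBlock *BB = BasicBlock::Create(Ctx, "unreachable_default",
                                          SI.getParent()->getParent());
      new UnreachableInst(Ctx, BB);
      SI.setDefaultDest(BB);
    }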
+
+/// This routine scans the loop to find a branch or switch which occurs before
+/// any side effects occur. These can potentially be unswitched without
+/// duplicating the loop. If a branch or switch is successfully unswitched the
+/// scanning continues to see if subsequent branches or switches have become
+/// trivial. Once all trivial candidates have been unswitched, this routine
+/// returns.
+///
+/// The return value indicates whether anything was unswitched (and therefore
+/// changed).
+static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
+ LoopInfo &LI) {
+ bool Changed = false;
+
+ // If the loop header has only one reachable successor, we should keep
+ // looking for trivial condition candidates in that successor as well. An
+ // alternative is to constant-fold conditions and merge successors into the
+ // loop header (then we only need to check the header's terminator). The
+ // reason for not doing this in the LoopUnswitch pass is that it could
+ // potentially break LoopPassManager's invariants. Folding dead branches
+ // could either eliminate the current loop or make other loops unreachable.
+ // LCSSA form might also not be preserved after deleting branches. The
+ // following code keeps traversing the loop header's successors until it
+ // finds a trivial condition candidate (a condition that is not a constant).
+ // Since unswitching generates branches with constant conditions, this
+ // scenario could be very common in practice.
+ BasicBlock *CurrentBB = L.getHeader();
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ Visited.insert(CurrentBB);
+ do {
+ // Check if there are any side-effecting instructions (e.g. stores, calls,
+ // volatile loads) in the part of the loop that the code *would* execute
+ // without unswitching.
+ if (llvm::any_of(*CurrentBB,
+ [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return Changed;
+
+ TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+
+ if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // Don't bother trying to unswitch past a switch with a constant
+ // condition. This should be removed prior to running this pass by
+ // simplify-cfg.
+ if (isa<Constant>(SI->getCondition()))
+ return Changed;
+
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI))
+ // Couldn't unswitch this one, so we're done.
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If unswitching turned the terminator into an unconditional branch then
+ // we can continue. The unswitching logic specifically works to fold any
+ // cases it can into an unconditional branch to make it easier to
+ // recognize here.
+ auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
+ if (!BI || BI->isConditional())
+ return Changed;
+
+ CurrentBB = BI->getSuccessor(0);
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+ if (!BI)
+ // We do not understand other terminator instructions.
+ return Changed;
+
+ // Don't bother trying to unswitch past an unconditional branch or a branch
+ // with a constant value. These should be removed by simplify-cfg prior to
+ // running this pass.
+ if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ return Changed;
+
+ // Found a trivial condition candidate: non-foldable conditional branch. If
+ // we fail to unswitch this, we can't do anything else that is trivial.
+ if (!unswitchTrivialBranch(L, *BI, DT, LI))
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // We unswitched the branch. This should always leave us with an
+ // unconditional branch that we can follow now.
+ BI = cast<BranchInst>(CurrentBB->getTerminator());
+ assert(!BI->isConditional() &&
+ "Cannot form a conditional branch by unswitching1");
+ CurrentBB = BI->getSuccessor(0);
+
+ // When continuing, if we exit the loop or reach a previously visited block,
+ // then we cannot reach any trivial condition candidates (unfoldable
+ // branch or switch instructions) and no further unswitching can happen.
+ } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
+
+ return Changed;
+}
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC) {
+ assert(L.isLCSSAForm(DT) &&
+ "Loops must be in LCSSA form before unswitching.");
+ bool Changed = false;
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitching first, before looping over other basic blocks in the loop.
+ Changed |= unswitchAllTrivialConditions(L, DT, LI);
+
+ // FIXME: Add support for non-trivial unswitching by cloning the loop.
+
+ return Changed;
+}
+
+PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function &F = *L.getHeader()->getParent();
+ (void)F;
+
+ DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n");
+
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC))
+ return PreservedAnalyses::all();
+
+#ifndef NDEBUG
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ AR.DT.verifyDomTree();
+#endif
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+
+class SimpleLoopUnswitchLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) {
+ initializeSimpleLoopUnswitchLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n");
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+ bool Changed = unswitchLoop(*L, DT, LI, AC);
+
+#ifndef NDEBUG
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ DT.verifyDomTree();
+#endif
+ return Changed;
+}
+
+char SimpleLoopUnswitchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+
+Pass *llvm::createSimpleLoopUnswitchLegacyPass() {
+ return new SimpleLoopUnswitchLegacyPass();
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index f2723bd..8754c71 100644
--- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -130,7 +130,8 @@ static bool mergeEmptyReturnBlocks(Function &F) {
/// iterating until no more changes are made.
static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
AssumptionCache *AC,
- unsigned BonusInstThreshold) {
+ unsigned BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool Changed = false;
bool LocalChange = true;
@@ -145,7 +146,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
// Loop over all of the basic blocks and remove them if they are unneeded.
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) {
+ if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) {
LocalChange = true;
++NumSimpl;
}
@@ -156,10 +157,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
}
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
- AssumptionCache *AC, int BonusInstThreshold) {
+ AssumptionCache *AC, int BonusInstThreshold,
+ bool LateSimplifyCFG) {
bool EverChanged = removeUnreachableBlocks(F);
EverChanged |= mergeEmptyReturnBlocks(F);
- EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
@@ -173,7 +176,8 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return true;
do {
- EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+ EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+ LateSimplifyCFG);
EverChanged |= removeUnreachableBlocks(F);
} while (EverChanged);
@@ -181,17 +185,19 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
}
SimplifyCFGPass::SimplifyCFGPass()
- : BonusInstThreshold(UserBonusInstThreshold) {}
+ : BonusInstThreshold(UserBonusInstThreshold),
+ LateSimplifyCFG(true) {}
-SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
- : BonusInstThreshold(BonusInstThreshold) {}
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG)
+ : BonusInstThreshold(BonusInstThreshold),
+ LateSimplifyCFG(LateSimplifyCFG) {}
PreservedAnalyses SimplifyCFGPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
+ if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG))
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserve<GlobalsAA>();
@@ -199,16 +205,17 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
}
namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
+struct BaseCFGSimplifyPass : public FunctionPass {
unsigned BonusInstThreshold;
std::function<bool(const Function &)> PredicateFtor;
+ bool LateSimplifyCFG;
- CFGSimplifyPass(int T = -1,
- std::function<bool(const Function &)> Ftor = nullptr)
- : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ BaseCFGSimplifyPass(int T, bool LateSimplifyCFG,
+ std::function<bool(const Function &)> Ftor,
+ char &ID)
+ : FunctionPass(ID), PredicateFtor(std::move(Ftor)),
+ LateSimplifyCFG(LateSimplifyCFG) {
BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override {
if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
@@ -218,7 +225,7 @@ struct CFGSimplifyPass : public FunctionPass {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold);
+ return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -227,6 +234,26 @@ struct CFGSimplifyPass : public FunctionPass {
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
+
+struct CFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ CFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, false, Ftor, ID) {
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
+
+struct LateCFGSimplifyPass : public BaseCFGSimplifyPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ LateCFGSimplifyPass(int T = -1,
+ std::function<bool(const Function &)> Ftor = nullptr)
+ : BaseCFGSimplifyPass(T, true, Ftor, ID) {
+ initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+ }
+};
}
char CFGSimplifyPass::ID = 0;
@@ -237,9 +264,24 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
false)
+char LateCFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg",
+ "Simplify the CFG more aggressively", false, false)
+
// Public interface to the CFGSimplification pass
FunctionPass *
llvm::createCFGSimplificationPass(int Threshold,
- std::function<bool(const Function &)> Ftor) {
+ std::function<bool(const Function &)> Ftor) {
return new CFGSimplifyPass(Threshold, std::move(Ftor));
}
+
+// Public interface to the LateCFGSimplification pass
+FunctionPass *
+llvm::createLateCFGSimplificationPass(int Threshold,
+ std::function<bool(const Function &)> Ftor) {
+ return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+}
diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
index c3f14a0..5210f16 100644
--- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -114,7 +114,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
// We cannot sink a load across a critical edge - there may be stores in
// other code paths.
- if (!isSafeToSpeculativelyExecute(Inst))
+ if (isa<LoadInst>(Inst))
return false;
// We don't want to sink across a critical edge if we don't dominate the
@@ -164,13 +164,14 @@ static bool SinkInstruction(Instruction *Inst,
// Instructions can only be sunk if all their uses are in blocks
// dominated by one of the successors.
- // Look at all the postdominators and see if we can sink it in one.
+ // Look at all the dominated blocks and see if we can sink it in one.
DomTreeNode *DTN = DT.getNode(Inst->getParent());
for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
I != E && SuccToSinkTo == nullptr; ++I) {
BasicBlock *Candidate = (*I)->getBlock();
- if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
- IsAcceptableTarget(Inst, Candidate, DT, LI))
+ // A node always immediately dominates its children in the dominator tree.
+ if (IsAcceptableTarget(Inst, Candidate, DT, LI))
SuccToSinkTo = Candidate;
}
@@ -262,9 +263,8 @@ PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!iterativelySinkInstructions(F, DT, LI, AA))
return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 2be3f5c..8b8d659 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -693,7 +693,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) {
UnlinkedInst->setOperand(I, nullptr);
RecursivelyDeleteTriviallyDeadInstructions(Op);
}
- delete UnlinkedInst;
+ UnlinkedInst->deleteValue();
}
bool Ret = !UnlinkedInstructions.empty();
UnlinkedInstructions.clear();
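
This one-liner tracks an API change in the import: Value's destructor is no longer publicly usable, so plain `delete` on an Instruction* does not compile; deleteValue() dispatches to the correct subclass destructor instead. A small sketch of the replacement pattern (destroyUnlinked is an illustrative name):

    #include "llvm/IR/Instruction.h"
    #include <cassert>

    // Free an instruction that was already unlinked from its basic block.
    static void destroyUnlinked(llvm::Instruction *I) {
      assert(!I->getParent() && "instruction must be unlinked first");
      I->deleteValue(); // replaces the no-longer-valid 'delete I;'
    }
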
diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 49ce026..0cccb41 100644
--- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -7,7 +7,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SCCIterator.h"
@@ -20,6 +19,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;
@@ -329,7 +329,7 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
Loops[Exit] = N->getEntry();
} else {
- // Test for sucessors as back edge
+ // Test for successors as back edge
BasicBlock *BB = N->getNodeAs<BasicBlock>();
BranchInst *Term = cast<BranchInst>(BB->getTerminator());
diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index a6b9fee..90c5c24 100644
--- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -51,13 +51,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
@@ -69,6 +68,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
@@ -76,6 +76,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -90,16 +91,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced");
/// If it contains any dynamic allocas, returns false.
static bool canTRE(Function &F) {
// Because of PR962, we don't TRE dynamic allocas.
- for (auto &BB : F) {
- for (auto &I : BB) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
- if (!AI->isStaticAlloca())
- return false;
- }
- }
- }
-
- return true;
+ return llvm::all_of(instructions(F), [](Instruction &I) {
+ auto *AI = dyn_cast<AllocaInst>(&I);
+ return !AI || AI->isStaticAlloca();
+ });
}
namespace {
@@ -321,7 +316,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) {
/// instruction from after the call to before the call, assuming that all
/// instructions between the call and this instruction are movable.
///
-static bool canMoveAboveCall(Instruction *I, CallInst *CI) {
+static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
// FIXME: We can move load/store/call/free instructions above the call if the
// call does not mod/ref the memory location being processed.
if (I->mayHaveSideEffects()) // This also handles volatile loads.
@@ -332,10 +327,10 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI) {
if (CI->mayHaveSideEffects()) {
// Non-volatile loads may be moved above a call with side effects if it
// does not write to memory and the load provably won't trap.
- // FIXME: Writes to memory only matter if they may alias the pointer
+ // Writes to memory only matter if they may alias the pointer
// being loaded from.
const DataLayout &DL = L->getModule()->getDataLayout();
- if (CI->mayWriteToMemory() ||
+ if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) ||
!isSafeToLoadUnconditionally(L->getPointerOperand(),
L->getAlignment(), DL, L))
return false;
@@ -496,7 +491,7 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail) {
+ AliasAnalysis *AA) {
// If we are introducing accumulator recursion to eliminate operations after
// the call instruction that are both associative and commutative, the initial
// value for the accumulator is placed in this variable. If this value is set
@@ -516,7 +511,8 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
// Check that this is the case now.
BasicBlock::iterator BBI(CI);
for (++BBI; &*BBI != Ret; ++BBI) {
- if (canMoveAboveCall(&*BBI, CI)) continue;
+ if (canMoveAboveCall(&*BBI, CI, AA))
+ continue;
// If we can't move the instruction above the call, it might be because it
// is an associative and commutative operation that could be transformed
@@ -675,12 +671,17 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
bool CannotTailCallElimCallsMarkedTail,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
bool Change = false;
+ // Make sure this block is a trivial return block.
+ assert(BB->getFirstNonPHIOrDbg() == Ret &&
+ "Trying to fold non-trivial return block");
+
// If the return block contains nothing but the return and PHI's,
// there might be an opportunity to duplicate the return in its
- // predecessors and perform TRC there. Look for predecessors that end
+ // predecessors and perform TRE there. Look for predecessors that end
// in unconditional branch and recursive call(s).
SmallVector<BranchInst*, 8> UncondBranchPreds;
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
@@ -707,8 +708,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret,
BB->eraseFromParent();
eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs,
- CannotTailCallElimCallsMarkedTail);
+ ArgumentPHIs, AA);
++NumRetDuped;
Change = true;
}
@@ -721,17 +721,18 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
bool &TailCallsAreMarkedTail,
SmallVectorImpl<PHINode *> &ArgumentPHIs,
bool CannotTailCallElimCallsMarkedTail,
- const TargetTransformInfo *TTI) {
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
if (!CI)
return false;
return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs,
- CannotTailCallElimCallsMarkedTail);
+ ArgumentPHIs, AA);
}
-static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) {
+static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA) {
if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
return false;
@@ -766,11 +767,11 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI)
if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
bool Change =
processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, !CanTRETailMarkedCall, TTI);
+ ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA);
if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
- Change =
- foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, !CanTRETailMarkedCall, TTI);
+ Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
+ TailCallsAreMarkedTail, ArgumentPHIs,
+ !CanTRETailMarkedCall, TTI, AA);
MadeChange |= Change;
}
}
@@ -800,6 +801,7 @@ struct TailCallElim : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
@@ -808,7 +810,8 @@ struct TailCallElim : public FunctionPass {
return false;
return eliminateTailRecursion(
- F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F));
+ F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults());
}
};
}
@@ -829,8 +832,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F,
FunctionAnalysisManager &AM) {
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
- bool Changed = eliminateTailRecursion(F, &TTI);
+ bool Changed = eliminateTailRecursion(F, &TTI, &AA);
if (!Changed)
return PreservedAnalyses::all();
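
The substantive change in this file swaps a blanket CI->mayWriteToMemory() test for a per-location alias query, so TRE can now hoist a load above a recursive call that writes only unrelated memory. The query in isolation looks roughly like this; callMayClobberLoad is an illustrative wrapper around the same call used in the patch:

    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/Analysis/MemoryLocation.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // True if the call may modify the exact bytes the load reads; the
    // MRI_Mod bit is set only when a write to that location is possible.
    static bool callMayClobberLoad(AliasAnalysis &AA, CallInst *CI,
                                   LoadInst *L) {
      return AA.getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod;
    }
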
diff --git a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
index 2e95926..4c9746b 100644
--- a/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -102,6 +102,10 @@ FunctionPass *llvm::createAddDiscriminatorsPass() {
return new AddDiscriminatorsLegacyPass();
}
+static bool shouldHaveDiscriminator(const Instruction *I) {
+ return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
+}
+
/// \brief Assign DWARF discriminators.
///
/// To assign discriminators, we examine the boundaries of every
@@ -176,7 +180,13 @@ static bool addDiscriminators(Function &F) {
// discriminator for this instruction.
for (BasicBlock &B : F) {
for (auto &I : B.getInstList()) {
- if (isa<IntrinsicInst>(&I))
+ // Not all intrinsic calls should have a discriminator.
+ // We want to avoid a non-deterministic assignment of discriminators at
+ // different debug levels. We still allow discriminators on memory
+ // intrinsic calls because those can be early expanded by SROA into
+ // pairs of loads and stores, and the expanded load/store instructions
+ // should have a valid discriminator.
+ if (!shouldHaveDiscriminator(&I))
continue;
const DILocation *DIL = I.getDebugLoc();
if (!DIL)
@@ -190,8 +200,8 @@ static bool addDiscriminators(Function &F) {
// discriminator is needed to distinguish both instructions.
// Only the lowest 7 bits are used to represent a discriminator to fit
// it in 1 byte ULEB128 representation.
- unsigned Discriminator = (R.second ? ++LDM[L] : LDM[L]) & 0x7f;
- I.setDebugLoc(DIL->cloneWithDiscriminator(Discriminator));
+ unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
+ I.setDebugLoc(DIL->setBaseDiscriminator(Discriminator));
DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
<< DIL->getColumn() << ":" << Discriminator << " " << I
<< "\n");
@@ -207,6 +217,10 @@ static bool addDiscriminators(Function &F) {
LocationSet CallLocations;
for (auto &I : B.getInstList()) {
CallInst *Current = dyn_cast<CallInst>(&I);
+ // We bypass intrinsic calls for the following two reasons:
+ //  1) We want to avoid a non-deterministic assignment of
+ // discriminators.
+ // 2) We want to minimize the number of base discriminators used.
if (!Current || isa<IntrinsicInst>(&I))
continue;
@@ -216,8 +230,8 @@ static bool addDiscriminators(Function &F) {
Location L =
std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
if (!CallLocations.insert(L).second) {
- Current->setDebugLoc(
- CurrentDIL->cloneWithDiscriminator((++LDM[L]) & 0x7f));
+ unsigned Discriminator = ++LDM[L];
+ Current->setDebugLoc(CurrentDIL->setBaseDiscriminator(Discriminator));
Changed = true;
}
}
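
setBaseDiscriminator supersedes the open-coded cloneWithDiscriminator(... & 0x7f): the new API owns the encoding of the base component, so callers stop masking the value to 7 bits by hand. A sketch of the call pattern as the patch uses it (tagInstruction is an illustrative name):

    #include "llvm/IR/DebugInfoMetadata.h"
    #include "llvm/IR/Instruction.h"
    using namespace llvm;

    // Attach base discriminator D to I's debug location, if it has one.
    static void tagInstruction(Instruction &I, unsigned D) {
      if (const DILocation *DIL = I.getDebugLoc())
        I.setDebugLoc(DIL->setBaseDiscriminator(D));
    }
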
diff --git a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index b90349d..3d5cbfc 100644
--- a/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -78,8 +78,8 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
// Recursively deleting a PHI may cause multiple PHIs to be deleted
- // or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
- SmallVector<WeakVH, 8> PHIs;
+ // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
+ SmallVector<WeakTrackingVH, 8> PHIs;
for (BasicBlock::iterator I = BB->begin();
PHINode *PN = dyn_cast<PHINode>(I); ++I)
PHIs.push_back(PN);
@@ -438,7 +438,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
// The new block unconditionally branches to the old block.
BranchInst *BI = BranchInst::Create(BB, NewBB);
- BI->setDebugLoc(BB->getFirstNonPHI()->getDebugLoc());
+ BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
// Move the edges from Preds to point to NewBB instead of BB.
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
@@ -646,9 +646,10 @@ llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
}
if (LI) {
- Loop *L = LI->getLoopFor(Head);
- L->addBasicBlockToLoop(ThenBlock, *LI);
- L->addBasicBlockToLoop(Tail, *LI);
+ if (Loop *L = LI->getLoopFor(Head)) {
+ L->addBasicBlockToLoop(ThenBlock, *LI);
+ L->addBasicBlockToLoop(Tail, *LI);
+ }
}
return CheckTerm;
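
WeakTrackingVH is the new name for the handle that both follows RAUW and nulls itself when its value dies (plain WeakVH now only does the latter), which is exactly what the PHI-deletion loop above needs. A sketch of the worklist pattern it enables; process is a hypothetical consumer:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/ValueHandle.h"
    using namespace llvm;

    void process(Value *V); // hypothetical consumer

    // Entries may be deleted or RAUW'd while the worklist drains; each
    // handle tracks replacements and reads as null after deletion, so no
    // stale pointer is ever dereferenced.
    static void drain(SmallVectorImpl<WeakTrackingVH> &Worklist) {
      for (WeakTrackingVH &VH : Worklist)
        if (Value *V = VH)
          process(V);
    }
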
diff --git a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index e61b04f..b60dfb4 100644
--- a/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -58,7 +58,7 @@ static bool setOnlyReadsMemory(Function &F) {
static bool setOnlyAccessesArgMemory(Function &F) {
if (F.onlyAccessesArgMemory())
return false;
- F.setOnlyAccessesArgMemory ();
+ F.setOnlyAccessesArgMemory();
++NumArgMemOnly;
return true;
}
@@ -71,632 +71,633 @@ static bool setDoesNotThrow(Function &F) {
return true;
}
-static bool setDoesNotCapture(Function &F, unsigned n) {
- if (F.doesNotCapture(n))
+static bool setRetDoesNotAlias(Function &F) {
+ if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias))
return false;
- F.setDoesNotCapture(n);
- ++NumNoCapture;
+ F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ ++NumNoAlias;
return true;
}
-static bool setOnlyReadsMemory(Function &F, unsigned n) {
- if (F.onlyReadsMemory(n))
+static bool setDoesNotCapture(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::NoCapture))
return false;
- F.setOnlyReadsMemory(n);
- ++NumReadOnlyArg;
+ F.addParamAttr(ArgNo, Attribute::NoCapture);
+ ++NumNoCapture;
return true;
}
-static bool setDoesNotAlias(Function &F, unsigned n) {
- if (F.doesNotAlias(n))
+static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly))
return false;
- F.setDoesNotAlias(n);
- ++NumNoAlias;
+ F.addParamAttr(ArgNo, Attribute::ReadOnly);
+ ++NumReadOnlyArg;
return true;
}
-static bool setNonNull(Function &F, unsigned n) {
- assert((n != AttributeSet::ReturnIndex ||
- F.getReturnType()->isPointerTy()) &&
+static bool setRetNonNull(Function &F) {
+ assert(F.getReturnType()->isPointerTy() &&
"nonnull applies only to pointers");
- if (F.getAttributes().hasAttribute(n, Attribute::NonNull))
+ if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull))
return false;
- F.addAttribute(n, Attribute::NonNull);
+ F.addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
++NumNonNull;
return true;
}
bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
- LibFunc::Func TheLibFunc;
+ LibFunc TheLibFunc;
if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
return false;
bool Changed = false;
switch (TheLibFunc) {
- case LibFunc::strlen:
+ case LibFunc_strlen:
+ case LibFunc_wcslen:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::strchr:
- case LibFunc::strrchr:
+ case LibFunc_strchr:
+ case LibFunc_strrchr:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::strtol:
- case LibFunc::strtod:
- case LibFunc::strtof:
- case LibFunc::strtoul:
- case LibFunc::strtoll:
- case LibFunc::strtold:
- case LibFunc::strtoull:
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::strcpy:
- case LibFunc::stpcpy:
- case LibFunc::strcat:
- case LibFunc::strncat:
- case LibFunc::strncpy:
- case LibFunc::stpncpy:
+ case LibFunc_strcpy:
+ case LibFunc_stpcpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ case LibFunc_strncpy:
+ case LibFunc_stpncpy:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::strxfrm:
+ case LibFunc_strxfrm:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::strcmp: // 0,1
- case LibFunc::strspn: // 0,1
- case LibFunc::strncmp: // 0,1
- case LibFunc::strcspn: // 0,1
- case LibFunc::strcoll: // 0,1
- case LibFunc::strcasecmp: // 0,1
- case LibFunc::strncasecmp: //
+ case LibFunc_strcmp: // 0,1
+ case LibFunc_strspn: // 0,1
+ case LibFunc_strncmp: // 0,1
+ case LibFunc_strcspn: // 0,1
+ case LibFunc_strcoll: // 0,1
+ case LibFunc_strcasecmp: // 0,1
+ case LibFunc_strncasecmp: //
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::strstr:
- case LibFunc::strpbrk:
+ case LibFunc_strstr:
+ case LibFunc_strpbrk:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::strtok:
- case LibFunc::strtok_r:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::scanf:
+ case LibFunc_strtok:
+ case LibFunc_strtok_r:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::setbuf:
- case LibFunc::setvbuf:
+ case LibFunc_scanf:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_setbuf:
+ case LibFunc_setvbuf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::strdup:
- case LibFunc::strndup:
+ case LibFunc_strdup:
+ case LibFunc_strndup:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_stat:
+ case LibFunc_statvfs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::stat:
- case LibFunc::statvfs:
+ case LibFunc_sscanf:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::sscanf:
+ case LibFunc_sprintf:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::sprintf:
+ case LibFunc_snprintf:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::snprintf:
+ case LibFunc_setitimer:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 3);
- Changed |= setOnlyReadsMemory(F, 3);
- return Changed;
- case LibFunc::setitimer:
- Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
- Changed |= setDoesNotCapture(F, 3);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::system:
+ case LibFunc_system:
// May throw; "system" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::malloc:
+ case LibFunc_malloc:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
return Changed;
- case LibFunc::memcmp:
+ case LibFunc_memcmp:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::memchr:
- case LibFunc::memrchr:
+ case LibFunc_memchr:
+ case LibFunc_memrchr:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::modf:
- case LibFunc::modff:
- case LibFunc::modfl:
+ case LibFunc_modf:
+ case LibFunc_modff:
+ case LibFunc_modfl:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::memcpy:
- case LibFunc::mempcpy:
- case LibFunc::memccpy:
- case LibFunc::memmove:
+ case LibFunc_memcpy:
+ case LibFunc_mempcpy:
+ case LibFunc_memccpy:
+ case LibFunc_memmove:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::memcpy_chk:
+ case LibFunc_memcpy_chk:
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::memalign:
- Changed |= setDoesNotAlias(F, 0);
+ case LibFunc_memalign:
+ Changed |= setRetDoesNotAlias(F);
return Changed;
- case LibFunc::mkdir:
+ case LibFunc_mkdir:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::mktime:
+ case LibFunc_mktime:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::realloc:
+ case LibFunc_realloc:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::read:
+ case LibFunc_read:
// May throw; "read" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::rewind:
+ case LibFunc_rewind:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::rmdir:
- case LibFunc::remove:
- case LibFunc::realpath:
+ case LibFunc_rmdir:
+ case LibFunc_remove:
+ case LibFunc_realpath:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::rename:
+ case LibFunc_rename:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::readlink:
+ case LibFunc_readlink:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::write:
+ case LibFunc_write:
// May throw; "write" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::bcopy:
+ case LibFunc_bcopy:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::bcmp:
+ case LibFunc_bcmp:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::bzero:
+ case LibFunc_bzero:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::calloc:
+ case LibFunc_calloc:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
return Changed;
- case LibFunc::chmod:
- case LibFunc::chown:
+ case LibFunc_chmod:
+ case LibFunc_chown:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::ctermid:
- case LibFunc::clearerr:
- case LibFunc::closedir:
+ case LibFunc_ctermid:
+ case LibFunc_clearerr:
+ case LibFunc_closedir:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::atoi:
- case LibFunc::atol:
- case LibFunc::atof:
- case LibFunc::atoll:
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atof:
+ case LibFunc_atoll:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::access:
+ case LibFunc_access:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_fopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::fopen:
+ case LibFunc_fdopen:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fdopen:
+ case LibFunc_feof:
+ case LibFunc_free:
+ case LibFunc_fseek:
+ case LibFunc_ftell:
+ case LibFunc_fgetc:
+ case LibFunc_fseeko:
+ case LibFunc_ftello:
+ case LibFunc_fileno:
+ case LibFunc_fflush:
+ case LibFunc_fclose:
+ case LibFunc_fsetpos:
+ case LibFunc_flockfile:
+ case LibFunc_funlockfile:
+ case LibFunc_ftrylockfile:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::feof:
- case LibFunc::free:
- case LibFunc::fseek:
- case LibFunc::ftell:
- case LibFunc::fgetc:
- case LibFunc::fseeko:
- case LibFunc::ftello:
- case LibFunc::fileno:
- case LibFunc::fflush:
- case LibFunc::fclose:
- case LibFunc::fsetpos:
- case LibFunc::flockfile:
- case LibFunc::funlockfile:
- case LibFunc::ftrylockfile:
+ case LibFunc_ferror:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F);
return Changed;
- case LibFunc::ferror:
+ case LibFunc_fputc:
+ case LibFunc_fstat:
+ case LibFunc_frexp:
+ case LibFunc_frexpf:
+ case LibFunc_frexpl:
+ case LibFunc_fstatvfs:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F);
return Changed;
- case LibFunc::fputc:
- case LibFunc::fstat:
- case LibFunc::frexp:
- case LibFunc::frexpf:
- case LibFunc::frexpl:
- case LibFunc::fstatvfs:
+ case LibFunc_fgets:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::fgets:
+ case LibFunc_fread:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 3);
return Changed;
- case LibFunc::fread:
+ case LibFunc_fwrite:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 4);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 3);
+ // FIXME: readonly #1?
return Changed;
- case LibFunc::fwrite:
+ case LibFunc_fputs:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 4);
- // FIXME: readonly #1?
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::fputs:
+ case LibFunc_fscanf:
+ case LibFunc_fprintf:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::fscanf:
- case LibFunc::fprintf:
+ case LibFunc_fgetpos:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fgetpos:
+ case LibFunc_getc:
+ case LibFunc_getlogin_r:
+ case LibFunc_getc_unlocked:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::getc:
- case LibFunc::getlogin_r:
- case LibFunc::getc_unlocked:
+ case LibFunc_getenv:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::getenv:
+ case LibFunc_gets:
+ case LibFunc_getchar:
Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::gets:
- case LibFunc::getchar:
+ case LibFunc_getitimer:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::getitimer:
+ case LibFunc_getpwnam:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::getpwnam:
+ case LibFunc_ungetc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::ungetc:
+ case LibFunc_uname:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::uname:
+ case LibFunc_unlink:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::unlink:
+ case LibFunc_unsetenv:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::unsetenv:
+ case LibFunc_utime:
+ case LibFunc_utimes:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::utime:
- case LibFunc::utimes:
+ case LibFunc_putc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::putc:
+ case LibFunc_puts:
+ case LibFunc_printf:
+ case LibFunc_perror:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::puts:
- case LibFunc::printf:
- case LibFunc::perror:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::pread:
+ case LibFunc_pread:
// May throw; "pread" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::pwrite:
+ case LibFunc_pwrite:
// May throw; "pwrite" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::putchar:
+ case LibFunc_putchar:
Changed |= setDoesNotThrow(F);
return Changed;
- case LibFunc::popen:
+ case LibFunc_popen:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::pclose:
+ case LibFunc_pclose:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_vscanf:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::vscanf:
+ case LibFunc_vsscanf:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::vsscanf:
+ case LibFunc_vfscanf:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::vfscanf:
+ case LibFunc_valloc:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setRetDoesNotAlias(F);
return Changed;
- case LibFunc::valloc:
+ case LibFunc_vprintf:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::vprintf:
+ case LibFunc_vfprintf:
+ case LibFunc_vsprintf:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::vfprintf:
- case LibFunc::vsprintf:
+ case LibFunc_vsnprintf:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::vsnprintf:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 3);
- Changed |= setOnlyReadsMemory(F, 3);
- return Changed;
- case LibFunc::open:
+ case LibFunc_open:
// May throw; "open" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::opendir:
+ case LibFunc_opendir:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::tmpfile:
+ case LibFunc_tmpfile:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
return Changed;
- case LibFunc::times:
+ case LibFunc_times:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::htonl:
- case LibFunc::htons:
- case LibFunc::ntohl:
- case LibFunc::ntohs:
+ case LibFunc_htonl:
+ case LibFunc_htons:
+ case LibFunc_ntohl:
+ case LibFunc_ntohs:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAccessMemory(F);
return Changed;
- case LibFunc::lstat:
+ case LibFunc_lstat:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::lchown:
+ case LibFunc_lchown:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::qsort:
+ case LibFunc_qsort:
// May throw; places call through function pointer.
- Changed |= setDoesNotCapture(F, 4);
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc_dunder_strdup:
+ case LibFunc_dunder_strndup:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::dunder_strdup:
- case LibFunc::dunder_strndup:
+ case LibFunc_dunder_strtok_r:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc::dunder_strtok_r:
+ case LibFunc_under_IO_getc:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::under_IO_getc:
+ case LibFunc_under_IO_putc:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::under_IO_putc:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc::dunder_isoc99_scanf:
+ case LibFunc_dunder_isoc99_scanf:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::stat64:
- case LibFunc::lstat64:
- case LibFunc::statvfs64:
+ case LibFunc_stat64:
+ case LibFunc_lstat64:
+ case LibFunc_statvfs64:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::dunder_isoc99_sscanf:
+ case LibFunc_dunder_isoc99_sscanf:
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fopen64:
+ case LibFunc_fopen64:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- Changed |= setOnlyReadsMemory(F, 2);
return Changed;
- case LibFunc::fseeko64:
- case LibFunc::ftello64:
+ case LibFunc_fseeko64:
+ case LibFunc_ftello64:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc::tmpfile64:
+ case LibFunc_tmpfile64:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setRetDoesNotAlias(F);
return Changed;
- case LibFunc::fstat64:
- case LibFunc::fstatvfs64:
+ case LibFunc_fstat64:
+ case LibFunc_fstatvfs64:
Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
+ Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc::open64:
+ case LibFunc_open64:
// May throw; "open" is a valid pthread cancellation point.
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
return Changed;
- case LibFunc::gettimeofday:
+ case LibFunc_gettimeofday:
// Currently some platforms have the restrict keyword on the arguments to
// gettimeofday. To be conservative, do not add noalias to gettimeofday's
// arguments.
Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
return Changed;
- case LibFunc::Znwj: // new(unsigned int)
- case LibFunc::Znwm: // new(unsigned long)
- case LibFunc::Znaj: // new[](unsigned int)
- case LibFunc::Znam: // new[](unsigned long)
- case LibFunc::msvc_new_int: // new(unsigned int)
- case LibFunc::msvc_new_longlong: // new(unsigned long long)
- case LibFunc::msvc_new_array_int: // new[](unsigned int)
- case LibFunc::msvc_new_array_longlong: // new[](unsigned long long)
+ case LibFunc_Znwj: // new(unsigned int)
+ case LibFunc_Znwm: // new(unsigned long)
+ case LibFunc_Znaj: // new[](unsigned int)
+ case LibFunc_Znam: // new[](unsigned long)
+ case LibFunc_msvc_new_int: // new(unsigned int)
+ case LibFunc_msvc_new_longlong: // new(unsigned long long)
+ case LibFunc_msvc_new_array_int: // new[](unsigned int)
+ case LibFunc_msvc_new_array_longlong: // new[](unsigned long long)
// Operator new always returns a nonnull noalias pointer
- Changed |= setNonNull(F, AttributeSet::ReturnIndex);
- Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex);
+ Changed |= setRetNonNull(F);
+ Changed |= setRetDoesNotAlias(F);
return Changed;
//TODO: add LibFunc entries for:
- //case LibFunc::memset_pattern4:
- //case LibFunc::memset_pattern8:
- case LibFunc::memset_pattern16:
+ //case LibFunc_memset_pattern4:
+ //case LibFunc_memset_pattern8:
+ case LibFunc_memset_pattern16:
Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
return Changed;
// int __nvvm_reflect(const char *)
- case LibFunc::nvvm_reflect:
+ case LibFunc_nvvm_reflect:
Changed |= setDoesNotAccessMemory(F);
Changed |= setDoesNotThrow(F);
return Changed;
@@ -717,13 +718,13 @@ Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::strlen))
+ if (!TLI->has(LibFunc_strlen))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
- B.getInt8PtrTy(), nullptr);
+ B.getInt8PtrTy());
inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
@@ -734,14 +735,14 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::strchr))
+ if (!TLI->has(LibFunc_strchr))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
Type *I32Ty = B.getInt32Ty();
Constant *StrChr =
- M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty, nullptr);
+ M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty);
inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
CallInst *CI = B.CreateCall(
StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
@@ -752,14 +753,14 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::strncmp))
+ if (!TLI->has(LibFunc_strncmp))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
B.getInt8PtrTy(), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
CallInst *CI = B.CreateCall(
StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
@@ -772,12 +773,12 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
const TargetLibraryInfo *TLI, StringRef Name) {
- if (!TLI->has(LibFunc::strcpy))
+ if (!TLI->has(LibFunc_strcpy))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr, nullptr);
+ Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr);
inferLibFuncAttributes(*M->getFunction(Name), *TLI);
CallInst *CI =
B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
@@ -788,13 +789,13 @@ Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
const TargetLibraryInfo *TLI, StringRef Name) {
- if (!TLI->has(LibFunc::strncpy))
+ if (!TLI->has(LibFunc_strncpy))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Type *I8Ptr = B.getInt8PtrTy();
Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
- Len->getType(), nullptr);
+ Len->getType());
inferLibFuncAttributes(*M->getFunction(Name), *TLI);
CallInst *CI = B.CreateCall(
StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
@@ -806,18 +807,18 @@ Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
IRBuilder<> &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::memcpy_chk))
+ if (!TLI->has(LibFunc_memcpy_chk))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
- AttributeSet AS;
- AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
- Attribute::NoUnwind);
+ AttributeList AS;
+ AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCpy = M->getOrInsertFunction(
- "__memcpy_chk", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
+ "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
Dst = castToCStr(Dst, B);
Src = castToCStr(Src, B);
CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
@@ -828,14 +829,14 @@ Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::memchr))
+ if (!TLI->has(LibFunc_memchr))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
B.getInt8PtrTy(), B.getInt32Ty(),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
@@ -847,14 +848,14 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::memcmp))
+ if (!TLI->has(LibFunc_memcmp))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
B.getInt8PtrTy(), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), nullptr);
+ DL.getIntPtrType(Context));
inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
CallInst *CI = B.CreateCall(
MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
@@ -881,15 +882,21 @@ static void appendTypeSuffix(Value *Op, StringRef &Name,
}
Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
- const AttributeSet &Attrs) {
+ const AttributeList &Attrs) {
SmallString<20> NameBuffer;
appendTypeSuffix(Op, Name, NameBuffer);
Module *M = B.GetInsertBlock()->getModule();
Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
- Op->getType(), nullptr);
+ Op->getType());
CallInst *CI = B.CreateCall(Callee, Op, Name);
- CI->setAttributes(Attrs);
+
+ // The incoming attribute set may have come from a speculatable intrinsic, but
+ // is being replaced with a library call which is not allowed to be
+ // speculatable.
+ CI->setAttributes(Attrs.removeAttribute(B.getContext(),
+ AttributeList::FunctionIndex,
+ Attribute::Speculatable));
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
@@ -897,13 +904,13 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
}
Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
- IRBuilder<> &B, const AttributeSet &Attrs) {
+ IRBuilder<> &B, const AttributeList &Attrs) {
SmallString<20> NameBuffer;
appendTypeSuffix(Op1, Name, NameBuffer);
Module *M = B.GetInsertBlock()->getModule();
Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), Op1->getType(),
- Op2->getType(), nullptr);
+ Op2->getType());
CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name);
CI->setAttributes(Attrs);
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -914,12 +921,12 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::putchar))
+ if (!TLI->has(LibFunc_putchar))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
- Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
- B.getInt32Ty(), nullptr);
+ Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty());
+ inferLibFuncAttributes(*M->getFunction("putchar"), *TLI);
CallInst *CI = B.CreateCall(PutChar,
B.CreateIntCast(Char,
B.getInt32Ty(),
@@ -934,12 +941,12 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::puts))
+ if (!TLI->has(LibFunc_puts))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Value *PutS =
- M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy(), nullptr);
+ M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy());
inferLibFuncAttributes(*M->getFunction("puts"), *TLI);
CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
@@ -949,12 +956,12 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::fputc))
+ if (!TLI->has(LibFunc_fputc))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
- File->getType(), nullptr);
+ File->getType());
if (File->getType()->isPointerTy())
inferLibFuncAttributes(*M->getFunction("fputc"), *TLI);
Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
@@ -968,13 +975,13 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::fputs))
+ if (!TLI->has(LibFunc_fputs))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutsName = TLI->getName(LibFunc::fputs);
+ StringRef FPutsName = TLI->getName(LibFunc_fputs);
Constant *F = M->getOrInsertFunction(
- FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), nullptr);
+ FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType());
if (File->getType()->isPointerTy())
inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI);
CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
@@ -986,16 +993,16 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc::fwrite))
+ if (!TLI->has(LibFunc_fwrite))
return nullptr;
Module *M = B.GetInsertBlock()->getModule();
LLVMContext &Context = B.GetInsertBlock()->getContext();
- StringRef FWriteName = TLI->getName(LibFunc::fwrite);
+ StringRef FWriteName = TLI->getName(LibFunc_fwrite);
Constant *F = M->getOrInsertFunction(
FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType(),
- nullptr);
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
if (File->getType()->isPointerTy())
inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI);
CallInst *CI =
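
Two conventions change throughout this file: LibFunc is now a plain enum (LibFunc_strlen rather than LibFunc::strlen), and the per-argument helpers take 0-based parameter numbers via addParamAttr/hasParamAttribute instead of the old 1-based AttributeSet slots where index 0 meant the return value. Mirroring the strdup entry above, the new style reads as follows; annotateStrdupLike is an illustrative name:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // strdup-like: the result is a fresh allocation; the source string
    // (parameter 0 in the new 0-based numbering) is only read and never
    // captured.
    static void annotateStrdupLike(Function &F) {
      F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
      F.addParamAttr(0, Attribute::NoCapture);
      F.addParamAttr(0, Attribute::ReadOnly);
    }
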
diff --git a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index bc2cef2..83ec7f5 100644
--- a/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -17,9 +17,12 @@
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -36,12 +39,21 @@ namespace {
: SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
};
- struct DivPhiNodes {
- PHINode *Quotient;
- PHINode *Remainder;
+ struct QuotRemPair {
+ Value *Quotient;
+ Value *Remainder;
- DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
- : Quotient(InQuotient), Remainder(InRemainder) {}
+ QuotRemPair(Value *InQuotient, Value *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+
+ /// A quotient and remainder, plus a BB from which they logically "originate".
+ /// If you use Quotient or Remainder in a Phi node, you should use BB as its
+ /// corresponding predecessor.
+ struct QuotRemWithBB {
+ BasicBlock *BB = nullptr;
+ Value *Quotient = nullptr;
+ Value *Remainder = nullptr;
};
}
@@ -69,159 +81,376 @@ namespace llvm {
}
};
- typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
+ typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
+ typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
+ typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
}
-// insertFastDiv - Substitutes the div/rem instruction with code that checks the
-// value of the operands and uses a shorter-faster div/rem instruction when
-// possible and the longer-slower div/rem instruction otherwise.
-static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
- bool UseDivOp, bool UseSignedOp,
- DivCacheTy &PerBBDivCache) {
- Function *F = I->getParent()->getParent();
- // Get instruction operands
- Value *Dividend = I->getOperand(0);
- Value *Divisor = I->getOperand(1);
+namespace {
+enum ValueRange {
+ /// Operand definitely fits into BypassType. No runtime checks are needed.
+ VALRNG_KNOWN_SHORT,
+ /// A runtime check is required, as value range is unknown.
+ VALRNG_UNKNOWN,
+ /// Operand is unlikely to fit into BypassType. The bypassing should be
+ /// disabled.
+ VALRNG_LIKELY_LONG
+};
+
+class FastDivInsertionTask {
+ bool IsValidTask = false;
+ Instruction *SlowDivOrRem = nullptr;
+ IntegerType *BypassType = nullptr;
+ BasicBlock *MainBB = nullptr;
+
+ bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
+ ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
+ QuotRemWithBB createSlowBB(BasicBlock *Successor);
+ QuotRemWithBB createFastBB(BasicBlock *Successor);
+ QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
+ BasicBlock *PhiBB);
+ Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
+ Optional<QuotRemPair> insertFastDivAndRem();
+
+ bool isSignedOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::SRem;
+ }
+ bool isDivisionOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::UDiv;
+ }
+ Type *getSlowType() { return SlowDivOrRem->getType(); }
+
+public:
+ FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+ Value *getReplacement(DivCacheTy &Cache);
+};
+} // anonymous namespace
+
+FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
+ const BypassWidthsTy &BypassWidths) {
+ switch (I->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ SlowDivOrRem = I;
+ break;
+ default:
+ // I is not a div/rem operation.
+ return;
+ }
- if (isa<ConstantInt>(Divisor)) {
- // Division by a constant should have been solved and replaced earlier
- // in the pipeline.
- return false;
+ // Skip division on vector types. Only optimize integer instructions.
+ IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
+ if (!SlowType)
+ return;
+
+ // Skip if this bitwidth is not bypassed.
+ auto BI = BypassWidths.find(SlowType->getBitWidth());
+ if (BI == BypassWidths.end())
+ return;
+
+ // Get type for div/rem instruction with bypass bitwidth.
+ IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
+ BypassType = BT;
+
+ // The original basic block.
+ MainBB = I->getParent();
+
+ // The instruction is indeed a slow div or rem operation.
+ IsValidTask = true;
+}
+
+/// Reuses a previously-computed quotient or remainder from the current BB if
+/// the operands and operation are identical. Otherwise calls
+/// insertFastDivAndRem to perform the optimization and caches the resulting
+/// quotient and remainder. If no replacement can be generated, returns
+/// nullptr.
+Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
+ // First, make sure that the task is valid.
+ if (!IsValidTask)
+ return nullptr;
+
+ // Then, look for a value in Cache.
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ DivOpInfo Key(isSignedOp(), Dividend, Divisor);
+ auto CacheI = Cache.find(Key);
+
+ if (CacheI == Cache.end()) {
+ // If a previous instance does not exist, try to insert a fast div.
+ Optional<QuotRemPair> OptResult = insertFastDivAndRem();
+ // Bail out if insertFastDivAndRem has failed.
+ if (!OptResult)
+ return nullptr;
+ CacheI = Cache.insert({Key, *OptResult}).first;
}
- // If the numerator is a constant, bail if it doesn't fit into BypassType.
- if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
- if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
+ QuotRemPair &Value = CacheI->second;
+ return isDivisionOp() ? Value.Quotient : Value.Remainder;
+}
+
+/// \brief Check if a value looks like a hash.
+///
+/// The routine is expected to detect values computed using the most common hash
+/// algorithms. Typically, hash computations end with one of the following
+/// instructions:
+///
+/// 1) MUL with a constant wider than BypassType
+/// 2) XOR instruction
+///
+/// And even if we are wrong and the value is not a hash, it is still quite
+/// unlikely that such values will fit into BypassType.
+///
+/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
+/// It is implemented as a depth-first search for values that look neither long
+/// nor hash-like.
+bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Xor:
+ return true;
+ case Instruction::Mul: {
+ // After the Constant Hoisting pass, long constants may be represented as
+ // bitcast instructions. As a result, some constants may look like an
+ // instruction at first, and an additional check is necessary to find out if
+ // an operand is actually a constant.
+ Value *Op1 = I->getOperand(1);
+ ConstantInt *C = dyn_cast<ConstantInt>(Op1);
+ if (!C && isa<BitCastInst>(Op1))
+ C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
+ return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
+ }
+ case Instruction::PHI: {
+ // Stop IR traversal on pathological input code; this limits the recursion
+ // depth.
+ if (Visited.size() >= 16)
return false;
+ // Do not revisit nodes that have already been visited. Return true, since
+ // no value that looks non-hash-like was found along this path.
+ if (Visited.find(I) != Visited.end())
+ return true;
+ Visited.insert(I);
+ return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
+ // Ignore undef values as they probably don't affect the division
+ // operands.
+ return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
+ isa<UndefValue>(V);
+ });
+ }
+ default:
+ return false;
+ }
+}
+
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V,
+ VisitedSetTy &Visited) {
+ unsigned ShortLen = BypassType->getBitWidth();
+ unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+ assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+ unsigned HiBits = LongLen - ShortLen;
+
+ const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+ KnownBits Known(LongLen);
- // Basic Block is split before divide
- BasicBlock *MainBB = &*I->getParent();
- BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I);
-
- // Add new basic block for slow divide operation
- BasicBlock *SlowBB =
- BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
- SlowBB->moveBefore(SuccessorBB);
- IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
- Value *SlowQuotientV;
- Value *SlowRemainderV;
- if (UseSignedOp) {
- SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
- SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
+ computeKnownBits(V, Known, DL);
+
+ if (Known.countMinLeadingZeros() >= HiBits)
+ return VALRNG_KNOWN_SHORT;
+
+ if (Known.countMaxLeadingZeros() < HiBits)
+ return VALRNG_LIKELY_LONG;
+
+ // Long integer divisions are often used in hashtable implementations. It's
+ // not worth bypassing such divisions because hash values are extremely
+ // unlikely to have enough leading zeros. The call below tries to detect
+ // values that are unlikely to fit BypassType (including hashes).
+ if (isHashLikeValue(V, Visited))
+ return VALRNG_LIKELY_LONG;
+
+ return VALRNG_UNKNOWN;
+}
+
+/// Add a new basic block for slow div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isSignedOp()) {
+ DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
} else {
- SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
- SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
+ DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
}
- SlowBuilder.CreateBr(SuccessorBB);
-
- // Add new basic block for fast divide operation
- BasicBlock *FastBB =
- BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
- FastBB->moveBefore(SlowBB);
- IRBuilder<> FastBuilder(FastBB, FastBB->begin());
- Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
- BypassType);
- Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
- BypassType);
-
- // udiv/urem because optimization only handles positive numbers
- Value *ShortQuotientV = FastBuilder.CreateUDiv(ShortDividendV, ShortDivisorV);
- Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
- ShortDivisorV);
- Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
- ShortQuotientV,
- Dividend->getType());
- Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
- ShortRemainderV,
- Dividend->getType());
- FastBuilder.CreateBr(SuccessorBB);
-
- // Phi nodes for result of div and rem
- IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
- PHINode *QuoPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
- QuoPhi->addIncoming(SlowQuotientV, SlowBB);
- QuoPhi->addIncoming(FastQuotientV, FastBB);
- PHINode *RemPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
- RemPhi->addIncoming(SlowRemainderV, SlowBB);
- RemPhi->addIncoming(FastRemainderV, FastBB);
-
- // Replace I with appropriate phi node
- if (UseDivOp)
- I->replaceAllUsesWith(QuoPhi);
- else
- I->replaceAllUsesWith(RemPhi);
- I->eraseFromParent();
- // Combine operands into a single value with OR for value testing below
- MainBB->getInstList().back().eraseFromParent();
- IRBuilder<> MainBuilder(MainBB, MainBB->end());
+ Builder.CreateBr(SuccessorBB);
+ return DivRemPair;
+}
+
+/// Add a new basic block for fast div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ Value *ShortDivisorV =
+ Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
+ Value *ShortDividendV =
+ Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
+
+ // udiv/urem because this optimization only handles positive numbers.
+ Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
+ Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
+ DivRemPair.Quotient =
+ Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
+ DivRemPair.Remainder =
+ Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
+ Builder.CreateBr(SuccessorBB);
+
+ return DivRemPair;
+}
- // We should have bailed out above if the divisor is a constant, but the
- // dividend may still be a constant. Set OrV to our non-constant operands
- // OR'ed together.
- assert(!isa<ConstantInt>(Divisor));
+/// Creates PHI nodes for the results of Div and Rem.
+QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
+ QuotRemWithBB &RHS,
+ BasicBlock *PhiBB) {
+ IRBuilder<> Builder(PhiBB, PhiBB->begin());
+ PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
+ QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
+ QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
+ PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
+ RemPhi->addIncoming(LHS.Remainder, LHS.BB);
+ RemPhi->addIncoming(RHS.Remainder, RHS.BB);
+ return QuotRemPair(QuoPhi, RemPhi);
+}
+
+/// Creates a runtime check to test whether both the divisor and dividend fit
+/// into BypassType. The check is inserted at the end of MainBB. A true return
+/// value means that the operands fit. Either operand may be null if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+ assert((Op1 || Op2) && "Nothing to check");
+ IRBuilder<> Builder(MainBB, MainBB->end());
Value *OrV;
- if (!isa<ConstantInt>(Dividend))
- OrV = MainBuilder.CreateOr(Dividend, Divisor);
+ if (Op1 && Op2)
+ OrV = Builder.CreateOr(Op1, Op2);
else
- OrV = Divisor;
+ OrV = Op1 ? Op1 : Op2;
// BitMask is inverted to check if the operands are
// larger than the bypass type
uint64_t BitMask = ~BypassType->getBitMask();
- Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
-
- // Compare operand values and branch
- Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
- Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
- MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
-
- // Cache phi nodes to be used later in place of other instances
- // of div or rem with the same sign, dividend, and divisor
- DivOpInfo Key(UseSignedOp, Dividend, Divisor);
- DivPhiNodes Value(QuoPhi, RemPhi);
- PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
- return true;
+ Value *AndV = Builder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values
+ Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
+ return Builder.CreateICmpEQ(AndV, ZeroV);
}
-// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder from
-// the current BB if operands and operation are identical. Otherwise calls
-// insertFastDiv to perform the optimization and caches the resulting dividend
-// and remainder.
-static bool reuseOrInsertFastDiv(Instruction *I, IntegerType *BypassType,
- bool UseDivOp, bool UseSignedOp,
- DivCacheTy &PerBBDivCache) {
- // Get instruction operands
- DivOpInfo Key(UseSignedOp, I->getOperand(0), I->getOperand(1));
- DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
-
- if (CacheI == PerBBDivCache.end()) {
- // If previous instance does not exist, insert fast div
- return insertFastDiv(I, BypassType, UseDivOp, UseSignedOp, PerBBDivCache);
+/// Substitutes the div/rem instruction with code that checks the value of the
+/// operands and uses a shorter, faster div/rem instruction when possible.
+Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isa<ConstantInt>(Divisor)) {
+ // Keep division by a constant for DAGCombiner.
+ return None;
}
- // Replace operation value with previously generated phi node
- DivPhiNodes &Value = CacheI->second;
- if (UseDivOp) {
- // Replace all uses of div instruction with quotient phi node
- I->replaceAllUsesWith(Value.Quotient);
+ VisitedSetTy SetL;
+ ValueRange DividendRange = getValueRange(Dividend, SetL);
+ if (DividendRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ VisitedSetTy SetR;
+ ValueRange DivisorRange = getValueRange(Divisor, SetR);
+ if (DivisorRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
+ bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
+
+ if (DividendShort && DivisorShort) {
+ // If both operands are known to be short then just replace the long
+ // division with a short one in-place.
+
+ IRBuilder<> Builder(SlowDivOrRem);
+ Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+ Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+ Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+ Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+ Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+ Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+ return QuotRemPair(ExtDiv, ExtRem);
+ } else if (DividendShort && !isSignedOp()) {
+ // If the division is unsigned and Dividend is known to be short, then
+ // either
+ // 1) Divisor is less than or equal to Dividend, and the result can be
+ // computed with a short division.
+ // 2) Divisor is greater than Dividend. In this case, no division is needed
+ // at all: The quotient is 0 and the remainder is equal to Dividend.
+ //
+ // So instead of checking at runtime whether Divisor fits into BypassType,
+ // we emit a runtime check to differentiate between these two cases. This
+ // lets us entirely avoid a long div.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Long;
+ Long.BB = MainBB;
+ Long.Quotient = ConstantInt::get(getSlowType(), 0);
+ Long.Remainder = Dividend;
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+ return Result;
} else {
- // Replace all uses of rem instruction with remainder phi node
- I->replaceAllUsesWith(Value.Remainder);
+ // General case. Create both slow and fast div/rem pairs and choose one of
+ // them at runtime.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+ Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+ DivisorShort ? nullptr : Divisor);
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+ return Result;
}
-
- // Remove redundant operation
- I->eraseFromParent();
- return true;
}
-// bypassSlowDivision - This optimization identifies DIV instructions in a BB
-// that can be profitably bypassed and carried out with a shorter, faster
-// divide.
-bool llvm::bypassSlowDivision(
- BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidths) {
- DivCacheTy DivCache;
+/// This optimization identifies DIV/REM instructions in a BB that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(BasicBlock *BB,
+ const BypassWidthsTy &BypassWidths) {
+ DivCacheTy PerBBDivCache;
bool MadeChange = false;
Instruction* Next = &*BB->begin();
@@ -231,42 +460,20 @@ bool llvm::bypassSlowDivision(
Instruction* I = Next;
Next = Next->getNextNode();
- // Get instruction details
- unsigned Opcode = I->getOpcode();
- bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
- bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
- bool UseSignedOp = Opcode == Instruction::SDiv ||
- Opcode == Instruction::SRem;
-
- // Only optimize div or rem ops
- if (!UseDivOp && !UseRemOp)
- continue;
-
- // Skip division on vector types, only optimize integer instructions
- if (!I->getType()->isIntegerTy())
- continue;
-
- // Get bitwidth of div/rem instruction
- IntegerType *T = cast<IntegerType>(I->getType());
- unsigned int bitwidth = T->getBitWidth();
-
- // Continue if bitwidth is not bypassed
- DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
- if (BI == BypassWidths.end())
- continue;
-
- // Get type for div/rem instruction with bypass bitwidth
- IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
-
- MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache);
+ FastDivInsertionTask Task(I, BypassWidths);
+ if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ MadeChange = true;
+ }
}
// Above we eagerly create divs and rems, as pairs, so that we can efficiently
// create divrem machine instructions. Now erase any unused divs / rems so we
// don't leave extra instructions sitting around.
- for (auto &KV : DivCache)
- for (Instruction *Phi : {KV.second.Quotient, KV.second.Remainder})
- RecursivelyDeleteTriviallyDeadInstructions(Phi);
+ for (auto &KV : PerBBDivCache)
+ for (Value *V : {KV.second.Quotient, KV.second.Remainder})
+ RecursivelyDeleteTriviallyDeadInstructions(V);
return MadeChange;
}
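For readers tracing the new control flow, here is a minimal C++ sketch (the function names and the 64-to-32 bypass mapping are illustrative assumptions, not part of the patch) of what the rewritten pass emits: the general case guards a short udiv with the OR/mask/compare check produced by insertOperandRuntimeCheck, and the dividend-short unsigned case avoids the long division entirely.

    // Sketch only. General case: guard a 32-bit udiv with the operand check.
    static uint64_t bypassDiv64(uint64_t A, uint64_t B) {
      if (((A | B) & ~0xFFFFFFFFull) == 0)            // both fit in 32 bits
        return (uint64_t)((uint32_t)A / (uint32_t)B); // fast short udiv
      return A / B;                                   // slow long udiv
    }

    // Sketch only. Dividend known short, unsigned op: either B <= A (then B
    // is short too, so a short udiv suffices) or B > A (quotient is 0 and
    // the remainder is A); no long division is ever emitted.
    static uint64_t bypassDiv64ShortDividend(uint32_t A, uint64_t B) {
      return ((uint64_t)A >= B) ? (uint64_t)(A / (uint32_t)B) : 0;
    }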
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 4d33e22..9c4e139 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -13,7 +13,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
@@ -31,24 +30,38 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <map>
using namespace llvm;
/// See comments in Cloning.h.
-BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB,
- ValueToValueMapTy &VMap,
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
const Twine &NameSuffix, Function *F,
- ClonedCodeInfo *CodeInfo) {
+ ClonedCodeInfo *CodeInfo,
+ DebugInfoFinder *DIFinder) {
+ DenseMap<const MDNode *, MDNode *> Cache;
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
-
+ Module *TheModule = F ? F->getParent() : nullptr;
+
// Loop over all instructions, and copy them over.
for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
+
+ if (DIFinder && TheModule) {
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(II))
+ DIFinder->processDeclare(*TheModule, DDI);
+ else if (auto *DVI = dyn_cast<DbgValueInst>(II))
+ DIFinder->processValue(*TheModule, DVI);
+
+ if (auto DbgLoc = II->getDebugLoc())
+ DIFinder->processLocation(*TheModule, DbgLoc.get());
+ }
+
Instruction *NewInst = II->clone();
if (II->hasName())
NewInst->setName(II->getName()+NameSuffix);
@@ -90,9 +103,9 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
assert(VMap.count(&I) && "No mapping from source argument specified!");
#endif
- // Copy all attributes other than those stored in the AttributeSet. We need
- // to remap the parameter indices of the AttributeSet.
- AttributeSet NewAttrs = NewFunc->getAttributes();
+ // Copy all attributes other than those stored in the AttributeList. We need
+ // to remap the parameter indices of the AttributeList.
+ AttributeList NewAttrs = NewFunc->getAttributes();
NewFunc->copyAttributesFrom(OldFunc);
NewFunc->setAttributes(NewAttrs);
@@ -103,31 +116,54 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
TypeMapper, Materializer));
- AttributeSet OldAttrs = OldFunc->getAttributes();
+ SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
+ AttributeList OldAttrs = OldFunc->getAttributes();
+
// Clone any argument attributes that are present in the VMap.
- for (const Argument &OldArg : OldFunc->args())
+ for (const Argument &OldArg : OldFunc->args()) {
if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
- AttributeSet attrs =
- OldAttrs.getParamAttributes(OldArg.getArgNo() + 1);
- if (attrs.getNumSlots() > 0)
- NewArg->addAttr(attrs);
+ NewArgAttrs[NewArg->getArgNo()] =
+ OldAttrs.getParamAttributes(OldArg.getArgNo());
}
+ }
NewFunc->setAttributes(
- NewFunc->getAttributes()
- .addAttributes(NewFunc->getContext(), AttributeSet::ReturnIndex,
- OldAttrs.getRetAttributes())
- .addAttributes(NewFunc->getContext(), AttributeSet::FunctionIndex,
- OldAttrs.getFnAttributes()));
+ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
+ OldAttrs.getRetAttributes(), NewArgAttrs));
+
+ bool MustCloneSP =
+ OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
+ DISubprogram *SP = OldFunc->getSubprogram();
+ if (SP) {
+ assert(!MustCloneSP || ModuleLevelChanges);
+ // Add mappings for some DebugInfo nodes that we don't want duplicated
+ // even if they're distinct.
+ auto &MD = VMap.MD();
+ MD[SP->getUnit()].reset(SP->getUnit());
+ MD[SP->getType()].reset(SP->getType());
+ MD[SP->getFile()].reset(SP->getFile());
+ // If we're not cloning into the same module, no need to clone the
+ // subprogram
+ if (!MustCloneSP)
+ MD[SP].reset(SP);
+ }
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc->getAllMetadata(MDs);
- for (auto MD : MDs)
+ for (auto MD : MDs) {
NewFunc->addMetadata(
MD.first,
*MapMetadata(MD.second, VMap,
ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
TypeMapper, Materializer));
+ }
+
+ // When we remap instructions, we want to avoid duplicating inlined
+ // DISubprograms, so record all subprograms we find as we duplicate
+ // instructions and then freeze them in the MD map.
+ // We also record information about dbg.value and dbg.declare to avoid
+ // duplicating the types.
+ DebugInfoFinder DIFinder;
// Loop over all of the basic blocks in the function, cloning them as
// appropriate. Note that we save BE this way in order to handle cloning of
@@ -138,7 +174,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
const BasicBlock &BB = *BI;
// Create a new basic block and copy instructions into it!
- BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo);
+ BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
+ SP ? &DIFinder : nullptr);
// Add basic block mapping.
VMap[&BB] = CBB;
@@ -160,6 +197,16 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
Returns.push_back(RI);
}
+ for (DISubprogram *ISP : DIFinder.subprograms()) {
+ if (ISP != SP) {
+ VMap.MD()[ISP].reset(ISP);
+ }
+ }
+
+ for (auto *Type : DIFinder.types()) {
+ VMap.MD()[Type].reset(Type);
+ }
+
// Loop over all of the instructions in the function, fixing up operand
// references as we go. This uses VMap to do all the hard work.
for (Function::iterator BB =
@@ -208,7 +255,7 @@ Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
}
SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns, "",
+ CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "",
CodeInfo);
return NewF;
@@ -247,7 +294,7 @@ namespace {
void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
BasicBlock::const_iterator StartingInst,
std::vector<const BasicBlock*> &ToClone){
- WeakVH &BBEntry = VMap[BB];
+ WeakTrackingVH &BBEntry = VMap[BB];
// Have we already cloned this block?
if (BBEntry) return;
@@ -294,12 +341,13 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
// On the off-chance that this simplifies to an instruction in the old
// function, map it back into the new function.
- if (Value *MappedV = VMap.lookup(V))
- V = MappedV;
+ if (NewFunc != OldFunc)
+ if (Value *MappedV = VMap.lookup(V))
+ V = MappedV;
if (!NewInst->mayHaveSideEffects()) {
VMap[&*II] = V;
- delete NewInst;
+ NewInst->deleteValue();
continue;
}
}
@@ -353,7 +401,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
Cond = dyn_cast_or_null<ConstantInt>(V);
}
if (Cond) { // Constant fold to uncond branch!
- SwitchInst::ConstCaseIt Case = SI->findCaseValue(Cond);
+ SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
VMap[OldTI] = BranchInst::Create(Dest, NewBB);
ToClone.push_back(Dest);
@@ -549,7 +597,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// Make a second pass over the PHINodes now that all of them have been
// remapped into the new function, simplifying the PHINode and performing any
// recursive simplifications exposed. This will transparently update the
- // WeakVH in the VMap. Notably, we rely on that so that if we coalesce
+ // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
// two PHINodes, the iteration over the old PHIs remains valid, and the
// mapping will just map us to the new node (which may not even be a PHI
// node).
@@ -747,3 +795,40 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
return NewLoop;
}
+
+/// \brief Duplicate non-PHI instructions from the beginning of the block up
+/// to the StopAt instruction into a split block between BB and its
+/// predecessor.
+BasicBlock *
+llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+ Instruction *StopAt,
+ ValueToValueMapTy &ValueMapping) {
+ // We are going to have to map operands from the original block BB to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+ // account for entry from PredBB.
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ BasicBlock *NewBB = SplitEdge(PredBB, BB);
+ NewBB->setName(PredBB->getName() + ".split");
+ Instruction *NewTerm = NewBB->getTerminator();
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; StopAt != &*BI; ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ New->insertBefore(NewTerm);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ auto I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ return NewBB;
+}
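A hedged usage sketch of the new helper (BB, PredBB, and StopAt are assumed to satisfy its contract: PredBB is a predecessor of BB and StopAt is an instruction in BB):

    ValueToValueMapTy VMap;
    BasicBlock *Split =
        DuplicateInstructionsInSplitBetween(BB, PredBB, StopAt, VMap);
    // Split now sits on the PredBB->BB edge, ends with a branch to BB, and
    // holds copies of BB's non-PHI instructions up to (not including)
    // StopAt; VMap maps each duplicated instruction, and each PHI in BB,
    // to its value in Split.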
diff --git a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
index 7ebeb61..e5392b5 100644
--- a/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
@@ -12,14 +12,23 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm-c/Core.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm-c/Core.h"
using namespace llvm;
+static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
+ const Comdat *SC = Src->getComdat();
+ if (!SC)
+ return;
+ Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SC->getSelectionKind());
+ Dst->setComdat(DC);
+}
+
/// This is not as easy as it might seem because we have to worry about making
/// copies of global variables and functions, and making their (initializers and
/// references, respectively) refer to the right globals.
@@ -87,7 +96,7 @@ std::unique_ptr<Module> llvm::CloneModule(
else
GV = new GlobalVariable(
*New, I->getValueType(), false, GlobalValue::ExternalLinkage,
- (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr,
+ nullptr, I->getName(), nullptr,
I->getThreadLocalMode(), I->getType()->getAddressSpace());
VMap[&*I] = GV;
// We do not copy attributes (mainly because copying between different
@@ -123,7 +132,10 @@ std::unique_ptr<Module> llvm::CloneModule(
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
I->getAllMetadata(MDs);
for (auto MD : MDs)
- GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
+ GV->addMetadata(MD.first,
+ *MapMetadata(MD.second, VMap, RF_MoveDistinctMDs));
+
+ copyComdat(GV, &*I);
}
// Similarly, copy over function bodies now...
@@ -153,6 +165,8 @@ std::unique_ptr<Module> llvm::CloneModule(
if (I.hasPersonalityFn())
F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
+
+ copyComdat(F, &I);
}
// And aliases
diff --git a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
index 60ae374..d9294c4 100644
--- a/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -73,17 +73,17 @@ bool llvm::decomposeBitTestICmp(const ICmpInst *I, CmpInst::Predicate &Pred,
default:
return false;
case ICmpInst::ICMP_SLT:
- // X < 0 is equivalent to (X & SignBit) != 0.
+ // X < 0 is equivalent to (X & SignMask) != 0.
if (!C->isZero())
return false;
- Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth()));
+ Y = ConstantInt::get(I->getContext(), APInt::getSignMask(C->getBitWidth()));
Pred = ICmpInst::ICMP_NE;
break;
case ICmpInst::ICMP_SGT:
- // X > -1 is equivalent to (X & SignBit) == 0.
- if (!C->isAllOnesValue())
+ // X > -1 is equivalent to (X & SignMask) == 0.
+ if (!C->isMinusOne())
return false;
- Y = ConstantInt::get(I->getContext(), APInt::getSignBit(C->getBitWidth()));
+ Y = ConstantInt::get(I->getContext(), APInt::getSignMask(C->getBitWidth()));
Pred = ICmpInst::ICMP_EQ;
break;
case ICmpInst::ICMP_ULT:
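As a worked example of the renamed SignMask decomposition (a sketch, not code from this change): for an 8-bit value, APInt::getSignMask(8) is 0x80, so the two rewritten cases reduce to:

    #include <cstdint>
    // X < 0 is equivalent to (X & SignMask) != 0.
    static bool isNegative(int8_t X) { return (uint8_t(X) & 0x80) != 0; }
    // X > -1 is equivalent to (X & SignMask) == 0.
    static bool isNonNegative(int8_t X) { return (uint8_t(X) & 0x80) == 0; }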
diff --git a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index c514c9c..1189714 100644
--- a/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -27,6 +27,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
@@ -58,6 +59,33 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) {
// Landing pads must be in the function where they were inserted for cleanup.
if (BB.isEHPad())
return false;
+ // Taking the address of a basic block moved to another function is illegal.
+ if (BB.hasAddressTaken())
+ return false;
+
+ // Don't hoist code that uses another basic block's address, as it's likely
+ // to lead to unexpected behavior such as cross-function jumps.
+ SmallPtrSet<User const *, 16> Visited;
+ SmallVector<User const *, 16> ToVisit;
+
+ for (Instruction const &Inst : BB)
+ ToVisit.push_back(&Inst);
+
+ while (!ToVisit.empty()) {
+ User const *Curr = ToVisit.pop_back_val();
+ if (!Visited.insert(Curr).second)
+ continue;
+ if (isa<BlockAddress const>(Curr))
+ return false; // Even a self-reference is likely to be incompatible.
+
+ if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB)
+ continue;
+
+ for (auto const &U : Curr->operands()) {
+ if (auto *UU = dyn_cast<User>(U))
+ ToVisit.push_back(UU);
+ }
+ }
// Don't hoist code containing allocas, invokes, or vastarts.
for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
@@ -73,24 +101,26 @@ bool CodeExtractor::isBlockValidForExtraction(const BasicBlock &BB) {
}
/// \brief Build a set of blocks to extract if the input blocks are viable.
-template <typename IteratorT>
-static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin,
- IteratorT BBEnd) {
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT) {
+ assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
SetVector<BasicBlock *> Result;
- assert(BBBegin != BBEnd);
-
// Loop over the blocks, adding them to our set-vector, and aborting with an
// empty set if we encounter invalid blocks.
- do {
- if (!Result.insert(*BBBegin))
- llvm_unreachable("Repeated basic blocks in extraction input");
+ for (BasicBlock *BB : BBs) {
- if (!CodeExtractor::isBlockValidForExtraction(**BBBegin)) {
+ // If this block is dead, don't process it.
+ if (DT && !DT->isReachableFromEntry(BB))
+ continue;
+
+ if (!Result.insert(BB))
+ llvm_unreachable("Repeated basic blocks in extraction input");
+ if (!CodeExtractor::isBlockValidForExtraction(*BB)) {
Result.clear();
return Result;
}
- } while (++BBBegin != BBEnd);
+ }
#ifndef NDEBUG
for (SetVector<BasicBlock *>::iterator I = std::next(Result.begin()),
@@ -106,49 +136,19 @@ static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin,
return Result;
}
-/// \brief Helper to call buildExtractionBlockSet with an ArrayRef.
-static SetVector<BasicBlock *>
-buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs) {
- return buildExtractionBlockSet(BBs.begin(), BBs.end());
-}
-
-/// \brief Helper to call buildExtractionBlockSet with a RegionNode.
-static SetVector<BasicBlock *>
-buildExtractionBlockSet(const RegionNode &RN) {
- if (!RN.isSubRegion())
- // Just a single BasicBlock.
- return buildExtractionBlockSet(RN.getNodeAs<BasicBlock>());
-
- const Region &R = *RN.getNodeAs<Region>();
-
- return buildExtractionBlockSet(R.block_begin(), R.block_end());
-}
-
-CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs,
- BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI)
- : DT(nullptr), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {}
-
CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
bool AggregateArgs, BlockFrequencyInfo *BFI,
BranchProbabilityInfo *BPI)
: DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {}
+ BPI(BPI), Blocks(buildExtractionBlockSet(BBs, DT)), NumExitBlocks(~0U) {}
CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
BlockFrequencyInfo *BFI,
BranchProbabilityInfo *BPI)
: DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), Blocks(buildExtractionBlockSet(L.getBlocks())),
+ BPI(BPI), Blocks(buildExtractionBlockSet(L.getBlocks(), &DT)),
NumExitBlocks(~0U) {}
-CodeExtractor::CodeExtractor(DominatorTree &DT, const RegionNode &RN,
- bool AggregateArgs, BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI)
- : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {}
-
/// definedInRegion - Return true if the specified value is defined in the
/// extracted region.
static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
@@ -169,16 +169,255 @@ static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
return false;
}
-void CodeExtractor::findInputsOutputs(ValueSet &Inputs,
- ValueSet &Outputs) const {
+static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) {
+ BasicBlock *CommonExitBlock = nullptr;
+ auto hasNonCommonExitSucc = [&](BasicBlock *Block) {
+ for (auto *Succ : successors(Block)) {
+ // Internal edges, ok.
+ if (Blocks.count(Succ))
+ continue;
+ if (!CommonExitBlock) {
+ CommonExitBlock = Succ;
+ continue;
+ }
+ if (CommonExitBlock == Succ)
+ continue;
+
+ return true;
+ }
+ return false;
+ };
+
+ if (any_of(Blocks, hasNonCommonExitSucc))
+ return nullptr;
+
+ return CommonExitBlock;
+}
+
+bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers(
+ Instruction *Addr) const {
+ AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets());
+ Function *Func = (*Blocks.begin())->getParent();
+ for (BasicBlock &BB : *Func) {
+ if (Blocks.count(&BB))
+ continue;
+ for (Instruction &II : BB) {
+
+ if (isa<DbgInfoIntrinsic>(II))
+ continue;
+
+ unsigned Opcode = II.getOpcode();
+ Value *MemAddr = nullptr;
+ switch (Opcode) {
+ case Instruction::Store:
+ case Instruction::Load: {
+ if (Opcode == Instruction::Store) {
+ StoreInst *SI = cast<StoreInst>(&II);
+ MemAddr = SI->getPointerOperand();
+ } else {
+ LoadInst *LI = cast<LoadInst>(&II);
+ MemAddr = LI->getPointerOperand();
+ }
+ // A global variable cannot alias locals.
+ if (dyn_cast<Constant>(MemAddr))
+ break;
+ Value *Base = MemAddr->stripInBoundsConstantOffsets();
+ if (!dyn_cast<AllocaInst>(Base) || Base == AI)
+ return false;
+ break;
+ }
+ default: {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
+ IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
+ break;
+ return false;
+ }
+ // Conservatively reject any other instruction that may have side effects.
+ if (II.mayHaveSideEffects())
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+BasicBlock *
+CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
+ BasicBlock *SinglePredFromOutlineRegion = nullptr;
+ assert(!Blocks.count(CommonExitBlock) &&
+ "Expect a block outside the region!");
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (!Blocks.count(Pred))
+ continue;
+ if (!SinglePredFromOutlineRegion) {
+ SinglePredFromOutlineRegion = Pred;
+ } else if (SinglePredFromOutlineRegion != Pred) {
+ SinglePredFromOutlineRegion = nullptr;
+ break;
+ }
+ }
+
+ if (SinglePredFromOutlineRegion)
+ return SinglePredFromOutlineRegion;
+
+#ifndef NDEBUG
+ auto getFirstPHI = [](BasicBlock *BB) {
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+ // If there are any PHI nodes, the single predecessor either exists or has
+ // already been created before code extraction.
+ assert(!getFirstPHI(CommonExitBlock) && "Phi not expected");
+#endif
+
+ BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock(
+ CommonExitBlock->getFirstNonPHI()->getIterator());
+
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (Blocks.count(Pred))
+ continue;
+ Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock);
+ }
+ // Now add the old exit block to the outline region.
+ Blocks.insert(CommonExitBlock);
+ return CommonExitBlock;
+}
+
+void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands,
+ BasicBlock *&ExitBlock) const {
+ Function *Func = (*Blocks.begin())->getParent();
+ ExitBlock = getCommonExitBlock(Blocks);
+
+ for (BasicBlock &BB : *Func) {
+ if (Blocks.count(&BB))
+ continue;
+ for (Instruction &II : BB) {
+ auto *AI = dyn_cast<AllocaInst>(&II);
+ if (!AI)
+ continue;
+
+ // Find the pair of lifetime markers for address 'Addr' that are either
+ // defined inside the outline region or can legally be shrinkwrapped into
+ // the outline region. If there are no other untracked uses of the
+ // address, return the pair of markers if found; otherwise return a pair
+ // of nullptrs.
+ auto GetLifeTimeMarkers =
+ [&](Instruction *Addr, bool &SinkLifeStart,
+ bool &HoistLifeEnd) -> std::pair<Instruction *, Instruction *> {
+ Instruction *LifeStart = nullptr, *LifeEnd = nullptr;
+
+ for (User *U : Addr->users()) {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
+ if (IntrInst) {
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
+ // Do not handle the case where AI has multiple start markers.
+ if (LifeStart)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+ LifeStart = IntrInst;
+ }
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
+ if (LifeEnd)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+ LifeEnd = IntrInst;
+ }
+ continue;
+ }
+ // A non-marker use outside the region is an untracked use; bail.
+ if (!definedInRegion(Blocks, U))
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+ }
+
+ if (!LifeStart || !LifeEnd)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ SinkLifeStart = !definedInRegion(Blocks, LifeStart);
+ HoistLifeEnd = !definedInRegion(Blocks, LifeEnd);
+ // Do a legality check.
+ if ((SinkLifeStart || HoistLifeEnd) &&
+ !isLegalToShrinkwrapLifetimeMarkers(Addr))
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ // Check to see if we have a place to do hoisting; if not, bail.
+ if (HoistLifeEnd && !ExitBlock)
+ return std::make_pair<Instruction *>(nullptr, nullptr);
+
+ return std::make_pair(LifeStart, LifeEnd);
+ };
+
+ bool SinkLifeStart = false, HoistLifeEnd = false;
+ auto Markers = GetLifeTimeMarkers(AI, SinkLifeStart, HoistLifeEnd);
+
+ if (Markers.first) {
+ if (SinkLifeStart)
+ SinkCands.insert(Markers.first);
+ SinkCands.insert(AI);
+ if (HoistLifeEnd)
+ HoistCands.insert(Markers.second);
+ continue;
+ }
+
+ // Follow the bitcast.
+ Instruction *MarkerAddr = nullptr;
+ for (User *U : AI->users()) {
+
+ if (U->stripInBoundsConstantOffsets() == AI) {
+ SinkLifeStart = false;
+ HoistLifeEnd = false;
+ Instruction *Bitcast = cast<Instruction>(U);
+ Markers = GetLifeTimeMarkers(Bitcast, SinkLifeStart, HoistLifeEnd);
+ if (Markers.first) {
+ MarkerAddr = Bitcast;
+ continue;
+ }
+ }
+
+ // Found unknown use of AI.
+ if (!definedInRegion(Blocks, U)) {
+ MarkerAddr = nullptr;
+ break;
+ }
+ }
+
+ if (MarkerAddr) {
+ if (SinkLifeStart)
+ SinkCands.insert(Markers.first);
+ if (!definedInRegion(Blocks, MarkerAddr))
+ SinkCands.insert(MarkerAddr);
+ SinkCands.insert(AI);
+ if (HoistLifeEnd)
+ HoistCands.insert(Markers.second);
+ }
+ }
+ }
+}
+
+void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
+ const ValueSet &SinkCands) const {
+
for (BasicBlock *BB : Blocks) {
// If a used value is defined outside the region, it's an input. If an
// instruction is used outside the region, it's an output.
for (Instruction &II : *BB) {
for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE;
- ++OI)
- if (definedInCaller(Blocks, *OI))
- Inputs.insert(*OI);
+ ++OI) {
+ Value *V = *OI;
+ if (!SinkCands.count(V) && definedInCaller(Blocks, V))
+ Inputs.insert(V);
+ }
for (User *U : II.users())
if (!definedInRegion(Blocks, U)) {
@@ -218,9 +457,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
// containing PHI nodes merging values from outside of the region, and a
// second that contains all of the code for the block and merges back any
// incoming values from inside of the region.
- BasicBlock::iterator AfterPHIs = Header->getFirstNonPHI()->getIterator();
- BasicBlock *NewBB = Header->splitBasicBlock(AfterPHIs,
- Header->getName()+".ce");
+ BasicBlock *NewBB = llvm::SplitBlock(Header, Header->getFirstNonPHI(), DT);
// We only want to code extract the second block now, and it becomes the new
// header of the region.
@@ -229,11 +466,6 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
Blocks.insert(NewBB);
Header = NewBB;
- // Okay, update dominator sets. The blocks that dominate the new one are the
- // blocks that dominate TIBB plus the new block itself.
- if (DT)
- DT->splitBlock(NewBB);
-
// Okay, now we need to adjust the PHI nodes and any branches from within the
// region to go to the new header block instead of the old header block.
if (NumPredsFromRegion) {
@@ -248,12 +480,14 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) {
// Okay, everything within the region is now branching to the right block, we
// just have to update the PHI nodes now, inserting PHI nodes into NewBB.
+ BasicBlock::iterator AfterPHIs;
for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
PHINode *PN = cast<PHINode>(AfterPHIs);
// Create a new PHI node in the new region, which has an incoming value
// from OldPred of PN.
PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion,
PN->getName() + ".ce", &NewBB->front());
+ PN->replaceAllUsesWith(NewPN);
NewPN->addIncoming(PN, OldPred);
// Loop over all of the incoming value in PN, moving them to NewPN if they
@@ -362,9 +596,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
// "target-features" attribute allowing it to be lowered.
// FIXME: This should be changed to check to see if a specific
// attribute can not be inherited.
- AttributeSet OldFnAttrs = oldFunction->getAttributes().getFnAttributes();
- AttrBuilder AB(OldFnAttrs, AttributeSet::FunctionIndex);
- for (auto Attr : AB.td_attrs())
+ AttrBuilder AB(oldFunction->getAttributes().getFnAttributes());
+ for (const auto &Attr : AB.td_attrs())
newFunction->addFnAttr(Attr.first, Attr.second);
newFunction->getBasicBlockList().push_back(newRootNode);
@@ -440,8 +673,10 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
// Emit a call to the new function, passing in: *pointer to struct (if
// aggregating parameters), or plain inputs and allocated memory for outputs
std::vector<Value*> params, StructValues, ReloadOutputs, Reloads;
-
- LLVMContext &Context = newFunction->getContext();
+
+ Module *M = newFunction->getParent();
+ LLVMContext &Context = M->getContext();
+ const DataLayout &DL = M->getDataLayout();
// Add inputs as params, or to be filled into the struct
for (Value *input : inputs)
@@ -456,8 +691,9 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
StructValues.push_back(output);
} else {
AllocaInst *alloca =
- new AllocaInst(output->getType(), nullptr, output->getName() + ".loc",
- &codeReplacer->getParent()->front().front());
+ new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
+ nullptr, output->getName() + ".loc",
+ &codeReplacer->getParent()->front().front());
ReloadOutputs.push_back(alloca);
params.push_back(alloca);
}
@@ -473,7 +709,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
// Allocate a struct at the beginning of this function
StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
- Struct = new AllocaInst(StructArgTy, nullptr, "structArg",
+ Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
+ "structArg",
&codeReplacer->getParent()->front().front());
params.push_back(Struct);
@@ -748,7 +985,8 @@ Function *CodeExtractor::extractCodeRegion() {
if (!isEligible())
return nullptr;
- ValueSet inputs, outputs;
+ ValueSet inputs, outputs, SinkingCands, HoistingCands;
+ BasicBlock *CommonExit = nullptr;
// Assumption: this is a single-entry code region, and the header is the first
// block in the region.
@@ -787,8 +1025,23 @@ Function *CodeExtractor::extractCodeRegion() {
"newFuncRoot");
newFuncRoot->getInstList().push_back(BranchInst::Create(header));
+ findAllocas(SinkingCands, HoistingCands, CommonExit);
+ assert(HoistingCands.empty() || CommonExit);
+
// Find inputs to, outputs from the code region.
- findInputsOutputs(inputs, outputs);
+ findInputsOutputs(inputs, outputs, SinkingCands);
+
+ // Now sink all instructions that have only non-PHI uses inside the region.
+ for (auto *II : SinkingCands)
+ cast<Instruction>(II)->moveBefore(*newFuncRoot,
+ newFuncRoot->getFirstInsertionPt());
+
+ if (!HoistingCands.empty()) {
+ auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit);
+ Instruction *TI = HoistToBlock->getTerminator();
+ for (auto *II : HoistingCands)
+ cast<Instruction>(II)->moveBefore(TI);
+ }
// Calculate the exit blocks for the extracted region and the total exit
// weights for each of those blocks.
@@ -863,12 +1116,6 @@ Function *CodeExtractor::extractCodeRegion() {
}
}
- //cerr << "NEW FUNCTION: " << *newFunction;
- // verifyFunction(*newFunction);
-
- // cerr << "OLD FUNCTION: " << *oldFunction;
- // verifyFunction(*oldFunction);
-
DEBUG(if (verifyFunction(*newFunction))
report_fatal_error("verifyFunction failed!"));
return newFunction;
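A minimal usage sketch of the reworked CodeExtractor entry point (BBs, DT, BFI, and BPI are assumed to come from the caller); note that the DominatorTree-aware buildExtractionBlockSet now skips blocks unreachable from the entry instead of processing them:

    CodeExtractor CE(BBs, &DT, /*AggregateArgs=*/false, &BFI, &BPI);
    if (Function *Outlined = CE.extractCodeRegion()) {
      // The blocks now live in Outlined; the original function calls it.
    }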
diff --git a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
index 75a1dde..6d3d287 100644
--- a/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -7,12 +7,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -28,15 +28,17 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
return nullptr;
}
+ Function *F = I.getParent()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
// Create a stack slot to hold the value.
AllocaInst *Slot;
if (AllocaPoint) {
- Slot = new AllocaInst(I.getType(), nullptr,
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
I.getName()+".reg2mem", AllocaPoint);
} else {
- Function *F = I.getParent()->getParent();
- Slot = new AllocaInst(I.getType(), nullptr, I.getName() + ".reg2mem",
- &F->getEntryBlock().front());
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName() + ".reg2mem", &F->getEntryBlock().front());
}
// We cannot demote invoke instructions to the stack if their normal edge
@@ -110,14 +112,17 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
return nullptr;
}
+ const DataLayout &DL = P->getModule()->getDataLayout();
+
// Create a stack slot to hold the value.
AllocaInst *Slot;
if (AllocaPoint) {
- Slot = new AllocaInst(P->getType(), nullptr,
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
P->getName()+".reg2mem", AllocaPoint);
} else {
Function *F = P->getParent()->getParent();
- Slot = new AllocaInst(P->getType(), nullptr, P->getName() + ".reg2mem",
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName() + ".reg2mem",
&F->getEntryBlock().front());
}
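The AllocaInst constructor change recurs throughout this patch; as a sketch (with F and I standing in for the enclosing function and the demoted instruction), the new signature threads the alloca address space from the module's DataLayout:

    const DataLayout &DL = F->getParent()->getDataLayout();
    AllocaInst *Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(),
                                      /*ArraySize=*/nullptr,
                                      I.getName() + ".reg2mem",
                                      &F->getEntryBlock().front());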
diff --git a/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
index 8c23865..78d7474 100644
--- a/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() {
// Create a cleanup block.
LLVMContext &C = F.getContext();
BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
- Type *ExnTy =
- StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
if (!F.hasPersonalityFn()) {
Constant *PersFn = getDefaultPersonalityFn(F.getParent());
F.setPersonalityFn(PersFn);
diff --git a/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
index 4adf175..1328f2f 100644
--- a/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -16,11 +16,12 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -401,7 +402,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
Value *Ptr = PtrArg->stripPointerCasts();
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
Type *ElemTy = GV->getValueType();
- if (!Size->isAllOnesValue() &&
+ if (!Size->isMinusOne() &&
Size->getValue().getLimitedValue() >=
DL.getTypeStoreSize(ElemTy)) {
Invariants.insert(GV);
@@ -438,7 +439,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
- if (Constant *C = ConstantFoldCall(Callee, Formals, TLI)) {
+ if (Constant *C = ConstantFoldCall(CS, Callee, Formals, TLI)) {
InstResult = C;
DEBUG(dbgs() << "Constant folded function call. Result: " <<
*InstResult << "\n");
@@ -486,7 +487,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
ConstantInt *Val =
dyn_cast<ConstantInt>(getVal(SI->getCondition()));
if (!Val) return false; // Cannot determine.
- NextBB = SI->findCaseValue(Val).getCaseSuccessor();
+ NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
} else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
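SwitchInst::findCaseValue now returns a case iterator rather than a case handle, which explains the '*' and '->' at the updated call sites in this and the CloneFunction diffs; a sketch (SI and Val assumed):

    auto CaseIt = SI->findCaseValue(Val);          // iterator over the cases
    BasicBlock *Dest = CaseIt->getCaseSuccessor(); // access via operator->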
diff --git a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index 7b96fbb..435eff3 100644
--- a/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -19,6 +18,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "flattencfg"
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 81a7c4c..4a2be3a 100644
--- a/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -15,8 +15,8 @@
#include "llvm/Transforms/Utils/FunctionComparator.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -74,14 +74,16 @@ int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
return L.compare(R);
}
-int FunctionComparator::cmpAttrs(const AttributeSet L,
- const AttributeSet R) const {
- if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots()))
+int FunctionComparator::cmpAttrs(const AttributeList L,
+ const AttributeList R) const {
+ if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets()))
return Res;
- for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) {
- AttributeSet::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
- RE = R.end(i);
+ for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) {
+ AttributeSet LAS = L.getAttributes(i);
+ AttributeSet RAS = R.getAttributes(i);
+ AttributeSet::iterator LI = LAS.begin(), LE = LAS.end();
+ AttributeSet::iterator RI = RAS.begin(), RE = RAS.end();
for (; LI != LE && RI != RE; ++LI, ++RI) {
Attribute LA = *LI;
Attribute RA = *RI;
@@ -511,8 +513,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res =
cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
return Res;
- if (int Res =
- cmpNumbers(LI->getSynchScope(), cast<LoadInst>(R)->getSynchScope()))
+ if (int Res = cmpNumbers(LI->getSyncScopeID(),
+ cast<LoadInst>(R)->getSyncScopeID()))
return Res;
return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range),
cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
@@ -527,7 +529,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res =
cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
return Res;
- return cmpNumbers(SI->getSynchScope(), cast<StoreInst>(R)->getSynchScope());
+ return cmpNumbers(SI->getSyncScopeID(),
+ cast<StoreInst>(R)->getSyncScopeID());
}
if (const CmpInst *CI = dyn_cast<CmpInst>(L))
return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
@@ -582,7 +585,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res =
cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
return Res;
- return cmpNumbers(FI->getSynchScope(), cast<FenceInst>(R)->getSynchScope());
+ return cmpNumbers(FI->getSyncScopeID(),
+ cast<FenceInst>(R)->getSyncScopeID());
}
if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
if (int Res = cmpNumbers(CXI->isVolatile(),
@@ -599,8 +603,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
cmpOrderings(CXI->getFailureOrdering(),
cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
return Res;
- return cmpNumbers(CXI->getSynchScope(),
- cast<AtomicCmpXchgInst>(R)->getSynchScope());
+ return cmpNumbers(CXI->getSyncScopeID(),
+ cast<AtomicCmpXchgInst>(R)->getSyncScopeID());
}
if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
if (int Res = cmpNumbers(RMWI->getOperation(),
@@ -612,8 +616,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res = cmpOrderings(RMWI->getOrdering(),
cast<AtomicRMWInst>(R)->getOrdering()))
return Res;
- return cmpNumbers(RMWI->getSynchScope(),
- cast<AtomicRMWInst>(R)->getSynchScope());
+ return cmpNumbers(RMWI->getSyncScopeID(),
+ cast<AtomicRMWInst>(R)->getSyncScopeID());
}
if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
const PHINode *PNR = cast<PHINode>(R);
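
The cmpAttrs hunk above shows the AttributeSet-to-AttributeList migration: an AttributeList groups attributes per index (return value, each argument, the function itself), and each index yields an AttributeSet that is iterable on its own. A sketch of walking the new layout, post-change API assumed:

    #include "llvm/IR/Attributes.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void printAllAttrs(const AttributeList &AL, raw_ostream &OS) {
      // index_begin()/index_end() span the return-value, argument, and
      // function slots; getAttributes(i) is that slot's attribute set.
      for (unsigned i = AL.index_begin(), e = AL.index_end(); i != e; ++i)
        for (Attribute A : AL.getAttributes(i))
          OS << "slot " << i << ": " << A.getAsString() << "\n";
    }
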
diff --git a/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 9844190..a98d072 100644
--- a/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
@@ -21,11 +21,11 @@ using namespace llvm;
/// Checks if we should import SGV as a definition, otherwise import as a
/// declaration.
bool FunctionImportGlobalProcessing::doImportAsDefinition(
- const GlobalValue *SGV, DenseSet<const GlobalValue *> *GlobalsToImport) {
+ const GlobalValue *SGV, SetVector<GlobalValue *> *GlobalsToImport) {
// For alias, we tie the definition to the base object. Extract it and recurse
if (auto *GA = dyn_cast<GlobalAlias>(SGV)) {
- if (GA->hasWeakAnyLinkage())
+ if (GA->isInterposable())
return false;
const GlobalObject *GO = GA->getBaseObject();
if (!GO->hasLinkOnceODRLinkage())
@@ -34,7 +34,7 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition(
GO, GlobalsToImport);
}
// Only import the globals requested for importing.
- if (GlobalsToImport->count(SGV))
+ if (GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
return true;
// Otherwise no.
return false;
@@ -57,7 +57,8 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
return false;
if (isPerformingImport()) {
- assert((!GlobalsToImport->count(SGV) || !isNonRenamableLocal(*SGV)) &&
+ assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
+ !isNonRenamableLocal(*SGV)) &&
"Attempting to promote non-renamable local");
// We don't know for sure yet if we are importing this value (as either
// a reference or a def), since we are simply walking all values in the
@@ -254,9 +255,8 @@ bool FunctionImportGlobalProcessing::run() {
return false;
}
-bool llvm::renameModuleForThinLTO(
- Module &M, const ModuleSummaryIndex &Index,
- DenseSet<const GlobalValue *> *GlobalsToImport) {
+bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+ SetVector<GlobalValue *> *GlobalsToImport) {
FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport);
return ThinLTOProcessing.run();
}
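
The interface change above trades DenseSet<const GlobalValue *> for SetVector<GlobalValue *>, which keeps insertion order and so makes the rename walk deterministic. A hedged sketch of the new call shape; the symbol name is a placeholder:

    #include "llvm/ADT/SetVector.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/FunctionImportUtils.h"
    using namespace llvm;

    static bool renameForImport(Module &M, const ModuleSummaryIndex &Index) {
      SetVector<GlobalValue *> GlobalsToImport;
      if (GlobalValue *GV = M.getNamedValue("some_symbol")) // hypothetical name
        GlobalsToImport.insert(GV);
      return renameModuleForThinLTO(M, Index, &GlobalsToImport);
    }
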
diff --git a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index 74ebcda..245fefb 100644
--- a/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -7,12 +7,25 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
using namespace llvm;
@@ -175,13 +188,9 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
return false;
}
+GlobalStatus::GlobalStatus() = default;
+
bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
SmallPtrSet<const PHINode *, 16> PhiUsers;
return analyzeGlobalAux(V, GS, PhiUsers);
}
-
-GlobalStatus::GlobalStatus()
- : IsCompared(false), IsLoaded(false), StoredType(NotStored),
- StoredOnceValue(nullptr), AccessingFunction(nullptr),
- HasMultipleAccessingFunctions(false), HasNonInstructionUser(false),
- Ordering(AtomicOrdering::NotAtomic) {}
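
The GlobalStatus hunk is a mechanical modernization: the long member-initializer list moves into in-class default member initializers, leaving a defaulted constructor. A self-contained illustration of the pattern in plain C++11, independent of LLVM:

    struct Status {
      bool IsCompared = false;           // was IsCompared(false) in the ctor
      bool IsLoaded = false;
      const void *StoredOnceValue = nullptr;
      Status() = default;                // the initializers above do the work
    };
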
diff --git a/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
index ed018bb..b8c12ad 100644
--- a/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
@@ -62,6 +62,8 @@ void ImportedFunctionsInliningStatistics::recordInline(const Function &Caller,
void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) {
ModuleName = M.getName();
for (const auto &F : M.functions()) {
+ if (F.isDeclaration())
+ continue;
AllFunctions++;
ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
index a40079c..2a18c14 100644
--- a/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
@@ -20,19 +19,21 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -40,8 +41,9 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;
@@ -1107,26 +1109,23 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
bool DTCalculated = false;
Function *CalledFunc = CS.getCalledFunction();
- for (Function::arg_iterator I = CalledFunc->arg_begin(),
- E = CalledFunc->arg_end();
- I != E; ++I) {
- unsigned Align = I->getType()->isPointerTy() ? I->getParamAlignment() : 0;
- if (Align && !I->hasByValOrInAllocaAttr() && !I->hasNUses(0)) {
+ for (Argument &Arg : CalledFunc->args()) {
+ unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
+ if (Align && !Arg.hasByValOrInAllocaAttr() && !Arg.hasNUses(0)) {
if (!DTCalculated) {
- DT.recalculate(const_cast<Function&>(*CS.getInstruction()->getParent()
- ->getParent()));
+ DT.recalculate(*CS.getCaller());
DTCalculated = true;
}
// If we can already prove the asserted alignment in the context of the
// caller, then don't bother inserting the assumption.
- Value *Arg = CS.getArgument(I->getArgNo());
- if (getKnownAlignment(Arg, DL, CS.getInstruction(), AC, &DT) >= Align)
+ Value *ArgVal = CS.getArgument(Arg.getArgNo());
+ if (getKnownAlignment(ArgVal, DL, CS.getInstruction(), AC, &DT) >= Align)
continue;
- CallInst *NewAssumption = IRBuilder<>(CS.getInstruction())
- .CreateAlignmentAssumption(DL, Arg, Align);
- AC->registerAssumption(NewAssumption);
+ CallInst *NewAsmp = IRBuilder<>(CS.getInstruction())
+ .CreateAlignmentAssumption(DL, ArgVal, Align);
+ AC->registerAssumption(NewAsmp);
}
}
}
@@ -1140,7 +1139,7 @@ static void UpdateCallGraphAfterInlining(CallSite CS,
ValueToValueMapTy &VMap,
InlineFunctionInfo &IFI) {
CallGraph &CG = *IFI.CG;
- const Function *Caller = CS.getInstruction()->getParent()->getParent();
+ const Function *Caller = CS.getCaller();
const Function *Callee = CS.getCalledFunction();
CallGraphNode *CalleeNode = CG[Callee];
CallGraphNode *CallerNode = CG[Caller];
@@ -1225,7 +1224,8 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
PointerType *ArgTy = cast<PointerType>(Arg->getType());
Type *AggTy = ArgTy->getElementType();
- Function *Caller = TheCall->getParent()->getParent();
+ Function *Caller = TheCall->getFunction();
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
// If the called function is readonly, then it could not mutate the caller's
// copy of the byval'd memory. In this case, it is safe to elide the copy and
@@ -1239,31 +1239,30 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
AssumptionCache *AC =
IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
- const DataLayout &DL = Caller->getParent()->getDataLayout();
// If the pointer is already known to be sufficiently aligned, or if we can
// round it up to a larger alignment, then we don't need a temporary.
if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, AC) >=
ByValAlignment)
return Arg;
-
+
// Otherwise, we have to make a memcpy to get a safe alignment. This is bad
// for code quality, but rarely happens and is required for correctness.
}
// Create the alloca. If we have DataLayout, use nice alignment.
- unsigned Align =
- Caller->getParent()->getDataLayout().getPrefTypeAlignment(AggTy);
+ unsigned Align = DL.getPrefTypeAlignment(AggTy);
// If the byval had an alignment specified, we *must* use at least that
// alignment, as it is required by the byval argument (and uses of the
// pointer inside the callee).
Align = std::max(Align, ByValAlignment);
-
- Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(),
+
+ Value *NewAlloca = new AllocaInst(AggTy, DL.getAllocaAddrSpace(),
+ nullptr, Align, Arg->getName(),
&*Caller->begin()->begin());
IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
-
+
// Uses of the argument in the function should use our new alloca
// instead.
return NewAlloca;
@@ -1303,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) {
return false;
}
-/// Rebuild the entire inlined-at chain for this instruction so that the top of
-/// the chain now is inlined-at the new call site.
-static DebugLoc
-updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode,
- LLVMContext &Ctx,
- DenseMap<const DILocation *, DILocation *> &IANodes) {
- SmallVector<DILocation *, 3> InlinedAtLocations;
- DILocation *Last = InlinedAtNode;
- DILocation *CurInlinedAt = DL;
-
- // Gather all the inlined-at nodes
- while (DILocation *IA = CurInlinedAt->getInlinedAt()) {
- // Skip any we've already built nodes for
- if (DILocation *Found = IANodes[IA]) {
- Last = Found;
- break;
- }
-
- InlinedAtLocations.push_back(IA);
- CurInlinedAt = IA;
- }
-
- // Starting from the top, rebuild the nodes to point to the new inlined-at
- // location (then rebuilding the rest of the chain behind it) and update the
- // map of already-constructed inlined-at nodes.
- for (const DILocation *MD : reverse(InlinedAtLocations)) {
- Last = IANodes[MD] = DILocation::getDistinct(
- Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last);
- }
-
- // And finally create the normal location for this instruction, referring to
- // the new inlined-at chain.
- return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last);
-}
-
/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
/// block. Allocas used in inalloca calls and allocas of dynamic array size
/// cannot be static.
@@ -1365,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
// Cache the inlined-at nodes as they're built so they are reused, without
// this every instruction's inlined-at chain would become distinct from each
// other.
- DenseMap<const DILocation *, DILocation *> IANodes;
+ DenseMap<const MDNode *, MDNode *> IANodes;
for (; FI != Fn->end(); ++FI) {
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
if (DebugLoc DL = BI->getDebugLoc()) {
- BI->setDebugLoc(
- updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes));
+ auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(),
+ IANodes);
+ auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA);
+ BI->setDebugLoc(IDL);
continue;
}
@@ -1393,6 +1359,91 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
}
}
}
+/// Update the block frequencies of the caller after a callee has been inlined.
+///
+/// Each block cloned into the caller has its block frequency scaled by the
+/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
+/// the callee's entry block gets the same frequency as the callsite block and
+/// the relative frequencies of all cloned blocks remain the same after cloning.
+static void updateCallerBFI(BasicBlock *CallSiteBlock,
+ const ValueToValueMapTy &VMap,
+ BlockFrequencyInfo *CallerBFI,
+ BlockFrequencyInfo *CalleeBFI,
+ const BasicBlock &CalleeEntryBlock) {
+ SmallPtrSet<BasicBlock *, 16> ClonedBBs;
+ for (auto const &Entry : VMap) {
+ if (!isa<BasicBlock>(Entry.first) || !Entry.second)
+ continue;
+ auto *OrigBB = cast<BasicBlock>(Entry.first);
+ auto *ClonedBB = cast<BasicBlock>(Entry.second);
+ uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
+ if (!ClonedBBs.insert(ClonedBB).second) {
+ // Multiple blocks in the callee might get mapped to one cloned block in
+ // the caller since we prune the callee as we clone it. When that happens,
+ // we want to use the maximum among the original blocks' frequencies.
+ uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
+ if (NewFreq > Freq)
+ Freq = NewFreq;
+ }
+ CallerBFI->setBlockFreq(ClonedBB, Freq);
+ }
+ BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
+ CallerBFI->setBlockFreqAndScale(
+ EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
+ ClonedBBs);
+}
+
+/// Update the branch metadata for cloned call instructions.
+static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
+ const Optional<uint64_t> &CalleeEntryCount,
+ const Instruction *TheCall,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *CallerBFI) {
+ if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
+ return;
+ Optional<uint64_t> CallSiteCount =
+ PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
+ uint64_t CallCount =
+ std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
+ CalleeEntryCount.getValue());
+
+ for (auto const &Entry : VMap)
+ if (isa<CallInst>(Entry.first))
+ if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
+ CI->updateProfWeight(CallCount, CalleeEntryCount.getValue());
+ for (BasicBlock &BB : *Callee)
+ // No need to update the callsite if it is pruned during inlining.
+ if (VMap.count(&BB))
+ for (Instruction &I : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ CI->updateProfWeight(CalleeEntryCount.getValue() - CallCount,
+ CalleeEntryCount.getValue());
+}
+
+/// Update the entry count of callee after inlining.
+///
+/// The callsite's block count is subtracted from the callee's function entry
+/// count.
+static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
+ Instruction *CallInst, Function *Callee,
+ ProfileSummaryInfo *PSI) {
+ // If the callee has an original count of N, and the estimated count of the
+ // callsite is M, the new callee count is set to N - M. M is estimated from
+ // the caller's entry count, its entry block frequency and the block frequency
+ // of the callsite.
+ Optional<uint64_t> CalleeCount = Callee->getEntryCount();
+ if (!CalleeCount.hasValue() || !PSI)
+ return;
+ Optional<uint64_t> CallCount = PSI->getProfileCount(CallInst, CallerBFI);
+ if (!CallCount.hasValue())
+ return;
+ // Since the callsite count is an estimate, it could exceed the original
+ // callee count, in which case the new callee count is clamped to 0.
+ if (CallCount.getValue() > CalleeCount.getValue())
+ Callee->setEntryCount(0);
+ else
+ Callee->setEntryCount(CalleeCount.getValue() - CallCount.getValue());
+}
/// This function inlines the called function into the basic block of the
/// caller. This returns false if it is not possible to inline this call.
@@ -1405,13 +1456,13 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
AAResults *CalleeAAR, bool InsertLifetime) {
Instruction *TheCall = CS.getInstruction();
- assert(TheCall->getParent() && TheCall->getParent()->getParent() &&
- "Instruction not in function!");
+ assert(TheCall->getParent() && TheCall->getFunction()
+ && "Instruction not in function!");
// If IFI has any state in it, zap it before we fill it in.
IFI.reset();
-
- const Function *CalledFunc = CS.getCalledFunction();
+
+ Function *CalledFunc = CS.getCalledFunction();
if (!CalledFunc || // Can't inline external function or indirect
CalledFunc->isDeclaration() || // call, or call to a vararg function!
CalledFunc->getFunctionType()->isVarArg()) return false;
@@ -1548,7 +1599,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// matches up the formal to the actual argument values.
CallSite::arg_iterator AI = CS.arg_begin();
unsigned ArgNo = 0;
- for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+ for (Function::arg_iterator I = CalledFunc->arg_begin(),
E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
Value *ActualArg = *AI;
@@ -1558,7 +1609,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// modify the struct.
if (CS.isByValArgument(ArgNo)) {
ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI,
- CalledFunc->getParamAlignment(ArgNo+1));
+ CalledFunc->getParamAlignment(ArgNo));
if (ActualArg != *AI)
ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
}
@@ -1578,10 +1629,19 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
/*ModuleLevelChanges=*/false, Returns, ".i",
&InlinedFunctionInfo, TheCall);
-
// Remember the first block that is newly cloned over.
FirstNewBlock = LastBlock; ++FirstNewBlock;
+ if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
+ // Update the BFI of blocks cloned into the caller.
+ updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
+ CalledFunc->front());
+
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+ IFI.PSI, IFI.CallerBFI);
+ // Update the profile count of callee.
+ updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI);
+
// Inject byval arguments initialization.
for (std::pair<Value*, Value*> &Init : ByValInit)
HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
@@ -2087,6 +2147,12 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
CalledFunc->getName() + ".exit");
}
+ if (IFI.CallerBFI) {
+ // Copy original BB's block frequency to AfterCallBB
+ IFI.CallerBFI->setBlockFreq(
+ AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
+ }
+
// Change the branch that used to go to AfterCallBB to branch to the first
// basic block of the inlined function.
//
@@ -2206,7 +2272,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
AssumptionCache *AC =
IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
auto &DL = Caller->getParent()->getDataLayout();
- if (Value *V = SimplifyInstruction(PHI, DL, nullptr, nullptr, AC)) {
+ if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) {
PHI->replaceAllUsesWith(V);
PHI->eraseFromParent();
}
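
The new BFI/profile helpers in the InlineFunction hunk boil down to simple count bookkeeping: with callee entry count N and an estimated callsite count M, the inlined copy is credited min(M, N) and the out-of-line callee keeps max(N - M, 0). A minimal sketch of that arithmetic, separated from the BFI plumbing:

    #include <algorithm>
    #include <cstdint>
    #include <utility>

    // Split an entry count N between the inlined copy (first) and the
    // remaining out-of-line callee (second), given callsite estimate M.
    static std::pair<uint64_t, uint64_t> splitEntryCount(uint64_t N, uint64_t M) {
      uint64_t Inlined = std::min(M, N); // credit given to the cloned body
      return {Inlined, N - Inlined};     // remainder is clamped at zero
    }
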
diff --git a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
index 8a1973d..23ec45e 100644
--- a/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -14,10 +14,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
namespace {
@@ -26,16 +26,15 @@ namespace {
InstNamer() : FunctionPass(ID) {
initializeInstNamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
}
bool runOnFunction(Function &F) override {
- for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
- AI != AE; ++AI)
- if (!AI->hasName() && !AI->getType()->isVoidTy())
- AI->setName("arg");
+ for (auto &Arg : F.args())
+ if (!Arg.hasName())
+ Arg.setName("arg");
for (BasicBlock &BB : F) {
if (!BB.hasName())
@@ -48,11 +47,11 @@ namespace {
return true;
}
};
-
+
char InstNamer::ID = 0;
}
-INITIALIZE_PASS(InstNamer, "instnamer",
+INITIALIZE_PASS(InstNamer, "instnamer",
"Assign names to anonymous instructions", false, false)
char &llvm::InstructionNamerID = InstNamer::ID;
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
index 68c6b74..089f2b5 100644
--- a/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -85,9 +85,11 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
UsesToRewrite.clear();
Instruction *I = Worklist.pop_back_val();
+ assert(!I->getType()->isTokenTy() && "Tokens shouldn't be in the worklist");
BasicBlock *InstBB = I->getParent();
Loop *L = LI.getLoopFor(InstBB);
- if (!LoopExitBlocks.count(L))
+ assert(L && "Instruction belongs to a BB that's not part of a loop");
+ if (!LoopExitBlocks.count(L))
L->getExitBlocks(LoopExitBlocks[L]);
assert(LoopExitBlocks.count(L));
const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
@@ -95,17 +97,10 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
if (ExitBlocks.empty())
continue;
- // Tokens cannot be used in PHI nodes, so we skip over them.
- // We can run into tokens which are live out of a loop with catchswitch
- // instructions in Windows EH if the catchswitch has one catchpad which
- // is inside the loop and another which is not.
- if (I->getType()->isTokenTy())
- continue;
-
for (Use &U : I->uses()) {
Instruction *User = cast<Instruction>(U.getUser());
BasicBlock *UserBB = User->getParent();
- if (PHINode *PN = dyn_cast<PHINode>(User))
+ if (auto *PN = dyn_cast<PHINode>(User))
UserBB = PN->getIncomingBlock(U);
if (InstBB != UserBB && !L->contains(UserBB))
@@ -123,7 +118,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
// DomBB dominates the value, so adjust DomBB to the normal destination
// block, which is effectively where the value is first usable.
BasicBlock *DomBB = InstBB;
- if (InvokeInst *Inv = dyn_cast<InvokeInst>(I))
+ if (auto *Inv = dyn_cast<InvokeInst>(I))
DomBB = Inv->getNormalDest();
DomTreeNode *DomNode = DT.getNode(DomBB);
@@ -188,7 +183,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
// block.
Instruction *User = cast<Instruction>(UseToRewrite->getUser());
BasicBlock *UserBB = User->getParent();
- if (PHINode *PN = dyn_cast<PHINode>(User))
+ if (auto *PN = dyn_cast<PHINode>(User))
UserBB = PN->getIncomingBlock(*UseToRewrite);
if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
@@ -213,13 +208,9 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
// Post process PHI instructions that were inserted into another disjoint
// loop and update their exits properly.
- for (auto *PostProcessPN : PostProcessPHIs) {
- if (PostProcessPN->use_empty())
- continue;
-
- // Reprocess each PHI instruction.
- Worklist.push_back(PostProcessPN);
- }
+ for (auto *PostProcessPN : PostProcessPHIs)
+ if (!PostProcessPN->use_empty())
+ Worklist.push_back(PostProcessPN);
// Keep track of PHI nodes that we want to remove because they did not have
// any uses rewritten.
@@ -237,40 +228,75 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
return Changed;
}
-/// Return true if the specified block dominates at least
-/// one of the blocks in the specified list.
-static bool
-blockDominatesAnExit(BasicBlock *BB,
- DominatorTree &DT,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- DomTreeNode *DomNode = DT.getNode(BB);
- return any_of(ExitBlocks, [&](BasicBlock *EB) {
- return DT.dominates(DomNode, DT.getNode(EB));
- });
+// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
+static void computeBlocksDominatingExits(
+ Loop &L, DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+ SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
+ SmallVector<BasicBlock *, 8> BBWorklist;
+
+ // We start from the exit blocks, as every block trivially dominates itself
+ // (not strictly).
+ for (BasicBlock *BB : ExitBlocks)
+ BBWorklist.push_back(BB);
+
+ while (!BBWorklist.empty()) {
+ BasicBlock *BB = BBWorklist.pop_back_val();
+
+ // Check if this is a loop header. If this is the case, we're done.
+ if (L.getHeader() == BB)
+ continue;
+
+ // Otherwise, add its immediate predecessor in the dominator tree to the
+ // worklist, unless we visited it already.
+ BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
+
+ // Exit blocks can have an immediate dominator not belonging to the
+ // loop. If an exit block is immediately dominated by a block outside
+ // the loop, then not all paths from that dominator to the exit block
+ // go through the loop.
+ // Example:
+ //
+ // |---- A
+ // | |
+ // | B<--
+ // | | |
+ // |---> C --
+ // |
+ // D
+ //
+ // C is the exit block of the loop and it's immediately dominated by A,
+ // which doesn't belong to the loop.
+ if (!L.contains(IDomBB))
+ continue;
+
+ if (BlocksDominatingExits.insert(IDomBB))
+ BBWorklist.push_back(IDomBB);
+ }
}
bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution *SE) {
bool Changed = false;
- // Get the set of exiting blocks.
SmallVector<BasicBlock *, 8> ExitBlocks;
L.getExitBlocks(ExitBlocks);
-
if (ExitBlocks.empty())
return false;
+ SmallSetVector<BasicBlock *, 8> BlocksDominatingExits;
+
+ // We want to avoid use-scanning by leveraging dominance information.
+ // If a block doesn't dominate any of the loop exits, then none of the
+ // values defined in the block can be used outside the loop.
+ // We compute the set of blocks fulfilling this condition in advance by
+ // walking the dominator tree upwards until we hit a loop header.
+ computeBlocksDominatingExits(L, DT, ExitBlocks, BlocksDominatingExits);
+
SmallVector<Instruction *, 8> Worklist;
// Look at all the instructions in the loop, checking to see if they have uses
// outside the loop. If so, put them into the worklist to rewrite those uses.
- for (BasicBlock *BB : L.blocks()) {
- // For large loops, avoid use-scanning by using dominance information: In
- // particular, if a block does not dominate any of the loop exits, then none
- // of the values defined in the block could be used outside the loop.
- if (!blockDominatesAnExit(BB, DT, ExitBlocks))
- continue;
-
+ for (BasicBlock *BB : BlocksDominatingExits) {
for (Instruction &I : *BB) {
// Reject two common cases fast: instructions with no uses (like stores)
// and instructions with one use that is in the same block as this.
@@ -279,6 +305,13 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
!isa<PHINode>(I.user_back())))
continue;
+ // Tokens cannot be used in PHI nodes, so we skip over them.
+ // We can run into tokens which are live out of a loop with catchswitch
+ // instructions in Windows EH if the catchswitch has one catchpad which
+ // is inside the loop and another which is not.
+ if (I.getType()->isTokenTy())
+ continue;
+
Worklist.push_back(&I);
}
}
@@ -395,8 +428,8 @@ PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!formLCSSAOnAllLoops(&LI, DT, SE))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
PA.preserve<SCEVAA>();
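
computeBlocksDominatingExits in the LCSSA hunk replaces per-block dominance queries with a single upward walk: start from every exit block, repeatedly step to the immediate dominator, keep the blocks that are still inside the loop, and stop at the header. A condensed sketch of the same walk, post-change API assumed:

    #include "llvm/ADT/SetVector.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/Dominators.h"
    using namespace llvm;

    static void blocksDominatingExits(Loop &L, DominatorTree &DT,
                                      ArrayRef<BasicBlock *> ExitBlocks,
                                      SmallSetVector<BasicBlock *, 8> &Out) {
      SmallVector<BasicBlock *, 8> Worklist(ExitBlocks.begin(), ExitBlocks.end());
      while (!Worklist.empty()) {
        BasicBlock *BB = Worklist.pop_back_val();
        if (BB == L.getHeader())
          continue;                       // reached the top of the loop
        BasicBlock *IDom = DT.getNode(BB)->getIDom()->getBlock();
        if (L.contains(IDom) && Out.insert(IDom))
          Worklist.push_back(IDom);       // first visit: keep walking upward
      }
    }
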
diff --git a/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index d97cd75..42aca75 100644
--- a/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -48,16 +49,6 @@ using namespace llvm;
STATISTIC(NumWrappedOneCond, "Number of One-Condition Wrappers Inserted");
STATISTIC(NumWrappedTwoCond, "Number of Two-Condition Wrappers Inserted");
-static cl::opt<bool> LibCallsShrinkWrapDoDomainError(
- "libcalls-shrinkwrap-domain-error", cl::init(true), cl::Hidden,
- cl::desc("Perform shrink-wrap on lib calls with domain errors"));
-static cl::opt<bool> LibCallsShrinkWrapDoRangeError(
- "libcalls-shrinkwrap-range-error", cl::init(true), cl::Hidden,
- cl::desc("Perform shrink-wrap on lib calls with range errors"));
-static cl::opt<bool> LibCallsShrinkWrapDoPoleError(
- "libcalls-shrinkwrap-pole-error", cl::init(true), cl::Hidden,
- cl::desc("Perform shrink-wrap on lib calls with pole errors"));
-
namespace {
class LibCallsShrinkWrapLegacyPass : public FunctionPass {
public:
@@ -82,10 +73,11 @@ INITIALIZE_PASS_END(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
namespace {
class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
public:
- LibCallsShrinkWrap(const TargetLibraryInfo &TLI) : TLI(TLI), Changed(false){};
- bool isChanged() const { return Changed; }
+ LibCallsShrinkWrap(const TargetLibraryInfo &TLI, DominatorTree *DT)
+ : TLI(TLI), DT(DT){};
void visitCallInst(CallInst &CI) { checkCandidate(CI); }
- void perform() {
+ bool perform() {
+ bool Changed = false;
for (auto &CI : WorkList) {
DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
<< "\n");
@@ -94,18 +86,19 @@ public:
DEBUG(dbgs() << "Transformed\n");
}
}
+ return Changed;
}
private:
bool perform(CallInst *CI);
void checkCandidate(CallInst &CI);
void shrinkWrapCI(CallInst *CI, Value *Cond);
- bool performCallDomainErrorOnly(CallInst *CI, const LibFunc::Func &Func);
- bool performCallErrors(CallInst *CI, const LibFunc::Func &Func);
- bool performCallRangeErrorOnly(CallInst *CI, const LibFunc::Func &Func);
- Value *generateOneRangeCond(CallInst *CI, const LibFunc::Func &Func);
- Value *generateTwoRangeCond(CallInst *CI, const LibFunc::Func &Func);
- Value *generateCondForPow(CallInst *CI, const LibFunc::Func &Func);
+ bool performCallDomainErrorOnly(CallInst *CI, const LibFunc &Func);
+ bool performCallErrors(CallInst *CI, const LibFunc &Func);
+ bool performCallRangeErrorOnly(CallInst *CI, const LibFunc &Func);
+ Value *generateOneRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateTwoRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateCondForPow(CallInst *CI, const LibFunc &Func);
// Create an OR of two conditions.
Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
@@ -134,51 +127,51 @@ private:
}
const TargetLibraryInfo &TLI;
+ DominatorTree *DT;
SmallVector<CallInst *, 16> WorkList;
- bool Changed;
};
} // end anonymous namespace
// Perform the transformation to calls with errno set by domain error.
bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
Value *Cond = nullptr;
switch (Func) {
- case LibFunc::acos: // DomainError: (x < -1 || x > 1)
- case LibFunc::acosf: // Same as acos
- case LibFunc::acosl: // Same as acos
- case LibFunc::asin: // DomainError: (x < -1 || x > 1)
- case LibFunc::asinf: // Same as asin
- case LibFunc::asinl: // Same as asin
+ case LibFunc_acos: // DomainError: (x < -1 || x > 1)
+ case LibFunc_acosf: // Same as acos
+ case LibFunc_acosl: // Same as acos
+ case LibFunc_asin: // DomainError: (x < -1 || x > 1)
+ case LibFunc_asinf: // Same as asin
+ case LibFunc_asinl: // Same as asin
{
++NumWrappedTwoCond;
Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
break;
}
- case LibFunc::cos: // DomainError: (x == +inf || x == -inf)
- case LibFunc::cosf: // Same as cos
- case LibFunc::cosl: // Same as cos
- case LibFunc::sin: // DomainError: (x == +inf || x == -inf)
- case LibFunc::sinf: // Same as sin
- case LibFunc::sinl: // Same as sin
+ case LibFunc_cos: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_cosf: // Same as cos
+ case LibFunc_cosl: // Same as cos
+ case LibFunc_sin: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_sinf: // Same as sin
+ case LibFunc_sinl: // Same as sin
{
++NumWrappedTwoCond;
Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
-INFINITY);
break;
}
- case LibFunc::acosh: // DomainError: (x < 1)
- case LibFunc::acoshf: // Same as acosh
- case LibFunc::acoshl: // Same as acosh
+ case LibFunc_acosh: // DomainError: (x < 1)
+ case LibFunc_acoshf: // Same as acosh
+ case LibFunc_acoshl: // Same as acosh
{
++NumWrappedOneCond;
Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
break;
}
- case LibFunc::sqrt: // DomainError: (x < 0)
- case LibFunc::sqrtf: // Same as sqrt
- case LibFunc::sqrtl: // Same as sqrt
+ case LibFunc_sqrt: // DomainError: (x < 0)
+ case LibFunc_sqrtf: // Same as sqrt
+ case LibFunc_sqrtl: // Same as sqrt
{
++NumWrappedOneCond;
Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
@@ -193,31 +186,31 @@ bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
// Perform the transformation to calls with errno set by range error.
bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
Value *Cond = nullptr;
switch (Func) {
- case LibFunc::cosh:
- case LibFunc::coshf:
- case LibFunc::coshl:
- case LibFunc::exp:
- case LibFunc::expf:
- case LibFunc::expl:
- case LibFunc::exp10:
- case LibFunc::exp10f:
- case LibFunc::exp10l:
- case LibFunc::exp2:
- case LibFunc::exp2f:
- case LibFunc::exp2l:
- case LibFunc::sinh:
- case LibFunc::sinhf:
- case LibFunc::sinhl: {
+ case LibFunc_cosh:
+ case LibFunc_coshf:
+ case LibFunc_coshl:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
+ case LibFunc_exp10:
+ case LibFunc_exp10f:
+ case LibFunc_exp10l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_sinh:
+ case LibFunc_sinhf:
+ case LibFunc_sinhl: {
Cond = generateTwoRangeCond(CI, Func);
break;
}
- case LibFunc::expm1: // RangeError: (709, inf)
- case LibFunc::expm1f: // RangeError: (88, inf)
- case LibFunc::expm1l: // RangeError: (11356, inf)
+ case LibFunc_expm1: // RangeError: (709, inf)
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ case LibFunc_expm1l: // RangeError: (11356, inf)
{
Cond = generateOneRangeCond(CI, Func);
break;
@@ -231,63 +224,54 @@ bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
// Perform the transformation to calls with errno set by combination of errors.
bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
Value *Cond = nullptr;
switch (Func) {
- case LibFunc::atanh: // DomainError: (x < -1 || x > 1)
+ case LibFunc_atanh: // DomainError: (x < -1 || x > 1)
// PoleError: (x == -1 || x == 1)
// Overall Cond: (x <= -1 || x >= 1)
- case LibFunc::atanhf: // Same as atanh
- case LibFunc::atanhl: // Same as atanh
+ case LibFunc_atanhf: // Same as atanh
+ case LibFunc_atanhl: // Same as atanh
{
- if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
- return false;
++NumWrappedTwoCond;
Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
break;
}
- case LibFunc::log: // DomainError: (x < 0)
+ case LibFunc_log: // DomainError: (x < 0)
// PoleError: (x == 0)
// Overall Cond: (x <= 0)
- case LibFunc::logf: // Same as log
- case LibFunc::logl: // Same as log
- case LibFunc::log10: // Same as log
- case LibFunc::log10f: // Same as log
- case LibFunc::log10l: // Same as log
- case LibFunc::log2: // Same as log
- case LibFunc::log2f: // Same as log
- case LibFunc::log2l: // Same as log
- case LibFunc::logb: // Same as log
- case LibFunc::logbf: // Same as log
- case LibFunc::logbl: // Same as log
+ case LibFunc_logf: // Same as log
+ case LibFunc_logl: // Same as log
+ case LibFunc_log10: // Same as log
+ case LibFunc_log10f: // Same as log
+ case LibFunc_log10l: // Same as log
+ case LibFunc_log2: // Same as log
+ case LibFunc_log2f: // Same as log
+ case LibFunc_log2l: // Same as log
+ case LibFunc_logb: // Same as log
+ case LibFunc_logbf: // Same as log
+ case LibFunc_logbl: // Same as log
{
- if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
- return false;
++NumWrappedOneCond;
Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
break;
}
- case LibFunc::log1p: // DomainError: (x < -1)
+ case LibFunc_log1p: // DomainError: (x < -1)
// PoleError: (x == -1)
// Overall Cond: (x <= -1)
- case LibFunc::log1pf: // Same as log1p
- case LibFunc::log1pl: // Same as log1p
+ case LibFunc_log1pf: // Same as log1p
+ case LibFunc_log1pl: // Same as log1p
{
- if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError)
- return false;
++NumWrappedOneCond;
Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
break;
}
- case LibFunc::pow: // DomainError: x < 0 and y is noninteger
+ case LibFunc_pow: // DomainError: x < 0 and y is noninteger
// PoleError: x == 0 and y < 0
// RangeError: overflow or underflow
- case LibFunc::powf:
- case LibFunc::powl: {
- if (!LibCallsShrinkWrapDoDomainError || !LibCallsShrinkWrapDoPoleError ||
- !LibCallsShrinkWrapDoRangeError)
- return false;
+ case LibFunc_powf:
+ case LibFunc_powl: {
Cond = generateCondForPow(CI, Func);
if (Cond == nullptr)
return false;
@@ -313,7 +297,7 @@ void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
if (!CI.use_empty())
return;
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI.getCalledFunction();
if (!Callee)
return;
@@ -333,20 +317,20 @@ void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
// Generate the upper bound condition for RangeError.
Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
float UpperBound;
switch (Func) {
- case LibFunc::expm1: // RangeError: (709, inf)
+ case LibFunc_expm1: // RangeError: (709, inf)
UpperBound = 709.0f;
break;
- case LibFunc::expm1f: // RangeError: (88, inf)
+ case LibFunc_expm1f: // RangeError: (88, inf)
UpperBound = 88.0f;
break;
- case LibFunc::expm1l: // RangeError: (11356, inf)
+ case LibFunc_expm1l: // RangeError: (11356, inf)
UpperBound = 11356.0f;
break;
default:
- llvm_unreachable("Should be reach here");
+ llvm_unreachable("Unhandled library call!");
}
++NumWrappedOneCond;
@@ -355,62 +339,62 @@ Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
// Generate the lower and upper bound condition for RangeError.
Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
- const LibFunc::Func &Func) {
+ const LibFunc &Func) {
float UpperBound, LowerBound;
switch (Func) {
- case LibFunc::cosh: // RangeError: (x < -710 || x > 710)
- case LibFunc::sinh: // Same as cosh
+ case LibFunc_cosh: // RangeError: (x < -710 || x > 710)
+ case LibFunc_sinh: // Same as cosh
LowerBound = -710.0f;
UpperBound = 710.0f;
break;
- case LibFunc::coshf: // RangeError: (x < -89 || x > 89)
- case LibFunc::sinhf: // Same as coshf
+ case LibFunc_coshf: // RangeError: (x < -89 || x > 89)
+ case LibFunc_sinhf: // Same as coshf
LowerBound = -89.0f;
UpperBound = 89.0f;
break;
- case LibFunc::coshl: // RangeError: (x < -11357 || x > 11357)
- case LibFunc::sinhl: // Same as coshl
+ case LibFunc_coshl: // RangeError: (x < -11357 || x > 11357)
+ case LibFunc_sinhl: // Same as coshl
LowerBound = -11357.0f;
UpperBound = 11357.0f;
break;
- case LibFunc::exp: // RangeError: (x < -745 || x > 709)
+ case LibFunc_exp: // RangeError: (x < -745 || x > 709)
LowerBound = -745.0f;
UpperBound = 709.0f;
break;
- case LibFunc::expf: // RangeError: (x < -103 || x > 88)
+ case LibFunc_expf: // RangeError: (x < -103 || x > 88)
LowerBound = -103.0f;
UpperBound = 88.0f;
break;
- case LibFunc::expl: // RangeError: (x < -11399 || x > 11356)
+ case LibFunc_expl: // RangeError: (x < -11399 || x > 11356)
LowerBound = -11399.0f;
UpperBound = 11356.0f;
break;
- case LibFunc::exp10: // RangeError: (x < -323 || x > 308)
+ case LibFunc_exp10: // RangeError: (x < -323 || x > 308)
LowerBound = -323.0f;
UpperBound = 308.0f;
break;
- case LibFunc::exp10f: // RangeError: (x < -45 || x > 38)
+ case LibFunc_exp10f: // RangeError: (x < -45 || x > 38)
LowerBound = -45.0f;
UpperBound = 38.0f;
break;
- case LibFunc::exp10l: // RangeError: (x < -4950 || x > 4932)
+ case LibFunc_exp10l: // RangeError: (x < -4950 || x > 4932)
LowerBound = -4950.0f;
UpperBound = 4932.0f;
break;
- case LibFunc::exp2: // RangeError: (x < -1074 || x > 1023)
+ case LibFunc_exp2: // RangeError: (x < -1074 || x > 1023)
LowerBound = -1074.0f;
UpperBound = 1023.0f;
break;
- case LibFunc::exp2f: // RangeError: (x < -149 || x > 127)
+ case LibFunc_exp2f: // RangeError: (x < -149 || x > 127)
LowerBound = -149.0f;
UpperBound = 127.0f;
break;
- case LibFunc::exp2l: // RangeError: (x < -16445 || x > 11383)
+ case LibFunc_exp2l: // RangeError: (x < -16445 || x > 11383)
LowerBound = -16445.0f;
UpperBound = 11383.0f;
break;
default:
- llvm_unreachable("Should be reach here");
+ llvm_unreachable("Unhandled library call!");
}
++NumWrappedTwoCond;
@@ -434,9 +418,9 @@ Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
// (i.e., we might invoke calls that will not set errno).
//
Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
- const LibFunc::Func &Func) {
- // FIXME: LibFunc::powf and powl TBD.
- if (Func != LibFunc::pow) {
+ const LibFunc &Func) {
+ // FIXME: LibFunc_powf and powl TBD.
+ if (Func != LibFunc_pow) {
DEBUG(dbgs() << "Not handled powf() and powl()\n");
return nullptr;
}
@@ -499,14 +483,17 @@ Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
// Wrap conditions that can potentially generate errno to the library call.
void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
- assert(Cond != nullptr && "hrinkWrapCI is not expecting an empty call inst");
+ assert(Cond != nullptr && "ShrinkWrapCI is not expecting an empty call inst");
MDNode *BranchWeights =
MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
+
TerminatorInst *NewInst =
- SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights);
+ SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
BasicBlock *CallBB = NewInst->getParent();
CallBB->setName("cdce.call");
- CallBB->getSingleSuccessor()->setName("cdce.end");
+ BasicBlock *SuccBB = CallBB->getSingleSuccessor();
+ assert(SuccBB && "The split block should have a single successor");
+ SuccBB->setName("cdce.end");
CI->removeFromParent();
CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
DEBUG(dbgs() << "== Basic Block After ==");
@@ -516,38 +503,44 @@ void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
// Perform the transformation to a single candidate.
bool LibCallsShrinkWrap::perform(CallInst *CI) {
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
assert(Callee && "perform() should apply to a non-empty callee");
TLI.getLibFunc(*Callee, Func);
assert(Func && "perform() is not expecting an empty function");
- if (LibCallsShrinkWrapDoDomainError && performCallDomainErrorOnly(CI, Func))
- return true;
-
- if (LibCallsShrinkWrapDoRangeError && performCallRangeErrorOnly(CI, Func))
+ if (performCallDomainErrorOnly(CI, Func) || performCallRangeErrorOnly(CI, Func))
return true;
-
return performCallErrors(CI, Func);
}
void LibCallsShrinkWrapLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
}
-static bool runImpl(Function &F, const TargetLibraryInfo &TLI) {
+static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
+ DominatorTree *DT) {
if (F.hasFnAttribute(Attribute::OptimizeForSize))
return false;
- LibCallsShrinkWrap CCDCE(TLI);
+ LibCallsShrinkWrap CCDCE(TLI, DT);
CCDCE.visit(F);
- CCDCE.perform();
- return CCDCE.isChanged();
+ bool Changed = CCDCE.perform();
+
+// Verify the dominator tree after we've updated it locally.
+#ifndef NDEBUG
+ if (DT)
+ DT->verifyDomTree();
+#endif
+ return Changed;
}
bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) {
auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- return runImpl(F, TLI);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ return runImpl(F, TLI, DT);
}
namespace llvm {
@@ -561,11 +554,12 @@ FunctionPass *createLibCallsShrinkWrapPass() {
PreservedAnalyses LibCallsShrinkWrapPass::run(Function &F,
FunctionAnalysisManager &FAM) {
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- bool Changed = runImpl(F, TLI);
- if (!Changed)
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT))
return PreservedAnalyses::all();
auto PA = PreservedAnalyses();
PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
return PA;
}
}
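
Conceptually, the LibCallsShrinkWrap changes above guard an errno-only library call with the exact condition under which errno can be set, so the common in-range case skips the call entirely. In source-level terms, a sketch of the effect (not the pass's actual IR output):

    #include <cmath>

    static void errnoOnlySqrt(double x) {
      // Before the pass, the dead-result call is kept only for errno:
      //   (void)std::sqrt(x);
      // After the pass, it runs only when the domain error is possible:
      if (x < 0.0)
        (void)std::sqrt(x);  // only this path can set errno (EDOM)
    }
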
diff --git a/contrib/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm/lib/Transforms/Utils/Local.cpp
index 6e4174a..7461061 100644
--- a/contrib/llvm/lib/Transforms/Utils/Local.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Local.cpp
@@ -22,10 +22,11 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
@@ -45,6 +46,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -126,21 +128,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// If the default is unreachable, ignore it when searching for TheOnlyDest.
if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
SI->getNumCases() > 0) {
- TheOnlyDest = SI->case_begin().getCaseSuccessor();
+ TheOnlyDest = SI->case_begin()->getCaseSuccessor();
}
// Figure out which case it goes to.
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
// Found case matching a constant operand?
- if (i.getCaseValue() == CI) {
- TheOnlyDest = i.getCaseSuccessor();
+ if (i->getCaseValue() == CI) {
+ TheOnlyDest = i->getCaseSuccessor();
break;
}
// Check to see if this branch is going to the same place as the default
// dest. If so, eliminate it as an explicit compare.
- if (i.getCaseSuccessor() == DefaultDest) {
+ if (i->getCaseSuccessor() == DefaultDest) {
MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
unsigned NCases = SI->getNumCases();
// Fold the case metadata into the default if there will be any branches
@@ -154,7 +155,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Weights.push_back(CI->getValue().getZExtValue());
}
// Merge weight of this case to the default weight.
- unsigned idx = i.getCaseIndex();
+ unsigned idx = i->getCaseIndex();
Weights[0] += Weights[idx+1];
// Remove weight for this case.
std::swap(Weights[idx+1], Weights.back());
@@ -165,15 +166,19 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
}
// Remove this entry.
DefaultDest->removePredecessor(SI->getParent());
- SI->removeCase(i);
- --i; --e;
+ i = SI->removeCase(i);
+ e = SI->case_end();
continue;
}
// Otherwise, check to see if the switch only branches to one destination.
// We do this by resetting "TheOnlyDest" to null when we find two non-equal
// destinations.
- if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = nullptr;
+ if (i->getCaseSuccessor() != TheOnlyDest)
+ TheOnlyDest = nullptr;
+
+ // Increment this iterator as we haven't removed the case.
+ ++i;
}
if (CI && !TheOnlyDest) {
@@ -209,7 +214,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
if (SI->getNumCases() == 1) {
// Otherwise, we can fold this switch into a conditional branch
// instruction if it has only one non-default destination.
- SwitchInst::CaseIt FirstCase = SI->case_begin();
+ auto FirstCase = *SI->case_begin();
Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
FirstCase.getCaseValue(), "cond");
@@ -287,7 +292,15 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
///
bool llvm::isInstructionTriviallyDead(Instruction *I,
const TargetLibraryInfo *TLI) {
- if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+ if (!I->use_empty())
+ return false;
+ return wouldInstructionBeTriviallyDead(I, TLI);
+}
+
+bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (isa<TerminatorInst>(I))
+ return false;
// We don't want the landingpad-like instructions removed by anything this
// general.
@@ -307,7 +320,8 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
return true;
}
- if (!I->mayHaveSideEffects()) return true;
+ if (!I->mayHaveSideEffects())
+ return true;
// Special case intrinsics that "may have side effects" but can be deleted
// when dead.
@@ -334,7 +348,8 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
}
}
- if (isAllocLikeFn(I, TLI)) return true;
+ if (isAllocLikeFn(I, TLI))
+ return true;
if (CallInst *CI = isFreeCall(I, TLI))
if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
@@ -548,7 +563,7 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred) {
// that can be removed.
BB->removePredecessor(Pred, true);
- WeakVH PhiIt = &BB->front();
+ WeakTrackingVH PhiIt = &BB->front();
while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) {
PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt));
Value *OldPhiIt = PhiIt;
@@ -1023,17 +1038,15 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
const DominatorTree *DT) {
assert(V->getType()->isPointerTy() &&
"getOrEnforceKnownAlignment expects a pointer!");
- unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType());
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- computeKnownBits(V, KnownZero, KnownOne, DL, 0, AC, CxtI, DT);
- unsigned TrailZ = KnownZero.countTrailingOnes();
+ KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT);
+ unsigned TrailZ = Known.countMinTrailingZeros();
// Avoid trouble with ridiculously large TrailZ values, such as
// those computed from a null pointer.
TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1));
- unsigned Align = 1u << std::min(BitWidth - 1, TrailZ);
+ unsigned Align = 1u << std::min(Known.getBitWidth() - 1, TrailZ);
// LLVM doesn't support alignments larger than this currently.
Align = std::min(Align, +Value::MaximumAlignment);
@@ -1069,17 +1082,17 @@ static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr,
}
/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
-static bool PhiHasDebugValue(DILocalVariable *DIVar,
+static bool PhiHasDebugValue(DILocalVariable *DIVar,
DIExpression *DIExpr,
PHINode *APN) {
// Since we can't guarantee that the original dbg.declare instrinsic
// is removed by LowerDbgDeclare(), we need to make sure that we are
// not inserting the same dbg.value intrinsic over and over.
- DbgValueList DbgValues;
- FindAllocaDbgValues(DbgValues, APN);
- for (auto DVI : DbgValues) {
- assert (DVI->getValue() == APN);
- assert (DVI->getOffset() == 0);
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ findDbgValues(DbgValues, APN);
+ for (auto *DVI : DbgValues) {
+ assert(DVI->getValue() == APN);
+ assert(DVI->getOffset() == 0);
if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
return true;
}
@@ -1091,8 +1104,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
StoreInst *SI, DIBuilder &Builder) {
auto *DIVar = DDI->getVariable();
- auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
+ auto *DIExpr = DDI->getExpression();
+ Value *DV = SI->getOperand(0);
// If an argument is zero extended then use argument directly. The ZExt
// may be zapped by an optimization pass in future.
@@ -1102,34 +1116,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
ExtendedArg = dyn_cast<Argument>(SExt->getOperand(0));
if (ExtendedArg) {
- // We're now only describing a subset of the variable. The fragment we're
- // describing will always be smaller than the variable size, because
- // VariableSize == Size of Alloca described by DDI. Since SI stores
- // to the alloca described by DDI, if it's first operand is an extend,
- // we're guaranteed that before extension, the value was narrower than
- // the size of the alloca, hence the size of the described variable.
- SmallVector<uint64_t, 3> Ops;
- unsigned FragmentOffset = 0;
- // If this already is a bit fragment, we drop the bit fragment from the
- // expression and record the offset.
- auto Fragment = DIExpr->getFragmentInfo();
- if (Fragment) {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3);
- FragmentOffset = Fragment->OffsetInBits;
- } else {
- Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+ // If this DDI was already describing only a fragment of a variable, ensure
+ // that fragment is appropriately narrowed here.
+ // But if a fragment wasn't used, describe the value as the original
+ // argument (rather than the zext or sext) so that it remains described even
+ // if the sext/zext is optimized away. This widens the variable description,
+ // leaving it up to the consumer to know how the smaller value may be
+ // represented in a larger register.
+ if (auto Fragment = DIExpr->getFragmentInfo()) {
+ unsigned FragmentOffset = Fragment->OffsetInBits;
+ SmallVector<uint64_t, 3> Ops(DIExpr->elements_begin(),
+ DIExpr->elements_end() - 3);
+ Ops.push_back(dwarf::DW_OP_LLVM_fragment);
+ Ops.push_back(FragmentOffset);
+ const DataLayout &DL = DDI->getModule()->getDataLayout();
+ Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
+ DIExpr = Builder.createExpression(Ops);
}
- Ops.push_back(dwarf::DW_OP_LLVM_fragment);
- Ops.push_back(FragmentOffset);
- const DataLayout &DL = DDI->getModule()->getDataLayout();
- Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType()));
- auto NewDIExpr = Builder.createExpression(Ops);
- if (!LdStHasDebugValue(DIVar, NewDIExpr, SI))
- Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr,
- DDI->getDebugLoc(), SI);
- } else if (!LdStHasDebugValue(DIVar, DIExpr, SI))
- Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr,
- DDI->getDebugLoc(), SI);
+ DV = ExtendedArg;
+ }
+ if (!LdStHasDebugValue(DIVar, DIExpr, SI))
+ Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(),
+ SI);
}
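Schematically, the store case above behaves as follows for a zext'ed argument (hypothetical IR; the names and the 8-bit width are invented for illustration):

//   %widened = zext i8 %arg to i32
//   store i32 %widened, i32* %alloca
// With no pre-existing fragment, the emitted dbg.value describes %arg
// directly with the unmodified expression, so the description survives if
// the zext is later optimized away. With a fragment, the fragment is
// re-emitted using the argument's narrower type size:
//   call void @llvm.dbg.value(metadata i8 %arg, i64 0, metadata !var,
//       metadata !DIExpression(DW_OP_LLVM_fragment, 0, 8))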
/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
@@ -1152,7 +1160,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
DbgValue->insertAfter(LI);
}
-/// Inserts a llvm.dbg.value intrinsic after a phi
+/// Inserts a llvm.dbg.value intrinsic after a phi
/// that has an associated llvm.dbg.declare intrinsic.
void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
PHINode *APN, DIBuilder &Builder) {
@@ -1214,13 +1222,9 @@ bool llvm::LowerDbgDeclare(Function &F) {
// This is a call by-value or some other instruction that
// takes a pointer to the variable. Insert a *value*
// intrinsic that describes the alloca.
- SmallVector<uint64_t, 1> NewDIExpr;
- auto *DIExpr = DDI->getExpression();
- NewDIExpr.push_back(dwarf::DW_OP_deref);
- NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());
DIB.insertDbgValueIntrinsic(AI, 0, DDI->getVariable(),
- DIB.createExpression(NewDIExpr),
- DDI->getDebugLoc(), CI);
+ DDI->getExpression(), DDI->getDebugLoc(),
+ CI);
}
}
DDI->eraseFromParent();
@@ -1241,9 +1245,7 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
return nullptr;
}
-/// FindAllocaDbgValues - Finds the llvm.dbg.value intrinsics describing the
-/// alloca 'V', if any.
-void llvm::FindAllocaDbgValues(DbgValueList &DbgValues, Value *V) {
+void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
if (auto *L = LocalAsMetadata::getIfExists(V))
if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
for (User *U : MDV->users())
@@ -1251,37 +1253,6 @@ void llvm::FindAllocaDbgValues(DbgValueList &DbgValues, Value *V) {
DbgValues.push_back(DVI);
}
-static void DIExprAddDeref(SmallVectorImpl<uint64_t> &Expr) {
- Expr.push_back(dwarf::DW_OP_deref);
-}
-
-static void DIExprAddOffset(SmallVectorImpl<uint64_t> &Expr, int Offset) {
- if (Offset > 0) {
- Expr.push_back(dwarf::DW_OP_plus);
- Expr.push_back(Offset);
- } else if (Offset < 0) {
- Expr.push_back(dwarf::DW_OP_minus);
- Expr.push_back(-Offset);
- }
-}
-
-static DIExpression *BuildReplacementDIExpr(DIBuilder &Builder,
- DIExpression *DIExpr, bool Deref,
- int Offset) {
- if (!Deref && !Offset)
- return DIExpr;
- // Create a copy of the original DIDescriptor for user variable, prepending
- // "deref" operation to a list of address elements, as new llvm.dbg.declare
- // will take a value storing address of the memory for variable, not
- // alloca itself.
- SmallVector<uint64_t, 4> NewDIExpr;
- if (Deref)
- DIExprAddDeref(NewDIExpr);
- DIExprAddOffset(NewDIExpr, Offset);
- if (DIExpr)
- NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());
- return Builder.createExpression(NewDIExpr);
-}
bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
Instruction *InsertBefore, DIBuilder &Builder,
@@ -1293,9 +1264,7 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
auto *DIVar = DDI->getVariable();
auto *DIExpr = DDI->getExpression();
assert(DIVar && "Missing variable");
-
- DIExpr = BuildReplacementDIExpr(Builder, DIExpr, Deref, Offset);
-
+ DIExpr = DIExpression::prepend(DIExpr, Deref, Offset);
// Insert llvm.dbg.declare immediately after the original alloca, and remove
// old llvm.dbg.declare.
Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);
@@ -1326,11 +1295,11 @@ static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
// Insert the offset immediately after the first deref.
// We could just change the offset argument of dbg.value, but it's unsigned...
if (Offset) {
- SmallVector<uint64_t, 4> NewDIExpr;
- DIExprAddDeref(NewDIExpr);
- DIExprAddOffset(NewDIExpr, Offset);
- NewDIExpr.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
- DIExpr = Builder.createExpression(NewDIExpr);
+ SmallVector<uint64_t, 4> Ops;
+ Ops.push_back(dwarf::DW_OP_deref);
+ DIExpression::appendOffset(Ops, Offset);
+ Ops.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
+ DIExpr = Builder.createExpression(Ops);
}
Builder.insertDbgValueIntrinsic(NewAddress, DVI->getOffset(), DIVar, DIExpr,
@@ -1349,6 +1318,57 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
}
}
+void llvm::salvageDebugInfo(Instruction &I) {
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ auto &M = *I.getModule();
+
+ auto MDWrap = [&](Value *V) {
+ return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
+ };
+
+ if (isa<BitCastInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value
+ // to use the cast's source.
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ unsigned BitWidth =
+ M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace());
+ APInt Offset(BitWidth, 0);
+ // Rewrite a constant GEP into a DIExpression. Since we are performing
+ // arithmetic to compute the variable's *value* in the DIExpression, we
+ // need to mark the expression with a DW_OP_stack_value.
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
+ auto *DIExpr = DVI->getExpression();
+ DIBuilder DIB(M, /*AllowUnresolved*/ false);
+ // GEP offsets are i32 and thus always fit into an int64_t.
+ DIExpr = DIExpression::prepend(DIExpr, DIExpression::NoDeref,
+ Offset.getSExtValue(),
+ DIExpression::WithStackValue);
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ }
+ } else if (isa<LoadInst>(&I)) {
+ findDbgValues(DbgValues, &I);
+ for (auto *DVI : DbgValues) {
+ // Rewrite the load into DW_OP_deref.
+ auto *DIExpr = DVI->getExpression();
+ DIBuilder DIB(M, /*AllowUnresolved*/ false);
+ DIExpr = DIExpression::prepend(DIExpr, DIExpression::WithDeref);
+ DVI->setOperand(0, MDWrap(I.getOperand(0)));
+ DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+ DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+ }
+ }
+}
+
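To make the GEP case above concrete, a schematic before/after (hypothetical IR; names, metadata numbering, and the 4-byte offset are invented):

// Before salvaging, %gep is about to be erased:
//   %gep = getelementptr inbounds %struct.S, %struct.S* %s, i64 0, i32 1
//   call void @llvm.dbg.value(metadata i32* %gep, i64 0, metadata !var,
//                             metadata !DIExpression())
// After salvaging, the dbg.value refers to the GEP's base pointer and the
// constant offset moves into the expression, marked as a stack value:
//   call void @llvm.dbg.value(metadata %struct.S* %s, i64 0, metadata !var,
//       metadata !DIExpression(DW_OP_plus, 4, DW_OP_stack_value))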
unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
unsigned NumDeadInst = 0;
// Delete the instructions backwards, as it has a reduced likelihood of
@@ -1450,7 +1470,7 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
II->setAttributes(CI->getAttributes());
// Make sure that anything using the call now uses the invoke! This also
- // updates the CallGraph if present, because it uses a WeakVH.
+ // updates the CallGraph if present, because it uses a WeakTrackingVH.
CI->replaceAllUsesWith(II);
// Delete the original call
@@ -1642,9 +1662,10 @@ void llvm::removeUnwindEdge(BasicBlock *BB) {
TI->eraseFromParent();
}
-/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// removeUnreachableBlocks - Remove blocks that are not reachable, even
/// if they are in a dead cycle. Return true if a change was made, false
-/// otherwise.
+/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo
+/// after modifying the CFG.
bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI) {
SmallPtrSet<BasicBlock*, 16> Reachable;
bool Changed = markAliveBlocks(F, Reachable);
@@ -1723,12 +1744,12 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
// Preserve !invariant.group in K.
break;
case LLVMContext::MD_align:
- K->setMetadata(Kind,
+ K->setMetadata(Kind,
MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
break;
case LLVMContext::MD_dereferenceable:
case LLVMContext::MD_dereferenceable_or_null:
- K->setMetadata(Kind,
+ K->setMetadata(Kind,
MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
break;
}
@@ -1755,46 +1776,62 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) {
combineMetadata(K, J, KnownIDs);
}
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlockEdge &Root) {
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
assert(From->getType() == To->getType());
-
+
unsigned Count = 0;
for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE; ) {
+ UI != UE;) {
Use &U = *UI++;
- if (DT.dominates(Root, U)) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '"
- << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
+ << *To << " in " << *U << "\n");
+ ++Count;
}
return Count;
}
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlock *BB) {
- assert(From->getType() == To->getType());
+unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
+ assert(From->getType() == To->getType());
+ auto *BB = From->getParent();
+ unsigned Count = 0;
- unsigned Count = 0;
for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
UI != UE;) {
Use &U = *UI++;
auto *I = cast<Instruction>(U.getUser());
- if (DT.properlyDominates(BB, I->getParent())) {
- U.set(To);
- DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as "
- << *To << " in " << *U << "\n");
- ++Count;
- }
+ if (I->getParent() == BB)
+ continue;
+ U.set(To);
+ ++Count;
}
return Count;
}
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
+
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
+}
+
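For context, a hedged sketch of the typical caller pattern for the edge-based overload (GVN-style equality propagation; the wrapper below is hypothetical):

// After proving that (X == C) holds along the true edge of a conditional
// branch, rewrite every use of X dominated by that edge to C.
static unsigned propagateEqualityOnTrueEdge(Value *X, Value *C,
                                            BranchInst *BI,
                                            DominatorTree &DT) {
  BasicBlockEdge TrueEdge(BI->getParent(), BI->getSuccessor(0));
  return replaceDominatedUsesWith(X, C, DT, TrueEdge);
}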
bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
// Check if the function is specifically marked as a gc leaf function.
if (CS.hasFnAttr("gc-leaf-function"))
@@ -1812,6 +1849,49 @@ bool llvm::callsGCLeafFunction(ImmutableCallSite CS) {
return false;
}
+void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N,
+ LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+ // This only directly applies if the new type is also a pointer.
+ if (NewTy->isPointerTy()) {
+ NewLI.setMetadata(LLVMContext::MD_nonnull, N);
+ return;
+ }
+
+ // The only other translation we can do is to integral loads with !range
+ // metadata.
+ if (!NewTy->isIntegerTy())
+ return;
+
+ MDBuilder MDB(NewLI.getContext());
+ const Value *Ptr = OldLI.getPointerOperand();
+ auto *ITy = cast<IntegerType>(NewTy);
+ auto *NullInt = ConstantExpr::getPtrToInt(
+ ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
+ auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
+ NewLI.setMetadata(LLVMContext::MD_range,
+ MDB.createRange(NonNullInt, NullInt));
+}
+
+void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
+ MDNode *N, LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+  // Give up unless the load is being converted to a pointer type, where
+  // there is a single very valuable mapping we can do reliably.
+ // FIXME: It would be nice to propagate this in more ways, but the type
+ // conversions make it hard.
+ if (!NewTy->isPointerTy())
+ return;
+
+ unsigned BitWidth = DL.getTypeSizeInBits(NewTy);
+ if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
+ MDNode *NN = MDNode::get(OldLI.getContext(), None);
+ NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
+ }
+}
+
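A hedged sketch of the intended call site: a pass that rewrites a load to a different type (SROA-style) forwards each metadata kind through the matching helper. The wrapper below is hypothetical:

// Copy !nonnull and !range from an old load to its differently-typed
// replacement, letting the helpers translate between the two encodings.
static void copyLoadMetadata(const DataLayout &DL, const LoadInst &OldLI,
                             LoadInst &NewLI) {
  if (MDNode *N = OldLI.getMetadata(LLVMContext::MD_nonnull))
    copyNonnullMetadata(OldLI, N, NewLI);
  if (MDNode *N = OldLI.getMetadata(LLVMContext::MD_range))
    copyRangeMetadata(DL, OldLI, N, NewLI);
}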
namespace {
/// A potential constituent of a bitreverse or bswap expression. See
/// collectBitParts for a fuller explanation.
@@ -1933,7 +2013,7 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
unsigned NumMaskedBits = AndMask.countPopulation();
if (!MatchBitReversals && NumMaskedBits % 8 != 0)
return Result;
-
+
auto &Res = collectBitParts(I->getOperand(0), MatchBSwaps,
MatchBitReversals, BPS);
if (!Res)
@@ -2068,9 +2148,63 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
CallInst *CI, const TargetLibraryInfo *TLI) {
Function *F = CI->getCalledFunction();
- LibFunc::Func Func;
+ LibFunc Func;
if (F && !F->hasLocalLinkage() && F->hasName() &&
TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
!F->doesNotAccessMemory())
- CI->addAttribute(AttributeSet::FunctionIndex, Attribute::NoBuiltin);
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
+}
+
+bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
+ // We can't have a PHI with a metadata type.
+ if (I->getOperand(OpIdx)->getType()->isMetadataTy())
+ return false;
+
+ // Early exit.
+ if (!isa<Constant>(I->getOperand(OpIdx)))
+ return true;
+
+ switch (I->getOpcode()) {
+ default:
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ // Can't handle inline asm. Skip it.
+ if (isa<InlineAsm>(ImmutableCallSite(I).getCalledValue()))
+ return false;
+ // Many arithmetic intrinsics have no issue taking a
+    // variable; however, it's hard to distinguish these from
+ // specials such as @llvm.frameaddress that require a constant.
+ if (isa<IntrinsicInst>(I))
+ return false;
+
+ // Constant bundle operands may need to retain their constant-ness for
+ // correctness.
+ if (ImmutableCallSite(I).isBundleOperand(OpIdx))
+ return false;
+ return true;
+ case Instruction::ShuffleVector:
+ // Shufflevector masks are constant.
+ return OpIdx != 2;
+ case Instruction::Switch:
+ case Instruction::ExtractValue:
+ // All operands apart from the first are constant.
+ return OpIdx == 0;
+ case Instruction::InsertValue:
+ // All operands apart from the first and the second are constant.
+ return OpIdx < 2;
+ case Instruction::Alloca:
+ // Static allocas (constant size in the entry block) are handled by
+ // prologue/epilogue insertion so they're free anyway. We definitely don't
+ // want to make them non-constant.
+    return !cast<AllocaInst>(I)->isStaticAlloca();
+ case Instruction::GetElementPtr:
+ if (OpIdx == 0)
+ return true;
+ gep_type_iterator It = gep_type_begin(I);
+ for (auto E = std::next(It, OpIdx); It != E; ++It)
+ if (It.isStruct())
+ return false;
+ return true;
+ }
}
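As a usage note (a sketch, not part of this change): sinking-style transforms consult canReplaceOperandWithVariable before merging two instructions whose differing operands would have to become a PHI:

// Hypothetical guard: two instructions may only be merged if every operand
// they disagree on can legally be replaced by a variable (i.e. a PHI).
static bool canMergeWithPhis(const Instruction *I1, const Instruction *I2) {
  assert(I1->getNumOperands() == I2->getNumOperands());
  for (unsigned Op = 0, E = I1->getNumOperands(); Op != E; ++Op)
    if (I1->getOperand(Op) != I2->getOperand(Op) &&
        !canReplaceOperandWithVariable(I1, Op))
      return false;
  return true;
}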
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 00cda2a..e21e34d 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -38,15 +38,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -65,6 +64,7 @@
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -72,7 +72,6 @@ using namespace llvm;
#define DEBUG_TYPE "loop-simplify"
-STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
STATISTIC(NumNested , "Number of nested loops split out");
// If the block isn't already, move the new block to right after some 'outside
@@ -152,37 +151,6 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
return PreheaderBB;
}
-/// \brief Ensure that the loop preheader dominates all exit blocks.
-///
-/// This method is used to split exit blocks that have predecessors outside of
-/// the loop.
-static BasicBlock *rewriteLoopExitBlock(Loop *L, BasicBlock *Exit,
- DominatorTree *DT, LoopInfo *LI,
- bool PreserveLCSSA) {
- SmallVector<BasicBlock*, 8> LoopBlocks;
- for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) {
- BasicBlock *P = *I;
- if (L->contains(P)) {
- // Don't do this if the loop is exited via an indirect branch.
- if (isa<IndirectBrInst>(P->getTerminator())) return nullptr;
-
- LoopBlocks.push_back(P);
- }
- }
-
- assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
- BasicBlock *NewExitBB = nullptr;
-
- NewExitBB = SplitBlockPredecessors(Exit, LoopBlocks, ".loopexit", DT, LI,
- PreserveLCSSA);
- if (!NewExitBB)
- return nullptr;
-
- DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
- << NewExitBB->getName() << "\n");
- return NewExitBB;
-}
-
/// Add the specified block, and all of its predecessors, to the specified set,
/// if it's not already in there. Stop predecessor traversal when we reach
/// StopBlock.
@@ -210,7 +178,7 @@ static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
PHINode *PN = cast<PHINode>(I);
++I;
- if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) {
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
// This is a degenerate PHI already, don't modify it!
PN->replaceAllUsesWith(V);
PN->eraseFromParent();
@@ -346,16 +314,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
// Split edges to exit blocks from the inner loop, if they emerged in the
// process of separating the outer one.
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getExitBlocks(ExitBlocks);
- SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
- for (BasicBlock *ExitBlock : ExitBlockSet) {
- if (any_of(predecessors(ExitBlock),
- [L](BasicBlock *BB) { return !L->contains(BB); })) {
- rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA);
- }
- }
+ formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
if (PreserveLCSSA) {
// Fix LCSSA form for L. Some values, which previously were only used inside
@@ -563,29 +522,16 @@ ReprocessLoop:
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
- if (Preheader) {
- ++NumInserted;
+ if (Preheader)
Changed = true;
- }
}
// Next, check to make sure that all exit nodes of the loop only have
// predecessors that are inside of the loop. This check guarantees that the
// loop preheader/header will dominate the exit blocks. If the exit block has
// predecessors from outside of the loop, split the edge now.
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getExitBlocks(ExitBlocks);
-
- SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
- for (BasicBlock *ExitBlock : ExitBlockSet) {
- if (any_of(predecessors(ExitBlock),
- [L](BasicBlock *BB) { return !L->contains(BB); })) {
- rewriteLoopExitBlock(L, ExitBlock, DT, LI, PreserveLCSSA);
- ++NumInserted;
- Changed = true;
- }
- }
+ if (formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA))
+ Changed = true;
// If the header has more than two predecessors at this point (from the
// preheader and from multiple backedges), we must adjust the loop.
@@ -614,10 +560,8 @@ ReprocessLoop:
// insert a new block that all backedges target, then make it jump to the
// loop header.
LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI);
- if (LoopLatch) {
- ++NumInserted;
+ if (LoopLatch)
Changed = true;
- }
}
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
@@ -628,7 +572,7 @@ ReprocessLoop:
PHINode *PN;
for (BasicBlock::iterator I = L->getHeader()->begin();
(PN = dyn_cast<PHINode>(I++)); )
- if (Value *V = SimplifyInstruction(PN, DL, nullptr, DT, AC)) {
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
if (SE) SE->forgetValue(PN);
if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
PN->replaceAllUsesWith(V);
@@ -645,14 +589,22 @@ ReprocessLoop:
// loop-invariant instructions out of the way to open up more
// opportunities, and the disadvantage of having the responsibility
// to preserve dominator information.
- bool UniqueExit = true;
- if (!ExitBlocks.empty())
- for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i)
- if (ExitBlocks[i] != ExitBlocks[0]) {
- UniqueExit = false;
- break;
+ auto HasUniqueExitBlock = [&]() {
+ BasicBlock *UniqueExit = nullptr;
+ for (auto *ExitingBB : ExitingBlocks)
+ for (auto *SuccBB : successors(ExitingBB)) {
+ if (L->contains(SuccBB))
+ continue;
+
+ if (!UniqueExit)
+ UniqueExit = SuccBB;
+ else if (UniqueExit != SuccBB)
+ return false;
}
- if (UniqueExit) {
+
+ return true;
+ };
+ if (HasUniqueExitBlock()) {
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
BasicBlock *ExitingBlock = ExitingBlocks[i];
if (!ExitingBlock->getSinglePredecessor()) continue;
@@ -735,6 +687,17 @@ bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
bool PreserveLCSSA) {
bool Changed = false;
+#ifndef NDEBUG
+ // If we're asked to preserve LCSSA, the loop nest needs to start in LCSSA
+ // form.
+ if (PreserveLCSSA) {
+ assert(DT && "DT not available.");
+ assert(LI && "LI not available.");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Requested to preserve LCSSA, but it's already broken.");
+ }
+#endif
+
// Worklist maintains our depth-first queue of loops in this nest to process.
SmallVector<Loop *, 4> Worklist;
Worklist.push_back(L);
@@ -814,15 +777,6 @@ bool LoopSimplify::runOnFunction(Function &F) {
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-#ifndef NDEBUG
- if (PreserveLCSSA) {
- assert(DT && "DT not available.");
- assert(LI && "LI not available.");
- bool InLCSSA = all_of(
- *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
- assert(InLCSSA && "Requested to preserve LCSSA, but it's already broken.");
- }
-#endif
// Simplify each loop nest in the function.
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
@@ -846,17 +800,14 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F,
ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- // FIXME: This pass should verify that the loops on which it's operating
- // are in canonical SSA form, and that the pass itself preserves this form.
+  // Note that we don't preserve LCSSA in the new PM; if you need it, run LCSSA
+ // after simplifying the loops.
for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
- Changed |= simplifyLoop(*I, DT, LI, SE, AC, true /* PreserveLCSSA */);
-
- // FIXME: We need to invalidate this to avoid PR28400. Is there a better
- // solution?
- AM.invalidate<ScalarEvolutionAnalysis>(F);
+ Changed |= simplifyLoop(*I, DT, LI, SE, AC, /*PreserveLCSSA*/ false);
if (!Changed)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<LoopAnalysis>();
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index e346ebd..f2527f8 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -16,7 +16,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -27,6 +26,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
@@ -38,6 +38,7 @@
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;
#define DEBUG_TYPE "loop-unroll"
@@ -51,6 +52,16 @@ UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
cl::desc("Allow runtime unrolled loops to be unrolled "
"with epilog instead of prolog."));
+static cl::opt<bool>
+UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
+ cl::desc("Verify domtree after unrolling"),
+#ifdef NDEBUG
+ cl::init(false)
+#else
+ cl::init(true)
+#endif
+ );
+
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
static inline void remapInstruction(Instruction *I,
@@ -205,6 +216,45 @@ const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
}
}
+/// The function chooses which type of unroll (epilog or prolog) is more
+/// profitable.
+/// Epilog unroll is more profitable when there is a PHI that starts from a
+/// constant. In this case the epilog will leave the PHI starting from a
+/// constant, but the prolog will convert it to a non-constant.
+///
+/// loop:
+/// PN = PHI [I, Latch], [CI, PreHeader]
+/// I = foo(PN)
+/// ...
+///
+/// Epilog unroll case.
+/// loop:
+/// PN = PHI [I2, Latch], [CI, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+/// Prolog unroll case.
+/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
+/// loop:
+/// PN = PHI [I2, Latch], [NewPN, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+///
+static bool isEpilogProfitable(Loop *L) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BasicBlock *Header = L->getHeader();
+ assert(PreHeader && Header);
+ for (Instruction &BBI : *Header) {
+ PHINode *PN = dyn_cast<PHINode>(&BBI);
+ if (!PN)
+ break;
+ if (isa<ConstantInt>(PN->getIncomingValueForBlock(PreHeader)))
+ return true;
+ }
+ return false;
+}
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
/// if unrolling was successful, or false if the loop was unmodified. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
@@ -268,6 +318,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
return false;
}
+ // The current loop unroll pass can only unroll loops with a single latch
+ // that's a conditional branch exiting the loop.
+ // FIXME: The implementation can be extended to work with more complicated
+ // cases, e.g. loops with multiple latches.
BasicBlock *Header = L->getHeader();
BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
@@ -278,6 +332,16 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
return false;
}
+ auto CheckSuccessors = [&](unsigned S1, unsigned S2) {
+ return BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2));
+ };
+
+ if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) {
+ DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
+ " exiting the loop can be unrolled\n");
+ return false;
+ }
+
if (Header->hasAddressTaken()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
DEBUG(dbgs() <<
@@ -296,8 +360,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
Count = TripCount;
// Don't enter the unroll code if there is nothing to do.
- if (TripCount == 0 && Count < 2 && PeelCount == 0)
+ if (TripCount == 0 && Count < 2 && PeelCount == 0) {
+ DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
return false;
+ }
assert(Count > 0);
assert(TripMultiple > 0);
@@ -330,7 +396,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
"and peeling for the same loop");
if (PeelCount)
- peelLoop(L, PeelCount, LI, SE, DT, PreserveLCSSA);
+ peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
// Loops containing convergent instructions must have a count that divides
// their TripMultiple.
@@ -346,14 +412,22 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
"convergent operation.");
});
+ bool EpilogProfitability =
+ UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
+ : isEpilogProfitable(L);
+
if (RuntimeTripCount && TripMultiple % Count != 0 &&
!UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
- UnrollRuntimeEpilog, LI, SE, DT,
+ EpilogProfitability, LI, SE, DT,
PreserveLCSSA)) {
if (Force)
RuntimeTripCount = false;
- else
+ else {
+ DEBUG(
+ dbgs() << "Wont unroll; remainder loop could not be generated"
+ "when assuming runtime trip count\n");
return false;
+ }
}
// Notify ScalarEvolution that the loop will be substantially changed,
@@ -446,6 +520,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
for (Loop *SubLoop : *L)
LoopsToSimplify.insert(SubLoop);
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (const DILocation *DIL = I.getDebugLoc())
+ I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
+
for (unsigned It = 1; It != Count; ++It) {
std::vector<BasicBlock*> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -456,19 +536,16 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
Header->getParent()->getBasicBlockList().push_back(New);
+ assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
+ "Header should not be in a sub-loop");
// Tell LI about New.
- if (*BB == Header) {
- assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop");
- L->addBasicBlockToLoop(New, *LI);
- } else {
- const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
- if (OldLoop) {
- LoopsToSimplify.insert(NewLoops[OldLoop]);
+ const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (OldLoop) {
+ LoopsToSimplify.insert(NewLoops[OldLoop]);
- // Forget the old loop, since its inputs may have changed.
- if (SE)
- SE->forgetLoop(OldLoop);
- }
+ // Forget the old loop, since its inputs may have changed.
+ if (SE)
+ SE->forgetLoop(OldLoop);
}
if (*BB == Header)
@@ -615,14 +692,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
Term->eraseFromParent();
}
}
+
// Update dominators of blocks we might reach through exits.
// Immediate dominator of such block might change, because we add more
// routes which can lead to the exit: we can now reach it from the copied
- // iterations too. Thus, the new idom of the block will be the nearest
- // common dominator of the previous idom and common dominator of all copies of
- // the previous idom. This is equivalent to the nearest common dominator of
- // the previous idom and the first latch, which dominates all copies of the
- // previous idom.
+ // iterations too.
if (DT && Count > 1) {
for (auto *BB : OriginalLoopBlocks) {
auto *BBDomNode = DT->getNode(BB);
@@ -632,12 +706,38 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
if (!L->contains(ChildBB))
ChildrenToUpdate.push_back(ChildBB);
}
- BasicBlock *NewIDom = DT->findNearestCommonDominator(BB, Latches[0]);
+ BasicBlock *NewIDom;
+ if (BB == LatchBlock) {
+ // The latch is special because we emit unconditional branches in
+ // some cases where the original loop contained a conditional branch.
+ // Since the latch is always at the bottom of the loop, if the latch
+ // dominated an exit before unrolling, the new dominator of that exit
+ // must also be a latch. Specifically, the dominator is the first
+ // latch which ends in a conditional branch, or the last latch if
+ // there is no such latch.
+ NewIDom = Latches.back();
+ for (BasicBlock *IterLatch : Latches) {
+ TerminatorInst *Term = IterLatch->getTerminator();
+ if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
+ NewIDom = IterLatch;
+ break;
+ }
+ }
+ } else {
+ // The new idom of the block will be the nearest common dominator
+ // of all copies of the previous idom. This is equivalent to the
+ // nearest common dominator of the previous idom and the first latch,
+ // which dominates all copies of the previous idom.
+ NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
+ }
for (auto *ChildBB : ChildrenToUpdate)
DT->changeImmediateDominator(ChildBB, NewIDom);
}
}
+ if (DT && UnrollVerifyDomtree)
+ DT->verifyDomTree();
+
// Merge adjacent basic blocks, if possible.
SmallPtrSet<Loop *, 4> ForgottenLoops;
for (BasicBlock *Latch : Latches) {
@@ -655,16 +755,9 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
}
}
- // FIXME: We only preserve DT info for complete unrolling now. Incrementally
- // updating domtree after partial loop unrolling should also be easy.
- if (DT && !CompletelyUnroll)
- DT->recalculate(*L->getHeader()->getParent());
- else if (DT)
- DEBUG(DT->verifyDomTree());
-
// Simplify any new induction variables in the partially unrolled loop.
if (SE && !CompletelyUnroll && Count > 1) {
- SmallVector<WeakVH, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
// Aggressively clean up dead instructions that simplifyLoopIVs already
@@ -684,7 +777,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
Instruction *Inst = &*I++;
- if (Value *V = SimplifyInstruction(Inst, DL))
+ if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
if (LI->replacementPreservesLCSSAForm(Inst, V))
Inst->replaceAllUsesWith(V);
if (isInstructionTriviallyDead(Inst))
@@ -721,29 +814,29 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
// at least one layer outside of the loop that was unrolled so that any
// changes to the parent loop exposed by the unrolling are considered.
if (DT) {
- if (!OuterL && !CompletelyUnroll)
- OuterL = L;
if (OuterL) {
// OuterL includes all loops for which we can break loop-simplify, so
// it's sufficient to simplify only it (it'll recursively simplify inner
// loops too).
+ if (NeedToFixLCSSA) {
+ // LCSSA must be performed on the outermost affected loop. The unrolled
+ // loop's last loop latch is guaranteed to be in the outermost loop
+ // after LoopInfo's been updated by markAsRemoved.
+ Loop *LatchLoop = LI->getLoopFor(Latches.back());
+ Loop *FixLCSSALoop = OuterL;
+ if (!FixLCSSALoop->contains(LatchLoop))
+ while (FixLCSSALoop->getParentLoop() != LatchLoop)
+ FixLCSSALoop = FixLCSSALoop->getParentLoop();
+
+ formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
+ } else if (PreserveLCSSA) {
+ assert(OuterL->isLCSSAForm(*DT) &&
+ "Loops should be in LCSSA form after loop-unroll.");
+ }
+
+      // TODO: That might be compile-time expensive. We should try
// to fix the loop-simplified form incrementally.
simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
-
- // LCSSA must be performed on the outermost affected loop. The unrolled
- // loop's last loop latch is guaranteed to be in the outermost loop after
- // LoopInfo's been updated by markAsRemoved.
- Loop *LatchLoop = LI->getLoopFor(Latches.back());
- if (!OuterL->contains(LatchLoop))
- while (OuterL->getParentLoop() != LatchLoop)
- OuterL = OuterL->getParentLoop();
-
- if (NeedToFixLCSSA)
- formLCSSARecursively(*OuterL, *DT, LI, SE);
- else
- assert(OuterL->isLCSSAForm(*DT) &&
- "Loops should be in LCSSA form after loop-unroll.");
} else {
// Simplify loops for which we might've broken loop-simplify form.
for (Loop *SubLoop : LoopsToSimplify)
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 842cf31..5c21490 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -28,6 +28,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
@@ -45,6 +46,11 @@ static cl::opt<unsigned> UnrollForcePeelCount(
"unroll-force-peel-count", cl::init(0), cl::Hidden,
cl::desc("Force a peel count regardless of profiling information."));
+// Designates that a Phi is estimated to become invariant after an "infinite"
+// number of loop iterations (i.e. it may only become invariant if the loop is
+// fully unrolled).
+static const unsigned InfiniteIterationsToInvariance = UINT_MAX;
+
// Check whether we are capable of peeling this loop.
static bool canPeel(Loop *L) {
// Make sure the loop is in simplified form
@@ -55,12 +61,72 @@ static bool canPeel(Loop *L) {
if (!L->getExitingBlock() || !L->getUniqueExitBlock())
return false;
+ // Don't try to peel loops where the latch is not the exiting block.
+ // This can be an indication of two different things:
+ // 1) The loop is not rotated.
+ // 2) The loop contains irreducible control flow that involves the latch.
+ if (L->getLoopLatch() != L->getExitingBlock())
+ return false;
+
return true;
}
+// This function calculates the number of iterations after which the given Phi
+// becomes an invariant. The pre-calculated values are memorized in the map. The
+// function (shortcut is I) is calculated according to the following definition:
+// Given %x = phi <Inputs from above the loop>, ..., [%y, %back.edge].
+// If %y is a loop invariant, then I(%x) = 1.
+// If %y is a Phi from the loop header, I(%x) = I(%y) + 1.
+// Otherwise, I(%x) is infinite.
+// TODO: Actually if %y is an expression that depends only on Phi %z and some
+// loop invariants, we can estimate I(%x) = I(%z) + 1. The example
+// looks like:
+// %x = phi(0, %a), <-- becomes invariant starting from 3rd iteration.
+// %y = phi(0, 5),
+// %a = %y + 1.
+static unsigned calculateIterationsToInvariance(
+ PHINode *Phi, Loop *L, BasicBlock *BackEdge,
+ SmallDenseMap<PHINode *, unsigned> &IterationsToInvariance) {
+ assert(Phi->getParent() == L->getHeader() &&
+ "Non-loop Phi should not be checked for turning into invariant.");
+ assert(BackEdge == L->getLoopLatch() && "Wrong latch?");
+ // If we already know the answer, take it from the map.
+ auto I = IterationsToInvariance.find(Phi);
+ if (I != IterationsToInvariance.end())
+ return I->second;
+
+ // Otherwise we need to analyze the input from the back edge.
+ Value *Input = Phi->getIncomingValueForBlock(BackEdge);
+  // Place infinity into the map to avoid infinite recursion for cyclic Phis.
+  // Such cycles can never stop on an invariant.
+ IterationsToInvariance[Phi] = InfiniteIterationsToInvariance;
+ unsigned ToInvariance = InfiniteIterationsToInvariance;
+
+ if (L->isLoopInvariant(Input))
+ ToInvariance = 1u;
+ else if (PHINode *IncPhi = dyn_cast<PHINode>(Input)) {
+ // Only consider Phis in header block.
+ if (IncPhi->getParent() != L->getHeader())
+ return InfiniteIterationsToInvariance;
+ // If the input becomes an invariant after X iterations, then our Phi
+ // becomes an invariant after X + 1 iterations.
+ unsigned InputToInvariance = calculateIterationsToInvariance(
+ IncPhi, L, BackEdge, IterationsToInvariance);
+ if (InputToInvariance != InfiniteIterationsToInvariance)
+ ToInvariance = InputToInvariance + 1u;
+ }
+
+ // If we found that this Phi lies in an invariant chain, update the map.
+ if (ToInvariance != InfiniteIterationsToInvariance)
+ IterationsToInvariance[Phi] = ToInvariance;
+ return ToInvariance;
+}
+
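A small worked example of the recurrence above (hypothetical IR, shown as comments):

// %y = phi [0, %preheader], [7, %latch]   ; backedge input is invariant,
//                                         ; so I(%y) = 1
// %x = phi [1, %preheader], [%y, %latch]  ; backedge input is a header Phi,
//                                         ; so I(%x) = I(%y) + 1 = 2
// Peeling the maximum over all header Phis, here 2 iterations, leaves every
// Phi loop-invariant in the remaining loop.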
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP) {
+ TargetTransformInfo::UnrollingPreferences &UP,
+ unsigned &TripCount) {
+ assert(LoopSize > 0 && "Zero loop size is not allowed!");
UP.PeelCount = 0;
if (!canPeel(L))
return;
@@ -69,6 +135,46 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (!L->empty())
return;
+ // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
+  // iterations of the loop. For this we compute the number of iterations after
+ // which every Phi is guaranteed to become an invariant, and try to peel the
+ // maximum number of iterations among these values, thus turning all those
+ // Phis into invariants.
+ // First, check that we can peel at least one iteration.
+ if (2 * LoopSize <= UP.Threshold && UnrollPeelMaxCount > 0) {
+ // Store the pre-calculated values here.
+ SmallDenseMap<PHINode *, unsigned> IterationsToInvariance;
+    // Now go through all Phis to calculate the number of iterations they
+ // need to become invariants.
+ unsigned DesiredPeelCount = 0;
+ BasicBlock *BackEdge = L->getLoopLatch();
+ assert(BackEdge && "Loop is not in simplified form?");
+ for (auto BI = L->getHeader()->begin(); isa<PHINode>(&*BI); ++BI) {
+ PHINode *Phi = cast<PHINode>(&*BI);
+ unsigned ToInvariance = calculateIterationsToInvariance(
+ Phi, L, BackEdge, IterationsToInvariance);
+ if (ToInvariance != InfiniteIterationsToInvariance)
+ DesiredPeelCount = std::max(DesiredPeelCount, ToInvariance);
+ }
+ if (DesiredPeelCount > 0) {
+      // Respect the limitations implied by loop size and the max peel count.
+ unsigned MaxPeelCount = UnrollPeelMaxCount;
+ MaxPeelCount = std::min(MaxPeelCount, UP.Threshold / LoopSize - 1);
+ DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
+ // Consider max peel count limitation.
+ assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
+ DEBUG(dbgs() << "Peel " << DesiredPeelCount << " iteration(s) to turn"
+ << " some Phis into invariants.\n");
+ UP.PeelCount = DesiredPeelCount;
+ return;
+ }
+ }
+
+ // Bail if we know the statically calculated trip count.
+  // In this case we prefer partial unrolling instead.
+ if (TripCount)
+ return;
+
// If the user provided a peel count, use that.
bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
if (UserPeelCount) {
@@ -164,7 +270,8 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
BasicBlock *InsertBot, BasicBlock *Exit,
SmallVectorImpl<BasicBlock *> &NewBlocks,
LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
- ValueToValueMapTy &LVMap, LoopInfo *LI) {
+ ValueToValueMapTy &LVMap, DominatorTree *DT,
+ LoopInfo *LI) {
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
@@ -185,6 +292,17 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
ParentLoop->addBasicBlockToLoop(NewBB, *LI);
VMap[*BB] = NewBB;
+
+ // If dominator tree is available, insert nodes to represent cloned blocks.
+ if (DT) {
+ if (Header == *BB)
+ DT->addNewBlock(NewBB, InsertTop);
+ else {
+ DomTreeNode *IDom = DT->getNode(*BB)->getIDom();
+ // VMap must contain entry for IDom, as the iteration order is RPO.
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDom->getBlock()]));
+ }
+ }
}
// Hook-up the control flow for the newly inserted blocks.
@@ -198,11 +316,13 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
// The backedge now goes to the "bottom", which is either the loop's real
// header (for the last peeled iteration) or the copied header of the next
// iteration (for every other iteration)
- BranchInst *LatchBR =
- cast<BranchInst>(cast<BasicBlock>(VMap[Latch])->getTerminator());
+ BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+ BranchInst *LatchBR = cast<BranchInst>(NewLatch->getTerminator());
unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
LatchBR->setSuccessor(HeaderIdx, InsertBot);
LatchBR->setSuccessor(1 - HeaderIdx, Exit);
+ if (DT)
+ DT->changeImmediateDominator(InsertBot, NewLatch);
// The new copy of the loop body starts with a bunch of PHI nodes
// that pick an incoming value from either the preheader, or the previous
@@ -257,7 +377,7 @@ static void cloneLoopBlocks(Loop *L, unsigned IterNumber, BasicBlock *InsertTop,
/// optimizations.
bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
- bool PreserveLCSSA) {
+ AssumptionCache *AC, bool PreserveLCSSA) {
if (!canPeel(L))
return false;
@@ -358,7 +478,24 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
CurHeaderWeight = 1;
cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
- NewBlocks, LoopBlocks, VMap, LVMap, LI);
+ NewBlocks, LoopBlocks, VMap, LVMap, DT, LI);
+
+ // Remap to use values from the current iteration instead of the
+ // previous one.
+ remapInstructionsInBlocks(NewBlocks, VMap);
+
+ if (DT) {
+      // Latches of the cloned loops dominate the loop exit, so the idom of
+      // the latter is the first cloned loop body, as the original PreHeader
+      // dominates the original loop body.
+ if (Iter == 0)
+ DT->changeImmediateDominator(Exit, cast<BasicBlock>(LVMap[Latch]));
+#ifndef NDEBUG
+ if (VerifyDomInfo)
+ DT->verifyDomTree();
+#endif
+ }
+
updateBranchWeights(InsertBot, cast<BranchInst>(VMap[LatchBR]), Iter,
PeelCount, ExitWeight);
@@ -369,10 +506,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
F->getBasicBlockList().splice(InsertTop->getIterator(),
F->getBasicBlockList(),
NewBlocks[0]->getIterator(), F->end());
-
- // Remap to use values from the current iteration instead of the
- // previous one.
- remapInstructionsInBlocks(NewBlocks, VMap);
}
// Now adjust the phi nodes in the loop header to get their initial values
@@ -405,9 +538,16 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
}
// If the loop is nested, we changed the parent loop, update SE.
- if (Loop *ParentLoop = L->getParentLoop())
+ if (Loop *ParentLoop = L->getParentLoop()) {
SE->forgetLoop(ParentLoop);
+ // FIXME: Incrementally update loop-simplify
+ simplifyLoop(ParentLoop, DT, LI, SE, AC, PreserveLCSSA);
+ } else {
+ // FIXME: Incrementally update loop-simplify
+ simplifyLoop(L, DT, LI, SE, AC, PreserveLCSSA);
+ }
+
NumPeeled++;
return true;
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index d3ea156..d43ce7a 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -21,8 +21,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
@@ -37,6 +37,8 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
using namespace llvm;
@@ -45,6 +47,10 @@ using namespace llvm;
STATISTIC(NumRuntimeUnrolled,
"Number of loops unrolled with run-time trip counts");
+static cl::opt<bool> UnrollRuntimeMultiExit(
+ "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolling for loops with multiple exits, when "
+ "epilog is generated"));
/// Connect the unrolling prolog code to the original loop.
/// The unrolling prolog code contains code to execute the
@@ -60,9 +66,11 @@ STATISTIC(NumRuntimeUnrolled,
/// than the unroll factor.
///
static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
- BasicBlock *PrologExit, BasicBlock *PreHeader,
- BasicBlock *NewPreHeader, ValueToValueMapTy &VMap,
- DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *PrologExit,
+ BasicBlock *OriginalLoopLatchExit,
+ BasicBlock *PreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
BasicBlock *Latch = L->getLoopLatch();
assert(Latch && "Loop must have a latch");
BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
@@ -137,15 +145,15 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
// then (BECount + 1) cannot unsigned-overflow.
Value *BrLoopExit =
B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
- BasicBlock *Exit = L->getUniqueExitBlock();
- assert(Exit && "Loop must have a single exit block only");
// Split the exit to maintain loop canonicalization guarantees
- SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
- SplitBlockPredecessors(Exit, Preds, ".unr-lcssa", DT, LI,
+ SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
+ SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
PreserveLCSSA);
// Add the branch to the exit block (around the unrolled loop)
- B.CreateCondBr(BrLoopExit, Exit, NewPreHeader);
+ B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
}
/// Connect the unrolling epilog code to the original loop.
@@ -260,13 +268,20 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
IRBuilder<> B(InsertPt);
Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
assert(Exit && "Loop must have a single exit block only");
- // Split the exit to maintain loop canonicalization guarantees
+ // Split the epilogue exit to maintain loop canonicalization guarantees
SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
PreserveLCSSA);
// Add the branch to the exit block (around the unrolling loop)
B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(Exit, NewExit);
+
+ // Split the main loop exit to maintain canonicalization guarantees.
+ SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
+ SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI,
+ PreserveLCSSA);
}
/// Create a clone of the blocks in a loop and connect them together.
@@ -276,35 +291,23 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
/// The cloned blocks should be inserted between InsertTop and InsertBot.
/// If loop structure is cloned InsertTop should be new preheader, InsertBot
/// new loop exit.
-///
-static void CloneLoopBlocks(Loop *L, Value *NewIter,
- const bool CreateRemainderLoop,
- const bool UseEpilogRemainder,
- BasicBlock *InsertTop, BasicBlock *InsertBot,
- BasicBlock *Preheader,
- std::vector<BasicBlock *> &NewBlocks,
- LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
- LoopInfo *LI) {
+/// Return the new cloned loop that is created when CreateRemainderLoop is true.
+static Loop *
+CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
+ const bool UseEpilogRemainder, BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Preheader,
+ std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
+ ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
BasicBlock *Header = L->getHeader();
BasicBlock *Latch = L->getLoopLatch();
Function *F = Header->getParent();
LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
- Loop *NewLoop = nullptr;
Loop *ParentLoop = L->getParentLoop();
- if (CreateRemainderLoop) {
- NewLoop = new Loop();
- if (ParentLoop)
- ParentLoop->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
- }
-
NewLoopsMap NewLoops;
- if (NewLoop)
- NewLoops[L] = NewLoop;
- else if (ParentLoop)
+ NewLoops[ParentLoop] = ParentLoop;
+ if (!CreateRemainderLoop)
NewLoops[L] = ParentLoop;
// For each block in the original loop, create a new copy,
@@ -312,7 +315,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
NewBlocks.push_back(NewBB);
-
+
// If we're unrolling the outermost loop, there's no remainder loop,
// and this block isn't in a nested loop, then the new block is not
// in any loop. Otherwise, add it to loopinfo.
@@ -326,6 +329,17 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
InsertTop->getTerminator()->setSuccessor(0, NewBB);
}
+ if (DT) {
+ if (Header == *BB) {
+ // The header is dominated by the preheader.
+ DT->addNewBlock(NewBB, InsertTop);
+ } else {
+ // Copy information from original loop to unrolled loop.
+ BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+ }
+ }
+
if (Latch == *BB) {
// For the last block, if CreateRemainderLoop is false, create a direct
// jump to InsertBot. If not, create a loop back to cloned head.
@@ -376,7 +390,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
NewPHI->setIncomingValue(idx, V);
}
}
- if (NewLoop) {
+ if (CreateRemainderLoop) {
+ Loop *NewLoop = NewLoops[L];
+ assert(NewLoop && "L should have been cloned");
// Add unroll disable metadata to disable future unrolling for this loop.
SmallVector<Metadata *, 4> MDs;
// Reserve first location for self reference to the LoopID metadata node.
@@ -406,9 +422,56 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
// Set operand 0 to refer to the loop id itself.
NewLoopID->replaceOperandWith(0, NewLoopID);
NewLoop->setLoopID(NewLoopID);
+ return NewLoop;
}
+ else
+ return nullptr;
+}
+
+/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
+/// is populated with all the loop exit blocks other than the LatchExit block.
+static bool
+canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits,
+ BasicBlock *LatchExit, bool PreserveLCSSA,
+ bool UseEpilogRemainder) {
+
+ // Support runtime unrolling for multiple exit blocks and multiple exiting
+ // blocks.
+ if (!UnrollRuntimeMultiExit)
+ return false;
+ // Even if runtime multi-exit unrolling is enabled, we currently have some
+ // correctness constraints when unrolling a multi-exit loop.
+ // We rely on LCSSA form being preserved when the exit blocks are transformed.
+ if (!PreserveLCSSA)
+ return false;
+ SmallVector<BasicBlock *, 4> Exits;
+ L->getUniqueExitBlocks(Exits);
+ for (auto *BB : Exits)
+ if (BB != LatchExit)
+ OtherExits.push_back(BB);
+
+ // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
+ // UnrollRuntimeMultiExit is true. This will need updating the logic in
+ // connectEpilog/connectProlog.
+ if (!LatchExit->getSinglePredecessor()) {
+ DEBUG(dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
+ "predecessor.\n");
+ return false;
+ }
+ // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
+ // and L is an inner loop. This is because in presence of multiple exits, the
+ // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
+ // outer loop. This is automatically handled in the prolog case, so we do not
+ // have that bug in prolog generation.
+ if (UseEpilogRemainder && L->getParentLoop())
+ return false;
+
+ // All constraints have been satisfied.
+ return true;
}
+
+
/// Insert code in the prolog/epilog code when unrolling a loop with a
/// run-time trip-count.
///
@@ -452,18 +515,40 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
bool UseEpilogRemainder,
LoopInfo *LI, ScalarEvolution *SE,
DominatorTree *DT, bool PreserveLCSSA) {
- // for now, only unroll loops that contain a single exit
- if (!L->getExitingBlock())
- return false;
+ DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
+ DEBUG(L->dump());
- // Make sure the loop is in canonical form, and there is a single
- // exit block only.
- if (!L->isLoopSimplifyForm())
- return false;
- BasicBlock *Exit = L->getUniqueExitBlock(); // successor out of loop
- if (!Exit)
+ // Make sure the loop is in canonical form.
+ if (!L->isLoopSimplifyForm()) {
+ DEBUG(dbgs() << "Not in simplify form!\n");
return false;
+ }
+ // Guaranteed by LoopSimplifyForm.
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+ BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
+ // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
+ // targets of the Latch be an exit block out of the loop. This needs
+ // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
+ assert(!L->contains(LatchExit) &&
+ "one of the loop latch successors should be the exit block!");
+ // These are exit blocks other than the target of the latch exiting block.
+ SmallVector<BasicBlock *, 4> OtherExits;
+ bool isMultiExitUnrollingEnabled = canSafelyUnrollMultiExitLoop(
+ L, OtherExits, LatchExit, PreserveLCSSA, UseEpilogRemainder);
+ // Support only a single exit and a single exiting block unless multi-exit
+ // loop unrolling is enabled.
+ if (!isMultiExitUnrollingEnabled &&
+ (!L->getExitingBlock() || OtherExits.size())) {
+ DEBUG(
+ dbgs()
+ << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
+ "enabled!\n");
+ return false;
+ }
// Use Scalar Evolution to compute the trip count. This allows more loops to
// be unrolled than relying on induction var simplification.
if (!SE)
@@ -471,34 +556,44 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Only unroll loops with a computable trip count, and the trip count needs
// to be an int value (allowing a pointer type is a TODO item).
- const SCEV *BECountSC = SE->getBackedgeTakenCount(L);
+ // We calculate the backedge count by using getExitCount on the Latch block,
+ // which is proven to be the only exiting block in this loop. This is the
+ // same as calculating getBackedgeTakenCount on the loop (which computes
+ // SCEV for all exiting blocks).
+ const SCEV *BECountSC = SE->getExitCount(L, Latch);
if (isa<SCEVCouldNotCompute>(BECountSC) ||
- !BECountSC->getType()->isIntegerTy())
+ !BECountSC->getType()->isIntegerTy()) {
+ DEBUG(dbgs() << "Could not compute exit block SCEV\n");
return false;
+ }
unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
// Add 1 since the backedge count doesn't include the first loop iteration.
const SCEV *TripCountSC =
SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
- if (isa<SCEVCouldNotCompute>(TripCountSC))
+ if (isa<SCEVCouldNotCompute>(TripCountSC)) {
+ DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
return false;
+ }
- BasicBlock *Header = L->getHeader();
BasicBlock *PreHeader = L->getLoopPreheader();
BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
const DataLayout &DL = Header->getModule()->getDataLayout();
SCEVExpander Expander(*SE, DL, "loop-unroll");
if (!AllowExpensiveTripCount &&
- Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR))
+ Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
+ DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
return false;
+ }
// This constraint lets us deal with an overflowing trip count easily; see the
// comment on ModVal below.
- if (Log2_32(Count) > BEWidth)
+ if (Log2_32(Count) > BEWidth) {
+ DEBUG(dbgs()
+ << "Count failed constraint on overflow trip count calculation.\n");
return false;
-
- BasicBlock *Latch = L->getLoopLatch();
+ }
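+ // For example, Count = 8 (Log2 = 3) with a 2-bit-wide backedge count bails
+ // out here. Keeping Log2(Count) <= BEWidth means the power-of-two Count
+ // divides 2^BEWidth, so the modulo arithmetic on ModVal stays correct even
+ // when BECount + 1 wraps to zero.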
// Loop structure is the following:
//
@@ -506,7 +601,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Header
// ...
// Latch
- // Exit
+ // LatchExit
BasicBlock *NewPreHeader;
BasicBlock *NewExit = nullptr;
@@ -519,9 +614,9 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Split PreHeader to insert a branch around loop for unrolling.
NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
NewPreHeader->setName(PreHeader->getName() + ".new");
- // Split Exit to create phi nodes from branch above.
- SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
- NewExit = SplitBlockPredecessors(Exit, Preds, ".unr-lcssa",
+ // Split LatchExit to create phi nodes from branch above.
+ SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
+ NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa",
DT, LI, PreserveLCSSA);
// Split NewExit to insert epilog remainder loop.
EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
@@ -548,7 +643,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Latch Header
// *NewExit ...
// *EpilogPreHeader Latch
- // Exit Exit
+ // LatchExit LatchExit
// Calculate conditions for branch around loop for unrolling
// in epilog case and around prolog remainder loop in prolog case.
@@ -599,6 +694,12 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Branch to either remainder (extra iterations) loop or unrolling loop.
B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
PreHeaderBR->eraseFromParent();
+ if (DT) {
+ if (UseEpilogRemainder)
+ DT->changeImmediateDominator(NewExit, PreHeader);
+ else
+ DT->changeImmediateDominator(PrologExit, PreHeader);
+ }
Function *F = Header->getParent();
// Get an ordered list of blocks in the loop to help with the ordering of the
// cloned blocks in the prolog/epilog code
@@ -620,10 +721,11 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// Clone all the basic blocks in the loop. If Count is 2, we don't clone
// the loop, otherwise we create a cloned loop to execute the extra
// iterations. This function adds the appropriate CFG connections.
- BasicBlock *InsertBot = UseEpilogRemainder ? Exit : PrologExit;
+ BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
- CloneLoopBlocks(L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop,
- InsertBot, NewPreHeader, NewBlocks, LoopBlocks, VMap, LI);
+ Loop *remainderLoop = CloneLoopBlocks(
+ L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot,
+ NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
// Insert the cloned blocks into the function.
F->getBasicBlockList().splice(InsertBot->getIterator(),
@@ -631,6 +733,66 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
NewBlocks[0]->getIterator(),
F->end());
+ // Now the loop blocks are cloned and the other exiting blocks from the
+ // remainder are connected to the original Loop's exit blocks. The remaining
+ // work is to update the phi nodes in the original loop, and take in the
+ // values from the cloned region. Also update the dominator info for
+ // OtherExits and their immediate successors, since we have new edges into
+ // OtherExits.
+ SmallSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;
+ for (auto *BB : OtherExits) {
+ for (auto &II : *BB) {
+
+ // Given we preserve LCSSA form, we know that the values used outside the
+ // loop will be used through these phi nodes at the exit blocks that are
+ // transformed below.
+ if (!isa<PHINode>(II))
+ break;
+ PHINode *Phi = cast<PHINode>(&II);
+ unsigned oldNumOperands = Phi->getNumIncomingValues();
+ // Add the incoming values from the remainder code to the end of the phi
+ // node.
+ for (unsigned i = 0; i < oldNumOperands; i++) {
+ Value *newVal = VMap[Phi->getIncomingValue(i)];
+ // newVal can be a constant or derived from values outside the loop, and
+ // hence need not have a VMap value.
+ if (!newVal)
+ newVal = Phi->getIncomingValue(i);
+ Phi->addIncoming(newVal,
+ cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
+ }
+ }
+#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
+ for (BasicBlock *SuccBB : successors(BB)) {
+ assert(!(any_of(OtherExits,
+ [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
+ SuccBB == LatchExit) &&
+ "Breaks the definition of dedicated exits!");
+ }
+#endif
+ // Update the dominator info because the immediate dominator is no longer
+ // the header of the original Loop. BB has edges both from L and from the
+ // remainder code. Since the preheader determines whether L runs or control
+ // jumps directly to the remainder code, we set the immediate dominator to
+ // the preheader.
+ if (DT) {
+ DT->changeImmediateDominator(BB, PreHeader);
+ // Also update the IDom for immediate successors of BB. If the current
+ // IDom is the header, update the IDom to be the preheader because that is
+ // the nearest common dominator of all predecessors of SuccBB. We need to
+ // check for IDom being the header because successors of exit blocks can
+ // have edges from outside the loop, and we should not incorrectly update
+ // the IDom in that case.
+ for (BasicBlock *SuccBB: successors(BB))
+ if (ImmediateSuccessorsOfExitBlocks.insert(SuccBB).second) {
+ if (DT->getNode(SuccBB)->getIDom()->getBlock() == Header) {
+ assert(!SuccBB->getSinglePredecessor() &&
+ "BB should be the IDom then!");
+ DT->changeImmediateDominator(SuccBB, PreHeader);
+ }
+ }
+ }
+ }
+
// Loop structure should be the following:
// Epilog Prolog
//
@@ -644,7 +806,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// EpilogHeader Header
// ... ...
// EpilogLatch Latch
- // Exit Exit
+ // LatchExit LatchExit
// Rewrite the cloned instruction operands to use the values created when the
// clone is created.
@@ -658,7 +820,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
if (UseEpilogRemainder) {
// Connect the epilog code to the original loop and update the
// PHI functions.
- ConnectEpilog(L, ModVal, NewExit, Exit, PreHeader,
+ ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
EpilogPreHeader, NewPreHeader, VMap, DT, LI,
PreserveLCSSA);
@@ -684,8 +846,8 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
} else {
// Connect the prolog code to the original loop and update the
// PHI functions.
- ConnectProlog(L, BECount, Count, PrologExit, PreHeader, NewPreHeader,
- VMap, DT, LI, PreserveLCSSA);
+ ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
+ NewPreHeader, VMap, DT, LI, PreserveLCSSA);
}
// If this loop is nested, then the loop unroller changes the code in the
@@ -693,6 +855,19 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
if (Loop *ParentLoop = L->getParentLoop())
SE->forgetLoop(ParentLoop);
+ // Canonicalize to LoopSimplifyForm both original and remainder loops. We
+ // cannot rely on the LoopUnrollPass to do this because it only does
+ // canonicalization for parent/subloops and not the sibling loops.
+ if (OtherExits.size() > 0) {
+ // Generate dedicated exit blocks for the original loop, to preserve
+ // LoopSimplifyForm.
+ formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
+ // Generate dedicated exit blocks for the remainder loop if one exists, to
+ // preserve LoopSimplifyForm.
+ if (remainderLoop)
+ formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
+ }
+
NumRuntimeUnrolled++;
return true;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
index c8efa9e..3c52278 100644
--- a/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -12,16 +12,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -29,6 +30,7 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -87,8 +89,7 @@ RecurrenceDescriptor::lookThroughAnd(PHINode *Phi, Type *&RT,
// Matches either I & 2^x-1 or 2^x-1 & I. If we find a match, we update RT
// with a new integer type of the corresponding bit width.
- if (match(J, m_CombineOr(m_And(m_Instruction(I), m_APInt(M)),
- m_And(m_APInt(M), m_Instruction(I))))) {
+ if (match(J, m_c_And(m_Instruction(I), m_APInt(M)))) {
int32_t Bits = (*M + 1).exactLogBase2();
if (Bits > 0) {
RT = IntegerType::get(Phi->getContext(), Bits);
@@ -230,8 +231,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
// - PHI:
// - All uses of the PHI must be the reduction (safe).
// - Otherwise, not safe.
- // - By one instruction outside of the loop (safe).
- // - By further instructions outside of the loop (not safe).
+ // - By instructions outside of the loop (safe).
+ // * One value may have several outside users, but all outside
+ // uses must be of the same value.
// - By an instruction that is not part of the reduction (not safe).
// This is either:
// * An instruction type other than PHI or the reduction operation.
@@ -297,10 +299,15 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurrenceKind Kind,
// Check if we found the exit user.
BasicBlock *Parent = UI->getParent();
if (!TheLoop->contains(Parent)) {
- // Exit if you find multiple outside users or if the header phi node is
- // being used. In this case the user uses the value of the previous
- // iteration, in which case we would loose "VF-1" iterations of the
- // reduction operation if we vectorize.
+ // If we already know this instruction is used externally, move on to
+ // the next user.
+ if (ExitInstruction == Cur)
+ continue;
+
+ // Exit if you find multiple values used outside or if the header phi
+ // node is being used. In this case the user uses the value of the
+ // previous iteration, in which case we would lose "VF-1" iterations of
+ // the reduction operation if we vectorize.
if (ExitInstruction != nullptr || Cur == Phi)
return false;
@@ -521,8 +528,9 @@ bool RecurrenceDescriptor::isReductionPHI(PHINode *Phi, Loop *TheLoop,
return false;
}
-bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
- DominatorTree *DT) {
+bool RecurrenceDescriptor::isFirstOrderRecurrence(
+ PHINode *Phi, Loop *TheLoop,
+ DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
// Ensure the phi node is in the loop header and has two incoming values.
if (Phi->getParent() != TheLoop->getHeader() ||
@@ -544,16 +552,29 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
// Get the previous value. The previous value comes from the latch edge while
// the initial value comes from the preheader edge.
auto *Previous = dyn_cast<Instruction>(Phi->getIncomingValueForBlock(Latch));
- if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
+ if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous) ||
+ SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
return false;
- // Ensure every user of the phi node is dominated by the previous value. The
- // dominance requirement ensures the loop vectorizer will not need to
+ // Ensure every user of the phi node is dominated by the previous value.
+ // The dominance requirement ensures the loop vectorizer will not need to
// vectorize the initial value prior to the first iteration of the loop.
+ // TODO: Consider extending this sinking to handle other kinds of instructions
+ // and expressions, beyond sinking a single cast past Previous.
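+ // For instance, if the header phi %p has a single cast user in the same
+ // block, e.g. %c = sext i32 %p to i64, and the cast's sole user is
+ // dominated by Previous, the cast can be sunk to just after Previous and
+ // the recurrence is accepted.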
+ if (Phi->hasOneUse()) {
+ auto *I = Phi->user_back();
+ if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() &&
+ DT->dominates(Previous, I->user_back())) {
+ SinkAfter[I] = Previous;
+ return true;
+ }
+ }
+
for (User *U : Phi->users())
- if (auto *I = dyn_cast<Instruction>(U))
+ if (auto *I = dyn_cast<Instruction>(U)) {
if (!DT->dominates(Previous, I))
return false;
+ }
return true;
}
@@ -916,6 +937,69 @@ bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop,
return true;
}
+bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA) {
+ bool Changed = false;
+
+ // We reuse a vector for the in-loop predecessors.
+ SmallVector<BasicBlock *, 4> InLoopPredecessors;
+
+ auto RewriteExit = [&](BasicBlock *BB) {
+ assert(InLoopPredecessors.empty() &&
+ "Must start with an empty predecessors list!");
+ auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); });
+
+ // See if there are any non-loop predecessors of this exit block and
+ // keep track of the in-loop predecessors.
+ bool IsDedicatedExit = true;
+ for (auto *PredBB : predecessors(BB))
+ if (L->contains(PredBB)) {
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ // We cannot rewrite exiting edges from an indirectbr.
+ return false;
+
+ InLoopPredecessors.push_back(PredBB);
+ } else {
+ IsDedicatedExit = false;
+ }
+
+ assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!");
+
+ // Nothing to do if this is already a dedicated exit.
+ if (IsDedicatedExit)
+ return false;
+
+ auto *NewExitBB = SplitBlockPredecessors(
+ BB, InLoopPredecessors, ".loopexit", DT, LI, PreserveLCSSA);
+
+ if (!NewExitBB)
+ DEBUG(dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
+ << *L << "\n");
+ else
+ DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
+ << NewExitBB->getName() << "\n");
+ return true;
+ };
+
+ // Walk the exit blocks directly rather than building up a data structure for
+ // them, but only visit each one once.
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ for (auto *BB : L->blocks())
+ for (auto *SuccBB : successors(BB)) {
+ // We're looking for exit blocks so skip in-loop successors.
+ if (L->contains(SuccBB))
+ continue;
+
+ // Visit each exit block exactly once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ Changed |= RewriteExit(SuccBB);
+ }
+
+ return Changed;
+}
+
/// \brief Returns the instructions that use values defined in the loop.
SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
SmallVector<Instruction *, 8> UsedOutside;
@@ -1105,3 +1189,208 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
else
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+ ArrayRef<Value *> RedOps) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ TmpVec, Shuf, "bin.rdx"));
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+ Shuf);
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+ IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ Value *Src, TargetTransformInfo::ReductionFlags Flags,
+ ArrayRef<Value *> RedOps) {
+ assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+ Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+ std::function<Value*()> BuildFunc;
+ using RD = RecurrenceDescriptor;
+ RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+ // TODO: Support creating ordered reductions.
+ FastMathFlags FMFUnsafe;
+ FMFUnsafe.setUnsafeAlgebra();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+ break;
+ case Instruction::Mul:
+ BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+ break;
+ case Instruction::And:
+ BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+ break;
+ case Instruction::Or:
+ BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+ break;
+ case Instruction::Xor:
+ BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+ break;
+ case Instruction::FAdd:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::FMul:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::ICmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+ };
+ } else {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+ };
+ }
+ break;
+ case Instruction::FCmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = RD::MRK_FloatMax;
+ BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+ } else {
+ MinMaxKind = RD::MRK_FloatMin;
+ BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+ }
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode");
+ break;
+ }
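+ // E.g. an add reduction of <4 x i32> becomes either a single
+ // llvm.experimental.vector.reduce.add call or the log2 shuffle sequence
+ // above, depending on what the target reports through TTI.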
+ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ return BuildFunc();
+ return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN) {
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ auto getSimpleRdx = [&](unsigned Opc) {
+ return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+ };
+ switch (RecKind) {
+ case RecurrenceDescriptor::RK_FloatAdd:
+ return getSimpleRdx(Instruction::FAdd);
+ case RecurrenceDescriptor::RK_FloatMult:
+ return getSimpleRdx(Instruction::FMul);
+ case RecurrenceDescriptor::RK_IntegerAdd:
+ return getSimpleRdx(Instruction::Add);
+ case RecurrenceDescriptor::RK_IntegerMult:
+ return getSimpleRdx(Instruction::Mul);
+ case RecurrenceDescriptor::RK_IntegerAnd:
+ return getSimpleRdx(Instruction::And);
+ case RecurrenceDescriptor::RK_IntegerOr:
+ return getSimpleRdx(Instruction::Or);
+ case RecurrenceDescriptor::RK_IntegerXor:
+ return getSimpleRdx(Instruction::Xor);
+ case RecurrenceDescriptor::RK_IntegerMinMax: {
+ switch (Desc.getMinMaxRecurrenceKind()) {
+ case RecurrenceDescriptor::MRK_SIntMax:
+ Flags.IsSigned = true;
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMax:
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_SIntMin:
+ Flags.IsSigned = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMin:
+ break;
+ default:
+ llvm_unreachable("Unhandled MRK");
+ }
+ return getSimpleRdx(Instruction::ICmp);
+ }
+ case RecurrenceDescriptor::RK_FloatMinMax: {
+ Flags.IsMaxOp =
+ Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+ return getSimpleRdx(Instruction::FCmp);
+ }
+ default:
+ llvm_unreachable("Unhandled RecKind");
+ }
+}
+
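+/// Propagate the intersection of the IR flags of the instructions in \p VL
+/// onto \p I. When \p OpValue is non-null, only instructions in \p VL whose
+/// opcode matches \p OpValue's contribute to the intersection.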
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
+ auto *VecOp = dyn_cast<Instruction>(I);
+ if (!VecOp)
+ return;
+ auto *Intersection = (OpValue == nullptr) ? dyn_cast<Instruction>(VL[0])
+ : dyn_cast<Instruction>(OpValue);
+ if (!Intersection)
+ return;
+ const unsigned Opcode = Intersection->getOpcode();
+ VecOp->copyIRFlags(Intersection);
+ for (auto *V : VL) {
+ auto *Instr = dyn_cast<Instruction>(V);
+ if (!Instr)
+ continue;
+ if (OpValue == nullptr || Opcode == Instr->getOpcode())
+ VecOp->andIRFlags(V);
+ }
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
new file mode 100644
index 0000000..900450b
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -0,0 +1,510 @@
+//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+static unsigned getLoopOperandSizeInBytes(Type *Type) {
+ if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
+ return VTy->getBitWidth() / 8;
+ }
+
+ return Type->getPrimitiveSizeInBits() / 8;
+}
+
+void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, ConstantInt *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length copies.
+ if (CopyLen->isZero())
+ return;
+
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB = nullptr;
+ Function *ParentFunc = PreLoopBB->getParent();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ Type *TypeOfCopyLen = CopyLen->getType();
+ Type *LoopOpType =
+ TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+
+ unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+ uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
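+ // For example, a 10-byte copy with a 4-byte loop operand type yields
+ // LoopEndCount = 2; the remaining 2 bytes are handled by the residual
+ // copies emitted below.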
+
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ if (LoopEndCount != 0) {
+ // Split
+ PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
+ PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ // Cast the Src and Dst pointers to pointers to the loop operand type (if
+ // needed).
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
+ // Loop Body
+ Value *SrcGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+ Value *DstGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ // Create the loop branch condition.
+ Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
+ LoopBB, PostLoopBB);
+ }
+
+ uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+ uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
+ if (RemainingBytes) {
+ IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
+ : InsertBefore);
+
+ // Update the alignment based on the copy size used in the loop body.
+ SrcAlign = std::min(SrcAlign, LoopOpSize);
+ DestAlign = std::min(DestAlign, LoopOpSize);
+
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAlign, DestAlign);
+
+ for (auto OpTy : RemainingOps) {
+ // Calculate the new index.
+ unsigned OperandSize = getLoopOperandSizeInBytes(OpTy);
+ uint64_t GepIndex = BytesCopied / OperandSize;
+ assert(GepIndex * OperandSize == BytesCopied &&
+ "Division should have no Remainder!");
+ // Cast source to operand type and load
+ PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
+ Value *CastedSrc = SrcAddr->getType() == SrcPtrType
+ ? SrcAddr
+ : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
+ Value *SrcGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ Value *Load = RBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+
+ // Cast destination to operand type and store.
+ PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
+ Value *CastedDst = DstAddr->getType() == DstPtrType
+ ? DstAddr
+ : RBuilder.CreateBitCast(DstAddr, DstPtrType);
+ Value *DstGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ RBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ BytesCopied += OperandSize;
+ }
+ }
+ assert(BytesCopied == CopyLen->getZExtValue() &&
+ "Bytes copied should match size in the call!");
+}
+
+void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, unsigned SrcAlign,
+ unsigned DestAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB =
+ PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
+
+ Function *ParentFunc = PreLoopBB->getParent();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ Type *LoopOpType =
+ TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+ unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ // Calculate the loop trip count, and remaining bytes to copy after the loop.
+ Type *CopyLenType = CopyLen->getType();
+ IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
+ assert(ILengthType &&
+ "expected size argument to memcpy to be an integer type!");
+ ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+ Value *RuntimeLoopCount = PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+ Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+ Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
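+ // For example, with CopyLen = 17 and LoopOpSize = 4 this computes a trip
+ // count of 4, a 1-byte residual, and 16 bytes copied by the main loop.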
+
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, nullptr);
+ IRBuilder<> LoopBuilder(LoopBB);
+
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
+
+ Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+ Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ if (LoopOpType != Int8Type) {
+ // Loop body for the residual copy.
+ BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
+ PreLoopBB->getParent(), nullptr);
+ // Residual loop header.
+ BasicBlock *ResHeaderBB = BasicBlock::Create(
+ Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
+
+ // Need to update the pre-loop basic block to branch to the correct place:
+ // branch to the main loop if the count is non-zero, to the residual loop
+ // if the copy size is smaller than one iteration of the main loop but
+ // non-zero, and past the residual loop if the memcpy size is zero.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, ResHeaderBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ ResHeaderBB);
+
+ // Determine if we need to branch to the residual loop or bypass it.
+ IRBuilder<> RHBuilder(ResHeaderBB);
+ RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+ ResLoopBB, PostLoopBB);
+
+ // Copy the residual with single byte load/store loop.
+ IRBuilder<> ResBuilder(ResLoopBB);
+ PHINode *ResidualIndex =
+ ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
+ ResidualIndex->addIncoming(Zero, ResHeaderBB);
+
+ Value *SrcAsInt8 =
+ ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
+ Value *DstAsInt8 =
+ ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
+ Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
+ Value *SrcGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
+ Value *Load = ResBuilder.CreateLoad(SrcGEP, SrcIsVolatile);
+ Value *DstGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
+ ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+
+ Value *ResNewIndex =
+ ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
+ ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
+
+ // Create the loop branch condition.
+ ResBuilder.CreateCondBr(
+ ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+ PostLoopBB);
+ } else {
+ // In this case the loop operand type was a byte, and there is no need for a
+ // residual loop to copy the remaining memory after the main loop.
+ // We do, however, need to patch up the control flow by creating the
+ // terminators for the pre-loop block and the memcpy loop.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, PostLoopBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ PostLoopBB);
+ }
+}
+
+void llvm::createMemCpyLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ BasicBlock *NewBB =
+ InsertBefore->getParent()->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop",
+ F, NewBB);
+
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // SrcAddr and DstAddr are expected to be pointer types,
+ // so no check is made here.
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ // Cast pointers to (char *)
+ SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
+ DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
+
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+ // load from SrcAddr+LoopIndex
+ // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
+ // word-sized loads and stores.
+ Value *Element =
+ LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
+ LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
+ SrcIsVolatile);
+ // store at DstAddr+LoopIndex
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
+ DstAddr, LoopIndex),
+ DstIsVolatile);
+
+ // The value for LoopIndex coming from backedge is (LoopIndex + 1)
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// unsigned char* d = dst;
+// const unsigned char* s = src;
+// if (s < d) {
+// // copy backwards
+// while (n--) {
+// d[n] = s[n];
+// }
+// } else {
+// // copy forward
+// for (size_t i = 0; i < n; ++i) {
+// d[i] = s[i];
+// }
+// }
+// return dst;
+// }
+static void createMemMoveLoop(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+ unsigned SrcAlign, unsigned DestAlign,
+ bool SrcIsVolatile, bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+
+ // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the backwards-copy
+ // part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
+ SrcAddr, DstAddr, "compare_src_dst");
+ TerminatorInst *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
+ &ElseTerm);
+
+ // Each part of the function consists of two blocks:
+ // copy_backwards: used to skip the loop when n == 0
+ // copy_backwards_loop: the actual backwards loop BB
+ // copy_forward: used to skip the loop when n == 0
+ // copy_forward_loop: the actual forward loop BB
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ CopyBackwardsBB->setName("copy_backwards");
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ CopyForwardBB->setName("copy_forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("memmove_done");
+
+ // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+ // between both backwards and forward copy clauses.
+ ICmpInst *CompareN =
+ new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+ ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+ // Copying backwards.
+ BasicBlock *LoopBB =
+ BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ Value *IndexPtr = LoopBuilder.CreateSub(
+ LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+ Value *Element = LoopBuilder.CreateLoad(
+ LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
+ LoopBuilder.CreateStore(Element,
+ LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+ ExitBB, LoopBB);
+ LoopPhi->addIncoming(IndexPtr, LoopBB);
+ LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+ BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+ ThenTerm->eraseFromParent();
+
+ // Copying forward.
+ BasicBlock *FwdLoopBB =
+ BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
+ IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+ PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+ Value *FwdElement = FwdLoopBuilder.CreateLoad(
+ FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
+ FwdLoopBuilder.CreateStore(
+ FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
+ Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+ FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+ FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+ ExitBB, FwdLoopBB);
+ FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+ FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+ BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+ ElseTerm->eraseFromParent();
+}
+
+static void createMemSetLoop(Instruction *InsertBefore,
+ Value *DstAddr, Value *CopyLen, Value *SetValue,
+ unsigned Align, bool IsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ BasicBlock *NewBB =
+ OrigBB->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB
+ = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // Cast pointer to the type of value getting stored
+ unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ DstAddr = Builder.CreateBitCast(DstAddr,
+ PointerType::get(SetValue->getType(), dstAS));
+
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+ LoopBuilder.CreateStore(
+ SetValue,
+ LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+ IsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
+
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
+ const TargetTransformInfo &TTI) {
+ // Original implementation: lower to a simple byte-wise copy loop.
+ if (!TTI.useWideIRMemcpyLoopLowering()) {
+ createMemCpyLoop(/* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getAlignment(),
+ /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile());
+ } else {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+ createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ CI,
+ /* SrcAlign */ Memcpy->getAlignment(),
+ /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
+ } else {
+ createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getAlignment(),
+ /* DestAlign */ Memcpy->getAlignment(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
+ }
+ }
+}
+
+void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+ createMemMoveLoop(/* InsertBefore */ Memmove,
+ /* SrcAddr */ Memmove->getRawSource(),
+ /* DstAddr */ Memmove->getRawDest(),
+ /* CopyLen */ Memmove->getLength(),
+ /* SrcAlign */ Memmove->getAlignment(),
+ /* DestAlign */ Memmove->getAlignment(),
+ /* SrcIsVolatile */ Memmove->isVolatile(),
+ /* DstIsVolatile */ Memmove->isVolatile());
+}
+
+void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
+ createMemSetLoop(/* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* CopyLen */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* Alignment */ Memset->getAlignment(),
+ Memset->isVolatile());
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index 75cd3bc..890afbc 100644
--- a/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -13,7 +13,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -24,6 +23,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
#include <algorithm>
@@ -356,10 +356,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
unsigned numCmps = 0;
// Start with "simple" cases
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
- Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
- i.getCaseSuccessor()));
-
+ for (auto Case : SI->cases())
+ Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+
std::sort(Cases.begin(), Cases.end(), CaseCmp());
// Merge case into clusters
@@ -403,6 +403,14 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI,
Value *Val = SI->getCondition(); // The value we are switching on...
BasicBlock* Default = SI->getDefaultDest();
+ // Don't handle unreachable blocks. If there are successors with phis, this
+ // would leave them behind with missing predecessors.
+ if ((CurBlock != &F->getEntryBlock() && pred_empty(CurBlock)) ||
+ CurBlock->getSinglePredecessor() == CurBlock) {
+ DeleteList.insert(CurBlock);
+ return;
+ }
+
// If there is only the default destination, just branch.
if (!SI->getNumCases()) {
BranchInst::Create(Default, CurBlock);
diff --git a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
index 24b3b12..b659a2e 100644
--- a/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Mem2Reg.cpp
@@ -46,7 +46,7 @@ static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
if (Allocas.empty())
break;
- PromoteMemToReg(Allocas, DT, nullptr, &AC);
+ PromoteMemToReg(Allocas, DT, &AC);
NumPromoted += Allocas.size();
Changed = true;
}
@@ -59,8 +59,9 @@ PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
if (!promoteMemoryToRegister(F, DT, AC))
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
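+  // Promoting allocas to registers rewrites instructions but leaves the CFG
+  // untouched, so analyses that depend only on the CFG remain valid.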
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
namespace {
diff --git a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
index c999bd0..9f2ad54 100644
--- a/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/MetaRenamer.cpp
@@ -13,15 +13,16 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
using namespace llvm;
namespace {
@@ -67,6 +68,7 @@ namespace {
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
}
@@ -110,9 +112,15 @@ namespace {
}
// Rename all functions
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
for (auto &F : M) {
StringRef Name = F.getName();
- if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+ LibFunc Tmp;
+ // Leave library functions alone because their presence or absence could
+ // affect the behavior of other passes.
+ if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
+ TLI.getLibFunc(F, Tmp))
continue;
F.setName(renamer.newName());
@@ -139,8 +147,11 @@ namespace {
}
char MetaRenamer::ID = 0;
-INITIALIZE_PASS(MetaRenamer, "metarenamer",
- "Assign new names to everything", false, false)
+INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
//===----------------------------------------------------------------------===//
//
// MetaRenamer - Rename everything with metasyntactic names.
diff --git a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index 0d623df..2ef3d63 100644
--- a/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
// Upgrade a 2-field global array type to the new 3-field format if needed.
if (Data && OldEltTy->getNumElements() < 3)
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
else
EltTy = OldEltTy;
if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
for (unsigned i = 0; i != n; ++i) {
auto Ctor = cast<Constant>(Init->getOperand(i));
if (EltTy != OldEltTy)
- Ctor = ConstantStruct::get(
- EltTy, Ctor->getAggregateElement((unsigned)0),
- Ctor->getAggregateElement(1),
- Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+ Ctor =
+ ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+ Ctor->getAggregateElement(1),
+ Constant::getNullValue(IRB.getInt8PtrTy()));
CurrentCtors.push_back(Ctor);
}
}
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
} else {
// Use the new three-field struct if there isn't one already.
EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
- IRB.getInt8PtrTy(), nullptr);
+ IRB.getInt8PtrTy());
}
// Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
@@ -130,13 +130,25 @@ void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
if (isa<Function>(FuncOrBitcast))
return cast<Function>(FuncOrBitcast);
- FuncOrBitcast->dump();
+ FuncOrBitcast->print(errs());
+ errs() << '\n';
std::string Err;
raw_string_ostream Stream(Err);
Stream << "Sanitizer interface function redefined: " << *FuncOrBitcast;
report_fatal_error(Err);
}
+Function *llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes) {
+ assert(!InitName.empty() && "Expected init function name");
+ Function *F = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+ InitName,
+ FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
+ AttributeList()));
+ F->setLinkage(Function::ExternalLinkage);
+ return F;
+}
+
std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
Module &M, StringRef CtorName, StringRef InitName,
ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
@@ -144,22 +156,19 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
assert(!InitName.empty() && "Expected init function name");
assert(InitArgs.size() == InitArgTypes.size() &&
"Sanitizer's init function expects different number of arguments");
+ Function *InitFunction =
+ declareSanitizerInitFunction(M, InitName, InitArgTypes);
Function *Ctor = Function::Create(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
GlobalValue::InternalLinkage, CtorName, &M);
BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB));
- Function *InitFunction =
- checkSanitizerInterfaceFunction(M.getOrInsertFunction(
- InitName, FunctionType::get(IRB.getVoidTy(), InitArgTypes, false),
- AttributeSet()));
- InitFunction->setLinkage(Function::ExternalLinkage);
IRB.CreateCall(InitFunction, InitArgs);
if (!VersionCheckName.empty()) {
Function *VersionCheckFunction =
checkSanitizerInterfaceFunction(M.getOrInsertFunction(
VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
- AttributeSet()));
+ AttributeList()));
IRB.CreateCall(VersionCheckFunction, {});
}
return std::make_pair(Ctor, InitFunction);
@@ -228,3 +237,35 @@ void llvm::filterDeadComdatFunctions(
ComdatEntriesCovered.end();
});
}
+
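+/// Compute a unique identifier for this module by hashing the names of all
+/// globals it defines with external linkage. Returns "$<md5>" on success, or
+/// the empty string when the module exports no symbols.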
+std::string llvm::getUniqueModuleId(Module *M) {
+ MD5 Md5;
+ bool ExportsSymbols = false;
+ auto AddGlobal = [&](GlobalValue &GV) {
+ if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
+ !GV.hasExternalLinkage())
+ return;
+ ExportsSymbols = true;
+ Md5.update(GV.getName());
+ Md5.update(ArrayRef<uint8_t>{0});
+ };
+
+ for (auto &F : *M)
+ AddGlobal(F);
+ for (auto &GV : M->globals())
+ AddGlobal(GV);
+ for (auto &GA : M->aliases())
+ AddGlobal(GA);
+ for (auto &IF : M->ifuncs())
+ AddGlobal(IF);
+
+ if (!ExportsSymbols)
+ return "";
+
+ MD5::MD5Result R;
+ Md5.final(R);
+
+ SmallString<32> Str;
+ MD5::stringifyResult(R, Str);
+ return ("$" + Str).str();
+}
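A brief usage sketch (hypothetical caller): the result is "" for modules that
export nothing, and otherwise "$" followed by the 32 hex digits of the MD5 of
the exported names, usable as a per-module suffix when promoting locals:

std::string ModuleId = getUniqueModuleId(&M); // e.g. "$2ab4c0..."
if (!ModuleId.empty())
  GV->setName(GV->getName().str() + ModuleId); // GV: some local being promoted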
diff --git a/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp
new file mode 100644
index 0000000..dc78054
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/OrderedInstructions.cpp
@@ -0,0 +1,32 @@
+//===-- OrderedInstructions.cpp - Instruction dominance function ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility to check the dominance relation of two instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/OrderedInstructions.h"
+using namespace llvm;
+
+/// Given two instructions, use OrderedBasicBlock to check the dominance
+/// relation if the instructions are in the same basic block; otherwise, use
+/// the dominator tree.
+bool OrderedInstructions::dominates(const Instruction *InstA,
+ const Instruction *InstB) const {
+ const BasicBlock *IBB = InstA->getParent();
+ // Use ordered basic block to do dominance check in case the 2 instructions
+ // are in the same basic block.
+ if (IBB == InstB->getParent()) {
+ auto OBB = OBBMap.find(IBB);
+ if (OBB == OBBMap.end())
+ OBB = OBBMap.insert({IBB, make_unique<OrderedBasicBlock>(IBB)}).first;
+ return OBB->second->dominates(InstA, InstB);
+ }
+ return DT->dominates(InstA->getParent(), InstB->getParent());
+}
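A minimal usage sketch, assuming an existing Function &F and two instructions
InstA and InstB (illustrative, not part of this commit):

// The first same-block query builds that block's OrderedBasicBlock lazily;
// subsequent queries in the block are answered from the cached numbering.
DominatorTree DT(F);
OrderedInstructions OI(&DT);
bool InstADomB = OI.dominates(InstA, InstB);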
diff --git a/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp
new file mode 100644
index 0000000..d4cdaed
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -0,0 +1,793 @@
+//===-- PredicateInfo.cpp - PredicateInfo Builder--------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the PredicateInfo class.
+//
+//===----------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/OrderedInstructions.h"
+#include <algorithm>
+#define DEBUG_TYPE "predicateinfo"
+using namespace llvm;
+using namespace PatternMatch;
+using namespace llvm::PredicateInfoClasses;
+
+INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+static cl::opt<bool> VerifyPredicateInfo(
+ "verify-predicateinfo", cl::init(false), cl::Hidden,
+ cl::desc("Verify PredicateInfo in legacy printer pass."));
+namespace {
+DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
+ "Controls which variables are renamed with predicateinfo")
+// Given a predicate info that is a type of branching terminator, get the
+// branching block.
+const BasicBlock *getBranchBlock(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Only branches and switches should have PHIOnly defs that "
+ "require branch blocks.");
+ return cast<PredicateWithEdge>(PB)->From;
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// branching terminator.
+static Instruction *getBranchTerminator(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get a terminator from.");
+ return cast<PredicateWithEdge>(PB)->From->getTerminator();
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// edge this predicate info represents
+const std::pair<BasicBlock *, BasicBlock *>
+getBlockEdge(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get an edge from.");
+ const auto *PEdge = cast<PredicateWithEdge>(PB);
+ return std::make_pair(PEdge->From, PEdge->To);
+}
+}
+
+namespace llvm {
+namespace PredicateInfoClasses {
+enum LocalNum {
+ // Operations that must appear first in the block.
+ LN_First,
+ // Operations that are somewhere in the middle of the block, and are sorted on
+ // demand.
+ LN_Middle,
+ // Operations that must appear last in a block, like successor phi node uses.
+ LN_Last
+};
+
+// Associate global and local DFS info with defs and uses, so we can sort them
+// into a global domination ordering.
+struct ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ unsigned int LocalNum = LN_Middle;
+ // Only one of Def or Use will be set.
+ Value *Def = nullptr;
+ Use *U = nullptr;
+ // Neither PInfo nor EdgeOnly participate in the ordering
+ PredicateBase *PInfo = nullptr;
+ bool EdgeOnly = false;
+};
+
+// Perform a strict weak ordering on instructions and arguments.
+static bool valueComesBefore(OrderedInstructions &OI, const Value *A,
+ const Value *B) {
+ auto *ArgA = dyn_cast_or_null<Argument>(A);
+ auto *ArgB = dyn_cast_or_null<Argument>(B);
+ if (ArgA && !ArgB)
+ return true;
+ if (ArgB && !ArgA)
+ return false;
+ if (ArgA && ArgB)
+ return ArgA->getArgNo() < ArgB->getArgNo();
+ return OI.dominates(cast<Instruction>(A), cast<Instruction>(B));
+}
+
+// This compares ValueDFS structures, creating OrderedBasicBlocks where
+// necessary to compare uses/defs in the same block. Doing so allows us to walk
+// the minimum number of instructions necessary to compute our def/use ordering.
+struct ValueDFS_Compare {
+ OrderedInstructions &OI;
+ ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {}
+
+ bool operator()(const ValueDFS &A, const ValueDFS &B) const {
+ if (&A == &B)
+ return false;
+    // The only case we can't directly compare them is when they are in the
+    // same block, and both have localnum == middle.  In that case, we have to
+    // use comesbefore to see what the real ordering is, because they are in
+    // the same basic block.
+
+ bool SameBlock = std::tie(A.DFSIn, A.DFSOut) == std::tie(B.DFSIn, B.DFSOut);
+
+    // We want to put the def that will get used for a given set of phi uses
+    // before those phi uses.
+    // So we sort by edge, then by def.
+    // Note that only phi node uses and defs can come last.
+ if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
+ return comparePHIRelated(A, B);
+
+ if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
+ return std::tie(A.DFSIn, A.DFSOut, A.LocalNum, A.Def, A.U) <
+ std::tie(B.DFSIn, B.DFSOut, B.LocalNum, B.Def, B.U);
+ return localComesBefore(A, B);
+ }
+
+ // For a phi use, or a non-materialized def, return the edge it represents.
+ const std::pair<BasicBlock *, BasicBlock *>
+ getBlockEdge(const ValueDFS &VD) const {
+ if (!VD.Def && VD.U) {
+ auto *PHI = cast<PHINode>(VD.U->getUser());
+ return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
+ }
+ // This is really a non-materialized def.
+ return ::getBlockEdge(VD.PInfo);
+ }
+
+ // For two phi related values, return the ordering.
+ bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
+ auto &ABlockEdge = getBlockEdge(A);
+ auto &BBlockEdge = getBlockEdge(B);
+ // Now sort by block edge and then defs before uses.
+ return std::tie(ABlockEdge, A.Def, A.U) < std::tie(BBlockEdge, B.Def, B.U);
+ }
+
+ // Get the definition of an instruction that occurs in the middle of a block.
+ Value *getMiddleDef(const ValueDFS &VD) const {
+ if (VD.Def)
+ return VD.Def;
+    // It's possible for the defs and uses to be null.  For branches, the local
+    // numbering will say the placed predicateinfos should go first (i.e.
+    // LN_First), so we won't be in this function. For assumes, we will end
+    // up here, because we need to order the def we will place relative to the
+    // assume.  So for the purpose of ordering, we pretend the def is the assume
+    // because that is where we will insert the info.
+ if (!VD.U) {
+ assert(VD.PInfo &&
+ "No def, no use, and no predicateinfo should not occur");
+ assert(isa<PredicateAssume>(VD.PInfo) &&
+ "Middle of block should only occur for assumes");
+ return cast<PredicateAssume>(VD.PInfo)->AssumeInst;
+ }
+ return nullptr;
+ }
+
+ // Return either the Def, if it's not null, or the user of the Use, if the def
+ // is null.
+ const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
+ if (Def)
+ return cast<Instruction>(Def);
+ return cast<Instruction>(U->getUser());
+ }
+
+ // This performs the necessary local basic block ordering checks to tell
+ // whether A comes before B, where both are in the same basic block.
+ bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
+ auto *ADef = getMiddleDef(A);
+ auto *BDef = getMiddleDef(B);
+
+ // See if we have real values or uses. If we have real values, we are
+ // guaranteed they are instructions or arguments. No matter what, we are
+ // guaranteed they are in the same block if they are instructions.
+ auto *ArgA = dyn_cast_or_null<Argument>(ADef);
+ auto *ArgB = dyn_cast_or_null<Argument>(BDef);
+
+ if (ArgA || ArgB)
+ return valueComesBefore(OI, ArgA, ArgB);
+
+ auto *AInst = getDefOrUser(ADef, A.U);
+ auto *BInst = getDefOrUser(BDef, B.U);
+ return valueComesBefore(OI, AInst, BInst);
+ }
+};
+
+} // namespace PredicateInfoClasses
+
+bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack,
+ const ValueDFS &VDUse) const {
+ if (Stack.empty())
+ return false;
+ // If it's a phi only use, make sure it's for this phi node edge, and that the
+ // use is in a phi node. If it's anything else, and the top of the stack is
+ // EdgeOnly, we need to pop the stack. We deliberately sort phi uses next to
+ // the defs they must go with so that we can know it's time to pop the stack
+ // when we hit the end of the phi uses for a given def.
+ if (Stack.back().EdgeOnly) {
+ if (!VDUse.U)
+ return false;
+ auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
+ if (!PHI)
+ return false;
+ // Check edge
+ BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
+ if (EdgePred != getBranchBlock(Stack.back().PInfo))
+ return false;
+
+ // Use dominates, which knows how to handle edge dominance.
+ return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
+ }
+
+ return (VDUse.DFSIn >= Stack.back().DFSIn &&
+ VDUse.DFSOut <= Stack.back().DFSOut);
+}
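The closing interval test is the standard DFS-numbering dominance check; a
self-contained sketch of the idea, with illustrative types rather than the
LLVM ones:

// A dominates B iff B's dominator-tree DFS interval nests inside A's.
struct DFSInterval { int In, Out; };
static bool dfsDominates(DFSInterval A, DFSInterval B) {
  return B.In >= A.In && B.Out <= A.Out;
}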
+
+void PredicateInfo::popStackUntilDFSScope(ValueDFSStack &Stack,
+ const ValueDFS &VD) {
+ while (!Stack.empty() && !stackIsInScope(Stack, VD))
+ Stack.pop_back();
+}
+
+// Convert the uses of Op into a vector of uses, associating global and local
+// DFS info with each one.
+void PredicateInfo::convertUsesToDFSOrdered(
+ Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+ for (auto &U : Op->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ ValueDFS VD;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *PN = dyn_cast<PHINode>(I)) {
+ IBlock = PN->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VD.LocalNum = LN_Last;
+ } else {
+ // If it's not a phi node use, it is somewhere in the middle of the
+ // block.
+ IBlock = I->getParent();
+ VD.LocalNum = LN_Middle;
+ }
+ DomTreeNode *DomNode = DT.getNode(IBlock);
+ // It's possible our use is in an unreachable block. Skip it if so.
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.U = &U;
+ DFSOrderedSet.push_back(VD);
+ }
+ }
+}
+
+// Collect relevant operations from Comparison that we may want to insert copies
+// for.
+void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
+ auto *Op0 = Comparison->getOperand(0);
+ auto *Op1 = Comparison->getOperand(1);
+ if (Op0 == Op1)
+ return;
+ CmpOperands.push_back(Comparison);
+ // Only want real values, not constants. Additionally, operands with one use
+ // are only being used in the comparison, which means they will not be useful
+ // for us to consider for predicateinfo.
+ //
+ if ((isa<Instruction>(Op0) || isa<Argument>(Op0)) && !Op0->hasOneUse())
+ CmpOperands.push_back(Op0);
+ if ((isa<Instruction>(Op1) || isa<Argument>(Op1)) && !Op1->hasOneUse())
+ CmpOperands.push_back(Op1);
+}
+
+// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
+void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op,
+ PredicateBase *PB) {
+ OpsToRename.insert(Op);
+ auto &OperandInfo = getOrCreateValueInfo(Op);
+ AllInfos.push_back(PB);
+ OperandInfo.Infos.push_back(PB);
+}
+
+// Process an assume instruction and place relevant operations we want to rename
+// into OpsToRename.
+void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ // See if we have a comparison we support
+ SmallVector<Value *, 8> CmpOperands;
+ SmallVector<Value *, 2> ConditionsToProcess;
+ CmpInst::Predicate Pred;
+ Value *Operand = II->getOperand(0);
+ if (m_c_And(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value()))
+ .match(II->getOperand(0))) {
+ ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(0));
+ ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(1));
+ ConditionsToProcess.push_back(Operand);
+  } else if (isa<CmpInst>(Operand)) {
+    ConditionsToProcess.push_back(Operand);
+ }
+ for (auto Cond : ConditionsToProcess) {
+ if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+ collectCmpOps(Cmp, CmpOperands);
+ // Now add our copy infos for our operands
+ for (auto *Op : CmpOperands) {
+ auto *PA = new PredicateAssume(Op, II, Cmp);
+ addInfoFor(OpsToRename, Op, PA);
+ }
+ CmpOperands.clear();
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+ // Otherwise, it should be an AND.
+ assert(BinOp->getOpcode() == Instruction::And &&
+ "Should have been an AND");
+ auto *PA = new PredicateAssume(BinOp, II, BinOp);
+ addInfoFor(OpsToRename, BinOp, PA);
+ } else {
+ llvm_unreachable("Unknown type of condition");
+ }
+ }
+}
+
+// Process a block terminating branch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ BasicBlock *FirstBB = BI->getSuccessor(0);
+ BasicBlock *SecondBB = BI->getSuccessor(1);
+ SmallVector<BasicBlock *, 2> SuccsToProcess;
+ SuccsToProcess.push_back(FirstBB);
+ SuccsToProcess.push_back(SecondBB);
+ SmallVector<Value *, 2> ConditionsToProcess;
+
+ auto InsertHelper = [&](Value *Op, bool isAnd, bool isOr, Value *Cond) {
+ for (auto *Succ : SuccsToProcess) {
+ // Don't try to insert on a self-edge. This is mainly because we will
+      // eliminate it during renaming anyway.
+ if (Succ == BranchBB)
+ continue;
+ bool TakenEdge = (Succ == FirstBB);
+ // For and, only insert on the true edge
+ // For or, only insert on the false edge
+ if ((isAnd && !TakenEdge) || (isOr && TakenEdge))
+ continue;
+ PredicateBase *PB =
+ new PredicateBranch(Op, BranchBB, Succ, Cond, TakenEdge);
+ addInfoFor(OpsToRename, Op, PB);
+ if (!Succ->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, Succ});
+ }
+ };
+
+ // Match combinations of conditions.
+ CmpInst::Predicate Pred;
+ bool isAnd = false;
+ bool isOr = false;
+ SmallVector<Value *, 8> CmpOperands;
+ if (match(BI->getCondition(), m_And(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value()))) ||
+ match(BI->getCondition(), m_Or(m_Cmp(Pred, m_Value(), m_Value()),
+ m_Cmp(Pred, m_Value(), m_Value())))) {
+ auto *BinOp = cast<BinaryOperator>(BI->getCondition());
+ if (BinOp->getOpcode() == Instruction::And)
+ isAnd = true;
+ else if (BinOp->getOpcode() == Instruction::Or)
+ isOr = true;
+ ConditionsToProcess.push_back(BinOp->getOperand(0));
+ ConditionsToProcess.push_back(BinOp->getOperand(1));
+ ConditionsToProcess.push_back(BI->getCondition());
+ } else if (isa<CmpInst>(BI->getCondition())) {
+ ConditionsToProcess.push_back(BI->getCondition());
+ }
+ for (auto Cond : ConditionsToProcess) {
+ if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+ collectCmpOps(Cmp, CmpOperands);
+ // Now add our copy infos for our operands
+ for (auto *Op : CmpOperands)
+ InsertHelper(Op, isAnd, isOr, Cmp);
+ } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+ // This must be an AND or an OR.
+ assert((BinOp->getOpcode() == Instruction::And ||
+ BinOp->getOpcode() == Instruction::Or) &&
+ "Should have been an AND or an OR");
+ // The actual value of the binop is not subject to the same restrictions
+ // as the comparison. It's either true or false on the true/false branch.
+ InsertHelper(BinOp, false, false, BinOp);
+ } else {
+ llvm_unreachable("Unknown type of condition");
+ }
+ CmpOperands.clear();
+ }
+}
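The isAnd/isOr filter above encodes short-circuit semantics: for a branch on
'a && b' each operand is known (true) only on the true edge, and for 'a || b'
known (false) only on the false edge. A standalone sketch of the rule, which
mirrors the continue-condition in InsertHelper:

static bool conditionKnownOnEdge(bool IsAnd, bool IsOr, bool TakenEdge) {
  if (IsAnd)
    return TakenEdge;  // a && b: operands only known on the true edge
  if (IsOr)
    return !TakenEdge; // a || b: operands only known on the false edge
  return true;         // a plain condition is known on both edges
}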
+// Process a block terminating switch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB,
+ SmallPtrSetImpl<Value *> &OpsToRename) {
+ Value *Op = SI->getCondition();
+ if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
+ return;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ }
+
+ // Now propagate info for each case value
+ for (auto C : SI->cases()) {
+ BasicBlock *TargetBlock = C.getCaseSuccessor();
+ if (SwitchEdges.lookup(TargetBlock) == 1) {
+ PredicateSwitch *PS = new PredicateSwitch(
+ Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
+ addInfoFor(OpsToRename, Op, PS);
+ if (!TargetBlock->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, TargetBlock});
+ }
+ }
+}
+
+// Build predicate info for our function
+void PredicateInfo::buildPredicateInfo() {
+ DT.updateDFSNumbers();
+ // Collect operands to rename from all conditional branch terminators, as well
+ // as assume statements.
+ SmallPtrSet<Value *, 8> OpsToRename;
+ for (auto DTN : depth_first(DT.getRootNode())) {
+ BasicBlock *BranchBB = DTN->getBlock();
+ if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
+ if (!BI->isConditional())
+ continue;
+ // Can't insert conditional information if they all go to the same place.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
+ processBranch(BI, BranchBB, OpsToRename);
+ } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
+ processSwitch(SI, BranchBB, OpsToRename);
+ }
+ }
+ for (auto &Assume : AC.assumptions()) {
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
+ processAssume(II, II->getParent(), OpsToRename);
+ }
+ // Now rename all our operations.
+ renameUses(OpsToRename);
+}
+
+// Given the renaming stack, make all the operands currently on the stack real
+// by inserting them into the IR. Return the last operation's value.
+Value *PredicateInfo::materializeStack(unsigned int &Counter,
+ ValueDFSStack &RenameStack,
+ Value *OrigOp) {
+ // Find the first thing we have to materialize
+ auto RevIter = RenameStack.rbegin();
+ for (; RevIter != RenameStack.rend(); ++RevIter)
+ if (RevIter->Def)
+ break;
+
+ size_t Start = RevIter - RenameStack.rbegin();
+  // The maximum number of things we should be trying to materialize at once
+  // right now is 4, depending on whether we had an assume, a branch, and both
+  // used an 'and' of conditions.
+ for (auto RenameIter = RenameStack.end() - Start;
+ RenameIter != RenameStack.end(); ++RenameIter) {
+ auto *Op =
+ RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
+ ValueDFS &Result = *RenameIter;
+ auto *ValInfo = Result.PInfo;
+ // For edge predicates, we can just place the operand in the block before
+ // the terminator. For assume, we have to place it right before the assume
+ // to ensure we dominate all of our uses. Always insert right before the
+ // relevant instruction (terminator, assume), so that we insert in proper
+ // order in the case of multiple predicateinfo in the same block.
+ if (isa<PredicateWithEdge>(ValInfo)) {
+ IRBuilder<> B(getBranchTerminator(ValInfo));
+ Function *IF = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ CallInst *PIC =
+ B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
+ PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ } else {
+ auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
+ assert(PAssume &&
+ "Should not have gotten here without it being an assume");
+ IRBuilder<> B(PAssume->AssumeInst);
+ Function *IF = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::ssa_copy, Op->getType());
+ CallInst *PIC = B.CreateCall(IF, Op);
+ PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ }
+ }
+ return RenameStack.back().Def;
+}
+
+// Instead of the standard SSA renaming algorithm, which is O(number of
+// instructions) and walks the entire dominator tree, we walk only the defs +
+// uses. The standard SSA renaming algorithm does not really rely on the
+// dominator tree except to order the stack push/pops of the renaming stacks, so
+// that defs end up getting pushed before hitting the correct uses. This does
+// not require the dominator tree, only the *order* of the dominator tree. The
+// complete and correct ordering of the defs and uses in the dominator tree is
+// contained in the DFS numbering of the dominator tree. So we sort the defs and
+// uses into the DFS ordering, and then just use the renaming stack as per
+// normal, pushing when we hit a def (which is a predicateinfo instruction),
+// popping when we are out of the DFS scope for that def, and replacing any uses
+// with the top of the stack if it exists. In order to handle liveness without
+// propagating liveness info, we don't actually insert the predicateinfo
+// instruction def until we see a use that it would dominate. Once we see such
+// a use, we materialize the predicateinfo instruction in the right place and
+// use it.
+//
+// TODO: Use this algorithm to perform fast single-variable renaming in
+// PromoteMemToReg and MemorySSA.
+void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) {
+ // Sort OpsToRename since we are going to iterate it.
+ SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end());
+ auto Comparator = [&](const Value *A, const Value *B) {
+ return valueComesBefore(OI, A, B);
+ };
+ std::sort(OpsToRename.begin(), OpsToRename.end(), Comparator);
+ ValueDFS_Compare Compare(OI);
+ // Compute liveness, and rename in O(uses) per Op.
+ for (auto *Op : OpsToRename) {
+ unsigned Counter = 0;
+ SmallVector<ValueDFS, 16> OrderedUses;
+ const auto &ValueInfo = getValueInfo(Op);
+ // Insert the possible copies into the def/use list.
+    // They will become real copies if we find a real use for them, and are
+    // never created otherwise.
+ for (auto &PossibleCopy : ValueInfo.Infos) {
+ ValueDFS VD;
+ // Determine where we are going to place the copy by the copy type.
+      // The predicate info for branches always comes first; it will get
+      // materialized in the split block at the top of the block.
+      // The predicate info for assumes will be somewhere in the middle;
+      // it will get materialized in front of the assume.
+ if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
+ VD.LocalNum = LN_Middle;
+ DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ } else if (isa<PredicateWithEdge>(PossibleCopy)) {
+ // If we can only do phi uses, we treat it like it's in the branch
+ // block, and handle it specially. We know that it goes last, and only
+        // dominates phi uses.
+ auto BlockEdge = getBlockEdge(PossibleCopy);
+ if (EdgeUsesOnly.count(BlockEdge)) {
+ VD.LocalNum = LN_Last;
+ auto *DomNode = DT.getNode(BlockEdge.first);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ VD.EdgeOnly = true;
+ OrderedUses.push_back(VD);
+ }
+ } else {
+ // Otherwise, we are in the split block (even though we perform
+ // insertion in the branch block).
+ // Insert a possible copy at the split block and before the branch.
+ VD.LocalNum = LN_First;
+ auto *DomNode = DT.getNode(BlockEdge.second);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ }
+ }
+ }
+ }
+
+ convertUsesToDFSOrdered(Op, OrderedUses);
+ std::sort(OrderedUses.begin(), OrderedUses.end(), Compare);
+ SmallVector<ValueDFS, 8> RenameStack;
+    // For each use, sorted into DFS order, push values and replace uses with
+    // the top of the stack, which will represent the reaching def.
+ for (auto &VD : OrderedUses) {
+ // We currently do not materialize copy over copy, but we should decide if
+ // we want to.
+ bool PossibleCopy = VD.PInfo != nullptr;
+ if (RenameStack.empty()) {
+ DEBUG(dbgs() << "Rename Stack is empty\n");
+ } else {
+ DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+ << RenameStack.back().DFSIn << ","
+ << RenameStack.back().DFSOut << ")\n");
+ }
+
+ DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+ << VD.DFSOut << ")\n");
+
+ bool ShouldPush = (VD.Def || PossibleCopy);
+ bool OutOfScope = !stackIsInScope(RenameStack, VD);
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ popStackUntilDFSScope(RenameStack, VD);
+ if (ShouldPush) {
+ RenameStack.push_back(VD);
+ }
+ }
+      // If we get to this point, and the stack is empty, we must have a use
+      // with no renaming needed; just skip it.
+ if (RenameStack.empty())
+ continue;
+      // Skip values; we only want to rename the uses.
+ if (VD.Def || PossibleCopy)
+ continue;
+ if (!DebugCounter::shouldExecute(RenameCounter)) {
+ DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+ continue;
+ }
+ ValueDFS &Result = RenameStack.back();
+
+ // If the possible copy dominates something, materialize our stack up to
+ // this point. This ensures every comparison that affects our operation
+ // ends up with predicateinfo.
+ if (!Result.Def)
+ Result.Def = materializeStack(Counter, RenameStack, Op);
+
+ DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+ << *VD.U->get() << " in " << *(VD.U->getUser()) << "\n");
+ assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
+ "Predicateinfo def should have dominated this use");
+ VD.U->set(Result.Def);
+ }
+ }
+}
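Stripped of the PredicateInfo specifics, the loop above is an instance of the
following generic pattern; this is a sketch that assumes items arrive
pre-sorted in dominator-tree DFS order, with defs sorted before the uses they
dominate:

#include <vector>
struct RenameItem { int In, Out; bool IsDef; }; // DFS interval + kind
static void renameSorted(std::vector<RenameItem> &Items) {
  std::vector<RenameItem *> Stack;
  for (RenameItem &I : Items) {
    // Pop defs whose DFS scope no longer contains the current item.
    while (!Stack.empty() &&
           !(I.In >= Stack.back()->In && I.Out <= Stack.back()->Out))
      Stack.pop_back();
    if (I.IsDef)
      Stack.push_back(&I); // becomes the current reaching def
    else if (!Stack.empty()) {
      // Rewrite this use to Stack.back()'s def (elided here).
    }
  }
}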
+
+PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) {
+ auto OIN = ValueInfoNums.find(Operand);
+ if (OIN == ValueInfoNums.end()) {
+ // This will grow it
+ ValueInfos.resize(ValueInfos.size() + 1);
+    // This will use the new size and give us a 0-based index for the info
+ auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
+ assert(InsertResult.second && "Value info number already existed?");
+ return ValueInfos[InsertResult.first->second];
+ }
+ return ValueInfos[OIN->second];
+}
+
+const PredicateInfo::ValueInfo &
+PredicateInfo::getValueInfo(Value *Operand) const {
+ auto OINI = ValueInfoNums.lookup(Operand);
+ assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
+ assert(OINI < ValueInfos.size() &&
+ "Value Info Number greater than size of Value Info Table");
+ return ValueInfos[OINI];
+}
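Both accessors rely on slot 0 of ValueInfos being a reserved sentinel (the
constructor below resizes the table to 1 before any operand is registered), so
the value-initialized 0 that DenseMap::lookup returns for a missing key doubles
as "not found". A minimal sketch of the pattern, with illustrative types:

std::vector<int> Table(1);              // Table[0] is the reserved sentinel
llvm::DenseMap<void *, unsigned> Index; // key -> slot in Table
unsigned Slot = Index.lookup(Key);      // 0 means "never registered"
if (Slot != 0)
  consume(Table[Slot]);                 // Key/consume are hypothetical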
+
+PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : F(F), DT(DT), AC(AC), OI(&DT) {
+ // Push an empty operand info so that we can detect 0 as not finding one
+ ValueInfos.resize(1);
+ buildPredicateInfo();
+}
+
+PredicateInfo::~PredicateInfo() {}
+
+void PredicateInfo::verifyPredicateInfo() const {}
+
+char PredicateInfoPrinterLegacyPass::ID = 0;
+
+PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
+ : FunctionPass(ID) {
+ initializePredicateInfoPrinterLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+}
+
+bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto PredInfo = make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(dbgs());
+ if (VerifyPredicateInfo)
+ PredInfo->verifyPredicateInfo();
+ return false;
+}
+
+PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ OS << "PredicateInfo for function: " << F.getName() << "\n";
+ make_unique<PredicateInfo>(F, DT, AC)->print(OS);
+
+ return PreservedAnalyses::all();
+}
+
+/// \brief An assembly annotator class to print PredicateInfo information in
+/// comments.
+class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+ friend class PredicateInfo;
+ const PredicateInfo *PredInfo;
+
+public:
+ PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
+
+ virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) {}
+
+ virtual void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) {
+ if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
+ OS << "; Has predicate info\n";
+ if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
+ OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
+ << " Comparison:" << *PB->Condition << " Edge: [";
+ PB->From->printAsOperand(OS);
+ OS << ",";
+ PB->To->printAsOperand(OS);
+ OS << "] }\n";
+ } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
+ OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
+ << " Switch:" << *PS->Switch << " Edge: [";
+ PS->From->printAsOperand(OS);
+ OS << ",";
+ PS->To->printAsOperand(OS);
+ OS << "] }\n";
+ } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
+ OS << "; assume predicate info {"
+ << " Comparison:" << *PA->Condition << " }\n";
+ }
+ }
+ }
+};
+
+void PredicateInfo::print(raw_ostream &OS) const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(OS, &Writer);
+}
+
+void PredicateInfo::dump() const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(dbgs(), &Writer);
+}
+
+PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
+
+ return PreservedAnalyses::all();
+}
+}
diff --git a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 35faa6f..cdba982 100644
--- a/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -15,7 +15,6 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
@@ -23,6 +22,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -38,6 +38,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <algorithm>
using namespace llvm;
@@ -224,13 +225,10 @@ struct PromoteMem2Reg {
std::vector<AllocaInst *> Allocas;
DominatorTree &DT;
DIBuilder DIB;
-
- /// An AliasSetTracker object to update. If null, don't update it.
- AliasSetTracker *AST;
-
/// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
AssumptionCache *AC;
+ const SimplifyQuery SQ;
/// Reverse mapping of Allocas.
DenseMap<AllocaInst *, unsigned> AllocaLookup;
@@ -269,10 +267,11 @@ struct PromoteMem2Reg {
public:
PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AliasSetTracker *AST, AssumptionCache *AC)
+ AssumptionCache *AC)
: Allocas(Allocas.begin(), Allocas.end()), DT(DT),
DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
- AST(AST), AC(AC) {}
+ AC(AC), SQ(DT.getRoot()->getParent()->getParent()->getDataLayout(),
+ nullptr, &DT, AC) {}
void run();
@@ -301,6 +300,18 @@ private:
} // end of anonymous namespace
+/// Given a LoadInst LI, this adds assume(LI != null) after it.
+static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+ Function *AssumeIntrinsic =
+ Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
+ ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
+ Constant::getNullValue(LI->getType()));
+ LoadNotNull->insertAfter(LI);
+ CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
+ CI->insertAfter(LoadNotNull);
+ AC->registerAssumption(CI);
+}
+
static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
// Knowing that this alloca is promotable, we know that it's safe to kill all
// instructions except for load and store.
@@ -334,9 +345,8 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
/// and thus must be phi-ed with undef. We fall back to the standard alloca
/// promotion algorithm in that case.
static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
- LargeBlockInfo &LBI,
- DominatorTree &DT,
- AliasSetTracker *AST) {
+ LargeBlockInfo &LBI, DominatorTree &DT,
+ AssumptionCache *AC) {
StoreInst *OnlyStore = Info.OnlyStore;
bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
BasicBlock *StoreBB = OnlyStore->getParent();
@@ -387,9 +397,15 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
// code.
if (ReplVal == LI)
ReplVal = UndefValue::get(LI->getType());
+
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
LI->replaceAllUsesWith(ReplVal);
- if (AST && LI->getType()->isPointerTy())
- AST->deleteValue(LI);
LI->eraseFromParent();
LBI.deleteValue(LI);
}
@@ -410,8 +426,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
Info.OnlyStore->eraseFromParent();
LBI.deleteValue(Info.OnlyStore);
- if (AST)
- AST->deleteValue(AI);
AI->eraseFromParent();
LBI.deleteValue(AI);
return true;
@@ -435,7 +449,8 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
/// }
static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
LargeBlockInfo &LBI,
- AliasSetTracker *AST) {
+ DominatorTree &DT,
+ AssumptionCache *AC) {
// The trickiest case to handle is when we have large blocks. Because of this,
// this code is optimized assuming that large blocks happen. This does not
// significantly pessimize the small block case. This uses LargeBlockInfo to
@@ -476,13 +491,18 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
// There is no store before this load, bail out (load may be affected
// by the following stores - see main comment).
return false;
- }
- else
+ } else {
// Otherwise, there was a store before this load, the load takes its value.
- LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0));
+ // Note, if the load was marked as nonnull we don't want to lose that
+ // information when we erase it. So we preserve it with an assume.
+ Value *ReplVal = std::prev(I)->second->getOperand(0);
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ LI->replaceAllUsesWith(ReplVal);
+ }
- if (AST && LI->getType()->isPointerTy())
- AST->deleteValue(LI);
LI->eraseFromParent();
LBI.deleteValue(LI);
}
@@ -499,8 +519,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
LBI.deleteValue(SI);
}
- if (AST)
- AST->deleteValue(AI);
AI->eraseFromParent();
LBI.deleteValue(AI);
@@ -517,8 +535,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
void PromoteMem2Reg::run() {
Function &F = *DT.getRoot()->getParent();
- if (AST)
- PointerAllocaValues.resize(Allocas.size());
AllocaDbgDeclares.resize(Allocas.size());
AllocaInfo Info;
@@ -536,8 +552,6 @@ void PromoteMem2Reg::run() {
if (AI->use_empty()) {
// If there are no uses of the alloca, just delete it now.
- if (AST)
- AST->deleteValue(AI);
AI->eraseFromParent();
// Remove the alloca from the Allocas list, since it has been processed
@@ -553,7 +567,7 @@ void PromoteMem2Reg::run() {
// If there is only a single store to this value, replace any loads of
// it that are directly dominated by the definition with the value stored.
if (Info.DefiningBlocks.size() == 1) {
- if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AST)) {
+ if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AC)) {
// The alloca has been processed, move on.
RemoveFromAllocasList(AllocaNum);
++NumSingleStore;
@@ -564,7 +578,7 @@ void PromoteMem2Reg::run() {
// If the alloca is only read and written in one basic block, just perform a
// linear sweep over the block to eliminate it.
if (Info.OnlyUsedInOneBlock &&
- promoteSingleBlockAlloca(AI, Info, LBI, AST)) {
+ promoteSingleBlockAlloca(AI, Info, LBI, DT, AC)) {
// The alloca has been processed, move on.
RemoveFromAllocasList(AllocaNum);
continue;
@@ -578,11 +592,6 @@ void PromoteMem2Reg::run() {
BBNumbers[&BB] = ID++;
}
- // If we have an AST to keep updated, remember some pointer value that is
- // stored into the alloca.
- if (AST)
- PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal;
-
// Remember the dbg.declare intrinsic describing this alloca, if any.
if (Info.DbgDeclare)
AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare;
@@ -662,13 +671,9 @@ void PromoteMem2Reg::run() {
// tree. Just delete the users now.
if (!A->use_empty())
A->replaceAllUsesWith(UndefValue::get(A->getType()));
- if (AST)
- AST->deleteValue(A);
A->eraseFromParent();
}
- const DataLayout &DL = F.getParent()->getDataLayout();
-
  // Remove alloca's dbg.declare intrinsics from the function.
for (unsigned i = 0, e = AllocaDbgDeclares.size(); i != e; ++i)
if (DbgDeclareInst *DDI = AllocaDbgDeclares[i])
@@ -693,9 +698,7 @@ void PromoteMem2Reg::run() {
PHINode *PN = I->second;
// If this PHI node merges one value and/or undefs, get the value.
- if (Value *V = SimplifyInstruction(PN, DL, nullptr, &DT, AC)) {
- if (AST && PN->getType()->isPointerTy())
- AST->deleteValue(PN);
+ if (Value *V = SimplifyInstruction(PN, SQ)) {
PN->replaceAllUsesWith(V);
PN->eraseFromParent();
NewPhiNodes.erase(I++);
@@ -863,10 +866,6 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
&BB->front());
++NumPHIInsert;
PhiToAllocaMap[PN] = AllocaNo;
-
- if (AST && PN->getType()->isPointerTy())
- AST->copyValue(PointerAllocaValues[AllocaNo], PN);
-
return true;
}
@@ -940,10 +939,15 @@ NextIteration:
Value *V = IncomingVals[AI->second];
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !llvm::isKnownNonNullAt(V, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
// Anything using the load now uses the current value.
LI->replaceAllUsesWith(V);
- if (AST && LI->getType()->isPointerTy())
- AST->deleteValue(LI);
BB->getInstList().erase(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
// Delete this instruction and mark the name as the current holder of the
@@ -987,10 +991,10 @@ NextIteration:
}
void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AliasSetTracker *AST, AssumptionCache *AC) {
+ AssumptionCache *AC) {
// If there is nothing to do, bail out...
if (Allocas.empty())
return;
- PromoteMem2Reg(Allocas, DT, AST, AC).run();
+ PromoteMem2Reg(Allocas, DT, AC).run();
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index 8e93ee7..6ccf54e 100644
--- a/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -13,18 +13,27 @@
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+#include <cassert>
+#include <utility>
using namespace llvm;
@@ -36,7 +45,7 @@ static AvailableValsTy &getAvailableVals(void *AV) {
}
SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI)
- : AV(nullptr), ProtoType(nullptr), ProtoName(), InsertedPHIs(NewPHI) {}
+ : InsertedPHIs(NewPHI) {}
SSAUpdater::~SSAUpdater() {
delete static_cast<AvailableValsTy*>(AV);
@@ -205,6 +214,7 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
}
namespace llvm {
+
template<>
class SSAUpdaterTraits<SSAUpdater> {
public:
@@ -230,6 +240,7 @@ public:
PHI_iterator &operator++() { ++idx; return *this; }
bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+
Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
};
@@ -303,7 +314,7 @@ public:
}
};
-} // End llvm namespace
+} // end namespace llvm
/// Check to see if AvailableVals has an entry for the specified BB and if so,
/// return it. If not, construct SSA form by first calculating the required
@@ -337,14 +348,12 @@ LoadAndStorePromoter(ArrayRef<const Instruction*> Insts,
SSA.Initialize(SomeVal->getType(), BaseName);
}
-
void LoadAndStorePromoter::
run(const SmallVectorImpl<Instruction*> &Insts) const {
-
// First step: bucket up uses of the alloca by the block they occur in.
// This is important because we have to handle multiple defs/uses in a block
// ourselves: SSAUpdater is purely for cross-block references.
- DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock;
+ DenseMap<BasicBlock*, TinyPtrVector<Instruction*>> UsesByBlock;
for (Instruction *User : Insts)
UsesByBlock[User->getParent()].push_back(User);
diff --git a/contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp b/contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp
index 9afd175..8c23957 100644
--- a/contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SanitizerStats.cpp
@@ -12,13 +12,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/SanitizerStats.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 7b0bddb..8784b97 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -15,21 +15,22 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -54,11 +55,11 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
-#include "llvm/IR/DebugInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -169,6 +170,8 @@ class SimplifyCFGOpt {
unsigned BonusInstThreshold;
AssumptionCache *AC;
SmallPtrSetImpl<BasicBlock *> *LoopHeaders;
+ // See comments in SimplifyCFGOpt::SimplifySwitch.
+ bool LateSimplifyCFG;
Value *isValueEqualityComparison(TerminatorInst *TI);
BasicBlock *GetValueEqualityComparisonCases(
TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases);
@@ -192,9 +195,10 @@ class SimplifyCFGOpt {
public:
SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL,
unsigned BonusInstThreshold, AssumptionCache *AC,
- SmallPtrSetImpl<BasicBlock *> *LoopHeaders)
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+ bool LateSimplifyCFG)
: TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC),
- LoopHeaders(LoopHeaders) {}
+ LoopHeaders(LoopHeaders), LateSimplifyCFG(LateSimplifyCFG) {}
bool run(BasicBlock *BB);
};
@@ -591,7 +595,7 @@ private:
Span = Span.inverse();
// If there are a ton of values, we don't want to make a ginormous switch.
- if (Span.getSetSize().ugt(8) || Span.isEmptySet()) {
+ if (Span.isSizeLargerThan(8) || Span.isEmptySet()) {
return false;
}
@@ -710,10 +714,9 @@ BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
Cases.reserve(SI->getNumCases());
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
- ++i)
- Cases.push_back(
- ValueEqualityComparisonCase(i.getCaseValue(), i.getCaseSuccessor()));
+ for (auto Case : SI->cases())
+ Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
+ Case.getCaseSuccessor()));
return SI->getDefaultDest();
}
@@ -846,12 +849,12 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
}
for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
--i;
- if (DeadCases.count(i.getCaseValue())) {
+ if (DeadCases.count(i->getCaseValue())) {
if (HasWeight) {
- std::swap(Weights[i.getCaseIndex() + 1], Weights.back());
+ std::swap(Weights[i->getCaseIndex() + 1], Weights.back());
Weights.pop_back();
}
- i.getCaseSuccessor()->removePredecessor(TI->getParent());
+ i->getCaseSuccessor()->removePredecessor(TI->getParent());
SI->removeCase(i);
}
}
@@ -996,8 +999,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
SmallSetVector<BasicBlock*, 4> FailBlocks;
if (!SafeToMergeTerminators(TI, PTI, &FailBlocks)) {
for (auto *Succ : FailBlocks) {
- std::vector<BasicBlock*> Blocks = { TI->getParent() };
- if (!SplitBlockPredecessors(Succ, Blocks, ".fold.split"))
+ if (!SplitBlockPredecessors(Succ, TI->getParent(), ".fold.split"))
return false;
}
}
@@ -1280,7 +1282,7 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
if (!isa<CallInst>(I1))
I1->setDebugLoc(
DILocation::getMergedLocation(I1->getDebugLoc(), I2->getDebugLoc()));
-
+
I2->eraseFromParent();
Changed = true;
@@ -1373,53 +1375,6 @@ HoistTerminator:
return true;
}
-// Is it legal to place a variable in operand \c OpIdx of \c I?
-// FIXME: This should be promoted to Instruction.
-static bool canReplaceOperandWithVariable(const Instruction *I,
- unsigned OpIdx) {
- // We can't have a PHI with a metadata type.
- if (I->getOperand(OpIdx)->getType()->isMetadataTy())
- return false;
-
- // Early exit.
- if (!isa<Constant>(I->getOperand(OpIdx)))
- return true;
-
- switch (I->getOpcode()) {
- default:
- return true;
- case Instruction::Call:
- case Instruction::Invoke:
- // FIXME: many arithmetic intrinsics have no issue taking a
- // variable, however it's hard to distingish these from
- // specials such as @llvm.frameaddress that require a constant.
- if (isa<IntrinsicInst>(I))
- return false;
-
- // Constant bundle operands may need to retain their constant-ness for
- // correctness.
- if (ImmutableCallSite(I).isBundleOperand(OpIdx))
- return false;
-
- return true;
-
- case Instruction::ShuffleVector:
- // Shufflevector masks are constant.
- return OpIdx != 2;
- case Instruction::ExtractValue:
- case Instruction::InsertValue:
- // All operands apart from the first are constant.
- return OpIdx == 0;
- case Instruction::Alloca:
- return false;
- case Instruction::GetElementPtr:
- if (OpIdx == 0)
- return true;
- gep_type_iterator It = std::next(gep_type_begin(I), OpIdx - 1);
- return It.isSequential();
- }
-}
-
// All instructions in Insts belong to different blocks that all unconditionally
// branch to a common successor. Analyze each instruction and return true if it
// would be possible to sink them into their successor, creating one common
@@ -1472,29 +1427,28 @@ static bool canSinkInstructions(
return false;
}
+ // Because SROA can't handle speculating stores of selects, try not
+ // to sink loads or stores of allocas when we'd have to create a PHI for
+ // the address operand. Also, because it is likely that loads or stores
+ // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
+ // This can cause code churn which can have unintended consequences down
+ // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
+ // FIXME: This is a workaround for a deficiency in SROA - see
+ // https://llvm.org/bugs/show_bug.cgi?id=30188
+ if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1));
+ }))
+ return false;
+ if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(0));
+ }))
+ return false;
+
for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
if (I0->getOperand(OI)->getType()->isTokenTy())
// Don't touch any operand of token type.
return false;
- // Because SROA can't handle speculating stores of selects, try not
- // to sink loads or stores of allocas when we'd have to create a PHI for
- // the address operand. Also, because it is likely that loads or stores
- // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
- // This can cause code churn which can have unintended consequences down
- // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
- // FIXME: This is a workaround for a deficiency in SROA - see
- // https://llvm.org/bugs/show_bug.cgi?id=30188
- if (OI == 1 && isa<StoreInst>(I0) &&
- any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1));
- }))
- return false;
- if (OI == 0 && isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(0));
- }))
- return false;
-
auto SameAsI0 = [&I0, OI](const Instruction *I) {
assert(I->getNumOperands() == I0->getNumOperands());
return I->getOperand(OI) == I0->getOperand(OI);
@@ -1546,7 +1500,7 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
}))
return false;
}
-
+
// We don't need to do any more checking here; canSinkLastInstruction should
// have done it all for us.
SmallVector<Value*, 4> NewOperands;
@@ -1653,7 +1607,7 @@ namespace {
bool isValid() const {
return !Fail;
}
-
+
void operator -- () {
if (Fail)
return;
@@ -1699,7 +1653,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
// / \
// [f(1)] [if]
// | | \
- // | | \
+ // | | |
// | [f(2)]|
// \ | /
// [ end ]
@@ -1737,7 +1691,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
}
if (UnconditionalPreds.size() < 2)
return false;
-
+
bool Changed = false;
// We take a two-step approach to tail sinking. First we scan from the end of
// each block upwards in lockstep. If the n'th instruction from the end of each
@@ -1767,7 +1721,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
NumPHIInsts++;
-
+
return NumPHIInsts <= 1;
};
@@ -1790,7 +1744,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
}
if (!Profitable)
return false;
-
+
DEBUG(dbgs() << "SINK: Splitting edge\n");
// We have a conditional edge and we're going to sink some instructions.
// Insert a new block postdominating all blocks we're going to sink from.
@@ -1800,7 +1754,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
return false;
Changed = true;
}
-
+
// Now that we've analyzed all potential sinking candidates, perform the
// actual sink. We iteratively sink the last non-terminator of the source
// blocks into their common successor unless doing so would require too
@@ -1826,7 +1780,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
DEBUG(dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
break;
}
-
+
if (!sinkLastInstruction(UnconditionalPreds))
return Changed;
NumSinkCommons++;
@@ -2078,6 +2032,9 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
Value *S = Builder.CreateSelect(
BrCond, TrueV, FalseV, TrueV->getName() + "." + FalseV->getName(), BI);
SpeculatedStore->setOperand(0, S);
+ SpeculatedStore->setDebugLoc(
+ DILocation::getMergedLocation(
+ BI->getDebugLoc(), SpeculatedStore->getDebugLoc()));
}
// Metadata can be dependent on the condition we are hoisting above.
@@ -2147,7 +2104,8 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
/// If we have a conditional branch on a PHI node value that is defined in the
/// same block as the branch and if any PHI entries are constants, thread edges
/// corresponding to that entry to be branches to their ultimate destination.
-static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL,
+ AssumptionCache *AC) {
BasicBlock *BB = BI->getParent();
PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
// NOTE: we currently cannot transform this case if the PHI node is used
@@ -2225,11 +2183,11 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
}
// Check for trivial simplification.
- if (Value *V = SimplifyInstruction(N, DL)) {
+ if (Value *V = SimplifyInstruction(N, {DL, nullptr, nullptr, AC})) {
if (!BBI->use_empty())
TranslateMap[&*BBI] = V;
if (!N->mayHaveSideEffects()) {
- delete N; // Instruction folded away, don't need actual inst
+ N->deleteValue(); // Instruction folded away, don't need actual inst
N = nullptr;
}
} else {
@@ -2239,6 +2197,11 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
// Insert the new instruction into its new home.
if (N)
EdgeBB->getInstList().insert(InsertPt, N);
+
+ // Register the new instruction with the assumption cache if necessary.
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(N))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
}
// Loop over all of the edges from PredBB to BB, changing them to branch
@@ -2251,7 +2214,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
}
// Recurse, simplifying any other constants.
- return FoldCondBranchOnPHI(BI, DL) | true;
+ return FoldCondBranchOnPHI(BI, DL, AC) | true;
}
return false;
@@ -2296,7 +2259,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
PHINode *PN = cast<PHINode>(II++);
- if (Value *V = SimplifyInstruction(PN, DL)) {
+ if (Value *V = SimplifyInstruction(PN, {DL, PN})) {
PN->replaceAllUsesWith(V);
PN->eraseFromParent();
continue;
@@ -3045,6 +3008,15 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) {
BasicBlock *QFB = QBI->getSuccessor(1);
BasicBlock *PostBB = QFB->getSingleSuccessor();
+ // Make sure we have a good guess for PostBB. If QTB's only successor is
+ // QFB, then QFB is a better PostBB.
+ if (QTB->getSingleSuccessor() == QFB)
+ PostBB = QFB;
+
+ // If we couldn't find a good PostBB, stop.
+ if (!PostBB)
+ return false;
+
bool InvertPCond = false, InvertQCond = false;
// Canonicalize fallthroughs to the true branches.
if (PFB == QBI->getParent()) {
@@ -3069,14 +3041,13 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI) {
auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) {
return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S;
};
- if (!PostBB ||
- !HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
+ if (!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
!HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB))
return false;
if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
(QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
return false;
- if (PostBB->getNumUses() != 2 || QBI->getParent()->getNumUses() != 2)
+ if (!PostBB->hasNUses(2) || !QBI->getParent()->hasNUses(2))
return false;
// OK, this is a sequence of two diamonds or triangles.
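Editor's note: in source terms, the diamond/triangle pattern this function targets is two guarded stores to the same address, merged into one store of a select (the second store wins when both guards fire). A rough standalone model, assuming nothing else touches *p in between:

    #include <cassert>

    static void separate(bool c1, bool c2, int *p, int v1, int v2) {
      if (c1) *p = v1;
      if (c2) *p = v2;
    }

    // Merged form: a single store in a block guarded by (c1 | c2).
    static void merged(bool c1, bool c2, int *p, int v1, int v2) {
      if (c1 | c2) *p = c2 ? v2 : v1;
    }

    int main() {
      for (int c1 = 0; c1 <= 1; ++c1)
        for (int c2 = 0; c2 <= 1; ++c2) {
          int a = 0, b = 0;
          separate(c1, c2, &a, 10, 20);
          merged(c1, c2, &b, 10, 20);
          assert(a == b);
        }
      return 0;
    }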
@@ -3433,8 +3404,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
// Find the relevant condition and destinations.
Value *Condition = Select->getCondition();
- BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor();
- BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor();
+ BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
+ BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
// Get weight for TrueBB and FalseBB.
uint32_t TrueWeight = 0, FalseWeight = 0;
@@ -3444,9 +3415,9 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
GetBranchWeights(SI, Weights);
if (Weights.size() == 1 + SI->getNumCases()) {
TrueWeight =
- (uint32_t)Weights[SI->findCaseValue(TrueVal).getSuccessorIndex()];
+ (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
FalseWeight =
- (uint32_t)Weights[SI->findCaseValue(FalseVal).getSuccessorIndex()];
+ (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
}
}
@@ -3526,7 +3497,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(
assert(VVal && "Should have a unique destination value");
ICI->setOperand(0, VVal);
- if (Value *V = SimplifyInstruction(ICI, DL)) {
+ if (Value *V = SimplifyInstruction(ICI, {DL, ICI})) {
ICI->replaceAllUsesWith(V);
ICI->eraseFromParent();
}
@@ -3736,7 +3707,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
if (!isa<DbgInfoIntrinsic>(I))
return false;
- SmallSet<BasicBlock *, 4> TrivialUnwindBlocks;
+ SmallSetVector<BasicBlock *, 4> TrivialUnwindBlocks;
auto *PhiLPInst = cast<PHINode>(RI->getValue());
// Check incoming blocks to see if any of them are trivial.
@@ -4148,15 +4119,16 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
}
}
} else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
- ++i)
- if (i.getCaseSuccessor() == BB) {
- BB->removePredecessor(SI->getParent());
- SI->removeCase(i);
- --i;
- --e;
- Changed = true;
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
+ if (i->getCaseSuccessor() != BB) {
+ ++i;
+ continue;
}
+ BB->removePredecessor(SI->getParent());
+ i = SI->removeCase(i);
+ e = SI->case_end();
+ Changed = true;
+ }
} else if (auto *II = dyn_cast<InvokeInst>(TI)) {
if (II->getUnwindDest() == BB) {
removeUnwindEdge(TI->getParent());
@@ -4239,18 +4211,18 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
SmallVector<ConstantInt *, 16> CasesA;
SmallVector<ConstantInt *, 16> CasesB;
- for (SwitchInst::CaseIt I : SI->cases()) {
- BasicBlock *Dest = I.getCaseSuccessor();
+ for (auto Case : SI->cases()) {
+ BasicBlock *Dest = Case.getCaseSuccessor();
if (!DestA)
DestA = Dest;
if (Dest == DestA) {
- CasesA.push_back(I.getCaseValue());
+ CasesA.push_back(Case.getCaseValue());
continue;
}
if (!DestB)
DestB = Dest;
if (Dest == DestB) {
- CasesB.push_back(I.getCaseValue());
+ CasesB.push_back(Case.getCaseValue());
continue;
}
return false; // More than two destinations.
@@ -4348,8 +4320,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
const DataLayout &DL) {
Value *Cond = SI->getCondition();
unsigned Bits = Cond->getType()->getIntegerBitWidth();
- APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
- computeKnownBits(Cond, KnownZero, KnownOne, DL, 0, AC, SI);
+ KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
// We can also eliminate cases by determining that their values are outside of
// the limited range of the condition based on how many significant (non-sign)
@@ -4360,8 +4331,8 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
// Gather dead cases.
SmallVector<ConstantInt *, 8> DeadCases;
for (auto &Case : SI->cases()) {
- APInt CaseVal = Case.getCaseValue()->getValue();
- if ((CaseVal & KnownZero) != 0 || (CaseVal & KnownOne) != KnownOne ||
+ const APInt &CaseVal = Case.getCaseValue()->getValue();
+ if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
(CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
DeadCases.push_back(Case.getCaseValue());
DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n");
@@ -4375,7 +4346,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
bool HasDefault =
!isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
const unsigned NumUnknownBits =
- Bits - (KnownZero.Or(KnownOne)).countPopulation();
+ Bits - (Known.Zero | Known.One).countPopulation();
assert(NumUnknownBits <= Bits);
if (HasDefault && DeadCases.empty() &&
NumUnknownBits < 64 /* avoid overflow */ &&
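Editor's note: the KnownBits test above can be read with plain masks: a case value is provably unreachable if it sets a bit known to be zero, or clears a bit known to be one, and the number of feasible condition values is 2^(Bits - popcount(Zero|One)). A standalone sketch with uint64_t masks standing in for Known.Zero/Known.One (values are illustrative):

    #include <cassert>
    #include <cstdint>

    // Zero: bits known to be 0 in the condition; One: bits known to be 1.
    static bool caseIsDead(uint64_t CaseVal, uint64_t Zero, uint64_t One) {
      bool setsKnownZeroBit = (CaseVal & Zero) != 0;  // Known.Zero.intersects(CaseVal)
      bool missesKnownOneBit = (One & ~CaseVal) != 0; // !Known.One.isSubsetOf(CaseVal)
      return setsKnownZeroBit || missesKnownOneBit;
    }

    int main() {
      // Suppose the low bit is known 1 and bit 7 is known 0.
      uint64_t One = 0x1, Zero = 0x80;
      assert(caseIsDead(0x80, Zero, One));  // sets a known-zero bit
      assert(caseIsDead(0x02, Zero, One));  // low bit clear, but it is known 1
      assert(!caseIsDead(0x03, Zero, One)); // consistent with both masks
      return 0;
    }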
@@ -4400,17 +4371,17 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
// Remove dead cases from the switch.
for (ConstantInt *DeadCase : DeadCases) {
- SwitchInst::CaseIt Case = SI->findCaseValue(DeadCase);
- assert(Case != SI->case_default() &&
+ SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
+ assert(CaseI != SI->case_default() &&
"Case was not found. Probably mistake in DeadCases forming.");
if (HasWeight) {
- std::swap(Weights[Case.getCaseIndex() + 1], Weights.back());
+ std::swap(Weights[CaseI->getCaseIndex() + 1], Weights.back());
Weights.pop_back();
}
// Prune unused values from PHI nodes.
- Case.getCaseSuccessor()->removePredecessor(SI->getParent());
- SI->removeCase(Case);
+ CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
+ SI->removeCase(CaseI);
}
if (HasWeight && Weights.size() >= 2) {
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
@@ -4464,10 +4435,9 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
typedef DenseMap<PHINode *, SmallVector<int, 4>> ForwardingNodesMap;
ForwardingNodesMap ForwardingNodes;
- for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E;
- ++I) {
- ConstantInt *CaseValue = I.getCaseValue();
- BasicBlock *CaseDest = I.getCaseSuccessor();
+ for (auto Case : SI->cases()) {
+ ConstantInt *CaseValue = Case.getCaseValue();
+ BasicBlock *CaseDest = Case.getCaseSuccessor();
int PhiIndex;
PHINode *PHI =
@@ -4811,7 +4781,7 @@ public:
SwitchLookupTable(
Module &M, uint64_t TableSize, ConstantInt *Offset,
const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL);
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
/// Build instructions with Builder to retrieve the value at
/// the position given by Index in the lookup table.
@@ -4865,7 +4835,7 @@ private:
SwitchLookupTable::SwitchLookupTable(
Module &M, uint64_t TableSize, ConstantInt *Offset,
const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL)
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName)
: SingleValue(nullptr), BitMap(nullptr), BitMapElementTy(nullptr),
LinearOffset(nullptr), LinearMultiplier(nullptr), Array(nullptr) {
assert(Values.size() && "Can't build lookup table without values!");
@@ -4927,7 +4897,7 @@ SwitchLookupTable::SwitchLookupTable(
LinearMappingPossible = false;
break;
}
- APInt Val = ConstVal->getValue();
+ const APInt &Val = ConstVal->getValue();
if (I != 0) {
APInt Dist = Val - PrevVal;
if (I == 1) {
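Editor's note: the loop this hunk touches is probing whether the whole table is an affine sequence, Val[i] = LinearOffset + i * LinearMultiplier, in which case no array needs to be emitted at all. A small standalone check of the same idea (hypothetical helper, not the class above):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Returns true and fills Offset/Mult when Vals[i] == Offset + i * Mult.
    static bool isLinearMapping(const std::vector<uint64_t> &Vals,
                                uint64_t &Offset, uint64_t &Mult) {
      if (Vals.size() < 2)
        return false;
      Offset = Vals[0];
      Mult = Vals[1] - Vals[0];            // distance between first two entries
      for (size_t I = 2; I < Vals.size(); ++I)
        if (Vals[I] - Vals[I - 1] != Mult) // stride must stay constant
          return false;
      return true;
    }

    int main() {
      uint64_t Off, Mul;
      assert(isLinearMapping({3, 5, 7, 9}, Off, Mul) && Off == 3 && Mul == 2);
      assert(!isLinearMapping({3, 5, 8}, Off, Mul));
      return 0;
    }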
@@ -4973,7 +4943,7 @@ SwitchLookupTable::SwitchLookupTable(
Array = new GlobalVariable(M, ArrayTy, /*constant=*/true,
GlobalVariable::PrivateLinkage, Initializer,
- "switch.table");
+ "switch.table." + FuncName);
Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
Kind = ArrayKind;
}
@@ -5202,8 +5172,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// common destination, as well as the min and max case values.
assert(SI->case_begin() != SI->case_end());
SwitchInst::CaseIt CI = SI->case_begin();
- ConstantInt *MinCaseVal = CI.getCaseValue();
- ConstantInt *MaxCaseVal = CI.getCaseValue();
+ ConstantInt *MinCaseVal = CI->getCaseValue();
+ ConstantInt *MaxCaseVal = CI->getCaseValue();
BasicBlock *CommonDest = nullptr;
typedef SmallVector<std::pair<ConstantInt *, Constant *>, 4> ResultListTy;
@@ -5213,7 +5183,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
SmallVector<PHINode *, 4> PHIs;
for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
- ConstantInt *CaseVal = CI.getCaseValue();
+ ConstantInt *CaseVal = CI->getCaseValue();
if (CaseVal->getValue().slt(MinCaseVal->getValue()))
MinCaseVal = CaseVal;
if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
@@ -5222,7 +5192,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// Resulting value at phi nodes for this case value.
typedef SmallVector<std::pair<PHINode *, Constant *>, 4> ResultsTy;
ResultsTy Results;
- if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
+ if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
Results, DL, TTI))
return false;
@@ -5363,7 +5333,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
// If using a bitmask, use any value to fill the lookup table holes.
Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
- SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL);
+ StringRef FuncName = SI->getParent()->getParent()->getName();
+ SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL,
+ FuncName);
Value *Result = Table.BuildLookup(TableIndex, Builder);
@@ -5503,11 +5475,10 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
auto *Rot = Builder.CreateOr(LShr, Shl);
SI->replaceUsesOfWith(SI->getCondition(), Rot);
- for (SwitchInst::CaseIt C = SI->case_begin(), E = SI->case_end(); C != E;
- ++C) {
- auto *Orig = C.getCaseValue();
+ for (auto Case : SI->cases()) {
+ auto *Orig = Case.getCaseValue();
auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
- C.setValue(
+ Case.setValue(
cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
}
return true;
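Editor's note: ReduceSwitchRange rewrites the condition to rotr(Cond - Base, Shift) and each case value to (Orig - Base) >> Shift; because every biased case value has at least Shift trailing zeros, the rotate and the plain shift agree on the cases. A standalone check of that invariant (Base and Shift values are made up):

    #include <cassert>
    #include <cstdint>

    static uint32_t rotr32(uint32_t V, unsigned S) {
      return S == 0 ? V : (V >> S) | (V << (32 - S));
    }

    int main() {
      const uint32_t Base = 16; // smallest case value
      const unsigned Shift = 4; // common trailing zeros of (case - Base)
      const uint32_t Cases[] = {16, 32, 64, 128};
      for (uint32_t Orig : Cases) {
        uint32_t Biased = Orig - Base;
        // For the case values themselves, rotating right by Shift equals
        // shifting right, since their low Shift bits are all zero.
        assert(rotr32(Biased, Shift) == (Biased >> Shift));
      }
      return 0;
    }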
@@ -5553,7 +5524,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (ForwardSwitchConditionToPHI(SI))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
- if (SwitchToLookupTable(SI, Builder, DL, TTI))
+  // The conversion from switch to lookup tables results in
+  // difficult-to-analyze code and makes pruning branches much harder.
+  // This is a problem if the switch expression itself can still be
+  // restricted as a result of inlining or CVP. Therefore, only apply this
+  // transformation during late steps of the optimisation chain.
+ if (LateSimplifyCFG && SwitchToLookupTable(SI, Builder, DL, TTI))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
if (ReduceSwitchRange(SI, Builder, DL, TTI))
@@ -5680,20 +5656,22 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
+ BasicBlock *Succ = BI->getSuccessor(0);
if (SinkCommon && SinkThenElseCodeToEnd(BI))
return true;
// If the Terminator is the only non-phi instruction, simplify the block.
- // if LoopHeader is provided, check if the block is a loop header
- // (This is for early invocations before loop simplify and vectorization
- // to keep canonical loop forms for nested loops.
- // These blocks can be eliminated when the pass is invoked later
- // in the back-end.)
+ // if LoopHeader is provided, check if the block or its successor is a loop
+ // header (This is for early invocations before loop simplify and
+ // vectorization to keep canonical loop forms for nested loops. These blocks
+ // can be eliminated when the pass is invoked later in the back-end.)
+ bool NeedCanonicalLoop =
+ !LateSimplifyCFG &&
+ (LoopHeaders && (LoopHeaders->count(BB) || LoopHeaders->count(Succ)));
BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
- (!LoopHeaders || !LoopHeaders->count(BB)) &&
- TryToSimplifyUncondBranchFromEmptyBlock(BB))
+ !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB))
return true;
// If the only instruction in the block is a seteq/setne comparison
@@ -5778,8 +5756,8 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (BasicBlock *Dom = BB->getSinglePredecessor()) {
auto *PBI = dyn_cast_or_null<BranchInst>(Dom->getTerminator());
if (PBI && PBI->isConditional() &&
- PBI->getSuccessor(0) != PBI->getSuccessor(1) &&
- (PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB)) {
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ assert(PBI->getSuccessor(0) == BB || PBI->getSuccessor(1) == BB);
bool CondIsFalse = PBI->getSuccessor(1) == BB;
Optional<bool> Implication = isImpliedCondition(
PBI->getCondition(), BI->getCondition(), DL, CondIsFalse);
@@ -5833,7 +5811,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
// through this block if any PHI node entries are constants.
if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
if (PN->getParent() == BI->getParent())
- if (FoldCondBranchOnPHI(BI, DL))
+ if (FoldCondBranchOnPHI(BI, DL, AC))
return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
// Scan predecessor blocks for conditional branches.
@@ -6012,8 +5990,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
///
bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
unsigned BonusInstThreshold, AssumptionCache *AC,
- SmallPtrSetImpl<BasicBlock *> *LoopHeaders) {
+ SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+ bool LateSimplifyCFG) {
return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(),
- BonusInstThreshold, AC, LoopHeaders)
+ BonusInstThreshold, AC, LoopHeaders, LateSimplifyCFG)
.run(BB);
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 6b1d3dc..6d90e6b 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -35,6 +36,9 @@ using namespace llvm;
STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(
+ NumSimplifiedSDiv,
+ "Number of IV signed division operations converted to unsigned division");
STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
namespace {
@@ -48,13 +52,13 @@ namespace {
ScalarEvolution *SE;
DominatorTree *DT;
- SmallVectorImpl<WeakVH> &DeadInsts;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
bool Changed;
public:
SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI,SmallVectorImpl<WeakVH> &Dead)
+ LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead)
: L(Loop), LI(LI), SE(SE), DT(DT), DeadInsts(Dead), Changed(false) {
assert(LI && "IV simplification requires LoopInfo");
}
@@ -75,7 +79,9 @@ namespace {
void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
bool IsSigned);
+ bool eliminateSDiv(BinaryOperator *SDiv);
bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
+ bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand);
};
}
@@ -150,6 +156,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
unsigned IVOperIdx = 0;
ICmpInst::Predicate Pred = ICmp->getPredicate();
+ ICmpInst::Predicate OriginalPred = Pred;
if (IVOperand != ICmp->getOperand(0)) {
// Swapped
assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
@@ -258,6 +265,16 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
ICmp->setPredicate(InvariantPredicate);
ICmp->setOperand(0, NewLHS);
ICmp->setOperand(1, NewRHS);
+ } else if (ICmpInst::isSigned(OriginalPred) &&
+ SE->isKnownNonNegative(S) && SE->isKnownNonNegative(X)) {
+    // If we were unable to make anything above, all we can do is canonicalize
+    // the comparison hoping that it will open the doors for other
+    // optimizations. If we find out that we compare two non-negative values,
+    // we change the instruction's predicate to its unsigned version. Note that
+    // we cannot rely on Pred here unless we check whether we have swapped it.
+ assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
+ DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp << '\n');
+ ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
} else
return;
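Editor's note: the new fallback relies on the fact that signed and unsigned comparisons agree whenever both operands are non-negative, which is exactly what the SCEV queries establish. A quick exhaustive standalone check at 8 bits:

    #include <cassert>
    #include <cstdint>

    int main() {
      // For non-negative values, slt and ult (and friends) coincide, which
      // is why the predicate can be canonicalized to its unsigned version.
      for (int a = 0; a <= 127; ++a)
        for (int b = 0; b <= 127; ++b) {
          bool slt = (int8_t)a < (int8_t)b;
          bool ult = (uint8_t)a < (uint8_t)b;
          assert(slt == ult);
        }
      return 0;
    }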
@@ -265,6 +282,33 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
Changed = true;
}
+bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
+  // Get the SCEVs for the SDiv operands.
+ auto *N = SE->getSCEV(SDiv->getOperand(0));
+ auto *D = SE->getSCEV(SDiv->getOperand(1));
+
+ // Simplify unnecessary loops away.
+ const Loop *L = LI->getLoopFor(SDiv->getParent());
+ N = SE->getSCEVAtScope(N, L);
+ D = SE->getSCEVAtScope(D, L);
+
+ // Replace sdiv by udiv if both of the operands are non-negative
+ if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
+ auto *UDiv = BinaryOperator::Create(
+ BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
+ SDiv->getName() + ".udiv", SDiv);
+ UDiv->setIsExact(SDiv->isExact());
+ SDiv->replaceAllUsesWith(UDiv);
+ DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+ ++NumSimplifiedSDiv;
+ Changed = true;
+ DeadInsts.push_back(SDiv);
+ return true;
+ }
+
+ return false;
+}
+
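Editor's note: eliminateSDiv is sound because signed and unsigned division agree when numerator and denominator are both non-negative, which is what the SCEV non-negativity queries establish. An exhaustive standalone check at small width:

    #include <cassert>
    #include <cstdint>

    int main() {
      // sdiv == udiv whenever both operands are non-negative (and the
      // divisor is nonzero), so the IR rewrite preserves every result.
      for (int n = 0; n <= 127; ++n)
        for (int d = 1; d <= 127; ++d) {
          int8_t sdiv = (int8_t)n / (int8_t)d;
          uint8_t udiv = (uint8_t)n / (uint8_t)d;
          assert((uint8_t)sdiv == udiv);
        }
      return 0;
    }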
/// SimplifyIVUsers helper for eliminating useless
/// remainder operations operating on an induction variable.
void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
@@ -321,9 +365,9 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
return false;
typedef const SCEV *(ScalarEvolution::*OperationFunctionTy)(
- const SCEV *, const SCEV *, SCEV::NoWrapFlags);
+ const SCEV *, const SCEV *, SCEV::NoWrapFlags, unsigned);
typedef const SCEV *(ScalarEvolution::*ExtensionFunctionTy)(
- const SCEV *, Type *);
+ const SCEV *, Type *, unsigned);
OperationFunctionTy Operation;
ExtensionFunctionTy Extension;
@@ -375,10 +419,11 @@ bool SimplifyIndvar::eliminateOverflowIntrinsic(CallInst *CI) {
IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
const SCEV *A =
- (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap), WideTy);
+ (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0),
+ WideTy, 0);
const SCEV *B =
- (SE->*Operation)((SE->*Extension)(LHS, WideTy),
- (SE->*Extension)(RHS, WideTy), SCEV::FlagAnyWrap);
+ (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0),
+ (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0);
if (A != B)
return false;
@@ -426,12 +471,15 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
eliminateIVComparison(ICmp, IVOperand);
return true;
}
- if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
- bool IsSigned = Rem->getOpcode() == Instruction::SRem;
- if (IsSigned || Rem->getOpcode() == Instruction::URem) {
- eliminateIVRemainder(Rem, IVOperand, IsSigned);
+ if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
+ bool IsSRem = Bin->getOpcode() == Instruction::SRem;
+ if (IsSRem || Bin->getOpcode() == Instruction::URem) {
+ eliminateIVRemainder(Bin, IVOperand, IsSRem);
return true;
}
+
+ if (Bin->getOpcode() == Instruction::SDiv)
+ return eliminateSDiv(Bin);
}
if (auto *CI = dyn_cast<CallInst>(UseInst))
@@ -496,8 +544,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
return false;
const SCEV *(ScalarEvolution::*GetExprForBO)(const SCEV *, const SCEV *,
- SCEV::NoWrapFlags);
-
+ SCEV::NoWrapFlags, unsigned);
switch (BO->getOpcode()) {
default:
return false;
@@ -526,7 +573,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
const SCEV *ExtendAfterOp = SE->getZeroExtendExpr(SE->getSCEV(BO), WideTy);
const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
SE->getZeroExtendExpr(LHS, WideTy), SE->getZeroExtendExpr(RHS, WideTy),
- SCEV::FlagAnyWrap);
+ SCEV::FlagAnyWrap, 0u);
if (ExtendAfterOp == OpAfterExtend) {
BO->setHasNoUnsignedWrap();
SE->forgetValue(BO);
@@ -538,7 +585,7 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
const SCEV *ExtendAfterOp = SE->getSignExtendExpr(SE->getSCEV(BO), WideTy);
const SCEV *OpAfterExtend = (SE->*GetExprForBO)(
SE->getSignExtendExpr(LHS, WideTy), SE->getSignExtendExpr(RHS, WideTy),
- SCEV::FlagAnyWrap);
+ SCEV::FlagAnyWrap, 0u);
if (ExtendAfterOp == OpAfterExtend) {
BO->setHasNoSignedWrap();
SE->forgetValue(BO);
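Editor's note: strengthenOverflowingOperation uses the standard trick seen in this hunk and the previous one: widen both operands, perform the operation in the wide type, and compare against the widened result of the narrow operation; equality proves the narrow op never wrapped (zero extension for nuw, sign extension for nsw). A standalone model of the unsigned case at 8 bits, exhaustive rather than SCEV-based:

    #include <cassert>
    #include <cstdint>

    // The narrow add has no unsigned wrap iff zext(a + b) == zext(a) + zext(b).
    static bool addHasNoUnsignedWrap(uint8_t a, uint8_t b) {
      uint16_t opThenExtend = (uint16_t)(uint8_t)(a + b); // ExtendAfterOp
      uint16_t extendThenOp = (uint16_t)a + (uint16_t)b;  // OpAfterExtend
      return opThenExtend == extendThenOp;
    }

    int main() {
      for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b)
          assert(addHasNoUnsignedWrap(a, b) == (a + b <= 255));
      return 0;
    }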
@@ -549,6 +596,35 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
return Changed;
}
+/// Annotate the Shr in (X << IVOperand) >> C as exact using the
+/// information from the IV's range. Returns true if anything changed, false
+/// otherwise.
+bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
+ Value *IVOperand) {
+ using namespace llvm::PatternMatch;
+
+ if (BO->getOpcode() == Instruction::Shl) {
+ bool Changed = false;
+ ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
+ for (auto *U : BO->users()) {
+ const APInt *C;
+ if (match(U,
+ m_AShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C))) ||
+ match(U,
+ m_LShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C)))) {
+ BinaryOperator *Shr = cast<BinaryOperator>(U);
+ if (!Shr->isExact() && IVRange.getUnsignedMin().uge(*C)) {
+ Shr->setIsExact(true);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+ }
+
+ return false;
+}
+
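Editor's note: the exact flag set above records that the right shift drops no set bits: if the value was first shifted left by IVOperand and the IV's unsigned minimum is >= C, the low C bits are guaranteed zero. A standalone illustration of why that is lossless (the range values are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned IVMin = 5; // unsigned minimum of the IV's range
      const unsigned C = 3;     // the right-shift amount; C <= IVMin
      for (uint32_t X = 0; X < 1024; ++X) {
        uint32_t Shl = X << IVMin; // low IVMin bits are zero
        uint32_t Shr = Shl >> C;   // drops only zero bits: "exact"
        assert((Shr << C) == Shl); // losslessness, the meaning of isExact
      }
      return 0;
    }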
/// Add all uses of Def to the current IV's worklist.
static void pushIVUsers(
Instruction *Def,
@@ -641,8 +717,9 @@ void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
}
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseOper.first)) {
- if (isa<OverflowingBinaryOperator>(BO) &&
- strengthenOverflowingOperation(BO, IVOperand)) {
+ if ((isa<OverflowingBinaryOperator>(BO) &&
+ strengthenOverflowingOperation(BO, IVOperand)) ||
+ (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
// re-queue uses of the now modified binary operator and fall
// through to the checks that remain.
pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
@@ -667,7 +744,7 @@ void IVVisitor::anchor() { }
/// Simplify instructions that use this induction variable
/// by using ScalarEvolution to analyze the IV's recurrence.
bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead,
+ LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead,
IVVisitor *V) {
SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Dead);
SIV.simplifyUsers(CurrIV, V);
@@ -677,7 +754,7 @@ bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
/// Simplify users of induction variables within this
/// loop. This does not actually change or add IVs.
bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, SmallVectorImpl<WeakVH> &Dead) {
+ LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead) {
bool Changed = false;
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead);
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
index 1220490..2ea15f6 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -20,23 +20,23 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
#define DEBUG_TYPE "instsimplify"
STATISTIC(NumSimplified, "Number of redundant instructions removed");
-static bool runImpl(Function &F, const DominatorTree *DT,
- const TargetLibraryInfo *TLI, AssumptionCache *AC) {
- const DataLayout &DL = F.getParent()->getDataLayout();
+static bool runImpl(Function &F, const SimplifyQuery &SQ,
+ OptimizationRemarkEmitter *ORE) {
SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
bool Changed = false;
@@ -54,7 +54,7 @@ static bool runImpl(Function &F, const DominatorTree *DT,
// Don't waste time simplifying unused instructions.
if (!I->use_empty()) {
- if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+ if (Value *V = SimplifyInstruction(I, SQ, ORE)) {
// Mark all uses for resimplification next time round the loop.
for (User *U : I->users())
Next->insert(cast<Instruction>(U));
@@ -63,7 +63,7 @@ static bool runImpl(Function &F, const DominatorTree *DT,
Changed = true;
}
}
- if (RecursivelyDeleteTriviallyDeadInstructions(I, TLI)) {
+ if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) {
// RecursivelyDeleteTriviallyDeadInstruction can remove more than one
// instruction, so simply incrementing the iterator does not work.
// When instructions get deleted re-iterate instead.
@@ -95,6 +95,7 @@ namespace {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
/// runOnFunction - Remove instructions that simplify.
@@ -108,7 +109,11 @@ namespace {
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
AssumptionCache *AC =
&getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- return runImpl(F, DT, TLI, AC);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, TLI, DT, AC);
+ return runImpl(F, SQ, ORE);
}
};
}
@@ -119,6 +124,7 @@ INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
"Remove redundant instructions", false, false)
char &llvm::InstructionSimplifierID = InstSimplifier::ID;
@@ -133,9 +139,14 @@ PreservedAnalyses InstSimplifierPass::run(Function &F,
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- bool Changed = runImpl(F, &DT, &TLI, &AC);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+ bool Changed = runImpl(F, SQ, &ORE);
if (!Changed)
return PreservedAnalyses::all();
- // FIXME: This should also 'preserve the CFG'.
- return PreservedAnalyses::none();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
diff --git a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 8eaeb10..77c0a41 100644
--- a/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -30,6 +30,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -37,10 +38,6 @@ using namespace llvm;
using namespace PatternMatch;
static cl::opt<bool>
- ColdErrorCalls("error-reporting-is-cold", cl::init(true), cl::Hidden,
- cl::desc("Treat error-reporting calls as cold"));
-
-static cl::opt<bool>
EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
cl::init(false),
cl::desc("Enable unsafe double to float "
@@ -51,9 +48,9 @@ static cl::opt<bool>
// Helper Functions
//===----------------------------------------------------------------------===//
-static bool ignoreCallingConv(LibFunc::Func Func) {
- return Func == LibFunc::abs || Func == LibFunc::labs ||
- Func == LibFunc::llabs || Func == LibFunc::strlen;
+static bool ignoreCallingConv(LibFunc Func) {
+ return Func == LibFunc_abs || Func == LibFunc_labs ||
+ Func == LibFunc_llabs || Func == LibFunc_strlen;
}
static bool isCallingConvCCompatible(CallInst *CI) {
@@ -88,20 +85,6 @@ static bool isCallingConvCCompatible(CallInst *CI) {
return false;
}
-/// Return true if it only matters that the value is equal or not-equal to zero.
-static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
- for (User *U : V->users()) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (IC->isEquality())
- if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
- if (C->isNullValue())
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
/// Return true if it is only used in equality comparisons with With.
static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
for (User *U : V->users()) {
@@ -123,8 +106,8 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
/// \brief Check whether the overloaded unary floating point function
/// corresponding to \a Ty is available.
static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc::Func DoubleFn, LibFunc::Func FloatFn,
- LibFunc::Func LongDoubleFn) {
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn) {
switch (Ty->getTypeID()) {
case Type::FloatTyID:
return TLI->has(FloatFn);
@@ -429,59 +412,68 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
return Dst;
}
-Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B,
+ unsigned CharSize) {
Value *Src = CI->getArgOperand(0);
// Constant folding: strlen("xyz") -> 3
- if (uint64_t Len = GetStringLength(Src))
+ if (uint64_t Len = GetStringLength(Src, CharSize))
return ConstantInt::get(CI->getType(), Len - 1);
// If s is a constant pointer pointing to a string literal, we can fold
- // strlen(s + x) to strlen(s) - x, when x is known to be in the range
+ // strlen(s + x) to strlen(s) - x, when x is known to be in the range
// [0, strlen(s)] or the string has a single null terminator '\0' at the end.
- // We only try to simplify strlen when the pointer s points to an array
+ // We only try to simplify strlen when the pointer s points to an array
// of i8. Otherwise, we would need to scale the offset x before doing the
- // subtraction. This will make the optimization more complex, and it's not
- // very useful because calling strlen for a pointer of other types is
+ // subtraction. This will make the optimization more complex, and it's not
+ // very useful because calling strlen for a pointer of other types is
// very uncommon.
if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
- if (!isGEPBasedOnPointerToString(GEP))
+ if (!isGEPBasedOnPointerToString(GEP, CharSize))
return nullptr;
- StringRef Str;
- if (getConstantStringInfo(GEP->getOperand(0), Str, 0, false)) {
- size_t NullTermIdx = Str.find('\0');
-
- // If the string does not have '\0', leave it to strlen to compute
- // its length.
- if (NullTermIdx == StringRef::npos)
- return nullptr;
-
+ ConstantDataArraySlice Slice;
+ if (getConstantDataArrayInfo(GEP->getOperand(0), Slice, CharSize)) {
+ uint64_t NullTermIdx;
+ if (Slice.Array == nullptr) {
+ NullTermIdx = 0;
+ } else {
+ NullTermIdx = ~((uint64_t)0);
+ for (uint64_t I = 0, E = Slice.Length; I < E; ++I) {
+ if (Slice.Array->getElementAsInteger(I + Slice.Offset) == 0) {
+ NullTermIdx = I;
+ break;
+ }
+ }
+ // If the string does not have '\0', leave it to strlen to compute
+ // its length.
+ if (NullTermIdx == ~((uint64_t)0))
+ return nullptr;
+ }
+
Value *Offset = GEP->getOperand(2);
- unsigned BitWidth = Offset->getType()->getIntegerBitWidth();
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(Offset, KnownZero, KnownOne, DL, 0, nullptr, CI,
- nullptr);
- KnownZero.flipAllBits();
- size_t ArrSize =
+ KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr);
+ Known.Zero.flipAllBits();
+ uint64_t ArrSize =
cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
- // KnownZero's bits are flipped, so zeros in KnownZero now represent
- // bits known to be zeros in Offset, and ones in KnowZero represent
+ // KnownZero's bits are flipped, so zeros in KnownZero now represent
+    // bits known to be zeros in Offset, and ones in KnownZero represent
// bits unknown in Offset. Therefore, Offset is known to be in range
- // [0, NullTermIdx] when the flipped KnownZero is non-negative and
+ // [0, NullTermIdx] when the flipped KnownZero is non-negative and
// unsigned-less-than NullTermIdx.
//
- // If Offset is not provably in the range [0, NullTermIdx], we can still
- // optimize if we can prove that the program has undefined behavior when
- // Offset is outside that range. That is the case when GEP->getOperand(0)
+ // If Offset is not provably in the range [0, NullTermIdx], we can still
+ // optimize if we can prove that the program has undefined behavior when
+ // Offset is outside that range. That is the case when GEP->getOperand(0)
// is a pointer to an object whose memory extent is NullTermIdx+1.
- if ((KnownZero.isNonNegative() && KnownZero.ule(NullTermIdx)) ||
+ if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
(GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
- NullTermIdx == ArrSize - 1))
- return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
+ NullTermIdx == ArrSize - 1)) {
+ Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
+ return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
Offset);
+ }
}
return nullptr;
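Editor's note: the flipped-Zero trick in this hunk is worth spelling out: after Known.Zero.flipAllBits(), the mask holds a 1 for every bit of Offset not known to be zero, so it is an upper bound on Offset; when that bound is non-negative (sign bit known zero) and <= NullTermIdx, the rewrite strlen(s + x) -> strlen(s) - x is safe. A standalone sketch with plain masks and invented values:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t NullTermIdx = 7;  // index of '\0' in the literal
      // Suppose analysis proves all bits above bit 2 of Offset are zero.
      uint64_t KnownZero = ~UINT64_C(0x7);
      uint64_t MaxOffset = ~KnownZero; // flipped mask = largest possible Offset
      // Offset is provably in [0, NullTermIdx], so
      // strlen(s + Offset) == NullTermIdx - Offset is a valid rewrite.
      assert(MaxOffset <= NullTermIdx);
      return 0;
    }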
@@ -489,8 +481,8 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
// strlen(x?"foo":"bars") --> x ? 3 : 4
if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
- uint64_t LenTrue = GetStringLength(SI->getTrueValue());
- uint64_t LenFalse = GetStringLength(SI->getFalseValue());
+ uint64_t LenTrue = GetStringLength(SI->getTrueValue(), CharSize);
+ uint64_t LenFalse = GetStringLength(SI->getFalseValue(), CharSize);
if (LenTrue && LenFalse) {
Function *Caller = CI->getParent()->getParent();
emitOptimizationRemark(CI->getContext(), "simplify-libcalls", *Caller,
@@ -510,6 +502,17 @@ Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
+Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
+ return optimizeStringLength(CI, B, 8);
+}
+
+Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) {
+ Module &M = *CI->getParent()->getParent()->getParent();
+ unsigned WCharSize = TLI->getWCharSize(M) * 8;
+
+ return optimizeStringLength(CI, B, WCharSize);
+}
+
Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
@@ -542,7 +545,7 @@ Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) {
if (isa<ConstantPointerNull>(EndPtr)) {
// With a null EndPtr, this function won't capture the main argument.
// It would be readonly too, except that it still may write to errno.
- CI->addAttribute(1, Attribute::NoCapture);
+ CI->addParamAttr(0, Attribute::NoCapture);
}
return nullptr;
@@ -653,7 +656,7 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
// memchr(x, y, 0) -> null
- if (LenC && LenC->isNullValue())
+ if (LenC && LenC->isZero())
return Constant::getNullValue(CI->getType());
// From now on we need at least constant length and string.
@@ -735,8 +738,8 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
if (!LenC)
return nullptr;
- uint64_t Len = LenC->getZExtValue();
+ uint64_t Len = LenC->getZExtValue();
if (Len == 0) // memcmp(s1,s2,0) -> 0
return Constant::getNullValue(CI->getType());
@@ -809,9 +812,9 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
// TODO: Does this belong in BuildLibCalls or should all of those similar
// functions be moved here?
-static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs,
+static Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
IRBuilder<> &B, const TargetLibraryInfo &TLI) {
- LibFunc::Func Func;
+ LibFunc Func;
if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func))
return nullptr;
@@ -819,7 +822,7 @@ static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs,
const DataLayout &DL = M->getDataLayout();
IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
- PtrType, PtrType, nullptr);
+ PtrType, PtrType);
CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc");
if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
@@ -846,9 +849,12 @@ static Value *foldMallocMemset(CallInst *Memset, IRBuilder<> &B,
// Is the inner call really malloc()?
Function *InnerCallee = Malloc->getCalledFunction();
- LibFunc::Func Func;
+ if (!InnerCallee)
+ return nullptr;
+
+ LibFunc Func;
if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
- Func != LibFunc::malloc)
+ Func != LibFunc_malloc)
return nullptr;
// The memset must cover the same number of bytes that are malloc'd.
@@ -930,6 +936,24 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
if (V == nullptr)
return nullptr;
+ // If call isn't an intrinsic, check that it isn't within a function with the
+ // same name as the float version of this call.
+ //
+ // e.g. inline float expf(float val) { return (float) exp((double) val); }
+ //
+ // A similar such definition exists in the MinGW-w64 math.h header file which
+ // when compiled with -O2 -ffast-math causes the generation of infinite loops
+ // where expf is called.
+ if (!Callee->isIntrinsic()) {
+ const Function *F = CI->getFunction();
+ StringRef FName = F->getName();
+ StringRef CalleeName = Callee->getName();
+ if ((FName.size() == (CalleeName.size() + 1)) &&
+ (FName.back() == 'f') &&
+ FName.startswith(CalleeName))
+ return nullptr;
+ }
+
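Editor's note: the guard added here detects the wrapper-recursion pattern purely by name: the caller must be exactly the callee's name plus a trailing 'f'. A minimal standalone version of that string test, assuming the same naming convention:

    #include <cassert>
    #include <string>

    // True when FName is CalleeName with one 'f' appended, e.g. "expf"/"exp".
    static bool isFloatWrapperOfDouble(const std::string &FName,
                                       const std::string &CalleeName) {
      return FName.size() == CalleeName.size() + 1 && FName.back() == 'f' &&
             FName.compare(0, CalleeName.size(), CalleeName) == 0;
    }

    int main() {
      assert(isFloatWrapperOfDouble("expf", "exp"));   // would recurse; bail
      assert(!isFloatWrapperOfDouble("expm1", "exp")); // different suffix
      assert(!isFloatWrapperOfDouble("exp", "exp"));
      return 0;
    }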
// Propagate fast-math flags from the existing call to the new call.
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
@@ -948,6 +972,20 @@ static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
return B.CreateFPExt(V, B.getDoubleTy());
}
+// Replace a libcall \p CI with a call to intrinsic \p IID
+static Value *replaceUnaryCall(CallInst *CI, IRBuilder<> &B, Intrinsic::ID IID) {
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilder<>::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ Module *M = CI->getModule();
+ Value *V = CI->getArgOperand(0);
+ Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
+ CallInst *NewCall = B.CreateCall(F, V);
+ NewCall->takeName(CI);
+ return NewCall;
+}
+
/// Shrink double -> float for binary functions like 'fmin/fmax'.
static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
@@ -1041,9 +1079,9 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
// pow(10.0, x) -> exp10(x)
if (Op1C->isExactlyValue(10.0) &&
- hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp10, LibFunc::exp10f,
- LibFunc::exp10l))
- return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc::exp10), B,
+ hasUnaryFloatFn(TLI, Op1->getType(), LibFunc_exp10, LibFunc_exp10f,
+ LibFunc_exp10l))
+ return emitUnaryFloatFnCall(Op2, TLI->getName(LibFunc_exp10), B,
Callee->getAttributes());
}
@@ -1055,10 +1093,10 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// pow(exp(x), y) = pow(inf, 0.001) = inf, whereas exp(x*y) = exp(1).
auto *OpC = dyn_cast<CallInst>(Op1);
if (OpC && OpC->hasUnsafeAlgebra() && CI->hasUnsafeAlgebra()) {
- LibFunc::Func Func;
+ LibFunc Func;
Function *OpCCallee = OpC->getCalledFunction();
if (OpCCallee && TLI->getLibFunc(OpCCallee->getName(), Func) &&
- TLI->has(Func) && (Func == LibFunc::exp || Func == LibFunc::exp2)) {
+ TLI->has(Func) && (Func == LibFunc_exp || Func == LibFunc_exp2)) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
Value *FMul = B.CreateFMul(OpC->getArgOperand(0), Op2, "mul");
@@ -1075,17 +1113,20 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
return ConstantFP::get(CI->getType(), 1.0);
if (Op2C->isExactlyValue(-0.5) &&
- hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
- LibFunc::sqrtl)) {
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl)) {
// If -ffast-math:
// pow(x, -0.5) -> 1.0 / sqrt(x)
if (CI->hasUnsafeAlgebra()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
- // Here we cannot lower to an intrinsic because C99 sqrt() and llvm.sqrt
- // are not guaranteed to have the same semantics.
- Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
+ // TODO: If the pow call is an intrinsic, we should lower to the sqrt
+ // intrinsic, so we match errno semantics. We also should check that the
+ // target can in fact lower the sqrt intrinsic -- we currently have no way
+ // to ask this question other than asking whether the target has a sqrt
+ // libcall, which is a sufficient but not necessary condition.
+ Value *Sqrt = emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc_sqrt), B,
Callee->getAttributes());
return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0), Sqrt, "sqrtrecip");
@@ -1093,19 +1134,17 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
}
if (Op2C->isExactlyValue(0.5) &&
- hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
- LibFunc::sqrtl) &&
- hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,
- LibFunc::fabsl)) {
+ hasUnaryFloatFn(TLI, Op2->getType(), LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl)) {
// In -ffast-math, pow(x, 0.5) -> sqrt(x).
if (CI->hasUnsafeAlgebra()) {
IRBuilder<>::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
- // Unlike other math intrinsics, sqrt has differerent semantics
- // from the libc function. See LangRef for details.
- return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc::sqrt), B,
+ // TODO: As above, we should lower to the sqrt intrinsic if the pow is an
+ // intrinsic, to match errno semantics.
+ return emitUnaryFloatFnCall(Op1, TLI->getName(LibFunc_sqrt), B,
Callee->getAttributes());
}
@@ -1115,9 +1154,16 @@ Value *LibCallSimplifier::optimizePow(CallInst *CI, IRBuilder<> &B) {
// TODO: In finite-only mode, this could be just fabs(sqrt(x)).
Value *Inf = ConstantFP::getInfinity(CI->getType());
Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
+
+ // TODO: As above, we should lower to the sqrt intrinsic if the pow is an
+ // intrinsic, to match errno semantics.
Value *Sqrt = emitUnaryFloatFnCall(Op1, "sqrt", B, Callee->getAttributes());
- Value *FAbs =
- emitUnaryFloatFnCall(Sqrt, "fabs", B, Callee->getAttributes());
+
+ Module *M = Callee->getParent();
+ Function *FabsF = Intrinsic::getDeclaration(M, Intrinsic::fabs,
+ CI->getType());
+ Value *FAbs = B.CreateCall(FabsF, Sqrt);
+
Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
return Sel;
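Editor's note: the select against -inf exists because C99 pow is not plain sqrt at the negative-infinity corner: pow(-inf, 0.5) is +inf, while sqrt(-inf) is NaN. A standalone demonstration of the corner case the expansion must preserve (assumes IEEE-754 / Annex F semantics):

    #include <cassert>
    #include <cmath>

    int main() {
      double x = -INFINITY;
      // C99 Annex F: pow(-inf, y) == +inf for y > 0 and y not an odd integer.
      assert(std::pow(x, 0.5) == INFINITY);
      // Naive sqrt would produce NaN here, hence the fcmp/select in the IR:
      // pow(x, 0.5) -> x == -inf ? +inf : fabs(sqrt(x)).
      assert(std::isnan(std::sqrt(x)));
      double expanded = (x == -INFINITY) ? INFINITY : std::fabs(std::sqrt(x));
      assert(expanded == std::pow(x, 0.5));
      return 0;
    }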
@@ -1173,11 +1219,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Value *Op = CI->getArgOperand(0);
// Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
// Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
- LibFunc::Func LdExp = LibFunc::ldexpl;
+ LibFunc LdExp = LibFunc_ldexpl;
if (Op->getType()->isFloatTy())
- LdExp = LibFunc::ldexpf;
+ LdExp = LibFunc_ldexpf;
else if (Op->getType()->isDoubleTy())
- LdExp = LibFunc::ldexp;
+ LdExp = LibFunc_ldexp;
if (TLI->has(LdExp)) {
Value *LdExpArg = nullptr;
@@ -1197,7 +1243,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Module *M = CI->getModule();
Value *NewCallee =
M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(),
- Op->getType(), B.getInt32Ty(), nullptr);
+ Op->getType(), B.getInt32Ty());
CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg});
if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
CI->setCallingConv(F->getCallingConv());
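Editor's note: the exp2 rewrite relies on the identity exp2(n) == ldexp(1.0, n) for integral n, which is what the sitofp/uitofp operand guarantees. A standalone check over a small range:

    #include <cassert>
    #include <cmath>

    int main() {
      // exp2((double)n) and ldexp(1.0, n) are the same power of two, so the
      // libcall can be replaced whenever the argument is a converted integer.
      for (int n = -20; n <= 20; ++n)
        assert(std::exp2((double)n) == std::ldexp(1.0, n));
      return 0;
    }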
@@ -1208,15 +1254,6 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
return Ret;
}
-Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
- Function *Callee = CI->getCalledFunction();
- StringRef Name = Callee->getName();
- if (Name == "fabs" && hasFloatVersion(Name))
- return optimizeUnaryDoubleFP(CI, B, false);
-
- return nullptr;
-}
-
Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
// If we can shrink the call to a float function rather than a double
@@ -1280,17 +1317,17 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
FMF.setUnsafeAlgebra();
B.setFastMathFlags(FMF);
- LibFunc::Func Func;
+ LibFunc Func;
Function *F = OpC->getCalledFunction();
if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
- Func == LibFunc::pow) || F->getIntrinsicID() == Intrinsic::pow))
+ Func == LibFunc_pow) || F->getIntrinsicID() == Intrinsic::pow))
return B.CreateFMul(OpC->getArgOperand(1),
emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B,
Callee->getAttributes()), "mul");
// log(exp2(y)) -> y*log(2)
if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) &&
- TLI->has(Func) && Func == LibFunc::exp2)
+ TLI->has(Func) && Func == LibFunc_exp2)
return B.CreateFMul(
OpC->getArgOperand(0),
emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0),
@@ -1302,8 +1339,11 @@ Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
Value *Ret = nullptr;
- if (TLI->has(LibFunc::sqrtf) && (Callee->getName() == "sqrt" ||
- Callee->getIntrinsicID() == Intrinsic::sqrt))
+  // TODO: Once we have a way (other than checking for the existence of the
+ // libcall) to tell whether our target can lower @llvm.sqrt, relax the
+ // condition below.
+ if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
+ Callee->getIntrinsicID() == Intrinsic::sqrt))
Ret = optimizeUnaryDoubleFP(CI, B, true);
if (!CI->hasUnsafeAlgebra())
@@ -1385,12 +1425,12 @@ Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
// tan(atan(x)) -> x
// tanf(atanf(x)) -> x
// tanl(atanl(x)) -> x
- LibFunc::Func Func;
+ LibFunc Func;
Function *F = OpC->getCalledFunction();
if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
- ((Func == LibFunc::atan && Callee->getName() == "tan") ||
- (Func == LibFunc::atanf && Callee->getName() == "tanf") ||
- (Func == LibFunc::atanl && Callee->getName() == "tanl")))
+ ((Func == LibFunc_atan && Callee->getName() == "tan") ||
+ (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
+ (Func == LibFunc_atanl && Callee->getName() == "tanl")))
Ret = OpC->getArgOperand(0);
return Ret;
}
@@ -1418,16 +1458,16 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
// x86_64 can't use {float, float} since that would be returned in both
// xmm0 and xmm1, which isn't what a real struct would do.
ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(VectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy, nullptr));
+ ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
} else {
Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy, nullptr);
+ ResTy = StructType::get(ArgTy, ArgTy);
}
Module *M = OrigCallee->getParent();
Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
- ResTy, ArgTy, nullptr);
+ ResTy, ArgTy);
if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
// If the argument is an instruction, it must dominate all uses so put our
@@ -1508,24 +1548,24 @@ void LibCallSimplifier::classifyArgUse(
return;
Function *Callee = CI->getCalledFunction();
- LibFunc::Func Func;
+ LibFunc Func;
if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
!isTrigLibCall(CI))
return;
if (IsFloat) {
- if (Func == LibFunc::sinpif)
+ if (Func == LibFunc_sinpif)
SinCalls.push_back(CI);
- else if (Func == LibFunc::cospif)
+ else if (Func == LibFunc_cospif)
CosCalls.push_back(CI);
- else if (Func == LibFunc::sincospif_stret)
+ else if (Func == LibFunc_sincospif_stret)
SinCosCalls.push_back(CI);
} else {
- if (Func == LibFunc::sinpi)
+ if (Func == LibFunc_sinpi)
SinCalls.push_back(CI);
- else if (Func == LibFunc::cospi)
+ else if (Func == LibFunc_cospi)
CosCalls.push_back(CI);
- else if (Func == LibFunc::sincospi_stret)
+ else if (Func == LibFunc_sincospi_stret)
SinCosCalls.push_back(CI);
}
}
@@ -1609,14 +1649,14 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
// Proceedings of PACT'98, Oct. 1998, IEEE
if (!CI->hasFnAttr(Attribute::Cold) &&
isReportingError(Callee, CI, StreamArg)) {
- CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
}
return nullptr;
}
static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
- if (!ColdErrorCalls || !Callee || !Callee->isDeclaration())
+ if (!Callee || !Callee->isDeclaration())
return false;
if (StreamArg < 0)
@@ -1699,7 +1739,7 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) {
// printf(format, ...) -> iprintf(format, ...) if no floating point
// arguments.
- if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) {
+ if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
Module *M = B.GetInsertBlock()->getParent()->getParent();
Constant *IPrintFFn =
M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
@@ -1780,7 +1820,7 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
// sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
// point arguments.
- if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
+ if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
Module *M = B.GetInsertBlock()->getParent()->getParent();
Constant *SIPrintFFn =
M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
@@ -1850,7 +1890,7 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
// fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
// floating point arguments.
- if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
+ if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
Module *M = B.GetInsertBlock()->getParent()->getParent();
Constant *FIPrintFFn =
M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
@@ -1929,7 +1969,7 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
}
bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
- LibFunc::Func Func;
+ LibFunc Func;
SmallString<20> FloatFuncName = FuncName;
FloatFuncName += 'f';
if (TLI->getLibFunc(FloatFuncName, Func))
@@ -1939,7 +1979,7 @@ bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
IRBuilder<> &Builder) {
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
// Check for string/memory library functions.
if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
@@ -1948,52 +1988,54 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
isCallingConvCCompatible(CI)) &&
"Optimizing string/memory libcall would change the calling convention");
switch (Func) {
- case LibFunc::strcat:
+ case LibFunc_strcat:
return optimizeStrCat(CI, Builder);
- case LibFunc::strncat:
+ case LibFunc_strncat:
return optimizeStrNCat(CI, Builder);
- case LibFunc::strchr:
+ case LibFunc_strchr:
return optimizeStrChr(CI, Builder);
- case LibFunc::strrchr:
+ case LibFunc_strrchr:
return optimizeStrRChr(CI, Builder);
- case LibFunc::strcmp:
+ case LibFunc_strcmp:
return optimizeStrCmp(CI, Builder);
- case LibFunc::strncmp:
+ case LibFunc_strncmp:
return optimizeStrNCmp(CI, Builder);
- case LibFunc::strcpy:
+ case LibFunc_strcpy:
return optimizeStrCpy(CI, Builder);
- case LibFunc::stpcpy:
+ case LibFunc_stpcpy:
return optimizeStpCpy(CI, Builder);
- case LibFunc::strncpy:
+ case LibFunc_strncpy:
return optimizeStrNCpy(CI, Builder);
- case LibFunc::strlen:
+ case LibFunc_strlen:
return optimizeStrLen(CI, Builder);
- case LibFunc::strpbrk:
+ case LibFunc_strpbrk:
return optimizeStrPBrk(CI, Builder);
- case LibFunc::strtol:
- case LibFunc::strtod:
- case LibFunc::strtof:
- case LibFunc::strtoul:
- case LibFunc::strtoll:
- case LibFunc::strtold:
- case LibFunc::strtoull:
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
return optimizeStrTo(CI, Builder);
- case LibFunc::strspn:
+ case LibFunc_strspn:
return optimizeStrSpn(CI, Builder);
- case LibFunc::strcspn:
+ case LibFunc_strcspn:
return optimizeStrCSpn(CI, Builder);
- case LibFunc::strstr:
+ case LibFunc_strstr:
return optimizeStrStr(CI, Builder);
- case LibFunc::memchr:
+ case LibFunc_memchr:
return optimizeMemChr(CI, Builder);
- case LibFunc::memcmp:
+ case LibFunc_memcmp:
return optimizeMemCmp(CI, Builder);
- case LibFunc::memcpy:
+ case LibFunc_memcpy:
return optimizeMemCpy(CI, Builder);
- case LibFunc::memmove:
+ case LibFunc_memmove:
return optimizeMemMove(CI, Builder);
- case LibFunc::memset:
+ case LibFunc_memset:
return optimizeMemSet(CI, Builder);
+ case LibFunc_wcslen:
+ return optimizeWcslen(CI, Builder);
default:
break;
}
@@ -2005,7 +2047,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
if (CI->isNoBuiltin())
return nullptr;
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
StringRef FuncName = Callee->getName();
@@ -2029,8 +2071,6 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return optimizePow(CI, Builder);
case Intrinsic::exp2:
return optimizeExp2(CI, Builder);
- case Intrinsic::fabs:
- return optimizeFabs(CI, Builder);
case Intrinsic::log:
return optimizeLog(CI, Builder);
case Intrinsic::sqrt:
@@ -2067,114 +2107,117 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
return V;
switch (Func) {
- case LibFunc::cosf:
- case LibFunc::cos:
- case LibFunc::cosl:
+ case LibFunc_cosf:
+ case LibFunc_cos:
+ case LibFunc_cosl:
return optimizeCos(CI, Builder);
- case LibFunc::sinpif:
- case LibFunc::sinpi:
- case LibFunc::cospif:
- case LibFunc::cospi:
+ case LibFunc_sinpif:
+ case LibFunc_sinpi:
+ case LibFunc_cospif:
+ case LibFunc_cospi:
return optimizeSinCosPi(CI, Builder);
- case LibFunc::powf:
- case LibFunc::pow:
- case LibFunc::powl:
+ case LibFunc_powf:
+ case LibFunc_pow:
+ case LibFunc_powl:
return optimizePow(CI, Builder);
- case LibFunc::exp2l:
- case LibFunc::exp2:
- case LibFunc::exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
return optimizeExp2(CI, Builder);
- case LibFunc::fabsf:
- case LibFunc::fabs:
- case LibFunc::fabsl:
- return optimizeFabs(CI, Builder);
- case LibFunc::sqrtf:
- case LibFunc::sqrt:
- case LibFunc::sqrtl:
+ case LibFunc_fabsf:
+ case LibFunc_fabs:
+ case LibFunc_fabsl:
+ return replaceUnaryCall(CI, Builder, Intrinsic::fabs);
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtl:
return optimizeSqrt(CI, Builder);
- case LibFunc::ffs:
- case LibFunc::ffsl:
- case LibFunc::ffsll:
+ case LibFunc_ffs:
+ case LibFunc_ffsl:
+ case LibFunc_ffsll:
return optimizeFFS(CI, Builder);
- case LibFunc::fls:
- case LibFunc::flsl:
- case LibFunc::flsll:
+ case LibFunc_fls:
+ case LibFunc_flsl:
+ case LibFunc_flsll:
return optimizeFls(CI, Builder);
- case LibFunc::abs:
- case LibFunc::labs:
- case LibFunc::llabs:
+ case LibFunc_abs:
+ case LibFunc_labs:
+ case LibFunc_llabs:
return optimizeAbs(CI, Builder);
- case LibFunc::isdigit:
+ case LibFunc_isdigit:
return optimizeIsDigit(CI, Builder);
- case LibFunc::isascii:
+ case LibFunc_isascii:
return optimizeIsAscii(CI, Builder);
- case LibFunc::toascii:
+ case LibFunc_toascii:
return optimizeToAscii(CI, Builder);
- case LibFunc::printf:
+ case LibFunc_printf:
return optimizePrintF(CI, Builder);
- case LibFunc::sprintf:
+ case LibFunc_sprintf:
return optimizeSPrintF(CI, Builder);
- case LibFunc::fprintf:
+ case LibFunc_fprintf:
return optimizeFPrintF(CI, Builder);
- case LibFunc::fwrite:
+ case LibFunc_fwrite:
return optimizeFWrite(CI, Builder);
- case LibFunc::fputs:
+ case LibFunc_fputs:
return optimizeFPuts(CI, Builder);
- case LibFunc::log:
- case LibFunc::log10:
- case LibFunc::log1p:
- case LibFunc::log2:
- case LibFunc::logb:
+ case LibFunc_log:
+ case LibFunc_log10:
+ case LibFunc_log1p:
+ case LibFunc_log2:
+ case LibFunc_logb:
return optimizeLog(CI, Builder);
- case LibFunc::puts:
+ case LibFunc_puts:
return optimizePuts(CI, Builder);
- case LibFunc::tan:
- case LibFunc::tanf:
- case LibFunc::tanl:
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
return optimizeTan(CI, Builder);
- case LibFunc::perror:
+ case LibFunc_perror:
return optimizeErrorReporting(CI, Builder);
- case LibFunc::vfprintf:
- case LibFunc::fiprintf:
+ case LibFunc_vfprintf:
+ case LibFunc_fiprintf:
return optimizeErrorReporting(CI, Builder, 0);
- case LibFunc::fputc:
+ case LibFunc_fputc:
return optimizeErrorReporting(CI, Builder, 1);
- case LibFunc::ceil:
- case LibFunc::floor:
- case LibFunc::rint:
- case LibFunc::round:
- case LibFunc::nearbyint:
- case LibFunc::trunc:
- if (hasFloatVersion(FuncName))
- return optimizeUnaryDoubleFP(CI, Builder, false);
- return nullptr;
- case LibFunc::acos:
- case LibFunc::acosh:
- case LibFunc::asin:
- case LibFunc::asinh:
- case LibFunc::atan:
- case LibFunc::atanh:
- case LibFunc::cbrt:
- case LibFunc::cosh:
- case LibFunc::exp:
- case LibFunc::exp10:
- case LibFunc::expm1:
- case LibFunc::sin:
- case LibFunc::sinh:
- case LibFunc::tanh:
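+ // ceil, floor, round, nearbyint, rint and trunc map exactly onto the
+ // corresponding LLVM intrinsics, so replace the calls outright instead of
+ // only shrinking double-precision calls to their float variants.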
+ case LibFunc_ceil:
+ return replaceUnaryCall(CI, Builder, Intrinsic::ceil);
+ case LibFunc_floor:
+ return replaceUnaryCall(CI, Builder, Intrinsic::floor);
+ case LibFunc_round:
+ return replaceUnaryCall(CI, Builder, Intrinsic::round);
+ case LibFunc_nearbyint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
+ case LibFunc_rint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::rint);
+ case LibFunc_trunc:
+ return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
+ case LibFunc_acos:
+ case LibFunc_acosh:
+ case LibFunc_asin:
+ case LibFunc_asinh:
+ case LibFunc_atan:
+ case LibFunc_atanh:
+ case LibFunc_cbrt:
+ case LibFunc_cosh:
+ case LibFunc_exp:
+ case LibFunc_exp10:
+ case LibFunc_expm1:
+ case LibFunc_sin:
+ case LibFunc_sinh:
+ case LibFunc_tanh:
if (UnsafeFPShrink && hasFloatVersion(FuncName))
return optimizeUnaryDoubleFP(CI, Builder, true);
return nullptr;
- case LibFunc::copysign:
+ case LibFunc_copysign:
if (hasFloatVersion(FuncName))
return optimizeBinaryDoubleFP(CI, Builder);
return nullptr;
- case LibFunc::fminf:
- case LibFunc::fmin:
- case LibFunc::fminl:
- case LibFunc::fmaxf:
- case LibFunc::fmax:
- case LibFunc::fmaxl:
+ case LibFunc_fminf:
+ case LibFunc_fmin:
+ case LibFunc_fminl:
+ case LibFunc_fmaxf:
+ case LibFunc_fmax:
+ case LibFunc_fmaxl:
return optimizeFMinFMax(CI, Builder);
default:
return nullptr;
@@ -2211,16 +2254,10 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
// * log(exp10(y)) -> y*log(10)
// * log(sqrt(x)) -> 0.5*log(x)
//
-// lround, lroundf, lroundl:
-// * lround(cnst) -> cnst'
-//
// pow, powf, powl:
// * pow(sqrt(x),y) -> pow(x,y*0.5)
// * pow(pow(x,y),z)-> pow(x,y*z)
//
-// round, roundf, roundl:
-// * round(cnst) -> cnst'
-//
// signbit:
// * signbit(cnst) -> cnst'
// * signbit(nncst) -> 0 (if pstv is a non-negative constant)
@@ -2230,10 +2267,6 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
//
-// trunc, truncf, truncl:
-// * trunc(cnst) -> cnst'
-//
-//
//===----------------------------------------------------------------------===//
// Fortified Library Call Optimizations
@@ -2247,7 +2280,7 @@ bool FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
return true;
if (ConstantInt *ObjSizeCI =
dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
- if (ObjSizeCI->isAllOnesValue())
+ if (ObjSizeCI->isMinusOne())
return true;
// If the object size wasn't -1 (unknown), bail out if we were asked to.
if (OnlyLowerUnknownSize)
@@ -2300,7 +2333,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
IRBuilder<> &B,
- LibFunc::Func Func) {
+ LibFunc Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
const DataLayout &DL = CI->getModule()->getDataLayout();
@@ -2308,7 +2341,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
*ObjSize = CI->getArgOperand(2);
// __stpcpy_chk(x,x,...) -> x+strlen(x)
- if (Func == LibFunc::stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
+ if (Func == LibFunc_stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
Value *StrLen = emitStrLen(Src, B, DL, TLI);
return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
}
@@ -2334,14 +2367,14 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
// If the function was an __stpcpy_chk, and we were able to fold it into
// a __memcpy_chk, we still need to return the correct end pointer.
- if (Ret && Func == LibFunc::stpcpy_chk)
+ if (Ret && Func == LibFunc_stpcpy_chk)
return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
return Ret;
}
Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
IRBuilder<> &B,
- LibFunc::Func Func) {
+ LibFunc Func) {
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
if (isFortifiedCallFoldable(CI, 3, 2, false)) {
@@ -2366,7 +2399,7 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
//
// PR23093.
- LibFunc::Func Func;
+ LibFunc Func;
Function *Callee = CI->getCalledFunction();
SmallVector<OperandBundleDef, 2> OpBundles;
@@ -2384,17 +2417,17 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
return nullptr;
switch (Func) {
- case LibFunc::memcpy_chk:
+ case LibFunc_memcpy_chk:
return optimizeMemCpyChk(CI, Builder);
- case LibFunc::memmove_chk:
+ case LibFunc_memmove_chk:
return optimizeMemMoveChk(CI, Builder);
- case LibFunc::memset_chk:
+ case LibFunc_memset_chk:
return optimizeMemSetChk(CI, Builder);
- case LibFunc::stpcpy_chk:
- case LibFunc::strcpy_chk:
+ case LibFunc_stpcpy_chk:
+ case LibFunc_strcpy_chk:
return optimizeStrpCpyChk(CI, Builder, Func);
- case LibFunc::stpncpy_chk:
- case LibFunc::strncpy_chk:
+ case LibFunc_stpncpy_chk:
+ case LibFunc_strncpy_chk:
return optimizeStrpNCpyChk(CI, Builder, Func);
default:
break;
diff --git a/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
index f3d3fad..49dc15c 100644
--- a/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -20,8 +20,8 @@
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
using namespace llvm;
diff --git a/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
index 66dbf33..cd0378e 100644
--- a/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/IPO.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
using namespace llvm;
namespace {
diff --git a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index 6d13663..2010755 100644
--- a/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -59,9 +59,9 @@
#define DEBUG_TYPE "symbol-rewriter"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
-#include "llvm/Pass.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MemoryBuffer.h"
diff --git a/contrib/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
index 7b9de2e..f6c7d1c 100644
--- a/contrib/llvm/lib/Transforms/Utils/Utils.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/Utils.cpp
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/InitializePasses.h"
#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"
using namespace llvm;
@@ -35,9 +35,8 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializeUnifyFunctionExitNodesPass(Registry);
initializeInstSimplifierPass(Registry);
initializeMetaRenamerPass(Registry);
- initializeMemorySSAWrapperPassPass(Registry);
- initializeMemorySSAPrinterLegacyPassPass(Registry);
initializeStripGCRelocatesPass(Registry);
+ initializePredicateInfoPrinterLegacyPassPass(Registry);
}
/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp b/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp
new file mode 100644
index 0000000..c3feea6
--- /dev/null
+++ b/contrib/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -0,0 +1,495 @@
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "vncoerce"
+namespace llvm {
+namespace VNCoercion {
+
+/// Return true if coerceAvailableValueToLoadType will succeed.
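+/// For example, an i64 store can feed an i32 load of the same address, but a
+/// store or load of a first-class aggregate such as [2 x i32] is rejected
+/// because it cannot be bitcast to an integer.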
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+ const DataLayout &DL) {
+ // If the loaded or stored value is a first class array or struct, don't try
+ // to transform it. We need to be able to bitcast to integer.
+ if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
+ StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
+ return false;
+
+ // The store has to be at least as big as the load.
+ if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy))
+ return false;
+
+ // Don't coerce non-integral pointers to integers or vice versa.
+ if (DL.isNonIntegralPointerType(StoredVal->getType()) !=
+ DL.isNonIntegralPointerType(LoadTy))
+ return false;
+
+ return true;
+}
+
+template <class T, class HelperClass>
+static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+ "precondition violation - materialization can't fail");
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ // If this is already the right type, just return it.
+ Type *StoredValTy = StoredVal->getType();
+
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+
+ // If the store and reload are the same size, we can always reuse it.
+ if (StoredValSize == LoadedValSize) {
+ // Pointer to Pointer -> use bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy() && LoadedTy->isPtrOrPtrVectorTy()) {
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ } else {
+ // Convert source pointers to integers, which can be bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ Type *TypeToCastTo = LoadedTy;
+ if (TypeToCastTo->isPtrOrPtrVectorTy())
+ TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+ if (StoredValTy != TypeToCastTo)
+ StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
+
+ // Cast to pointer if the load needs a pointer type.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ return StoredVal;
+ }
+ // If the loaded value is smaller than the available value, then we can
+ // extract out a piece from it. If the available value is too small, then we
+ // can't do anything.
+ assert(StoredValSize >= LoadedValSize &&
+ "canCoerceMustAliasedValueToLoad fail");
+
+ // Convert source pointers to integers, which can be manipulated.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ // Convert vectors and fp to integer, which can be manipulated.
+ if (!StoredValTy->isIntegerTy()) {
+ StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
+ StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
+ }
+
+ // If this is a big-endian system, we need to shift the value down to the low
+ // bits so that a truncate will work.
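+ // For example, an i64 store feeding an i32 load shifts right by 32 bits:
+ // on big-endian targets the four lowest-addressed bytes live in the high
+ // half of the integer.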
+ if (DL.isBigEndian()) {
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
+ DL.getTypeStoreSizeInBits(LoadedTy);
+ StoredVal = Helper.CreateLShr(
+ StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
+ }
+
+ // Truncate the integer to the right size now.
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
+ StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
+
+ if (LoadedTy != NewIntTy) {
+ // If the result is a pointer, inttoptr.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ else
+ // Otherwise, bitcast.
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+ StoredVal = FoldedStoredVal;
+
+ return StoredVal;
+}
+
+/// If we saw a store of a value to memory, and
+/// then a load from a must-aliased pointer of a different type, try to coerce
+/// the stored value. LoadedTy is the type of the load we want to replace.
+/// IRB is the IRBuilder used to insert new instructions.
+///
+/// If we can't do it, return null.
+Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+ IRBuilder<> &IRB, const DataLayout &DL) {
+ return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
+}
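+// Sketch of a typical use from a pass like GVN, assuming a store `SI` that
+// must-aliases a load `LI`:
+//   Value *StoredVal = SI->getValueOperand();
+//   if (canCoerceMustAliasedValueToLoad(StoredVal, LI->getType(), DL)) {
+//     IRBuilder<> B(LI);
+//     LI->replaceAllUsesWith(
+//         coerceAvailableValueToLoadType(StoredVal, LI->getType(), B, DL));
+//   }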
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering memory write (store, memset, memcpy, memmove). This
+/// means that the write *may* provide bits used by the load but we can't be
+/// sure because the pointers don't must-alias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up. This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
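+/// For example, a 16-byte store at base+0 clobbering an i32 load at base+8
+/// returns 8: the load reads bytes 8..11 of the stored value. A 4-byte store
+/// at base+0 and a 4-byte load at base+8 do not overlap and return -1.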
+static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+ Value *WritePtr,
+ uint64_t WriteSizeInBits,
+ const DataLayout &DL) {
+ // If the loaded or stored value is a first class array or struct, don't try
+ // to transform them. We need to be able to bitcast to integer.
+ if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+ return -1;
+
+ int64_t StoreOffset = 0, LoadOffset = 0;
+ Value *StoreBase =
+ GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+ Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+ if (StoreBase != LoadBase)
+ return -1;
+
+ // If the load and store are to the exact same address, they should have been
+ // a must alias. AA must have gotten confused.
+ // FIXME: Study to see if/when this happens. One case is forwarding a memset
+ // to a load from the base of the memset.
+
+ // If the load and store don't overlap at all, the store doesn't provide
+ // anything to the load. In this case, they really don't alias at all, AA
+ // must have gotten confused.
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+
+ if ((WriteSizeInBits & 7) | (LoadSize & 7))
+ return -1;
+ uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
+ LoadSize /= 8;
+
+ bool isAAFailure = false;
+ if (StoreOffset < LoadOffset)
+ isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
+ else
+ isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
+
+ if (isAAFailure)
+ return -1;
+
+ // If the load isn't completely contained within the stored bits, we don't
+ // have all the bits to feed it. We could do something crazy in the future
+ // (issue a smaller load and then merge the bits in), but this seems unlikely
+ // to be valuable.
+ if (StoreOffset > LoadOffset ||
+ StoreOffset + StoreSize < LoadOffset + LoadSize)
+ return -1;
+
+ // Okay, we can do this transformation. Return the number of bytes into the
+ // store that the load is.
+ return LoadOffset - StoreOffset;
+}
+
+/// This function is called when we have a memdep query of a load that ends
+/// up being a clobbering store.
+int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+ StoreInst *DepSI, const DataLayout &DL) {
+ // Cannot handle reading from store of first-class aggregate yet.
+ if (DepSI->getValueOperand()->getType()->isStructTy() ||
+ DepSI->getValueOperand()->getType()->isArrayTy())
+ return -1;
+
+ Value *StorePtr = DepSI->getPointerOperand();
+ uint64_t StoreSize =
+ DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
+ DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends
+/// up being clobbered by another load. See if the other load can feed into
+/// the second load.
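+/// For example, if DepLI is an i8 load and we later load an i32 from the same
+/// address, the i8 load does not cover the wider load, but MDA may allow
+/// widening DepLI to i32 so that it can feed the second load.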
+int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
+ const DataLayout &DL) {
+ // Cannot handle reading from store of first-class aggregate yet.
+ if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+ return -1;
+
+ Value *DepPtr = DepLI->getPointerOperand();
+ uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+ int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+ if (R != -1)
+ return R;
+
+ // If we have a load/load clobber and DepLI can be widened to cover this
+ // load, then we should widen it!
+ int64_t LoadOffs = 0;
+ const Value *LoadBase =
+ GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+
+ unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
+ LoadBase, LoadOffs, LoadSize, DepLI);
+ if (Size == 0)
+ return -1;
+
+ // Check non-obvious conditions enforced by MDA that we rely on to be able
+ // to materialize this potentially available value.
+ assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+}
+
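+/// Handles a memdep query of a load clobbered by a mem intrinsic. For
+/// example, an i32 load at offset 4 into a memcpy whose source is a constant
+/// global can be folded to bytes 4..7 of that global, while a copy from a
+/// non-constant source always yields -1.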
+int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+ MemIntrinsic *MI, const DataLayout &DL) {
+ // If the mem operation is a non-constant size, we can't handle it.
+ ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+ if (!SizeCst)
+ return -1;
+ uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
+
+ // If this is a memset, we just need to see if the offset is valid within
+ // the size of the memset.
+ if (MI->getIntrinsicID() == Intrinsic::memset)
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+
+ // If we have a memcpy/memmove, the only case we can handle is if this is a
+ // copy from constant memory. In that case, we can read directly from the
+ // constant memory.
+ MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+ Constant *Src = dyn_cast<Constant>(MTI->getSource());
+ if (!Src)
+ return -1;
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
+ if (!GV || !GV->isConstant())
+ return -1;
+
+ // See if the access is within the bounds of the transfer.
+ int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+ if (Offset == -1)
+ return Offset;
+
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ Src =
+ ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+ OffsetCst);
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
+ return Offset;
+ return -1;
+}
+
+template <class T, class HelperClass>
+static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
+ uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+ uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+ // Compute which bits of the stored value are being used by the load. Convert
+ // to an integer type to start with.
+ if (SrcVal->getType()->isPtrOrPtrVectorTy())
+ SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
+ if (!SrcVal->getType()->isIntegerTy())
+ SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
+
+ // Shift the bits to the least significant depending on endianness.
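+ // For example, an i64 store and an i32 load at Offset 4: little-endian
+ // shifts right by 32 bits, while big-endian shifts by (8 - 4 - 4) * 8 = 0
+ // because bytes 4..7 already sit in the low half of a big-endian i64.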
+ unsigned ShiftAmt;
+ if (DL.isLittleEndian())
+ ShiftAmt = Offset * 8;
+ else
+ ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
+ if (ShiftAmt)
+ SrcVal = Helper.CreateLShr(SrcVal,
+ ConstantInt::get(SrcVal->getType(), ShiftAmt));
+
+ if (LoadSize != StoreSize)
+ SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
+ IntegerType::get(Ctx, LoadSize * 8));
+ return SrcVal;
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store. This means that the store provides bits used by
+/// the load but the pointers don't must-alias. Check this case to see if
+/// there is anything more we can do before we give up.
+Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+
+ IRBuilder<> Builder(InsertPt);
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
+}
+
+Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ ConstantFolder F;
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering load. This means that the load *may* provide bits used
+/// by the load but we can't be sure because the pointers don't must-alias.
+/// Check this case to see if there is anything more we can do before we give
+/// up.
+Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+ // If Offset+LoadTy exceeds the size of SrcVal, then we need to widen SrcVal
+ // out to a larger load.
+ unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ if (Offset + LoadSize > SrcValStoreSize) {
+ assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+ // If we have a load/load clobber and SrcVal can be widened to cover this
+ // load, then we should widen it to the next power of 2 size big enough!
+ unsigned NewLoadSize = Offset + LoadSize;
+ if (!isPowerOf2_32(NewLoadSize))
+ NewLoadSize = NextPowerOf2(NewLoadSize);
+
+ Value *PtrVal = SrcVal->getPointerOperand();
+ // Insert the new load after the old load. This ensures that subsequent
+ // memdep queries will find the new load. We can't easily remove the old
+ // load completely because it is already in the value numbering table.
+ IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+ Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
+ DestPTy =
+ PointerType::get(DestPTy, PtrVal->getType()->getPointerAddressSpace());
+ Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+ PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+ LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+ NewLoad->takeName(SrcVal);
+ NewLoad->setAlignment(SrcVal->getAlignment());
+
+ DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+ // Replace uses of the original load with the wider load. On a big endian
+ // system, we need to shift down to get the relevant bits.
+ Value *RV = NewLoad;
+ if (DL.isBigEndian())
+ RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
+ RV = Builder.CreateTrunc(RV, SrcVal->getType());
+ SrcVal->replaceAllUsesWith(RV);
+
+ SrcVal = NewLoad;
+ }
+
+ return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ if (Offset + LoadSize > SrcValStoreSize)
+ return nullptr;
+ return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
+}
+
+template <class T, class HelperClass>
+T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = LoadTy->getContext();
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+
+ // We know that this method is only called when the mem transfer fully
+ // provides the bits for the load.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+ // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+ // regardless of what the offset is.
+ T *Val = cast<T>(MSI->getValue());
+ if (LoadSize != 1)
+ Val =
+ Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
+ T *OneElt = Val;
+
+ // Splat the value out to the right number of bits.
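+ // For example, a one-byte value 0xAB with LoadSize == 4 becomes 0xABABABAB:
+ // 0xAB -> 0xABAB -> 0xABABABAB, doubling the set bytes each time.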
+ for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
+ // If we can double the number of bytes set, do it.
+ if (NumBytesSet * 2 <= LoadSize) {
+ T *ShVal = Helper.CreateShl(
+ Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
+ Val = Helper.CreateOr(Val, ShVal);
+ NumBytesSet <<= 1;
+ continue;
+ }
+
+ // Otherwise insert one byte at a time.
+ T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
+ Val = Helper.CreateOr(OneElt, ShVal);
+ ++NumBytesSet;
+ }
+
+ return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
+ }
+
+ // Otherwise, this is a memcpy/memmove from a constant global.
+ MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+ Constant *Src = cast<Constant>(MTI->getSource());
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ Src =
+ ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+ OffsetCst);
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends
+/// up being a clobbering mem intrinsic.
+Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, Instruction *InsertPt,
+ const DataLayout &DL) {
+ IRBuilder<> Builder(InsertPt);
+ return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
+ LoadTy, Builder, DL);
+}
+
+Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ // The only case where analyzeLoadFromClobberingMemInst accepts a value that
+ // cannot be converted to a constant is a memset of a non-constant value.
+ if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
+ if (!isa<Constant>(MSI->getValue()))
+ return nullptr;
+ ConstantFolder F;
+ return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
+ LoadTy, F, DL);
+}
+} // namespace VNCoercion
+} // namespace llvm
diff --git a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 0e9baaf..9309729 100644
--- a/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -121,6 +121,8 @@ public:
void addFlags(RemapFlags Flags);
+ void remapGlobalObjectMetadata(GlobalObject &GO);
+
Value *mapValue(const Value *V);
void remapInstruction(Instruction *I);
void remapFunction(Function &F);
@@ -681,6 +683,7 @@ void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
if (Optional<Metadata *> MappedOp = getMappedOp(Old))
return *MappedOp;
+ (void)D;
assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
return &G.getFwdReference(*cast<MDNode>(Old));
});
@@ -801,6 +804,7 @@ void Mapper::flush() {
switch (E.Kind) {
case WorklistEntry::MapGlobalInit:
E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init));
+ remapGlobalObjectMetadata(*E.Data.GVInit.GV);
break;
case WorklistEntry::MapAppendingVar: {
unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers;
@@ -891,6 +895,14 @@ void Mapper::remapInstruction(Instruction *I) {
I->mutateType(TypeMapper->remapType(I->getType()));
}
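+// Remap all metadata attached to a global object (function or global
+// variable). This is shared by remapFunction below and the global-variable
+// initializer path in flush(), so attachments such as !dbg on globals are
+// remapped the same way as function metadata.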
+void Mapper::remapGlobalObjectMetadata(GlobalObject &GO) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+ GO.getAllMetadata(MDs);
+ GO.clearMetadata();
+ for (const auto &I : MDs)
+ GO.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
+}
+
void Mapper::remapFunction(Function &F) {
// Remap the operands.
for (Use &Op : F.operands())
@@ -898,11 +910,7 @@ void Mapper::remapFunction(Function &F) {
Op = mapValue(Op);
// Remap the metadata attachments.
- SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
- F.getAllMetadata(MDs);
- F.clearMetadata();
- for (const auto &I : MDs)
- F.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
+ remapGlobalObjectMetadata(F);
// Remap the argument types.
if (TypeMapper)
@@ -941,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
Constant *NewV;
if (IsOldCtorDtor) {
auto *S = cast<ConstantStruct>(V);
- auto *E1 = mapValue(S->getOperand(0));
- auto *E2 = mapValue(S->getOperand(1));
- Value *Null = Constant::getNullValue(VoidPtrTy);
- NewV =
- ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null, nullptr);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
} else {
NewV = cast_or_null<Constant>(mapValue(V));
}
diff --git a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
deleted file mode 100644
index c01740b..0000000
--- a/contrib/llvm/lib/Transforms/Vectorize/BBVectorize.cpp
+++ /dev/null
@@ -1,3269 +0,0 @@
-//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a basic-block vectorization pass. The algorithm was
-// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
-// et al. It works by looking for chains of pairable operations and then
-// pairing them.
-//
-//===----------------------------------------------------------------------===//
-
-#define BBV_NAME "bb-vectorize"
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-using namespace llvm;
-
-#define DEBUG_TYPE BBV_NAME
-
-static cl::opt<bool>
-IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
- cl::Hidden, cl::desc("Ignore target information"));
-
-static cl::opt<unsigned>
-ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
- cl::desc("The required chain depth for vectorization"));
-
-static cl::opt<bool>
-UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
- cl::Hidden, cl::desc("Use the chain depth requirement with"
- " target information"));
-
-static cl::opt<unsigned>
-SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
- cl::desc("The maximum search distance for instruction pairs"));
-
-static cl::opt<bool>
-SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
- cl::desc("Replicating one element to a pair breaks the chain"));
-
-static cl::opt<unsigned>
-VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
- cl::desc("The size of the native vector registers"));
-
-static cl::opt<unsigned>
-MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
- cl::desc("The maximum number of pairing iterations"));
-
-static cl::opt<bool>
-Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
- cl::desc("Don't try to form non-2^n-length vectors"));
-
-static cl::opt<unsigned>
-MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
- cl::desc("The maximum number of pairable instructions per group"));
-
-static cl::opt<unsigned>
-MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
- cl::desc("The maximum number of candidate instruction pairs per group"));
-
-static cl::opt<unsigned>
-MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
- cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
- " a full cycle check"));
-
-static cl::opt<bool>
-NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize boolean (i1) values"));
-
-static cl::opt<bool>
-NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize integer values"));
-
-static cl::opt<bool>
-NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize floating-point values"));
-
-// FIXME: This should default to false once pointer vector support works.
-static cl::opt<bool>
-NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
- cl::desc("Don't try to vectorize pointer values"));
-
-static cl::opt<bool>
-NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize casting (conversion) operations"));
-
-static cl::opt<bool>
-NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize floating-point math intrinsics"));
-
-static cl::opt<bool>
- NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize BitManipulation intrinsics"));
-
-static cl::opt<bool>
-NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
-
-static cl::opt<bool>
-NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize select instructions"));
-
-static cl::opt<bool>
-NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize comparison instructions"));
-
-static cl::opt<bool>
-NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize getelementptr instructions"));
-
-static cl::opt<bool>
-NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
- cl::desc("Don't try to vectorize loads and stores"));
-
-static cl::opt<bool>
-AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
- cl::desc("Only generate aligned loads and stores"));
-
-static cl::opt<bool>
-NoMemOpBoost("bb-vectorize-no-mem-op-boost",
- cl::init(false), cl::Hidden,
- cl::desc("Don't boost the chain-depth contribution of loads and stores"));
-
-static cl::opt<bool>
-FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
- cl::desc("Use a fast instruction dependency analysis"));
-
-#ifndef NDEBUG
-static cl::opt<bool>
-DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " instruction-examination process"));
-static cl::opt<bool>
-DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " candidate-selection process"));
-static cl::opt<bool>
-DebugPairSelection("bb-vectorize-debug-pair-selection",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " pair-selection process"));
-static cl::opt<bool>
-DebugCycleCheck("bb-vectorize-debug-cycle-check",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, output information on the"
- " cycle-checking process"));
-
-static cl::opt<bool>
-PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
- cl::init(false), cl::Hidden,
- cl::desc("When debugging is enabled, dump the basic block after"
- " every pair is fused"));
-#endif
-
-STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
-
-namespace {
- struct BBVectorize : public BasicBlockPass {
- static char ID; // Pass identification, replacement for typeid
-
- const VectorizeConfig Config;
-
- BBVectorize(const VectorizeConfig &C = VectorizeConfig())
- : BasicBlockPass(ID), Config(C) {
- initializeBBVectorizePass(*PassRegistry::getPassRegistry());
- }
-
- BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
- : BasicBlockPass(ID), Config(C) {
- AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
- DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- TTI = IgnoreTargetInfo
- ? nullptr
- : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- }
-
- typedef std::pair<Value *, Value *> ValuePair;
- typedef std::pair<ValuePair, int> ValuePairWithCost;
- typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
- typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
- typedef std::pair<VPPair, unsigned> VPPairWithType;
-
- AliasAnalysis *AA;
- DominatorTree *DT;
- ScalarEvolution *SE;
- const TargetLibraryInfo *TLI;
- const TargetTransformInfo *TTI;
-
- // FIXME: const correct?
-
- bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
-
- bool getCandidatePairs(BasicBlock &BB,
- BasicBlock::iterator &Start,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts, bool NonPow2Len);
-
- // FIXME: The current implementation does not account for pairs that
- // are connected in multiple ways. For example:
- // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
- enum PairConnectionType {
- PairConnectionDirect,
- PairConnectionSwap,
- PairConnectionSplat
- };
-
- void computeConnectedPairs(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes);
-
- void buildDepMap(BasicBlock &BB,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &PairableInstUsers);
-
- void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *>& ChosenPairs);
-
- void fuseChosenPairs(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *>& ChosenPairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
-
-
- bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
-
- bool areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len,
- int &CostSavings, int &FixedOrder);
-
- bool trackUsesOfI(DenseSet<Value *> &Users,
- AliasSetTracker &WriteSet, Instruction *I,
- Instruction *J, bool UpdateUsers = true,
- DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
-
- void computePairsConnectedTo(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- ValuePair P);
-
- bool pairsConflict(ValuePair P, ValuePair Q,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> >
- *PairableInstUserMap = nullptr,
- DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
-
- bool pairWillFormCycle(ValuePair P,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
- DenseSet<ValuePair> &CurrentPairs);
-
- void pruneDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG,
- DenseSet<ValuePair> &PrunedDAG, ValuePair J,
- bool UseCycleCheck);
-
- void buildInitialDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG, ValuePair J);
-
- void findBestDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
- int &BestEffSize, Value *II, std::vector<Value *>&JJ,
- bool UseCycleCheck);
-
- Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o);
-
- void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
- unsigned MaskOffset, unsigned NumInElem,
- unsigned NumInElem1, unsigned IdxOffset,
- std::vector<Constant*> &Mask);
-
- Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
- Instruction *J);
-
- bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
- unsigned o, Value *&LOp, unsigned numElemL,
- Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
- unsigned IdxOff = 0);
-
- Value *getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool IBeforeJ);
-
- void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
- Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
- bool IBeforeJ);
-
- void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
- Instruction *J, Instruction *K,
- Instruction *&InsertionPt, Instruction *&K1,
- Instruction *&K2);
-
- void collectPairLoadMoveSet(BasicBlock &BB,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I);
-
- void collectLoadMoveSet(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs);
-
- bool canMoveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I, Instruction *J);
-
- void moveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *&InsertionPt,
- Instruction *I, Instruction *J);
-
- bool vectorizeBB(BasicBlock &BB) {
- if (skipBasicBlock(BB))
- return false;
- if (!DT->isReachableFromEntry(&BB)) {
- DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
- " in " << BB.getParent()->getName() << "\n");
- return false;
- }
-
- DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
-
- bool changed = false;
- // Iterate a sufficient number of times to merge types of size 1 bit,
- // then 2 bits, then 4, etc. up to half of the target vector width of the
- // target vector register.
- unsigned n = 1;
- for (unsigned v = 2;
- (TTI || v <= Config.VectorBits) &&
- (!Config.MaxIter || n <= Config.MaxIter);
- v *= 2, ++n) {
- DEBUG(dbgs() << "BBV: fusing loop #" << n <<
- " for " << BB.getName() << " in " <<
- BB.getParent()->getName() << "...\n");
- if (vectorizePairs(BB))
- changed = true;
- else
- break;
- }
-
- if (changed && !Pow2LenOnly) {
- ++n;
- for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
- DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
- n << " for " << BB.getName() << " in " <<
- BB.getParent()->getName() << "...\n");
- if (!vectorizePairs(BB, true)) break;
- }
- }
-
- DEBUG(dbgs() << "BBV: done!\n");
- return changed;
- }
-
- bool runOnBasicBlock(BasicBlock &BB) override {
- // OptimizeNone check deferred to vectorizeBB().
-
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
- TTI = IgnoreTargetInfo
- ? nullptr
- : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *BB.getParent());
-
- return vectorizeBB(BB);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- BasicBlockPass::getAnalysisUsage(AU);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.setPreservesCFG();
- }
-
- static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
- assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
- "Cannot form vector from incompatible scalar types");
- Type *STy = ElemTy->getScalarType();
-
- unsigned numElem;
- if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
- numElem = VTy->getNumElements();
- } else {
- numElem = 1;
- }
-
- if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
- numElem += VTy->getNumElements();
- } else {
- numElem += 1;
- }
-
- return VectorType::get(STy, numElem);
- }
-
- static inline void getInstructionTypes(Instruction *I,
- Type *&T1, Type *&T2) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // For stores, it is the value type, not the pointer type that matters
- // because the value is what will come from a vector register.
-
- Value *IVal = SI->getValueOperand();
- T1 = IVal->getType();
- } else {
- T1 = I->getType();
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(I))
- T2 = CI->getSrcTy();
- else
- T2 = T1;
-
- if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- T2 = SI->getCondition()->getType();
- } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
- T2 = SI->getOperand(0)->getType();
- } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
- T2 = CI->getOperand(0)->getType();
- }
- }
-
- // Returns the weight associated with the provided value. A chain of
- // candidate pairs has a length given by the sum of the weights of its
- // members (one weight per pair; the weight of each member of the pair
- // is assumed to be the same). This length is then compared to the
- // chain-length threshold to determine if a given chain is significant
- // enough to be vectorized. The length is also used in comparing
- // candidate chains where longer chains are considered to be better.
- // Note: when this function returns 0, the resulting instructions are
- // not actually fused.
- inline size_t getDepthFactor(Value *V) {
- // InsertElement and ExtractElement have a depth factor of zero. This is
- // for two reasons: First, they cannot be usefully fused. Second, because
- // the pass generates a lot of these, they can confuse the simple metric
- // used to compare the dags in the next iteration. Thus, giving them a
- // weight of zero allows the pass to essentially ignore them in
- // subsequent iterations when looking for vectorization opportunities
- // while still tracking dependency chains that flow through those
- // instructions.
- if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
- return 0;
-
- // Give a load or store half of the required depth so that load/store
- // pairs will vectorize.
- if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
- return Config.ReqChainDepth/2;
-
- return 1;
- }
-
- // Returns the cost of the provided instruction using TTI.
- // This does not handle loads and stores.
- unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue) {
- switch (Opcode) {
- default: break;
- case Instruction::GetElementPtr:
- // We mark this instruction as zero-cost because scalar GEPs are usually
- // lowered to the instruction addressing mode. At the moment we don't
- // generate vector GEPs.
- return 0;
- case Instruction::Br:
- return TTI->getCFInstrCost(Opcode);
- case Instruction::PHI:
- return 0;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
- case Instruction::Select:
- case Instruction::ICmp:
- case Instruction::FCmp:
- return TTI->getCmpSelInstrCost(Opcode, T1, T2);
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast:
- case Instruction::ShuffleVector:
- return TTI->getCastInstrCost(Opcode, T1, T2);
- }
-
- return 1;
- }
-
- // This determines the relative offset of two loads or stores, returning
- // true if the offset could be determined to be some constant value.
- // For example, if OffsetInElmts == 1, then J accesses the memory directly
- // after I; if OffsetInElmts == -1 then I accesses the memory
- // directly after J.
- bool getPairPtrInfo(Instruction *I, Instruction *J,
- Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
- unsigned &IAddressSpace, unsigned &JAddressSpace,
- int64_t &OffsetInElmts, bool ComputeOffset = true) {
- OffsetInElmts = 0;
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- LoadInst *LJ = cast<LoadInst>(J);
- IPtr = LI->getPointerOperand();
- JPtr = LJ->getPointerOperand();
- IAlignment = LI->getAlignment();
- JAlignment = LJ->getAlignment();
- IAddressSpace = LI->getPointerAddressSpace();
- JAddressSpace = LJ->getPointerAddressSpace();
- } else {
- StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
- IPtr = SI->getPointerOperand();
- JPtr = SJ->getPointerOperand();
- IAlignment = SI->getAlignment();
- JAlignment = SJ->getAlignment();
- IAddressSpace = SI->getPointerAddressSpace();
- JAddressSpace = SJ->getPointerAddressSpace();
- }
-
- if (!ComputeOffset)
- return true;
-
- const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
- const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
-
- // If this is a trivial offset, then we'll get something like
- // 1*sizeof(type). With target data, which we need anyway, this will get
- // constant folded into a number.
- const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
- if (const SCEVConstant *ConstOffSCEV =
- dyn_cast<SCEVConstant>(OffsetSCEV)) {
- ConstantInt *IntOff = ConstOffSCEV->getValue();
- int64_t Offset = IntOff->getSExtValue();
- const DataLayout &DL = I->getModule()->getDataLayout();
- Type *VTy = IPtr->getType()->getPointerElementType();
- int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
-
- Type *VTy2 = JPtr->getType()->getPointerElementType();
- if (VTy != VTy2 && Offset < 0) {
- int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
- OffsetInElmts = Offset/VTy2TSS;
- return (std::abs(Offset) % VTy2TSS) == 0;
- }
-
- OffsetInElmts = Offset/VTyTSS;
- return (std::abs(Offset) % VTyTSS) == 0;
- }
-
- return false;
- }
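For illustration (editor's sketch; hypothetical IR, not from this diff), the
trivial-offset case described in the comment above:

  ;   %pI = getelementptr double, double* %a, i64 0    ; IPtr
  ;   %pJ = getelementptr double, double* %a, i64 1    ; JPtr
  ; getMinusSCEV(JPtrSCEV, IPtrSCEV) constant-folds to 8; with
  ; getTypeStoreSize(double) == 8, OffsetInElmts = 8/8 = 1, meaning that
  ; J accesses the element directly after I.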
-
- // Returns true if the provided CallInst represents an intrinsic that can
- // be vectorized.
- bool isVectorizableIntrinsic(CallInst* I) {
- Function *F = I->getCalledFunction();
- if (!F) return false;
-
- Intrinsic::ID IID = F->getIntrinsicID();
- if (!IID) return false;
-
- switch(IID) {
- default:
- return false;
- case Intrinsic::sqrt:
- case Intrinsic::powi:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::log:
- case Intrinsic::log2:
- case Intrinsic::log10:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::pow:
- case Intrinsic::round:
- case Intrinsic::copysign:
- case Intrinsic::ceil:
- case Intrinsic::nearbyint:
- case Intrinsic::rint:
- case Intrinsic::trunc:
- case Intrinsic::floor:
- case Intrinsic::fabs:
- case Intrinsic::minnum:
- case Intrinsic::maxnum:
- return Config.VectorizeMath;
- case Intrinsic::bswap:
- case Intrinsic::ctpop:
- case Intrinsic::ctlz:
- case Intrinsic::cttz:
- return Config.VectorizeBitManipulations;
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- return Config.VectorizeFMA;
- }
- }
-
- bool isPureIEChain(InsertElementInst *IE) {
- InsertElementInst *IENext = IE;
- do {
- if (!isa<UndefValue>(IENext->getOperand(0)) &&
- !isa<InsertElementInst>(IENext->getOperand(0))) {
- return false;
- }
- } while ((IENext =
- dyn_cast<InsertElementInst>(IENext->getOperand(0))));
-
- return true;
- }
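A sketch of what a "pure" insertelement chain looks like in IR (editor's
hypothetical example):

  ;   %v0 = insertelement <2 x float> undef, float %a, i32 0
  ;   %v1 = insertelement <2 x float> %v0,   float %b, i32 1
  ; Every operand 0 is undef or another insertelement, so the chain is
  ; pure; a chain seeded from any other value would not be.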
- };
-
- // This function implements one vectorization iteration on the provided
- // basic block. It returns true if the block is changed.
- bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
- bool ShouldContinue;
- BasicBlock::iterator Start = BB.getFirstInsertionPt();
-
- std::vector<Value *> AllPairableInsts;
- DenseMap<Value *, Value *> AllChosenPairs;
- DenseSet<ValuePair> AllFixedOrderPairs;
- DenseMap<VPPair, unsigned> AllPairConnectionTypes;
- DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
- AllConnectedPairDeps;
-
- do {
- std::vector<Value *> PairableInsts;
- DenseMap<Value *, std::vector<Value *> > CandidatePairs;
- DenseSet<ValuePair> FixedOrderPairs;
- DenseMap<ValuePair, int> CandidatePairCostSavings;
- ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
- FixedOrderPairs,
- CandidatePairCostSavings,
- PairableInsts, NonPow2Len);
- if (PairableInsts.empty()) continue;
-
- // Build the candidate pair set for faster lookups.
- DenseSet<ValuePair> CandidatePairsSet;
- for (DenseMap<Value *, std::vector<Value *> >::iterator I =
- CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
- for (std::vector<Value *>::iterator J = I->second.begin(),
- JE = I->second.end(); J != JE; ++J)
- CandidatePairsSet.insert(ValuePair(I->first, *J));
-
- // Now we have a map of all of the pairable instructions and we need to
- // select the best possible pairing. A good pairing is one such that the
- // users of the pair are also paired. This defines a (directed) forest
- // over the pairs such that two pairs are connected iff the second pair
- // uses the first.
-
- // Note that it only matters that both members of the second pair use some
- // element of the first pair (to allow for splatting).
-
- DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
- ConnectedPairDeps;
- DenseMap<VPPair, unsigned> PairConnectionTypes;
- computeConnectedPairs(CandidatePairs, CandidatePairsSet,
- PairableInsts, ConnectedPairs, PairConnectionTypes);
- if (ConnectedPairs.empty()) continue;
-
- for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
- I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
- I != IE; ++I)
- for (std::vector<ValuePair>::iterator J = I->second.begin(),
- JE = I->second.end(); J != JE; ++J)
- ConnectedPairDeps[*J].push_back(I->first);
-
- // Build the pairable-instruction dependency map
- DenseSet<ValuePair> PairableInstUsers;
- buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
-
- // There is now a graph of the connected pairs. For each variable, pick
- // the pairing with the largest dag meeting the depth requirement on at
- // least one branch. Then select all pairings that are part of that dag
- // and remove them from the list of available pairings and pairable
- // variables.
-
- DenseMap<Value *, Value *> ChosenPairs;
- choosePairs(CandidatePairs, CandidatePairsSet,
- CandidatePairCostSavings,
- PairableInsts, FixedOrderPairs, PairConnectionTypes,
- ConnectedPairs, ConnectedPairDeps,
- PairableInstUsers, ChosenPairs);
-
- if (ChosenPairs.empty()) continue;
- AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
- PairableInsts.end());
- AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
-
- // Only for the chosen pairs, propagate information on fixed-order pairs,
- // pair connections, and their types to the data structures used by the
- // pair fusion procedures.
- for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
- IE = ChosenPairs.end(); I != IE; ++I) {
- if (FixedOrderPairs.count(*I))
- AllFixedOrderPairs.insert(*I);
- else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
- AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
-
- for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
- J != IE; ++J) {
- DenseMap<VPPair, unsigned>::iterator K =
- PairConnectionTypes.find(VPPair(*I, *J));
- if (K != PairConnectionTypes.end()) {
- AllPairConnectionTypes.insert(*K);
- } else {
- K = PairConnectionTypes.find(VPPair(*J, *I));
- if (K != PairConnectionTypes.end())
- AllPairConnectionTypes.insert(*K);
- }
- }
- }
-
- for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
- I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
- I != IE; ++I)
- for (std::vector<ValuePair>::iterator J = I->second.begin(),
- JE = I->second.end(); J != JE; ++J)
- if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
- AllConnectedPairs[I->first].push_back(*J);
- AllConnectedPairDeps[*J].push_back(I->first);
- }
- } while (ShouldContinue);
-
- if (AllChosenPairs.empty()) return false;
- NumFusedOps += AllChosenPairs.size();
-
- // A set of pairs has now been selected. It is now necessary to replace the
- // paired instructions with vector instructions. For this procedure each
- // operand must be replaced with a vector operand. This vector is formed
- // by using build_vector on the old operands. The replaced values are then
- // replaced with a vector_extract on the result. Subsequent optimization
- // passes should coalesce the build/extract combinations.
-
- fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
- AllPairConnectionTypes,
- AllConnectedPairs, AllConnectedPairDeps);
-
- // It is important to cleanup here so that future iterations of this
- // function have less work to do.
- (void)SimplifyInstructionsInBlock(&BB, TLI);
- return true;
- }
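A before/after sketch (editor's hypothetical IR) of the fusion procedure
described in the comment above:

  ; Before fusing the chosen pair (%x, %y):
  ;   %x = fadd float %a1, %b1
  ;   %y = fadd float %a2, %b2
  ; After fusion (conceptually):
  ;   %va  = <2 x float> built from %a1 and %a2 (insertelement chain)
  ;   %vb  = <2 x float> built from %b1 and %b2
  ;   %vxy = fadd <2 x float> %va, %vb
  ;   %x.r = extractelement <2 x float> %vxy, i32 0   ; replaces %x
  ;   %y.r = extractelement <2 x float> %vxy, i32 1   ; replaces %y
  ; Subsequent passes are expected to coalesce the insert/extract chains.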
-
- // This function returns true if the provided instruction is capable of being
- // fused into a vector instruction. This determination is based only on the
- // type and other attributes of the instruction.
- bool BBVectorize::isInstVectorizable(Instruction *I,
- bool &IsSimpleLoadStore) {
- IsSimpleLoadStore = false;
-
- if (CallInst *C = dyn_cast<CallInst>(I)) {
- if (!isVectorizableIntrinsic(C))
- return false;
- } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
- // Vectorize simple loads if possible:
- IsSimpleLoadStore = L->isSimple();
- if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
- return false;
- } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
- // Vectorize simple stores if possible:
- IsSimpleLoadStore = S->isSimple();
- if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
- return false;
- } else if (CastInst *C = dyn_cast<CastInst>(I)) {
- // We can vectorize casts, but not casts of pointer types, etc.
- if (!Config.VectorizeCasts)
- return false;
-
- Type *SrcTy = C->getSrcTy();
- if (!SrcTy->isSingleValueType())
- return false;
-
- Type *DestTy = C->getDestTy();
- if (!DestTy->isSingleValueType())
- return false;
- } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- if (!Config.VectorizeSelect)
- return false;
- // We can vectorize a select if either all operands are scalars,
- // or all operands are vectors. Trying to "widen" a select between
- // vectors that has a scalar condition results in a malformed select.
- // FIXME: We could probably be smarter about this by rewriting the select
- // with different types instead.
- return (SI->getCondition()->getType()->isVectorTy() ==
- SI->getTrueValue()->getType()->isVectorTy());
- } else if (isa<CmpInst>(I)) {
- if (!Config.VectorizeCmp)
- return false;
- } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
- if (!Config.VectorizeGEP)
- return false;
-
- // Currently, vector GEPs exist only with one index.
- if (G->getNumIndices() != 1)
- return false;
- } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
- isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
- return false;
- }
-
- Type *T1, *T2;
- getInstructionTypes(I, T1, T2);
-
- // Not every type can be vectorized...
- if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
- !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
- return false;
-
- if (T1->getScalarSizeInBits() == 1) {
- if (!Config.VectorizeBools)
- return false;
- } else {
- if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
- return false;
- }
-
- if (T2->getScalarSizeInBits() == 1) {
- if (!Config.VectorizeBools)
- return false;
- } else {
- if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
- return false;
- }
-
- if (!Config.VectorizeFloats
- && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
- return false;
-
- // Don't vectorize target-specific types.
- if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
- return false;
- if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
- return false;
-
- if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
- T2->getScalarType()->isPointerTy()))
- return false;
-
- if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
- T2->getPrimitiveSizeInBits() >= Config.VectorBits))
- return false;
-
- return true;
- }
-
- // This function returns true if the two provided instructions are compatible
- // (meaning that they can be fused into a vector instruction). This assumes
- // that I has already been determined to be vectorizable and that J is not
- // in the use dag of I.
- bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len,
- int &CostSavings, int &FixedOrder) {
- DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
- " <-> " << *J << "\n");
-
- CostSavings = 0;
- FixedOrder = 0;
-
- // Loads and stores can be merged if they have different alignments,
- // but are otherwise the same.
- if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
- (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
- return false;
-
- Type *IT1, *IT2, *JT1, *JT2;
- getInstructionTypes(I, IT1, IT2);
- getInstructionTypes(J, JT1, JT2);
- unsigned MaxTypeBits = std::max(
- IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
- IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
- if (!TTI && MaxTypeBits > Config.VectorBits)
- return false;
-
- // FIXME: handle addsub-type operations!
-
- if (IsSimpleLoadStore) {
- Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
- int64_t OffsetInElmts = 0;
- if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- IAddressSpace, JAddressSpace, OffsetInElmts) &&
- std::abs(OffsetInElmts) == 1) {
- FixedOrder = (int) OffsetInElmts;
- unsigned BottomAlignment = IAlignment;
- if (OffsetInElmts < 0) BottomAlignment = JAlignment;
-
- Type *aTypeI = isa<StoreInst>(I) ?
- cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
- Type *aTypeJ = isa<StoreInst>(J) ?
- cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
- Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
-
- if (Config.AlignedOnly) {
- // An aligned load or store is possible only if the instruction
- // with the lower offset has an alignment suitable for the
- // vector type.
- const DataLayout &DL = I->getModule()->getDataLayout();
- unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
- if (BottomAlignment < VecAlignment)
- return false;
- }
-
- if (TTI) {
- unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
- IAlignment, IAddressSpace);
- unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
- JAlignment, JAddressSpace);
- unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
- BottomAlignment,
- IAddressSpace);
-
- ICost += TTI->getAddressComputationCost(aTypeI);
- JCost += TTI->getAddressComputationCost(aTypeJ);
- VCost += TTI->getAddressComputationCost(VType);
-
- if (VCost > ICost + JCost)
- return false;
-
- // We don't want to fuse to a type that will be split, even
- // if the two input types will also be split and there is no other
- // associated cost.
- unsigned VParts = TTI->getNumberOfParts(VType);
- if (VParts > 1)
- return false;
- else if (!VParts && VCost == ICost + JCost)
- return false;
-
- CostSavings = ICost + JCost - VCost;
- }
- } else {
- return false;
- }
- } else if (TTI) {
- unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
- unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
- Type *VT1 = getVecTypeForPair(IT1, JT1),
- *VT2 = getVecTypeForPair(IT2, JT2);
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_AnyValue;
-
- // On some targets (for example, X86) the cost of a vector shift may
- // vary depending on whether the second operand is a uniform or
- // non-uniform constant.
- switch (I->getOpcode()) {
- default : break;
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
-
- // If both I and J are scalar shifts by constant, then the
- // merged vector shift count would be either a constant splat value
- // or a non-uniform vector of constants.
- if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
- if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
- Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
- TargetTransformInfo::OK_NonUniformConstantValue;
- } else {
- // Check for a splat of a constant or for a non-uniform vector
- // of constants.
- Value *IOp = I->getOperand(1);
- Value *JOp = J->getOperand(1);
- if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
- (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
- Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
- if (SplatValue != nullptr &&
- SplatValue == cast<Constant>(JOp)->getSplatValue())
- Op2VK = TargetTransformInfo::OK_UniformConstantValue;
- }
- }
- }
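      // (Editor's illustration with hypothetical values: if I is
      // 'shl i32 %x, 2' and J is 'shl i32 %y, 2', the merged shift amount
      // is the splat <2, 2> -> OK_UniformConstantValue; for shifts by 2
      // and 3 it is the non-uniform vector <2, 3> ->
      // OK_NonUniformConstantValue.)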
-
- // Note that this procedure is incorrect for insert and extract element
- // instructions (because combining these often results in a shuffle),
- // but this cost is ignored (because insert and extract element
- // instructions are assigned a zero depth factor and are not really
- // fused in general).
- unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
-
- if (VCost > ICost + JCost)
- return false;
-
- // We don't want to fuse to a type that will be split, even
- // if the two input types will also be split and there is no other
- // associated cost.
- unsigned VParts1 = TTI->getNumberOfParts(VT1),
- VParts2 = TTI->getNumberOfParts(VT2);
- if (VParts1 > 1 || VParts2 > 1)
- return false;
- else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
- return false;
-
- CostSavings = ICost + JCost - VCost;
- }
-
- // The powi, ctlz, and cttz intrinsics are special because only the
- // first argument is vectorized; the second arguments must be equal.
- CallInst *CI = dyn_cast<CallInst>(I);
- Function *FI;
- if (CI && (FI = CI->getCalledFunction())) {
- Intrinsic::ID IID = FI->getIntrinsicID();
- if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) {
- Value *A1I = CI->getArgOperand(1),
- *A1J = cast<CallInst>(J)->getArgOperand(1);
- const SCEV *A1ISCEV = SE->getSCEV(A1I),
- *A1JSCEV = SE->getSCEV(A1J);
- return (A1ISCEV == A1JSCEV);
- }
-
- if (IID && TTI) {
- FastMathFlags FMFCI;
- if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
- FMFCI = FPMOCI->getFastMathFlags();
-
- SmallVector<Type*, 4> Tys;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CI->getArgOperand(i)->getType());
- unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
-
- Tys.clear();
- CallInst *CJ = cast<CallInst>(J);
-
- FastMathFlags FMFCJ;
- if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
- FMFCJ = FPMOCJ->getFastMathFlags();
-
- for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
- Tys.push_back(CJ->getArgOperand(i)->getType());
- unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
-
- Tys.clear();
- assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
- "Intrinsic argument counts differ");
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) && i == 1)
- Tys.push_back(CI->getArgOperand(i)->getType());
- else
- Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
- CJ->getArgOperand(i)->getType()));
- }
-
- FastMathFlags FMFV = FMFCI;
- FMFV &= FMFCJ;
- Type *RetTy = getVecTypeForPair(IT1, JT1);
- unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
-
- if (VCost > ICost + JCost)
- return false;
-
- // We don't want to fuse to a type that will be split, even
- // if the two input types will also be split and there is no other
- // associated cost.
- unsigned RetParts = TTI->getNumberOfParts(RetTy);
- if (RetParts > 1)
- return false;
- else if (!RetParts && VCost == ICost + JCost)
- return false;
-
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- if (!Tys[i]->isVectorTy())
- continue;
-
- unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
- if (NumParts > 1)
- return false;
- else if (!NumParts && VCost == ICost + JCost)
- return false;
- }
-
- CostSavings = ICost + JCost - VCost;
- }
- }
-
- return true;
- }
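A worked example of the cost comparison (editor's sketch; the costs are
made-up illustrative values, not taken from any real TTI):

  // ICost = 1 (scalar fadd), JCost = 1 (scalar fadd),
  // VCost = 1 (<2 x float> fadd).
  // VCost <= ICost + JCost and getNumberOfParts(VT1) == 1, so the pair
  // is compatible, with CostSavings = 1 + 1 - 1 = 1.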
-
- // Figure out whether or not J uses I and update the users and write-set
- // structures associated with I. Specifically, Users represents the set of
- // instructions that depend on I. WriteSet represents the set
- // of memory locations that are dependent on I. If UpdateUsers is true,
- // and J uses I, then Users is updated to contain J and WriteSet is updated
- // to contain any memory locations to which J writes. The function returns
- // true if J uses I. By default, alias analysis is used to determine
- // whether J reads from memory that overlaps with a location in WriteSet.
- // If LoadMoveSet is not null, then it is a previously-computed map
- // where the key is the memory-based user instruction and the value is
- // the instruction to be compared with I. So, if LoadMoveSet is provided,
- // then the alias analysis is not used. This is necessary because this
- // function is called while instructions are being moved during
- // vectorization, and the alias-analysis results are not stable while
- // that is happening.
- bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
- AliasSetTracker &WriteSet, Instruction *I,
- Instruction *J, bool UpdateUsers,
- DenseSet<ValuePair> *LoadMoveSetPairs) {
- bool UsesI = false;
-
- // This instruction may already be marked as a user due, for example, to
- // being a member of a selected pair.
- if (Users.count(J))
- UsesI = true;
-
- if (!UsesI)
- for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
- JU != JE; ++JU) {
- Value *V = *JU;
- if (I == V || Users.count(V)) {
- UsesI = true;
- break;
- }
- }
- if (!UsesI && J->mayReadFromMemory()) {
- if (LoadMoveSetPairs) {
- UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
- } else {
- for (AliasSetTracker::iterator W = WriteSet.begin(),
- WE = WriteSet.end(); W != WE; ++W) {
- if (W->aliasesUnknownInst(J, *AA)) {
- UsesI = true;
- break;
- }
- }
- }
- }
-
- if (UsesI && UpdateUsers) {
- if (J->mayWriteToMemory()) WriteSet.add(J);
- Users.insert(J);
- }
-
- return UsesI;
- }
-
- // This function iterates over all instruction pairs in the provided
- // basic block and collects all candidate pairs for vectorization.
- bool BBVectorize::getCandidatePairs(BasicBlock &BB,
- BasicBlock::iterator &Start,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts, bool NonPow2Len) {
- size_t TotalPairs = 0;
- BasicBlock::iterator E = BB.end();
- if (Start == E) return false;
-
- bool ShouldContinue = false, IAfterStart = false;
- for (BasicBlock::iterator I = Start++; I != E; ++I) {
- if (I == Start) IAfterStart = true;
-
- bool IsSimpleLoadStore;
- if (!isInstVectorizable(&*I, IsSimpleLoadStore))
- continue;
-
- // Look for an instruction with which to pair instruction *I...
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory())
- WriteSet.add(&*I);
-
- bool JAfterStart = IAfterStart;
- BasicBlock::iterator J = std::next(I);
- for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
- if (J == Start)
- JAfterStart = true;
-
- // Determine if J uses I, if so, exit the loop.
- bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
- if (Config.FastDep) {
- // Note: For this heuristic to be effective, independent operations
- // must tend to be intermixed. This is likely to be true for code
- // produced by some kinds of grouped loop unrolling (but not by the
- // generic LLVM pass); otherwise, some kind of reordering pass may be
- // required.
-
- // When using fast dependency analysis,
- // stop searching after first use:
- if (UsesI) break;
- } else {
- if (UsesI) continue;
- }
-
- // J does not use I, and comes before the first use of I, so it can be
- // merged with I if the instructions are compatible.
- int CostSavings, FixedOrder;
- if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
- CostSavings, FixedOrder))
- continue;
-
- // J is a candidate for merging with I.
- if (PairableInsts.empty() ||
- PairableInsts[PairableInsts.size() - 1] != &*I) {
- PairableInsts.push_back(&*I);
- }
-
- CandidatePairs[&*I].push_back(&*J);
- ++TotalPairs;
- if (TTI)
- CandidatePairCostSavings.insert(
- ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
-
- if (FixedOrder == 1)
- FixedOrderPairs.insert(ValuePair(&*I, &*J));
- else if (FixedOrder == -1)
- FixedOrderPairs.insert(ValuePair(&*J, &*I));
-
- // The next call to this function must start after the last instruction
- // selected during this invocation.
- if (JAfterStart) {
- Start = std::next(J);
- IAfterStart = JAfterStart = false;
- }
-
- DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
- << *I << " <-> " << *J << " (cost savings: " <<
- CostSavings << ")\n");
-
- // If we have already found too many pairs, break here and this function
- // will be called again starting after the last instruction selected
- // during this invocation.
- if (PairableInsts.size() >= Config.MaxInsts ||
- TotalPairs >= Config.MaxPairs) {
- ShouldContinue = true;
- break;
- }
- }
-
- if (ShouldContinue)
- break;
- }
-
- DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
- << " instructions with candidate pairs\n");
-
- return ShouldContinue;
- }
-
- // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
- // it looks for pairs such that both members have an input which is an
- // output of PI or PJ.
- void BBVectorize::computePairsConnectedTo(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- ValuePair P) {
- StoreInst *SI, *SJ;
-
- // For each possible pairing for this variable, look at the uses of
- // the first value...
- for (Value::user_iterator I = P.first->user_begin(),
- E = P.first->user_end();
- I != E; ++I) {
- User *UI = *I;
- if (isa<LoadInst>(UI)) {
- // A pair cannot be connected to a load because the load only takes one
- // operand (the address) and it is a scalar even after vectorization.
- continue;
- } else if ((SI = dyn_cast<StoreInst>(UI)) &&
- P.first == SI->getPointerOperand()) {
- // Similarly, a pair cannot be connected to a store through its
- // pointer operand.
- continue;
- }
-
- // For each use of the first variable, look for uses of the second
- // variable...
- for (User *UJ : P.second->users()) {
- if ((SJ = dyn_cast<StoreInst>(UJ)) &&
- P.second == SJ->getPointerOperand())
- continue;
-
- // Look for <I, J>:
- if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
- VPPair VP(P, ValuePair(UI, UJ));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
- }
-
- // Look for <J, I>:
- if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
- VPPair VP(P, ValuePair(UJ, UI));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
- }
- }
-
- if (Config.SplatBreaksChain) continue;
- // Look for cases where just the first value in the pair is used by
- // both members of another pair (splatting).
- for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
- User *UJ = *J;
- if ((SJ = dyn_cast<StoreInst>(UJ)) &&
- P.first == SJ->getPointerOperand())
- continue;
-
- if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
- VPPair VP(P, ValuePair(UI, UJ));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
- }
- }
- }
-
- if (Config.SplatBreaksChain) return;
- // Look for cases where just the second value in the pair is used by
- // both members of another pair (splatting).
- for (Value::user_iterator I = P.second->user_begin(),
- E = P.second->user_end();
- I != E; ++I) {
- User *UI = *I;
- if (isa<LoadInst>(UI))
- continue;
- else if ((SI = dyn_cast<StoreInst>(UI)) &&
- P.second == SI->getPointerOperand())
- continue;
-
- for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
- User *UJ = *J;
- if ((SJ = dyn_cast<StoreInst>(UJ)) &&
- P.second == SJ->getPointerOperand())
- continue;
-
- if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
- VPPair VP(P, ValuePair(UI, UJ));
- ConnectedPairs[VP.first].push_back(VP.second);
- PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
- }
- }
- }
- }
-
- // This function figures out which pairs are connected. Two pairs are
- // connected if some output of the first pair forms an input to both members
- // of the second pair.
- void BBVectorize::computeConnectedPairs(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes) {
- for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
- PE = PairableInsts.end(); PI != PE; ++PI) {
- DenseMap<Value *, std::vector<Value *> >::iterator PP =
- CandidatePairs.find(*PI);
- if (PP == CandidatePairs.end())
- continue;
-
- for (std::vector<Value *>::iterator P = PP->second.begin(),
- E = PP->second.end(); P != E; ++P)
- computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
- PairableInsts, ConnectedPairs,
- PairConnectionTypes, ValuePair(*PI, *P));
- }
-
- DEBUG(size_t TotalPairs = 0;
- for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
- ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
- TotalPairs += I->second.size();
- dbgs() << "BBV: found " << TotalPairs
- << " pair connections.\n");
- }
-
- // This function builds a set of use tuples such that <A, B> is in the set
- // if B is in the use dag of A. If B is in the use dag of A, then B
- // depends on the output of A.
- void BBVectorize::buildDepMap(
- BasicBlock &BB,
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &PairableInstUsers) {
- DenseSet<Value *> IsInPair;
- for (DenseMap<Value *, std::vector<Value *> >::iterator C =
- CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
- IsInPair.insert(C->first);
- IsInPair.insert(C->second.begin(), C->second.end());
- }
-
- // Iterate through the basic block, recording all users of each
- // pairable instruction.
-
- BasicBlock::iterator E = BB.end(), EL =
- BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
- for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
- if (IsInPair.find(&*I) == IsInPair.end())
- continue;
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory())
- WriteSet.add(&*I);
-
- for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
- (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
-
- if (J == EL)
- break;
- }
-
- for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
- U != E; ++U) {
- if (IsInPair.find(*U) == IsInPair.end()) continue;
- PairableInstUsers.insert(ValuePair(&*I, *U));
- }
-
- if (I == EL)
- break;
- }
- }
-
- // Returns true if an input to pair P is an output of pair Q and also an
- // input of pair Q is an output of pair P. If this is the case, then these
- // two pairs cannot be simultaneously fused.
- bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
- DenseSet<VPPair> *PairableInstUserPairSet) {
- // Two pairs are in conflict if they are mutual users of each other.
- bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
- PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
- PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
- PairableInstUsers.count(ValuePair(P.second, Q.second));
- bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
- PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
- PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
- PairableInstUsers.count(ValuePair(Q.second, P.second));
- if (PairableInstUserMap) {
- // FIXME: The expensive part of the cycle check is not so much the cycle
- // check itself but this edge insertion procedure. This needs some
- // profiling and probably a different data structure.
- if (PUsesQ) {
- if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
- (*PairableInstUserMap)[Q].push_back(P);
- }
- if (QUsesP) {
- if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
- (*PairableInstUserMap)[P].push_back(Q);
- }
- }
-
- return (QUsesP && PUsesQ);
- }
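A small illustration of the mutual-use test (editor's sketch with
hypothetical pairs):

  // P = (a, b), Q = (c, d): if c is in the use dag of a (QUsesP) and b
  // is in the use dag of d (PUsesQ), the fused c/d would have to execute
  // both after and before the fused a/b, which is impossible, so
  // pairsConflict(P, Q, ...) returns true.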
-
- // This function walks the use graph of current pairs to see if, starting
- // from P, the walk returns to P.
- bool BBVectorize::pairWillFormCycle(ValuePair P,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<ValuePair> &CurrentPairs) {
- DEBUG(if (DebugCycleCheck)
- dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
- << *P.second << "\n");
- // A lookup table of visited pairs is kept because the PairableInstUserMap
- // contains non-direct associations.
- DenseSet<ValuePair> Visited;
- SmallVector<ValuePair, 32> Q;
- // General depth-first post-order traversal:
- Q.push_back(P);
- do {
- ValuePair QTop = Q.pop_back_val();
- Visited.insert(QTop);
-
- DEBUG(if (DebugCycleCheck)
- dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
- << *QTop.second << "\n");
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
- PairableInstUserMap.find(QTop);
- if (QQ == PairableInstUserMap.end())
- continue;
-
- for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
- CE = QQ->second.end(); C != CE; ++C) {
- if (*C == P) {
- DEBUG(dbgs()
- << "BBV: rejected to prevent non-trivial cycle formation: "
- << QTop.first << " <-> " << C->second << "\n");
- return true;
- }
-
- if (CurrentPairs.count(*C) && !Visited.count(*C))
- Q.push_back(*C);
- }
- } while (!Q.empty());
-
- return false;
- }
-
- // This function builds the initial dag of connected pairs with the
- // pair J at the root.
- void BBVectorize::buildInitialDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
- // Each of these pairs is viewed as the root node of a DAG. The DAG
- // is then walked (depth-first). As this happens, we keep track of
- // the pairs that compose the DAG and the maximum depth of the DAG.
- SmallVector<ValuePairWithDepth, 32> Q;
- // General depth-first post-order traversal:
- Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
- do {
- ValuePairWithDepth QTop = Q.back();
-
- // Push each child onto the queue:
- bool MoreChildren = false;
- size_t MaxChildDepth = QTop.second;
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
- ConnectedPairs.find(QTop.first);
- if (QQ != ConnectedPairs.end())
- for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
- ke = QQ->second.end(); k != ke; ++k) {
- // Make sure that this child pair is still a candidate:
- if (CandidatePairsSet.count(*k)) {
- DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
- if (C == DAG.end()) {
- size_t d = getDepthFactor(k->first);
- Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
- MoreChildren = true;
- } else {
- MaxChildDepth = std::max(MaxChildDepth, C->second);
- }
- }
- }
-
- if (!MoreChildren) {
- // Record the current pair as part of the DAG:
- DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
- Q.pop_back();
- }
- } while (!Q.empty());
- }
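A small example of the depth bookkeeping (editor's sketch): suppose the root
pair J connects to child C1, which connects to child C2, each with depth
factor 1.

  // Traversal: push (J,1); push (C1,2); push (C2,3).
  // C2 has no children -> DAG[C2] = 3.
  // C1's child C2 is in the DAG -> DAG[C1] = max(2, 3) = 3.
  // J's child C1 is in the DAG  -> DAG[J]  = max(1, 3) = 3.
  // DAG.lookup(J) == 3: the depth of the deepest path through J.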
-
- // Given some initial dag, prune it by removing conflicting pairs (pairs
- // that cannot be simultaneously chosen for vectorization).
- void BBVectorize::pruneDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- std::vector<Value *> &PairableInsts,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<ValuePair, size_t> &DAG,
- DenseSet<ValuePair> &PrunedDAG, ValuePair J,
- bool UseCycleCheck) {
- SmallVector<ValuePairWithDepth, 32> Q;
- // General depth-first post-order traversal:
- Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
- do {
- ValuePairWithDepth QTop = Q.pop_back_val();
- PrunedDAG.insert(QTop.first);
-
- // Visit each child, pruning as necessary...
- SmallVector<ValuePairWithDepth, 8> BestChildren;
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
- ConnectedPairs.find(QTop.first);
- if (QQ == ConnectedPairs.end())
- continue;
-
- for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
- KE = QQ->second.end(); K != KE; ++K) {
- DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
- if (C == DAG.end()) continue;
-
- // This child is in the DAG, now we need to make sure it is the
- // best of any conflicting children. There could be multiple
- // conflicting children, so first, determine if we're keeping
- // this child, then delete conflicting children as necessary.
-
- // It is also necessary to guard against pairing-induced
- // dependencies. Consider instructions a .. x .. y .. b
- // such that (a,b) are to be fused and (x,y) are to be fused
- // but a is an input to x and b is an output from y. This
- // means that y cannot be moved after b but x must be moved
- // after b for (a,b) to be fused. In other words, after
- // fusing (a,b) we have y .. a/b .. x where y is an input
- // to a/b and x is an output to a/b: x and y can no longer
- // be legally fused. To prevent this condition, we must
- // make sure that a child pair added to the DAG is not
- // both an input and output of an already-selected pair.
-
- // Pairing-induced dependencies can also form from more complicated
- // cycles. The pair vs. pair conflicts are easy to check, and so
- // that is done explicitly for "fast rejection", and because for
- // child vs. child conflicts, we may prefer the current pair over
- // the already-selected child.
- DenseSet<ValuePair> CurrentPairs;
-
- bool CanAdd = true;
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
- = BestChildren.begin(), E2 = BestChildren.end();
- C2 != E2; ++C2) {
- if (C2->first.first == C->first.first ||
- C2->first.first == C->first.second ||
- C2->first.second == C->first.first ||
- C2->first.second == C->first.second ||
- pairsConflict(C2->first, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- if (C2->second >= C->second) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(C2->first);
- }
- }
- if (!CanAdd) continue;
-
- // Even worse, this child could conflict with another node already
- // selected for the DAG. If that is the case, ignore this child.
- for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
- E2 = PrunedDAG.end(); T != E2; ++T) {
- if (T->first == C->first.first ||
- T->first == C->first.second ||
- T->second == C->first.first ||
- T->second == C->first.second ||
- pairsConflict(*T, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(*T);
- }
- if (!CanAdd) continue;
-
- // And check the queue too...
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
- E2 = Q.end(); C2 != E2; ++C2) {
- if (C2->first.first == C->first.first ||
- C2->first.first == C->first.second ||
- C2->first.second == C->first.first ||
- C2->first.second == C->first.second ||
- pairsConflict(C2->first, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(C2->first);
- }
- if (!CanAdd) continue;
-
- // Last but not least, check for a conflict with any of the
- // already-chosen pairs.
- for (DenseMap<Value *, Value *>::iterator C2 =
- ChosenPairs.begin(), E2 = ChosenPairs.end();
- C2 != E2; ++C2) {
- if (pairsConflict(*C2, C->first, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet
- : nullptr)) {
- CanAdd = false;
- break;
- }
-
- CurrentPairs.insert(*C2);
- }
- if (!CanAdd) continue;
-
- // To check for non-trivial cycles formed by the addition of the
- // current pair we've formed a list of all relevant pairs, now use a
- // graph walk to check for a cycle. We start from the current pair and
- // walk the use dag to see if we again reach the current pair. If we
- // do, then the current pair is rejected.
-
- // FIXME: It may be more efficient to use a topological-ordering
- // algorithm to improve the cycle check. This should be investigated.
- if (UseCycleCheck &&
- pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
- continue;
-
- // This child can be added, but we may have chosen it in preference
- // to an already-selected child. Check for this here, and if a
- // conflict is found, then remove the previously-selected child
- // before adding this one in its place.
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
- = BestChildren.begin(); C2 != BestChildren.end();) {
- if (C2->first.first == C->first.first ||
- C2->first.first == C->first.second ||
- C2->first.second == C->first.first ||
- C2->first.second == C->first.second ||
- pairsConflict(C2->first, C->first, PairableInstUsers))
- C2 = BestChildren.erase(C2);
- else
- ++C2;
- }
-
- BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
- }
-
- for (SmallVectorImpl<ValuePairWithDepth>::iterator C
- = BestChildren.begin(), E2 = BestChildren.end();
- C != E2; ++C) {
- size_t DepthF = getDepthFactor(C->first.first);
- Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
- }
- } while (!Q.empty());
- }
-
- // This function finds the best dag of mutually-compatible connected
- // pairs, given the choice of root pairs as an iterator range.
- void BBVectorize::findBestDAGFor(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
- DenseSet<VPPair> &PairableInstUserPairSet,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
- int &BestEffSize, Value *II, std::vector<Value *>&JJ,
- bool UseCycleCheck) {
- for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
- J != JE; ++J) {
- ValuePair IJ(II, *J);
- if (!CandidatePairsSet.count(IJ))
- continue;
-
- // Before going any further, make sure that this pair does not
- // conflict with any already-selected pairs (see comment below
- // near the DAG pruning for more details).
- DenseSet<ValuePair> ChosenPairSet;
- bool DoesConflict = false;
- for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
- E = ChosenPairs.end(); C != E; ++C) {
- if (pairsConflict(*C, IJ, PairableInstUsers,
- UseCycleCheck ? &PairableInstUserMap : nullptr,
- UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
- DoesConflict = true;
- break;
- }
-
- ChosenPairSet.insert(*C);
- }
- if (DoesConflict) continue;
-
- if (UseCycleCheck &&
- pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
- continue;
-
- DenseMap<ValuePair, size_t> DAG;
- buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
- PairableInsts, ConnectedPairs,
- PairableInstUsers, ChosenPairs, DAG, IJ);
-
- // Because pruning keeps the child with the largest depth, the
- // maximum depth of the pruned DAG is the same as in the unpruned DAG.
- size_t MaxDepth = DAG.lookup(IJ);
-
- DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
- << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
- MaxDepth << " and size " << DAG.size() << "\n");
-
- // At this point the DAG has been constructed but may contain
- // contradictory children (meaning that different children of
- // some dag node may be attempting to fuse the same instruction).
- // So now we walk the dag again and, in the case of a conflict,
- // keep only the child with the largest depth. To break a tie,
- // favor the first child.
-
- DenseSet<ValuePair> PrunedDAG;
- pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
- PairableInstUsers, PairableInstUserMap,
- PairableInstUserPairSet,
- ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
-
- int EffSize = 0;
- if (TTI) {
- DenseSet<Value *> PrunedDAGInstrs;
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
- E = PrunedDAG.end(); S != E; ++S) {
- PrunedDAGInstrs.insert(S->first);
- PrunedDAGInstrs.insert(S->second);
- }
-
- // The set of pairs that have already contributed to the total cost.
- DenseSet<ValuePair> IncomingPairs;
-
- // If the cost model were perfect, this might not be necessary; but we
- // need to make sure that we don't get stuck vectorizing our own
- // shuffle chains.
- bool HasNontrivialInsts = false;
-
- // The node weights represent the cost savings associated with
- // fusing the pair of instructions.
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
- E = PrunedDAG.end(); S != E; ++S) {
- if (!isa<ShuffleVectorInst>(S->first) &&
- !isa<InsertElementInst>(S->first) &&
- !isa<ExtractElementInst>(S->first))
- HasNontrivialInsts = true;
-
- bool FlipOrder = false;
-
- if (getDepthFactor(S->first)) {
- int ESContrib = CandidatePairCostSavings.find(*S)->second;
- DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
- << *S->first << " <-> " << *S->second << "} = " <<
- ESContrib << "\n");
- EffSize += ESContrib;
- }
-
- // The edge weights contribute in a negative sense: they represent
- // the cost of shuffles.
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
- ConnectedPairDeps.find(*S);
- if (SS != ConnectedPairDeps.end()) {
- unsigned NumDepsDirect = 0, NumDepsSwap = 0;
- for (std::vector<ValuePair>::iterator T = SS->second.begin(),
- TE = SS->second.end(); T != TE; ++T) {
- VPPair Q(*S, *T);
- if (!PrunedDAG.count(Q.second))
- continue;
- DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q.second, Q.first));
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- if (R->second == PairConnectionDirect)
- ++NumDepsDirect;
- else if (R->second == PairConnectionSwap)
- ++NumDepsSwap;
- }
-
- // If there are more swaps than direct connections, then the pair
- // order will be flipped during fusion, so the effective number of
- // swaps is the smaller of the two counts.
- FlipOrder = !FixedOrderPairs.count(*S) &&
- ((NumDepsSwap > NumDepsDirect) ||
- FixedOrderPairs.count(ValuePair(S->second, S->first)));
-
- for (std::vector<ValuePair>::iterator T = SS->second.begin(),
- TE = SS->second.end(); T != TE; ++T) {
- VPPair Q(*S, *T);
- if (!PrunedDAG.count(Q.second))
- continue;
- DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q.second, Q.first));
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- Type *Ty1 = Q.second.first->getType(),
- *Ty2 = Q.second.second->getType();
- Type *VTy = getVecTypeForPair(Ty1, Ty2);
- if ((R->second == PairConnectionDirect && FlipOrder) ||
- (R->second == PairConnectionSwap && !FlipOrder) ||
- R->second == PairConnectionSplat) {
- int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- VTy, VTy);
-
- if (VTy->getVectorNumElements() == 2) {
- if (R->second == PairConnectionSplat)
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_Broadcast, VTy));
- else
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_Reverse, VTy));
- }
-
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
- *Q.second.first << " <-> " << *Q.second.second <<
- "} -> {" <<
- *S->first << " <-> " << *S->second << "} = " <<
- ESContrib << "\n");
- EffSize -= ESContrib;
- }
- }
- }
-
- // Compute the cost of outgoing edges. We assume that edges outgoing
- // to shuffles, inserts or extracts can be merged, and so contribute
- // no additional cost.
- if (!S->first->getType()->isVoidTy()) {
- Type *Ty1 = S->first->getType(),
- *Ty2 = S->second->getType();
- Type *VTy = getVecTypeForPair(Ty1, Ty2);
-
- bool NeedsExtraction = false;
- for (User *U : S->first->users()) {
- if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
- // Shuffle can be folded if it has no other input
- if (isa<UndefValue>(SI->getOperand(1)))
- continue;
- }
- if (isa<ExtractElementInst>(U))
- continue;
- if (PrunedDAGInstrs.count(U))
- continue;
- NeedsExtraction = true;
- break;
- }
-
- if (NeedsExtraction) {
- int ESContrib;
- if (Ty1->isVectorTy()) {
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- Ty1, VTy);
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
- } else
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::ExtractElement, VTy, 0);
-
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
- *S->first << "} = " << ESContrib << "\n");
- EffSize -= ESContrib;
- }
-
- NeedsExtraction = false;
- for (User *U : S->second->users()) {
- if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
- // Shuffle can be folded if it has no other input
- if (isa<UndefValue>(SI->getOperand(1)))
- continue;
- }
- if (isa<ExtractElementInst>(U))
- continue;
- if (PrunedDAGInstrs.count(U))
- continue;
- NeedsExtraction = true;
- break;
- }
-
- if (NeedsExtraction) {
- int ESContrib;
- if (Ty2->isVectorTy()) {
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- Ty2, VTy);
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_ExtractSubvector, VTy,
- Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
- } else
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::ExtractElement, VTy, 1);
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
- *S->second << "} = " << ESContrib << "\n");
- EffSize -= ESContrib;
- }
- }
-
- // Compute the cost of incoming edges.
- if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
- Instruction *S1 = cast<Instruction>(S->first),
- *S2 = cast<Instruction>(S->second);
- for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
- Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
-
- // Combining constants into vector constants (or small vector
- // constants into larger ones) is assumed to be free.
- if (isa<Constant>(O1) && isa<Constant>(O2))
- continue;
-
- if (FlipOrder)
- std::swap(O1, O2);
-
- ValuePair VP = ValuePair(O1, O2);
- ValuePair VPR = ValuePair(O2, O1);
-
- // Internal edges are not handled here.
- if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
- continue;
-
- Type *Ty1 = O1->getType(),
- *Ty2 = O2->getType();
- Type *VTy = getVecTypeForPair(Ty1, Ty2);
-
- // Combining vector operands of the same type is also assumed to be
- // foldable into other operations.
- if (Ty1 == Ty2) {
- // If both are insert elements, then both can be widened.
- InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
- *IEO2 = dyn_cast<InsertElementInst>(O2);
- if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
- continue;
- // If both are extract elements, and both have the same input
- // type, then they can be replaced with a shuffle
- ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
- *EIO2 = dyn_cast<ExtractElementInst>(O2);
- if (EIO1 && EIO2 &&
- EIO1->getOperand(0)->getType() ==
- EIO2->getOperand(0)->getType())
- continue;
- // If both are shuffles with equal operand types and only two
- // unique operands, then they can be replaced with a single
- // shuffle.
- ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
- *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
- if (SIO1 && SIO2 &&
- SIO1->getOperand(0)->getType() ==
- SIO2->getOperand(0)->getType()) {
- SmallSet<Value *, 4> SIOps;
- SIOps.insert(SIO1->getOperand(0));
- SIOps.insert(SIO1->getOperand(1));
- SIOps.insert(SIO2->getOperand(0));
- SIOps.insert(SIO2->getOperand(1));
- if (SIOps.size() <= 2)
- continue;
- }
- }
-
- int ESContrib;
- // This pair has already been formed.
- if (IncomingPairs.count(VP)) {
- continue;
- } else if (IncomingPairs.count(VPR)) {
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- VTy, VTy);
-
- if (VTy->getVectorNumElements() == 2)
- ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
- TargetTransformInfo::SK_Reverse, VTy));
- } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, VTy, 0);
- ESContrib += (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, VTy, 1);
- } else if (!Ty1->isVectorTy()) {
- // O1 needs to be inserted into a vector of size O2, and then
- // both need to be shuffled together.
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, Ty2, 0);
- ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
- VTy, Ty2);
- } else if (!Ty2->isVectorTy()) {
- // O2 needs to be inserted into a vector of size O1, and then
- // both need to be shuffled together.
- ESContrib = (int) TTI->getVectorInstrCost(
- Instruction::InsertElement, Ty1, 0);
- ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
- VTy, Ty1);
- } else {
- Type *TyBig = Ty1, *TySmall = Ty2;
- if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
- std::swap(TyBig, TySmall);
-
- ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
- VTy, TyBig);
- if (TyBig != TySmall)
- ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
- TyBig, TySmall);
- }
-
- DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
- << *O1 << " <-> " << *O2 << "} = " <<
- ESContrib << "\n");
- EffSize -= ESContrib;
- IncomingPairs.insert(VP);
- }
- }
- }
-
- if (!HasNontrivialInsts) {
- DEBUG(if (DebugPairSelection) dbgs() <<
- "\tNo non-trivial instructions in DAG;"
- " override to zero effective size\n");
- EffSize = 0;
- }
- } else {
- for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
- E = PrunedDAG.end(); S != E; ++S)
- EffSize += (int) getDepthFactor(S->first);
- }
-
- DEBUG(if (DebugPairSelection)
- dbgs() << "BBV: found pruned DAG for pair {"
- << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
- MaxDepth << " and size " << PrunedDAG.size() <<
- " (effective size: " << EffSize << ")\n");
- if (((TTI && !UseChainDepthWithTI) ||
- MaxDepth >= Config.ReqChainDepth) &&
- EffSize > 0 && EffSize > BestEffSize) {
- BestMaxDepth = MaxDepth;
- BestEffSize = EffSize;
- BestDAG = PrunedDAG;
- }
- }
- }
-
- // Given the list of candidate pairs, this function selects those
- // that will be fused into vector instructions.
- void BBVectorize::choosePairs(
- DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
- DenseSet<ValuePair> &CandidatePairsSet,
- DenseMap<ValuePair, int> &CandidatePairCostSavings,
- std::vector<Value *> &PairableInsts,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
- DenseSet<ValuePair> &PairableInstUsers,
- DenseMap<Value *, Value *>& ChosenPairs) {
- bool UseCycleCheck =
- CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
-
- DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
- for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
- E = CandidatePairsSet.end(); I != E; ++I) {
- std::vector<Value *> &JJ = CandidatePairs2[I->second];
- if (JJ.empty()) JJ.reserve(32);
- JJ.push_back(I->first);
- }
-
- DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
- DenseSet<VPPair> PairableInstUserPairSet;
- for (std::vector<Value *>::iterator I = PairableInsts.begin(),
- E = PairableInsts.end(); I != E; ++I) {
- // The number of possible pairings for this variable:
- size_t NumChoices = CandidatePairs.lookup(*I).size();
- if (!NumChoices) continue;
-
- std::vector<Value *> &JJ = CandidatePairs[*I];
-
- // The best pair to choose and its dag:
- size_t BestMaxDepth = 0;
- int BestEffSize = 0;
- DenseSet<ValuePair> BestDAG;
- findBestDAGFor(CandidatePairs, CandidatePairsSet,
- CandidatePairCostSavings,
- PairableInsts, FixedOrderPairs, PairConnectionTypes,
- ConnectedPairs, ConnectedPairDeps,
- PairableInstUsers, PairableInstUserMap,
- PairableInstUserPairSet, ChosenPairs,
- BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
- UseCycleCheck);
-
- if (BestDAG.empty())
- continue;
-
-      // A dag has been chosen at this point; had no dag been chosen, this
-      // instruction, I, could not have been paired (and would no longer be
-      // considered).
-
- DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
- << *cast<Instruction>(*I) << "\n");
-
- for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
- SE2 = BestDAG.end(); S != SE2; ++S) {
- // Insert the members of this dag into the list of chosen pairs.
- ChosenPairs.insert(ValuePair(S->first, S->second));
- DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
- *S->second << "\n");
-
- // Remove all candidate pairs that have values in the chosen dag.
- std::vector<Value *> &KK = CandidatePairs[S->first];
- for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
- K != KE; ++K) {
- if (*K == S->second)
- continue;
-
- CandidatePairsSet.erase(ValuePair(S->first, *K));
- }
-
- std::vector<Value *> &LL = CandidatePairs2[S->second];
- for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
- L != LE; ++L) {
- if (*L == S->first)
- continue;
-
- CandidatePairsSet.erase(ValuePair(*L, S->second));
- }
-
- std::vector<Value *> &MM = CandidatePairs[S->second];
- for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
- M != ME; ++M) {
- assert(*M != S->first && "Flipped pair in candidate list?");
- CandidatePairsSet.erase(ValuePair(S->second, *M));
- }
-
- std::vector<Value *> &NN = CandidatePairs2[S->first];
- for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
- N != NE; ++N) {
- assert(*N != S->second && "Flipped pair in candidate list?");
- CandidatePairsSet.erase(ValuePair(*N, S->first));
- }
- }
- }
-
- DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
- }
-
- std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
- unsigned n = 0) {
- if (!I->hasName())
- return "";
-
- return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
- (n > 0 ? "." + utostr(n) : "")).str();
- }
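// Illustrative examples (not in the original source): assuming an
// instruction I named "sum", the helper above produces names such as:
//   getReplacementName(I, /*IsInput=*/true, 2, 3)   -> "sum.v.i2.3"
//   getReplacementName(I, /*IsInput=*/false, 1)     -> "sum.v.r1"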
-
- // Returns the value that is to be used as the pointer input to the vector
- // instruction that fuses I with J.
- Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
- Instruction *I, Instruction *J, unsigned o) {
- Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
- int64_t OffsetInElmts;
-
-    // Note: the analysis might fail here; that is why the pair order has
-    // been precomputed (OffsetInElmts must be unused here).
- (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- IAddressSpace, JAddressSpace,
- OffsetInElmts, false);
-
- // The pointer value is taken to be the one with the lowest offset.
- Value *VPtr = IPtr;
-
- Type *ArgTypeI = IPtr->getType()->getPointerElementType();
- Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
- Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
- Type *VArgPtrType
- = PointerType::get(VArgType,
- IPtr->getType()->getPointerAddressSpace());
- return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
- /* insert before */ I);
- }
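// Worked example (assumed IR, for illustration only): when fusing two
// adjacent scalar float loads, the lower-offset pointer is reinterpreted
// as a pointer to the fused vector type:
//   %a = load float, float* %p
//   %b = load float, float* %q            ; one element past %p
// so the pointer operand produced here is:
//   %a.v.i0 = bitcast float* %p to <2 x float>*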
-
- void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
- unsigned MaskOffset, unsigned NumInElem,
- unsigned NumInElem1, unsigned IdxOffset,
- std::vector<Constant*> &Mask) {
- unsigned NumElem1 = J->getType()->getVectorNumElements();
- for (unsigned v = 0; v < NumElem1; ++v) {
- int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
- if (m < 0) {
- Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
- } else {
- unsigned mm = m + (int) IdxOffset;
- if (m >= (int) NumInElem1)
- mm += (int) NumInElem;
-
- Mask[v+MaskOffset] =
- ConstantInt::get(Type::getInt32Ty(Context), mm);
- }
- }
- }
-
- // Returns the value that is to be used as the vector-shuffle mask to the
- // vector instruction that fuses I with J.
- Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
- Instruction *I, Instruction *J) {
- // This is the shuffle mask. We need to append the second
- // mask to the first, and the numbers need to be adjusted.
-
- Type *ArgTypeI = I->getType();
- Type *ArgTypeJ = J->getType();
- Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
- unsigned NumElemI = ArgTypeI->getVectorNumElements();
-
- // Get the total number of elements in the fused vector type.
- // By definition, this must equal the number of elements in
- // the final mask.
- unsigned NumElem = VArgType->getVectorNumElements();
- std::vector<Constant*> Mask(NumElem);
-
- Type *OpTypeI = I->getOperand(0)->getType();
- unsigned NumInElemI = OpTypeI->getVectorNumElements();
- Type *OpTypeJ = J->getOperand(0)->getType();
- unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
-
- // The fused vector will be:
- // -----------------------------------------------------
- // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
- // -----------------------------------------------------
-    // from which we'll extract NumElem total elements (the first NumElemI
-    // of them come from the mask in I and the remainder come from the
-    // mask in J).
-
- // For the mask from the first pair...
- fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
- 0, Mask);
-
- // For the mask from the second pair...
- fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
- NumInElemI, Mask);
-
- return ConstantVector::get(Mask);
- }
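// Worked example of the mask arithmetic (assumed <2 x ...> types): fusing
//   I = shufflevector %a, %c, <0, 3> and J = shufflevector %b, %d, <1, 2>
// with the fused operand layout [ %a | %b | %c | %d ]:
//   I's entry 0 -> fused index 0               (element 0 of %a)
//   I's entry 3 -> fused index 3 + 2 = 5       (element 1 of %c)
//   J's entry 1 -> fused index 1 + 2 = 3       (element 1 of %b)
//   J's entry 2 -> fused index 2 + 2 + 2 = 6   (element 0 of %d)
// giving the final fused mask <0, 5, 3, 6>.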
-
- bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, Value *&LOp,
- unsigned numElemL,
- Type *ArgTypeL, Type *ArgTypeH,
- bool IBeforeJ, unsigned IdxOff) {
- bool ExpandedIEChain = false;
- if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
- // If we have a pure insertelement chain, then this can be rewritten
- // into a chain that directly builds the larger type.
- if (isPureIEChain(LIE)) {
- SmallVector<Value *, 8> VectElemts(numElemL,
- UndefValue::get(ArgTypeL->getScalarType()));
- InsertElementInst *LIENext = LIE;
- do {
- unsigned Idx =
- cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
- VectElemts[Idx] = LIENext->getOperand(1);
- } while ((LIENext =
- dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
-
- LIENext = nullptr;
- Value *LIEPrev = UndefValue::get(ArgTypeH);
- for (unsigned i = 0; i < numElemL; ++i) {
- if (isa<UndefValue>(VectElemts[i])) continue;
- LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
- ConstantInt::get(Type::getInt32Ty(Context),
- i + IdxOff),
- getReplacementName(IBeforeJ ? I : J,
- true, o, i+1));
- LIENext->insertBefore(IBeforeJ ? J : I);
- LIEPrev = LIENext;
- }
-
- LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
- ExpandedIEChain = true;
- }
- }
-
- return ExpandedIEChain;
- }
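// isPureIEChain is defined elsewhere in this file; roughly, it checks that
// the chain consists only of insertelements and bottoms out in undef. A
// minimal standalone sketch under that assumption:
static bool isPureIEChainSketch(InsertElementInst *IE) {
  // Walk operand 0 of each insertelement; every link must be another
  // insertelement, and the chain must terminate in an undef value.
  InsertElementInst *IENext = IE;
  do {
    if (!isa<UndefValue>(IENext->getOperand(0)) &&
        !isa<InsertElementInst>(IENext->getOperand(0)))
      return false;
  } while ((IENext = dyn_cast<InsertElementInst>(IENext->getOperand(0))));
  return true;
}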
-
- static unsigned getNumScalarElements(Type *Ty) {
- if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
- return VecTy->getNumElements();
- return 1;
- }
-
- // Returns the value to be used as the specified operand of the vector
- // instruction that fuses I with J.
- Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool IBeforeJ) {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
-
- // Compute the fused vector type for this operand
- Type *ArgTypeI = I->getOperand(o)->getType();
- Type *ArgTypeJ = J->getOperand(o)->getType();
- VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
- Instruction *L = I, *H = J;
- Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
-
- unsigned numElemL = getNumScalarElements(ArgTypeL);
- unsigned numElemH = getNumScalarElements(ArgTypeH);
-
- Value *LOp = L->getOperand(o);
- Value *HOp = H->getOperand(o);
- unsigned numElem = VArgType->getNumElements();
-
- // First, we check if we can reuse the "original" vector outputs (if these
- // exist). We might need a shuffle.
- ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
- ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
- ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
- ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
-
- // FIXME: If we're fusing shuffle instructions, then we can't apply this
- // optimization. The input vectors to the shuffle might be a different
- // length from the shuffle outputs. Unfortunately, the replacement
- // shuffle mask has already been formed, and the mask entries are sensitive
- // to the sizes of the inputs.
- bool IsSizeChangeShuffle =
- isa<ShuffleVectorInst>(L) &&
- (LOp->getType() != L->getType() || HOp->getType() != H->getType());
-
- if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
- // We can have at most two unique vector inputs.
- bool CanUseInputs = true;
- Value *I1, *I2 = nullptr;
- if (LEE) {
- I1 = LEE->getOperand(0);
- } else {
- I1 = LSV->getOperand(0);
- I2 = LSV->getOperand(1);
- if (I2 == I1 || isa<UndefValue>(I2))
- I2 = nullptr;
- }
-
- if (HEE) {
- Value *I3 = HEE->getOperand(0);
- if (!I2 && I3 != I1)
- I2 = I3;
- else if (I3 != I1 && I3 != I2)
- CanUseInputs = false;
- } else {
- Value *I3 = HSV->getOperand(0);
- if (!I2 && I3 != I1)
- I2 = I3;
- else if (I3 != I1 && I3 != I2)
- CanUseInputs = false;
-
- if (CanUseInputs) {
- Value *I4 = HSV->getOperand(1);
- if (!isa<UndefValue>(I4)) {
- if (!I2 && I4 != I1)
- I2 = I4;
- else if (I4 != I1 && I4 != I2)
- CanUseInputs = false;
- }
- }
- }
-
- if (CanUseInputs) {
- unsigned LOpElem =
- cast<Instruction>(LOp)->getOperand(0)->getType()
- ->getVectorNumElements();
-
- unsigned HOpElem =
- cast<Instruction>(HOp)->getOperand(0)->getType()
- ->getVectorNumElements();
-
- // We have one or two input vectors. We need to map each index of the
- // operands to the index of the original vector.
- SmallVector<std::pair<int, int>, 8> II(numElem);
- for (unsigned i = 0; i < numElemL; ++i) {
- int Idx, INum;
- if (LEE) {
- Idx =
- cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
- INum = LEE->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx = LSV->getMaskValue(i);
- if (Idx < (int) LOpElem) {
- INum = LSV->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx -= LOpElem;
- INum = LSV->getOperand(1) == I1 ? 0 : 1;
- }
- }
-
- II[i] = std::pair<int, int>(Idx, INum);
- }
- for (unsigned i = 0; i < numElemH; ++i) {
- int Idx, INum;
- if (HEE) {
- Idx =
- cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
- INum = HEE->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx = HSV->getMaskValue(i);
- if (Idx < (int) HOpElem) {
- INum = HSV->getOperand(0) == I1 ? 0 : 1;
- } else {
- Idx -= HOpElem;
- INum = HSV->getOperand(1) == I1 ? 0 : 1;
- }
- }
-
- II[i + numElemL] = std::pair<int, int>(Idx, INum);
- }
-
- // We now have an array which tells us from which index of which
- // input vector each element of the operand comes.
- VectorType *I1T = cast<VectorType>(I1->getType());
- unsigned I1Elem = I1T->getNumElements();
-
- if (!I2) {
- // In this case there is only one underlying vector input. Check for
- // the trivial case where we can use the input directly.
- if (I1Elem == numElem) {
- bool ElemInOrder = true;
- for (unsigned i = 0; i < numElem; ++i) {
- if (II[i].first != (int) i && II[i].first != -1) {
- ElemInOrder = false;
- break;
- }
- }
-
- if (ElemInOrder)
- return I1;
- }
-
- // A shuffle is needed.
- std::vector<Constant *> Mask(numElem);
- for (unsigned i = 0; i < numElem; ++i) {
- int Idx = II[i].first;
- if (Idx == -1)
- Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
- else
- Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
- }
-
- Instruction *S =
- new ShuffleVectorInst(I1, UndefValue::get(I1T),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o));
- S->insertBefore(IBeforeJ ? J : I);
- return S;
- }
-
- VectorType *I2T = cast<VectorType>(I2->getType());
- unsigned I2Elem = I2T->getNumElements();
-
- // This input comes from two distinct vectors. The first step is to
- // make sure that both vectors are the same length. If not, the
- // smaller one will need to grow before they can be shuffled together.
- if (I1Elem < I2Elem) {
- std::vector<Constant *> Mask(I2Elem);
- unsigned v = 0;
- for (; v < I1Elem; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < I2Elem; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- Instruction *NewI1 =
- new ShuffleVectorInst(I1, UndefValue::get(I1T),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- NewI1->insertBefore(IBeforeJ ? J : I);
- I1 = NewI1;
- I1Elem = I2Elem;
- } else if (I1Elem > I2Elem) {
- std::vector<Constant *> Mask(I1Elem);
- unsigned v = 0;
- for (; v < I2Elem; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < I1Elem; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- Instruction *NewI2 =
- new ShuffleVectorInst(I2, UndefValue::get(I2T),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- NewI2->insertBefore(IBeforeJ ? J : I);
- I2 = NewI2;
- }
-
- // Now that both I1 and I2 are the same length we can shuffle them
- // together (and use the result).
- std::vector<Constant *> Mask(numElem);
- for (unsigned v = 0; v < numElem; ++v) {
- if (II[v].first == -1) {
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
- } else {
- int Idx = II[v].first + II[v].second * I1Elem;
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
- }
- }
-
- Instruction *NewOp =
- new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J, true, o));
- NewOp->insertBefore(IBeforeJ ? J : I);
- return NewOp;
- }
- }
-
- Type *ArgType = ArgTypeL;
- if (numElemL < numElemH) {
- if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
- ArgTypeL, VArgType, IBeforeJ, 1)) {
- // This is another short-circuit case: we're combining a scalar into
- // a vector that is formed by an IE chain. We've just expanded the IE
- // chain, now insert the scalar and we're done.
-
- Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
- getReplacementName(IBeforeJ ? I : J, true, o));
- S->insertBefore(IBeforeJ ? J : I);
- return S;
- } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
- ArgTypeH, IBeforeJ)) {
- // The two vector inputs to the shuffle must be the same length,
- // so extend the smaller vector to be the same length as the larger one.
- Instruction *NLOp;
- if (numElemL > 1) {
-
- std::vector<Constant *> Mask(numElemH);
- unsigned v = 0;
- for (; v < numElemL; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < numElemH; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- } else {
- NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- }
-
- NLOp->insertBefore(IBeforeJ ? J : I);
- LOp = NLOp;
- }
-
- ArgType = ArgTypeH;
- } else if (numElemL > numElemH) {
- if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
- ArgTypeH, VArgType, IBeforeJ)) {
- Instruction *S =
- InsertElementInst::Create(LOp, HOp,
- ConstantInt::get(Type::getInt32Ty(Context),
- numElemL),
- getReplacementName(IBeforeJ ? I : J,
- true, o));
- S->insertBefore(IBeforeJ ? J : I);
- return S;
- } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
- ArgTypeL, IBeforeJ)) {
- Instruction *NHOp;
- if (numElemH > 1) {
- std::vector<Constant *> Mask(numElemL);
- unsigned v = 0;
- for (; v < numElemH; ++v)
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- for (; v < numElemL; ++v)
- Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
- NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- } else {
- NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- }
-
- NHOp->insertBefore(IBeforeJ ? J : I);
- HOp = NHOp;
- }
- }
-
- if (ArgType->isVectorTy()) {
- unsigned numElem = VArgType->getVectorNumElements();
- std::vector<Constant*> Mask(numElem);
- for (unsigned v = 0; v < numElem; ++v) {
- unsigned Idx = v;
- // If the low vector was expanded, we need to skip the extra
- // undefined entries.
- if (v >= numElemL && numElemH > numElemL)
- Idx += (numElemH - numElemL);
- Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
- }
-
- Instruction *BV = new ShuffleVectorInst(LOp, HOp,
- ConstantVector::get(Mask),
- getReplacementName(IBeforeJ ? I : J, true, o));
- BV->insertBefore(IBeforeJ ? J : I);
- return BV;
- }
-
- Instruction *BV1 = InsertElementInst::Create(
- UndefValue::get(VArgType), LOp, CV0,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 1));
- BV1->insertBefore(IBeforeJ ? J : I);
- Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
- getReplacementName(IBeforeJ ? I : J,
- true, o, 2));
- BV2->insertBefore(IBeforeJ ? J : I);
- return BV2;
- }
-
- // This function creates an array of values that will be used as the inputs
- // to the vector instruction that fuses I with J.
- void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
- Instruction *I, Instruction *J,
- SmallVectorImpl<Value *> &ReplacedOperands,
- bool IBeforeJ) {
- unsigned NumOperands = I->getNumOperands();
-
- for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
- // Iterate backward so that we look at the store pointer
- // first and know whether or not we need to flip the inputs.
-
- if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
- // This is the pointer for a load/store instruction.
- ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
- continue;
- } else if (isa<CallInst>(I)) {
- Function *F = cast<CallInst>(I)->getCalledFunction();
- Intrinsic::ID IID = F->getIntrinsicID();
- if (o == NumOperands-1) {
- BasicBlock &BB = *I->getParent();
-
- Module *M = BB.getParent()->getParent();
- Type *ArgTypeI = I->getType();
- Type *ArgTypeJ = J->getType();
- Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
- ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
- continue;
- } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
- IID == Intrinsic::cttz) && o == 1) {
- // The second argument of powi/ctlz/cttz is a single integer/constant
- // and we've already checked that both arguments are equal.
- // As a result, we just keep I's second argument.
- ReplacedOperands[o] = I->getOperand(o);
- continue;
- }
- } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
- ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
- continue;
- }
-
- ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
- }
- }
-
- // This function creates two values that represent the outputs of the
- // original I and J instructions. These are generally vector shuffles
- // or extracts. In many cases, these will end up being unused and, thus,
- // eliminated by later passes.
- void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
- Instruction *J, Instruction *K,
- Instruction *&InsertionPt,
- Instruction *&K1, Instruction *&K2) {
- if (isa<StoreInst>(I))
- return;
-
- Type *IType = I->getType();
- Type *JType = J->getType();
-
- VectorType *VType = getVecTypeForPair(IType, JType);
- unsigned numElem = VType->getNumElements();
-
- unsigned numElemI = getNumScalarElements(IType);
- unsigned numElemJ = getNumScalarElements(JType);
-
- if (IType->isVectorTy()) {
- std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
- for (unsigned v = 0; v < numElemI; ++v) {
- Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
- }
-
- K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(Mask1),
- getReplacementName(K, false, 1));
- } else {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
- }
-
- if (JType->isVectorTy()) {
- std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
- for (unsigned v = 0; v < numElemJ; ++v) {
- Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
- Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
- }
-
- K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(Mask2),
- getReplacementName(K, false, 2));
- } else {
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
- K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
- }
-
- K1->insertAfter(K);
- K2->insertAfter(K1);
- InsertionPt = K2;
- }
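// In the simplest case (two fused scalar instructions; assumed IR for
// illustration), the replacement outputs are plain extracts:
//   %k  = add <2 x i32> %va, %vb                 ; the fused instruction K
//   %k1 = extractelement <2 x i32> %k, i32 0     ; stands in for L
//   %k2 = extractelement <2 x i32> %k, i32 1     ; stands in for H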
-
-  // Check whether all uses of the instruction I (including pairing-induced
-  // uses) can be moved after J.
- bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I, Instruction *J) {
- // Skip to the first instruction past I.
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
-
- for (; cast<Instruction>(L) != J; ++L)
- (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
-
- assert(cast<Instruction>(L) == J &&
- "Tracking has not proceeded far enough to check for dependencies");
- // If J is now in the use set of I, then trackUsesOfI will return true
- // and we have a dependency cycle (and the fusing operation must abort).
- return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
- }
-
-  // Move all uses of the instruction I (including pairing-induced uses) after J.
- void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *&InsertionPt,
- Instruction *I, Instruction *J) {
- // Skip to the first instruction past I.
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
-
- for (; cast<Instruction>(L) != J;) {
- if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
- // Move this instruction
- Instruction *InstToMove = &*L++;
-
- DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
- " to after " << *InsertionPt << "\n");
- InstToMove->removeFromParent();
- InstToMove->insertAfter(InsertionPt);
- InsertionPt = InstToMove;
- } else {
- ++L;
- }
- }
- }
-
-  // Collect all load instructions that are in the move set of a given first
- // pair member. These loads depend on the first instruction, I, and so need
- // to be moved after J (the second instruction) when the pair is fused.
- void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs,
- Instruction *I) {
- // Skip to the first instruction past I.
- BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
- DenseSet<Value *> Users;
- AliasSetTracker WriteSet(*AA);
- if (I->mayWriteToMemory()) WriteSet.add(I);
-
- // Note: We cannot end the loop when we reach J because J could be moved
- // farther down the use chain by another instruction pairing. Also, J
- // could be before I if this is an inverted input.
- for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
- if (trackUsesOfI(Users, WriteSet, I, &*L)) {
- if (L->mayReadFromMemory()) {
- LoadMoveSet[&*L].push_back(I);
- LoadMoveSetPairs.insert(ValuePair(&*L, I));
- }
- }
- }
- }
-
-  // In cases where both loads/stores and the computation of their pointers
- // are chosen for vectorization, we can end up in a situation where the
- // aliasing analysis starts returning different query results as the
- // process of fusing instruction pairs continues. Because the algorithm
- // relies on finding the same use dags here as were found earlier, we'll
- // need to precompute the necessary aliasing information here and then
- // manually update it during the fusion process.
- void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
- DenseSet<ValuePair> &LoadMoveSetPairs) {
- for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
- PIE = PairableInsts.end(); PI != PIE; ++PI) {
- DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
- if (P == ChosenPairs.end()) continue;
-
- Instruction *I = cast<Instruction>(P->first);
- collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
- LoadMoveSetPairs, I);
- }
- }
-
- // This function fuses the chosen instruction pairs into vector instructions,
-  // taking care to preserve any needed scalar outputs, and then reorders the
-  // remaining instructions as needed (users of the first member of the pair
-  // need to be moved after the location of the second member of the pair
-  // because the vector instruction is inserted at the location of the pair's
-  // second member).
- void BBVectorize::fuseChosenPairs(BasicBlock &BB,
- std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<ValuePair> &FixedOrderPairs,
- DenseMap<VPPair, unsigned> &PairConnectionTypes,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
- DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
- LLVMContext& Context = BB.getContext();
-
- // During the vectorization process, the order of the pairs to be fused
- // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
- // list. After a pair is fused, the flipped pair is removed from the list.
- DenseSet<ValuePair> FlippedPairs;
- for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
- E = ChosenPairs.end(); P != E; ++P)
- FlippedPairs.insert(ValuePair(P->second, P->first));
- for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
- E = FlippedPairs.end(); P != E; ++P)
- ChosenPairs.insert(*P);
-
- DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
- DenseSet<ValuePair> LoadMoveSetPairs;
- collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
- LoadMoveSet, LoadMoveSetPairs);
-
- DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
-
- for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
- DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
- if (P == ChosenPairs.end()) {
- ++PI;
- continue;
- }
-
- if (getDepthFactor(P->first) == 0) {
- // These instructions are not really fused, but are tracked as though
- // they are. Any case in which it would be interesting to fuse them
- // will be taken care of by InstCombine.
- --NumFusedOps;
- ++PI;
- continue;
- }
-
- Instruction *I = cast<Instruction>(P->first),
- *J = cast<Instruction>(P->second);
-
- DEBUG(dbgs() << "BBV: fusing: " << *I <<
- " <-> " << *J << "\n");
-
- // Remove the pair and flipped pair from the list.
- DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
- assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
- ChosenPairs.erase(FP);
- ChosenPairs.erase(P);
-
- if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
- DEBUG(dbgs() << "BBV: fusion of: " << *I <<
- " <-> " << *J <<
- " aborted because of non-trivial dependency cycle\n");
- --NumFusedOps;
- ++PI;
- continue;
- }
-
- // If the pair must have the other order, then flip it.
- bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
- if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
- // This pair does not have a fixed order, and so we might want to
- // flip it if that will yield fewer shuffles. We count the number
- // of dependencies connected via swaps, and those directly connected,
- // and flip the order if the number of swaps is greater.
- bool OrigOrder = true;
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
- ConnectedPairDeps.find(ValuePair(I, J));
- if (IJ == ConnectedPairDeps.end()) {
- IJ = ConnectedPairDeps.find(ValuePair(J, I));
- OrigOrder = false;
- }
-
- if (IJ != ConnectedPairDeps.end()) {
- unsigned NumDepsDirect = 0, NumDepsSwap = 0;
- for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
- TE = IJ->second.end(); T != TE; ++T) {
- VPPair Q(IJ->first, *T);
- DenseMap<VPPair, unsigned>::iterator R =
- PairConnectionTypes.find(VPPair(Q.second, Q.first));
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- if (R->second == PairConnectionDirect)
- ++NumDepsDirect;
- else if (R->second == PairConnectionSwap)
- ++NumDepsSwap;
- }
-
- if (!OrigOrder)
- std::swap(NumDepsDirect, NumDepsSwap);
-
- if (NumDepsSwap > NumDepsDirect) {
- FlipPairOrder = true;
- DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
- " <-> " << *J << "\n");
- }
- }
- }
-
- Instruction *L = I, *H = J;
- if (FlipPairOrder)
- std::swap(H, L);
-
- // If the pair being fused uses the opposite order from that in the pair
- // connection map, then we need to flip the types.
- DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
- ConnectedPairs.find(ValuePair(H, L));
- if (HL != ConnectedPairs.end())
- for (std::vector<ValuePair>::iterator T = HL->second.begin(),
- TE = HL->second.end(); T != TE; ++T) {
- VPPair Q(HL->first, *T);
- DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
- assert(R != PairConnectionTypes.end() &&
- "Cannot find pair connection type");
- if (R->second == PairConnectionDirect)
- R->second = PairConnectionSwap;
- else if (R->second == PairConnectionSwap)
- R->second = PairConnectionDirect;
- }
-
- bool LBeforeH = !FlipPairOrder;
- unsigned NumOperands = I->getNumOperands();
- SmallVector<Value *, 3> ReplacedOperands(NumOperands);
- getReplacementInputsForPair(Context, L, H, ReplacedOperands,
- LBeforeH);
-
- // Make a copy of the original operation, change its type to the vector
- // type and replace its operands with the vector operands.
- Instruction *K = L->clone();
- if (L->hasName())
- K->takeName(L);
- else if (H->hasName())
- K->takeName(H);
-
- if (auto CS = CallSite(K)) {
- SmallVector<Type *, 3> Tys;
- FunctionType *Old = CS.getFunctionType();
- unsigned NumOld = Old->getNumParams();
- assert(NumOld <= ReplacedOperands.size());
- for (unsigned i = 0; i != NumOld; ++i)
- Tys.push_back(ReplacedOperands[i]->getType());
- CS.mutateFunctionType(
- FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
- Tys, Old->isVarArg()));
- } else if (!isa<StoreInst>(K))
- K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
-
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
- LLVMContext::MD_invariant_group};
- combineMetadata(K, H, KnownIDs);
- K->andIRFlags(H);
-
- for (unsigned o = 0; o < NumOperands; ++o)
- K->setOperand(o, ReplacedOperands[o]);
-
- K->insertAfter(J);
-
- // Instruction insertion point:
- Instruction *InsertionPt = K;
- Instruction *K1 = nullptr, *K2 = nullptr;
- replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
-
- // The use dag of the first original instruction must be moved to after
- // the location of the second instruction. The entire use dag of the
- // first instruction is disjoint from the input dag of the second
- // (by definition), and so commutes with it.
-
- moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
-
- if (!isa<StoreInst>(I)) {
- L->replaceAllUsesWith(K1);
- H->replaceAllUsesWith(K2);
- }
-
- // Instructions that may read from memory may be in the load move set.
- // Once an instruction is fused, we no longer need its move set, and so
- // the values of the map never need to be updated. However, when a load
- // is fused, we need to merge the entries from both instructions in the
- // pair in case those instructions were in the move set of some other
- // yet-to-be-fused pair. The loads in question are the keys of the map.
- if (I->mayReadFromMemory()) {
- std::vector<ValuePair> NewSetMembers;
- DenseMap<Value *, std::vector<Value *> >::iterator II =
- LoadMoveSet.find(I);
- if (II != LoadMoveSet.end())
- for (std::vector<Value *>::iterator N = II->second.begin(),
- NE = II->second.end(); N != NE; ++N)
- NewSetMembers.push_back(ValuePair(K, *N));
- DenseMap<Value *, std::vector<Value *> >::iterator JJ =
- LoadMoveSet.find(J);
- if (JJ != LoadMoveSet.end())
- for (std::vector<Value *>::iterator N = JJ->second.begin(),
- NE = JJ->second.end(); N != NE; ++N)
- NewSetMembers.push_back(ValuePair(K, *N));
- for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
- AE = NewSetMembers.end(); A != AE; ++A) {
- LoadMoveSet[A->first].push_back(A->second);
- LoadMoveSetPairs.insert(*A);
- }
- }
-
- // Before removing I, set the iterator to the next instruction.
- PI = std::next(BasicBlock::iterator(I));
- if (cast<Instruction>(PI) == J)
- ++PI;
-
- SE->forgetValue(I);
- SE->forgetValue(J);
- I->eraseFromParent();
- J->eraseFromParent();
-
- DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
- BB << "\n");
- }
-
- DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
- }
-}
-
-char BBVectorize::ID = 0;
-static const char bb_vectorize_name[] = "Basic-Block Vectorization";
-INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-
-BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
- return new BBVectorize(C);
-}
-
-bool
-llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
- BBVectorize BBVectorizer(P, *BB.getParent(), C);
- return BBVectorizer.vectorizeBB(BB);
-}
-
-//===----------------------------------------------------------------------===//
-VectorizeConfig::VectorizeConfig() {
- VectorBits = ::VectorBits;
- VectorizeBools = !::NoBools;
- VectorizeInts = !::NoInts;
- VectorizeFloats = !::NoFloats;
- VectorizePointers = !::NoPointers;
- VectorizeCasts = !::NoCasts;
- VectorizeMath = !::NoMath;
- VectorizeBitManipulations = !::NoBitManipulation;
- VectorizeFMA = !::NoFMA;
- VectorizeSelect = !::NoSelect;
- VectorizeCmp = !::NoCmp;
- VectorizeGEP = !::NoGEP;
- VectorizeMemOps = !::NoMemOps;
- AlignedOnly = ::AlignedOnly;
-  ReqChainDepth = ::ReqChainDepth;
- SearchLimit = ::SearchLimit;
- MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
- SplatBreaksChain = ::SplatBreaksChain;
- MaxInsts = ::MaxInsts;
- MaxPairs = ::MaxPairs;
- MaxIter = ::MaxIter;
- Pow2LenOnly = ::Pow2LenOnly;
- NoMemOpBoost = ::NoMemOpBoost;
- FastDep = ::FastDep;
-}
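// For reference, the interface deleted by this change was driven roughly as
// follows (a sketch; the field values are arbitrary):
//   VectorizeConfig C;
//   C.ReqChainDepth = 6;  // demand deeper pair chains before fusing
//   C.MaxIter = 1;        // run a single fusion iteration per block
//   BasicBlockPass *P = llvm::createBBVectorizePass(C);
//   // ...or, on a single block:
//   bool Changed = llvm::vectorizeBasicBlock(SomePass, BB, C);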
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index c44a393..9cf6638 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -30,6 +30,7 @@
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
@@ -65,7 +66,9 @@ public:
bool run();
private:
- Value *getPointerOperand(Value *I);
+ Value *getPointerOperand(Value *I) const;
+
+ GetElementPtrInst *getSourceGEP(Value *Src) const;
unsigned getPointerAddressSpace(Value *I);
@@ -215,7 +218,7 @@ bool Vectorizer::run() {
return Changed;
}
-Value *Vectorizer::getPointerOperand(Value *I) {
+Value *Vectorizer::getPointerOperand(Value *I) const {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->getPointerOperand();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -231,6 +234,19 @@ unsigned Vectorizer::getPointerAddressSpace(Value *I) {
return -1;
}
+GetElementPtrInst *Vectorizer::getSourceGEP(Value *Src) const {
+ // First strip pointer bitcasts. Make sure pointee size is the same with
+ // and without casts.
+ // TODO: a stride set by the add instruction below can match the difference
+ // in pointee type size here. Currently it will not be vectorized.
+ Value *SrcPtr = getPointerOperand(Src);
+ Value *SrcBase = SrcPtr->stripPointerCasts();
+ if (DL.getTypeStoreSize(SrcPtr->getType()->getPointerElementType()) ==
+ DL.getTypeStoreSize(SrcBase->getType()->getPointerElementType()))
+ SrcPtr = SrcBase;
+ return dyn_cast<GetElementPtrInst>(SrcPtr);
+}
+
// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
Value *PtrA = getPointerOperand(A);
@@ -283,8 +299,8 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
// Look through GEPs after checking they're the same except for the last
// index.
- GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(getPointerOperand(A));
- GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(getPointerOperand(B));
+ GetElementPtrInst *GEPA = getSourceGEP(A);
+ GetElementPtrInst *GEPB = getSourceGEP(B);
if (!GEPA || !GEPB || GEPA->getNumOperands() != GEPB->getNumOperands())
return false;
unsigned FinalIndex = GEPA->getNumOperands() - 1;
@@ -328,11 +344,9 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
// If any bits are known to be zero other than the sign bit in OpA, we can
// add 1 to it while guaranteeing no overflow of any sort.
if (!Safe) {
- APInt KnownZero(BitWidth, 0);
- APInt KnownOne(BitWidth, 0);
- computeKnownBits(OpA, KnownZero, KnownOne, DL, 0, nullptr, OpA, &DT);
- KnownZero &= ~APInt::getHighBitsSet(BitWidth, 1);
- if (KnownZero != 0)
+ KnownBits Known(BitWidth);
+ computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
+ if (Known.countMaxTrailingOnes() < (BitWidth - 1))
Safe = true;
}
@@ -432,9 +446,12 @@ Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
unsigned ElementSizeBytes = ElementSizeBits / 8;
unsigned SizeBytes = ElementSizeBytes * Chain.size();
unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
- if (NumLeft == Chain.size())
- --NumLeft;
- else if (NumLeft == 0)
+ if (NumLeft == Chain.size()) {
+ if ((NumLeft & 1) == 0)
+ NumLeft /= 2; // Split even in half
+ else
+ --NumLeft; // Split off last element
+ } else if (NumLeft == 0)
NumLeft = 1;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
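// Worked examples of the new split rule (element sizes assumed):
//   4 x i32 chain: SizeBytes = 16, NumLeft = 4 == size, even -> split 2 | 2
//   3 x i32 chain: SizeBytes = 12, NumLeft = 3 == size, odd  -> split 2 | 1
//   3 x i16 chain: SizeBytes = 6,  NumLeft = (6 - 2) / 2 = 2 -> split 2 | 1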
@@ -588,7 +605,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
continue;
// Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !all_of(LI->users(), [LI](const User *U) {
+ if (isa<VectorType>(Ty) && !all_of(LI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
@@ -622,7 +639,7 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
if (TySize > VecRegSize / 2)
continue;
- if (isa<VectorType>(Ty) && !all_of(SI->users(), [SI](const User *U) {
+ if (isa<VectorType>(Ty) && !all_of(SI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
diff --git a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dac7032..012b10c 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -50,6 +50,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -92,6 +93,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Vectorize.h"
@@ -112,12 +114,13 @@ static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
-/// We don't vectorize loops with a known constant trip count below this number.
+/// Loops with a known constant trip count below this number are vectorized only
+/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
"vectorizer-min-trip-count", cl::init(16), cl::Hidden,
- cl::desc("Don't vectorize loops with a constant "
- "trip count that is smaller than this "
- "value."));
+ cl::desc("Loops with a constant trip count that is smaller than this "
+ "value are vectorized only if no scalar iteration overheads "
+ "are incurred."));
static cl::opt<bool> MaximizeBandwidth(
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -266,21 +269,6 @@ static bool hasCyclesInLoopBody(const Loop &L) {
return false;
}
-/// \brief This modifies LoopAccessReport to initialize message with
-/// loop-vectorizer-specific part.
-class VectorizationReport : public LoopAccessReport {
-public:
- VectorizationReport(Instruction *I = nullptr)
- : LoopAccessReport("loop not vectorized: ", I) {}
-
- /// \brief This allows promotion of the loop-access analysis report into the
- /// loop-vectorizer report. It modifies the message to add the
- /// loop-vectorizer-specific part of the message.
- explicit VectorizationReport(const LoopAccessReport &R)
- : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
- R.getInstr()) {}
-};
-
/// A helper function for converting Scalar types to vector types.
/// If the incoming type is void, we return void. If the VF is 1, we return
/// the scalar type.
@@ -290,31 +278,9 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
return VectorType::get(Scalar, VF);
}
-/// A helper function that returns GEP instruction and knows to skip a
-/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
-/// pointee types of the 'bitcast' have the same size.
-/// For example:
-/// bitcast double** %var to i64* - can be skipped
-/// bitcast double** %var to i8* - can not
-static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
-
- if (isa<GetElementPtrInst>(Ptr))
- return cast<GetElementPtrInst>(Ptr);
-
- if (isa<BitCastInst>(Ptr) &&
- isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
- Type *BitcastTy = Ptr->getType();
- Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
- if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
- return nullptr;
- Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
- Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
- const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
- if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
- return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
- }
- return nullptr;
-}
+// FIXME: The following helper functions have multiple implementations
+// in the project. They could usefully be collected into a common Load/Store
+// utilities unit.
/// A helper function that returns the pointer operand of a load or store
/// instruction.
@@ -326,6 +292,34 @@ static Value *getPointerOperand(Value *I) {
return nullptr;
}
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getType();
+ return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns the alignment of load or store instruction.
+static unsigned getMemInstAlignment(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getAlignment();
+ return cast<StoreInst>(I)->getAlignment();
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// load or store instruction.
+static unsigned getMemInstAddressSpace(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerAddressSpace();
+ return cast<StoreInst>(I)->getPointerAddressSpace();
+}
+
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type at the given vectorization factor.
@@ -351,6 +345,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
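// Sanity examples for the new helper (assuming an LLVMContext Ctx):
//   getSignedIntOrFpConstant(Type::getInt32Ty(Ctx), -1) -> i32 -1
//   getSignedIntOrFpConstant(Type::getFloatTy(Ctx), -1) -> float -1.0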
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
@@ -381,13 +392,14 @@ public:
TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM),
AddedSafetyChecks(false) {}
- // Perform the actual loop widening (vectorization).
- void vectorize() {
- // Create a new empty loop. Unlink the old loop and connect the new one.
- createEmptyLoop();
- // Widen each instruction in the old loop to a new one in the new loop.
- vectorizeLoop();
- }
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ void createVectorizedLoopSkeleton();
+
+ /// Vectorize a single instruction within the innermost loop.
+ void vectorizeInstruction(Instruction &I);
+
+ /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
+ void fixVectorizedLoop();
// Return true if any runtime check is added.
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
@@ -412,10 +424,8 @@ protected:
// When we if-convert we need to create edge masks. We have to cache values
// so that we don't end up with exponential recursion/IR.
typedef DenseMap<std::pair<BasicBlock *, BasicBlock *>, VectorParts>
- EdgeMaskCache;
-
- /// Create an empty loop, based on the loop ranges of the old loop.
- void createEmptyLoop();
+ EdgeMaskCacheTy;
+ typedef DenseMap<BasicBlock *, VectorParts> BlockMaskCacheTy;
/// Set up the values of the IVs correctly when exiting the vector loop.
void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
@@ -425,17 +435,22 @@ protected:
/// Create a new induction variable inside L.
PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
Value *Step, Instruction *DL);
- /// Copy and widen the instructions from the old loop.
- virtual void vectorizeLoop();
+
+ /// Handle all cross-iteration phis in the header.
+ void fixCrossIterationPHIs();
/// Fix a first-order recurrence. This is the second phase of vectorizing
/// this phi node.
void fixFirstOrderRecurrence(PHINode *Phi);
- /// \brief The Loop exit block may have single value PHI nodes where the
- /// incoming value is 'Undef'. While vectorizing we only handled real values
- /// that were defined inside the loop. Here we fix the 'undef case'.
- /// See PR14725.
+ /// Fix a reduction cross-iteration phi. This is the second phase of
+ /// vectorizing this phi node.
+ void fixReduction(PHINode *Phi);
+
+ /// \brief The Loop exit block may have single value PHI nodes with some
+ /// incoming value. While vectorizing we only handled real values
+ /// that were defined inside the loop and we should have one value for
+ /// each predecessor of its parent basic block. See PR14725.
void fixLCSSAPHIs();
/// Iteratively sink the scalarized operands of a predicated instruction into
@@ -446,10 +461,6 @@ protected:
/// respective conditions.
void predicateInstructions();
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions();
-
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
@@ -462,14 +473,10 @@ protected:
/// and DST.
VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
- /// A helper function to vectorize a single BB within the innermost loop.
- void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
-
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
- void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF,
- PhiVector *PV);
+ void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
/// Insert the new loop to the loop hierarchy and pass manager
/// and update the analysis passes.
@@ -479,8 +486,7 @@ protected:
/// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
/// scalarized instruction behind an if block predicated on the control
/// dependence of the instruction.
- virtual void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateInstr = false);
+ void scalarizeInstruction(Instruction *Instr, bool IfPredicateInstr = false);
/// Vectorize Load and Store instructions,
virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -504,20 +510,21 @@ protected:
/// \p EntryVal is the value from the original loop that maps to the steps.
/// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
/// can be a truncate instruction).
- void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
-
- /// Create a vector induction phi node based on an existing scalar one. This
- /// currently only works for integer induction variables with a constant
- /// step. \p EntryVal is the value from the original loop that maps to the
- /// vector phi node. If \p EntryVal is a truncate instruction, instead of
- /// widening the original IV, we widen a version of the IV truncated to \p
- /// EntryVal's type.
- void createVectorIntInductionPHI(const InductionDescriptor &II,
- Instruction *EntryVal);
-
- /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
- /// induction variable will first be truncated to the corresponding type.
- void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+ const InductionDescriptor &ID);
+
+ /// Create a vector induction phi node based on an existing scalar one. \p
+ /// EntryVal is the value from the original loop that maps to the vector phi
+ /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+ /// truncate instruction, instead of widening the original IV, we widen a
+ /// version of the IV truncated to \p EntryVal's type.
+ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+ Value *Step, Instruction *EntryVal);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
+ void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
/// Returns true if an instruction \p I should be scalarized instead of
/// vectorized for the chosen vectorization factor.
@@ -526,21 +533,34 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// Return a constant reference to the VectorParts corresponding to \p V from
- /// the original loop. If the value has already been vectorized, the
- /// corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
/// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// new vector values on-demand by inserting the scalar values into vectors
+ /// a new vector value on-demand by inserting the scalar values into a vector
/// with an insertelement sequence. If the value has been neither vectorized
/// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into vectors.
- const VectorParts &getVectorValue(Value *V);
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
/// Return a value in the new loop corresponding to \p V from the original
/// loop at unroll index \p Part and vector index \p Lane. If the value has
/// been vectorized but not scalarized, the necessary extractelement
/// instruction will be generated.
- Value *getScalarValue(Value *V, unsigned Part, unsigned Lane);
+ Value *getOrCreateScalarValue(Value *V, unsigned Part, unsigned Lane);
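// The on-demand vector construction described above amounts to an
// insertelement sequence; a minimal standalone sketch (the names are
// illustrative, not members of this class):
//   Value *packScalars(IRBuilder<> &B, ArrayRef<Value *> Scalars, Type *Ty) {
//     Value *Vec = UndefValue::get(VectorType::get(Ty, Scalars.size()));
//     for (unsigned Lane = 0; Lane != Scalars.size(); ++Lane)
//       Vec = B.CreateInsertElement(Vec, Scalars[Lane], B.getInt32(Lane));
//     return Vec;
//   }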
/// Try to vectorize the interleaved access group that \p Instr belongs to.
void vectorizeInterleaveGroup(Instruction *Instr);
@@ -554,11 +574,9 @@ protected:
/// Returns (and creates if needed) the trip count of the widened loop.
Value *getOrCreateVectorTripCount(Loop *NewLoop);
- /// Emit a bypass check to see if the trip count would overflow, or we
- /// wouldn't have enough iterations to execute one vector loop.
+ /// Emit a bypass check to see if the vector trip count is zero, including if
+ /// it overflows.
void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
- /// Emit a bypass check to see if the vector trip count is nonzero.
- void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass);
/// Emit a bypass check to see if all of the SCEV assumptions we've
/// had to make are correct.
void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
@@ -583,6 +601,10 @@ protected:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ /// \brief Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
/// This is a helper class for maintaining vectorization state. It's used for
/// mapping values from the original loop to their corresponding values in
/// the new loop. Two mappings are maintained: one for vectorized values and
@@ -591,90 +613,103 @@ protected:
/// UF x VF scalar values in the new loop. UF and VF are the unroll and
/// vectorization factors, respectively.
///
- /// Entries can be added to either map with initVector and initScalar, which
- /// initialize and return a constant reference to the new entry. If a
- /// non-constant reference to a vector entry is required, getVector can be
- /// used to retrieve a mutable entry. We currently directly modify the mapped
- /// values during "fix-up" operations that occur once the first phase of
- /// widening is complete. These operations include type truncation and the
- /// second phase of recurrence widening.
+ /// Entries can be added to either map with setVectorValue and setScalarValue,
+ /// which assert that an entry was not already added before. If an entry is to
+ /// replace an existing one, call resetVectorValue. This is currently needed
+ /// to modify the mapped values during "fix-up" operations that occur once the
+ /// first phase of widening is complete. These operations include type
+ /// truncation and the second phase of recurrence widening.
///
- /// Otherwise, entries from either map should be accessed using the
- /// getVectorValue or getScalarValue functions from InnerLoopVectorizer.
- /// getVectorValue and getScalarValue coordinate to generate a vector or
- /// scalar value on-demand if one is not yet available. When vectorizing a
- /// loop, we visit the definition of an instruction before its uses. When
- /// visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
+ /// Entries from either map can be retrieved using the getVectorValue and
+ /// getScalarValue functions, which assert that the desired value exists.
+
struct ValueMap {
/// Construct an empty map with the given unroll and vectorization factors.
- ValueMap(unsigned UnrollFactor, unsigned VecWidth)
- : UF(UnrollFactor), VF(VecWidth) {
- // The unroll and vectorization factors are only used in asserts builds
- // to verify map entries are sized appropriately.
- (void)UF;
- (void)VF;
+ ValueMap(unsigned UF, unsigned VF) : UF(UF), VF(VF) {}
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
}
- /// \return True if the map has a vector entry for \p Key.
- bool hasVector(Value *Key) const { return VectorMapStorage.count(Key); }
-
- /// \return True if the map has a scalar entry for \p Key.
- bool hasScalar(Value *Key) const { return ScalarMapStorage.count(Key); }
-
- /// \brief Map \p Key to the given VectorParts \p Entry, and return a
- /// constant reference to the new vector map entry. The given key should
- /// not already be in the map, and the given VectorParts should be
- /// correctly sized for the current unroll factor.
- const VectorParts &initVector(Value *Key, const VectorParts &Entry) {
- assert(!hasVector(Key) && "Vector entry already initialized");
- assert(Entry.size() == UF && "VectorParts has wrong dimensions");
- VectorMapStorage[Key] = Entry;
- return VectorMapStorage[Key];
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
+ }
+
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a scalar entry for \p Key, \p Part and
+  /// \p Lane.
+ bool hasScalarValue(Value *Key, unsigned Part, unsigned Lane) const {
+ assert(Part < UF && "Queried Scalar Part is too large.");
+ assert(Lane < VF && "Queried Scalar Lane is too large.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ assert(Entry[Part].size() == VF && "ScalarParts has wrong dimensions.");
+ return Entry[Part][Lane] != nullptr;
}
- /// \brief Map \p Key to the given ScalarParts \p Entry, and return a
- /// constant reference to the new scalar map entry. The given key should
- /// not already be in the map, and the given ScalarParts should be
- /// correctly sized for the current unroll and vectorization factors.
- const ScalarParts &initScalar(Value *Key, const ScalarParts &Entry) {
- assert(!hasScalar(Key) && "Scalar entry already initialized");
- assert(Entry.size() == UF &&
- all_of(make_range(Entry.begin(), Entry.end()),
- [&](const SmallVectorImpl<Value *> &Values) -> bool {
- return Values.size() == VF;
- }) &&
- "ScalarParts has wrong dimensions");
- ScalarMapStorage[Key] = Entry;
- return ScalarMapStorage[Key];
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
}
- /// \return A reference to the vector map entry corresponding to \p Key.
- /// The key should already be in the map. This function should only be used
- /// when it's necessary to update values that have already been vectorized.
- /// This is the case for "fix-up" operations including type truncation and
- /// the second phase of recurrence vectorization. If a non-const reference
- /// isn't required, getVectorValue should be used instead.
- VectorParts &getVector(Value *Key) {
- assert(hasVector(Key) && "Vector entry not initialized");
- return VectorMapStorage.find(Key)->second;
+ /// Retrieve the existing scalar value that corresponds to \p Key, \p Part
+ /// and \p Lane.
+ Value *getScalarValue(Value *Key, unsigned Part, unsigned Lane) {
+ assert(hasScalarValue(Key, Part, Lane) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Part][Lane];
}
- /// Retrieve an entry from the vector or scalar maps. The preferred way to
- /// access an existing mapped entry is with getVectorValue or
- /// getScalarValue from InnerLoopVectorizer. Until those functions can be
- /// moved inside ValueMap, we have to declare them as friends.
- friend const VectorParts &InnerLoopVectorizer::getVectorValue(Value *V);
- friend Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
- unsigned Lane);
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key for \p Part and \p Lane.
+ /// Assumes such a value is not already set.
+ void setScalarValue(Value *Key, unsigned Part, unsigned Lane,
+ Value *Scalar) {
+ assert(!hasScalarValue(Key, Part, Lane) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part].resize(VF, nullptr);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Part][Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
private:
/// The unroll factor. Each entry in the vector map contains UF vector
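The set/reset split described in the comment above amounts to a small protocol. The following hypothetical fix-up sequence is illustrative only: Key, NewV, and truncateEntry are placeholders (truncateEntry does not exist in this patch), and the code is assumed to run inside InnerLoopVectorizer, where VectorLoopValueMap and UF are visible.

// Hypothetical fix-up sequence (illustrative only).
for (unsigned Part = 0; Part < UF; ++Part) {
  // First definition of (Key, Part) uses set...; calling set a second
  // time for the same (Key, Part) would trip the "already set" assertion.
  VectorLoopValueMap.setVectorValue(Key, Part, NewV);
}
// Later, a fix-up (e.g. type truncation) replaces the entries in place:
for (unsigned Part = 0; Part < UF; ++Part) {
  Value *Truncated = truncateEntry(Key, Part); // hypothetical helper
  VectorLoopValueMap.resetVectorValue(Key, Part, Truncated);
}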
@@ -762,7 +797,8 @@ protected:
/// Store instructions that should be predicated, as a pair
/// <StoreInst, Predicate>
SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
- EdgeMaskCache MaskCache;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
/// Trip count of the original loop.
Value *TripCount;
/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
@@ -777,14 +813,6 @@ protected:
// Record whether runtime checks are added.
bool AddedSafetyChecks;
- // Holds instructions from the original loop whose counterparts in the
- // vectorized loop would be trivially dead if generated. For example,
- // original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
-
// Holds the end values for each induction variable. We save the end values
// so we can later fix-up the external users of the induction variables.
DenseMap<PHINode *, Value *> IVEndValues;
@@ -803,8 +831,6 @@ public:
UnrollFactor, LVL, CM) {}
private:
- void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateInstr = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -832,12 +858,14 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
return I;
}
-/// \brief Set the debug location in the builder using the debug location in the
-/// instruction.
-static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
- B.SetCurrentDebugLocation(Inst->getDebugLoc());
- else
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+ const DILocation *DIL = Inst->getDebugLoc();
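+    // One iteration of the vector loop covers UF * VF iterations of the
+    // original scalar loop, so scale the location's duplication factor
+    // accordingly for sample-based profiling.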
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+ B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
+ else
+ B.SetCurrentDebugLocation(DIL);
+ } else
B.SetCurrentDebugLocation(DebugLoc());
}
@@ -1497,14 +1525,6 @@ private:
OptimizationRemarkEmitter &ORE;
};
-static void emitAnalysisDiag(const Loop *TheLoop,
- const LoopVectorizeHints &Hints,
- OptimizationRemarkEmitter &ORE,
- const LoopAccessReport &Message) {
- const char *Name = Hints.vectorizeAnalysisPassName();
- LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE);
-}
-
static void emitMissedWarning(Function *F, Loop *L,
const LoopVectorizeHints &LH,
OptimizationRemarkEmitter *ORE) {
@@ -1512,13 +1532,17 @@ static void emitMissedWarning(Function *F, Loop *L,
if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
if (LH.getWidth() != 1)
- emitLoopVectorizeWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop vectorization");
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedVectorization",
+ L->getStartLoc(), L->getHeader())
+ << "loop not vectorized: "
+ << "failed explicitly specified loop vectorization");
else if (LH.getInterleave() != 1)
- emitLoopInterleaveWarning(
- F->getContext(), *F, L->getStartLoc(),
- "failed explicitly specified loop interleaving");
+ ORE->emit(DiagnosticInfoOptimizationFailure(
+ DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
+ L->getHeader())
+ << "loop not interleaved: "
+ << "failed explicitly specified loop interleaving");
}
}
@@ -1546,7 +1570,7 @@ public:
LoopVectorizeHints *H)
: NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
- Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+ PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
Requirements(R), Hints(H) {}
/// ReductionList contains the reduction descriptors for all
@@ -1566,8 +1590,8 @@ public:
/// loop, only that it is legal to do so.
bool canVectorize();
- /// Returns the Induction variable.
- PHINode *getInduction() { return Induction; }
+ /// Returns the primary induction variable.
+ PHINode *getPrimaryInduction() { return PrimaryInduction; }
/// Returns the reduction variables found in the loop.
ReductionList *getReductionVars() { return &Reductions; }
@@ -1578,6 +1602,9 @@ public:
/// Return the first-order recurrences found in the loop.
RecurrenceSet *getFirstOrderRecurrences() { return &FirstOrderRecurrences; }
+ /// Return the set of instructions to sink to handle first-order recurrences.
+ DenseMap<Instruction *, Instruction *> &getSinkAfter() { return SinkAfter; }
+
/// Returns the widest induction type.
Type *getWidestInductionType() { return WidestIndTy; }
@@ -1607,12 +1634,6 @@ public:
/// Returns true if the value V is uniform within the loop.
bool isUniform(Value *V);
- /// Returns true if \p I is known to be uniform after vectorization.
- bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
-
- /// Returns true if \p I is known to be scalar after vectorization.
- bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
-
/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
return LAI->getRuntimePointerChecking();
@@ -1689,15 +1710,12 @@ public:
/// instructions that may divide by zero.
bool isScalarWithPredication(Instruction *I);
- /// Returns true if \p I is a memory instruction that has a consecutive or
- /// consecutive-like pointer operand. Consecutive-like pointers are pointers
- /// that are treated like consecutive pointers during vectorization. The
- /// pointer operands of interleaved accesses are an example.
- bool hasConsecutiveLikePtrOperand(Instruction *I);
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
- /// Returns true if \p I is a memory instruction that must be scalarized
- /// during vectorization.
- bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
+  /// Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
private:
/// Check if a single basic block loop is vectorizable.
@@ -1715,24 +1733,6 @@ private:
/// transformation.
bool canVectorizeWithIfConvert();
- /// Collect the instructions that are uniform after vectorization. An
- /// instruction is uniform if we represent it with a single scalar value in
- /// the vectorized loop corresponding to each vector iteration. Examples of
- /// uniform instructions include pointer operands of consecutive or
- /// interleaved memory accesses. Note that although uniformity implies an
- /// instruction will be scalar, the reverse is not true. In general, a
- /// scalarized instruction will be represented by VF scalar values in the
- /// vectorized loop, each corresponding to an iteration of the original
- /// scalar loop.
- void collectLoopUniforms();
-
- /// Collect the instructions that are scalar after vectorization. An
- /// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
- void collectLoopScalars();
-
/// Return true if all of the instructions in the block can be speculatively
/// executed. \p SafePtrs is a list of addresses that are known to be legal
/// and we know that we can read from them without segfault.
@@ -1744,14 +1744,6 @@ private:
void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
SmallPtrSetImpl<Value *> &AllowedExit);
- /// Report an analysis message to assist the user in diagnosing loops that are
- /// not vectorized. These are handled as LoopAccessReport rather than
- /// VectorizationReport because the << operator of VectorizationReport returns
- /// LoopAccessReport.
- void emitAnalysis(const LoopAccessReport &Message) const {
- emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
- }
-
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. If \p I is passed it is
@@ -1804,9 +1796,9 @@ private:
// --- vectorization state --- //
- /// Holds the integer induction variable. This is the counter of the
+ /// Holds the primary induction variable. This is the counter of the
/// loop.
- PHINode *Induction;
+ PHINode *PrimaryInduction;
/// Holds the reduction variables.
ReductionList Reductions;
/// Holds all of the induction variables that we found in the loop.
@@ -1815,6 +1807,9 @@ private:
InductionList Inductions;
/// Holds the phi nodes that are first-order recurrences.
RecurrenceSet FirstOrderRecurrences;
+ /// Holds instructions that need to sink past other instructions to handle
+ /// first-order recurrences.
+ DenseMap<Instruction *, Instruction *> SinkAfter;
/// Holds the widest induction type encountered.
Type *WidestIndTy;
@@ -1822,12 +1817,6 @@ private:
/// vars which can be accessed from outside the loop.
SmallPtrSet<Value *, 4> AllowedExit;
- /// Holds the instructions known to be uniform after vectorization.
- SmallPtrSet<Instruction *, 4> Uniforms;
-
- /// Holds the instructions known to be scalar after vectorization.
- SmallPtrSet<Instruction *, 4> Scalars;
-
/// Can we assume the absence of NaNs.
bool HasFunNoNaNAttr;
@@ -1861,16 +1850,26 @@ public:
: TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+ /// \return An upper bound for the vectorization factor, or None if
+ /// vectorization should be avoided up front.
+ Optional<unsigned> computeMaxVF(bool OptForSize);
+
/// Information about vectorization costs
struct VectorizationFactor {
unsigned Width; // Vector width with best cost
unsigned Cost; // Cost of the loop with that width
};
/// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every power of two up to VF. If UserVF is not ZERO
+ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- VectorizationFactor selectVectorizationFactor(bool OptForSize);
+ VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+
+  /// Set up cost-based decisions for the user vectorization factor.
+ void selectUserVectorizationFactor(unsigned UserVF) {
+ collectUniformsAndScalars(UserVF);
+ collectInstsToScalarize(UserVF);
+ }
/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1884,6 +1883,15 @@ public:
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
+  /// A memory access instruction may be vectorized in more than one way;
+  /// its form after vectorization depends on cost. This function makes
+  /// cost-based widening decisions for Load/Store instructions and
+  /// collects them in a map. The decision map is then used for building
+  /// the lists of loop-uniform and loop-scalar instructions. The
+  /// calculated cost is saved with the widening decision in order to
+  /// avoid redundant calculations.
+ void setCostBasedWideningDecision(unsigned VF);
+
/// \brief A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -1918,14 +1926,118 @@ public:
return Scalars->second.count(I);
}
+ /// Returns true if \p I is known to be uniform after vectorization.
+ bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+ assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+ auto UniformsPerVF = Uniforms.find(VF);
+ return UniformsPerVF->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+ if (VF == 1)
+ return true;
+ assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+ auto ScalarsPerVF = Scalars.find(VF);
+ return ScalarsPerVF->second.count(I);
+ }
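A note on the contract of these per-VF queries, with an illustrative sketch (assumed to run inside the cost model, where the collection routine is visible; not part of the patch):

// Illustrative contract: populate the per-VF sets before querying them.
collectUniformsAndScalars(VF);                 // analyze once per VF
bool U = isUniformAfterVectorization(I, VF);   // now safe to query
bool S = isScalarAfterVectorization(I, VF);    // uniformity implies scalar
// For VF == 1 both queries return true without consulting the sets;
// querying an unanalyzed VF > 1 trips the assertions above.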
+
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
/// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
- !Legal->isScalarAfterVectorization(I);
+ !isScalarAfterVectorization(I, VF);
+ }
+
+  /// A decision taken during cost calculation for a memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen,
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
+ void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
+ unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
+ void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+ InstWidening W, unsigned Cost) {
+ assert(VF >= 2 && "Expected VF >=2");
+    // Broadcast this decision to all instructions inside the group,
+    // but the cost will be assigned to one instruction only.
+ for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+ if (auto *I = Grp->getMember(i)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ else
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ }
+ }
+ }
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
+ InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
+ unsigned getWideningCost(Instruction *I, unsigned VF) {
+ assert(VF >= 2 && "Expected VF >=2");
+ std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+ assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
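The decision cache above is written once during cost analysis and read during code generation. A hypothetical round trip, where CM is a LoopVectorizationCostModel and I is a memory instruction (the names and the cost value are placeholders):

// Fragment; assumes the pass's usual includes.
unsigned VF = 4;
unsigned Cost = 10; // placeholder; normally computed by a cost helper
CM.setWideningDecision(I, VF, LoopVectorizationCostModel::CM_Widen, Cost);
// Later queries, e.g. from vectorizeMemoryInstruction, must agree:
assert(CM.getWideningDecision(I, VF) ==
       LoopVectorizationCostModel::CM_Widen);
assert(CM.getWideningCost(I, VF) == Cost);
// An instruction the cost model never visited reports CM_Unknown.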
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
+ bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionVariable(Op);
}
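For reference, a source-level shape that isOptimizableIVTruncate is meant to catch (an assumed example, not taken from the patch): the loop counter is 64 bits wide, but the body consumes only a 32-bit view of it, so the vectorizer can introduce a second induction variable of the narrower type instead of truncating a vector on every iteration.

// Assumed example: the trunc of the i64 induction variable to i32 is
// the optimizable truncate; it feeds the stored value.
void f(int *A, long N) {
  for (long i = 0; i < N; ++i)
    A[i] = (int)i;
}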
private:
+ /// \return An upper bound for the vectorization factor, larger than zero.
+ /// One is returned if vectorization should best be avoided due to cost.
+ unsigned computeFeasibleMaxVF(bool OptForSize);
+
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
@@ -1949,6 +2061,26 @@ private:
/// the vector type as an output parameter.
unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
+ /// Calculate vectorization cost of memory instruction \p I.
+ unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for a scalarized memory instruction.
+ unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for an interleaving group of memory instructions.
+ unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for a Gather/Scatter instruction.
+ unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
+ unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+
+  /// The cost calculation for a Load instruction \p I with a uniform
+  /// pointer - scalar load + broadcast.
+ unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+
/// Returns whether the instruction is a load or store and will be emitted
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1972,12 +2104,28 @@ private:
/// pairs.
typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
+  /// A set containing all BasicBlocks that are known to be present after
+  /// vectorization as predicated blocks.
+ SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
/// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
+ DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
/// Returns the expected difference in cost from scalarizing the expression
/// feeding a predicated instruction \p PredInst. The instructions to
/// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -1990,6 +2138,44 @@ private:
/// the loop.
void collectInstsToScalarize(unsigned VF);
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
+ void collectLoopUniforms(unsigned VF);
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
+ void collectLoopScalars(unsigned VF);
+
+  /// Collect Uniform and Scalar values for the given \p VF.
+  /// The sets depend on the CM decisions for Load/Store instructions,
+  /// which may be vectorized as interleaved accesses, gather/scatter
+  /// operations, or scalarized.
+ void collectUniformsAndScalars(unsigned VF) {
+ // Do the analysis once.
+ if (VF == 1 || Uniforms.count(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
+  /// Keeps the cost model's vectorization decision and cost for each
+  /// instruction. Right now it is used for memory instructions only.
+ typedef DenseMap<std::pair<Instruction *, unsigned>,
+ std::pair<InstWidening, unsigned>>
+ DecisionList;
+
+ DecisionList WideningDecisions;
+
public:
/// The loop that we evaluate.
Loop *TheLoop;
@@ -2019,6 +2205,44 @@ public:
SmallPtrSet<const Value *, 16> VecValuesToIgnore;
};
+/// LoopVectorizationPlanner - drives the vectorization process after having
+/// passed Legality checks.
+class LoopVectorizationPlanner {
+public:
+ LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM)
+ : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {}
+
+ ~LoopVectorizationPlanner() {}
+
+ /// Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
+ unsigned UserVF);
+
+ /// Generate the IR code for the vectorized loop.
+ void executePlan(InnerLoopVectorizer &ILV);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+private:
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+};
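A hypothetical driver sequence for the planner (illustrative; the real call sites live elsewhere in this pass and are not shown in this hunk, and all variable names below are placeholders):

LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
LoopVectorizationCostModel::VectorizationFactor VF =
    LVP.plan(OptForSize, UserVF);   // pick the best VF and its cost
if (VF.Width > 1) {
  InnerLoopVectorizer ILV(/* constructor arguments elided */);
  LVP.executePlan(ILV);             // emit the vectorized loop
}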
+
/// \brief This holds vectorization requirements that must be verified late in
/// the process. The requirements are set by legalize and costmodel. Once
/// vectorization has been determined to be possible and profitable the
@@ -2134,8 +2358,6 @@ struct LoopVectorize : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
@@ -2156,7 +2378,7 @@ struct LoopVectorize : public FunctionPass {
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
-// LoopVectorizationCostModel.
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
//===----------------------------------------------------------------------===//
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -2176,41 +2398,63 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
-void InnerLoopVectorizer::createVectorIntInductionPHI(
- const InductionDescriptor &II, Instruction *EntryVal) {
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+ const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
Value *Start = II.getStartValue();
- ConstantInt *Step = II.getConstIntStepValue();
- assert(Step && "Can not widen an IV with a non-constant step");
// Construct the initial value of the vector IV in the vector loop preheader
auto CurrIP = Builder.saveIP();
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+ Step = Builder.CreateTrunc(Step, TruncType);
Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
}
Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
- Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+ Value *SteppedStart =
+ getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = II.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
+ Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+ Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
+ Value *SplatVF = isa<Constant>(Mul)
+ ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
- Value *SplatVF =
- ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(),
- VF * Step->getSExtValue()));
// We may need to add the step a number of times, depending on the unroll
// factor. The last of those goes into the PHI.
PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
&*LoopVectorBody->getFirstInsertionPt());
Instruction *LastInduction = VecInd;
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part] = LastInduction;
- LastInduction = cast<Instruction>(
- Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+ if (isa<TruncInst>(EntryVal))
+ addMetadata(LastInduction, EntryVal);
+ LastInduction = cast<Instruction>(addFastMathFlag(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
}
- VectorLoopValueMap.initVector(EntryVal, Entry);
- if (isa<TruncInst>(EntryVal))
- addMetadata(Entry, EntryVal);
// Move the last step to the end of the latch block. This ensures consistent
// placement of all induction updates.
@@ -2225,7 +2469,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
}
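To make the arithmetic above concrete, here is a standalone model of a floating-point vector induction with assumed numbers (start 0.0f, step 2.0f, VF = 4, UF = 2); each unrolled part advances the lane values by Step * VF, mirroring the FMul/"step.add" chain:

#include <array>
#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const float Start = 0.0f, Step = 2.0f;
  std::array<float, VF> VecInd;          // models "vec.ind"
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    VecInd[Lane] = Start + Lane * Step;  // models SteppedStart
  const float SplatVF = Step * VF;       // models Mul, splat per lane
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::printf("part %u:", Part);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %g", VecInd[Lane] + Part * SplatVF); // step.add chain
    std::printf("\n");
  }
}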
bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
- return Legal->isScalarAfterVectorization(I) ||
+ return Cost->isScalarAfterVectorization(I, VF) ||
Cost->isProfitableToScalarize(I, VF);
}
@@ -2239,7 +2483,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+
+ assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+ "Primary induction variable must have an integer type");
auto II = Legal->getInductionVars()->find(IV);
assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -2251,9 +2498,6 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// induction variable.
Value *ScalarIV = nullptr;
- // The step of the induction.
- Value *Step = nullptr;
-
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
@@ -2266,45 +2510,49 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// least one user in the loop that is not widened.
auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
- // If the induction variable has a constant integer step value, go ahead and
- // get it now.
- if (ID.getConstIntStepValue())
- Step = ID.getConstIntStepValue();
+ // Generate code for the induction step. Note that induction steps are
+  // required to be loop-invariant.
+ assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
+ "Induction step should be loop invariant");
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ Value *Step = nullptr;
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+ LoopVectorPreHeader->getTerminator());
+ } else {
+ Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+ }
// Try to create a new independent vector induction variable. If we can't
// create the phi node, we will splat the scalar induction variable in each
// loop iteration.
- if (VF > 1 && IV->getType() == Induction->getType() && Step &&
- !shouldScalarizeInstruction(EntryVal)) {
- createVectorIntInductionPHI(ID, EntryVal);
+ if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
VectorizedIV = true;
}
// If we haven't yet vectorized the induction variable, or if we will create
// a scalar one, we need to define the scalar induction variable and step
// values. If we were given a truncation type, truncate the canonical
- // induction variable and constant step. Otherwise, derive these values from
- // the induction descriptor.
+ // induction variable and step. Otherwise, derive these values from the
+ // induction descriptor.
if (!VectorizedIV || NeedsScalarIV) {
+ ScalarIV = Induction;
+ if (IV != OldInduction) {
+ ScalarIV = IV->getType()->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+ : Builder.CreateCast(Instruction::SIToFP, Induction,
+ IV->getType());
+ ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+ ScalarIV->setName("offset.idx");
+ }
if (Trunc) {
auto *TruncType = cast<IntegerType>(Trunc->getType());
- assert(Step && "Truncation requires constant integer step");
- auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
- ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
- Step = ConstantInt::getSigned(TruncType, StepInt);
- } else {
- ScalarIV = Induction;
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
- if (IV != OldInduction) {
- ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
- ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
- ScalarIV->setName("offset.idx");
- }
- if (!Step) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
- &*Builder.GetInsertPoint());
- }
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+ Step = Builder.CreateTrunc(Step, TruncType);
}
}
@@ -2312,12 +2560,13 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// induction variable, and build the necessary step vectors.
if (!VectorizedIV) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
- VectorLoopValueMap.initVector(EntryVal, Entry);
- if (Trunc)
- addMetadata(Entry, Trunc);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart =
+ getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ if (Trunc)
+ addMetadata(EntryPart, Trunc);
+ }
}
// If an induction variable is only used for counting loop iterations or
@@ -2327,7 +2576,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
// in the loop in the common case prior to InstCombine. We will be trading
// one vector extract for each scalar step.
if (NeedsScalarIV)
- buildScalarSteps(ScalarIV, Step, EntryVal);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2387,34 +2636,44 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
}
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Value *EntryVal) {
+ Value *EntryVal,
+ const InductionDescriptor &ID) {
// We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF > 1 && "VF should be greater than one");
// Get the value type and ensure it and the step have the same type.
Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
- "Val and Step should have the same integer type");
+ assert(ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same type");
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (ScalarIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
// Determine the number of scalars we need to generate for each unroll
// iteration. If EntryVal is uniform, we only need to generate the first
// lane. Otherwise, we generate all VF values.
unsigned Lanes =
- Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;
+ Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
// Compute the scalar steps and save the results in VectorLoopValueMap.
- ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
- auto *Mul = Builder.CreateMul(StartIdx, Step);
- auto *Add = Builder.CreateAdd(ScalarIV, Mul);
- Entry[Part][Lane] = Add;
+ auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+ auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
+ VectorLoopValueMap.setScalarValue(EntryVal, Part, Lane, Add);
}
}
- VectorLoopValueMap.initScalar(EntryVal, Entry);
}
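And the scalar-step counterpart: each (Part, Lane) pair receives ScalarIV + (VF * Part + Lane) * Step, exactly the StartIdx/Mul/Add sequence above. A standalone model with assumed integer values:

#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const int ScalarIV = 0, Step = 3;  // assumed values
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("(%u,%u) -> %d\n", Part, Lane,
                  ScalarIV + int(VF * Part + Lane) * Step);
}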
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
@@ -2432,8 +2691,7 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
return LAI->isUniform(V);
}
-const InnerLoopVectorizer::VectorParts &
-InnerLoopVectorizer::getVectorValue(Value *V) {
+Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
assert(!V->getType()->isVoidTy() && "Type does not produce a value");
@@ -2442,17 +2700,16 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
if (Legal->hasStride(V))
V = ConstantInt::get(V->getType(), 1);
- // If we have this scalar in the map, return it.
- if (VectorLoopValueMap.hasVector(V))
- return VectorLoopValueMap.VectorMapStorage[V];
+ // If we have a vector mapped to this value, return it.
+ if (VectorLoopValueMap.hasVectorValue(V, Part))
+ return VectorLoopValueMap.getVectorValue(V, Part);
// If the value has not been vectorized, check if it has been scalarized
// instead. If it has been scalarized, and we actually need the value in
// vector form, we will construct the vector values on demand.
- if (VectorLoopValueMap.hasScalar(V)) {
+ if (VectorLoopValueMap.hasAnyScalarValue(V)) {
- // Initialize a new vector map entry.
- VectorParts Entry(UF);
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, Part, 0);
// If we've scalarized a value, that value should be an instruction.
auto *I = cast<Instruction>(V);
@@ -2460,17 +2717,17 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// If we aren't vectorizing, we can just copy the scalar map values over to
// the vector map.
if (VF == 1) {
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getScalarValue(V, Part, 0);
- return VectorLoopValueMap.initVector(V, Entry);
+ VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
+ return ScalarValue;
}
- // Get the last scalar instruction we generated for V. If the value is
- // known to be uniform after vectorization, this corresponds to lane zero
- // of the last unroll iteration. Otherwise, the last instruction is the one
- // we created for the last vector lane of the last unroll iteration.
- unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
- auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
+ // Get the last scalar instruction we generated for V and Part. If the value
+ // is known to be uniform after vectorization, this corresponds to lane zero
+ // of the Part unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the Part unroll iteration.
+ unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
+ auto *LastInst =
+ cast<Instruction>(VectorLoopValueMap.getScalarValue(V, Part, LastLane));
// Set the insert point after the last scalarized instruction. This ensures
// the insertelement sequence will directly follow the scalar definitions.
@@ -2484,51 +2741,50 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// iteration. Otherwise, we construct the vector values using insertelement
// instructions. Since the resulting vectors are stored in
// VectorLoopValueMap, we will only generate the insertelements once.
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VectorValue = nullptr;
- if (Legal->isUniformAfterVectorization(I)) {
- VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
- } else {
- VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
- for (unsigned Lane = 0; Lane < VF; ++Lane)
- VectorValue = Builder.CreateInsertElement(
- VectorValue, getScalarValue(V, Part, Lane),
- Builder.getInt32(Lane));
- }
- Entry[Part] = VectorValue;
+ Value *VectorValue = nullptr;
+ if (Cost->isUniformAfterVectorization(I, VF)) {
+ VectorValue = getBroadcastInstrs(ScalarValue);
+ } else {
+ VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ VectorValue = Builder.CreateInsertElement(
+ VectorValue, getOrCreateScalarValue(V, Part, Lane),
+ Builder.getInt32(Lane));
}
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
Builder.restoreIP(OldIP);
- return VectorLoopValueMap.initVector(V, Entry);
+ return VectorValue;
}
// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- return VectorLoopValueMap.initVector(V, VectorParts(UF, B));
+ VectorLoopValueMap.setVectorValue(V, Part, B);
+ return B;
}
-Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
- unsigned Lane) {
+Value *InnerLoopVectorizer::getOrCreateScalarValue(Value *V, unsigned Part,
+ unsigned Lane) {
// If the value is not an instruction contained in the loop, it should
// already be scalar.
if (OrigLoop->isLoopInvariant(V))
return V;
- assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
+ assert(Lane > 0 ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
: true && "Uniform values only have lane zero");
// If the value from the original loop has not been vectorized, it is
// represented by UF x VF scalar values in the new loop. Return the requested
// scalar value.
- if (VectorLoopValueMap.hasScalar(V))
- return VectorLoopValueMap.ScalarMapStorage[V][Part][Lane];
+ if (VectorLoopValueMap.hasScalarValue(V, Part, Lane))
+ return VectorLoopValueMap.getScalarValue(V, Part, Lane);
// If the value has not been scalarized, get its entry in VectorLoopValueMap
// for the given unroll part. If this entry is not a vector type (i.e., the
// vectorization factor is one), there is no need to generate an
// extractelement instruction.
- auto *U = getVectorValue(V)[Part];
+ auto *U = getOrCreateVectorValue(V, Part);
if (!U->getType()->isVectorTy()) {
assert(VF == 1 && "Value not scalarized has non-vector type");
return U;
@@ -2551,102 +2807,6 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
-// Get a mask to interleave \p NumVec vectors into a wide vector.
-// I.e. <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
-// E.g. For 2 interleaved vectors, if VF is 4, the mask is:
-// <0, 4, 1, 5, 2, 6, 3, 7>
-static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
- unsigned NumVec) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < VF; i++)
- for (unsigned j = 0; j < NumVec; j++)
- Mask.push_back(Builder.getInt32(j * VF + i));
-
- return ConstantVector::get(Mask);
-}
-
-// Get the strided mask starting from index \p Start.
-// I.e. <Start, Start + Stride, ..., Start + Stride*(VF-1)>
-static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
- unsigned Stride, unsigned VF) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < VF; i++)
- Mask.push_back(Builder.getInt32(Start + i * Stride));
-
- return ConstantVector::get(Mask);
-}
-
-// Get a mask of two parts: The first part consists of sequential integers
-// starting from 0, The second part consists of UNDEFs.
-// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
- unsigned NumUndef) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumInt; i++)
- Mask.push_back(Builder.getInt32(i));
-
- Constant *Undef = UndefValue::get(Builder.getInt32Ty());
- for (unsigned i = 0; i < NumUndef; i++)
- Mask.push_back(Undef);
-
- return ConstantVector::get(Mask);
-}
-
-// Concatenate two vectors with the same element type. The 2nd vector should
-// not have more elements than the 1st vector. If the 2nd vector has less
-// elements, extend it with UNDEFs.
-static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
- Value *V2) {
- VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
- VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
- assert(VecTy1 && VecTy2 &&
- VecTy1->getScalarType() == VecTy2->getScalarType() &&
- "Expect two vectors with the same element type");
-
- unsigned NumElts1 = VecTy1->getNumElements();
- unsigned NumElts2 = VecTy2->getNumElements();
- assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
-
- if (NumElts1 > NumElts2) {
- // Extend with UNDEFs.
- Constant *ExtMask =
- getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
- V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
- }
-
- Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
- return Builder.CreateShuffleVector(V1, V2, Mask);
-}
-
-// Concatenate vectors in the given list. All vectors have the same type.
-static Value *ConcatenateVectors(IRBuilder<> &Builder,
- ArrayRef<Value *> InputList) {
- unsigned NumVec = InputList.size();
- assert(NumVec > 1 && "Should be at least two vectors");
-
- SmallVector<Value *, 8> ResList;
- ResList.append(InputList.begin(), InputList.end());
- do {
- SmallVector<Value *, 8> TmpList;
- for (unsigned i = 0; i < NumVec - 1; i += 2) {
- Value *V0 = ResList[i], *V1 = ResList[i + 1];
- assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
- "Only the last vector may have a different type");
-
- TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
- }
-
- // Push the last vector if the total number of vectors is odd.
- if (NumVec % 2 != 0)
- TmpList.push_back(ResList[NumVec - 1]);
-
- ResList = TmpList;
- NumVec = ResList.size();
- } while (NumVec > 1);
-
- return ResList[0];
-}
-
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
@@ -2683,15 +2843,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (Instr != Group->getInsertPos())
return;
- LoadInst *LI = dyn_cast<LoadInst>(Instr);
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
Value *Ptr = getPointerOperand(Instr);
// Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
- Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+ Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
// Prepare for the new pointers.
setDebugLocFromInst(Builder, Ptr);
@@ -2708,7 +2866,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Index += (VF - 1) * Group->getFactor();
for (unsigned Part = 0; Part < UF; Part++) {
- Value *NewPtr = getScalarValue(Ptr, Part, 0);
+ Value *NewPtr = getOrCreateScalarValue(Ptr, Part, 0);
// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
@@ -2731,7 +2889,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Value *UndefVec = UndefValue::get(VecTy);
// Vectorize the interleaved load group.
- if (LI) {
+ if (isa<LoadInst>(Instr)) {
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
@@ -2751,8 +2909,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
if (!Member)
continue;
- VectorParts Entry(UF);
- Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF);
+ Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
NewLoads[Part], UndefVec, StrideMask, "strided.vec");
@@ -2763,10 +2920,11 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy);
}
- Entry[Part] =
- Group->isReverse() ? reverseVector(StridedVec) : StridedVec;
+ if (Group->isReverse())
+ StridedVec = reverseVector(StridedVec);
+
+ VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
}
- VectorLoopValueMap.initVector(Member, Entry);
}
return;
}
@@ -2783,8 +2941,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
Instruction *Member = Group->getMember(i);
assert(Member && "Fail to get a member from an interleaved store group");
- Value *StoredVec =
- getVectorValue(cast<StoreInst>(Member)->getValueOperand())[Part];
+ Value *StoredVec = getOrCreateVectorValue(
+ cast<StoreInst>(Member)->getValueOperand(), Part);
if (Group->isReverse())
StoredVec = reverseVector(StoredVec);
@@ -2796,10 +2954,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
}
// Concatenate all vectors into a wide vector.
- Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
+ Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
- Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
+ Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
"interleaved.vec");
@@ -2816,104 +2974,43 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert((LI || SI) && "Invalid Load/Store instruction");
- // Try to vectorize the interleave group if this access is interleaved.
- if (Legal->isAccessInterleaved(Instr))
+ LoopVectorizationCostModel::InstWidening Decision =
+ Cost->getWideningDecision(Instr, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
return vectorizeInterleaveGroup(Instr);
- Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *ScalarDataTy = getMemInstValueType(Instr);
Type *DataTy = VectorType::get(ScalarDataTy, VF);
Value *Ptr = getPointerOperand(Instr);
- unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned Alignment = getMemInstAlignment(Instr);
// An alignment of 0 means target ABI alignment. We need to use the scalar's
// target ABI alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ unsigned AddressSpace = getMemInstAddressSpace(Instr);
// Scalarize the memory instruction if necessary.
- if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
+ if (Decision == LoopVectorizationCostModel::CM_Scalarize)
return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
-
- // Determine if either a gather or scatter operation is legal.
bool CreateGatherScatter =
- !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
+ (Decision == LoopVectorizationCostModel::CM_GatherScatter);
- VectorParts VectorGep;
+ // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
+ // gather/scatter. Otherwise, Decision should have been CM_Scalarize.
+ assert((ConsecutiveStride || CreateGatherScatter) &&
+ "The instruction should be scalarized");
// Handle consecutive loads/stores.
- GetElementPtrInst *Gep = getGEPInstruction(Ptr);
- if (ConsecutiveStride) {
- if (Gep) {
- unsigned NumOperands = Gep->getNumOperands();
-#ifndef NDEBUG
- // The original GEP that identified as a consecutive memory access
- // should have only one loop-variant operand.
- unsigned NumOfLoopVariantOps = 0;
- for (unsigned i = 0; i < NumOperands; ++i)
- if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
- OrigLoop))
- NumOfLoopVariantOps++;
- assert(NumOfLoopVariantOps == 1 &&
- "Consecutive GEP should have only one loop-variant operand");
-#endif
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setName("gep.indvar");
-
- // A new GEP is created for a 0-lane value of the first unroll iteration.
- // The GEPs for the rest of the unroll iterations are computed below as an
- // offset from this GEP.
- for (unsigned i = 0; i < NumOperands; ++i)
- // We can apply getScalarValue() for all GEP indices. It returns an
- // original value for loop-invariant operand and 0-lane for consecutive
- // operand.
- Gep2->setOperand(i, getScalarValue(Gep->getOperand(i),
- 0, /* First unroll iteration */
- 0 /* 0-lane of the vector */ ));
- setDebugLocFromInst(Builder, Gep);
- Ptr = Builder.Insert(Gep2);
-
- } else { // No GEP
- setDebugLocFromInst(Builder, Ptr);
- Ptr = getScalarValue(Ptr, 0, 0);
- }
- } else {
- // At this point we should vector version of GEP for Gather or Scatter
- assert(CreateGatherScatter && "The instruction should be scalarized");
- if (Gep) {
- // Vectorizing GEP, across UF parts. We want to get a vector value for base
- // and each index that's defined inside the loop, even if it is
- // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
- // scalar.
- SmallVector<VectorParts, 4> OpsV;
- for (Value *Op : Gep->operands()) {
- Instruction *SrcInst = dyn_cast<Instruction>(Op);
- if (SrcInst && OrigLoop->contains(SrcInst))
- OpsV.push_back(getVectorValue(Op));
- else
- OpsV.push_back(VectorParts(UF, Op));
- }
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Ops;
- Value *GEPBasePtr = OpsV[0][Part];
- for (unsigned i = 1; i < Gep->getNumOperands(); i++)
- Ops.push_back(OpsV[i][Part]);
- Value *NewGep = Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
- cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
- assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
-
- NewGep =
- Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
- VectorGep.push_back(NewGep);
- }
- } else
- VectorGep = getVectorValue(Ptr);
- }
+ if (ConsecutiveStride)
+ Ptr = getOrCreateScalarValue(Ptr, 0, 0);
VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
@@ -2921,16 +3018,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert(!Legal->isUniform(SI->getPointerOperand()) &&
"We do not allow storing to uniform addresses");
setDebugLocFromInst(Builder, SI);
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't use a reference type for "StoredVal".
- VectorParts StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewSI = nullptr;
+ Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
- NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
- Alignment, MaskPart);
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
@@ -2939,7 +3035,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
// to reverse the order of elements in the stored value.
- StoredVal[Part] = reverseVector(StoredVal[Part]);
+ StoredVal = reverseVector(StoredVal);
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+
// If the address is consecutive but reversed, then the
// wide store needs to start at the last vector element.
PartPtr =
@@ -2953,11 +3052,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
if (Legal->isMaskRequired(SI))
- NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
Mask[Part]);
else
- NewSI =
- Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
}
addMetadata(NewSI, SI);
}
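
The reversed-access pointer arithmetic used for PartPtr above can be summarized as follows; this is a sketch assuming B is the active IRBuilder and Ptr the scalar lane-0 pointer:

    // A forward part covers elements [Part * VF, Part * VF + VF).
    Value *FwdPtr = B.CreateGEP(Ptr, B.getInt32(Part * VF));
    // A reversed part mirrors that range: step back Part * VF elements,
    // then VF - 1 more so the wide access begins at the range's last
    // element; the vector itself is reversed separately (reverseVector).
    Value *RevPtr = B.CreateGEP(Ptr, B.getInt32(-(int)(Part * VF)));
    RevPtr = B.CreateGEP(RevPtr, B.getInt32(1 - (int)VF));
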
@@ -2967,14 +3065,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
// Handle loads.
assert(LI && "Must have a load instruction");
setDebugLocFromInst(Builder, LI);
- VectorParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Instruction *NewLI;
+ Value *NewLI;
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
- NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment, MaskPart,
- 0, "wide.masked.gather");
- Entry[Part] = NewLI;
+ Value *VectorGep = getOrCreateVectorValue(Ptr, Part);
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
+ nullptr, "wide.masked.gather");
+ addMetadata(NewLI, LI);
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
@@ -2996,11 +3094,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
"wide.masked.load");
else
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
- Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
+
+ // Add metadata to the load, but set the vector value to the reversed shuffle.
+ addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = reverseVector(NewLI);
}
- addMetadata(NewLI, LI);
+ VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
}
- VectorLoopValueMap.initVector(Instr, Entry);
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
@@ -3017,9 +3118,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Does this instruction return a value?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- // Initialize a new scalar map entry.
- ScalarParts Entry(UF);
-
VectorParts Cond;
if (IfPredicateInstr)
Cond = createBlockInMask(Instr->getParent());
@@ -3027,18 +3125,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
+ unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(VF);
// For each scalar that we create:
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
// Start if-block.
Value *Cmp = nullptr;
if (IfPredicateInstr) {
- Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane));
+ Cmp = Cond[Part];
+ if (Cmp->getType()->isVectorTy())
+ Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
ConstantInt::get(Cmp->getType(), 1));
}
@@ -3050,7 +3149,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getScalarValue(Instr->getOperand(op), Part, Lane);
+ auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Part, Lane);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -3059,7 +3158,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
Builder.Insert(Cloned);
// Add the cloned scalar to the scalar map entry.
- Entry[Part][Lane] = Cloned;
+ VectorLoopValueMap.setScalarValue(Instr, Part, Lane, Cloned);
// If we just cloned a new assumption, add it the assumption cache.
if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
@@ -3071,7 +3170,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
}
}
- VectorLoopValueMap.initScalar(Instr, Entry);
}
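
The lane count chosen above directly bounds code growth; a small worked example with hypothetical values:

    unsigned VF = 4, UF = 2;
    // True for, e.g., an address computation that is identical across
    // lanes (stand-in for Cost->isUniformAfterVectorization(I, VF)).
    bool Uniform = true;
    unsigned Lanes = Uniform ? 1 : VF; // 1 instead of 4
    unsigned Clones = UF * Lanes;      // 2 scalar clones instead of 8
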
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
@@ -3189,37 +3287,16 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
BasicBlock *BB = L->getLoopPreheader();
IRBuilder<> Builder(BB->getTerminator());
- // Generate code to check that the loop's trip count that we computed by
- // adding one to the backedge-taken count will not overflow.
- Value *CheckMinIters = Builder.CreateICmpULT(
- Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
-
- BasicBlock *NewBB =
- BB->splitBasicBlock(BB->getTerminator(), "min.iters.checked");
- // Update dominator tree immediately if the generated block is a
- // LoopBypassBlock because SCEV expansions to generate loop bypass
- // checks may query it before the current function is finished.
- DT->addNewBlock(NewBB, BB);
- if (L->getParentLoop())
- L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
- ReplaceInstWithInst(BB->getTerminator(),
- BranchInst::Create(Bypass, NewBB, CheckMinIters));
- LoopBypassBlocks.push_back(BB);
-}
-
-void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
- BasicBlock *Bypass) {
- Value *TC = getOrCreateVectorTripCount(L);
- BasicBlock *BB = L->getLoopPreheader();
- IRBuilder<> Builder(BB->getTerminator());
-
- // Now, compare the new count to zero. If it is zero skip the vector loop and
- // jump to the scalar loop.
- Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()),
- "cmp.zero");
+ // Generate code to check whether the loop's trip count is less than VF * UF,
+ // or equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed, leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ auto P = Legal->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
+ Value *CheckMinIters = Builder.CreateICmp(
+ P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check");
- // Generate code to check that the loop's trip count that we computed by
- // adding one to the backedge-taken count will not overflow.
BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph");
// Update dominator tree immediately if the generated block is a
// LoopBypassBlock because SCEV expansions to generate loop bypass
@@ -3228,7 +3305,7 @@ void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L,
if (L->getParentLoop())
L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI);
ReplaceInstWithInst(BB->getTerminator(),
- BranchInst::Create(Bypass, NewBB, Cmp));
+ BranchInst::Create(Bypass, NewBB, CheckMinIters));
LoopBypassBlocks.push_back(BB);
}
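
To see why a single unsigned compare suffices, consider hypothetical values VF = 4, UF = 2 (the vector body consumes 8 iterations per trip):

    // Count = trip count = backedge-taken count + 1 (wraps to 0 on overflow).
    //   Count == 7: ULT and ULE both bypass: too few iterations.
    //   Count == 8, epilogue required: ULE bypasses, since running the
    //     vector loop would leave nothing for the mandatory scalar epilogue.
    //   Count == 0 (overflow): the unsigned compare bypasses as well.
    bool NeedsEpilogue = true; // stand-in for Legal->requiresScalarEpilogue()
    auto Pred = NeedsEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
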
@@ -3296,7 +3373,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
LVer->prepareNoAliasMetadata();
}
-void InnerLoopVectorizer::createEmptyLoop() {
+void InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3346,7 +3423,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// - counts from zero, stepping by one
// - is the size of the widest induction variable type
// then we create a new one.
- OldInduction = Legal->getInduction();
+ OldInduction = Legal->getPrimaryInduction();
Type *IdxTy = Legal->getWidestInductionType();
// Split the single block loop into the two loop structure described above.
@@ -3377,14 +3454,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
Value *StartIdx = ConstantInt::get(IdxTy, 0);
- // We need to test whether the backedge-taken count is uint##_max. Adding one
- // to it will cause overflow and an incorrect loop trip count in the vector
- // body. In case of overflow we want to directly jump to the scalar remainder
- // loop.
- emitMinimumIterationCountCheck(Lp, ScalarPH);
// Now, compare the new count to zero. If it is zero, skip the vector loop and
- // jump to the scalar loop.
- emitVectorLoopEnteredCheck(Lp, ScalarPH);
+ // jump to the scalar loop. This check also covers the case where the
+ // backedge-taken count is uint##_max: adding one to it will overflow leading
+ // to an incorrect trip count of zero. In this (rare) case we will also jump
+ // to the scalar loop.
+ emitMinimumIterationCountCheck(Lp, ScalarPH);
+
// Generate the code to check any assumptions that we've made for SCEV
// expressions.
emitSCEVChecks(Lp, ScalarPH);
@@ -3427,7 +3503,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// We know what the end value is.
EndValue = CountRoundDown;
} else {
- IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
+ IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
Type *StepType = II.getStep()->getType();
Instruction::CastOps CastOp =
CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
@@ -3521,8 +3597,12 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
IRBuilder<> B(MiddleBlock->getTerminator());
Value *CountMinusOne = B.CreateSub(
CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
- Value *CMO = B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType(),
- "cast.cmo");
+ Value *CMO =
+ !II.getStep()->getType()->isIntegerTy()
+ ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
+ II.getStep()->getType())
+ : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
+ CMO->setName("cast.cmo");
Value *Escape = II.transform(B, CMO, PSE.getSE(), DL);
Escape->setName("ind.escape");
MissingVals[UI] = Escape;
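
The SIToFP branch matters for floating-point inductions, where the old sign-extend-or-truncate of the count would be ill-typed. A numeric sketch under assumed values:

    float Start = 1.0f, Step = 0.5f;         // induction start/step (assumed)
    int CountRoundDown = 8;                  // vector trip count (assumed)
    float CMO = (float)(CountRoundDown - 1); // the SIToFP cast above
    float Escape = Start + Step * CMO;       // II.transform(): 4.5f
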
@@ -3543,7 +3623,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
namespace {
struct CSEDenseMapInfo {
- static bool canHandle(Instruction *I) {
+ static bool canHandle(const Instruction *I) {
return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
}
@@ -3553,12 +3633,12 @@ struct CSEDenseMapInfo {
static inline Instruction *getTombstoneKey() {
return DenseMapInfo<Instruction *>::getTombstoneKey();
}
- static unsigned getHashValue(Instruction *I) {
+ static unsigned getHashValue(const Instruction *I) {
assert(canHandle(I) && "Unknown instruction!");
return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
I->value_op_end()));
}
- static bool isEqual(Instruction *LHS, Instruction *RHS) {
+ static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
LHS == getTombstoneKey() || RHS == getTombstoneKey())
return LHS == RHS;
@@ -3589,51 +3669,6 @@ static void cse(BasicBlock *BB) {
}
}
-/// \brief Adds a 'fast' flag to floating point operations.
-static Value *addFastMathFlag(Value *V) {
- if (isa<FPMathOperator>(V)) {
- FastMathFlags Flags;
- Flags.setUnsafeAlgebra();
- cast<Instruction>(V)->setFastMathFlags(Flags);
- }
- return V;
-}
-
-/// \brief Estimate the overhead of scalarizing a value based on its type.
-/// Insert and Extract are set if the result needs to be inserted and/or
-/// extracted from vectors.
-static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
- const TargetTransformInfo &TTI) {
- if (Ty->isVoidTy())
- return 0;
-
- assert(Ty->isVectorTy() && "Can only scalarize vectors");
- unsigned Cost = 0;
-
- for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
- if (Extract)
- Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
- if (Insert)
- Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
- }
-
- return Cost;
-}
-
-/// \brief Estimate the overhead of scalarizing an Instruction based on the
-/// types of its operands and return value.
-static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
- Type *RetTy,
- const TargetTransformInfo &TTI) {
- unsigned ScalarizationCost =
- getScalarizationOverhead(RetTy, true, false, TTI);
-
- for (Type *Ty : OpTys)
- ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
-
- return ScalarizationCost;
-}
-
/// \brief Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
@@ -3641,14 +3676,24 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
if (VF == 1)
return 0;
+ unsigned Cost = 0;
Type *RetTy = ToVectorTy(I->getType(), VF);
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()))
+ Cost += TTI.getScalarizationOverhead(RetTy, true, false);
- SmallVector<Type *, 4> OpTys;
- unsigned OperandsNum = I->getNumOperands();
- for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
- OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ SmallVector<const Value *, 4> Operands(CI->arg_operands());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+ }
+ else if (!isa<StoreInst>(I) ||
+ !TTI.supportsEfficientVectorElementLoadStore()) {
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
+ }
- return getScalarizationOverhead(OpTys, RetTy, TTI);
+ return Cost;
}
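
As a rough illustration of the overhead computed above, consider a scalarized load of <4 x i32> on a target without supportsEfficientVectorElementLoadStore(), with hypothetical unit insert/extract costs:

    unsigned VF = 4;
    unsigned InsertCost = 1, ExtractCost = 1; // assumed per-element TTI costs
    // Packing the 4 scalar results back into the vector return value:
    unsigned ResultCost = VF * InsertCost;    // 4
    // Unpacking the vectorized address operand into 4 scalar addresses:
    unsigned OperandCost = VF * ExtractCost;  // 4
    unsigned Overhead = ResultCost + OperandCost; // 8
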
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
@@ -3681,7 +3726,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
+ unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
@@ -3709,16 +3754,12 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
- return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+ SmallVector<Value *, 4> Operands(CI->arg_operands());
+ return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -3742,10 +3783,10 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from VectorLoopValueMap indicates that it
// wasn't vectorized.
- if (!VectorLoopValueMap.hasVector(KV.first))
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
continue;
- VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
- for (Value *&I : Parts) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
continue;
Type *OriginalTy = I->getType();
@@ -3770,7 +3811,11 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
if (auto *BO = dyn_cast<BinaryOperator>(I)) {
NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
ShrinkOperand(BO->getOperand(1)));
- cast<BinaryOperator>(NewI)->copyIRFlags(I);
+
+ // Any wrapping introduced by shrinking this operation shouldn't be
+ // considered undefined behavior. So, we can't unconditionally copy
+ // arithmetic wrapping flags to NewI.
+ cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
} else if (auto *CI = dyn_cast<ICmpInst>(I)) {
NewI =
B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
@@ -3830,7 +3875,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
I->replaceAllUsesWith(Res);
cast<Instruction>(I)->eraseFromParent();
Erased.insert(I);
- I = Res;
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
}
}
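
Dropping the wrap flags is easiest to justify with concrete values: an add that cannot wrap at the original width may wrap at the minimal width, so copying nsw/nuw would turn a well-defined result into poison. A plain C++ illustration (assumed i32 shrunk to i8):

    #include <cstdint>
    int main() {
      int32_t Wide = 120 + 10;                       // 130: no signed wrap at i32
      int8_t Narrow = static_cast<int8_t>(120 + 10); // wraps to -126 at i8
      // Keeping `nsw` on the shrunk i8 add would make it poison here, even
      // though the original i32 add legitimately carried nsw.
      return Wide == Narrow ? 1 : 0;                 // 0: the widths disagree
    }
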
@@ -3839,277 +3884,31 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// If the value wasn't vectorized, we must maintain the original scalar
// type. The absence of the value from VectorLoopValueMap indicates that it
// wasn't vectorized.
- if (!VectorLoopValueMap.hasVector(KV.first))
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
continue;
- VectorParts &Parts = VectorLoopValueMap.getVector(KV.first);
- for (Value *&I : Parts) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
ZExtInst *Inst = dyn_cast<ZExtInst>(I);
if (Inst && Inst->use_empty()) {
Value *NewI = Inst->getOperand(0);
Inst->eraseFromParent();
- I = NewI;
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
}
}
}
}
-void InnerLoopVectorizer::vectorizeLoop() {
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that go
- // into the code below should be also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
- Constant *Zero = Builder.getInt32(0);
-
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. First,
- // we create a new vector PHI node with no incoming edges. We use this value
- // when we vectorize all of the instructions that use the PHI. Next, after
- // all of the instructions in the block are complete we add the new incoming
- // edges to the PHI. At this point all of the instructions in the basic block
- // are vectorized, so we can use them to construct the PHI.
- PhiVector PHIsToFix;
-
- // Collect instructions from the original loop that will become trivially
- // dead in the vectorized loop. We don't need to vectorize these
- // instructions.
- collectTriviallyDeadInstructions();
-
- // Scan the loop in a topological order to ensure that defs are vectorized
- // before users.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- // Vectorize all of the blocks in the original loop.
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
- vectorizeBlockInLoop(BB, &PHIsToFix);
-
+void InnerLoopVectorizer::fixVectorizedLoop() {
// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
truncateToMinimalBitwidths();
// At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
+ // vector form. Now we need to fix the recurrences in the loop. These PHI
// nodes are currently empty because we did not want to introduce cycles.
// This is the second stage of vectorizing recurrences.
- for (PHINode *Phi : PHIsToFix) {
- assert(Phi && "Unable to recover vectorized PHI");
-
- // Handle first-order recurrences that need to be fixed.
- if (Legal->isFirstOrderRecurrence(Phi)) {
- fixFirstOrderRecurrence(Phi);
- continue;
- }
-
- // If the phi node is not a first-order recurrence, it must be a reduction.
- // Get it's reduction variable descriptor.
- assert(Legal->isReductionVariable(Phi) &&
- "Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
-
- RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
- RdxDesc.getMinMaxRecurrenceKind();
- setDebugLocFromInst(Builder, ReductionStartValue);
-
- // We need to generate a reduction vector from the incoming scalar.
- // To do so, we need to generate the 'identity' vector and override
- // one of the elements with the incoming scalar reduction. We need
- // to do it in the vector-loop preheader.
- Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
-
- // This is the vector-clone of the value that leaves the loop.
- const VectorParts &VectorExit = getVectorValue(LoopExitInst);
- Type *VecTy = VectorExit[0]->getType();
-
- // Find the reduction identity variable. Zero for addition, or, xor,
- // one for multiplication, -1 for And.
- Value *Identity;
- Value *VectorStart;
- if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
- RK == RecurrenceDescriptor::RK_FloatMinMax) {
- // MinMax reduction have the start value as their identify.
- if (VF == 1) {
- VectorStart = Identity = ReductionStartValue;
- } else {
- VectorStart = Identity =
- Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
- }
- } else {
- // Handle other reduction kinds:
- Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
- RK, VecTy->getScalarType());
- if (VF == 1) {
- Identity = Iden;
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart = ReductionStartValue;
- } else {
- Identity = ConstantVector::getSplat(VF, Iden);
-
- // This vector is the Identity vector where the first element is the
- // incoming scalar reduction.
- VectorStart =
- Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
- }
- }
-
- // Fix the vector-loop phi.
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- const VectorParts &VecRdxPhi = getVectorValue(Phi);
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
- const VectorParts &Val = getVectorValue(LoopVal);
- for (unsigned part = 0; part < UF; ++part) {
- // Make sure to add the reduction stat value only to the
- // first unroll part.
- Value *StartVal = (part == 0) ? VectorStart : Identity;
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(StartVal, LoopVectorPreHeader);
- cast<PHINode>(VecRdxPhi[part])
- ->addIncoming(Val[part], LoopVectorBody);
- }
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
-
- VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
- setDebugLocFromInst(Builder, LoopExitInst);
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
- if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(LoopVectorBody->getTerminator());
- for (unsigned part = 0; part < UF; ++part) {
- Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
- Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[part]->user_begin();
- UI != RdxParts[part]->user_end();)
- if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
- RdxParts[part] = Extnd;
- } else {
- ++UI;
- }
- }
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- for (unsigned part = 0; part < UF; ++part)
- RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = RdxParts[0];
- unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
- setDebugLocFromInst(Builder, ReducedPartRdx);
- for (unsigned part = 1; part < UF; ++part) {
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- ReducedPartRdx = addFastMathFlag(
- Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
- ReducedPartRdx, "bin.rdx"));
- else
- ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
- Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
- }
-
- if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
- ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (Phi->getType() != RdxDesc.getRecurrenceType())
- ReducedPartRdx =
- RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
- : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
- }
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
- // We know that the loop is in LCSSA form. We need to update the
- // PHI nodes in the exit blocks.
- for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
- LEE = LoopExitBlock->end();
- LEI != LEE; ++LEI) {
- PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
- if (!LCSSAPhi)
- break;
-
- // All PHINodes need to have a single entry edge, or two if
- // we already fixed them.
- assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
-
- // We found our reduction value exit-PHI. Update it with the
- // incoming bypass edge.
- if (LCSSAPhi->getIncomingValue(0) == LoopExitInst) {
- // Add an edge coming from the bypass.
- LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
- break;
- }
- } // end of the LCSSA phi scan.
-
- // Fix the scalar loop reduction variable with the incoming reduction sum
- // from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
- } // end of for each Phi in PHIsToFix.
+ fixCrossIterationPHIs();
// Update the dominator tree.
//
@@ -4134,6 +3933,25 @@ void InnerLoopVectorizer::vectorizeLoop() {
cse(LoopVectorBody);
}
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #2: We now need to fix the recurrences by adding incoming edges to
+ // the currently empty PHI nodes. At this point every instruction in the
+ // original loop is widened to a vector form so we can use them to construct
+ // the incoming edges.
+ for (Instruction &I : *OrigLoop->getHeader()) {
+ PHINode *Phi = dyn_cast<PHINode>(&I);
+ if (!Phi)
+ break;
+ // Handle first-order recurrences and reductions that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(Phi))
+ fixFirstOrderRecurrence(Phi);
+ else if (Legal->isReductionVariable(Phi))
+ fixReduction(Phi);
+ }
+}
+
void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// This is the second phase of vectorizing first-order recurrences. An
@@ -4204,23 +4022,29 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// We constructed a temporary phi node in the first phase of vectorization.
// This phi node will eventually be deleted.
- VectorParts &PhiParts = VectorLoopValueMap.getVector(Phi);
- Builder.SetInsertPoint(cast<Instruction>(PhiParts[0]));
+ Builder.SetInsertPoint(
+ cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
// Create a phi node for the new recurrence. The current value will either be
// the initial value inserted into a vector or loop-varying vector value.
auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
- // Get the vectorized previous value. We ensured the previous values was an
- // instruction when detecting the recurrence.
- auto &PreviousParts = getVectorValue(Previous);
-
- // Set the insertion point to be after this instruction. We ensured the
- // previous value dominated all uses of the phi when detecting the
- // recurrence.
- Builder.SetInsertPoint(
- &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
+ // Get the vectorized previous value of the last part, UF - 1. It appears last
+ // among all unrolled iterations, due to the order of their construction.
+ Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
+
+ // Set the insertion point after the previous value if it is an instruction.
+ // Note that the previous value may have been constant-folded, so it is not
+ // guaranteed to be an instruction in the vector loop. Also, if the previous
+ // value is a phi node, we should insert after all the phi nodes to avoid
+ // breaking basic block verification.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) ||
+ isa<PHINode>(PreviousLastPart))
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+ else
+ Builder.SetInsertPoint(
+ &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart)));
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
@@ -4235,15 +4059,16 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Shuffle the current and previous vector and update the vector parts.
for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
+ Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
auto *Shuffle =
- VF > 1
- ? Builder.CreateShuffleVector(Incoming, PreviousParts[Part],
- ConstantVector::get(ShuffleMask))
- : Incoming;
- PhiParts[Part]->replaceAllUsesWith(Shuffle);
- cast<Instruction>(PhiParts[Part])->eraseFromParent();
- PhiParts[Part] = Shuffle;
- Incoming = PreviousParts[Part];
+ VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
+ ConstantVector::get(ShuffleMask))
+ : Incoming;
+ PhiPart->replaceAllUsesWith(Shuffle);
+ cast<Instruction>(PhiPart)->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
+ Incoming = PreviousPart;
}
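
Concretely, for VF = 4 the mask built from the comment above is <3, 4, 5, 6>: it splices the last lane of the previous part in front of the current part's first three lanes. A sketch reusing the surrounding names:

    // Incoming     = <p0, p1, p2, p3>  (value of `Previous` one step back)
    // PreviousPart = <c0, c1, c2, c3>  (current part's value of `Previous`)
    Value *Splice = Builder.CreateShuffleVector(
        Incoming, PreviousPart, ConstantVector::get(ShuffleMask));
    // Splice == <p3, c0, c1, c2>: each lane is paired with its scalar
    // predecessor, which is exactly the first-order recurrence semantics.
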
// Fix the latch value of the new recurrence in the vector loop.
@@ -4251,18 +4076,33 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// Extract the last vector element in the middle block. This will be the
// initial value for the recurrence when jumping to the scalar loop.
- auto *Extract = Incoming;
+ auto *ExtractForScalar = Incoming;
if (VF > 1) {
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- Extract = Builder.CreateExtractElement(Extract, Builder.getInt32(VF - 1),
- "vector.recur.extract");
- }
+ ExtractForScalar = Builder.CreateExtractElement(
+ ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
+ }
+ // Extract the second-to-last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the LoopMiddleBlock,
+ // when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ if (VF > 1)
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
+ // When the loop is unrolled without being vectorized, initialize
+ // ExtractForPhiUsedOutsideLoop with the unroll-part value just prior to
+ // `Incoming`. This is analogous to the vectorized case above: extracting the
+ // second-to-last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? Extract : ScalarInit;
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
Start->addIncoming(Incoming, BB);
}
@@ -4279,43 +4119,200 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
if (!LCSSAPhi)
break;
if (LCSSAPhi->getIncomingValue(0) == Phi) {
- LCSSAPhi->addIncoming(Extract, LoopMiddleBlock);
+ LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
break;
}
}
}
-void InnerLoopVectorizer::fixLCSSAPHIs() {
- for (Instruction &LEI : *LoopExitBlock) {
- auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
- if (!LCSSAPhi)
- break;
- if (LCSSAPhi->getNumIncomingValues() == 1)
- LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
- LoopMiddleBlock);
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+ Constant *Zero = Builder.getInt32(0);
+
+ // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+
+ RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RdxDesc.getMinMaxRecurrenceKind();
+ setDebugLocFromInst(Builder, ReductionStartValue);
+
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and override
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+ // This is the vector-clone of the value that leaves the loop.
+ Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
+
+ // Find the reduction identity variable. Zero for addition, or, xor,
+ // one for multiplication, -1 for And.
+ Value *Identity;
+ Value *VectorStart;
+ if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
+ RK == RecurrenceDescriptor::RK_FloatMinMax) {
+ // MinMax reductions have the start value as their identity.
+ if (VF == 1) {
+ VectorStart = Identity = ReductionStartValue;
+ } else {
+ VectorStart = Identity =
+ Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
+ }
+ } else {
+ // Handle other reduction kinds:
+ Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+ RK, VecTy->getScalarType());
+ if (VF == 1) {
+ Identity = Iden;
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart = ReductionStartValue;
+ } else {
+ Identity = ConstantVector::getSplat(VF, Iden);
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ VectorStart =
+ Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
+ }
}
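
A minimal sketch of the non-min/max branch above, assuming an integer add reduction with VF = 4 and scalar start value S:

    Constant *Iden = Builder.getInt32(0);                   // identity of add
    Value *IdentityVec = ConstantVector::getSplat(4, Iden); // <0, 0, 0, 0>
    // Lane 0 carries the incoming scalar start so S is folded in exactly
    // once; the remaining UF - 1 parts start from the plain identity.
    Value *VectorStart = Builder.CreateInsertElement(
        IdentityVec, ReductionStartValue, Builder.getInt32(0)); // <S, 0, 0, 0>
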
-}
-void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
+ Value *Val = getOrCreateVectorValue(LoopVal, Part);
+ // Make sure to add the reduction start value only to the
+ // first unroll part.
+ Value *StartVal = (Part == 0) ? VectorStart : Identity;
+ cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
+ cast<PHINode>(VecRdxPhi)
+ ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- // We create new control-flow for the vectorized loop, so the original
- // condition will be dead after vectorization if it's only used by the
- // branch.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && Cmp->hasOneUse())
- DeadInstructions.insert(Cmp);
+ setDebugLocFromInst(Builder, LoopExitInst);
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
- if (all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
+ if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+ Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Builder.SetInsertPoint(LoopVectorBody->getTerminator());
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[Part]->user_begin();
+ UI != RdxParts[Part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ RdxParts[Part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
+ }
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
+ unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+ setDebugLocFromInst(Builder, ReducedPartRdx);
+ for (unsigned Part = 1; Part < UF; ++Part) {
+ Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
+ ReducedPartRdx, "bin.rdx"));
+ else
+ ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
+ Builder, MinMaxKind, ReducedPartRdx, RdxPart);
+ }
+
+ if (VF > 1) {
+ bool NoNaN = Legal->hasFunNoNaNAttr();
+ ReducedPartRdx =
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ }
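
createTargetReduction lets targets emit a native horizontal reduction; its generic shape is the same log2(VF) shuffle ladder the deleted inline code built. A hedged sketch for an integer add with VF = 4, assuming Builder and ReducedPartRdx as above:

    Value *Tmp = ReducedPartRdx;                     // <a, b, c, d>
    for (unsigned Width = 4; Width > 1; Width /= 2) {
      // Move the upper half of the live lanes down; the rest become undef.
      SmallVector<Constant *, 4> Mask(4, UndefValue::get(Builder.getInt32Ty()));
      for (unsigned J = 0; J != Width / 2; ++J)
        Mask[J] = Builder.getInt32(Width / 2 + J);
      Value *Shuf = Builder.CreateShuffleVector(
          Tmp, UndefValue::get(Tmp->getType()), ConstantVector::get(Mask));
      Tmp = Builder.CreateAdd(Tmp, Shuf, "bin.rdx"); // <a+c, b+d, ...>
    }
    Value *Scalar = Builder.CreateExtractElement(Tmp, Builder.getInt32(0));
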
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+ // We know that the loop is in LCSSA form. We need to update the
+ // PHI nodes in the exit blocks.
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end();
+ LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi)
+ break;
+
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
+ assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+ // We found a reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
+ LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+ } // end of the LCSSA phi scan.
+
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx =
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
+ for (Instruction &LEI : *LoopExitBlock) {
+ auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
+ if (!LCSSAPhi)
+ break;
+ if (LCSSAPhi->getNumIncomingValues() == 1) {
+ assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) &&
+ "Incoming value isn't loop invariant");
+ LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock);
+ }
}
}
@@ -4464,14 +4461,15 @@ void InnerLoopVectorizer::predicateInstructions() {
for (auto KV : PredicatedInstructions) {
BasicBlock::iterator I(KV.first);
BasicBlock *Head = I->getParent();
- auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);
auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
/*BranchWeights=*/nullptr, DT, LI);
I->moveBefore(T);
sinkScalarOperands(&*I);
- I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
- BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");
+ BasicBlock *PredicatedBlock = I->getParent();
+ Twine BBNamePrefix = Twine("pred.") + I->getOpcodeName();
+ PredicatedBlock->setName(BBNamePrefix + ".if");
+ PredicatedBlock->getSingleSuccessor()->setName(BBNamePrefix + ".continue");
// If the instruction is non-void create a Phi node at reconvergence point.
if (!I->getType()->isVoidTy()) {
@@ -4510,8 +4508,8 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
// Look for cached value.
std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
- if (ECEntryIt != MaskCache.end())
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
return ECEntryIt->second;
VectorParts SrcMask = createBlockInMask(Src);
@@ -4521,20 +4519,22 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
assert(BI && "Unexpected terminator found");
if (BI->isConditional()) {
- VectorParts EdgeMask = getVectorValue(BI->getCondition());
- if (BI->getSuccessor(0) != Dst)
- for (unsigned part = 0; part < UF; ++part)
- EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
+ VectorParts EdgeMask(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
- for (unsigned part = 0; part < UF; ++part)
- EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
+ EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
+ EdgeMask[Part] = EdgeMaskPart;
+ }
- MaskCache[Edge] = EdgeMask;
+ EdgeMaskCache[Edge] = EdgeMask;
return EdgeMask;
}
- MaskCache[Edge] = SrcMask;
+ EdgeMaskCache[Edge] = SrcMask;
return SrcMask;
}
@@ -4542,41 +4542,54 @@ InnerLoopVectorizer::VectorParts
InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ VectorParts BlockMask(UF);
+
// Loop incoming mask is all-one.
if (OrigLoop->getHeader() == BB) {
Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
- return getVectorValue(C);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = getOrCreateVectorValue(C, Part);
+ BlockMaskCache[BB] = BlockMask;
+ return BlockMask;
}
// This is the block mask. We OR the masks of all incoming edges, starting from zero.
Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
- VectorParts BlockMask = getVectorValue(Zero);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = getOrCreateVectorValue(Zero, Part);
// For each pred:
- for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
- VectorParts EM = createEdgeMask(*it, BB);
- for (unsigned part = 0; part < UF; ++part)
- BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
+ for (pred_iterator It = pred_begin(BB), E = pred_end(BB); It != E; ++It) {
+ VectorParts EM = createEdgeMask(*It, BB);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EM[Part]);
}
+ BlockMaskCache[BB] = BlockMask;
return BlockMask;
}
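
The mask algebra here is simply AND along edges and OR at joins. A scalar-shaped sketch (real masks are one vector per unroll part) for an if-then-else diamond in the loop body, with Cond standing for the widened branch condition:

    Value *HeaderMask = Builder.getTrue();                 // loop-incoming: all-one
    Value *ThenMask = Builder.CreateAnd(HeaderMask, Cond); // edge header -> then
    Value *ElseMask =
        Builder.CreateAnd(HeaderMask, Builder.CreateNot(Cond)); // header -> else
    // The merge block ORs its incoming edge masks (seeded with zero above):
    Value *MergeMask = Builder.CreateOr(
        Builder.CreateOr(Builder.getFalse(), ThenMask), ElseMask); // == HeaderMask
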
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
- unsigned VF, PhiVector *PV) {
+ unsigned VF) {
PHINode *P = cast<PHINode>(PN);
- // Handle recurrences.
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
- VectorParts Entry(UF);
- for (unsigned part = 0; part < UF; ++part) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.
Type *VecTy =
(VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
- Entry[part] = PHINode::Create(
+ Value *EntryPart = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
+ VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
}
- VectorLoopValueMap.initVector(P, Entry);
- PV->push_back(P);
return;
}
@@ -4600,21 +4613,22 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
for (unsigned In = 0; In < NumIncoming; In++) {
VectorParts Cond =
createEdgeMask(P->getIncomingBlock(In), P->getParent());
- const VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
- for (unsigned part = 0; part < UF; ++part) {
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
// We might have single edge PHIs (blocks) - use an identity
// 'select' for the first PHI operand.
if (In == 0)
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In0[part]);
+ Entry[Part] = Builder.CreateSelect(Cond[Part], In0, In0);
else
// Select between the current value and the previous incoming edge
// based on the incoming mask.
- Entry[part] = Builder.CreateSelect(Cond[part], In0[part], Entry[part],
+ Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
"predphi");
}
}
- VectorLoopValueMap.initVector(P, Entry);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
return;
}
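
The select chain above blends incoming values lane-wise under the edge masks. For a two-predecessor PHI with values V0 (from B0) and V1 (from B1), each unroll part reduces to the following, with M0/M1 standing for the computed edge masks:

    Value *Blend = Builder.CreateSelect(M0, V0, V0);        // identity select, In == 0
    Blend = Builder.CreateSelect(M1, V1, Blend, "predphi"); // later edges fold in
    // Since the block masks partition the lanes, each lane receives the
    // value from the one predecessor whose edge mask is set for it.
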
@@ -4631,7 +4645,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
case InductionDescriptor::IK_NoInduction:
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:
- return widenIntInduction(P);
+ case InductionDescriptor::IK_FpInduction:
+ return widenIntOrFpInduction(P);
case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4641,45 +4656,18 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
- unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
+ unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
- ScalarParts Entry(UF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(VF);
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep = II.transform(Builder, GlobalIdx, PSE.getSE(), DL);
SclrGep->setName("next.gep");
- Entry[Part][Lane] = SclrGep;
+ VectorLoopValueMap.setScalarValue(P, Part, Lane, SclrGep);
}
}
- VectorLoopValueMap.initScalar(P, Entry);
- return;
- }
- case InductionDescriptor::IK_FpInduction: {
- assert(P->getType() == II.getStartValue()->getType() &&
- "Types must match");
- // Handle other induction variables that are now based on the
- // canonical one.
- assert(P != OldInduction && "Primary induction can be integer only");
-
- Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
- V = II.transform(Builder, V, PSE.getSE(), DL);
- V->setName("fp.offset.idx");
-
- // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
-
- Value *Broadcasted = getBroadcastInstrs(V);
- // After broadcasting the induction variable we need to make the vector
- // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
- Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
- VectorParts Entry(UF);
- for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
- II.getInductionOpcode());
- VectorLoopValueMap.initVector(P, Entry);
return;
}
}
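In the pointer-induction case above, lane Lane of unroll part Part offsets the canonical induction by Lane + Part * VF. A throwaway program with made-up UF and VF prints the offsets the "next.gep" scalars would use:

#include <cstdio>

int main() {
  const unsigned UF = 2, VF = 4; // invented unroll and vector factors
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf("part %u, lane %u -> offset %u\n", Part, Lane,
                  Lane + Part * VF);
}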
@@ -4703,269 +4691,317 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
- // For each instruction in the old loop.
- for (Instruction &I : *BB) {
+void InnerLoopVectorizer::vectorizeInstruction(Instruction &I) {
+ // Scalarize instructions that should remain scalar after vectorization.
+ if (VF > 1 &&
+ !(isa<BranchInst>(&I) || isa<PHINode>(&I) || isa<DbgInfoIntrinsic>(&I)) &&
+ shouldScalarizeInstruction(&I)) {
+ scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
+ return;
+ }
- // If the instruction will become trivially dead when vectorized, we don't
- // need to generate it.
- if (DeadInstructions.count(&I))
- continue;
+ switch (I.getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ break;
+ case Instruction::PHI: {
+ // Vectorize PHINodes.
+ widenPHIInstruction(&I, UF, VF);
+ break;
+ } // End of PHI.
+ case Instruction::GetElementPtr: {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+ auto *GEP = cast<GetElementPtrInst>(&I);
+
+ if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+ VectorLoopValueMap.setVectorValue(&I, Part, EntryPart);
+ addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with setVectorValue, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < UF; ++Part) {
- // Scalarize instructions that should remain scalar after vectorization.
- if (VF > 1 &&
- !(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
- isa<DbgInfoIntrinsic>(&I)) &&
- shouldScalarizeInstruction(&I)) {
- scalarizeInstruction(&I, Legal->isScalarWithPredication(&I));
- continue;
- }
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr =
+ OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+ ? GEP->getPointerOperand()
+ : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+ if (OrigLoop->isLoopInvariant(U.get()))
+ Indices.push_back(U.get());
+ else
+ Indices.push_back(getOrCreateVectorValue(U.get(), Part));
+ }
- switch (I.getOpcode()) {
- case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- continue;
- case Instruction::PHI: {
- // Vectorize PHINodes.
- widenPHIInstruction(&I, UF, VF, PV);
- continue;
- } // End of PHI.
-
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- // Scalarize with predication if this instruction may divide by zero and
- // block execution is conditional, otherwise fallthrough.
- if (Legal->isScalarWithPredication(&I)) {
- scalarizeInstruction(&I, true);
- continue;
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP = GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(Ptr, Indices)
+ : Builder.CreateGEP(Ptr, Indices);
+ assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+ "NewGEP is not a pointer vector");
+ VectorLoopValueMap.setVectorValue(&I, Part, NewGEP);
+ addMetadata(NewGEP, GEP);
}
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen binops.
- auto *BinOp = cast<BinaryOperator>(&I);
- setDebugLocFromInst(Builder, BinOp);
- const VectorParts &A = getVectorValue(BinOp->getOperand(0));
- const VectorParts &B = getVectorValue(BinOp->getOperand(1));
+ }
+
+ break;
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // Scalarize with predication if this instruction may divide by zero and
+ // block execution is conditional, otherwise fallthrough.
+ if (Legal->isScalarWithPredication(&I)) {
+ scalarizeInstruction(&I, true);
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ auto *BinOp = cast<BinaryOperator>(&I);
+ setDebugLocFromInst(Builder, BinOp);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(BinOp->getOperand(0), Part);
+ Value *B = getOrCreateVectorValue(BinOp->getOperand(1), Part);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+
+ if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
+ VecOp->copyIRFlags(BinOp);
// Use this vector value for all users of the original instruction.
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, BinOp);
+ }
- if (BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V))
- VecOp->copyIRFlags(BinOp);
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ auto *SE = PSE.getSE();
+ bool InvariantCond =
+ SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
+ setDebugLocFromInst(Builder, &I);
- Entry[Part] = V;
- }
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, BinOp);
- break;
+ auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), 0, 0);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
+ Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
+ Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
+ Value *Sel =
+ Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
+ VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ addMetadata(Sel, &I);
}
- case Instruction::Select: {
- // Widen selects.
- // If the selector is loop invariant we can create a select
- // instruction with a scalar condition. Otherwise, use vector-select.
- auto *SE = PSE.getSE();
- bool InvariantCond =
- SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
- setDebugLocFromInst(Builder, &I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- const VectorParts &Cond = getVectorValue(I.getOperand(0));
- const VectorParts &Op0 = getVectorValue(I.getOperand(1));
- const VectorParts &Op1 = getVectorValue(I.getOperand(2));
-
- auto *ScalarCond = getScalarValue(I.getOperand(0), 0, 0);
-
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part] = Builder.CreateSelect(
- InvariantCond ? ScalarCond : Cond[Part], Op0[Part], Op1[Part]);
+
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = dyn_cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
+ Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
}
+ VectorLoopValueMap.setVectorValue(&I, Part, C);
+ addMetadata(C, &I);
+ }
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
+ break;
+ }
+
+ case Instruction::Store:
+ case Instruction::Load:
+ vectorizeMemoryInstruction(&I);
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = dyn_cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+ if (Cost->isOptimizableIVTruncate(CI, VF)) {
+ widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+ cast<TruncInst>(CI));
break;
}
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = dyn_cast<CmpInst>(&I);
- setDebugLocFromInst(Builder, Cmp);
- const VectorParts &A = getVectorValue(Cmp->getOperand(0));
- const VectorParts &B = getVectorValue(Cmp->getOperand(1));
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *C = nullptr;
- if (FCmp) {
- C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
- cast<FCmpInst>(C)->copyFastMathFlags(Cmp);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
- }
- Entry[Part] = C;
- }
+ // Vectorize casts.
+ Type *DestTy =
+ (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
- break;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ VectorLoopValueMap.setVectorValue(&I, Part, Cast);
+ addMetadata(Cast, &I);
}
+ break;
+ }
- case Instruction::Store:
- case Instruction::Load:
- vectorizeMemoryInstruction(&I);
+ case Instruction::Call: {
+ // Ignore dbg intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
break;
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = dyn_cast<CastInst>(&I);
- setDebugLocFromInst(Builder, CI);
-
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
- auto ID = Legal->getInductionVars()->lookup(OldInduction);
- if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
- ID.getConstIntStepValue()) {
- widenIntInduction(OldInduction, cast<TruncInst>(CI));
- break;
- }
+ setDebugLocFromInst(Builder, &I);
- /// Vectorize casts.
- Type *DestTy =
- (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
- const VectorParts &A = getVectorValue(CI->getOperand(0));
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
+ StringRef FnName = CI->getCalledFunction()->getName();
+ Function *F = CI->getCalledFunction();
+ Type *RetTy = ToVectorTy(CI->getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start)) {
+ scalarizeInstruction(&I);
+ break;
+ }
+ // The flag indicates whether we use an intrinsic or a plain call for the
+ // vectorized version of the instruction, i.e., whether it is beneficial to
+ // perform the intrinsic call rather than the library call.
+ bool NeedToScalarize;
+ unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
+ if (!UseVectorIntrinsic && NeedToScalarize) {
+ scalarizeInstruction(&I);
break;
}
- case Instruction::Call: {
- // Ignore dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- break;
- setDebugLocFromInst(Builder, &I);
-
- Module *M = BB->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
-
- StringRef FnName = CI->getCalledFunction()->getName();
- Function *F = CI->getCalledFunction();
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start)) {
- scalarizeInstruction(&I);
- break;
- }
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize;
- unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost;
- if (!UseVectorIntrinsic && NeedToScalarize) {
- scalarizeInstruction(&I);
- break;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+ Value *Arg = CI->getArgOperand(i);
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+ Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
+ Args.push_back(Arg);
}
- VectorParts Entry(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- Value *Arg = CI->getArgOperand(i);
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) {
- const VectorParts &VectorArg = getVectorValue(CI->getArgOperand(i));
- Arg = VectorArg[Part];
- }
- Args.push_back(Arg);
- }
-
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
- if (VF > 1)
- TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- } else {
- // Use vector version of the library call.
- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
- assert(!VFnName.empty() && "Vector function name is empty.");
- VectorF = M->getFunction(VFnName);
- if (!VectorF) {
- // Generate a declaration
- FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
- VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
- VectorF->copyAttributesFrom(F);
- }
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ } else {
+ // Use vector version of the library call.
+ StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
+ assert(!VFnName.empty() && "Vector function name is empty.");
+ VectorF = M->getFunction(VFnName);
+ if (!VectorF) {
+ // Generate a declaration
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
+ VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
+ VectorF->copyAttributesFrom(F);
}
- assert(VectorF && "Can't create vector function.");
+ }
+ assert(VectorF && "Can't create vector function.");
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
- if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(CI);
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
- Entry[Part] = V;
- }
-
- VectorLoopValueMap.initVector(&I, Entry);
- addMetadata(Entry, &I);
- break;
+ VectorLoopValueMap.setVectorValue(&I, Part, V);
+ addMetadata(V, &I);
}
- default:
- // All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(&I);
- break;
- } // end of switch.
- } // end of for_each instr.
+ break;
+ }
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(&I);
+ break;
+ } // end of switch.
}
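The call case above chooses among three strategies by cost: a vector intrinsic, a vector library call, or scalarization. A compact model of that decision with invented names and costs (the real code obtains the costs from TTI/TLI):

#include <cstdio>

// 0 = widen via vector intrinsic, 1 = widen via vector library call,
// 2 = scalarize; mirrors the UseVectorIntrinsic/NeedToScalarize logic.
int pickCallStrategy(bool HasIntrinsicID, unsigned IntrinsicCost,
                     unsigned CallCost, bool NeedToScalarize) {
  bool UseVectorIntrinsic = HasIntrinsicID && IntrinsicCost <= CallCost;
  if (UseVectorIntrinsic)
    return 0;
  if (!NeedToScalarize)
    return 1;
  return 2;
}

int main() {
  // Intrinsic cost 4 vs. library-call cost 6: the intrinsic wins.
  std::printf("strategy = %d\n", pickCallStrategy(true, 4, 6, false));
}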
void InnerLoopVectorizer::updateAnalysis() {
@@ -4976,11 +5012,10 @@ void InnerLoopVectorizer::updateAnalysis() {
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
"Entry does not dominate exit.");
- // We don't predicate stores by this point, so the vector body should be a
- // single loop.
- DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
-
- DT->addNewBlock(LoopMiddleBlock, LoopVectorBody);
+ DT->addNewBlock(LI->getLoopFor(LoopVectorBody)->getHeader(),
+ LoopVectorPreHeader);
+ DT->addNewBlock(LoopMiddleBlock,
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
@@ -5056,12 +5091,18 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
}
bool LoopVectorizationLegality::canVectorize() {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
// We must have a loop in canonical form. Loops with indirectbr in them cannot
// be canonicalized.
if (!TheLoop->getLoopPreheader()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
// FIXME: The code is currently dead, since the loop gets sent to
@@ -5071,21 +5112,30 @@ bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->empty()) {
ORE->emit(createMissedAnalysis("NotInnermostLoop")
<< "loop is not the innermost loop");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
// We must have a single backedge.
if (TheLoop->getNumBackEdges() != 1) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
// We must have a single exiting block.
if (!TheLoop->getExitingBlock()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
// We only handle bottom-tested loops, i.e. loop in which the condition is
@@ -5094,7 +5144,10 @@ bool LoopVectorizationLegality::canVectorize() {
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
ORE->emit(createMissedAnalysis("CFGNotUnderstood")
<< "loop control flow is not understood by vectorizer");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
// We need to have a loop header.
@@ -5105,28 +5158,28 @@ bool LoopVectorizationLegality::canVectorize() {
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- return false;
- }
-
- // ScalarEvolution needs to be able to find the exit count.
- const SCEV *ExitCount = PSE.getBackedgeTakenCount();
- if (ExitCount == PSE.getSE()->getCouldNotCompute()) {
- ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations")
- << "could not determine number of loop iterations");
- DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
// Check if we can vectorize the instructions and CFG in this loop.
if (!canVectorizeInstrs()) {
DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
// Go over each instruction and look at memory deps.
if (!canVectorizeMemory()) {
DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
DEBUG(dbgs() << "LV: We can vectorize this loop"
@@ -5145,12 +5198,6 @@ bool LoopVectorizationLegality::canVectorize() {
if (UseInterleaved)
InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
- // Collect all instructions that are known to be uniform after vectorization.
- collectLoopUniforms();
-
- // Collect all instructions that are known to be scalar after vectorization.
- collectLoopScalars();
-
unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
@@ -5160,13 +5207,17 @@ bool LoopVectorizationLegality::canVectorize() {
<< "Too many SCEV assumptions need to be made and checked "
<< "at runtime");
DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n");
- return false;
+ if (ORE->allowExtraAnalysis())
+ Result = false;
+ else
+ return false;
}
- // Okay! We can vectorize. At this point we don't have any other mem analysis
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
// which may limit our maximum vectorization factor, so just return true with
// no restrictions.
- return true;
+ return Result;
}
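Every legality check in the rewritten canVectorize() now follows one pattern: emit the remark, then either record the failure and continue (so all blocking reasons get reported) or return immediately. Abstracted into a standalone sketch with invented names:

#include <vector>

bool runChecks(const std::vector<bool> &CheckPassed, bool AllowExtraAnalysis) {
  bool Result = true;
  for (bool Passed : CheckPassed) {
    if (!Passed) {
      // The real code emits a missed-optimization remark here.
      if (AllowExtraAnalysis)
        Result = false; // keep going to surface the remaining reasons
      else
        return false; // bail out at the first failure
    }
  }
  return Result;
}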
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
@@ -5234,14 +5285,19 @@ void LoopVectorizationLegality::addInductionPhi(
// one if there are multiple (no good reason for doing this other
// than it is expedient). We've checked that it begins at zero and
// steps by one, so this is a canonical induction variable.
- if (!Induction || PhiTy == WidestIndTy)
- Induction = Phi;
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
}
// Both the PHI node itself, and the "post-increment" value feeding
// back into the PHI node may have external users.
- AllowedExit.insert(Phi);
- AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
DEBUG(dbgs() << "LV: Found an induction variable.\n");
return;
@@ -5309,7 +5365,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
continue;
}
- if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop, DT)) {
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
FirstOrderRecurrences.insert(Phi);
continue;
}
@@ -5398,7 +5455,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
} // next instr.
}
- if (!Induction) {
+ if (!PrimaryInduction) {
DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
if (Inductions.empty()) {
ORE->emit(createMissedAnalysis("NoInductionVariable")
@@ -5410,46 +5467,173 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Now we know the widest induction type, check if our found induction
// is the same size. If it's not, unset it here and InnerLoopVectorizer
// will create another.
- if (Induction && WidestIndTy != Induction->getType())
- Induction = nullptr;
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
return true;
}
-void LoopVectorizationLegality::collectLoopScalars() {
+void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+
+ // We should not collect Scalars more than once per VF. Right now, this
+ // function is called from collectUniformsAndScalars(), which already does
+ // this check. Collecting Scalars for VF=1 does not make any sense.
+ assert(VF >= 2 && !Scalars.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // These sets are used to seed the analysis with pointers used by memory
+ // accesses that will remain scalar.
+ SmallSetVector<Instruction *, 8> ScalarPtrs;
+ SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+ // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+ // The pointer operands of loads and stores will be scalar as long as the
+ // memory access is not a gather or scatter operation. The value operand of a
+ // store will remain scalar if the store is scalarized.
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+ InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+ if (Ptr == Store->getValueOperand())
+ return WideningDecision == CM_Scalarize;
+ assert(Ptr == getPointerOperand(MemAccess) &&
+ "Ptr is neither a value or pointer operand");
+ return WideningDecision != CM_GatherScatter;
+ };
+
+ // A helper that returns true if the given value is a bitcast or
+ // getelementptr instruction contained in the loop.
+ auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+ return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+ isa<GetElementPtrInst>(V)) &&
+ !TheLoop->isLoopInvariant(V);
+ };
+
+ // A helper that evaluates a memory access's use of a pointer. If the use
+ // will be a scalar use, and the pointer is only used by memory accesses, we
+ // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+ // PossibleNonScalarPtrs.
+ auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+
+ // We only care about bitcast and getelementptr instructions contained in
+ // the loop.
+ if (!isLoopVaryingBitCastOrGEP(Ptr))
+ return;
- // If an instruction is uniform after vectorization, it will remain scalar.
- Scalars.insert(Uniforms.begin(), Uniforms.end());
+ // If the pointer has already been identified as scalar (e.g., if it was
+ // also identified as uniform), there's nothing to do.
+ auto *I = cast<Instruction>(Ptr);
+ if (Worklist.count(I))
+ return;
- // Collect the getelementptr instructions that will not be vectorized. A
- // getelementptr instruction is only vectorized if it is used for a legal
- // gather or scatter operation.
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
+ }))
+ ScalarPtrs.insert(I);
+ else
+ PossibleNonScalarPtrs.insert(I);
+ };
+
+ // We seed the scalars analysis with three classes of instructions: (1)
+ // instructions marked uniform-after-vectorization, (2) bitcast and
+ // getelementptr instructions used by memory accesses requiring a scalar use,
+ // and (3) pointer induction variables and their update instructions (we
+ // currently only scalarize these).
+ //
+ // (1) Add to the worklist all instructions that have been identified as
+ // uniform-after-vectorization.
+ Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+ // (2) Add to the worklist all bitcast and getelementptr instructions used by
+ // memory accesses requiring a scalar use. The pointer operands of loads and
+ // stores will be scalar as long as the memory access is not a gather or
+ // scatter operation. The value operand of a store will remain scalar if the
+ // store is scalarized.
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- Scalars.insert(GEP);
- continue;
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ evaluatePtrUse(Load, Load->getPointerOperand());
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ evaluatePtrUse(Store, Store->getPointerOperand());
+ evaluatePtrUse(Store, Store->getValueOperand());
}
- auto *Ptr = getPointerOperand(&I);
- if (!Ptr)
- continue;
- auto *GEP = getGEPInstruction(Ptr);
- if (GEP && isLegalGatherOrScatter(&I))
- Scalars.erase(GEP);
+ }
+ for (auto *I : ScalarPtrs)
+ if (!PossibleNonScalarPtrs.count(I)) {
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ Worklist.insert(I);
}
+ // (3) Add to the worklist all pointer induction variables and their update
+ // instructions.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we should
+ // no longer insert them into the worklist here.
+ auto *Latch = TheLoop->getLoopLatch();
+ for (auto &Induction : *Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+ continue;
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+ }
+
+ // Insert the forced scalars.
+ // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // induction variable when the PHI user is scalarized.
+ if (ForcedScalars.count(VF))
+ for (auto *I : ForcedScalars.find(VF)->second)
+ Worklist.insert(I);
+
+ // Expand the worklist by looking through any bitcasts and getelementptr
+ // instructions we've already identified as scalar. This is similar to the
+ // expansion step in collectLoopUniforms(); however, here we're only
+ // expanding to include additional bitcasts and getelementptr instructions.
+ unsigned Idx = 0;
+ while (Idx != Worklist.size()) {
+ Instruction *Dst = Worklist[Idx++];
+ if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+ continue;
+ auto *Src = cast<Instruction>(Dst->getOperand(0));
+ if (all_of(Src->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+ isScalarUse(J, Src));
+ })) {
+ Worklist.insert(Src);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ }
+ }
+
// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
- auto *Latch = TheLoop->getLoopLatch();
- for (auto &Induction : *getInductionVars()) {
+ for (auto &Induction : *Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ // We already considered pointer induction variables, so there's no reason
+ // to look at their users again.
+ //
+ // TODO: Once we are able to vectorize pointer induction variables we
+ // should no longer skip over them here.
+ if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+ continue;
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
});
if (!ScalarInd)
continue;
@@ -5458,23 +5642,19 @@ void LoopVectorizationLegality::collectLoopScalars() {
// scalar after vectorization.
auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
});
if (!ScalarIndUpdate)
continue;
// The induction variable and its update instruction will remain scalar.
- Scalars.insert(Ind);
- Scalars.insert(IndUpdate);
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
}
-}
-bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
- if (isAccessInterleaved(I))
- return true;
- if (auto *Ptr = getPointerOperand(I))
- return isConsecutivePtr(Ptr);
- return false;
+ Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
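The expansion step above is a worklist fixpoint: a loop-varying def is pulled into the scalar set once all of its in-loop users are already known scalar. Its skeleton, with a hypothetical Node type in place of LLVM instructions:

#include <set>
#include <vector>

struct Node {
  std::vector<Node *> Users; // in-loop users of this node
  Node *Operand = nullptr;   // the def this node reads (if any)
};

void expand(std::vector<Node *> Seeds, std::set<Node *> &Scalar) {
  std::vector<Node *> Worklist(Seeds.begin(), Seeds.end());
  Scalar.insert(Seeds.begin(), Seeds.end());
  for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
    Node *Src = Worklist[Idx]->Operand;
    if (!Src || Scalar.count(Src))
      continue;
    bool AllUsersScalar = true;
    for (Node *U : Src->Users)
      AllUsersScalar &= Scalar.count(U) != 0;
    if (AllUsersScalar) { // every user is scalar, so the def is too
      Scalar.insert(Src);
      Worklist.push_back(Src);
    }
  }
}

int main() {
  Node Load, Gep;
  Gep.Users = {&Load};
  Load.Operand = &Gep;
  std::set<Node *> Scalar;
  expand({&Load}, Scalar); // seeding with the load pulls in the GEP
  return Scalar.count(&Gep) ? 0 : 1;
}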
bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
@@ -5494,48 +5674,48 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
return false;
}
-bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
- Instruction *I, unsigned VF) {
-
- // If the memory instruction is in an interleaved group, it will be
- // vectorized and its pointer will remain uniform.
- if (isAccessInterleaved(I))
- return false;
-
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
assert((LI || SI) && "Invalid memory instruction");
- // If the pointer operand is uniform (loop invariant), the memory instruction
- // will be scalarized.
auto *Ptr = getPointerOperand(I);
- if (LI && isUniform(Ptr))
- return true;
- // If the pointer operand is non-consecutive and neither a gather nor a
- // scatter operation is legal, the memory instruction will be scalarized.
- if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
- return true;
+ // In order to be widened, the pointer should be consecutive, first of all.
+ if (!isConsecutivePtr(Ptr))
+ return false;
// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I))
- return true;
+ return false;
// If the instruction's allocated size doesn't equal it's type size, it
// requires padding and will be scalarized.
auto &DL = I->getModule()->getDataLayout();
auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL, VF))
- return true;
+ return false;
- // Otherwise, the memory instruction should be vectorized if the rest of the
- // loop is.
- return false;
+ return true;
}
-void LoopVectorizationLegality::collectLoopUniforms() {
+void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+
+ // We should not collect Uniforms more than once per VF. Right now,
+ // this function is called from collectUniformsAndScalars(), which
+ // already does this check. Collecting Uniforms for VF=1 does not make any
+ // sense.
+
+ assert(VF >= 2 && !Uniforms.count(VF) &&
+ "This function should not be visited twice for the same VF");
+
+ // Visit the list of Uniforms. If we do not find any uniform value, we will
+ // not analyze it again; Uniforms.count(VF) will then return 1.
+ Uniforms[VF].clear();
+
// We now know that the loop is vectorizable!
// Collect instructions inside the loop that will remain uniform after
// vectorization.
@@ -5568,6 +5748,14 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// Holds pointer operands of instructions that are possibly non-uniform.
SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
+ auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Interleave);
+ };
// Iterate over the instructions in the loop, and collect all
// consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
// that a consecutive-like pointer operand will be scalarized, we collect it
@@ -5590,25 +5778,18 @@ void LoopVectorizationLegality::collectLoopUniforms() {
return getPointerOperand(U) == Ptr;
});
- // Ensure the memory instruction will not be scalarized, making its
- // pointer operand non-uniform. If the pointer operand is used by some
- // instruction other than a memory access, we're not going to check if
- // that other instruction may be scalarized here. Thus, conservatively
- // assume the pointer operand may be non-uniform.
- if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
+ // Ensure the memory instruction will not be scalarized or used by
+ // gather/scatter, making its pointer operand non-uniform. If the pointer
+ // operand is used by any instruction other than a memory access, we
+ // conservatively assume the pointer operand may be non-uniform.
+ if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
PossibleNonUniformPtrs.insert(Ptr);
// If the memory instruction will be vectorized and its pointer operand
- // is consecutive-like, the pointer operand should remain uniform.
- else if (hasConsecutiveLikePtrOperand(&I))
- ConsecutiveLikePtrs.insert(Ptr);
-
- // Otherwise, if the memory instruction will be vectorized and its
- // pointer operand is non-consecutive-like, the memory instruction should
- // be a gather or scatter operation. Its pointer operand will be
- // non-uniform.
+ // is consecutive-like or part of an interleaved group, the pointer
+ // operand should remain uniform.
else
- PossibleNonUniformPtrs.insert(Ptr);
+ ConsecutiveLikePtrs.insert(Ptr);
}
// Add to the Worklist all consecutive and consecutive-like pointers that
@@ -5632,7 +5813,9 @@ void LoopVectorizationLegality::collectLoopUniforms() {
continue;
auto *OI = cast<Instruction>(OV);
if (all_of(OI->users(), [&](User *U) -> bool {
- return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ (OI == getPointerOperand(J) && isUniformDecision(J, VF));
})) {
Worklist.insert(OI);
DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
@@ -5643,7 +5826,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// Returns true if Ptr is the pointer operand of a memory access instruction
// I, and I is known to not require scalarization.
auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
- return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
+ return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
};
// For an instruction to be added into Worklist above, all its users inside
@@ -5652,7 +5835,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : Inductions) {
+ for (auto &Induction : *Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -5683,7 +5866,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
}
- Uniforms.insert(Worklist.begin(), Worklist.end());
+ Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
bool LoopVectorizationLegality::canVectorizeMemory() {
@@ -5808,10 +5991,10 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
continue;
Value *Ptr = getPointerOperand(&I);
- // We don't check wrapping here because we don't know yet if Ptr will be
- // part of a full group or a group with gaps. Checking wrapping for all
+ // We don't check wrapping here because we don't know yet if Ptr will be
+ // part of a full group or a group with gaps. Checking wrapping for all
// pointers (even those that end up in groups with no gaps) will be overly
- // conservative. For full groups, wrapping should be ok since if we would
+ // conservative. For full groups, wrapping should be ok since if we would
// wrap around the address space we would do a memory access at nullptr
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
@@ -5823,7 +6006,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
// An alignment of 0 means target ABI alignment.
- unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+ unsigned Align = getMemInstAlignment(&I);
if (!Align)
Align = DL.getABITypeAlignment(PtrTy->getElementType());
@@ -5978,6 +6161,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
continue;
+ // Ignore A if the memory object of A and B don't belong to the same
+ // address space
+ if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
+ continue;
+
// Calculate the distance from A to B.
const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
@@ -6020,36 +6208,36 @@ void InterleavedAccessInfo::analyzeInterleaving(
if (Group->getNumMembers() != Group->getFactor())
releaseGroup(Group);
- // Remove interleaved groups with gaps (currently only loads) whose memory
- // accesses may wrap around. We have to revisit the getPtrStride analysis,
- // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
+ // Remove interleaved groups with gaps (currently only loads) whose memory
+ // accesses may wrap around. We have to revisit the getPtrStride analysis,
+ // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
// not check wrapping (see documentation there).
- // FORNOW we use Assume=false;
- // TODO: Change to Assume=true but making sure we don't exceed the threshold
+ // FORNOW we use Assume=false;
+ // TODO: Change to Assume=true but making sure we don't exceed the threshold
// of runtime SCEV assumptions checks (thereby potentially failing to
- // vectorize altogether).
+ // vectorize altogether).
// Additional optional optimizations:
- // TODO: If we are peeling the loop and we know that the first pointer doesn't
+ // TODO: If we are peeling the loop and we know that the first pointer doesn't
// wrap then we can deduce that all pointers in the group don't wrap.
- // This means that we can forcefully peel the loop in order to only have to
- // check the first pointer for no-wrap. When we'll change to use Assume=true
+ // This means that we can forcefully peel the loop in order to only have to
+ // check the first pointer for no-wrap. When we'll change to use Assume=true
// we'll only need at most one runtime check per interleaved group.
//
for (InterleaveGroup *Group : LoadGroups) {
// Case 1: A full group. We can skip the checks; for full groups, if the wide
- // load would wrap around the address space we would do a memory access at
- // nullptr even without the transformation.
- if (Group->getNumMembers() == Group->getFactor())
+ // load would wrap around the address space we would do a memory access at
+ // nullptr even without the transformation.
+ if (Group->getNumMembers() == Group->getFactor())
continue;
- // Case 2: If first and last members of the group don't wrap this implies
+ // Case 2: If first and last members of the group don't wrap this implies
// that all the pointers in the group don't wrap.
// So we check only group member 0 (which is always guaranteed to exist),
- // and group member Factor - 1; If the latter doesn't exist we rely on
+ // and group member Factor - 1; If the latter doesn't exist we rely on
// peeling (if it is a non-reversed access -- see Case 3).
Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
- if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
"first group member potentially pointer-wrapping.\n");
@@ -6059,18 +6247,17 @@ void InterleavedAccessInfo::analyzeInterleaving(
Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
if (LastMember) {
Value *LastMemberPtr = getPointerOperand(LastMember);
- if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
+ if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
"last group member potentially pointer-wrapping.\n");
releaseGroup(Group);
}
- }
- else {
+ } else {
// Case 3: A non-reversed interleaved load group with gaps: We need
- // to execute at least one scalar epilogue iteration. This will ensure
+ // to execute at least one scalar epilogue iteration. This will ensure
// we don't speculatively access memory out-of-bounds. We only need
- // to look for a member at index factor - 1, since every group must have
+ // to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
releaseGroup(Group);
@@ -6082,27 +6269,62 @@ void InterleavedAccessInfo::analyzeInterleaving(
}
}
-LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
- // Width 1 means no vectorize
- VectorizationFactor Factor = {1U, 0U};
- if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+ if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ return None;
+ }
+
+ if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
+ return computeFeasibleMaxVF(OptForSize);
+
+ if (Legal->getRuntimePointerChecking()->Need) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
<< "runtime pointer checks needed. Enable vectorization of this "
"loop with '#pragma clang loop vectorize(enable)' when "
"compiling with -Os/-Oz");
DEBUG(dbgs()
<< "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
- return Factor;
+ return None;
}
- if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
- return Factor;
+ // If we optimize the program for size, avoid creating the tail loop.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+ // If we don't know the precise trip count, don't try to vectorize.
+ if (TC < 2) {
+ ORE->emit(
+ createMissedAnalysis("UnknownLoopCountComplexCFG")
+ << "unable to calculate the loop count due to complex control flow");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
}
+ unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+
+ if (TC % MaxVF != 0) {
+ // If the trip count that we found modulo the vectorization factor is not
+ // zero then we require a tail.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
+ // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
+ // smaller MaxVF that does not require a scalar epilog.
+
+ ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+ << "cannot optimize for size and vectorize at the "
+ "same time. Enable vectorization of this loop "
+ "with '#pragma clang loop vectorize(enable)' "
+ "when compiling with -Os/-Oz");
+ DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+ return None;
+ }
+
+ return MaxVF;
+}
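A worked example of the tail check in computeMaxVF(), with invented numbers: a known trip count of 20 and a MaxVF of 8 leave 20 % 8 = 4 leftover iterations, so a scalar tail would be required and vectorization is rejected under -Os/-Oz:

#include <cstdio>

int main() {
  unsigned TC = 20, MaxVF = 8;
  if (TC % MaxVF != 0)
    std::printf("tail of %u iterations required -> abort under -Os/-Oz\n",
                TC % MaxVF);
}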
+
+unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -6136,7 +6358,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
" into one vector!");
- unsigned VF = MaxVectorSize;
+ unsigned MaxVF = MaxVectorSize;
if (MaximizeBandwidth && !OptForSize) {
// Collect all viable vectorization factors.
SmallVector<unsigned, 8> VFs;
@@ -6152,54 +6374,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
for (int i = RUs.size() - 1; i >= 0; --i) {
if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
- VF = VFs[i];
+ MaxVF = VFs[i];
break;
}
}
}
+ return MaxVF;
+}
- // If we optimize the program for size, avoid creating the tail loop.
- if (OptForSize) {
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
- // If we don't know the precise trip count, don't try to vectorize.
- if (TC < 2) {
- ORE->emit(
- createMissedAnalysis("UnknownLoopCountComplexCFG")
- << "unable to calculate the loop count due to complex control flow");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
- return Factor;
- }
-
- // Find the maximum SIMD width that can fit within the trip count.
- VF = TC % MaxVectorSize;
-
- if (VF == 0)
- VF = MaxVectorSize;
- else {
- // If the trip count that we found modulo the vectorization factor is not
- // zero then we require a tail.
- ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
- << "cannot optimize for size and vectorize at the "
- "same time. Enable vectorization of this loop "
- "with '#pragma clang loop vectorize(enable)' "
- "when compiling with -Os/-Oz");
- DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
- return Factor;
- }
- }
-
- int UserVF = Hints->getWidth();
- if (UserVF != 0) {
- assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
- DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-
- Factor.Width = UserVF;
- collectInstsToScalarize(UserVF);
- return Factor;
- }
-
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
#ifndef NDEBUG
const float ScalarCost = Cost;
@@ -6209,12 +6393,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
// Ignore scalar width, because the user explicitly wants vectorization.
- if (ForceVectorization && VF > 1) {
+ if (ForceVectorization && MaxVF > 1) {
Width = 2;
Cost = expectedCost(Width).first / (float)Width;
}
- for (unsigned i = 2; i <= VF; i *= 2) {
+ for (unsigned i = 2; i <= MaxVF; i *= 2) {
// Notice that the vector loop needs to be executed less times, so
// we need to divide the cost of the vector loops by the width of
// the vector elements.
@@ -6238,8 +6422,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
- Factor.Width = Width;
- Factor.Cost = Width * Cost;
+ VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
return Factor;
}
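selectVectorizationFactor() compares candidate widths by cost per scalar iteration: each vector cost is divided by the width before comparing. A self-contained model with an invented cost function:

#include <cstdio>

unsigned pickWidth(unsigned MaxVF, float (*ExpectedCost)(unsigned)) {
  float Cost = ExpectedCost(1); // scalar baseline
  unsigned Width = 1;
  for (unsigned I = 2; I <= MaxVF; I *= 2) {
    float VectorCost = ExpectedCost(I) / (float)I; // cost per scalar iter
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = I;
    }
  }
  return Width;
}

int main() {
  // Invented costs: 10 at VF=1, 12 at VF=2, 16 at VF=4, 40 at VF=8.
  std::printf("selected VF = %u\n",
              pickWidth(8, [](unsigned VF) -> float {
                return VF == 1 ? 10 : VF == 2 ? 12 : VF == 4 ? 16 : 40;
              })); // picks VF=4 (16/4 = 4 per scalar iteration)
}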
@@ -6277,9 +6460,16 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
T = ST->getValueOperand()->getType();
// Ignore loaded pointer types and stored pointer types that are not
- // consecutive. However, we do want to take consecutive stores/loads of
- // pointer vectors into account.
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
+ // vectorizable.
+ //
+ // FIXME: The check here attempts to predict whether a load or store will
+ // be vectorized. We only know this for certain after a VF has
+ // been selected. Here, we assume that if an access can be
+ // vectorized, it will be. We should also look at extending this
+ // optimization to non-pointer types.
+ //
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+ !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -6562,12 +6752,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
continue;
}
-
+ collectUniformsAndScalars(VFs[j]);
// Count the number of live intervals.
unsigned RegUsage = 0;
for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
- if (VecValuesToIgnore.count(Inst))
+ if (VecValuesToIgnore.count(Inst) ||
+ isScalarAfterVectorization(Inst, VFs[j]))
continue;
RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
}
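calculateRegisterUsage() estimates register pressure from how many live intervals are open at once. A minimal standalone version of that counting, with invented (def, last-use) instruction indices:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Each value is live from its defining instruction to its last use.
  std::vector<std::pair<int, int>> Intervals = {
      {0, 3}, {1, 5}, {2, 3}, {4, 6}};
  unsigned MaxOpen = 0;
  for (int Idx = 0; Idx <= 6; ++Idx) {
    unsigned Open = 0;
    for (auto &IV : Intervals)
      Open += (IV.first <= Idx && Idx <= IV.second);
    MaxOpen = std::max(MaxOpen, Open); // peak simultaneous liveness
  }
  std::printf("estimated registers in use: %u\n", MaxOpen); // prints 3
}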
@@ -6628,6 +6819,9 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
ScalarCostsTy ScalarCosts;
if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+
+ // Remember that BB will remain after vectorization.
+ PredicatedBBsAfterVectorization.insert(BB);
}
}
}
@@ -6636,7 +6830,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
unsigned VF) {
- assert(!Legal->isUniformAfterVectorization(PredInst) &&
+ assert(!isUniformAfterVectorization(PredInst, VF) &&
"Instruction marked uniform-after-vectorization will be predicated");
// Initialize the discount to zero, meaning that the scalar version and the
@@ -6657,7 +6851,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// already be scalar to avoid traversing chains that are unlikely to be
// beneficial.
if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
- Legal->isScalarAfterVectorization(I))
+ isScalarAfterVectorization(I, VF))
return false;
// If the instruction is scalar with predication, it will be analyzed
@@ -6677,7 +6871,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// the lane zero values for uniforms rather than asserting.
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (Legal->isUniformAfterVectorization(J))
+ if (isUniformAfterVectorization(J, VF))
return false;
// Otherwise, we can scalarize the instruction.
@@ -6690,7 +6884,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// and their return values are inserted into vectors. Thus, an extract would
// still be required.
auto needsExtract = [&](Instruction *I) -> bool {
- return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+ return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
};
// Compute the expected cost discount from scalarizing the entire expression
@@ -6717,8 +6911,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
- ScalarCost += getScalarizationOverhead(ToVectorTy(I->getType(), VF), true,
- false, TTI);
+ ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
+ true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
}
@@ -6733,8 +6927,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
if (canBeScalarized(J))
Worklist.push_back(J);
else if (needsExtract(J))
- ScalarCost += getScalarizationOverhead(ToVectorTy(J->getType(), VF),
- false, true, TTI);
+ ScalarCost += TTI.getScalarizationOverhead(
+              ToVectorTy(J->getType(), VF), false, true);
}
// Scale the total scalar cost by block probability.
@@ -6753,6 +6947,9 @@ LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(unsigned VF) {
VectorizationCostTy Cost;
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ collectUniformsAndScalars(VF);
+
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
collectInstsToScalarize(VF);
@@ -6832,31 +7029,295 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
Legal->hasStride(I->getOperand(1));
}
+unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ auto SE = PSE.getSE();
+
+ unsigned Alignment = getMemInstAlignment(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ Value *Ptr = getPointerOperand(I);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+ // Figure out whether the access is strided and get the stride value
+  // if it's known at compile time.
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
+ unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+
+ Cost += VF *
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+ AS, I);
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF, TTI);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (Legal->isScalarWithPredication(I))
+ Cost /= getReciprocalPredBlockProb();
+
+ return Cost;
+}
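In outline, the function above charges VF scalar address computations and VF scalar memory ops, adds insert/extract overhead, then halves the total for predicated accesses. A condensed sketch, assuming getReciprocalPredBlockProb() returns 2, i.e. a predicated block is taken about half the time:

// All unit costs are hypothetical stand-ins for the TTI queries above.
unsigned memInstScalarizationCostModel(unsigned VF, unsigned AddrCost,
                                       unsigned MemOpCost,
                                       unsigned OverheadPerLane,
                                       bool ScalarWithPredication) {
  unsigned Cost = VF * (AddrCost + MemOpCost + OverheadPerLane);
  if (ScalarWithPredication)
    Cost /= 2; // scale by the assumed block execution probability
  return Cost;
}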
+
+unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+ unsigned AS = getMemInstAddressSpace(I);
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Stride should be 1 or -1 for consecutive memory access");
+ unsigned Cost = 0;
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+
+ bool Reverse = ConsecutiveStride < 0;
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+ unsigned VF) {
+ LoadInst *LI = cast<LoadInst>(I);
+ Type *ValTy = LI->getType();
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = LI->getAlignment();
+ unsigned AS = LI->getPointerAddressSpace();
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+}
+
+unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned Alignment = getMemInstAlignment(I);
+ Value *Ptr = getPointerOperand(I);
+
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment);
+}
+
+unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+ unsigned VF) {
+ Type *ValTy = getMemInstValueType(I);
+ Type *VectorTy = ToVectorTy(ValTy, VF);
+ unsigned AS = getMemInstAddressSpace(I);
+
+ auto Group = Legal->getInterleavedAccessGroup(I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ unsigned InterleaveFactor = Group->getFactor();
+ Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+ // Holds the indices of existing members in an interleaved load group.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
+ SmallVector<unsigned, 4> Indices;
+ if (isa<LoadInst>(I)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+ Group->getFactor(), Indices,
+ Group->getAlignment(), AS);
+
+ if (Group->isReverse())
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
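For intuition: a factor-2 reversed load group at VF = 4 with both members present is charged one wide 8-element load plus two 4-element reverse shuffles. A tiny model with hypothetical unit costs:

// One wide memory op covers the whole group (VF * Factor elements); a
// reversed group additionally pays one reverse shuffle per present member.
unsigned interleaveGroupCostModel(unsigned WideMemOpCost, unsigned NumMembers,
                                  bool Reverse, unsigned ReverseShuffleCost) {
  unsigned Cost = WideMemOpCost;
  if (Reverse)
    Cost += NumMembers * ReverseShuffleCost;
  return Cost;
}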
+
+unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+ unsigned VF) {
+
+  // Compute the scalar cost only; the vectorization cost should already be
+  // available at this point.
+ if (VF == 1) {
+ Type *ValTy = getMemInstValueType(I);
+ unsigned Alignment = getMemInstAlignment(I);
+ unsigned AS = getMemInstAddressSpace(I);
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+ }
+ return getWideningCost(I, VF);
+}
+
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// If we know that this instruction will remain uniform, check the cost of
// the scalar version.
- if (Legal->isUniformAfterVectorization(I))
+ if (isUniformAfterVectorization(I, VF))
VF = 1;
if (VF > 1 && isProfitableToScalarize(I, VF))
return VectorizationCostTy(InstsToScalarize[VF][I], false);
+ // Forced scalars do not have any scalarization overhead.
+ if (VF > 1 && ForcedScalars.count(VF) &&
+ ForcedScalars.find(VF)->second.count(I))
+ return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
+
Type *VectorTy;
unsigned C = getInstructionCost(I, VF, VectorTy);
bool TypeNotScalarized =
- VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
+ VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
return VectorizationCostTy(C, TypeNotScalarized);
}
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+ if (VF == 1)
+ return;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ Value *Ptr = getPointerOperand(&I);
+ if (!Ptr)
+ continue;
+
+ if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+ // Scalar load + broadcast
+ unsigned Cost = getUniformMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ continue;
+ }
+
+ // We assume that widening is the best solution when possible.
+ if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+ setWideningDecision(&I, VF, CM_Widen, Cost);
+ continue;
+ }
+
+ // Choose between Interleaving, Gather/Scatter or Scalarization.
+ unsigned InterleaveCost = UINT_MAX;
+ unsigned NumAccesses = 1;
+ if (Legal->isAccessInterleaved(&I)) {
+ auto Group = Legal->getInterleavedAccessGroup(&I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Make one decision for the whole group.
+ if (getWideningDecision(&I, VF) != CM_Unknown)
+ continue;
+
+ NumAccesses = Group->getNumMembers();
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
+ }
+
+ unsigned GatherScatterCost =
+ Legal->isLegalGatherOrScatter(&I)
+ ? getGatherScatterCost(&I, VF) * NumAccesses
+ : UINT_MAX;
+
+ unsigned ScalarizationCost =
+ getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+      // Choose the best option for the current VF, record the decision, and
+      // use it during vectorization.
+ unsigned Cost;
+ InstWidening Decision;
+ if (InterleaveCost <= GatherScatterCost &&
+ InterleaveCost < ScalarizationCost) {
+ Decision = CM_Interleave;
+ Cost = InterleaveCost;
+ } else if (GatherScatterCost < ScalarizationCost) {
+ Decision = CM_GatherScatter;
+ Cost = GatherScatterCost;
+ } else {
+ Decision = CM_Scalarize;
+ Cost = ScalarizationCost;
+ }
+      // If the instruction belongs to an interleave group, the whole group
+      // receives the same decision and the same cost, but the cost will
+      // actually be assigned to a single instruction.
+ if (auto Group = Legal->getInterleavedAccessGroup(&I))
+ setWideningDecision(Group, VF, Decision, Cost);
+ else
+ setWideningDecision(&I, VF, Decision, Cost);
+ }
+ }
+
+  // Make sure that any load of an address and any other address computation
+ // remains scalar unless there is gather/scatter support. This avoids
+ // inevitable extracts into address registers, and also has the benefit of
+ // activating LSR more, since that pass can't optimize vectorized
+ // addresses.
+ if (TTI.prefersVectorizedAddressing())
+ return;
+
+ // Start with all scalar pointer uses.
+ SmallPtrSet<Instruction *, 8> AddrDefs;
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ Instruction *PtrDef =
+ dyn_cast_or_null<Instruction>(getPointerOperand(&I));
+ if (PtrDef && TheLoop->contains(PtrDef) &&
+ getWideningDecision(&I, VF) != CM_GatherScatter)
+ AddrDefs.insert(PtrDef);
+ }
+
+ // Add all instructions used to generate the addresses.
+ SmallVector<Instruction *, 4> Worklist;
+ for (auto *I : AddrDefs)
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (auto &Op : I->operands())
+ if (auto *InstOp = dyn_cast<Instruction>(Op))
+ if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+            AddrDefs.insert(InstOp).second)
+ Worklist.push_back(InstOp);
+ }
+
+ for (auto *I : AddrDefs) {
+ if (isa<LoadInst>(I)) {
+      // Setting the desired widening decision should ideally be handled by
+      // the cost functions, but since this involves determining whether the
+      // loaded register is involved in an address computation, the decision
+      // is instead changed here, where that is known.
+ if (getWideningDecision(I, VF) == CM_Widen)
+        // Scalarize a widened load of an address.
+ setWideningDecision(I, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(I, 1)));
+ else if (auto Group = Legal->getInterleavedAccessGroup(I)) {
+ // Scalarize an interleave group of address loads.
+ for (unsigned I = 0; I < Group->getFactor(); ++I) {
+ if (Instruction *Member = Group->getMember(I))
+ setWideningDecision(Member, VF, CM_Scalarize,
+ (VF * getMemoryInstructionCost(Member, 1)));
+ }
+ }
+ } else
+      // Make sure I gets scalarized and receives a cost estimate without
+      // scalarization overhead.
+ ForcedScalars[VF].insert(I);
+ }
+}
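The three-way choice at the heart of setCostBasedWideningDecision can be isolated as below. Note the asymmetry, mirrored from the comparisons above: interleaving wins ties against gather/scatter but must beat scalarization strictly. Names are stand-ins, not the pass's own.

#include <climits>

enum WideningModel { ModelInterleave, ModelGatherScatter, ModelScalarize };

// Candidates that are not legal should be passed as UINT_MAX, as above.
WideningModel pickWideningDecision(unsigned InterleaveCost,
                                   unsigned GatherScatterCost,
                                   unsigned ScalarizationCost,
                                   unsigned &Cost) {
  if (InterleaveCost <= GatherScatterCost &&
      InterleaveCost < ScalarizationCost) {
    Cost = InterleaveCost;
    return ModelInterleave;
  }
  if (GatherScatterCost < ScalarizationCost) {
    Cost = GatherScatterCost;
    return ModelGatherScatter;
  }
  Cost = ScalarizationCost;
  return ModelScalarize;
}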
+
unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
unsigned VF,
Type *&VectorTy) {
Type *RetTy = I->getType();
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
- VectorTy = ToVectorTy(RetTy, VF);
+ VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
// TODO: We need to estimate the cost of intrinsic calls.
@@ -6868,7 +7329,31 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// instruction cost.
return 0;
case Instruction::Br: {
- return TTI.getCFInstrCost(I->getOpcode());
+ // In cases of scalarized and predicated instructions, there will be VF
+ // predicated blocks in the vectorized loop. Each branch around these
+    // blocks also requires an extract of its vector compare i1 element.
+ bool ScalarPredicatedBB = false;
+ BranchInst *BI = cast<BranchInst>(I);
+ if (VF > 1 && BI->isConditional() &&
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ ScalarPredicatedBB = true;
+
+ if (ScalarPredicatedBB) {
+ // Return cost for branches around scalarized and predicated blocks.
+ Type *Vec_i1Ty =
+ VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
+ (TTI.getCFInstrCost(Instruction::Br) * VF));
+ } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+ // The back-edge branch will remain, as will all scalar branches.
+ return TTI.getCFInstrCost(Instruction::Br);
+ else
+ // This branch will be eliminated by if-conversion.
+ return 0;
+ // Note: We currently assume zero cost for an unconditional branch inside
+ // a predicated block since it will become a fall-through, although we
+ // may decide in the future to call TTI for all branches.
}
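The branch costing just added reduces to three cases, modeled below with hypothetical unit costs: branches around scalarized predicated blocks pay the i1-vector extraction overhead plus VF scalar branches, the back-edge and all scalar branches keep the plain branch cost, and if-converted branches are free.

unsigned branchCostModel(bool ScalarPredicatedBB, bool IsLatchOrScalar,
                         unsigned VF, unsigned ExtractOverhead,
                         unsigned BranchCost) {
  if (ScalarPredicatedBB)
    return ExtractOverhead + VF * BranchCost; // one branch per lane
  if (IsLatchOrScalar)
    return BranchCost; // the back-edge branch (and VF == 1 branches) remain
  return 0; // eliminated by if-conversion
}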
case Instruction::PHI: {
auto *Phi = cast<PHINode>(I);
@@ -6878,8 +7363,16 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
VectorTy, VF - 1, VectorTy);
- // TODO: IF-converted IFs become selects.
- return 0;
+ // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
+ // converted into select instructions. We require N - 1 selects per phi
+ // node, where N is the number of incoming values.
+ if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
+ return (Phi->getNumIncomingValues() - 1) *
+ TTI.getCmpSelInstrCost(
+ Instruction::Select, ToVectorTy(Phi->getType(), VF),
+ ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
+
+ return TTI.getCFInstrCost(Instruction::PHI);
}
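The phi costing is simple arithmetic: a non-header phi with N incoming values lowers to N - 1 vector selects, so a phi merging three if-converted paths costs two selects of the vectorized type.

// SelectCost is a hypothetical stand-in for TTI.getCmpSelInstrCost on the
// vectorized phi type with an i1 vector condition.
unsigned ifConvertedPhiCost(unsigned NumIncomingValues, unsigned SelectCost) {
  return (NumIncomingValues - 1) * SelectCost;
}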
case Instruction::UDiv:
case Instruction::SDiv:
@@ -6910,6 +7403,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// likely.
return Cost / getReciprocalPredBlockProb();
}
+ LLVM_FALLTHROUGH;
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -6957,9 +7451,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
} else if (Legal->isUniform(Op2)) {
Op2VK = TargetTransformInfo::OK_UniformValue;
}
- SmallVector<const Value *, 4> Operands(I->operand_values());
- return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
- Op2VK, Op1VP, Op2VP, Operands);
+ SmallVector<const Value *, 4> Operands(I->operand_values());
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands);
}
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(I);
@@ -6969,7 +7464,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6978,130 +7473,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
}
case Instruction::Store:
case Instruction::Load: {
- StoreInst *SI = dyn_cast<StoreInst>(I);
- LoadInst *LI = dyn_cast<LoadInst>(I);
- Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
- VectorTy = ToVectorTy(ValTy, VF);
-
- unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
- unsigned AS =
- SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
- Value *Ptr = getPointerOperand(I);
- // We add the cost of address computation here instead of with the gep
- // instruction because only here we know whether the operation is
- // scalarized.
- if (VF == 1)
- return TTI.getAddressComputationCost(VectorTy) +
- TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
- if (LI && Legal->isUniform(Ptr)) {
- // Scalar load + broadcast
- unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
- Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
- return Cost +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
- }
-
- // For an interleaved access, calculate the total cost of the whole
- // interleave group.
- if (Legal->isAccessInterleaved(I)) {
- auto Group = Legal->getInterleavedAccessGroup(I);
- assert(Group && "Fail to get an interleaved access group.");
-
- // Only calculate the cost once at the insert position.
- if (Group->getInsertPos() != I)
- return 0;
-
- unsigned InterleaveFactor = Group->getFactor();
- Type *WideVecTy =
- VectorType::get(VectorTy->getVectorElementType(),
- VectorTy->getVectorNumElements() * InterleaveFactor);
-
- // Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it doesn't allow gaps.
- SmallVector<unsigned, 4> Indices;
- if (LI) {
- for (unsigned i = 0; i < InterleaveFactor; i++)
- if (Group->getMember(i))
- Indices.push_back(i);
- }
-
- // Calculate the cost of the whole interleaved group.
- unsigned Cost = TTI.getInterleavedMemoryOpCost(
- I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
- Group->getAlignment(), AS);
-
- if (Group->isReverse())
- Cost +=
- Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
-
- // FIXME: The interleaved load group with a huge gap could be even more
- // expensive than scalar operations. Then we could ignore such group and
- // use scalar operations instead.
- return Cost;
+ unsigned Width = VF;
+ if (Width > 1) {
+ InstWidening Decision = getWideningDecision(I, Width);
+ assert(Decision != CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == CM_Scalarize)
+ Width = 1;
}
-
- // Check if the memory instruction will be scalarized.
- if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
- unsigned Cost = 0;
- Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
- // Figure out whether the access is strided and get the stride value
- // if it's known in compile time
- const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
-
- // Get the cost of the scalar memory instruction and address computation.
- Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
- Cost += VF *
- TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
-
- // Get the overhead of the extractelement and insertelement instructions
- // we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF, TTI);
-
- // If we have a predicated store, it may not be executed for each vector
- // lane. Scale the cost by the probability of executing the predicated
- // block.
- if (Legal->isScalarWithPredication(I))
- Cost /= getReciprocalPredBlockProb();
-
- return Cost;
- }
-
- // Determine if the pointer operand of the access is either consecutive or
- // reverse consecutive.
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- bool Reverse = ConsecutiveStride < 0;
-
- // Determine if either a gather or scatter operation is legal.
- bool UseGatherOrScatter =
- !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
-
- unsigned Cost = TTI.getAddressComputationCost(VectorTy);
- if (UseGatherOrScatter) {
- assert(ConsecutiveStride == 0 &&
- "Gather/Scatter are not used for consecutive stride");
- return Cost +
- TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
- Legal->isMaskRequired(I), Alignment);
- }
- // Wide load/stores.
- if (Legal->isMaskRequired(I))
- Cost +=
- TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
- else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
- if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- return Cost;
+ VectorTy = ToVectorTy(getMemInstValueType(I), Width);
+ return getMemoryInstructionCost(I, VF);
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -7115,15 +7500,18 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast: {
- // We optimize the truncation of induction variable.
- // The cost of these is the same as the scalar operation.
- if (I->getOpcode() == Instruction::Trunc &&
- Legal->isInductionVariable(I->getOperand(0)))
- return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
- I->getOperand(0)->getType());
+ // We optimize the truncation of induction variables having constant
+ // integer steps. The cost of these truncations is the same as the scalar
+ // operation.
+ if (isOptimizableIVTruncate(I, VF)) {
+ auto *Trunc = cast<TruncInst>(I);
+ return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+ Trunc->getSrcTy(), Trunc);
+ }
Type *SrcScalarTy = I->getOperand(0)->getType();
- Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
+ Type *SrcVecTy =
+ VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
if (canTruncateToMinimalBitwidth(I, VF)) {
// This cast is going to be shrunk. This may remove the cast or it might
// turn it into slightly different cast. For example, if MinBW == 16,
@@ -7143,7 +7531,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
}
- return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
+ return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -7172,9 +7561,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
@@ -7206,81 +7593,109 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
-
- // Insert values known to be scalar into VecValuesToIgnore. This is a
- // conservative estimation of the values that will later be scalarized.
- //
- // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
- // still be scalarized. For example, we may find an instruction to be
- // more profitable for a given vectorization factor if it were to be
- // scalarized. But at this point, we haven't yet computed the
- // vectorization factor.
- for (auto *BB : TheLoop->getBlocks())
- for (auto &I : *BB)
- if (Legal->isScalarAfterVectorization(&I))
- VecValuesToIgnore.insert(&I);
}
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
- bool IfPredicateInstr) {
- assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
- // Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<VectorParts, 4> Params;
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
- setDebugLocFromInst(Builder, Instr);
+  // Width 1 means no vectorization; cost 0 means the cost was not computed.
+ const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
+ 0U};
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+ if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
+ return NoVectorization;
- // Does this instruction return a value ?
- bool IsVoidRetTy = Instr->getType()->isVoidTy();
+ if (UserVF) {
+ DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
+ CM.selectUserVectorizationFactor(UserVF);
+ return {UserVF, 0};
+ }
- // Initialize a new scalar map entry.
- ScalarParts Entry(UF);
+ unsigned MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF != 0 && "MaxVF is zero.");
+ if (MaxVF == 1)
+ return NoVectorization;
- VectorParts Cond;
- if (IfPredicateInstr)
- Cond = createBlockInMask(Instr->getParent());
+ // Select the optimal vectorization factor.
+ return CM.selectVectorizationFactor(MaxVF);
+}
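A standalone sketch of the decision ladder in plan(), with std::optional standing in for LLVM's Optional, a plain struct standing in for VectorizationFactor, and the cost-model calls elided:

#include <optional>

struct VFModel { unsigned Width, Cost; };

// Width 1 means "do not vectorize"; Cost 0 means the cost was not computed.
VFModel planModel(std::optional<unsigned> MaybeMaxVF, unsigned UserVF) {
  const VFModel NoVectorization = {1, 0};
  if (!MaybeMaxVF) // considered too costly to vectorize at all
    return NoVectorization;
  if (UserVF) // the user forced a width; its cost stays uncomputed
    return {UserVF, 0};
  if (*MaybeMaxVF == 1)
    return NoVectorization;
  // Stands in for CM.selectVectorizationFactor(MaxVF).
  return {*MaybeMaxVF, /*Cost=*/0};
}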
- // For each vector unroll 'part':
- for (unsigned Part = 0; Part < UF; ++Part) {
- Entry[Part].resize(1);
- // For each scalar that we create:
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) {
+ // Perform the actual loop transformation.
- // Start an "if (pred) a[i] = ..." block.
- Value *Cmp = nullptr;
- if (IfPredicateInstr) {
- if (Cond[Part]->getType()->isVectorTy())
- Cond[Part] =
- Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
- Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
- ConstantInt::get(Cond[Part]->getType(), 1));
- }
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ ILV.createVectorizedLoopSkeleton();
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
+ //===------------------------------------------------===//
+ //
+  // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
- Cloned->setOperand(op, NewOp);
- }
+ // 2. Copy and widen instructions from the old loop into the new loop.
+
+ // Move instructions to handle first-order recurrences.
+ DenseMap<Instruction *, Instruction *> SinkAfter = Legal->getSinkAfter();
+ for (auto &Entry : SinkAfter) {
+ Entry.first->removeFromParent();
+ Entry.first->insertAfter(Entry.second);
+ DEBUG(dbgs() << "Sinking" << *Entry.first << " after" << *Entry.second
+ << " to vectorize a 1st order recurrence.\n");
+ }
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
- // Add the cloned scalar to the scalar map entry.
- Entry[Part][0] = Cloned;
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
- // If we just cloned a new assumption, add it the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
+ // Vectorize all instructions in the original loop that will not become
+ // trivially dead when vectorized.
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+ for (Instruction &I : *BB)
+ if (!DeadInstructions.count(&I))
+ ILV.vectorizeInstruction(I);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
+}
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+ // We create new control-flow for the vectorized loop, so the original
+ // condition will be dead after vectorization if it's only used by the
+ // branch.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && Cmp->hasOneUse())
+ DeadInstructions.insert(Cmp);
+
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
+ for (auto &Induction : *Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
}
- VectorLoopValueMap.initScalar(Instr, Entry);
}
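The induction-update test above can be modeled over plain node ids: an update is trivially dead exactly when every user is either the induction phi itself or already known to be dead. Ids and the user lists are hypothetical.

#include <set>
#include <vector>

bool inductionUpdateIsDead(int IndPhi, const std::vector<int> &UpdateUsers,
                           const std::set<int> &DeadInstructions) {
  for (int User : UpdateUsers)
    if (User != IndPhi && !DeadInstructions.count(User))
      return false; // a live user other than the phi keeps the update alive
  return true;
}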
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
@@ -7384,24 +7799,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Check the loop for a trip count threshold:
- // do not vectorize loops with a tiny trip count.
- const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
- if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
- DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is not worth vectorizing.");
- if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
- DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
- else {
- DEBUG(dbgs() << "\n");
- ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(),
- "NotBeneficial", L)
- << "vectorization is not beneficial "
- "and is not explicitly forced");
- return false;
- }
- }
-
PredicatedScalarEvolution PSE(*SE, *L);
// Check if it is legal to vectorize the loop.
@@ -7414,26 +7811,37 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Use the cost model.
- LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints);
- CM.collectValuesToIgnore();
-
// Check the function attributes to find out if this function should be
// optimized for size.
bool OptForSize =
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
- // Compute the weighted frequency of this loop being executed and see if it
- // is less than 20% of the function entry baseline frequency. Note that we
- // always have a canonical loop here because we think we *can* vectorize.
- // FIXME: This is hidden behind a flag due to pervasive problems with
- // exactly what block frequency models.
- if (LoopVectorizeWithBlockFrequency) {
- BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- LoopEntryFreq < ColdEntryFreq)
+ // Check the loop for a trip count threshold: vectorize loops with a tiny trip
+ // count by optimizing for size, to minimize overheads.
+ unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
+ bool HasExpectedTC = (ExpectedTC > 0);
+
+ if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
+ auto EstimatedTC = getLoopEstimatedTripCount(L);
+ if (EstimatedTC) {
+ ExpectedTC = *EstimatedTC;
+ HasExpectedTC = true;
+ }
+ }
+
+ if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
+ DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
+ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+ DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ else {
+ DEBUG(dbgs() << "\n");
+      // Loops with a very small trip count are considered for vectorization
+      // under OptForSize, which keeps the cost of their loop body dominant
+      // and free of runtime guards and scalar iteration overheads.
OptForSize = true;
+ }
}
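The new control flow amounts to: a known or estimated tiny trip count no longer rejects the loop outright, it only switches the loop into size-optimized vectorization unless the user forced vectorization. A condensed model of that decision:

// AlreadyOpt reflects the function-attribute check earlier in processLoop.
bool optForSizeModel(bool AlreadyOpt, bool ForceEnabled, bool HasExpectedTC,
                     unsigned ExpectedTC, unsigned TinyThreshold) {
  if (HasExpectedTC && ExpectedTC < TinyThreshold && !ForceEnabled)
    return true; // vectorize, but only under the size-optimized cost rules
  return AlreadyOpt;
}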
// Check the function attributes to see if implicit floats are allowed.
@@ -7464,9 +7872,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}
- // Select the optimal vectorization factor.
- const LoopVectorizationCostModel::VectorizationFactor VF =
- CM.selectVectorizationFactor(OptForSize);
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints);
+ CM.collectValuesToIgnore();
+
+ // Use the planner for vectorization.
+ LoopVectorizationPlanner LVP(L, LI, &LVL, CM);
+
+ // Get user vectorization factor.
+ unsigned UserVF = Hints.getWidth();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ LoopVectorizationCostModel::VectorizationFactor VF =
+ LVP.plan(OptForSize, UserVF);
// Select the interleave count.
unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
@@ -7522,10 +7941,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
// Do not vectorize or interleaving the loop.
- ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< VecDiagMsg.second);
- ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
L->getStartLoc(), L->getHeader())
<< IntDiagMsg.second);
return false;
@@ -7553,7 +7972,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
&CM);
- Unroller.vectorize();
+ LVP.executePlan(Unroller);
ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
L->getHeader())
@@ -7563,7 +7982,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// If we decided that it is *legal* to vectorize the loop, then do it.
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
&LVL, &CM);
- LB.vectorize();
+ LVP.executePlan(LB);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there are
@@ -7606,11 +8025,6 @@ bool LoopVectorizePass::runImpl(
DB = &DB_;
ORE = &ORE_;
- // Compute some weights outside of the loop over the loops. Compute this
- // using a BranchProbability to re-use its scaling math.
- const BranchProbability ColdProb(1, 5); // 20%
- ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
-
// Don't attempt if
// 1. the target claims to have no vector registers, and
// 2. interleaving won't help ILP.
@@ -7621,6 +8035,16 @@ bool LoopVectorizePass::runImpl(
if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2)
return false;
+ bool Changed = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+  // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |= simplifyLoop(L, DT, LI, SE, AC, false /* PreserveLCSSA */);
+
// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
@@ -7632,9 +8056,15 @@ bool LoopVectorizePass::runImpl(
LoopsAnalyzed += Worklist.size();
// Now walk the identified inner loops.
- bool Changed = false;
- while (!Worklist.empty())
- Changed |= processLoop(Worklist.pop_back_val());
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+
+ // For the inner loops we actually process, form LCSSA to simplify the
+ // transform.
+ Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= processLoop(L);
+ }
// Process each loop nest in the function.
return Changed;
diff --git a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 328f270..dcbcab4 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -39,7 +39,10 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -90,6 +93,10 @@ static cl::opt<unsigned> MinTreeSize(
"slp-min-tree-size", cl::init(3), cl::Hidden,
cl::desc("Only vectorize small trees if they are fully vectorizable"));
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
+
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;
@@ -166,6 +173,11 @@ static unsigned getAltOpcode(unsigned Op) {
}
}
+/// \returns true if \p Value is odd, false otherwise.
+static bool isOdd(unsigned Value) {
+ return Value & 1;
+}
+
///\returns true if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
@@ -183,7 +195,7 @@ static unsigned isAltInst(ArrayRef<Value *> VL) {
unsigned AltOpcode = getAltOpcode(Opcode);
for (int i = 1, e = VL.size(); i < e; i++) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+ if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode))
return 0;
}
return Instruction::ShuffleVector;
@@ -207,23 +219,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
- // Intersection is initialized to the 0th scalar,
- // so start counting from index '1'.
- for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- Intersection->andIRFlags(Scalar);
- }
- VecOp->copyIRFlags(Intersection);
- }
- }
-}
-
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
@@ -269,6 +264,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
return (CI->getArgOperand(1) == Scalar);
}
+ LLVM_FALLTHROUGH;
}
default:
return false;
@@ -304,14 +300,16 @@ public:
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
+ typedef MapVector<Value *, SmallVector<Instruction *, 2>>
+ ExtraValueToDebugLocsMap;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL)
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
- DL(DL), Builder(Se->getContext()) {
+ DL(DL), ORE(ORE), Builder(Se->getContext()) {
CodeMetrics::collectEphemeralValues(F, AC, EphValues);
// Use the vector register size specified by the target unless overridden
// by a command-line option.
@@ -324,12 +322,19 @@ public:
else
MaxVecRegSize = TTI->getRegisterBitWidth(true);
- MinVecRegSize = MinVectorRegSizeOption;
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
}
/// \brief Vectorize the tree that starts with the elements in \p VL.
/// Returns the vectorized root.
Value *vectorizeTree();
+ /// Vectorize the tree but with the list of externally used values \p
+  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+  /// generated extractvalue instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
@@ -343,6 +348,13 @@ public:
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst = None);
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+  /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
+  /// into account (and updating it, if required) the list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
/// Clear the internal data structures that are created by 'buildTree'.
void deleteTree() {
@@ -359,6 +371,8 @@ public:
MinBWs.clear();
}
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
/// \brief Perform LICM and CSE on the newly generated gather sequences.
void optimizeGatherSequence();
@@ -397,6 +411,8 @@ public:
/// vectorizable. We do not vectorize such trees.
bool isTreeTinyAndNotFullyVectorizable();
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
private:
struct TreeEntry;
@@ -404,7 +420,7 @@ private:
int getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
/// \returns True if the ExtractElement/ExtractValue instructions in VL can
/// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
@@ -418,7 +434,7 @@ private:
/// \returns the pointer to the vectorized value if \p VL is already
/// vectorized, or NULL. They may happen in cycles.
- Value *alreadyVectorized(ArrayRef<Value *> VL) const;
+ Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
@@ -451,8 +467,9 @@ private:
SmallVectorImpl<Value *> &Left,
SmallVectorImpl<Value *> &Right);
struct TreeEntry {
- TreeEntry() : Scalars(), VectorizedValue(nullptr),
- NeedToGather(0) {}
+ TreeEntry(std::vector<TreeEntry> &Container)
+ : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
+ Container(Container) {}
/// \returns true if the scalars in VL are equal to this entry.
bool isSame(ArrayRef<Value *> VL) const {
@@ -468,23 +485,40 @@ private:
/// Do we need to gather this sequence ?
bool NeedToGather;
+
+ /// Points back to the VectorizableTree.
+ ///
+  /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ std::vector<TreeEntry> &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+  /// have multiple users, so the data structure is not truly a tree.
+ SmallVector<int, 1> UserTreeIndices;
};
/// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
- VectorizableTree.emplace_back();
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+ int &UserTreeIdx) {
+ VectorizableTree.emplace_back(VectorizableTree);
int idx = VectorizableTree.size() - 1;
TreeEntry *Last = &VectorizableTree[idx];
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->NeedToGather = !Vectorized;
if (Vectorized) {
for (int i = 0, e = VL.size(); i != e; ++i) {
- assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
+ assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
ScalarToTreeEntry[VL[i]] = idx;
}
} else {
MustGather.insert(VL.begin(), VL.end());
}
+
+ if (UserTreeIdx >= 0)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+ UserTreeIdx = idx;
return Last;
}
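The in/out UserTreeIdx parameter does two jobs here: it records which entry uses the new one, and it then becomes the parent index for the operand bundles built next. A stripped-down model over plain indices:

#include <vector>

struct EntryModel {
  std::vector<int> UserTreeIndices; // entries that use this one
};

// Mirrors newTreeEntry: append an entry, link it to its user (if any), and
// make it the current user for subsequently built operand bundles.
int newTreeEntryModel(std::vector<EntryModel> &Tree, int &UserTreeIdx) {
  Tree.emplace_back();
  int Idx = static_cast<int>(Tree.size()) - 1;
  if (UserTreeIdx >= 0)
    Tree[Idx].UserTreeIndices.push_back(UserTreeIdx);
  UserTreeIdx = Idx; // children built after this point link back here
  return Idx;
}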
@@ -492,6 +526,20 @@ private:
/// Holds all of the tree entries.
std::vector<TreeEntry> VectorizableTree;
+ TreeEntry *getTreeEntry(Value *V) {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return &VectorizableTree[I->second];
+ return nullptr;
+ }
+
+ const TreeEntry *getTreeEntry(Value *V) const {
+ auto I = ScalarToTreeEntry.find(V);
+ if (I != ScalarToTreeEntry.end())
+ return &VectorizableTree[I->second];
+ return nullptr;
+ }
+
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, int> ScalarToTreeEntry;
@@ -550,15 +598,17 @@ private:
void eraseInstruction(Instruction *I) {
I->removeFromParent();
I->dropAllReferences();
- DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
+ DeletedInstructions.emplace_back(I);
}
/// Temporary store for deleted instructions. Instructions will be deleted
/// eventually when the BoUpSLP is destructed.
- SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
+ SmallVector<unique_value, 8> DeletedInstructions;
/// A list of values that need to extracted out of the tree.
- /// This list holds pairs of (Internal Scalar : External User).
+ /// This list holds pairs of (Internal Scalar : External User). External User
+  /// can be nullptr, meaning that this Internal Scalar will be used later,
+ /// after vectorization.
UserList ExternalUses;
/// Values used only by @llvm.assume calls.
@@ -706,6 +756,8 @@ private:
return os;
}
#endif
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
/// Contains all scheduling data for a basic block.
///
@@ -805,10 +857,10 @@ private:
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
+ bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
/// Un-bundles a group of instructions.
- void cancelScheduling(ArrayRef<Value *> VL);
+ void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
/// Extends the scheduling region so that V is inside the region.
/// \returns true if the region size is within the limit.
@@ -904,6 +956,8 @@ private:
AssumptionCache *AC;
DemandedBits *DB;
const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
/// Instruction builder to construct the vectorized tree.
@@ -916,30 +970,119 @@ private:
/// original width.
MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
};
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ typedef TreeEntry *NodeRef;
+
+  /// \brief Wrap the index iterator with a reference to the VectorizableTree
+  /// so that dereferencing can return TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<ChildIteratorType,
+ SmallVector<int, 1>::iterator> {
+
+ std::vector<TreeEntry> &VectorizableTree;
+
+ ChildIteratorType(SmallVector<int, 1>::iterator W,
+ std::vector<TreeEntry> &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return &VectorizableTree[*I]; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ typedef BoUpSLP::TreeEntry TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
+ if (std::any_of(
+ R->ExternalUses.begin(), R->ExternalUses.end(),
+ [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
} // end namespace llvm
-} // end namespace slpvectorizer
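With GraphTraits and DOTGraphTraits specialized, the SLP tree can be rendered through LLVM's GraphWriter. A plausible call site, gated on the -view-slp-tree flag introduced above, might look like the following sketch (the exact placement is an assumption, not shown in this hunk):

// Somewhere in a BoUpSLP member function, once the tree has been built;
// F is the BoUpSLP's Function pointer.
if (ViewSLPTree)
  ViewGraph(this, "SLP" + F->getName());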
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
deleteTree();
UserIgnoreList = UserIgnoreLst;
if (!allSameType(Roots))
return;
- buildTree_rec(Roots, 0);
+ buildTree_rec(Roots, 0, -1);
// Collect the values that we need to extract from the tree.
for (TreeEntry &EIdx : VectorizableTree) {
TreeEntry *Entry = &EIdx;
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
+
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- // No need to handle users of gathered values.
- if (Entry->NeedToGather)
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
+ Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, Lane);
continue;
-
+ }
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -948,9 +1091,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
continue;
// Skip in-tree scalars that become vectors
- if (ScalarToTreeEntry.count(U)) {
- int Idx = ScalarToTreeEntry[U];
- TreeEntry *UseEntry = &VectorizableTree[Idx];
+ if (TreeEntry *UseEntry = getTreeEntry(U)) {
Value *UseScalar = UseEntry->Scalars[0];
// Some in-tree scalars will remain as scalar in vectorized
// instructions. If that is the case, the one in Lane 0 will
@@ -959,7 +1100,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
- assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+ assert(!UseEntry->NeedToGather && "Bad state");
continue;
}
}
@@ -976,28 +1117,28 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
}
}
-
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ int UserTreeIdx) {
bool isAltShuffle = false;
assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
if (Depth == RecursionMaxDepth) {
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
// Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
if (SI->getValueOperand()->getType()->isVectorTy()) {
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
unsigned Opcode = getSameOpcode(VL);
@@ -1014,7 +1155,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// If all of the operands are identical or constant we have a simple solution.
if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1026,23 +1167,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (EphValues.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is ephemeral.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
// Check if this is a duplicate of another entry.
- if (ScalarToTreeEntry.count(VL[0])) {
- int Idx = ScalarToTreeEntry[VL[0]];
- TreeEntry *E = &VectorizableTree[Idx];
+ if (TreeEntry *E = getTreeEntry(VL[0])) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
if (E->Scalars[i] != VL[i]) {
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
+    // Record the reuse of the tree node. FIXME: currently this is only used
+    // to draw the graph properly rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
return;
}
@@ -1052,7 +1194,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (ScalarToTreeEntry.count(VL[i])) {
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
") is already in tree.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1062,7 +1204,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
if (MustGather.count(VL[i])) {
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1070,13 +1212,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Check that all of the users of the scalars that we want to vectorize are
// schedulable.
Instruction *VL0 = cast<Instruction>(VL[0]);
- BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+ BasicBlock *BB = VL0->getParent();
if (!DT->isReachableFromEntry(BB)) {
// Don't go into unreachable blocks. They may contain instructions with
// dependency cycles which confuse the final scheduling.
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1085,7 +1227,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = i+1; j < e; ++j)
if (VL[i] == VL[j]) {
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
@@ -1095,12 +1237,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
}
BlockScheduling &BS = *BSRef.get();
- if (!BS.tryScheduleBundle(VL, this)) {
+ if (!BS.tryScheduleBundle(VL, this, VL0)) {
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
assert((!BS.getScheduleData(VL[0]) ||
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
"tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, false);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1116,13 +1258,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
if (Term) {
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1132,7 +1274,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
PH->getIncomingBlock(i)));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1142,9 +1284,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Reuse) {
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
} else {
- BS.cancelScheduling(VL);
+ BS.cancelScheduling(VL, VL0);
}
- newTreeEntry(VL, Reuse);
+ newTreeEntry(VL, Reuse, UserTreeIdx);
return;
}
case Instruction::Load: {
@@ -1159,8 +1301,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
return;
}
@@ -1170,8 +1312,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
LoadInst *L = cast<LoadInst>(VL[i]);
if (!L->isSimple()) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
return;
}
@@ -1193,7 +1335,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (Consecutive) {
++NumLoadsWantToKeepOrder;
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
return;
}
@@ -1207,8 +1349,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
break;
}
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
if (ReverseConsecutive) {
++NumLoadsWantToChangeOrder;
@@ -1234,13 +1376,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned i = 0; i < VL.size(); ++i) {
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
if (Ty != SrcTy || !isValidElementType(Ty)) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1249,7 +1391,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1262,14 +1404,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CmpInst *Cmp = cast<CmpInst>(VL[i]);
if (Cmp->getPredicate() != P0 ||
Cmp->getOperand(0)->getType() != ComparedTy) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1278,7 +1420,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1301,7 +1443,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
// Sort operands of the instructions so that each side is more likely to
@@ -1309,8 +1451,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1320,7 +1462,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth+1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1329,8 +1471,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (unsigned j = 0; j < VL.size(); ++j) {
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1342,8 +1484,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
if (Ty0 != CurTy) {
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
@@ -1354,13 +1496,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (!isa<ConstantInt>(Op)) {
DEBUG(
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
for (unsigned i = 0, e = 2; i < e; ++i) {
ValueList Operands;
@@ -1368,7 +1510,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1376,20 +1518,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// Check if the stores are consecutive or if we need to swizzle them.
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
ValueList Operands;
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(0));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
return;
}
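For context, a sketch of the store shape this case accepts; pointers and values are illustrative, not taken from the patch:

// Kept as one bundle: stores at consecutive addresses p[0..3].
//   store i32 %a, i32* %p0   ; %p1 = gep %p, 1, etc.
//   store i32 %b, i32* %p1   ; isConsecutiveAccess(VL[i], VL[i+1], DL, SE)
//   store i32 %c, i32* %p2   ; holds for every adjacent pair
//   store i32 %d, i32* %p3
// Any gap or swapped pair fails the check above, and the bundle is
// gathered through newTreeEntry(VL, false, UserTreeIdx) instead.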
case Instruction::Call: {
@@ -1399,8 +1541,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
if (!isTriviallyVectorizable(ID)) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
@@ -1413,8 +1555,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (!CI2 || CI2->getCalledFunction() != Int ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
<< "\n");
return;
@@ -1424,8 +1566,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
Value *A1J = CI2->getArgOperand(1);
if (A1I != A1J) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
<< " argument "<< A1I<<"!=" << A1J
<< "\n");
@@ -1437,15 +1579,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
!std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
CI->op_begin() + CI->getBundleOperandsEndIndex(),
CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
<< *VL[i] << '\n');
return;
}
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
ValueList Operands;
// Prepare the operand vector.
@@ -1453,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
CallInst *CI2 = dyn_cast<CallInst>(j);
Operands.push_back(CI2->getArgOperand(i));
}
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
@@ -1461,20 +1603,20 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
// If this is not an alternate sequence of opcodes like add-sub
// then do not vectorize this instruction.
if (!isAltShuffle) {
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return;
}
- newTreeEntry(VL, true);
+ newTreeEntry(VL, true, UserTreeIdx);
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
// Reorder operands if reordering would enable vectorization.
if (isa<BinaryOperator>(VL0)) {
ValueList Left, Right;
reorderAltShuffleOperands(VL, Left, Right);
- buildTree_rec(Left, Depth + 1);
- buildTree_rec(Right, Depth + 1);
+ buildTree_rec(Left, Depth + 1, UserTreeIdx);
+ buildTree_rec(Right, Depth + 1, UserTreeIdx);
return;
}
@@ -1484,13 +1626,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
for (Value *j : VL)
Operands.push_back(cast<Instruction>(j)->getOperand(i));
- buildTree_rec(Operands, Depth + 1);
+ buildTree_rec(Operands, Depth + 1, UserTreeIdx);
}
return;
}
default:
- BS.cancelScheduling(VL);
- newTreeEntry(VL, false);
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, false, UserTreeIdx);
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
return;
}
@@ -1570,6 +1712,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// If we have computed a smaller type for the expression, update VecTy so
@@ -1599,7 +1743,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int DeadCost = 0;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *E = cast<Instruction>(VL[i]);
- if (E->hasOneUse())
+ // If all users of the instruction are going to be vectorized, the
+ // instruction can be considered dead.
+ // Likewise, if it has only one user, it will be vectorized for sure.
+ if (E->hasOneUse() ||
+ std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+ return ScalarToTreeEntry.count(U) > 0;
+ }))
// Take credit for instruction that will become dead.
DeadCost +=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
@@ -1624,10 +1774,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy);
+ VL0->getType(), SrcTy, VL0);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
@@ -1636,8 +1786,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+ int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::Add:
@@ -1695,11 +1845,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
CInt->getValue().isPowerOf2())
Op2VP = TargetTransformInfo::OP_PowerOf2;
- int ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK,
- Op2VK, Op1VP, Op2VP);
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ int ScalarCost =
+ VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+ Op2VP, Operands);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
- Op1VP, Op2VP);
+ Op1VP, Op2VP, Operands);
return VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -1720,18 +1872,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Cost of wide load - cost of scalar loads.
unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, alignment, 0);
+ VecTy, alignment, 0, VL0);
return VecStCost - ScalarStCost;
}
case Instruction::Call: {
@@ -1739,12 +1891,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type*, 4> ScalarTys, VecTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+ SmallVector<Type*, 4> ScalarTys;
+ for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
ScalarTys.push_back(CI->getArgOperand(op)->getType());
- VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
- VecTy->getNumElements()));
- }
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
@@ -1753,7 +1902,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
int ScalarCallCost = VecTy->getNumElements() *
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+ SmallVector<Value *, 4> Args(CI->arg_operands());
+ int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+ VecTy->getNumElements());
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -1861,7 +2012,7 @@ int BoUpSLP::getSpillCost() {
// Update LiveValues.
LiveValues.erase(PrevInst);
for (auto &J : PrevInst->operands()) {
- if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
+ if (isa<Instruction>(&*J) && getTreeEntry(&*J))
LiveValues.insert(cast<Instruction>(&*J));
}
@@ -1947,9 +2098,18 @@ int BoUpSLP::getTreeCost() {
int SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
- << "SLP: Total Cost = " << Cost << ".\n");
+ std::string Str;
+ {
+ raw_string_ostream OS(Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ DEBUG(dbgs() << Str);
+
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
+
return Cost;
}
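The ViewSLPTree guard presumes a debugging flag declared elsewhere in the file; a plausible declaration, assumed rather than quoted from this hunk:

// Assumed cl::opt backing the ViewSLPTree guard above.
static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));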
@@ -2248,9 +2408,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
CSEBlocks.insert(Insrt->getParent());
// Add to our 'need-to-extract' list.
- if (ScalarToTreeEntry.count(VL[i])) {
- int Idx = ScalarToTreeEntry[VL[i]];
- TreeEntry *E = &VectorizableTree[Idx];
+ if (TreeEntry *E = getTreeEntry(VL[i])) {
// Find which lane we need to extract.
int FoundLane = -1;
for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
@@ -2269,12 +2427,8 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec;
}
-Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
- SmallDenseMap<Value*, int>::const_iterator Entry
- = ScalarToTreeEntry.find(VL[0]);
- if (Entry != ScalarToTreeEntry.end()) {
- int Idx = Entry->second;
- const TreeEntry *En = &VectorizableTree[Idx];
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
+ if (const TreeEntry *En = getTreeEntry(OpValue)) {
if (En->isSame(VL) && En->VectorizedValue)
return En->VectorizedValue;
}
@@ -2282,12 +2436,9 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
}
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- if (ScalarToTreeEntry.count(VL[0])) {
- int Idx = ScalarToTreeEntry[VL[0]];
- TreeEntry *E = &VectorizableTree[Idx];
+ if (TreeEntry *E = getTreeEntry(VL[0]))
if (E->isSame(VL))
return vectorizeTree(E);
- }
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -2402,7 +2553,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *InVec = vectorizeTree(INVL);
- if (Value *V = alreadyVectorized(E->Scalars))
+ if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
CastInst *CI = dyn_cast<CastInst>(VL0);
@@ -2424,7 +2575,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *L = vectorizeTree(LHSV);
Value *R = vectorizeTree(RHSV);
- if (Value *V = alreadyVectorized(E->Scalars))
+ if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
@@ -2453,7 +2604,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *True = vectorizeTree(TrueVec);
Value *False = vectorizeTree(FalseVec);
- if (Value *V = alreadyVectorized(E->Scalars))
+ if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
Value *V = Builder.CreateSelect(Cond, True, False);
@@ -2493,7 +2644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
- if (Value *V = alreadyVectorized(E->Scalars))
+ if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
@@ -2522,9 +2673,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
- if (ScalarToTreeEntry.count(LI->getPointerOperand()))
- ExternalUses.push_back(
- ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
+ Value *PO = LI->getPointerOperand();
+ if (getTreeEntry(PO))
+ ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
unsigned Alignment = LI->getAlignment();
LI = Builder.CreateLoad(VecPtr);
@@ -2555,9 +2706,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The pointer operand uses an in-tree scalar so we add the new BitCast to
// ExternalUses list to make sure that an extract will be generated in the
// future.
- if (ScalarToTreeEntry.count(SI->getPointerOperand()))
- ExternalUses.push_back(
- ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
+ Value *PO = SI->getPointerOperand();
+ if (getTreeEntry(PO))
+ ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
if (!Alignment) {
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
@@ -2638,7 +2789,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// The scalar argument uses an in-tree scalar so we add the new vectorized
// call to ExternalUses list to make sure that an extract will be
// generated in the future.
- if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
+ if (ScalarArg && getTreeEntry(ScalarArg))
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
E->VectorizedValue = V;
@@ -2655,7 +2806,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *LHS = vectorizeTree(LHSVL);
Value *RHS = vectorizeTree(RHSVL);
- if (Value *V = alreadyVectorized(E->Scalars))
+ if (Value *V = alreadyVectorized(E->Scalars, VL0))
return V;
// Create a vector of LHS op1 RHS
@@ -2674,7 +2825,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
unsigned e = E->Scalars.size();
SmallVector<Constant *, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
- if (i & 1) {
+ if (isOdd(i)) {
Mask[i] = Builder.getInt32(e + i);
OddScalars.push_back(E->Scalars[i]);
} else {
@@ -2702,6 +2853,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
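The new overload takes an ExtraValueToDebugLocsMap; a declaration consistent with its uses in this patch (the real typedef lives in the class body, so treat this shape as an assumption):

// Assumed shape: each externally used scalar maps to the instructions
// that consume it as an extra argument of a reduction.
using ExtraValueToDebugLocsMap =
    MapVector<Value *, SmallVector<Instruction *, 2>>;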
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// All blocks must be scheduled before any instructions are inserted.
for (auto &BSIter : BlocksSchedules) {
@@ -2744,18 +2901,38 @@ Value *BoUpSLP::vectorizeTree() {
// Skip users that we already RAUW. This happens when one instruction
// has multiple uses of the same value.
- if (!is_contained(Scalar->users(), User))
+ if (User && !is_contained(Scalar->users(), User))
continue;
- assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
-
- int Idx = ScalarToTreeEntry[Scalar];
- TreeEntry *E = &VectorizableTree[Idx];
+ TreeEntry *E = getTreeEntry(Scalar);
+ assert(E && "Invalid scalar");
assert(!E->NeedToGather && "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
+ // If User == nullptr, the Scalar is used as an extra argument. Generate
+ // an ExtractElement instruction and update the record for this scalar
+ // in ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ continue;
+ }
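To make the bookkeeping concrete, a hedged before/after for one scalar handled by the branch above:

// Before:  ExternallyUsedValues = { %scalar -> [use1, use2] }
// Emit:    %ex = extractelement <N x T> %vec, i32 Lane
// After:   ExternallyUsedValues = { %ex -> [use1, use2] }
// The extract now stands in for the scalar at every recorded use site.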
+
// Generate extracts for out-of-tree users.
// Find the insertion point for the extractelement lane.
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
@@ -2813,7 +2990,7 @@ Value *BoUpSLP::vectorizeTree() {
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
- assert((ScalarToTreeEntry.count(U) ||
+ assert((getTreeEntry(U) ||
// It is legal to replace users in the ignorelist by undef.
is_contained(UserIgnoreList, U)) &&
"Replacing out-of-tree value with undef");
@@ -2920,8 +3097,8 @@ void BoUpSLP::optimizeGatherSequence() {
// Groups the instructions into a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
- BoUpSLP *SLP) {
- if (isa<PHINode>(VL[0]))
+ BoUpSLP *SLP, Value *OpValue) {
+ if (isa<PHINode>(OpValue))
return true;
// Initialize the instruction bundle.
@@ -2929,7 +3106,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
ScheduleData *PrevInBundle = nullptr;
ScheduleData *Bundle = nullptr;
bool ReSchedule = false;
- DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
+ DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
// Make sure that the scheduling region contains all
// instructions of the bundle.
@@ -3000,17 +3177,18 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
}
}
if (!Bundle->isReady()) {
- cancelScheduling(VL);
+ cancelScheduling(VL, OpValue);
return false;
}
return true;
}
-void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
- if (isa<PHINode>(VL[0]))
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
+ Value *OpValue) {
+ if (isa<PHINode>(OpValue))
return;
- ScheduleData *Bundle = getScheduleData(VL[0]);
+ ScheduleData *Bundle = getScheduleData(OpValue);
DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
assert(!Bundle->IsScheduled &&
"Can't cancel bundle which is already scheduled");
@@ -3154,12 +3332,10 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled) {
+ if (!DestBundle->IsScheduled)
BundleMember->incrementUnscheduledDeps(1);
- }
- if (!DestBundle->hasValidDependencies()) {
+ if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
- }
}
} else {
// I'm not sure if this can ever happen. But we need to be safe.
@@ -3264,7 +3440,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// sorted by the original instruction location. This lets the final schedule
// be as close as possible to the original instruction order.
struct ScheduleDataCompare {
- bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
return SD2->SchedulingPriority < SD1->SchedulingPriority;
}
};
@@ -3278,7 +3454,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
I = I->getNextNode()) {
ScheduleData *SD = BS->getScheduleData(I);
assert(
- SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
+ SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
"scheduler and vectorizer have different opinion on what is a bundle");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
@@ -3527,10 +3703,8 @@ void BoUpSLP::computeMinimumValueSizes() {
// Determine if the sign bit of all the roots is known to be zero. If not,
// IsKnownPositive is set to False.
IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
- bool KnownZero = false;
- bool KnownOne = false;
- ComputeSignBit(R, KnownZero, KnownOne, *DL);
- return KnownZero;
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
});
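The KnownBits rewrite collapses the old two-flag ComputeSignBit protocol into a single query; for reference, the helper reduces to a sign-bit test (sketch of the KnownBits member, as understood from its header):

// A value is known non-negative iff its sign bit is known to be zero.
bool isNonNegative() const { return Zero.isSignBitSet(); }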
// Determine the maximum number of bits required to store the scalar
@@ -3610,8 +3784,9 @@ struct SLPVectorizer : public FunctionPass {
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -3623,6 +3798,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -3641,13 +3817,14 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
if (!Changed)
return PreservedAnalyses::all();
+
PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
+ PA.preserveSet<CFGAnalyses>();
PA.preserve<AAManager>();
PA.preserve<GlobalsAA>();
return PA;
@@ -3657,7 +3834,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_) {
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -3685,7 +3863,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// Use the bottom up slp vectorizer to construct chains that start with
// store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
@@ -3723,11 +3901,13 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
}
/// \brief Check that the Values in the slice of the VL array still exist in
-/// the WeakVH array.
+/// the WeakTrackingVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
-/// to become invalid. We track when this has happened in the WeakVH array.
-static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
- unsigned SliceBegin, unsigned SliceSize) {
+/// to become invalid. We track when this has happened in the WeakTrackingVH
+/// array.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
+ ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
+ unsigned SliceSize) {
VL = VL.slice(SliceBegin, SliceSize);
VH = VH.slice(SliceBegin, SliceSize);
return !std::equal(VL.begin(), VL.end(), VH.begin());
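The switch to WeakTrackingVH matters because the old WeakVH was split in two: WeakTrackingVH follows RAUW and nullifies on deletion, which is exactly what the slice comparison needs. A minimal illustration of those semantics:

// Hedged illustration of WeakTrackingVH behavior.
WeakTrackingVH H(V);          // H points at V
V->replaceAllUsesWith(NewV);  // H now tracks NewV
// If V is erased instead, H becomes null, so the std::equal check above
// reports the slice as stale and it is skipped.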
@@ -3745,7 +3925,7 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
return false;
// Keep track of values that were deleted by vectorizing in the loop below.
- SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+ SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
bool Changed = false;
// Look for profitable vectorizable trees at all offsets, starting at zero.
@@ -3772,6 +3952,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ using namespace ore;
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[i]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
R.vectorizeTree();
// Move to the next bundle.
@@ -3931,7 +4118,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool Changed = false;
// Keep track of values that were deleted by vectorizing in the loop below.
- SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
+ SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
@@ -3970,8 +4157,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
// reorder a larger list. In fact, we do exactly this when vectorizing
- // reductions. However, at this point, we only expect to get here from
- // tryToVectorizePair().
+ // reductions. However, at this point, we only expect to get here when
+ // there are exactly two operations.
assert(Ops.size() == 2);
assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
@@ -3985,6 +4172,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
if (Cost < -SLPCostThreshold) {
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
Value *VectorizedRoot = R.vectorizeTree();
// Reconstruct the build vector by extracting the vectorized root. This
@@ -4026,36 +4219,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
+ Value *P = V->getParent();
+
+ // Vectorize in the current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
// Try to vectorize V.
- if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ if (tryToVectorizePair(Op0, Op1, R))
return true;
- BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
- BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
if (B && B->hasOneUse()) {
- BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (tryToVectorizePair(A, B0, R)) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
return true;
- }
- if (tryToVectorizePair(A, B1, R)) {
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
return true;
- }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (tryToVectorizePair(A0, B, R)) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
return true;
- }
- if (tryToVectorizePair(A1, B, R)) {
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
return true;
- }
}
- return 0;
+ return false;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
@@ -4119,37 +4316,41 @@ namespace {
class HorizontalReduction {
SmallVector<Value *, 16> ReductionOps;
SmallVector<Value *, 32> ReducedVals;
+ // Use a MapVector to make the output order stable.
+ MapVector<Instruction *, Value *> ExtraArgs;
- BinaryOperator *ReductionRoot;
- // After successfull horizontal reduction vectorization attempt for PHI node
- // vectorizer tries to update root binary op by combining vectorized tree and
- // the ReductionPHI node. But during vectorization this ReductionPHI can be
- // vectorized itself and replaced by the undef value, while the instruction
- // itself is marked for deletion. This 'marked for deletion' PHI node then can
- // be used in new binary operation, causing "Use still stuck around after Def
- // is destroyed" crash upon PHI node deletion.
- WeakVH ReductionPHI;
+ BinaryOperator *ReductionRoot = nullptr;
/// The opcode of the reduction.
- unsigned ReductionOpcode;
+ Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
/// The opcode of the values we perform a reduction on.
- unsigned ReducedValueOpcode;
+ unsigned ReducedValueOpcode = 0;
/// Should we model this reduction as a pairwise reduction tree or a tree that
/// splits the vector in halves and adds those halves.
- bool IsPairwiseReduction;
+ bool IsPairwiseReduction = false;
+
+ /// Checks if the ParentStackElem.first should be marked as a reduction
+ /// operation with an extra argument or as an extra argument itself.
+ void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+ Value *ExtraArg) {
+ if (ExtraArgs.count(ParentStackElem.first)) {
+ ExtraArgs[ParentStackElem.first] = nullptr;
+ // We ran into something like:
+ // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+ // The whole ParentStackElem.first should be considered as an extra value
+ // in this case.
+ // Do not analyze the remaining operands of the ParentStackElem.first
+ // instruction; the whole instruction is an extra argument.
+ ParentStackElem.second = ParentStackElem.first->getNumOperands();
+ } else {
+ // We ran into something like:
+ // ParentStackElem.first += ... + ExtraArg + ...
+ ExtraArgs[ParentStackElem.first] = ExtraArg;
+ }
+ }
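Two concrete shapes markExtraArg distinguishes, written as hedged pseudo-IR:

// First extra argument seen for a node:
//   %sum = fadd fast float %red, %ext    ; ExtraArgs[%sum] = %ext
// A second extra argument for the same node:
//   %sum = fadd fast float %ext1, %ext2  ; ExtraArgs[%sum] = nullptr,
//                                        ; %sum itself becomes the extra
//                                        ; value; its operands are skipped.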
public:
- /// The width of one full horizontal reduction operation.
- unsigned ReduxWidth;
-
- /// Minimal width of available vector registers. It's used to determine
- /// ReduxWidth.
- unsigned MinVecRegSize;
-
- HorizontalReduction(unsigned MinVecRegSize)
- : ReductionRoot(nullptr), ReductionOpcode(0), ReducedValueOpcode(0),
- IsPairwiseReduction(false), ReduxWidth(0),
- MinVecRegSize(MinVecRegSize) {}
+ HorizontalReduction() = default;
/// \brief Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
@@ -4176,21 +4377,14 @@ public:
if (!isValidElementType(Ty))
return false;
- const DataLayout &DL = B->getModule()->getDataLayout();
ReductionOpcode = B->getOpcode();
ReducedValueOpcode = 0;
- // FIXME: Register size should be a parameter to this function, so we can
- // try different vectorization factors.
- ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
ReductionRoot = B;
- ReductionPHI = Phi;
-
- if (ReduxWidth < 4)
- return false;
// We currently only support adds.
- if (ReductionOpcode != Instruction::Add &&
- ReductionOpcode != Instruction::FAdd)
+ if ((ReductionOpcode != Instruction::Add &&
+ ReductionOpcode != Instruction::FAdd) ||
+ !B->isAssociative())
return false;
// Post order traverse the reduction tree starting at B. We only handle true
@@ -4202,30 +4396,26 @@ public:
unsigned EdgeToVist = Stack.back().second++;
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
- // Only handle trees in the current basic block.
- if (TreeN->getParent() != B->getParent())
- return false;
-
- // Each tree node needs to have one user except for the ultimate
- // reduction.
- if (!TreeN->hasOneUse() && TreeN != B)
- return false;
-
// Postorder visit.
if (EdgeToVist == 2 || IsReducedValue) {
- if (IsReducedValue) {
- // Make sure that the opcodes of the operations that we are going to
- // reduce match.
- if (!ReducedValueOpcode)
- ReducedValueOpcode = TreeN->getOpcode();
- else if (ReducedValueOpcode != TreeN->getOpcode())
- return false;
+ if (IsReducedValue)
ReducedVals.push_back(TreeN);
- } else {
- // We need to be able to reassociate the adds.
- if (!TreeN->isAssociative())
- return false;
- ReductionOps.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
+ ReductionOps.push_back(TreeN);
}
// Retract.
Stack.pop_back();
@@ -4242,13 +4432,44 @@ public:
// reduced value class.
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
I->getOpcode() == ReductionOpcode)) {
- if (!ReducedValueOpcode && I->getOpcode() != ReductionOpcode)
+ // Only handle trees in the current basic block.
+ if (I->getParent() != B->getParent()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ // Each tree node needs to have one user except for the ultimate
+ // reduction.
+ if (!I->hasOneUse() && I != B) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+
+ if (I->getOpcode() == ReductionOpcode) {
+ // We need to be able to reassociate the reduction operations.
+ if (!I->isAssociative()) {
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
+ } else if (ReducedValueOpcode &&
+ ReducedValueOpcode != I->getOpcode()) {
+ // Make sure that the opcodes of the operations that we are going to
+ // reduce match.
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ } else if (!ReducedValueOpcode)
ReducedValueOpcode = I->getOpcode();
+
Stack.push_back(std::make_pair(I, 0));
continue;
}
- return false;
}
+ // NextV is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), NextV);
}
return true;
}
@@ -4259,10 +4480,15 @@ public:
if (ReducedVals.empty())
return false;
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. We can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < ReduxWidth)
+ if (NumReducedVals < 4)
return false;
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+
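PowerOf2Floor rounds down to the nearest power of two, so the loop below retries at shrinking widths; a worked example under the new scheme:

// NumReducedVals = 9:
//   ReduxWidth = PowerOf2Floor(9) == 8 -> vectorize ReducedVals[0..7]
//   i = 8; ReduxWidth = PowerOf2Floor(9 - 8) == 1 -> ReduxWidth > 2
//   fails and the loop exits; the leftover scalar is folded in by the
//   "Finish the reduction" tail below.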
Value *VectorizedTree = nullptr;
IRBuilder<> Builder(ReductionRoot);
FastMathFlags Unsafe;
@@ -4270,55 +4496,78 @@ public:
Builder.setFastMathFlags(Unsafe);
unsigned i = 0;
- for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+ // The same extra argument may be used several times, so log each attempt
+ // to use it.
+ for (auto &Pair : ExtraArgs)
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ReductionOps);
+ V.buildTree(VL, ExternallyUsedValues, ReductionOps);
if (V.shouldReorder()) {
SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
- V.buildTree(Reversed, ReductionOps);
+ V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
}
if (V.isTreeTinyAndNotFullyVectorizable())
- continue;
+ break;
V.computeMinimumValueSizes();
// Estimate cost.
- int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+ int Cost =
+ V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
if (Cost >= -SLPCostThreshold)
break;
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
<< ". (HorRdx)\n");
+ auto *I0 = cast<Instruction>(VL[0]);
+ V.getORE()->emit(
+ OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize()));
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
// Emit a reduction.
- Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedSubTree, "bin.rdx");
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ ReducedSubTree, "bin.rdx");
+ propagateIRFlags(VectorizedTree, ReductionOps);
} else
VectorizedTree = ReducedSubTree;
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
if (VectorizedTree) {
// Finish the reduction.
for (; i < NumReducedVals; ++i) {
- Builder.SetCurrentDebugLocation(
- cast<Instruction>(ReducedVals[i])->getDebugLoc());
- VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
- ReducedVals[i]);
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree =
+ Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
+ propagateIRFlags(VectorizedTree, ReductionOps);
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ assert(!Pair.second.empty() &&
+ "At least one DebugLoc must be inserted");
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+ Pair.first, "bin.extra");
+ propagateIRFlags(VectorizedTree, I);
+ }
}
// Update users.
- if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
- assert(ReductionRoot && "Need a reduction operation");
- ReductionRoot->setOperand(0, VectorizedTree);
- ReductionRoot->setOperand(1, ReductionPHI);
- } else
- ReductionRoot->replaceAllUsesWith(VectorizedTree);
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
}
return VectorizedTree != nullptr;
}
@@ -4329,7 +4578,8 @@ public:
private:
/// \brief Calculate the cost of a reduction.
- int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+ int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
+ unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
@@ -4352,41 +4602,34 @@ private:
return VecReduxCost - ScalarReduxCost;
}
- static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
- Value *R, const Twine &Name = "") {
- if (Opcode == Instruction::FAdd)
- return Builder.CreateFAdd(L, R, Name);
- return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
- }
-
/// \brief Emit a horizontal reduction of the vectorized value.
- Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- if (IsPairwiseReduction) {
- Value *LeftMask =
+ Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
+ Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
- Value *LeftShuf = Builder.CreateShuffleVector(
+ Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
+ Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
- } else {
- Value *UpperHalf =
- createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
- }
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
+ propagateIRFlags(TmpVec, RedOps);
}
// The result is in the first element of the vector.
@@ -4438,16 +4681,19 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem,
static bool findBuildAggregate(InsertValueInst *IV,
SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
- if (!IV->hasOneUse())
- return false;
- Value *V = IV->getAggregateOperand();
- if (!isa<UndefValue>(V)) {
- InsertValueInst *I = dyn_cast<InsertValueInst>(V);
- if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+ Value *V;
+ do {
+ BuildVector.push_back(IV);
+ BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ V = IV->getAggregateOperand();
+ if (isa<UndefValue>(V))
+ break;
+ IV = dyn_cast<InsertValueInst>(V);
+ if (!IV || !IV->hasOneUse())
return false;
- }
- BuildVector.push_back(IV);
- BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+ } while (true);
+ std::reverse(BuildVector.begin(), BuildVector.end());
+ std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
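An illustrative chain the iterative rewrite accepts (names hypothetical):

// %a0 = insertvalue { float, float } undef, float %x, 0
// %a1 = insertvalue { float, float } %a0,   float %y, 1   ; IV on entry
// The loop collects {%a1, %a0} / {%y, %x} walking aggregate operands;
// the two std::reverse calls restore index order: {%a0, %a1} / {%x, %y}.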
@@ -4507,29 +4753,105 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
-/// \brief Attempt to reduce a horizontal reduction.
-/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators BI, then check if it
-/// can be done.
-/// \returns true if a horizontal reduction was matched and reduced.
-/// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI,
- unsigned MinRegSize) {
+/// Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding the phi node \a P
+/// with reduction operators \a Root (or one of its operands) in a basic block
+/// \a BB, then check if it can be done. If a horizontal reduction is not
+/// found and the root instruction is a binary operation, vectorization of
+/// the operands is attempted.
+/// \returns true if a horizontal reduction was matched and reduced, or the
+/// operands of one of the binary instructions were vectorized.
+/// \returns false if a horizontal reduction was not matched (or not possible),
+/// or no vectorization of any binary operation feeding the \a Root instruction
+/// was performed.
+static bool tryToVectorizeHorReductionOrInstOperands(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx(MinRegSize);
- if (!HorRdx.matchAssociativeReduction(P, BI))
+ if (!Root)
return false;
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ if (Root->getParent() != BB)
+ return false;
+ // Start the analysis from the Root instruction. If a horizontal reduction
+ // is found, try to vectorize it. If it is not a horizontal reduction, or
+ // vectorization is not possible or not effective, and the currently
+ // analyzed instruction is a binary operation, try to vectorize the
+ // operands, using pre-order DFS traversal order. If the operands were not
+ // vectorized, repeat the same procedure considering each operand as a
+ // possible root of the horizontal reduction.
+ // Interrupt the process if the Root instruction itself was vectorized or all
+ // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
+ SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
+ SmallSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Value *V;
+ unsigned Level;
+ std::tie(V, Level) = Stack.pop_back_val();
+ if (!V)
+ continue;
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || isa<PHINode>(Inst))
+ continue;
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
+ continue;
+ }
+
+ // Try to vectorize operands.
+ if (++Level < RecursionMaxDepth)
+ for (auto *Op : Inst->operand_values())
+ Stack.emplace_back(Op, Level);
+ }
+ return Res;
+}
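A hedged walk-through of the traversal for a small tree:

// Root: %r = fadd float %m, %d, where %m = fmul ..., %d = fdiv ...
// Stack starts as [(%r, 0)]. If no reduction is matched at %r and
// Vectorize(%r) fails, (%m, 1) and (%d, 1) are pushed, and each is
// retried as a potential reduction root until RecursionMaxDepth.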
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
- return HorRdx.tryToReduce(R, TTI);
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ return tryToVectorizeHorReductionOrInstOperands(
+ P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4571,7 +4893,13 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+ // The order in which the phi nodes appear in the program does not matter.
+ // So allow tryToVectorizeList to reorder them if it is beneficial. This
+ // is done when there are exactly two elements since tryToVectorizeList
+ // asserts that there are only two values when AllowReorder is true.
+ bool AllowReorder = NumElts == 2;
+ if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+ None, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -4599,67 +4927,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx = getReductionValue(DT, P, BB, LI);
-
- // Check if this is a Binary Operator.
- BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
- if (!BI)
- continue;
-
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
- Value *Inst = BI->getOperand(0);
- if (Inst == P)
- Inst = BI->getOperand(1);
-
- if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
-
continue;
}
- if (ShouldStartVectorizeHorAtStore)
- if (StoreInst *SI = dyn_cast<StoreInst>(it))
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorize(BinOp, R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ShouldStartVectorizeHorAtStore) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
- if (RI->getNumOperands() != 0)
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(RI->getOperand(0))) {
- DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
- R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+ if (RI->getNumOperands() != 0) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
@@ -4672,16 +4975,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- for (int i = 0; i < 2; ++i) {
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
- if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
+ for (int I = 0; I < 2; ++I) {
+ if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid.
+ it = BB->begin();
+ e = BB->end();
+ break;
}
}
continue;
@@ -4757,7 +5058,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
// Some of the candidates may have already been vectorized after we
- // initially collected them. If so, the WeakVHs will have nullified the
- // values, so remove them from the set of candidates.
+ // initially collected them. If so, the WeakTrackingVHs will have
+ // nullified the values, so remove them from the set of candidates.
Candidates.remove(nullptr);
@@ -4847,6 +5149,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
namespace llvm {
diff --git a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index 28e0b2e..fb2f509 100644
--- a/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -17,16 +17,15 @@
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/Vectorize.h"
#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
-#include "llvm/IR/LegacyPassManager.h"
using namespace llvm;
/// initializeVectorizationPasses - Initialize all passes linked into the
/// Vectorization library.
void llvm::initializeVectorization(PassRegistry &Registry) {
- initializeBBVectorizePass(Registry);
initializeLoopVectorizePass(Registry);
initializeSLPVectorizerPass(Registry);
initializeLoadStoreVectorizerPass(Registry);
@@ -36,8 +35,8 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
initializeVectorization(*unwrap(R));
}
+// DEPRECATED: Remove after the LLVM 5 release.
void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createBBVectorizePass());
}
void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
diff --git a/contrib/llvm/lib/XRay/CMakeLists.txt b/contrib/llvm/lib/XRay/CMakeLists.txt
deleted file mode 100644
index 6c1acba..0000000
--- a/contrib/llvm/lib/XRay/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-add_llvm_library(LLVMXRay
- Trace.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT
- ${LLVM_MAIN_INCLUDE_DIR}/llvm/XRay
-
- DEPENDS
- LLVMSupport
-
- LINK_LIBS
- LLVMSupport
- )
diff --git a/contrib/llvm/lib/XRay/InstrumentationMap.cpp b/contrib/llvm/lib/XRay/InstrumentationMap.cpp
new file mode 100644
index 0000000..d9ce255
--- /dev/null
+++ b/contrib/llvm/lib/XRay/InstrumentationMap.cpp
@@ -0,0 +1,198 @@
+//===- InstrumentationMap.cpp - XRay Instrumentation Map ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the InstrumentationMap type for XRay sleds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/XRay/InstrumentationMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <system_error>
+#include <vector>
+
+using namespace llvm;
+using namespace xray;
+
+Optional<int32_t> InstrumentationMap::getFunctionId(uint64_t Addr) const {
+ auto I = FunctionIds.find(Addr);
+ if (I != FunctionIds.end())
+ return I->second;
+ return None;
+}
+
+Optional<uint64_t> InstrumentationMap::getFunctionAddr(int32_t FuncId) const {
+ auto I = FunctionAddresses.find(FuncId);
+ if (I != FunctionAddresses.end())
+ return I->second;
+ return None;
+}
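The two accessors above are plain Optional-returning map lookups. A hypothetical caller sketch, not part of the patch (printSledTarget and its output format are illustrative), might round-trip an id like this:

    #include "llvm/ADT/Optional.h"
    #include "llvm/Support/Format.h"
    #include "llvm/Support/raw_ostream.h"
    #include "llvm/XRay/InstrumentationMap.h"
    #include <cassert>

    using namespace llvm;

    // Round-trips a function id through the two lookups above.
    void printSledTarget(const xray::InstrumentationMap &Map, int32_t FuncId) {
      if (Optional<uint64_t> Addr = Map.getFunctionAddr(FuncId)) {
        outs() << "fid " << FuncId << " -> " << format_hex(*Addr, 18) << "\n";
        // The reverse map should hand back the same id for that address.
        assert(Map.getFunctionId(*Addr).getValueOr(0) == FuncId);
      }
    }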
+
+static Error
+loadELF64(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
+ InstrumentationMap::SledContainer &Sleds,
+ InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+ InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+
+ // Find the section named "xray_instr_map".
+ if (!ObjFile.getBinary()->isELF() ||
+ !(ObjFile.getBinary()->getArch() == Triple::x86_64 ||
+ ObjFile.getBinary()->getArch() == Triple::ppc64le))
+ return make_error<StringError>(
+ "File format not supported (only does ELF little endian 64-bit).",
+ std::make_error_code(std::errc::not_supported));
+
+ StringRef Contents = "";
+ const auto &Sections = ObjFile.getBinary()->sections();
+ auto I = llvm::find_if(Sections, [&](object::SectionRef Section) {
+ StringRef Name = "";
+ if (Section.getName(Name))
+ return false;
+ return Name == "xray_instr_map";
+ });
+
+ if (I == Sections.end())
+ return make_error<StringError>(
+ "Failed to find XRay instrumentation map.",
+ std::make_error_code(std::errc::executable_format_error));
+
+ if (I->getContents(Contents))
+ return errorCodeToError(
+ std::make_error_code(std::errc::executable_format_error));
+
+ // Copy the instrumentation map data into the Sleds data structure.
+ auto C = Contents.bytes_begin();
+ static constexpr size_t ELF64SledEntrySize = 32;
+
+ if ((Contents.bytes_end() - C) % ELF64SledEntrySize != 0)
+ return make_error<StringError>(
+ Twine("Instrumentation map entries not evenly divisible by size of "
+ "an XRay sled entry in ELF64."),
+ std::make_error_code(std::errc::executable_format_error));
+
+ int32_t FuncId = 1;
+ uint64_t CurFn = 0;
+ for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) {
+ DataExtractor Extractor(
+ StringRef(reinterpret_cast<const char *>(C), ELF64SledEntrySize), true,
+ 8);
+ Sleds.push_back({});
+ auto &Entry = Sleds.back();
+ uint32_t OffsetPtr = 0;
+ Entry.Address = Extractor.getU64(&OffsetPtr);
+ Entry.Function = Extractor.getU64(&OffsetPtr);
+ auto Kind = Extractor.getU8(&OffsetPtr);
+ static constexpr SledEntry::FunctionKinds Kinds[] = {
+ SledEntry::FunctionKinds::ENTRY, SledEntry::FunctionKinds::EXIT,
+ SledEntry::FunctionKinds::TAIL,
+ };
+ if (Kind >= array_lengthof(Kinds))
+ return errorCodeToError(
+ std::make_error_code(std::errc::executable_format_error));
+ Entry.Kind = Kinds[Kind];
+ Entry.AlwaysInstrument = Extractor.getU8(&OffsetPtr) != 0;
+
+ // We replicate the function id generation scheme implemented in the
+ // XRay runtime.
+ // FIXME: Figure out how to keep this consistent with the XRay runtime.
+ if (CurFn == 0) {
+ CurFn = Entry.Function;
+ FunctionAddresses[FuncId] = Entry.Function;
+ FunctionIds[Entry.Function] = FuncId;
+ }
+ if (Entry.Function != CurFn) {
+ ++FuncId;
+ CurFn = Entry.Function;
+ FunctionAddresses[FuncId] = Entry.Function;
+ FunctionIds[Entry.Function] = FuncId;
+ }
+ }
+ return Error::success();
+}
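For reference, the 32-byte sled entry that the extractor calls above walk through can be pictured as the following packed struct. This is only a sketch; the struct name, field names, and explicit padding are inferred from the getU64/getU8 sequence and are not a type the patch defines:

    #include <cstdint>

    struct RawELF64Sled {
      uint64_t Address;         // getU64: address of the sled itself.
      uint64_t Function;        // getU64: address of the owning function.
      uint8_t Kind;             // getU8: 0 = ENTRY, 1 = EXIT, 2 = TAIL.
      uint8_t AlwaysInstrument; // getU8: nonzero forces instrumentation.
      uint8_t Padding[14];      // Remaining bytes of the 32-byte entry.
    };
    static_assert(sizeof(RawELF64Sled) == 32, "sled entries are 32 bytes");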
+
+static Error
+loadYAML(int Fd, size_t FileSize, StringRef Filename,
+ InstrumentationMap::SledContainer &Sleds,
+ InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+ InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+ std::error_code EC;
+ sys::fs::mapped_file_region MappedFile(
+ Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC);
+ if (EC)
+ return make_error<StringError>(
+ Twine("Failed memory-mapping file '") + Filename + "'.", EC);
+
+ std::vector<YAMLXRaySledEntry> YAMLSleds;
+ yaml::Input In(StringRef(MappedFile.data(), MappedFile.size()));
+ In >> YAMLSleds;
+ if (In.error())
+ return make_error<StringError>(
+ Twine("Failed loading YAML document from '") + Filename + "'.",
+ In.error());
+
+ Sleds.reserve(YAMLSleds.size());
+ for (const auto &Y : YAMLSleds) {
+ FunctionAddresses[Y.FuncId] = Y.Function;
+ FunctionIds[Y.Function] = Y.FuncId;
+ Sleds.push_back(
+ SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument});
+ }
+ return Error::success();
+}
+
+// FIXME: Create error types that encapsulate a bit more information than what
+// StringError instances contain.
+Expected<InstrumentationMap>
+llvm::xray::loadInstrumentationMap(StringRef Filename) {
+ // At this point we assume the file is an object file -- and if that doesn't
+ // work, we treat it as YAML.
+ // FIXME: Extend to support non-ELF and non-x86_64 binaries.
+
+ InstrumentationMap Map;
+ auto ObjectFileOrError = object::ObjectFile::createObjectFile(Filename);
+ if (!ObjectFileOrError) {
+ auto E = ObjectFileOrError.takeError();
+ // We try to load it as YAML if the ELF load didn't work.
+ int Fd;
+ if (sys::fs::openFileForRead(Filename, Fd))
+ return std::move(E);
+
+ uint64_t FileSize;
+ if (sys::fs::file_size(Filename, FileSize))
+ return std::move(E);
+
+ // If the file is empty, we return the original error.
+ if (FileSize == 0)
+ return std::move(E);
+
+ // From this point on, the errors will only concern YAML parsing, so we
+ // consume the original error here.
+ consumeError(std::move(E));
+ if (auto E = loadYAML(Fd, FileSize, Filename, Map.Sleds,
+ Map.FunctionAddresses, Map.FunctionIds))
+ return std::move(E);
+ } else if (auto E = loadELF64(Filename, *ObjectFileOrError, Map.Sleds,
+ Map.FunctionAddresses, Map.FunctionIds)) {
+ return std::move(E);
+ }
+ return Map;
+}
diff --git a/contrib/llvm/lib/XRay/Trace.cpp b/contrib/llvm/lib/XRay/Trace.cpp
index 51000c7..6677063 100644
--- a/contrib/llvm/lib/XRay/Trace.cpp
+++ b/contrib/llvm/lib/XRay/Trace.cpp
@@ -24,8 +24,8 @@ using llvm::yaml::Input;
using XRayRecordStorage =
std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type;
-Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+// Populates the FileHeader reference by reading the first 32 bytes of the file.
+Error readBinaryFormatHeader(StringRef Data, XRayFileHeader &FileHeader) {
// FIXME: Maybe deduce whether the data is little or big-endian using some
// magic bytes in the beginning of the file?
@@ -37,16 +37,6 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
// (4) uint32 : bitfield
// (8) uint64 : cycle frequency
// (16) - : padding
- //
- if (Data.size() < 32)
- return make_error<StringError>(
- "Not enough bytes for an XRay log.",
- std::make_error_code(std::errc::invalid_argument));
-
- if (Data.size() - 32 == 0 || Data.size() % 32 != 0)
- return make_error<StringError>(
- "Invalid-sized XRay data.",
- std::make_error_code(std::errc::invalid_argument));
DataExtractor HeaderExtractor(Data, true, 8);
uint32_t OffsetPtr = 0;
@@ -56,11 +46,29 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
FileHeader.ConstantTSC = Bitfield & 1uL;
FileHeader.NonstopTSC = Bitfield & 1uL << 1;
FileHeader.CycleFrequency = HeaderExtractor.getU64(&OffsetPtr);
-
+ std::memcpy(&FileHeader.FreeFormData, Data.bytes_begin() + OffsetPtr, 16);
if (FileHeader.Version != 1)
return make_error<StringError>(
Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version),
std::make_error_code(std::errc::invalid_argument));
+ return Error::success();
+}
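The 32-byte header consumed by readBinaryFormatHeader can likewise be sketched as a struct. The field names mirror XRayFileHeader, but the struct itself is illustrative only:

    #include <cstdint>

    struct RawXRayFileHeader {
      uint16_t Version;        // (2) Only version 1 is accepted above.
      uint16_t Type;           // (2) 0 = naive format, 1 = FDR format.
      uint32_t Bitfield;       // (4) Bit 0: ConstantTSC, bit 1: NonstopTSC.
      uint64_t CycleFrequency; // (8) TSC cycle frequency.
      char FreeFormData[16];   // (16) Format-specific data; the FDR loader
                               //      reads its per-thread buffer size here.
    };
    static_assert(sizeof(RawXRayFileHeader) == 32, "header is 32 bytes");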
+
+Error loadNaiveFormatLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
+ // Check that there is at least a header
+ if (Data.size() < 32)
+ return make_error<StringError>(
+ "Not enough bytes for an XRay log.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ if (Data.size() - 32 == 0 || Data.size() % 32 != 0)
+ return make_error<StringError>(
+ "Invalid-sized XRay data.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ if (auto E = readBinaryFormatHeader(Data, FileHeader))
+ return E;
// Each record after the header will be 32 bytes, in the following format:
//
@@ -98,9 +106,357 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
return Error::success();
}
-Error YAMLLogLoader(StringRef Data, XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+/// When reading from a Flight Data Recorder mode log, metadata records are
+/// sparse compared to packed function records, so we must maintain state as we
+/// read through the sequence of entries. This allows the reader to denormalize
+/// the CPUId and Thread Id onto each Function Record and transform delta
+/// encoded TSC values into absolute encodings on each record.
+struct FDRState {
+ uint16_t CPUId;
+ uint16_t ThreadId;
+ uint64_t BaseTSC;
+
+ /// Encode some of the state transitions for the FDR log reader as explicit
+ /// checks. These are expectations for the next Record in the stream.
+ enum class Token {
+ NEW_BUFFER_RECORD_OR_EOF,
+ WALLCLOCK_RECORD,
+ NEW_CPU_ID_RECORD,
+ FUNCTION_SEQUENCE,
+ SCAN_TO_END_OF_THREAD_BUF,
+ CUSTOM_EVENT_DATA,
+ };
+ Token Expects;
+
+ // Each thread's buffer may have trailing garbage to scan over, so we track
+ // our progress.
+ uint64_t CurrentBufferSize;
+ uint64_t CurrentBufferConsumed;
+};
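A compact way to read this state machine is as a happy-path successor function over the tokens. The sketch below is not in the patch and leans on the FDRState definition above; each comment names the record that triggers the transition:

    // Successor of each expectation once the matching record has been read.
    FDRState::Token nextOnHappyPath(FDRState::Token T) {
      switch (T) {
      case FDRState::Token::NEW_BUFFER_RECORD_OR_EOF:  // NewBuffer read.
        return FDRState::Token::WALLCLOCK_RECORD;
      case FDRState::Token::WALLCLOCK_RECORD:          // WallTimeMarker read.
        return FDRState::Token::NEW_CPU_ID_RECORD;
      case FDRState::Token::NEW_CPU_ID_RECORD:         // NewCPUId read.
        return FDRState::Token::FUNCTION_SEQUENCE;
      case FDRState::Token::FUNCTION_SEQUENCE:         // EndOfBuffer read.
        return FDRState::Token::SCAN_TO_END_OF_THREAD_BUF;
      case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF: // Garbage skipped.
        return FDRState::Token::NEW_BUFFER_RECORD_OR_EOF;
      case FDRState::Token::CUSTOM_EVENT_DATA: // Never set as an expectation
        return T;                              // by this reader.
      }
      return T;
    }

NewCPUId and TSCWrap records keep the expectation at FUNCTION_SEQUENCE, so only the transitions above change state.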
+
+Twine fdrStateToTwine(const FDRState::Token &state) {
+ switch (state) {
+ case FDRState::Token::NEW_BUFFER_RECORD_OR_EOF:
+ return "NEW_BUFFER_RECORD_OR_EOF";
+ case FDRState::Token::WALLCLOCK_RECORD:
+ return "WALLCLOCK_RECORD";
+ case FDRState::Token::NEW_CPU_ID_RECORD:
+ return "NEW_CPU_ID_RECORD";
+ case FDRState::Token::FUNCTION_SEQUENCE:
+ return "FUNCTION_SEQUENCE";
+ case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
+ return "SCAN_TO_END_OF_THREAD_BUF";
+ case FDRState::Token::CUSTOM_EVENT_DATA:
+ return "CUSTOM_EVENT_DATA";
+ }
+ return "UNKNOWN";
+}
+
+/// State transition when a NewBufferRecord is encountered.
+Error processFDRNewBufferRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::NEW_BUFFER_RECORD_OR_EOF)
+ return make_error<StringError>(
+ "Malformed log. Read New Buffer record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ uint32_t OffsetPtr = 1; // 1 byte into record.
+ State.ThreadId = RecordExtractor.getU16(&OffsetPtr);
+ State.Expects = FDRState::Token::WALLCLOCK_RECORD;
+ return Error::success();
+}
+
+/// State transition when an EndOfBufferRecord is encountered.
+Error processFDREndOfBufferRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects == FDRState::Token::NEW_BUFFER_RECORD_OR_EOF)
+ return make_error<StringError>(
+ "Malformed log. Received EOB message without current buffer.",
+ std::make_error_code(std::errc::executable_format_error));
+ State.Expects = FDRState::Token::SCAN_TO_END_OF_THREAD_BUF;
+ return Error::success();
+}
+
+/// State transition when a NewCPUIdRecord is encountered.
+Error processFDRNewCPUIdRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::FUNCTION_SEQUENCE &&
+ State.Expects != FDRState::Token::NEW_CPU_ID_RECORD)
+ return make_error<StringError>(
+ "Malformed log. Read NewCPUId record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ uint32_t OffsetPtr = 1; // Read starting after the first byte.
+ State.CPUId = RecordExtractor.getU16(&OffsetPtr);
+ State.BaseTSC = RecordExtractor.getU64(&OffsetPtr);
+ State.Expects = FDRState::Token::FUNCTION_SEQUENCE;
+ return Error::success();
+}
+
+/// State transition when a TSCWrapRecord (overflow detection) is encountered.
+Error processFDRTSCWrapRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::FUNCTION_SEQUENCE)
+ return make_error<StringError>(
+ "Malformed log. Read TSCWrap record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ uint32_t OffsetPtr = 1; // Read starting after the first byte.
+ State.BaseTSC = RecordExtractor.getU64(&OffsetPtr);
+ return Error::success();
+}
+
+/// State transition when a WallTimeMarkerRecord is encountered.
+Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor) {
+ if (State.Expects != FDRState::Token::WALLCLOCK_RECORD)
+ return make_error<StringError>(
+ "Malformed log. Read Wallclock record kind out of sequence",
+ std::make_error_code(std::errc::executable_format_error));
+ // We don't encode the wall time into any of the records.
+ // XRayRecords are concerned with the TSC instead.
+ State.Expects = FDRState::Token::NEW_CPU_ID_RECORD;
+ return Error::success();
+}
+
+/// State transition when a CustomEventMarker is encountered.
+Error processCustomEventMarker(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
+ // We can encounter a CustomEventMarker anywhere in the log, so we can handle
+ // it regardless of the expectation. However, we do set the expectation to
+ // read a fixed number of bytes, as described in the metadata.
+ uint32_t OffsetPtr = 1; // Read after the first byte.
+ uint32_t DataSize = RecordExtractor.getU32(&OffsetPtr);
+ uint64_t TSC = RecordExtractor.getU64(&OffsetPtr);
+
+ // FIXME: Actually represent the record through the API. For now we only skip
+ // through the data.
+ (void)TSC;
+ RecordSize = 16 + DataSize;
+ return Error::success();
+}
+
+/// Advances the state machine for reading the FDR record type by reading one
+/// Metadata Record and updating the State appropriately based on the kind of
+/// record encountered. The RecordKind is encoded in the first byte of the
+/// Record, which the caller should pass in because they have already read it
+/// to determine that this is a metadata record as opposed to a function record.
+Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor,
+ size_t &RecordSize) {
+ // The remaining 7 bits are the RecordKind enum.
+ uint8_t RecordKind = RecordFirstByte >> 1;
+ switch (RecordKind) {
+ case 0: // NewBuffer
+ if (auto E =
+ processFDRNewBufferRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ case 1: // EndOfBuffer
+ if (auto E = processFDREndOfBufferRecord(State, RecordFirstByte,
+ RecordExtractor))
+ return E;
+ break;
+ case 2: // NewCPUId
+ if (auto E =
+ processFDRNewCPUIdRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ case 3: // TSCWrap
+ if (auto E =
+ processFDRTSCWrapRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ case 4: // WallTimeMarker
+ if (auto E =
+ processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
+ return E;
+ break;
+ case 5: // CustomEventMarker
+ if (auto E = processCustomEventMarker(State, RecordFirstByte,
+ RecordExtractor, RecordSize))
+ return E;
+ break;
+ default:
+ // Widen the record type to uint16_t to prevent conversion to char.
+ return make_error<StringError>(
+ Twine("Illegal metadata record type: ")
+ .concat(Twine(static_cast<unsigned>(RecordKind))),
+ std::make_error_code(std::errc::executable_format_error));
+ }
+ return Error::success();
+}
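The dispatch above hinges on how the first byte of every record is split. A minimal decoding sketch under that assumption (the helper name is illustrative):

    #include <cstdint>
    #include <utility>

    // Bit 0 selects between 16-byte metadata records and 8-byte function
    // records; the remaining seven bits carry the metadata RecordKind
    // (0 = NewBuffer ... 5 = CustomEventMarker) handled in the switch above.
    std::pair<bool, uint8_t> splitFirstRecordByte(uint8_t FirstByte) {
      bool IsMetadata = (FirstByte & 0x01u) != 0;
      uint8_t Kind = FirstByte >> 1;
      return {IsMetadata, Kind};
    }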
+
+/// Reads a function record from an FDR format log, appending a new XRayRecord
+/// to the vector being populated and updating the State with a new
+/// reference value used to interpret TSC deltas.
+///
+/// The XRayRecord constructed includes information from the function record
+/// processed here as well as Thread ID and CPU ID formerly extracted into
+/// State.
+Error processFDRFunctionRecord(FDRState &State, uint8_t RecordFirstByte,
+ DataExtractor &RecordExtractor,
+ std::vector<XRayRecord> &Records) {
+ switch (State.Expects) {
+ case FDRState::Token::NEW_BUFFER_RECORD_OR_EOF:
+ return make_error<StringError>(
+ "Malformed log. Received Function Record before new buffer setup.",
+ std::make_error_code(std::errc::executable_format_error));
+ case FDRState::Token::WALLCLOCK_RECORD:
+ return make_error<StringError>(
+ "Malformed log. Received Function Record when expecting wallclock.",
+ std::make_error_code(std::errc::executable_format_error));
+ case FDRState::Token::NEW_CPU_ID_RECORD:
+ return make_error<StringError>(
+ "Malformed log. Received Function Record before first CPU record.",
+ std::make_error_code(std::errc::executable_format_error));
+ default:
+ Records.emplace_back();
+ auto &Record = Records.back();
+ Record.RecordType = 0; // Record is type NORMAL.
+ // Strip off record type bit and use the next three bits.
+ uint8_t RecordType = (RecordFirstByte >> 1) & 0x07;
+ switch (RecordType) {
+ case static_cast<uint8_t>(RecordTypes::ENTER):
+ Record.Type = RecordTypes::ENTER;
+ break;
+ case static_cast<uint8_t>(RecordTypes::EXIT):
+ case 2: // TAIL_EXIT is not yet defined in RecordTypes.
+ Record.Type = RecordTypes::EXIT;
+ break;
+ default:
+ // When initializing the error, convert to uint16_t so that the record
+ // type isn't interpreted as a char.
+ return make_error<StringError>(
+ Twine("Illegal function record type: ")
+ .concat(Twine(static_cast<unsigned>(RecordType))),
+ std::make_error_code(std::errc::executable_format_error));
+ }
+ Record.CPU = State.CPUId;
+ Record.TId = State.ThreadId;
+ // Back up to read first 32 bits, including the 8 we pulled RecordType
+ // and RecordKind out of. The remaining 28 are FunctionId.
+ uint32_t OffsetPtr = 0;
+ // Despite function Id being a signed int on XRayRecord,
+ // when it is written to an FDR format, the top bits are truncated,
+ // so it is effectively an unsigned value. When we shift off the
+ // top four bits, we want the shift to be logical, so we read as
+ // uint32_t.
+ uint32_t FuncIdBitField = RecordExtractor.getU32(&OffsetPtr);
+ Record.FuncId = FuncIdBitField >> 4;
+ // FunctionRecords have a 32 bit delta from the previous absolute TSC
+ // or TSC delta. If this would overflow, we should read a TSCWrap record
+ // with an absolute TSC reading.
+ uint64_t NewTSC = State.BaseTSC + RecordExtractor.getU32(&OffsetPtr);
+ State.BaseTSC = NewTSC;
+ Record.TSC = NewTSC;
+ }
+ return Error::success();
+}
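Packed into 8 bytes, a function record as decoded above looks like the following. This sketch assumes the little-endian word packing the reader uses, and the type and helper names are illustrative:

    #include <cstdint>

    struct DecodedFunctionRecord {
      uint8_t Type;      // Bits 1..3: ENTER (0), EXIT (1), or TAIL_EXIT (2).
      uint32_t FuncId;   // Bits 4..31: 28-bit function id, zero-extended.
      uint32_t TSCDelta; // Second 32-bit word: delta added to State.BaseTSC.
    };

    DecodedFunctionRecord decodeFunctionRecord(uint64_t RawLE) {
      uint32_t FirstWord = static_cast<uint32_t>(RawLE); // Low 32 bits.
      DecodedFunctionRecord R;
      R.Type = (FirstWord >> 1) & 0x07; // Skip the metadata/function bit.
      R.FuncId = FirstWord >> 4;        // Logical shift: read as unsigned.
      R.TSCDelta = static_cast<uint32_t>(RawLE >> 32);
      return R;
    }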
+
+/// Reads a log in FDR mode for version 1 of this binary format. FDR mode is
+/// defined as part of the compiler-rt project in xray_fdr_logging.h, and such
+/// a log consists of the familiar 32 bit XRayHeader, followed by sequences
+/// of interspersed 16 byte Metadata Records and 8 byte Function Records.
+///
+/// The following is an attempt to document the grammar of the format, which is
+/// parsed by this function for little-endian machines. Since the format makes
+/// use of BitFields, when we support big-endian architectures, we will need to
+/// adjust not only the endianness parameter to llvm's RecordExtractor, but also
+/// the bit-twiddling logic, which is consistent with the little-endian
+/// convention that BitFields within a struct will first be packed into the
+/// least significant bits of the address they belong to.
+///
+/// We expect a format complying with the grammar in the following pseudo-EBNF.
+///
+/// FDRLog: XRayFileHeader ThreadBuffer*
+/// XRayFileHeader: 32 bits to identify the log as FDR with machine metadata.
+/// ThreadBuffer: BufSize NewBuffer WallClockTime NewCPUId FunctionSequence EOB
+/// BufSize: 8 byte unsigned integer indicating how large the buffer is.
+/// NewBuffer: 16 byte metadata record with Thread Id.
+/// WallClockTime: 16 byte metadata record with human readable time.
+/// NewCPUId: 16 byte metadata record with CPUId and a 64 bit TSC reading.
+/// EOB: 16 byte record in a thread buffer plus memory garbage to fill BufSize.
+/// FunctionSequence: NewCPUId | TSCWrap | FunctionRecord
+/// TSCWrap: 16 byte metadata record with a full 64 bit TSC reading.
+/// FunctionRecord: 8 byte record with FunctionId, entry/exit, and TSC delta.
+Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
+ if (Data.size() < 32)
+ return make_error<StringError>(
+ "Not enough bytes for an XRay log.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ // For an FDR log, there are records sized 16 and 8 bytes.
+ // There actually may be no records if no non-trivial functions are
+ // instrumented.
+ if (Data.size() % 8 != 0)
+ return make_error<StringError>(
+ "Invalid-sized XRay data.",
+ std::make_error_code(std::errc::invalid_argument));
+
+ if (auto E = readBinaryFormatHeader(Data, FileHeader))
+ return E;
+
+ uint64_t BufferSize = 0;
+ {
+ StringRef ExtraDataRef(FileHeader.FreeFormData, 16);
+ DataExtractor ExtraDataExtractor(ExtraDataRef, true, 8);
+ uint32_t ExtraDataOffset = 0;
+ BufferSize = ExtraDataExtractor.getU64(&ExtraDataOffset);
+ }
+ FDRState State{0, 0, 0, FDRState::Token::NEW_BUFFER_RECORD_OR_EOF,
+ BufferSize, 0};
+ // RecordSize will tell the loop how far to seek ahead based on the record
+ // type that we have just read.
+ size_t RecordSize = 0;
+ for (auto S = Data.drop_front(32); !S.empty(); S = S.drop_front(RecordSize)) {
+ DataExtractor RecordExtractor(S, true, 8);
+ uint32_t OffsetPtr = 0;
+ if (State.Expects == FDRState::Token::SCAN_TO_END_OF_THREAD_BUF) {
+ RecordSize = State.CurrentBufferSize - State.CurrentBufferConsumed;
+ if (S.size() < RecordSize) {
+ return make_error<StringError>(
+ Twine("Incomplete thread buffer. Expected ") +
+ Twine(State.CurrentBufferSize - State.CurrentBufferConsumed) +
+ " remaining bytes but found " + Twine(S.size()),
+ make_error_code(std::errc::invalid_argument));
+ }
+ State.CurrentBufferConsumed = 0;
+ State.Expects = FDRState::Token::NEW_BUFFER_RECORD_OR_EOF;
+ continue;
+ }
+ uint8_t BitField = RecordExtractor.getU8(&OffsetPtr);
+ bool isMetadataRecord = BitField & 0x01uL;
+ if (isMetadataRecord) {
+ RecordSize = 16;
+ if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor,
+ RecordSize))
+ return E;
+ State.CurrentBufferConsumed += RecordSize;
+ } else { // Process Function Record
+ RecordSize = 8;
+ if (auto E = processFDRFunctionRecord(State, BitField, RecordExtractor,
+ Records))
+ return E;
+ State.CurrentBufferConsumed += RecordSize;
+ }
+ }
+ // There are two clean exit conditions: either we are expecting a new
+ // buffer (or EOF), or we were scanning to the end of a thread buffer and
+ // consumed it exactly.
+ if (State.Expects != FDRState::Token::NEW_BUFFER_RECORD_OR_EOF &&
+ !(State.Expects == FDRState::Token::SCAN_TO_END_OF_THREAD_BUF &&
+ State.CurrentBufferSize == State.CurrentBufferConsumed))
+ return make_error<StringError>(
+ Twine("Encountered EOF with unexpected state expectation ") +
+ fdrStateToTwine(State.Expects) +
+ ". Remaining expected bytes in thread buffer total " +
+ Twine(State.CurrentBufferSize - State.CurrentBufferConsumed),
+ std::make_error_code(std::errc::executable_format_error));
+
+ return Error::success();
+}
+Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
// Load the documents from the MappedFile.
YAMLXRayTrace Trace;
Input In(Data);
@@ -175,14 +531,21 @@ Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
uint16_t Version = HeaderExtractor.getU16(&OffsetPtr);
uint16_t Type = HeaderExtractor.getU16(&OffsetPtr);
+ enum BinaryFormatType { NAIVE_FORMAT = 0, FLIGHT_DATA_RECORDER_FORMAT = 1 };
+
Trace T;
- if (Version == 1 && (Type == 0 || Type == 1)) {
- if (auto E = NaiveLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
- T.FileHeader, T.Records))
+ if (Version == 1 && Type == NAIVE_FORMAT) {
+ if (auto E =
+ loadNaiveFormatLog(StringRef(MappedFile.data(), MappedFile.size()),
+ T.FileHeader, T.Records))
+ return std::move(E);
+ } else if (Version == 1 && Type == FLIGHT_DATA_RECORDER_FORMAT) {
+ if (auto E = loadFDRLog(StringRef(MappedFile.data(), MappedFile.size()),
+ T.FileHeader, T.Records))
return std::move(E);
} else {
- if (auto E = YAMLLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
- T.FileHeader, T.Records))
+ if (auto E = loadYAMLLog(StringRef(MappedFile.data(), MappedFile.size()),
+ T.FileHeader, T.Records))
return std::move(E);
}